diff --git a/checkpoint-10000/config.json b/checkpoint-10000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-10000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-10000/generation_config.json b/checkpoint-10000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-10000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-10000/model.safetensors.index.json b/checkpoint-10000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-10000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-10000/rng_state_0.pth b/checkpoint-10000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-10000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-10000/rng_state_1.pth b/checkpoint-10000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-10000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-10000/rng_state_2.pth b/checkpoint-10000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-10000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-10000/rng_state_3.pth b/checkpoint-10000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-10000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-10000/rng_state_5.pth b/checkpoint-10000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-10000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-10000/rng_state_6.pth b/checkpoint-10000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-10000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-10000/rng_state_7.pth b/checkpoint-10000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-10000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-10000/scheduler.pt b/checkpoint-10000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..267e6715d6409d5e295c9a3252faf1100497ae0b --- /dev/null +++ b/checkpoint-10000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb98a8002bee025b16ac554d2084a5e57c94b23dba5e88767538b57f9a953457 +size 1064 diff --git a/checkpoint-10000/trainer_state.json b/checkpoint-10000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..f0912076f157aab26b3fd6909acaf60c3a4e5646 --- /dev/null +++ b/checkpoint-10000/trainer_state.json @@ -0,0 +1,70034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.059472832809972405, + "eval_steps": 500, + "global_step": 10000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.947283280997241e-06, + "grad_norm": 179.1047821044922, + "learning_rate": 5e-05, + "loss": 14.5158, + "step": 1 + }, + { + "epoch": 1.1894566561994482e-05, + "grad_norm": 40.39401626586914, + "learning_rate": 4.999999999563638e-05, + "loss": 14.152, + "step": 2 + }, + { + "epoch": 1.7841849842991722e-05, + "grad_norm": 137.05079650878906, + "learning_rate": 4.999999998254552e-05, + "loss": 14.6334, + "step": 3 + }, + { + "epoch": 2.3789133123988963e-05, + "grad_norm": 23.315088272094727, + "learning_rate": 4.9999999960727415e-05, + "loss": 12.6458, + "step": 4 + }, + { + "epoch": 2.97364164049862e-05, + "grad_norm": 7.943603992462158, + "learning_rate": 4.9999999930182065e-05, + "loss": 11.8435, + "step": 5 + }, + { + "epoch": 3.5683699685983445e-05, + "grad_norm": 6.374181270599365, + "learning_rate": 4.999999989090948e-05, + "loss": 11.4544, + "step": 6 + }, + { + "epoch": 4.1630982966980686e-05, + "grad_norm": 8.948652267456055, + "learning_rate": 4.999999984290965e-05, + "loss": 11.3516, + "step": 7 + }, + { + "epoch": 4.7578266247977927e-05, + "grad_norm": 3.2318713665008545, + "learning_rate": 4.999999978618258e-05, + "loss": 11.1021, + "step": 8 + }, + { + "epoch": 5.352554952897517e-05, + "grad_norm": 5.6542534828186035, + "learning_rate": 4.9999999720728266e-05, + "loss": 11.0132, + "step": 9 + }, + { + "epoch": 5.94728328099724e-05, + "grad_norm": 3.623577356338501, + "learning_rate": 4.999999964654671e-05, + "loss": 10.8896, + "step": 10 + }, + { + "epoch": 6.542011609096965e-05, + "grad_norm": 3.3209445476531982, + "learning_rate": 4.9999999563637915e-05, + "loss": 10.7339, + "step": 11 + }, + { + "epoch": 7.136739937196689e-05, + "grad_norm": 3.4527082443237305, + "learning_rate": 4.999999947200188e-05, + "loss": 10.5472, + "step": 12 + }, + { + "epoch": 7.731468265296413e-05, + "grad_norm": 3.784444570541382, + "learning_rate": 4.99999993716386e-05, + "loss": 10.4353, + "step": 13 + }, + { + "epoch": 8.326196593396137e-05, + "grad_norm": 4.304569244384766, + "learning_rate": 4.999999926254808e-05, + "loss": 10.4652, + "step": 14 + }, + { + "epoch": 8.920924921495861e-05, + "grad_norm": 3.5867838859558105, + "learning_rate": 4.999999914473032e-05, + "loss": 10.5746, + "step": 15 + }, + { + "epoch": 9.515653249595585e-05, + "grad_norm": 6.1308207511901855, + "learning_rate": 4.9999999018185316e-05, + "loss": 10.4129, + "step": 16 + }, + { + "epoch": 0.0001011038157769531, + "grad_norm": 3.4687230587005615, + "learning_rate": 4.999999888291307e-05, + "loss": 10.2246, + "step": 17 + }, + { + "epoch": 0.00010705109905795033, + "grad_norm": 4.041895866394043, + "learning_rate": 4.9999998738913586e-05, + "loss": 10.0852, + "step": 18 + }, + { + "epoch": 0.00011299838233894758, + "grad_norm": 4.437602519989014, + "learning_rate": 4.999999858618686e-05, + "loss": 9.8841, + "step": 19 + }, + { + "epoch": 0.0001189456656199448, + "grad_norm": 3.9608142375946045, + "learning_rate": 4.9999998424732884e-05, + "loss": 10.0537, + "step": 20 + }, + { + "epoch": 0.00012489294890094204, + "grad_norm": 3.799363613128662, + "learning_rate": 4.999999825455168e-05, + "loss": 9.8487, + "step": 21 + }, + { + "epoch": 0.0001308402321819393, + "grad_norm": 3.626058340072632, + "learning_rate": 4.999999807564323e-05, + "loss": 9.8048, + "step": 22 + }, + { + "epoch": 0.00013678751546293653, + "grad_norm": 4.21406364440918, + "learning_rate": 4.999999788800754e-05, + "loss": 9.6091, + "step": 23 + }, + { + "epoch": 0.00014273479874393378, + "grad_norm": 5.26548957824707, + "learning_rate": 4.9999997691644605e-05, + "loss": 9.3935, + "step": 24 + }, + { + "epoch": 0.000148682082024931, + "grad_norm": 6.5113396644592285, + "learning_rate": 4.999999748655443e-05, + "loss": 9.2602, + "step": 25 + }, + { + "epoch": 0.00015462936530592826, + "grad_norm": 4.6141133308410645, + "learning_rate": 4.9999997272737014e-05, + "loss": 9.1492, + "step": 26 + }, + { + "epoch": 0.0001605766485869255, + "grad_norm": 4.645262241363525, + "learning_rate": 4.999999705019236e-05, + "loss": 9.2238, + "step": 27 + }, + { + "epoch": 0.00016652393186792274, + "grad_norm": 4.599213123321533, + "learning_rate": 4.9999996818920464e-05, + "loss": 9.1673, + "step": 28 + }, + { + "epoch": 0.00017247121514891997, + "grad_norm": 4.820634365081787, + "learning_rate": 4.999999657892133e-05, + "loss": 9.0044, + "step": 29 + }, + { + "epoch": 0.00017841849842991722, + "grad_norm": 4.57854700088501, + "learning_rate": 4.9999996330194956e-05, + "loss": 8.8746, + "step": 30 + }, + { + "epoch": 0.00018436578171091445, + "grad_norm": 4.567880153656006, + "learning_rate": 4.999999607274133e-05, + "loss": 8.7224, + "step": 31 + }, + { + "epoch": 0.0001903130649919117, + "grad_norm": 4.545701503753662, + "learning_rate": 4.9999995806560475e-05, + "loss": 8.6979, + "step": 32 + }, + { + "epoch": 0.00019626034827290893, + "grad_norm": 4.098274230957031, + "learning_rate": 4.9999995531652374e-05, + "loss": 8.5787, + "step": 33 + }, + { + "epoch": 0.0002022076315539062, + "grad_norm": 4.341195106506348, + "learning_rate": 4.999999524801704e-05, + "loss": 8.4452, + "step": 34 + }, + { + "epoch": 0.00020815491483490341, + "grad_norm": 4.651747703552246, + "learning_rate": 4.999999495565446e-05, + "loss": 8.4383, + "step": 35 + }, + { + "epoch": 0.00021410219811590067, + "grad_norm": 4.187220573425293, + "learning_rate": 4.999999465456464e-05, + "loss": 8.2441, + "step": 36 + }, + { + "epoch": 0.0002200494813968979, + "grad_norm": 4.094058990478516, + "learning_rate": 4.999999434474758e-05, + "loss": 8.2784, + "step": 37 + }, + { + "epoch": 0.00022599676467789515, + "grad_norm": 4.6094794273376465, + "learning_rate": 4.999999402620329e-05, + "loss": 8.3893, + "step": 38 + }, + { + "epoch": 0.00023194404795889238, + "grad_norm": 5.391327381134033, + "learning_rate": 4.999999369893175e-05, + "loss": 8.6491, + "step": 39 + }, + { + "epoch": 0.0002378913312398896, + "grad_norm": 5.03748893737793, + "learning_rate": 4.9999993362932974e-05, + "loss": 8.5279, + "step": 40 + }, + { + "epoch": 0.00024383861452088686, + "grad_norm": 5.306002616882324, + "learning_rate": 4.9999993018206956e-05, + "loss": 9.9965, + "step": 41 + }, + { + "epoch": 0.0002497858978018841, + "grad_norm": 5.5374274253845215, + "learning_rate": 4.99999926647537e-05, + "loss": 10.5594, + "step": 42 + }, + { + "epoch": 0.00025573318108288134, + "grad_norm": 3.8107693195343018, + "learning_rate": 4.999999230257321e-05, + "loss": 10.5451, + "step": 43 + }, + { + "epoch": 0.0002616804643638786, + "grad_norm": 3.922286033630371, + "learning_rate": 4.999999193166547e-05, + "loss": 10.4123, + "step": 44 + }, + { + "epoch": 0.0002676277476448758, + "grad_norm": 3.2090535163879395, + "learning_rate": 4.99999915520305e-05, + "loss": 10.0646, + "step": 45 + }, + { + "epoch": 0.00027357503092587305, + "grad_norm": 3.153404474258423, + "learning_rate": 4.9999991163668285e-05, + "loss": 10.237, + "step": 46 + }, + { + "epoch": 0.0002795223142068703, + "grad_norm": 4.83523416519165, + "learning_rate": 4.999999076657884e-05, + "loss": 8.9392, + "step": 47 + }, + { + "epoch": 0.00028546959748786756, + "grad_norm": 3.954632043838501, + "learning_rate": 4.999999036076215e-05, + "loss": 8.8562, + "step": 48 + }, + { + "epoch": 0.00029141688076886476, + "grad_norm": 4.452631950378418, + "learning_rate": 4.999998994621822e-05, + "loss": 9.8819, + "step": 49 + }, + { + "epoch": 0.000297364164049862, + "grad_norm": 4.71603536605835, + "learning_rate": 4.9999989522947055e-05, + "loss": 9.8503, + "step": 50 + }, + { + "epoch": 0.00030331144733085927, + "grad_norm": 3.8008105754852295, + "learning_rate": 4.999998909094865e-05, + "loss": 9.8072, + "step": 51 + }, + { + "epoch": 0.0003092587306118565, + "grad_norm": 3.9906716346740723, + "learning_rate": 4.999998865022301e-05, + "loss": 9.168, + "step": 52 + }, + { + "epoch": 0.0003152060138928537, + "grad_norm": 3.9425785541534424, + "learning_rate": 4.999998820077013e-05, + "loss": 9.8441, + "step": 53 + }, + { + "epoch": 0.000321153297173851, + "grad_norm": 3.6698031425476074, + "learning_rate": 4.999998774259002e-05, + "loss": 10.036, + "step": 54 + }, + { + "epoch": 0.00032710058045484823, + "grad_norm": 3.3027005195617676, + "learning_rate": 4.999998727568266e-05, + "loss": 9.8701, + "step": 55 + }, + { + "epoch": 0.0003330478637358455, + "grad_norm": 3.312570333480835, + "learning_rate": 4.999998680004807e-05, + "loss": 9.3354, + "step": 56 + }, + { + "epoch": 0.0003389951470168427, + "grad_norm": 3.323969602584839, + "learning_rate": 4.999998631568624e-05, + "loss": 9.2899, + "step": 57 + }, + { + "epoch": 0.00034494243029783994, + "grad_norm": 3.1319313049316406, + "learning_rate": 4.999998582259717e-05, + "loss": 9.1033, + "step": 58 + }, + { + "epoch": 0.0003508897135788372, + "grad_norm": 3.655060291290283, + "learning_rate": 4.999998532078087e-05, + "loss": 9.1574, + "step": 59 + }, + { + "epoch": 0.00035683699685983445, + "grad_norm": 3.2051918506622314, + "learning_rate": 4.999998481023733e-05, + "loss": 9.564, + "step": 60 + }, + { + "epoch": 0.00036278428014083165, + "grad_norm": 3.223015308380127, + "learning_rate": 4.999998429096656e-05, + "loss": 9.46, + "step": 61 + }, + { + "epoch": 0.0003687315634218289, + "grad_norm": 4.121186256408691, + "learning_rate": 4.999998376296855e-05, + "loss": 8.4136, + "step": 62 + }, + { + "epoch": 0.00037467884670282616, + "grad_norm": 3.5580086708068848, + "learning_rate": 4.9999983226243296e-05, + "loss": 9.3504, + "step": 63 + }, + { + "epoch": 0.0003806261299838234, + "grad_norm": 3.664219379425049, + "learning_rate": 4.999998268079081e-05, + "loss": 9.2889, + "step": 64 + }, + { + "epoch": 0.0003865734132648206, + "grad_norm": 2.955582618713379, + "learning_rate": 4.99999821266111e-05, + "loss": 8.9193, + "step": 65 + }, + { + "epoch": 0.00039252069654581787, + "grad_norm": 3.0592539310455322, + "learning_rate": 4.9999981563704144e-05, + "loss": 9.6739, + "step": 66 + }, + { + "epoch": 0.0003984679798268151, + "grad_norm": 3.32024884223938, + "learning_rate": 4.999998099206995e-05, + "loss": 9.3648, + "step": 67 + }, + { + "epoch": 0.0004044152631078124, + "grad_norm": 3.2716033458709717, + "learning_rate": 4.9999980411708524e-05, + "loss": 9.3652, + "step": 68 + }, + { + "epoch": 0.0004103625463888096, + "grad_norm": 3.1926631927490234, + "learning_rate": 4.999997982261987e-05, + "loss": 9.2924, + "step": 69 + }, + { + "epoch": 0.00041630982966980683, + "grad_norm": 3.589841604232788, + "learning_rate": 4.999997922480397e-05, + "loss": 9.2185, + "step": 70 + }, + { + "epoch": 0.0004222571129508041, + "grad_norm": 2.902132034301758, + "learning_rate": 4.999997861826084e-05, + "loss": 9.1047, + "step": 71 + }, + { + "epoch": 0.00042820439623180134, + "grad_norm": 3.2352359294891357, + "learning_rate": 4.999997800299048e-05, + "loss": 9.0309, + "step": 72 + }, + { + "epoch": 0.00043415167951279854, + "grad_norm": 2.683664560317993, + "learning_rate": 4.9999977378992884e-05, + "loss": 8.9977, + "step": 73 + }, + { + "epoch": 0.0004400989627937958, + "grad_norm": 3.0073423385620117, + "learning_rate": 4.9999976746268055e-05, + "loss": 9.0967, + "step": 74 + }, + { + "epoch": 0.00044604624607479305, + "grad_norm": 3.364819288253784, + "learning_rate": 4.9999976104815994e-05, + "loss": 8.9401, + "step": 75 + }, + { + "epoch": 0.0004519935293557903, + "grad_norm": 3.478936195373535, + "learning_rate": 4.9999975454636695e-05, + "loss": 8.8173, + "step": 76 + }, + { + "epoch": 0.0004579408126367875, + "grad_norm": 3.059669017791748, + "learning_rate": 4.9999974795730165e-05, + "loss": 9.2588, + "step": 77 + }, + { + "epoch": 0.00046388809591778476, + "grad_norm": 3.1980936527252197, + "learning_rate": 4.999997412809639e-05, + "loss": 9.3374, + "step": 78 + }, + { + "epoch": 0.000469835379198782, + "grad_norm": 2.859935998916626, + "learning_rate": 4.9999973451735405e-05, + "loss": 8.8996, + "step": 79 + }, + { + "epoch": 0.0004757826624797792, + "grad_norm": 3.6268489360809326, + "learning_rate": 4.9999972766647175e-05, + "loss": 8.7878, + "step": 80 + }, + { + "epoch": 0.00048172994576077647, + "grad_norm": 3.0187010765075684, + "learning_rate": 4.9999972072831714e-05, + "loss": 8.9177, + "step": 81 + }, + { + "epoch": 0.0004876772290417737, + "grad_norm": 3.304633378982544, + "learning_rate": 4.9999971370289014e-05, + "loss": 8.8098, + "step": 82 + }, + { + "epoch": 0.0004936245123227709, + "grad_norm": 3.678696870803833, + "learning_rate": 4.999997065901909e-05, + "loss": 8.9408, + "step": 83 + }, + { + "epoch": 0.0004995717956037682, + "grad_norm": 3.485488176345825, + "learning_rate": 4.9999969939021936e-05, + "loss": 8.7374, + "step": 84 + }, + { + "epoch": 0.0005055190788847654, + "grad_norm": 3.276916265487671, + "learning_rate": 4.999996921029755e-05, + "loss": 8.7177, + "step": 85 + }, + { + "epoch": 0.0005114663621657627, + "grad_norm": 3.060227632522583, + "learning_rate": 4.9999968472845926e-05, + "loss": 8.9673, + "step": 86 + }, + { + "epoch": 0.0005174136454467599, + "grad_norm": 3.359055995941162, + "learning_rate": 4.999996772666708e-05, + "loss": 8.8029, + "step": 87 + }, + { + "epoch": 0.0005233609287277572, + "grad_norm": 3.8916943073272705, + "learning_rate": 4.9999966971761004e-05, + "loss": 8.8363, + "step": 88 + }, + { + "epoch": 0.0005293082120087544, + "grad_norm": 3.825075387954712, + "learning_rate": 4.9999966208127694e-05, + "loss": 8.5683, + "step": 89 + }, + { + "epoch": 0.0005352554952897516, + "grad_norm": 3.475759267807007, + "learning_rate": 4.999996543576715e-05, + "loss": 8.5723, + "step": 90 + }, + { + "epoch": 0.0005412027785707488, + "grad_norm": 3.609776020050049, + "learning_rate": 4.9999964654679385e-05, + "loss": 8.6123, + "step": 91 + }, + { + "epoch": 0.0005471500618517461, + "grad_norm": 3.3749685287475586, + "learning_rate": 4.999996386486439e-05, + "loss": 8.4887, + "step": 92 + }, + { + "epoch": 0.0005530973451327434, + "grad_norm": 3.3853306770324707, + "learning_rate": 4.999996306632215e-05, + "loss": 8.56, + "step": 93 + }, + { + "epoch": 0.0005590446284137406, + "grad_norm": 3.9347422122955322, + "learning_rate": 4.99999622590527e-05, + "loss": 8.5053, + "step": 94 + }, + { + "epoch": 0.0005649919116947379, + "grad_norm": 3.6037611961364746, + "learning_rate": 4.999996144305601e-05, + "loss": 8.3367, + "step": 95 + }, + { + "epoch": 0.0005709391949757351, + "grad_norm": 3.4608941078186035, + "learning_rate": 4.99999606183321e-05, + "loss": 8.0674, + "step": 96 + }, + { + "epoch": 0.0005768864782567324, + "grad_norm": 3.4882898330688477, + "learning_rate": 4.999995978488096e-05, + "loss": 8.1728, + "step": 97 + }, + { + "epoch": 0.0005828337615377295, + "grad_norm": 3.6789562702178955, + "learning_rate": 4.999995894270258e-05, + "loss": 7.9535, + "step": 98 + }, + { + "epoch": 0.0005887810448187268, + "grad_norm": 3.57328200340271, + "learning_rate": 4.9999958091796986e-05, + "loss": 8.2048, + "step": 99 + }, + { + "epoch": 0.000594728328099724, + "grad_norm": 3.803468942642212, + "learning_rate": 4.999995723216416e-05, + "loss": 7.8073, + "step": 100 + }, + { + "epoch": 0.0006006756113807213, + "grad_norm": 3.8187785148620605, + "learning_rate": 4.9999956363804116e-05, + "loss": 7.6325, + "step": 101 + }, + { + "epoch": 0.0006066228946617185, + "grad_norm": 3.8681981563568115, + "learning_rate": 4.999995548671684e-05, + "loss": 7.7104, + "step": 102 + }, + { + "epoch": 0.0006125701779427158, + "grad_norm": 3.869074583053589, + "learning_rate": 4.9999954600902334e-05, + "loss": 7.8445, + "step": 103 + }, + { + "epoch": 0.000618517461223713, + "grad_norm": 3.852057695388794, + "learning_rate": 4.99999537063606e-05, + "loss": 7.872, + "step": 104 + }, + { + "epoch": 0.0006244647445047103, + "grad_norm": 4.784586429595947, + "learning_rate": 4.9999952803091654e-05, + "loss": 9.2218, + "step": 105 + }, + { + "epoch": 0.0006304120277857074, + "grad_norm": 4.296675682067871, + "learning_rate": 4.9999951891095474e-05, + "loss": 9.0957, + "step": 106 + }, + { + "epoch": 0.0006363593110667047, + "grad_norm": 3.9155995845794678, + "learning_rate": 4.999995097037207e-05, + "loss": 8.9829, + "step": 107 + }, + { + "epoch": 0.000642306594347702, + "grad_norm": 3.8967478275299072, + "learning_rate": 4.999995004092144e-05, + "loss": 8.2017, + "step": 108 + }, + { + "epoch": 0.0006482538776286992, + "grad_norm": 5.238500595092773, + "learning_rate": 4.999994910274358e-05, + "loss": 7.7976, + "step": 109 + }, + { + "epoch": 0.0006542011609096965, + "grad_norm": 3.7043144702911377, + "learning_rate": 4.9999948155838504e-05, + "loss": 8.3116, + "step": 110 + }, + { + "epoch": 0.0006601484441906937, + "grad_norm": 2.9745211601257324, + "learning_rate": 4.99999472002062e-05, + "loss": 8.69, + "step": 111 + }, + { + "epoch": 0.000666095727471691, + "grad_norm": 3.172652006149292, + "learning_rate": 4.999994623584668e-05, + "loss": 8.6244, + "step": 112 + }, + { + "epoch": 0.0006720430107526882, + "grad_norm": 3.224888801574707, + "learning_rate": 4.999994526275993e-05, + "loss": 8.6823, + "step": 113 + }, + { + "epoch": 0.0006779902940336854, + "grad_norm": 3.53104305267334, + "learning_rate": 4.9999944280945964e-05, + "loss": 8.495, + "step": 114 + }, + { + "epoch": 0.0006839375773146826, + "grad_norm": 3.013505697250366, + "learning_rate": 4.999994329040477e-05, + "loss": 8.4807, + "step": 115 + }, + { + "epoch": 0.0006898848605956799, + "grad_norm": 4.4741339683532715, + "learning_rate": 4.999994229113636e-05, + "loss": 8.94, + "step": 116 + }, + { + "epoch": 0.0006958321438766771, + "grad_norm": 4.78712272644043, + "learning_rate": 4.999994128314072e-05, + "loss": 8.9367, + "step": 117 + }, + { + "epoch": 0.0007017794271576744, + "grad_norm": 3.6983933448791504, + "learning_rate": 4.999994026641787e-05, + "loss": 8.7524, + "step": 118 + }, + { + "epoch": 0.0007077267104386716, + "grad_norm": 3.74997615814209, + "learning_rate": 4.9999939240967784e-05, + "loss": 8.3417, + "step": 119 + }, + { + "epoch": 0.0007136739937196689, + "grad_norm": 3.614593982696533, + "learning_rate": 4.999993820679049e-05, + "loss": 8.4848, + "step": 120 + }, + { + "epoch": 0.000719621277000666, + "grad_norm": 2.903045654296875, + "learning_rate": 4.999993716388597e-05, + "loss": 8.5519, + "step": 121 + }, + { + "epoch": 0.0007255685602816633, + "grad_norm": 3.402444839477539, + "learning_rate": 4.999993611225423e-05, + "loss": 8.2905, + "step": 122 + }, + { + "epoch": 0.0007315158435626606, + "grad_norm": 3.663893938064575, + "learning_rate": 4.9999935051895274e-05, + "loss": 8.4842, + "step": 123 + }, + { + "epoch": 0.0007374631268436578, + "grad_norm": 3.7535622119903564, + "learning_rate": 4.99999339828091e-05, + "loss": 8.4766, + "step": 124 + }, + { + "epoch": 0.0007434104101246551, + "grad_norm": 3.1285574436187744, + "learning_rate": 4.99999329049957e-05, + "loss": 8.3716, + "step": 125 + }, + { + "epoch": 0.0007493576934056523, + "grad_norm": 3.648869752883911, + "learning_rate": 4.9999931818455086e-05, + "loss": 8.3413, + "step": 126 + }, + { + "epoch": 0.0007553049766866496, + "grad_norm": 3.253399133682251, + "learning_rate": 4.9999930723187255e-05, + "loss": 8.0412, + "step": 127 + }, + { + "epoch": 0.0007612522599676468, + "grad_norm": 3.5694124698638916, + "learning_rate": 4.999992961919221e-05, + "loss": 8.0895, + "step": 128 + }, + { + "epoch": 0.000767199543248644, + "grad_norm": 4.106658458709717, + "learning_rate": 4.999992850646994e-05, + "loss": 8.3654, + "step": 129 + }, + { + "epoch": 0.0007731468265296412, + "grad_norm": 4.082829475402832, + "learning_rate": 4.9999927385020455e-05, + "loss": 8.2663, + "step": 130 + }, + { + "epoch": 0.0007790941098106385, + "grad_norm": 4.349386215209961, + "learning_rate": 4.9999926254843753e-05, + "loss": 8.2435, + "step": 131 + }, + { + "epoch": 0.0007850413930916357, + "grad_norm": 3.375697135925293, + "learning_rate": 4.999992511593984e-05, + "loss": 8.0827, + "step": 132 + }, + { + "epoch": 0.000790988676372633, + "grad_norm": 3.2566957473754883, + "learning_rate": 4.999992396830871e-05, + "loss": 8.4891, + "step": 133 + }, + { + "epoch": 0.0007969359596536302, + "grad_norm": 3.791579008102417, + "learning_rate": 4.999992281195036e-05, + "loss": 8.1567, + "step": 134 + }, + { + "epoch": 0.0008028832429346275, + "grad_norm": 3.8741838932037354, + "learning_rate": 4.99999216468648e-05, + "loss": 8.4033, + "step": 135 + }, + { + "epoch": 0.0008088305262156248, + "grad_norm": 4.229452133178711, + "learning_rate": 4.999992047305203e-05, + "loss": 8.3897, + "step": 136 + }, + { + "epoch": 0.0008147778094966219, + "grad_norm": 3.2732088565826416, + "learning_rate": 4.9999919290512034e-05, + "loss": 8.1758, + "step": 137 + }, + { + "epoch": 0.0008207250927776192, + "grad_norm": 3.2048966884613037, + "learning_rate": 4.9999918099244836e-05, + "loss": 8.1459, + "step": 138 + }, + { + "epoch": 0.0008266723760586164, + "grad_norm": 3.8639938831329346, + "learning_rate": 4.999991689925042e-05, + "loss": 7.9437, + "step": 139 + }, + { + "epoch": 0.0008326196593396137, + "grad_norm": 3.297252655029297, + "learning_rate": 4.9999915690528794e-05, + "loss": 8.1751, + "step": 140 + }, + { + "epoch": 0.0008385669426206109, + "grad_norm": 3.878218173980713, + "learning_rate": 4.999991447307995e-05, + "loss": 8.0572, + "step": 141 + }, + { + "epoch": 0.0008445142259016082, + "grad_norm": 3.6870739459991455, + "learning_rate": 4.9999913246903895e-05, + "loss": 8.0958, + "step": 142 + }, + { + "epoch": 0.0008504615091826054, + "grad_norm": 3.1817922592163086, + "learning_rate": 4.9999912012000636e-05, + "loss": 8.2683, + "step": 143 + }, + { + "epoch": 0.0008564087924636027, + "grad_norm": 3.4008772373199463, + "learning_rate": 4.999991076837016e-05, + "loss": 8.4171, + "step": 144 + }, + { + "epoch": 0.0008623560757445998, + "grad_norm": 3.002333641052246, + "learning_rate": 4.999990951601247e-05, + "loss": 8.1149, + "step": 145 + }, + { + "epoch": 0.0008683033590255971, + "grad_norm": 3.51910662651062, + "learning_rate": 4.999990825492757e-05, + "loss": 8.5284, + "step": 146 + }, + { + "epoch": 0.0008742506423065943, + "grad_norm": 2.978875160217285, + "learning_rate": 4.999990698511548e-05, + "loss": 8.4855, + "step": 147 + }, + { + "epoch": 0.0008801979255875916, + "grad_norm": 3.4708774089813232, + "learning_rate": 4.999990570657616e-05, + "loss": 8.333, + "step": 148 + }, + { + "epoch": 0.0008861452088685888, + "grad_norm": 2.994084596633911, + "learning_rate": 4.999990441930963e-05, + "loss": 8.3456, + "step": 149 + }, + { + "epoch": 0.0008920924921495861, + "grad_norm": 3.1295697689056396, + "learning_rate": 4.99999031233159e-05, + "loss": 8.2204, + "step": 150 + }, + { + "epoch": 0.0008980397754305833, + "grad_norm": 3.349720001220703, + "learning_rate": 4.9999901818594966e-05, + "loss": 8.2739, + "step": 151 + }, + { + "epoch": 0.0009039870587115806, + "grad_norm": 3.852964401245117, + "learning_rate": 4.999990050514681e-05, + "loss": 8.4225, + "step": 152 + }, + { + "epoch": 0.0009099343419925777, + "grad_norm": 3.92203950881958, + "learning_rate": 4.9999899182971456e-05, + "loss": 8.2882, + "step": 153 + }, + { + "epoch": 0.000915881625273575, + "grad_norm": 3.9960269927978516, + "learning_rate": 4.99998978520689e-05, + "loss": 8.2091, + "step": 154 + }, + { + "epoch": 0.0009218289085545723, + "grad_norm": 3.952327251434326, + "learning_rate": 4.999989651243913e-05, + "loss": 8.1726, + "step": 155 + }, + { + "epoch": 0.0009277761918355695, + "grad_norm": 3.9594647884368896, + "learning_rate": 4.9999895164082156e-05, + "loss": 8.0241, + "step": 156 + }, + { + "epoch": 0.0009337234751165668, + "grad_norm": 3.1129961013793945, + "learning_rate": 4.999989380699798e-05, + "loss": 8.14, + "step": 157 + }, + { + "epoch": 0.000939670758397564, + "grad_norm": 4.7737860679626465, + "learning_rate": 4.9999892441186604e-05, + "loss": 7.869, + "step": 158 + }, + { + "epoch": 0.0009456180416785613, + "grad_norm": 3.351327657699585, + "learning_rate": 4.9999891066648006e-05, + "loss": 8.1831, + "step": 159 + }, + { + "epoch": 0.0009515653249595584, + "grad_norm": 3.0245375633239746, + "learning_rate": 4.999988968338222e-05, + "loss": 8.3871, + "step": 160 + }, + { + "epoch": 0.0009575126082405557, + "grad_norm": 4.766855716705322, + "learning_rate": 4.999988829138923e-05, + "loss": 8.0078, + "step": 161 + }, + { + "epoch": 0.0009634598915215529, + "grad_norm": 3.975804090499878, + "learning_rate": 4.999988689066903e-05, + "loss": 7.6923, + "step": 162 + }, + { + "epoch": 0.0009694071748025502, + "grad_norm": 4.024605751037598, + "learning_rate": 4.999988548122163e-05, + "loss": 8.2986, + "step": 163 + }, + { + "epoch": 0.0009753544580835474, + "grad_norm": 4.230019569396973, + "learning_rate": 4.999988406304703e-05, + "loss": 8.2903, + "step": 164 + }, + { + "epoch": 0.0009813017413645446, + "grad_norm": 3.972825050354004, + "learning_rate": 4.9999882636145236e-05, + "loss": 8.3589, + "step": 165 + }, + { + "epoch": 0.0009872490246455418, + "grad_norm": 3.6381688117980957, + "learning_rate": 4.999988120051623e-05, + "loss": 8.2648, + "step": 166 + }, + { + "epoch": 0.000993196307926539, + "grad_norm": 4.203462600708008, + "learning_rate": 4.9999879756160025e-05, + "loss": 8.363, + "step": 167 + }, + { + "epoch": 0.0009991435912075363, + "grad_norm": 2.944103479385376, + "learning_rate": 4.9999878303076624e-05, + "loss": 7.9752, + "step": 168 + }, + { + "epoch": 0.0010050908744885336, + "grad_norm": 3.4115283489227295, + "learning_rate": 4.9999876841266025e-05, + "loss": 8.1044, + "step": 169 + }, + { + "epoch": 0.0010110381577695309, + "grad_norm": 4.185582160949707, + "learning_rate": 4.999987537072822e-05, + "loss": 8.0347, + "step": 170 + }, + { + "epoch": 0.0010169854410505281, + "grad_norm": 3.333649158477783, + "learning_rate": 4.999987389146323e-05, + "loss": 8.0545, + "step": 171 + }, + { + "epoch": 0.0010229327243315254, + "grad_norm": 3.7702765464782715, + "learning_rate": 4.999987240347103e-05, + "loss": 7.8936, + "step": 172 + }, + { + "epoch": 0.0010288800076125226, + "grad_norm": 4.113167762756348, + "learning_rate": 4.9999870906751636e-05, + "loss": 7.9447, + "step": 173 + }, + { + "epoch": 0.0010348272908935199, + "grad_norm": 3.370821714401245, + "learning_rate": 4.999986940130505e-05, + "loss": 7.9745, + "step": 174 + }, + { + "epoch": 0.0010407745741745171, + "grad_norm": 3.552391767501831, + "learning_rate": 4.999986788713126e-05, + "loss": 7.8882, + "step": 175 + }, + { + "epoch": 0.0010467218574555144, + "grad_norm": 3.3497536182403564, + "learning_rate": 4.999986636423028e-05, + "loss": 7.8601, + "step": 176 + }, + { + "epoch": 0.0010526691407365116, + "grad_norm": 3.256685733795166, + "learning_rate": 4.9999864832602105e-05, + "loss": 7.8341, + "step": 177 + }, + { + "epoch": 0.001058616424017509, + "grad_norm": 3.028108835220337, + "learning_rate": 4.999986329224674e-05, + "loss": 7.884, + "step": 178 + }, + { + "epoch": 0.0010645637072985061, + "grad_norm": 2.9583778381347656, + "learning_rate": 4.9999861743164165e-05, + "loss": 7.7875, + "step": 179 + }, + { + "epoch": 0.0010705109905795032, + "grad_norm": 3.109215497970581, + "learning_rate": 4.999986018535441e-05, + "loss": 8.4081, + "step": 180 + }, + { + "epoch": 0.0010764582738605004, + "grad_norm": 3.8907759189605713, + "learning_rate": 4.999985861881746e-05, + "loss": 8.0971, + "step": 181 + }, + { + "epoch": 0.0010824055571414977, + "grad_norm": 4.20400857925415, + "learning_rate": 4.9999857043553314e-05, + "loss": 7.9077, + "step": 182 + }, + { + "epoch": 0.001088352840422495, + "grad_norm": 3.580486297607422, + "learning_rate": 4.999985545956198e-05, + "loss": 7.8935, + "step": 183 + }, + { + "epoch": 0.0010943001237034922, + "grad_norm": 3.3833847045898438, + "learning_rate": 4.999985386684345e-05, + "loss": 7.9956, + "step": 184 + }, + { + "epoch": 0.0011002474069844895, + "grad_norm": 2.8848624229431152, + "learning_rate": 4.9999852265397734e-05, + "loss": 8.0718, + "step": 185 + }, + { + "epoch": 0.0011061946902654867, + "grad_norm": 3.8933818340301514, + "learning_rate": 4.999985065522483e-05, + "loss": 8.0517, + "step": 186 + }, + { + "epoch": 0.001112141973546484, + "grad_norm": 3.6559605598449707, + "learning_rate": 4.999984903632473e-05, + "loss": 8.3664, + "step": 187 + }, + { + "epoch": 0.0011180892568274812, + "grad_norm": 3.4633536338806152, + "learning_rate": 4.999984740869744e-05, + "loss": 8.3481, + "step": 188 + }, + { + "epoch": 0.0011240365401084785, + "grad_norm": 3.483020305633545, + "learning_rate": 4.999984577234297e-05, + "loss": 8.3407, + "step": 189 + }, + { + "epoch": 0.0011299838233894757, + "grad_norm": 2.772434711456299, + "learning_rate": 4.999984412726131e-05, + "loss": 8.4524, + "step": 190 + }, + { + "epoch": 0.001135931106670473, + "grad_norm": 3.3341007232666016, + "learning_rate": 4.999984247345246e-05, + "loss": 8.1063, + "step": 191 + }, + { + "epoch": 0.0011418783899514702, + "grad_norm": 3.0063467025756836, + "learning_rate": 4.999984081091642e-05, + "loss": 8.0077, + "step": 192 + }, + { + "epoch": 0.0011478256732324675, + "grad_norm": 2.9670779705047607, + "learning_rate": 4.99998391396532e-05, + "loss": 8.2338, + "step": 193 + }, + { + "epoch": 0.0011537729565134647, + "grad_norm": 3.024505138397217, + "learning_rate": 4.999983745966279e-05, + "loss": 8.1794, + "step": 194 + }, + { + "epoch": 0.0011597202397944618, + "grad_norm": 2.834131956100464, + "learning_rate": 4.9999835770945195e-05, + "loss": 8.2078, + "step": 195 + }, + { + "epoch": 0.001165667523075459, + "grad_norm": 3.555525064468384, + "learning_rate": 4.999983407350042e-05, + "loss": 8.0838, + "step": 196 + }, + { + "epoch": 0.0011716148063564563, + "grad_norm": 3.5013587474823, + "learning_rate": 4.999983236732846e-05, + "loss": 8.092, + "step": 197 + }, + { + "epoch": 0.0011775620896374535, + "grad_norm": 3.3721518516540527, + "learning_rate": 4.9999830652429314e-05, + "loss": 8.1137, + "step": 198 + }, + { + "epoch": 0.0011835093729184508, + "grad_norm": 3.364952564239502, + "learning_rate": 4.9999828928802986e-05, + "loss": 8.1197, + "step": 199 + }, + { + "epoch": 0.001189456656199448, + "grad_norm": 3.691249132156372, + "learning_rate": 4.999982719644948e-05, + "loss": 8.0922, + "step": 200 + }, + { + "epoch": 0.0011954039394804453, + "grad_norm": 6.919185161590576, + "learning_rate": 4.9999825455368785e-05, + "loss": 7.9215, + "step": 201 + }, + { + "epoch": 0.0012013512227614426, + "grad_norm": 3.3332598209381104, + "learning_rate": 4.999982370556091e-05, + "loss": 7.7605, + "step": 202 + }, + { + "epoch": 0.0012072985060424398, + "grad_norm": 2.842517375946045, + "learning_rate": 4.999982194702586e-05, + "loss": 8.0527, + "step": 203 + }, + { + "epoch": 0.001213245789323437, + "grad_norm": 3.086371660232544, + "learning_rate": 4.999982017976364e-05, + "loss": 8.2637, + "step": 204 + }, + { + "epoch": 0.0012191930726044343, + "grad_norm": 3.0870208740234375, + "learning_rate": 4.999981840377422e-05, + "loss": 8.3538, + "step": 205 + }, + { + "epoch": 0.0012251403558854316, + "grad_norm": 3.1244094371795654, + "learning_rate": 4.9999816619057633e-05, + "loss": 8.4604, + "step": 206 + }, + { + "epoch": 0.0012310876391664288, + "grad_norm": 2.7808034420013428, + "learning_rate": 4.999981482561387e-05, + "loss": 8.3227, + "step": 207 + }, + { + "epoch": 0.001237034922447426, + "grad_norm": 2.791182518005371, + "learning_rate": 4.999981302344292e-05, + "loss": 8.1481, + "step": 208 + }, + { + "epoch": 0.0012429822057284233, + "grad_norm": 3.045971632003784, + "learning_rate": 4.99998112125448e-05, + "loss": 7.7842, + "step": 209 + }, + { + "epoch": 0.0012489294890094206, + "grad_norm": 3.2548067569732666, + "learning_rate": 4.99998093929195e-05, + "loss": 7.9935, + "step": 210 + }, + { + "epoch": 0.0012548767722904176, + "grad_norm": 3.5448713302612305, + "learning_rate": 4.999980756456704e-05, + "loss": 8.0323, + "step": 211 + }, + { + "epoch": 0.0012608240555714149, + "grad_norm": 3.717900514602661, + "learning_rate": 4.9999805727487395e-05, + "loss": 8.0532, + "step": 212 + }, + { + "epoch": 0.0012667713388524121, + "grad_norm": 3.2943921089172363, + "learning_rate": 4.9999803881680576e-05, + "loss": 8.0326, + "step": 213 + }, + { + "epoch": 0.0012727186221334094, + "grad_norm": 3.4586269855499268, + "learning_rate": 4.999980202714658e-05, + "loss": 7.8765, + "step": 214 + }, + { + "epoch": 0.0012786659054144067, + "grad_norm": 3.1898810863494873, + "learning_rate": 4.9999800163885414e-05, + "loss": 7.8859, + "step": 215 + }, + { + "epoch": 0.001284613188695404, + "grad_norm": 2.977229595184326, + "learning_rate": 4.9999798291897084e-05, + "loss": 7.8841, + "step": 216 + }, + { + "epoch": 0.0012905604719764012, + "grad_norm": 3.368680000305176, + "learning_rate": 4.999979641118157e-05, + "loss": 7.8055, + "step": 217 + }, + { + "epoch": 0.0012965077552573984, + "grad_norm": 4.295344352722168, + "learning_rate": 4.9999794521738894e-05, + "loss": 7.6456, + "step": 218 + }, + { + "epoch": 0.0013024550385383957, + "grad_norm": 3.985480546951294, + "learning_rate": 4.999979262356904e-05, + "loss": 7.6987, + "step": 219 + }, + { + "epoch": 0.001308402321819393, + "grad_norm": 3.8719842433929443, + "learning_rate": 4.999979071667202e-05, + "loss": 7.6994, + "step": 220 + }, + { + "epoch": 0.0013143496051003902, + "grad_norm": 4.699835300445557, + "learning_rate": 4.999978880104784e-05, + "loss": 8.1815, + "step": 221 + }, + { + "epoch": 0.0013202968883813874, + "grad_norm": 3.9221127033233643, + "learning_rate": 4.9999786876696485e-05, + "loss": 7.8765, + "step": 222 + }, + { + "epoch": 0.0013262441716623847, + "grad_norm": 4.4223504066467285, + "learning_rate": 4.9999784943617964e-05, + "loss": 7.7244, + "step": 223 + }, + { + "epoch": 0.001332191454943382, + "grad_norm": 3.4598348140716553, + "learning_rate": 4.999978300181227e-05, + "loss": 7.7072, + "step": 224 + }, + { + "epoch": 0.0013381387382243792, + "grad_norm": 3.536752223968506, + "learning_rate": 4.999978105127941e-05, + "loss": 7.6337, + "step": 225 + }, + { + "epoch": 0.0013440860215053765, + "grad_norm": 3.6432204246520996, + "learning_rate": 4.99997790920194e-05, + "loss": 7.8078, + "step": 226 + }, + { + "epoch": 0.0013500333047863735, + "grad_norm": 4.8305768966674805, + "learning_rate": 4.999977712403221e-05, + "loss": 7.9003, + "step": 227 + }, + { + "epoch": 0.0013559805880673707, + "grad_norm": 3.773876428604126, + "learning_rate": 4.999977514731786e-05, + "loss": 8.0513, + "step": 228 + }, + { + "epoch": 0.001361927871348368, + "grad_norm": 4.465645790100098, + "learning_rate": 4.999977316187635e-05, + "loss": 7.9847, + "step": 229 + }, + { + "epoch": 0.0013678751546293653, + "grad_norm": 3.9466493129730225, + "learning_rate": 4.9999771167707674e-05, + "loss": 7.9902, + "step": 230 + }, + { + "epoch": 0.0013738224379103625, + "grad_norm": 4.432138919830322, + "learning_rate": 4.9999769164811846e-05, + "loss": 7.8929, + "step": 231 + }, + { + "epoch": 0.0013797697211913598, + "grad_norm": 3.5211949348449707, + "learning_rate": 4.999976715318885e-05, + "loss": 8.1838, + "step": 232 + }, + { + "epoch": 0.001385717004472357, + "grad_norm": 3.0819287300109863, + "learning_rate": 4.9999765132838686e-05, + "loss": 8.2823, + "step": 233 + }, + { + "epoch": 0.0013916642877533543, + "grad_norm": 3.436112880706787, + "learning_rate": 4.9999763103761374e-05, + "loss": 7.7796, + "step": 234 + }, + { + "epoch": 0.0013976115710343515, + "grad_norm": 3.6699061393737793, + "learning_rate": 4.99997610659569e-05, + "loss": 7.5792, + "step": 235 + }, + { + "epoch": 0.0014035588543153488, + "grad_norm": 3.814182758331299, + "learning_rate": 4.999975901942526e-05, + "loss": 7.5631, + "step": 236 + }, + { + "epoch": 0.001409506137596346, + "grad_norm": 3.84110164642334, + "learning_rate": 4.9999756964166465e-05, + "loss": 7.4244, + "step": 237 + }, + { + "epoch": 0.0014154534208773433, + "grad_norm": 3.278045415878296, + "learning_rate": 4.999975490018052e-05, + "loss": 7.9049, + "step": 238 + }, + { + "epoch": 0.0014214007041583405, + "grad_norm": 3.5502712726593018, + "learning_rate": 4.999975282746742e-05, + "loss": 8.0021, + "step": 239 + }, + { + "epoch": 0.0014273479874393378, + "grad_norm": 2.7919108867645264, + "learning_rate": 4.9999750746027153e-05, + "loss": 8.2854, + "step": 240 + }, + { + "epoch": 0.001433295270720335, + "grad_norm": 3.1689581871032715, + "learning_rate": 4.999974865585973e-05, + "loss": 8.3177, + "step": 241 + }, + { + "epoch": 0.001439242554001332, + "grad_norm": 2.728679656982422, + "learning_rate": 4.999974655696517e-05, + "loss": 8.3181, + "step": 242 + }, + { + "epoch": 0.0014451898372823293, + "grad_norm": 3.5175108909606934, + "learning_rate": 4.9999744449343445e-05, + "loss": 8.03, + "step": 243 + }, + { + "epoch": 0.0014511371205633266, + "grad_norm": 3.714219808578491, + "learning_rate": 4.999974233299457e-05, + "loss": 8.0824, + "step": 244 + }, + { + "epoch": 0.0014570844038443239, + "grad_norm": 3.42090106010437, + "learning_rate": 4.9999740207918546e-05, + "loss": 8.0455, + "step": 245 + }, + { + "epoch": 0.001463031687125321, + "grad_norm": 3.035047769546509, + "learning_rate": 4.999973807411537e-05, + "loss": 8.0117, + "step": 246 + }, + { + "epoch": 0.0014689789704063184, + "grad_norm": 3.4878122806549072, + "learning_rate": 4.9999735931585034e-05, + "loss": 8.1368, + "step": 247 + }, + { + "epoch": 0.0014749262536873156, + "grad_norm": 3.648115873336792, + "learning_rate": 4.999973378032756e-05, + "loss": 7.9987, + "step": 248 + }, + { + "epoch": 0.0014808735369683129, + "grad_norm": 3.171255588531494, + "learning_rate": 4.9999731620342936e-05, + "loss": 7.9733, + "step": 249 + }, + { + "epoch": 0.0014868208202493101, + "grad_norm": 3.157804250717163, + "learning_rate": 4.999972945163116e-05, + "loss": 7.8511, + "step": 250 + }, + { + "epoch": 0.0014927681035303074, + "grad_norm": 3.4346978664398193, + "learning_rate": 4.999972727419224e-05, + "loss": 7.9075, + "step": 251 + }, + { + "epoch": 0.0014987153868113046, + "grad_norm": 3.281135082244873, + "learning_rate": 4.9999725088026175e-05, + "loss": 7.876, + "step": 252 + }, + { + "epoch": 0.0015046626700923019, + "grad_norm": 3.1481714248657227, + "learning_rate": 4.9999722893132954e-05, + "loss": 8.1458, + "step": 253 + }, + { + "epoch": 0.0015106099533732991, + "grad_norm": 2.821460247039795, + "learning_rate": 4.99997206895126e-05, + "loss": 7.9141, + "step": 254 + }, + { + "epoch": 0.0015165572366542964, + "grad_norm": 2.887997627258301, + "learning_rate": 4.999971847716509e-05, + "loss": 8.2246, + "step": 255 + }, + { + "epoch": 0.0015225045199352936, + "grad_norm": 2.8097078800201416, + "learning_rate": 4.999971625609044e-05, + "loss": 7.8576, + "step": 256 + }, + { + "epoch": 0.001528451803216291, + "grad_norm": 2.9272890090942383, + "learning_rate": 4.999971402628866e-05, + "loss": 7.6856, + "step": 257 + }, + { + "epoch": 0.001534399086497288, + "grad_norm": 3.487027168273926, + "learning_rate": 4.999971178775973e-05, + "loss": 7.8179, + "step": 258 + }, + { + "epoch": 0.0015403463697782852, + "grad_norm": 3.575681209564209, + "learning_rate": 4.9999709540503656e-05, + "loss": 7.8115, + "step": 259 + }, + { + "epoch": 0.0015462936530592824, + "grad_norm": 3.457756757736206, + "learning_rate": 4.9999707284520435e-05, + "loss": 7.7985, + "step": 260 + }, + { + "epoch": 0.0015522409363402797, + "grad_norm": 3.732728958129883, + "learning_rate": 4.999970501981009e-05, + "loss": 7.8369, + "step": 261 + }, + { + "epoch": 0.001558188219621277, + "grad_norm": 4.1466898918151855, + "learning_rate": 4.99997027463726e-05, + "loss": 8.2435, + "step": 262 + }, + { + "epoch": 0.0015641355029022742, + "grad_norm": 4.028534889221191, + "learning_rate": 4.9999700464207965e-05, + "loss": 8.2338, + "step": 263 + }, + { + "epoch": 0.0015700827861832715, + "grad_norm": 3.7445273399353027, + "learning_rate": 4.99996981733162e-05, + "loss": 8.1182, + "step": 264 + }, + { + "epoch": 0.0015760300694642687, + "grad_norm": 3.455228567123413, + "learning_rate": 4.99996958736973e-05, + "loss": 8.1932, + "step": 265 + }, + { + "epoch": 0.001581977352745266, + "grad_norm": 3.1530332565307617, + "learning_rate": 4.9999693565351256e-05, + "loss": 7.8304, + "step": 266 + }, + { + "epoch": 0.0015879246360262632, + "grad_norm": 3.113161325454712, + "learning_rate": 4.999969124827809e-05, + "loss": 7.6625, + "step": 267 + }, + { + "epoch": 0.0015938719193072605, + "grad_norm": 3.621076822280884, + "learning_rate": 4.999968892247778e-05, + "loss": 8.0983, + "step": 268 + }, + { + "epoch": 0.0015998192025882577, + "grad_norm": 3.533395767211914, + "learning_rate": 4.9999686587950346e-05, + "loss": 7.9564, + "step": 269 + }, + { + "epoch": 0.001605766485869255, + "grad_norm": 3.6486849784851074, + "learning_rate": 4.999968424469577e-05, + "loss": 7.9864, + "step": 270 + }, + { + "epoch": 0.0016117137691502522, + "grad_norm": 3.223167657852173, + "learning_rate": 4.999968189271407e-05, + "loss": 7.8516, + "step": 271 + }, + { + "epoch": 0.0016176610524312495, + "grad_norm": 3.282062530517578, + "learning_rate": 4.999967953200523e-05, + "loss": 7.9247, + "step": 272 + }, + { + "epoch": 0.0016236083357122465, + "grad_norm": 2.8589930534362793, + "learning_rate": 4.999967716256927e-05, + "loss": 7.8871, + "step": 273 + }, + { + "epoch": 0.0016295556189932438, + "grad_norm": 3.136882781982422, + "learning_rate": 4.9999674784406174e-05, + "loss": 7.8793, + "step": 274 + }, + { + "epoch": 0.001635502902274241, + "grad_norm": 3.9103915691375732, + "learning_rate": 4.999967239751595e-05, + "loss": 7.9005, + "step": 275 + }, + { + "epoch": 0.0016414501855552383, + "grad_norm": 4.40267276763916, + "learning_rate": 4.99996700018986e-05, + "loss": 7.9247, + "step": 276 + }, + { + "epoch": 0.0016473974688362356, + "grad_norm": 3.6620242595672607, + "learning_rate": 4.9999667597554136e-05, + "loss": 8.0719, + "step": 277 + }, + { + "epoch": 0.0016533447521172328, + "grad_norm": 3.1278858184814453, + "learning_rate": 4.999966518448253e-05, + "loss": 8.0822, + "step": 278 + }, + { + "epoch": 0.00165929203539823, + "grad_norm": 3.321831464767456, + "learning_rate": 4.9999662762683805e-05, + "loss": 8.1266, + "step": 279 + }, + { + "epoch": 0.0016652393186792273, + "grad_norm": 3.4116811752319336, + "learning_rate": 4.999966033215795e-05, + "loss": 8.2159, + "step": 280 + }, + { + "epoch": 0.0016711866019602246, + "grad_norm": 3.58381724357605, + "learning_rate": 4.999965789290498e-05, + "loss": 8.0275, + "step": 281 + }, + { + "epoch": 0.0016771338852412218, + "grad_norm": 3.0357518196105957, + "learning_rate": 4.9999655444924884e-05, + "loss": 8.1171, + "step": 282 + }, + { + "epoch": 0.001683081168522219, + "grad_norm": 3.237764596939087, + "learning_rate": 4.999965298821767e-05, + "loss": 7.822, + "step": 283 + }, + { + "epoch": 0.0016890284518032163, + "grad_norm": 3.0861873626708984, + "learning_rate": 4.999965052278334e-05, + "loss": 7.7991, + "step": 284 + }, + { + "epoch": 0.0016949757350842136, + "grad_norm": 2.8045542240142822, + "learning_rate": 4.999964804862187e-05, + "loss": 7.9659, + "step": 285 + }, + { + "epoch": 0.0017009230183652108, + "grad_norm": 3.1282641887664795, + "learning_rate": 4.9999645565733297e-05, + "loss": 7.8354, + "step": 286 + }, + { + "epoch": 0.001706870301646208, + "grad_norm": 2.980001211166382, + "learning_rate": 4.999964307411761e-05, + "loss": 7.806, + "step": 287 + }, + { + "epoch": 0.0017128175849272054, + "grad_norm": 3.114238977432251, + "learning_rate": 4.99996405737748e-05, + "loss": 7.6173, + "step": 288 + }, + { + "epoch": 0.0017187648682082024, + "grad_norm": 2.6732640266418457, + "learning_rate": 4.9999638064704866e-05, + "loss": 7.5944, + "step": 289 + }, + { + "epoch": 0.0017247121514891996, + "grad_norm": 3.2139906883239746, + "learning_rate": 4.999963554690783e-05, + "loss": 7.5738, + "step": 290 + }, + { + "epoch": 0.001730659434770197, + "grad_norm": 3.0964555740356445, + "learning_rate": 4.999963302038368e-05, + "loss": 7.4431, + "step": 291 + }, + { + "epoch": 0.0017366067180511942, + "grad_norm": 3.0611374378204346, + "learning_rate": 4.99996304851324e-05, + "loss": 7.3748, + "step": 292 + }, + { + "epoch": 0.0017425540013321914, + "grad_norm": 2.88114333152771, + "learning_rate": 4.999962794115402e-05, + "loss": 7.3554, + "step": 293 + }, + { + "epoch": 0.0017485012846131887, + "grad_norm": 2.895141363143921, + "learning_rate": 4.999962538844852e-05, + "loss": 7.2801, + "step": 294 + }, + { + "epoch": 0.001754448567894186, + "grad_norm": 3.0645008087158203, + "learning_rate": 4.9999622827015914e-05, + "loss": 7.1753, + "step": 295 + }, + { + "epoch": 0.0017603958511751832, + "grad_norm": 3.0750465393066406, + "learning_rate": 4.99996202568562e-05, + "loss": 7.1905, + "step": 296 + }, + { + "epoch": 0.0017663431344561804, + "grad_norm": 3.1322436332702637, + "learning_rate": 4.9999617677969374e-05, + "loss": 7.0851, + "step": 297 + }, + { + "epoch": 0.0017722904177371777, + "grad_norm": 3.8287153244018555, + "learning_rate": 4.999961509035544e-05, + "loss": 7.0842, + "step": 298 + }, + { + "epoch": 0.001778237701018175, + "grad_norm": 2.874312162399292, + "learning_rate": 4.9999612494014403e-05, + "loss": 6.9588, + "step": 299 + }, + { + "epoch": 0.0017841849842991722, + "grad_norm": 2.916250705718994, + "learning_rate": 4.999960988894625e-05, + "loss": 7.1342, + "step": 300 + }, + { + "epoch": 0.0017901322675801694, + "grad_norm": 2.71624755859375, + "learning_rate": 4.9999607275151e-05, + "loss": 7.0418, + "step": 301 + }, + { + "epoch": 0.0017960795508611667, + "grad_norm": 2.655630350112915, + "learning_rate": 4.999960465262864e-05, + "loss": 6.937, + "step": 302 + }, + { + "epoch": 0.001802026834142164, + "grad_norm": 2.8819122314453125, + "learning_rate": 4.999960202137918e-05, + "loss": 7.0116, + "step": 303 + }, + { + "epoch": 0.0018079741174231612, + "grad_norm": 2.909701108932495, + "learning_rate": 4.999959938140262e-05, + "loss": 6.9588, + "step": 304 + }, + { + "epoch": 0.0018139214007041582, + "grad_norm": 3.276395797729492, + "learning_rate": 4.999959673269895e-05, + "loss": 6.9066, + "step": 305 + }, + { + "epoch": 0.0018198686839851555, + "grad_norm": 2.8774867057800293, + "learning_rate": 4.9999594075268186e-05, + "loss": 7.0112, + "step": 306 + }, + { + "epoch": 0.0018258159672661528, + "grad_norm": 2.9667818546295166, + "learning_rate": 4.999959140911032e-05, + "loss": 7.1467, + "step": 307 + }, + { + "epoch": 0.00183176325054715, + "grad_norm": 6.6612958908081055, + "learning_rate": 4.999958873422536e-05, + "loss": 8.4457, + "step": 308 + }, + { + "epoch": 0.0018377105338281473, + "grad_norm": 4.234557628631592, + "learning_rate": 4.999958605061329e-05, + "loss": 8.904, + "step": 309 + }, + { + "epoch": 0.0018436578171091445, + "grad_norm": 4.049502372741699, + "learning_rate": 4.999958335827413e-05, + "loss": 7.5174, + "step": 310 + }, + { + "epoch": 0.0018496051003901418, + "grad_norm": 3.574474334716797, + "learning_rate": 4.999958065720787e-05, + "loss": 8.6537, + "step": 311 + }, + { + "epoch": 0.001855552383671139, + "grad_norm": 3.6154026985168457, + "learning_rate": 4.9999577947414515e-05, + "loss": 8.5833, + "step": 312 + }, + { + "epoch": 0.0018614996669521363, + "grad_norm": 2.9204158782958984, + "learning_rate": 4.999957522889407e-05, + "loss": 8.5486, + "step": 313 + }, + { + "epoch": 0.0018674469502331335, + "grad_norm": 3.095310688018799, + "learning_rate": 4.999957250164653e-05, + "loss": 8.3855, + "step": 314 + }, + { + "epoch": 0.0018733942335141308, + "grad_norm": 3.872267723083496, + "learning_rate": 4.999956976567189e-05, + "loss": 8.2715, + "step": 315 + }, + { + "epoch": 0.001879341516795128, + "grad_norm": 3.5560686588287354, + "learning_rate": 4.9999567020970175e-05, + "loss": 8.1571, + "step": 316 + }, + { + "epoch": 0.0018852888000761253, + "grad_norm": 2.6759164333343506, + "learning_rate": 4.9999564267541356e-05, + "loss": 8.4072, + "step": 317 + }, + { + "epoch": 0.0018912360833571226, + "grad_norm": 4.034712791442871, + "learning_rate": 4.999956150538545e-05, + "loss": 7.7622, + "step": 318 + }, + { + "epoch": 0.0018971833666381198, + "grad_norm": 3.8927831649780273, + "learning_rate": 4.999955873450246e-05, + "loss": 7.5012, + "step": 319 + }, + { + "epoch": 0.0019031306499191168, + "grad_norm": 3.4422812461853027, + "learning_rate": 4.999955595489237e-05, + "loss": 7.6894, + "step": 320 + }, + { + "epoch": 0.001909077933200114, + "grad_norm": 3.0367283821105957, + "learning_rate": 4.999955316655521e-05, + "loss": 7.8151, + "step": 321 + }, + { + "epoch": 0.0019150252164811114, + "grad_norm": 3.7553489208221436, + "learning_rate": 4.9999550369490955e-05, + "loss": 8.0462, + "step": 322 + }, + { + "epoch": 0.0019209724997621086, + "grad_norm": 3.432591438293457, + "learning_rate": 4.999954756369962e-05, + "loss": 7.8782, + "step": 323 + }, + { + "epoch": 0.0019269197830431059, + "grad_norm": 2.7325966358184814, + "learning_rate": 4.9999544749181196e-05, + "loss": 7.9045, + "step": 324 + }, + { + "epoch": 0.0019328670663241031, + "grad_norm": 4.31963586807251, + "learning_rate": 4.9999541925935686e-05, + "loss": 7.7791, + "step": 325 + }, + { + "epoch": 0.0019388143496051004, + "grad_norm": 2.840189218521118, + "learning_rate": 4.999953909396311e-05, + "loss": 7.8334, + "step": 326 + }, + { + "epoch": 0.0019447616328860976, + "grad_norm": 3.2388041019439697, + "learning_rate": 4.9999536253263434e-05, + "loss": 7.6756, + "step": 327 + }, + { + "epoch": 0.0019507089161670949, + "grad_norm": 3.6291563510894775, + "learning_rate": 4.999953340383669e-05, + "loss": 7.6511, + "step": 328 + }, + { + "epoch": 0.001956656199448092, + "grad_norm": 3.35703706741333, + "learning_rate": 4.999953054568287e-05, + "loss": 7.6382, + "step": 329 + }, + { + "epoch": 0.001962603482729089, + "grad_norm": 3.117281198501587, + "learning_rate": 4.999952767880196e-05, + "loss": 7.6233, + "step": 330 + }, + { + "epoch": 0.0019685507660100864, + "grad_norm": 2.8385257720947266, + "learning_rate": 4.999952480319398e-05, + "loss": 7.6594, + "step": 331 + }, + { + "epoch": 0.0019744980492910837, + "grad_norm": 2.5914418697357178, + "learning_rate": 4.999952191885893e-05, + "loss": 8.2647, + "step": 332 + }, + { + "epoch": 0.001980445332572081, + "grad_norm": 2.5847742557525635, + "learning_rate": 4.9999519025796795e-05, + "loss": 8.339, + "step": 333 + }, + { + "epoch": 0.001986392615853078, + "grad_norm": 2.7022132873535156, + "learning_rate": 4.999951612400759e-05, + "loss": 7.9114, + "step": 334 + }, + { + "epoch": 0.0019923398991340754, + "grad_norm": 3.0290884971618652, + "learning_rate": 4.999951321349131e-05, + "loss": 7.4531, + "step": 335 + }, + { + "epoch": 0.0019982871824150727, + "grad_norm": 2.8910324573516846, + "learning_rate": 4.999951029424796e-05, + "loss": 7.398, + "step": 336 + }, + { + "epoch": 0.00200423446569607, + "grad_norm": 2.8917605876922607, + "learning_rate": 4.9999507366277545e-05, + "loss": 7.48, + "step": 337 + }, + { + "epoch": 0.002010181748977067, + "grad_norm": 2.8957982063293457, + "learning_rate": 4.999950442958005e-05, + "loss": 7.8662, + "step": 338 + }, + { + "epoch": 0.0020161290322580645, + "grad_norm": 3.562232255935669, + "learning_rate": 4.9999501484155485e-05, + "loss": 7.8388, + "step": 339 + }, + { + "epoch": 0.0020220763155390617, + "grad_norm": 2.51676607131958, + "learning_rate": 4.9999498530003866e-05, + "loss": 8.2834, + "step": 340 + }, + { + "epoch": 0.002028023598820059, + "grad_norm": 2.326110363006592, + "learning_rate": 4.999949556712517e-05, + "loss": 8.2528, + "step": 341 + }, + { + "epoch": 0.0020339708821010562, + "grad_norm": 2.7621335983276367, + "learning_rate": 4.999949259551941e-05, + "loss": 7.9791, + "step": 342 + }, + { + "epoch": 0.0020399181653820535, + "grad_norm": 3.045431137084961, + "learning_rate": 4.999948961518659e-05, + "loss": 7.8575, + "step": 343 + }, + { + "epoch": 0.0020458654486630507, + "grad_norm": 3.1940131187438965, + "learning_rate": 4.9999486626126703e-05, + "loss": 7.8581, + "step": 344 + }, + { + "epoch": 0.002051812731944048, + "grad_norm": 2.964136838912964, + "learning_rate": 4.999948362833975e-05, + "loss": 7.9656, + "step": 345 + }, + { + "epoch": 0.0020577600152250452, + "grad_norm": 3.167573928833008, + "learning_rate": 4.999948062182574e-05, + "loss": 7.7448, + "step": 346 + }, + { + "epoch": 0.0020637072985060425, + "grad_norm": 3.062666177749634, + "learning_rate": 4.9999477606584666e-05, + "loss": 7.7655, + "step": 347 + }, + { + "epoch": 0.0020696545817870397, + "grad_norm": 3.1097402572631836, + "learning_rate": 4.999947458261653e-05, + "loss": 7.643, + "step": 348 + }, + { + "epoch": 0.002075601865068037, + "grad_norm": 3.1663928031921387, + "learning_rate": 4.999947154992135e-05, + "loss": 7.8348, + "step": 349 + }, + { + "epoch": 0.0020815491483490343, + "grad_norm": 2.8295886516571045, + "learning_rate": 4.99994685084991e-05, + "loss": 7.7752, + "step": 350 + }, + { + "epoch": 0.0020874964316300315, + "grad_norm": 2.7384233474731445, + "learning_rate": 4.99994654583498e-05, + "loss": 7.7644, + "step": 351 + }, + { + "epoch": 0.0020934437149110288, + "grad_norm": 2.6654486656188965, + "learning_rate": 4.999946239947344e-05, + "loss": 7.7489, + "step": 352 + }, + { + "epoch": 0.002099390998192026, + "grad_norm": 2.8949942588806152, + "learning_rate": 4.999945933187003e-05, + "loss": 7.7105, + "step": 353 + }, + { + "epoch": 0.0021053382814730233, + "grad_norm": 2.590036630630493, + "learning_rate": 4.999945625553957e-05, + "loss": 7.6821, + "step": 354 + }, + { + "epoch": 0.0021112855647540205, + "grad_norm": 3.4601457118988037, + "learning_rate": 4.999945317048205e-05, + "loss": 7.3552, + "step": 355 + }, + { + "epoch": 0.002117232848035018, + "grad_norm": 4.022705078125, + "learning_rate": 4.999945007669748e-05, + "loss": 7.0281, + "step": 356 + }, + { + "epoch": 0.002123180131316015, + "grad_norm": 3.249699592590332, + "learning_rate": 4.999944697418587e-05, + "loss": 7.9279, + "step": 357 + }, + { + "epoch": 0.0021291274145970123, + "grad_norm": 2.8424601554870605, + "learning_rate": 4.99994438629472e-05, + "loss": 8.1485, + "step": 358 + }, + { + "epoch": 0.002135074697878009, + "grad_norm": 3.0473172664642334, + "learning_rate": 4.9999440742981486e-05, + "loss": 8.0877, + "step": 359 + }, + { + "epoch": 0.0021410219811590064, + "grad_norm": 3.0614171028137207, + "learning_rate": 4.9999437614288726e-05, + "loss": 7.7817, + "step": 360 + }, + { + "epoch": 0.0021469692644400036, + "grad_norm": 3.309464931488037, + "learning_rate": 4.9999434476868925e-05, + "loss": 7.857, + "step": 361 + }, + { + "epoch": 0.002152916547721001, + "grad_norm": 3.031921148300171, + "learning_rate": 4.999943133072207e-05, + "loss": 7.6393, + "step": 362 + }, + { + "epoch": 0.002158863831001998, + "grad_norm": 3.3756978511810303, + "learning_rate": 4.999942817584818e-05, + "loss": 7.7422, + "step": 363 + }, + { + "epoch": 0.0021648111142829954, + "grad_norm": 3.53362774848938, + "learning_rate": 4.999942501224724e-05, + "loss": 7.9388, + "step": 364 + }, + { + "epoch": 0.0021707583975639926, + "grad_norm": 3.4082882404327393, + "learning_rate": 4.999942183991927e-05, + "loss": 7.3578, + "step": 365 + }, + { + "epoch": 0.00217670568084499, + "grad_norm": 4.035211086273193, + "learning_rate": 4.999941865886425e-05, + "loss": 7.7833, + "step": 366 + }, + { + "epoch": 0.002182652964125987, + "grad_norm": 3.0394630432128906, + "learning_rate": 4.99994154690822e-05, + "loss": 7.9392, + "step": 367 + }, + { + "epoch": 0.0021886002474069844, + "grad_norm": 3.088926076889038, + "learning_rate": 4.99994122705731e-05, + "loss": 7.8149, + "step": 368 + }, + { + "epoch": 0.0021945475306879817, + "grad_norm": 2.3173277378082275, + "learning_rate": 4.9999409063336976e-05, + "loss": 8.2211, + "step": 369 + }, + { + "epoch": 0.002200494813968979, + "grad_norm": 2.9960854053497314, + "learning_rate": 4.9999405847373815e-05, + "loss": 7.6764, + "step": 370 + }, + { + "epoch": 0.002206442097249976, + "grad_norm": 2.841848134994507, + "learning_rate": 4.999940262268361e-05, + "loss": 7.9418, + "step": 371 + }, + { + "epoch": 0.0022123893805309734, + "grad_norm": 3.748779058456421, + "learning_rate": 4.999939938926638e-05, + "loss": 7.7843, + "step": 372 + }, + { + "epoch": 0.0022183366638119707, + "grad_norm": 2.8345019817352295, + "learning_rate": 4.999939614712212e-05, + "loss": 7.592, + "step": 373 + }, + { + "epoch": 0.002224283947092968, + "grad_norm": 3.12503719329834, + "learning_rate": 4.9999392896250826e-05, + "loss": 7.9543, + "step": 374 + }, + { + "epoch": 0.002230231230373965, + "grad_norm": 2.7812912464141846, + "learning_rate": 4.99993896366525e-05, + "loss": 7.8738, + "step": 375 + }, + { + "epoch": 0.0022361785136549624, + "grad_norm": 2.9477410316467285, + "learning_rate": 4.9999386368327144e-05, + "loss": 7.7738, + "step": 376 + }, + { + "epoch": 0.0022421257969359597, + "grad_norm": 2.305204391479492, + "learning_rate": 4.999938309127477e-05, + "loss": 7.9123, + "step": 377 + }, + { + "epoch": 0.002248073080216957, + "grad_norm": 3.3839781284332275, + "learning_rate": 4.999937980549536e-05, + "loss": 7.8542, + "step": 378 + }, + { + "epoch": 0.002254020363497954, + "grad_norm": 3.6973462104797363, + "learning_rate": 4.9999376510988924e-05, + "loss": 7.6953, + "step": 379 + }, + { + "epoch": 0.0022599676467789515, + "grad_norm": 3.8176333904266357, + "learning_rate": 4.999937320775547e-05, + "loss": 7.6548, + "step": 380 + }, + { + "epoch": 0.0022659149300599487, + "grad_norm": 3.0237386226654053, + "learning_rate": 4.999936989579499e-05, + "loss": 7.7843, + "step": 381 + }, + { + "epoch": 0.002271862213340946, + "grad_norm": 2.699695348739624, + "learning_rate": 4.999936657510749e-05, + "loss": 7.8841, + "step": 382 + }, + { + "epoch": 0.0022778094966219432, + "grad_norm": 3.7468206882476807, + "learning_rate": 4.9999363245692965e-05, + "loss": 7.8069, + "step": 383 + }, + { + "epoch": 0.0022837567799029405, + "grad_norm": 3.1074821949005127, + "learning_rate": 4.999935990755142e-05, + "loss": 7.8392, + "step": 384 + }, + { + "epoch": 0.0022897040631839377, + "grad_norm": 2.420884609222412, + "learning_rate": 4.999935656068287e-05, + "loss": 7.9238, + "step": 385 + }, + { + "epoch": 0.002295651346464935, + "grad_norm": 3.1354825496673584, + "learning_rate": 4.9999353205087296e-05, + "loss": 7.9766, + "step": 386 + }, + { + "epoch": 0.0023015986297459322, + "grad_norm": 2.7911901473999023, + "learning_rate": 4.9999349840764695e-05, + "loss": 7.9118, + "step": 387 + }, + { + "epoch": 0.0023075459130269295, + "grad_norm": 2.59529447555542, + "learning_rate": 4.999934646771509e-05, + "loss": 7.8839, + "step": 388 + }, + { + "epoch": 0.0023134931963079267, + "grad_norm": 4.121276378631592, + "learning_rate": 4.999934308593848e-05, + "loss": 7.8406, + "step": 389 + }, + { + "epoch": 0.0023194404795889236, + "grad_norm": 2.9091265201568604, + "learning_rate": 4.999933969543485e-05, + "loss": 7.86, + "step": 390 + }, + { + "epoch": 0.002325387762869921, + "grad_norm": 3.0700483322143555, + "learning_rate": 4.9999336296204195e-05, + "loss": 7.8214, + "step": 391 + }, + { + "epoch": 0.002331335046150918, + "grad_norm": 3.3008790016174316, + "learning_rate": 4.999933288824654e-05, + "loss": 7.5863, + "step": 392 + }, + { + "epoch": 0.0023372823294319153, + "grad_norm": 3.1414108276367188, + "learning_rate": 4.999932947156188e-05, + "loss": 7.5815, + "step": 393 + }, + { + "epoch": 0.0023432296127129126, + "grad_norm": 2.6881701946258545, + "learning_rate": 4.999932604615021e-05, + "loss": 7.959, + "step": 394 + }, + { + "epoch": 0.00234917689599391, + "grad_norm": 2.45609712600708, + "learning_rate": 4.9999322612011534e-05, + "loss": 7.9668, + "step": 395 + }, + { + "epoch": 0.002355124179274907, + "grad_norm": 3.1126747131347656, + "learning_rate": 4.999931916914585e-05, + "loss": 7.774, + "step": 396 + }, + { + "epoch": 0.0023610714625559043, + "grad_norm": 2.806708574295044, + "learning_rate": 4.999931571755316e-05, + "loss": 7.6297, + "step": 397 + }, + { + "epoch": 0.0023670187458369016, + "grad_norm": 3.220013380050659, + "learning_rate": 4.999931225723348e-05, + "loss": 7.3856, + "step": 398 + }, + { + "epoch": 0.002372966029117899, + "grad_norm": 3.0159943103790283, + "learning_rate": 4.9999308788186786e-05, + "loss": 7.3822, + "step": 399 + }, + { + "epoch": 0.002378913312398896, + "grad_norm": 3.1066205501556396, + "learning_rate": 4.9999305310413094e-05, + "loss": 7.3905, + "step": 400 + }, + { + "epoch": 0.0023848605956798934, + "grad_norm": 2.8004367351531982, + "learning_rate": 4.99993018239124e-05, + "loss": 7.8548, + "step": 401 + }, + { + "epoch": 0.0023908078789608906, + "grad_norm": 3.004378318786621, + "learning_rate": 4.999929832868471e-05, + "loss": 7.7846, + "step": 402 + }, + { + "epoch": 0.002396755162241888, + "grad_norm": 3.42901349067688, + "learning_rate": 4.9999294824730025e-05, + "loss": 7.9188, + "step": 403 + }, + { + "epoch": 0.002402702445522885, + "grad_norm": 3.7258527278900146, + "learning_rate": 4.9999291312048343e-05, + "loss": 7.7302, + "step": 404 + }, + { + "epoch": 0.0024086497288038824, + "grad_norm": 4.215145111083984, + "learning_rate": 4.999928779063967e-05, + "loss": 7.6597, + "step": 405 + }, + { + "epoch": 0.0024145970120848796, + "grad_norm": 3.157273769378662, + "learning_rate": 4.9999284260504004e-05, + "loss": 7.7262, + "step": 406 + }, + { + "epoch": 0.002420544295365877, + "grad_norm": 2.9977381229400635, + "learning_rate": 4.999928072164135e-05, + "loss": 7.72, + "step": 407 + }, + { + "epoch": 0.002426491578646874, + "grad_norm": 2.791682720184326, + "learning_rate": 4.9999277174051696e-05, + "loss": 7.8022, + "step": 408 + }, + { + "epoch": 0.0024324388619278714, + "grad_norm": 3.4143035411834717, + "learning_rate": 4.999927361773506e-05, + "loss": 7.5116, + "step": 409 + }, + { + "epoch": 0.0024383861452088687, + "grad_norm": 3.3458821773529053, + "learning_rate": 4.9999270052691425e-05, + "loss": 7.4337, + "step": 410 + }, + { + "epoch": 0.002444333428489866, + "grad_norm": 3.3339595794677734, + "learning_rate": 4.999926647892081e-05, + "loss": 7.7345, + "step": 411 + }, + { + "epoch": 0.002450280711770863, + "grad_norm": 4.285780429840088, + "learning_rate": 4.999926289642321e-05, + "loss": 7.9388, + "step": 412 + }, + { + "epoch": 0.0024562279950518604, + "grad_norm": 3.9473414421081543, + "learning_rate": 4.9999259305198624e-05, + "loss": 7.6038, + "step": 413 + }, + { + "epoch": 0.0024621752783328577, + "grad_norm": 3.504227638244629, + "learning_rate": 4.999925570524706e-05, + "loss": 7.4818, + "step": 414 + }, + { + "epoch": 0.002468122561613855, + "grad_norm": 3.2182157039642334, + "learning_rate": 4.999925209656851e-05, + "loss": 7.3493, + "step": 415 + }, + { + "epoch": 0.002474069844894852, + "grad_norm": 3.1944262981414795, + "learning_rate": 4.999924847916297e-05, + "loss": 7.3646, + "step": 416 + }, + { + "epoch": 0.0024800171281758494, + "grad_norm": 2.957244634628296, + "learning_rate": 4.999924485303047e-05, + "loss": 7.4403, + "step": 417 + }, + { + "epoch": 0.0024859644114568467, + "grad_norm": 2.971285343170166, + "learning_rate": 4.999924121817098e-05, + "loss": 7.7266, + "step": 418 + }, + { + "epoch": 0.002491911694737844, + "grad_norm": 4.029009819030762, + "learning_rate": 4.999923757458451e-05, + "loss": 7.3919, + "step": 419 + }, + { + "epoch": 0.002497858978018841, + "grad_norm": 3.9034767150878906, + "learning_rate": 4.999923392227107e-05, + "loss": 7.2349, + "step": 420 + }, + { + "epoch": 0.002503806261299838, + "grad_norm": 3.23218035697937, + "learning_rate": 4.9999230261230656e-05, + "loss": 7.5146, + "step": 421 + }, + { + "epoch": 0.0025097535445808353, + "grad_norm": 3.193225622177124, + "learning_rate": 4.9999226591463265e-05, + "loss": 7.1699, + "step": 422 + }, + { + "epoch": 0.0025157008278618325, + "grad_norm": 2.9796435832977295, + "learning_rate": 4.999922291296891e-05, + "loss": 7.5719, + "step": 423 + }, + { + "epoch": 0.0025216481111428298, + "grad_norm": 2.6746885776519775, + "learning_rate": 4.999921922574758e-05, + "loss": 7.8086, + "step": 424 + }, + { + "epoch": 0.002527595394423827, + "grad_norm": 3.0622920989990234, + "learning_rate": 4.999921552979928e-05, + "loss": 7.3233, + "step": 425 + }, + { + "epoch": 0.0025335426777048243, + "grad_norm": 3.0908501148223877, + "learning_rate": 4.999921182512402e-05, + "loss": 7.2582, + "step": 426 + }, + { + "epoch": 0.0025394899609858215, + "grad_norm": 2.6913537979125977, + "learning_rate": 4.999920811172178e-05, + "loss": 7.6643, + "step": 427 + }, + { + "epoch": 0.002545437244266819, + "grad_norm": 2.7793848514556885, + "learning_rate": 4.999920438959258e-05, + "loss": 7.9445, + "step": 428 + }, + { + "epoch": 0.002551384527547816, + "grad_norm": 2.741617202758789, + "learning_rate": 4.999920065873642e-05, + "loss": 8.0755, + "step": 429 + }, + { + "epoch": 0.0025573318108288133, + "grad_norm": 2.7102227210998535, + "learning_rate": 4.999919691915329e-05, + "loss": 7.8908, + "step": 430 + }, + { + "epoch": 0.0025632790941098106, + "grad_norm": 2.687788248062134, + "learning_rate": 4.9999193170843206e-05, + "loss": 7.9025, + "step": 431 + }, + { + "epoch": 0.002569226377390808, + "grad_norm": 2.923664093017578, + "learning_rate": 4.999918941380616e-05, + "loss": 7.9331, + "step": 432 + }, + { + "epoch": 0.002575173660671805, + "grad_norm": 2.934735059738159, + "learning_rate": 4.999918564804215e-05, + "loss": 7.722, + "step": 433 + }, + { + "epoch": 0.0025811209439528023, + "grad_norm": 3.8156228065490723, + "learning_rate": 4.999918187355119e-05, + "loss": 7.9392, + "step": 434 + }, + { + "epoch": 0.0025870682272337996, + "grad_norm": 2.333798408508301, + "learning_rate": 4.999917809033327e-05, + "loss": 7.9093, + "step": 435 + }, + { + "epoch": 0.002593015510514797, + "grad_norm": 2.078932046890259, + "learning_rate": 4.99991742983884e-05, + "loss": 7.8484, + "step": 436 + }, + { + "epoch": 0.002598962793795794, + "grad_norm": 2.433375835418701, + "learning_rate": 4.999917049771657e-05, + "loss": 7.9124, + "step": 437 + }, + { + "epoch": 0.0026049100770767913, + "grad_norm": 3.1881024837493896, + "learning_rate": 4.999916668831779e-05, + "loss": 7.3966, + "step": 438 + }, + { + "epoch": 0.0026108573603577886, + "grad_norm": 2.4724855422973633, + "learning_rate": 4.9999162870192065e-05, + "loss": 7.535, + "step": 439 + }, + { + "epoch": 0.002616804643638786, + "grad_norm": 2.8757777214050293, + "learning_rate": 4.999915904333938e-05, + "loss": 7.6728, + "step": 440 + }, + { + "epoch": 0.002622751926919783, + "grad_norm": 3.5439565181732178, + "learning_rate": 4.999915520775975e-05, + "loss": 7.5308, + "step": 441 + }, + { + "epoch": 0.0026286992102007804, + "grad_norm": 2.8345577716827393, + "learning_rate": 4.999915136345318e-05, + "loss": 7.7083, + "step": 442 + }, + { + "epoch": 0.0026346464934817776, + "grad_norm": 3.0842509269714355, + "learning_rate": 4.999914751041965e-05, + "loss": 7.9281, + "step": 443 + }, + { + "epoch": 0.002640593776762775, + "grad_norm": 3.0017757415771484, + "learning_rate": 4.999914364865919e-05, + "loss": 7.4727, + "step": 444 + }, + { + "epoch": 0.002646541060043772, + "grad_norm": 2.637838125228882, + "learning_rate": 4.9999139778171785e-05, + "loss": 7.5284, + "step": 445 + }, + { + "epoch": 0.0026524883433247694, + "grad_norm": 2.7749550342559814, + "learning_rate": 4.999913589895743e-05, + "loss": 7.7006, + "step": 446 + }, + { + "epoch": 0.0026584356266057666, + "grad_norm": 3.1636059284210205, + "learning_rate": 4.9999132011016146e-05, + "loss": 7.6441, + "step": 447 + }, + { + "epoch": 0.002664382909886764, + "grad_norm": 2.623776435852051, + "learning_rate": 4.9999128114347913e-05, + "loss": 7.8027, + "step": 448 + }, + { + "epoch": 0.002670330193167761, + "grad_norm": 2.803612232208252, + "learning_rate": 4.9999124208952755e-05, + "loss": 7.553, + "step": 449 + }, + { + "epoch": 0.0026762774764487584, + "grad_norm": 3.3169047832489014, + "learning_rate": 4.9999120294830656e-05, + "loss": 8.0965, + "step": 450 + }, + { + "epoch": 0.0026822247597297556, + "grad_norm": 3.9928581714630127, + "learning_rate": 4.999911637198161e-05, + "loss": 7.8152, + "step": 451 + }, + { + "epoch": 0.002688172043010753, + "grad_norm": 2.8126320838928223, + "learning_rate": 4.9999112440405646e-05, + "loss": 7.4843, + "step": 452 + }, + { + "epoch": 0.0026941193262917497, + "grad_norm": 2.773427963256836, + "learning_rate": 4.999910850010275e-05, + "loss": 7.7074, + "step": 453 + }, + { + "epoch": 0.002700066609572747, + "grad_norm": 2.8877642154693604, + "learning_rate": 4.999910455107292e-05, + "loss": 7.7764, + "step": 454 + }, + { + "epoch": 0.0027060138928537442, + "grad_norm": 2.6323535442352295, + "learning_rate": 4.9999100593316155e-05, + "loss": 7.7336, + "step": 455 + }, + { + "epoch": 0.0027119611761347415, + "grad_norm": 2.939509153366089, + "learning_rate": 4.9999096626832465e-05, + "loss": 7.8184, + "step": 456 + }, + { + "epoch": 0.0027179084594157387, + "grad_norm": 2.6926229000091553, + "learning_rate": 4.9999092651621855e-05, + "loss": 7.5027, + "step": 457 + }, + { + "epoch": 0.002723855742696736, + "grad_norm": 2.889389991760254, + "learning_rate": 4.999908866768431e-05, + "loss": 7.1138, + "step": 458 + }, + { + "epoch": 0.0027298030259777332, + "grad_norm": 2.951796531677246, + "learning_rate": 4.999908467501985e-05, + "loss": 7.7549, + "step": 459 + }, + { + "epoch": 0.0027357503092587305, + "grad_norm": 2.9076783657073975, + "learning_rate": 4.999908067362847e-05, + "loss": 7.6577, + "step": 460 + }, + { + "epoch": 0.0027416975925397278, + "grad_norm": 3.010636806488037, + "learning_rate": 4.9999076663510155e-05, + "loss": 7.6467, + "step": 461 + }, + { + "epoch": 0.002747644875820725, + "grad_norm": 2.7591371536254883, + "learning_rate": 4.9999072644664935e-05, + "loss": 7.5825, + "step": 462 + }, + { + "epoch": 0.0027535921591017223, + "grad_norm": 2.503632068634033, + "learning_rate": 4.9999068617092795e-05, + "loss": 7.711, + "step": 463 + }, + { + "epoch": 0.0027595394423827195, + "grad_norm": 2.6518661975860596, + "learning_rate": 4.999906458079373e-05, + "loss": 7.557, + "step": 464 + }, + { + "epoch": 0.0027654867256637168, + "grad_norm": 2.6865615844726562, + "learning_rate": 4.9999060535767764e-05, + "loss": 7.5788, + "step": 465 + }, + { + "epoch": 0.002771434008944714, + "grad_norm": 2.715190887451172, + "learning_rate": 4.999905648201487e-05, + "loss": 7.517, + "step": 466 + }, + { + "epoch": 0.0027773812922257113, + "grad_norm": 3.1603381633758545, + "learning_rate": 4.999905241953506e-05, + "loss": 7.6176, + "step": 467 + }, + { + "epoch": 0.0027833285755067085, + "grad_norm": 3.1451528072357178, + "learning_rate": 4.999904834832836e-05, + "loss": 7.6051, + "step": 468 + }, + { + "epoch": 0.002789275858787706, + "grad_norm": 2.5310862064361572, + "learning_rate": 4.9999044268394736e-05, + "loss": 7.6075, + "step": 469 + }, + { + "epoch": 0.002795223142068703, + "grad_norm": 2.9285359382629395, + "learning_rate": 4.99990401797342e-05, + "loss": 7.5399, + "step": 470 + }, + { + "epoch": 0.0028011704253497003, + "grad_norm": 3.2180614471435547, + "learning_rate": 4.9999036082346766e-05, + "loss": 7.6952, + "step": 471 + }, + { + "epoch": 0.0028071177086306976, + "grad_norm": 4.041499614715576, + "learning_rate": 4.9999031976232426e-05, + "loss": 7.841, + "step": 472 + }, + { + "epoch": 0.002813064991911695, + "grad_norm": 3.233492612838745, + "learning_rate": 4.999902786139118e-05, + "loss": 7.5267, + "step": 473 + }, + { + "epoch": 0.002819012275192692, + "grad_norm": 2.7749760150909424, + "learning_rate": 4.9999023737823034e-05, + "loss": 7.3703, + "step": 474 + }, + { + "epoch": 0.0028249595584736893, + "grad_norm": 2.9886162281036377, + "learning_rate": 4.999901960552798e-05, + "loss": 7.4684, + "step": 475 + }, + { + "epoch": 0.0028309068417546866, + "grad_norm": 2.934190511703491, + "learning_rate": 4.999901546450604e-05, + "loss": 7.4432, + "step": 476 + }, + { + "epoch": 0.002836854125035684, + "grad_norm": 3.696247100830078, + "learning_rate": 4.9999011314757196e-05, + "loss": 7.4944, + "step": 477 + }, + { + "epoch": 0.002842801408316681, + "grad_norm": 3.6706700325012207, + "learning_rate": 4.9999007156281454e-05, + "loss": 7.3726, + "step": 478 + }, + { + "epoch": 0.0028487486915976783, + "grad_norm": 3.8638553619384766, + "learning_rate": 4.999900298907881e-05, + "loss": 7.072, + "step": 479 + }, + { + "epoch": 0.0028546959748786756, + "grad_norm": 4.307566165924072, + "learning_rate": 4.999899881314928e-05, + "loss": 6.9371, + "step": 480 + }, + { + "epoch": 0.002860643258159673, + "grad_norm": 3.337372064590454, + "learning_rate": 4.9998994628492854e-05, + "loss": 7.7299, + "step": 481 + }, + { + "epoch": 0.00286659054144067, + "grad_norm": 3.1284921169281006, + "learning_rate": 4.9998990435109535e-05, + "loss": 7.5629, + "step": 482 + }, + { + "epoch": 0.0028725378247216674, + "grad_norm": 3.06904935836792, + "learning_rate": 4.999898623299933e-05, + "loss": 7.5332, + "step": 483 + }, + { + "epoch": 0.002878485108002664, + "grad_norm": 2.985121011734009, + "learning_rate": 4.999898202216224e-05, + "loss": 7.5972, + "step": 484 + }, + { + "epoch": 0.0028844323912836614, + "grad_norm": 2.9188039302825928, + "learning_rate": 4.999897780259827e-05, + "loss": 7.6242, + "step": 485 + }, + { + "epoch": 0.0028903796745646587, + "grad_norm": 3.2263259887695312, + "learning_rate": 4.9998973574307406e-05, + "loss": 7.5746, + "step": 486 + }, + { + "epoch": 0.002896326957845656, + "grad_norm": 2.645188331604004, + "learning_rate": 4.999896933728966e-05, + "loss": 7.6122, + "step": 487 + }, + { + "epoch": 0.002902274241126653, + "grad_norm": 2.89583158493042, + "learning_rate": 4.9998965091545035e-05, + "loss": 7.6157, + "step": 488 + }, + { + "epoch": 0.0029082215244076504, + "grad_norm": 3.6182286739349365, + "learning_rate": 4.9998960837073524e-05, + "loss": 7.4056, + "step": 489 + }, + { + "epoch": 0.0029141688076886477, + "grad_norm": 3.377560615539551, + "learning_rate": 4.9998956573875135e-05, + "loss": 7.4408, + "step": 490 + }, + { + "epoch": 0.002920116090969645, + "grad_norm": 3.0581517219543457, + "learning_rate": 4.9998952301949874e-05, + "loss": 7.5776, + "step": 491 + }, + { + "epoch": 0.002926063374250642, + "grad_norm": 3.5199148654937744, + "learning_rate": 4.999894802129773e-05, + "loss": 7.4747, + "step": 492 + }, + { + "epoch": 0.0029320106575316395, + "grad_norm": 3.866055727005005, + "learning_rate": 4.9998943731918714e-05, + "loss": 7.5985, + "step": 493 + }, + { + "epoch": 0.0029379579408126367, + "grad_norm": 2.856255054473877, + "learning_rate": 4.999893943381283e-05, + "loss": 7.9698, + "step": 494 + }, + { + "epoch": 0.002943905224093634, + "grad_norm": 3.0758626461029053, + "learning_rate": 4.999893512698007e-05, + "loss": 7.6311, + "step": 495 + }, + { + "epoch": 0.0029498525073746312, + "grad_norm": 3.739844560623169, + "learning_rate": 4.999893081142044e-05, + "loss": 7.6829, + "step": 496 + }, + { + "epoch": 0.0029557997906556285, + "grad_norm": 4.025709629058838, + "learning_rate": 4.999892648713394e-05, + "loss": 7.2717, + "step": 497 + }, + { + "epoch": 0.0029617470739366257, + "grad_norm": 3.6604738235473633, + "learning_rate": 4.999892215412057e-05, + "loss": 7.2985, + "step": 498 + }, + { + "epoch": 0.002967694357217623, + "grad_norm": 3.230109930038452, + "learning_rate": 4.999891781238034e-05, + "loss": 8.1041, + "step": 499 + }, + { + "epoch": 0.0029736416404986202, + "grad_norm": 2.5046725273132324, + "learning_rate": 4.999891346191325e-05, + "loss": 8.0888, + "step": 500 + }, + { + "epoch": 0.0029795889237796175, + "grad_norm": 2.916459798812866, + "learning_rate": 4.999890910271929e-05, + "loss": 7.8675, + "step": 501 + }, + { + "epoch": 0.0029855362070606148, + "grad_norm": 2.7806055545806885, + "learning_rate": 4.999890473479848e-05, + "loss": 7.8903, + "step": 502 + }, + { + "epoch": 0.002991483490341612, + "grad_norm": 2.9877662658691406, + "learning_rate": 4.99989003581508e-05, + "loss": 7.473, + "step": 503 + }, + { + "epoch": 0.0029974307736226093, + "grad_norm": 3.1581692695617676, + "learning_rate": 4.999889597277626e-05, + "loss": 7.5654, + "step": 504 + }, + { + "epoch": 0.0030033780569036065, + "grad_norm": 3.102539539337158, + "learning_rate": 4.9998891578674866e-05, + "loss": 7.8865, + "step": 505 + }, + { + "epoch": 0.0030093253401846038, + "grad_norm": 3.0357863903045654, + "learning_rate": 4.999888717584662e-05, + "loss": 7.291, + "step": 506 + }, + { + "epoch": 0.003015272623465601, + "grad_norm": 2.604048252105713, + "learning_rate": 4.999888276429152e-05, + "loss": 7.4892, + "step": 507 + }, + { + "epoch": 0.0030212199067465983, + "grad_norm": 2.734354257583618, + "learning_rate": 4.999887834400957e-05, + "loss": 7.1182, + "step": 508 + }, + { + "epoch": 0.0030271671900275955, + "grad_norm": 2.5255348682403564, + "learning_rate": 4.9998873915000775e-05, + "loss": 7.449, + "step": 509 + }, + { + "epoch": 0.003033114473308593, + "grad_norm": 2.864072322845459, + "learning_rate": 4.999886947726512e-05, + "loss": 7.3213, + "step": 510 + }, + { + "epoch": 0.00303906175658959, + "grad_norm": 2.764187812805176, + "learning_rate": 4.999886503080262e-05, + "loss": 7.337, + "step": 511 + }, + { + "epoch": 0.0030450090398705873, + "grad_norm": 3.5725066661834717, + "learning_rate": 4.9998860575613285e-05, + "loss": 7.8398, + "step": 512 + }, + { + "epoch": 0.0030509563231515846, + "grad_norm": 3.8559648990631104, + "learning_rate": 4.9998856111697096e-05, + "loss": 7.395, + "step": 513 + }, + { + "epoch": 0.003056903606432582, + "grad_norm": 2.9047908782958984, + "learning_rate": 4.999885163905407e-05, + "loss": 7.7016, + "step": 514 + }, + { + "epoch": 0.0030628508897135786, + "grad_norm": 3.1485037803649902, + "learning_rate": 4.99988471576842e-05, + "loss": 6.9411, + "step": 515 + }, + { + "epoch": 0.003068798172994576, + "grad_norm": 3.2763617038726807, + "learning_rate": 4.999884266758749e-05, + "loss": 6.4778, + "step": 516 + }, + { + "epoch": 0.003074745456275573, + "grad_norm": 2.7609500885009766, + "learning_rate": 4.999883816876394e-05, + "loss": 7.0576, + "step": 517 + }, + { + "epoch": 0.0030806927395565704, + "grad_norm": 3.7407751083374023, + "learning_rate": 4.999883366121356e-05, + "loss": 7.7389, + "step": 518 + }, + { + "epoch": 0.0030866400228375676, + "grad_norm": 3.3356568813323975, + "learning_rate": 4.999882914493634e-05, + "loss": 7.7, + "step": 519 + }, + { + "epoch": 0.003092587306118565, + "grad_norm": 2.635594129562378, + "learning_rate": 4.999882461993229e-05, + "loss": 7.6103, + "step": 520 + }, + { + "epoch": 0.003098534589399562, + "grad_norm": 3.7604281902313232, + "learning_rate": 4.9998820086201406e-05, + "loss": 7.6814, + "step": 521 + }, + { + "epoch": 0.0031044818726805594, + "grad_norm": 3.6567211151123047, + "learning_rate": 4.99988155437437e-05, + "loss": 7.6729, + "step": 522 + }, + { + "epoch": 0.0031104291559615567, + "grad_norm": 3.605442523956299, + "learning_rate": 4.999881099255916e-05, + "loss": 7.7464, + "step": 523 + }, + { + "epoch": 0.003116376439242554, + "grad_norm": 3.015500783920288, + "learning_rate": 4.99988064326478e-05, + "loss": 7.5168, + "step": 524 + }, + { + "epoch": 0.003122323722523551, + "grad_norm": 2.9037563800811768, + "learning_rate": 4.9998801864009604e-05, + "loss": 7.7059, + "step": 525 + }, + { + "epoch": 0.0031282710058045484, + "grad_norm": 2.812509059906006, + "learning_rate": 4.999879728664458e-05, + "loss": 7.4178, + "step": 526 + }, + { + "epoch": 0.0031342182890855457, + "grad_norm": 3.340226888656616, + "learning_rate": 4.9998792700552746e-05, + "loss": 7.7872, + "step": 527 + }, + { + "epoch": 0.003140165572366543, + "grad_norm": 3.0951550006866455, + "learning_rate": 4.999878810573409e-05, + "loss": 8.0153, + "step": 528 + }, + { + "epoch": 0.00314611285564754, + "grad_norm": 3.1077651977539062, + "learning_rate": 4.9998783502188616e-05, + "loss": 7.7053, + "step": 529 + }, + { + "epoch": 0.0031520601389285374, + "grad_norm": 3.442451000213623, + "learning_rate": 4.999877888991632e-05, + "loss": 7.5149, + "step": 530 + }, + { + "epoch": 0.0031580074222095347, + "grad_norm": 3.7479207515716553, + "learning_rate": 4.9998774268917215e-05, + "loss": 7.3448, + "step": 531 + }, + { + "epoch": 0.003163954705490532, + "grad_norm": 2.660789966583252, + "learning_rate": 4.999876963919129e-05, + "loss": 7.8348, + "step": 532 + }, + { + "epoch": 0.003169901988771529, + "grad_norm": 2.6255943775177, + "learning_rate": 4.9998765000738556e-05, + "loss": 7.542, + "step": 533 + }, + { + "epoch": 0.0031758492720525265, + "grad_norm": 3.121521472930908, + "learning_rate": 4.9998760353559017e-05, + "loss": 7.46, + "step": 534 + }, + { + "epoch": 0.0031817965553335237, + "grad_norm": 2.958880662918091, + "learning_rate": 4.999875569765266e-05, + "loss": 7.5385, + "step": 535 + }, + { + "epoch": 0.003187743838614521, + "grad_norm": 3.4153661727905273, + "learning_rate": 4.99987510330195e-05, + "loss": 7.4989, + "step": 536 + }, + { + "epoch": 0.0031936911218955182, + "grad_norm": 3.0877597332000732, + "learning_rate": 4.999874635965953e-05, + "loss": 7.5512, + "step": 537 + }, + { + "epoch": 0.0031996384051765155, + "grad_norm": 3.109522581100464, + "learning_rate": 4.9998741677572756e-05, + "loss": 7.4679, + "step": 538 + }, + { + "epoch": 0.0032055856884575127, + "grad_norm": 3.4434239864349365, + "learning_rate": 4.999873698675919e-05, + "loss": 7.0599, + "step": 539 + }, + { + "epoch": 0.00321153297173851, + "grad_norm": 3.83335018157959, + "learning_rate": 4.999873228721882e-05, + "loss": 7.5355, + "step": 540 + }, + { + "epoch": 0.0032174802550195072, + "grad_norm": 3.0679752826690674, + "learning_rate": 4.999872757895164e-05, + "loss": 7.7231, + "step": 541 + }, + { + "epoch": 0.0032234275383005045, + "grad_norm": 3.272196054458618, + "learning_rate": 4.999872286195767e-05, + "loss": 7.6674, + "step": 542 + }, + { + "epoch": 0.0032293748215815017, + "grad_norm": 2.8453965187072754, + "learning_rate": 4.9998718136236897e-05, + "loss": 7.4451, + "step": 543 + }, + { + "epoch": 0.003235322104862499, + "grad_norm": 3.074399709701538, + "learning_rate": 4.999871340178934e-05, + "loss": 7.6011, + "step": 544 + }, + { + "epoch": 0.0032412693881434963, + "grad_norm": 3.173004150390625, + "learning_rate": 4.999870865861499e-05, + "loss": 7.5268, + "step": 545 + }, + { + "epoch": 0.003247216671424493, + "grad_norm": 2.820848226547241, + "learning_rate": 4.999870390671384e-05, + "loss": 7.9872, + "step": 546 + }, + { + "epoch": 0.0032531639547054903, + "grad_norm": 2.692702293395996, + "learning_rate": 4.9998699146085906e-05, + "loss": 7.4676, + "step": 547 + }, + { + "epoch": 0.0032591112379864876, + "grad_norm": 2.2766902446746826, + "learning_rate": 4.999869437673119e-05, + "loss": 7.3826, + "step": 548 + }, + { + "epoch": 0.003265058521267485, + "grad_norm": 2.1190011501312256, + "learning_rate": 4.9998689598649686e-05, + "loss": 7.4767, + "step": 549 + }, + { + "epoch": 0.003271005804548482, + "grad_norm": 2.687633514404297, + "learning_rate": 4.999868481184139e-05, + "loss": 7.9922, + "step": 550 + }, + { + "epoch": 0.0032769530878294794, + "grad_norm": 3.403298854827881, + "learning_rate": 4.999868001630632e-05, + "loss": 7.8035, + "step": 551 + }, + { + "epoch": 0.0032829003711104766, + "grad_norm": 3.074881076812744, + "learning_rate": 4.999867521204446e-05, + "loss": 7.7106, + "step": 552 + }, + { + "epoch": 0.003288847654391474, + "grad_norm": 3.28725004196167, + "learning_rate": 4.9998670399055827e-05, + "loss": 7.4661, + "step": 553 + }, + { + "epoch": 0.003294794937672471, + "grad_norm": 3.8624775409698486, + "learning_rate": 4.999866557734041e-05, + "loss": 7.7156, + "step": 554 + }, + { + "epoch": 0.0033007422209534684, + "grad_norm": 2.53586745262146, + "learning_rate": 4.999866074689823e-05, + "loss": 7.945, + "step": 555 + }, + { + "epoch": 0.0033066895042344656, + "grad_norm": 3.8261072635650635, + "learning_rate": 4.9998655907729265e-05, + "loss": 8.0446, + "step": 556 + }, + { + "epoch": 0.003312636787515463, + "grad_norm": 2.7173407077789307, + "learning_rate": 4.999865105983353e-05, + "loss": 7.8363, + "step": 557 + }, + { + "epoch": 0.00331858407079646, + "grad_norm": 4.68424654006958, + "learning_rate": 4.999864620321102e-05, + "loss": 7.667, + "step": 558 + }, + { + "epoch": 0.0033245313540774574, + "grad_norm": 2.8763632774353027, + "learning_rate": 4.999864133786175e-05, + "loss": 7.6133, + "step": 559 + }, + { + "epoch": 0.0033304786373584546, + "grad_norm": 3.0986382961273193, + "learning_rate": 4.9998636463785705e-05, + "loss": 7.6257, + "step": 560 + }, + { + "epoch": 0.003336425920639452, + "grad_norm": 2.6826348304748535, + "learning_rate": 4.9998631580982905e-05, + "loss": 7.5187, + "step": 561 + }, + { + "epoch": 0.003342373203920449, + "grad_norm": 2.2172515392303467, + "learning_rate": 4.9998626689453334e-05, + "loss": 7.961, + "step": 562 + }, + { + "epoch": 0.0033483204872014464, + "grad_norm": 2.6083858013153076, + "learning_rate": 4.9998621789197e-05, + "loss": 7.7887, + "step": 563 + }, + { + "epoch": 0.0033542677704824437, + "grad_norm": 3.6838009357452393, + "learning_rate": 4.99986168802139e-05, + "loss": 7.4945, + "step": 564 + }, + { + "epoch": 0.003360215053763441, + "grad_norm": 3.2091991901397705, + "learning_rate": 4.999861196250405e-05, + "loss": 7.4243, + "step": 565 + }, + { + "epoch": 0.003366162337044438, + "grad_norm": 3.142982244491577, + "learning_rate": 4.9998607036067434e-05, + "loss": 7.4684, + "step": 566 + }, + { + "epoch": 0.0033721096203254354, + "grad_norm": 3.7751007080078125, + "learning_rate": 4.9998602100904065e-05, + "loss": 7.3722, + "step": 567 + }, + { + "epoch": 0.0033780569036064327, + "grad_norm": 3.276843547821045, + "learning_rate": 4.9998597157013946e-05, + "loss": 7.4012, + "step": 568 + }, + { + "epoch": 0.00338400418688743, + "grad_norm": 2.840106725692749, + "learning_rate": 4.999859220439708e-05, + "loss": 7.4013, + "step": 569 + }, + { + "epoch": 0.003389951470168427, + "grad_norm": 2.7816810607910156, + "learning_rate": 4.999858724305346e-05, + "loss": 7.3136, + "step": 570 + }, + { + "epoch": 0.0033958987534494244, + "grad_norm": 4.523340225219727, + "learning_rate": 4.999858227298308e-05, + "loss": 7.0553, + "step": 571 + }, + { + "epoch": 0.0034018460367304217, + "grad_norm": 3.9653191566467285, + "learning_rate": 4.9998577294185964e-05, + "loss": 7.1907, + "step": 572 + }, + { + "epoch": 0.003407793320011419, + "grad_norm": 3.243089199066162, + "learning_rate": 4.999857230666211e-05, + "loss": 7.0749, + "step": 573 + }, + { + "epoch": 0.003413740603292416, + "grad_norm": 3.3622777462005615, + "learning_rate": 4.99985673104115e-05, + "loss": 7.0005, + "step": 574 + }, + { + "epoch": 0.0034196878865734135, + "grad_norm": 2.561732292175293, + "learning_rate": 4.9998562305434154e-05, + "loss": 7.271, + "step": 575 + }, + { + "epoch": 0.0034256351698544107, + "grad_norm": 3.1846745014190674, + "learning_rate": 4.999855729173006e-05, + "loss": 7.7333, + "step": 576 + }, + { + "epoch": 0.0034315824531354075, + "grad_norm": 3.0318918228149414, + "learning_rate": 4.999855226929924e-05, + "loss": 7.5535, + "step": 577 + }, + { + "epoch": 0.003437529736416405, + "grad_norm": 2.993086099624634, + "learning_rate": 4.999854723814168e-05, + "loss": 7.6272, + "step": 578 + }, + { + "epoch": 0.003443477019697402, + "grad_norm": 2.8511712551116943, + "learning_rate": 4.999854219825738e-05, + "loss": 7.6619, + "step": 579 + }, + { + "epoch": 0.0034494243029783993, + "grad_norm": 2.6181185245513916, + "learning_rate": 4.9998537149646355e-05, + "loss": 7.7452, + "step": 580 + }, + { + "epoch": 0.0034553715862593965, + "grad_norm": 2.9932363033294678, + "learning_rate": 4.9998532092308593e-05, + "loss": 7.7475, + "step": 581 + }, + { + "epoch": 0.003461318869540394, + "grad_norm": 3.541944742202759, + "learning_rate": 4.99985270262441e-05, + "loss": 7.5808, + "step": 582 + }, + { + "epoch": 0.003467266152821391, + "grad_norm": 2.780372381210327, + "learning_rate": 4.9998521951452895e-05, + "loss": 7.8167, + "step": 583 + }, + { + "epoch": 0.0034732134361023883, + "grad_norm": 2.9156363010406494, + "learning_rate": 4.9998516867934945e-05, + "loss": 7.74, + "step": 584 + }, + { + "epoch": 0.0034791607193833856, + "grad_norm": 3.9492485523223877, + "learning_rate": 4.9998511775690285e-05, + "loss": 7.1128, + "step": 585 + }, + { + "epoch": 0.003485108002664383, + "grad_norm": 2.8288252353668213, + "learning_rate": 4.9998506674718896e-05, + "loss": 7.4884, + "step": 586 + }, + { + "epoch": 0.00349105528594538, + "grad_norm": 2.8906798362731934, + "learning_rate": 4.999850156502078e-05, + "loss": 7.6378, + "step": 587 + }, + { + "epoch": 0.0034970025692263773, + "grad_norm": 2.8806405067443848, + "learning_rate": 4.9998496446595955e-05, + "loss": 7.4641, + "step": 588 + }, + { + "epoch": 0.0035029498525073746, + "grad_norm": 3.1794772148132324, + "learning_rate": 4.999849131944441e-05, + "loss": 7.1633, + "step": 589 + }, + { + "epoch": 0.003508897135788372, + "grad_norm": 2.886009454727173, + "learning_rate": 4.999848618356615e-05, + "loss": 7.1793, + "step": 590 + }, + { + "epoch": 0.003514844419069369, + "grad_norm": 2.76184344291687, + "learning_rate": 4.999848103896118e-05, + "loss": 7.1377, + "step": 591 + }, + { + "epoch": 0.0035207917023503663, + "grad_norm": 3.127793788909912, + "learning_rate": 4.999847588562949e-05, + "loss": 7.2793, + "step": 592 + }, + { + "epoch": 0.0035267389856313636, + "grad_norm": 3.7768073081970215, + "learning_rate": 4.99984707235711e-05, + "loss": 7.8203, + "step": 593 + }, + { + "epoch": 0.003532686268912361, + "grad_norm": 3.1750540733337402, + "learning_rate": 4.9998465552786e-05, + "loss": 7.7078, + "step": 594 + }, + { + "epoch": 0.003538633552193358, + "grad_norm": 2.8884522914886475, + "learning_rate": 4.999846037327419e-05, + "loss": 7.6864, + "step": 595 + }, + { + "epoch": 0.0035445808354743554, + "grad_norm": 2.783928394317627, + "learning_rate": 4.999845518503568e-05, + "loss": 7.7329, + "step": 596 + }, + { + "epoch": 0.0035505281187553526, + "grad_norm": 2.8093652725219727, + "learning_rate": 4.9998449988070465e-05, + "loss": 7.7157, + "step": 597 + }, + { + "epoch": 0.00355647540203635, + "grad_norm": 2.54380464553833, + "learning_rate": 4.999844478237855e-05, + "loss": 7.6353, + "step": 598 + }, + { + "epoch": 0.003562422685317347, + "grad_norm": 3.478878974914551, + "learning_rate": 4.999843956795993e-05, + "loss": 7.4221, + "step": 599 + }, + { + "epoch": 0.0035683699685983444, + "grad_norm": 3.882807493209839, + "learning_rate": 4.999843434481463e-05, + "loss": 7.4857, + "step": 600 + }, + { + "epoch": 0.0035743172518793416, + "grad_norm": 3.0975584983825684, + "learning_rate": 4.999842911294261e-05, + "loss": 7.5121, + "step": 601 + }, + { + "epoch": 0.003580264535160339, + "grad_norm": 3.1857712268829346, + "learning_rate": 4.999842387234391e-05, + "loss": 7.4469, + "step": 602 + }, + { + "epoch": 0.003586211818441336, + "grad_norm": 2.892927885055542, + "learning_rate": 4.999841862301853e-05, + "loss": 7.4047, + "step": 603 + }, + { + "epoch": 0.0035921591017223334, + "grad_norm": 4.186185359954834, + "learning_rate": 4.999841336496645e-05, + "loss": 7.5146, + "step": 604 + }, + { + "epoch": 0.0035981063850033307, + "grad_norm": 3.27422833442688, + "learning_rate": 4.9998408098187674e-05, + "loss": 7.3347, + "step": 605 + }, + { + "epoch": 0.003604053668284328, + "grad_norm": 4.817208290100098, + "learning_rate": 4.9998402822682225e-05, + "loss": 7.9883, + "step": 606 + }, + { + "epoch": 0.003610000951565325, + "grad_norm": 5.903015613555908, + "learning_rate": 4.999839753845008e-05, + "loss": 7.9043, + "step": 607 + }, + { + "epoch": 0.0036159482348463224, + "grad_norm": 4.720086574554443, + "learning_rate": 4.999839224549127e-05, + "loss": 7.8456, + "step": 608 + }, + { + "epoch": 0.0036218955181273192, + "grad_norm": 4.518443584442139, + "learning_rate": 4.9998386943805764e-05, + "loss": 7.3659, + "step": 609 + }, + { + "epoch": 0.0036278428014083165, + "grad_norm": 2.621833086013794, + "learning_rate": 4.999838163339358e-05, + "loss": 8.0512, + "step": 610 + }, + { + "epoch": 0.0036337900846893137, + "grad_norm": 4.015076160430908, + "learning_rate": 4.9998376314254726e-05, + "loss": 7.8581, + "step": 611 + }, + { + "epoch": 0.003639737367970311, + "grad_norm": 3.8145275115966797, + "learning_rate": 4.999837098638919e-05, + "loss": 7.4288, + "step": 612 + }, + { + "epoch": 0.0036456846512513083, + "grad_norm": 3.396488904953003, + "learning_rate": 4.9998365649796985e-05, + "loss": 7.7812, + "step": 613 + }, + { + "epoch": 0.0036516319345323055, + "grad_norm": 2.931187391281128, + "learning_rate": 4.999836030447811e-05, + "loss": 7.5898, + "step": 614 + }, + { + "epoch": 0.0036575792178133028, + "grad_norm": 2.6349267959594727, + "learning_rate": 4.999835495043257e-05, + "loss": 7.5345, + "step": 615 + }, + { + "epoch": 0.0036635265010943, + "grad_norm": 3.014085531234741, + "learning_rate": 4.999834958766035e-05, + "loss": 7.5985, + "step": 616 + }, + { + "epoch": 0.0036694737843752973, + "grad_norm": 2.971475124359131, + "learning_rate": 4.999834421616147e-05, + "loss": 7.589, + "step": 617 + }, + { + "epoch": 0.0036754210676562945, + "grad_norm": 3.867366075515747, + "learning_rate": 4.999833883593593e-05, + "loss": 7.4026, + "step": 618 + }, + { + "epoch": 0.0036813683509372918, + "grad_norm": 2.3917908668518066, + "learning_rate": 4.9998333446983734e-05, + "loss": 7.4361, + "step": 619 + }, + { + "epoch": 0.003687315634218289, + "grad_norm": 4.583080768585205, + "learning_rate": 4.999832804930487e-05, + "loss": 7.5525, + "step": 620 + }, + { + "epoch": 0.0036932629174992863, + "grad_norm": 2.6039721965789795, + "learning_rate": 4.999832264289934e-05, + "loss": 7.636, + "step": 621 + }, + { + "epoch": 0.0036992102007802835, + "grad_norm": 4.123409748077393, + "learning_rate": 4.9998317227767165e-05, + "loss": 7.7803, + "step": 622 + }, + { + "epoch": 0.003705157484061281, + "grad_norm": 4.220766544342041, + "learning_rate": 4.999831180390834e-05, + "loss": 7.8086, + "step": 623 + }, + { + "epoch": 0.003711104767342278, + "grad_norm": 3.0759594440460205, + "learning_rate": 4.999830637132285e-05, + "loss": 7.4815, + "step": 624 + }, + { + "epoch": 0.0037170520506232753, + "grad_norm": 2.7870442867279053, + "learning_rate": 4.999830093001071e-05, + "loss": 7.3925, + "step": 625 + }, + { + "epoch": 0.0037229993339042726, + "grad_norm": 2.5292582511901855, + "learning_rate": 4.999829547997193e-05, + "loss": 7.2049, + "step": 626 + }, + { + "epoch": 0.00372894661718527, + "grad_norm": 2.5836963653564453, + "learning_rate": 4.99982900212065e-05, + "loss": 7.2858, + "step": 627 + }, + { + "epoch": 0.003734893900466267, + "grad_norm": 2.6433279514312744, + "learning_rate": 4.9998284553714425e-05, + "loss": 7.5894, + "step": 628 + }, + { + "epoch": 0.0037408411837472643, + "grad_norm": 3.1093215942382812, + "learning_rate": 4.999827907749571e-05, + "loss": 7.2859, + "step": 629 + }, + { + "epoch": 0.0037467884670282616, + "grad_norm": 2.313305616378784, + "learning_rate": 4.9998273592550346e-05, + "loss": 7.6275, + "step": 630 + }, + { + "epoch": 0.003752735750309259, + "grad_norm": 3.7002785205841064, + "learning_rate": 4.9998268098878355e-05, + "loss": 7.7068, + "step": 631 + }, + { + "epoch": 0.003758683033590256, + "grad_norm": 3.090707778930664, + "learning_rate": 4.9998262596479715e-05, + "loss": 7.7304, + "step": 632 + }, + { + "epoch": 0.0037646303168712533, + "grad_norm": 2.425614833831787, + "learning_rate": 4.999825708535445e-05, + "loss": 7.927, + "step": 633 + }, + { + "epoch": 0.0037705776001522506, + "grad_norm": 2.1477420330047607, + "learning_rate": 4.999825156550254e-05, + "loss": 8.1082, + "step": 634 + }, + { + "epoch": 0.003776524883433248, + "grad_norm": 2.434638738632202, + "learning_rate": 4.999824603692401e-05, + "loss": 7.8808, + "step": 635 + }, + { + "epoch": 0.003782472166714245, + "grad_norm": 2.563283681869507, + "learning_rate": 4.999824049961884e-05, + "loss": 7.8515, + "step": 636 + }, + { + "epoch": 0.0037884194499952424, + "grad_norm": 2.6878623962402344, + "learning_rate": 4.9998234953587054e-05, + "loss": 7.6393, + "step": 637 + }, + { + "epoch": 0.0037943667332762396, + "grad_norm": 2.6270666122436523, + "learning_rate": 4.999822939882863e-05, + "loss": 7.8246, + "step": 638 + }, + { + "epoch": 0.003800314016557237, + "grad_norm": 3.300494909286499, + "learning_rate": 4.9998223835343596e-05, + "loss": 7.4991, + "step": 639 + }, + { + "epoch": 0.0038062612998382337, + "grad_norm": 2.726902723312378, + "learning_rate": 4.9998218263131925e-05, + "loss": 7.6663, + "step": 640 + }, + { + "epoch": 0.003812208583119231, + "grad_norm": 2.8147871494293213, + "learning_rate": 4.9998212682193645e-05, + "loss": 7.5272, + "step": 641 + }, + { + "epoch": 0.003818155866400228, + "grad_norm": 2.324422597885132, + "learning_rate": 4.9998207092528745e-05, + "loss": 7.6577, + "step": 642 + }, + { + "epoch": 0.0038241031496812255, + "grad_norm": 2.4525058269500732, + "learning_rate": 4.999820149413723e-05, + "loss": 7.6793, + "step": 643 + }, + { + "epoch": 0.0038300504329622227, + "grad_norm": 2.4011337757110596, + "learning_rate": 4.9998195887019094e-05, + "loss": 7.4869, + "step": 644 + }, + { + "epoch": 0.00383599771624322, + "grad_norm": 2.3403005599975586, + "learning_rate": 4.9998190271174364e-05, + "loss": 7.9552, + "step": 645 + }, + { + "epoch": 0.003841944999524217, + "grad_norm": 2.1421074867248535, + "learning_rate": 4.9998184646603005e-05, + "loss": 7.4021, + "step": 646 + }, + { + "epoch": 0.0038478922828052145, + "grad_norm": 2.4157450199127197, + "learning_rate": 4.9998179013305046e-05, + "loss": 7.6666, + "step": 647 + }, + { + "epoch": 0.0038538395660862117, + "grad_norm": 2.737692356109619, + "learning_rate": 4.999817337128048e-05, + "loss": 7.7441, + "step": 648 + }, + { + "epoch": 0.003859786849367209, + "grad_norm": 3.2240428924560547, + "learning_rate": 4.999816772052931e-05, + "loss": 7.5691, + "step": 649 + }, + { + "epoch": 0.0038657341326482062, + "grad_norm": 2.8538997173309326, + "learning_rate": 4.9998162061051534e-05, + "loss": 7.4994, + "step": 650 + }, + { + "epoch": 0.0038716814159292035, + "grad_norm": 2.6562373638153076, + "learning_rate": 4.9998156392847164e-05, + "loss": 7.5156, + "step": 651 + }, + { + "epoch": 0.0038776286992102007, + "grad_norm": 2.5513811111450195, + "learning_rate": 4.999815071591619e-05, + "loss": 7.6503, + "step": 652 + }, + { + "epoch": 0.003883575982491198, + "grad_norm": 2.4196572303771973, + "learning_rate": 4.999814503025863e-05, + "loss": 7.9868, + "step": 653 + }, + { + "epoch": 0.0038895232657721952, + "grad_norm": 3.0201921463012695, + "learning_rate": 4.999813933587447e-05, + "loss": 7.5405, + "step": 654 + }, + { + "epoch": 0.0038954705490531925, + "grad_norm": 2.352625846862793, + "learning_rate": 4.9998133632763714e-05, + "loss": 7.5461, + "step": 655 + }, + { + "epoch": 0.0039014178323341898, + "grad_norm": 2.5318710803985596, + "learning_rate": 4.999812792092637e-05, + "loss": 7.5596, + "step": 656 + }, + { + "epoch": 0.003907365115615187, + "grad_norm": 2.710785388946533, + "learning_rate": 4.9998122200362444e-05, + "loss": 7.4828, + "step": 657 + }, + { + "epoch": 0.003913312398896184, + "grad_norm": 2.7441353797912598, + "learning_rate": 4.999811647107192e-05, + "loss": 7.2496, + "step": 658 + }, + { + "epoch": 0.0039192596821771815, + "grad_norm": 2.4602885246276855, + "learning_rate": 4.9998110733054824e-05, + "loss": 7.6134, + "step": 659 + }, + { + "epoch": 0.003925206965458178, + "grad_norm": 2.6842973232269287, + "learning_rate": 4.999810498631114e-05, + "loss": 7.3544, + "step": 660 + }, + { + "epoch": 0.003931154248739176, + "grad_norm": 2.8062961101531982, + "learning_rate": 4.9998099230840875e-05, + "loss": 7.5162, + "step": 661 + }, + { + "epoch": 0.003937101532020173, + "grad_norm": 4.0753679275512695, + "learning_rate": 4.9998093466644036e-05, + "loss": 7.5241, + "step": 662 + }, + { + "epoch": 0.0039430488153011705, + "grad_norm": 3.0165748596191406, + "learning_rate": 4.999808769372061e-05, + "loss": 7.5313, + "step": 663 + }, + { + "epoch": 0.003948996098582167, + "grad_norm": 2.73825740814209, + "learning_rate": 4.9998081912070623e-05, + "loss": 7.4433, + "step": 664 + }, + { + "epoch": 0.003954943381863165, + "grad_norm": 2.6649749279022217, + "learning_rate": 4.9998076121694056e-05, + "loss": 7.4852, + "step": 665 + }, + { + "epoch": 0.003960890665144162, + "grad_norm": 2.609389066696167, + "learning_rate": 4.999807032259092e-05, + "loss": 7.4127, + "step": 666 + }, + { + "epoch": 0.0039668379484251596, + "grad_norm": 2.50502610206604, + "learning_rate": 4.999806451476122e-05, + "loss": 7.3113, + "step": 667 + }, + { + "epoch": 0.003972785231706156, + "grad_norm": 2.565142869949341, + "learning_rate": 4.999805869820495e-05, + "loss": 7.1875, + "step": 668 + }, + { + "epoch": 0.003978732514987154, + "grad_norm": 2.582742214202881, + "learning_rate": 4.9998052872922117e-05, + "loss": 7.3251, + "step": 669 + }, + { + "epoch": 0.003984679798268151, + "grad_norm": 2.718780279159546, + "learning_rate": 4.999804703891272e-05, + "loss": 7.3599, + "step": 670 + }, + { + "epoch": 0.003990627081549149, + "grad_norm": 2.5971410274505615, + "learning_rate": 4.999804119617677e-05, + "loss": 7.2304, + "step": 671 + }, + { + "epoch": 0.003996574364830145, + "grad_norm": 2.5905725955963135, + "learning_rate": 4.9998035344714255e-05, + "loss": 7.3664, + "step": 672 + }, + { + "epoch": 0.004002521648111143, + "grad_norm": 2.659102439880371, + "learning_rate": 4.999802948452519e-05, + "loss": 7.4296, + "step": 673 + }, + { + "epoch": 0.00400846893139214, + "grad_norm": 2.5933544635772705, + "learning_rate": 4.999802361560957e-05, + "loss": 7.4605, + "step": 674 + }, + { + "epoch": 0.004014416214673138, + "grad_norm": 3.3860044479370117, + "learning_rate": 4.999801773796739e-05, + "loss": 7.5159, + "step": 675 + }, + { + "epoch": 0.004020363497954134, + "grad_norm": 3.742635726928711, + "learning_rate": 4.9998011851598666e-05, + "loss": 7.4988, + "step": 676 + }, + { + "epoch": 0.004026310781235132, + "grad_norm": 3.5960240364074707, + "learning_rate": 4.999800595650339e-05, + "loss": 7.4607, + "step": 677 + }, + { + "epoch": 0.004032258064516129, + "grad_norm": 2.654444694519043, + "learning_rate": 4.9998000052681585e-05, + "loss": 7.2166, + "step": 678 + }, + { + "epoch": 0.004038205347797127, + "grad_norm": 2.4538326263427734, + "learning_rate": 4.999799414013322e-05, + "loss": 7.2334, + "step": 679 + }, + { + "epoch": 0.004044152631078123, + "grad_norm": 2.5899672508239746, + "learning_rate": 4.9997988218858316e-05, + "loss": 7.2754, + "step": 680 + }, + { + "epoch": 0.004050099914359121, + "grad_norm": 2.721224069595337, + "learning_rate": 4.999798228885687e-05, + "loss": 7.188, + "step": 681 + }, + { + "epoch": 0.004056047197640118, + "grad_norm": 6.5863189697265625, + "learning_rate": 4.9997976350128894e-05, + "loss": 7.369, + "step": 682 + }, + { + "epoch": 0.004061994480921116, + "grad_norm": 2.6562674045562744, + "learning_rate": 4.999797040267438e-05, + "loss": 7.176, + "step": 683 + }, + { + "epoch": 0.0040679417642021124, + "grad_norm": 2.503666877746582, + "learning_rate": 4.9997964446493326e-05, + "loss": 7.2765, + "step": 684 + }, + { + "epoch": 0.00407388904748311, + "grad_norm": 9.070426940917969, + "learning_rate": 4.9997958481585756e-05, + "loss": 7.5187, + "step": 685 + }, + { + "epoch": 0.004079836330764107, + "grad_norm": 2.7480480670928955, + "learning_rate": 4.9997952507951645e-05, + "loss": 7.5244, + "step": 686 + }, + { + "epoch": 0.004085783614045104, + "grad_norm": 3.8338348865509033, + "learning_rate": 4.999794652559101e-05, + "loss": 7.6672, + "step": 687 + }, + { + "epoch": 0.0040917308973261015, + "grad_norm": 3.1132454872131348, + "learning_rate": 4.999794053450385e-05, + "loss": 7.9594, + "step": 688 + }, + { + "epoch": 0.004097678180607098, + "grad_norm": 2.6279757022857666, + "learning_rate": 4.999793453469017e-05, + "loss": 7.4737, + "step": 689 + }, + { + "epoch": 0.004103625463888096, + "grad_norm": 3.440145492553711, + "learning_rate": 4.9997928526149966e-05, + "loss": 7.2968, + "step": 690 + }, + { + "epoch": 0.004109572747169093, + "grad_norm": 2.3300867080688477, + "learning_rate": 4.9997922508883244e-05, + "loss": 7.3693, + "step": 691 + }, + { + "epoch": 0.0041155200304500905, + "grad_norm": 2.9034078121185303, + "learning_rate": 4.999791648289001e-05, + "loss": 7.7227, + "step": 692 + }, + { + "epoch": 0.004121467313731087, + "grad_norm": 2.5685503482818604, + "learning_rate": 4.9997910448170254e-05, + "loss": 7.9706, + "step": 693 + }, + { + "epoch": 0.004127414597012085, + "grad_norm": 3.260779619216919, + "learning_rate": 4.9997904404723986e-05, + "loss": 7.7231, + "step": 694 + }, + { + "epoch": 0.004133361880293082, + "grad_norm": 2.668193817138672, + "learning_rate": 4.999789835255121e-05, + "loss": 7.7677, + "step": 695 + }, + { + "epoch": 0.0041393091635740795, + "grad_norm": 2.545276641845703, + "learning_rate": 4.999789229165193e-05, + "loss": 7.9297, + "step": 696 + }, + { + "epoch": 0.004145256446855076, + "grad_norm": 3.2137503623962402, + "learning_rate": 4.9997886222026146e-05, + "loss": 7.697, + "step": 697 + }, + { + "epoch": 0.004151203730136074, + "grad_norm": 2.7501730918884277, + "learning_rate": 4.999788014367385e-05, + "loss": 7.3686, + "step": 698 + }, + { + "epoch": 0.004157151013417071, + "grad_norm": 2.2456486225128174, + "learning_rate": 4.9997874056595055e-05, + "loss": 7.7238, + "step": 699 + }, + { + "epoch": 0.0041630982966980685, + "grad_norm": 2.3958070278167725, + "learning_rate": 4.9997867960789764e-05, + "loss": 7.8349, + "step": 700 + }, + { + "epoch": 0.004169045579979065, + "grad_norm": 2.509744644165039, + "learning_rate": 4.9997861856257974e-05, + "loss": 7.5884, + "step": 701 + }, + { + "epoch": 0.004174992863260063, + "grad_norm": 3.6095783710479736, + "learning_rate": 4.9997855742999684e-05, + "loss": 7.4726, + "step": 702 + }, + { + "epoch": 0.00418094014654106, + "grad_norm": 3.3515326976776123, + "learning_rate": 4.99978496210149e-05, + "loss": 7.5214, + "step": 703 + }, + { + "epoch": 0.0041868874298220575, + "grad_norm": 4.7553791999816895, + "learning_rate": 4.999784349030363e-05, + "loss": 7.4577, + "step": 704 + }, + { + "epoch": 0.004192834713103054, + "grad_norm": 5.959117412567139, + "learning_rate": 4.9997837350865874e-05, + "loss": 7.2559, + "step": 705 + }, + { + "epoch": 0.004198781996384052, + "grad_norm": 2.9650065898895264, + "learning_rate": 4.999783120270163e-05, + "loss": 7.3712, + "step": 706 + }, + { + "epoch": 0.004204729279665049, + "grad_norm": 3.4171416759490967, + "learning_rate": 4.9997825045810895e-05, + "loss": 7.5014, + "step": 707 + }, + { + "epoch": 0.0042106765629460466, + "grad_norm": 3.297393798828125, + "learning_rate": 4.9997818880193684e-05, + "loss": 7.4553, + "step": 708 + }, + { + "epoch": 0.004216623846227043, + "grad_norm": 3.193859338760376, + "learning_rate": 4.999781270584999e-05, + "loss": 7.3414, + "step": 709 + }, + { + "epoch": 0.004222571129508041, + "grad_norm": 2.5028324127197266, + "learning_rate": 4.999780652277982e-05, + "loss": 7.4615, + "step": 710 + }, + { + "epoch": 0.004228518412789038, + "grad_norm": 3.43390154838562, + "learning_rate": 4.999780033098317e-05, + "loss": 7.3801, + "step": 711 + }, + { + "epoch": 0.004234465696070036, + "grad_norm": 3.3093984127044678, + "learning_rate": 4.999779413046004e-05, + "loss": 7.2938, + "step": 712 + }, + { + "epoch": 0.004240412979351032, + "grad_norm": 2.6643831729888916, + "learning_rate": 4.999778792121046e-05, + "loss": 7.3916, + "step": 713 + }, + { + "epoch": 0.00424636026263203, + "grad_norm": 2.779407501220703, + "learning_rate": 4.999778170323439e-05, + "loss": 7.5783, + "step": 714 + }, + { + "epoch": 0.004252307545913027, + "grad_norm": 2.959345817565918, + "learning_rate": 4.999777547653186e-05, + "loss": 7.9854, + "step": 715 + }, + { + "epoch": 0.004258254829194025, + "grad_norm": 2.9909780025482178, + "learning_rate": 4.9997769241102866e-05, + "loss": 7.997, + "step": 716 + }, + { + "epoch": 0.004264202112475021, + "grad_norm": 3.081831932067871, + "learning_rate": 4.9997762996947405e-05, + "loss": 7.9393, + "step": 717 + }, + { + "epoch": 0.004270149395756018, + "grad_norm": 2.8901429176330566, + "learning_rate": 4.9997756744065485e-05, + "loss": 7.8152, + "step": 718 + }, + { + "epoch": 0.004276096679037016, + "grad_norm": 3.3065547943115234, + "learning_rate": 4.9997750482457106e-05, + "loss": 7.1176, + "step": 719 + }, + { + "epoch": 0.004282043962318013, + "grad_norm": 3.1083710193634033, + "learning_rate": 4.9997744212122276e-05, + "loss": 7.6215, + "step": 720 + }, + { + "epoch": 0.00428799124559901, + "grad_norm": 4.010551452636719, + "learning_rate": 4.9997737933060987e-05, + "loss": 7.7665, + "step": 721 + }, + { + "epoch": 0.004293938528880007, + "grad_norm": 3.9287984371185303, + "learning_rate": 4.9997731645273245e-05, + "loss": 7.7185, + "step": 722 + }, + { + "epoch": 0.004299885812161005, + "grad_norm": 2.7739338874816895, + "learning_rate": 4.999772534875905e-05, + "loss": 7.7226, + "step": 723 + }, + { + "epoch": 0.004305833095442002, + "grad_norm": 2.675567865371704, + "learning_rate": 4.9997719043518414e-05, + "loss": 7.686, + "step": 724 + }, + { + "epoch": 0.0043117803787229994, + "grad_norm": 3.8513898849487305, + "learning_rate": 4.999771272955133e-05, + "loss": 7.6584, + "step": 725 + }, + { + "epoch": 0.004317727662003996, + "grad_norm": 10.309504508972168, + "learning_rate": 4.99977064068578e-05, + "loss": 7.4006, + "step": 726 + }, + { + "epoch": 0.004323674945284994, + "grad_norm": 2.712939977645874, + "learning_rate": 4.9997700075437836e-05, + "loss": 7.6275, + "step": 727 + }, + { + "epoch": 0.004329622228565991, + "grad_norm": 2.7880115509033203, + "learning_rate": 4.999769373529143e-05, + "loss": 7.4154, + "step": 728 + }, + { + "epoch": 0.0043355695118469885, + "grad_norm": 3.2352819442749023, + "learning_rate": 4.999768738641859e-05, + "loss": 7.4827, + "step": 729 + }, + { + "epoch": 0.004341516795127985, + "grad_norm": 3.5176644325256348, + "learning_rate": 4.999768102881931e-05, + "loss": 7.4748, + "step": 730 + }, + { + "epoch": 0.004347464078408983, + "grad_norm": 2.996829032897949, + "learning_rate": 4.99976746624936e-05, + "loss": 7.445, + "step": 731 + }, + { + "epoch": 0.00435341136168998, + "grad_norm": 4.5892534255981445, + "learning_rate": 4.9997668287441454e-05, + "loss": 7.6464, + "step": 732 + }, + { + "epoch": 0.0043593586449709775, + "grad_norm": 3.689419984817505, + "learning_rate": 4.999766190366289e-05, + "loss": 7.4215, + "step": 733 + }, + { + "epoch": 0.004365305928251974, + "grad_norm": 2.9146885871887207, + "learning_rate": 4.9997655511157896e-05, + "loss": 7.4852, + "step": 734 + }, + { + "epoch": 0.004371253211532972, + "grad_norm": 3.8503024578094482, + "learning_rate": 4.9997649109926484e-05, + "loss": 7.4779, + "step": 735 + }, + { + "epoch": 0.004377200494813969, + "grad_norm": 3.929422616958618, + "learning_rate": 4.9997642699968646e-05, + "loss": 7.3526, + "step": 736 + }, + { + "epoch": 0.0043831477780949665, + "grad_norm": 3.3365838527679443, + "learning_rate": 4.999763628128439e-05, + "loss": 7.3895, + "step": 737 + }, + { + "epoch": 0.004389095061375963, + "grad_norm": 3.147660970687866, + "learning_rate": 4.999762985387372e-05, + "loss": 7.1885, + "step": 738 + }, + { + "epoch": 0.004395042344656961, + "grad_norm": 3.3230104446411133, + "learning_rate": 4.9997623417736626e-05, + "loss": 7.5839, + "step": 739 + }, + { + "epoch": 0.004400989627937958, + "grad_norm": 3.285144090652466, + "learning_rate": 4.999761697287313e-05, + "loss": 7.4859, + "step": 740 + }, + { + "epoch": 0.0044069369112189555, + "grad_norm": 3.3811442852020264, + "learning_rate": 4.9997610519283216e-05, + "loss": 7.4871, + "step": 741 + }, + { + "epoch": 0.004412884194499952, + "grad_norm": 2.9662907123565674, + "learning_rate": 4.9997604056966904e-05, + "loss": 7.2546, + "step": 742 + }, + { + "epoch": 0.00441883147778095, + "grad_norm": 3.1432855129241943, + "learning_rate": 4.999759758592418e-05, + "loss": 7.5273, + "step": 743 + }, + { + "epoch": 0.004424778761061947, + "grad_norm": 3.0559749603271484, + "learning_rate": 4.9997591106155054e-05, + "loss": 7.0754, + "step": 744 + }, + { + "epoch": 0.0044307260443429445, + "grad_norm": 2.6778409481048584, + "learning_rate": 4.999758461765953e-05, + "loss": 7.1723, + "step": 745 + }, + { + "epoch": 0.004436673327623941, + "grad_norm": 2.592228412628174, + "learning_rate": 4.9997578120437606e-05, + "loss": 7.2671, + "step": 746 + }, + { + "epoch": 0.004442620610904939, + "grad_norm": 2.5546112060546875, + "learning_rate": 4.999757161448928e-05, + "loss": 7.2571, + "step": 747 + }, + { + "epoch": 0.004448567894185936, + "grad_norm": 2.745755672454834, + "learning_rate": 4.999756509981457e-05, + "loss": 7.3895, + "step": 748 + }, + { + "epoch": 0.004454515177466933, + "grad_norm": 2.9785144329071045, + "learning_rate": 4.999755857641346e-05, + "loss": 7.2431, + "step": 749 + }, + { + "epoch": 0.00446046246074793, + "grad_norm": 2.918891191482544, + "learning_rate": 4.9997552044285965e-05, + "loss": 7.3805, + "step": 750 + }, + { + "epoch": 0.004466409744028927, + "grad_norm": 2.7858519554138184, + "learning_rate": 4.999754550343209e-05, + "loss": 7.5942, + "step": 751 + }, + { + "epoch": 0.004472357027309925, + "grad_norm": 2.7758638858795166, + "learning_rate": 4.999753895385181e-05, + "loss": 7.5896, + "step": 752 + }, + { + "epoch": 0.004478304310590922, + "grad_norm": 2.7125916481018066, + "learning_rate": 4.999753239554517e-05, + "loss": 7.4341, + "step": 753 + }, + { + "epoch": 0.004484251593871919, + "grad_norm": 4.241726875305176, + "learning_rate": 4.999752582851214e-05, + "loss": 7.0517, + "step": 754 + }, + { + "epoch": 0.004490198877152916, + "grad_norm": 2.9547781944274902, + "learning_rate": 4.999751925275272e-05, + "loss": 7.2616, + "step": 755 + }, + { + "epoch": 0.004496146160433914, + "grad_norm": 4.2594122886657715, + "learning_rate": 4.9997512668266945e-05, + "loss": 7.3069, + "step": 756 + }, + { + "epoch": 0.004502093443714911, + "grad_norm": 4.1758246421813965, + "learning_rate": 4.9997506075054776e-05, + "loss": 7.3417, + "step": 757 + }, + { + "epoch": 0.004508040726995908, + "grad_norm": 2.8398962020874023, + "learning_rate": 4.999749947311625e-05, + "loss": 7.107, + "step": 758 + }, + { + "epoch": 0.004513988010276905, + "grad_norm": 3.487478017807007, + "learning_rate": 4.9997492862451354e-05, + "loss": 7.0014, + "step": 759 + }, + { + "epoch": 0.004519935293557903, + "grad_norm": 2.883409261703491, + "learning_rate": 4.999748624306009e-05, + "loss": 7.4691, + "step": 760 + }, + { + "epoch": 0.0045258825768389, + "grad_norm": 3.0092155933380127, + "learning_rate": 4.999747961494246e-05, + "loss": 7.3771, + "step": 761 + }, + { + "epoch": 0.004531829860119897, + "grad_norm": 2.9571943283081055, + "learning_rate": 4.999747297809847e-05, + "loss": 7.4664, + "step": 762 + }, + { + "epoch": 0.004537777143400894, + "grad_norm": 2.7476816177368164, + "learning_rate": 4.999746633252812e-05, + "loss": 7.2943, + "step": 763 + }, + { + "epoch": 0.004543724426681892, + "grad_norm": 4.903059959411621, + "learning_rate": 4.9997459678231415e-05, + "loss": 7.3467, + "step": 764 + }, + { + "epoch": 0.004549671709962889, + "grad_norm": 3.8205373287200928, + "learning_rate": 4.999745301520835e-05, + "loss": 7.2807, + "step": 765 + }, + { + "epoch": 0.0045556189932438864, + "grad_norm": 2.6003127098083496, + "learning_rate": 4.9997446343458934e-05, + "loss": 7.2736, + "step": 766 + }, + { + "epoch": 0.004561566276524883, + "grad_norm": 3.288313627243042, + "learning_rate": 4.999743966298317e-05, + "loss": 7.3832, + "step": 767 + }, + { + "epoch": 0.004567513559805881, + "grad_norm": 3.4839234352111816, + "learning_rate": 4.999743297378106e-05, + "loss": 7.2932, + "step": 768 + }, + { + "epoch": 0.004573460843086878, + "grad_norm": 3.2667462825775146, + "learning_rate": 4.99974262758526e-05, + "loss": 7.4855, + "step": 769 + }, + { + "epoch": 0.0045794081263678755, + "grad_norm": 3.3637850284576416, + "learning_rate": 4.99974195691978e-05, + "loss": 7.4864, + "step": 770 + }, + { + "epoch": 0.004585355409648872, + "grad_norm": 4.691596508026123, + "learning_rate": 4.999741285381666e-05, + "loss": 7.4751, + "step": 771 + }, + { + "epoch": 0.00459130269292987, + "grad_norm": 3.8831942081451416, + "learning_rate": 4.999740612970918e-05, + "loss": 7.4554, + "step": 772 + }, + { + "epoch": 0.004597249976210867, + "grad_norm": 2.9129562377929688, + "learning_rate": 4.999739939687536e-05, + "loss": 7.7096, + "step": 773 + }, + { + "epoch": 0.0046031972594918645, + "grad_norm": 3.928882598876953, + "learning_rate": 4.9997392655315207e-05, + "loss": 7.6453, + "step": 774 + }, + { + "epoch": 0.004609144542772861, + "grad_norm": 4.19191312789917, + "learning_rate": 4.9997385905028726e-05, + "loss": 7.6038, + "step": 775 + }, + { + "epoch": 0.004615091826053859, + "grad_norm": 2.4585883617401123, + "learning_rate": 4.999737914601591e-05, + "loss": 7.5734, + "step": 776 + }, + { + "epoch": 0.004621039109334856, + "grad_norm": 3.500932455062866, + "learning_rate": 4.9997372378276776e-05, + "loss": 7.6535, + "step": 777 + }, + { + "epoch": 0.0046269863926158535, + "grad_norm": 3.1256210803985596, + "learning_rate": 4.9997365601811306e-05, + "loss": 7.4844, + "step": 778 + }, + { + "epoch": 0.00463293367589685, + "grad_norm": 2.083902597427368, + "learning_rate": 4.999735881661952e-05, + "loss": 7.646, + "step": 779 + }, + { + "epoch": 0.004638880959177847, + "grad_norm": 2.2990450859069824, + "learning_rate": 4.999735202270142e-05, + "loss": 7.5756, + "step": 780 + }, + { + "epoch": 0.004644828242458845, + "grad_norm": 2.782463550567627, + "learning_rate": 4.9997345220057004e-05, + "loss": 7.6191, + "step": 781 + }, + { + "epoch": 0.004650775525739842, + "grad_norm": 4.157378673553467, + "learning_rate": 4.9997338408686255e-05, + "loss": 7.5265, + "step": 782 + }, + { + "epoch": 0.004656722809020839, + "grad_norm": 2.850106716156006, + "learning_rate": 4.999733158858921e-05, + "loss": 7.4562, + "step": 783 + }, + { + "epoch": 0.004662670092301836, + "grad_norm": 2.8073840141296387, + "learning_rate": 4.999732475976585e-05, + "loss": 7.3913, + "step": 784 + }, + { + "epoch": 0.004668617375582834, + "grad_norm": 2.85048770904541, + "learning_rate": 4.999731792221618e-05, + "loss": 7.3945, + "step": 785 + }, + { + "epoch": 0.004674564658863831, + "grad_norm": 2.760990619659424, + "learning_rate": 4.999731107594021e-05, + "loss": 7.6088, + "step": 786 + }, + { + "epoch": 0.004680511942144828, + "grad_norm": 2.4395666122436523, + "learning_rate": 4.9997304220937933e-05, + "loss": 7.6996, + "step": 787 + }, + { + "epoch": 0.004686459225425825, + "grad_norm": 2.5826008319854736, + "learning_rate": 4.9997297357209354e-05, + "loss": 7.5888, + "step": 788 + }, + { + "epoch": 0.004692406508706823, + "grad_norm": 3.434957981109619, + "learning_rate": 4.999729048475448e-05, + "loss": 7.4659, + "step": 789 + }, + { + "epoch": 0.00469835379198782, + "grad_norm": 4.103111743927002, + "learning_rate": 4.9997283603573306e-05, + "loss": 7.6704, + "step": 790 + }, + { + "epoch": 0.004704301075268817, + "grad_norm": 3.7879343032836914, + "learning_rate": 4.999727671366584e-05, + "loss": 7.5387, + "step": 791 + }, + { + "epoch": 0.004710248358549814, + "grad_norm": 3.706599235534668, + "learning_rate": 4.999726981503209e-05, + "loss": 7.3413, + "step": 792 + }, + { + "epoch": 0.004716195641830812, + "grad_norm": 2.1999869346618652, + "learning_rate": 4.999726290767204e-05, + "loss": 7.1809, + "step": 793 + }, + { + "epoch": 0.004722142925111809, + "grad_norm": 2.8561251163482666, + "learning_rate": 4.999725599158571e-05, + "loss": 7.3496, + "step": 794 + }, + { + "epoch": 0.004728090208392806, + "grad_norm": 3.0696613788604736, + "learning_rate": 4.99972490667731e-05, + "loss": 7.542, + "step": 795 + }, + { + "epoch": 0.004734037491673803, + "grad_norm": 2.706404685974121, + "learning_rate": 4.99972421332342e-05, + "loss": 7.4233, + "step": 796 + }, + { + "epoch": 0.004739984774954801, + "grad_norm": 2.388360023498535, + "learning_rate": 4.9997235190969025e-05, + "loss": 7.5754, + "step": 797 + }, + { + "epoch": 0.004745932058235798, + "grad_norm": 2.3414177894592285, + "learning_rate": 4.999722823997758e-05, + "loss": 7.438, + "step": 798 + }, + { + "epoch": 0.004751879341516795, + "grad_norm": 2.46012544631958, + "learning_rate": 4.999722128025985e-05, + "loss": 6.9522, + "step": 799 + }, + { + "epoch": 0.004757826624797792, + "grad_norm": 2.5721335411071777, + "learning_rate": 4.9997214311815855e-05, + "loss": 6.9632, + "step": 800 + }, + { + "epoch": 0.00476377390807879, + "grad_norm": 2.4028279781341553, + "learning_rate": 4.999720733464559e-05, + "loss": 7.3834, + "step": 801 + }, + { + "epoch": 0.004769721191359787, + "grad_norm": 2.378971576690674, + "learning_rate": 4.9997200348749055e-05, + "loss": 7.7919, + "step": 802 + }, + { + "epoch": 0.004775668474640784, + "grad_norm": 2.1871516704559326, + "learning_rate": 4.999719335412626e-05, + "loss": 7.6832, + "step": 803 + }, + { + "epoch": 0.004781615757921781, + "grad_norm": 2.4183239936828613, + "learning_rate": 4.9997186350777206e-05, + "loss": 7.5013, + "step": 804 + }, + { + "epoch": 0.004787563041202779, + "grad_norm": 2.2322120666503906, + "learning_rate": 4.9997179338701884e-05, + "loss": 7.4224, + "step": 805 + }, + { + "epoch": 0.004793510324483776, + "grad_norm": 3.2633447647094727, + "learning_rate": 4.99971723179003e-05, + "loss": 7.1966, + "step": 806 + }, + { + "epoch": 0.004799457607764773, + "grad_norm": 3.1195995807647705, + "learning_rate": 4.999716528837247e-05, + "loss": 7.4057, + "step": 807 + }, + { + "epoch": 0.00480540489104577, + "grad_norm": 2.6904098987579346, + "learning_rate": 4.9997158250118395e-05, + "loss": 7.4585, + "step": 808 + }, + { + "epoch": 0.004811352174326768, + "grad_norm": 2.6955599784851074, + "learning_rate": 4.999715120313806e-05, + "loss": 7.6053, + "step": 809 + }, + { + "epoch": 0.004817299457607765, + "grad_norm": 3.569037675857544, + "learning_rate": 4.999714414743148e-05, + "loss": 7.5085, + "step": 810 + }, + { + "epoch": 0.004823246740888762, + "grad_norm": 3.5231528282165527, + "learning_rate": 4.9997137082998655e-05, + "loss": 7.4554, + "step": 811 + }, + { + "epoch": 0.004829194024169759, + "grad_norm": 2.7118120193481445, + "learning_rate": 4.999713000983959e-05, + "loss": 7.4323, + "step": 812 + }, + { + "epoch": 0.004835141307450756, + "grad_norm": 3.229548931121826, + "learning_rate": 4.9997122927954284e-05, + "loss": 7.3098, + "step": 813 + }, + { + "epoch": 0.004841088590731754, + "grad_norm": 2.4224696159362793, + "learning_rate": 4.999711583734273e-05, + "loss": 7.3488, + "step": 814 + }, + { + "epoch": 0.004847035874012751, + "grad_norm": 2.627565383911133, + "learning_rate": 4.999710873800496e-05, + "loss": 7.457, + "step": 815 + }, + { + "epoch": 0.004852983157293748, + "grad_norm": 2.5339515209198, + "learning_rate": 4.999710162994094e-05, + "loss": 7.6602, + "step": 816 + }, + { + "epoch": 0.004858930440574745, + "grad_norm": 2.663694143295288, + "learning_rate": 4.9997094513150706e-05, + "loss": 7.1064, + "step": 817 + }, + { + "epoch": 0.004864877723855743, + "grad_norm": 2.372504472732544, + "learning_rate": 4.9997087387634234e-05, + "loss": 7.341, + "step": 818 + }, + { + "epoch": 0.00487082500713674, + "grad_norm": 2.145191192626953, + "learning_rate": 4.999708025339154e-05, + "loss": 7.3216, + "step": 819 + }, + { + "epoch": 0.004876772290417737, + "grad_norm": 2.39685320854187, + "learning_rate": 4.9997073110422626e-05, + "loss": 7.3463, + "step": 820 + }, + { + "epoch": 0.004882719573698734, + "grad_norm": 2.2227275371551514, + "learning_rate": 4.999706595872749e-05, + "loss": 7.2517, + "step": 821 + }, + { + "epoch": 0.004888666856979732, + "grad_norm": 2.7770352363586426, + "learning_rate": 4.999705879830614e-05, + "loss": 7.3117, + "step": 822 + }, + { + "epoch": 0.004894614140260729, + "grad_norm": 2.448026180267334, + "learning_rate": 4.999705162915857e-05, + "loss": 6.9883, + "step": 823 + }, + { + "epoch": 0.004900561423541726, + "grad_norm": 2.2304437160491943, + "learning_rate": 4.999704445128479e-05, + "loss": 7.2644, + "step": 824 + }, + { + "epoch": 0.004906508706822723, + "grad_norm": 2.351707696914673, + "learning_rate": 4.9997037264684796e-05, + "loss": 7.1984, + "step": 825 + }, + { + "epoch": 0.004912455990103721, + "grad_norm": 2.7631921768188477, + "learning_rate": 4.99970300693586e-05, + "loss": 7.3774, + "step": 826 + }, + { + "epoch": 0.004918403273384718, + "grad_norm": 2.4636785984039307, + "learning_rate": 4.9997022865306195e-05, + "loss": 7.3778, + "step": 827 + }, + { + "epoch": 0.004924350556665715, + "grad_norm": 3.5510878562927246, + "learning_rate": 4.999701565252759e-05, + "loss": 7.166, + "step": 828 + }, + { + "epoch": 0.004930297839946712, + "grad_norm": 3.2581429481506348, + "learning_rate": 4.999700843102278e-05, + "loss": 7.286, + "step": 829 + }, + { + "epoch": 0.00493624512322771, + "grad_norm": 2.4304182529449463, + "learning_rate": 4.999700120079178e-05, + "loss": 7.5076, + "step": 830 + }, + { + "epoch": 0.004942192406508707, + "grad_norm": 2.428854465484619, + "learning_rate": 4.999699396183458e-05, + "loss": 7.405, + "step": 831 + }, + { + "epoch": 0.004948139689789704, + "grad_norm": 2.7680416107177734, + "learning_rate": 4.9996986714151195e-05, + "loss": 7.4944, + "step": 832 + }, + { + "epoch": 0.004954086973070701, + "grad_norm": 2.6787109375, + "learning_rate": 4.999697945774161e-05, + "loss": 7.5946, + "step": 833 + }, + { + "epoch": 0.004960034256351699, + "grad_norm": 2.6396615505218506, + "learning_rate": 4.9996972192605845e-05, + "loss": 7.5405, + "step": 834 + }, + { + "epoch": 0.004965981539632696, + "grad_norm": 2.89387583732605, + "learning_rate": 4.999696491874389e-05, + "loss": 7.3809, + "step": 835 + }, + { + "epoch": 0.004971928822913693, + "grad_norm": 2.332838535308838, + "learning_rate": 4.999695763615576e-05, + "loss": 7.3638, + "step": 836 + }, + { + "epoch": 0.00497787610619469, + "grad_norm": 2.2880585193634033, + "learning_rate": 4.9996950344841444e-05, + "loss": 7.3557, + "step": 837 + }, + { + "epoch": 0.004983823389475688, + "grad_norm": 2.7478256225585938, + "learning_rate": 4.999694304480096e-05, + "loss": 7.4, + "step": 838 + }, + { + "epoch": 0.004989770672756685, + "grad_norm": 3.4789531230926514, + "learning_rate": 4.999693573603429e-05, + "loss": 7.4438, + "step": 839 + }, + { + "epoch": 0.004995717956037682, + "grad_norm": 2.7377078533172607, + "learning_rate": 4.9996928418541455e-05, + "loss": 7.4074, + "step": 840 + }, + { + "epoch": 0.005001665239318679, + "grad_norm": 3.04420804977417, + "learning_rate": 4.9996921092322444e-05, + "loss": 7.3834, + "step": 841 + }, + { + "epoch": 0.005007612522599676, + "grad_norm": 2.759244203567505, + "learning_rate": 4.999691375737727e-05, + "loss": 7.4492, + "step": 842 + }, + { + "epoch": 0.005013559805880674, + "grad_norm": 2.5327556133270264, + "learning_rate": 4.9996906413705933e-05, + "loss": 7.4403, + "step": 843 + }, + { + "epoch": 0.0050195070891616705, + "grad_norm": 2.8170409202575684, + "learning_rate": 4.9996899061308434e-05, + "loss": 7.623, + "step": 844 + }, + { + "epoch": 0.005025454372442668, + "grad_norm": 3.8642547130584717, + "learning_rate": 4.9996891700184774e-05, + "loss": 7.6099, + "step": 845 + }, + { + "epoch": 0.005031401655723665, + "grad_norm": 4.704552173614502, + "learning_rate": 4.999688433033496e-05, + "loss": 7.6755, + "step": 846 + }, + { + "epoch": 0.005037348939004663, + "grad_norm": 4.128530979156494, + "learning_rate": 4.9996876951758986e-05, + "loss": 7.5246, + "step": 847 + }, + { + "epoch": 0.0050432962222856596, + "grad_norm": 2.233447551727295, + "learning_rate": 4.9996869564456865e-05, + "loss": 7.1139, + "step": 848 + }, + { + "epoch": 0.005049243505566657, + "grad_norm": 5.96085262298584, + "learning_rate": 4.999686216842859e-05, + "loss": 7.4114, + "step": 849 + }, + { + "epoch": 0.005055190788847654, + "grad_norm": 4.828244686126709, + "learning_rate": 4.9996854763674175e-05, + "loss": 7.6743, + "step": 850 + }, + { + "epoch": 0.005061138072128652, + "grad_norm": 3.0259342193603516, + "learning_rate": 4.999684735019362e-05, + "loss": 7.7537, + "step": 851 + }, + { + "epoch": 0.005067085355409649, + "grad_norm": 2.807244062423706, + "learning_rate": 4.999683992798692e-05, + "loss": 7.7744, + "step": 852 + }, + { + "epoch": 0.005073032638690646, + "grad_norm": 2.81384015083313, + "learning_rate": 4.999683249705408e-05, + "loss": 7.2922, + "step": 853 + }, + { + "epoch": 0.005078979921971643, + "grad_norm": 2.582836627960205, + "learning_rate": 4.9996825057395105e-05, + "loss": 7.3421, + "step": 854 + }, + { + "epoch": 0.005084927205252641, + "grad_norm": 2.190634250640869, + "learning_rate": 4.9996817609009996e-05, + "loss": 7.6249, + "step": 855 + }, + { + "epoch": 0.005090874488533638, + "grad_norm": 2.3322219848632812, + "learning_rate": 4.999681015189875e-05, + "loss": 7.4695, + "step": 856 + }, + { + "epoch": 0.005096821771814635, + "grad_norm": 2.5582947731018066, + "learning_rate": 4.9996802686061384e-05, + "loss": 7.2747, + "step": 857 + }, + { + "epoch": 0.005102769055095632, + "grad_norm": 3.192093849182129, + "learning_rate": 4.999679521149789e-05, + "loss": 7.504, + "step": 858 + }, + { + "epoch": 0.00510871633837663, + "grad_norm": 4.1585588455200195, + "learning_rate": 4.999678772820827e-05, + "loss": 7.5966, + "step": 859 + }, + { + "epoch": 0.005114663621657627, + "grad_norm": 5.052750587463379, + "learning_rate": 4.999678023619253e-05, + "loss": 7.3243, + "step": 860 + }, + { + "epoch": 0.005120610904938624, + "grad_norm": 2.395909070968628, + "learning_rate": 4.999677273545068e-05, + "loss": 7.4477, + "step": 861 + }, + { + "epoch": 0.005126558188219621, + "grad_norm": 2.487334966659546, + "learning_rate": 4.999676522598271e-05, + "loss": 7.591, + "step": 862 + }, + { + "epoch": 0.005132505471500619, + "grad_norm": 3.7094171047210693, + "learning_rate": 4.999675770778863e-05, + "loss": 7.5387, + "step": 863 + }, + { + "epoch": 0.005138452754781616, + "grad_norm": 4.468298435211182, + "learning_rate": 4.9996750180868435e-05, + "loss": 7.5754, + "step": 864 + }, + { + "epoch": 0.005144400038062613, + "grad_norm": 3.2769386768341064, + "learning_rate": 4.999674264522213e-05, + "loss": 7.459, + "step": 865 + }, + { + "epoch": 0.00515034732134361, + "grad_norm": 2.7162864208221436, + "learning_rate": 4.9996735100849726e-05, + "loss": 7.3473, + "step": 866 + }, + { + "epoch": 0.005156294604624608, + "grad_norm": 3.646401882171631, + "learning_rate": 4.999672754775122e-05, + "loss": 7.4446, + "step": 867 + }, + { + "epoch": 0.005162241887905605, + "grad_norm": 8.917684555053711, + "learning_rate": 4.999671998592662e-05, + "loss": 7.2016, + "step": 868 + }, + { + "epoch": 0.005168189171186602, + "grad_norm": 2.949993133544922, + "learning_rate": 4.999671241537591e-05, + "loss": 7.3081, + "step": 869 + }, + { + "epoch": 0.005174136454467599, + "grad_norm": 2.4531025886535645, + "learning_rate": 4.999670483609912e-05, + "loss": 7.402, + "step": 870 + }, + { + "epoch": 0.005180083737748597, + "grad_norm": 3.1903798580169678, + "learning_rate": 4.999669724809623e-05, + "loss": 7.2514, + "step": 871 + }, + { + "epoch": 0.005186031021029594, + "grad_norm": 3.461353302001953, + "learning_rate": 4.999668965136726e-05, + "loss": 7.1637, + "step": 872 + }, + { + "epoch": 0.005191978304310591, + "grad_norm": 2.623075246810913, + "learning_rate": 4.9996682045912194e-05, + "loss": 7.5482, + "step": 873 + }, + { + "epoch": 0.005197925587591588, + "grad_norm": 2.9072840213775635, + "learning_rate": 4.9996674431731044e-05, + "loss": 7.484, + "step": 874 + }, + { + "epoch": 0.005203872870872585, + "grad_norm": 3.0219666957855225, + "learning_rate": 4.999666680882382e-05, + "loss": 7.5223, + "step": 875 + }, + { + "epoch": 0.005209820154153583, + "grad_norm": 2.9892475605010986, + "learning_rate": 4.9996659177190514e-05, + "loss": 7.3843, + "step": 876 + }, + { + "epoch": 0.0052157674374345795, + "grad_norm": 2.6199591159820557, + "learning_rate": 4.9996651536831126e-05, + "loss": 7.2728, + "step": 877 + }, + { + "epoch": 0.005221714720715577, + "grad_norm": 2.6897647380828857, + "learning_rate": 4.999664388774567e-05, + "loss": 7.5323, + "step": 878 + }, + { + "epoch": 0.005227662003996574, + "grad_norm": 3.5945560932159424, + "learning_rate": 4.9996636229934155e-05, + "loss": 7.5001, + "step": 879 + }, + { + "epoch": 0.005233609287277572, + "grad_norm": 2.9064812660217285, + "learning_rate": 4.9996628563396563e-05, + "loss": 7.5463, + "step": 880 + }, + { + "epoch": 0.0052395565705585685, + "grad_norm": 3.6150660514831543, + "learning_rate": 4.999662088813291e-05, + "loss": 7.6596, + "step": 881 + }, + { + "epoch": 0.005245503853839566, + "grad_norm": 2.729684591293335, + "learning_rate": 4.99966132041432e-05, + "loss": 7.5342, + "step": 882 + }, + { + "epoch": 0.005251451137120563, + "grad_norm": 2.6782853603363037, + "learning_rate": 4.9996605511427416e-05, + "loss": 7.5837, + "step": 883 + }, + { + "epoch": 0.005257398420401561, + "grad_norm": 4.171568393707275, + "learning_rate": 4.9996597809985576e-05, + "loss": 7.3626, + "step": 884 + }, + { + "epoch": 0.0052633457036825575, + "grad_norm": 2.189725637435913, + "learning_rate": 4.999659009981769e-05, + "loss": 7.5431, + "step": 885 + }, + { + "epoch": 0.005269292986963555, + "grad_norm": 2.2473320960998535, + "learning_rate": 4.999658238092375e-05, + "loss": 7.4731, + "step": 886 + }, + { + "epoch": 0.005275240270244552, + "grad_norm": 3.4393012523651123, + "learning_rate": 4.999657465330376e-05, + "loss": 7.6839, + "step": 887 + }, + { + "epoch": 0.00528118755352555, + "grad_norm": 2.717742919921875, + "learning_rate": 4.9996566916957735e-05, + "loss": 7.6812, + "step": 888 + }, + { + "epoch": 0.0052871348368065466, + "grad_norm": 3.829698085784912, + "learning_rate": 4.9996559171885655e-05, + "loss": 7.4525, + "step": 889 + }, + { + "epoch": 0.005293082120087544, + "grad_norm": 2.764598846435547, + "learning_rate": 4.9996551418087536e-05, + "loss": 7.5379, + "step": 890 + }, + { + "epoch": 0.005299029403368541, + "grad_norm": 2.4230268001556396, + "learning_rate": 4.999654365556338e-05, + "loss": 7.454, + "step": 891 + }, + { + "epoch": 0.005304976686649539, + "grad_norm": 2.31870436668396, + "learning_rate": 4.999653588431319e-05, + "loss": 7.5306, + "step": 892 + }, + { + "epoch": 0.005310923969930536, + "grad_norm": 2.332259178161621, + "learning_rate": 4.999652810433697e-05, + "loss": 7.4008, + "step": 893 + }, + { + "epoch": 0.005316871253211533, + "grad_norm": 2.630568504333496, + "learning_rate": 4.999652031563471e-05, + "loss": 7.4046, + "step": 894 + }, + { + "epoch": 0.00532281853649253, + "grad_norm": 3.327211856842041, + "learning_rate": 4.999651251820643e-05, + "loss": 7.2901, + "step": 895 + }, + { + "epoch": 0.005328765819773528, + "grad_norm": 2.2383713722229004, + "learning_rate": 4.999650471205213e-05, + "loss": 7.5116, + "step": 896 + }, + { + "epoch": 0.005334713103054525, + "grad_norm": 2.972820997238159, + "learning_rate": 4.99964968971718e-05, + "loss": 7.4013, + "step": 897 + }, + { + "epoch": 0.005340660386335522, + "grad_norm": 2.7254672050476074, + "learning_rate": 4.999648907356545e-05, + "loss": 7.3174, + "step": 898 + }, + { + "epoch": 0.005346607669616519, + "grad_norm": 2.6943607330322266, + "learning_rate": 4.9996481241233096e-05, + "loss": 7.386, + "step": 899 + }, + { + "epoch": 0.005352554952897517, + "grad_norm": 2.9217519760131836, + "learning_rate": 4.999647340017473e-05, + "loss": 7.5398, + "step": 900 + }, + { + "epoch": 0.005358502236178514, + "grad_norm": 2.7950780391693115, + "learning_rate": 4.999646555039034e-05, + "loss": 7.6336, + "step": 901 + }, + { + "epoch": 0.005364449519459511, + "grad_norm": 2.763364553451538, + "learning_rate": 4.999645769187995e-05, + "loss": 7.5161, + "step": 902 + }, + { + "epoch": 0.005370396802740508, + "grad_norm": 2.3095102310180664, + "learning_rate": 4.999644982464355e-05, + "loss": 7.5859, + "step": 903 + }, + { + "epoch": 0.005376344086021506, + "grad_norm": 2.7287917137145996, + "learning_rate": 4.999644194868115e-05, + "loss": 7.3983, + "step": 904 + }, + { + "epoch": 0.005382291369302503, + "grad_norm": 2.6175942420959473, + "learning_rate": 4.999643406399275e-05, + "loss": 7.4278, + "step": 905 + }, + { + "epoch": 0.0053882386525834994, + "grad_norm": 2.3898375034332275, + "learning_rate": 4.999642617057835e-05, + "loss": 7.4537, + "step": 906 + }, + { + "epoch": 0.005394185935864497, + "grad_norm": 2.964381694793701, + "learning_rate": 4.999641826843796e-05, + "loss": 7.3258, + "step": 907 + }, + { + "epoch": 0.005400133219145494, + "grad_norm": 3.1146717071533203, + "learning_rate": 4.999641035757158e-05, + "loss": 7.5412, + "step": 908 + }, + { + "epoch": 0.005406080502426492, + "grad_norm": 3.4733238220214844, + "learning_rate": 4.999640243797921e-05, + "loss": 7.423, + "step": 909 + }, + { + "epoch": 0.0054120277857074885, + "grad_norm": 3.621044158935547, + "learning_rate": 4.999639450966085e-05, + "loss": 7.5885, + "step": 910 + }, + { + "epoch": 0.005417975068988486, + "grad_norm": 2.4800662994384766, + "learning_rate": 4.999638657261651e-05, + "loss": 7.5231, + "step": 911 + }, + { + "epoch": 0.005423922352269483, + "grad_norm": 3.3247363567352295, + "learning_rate": 4.999637862684619e-05, + "loss": 7.2367, + "step": 912 + }, + { + "epoch": 0.005429869635550481, + "grad_norm": 4.293686866760254, + "learning_rate": 4.999637067234989e-05, + "loss": 6.8423, + "step": 913 + }, + { + "epoch": 0.0054358169188314775, + "grad_norm": 2.6713979244232178, + "learning_rate": 4.999636270912762e-05, + "loss": 6.7962, + "step": 914 + }, + { + "epoch": 0.005441764202112475, + "grad_norm": 2.9386653900146484, + "learning_rate": 4.9996354737179376e-05, + "loss": 6.7582, + "step": 915 + }, + { + "epoch": 0.005447711485393472, + "grad_norm": 2.8030481338500977, + "learning_rate": 4.999634675650516e-05, + "loss": 6.6516, + "step": 916 + }, + { + "epoch": 0.00545365876867447, + "grad_norm": 2.7315666675567627, + "learning_rate": 4.9996338767104985e-05, + "loss": 6.6159, + "step": 917 + }, + { + "epoch": 0.0054596060519554665, + "grad_norm": 3.116098403930664, + "learning_rate": 4.999633076897884e-05, + "loss": 7.2121, + "step": 918 + }, + { + "epoch": 0.005465553335236464, + "grad_norm": 2.867687940597534, + "learning_rate": 4.999632276212673e-05, + "loss": 7.5124, + "step": 919 + }, + { + "epoch": 0.005471500618517461, + "grad_norm": 2.9864203929901123, + "learning_rate": 4.9996314746548676e-05, + "loss": 7.5168, + "step": 920 + }, + { + "epoch": 0.005477447901798459, + "grad_norm": 2.9083375930786133, + "learning_rate": 4.9996306722244656e-05, + "loss": 7.5027, + "step": 921 + }, + { + "epoch": 0.0054833951850794555, + "grad_norm": 2.5569801330566406, + "learning_rate": 4.9996298689214686e-05, + "loss": 7.2988, + "step": 922 + }, + { + "epoch": 0.005489342468360453, + "grad_norm": 3.7101242542266846, + "learning_rate": 4.9996290647458765e-05, + "loss": 7.33, + "step": 923 + }, + { + "epoch": 0.00549528975164145, + "grad_norm": 2.848881244659424, + "learning_rate": 4.99962825969769e-05, + "loss": 7.4534, + "step": 924 + }, + { + "epoch": 0.005501237034922448, + "grad_norm": 3.072282075881958, + "learning_rate": 4.999627453776909e-05, + "loss": 7.4398, + "step": 925 + }, + { + "epoch": 0.0055071843182034445, + "grad_norm": 2.8132996559143066, + "learning_rate": 4.999626646983534e-05, + "loss": 7.5617, + "step": 926 + }, + { + "epoch": 0.005513131601484442, + "grad_norm": 2.2710142135620117, + "learning_rate": 4.999625839317565e-05, + "loss": 7.5975, + "step": 927 + }, + { + "epoch": 0.005519078884765439, + "grad_norm": 2.745007276535034, + "learning_rate": 4.9996250307790026e-05, + "loss": 7.4599, + "step": 928 + }, + { + "epoch": 0.005525026168046437, + "grad_norm": 3.2031302452087402, + "learning_rate": 4.999624221367847e-05, + "loss": 7.3528, + "step": 929 + }, + { + "epoch": 0.0055309734513274336, + "grad_norm": 6.417830467224121, + "learning_rate": 4.999623411084098e-05, + "loss": 7.5118, + "step": 930 + }, + { + "epoch": 0.005536920734608431, + "grad_norm": 2.7960314750671387, + "learning_rate": 4.999622599927756e-05, + "loss": 6.5016, + "step": 931 + }, + { + "epoch": 0.005542868017889428, + "grad_norm": 2.959507703781128, + "learning_rate": 4.999621787898822e-05, + "loss": 7.6521, + "step": 932 + }, + { + "epoch": 0.005548815301170426, + "grad_norm": 3.328834056854248, + "learning_rate": 4.999620974997296e-05, + "loss": 7.6267, + "step": 933 + }, + { + "epoch": 0.005554762584451423, + "grad_norm": 2.5232200622558594, + "learning_rate": 4.9996201612231786e-05, + "loss": 7.471, + "step": 934 + }, + { + "epoch": 0.00556070986773242, + "grad_norm": 2.2766942977905273, + "learning_rate": 4.999619346576468e-05, + "loss": 7.4204, + "step": 935 + }, + { + "epoch": 0.005566657151013417, + "grad_norm": 2.584068536758423, + "learning_rate": 4.999618531057168e-05, + "loss": 7.4384, + "step": 936 + }, + { + "epoch": 0.005572604434294414, + "grad_norm": 3.004523277282715, + "learning_rate": 4.999617714665276e-05, + "loss": 7.5681, + "step": 937 + }, + { + "epoch": 0.005578551717575412, + "grad_norm": 4.102936267852783, + "learning_rate": 4.999616897400794e-05, + "loss": 7.4571, + "step": 938 + }, + { + "epoch": 0.005584499000856408, + "grad_norm": 2.745293378829956, + "learning_rate": 4.99961607926372e-05, + "loss": 7.588, + "step": 939 + }, + { + "epoch": 0.005590446284137406, + "grad_norm": 2.9720282554626465, + "learning_rate": 4.9996152602540576e-05, + "loss": 7.4761, + "step": 940 + }, + { + "epoch": 0.005596393567418403, + "grad_norm": 3.150047540664673, + "learning_rate": 4.999614440371805e-05, + "loss": 7.4525, + "step": 941 + }, + { + "epoch": 0.005602340850699401, + "grad_norm": 2.6735856533050537, + "learning_rate": 4.999613619616962e-05, + "loss": 7.2754, + "step": 942 + }, + { + "epoch": 0.005608288133980397, + "grad_norm": 2.6451661586761475, + "learning_rate": 4.9996127979895304e-05, + "loss": 7.5742, + "step": 943 + }, + { + "epoch": 0.005614235417261395, + "grad_norm": 2.7551536560058594, + "learning_rate": 4.9996119754895095e-05, + "loss": 7.4981, + "step": 944 + }, + { + "epoch": 0.005620182700542392, + "grad_norm": 2.7445640563964844, + "learning_rate": 4.9996111521168995e-05, + "loss": 7.4761, + "step": 945 + }, + { + "epoch": 0.00562612998382339, + "grad_norm": 2.537924289703369, + "learning_rate": 4.9996103278717013e-05, + "loss": 7.5483, + "step": 946 + }, + { + "epoch": 0.0056320772671043864, + "grad_norm": 3.503661632537842, + "learning_rate": 4.9996095027539156e-05, + "loss": 7.3074, + "step": 947 + }, + { + "epoch": 0.005638024550385384, + "grad_norm": 2.8088479042053223, + "learning_rate": 4.999608676763542e-05, + "loss": 7.5675, + "step": 948 + }, + { + "epoch": 0.005643971833666381, + "grad_norm": 2.6219863891601562, + "learning_rate": 4.99960784990058e-05, + "loss": 7.6037, + "step": 949 + }, + { + "epoch": 0.005649919116947379, + "grad_norm": 2.88737416267395, + "learning_rate": 4.999607022165031e-05, + "loss": 7.4815, + "step": 950 + }, + { + "epoch": 0.0056558664002283755, + "grad_norm": 2.455707550048828, + "learning_rate": 4.999606193556895e-05, + "loss": 7.553, + "step": 951 + }, + { + "epoch": 0.005661813683509373, + "grad_norm": 2.2502405643463135, + "learning_rate": 4.999605364076173e-05, + "loss": 7.387, + "step": 952 + }, + { + "epoch": 0.00566776096679037, + "grad_norm": 2.754972457885742, + "learning_rate": 4.9996045337228635e-05, + "loss": 7.3088, + "step": 953 + }, + { + "epoch": 0.005673708250071368, + "grad_norm": 3.111553192138672, + "learning_rate": 4.9996037024969686e-05, + "loss": 7.5063, + "step": 954 + }, + { + "epoch": 0.0056796555333523645, + "grad_norm": 2.4000720977783203, + "learning_rate": 4.9996028703984875e-05, + "loss": 7.5705, + "step": 955 + }, + { + "epoch": 0.005685602816633362, + "grad_norm": 2.495659351348877, + "learning_rate": 4.9996020374274215e-05, + "loss": 7.5421, + "step": 956 + }, + { + "epoch": 0.005691550099914359, + "grad_norm": 3.025509834289551, + "learning_rate": 4.99960120358377e-05, + "loss": 7.5406, + "step": 957 + }, + { + "epoch": 0.005697497383195357, + "grad_norm": 2.224342107772827, + "learning_rate": 4.999600368867533e-05, + "loss": 7.4323, + "step": 958 + }, + { + "epoch": 0.0057034446664763535, + "grad_norm": 2.661423683166504, + "learning_rate": 4.999599533278712e-05, + "loss": 7.565, + "step": 959 + }, + { + "epoch": 0.005709391949757351, + "grad_norm": 2.503293037414551, + "learning_rate": 4.999598696817307e-05, + "loss": 7.3552, + "step": 960 + }, + { + "epoch": 0.005715339233038348, + "grad_norm": 2.2878923416137695, + "learning_rate": 4.999597859483316e-05, + "loss": 7.4542, + "step": 961 + }, + { + "epoch": 0.005721286516319346, + "grad_norm": 2.759594678878784, + "learning_rate": 4.999597021276743e-05, + "loss": 7.2349, + "step": 962 + }, + { + "epoch": 0.0057272337996003425, + "grad_norm": 4.5453314781188965, + "learning_rate": 4.999596182197586e-05, + "loss": 7.4728, + "step": 963 + }, + { + "epoch": 0.00573318108288134, + "grad_norm": 2.4369568824768066, + "learning_rate": 4.999595342245846e-05, + "loss": 7.4396, + "step": 964 + }, + { + "epoch": 0.005739128366162337, + "grad_norm": 2.4081692695617676, + "learning_rate": 4.999594501421523e-05, + "loss": 7.536, + "step": 965 + }, + { + "epoch": 0.005745075649443335, + "grad_norm": 3.0494678020477295, + "learning_rate": 4.9995936597246176e-05, + "loss": 7.4061, + "step": 966 + }, + { + "epoch": 0.0057510229327243315, + "grad_norm": 3.3492188453674316, + "learning_rate": 4.999592817155129e-05, + "loss": 7.5419, + "step": 967 + }, + { + "epoch": 0.005756970216005328, + "grad_norm": 2.254714012145996, + "learning_rate": 4.999591973713059e-05, + "loss": 7.4568, + "step": 968 + }, + { + "epoch": 0.005762917499286326, + "grad_norm": 2.3336634635925293, + "learning_rate": 4.999591129398407e-05, + "loss": 7.4386, + "step": 969 + }, + { + "epoch": 0.005768864782567323, + "grad_norm": 2.545154094696045, + "learning_rate": 4.999590284211174e-05, + "loss": 7.226, + "step": 970 + }, + { + "epoch": 0.0057748120658483205, + "grad_norm": 2.891068458557129, + "learning_rate": 4.99958943815136e-05, + "loss": 7.4235, + "step": 971 + }, + { + "epoch": 0.005780759349129317, + "grad_norm": 3.0321712493896484, + "learning_rate": 4.999588591218964e-05, + "loss": 7.2918, + "step": 972 + }, + { + "epoch": 0.005786706632410315, + "grad_norm": 2.935490846633911, + "learning_rate": 4.9995877434139884e-05, + "loss": 7.4172, + "step": 973 + }, + { + "epoch": 0.005792653915691312, + "grad_norm": 3.0021424293518066, + "learning_rate": 4.9995868947364324e-05, + "loss": 7.521, + "step": 974 + }, + { + "epoch": 0.0057986011989723096, + "grad_norm": 2.2784783840179443, + "learning_rate": 4.9995860451862964e-05, + "loss": 7.5716, + "step": 975 + }, + { + "epoch": 0.005804548482253306, + "grad_norm": 2.9321484565734863, + "learning_rate": 4.999585194763581e-05, + "loss": 7.0965, + "step": 976 + }, + { + "epoch": 0.005810495765534304, + "grad_norm": 2.284874439239502, + "learning_rate": 4.999584343468285e-05, + "loss": 7.4376, + "step": 977 + }, + { + "epoch": 0.005816443048815301, + "grad_norm": 2.2066683769226074, + "learning_rate": 4.9995834913004115e-05, + "loss": 7.4478, + "step": 978 + }, + { + "epoch": 0.005822390332096299, + "grad_norm": 2.286323070526123, + "learning_rate": 4.999582638259959e-05, + "loss": 7.4139, + "step": 979 + }, + { + "epoch": 0.005828337615377295, + "grad_norm": 2.5052928924560547, + "learning_rate": 4.999581784346927e-05, + "loss": 7.4278, + "step": 980 + }, + { + "epoch": 0.005834284898658293, + "grad_norm": 2.273698091506958, + "learning_rate": 4.9995809295613175e-05, + "loss": 7.4019, + "step": 981 + }, + { + "epoch": 0.00584023218193929, + "grad_norm": 2.729466676712036, + "learning_rate": 4.999580073903129e-05, + "loss": 7.4716, + "step": 982 + }, + { + "epoch": 0.005846179465220288, + "grad_norm": 2.5776185989379883, + "learning_rate": 4.999579217372365e-05, + "loss": 7.4708, + "step": 983 + }, + { + "epoch": 0.005852126748501284, + "grad_norm": 2.4125893115997314, + "learning_rate": 4.9995783599690226e-05, + "loss": 7.4505, + "step": 984 + }, + { + "epoch": 0.005858074031782282, + "grad_norm": 2.975911855697632, + "learning_rate": 4.9995775016931035e-05, + "loss": 7.4095, + "step": 985 + }, + { + "epoch": 0.005864021315063279, + "grad_norm": 2.4155962467193604, + "learning_rate": 4.9995766425446076e-05, + "loss": 7.3084, + "step": 986 + }, + { + "epoch": 0.005869968598344277, + "grad_norm": 2.436950922012329, + "learning_rate": 4.999575782523535e-05, + "loss": 7.2782, + "step": 987 + }, + { + "epoch": 0.0058759158816252734, + "grad_norm": 2.2371575832366943, + "learning_rate": 4.999574921629887e-05, + "loss": 7.3879, + "step": 988 + }, + { + "epoch": 0.005881863164906271, + "grad_norm": 2.3079733848571777, + "learning_rate": 4.999574059863663e-05, + "loss": 7.5117, + "step": 989 + }, + { + "epoch": 0.005887810448187268, + "grad_norm": 2.4018514156341553, + "learning_rate": 4.9995731972248626e-05, + "loss": 7.4486, + "step": 990 + }, + { + "epoch": 0.005893757731468266, + "grad_norm": 2.3437294960021973, + "learning_rate": 4.9995723337134884e-05, + "loss": 7.461, + "step": 991 + }, + { + "epoch": 0.0058997050147492625, + "grad_norm": 3.15254545211792, + "learning_rate": 4.999571469329538e-05, + "loss": 7.014, + "step": 992 + }, + { + "epoch": 0.00590565229803026, + "grad_norm": 2.4809768199920654, + "learning_rate": 4.999570604073014e-05, + "loss": 7.4339, + "step": 993 + }, + { + "epoch": 0.005911599581311257, + "grad_norm": 3.4286630153656006, + "learning_rate": 4.9995697379439154e-05, + "loss": 7.3086, + "step": 994 + }, + { + "epoch": 0.005917546864592255, + "grad_norm": 3.9362127780914307, + "learning_rate": 4.999568870942243e-05, + "loss": 7.2635, + "step": 995 + }, + { + "epoch": 0.0059234941478732515, + "grad_norm": 2.6632091999053955, + "learning_rate": 4.9995680030679965e-05, + "loss": 7.2779, + "step": 996 + }, + { + "epoch": 0.005929441431154249, + "grad_norm": 5.218096733093262, + "learning_rate": 4.999567134321177e-05, + "loss": 7.4285, + "step": 997 + }, + { + "epoch": 0.005935388714435246, + "grad_norm": 3.441894769668579, + "learning_rate": 4.9995662647017835e-05, + "loss": 7.5576, + "step": 998 + }, + { + "epoch": 0.005941335997716243, + "grad_norm": 2.560178279876709, + "learning_rate": 4.9995653942098184e-05, + "loss": 7.5692, + "step": 999 + }, + { + "epoch": 0.0059472832809972405, + "grad_norm": 2.458313226699829, + "learning_rate": 4.999564522845281e-05, + "loss": 7.0495, + "step": 1000 + }, + { + "epoch": 0.005953230564278237, + "grad_norm": 2.539314031600952, + "learning_rate": 4.999563650608171e-05, + "loss": 7.1919, + "step": 1001 + }, + { + "epoch": 0.005959177847559235, + "grad_norm": 3.6134390830993652, + "learning_rate": 4.999562777498489e-05, + "loss": 7.0725, + "step": 1002 + }, + { + "epoch": 0.005965125130840232, + "grad_norm": 2.6582295894622803, + "learning_rate": 4.9995619035162355e-05, + "loss": 7.3008, + "step": 1003 + }, + { + "epoch": 0.0059710724141212295, + "grad_norm": 2.4968035221099854, + "learning_rate": 4.999561028661411e-05, + "loss": 7.2862, + "step": 1004 + }, + { + "epoch": 0.005977019697402226, + "grad_norm": 3.002840042114258, + "learning_rate": 4.999560152934015e-05, + "loss": 7.1721, + "step": 1005 + }, + { + "epoch": 0.005982966980683224, + "grad_norm": 3.4327914714813232, + "learning_rate": 4.999559276334049e-05, + "loss": 7.242, + "step": 1006 + }, + { + "epoch": 0.005988914263964221, + "grad_norm": 2.4082493782043457, + "learning_rate": 4.999558398861513e-05, + "loss": 7.1588, + "step": 1007 + }, + { + "epoch": 0.0059948615472452185, + "grad_norm": 2.39475417137146, + "learning_rate": 4.9995575205164056e-05, + "loss": 7.1713, + "step": 1008 + }, + { + "epoch": 0.006000808830526215, + "grad_norm": 2.946331024169922, + "learning_rate": 4.99955664129873e-05, + "loss": 7.1553, + "step": 1009 + }, + { + "epoch": 0.006006756113807213, + "grad_norm": 2.4334871768951416, + "learning_rate": 4.999555761208484e-05, + "loss": 7.1898, + "step": 1010 + }, + { + "epoch": 0.00601270339708821, + "grad_norm": 2.3159971237182617, + "learning_rate": 4.999554880245669e-05, + "loss": 7.0642, + "step": 1011 + }, + { + "epoch": 0.0060186506803692075, + "grad_norm": 2.9773905277252197, + "learning_rate": 4.9995539984102854e-05, + "loss": 7.3285, + "step": 1012 + }, + { + "epoch": 0.006024597963650204, + "grad_norm": 3.444267749786377, + "learning_rate": 4.999553115702334e-05, + "loss": 7.1263, + "step": 1013 + }, + { + "epoch": 0.006030545246931202, + "grad_norm": 2.6518173217773438, + "learning_rate": 4.9995522321218136e-05, + "loss": 7.3915, + "step": 1014 + }, + { + "epoch": 0.006036492530212199, + "grad_norm": 2.46230149269104, + "learning_rate": 4.9995513476687254e-05, + "loss": 7.1808, + "step": 1015 + }, + { + "epoch": 0.0060424398134931966, + "grad_norm": 2.2243192195892334, + "learning_rate": 4.99955046234307e-05, + "loss": 7.4262, + "step": 1016 + }, + { + "epoch": 0.006048387096774193, + "grad_norm": 3.0834670066833496, + "learning_rate": 4.999549576144847e-05, + "loss": 7.4028, + "step": 1017 + }, + { + "epoch": 0.006054334380055191, + "grad_norm": 3.2453930377960205, + "learning_rate": 4.9995486890740573e-05, + "loss": 7.5537, + "step": 1018 + }, + { + "epoch": 0.006060281663336188, + "grad_norm": 2.7142229080200195, + "learning_rate": 4.9995478011307015e-05, + "loss": 7.4131, + "step": 1019 + }, + { + "epoch": 0.006066228946617186, + "grad_norm": 2.9567463397979736, + "learning_rate": 4.9995469123147784e-05, + "loss": 7.5969, + "step": 1020 + }, + { + "epoch": 0.006072176229898182, + "grad_norm": 2.5698695182800293, + "learning_rate": 4.99954602262629e-05, + "loss": 7.2721, + "step": 1021 + }, + { + "epoch": 0.00607812351317918, + "grad_norm": 2.3958864212036133, + "learning_rate": 4.999545132065235e-05, + "loss": 7.3414, + "step": 1022 + }, + { + "epoch": 0.006084070796460177, + "grad_norm": 2.528024911880493, + "learning_rate": 4.9995442406316156e-05, + "loss": 7.2821, + "step": 1023 + }, + { + "epoch": 0.006090018079741175, + "grad_norm": 2.6904075145721436, + "learning_rate": 4.999543348325431e-05, + "loss": 7.3726, + "step": 1024 + }, + { + "epoch": 0.006095965363022171, + "grad_norm": 2.8618202209472656, + "learning_rate": 4.999542455146681e-05, + "loss": 7.4232, + "step": 1025 + }, + { + "epoch": 0.006101912646303169, + "grad_norm": 1.978455662727356, + "learning_rate": 4.999541561095367e-05, + "loss": 7.5949, + "step": 1026 + }, + { + "epoch": 0.006107859929584166, + "grad_norm": 2.882568836212158, + "learning_rate": 4.999540666171489e-05, + "loss": 7.4868, + "step": 1027 + }, + { + "epoch": 0.006113807212865164, + "grad_norm": 2.9586474895477295, + "learning_rate": 4.999539770375047e-05, + "loss": 7.1556, + "step": 1028 + }, + { + "epoch": 0.00611975449614616, + "grad_norm": 2.5675363540649414, + "learning_rate": 4.999538873706041e-05, + "loss": 7.3306, + "step": 1029 + }, + { + "epoch": 0.006125701779427157, + "grad_norm": 3.440857410430908, + "learning_rate": 4.999537976164472e-05, + "loss": 7.3654, + "step": 1030 + }, + { + "epoch": 0.006131649062708155, + "grad_norm": 3.7741217613220215, + "learning_rate": 4.999537077750341e-05, + "loss": 6.8088, + "step": 1031 + }, + { + "epoch": 0.006137596345989152, + "grad_norm": 3.801609754562378, + "learning_rate": 4.999536178463647e-05, + "loss": 6.989, + "step": 1032 + }, + { + "epoch": 0.0061435436292701495, + "grad_norm": 2.627225875854492, + "learning_rate": 4.9995352783043905e-05, + "loss": 7.4066, + "step": 1033 + }, + { + "epoch": 0.006149490912551146, + "grad_norm": 3.3529040813446045, + "learning_rate": 4.9995343772725725e-05, + "loss": 7.0403, + "step": 1034 + }, + { + "epoch": 0.006155438195832144, + "grad_norm": 3.248558521270752, + "learning_rate": 4.999533475368192e-05, + "loss": 7.2664, + "step": 1035 + }, + { + "epoch": 0.006161385479113141, + "grad_norm": 3.1260814666748047, + "learning_rate": 4.9995325725912515e-05, + "loss": 7.3257, + "step": 1036 + }, + { + "epoch": 0.0061673327623941385, + "grad_norm": 2.379659414291382, + "learning_rate": 4.999531668941748e-05, + "loss": 7.4448, + "step": 1037 + }, + { + "epoch": 0.006173280045675135, + "grad_norm": 2.8478498458862305, + "learning_rate": 4.999530764419685e-05, + "loss": 7.3892, + "step": 1038 + }, + { + "epoch": 0.006179227328956133, + "grad_norm": 4.104954719543457, + "learning_rate": 4.999529859025062e-05, + "loss": 7.5172, + "step": 1039 + }, + { + "epoch": 0.00618517461223713, + "grad_norm": 2.50160813331604, + "learning_rate": 4.999528952757879e-05, + "loss": 7.1894, + "step": 1040 + }, + { + "epoch": 0.0061911218955181275, + "grad_norm": 2.5545871257781982, + "learning_rate": 4.999528045618136e-05, + "loss": 7.3892, + "step": 1041 + }, + { + "epoch": 0.006197069178799124, + "grad_norm": 2.9980626106262207, + "learning_rate": 4.999527137605833e-05, + "loss": 7.3517, + "step": 1042 + }, + { + "epoch": 0.006203016462080122, + "grad_norm": 2.5920562744140625, + "learning_rate": 4.999526228720971e-05, + "loss": 7.1716, + "step": 1043 + }, + { + "epoch": 0.006208963745361119, + "grad_norm": 2.5224244594573975, + "learning_rate": 4.999525318963551e-05, + "loss": 7.1892, + "step": 1044 + }, + { + "epoch": 0.0062149110286421165, + "grad_norm": 2.7092106342315674, + "learning_rate": 4.999524408333572e-05, + "loss": 7.178, + "step": 1045 + }, + { + "epoch": 0.006220858311923113, + "grad_norm": 2.523320198059082, + "learning_rate": 4.999523496831035e-05, + "loss": 7.1486, + "step": 1046 + }, + { + "epoch": 0.006226805595204111, + "grad_norm": 2.4491217136383057, + "learning_rate": 4.99952258445594e-05, + "loss": 7.121, + "step": 1047 + }, + { + "epoch": 0.006232752878485108, + "grad_norm": 2.29109263420105, + "learning_rate": 4.9995216712082875e-05, + "loss": 7.4323, + "step": 1048 + }, + { + "epoch": 0.0062387001617661055, + "grad_norm": 2.5234057903289795, + "learning_rate": 4.9995207570880783e-05, + "loss": 7.1552, + "step": 1049 + }, + { + "epoch": 0.006244647445047102, + "grad_norm": 2.301316499710083, + "learning_rate": 4.9995198420953115e-05, + "loss": 7.3625, + "step": 1050 + }, + { + "epoch": 0.0062505947283281, + "grad_norm": 2.4358527660369873, + "learning_rate": 4.999518926229989e-05, + "loss": 7.2462, + "step": 1051 + }, + { + "epoch": 0.006256542011609097, + "grad_norm": 2.3915181159973145, + "learning_rate": 4.999518009492109e-05, + "loss": 7.173, + "step": 1052 + }, + { + "epoch": 0.0062624892948900945, + "grad_norm": 2.5529091358184814, + "learning_rate": 4.999517091881674e-05, + "loss": 7.2463, + "step": 1053 + }, + { + "epoch": 0.006268436578171091, + "grad_norm": 3.235435724258423, + "learning_rate": 4.999516173398683e-05, + "loss": 7.1149, + "step": 1054 + }, + { + "epoch": 0.006274383861452089, + "grad_norm": 2.692140817642212, + "learning_rate": 4.9995152540431375e-05, + "loss": 7.3554, + "step": 1055 + }, + { + "epoch": 0.006280331144733086, + "grad_norm": 2.910116195678711, + "learning_rate": 4.999514333815036e-05, + "loss": 7.4424, + "step": 1056 + }, + { + "epoch": 0.0062862784280140836, + "grad_norm": 2.897463798522949, + "learning_rate": 4.9995134127143804e-05, + "loss": 7.2345, + "step": 1057 + }, + { + "epoch": 0.00629222571129508, + "grad_norm": 2.5925514698028564, + "learning_rate": 4.999512490741171e-05, + "loss": 7.1539, + "step": 1058 + }, + { + "epoch": 0.006298172994576078, + "grad_norm": 2.693816900253296, + "learning_rate": 4.999511567895407e-05, + "loss": 7.0905, + "step": 1059 + }, + { + "epoch": 0.006304120277857075, + "grad_norm": 3.3717474937438965, + "learning_rate": 4.9995106441770896e-05, + "loss": 7.1407, + "step": 1060 + }, + { + "epoch": 0.006310067561138072, + "grad_norm": 2.6128973960876465, + "learning_rate": 4.999509719586218e-05, + "loss": 7.2748, + "step": 1061 + }, + { + "epoch": 0.006316014844419069, + "grad_norm": 2.24324369430542, + "learning_rate": 4.999508794122795e-05, + "loss": 7.2553, + "step": 1062 + }, + { + "epoch": 0.006321962127700066, + "grad_norm": 2.7593698501586914, + "learning_rate": 4.999507867786818e-05, + "loss": 7.1039, + "step": 1063 + }, + { + "epoch": 0.006327909410981064, + "grad_norm": 2.6210618019104004, + "learning_rate": 4.999506940578289e-05, + "loss": 7.0247, + "step": 1064 + }, + { + "epoch": 0.006333856694262061, + "grad_norm": 2.410187244415283, + "learning_rate": 4.9995060124972084e-05, + "loss": 7.3931, + "step": 1065 + }, + { + "epoch": 0.006339803977543058, + "grad_norm": 2.795302391052246, + "learning_rate": 4.999505083543575e-05, + "loss": 7.3168, + "step": 1066 + }, + { + "epoch": 0.006345751260824055, + "grad_norm": 2.3720662593841553, + "learning_rate": 4.999504153717391e-05, + "loss": 7.3719, + "step": 1067 + }, + { + "epoch": 0.006351698544105053, + "grad_norm": 2.721585988998413, + "learning_rate": 4.9995032230186556e-05, + "loss": 7.3847, + "step": 1068 + }, + { + "epoch": 0.00635764582738605, + "grad_norm": 2.967153549194336, + "learning_rate": 4.99950229144737e-05, + "loss": 7.3224, + "step": 1069 + }, + { + "epoch": 0.006363593110667047, + "grad_norm": 3.8144783973693848, + "learning_rate": 4.999501359003533e-05, + "loss": 7.0767, + "step": 1070 + }, + { + "epoch": 0.006369540393948044, + "grad_norm": 3.7694199085235596, + "learning_rate": 4.999500425687147e-05, + "loss": 7.4486, + "step": 1071 + }, + { + "epoch": 0.006375487677229042, + "grad_norm": 2.9668312072753906, + "learning_rate": 4.999499491498211e-05, + "loss": 7.3415, + "step": 1072 + }, + { + "epoch": 0.006381434960510039, + "grad_norm": 4.196050643920898, + "learning_rate": 4.999498556436725e-05, + "loss": 7.3784, + "step": 1073 + }, + { + "epoch": 0.0063873822437910364, + "grad_norm": 4.676602363586426, + "learning_rate": 4.99949762050269e-05, + "loss": 7.3773, + "step": 1074 + }, + { + "epoch": 0.006393329527072033, + "grad_norm": 2.8828656673431396, + "learning_rate": 4.999496683696107e-05, + "loss": 7.2359, + "step": 1075 + }, + { + "epoch": 0.006399276810353031, + "grad_norm": 2.7532308101654053, + "learning_rate": 4.9994957460169745e-05, + "loss": 7.356, + "step": 1076 + }, + { + "epoch": 0.006405224093634028, + "grad_norm": 5.535451412200928, + "learning_rate": 4.999494807465293e-05, + "loss": 7.261, + "step": 1077 + }, + { + "epoch": 0.0064111713769150255, + "grad_norm": 3.6439530849456787, + "learning_rate": 4.999493868041066e-05, + "loss": 7.4664, + "step": 1078 + }, + { + "epoch": 0.006417118660196022, + "grad_norm": 3.563948154449463, + "learning_rate": 4.99949292774429e-05, + "loss": 7.0427, + "step": 1079 + }, + { + "epoch": 0.00642306594347702, + "grad_norm": 3.6243784427642822, + "learning_rate": 4.9994919865749675e-05, + "loss": 7.3292, + "step": 1080 + }, + { + "epoch": 0.006429013226758017, + "grad_norm": 5.1197590827941895, + "learning_rate": 4.999491044533098e-05, + "loss": 7.3717, + "step": 1081 + }, + { + "epoch": 0.0064349605100390145, + "grad_norm": 4.3969902992248535, + "learning_rate": 4.999490101618682e-05, + "loss": 7.2875, + "step": 1082 + }, + { + "epoch": 0.006440907793320011, + "grad_norm": 2.6302945613861084, + "learning_rate": 4.999489157831719e-05, + "loss": 7.1958, + "step": 1083 + }, + { + "epoch": 0.006446855076601009, + "grad_norm": 3.782078504562378, + "learning_rate": 4.9994882131722116e-05, + "loss": 7.2951, + "step": 1084 + }, + { + "epoch": 0.006452802359882006, + "grad_norm": 3.432082414627075, + "learning_rate": 4.999487267640158e-05, + "loss": 7.0974, + "step": 1085 + }, + { + "epoch": 0.0064587496431630035, + "grad_norm": 3.364793300628662, + "learning_rate": 4.999486321235559e-05, + "loss": 7.0847, + "step": 1086 + }, + { + "epoch": 0.006464696926444, + "grad_norm": 2.7063019275665283, + "learning_rate": 4.999485373958416e-05, + "loss": 7.1421, + "step": 1087 + }, + { + "epoch": 0.006470644209724998, + "grad_norm": 3.0648648738861084, + "learning_rate": 4.999484425808727e-05, + "loss": 7.2723, + "step": 1088 + }, + { + "epoch": 0.006476591493005995, + "grad_norm": 3.3968300819396973, + "learning_rate": 4.999483476786495e-05, + "loss": 7.1438, + "step": 1089 + }, + { + "epoch": 0.0064825387762869925, + "grad_norm": 2.864647150039673, + "learning_rate": 4.999482526891719e-05, + "loss": 7.1512, + "step": 1090 + }, + { + "epoch": 0.006488486059567989, + "grad_norm": 2.577043056488037, + "learning_rate": 4.999481576124399e-05, + "loss": 6.8914, + "step": 1091 + }, + { + "epoch": 0.006494433342848986, + "grad_norm": 2.83754563331604, + "learning_rate": 4.999480624484536e-05, + "loss": 6.9999, + "step": 1092 + }, + { + "epoch": 0.006500380626129984, + "grad_norm": 3.5623857975006104, + "learning_rate": 4.999479671972131e-05, + "loss": 7.0567, + "step": 1093 + }, + { + "epoch": 0.006506327909410981, + "grad_norm": 2.35555362701416, + "learning_rate": 4.9994787185871814e-05, + "loss": 7.3075, + "step": 1094 + }, + { + "epoch": 0.006512275192691978, + "grad_norm": 3.8677117824554443, + "learning_rate": 4.9994777643296914e-05, + "loss": 7.3608, + "step": 1095 + }, + { + "epoch": 0.006518222475972975, + "grad_norm": 3.8163843154907227, + "learning_rate": 4.999476809199659e-05, + "loss": 7.4368, + "step": 1096 + }, + { + "epoch": 0.006524169759253973, + "grad_norm": 2.5424652099609375, + "learning_rate": 4.999475853197085e-05, + "loss": 7.4968, + "step": 1097 + }, + { + "epoch": 0.00653011704253497, + "grad_norm": 2.876898765563965, + "learning_rate": 4.99947489632197e-05, + "loss": 6.9948, + "step": 1098 + }, + { + "epoch": 0.006536064325815967, + "grad_norm": 3.3934860229492188, + "learning_rate": 4.999473938574314e-05, + "loss": 6.9588, + "step": 1099 + }, + { + "epoch": 0.006542011609096964, + "grad_norm": 2.1184024810791016, + "learning_rate": 4.9994729799541176e-05, + "loss": 7.1933, + "step": 1100 + }, + { + "epoch": 0.006547958892377962, + "grad_norm": 2.2882895469665527, + "learning_rate": 4.999472020461381e-05, + "loss": 7.0796, + "step": 1101 + }, + { + "epoch": 0.006553906175658959, + "grad_norm": 3.239429235458374, + "learning_rate": 4.9994710600961045e-05, + "loss": 6.9535, + "step": 1102 + }, + { + "epoch": 0.006559853458939956, + "grad_norm": 2.4653263092041016, + "learning_rate": 4.9994700988582884e-05, + "loss": 6.9316, + "step": 1103 + }, + { + "epoch": 0.006565800742220953, + "grad_norm": 2.511516571044922, + "learning_rate": 4.999469136747933e-05, + "loss": 6.9844, + "step": 1104 + }, + { + "epoch": 0.006571748025501951, + "grad_norm": 2.9725844860076904, + "learning_rate": 4.9994681737650384e-05, + "loss": 7.1955, + "step": 1105 + }, + { + "epoch": 0.006577695308782948, + "grad_norm": 3.04697585105896, + "learning_rate": 4.9994672099096066e-05, + "loss": 7.1044, + "step": 1106 + }, + { + "epoch": 0.006583642592063945, + "grad_norm": 3.395076274871826, + "learning_rate": 4.999466245181635e-05, + "loss": 7.1968, + "step": 1107 + }, + { + "epoch": 0.006589589875344942, + "grad_norm": 2.362884044647217, + "learning_rate": 4.999465279581127e-05, + "loss": 7.3114, + "step": 1108 + }, + { + "epoch": 0.00659553715862594, + "grad_norm": 2.730980396270752, + "learning_rate": 4.99946431310808e-05, + "loss": 7.1978, + "step": 1109 + }, + { + "epoch": 0.006601484441906937, + "grad_norm": 3.288687229156494, + "learning_rate": 4.9994633457624974e-05, + "loss": 7.4397, + "step": 1110 + }, + { + "epoch": 0.006607431725187934, + "grad_norm": 3.3060662746429443, + "learning_rate": 4.999462377544377e-05, + "loss": 7.1638, + "step": 1111 + }, + { + "epoch": 0.006613379008468931, + "grad_norm": 2.2697036266326904, + "learning_rate": 4.9994614084537204e-05, + "loss": 7.2654, + "step": 1112 + }, + { + "epoch": 0.006619326291749929, + "grad_norm": 2.330495595932007, + "learning_rate": 4.999460438490528e-05, + "loss": 7.2132, + "step": 1113 + }, + { + "epoch": 0.006625273575030926, + "grad_norm": 2.8239340782165527, + "learning_rate": 4.999459467654799e-05, + "loss": 7.3477, + "step": 1114 + }, + { + "epoch": 0.0066312208583119234, + "grad_norm": 2.591614246368408, + "learning_rate": 4.999458495946535e-05, + "loss": 7.0377, + "step": 1115 + }, + { + "epoch": 0.00663716814159292, + "grad_norm": 4.554818630218506, + "learning_rate": 4.999457523365736e-05, + "loss": 7.1266, + "step": 1116 + }, + { + "epoch": 0.006643115424873918, + "grad_norm": 2.21018123626709, + "learning_rate": 4.999456549912401e-05, + "loss": 7.1433, + "step": 1117 + }, + { + "epoch": 0.006649062708154915, + "grad_norm": 2.0298593044281006, + "learning_rate": 4.999455575586533e-05, + "loss": 7.257, + "step": 1118 + }, + { + "epoch": 0.0066550099914359125, + "grad_norm": 2.4532642364501953, + "learning_rate": 4.9994546003881305e-05, + "loss": 7.0618, + "step": 1119 + }, + { + "epoch": 0.006660957274716909, + "grad_norm": 2.428380012512207, + "learning_rate": 4.999453624317194e-05, + "loss": 7.2039, + "step": 1120 + }, + { + "epoch": 0.006666904557997907, + "grad_norm": 2.5572609901428223, + "learning_rate": 4.999452647373724e-05, + "loss": 7.0991, + "step": 1121 + }, + { + "epoch": 0.006672851841278904, + "grad_norm": 2.379640817642212, + "learning_rate": 4.999451669557721e-05, + "loss": 7.1424, + "step": 1122 + }, + { + "epoch": 0.006678799124559901, + "grad_norm": 2.5764007568359375, + "learning_rate": 4.999450690869185e-05, + "loss": 7.1218, + "step": 1123 + }, + { + "epoch": 0.006684746407840898, + "grad_norm": 2.6560606956481934, + "learning_rate": 4.999449711308117e-05, + "loss": 7.2994, + "step": 1124 + }, + { + "epoch": 0.006690693691121895, + "grad_norm": 2.4687581062316895, + "learning_rate": 4.999448730874518e-05, + "loss": 7.4169, + "step": 1125 + }, + { + "epoch": 0.006696640974402893, + "grad_norm": 2.8232173919677734, + "learning_rate": 4.999447749568386e-05, + "loss": 7.291, + "step": 1126 + }, + { + "epoch": 0.00670258825768389, + "grad_norm": 2.6960325241088867, + "learning_rate": 4.9994467673897224e-05, + "loss": 7.3162, + "step": 1127 + }, + { + "epoch": 0.006708535540964887, + "grad_norm": 2.222391366958618, + "learning_rate": 4.999445784338528e-05, + "loss": 7.221, + "step": 1128 + }, + { + "epoch": 0.006714482824245884, + "grad_norm": 2.334995985031128, + "learning_rate": 4.9994448004148024e-05, + "loss": 7.4813, + "step": 1129 + }, + { + "epoch": 0.006720430107526882, + "grad_norm": 2.653491497039795, + "learning_rate": 4.999443815618548e-05, + "loss": 7.3515, + "step": 1130 + }, + { + "epoch": 0.006726377390807879, + "grad_norm": 2.6943631172180176, + "learning_rate": 4.999442829949762e-05, + "loss": 7.2674, + "step": 1131 + }, + { + "epoch": 0.006732324674088876, + "grad_norm": 2.395573377609253, + "learning_rate": 4.999441843408447e-05, + "loss": 7.483, + "step": 1132 + }, + { + "epoch": 0.006738271957369873, + "grad_norm": 2.3801541328430176, + "learning_rate": 4.999440855994603e-05, + "loss": 7.3355, + "step": 1133 + }, + { + "epoch": 0.006744219240650871, + "grad_norm": 2.8566555976867676, + "learning_rate": 4.999439867708229e-05, + "loss": 6.8323, + "step": 1134 + }, + { + "epoch": 0.006750166523931868, + "grad_norm": 2.5987985134124756, + "learning_rate": 4.999438878549327e-05, + "loss": 6.957, + "step": 1135 + }, + { + "epoch": 0.006756113807212865, + "grad_norm": 2.4411563873291016, + "learning_rate": 4.9994378885178964e-05, + "loss": 6.9935, + "step": 1136 + }, + { + "epoch": 0.006762061090493862, + "grad_norm": 2.4227802753448486, + "learning_rate": 4.9994368976139386e-05, + "loss": 7.2856, + "step": 1137 + }, + { + "epoch": 0.00676800837377486, + "grad_norm": 2.55317759513855, + "learning_rate": 4.999435905837453e-05, + "loss": 7.1741, + "step": 1138 + }, + { + "epoch": 0.006773955657055857, + "grad_norm": 2.3329968452453613, + "learning_rate": 4.9994349131884396e-05, + "loss": 7.2007, + "step": 1139 + }, + { + "epoch": 0.006779902940336854, + "grad_norm": 2.538499593734741, + "learning_rate": 4.999433919666899e-05, + "loss": 7.1755, + "step": 1140 + }, + { + "epoch": 0.006785850223617851, + "grad_norm": 2.3580374717712402, + "learning_rate": 4.999432925272833e-05, + "loss": 7.2249, + "step": 1141 + }, + { + "epoch": 0.006791797506898849, + "grad_norm": 2.2783255577087402, + "learning_rate": 4.99943193000624e-05, + "loss": 7.3627, + "step": 1142 + }, + { + "epoch": 0.006797744790179846, + "grad_norm": 3.0798208713531494, + "learning_rate": 4.999430933867122e-05, + "loss": 7.2718, + "step": 1143 + }, + { + "epoch": 0.006803692073460843, + "grad_norm": 2.703232526779175, + "learning_rate": 4.9994299368554776e-05, + "loss": 7.116, + "step": 1144 + }, + { + "epoch": 0.00680963935674184, + "grad_norm": 2.480327606201172, + "learning_rate": 4.9994289389713076e-05, + "loss": 6.9743, + "step": 1145 + }, + { + "epoch": 0.006815586640022838, + "grad_norm": 2.2707130908966064, + "learning_rate": 4.9994279402146137e-05, + "loss": 6.9919, + "step": 1146 + }, + { + "epoch": 0.006821533923303835, + "grad_norm": 2.0424580574035645, + "learning_rate": 4.999426940585396e-05, + "loss": 7.0366, + "step": 1147 + }, + { + "epoch": 0.006827481206584832, + "grad_norm": 1.9720054864883423, + "learning_rate": 4.999425940083653e-05, + "loss": 6.8622, + "step": 1148 + }, + { + "epoch": 0.006833428489865829, + "grad_norm": 2.7109742164611816, + "learning_rate": 4.9994249387093864e-05, + "loss": 7.5375, + "step": 1149 + }, + { + "epoch": 0.006839375773146827, + "grad_norm": 2.267328977584839, + "learning_rate": 4.999423936462596e-05, + "loss": 7.5606, + "step": 1150 + }, + { + "epoch": 0.006845323056427824, + "grad_norm": 2.958360433578491, + "learning_rate": 4.999422933343283e-05, + "loss": 7.3503, + "step": 1151 + }, + { + "epoch": 0.006851270339708821, + "grad_norm": 2.2681283950805664, + "learning_rate": 4.9994219293514475e-05, + "loss": 6.9278, + "step": 1152 + }, + { + "epoch": 0.006857217622989818, + "grad_norm": 2.4755337238311768, + "learning_rate": 4.999420924487089e-05, + "loss": 7.1385, + "step": 1153 + }, + { + "epoch": 0.006863164906270815, + "grad_norm": 2.283277988433838, + "learning_rate": 4.999419918750209e-05, + "loss": 6.9287, + "step": 1154 + }, + { + "epoch": 0.006869112189551813, + "grad_norm": 2.3692893981933594, + "learning_rate": 4.999418912140808e-05, + "loss": 7.0648, + "step": 1155 + }, + { + "epoch": 0.00687505947283281, + "grad_norm": 2.2676453590393066, + "learning_rate": 4.999417904658884e-05, + "loss": 6.9754, + "step": 1156 + }, + { + "epoch": 0.006881006756113807, + "grad_norm": 2.4106669425964355, + "learning_rate": 4.9994168963044405e-05, + "loss": 7.033, + "step": 1157 + }, + { + "epoch": 0.006886954039394804, + "grad_norm": 2.947758913040161, + "learning_rate": 4.9994158870774754e-05, + "loss": 7.0821, + "step": 1158 + }, + { + "epoch": 0.006892901322675802, + "grad_norm": 2.5338058471679688, + "learning_rate": 4.9994148769779905e-05, + "loss": 6.9426, + "step": 1159 + }, + { + "epoch": 0.006898848605956799, + "grad_norm": 2.4848148822784424, + "learning_rate": 4.999413866005985e-05, + "loss": 7.2488, + "step": 1160 + }, + { + "epoch": 0.006904795889237796, + "grad_norm": 2.444077730178833, + "learning_rate": 4.999412854161461e-05, + "loss": 6.871, + "step": 1161 + }, + { + "epoch": 0.006910743172518793, + "grad_norm": 2.376962661743164, + "learning_rate": 4.9994118414444174e-05, + "loss": 7.0258, + "step": 1162 + }, + { + "epoch": 0.006916690455799791, + "grad_norm": 3.502023458480835, + "learning_rate": 4.9994108278548545e-05, + "loss": 7.4869, + "step": 1163 + }, + { + "epoch": 0.006922637739080788, + "grad_norm": 3.117741584777832, + "learning_rate": 4.999409813392774e-05, + "loss": 7.4437, + "step": 1164 + }, + { + "epoch": 0.006928585022361785, + "grad_norm": 3.805560827255249, + "learning_rate": 4.999408798058175e-05, + "loss": 7.3796, + "step": 1165 + }, + { + "epoch": 0.006934532305642782, + "grad_norm": 3.67065167427063, + "learning_rate": 4.9994077818510576e-05, + "loss": 7.2304, + "step": 1166 + }, + { + "epoch": 0.00694047958892378, + "grad_norm": 2.5749545097351074, + "learning_rate": 4.9994067647714236e-05, + "loss": 7.0943, + "step": 1167 + }, + { + "epoch": 0.006946426872204777, + "grad_norm": 2.561405897140503, + "learning_rate": 4.9994057468192724e-05, + "loss": 6.9496, + "step": 1168 + }, + { + "epoch": 0.006952374155485774, + "grad_norm": 2.477344512939453, + "learning_rate": 4.999404727994604e-05, + "loss": 7.3494, + "step": 1169 + }, + { + "epoch": 0.006958321438766771, + "grad_norm": 2.897580146789551, + "learning_rate": 4.999403708297419e-05, + "loss": 7.6081, + "step": 1170 + }, + { + "epoch": 0.006964268722047769, + "grad_norm": 3.899249792098999, + "learning_rate": 4.999402687727719e-05, + "loss": 7.4448, + "step": 1171 + }, + { + "epoch": 0.006970216005328766, + "grad_norm": 3.0791561603546143, + "learning_rate": 4.9994016662855025e-05, + "loss": 7.1616, + "step": 1172 + }, + { + "epoch": 0.006976163288609763, + "grad_norm": 2.8212931156158447, + "learning_rate": 4.999400643970771e-05, + "loss": 7.1824, + "step": 1173 + }, + { + "epoch": 0.00698211057189076, + "grad_norm": 4.33271598815918, + "learning_rate": 4.9993996207835246e-05, + "loss": 7.2432, + "step": 1174 + }, + { + "epoch": 0.006988057855171758, + "grad_norm": 2.985125780105591, + "learning_rate": 4.999398596723764e-05, + "loss": 7.6521, + "step": 1175 + }, + { + "epoch": 0.006994005138452755, + "grad_norm": 3.1069905757904053, + "learning_rate": 4.9993975717914885e-05, + "loss": 7.0071, + "step": 1176 + }, + { + "epoch": 0.006999952421733752, + "grad_norm": 2.915214776992798, + "learning_rate": 4.9993965459866995e-05, + "loss": 7.6192, + "step": 1177 + }, + { + "epoch": 0.007005899705014749, + "grad_norm": 5.314033031463623, + "learning_rate": 4.999395519309397e-05, + "loss": 6.9447, + "step": 1178 + }, + { + "epoch": 0.007011846988295747, + "grad_norm": 2.2723114490509033, + "learning_rate": 4.999394491759581e-05, + "loss": 7.1228, + "step": 1179 + }, + { + "epoch": 0.007017794271576744, + "grad_norm": 2.936365842819214, + "learning_rate": 4.999393463337253e-05, + "loss": 7.136, + "step": 1180 + }, + { + "epoch": 0.007023741554857741, + "grad_norm": 2.864250898361206, + "learning_rate": 4.9993924340424115e-05, + "loss": 7.026, + "step": 1181 + }, + { + "epoch": 0.007029688838138738, + "grad_norm": 3.299370050430298, + "learning_rate": 4.9993914038750586e-05, + "loss": 7.1114, + "step": 1182 + }, + { + "epoch": 0.007035636121419736, + "grad_norm": 3.0609943866729736, + "learning_rate": 4.999390372835193e-05, + "loss": 7.3052, + "step": 1183 + }, + { + "epoch": 0.007041583404700733, + "grad_norm": 3.54488468170166, + "learning_rate": 4.9993893409228176e-05, + "loss": 7.4845, + "step": 1184 + }, + { + "epoch": 0.0070475306879817295, + "grad_norm": 2.5196385383605957, + "learning_rate": 4.99938830813793e-05, + "loss": 7.312, + "step": 1185 + }, + { + "epoch": 0.007053477971262727, + "grad_norm": 3.570802927017212, + "learning_rate": 4.9993872744805326e-05, + "loss": 7.0038, + "step": 1186 + }, + { + "epoch": 0.007059425254543724, + "grad_norm": 2.631058931350708, + "learning_rate": 4.999386239950624e-05, + "loss": 7.5574, + "step": 1187 + }, + { + "epoch": 0.007065372537824722, + "grad_norm": 3.027251958847046, + "learning_rate": 4.999385204548206e-05, + "loss": 6.9837, + "step": 1188 + }, + { + "epoch": 0.0070713198211057185, + "grad_norm": 3.00128173828125, + "learning_rate": 4.999384168273279e-05, + "loss": 7.4479, + "step": 1189 + }, + { + "epoch": 0.007077267104386716, + "grad_norm": 2.127028226852417, + "learning_rate": 4.999383131125842e-05, + "loss": 7.3609, + "step": 1190 + }, + { + "epoch": 0.007083214387667713, + "grad_norm": 2.375511646270752, + "learning_rate": 4.9993820931058965e-05, + "loss": 7.3695, + "step": 1191 + }, + { + "epoch": 0.007089161670948711, + "grad_norm": 2.527743101119995, + "learning_rate": 4.999381054213442e-05, + "loss": 7.1478, + "step": 1192 + }, + { + "epoch": 0.0070951089542297075, + "grad_norm": 2.1600632667541504, + "learning_rate": 4.99938001444848e-05, + "loss": 7.7111, + "step": 1193 + }, + { + "epoch": 0.007101056237510705, + "grad_norm": 2.3242850303649902, + "learning_rate": 4.99937897381101e-05, + "loss": 7.6751, + "step": 1194 + }, + { + "epoch": 0.007107003520791702, + "grad_norm": 3.4553158283233643, + "learning_rate": 4.9993779323010334e-05, + "loss": 7.775, + "step": 1195 + }, + { + "epoch": 0.0071129508040727, + "grad_norm": 2.4339516162872314, + "learning_rate": 4.999376889918549e-05, + "loss": 7.099, + "step": 1196 + }, + { + "epoch": 0.0071188980873536966, + "grad_norm": 2.531851291656494, + "learning_rate": 4.9993758466635574e-05, + "loss": 7.5222, + "step": 1197 + }, + { + "epoch": 0.007124845370634694, + "grad_norm": 2.6549220085144043, + "learning_rate": 4.999374802536061e-05, + "loss": 7.4917, + "step": 1198 + }, + { + "epoch": 0.007130792653915691, + "grad_norm": 2.9149320125579834, + "learning_rate": 4.999373757536058e-05, + "loss": 7.0438, + "step": 1199 + }, + { + "epoch": 0.007136739937196689, + "grad_norm": 3.0234971046447754, + "learning_rate": 4.999372711663549e-05, + "loss": 7.6838, + "step": 1200 + }, + { + "epoch": 0.007142687220477686, + "grad_norm": 2.4006800651550293, + "learning_rate": 4.999371664918535e-05, + "loss": 7.6607, + "step": 1201 + }, + { + "epoch": 0.007148634503758683, + "grad_norm": 2.6191699504852295, + "learning_rate": 4.9993706173010164e-05, + "loss": 7.4727, + "step": 1202 + }, + { + "epoch": 0.00715458178703968, + "grad_norm": 3.040844440460205, + "learning_rate": 4.999369568810993e-05, + "loss": 7.1459, + "step": 1203 + }, + { + "epoch": 0.007160529070320678, + "grad_norm": 2.8474466800689697, + "learning_rate": 4.9993685194484654e-05, + "loss": 7.4615, + "step": 1204 + }, + { + "epoch": 0.007166476353601675, + "grad_norm": 1.928662657737732, + "learning_rate": 4.999367469213435e-05, + "loss": 7.4259, + "step": 1205 + }, + { + "epoch": 0.007172423636882672, + "grad_norm": 2.369540214538574, + "learning_rate": 4.999366418105901e-05, + "loss": 6.9342, + "step": 1206 + }, + { + "epoch": 0.007178370920163669, + "grad_norm": 4.003239154815674, + "learning_rate": 4.999365366125863e-05, + "loss": 7.3289, + "step": 1207 + }, + { + "epoch": 0.007184318203444667, + "grad_norm": 4.491976261138916, + "learning_rate": 4.9993643132733234e-05, + "loss": 7.3479, + "step": 1208 + }, + { + "epoch": 0.007190265486725664, + "grad_norm": 2.3678557872772217, + "learning_rate": 4.9993632595482806e-05, + "loss": 7.3091, + "step": 1209 + }, + { + "epoch": 0.007196212770006661, + "grad_norm": 2.9310050010681152, + "learning_rate": 4.999362204950737e-05, + "loss": 7.1996, + "step": 1210 + }, + { + "epoch": 0.007202160053287658, + "grad_norm": 3.6861345767974854, + "learning_rate": 4.999361149480691e-05, + "loss": 7.43, + "step": 1211 + }, + { + "epoch": 0.007208107336568656, + "grad_norm": 2.657515287399292, + "learning_rate": 4.9993600931381446e-05, + "loss": 6.9888, + "step": 1212 + }, + { + "epoch": 0.007214054619849653, + "grad_norm": 2.8346996307373047, + "learning_rate": 4.999359035923097e-05, + "loss": 7.0366, + "step": 1213 + }, + { + "epoch": 0.00722000190313065, + "grad_norm": 3.494162082672119, + "learning_rate": 4.9993579778355487e-05, + "loss": 7.499, + "step": 1214 + }, + { + "epoch": 0.007225949186411647, + "grad_norm": 2.9848556518554688, + "learning_rate": 4.999356918875501e-05, + "loss": 7.2064, + "step": 1215 + }, + { + "epoch": 0.007231896469692645, + "grad_norm": 2.391390562057495, + "learning_rate": 4.999355859042953e-05, + "loss": 7.2752, + "step": 1216 + }, + { + "epoch": 0.007237843752973642, + "grad_norm": 2.872891902923584, + "learning_rate": 4.9993547983379065e-05, + "loss": 6.9865, + "step": 1217 + }, + { + "epoch": 0.0072437910362546385, + "grad_norm": 2.760213613510132, + "learning_rate": 4.99935373676036e-05, + "loss": 7.0211, + "step": 1218 + }, + { + "epoch": 0.007249738319535636, + "grad_norm": 2.8857531547546387, + "learning_rate": 4.9993526743103156e-05, + "loss": 6.9162, + "step": 1219 + }, + { + "epoch": 0.007255685602816633, + "grad_norm": 3.150836229324341, + "learning_rate": 4.999351610987772e-05, + "loss": 7.2929, + "step": 1220 + }, + { + "epoch": 0.007261632886097631, + "grad_norm": 2.2004289627075195, + "learning_rate": 4.999350546792732e-05, + "loss": 7.4729, + "step": 1221 + }, + { + "epoch": 0.0072675801693786275, + "grad_norm": 2.5004026889801025, + "learning_rate": 4.999349481725194e-05, + "loss": 7.5235, + "step": 1222 + }, + { + "epoch": 0.007273527452659625, + "grad_norm": 2.8355395793914795, + "learning_rate": 4.999348415785159e-05, + "loss": 7.3535, + "step": 1223 + }, + { + "epoch": 0.007279474735940622, + "grad_norm": 2.559330701828003, + "learning_rate": 4.9993473489726276e-05, + "loss": 6.9634, + "step": 1224 + }, + { + "epoch": 0.00728542201922162, + "grad_norm": 2.3559181690216064, + "learning_rate": 4.999346281287599e-05, + "loss": 6.9246, + "step": 1225 + }, + { + "epoch": 0.0072913693025026165, + "grad_norm": 2.3852717876434326, + "learning_rate": 4.999345212730075e-05, + "loss": 6.6417, + "step": 1226 + }, + { + "epoch": 0.007297316585783614, + "grad_norm": 2.2604117393493652, + "learning_rate": 4.999344143300055e-05, + "loss": 7.4182, + "step": 1227 + }, + { + "epoch": 0.007303263869064611, + "grad_norm": 2.57983660697937, + "learning_rate": 4.9993430729975396e-05, + "loss": 7.4841, + "step": 1228 + }, + { + "epoch": 0.007309211152345609, + "grad_norm": 2.653935670852661, + "learning_rate": 4.99934200182253e-05, + "loss": 7.5477, + "step": 1229 + }, + { + "epoch": 0.0073151584356266055, + "grad_norm": 2.0740158557891846, + "learning_rate": 4.999340929775026e-05, + "loss": 7.4359, + "step": 1230 + }, + { + "epoch": 0.007321105718907603, + "grad_norm": 2.62064528465271, + "learning_rate": 4.9993398568550275e-05, + "loss": 7.1817, + "step": 1231 + }, + { + "epoch": 0.0073270530021886, + "grad_norm": 2.318244457244873, + "learning_rate": 4.999338783062536e-05, + "loss": 7.1663, + "step": 1232 + }, + { + "epoch": 0.007333000285469598, + "grad_norm": 3.0533225536346436, + "learning_rate": 4.99933770839755e-05, + "loss": 7.3051, + "step": 1233 + }, + { + "epoch": 0.0073389475687505945, + "grad_norm": 4.821422100067139, + "learning_rate": 4.999336632860072e-05, + "loss": 7.3435, + "step": 1234 + }, + { + "epoch": 0.007344894852031592, + "grad_norm": 2.680873155593872, + "learning_rate": 4.999335556450101e-05, + "loss": 7.3447, + "step": 1235 + }, + { + "epoch": 0.007350842135312589, + "grad_norm": 3.287454605102539, + "learning_rate": 4.999334479167638e-05, + "loss": 7.1957, + "step": 1236 + }, + { + "epoch": 0.007356789418593587, + "grad_norm": 3.7452759742736816, + "learning_rate": 4.999333401012682e-05, + "loss": 7.2093, + "step": 1237 + }, + { + "epoch": 0.0073627367018745836, + "grad_norm": 3.363443374633789, + "learning_rate": 4.999332321985236e-05, + "loss": 7.297, + "step": 1238 + }, + { + "epoch": 0.007368683985155581, + "grad_norm": 3.070962905883789, + "learning_rate": 4.999331242085299e-05, + "loss": 7.0831, + "step": 1239 + }, + { + "epoch": 0.007374631268436578, + "grad_norm": 3.635183095932007, + "learning_rate": 4.9993301613128706e-05, + "loss": 7.3116, + "step": 1240 + }, + { + "epoch": 0.007380578551717576, + "grad_norm": 2.532179594039917, + "learning_rate": 4.9993290796679516e-05, + "loss": 7.5238, + "step": 1241 + }, + { + "epoch": 0.007386525834998573, + "grad_norm": 2.1147687435150146, + "learning_rate": 4.999327997150543e-05, + "loss": 7.2279, + "step": 1242 + }, + { + "epoch": 0.00739247311827957, + "grad_norm": 2.1221182346343994, + "learning_rate": 4.999326913760645e-05, + "loss": 7.6575, + "step": 1243 + }, + { + "epoch": 0.007398420401560567, + "grad_norm": 2.2920000553131104, + "learning_rate": 4.999325829498257e-05, + "loss": 7.5652, + "step": 1244 + }, + { + "epoch": 0.007404367684841565, + "grad_norm": 2.3444230556488037, + "learning_rate": 4.9993247443633814e-05, + "loss": 7.3992, + "step": 1245 + }, + { + "epoch": 0.007410314968122562, + "grad_norm": 2.2778663635253906, + "learning_rate": 4.9993236583560164e-05, + "loss": 7.1212, + "step": 1246 + }, + { + "epoch": 0.007416262251403559, + "grad_norm": 2.38369083404541, + "learning_rate": 4.999322571476164e-05, + "loss": 7.4605, + "step": 1247 + }, + { + "epoch": 0.007422209534684556, + "grad_norm": 3.578537702560425, + "learning_rate": 4.999321483723823e-05, + "loss": 7.1446, + "step": 1248 + }, + { + "epoch": 0.007428156817965553, + "grad_norm": 5.227176666259766, + "learning_rate": 4.9993203950989954e-05, + "loss": 7.2308, + "step": 1249 + }, + { + "epoch": 0.007434104101246551, + "grad_norm": 2.665844440460205, + "learning_rate": 4.9993193056016805e-05, + "loss": 7.102, + "step": 1250 + }, + { + "epoch": 0.007440051384527547, + "grad_norm": 4.462922096252441, + "learning_rate": 4.9993182152318796e-05, + "loss": 7.003, + "step": 1251 + }, + { + "epoch": 0.007445998667808545, + "grad_norm": 4.9459099769592285, + "learning_rate": 4.999317123989592e-05, + "loss": 7.1338, + "step": 1252 + }, + { + "epoch": 0.007451945951089542, + "grad_norm": 3.127427339553833, + "learning_rate": 4.9993160318748186e-05, + "loss": 7.045, + "step": 1253 + }, + { + "epoch": 0.00745789323437054, + "grad_norm": 3.03910231590271, + "learning_rate": 4.9993149388875606e-05, + "loss": 6.8523, + "step": 1254 + }, + { + "epoch": 0.0074638405176515365, + "grad_norm": 2.931033134460449, + "learning_rate": 4.9993138450278166e-05, + "loss": 7.3065, + "step": 1255 + }, + { + "epoch": 0.007469787800932534, + "grad_norm": 4.60735559463501, + "learning_rate": 4.999312750295588e-05, + "loss": 7.5384, + "step": 1256 + }, + { + "epoch": 0.007475735084213531, + "grad_norm": 3.0745065212249756, + "learning_rate": 4.9993116546908755e-05, + "loss": 7.6279, + "step": 1257 + }, + { + "epoch": 0.007481682367494529, + "grad_norm": 2.7158751487731934, + "learning_rate": 4.9993105582136804e-05, + "loss": 7.1885, + "step": 1258 + }, + { + "epoch": 0.0074876296507755255, + "grad_norm": 3.5049819946289062, + "learning_rate": 4.999309460864e-05, + "loss": 6.6833, + "step": 1259 + }, + { + "epoch": 0.007493576934056523, + "grad_norm": 3.229778289794922, + "learning_rate": 4.999308362641837e-05, + "loss": 6.784, + "step": 1260 + }, + { + "epoch": 0.00749952421733752, + "grad_norm": 2.7032854557037354, + "learning_rate": 4.999307263547191e-05, + "loss": 6.8003, + "step": 1261 + }, + { + "epoch": 0.007505471500618518, + "grad_norm": 5.892059326171875, + "learning_rate": 4.999306163580063e-05, + "loss": 7.2365, + "step": 1262 + }, + { + "epoch": 0.0075114187838995145, + "grad_norm": 5.8021135330200195, + "learning_rate": 4.999305062740453e-05, + "loss": 7.3822, + "step": 1263 + }, + { + "epoch": 0.007517366067180512, + "grad_norm": 5.1242899894714355, + "learning_rate": 4.9993039610283614e-05, + "loss": 7.2192, + "step": 1264 + }, + { + "epoch": 0.007523313350461509, + "grad_norm": 3.102980375289917, + "learning_rate": 4.9993028584437884e-05, + "loss": 7.4895, + "step": 1265 + }, + { + "epoch": 0.007529260633742507, + "grad_norm": 4.993838310241699, + "learning_rate": 4.999301754986735e-05, + "loss": 7.4771, + "step": 1266 + }, + { + "epoch": 0.0075352079170235035, + "grad_norm": 4.003589630126953, + "learning_rate": 4.999300650657201e-05, + "loss": 7.3591, + "step": 1267 + }, + { + "epoch": 0.007541155200304501, + "grad_norm": 3.6125710010528564, + "learning_rate": 4.999299545455187e-05, + "loss": 7.262, + "step": 1268 + }, + { + "epoch": 0.007547102483585498, + "grad_norm": 3.182196617126465, + "learning_rate": 4.999298439380693e-05, + "loss": 7.2689, + "step": 1269 + }, + { + "epoch": 0.007553049766866496, + "grad_norm": 2.428313732147217, + "learning_rate": 4.99929733243372e-05, + "loss": 7.2364, + "step": 1270 + }, + { + "epoch": 0.0075589970501474925, + "grad_norm": 2.673356771469116, + "learning_rate": 4.999296224614268e-05, + "loss": 7.2356, + "step": 1271 + }, + { + "epoch": 0.00756494433342849, + "grad_norm": 2.508026361465454, + "learning_rate": 4.9992951159223376e-05, + "loss": 7.1052, + "step": 1272 + }, + { + "epoch": 0.007570891616709487, + "grad_norm": 2.7501845359802246, + "learning_rate": 4.99929400635793e-05, + "loss": 7.5041, + "step": 1273 + }, + { + "epoch": 0.007576838899990485, + "grad_norm": 2.4604434967041016, + "learning_rate": 4.999292895921044e-05, + "loss": 7.5042, + "step": 1274 + }, + { + "epoch": 0.0075827861832714815, + "grad_norm": 2.4926865100860596, + "learning_rate": 4.99929178461168e-05, + "loss": 7.2104, + "step": 1275 + }, + { + "epoch": 0.007588733466552479, + "grad_norm": 2.631985664367676, + "learning_rate": 4.999290672429839e-05, + "loss": 6.8608, + "step": 1276 + }, + { + "epoch": 0.007594680749833476, + "grad_norm": 2.5684268474578857, + "learning_rate": 4.999289559375523e-05, + "loss": 7.1199, + "step": 1277 + }, + { + "epoch": 0.007600628033114474, + "grad_norm": 2.4312644004821777, + "learning_rate": 4.99928844544873e-05, + "loss": 7.1814, + "step": 1278 + }, + { + "epoch": 0.0076065753163954706, + "grad_norm": 2.794407367706299, + "learning_rate": 4.99928733064946e-05, + "loss": 7.2909, + "step": 1279 + }, + { + "epoch": 0.007612522599676467, + "grad_norm": 2.5903992652893066, + "learning_rate": 4.9992862149777166e-05, + "loss": 7.354, + "step": 1280 + }, + { + "epoch": 0.007618469882957465, + "grad_norm": 2.266364336013794, + "learning_rate": 4.999285098433497e-05, + "loss": 7.5697, + "step": 1281 + }, + { + "epoch": 0.007624417166238462, + "grad_norm": 3.1871070861816406, + "learning_rate": 4.999283981016803e-05, + "loss": 7.4393, + "step": 1282 + }, + { + "epoch": 0.00763036444951946, + "grad_norm": 2.137981653213501, + "learning_rate": 4.999282862727635e-05, + "loss": 7.3591, + "step": 1283 + }, + { + "epoch": 0.007636311732800456, + "grad_norm": 2.3166019916534424, + "learning_rate": 4.999281743565993e-05, + "loss": 7.4307, + "step": 1284 + }, + { + "epoch": 0.007642259016081454, + "grad_norm": 2.331110954284668, + "learning_rate": 4.999280623531878e-05, + "loss": 7.3214, + "step": 1285 + }, + { + "epoch": 0.007648206299362451, + "grad_norm": 2.7417728900909424, + "learning_rate": 4.999279502625289e-05, + "loss": 7.3593, + "step": 1286 + }, + { + "epoch": 0.007654153582643449, + "grad_norm": 3.089448928833008, + "learning_rate": 4.999278380846228e-05, + "loss": 7.3347, + "step": 1287 + }, + { + "epoch": 0.007660100865924445, + "grad_norm": 2.9446022510528564, + "learning_rate": 4.999277258194694e-05, + "loss": 7.3109, + "step": 1288 + }, + { + "epoch": 0.007666048149205443, + "grad_norm": 2.713355302810669, + "learning_rate": 4.9992761346706896e-05, + "loss": 7.2962, + "step": 1289 + }, + { + "epoch": 0.00767199543248644, + "grad_norm": 2.9480702877044678, + "learning_rate": 4.9992750102742125e-05, + "loss": 7.2081, + "step": 1290 + }, + { + "epoch": 0.007677942715767438, + "grad_norm": 2.737271785736084, + "learning_rate": 4.999273885005265e-05, + "loss": 7.2251, + "step": 1291 + }, + { + "epoch": 0.007683889999048434, + "grad_norm": 2.6954190731048584, + "learning_rate": 4.9992727588638466e-05, + "loss": 7.3437, + "step": 1292 + }, + { + "epoch": 0.007689837282329432, + "grad_norm": 3.0270752906799316, + "learning_rate": 4.999271631849958e-05, + "loss": 7.2516, + "step": 1293 + }, + { + "epoch": 0.007695784565610429, + "grad_norm": 2.824052333831787, + "learning_rate": 4.999270503963599e-05, + "loss": 7.2706, + "step": 1294 + }, + { + "epoch": 0.007701731848891427, + "grad_norm": 2.800713300704956, + "learning_rate": 4.999269375204771e-05, + "loss": 7.2497, + "step": 1295 + }, + { + "epoch": 0.0077076791321724234, + "grad_norm": 3.2510271072387695, + "learning_rate": 4.999268245573474e-05, + "loss": 7.025, + "step": 1296 + }, + { + "epoch": 0.007713626415453421, + "grad_norm": 3.095862627029419, + "learning_rate": 4.999267115069708e-05, + "loss": 7.1815, + "step": 1297 + }, + { + "epoch": 0.007719573698734418, + "grad_norm": 3.2238826751708984, + "learning_rate": 4.999265983693473e-05, + "loss": 7.2268, + "step": 1298 + }, + { + "epoch": 0.007725520982015416, + "grad_norm": 3.18687105178833, + "learning_rate": 4.999264851444771e-05, + "loss": 7.2076, + "step": 1299 + }, + { + "epoch": 0.0077314682652964125, + "grad_norm": 3.1385931968688965, + "learning_rate": 4.9992637183236016e-05, + "loss": 7.2323, + "step": 1300 + }, + { + "epoch": 0.00773741554857741, + "grad_norm": 2.3172361850738525, + "learning_rate": 4.999262584329964e-05, + "loss": 7.1225, + "step": 1301 + }, + { + "epoch": 0.007743362831858407, + "grad_norm": 3.3223013877868652, + "learning_rate": 4.99926144946386e-05, + "loss": 7.2108, + "step": 1302 + }, + { + "epoch": 0.007749310115139405, + "grad_norm": 3.197218894958496, + "learning_rate": 4.99926031372529e-05, + "loss": 7.5123, + "step": 1303 + }, + { + "epoch": 0.0077552573984204015, + "grad_norm": 2.8411800861358643, + "learning_rate": 4.999259177114254e-05, + "loss": 7.3047, + "step": 1304 + }, + { + "epoch": 0.007761204681701399, + "grad_norm": 2.7549736499786377, + "learning_rate": 4.9992580396307524e-05, + "loss": 7.3478, + "step": 1305 + }, + { + "epoch": 0.007767151964982396, + "grad_norm": 2.8829352855682373, + "learning_rate": 4.999256901274786e-05, + "loss": 7.1871, + "step": 1306 + }, + { + "epoch": 0.007773099248263394, + "grad_norm": 2.710076332092285, + "learning_rate": 4.999255762046354e-05, + "loss": 7.0891, + "step": 1307 + }, + { + "epoch": 0.0077790465315443905, + "grad_norm": 2.6598877906799316, + "learning_rate": 4.999254621945458e-05, + "loss": 7.6178, + "step": 1308 + }, + { + "epoch": 0.007784993814825388, + "grad_norm": 2.4012649059295654, + "learning_rate": 4.999253480972099e-05, + "loss": 7.5925, + "step": 1309 + }, + { + "epoch": 0.007790941098106385, + "grad_norm": 2.1501622200012207, + "learning_rate": 4.999252339126275e-05, + "loss": 7.6471, + "step": 1310 + }, + { + "epoch": 0.007796888381387382, + "grad_norm": 3.2150895595550537, + "learning_rate": 4.9992511964079886e-05, + "loss": 7.3995, + "step": 1311 + }, + { + "epoch": 0.0078028356646683795, + "grad_norm": 2.450465440750122, + "learning_rate": 4.9992500528172395e-05, + "loss": 7.219, + "step": 1312 + }, + { + "epoch": 0.007808782947949376, + "grad_norm": 2.714510679244995, + "learning_rate": 4.9992489083540274e-05, + "loss": 7.2023, + "step": 1313 + }, + { + "epoch": 0.007814730231230374, + "grad_norm": 2.660019636154175, + "learning_rate": 4.999247763018354e-05, + "loss": 6.8686, + "step": 1314 + }, + { + "epoch": 0.00782067751451137, + "grad_norm": 2.1031477451324463, + "learning_rate": 4.999246616810218e-05, + "loss": 7.305, + "step": 1315 + }, + { + "epoch": 0.007826624797792368, + "grad_norm": 3.0037856101989746, + "learning_rate": 4.999245469729622e-05, + "loss": 6.9788, + "step": 1316 + }, + { + "epoch": 0.007832572081073366, + "grad_norm": 3.1931207180023193, + "learning_rate": 4.999244321776565e-05, + "loss": 6.9312, + "step": 1317 + }, + { + "epoch": 0.007838519364354363, + "grad_norm": 2.7419891357421875, + "learning_rate": 4.999243172951047e-05, + "loss": 6.7732, + "step": 1318 + }, + { + "epoch": 0.00784446664763536, + "grad_norm": 2.772061824798584, + "learning_rate": 4.99924202325307e-05, + "loss": 6.9576, + "step": 1319 + }, + { + "epoch": 0.007850413930916357, + "grad_norm": 2.9300522804260254, + "learning_rate": 4.999240872682632e-05, + "loss": 6.8366, + "step": 1320 + }, + { + "epoch": 0.007856361214197355, + "grad_norm": 3.4697458744049072, + "learning_rate": 4.9992397212397365e-05, + "loss": 6.9234, + "step": 1321 + }, + { + "epoch": 0.007862308497478352, + "grad_norm": 3.044647693634033, + "learning_rate": 4.999238568924381e-05, + "loss": 6.8406, + "step": 1322 + }, + { + "epoch": 0.007868255780759349, + "grad_norm": 2.4429051876068115, + "learning_rate": 4.999237415736567e-05, + "loss": 6.9815, + "step": 1323 + }, + { + "epoch": 0.007874203064040346, + "grad_norm": 2.6193530559539795, + "learning_rate": 4.999236261676296e-05, + "loss": 7.3867, + "step": 1324 + }, + { + "epoch": 0.007880150347321344, + "grad_norm": 3.9543204307556152, + "learning_rate": 4.999235106743567e-05, + "loss": 7.2391, + "step": 1325 + }, + { + "epoch": 0.007886097630602341, + "grad_norm": 3.12777042388916, + "learning_rate": 4.9992339509383814e-05, + "loss": 7.0976, + "step": 1326 + }, + { + "epoch": 0.007892044913883338, + "grad_norm": 2.4543895721435547, + "learning_rate": 4.999232794260739e-05, + "loss": 7.1865, + "step": 1327 + }, + { + "epoch": 0.007897992197164335, + "grad_norm": 4.254832744598389, + "learning_rate": 4.999231636710639e-05, + "loss": 6.777, + "step": 1328 + }, + { + "epoch": 0.007903939480445333, + "grad_norm": 2.7835497856140137, + "learning_rate": 4.999230478288084e-05, + "loss": 6.8508, + "step": 1329 + }, + { + "epoch": 0.00790988676372633, + "grad_norm": 3.2724666595458984, + "learning_rate": 4.999229318993073e-05, + "loss": 6.7636, + "step": 1330 + }, + { + "epoch": 0.007915834047007327, + "grad_norm": 4.657248020172119, + "learning_rate": 4.9992281588256075e-05, + "loss": 7.3677, + "step": 1331 + }, + { + "epoch": 0.007921781330288324, + "grad_norm": 6.201416492462158, + "learning_rate": 4.999226997785686e-05, + "loss": 7.5804, + "step": 1332 + }, + { + "epoch": 0.007927728613569322, + "grad_norm": 4.955161094665527, + "learning_rate": 4.999225835873312e-05, + "loss": 7.1867, + "step": 1333 + }, + { + "epoch": 0.007933675896850319, + "grad_norm": 3.4105887413024902, + "learning_rate": 4.9992246730884826e-05, + "loss": 7.0948, + "step": 1334 + }, + { + "epoch": 0.007939623180131316, + "grad_norm": 2.514570951461792, + "learning_rate": 4.999223509431201e-05, + "loss": 6.9367, + "step": 1335 + }, + { + "epoch": 0.007945570463412313, + "grad_norm": 3.7689249515533447, + "learning_rate": 4.9992223449014654e-05, + "loss": 7.2209, + "step": 1336 + }, + { + "epoch": 0.007951517746693311, + "grad_norm": 4.997833728790283, + "learning_rate": 4.999221179499277e-05, + "loss": 7.3336, + "step": 1337 + }, + { + "epoch": 0.007957465029974308, + "grad_norm": 5.1314287185668945, + "learning_rate": 4.999220013224637e-05, + "loss": 6.933, + "step": 1338 + }, + { + "epoch": 0.007963412313255305, + "grad_norm": 3.708528518676758, + "learning_rate": 4.9992188460775447e-05, + "loss": 6.9598, + "step": 1339 + }, + { + "epoch": 0.007969359596536302, + "grad_norm": 3.029602289199829, + "learning_rate": 4.999217678058001e-05, + "loss": 7.3674, + "step": 1340 + }, + { + "epoch": 0.007975306879817299, + "grad_norm": 3.000312089920044, + "learning_rate": 4.999216509166006e-05, + "loss": 7.2705, + "step": 1341 + }, + { + "epoch": 0.007981254163098297, + "grad_norm": 4.852355480194092, + "learning_rate": 4.999215339401561e-05, + "loss": 7.1842, + "step": 1342 + }, + { + "epoch": 0.007987201446379294, + "grad_norm": 3.0430521965026855, + "learning_rate": 4.999214168764664e-05, + "loss": 7.5616, + "step": 1343 + }, + { + "epoch": 0.00799314872966029, + "grad_norm": 2.793760061264038, + "learning_rate": 4.999212997255319e-05, + "loss": 7.4867, + "step": 1344 + }, + { + "epoch": 0.007999096012941288, + "grad_norm": 3.516545295715332, + "learning_rate": 4.9992118248735245e-05, + "loss": 7.5857, + "step": 1345 + }, + { + "epoch": 0.008005043296222286, + "grad_norm": 4.272013187408447, + "learning_rate": 4.9992106516192796e-05, + "loss": 7.5686, + "step": 1346 + }, + { + "epoch": 0.008010990579503283, + "grad_norm": 3.176974058151245, + "learning_rate": 4.999209477492587e-05, + "loss": 7.1826, + "step": 1347 + }, + { + "epoch": 0.00801693786278428, + "grad_norm": 3.2615413665771484, + "learning_rate": 4.999208302493447e-05, + "loss": 7.3933, + "step": 1348 + }, + { + "epoch": 0.008022885146065277, + "grad_norm": 2.9548113346099854, + "learning_rate": 4.999207126621858e-05, + "loss": 7.339, + "step": 1349 + }, + { + "epoch": 0.008028832429346275, + "grad_norm": 3.445829153060913, + "learning_rate": 4.999205949877822e-05, + "loss": 7.4223, + "step": 1350 + }, + { + "epoch": 0.008034779712627272, + "grad_norm": 3.471991777420044, + "learning_rate": 4.999204772261338e-05, + "loss": 7.4192, + "step": 1351 + }, + { + "epoch": 0.008040726995908269, + "grad_norm": 3.1682589054107666, + "learning_rate": 4.999203593772409e-05, + "loss": 7.3433, + "step": 1352 + }, + { + "epoch": 0.008046674279189266, + "grad_norm": 4.693798065185547, + "learning_rate": 4.999202414411033e-05, + "loss": 7.1479, + "step": 1353 + }, + { + "epoch": 0.008052621562470264, + "grad_norm": 3.0599937438964844, + "learning_rate": 4.9992012341772114e-05, + "loss": 7.3137, + "step": 1354 + }, + { + "epoch": 0.008058568845751261, + "grad_norm": 2.9557557106018066, + "learning_rate": 4.999200053070945e-05, + "loss": 7.4466, + "step": 1355 + }, + { + "epoch": 0.008064516129032258, + "grad_norm": 2.5595791339874268, + "learning_rate": 4.999198871092233e-05, + "loss": 7.4716, + "step": 1356 + }, + { + "epoch": 0.008070463412313255, + "grad_norm": 2.919729709625244, + "learning_rate": 4.999197688241076e-05, + "loss": 7.0754, + "step": 1357 + }, + { + "epoch": 0.008076410695594253, + "grad_norm": 2.5880625247955322, + "learning_rate": 4.9991965045174763e-05, + "loss": 7.2794, + "step": 1358 + }, + { + "epoch": 0.00808235797887525, + "grad_norm": 2.9933066368103027, + "learning_rate": 4.999195319921432e-05, + "loss": 7.3547, + "step": 1359 + }, + { + "epoch": 0.008088305262156247, + "grad_norm": 5.097862243652344, + "learning_rate": 4.999194134452945e-05, + "loss": 7.1922, + "step": 1360 + }, + { + "epoch": 0.008094252545437244, + "grad_norm": 4.1795830726623535, + "learning_rate": 4.9991929481120146e-05, + "loss": 7.0437, + "step": 1361 + }, + { + "epoch": 0.008100199828718242, + "grad_norm": 3.292961835861206, + "learning_rate": 4.999191760898642e-05, + "loss": 6.8637, + "step": 1362 + }, + { + "epoch": 0.008106147111999239, + "grad_norm": 3.052610397338867, + "learning_rate": 4.999190572812828e-05, + "loss": 7.1675, + "step": 1363 + }, + { + "epoch": 0.008112094395280236, + "grad_norm": 2.975646734237671, + "learning_rate": 4.999189383854571e-05, + "loss": 7.1309, + "step": 1364 + }, + { + "epoch": 0.008118041678561233, + "grad_norm": 2.71195912361145, + "learning_rate": 4.999188194023874e-05, + "loss": 7.2247, + "step": 1365 + }, + { + "epoch": 0.008123988961842231, + "grad_norm": 2.751002311706543, + "learning_rate": 4.9991870033207354e-05, + "loss": 6.8553, + "step": 1366 + }, + { + "epoch": 0.008129936245123228, + "grad_norm": 3.4521234035491943, + "learning_rate": 4.999185811745157e-05, + "loss": 6.8373, + "step": 1367 + }, + { + "epoch": 0.008135883528404225, + "grad_norm": 3.054330348968506, + "learning_rate": 4.999184619297138e-05, + "loss": 6.6982, + "step": 1368 + }, + { + "epoch": 0.008141830811685222, + "grad_norm": 3.513794183731079, + "learning_rate": 4.99918342597668e-05, + "loss": 6.5567, + "step": 1369 + }, + { + "epoch": 0.00814777809496622, + "grad_norm": 3.681838274002075, + "learning_rate": 4.9991822317837836e-05, + "loss": 6.6335, + "step": 1370 + }, + { + "epoch": 0.008153725378247217, + "grad_norm": 4.144393444061279, + "learning_rate": 4.999181036718447e-05, + "loss": 6.5361, + "step": 1371 + }, + { + "epoch": 0.008159672661528214, + "grad_norm": 2.9771196842193604, + "learning_rate": 4.9991798407806736e-05, + "loss": 7.0085, + "step": 1372 + }, + { + "epoch": 0.00816561994480921, + "grad_norm": 3.114884376525879, + "learning_rate": 4.9991786439704615e-05, + "loss": 7.1498, + "step": 1373 + }, + { + "epoch": 0.008171567228090208, + "grad_norm": 2.76042103767395, + "learning_rate": 4.9991774462878115e-05, + "loss": 6.8462, + "step": 1374 + }, + { + "epoch": 0.008177514511371206, + "grad_norm": 3.257528066635132, + "learning_rate": 4.999176247732725e-05, + "loss": 6.4595, + "step": 1375 + }, + { + "epoch": 0.008183461794652203, + "grad_norm": 3.377774238586426, + "learning_rate": 4.999175048305202e-05, + "loss": 6.3131, + "step": 1376 + }, + { + "epoch": 0.0081894090779332, + "grad_norm": 3.029477834701538, + "learning_rate": 4.999173848005243e-05, + "loss": 6.7182, + "step": 1377 + }, + { + "epoch": 0.008195356361214197, + "grad_norm": 3.0353076457977295, + "learning_rate": 4.9991726468328476e-05, + "loss": 7.009, + "step": 1378 + }, + { + "epoch": 0.008201303644495195, + "grad_norm": 2.465014934539795, + "learning_rate": 4.999171444788017e-05, + "loss": 7.6277, + "step": 1379 + }, + { + "epoch": 0.008207250927776192, + "grad_norm": 3.025954484939575, + "learning_rate": 4.999170241870752e-05, + "loss": 7.2815, + "step": 1380 + }, + { + "epoch": 0.008213198211057189, + "grad_norm": 3.8414018154144287, + "learning_rate": 4.999169038081052e-05, + "loss": 7.2238, + "step": 1381 + }, + { + "epoch": 0.008219145494338186, + "grad_norm": 3.2927470207214355, + "learning_rate": 4.999167833418918e-05, + "loss": 7.1505, + "step": 1382 + }, + { + "epoch": 0.008225092777619184, + "grad_norm": 2.6132330894470215, + "learning_rate": 4.999166627884351e-05, + "loss": 7.2499, + "step": 1383 + }, + { + "epoch": 0.008231040060900181, + "grad_norm": 2.523366689682007, + "learning_rate": 4.9991654214773497e-05, + "loss": 6.9812, + "step": 1384 + }, + { + "epoch": 0.008236987344181178, + "grad_norm": 3.977471351623535, + "learning_rate": 4.9991642141979154e-05, + "loss": 7.3196, + "step": 1385 + }, + { + "epoch": 0.008242934627462175, + "grad_norm": 2.731952428817749, + "learning_rate": 4.99916300604605e-05, + "loss": 7.1014, + "step": 1386 + }, + { + "epoch": 0.008248881910743173, + "grad_norm": 2.6128756999969482, + "learning_rate": 4.999161797021752e-05, + "loss": 7.0235, + "step": 1387 + }, + { + "epoch": 0.00825482919402417, + "grad_norm": 2.263430595397949, + "learning_rate": 4.999160587125023e-05, + "loss": 7.0183, + "step": 1388 + }, + { + "epoch": 0.008260776477305167, + "grad_norm": 2.799994707107544, + "learning_rate": 4.9991593763558614e-05, + "loss": 6.9553, + "step": 1389 + }, + { + "epoch": 0.008266723760586164, + "grad_norm": 2.5443058013916016, + "learning_rate": 4.99915816471427e-05, + "loss": 7.2302, + "step": 1390 + }, + { + "epoch": 0.008272671043867162, + "grad_norm": 2.304185152053833, + "learning_rate": 4.999156952200248e-05, + "loss": 7.2589, + "step": 1391 + }, + { + "epoch": 0.008278618327148159, + "grad_norm": 2.1639649868011475, + "learning_rate": 4.999155738813797e-05, + "loss": 7.0067, + "step": 1392 + }, + { + "epoch": 0.008284565610429156, + "grad_norm": 2.276514768600464, + "learning_rate": 4.999154524554915e-05, + "loss": 7.2721, + "step": 1393 + }, + { + "epoch": 0.008290512893710153, + "grad_norm": 2.212200880050659, + "learning_rate": 4.9991533094236055e-05, + "loss": 7.1183, + "step": 1394 + }, + { + "epoch": 0.008296460176991151, + "grad_norm": 2.5289459228515625, + "learning_rate": 4.999152093419867e-05, + "loss": 7.0289, + "step": 1395 + }, + { + "epoch": 0.008302407460272148, + "grad_norm": 2.5915603637695312, + "learning_rate": 4.999150876543699e-05, + "loss": 6.7497, + "step": 1396 + }, + { + "epoch": 0.008308354743553145, + "grad_norm": 2.680513858795166, + "learning_rate": 4.999149658795105e-05, + "loss": 6.7139, + "step": 1397 + }, + { + "epoch": 0.008314302026834142, + "grad_norm": 2.65744948387146, + "learning_rate": 4.999148440174083e-05, + "loss": 6.6151, + "step": 1398 + }, + { + "epoch": 0.00832024931011514, + "grad_norm": 3.8028745651245117, + "learning_rate": 4.9991472206806334e-05, + "loss": 7.1992, + "step": 1399 + }, + { + "epoch": 0.008326196593396137, + "grad_norm": 2.8436119556427, + "learning_rate": 4.999146000314758e-05, + "loss": 7.165, + "step": 1400 + }, + { + "epoch": 0.008332143876677134, + "grad_norm": 2.6658496856689453, + "learning_rate": 4.999144779076457e-05, + "loss": 7.5945, + "step": 1401 + }, + { + "epoch": 0.00833809115995813, + "grad_norm": 2.909703016281128, + "learning_rate": 4.99914355696573e-05, + "loss": 7.6378, + "step": 1402 + }, + { + "epoch": 0.00834403844323913, + "grad_norm": 2.5827598571777344, + "learning_rate": 4.9991423339825776e-05, + "loss": 7.5441, + "step": 1403 + }, + { + "epoch": 0.008349985726520126, + "grad_norm": 3.0283706188201904, + "learning_rate": 4.999141110127e-05, + "loss": 7.1162, + "step": 1404 + }, + { + "epoch": 0.008355933009801123, + "grad_norm": 3.11690354347229, + "learning_rate": 4.999139885398999e-05, + "loss": 6.5123, + "step": 1405 + }, + { + "epoch": 0.00836188029308212, + "grad_norm": 2.6188690662384033, + "learning_rate": 4.999138659798574e-05, + "loss": 7.6384, + "step": 1406 + }, + { + "epoch": 0.008367827576363117, + "grad_norm": 3.4412481784820557, + "learning_rate": 4.999137433325725e-05, + "loss": 7.4067, + "step": 1407 + }, + { + "epoch": 0.008373774859644115, + "grad_norm": 3.1690893173217773, + "learning_rate": 4.999136205980454e-05, + "loss": 7.3937, + "step": 1408 + }, + { + "epoch": 0.008379722142925112, + "grad_norm": 2.1589877605438232, + "learning_rate": 4.999134977762759e-05, + "loss": 7.454, + "step": 1409 + }, + { + "epoch": 0.008385669426206109, + "grad_norm": 2.485901117324829, + "learning_rate": 4.999133748672642e-05, + "loss": 7.3421, + "step": 1410 + }, + { + "epoch": 0.008391616709487106, + "grad_norm": 2.543128252029419, + "learning_rate": 4.999132518710104e-05, + "loss": 7.3162, + "step": 1411 + }, + { + "epoch": 0.008397563992768104, + "grad_norm": 2.8048489093780518, + "learning_rate": 4.999131287875144e-05, + "loss": 7.297, + "step": 1412 + }, + { + "epoch": 0.008403511276049101, + "grad_norm": 3.0391035079956055, + "learning_rate": 4.9991300561677634e-05, + "loss": 7.2409, + "step": 1413 + }, + { + "epoch": 0.008409458559330098, + "grad_norm": 2.3196053504943848, + "learning_rate": 4.999128823587962e-05, + "loss": 7.1358, + "step": 1414 + }, + { + "epoch": 0.008415405842611095, + "grad_norm": 3.1876983642578125, + "learning_rate": 4.999127590135741e-05, + "loss": 7.1501, + "step": 1415 + }, + { + "epoch": 0.008421353125892093, + "grad_norm": 3.6832327842712402, + "learning_rate": 4.9991263558111e-05, + "loss": 7.181, + "step": 1416 + }, + { + "epoch": 0.00842730040917309, + "grad_norm": 3.7491936683654785, + "learning_rate": 4.99912512061404e-05, + "loss": 6.9669, + "step": 1417 + }, + { + "epoch": 0.008433247692454087, + "grad_norm": 3.1583478450775146, + "learning_rate": 4.9991238845445615e-05, + "loss": 7.2155, + "step": 1418 + }, + { + "epoch": 0.008439194975735084, + "grad_norm": 3.11611008644104, + "learning_rate": 4.999122647602664e-05, + "loss": 7.164, + "step": 1419 + }, + { + "epoch": 0.008445142259016082, + "grad_norm": 6.127118110656738, + "learning_rate": 4.9991214097883495e-05, + "loss": 7.232, + "step": 1420 + }, + { + "epoch": 0.008451089542297079, + "grad_norm": 4.736495494842529, + "learning_rate": 4.9991201711016166e-05, + "loss": 7.3685, + "step": 1421 + }, + { + "epoch": 0.008457036825578076, + "grad_norm": 2.9656684398651123, + "learning_rate": 4.999118931542467e-05, + "loss": 7.2658, + "step": 1422 + }, + { + "epoch": 0.008462984108859073, + "grad_norm": 2.5959243774414062, + "learning_rate": 4.999117691110901e-05, + "loss": 7.0908, + "step": 1423 + }, + { + "epoch": 0.008468931392140071, + "grad_norm": 4.546379089355469, + "learning_rate": 4.999116449806919e-05, + "loss": 7.1343, + "step": 1424 + }, + { + "epoch": 0.008474878675421068, + "grad_norm": 3.6856796741485596, + "learning_rate": 4.9991152076305206e-05, + "loss": 6.9205, + "step": 1425 + }, + { + "epoch": 0.008480825958702065, + "grad_norm": 3.293973922729492, + "learning_rate": 4.9991139645817075e-05, + "loss": 6.9954, + "step": 1426 + }, + { + "epoch": 0.008486773241983062, + "grad_norm": 3.2511162757873535, + "learning_rate": 4.999112720660479e-05, + "loss": 6.7661, + "step": 1427 + }, + { + "epoch": 0.00849272052526406, + "grad_norm": 3.990840196609497, + "learning_rate": 4.9991114758668364e-05, + "loss": 6.7402, + "step": 1428 + }, + { + "epoch": 0.008498667808545057, + "grad_norm": 3.306809186935425, + "learning_rate": 4.9991102302007804e-05, + "loss": 6.6801, + "step": 1429 + }, + { + "epoch": 0.008504615091826054, + "grad_norm": 5.208675384521484, + "learning_rate": 4.99910898366231e-05, + "loss": 7.0128, + "step": 1430 + }, + { + "epoch": 0.00851056237510705, + "grad_norm": 4.131346225738525, + "learning_rate": 4.9991077362514266e-05, + "loss": 7.0992, + "step": 1431 + }, + { + "epoch": 0.00851650965838805, + "grad_norm": 2.60927152633667, + "learning_rate": 4.99910648796813e-05, + "loss": 7.2731, + "step": 1432 + }, + { + "epoch": 0.008522456941669046, + "grad_norm": 5.654631614685059, + "learning_rate": 4.9991052388124224e-05, + "loss": 6.6105, + "step": 1433 + }, + { + "epoch": 0.008528404224950043, + "grad_norm": 6.108455657958984, + "learning_rate": 4.9991039887843025e-05, + "loss": 6.3548, + "step": 1434 + }, + { + "epoch": 0.00853435150823104, + "grad_norm": 3.758371591567993, + "learning_rate": 4.9991027378837705e-05, + "loss": 6.6171, + "step": 1435 + }, + { + "epoch": 0.008540298791512036, + "grad_norm": 2.1995320320129395, + "learning_rate": 4.9991014861108285e-05, + "loss": 6.5987, + "step": 1436 + }, + { + "epoch": 0.008546246074793035, + "grad_norm": 2.3778254985809326, + "learning_rate": 4.999100233465476e-05, + "loss": 6.8067, + "step": 1437 + }, + { + "epoch": 0.008552193358074032, + "grad_norm": 2.521928310394287, + "learning_rate": 4.999098979947713e-05, + "loss": 6.7756, + "step": 1438 + }, + { + "epoch": 0.008558140641355029, + "grad_norm": 2.109605073928833, + "learning_rate": 4.99909772555754e-05, + "loss": 6.7091, + "step": 1439 + }, + { + "epoch": 0.008564087924636025, + "grad_norm": 2.55838680267334, + "learning_rate": 4.9990964702949585e-05, + "loss": 6.8989, + "step": 1440 + }, + { + "epoch": 0.008570035207917024, + "grad_norm": 2.4499685764312744, + "learning_rate": 4.9990952141599675e-05, + "loss": 6.6241, + "step": 1441 + }, + { + "epoch": 0.00857598249119802, + "grad_norm": 2.265371322631836, + "learning_rate": 4.9990939571525685e-05, + "loss": 7.6681, + "step": 1442 + }, + { + "epoch": 0.008581929774479018, + "grad_norm": 2.4496965408325195, + "learning_rate": 4.999092699272762e-05, + "loss": 6.8177, + "step": 1443 + }, + { + "epoch": 0.008587877057760014, + "grad_norm": 2.5555005073547363, + "learning_rate": 4.999091440520548e-05, + "loss": 6.6402, + "step": 1444 + }, + { + "epoch": 0.008593824341041013, + "grad_norm": 2.042592763900757, + "learning_rate": 4.999090180895927e-05, + "loss": 6.6114, + "step": 1445 + }, + { + "epoch": 0.00859977162432201, + "grad_norm": 2.3100671768188477, + "learning_rate": 4.9990889203988986e-05, + "loss": 6.712, + "step": 1446 + }, + { + "epoch": 0.008605718907603007, + "grad_norm": 2.7600841522216797, + "learning_rate": 4.999087659029465e-05, + "loss": 6.6531, + "step": 1447 + }, + { + "epoch": 0.008611666190884004, + "grad_norm": 3.292684316635132, + "learning_rate": 4.999086396787625e-05, + "loss": 6.9896, + "step": 1448 + }, + { + "epoch": 0.008617613474165002, + "grad_norm": 2.7579386234283447, + "learning_rate": 4.999085133673381e-05, + "loss": 7.1559, + "step": 1449 + }, + { + "epoch": 0.008623560757445999, + "grad_norm": 2.7898707389831543, + "learning_rate": 4.999083869686731e-05, + "loss": 6.9861, + "step": 1450 + }, + { + "epoch": 0.008629508040726996, + "grad_norm": 3.439809799194336, + "learning_rate": 4.999082604827677e-05, + "loss": 6.759, + "step": 1451 + }, + { + "epoch": 0.008635455324007993, + "grad_norm": 2.924859046936035, + "learning_rate": 4.999081339096219e-05, + "loss": 6.5438, + "step": 1452 + }, + { + "epoch": 0.008641402607288991, + "grad_norm": 3.363886594772339, + "learning_rate": 4.999080072492358e-05, + "loss": 7.0477, + "step": 1453 + }, + { + "epoch": 0.008647349890569988, + "grad_norm": 2.924988031387329, + "learning_rate": 4.999078805016093e-05, + "loss": 6.9228, + "step": 1454 + }, + { + "epoch": 0.008653297173850985, + "grad_norm": 3.2283847332000732, + "learning_rate": 4.999077536667426e-05, + "loss": 6.8763, + "step": 1455 + }, + { + "epoch": 0.008659244457131982, + "grad_norm": 2.635744094848633, + "learning_rate": 4.999076267446357e-05, + "loss": 6.6438, + "step": 1456 + }, + { + "epoch": 0.00866519174041298, + "grad_norm": 2.829801559448242, + "learning_rate": 4.9990749973528864e-05, + "loss": 6.9466, + "step": 1457 + }, + { + "epoch": 0.008671139023693977, + "grad_norm": 3.3631057739257812, + "learning_rate": 4.999073726387014e-05, + "loss": 7.2652, + "step": 1458 + }, + { + "epoch": 0.008677086306974974, + "grad_norm": 3.9970719814300537, + "learning_rate": 4.999072454548741e-05, + "loss": 7.053, + "step": 1459 + }, + { + "epoch": 0.00868303359025597, + "grad_norm": 3.322787046432495, + "learning_rate": 4.9990711818380674e-05, + "loss": 7.0272, + "step": 1460 + }, + { + "epoch": 0.008688980873536969, + "grad_norm": 2.7370798587799072, + "learning_rate": 4.999069908254995e-05, + "loss": 6.8545, + "step": 1461 + }, + { + "epoch": 0.008694928156817966, + "grad_norm": 2.845191240310669, + "learning_rate": 4.999068633799522e-05, + "loss": 6.9393, + "step": 1462 + }, + { + "epoch": 0.008700875440098963, + "grad_norm": 3.064960241317749, + "learning_rate": 4.99906735847165e-05, + "loss": 6.7734, + "step": 1463 + }, + { + "epoch": 0.00870682272337996, + "grad_norm": 7.113090515136719, + "learning_rate": 4.99906608227138e-05, + "loss": 7.0532, + "step": 1464 + }, + { + "epoch": 0.008712770006660958, + "grad_norm": 5.90821647644043, + "learning_rate": 4.999064805198711e-05, + "loss": 7.1494, + "step": 1465 + }, + { + "epoch": 0.008718717289941955, + "grad_norm": 3.9366238117218018, + "learning_rate": 4.9990635272536454e-05, + "loss": 7.623, + "step": 1466 + }, + { + "epoch": 0.008724664573222952, + "grad_norm": 3.1239330768585205, + "learning_rate": 4.9990622484361814e-05, + "loss": 7.4938, + "step": 1467 + }, + { + "epoch": 0.008730611856503949, + "grad_norm": 2.6688928604125977, + "learning_rate": 4.9990609687463216e-05, + "loss": 7.3445, + "step": 1468 + }, + { + "epoch": 0.008736559139784945, + "grad_norm": 3.047154664993286, + "learning_rate": 4.9990596881840646e-05, + "loss": 7.158, + "step": 1469 + }, + { + "epoch": 0.008742506423065944, + "grad_norm": 2.5230467319488525, + "learning_rate": 4.999058406749412e-05, + "loss": 7.1368, + "step": 1470 + }, + { + "epoch": 0.00874845370634694, + "grad_norm": 2.729705333709717, + "learning_rate": 4.999057124442364e-05, + "loss": 7.0144, + "step": 1471 + }, + { + "epoch": 0.008754400989627938, + "grad_norm": 2.5796756744384766, + "learning_rate": 4.999055841262921e-05, + "loss": 7.2157, + "step": 1472 + }, + { + "epoch": 0.008760348272908934, + "grad_norm": 3.458691358566284, + "learning_rate": 4.999054557211084e-05, + "loss": 6.7631, + "step": 1473 + }, + { + "epoch": 0.008766295556189933, + "grad_norm": 2.7262747287750244, + "learning_rate": 4.999053272286851e-05, + "loss": 6.9784, + "step": 1474 + }, + { + "epoch": 0.00877224283947093, + "grad_norm": 2.6003808975219727, + "learning_rate": 4.9990519864902267e-05, + "loss": 7.1369, + "step": 1475 + }, + { + "epoch": 0.008778190122751927, + "grad_norm": 3.4032137393951416, + "learning_rate": 4.999050699821207e-05, + "loss": 6.9569, + "step": 1476 + }, + { + "epoch": 0.008784137406032923, + "grad_norm": 4.099828243255615, + "learning_rate": 4.9990494122797957e-05, + "loss": 6.9977, + "step": 1477 + }, + { + "epoch": 0.008790084689313922, + "grad_norm": 3.1837944984436035, + "learning_rate": 4.999048123865992e-05, + "loss": 7.1331, + "step": 1478 + }, + { + "epoch": 0.008796031972594919, + "grad_norm": 2.618847131729126, + "learning_rate": 4.999046834579796e-05, + "loss": 7.0043, + "step": 1479 + }, + { + "epoch": 0.008801979255875916, + "grad_norm": 3.0132501125335693, + "learning_rate": 4.999045544421209e-05, + "loss": 6.7836, + "step": 1480 + }, + { + "epoch": 0.008807926539156912, + "grad_norm": 2.4608371257781982, + "learning_rate": 4.999044253390231e-05, + "loss": 7.0721, + "step": 1481 + }, + { + "epoch": 0.008813873822437911, + "grad_norm": 3.280649423599243, + "learning_rate": 4.999042961486863e-05, + "loss": 7.959, + "step": 1482 + }, + { + "epoch": 0.008819821105718908, + "grad_norm": 2.7038395404815674, + "learning_rate": 4.999041668711104e-05, + "loss": 7.1256, + "step": 1483 + }, + { + "epoch": 0.008825768388999905, + "grad_norm": 2.1451892852783203, + "learning_rate": 4.9990403750629556e-05, + "loss": 7.2219, + "step": 1484 + }, + { + "epoch": 0.008831715672280901, + "grad_norm": 2.3731601238250732, + "learning_rate": 4.999039080542418e-05, + "loss": 7.2023, + "step": 1485 + }, + { + "epoch": 0.0088376629555619, + "grad_norm": 2.444089651107788, + "learning_rate": 4.999037785149492e-05, + "loss": 7.0988, + "step": 1486 + }, + { + "epoch": 0.008843610238842897, + "grad_norm": 2.644712448120117, + "learning_rate": 4.999036488884177e-05, + "loss": 7.1916, + "step": 1487 + }, + { + "epoch": 0.008849557522123894, + "grad_norm": 5.477145671844482, + "learning_rate": 4.999035191746475e-05, + "loss": 6.7256, + "step": 1488 + }, + { + "epoch": 0.00885550480540489, + "grad_norm": 2.2691709995269775, + "learning_rate": 4.999033893736386e-05, + "loss": 7.2505, + "step": 1489 + }, + { + "epoch": 0.008861452088685889, + "grad_norm": 2.5880343914031982, + "learning_rate": 4.999032594853909e-05, + "loss": 6.9549, + "step": 1490 + }, + { + "epoch": 0.008867399371966886, + "grad_norm": 2.2748520374298096, + "learning_rate": 4.999031295099046e-05, + "loss": 6.8269, + "step": 1491 + }, + { + "epoch": 0.008873346655247883, + "grad_norm": 2.262706995010376, + "learning_rate": 4.999029994471797e-05, + "loss": 6.8876, + "step": 1492 + }, + { + "epoch": 0.00887929393852888, + "grad_norm": 2.264256238937378, + "learning_rate": 4.999028692972162e-05, + "loss": 7.1545, + "step": 1493 + }, + { + "epoch": 0.008885241221809878, + "grad_norm": 2.489259719848633, + "learning_rate": 4.9990273906001424e-05, + "loss": 7.194, + "step": 1494 + }, + { + "epoch": 0.008891188505090875, + "grad_norm": 2.7545981407165527, + "learning_rate": 4.999026087355738e-05, + "loss": 7.0148, + "step": 1495 + }, + { + "epoch": 0.008897135788371872, + "grad_norm": 2.6869328022003174, + "learning_rate": 4.999024783238949e-05, + "loss": 7.2535, + "step": 1496 + }, + { + "epoch": 0.008903083071652869, + "grad_norm": 2.5216503143310547, + "learning_rate": 4.999023478249777e-05, + "loss": 6.4351, + "step": 1497 + }, + { + "epoch": 0.008909030354933865, + "grad_norm": 2.5090575218200684, + "learning_rate": 4.9990221723882216e-05, + "loss": 7.3068, + "step": 1498 + }, + { + "epoch": 0.008914977638214864, + "grad_norm": 2.5026490688323975, + "learning_rate": 4.999020865654283e-05, + "loss": 7.1274, + "step": 1499 + }, + { + "epoch": 0.00892092492149586, + "grad_norm": 2.8030898571014404, + "learning_rate": 4.999019558047963e-05, + "loss": 7.0016, + "step": 1500 + }, + { + "epoch": 0.008926872204776858, + "grad_norm": 2.533383846282959, + "learning_rate": 4.99901824956926e-05, + "loss": 6.8991, + "step": 1501 + }, + { + "epoch": 0.008932819488057854, + "grad_norm": 2.5584118366241455, + "learning_rate": 4.999016940218175e-05, + "loss": 6.9237, + "step": 1502 + }, + { + "epoch": 0.008938766771338853, + "grad_norm": 2.778592586517334, + "learning_rate": 4.99901562999471e-05, + "loss": 7.0941, + "step": 1503 + }, + { + "epoch": 0.00894471405461985, + "grad_norm": 4.023860931396484, + "learning_rate": 4.999014318898865e-05, + "loss": 6.5188, + "step": 1504 + }, + { + "epoch": 0.008950661337900847, + "grad_norm": 3.018118143081665, + "learning_rate": 4.999013006930639e-05, + "loss": 7.0557, + "step": 1505 + }, + { + "epoch": 0.008956608621181843, + "grad_norm": 2.802061080932617, + "learning_rate": 4.999011694090033e-05, + "loss": 7.2645, + "step": 1506 + }, + { + "epoch": 0.008962555904462842, + "grad_norm": 2.3782076835632324, + "learning_rate": 4.999010380377049e-05, + "loss": 7.3707, + "step": 1507 + }, + { + "epoch": 0.008968503187743839, + "grad_norm": 2.451878309249878, + "learning_rate": 4.999009065791686e-05, + "loss": 7.2783, + "step": 1508 + }, + { + "epoch": 0.008974450471024836, + "grad_norm": 3.85514235496521, + "learning_rate": 4.999007750333945e-05, + "loss": 6.3543, + "step": 1509 + }, + { + "epoch": 0.008980397754305832, + "grad_norm": 2.617177963256836, + "learning_rate": 4.999006434003825e-05, + "loss": 7.0175, + "step": 1510 + }, + { + "epoch": 0.008986345037586831, + "grad_norm": 2.6909587383270264, + "learning_rate": 4.999005116801329e-05, + "loss": 7.3282, + "step": 1511 + }, + { + "epoch": 0.008992292320867828, + "grad_norm": 2.332165241241455, + "learning_rate": 4.9990037987264546e-05, + "loss": 7.0993, + "step": 1512 + }, + { + "epoch": 0.008998239604148825, + "grad_norm": 2.5398497581481934, + "learning_rate": 4.9990024797792055e-05, + "loss": 7.2867, + "step": 1513 + }, + { + "epoch": 0.009004186887429821, + "grad_norm": 2.432264566421509, + "learning_rate": 4.9990011599595796e-05, + "loss": 7.1619, + "step": 1514 + }, + { + "epoch": 0.00901013417071082, + "grad_norm": 2.2937278747558594, + "learning_rate": 4.998999839267578e-05, + "loss": 7.1138, + "step": 1515 + }, + { + "epoch": 0.009016081453991817, + "grad_norm": 2.3305680751800537, + "learning_rate": 4.998998517703202e-05, + "loss": 7.0569, + "step": 1516 + }, + { + "epoch": 0.009022028737272814, + "grad_norm": 3.0785884857177734, + "learning_rate": 4.998997195266451e-05, + "loss": 7.0922, + "step": 1517 + }, + { + "epoch": 0.00902797602055381, + "grad_norm": 2.354283571243286, + "learning_rate": 4.998995871957326e-05, + "loss": 7.0024, + "step": 1518 + }, + { + "epoch": 0.009033923303834809, + "grad_norm": 2.488194465637207, + "learning_rate": 4.998994547775827e-05, + "loss": 7.0045, + "step": 1519 + }, + { + "epoch": 0.009039870587115806, + "grad_norm": 2.6196579933166504, + "learning_rate": 4.998993222721956e-05, + "loss": 6.9416, + "step": 1520 + }, + { + "epoch": 0.009045817870396803, + "grad_norm": 2.6524155139923096, + "learning_rate": 4.998991896795711e-05, + "loss": 6.9562, + "step": 1521 + }, + { + "epoch": 0.0090517651536778, + "grad_norm": 3.308661460876465, + "learning_rate": 4.998990569997094e-05, + "loss": 6.8602, + "step": 1522 + }, + { + "epoch": 0.009057712436958798, + "grad_norm": 2.7995994091033936, + "learning_rate": 4.9989892423261055e-05, + "loss": 7.7049, + "step": 1523 + }, + { + "epoch": 0.009063659720239795, + "grad_norm": 2.547189235687256, + "learning_rate": 4.9989879137827456e-05, + "loss": 7.0254, + "step": 1524 + }, + { + "epoch": 0.009069607003520792, + "grad_norm": 2.796393871307373, + "learning_rate": 4.998986584367015e-05, + "loss": 7.0124, + "step": 1525 + }, + { + "epoch": 0.009075554286801788, + "grad_norm": 2.9441823959350586, + "learning_rate": 4.9989852540789136e-05, + "loss": 7.0174, + "step": 1526 + }, + { + "epoch": 0.009081501570082787, + "grad_norm": 2.509150743484497, + "learning_rate": 4.998983922918443e-05, + "loss": 6.9405, + "step": 1527 + }, + { + "epoch": 0.009087448853363784, + "grad_norm": 2.3686184883117676, + "learning_rate": 4.998982590885603e-05, + "loss": 6.794, + "step": 1528 + }, + { + "epoch": 0.00909339613664478, + "grad_norm": 2.937530755996704, + "learning_rate": 4.998981257980393e-05, + "loss": 6.9716, + "step": 1529 + }, + { + "epoch": 0.009099343419925777, + "grad_norm": 2.493178606033325, + "learning_rate": 4.998979924202814e-05, + "loss": 6.5986, + "step": 1530 + }, + { + "epoch": 0.009105290703206774, + "grad_norm": 2.071356773376465, + "learning_rate": 4.9989785895528686e-05, + "loss": 6.536, + "step": 1531 + }, + { + "epoch": 0.009111237986487773, + "grad_norm": 1.9372920989990234, + "learning_rate": 4.998977254030554e-05, + "loss": 6.4036, + "step": 1532 + }, + { + "epoch": 0.00911718526976877, + "grad_norm": 2.3329098224639893, + "learning_rate": 4.998975917635873e-05, + "loss": 6.4861, + "step": 1533 + }, + { + "epoch": 0.009123132553049767, + "grad_norm": 2.9681191444396973, + "learning_rate": 4.998974580368826e-05, + "loss": 6.939, + "step": 1534 + }, + { + "epoch": 0.009129079836330763, + "grad_norm": 2.5993690490722656, + "learning_rate": 4.9989732422294125e-05, + "loss": 7.0809, + "step": 1535 + }, + { + "epoch": 0.009135027119611762, + "grad_norm": 2.827244997024536, + "learning_rate": 4.998971903217633e-05, + "loss": 7.597, + "step": 1536 + }, + { + "epoch": 0.009140974402892759, + "grad_norm": 2.712247848510742, + "learning_rate": 4.9989705633334884e-05, + "loss": 7.3695, + "step": 1537 + }, + { + "epoch": 0.009146921686173756, + "grad_norm": 1.7997468709945679, + "learning_rate": 4.998969222576978e-05, + "loss": 7.6497, + "step": 1538 + }, + { + "epoch": 0.009152868969454752, + "grad_norm": 2.234931230545044, + "learning_rate": 4.998967880948104e-05, + "loss": 7.1636, + "step": 1539 + }, + { + "epoch": 0.009158816252735751, + "grad_norm": 2.150766611099243, + "learning_rate": 4.9989665384468666e-05, + "loss": 6.8621, + "step": 1540 + }, + { + "epoch": 0.009164763536016748, + "grad_norm": 2.9628021717071533, + "learning_rate": 4.998965195073265e-05, + "loss": 6.5059, + "step": 1541 + }, + { + "epoch": 0.009170710819297745, + "grad_norm": 2.720155715942383, + "learning_rate": 4.998963850827301e-05, + "loss": 7.0129, + "step": 1542 + }, + { + "epoch": 0.009176658102578741, + "grad_norm": 2.994684934616089, + "learning_rate": 4.9989625057089744e-05, + "loss": 7.3621, + "step": 1543 + }, + { + "epoch": 0.00918260538585974, + "grad_norm": 2.5991618633270264, + "learning_rate": 4.998961159718286e-05, + "loss": 6.7278, + "step": 1544 + }, + { + "epoch": 0.009188552669140737, + "grad_norm": 2.406353712081909, + "learning_rate": 4.9989598128552355e-05, + "loss": 7.5987, + "step": 1545 + }, + { + "epoch": 0.009194499952421734, + "grad_norm": 3.1308467388153076, + "learning_rate": 4.998958465119824e-05, + "loss": 7.1947, + "step": 1546 + }, + { + "epoch": 0.00920044723570273, + "grad_norm": 2.5381908416748047, + "learning_rate": 4.998957116512053e-05, + "loss": 6.8415, + "step": 1547 + }, + { + "epoch": 0.009206394518983729, + "grad_norm": 2.666410446166992, + "learning_rate": 4.998955767031921e-05, + "loss": 6.9052, + "step": 1548 + }, + { + "epoch": 0.009212341802264726, + "grad_norm": 2.156036138534546, + "learning_rate": 4.9989544166794286e-05, + "loss": 7.6604, + "step": 1549 + }, + { + "epoch": 0.009218289085545723, + "grad_norm": 2.620114803314209, + "learning_rate": 4.998953065454578e-05, + "loss": 6.5475, + "step": 1550 + }, + { + "epoch": 0.00922423636882672, + "grad_norm": 3.2780802249908447, + "learning_rate": 4.9989517133573694e-05, + "loss": 7.0572, + "step": 1551 + }, + { + "epoch": 0.009230183652107718, + "grad_norm": 3.6108100414276123, + "learning_rate": 4.998950360387802e-05, + "loss": 7.0149, + "step": 1552 + }, + { + "epoch": 0.009236130935388715, + "grad_norm": 3.4336259365081787, + "learning_rate": 4.998949006545876e-05, + "loss": 7.2436, + "step": 1553 + }, + { + "epoch": 0.009242078218669712, + "grad_norm": 3.271630048751831, + "learning_rate": 4.9989476518315934e-05, + "loss": 7.3807, + "step": 1554 + }, + { + "epoch": 0.009248025501950708, + "grad_norm": 3.0718438625335693, + "learning_rate": 4.998946296244954e-05, + "loss": 7.2313, + "step": 1555 + }, + { + "epoch": 0.009253972785231707, + "grad_norm": 2.2010579109191895, + "learning_rate": 4.9989449397859575e-05, + "loss": 7.4269, + "step": 1556 + }, + { + "epoch": 0.009259920068512704, + "grad_norm": 2.9805495738983154, + "learning_rate": 4.998943582454607e-05, + "loss": 7.2107, + "step": 1557 + }, + { + "epoch": 0.0092658673517937, + "grad_norm": 2.8313159942626953, + "learning_rate": 4.9989422242508995e-05, + "loss": 7.0453, + "step": 1558 + }, + { + "epoch": 0.009271814635074697, + "grad_norm": 2.7660701274871826, + "learning_rate": 4.998940865174837e-05, + "loss": 7.2205, + "step": 1559 + }, + { + "epoch": 0.009277761918355694, + "grad_norm": 3.808122396469116, + "learning_rate": 4.998939505226421e-05, + "loss": 6.9966, + "step": 1560 + }, + { + "epoch": 0.009283709201636693, + "grad_norm": 3.188976526260376, + "learning_rate": 4.99893814440565e-05, + "loss": 7.0049, + "step": 1561 + }, + { + "epoch": 0.00928965648491769, + "grad_norm": 2.5491533279418945, + "learning_rate": 4.998936782712526e-05, + "loss": 7.0451, + "step": 1562 + }, + { + "epoch": 0.009295603768198686, + "grad_norm": 3.4607698917388916, + "learning_rate": 4.99893542014705e-05, + "loss": 7.0304, + "step": 1563 + }, + { + "epoch": 0.009301551051479683, + "grad_norm": 3.4761910438537598, + "learning_rate": 4.99893405670922e-05, + "loss": 6.9787, + "step": 1564 + }, + { + "epoch": 0.009307498334760682, + "grad_norm": 3.15938138961792, + "learning_rate": 4.998932692399039e-05, + "loss": 7.0203, + "step": 1565 + }, + { + "epoch": 0.009313445618041679, + "grad_norm": 2.600304126739502, + "learning_rate": 4.9989313272165064e-05, + "loss": 7.0782, + "step": 1566 + }, + { + "epoch": 0.009319392901322675, + "grad_norm": 2.54158616065979, + "learning_rate": 4.9989299611616216e-05, + "loss": 6.8354, + "step": 1567 + }, + { + "epoch": 0.009325340184603672, + "grad_norm": 3.4649429321289062, + "learning_rate": 4.9989285942343864e-05, + "loss": 6.8238, + "step": 1568 + }, + { + "epoch": 0.00933128746788467, + "grad_norm": 2.522388458251953, + "learning_rate": 4.998927226434802e-05, + "loss": 6.9544, + "step": 1569 + }, + { + "epoch": 0.009337234751165668, + "grad_norm": 4.074129581451416, + "learning_rate": 4.9989258577628675e-05, + "loss": 6.7229, + "step": 1570 + }, + { + "epoch": 0.009343182034446664, + "grad_norm": 3.395894765853882, + "learning_rate": 4.998924488218584e-05, + "loss": 7.1372, + "step": 1571 + }, + { + "epoch": 0.009349129317727661, + "grad_norm": 2.9850378036499023, + "learning_rate": 4.9989231178019516e-05, + "loss": 6.8966, + "step": 1572 + }, + { + "epoch": 0.00935507660100866, + "grad_norm": 3.1391544342041016, + "learning_rate": 4.9989217465129704e-05, + "loss": 6.6744, + "step": 1573 + }, + { + "epoch": 0.009361023884289657, + "grad_norm": 3.8727803230285645, + "learning_rate": 4.9989203743516414e-05, + "loss": 6.9359, + "step": 1574 + }, + { + "epoch": 0.009366971167570654, + "grad_norm": 3.466169595718384, + "learning_rate": 4.998919001317966e-05, + "loss": 6.979, + "step": 1575 + }, + { + "epoch": 0.00937291845085165, + "grad_norm": 3.3481826782226562, + "learning_rate": 4.998917627411943e-05, + "loss": 6.7749, + "step": 1576 + }, + { + "epoch": 0.009378865734132649, + "grad_norm": 2.425971031188965, + "learning_rate": 4.9989162526335745e-05, + "loss": 7.0127, + "step": 1577 + }, + { + "epoch": 0.009384813017413646, + "grad_norm": 2.8379313945770264, + "learning_rate": 4.9989148769828595e-05, + "loss": 6.5782, + "step": 1578 + }, + { + "epoch": 0.009390760300694643, + "grad_norm": 3.0456466674804688, + "learning_rate": 4.9989135004597994e-05, + "loss": 6.9832, + "step": 1579 + }, + { + "epoch": 0.00939670758397564, + "grad_norm": 2.690138101577759, + "learning_rate": 4.9989121230643944e-05, + "loss": 7.0079, + "step": 1580 + }, + { + "epoch": 0.009402654867256638, + "grad_norm": 3.683105945587158, + "learning_rate": 4.9989107447966444e-05, + "loss": 7.2734, + "step": 1581 + }, + { + "epoch": 0.009408602150537635, + "grad_norm": 2.3310985565185547, + "learning_rate": 4.9989093656565513e-05, + "loss": 7.2388, + "step": 1582 + }, + { + "epoch": 0.009414549433818632, + "grad_norm": 2.353322982788086, + "learning_rate": 4.998907985644115e-05, + "loss": 7.0612, + "step": 1583 + }, + { + "epoch": 0.009420496717099628, + "grad_norm": 2.8458571434020996, + "learning_rate": 4.9989066047593344e-05, + "loss": 7.3093, + "step": 1584 + }, + { + "epoch": 0.009426444000380627, + "grad_norm": 2.3322811126708984, + "learning_rate": 4.9989052230022125e-05, + "loss": 6.983, + "step": 1585 + }, + { + "epoch": 0.009432391283661624, + "grad_norm": 2.7431764602661133, + "learning_rate": 4.998903840372748e-05, + "loss": 6.9694, + "step": 1586 + }, + { + "epoch": 0.00943833856694262, + "grad_norm": 2.7704508304595947, + "learning_rate": 4.998902456870942e-05, + "loss": 6.7727, + "step": 1587 + }, + { + "epoch": 0.009444285850223617, + "grad_norm": 2.4920814037323, + "learning_rate": 4.998901072496796e-05, + "loss": 7.0612, + "step": 1588 + }, + { + "epoch": 0.009450233133504616, + "grad_norm": 2.5911498069763184, + "learning_rate": 4.998899687250308e-05, + "loss": 6.8774, + "step": 1589 + }, + { + "epoch": 0.009456180416785613, + "grad_norm": 2.7269680500030518, + "learning_rate": 4.998898301131481e-05, + "loss": 7.0782, + "step": 1590 + }, + { + "epoch": 0.00946212770006661, + "grad_norm": 2.9707436561584473, + "learning_rate": 4.998896914140314e-05, + "loss": 7.307, + "step": 1591 + }, + { + "epoch": 0.009468074983347606, + "grad_norm": 3.064683675765991, + "learning_rate": 4.998895526276808e-05, + "loss": 7.3708, + "step": 1592 + }, + { + "epoch": 0.009474022266628603, + "grad_norm": 2.4465317726135254, + "learning_rate": 4.998894137540963e-05, + "loss": 7.0085, + "step": 1593 + }, + { + "epoch": 0.009479969549909602, + "grad_norm": 3.3061211109161377, + "learning_rate": 4.99889274793278e-05, + "loss": 6.8353, + "step": 1594 + }, + { + "epoch": 0.009485916833190599, + "grad_norm": 3.283397912979126, + "learning_rate": 4.9988913574522594e-05, + "loss": 6.6848, + "step": 1595 + }, + { + "epoch": 0.009491864116471595, + "grad_norm": 2.770745277404785, + "learning_rate": 4.9988899660994014e-05, + "loss": 7.1742, + "step": 1596 + }, + { + "epoch": 0.009497811399752592, + "grad_norm": 2.7975432872772217, + "learning_rate": 4.998888573874207e-05, + "loss": 6.7329, + "step": 1597 + }, + { + "epoch": 0.00950375868303359, + "grad_norm": 2.545919418334961, + "learning_rate": 4.998887180776677e-05, + "loss": 6.7203, + "step": 1598 + }, + { + "epoch": 0.009509705966314588, + "grad_norm": 2.7961528301239014, + "learning_rate": 4.99888578680681e-05, + "loss": 7.384, + "step": 1599 + }, + { + "epoch": 0.009515653249595584, + "grad_norm": 2.570570230484009, + "learning_rate": 4.9988843919646096e-05, + "loss": 7.0246, + "step": 1600 + }, + { + "epoch": 0.009521600532876581, + "grad_norm": 2.5365843772888184, + "learning_rate": 4.9988829962500734e-05, + "loss": 6.8801, + "step": 1601 + }, + { + "epoch": 0.00952754781615758, + "grad_norm": 2.4713737964630127, + "learning_rate": 4.998881599663203e-05, + "loss": 7.1974, + "step": 1602 + }, + { + "epoch": 0.009533495099438577, + "grad_norm": 2.5286331176757812, + "learning_rate": 4.998880202203999e-05, + "loss": 7.26, + "step": 1603 + }, + { + "epoch": 0.009539442382719573, + "grad_norm": 2.2333719730377197, + "learning_rate": 4.998878803872461e-05, + "loss": 7.3254, + "step": 1604 + }, + { + "epoch": 0.00954538966600057, + "grad_norm": 2.544095277786255, + "learning_rate": 4.9988774046685915e-05, + "loss": 7.407, + "step": 1605 + }, + { + "epoch": 0.009551336949281569, + "grad_norm": 3.057140588760376, + "learning_rate": 4.9988760045923886e-05, + "loss": 6.5303, + "step": 1606 + }, + { + "epoch": 0.009557284232562566, + "grad_norm": 3.0190670490264893, + "learning_rate": 4.998874603643854e-05, + "loss": 6.3276, + "step": 1607 + }, + { + "epoch": 0.009563231515843562, + "grad_norm": 2.208249568939209, + "learning_rate": 4.998873201822989e-05, + "loss": 6.856, + "step": 1608 + }, + { + "epoch": 0.00956917879912456, + "grad_norm": 2.3519229888916016, + "learning_rate": 4.998871799129793e-05, + "loss": 6.9854, + "step": 1609 + }, + { + "epoch": 0.009575126082405558, + "grad_norm": 2.604816198348999, + "learning_rate": 4.9988703955642655e-05, + "loss": 7.3127, + "step": 1610 + }, + { + "epoch": 0.009581073365686555, + "grad_norm": 2.320030927658081, + "learning_rate": 4.9988689911264094e-05, + "loss": 7.216, + "step": 1611 + }, + { + "epoch": 0.009587020648967551, + "grad_norm": 2.8475282192230225, + "learning_rate": 4.998867585816224e-05, + "loss": 6.6743, + "step": 1612 + }, + { + "epoch": 0.009592967932248548, + "grad_norm": 2.518707036972046, + "learning_rate": 4.998866179633709e-05, + "loss": 7.0257, + "step": 1613 + }, + { + "epoch": 0.009598915215529547, + "grad_norm": 2.7348618507385254, + "learning_rate": 4.998864772578866e-05, + "loss": 7.1933, + "step": 1614 + }, + { + "epoch": 0.009604862498810544, + "grad_norm": 2.5701184272766113, + "learning_rate": 4.9988633646516946e-05, + "loss": 7.1071, + "step": 1615 + }, + { + "epoch": 0.00961080978209154, + "grad_norm": 2.916544198989868, + "learning_rate": 4.998861955852197e-05, + "loss": 7.1331, + "step": 1616 + }, + { + "epoch": 0.009616757065372537, + "grad_norm": 2.390934944152832, + "learning_rate": 4.998860546180371e-05, + "loss": 7.3252, + "step": 1617 + }, + { + "epoch": 0.009622704348653536, + "grad_norm": 2.6720097064971924, + "learning_rate": 4.998859135636219e-05, + "loss": 7.0105, + "step": 1618 + }, + { + "epoch": 0.009628651631934533, + "grad_norm": 2.3859329223632812, + "learning_rate": 4.998857724219742e-05, + "loss": 7.023, + "step": 1619 + }, + { + "epoch": 0.00963459891521553, + "grad_norm": 2.9713187217712402, + "learning_rate": 4.998856311930939e-05, + "loss": 7.0338, + "step": 1620 + }, + { + "epoch": 0.009640546198496526, + "grad_norm": 2.33858060836792, + "learning_rate": 4.998854898769811e-05, + "loss": 7.0103, + "step": 1621 + }, + { + "epoch": 0.009646493481777523, + "grad_norm": 2.8897042274475098, + "learning_rate": 4.9988534847363585e-05, + "loss": 7.1225, + "step": 1622 + }, + { + "epoch": 0.009652440765058522, + "grad_norm": 2.354513645172119, + "learning_rate": 4.9988520698305826e-05, + "loss": 6.9272, + "step": 1623 + }, + { + "epoch": 0.009658388048339519, + "grad_norm": 2.5571863651275635, + "learning_rate": 4.9988506540524826e-05, + "loss": 6.3418, + "step": 1624 + }, + { + "epoch": 0.009664335331620515, + "grad_norm": 2.342381238937378, + "learning_rate": 4.99884923740206e-05, + "loss": 6.4265, + "step": 1625 + }, + { + "epoch": 0.009670282614901512, + "grad_norm": 2.5594370365142822, + "learning_rate": 4.998847819879315e-05, + "loss": 6.9801, + "step": 1626 + }, + { + "epoch": 0.00967622989818251, + "grad_norm": 3.6932148933410645, + "learning_rate": 4.9988464014842476e-05, + "loss": 7.0231, + "step": 1627 + }, + { + "epoch": 0.009682177181463508, + "grad_norm": 2.713508367538452, + "learning_rate": 4.998844982216859e-05, + "loss": 6.9041, + "step": 1628 + }, + { + "epoch": 0.009688124464744504, + "grad_norm": 2.703103542327881, + "learning_rate": 4.99884356207715e-05, + "loss": 6.9272, + "step": 1629 + }, + { + "epoch": 0.009694071748025501, + "grad_norm": 3.228708267211914, + "learning_rate": 4.9988421410651197e-05, + "loss": 6.9242, + "step": 1630 + }, + { + "epoch": 0.0097000190313065, + "grad_norm": 3.3407063484191895, + "learning_rate": 4.9988407191807694e-05, + "loss": 6.8871, + "step": 1631 + }, + { + "epoch": 0.009705966314587497, + "grad_norm": 2.3833165168762207, + "learning_rate": 4.9988392964241005e-05, + "loss": 6.9667, + "step": 1632 + }, + { + "epoch": 0.009711913597868493, + "grad_norm": 3.607023239135742, + "learning_rate": 4.9988378727951123e-05, + "loss": 6.93, + "step": 1633 + }, + { + "epoch": 0.00971786088114949, + "grad_norm": 3.797107219696045, + "learning_rate": 4.9988364482938056e-05, + "loss": 6.8115, + "step": 1634 + }, + { + "epoch": 0.009723808164430489, + "grad_norm": 2.5586941242218018, + "learning_rate": 4.998835022920181e-05, + "loss": 6.7322, + "step": 1635 + }, + { + "epoch": 0.009729755447711486, + "grad_norm": 2.377680540084839, + "learning_rate": 4.9988335966742385e-05, + "loss": 6.7127, + "step": 1636 + }, + { + "epoch": 0.009735702730992482, + "grad_norm": 2.510584592819214, + "learning_rate": 4.998832169555979e-05, + "loss": 6.836, + "step": 1637 + }, + { + "epoch": 0.00974165001427348, + "grad_norm": 2.8817014694213867, + "learning_rate": 4.9988307415654025e-05, + "loss": 6.7812, + "step": 1638 + }, + { + "epoch": 0.009747597297554478, + "grad_norm": 2.878535509109497, + "learning_rate": 4.998829312702511e-05, + "loss": 6.7852, + "step": 1639 + }, + { + "epoch": 0.009753544580835475, + "grad_norm": 2.5870323181152344, + "learning_rate": 4.998827882967304e-05, + "loss": 6.8569, + "step": 1640 + }, + { + "epoch": 0.009759491864116471, + "grad_norm": 2.7275760173797607, + "learning_rate": 4.998826452359782e-05, + "loss": 6.8304, + "step": 1641 + }, + { + "epoch": 0.009765439147397468, + "grad_norm": 2.24550461769104, + "learning_rate": 4.998825020879945e-05, + "loss": 6.7609, + "step": 1642 + }, + { + "epoch": 0.009771386430678467, + "grad_norm": 2.2101621627807617, + "learning_rate": 4.9988235885277934e-05, + "loss": 6.7548, + "step": 1643 + }, + { + "epoch": 0.009777333713959464, + "grad_norm": 2.289870023727417, + "learning_rate": 4.9988221553033294e-05, + "loss": 6.8899, + "step": 1644 + }, + { + "epoch": 0.00978328099724046, + "grad_norm": 2.6337740421295166, + "learning_rate": 4.9988207212065516e-05, + "loss": 6.7605, + "step": 1645 + }, + { + "epoch": 0.009789228280521457, + "grad_norm": 2.442605972290039, + "learning_rate": 4.998819286237462e-05, + "loss": 6.6299, + "step": 1646 + }, + { + "epoch": 0.009795175563802456, + "grad_norm": 2.6570451259613037, + "learning_rate": 4.9988178503960606e-05, + "loss": 6.6933, + "step": 1647 + }, + { + "epoch": 0.009801122847083453, + "grad_norm": 2.597043752670288, + "learning_rate": 4.9988164136823467e-05, + "loss": 6.7667, + "step": 1648 + }, + { + "epoch": 0.00980707013036445, + "grad_norm": 3.2576608657836914, + "learning_rate": 4.998814976096323e-05, + "loss": 7.1774, + "step": 1649 + }, + { + "epoch": 0.009813017413645446, + "grad_norm": 3.110119342803955, + "learning_rate": 4.998813537637988e-05, + "loss": 7.2139, + "step": 1650 + }, + { + "epoch": 0.009818964696926445, + "grad_norm": 3.038086414337158, + "learning_rate": 4.998812098307343e-05, + "loss": 7.2752, + "step": 1651 + }, + { + "epoch": 0.009824911980207442, + "grad_norm": 2.965916872024536, + "learning_rate": 4.998810658104389e-05, + "loss": 7.1151, + "step": 1652 + }, + { + "epoch": 0.009830859263488438, + "grad_norm": 3.011476755142212, + "learning_rate": 4.998809217029126e-05, + "loss": 7.1335, + "step": 1653 + }, + { + "epoch": 0.009836806546769435, + "grad_norm": 3.8196349143981934, + "learning_rate": 4.9988077750815534e-05, + "loss": 7.0865, + "step": 1654 + }, + { + "epoch": 0.009842753830050432, + "grad_norm": 3.2577872276306152, + "learning_rate": 4.998806332261674e-05, + "loss": 7.4285, + "step": 1655 + }, + { + "epoch": 0.00984870111333143, + "grad_norm": 2.847039222717285, + "learning_rate": 4.998804888569487e-05, + "loss": 7.3251, + "step": 1656 + }, + { + "epoch": 0.009854648396612428, + "grad_norm": 3.4066355228424072, + "learning_rate": 4.998803444004992e-05, + "loss": 7.3137, + "step": 1657 + }, + { + "epoch": 0.009860595679893424, + "grad_norm": 3.6774044036865234, + "learning_rate": 4.998801998568192e-05, + "loss": 7.0772, + "step": 1658 + }, + { + "epoch": 0.009866542963174421, + "grad_norm": 3.1404600143432617, + "learning_rate": 4.998800552259085e-05, + "loss": 7.1143, + "step": 1659 + }, + { + "epoch": 0.00987249024645542, + "grad_norm": 3.6337625980377197, + "learning_rate": 4.998799105077674e-05, + "loss": 7.1296, + "step": 1660 + }, + { + "epoch": 0.009878437529736417, + "grad_norm": 4.551114082336426, + "learning_rate": 4.9987976570239566e-05, + "loss": 7.1343, + "step": 1661 + }, + { + "epoch": 0.009884384813017413, + "grad_norm": 3.2305374145507812, + "learning_rate": 4.998796208097935e-05, + "loss": 7.0852, + "step": 1662 + }, + { + "epoch": 0.00989033209629841, + "grad_norm": 2.5174615383148193, + "learning_rate": 4.99879475829961e-05, + "loss": 7.2315, + "step": 1663 + }, + { + "epoch": 0.009896279379579409, + "grad_norm": 3.623525381088257, + "learning_rate": 4.9987933076289804e-05, + "loss": 7.4222, + "step": 1664 + }, + { + "epoch": 0.009902226662860406, + "grad_norm": 4.217465877532959, + "learning_rate": 4.998791856086049e-05, + "loss": 7.4003, + "step": 1665 + }, + { + "epoch": 0.009908173946141402, + "grad_norm": 2.42301344871521, + "learning_rate": 4.998790403670815e-05, + "loss": 7.3295, + "step": 1666 + }, + { + "epoch": 0.0099141212294224, + "grad_norm": 2.3003029823303223, + "learning_rate": 4.998788950383279e-05, + "loss": 7.2072, + "step": 1667 + }, + { + "epoch": 0.009920068512703398, + "grad_norm": 3.3792307376861572, + "learning_rate": 4.9987874962234414e-05, + "loss": 7.2882, + "step": 1668 + }, + { + "epoch": 0.009926015795984395, + "grad_norm": 3.42130184173584, + "learning_rate": 4.998786041191303e-05, + "loss": 7.1231, + "step": 1669 + }, + { + "epoch": 0.009931963079265391, + "grad_norm": 3.496676445007324, + "learning_rate": 4.9987845852868644e-05, + "loss": 7.2535, + "step": 1670 + }, + { + "epoch": 0.009937910362546388, + "grad_norm": 2.695780038833618, + "learning_rate": 4.9987831285101255e-05, + "loss": 7.3784, + "step": 1671 + }, + { + "epoch": 0.009943857645827387, + "grad_norm": 2.2745561599731445, + "learning_rate": 4.998781670861088e-05, + "loss": 7.1184, + "step": 1672 + }, + { + "epoch": 0.009949804929108384, + "grad_norm": 3.8487844467163086, + "learning_rate": 4.99878021233975e-05, + "loss": 7.277, + "step": 1673 + }, + { + "epoch": 0.00995575221238938, + "grad_norm": 2.6628305912017822, + "learning_rate": 4.998778752946115e-05, + "loss": 6.8204, + "step": 1674 + }, + { + "epoch": 0.009961699495670377, + "grad_norm": 3.6330301761627197, + "learning_rate": 4.998777292680182e-05, + "loss": 7.3003, + "step": 1675 + }, + { + "epoch": 0.009967646778951376, + "grad_norm": 2.644237995147705, + "learning_rate": 4.998775831541952e-05, + "loss": 7.1492, + "step": 1676 + }, + { + "epoch": 0.009973594062232373, + "grad_norm": 2.895193099975586, + "learning_rate": 4.998774369531424e-05, + "loss": 7.3986, + "step": 1677 + }, + { + "epoch": 0.00997954134551337, + "grad_norm": 3.2180328369140625, + "learning_rate": 4.998772906648601e-05, + "loss": 7.1085, + "step": 1678 + }, + { + "epoch": 0.009985488628794366, + "grad_norm": 3.5874838829040527, + "learning_rate": 4.9987714428934815e-05, + "loss": 6.9554, + "step": 1679 + }, + { + "epoch": 0.009991435912075365, + "grad_norm": 2.419516086578369, + "learning_rate": 4.9987699782660666e-05, + "loss": 6.6222, + "step": 1680 + }, + { + "epoch": 0.009997383195356362, + "grad_norm": 2.715808153152466, + "learning_rate": 4.9987685127663574e-05, + "loss": 6.8417, + "step": 1681 + }, + { + "epoch": 0.010003330478637358, + "grad_norm": 2.2847111225128174, + "learning_rate": 4.9987670463943534e-05, + "loss": 7.1649, + "step": 1682 + }, + { + "epoch": 0.010009277761918355, + "grad_norm": 2.402684450149536, + "learning_rate": 4.998765579150056e-05, + "loss": 7.6113, + "step": 1683 + }, + { + "epoch": 0.010015225045199352, + "grad_norm": 2.54388689994812, + "learning_rate": 4.998764111033465e-05, + "loss": 7.1261, + "step": 1684 + }, + { + "epoch": 0.01002117232848035, + "grad_norm": 2.8077542781829834, + "learning_rate": 4.9987626420445823e-05, + "loss": 7.1349, + "step": 1685 + }, + { + "epoch": 0.010027119611761347, + "grad_norm": 2.228707790374756, + "learning_rate": 4.9987611721834063e-05, + "loss": 7.1123, + "step": 1686 + }, + { + "epoch": 0.010033066895042344, + "grad_norm": 2.648607015609741, + "learning_rate": 4.998759701449939e-05, + "loss": 7.0263, + "step": 1687 + }, + { + "epoch": 0.010039014178323341, + "grad_norm": 3.0278162956237793, + "learning_rate": 4.99875822984418e-05, + "loss": 6.6463, + "step": 1688 + }, + { + "epoch": 0.01004496146160434, + "grad_norm": 3.1550052165985107, + "learning_rate": 4.998756757366131e-05, + "loss": 6.8773, + "step": 1689 + }, + { + "epoch": 0.010050908744885336, + "grad_norm": 3.3911843299865723, + "learning_rate": 4.998755284015792e-05, + "loss": 7.5045, + "step": 1690 + }, + { + "epoch": 0.010056856028166333, + "grad_norm": 2.668861150741577, + "learning_rate": 4.998753809793162e-05, + "loss": 7.5545, + "step": 1691 + }, + { + "epoch": 0.01006280331144733, + "grad_norm": 2.182792901992798, + "learning_rate": 4.998752334698244e-05, + "loss": 7.2315, + "step": 1692 + }, + { + "epoch": 0.010068750594728329, + "grad_norm": 2.981476068496704, + "learning_rate": 4.998750858731037e-05, + "loss": 7.3455, + "step": 1693 + }, + { + "epoch": 0.010074697878009325, + "grad_norm": 3.1855525970458984, + "learning_rate": 4.998749381891542e-05, + "loss": 7.3408, + "step": 1694 + }, + { + "epoch": 0.010080645161290322, + "grad_norm": 2.5677361488342285, + "learning_rate": 4.998747904179759e-05, + "loss": 6.7591, + "step": 1695 + }, + { + "epoch": 0.010086592444571319, + "grad_norm": 2.7397539615631104, + "learning_rate": 4.9987464255956894e-05, + "loss": 7.3976, + "step": 1696 + }, + { + "epoch": 0.010092539727852318, + "grad_norm": 2.1141586303710938, + "learning_rate": 4.998744946139333e-05, + "loss": 7.4287, + "step": 1697 + }, + { + "epoch": 0.010098487011133314, + "grad_norm": 2.1999096870422363, + "learning_rate": 4.998743465810691e-05, + "loss": 7.4804, + "step": 1698 + }, + { + "epoch": 0.010104434294414311, + "grad_norm": 2.4150960445404053, + "learning_rate": 4.9987419846097634e-05, + "loss": 7.1743, + "step": 1699 + }, + { + "epoch": 0.010110381577695308, + "grad_norm": 2.564270496368408, + "learning_rate": 4.998740502536551e-05, + "loss": 7.262, + "step": 1700 + }, + { + "epoch": 0.010116328860976307, + "grad_norm": 3.045964241027832, + "learning_rate": 4.9987390195910536e-05, + "loss": 7.0778, + "step": 1701 + }, + { + "epoch": 0.010122276144257304, + "grad_norm": 3.2720210552215576, + "learning_rate": 4.998737535773272e-05, + "loss": 7.2188, + "step": 1702 + }, + { + "epoch": 0.0101282234275383, + "grad_norm": 2.54496693611145, + "learning_rate": 4.998736051083207e-05, + "loss": 6.9985, + "step": 1703 + }, + { + "epoch": 0.010134170710819297, + "grad_norm": 3.6252541542053223, + "learning_rate": 4.998734565520859e-05, + "loss": 7.3502, + "step": 1704 + }, + { + "epoch": 0.010140117994100296, + "grad_norm": 3.468963146209717, + "learning_rate": 4.99873307908623e-05, + "loss": 6.9642, + "step": 1705 + }, + { + "epoch": 0.010146065277381293, + "grad_norm": 2.8778045177459717, + "learning_rate": 4.9987315917793174e-05, + "loss": 6.8675, + "step": 1706 + }, + { + "epoch": 0.01015201256066229, + "grad_norm": 2.4492053985595703, + "learning_rate": 4.9987301036001236e-05, + "loss": 7.3484, + "step": 1707 + }, + { + "epoch": 0.010157959843943286, + "grad_norm": 2.5170838832855225, + "learning_rate": 4.99872861454865e-05, + "loss": 7.6004, + "step": 1708 + }, + { + "epoch": 0.010163907127224285, + "grad_norm": 2.3539648056030273, + "learning_rate": 4.998727124624895e-05, + "loss": 7.3304, + "step": 1709 + }, + { + "epoch": 0.010169854410505282, + "grad_norm": 2.6097705364227295, + "learning_rate": 4.998725633828861e-05, + "loss": 7.3227, + "step": 1710 + }, + { + "epoch": 0.010175801693786278, + "grad_norm": 2.5909392833709717, + "learning_rate": 4.9987241421605466e-05, + "loss": 7.3797, + "step": 1711 + }, + { + "epoch": 0.010181748977067275, + "grad_norm": 3.143157958984375, + "learning_rate": 4.998722649619954e-05, + "loss": 7.1236, + "step": 1712 + }, + { + "epoch": 0.010187696260348274, + "grad_norm": 2.0621843338012695, + "learning_rate": 4.9987211562070835e-05, + "loss": 7.5322, + "step": 1713 + }, + { + "epoch": 0.01019364354362927, + "grad_norm": 1.7781084775924683, + "learning_rate": 4.9987196619219354e-05, + "loss": 7.428, + "step": 1714 + }, + { + "epoch": 0.010199590826910267, + "grad_norm": 2.3108980655670166, + "learning_rate": 4.9987181667645094e-05, + "loss": 7.3814, + "step": 1715 + }, + { + "epoch": 0.010205538110191264, + "grad_norm": 2.5184621810913086, + "learning_rate": 4.998716670734807e-05, + "loss": 7.374, + "step": 1716 + }, + { + "epoch": 0.010211485393472261, + "grad_norm": 1.9185826778411865, + "learning_rate": 4.9987151738328284e-05, + "loss": 7.3352, + "step": 1717 + }, + { + "epoch": 0.01021743267675326, + "grad_norm": 2.794224262237549, + "learning_rate": 4.998713676058574e-05, + "loss": 7.0293, + "step": 1718 + }, + { + "epoch": 0.010223379960034256, + "grad_norm": 3.601804733276367, + "learning_rate": 4.998712177412045e-05, + "loss": 7.0277, + "step": 1719 + }, + { + "epoch": 0.010229327243315253, + "grad_norm": 3.3258707523345947, + "learning_rate": 4.998710677893241e-05, + "loss": 6.9478, + "step": 1720 + }, + { + "epoch": 0.01023527452659625, + "grad_norm": 3.147439956665039, + "learning_rate": 4.9987091775021625e-05, + "loss": 6.7295, + "step": 1721 + }, + { + "epoch": 0.010241221809877249, + "grad_norm": 2.7821006774902344, + "learning_rate": 4.998707676238811e-05, + "loss": 6.7587, + "step": 1722 + }, + { + "epoch": 0.010247169093158245, + "grad_norm": 2.580597400665283, + "learning_rate": 4.998706174103186e-05, + "loss": 6.9091, + "step": 1723 + }, + { + "epoch": 0.010253116376439242, + "grad_norm": 2.5501208305358887, + "learning_rate": 4.998704671095289e-05, + "loss": 7.3262, + "step": 1724 + }, + { + "epoch": 0.010259063659720239, + "grad_norm": 2.5460124015808105, + "learning_rate": 4.99870316721512e-05, + "loss": 7.278, + "step": 1725 + }, + { + "epoch": 0.010265010943001238, + "grad_norm": 2.0253796577453613, + "learning_rate": 4.998701662462679e-05, + "loss": 7.1757, + "step": 1726 + }, + { + "epoch": 0.010270958226282234, + "grad_norm": 2.3127388954162598, + "learning_rate": 4.998700156837968e-05, + "loss": 7.1057, + "step": 1727 + }, + { + "epoch": 0.010276905509563231, + "grad_norm": 2.931878089904785, + "learning_rate": 4.998698650340986e-05, + "loss": 6.9993, + "step": 1728 + }, + { + "epoch": 0.010282852792844228, + "grad_norm": 3.239272356033325, + "learning_rate": 4.998697142971734e-05, + "loss": 6.7754, + "step": 1729 + }, + { + "epoch": 0.010288800076125227, + "grad_norm": 2.388212203979492, + "learning_rate": 4.998695634730213e-05, + "loss": 7.2794, + "step": 1730 + }, + { + "epoch": 0.010294747359406223, + "grad_norm": 2.7766799926757812, + "learning_rate": 4.998694125616423e-05, + "loss": 7.4636, + "step": 1731 + }, + { + "epoch": 0.01030069464268722, + "grad_norm": 2.543757915496826, + "learning_rate": 4.9986926156303646e-05, + "loss": 6.8801, + "step": 1732 + }, + { + "epoch": 0.010306641925968217, + "grad_norm": 1.8907097578048706, + "learning_rate": 4.9986911047720384e-05, + "loss": 7.0353, + "step": 1733 + }, + { + "epoch": 0.010312589209249216, + "grad_norm": 1.9585598707199097, + "learning_rate": 4.9986895930414444e-05, + "loss": 7.0469, + "step": 1734 + }, + { + "epoch": 0.010318536492530212, + "grad_norm": 2.5191497802734375, + "learning_rate": 4.998688080438585e-05, + "loss": 7.1469, + "step": 1735 + }, + { + "epoch": 0.01032448377581121, + "grad_norm": 3.5709545612335205, + "learning_rate": 4.998686566963459e-05, + "loss": 7.0499, + "step": 1736 + }, + { + "epoch": 0.010330431059092206, + "grad_norm": 2.3778624534606934, + "learning_rate": 4.998685052616067e-05, + "loss": 7.5897, + "step": 1737 + }, + { + "epoch": 0.010336378342373205, + "grad_norm": 2.0795674324035645, + "learning_rate": 4.9986835373964094e-05, + "loss": 6.8778, + "step": 1738 + }, + { + "epoch": 0.010342325625654201, + "grad_norm": 2.7674901485443115, + "learning_rate": 4.9986820213044875e-05, + "loss": 6.4428, + "step": 1739 + }, + { + "epoch": 0.010348272908935198, + "grad_norm": 2.7203595638275146, + "learning_rate": 4.998680504340302e-05, + "loss": 7.4668, + "step": 1740 + }, + { + "epoch": 0.010354220192216195, + "grad_norm": 2.840240955352783, + "learning_rate": 4.998678986503853e-05, + "loss": 7.2219, + "step": 1741 + }, + { + "epoch": 0.010360167475497194, + "grad_norm": 2.7803452014923096, + "learning_rate": 4.9986774677951404e-05, + "loss": 6.5674, + "step": 1742 + }, + { + "epoch": 0.01036611475877819, + "grad_norm": 2.467574119567871, + "learning_rate": 4.998675948214165e-05, + "loss": 6.9621, + "step": 1743 + }, + { + "epoch": 0.010372062042059187, + "grad_norm": 2.1437904834747314, + "learning_rate": 4.998674427760929e-05, + "loss": 7.1564, + "step": 1744 + }, + { + "epoch": 0.010378009325340184, + "grad_norm": 2.504685163497925, + "learning_rate": 4.9986729064354304e-05, + "loss": 6.8836, + "step": 1745 + }, + { + "epoch": 0.010383956608621183, + "grad_norm": 2.401296615600586, + "learning_rate": 4.998671384237671e-05, + "loss": 7.2906, + "step": 1746 + }, + { + "epoch": 0.01038990389190218, + "grad_norm": 2.233701705932617, + "learning_rate": 4.9986698611676516e-05, + "loss": 6.6854, + "step": 1747 + }, + { + "epoch": 0.010395851175183176, + "grad_norm": 2.9597983360290527, + "learning_rate": 4.998668337225373e-05, + "loss": 6.8859, + "step": 1748 + }, + { + "epoch": 0.010401798458464173, + "grad_norm": 3.2164804935455322, + "learning_rate": 4.998666812410834e-05, + "loss": 6.8255, + "step": 1749 + }, + { + "epoch": 0.01040774574174517, + "grad_norm": 3.010002374649048, + "learning_rate": 4.9986652867240364e-05, + "loss": 6.7092, + "step": 1750 + }, + { + "epoch": 0.010413693025026169, + "grad_norm": 2.8442068099975586, + "learning_rate": 4.998663760164981e-05, + "loss": 6.7231, + "step": 1751 + }, + { + "epoch": 0.010419640308307165, + "grad_norm": 3.127922773361206, + "learning_rate": 4.9986622327336676e-05, + "loss": 6.6072, + "step": 1752 + }, + { + "epoch": 0.010425587591588162, + "grad_norm": 2.7306833267211914, + "learning_rate": 4.998660704430097e-05, + "loss": 6.696, + "step": 1753 + }, + { + "epoch": 0.010431534874869159, + "grad_norm": 2.9005799293518066, + "learning_rate": 4.99865917525427e-05, + "loss": 6.6598, + "step": 1754 + }, + { + "epoch": 0.010437482158150158, + "grad_norm": 3.17934513092041, + "learning_rate": 4.9986576452061865e-05, + "loss": 6.5887, + "step": 1755 + }, + { + "epoch": 0.010443429441431154, + "grad_norm": 2.9390244483947754, + "learning_rate": 4.9986561142858476e-05, + "loss": 6.5375, + "step": 1756 + }, + { + "epoch": 0.010449376724712151, + "grad_norm": 2.5547196865081787, + "learning_rate": 4.998654582493254e-05, + "loss": 6.7484, + "step": 1757 + }, + { + "epoch": 0.010455324007993148, + "grad_norm": 2.9969568252563477, + "learning_rate": 4.9986530498284054e-05, + "loss": 6.6496, + "step": 1758 + }, + { + "epoch": 0.010461271291274147, + "grad_norm": 2.843932867050171, + "learning_rate": 4.998651516291303e-05, + "loss": 6.5713, + "step": 1759 + }, + { + "epoch": 0.010467218574555143, + "grad_norm": 2.9114811420440674, + "learning_rate": 4.9986499818819476e-05, + "loss": 7.5248, + "step": 1760 + }, + { + "epoch": 0.01047316585783614, + "grad_norm": 3.0292229652404785, + "learning_rate": 4.998648446600339e-05, + "loss": 7.2346, + "step": 1761 + }, + { + "epoch": 0.010479113141117137, + "grad_norm": 2.553088426589966, + "learning_rate": 4.998646910446478e-05, + "loss": 7.1531, + "step": 1762 + }, + { + "epoch": 0.010485060424398136, + "grad_norm": 2.9838356971740723, + "learning_rate": 4.998645373420365e-05, + "loss": 6.6561, + "step": 1763 + }, + { + "epoch": 0.010491007707679132, + "grad_norm": 2.8948864936828613, + "learning_rate": 4.9986438355220014e-05, + "loss": 6.463, + "step": 1764 + }, + { + "epoch": 0.01049695499096013, + "grad_norm": 2.805084228515625, + "learning_rate": 4.9986422967513856e-05, + "loss": 6.701, + "step": 1765 + }, + { + "epoch": 0.010502902274241126, + "grad_norm": 2.748077869415283, + "learning_rate": 4.998640757108522e-05, + "loss": 7.3223, + "step": 1766 + }, + { + "epoch": 0.010508849557522125, + "grad_norm": 3.0048258304595947, + "learning_rate": 4.998639216593406e-05, + "loss": 7.2582, + "step": 1767 + }, + { + "epoch": 0.010514796840803121, + "grad_norm": 2.538522958755493, + "learning_rate": 4.998637675206043e-05, + "loss": 7.1208, + "step": 1768 + }, + { + "epoch": 0.010520744124084118, + "grad_norm": 2.2091188430786133, + "learning_rate": 4.99863613294643e-05, + "loss": 7.0577, + "step": 1769 + }, + { + "epoch": 0.010526691407365115, + "grad_norm": 2.8454909324645996, + "learning_rate": 4.998634589814569e-05, + "loss": 7.1296, + "step": 1770 + }, + { + "epoch": 0.010532638690646114, + "grad_norm": 3.4139351844787598, + "learning_rate": 4.998633045810461e-05, + "loss": 6.9565, + "step": 1771 + }, + { + "epoch": 0.01053858597392711, + "grad_norm": 2.3192107677459717, + "learning_rate": 4.9986315009341066e-05, + "loss": 6.6027, + "step": 1772 + }, + { + "epoch": 0.010544533257208107, + "grad_norm": 2.309290647506714, + "learning_rate": 4.998629955185505e-05, + "loss": 7.0417, + "step": 1773 + }, + { + "epoch": 0.010550480540489104, + "grad_norm": 3.2046520709991455, + "learning_rate": 4.998628408564657e-05, + "loss": 7.0368, + "step": 1774 + }, + { + "epoch": 0.010556427823770103, + "grad_norm": 2.459064483642578, + "learning_rate": 4.9986268610715646e-05, + "loss": 7.2726, + "step": 1775 + }, + { + "epoch": 0.0105623751070511, + "grad_norm": 2.602522134780884, + "learning_rate": 4.998625312706227e-05, + "loss": 7.3377, + "step": 1776 + }, + { + "epoch": 0.010568322390332096, + "grad_norm": 3.9599175453186035, + "learning_rate": 4.998623763468645e-05, + "loss": 6.9146, + "step": 1777 + }, + { + "epoch": 0.010574269673613093, + "grad_norm": 3.312527894973755, + "learning_rate": 4.99862221335882e-05, + "loss": 6.7457, + "step": 1778 + }, + { + "epoch": 0.01058021695689409, + "grad_norm": 2.5287606716156006, + "learning_rate": 4.9986206623767506e-05, + "loss": 7.2651, + "step": 1779 + }, + { + "epoch": 0.010586164240175088, + "grad_norm": 2.4065616130828857, + "learning_rate": 4.99861911052244e-05, + "loss": 7.1135, + "step": 1780 + }, + { + "epoch": 0.010592111523456085, + "grad_norm": 2.321385383605957, + "learning_rate": 4.998617557795886e-05, + "loss": 7.1985, + "step": 1781 + }, + { + "epoch": 0.010598058806737082, + "grad_norm": 2.118995189666748, + "learning_rate": 4.9986160041970906e-05, + "loss": 7.2832, + "step": 1782 + }, + { + "epoch": 0.010604006090018079, + "grad_norm": 2.2536606788635254, + "learning_rate": 4.9986144497260544e-05, + "loss": 7.191, + "step": 1783 + }, + { + "epoch": 0.010609953373299078, + "grad_norm": 2.2956738471984863, + "learning_rate": 4.998612894382778e-05, + "loss": 7.0496, + "step": 1784 + }, + { + "epoch": 0.010615900656580074, + "grad_norm": 2.4258289337158203, + "learning_rate": 4.9986113381672614e-05, + "loss": 7.2767, + "step": 1785 + }, + { + "epoch": 0.010621847939861071, + "grad_norm": 2.4731507301330566, + "learning_rate": 4.998609781079505e-05, + "loss": 6.8805, + "step": 1786 + }, + { + "epoch": 0.010627795223142068, + "grad_norm": 2.3245391845703125, + "learning_rate": 4.9986082231195105e-05, + "loss": 6.8921, + "step": 1787 + }, + { + "epoch": 0.010633742506423067, + "grad_norm": 2.6239898204803467, + "learning_rate": 4.998606664287278e-05, + "loss": 6.9353, + "step": 1788 + }, + { + "epoch": 0.010639689789704063, + "grad_norm": 2.186162233352661, + "learning_rate": 4.9986051045828065e-05, + "loss": 6.8466, + "step": 1789 + }, + { + "epoch": 0.01064563707298506, + "grad_norm": 2.2362232208251953, + "learning_rate": 4.998603544006098e-05, + "loss": 6.82, + "step": 1790 + }, + { + "epoch": 0.010651584356266057, + "grad_norm": 2.2302427291870117, + "learning_rate": 4.998601982557153e-05, + "loss": 6.7034, + "step": 1791 + }, + { + "epoch": 0.010657531639547056, + "grad_norm": 2.0393195152282715, + "learning_rate": 4.998600420235972e-05, + "loss": 6.6646, + "step": 1792 + }, + { + "epoch": 0.010663478922828052, + "grad_norm": 1.976536512374878, + "learning_rate": 4.9985988570425556e-05, + "loss": 6.4994, + "step": 1793 + }, + { + "epoch": 0.01066942620610905, + "grad_norm": 2.4167046546936035, + "learning_rate": 4.998597292976904e-05, + "loss": 6.7849, + "step": 1794 + }, + { + "epoch": 0.010675373489390046, + "grad_norm": 2.3077776432037354, + "learning_rate": 4.998595728039018e-05, + "loss": 6.8356, + "step": 1795 + }, + { + "epoch": 0.010681320772671045, + "grad_norm": 2.5263309478759766, + "learning_rate": 4.998594162228898e-05, + "loss": 6.6351, + "step": 1796 + }, + { + "epoch": 0.010687268055952041, + "grad_norm": 2.153365135192871, + "learning_rate": 4.9985925955465443e-05, + "loss": 6.7911, + "step": 1797 + }, + { + "epoch": 0.010693215339233038, + "grad_norm": 3.3034393787384033, + "learning_rate": 4.998591027991958e-05, + "loss": 6.7589, + "step": 1798 + }, + { + "epoch": 0.010699162622514035, + "grad_norm": 2.2177388668060303, + "learning_rate": 4.998589459565139e-05, + "loss": 6.571, + "step": 1799 + }, + { + "epoch": 0.010705109905795034, + "grad_norm": 2.3165230751037598, + "learning_rate": 4.9985878902660886e-05, + "loss": 6.9124, + "step": 1800 + }, + { + "epoch": 0.01071105718907603, + "grad_norm": 2.270045757293701, + "learning_rate": 4.998586320094807e-05, + "loss": 6.4442, + "step": 1801 + }, + { + "epoch": 0.010717004472357027, + "grad_norm": 2.1198744773864746, + "learning_rate": 4.9985847490512945e-05, + "loss": 6.555, + "step": 1802 + }, + { + "epoch": 0.010722951755638024, + "grad_norm": 2.5428359508514404, + "learning_rate": 4.998583177135552e-05, + "loss": 6.8991, + "step": 1803 + }, + { + "epoch": 0.010728899038919023, + "grad_norm": 1.983817219734192, + "learning_rate": 4.99858160434758e-05, + "loss": 6.6428, + "step": 1804 + }, + { + "epoch": 0.01073484632220002, + "grad_norm": 2.2749712467193604, + "learning_rate": 4.998580030687379e-05, + "loss": 6.7294, + "step": 1805 + }, + { + "epoch": 0.010740793605481016, + "grad_norm": 1.914762258529663, + "learning_rate": 4.998578456154949e-05, + "loss": 7.0395, + "step": 1806 + }, + { + "epoch": 0.010746740888762013, + "grad_norm": 1.6850765943527222, + "learning_rate": 4.998576880750292e-05, + "loss": 6.862, + "step": 1807 + }, + { + "epoch": 0.010752688172043012, + "grad_norm": 2.2930233478546143, + "learning_rate": 4.9985753044734076e-05, + "loss": 6.8213, + "step": 1808 + }, + { + "epoch": 0.010758635455324008, + "grad_norm": 2.193464756011963, + "learning_rate": 4.998573727324295e-05, + "loss": 6.9303, + "step": 1809 + }, + { + "epoch": 0.010764582738605005, + "grad_norm": 2.2451658248901367, + "learning_rate": 4.9985721493029576e-05, + "loss": 6.8061, + "step": 1810 + }, + { + "epoch": 0.010770530021886002, + "grad_norm": 2.164214849472046, + "learning_rate": 4.998570570409394e-05, + "loss": 6.6485, + "step": 1811 + }, + { + "epoch": 0.010776477305166999, + "grad_norm": 2.3530375957489014, + "learning_rate": 4.9985689906436054e-05, + "loss": 6.6826, + "step": 1812 + }, + { + "epoch": 0.010782424588447997, + "grad_norm": 3.007641553878784, + "learning_rate": 4.998567410005591e-05, + "loss": 6.0781, + "step": 1813 + }, + { + "epoch": 0.010788371871728994, + "grad_norm": 2.500411033630371, + "learning_rate": 4.998565828495354e-05, + "loss": 7.0544, + "step": 1814 + }, + { + "epoch": 0.010794319155009991, + "grad_norm": 2.329221725463867, + "learning_rate": 4.998564246112893e-05, + "loss": 7.2505, + "step": 1815 + }, + { + "epoch": 0.010800266438290988, + "grad_norm": 2.05120849609375, + "learning_rate": 4.998562662858209e-05, + "loss": 7.3094, + "step": 1816 + }, + { + "epoch": 0.010806213721571986, + "grad_norm": 1.83049738407135, + "learning_rate": 4.9985610787313023e-05, + "loss": 6.7752, + "step": 1817 + }, + { + "epoch": 0.010812161004852983, + "grad_norm": 2.2754576206207275, + "learning_rate": 4.998559493732174e-05, + "loss": 6.9396, + "step": 1818 + }, + { + "epoch": 0.01081810828813398, + "grad_norm": 2.104849338531494, + "learning_rate": 4.998557907860825e-05, + "loss": 7.2624, + "step": 1819 + }, + { + "epoch": 0.010824055571414977, + "grad_norm": 3.152069568634033, + "learning_rate": 4.998556321117254e-05, + "loss": 6.6763, + "step": 1820 + }, + { + "epoch": 0.010830002854695975, + "grad_norm": 3.4046475887298584, + "learning_rate": 4.9985547335014636e-05, + "loss": 6.7145, + "step": 1821 + }, + { + "epoch": 0.010835950137976972, + "grad_norm": 1.9208084344863892, + "learning_rate": 4.9985531450134534e-05, + "loss": 6.8985, + "step": 1822 + }, + { + "epoch": 0.010841897421257969, + "grad_norm": 2.4949824810028076, + "learning_rate": 4.998551555653224e-05, + "loss": 6.8196, + "step": 1823 + }, + { + "epoch": 0.010847844704538966, + "grad_norm": 2.613175392150879, + "learning_rate": 4.998549965420776e-05, + "loss": 6.7918, + "step": 1824 + }, + { + "epoch": 0.010853791987819965, + "grad_norm": 2.3322529792785645, + "learning_rate": 4.9985483743161105e-05, + "loss": 6.6133, + "step": 1825 + }, + { + "epoch": 0.010859739271100961, + "grad_norm": 3.116680860519409, + "learning_rate": 4.998546782339227e-05, + "loss": 7.4026, + "step": 1826 + }, + { + "epoch": 0.010865686554381958, + "grad_norm": 2.673938274383545, + "learning_rate": 4.998545189490127e-05, + "loss": 6.9181, + "step": 1827 + }, + { + "epoch": 0.010871633837662955, + "grad_norm": 2.135727643966675, + "learning_rate": 4.998543595768811e-05, + "loss": 6.9514, + "step": 1828 + }, + { + "epoch": 0.010877581120943954, + "grad_norm": 2.241696357727051, + "learning_rate": 4.9985420011752784e-05, + "loss": 7.126, + "step": 1829 + }, + { + "epoch": 0.01088352840422495, + "grad_norm": 2.316342830657959, + "learning_rate": 4.9985404057095315e-05, + "loss": 6.9752, + "step": 1830 + }, + { + "epoch": 0.010889475687505947, + "grad_norm": 2.591611623764038, + "learning_rate": 4.998538809371569e-05, + "loss": 6.8721, + "step": 1831 + }, + { + "epoch": 0.010895422970786944, + "grad_norm": 2.2846317291259766, + "learning_rate": 4.9985372121613935e-05, + "loss": 6.9468, + "step": 1832 + }, + { + "epoch": 0.010901370254067943, + "grad_norm": 2.0799343585968018, + "learning_rate": 4.998535614079004e-05, + "loss": 7.0839, + "step": 1833 + }, + { + "epoch": 0.01090731753734894, + "grad_norm": 2.1908833980560303, + "learning_rate": 4.998534015124401e-05, + "loss": 6.7228, + "step": 1834 + }, + { + "epoch": 0.010913264820629936, + "grad_norm": 2.329401969909668, + "learning_rate": 4.998532415297587e-05, + "loss": 6.715, + "step": 1835 + }, + { + "epoch": 0.010919212103910933, + "grad_norm": 1.9492794275283813, + "learning_rate": 4.998530814598559e-05, + "loss": 6.6762, + "step": 1836 + }, + { + "epoch": 0.010925159387191932, + "grad_norm": 1.9564979076385498, + "learning_rate": 4.998529213027321e-05, + "loss": 6.8545, + "step": 1837 + }, + { + "epoch": 0.010931106670472928, + "grad_norm": 1.8424931764602661, + "learning_rate": 4.998527610583872e-05, + "loss": 6.8505, + "step": 1838 + }, + { + "epoch": 0.010937053953753925, + "grad_norm": 1.9743967056274414, + "learning_rate": 4.998526007268213e-05, + "loss": 6.8413, + "step": 1839 + }, + { + "epoch": 0.010943001237034922, + "grad_norm": 2.31296968460083, + "learning_rate": 4.998524403080345e-05, + "loss": 6.7327, + "step": 1840 + }, + { + "epoch": 0.010948948520315919, + "grad_norm": 2.049689292907715, + "learning_rate": 4.9985227980202665e-05, + "loss": 7.0029, + "step": 1841 + }, + { + "epoch": 0.010954895803596917, + "grad_norm": 2.1640658378601074, + "learning_rate": 4.99852119208798e-05, + "loss": 7.0749, + "step": 1842 + }, + { + "epoch": 0.010960843086877914, + "grad_norm": 1.8896230459213257, + "learning_rate": 4.998519585283486e-05, + "loss": 6.7249, + "step": 1843 + }, + { + "epoch": 0.010966790370158911, + "grad_norm": 2.4835314750671387, + "learning_rate": 4.998517977606785e-05, + "loss": 6.5605, + "step": 1844 + }, + { + "epoch": 0.010972737653439908, + "grad_norm": 2.2472622394561768, + "learning_rate": 4.998516369057876e-05, + "loss": 6.8291, + "step": 1845 + }, + { + "epoch": 0.010978684936720906, + "grad_norm": 2.499096155166626, + "learning_rate": 4.998514759636762e-05, + "loss": 6.6921, + "step": 1846 + }, + { + "epoch": 0.010984632220001903, + "grad_norm": 2.296786308288574, + "learning_rate": 4.998513149343442e-05, + "loss": 7.0475, + "step": 1847 + }, + { + "epoch": 0.0109905795032829, + "grad_norm": 2.2896368503570557, + "learning_rate": 4.998511538177916e-05, + "loss": 6.775, + "step": 1848 + }, + { + "epoch": 0.010996526786563897, + "grad_norm": 2.025575637817383, + "learning_rate": 4.998509926140186e-05, + "loss": 6.9538, + "step": 1849 + }, + { + "epoch": 0.011002474069844895, + "grad_norm": 2.23502779006958, + "learning_rate": 4.9985083132302525e-05, + "loss": 7.0595, + "step": 1850 + }, + { + "epoch": 0.011008421353125892, + "grad_norm": 2.7158777713775635, + "learning_rate": 4.998506699448115e-05, + "loss": 7.0086, + "step": 1851 + }, + { + "epoch": 0.011014368636406889, + "grad_norm": 2.2707183361053467, + "learning_rate": 4.998505084793775e-05, + "loss": 6.6396, + "step": 1852 + }, + { + "epoch": 0.011020315919687886, + "grad_norm": 3.196085214614868, + "learning_rate": 4.998503469267232e-05, + "loss": 6.6026, + "step": 1853 + }, + { + "epoch": 0.011026263202968884, + "grad_norm": 2.4472603797912598, + "learning_rate": 4.9985018528684876e-05, + "loss": 7.1332, + "step": 1854 + }, + { + "epoch": 0.011032210486249881, + "grad_norm": 2.7070915699005127, + "learning_rate": 4.998500235597542e-05, + "loss": 6.9669, + "step": 1855 + }, + { + "epoch": 0.011038157769530878, + "grad_norm": 2.127729654312134, + "learning_rate": 4.998498617454396e-05, + "loss": 6.9589, + "step": 1856 + }, + { + "epoch": 0.011044105052811875, + "grad_norm": 2.2897160053253174, + "learning_rate": 4.99849699843905e-05, + "loss": 7.0402, + "step": 1857 + }, + { + "epoch": 0.011050052336092873, + "grad_norm": 1.888961672782898, + "learning_rate": 4.998495378551504e-05, + "loss": 6.9406, + "step": 1858 + }, + { + "epoch": 0.01105599961937387, + "grad_norm": 1.9889254570007324, + "learning_rate": 4.9984937577917594e-05, + "loss": 6.8392, + "step": 1859 + }, + { + "epoch": 0.011061946902654867, + "grad_norm": 3.042891025543213, + "learning_rate": 4.998492136159817e-05, + "loss": 6.7743, + "step": 1860 + }, + { + "epoch": 0.011067894185935864, + "grad_norm": 2.423988103866577, + "learning_rate": 4.998490513655676e-05, + "loss": 6.9802, + "step": 1861 + }, + { + "epoch": 0.011073841469216862, + "grad_norm": 2.6415674686431885, + "learning_rate": 4.998488890279338e-05, + "loss": 6.7104, + "step": 1862 + }, + { + "epoch": 0.01107978875249786, + "grad_norm": 2.686969518661499, + "learning_rate": 4.998487266030804e-05, + "loss": 7.0539, + "step": 1863 + }, + { + "epoch": 0.011085736035778856, + "grad_norm": 2.6695480346679688, + "learning_rate": 4.998485640910072e-05, + "loss": 6.9812, + "step": 1864 + }, + { + "epoch": 0.011091683319059853, + "grad_norm": 2.6251392364501953, + "learning_rate": 4.9984840149171466e-05, + "loss": 6.9954, + "step": 1865 + }, + { + "epoch": 0.011097630602340851, + "grad_norm": 2.487593650817871, + "learning_rate": 4.998482388052025e-05, + "loss": 7.0847, + "step": 1866 + }, + { + "epoch": 0.011103577885621848, + "grad_norm": 2.3249282836914062, + "learning_rate": 4.998480760314709e-05, + "loss": 6.9936, + "step": 1867 + }, + { + "epoch": 0.011109525168902845, + "grad_norm": 2.170452833175659, + "learning_rate": 4.9984791317052e-05, + "loss": 6.9155, + "step": 1868 + }, + { + "epoch": 0.011115472452183842, + "grad_norm": 3.331779718399048, + "learning_rate": 4.9984775022234975e-05, + "loss": 6.9128, + "step": 1869 + }, + { + "epoch": 0.01112141973546484, + "grad_norm": 2.7665064334869385, + "learning_rate": 4.9984758718696026e-05, + "loss": 6.9002, + "step": 1870 + }, + { + "epoch": 0.011127367018745837, + "grad_norm": 2.2872116565704346, + "learning_rate": 4.998474240643515e-05, + "loss": 6.9058, + "step": 1871 + }, + { + "epoch": 0.011133314302026834, + "grad_norm": 2.2125210762023926, + "learning_rate": 4.998472608545236e-05, + "loss": 6.932, + "step": 1872 + }, + { + "epoch": 0.011139261585307831, + "grad_norm": 2.1135666370391846, + "learning_rate": 4.998470975574766e-05, + "loss": 7.0018, + "step": 1873 + }, + { + "epoch": 0.011145208868588828, + "grad_norm": 2.0649492740631104, + "learning_rate": 4.998469341732105e-05, + "loss": 7.0132, + "step": 1874 + }, + { + "epoch": 0.011151156151869826, + "grad_norm": 4.0558576583862305, + "learning_rate": 4.9984677070172546e-05, + "loss": 6.8826, + "step": 1875 + }, + { + "epoch": 0.011157103435150823, + "grad_norm": 2.5675904750823975, + "learning_rate": 4.998466071430216e-05, + "loss": 7.0314, + "step": 1876 + }, + { + "epoch": 0.01116305071843182, + "grad_norm": 2.9773342609405518, + "learning_rate": 4.998464434970987e-05, + "loss": 6.8608, + "step": 1877 + }, + { + "epoch": 0.011168998001712817, + "grad_norm": 2.804995059967041, + "learning_rate": 4.9984627976395705e-05, + "loss": 6.6857, + "step": 1878 + }, + { + "epoch": 0.011174945284993815, + "grad_norm": 3.758509874343872, + "learning_rate": 4.9984611594359664e-05, + "loss": 6.9995, + "step": 1879 + }, + { + "epoch": 0.011180892568274812, + "grad_norm": 2.583061933517456, + "learning_rate": 4.998459520360176e-05, + "loss": 6.5844, + "step": 1880 + }, + { + "epoch": 0.011186839851555809, + "grad_norm": 2.357642889022827, + "learning_rate": 4.998457880412198e-05, + "loss": 6.6435, + "step": 1881 + }, + { + "epoch": 0.011192787134836806, + "grad_norm": 2.181558609008789, + "learning_rate": 4.9984562395920356e-05, + "loss": 7.045, + "step": 1882 + }, + { + "epoch": 0.011198734418117804, + "grad_norm": 2.4768264293670654, + "learning_rate": 4.998454597899688e-05, + "loss": 7.2053, + "step": 1883 + }, + { + "epoch": 0.011204681701398801, + "grad_norm": 2.4422380924224854, + "learning_rate": 4.998452955335154e-05, + "loss": 6.8038, + "step": 1884 + }, + { + "epoch": 0.011210628984679798, + "grad_norm": 3.3173701763153076, + "learning_rate": 4.998451311898437e-05, + "loss": 6.8619, + "step": 1885 + }, + { + "epoch": 0.011216576267960795, + "grad_norm": 2.4492833614349365, + "learning_rate": 4.9984496675895366e-05, + "loss": 6.6681, + "step": 1886 + }, + { + "epoch": 0.011222523551241793, + "grad_norm": 3.065016031265259, + "learning_rate": 4.998448022408453e-05, + "loss": 6.7439, + "step": 1887 + }, + { + "epoch": 0.01122847083452279, + "grad_norm": 3.327730655670166, + "learning_rate": 4.998446376355187e-05, + "loss": 6.735, + "step": 1888 + }, + { + "epoch": 0.011234418117803787, + "grad_norm": 3.428292751312256, + "learning_rate": 4.998444729429739e-05, + "loss": 6.5277, + "step": 1889 + }, + { + "epoch": 0.011240365401084784, + "grad_norm": 2.4982972145080566, + "learning_rate": 4.9984430816321095e-05, + "loss": 6.8228, + "step": 1890 + }, + { + "epoch": 0.011246312684365782, + "grad_norm": 2.568232297897339, + "learning_rate": 4.9984414329623e-05, + "loss": 7.0772, + "step": 1891 + }, + { + "epoch": 0.01125225996764678, + "grad_norm": 2.534109115600586, + "learning_rate": 4.99843978342031e-05, + "loss": 7.0259, + "step": 1892 + }, + { + "epoch": 0.011258207250927776, + "grad_norm": 2.6394994258880615, + "learning_rate": 4.998438133006141e-05, + "loss": 6.8692, + "step": 1893 + }, + { + "epoch": 0.011264154534208773, + "grad_norm": 2.4049339294433594, + "learning_rate": 4.998436481719792e-05, + "loss": 6.8653, + "step": 1894 + }, + { + "epoch": 0.011270101817489771, + "grad_norm": 2.661191701889038, + "learning_rate": 4.998434829561266e-05, + "loss": 6.628, + "step": 1895 + }, + { + "epoch": 0.011276049100770768, + "grad_norm": 2.395829916000366, + "learning_rate": 4.998433176530561e-05, + "loss": 6.9876, + "step": 1896 + }, + { + "epoch": 0.011281996384051765, + "grad_norm": 2.547858715057373, + "learning_rate": 4.99843152262768e-05, + "loss": 7.3832, + "step": 1897 + }, + { + "epoch": 0.011287943667332762, + "grad_norm": 2.364246368408203, + "learning_rate": 4.998429867852621e-05, + "loss": 7.3771, + "step": 1898 + }, + { + "epoch": 0.01129389095061376, + "grad_norm": 2.3385260105133057, + "learning_rate": 4.998428212205387e-05, + "loss": 6.971, + "step": 1899 + }, + { + "epoch": 0.011299838233894757, + "grad_norm": 2.253760576248169, + "learning_rate": 4.998426555685977e-05, + "loss": 7.0588, + "step": 1900 + }, + { + "epoch": 0.011305785517175754, + "grad_norm": 2.4103500843048096, + "learning_rate": 4.998424898294392e-05, + "loss": 6.8731, + "step": 1901 + }, + { + "epoch": 0.011311732800456751, + "grad_norm": 2.4819014072418213, + "learning_rate": 4.998423240030633e-05, + "loss": 6.9502, + "step": 1902 + }, + { + "epoch": 0.011317680083737748, + "grad_norm": 2.503901243209839, + "learning_rate": 4.998421580894701e-05, + "loss": 7.017, + "step": 1903 + }, + { + "epoch": 0.011323627367018746, + "grad_norm": 2.2224137783050537, + "learning_rate": 4.9984199208865943e-05, + "loss": 7.1938, + "step": 1904 + }, + { + "epoch": 0.011329574650299743, + "grad_norm": 2.1291286945343018, + "learning_rate": 4.998418260006316e-05, + "loss": 7.1152, + "step": 1905 + }, + { + "epoch": 0.01133552193358074, + "grad_norm": 2.4611241817474365, + "learning_rate": 4.9984165982538655e-05, + "loss": 7.0316, + "step": 1906 + }, + { + "epoch": 0.011341469216861737, + "grad_norm": 2.329432487487793, + "learning_rate": 4.998414935629243e-05, + "loss": 7.0032, + "step": 1907 + }, + { + "epoch": 0.011347416500142735, + "grad_norm": 2.0618371963500977, + "learning_rate": 4.9984132721324505e-05, + "loss": 7.2566, + "step": 1908 + }, + { + "epoch": 0.011353363783423732, + "grad_norm": 2.063511371612549, + "learning_rate": 4.998411607763487e-05, + "loss": 7.0144, + "step": 1909 + }, + { + "epoch": 0.011359311066704729, + "grad_norm": 2.188871145248413, + "learning_rate": 4.998409942522355e-05, + "loss": 6.9652, + "step": 1910 + }, + { + "epoch": 0.011365258349985726, + "grad_norm": 2.499746322631836, + "learning_rate": 4.998408276409053e-05, + "loss": 6.9173, + "step": 1911 + }, + { + "epoch": 0.011371205633266724, + "grad_norm": 2.2809276580810547, + "learning_rate": 4.9984066094235826e-05, + "loss": 6.9202, + "step": 1912 + }, + { + "epoch": 0.011377152916547721, + "grad_norm": 1.7967042922973633, + "learning_rate": 4.998404941565944e-05, + "loss": 7.0652, + "step": 1913 + }, + { + "epoch": 0.011383100199828718, + "grad_norm": 2.339747667312622, + "learning_rate": 4.9984032728361384e-05, + "loss": 6.943, + "step": 1914 + }, + { + "epoch": 0.011389047483109715, + "grad_norm": 2.65795636177063, + "learning_rate": 4.998401603234166e-05, + "loss": 6.7197, + "step": 1915 + }, + { + "epoch": 0.011394994766390713, + "grad_norm": 2.181105852127075, + "learning_rate": 4.998399932760027e-05, + "loss": 6.7358, + "step": 1916 + }, + { + "epoch": 0.01140094204967171, + "grad_norm": 2.4130990505218506, + "learning_rate": 4.998398261413723e-05, + "loss": 6.8653, + "step": 1917 + }, + { + "epoch": 0.011406889332952707, + "grad_norm": 2.23822021484375, + "learning_rate": 4.998396589195254e-05, + "loss": 7.2125, + "step": 1918 + }, + { + "epoch": 0.011412836616233704, + "grad_norm": 2.176309823989868, + "learning_rate": 4.9983949161046207e-05, + "loss": 7.1077, + "step": 1919 + }, + { + "epoch": 0.011418783899514702, + "grad_norm": 2.2468202114105225, + "learning_rate": 4.9983932421418226e-05, + "loss": 7.1411, + "step": 1920 + }, + { + "epoch": 0.0114247311827957, + "grad_norm": 2.0748138427734375, + "learning_rate": 4.998391567306862e-05, + "loss": 7.0605, + "step": 1921 + }, + { + "epoch": 0.011430678466076696, + "grad_norm": 2.93007230758667, + "learning_rate": 4.998389891599738e-05, + "loss": 6.5832, + "step": 1922 + }, + { + "epoch": 0.011436625749357693, + "grad_norm": 2.125582218170166, + "learning_rate": 4.9983882150204534e-05, + "loss": 7.0761, + "step": 1923 + }, + { + "epoch": 0.011442573032638691, + "grad_norm": 2.3291571140289307, + "learning_rate": 4.998386537569005e-05, + "loss": 6.8781, + "step": 1924 + }, + { + "epoch": 0.011448520315919688, + "grad_norm": 2.8930649757385254, + "learning_rate": 4.9983848592453975e-05, + "loss": 7.1694, + "step": 1925 + }, + { + "epoch": 0.011454467599200685, + "grad_norm": 2.8450441360473633, + "learning_rate": 4.998383180049629e-05, + "loss": 7.1474, + "step": 1926 + }, + { + "epoch": 0.011460414882481682, + "grad_norm": 2.5900778770446777, + "learning_rate": 4.9983814999817016e-05, + "loss": 7.0423, + "step": 1927 + }, + { + "epoch": 0.01146636216576268, + "grad_norm": 2.289428949356079, + "learning_rate": 4.998379819041614e-05, + "loss": 6.9777, + "step": 1928 + }, + { + "epoch": 0.011472309449043677, + "grad_norm": 2.609384059906006, + "learning_rate": 4.998378137229368e-05, + "loss": 7.0488, + "step": 1929 + }, + { + "epoch": 0.011478256732324674, + "grad_norm": 2.1039459705352783, + "learning_rate": 4.998376454544964e-05, + "loss": 6.9308, + "step": 1930 + }, + { + "epoch": 0.01148420401560567, + "grad_norm": 2.1776134967803955, + "learning_rate": 4.9983747709884024e-05, + "loss": 6.9951, + "step": 1931 + }, + { + "epoch": 0.01149015129888667, + "grad_norm": 2.3150827884674072, + "learning_rate": 4.998373086559684e-05, + "loss": 6.9165, + "step": 1932 + }, + { + "epoch": 0.011496098582167666, + "grad_norm": 2.308370590209961, + "learning_rate": 4.99837140125881e-05, + "loss": 7.0155, + "step": 1933 + }, + { + "epoch": 0.011502045865448663, + "grad_norm": 2.234208106994629, + "learning_rate": 4.99836971508578e-05, + "loss": 6.9901, + "step": 1934 + }, + { + "epoch": 0.01150799314872966, + "grad_norm": 2.2340307235717773, + "learning_rate": 4.9983680280405953e-05, + "loss": 7.004, + "step": 1935 + }, + { + "epoch": 0.011513940432010657, + "grad_norm": 2.9458208084106445, + "learning_rate": 4.998366340123256e-05, + "loss": 7.3797, + "step": 1936 + }, + { + "epoch": 0.011519887715291655, + "grad_norm": 2.8516271114349365, + "learning_rate": 4.998364651333762e-05, + "loss": 7.3503, + "step": 1937 + }, + { + "epoch": 0.011525834998572652, + "grad_norm": 1.974025845527649, + "learning_rate": 4.998362961672116e-05, + "loss": 7.21, + "step": 1938 + }, + { + "epoch": 0.011531782281853649, + "grad_norm": 2.110117197036743, + "learning_rate": 4.998361271138317e-05, + "loss": 6.9494, + "step": 1939 + }, + { + "epoch": 0.011537729565134646, + "grad_norm": 2.2003207206726074, + "learning_rate": 4.9983595797323646e-05, + "loss": 6.8858, + "step": 1940 + }, + { + "epoch": 0.011543676848415644, + "grad_norm": 2.200982093811035, + "learning_rate": 4.998357887454262e-05, + "loss": 6.9512, + "step": 1941 + }, + { + "epoch": 0.011549624131696641, + "grad_norm": 2.303903102874756, + "learning_rate": 4.998356194304008e-05, + "loss": 7.2823, + "step": 1942 + }, + { + "epoch": 0.011555571414977638, + "grad_norm": 2.1376724243164062, + "learning_rate": 4.9983545002816035e-05, + "loss": 7.0321, + "step": 1943 + }, + { + "epoch": 0.011561518698258635, + "grad_norm": 2.3128151893615723, + "learning_rate": 4.99835280538705e-05, + "loss": 6.9714, + "step": 1944 + }, + { + "epoch": 0.011567465981539633, + "grad_norm": 2.359212636947632, + "learning_rate": 4.9983511096203465e-05, + "loss": 7.0496, + "step": 1945 + }, + { + "epoch": 0.01157341326482063, + "grad_norm": 2.346946954727173, + "learning_rate": 4.9983494129814945e-05, + "loss": 6.9865, + "step": 1946 + }, + { + "epoch": 0.011579360548101627, + "grad_norm": 2.447598934173584, + "learning_rate": 4.998347715470495e-05, + "loss": 6.9609, + "step": 1947 + }, + { + "epoch": 0.011585307831382624, + "grad_norm": 2.355300188064575, + "learning_rate": 4.998346017087348e-05, + "loss": 7.03, + "step": 1948 + }, + { + "epoch": 0.011591255114663622, + "grad_norm": 2.3207437992095947, + "learning_rate": 4.9983443178320545e-05, + "loss": 6.8181, + "step": 1949 + }, + { + "epoch": 0.011597202397944619, + "grad_norm": 2.359839677810669, + "learning_rate": 4.998342617704615e-05, + "loss": 6.8828, + "step": 1950 + }, + { + "epoch": 0.011603149681225616, + "grad_norm": 2.264890432357788, + "learning_rate": 4.9983409167050284e-05, + "loss": 7.3467, + "step": 1951 + }, + { + "epoch": 0.011609096964506613, + "grad_norm": 2.2720789909362793, + "learning_rate": 4.998339214833298e-05, + "loss": 7.3912, + "step": 1952 + }, + { + "epoch": 0.011615044247787611, + "grad_norm": 2.414433240890503, + "learning_rate": 4.9983375120894226e-05, + "loss": 7.1505, + "step": 1953 + }, + { + "epoch": 0.011620991531068608, + "grad_norm": 2.095290422439575, + "learning_rate": 4.998335808473404e-05, + "loss": 7.1642, + "step": 1954 + }, + { + "epoch": 0.011626938814349605, + "grad_norm": 2.118901252746582, + "learning_rate": 4.998334103985242e-05, + "loss": 7.0528, + "step": 1955 + }, + { + "epoch": 0.011632886097630602, + "grad_norm": 2.4361472129821777, + "learning_rate": 4.998332398624937e-05, + "loss": 7.3064, + "step": 1956 + }, + { + "epoch": 0.0116388333809116, + "grad_norm": 2.0978667736053467, + "learning_rate": 4.99833069239249e-05, + "loss": 7.0041, + "step": 1957 + }, + { + "epoch": 0.011644780664192597, + "grad_norm": 3.156329393386841, + "learning_rate": 4.998328985287902e-05, + "loss": 6.9169, + "step": 1958 + }, + { + "epoch": 0.011650727947473594, + "grad_norm": 2.311004400253296, + "learning_rate": 4.9983272773111735e-05, + "loss": 7.1128, + "step": 1959 + }, + { + "epoch": 0.01165667523075459, + "grad_norm": 2.406993865966797, + "learning_rate": 4.9983255684623036e-05, + "loss": 7.1403, + "step": 1960 + }, + { + "epoch": 0.01166262251403559, + "grad_norm": 2.0262861251831055, + "learning_rate": 4.998323858741295e-05, + "loss": 7.1014, + "step": 1961 + }, + { + "epoch": 0.011668569797316586, + "grad_norm": 2.369420051574707, + "learning_rate": 4.998322148148147e-05, + "loss": 7.1422, + "step": 1962 + }, + { + "epoch": 0.011674517080597583, + "grad_norm": 2.156019687652588, + "learning_rate": 4.998320436682861e-05, + "loss": 6.8405, + "step": 1963 + }, + { + "epoch": 0.01168046436387858, + "grad_norm": 2.35737681388855, + "learning_rate": 4.998318724345436e-05, + "loss": 6.8004, + "step": 1964 + }, + { + "epoch": 0.011686411647159577, + "grad_norm": 2.443676233291626, + "learning_rate": 4.998317011135875e-05, + "loss": 7.1959, + "step": 1965 + }, + { + "epoch": 0.011692358930440575, + "grad_norm": 2.1023004055023193, + "learning_rate": 4.998315297054177e-05, + "loss": 7.0684, + "step": 1966 + }, + { + "epoch": 0.011698306213721572, + "grad_norm": 2.5166187286376953, + "learning_rate": 4.998313582100342e-05, + "loss": 6.5876, + "step": 1967 + }, + { + "epoch": 0.011704253497002569, + "grad_norm": 2.1868557929992676, + "learning_rate": 4.9983118662743726e-05, + "loss": 6.6097, + "step": 1968 + }, + { + "epoch": 0.011710200780283566, + "grad_norm": 2.196786880493164, + "learning_rate": 4.998310149576269e-05, + "loss": 6.9798, + "step": 1969 + }, + { + "epoch": 0.011716148063564564, + "grad_norm": 2.361915111541748, + "learning_rate": 4.998308432006029e-05, + "loss": 6.8441, + "step": 1970 + }, + { + "epoch": 0.011722095346845561, + "grad_norm": 2.3234047889709473, + "learning_rate": 4.998306713563657e-05, + "loss": 6.9481, + "step": 1971 + }, + { + "epoch": 0.011728042630126558, + "grad_norm": 2.4995763301849365, + "learning_rate": 4.9983049942491514e-05, + "loss": 6.9903, + "step": 1972 + }, + { + "epoch": 0.011733989913407555, + "grad_norm": 2.21274995803833, + "learning_rate": 4.998303274062514e-05, + "loss": 7.1484, + "step": 1973 + }, + { + "epoch": 0.011739937196688553, + "grad_norm": 2.4777519702911377, + "learning_rate": 4.998301553003743e-05, + "loss": 7.144, + "step": 1974 + }, + { + "epoch": 0.01174588447996955, + "grad_norm": 2.089796304702759, + "learning_rate": 4.9982998310728426e-05, + "loss": 6.6765, + "step": 1975 + }, + { + "epoch": 0.011751831763250547, + "grad_norm": 3.012753963470459, + "learning_rate": 4.998298108269811e-05, + "loss": 6.8501, + "step": 1976 + }, + { + "epoch": 0.011757779046531544, + "grad_norm": 2.5427911281585693, + "learning_rate": 4.9982963845946486e-05, + "loss": 7.0171, + "step": 1977 + }, + { + "epoch": 0.011763726329812542, + "grad_norm": 2.8591670989990234, + "learning_rate": 4.998294660047358e-05, + "loss": 6.9881, + "step": 1978 + }, + { + "epoch": 0.011769673613093539, + "grad_norm": 2.952085256576538, + "learning_rate": 4.998292934627937e-05, + "loss": 6.9459, + "step": 1979 + }, + { + "epoch": 0.011775620896374536, + "grad_norm": 2.451958656311035, + "learning_rate": 4.998291208336388e-05, + "loss": 6.9515, + "step": 1980 + }, + { + "epoch": 0.011781568179655533, + "grad_norm": 2.448319435119629, + "learning_rate": 4.998289481172713e-05, + "loss": 6.8618, + "step": 1981 + }, + { + "epoch": 0.011787515462936531, + "grad_norm": 3.1797080039978027, + "learning_rate": 4.99828775313691e-05, + "loss": 6.7528, + "step": 1982 + }, + { + "epoch": 0.011793462746217528, + "grad_norm": 2.841120719909668, + "learning_rate": 4.99828602422898e-05, + "loss": 6.8, + "step": 1983 + }, + { + "epoch": 0.011799410029498525, + "grad_norm": 3.128098726272583, + "learning_rate": 4.998284294448925e-05, + "loss": 6.7574, + "step": 1984 + }, + { + "epoch": 0.011805357312779522, + "grad_norm": 2.7724568843841553, + "learning_rate": 4.998282563796744e-05, + "loss": 6.6119, + "step": 1985 + }, + { + "epoch": 0.01181130459606052, + "grad_norm": 2.8025269508361816, + "learning_rate": 4.998280832272439e-05, + "loss": 6.4676, + "step": 1986 + }, + { + "epoch": 0.011817251879341517, + "grad_norm": 2.5756618976593018, + "learning_rate": 4.99827909987601e-05, + "loss": 6.5421, + "step": 1987 + }, + { + "epoch": 0.011823199162622514, + "grad_norm": 2.9116249084472656, + "learning_rate": 4.998277366607457e-05, + "loss": 6.5446, + "step": 1988 + }, + { + "epoch": 0.01182914644590351, + "grad_norm": 2.571019411087036, + "learning_rate": 4.9982756324667815e-05, + "loss": 6.7898, + "step": 1989 + }, + { + "epoch": 0.01183509372918451, + "grad_norm": 2.818885326385498, + "learning_rate": 4.998273897453984e-05, + "loss": 6.6604, + "step": 1990 + }, + { + "epoch": 0.011841041012465506, + "grad_norm": 2.8561007976531982, + "learning_rate": 4.998272161569064e-05, + "loss": 6.5473, + "step": 1991 + }, + { + "epoch": 0.011846988295746503, + "grad_norm": 2.5539605617523193, + "learning_rate": 4.998270424812024e-05, + "loss": 6.5492, + "step": 1992 + }, + { + "epoch": 0.0118529355790275, + "grad_norm": 2.3242900371551514, + "learning_rate": 4.998268687182863e-05, + "loss": 6.4577, + "step": 1993 + }, + { + "epoch": 0.011858882862308498, + "grad_norm": 2.874807596206665, + "learning_rate": 4.998266948681582e-05, + "loss": 6.6071, + "step": 1994 + }, + { + "epoch": 0.011864830145589495, + "grad_norm": 2.9014296531677246, + "learning_rate": 4.9982652093081827e-05, + "loss": 7.2221, + "step": 1995 + }, + { + "epoch": 0.011870777428870492, + "grad_norm": 2.5874252319335938, + "learning_rate": 4.998263469062665e-05, + "loss": 6.593, + "step": 1996 + }, + { + "epoch": 0.011876724712151489, + "grad_norm": 2.4252052307128906, + "learning_rate": 4.998261727945028e-05, + "loss": 7.0138, + "step": 1997 + }, + { + "epoch": 0.011882671995432486, + "grad_norm": 2.3569211959838867, + "learning_rate": 4.998259985955275e-05, + "loss": 6.8743, + "step": 1998 + }, + { + "epoch": 0.011888619278713484, + "grad_norm": 2.560659408569336, + "learning_rate": 4.9982582430934045e-05, + "loss": 6.8926, + "step": 1999 + }, + { + "epoch": 0.011894566561994481, + "grad_norm": 2.0855636596679688, + "learning_rate": 4.9982564993594184e-05, + "loss": 7.1691, + "step": 2000 + }, + { + "epoch": 0.011900513845275478, + "grad_norm": 2.024829387664795, + "learning_rate": 4.998254754753316e-05, + "loss": 7.1797, + "step": 2001 + }, + { + "epoch": 0.011906461128556475, + "grad_norm": 2.093733549118042, + "learning_rate": 4.998253009275099e-05, + "loss": 6.9706, + "step": 2002 + }, + { + "epoch": 0.011912408411837473, + "grad_norm": 1.9211688041687012, + "learning_rate": 4.998251262924768e-05, + "loss": 7.018, + "step": 2003 + }, + { + "epoch": 0.01191835569511847, + "grad_norm": 2.3146321773529053, + "learning_rate": 4.998249515702323e-05, + "loss": 6.9384, + "step": 2004 + }, + { + "epoch": 0.011924302978399467, + "grad_norm": 2.346309185028076, + "learning_rate": 4.998247767607765e-05, + "loss": 6.5674, + "step": 2005 + }, + { + "epoch": 0.011930250261680464, + "grad_norm": 2.39471697807312, + "learning_rate": 4.998246018641094e-05, + "loss": 6.769, + "step": 2006 + }, + { + "epoch": 0.011936197544961462, + "grad_norm": 2.1689298152923584, + "learning_rate": 4.998244268802312e-05, + "loss": 7.0945, + "step": 2007 + }, + { + "epoch": 0.011942144828242459, + "grad_norm": 2.4209859371185303, + "learning_rate": 4.998242518091418e-05, + "loss": 6.98, + "step": 2008 + }, + { + "epoch": 0.011948092111523456, + "grad_norm": 2.6378684043884277, + "learning_rate": 4.998240766508414e-05, + "loss": 6.6833, + "step": 2009 + }, + { + "epoch": 0.011954039394804453, + "grad_norm": 2.2804839611053467, + "learning_rate": 4.9982390140532995e-05, + "loss": 6.7129, + "step": 2010 + }, + { + "epoch": 0.011959986678085451, + "grad_norm": 2.1788251399993896, + "learning_rate": 4.998237260726075e-05, + "loss": 7.0175, + "step": 2011 + }, + { + "epoch": 0.011965933961366448, + "grad_norm": 1.8988546133041382, + "learning_rate": 4.998235506526743e-05, + "loss": 7.0857, + "step": 2012 + }, + { + "epoch": 0.011971881244647445, + "grad_norm": 2.560107469558716, + "learning_rate": 4.9982337514553026e-05, + "loss": 7.0771, + "step": 2013 + }, + { + "epoch": 0.011977828527928442, + "grad_norm": 2.1771798133850098, + "learning_rate": 4.998231995511754e-05, + "loss": 7.071, + "step": 2014 + }, + { + "epoch": 0.01198377581120944, + "grad_norm": 1.9619860649108887, + "learning_rate": 4.998230238696098e-05, + "loss": 6.9109, + "step": 2015 + }, + { + "epoch": 0.011989723094490437, + "grad_norm": 2.16719126701355, + "learning_rate": 4.998228481008337e-05, + "loss": 6.903, + "step": 2016 + }, + { + "epoch": 0.011995670377771434, + "grad_norm": 2.4643077850341797, + "learning_rate": 4.998226722448469e-05, + "loss": 6.5301, + "step": 2017 + }, + { + "epoch": 0.01200161766105243, + "grad_norm": 2.5153393745422363, + "learning_rate": 4.9982249630164965e-05, + "loss": 7.107, + "step": 2018 + }, + { + "epoch": 0.01200756494433343, + "grad_norm": 2.6180920600891113, + "learning_rate": 4.998223202712419e-05, + "loss": 6.9905, + "step": 2019 + }, + { + "epoch": 0.012013512227614426, + "grad_norm": 2.333186149597168, + "learning_rate": 4.998221441536238e-05, + "loss": 7.074, + "step": 2020 + }, + { + "epoch": 0.012019459510895423, + "grad_norm": 2.138176918029785, + "learning_rate": 4.998219679487953e-05, + "loss": 7.0211, + "step": 2021 + }, + { + "epoch": 0.01202540679417642, + "grad_norm": 2.9845499992370605, + "learning_rate": 4.998217916567567e-05, + "loss": 6.7341, + "step": 2022 + }, + { + "epoch": 0.012031354077457418, + "grad_norm": 3.1216208934783936, + "learning_rate": 4.998216152775077e-05, + "loss": 7.1569, + "step": 2023 + }, + { + "epoch": 0.012037301360738415, + "grad_norm": 2.4693727493286133, + "learning_rate": 4.998214388110487e-05, + "loss": 6.6427, + "step": 2024 + }, + { + "epoch": 0.012043248644019412, + "grad_norm": 2.784562349319458, + "learning_rate": 4.9982126225737955e-05, + "loss": 6.6898, + "step": 2025 + }, + { + "epoch": 0.012049195927300409, + "grad_norm": 3.0549166202545166, + "learning_rate": 4.9982108561650036e-05, + "loss": 6.6004, + "step": 2026 + }, + { + "epoch": 0.012055143210581406, + "grad_norm": 2.565505266189575, + "learning_rate": 4.998209088884113e-05, + "loss": 6.5981, + "step": 2027 + }, + { + "epoch": 0.012061090493862404, + "grad_norm": 2.862548828125, + "learning_rate": 4.998207320731122e-05, + "loss": 6.4329, + "step": 2028 + }, + { + "epoch": 0.012067037777143401, + "grad_norm": 2.835280179977417, + "learning_rate": 4.998205551706033e-05, + "loss": 6.6854, + "step": 2029 + }, + { + "epoch": 0.012072985060424398, + "grad_norm": 2.4550364017486572, + "learning_rate": 4.9982037818088474e-05, + "loss": 6.7115, + "step": 2030 + }, + { + "epoch": 0.012078932343705395, + "grad_norm": 2.9977426528930664, + "learning_rate": 4.998202011039564e-05, + "loss": 6.341, + "step": 2031 + }, + { + "epoch": 0.012084879626986393, + "grad_norm": 2.258370876312256, + "learning_rate": 4.998200239398184e-05, + "loss": 6.7094, + "step": 2032 + }, + { + "epoch": 0.01209082691026739, + "grad_norm": 2.4484050273895264, + "learning_rate": 4.9981984668847085e-05, + "loss": 7.1115, + "step": 2033 + }, + { + "epoch": 0.012096774193548387, + "grad_norm": 2.4668514728546143, + "learning_rate": 4.9981966934991366e-05, + "loss": 6.9411, + "step": 2034 + }, + { + "epoch": 0.012102721476829384, + "grad_norm": 2.218479871749878, + "learning_rate": 4.998194919241471e-05, + "loss": 6.7175, + "step": 2035 + }, + { + "epoch": 0.012108668760110382, + "grad_norm": 2.201815366744995, + "learning_rate": 4.9981931441117115e-05, + "loss": 6.8684, + "step": 2036 + }, + { + "epoch": 0.012114616043391379, + "grad_norm": 2.4610331058502197, + "learning_rate": 4.998191368109858e-05, + "loss": 6.7214, + "step": 2037 + }, + { + "epoch": 0.012120563326672376, + "grad_norm": 2.7274906635284424, + "learning_rate": 4.998189591235912e-05, + "loss": 6.7611, + "step": 2038 + }, + { + "epoch": 0.012126510609953373, + "grad_norm": 2.7716658115386963, + "learning_rate": 4.9981878134898735e-05, + "loss": 6.7679, + "step": 2039 + }, + { + "epoch": 0.012132457893234371, + "grad_norm": 3.3206236362457275, + "learning_rate": 4.9981860348717434e-05, + "loss": 6.6283, + "step": 2040 + }, + { + "epoch": 0.012138405176515368, + "grad_norm": 2.511906862258911, + "learning_rate": 4.9981842553815225e-05, + "loss": 6.9537, + "step": 2041 + }, + { + "epoch": 0.012144352459796365, + "grad_norm": 2.7797024250030518, + "learning_rate": 4.998182475019212e-05, + "loss": 7.0488, + "step": 2042 + }, + { + "epoch": 0.012150299743077362, + "grad_norm": 3.523092031478882, + "learning_rate": 4.998180693784811e-05, + "loss": 6.9249, + "step": 2043 + }, + { + "epoch": 0.01215624702635836, + "grad_norm": 3.1001851558685303, + "learning_rate": 4.998178911678322e-05, + "loss": 7.0998, + "step": 2044 + }, + { + "epoch": 0.012162194309639357, + "grad_norm": 2.5291028022766113, + "learning_rate": 4.998177128699743e-05, + "loss": 6.8381, + "step": 2045 + }, + { + "epoch": 0.012168141592920354, + "grad_norm": 3.308398723602295, + "learning_rate": 4.998175344849077e-05, + "loss": 6.6849, + "step": 2046 + }, + { + "epoch": 0.01217408887620135, + "grad_norm": 3.4255475997924805, + "learning_rate": 4.998173560126323e-05, + "loss": 6.7816, + "step": 2047 + }, + { + "epoch": 0.01218003615948235, + "grad_norm": 3.4510817527770996, + "learning_rate": 4.998171774531484e-05, + "loss": 6.7961, + "step": 2048 + }, + { + "epoch": 0.012185983442763346, + "grad_norm": 3.15468168258667, + "learning_rate": 4.998169988064558e-05, + "loss": 6.9409, + "step": 2049 + }, + { + "epoch": 0.012191930726044343, + "grad_norm": 2.5568132400512695, + "learning_rate": 4.998168200725547e-05, + "loss": 6.8573, + "step": 2050 + }, + { + "epoch": 0.01219787800932534, + "grad_norm": 1.9745045900344849, + "learning_rate": 4.9981664125144515e-05, + "loss": 6.7948, + "step": 2051 + }, + { + "epoch": 0.012203825292606338, + "grad_norm": 2.2304463386535645, + "learning_rate": 4.9981646234312714e-05, + "loss": 6.6896, + "step": 2052 + }, + { + "epoch": 0.012209772575887335, + "grad_norm": 2.4391567707061768, + "learning_rate": 4.998162833476008e-05, + "loss": 6.7129, + "step": 2053 + }, + { + "epoch": 0.012215719859168332, + "grad_norm": 3.243905544281006, + "learning_rate": 4.9981610426486615e-05, + "loss": 7.0744, + "step": 2054 + }, + { + "epoch": 0.012221667142449329, + "grad_norm": 3.2596933841705322, + "learning_rate": 4.998159250949233e-05, + "loss": 6.9361, + "step": 2055 + }, + { + "epoch": 0.012227614425730327, + "grad_norm": 2.554436445236206, + "learning_rate": 4.998157458377723e-05, + "loss": 6.9354, + "step": 2056 + }, + { + "epoch": 0.012233561709011324, + "grad_norm": 2.3636975288391113, + "learning_rate": 4.998155664934132e-05, + "loss": 6.849, + "step": 2057 + }, + { + "epoch": 0.01223950899229232, + "grad_norm": 2.224684953689575, + "learning_rate": 4.99815387061846e-05, + "loss": 6.7011, + "step": 2058 + }, + { + "epoch": 0.012245456275573318, + "grad_norm": 2.6892964839935303, + "learning_rate": 4.9981520754307096e-05, + "loss": 6.753, + "step": 2059 + }, + { + "epoch": 0.012251403558854315, + "grad_norm": 2.7645084857940674, + "learning_rate": 4.9981502793708796e-05, + "loss": 6.5437, + "step": 2060 + }, + { + "epoch": 0.012257350842135313, + "grad_norm": 2.1315746307373047, + "learning_rate": 4.9981484824389716e-05, + "loss": 6.8843, + "step": 2061 + }, + { + "epoch": 0.01226329812541631, + "grad_norm": 2.6275408267974854, + "learning_rate": 4.998146684634984e-05, + "loss": 6.7275, + "step": 2062 + }, + { + "epoch": 0.012269245408697307, + "grad_norm": 2.530688762664795, + "learning_rate": 4.998144885958921e-05, + "loss": 6.6089, + "step": 2063 + }, + { + "epoch": 0.012275192691978304, + "grad_norm": 2.0959835052490234, + "learning_rate": 4.998143086410781e-05, + "loss": 6.7425, + "step": 2064 + }, + { + "epoch": 0.012281139975259302, + "grad_norm": 2.887242078781128, + "learning_rate": 4.998141285990565e-05, + "loss": 6.6867, + "step": 2065 + }, + { + "epoch": 0.012287087258540299, + "grad_norm": 2.430122137069702, + "learning_rate": 4.9981394846982734e-05, + "loss": 6.6636, + "step": 2066 + }, + { + "epoch": 0.012293034541821296, + "grad_norm": 2.269162654876709, + "learning_rate": 4.998137682533907e-05, + "loss": 7.1165, + "step": 2067 + }, + { + "epoch": 0.012298981825102293, + "grad_norm": 2.6741089820861816, + "learning_rate": 4.998135879497467e-05, + "loss": 6.6678, + "step": 2068 + }, + { + "epoch": 0.012304929108383291, + "grad_norm": 2.3362507820129395, + "learning_rate": 4.998134075588953e-05, + "loss": 7.0103, + "step": 2069 + }, + { + "epoch": 0.012310876391664288, + "grad_norm": 2.310638189315796, + "learning_rate": 4.9981322708083666e-05, + "loss": 6.9235, + "step": 2070 + }, + { + "epoch": 0.012316823674945285, + "grad_norm": 2.161853790283203, + "learning_rate": 4.998130465155708e-05, + "loss": 6.9392, + "step": 2071 + }, + { + "epoch": 0.012322770958226282, + "grad_norm": 2.2609059810638428, + "learning_rate": 4.9981286586309786e-05, + "loss": 6.888, + "step": 2072 + }, + { + "epoch": 0.01232871824150728, + "grad_norm": 2.6072967052459717, + "learning_rate": 4.998126851234177e-05, + "loss": 6.7739, + "step": 2073 + }, + { + "epoch": 0.012334665524788277, + "grad_norm": 3.092834711074829, + "learning_rate": 4.9981250429653056e-05, + "loss": 6.5529, + "step": 2074 + }, + { + "epoch": 0.012340612808069274, + "grad_norm": 2.303149461746216, + "learning_rate": 4.998123233824366e-05, + "loss": 6.618, + "step": 2075 + }, + { + "epoch": 0.01234656009135027, + "grad_norm": 2.888063907623291, + "learning_rate": 4.998121423811355e-05, + "loss": 6.9224, + "step": 2076 + }, + { + "epoch": 0.012352507374631269, + "grad_norm": 2.990727424621582, + "learning_rate": 4.998119612926277e-05, + "loss": 6.94, + "step": 2077 + }, + { + "epoch": 0.012358454657912266, + "grad_norm": 3.016002893447876, + "learning_rate": 4.998117801169131e-05, + "loss": 6.6231, + "step": 2078 + }, + { + "epoch": 0.012364401941193263, + "grad_norm": 2.057124614715576, + "learning_rate": 4.998115988539918e-05, + "loss": 6.803, + "step": 2079 + }, + { + "epoch": 0.01237034922447426, + "grad_norm": 2.371136426925659, + "learning_rate": 4.998114175038639e-05, + "loss": 6.8244, + "step": 2080 + }, + { + "epoch": 0.012376296507755258, + "grad_norm": 2.804365873336792, + "learning_rate": 4.998112360665292e-05, + "loss": 6.8787, + "step": 2081 + }, + { + "epoch": 0.012382243791036255, + "grad_norm": 3.4987633228302, + "learning_rate": 4.998110545419882e-05, + "loss": 6.6946, + "step": 2082 + }, + { + "epoch": 0.012388191074317252, + "grad_norm": 2.950608968734741, + "learning_rate": 4.998108729302407e-05, + "loss": 6.7915, + "step": 2083 + }, + { + "epoch": 0.012394138357598249, + "grad_norm": 2.4327776432037354, + "learning_rate": 4.998106912312868e-05, + "loss": 6.727, + "step": 2084 + }, + { + "epoch": 0.012400085640879247, + "grad_norm": 2.46014142036438, + "learning_rate": 4.998105094451265e-05, + "loss": 6.6797, + "step": 2085 + }, + { + "epoch": 0.012406032924160244, + "grad_norm": 2.947566270828247, + "learning_rate": 4.9981032757175995e-05, + "loss": 6.6401, + "step": 2086 + }, + { + "epoch": 0.01241198020744124, + "grad_norm": 2.5999064445495605, + "learning_rate": 4.9981014561118724e-05, + "loss": 6.58, + "step": 2087 + }, + { + "epoch": 0.012417927490722238, + "grad_norm": 2.9761807918548584, + "learning_rate": 4.9980996356340836e-05, + "loss": 6.8538, + "step": 2088 + }, + { + "epoch": 0.012423874774003236, + "grad_norm": 2.690925121307373, + "learning_rate": 4.9980978142842336e-05, + "loss": 6.9087, + "step": 2089 + }, + { + "epoch": 0.012429822057284233, + "grad_norm": 2.218524217605591, + "learning_rate": 4.998095992062325e-05, + "loss": 6.7221, + "step": 2090 + }, + { + "epoch": 0.01243576934056523, + "grad_norm": 2.630094051361084, + "learning_rate": 4.998094168968355e-05, + "loss": 6.7346, + "step": 2091 + }, + { + "epoch": 0.012441716623846227, + "grad_norm": 2.7839179039001465, + "learning_rate": 4.9980923450023276e-05, + "loss": 6.8668, + "step": 2092 + }, + { + "epoch": 0.012447663907127223, + "grad_norm": 2.422914743423462, + "learning_rate": 4.9980905201642415e-05, + "loss": 6.7953, + "step": 2093 + }, + { + "epoch": 0.012453611190408222, + "grad_norm": 2.525883674621582, + "learning_rate": 4.998088694454097e-05, + "loss": 6.6322, + "step": 2094 + }, + { + "epoch": 0.012459558473689219, + "grad_norm": 2.515536308288574, + "learning_rate": 4.998086867871896e-05, + "loss": 7.4297, + "step": 2095 + }, + { + "epoch": 0.012465505756970216, + "grad_norm": 2.689542055130005, + "learning_rate": 4.998085040417639e-05, + "loss": 7.4316, + "step": 2096 + }, + { + "epoch": 0.012471453040251212, + "grad_norm": 2.4374492168426514, + "learning_rate": 4.998083212091327e-05, + "loss": 6.8035, + "step": 2097 + }, + { + "epoch": 0.012477400323532211, + "grad_norm": 2.284153699874878, + "learning_rate": 4.998081382892959e-05, + "loss": 6.6644, + "step": 2098 + }, + { + "epoch": 0.012483347606813208, + "grad_norm": 2.113539218902588, + "learning_rate": 4.9980795528225366e-05, + "loss": 6.5201, + "step": 2099 + }, + { + "epoch": 0.012489294890094205, + "grad_norm": 2.2590157985687256, + "learning_rate": 4.998077721880061e-05, + "loss": 6.8074, + "step": 2100 + }, + { + "epoch": 0.012495242173375202, + "grad_norm": 2.077986717224121, + "learning_rate": 4.9980758900655316e-05, + "loss": 6.6986, + "step": 2101 + }, + { + "epoch": 0.0125011894566562, + "grad_norm": 2.495882987976074, + "learning_rate": 4.99807405737895e-05, + "loss": 6.6949, + "step": 2102 + }, + { + "epoch": 0.012507136739937197, + "grad_norm": 2.224621295928955, + "learning_rate": 4.998072223820317e-05, + "loss": 6.5723, + "step": 2103 + }, + { + "epoch": 0.012513084023218194, + "grad_norm": 2.515867233276367, + "learning_rate": 4.998070389389632e-05, + "loss": 6.4327, + "step": 2104 + }, + { + "epoch": 0.01251903130649919, + "grad_norm": 2.3134326934814453, + "learning_rate": 4.998068554086897e-05, + "loss": 6.2818, + "step": 2105 + }, + { + "epoch": 0.012524978589780189, + "grad_norm": 2.7688093185424805, + "learning_rate": 4.998066717912112e-05, + "loss": 6.4585, + "step": 2106 + }, + { + "epoch": 0.012530925873061186, + "grad_norm": 3.211790084838867, + "learning_rate": 4.998064880865277e-05, + "loss": 6.5227, + "step": 2107 + }, + { + "epoch": 0.012536873156342183, + "grad_norm": 2.9701578617095947, + "learning_rate": 4.998063042946395e-05, + "loss": 6.5674, + "step": 2108 + }, + { + "epoch": 0.01254282043962318, + "grad_norm": 2.1295664310455322, + "learning_rate": 4.998061204155463e-05, + "loss": 6.5697, + "step": 2109 + }, + { + "epoch": 0.012548767722904178, + "grad_norm": 2.841683864593506, + "learning_rate": 4.998059364492485e-05, + "loss": 6.453, + "step": 2110 + }, + { + "epoch": 0.012554715006185175, + "grad_norm": 2.481001615524292, + "learning_rate": 4.99805752395746e-05, + "loss": 6.555, + "step": 2111 + }, + { + "epoch": 0.012560662289466172, + "grad_norm": 2.357745885848999, + "learning_rate": 4.998055682550389e-05, + "loss": 6.7916, + "step": 2112 + }, + { + "epoch": 0.012566609572747169, + "grad_norm": 2.349417209625244, + "learning_rate": 4.9980538402712725e-05, + "loss": 6.7257, + "step": 2113 + }, + { + "epoch": 0.012572556856028167, + "grad_norm": 2.846930742263794, + "learning_rate": 4.998051997120111e-05, + "loss": 6.7095, + "step": 2114 + }, + { + "epoch": 0.012578504139309164, + "grad_norm": 2.362506628036499, + "learning_rate": 4.998050153096906e-05, + "loss": 6.675, + "step": 2115 + }, + { + "epoch": 0.01258445142259016, + "grad_norm": 2.3275344371795654, + "learning_rate": 4.998048308201656e-05, + "loss": 6.9031, + "step": 2116 + }, + { + "epoch": 0.012590398705871158, + "grad_norm": 2.194359540939331, + "learning_rate": 4.9980464624343644e-05, + "loss": 6.8258, + "step": 2117 + }, + { + "epoch": 0.012596345989152156, + "grad_norm": 2.3926312923431396, + "learning_rate": 4.99804461579503e-05, + "loss": 6.7136, + "step": 2118 + }, + { + "epoch": 0.012602293272433153, + "grad_norm": 2.7430222034454346, + "learning_rate": 4.9980427682836546e-05, + "loss": 6.5475, + "step": 2119 + }, + { + "epoch": 0.01260824055571415, + "grad_norm": 2.1563844680786133, + "learning_rate": 4.998040919900237e-05, + "loss": 6.7105, + "step": 2120 + }, + { + "epoch": 0.012614187838995147, + "grad_norm": 2.1061437129974365, + "learning_rate": 4.998039070644781e-05, + "loss": 6.6411, + "step": 2121 + }, + { + "epoch": 0.012620135122276143, + "grad_norm": 2.6192378997802734, + "learning_rate": 4.9980372205172844e-05, + "loss": 6.6831, + "step": 2122 + }, + { + "epoch": 0.012626082405557142, + "grad_norm": 2.794616222381592, + "learning_rate": 4.9980353695177495e-05, + "loss": 6.8128, + "step": 2123 + }, + { + "epoch": 0.012632029688838139, + "grad_norm": 2.3656489849090576, + "learning_rate": 4.998033517646176e-05, + "loss": 6.8109, + "step": 2124 + }, + { + "epoch": 0.012637976972119136, + "grad_norm": 2.658433437347412, + "learning_rate": 4.998031664902564e-05, + "loss": 6.7979, + "step": 2125 + }, + { + "epoch": 0.012643924255400132, + "grad_norm": 2.889954090118408, + "learning_rate": 4.9980298112869154e-05, + "loss": 6.6745, + "step": 2126 + }, + { + "epoch": 0.012649871538681131, + "grad_norm": 2.469790458679199, + "learning_rate": 4.9980279567992304e-05, + "loss": 6.7056, + "step": 2127 + }, + { + "epoch": 0.012655818821962128, + "grad_norm": 2.4310262203216553, + "learning_rate": 4.9980261014395094e-05, + "loss": 6.8809, + "step": 2128 + }, + { + "epoch": 0.012661766105243125, + "grad_norm": 2.772359609603882, + "learning_rate": 4.998024245207754e-05, + "loss": 7.0383, + "step": 2129 + }, + { + "epoch": 0.012667713388524121, + "grad_norm": 2.292144775390625, + "learning_rate": 4.9980223881039635e-05, + "loss": 6.9062, + "step": 2130 + }, + { + "epoch": 0.01267366067180512, + "grad_norm": 2.590363025665283, + "learning_rate": 4.998020530128139e-05, + "loss": 6.5803, + "step": 2131 + }, + { + "epoch": 0.012679607955086117, + "grad_norm": 2.78432035446167, + "learning_rate": 4.9980186712802824e-05, + "loss": 6.788, + "step": 2132 + }, + { + "epoch": 0.012685555238367114, + "grad_norm": 2.6188290119171143, + "learning_rate": 4.998016811560392e-05, + "loss": 6.5827, + "step": 2133 + }, + { + "epoch": 0.01269150252164811, + "grad_norm": 2.868215560913086, + "learning_rate": 4.99801495096847e-05, + "loss": 6.5845, + "step": 2134 + }, + { + "epoch": 0.012697449804929109, + "grad_norm": 2.4738945960998535, + "learning_rate": 4.998013089504518e-05, + "loss": 6.5019, + "step": 2135 + }, + { + "epoch": 0.012703397088210106, + "grad_norm": 2.5315287113189697, + "learning_rate": 4.998011227168534e-05, + "loss": 6.6765, + "step": 2136 + }, + { + "epoch": 0.012709344371491103, + "grad_norm": 2.7871086597442627, + "learning_rate": 4.998009363960521e-05, + "loss": 6.64, + "step": 2137 + }, + { + "epoch": 0.0127152916547721, + "grad_norm": 2.267502784729004, + "learning_rate": 4.998007499880479e-05, + "loss": 6.8665, + "step": 2138 + }, + { + "epoch": 0.012721238938053098, + "grad_norm": 2.5014212131500244, + "learning_rate": 4.998005634928408e-05, + "loss": 6.6757, + "step": 2139 + }, + { + "epoch": 0.012727186221334095, + "grad_norm": 2.3600070476531982, + "learning_rate": 4.998003769104308e-05, + "loss": 6.5425, + "step": 2140 + }, + { + "epoch": 0.012733133504615092, + "grad_norm": 2.32123064994812, + "learning_rate": 4.998001902408182e-05, + "loss": 6.5192, + "step": 2141 + }, + { + "epoch": 0.012739080787896088, + "grad_norm": 2.5059258937835693, + "learning_rate": 4.998000034840029e-05, + "loss": 6.6315, + "step": 2142 + }, + { + "epoch": 0.012745028071177087, + "grad_norm": 2.2143092155456543, + "learning_rate": 4.99799816639985e-05, + "loss": 6.6058, + "step": 2143 + }, + { + "epoch": 0.012750975354458084, + "grad_norm": 2.3660342693328857, + "learning_rate": 4.997996297087645e-05, + "loss": 6.554, + "step": 2144 + }, + { + "epoch": 0.01275692263773908, + "grad_norm": 2.4286036491394043, + "learning_rate": 4.9979944269034164e-05, + "loss": 6.4857, + "step": 2145 + }, + { + "epoch": 0.012762869921020078, + "grad_norm": 2.4002180099487305, + "learning_rate": 4.997992555847163e-05, + "loss": 6.5083, + "step": 2146 + }, + { + "epoch": 0.012768817204301076, + "grad_norm": 2.418942451477051, + "learning_rate": 4.997990683918886e-05, + "loss": 6.5471, + "step": 2147 + }, + { + "epoch": 0.012774764487582073, + "grad_norm": 2.535654067993164, + "learning_rate": 4.997988811118587e-05, + "loss": 6.5999, + "step": 2148 + }, + { + "epoch": 0.01278071177086307, + "grad_norm": 2.581505298614502, + "learning_rate": 4.9979869374462655e-05, + "loss": 6.2525, + "step": 2149 + }, + { + "epoch": 0.012786659054144067, + "grad_norm": 2.681297779083252, + "learning_rate": 4.997985062901923e-05, + "loss": 6.1463, + "step": 2150 + }, + { + "epoch": 0.012792606337425065, + "grad_norm": 2.3542990684509277, + "learning_rate": 4.997983187485559e-05, + "loss": 6.433, + "step": 2151 + }, + { + "epoch": 0.012798553620706062, + "grad_norm": 2.2994048595428467, + "learning_rate": 4.997981311197175e-05, + "loss": 6.5952, + "step": 2152 + }, + { + "epoch": 0.012804500903987059, + "grad_norm": 2.4703454971313477, + "learning_rate": 4.9979794340367724e-05, + "loss": 6.5581, + "step": 2153 + }, + { + "epoch": 0.012810448187268056, + "grad_norm": 2.511383533477783, + "learning_rate": 4.9979775560043504e-05, + "loss": 6.577, + "step": 2154 + }, + { + "epoch": 0.012816395470549052, + "grad_norm": 2.3300156593322754, + "learning_rate": 4.99797567709991e-05, + "loss": 6.4349, + "step": 2155 + }, + { + "epoch": 0.012822342753830051, + "grad_norm": 2.523878574371338, + "learning_rate": 4.997973797323452e-05, + "loss": 6.5044, + "step": 2156 + }, + { + "epoch": 0.012828290037111048, + "grad_norm": 2.4185073375701904, + "learning_rate": 4.9979719166749776e-05, + "loss": 6.537, + "step": 2157 + }, + { + "epoch": 0.012834237320392045, + "grad_norm": 2.324090003967285, + "learning_rate": 4.997970035154487e-05, + "loss": 6.803, + "step": 2158 + }, + { + "epoch": 0.012840184603673041, + "grad_norm": 2.468872547149658, + "learning_rate": 4.9979681527619804e-05, + "loss": 7.0837, + "step": 2159 + }, + { + "epoch": 0.01284613188695404, + "grad_norm": 2.1467936038970947, + "learning_rate": 4.99796626949746e-05, + "loss": 6.7373, + "step": 2160 + }, + { + "epoch": 0.012852079170235037, + "grad_norm": 2.3208062648773193, + "learning_rate": 4.9979643853609246e-05, + "loss": 6.5483, + "step": 2161 + }, + { + "epoch": 0.012858026453516034, + "grad_norm": 2.2797584533691406, + "learning_rate": 4.997962500352376e-05, + "loss": 6.5857, + "step": 2162 + }, + { + "epoch": 0.01286397373679703, + "grad_norm": 2.3447721004486084, + "learning_rate": 4.9979606144718135e-05, + "loss": 6.8511, + "step": 2163 + }, + { + "epoch": 0.012869921020078029, + "grad_norm": 2.6456334590911865, + "learning_rate": 4.9979587277192395e-05, + "loss": 6.9457, + "step": 2164 + }, + { + "epoch": 0.012875868303359026, + "grad_norm": 3.2567737102508545, + "learning_rate": 4.997956840094654e-05, + "loss": 6.6405, + "step": 2165 + }, + { + "epoch": 0.012881815586640023, + "grad_norm": 2.847371816635132, + "learning_rate": 4.9979549515980574e-05, + "loss": 6.751, + "step": 2166 + }, + { + "epoch": 0.01288776286992102, + "grad_norm": 2.999779462814331, + "learning_rate": 4.99795306222945e-05, + "loss": 6.7437, + "step": 2167 + }, + { + "epoch": 0.012893710153202018, + "grad_norm": 2.3793458938598633, + "learning_rate": 4.9979511719888336e-05, + "loss": 6.6864, + "step": 2168 + }, + { + "epoch": 0.012899657436483015, + "grad_norm": 2.284724473953247, + "learning_rate": 4.9979492808762084e-05, + "loss": 6.4237, + "step": 2169 + }, + { + "epoch": 0.012905604719764012, + "grad_norm": 2.560758352279663, + "learning_rate": 4.997947388891575e-05, + "loss": 6.5964, + "step": 2170 + }, + { + "epoch": 0.012911552003045008, + "grad_norm": 2.7461421489715576, + "learning_rate": 4.997945496034934e-05, + "loss": 6.5354, + "step": 2171 + }, + { + "epoch": 0.012917499286326007, + "grad_norm": 3.0868208408355713, + "learning_rate": 4.9979436023062854e-05, + "loss": 6.6445, + "step": 2172 + }, + { + "epoch": 0.012923446569607004, + "grad_norm": 2.565009593963623, + "learning_rate": 4.997941707705631e-05, + "loss": 6.6015, + "step": 2173 + }, + { + "epoch": 0.012929393852888, + "grad_norm": 2.9424686431884766, + "learning_rate": 4.997939812232971e-05, + "loss": 6.4887, + "step": 2174 + }, + { + "epoch": 0.012935341136168997, + "grad_norm": 3.0674476623535156, + "learning_rate": 4.997937915888305e-05, + "loss": 6.4728, + "step": 2175 + }, + { + "epoch": 0.012941288419449996, + "grad_norm": 3.040189266204834, + "learning_rate": 4.997936018671636e-05, + "loss": 6.3788, + "step": 2176 + }, + { + "epoch": 0.012947235702730993, + "grad_norm": 2.756211042404175, + "learning_rate": 4.9979341205829626e-05, + "loss": 6.4167, + "step": 2177 + }, + { + "epoch": 0.01295318298601199, + "grad_norm": 2.6333322525024414, + "learning_rate": 4.997932221622287e-05, + "loss": 6.6392, + "step": 2178 + }, + { + "epoch": 0.012959130269292986, + "grad_norm": 2.6951076984405518, + "learning_rate": 4.997930321789608e-05, + "loss": 6.3299, + "step": 2179 + }, + { + "epoch": 0.012965077552573985, + "grad_norm": 2.5388028621673584, + "learning_rate": 4.997928421084928e-05, + "loss": 6.2646, + "step": 2180 + }, + { + "epoch": 0.012971024835854982, + "grad_norm": 3.312171459197998, + "learning_rate": 4.997926519508247e-05, + "loss": 6.6331, + "step": 2181 + }, + { + "epoch": 0.012976972119135979, + "grad_norm": 3.437025547027588, + "learning_rate": 4.997924617059565e-05, + "loss": 5.5981, + "step": 2182 + }, + { + "epoch": 0.012982919402416975, + "grad_norm": 2.74035906791687, + "learning_rate": 4.997922713738884e-05, + "loss": 5.1641, + "step": 2183 + }, + { + "epoch": 0.012988866685697972, + "grad_norm": 2.618525505065918, + "learning_rate": 4.9979208095462036e-05, + "loss": 5.9978, + "step": 2184 + }, + { + "epoch": 0.012994813968978971, + "grad_norm": 2.633692502975464, + "learning_rate": 4.9979189044815254e-05, + "loss": 6.2812, + "step": 2185 + }, + { + "epoch": 0.013000761252259968, + "grad_norm": 2.087557792663574, + "learning_rate": 4.997916998544849e-05, + "loss": 6.2864, + "step": 2186 + }, + { + "epoch": 0.013006708535540965, + "grad_norm": 3.365112066268921, + "learning_rate": 4.997915091736176e-05, + "loss": 5.3517, + "step": 2187 + }, + { + "epoch": 0.013012655818821961, + "grad_norm": 2.7561593055725098, + "learning_rate": 4.997913184055506e-05, + "loss": 6.3667, + "step": 2188 + }, + { + "epoch": 0.01301860310210296, + "grad_norm": 2.630976676940918, + "learning_rate": 4.9979112755028415e-05, + "loss": 6.5858, + "step": 2189 + }, + { + "epoch": 0.013024550385383957, + "grad_norm": 2.56007981300354, + "learning_rate": 4.9979093660781805e-05, + "loss": 6.6862, + "step": 2190 + }, + { + "epoch": 0.013030497668664954, + "grad_norm": 2.509631633758545, + "learning_rate": 4.997907455781526e-05, + "loss": 6.4699, + "step": 2191 + }, + { + "epoch": 0.01303644495194595, + "grad_norm": 2.442028522491455, + "learning_rate": 4.997905544612878e-05, + "loss": 6.5755, + "step": 2192 + }, + { + "epoch": 0.013042392235226949, + "grad_norm": 2.561016321182251, + "learning_rate": 4.997903632572236e-05, + "loss": 6.4529, + "step": 2193 + }, + { + "epoch": 0.013048339518507946, + "grad_norm": 2.585753917694092, + "learning_rate": 4.9979017196596025e-05, + "loss": 6.188, + "step": 2194 + }, + { + "epoch": 0.013054286801788943, + "grad_norm": 2.3657655715942383, + "learning_rate": 4.997899805874977e-05, + "loss": 6.1414, + "step": 2195 + }, + { + "epoch": 0.01306023408506994, + "grad_norm": 2.818251609802246, + "learning_rate": 4.997897891218361e-05, + "loss": 6.5276, + "step": 2196 + }, + { + "epoch": 0.013066181368350938, + "grad_norm": 2.9687695503234863, + "learning_rate": 4.997895975689754e-05, + "loss": 6.131, + "step": 2197 + }, + { + "epoch": 0.013072128651631935, + "grad_norm": 2.8505353927612305, + "learning_rate": 4.997894059289157e-05, + "loss": 6.5269, + "step": 2198 + }, + { + "epoch": 0.013078075934912932, + "grad_norm": 2.331573486328125, + "learning_rate": 4.997892142016573e-05, + "loss": 6.1101, + "step": 2199 + }, + { + "epoch": 0.013084023218193928, + "grad_norm": 2.3241569995880127, + "learning_rate": 4.997890223871998e-05, + "loss": 6.5081, + "step": 2200 + }, + { + "epoch": 0.013089970501474927, + "grad_norm": 2.658834218978882, + "learning_rate": 4.997888304855437e-05, + "loss": 6.554, + "step": 2201 + }, + { + "epoch": 0.013095917784755924, + "grad_norm": 2.703911304473877, + "learning_rate": 4.997886384966889e-05, + "loss": 6.337, + "step": 2202 + }, + { + "epoch": 0.01310186506803692, + "grad_norm": 3.020775318145752, + "learning_rate": 4.997884464206354e-05, + "loss": 6.4375, + "step": 2203 + }, + { + "epoch": 0.013107812351317917, + "grad_norm": 3.324218273162842, + "learning_rate": 4.9978825425738334e-05, + "loss": 6.4871, + "step": 2204 + }, + { + "epoch": 0.013113759634598916, + "grad_norm": 3.822019577026367, + "learning_rate": 4.9978806200693276e-05, + "loss": 6.6372, + "step": 2205 + }, + { + "epoch": 0.013119706917879913, + "grad_norm": 3.3639512062072754, + "learning_rate": 4.997878696692838e-05, + "loss": 6.1826, + "step": 2206 + }, + { + "epoch": 0.01312565420116091, + "grad_norm": 3.580603837966919, + "learning_rate": 4.997876772444365e-05, + "loss": 6.793, + "step": 2207 + }, + { + "epoch": 0.013131601484441906, + "grad_norm": 2.472733497619629, + "learning_rate": 4.9978748473239084e-05, + "loss": 6.9054, + "step": 2208 + }, + { + "epoch": 0.013137548767722905, + "grad_norm": 3.327461004257202, + "learning_rate": 4.99787292133147e-05, + "loss": 6.6735, + "step": 2209 + }, + { + "epoch": 0.013143496051003902, + "grad_norm": 3.493234157562256, + "learning_rate": 4.99787099446705e-05, + "loss": 6.9702, + "step": 2210 + }, + { + "epoch": 0.013149443334284899, + "grad_norm": 2.2516424655914307, + "learning_rate": 4.9978690667306483e-05, + "loss": 7.196, + "step": 2211 + }, + { + "epoch": 0.013155390617565895, + "grad_norm": 1.8846355676651, + "learning_rate": 4.9978671381222665e-05, + "loss": 7.0373, + "step": 2212 + }, + { + "epoch": 0.013161337900846894, + "grad_norm": 2.9334232807159424, + "learning_rate": 4.997865208641906e-05, + "loss": 6.2065, + "step": 2213 + }, + { + "epoch": 0.01316728518412789, + "grad_norm": 2.713006019592285, + "learning_rate": 4.997863278289565e-05, + "loss": 6.788, + "step": 2214 + }, + { + "epoch": 0.013173232467408888, + "grad_norm": 2.6246018409729004, + "learning_rate": 4.9978613470652466e-05, + "loss": 6.7979, + "step": 2215 + }, + { + "epoch": 0.013179179750689884, + "grad_norm": 2.2770373821258545, + "learning_rate": 4.997859414968951e-05, + "loss": 6.8307, + "step": 2216 + }, + { + "epoch": 0.013185127033970881, + "grad_norm": 2.6244993209838867, + "learning_rate": 4.997857482000679e-05, + "loss": 6.3176, + "step": 2217 + }, + { + "epoch": 0.01319107431725188, + "grad_norm": 3.4668054580688477, + "learning_rate": 4.997855548160429e-05, + "loss": 6.8962, + "step": 2218 + }, + { + "epoch": 0.013197021600532877, + "grad_norm": 2.711785078048706, + "learning_rate": 4.9978536134482047e-05, + "loss": 6.7111, + "step": 2219 + }, + { + "epoch": 0.013202968883813873, + "grad_norm": 2.6757078170776367, + "learning_rate": 4.997851677864005e-05, + "loss": 6.5501, + "step": 2220 + }, + { + "epoch": 0.01320891616709487, + "grad_norm": 2.150338888168335, + "learning_rate": 4.997849741407831e-05, + "loss": 6.43, + "step": 2221 + }, + { + "epoch": 0.013214863450375869, + "grad_norm": 3.115309953689575, + "learning_rate": 4.9978478040796836e-05, + "loss": 6.4074, + "step": 2222 + }, + { + "epoch": 0.013220810733656866, + "grad_norm": 2.8754189014434814, + "learning_rate": 4.997845865879564e-05, + "loss": 6.2663, + "step": 2223 + }, + { + "epoch": 0.013226758016937862, + "grad_norm": 2.6169707775115967, + "learning_rate": 4.9978439268074716e-05, + "loss": 6.5987, + "step": 2224 + }, + { + "epoch": 0.01323270530021886, + "grad_norm": 2.3814637660980225, + "learning_rate": 4.997841986863408e-05, + "loss": 6.8124, + "step": 2225 + }, + { + "epoch": 0.013238652583499858, + "grad_norm": 2.0276811122894287, + "learning_rate": 4.997840046047373e-05, + "loss": 6.6632, + "step": 2226 + }, + { + "epoch": 0.013244599866780855, + "grad_norm": 2.7943263053894043, + "learning_rate": 4.997838104359368e-05, + "loss": 6.5452, + "step": 2227 + }, + { + "epoch": 0.013250547150061852, + "grad_norm": 2.4058234691619873, + "learning_rate": 4.997836161799393e-05, + "loss": 6.4697, + "step": 2228 + }, + { + "epoch": 0.013256494433342848, + "grad_norm": 2.2487008571624756, + "learning_rate": 4.9978342183674504e-05, + "loss": 6.3361, + "step": 2229 + }, + { + "epoch": 0.013262441716623847, + "grad_norm": 2.3470170497894287, + "learning_rate": 4.997832274063539e-05, + "loss": 6.4024, + "step": 2230 + }, + { + "epoch": 0.013268388999904844, + "grad_norm": 2.589695692062378, + "learning_rate": 4.9978303288876606e-05, + "loss": 6.4184, + "step": 2231 + }, + { + "epoch": 0.01327433628318584, + "grad_norm": 2.691371440887451, + "learning_rate": 4.997828382839815e-05, + "loss": 6.4225, + "step": 2232 + }, + { + "epoch": 0.013280283566466837, + "grad_norm": 3.110410213470459, + "learning_rate": 4.997826435920003e-05, + "loss": 6.5307, + "step": 2233 + }, + { + "epoch": 0.013286230849747836, + "grad_norm": 2.688519239425659, + "learning_rate": 4.9978244881282266e-05, + "loss": 6.568, + "step": 2234 + }, + { + "epoch": 0.013292178133028833, + "grad_norm": 2.3346059322357178, + "learning_rate": 4.997822539464485e-05, + "loss": 6.8837, + "step": 2235 + }, + { + "epoch": 0.01329812541630983, + "grad_norm": 2.679826021194458, + "learning_rate": 4.997820589928779e-05, + "loss": 6.3961, + "step": 2236 + }, + { + "epoch": 0.013304072699590826, + "grad_norm": 2.388120412826538, + "learning_rate": 4.99781863952111e-05, + "loss": 6.4363, + "step": 2237 + }, + { + "epoch": 0.013310019982871825, + "grad_norm": 2.834341049194336, + "learning_rate": 4.997816688241478e-05, + "loss": 6.4855, + "step": 2238 + }, + { + "epoch": 0.013315967266152822, + "grad_norm": 2.8623831272125244, + "learning_rate": 4.997814736089885e-05, + "loss": 6.8607, + "step": 2239 + }, + { + "epoch": 0.013321914549433819, + "grad_norm": 3.001241683959961, + "learning_rate": 4.99781278306633e-05, + "loss": 6.9777, + "step": 2240 + }, + { + "epoch": 0.013327861832714815, + "grad_norm": 2.9721016883850098, + "learning_rate": 4.9978108291708135e-05, + "loss": 6.9821, + "step": 2241 + }, + { + "epoch": 0.013333809115995814, + "grad_norm": 2.798360824584961, + "learning_rate": 4.997808874403338e-05, + "loss": 7.0096, + "step": 2242 + }, + { + "epoch": 0.01333975639927681, + "grad_norm": 3.2242093086242676, + "learning_rate": 4.997806918763903e-05, + "loss": 6.9091, + "step": 2243 + }, + { + "epoch": 0.013345703682557808, + "grad_norm": 2.681920289993286, + "learning_rate": 4.99780496225251e-05, + "loss": 6.7769, + "step": 2244 + }, + { + "epoch": 0.013351650965838804, + "grad_norm": 3.199514865875244, + "learning_rate": 4.9978030048691584e-05, + "loss": 6.6202, + "step": 2245 + }, + { + "epoch": 0.013357598249119801, + "grad_norm": 2.89886474609375, + "learning_rate": 4.9978010466138496e-05, + "loss": 6.7075, + "step": 2246 + }, + { + "epoch": 0.0133635455324008, + "grad_norm": 2.7091262340545654, + "learning_rate": 4.997799087486584e-05, + "loss": 6.9129, + "step": 2247 + }, + { + "epoch": 0.013369492815681797, + "grad_norm": 2.2538888454437256, + "learning_rate": 4.997797127487364e-05, + "loss": 6.6412, + "step": 2248 + }, + { + "epoch": 0.013375440098962793, + "grad_norm": 2.668286085128784, + "learning_rate": 4.997795166616187e-05, + "loss": 6.8506, + "step": 2249 + }, + { + "epoch": 0.01338138738224379, + "grad_norm": 3.915975570678711, + "learning_rate": 4.997793204873057e-05, + "loss": 6.567, + "step": 2250 + }, + { + "epoch": 0.013387334665524789, + "grad_norm": 2.5549614429473877, + "learning_rate": 4.997791242257972e-05, + "loss": 6.7971, + "step": 2251 + }, + { + "epoch": 0.013393281948805786, + "grad_norm": 2.511810064315796, + "learning_rate": 4.997789278770935e-05, + "loss": 7.1949, + "step": 2252 + }, + { + "epoch": 0.013399229232086782, + "grad_norm": 2.026937484741211, + "learning_rate": 4.9977873144119445e-05, + "loss": 7.2067, + "step": 2253 + }, + { + "epoch": 0.01340517651536778, + "grad_norm": 3.6016058921813965, + "learning_rate": 4.997785349181002e-05, + "loss": 6.549, + "step": 2254 + }, + { + "epoch": 0.013411123798648778, + "grad_norm": 2.867418050765991, + "learning_rate": 4.9977833830781094e-05, + "loss": 6.5562, + "step": 2255 + }, + { + "epoch": 0.013417071081929775, + "grad_norm": 2.2168800830841064, + "learning_rate": 4.9977814161032665e-05, + "loss": 7.1798, + "step": 2256 + }, + { + "epoch": 0.013423018365210771, + "grad_norm": 2.728299856185913, + "learning_rate": 4.997779448256473e-05, + "loss": 6.9314, + "step": 2257 + }, + { + "epoch": 0.013428965648491768, + "grad_norm": 2.7336437702178955, + "learning_rate": 4.997777479537732e-05, + "loss": 7.0643, + "step": 2258 + }, + { + "epoch": 0.013434912931772767, + "grad_norm": 3.1546053886413574, + "learning_rate": 4.997775509947041e-05, + "loss": 6.8853, + "step": 2259 + }, + { + "epoch": 0.013440860215053764, + "grad_norm": 3.037036180496216, + "learning_rate": 4.997773539484404e-05, + "loss": 6.6892, + "step": 2260 + }, + { + "epoch": 0.01344680749833476, + "grad_norm": 2.8779382705688477, + "learning_rate": 4.997771568149818e-05, + "loss": 6.4991, + "step": 2261 + }, + { + "epoch": 0.013452754781615757, + "grad_norm": 3.1105282306671143, + "learning_rate": 4.997769595943288e-05, + "loss": 6.4253, + "step": 2262 + }, + { + "epoch": 0.013458702064896756, + "grad_norm": 4.604808330535889, + "learning_rate": 4.997767622864811e-05, + "loss": 6.504, + "step": 2263 + }, + { + "epoch": 0.013464649348177753, + "grad_norm": 4.345273017883301, + "learning_rate": 4.9977656489143896e-05, + "loss": 6.2, + "step": 2264 + }, + { + "epoch": 0.01347059663145875, + "grad_norm": 2.9744133949279785, + "learning_rate": 4.9977636740920243e-05, + "loss": 6.5458, + "step": 2265 + }, + { + "epoch": 0.013476543914739746, + "grad_norm": 3.3981447219848633, + "learning_rate": 4.9977616983977146e-05, + "loss": 6.9791, + "step": 2266 + }, + { + "epoch": 0.013482491198020745, + "grad_norm": 2.5855109691619873, + "learning_rate": 4.997759721831463e-05, + "loss": 6.7425, + "step": 2267 + }, + { + "epoch": 0.013488438481301742, + "grad_norm": 3.961195707321167, + "learning_rate": 4.997757744393269e-05, + "loss": 6.4042, + "step": 2268 + }, + { + "epoch": 0.013494385764582739, + "grad_norm": 3.8216230869293213, + "learning_rate": 4.997755766083133e-05, + "loss": 6.4962, + "step": 2269 + }, + { + "epoch": 0.013500333047863735, + "grad_norm": 3.077279567718506, + "learning_rate": 4.9977537869010574e-05, + "loss": 6.4298, + "step": 2270 + }, + { + "epoch": 0.013506280331144734, + "grad_norm": 2.56152081489563, + "learning_rate": 4.9977518068470406e-05, + "loss": 6.35, + "step": 2271 + }, + { + "epoch": 0.01351222761442573, + "grad_norm": 2.4069855213165283, + "learning_rate": 4.9977498259210854e-05, + "loss": 6.2923, + "step": 2272 + }, + { + "epoch": 0.013518174897706728, + "grad_norm": 2.9591124057769775, + "learning_rate": 4.9977478441231904e-05, + "loss": 6.2477, + "step": 2273 + }, + { + "epoch": 0.013524122180987724, + "grad_norm": 2.627110481262207, + "learning_rate": 4.997745861453359e-05, + "loss": 6.1012, + "step": 2274 + }, + { + "epoch": 0.013530069464268723, + "grad_norm": 2.3042867183685303, + "learning_rate": 4.997743877911589e-05, + "loss": 6.1155, + "step": 2275 + }, + { + "epoch": 0.01353601674754972, + "grad_norm": 2.709324359893799, + "learning_rate": 4.997741893497882e-05, + "loss": 6.0103, + "step": 2276 + }, + { + "epoch": 0.013541964030830717, + "grad_norm": 2.7087934017181396, + "learning_rate": 4.997739908212241e-05, + "loss": 6.0709, + "step": 2277 + }, + { + "epoch": 0.013547911314111713, + "grad_norm": 3.560149669647217, + "learning_rate": 4.997737922054664e-05, + "loss": 6.1775, + "step": 2278 + }, + { + "epoch": 0.01355385859739271, + "grad_norm": 4.623898506164551, + "learning_rate": 4.997735935025152e-05, + "loss": 6.1993, + "step": 2279 + }, + { + "epoch": 0.013559805880673709, + "grad_norm": 2.9960882663726807, + "learning_rate": 4.997733947123707e-05, + "loss": 6.4211, + "step": 2280 + }, + { + "epoch": 0.013565753163954706, + "grad_norm": 3.8918421268463135, + "learning_rate": 4.9977319583503276e-05, + "loss": 6.0194, + "step": 2281 + }, + { + "epoch": 0.013571700447235702, + "grad_norm": 3.4164741039276123, + "learning_rate": 4.997729968705017e-05, + "loss": 5.9824, + "step": 2282 + }, + { + "epoch": 0.0135776477305167, + "grad_norm": 2.4005794525146484, + "learning_rate": 4.997727978187774e-05, + "loss": 5.9727, + "step": 2283 + }, + { + "epoch": 0.013583595013797698, + "grad_norm": 2.4654550552368164, + "learning_rate": 4.9977259867986e-05, + "loss": 6.2681, + "step": 2284 + }, + { + "epoch": 0.013589542297078695, + "grad_norm": 3.193905830383301, + "learning_rate": 4.997723994537496e-05, + "loss": 6.4996, + "step": 2285 + }, + { + "epoch": 0.013595489580359691, + "grad_norm": 2.4845757484436035, + "learning_rate": 4.997722001404462e-05, + "loss": 7.0464, + "step": 2286 + }, + { + "epoch": 0.013601436863640688, + "grad_norm": 3.170182466506958, + "learning_rate": 4.9977200073995e-05, + "loss": 6.1071, + "step": 2287 + }, + { + "epoch": 0.013607384146921687, + "grad_norm": 2.2331149578094482, + "learning_rate": 4.997718012522609e-05, + "loss": 6.6823, + "step": 2288 + }, + { + "epoch": 0.013613331430202684, + "grad_norm": 2.4146671295166016, + "learning_rate": 4.9977160167737904e-05, + "loss": 6.4398, + "step": 2289 + }, + { + "epoch": 0.01361927871348368, + "grad_norm": 3.23956561088562, + "learning_rate": 4.9977140201530445e-05, + "loss": 6.9295, + "step": 2290 + }, + { + "epoch": 0.013625225996764677, + "grad_norm": 3.402979850769043, + "learning_rate": 4.997712022660374e-05, + "loss": 6.7116, + "step": 2291 + }, + { + "epoch": 0.013631173280045676, + "grad_norm": 3.241320848464966, + "learning_rate": 4.997710024295777e-05, + "loss": 6.8871, + "step": 2292 + }, + { + "epoch": 0.013637120563326673, + "grad_norm": 2.5378634929656982, + "learning_rate": 4.997708025059255e-05, + "loss": 6.9548, + "step": 2293 + }, + { + "epoch": 0.01364306784660767, + "grad_norm": 3.1968839168548584, + "learning_rate": 4.9977060249508087e-05, + "loss": 6.6388, + "step": 2294 + }, + { + "epoch": 0.013649015129888666, + "grad_norm": 2.6951656341552734, + "learning_rate": 4.99770402397044e-05, + "loss": 6.9654, + "step": 2295 + }, + { + "epoch": 0.013654962413169665, + "grad_norm": 2.4168484210968018, + "learning_rate": 4.997702022118147e-05, + "loss": 6.6666, + "step": 2296 + }, + { + "epoch": 0.013660909696450662, + "grad_norm": 3.1395177841186523, + "learning_rate": 4.997700019393934e-05, + "loss": 6.4957, + "step": 2297 + }, + { + "epoch": 0.013666856979731658, + "grad_norm": 3.1591687202453613, + "learning_rate": 4.9976980157977985e-05, + "loss": 6.4392, + "step": 2298 + }, + { + "epoch": 0.013672804263012655, + "grad_norm": 2.2415151596069336, + "learning_rate": 4.9976960113297436e-05, + "loss": 6.4543, + "step": 2299 + }, + { + "epoch": 0.013678751546293654, + "grad_norm": 3.9113616943359375, + "learning_rate": 4.997694005989767e-05, + "loss": 6.7088, + "step": 2300 + }, + { + "epoch": 0.01368469882957465, + "grad_norm": 4.218390941619873, + "learning_rate": 4.997691999777873e-05, + "loss": 6.7199, + "step": 2301 + }, + { + "epoch": 0.013690646112855647, + "grad_norm": 4.200760841369629, + "learning_rate": 4.997689992694059e-05, + "loss": 6.6343, + "step": 2302 + }, + { + "epoch": 0.013696593396136644, + "grad_norm": 3.7164547443389893, + "learning_rate": 4.997687984738328e-05, + "loss": 6.772, + "step": 2303 + }, + { + "epoch": 0.013702540679417643, + "grad_norm": 2.1898231506347656, + "learning_rate": 4.99768597591068e-05, + "loss": 6.6165, + "step": 2304 + }, + { + "epoch": 0.01370848796269864, + "grad_norm": 2.72632098197937, + "learning_rate": 4.9976839662111166e-05, + "loss": 6.6474, + "step": 2305 + }, + { + "epoch": 0.013714435245979636, + "grad_norm": 3.64900279045105, + "learning_rate": 4.997681955639636e-05, + "loss": 6.4322, + "step": 2306 + }, + { + "epoch": 0.013720382529260633, + "grad_norm": 3.978445053100586, + "learning_rate": 4.997679944196241e-05, + "loss": 6.5434, + "step": 2307 + }, + { + "epoch": 0.01372632981254163, + "grad_norm": 5.709702491760254, + "learning_rate": 4.997677931880931e-05, + "loss": 6.5234, + "step": 2308 + }, + { + "epoch": 0.013732277095822629, + "grad_norm": 3.0389838218688965, + "learning_rate": 4.997675918693708e-05, + "loss": 6.4163, + "step": 2309 + }, + { + "epoch": 0.013738224379103625, + "grad_norm": 2.695113182067871, + "learning_rate": 4.9976739046345725e-05, + "loss": 6.6956, + "step": 2310 + }, + { + "epoch": 0.013744171662384622, + "grad_norm": 2.9768142700195312, + "learning_rate": 4.997671889703525e-05, + "loss": 6.5315, + "step": 2311 + }, + { + "epoch": 0.01375011894566562, + "grad_norm": 3.750454902648926, + "learning_rate": 4.997669873900566e-05, + "loss": 6.5568, + "step": 2312 + }, + { + "epoch": 0.013756066228946618, + "grad_norm": 3.390232801437378, + "learning_rate": 4.9976678572256955e-05, + "loss": 6.4916, + "step": 2313 + }, + { + "epoch": 0.013762013512227615, + "grad_norm": 3.1487748622894287, + "learning_rate": 4.997665839678915e-05, + "loss": 6.6378, + "step": 2314 + }, + { + "epoch": 0.013767960795508611, + "grad_norm": 2.5654940605163574, + "learning_rate": 4.997663821260226e-05, + "loss": 6.5817, + "step": 2315 + }, + { + "epoch": 0.013773908078789608, + "grad_norm": 2.7092552185058594, + "learning_rate": 4.9976618019696275e-05, + "loss": 6.982, + "step": 2316 + }, + { + "epoch": 0.013779855362070607, + "grad_norm": 3.642826557159424, + "learning_rate": 4.9976597818071214e-05, + "loss": 6.7951, + "step": 2317 + }, + { + "epoch": 0.013785802645351604, + "grad_norm": 3.4288947582244873, + "learning_rate": 4.997657760772708e-05, + "loss": 6.4366, + "step": 2318 + }, + { + "epoch": 0.0137917499286326, + "grad_norm": 2.7620253562927246, + "learning_rate": 4.997655738866389e-05, + "loss": 6.6588, + "step": 2319 + }, + { + "epoch": 0.013797697211913597, + "grad_norm": 2.4266698360443115, + "learning_rate": 4.997653716088163e-05, + "loss": 6.697, + "step": 2320 + }, + { + "epoch": 0.013803644495194596, + "grad_norm": 2.289365768432617, + "learning_rate": 4.9976516924380325e-05, + "loss": 6.7583, + "step": 2321 + }, + { + "epoch": 0.013809591778475593, + "grad_norm": 2.4238948822021484, + "learning_rate": 4.9976496679159976e-05, + "loss": 6.7949, + "step": 2322 + }, + { + "epoch": 0.01381553906175659, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.997647642522059e-05, + "loss": 6.5914, + "step": 2323 + }, + { + "epoch": 0.013821486345037586, + "grad_norm": 2.961089849472046, + "learning_rate": 4.997645616256217e-05, + "loss": 6.3513, + "step": 2324 + }, + { + "epoch": 0.013827433628318585, + "grad_norm": 2.437685251235962, + "learning_rate": 4.997643589118472e-05, + "loss": 6.4626, + "step": 2325 + }, + { + "epoch": 0.013833380911599582, + "grad_norm": 2.769731044769287, + "learning_rate": 4.9976415611088267e-05, + "loss": 6.2801, + "step": 2326 + }, + { + "epoch": 0.013839328194880578, + "grad_norm": 2.700697183609009, + "learning_rate": 4.9976395322272805e-05, + "loss": 6.1969, + "step": 2327 + }, + { + "epoch": 0.013845275478161575, + "grad_norm": 3.8049886226654053, + "learning_rate": 4.997637502473834e-05, + "loss": 6.769, + "step": 2328 + }, + { + "epoch": 0.013851222761442574, + "grad_norm": 3.748903512954712, + "learning_rate": 4.9976354718484875e-05, + "loss": 6.6486, + "step": 2329 + }, + { + "epoch": 0.01385717004472357, + "grad_norm": 3.7807834148406982, + "learning_rate": 4.9976334403512426e-05, + "loss": 6.6251, + "step": 2330 + }, + { + "epoch": 0.013863117328004567, + "grad_norm": 2.5358874797821045, + "learning_rate": 4.997631407982099e-05, + "loss": 6.4425, + "step": 2331 + }, + { + "epoch": 0.013869064611285564, + "grad_norm": 2.4619522094726562, + "learning_rate": 4.9976293747410596e-05, + "loss": 7.2166, + "step": 2332 + }, + { + "epoch": 0.013875011894566563, + "grad_norm": 2.740412473678589, + "learning_rate": 4.997627340628123e-05, + "loss": 6.8907, + "step": 2333 + }, + { + "epoch": 0.01388095917784756, + "grad_norm": 2.872852087020874, + "learning_rate": 4.9976253056432895e-05, + "loss": 6.6142, + "step": 2334 + }, + { + "epoch": 0.013886906461128556, + "grad_norm": 2.01629900932312, + "learning_rate": 4.997623269786562e-05, + "loss": 6.398, + "step": 2335 + }, + { + "epoch": 0.013892853744409553, + "grad_norm": 2.4405698776245117, + "learning_rate": 4.99762123305794e-05, + "loss": 6.9282, + "step": 2336 + }, + { + "epoch": 0.013898801027690552, + "grad_norm": 2.2520413398742676, + "learning_rate": 4.9976191954574235e-05, + "loss": 6.5565, + "step": 2337 + }, + { + "epoch": 0.013904748310971549, + "grad_norm": 2.314852476119995, + "learning_rate": 4.997617156985014e-05, + "loss": 6.3055, + "step": 2338 + }, + { + "epoch": 0.013910695594252545, + "grad_norm": 2.9049081802368164, + "learning_rate": 4.9976151176407124e-05, + "loss": 7.1806, + "step": 2339 + }, + { + "epoch": 0.013916642877533542, + "grad_norm": 2.7533769607543945, + "learning_rate": 4.9976130774245197e-05, + "loss": 7.0047, + "step": 2340 + }, + { + "epoch": 0.013922590160814539, + "grad_norm": 2.124826431274414, + "learning_rate": 4.997611036336435e-05, + "loss": 7.1897, + "step": 2341 + }, + { + "epoch": 0.013928537444095538, + "grad_norm": 2.5205366611480713, + "learning_rate": 4.997608994376461e-05, + "loss": 6.8592, + "step": 2342 + }, + { + "epoch": 0.013934484727376534, + "grad_norm": 2.8026719093322754, + "learning_rate": 4.9976069515445975e-05, + "loss": 6.6622, + "step": 2343 + }, + { + "epoch": 0.013940432010657531, + "grad_norm": 3.045438051223755, + "learning_rate": 4.997604907840845e-05, + "loss": 6.6176, + "step": 2344 + }, + { + "epoch": 0.013946379293938528, + "grad_norm": 2.820199489593506, + "learning_rate": 4.997602863265204e-05, + "loss": 6.4489, + "step": 2345 + }, + { + "epoch": 0.013952326577219527, + "grad_norm": 2.997990369796753, + "learning_rate": 4.997600817817676e-05, + "loss": 7.0989, + "step": 2346 + }, + { + "epoch": 0.013958273860500523, + "grad_norm": 3.316575050354004, + "learning_rate": 4.9975987714982606e-05, + "loss": 6.9042, + "step": 2347 + }, + { + "epoch": 0.01396422114378152, + "grad_norm": 2.3339803218841553, + "learning_rate": 4.99759672430696e-05, + "loss": 6.8831, + "step": 2348 + }, + { + "epoch": 0.013970168427062517, + "grad_norm": 2.510274648666382, + "learning_rate": 4.997594676243775e-05, + "loss": 7.1093, + "step": 2349 + }, + { + "epoch": 0.013976115710343516, + "grad_norm": 2.893909215927124, + "learning_rate": 4.997592627308705e-05, + "loss": 6.5477, + "step": 2350 + }, + { + "epoch": 0.013982062993624512, + "grad_norm": 3.6036674976348877, + "learning_rate": 4.9975905775017505e-05, + "loss": 6.3278, + "step": 2351 + }, + { + "epoch": 0.01398801027690551, + "grad_norm": 2.1260125637054443, + "learning_rate": 4.9975885268229127e-05, + "loss": 6.7883, + "step": 2352 + }, + { + "epoch": 0.013993957560186506, + "grad_norm": 2.328247308731079, + "learning_rate": 4.997586475272193e-05, + "loss": 6.4832, + "step": 2353 + }, + { + "epoch": 0.013999904843467505, + "grad_norm": 2.8075780868530273, + "learning_rate": 4.997584422849593e-05, + "loss": 6.9333, + "step": 2354 + }, + { + "epoch": 0.014005852126748502, + "grad_norm": 1.9339990615844727, + "learning_rate": 4.9975823695551106e-05, + "loss": 6.6856, + "step": 2355 + }, + { + "epoch": 0.014011799410029498, + "grad_norm": 2.842968225479126, + "learning_rate": 4.997580315388748e-05, + "loss": 6.48, + "step": 2356 + }, + { + "epoch": 0.014017746693310495, + "grad_norm": 1.8715558052062988, + "learning_rate": 4.997578260350506e-05, + "loss": 6.8702, + "step": 2357 + }, + { + "epoch": 0.014023693976591494, + "grad_norm": 2.4310202598571777, + "learning_rate": 4.9975762044403865e-05, + "loss": 7.0112, + "step": 2358 + }, + { + "epoch": 0.01402964125987249, + "grad_norm": 2.292121648788452, + "learning_rate": 4.997574147658387e-05, + "loss": 6.6505, + "step": 2359 + }, + { + "epoch": 0.014035588543153487, + "grad_norm": 2.374007225036621, + "learning_rate": 4.997572090004511e-05, + "loss": 6.7332, + "step": 2360 + }, + { + "epoch": 0.014041535826434484, + "grad_norm": 2.198131561279297, + "learning_rate": 4.997570031478759e-05, + "loss": 6.6358, + "step": 2361 + }, + { + "epoch": 0.014047483109715483, + "grad_norm": 2.3109302520751953, + "learning_rate": 4.997567972081131e-05, + "loss": 6.6194, + "step": 2362 + }, + { + "epoch": 0.01405343039299648, + "grad_norm": 2.49338698387146, + "learning_rate": 4.997565911811627e-05, + "loss": 6.5036, + "step": 2363 + }, + { + "epoch": 0.014059377676277476, + "grad_norm": 2.6462419033050537, + "learning_rate": 4.997563850670249e-05, + "loss": 6.4294, + "step": 2364 + }, + { + "epoch": 0.014065324959558473, + "grad_norm": 3.0072524547576904, + "learning_rate": 4.997561788656997e-05, + "loss": 6.8814, + "step": 2365 + }, + { + "epoch": 0.014071272242839472, + "grad_norm": 2.435209035873413, + "learning_rate": 4.997559725771872e-05, + "loss": 6.4684, + "step": 2366 + }, + { + "epoch": 0.014077219526120469, + "grad_norm": 2.8023672103881836, + "learning_rate": 4.997557662014875e-05, + "loss": 6.7922, + "step": 2367 + }, + { + "epoch": 0.014083166809401465, + "grad_norm": 2.6129658222198486, + "learning_rate": 4.9975555973860065e-05, + "loss": 6.4539, + "step": 2368 + }, + { + "epoch": 0.014089114092682462, + "grad_norm": 2.559117317199707, + "learning_rate": 4.997553531885267e-05, + "loss": 6.4713, + "step": 2369 + }, + { + "epoch": 0.014095061375963459, + "grad_norm": 2.4535956382751465, + "learning_rate": 4.9975514655126575e-05, + "loss": 6.963, + "step": 2370 + }, + { + "epoch": 0.014101008659244458, + "grad_norm": 2.3025150299072266, + "learning_rate": 4.997549398268178e-05, + "loss": 6.9299, + "step": 2371 + }, + { + "epoch": 0.014106955942525454, + "grad_norm": 2.834411382675171, + "learning_rate": 4.997547330151831e-05, + "loss": 6.299, + "step": 2372 + }, + { + "epoch": 0.014112903225806451, + "grad_norm": 2.8046083450317383, + "learning_rate": 4.997545261163615e-05, + "loss": 5.7691, + "step": 2373 + }, + { + "epoch": 0.014118850509087448, + "grad_norm": 2.663776159286499, + "learning_rate": 4.997543191303532e-05, + "loss": 5.969, + "step": 2374 + }, + { + "epoch": 0.014124797792368447, + "grad_norm": 2.725154161453247, + "learning_rate": 4.997541120571582e-05, + "loss": 5.7473, + "step": 2375 + }, + { + "epoch": 0.014130745075649443, + "grad_norm": 2.9021074771881104, + "learning_rate": 4.9975390489677663e-05, + "loss": 6.3177, + "step": 2376 + }, + { + "epoch": 0.01413669235893044, + "grad_norm": 2.4043307304382324, + "learning_rate": 4.9975369764920866e-05, + "loss": 6.358, + "step": 2377 + }, + { + "epoch": 0.014142639642211437, + "grad_norm": 2.4163010120391846, + "learning_rate": 4.997534903144542e-05, + "loss": 6.6807, + "step": 2378 + }, + { + "epoch": 0.014148586925492436, + "grad_norm": 3.0710666179656982, + "learning_rate": 4.9975328289251335e-05, + "loss": 6.2416, + "step": 2379 + }, + { + "epoch": 0.014154534208773432, + "grad_norm": 2.159627676010132, + "learning_rate": 4.997530753833862e-05, + "loss": 7.1434, + "step": 2380 + }, + { + "epoch": 0.01416048149205443, + "grad_norm": 2.308382034301758, + "learning_rate": 4.997528677870729e-05, + "loss": 7.1243, + "step": 2381 + }, + { + "epoch": 0.014166428775335426, + "grad_norm": 2.7461323738098145, + "learning_rate": 4.997526601035734e-05, + "loss": 6.3066, + "step": 2382 + }, + { + "epoch": 0.014172376058616425, + "grad_norm": 2.8835322856903076, + "learning_rate": 4.997524523328878e-05, + "loss": 6.28, + "step": 2383 + }, + { + "epoch": 0.014178323341897421, + "grad_norm": 2.5195534229278564, + "learning_rate": 4.997522444750162e-05, + "loss": 6.9561, + "step": 2384 + }, + { + "epoch": 0.014184270625178418, + "grad_norm": 3.1697885990142822, + "learning_rate": 4.997520365299587e-05, + "loss": 6.7432, + "step": 2385 + }, + { + "epoch": 0.014190217908459415, + "grad_norm": 3.6300339698791504, + "learning_rate": 4.997518284977154e-05, + "loss": 6.3676, + "step": 2386 + }, + { + "epoch": 0.014196165191740414, + "grad_norm": 3.261981964111328, + "learning_rate": 4.9975162037828625e-05, + "loss": 6.0991, + "step": 2387 + }, + { + "epoch": 0.01420211247502141, + "grad_norm": 3.6291120052337646, + "learning_rate": 4.9975141217167146e-05, + "loss": 6.1239, + "step": 2388 + }, + { + "epoch": 0.014208059758302407, + "grad_norm": 3.192958116531372, + "learning_rate": 4.997512038778709e-05, + "loss": 6.4455, + "step": 2389 + }, + { + "epoch": 0.014214007041583404, + "grad_norm": 2.8887948989868164, + "learning_rate": 4.997509954968849e-05, + "loss": 6.9441, + "step": 2390 + }, + { + "epoch": 0.014219954324864403, + "grad_norm": 2.3568248748779297, + "learning_rate": 4.9975078702871336e-05, + "loss": 7.0207, + "step": 2391 + }, + { + "epoch": 0.0142259016081454, + "grad_norm": 2.2629294395446777, + "learning_rate": 4.997505784733564e-05, + "loss": 6.9575, + "step": 2392 + }, + { + "epoch": 0.014231848891426396, + "grad_norm": 2.5458898544311523, + "learning_rate": 4.99750369830814e-05, + "loss": 6.8533, + "step": 2393 + }, + { + "epoch": 0.014237796174707393, + "grad_norm": 2.5125060081481934, + "learning_rate": 4.997501611010865e-05, + "loss": 6.8615, + "step": 2394 + }, + { + "epoch": 0.014243743457988392, + "grad_norm": 2.9903738498687744, + "learning_rate": 4.997499522841737e-05, + "loss": 6.6927, + "step": 2395 + }, + { + "epoch": 0.014249690741269389, + "grad_norm": 2.7536470890045166, + "learning_rate": 4.997497433800758e-05, + "loss": 6.6454, + "step": 2396 + }, + { + "epoch": 0.014255638024550385, + "grad_norm": 3.5041043758392334, + "learning_rate": 4.997495343887928e-05, + "loss": 6.485, + "step": 2397 + }, + { + "epoch": 0.014261585307831382, + "grad_norm": 3.8025100231170654, + "learning_rate": 4.997493253103249e-05, + "loss": 6.3731, + "step": 2398 + }, + { + "epoch": 0.01426753259111238, + "grad_norm": 3.2657718658447266, + "learning_rate": 4.99749116144672e-05, + "loss": 6.23, + "step": 2399 + }, + { + "epoch": 0.014273479874393378, + "grad_norm": 2.721632719039917, + "learning_rate": 4.997489068918343e-05, + "loss": 6.7292, + "step": 2400 + }, + { + "epoch": 0.014279427157674374, + "grad_norm": 2.3483569622039795, + "learning_rate": 4.9974869755181186e-05, + "loss": 6.4842, + "step": 2401 + }, + { + "epoch": 0.014285374440955371, + "grad_norm": 2.4931676387786865, + "learning_rate": 4.997484881246047e-05, + "loss": 7.0529, + "step": 2402 + }, + { + "epoch": 0.014291321724236368, + "grad_norm": 2.4944825172424316, + "learning_rate": 4.99748278610213e-05, + "loss": 7.0185, + "step": 2403 + }, + { + "epoch": 0.014297269007517367, + "grad_norm": 2.9124202728271484, + "learning_rate": 4.997480690086367e-05, + "loss": 6.9847, + "step": 2404 + }, + { + "epoch": 0.014303216290798363, + "grad_norm": 2.5802674293518066, + "learning_rate": 4.997478593198759e-05, + "loss": 7.0389, + "step": 2405 + }, + { + "epoch": 0.01430916357407936, + "grad_norm": 2.636709451675415, + "learning_rate": 4.9974764954393075e-05, + "loss": 6.7281, + "step": 2406 + }, + { + "epoch": 0.014315110857360357, + "grad_norm": 3.801760196685791, + "learning_rate": 4.997474396808012e-05, + "loss": 5.9962, + "step": 2407 + }, + { + "epoch": 0.014321058140641356, + "grad_norm": 3.7983996868133545, + "learning_rate": 4.997472297304875e-05, + "loss": 6.3821, + "step": 2408 + }, + { + "epoch": 0.014327005423922352, + "grad_norm": 2.863408088684082, + "learning_rate": 4.997470196929895e-05, + "loss": 6.2206, + "step": 2409 + }, + { + "epoch": 0.01433295270720335, + "grad_norm": 2.6187095642089844, + "learning_rate": 4.997468095683076e-05, + "loss": 6.2205, + "step": 2410 + }, + { + "epoch": 0.014338899990484346, + "grad_norm": 3.202986240386963, + "learning_rate": 4.997465993564414e-05, + "loss": 6.259, + "step": 2411 + }, + { + "epoch": 0.014344847273765345, + "grad_norm": 2.9131264686584473, + "learning_rate": 4.9974638905739146e-05, + "loss": 6.4159, + "step": 2412 + }, + { + "epoch": 0.014350794557046341, + "grad_norm": 2.384477376937866, + "learning_rate": 4.9974617867115754e-05, + "loss": 6.6669, + "step": 2413 + }, + { + "epoch": 0.014356741840327338, + "grad_norm": 2.448495626449585, + "learning_rate": 4.997459681977398e-05, + "loss": 6.5679, + "step": 2414 + }, + { + "epoch": 0.014362689123608335, + "grad_norm": 2.1945343017578125, + "learning_rate": 4.997457576371384e-05, + "loss": 6.3856, + "step": 2415 + }, + { + "epoch": 0.014368636406889334, + "grad_norm": 1.867848515510559, + "learning_rate": 4.997455469893533e-05, + "loss": 6.3127, + "step": 2416 + }, + { + "epoch": 0.01437458369017033, + "grad_norm": 2.560976266860962, + "learning_rate": 4.997453362543846e-05, + "loss": 6.4619, + "step": 2417 + }, + { + "epoch": 0.014380530973451327, + "grad_norm": 3.2440431118011475, + "learning_rate": 4.997451254322323e-05, + "loss": 6.399, + "step": 2418 + }, + { + "epoch": 0.014386478256732324, + "grad_norm": 3.0021307468414307, + "learning_rate": 4.9974491452289664e-05, + "loss": 6.174, + "step": 2419 + }, + { + "epoch": 0.014392425540013323, + "grad_norm": 2.6046524047851562, + "learning_rate": 4.997447035263776e-05, + "loss": 6.8284, + "step": 2420 + }, + { + "epoch": 0.01439837282329432, + "grad_norm": 3.1395344734191895, + "learning_rate": 4.997444924426753e-05, + "loss": 6.3395, + "step": 2421 + }, + { + "epoch": 0.014404320106575316, + "grad_norm": 3.056152582168579, + "learning_rate": 4.997442812717897e-05, + "loss": 6.3468, + "step": 2422 + }, + { + "epoch": 0.014410267389856313, + "grad_norm": 2.2532267570495605, + "learning_rate": 4.9974407001372105e-05, + "loss": 6.5187, + "step": 2423 + }, + { + "epoch": 0.014416214673137312, + "grad_norm": 2.0228383541107178, + "learning_rate": 4.997438586684693e-05, + "loss": 6.4452, + "step": 2424 + }, + { + "epoch": 0.014422161956418308, + "grad_norm": 3.2889909744262695, + "learning_rate": 4.997436472360345e-05, + "loss": 6.6466, + "step": 2425 + }, + { + "epoch": 0.014428109239699305, + "grad_norm": 2.957916498184204, + "learning_rate": 4.9974343571641677e-05, + "loss": 6.9617, + "step": 2426 + }, + { + "epoch": 0.014434056522980302, + "grad_norm": 2.7629241943359375, + "learning_rate": 4.997432241096162e-05, + "loss": 6.1687, + "step": 2427 + }, + { + "epoch": 0.0144400038062613, + "grad_norm": 2.849297285079956, + "learning_rate": 4.997430124156329e-05, + "loss": 6.4647, + "step": 2428 + }, + { + "epoch": 0.014445951089542297, + "grad_norm": 2.2432122230529785, + "learning_rate": 4.997428006344669e-05, + "loss": 7.1739, + "step": 2429 + }, + { + "epoch": 0.014451898372823294, + "grad_norm": 2.814807891845703, + "learning_rate": 4.997425887661181e-05, + "loss": 5.945, + "step": 2430 + }, + { + "epoch": 0.014457845656104291, + "grad_norm": 3.140153646469116, + "learning_rate": 4.997423768105869e-05, + "loss": 6.5948, + "step": 2431 + }, + { + "epoch": 0.01446379293938529, + "grad_norm": 2.5276620388031006, + "learning_rate": 4.997421647678732e-05, + "loss": 6.9813, + "step": 2432 + }, + { + "epoch": 0.014469740222666286, + "grad_norm": 2.462204694747925, + "learning_rate": 4.9974195263797705e-05, + "loss": 6.8987, + "step": 2433 + }, + { + "epoch": 0.014475687505947283, + "grad_norm": 3.117255210876465, + "learning_rate": 4.997417404208986e-05, + "loss": 5.883, + "step": 2434 + }, + { + "epoch": 0.01448163478922828, + "grad_norm": 2.6207518577575684, + "learning_rate": 4.997415281166379e-05, + "loss": 6.8065, + "step": 2435 + }, + { + "epoch": 0.014487582072509277, + "grad_norm": 2.996624231338501, + "learning_rate": 4.99741315725195e-05, + "loss": 6.5162, + "step": 2436 + }, + { + "epoch": 0.014493529355790276, + "grad_norm": 2.1946496963500977, + "learning_rate": 4.9974110324656996e-05, + "loss": 6.9521, + "step": 2437 + }, + { + "epoch": 0.014499476639071272, + "grad_norm": 2.273017406463623, + "learning_rate": 4.997408906807629e-05, + "loss": 7.0144, + "step": 2438 + }, + { + "epoch": 0.01450542392235227, + "grad_norm": 2.516509771347046, + "learning_rate": 4.997406780277739e-05, + "loss": 7.013, + "step": 2439 + }, + { + "epoch": 0.014511371205633266, + "grad_norm": 3.0296435356140137, + "learning_rate": 4.9974046528760296e-05, + "loss": 6.934, + "step": 2440 + }, + { + "epoch": 0.014517318488914265, + "grad_norm": 2.6135010719299316, + "learning_rate": 4.9974025246025024e-05, + "loss": 6.7151, + "step": 2441 + }, + { + "epoch": 0.014523265772195261, + "grad_norm": 2.6850788593292236, + "learning_rate": 4.997400395457158e-05, + "loss": 6.5223, + "step": 2442 + }, + { + "epoch": 0.014529213055476258, + "grad_norm": 3.0401692390441895, + "learning_rate": 4.9973982654399966e-05, + "loss": 7.2006, + "step": 2443 + }, + { + "epoch": 0.014535160338757255, + "grad_norm": 3.016805410385132, + "learning_rate": 4.997396134551019e-05, + "loss": 7.0633, + "step": 2444 + }, + { + "epoch": 0.014541107622038254, + "grad_norm": 3.107154130935669, + "learning_rate": 4.9973940027902264e-05, + "loss": 6.9096, + "step": 2445 + }, + { + "epoch": 0.01454705490531925, + "grad_norm": 2.720054864883423, + "learning_rate": 4.9973918701576196e-05, + "loss": 6.7061, + "step": 2446 + }, + { + "epoch": 0.014553002188600247, + "grad_norm": 2.386401414871216, + "learning_rate": 4.9973897366531984e-05, + "loss": 6.5877, + "step": 2447 + }, + { + "epoch": 0.014558949471881244, + "grad_norm": 2.488243579864502, + "learning_rate": 4.997387602276965e-05, + "loss": 6.7792, + "step": 2448 + }, + { + "epoch": 0.014564896755162243, + "grad_norm": 2.7504360675811768, + "learning_rate": 4.9973854670289196e-05, + "loss": 6.6164, + "step": 2449 + }, + { + "epoch": 0.01457084403844324, + "grad_norm": 3.001441240310669, + "learning_rate": 4.9973833309090626e-05, + "loss": 6.5933, + "step": 2450 + }, + { + "epoch": 0.014576791321724236, + "grad_norm": 2.6449999809265137, + "learning_rate": 4.997381193917394e-05, + "loss": 6.5323, + "step": 2451 + }, + { + "epoch": 0.014582738605005233, + "grad_norm": 2.81846022605896, + "learning_rate": 4.9973790560539156e-05, + "loss": 6.5146, + "step": 2452 + }, + { + "epoch": 0.014588685888286232, + "grad_norm": 2.662916421890259, + "learning_rate": 4.997376917318629e-05, + "loss": 6.161, + "step": 2453 + }, + { + "epoch": 0.014594633171567228, + "grad_norm": 2.689601421356201, + "learning_rate": 4.997374777711533e-05, + "loss": 6.2008, + "step": 2454 + }, + { + "epoch": 0.014600580454848225, + "grad_norm": 2.6690561771392822, + "learning_rate": 4.99737263723263e-05, + "loss": 6.4418, + "step": 2455 + }, + { + "epoch": 0.014606527738129222, + "grad_norm": 2.897270917892456, + "learning_rate": 4.997370495881919e-05, + "loss": 6.3968, + "step": 2456 + }, + { + "epoch": 0.01461247502141022, + "grad_norm": 2.9327831268310547, + "learning_rate": 4.997368353659402e-05, + "loss": 6.4665, + "step": 2457 + }, + { + "epoch": 0.014618422304691217, + "grad_norm": 2.658013343811035, + "learning_rate": 4.99736621056508e-05, + "loss": 6.399, + "step": 2458 + }, + { + "epoch": 0.014624369587972214, + "grad_norm": 2.6055238246917725, + "learning_rate": 4.997364066598953e-05, + "loss": 6.4679, + "step": 2459 + }, + { + "epoch": 0.014630316871253211, + "grad_norm": 3.0595951080322266, + "learning_rate": 4.997361921761022e-05, + "loss": 5.8797, + "step": 2460 + }, + { + "epoch": 0.01463626415453421, + "grad_norm": 2.994694471359253, + "learning_rate": 4.997359776051288e-05, + "loss": 5.704, + "step": 2461 + }, + { + "epoch": 0.014642211437815206, + "grad_norm": 2.78153657913208, + "learning_rate": 4.9973576294697514e-05, + "loss": 5.7289, + "step": 2462 + }, + { + "epoch": 0.014648158721096203, + "grad_norm": 2.5119385719299316, + "learning_rate": 4.997355482016414e-05, + "loss": 5.5494, + "step": 2463 + }, + { + "epoch": 0.0146541060043772, + "grad_norm": 2.7880990505218506, + "learning_rate": 4.997353333691274e-05, + "loss": 5.5905, + "step": 2464 + }, + { + "epoch": 0.014660053287658197, + "grad_norm": 2.827352523803711, + "learning_rate": 4.9973511844943346e-05, + "loss": 6.4429, + "step": 2465 + }, + { + "epoch": 0.014666000570939195, + "grad_norm": 2.4297358989715576, + "learning_rate": 4.997349034425595e-05, + "loss": 6.8647, + "step": 2466 + }, + { + "epoch": 0.014671947854220192, + "grad_norm": 2.649064064025879, + "learning_rate": 4.997346883485057e-05, + "loss": 6.5568, + "step": 2467 + }, + { + "epoch": 0.014677895137501189, + "grad_norm": 3.2215452194213867, + "learning_rate": 4.9973447316727215e-05, + "loss": 5.5684, + "step": 2468 + }, + { + "epoch": 0.014683842420782186, + "grad_norm": 2.8760056495666504, + "learning_rate": 4.9973425789885884e-05, + "loss": 5.6395, + "step": 2469 + }, + { + "epoch": 0.014689789704063184, + "grad_norm": 2.4002890586853027, + "learning_rate": 4.9973404254326585e-05, + "loss": 5.9525, + "step": 2470 + }, + { + "epoch": 0.014695736987344181, + "grad_norm": 2.32314395904541, + "learning_rate": 4.997338271004933e-05, + "loss": 6.9675, + "step": 2471 + }, + { + "epoch": 0.014701684270625178, + "grad_norm": 2.262680768966675, + "learning_rate": 4.997336115705413e-05, + "loss": 7.1361, + "step": 2472 + }, + { + "epoch": 0.014707631553906175, + "grad_norm": 2.2855215072631836, + "learning_rate": 4.997333959534098e-05, + "loss": 7.1141, + "step": 2473 + }, + { + "epoch": 0.014713578837187173, + "grad_norm": 2.5461738109588623, + "learning_rate": 4.99733180249099e-05, + "loss": 7.0492, + "step": 2474 + }, + { + "epoch": 0.01471952612046817, + "grad_norm": 2.455561399459839, + "learning_rate": 4.99732964457609e-05, + "loss": 6.9303, + "step": 2475 + }, + { + "epoch": 0.014725473403749167, + "grad_norm": 3.3767740726470947, + "learning_rate": 4.997327485789397e-05, + "loss": 6.8531, + "step": 2476 + }, + { + "epoch": 0.014731420687030164, + "grad_norm": 2.9320104122161865, + "learning_rate": 4.9973253261309125e-05, + "loss": 6.9258, + "step": 2477 + }, + { + "epoch": 0.014737367970311162, + "grad_norm": 2.380960464477539, + "learning_rate": 4.997323165600638e-05, + "loss": 6.8581, + "step": 2478 + }, + { + "epoch": 0.01474331525359216, + "grad_norm": 2.727154016494751, + "learning_rate": 4.997321004198574e-05, + "loss": 7.3814, + "step": 2479 + }, + { + "epoch": 0.014749262536873156, + "grad_norm": 2.8693020343780518, + "learning_rate": 4.997318841924721e-05, + "loss": 6.3793, + "step": 2480 + }, + { + "epoch": 0.014755209820154153, + "grad_norm": 2.941622734069824, + "learning_rate": 4.997316678779079e-05, + "loss": 7.3567, + "step": 2481 + }, + { + "epoch": 0.014761157103435152, + "grad_norm": 3.0310213565826416, + "learning_rate": 4.9973145147616505e-05, + "loss": 6.8832, + "step": 2482 + }, + { + "epoch": 0.014767104386716148, + "grad_norm": 1.9184696674346924, + "learning_rate": 4.9973123498724353e-05, + "loss": 6.7369, + "step": 2483 + }, + { + "epoch": 0.014773051669997145, + "grad_norm": 2.3090195655822754, + "learning_rate": 4.9973101841114335e-05, + "loss": 6.8927, + "step": 2484 + }, + { + "epoch": 0.014778998953278142, + "grad_norm": 2.2947685718536377, + "learning_rate": 4.997308017478647e-05, + "loss": 6.9441, + "step": 2485 + }, + { + "epoch": 0.01478494623655914, + "grad_norm": 2.363690137863159, + "learning_rate": 4.997305849974076e-05, + "loss": 6.9397, + "step": 2486 + }, + { + "epoch": 0.014790893519840137, + "grad_norm": 1.7546948194503784, + "learning_rate": 4.997303681597721e-05, + "loss": 6.7888, + "step": 2487 + }, + { + "epoch": 0.014796840803121134, + "grad_norm": 1.8824211359024048, + "learning_rate": 4.997301512349584e-05, + "loss": 6.6486, + "step": 2488 + }, + { + "epoch": 0.014802788086402131, + "grad_norm": 3.68865704536438, + "learning_rate": 4.9972993422296636e-05, + "loss": 7.0318, + "step": 2489 + }, + { + "epoch": 0.01480873536968313, + "grad_norm": 3.0788486003875732, + "learning_rate": 4.997297171237962e-05, + "loss": 6.814, + "step": 2490 + }, + { + "epoch": 0.014814682652964126, + "grad_norm": 2.6903607845306396, + "learning_rate": 4.997294999374481e-05, + "loss": 6.9752, + "step": 2491 + }, + { + "epoch": 0.014820629936245123, + "grad_norm": 2.6673712730407715, + "learning_rate": 4.9972928266392194e-05, + "loss": 6.9083, + "step": 2492 + }, + { + "epoch": 0.01482657721952612, + "grad_norm": 2.335632801055908, + "learning_rate": 4.9972906530321786e-05, + "loss": 7.027, + "step": 2493 + }, + { + "epoch": 0.014832524502807119, + "grad_norm": 3.2885966300964355, + "learning_rate": 4.997288478553359e-05, + "loss": 6.6551, + "step": 2494 + }, + { + "epoch": 0.014838471786088115, + "grad_norm": 2.7297918796539307, + "learning_rate": 4.997286303202762e-05, + "loss": 6.7345, + "step": 2495 + }, + { + "epoch": 0.014844419069369112, + "grad_norm": 2.640814781188965, + "learning_rate": 4.997284126980388e-05, + "loss": 6.743, + "step": 2496 + }, + { + "epoch": 0.014850366352650109, + "grad_norm": 2.699632167816162, + "learning_rate": 4.997281949886239e-05, + "loss": 6.4633, + "step": 2497 + }, + { + "epoch": 0.014856313635931106, + "grad_norm": 2.5185790061950684, + "learning_rate": 4.9972797719203135e-05, + "loss": 6.5496, + "step": 2498 + }, + { + "epoch": 0.014862260919212104, + "grad_norm": 2.659393548965454, + "learning_rate": 4.9972775930826144e-05, + "loss": 6.5066, + "step": 2499 + }, + { + "epoch": 0.014868208202493101, + "grad_norm": 2.160808563232422, + "learning_rate": 4.99727541337314e-05, + "loss": 6.9851, + "step": 2500 + }, + { + "epoch": 0.014874155485774098, + "grad_norm": 2.656506299972534, + "learning_rate": 4.997273232791894e-05, + "loss": 7.5696, + "step": 2501 + }, + { + "epoch": 0.014880102769055095, + "grad_norm": 2.490612506866455, + "learning_rate": 4.9972710513388754e-05, + "loss": 7.2623, + "step": 2502 + }, + { + "epoch": 0.014886050052336093, + "grad_norm": 2.1744866371154785, + "learning_rate": 4.997268869014085e-05, + "loss": 6.5208, + "step": 2503 + }, + { + "epoch": 0.01489199733561709, + "grad_norm": 2.8058252334594727, + "learning_rate": 4.9972666858175236e-05, + "loss": 6.1527, + "step": 2504 + }, + { + "epoch": 0.014897944618898087, + "grad_norm": 2.418827533721924, + "learning_rate": 4.997264501749193e-05, + "loss": 6.2244, + "step": 2505 + }, + { + "epoch": 0.014903891902179084, + "grad_norm": 2.499648332595825, + "learning_rate": 4.997262316809092e-05, + "loss": 6.8904, + "step": 2506 + }, + { + "epoch": 0.014909839185460082, + "grad_norm": 2.3598594665527344, + "learning_rate": 4.9972601309972235e-05, + "loss": 7.0794, + "step": 2507 + }, + { + "epoch": 0.01491578646874108, + "grad_norm": 2.2443082332611084, + "learning_rate": 4.997257944313587e-05, + "loss": 7.3078, + "step": 2508 + }, + { + "epoch": 0.014921733752022076, + "grad_norm": 2.407501459121704, + "learning_rate": 4.9972557567581835e-05, + "loss": 7.0677, + "step": 2509 + }, + { + "epoch": 0.014927681035303073, + "grad_norm": 2.060865640640259, + "learning_rate": 4.997253568331014e-05, + "loss": 6.7128, + "step": 2510 + }, + { + "epoch": 0.014933628318584071, + "grad_norm": 2.3876516819000244, + "learning_rate": 4.997251379032078e-05, + "loss": 6.7562, + "step": 2511 + }, + { + "epoch": 0.014939575601865068, + "grad_norm": 2.387176990509033, + "learning_rate": 4.997249188861379e-05, + "loss": 6.8237, + "step": 2512 + }, + { + "epoch": 0.014945522885146065, + "grad_norm": 2.7324886322021484, + "learning_rate": 4.997246997818915e-05, + "loss": 6.8963, + "step": 2513 + }, + { + "epoch": 0.014951470168427062, + "grad_norm": 2.3832128047943115, + "learning_rate": 4.997244805904689e-05, + "loss": 6.9467, + "step": 2514 + }, + { + "epoch": 0.01495741745170806, + "grad_norm": 1.8594162464141846, + "learning_rate": 4.9972426131187e-05, + "loss": 7.0712, + "step": 2515 + }, + { + "epoch": 0.014963364734989057, + "grad_norm": 2.322068691253662, + "learning_rate": 4.997240419460949e-05, + "loss": 6.8898, + "step": 2516 + }, + { + "epoch": 0.014969312018270054, + "grad_norm": 2.4850032329559326, + "learning_rate": 4.997238224931438e-05, + "loss": 6.5439, + "step": 2517 + }, + { + "epoch": 0.014975259301551051, + "grad_norm": 2.919579029083252, + "learning_rate": 4.997236029530166e-05, + "loss": 6.3987, + "step": 2518 + }, + { + "epoch": 0.01498120658483205, + "grad_norm": 2.651900053024292, + "learning_rate": 4.997233833257135e-05, + "loss": 6.2735, + "step": 2519 + }, + { + "epoch": 0.014987153868113046, + "grad_norm": 2.7912142276763916, + "learning_rate": 4.997231636112346e-05, + "loss": 6.9835, + "step": 2520 + }, + { + "epoch": 0.014993101151394043, + "grad_norm": 2.5735433101654053, + "learning_rate": 4.997229438095799e-05, + "loss": 7.1218, + "step": 2521 + }, + { + "epoch": 0.01499904843467504, + "grad_norm": 2.483186721801758, + "learning_rate": 4.997227239207494e-05, + "loss": 7.0343, + "step": 2522 + }, + { + "epoch": 0.015004995717956039, + "grad_norm": 2.9296681880950928, + "learning_rate": 4.997225039447434e-05, + "loss": 6.5455, + "step": 2523 + }, + { + "epoch": 0.015010943001237035, + "grad_norm": 2.5536422729492188, + "learning_rate": 4.997222838815618e-05, + "loss": 6.7173, + "step": 2524 + }, + { + "epoch": 0.015016890284518032, + "grad_norm": 6.365324020385742, + "learning_rate": 4.997220637312047e-05, + "loss": 6.0909, + "step": 2525 + }, + { + "epoch": 0.015022837567799029, + "grad_norm": 3.7258150577545166, + "learning_rate": 4.997218434936723e-05, + "loss": 5.9019, + "step": 2526 + }, + { + "epoch": 0.015028784851080026, + "grad_norm": 2.9021997451782227, + "learning_rate": 4.997216231689645e-05, + "loss": 5.8601, + "step": 2527 + }, + { + "epoch": 0.015034732134361024, + "grad_norm": 2.570988416671753, + "learning_rate": 4.997214027570815e-05, + "loss": 6.1513, + "step": 2528 + }, + { + "epoch": 0.015040679417642021, + "grad_norm": 3.013540029525757, + "learning_rate": 4.997211822580233e-05, + "loss": 6.6471, + "step": 2529 + }, + { + "epoch": 0.015046626700923018, + "grad_norm": 2.612210750579834, + "learning_rate": 4.997209616717901e-05, + "loss": 6.5523, + "step": 2530 + }, + { + "epoch": 0.015052573984204015, + "grad_norm": 2.93513822555542, + "learning_rate": 4.9972074099838186e-05, + "loss": 6.1845, + "step": 2531 + }, + { + "epoch": 0.015058521267485013, + "grad_norm": 3.569002389907837, + "learning_rate": 4.9972052023779865e-05, + "loss": 6.7383, + "step": 2532 + }, + { + "epoch": 0.01506446855076601, + "grad_norm": 2.560023784637451, + "learning_rate": 4.9972029939004064e-05, + "loss": 6.4978, + "step": 2533 + }, + { + "epoch": 0.015070415834047007, + "grad_norm": 2.304612398147583, + "learning_rate": 4.997200784551078e-05, + "loss": 6.3316, + "step": 2534 + }, + { + "epoch": 0.015076363117328004, + "grad_norm": 2.4442996978759766, + "learning_rate": 4.997198574330003e-05, + "loss": 6.4245, + "step": 2535 + }, + { + "epoch": 0.015082310400609002, + "grad_norm": 2.764831304550171, + "learning_rate": 4.997196363237181e-05, + "loss": 6.2251, + "step": 2536 + }, + { + "epoch": 0.01508825768389, + "grad_norm": 2.6534347534179688, + "learning_rate": 4.997194151272615e-05, + "loss": 6.6674, + "step": 2537 + }, + { + "epoch": 0.015094204967170996, + "grad_norm": 2.5901331901550293, + "learning_rate": 4.997191938436303e-05, + "loss": 6.5724, + "step": 2538 + }, + { + "epoch": 0.015100152250451993, + "grad_norm": 2.6827733516693115, + "learning_rate": 4.9971897247282474e-05, + "loss": 6.4774, + "step": 2539 + }, + { + "epoch": 0.015106099533732991, + "grad_norm": 2.087397813796997, + "learning_rate": 4.997187510148449e-05, + "loss": 6.5011, + "step": 2540 + }, + { + "epoch": 0.015112046817013988, + "grad_norm": 2.157935619354248, + "learning_rate": 4.9971852946969076e-05, + "loss": 6.3258, + "step": 2541 + }, + { + "epoch": 0.015117994100294985, + "grad_norm": 2.680481195449829, + "learning_rate": 4.997183078373625e-05, + "loss": 6.5631, + "step": 2542 + }, + { + "epoch": 0.015123941383575982, + "grad_norm": 2.897608995437622, + "learning_rate": 4.997180861178602e-05, + "loss": 6.7913, + "step": 2543 + }, + { + "epoch": 0.01512988866685698, + "grad_norm": 2.5714452266693115, + "learning_rate": 4.997178643111838e-05, + "loss": 6.767, + "step": 2544 + }, + { + "epoch": 0.015135835950137977, + "grad_norm": 2.096376419067383, + "learning_rate": 4.997176424173336e-05, + "loss": 6.7365, + "step": 2545 + }, + { + "epoch": 0.015141783233418974, + "grad_norm": 2.083101987838745, + "learning_rate": 4.9971742043630955e-05, + "loss": 6.4693, + "step": 2546 + }, + { + "epoch": 0.015147730516699971, + "grad_norm": 3.509512186050415, + "learning_rate": 4.997171983681116e-05, + "loss": 6.4068, + "step": 2547 + }, + { + "epoch": 0.01515367779998097, + "grad_norm": 3.055772304534912, + "learning_rate": 4.997169762127401e-05, + "loss": 6.3411, + "step": 2548 + }, + { + "epoch": 0.015159625083261966, + "grad_norm": 2.627429485321045, + "learning_rate": 4.997167539701949e-05, + "loss": 6.3788, + "step": 2549 + }, + { + "epoch": 0.015165572366542963, + "grad_norm": 2.408599853515625, + "learning_rate": 4.997165316404761e-05, + "loss": 6.2822, + "step": 2550 + }, + { + "epoch": 0.01517151964982396, + "grad_norm": 2.906006336212158, + "learning_rate": 4.997163092235839e-05, + "loss": 6.2615, + "step": 2551 + }, + { + "epoch": 0.015177466933104958, + "grad_norm": 2.4585347175598145, + "learning_rate": 4.997160867195183e-05, + "loss": 6.4076, + "step": 2552 + }, + { + "epoch": 0.015183414216385955, + "grad_norm": 2.495539665222168, + "learning_rate": 4.9971586412827944e-05, + "loss": 6.4893, + "step": 2553 + }, + { + "epoch": 0.015189361499666952, + "grad_norm": 2.719583034515381, + "learning_rate": 4.9971564144986734e-05, + "loss": 6.276, + "step": 2554 + }, + { + "epoch": 0.015195308782947949, + "grad_norm": 2.464207887649536, + "learning_rate": 4.9971541868428206e-05, + "loss": 6.2713, + "step": 2555 + }, + { + "epoch": 0.015201256066228947, + "grad_norm": 2.3604822158813477, + "learning_rate": 4.997151958315237e-05, + "loss": 6.2648, + "step": 2556 + }, + { + "epoch": 0.015207203349509944, + "grad_norm": 2.729820966720581, + "learning_rate": 4.997149728915924e-05, + "loss": 6.2985, + "step": 2557 + }, + { + "epoch": 0.015213150632790941, + "grad_norm": 2.565760612487793, + "learning_rate": 4.997147498644882e-05, + "loss": 6.401, + "step": 2558 + }, + { + "epoch": 0.015219097916071938, + "grad_norm": 3.091628074645996, + "learning_rate": 4.9971452675021104e-05, + "loss": 6.1774, + "step": 2559 + }, + { + "epoch": 0.015225045199352935, + "grad_norm": 2.452453851699829, + "learning_rate": 4.9971430354876125e-05, + "loss": 6.4669, + "step": 2560 + }, + { + "epoch": 0.015230992482633933, + "grad_norm": 2.4285218715667725, + "learning_rate": 4.997140802601387e-05, + "loss": 6.4086, + "step": 2561 + }, + { + "epoch": 0.01523693976591493, + "grad_norm": 2.094043254852295, + "learning_rate": 4.9971385688434356e-05, + "loss": 6.2502, + "step": 2562 + }, + { + "epoch": 0.015242887049195927, + "grad_norm": 2.5989573001861572, + "learning_rate": 4.9971363342137586e-05, + "loss": 6.2948, + "step": 2563 + }, + { + "epoch": 0.015248834332476924, + "grad_norm": 2.5372314453125, + "learning_rate": 4.9971340987123574e-05, + "loss": 6.5643, + "step": 2564 + }, + { + "epoch": 0.015254781615757922, + "grad_norm": 2.3666064739227295, + "learning_rate": 4.9971318623392325e-05, + "loss": 6.4807, + "step": 2565 + }, + { + "epoch": 0.01526072889903892, + "grad_norm": 2.3216497898101807, + "learning_rate": 4.997129625094385e-05, + "loss": 6.448, + "step": 2566 + }, + { + "epoch": 0.015266676182319916, + "grad_norm": 2.202665090560913, + "learning_rate": 4.9971273869778153e-05, + "loss": 6.3766, + "step": 2567 + }, + { + "epoch": 0.015272623465600913, + "grad_norm": 2.5678982734680176, + "learning_rate": 4.997125147989524e-05, + "loss": 6.0799, + "step": 2568 + }, + { + "epoch": 0.015278570748881911, + "grad_norm": 2.7904717922210693, + "learning_rate": 4.997122908129512e-05, + "loss": 6.3446, + "step": 2569 + }, + { + "epoch": 0.015284518032162908, + "grad_norm": 2.383120059967041, + "learning_rate": 4.99712066739778e-05, + "loss": 6.2398, + "step": 2570 + }, + { + "epoch": 0.015290465315443905, + "grad_norm": 2.4302077293395996, + "learning_rate": 4.9971184257943294e-05, + "loss": 6.2678, + "step": 2571 + }, + { + "epoch": 0.015296412598724902, + "grad_norm": 2.2923178672790527, + "learning_rate": 4.99711618331916e-05, + "loss": 6.4742, + "step": 2572 + }, + { + "epoch": 0.0153023598820059, + "grad_norm": 2.582810878753662, + "learning_rate": 4.9971139399722735e-05, + "loss": 6.4679, + "step": 2573 + }, + { + "epoch": 0.015308307165286897, + "grad_norm": 2.718228578567505, + "learning_rate": 4.997111695753671e-05, + "loss": 6.2475, + "step": 2574 + }, + { + "epoch": 0.015314254448567894, + "grad_norm": 2.4639811515808105, + "learning_rate": 4.997109450663352e-05, + "loss": 6.463, + "step": 2575 + }, + { + "epoch": 0.01532020173184889, + "grad_norm": 2.6998252868652344, + "learning_rate": 4.997107204701318e-05, + "loss": 6.2885, + "step": 2576 + }, + { + "epoch": 0.01532614901512989, + "grad_norm": 2.831291437149048, + "learning_rate": 4.997104957867569e-05, + "loss": 6.2056, + "step": 2577 + }, + { + "epoch": 0.015332096298410886, + "grad_norm": 2.9070980548858643, + "learning_rate": 4.997102710162107e-05, + "loss": 6.3247, + "step": 2578 + }, + { + "epoch": 0.015338043581691883, + "grad_norm": 2.2583134174346924, + "learning_rate": 4.997100461584933e-05, + "loss": 6.3241, + "step": 2579 + }, + { + "epoch": 0.01534399086497288, + "grad_norm": 2.1661887168884277, + "learning_rate": 4.997098212136045e-05, + "loss": 6.173, + "step": 2580 + }, + { + "epoch": 0.015349938148253878, + "grad_norm": 2.146256446838379, + "learning_rate": 4.997095961815448e-05, + "loss": 6.2267, + "step": 2581 + }, + { + "epoch": 0.015355885431534875, + "grad_norm": 2.5691211223602295, + "learning_rate": 4.997093710623139e-05, + "loss": 6.3302, + "step": 2582 + }, + { + "epoch": 0.015361832714815872, + "grad_norm": 2.5439505577087402, + "learning_rate": 4.997091458559121e-05, + "loss": 6.2111, + "step": 2583 + }, + { + "epoch": 0.015367779998096869, + "grad_norm": 2.451582670211792, + "learning_rate": 4.997089205623394e-05, + "loss": 6.2369, + "step": 2584 + }, + { + "epoch": 0.015373727281377867, + "grad_norm": 2.6275687217712402, + "learning_rate": 4.99708695181596e-05, + "loss": 6.1104, + "step": 2585 + }, + { + "epoch": 0.015379674564658864, + "grad_norm": 2.7068562507629395, + "learning_rate": 4.997084697136818e-05, + "loss": 6.1646, + "step": 2586 + }, + { + "epoch": 0.015385621847939861, + "grad_norm": 2.7819957733154297, + "learning_rate": 4.9970824415859694e-05, + "loss": 6.4203, + "step": 2587 + }, + { + "epoch": 0.015391569131220858, + "grad_norm": 2.7021708488464355, + "learning_rate": 4.9970801851634154e-05, + "loss": 6.1535, + "step": 2588 + }, + { + "epoch": 0.015397516414501855, + "grad_norm": 2.50740909576416, + "learning_rate": 4.997077927869156e-05, + "loss": 6.0139, + "step": 2589 + }, + { + "epoch": 0.015403463697782853, + "grad_norm": 2.5769078731536865, + "learning_rate": 4.997075669703193e-05, + "loss": 6.129, + "step": 2590 + }, + { + "epoch": 0.01540941098106385, + "grad_norm": 2.7379090785980225, + "learning_rate": 4.997073410665526e-05, + "loss": 6.4168, + "step": 2591 + }, + { + "epoch": 0.015415358264344847, + "grad_norm": 2.3530659675598145, + "learning_rate": 4.9970711507561565e-05, + "loss": 6.3114, + "step": 2592 + }, + { + "epoch": 0.015421305547625844, + "grad_norm": 2.6025893688201904, + "learning_rate": 4.997068889975086e-05, + "loss": 6.2506, + "step": 2593 + }, + { + "epoch": 0.015427252830906842, + "grad_norm": 2.311833143234253, + "learning_rate": 4.9970666283223145e-05, + "loss": 6.3372, + "step": 2594 + }, + { + "epoch": 0.015433200114187839, + "grad_norm": 2.339947462081909, + "learning_rate": 4.997064365797842e-05, + "loss": 6.2987, + "step": 2595 + }, + { + "epoch": 0.015439147397468836, + "grad_norm": 2.2132725715637207, + "learning_rate": 4.9970621024016714e-05, + "loss": 6.2473, + "step": 2596 + }, + { + "epoch": 0.015445094680749833, + "grad_norm": 2.7063987255096436, + "learning_rate": 4.9970598381338014e-05, + "loss": 6.1702, + "step": 2597 + }, + { + "epoch": 0.015451041964030831, + "grad_norm": 2.4952430725097656, + "learning_rate": 4.9970575729942335e-05, + "loss": 6.3301, + "step": 2598 + }, + { + "epoch": 0.015456989247311828, + "grad_norm": 2.7442502975463867, + "learning_rate": 4.997055306982969e-05, + "loss": 6.1922, + "step": 2599 + }, + { + "epoch": 0.015462936530592825, + "grad_norm": 2.860058069229126, + "learning_rate": 4.997053040100008e-05, + "loss": 6.0674, + "step": 2600 + }, + { + "epoch": 0.015468883813873822, + "grad_norm": 2.821620464324951, + "learning_rate": 4.997050772345352e-05, + "loss": 6.0445, + "step": 2601 + }, + { + "epoch": 0.01547483109715482, + "grad_norm": 2.369174003601074, + "learning_rate": 4.997048503719001e-05, + "loss": 5.8641, + "step": 2602 + }, + { + "epoch": 0.015480778380435817, + "grad_norm": 2.2836029529571533, + "learning_rate": 4.997046234220956e-05, + "loss": 5.7629, + "step": 2603 + }, + { + "epoch": 0.015486725663716814, + "grad_norm": 3.13094162940979, + "learning_rate": 4.997043963851218e-05, + "loss": 6.7871, + "step": 2604 + }, + { + "epoch": 0.01549267294699781, + "grad_norm": 2.884119749069214, + "learning_rate": 4.9970416926097885e-05, + "loss": 6.1079, + "step": 2605 + }, + { + "epoch": 0.01549862023027881, + "grad_norm": 3.0921716690063477, + "learning_rate": 4.997039420496666e-05, + "loss": 5.9221, + "step": 2606 + }, + { + "epoch": 0.015504567513559806, + "grad_norm": 2.6903741359710693, + "learning_rate": 4.997037147511855e-05, + "loss": 5.7377, + "step": 2607 + }, + { + "epoch": 0.015510514796840803, + "grad_norm": 2.177030086517334, + "learning_rate": 4.997034873655352e-05, + "loss": 5.7272, + "step": 2608 + }, + { + "epoch": 0.0155164620801218, + "grad_norm": 2.41406512260437, + "learning_rate": 4.997032598927162e-05, + "loss": 5.6456, + "step": 2609 + }, + { + "epoch": 0.015522409363402798, + "grad_norm": 2.6853182315826416, + "learning_rate": 4.997030323327282e-05, + "loss": 6.1634, + "step": 2610 + }, + { + "epoch": 0.015528356646683795, + "grad_norm": 2.734081983566284, + "learning_rate": 4.997028046855715e-05, + "loss": 6.1366, + "step": 2611 + }, + { + "epoch": 0.015534303929964792, + "grad_norm": 2.234046459197998, + "learning_rate": 4.997025769512461e-05, + "loss": 5.6773, + "step": 2612 + }, + { + "epoch": 0.015540251213245789, + "grad_norm": 2.467381715774536, + "learning_rate": 4.9970234912975226e-05, + "loss": 5.6409, + "step": 2613 + }, + { + "epoch": 0.015546198496526787, + "grad_norm": 2.4890551567077637, + "learning_rate": 4.997021212210897e-05, + "loss": 5.5961, + "step": 2614 + }, + { + "epoch": 0.015552145779807784, + "grad_norm": 2.254138708114624, + "learning_rate": 4.997018932252588e-05, + "loss": 5.6039, + "step": 2615 + }, + { + "epoch": 0.015558093063088781, + "grad_norm": 2.5773816108703613, + "learning_rate": 4.9970166514225955e-05, + "loss": 5.9935, + "step": 2616 + }, + { + "epoch": 0.015564040346369778, + "grad_norm": 2.308300733566284, + "learning_rate": 4.997014369720921e-05, + "loss": 5.8307, + "step": 2617 + }, + { + "epoch": 0.015569987629650776, + "grad_norm": 2.3276724815368652, + "learning_rate": 4.9970120871475634e-05, + "loss": 5.5819, + "step": 2618 + }, + { + "epoch": 0.015575934912931773, + "grad_norm": 2.7989203929901123, + "learning_rate": 4.997009803702526e-05, + "loss": 6.0816, + "step": 2619 + }, + { + "epoch": 0.01558188219621277, + "grad_norm": 2.5614469051361084, + "learning_rate": 4.997007519385807e-05, + "loss": 5.6677, + "step": 2620 + }, + { + "epoch": 0.015587829479493767, + "grad_norm": 2.4494402408599854, + "learning_rate": 4.9970052341974096e-05, + "loss": 5.7754, + "step": 2621 + }, + { + "epoch": 0.015593776762774764, + "grad_norm": 2.214578151702881, + "learning_rate": 4.997002948137333e-05, + "loss": 6.4244, + "step": 2622 + }, + { + "epoch": 0.015599724046055762, + "grad_norm": 2.8115196228027344, + "learning_rate": 4.9970006612055776e-05, + "loss": 5.9822, + "step": 2623 + }, + { + "epoch": 0.015605671329336759, + "grad_norm": 2.4020626544952393, + "learning_rate": 4.996998373402146e-05, + "loss": 6.0481, + "step": 2624 + }, + { + "epoch": 0.015611618612617756, + "grad_norm": 2.3936421871185303, + "learning_rate": 4.996996084727038e-05, + "loss": 6.0663, + "step": 2625 + }, + { + "epoch": 0.015617565895898753, + "grad_norm": 2.2710554599761963, + "learning_rate": 4.996993795180254e-05, + "loss": 6.0668, + "step": 2626 + }, + { + "epoch": 0.015623513179179751, + "grad_norm": 2.141789436340332, + "learning_rate": 4.9969915047617955e-05, + "loss": 6.2159, + "step": 2627 + }, + { + "epoch": 0.015629460462460748, + "grad_norm": 2.557889461517334, + "learning_rate": 4.9969892134716635e-05, + "loss": 6.262, + "step": 2628 + }, + { + "epoch": 0.015635407745741747, + "grad_norm": 2.3966641426086426, + "learning_rate": 4.9969869213098574e-05, + "loss": 6.0412, + "step": 2629 + }, + { + "epoch": 0.01564135502902274, + "grad_norm": 2.301426410675049, + "learning_rate": 4.99698462827638e-05, + "loss": 6.0798, + "step": 2630 + }, + { + "epoch": 0.01564730231230374, + "grad_norm": 2.4315614700317383, + "learning_rate": 4.996982334371231e-05, + "loss": 5.8736, + "step": 2631 + }, + { + "epoch": 0.015653249595584735, + "grad_norm": 2.5549440383911133, + "learning_rate": 4.9969800395944105e-05, + "loss": 5.7858, + "step": 2632 + }, + { + "epoch": 0.015659196878865734, + "grad_norm": 2.480375289916992, + "learning_rate": 4.99697774394592e-05, + "loss": 6.3261, + "step": 2633 + }, + { + "epoch": 0.015665144162146732, + "grad_norm": 2.42866849899292, + "learning_rate": 4.9969754474257614e-05, + "loss": 6.1729, + "step": 2634 + }, + { + "epoch": 0.015671091445427728, + "grad_norm": 2.32722544670105, + "learning_rate": 4.9969731500339335e-05, + "loss": 5.7746, + "step": 2635 + }, + { + "epoch": 0.015677038728708726, + "grad_norm": 2.6797266006469727, + "learning_rate": 4.996970851770438e-05, + "loss": 6.1657, + "step": 2636 + }, + { + "epoch": 0.015682986011989725, + "grad_norm": 2.87758731842041, + "learning_rate": 4.9969685526352775e-05, + "loss": 6.1475, + "step": 2637 + }, + { + "epoch": 0.01568893329527072, + "grad_norm": 2.898663282394409, + "learning_rate": 4.996966252628449e-05, + "loss": 6.2942, + "step": 2638 + }, + { + "epoch": 0.01569488057855172, + "grad_norm": 3.3087987899780273, + "learning_rate": 4.996963951749957e-05, + "loss": 5.9962, + "step": 2639 + }, + { + "epoch": 0.015700827861832713, + "grad_norm": 2.4418020248413086, + "learning_rate": 4.996961649999799e-05, + "loss": 6.1065, + "step": 2640 + }, + { + "epoch": 0.015706775145113712, + "grad_norm": 2.5839014053344727, + "learning_rate": 4.9969593473779786e-05, + "loss": 6.2303, + "step": 2641 + }, + { + "epoch": 0.01571272242839471, + "grad_norm": 2.683163642883301, + "learning_rate": 4.996957043884495e-05, + "loss": 5.7194, + "step": 2642 + }, + { + "epoch": 0.015718669711675706, + "grad_norm": 2.628574848175049, + "learning_rate": 4.99695473951935e-05, + "loss": 5.6239, + "step": 2643 + }, + { + "epoch": 0.015724616994956704, + "grad_norm": 3.0716800689697266, + "learning_rate": 4.9969524342825434e-05, + "loss": 6.1957, + "step": 2644 + }, + { + "epoch": 0.015730564278237703, + "grad_norm": 2.415626287460327, + "learning_rate": 4.996950128174077e-05, + "loss": 6.2953, + "step": 2645 + }, + { + "epoch": 0.015736511561518698, + "grad_norm": 2.6836612224578857, + "learning_rate": 4.996947821193951e-05, + "loss": 6.103, + "step": 2646 + }, + { + "epoch": 0.015742458844799696, + "grad_norm": 2.2673206329345703, + "learning_rate": 4.996945513342166e-05, + "loss": 6.2628, + "step": 2647 + }, + { + "epoch": 0.01574840612808069, + "grad_norm": 2.629955530166626, + "learning_rate": 4.996943204618724e-05, + "loss": 6.2444, + "step": 2648 + }, + { + "epoch": 0.01575435341136169, + "grad_norm": 2.6730127334594727, + "learning_rate": 4.996940895023623e-05, + "loss": 6.0595, + "step": 2649 + }, + { + "epoch": 0.01576030069464269, + "grad_norm": 2.607389450073242, + "learning_rate": 4.996938584556867e-05, + "loss": 6.0253, + "step": 2650 + }, + { + "epoch": 0.015766247977923684, + "grad_norm": 2.264345407485962, + "learning_rate": 4.996936273218456e-05, + "loss": 6.1011, + "step": 2651 + }, + { + "epoch": 0.015772195261204682, + "grad_norm": 2.218766450881958, + "learning_rate": 4.99693396100839e-05, + "loss": 6.0545, + "step": 2652 + }, + { + "epoch": 0.015778142544485677, + "grad_norm": 2.435213088989258, + "learning_rate": 4.99693164792667e-05, + "loss": 6.0679, + "step": 2653 + }, + { + "epoch": 0.015784089827766676, + "grad_norm": 2.2278120517730713, + "learning_rate": 4.996929333973297e-05, + "loss": 6.0864, + "step": 2654 + }, + { + "epoch": 0.015790037111047674, + "grad_norm": 1.983554482460022, + "learning_rate": 4.9969270191482715e-05, + "loss": 6.124, + "step": 2655 + }, + { + "epoch": 0.01579598439432867, + "grad_norm": 1.9382312297821045, + "learning_rate": 4.996924703451594e-05, + "loss": 6.392, + "step": 2656 + }, + { + "epoch": 0.015801931677609668, + "grad_norm": 2.8142831325531006, + "learning_rate": 4.9969223868832674e-05, + "loss": 6.017, + "step": 2657 + }, + { + "epoch": 0.015807878960890667, + "grad_norm": 2.3466787338256836, + "learning_rate": 4.9969200694432904e-05, + "loss": 5.9588, + "step": 2658 + }, + { + "epoch": 0.01581382624417166, + "grad_norm": 2.0172243118286133, + "learning_rate": 4.996917751131664e-05, + "loss": 5.9513, + "step": 2659 + }, + { + "epoch": 0.01581977352745266, + "grad_norm": 2.3778223991394043, + "learning_rate": 4.99691543194839e-05, + "loss": 6.2205, + "step": 2660 + }, + { + "epoch": 0.015825720810733655, + "grad_norm": 2.4351084232330322, + "learning_rate": 4.9969131118934675e-05, + "loss": 6.0916, + "step": 2661 + }, + { + "epoch": 0.015831668094014654, + "grad_norm": 2.22328519821167, + "learning_rate": 4.9969107909669e-05, + "loss": 6.5546, + "step": 2662 + }, + { + "epoch": 0.015837615377295652, + "grad_norm": 2.4626407623291016, + "learning_rate": 4.996908469168685e-05, + "loss": 6.522, + "step": 2663 + }, + { + "epoch": 0.015843562660576647, + "grad_norm": 2.1032283306121826, + "learning_rate": 4.9969061464988266e-05, + "loss": 6.3372, + "step": 2664 + }, + { + "epoch": 0.015849509943857646, + "grad_norm": 2.1436524391174316, + "learning_rate": 4.9969038229573236e-05, + "loss": 6.3792, + "step": 2665 + }, + { + "epoch": 0.015855457227138645, + "grad_norm": 2.42084002494812, + "learning_rate": 4.996901498544176e-05, + "loss": 6.701, + "step": 2666 + }, + { + "epoch": 0.01586140451041964, + "grad_norm": 2.854630947113037, + "learning_rate": 4.996899173259388e-05, + "loss": 6.3273, + "step": 2667 + }, + { + "epoch": 0.015867351793700638, + "grad_norm": 2.2480521202087402, + "learning_rate": 4.996896847102957e-05, + "loss": 6.4314, + "step": 2668 + }, + { + "epoch": 0.015873299076981633, + "grad_norm": 3.7074203491210938, + "learning_rate": 4.996894520074886e-05, + "loss": 5.9438, + "step": 2669 + }, + { + "epoch": 0.015879246360262632, + "grad_norm": 3.1037209033966064, + "learning_rate": 4.9968921921751735e-05, + "loss": 5.7915, + "step": 2670 + }, + { + "epoch": 0.01588519364354363, + "grad_norm": 2.8338170051574707, + "learning_rate": 4.996889863403823e-05, + "loss": 6.7765, + "step": 2671 + }, + { + "epoch": 0.015891140926824626, + "grad_norm": 2.6366934776306152, + "learning_rate": 4.996887533760833e-05, + "loss": 6.8019, + "step": 2672 + }, + { + "epoch": 0.015897088210105624, + "grad_norm": 2.3954126834869385, + "learning_rate": 4.996885203246207e-05, + "loss": 6.3946, + "step": 2673 + }, + { + "epoch": 0.015903035493386623, + "grad_norm": 2.5771238803863525, + "learning_rate": 4.996882871859943e-05, + "loss": 6.3767, + "step": 2674 + }, + { + "epoch": 0.015908982776667618, + "grad_norm": 3.8544304370880127, + "learning_rate": 4.9968805396020424e-05, + "loss": 7.0813, + "step": 2675 + }, + { + "epoch": 0.015914930059948616, + "grad_norm": 3.4221606254577637, + "learning_rate": 4.996878206472507e-05, + "loss": 6.4782, + "step": 2676 + }, + { + "epoch": 0.01592087734322961, + "grad_norm": 3.6425843238830566, + "learning_rate": 4.996875872471338e-05, + "loss": 5.8685, + "step": 2677 + }, + { + "epoch": 0.01592682462651061, + "grad_norm": 3.255345344543457, + "learning_rate": 4.996873537598535e-05, + "loss": 5.7099, + "step": 2678 + }, + { + "epoch": 0.01593277190979161, + "grad_norm": 2.5217175483703613, + "learning_rate": 4.9968712018540997e-05, + "loss": 5.8978, + "step": 2679 + }, + { + "epoch": 0.015938719193072604, + "grad_norm": 2.2415871620178223, + "learning_rate": 4.996868865238031e-05, + "loss": 6.8186, + "step": 2680 + }, + { + "epoch": 0.015944666476353602, + "grad_norm": 2.1412270069122314, + "learning_rate": 4.996866527750332e-05, + "loss": 6.8056, + "step": 2681 + }, + { + "epoch": 0.015950613759634597, + "grad_norm": 2.423093557357788, + "learning_rate": 4.996864189391004e-05, + "loss": 7.0769, + "step": 2682 + }, + { + "epoch": 0.015956561042915596, + "grad_norm": 2.2334039211273193, + "learning_rate": 4.9968618501600454e-05, + "loss": 6.9954, + "step": 2683 + }, + { + "epoch": 0.015962508326196594, + "grad_norm": 2.4311838150024414, + "learning_rate": 4.996859510057458e-05, + "loss": 6.8375, + "step": 2684 + }, + { + "epoch": 0.01596845560947759, + "grad_norm": 4.861137866973877, + "learning_rate": 4.996857169083242e-05, + "loss": 6.2628, + "step": 2685 + }, + { + "epoch": 0.015974402892758588, + "grad_norm": 3.064213991165161, + "learning_rate": 4.996854827237401e-05, + "loss": 6.4316, + "step": 2686 + }, + { + "epoch": 0.015980350176039586, + "grad_norm": 2.307011365890503, + "learning_rate": 4.996852484519932e-05, + "loss": 6.6212, + "step": 2687 + }, + { + "epoch": 0.01598629745932058, + "grad_norm": 2.5157034397125244, + "learning_rate": 4.9968501409308374e-05, + "loss": 7.153, + "step": 2688 + }, + { + "epoch": 0.01599224474260158, + "grad_norm": 2.4122424125671387, + "learning_rate": 4.996847796470119e-05, + "loss": 7.2244, + "step": 2689 + }, + { + "epoch": 0.015998192025882575, + "grad_norm": 2.305055618286133, + "learning_rate": 4.9968454511377773e-05, + "loss": 7.4751, + "step": 2690 + }, + { + "epoch": 0.016004139309163574, + "grad_norm": 3.068027973175049, + "learning_rate": 4.9968431049338116e-05, + "loss": 6.5709, + "step": 2691 + }, + { + "epoch": 0.016010086592444572, + "grad_norm": 2.09893798828125, + "learning_rate": 4.9968407578582246e-05, + "loss": 6.7212, + "step": 2692 + }, + { + "epoch": 0.016016033875725567, + "grad_norm": 2.3161933422088623, + "learning_rate": 4.9968384099110163e-05, + "loss": 6.6243, + "step": 2693 + }, + { + "epoch": 0.016021981159006566, + "grad_norm": 2.913304090499878, + "learning_rate": 4.9968360610921874e-05, + "loss": 6.1946, + "step": 2694 + }, + { + "epoch": 0.016027928442287565, + "grad_norm": 2.746368408203125, + "learning_rate": 4.9968337114017386e-05, + "loss": 6.3783, + "step": 2695 + }, + { + "epoch": 0.01603387572556856, + "grad_norm": 2.40331768989563, + "learning_rate": 4.9968313608396705e-05, + "loss": 6.9898, + "step": 2696 + }, + { + "epoch": 0.016039823008849558, + "grad_norm": 2.214869976043701, + "learning_rate": 4.9968290094059844e-05, + "loss": 6.4497, + "step": 2697 + }, + { + "epoch": 0.016045770292130553, + "grad_norm": 2.050436019897461, + "learning_rate": 4.996826657100682e-05, + "loss": 6.8897, + "step": 2698 + }, + { + "epoch": 0.016051717575411552, + "grad_norm": 2.294149398803711, + "learning_rate": 4.996824303923763e-05, + "loss": 6.5583, + "step": 2699 + }, + { + "epoch": 0.01605766485869255, + "grad_norm": 2.26918625831604, + "learning_rate": 4.996821949875228e-05, + "loss": 6.7411, + "step": 2700 + }, + { + "epoch": 0.016063612141973545, + "grad_norm": 2.1330158710479736, + "learning_rate": 4.9968195949550775e-05, + "loss": 6.8068, + "step": 2701 + }, + { + "epoch": 0.016069559425254544, + "grad_norm": 1.8605769872665405, + "learning_rate": 4.996817239163315e-05, + "loss": 6.4833, + "step": 2702 + }, + { + "epoch": 0.016075506708535543, + "grad_norm": 3.132803440093994, + "learning_rate": 4.996814882499938e-05, + "loss": 5.8281, + "step": 2703 + }, + { + "epoch": 0.016081453991816538, + "grad_norm": 3.1079390048980713, + "learning_rate": 4.996812524964949e-05, + "loss": 5.6894, + "step": 2704 + }, + { + "epoch": 0.016087401275097536, + "grad_norm": 2.2877023220062256, + "learning_rate": 4.996810166558349e-05, + "loss": 7.0128, + "step": 2705 + }, + { + "epoch": 0.01609334855837853, + "grad_norm": 2.415696859359741, + "learning_rate": 4.996807807280138e-05, + "loss": 6.8098, + "step": 2706 + }, + { + "epoch": 0.01609929584165953, + "grad_norm": 2.342111110687256, + "learning_rate": 4.996805447130317e-05, + "loss": 7.2452, + "step": 2707 + }, + { + "epoch": 0.01610524312494053, + "grad_norm": 2.6504852771759033, + "learning_rate": 4.996803086108887e-05, + "loss": 6.6731, + "step": 2708 + }, + { + "epoch": 0.016111190408221523, + "grad_norm": 2.6157166957855225, + "learning_rate": 4.996800724215849e-05, + "loss": 6.9377, + "step": 2709 + }, + { + "epoch": 0.016117137691502522, + "grad_norm": 2.6289443969726562, + "learning_rate": 4.9967983614512036e-05, + "loss": 6.639, + "step": 2710 + }, + { + "epoch": 0.01612308497478352, + "grad_norm": 2.966489791870117, + "learning_rate": 4.996795997814952e-05, + "loss": 6.3681, + "step": 2711 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 3.7333364486694336, + "learning_rate": 4.9967936333070944e-05, + "loss": 5.6015, + "step": 2712 + }, + { + "epoch": 0.016134979541345514, + "grad_norm": 2.942728281021118, + "learning_rate": 4.9967912679276316e-05, + "loss": 5.6548, + "step": 2713 + }, + { + "epoch": 0.01614092682462651, + "grad_norm": 2.394622802734375, + "learning_rate": 4.996788901676566e-05, + "loss": 6.5119, + "step": 2714 + }, + { + "epoch": 0.016146874107907508, + "grad_norm": 2.8388447761535645, + "learning_rate": 4.9967865345538963e-05, + "loss": 6.4424, + "step": 2715 + }, + { + "epoch": 0.016152821391188506, + "grad_norm": 2.7682905197143555, + "learning_rate": 4.9967841665596245e-05, + "loss": 6.4688, + "step": 2716 + }, + { + "epoch": 0.0161587686744695, + "grad_norm": 3.0281460285186768, + "learning_rate": 4.996781797693751e-05, + "loss": 6.52, + "step": 2717 + }, + { + "epoch": 0.0161647159577505, + "grad_norm": 2.9734318256378174, + "learning_rate": 4.996779427956276e-05, + "loss": 6.4307, + "step": 2718 + }, + { + "epoch": 0.016170663241031495, + "grad_norm": 2.7653586864471436, + "learning_rate": 4.996777057347202e-05, + "loss": 6.1783, + "step": 2719 + }, + { + "epoch": 0.016176610524312494, + "grad_norm": 2.9418516159057617, + "learning_rate": 4.996774685866529e-05, + "loss": 6.5466, + "step": 2720 + }, + { + "epoch": 0.016182557807593492, + "grad_norm": 2.789217233657837, + "learning_rate": 4.996772313514258e-05, + "loss": 6.9296, + "step": 2721 + }, + { + "epoch": 0.016188505090874487, + "grad_norm": 2.8092539310455322, + "learning_rate": 4.996769940290389e-05, + "loss": 6.6186, + "step": 2722 + }, + { + "epoch": 0.016194452374155486, + "grad_norm": 2.696572780609131, + "learning_rate": 4.996767566194923e-05, + "loss": 6.5361, + "step": 2723 + }, + { + "epoch": 0.016200399657436484, + "grad_norm": 2.5987300872802734, + "learning_rate": 4.996765191227862e-05, + "loss": 6.4029, + "step": 2724 + }, + { + "epoch": 0.01620634694071748, + "grad_norm": 2.083057165145874, + "learning_rate": 4.996762815389205e-05, + "loss": 6.4747, + "step": 2725 + }, + { + "epoch": 0.016212294223998478, + "grad_norm": 2.912338972091675, + "learning_rate": 4.9967604386789555e-05, + "loss": 6.8869, + "step": 2726 + }, + { + "epoch": 0.016218241507279473, + "grad_norm": 2.642224073410034, + "learning_rate": 4.9967580610971124e-05, + "loss": 6.6701, + "step": 2727 + }, + { + "epoch": 0.016224188790560472, + "grad_norm": 2.673652410507202, + "learning_rate": 4.996755682643676e-05, + "loss": 6.8624, + "step": 2728 + }, + { + "epoch": 0.01623013607384147, + "grad_norm": 2.5223872661590576, + "learning_rate": 4.996753303318648e-05, + "loss": 6.8247, + "step": 2729 + }, + { + "epoch": 0.016236083357122465, + "grad_norm": 2.252037525177002, + "learning_rate": 4.99675092312203e-05, + "loss": 6.7924, + "step": 2730 + }, + { + "epoch": 0.016242030640403464, + "grad_norm": 2.2854461669921875, + "learning_rate": 4.9967485420538216e-05, + "loss": 6.4761, + "step": 2731 + }, + { + "epoch": 0.016247977923684463, + "grad_norm": 2.426912546157837, + "learning_rate": 4.9967461601140244e-05, + "loss": 6.6028, + "step": 2732 + }, + { + "epoch": 0.016253925206965458, + "grad_norm": 2.7375681400299072, + "learning_rate": 4.9967437773026384e-05, + "loss": 6.5283, + "step": 2733 + }, + { + "epoch": 0.016259872490246456, + "grad_norm": 2.7669689655303955, + "learning_rate": 4.996741393619665e-05, + "loss": 6.4382, + "step": 2734 + }, + { + "epoch": 0.01626581977352745, + "grad_norm": 2.294597864151001, + "learning_rate": 4.996739009065105e-05, + "loss": 6.7479, + "step": 2735 + }, + { + "epoch": 0.01627176705680845, + "grad_norm": 2.4791014194488525, + "learning_rate": 4.996736623638959e-05, + "loss": 6.7043, + "step": 2736 + }, + { + "epoch": 0.01627771434008945, + "grad_norm": 2.4080021381378174, + "learning_rate": 4.9967342373412286e-05, + "loss": 6.6046, + "step": 2737 + }, + { + "epoch": 0.016283661623370443, + "grad_norm": 2.463109254837036, + "learning_rate": 4.996731850171914e-05, + "loss": 6.3895, + "step": 2738 + }, + { + "epoch": 0.016289608906651442, + "grad_norm": 2.665908098220825, + "learning_rate": 4.9967294621310155e-05, + "loss": 6.6482, + "step": 2739 + }, + { + "epoch": 0.01629555618993244, + "grad_norm": 2.399526357650757, + "learning_rate": 4.996727073218536e-05, + "loss": 6.7098, + "step": 2740 + }, + { + "epoch": 0.016301503473213436, + "grad_norm": 2.678091287612915, + "learning_rate": 4.996724683434473e-05, + "loss": 6.419, + "step": 2741 + }, + { + "epoch": 0.016307450756494434, + "grad_norm": 2.5573642253875732, + "learning_rate": 4.99672229277883e-05, + "loss": 6.4703, + "step": 2742 + }, + { + "epoch": 0.01631339803977543, + "grad_norm": 2.644097089767456, + "learning_rate": 4.996719901251607e-05, + "loss": 5.9854, + "step": 2743 + }, + { + "epoch": 0.016319345323056428, + "grad_norm": 2.6165592670440674, + "learning_rate": 4.996717508852805e-05, + "loss": 6.1776, + "step": 2744 + }, + { + "epoch": 0.016325292606337426, + "grad_norm": 2.175647020339966, + "learning_rate": 4.996715115582426e-05, + "loss": 6.5533, + "step": 2745 + }, + { + "epoch": 0.01633123988961842, + "grad_norm": 2.112217664718628, + "learning_rate": 4.996712721440467e-05, + "loss": 6.5572, + "step": 2746 + }, + { + "epoch": 0.01633718717289942, + "grad_norm": 2.165111541748047, + "learning_rate": 4.996710326426933e-05, + "loss": 6.2798, + "step": 2747 + }, + { + "epoch": 0.016343134456180415, + "grad_norm": 2.5812315940856934, + "learning_rate": 4.996707930541823e-05, + "loss": 6.0831, + "step": 2748 + }, + { + "epoch": 0.016349081739461414, + "grad_norm": 2.2306227684020996, + "learning_rate": 4.996705533785138e-05, + "loss": 6.5833, + "step": 2749 + }, + { + "epoch": 0.016355029022742412, + "grad_norm": 1.999974250793457, + "learning_rate": 4.996703136156878e-05, + "loss": 6.2461, + "step": 2750 + }, + { + "epoch": 0.016360976306023407, + "grad_norm": 2.0521416664123535, + "learning_rate": 4.996700737657046e-05, + "loss": 6.4606, + "step": 2751 + }, + { + "epoch": 0.016366923589304406, + "grad_norm": 1.8630053997039795, + "learning_rate": 4.996698338285642e-05, + "loss": 6.1375, + "step": 2752 + }, + { + "epoch": 0.016372870872585404, + "grad_norm": 1.7525913715362549, + "learning_rate": 4.9966959380426646e-05, + "loss": 6.1769, + "step": 2753 + }, + { + "epoch": 0.0163788181558664, + "grad_norm": 2.8151230812072754, + "learning_rate": 4.996693536928118e-05, + "loss": 5.9066, + "step": 2754 + }, + { + "epoch": 0.016384765439147398, + "grad_norm": 2.503230571746826, + "learning_rate": 4.9966911349420004e-05, + "loss": 6.3725, + "step": 2755 + }, + { + "epoch": 0.016390712722428393, + "grad_norm": 2.676284074783325, + "learning_rate": 4.996688732084314e-05, + "loss": 6.9086, + "step": 2756 + }, + { + "epoch": 0.01639666000570939, + "grad_norm": 2.3367252349853516, + "learning_rate": 4.99668632835506e-05, + "loss": 6.1323, + "step": 2757 + }, + { + "epoch": 0.01640260728899039, + "grad_norm": 3.3071084022521973, + "learning_rate": 4.996683923754237e-05, + "loss": 6.162, + "step": 2758 + }, + { + "epoch": 0.016408554572271385, + "grad_norm": 2.64388370513916, + "learning_rate": 4.9966815182818494e-05, + "loss": 6.171, + "step": 2759 + }, + { + "epoch": 0.016414501855552384, + "grad_norm": 2.2378199100494385, + "learning_rate": 4.996679111937895e-05, + "loss": 6.4466, + "step": 2760 + }, + { + "epoch": 0.016420449138833382, + "grad_norm": 2.5944395065307617, + "learning_rate": 4.996676704722376e-05, + "loss": 6.7034, + "step": 2761 + }, + { + "epoch": 0.016426396422114378, + "grad_norm": 2.768211841583252, + "learning_rate": 4.996674296635293e-05, + "loss": 6.7551, + "step": 2762 + }, + { + "epoch": 0.016432343705395376, + "grad_norm": 2.80188250541687, + "learning_rate": 4.9966718876766467e-05, + "loss": 6.8437, + "step": 2763 + }, + { + "epoch": 0.01643829098867637, + "grad_norm": 2.2422847747802734, + "learning_rate": 4.996669477846438e-05, + "loss": 6.5365, + "step": 2764 + }, + { + "epoch": 0.01644423827195737, + "grad_norm": 2.526724100112915, + "learning_rate": 4.996667067144668e-05, + "loss": 6.3735, + "step": 2765 + }, + { + "epoch": 0.01645018555523837, + "grad_norm": 3.2267372608184814, + "learning_rate": 4.996664655571337e-05, + "loss": 6.0508, + "step": 2766 + }, + { + "epoch": 0.016456132838519363, + "grad_norm": 3.393270969390869, + "learning_rate": 4.996662243126446e-05, + "loss": 6.5543, + "step": 2767 + }, + { + "epoch": 0.016462080121800362, + "grad_norm": 2.7712342739105225, + "learning_rate": 4.996659829809996e-05, + "loss": 6.5891, + "step": 2768 + }, + { + "epoch": 0.01646802740508136, + "grad_norm": 2.5687179565429688, + "learning_rate": 4.996657415621988e-05, + "loss": 6.464, + "step": 2769 + }, + { + "epoch": 0.016473974688362356, + "grad_norm": 3.059953451156616, + "learning_rate": 4.996655000562424e-05, + "loss": 6.4286, + "step": 2770 + }, + { + "epoch": 0.016479921971643354, + "grad_norm": 3.3729803562164307, + "learning_rate": 4.9966525846313015e-05, + "loss": 6.5937, + "step": 2771 + }, + { + "epoch": 0.01648586925492435, + "grad_norm": 2.907397985458374, + "learning_rate": 4.996650167828624e-05, + "loss": 6.2559, + "step": 2772 + }, + { + "epoch": 0.016491816538205348, + "grad_norm": 3.5011706352233887, + "learning_rate": 4.996647750154392e-05, + "loss": 5.7897, + "step": 2773 + }, + { + "epoch": 0.016497763821486346, + "grad_norm": 2.5495986938476562, + "learning_rate": 4.996645331608607e-05, + "loss": 6.688, + "step": 2774 + }, + { + "epoch": 0.01650371110476734, + "grad_norm": 2.486416816711426, + "learning_rate": 4.9966429121912675e-05, + "loss": 6.8169, + "step": 2775 + }, + { + "epoch": 0.01650965838804834, + "grad_norm": 2.272162437438965, + "learning_rate": 4.9966404919023755e-05, + "loss": 6.696, + "step": 2776 + }, + { + "epoch": 0.016515605671329335, + "grad_norm": 2.9408323764801025, + "learning_rate": 4.9966380707419334e-05, + "loss": 6.1711, + "step": 2777 + }, + { + "epoch": 0.016521552954610334, + "grad_norm": 3.361907958984375, + "learning_rate": 4.99663564870994e-05, + "loss": 5.6029, + "step": 2778 + }, + { + "epoch": 0.016527500237891332, + "grad_norm": 3.06835675239563, + "learning_rate": 4.996633225806397e-05, + "loss": 5.332, + "step": 2779 + }, + { + "epoch": 0.016533447521172327, + "grad_norm": 3.058638572692871, + "learning_rate": 4.9966308020313054e-05, + "loss": 6.3345, + "step": 2780 + }, + { + "epoch": 0.016539394804453326, + "grad_norm": 2.8265507221221924, + "learning_rate": 4.9966283773846654e-05, + "loss": 5.4231, + "step": 2781 + }, + { + "epoch": 0.016545342087734324, + "grad_norm": 3.128094434738159, + "learning_rate": 4.996625951866478e-05, + "loss": 5.4144, + "step": 2782 + }, + { + "epoch": 0.01655128937101532, + "grad_norm": 2.6830554008483887, + "learning_rate": 4.9966235254767445e-05, + "loss": 6.0084, + "step": 2783 + }, + { + "epoch": 0.016557236654296318, + "grad_norm": 2.7146122455596924, + "learning_rate": 4.996621098215466e-05, + "loss": 6.7104, + "step": 2784 + }, + { + "epoch": 0.016563183937577313, + "grad_norm": 3.518169403076172, + "learning_rate": 4.9966186700826425e-05, + "loss": 5.4509, + "step": 2785 + }, + { + "epoch": 0.01656913122085831, + "grad_norm": 2.7607035636901855, + "learning_rate": 4.9966162410782755e-05, + "loss": 6.2149, + "step": 2786 + }, + { + "epoch": 0.01657507850413931, + "grad_norm": 2.897862195968628, + "learning_rate": 4.996613811202365e-05, + "loss": 6.4713, + "step": 2787 + }, + { + "epoch": 0.016581025787420305, + "grad_norm": 2.6984574794769287, + "learning_rate": 4.9966113804549134e-05, + "loss": 6.2298, + "step": 2788 + }, + { + "epoch": 0.016586973070701304, + "grad_norm": 2.7281908988952637, + "learning_rate": 4.996608948835919e-05, + "loss": 6.0244, + "step": 2789 + }, + { + "epoch": 0.016592920353982302, + "grad_norm": 2.314769983291626, + "learning_rate": 4.996606516345386e-05, + "loss": 6.8523, + "step": 2790 + }, + { + "epoch": 0.016598867637263297, + "grad_norm": 2.887943744659424, + "learning_rate": 4.9966040829833115e-05, + "loss": 6.8407, + "step": 2791 + }, + { + "epoch": 0.016604814920544296, + "grad_norm": 3.4924309253692627, + "learning_rate": 4.9966016487497e-05, + "loss": 6.3646, + "step": 2792 + }, + { + "epoch": 0.01661076220382529, + "grad_norm": 2.3095340728759766, + "learning_rate": 4.9965992136445495e-05, + "loss": 6.407, + "step": 2793 + }, + { + "epoch": 0.01661670948710629, + "grad_norm": 3.771980047225952, + "learning_rate": 4.9965967776678627e-05, + "loss": 6.0596, + "step": 2794 + }, + { + "epoch": 0.016622656770387288, + "grad_norm": 3.452252149581909, + "learning_rate": 4.99659434081964e-05, + "loss": 6.1351, + "step": 2795 + }, + { + "epoch": 0.016628604053668283, + "grad_norm": 2.4391021728515625, + "learning_rate": 4.996591903099881e-05, + "loss": 6.3304, + "step": 2796 + }, + { + "epoch": 0.016634551336949282, + "grad_norm": 2.7057220935821533, + "learning_rate": 4.9965894645085885e-05, + "loss": 6.8328, + "step": 2797 + }, + { + "epoch": 0.01664049862023028, + "grad_norm": 2.392627716064453, + "learning_rate": 4.996587025045762e-05, + "loss": 6.8491, + "step": 2798 + }, + { + "epoch": 0.016646445903511276, + "grad_norm": 2.47928786277771, + "learning_rate": 4.9965845847114024e-05, + "loss": 6.6323, + "step": 2799 + }, + { + "epoch": 0.016652393186792274, + "grad_norm": 2.438870668411255, + "learning_rate": 4.9965821435055115e-05, + "loss": 6.3832, + "step": 2800 + }, + { + "epoch": 0.01665834047007327, + "grad_norm": 2.6875247955322266, + "learning_rate": 4.9965797014280895e-05, + "loss": 6.6994, + "step": 2801 + }, + { + "epoch": 0.016664287753354268, + "grad_norm": 2.71785044670105, + "learning_rate": 4.996577258479137e-05, + "loss": 6.2505, + "step": 2802 + }, + { + "epoch": 0.016670235036635266, + "grad_norm": 2.32853102684021, + "learning_rate": 4.996574814658655e-05, + "loss": 6.4409, + "step": 2803 + }, + { + "epoch": 0.01667618231991626, + "grad_norm": 2.271027088165283, + "learning_rate": 4.996572369966646e-05, + "loss": 6.4928, + "step": 2804 + }, + { + "epoch": 0.01668212960319726, + "grad_norm": 2.621448278427124, + "learning_rate": 4.996569924403108e-05, + "loss": 6.7248, + "step": 2805 + }, + { + "epoch": 0.01668807688647826, + "grad_norm": 3.621654748916626, + "learning_rate": 4.9965674779680435e-05, + "loss": 6.7268, + "step": 2806 + }, + { + "epoch": 0.016694024169759254, + "grad_norm": 2.2045094966888428, + "learning_rate": 4.9965650306614534e-05, + "loss": 6.6406, + "step": 2807 + }, + { + "epoch": 0.016699971453040252, + "grad_norm": 2.4885873794555664, + "learning_rate": 4.9965625824833376e-05, + "loss": 6.611, + "step": 2808 + }, + { + "epoch": 0.016705918736321247, + "grad_norm": 2.796971082687378, + "learning_rate": 4.996560133433697e-05, + "loss": 6.455, + "step": 2809 + }, + { + "epoch": 0.016711866019602246, + "grad_norm": 2.539395570755005, + "learning_rate": 4.996557683512535e-05, + "loss": 6.8169, + "step": 2810 + }, + { + "epoch": 0.016717813302883244, + "grad_norm": 2.322824239730835, + "learning_rate": 4.99655523271985e-05, + "loss": 6.3217, + "step": 2811 + }, + { + "epoch": 0.01672376058616424, + "grad_norm": 2.4404520988464355, + "learning_rate": 4.9965527810556424e-05, + "loss": 6.5026, + "step": 2812 + }, + { + "epoch": 0.016729707869445238, + "grad_norm": 2.287362575531006, + "learning_rate": 4.996550328519915e-05, + "loss": 6.9183, + "step": 2813 + }, + { + "epoch": 0.016735655152726233, + "grad_norm": 2.369877815246582, + "learning_rate": 4.996547875112667e-05, + "loss": 6.7488, + "step": 2814 + }, + { + "epoch": 0.01674160243600723, + "grad_norm": 2.323082685470581, + "learning_rate": 4.996545420833899e-05, + "loss": 6.6177, + "step": 2815 + }, + { + "epoch": 0.01674754971928823, + "grad_norm": 2.221214532852173, + "learning_rate": 4.9965429656836145e-05, + "loss": 6.6844, + "step": 2816 + }, + { + "epoch": 0.016753497002569225, + "grad_norm": 2.246819496154785, + "learning_rate": 4.9965405096618116e-05, + "loss": 6.5631, + "step": 2817 + }, + { + "epoch": 0.016759444285850224, + "grad_norm": 2.411806583404541, + "learning_rate": 4.996538052768493e-05, + "loss": 6.4037, + "step": 2818 + }, + { + "epoch": 0.016765391569131222, + "grad_norm": 1.941197395324707, + "learning_rate": 4.996535595003658e-05, + "loss": 6.5232, + "step": 2819 + }, + { + "epoch": 0.016771338852412217, + "grad_norm": 2.149991750717163, + "learning_rate": 4.996533136367309e-05, + "loss": 6.4166, + "step": 2820 + }, + { + "epoch": 0.016777286135693216, + "grad_norm": 2.5388433933258057, + "learning_rate": 4.9965306768594454e-05, + "loss": 6.5733, + "step": 2821 + }, + { + "epoch": 0.01678323341897421, + "grad_norm": 2.1857333183288574, + "learning_rate": 4.9965282164800694e-05, + "loss": 6.5558, + "step": 2822 + }, + { + "epoch": 0.01678918070225521, + "grad_norm": 2.1090164184570312, + "learning_rate": 4.9965257552291804e-05, + "loss": 6.6916, + "step": 2823 + }, + { + "epoch": 0.016795127985536208, + "grad_norm": 2.1102349758148193, + "learning_rate": 4.9965232931067806e-05, + "loss": 6.5852, + "step": 2824 + }, + { + "epoch": 0.016801075268817203, + "grad_norm": 2.384660005569458, + "learning_rate": 4.99652083011287e-05, + "loss": 6.5033, + "step": 2825 + }, + { + "epoch": 0.016807022552098202, + "grad_norm": 2.314896821975708, + "learning_rate": 4.9965183662474504e-05, + "loss": 6.4108, + "step": 2826 + }, + { + "epoch": 0.0168129698353792, + "grad_norm": 2.4358227252960205, + "learning_rate": 4.9965159015105215e-05, + "loss": 6.5309, + "step": 2827 + }, + { + "epoch": 0.016818917118660195, + "grad_norm": 2.179905652999878, + "learning_rate": 4.9965134359020844e-05, + "loss": 6.4593, + "step": 2828 + }, + { + "epoch": 0.016824864401941194, + "grad_norm": 2.2742464542388916, + "learning_rate": 4.99651096942214e-05, + "loss": 6.6654, + "step": 2829 + }, + { + "epoch": 0.01683081168522219, + "grad_norm": 2.211026668548584, + "learning_rate": 4.9965085020706906e-05, + "loss": 6.4527, + "step": 2830 + }, + { + "epoch": 0.016836758968503188, + "grad_norm": 2.552072763442993, + "learning_rate": 4.996506033847735e-05, + "loss": 6.5338, + "step": 2831 + }, + { + "epoch": 0.016842706251784186, + "grad_norm": 2.3208038806915283, + "learning_rate": 4.996503564753276e-05, + "loss": 6.473, + "step": 2832 + }, + { + "epoch": 0.01684865353506518, + "grad_norm": 2.3756048679351807, + "learning_rate": 4.996501094787312e-05, + "loss": 6.4223, + "step": 2833 + }, + { + "epoch": 0.01685460081834618, + "grad_norm": 2.386152982711792, + "learning_rate": 4.996498623949846e-05, + "loss": 6.317, + "step": 2834 + }, + { + "epoch": 0.01686054810162718, + "grad_norm": 2.144510507583618, + "learning_rate": 4.996496152240878e-05, + "loss": 6.4039, + "step": 2835 + }, + { + "epoch": 0.016866495384908173, + "grad_norm": 2.3362607955932617, + "learning_rate": 4.996493679660409e-05, + "loss": 6.5411, + "step": 2836 + }, + { + "epoch": 0.016872442668189172, + "grad_norm": 2.156428337097168, + "learning_rate": 4.9964912062084404e-05, + "loss": 6.3399, + "step": 2837 + }, + { + "epoch": 0.016878389951470167, + "grad_norm": 2.3429903984069824, + "learning_rate": 4.9964887318849715e-05, + "loss": 6.5159, + "step": 2838 + }, + { + "epoch": 0.016884337234751166, + "grad_norm": 2.1888442039489746, + "learning_rate": 4.9964862566900045e-05, + "loss": 6.3906, + "step": 2839 + }, + { + "epoch": 0.016890284518032164, + "grad_norm": 2.3973047733306885, + "learning_rate": 4.9964837806235396e-05, + "loss": 6.3452, + "step": 2840 + }, + { + "epoch": 0.01689623180131316, + "grad_norm": 2.232057809829712, + "learning_rate": 4.996481303685578e-05, + "loss": 6.5203, + "step": 2841 + }, + { + "epoch": 0.016902179084594158, + "grad_norm": 2.672342300415039, + "learning_rate": 4.996478825876122e-05, + "loss": 6.8615, + "step": 2842 + }, + { + "epoch": 0.016908126367875153, + "grad_norm": 2.603943347930908, + "learning_rate": 4.996476347195171e-05, + "loss": 7.1632, + "step": 2843 + }, + { + "epoch": 0.01691407365115615, + "grad_norm": 2.684616804122925, + "learning_rate": 4.9964738676427234e-05, + "loss": 6.5546, + "step": 2844 + }, + { + "epoch": 0.01692002093443715, + "grad_norm": 2.1103904247283936, + "learning_rate": 4.996471387218785e-05, + "loss": 6.4666, + "step": 2845 + }, + { + "epoch": 0.016925968217718145, + "grad_norm": 2.8278937339782715, + "learning_rate": 4.9964689059233525e-05, + "loss": 6.3685, + "step": 2846 + }, + { + "epoch": 0.016931915500999144, + "grad_norm": 3.2611489295959473, + "learning_rate": 4.9964664237564296e-05, + "loss": 6.5537, + "step": 2847 + }, + { + "epoch": 0.016937862784280142, + "grad_norm": 3.029353141784668, + "learning_rate": 4.9964639407180155e-05, + "loss": 6.6097, + "step": 2848 + }, + { + "epoch": 0.016943810067561137, + "grad_norm": 2.6735312938690186, + "learning_rate": 4.996461456808112e-05, + "loss": 6.5854, + "step": 2849 + }, + { + "epoch": 0.016949757350842136, + "grad_norm": 2.7619409561157227, + "learning_rate": 4.99645897202672e-05, + "loss": 6.5944, + "step": 2850 + }, + { + "epoch": 0.01695570463412313, + "grad_norm": 3.0398738384246826, + "learning_rate": 4.9964564863738396e-05, + "loss": 6.3804, + "step": 2851 + }, + { + "epoch": 0.01696165191740413, + "grad_norm": 3.5388784408569336, + "learning_rate": 4.996453999849472e-05, + "loss": 7.0993, + "step": 2852 + }, + { + "epoch": 0.016967599200685128, + "grad_norm": 2.3602113723754883, + "learning_rate": 4.9964515124536185e-05, + "loss": 6.4981, + "step": 2853 + }, + { + "epoch": 0.016973546483966123, + "grad_norm": 2.346632957458496, + "learning_rate": 4.996449024186278e-05, + "loss": 6.4892, + "step": 2854 + }, + { + "epoch": 0.016979493767247122, + "grad_norm": 2.9653544425964355, + "learning_rate": 4.996446535047454e-05, + "loss": 6.2772, + "step": 2855 + }, + { + "epoch": 0.01698544105052812, + "grad_norm": 3.1064538955688477, + "learning_rate": 4.996444045037147e-05, + "loss": 6.238, + "step": 2856 + }, + { + "epoch": 0.016991388333809115, + "grad_norm": 2.9617815017700195, + "learning_rate": 4.9964415541553564e-05, + "loss": 6.2991, + "step": 2857 + }, + { + "epoch": 0.016997335617090114, + "grad_norm": 2.5993905067443848, + "learning_rate": 4.996439062402084e-05, + "loss": 6.5482, + "step": 2858 + }, + { + "epoch": 0.01700328290037111, + "grad_norm": 2.5469226837158203, + "learning_rate": 4.996436569777331e-05, + "loss": 6.437, + "step": 2859 + }, + { + "epoch": 0.017009230183652108, + "grad_norm": 2.709184408187866, + "learning_rate": 4.9964340762810965e-05, + "loss": 6.1362, + "step": 2860 + }, + { + "epoch": 0.017015177466933106, + "grad_norm": 2.843942880630493, + "learning_rate": 4.9964315819133837e-05, + "loss": 6.2443, + "step": 2861 + }, + { + "epoch": 0.0170211247502141, + "grad_norm": 3.022735357284546, + "learning_rate": 4.9964290866741925e-05, + "loss": 6.3161, + "step": 2862 + }, + { + "epoch": 0.0170270720334951, + "grad_norm": 2.487271308898926, + "learning_rate": 4.996426590563523e-05, + "loss": 6.3352, + "step": 2863 + }, + { + "epoch": 0.0170330193167761, + "grad_norm": 2.624000072479248, + "learning_rate": 4.996424093581377e-05, + "loss": 6.3575, + "step": 2864 + }, + { + "epoch": 0.017038966600057093, + "grad_norm": 2.378368854522705, + "learning_rate": 4.996421595727756e-05, + "loss": 6.3284, + "step": 2865 + }, + { + "epoch": 0.017044913883338092, + "grad_norm": 2.6903984546661377, + "learning_rate": 4.996419097002659e-05, + "loss": 6.271, + "step": 2866 + }, + { + "epoch": 0.017050861166619087, + "grad_norm": 2.536391019821167, + "learning_rate": 4.9964165974060875e-05, + "loss": 6.1276, + "step": 2867 + }, + { + "epoch": 0.017056808449900086, + "grad_norm": 2.470395803451538, + "learning_rate": 4.9964140969380434e-05, + "loss": 6.1032, + "step": 2868 + }, + { + "epoch": 0.017062755733181084, + "grad_norm": 2.929818630218506, + "learning_rate": 4.996411595598528e-05, + "loss": 6.0994, + "step": 2869 + }, + { + "epoch": 0.01706870301646208, + "grad_norm": 2.548701763153076, + "learning_rate": 4.99640909338754e-05, + "loss": 6.2227, + "step": 2870 + }, + { + "epoch": 0.017074650299743078, + "grad_norm": 2.6044397354125977, + "learning_rate": 4.99640659030508e-05, + "loss": 6.0778, + "step": 2871 + }, + { + "epoch": 0.017080597583024073, + "grad_norm": 2.687392473220825, + "learning_rate": 4.996404086351153e-05, + "loss": 6.2975, + "step": 2872 + }, + { + "epoch": 0.01708654486630507, + "grad_norm": 2.740201711654663, + "learning_rate": 4.9964015815257556e-05, + "loss": 6.5955, + "step": 2873 + }, + { + "epoch": 0.01709249214958607, + "grad_norm": 2.605958938598633, + "learning_rate": 4.99639907582889e-05, + "loss": 6.2112, + "step": 2874 + }, + { + "epoch": 0.017098439432867065, + "grad_norm": 2.9691529273986816, + "learning_rate": 4.996396569260558e-05, + "loss": 6.1435, + "step": 2875 + }, + { + "epoch": 0.017104386716148064, + "grad_norm": 2.822201728820801, + "learning_rate": 4.9963940618207593e-05, + "loss": 6.1949, + "step": 2876 + }, + { + "epoch": 0.017110333999429062, + "grad_norm": 2.6231529712677, + "learning_rate": 4.996391553509495e-05, + "loss": 6.5082, + "step": 2877 + }, + { + "epoch": 0.017116281282710057, + "grad_norm": 2.6511785984039307, + "learning_rate": 4.9963890443267666e-05, + "loss": 6.4461, + "step": 2878 + }, + { + "epoch": 0.017122228565991056, + "grad_norm": 2.4790167808532715, + "learning_rate": 4.996386534272575e-05, + "loss": 6.4642, + "step": 2879 + }, + { + "epoch": 0.01712817584927205, + "grad_norm": 3.6982533931732178, + "learning_rate": 4.99638402334692e-05, + "loss": 6.2957, + "step": 2880 + }, + { + "epoch": 0.01713412313255305, + "grad_norm": 2.380385160446167, + "learning_rate": 4.996381511549804e-05, + "loss": 6.3174, + "step": 2881 + }, + { + "epoch": 0.017140070415834048, + "grad_norm": 2.425537347793579, + "learning_rate": 4.996378998881226e-05, + "loss": 6.2055, + "step": 2882 + }, + { + "epoch": 0.017146017699115043, + "grad_norm": 2.4667842388153076, + "learning_rate": 4.996376485341188e-05, + "loss": 6.245, + "step": 2883 + }, + { + "epoch": 0.01715196498239604, + "grad_norm": 2.6306424140930176, + "learning_rate": 4.996373970929691e-05, + "loss": 6.1162, + "step": 2884 + }, + { + "epoch": 0.01715791226567704, + "grad_norm": 4.439255714416504, + "learning_rate": 4.996371455646736e-05, + "loss": 5.9868, + "step": 2885 + }, + { + "epoch": 0.017163859548958035, + "grad_norm": 3.3248472213745117, + "learning_rate": 4.9963689394923224e-05, + "loss": 5.861, + "step": 2886 + }, + { + "epoch": 0.017169806832239034, + "grad_norm": 2.45271897315979, + "learning_rate": 4.996366422466453e-05, + "loss": 6.1588, + "step": 2887 + }, + { + "epoch": 0.01717575411552003, + "grad_norm": 3.1748130321502686, + "learning_rate": 4.996363904569128e-05, + "loss": 6.3607, + "step": 2888 + }, + { + "epoch": 0.017181701398801028, + "grad_norm": 3.300736427307129, + "learning_rate": 4.996361385800348e-05, + "loss": 6.0709, + "step": 2889 + }, + { + "epoch": 0.017187648682082026, + "grad_norm": 2.720550060272217, + "learning_rate": 4.9963588661601136e-05, + "loss": 6.0496, + "step": 2890 + }, + { + "epoch": 0.01719359596536302, + "grad_norm": 2.251845121383667, + "learning_rate": 4.9963563456484266e-05, + "loss": 6.0088, + "step": 2891 + }, + { + "epoch": 0.01719954324864402, + "grad_norm": 2.7863035202026367, + "learning_rate": 4.996353824265288e-05, + "loss": 5.9478, + "step": 2892 + }, + { + "epoch": 0.01720549053192502, + "grad_norm": 2.831744432449341, + "learning_rate": 4.996351302010697e-05, + "loss": 6.1629, + "step": 2893 + }, + { + "epoch": 0.017211437815206013, + "grad_norm": 4.583891868591309, + "learning_rate": 4.9963487788846556e-05, + "loss": 6.7936, + "step": 2894 + }, + { + "epoch": 0.017217385098487012, + "grad_norm": 2.4525468349456787, + "learning_rate": 4.996346254887165e-05, + "loss": 6.3188, + "step": 2895 + }, + { + "epoch": 0.017223332381768007, + "grad_norm": 3.0866281986236572, + "learning_rate": 4.9963437300182254e-05, + "loss": 6.0207, + "step": 2896 + }, + { + "epoch": 0.017229279665049006, + "grad_norm": 3.1188113689422607, + "learning_rate": 4.996341204277838e-05, + "loss": 5.9873, + "step": 2897 + }, + { + "epoch": 0.017235226948330004, + "grad_norm": 2.4119350910186768, + "learning_rate": 4.996338677666004e-05, + "loss": 5.8104, + "step": 2898 + }, + { + "epoch": 0.017241174231611, + "grad_norm": 1.9601647853851318, + "learning_rate": 4.996336150182724e-05, + "loss": 6.2166, + "step": 2899 + }, + { + "epoch": 0.017247121514891998, + "grad_norm": 3.428379535675049, + "learning_rate": 4.9963336218279986e-05, + "loss": 6.4284, + "step": 2900 + }, + { + "epoch": 0.017253068798172993, + "grad_norm": 2.629446506500244, + "learning_rate": 4.996331092601829e-05, + "loss": 6.4916, + "step": 2901 + }, + { + "epoch": 0.01725901608145399, + "grad_norm": 2.3860316276550293, + "learning_rate": 4.996328562504216e-05, + "loss": 6.5035, + "step": 2902 + }, + { + "epoch": 0.01726496336473499, + "grad_norm": 2.6754682064056396, + "learning_rate": 4.996326031535161e-05, + "loss": 6.6374, + "step": 2903 + }, + { + "epoch": 0.017270910648015985, + "grad_norm": 2.737901210784912, + "learning_rate": 4.9963234996946635e-05, + "loss": 6.5023, + "step": 2904 + }, + { + "epoch": 0.017276857931296984, + "grad_norm": 2.481691837310791, + "learning_rate": 4.996320966982726e-05, + "loss": 6.5211, + "step": 2905 + }, + { + "epoch": 0.017282805214577982, + "grad_norm": 3.3993568420410156, + "learning_rate": 4.996318433399348e-05, + "loss": 6.4239, + "step": 2906 + }, + { + "epoch": 0.017288752497858977, + "grad_norm": 3.9149057865142822, + "learning_rate": 4.9963158989445316e-05, + "loss": 6.3874, + "step": 2907 + }, + { + "epoch": 0.017294699781139976, + "grad_norm": 2.3808562755584717, + "learning_rate": 4.996313363618276e-05, + "loss": 6.2887, + "step": 2908 + }, + { + "epoch": 0.01730064706442097, + "grad_norm": 2.6186649799346924, + "learning_rate": 4.996310827420585e-05, + "loss": 6.2944, + "step": 2909 + }, + { + "epoch": 0.01730659434770197, + "grad_norm": 2.5251142978668213, + "learning_rate": 4.9963082903514554e-05, + "loss": 6.0944, + "step": 2910 + }, + { + "epoch": 0.017312541630982968, + "grad_norm": 2.8212270736694336, + "learning_rate": 4.9963057524108926e-05, + "loss": 6.6621, + "step": 2911 + }, + { + "epoch": 0.017318488914263963, + "grad_norm": 2.477485418319702, + "learning_rate": 4.996303213598894e-05, + "loss": 6.3941, + "step": 2912 + }, + { + "epoch": 0.01732443619754496, + "grad_norm": 3.6508305072784424, + "learning_rate": 4.996300673915462e-05, + "loss": 6.3234, + "step": 2913 + }, + { + "epoch": 0.01733038348082596, + "grad_norm": 2.1635468006134033, + "learning_rate": 4.996298133360598e-05, + "loss": 6.2877, + "step": 2914 + }, + { + "epoch": 0.017336330764106955, + "grad_norm": 3.431082010269165, + "learning_rate": 4.9962955919343004e-05, + "loss": 6.2627, + "step": 2915 + }, + { + "epoch": 0.017342278047387954, + "grad_norm": 3.272376775741577, + "learning_rate": 4.9962930496365736e-05, + "loss": 6.1458, + "step": 2916 + }, + { + "epoch": 0.01734822533066895, + "grad_norm": 3.5927000045776367, + "learning_rate": 4.996290506467415e-05, + "loss": 5.9828, + "step": 2917 + }, + { + "epoch": 0.017354172613949947, + "grad_norm": 3.569641351699829, + "learning_rate": 4.996287962426829e-05, + "loss": 6.5957, + "step": 2918 + }, + { + "epoch": 0.017360119897230946, + "grad_norm": 3.281855344772339, + "learning_rate": 4.9962854175148134e-05, + "loss": 6.3393, + "step": 2919 + }, + { + "epoch": 0.01736606718051194, + "grad_norm": 2.6009061336517334, + "learning_rate": 4.9962828717313706e-05, + "loss": 6.3537, + "step": 2920 + }, + { + "epoch": 0.01737201446379294, + "grad_norm": 3.964467763900757, + "learning_rate": 4.996280325076501e-05, + "loss": 6.0281, + "step": 2921 + }, + { + "epoch": 0.017377961747073938, + "grad_norm": 3.9164865016937256, + "learning_rate": 4.9962777775502064e-05, + "loss": 6.5255, + "step": 2922 + }, + { + "epoch": 0.017383909030354933, + "grad_norm": 2.349709987640381, + "learning_rate": 4.996275229152486e-05, + "loss": 6.2459, + "step": 2923 + }, + { + "epoch": 0.017389856313635932, + "grad_norm": 2.5735161304473877, + "learning_rate": 4.9962726798833425e-05, + "loss": 6.0463, + "step": 2924 + }, + { + "epoch": 0.017395803596916927, + "grad_norm": 2.228271961212158, + "learning_rate": 4.9962701297427764e-05, + "loss": 6.1147, + "step": 2925 + }, + { + "epoch": 0.017401750880197926, + "grad_norm": 2.4587175846099854, + "learning_rate": 4.9962675787307875e-05, + "loss": 7.0868, + "step": 2926 + }, + { + "epoch": 0.017407698163478924, + "grad_norm": 2.2712674140930176, + "learning_rate": 4.996265026847378e-05, + "loss": 6.175, + "step": 2927 + }, + { + "epoch": 0.01741364544675992, + "grad_norm": 3.0724384784698486, + "learning_rate": 4.996262474092547e-05, + "loss": 6.5354, + "step": 2928 + }, + { + "epoch": 0.017419592730040918, + "grad_norm": 4.872220039367676, + "learning_rate": 4.996259920466297e-05, + "loss": 6.1938, + "step": 2929 + }, + { + "epoch": 0.017425540013321916, + "grad_norm": 4.508706569671631, + "learning_rate": 4.996257365968629e-05, + "loss": 6.1813, + "step": 2930 + }, + { + "epoch": 0.01743148729660291, + "grad_norm": 3.0419485569000244, + "learning_rate": 4.996254810599543e-05, + "loss": 5.9529, + "step": 2931 + }, + { + "epoch": 0.01743743457988391, + "grad_norm": 2.8372066020965576, + "learning_rate": 4.996252254359041e-05, + "loss": 5.9422, + "step": 2932 + }, + { + "epoch": 0.017443381863164905, + "grad_norm": 4.554285526275635, + "learning_rate": 4.996249697247122e-05, + "loss": 6.9073, + "step": 2933 + }, + { + "epoch": 0.017449329146445904, + "grad_norm": 3.121094226837158, + "learning_rate": 4.996247139263788e-05, + "loss": 6.2827, + "step": 2934 + }, + { + "epoch": 0.017455276429726902, + "grad_norm": 3.936596632003784, + "learning_rate": 4.996244580409041e-05, + "loss": 6.7863, + "step": 2935 + }, + { + "epoch": 0.017461223713007897, + "grad_norm": 3.5771539211273193, + "learning_rate": 4.99624202068288e-05, + "loss": 7.0691, + "step": 2936 + }, + { + "epoch": 0.017467170996288896, + "grad_norm": 2.0674471855163574, + "learning_rate": 4.996239460085307e-05, + "loss": 6.9768, + "step": 2937 + }, + { + "epoch": 0.01747311827956989, + "grad_norm": 2.600167989730835, + "learning_rate": 4.996236898616322e-05, + "loss": 6.4235, + "step": 2938 + }, + { + "epoch": 0.01747906556285089, + "grad_norm": 2.9444847106933594, + "learning_rate": 4.9962343362759267e-05, + "loss": 6.7305, + "step": 2939 + }, + { + "epoch": 0.017485012846131888, + "grad_norm": 3.721101999282837, + "learning_rate": 4.996231773064122e-05, + "loss": 6.5147, + "step": 2940 + }, + { + "epoch": 0.017490960129412883, + "grad_norm": 5.715269565582275, + "learning_rate": 4.9962292089809086e-05, + "loss": 6.1433, + "step": 2941 + }, + { + "epoch": 0.01749690741269388, + "grad_norm": 4.245530128479004, + "learning_rate": 4.996226644026287e-05, + "loss": 6.2163, + "step": 2942 + }, + { + "epoch": 0.01750285469597488, + "grad_norm": 2.7717039585113525, + "learning_rate": 4.996224078200259e-05, + "loss": 5.877, + "step": 2943 + }, + { + "epoch": 0.017508801979255875, + "grad_norm": 3.4189441204071045, + "learning_rate": 4.9962215115028255e-05, + "loss": 5.9575, + "step": 2944 + }, + { + "epoch": 0.017514749262536874, + "grad_norm": 3.754513740539551, + "learning_rate": 4.996218943933986e-05, + "loss": 5.7512, + "step": 2945 + }, + { + "epoch": 0.01752069654581787, + "grad_norm": 3.4231228828430176, + "learning_rate": 4.9962163754937426e-05, + "loss": 6.4566, + "step": 2946 + }, + { + "epoch": 0.017526643829098867, + "grad_norm": 2.7481472492218018, + "learning_rate": 4.996213806182095e-05, + "loss": 6.1385, + "step": 2947 + }, + { + "epoch": 0.017532591112379866, + "grad_norm": 2.802342414855957, + "learning_rate": 4.996211235999046e-05, + "loss": 5.6656, + "step": 2948 + }, + { + "epoch": 0.01753853839566086, + "grad_norm": 2.60530686378479, + "learning_rate": 4.996208664944595e-05, + "loss": 5.7339, + "step": 2949 + }, + { + "epoch": 0.01754448567894186, + "grad_norm": 2.476100206375122, + "learning_rate": 4.996206093018744e-05, + "loss": 6.0447, + "step": 2950 + }, + { + "epoch": 0.017550432962222858, + "grad_norm": 2.3516924381256104, + "learning_rate": 4.9962035202214916e-05, + "loss": 6.2046, + "step": 2951 + }, + { + "epoch": 0.017556380245503853, + "grad_norm": 2.447519302368164, + "learning_rate": 4.996200946552842e-05, + "loss": 6.0279, + "step": 2952 + }, + { + "epoch": 0.017562327528784852, + "grad_norm": 2.679766893386841, + "learning_rate": 4.996198372012794e-05, + "loss": 5.9072, + "step": 2953 + }, + { + "epoch": 0.017568274812065847, + "grad_norm": 2.3413944244384766, + "learning_rate": 4.9961957966013486e-05, + "loss": 5.9214, + "step": 2954 + }, + { + "epoch": 0.017574222095346845, + "grad_norm": 2.273725986480713, + "learning_rate": 4.996193220318507e-05, + "loss": 6.2107, + "step": 2955 + }, + { + "epoch": 0.017580169378627844, + "grad_norm": 2.9424052238464355, + "learning_rate": 4.99619064316427e-05, + "loss": 5.8618, + "step": 2956 + }, + { + "epoch": 0.01758611666190884, + "grad_norm": 2.40987229347229, + "learning_rate": 4.9961880651386394e-05, + "loss": 6.1306, + "step": 2957 + }, + { + "epoch": 0.017592063945189838, + "grad_norm": 2.542084217071533, + "learning_rate": 4.9961854862416144e-05, + "loss": 6.2225, + "step": 2958 + }, + { + "epoch": 0.017598011228470836, + "grad_norm": 2.06935977935791, + "learning_rate": 4.996182906473198e-05, + "loss": 5.9899, + "step": 2959 + }, + { + "epoch": 0.01760395851175183, + "grad_norm": 2.1998584270477295, + "learning_rate": 4.99618032583339e-05, + "loss": 6.2268, + "step": 2960 + }, + { + "epoch": 0.01760990579503283, + "grad_norm": 2.5595617294311523, + "learning_rate": 4.99617774432219e-05, + "loss": 6.2856, + "step": 2961 + }, + { + "epoch": 0.017615853078313825, + "grad_norm": 2.9262382984161377, + "learning_rate": 4.9961751619396e-05, + "loss": 6.2747, + "step": 2962 + }, + { + "epoch": 0.017621800361594823, + "grad_norm": 2.3705809116363525, + "learning_rate": 4.996172578685622e-05, + "loss": 6.1376, + "step": 2963 + }, + { + "epoch": 0.017627747644875822, + "grad_norm": 2.20991849899292, + "learning_rate": 4.996169994560256e-05, + "loss": 6.0118, + "step": 2964 + }, + { + "epoch": 0.017633694928156817, + "grad_norm": 2.2801706790924072, + "learning_rate": 4.996167409563502e-05, + "loss": 6.0924, + "step": 2965 + }, + { + "epoch": 0.017639642211437816, + "grad_norm": 2.5618062019348145, + "learning_rate": 4.996164823695362e-05, + "loss": 6.0931, + "step": 2966 + }, + { + "epoch": 0.01764558949471881, + "grad_norm": 2.2933573722839355, + "learning_rate": 4.996162236955837e-05, + "loss": 6.1584, + "step": 2967 + }, + { + "epoch": 0.01765153677799981, + "grad_norm": 2.2387471199035645, + "learning_rate": 4.996159649344928e-05, + "loss": 6.1224, + "step": 2968 + }, + { + "epoch": 0.017657484061280808, + "grad_norm": 2.425929069519043, + "learning_rate": 4.9961570608626347e-05, + "loss": 6.2419, + "step": 2969 + }, + { + "epoch": 0.017663431344561803, + "grad_norm": 3.0279812812805176, + "learning_rate": 4.996154471508959e-05, + "loss": 6.0478, + "step": 2970 + }, + { + "epoch": 0.0176693786278428, + "grad_norm": 2.8950276374816895, + "learning_rate": 4.9961518812839015e-05, + "loss": 5.9663, + "step": 2971 + }, + { + "epoch": 0.0176753259111238, + "grad_norm": 2.9908859729766846, + "learning_rate": 4.996149290187463e-05, + "loss": 5.8101, + "step": 2972 + }, + { + "epoch": 0.017681273194404795, + "grad_norm": 2.900987148284912, + "learning_rate": 4.996146698219645e-05, + "loss": 6.133, + "step": 2973 + }, + { + "epoch": 0.017687220477685794, + "grad_norm": 3.3194754123687744, + "learning_rate": 4.996144105380447e-05, + "loss": 5.9763, + "step": 2974 + }, + { + "epoch": 0.01769316776096679, + "grad_norm": 2.4997923374176025, + "learning_rate": 4.996141511669872e-05, + "loss": 6.1062, + "step": 2975 + }, + { + "epoch": 0.017699115044247787, + "grad_norm": 2.3048369884490967, + "learning_rate": 4.996138917087919e-05, + "loss": 6.138, + "step": 2976 + }, + { + "epoch": 0.017705062327528786, + "grad_norm": 2.3391027450561523, + "learning_rate": 4.99613632163459e-05, + "loss": 6.0612, + "step": 2977 + }, + { + "epoch": 0.01771100961080978, + "grad_norm": 2.6164605617523193, + "learning_rate": 4.996133725309886e-05, + "loss": 6.0402, + "step": 2978 + }, + { + "epoch": 0.01771695689409078, + "grad_norm": 2.6534295082092285, + "learning_rate": 4.996131128113807e-05, + "loss": 5.9027, + "step": 2979 + }, + { + "epoch": 0.017722904177371778, + "grad_norm": 2.1807172298431396, + "learning_rate": 4.996128530046354e-05, + "loss": 5.7083, + "step": 2980 + }, + { + "epoch": 0.017728851460652773, + "grad_norm": 2.433762550354004, + "learning_rate": 4.9961259311075296e-05, + "loss": 6.1587, + "step": 2981 + }, + { + "epoch": 0.017734798743933772, + "grad_norm": 2.4656107425689697, + "learning_rate": 4.996123331297333e-05, + "loss": 5.9831, + "step": 2982 + }, + { + "epoch": 0.017740746027214767, + "grad_norm": 2.536060333251953, + "learning_rate": 4.996120730615765e-05, + "loss": 5.9083, + "step": 2983 + }, + { + "epoch": 0.017746693310495765, + "grad_norm": 2.2993409633636475, + "learning_rate": 4.996118129062828e-05, + "loss": 6.0156, + "step": 2984 + }, + { + "epoch": 0.017752640593776764, + "grad_norm": 2.0221481323242188, + "learning_rate": 4.996115526638521e-05, + "loss": 5.9836, + "step": 2985 + }, + { + "epoch": 0.01775858787705776, + "grad_norm": 2.401350498199463, + "learning_rate": 4.996112923342846e-05, + "loss": 5.8071, + "step": 2986 + }, + { + "epoch": 0.017764535160338758, + "grad_norm": 2.469214677810669, + "learning_rate": 4.996110319175804e-05, + "loss": 5.8784, + "step": 2987 + }, + { + "epoch": 0.017770482443619756, + "grad_norm": 2.454481601715088, + "learning_rate": 4.9961077141373955e-05, + "loss": 5.9168, + "step": 2988 + }, + { + "epoch": 0.01777642972690075, + "grad_norm": 2.3173487186431885, + "learning_rate": 4.996105108227621e-05, + "loss": 5.8797, + "step": 2989 + }, + { + "epoch": 0.01778237701018175, + "grad_norm": 2.1967554092407227, + "learning_rate": 4.996102501446483e-05, + "loss": 5.972, + "step": 2990 + }, + { + "epoch": 0.017788324293462745, + "grad_norm": 2.1263201236724854, + "learning_rate": 4.996099893793981e-05, + "loss": 5.9301, + "step": 2991 + }, + { + "epoch": 0.017794271576743743, + "grad_norm": 2.1959195137023926, + "learning_rate": 4.9960972852701165e-05, + "loss": 6.0422, + "step": 2992 + }, + { + "epoch": 0.017800218860024742, + "grad_norm": 2.3290374279022217, + "learning_rate": 4.99609467587489e-05, + "loss": 6.1926, + "step": 2993 + }, + { + "epoch": 0.017806166143305737, + "grad_norm": 2.3518059253692627, + "learning_rate": 4.996092065608303e-05, + "loss": 5.8583, + "step": 2994 + }, + { + "epoch": 0.017812113426586736, + "grad_norm": 2.4263339042663574, + "learning_rate": 4.996089454470355e-05, + "loss": 5.8149, + "step": 2995 + }, + { + "epoch": 0.01781806070986773, + "grad_norm": 2.0764389038085938, + "learning_rate": 4.99608684246105e-05, + "loss": 5.8782, + "step": 2996 + }, + { + "epoch": 0.01782400799314873, + "grad_norm": 2.086904764175415, + "learning_rate": 4.996084229580385e-05, + "loss": 5.7885, + "step": 2997 + }, + { + "epoch": 0.017829955276429728, + "grad_norm": 2.1907291412353516, + "learning_rate": 4.996081615828363e-05, + "loss": 5.9246, + "step": 2998 + }, + { + "epoch": 0.017835902559710723, + "grad_norm": 2.4596495628356934, + "learning_rate": 4.9960790012049854e-05, + "loss": 5.7786, + "step": 2999 + }, + { + "epoch": 0.01784184984299172, + "grad_norm": 2.0762453079223633, + "learning_rate": 4.996076385710252e-05, + "loss": 5.9901, + "step": 3000 + }, + { + "epoch": 0.01784779712627272, + "grad_norm": 2.068714141845703, + "learning_rate": 4.996073769344164e-05, + "loss": 5.9437, + "step": 3001 + }, + { + "epoch": 0.017853744409553715, + "grad_norm": 2.4760496616363525, + "learning_rate": 4.9960711521067226e-05, + "loss": 5.8633, + "step": 3002 + }, + { + "epoch": 0.017859691692834714, + "grad_norm": 2.395643949508667, + "learning_rate": 4.996068533997928e-05, + "loss": 5.8024, + "step": 3003 + }, + { + "epoch": 0.01786563897611571, + "grad_norm": 2.120586633682251, + "learning_rate": 4.996065915017783e-05, + "loss": 6.0712, + "step": 3004 + }, + { + "epoch": 0.017871586259396707, + "grad_norm": 2.384794235229492, + "learning_rate": 4.9960632951662866e-05, + "loss": 5.9089, + "step": 3005 + }, + { + "epoch": 0.017877533542677706, + "grad_norm": 2.24297833442688, + "learning_rate": 4.99606067444344e-05, + "loss": 6.0263, + "step": 3006 + }, + { + "epoch": 0.0178834808259587, + "grad_norm": 1.983299732208252, + "learning_rate": 4.996058052849245e-05, + "loss": 5.8706, + "step": 3007 + }, + { + "epoch": 0.0178894281092397, + "grad_norm": 2.2866950035095215, + "learning_rate": 4.996055430383701e-05, + "loss": 5.9031, + "step": 3008 + }, + { + "epoch": 0.017895375392520698, + "grad_norm": 2.3343560695648193, + "learning_rate": 4.996052807046811e-05, + "loss": 5.9155, + "step": 3009 + }, + { + "epoch": 0.017901322675801693, + "grad_norm": 2.079763650894165, + "learning_rate": 4.9960501828385734e-05, + "loss": 5.8102, + "step": 3010 + }, + { + "epoch": 0.01790726995908269, + "grad_norm": 2.0398895740509033, + "learning_rate": 4.996047557758991e-05, + "loss": 5.773, + "step": 3011 + }, + { + "epoch": 0.017913217242363687, + "grad_norm": 2.2478318214416504, + "learning_rate": 4.996044931808064e-05, + "loss": 5.8584, + "step": 3012 + }, + { + "epoch": 0.017919164525644685, + "grad_norm": 2.301398992538452, + "learning_rate": 4.996042304985794e-05, + "loss": 5.9053, + "step": 3013 + }, + { + "epoch": 0.017925111808925684, + "grad_norm": 2.0428216457366943, + "learning_rate": 4.996039677292181e-05, + "loss": 5.9571, + "step": 3014 + }, + { + "epoch": 0.01793105909220668, + "grad_norm": 2.049572467803955, + "learning_rate": 4.9960370487272266e-05, + "loss": 5.9464, + "step": 3015 + }, + { + "epoch": 0.017937006375487678, + "grad_norm": 2.1681618690490723, + "learning_rate": 4.996034419290931e-05, + "loss": 5.9969, + "step": 3016 + }, + { + "epoch": 0.017942953658768676, + "grad_norm": 2.3879425525665283, + "learning_rate": 4.996031788983296e-05, + "loss": 5.7962, + "step": 3017 + }, + { + "epoch": 0.01794890094204967, + "grad_norm": 2.232508420944214, + "learning_rate": 4.996029157804323e-05, + "loss": 5.8479, + "step": 3018 + }, + { + "epoch": 0.01795484822533067, + "grad_norm": 2.222257137298584, + "learning_rate": 4.9960265257540104e-05, + "loss": 5.952, + "step": 3019 + }, + { + "epoch": 0.017960795508611665, + "grad_norm": 2.213777542114258, + "learning_rate": 4.996023892832362e-05, + "loss": 5.9891, + "step": 3020 + }, + { + "epoch": 0.017966742791892663, + "grad_norm": 2.286097764968872, + "learning_rate": 4.996021259039377e-05, + "loss": 5.8995, + "step": 3021 + }, + { + "epoch": 0.017972690075173662, + "grad_norm": 2.1588432788848877, + "learning_rate": 4.996018624375056e-05, + "loss": 5.988, + "step": 3022 + }, + { + "epoch": 0.017978637358454657, + "grad_norm": 2.2468602657318115, + "learning_rate": 4.996015988839402e-05, + "loss": 5.9303, + "step": 3023 + }, + { + "epoch": 0.017984584641735656, + "grad_norm": 2.1732120513916016, + "learning_rate": 4.9960133524324135e-05, + "loss": 5.8696, + "step": 3024 + }, + { + "epoch": 0.01799053192501665, + "grad_norm": 2.2985105514526367, + "learning_rate": 4.996010715154093e-05, + "loss": 5.9251, + "step": 3025 + }, + { + "epoch": 0.01799647920829765, + "grad_norm": 2.1920788288116455, + "learning_rate": 4.996008077004441e-05, + "loss": 5.8023, + "step": 3026 + }, + { + "epoch": 0.018002426491578648, + "grad_norm": 1.9393725395202637, + "learning_rate": 4.996005437983458e-05, + "loss": 5.9576, + "step": 3027 + }, + { + "epoch": 0.018008373774859643, + "grad_norm": 2.115035057067871, + "learning_rate": 4.9960027980911455e-05, + "loss": 5.9105, + "step": 3028 + }, + { + "epoch": 0.01801432105814064, + "grad_norm": 2.143432855606079, + "learning_rate": 4.996000157327504e-05, + "loss": 5.9951, + "step": 3029 + }, + { + "epoch": 0.01802026834142164, + "grad_norm": 2.4353296756744385, + "learning_rate": 4.995997515692536e-05, + "loss": 5.9761, + "step": 3030 + }, + { + "epoch": 0.018026215624702635, + "grad_norm": 1.999054193496704, + "learning_rate": 4.995994873186239e-05, + "loss": 6.028, + "step": 3031 + }, + { + "epoch": 0.018032162907983634, + "grad_norm": 2.05645751953125, + "learning_rate": 4.995992229808617e-05, + "loss": 5.9778, + "step": 3032 + }, + { + "epoch": 0.01803811019126463, + "grad_norm": 1.948923110961914, + "learning_rate": 4.99598958555967e-05, + "loss": 5.8735, + "step": 3033 + }, + { + "epoch": 0.018044057474545627, + "grad_norm": 2.1208486557006836, + "learning_rate": 4.995986940439399e-05, + "loss": 5.7913, + "step": 3034 + }, + { + "epoch": 0.018050004757826626, + "grad_norm": 2.051079750061035, + "learning_rate": 4.995984294447804e-05, + "loss": 5.8097, + "step": 3035 + }, + { + "epoch": 0.01805595204110762, + "grad_norm": 2.021207571029663, + "learning_rate": 4.995981647584887e-05, + "loss": 5.8425, + "step": 3036 + }, + { + "epoch": 0.01806189932438862, + "grad_norm": 2.471315622329712, + "learning_rate": 4.995978999850649e-05, + "loss": 5.7735, + "step": 3037 + }, + { + "epoch": 0.018067846607669618, + "grad_norm": 2.604836940765381, + "learning_rate": 4.9959763512450896e-05, + "loss": 6.4525, + "step": 3038 + }, + { + "epoch": 0.018073793890950613, + "grad_norm": 2.375361919403076, + "learning_rate": 4.995973701768212e-05, + "loss": 5.8072, + "step": 3039 + }, + { + "epoch": 0.01807974117423161, + "grad_norm": 2.354280471801758, + "learning_rate": 4.995971051420014e-05, + "loss": 5.9434, + "step": 3040 + }, + { + "epoch": 0.018085688457512607, + "grad_norm": 2.7335755825042725, + "learning_rate": 4.9959684002005e-05, + "loss": 5.5899, + "step": 3041 + }, + { + "epoch": 0.018091635740793605, + "grad_norm": 2.244917869567871, + "learning_rate": 4.995965748109668e-05, + "loss": 5.799, + "step": 3042 + }, + { + "epoch": 0.018097583024074604, + "grad_norm": 2.2413697242736816, + "learning_rate": 4.995963095147521e-05, + "loss": 5.8635, + "step": 3043 + }, + { + "epoch": 0.0181035303073556, + "grad_norm": 2.122586488723755, + "learning_rate": 4.9959604413140584e-05, + "loss": 5.8098, + "step": 3044 + }, + { + "epoch": 0.018109477590636597, + "grad_norm": 2.407517910003662, + "learning_rate": 4.995957786609282e-05, + "loss": 6.0319, + "step": 3045 + }, + { + "epoch": 0.018115424873917596, + "grad_norm": 2.5628743171691895, + "learning_rate": 4.9959551310331934e-05, + "loss": 5.9561, + "step": 3046 + }, + { + "epoch": 0.01812137215719859, + "grad_norm": 2.335650682449341, + "learning_rate": 4.995952474585791e-05, + "loss": 6.1168, + "step": 3047 + }, + { + "epoch": 0.01812731944047959, + "grad_norm": 2.169771432876587, + "learning_rate": 4.995949817267078e-05, + "loss": 6.0555, + "step": 3048 + }, + { + "epoch": 0.018133266723760585, + "grad_norm": 2.2245211601257324, + "learning_rate": 4.995947159077056e-05, + "loss": 5.9084, + "step": 3049 + }, + { + "epoch": 0.018139214007041583, + "grad_norm": 2.2296931743621826, + "learning_rate": 4.995944500015723e-05, + "loss": 5.8878, + "step": 3050 + }, + { + "epoch": 0.018145161290322582, + "grad_norm": 2.2372493743896484, + "learning_rate": 4.995941840083082e-05, + "loss": 5.9521, + "step": 3051 + }, + { + "epoch": 0.018151108573603577, + "grad_norm": 2.1773006916046143, + "learning_rate": 4.995939179279134e-05, + "loss": 5.899, + "step": 3052 + }, + { + "epoch": 0.018157055856884576, + "grad_norm": 2.218245267868042, + "learning_rate": 4.995936517603879e-05, + "loss": 6.0311, + "step": 3053 + }, + { + "epoch": 0.018163003140165574, + "grad_norm": 2.2877273559570312, + "learning_rate": 4.995933855057318e-05, + "loss": 6.0052, + "step": 3054 + }, + { + "epoch": 0.01816895042344657, + "grad_norm": 2.225764751434326, + "learning_rate": 4.995931191639453e-05, + "loss": 6.0373, + "step": 3055 + }, + { + "epoch": 0.018174897706727568, + "grad_norm": 2.5069313049316406, + "learning_rate": 4.995928527350284e-05, + "loss": 5.8729, + "step": 3056 + }, + { + "epoch": 0.018180844990008563, + "grad_norm": 2.089759588241577, + "learning_rate": 4.995925862189812e-05, + "loss": 5.9462, + "step": 3057 + }, + { + "epoch": 0.01818679227328956, + "grad_norm": 2.0159049034118652, + "learning_rate": 4.9959231961580376e-05, + "loss": 5.9276, + "step": 3058 + }, + { + "epoch": 0.01819273955657056, + "grad_norm": 2.207636594772339, + "learning_rate": 4.995920529254963e-05, + "loss": 5.9921, + "step": 3059 + }, + { + "epoch": 0.018198686839851555, + "grad_norm": 2.380232810974121, + "learning_rate": 4.995917861480588e-05, + "loss": 5.9092, + "step": 3060 + }, + { + "epoch": 0.018204634123132554, + "grad_norm": 2.073237895965576, + "learning_rate": 4.9959151928349134e-05, + "loss": 5.8472, + "step": 3061 + }, + { + "epoch": 0.01821058140641355, + "grad_norm": 1.824062705039978, + "learning_rate": 4.995912523317942e-05, + "loss": 5.7958, + "step": 3062 + }, + { + "epoch": 0.018216528689694547, + "grad_norm": 2.3961215019226074, + "learning_rate": 4.995909852929672e-05, + "loss": 6.1388, + "step": 3063 + }, + { + "epoch": 0.018222475972975546, + "grad_norm": 2.8391239643096924, + "learning_rate": 4.9959071816701065e-05, + "loss": 5.7564, + "step": 3064 + }, + { + "epoch": 0.01822842325625654, + "grad_norm": 2.4684112071990967, + "learning_rate": 4.995904509539244e-05, + "loss": 5.8372, + "step": 3065 + }, + { + "epoch": 0.01823437053953754, + "grad_norm": 2.419983386993408, + "learning_rate": 4.995901836537089e-05, + "loss": 5.9332, + "step": 3066 + }, + { + "epoch": 0.018240317822818538, + "grad_norm": 2.500227928161621, + "learning_rate": 4.99589916266364e-05, + "loss": 6.0848, + "step": 3067 + }, + { + "epoch": 0.018246265106099533, + "grad_norm": 2.1683971881866455, + "learning_rate": 4.9958964879188976e-05, + "loss": 6.0911, + "step": 3068 + }, + { + "epoch": 0.01825221238938053, + "grad_norm": 2.2345223426818848, + "learning_rate": 4.995893812302864e-05, + "loss": 6.016, + "step": 3069 + }, + { + "epoch": 0.018258159672661527, + "grad_norm": 2.318321466445923, + "learning_rate": 4.995891135815539e-05, + "loss": 5.9622, + "step": 3070 + }, + { + "epoch": 0.018264106955942525, + "grad_norm": 2.294602155685425, + "learning_rate": 4.9958884584569255e-05, + "loss": 5.8908, + "step": 3071 + }, + { + "epoch": 0.018270054239223524, + "grad_norm": 2.5472419261932373, + "learning_rate": 4.995885780227022e-05, + "loss": 5.7906, + "step": 3072 + }, + { + "epoch": 0.01827600152250452, + "grad_norm": 2.319101095199585, + "learning_rate": 4.995883101125831e-05, + "loss": 6.3366, + "step": 3073 + }, + { + "epoch": 0.018281948805785517, + "grad_norm": 2.3564186096191406, + "learning_rate": 4.995880421153353e-05, + "loss": 5.9863, + "step": 3074 + }, + { + "epoch": 0.018287896089066516, + "grad_norm": 2.434756278991699, + "learning_rate": 4.995877740309589e-05, + "loss": 5.885, + "step": 3075 + }, + { + "epoch": 0.01829384337234751, + "grad_norm": 2.062861442565918, + "learning_rate": 4.99587505859454e-05, + "loss": 6.0813, + "step": 3076 + }, + { + "epoch": 0.01829979065562851, + "grad_norm": 2.127049684524536, + "learning_rate": 4.995872376008206e-05, + "loss": 6.1226, + "step": 3077 + }, + { + "epoch": 0.018305737938909505, + "grad_norm": 2.288405656814575, + "learning_rate": 4.995869692550589e-05, + "loss": 5.9625, + "step": 3078 + }, + { + "epoch": 0.018311685222190503, + "grad_norm": 2.2387006282806396, + "learning_rate": 4.9958670082216905e-05, + "loss": 5.9479, + "step": 3079 + }, + { + "epoch": 0.018317632505471502, + "grad_norm": 2.18864107131958, + "learning_rate": 4.9958643230215096e-05, + "loss": 5.9223, + "step": 3080 + }, + { + "epoch": 0.018323579788752497, + "grad_norm": 2.3457415103912354, + "learning_rate": 4.995861636950049e-05, + "loss": 5.7857, + "step": 3081 + }, + { + "epoch": 0.018329527072033495, + "grad_norm": 2.6946494579315186, + "learning_rate": 4.995858950007309e-05, + "loss": 5.5546, + "step": 3082 + }, + { + "epoch": 0.018335474355314494, + "grad_norm": 2.5135412216186523, + "learning_rate": 4.99585626219329e-05, + "loss": 5.5624, + "step": 3083 + }, + { + "epoch": 0.01834142163859549, + "grad_norm": 2.6617767810821533, + "learning_rate": 4.9958535735079934e-05, + "loss": 5.8789, + "step": 3084 + }, + { + "epoch": 0.018347368921876488, + "grad_norm": 2.099261522293091, + "learning_rate": 4.9958508839514196e-05, + "loss": 5.9365, + "step": 3085 + }, + { + "epoch": 0.018353316205157483, + "grad_norm": 2.5267064571380615, + "learning_rate": 4.9958481935235715e-05, + "loss": 6.0935, + "step": 3086 + }, + { + "epoch": 0.01835926348843848, + "grad_norm": 2.3353283405303955, + "learning_rate": 4.995845502224447e-05, + "loss": 5.909, + "step": 3087 + }, + { + "epoch": 0.01836521077171948, + "grad_norm": 2.396430492401123, + "learning_rate": 4.9958428100540496e-05, + "loss": 6.0272, + "step": 3088 + }, + { + "epoch": 0.018371158055000475, + "grad_norm": 2.095308303833008, + "learning_rate": 4.9958401170123784e-05, + "loss": 5.9791, + "step": 3089 + }, + { + "epoch": 0.018377105338281473, + "grad_norm": 2.7606077194213867, + "learning_rate": 4.9958374230994357e-05, + "loss": 5.9716, + "step": 3090 + }, + { + "epoch": 0.01838305262156247, + "grad_norm": 2.4490914344787598, + "learning_rate": 4.995834728315222e-05, + "loss": 5.8763, + "step": 3091 + }, + { + "epoch": 0.018388999904843467, + "grad_norm": 2.709092855453491, + "learning_rate": 4.9958320326597385e-05, + "loss": 5.74, + "step": 3092 + }, + { + "epoch": 0.018394947188124466, + "grad_norm": 2.8829305171966553, + "learning_rate": 4.9958293361329856e-05, + "loss": 5.8469, + "step": 3093 + }, + { + "epoch": 0.01840089447140546, + "grad_norm": 2.6500396728515625, + "learning_rate": 4.995826638734964e-05, + "loss": 5.8578, + "step": 3094 + }, + { + "epoch": 0.01840684175468646, + "grad_norm": 2.0665056705474854, + "learning_rate": 4.9958239404656755e-05, + "loss": 5.9662, + "step": 3095 + }, + { + "epoch": 0.018412789037967458, + "grad_norm": 2.3198931217193604, + "learning_rate": 4.9958212413251205e-05, + "loss": 6.0663, + "step": 3096 + }, + { + "epoch": 0.018418736321248453, + "grad_norm": 2.9056031703948975, + "learning_rate": 4.9958185413133e-05, + "loss": 5.8015, + "step": 3097 + }, + { + "epoch": 0.01842468360452945, + "grad_norm": 2.446164131164551, + "learning_rate": 4.995815840430216e-05, + "loss": 5.6878, + "step": 3098 + }, + { + "epoch": 0.018430630887810447, + "grad_norm": 2.797506093978882, + "learning_rate": 4.995813138675867e-05, + "loss": 5.7675, + "step": 3099 + }, + { + "epoch": 0.018436578171091445, + "grad_norm": 3.2914962768554688, + "learning_rate": 4.995810436050256e-05, + "loss": 6.3661, + "step": 3100 + }, + { + "epoch": 0.018442525454372444, + "grad_norm": 2.444363594055176, + "learning_rate": 4.995807732553384e-05, + "loss": 5.9251, + "step": 3101 + }, + { + "epoch": 0.01844847273765344, + "grad_norm": 2.526951551437378, + "learning_rate": 4.9958050281852505e-05, + "loss": 5.8202, + "step": 3102 + }, + { + "epoch": 0.018454420020934437, + "grad_norm": 2.2046117782592773, + "learning_rate": 4.995802322945857e-05, + "loss": 6.0572, + "step": 3103 + }, + { + "epoch": 0.018460367304215436, + "grad_norm": 2.5484018325805664, + "learning_rate": 4.9957996168352055e-05, + "loss": 6.1215, + "step": 3104 + }, + { + "epoch": 0.01846631458749643, + "grad_norm": 2.4785003662109375, + "learning_rate": 4.9957969098532965e-05, + "loss": 5.9524, + "step": 3105 + }, + { + "epoch": 0.01847226187077743, + "grad_norm": 2.9028711318969727, + "learning_rate": 4.9957942020001294e-05, + "loss": 6.1175, + "step": 3106 + }, + { + "epoch": 0.018478209154058425, + "grad_norm": 2.1766602993011475, + "learning_rate": 4.995791493275707e-05, + "loss": 5.9746, + "step": 3107 + }, + { + "epoch": 0.018484156437339423, + "grad_norm": 2.079423189163208, + "learning_rate": 4.995788783680029e-05, + "loss": 5.9463, + "step": 3108 + }, + { + "epoch": 0.018490103720620422, + "grad_norm": 2.285184144973755, + "learning_rate": 4.995786073213098e-05, + "loss": 5.5174, + "step": 3109 + }, + { + "epoch": 0.018496051003901417, + "grad_norm": 2.170018196105957, + "learning_rate": 4.9957833618749126e-05, + "loss": 5.7948, + "step": 3110 + }, + { + "epoch": 0.018501998287182415, + "grad_norm": 2.284517526626587, + "learning_rate": 4.9957806496654754e-05, + "loss": 5.9455, + "step": 3111 + }, + { + "epoch": 0.018507945570463414, + "grad_norm": 2.5539982318878174, + "learning_rate": 4.9957779365847876e-05, + "loss": 5.9791, + "step": 3112 + }, + { + "epoch": 0.01851389285374441, + "grad_norm": 2.1735522747039795, + "learning_rate": 4.995775222632849e-05, + "loss": 5.9549, + "step": 3113 + }, + { + "epoch": 0.018519840137025408, + "grad_norm": 2.2272653579711914, + "learning_rate": 4.995772507809662e-05, + "loss": 5.8618, + "step": 3114 + }, + { + "epoch": 0.018525787420306403, + "grad_norm": 1.9390417337417603, + "learning_rate": 4.995769792115225e-05, + "loss": 5.9617, + "step": 3115 + }, + { + "epoch": 0.0185317347035874, + "grad_norm": 2.6526312828063965, + "learning_rate": 4.9957670755495414e-05, + "loss": 5.9296, + "step": 3116 + }, + { + "epoch": 0.0185376819868684, + "grad_norm": 2.533996105194092, + "learning_rate": 4.995764358112611e-05, + "loss": 6.0045, + "step": 3117 + }, + { + "epoch": 0.018543629270149395, + "grad_norm": 2.183347225189209, + "learning_rate": 4.995761639804436e-05, + "loss": 5.9254, + "step": 3118 + }, + { + "epoch": 0.018549576553430393, + "grad_norm": 1.9411321878433228, + "learning_rate": 4.995758920625015e-05, + "loss": 5.9404, + "step": 3119 + }, + { + "epoch": 0.01855552383671139, + "grad_norm": 4.914453029632568, + "learning_rate": 4.9957562005743514e-05, + "loss": 5.8139, + "step": 3120 + }, + { + "epoch": 0.018561471119992387, + "grad_norm": 2.3052754402160645, + "learning_rate": 4.9957534796524444e-05, + "loss": 5.6525, + "step": 3121 + }, + { + "epoch": 0.018567418403273386, + "grad_norm": 2.424464464187622, + "learning_rate": 4.995750757859296e-05, + "loss": 5.9599, + "step": 3122 + }, + { + "epoch": 0.01857336568655438, + "grad_norm": 2.1392033100128174, + "learning_rate": 4.995748035194907e-05, + "loss": 5.9558, + "step": 3123 + }, + { + "epoch": 0.01857931296983538, + "grad_norm": 4.67656135559082, + "learning_rate": 4.995745311659278e-05, + "loss": 5.7606, + "step": 3124 + }, + { + "epoch": 0.018585260253116378, + "grad_norm": 2.0772082805633545, + "learning_rate": 4.99574258725241e-05, + "loss": 5.9328, + "step": 3125 + }, + { + "epoch": 0.018591207536397373, + "grad_norm": 2.0255486965179443, + "learning_rate": 4.995739861974303e-05, + "loss": 5.9395, + "step": 3126 + }, + { + "epoch": 0.01859715481967837, + "grad_norm": 2.3629064559936523, + "learning_rate": 4.995737135824961e-05, + "loss": 5.9663, + "step": 3127 + }, + { + "epoch": 0.018603102102959367, + "grad_norm": 1.9924237728118896, + "learning_rate": 4.9957344088043814e-05, + "loss": 5.8998, + "step": 3128 + }, + { + "epoch": 0.018609049386240365, + "grad_norm": 2.096774101257324, + "learning_rate": 4.9957316809125676e-05, + "loss": 5.7178, + "step": 3129 + }, + { + "epoch": 0.018614996669521364, + "grad_norm": 2.2288100719451904, + "learning_rate": 4.9957289521495194e-05, + "loss": 5.9096, + "step": 3130 + }, + { + "epoch": 0.01862094395280236, + "grad_norm": 2.456099033355713, + "learning_rate": 4.995726222515238e-05, + "loss": 5.7738, + "step": 3131 + }, + { + "epoch": 0.018626891236083357, + "grad_norm": 2.238218069076538, + "learning_rate": 4.995723492009724e-05, + "loss": 5.6929, + "step": 3132 + }, + { + "epoch": 0.018632838519364356, + "grad_norm": 1.8309845924377441, + "learning_rate": 4.9957207606329795e-05, + "loss": 5.9339, + "step": 3133 + }, + { + "epoch": 0.01863878580264535, + "grad_norm": 1.9269503355026245, + "learning_rate": 4.995718028385003e-05, + "loss": 5.9704, + "step": 3134 + }, + { + "epoch": 0.01864473308592635, + "grad_norm": 2.0929813385009766, + "learning_rate": 4.9957152952657995e-05, + "loss": 5.7598, + "step": 3135 + }, + { + "epoch": 0.018650680369207345, + "grad_norm": 2.2813265323638916, + "learning_rate": 4.995712561275366e-05, + "loss": 5.7986, + "step": 3136 + }, + { + "epoch": 0.018656627652488343, + "grad_norm": 2.1189653873443604, + "learning_rate": 4.995709826413705e-05, + "loss": 5.6603, + "step": 3137 + }, + { + "epoch": 0.01866257493576934, + "grad_norm": 2.1439480781555176, + "learning_rate": 4.9957070906808185e-05, + "loss": 5.6952, + "step": 3138 + }, + { + "epoch": 0.018668522219050337, + "grad_norm": 2.4345993995666504, + "learning_rate": 4.995704354076706e-05, + "loss": 5.7531, + "step": 3139 + }, + { + "epoch": 0.018674469502331335, + "grad_norm": 2.5551047325134277, + "learning_rate": 4.995701616601368e-05, + "loss": 5.544, + "step": 3140 + }, + { + "epoch": 0.018680416785612334, + "grad_norm": 2.333603620529175, + "learning_rate": 4.9956988782548075e-05, + "loss": 5.5732, + "step": 3141 + }, + { + "epoch": 0.01868636406889333, + "grad_norm": 2.2983827590942383, + "learning_rate": 4.995696139037024e-05, + "loss": 5.8779, + "step": 3142 + }, + { + "epoch": 0.018692311352174328, + "grad_norm": 2.7525672912597656, + "learning_rate": 4.995693398948018e-05, + "loss": 5.5998, + "step": 3143 + }, + { + "epoch": 0.018698258635455323, + "grad_norm": 2.3622052669525146, + "learning_rate": 4.995690657987793e-05, + "loss": 5.8851, + "step": 3144 + }, + { + "epoch": 0.01870420591873632, + "grad_norm": 2.4975669384002686, + "learning_rate": 4.995687916156346e-05, + "loss": 5.6388, + "step": 3145 + }, + { + "epoch": 0.01871015320201732, + "grad_norm": 2.5763049125671387, + "learning_rate": 4.9956851734536816e-05, + "loss": 5.4931, + "step": 3146 + }, + { + "epoch": 0.018716100485298315, + "grad_norm": 2.7156779766082764, + "learning_rate": 4.995682429879799e-05, + "loss": 5.8035, + "step": 3147 + }, + { + "epoch": 0.018722047768579313, + "grad_norm": 2.259134292602539, + "learning_rate": 4.995679685434699e-05, + "loss": 5.9519, + "step": 3148 + }, + { + "epoch": 0.018727995051860312, + "grad_norm": 2.544829845428467, + "learning_rate": 4.995676940118383e-05, + "loss": 5.7373, + "step": 3149 + }, + { + "epoch": 0.018733942335141307, + "grad_norm": 2.326660633087158, + "learning_rate": 4.995674193930853e-05, + "loss": 5.7719, + "step": 3150 + }, + { + "epoch": 0.018739889618422306, + "grad_norm": 2.25370192527771, + "learning_rate": 4.995671446872108e-05, + "loss": 5.813, + "step": 3151 + }, + { + "epoch": 0.0187458369017033, + "grad_norm": 2.1467692852020264, + "learning_rate": 4.99566869894215e-05, + "loss": 5.5836, + "step": 3152 + }, + { + "epoch": 0.0187517841849843, + "grad_norm": 2.30096697807312, + "learning_rate": 4.9956659501409796e-05, + "loss": 5.8249, + "step": 3153 + }, + { + "epoch": 0.018757731468265298, + "grad_norm": 2.3050386905670166, + "learning_rate": 4.9956632004685986e-05, + "loss": 5.6806, + "step": 3154 + }, + { + "epoch": 0.018763678751546293, + "grad_norm": 2.473008632659912, + "learning_rate": 4.995660449925007e-05, + "loss": 5.4512, + "step": 3155 + }, + { + "epoch": 0.01876962603482729, + "grad_norm": 2.0691702365875244, + "learning_rate": 4.995657698510206e-05, + "loss": 5.6582, + "step": 3156 + }, + { + "epoch": 0.018775573318108287, + "grad_norm": 2.332423686981201, + "learning_rate": 4.995654946224197e-05, + "loss": 5.6017, + "step": 3157 + }, + { + "epoch": 0.018781520601389285, + "grad_norm": 2.6423730850219727, + "learning_rate": 4.9956521930669806e-05, + "loss": 5.619, + "step": 3158 + }, + { + "epoch": 0.018787467884670284, + "grad_norm": 3.0884950160980225, + "learning_rate": 4.995649439038558e-05, + "loss": 5.7813, + "step": 3159 + }, + { + "epoch": 0.01879341516795128, + "grad_norm": 2.4923598766326904, + "learning_rate": 4.995646684138929e-05, + "loss": 5.8089, + "step": 3160 + }, + { + "epoch": 0.018799362451232277, + "grad_norm": 2.5505683422088623, + "learning_rate": 4.9956439283680965e-05, + "loss": 5.8171, + "step": 3161 + }, + { + "epoch": 0.018805309734513276, + "grad_norm": 2.7343056201934814, + "learning_rate": 4.99564117172606e-05, + "loss": 6.3472, + "step": 3162 + }, + { + "epoch": 0.01881125701779427, + "grad_norm": 2.9170796871185303, + "learning_rate": 4.995638414212821e-05, + "loss": 5.7478, + "step": 3163 + }, + { + "epoch": 0.01881720430107527, + "grad_norm": 2.392648696899414, + "learning_rate": 4.9956356558283815e-05, + "loss": 5.8105, + "step": 3164 + }, + { + "epoch": 0.018823151584356265, + "grad_norm": 2.532207727432251, + "learning_rate": 4.9956328965727394e-05, + "loss": 5.9285, + "step": 3165 + }, + { + "epoch": 0.018829098867637263, + "grad_norm": 2.6717050075531006, + "learning_rate": 4.995630136445899e-05, + "loss": 6.0344, + "step": 3166 + }, + { + "epoch": 0.01883504615091826, + "grad_norm": 2.1829564571380615, + "learning_rate": 4.99562737544786e-05, + "loss": 6.0078, + "step": 3167 + }, + { + "epoch": 0.018840993434199257, + "grad_norm": 2.2728323936462402, + "learning_rate": 4.995624613578622e-05, + "loss": 5.8211, + "step": 3168 + }, + { + "epoch": 0.018846940717480255, + "grad_norm": 2.046717882156372, + "learning_rate": 4.995621850838189e-05, + "loss": 5.9685, + "step": 3169 + }, + { + "epoch": 0.018852888000761254, + "grad_norm": 2.737494945526123, + "learning_rate": 4.995619087226559e-05, + "loss": 5.649, + "step": 3170 + }, + { + "epoch": 0.01885883528404225, + "grad_norm": 2.276503801345825, + "learning_rate": 4.9956163227437345e-05, + "loss": 5.8137, + "step": 3171 + }, + { + "epoch": 0.018864782567323247, + "grad_norm": 2.2799227237701416, + "learning_rate": 4.9956135573897155e-05, + "loss": 5.8277, + "step": 3172 + }, + { + "epoch": 0.018870729850604243, + "grad_norm": 2.131425619125366, + "learning_rate": 4.995610791164505e-05, + "loss": 5.8909, + "step": 3173 + }, + { + "epoch": 0.01887667713388524, + "grad_norm": 2.2295737266540527, + "learning_rate": 4.995608024068102e-05, + "loss": 5.8236, + "step": 3174 + }, + { + "epoch": 0.01888262441716624, + "grad_norm": 2.30082631111145, + "learning_rate": 4.9956052561005076e-05, + "loss": 5.7331, + "step": 3175 + }, + { + "epoch": 0.018888571700447235, + "grad_norm": 2.751847505569458, + "learning_rate": 4.9956024872617225e-05, + "loss": 5.8673, + "step": 3176 + }, + { + "epoch": 0.018894518983728233, + "grad_norm": 2.4597535133361816, + "learning_rate": 4.995599717551749e-05, + "loss": 5.7561, + "step": 3177 + }, + { + "epoch": 0.018900466267009232, + "grad_norm": 2.1418228149414062, + "learning_rate": 4.9955969469705874e-05, + "loss": 5.7112, + "step": 3178 + }, + { + "epoch": 0.018906413550290227, + "grad_norm": 2.0560619831085205, + "learning_rate": 4.9955941755182395e-05, + "loss": 5.7764, + "step": 3179 + }, + { + "epoch": 0.018912360833571226, + "grad_norm": 2.268781900405884, + "learning_rate": 4.9955914031947046e-05, + "loss": 5.7319, + "step": 3180 + }, + { + "epoch": 0.01891830811685222, + "grad_norm": 2.6272811889648438, + "learning_rate": 4.995588629999985e-05, + "loss": 6.0601, + "step": 3181 + }, + { + "epoch": 0.01892425540013322, + "grad_norm": 2.1991870403289795, + "learning_rate": 4.995585855934081e-05, + "loss": 5.602, + "step": 3182 + }, + { + "epoch": 0.018930202683414218, + "grad_norm": 2.0521514415740967, + "learning_rate": 4.995583080996994e-05, + "loss": 5.8075, + "step": 3183 + }, + { + "epoch": 0.018936149966695213, + "grad_norm": 2.153473138809204, + "learning_rate": 4.995580305188724e-05, + "loss": 5.8219, + "step": 3184 + }, + { + "epoch": 0.01894209724997621, + "grad_norm": 2.0663251876831055, + "learning_rate": 4.9955775285092735e-05, + "loss": 5.836, + "step": 3185 + }, + { + "epoch": 0.018948044533257206, + "grad_norm": 1.8808318376541138, + "learning_rate": 4.995574750958642e-05, + "loss": 5.7938, + "step": 3186 + }, + { + "epoch": 0.018953991816538205, + "grad_norm": 2.256012201309204, + "learning_rate": 4.995571972536831e-05, + "loss": 5.6404, + "step": 3187 + }, + { + "epoch": 0.018959939099819204, + "grad_norm": 2.29636287689209, + "learning_rate": 4.995569193243843e-05, + "loss": 5.7161, + "step": 3188 + }, + { + "epoch": 0.0189658863831002, + "grad_norm": 2.728804588317871, + "learning_rate": 4.995566413079676e-05, + "loss": 5.8165, + "step": 3189 + }, + { + "epoch": 0.018971833666381197, + "grad_norm": 2.3115599155426025, + "learning_rate": 4.995563632044333e-05, + "loss": 5.7004, + "step": 3190 + }, + { + "epoch": 0.018977780949662196, + "grad_norm": 2.1607725620269775, + "learning_rate": 4.995560850137815e-05, + "loss": 5.7788, + "step": 3191 + }, + { + "epoch": 0.01898372823294319, + "grad_norm": 2.322132110595703, + "learning_rate": 4.995558067360122e-05, + "loss": 5.5677, + "step": 3192 + }, + { + "epoch": 0.01898967551622419, + "grad_norm": 2.148022174835205, + "learning_rate": 4.995555283711256e-05, + "loss": 5.7708, + "step": 3193 + }, + { + "epoch": 0.018995622799505184, + "grad_norm": 2.339812994003296, + "learning_rate": 4.9955524991912165e-05, + "loss": 5.7945, + "step": 3194 + }, + { + "epoch": 0.019001570082786183, + "grad_norm": 1.9469980001449585, + "learning_rate": 4.995549713800006e-05, + "loss": 5.695, + "step": 3195 + }, + { + "epoch": 0.01900751736606718, + "grad_norm": 2.1744890213012695, + "learning_rate": 4.9955469275376254e-05, + "loss": 5.7544, + "step": 3196 + }, + { + "epoch": 0.019013464649348177, + "grad_norm": 2.175123691558838, + "learning_rate": 4.9955441404040745e-05, + "loss": 5.598, + "step": 3197 + }, + { + "epoch": 0.019019411932629175, + "grad_norm": 2.3011369705200195, + "learning_rate": 4.995541352399355e-05, + "loss": 5.7069, + "step": 3198 + }, + { + "epoch": 0.019025359215910174, + "grad_norm": 2.2227025032043457, + "learning_rate": 4.9955385635234675e-05, + "loss": 5.6854, + "step": 3199 + }, + { + "epoch": 0.01903130649919117, + "grad_norm": 2.5465073585510254, + "learning_rate": 4.995535773776414e-05, + "loss": 5.9085, + "step": 3200 + }, + { + "epoch": 0.019037253782472167, + "grad_norm": 2.936612844467163, + "learning_rate": 4.995532983158194e-05, + "loss": 6.0519, + "step": 3201 + }, + { + "epoch": 0.019043201065753163, + "grad_norm": 2.8298418521881104, + "learning_rate": 4.9955301916688094e-05, + "loss": 5.9473, + "step": 3202 + }, + { + "epoch": 0.01904914834903416, + "grad_norm": 2.2295944690704346, + "learning_rate": 4.9955273993082615e-05, + "loss": 5.9652, + "step": 3203 + }, + { + "epoch": 0.01905509563231516, + "grad_norm": 2.7771801948547363, + "learning_rate": 4.9955246060765505e-05, + "loss": 5.9291, + "step": 3204 + }, + { + "epoch": 0.019061042915596155, + "grad_norm": 3.0721678733825684, + "learning_rate": 4.9955218119736776e-05, + "loss": 6.2319, + "step": 3205 + }, + { + "epoch": 0.019066990198877153, + "grad_norm": 2.7866547107696533, + "learning_rate": 4.9955190169996434e-05, + "loss": 6.0412, + "step": 3206 + }, + { + "epoch": 0.019072937482158152, + "grad_norm": 2.287216901779175, + "learning_rate": 4.99551622115445e-05, + "loss": 5.6435, + "step": 3207 + }, + { + "epoch": 0.019078884765439147, + "grad_norm": 2.3618898391723633, + "learning_rate": 4.995513424438098e-05, + "loss": 5.7711, + "step": 3208 + }, + { + "epoch": 0.019084832048720145, + "grad_norm": 2.192997932434082, + "learning_rate": 4.995510626850587e-05, + "loss": 5.8351, + "step": 3209 + }, + { + "epoch": 0.01909077933200114, + "grad_norm": 2.252722978591919, + "learning_rate": 4.995507828391919e-05, + "loss": 5.5989, + "step": 3210 + }, + { + "epoch": 0.01909672661528214, + "grad_norm": 2.451167106628418, + "learning_rate": 4.995505029062095e-05, + "loss": 5.8533, + "step": 3211 + }, + { + "epoch": 0.019102673898563138, + "grad_norm": 2.1897904872894287, + "learning_rate": 4.995502228861116e-05, + "loss": 6.2807, + "step": 3212 + }, + { + "epoch": 0.019108621181844133, + "grad_norm": 2.196805715560913, + "learning_rate": 4.995499427788984e-05, + "loss": 5.9418, + "step": 3213 + }, + { + "epoch": 0.01911456846512513, + "grad_norm": 1.9791160821914673, + "learning_rate": 4.995496625845698e-05, + "loss": 5.9909, + "step": 3214 + }, + { + "epoch": 0.019120515748406126, + "grad_norm": 2.3592171669006348, + "learning_rate": 4.995493823031261e-05, + "loss": 5.807, + "step": 3215 + }, + { + "epoch": 0.019126463031687125, + "grad_norm": 2.8238747119903564, + "learning_rate": 4.9954910193456713e-05, + "loss": 5.7587, + "step": 3216 + }, + { + "epoch": 0.019132410314968123, + "grad_norm": 2.4695584774017334, + "learning_rate": 4.9954882147889326e-05, + "loss": 5.746, + "step": 3217 + }, + { + "epoch": 0.01913835759824912, + "grad_norm": 2.3983800411224365, + "learning_rate": 4.995485409361044e-05, + "loss": 5.9364, + "step": 3218 + }, + { + "epoch": 0.019144304881530117, + "grad_norm": 2.1279618740081787, + "learning_rate": 4.995482603062008e-05, + "loss": 5.9383, + "step": 3219 + }, + { + "epoch": 0.019150252164811116, + "grad_norm": 18.583581924438477, + "learning_rate": 4.9954797958918244e-05, + "loss": 5.8596, + "step": 3220 + }, + { + "epoch": 0.01915619944809211, + "grad_norm": 2.1420741081237793, + "learning_rate": 4.995476987850495e-05, + "loss": 5.9311, + "step": 3221 + }, + { + "epoch": 0.01916214673137311, + "grad_norm": 2.314380645751953, + "learning_rate": 4.99547417893802e-05, + "loss": 5.8229, + "step": 3222 + }, + { + "epoch": 0.019168094014654104, + "grad_norm": 2.3818936347961426, + "learning_rate": 4.9954713691544004e-05, + "loss": 6.1124, + "step": 3223 + }, + { + "epoch": 0.019174041297935103, + "grad_norm": 2.521789789199829, + "learning_rate": 4.9954685584996377e-05, + "loss": 5.8939, + "step": 3224 + }, + { + "epoch": 0.0191799885812161, + "grad_norm": 1.9583165645599365, + "learning_rate": 4.9954657469737334e-05, + "loss": 6.0005, + "step": 3225 + }, + { + "epoch": 0.019185935864497097, + "grad_norm": 2.349581241607666, + "learning_rate": 4.995462934576687e-05, + "loss": 5.8467, + "step": 3226 + }, + { + "epoch": 0.019191883147778095, + "grad_norm": 2.081836223602295, + "learning_rate": 4.9954601213085e-05, + "loss": 6.1001, + "step": 3227 + }, + { + "epoch": 0.019197830431059094, + "grad_norm": 2.3207972049713135, + "learning_rate": 4.995457307169175e-05, + "loss": 5.794, + "step": 3228 + }, + { + "epoch": 0.01920377771434009, + "grad_norm": 1.8516380786895752, + "learning_rate": 4.99545449215871e-05, + "loss": 5.785, + "step": 3229 + }, + { + "epoch": 0.019209724997621087, + "grad_norm": 2.3822309970855713, + "learning_rate": 4.995451676277109e-05, + "loss": 5.7861, + "step": 3230 + }, + { + "epoch": 0.019215672280902082, + "grad_norm": 2.857161283493042, + "learning_rate": 4.995448859524371e-05, + "loss": 5.8333, + "step": 3231 + }, + { + "epoch": 0.01922161956418308, + "grad_norm": 2.201551914215088, + "learning_rate": 4.9954460419004974e-05, + "loss": 5.8653, + "step": 3232 + }, + { + "epoch": 0.01922756684746408, + "grad_norm": 2.1707022190093994, + "learning_rate": 4.995443223405489e-05, + "loss": 5.772, + "step": 3233 + }, + { + "epoch": 0.019233514130745075, + "grad_norm": 2.1242458820343018, + "learning_rate": 4.995440404039348e-05, + "loss": 5.8806, + "step": 3234 + }, + { + "epoch": 0.019239461414026073, + "grad_norm": 2.106945514678955, + "learning_rate": 4.995437583802074e-05, + "loss": 5.6746, + "step": 3235 + }, + { + "epoch": 0.019245408697307072, + "grad_norm": 2.083181858062744, + "learning_rate": 4.995434762693669e-05, + "loss": 5.9332, + "step": 3236 + }, + { + "epoch": 0.019251355980588067, + "grad_norm": 2.1857783794403076, + "learning_rate": 4.995431940714134e-05, + "loss": 5.6663, + "step": 3237 + }, + { + "epoch": 0.019257303263869065, + "grad_norm": 2.031041145324707, + "learning_rate": 4.995429117863468e-05, + "loss": 5.6734, + "step": 3238 + }, + { + "epoch": 0.01926325054715006, + "grad_norm": 2.31980037689209, + "learning_rate": 4.995426294141674e-05, + "loss": 5.8851, + "step": 3239 + }, + { + "epoch": 0.01926919783043106, + "grad_norm": 2.102965831756592, + "learning_rate": 4.9954234695487535e-05, + "loss": 5.7092, + "step": 3240 + }, + { + "epoch": 0.019275145113712058, + "grad_norm": 2.031169891357422, + "learning_rate": 4.995420644084705e-05, + "loss": 5.9755, + "step": 3241 + }, + { + "epoch": 0.019281092396993053, + "grad_norm": 2.2460241317749023, + "learning_rate": 4.995417817749532e-05, + "loss": 5.8895, + "step": 3242 + }, + { + "epoch": 0.01928703968027405, + "grad_norm": 2.618539571762085, + "learning_rate": 4.9954149905432336e-05, + "loss": 5.6964, + "step": 3243 + }, + { + "epoch": 0.019292986963555046, + "grad_norm": 2.1615748405456543, + "learning_rate": 4.995412162465812e-05, + "loss": 5.7162, + "step": 3244 + }, + { + "epoch": 0.019298934246836045, + "grad_norm": 2.363663673400879, + "learning_rate": 4.995409333517268e-05, + "loss": 5.7957, + "step": 3245 + }, + { + "epoch": 0.019304881530117043, + "grad_norm": 2.131084680557251, + "learning_rate": 4.9954065036976025e-05, + "loss": 5.7925, + "step": 3246 + }, + { + "epoch": 0.01931082881339804, + "grad_norm": 2.4043118953704834, + "learning_rate": 4.9954036730068155e-05, + "loss": 5.7895, + "step": 3247 + }, + { + "epoch": 0.019316776096679037, + "grad_norm": 2.521756887435913, + "learning_rate": 4.995400841444909e-05, + "loss": 5.6279, + "step": 3248 + }, + { + "epoch": 0.019322723379960036, + "grad_norm": 2.1791021823883057, + "learning_rate": 4.9953980090118846e-05, + "loss": 5.717, + "step": 3249 + }, + { + "epoch": 0.01932867066324103, + "grad_norm": 2.6562376022338867, + "learning_rate": 4.995395175707742e-05, + "loss": 5.7407, + "step": 3250 + }, + { + "epoch": 0.01933461794652203, + "grad_norm": 2.4377942085266113, + "learning_rate": 4.995392341532483e-05, + "loss": 5.539, + "step": 3251 + }, + { + "epoch": 0.019340565229803024, + "grad_norm": 2.3716847896575928, + "learning_rate": 4.995389506486109e-05, + "loss": 5.7251, + "step": 3252 + }, + { + "epoch": 0.019346512513084023, + "grad_norm": 2.2509348392486572, + "learning_rate": 4.995386670568619e-05, + "loss": 5.8749, + "step": 3253 + }, + { + "epoch": 0.01935245979636502, + "grad_norm": 2.265608072280884, + "learning_rate": 4.995383833780016e-05, + "loss": 5.8236, + "step": 3254 + }, + { + "epoch": 0.019358407079646017, + "grad_norm": 1.972179651260376, + "learning_rate": 4.9953809961203e-05, + "loss": 5.9235, + "step": 3255 + }, + { + "epoch": 0.019364354362927015, + "grad_norm": 2.314030170440674, + "learning_rate": 4.9953781575894723e-05, + "loss": 5.7355, + "step": 3256 + }, + { + "epoch": 0.019370301646208014, + "grad_norm": 2.3061349391937256, + "learning_rate": 4.995375318187534e-05, + "loss": 5.7337, + "step": 3257 + }, + { + "epoch": 0.01937624892948901, + "grad_norm": 1.9106477499008179, + "learning_rate": 4.9953724779144864e-05, + "loss": 5.8342, + "step": 3258 + }, + { + "epoch": 0.019382196212770007, + "grad_norm": 2.313750982284546, + "learning_rate": 4.9953696367703296e-05, + "loss": 5.7981, + "step": 3259 + }, + { + "epoch": 0.019388143496051002, + "grad_norm": 2.4477834701538086, + "learning_rate": 4.9953667947550644e-05, + "loss": 5.8212, + "step": 3260 + }, + { + "epoch": 0.019394090779332, + "grad_norm": 2.072659730911255, + "learning_rate": 4.9953639518686936e-05, + "loss": 5.7335, + "step": 3261 + }, + { + "epoch": 0.019400038062613, + "grad_norm": 2.0848984718322754, + "learning_rate": 4.995361108111216e-05, + "loss": 5.7427, + "step": 3262 + }, + { + "epoch": 0.019405985345893995, + "grad_norm": 1.938265323638916, + "learning_rate": 4.9953582634826345e-05, + "loss": 5.7946, + "step": 3263 + }, + { + "epoch": 0.019411932629174993, + "grad_norm": 2.227194309234619, + "learning_rate": 4.995355417982949e-05, + "loss": 5.9095, + "step": 3264 + }, + { + "epoch": 0.01941787991245599, + "grad_norm": 2.3245849609375, + "learning_rate": 4.9953525716121604e-05, + "loss": 5.802, + "step": 3265 + }, + { + "epoch": 0.019423827195736987, + "grad_norm": 2.08950138092041, + "learning_rate": 4.9953497243702696e-05, + "loss": 5.9001, + "step": 3266 + }, + { + "epoch": 0.019429774479017985, + "grad_norm": 1.93153715133667, + "learning_rate": 4.9953468762572786e-05, + "loss": 5.9042, + "step": 3267 + }, + { + "epoch": 0.01943572176229898, + "grad_norm": 2.4099066257476807, + "learning_rate": 4.9953440272731874e-05, + "loss": 5.8181, + "step": 3268 + }, + { + "epoch": 0.01944166904557998, + "grad_norm": 2.078752279281616, + "learning_rate": 4.995341177417998e-05, + "loss": 5.8771, + "step": 3269 + }, + { + "epoch": 0.019447616328860978, + "grad_norm": 2.012592077255249, + "learning_rate": 4.9953383266917106e-05, + "loss": 5.8135, + "step": 3270 + }, + { + "epoch": 0.019453563612141973, + "grad_norm": 2.0364151000976562, + "learning_rate": 4.995335475094326e-05, + "loss": 5.8767, + "step": 3271 + }, + { + "epoch": 0.01945951089542297, + "grad_norm": 2.0447049140930176, + "learning_rate": 4.995332622625846e-05, + "loss": 5.8236, + "step": 3272 + }, + { + "epoch": 0.01946545817870397, + "grad_norm": 2.2354300022125244, + "learning_rate": 4.995329769286271e-05, + "loss": 5.7794, + "step": 3273 + }, + { + "epoch": 0.019471405461984965, + "grad_norm": 2.031331777572632, + "learning_rate": 4.995326915075602e-05, + "loss": 5.87, + "step": 3274 + }, + { + "epoch": 0.019477352745265963, + "grad_norm": 2.2116496562957764, + "learning_rate": 4.99532405999384e-05, + "loss": 5.885, + "step": 3275 + }, + { + "epoch": 0.01948330002854696, + "grad_norm": 1.9008034467697144, + "learning_rate": 4.995321204040987e-05, + "loss": 5.8646, + "step": 3276 + }, + { + "epoch": 0.019489247311827957, + "grad_norm": 2.1743087768554688, + "learning_rate": 4.995318347217042e-05, + "loss": 5.9742, + "step": 3277 + }, + { + "epoch": 0.019495194595108956, + "grad_norm": 2.09171724319458, + "learning_rate": 4.995315489522008e-05, + "loss": 5.882, + "step": 3278 + }, + { + "epoch": 0.01950114187838995, + "grad_norm": 1.816938042640686, + "learning_rate": 4.995312630955885e-05, + "loss": 5.9164, + "step": 3279 + }, + { + "epoch": 0.01950708916167095, + "grad_norm": 2.065207004547119, + "learning_rate": 4.995309771518674e-05, + "loss": 5.9273, + "step": 3280 + }, + { + "epoch": 0.019513036444951944, + "grad_norm": 2.1037240028381348, + "learning_rate": 4.9953069112103757e-05, + "loss": 5.863, + "step": 3281 + }, + { + "epoch": 0.019518983728232943, + "grad_norm": 2.011705160140991, + "learning_rate": 4.995304050030992e-05, + "loss": 5.712, + "step": 3282 + }, + { + "epoch": 0.01952493101151394, + "grad_norm": 2.2053868770599365, + "learning_rate": 4.995301187980523e-05, + "loss": 5.6988, + "step": 3283 + }, + { + "epoch": 0.019530878294794937, + "grad_norm": 2.0522396564483643, + "learning_rate": 4.995298325058971e-05, + "loss": 5.6831, + "step": 3284 + }, + { + "epoch": 0.019536825578075935, + "grad_norm": 1.9751875400543213, + "learning_rate": 4.995295461266336e-05, + "loss": 6.0187, + "step": 3285 + }, + { + "epoch": 0.019542772861356934, + "grad_norm": 2.79711651802063, + "learning_rate": 4.9952925966026185e-05, + "loss": 6.4995, + "step": 3286 + }, + { + "epoch": 0.01954872014463793, + "grad_norm": 2.1059019565582275, + "learning_rate": 4.9952897310678206e-05, + "loss": 5.9603, + "step": 3287 + }, + { + "epoch": 0.019554667427918927, + "grad_norm": 2.169428825378418, + "learning_rate": 4.995286864661942e-05, + "loss": 5.7973, + "step": 3288 + }, + { + "epoch": 0.019560614711199922, + "grad_norm": 2.165508985519409, + "learning_rate": 4.995283997384985e-05, + "loss": 5.9132, + "step": 3289 + }, + { + "epoch": 0.01956656199448092, + "grad_norm": 2.248450994491577, + "learning_rate": 4.9952811292369506e-05, + "loss": 5.8202, + "step": 3290 + }, + { + "epoch": 0.01957250927776192, + "grad_norm": 2.3068084716796875, + "learning_rate": 4.9952782602178394e-05, + "loss": 5.8223, + "step": 3291 + }, + { + "epoch": 0.019578456561042915, + "grad_norm": 2.0434954166412354, + "learning_rate": 4.9952753903276516e-05, + "loss": 5.6231, + "step": 3292 + }, + { + "epoch": 0.019584403844323913, + "grad_norm": 2.136564254760742, + "learning_rate": 4.9952725195663895e-05, + "loss": 5.9859, + "step": 3293 + }, + { + "epoch": 0.01959035112760491, + "grad_norm": 2.6265337467193604, + "learning_rate": 4.9952696479340535e-05, + "loss": 5.9126, + "step": 3294 + }, + { + "epoch": 0.019596298410885907, + "grad_norm": 2.442678928375244, + "learning_rate": 4.9952667754306445e-05, + "loss": 5.9361, + "step": 3295 + }, + { + "epoch": 0.019602245694166905, + "grad_norm": 2.0740134716033936, + "learning_rate": 4.9952639020561644e-05, + "loss": 5.913, + "step": 3296 + }, + { + "epoch": 0.0196081929774479, + "grad_norm": 2.4088518619537354, + "learning_rate": 4.995261027810612e-05, + "loss": 5.8297, + "step": 3297 + }, + { + "epoch": 0.0196141402607289, + "grad_norm": 2.1514804363250732, + "learning_rate": 4.995258152693991e-05, + "loss": 5.8256, + "step": 3298 + }, + { + "epoch": 0.019620087544009897, + "grad_norm": 2.921570062637329, + "learning_rate": 4.9952552767063e-05, + "loss": 6.0243, + "step": 3299 + }, + { + "epoch": 0.019626034827290893, + "grad_norm": 2.398749828338623, + "learning_rate": 4.995252399847542e-05, + "loss": 6.004, + "step": 3300 + }, + { + "epoch": 0.01963198211057189, + "grad_norm": 2.2024805545806885, + "learning_rate": 4.995249522117717e-05, + "loss": 5.9201, + "step": 3301 + }, + { + "epoch": 0.01963792939385289, + "grad_norm": 2.112269401550293, + "learning_rate": 4.9952466435168266e-05, + "loss": 5.8488, + "step": 3302 + }, + { + "epoch": 0.019643876677133885, + "grad_norm": 2.04632568359375, + "learning_rate": 4.99524376404487e-05, + "loss": 5.8054, + "step": 3303 + }, + { + "epoch": 0.019649823960414883, + "grad_norm": 2.6293606758117676, + "learning_rate": 4.995240883701851e-05, + "loss": 5.6799, + "step": 3304 + }, + { + "epoch": 0.01965577124369588, + "grad_norm": 2.5172793865203857, + "learning_rate": 4.995238002487769e-05, + "loss": 5.712, + "step": 3305 + }, + { + "epoch": 0.019661718526976877, + "grad_norm": 2.549194097518921, + "learning_rate": 4.995235120402625e-05, + "loss": 5.7208, + "step": 3306 + }, + { + "epoch": 0.019667665810257876, + "grad_norm": 2.2993295192718506, + "learning_rate": 4.99523223744642e-05, + "loss": 5.7952, + "step": 3307 + }, + { + "epoch": 0.01967361309353887, + "grad_norm": 2.1270902156829834, + "learning_rate": 4.9952293536191555e-05, + "loss": 5.6988, + "step": 3308 + }, + { + "epoch": 0.01967956037681987, + "grad_norm": 2.349858283996582, + "learning_rate": 4.9952264689208315e-05, + "loss": 5.623, + "step": 3309 + }, + { + "epoch": 0.019685507660100864, + "grad_norm": 2.1501529216766357, + "learning_rate": 4.9952235833514506e-05, + "loss": 5.6498, + "step": 3310 + }, + { + "epoch": 0.019691454943381863, + "grad_norm": 2.0577821731567383, + "learning_rate": 4.995220696911012e-05, + "loss": 5.6863, + "step": 3311 + }, + { + "epoch": 0.01969740222666286, + "grad_norm": 2.0787386894226074, + "learning_rate": 4.9952178095995185e-05, + "loss": 5.6314, + "step": 3312 + }, + { + "epoch": 0.019703349509943856, + "grad_norm": 2.4042680263519287, + "learning_rate": 4.99521492141697e-05, + "loss": 5.6152, + "step": 3313 + }, + { + "epoch": 0.019709296793224855, + "grad_norm": 2.444410800933838, + "learning_rate": 4.995212032363368e-05, + "loss": 5.5375, + "step": 3314 + }, + { + "epoch": 0.019715244076505854, + "grad_norm": 2.1678028106689453, + "learning_rate": 4.995209142438712e-05, + "loss": 5.6239, + "step": 3315 + }, + { + "epoch": 0.01972119135978685, + "grad_norm": 2.5436410903930664, + "learning_rate": 4.9952062516430054e-05, + "loss": 5.4234, + "step": 3316 + }, + { + "epoch": 0.019727138643067847, + "grad_norm": 2.454561471939087, + "learning_rate": 4.9952033599762484e-05, + "loss": 5.4198, + "step": 3317 + }, + { + "epoch": 0.019733085926348842, + "grad_norm": 2.388125419616699, + "learning_rate": 4.9952004674384413e-05, + "loss": 5.5073, + "step": 3318 + }, + { + "epoch": 0.01973903320962984, + "grad_norm": 2.1900579929351807, + "learning_rate": 4.995197574029585e-05, + "loss": 5.3463, + "step": 3319 + }, + { + "epoch": 0.01974498049291084, + "grad_norm": 2.5625739097595215, + "learning_rate": 4.995194679749681e-05, + "loss": 5.4291, + "step": 3320 + }, + { + "epoch": 0.019750927776191834, + "grad_norm": 2.52402400970459, + "learning_rate": 4.995191784598731e-05, + "loss": 5.3826, + "step": 3321 + }, + { + "epoch": 0.019756875059472833, + "grad_norm": 2.5888168811798096, + "learning_rate": 4.995188888576735e-05, + "loss": 5.381, + "step": 3322 + }, + { + "epoch": 0.01976282234275383, + "grad_norm": 2.637080669403076, + "learning_rate": 4.995185991683694e-05, + "loss": 5.3321, + "step": 3323 + }, + { + "epoch": 0.019768769626034827, + "grad_norm": 2.46553111076355, + "learning_rate": 4.9951830939196095e-05, + "loss": 5.3663, + "step": 3324 + }, + { + "epoch": 0.019774716909315825, + "grad_norm": 2.2397992610931396, + "learning_rate": 4.9951801952844826e-05, + "loss": 5.3237, + "step": 3325 + }, + { + "epoch": 0.01978066419259682, + "grad_norm": 2.3519208431243896, + "learning_rate": 4.9951772957783144e-05, + "loss": 5.4166, + "step": 3326 + }, + { + "epoch": 0.01978661147587782, + "grad_norm": 2.6235291957855225, + "learning_rate": 4.9951743954011056e-05, + "loss": 5.8094, + "step": 3327 + }, + { + "epoch": 0.019792558759158817, + "grad_norm": 2.162285327911377, + "learning_rate": 4.995171494152856e-05, + "loss": 5.6491, + "step": 3328 + }, + { + "epoch": 0.019798506042439813, + "grad_norm": 2.231853485107422, + "learning_rate": 4.995168592033569e-05, + "loss": 5.69, + "step": 3329 + }, + { + "epoch": 0.01980445332572081, + "grad_norm": 2.7305827140808105, + "learning_rate": 4.995165689043244e-05, + "loss": 5.5028, + "step": 3330 + }, + { + "epoch": 0.01981040060900181, + "grad_norm": 2.9917726516723633, + "learning_rate": 4.9951627851818824e-05, + "loss": 5.3227, + "step": 3331 + }, + { + "epoch": 0.019816347892282805, + "grad_norm": 3.0039985179901123, + "learning_rate": 4.995159880449486e-05, + "loss": 5.5965, + "step": 3332 + }, + { + "epoch": 0.019822295175563803, + "grad_norm": 3.081099510192871, + "learning_rate": 4.995156974846054e-05, + "loss": 5.6945, + "step": 3333 + }, + { + "epoch": 0.0198282424588448, + "grad_norm": 2.042445182800293, + "learning_rate": 4.995154068371589e-05, + "loss": 5.693, + "step": 3334 + }, + { + "epoch": 0.019834189742125797, + "grad_norm": 2.8875865936279297, + "learning_rate": 4.995151161026091e-05, + "loss": 5.5981, + "step": 3335 + }, + { + "epoch": 0.019840137025406795, + "grad_norm": 2.4203453063964844, + "learning_rate": 4.9951482528095615e-05, + "loss": 5.6269, + "step": 3336 + }, + { + "epoch": 0.01984608430868779, + "grad_norm": 2.332151174545288, + "learning_rate": 4.995145343722002e-05, + "loss": 5.6002, + "step": 3337 + }, + { + "epoch": 0.01985203159196879, + "grad_norm": 2.556549310684204, + "learning_rate": 4.995142433763413e-05, + "loss": 5.7715, + "step": 3338 + }, + { + "epoch": 0.019857978875249784, + "grad_norm": 2.453113079071045, + "learning_rate": 4.995139522933796e-05, + "loss": 5.8958, + "step": 3339 + }, + { + "epoch": 0.019863926158530783, + "grad_norm": 1.9842414855957031, + "learning_rate": 4.995136611233151e-05, + "loss": 5.9781, + "step": 3340 + }, + { + "epoch": 0.01986987344181178, + "grad_norm": 2.3725521564483643, + "learning_rate": 4.995133698661479e-05, + "loss": 5.9902, + "step": 3341 + }, + { + "epoch": 0.019875820725092776, + "grad_norm": 2.679001808166504, + "learning_rate": 4.9951307852187824e-05, + "loss": 5.9526, + "step": 3342 + }, + { + "epoch": 0.019881768008373775, + "grad_norm": 2.272595167160034, + "learning_rate": 4.995127870905061e-05, + "loss": 5.9685, + "step": 3343 + }, + { + "epoch": 0.019887715291654774, + "grad_norm": 2.0300357341766357, + "learning_rate": 4.995124955720317e-05, + "loss": 5.7702, + "step": 3344 + }, + { + "epoch": 0.01989366257493577, + "grad_norm": 2.5023481845855713, + "learning_rate": 4.9951220396645504e-05, + "loss": 5.6612, + "step": 3345 + }, + { + "epoch": 0.019899609858216767, + "grad_norm": 2.426457166671753, + "learning_rate": 4.995119122737762e-05, + "loss": 5.767, + "step": 3346 + }, + { + "epoch": 0.019905557141497762, + "grad_norm": 2.4919028282165527, + "learning_rate": 4.995116204939954e-05, + "loss": 6.0578, + "step": 3347 + }, + { + "epoch": 0.01991150442477876, + "grad_norm": 3.099792957305908, + "learning_rate": 4.995113286271126e-05, + "loss": 7.053, + "step": 3348 + }, + { + "epoch": 0.01991745170805976, + "grad_norm": 2.597169876098633, + "learning_rate": 4.9951103667312795e-05, + "loss": 5.8467, + "step": 3349 + }, + { + "epoch": 0.019923398991340754, + "grad_norm": 2.1132469177246094, + "learning_rate": 4.995107446320416e-05, + "loss": 5.7296, + "step": 3350 + }, + { + "epoch": 0.019929346274621753, + "grad_norm": 2.4141721725463867, + "learning_rate": 4.995104525038537e-05, + "loss": 5.8705, + "step": 3351 + }, + { + "epoch": 0.01993529355790275, + "grad_norm": 1.9012199640274048, + "learning_rate": 4.995101602885642e-05, + "loss": 5.8759, + "step": 3352 + }, + { + "epoch": 0.019941240841183747, + "grad_norm": 2.168673038482666, + "learning_rate": 4.9950986798617335e-05, + "loss": 5.8161, + "step": 3353 + }, + { + "epoch": 0.019947188124464745, + "grad_norm": 2.1579155921936035, + "learning_rate": 4.995095755966811e-05, + "loss": 5.8699, + "step": 3354 + }, + { + "epoch": 0.01995313540774574, + "grad_norm": 2.1460800170898438, + "learning_rate": 4.9950928312008774e-05, + "loss": 5.9144, + "step": 3355 + }, + { + "epoch": 0.01995908269102674, + "grad_norm": 2.402167558670044, + "learning_rate": 4.995089905563932e-05, + "loss": 5.8857, + "step": 3356 + }, + { + "epoch": 0.019965029974307737, + "grad_norm": 2.6381726264953613, + "learning_rate": 4.995086979055976e-05, + "loss": 6.0021, + "step": 3357 + }, + { + "epoch": 0.019970977257588732, + "grad_norm": 2.5577943325042725, + "learning_rate": 4.995084051677012e-05, + "loss": 5.9425, + "step": 3358 + }, + { + "epoch": 0.01997692454086973, + "grad_norm": 2.188215494155884, + "learning_rate": 4.995081123427039e-05, + "loss": 6.0656, + "step": 3359 + }, + { + "epoch": 0.01998287182415073, + "grad_norm": 1.8278366327285767, + "learning_rate": 4.9950781943060596e-05, + "loss": 5.8229, + "step": 3360 + }, + { + "epoch": 0.019988819107431725, + "grad_norm": 1.9054077863693237, + "learning_rate": 4.995075264314074e-05, + "loss": 5.8158, + "step": 3361 + }, + { + "epoch": 0.019994766390712723, + "grad_norm": 2.1255416870117188, + "learning_rate": 4.9950723334510826e-05, + "loss": 5.8816, + "step": 3362 + }, + { + "epoch": 0.02000071367399372, + "grad_norm": 2.026923656463623, + "learning_rate": 4.995069401717088e-05, + "loss": 5.7463, + "step": 3363 + }, + { + "epoch": 0.020006660957274717, + "grad_norm": 2.015178680419922, + "learning_rate": 4.9950664691120905e-05, + "loss": 5.6689, + "step": 3364 + }, + { + "epoch": 0.020012608240555715, + "grad_norm": 1.7729417085647583, + "learning_rate": 4.995063535636091e-05, + "loss": 5.701, + "step": 3365 + }, + { + "epoch": 0.02001855552383671, + "grad_norm": 1.9893600940704346, + "learning_rate": 4.9950606012890905e-05, + "loss": 5.7502, + "step": 3366 + }, + { + "epoch": 0.02002450280711771, + "grad_norm": 1.8950870037078857, + "learning_rate": 4.99505766607109e-05, + "loss": 5.6094, + "step": 3367 + }, + { + "epoch": 0.020030450090398704, + "grad_norm": 2.4140830039978027, + "learning_rate": 4.995054729982091e-05, + "loss": 5.8387, + "step": 3368 + }, + { + "epoch": 0.020036397373679703, + "grad_norm": 2.1887669563293457, + "learning_rate": 4.995051793022094e-05, + "loss": 5.7348, + "step": 3369 + }, + { + "epoch": 0.0200423446569607, + "grad_norm": 1.9632731676101685, + "learning_rate": 4.9950488551911e-05, + "loss": 5.5568, + "step": 3370 + }, + { + "epoch": 0.020048291940241696, + "grad_norm": 2.116834878921509, + "learning_rate": 4.995045916489111e-05, + "loss": 5.461, + "step": 3371 + }, + { + "epoch": 0.020054239223522695, + "grad_norm": 2.021256923675537, + "learning_rate": 4.9950429769161266e-05, + "loss": 5.6601, + "step": 3372 + }, + { + "epoch": 0.020060186506803693, + "grad_norm": 2.1648659706115723, + "learning_rate": 4.9950400364721486e-05, + "loss": 5.5364, + "step": 3373 + }, + { + "epoch": 0.02006613379008469, + "grad_norm": 2.043499231338501, + "learning_rate": 4.9950370951571775e-05, + "loss": 5.7273, + "step": 3374 + }, + { + "epoch": 0.020072081073365687, + "grad_norm": 2.296121597290039, + "learning_rate": 4.995034152971215e-05, + "loss": 5.8494, + "step": 3375 + }, + { + "epoch": 0.020078028356646682, + "grad_norm": 2.401031494140625, + "learning_rate": 4.995031209914261e-05, + "loss": 5.719, + "step": 3376 + }, + { + "epoch": 0.02008397563992768, + "grad_norm": 2.3130364418029785, + "learning_rate": 4.995028265986319e-05, + "loss": 5.7998, + "step": 3377 + }, + { + "epoch": 0.02008992292320868, + "grad_norm": 2.3820009231567383, + "learning_rate": 4.9950253211873874e-05, + "loss": 6.0632, + "step": 3378 + }, + { + "epoch": 0.020095870206489674, + "grad_norm": 2.1970956325531006, + "learning_rate": 4.995022375517469e-05, + "loss": 5.9776, + "step": 3379 + }, + { + "epoch": 0.020101817489770673, + "grad_norm": 1.912102460861206, + "learning_rate": 4.995019428976564e-05, + "loss": 5.7194, + "step": 3380 + }, + { + "epoch": 0.02010776477305167, + "grad_norm": 2.3187389373779297, + "learning_rate": 4.995016481564673e-05, + "loss": 6.0225, + "step": 3381 + }, + { + "epoch": 0.020113712056332667, + "grad_norm": 1.959000587463379, + "learning_rate": 4.995013533281797e-05, + "loss": 5.8453, + "step": 3382 + }, + { + "epoch": 0.020119659339613665, + "grad_norm": 2.0283286571502686, + "learning_rate": 4.995010584127938e-05, + "loss": 5.6837, + "step": 3383 + }, + { + "epoch": 0.02012560662289466, + "grad_norm": 2.410351037979126, + "learning_rate": 4.995007634103097e-05, + "loss": 5.8172, + "step": 3384 + }, + { + "epoch": 0.02013155390617566, + "grad_norm": 2.2864298820495605, + "learning_rate": 4.995004683207275e-05, + "loss": 5.8995, + "step": 3385 + }, + { + "epoch": 0.020137501189456657, + "grad_norm": 2.830883026123047, + "learning_rate": 4.995001731440472e-05, + "loss": 5.7273, + "step": 3386 + }, + { + "epoch": 0.020143448472737652, + "grad_norm": 2.486783981323242, + "learning_rate": 4.9949987788026896e-05, + "loss": 5.88, + "step": 3387 + }, + { + "epoch": 0.02014939575601865, + "grad_norm": 2.109975576400757, + "learning_rate": 4.994995825293929e-05, + "loss": 5.8618, + "step": 3388 + }, + { + "epoch": 0.02015534303929965, + "grad_norm": 2.249293327331543, + "learning_rate": 4.994992870914191e-05, + "loss": 5.8511, + "step": 3389 + }, + { + "epoch": 0.020161290322580645, + "grad_norm": 2.5433366298675537, + "learning_rate": 4.9949899156634774e-05, + "loss": 5.7375, + "step": 3390 + }, + { + "epoch": 0.020167237605861643, + "grad_norm": 2.7013652324676514, + "learning_rate": 4.9949869595417876e-05, + "loss": 5.8886, + "step": 3391 + }, + { + "epoch": 0.020173184889142638, + "grad_norm": 2.536972761154175, + "learning_rate": 4.994984002549124e-05, + "loss": 5.4203, + "step": 3392 + }, + { + "epoch": 0.020179132172423637, + "grad_norm": 2.596230983734131, + "learning_rate": 4.9949810446854876e-05, + "loss": 5.7882, + "step": 3393 + }, + { + "epoch": 0.020185079455704635, + "grad_norm": 2.6889936923980713, + "learning_rate": 4.9949780859508786e-05, + "loss": 5.6822, + "step": 3394 + }, + { + "epoch": 0.02019102673898563, + "grad_norm": 2.541027069091797, + "learning_rate": 4.994975126345299e-05, + "loss": 5.7394, + "step": 3395 + }, + { + "epoch": 0.02019697402226663, + "grad_norm": 2.2267251014709473, + "learning_rate": 4.9949721658687485e-05, + "loss": 5.7847, + "step": 3396 + }, + { + "epoch": 0.020202921305547628, + "grad_norm": 2.439689874649048, + "learning_rate": 4.994969204521231e-05, + "loss": 5.6222, + "step": 3397 + }, + { + "epoch": 0.020208868588828623, + "grad_norm": 2.9407742023468018, + "learning_rate": 4.9949662423027434e-05, + "loss": 5.6629, + "step": 3398 + }, + { + "epoch": 0.02021481587210962, + "grad_norm": 2.42802357673645, + "learning_rate": 4.9949632792132894e-05, + "loss": 5.3369, + "step": 3399 + }, + { + "epoch": 0.020220763155390616, + "grad_norm": 2.465508222579956, + "learning_rate": 4.99496031525287e-05, + "loss": 5.3365, + "step": 3400 + }, + { + "epoch": 0.020226710438671615, + "grad_norm": 2.408794403076172, + "learning_rate": 4.9949573504214854e-05, + "loss": 5.3156, + "step": 3401 + }, + { + "epoch": 0.020232657721952613, + "grad_norm": 2.229372978210449, + "learning_rate": 4.9949543847191374e-05, + "loss": 5.9194, + "step": 3402 + }, + { + "epoch": 0.02023860500523361, + "grad_norm": 4.567020416259766, + "learning_rate": 4.9949514181458254e-05, + "loss": 6.3379, + "step": 3403 + }, + { + "epoch": 0.020244552288514607, + "grad_norm": 3.9927520751953125, + "learning_rate": 4.9949484507015534e-05, + "loss": 6.3351, + "step": 3404 + }, + { + "epoch": 0.020250499571795602, + "grad_norm": 2.4830081462860107, + "learning_rate": 4.9949454823863195e-05, + "loss": 6.4046, + "step": 3405 + }, + { + "epoch": 0.0202564468550766, + "grad_norm": 2.282722234725952, + "learning_rate": 4.994942513200126e-05, + "loss": 6.5473, + "step": 3406 + }, + { + "epoch": 0.0202623941383576, + "grad_norm": 2.411367416381836, + "learning_rate": 4.994939543142973e-05, + "loss": 5.7898, + "step": 3407 + }, + { + "epoch": 0.020268341421638594, + "grad_norm": 3.2052342891693115, + "learning_rate": 4.994936572214864e-05, + "loss": 5.6695, + "step": 3408 + }, + { + "epoch": 0.020274288704919593, + "grad_norm": 4.142974853515625, + "learning_rate": 4.994933600415798e-05, + "loss": 6.2037, + "step": 3409 + }, + { + "epoch": 0.02028023598820059, + "grad_norm": 2.839066982269287, + "learning_rate": 4.994930627745776e-05, + "loss": 6.7308, + "step": 3410 + }, + { + "epoch": 0.020286183271481587, + "grad_norm": 3.3138885498046875, + "learning_rate": 4.9949276542048e-05, + "loss": 5.8873, + "step": 3411 + }, + { + "epoch": 0.020292130554762585, + "grad_norm": 2.6651928424835205, + "learning_rate": 4.9949246797928704e-05, + "loss": 6.6325, + "step": 3412 + }, + { + "epoch": 0.02029807783804358, + "grad_norm": 2.919436454772949, + "learning_rate": 4.994921704509988e-05, + "loss": 6.3239, + "step": 3413 + }, + { + "epoch": 0.02030402512132458, + "grad_norm": 2.6901097297668457, + "learning_rate": 4.994918728356155e-05, + "loss": 6.1712, + "step": 3414 + }, + { + "epoch": 0.020309972404605577, + "grad_norm": 2.573249340057373, + "learning_rate": 4.9949157513313704e-05, + "loss": 5.8194, + "step": 3415 + }, + { + "epoch": 0.020315919687886572, + "grad_norm": 3.0603950023651123, + "learning_rate": 4.994912773435637e-05, + "loss": 6.3881, + "step": 3416 + }, + { + "epoch": 0.02032186697116757, + "grad_norm": 3.1800057888031006, + "learning_rate": 4.994909794668956e-05, + "loss": 5.9486, + "step": 3417 + }, + { + "epoch": 0.02032781425444857, + "grad_norm": 2.537182092666626, + "learning_rate": 4.994906815031327e-05, + "loss": 6.5454, + "step": 3418 + }, + { + "epoch": 0.020333761537729565, + "grad_norm": 2.474705457687378, + "learning_rate": 4.9949038345227525e-05, + "loss": 6.5356, + "step": 3419 + }, + { + "epoch": 0.020339708821010563, + "grad_norm": 3.054689645767212, + "learning_rate": 4.994900853143232e-05, + "loss": 6.4526, + "step": 3420 + }, + { + "epoch": 0.020345656104291558, + "grad_norm": 2.587644100189209, + "learning_rate": 4.994897870892769e-05, + "loss": 6.2811, + "step": 3421 + }, + { + "epoch": 0.020351603387572557, + "grad_norm": 2.110041618347168, + "learning_rate": 4.994894887771361e-05, + "loss": 6.0428, + "step": 3422 + }, + { + "epoch": 0.020357550670853555, + "grad_norm": 2.4931492805480957, + "learning_rate": 4.9948919037790115e-05, + "loss": 6.3683, + "step": 3423 + }, + { + "epoch": 0.02036349795413455, + "grad_norm": 2.7169463634490967, + "learning_rate": 4.994888918915721e-05, + "loss": 6.5335, + "step": 3424 + }, + { + "epoch": 0.02036944523741555, + "grad_norm": 2.164363145828247, + "learning_rate": 4.994885933181491e-05, + "loss": 6.0409, + "step": 3425 + }, + { + "epoch": 0.020375392520696547, + "grad_norm": 2.480468273162842, + "learning_rate": 4.994882946576322e-05, + "loss": 5.8816, + "step": 3426 + }, + { + "epoch": 0.020381339803977543, + "grad_norm": 2.928361415863037, + "learning_rate": 4.994879959100215e-05, + "loss": 6.1706, + "step": 3427 + }, + { + "epoch": 0.02038728708725854, + "grad_norm": 2.1536660194396973, + "learning_rate": 4.994876970753171e-05, + "loss": 6.0559, + "step": 3428 + }, + { + "epoch": 0.020393234370539536, + "grad_norm": 2.6913530826568604, + "learning_rate": 4.994873981535192e-05, + "loss": 6.7411, + "step": 3429 + }, + { + "epoch": 0.020399181653820535, + "grad_norm": 2.647124767303467, + "learning_rate": 4.994870991446278e-05, + "loss": 6.5251, + "step": 3430 + }, + { + "epoch": 0.020405128937101533, + "grad_norm": 2.621612310409546, + "learning_rate": 4.994868000486429e-05, + "loss": 6.7029, + "step": 3431 + }, + { + "epoch": 0.02041107622038253, + "grad_norm": 2.1986844539642334, + "learning_rate": 4.994865008655649e-05, + "loss": 6.4561, + "step": 3432 + }, + { + "epoch": 0.020417023503663527, + "grad_norm": 2.706897735595703, + "learning_rate": 4.994862015953936e-05, + "loss": 6.3125, + "step": 3433 + }, + { + "epoch": 0.020422970786944522, + "grad_norm": 2.403346300125122, + "learning_rate": 4.994859022381294e-05, + "loss": 6.0808, + "step": 3434 + }, + { + "epoch": 0.02042891807022552, + "grad_norm": 2.367835521697998, + "learning_rate": 4.994856027937722e-05, + "loss": 6.2634, + "step": 3435 + }, + { + "epoch": 0.02043486535350652, + "grad_norm": 2.8564250469207764, + "learning_rate": 4.9948530326232205e-05, + "loss": 6.579, + "step": 3436 + }, + { + "epoch": 0.020440812636787514, + "grad_norm": 2.9472100734710693, + "learning_rate": 4.9948500364377925e-05, + "loss": 6.3873, + "step": 3437 + }, + { + "epoch": 0.020446759920068513, + "grad_norm": 2.3005917072296143, + "learning_rate": 4.994847039381438e-05, + "loss": 6.2316, + "step": 3438 + }, + { + "epoch": 0.02045270720334951, + "grad_norm": 2.0548787117004395, + "learning_rate": 4.9948440414541584e-05, + "loss": 6.5022, + "step": 3439 + }, + { + "epoch": 0.020458654486630506, + "grad_norm": 2.1332197189331055, + "learning_rate": 4.9948410426559536e-05, + "loss": 6.1486, + "step": 3440 + }, + { + "epoch": 0.020464601769911505, + "grad_norm": 2.112738847732544, + "learning_rate": 4.994838042986827e-05, + "loss": 5.9125, + "step": 3441 + }, + { + "epoch": 0.0204705490531925, + "grad_norm": 2.714627981185913, + "learning_rate": 4.9948350424467774e-05, + "loss": 6.1164, + "step": 3442 + }, + { + "epoch": 0.0204764963364735, + "grad_norm": 2.337571382522583, + "learning_rate": 4.994832041035806e-05, + "loss": 6.0567, + "step": 3443 + }, + { + "epoch": 0.020482443619754497, + "grad_norm": 2.354389190673828, + "learning_rate": 4.994829038753915e-05, + "loss": 5.5922, + "step": 3444 + }, + { + "epoch": 0.020488390903035492, + "grad_norm": 2.3885531425476074, + "learning_rate": 4.994826035601106e-05, + "loss": 6.4178, + "step": 3445 + }, + { + "epoch": 0.02049433818631649, + "grad_norm": 2.931328058242798, + "learning_rate": 4.994823031577378e-05, + "loss": 6.356, + "step": 3446 + }, + { + "epoch": 0.02050028546959749, + "grad_norm": 2.4858877658843994, + "learning_rate": 4.994820026682733e-05, + "loss": 6.0601, + "step": 3447 + }, + { + "epoch": 0.020506232752878484, + "grad_norm": 2.626811981201172, + "learning_rate": 4.9948170209171725e-05, + "loss": 6.4372, + "step": 3448 + }, + { + "epoch": 0.020512180036159483, + "grad_norm": 2.2917356491088867, + "learning_rate": 4.994814014280696e-05, + "loss": 5.9828, + "step": 3449 + }, + { + "epoch": 0.020518127319440478, + "grad_norm": 2.174531936645508, + "learning_rate": 4.9948110067733075e-05, + "loss": 6.3382, + "step": 3450 + }, + { + "epoch": 0.020524074602721477, + "grad_norm": 2.9880006313323975, + "learning_rate": 4.994807998395005e-05, + "loss": 6.7493, + "step": 3451 + }, + { + "epoch": 0.020530021886002475, + "grad_norm": 2.6577212810516357, + "learning_rate": 4.994804989145792e-05, + "loss": 6.853, + "step": 3452 + }, + { + "epoch": 0.02053596916928347, + "grad_norm": 2.8832437992095947, + "learning_rate": 4.994801979025667e-05, + "loss": 6.5829, + "step": 3453 + }, + { + "epoch": 0.02054191645256447, + "grad_norm": 2.473177194595337, + "learning_rate": 4.994798968034633e-05, + "loss": 6.2879, + "step": 3454 + }, + { + "epoch": 0.020547863735845467, + "grad_norm": 2.7484633922576904, + "learning_rate": 4.994795956172691e-05, + "loss": 6.2037, + "step": 3455 + }, + { + "epoch": 0.020553811019126463, + "grad_norm": 1.6647555828094482, + "learning_rate": 4.9947929434398403e-05, + "loss": 6.5639, + "step": 3456 + }, + { + "epoch": 0.02055975830240746, + "grad_norm": 3.71087908744812, + "learning_rate": 4.994789929836084e-05, + "loss": 6.8464, + "step": 3457 + }, + { + "epoch": 0.020565705585688456, + "grad_norm": 2.705892324447632, + "learning_rate": 4.994786915361422e-05, + "loss": 6.8316, + "step": 3458 + }, + { + "epoch": 0.020571652868969455, + "grad_norm": 2.3619437217712402, + "learning_rate": 4.994783900015856e-05, + "loss": 6.3441, + "step": 3459 + }, + { + "epoch": 0.020577600152250453, + "grad_norm": 2.490499258041382, + "learning_rate": 4.9947808837993864e-05, + "loss": 6.1467, + "step": 3460 + }, + { + "epoch": 0.02058354743553145, + "grad_norm": 2.546614170074463, + "learning_rate": 4.994777866712015e-05, + "loss": 5.6677, + "step": 3461 + }, + { + "epoch": 0.020589494718812447, + "grad_norm": 2.473695755004883, + "learning_rate": 4.994774848753741e-05, + "loss": 5.7815, + "step": 3462 + }, + { + "epoch": 0.020595442002093442, + "grad_norm": 2.0494625568389893, + "learning_rate": 4.994771829924569e-05, + "loss": 5.674, + "step": 3463 + }, + { + "epoch": 0.02060138928537444, + "grad_norm": 2.1504273414611816, + "learning_rate": 4.9947688102244964e-05, + "loss": 5.5299, + "step": 3464 + }, + { + "epoch": 0.02060733656865544, + "grad_norm": 2.908170700073242, + "learning_rate": 4.994765789653526e-05, + "loss": 5.8448, + "step": 3465 + }, + { + "epoch": 0.020613283851936434, + "grad_norm": 3.1434714794158936, + "learning_rate": 4.994762768211659e-05, + "loss": 5.8413, + "step": 3466 + }, + { + "epoch": 0.020619231135217433, + "grad_norm": 2.4688189029693604, + "learning_rate": 4.994759745898896e-05, + "loss": 5.6458, + "step": 3467 + }, + { + "epoch": 0.02062517841849843, + "grad_norm": 2.172083854675293, + "learning_rate": 4.994756722715238e-05, + "loss": 5.723, + "step": 3468 + }, + { + "epoch": 0.020631125701779426, + "grad_norm": 2.0702707767486572, + "learning_rate": 4.994753698660687e-05, + "loss": 5.6199, + "step": 3469 + }, + { + "epoch": 0.020637072985060425, + "grad_norm": 2.2142136096954346, + "learning_rate": 4.9947506737352425e-05, + "loss": 5.5476, + "step": 3470 + }, + { + "epoch": 0.02064302026834142, + "grad_norm": 2.156874179840088, + "learning_rate": 4.994747647938907e-05, + "loss": 5.4773, + "step": 3471 + }, + { + "epoch": 0.02064896755162242, + "grad_norm": 3.3683371543884277, + "learning_rate": 4.9947446212716795e-05, + "loss": 6.4804, + "step": 3472 + }, + { + "epoch": 0.020654914834903417, + "grad_norm": 2.2435977458953857, + "learning_rate": 4.9947415937335635e-05, + "loss": 6.0622, + "step": 3473 + }, + { + "epoch": 0.020660862118184412, + "grad_norm": 3.0824263095855713, + "learning_rate": 4.994738565324558e-05, + "loss": 6.8809, + "step": 3474 + }, + { + "epoch": 0.02066680940146541, + "grad_norm": 2.6978909969329834, + "learning_rate": 4.9947355360446664e-05, + "loss": 6.823, + "step": 3475 + }, + { + "epoch": 0.02067275668474641, + "grad_norm": 3.041680097579956, + "learning_rate": 4.9947325058938874e-05, + "loss": 6.4268, + "step": 3476 + }, + { + "epoch": 0.020678703968027404, + "grad_norm": 3.5326781272888184, + "learning_rate": 4.9947294748722237e-05, + "loss": 6.3516, + "step": 3477 + }, + { + "epoch": 0.020684651251308403, + "grad_norm": 2.7611732482910156, + "learning_rate": 4.994726442979675e-05, + "loss": 6.2206, + "step": 3478 + }, + { + "epoch": 0.020690598534589398, + "grad_norm": 3.8533458709716797, + "learning_rate": 4.994723410216244e-05, + "loss": 6.7907, + "step": 3479 + }, + { + "epoch": 0.020696545817870397, + "grad_norm": 2.8091351985931396, + "learning_rate": 4.99472037658193e-05, + "loss": 6.7468, + "step": 3480 + }, + { + "epoch": 0.020702493101151395, + "grad_norm": 2.4317073822021484, + "learning_rate": 4.994717342076736e-05, + "loss": 6.4682, + "step": 3481 + }, + { + "epoch": 0.02070844038443239, + "grad_norm": 2.5132029056549072, + "learning_rate": 4.994714306700661e-05, + "loss": 6.1966, + "step": 3482 + }, + { + "epoch": 0.02071438766771339, + "grad_norm": 2.8161535263061523, + "learning_rate": 4.994711270453707e-05, + "loss": 5.6045, + "step": 3483 + }, + { + "epoch": 0.020720334950994387, + "grad_norm": 2.654115915298462, + "learning_rate": 4.994708233335875e-05, + "loss": 5.8983, + "step": 3484 + }, + { + "epoch": 0.020726282234275382, + "grad_norm": 2.5971553325653076, + "learning_rate": 4.9947051953471664e-05, + "loss": 5.4422, + "step": 3485 + }, + { + "epoch": 0.02073222951755638, + "grad_norm": 2.5758557319641113, + "learning_rate": 4.9947021564875816e-05, + "loss": 5.5921, + "step": 3486 + }, + { + "epoch": 0.020738176800837376, + "grad_norm": 2.635345458984375, + "learning_rate": 4.994699116757122e-05, + "loss": 6.2316, + "step": 3487 + }, + { + "epoch": 0.020744124084118375, + "grad_norm": 2.573514938354492, + "learning_rate": 4.9946960761557896e-05, + "loss": 6.5069, + "step": 3488 + }, + { + "epoch": 0.020750071367399373, + "grad_norm": 2.587735176086426, + "learning_rate": 4.994693034683584e-05, + "loss": 5.9114, + "step": 3489 + }, + { + "epoch": 0.02075601865068037, + "grad_norm": 2.4980244636535645, + "learning_rate": 4.9946899923405075e-05, + "loss": 6.1805, + "step": 3490 + }, + { + "epoch": 0.020761965933961367, + "grad_norm": 2.614003896713257, + "learning_rate": 4.9946869491265594e-05, + "loss": 6.2294, + "step": 3491 + }, + { + "epoch": 0.020767913217242365, + "grad_norm": 3.3819997310638428, + "learning_rate": 4.994683905041743e-05, + "loss": 5.4716, + "step": 3492 + }, + { + "epoch": 0.02077386050052336, + "grad_norm": 3.168170213699341, + "learning_rate": 4.994680860086057e-05, + "loss": 5.4041, + "step": 3493 + }, + { + "epoch": 0.02077980778380436, + "grad_norm": 3.05253267288208, + "learning_rate": 4.994677814259504e-05, + "loss": 5.4958, + "step": 3494 + }, + { + "epoch": 0.020785755067085354, + "grad_norm": 2.8560431003570557, + "learning_rate": 4.994674767562085e-05, + "loss": 5.4153, + "step": 3495 + }, + { + "epoch": 0.020791702350366353, + "grad_norm": 2.790382146835327, + "learning_rate": 4.994671719993801e-05, + "loss": 6.3581, + "step": 3496 + }, + { + "epoch": 0.02079764963364735, + "grad_norm": 2.9860496520996094, + "learning_rate": 4.9946686715546535e-05, + "loss": 6.5779, + "step": 3497 + }, + { + "epoch": 0.020803596916928346, + "grad_norm": 2.744859457015991, + "learning_rate": 4.994665622244642e-05, + "loss": 6.5748, + "step": 3498 + }, + { + "epoch": 0.020809544200209345, + "grad_norm": 2.7951292991638184, + "learning_rate": 4.9946625720637683e-05, + "loss": 6.1954, + "step": 3499 + }, + { + "epoch": 0.02081549148349034, + "grad_norm": 3.2961854934692383, + "learning_rate": 4.994659521012034e-05, + "loss": 6.243, + "step": 3500 + }, + { + "epoch": 0.02082143876677134, + "grad_norm": 2.934246301651001, + "learning_rate": 4.99465646908944e-05, + "loss": 6.1307, + "step": 3501 + }, + { + "epoch": 0.020827386050052337, + "grad_norm": 3.9152729511260986, + "learning_rate": 4.994653416295987e-05, + "loss": 6.0167, + "step": 3502 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 4.510169506072998, + "learning_rate": 4.994650362631676e-05, + "loss": 6.533, + "step": 3503 + }, + { + "epoch": 0.02083928061661433, + "grad_norm": 3.415665864944458, + "learning_rate": 4.994647308096509e-05, + "loss": 6.4978, + "step": 3504 + }, + { + "epoch": 0.02084522789989533, + "grad_norm": 2.6515185832977295, + "learning_rate": 4.9946442526904856e-05, + "loss": 6.3859, + "step": 3505 + }, + { + "epoch": 0.020851175183176324, + "grad_norm": 2.8215248584747314, + "learning_rate": 4.994641196413609e-05, + "loss": 6.243, + "step": 3506 + }, + { + "epoch": 0.020857122466457323, + "grad_norm": 2.644529104232788, + "learning_rate": 4.9946381392658773e-05, + "loss": 6.2954, + "step": 3507 + }, + { + "epoch": 0.020863069749738318, + "grad_norm": 3.349699020385742, + "learning_rate": 4.994635081247294e-05, + "loss": 6.5617, + "step": 3508 + }, + { + "epoch": 0.020869017033019317, + "grad_norm": 3.3669090270996094, + "learning_rate": 4.9946320223578596e-05, + "loss": 6.6458, + "step": 3509 + }, + { + "epoch": 0.020874964316300315, + "grad_norm": 2.5562078952789307, + "learning_rate": 4.994628962597575e-05, + "loss": 5.5041, + "step": 3510 + }, + { + "epoch": 0.02088091159958131, + "grad_norm": 2.851809501647949, + "learning_rate": 4.994625901966441e-05, + "loss": 5.4607, + "step": 3511 + }, + { + "epoch": 0.02088685888286231, + "grad_norm": 3.2769458293914795, + "learning_rate": 4.994622840464458e-05, + "loss": 5.3115, + "step": 3512 + }, + { + "epoch": 0.020892806166143307, + "grad_norm": 2.5495102405548096, + "learning_rate": 4.994619778091629e-05, + "loss": 5.9997, + "step": 3513 + }, + { + "epoch": 0.020898753449424302, + "grad_norm": 2.609463930130005, + "learning_rate": 4.994616714847954e-05, + "loss": 6.562, + "step": 3514 + }, + { + "epoch": 0.0209047007327053, + "grad_norm": 2.5731685161590576, + "learning_rate": 4.994613650733433e-05, + "loss": 6.5341, + "step": 3515 + }, + { + "epoch": 0.020910648015986296, + "grad_norm": 2.481297254562378, + "learning_rate": 4.99461058574807e-05, + "loss": 6.5878, + "step": 3516 + }, + { + "epoch": 0.020916595299267295, + "grad_norm": 2.4096593856811523, + "learning_rate": 4.9946075198918624e-05, + "loss": 6.5054, + "step": 3517 + }, + { + "epoch": 0.020922542582548293, + "grad_norm": 2.4417459964752197, + "learning_rate": 4.994604453164814e-05, + "loss": 6.3292, + "step": 3518 + }, + { + "epoch": 0.020928489865829288, + "grad_norm": 2.7062435150146484, + "learning_rate": 4.994601385566925e-05, + "loss": 5.564, + "step": 3519 + }, + { + "epoch": 0.020934437149110287, + "grad_norm": 2.613614559173584, + "learning_rate": 4.9945983170981955e-05, + "loss": 5.3929, + "step": 3520 + }, + { + "epoch": 0.020940384432391285, + "grad_norm": 2.4933719635009766, + "learning_rate": 4.994595247758629e-05, + "loss": 6.1841, + "step": 3521 + }, + { + "epoch": 0.02094633171567228, + "grad_norm": 2.251507043838501, + "learning_rate": 4.994592177548224e-05, + "loss": 6.3109, + "step": 3522 + }, + { + "epoch": 0.02095227899895328, + "grad_norm": 2.3830223083496094, + "learning_rate": 4.994589106466983e-05, + "loss": 5.9421, + "step": 3523 + }, + { + "epoch": 0.020958226282234274, + "grad_norm": 2.2940196990966797, + "learning_rate": 4.994586034514906e-05, + "loss": 6.0858, + "step": 3524 + }, + { + "epoch": 0.020964173565515273, + "grad_norm": 2.916836977005005, + "learning_rate": 4.994582961691996e-05, + "loss": 5.166, + "step": 3525 + }, + { + "epoch": 0.02097012084879627, + "grad_norm": 2.7183029651641846, + "learning_rate": 4.994579887998252e-05, + "loss": 6.9732, + "step": 3526 + }, + { + "epoch": 0.020976068132077266, + "grad_norm": 2.70143985748291, + "learning_rate": 4.994576813433676e-05, + "loss": 5.917, + "step": 3527 + }, + { + "epoch": 0.020982015415358265, + "grad_norm": 2.7375986576080322, + "learning_rate": 4.994573737998269e-05, + "loss": 5.3025, + "step": 3528 + }, + { + "epoch": 0.02098796269863926, + "grad_norm": 2.656982183456421, + "learning_rate": 4.994570661692033e-05, + "loss": 5.2383, + "step": 3529 + }, + { + "epoch": 0.02099390998192026, + "grad_norm": 2.2119734287261963, + "learning_rate": 4.994567584514968e-05, + "loss": 6.0456, + "step": 3530 + }, + { + "epoch": 0.020999857265201257, + "grad_norm": 2.9191582202911377, + "learning_rate": 4.9945645064670737e-05, + "loss": 6.3808, + "step": 3531 + }, + { + "epoch": 0.021005804548482252, + "grad_norm": 3.124101400375366, + "learning_rate": 4.994561427548354e-05, + "loss": 5.3631, + "step": 3532 + }, + { + "epoch": 0.02101175183176325, + "grad_norm": 2.803938150405884, + "learning_rate": 4.994558347758808e-05, + "loss": 5.3172, + "step": 3533 + }, + { + "epoch": 0.02101769911504425, + "grad_norm": 2.6231577396392822, + "learning_rate": 4.994555267098438e-05, + "loss": 6.4466, + "step": 3534 + }, + { + "epoch": 0.021023646398325244, + "grad_norm": 2.735590696334839, + "learning_rate": 4.994552185567244e-05, + "loss": 5.3115, + "step": 3535 + }, + { + "epoch": 0.021029593681606243, + "grad_norm": 2.730459690093994, + "learning_rate": 4.994549103165228e-05, + "loss": 5.2311, + "step": 3536 + }, + { + "epoch": 0.021035540964887238, + "grad_norm": 2.1241424083709717, + "learning_rate": 4.994546019892391e-05, + "loss": 5.6599, + "step": 3537 + }, + { + "epoch": 0.021041488248168237, + "grad_norm": 2.607807159423828, + "learning_rate": 4.994542935748733e-05, + "loss": 6.1182, + "step": 3538 + }, + { + "epoch": 0.021047435531449235, + "grad_norm": 2.6896564960479736, + "learning_rate": 4.9945398507342567e-05, + "loss": 6.2827, + "step": 3539 + }, + { + "epoch": 0.02105338281473023, + "grad_norm": 2.9237961769104004, + "learning_rate": 4.994536764848962e-05, + "loss": 5.9629, + "step": 3540 + }, + { + "epoch": 0.02105933009801123, + "grad_norm": 2.7576143741607666, + "learning_rate": 4.99453367809285e-05, + "loss": 5.7612, + "step": 3541 + }, + { + "epoch": 0.021065277381292227, + "grad_norm": 3.1622097492218018, + "learning_rate": 4.9945305904659226e-05, + "loss": 6.0415, + "step": 3542 + }, + { + "epoch": 0.021071224664573222, + "grad_norm": 2.471127510070801, + "learning_rate": 4.994527501968179e-05, + "loss": 6.1264, + "step": 3543 + }, + { + "epoch": 0.02107717194785422, + "grad_norm": 2.797504425048828, + "learning_rate": 4.994524412599623e-05, + "loss": 6.3515, + "step": 3544 + }, + { + "epoch": 0.021083119231135216, + "grad_norm": 2.4932103157043457, + "learning_rate": 4.9945213223602535e-05, + "loss": 6.4327, + "step": 3545 + }, + { + "epoch": 0.021089066514416215, + "grad_norm": 2.5194599628448486, + "learning_rate": 4.9945182312500725e-05, + "loss": 6.4003, + "step": 3546 + }, + { + "epoch": 0.021095013797697213, + "grad_norm": 2.287858247756958, + "learning_rate": 4.9945151392690814e-05, + "loss": 6.3287, + "step": 3547 + }, + { + "epoch": 0.021100961080978208, + "grad_norm": 2.941619873046875, + "learning_rate": 4.994512046417281e-05, + "loss": 6.1364, + "step": 3548 + }, + { + "epoch": 0.021106908364259207, + "grad_norm": 3.1448967456817627, + "learning_rate": 4.994508952694672e-05, + "loss": 5.8638, + "step": 3549 + }, + { + "epoch": 0.021112855647540205, + "grad_norm": 2.869966983795166, + "learning_rate": 4.994505858101255e-05, + "loss": 6.0122, + "step": 3550 + }, + { + "epoch": 0.0211188029308212, + "grad_norm": 2.421264886856079, + "learning_rate": 4.9945027626370325e-05, + "loss": 6.1243, + "step": 3551 + }, + { + "epoch": 0.0211247502141022, + "grad_norm": 2.599456310272217, + "learning_rate": 4.9944996663020047e-05, + "loss": 5.9484, + "step": 3552 + }, + { + "epoch": 0.021130697497383194, + "grad_norm": 3.1029574871063232, + "learning_rate": 4.994496569096173e-05, + "loss": 5.9347, + "step": 3553 + }, + { + "epoch": 0.021136644780664193, + "grad_norm": 3.02494478225708, + "learning_rate": 4.994493471019538e-05, + "loss": 5.814, + "step": 3554 + }, + { + "epoch": 0.02114259206394519, + "grad_norm": 2.359682559967041, + "learning_rate": 4.994490372072101e-05, + "loss": 5.8533, + "step": 3555 + }, + { + "epoch": 0.021148539347226186, + "grad_norm": 2.7072582244873047, + "learning_rate": 4.994487272253864e-05, + "loss": 5.855, + "step": 3556 + }, + { + "epoch": 0.021154486630507185, + "grad_norm": 2.3102664947509766, + "learning_rate": 4.994484171564826e-05, + "loss": 5.6701, + "step": 3557 + }, + { + "epoch": 0.02116043391378818, + "grad_norm": 2.3804259300231934, + "learning_rate": 4.9944810700049906e-05, + "loss": 5.5096, + "step": 3558 + }, + { + "epoch": 0.02116638119706918, + "grad_norm": 2.463280439376831, + "learning_rate": 4.994477967574357e-05, + "loss": 5.5178, + "step": 3559 + }, + { + "epoch": 0.021172328480350177, + "grad_norm": 2.884152412414551, + "learning_rate": 4.9944748642729265e-05, + "loss": 6.1013, + "step": 3560 + }, + { + "epoch": 0.021178275763631172, + "grad_norm": 3.009460210800171, + "learning_rate": 4.9944717601007006e-05, + "loss": 6.2725, + "step": 3561 + }, + { + "epoch": 0.02118422304691217, + "grad_norm": 2.5930371284484863, + "learning_rate": 4.9944686550576814e-05, + "loss": 6.1138, + "step": 3562 + }, + { + "epoch": 0.02119017033019317, + "grad_norm": 2.8212878704071045, + "learning_rate": 4.9944655491438684e-05, + "loss": 5.6209, + "step": 3563 + }, + { + "epoch": 0.021196117613474164, + "grad_norm": 2.9814743995666504, + "learning_rate": 4.9944624423592634e-05, + "loss": 5.8912, + "step": 3564 + }, + { + "epoch": 0.021202064896755163, + "grad_norm": 3.1456093788146973, + "learning_rate": 4.994459334703867e-05, + "loss": 5.961, + "step": 3565 + }, + { + "epoch": 0.021208012180036158, + "grad_norm": 2.9300050735473633, + "learning_rate": 4.9944562261776805e-05, + "loss": 6.773, + "step": 3566 + }, + { + "epoch": 0.021213959463317156, + "grad_norm": 2.570685625076294, + "learning_rate": 4.994453116780705e-05, + "loss": 6.3575, + "step": 3567 + }, + { + "epoch": 0.021219906746598155, + "grad_norm": 2.7060914039611816, + "learning_rate": 4.994450006512943e-05, + "loss": 6.249, + "step": 3568 + }, + { + "epoch": 0.02122585402987915, + "grad_norm": 3.0027518272399902, + "learning_rate": 4.994446895374393e-05, + "loss": 5.8243, + "step": 3569 + }, + { + "epoch": 0.02123180131316015, + "grad_norm": 2.785888195037842, + "learning_rate": 4.994443783365058e-05, + "loss": 5.9836, + "step": 3570 + }, + { + "epoch": 0.021237748596441147, + "grad_norm": 2.5480010509490967, + "learning_rate": 4.994440670484938e-05, + "loss": 6.4237, + "step": 3571 + }, + { + "epoch": 0.021243695879722142, + "grad_norm": 2.687121629714966, + "learning_rate": 4.9944375567340345e-05, + "loss": 6.4497, + "step": 3572 + }, + { + "epoch": 0.02124964316300314, + "grad_norm": 2.6066362857818604, + "learning_rate": 4.994434442112349e-05, + "loss": 6.3853, + "step": 3573 + }, + { + "epoch": 0.021255590446284136, + "grad_norm": 2.880352020263672, + "learning_rate": 4.994431326619882e-05, + "loss": 6.382, + "step": 3574 + }, + { + "epoch": 0.021261537729565134, + "grad_norm": 3.0415213108062744, + "learning_rate": 4.9944282102566345e-05, + "loss": 6.4472, + "step": 3575 + }, + { + "epoch": 0.021267485012846133, + "grad_norm": 2.4917140007019043, + "learning_rate": 4.994425093022609e-05, + "loss": 6.2546, + "step": 3576 + }, + { + "epoch": 0.021273432296127128, + "grad_norm": 2.53648042678833, + "learning_rate": 4.9944219749178044e-05, + "loss": 6.37, + "step": 3577 + }, + { + "epoch": 0.021279379579408127, + "grad_norm": 2.796342134475708, + "learning_rate": 4.994418855942223e-05, + "loss": 6.1691, + "step": 3578 + }, + { + "epoch": 0.021285326862689125, + "grad_norm": 2.9148125648498535, + "learning_rate": 4.9944157360958656e-05, + "loss": 6.2552, + "step": 3579 + }, + { + "epoch": 0.02129127414597012, + "grad_norm": 3.0777838230133057, + "learning_rate": 4.994412615378734e-05, + "loss": 6.2359, + "step": 3580 + }, + { + "epoch": 0.02129722142925112, + "grad_norm": 2.5878093242645264, + "learning_rate": 4.994409493790828e-05, + "loss": 6.0746, + "step": 3581 + }, + { + "epoch": 0.021303168712532114, + "grad_norm": 3.2084906101226807, + "learning_rate": 4.99440637133215e-05, + "loss": 6.1357, + "step": 3582 + }, + { + "epoch": 0.021309115995813113, + "grad_norm": 3.7210965156555176, + "learning_rate": 4.9944032480027004e-05, + "loss": 6.5117, + "step": 3583 + }, + { + "epoch": 0.02131506327909411, + "grad_norm": 2.8332109451293945, + "learning_rate": 4.994400123802481e-05, + "loss": 6.0908, + "step": 3584 + }, + { + "epoch": 0.021321010562375106, + "grad_norm": 2.83854341506958, + "learning_rate": 4.994396998731491e-05, + "loss": 6.1522, + "step": 3585 + }, + { + "epoch": 0.021326957845656105, + "grad_norm": 2.5171611309051514, + "learning_rate": 4.9943938727897335e-05, + "loss": 6.2253, + "step": 3586 + }, + { + "epoch": 0.0213329051289371, + "grad_norm": 2.2111763954162598, + "learning_rate": 4.9943907459772086e-05, + "loss": 5.7673, + "step": 3587 + }, + { + "epoch": 0.0213388524122181, + "grad_norm": 2.5147926807403564, + "learning_rate": 4.994387618293918e-05, + "loss": 6.8327, + "step": 3588 + }, + { + "epoch": 0.021344799695499097, + "grad_norm": 2.969285488128662, + "learning_rate": 4.9943844897398626e-05, + "loss": 6.9995, + "step": 3589 + }, + { + "epoch": 0.021350746978780092, + "grad_norm": 4.00917911529541, + "learning_rate": 4.994381360315043e-05, + "loss": 6.6377, + "step": 3590 + }, + { + "epoch": 0.02135669426206109, + "grad_norm": 3.899319887161255, + "learning_rate": 4.994378230019461e-05, + "loss": 6.162, + "step": 3591 + }, + { + "epoch": 0.02136264154534209, + "grad_norm": 2.9522764682769775, + "learning_rate": 4.994375098853117e-05, + "loss": 6.4405, + "step": 3592 + }, + { + "epoch": 0.021368588828623084, + "grad_norm": 3.0569825172424316, + "learning_rate": 4.994371966816012e-05, + "loss": 6.2631, + "step": 3593 + }, + { + "epoch": 0.021374536111904083, + "grad_norm": 2.9470009803771973, + "learning_rate": 4.994368833908148e-05, + "loss": 6.4785, + "step": 3594 + }, + { + "epoch": 0.021380483395185078, + "grad_norm": 2.913940668106079, + "learning_rate": 4.994365700129525e-05, + "loss": 6.6566, + "step": 3595 + }, + { + "epoch": 0.021386430678466076, + "grad_norm": 2.6037404537200928, + "learning_rate": 4.9943625654801465e-05, + "loss": 6.2535, + "step": 3596 + }, + { + "epoch": 0.021392377961747075, + "grad_norm": 2.998276948928833, + "learning_rate": 4.99435942996001e-05, + "loss": 6.8851, + "step": 3597 + }, + { + "epoch": 0.02139832524502807, + "grad_norm": 2.2189996242523193, + "learning_rate": 4.994356293569119e-05, + "loss": 6.8707, + "step": 3598 + }, + { + "epoch": 0.02140427252830907, + "grad_norm": 2.4528486728668213, + "learning_rate": 4.994353156307474e-05, + "loss": 6.9166, + "step": 3599 + }, + { + "epoch": 0.021410219811590067, + "grad_norm": 3.0538241863250732, + "learning_rate": 4.994350018175076e-05, + "loss": 6.3258, + "step": 3600 + }, + { + "epoch": 0.021416167094871062, + "grad_norm": 3.789745569229126, + "learning_rate": 4.994346879171926e-05, + "loss": 6.1962, + "step": 3601 + }, + { + "epoch": 0.02142211437815206, + "grad_norm": 3.2789254188537598, + "learning_rate": 4.994343739298025e-05, + "loss": 6.2126, + "step": 3602 + }, + { + "epoch": 0.021428061661433056, + "grad_norm": 3.0887696743011475, + "learning_rate": 4.994340598553375e-05, + "loss": 6.2395, + "step": 3603 + }, + { + "epoch": 0.021434008944714054, + "grad_norm": 2.9189252853393555, + "learning_rate": 4.994337456937977e-05, + "loss": 6.193, + "step": 3604 + }, + { + "epoch": 0.021439956227995053, + "grad_norm": 2.8582170009613037, + "learning_rate": 4.9943343144518306e-05, + "loss": 6.1077, + "step": 3605 + }, + { + "epoch": 0.021445903511276048, + "grad_norm": 3.076979160308838, + "learning_rate": 4.994331171094938e-05, + "loss": 6.0474, + "step": 3606 + }, + { + "epoch": 0.021451850794557047, + "grad_norm": 3.482161045074463, + "learning_rate": 4.994328026867301e-05, + "loss": 6.0551, + "step": 3607 + }, + { + "epoch": 0.021457798077838045, + "grad_norm": 3.001046895980835, + "learning_rate": 4.994324881768919e-05, + "loss": 6.0393, + "step": 3608 + }, + { + "epoch": 0.02146374536111904, + "grad_norm": 2.8006365299224854, + "learning_rate": 4.994321735799794e-05, + "loss": 6.0042, + "step": 3609 + }, + { + "epoch": 0.02146969264440004, + "grad_norm": 3.10727858543396, + "learning_rate": 4.994318588959927e-05, + "loss": 5.8981, + "step": 3610 + }, + { + "epoch": 0.021475639927681034, + "grad_norm": 2.660557985305786, + "learning_rate": 4.9943154412493194e-05, + "loss": 6.0426, + "step": 3611 + }, + { + "epoch": 0.021481587210962032, + "grad_norm": 2.8504562377929688, + "learning_rate": 4.994312292667972e-05, + "loss": 6.9774, + "step": 3612 + }, + { + "epoch": 0.02148753449424303, + "grad_norm": 3.0076539516448975, + "learning_rate": 4.994309143215886e-05, + "loss": 6.3238, + "step": 3613 + }, + { + "epoch": 0.021493481777524026, + "grad_norm": 2.2966883182525635, + "learning_rate": 4.9943059928930626e-05, + "loss": 7.0015, + "step": 3614 + }, + { + "epoch": 0.021499429060805025, + "grad_norm": 2.5054080486297607, + "learning_rate": 4.994302841699502e-05, + "loss": 6.9226, + "step": 3615 + }, + { + "epoch": 0.021505376344086023, + "grad_norm": 2.856278657913208, + "learning_rate": 4.9942996896352066e-05, + "loss": 6.7836, + "step": 3616 + }, + { + "epoch": 0.02151132362736702, + "grad_norm": 2.4902377128601074, + "learning_rate": 4.994296536700177e-05, + "loss": 6.7077, + "step": 3617 + }, + { + "epoch": 0.021517270910648017, + "grad_norm": 2.477932929992676, + "learning_rate": 4.994293382894414e-05, + "loss": 6.8284, + "step": 3618 + }, + { + "epoch": 0.021523218193929012, + "grad_norm": 2.3034260272979736, + "learning_rate": 4.994290228217919e-05, + "loss": 6.8012, + "step": 3619 + }, + { + "epoch": 0.02152916547721001, + "grad_norm": 2.3850560188293457, + "learning_rate": 4.9942870726706934e-05, + "loss": 6.6208, + "step": 3620 + }, + { + "epoch": 0.02153511276049101, + "grad_norm": 2.4397644996643066, + "learning_rate": 4.994283916252738e-05, + "loss": 6.7522, + "step": 3621 + }, + { + "epoch": 0.021541060043772004, + "grad_norm": 2.400846242904663, + "learning_rate": 4.994280758964053e-05, + "loss": 6.7529, + "step": 3622 + }, + { + "epoch": 0.021547007327053003, + "grad_norm": 2.358290195465088, + "learning_rate": 4.994277600804641e-05, + "loss": 6.6812, + "step": 3623 + }, + { + "epoch": 0.021552954610333998, + "grad_norm": 2.7409300804138184, + "learning_rate": 4.994274441774503e-05, + "loss": 6.668, + "step": 3624 + }, + { + "epoch": 0.021558901893614996, + "grad_norm": 2.6890954971313477, + "learning_rate": 4.994271281873639e-05, + "loss": 6.5537, + "step": 3625 + }, + { + "epoch": 0.021564849176895995, + "grad_norm": 2.8959596157073975, + "learning_rate": 4.9942681211020505e-05, + "loss": 6.4492, + "step": 3626 + }, + { + "epoch": 0.02157079646017699, + "grad_norm": 2.4325244426727295, + "learning_rate": 4.994264959459738e-05, + "loss": 6.9819, + "step": 3627 + }, + { + "epoch": 0.02157674374345799, + "grad_norm": 2.92891263961792, + "learning_rate": 4.9942617969467045e-05, + "loss": 6.9266, + "step": 3628 + }, + { + "epoch": 0.021582691026738987, + "grad_norm": 2.4398467540740967, + "learning_rate": 4.994258633562951e-05, + "loss": 6.514, + "step": 3629 + }, + { + "epoch": 0.021588638310019982, + "grad_norm": 2.577467203140259, + "learning_rate": 4.9942554693084756e-05, + "loss": 6.7248, + "step": 3630 + }, + { + "epoch": 0.02159458559330098, + "grad_norm": 2.3682591915130615, + "learning_rate": 4.9942523041832824e-05, + "loss": 6.7798, + "step": 3631 + }, + { + "epoch": 0.021600532876581976, + "grad_norm": 2.1863434314727783, + "learning_rate": 4.9942491381873705e-05, + "loss": 6.6636, + "step": 3632 + }, + { + "epoch": 0.021606480159862974, + "grad_norm": 2.0172441005706787, + "learning_rate": 4.9942459713207426e-05, + "loss": 6.6772, + "step": 3633 + }, + { + "epoch": 0.021612427443143973, + "grad_norm": 1.8671952486038208, + "learning_rate": 4.9942428035834e-05, + "loss": 6.3648, + "step": 3634 + }, + { + "epoch": 0.021618374726424968, + "grad_norm": 3.226900815963745, + "learning_rate": 4.9942396349753416e-05, + "loss": 6.4127, + "step": 3635 + }, + { + "epoch": 0.021624322009705967, + "grad_norm": 2.7766973972320557, + "learning_rate": 4.994236465496571e-05, + "loss": 6.4476, + "step": 3636 + }, + { + "epoch": 0.021630269292986965, + "grad_norm": 2.157118082046509, + "learning_rate": 4.9942332951470875e-05, + "loss": 6.5876, + "step": 3637 + }, + { + "epoch": 0.02163621657626796, + "grad_norm": 2.3870396614074707, + "learning_rate": 4.994230123926893e-05, + "loss": 6.5861, + "step": 3638 + }, + { + "epoch": 0.02164216385954896, + "grad_norm": 2.8139939308166504, + "learning_rate": 4.994226951835989e-05, + "loss": 6.4845, + "step": 3639 + }, + { + "epoch": 0.021648111142829954, + "grad_norm": 2.856207847595215, + "learning_rate": 4.9942237788743764e-05, + "loss": 6.1514, + "step": 3640 + }, + { + "epoch": 0.021654058426110952, + "grad_norm": 3.523162603378296, + "learning_rate": 4.9942206050420545e-05, + "loss": 5.8114, + "step": 3641 + }, + { + "epoch": 0.02166000570939195, + "grad_norm": 2.746587038040161, + "learning_rate": 4.9942174303390274e-05, + "loss": 5.7397, + "step": 3642 + }, + { + "epoch": 0.021665952992672946, + "grad_norm": 2.902067184448242, + "learning_rate": 4.9942142547652946e-05, + "loss": 6.4353, + "step": 3643 + }, + { + "epoch": 0.021671900275953945, + "grad_norm": 2.981391191482544, + "learning_rate": 4.994211078320857e-05, + "loss": 6.2153, + "step": 3644 + }, + { + "epoch": 0.021677847559234943, + "grad_norm": 2.6004254817962646, + "learning_rate": 4.994207901005716e-05, + "loss": 6.2365, + "step": 3645 + }, + { + "epoch": 0.021683794842515938, + "grad_norm": 2.748678684234619, + "learning_rate": 4.994204722819873e-05, + "loss": 5.8126, + "step": 3646 + }, + { + "epoch": 0.021689742125796937, + "grad_norm": 2.675466299057007, + "learning_rate": 4.994201543763329e-05, + "loss": 6.3032, + "step": 3647 + }, + { + "epoch": 0.021695689409077932, + "grad_norm": 2.681823253631592, + "learning_rate": 4.9941983638360855e-05, + "loss": 6.2706, + "step": 3648 + }, + { + "epoch": 0.02170163669235893, + "grad_norm": 2.481586217880249, + "learning_rate": 4.994195183038142e-05, + "loss": 6.1792, + "step": 3649 + }, + { + "epoch": 0.02170758397563993, + "grad_norm": 2.3379831314086914, + "learning_rate": 4.9941920013695024e-05, + "loss": 6.2689, + "step": 3650 + }, + { + "epoch": 0.021713531258920924, + "grad_norm": 2.5885238647460938, + "learning_rate": 4.994188818830164e-05, + "loss": 6.3018, + "step": 3651 + }, + { + "epoch": 0.021719478542201923, + "grad_norm": 2.341939687728882, + "learning_rate": 4.994185635420131e-05, + "loss": 5.6178, + "step": 3652 + }, + { + "epoch": 0.021725425825482918, + "grad_norm": 2.4126031398773193, + "learning_rate": 4.9941824511394044e-05, + "loss": 5.4044, + "step": 3653 + }, + { + "epoch": 0.021731373108763916, + "grad_norm": 2.2289719581604004, + "learning_rate": 4.994179265987983e-05, + "loss": 5.4134, + "step": 3654 + }, + { + "epoch": 0.021737320392044915, + "grad_norm": 2.5151331424713135, + "learning_rate": 4.994176079965871e-05, + "loss": 5.3321, + "step": 3655 + }, + { + "epoch": 0.02174326767532591, + "grad_norm": 2.0761523246765137, + "learning_rate": 4.9941728930730665e-05, + "loss": 5.3363, + "step": 3656 + }, + { + "epoch": 0.02174921495860691, + "grad_norm": 2.272510051727295, + "learning_rate": 4.994169705309573e-05, + "loss": 6.0208, + "step": 3657 + }, + { + "epoch": 0.021755162241887907, + "grad_norm": 2.6145198345184326, + "learning_rate": 4.994166516675389e-05, + "loss": 6.299, + "step": 3658 + }, + { + "epoch": 0.021761109525168902, + "grad_norm": 2.978618621826172, + "learning_rate": 4.994163327170519e-05, + "loss": 5.1248, + "step": 3659 + }, + { + "epoch": 0.0217670568084499, + "grad_norm": 2.398813247680664, + "learning_rate": 4.994160136794962e-05, + "loss": 5.1217, + "step": 3660 + }, + { + "epoch": 0.021773004091730896, + "grad_norm": 2.1145291328430176, + "learning_rate": 4.994156945548719e-05, + "loss": 5.2676, + "step": 3661 + }, + { + "epoch": 0.021778951375011894, + "grad_norm": 2.045334577560425, + "learning_rate": 4.9941537534317915e-05, + "loss": 5.2088, + "step": 3662 + }, + { + "epoch": 0.021784898658292893, + "grad_norm": 2.0598506927490234, + "learning_rate": 4.9941505604441806e-05, + "loss": 5.363, + "step": 3663 + }, + { + "epoch": 0.021790845941573888, + "grad_norm": 2.189143657684326, + "learning_rate": 4.9941473665858884e-05, + "loss": 6.0592, + "step": 3664 + }, + { + "epoch": 0.021796793224854887, + "grad_norm": 6.8580780029296875, + "learning_rate": 4.994144171856915e-05, + "loss": 6.0323, + "step": 3665 + }, + { + "epoch": 0.021802740508135885, + "grad_norm": 2.0607001781463623, + "learning_rate": 4.994140976257261e-05, + "loss": 6.0883, + "step": 3666 + }, + { + "epoch": 0.02180868779141688, + "grad_norm": 2.1669631004333496, + "learning_rate": 4.9941377797869284e-05, + "loss": 6.0546, + "step": 3667 + }, + { + "epoch": 0.02181463507469788, + "grad_norm": 2.912822961807251, + "learning_rate": 4.994134582445917e-05, + "loss": 6.0285, + "step": 3668 + }, + { + "epoch": 0.021820582357978874, + "grad_norm": 2.3223111629486084, + "learning_rate": 4.994131384234231e-05, + "loss": 6.0948, + "step": 3669 + }, + { + "epoch": 0.021826529641259872, + "grad_norm": 2.067002296447754, + "learning_rate": 4.994128185151868e-05, + "loss": 6.2908, + "step": 3670 + }, + { + "epoch": 0.02183247692454087, + "grad_norm": 2.593642473220825, + "learning_rate": 4.9941249851988317e-05, + "loss": 6.2878, + "step": 3671 + }, + { + "epoch": 0.021838424207821866, + "grad_norm": 2.6345975399017334, + "learning_rate": 4.994121784375121e-05, + "loss": 6.0796, + "step": 3672 + }, + { + "epoch": 0.021844371491102865, + "grad_norm": 2.398861885070801, + "learning_rate": 4.994118582680739e-05, + "loss": 6.096, + "step": 3673 + }, + { + "epoch": 0.021850318774383863, + "grad_norm": 2.102933883666992, + "learning_rate": 4.994115380115686e-05, + "loss": 6.1347, + "step": 3674 + }, + { + "epoch": 0.021856266057664858, + "grad_norm": 2.43632435798645, + "learning_rate": 4.994112176679963e-05, + "loss": 6.074, + "step": 3675 + }, + { + "epoch": 0.021862213340945857, + "grad_norm": 2.304213523864746, + "learning_rate": 4.9941089723735706e-05, + "loss": 5.8897, + "step": 3676 + }, + { + "epoch": 0.021868160624226852, + "grad_norm": 2.6283092498779297, + "learning_rate": 4.9941057671965106e-05, + "loss": 5.9605, + "step": 3677 + }, + { + "epoch": 0.02187410790750785, + "grad_norm": 2.0781428813934326, + "learning_rate": 4.994102561148785e-05, + "loss": 6.0645, + "step": 3678 + }, + { + "epoch": 0.02188005519078885, + "grad_norm": 2.229210376739502, + "learning_rate": 4.994099354230393e-05, + "loss": 6.223, + "step": 3679 + }, + { + "epoch": 0.021886002474069844, + "grad_norm": 2.4410789012908936, + "learning_rate": 4.9940961464413374e-05, + "loss": 6.1115, + "step": 3680 + }, + { + "epoch": 0.021891949757350843, + "grad_norm": 2.99076771736145, + "learning_rate": 4.994092937781618e-05, + "loss": 5.9028, + "step": 3681 + }, + { + "epoch": 0.021897897040631838, + "grad_norm": 2.8403074741363525, + "learning_rate": 4.994089728251237e-05, + "loss": 5.7286, + "step": 3682 + }, + { + "epoch": 0.021903844323912836, + "grad_norm": 2.0928149223327637, + "learning_rate": 4.994086517850195e-05, + "loss": 5.849, + "step": 3683 + }, + { + "epoch": 0.021909791607193835, + "grad_norm": 2.320279836654663, + "learning_rate": 4.994083306578492e-05, + "loss": 5.6767, + "step": 3684 + }, + { + "epoch": 0.02191573889047483, + "grad_norm": 3.0701658725738525, + "learning_rate": 4.994080094436132e-05, + "loss": 5.9555, + "step": 3685 + }, + { + "epoch": 0.02192168617375583, + "grad_norm": 2.1042048931121826, + "learning_rate": 4.994076881423113e-05, + "loss": 5.7651, + "step": 3686 + }, + { + "epoch": 0.021927633457036827, + "grad_norm": 2.35819673538208, + "learning_rate": 4.9940736675394385e-05, + "loss": 6.0203, + "step": 3687 + }, + { + "epoch": 0.021933580740317822, + "grad_norm": 2.659224510192871, + "learning_rate": 4.994070452785108e-05, + "loss": 5.9935, + "step": 3688 + }, + { + "epoch": 0.02193952802359882, + "grad_norm": 2.4628207683563232, + "learning_rate": 4.994067237160124e-05, + "loss": 5.9135, + "step": 3689 + }, + { + "epoch": 0.021945475306879816, + "grad_norm": 3.7227911949157715, + "learning_rate": 4.9940640206644865e-05, + "loss": 5.8365, + "step": 3690 + }, + { + "epoch": 0.021951422590160814, + "grad_norm": 3.5226151943206787, + "learning_rate": 4.994060803298197e-05, + "loss": 5.7807, + "step": 3691 + }, + { + "epoch": 0.021957369873441813, + "grad_norm": 2.3665735721588135, + "learning_rate": 4.994057585061256e-05, + "loss": 5.9632, + "step": 3692 + }, + { + "epoch": 0.021963317156722808, + "grad_norm": 2.877263069152832, + "learning_rate": 4.9940543659536666e-05, + "loss": 5.6425, + "step": 3693 + }, + { + "epoch": 0.021969264440003806, + "grad_norm": 2.5431532859802246, + "learning_rate": 4.994051145975428e-05, + "loss": 5.6531, + "step": 3694 + }, + { + "epoch": 0.021975211723284805, + "grad_norm": 2.7033538818359375, + "learning_rate": 4.9940479251265415e-05, + "loss": 5.6907, + "step": 3695 + }, + { + "epoch": 0.0219811590065658, + "grad_norm": 3.6627206802368164, + "learning_rate": 4.9940447034070093e-05, + "loss": 5.9118, + "step": 3696 + }, + { + "epoch": 0.0219871062898468, + "grad_norm": 3.896959066390991, + "learning_rate": 4.994041480816831e-05, + "loss": 5.9926, + "step": 3697 + }, + { + "epoch": 0.021993053573127794, + "grad_norm": 3.37575626373291, + "learning_rate": 4.994038257356009e-05, + "loss": 5.9768, + "step": 3698 + }, + { + "epoch": 0.021999000856408792, + "grad_norm": 2.7694313526153564, + "learning_rate": 4.9940350330245444e-05, + "loss": 5.8486, + "step": 3699 + }, + { + "epoch": 0.02200494813968979, + "grad_norm": 2.3815293312072754, + "learning_rate": 4.9940318078224376e-05, + "loss": 6.0663, + "step": 3700 + }, + { + "epoch": 0.022010895422970786, + "grad_norm": 2.3171627521514893, + "learning_rate": 4.99402858174969e-05, + "loss": 5.8543, + "step": 3701 + }, + { + "epoch": 0.022016842706251784, + "grad_norm": 2.5090551376342773, + "learning_rate": 4.994025354806303e-05, + "loss": 5.7005, + "step": 3702 + }, + { + "epoch": 0.022022789989532783, + "grad_norm": 2.7024855613708496, + "learning_rate": 4.9940221269922774e-05, + "loss": 5.7375, + "step": 3703 + }, + { + "epoch": 0.022028737272813778, + "grad_norm": 2.7900679111480713, + "learning_rate": 4.994018898307614e-05, + "loss": 6.0094, + "step": 3704 + }, + { + "epoch": 0.022034684556094777, + "grad_norm": 2.3678438663482666, + "learning_rate": 4.994015668752315e-05, + "loss": 5.822, + "step": 3705 + }, + { + "epoch": 0.022040631839375772, + "grad_norm": 2.5406653881073, + "learning_rate": 4.9940124383263807e-05, + "loss": 5.8984, + "step": 3706 + }, + { + "epoch": 0.02204657912265677, + "grad_norm": 2.371800422668457, + "learning_rate": 4.994009207029813e-05, + "loss": 5.9821, + "step": 3707 + }, + { + "epoch": 0.02205252640593777, + "grad_norm": 2.004669666290283, + "learning_rate": 4.994005974862612e-05, + "loss": 5.8801, + "step": 3708 + }, + { + "epoch": 0.022058473689218764, + "grad_norm": 2.777472972869873, + "learning_rate": 4.9940027418247787e-05, + "loss": 5.8821, + "step": 3709 + }, + { + "epoch": 0.022064420972499763, + "grad_norm": 2.599883556365967, + "learning_rate": 4.9939995079163156e-05, + "loss": 5.8716, + "step": 3710 + }, + { + "epoch": 0.022070368255780758, + "grad_norm": 2.5891127586364746, + "learning_rate": 4.993996273137223e-05, + "loss": 5.7607, + "step": 3711 + }, + { + "epoch": 0.022076315539061756, + "grad_norm": 2.3737518787384033, + "learning_rate": 4.993993037487501e-05, + "loss": 5.7825, + "step": 3712 + }, + { + "epoch": 0.022082262822342755, + "grad_norm": 2.421785831451416, + "learning_rate": 4.9939898009671524e-05, + "loss": 5.7143, + "step": 3713 + }, + { + "epoch": 0.02208821010562375, + "grad_norm": 2.4267804622650146, + "learning_rate": 4.9939865635761785e-05, + "loss": 5.8031, + "step": 3714 + }, + { + "epoch": 0.02209415738890475, + "grad_norm": 2.390333414077759, + "learning_rate": 4.993983325314579e-05, + "loss": 5.7985, + "step": 3715 + }, + { + "epoch": 0.022100104672185747, + "grad_norm": 2.2265970706939697, + "learning_rate": 4.993980086182356e-05, + "loss": 5.6261, + "step": 3716 + }, + { + "epoch": 0.022106051955466742, + "grad_norm": 2.3872458934783936, + "learning_rate": 4.99397684617951e-05, + "loss": 5.8185, + "step": 3717 + }, + { + "epoch": 0.02211199923874774, + "grad_norm": 2.077075958251953, + "learning_rate": 4.9939736053060425e-05, + "loss": 5.6252, + "step": 3718 + }, + { + "epoch": 0.022117946522028736, + "grad_norm": 2.0642287731170654, + "learning_rate": 4.993970363561954e-05, + "loss": 5.8034, + "step": 3719 + }, + { + "epoch": 0.022123893805309734, + "grad_norm": 3.5353951454162598, + "learning_rate": 4.9939671209472474e-05, + "loss": 6.7808, + "step": 3720 + }, + { + "epoch": 0.022129841088590733, + "grad_norm": 2.910531520843506, + "learning_rate": 4.9939638774619216e-05, + "loss": 5.9323, + "step": 3721 + }, + { + "epoch": 0.022135788371871728, + "grad_norm": 2.7450106143951416, + "learning_rate": 4.9939606331059794e-05, + "loss": 5.9926, + "step": 3722 + }, + { + "epoch": 0.022141735655152726, + "grad_norm": 2.7628188133239746, + "learning_rate": 4.993957387879421e-05, + "loss": 5.9129, + "step": 3723 + }, + { + "epoch": 0.022147682938433725, + "grad_norm": 2.6644890308380127, + "learning_rate": 4.9939541417822485e-05, + "loss": 5.7038, + "step": 3724 + }, + { + "epoch": 0.02215363022171472, + "grad_norm": 2.143744707107544, + "learning_rate": 4.993950894814461e-05, + "loss": 5.5821, + "step": 3725 + }, + { + "epoch": 0.02215957750499572, + "grad_norm": 2.1691160202026367, + "learning_rate": 4.993947646976063e-05, + "loss": 5.5929, + "step": 3726 + }, + { + "epoch": 0.022165524788276714, + "grad_norm": 2.1479709148406982, + "learning_rate": 4.993944398267052e-05, + "loss": 5.6653, + "step": 3727 + }, + { + "epoch": 0.022171472071557712, + "grad_norm": 2.7749600410461426, + "learning_rate": 4.993941148687431e-05, + "loss": 5.5682, + "step": 3728 + }, + { + "epoch": 0.02217741935483871, + "grad_norm": 2.668672561645508, + "learning_rate": 4.993937898237201e-05, + "loss": 5.5968, + "step": 3729 + }, + { + "epoch": 0.022183366638119706, + "grad_norm": 2.3903374671936035, + "learning_rate": 4.993934646916364e-05, + "loss": 5.7541, + "step": 3730 + }, + { + "epoch": 0.022189313921400704, + "grad_norm": 1.8555344343185425, + "learning_rate": 4.993931394724919e-05, + "loss": 5.5449, + "step": 3731 + }, + { + "epoch": 0.022195261204681703, + "grad_norm": 2.1140637397766113, + "learning_rate": 4.993928141662869e-05, + "loss": 5.8201, + "step": 3732 + }, + { + "epoch": 0.022201208487962698, + "grad_norm": 2.221573829650879, + "learning_rate": 4.993924887730213e-05, + "loss": 5.7583, + "step": 3733 + }, + { + "epoch": 0.022207155771243697, + "grad_norm": 2.0801634788513184, + "learning_rate": 4.993921632926956e-05, + "loss": 5.7083, + "step": 3734 + }, + { + "epoch": 0.02221310305452469, + "grad_norm": 2.0167016983032227, + "learning_rate": 4.993918377253095e-05, + "loss": 5.7798, + "step": 3735 + }, + { + "epoch": 0.02221905033780569, + "grad_norm": 2.104529619216919, + "learning_rate": 4.993915120708634e-05, + "loss": 5.7346, + "step": 3736 + }, + { + "epoch": 0.02222499762108669, + "grad_norm": 2.0807201862335205, + "learning_rate": 4.993911863293572e-05, + "loss": 5.7663, + "step": 3737 + }, + { + "epoch": 0.022230944904367684, + "grad_norm": 1.9223891496658325, + "learning_rate": 4.9939086050079115e-05, + "loss": 5.648, + "step": 3738 + }, + { + "epoch": 0.022236892187648682, + "grad_norm": 2.3831584453582764, + "learning_rate": 4.9939053458516535e-05, + "loss": 5.7988, + "step": 3739 + }, + { + "epoch": 0.02224283947092968, + "grad_norm": 2.433318853378296, + "learning_rate": 4.993902085824799e-05, + "loss": 5.7794, + "step": 3740 + }, + { + "epoch": 0.022248786754210676, + "grad_norm": 2.2488365173339844, + "learning_rate": 4.993898824927348e-05, + "loss": 5.7332, + "step": 3741 + }, + { + "epoch": 0.022254734037491675, + "grad_norm": 2.2924392223358154, + "learning_rate": 4.993895563159303e-05, + "loss": 5.8977, + "step": 3742 + }, + { + "epoch": 0.02226068132077267, + "grad_norm": 2.1601176261901855, + "learning_rate": 4.9938923005206664e-05, + "loss": 5.8588, + "step": 3743 + }, + { + "epoch": 0.02226662860405367, + "grad_norm": 2.256439447402954, + "learning_rate": 4.993889037011436e-05, + "loss": 5.6111, + "step": 3744 + }, + { + "epoch": 0.022272575887334667, + "grad_norm": 2.184950828552246, + "learning_rate": 4.993885772631615e-05, + "loss": 5.7544, + "step": 3745 + }, + { + "epoch": 0.022278523170615662, + "grad_norm": 2.250422716140747, + "learning_rate": 4.993882507381205e-05, + "loss": 5.6534, + "step": 3746 + }, + { + "epoch": 0.02228447045389666, + "grad_norm": 2.473811626434326, + "learning_rate": 4.9938792412602056e-05, + "loss": 5.5699, + "step": 3747 + }, + { + "epoch": 0.022290417737177656, + "grad_norm": 2.2859978675842285, + "learning_rate": 4.993875974268619e-05, + "loss": 5.8712, + "step": 3748 + }, + { + "epoch": 0.022296365020458654, + "grad_norm": 2.4002318382263184, + "learning_rate": 4.993872706406446e-05, + "loss": 5.8121, + "step": 3749 + }, + { + "epoch": 0.022302312303739653, + "grad_norm": 2.2692153453826904, + "learning_rate": 4.9938694376736884e-05, + "loss": 5.5516, + "step": 3750 + }, + { + "epoch": 0.022308259587020648, + "grad_norm": 2.1874892711639404, + "learning_rate": 4.9938661680703456e-05, + "loss": 5.8264, + "step": 3751 + }, + { + "epoch": 0.022314206870301646, + "grad_norm": 2.3802871704101562, + "learning_rate": 4.993862897596421e-05, + "loss": 5.6523, + "step": 3752 + }, + { + "epoch": 0.022320154153582645, + "grad_norm": 2.514646530151367, + "learning_rate": 4.9938596262519145e-05, + "loss": 5.5193, + "step": 3753 + }, + { + "epoch": 0.02232610143686364, + "grad_norm": 2.3175413608551025, + "learning_rate": 4.993856354036827e-05, + "loss": 5.5372, + "step": 3754 + }, + { + "epoch": 0.02233204872014464, + "grad_norm": 2.2071855068206787, + "learning_rate": 4.9938530809511595e-05, + "loss": 5.5002, + "step": 3755 + }, + { + "epoch": 0.022337996003425634, + "grad_norm": 2.046440839767456, + "learning_rate": 4.9938498069949144e-05, + "loss": 5.585, + "step": 3756 + }, + { + "epoch": 0.022343943286706632, + "grad_norm": 2.3971145153045654, + "learning_rate": 4.9938465321680915e-05, + "loss": 5.7858, + "step": 3757 + }, + { + "epoch": 0.02234989056998763, + "grad_norm": 2.462597131729126, + "learning_rate": 4.9938432564706936e-05, + "loss": 5.5606, + "step": 3758 + }, + { + "epoch": 0.022355837853268626, + "grad_norm": 2.3134138584136963, + "learning_rate": 4.99383997990272e-05, + "loss": 5.4587, + "step": 3759 + }, + { + "epoch": 0.022361785136549624, + "grad_norm": 2.137929916381836, + "learning_rate": 4.993836702464173e-05, + "loss": 5.4768, + "step": 3760 + }, + { + "epoch": 0.022367732419830623, + "grad_norm": 2.647691011428833, + "learning_rate": 4.993833424155053e-05, + "loss": 5.7902, + "step": 3761 + }, + { + "epoch": 0.022373679703111618, + "grad_norm": 2.535640239715576, + "learning_rate": 4.993830144975361e-05, + "loss": 5.8263, + "step": 3762 + }, + { + "epoch": 0.022379626986392617, + "grad_norm": 2.422997236251831, + "learning_rate": 4.9938268649251e-05, + "loss": 5.7751, + "step": 3763 + }, + { + "epoch": 0.02238557426967361, + "grad_norm": 2.6906728744506836, + "learning_rate": 4.9938235840042694e-05, + "loss": 5.5974, + "step": 3764 + }, + { + "epoch": 0.02239152155295461, + "grad_norm": 2.0284483432769775, + "learning_rate": 4.99382030221287e-05, + "loss": 5.6816, + "step": 3765 + }, + { + "epoch": 0.02239746883623561, + "grad_norm": 2.6392064094543457, + "learning_rate": 4.9938170195509035e-05, + "loss": 5.9052, + "step": 3766 + }, + { + "epoch": 0.022403416119516604, + "grad_norm": 2.6770617961883545, + "learning_rate": 4.993813736018372e-05, + "loss": 5.9041, + "step": 3767 + }, + { + "epoch": 0.022409363402797602, + "grad_norm": 2.5972392559051514, + "learning_rate": 4.993810451615276e-05, + "loss": 5.7834, + "step": 3768 + }, + { + "epoch": 0.0224153106860786, + "grad_norm": 2.0095736980438232, + "learning_rate": 4.993807166341616e-05, + "loss": 5.6074, + "step": 3769 + }, + { + "epoch": 0.022421257969359596, + "grad_norm": 2.412578582763672, + "learning_rate": 4.9938038801973945e-05, + "loss": 5.742, + "step": 3770 + }, + { + "epoch": 0.022427205252640595, + "grad_norm": 2.1285388469696045, + "learning_rate": 4.993800593182612e-05, + "loss": 5.7665, + "step": 3771 + }, + { + "epoch": 0.02243315253592159, + "grad_norm": 2.091252326965332, + "learning_rate": 4.993797305297268e-05, + "loss": 5.7165, + "step": 3772 + }, + { + "epoch": 0.022439099819202588, + "grad_norm": 2.5366342067718506, + "learning_rate": 4.993794016541367e-05, + "loss": 6.259, + "step": 3773 + }, + { + "epoch": 0.022445047102483587, + "grad_norm": 2.2637953758239746, + "learning_rate": 4.9937907269149063e-05, + "loss": 6.2132, + "step": 3774 + }, + { + "epoch": 0.022450994385764582, + "grad_norm": 2.570979595184326, + "learning_rate": 4.99378743641789e-05, + "loss": 5.9656, + "step": 3775 + }, + { + "epoch": 0.02245694166904558, + "grad_norm": 2.0587873458862305, + "learning_rate": 4.993784145050319e-05, + "loss": 5.7096, + "step": 3776 + }, + { + "epoch": 0.022462888952326576, + "grad_norm": 2.396812677383423, + "learning_rate": 4.993780852812192e-05, + "loss": 5.7258, + "step": 3777 + }, + { + "epoch": 0.022468836235607574, + "grad_norm": 2.081541061401367, + "learning_rate": 4.993777559703513e-05, + "loss": 5.6777, + "step": 3778 + }, + { + "epoch": 0.022474783518888573, + "grad_norm": 2.5242559909820557, + "learning_rate": 4.993774265724281e-05, + "loss": 5.961, + "step": 3779 + }, + { + "epoch": 0.022480730802169568, + "grad_norm": 2.4249329566955566, + "learning_rate": 4.993770970874499e-05, + "loss": 6.0494, + "step": 3780 + }, + { + "epoch": 0.022486678085450566, + "grad_norm": 2.7482552528381348, + "learning_rate": 4.993767675154169e-05, + "loss": 5.7579, + "step": 3781 + }, + { + "epoch": 0.022492625368731565, + "grad_norm": 4.115204811096191, + "learning_rate": 4.993764378563288e-05, + "loss": 6.3891, + "step": 3782 + }, + { + "epoch": 0.02249857265201256, + "grad_norm": 2.51346755027771, + "learning_rate": 4.99376108110186e-05, + "loss": 5.7982, + "step": 3783 + }, + { + "epoch": 0.02250451993529356, + "grad_norm": 2.2737278938293457, + "learning_rate": 4.993757782769887e-05, + "loss": 5.7576, + "step": 3784 + }, + { + "epoch": 0.022510467218574554, + "grad_norm": 2.2068402767181396, + "learning_rate": 4.9937544835673674e-05, + "loss": 5.9801, + "step": 3785 + }, + { + "epoch": 0.022516414501855552, + "grad_norm": 1.8548356294631958, + "learning_rate": 4.993751183494305e-05, + "loss": 6.2054, + "step": 3786 + }, + { + "epoch": 0.02252236178513655, + "grad_norm": 2.3499045372009277, + "learning_rate": 4.993747882550699e-05, + "loss": 6.0694, + "step": 3787 + }, + { + "epoch": 0.022528309068417546, + "grad_norm": 2.2253386974334717, + "learning_rate": 4.993744580736552e-05, + "loss": 5.709, + "step": 3788 + }, + { + "epoch": 0.022534256351698544, + "grad_norm": 2.1136696338653564, + "learning_rate": 4.993741278051864e-05, + "loss": 5.9546, + "step": 3789 + }, + { + "epoch": 0.022540203634979543, + "grad_norm": 1.8777605295181274, + "learning_rate": 4.9937379744966375e-05, + "loss": 5.7587, + "step": 3790 + }, + { + "epoch": 0.022546150918260538, + "grad_norm": 2.527571201324463, + "learning_rate": 4.9937346700708723e-05, + "loss": 5.0992, + "step": 3791 + }, + { + "epoch": 0.022552098201541537, + "grad_norm": 2.515805244445801, + "learning_rate": 4.99373136477457e-05, + "loss": 4.9766, + "step": 3792 + }, + { + "epoch": 0.02255804548482253, + "grad_norm": 2.442979574203491, + "learning_rate": 4.9937280586077315e-05, + "loss": 5.0981, + "step": 3793 + }, + { + "epoch": 0.02256399276810353, + "grad_norm": 2.575383424758911, + "learning_rate": 4.993724751570359e-05, + "loss": 5.0809, + "step": 3794 + }, + { + "epoch": 0.02256994005138453, + "grad_norm": 2.0855023860931396, + "learning_rate": 4.9937214436624524e-05, + "loss": 5.5744, + "step": 3795 + }, + { + "epoch": 0.022575887334665524, + "grad_norm": 2.237565040588379, + "learning_rate": 4.993718134884013e-05, + "loss": 5.6796, + "step": 3796 + }, + { + "epoch": 0.022581834617946522, + "grad_norm": 2.5895159244537354, + "learning_rate": 4.993714825235044e-05, + "loss": 5.2068, + "step": 3797 + }, + { + "epoch": 0.02258778190122752, + "grad_norm": 2.1277096271514893, + "learning_rate": 4.993711514715544e-05, + "loss": 5.5588, + "step": 3798 + }, + { + "epoch": 0.022593729184508516, + "grad_norm": 2.7074246406555176, + "learning_rate": 4.993708203325515e-05, + "loss": 5.0104, + "step": 3799 + }, + { + "epoch": 0.022599676467789515, + "grad_norm": 2.114569664001465, + "learning_rate": 4.993704891064958e-05, + "loss": 5.0453, + "step": 3800 + }, + { + "epoch": 0.02260562375107051, + "grad_norm": 2.4222404956817627, + "learning_rate": 4.9937015779338746e-05, + "loss": 5.3799, + "step": 3801 + }, + { + "epoch": 0.022611571034351508, + "grad_norm": 2.238755941390991, + "learning_rate": 4.993698263932266e-05, + "loss": 5.0075, + "step": 3802 + }, + { + "epoch": 0.022617518317632507, + "grad_norm": 2.0748255252838135, + "learning_rate": 4.993694949060133e-05, + "loss": 5.0007, + "step": 3803 + }, + { + "epoch": 0.022623465600913502, + "grad_norm": 2.1528635025024414, + "learning_rate": 4.993691633317477e-05, + "loss": 5.1048, + "step": 3804 + }, + { + "epoch": 0.0226294128841945, + "grad_norm": 2.0237200260162354, + "learning_rate": 4.993688316704298e-05, + "loss": 5.1465, + "step": 3805 + }, + { + "epoch": 0.022635360167475495, + "grad_norm": 2.2698304653167725, + "learning_rate": 4.993684999220599e-05, + "loss": 4.9642, + "step": 3806 + }, + { + "epoch": 0.022641307450756494, + "grad_norm": 2.7863757610321045, + "learning_rate": 4.993681680866381e-05, + "loss": 5.6277, + "step": 3807 + }, + { + "epoch": 0.022647254734037493, + "grad_norm": 2.394087553024292, + "learning_rate": 4.9936783616416436e-05, + "loss": 6.0895, + "step": 3808 + }, + { + "epoch": 0.022653202017318488, + "grad_norm": 2.8036317825317383, + "learning_rate": 4.993675041546389e-05, + "loss": 6.2002, + "step": 3809 + }, + { + "epoch": 0.022659149300599486, + "grad_norm": 2.4970054626464844, + "learning_rate": 4.993671720580618e-05, + "loss": 5.5114, + "step": 3810 + }, + { + "epoch": 0.022665096583880485, + "grad_norm": 3.2434241771698, + "learning_rate": 4.993668398744332e-05, + "loss": 5.0366, + "step": 3811 + }, + { + "epoch": 0.02267104386716148, + "grad_norm": 2.707104206085205, + "learning_rate": 4.9936650760375326e-05, + "loss": 5.5132, + "step": 3812 + }, + { + "epoch": 0.02267699115044248, + "grad_norm": 2.540231466293335, + "learning_rate": 4.9936617524602204e-05, + "loss": 5.8026, + "step": 3813 + }, + { + "epoch": 0.022682938433723474, + "grad_norm": 2.8549184799194336, + "learning_rate": 4.993658428012397e-05, + "loss": 6.0854, + "step": 3814 + }, + { + "epoch": 0.022688885717004472, + "grad_norm": 2.5972952842712402, + "learning_rate": 4.993655102694062e-05, + "loss": 5.8055, + "step": 3815 + }, + { + "epoch": 0.02269483300028547, + "grad_norm": 3.1625113487243652, + "learning_rate": 4.9936517765052184e-05, + "loss": 5.9683, + "step": 3816 + }, + { + "epoch": 0.022700780283566466, + "grad_norm": 3.239820718765259, + "learning_rate": 4.993648449445867e-05, + "loss": 5.9725, + "step": 3817 + }, + { + "epoch": 0.022706727566847464, + "grad_norm": 2.9632809162139893, + "learning_rate": 4.993645121516008e-05, + "loss": 5.9767, + "step": 3818 + }, + { + "epoch": 0.022712674850128463, + "grad_norm": 2.7486021518707275, + "learning_rate": 4.9936417927156435e-05, + "loss": 6.3471, + "step": 3819 + }, + { + "epoch": 0.022718622133409458, + "grad_norm": 3.8044490814208984, + "learning_rate": 4.993638463044775e-05, + "loss": 6.1275, + "step": 3820 + }, + { + "epoch": 0.022724569416690456, + "grad_norm": 4.851193428039551, + "learning_rate": 4.9936351325034024e-05, + "loss": 5.6658, + "step": 3821 + }, + { + "epoch": 0.02273051669997145, + "grad_norm": 3.1302716732025146, + "learning_rate": 4.993631801091528e-05, + "loss": 5.5256, + "step": 3822 + }, + { + "epoch": 0.02273646398325245, + "grad_norm": 5.310885906219482, + "learning_rate": 4.9936284688091526e-05, + "loss": 5.4771, + "step": 3823 + }, + { + "epoch": 0.02274241126653345, + "grad_norm": 5.493198394775391, + "learning_rate": 4.9936251356562765e-05, + "loss": 6.0993, + "step": 3824 + }, + { + "epoch": 0.022748358549814444, + "grad_norm": 3.5346286296844482, + "learning_rate": 4.993621801632902e-05, + "loss": 6.6862, + "step": 3825 + }, + { + "epoch": 0.022754305833095442, + "grad_norm": 4.550736904144287, + "learning_rate": 4.9936184667390304e-05, + "loss": 6.5658, + "step": 3826 + }, + { + "epoch": 0.02276025311637644, + "grad_norm": 3.3957576751708984, + "learning_rate": 4.993615130974662e-05, + "loss": 6.0596, + "step": 3827 + }, + { + "epoch": 0.022766200399657436, + "grad_norm": 2.614089012145996, + "learning_rate": 4.993611794339798e-05, + "loss": 6.77, + "step": 3828 + }, + { + "epoch": 0.022772147682938434, + "grad_norm": 3.712106704711914, + "learning_rate": 4.99360845683444e-05, + "loss": 6.4084, + "step": 3829 + }, + { + "epoch": 0.02277809496621943, + "grad_norm": 3.7331995964050293, + "learning_rate": 4.99360511845859e-05, + "loss": 6.2627, + "step": 3830 + }, + { + "epoch": 0.022784042249500428, + "grad_norm": 3.8898067474365234, + "learning_rate": 4.993601779212247e-05, + "loss": 6.6476, + "step": 3831 + }, + { + "epoch": 0.022789989532781427, + "grad_norm": 2.829078435897827, + "learning_rate": 4.9935984390954136e-05, + "loss": 6.2307, + "step": 3832 + }, + { + "epoch": 0.022795936816062422, + "grad_norm": 3.467954635620117, + "learning_rate": 4.9935950981080906e-05, + "loss": 6.5283, + "step": 3833 + }, + { + "epoch": 0.02280188409934342, + "grad_norm": 2.317840099334717, + "learning_rate": 4.99359175625028e-05, + "loss": 6.4549, + "step": 3834 + }, + { + "epoch": 0.02280783138262442, + "grad_norm": 2.7261998653411865, + "learning_rate": 4.9935884135219825e-05, + "loss": 6.2049, + "step": 3835 + }, + { + "epoch": 0.022813778665905414, + "grad_norm": 2.623098373413086, + "learning_rate": 4.993585069923198e-05, + "loss": 6.3847, + "step": 3836 + }, + { + "epoch": 0.022819725949186413, + "grad_norm": 2.4825377464294434, + "learning_rate": 4.993581725453929e-05, + "loss": 6.3532, + "step": 3837 + }, + { + "epoch": 0.022825673232467408, + "grad_norm": 2.278151750564575, + "learning_rate": 4.993578380114176e-05, + "loss": 5.8885, + "step": 3838 + }, + { + "epoch": 0.022831620515748406, + "grad_norm": 2.045839548110962, + "learning_rate": 4.9935750339039425e-05, + "loss": 6.6852, + "step": 3839 + }, + { + "epoch": 0.022837567799029405, + "grad_norm": 2.4009597301483154, + "learning_rate": 4.993571686823226e-05, + "loss": 6.1676, + "step": 3840 + }, + { + "epoch": 0.0228435150823104, + "grad_norm": 2.759819507598877, + "learning_rate": 4.9935683388720296e-05, + "loss": 6.3913, + "step": 3841 + }, + { + "epoch": 0.0228494623655914, + "grad_norm": 2.798785924911499, + "learning_rate": 4.9935649900503546e-05, + "loss": 6.8169, + "step": 3842 + }, + { + "epoch": 0.022855409648872393, + "grad_norm": 2.389890432357788, + "learning_rate": 4.9935616403582015e-05, + "loss": 6.7506, + "step": 3843 + }, + { + "epoch": 0.022861356932153392, + "grad_norm": 2.882474184036255, + "learning_rate": 4.9935582897955715e-05, + "loss": 6.2458, + "step": 3844 + }, + { + "epoch": 0.02286730421543439, + "grad_norm": 2.2487478256225586, + "learning_rate": 4.993554938362467e-05, + "loss": 6.7296, + "step": 3845 + }, + { + "epoch": 0.022873251498715386, + "grad_norm": 1.9563521146774292, + "learning_rate": 4.993551586058888e-05, + "loss": 6.6878, + "step": 3846 + }, + { + "epoch": 0.022879198781996384, + "grad_norm": 7.555780410766602, + "learning_rate": 4.993548232884835e-05, + "loss": 6.3309, + "step": 3847 + }, + { + "epoch": 0.022885146065277383, + "grad_norm": 2.2573931217193604, + "learning_rate": 4.99354487884031e-05, + "loss": 6.3384, + "step": 3848 + }, + { + "epoch": 0.022891093348558378, + "grad_norm": 2.063267946243286, + "learning_rate": 4.993541523925316e-05, + "loss": 6.2342, + "step": 3849 + }, + { + "epoch": 0.022897040631839376, + "grad_norm": 2.1032445430755615, + "learning_rate": 4.9935381681398505e-05, + "loss": 6.5458, + "step": 3850 + }, + { + "epoch": 0.02290298791512037, + "grad_norm": 2.233400583267212, + "learning_rate": 4.9935348114839176e-05, + "loss": 6.46, + "step": 3851 + }, + { + "epoch": 0.02290893519840137, + "grad_norm": 2.069182872772217, + "learning_rate": 4.9935314539575174e-05, + "loss": 6.4829, + "step": 3852 + }, + { + "epoch": 0.02291488248168237, + "grad_norm": 1.9986059665679932, + "learning_rate": 4.993528095560651e-05, + "loss": 6.4651, + "step": 3853 + }, + { + "epoch": 0.022920829764963364, + "grad_norm": 2.0529284477233887, + "learning_rate": 4.99352473629332e-05, + "loss": 6.1151, + "step": 3854 + }, + { + "epoch": 0.022926777048244362, + "grad_norm": 1.9643630981445312, + "learning_rate": 4.993521376155525e-05, + "loss": 5.991, + "step": 3855 + }, + { + "epoch": 0.02293272433152536, + "grad_norm": 2.2183501720428467, + "learning_rate": 4.9935180151472674e-05, + "loss": 6.8568, + "step": 3856 + }, + { + "epoch": 0.022938671614806356, + "grad_norm": 2.2095682621002197, + "learning_rate": 4.993514653268548e-05, + "loss": 6.8145, + "step": 3857 + }, + { + "epoch": 0.022944618898087354, + "grad_norm": 2.194451332092285, + "learning_rate": 4.9935112905193694e-05, + "loss": 6.4781, + "step": 3858 + }, + { + "epoch": 0.02295056618136835, + "grad_norm": 2.2242066860198975, + "learning_rate": 4.9935079268997306e-05, + "loss": 6.0535, + "step": 3859 + }, + { + "epoch": 0.022956513464649348, + "grad_norm": 2.336190938949585, + "learning_rate": 4.9935045624096354e-05, + "loss": 6.2453, + "step": 3860 + }, + { + "epoch": 0.022962460747930347, + "grad_norm": 1.9997279644012451, + "learning_rate": 4.9935011970490824e-05, + "loss": 6.3852, + "step": 3861 + }, + { + "epoch": 0.02296840803121134, + "grad_norm": 2.9107778072357178, + "learning_rate": 4.993497830818074e-05, + "loss": 6.0891, + "step": 3862 + }, + { + "epoch": 0.02297435531449234, + "grad_norm": 2.1357171535491943, + "learning_rate": 4.993494463716612e-05, + "loss": 6.5111, + "step": 3863 + }, + { + "epoch": 0.02298030259777334, + "grad_norm": 2.0228497982025146, + "learning_rate": 4.9934910957446954e-05, + "loss": 6.6009, + "step": 3864 + }, + { + "epoch": 0.022986249881054334, + "grad_norm": 2.8057942390441895, + "learning_rate": 4.993487726902328e-05, + "loss": 6.414, + "step": 3865 + }, + { + "epoch": 0.022992197164335332, + "grad_norm": 3.0660998821258545, + "learning_rate": 4.99348435718951e-05, + "loss": 6.3673, + "step": 3866 + }, + { + "epoch": 0.022998144447616328, + "grad_norm": 2.2440497875213623, + "learning_rate": 4.9934809866062416e-05, + "loss": 6.1793, + "step": 3867 + }, + { + "epoch": 0.023004091730897326, + "grad_norm": 2.342358350753784, + "learning_rate": 4.993477615152525e-05, + "loss": 6.5279, + "step": 3868 + }, + { + "epoch": 0.023010039014178325, + "grad_norm": 1.9231956005096436, + "learning_rate": 4.993474242828361e-05, + "loss": 6.4975, + "step": 3869 + }, + { + "epoch": 0.02301598629745932, + "grad_norm": 2.503028631210327, + "learning_rate": 4.9934708696337516e-05, + "loss": 6.5261, + "step": 3870 + }, + { + "epoch": 0.02302193358074032, + "grad_norm": 2.2343928813934326, + "learning_rate": 4.993467495568697e-05, + "loss": 6.0525, + "step": 3871 + }, + { + "epoch": 0.023027880864021313, + "grad_norm": 2.851964235305786, + "learning_rate": 4.993464120633198e-05, + "loss": 6.1271, + "step": 3872 + }, + { + "epoch": 0.023033828147302312, + "grad_norm": 2.580017328262329, + "learning_rate": 4.993460744827257e-05, + "loss": 6.2018, + "step": 3873 + }, + { + "epoch": 0.02303977543058331, + "grad_norm": 2.227879047393799, + "learning_rate": 4.9934573681508744e-05, + "loss": 6.0177, + "step": 3874 + }, + { + "epoch": 0.023045722713864306, + "grad_norm": 2.696531295776367, + "learning_rate": 4.993453990604051e-05, + "loss": 6.627, + "step": 3875 + }, + { + "epoch": 0.023051669997145304, + "grad_norm": 2.3439393043518066, + "learning_rate": 4.99345061218679e-05, + "loss": 6.5388, + "step": 3876 + }, + { + "epoch": 0.023057617280426303, + "grad_norm": 2.5400748252868652, + "learning_rate": 4.99344723289909e-05, + "loss": 5.9162, + "step": 3877 + }, + { + "epoch": 0.023063564563707298, + "grad_norm": 2.658193588256836, + "learning_rate": 4.9934438527409535e-05, + "loss": 5.6645, + "step": 3878 + }, + { + "epoch": 0.023069511846988296, + "grad_norm": 2.3102848529815674, + "learning_rate": 4.9934404717123814e-05, + "loss": 5.9969, + "step": 3879 + }, + { + "epoch": 0.02307545913026929, + "grad_norm": 2.6107916831970215, + "learning_rate": 4.993437089813376e-05, + "loss": 6.1776, + "step": 3880 + }, + { + "epoch": 0.02308140641355029, + "grad_norm": 2.6275434494018555, + "learning_rate": 4.993433707043937e-05, + "loss": 6.2563, + "step": 3881 + }, + { + "epoch": 0.02308735369683129, + "grad_norm": 2.8595218658447266, + "learning_rate": 4.993430323404066e-05, + "loss": 5.9371, + "step": 3882 + }, + { + "epoch": 0.023093300980112284, + "grad_norm": 2.2947659492492676, + "learning_rate": 4.993426938893764e-05, + "loss": 5.7263, + "step": 3883 + }, + { + "epoch": 0.023099248263393282, + "grad_norm": 3.3769729137420654, + "learning_rate": 4.9934235535130326e-05, + "loss": 6.2706, + "step": 3884 + }, + { + "epoch": 0.02310519554667428, + "grad_norm": 2.792043447494507, + "learning_rate": 4.9934201672618716e-05, + "loss": 5.9264, + "step": 3885 + }, + { + "epoch": 0.023111142829955276, + "grad_norm": 2.592167615890503, + "learning_rate": 4.993416780140285e-05, + "loss": 6.4031, + "step": 3886 + }, + { + "epoch": 0.023117090113236274, + "grad_norm": 2.429898977279663, + "learning_rate": 4.9934133921482716e-05, + "loss": 6.4609, + "step": 3887 + }, + { + "epoch": 0.02312303739651727, + "grad_norm": 2.1771554946899414, + "learning_rate": 4.993410003285834e-05, + "loss": 6.2873, + "step": 3888 + }, + { + "epoch": 0.023128984679798268, + "grad_norm": 2.7799339294433594, + "learning_rate": 4.9934066135529724e-05, + "loss": 5.7405, + "step": 3889 + }, + { + "epoch": 0.023134931963079267, + "grad_norm": 2.626492977142334, + "learning_rate": 4.993403222949688e-05, + "loss": 5.783, + "step": 3890 + }, + { + "epoch": 0.02314087924636026, + "grad_norm": 2.837663412094116, + "learning_rate": 4.993399831475982e-05, + "loss": 5.8039, + "step": 3891 + }, + { + "epoch": 0.02314682652964126, + "grad_norm": 2.68230938911438, + "learning_rate": 4.9933964391318564e-05, + "loss": 5.6587, + "step": 3892 + }, + { + "epoch": 0.02315277381292226, + "grad_norm": 3.2064061164855957, + "learning_rate": 4.993393045917312e-05, + "loss": 5.9516, + "step": 3893 + }, + { + "epoch": 0.023158721096203254, + "grad_norm": 3.5179402828216553, + "learning_rate": 4.99338965183235e-05, + "loss": 5.7925, + "step": 3894 + }, + { + "epoch": 0.023164668379484252, + "grad_norm": 2.9261434078216553, + "learning_rate": 4.993386256876971e-05, + "loss": 5.8677, + "step": 3895 + }, + { + "epoch": 0.023170615662765248, + "grad_norm": 3.092033624649048, + "learning_rate": 4.9933828610511766e-05, + "loss": 5.6248, + "step": 3896 + }, + { + "epoch": 0.023176562946046246, + "grad_norm": 2.7650182247161865, + "learning_rate": 4.9933794643549683e-05, + "loss": 5.7371, + "step": 3897 + }, + { + "epoch": 0.023182510229327245, + "grad_norm": 2.402839422225952, + "learning_rate": 4.993376066788347e-05, + "loss": 5.4802, + "step": 3898 + }, + { + "epoch": 0.02318845751260824, + "grad_norm": 2.606062889099121, + "learning_rate": 4.993372668351314e-05, + "loss": 5.5766, + "step": 3899 + }, + { + "epoch": 0.023194404795889238, + "grad_norm": 2.2177329063415527, + "learning_rate": 4.99336926904387e-05, + "loss": 5.5744, + "step": 3900 + }, + { + "epoch": 0.023200352079170233, + "grad_norm": 2.6953063011169434, + "learning_rate": 4.9933658688660166e-05, + "loss": 5.6414, + "step": 3901 + }, + { + "epoch": 0.023206299362451232, + "grad_norm": 2.90512752532959, + "learning_rate": 4.993362467817755e-05, + "loss": 5.5445, + "step": 3902 + }, + { + "epoch": 0.02321224664573223, + "grad_norm": 3.724168062210083, + "learning_rate": 4.993359065899086e-05, + "loss": 5.7733, + "step": 3903 + }, + { + "epoch": 0.023218193929013226, + "grad_norm": 2.9355592727661133, + "learning_rate": 4.993355663110012e-05, + "loss": 5.579, + "step": 3904 + }, + { + "epoch": 0.023224141212294224, + "grad_norm": 2.7822163105010986, + "learning_rate": 4.993352259450532e-05, + "loss": 5.5105, + "step": 3905 + }, + { + "epoch": 0.023230088495575223, + "grad_norm": 3.672539710998535, + "learning_rate": 4.99334885492065e-05, + "loss": 6.3865, + "step": 3906 + }, + { + "epoch": 0.023236035778856218, + "grad_norm": 2.26755952835083, + "learning_rate": 4.993345449520364e-05, + "loss": 5.5472, + "step": 3907 + }, + { + "epoch": 0.023241983062137216, + "grad_norm": 2.8935770988464355, + "learning_rate": 4.993342043249678e-05, + "loss": 5.5948, + "step": 3908 + }, + { + "epoch": 0.02324793034541821, + "grad_norm": 3.077798366546631, + "learning_rate": 4.9933386361085924e-05, + "loss": 5.288, + "step": 3909 + }, + { + "epoch": 0.02325387762869921, + "grad_norm": 2.479198694229126, + "learning_rate": 4.993335228097107e-05, + "loss": 5.3743, + "step": 3910 + }, + { + "epoch": 0.02325982491198021, + "grad_norm": 2.429049015045166, + "learning_rate": 4.9933318192152244e-05, + "loss": 5.6709, + "step": 3911 + }, + { + "epoch": 0.023265772195261204, + "grad_norm": 2.4515016078948975, + "learning_rate": 4.993328409462945e-05, + "loss": 5.4946, + "step": 3912 + }, + { + "epoch": 0.023271719478542202, + "grad_norm": 2.3859386444091797, + "learning_rate": 4.993324998840271e-05, + "loss": 5.5947, + "step": 3913 + }, + { + "epoch": 0.0232776667618232, + "grad_norm": 2.746438503265381, + "learning_rate": 4.993321587347203e-05, + "loss": 5.6743, + "step": 3914 + }, + { + "epoch": 0.023283614045104196, + "grad_norm": 2.416118621826172, + "learning_rate": 4.993318174983742e-05, + "loss": 5.7073, + "step": 3915 + }, + { + "epoch": 0.023289561328385194, + "grad_norm": 2.3427727222442627, + "learning_rate": 4.99331476174989e-05, + "loss": 5.5933, + "step": 3916 + }, + { + "epoch": 0.02329550861166619, + "grad_norm": 2.2179009914398193, + "learning_rate": 4.993311347645647e-05, + "loss": 5.7726, + "step": 3917 + }, + { + "epoch": 0.023301455894947188, + "grad_norm": 2.732923984527588, + "learning_rate": 4.993307932671014e-05, + "loss": 5.5783, + "step": 3918 + }, + { + "epoch": 0.023307403178228187, + "grad_norm": 2.5090553760528564, + "learning_rate": 4.993304516825994e-05, + "loss": 5.6598, + "step": 3919 + }, + { + "epoch": 0.02331335046150918, + "grad_norm": 2.690276622772217, + "learning_rate": 4.993301100110587e-05, + "loss": 5.9688, + "step": 3920 + }, + { + "epoch": 0.02331929774479018, + "grad_norm": 2.559215784072876, + "learning_rate": 4.993297682524794e-05, + "loss": 6.3315, + "step": 3921 + }, + { + "epoch": 0.02332524502807118, + "grad_norm": 2.2800240516662598, + "learning_rate": 4.993294264068617e-05, + "loss": 6.2787, + "step": 3922 + }, + { + "epoch": 0.023331192311352174, + "grad_norm": 2.478898525238037, + "learning_rate": 4.993290844742057e-05, + "loss": 6.1145, + "step": 3923 + }, + { + "epoch": 0.023337139594633172, + "grad_norm": 2.4902184009552, + "learning_rate": 4.993287424545115e-05, + "loss": 6.0665, + "step": 3924 + }, + { + "epoch": 0.023343086877914167, + "grad_norm": 2.4157116413116455, + "learning_rate": 4.9932840034777906e-05, + "loss": 6.1697, + "step": 3925 + }, + { + "epoch": 0.023349034161195166, + "grad_norm": 2.340575933456421, + "learning_rate": 4.993280581540087e-05, + "loss": 6.1121, + "step": 3926 + }, + { + "epoch": 0.023354981444476165, + "grad_norm": 2.586881160736084, + "learning_rate": 4.993277158732006e-05, + "loss": 6.1792, + "step": 3927 + }, + { + "epoch": 0.02336092872775716, + "grad_norm": 2.448880910873413, + "learning_rate": 4.9932737350535476e-05, + "loss": 6.084, + "step": 3928 + }, + { + "epoch": 0.023366876011038158, + "grad_norm": 2.525082588195801, + "learning_rate": 4.993270310504712e-05, + "loss": 5.6726, + "step": 3929 + }, + { + "epoch": 0.023372823294319153, + "grad_norm": 2.310445547103882, + "learning_rate": 4.993266885085503e-05, + "loss": 5.9496, + "step": 3930 + }, + { + "epoch": 0.023378770577600152, + "grad_norm": 2.275416612625122, + "learning_rate": 4.993263458795918e-05, + "loss": 6.0042, + "step": 3931 + }, + { + "epoch": 0.02338471786088115, + "grad_norm": 2.481973648071289, + "learning_rate": 4.993260031635963e-05, + "loss": 5.6177, + "step": 3932 + }, + { + "epoch": 0.023390665144162145, + "grad_norm": 2.439544677734375, + "learning_rate": 4.993256603605635e-05, + "loss": 5.9745, + "step": 3933 + }, + { + "epoch": 0.023396612427443144, + "grad_norm": 2.1909360885620117, + "learning_rate": 4.993253174704937e-05, + "loss": 5.9966, + "step": 3934 + }, + { + "epoch": 0.023402559710724143, + "grad_norm": 2.1893911361694336, + "learning_rate": 4.993249744933871e-05, + "loss": 6.0643, + "step": 3935 + }, + { + "epoch": 0.023408506994005138, + "grad_norm": 3.2023842334747314, + "learning_rate": 4.993246314292437e-05, + "loss": 6.2284, + "step": 3936 + }, + { + "epoch": 0.023414454277286136, + "grad_norm": 2.980842113494873, + "learning_rate": 4.9932428827806356e-05, + "loss": 6.2359, + "step": 3937 + }, + { + "epoch": 0.02342040156056713, + "grad_norm": 2.6659433841705322, + "learning_rate": 4.99323945039847e-05, + "loss": 6.2901, + "step": 3938 + }, + { + "epoch": 0.02342634884384813, + "grad_norm": 2.2173492908477783, + "learning_rate": 4.993236017145939e-05, + "loss": 5.8157, + "step": 3939 + }, + { + "epoch": 0.02343229612712913, + "grad_norm": 2.592771530151367, + "learning_rate": 4.993232583023046e-05, + "loss": 5.7747, + "step": 3940 + }, + { + "epoch": 0.023438243410410124, + "grad_norm": 2.328951835632324, + "learning_rate": 4.9932291480297915e-05, + "loss": 5.7367, + "step": 3941 + }, + { + "epoch": 0.023444190693691122, + "grad_norm": 2.3135616779327393, + "learning_rate": 4.993225712166176e-05, + "loss": 6.0592, + "step": 3942 + }, + { + "epoch": 0.02345013797697212, + "grad_norm": 2.49661922454834, + "learning_rate": 4.993222275432201e-05, + "loss": 5.9737, + "step": 3943 + }, + { + "epoch": 0.023456085260253116, + "grad_norm": 2.6462106704711914, + "learning_rate": 4.9932188378278683e-05, + "loss": 5.7053, + "step": 3944 + }, + { + "epoch": 0.023462032543534114, + "grad_norm": 2.102663516998291, + "learning_rate": 4.993215399353178e-05, + "loss": 5.9006, + "step": 3945 + }, + { + "epoch": 0.02346797982681511, + "grad_norm": 2.474500894546509, + "learning_rate": 4.9932119600081326e-05, + "loss": 6.092, + "step": 3946 + }, + { + "epoch": 0.023473927110096108, + "grad_norm": 2.6023428440093994, + "learning_rate": 4.993208519792732e-05, + "loss": 5.9045, + "step": 3947 + }, + { + "epoch": 0.023479874393377106, + "grad_norm": 2.76432466506958, + "learning_rate": 4.99320507870698e-05, + "loss": 5.8178, + "step": 3948 + }, + { + "epoch": 0.0234858216766581, + "grad_norm": 2.250816822052002, + "learning_rate": 4.993201636750874e-05, + "loss": 5.9091, + "step": 3949 + }, + { + "epoch": 0.0234917689599391, + "grad_norm": 2.1984071731567383, + "learning_rate": 4.993198193924417e-05, + "loss": 5.8804, + "step": 3950 + }, + { + "epoch": 0.0234977162432201, + "grad_norm": 2.5217959880828857, + "learning_rate": 4.993194750227611e-05, + "loss": 5.9879, + "step": 3951 + }, + { + "epoch": 0.023503663526501094, + "grad_norm": 2.080110788345337, + "learning_rate": 4.993191305660456e-05, + "loss": 5.6352, + "step": 3952 + }, + { + "epoch": 0.023509610809782092, + "grad_norm": 2.637500286102295, + "learning_rate": 4.9931878602229545e-05, + "loss": 5.7924, + "step": 3953 + }, + { + "epoch": 0.023515558093063087, + "grad_norm": 2.660531759262085, + "learning_rate": 4.9931844139151056e-05, + "loss": 6.1936, + "step": 3954 + }, + { + "epoch": 0.023521505376344086, + "grad_norm": 2.423699378967285, + "learning_rate": 4.993180966736913e-05, + "loss": 5.8974, + "step": 3955 + }, + { + "epoch": 0.023527452659625085, + "grad_norm": 2.581876277923584, + "learning_rate": 4.993177518688375e-05, + "loss": 5.833, + "step": 3956 + }, + { + "epoch": 0.02353339994290608, + "grad_norm": 2.586538076400757, + "learning_rate": 4.9931740697694965e-05, + "loss": 5.9649, + "step": 3957 + }, + { + "epoch": 0.023539347226187078, + "grad_norm": 2.5123441219329834, + "learning_rate": 4.993170619980276e-05, + "loss": 6.1251, + "step": 3958 + }, + { + "epoch": 0.023545294509468077, + "grad_norm": 3.076904535293579, + "learning_rate": 4.993167169320715e-05, + "loss": 5.9559, + "step": 3959 + }, + { + "epoch": 0.023551241792749072, + "grad_norm": 2.572312593460083, + "learning_rate": 4.9931637177908153e-05, + "loss": 6.0291, + "step": 3960 + }, + { + "epoch": 0.02355718907603007, + "grad_norm": 1.9910492897033691, + "learning_rate": 4.9931602653905776e-05, + "loss": 5.8413, + "step": 3961 + }, + { + "epoch": 0.023563136359311065, + "grad_norm": 2.530710458755493, + "learning_rate": 4.993156812120004e-05, + "loss": 6.1217, + "step": 3962 + }, + { + "epoch": 0.023569083642592064, + "grad_norm": 2.3089046478271484, + "learning_rate": 4.993153357979095e-05, + "loss": 5.822, + "step": 3963 + }, + { + "epoch": 0.023575030925873063, + "grad_norm": 2.8980624675750732, + "learning_rate": 4.993149902967852e-05, + "loss": 6.3906, + "step": 3964 + }, + { + "epoch": 0.023580978209154058, + "grad_norm": 2.2176012992858887, + "learning_rate": 4.993146447086275e-05, + "loss": 5.9259, + "step": 3965 + }, + { + "epoch": 0.023586925492435056, + "grad_norm": 2.01096773147583, + "learning_rate": 4.993142990334367e-05, + "loss": 6.3141, + "step": 3966 + }, + { + "epoch": 0.02359287277571605, + "grad_norm": 3.4096288681030273, + "learning_rate": 4.993139532712129e-05, + "loss": 6.3165, + "step": 3967 + }, + { + "epoch": 0.02359882005899705, + "grad_norm": 2.20595645904541, + "learning_rate": 4.9931360742195623e-05, + "loss": 6.016, + "step": 3968 + }, + { + "epoch": 0.02360476734227805, + "grad_norm": 3.543301820755005, + "learning_rate": 4.993132614856666e-05, + "loss": 5.722, + "step": 3969 + }, + { + "epoch": 0.023610714625559043, + "grad_norm": 2.82092547416687, + "learning_rate": 4.993129154623444e-05, + "loss": 5.8217, + "step": 3970 + }, + { + "epoch": 0.023616661908840042, + "grad_norm": 2.4585440158843994, + "learning_rate": 4.9931256935198954e-05, + "loss": 6.3298, + "step": 3971 + }, + { + "epoch": 0.02362260919212104, + "grad_norm": 2.104340076446533, + "learning_rate": 4.993122231546024e-05, + "loss": 5.9174, + "step": 3972 + }, + { + "epoch": 0.023628556475402036, + "grad_norm": 2.5130183696746826, + "learning_rate": 4.993118768701828e-05, + "loss": 6.3075, + "step": 3973 + }, + { + "epoch": 0.023634503758683034, + "grad_norm": 2.4567196369171143, + "learning_rate": 4.99311530498731e-05, + "loss": 6.0088, + "step": 3974 + }, + { + "epoch": 0.02364045104196403, + "grad_norm": 2.5174858570098877, + "learning_rate": 4.993111840402471e-05, + "loss": 6.6739, + "step": 3975 + }, + { + "epoch": 0.023646398325245028, + "grad_norm": 2.0032241344451904, + "learning_rate": 4.9931083749473136e-05, + "loss": 5.7052, + "step": 3976 + }, + { + "epoch": 0.023652345608526026, + "grad_norm": 2.9536757469177246, + "learning_rate": 4.993104908621837e-05, + "loss": 5.415, + "step": 3977 + }, + { + "epoch": 0.02365829289180702, + "grad_norm": 2.6650888919830322, + "learning_rate": 4.9931014414260435e-05, + "loss": 5.4333, + "step": 3978 + }, + { + "epoch": 0.02366424017508802, + "grad_norm": 2.3574490547180176, + "learning_rate": 4.9930979733599334e-05, + "loss": 5.5802, + "step": 3979 + }, + { + "epoch": 0.02367018745836902, + "grad_norm": 2.855534791946411, + "learning_rate": 4.99309450442351e-05, + "loss": 5.5131, + "step": 3980 + }, + { + "epoch": 0.023676134741650014, + "grad_norm": 2.430943727493286, + "learning_rate": 4.993091034616772e-05, + "loss": 6.2497, + "step": 3981 + }, + { + "epoch": 0.023682082024931012, + "grad_norm": 2.1671106815338135, + "learning_rate": 4.993087563939722e-05, + "loss": 5.9994, + "step": 3982 + }, + { + "epoch": 0.023688029308212007, + "grad_norm": 2.3268723487854004, + "learning_rate": 4.9930840923923606e-05, + "loss": 5.4779, + "step": 3983 + }, + { + "epoch": 0.023693976591493006, + "grad_norm": 2.3953616619110107, + "learning_rate": 4.993080619974689e-05, + "loss": 5.4044, + "step": 3984 + }, + { + "epoch": 0.023699923874774004, + "grad_norm": 2.043724775314331, + "learning_rate": 4.993077146686709e-05, + "loss": 5.6252, + "step": 3985 + }, + { + "epoch": 0.023705871158055, + "grad_norm": 2.5629520416259766, + "learning_rate": 4.9930736725284224e-05, + "loss": 5.1765, + "step": 3986 + }, + { + "epoch": 0.023711818441335998, + "grad_norm": 2.2148349285125732, + "learning_rate": 4.993070197499828e-05, + "loss": 5.5452, + "step": 3987 + }, + { + "epoch": 0.023717765724616997, + "grad_norm": 2.3913650512695312, + "learning_rate": 4.9930667216009295e-05, + "loss": 6.0882, + "step": 3988 + }, + { + "epoch": 0.02372371300789799, + "grad_norm": 2.619607925415039, + "learning_rate": 4.993063244831727e-05, + "loss": 6.4482, + "step": 3989 + }, + { + "epoch": 0.02372966029117899, + "grad_norm": 2.0585055351257324, + "learning_rate": 4.993059767192222e-05, + "loss": 6.0467, + "step": 3990 + }, + { + "epoch": 0.023735607574459985, + "grad_norm": 2.3380227088928223, + "learning_rate": 4.993056288682416e-05, + "loss": 5.9382, + "step": 3991 + }, + { + "epoch": 0.023741554857740984, + "grad_norm": 2.7252683639526367, + "learning_rate": 4.9930528093023085e-05, + "loss": 6.0444, + "step": 3992 + }, + { + "epoch": 0.023747502141021982, + "grad_norm": 2.333296060562134, + "learning_rate": 4.993049329051903e-05, + "loss": 5.6614, + "step": 3993 + }, + { + "epoch": 0.023753449424302978, + "grad_norm": 2.3571507930755615, + "learning_rate": 4.9930458479312e-05, + "loss": 6.328, + "step": 3994 + }, + { + "epoch": 0.023759396707583976, + "grad_norm": 2.7106499671936035, + "learning_rate": 4.9930423659402005e-05, + "loss": 6.0347, + "step": 3995 + }, + { + "epoch": 0.02376534399086497, + "grad_norm": 3.000009298324585, + "learning_rate": 4.9930388830789043e-05, + "loss": 5.5511, + "step": 3996 + }, + { + "epoch": 0.02377129127414597, + "grad_norm": 2.787912130355835, + "learning_rate": 4.993035399347316e-05, + "loss": 5.2059, + "step": 3997 + }, + { + "epoch": 0.02377723855742697, + "grad_norm": 2.7351326942443848, + "learning_rate": 4.993031914745433e-05, + "loss": 5.2997, + "step": 3998 + }, + { + "epoch": 0.023783185840707963, + "grad_norm": 2.770566701889038, + "learning_rate": 4.993028429273259e-05, + "loss": 5.8871, + "step": 3999 + }, + { + "epoch": 0.023789133123988962, + "grad_norm": 2.9528706073760986, + "learning_rate": 4.993024942930794e-05, + "loss": 5.8177, + "step": 4000 + }, + { + "epoch": 0.02379508040726996, + "grad_norm": 2.543329954147339, + "learning_rate": 4.993021455718041e-05, + "loss": 5.6446, + "step": 4001 + }, + { + "epoch": 0.023801027690550956, + "grad_norm": 2.7284936904907227, + "learning_rate": 4.993017967634999e-05, + "loss": 5.8404, + "step": 4002 + }, + { + "epoch": 0.023806974973831954, + "grad_norm": 2.752187728881836, + "learning_rate": 4.99301447868167e-05, + "loss": 5.6959, + "step": 4003 + }, + { + "epoch": 0.02381292225711295, + "grad_norm": 2.86651611328125, + "learning_rate": 4.993010988858056e-05, + "loss": 5.6329, + "step": 4004 + }, + { + "epoch": 0.023818869540393948, + "grad_norm": 3.9363176822662354, + "learning_rate": 4.9930074981641574e-05, + "loss": 5.31, + "step": 4005 + }, + { + "epoch": 0.023824816823674946, + "grad_norm": 3.41188907623291, + "learning_rate": 4.9930040065999764e-05, + "loss": 5.9905, + "step": 4006 + }, + { + "epoch": 0.02383076410695594, + "grad_norm": 3.4761459827423096, + "learning_rate": 4.9930005141655125e-05, + "loss": 6.0575, + "step": 4007 + }, + { + "epoch": 0.02383671139023694, + "grad_norm": 3.1562440395355225, + "learning_rate": 4.992997020860768e-05, + "loss": 5.9915, + "step": 4008 + }, + { + "epoch": 0.02384265867351794, + "grad_norm": 2.884049415588379, + "learning_rate": 4.992993526685744e-05, + "loss": 5.8051, + "step": 4009 + }, + { + "epoch": 0.023848605956798934, + "grad_norm": 3.3188138008117676, + "learning_rate": 4.992990031640442e-05, + "loss": 5.9637, + "step": 4010 + }, + { + "epoch": 0.023854553240079932, + "grad_norm": 3.2048282623291016, + "learning_rate": 4.992986535724862e-05, + "loss": 6.631, + "step": 4011 + }, + { + "epoch": 0.023860500523360927, + "grad_norm": 2.80204701423645, + "learning_rate": 4.992983038939008e-05, + "loss": 6.0063, + "step": 4012 + }, + { + "epoch": 0.023866447806641926, + "grad_norm": 2.993398427963257, + "learning_rate": 4.992979541282877e-05, + "loss": 5.9778, + "step": 4013 + }, + { + "epoch": 0.023872395089922924, + "grad_norm": 2.7519168853759766, + "learning_rate": 4.9929760427564744e-05, + "loss": 6.4272, + "step": 4014 + }, + { + "epoch": 0.02387834237320392, + "grad_norm": 2.9606168270111084, + "learning_rate": 4.992972543359799e-05, + "loss": 5.5372, + "step": 4015 + }, + { + "epoch": 0.023884289656484918, + "grad_norm": 2.1724514961242676, + "learning_rate": 4.992969043092853e-05, + "loss": 6.3115, + "step": 4016 + }, + { + "epoch": 0.023890236939765917, + "grad_norm": 2.1742191314697266, + "learning_rate": 4.9929655419556365e-05, + "loss": 6.5097, + "step": 4017 + }, + { + "epoch": 0.02389618422304691, + "grad_norm": 1.9729878902435303, + "learning_rate": 4.9929620399481526e-05, + "loss": 6.7061, + "step": 4018 + }, + { + "epoch": 0.02390213150632791, + "grad_norm": 2.6273725032806396, + "learning_rate": 4.9929585370704e-05, + "loss": 6.2838, + "step": 4019 + }, + { + "epoch": 0.023908078789608905, + "grad_norm": 2.5495283603668213, + "learning_rate": 4.9929550333223826e-05, + "loss": 6.1175, + "step": 4020 + }, + { + "epoch": 0.023914026072889904, + "grad_norm": 2.50193452835083, + "learning_rate": 4.9929515287041e-05, + "loss": 5.7689, + "step": 4021 + }, + { + "epoch": 0.023919973356170902, + "grad_norm": 2.402991771697998, + "learning_rate": 4.992948023215553e-05, + "loss": 6.4222, + "step": 4022 + }, + { + "epoch": 0.023925920639451898, + "grad_norm": 2.1722981929779053, + "learning_rate": 4.9929445168567444e-05, + "loss": 6.2335, + "step": 4023 + }, + { + "epoch": 0.023931867922732896, + "grad_norm": 1.6895688772201538, + "learning_rate": 4.992941009627675e-05, + "loss": 6.163, + "step": 4024 + }, + { + "epoch": 0.02393781520601389, + "grad_norm": 1.9944639205932617, + "learning_rate": 4.992937501528345e-05, + "loss": 6.2622, + "step": 4025 + }, + { + "epoch": 0.02394376248929489, + "grad_norm": 2.6157150268554688, + "learning_rate": 4.9929339925587565e-05, + "loss": 6.4582, + "step": 4026 + }, + { + "epoch": 0.023949709772575888, + "grad_norm": 2.021772623062134, + "learning_rate": 4.992930482718911e-05, + "loss": 6.2921, + "step": 4027 + }, + { + "epoch": 0.023955657055856883, + "grad_norm": 2.465402603149414, + "learning_rate": 4.992926972008808e-05, + "loss": 6.6426, + "step": 4028 + }, + { + "epoch": 0.023961604339137882, + "grad_norm": 2.337763547897339, + "learning_rate": 4.99292346042845e-05, + "loss": 6.4988, + "step": 4029 + }, + { + "epoch": 0.02396755162241888, + "grad_norm": 2.400064706802368, + "learning_rate": 4.9929199479778394e-05, + "loss": 6.6666, + "step": 4030 + }, + { + "epoch": 0.023973498905699876, + "grad_norm": 2.4205784797668457, + "learning_rate": 4.9929164346569756e-05, + "loss": 5.8805, + "step": 4031 + }, + { + "epoch": 0.023979446188980874, + "grad_norm": 2.312434673309326, + "learning_rate": 4.9929129204658605e-05, + "loss": 6.5161, + "step": 4032 + }, + { + "epoch": 0.02398539347226187, + "grad_norm": 2.02748966217041, + "learning_rate": 4.9929094054044944e-05, + "loss": 6.1272, + "step": 4033 + }, + { + "epoch": 0.023991340755542868, + "grad_norm": 2.280242443084717, + "learning_rate": 4.992905889472881e-05, + "loss": 5.7217, + "step": 4034 + }, + { + "epoch": 0.023997288038823866, + "grad_norm": 2.3911778926849365, + "learning_rate": 4.992902372671019e-05, + "loss": 5.7441, + "step": 4035 + }, + { + "epoch": 0.02400323532210486, + "grad_norm": 2.1767921447753906, + "learning_rate": 4.99289885499891e-05, + "loss": 5.7212, + "step": 4036 + }, + { + "epoch": 0.02400918260538586, + "grad_norm": 2.3067142963409424, + "learning_rate": 4.992895336456557e-05, + "loss": 5.6689, + "step": 4037 + }, + { + "epoch": 0.02401512988866686, + "grad_norm": 2.1564273834228516, + "learning_rate": 4.992891817043959e-05, + "loss": 6.1445, + "step": 4038 + }, + { + "epoch": 0.024021077171947854, + "grad_norm": 2.4852945804595947, + "learning_rate": 4.9928882967611184e-05, + "loss": 6.1883, + "step": 4039 + }, + { + "epoch": 0.024027024455228852, + "grad_norm": 2.9280812740325928, + "learning_rate": 4.992884775608036e-05, + "loss": 6.097, + "step": 4040 + }, + { + "epoch": 0.024032971738509847, + "grad_norm": 2.3219356536865234, + "learning_rate": 4.992881253584714e-05, + "loss": 6.3163, + "step": 4041 + }, + { + "epoch": 0.024038919021790846, + "grad_norm": 2.672386884689331, + "learning_rate": 4.9928777306911525e-05, + "loss": 5.9615, + "step": 4042 + }, + { + "epoch": 0.024044866305071844, + "grad_norm": 2.5886473655700684, + "learning_rate": 4.992874206927353e-05, + "loss": 6.0114, + "step": 4043 + }, + { + "epoch": 0.02405081358835284, + "grad_norm": 2.991230010986328, + "learning_rate": 4.992870682293318e-05, + "loss": 5.6805, + "step": 4044 + }, + { + "epoch": 0.024056760871633838, + "grad_norm": 2.3270034790039062, + "learning_rate": 4.9928671567890464e-05, + "loss": 5.7503, + "step": 4045 + }, + { + "epoch": 0.024062708154914837, + "grad_norm": 2.591627359390259, + "learning_rate": 4.99286363041454e-05, + "loss": 5.5707, + "step": 4046 + }, + { + "epoch": 0.02406865543819583, + "grad_norm": 2.1936891078948975, + "learning_rate": 4.992860103169802e-05, + "loss": 5.6503, + "step": 4047 + }, + { + "epoch": 0.02407460272147683, + "grad_norm": 2.2928214073181152, + "learning_rate": 4.992856575054832e-05, + "loss": 5.6067, + "step": 4048 + }, + { + "epoch": 0.024080550004757825, + "grad_norm": 2.4503591060638428, + "learning_rate": 4.992853046069632e-05, + "loss": 6.0067, + "step": 4049 + }, + { + "epoch": 0.024086497288038824, + "grad_norm": 2.84260630607605, + "learning_rate": 4.992849516214202e-05, + "loss": 6.4533, + "step": 4050 + }, + { + "epoch": 0.024092444571319822, + "grad_norm": 2.7172651290893555, + "learning_rate": 4.992845985488543e-05, + "loss": 6.4901, + "step": 4051 + }, + { + "epoch": 0.024098391854600817, + "grad_norm": 2.2101316452026367, + "learning_rate": 4.992842453892659e-05, + "loss": 6.3481, + "step": 4052 + }, + { + "epoch": 0.024104339137881816, + "grad_norm": 2.488199234008789, + "learning_rate": 4.992838921426549e-05, + "loss": 6.4893, + "step": 4053 + }, + { + "epoch": 0.02411028642116281, + "grad_norm": 2.3767058849334717, + "learning_rate": 4.992835388090215e-05, + "loss": 5.9828, + "step": 4054 + }, + { + "epoch": 0.02411623370444381, + "grad_norm": 2.3979814052581787, + "learning_rate": 4.992831853883657e-05, + "loss": 5.7607, + "step": 4055 + }, + { + "epoch": 0.024122180987724808, + "grad_norm": 2.766644239425659, + "learning_rate": 4.992828318806877e-05, + "loss": 5.523, + "step": 4056 + }, + { + "epoch": 0.024128128271005803, + "grad_norm": 3.3954427242279053, + "learning_rate": 4.9928247828598775e-05, + "loss": 6.1247, + "step": 4057 + }, + { + "epoch": 0.024134075554286802, + "grad_norm": 3.5597097873687744, + "learning_rate": 4.9928212460426585e-05, + "loss": 6.0877, + "step": 4058 + }, + { + "epoch": 0.0241400228375678, + "grad_norm": 2.8089418411254883, + "learning_rate": 4.992817708355221e-05, + "loss": 5.324, + "step": 4059 + }, + { + "epoch": 0.024145970120848795, + "grad_norm": 2.6756842136383057, + "learning_rate": 4.992814169797566e-05, + "loss": 5.5516, + "step": 4060 + }, + { + "epoch": 0.024151917404129794, + "grad_norm": 2.1218929290771484, + "learning_rate": 4.992810630369696e-05, + "loss": 6.102, + "step": 4061 + }, + { + "epoch": 0.02415786468741079, + "grad_norm": 2.7189652919769287, + "learning_rate": 4.992807090071611e-05, + "loss": 6.4258, + "step": 4062 + }, + { + "epoch": 0.024163811970691788, + "grad_norm": 2.4340744018554688, + "learning_rate": 4.992803548903313e-05, + "loss": 5.8059, + "step": 4063 + }, + { + "epoch": 0.024169759253972786, + "grad_norm": 2.46604323387146, + "learning_rate": 4.992800006864804e-05, + "loss": 5.8963, + "step": 4064 + }, + { + "epoch": 0.02417570653725378, + "grad_norm": 2.1969218254089355, + "learning_rate": 4.9927964639560835e-05, + "loss": 5.7835, + "step": 4065 + }, + { + "epoch": 0.02418165382053478, + "grad_norm": 2.4529223442077637, + "learning_rate": 4.9927929201771535e-05, + "loss": 6.3405, + "step": 4066 + }, + { + "epoch": 0.02418760110381578, + "grad_norm": 2.145331859588623, + "learning_rate": 4.992789375528015e-05, + "loss": 6.14, + "step": 4067 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 2.212646961212158, + "learning_rate": 4.99278583000867e-05, + "loss": 5.8793, + "step": 4068 + }, + { + "epoch": 0.024199495670377772, + "grad_norm": 2.3249876499176025, + "learning_rate": 4.992782283619118e-05, + "loss": 5.8702, + "step": 4069 + }, + { + "epoch": 0.024205442953658767, + "grad_norm": 2.180964946746826, + "learning_rate": 4.9927787363593634e-05, + "loss": 6.216, + "step": 4070 + }, + { + "epoch": 0.024211390236939766, + "grad_norm": 2.5633153915405273, + "learning_rate": 4.992775188229405e-05, + "loss": 6.031, + "step": 4071 + }, + { + "epoch": 0.024217337520220764, + "grad_norm": 2.867342233657837, + "learning_rate": 4.992771639229244e-05, + "loss": 5.9853, + "step": 4072 + }, + { + "epoch": 0.02422328480350176, + "grad_norm": 2.111253023147583, + "learning_rate": 4.992768089358882e-05, + "loss": 5.8404, + "step": 4073 + }, + { + "epoch": 0.024229232086782758, + "grad_norm": 1.9325549602508545, + "learning_rate": 4.992764538618321e-05, + "loss": 6.0175, + "step": 4074 + }, + { + "epoch": 0.024235179370063756, + "grad_norm": 2.721740484237671, + "learning_rate": 4.992760987007561e-05, + "loss": 5.9274, + "step": 4075 + }, + { + "epoch": 0.02424112665334475, + "grad_norm": 3.5240588188171387, + "learning_rate": 4.992757434526604e-05, + "loss": 5.3593, + "step": 4076 + }, + { + "epoch": 0.02424707393662575, + "grad_norm": 2.744248867034912, + "learning_rate": 4.9927538811754516e-05, + "loss": 5.8938, + "step": 4077 + }, + { + "epoch": 0.024253021219906745, + "grad_norm": 2.545384645462036, + "learning_rate": 4.992750326954104e-05, + "loss": 6.2127, + "step": 4078 + }, + { + "epoch": 0.024258968503187744, + "grad_norm": 2.7550806999206543, + "learning_rate": 4.992746771862563e-05, + "loss": 6.0784, + "step": 4079 + }, + { + "epoch": 0.024264915786468742, + "grad_norm": 2.408040761947632, + "learning_rate": 4.9927432159008305e-05, + "loss": 5.5908, + "step": 4080 + }, + { + "epoch": 0.024270863069749737, + "grad_norm": 2.581378698348999, + "learning_rate": 4.9927396590689066e-05, + "loss": 5.4438, + "step": 4081 + }, + { + "epoch": 0.024276810353030736, + "grad_norm": 2.4320218563079834, + "learning_rate": 4.992736101366794e-05, + "loss": 5.6239, + "step": 4082 + }, + { + "epoch": 0.024282757636311735, + "grad_norm": 2.4725472927093506, + "learning_rate": 4.992732542794492e-05, + "loss": 6.237, + "step": 4083 + }, + { + "epoch": 0.02428870491959273, + "grad_norm": 2.3081839084625244, + "learning_rate": 4.992728983352003e-05, + "loss": 5.9917, + "step": 4084 + }, + { + "epoch": 0.024294652202873728, + "grad_norm": 1.9090701341629028, + "learning_rate": 4.9927254230393287e-05, + "loss": 5.9125, + "step": 4085 + }, + { + "epoch": 0.024300599486154723, + "grad_norm": 2.3943240642547607, + "learning_rate": 4.992721861856468e-05, + "loss": 5.3431, + "step": 4086 + }, + { + "epoch": 0.024306546769435722, + "grad_norm": 2.226968765258789, + "learning_rate": 4.992718299803425e-05, + "loss": 5.4328, + "step": 4087 + }, + { + "epoch": 0.02431249405271672, + "grad_norm": 2.238218307495117, + "learning_rate": 4.9927147368801994e-05, + "loss": 5.4877, + "step": 4088 + }, + { + "epoch": 0.024318441335997715, + "grad_norm": 2.216540575027466, + "learning_rate": 4.992711173086794e-05, + "loss": 5.4037, + "step": 4089 + }, + { + "epoch": 0.024324388619278714, + "grad_norm": 2.3136301040649414, + "learning_rate": 4.992707608423208e-05, + "loss": 5.4576, + "step": 4090 + }, + { + "epoch": 0.02433033590255971, + "grad_norm": 2.0434980392456055, + "learning_rate": 4.9927040428894436e-05, + "loss": 5.8044, + "step": 4091 + }, + { + "epoch": 0.024336283185840708, + "grad_norm": 2.7837064266204834, + "learning_rate": 4.992700476485502e-05, + "loss": 6.4183, + "step": 4092 + }, + { + "epoch": 0.024342230469121706, + "grad_norm": 2.580411195755005, + "learning_rate": 4.992696909211384e-05, + "loss": 5.4545, + "step": 4093 + }, + { + "epoch": 0.0243481777524027, + "grad_norm": 2.1215696334838867, + "learning_rate": 4.9926933410670916e-05, + "loss": 5.5629, + "step": 4094 + }, + { + "epoch": 0.0243541250356837, + "grad_norm": 1.9621074199676514, + "learning_rate": 4.992689772052626e-05, + "loss": 5.5248, + "step": 4095 + }, + { + "epoch": 0.0243600723189647, + "grad_norm": 2.1773006916046143, + "learning_rate": 4.992686202167988e-05, + "loss": 5.3285, + "step": 4096 + }, + { + "epoch": 0.024366019602245693, + "grad_norm": 1.9506359100341797, + "learning_rate": 4.992682631413179e-05, + "loss": 5.7989, + "step": 4097 + }, + { + "epoch": 0.024371966885526692, + "grad_norm": 1.9154741764068604, + "learning_rate": 4.9926790597882e-05, + "loss": 5.6029, + "step": 4098 + }, + { + "epoch": 0.024377914168807687, + "grad_norm": 2.2147481441497803, + "learning_rate": 4.9926754872930524e-05, + "loss": 5.5406, + "step": 4099 + }, + { + "epoch": 0.024383861452088686, + "grad_norm": 2.1268460750579834, + "learning_rate": 4.992671913927738e-05, + "loss": 5.6434, + "step": 4100 + }, + { + "epoch": 0.024389808735369684, + "grad_norm": 2.1212456226348877, + "learning_rate": 4.992668339692258e-05, + "loss": 5.6888, + "step": 4101 + }, + { + "epoch": 0.02439575601865068, + "grad_norm": 2.2292001247406006, + "learning_rate": 4.992664764586612e-05, + "loss": 5.3982, + "step": 4102 + }, + { + "epoch": 0.024401703301931678, + "grad_norm": 2.2713210582733154, + "learning_rate": 4.9926611886108035e-05, + "loss": 5.3521, + "step": 4103 + }, + { + "epoch": 0.024407650585212676, + "grad_norm": 2.273437738418579, + "learning_rate": 4.9926576117648314e-05, + "loss": 5.474, + "step": 4104 + }, + { + "epoch": 0.02441359786849367, + "grad_norm": 2.2879083156585693, + "learning_rate": 4.9926540340487e-05, + "loss": 5.4474, + "step": 4105 + }, + { + "epoch": 0.02441954515177467, + "grad_norm": 2.2517430782318115, + "learning_rate": 4.992650455462408e-05, + "loss": 5.5013, + "step": 4106 + }, + { + "epoch": 0.024425492435055665, + "grad_norm": 2.1391677856445312, + "learning_rate": 4.992646876005957e-05, + "loss": 5.3899, + "step": 4107 + }, + { + "epoch": 0.024431439718336664, + "grad_norm": 2.2989962100982666, + "learning_rate": 4.9926432956793494e-05, + "loss": 5.7995, + "step": 4108 + }, + { + "epoch": 0.024437387001617662, + "grad_norm": 2.550706386566162, + "learning_rate": 4.992639714482586e-05, + "loss": 5.6599, + "step": 4109 + }, + { + "epoch": 0.024443334284898657, + "grad_norm": 2.321398973464966, + "learning_rate": 4.992636132415667e-05, + "loss": 5.6852, + "step": 4110 + }, + { + "epoch": 0.024449281568179656, + "grad_norm": 2.300795555114746, + "learning_rate": 4.992632549478595e-05, + "loss": 5.7318, + "step": 4111 + }, + { + "epoch": 0.024455228851460654, + "grad_norm": 2.229156970977783, + "learning_rate": 4.992628965671371e-05, + "loss": 5.6617, + "step": 4112 + }, + { + "epoch": 0.02446117613474165, + "grad_norm": 2.253934144973755, + "learning_rate": 4.992625380993995e-05, + "loss": 5.5762, + "step": 4113 + }, + { + "epoch": 0.024467123418022648, + "grad_norm": 2.0932998657226562, + "learning_rate": 4.992621795446471e-05, + "loss": 5.568, + "step": 4114 + }, + { + "epoch": 0.024473070701303643, + "grad_norm": 2.5969886779785156, + "learning_rate": 4.9926182090287966e-05, + "loss": 5.6626, + "step": 4115 + }, + { + "epoch": 0.02447901798458464, + "grad_norm": 2.5260698795318604, + "learning_rate": 4.992614621740976e-05, + "loss": 5.6333, + "step": 4116 + }, + { + "epoch": 0.02448496526786564, + "grad_norm": 2.0017902851104736, + "learning_rate": 4.992611033583009e-05, + "loss": 5.793, + "step": 4117 + }, + { + "epoch": 0.024490912551146635, + "grad_norm": 2.1847705841064453, + "learning_rate": 4.992607444554898e-05, + "loss": 5.8348, + "step": 4118 + }, + { + "epoch": 0.024496859834427634, + "grad_norm": 2.141007900238037, + "learning_rate": 4.992603854656642e-05, + "loss": 5.7835, + "step": 4119 + }, + { + "epoch": 0.02450280711770863, + "grad_norm": 2.294605255126953, + "learning_rate": 4.992600263888245e-05, + "loss": 5.6615, + "step": 4120 + }, + { + "epoch": 0.024508754400989628, + "grad_norm": 2.433936357498169, + "learning_rate": 4.9925966722497064e-05, + "loss": 5.6479, + "step": 4121 + }, + { + "epoch": 0.024514701684270626, + "grad_norm": 2.1522979736328125, + "learning_rate": 4.992593079741028e-05, + "loss": 5.5761, + "step": 4122 + }, + { + "epoch": 0.02452064896755162, + "grad_norm": 2.141065835952759, + "learning_rate": 4.9925894863622114e-05, + "loss": 5.602, + "step": 4123 + }, + { + "epoch": 0.02452659625083262, + "grad_norm": 2.187838554382324, + "learning_rate": 4.9925858921132576e-05, + "loss": 5.6337, + "step": 4124 + }, + { + "epoch": 0.02453254353411362, + "grad_norm": 2.303027629852295, + "learning_rate": 4.992582296994167e-05, + "loss": 5.6126, + "step": 4125 + }, + { + "epoch": 0.024538490817394613, + "grad_norm": 1.9233589172363281, + "learning_rate": 4.992578701004943e-05, + "loss": 5.5852, + "step": 4126 + }, + { + "epoch": 0.024544438100675612, + "grad_norm": 2.0383386611938477, + "learning_rate": 4.992575104145585e-05, + "loss": 5.6477, + "step": 4127 + }, + { + "epoch": 0.024550385383956607, + "grad_norm": 2.2752933502197266, + "learning_rate": 4.9925715064160946e-05, + "loss": 5.6263, + "step": 4128 + }, + { + "epoch": 0.024556332667237606, + "grad_norm": 2.400083541870117, + "learning_rate": 4.9925679078164734e-05, + "loss": 5.5249, + "step": 4129 + }, + { + "epoch": 0.024562279950518604, + "grad_norm": 2.167365312576294, + "learning_rate": 4.992564308346722e-05, + "loss": 5.7299, + "step": 4130 + }, + { + "epoch": 0.0245682272337996, + "grad_norm": 1.9696096181869507, + "learning_rate": 4.9925607080068426e-05, + "loss": 5.7961, + "step": 4131 + }, + { + "epoch": 0.024574174517080598, + "grad_norm": 2.1817007064819336, + "learning_rate": 4.992557106796836e-05, + "loss": 5.7973, + "step": 4132 + }, + { + "epoch": 0.024580121800361596, + "grad_norm": 2.4329075813293457, + "learning_rate": 4.992553504716704e-05, + "loss": 6.2428, + "step": 4133 + }, + { + "epoch": 0.02458606908364259, + "grad_norm": 2.159193754196167, + "learning_rate": 4.9925499017664464e-05, + "loss": 5.5784, + "step": 4134 + }, + { + "epoch": 0.02459201636692359, + "grad_norm": 2.2614853382110596, + "learning_rate": 4.992546297946066e-05, + "loss": 5.7572, + "step": 4135 + }, + { + "epoch": 0.024597963650204585, + "grad_norm": 2.2874412536621094, + "learning_rate": 4.992542693255563e-05, + "loss": 5.5726, + "step": 4136 + }, + { + "epoch": 0.024603910933485584, + "grad_norm": 2.1634466648101807, + "learning_rate": 4.992539087694939e-05, + "loss": 5.5112, + "step": 4137 + }, + { + "epoch": 0.024609858216766582, + "grad_norm": 2.195528507232666, + "learning_rate": 4.9925354812641955e-05, + "loss": 5.6073, + "step": 4138 + }, + { + "epoch": 0.024615805500047577, + "grad_norm": 2.0328054428100586, + "learning_rate": 4.992531873963334e-05, + "loss": 5.5686, + "step": 4139 + }, + { + "epoch": 0.024621752783328576, + "grad_norm": 2.244218349456787, + "learning_rate": 4.992528265792355e-05, + "loss": 5.6871, + "step": 4140 + }, + { + "epoch": 0.024627700066609574, + "grad_norm": 2.081721544265747, + "learning_rate": 4.992524656751261e-05, + "loss": 5.5327, + "step": 4141 + }, + { + "epoch": 0.02463364734989057, + "grad_norm": 1.9305940866470337, + "learning_rate": 4.992521046840051e-05, + "loss": 5.5265, + "step": 4142 + }, + { + "epoch": 0.024639594633171568, + "grad_norm": 2.624286651611328, + "learning_rate": 4.992517436058728e-05, + "loss": 5.3881, + "step": 4143 + }, + { + "epoch": 0.024645541916452563, + "grad_norm": 2.204803705215454, + "learning_rate": 4.9925138244072935e-05, + "loss": 5.6686, + "step": 4144 + }, + { + "epoch": 0.02465148919973356, + "grad_norm": 2.4664852619171143, + "learning_rate": 4.992510211885748e-05, + "loss": 5.3152, + "step": 4145 + }, + { + "epoch": 0.02465743648301456, + "grad_norm": 2.3428542613983154, + "learning_rate": 4.992506598494093e-05, + "loss": 5.5875, + "step": 4146 + }, + { + "epoch": 0.024663383766295555, + "grad_norm": 2.1902847290039062, + "learning_rate": 4.992502984232329e-05, + "loss": 5.4826, + "step": 4147 + }, + { + "epoch": 0.024669331049576554, + "grad_norm": 2.0401039123535156, + "learning_rate": 4.992499369100459e-05, + "loss": 5.518, + "step": 4148 + }, + { + "epoch": 0.02467527833285755, + "grad_norm": 2.5250306129455566, + "learning_rate": 4.9924957530984825e-05, + "loss": 5.5744, + "step": 4149 + }, + { + "epoch": 0.024681225616138548, + "grad_norm": 1.9975959062576294, + "learning_rate": 4.9924921362264016e-05, + "loss": 5.6834, + "step": 4150 + }, + { + "epoch": 0.024687172899419546, + "grad_norm": 2.047011375427246, + "learning_rate": 4.992488518484217e-05, + "loss": 5.6703, + "step": 4151 + }, + { + "epoch": 0.02469312018270054, + "grad_norm": 2.142411470413208, + "learning_rate": 4.9924848998719314e-05, + "loss": 5.781, + "step": 4152 + }, + { + "epoch": 0.02469906746598154, + "grad_norm": 2.1012768745422363, + "learning_rate": 4.992481280389545e-05, + "loss": 5.618, + "step": 4153 + }, + { + "epoch": 0.024705014749262538, + "grad_norm": 2.4698173999786377, + "learning_rate": 4.9924776600370584e-05, + "loss": 6.4773, + "step": 4154 + }, + { + "epoch": 0.024710962032543533, + "grad_norm": 2.4975368976593018, + "learning_rate": 4.992474038814474e-05, + "loss": 5.2568, + "step": 4155 + }, + { + "epoch": 0.024716909315824532, + "grad_norm": 1.8329259157180786, + "learning_rate": 4.992470416721793e-05, + "loss": 5.775, + "step": 4156 + }, + { + "epoch": 0.024722856599105527, + "grad_norm": 1.9757754802703857, + "learning_rate": 4.992466793759015e-05, + "loss": 5.5408, + "step": 4157 + }, + { + "epoch": 0.024728803882386526, + "grad_norm": 1.8300005197525024, + "learning_rate": 4.9924631699261434e-05, + "loss": 5.5356, + "step": 4158 + }, + { + "epoch": 0.024734751165667524, + "grad_norm": 2.099102735519409, + "learning_rate": 4.992459545223179e-05, + "loss": 5.6811, + "step": 4159 + }, + { + "epoch": 0.02474069844894852, + "grad_norm": 2.000169277191162, + "learning_rate": 4.992455919650123e-05, + "loss": 5.511, + "step": 4160 + }, + { + "epoch": 0.024746645732229518, + "grad_norm": 2.0555150508880615, + "learning_rate": 4.992452293206976e-05, + "loss": 5.7553, + "step": 4161 + }, + { + "epoch": 0.024752593015510516, + "grad_norm": 2.0416486263275146, + "learning_rate": 4.99244866589374e-05, + "loss": 5.6965, + "step": 4162 + }, + { + "epoch": 0.02475854029879151, + "grad_norm": 2.0028059482574463, + "learning_rate": 4.9924450377104146e-05, + "loss": 5.7211, + "step": 4163 + }, + { + "epoch": 0.02476448758207251, + "grad_norm": 2.22377872467041, + "learning_rate": 4.992441408657004e-05, + "loss": 5.6384, + "step": 4164 + }, + { + "epoch": 0.024770434865353505, + "grad_norm": 2.038804531097412, + "learning_rate": 4.9924377787335064e-05, + "loss": 5.6351, + "step": 4165 + }, + { + "epoch": 0.024776382148634504, + "grad_norm": 2.357773542404175, + "learning_rate": 4.992434147939925e-05, + "loss": 5.2791, + "step": 4166 + }, + { + "epoch": 0.024782329431915502, + "grad_norm": 2.1949357986450195, + "learning_rate": 4.992430516276261e-05, + "loss": 5.7389, + "step": 4167 + }, + { + "epoch": 0.024788276715196497, + "grad_norm": 2.1015608310699463, + "learning_rate": 4.992426883742516e-05, + "loss": 5.632, + "step": 4168 + }, + { + "epoch": 0.024794223998477496, + "grad_norm": 2.166201591491699, + "learning_rate": 4.992423250338689e-05, + "loss": 5.5701, + "step": 4169 + }, + { + "epoch": 0.024800171281758494, + "grad_norm": 2.0805492401123047, + "learning_rate": 4.9924196160647836e-05, + "loss": 5.5955, + "step": 4170 + }, + { + "epoch": 0.02480611856503949, + "grad_norm": 1.803229570388794, + "learning_rate": 4.9924159809208e-05, + "loss": 5.6267, + "step": 4171 + }, + { + "epoch": 0.024812065848320488, + "grad_norm": 2.008639335632324, + "learning_rate": 4.9924123449067393e-05, + "loss": 5.6667, + "step": 4172 + }, + { + "epoch": 0.024818013131601483, + "grad_norm": 1.9843655824661255, + "learning_rate": 4.9924087080226044e-05, + "loss": 5.5981, + "step": 4173 + }, + { + "epoch": 0.02482396041488248, + "grad_norm": 2.10270357131958, + "learning_rate": 4.9924050702683946e-05, + "loss": 5.5293, + "step": 4174 + }, + { + "epoch": 0.02482990769816348, + "grad_norm": 2.315976142883301, + "learning_rate": 4.992401431644112e-05, + "loss": 5.6046, + "step": 4175 + }, + { + "epoch": 0.024835854981444475, + "grad_norm": 2.168473482131958, + "learning_rate": 4.992397792149758e-05, + "loss": 5.4271, + "step": 4176 + }, + { + "epoch": 0.024841802264725474, + "grad_norm": 2.1870200634002686, + "learning_rate": 4.9923941517853335e-05, + "loss": 5.6399, + "step": 4177 + }, + { + "epoch": 0.024847749548006472, + "grad_norm": 2.2944717407226562, + "learning_rate": 4.9923905105508394e-05, + "loss": 5.4483, + "step": 4178 + }, + { + "epoch": 0.024853696831287467, + "grad_norm": 2.1662731170654297, + "learning_rate": 4.9923868684462785e-05, + "loss": 5.6773, + "step": 4179 + }, + { + "epoch": 0.024859644114568466, + "grad_norm": 1.7448937892913818, + "learning_rate": 4.992383225471651e-05, + "loss": 5.6097, + "step": 4180 + }, + { + "epoch": 0.02486559139784946, + "grad_norm": 2.3577585220336914, + "learning_rate": 4.9923795816269576e-05, + "loss": 5.5003, + "step": 4181 + }, + { + "epoch": 0.02487153868113046, + "grad_norm": 2.4175360202789307, + "learning_rate": 4.9923759369122e-05, + "loss": 5.4925, + "step": 4182 + }, + { + "epoch": 0.024877485964411458, + "grad_norm": 2.199329137802124, + "learning_rate": 4.992372291327381e-05, + "loss": 5.6239, + "step": 4183 + }, + { + "epoch": 0.024883433247692453, + "grad_norm": 2.054450511932373, + "learning_rate": 4.9923686448724994e-05, + "loss": 5.59, + "step": 4184 + }, + { + "epoch": 0.024889380530973452, + "grad_norm": 2.0354533195495605, + "learning_rate": 4.9923649975475585e-05, + "loss": 5.6092, + "step": 4185 + }, + { + "epoch": 0.024895327814254447, + "grad_norm": 2.0409371852874756, + "learning_rate": 4.9923613493525576e-05, + "loss": 5.5009, + "step": 4186 + }, + { + "epoch": 0.024901275097535445, + "grad_norm": 2.3314719200134277, + "learning_rate": 4.992357700287501e-05, + "loss": 5.5077, + "step": 4187 + }, + { + "epoch": 0.024907222380816444, + "grad_norm": 2.050706386566162, + "learning_rate": 4.9923540503523865e-05, + "loss": 5.5857, + "step": 4188 + }, + { + "epoch": 0.02491316966409744, + "grad_norm": 2.3477721214294434, + "learning_rate": 4.992350399547218e-05, + "loss": 5.5119, + "step": 4189 + }, + { + "epoch": 0.024919116947378438, + "grad_norm": 2.365171194076538, + "learning_rate": 4.992346747871994e-05, + "loss": 5.583, + "step": 4190 + }, + { + "epoch": 0.024925064230659436, + "grad_norm": 1.9642738103866577, + "learning_rate": 4.992343095326719e-05, + "loss": 5.3527, + "step": 4191 + }, + { + "epoch": 0.02493101151394043, + "grad_norm": 2.25437593460083, + "learning_rate": 4.992339441911392e-05, + "loss": 5.4751, + "step": 4192 + }, + { + "epoch": 0.02493695879722143, + "grad_norm": 2.0476715564727783, + "learning_rate": 4.992335787626016e-05, + "loss": 5.5808, + "step": 4193 + }, + { + "epoch": 0.024942906080502425, + "grad_norm": 2.248382329940796, + "learning_rate": 4.992332132470591e-05, + "loss": 5.5771, + "step": 4194 + }, + { + "epoch": 0.024948853363783424, + "grad_norm": 2.279232978820801, + "learning_rate": 4.992328476445118e-05, + "loss": 5.3803, + "step": 4195 + }, + { + "epoch": 0.024954800647064422, + "grad_norm": 2.0171918869018555, + "learning_rate": 4.992324819549599e-05, + "loss": 5.662, + "step": 4196 + }, + { + "epoch": 0.024960747930345417, + "grad_norm": 2.14736008644104, + "learning_rate": 4.992321161784036e-05, + "loss": 5.6422, + "step": 4197 + }, + { + "epoch": 0.024966695213626416, + "grad_norm": 2.1694438457489014, + "learning_rate": 4.9923175031484284e-05, + "loss": 5.4377, + "step": 4198 + }, + { + "epoch": 0.024972642496907414, + "grad_norm": 1.9280356168746948, + "learning_rate": 4.9923138436427784e-05, + "loss": 5.5499, + "step": 4199 + }, + { + "epoch": 0.02497858978018841, + "grad_norm": 2.185974359512329, + "learning_rate": 4.992310183267088e-05, + "loss": 5.6404, + "step": 4200 + }, + { + "epoch": 0.024984537063469408, + "grad_norm": 2.102681875228882, + "learning_rate": 4.9923065220213585e-05, + "loss": 5.5888, + "step": 4201 + }, + { + "epoch": 0.024990484346750403, + "grad_norm": 2.07100772857666, + "learning_rate": 4.99230285990559e-05, + "loss": 5.6473, + "step": 4202 + }, + { + "epoch": 0.0249964316300314, + "grad_norm": 2.088634967803955, + "learning_rate": 4.992299196919784e-05, + "loss": 5.4993, + "step": 4203 + }, + { + "epoch": 0.0250023789133124, + "grad_norm": 2.2086873054504395, + "learning_rate": 4.992295533063942e-05, + "loss": 5.5797, + "step": 4204 + }, + { + "epoch": 0.025008326196593395, + "grad_norm": 2.250753164291382, + "learning_rate": 4.992291868338066e-05, + "loss": 5.5666, + "step": 4205 + }, + { + "epoch": 0.025014273479874394, + "grad_norm": 2.132636785507202, + "learning_rate": 4.992288202742156e-05, + "loss": 5.6715, + "step": 4206 + }, + { + "epoch": 0.025020220763155392, + "grad_norm": 2.8332200050354004, + "learning_rate": 4.992284536276214e-05, + "loss": 4.9687, + "step": 4207 + }, + { + "epoch": 0.025026168046436387, + "grad_norm": 2.345991849899292, + "learning_rate": 4.992280868940241e-05, + "loss": 5.2181, + "step": 4208 + }, + { + "epoch": 0.025032115329717386, + "grad_norm": 2.149568557739258, + "learning_rate": 4.992277200734239e-05, + "loss": 5.5336, + "step": 4209 + }, + { + "epoch": 0.02503806261299838, + "grad_norm": 2.031353235244751, + "learning_rate": 4.992273531658209e-05, + "loss": 5.5779, + "step": 4210 + }, + { + "epoch": 0.02504400989627938, + "grad_norm": 2.217374086380005, + "learning_rate": 4.9922698617121524e-05, + "loss": 5.782, + "step": 4211 + }, + { + "epoch": 0.025049957179560378, + "grad_norm": 2.3629000186920166, + "learning_rate": 4.992266190896069e-05, + "loss": 5.7916, + "step": 4212 + }, + { + "epoch": 0.025055904462841373, + "grad_norm": 2.2439091205596924, + "learning_rate": 4.9922625192099616e-05, + "loss": 5.8002, + "step": 4213 + }, + { + "epoch": 0.025061851746122372, + "grad_norm": 2.1707634925842285, + "learning_rate": 4.992258846653831e-05, + "loss": 6.5789, + "step": 4214 + }, + { + "epoch": 0.025067799029403367, + "grad_norm": 3.1655468940734863, + "learning_rate": 4.992255173227679e-05, + "loss": 6.3867, + "step": 4215 + }, + { + "epoch": 0.025073746312684365, + "grad_norm": 3.1309874057769775, + "learning_rate": 4.992251498931506e-05, + "loss": 6.2682, + "step": 4216 + }, + { + "epoch": 0.025079693595965364, + "grad_norm": 3.2077460289001465, + "learning_rate": 4.992247823765315e-05, + "loss": 5.8593, + "step": 4217 + }, + { + "epoch": 0.02508564087924636, + "grad_norm": 2.2944962978363037, + "learning_rate": 4.992244147729105e-05, + "loss": 5.7994, + "step": 4218 + }, + { + "epoch": 0.025091588162527358, + "grad_norm": 2.2380926609039307, + "learning_rate": 4.9922404708228776e-05, + "loss": 5.7606, + "step": 4219 + }, + { + "epoch": 0.025097535445808356, + "grad_norm": 2.601795196533203, + "learning_rate": 4.992236793046636e-05, + "loss": 5.7585, + "step": 4220 + }, + { + "epoch": 0.02510348272908935, + "grad_norm": 2.494765520095825, + "learning_rate": 4.99223311440038e-05, + "loss": 5.8102, + "step": 4221 + }, + { + "epoch": 0.02510943001237035, + "grad_norm": 2.4690544605255127, + "learning_rate": 4.992229434884111e-05, + "loss": 5.8682, + "step": 4222 + }, + { + "epoch": 0.025115377295651345, + "grad_norm": 2.1011085510253906, + "learning_rate": 4.99222575449783e-05, + "loss": 5.6982, + "step": 4223 + }, + { + "epoch": 0.025121324578932343, + "grad_norm": 2.2298128604888916, + "learning_rate": 4.992222073241539e-05, + "loss": 5.7606, + "step": 4224 + }, + { + "epoch": 0.025127271862213342, + "grad_norm": 1.93464994430542, + "learning_rate": 4.99221839111524e-05, + "loss": 5.7097, + "step": 4225 + }, + { + "epoch": 0.025133219145494337, + "grad_norm": 2.15191650390625, + "learning_rate": 4.9922147081189324e-05, + "loss": 5.5852, + "step": 4226 + }, + { + "epoch": 0.025139166428775336, + "grad_norm": 2.086954355239868, + "learning_rate": 4.992211024252619e-05, + "loss": 5.5871, + "step": 4227 + }, + { + "epoch": 0.025145113712056334, + "grad_norm": 2.212296724319458, + "learning_rate": 4.9922073395162995e-05, + "loss": 5.562, + "step": 4228 + }, + { + "epoch": 0.02515106099533733, + "grad_norm": 2.0786778926849365, + "learning_rate": 4.992203653909977e-05, + "loss": 5.6599, + "step": 4229 + }, + { + "epoch": 0.025157008278618328, + "grad_norm": 2.3243489265441895, + "learning_rate": 4.9921999674336514e-05, + "loss": 5.9791, + "step": 4230 + }, + { + "epoch": 0.025162955561899323, + "grad_norm": 2.1922898292541504, + "learning_rate": 4.9921962800873247e-05, + "loss": 5.7352, + "step": 4231 + }, + { + "epoch": 0.02516890284518032, + "grad_norm": 2.1154398918151855, + "learning_rate": 4.992192591870998e-05, + "loss": 5.6408, + "step": 4232 + }, + { + "epoch": 0.02517485012846132, + "grad_norm": 2.3520143032073975, + "learning_rate": 4.992188902784673e-05, + "loss": 5.6318, + "step": 4233 + }, + { + "epoch": 0.025180797411742315, + "grad_norm": 2.16597580909729, + "learning_rate": 4.99218521282835e-05, + "loss": 5.4978, + "step": 4234 + }, + { + "epoch": 0.025186744695023314, + "grad_norm": 2.2510032653808594, + "learning_rate": 4.992181522002032e-05, + "loss": 5.4863, + "step": 4235 + }, + { + "epoch": 0.025192691978304312, + "grad_norm": 1.9984945058822632, + "learning_rate": 4.9921778303057174e-05, + "loss": 5.7514, + "step": 4236 + }, + { + "epoch": 0.025198639261585307, + "grad_norm": 2.019435167312622, + "learning_rate": 4.9921741377394106e-05, + "loss": 5.6481, + "step": 4237 + }, + { + "epoch": 0.025204586544866306, + "grad_norm": 1.8546136617660522, + "learning_rate": 4.9921704443031114e-05, + "loss": 5.5907, + "step": 4238 + }, + { + "epoch": 0.0252105338281473, + "grad_norm": 2.012821912765503, + "learning_rate": 4.9921667499968214e-05, + "loss": 5.6942, + "step": 4239 + }, + { + "epoch": 0.0252164811114283, + "grad_norm": 2.215322971343994, + "learning_rate": 4.992163054820541e-05, + "loss": 5.6248, + "step": 4240 + }, + { + "epoch": 0.025222428394709298, + "grad_norm": 2.1009631156921387, + "learning_rate": 4.9921593587742726e-05, + "loss": 5.7769, + "step": 4241 + }, + { + "epoch": 0.025228375677990293, + "grad_norm": 2.280970335006714, + "learning_rate": 4.992155661858017e-05, + "loss": 5.4233, + "step": 4242 + }, + { + "epoch": 0.025234322961271292, + "grad_norm": 2.324589729309082, + "learning_rate": 4.992151964071776e-05, + "loss": 5.7138, + "step": 4243 + }, + { + "epoch": 0.025240270244552287, + "grad_norm": 2.01705002784729, + "learning_rate": 4.9921482654155506e-05, + "loss": 5.6946, + "step": 4244 + }, + { + "epoch": 0.025246217527833285, + "grad_norm": 2.0912036895751953, + "learning_rate": 4.9921445658893414e-05, + "loss": 5.8085, + "step": 4245 + }, + { + "epoch": 0.025252164811114284, + "grad_norm": 2.03450870513916, + "learning_rate": 4.99214086549315e-05, + "loss": 5.9129, + "step": 4246 + }, + { + "epoch": 0.02525811209439528, + "grad_norm": 2.1532092094421387, + "learning_rate": 4.9921371642269786e-05, + "loss": 5.708, + "step": 4247 + }, + { + "epoch": 0.025264059377676278, + "grad_norm": 2.2842540740966797, + "learning_rate": 4.992133462090828e-05, + "loss": 5.6693, + "step": 4248 + }, + { + "epoch": 0.025270006660957276, + "grad_norm": 2.0693325996398926, + "learning_rate": 4.9921297590846997e-05, + "loss": 5.7278, + "step": 4249 + }, + { + "epoch": 0.02527595394423827, + "grad_norm": 2.0139124393463135, + "learning_rate": 4.9921260552085934e-05, + "loss": 5.5897, + "step": 4250 + }, + { + "epoch": 0.02528190122751927, + "grad_norm": 2.4587321281433105, + "learning_rate": 4.9921223504625125e-05, + "loss": 5.6884, + "step": 4251 + }, + { + "epoch": 0.025287848510800265, + "grad_norm": 2.062640428543091, + "learning_rate": 4.992118644846457e-05, + "loss": 5.6189, + "step": 4252 + }, + { + "epoch": 0.025293795794081263, + "grad_norm": 1.9889299869537354, + "learning_rate": 4.992114938360429e-05, + "loss": 5.7326, + "step": 4253 + }, + { + "epoch": 0.025299743077362262, + "grad_norm": 2.001913547515869, + "learning_rate": 4.992111231004429e-05, + "loss": 5.6765, + "step": 4254 + }, + { + "epoch": 0.025305690360643257, + "grad_norm": 2.0345358848571777, + "learning_rate": 4.992107522778459e-05, + "loss": 5.5783, + "step": 4255 + }, + { + "epoch": 0.025311637643924256, + "grad_norm": 2.277817487716675, + "learning_rate": 4.9921038136825205e-05, + "loss": 5.6672, + "step": 4256 + }, + { + "epoch": 0.025317584927205254, + "grad_norm": 1.8992491960525513, + "learning_rate": 4.992100103716614e-05, + "loss": 5.532, + "step": 4257 + }, + { + "epoch": 0.02532353221048625, + "grad_norm": 2.202746629714966, + "learning_rate": 4.992096392880741e-05, + "loss": 5.697, + "step": 4258 + }, + { + "epoch": 0.025329479493767248, + "grad_norm": 2.020514488220215, + "learning_rate": 4.992092681174903e-05, + "loss": 5.9102, + "step": 4259 + }, + { + "epoch": 0.025335426777048243, + "grad_norm": 2.0697989463806152, + "learning_rate": 4.9920889685991e-05, + "loss": 5.5165, + "step": 4260 + }, + { + "epoch": 0.02534137406032924, + "grad_norm": 2.619258165359497, + "learning_rate": 4.992085255153336e-05, + "loss": 5.6577, + "step": 4261 + }, + { + "epoch": 0.02534732134361024, + "grad_norm": 2.1612637042999268, + "learning_rate": 4.99208154083761e-05, + "loss": 5.8193, + "step": 4262 + }, + { + "epoch": 0.025353268626891235, + "grad_norm": 1.9237465858459473, + "learning_rate": 4.9920778256519244e-05, + "loss": 5.6533, + "step": 4263 + }, + { + "epoch": 0.025359215910172234, + "grad_norm": 2.164339065551758, + "learning_rate": 4.99207410959628e-05, + "loss": 5.5566, + "step": 4264 + }, + { + "epoch": 0.025365163193453232, + "grad_norm": 2.0753626823425293, + "learning_rate": 4.992070392670678e-05, + "loss": 5.8444, + "step": 4265 + }, + { + "epoch": 0.025371110476734227, + "grad_norm": 1.977522850036621, + "learning_rate": 4.992066674875121e-05, + "loss": 5.6615, + "step": 4266 + }, + { + "epoch": 0.025377057760015226, + "grad_norm": 1.9911431074142456, + "learning_rate": 4.992062956209608e-05, + "loss": 5.6366, + "step": 4267 + }, + { + "epoch": 0.02538300504329622, + "grad_norm": 2.0334808826446533, + "learning_rate": 4.992059236674142e-05, + "loss": 5.8399, + "step": 4268 + }, + { + "epoch": 0.02538895232657722, + "grad_norm": 2.2869162559509277, + "learning_rate": 4.992055516268724e-05, + "loss": 5.7302, + "step": 4269 + }, + { + "epoch": 0.025394899609858218, + "grad_norm": 2.0845389366149902, + "learning_rate": 4.9920517949933556e-05, + "loss": 5.619, + "step": 4270 + }, + { + "epoch": 0.025400846893139213, + "grad_norm": 2.290881633758545, + "learning_rate": 4.9920480728480376e-05, + "loss": 5.5629, + "step": 4271 + }, + { + "epoch": 0.02540679417642021, + "grad_norm": 2.0897767543792725, + "learning_rate": 4.9920443498327706e-05, + "loss": 5.7009, + "step": 4272 + }, + { + "epoch": 0.025412741459701207, + "grad_norm": 1.8389668464660645, + "learning_rate": 4.9920406259475574e-05, + "loss": 5.6359, + "step": 4273 + }, + { + "epoch": 0.025418688742982205, + "grad_norm": 2.0262937545776367, + "learning_rate": 4.992036901192399e-05, + "loss": 5.6707, + "step": 4274 + }, + { + "epoch": 0.025424636026263204, + "grad_norm": 2.04280686378479, + "learning_rate": 4.992033175567295e-05, + "loss": 5.7917, + "step": 4275 + }, + { + "epoch": 0.0254305833095442, + "grad_norm": 2.0945205688476562, + "learning_rate": 4.992029449072249e-05, + "loss": 5.7208, + "step": 4276 + }, + { + "epoch": 0.025436530592825198, + "grad_norm": 1.9662036895751953, + "learning_rate": 4.992025721707261e-05, + "loss": 5.7141, + "step": 4277 + }, + { + "epoch": 0.025442477876106196, + "grad_norm": 2.582284450531006, + "learning_rate": 4.9920219934723316e-05, + "loss": 5.9514, + "step": 4278 + }, + { + "epoch": 0.02544842515938719, + "grad_norm": 1.9792051315307617, + "learning_rate": 4.992018264367464e-05, + "loss": 5.3867, + "step": 4279 + }, + { + "epoch": 0.02545437244266819, + "grad_norm": 2.0107717514038086, + "learning_rate": 4.992014534392658e-05, + "loss": 5.5985, + "step": 4280 + }, + { + "epoch": 0.025460319725949185, + "grad_norm": 2.2035727500915527, + "learning_rate": 4.9920108035479166e-05, + "loss": 5.6356, + "step": 4281 + }, + { + "epoch": 0.025466267009230183, + "grad_norm": 2.1973958015441895, + "learning_rate": 4.992007071833239e-05, + "loss": 5.3557, + "step": 4282 + }, + { + "epoch": 0.025472214292511182, + "grad_norm": 2.031371831893921, + "learning_rate": 4.9920033392486275e-05, + "loss": 5.484, + "step": 4283 + }, + { + "epoch": 0.025478161575792177, + "grad_norm": 1.9966185092926025, + "learning_rate": 4.991999605794084e-05, + "loss": 5.4137, + "step": 4284 + }, + { + "epoch": 0.025484108859073176, + "grad_norm": 1.699460506439209, + "learning_rate": 4.9919958714696085e-05, + "loss": 5.7099, + "step": 4285 + }, + { + "epoch": 0.025490056142354174, + "grad_norm": 2.270535945892334, + "learning_rate": 4.991992136275203e-05, + "loss": 5.6654, + "step": 4286 + }, + { + "epoch": 0.02549600342563517, + "grad_norm": 2.0636515617370605, + "learning_rate": 4.99198840021087e-05, + "loss": 5.6996, + "step": 4287 + }, + { + "epoch": 0.025501950708916168, + "grad_norm": 2.217365026473999, + "learning_rate": 4.991984663276608e-05, + "loss": 5.6148, + "step": 4288 + }, + { + "epoch": 0.025507897992197163, + "grad_norm": 2.182109832763672, + "learning_rate": 4.99198092547242e-05, + "loss": 5.6469, + "step": 4289 + }, + { + "epoch": 0.02551384527547816, + "grad_norm": 1.995924472808838, + "learning_rate": 4.9919771867983084e-05, + "loss": 5.7607, + "step": 4290 + }, + { + "epoch": 0.02551979255875916, + "grad_norm": 1.9308382272720337, + "learning_rate": 4.991973447254272e-05, + "loss": 5.7219, + "step": 4291 + }, + { + "epoch": 0.025525739842040155, + "grad_norm": 2.2675700187683105, + "learning_rate": 4.991969706840315e-05, + "loss": 5.7348, + "step": 4292 + }, + { + "epoch": 0.025531687125321154, + "grad_norm": 2.0441880226135254, + "learning_rate": 4.991965965556435e-05, + "loss": 5.5827, + "step": 4293 + }, + { + "epoch": 0.025537634408602152, + "grad_norm": 2.0111331939697266, + "learning_rate": 4.9919622234026376e-05, + "loss": 5.5355, + "step": 4294 + }, + { + "epoch": 0.025543581691883147, + "grad_norm": 2.214946985244751, + "learning_rate": 4.991958480378921e-05, + "loss": 5.5327, + "step": 4295 + }, + { + "epoch": 0.025549528975164146, + "grad_norm": 1.9673919677734375, + "learning_rate": 4.991954736485287e-05, + "loss": 5.5744, + "step": 4296 + }, + { + "epoch": 0.02555547625844514, + "grad_norm": 2.0662097930908203, + "learning_rate": 4.991950991721738e-05, + "loss": 5.5301, + "step": 4297 + }, + { + "epoch": 0.02556142354172614, + "grad_norm": 2.1912949085235596, + "learning_rate": 4.991947246088274e-05, + "loss": 5.6505, + "step": 4298 + }, + { + "epoch": 0.025567370825007138, + "grad_norm": 2.1073548793792725, + "learning_rate": 4.991943499584898e-05, + "loss": 5.7429, + "step": 4299 + }, + { + "epoch": 0.025573318108288133, + "grad_norm": 2.4015331268310547, + "learning_rate": 4.9919397522116096e-05, + "loss": 5.9959, + "step": 4300 + }, + { + "epoch": 0.02557926539156913, + "grad_norm": 2.5571470260620117, + "learning_rate": 4.99193600396841e-05, + "loss": 5.9058, + "step": 4301 + }, + { + "epoch": 0.02558521267485013, + "grad_norm": 2.148449182510376, + "learning_rate": 4.9919322548553026e-05, + "loss": 5.6298, + "step": 4302 + }, + { + "epoch": 0.025591159958131125, + "grad_norm": 2.3006222248077393, + "learning_rate": 4.991928504872287e-05, + "loss": 5.4854, + "step": 4303 + }, + { + "epoch": 0.025597107241412124, + "grad_norm": 2.2384679317474365, + "learning_rate": 4.9919247540193646e-05, + "loss": 5.7089, + "step": 4304 + }, + { + "epoch": 0.02560305452469312, + "grad_norm": 2.195736885070801, + "learning_rate": 4.9919210022965376e-05, + "loss": 5.986, + "step": 4305 + }, + { + "epoch": 0.025609001807974117, + "grad_norm": 2.3446342945098877, + "learning_rate": 4.991917249703806e-05, + "loss": 5.88, + "step": 4306 + }, + { + "epoch": 0.025614949091255116, + "grad_norm": 2.3800623416900635, + "learning_rate": 4.9919134962411724e-05, + "loss": 5.6897, + "step": 4307 + }, + { + "epoch": 0.02562089637453611, + "grad_norm": 1.8407396078109741, + "learning_rate": 4.991909741908637e-05, + "loss": 5.7359, + "step": 4308 + }, + { + "epoch": 0.02562684365781711, + "grad_norm": 2.3566956520080566, + "learning_rate": 4.9919059867062026e-05, + "loss": 5.5606, + "step": 4309 + }, + { + "epoch": 0.025632790941098105, + "grad_norm": 2.149317741394043, + "learning_rate": 4.991902230633869e-05, + "loss": 5.6966, + "step": 4310 + }, + { + "epoch": 0.025638738224379103, + "grad_norm": 2.3567728996276855, + "learning_rate": 4.991898473691638e-05, + "loss": 5.4694, + "step": 4311 + }, + { + "epoch": 0.025644685507660102, + "grad_norm": 1.9388068914413452, + "learning_rate": 4.9918947158795106e-05, + "loss": 5.5947, + "step": 4312 + }, + { + "epoch": 0.025650632790941097, + "grad_norm": 1.844419002532959, + "learning_rate": 4.9918909571974893e-05, + "loss": 5.6159, + "step": 4313 + }, + { + "epoch": 0.025656580074222095, + "grad_norm": 1.8664250373840332, + "learning_rate": 4.991887197645574e-05, + "loss": 5.7211, + "step": 4314 + }, + { + "epoch": 0.025662527357503094, + "grad_norm": 2.073004961013794, + "learning_rate": 4.991883437223767e-05, + "loss": 5.8873, + "step": 4315 + }, + { + "epoch": 0.02566847464078409, + "grad_norm": 2.316938877105713, + "learning_rate": 4.991879675932068e-05, + "loss": 5.4372, + "step": 4316 + }, + { + "epoch": 0.025674421924065088, + "grad_norm": 2.2646546363830566, + "learning_rate": 4.991875913770481e-05, + "loss": 5.5486, + "step": 4317 + }, + { + "epoch": 0.025680369207346083, + "grad_norm": 2.2417361736297607, + "learning_rate": 4.991872150739005e-05, + "loss": 5.2264, + "step": 4318 + }, + { + "epoch": 0.02568631649062708, + "grad_norm": 2.271566867828369, + "learning_rate": 4.9918683868376437e-05, + "loss": 5.1546, + "step": 4319 + }, + { + "epoch": 0.02569226377390808, + "grad_norm": 2.211650848388672, + "learning_rate": 4.9918646220663954e-05, + "loss": 5.382, + "step": 4320 + }, + { + "epoch": 0.025698211057189075, + "grad_norm": 2.3627288341522217, + "learning_rate": 4.991860856425263e-05, + "loss": 5.6099, + "step": 4321 + }, + { + "epoch": 0.025704158340470074, + "grad_norm": 2.3968141078948975, + "learning_rate": 4.991857089914249e-05, + "loss": 5.3689, + "step": 4322 + }, + { + "epoch": 0.025710105623751072, + "grad_norm": 2.3576786518096924, + "learning_rate": 4.991853322533352e-05, + "loss": 5.4441, + "step": 4323 + }, + { + "epoch": 0.025716052907032067, + "grad_norm": 2.0814530849456787, + "learning_rate": 4.991849554282575e-05, + "loss": 5.6137, + "step": 4324 + }, + { + "epoch": 0.025722000190313066, + "grad_norm": 2.103505849838257, + "learning_rate": 4.991845785161919e-05, + "loss": 5.5518, + "step": 4325 + }, + { + "epoch": 0.02572794747359406, + "grad_norm": 2.188350200653076, + "learning_rate": 4.991842015171386e-05, + "loss": 5.5958, + "step": 4326 + }, + { + "epoch": 0.02573389475687506, + "grad_norm": 2.124088764190674, + "learning_rate": 4.9918382443109766e-05, + "loss": 5.3851, + "step": 4327 + }, + { + "epoch": 0.025739842040156058, + "grad_norm": 2.181466579437256, + "learning_rate": 4.991834472580692e-05, + "loss": 5.4629, + "step": 4328 + }, + { + "epoch": 0.025745789323437053, + "grad_norm": 1.9634013175964355, + "learning_rate": 4.9918306999805344e-05, + "loss": 5.4768, + "step": 4329 + }, + { + "epoch": 0.02575173660671805, + "grad_norm": 2.2046115398406982, + "learning_rate": 4.991826926510503e-05, + "loss": 5.3977, + "step": 4330 + }, + { + "epoch": 0.02575768388999905, + "grad_norm": 1.8660465478897095, + "learning_rate": 4.9918231521706014e-05, + "loss": 5.4837, + "step": 4331 + }, + { + "epoch": 0.025763631173280045, + "grad_norm": 1.9825572967529297, + "learning_rate": 4.99181937696083e-05, + "loss": 5.5158, + "step": 4332 + }, + { + "epoch": 0.025769578456561044, + "grad_norm": 1.9114030599594116, + "learning_rate": 4.9918156008811906e-05, + "loss": 5.3291, + "step": 4333 + }, + { + "epoch": 0.02577552573984204, + "grad_norm": 2.008059024810791, + "learning_rate": 4.9918118239316835e-05, + "loss": 5.2993, + "step": 4334 + }, + { + "epoch": 0.025781473023123037, + "grad_norm": 2.0090153217315674, + "learning_rate": 4.991808046112311e-05, + "loss": 5.2951, + "step": 4335 + }, + { + "epoch": 0.025787420306404036, + "grad_norm": 2.013878345489502, + "learning_rate": 4.991804267423074e-05, + "loss": 5.3491, + "step": 4336 + }, + { + "epoch": 0.02579336758968503, + "grad_norm": 2.1889898777008057, + "learning_rate": 4.9918004878639734e-05, + "loss": 5.2744, + "step": 4337 + }, + { + "epoch": 0.02579931487296603, + "grad_norm": 1.9945006370544434, + "learning_rate": 4.991796707435012e-05, + "loss": 5.5176, + "step": 4338 + }, + { + "epoch": 0.025805262156247025, + "grad_norm": 2.1205811500549316, + "learning_rate": 4.9917929261361894e-05, + "loss": 5.6534, + "step": 4339 + }, + { + "epoch": 0.025811209439528023, + "grad_norm": 2.6607353687286377, + "learning_rate": 4.991789143967508e-05, + "loss": 6.343, + "step": 4340 + }, + { + "epoch": 0.025817156722809022, + "grad_norm": 2.241818904876709, + "learning_rate": 4.991785360928968e-05, + "loss": 5.6774, + "step": 4341 + }, + { + "epoch": 0.025823104006090017, + "grad_norm": 1.9817326068878174, + "learning_rate": 4.9917815770205723e-05, + "loss": 5.7686, + "step": 4342 + }, + { + "epoch": 0.025829051289371015, + "grad_norm": 2.323802947998047, + "learning_rate": 4.991777792242321e-05, + "loss": 5.9564, + "step": 4343 + }, + { + "epoch": 0.025834998572652014, + "grad_norm": 2.3318228721618652, + "learning_rate": 4.991774006594216e-05, + "loss": 5.9057, + "step": 4344 + }, + { + "epoch": 0.02584094585593301, + "grad_norm": 2.032776355743408, + "learning_rate": 4.991770220076258e-05, + "loss": 5.9753, + "step": 4345 + }, + { + "epoch": 0.025846893139214008, + "grad_norm": 2.116837739944458, + "learning_rate": 4.9917664326884495e-05, + "loss": 5.8458, + "step": 4346 + }, + { + "epoch": 0.025852840422495003, + "grad_norm": 2.312878370285034, + "learning_rate": 4.991762644430791e-05, + "loss": 5.5128, + "step": 4347 + }, + { + "epoch": 0.025858787705776, + "grad_norm": 2.3003859519958496, + "learning_rate": 4.991758855303283e-05, + "loss": 5.7192, + "step": 4348 + }, + { + "epoch": 0.025864734989057, + "grad_norm": 1.898258924484253, + "learning_rate": 4.9917550653059286e-05, + "loss": 5.6422, + "step": 4349 + }, + { + "epoch": 0.025870682272337995, + "grad_norm": 1.9477754831314087, + "learning_rate": 4.9917512744387276e-05, + "loss": 5.7885, + "step": 4350 + }, + { + "epoch": 0.025876629555618993, + "grad_norm": 2.479979991912842, + "learning_rate": 4.991747482701683e-05, + "loss": 5.4692, + "step": 4351 + }, + { + "epoch": 0.025882576838899992, + "grad_norm": 2.324336290359497, + "learning_rate": 4.991743690094794e-05, + "loss": 5.4186, + "step": 4352 + }, + { + "epoch": 0.025888524122180987, + "grad_norm": 2.076723337173462, + "learning_rate": 4.9917398966180625e-05, + "loss": 5.4363, + "step": 4353 + }, + { + "epoch": 0.025894471405461986, + "grad_norm": 1.9004534482955933, + "learning_rate": 4.991736102271492e-05, + "loss": 5.6451, + "step": 4354 + }, + { + "epoch": 0.02590041868874298, + "grad_norm": 1.8098558187484741, + "learning_rate": 4.991732307055082e-05, + "loss": 5.8666, + "step": 4355 + }, + { + "epoch": 0.02590636597202398, + "grad_norm": 2.1158571243286133, + "learning_rate": 4.991728510968833e-05, + "loss": 5.5421, + "step": 4356 + }, + { + "epoch": 0.025912313255304978, + "grad_norm": 2.1235690116882324, + "learning_rate": 4.991724714012748e-05, + "loss": 5.9947, + "step": 4357 + }, + { + "epoch": 0.025918260538585973, + "grad_norm": 2.1306662559509277, + "learning_rate": 4.9917209161868276e-05, + "loss": 5.4648, + "step": 4358 + }, + { + "epoch": 0.02592420782186697, + "grad_norm": 1.7927355766296387, + "learning_rate": 4.991717117491073e-05, + "loss": 5.4339, + "step": 4359 + }, + { + "epoch": 0.02593015510514797, + "grad_norm": 2.314069986343384, + "learning_rate": 4.991713317925485e-05, + "loss": 5.5534, + "step": 4360 + }, + { + "epoch": 0.025936102388428965, + "grad_norm": 2.2628493309020996, + "learning_rate": 4.9917095174900665e-05, + "loss": 5.5996, + "step": 4361 + }, + { + "epoch": 0.025942049671709964, + "grad_norm": 2.1669869422912598, + "learning_rate": 4.991705716184818e-05, + "loss": 5.704, + "step": 4362 + }, + { + "epoch": 0.02594799695499096, + "grad_norm": 2.2048137187957764, + "learning_rate": 4.99170191400974e-05, + "loss": 5.6576, + "step": 4363 + }, + { + "epoch": 0.025953944238271957, + "grad_norm": 2.172398328781128, + "learning_rate": 4.991698110964835e-05, + "loss": 5.7254, + "step": 4364 + }, + { + "epoch": 0.025959891521552956, + "grad_norm": 1.9689068794250488, + "learning_rate": 4.9916943070501047e-05, + "loss": 5.7303, + "step": 4365 + }, + { + "epoch": 0.02596583880483395, + "grad_norm": 1.7037044763565063, + "learning_rate": 4.991690502265549e-05, + "loss": 5.6542, + "step": 4366 + }, + { + "epoch": 0.02597178608811495, + "grad_norm": 1.7666655778884888, + "learning_rate": 4.9916866966111695e-05, + "loss": 5.7833, + "step": 4367 + }, + { + "epoch": 0.025977733371395945, + "grad_norm": 2.0178141593933105, + "learning_rate": 4.991682890086968e-05, + "loss": 5.7759, + "step": 4368 + }, + { + "epoch": 0.025983680654676943, + "grad_norm": 1.7989983558654785, + "learning_rate": 4.991679082692946e-05, + "loss": 5.8772, + "step": 4369 + }, + { + "epoch": 0.025989627937957942, + "grad_norm": 1.8004199266433716, + "learning_rate": 4.9916752744291054e-05, + "loss": 5.6145, + "step": 4370 + }, + { + "epoch": 0.025995575221238937, + "grad_norm": 1.837074637413025, + "learning_rate": 4.991671465295446e-05, + "loss": 5.4874, + "step": 4371 + }, + { + "epoch": 0.026001522504519935, + "grad_norm": 1.7436491250991821, + "learning_rate": 4.991667655291969e-05, + "loss": 5.7212, + "step": 4372 + }, + { + "epoch": 0.026007469787800934, + "grad_norm": 1.7802095413208008, + "learning_rate": 4.991663844418678e-05, + "loss": 5.7004, + "step": 4373 + }, + { + "epoch": 0.02601341707108193, + "grad_norm": 2.112487316131592, + "learning_rate": 4.991660032675572e-05, + "loss": 5.5579, + "step": 4374 + }, + { + "epoch": 0.026019364354362928, + "grad_norm": 2.0917413234710693, + "learning_rate": 4.9916562200626535e-05, + "loss": 5.7825, + "step": 4375 + }, + { + "epoch": 0.026025311637643923, + "grad_norm": 1.8323053121566772, + "learning_rate": 4.991652406579924e-05, + "loss": 5.7699, + "step": 4376 + }, + { + "epoch": 0.02603125892092492, + "grad_norm": 1.9480723142623901, + "learning_rate": 4.9916485922273835e-05, + "loss": 5.6591, + "step": 4377 + }, + { + "epoch": 0.02603720620420592, + "grad_norm": 2.000739812850952, + "learning_rate": 4.991644777005035e-05, + "loss": 5.8919, + "step": 4378 + }, + { + "epoch": 0.026043153487486915, + "grad_norm": 2.093573808670044, + "learning_rate": 4.991640960912879e-05, + "loss": 5.7357, + "step": 4379 + }, + { + "epoch": 0.026049100770767913, + "grad_norm": 1.932019591331482, + "learning_rate": 4.991637143950916e-05, + "loss": 5.7268, + "step": 4380 + }, + { + "epoch": 0.026055048054048912, + "grad_norm": 1.820102572441101, + "learning_rate": 4.991633326119149e-05, + "loss": 5.8733, + "step": 4381 + }, + { + "epoch": 0.026060995337329907, + "grad_norm": 1.9091769456863403, + "learning_rate": 4.991629507417578e-05, + "loss": 5.5532, + "step": 4382 + }, + { + "epoch": 0.026066942620610906, + "grad_norm": 2.0037779808044434, + "learning_rate": 4.991625687846205e-05, + "loss": 5.7841, + "step": 4383 + }, + { + "epoch": 0.0260728899038919, + "grad_norm": 1.7106568813323975, + "learning_rate": 4.991621867405032e-05, + "loss": 5.4486, + "step": 4384 + }, + { + "epoch": 0.0260788371871729, + "grad_norm": 1.7802643775939941, + "learning_rate": 4.9916180460940585e-05, + "loss": 5.7494, + "step": 4385 + }, + { + "epoch": 0.026084784470453898, + "grad_norm": 2.089503288269043, + "learning_rate": 4.991614223913288e-05, + "loss": 5.6044, + "step": 4386 + }, + { + "epoch": 0.026090731753734893, + "grad_norm": 2.3315577507019043, + "learning_rate": 4.99161040086272e-05, + "loss": 5.9552, + "step": 4387 + }, + { + "epoch": 0.02609667903701589, + "grad_norm": 2.1202025413513184, + "learning_rate": 4.9916065769423566e-05, + "loss": 5.778, + "step": 4388 + }, + { + "epoch": 0.02610262632029689, + "grad_norm": 2.3448777198791504, + "learning_rate": 4.991602752152199e-05, + "loss": 5.8014, + "step": 4389 + }, + { + "epoch": 0.026108573603577885, + "grad_norm": 2.1613330841064453, + "learning_rate": 4.9915989264922495e-05, + "loss": 5.731, + "step": 4390 + }, + { + "epoch": 0.026114520886858884, + "grad_norm": 2.0314743518829346, + "learning_rate": 4.991595099962507e-05, + "loss": 5.8181, + "step": 4391 + }, + { + "epoch": 0.02612046817013988, + "grad_norm": 2.053994655609131, + "learning_rate": 4.9915912725629755e-05, + "loss": 5.7264, + "step": 4392 + }, + { + "epoch": 0.026126415453420877, + "grad_norm": 1.8720483779907227, + "learning_rate": 4.991587444293655e-05, + "loss": 5.5229, + "step": 4393 + }, + { + "epoch": 0.026132362736701876, + "grad_norm": 1.8745067119598389, + "learning_rate": 4.991583615154547e-05, + "loss": 5.612, + "step": 4394 + }, + { + "epoch": 0.02613831001998287, + "grad_norm": 2.124157428741455, + "learning_rate": 4.9915797851456525e-05, + "loss": 5.7276, + "step": 4395 + }, + { + "epoch": 0.02614425730326387, + "grad_norm": 2.2587873935699463, + "learning_rate": 4.991575954266974e-05, + "loss": 5.7994, + "step": 4396 + }, + { + "epoch": 0.026150204586544865, + "grad_norm": 1.9030078649520874, + "learning_rate": 4.9915721225185116e-05, + "loss": 5.7491, + "step": 4397 + }, + { + "epoch": 0.026156151869825863, + "grad_norm": 2.2278738021850586, + "learning_rate": 4.991568289900267e-05, + "loss": 5.4701, + "step": 4398 + }, + { + "epoch": 0.02616209915310686, + "grad_norm": 2.190974473953247, + "learning_rate": 4.991564456412242e-05, + "loss": 5.6731, + "step": 4399 + }, + { + "epoch": 0.026168046436387857, + "grad_norm": 2.3491454124450684, + "learning_rate": 4.991560622054438e-05, + "loss": 5.4041, + "step": 4400 + }, + { + "epoch": 0.026173993719668855, + "grad_norm": 2.2767796516418457, + "learning_rate": 4.991556786826854e-05, + "loss": 5.9005, + "step": 4401 + }, + { + "epoch": 0.026179941002949854, + "grad_norm": 2.3645145893096924, + "learning_rate": 4.991552950729496e-05, + "loss": 6.3108, + "step": 4402 + }, + { + "epoch": 0.02618588828623085, + "grad_norm": 2.1715476512908936, + "learning_rate": 4.9915491137623605e-05, + "loss": 5.8186, + "step": 4403 + }, + { + "epoch": 0.026191835569511848, + "grad_norm": 2.195758581161499, + "learning_rate": 4.991545275925452e-05, + "loss": 5.692, + "step": 4404 + }, + { + "epoch": 0.026197782852792843, + "grad_norm": 2.1124489307403564, + "learning_rate": 4.9915414372187705e-05, + "loss": 5.6582, + "step": 4405 + }, + { + "epoch": 0.02620373013607384, + "grad_norm": 1.9873831272125244, + "learning_rate": 4.991537597642317e-05, + "loss": 5.6309, + "step": 4406 + }, + { + "epoch": 0.02620967741935484, + "grad_norm": 1.9675770998001099, + "learning_rate": 4.991533757196094e-05, + "loss": 5.7095, + "step": 4407 + }, + { + "epoch": 0.026215624702635835, + "grad_norm": 1.9072648286819458, + "learning_rate": 4.991529915880103e-05, + "loss": 5.6449, + "step": 4408 + }, + { + "epoch": 0.026221571985916833, + "grad_norm": 2.3060495853424072, + "learning_rate": 4.9915260736943435e-05, + "loss": 5.6712, + "step": 4409 + }, + { + "epoch": 0.026227519269197832, + "grad_norm": 2.4438107013702393, + "learning_rate": 4.991522230638819e-05, + "loss": 5.2384, + "step": 4410 + }, + { + "epoch": 0.026233466552478827, + "grad_norm": 1.8102613687515259, + "learning_rate": 4.991518386713529e-05, + "loss": 5.5508, + "step": 4411 + }, + { + "epoch": 0.026239413835759826, + "grad_norm": 2.0226693153381348, + "learning_rate": 4.991514541918476e-05, + "loss": 5.4049, + "step": 4412 + }, + { + "epoch": 0.02624536111904082, + "grad_norm": 2.261418104171753, + "learning_rate": 4.991510696253661e-05, + "loss": 5.3324, + "step": 4413 + }, + { + "epoch": 0.02625130840232182, + "grad_norm": 2.232844352722168, + "learning_rate": 4.9915068497190856e-05, + "loss": 5.2601, + "step": 4414 + }, + { + "epoch": 0.026257255685602818, + "grad_norm": 2.2306487560272217, + "learning_rate": 4.99150300231475e-05, + "loss": 5.3329, + "step": 4415 + }, + { + "epoch": 0.026263202968883813, + "grad_norm": 2.1368730068206787, + "learning_rate": 4.9914991540406574e-05, + "loss": 5.573, + "step": 4416 + }, + { + "epoch": 0.02626915025216481, + "grad_norm": 1.984078288078308, + "learning_rate": 4.991495304896808e-05, + "loss": 5.6518, + "step": 4417 + }, + { + "epoch": 0.02627509753544581, + "grad_norm": 2.0585875511169434, + "learning_rate": 4.9914914548832034e-05, + "loss": 5.7076, + "step": 4418 + }, + { + "epoch": 0.026281044818726805, + "grad_norm": 1.9880858659744263, + "learning_rate": 4.991487603999845e-05, + "loss": 5.6533, + "step": 4419 + }, + { + "epoch": 0.026286992102007804, + "grad_norm": 2.0475687980651855, + "learning_rate": 4.991483752246734e-05, + "loss": 5.6311, + "step": 4420 + }, + { + "epoch": 0.0262929393852888, + "grad_norm": 2.2796714305877686, + "learning_rate": 4.991479899623871e-05, + "loss": 5.364, + "step": 4421 + }, + { + "epoch": 0.026298886668569797, + "grad_norm": 1.8535730838775635, + "learning_rate": 4.991476046131259e-05, + "loss": 5.6153, + "step": 4422 + }, + { + "epoch": 0.026304833951850796, + "grad_norm": 1.97511887550354, + "learning_rate": 4.9914721917688976e-05, + "loss": 5.5682, + "step": 4423 + }, + { + "epoch": 0.02631078123513179, + "grad_norm": 1.9052705764770508, + "learning_rate": 4.99146833653679e-05, + "loss": 5.5609, + "step": 4424 + }, + { + "epoch": 0.02631672851841279, + "grad_norm": 1.9997434616088867, + "learning_rate": 4.9914644804349356e-05, + "loss": 5.6196, + "step": 4425 + }, + { + "epoch": 0.026322675801693788, + "grad_norm": 1.6116957664489746, + "learning_rate": 4.991460623463337e-05, + "loss": 5.5003, + "step": 4426 + }, + { + "epoch": 0.026328623084974783, + "grad_norm": 1.8156583309173584, + "learning_rate": 4.991456765621996e-05, + "loss": 5.5875, + "step": 4427 + }, + { + "epoch": 0.02633457036825578, + "grad_norm": 2.0364272594451904, + "learning_rate": 4.991452906910912e-05, + "loss": 5.6541, + "step": 4428 + }, + { + "epoch": 0.026340517651536777, + "grad_norm": 1.8430767059326172, + "learning_rate": 4.991449047330088e-05, + "loss": 5.5408, + "step": 4429 + }, + { + "epoch": 0.026346464934817775, + "grad_norm": 2.049476385116577, + "learning_rate": 4.991445186879525e-05, + "loss": 5.5644, + "step": 4430 + }, + { + "epoch": 0.026352412218098774, + "grad_norm": 1.9186240434646606, + "learning_rate": 4.991441325559224e-05, + "loss": 5.5977, + "step": 4431 + }, + { + "epoch": 0.02635835950137977, + "grad_norm": 1.80244779586792, + "learning_rate": 4.991437463369186e-05, + "loss": 5.5114, + "step": 4432 + }, + { + "epoch": 0.026364306784660767, + "grad_norm": 2.2580177783966064, + "learning_rate": 4.991433600309414e-05, + "loss": 5.4132, + "step": 4433 + }, + { + "epoch": 0.026370254067941763, + "grad_norm": 2.0970637798309326, + "learning_rate": 4.991429736379908e-05, + "loss": 5.6211, + "step": 4434 + }, + { + "epoch": 0.02637620135122276, + "grad_norm": 2.0690932273864746, + "learning_rate": 4.9914258715806696e-05, + "loss": 5.6511, + "step": 4435 + }, + { + "epoch": 0.02638214863450376, + "grad_norm": 2.063052177429199, + "learning_rate": 4.9914220059117e-05, + "loss": 5.5169, + "step": 4436 + }, + { + "epoch": 0.026388095917784755, + "grad_norm": 1.990708827972412, + "learning_rate": 4.991418139373001e-05, + "loss": 5.5018, + "step": 4437 + }, + { + "epoch": 0.026394043201065753, + "grad_norm": 2.1311633586883545, + "learning_rate": 4.9914142719645736e-05, + "loss": 5.4714, + "step": 4438 + }, + { + "epoch": 0.026399990484346752, + "grad_norm": 1.7688508033752441, + "learning_rate": 4.991410403686419e-05, + "loss": 5.5208, + "step": 4439 + }, + { + "epoch": 0.026405937767627747, + "grad_norm": 2.3486130237579346, + "learning_rate": 4.9914065345385383e-05, + "loss": 5.4524, + "step": 4440 + }, + { + "epoch": 0.026411885050908745, + "grad_norm": 2.0333707332611084, + "learning_rate": 4.9914026645209344e-05, + "loss": 5.6747, + "step": 4441 + }, + { + "epoch": 0.02641783233418974, + "grad_norm": 1.8731845617294312, + "learning_rate": 4.991398793633607e-05, + "loss": 5.6436, + "step": 4442 + }, + { + "epoch": 0.02642377961747074, + "grad_norm": 2.003361225128174, + "learning_rate": 4.991394921876558e-05, + "loss": 5.4628, + "step": 4443 + }, + { + "epoch": 0.026429726900751738, + "grad_norm": 2.1195411682128906, + "learning_rate": 4.991391049249789e-05, + "loss": 5.4096, + "step": 4444 + }, + { + "epoch": 0.026435674184032733, + "grad_norm": 1.857364535331726, + "learning_rate": 4.991387175753301e-05, + "loss": 5.3928, + "step": 4445 + }, + { + "epoch": 0.02644162146731373, + "grad_norm": 1.8932915925979614, + "learning_rate": 4.991383301387095e-05, + "loss": 5.4917, + "step": 4446 + }, + { + "epoch": 0.02644756875059473, + "grad_norm": 1.8743010759353638, + "learning_rate": 4.991379426151174e-05, + "loss": 5.6766, + "step": 4447 + }, + { + "epoch": 0.026453516033875725, + "grad_norm": 1.910796046257019, + "learning_rate": 4.991375550045537e-05, + "loss": 5.4347, + "step": 4448 + }, + { + "epoch": 0.026459463317156724, + "grad_norm": 1.7901744842529297, + "learning_rate": 4.991371673070187e-05, + "loss": 5.5339, + "step": 4449 + }, + { + "epoch": 0.02646541060043772, + "grad_norm": 1.86943519115448, + "learning_rate": 4.9913677952251244e-05, + "loss": 5.4867, + "step": 4450 + }, + { + "epoch": 0.026471357883718717, + "grad_norm": 1.8662208318710327, + "learning_rate": 4.991363916510352e-05, + "loss": 5.4992, + "step": 4451 + }, + { + "epoch": 0.026477305166999716, + "grad_norm": 1.7465355396270752, + "learning_rate": 4.99136003692587e-05, + "loss": 5.5243, + "step": 4452 + }, + { + "epoch": 0.02648325245028071, + "grad_norm": 1.9097687005996704, + "learning_rate": 4.9913561564716794e-05, + "loss": 5.5096, + "step": 4453 + }, + { + "epoch": 0.02648919973356171, + "grad_norm": 2.1472127437591553, + "learning_rate": 4.991352275147783e-05, + "loss": 5.4462, + "step": 4454 + }, + { + "epoch": 0.026495147016842708, + "grad_norm": 2.3966939449310303, + "learning_rate": 4.9913483929541806e-05, + "loss": 5.2938, + "step": 4455 + }, + { + "epoch": 0.026501094300123703, + "grad_norm": 2.1738977432250977, + "learning_rate": 4.991344509890874e-05, + "loss": 5.317, + "step": 4456 + }, + { + "epoch": 0.0265070415834047, + "grad_norm": 1.963944435119629, + "learning_rate": 4.9913406259578646e-05, + "loss": 5.3827, + "step": 4457 + }, + { + "epoch": 0.026512988866685697, + "grad_norm": 2.1755871772766113, + "learning_rate": 4.991336741155155e-05, + "loss": 5.2941, + "step": 4458 + }, + { + "epoch": 0.026518936149966695, + "grad_norm": 2.2461934089660645, + "learning_rate": 4.991332855482744e-05, + "loss": 5.3503, + "step": 4459 + }, + { + "epoch": 0.026524883433247694, + "grad_norm": 2.2270491123199463, + "learning_rate": 4.9913289689406355e-05, + "loss": 5.417, + "step": 4460 + }, + { + "epoch": 0.02653083071652869, + "grad_norm": 2.437074661254883, + "learning_rate": 4.991325081528829e-05, + "loss": 5.1938, + "step": 4461 + }, + { + "epoch": 0.026536777999809687, + "grad_norm": 2.159170150756836, + "learning_rate": 4.991321193247328e-05, + "loss": 5.2088, + "step": 4462 + }, + { + "epoch": 0.026542725283090682, + "grad_norm": 2.08797287940979, + "learning_rate": 4.9913173040961315e-05, + "loss": 5.1829, + "step": 4463 + }, + { + "epoch": 0.02654867256637168, + "grad_norm": 2.805191993713379, + "learning_rate": 4.991313414075242e-05, + "loss": 6.3049, + "step": 4464 + }, + { + "epoch": 0.02655461984965268, + "grad_norm": 2.3204843997955322, + "learning_rate": 4.991309523184661e-05, + "loss": 5.3831, + "step": 4465 + }, + { + "epoch": 0.026560567132933675, + "grad_norm": 2.217212200164795, + "learning_rate": 4.991305631424389e-05, + "loss": 5.4647, + "step": 4466 + }, + { + "epoch": 0.026566514416214673, + "grad_norm": 2.1094207763671875, + "learning_rate": 4.991301738794429e-05, + "loss": 5.5837, + "step": 4467 + }, + { + "epoch": 0.026572461699495672, + "grad_norm": 2.225660562515259, + "learning_rate": 4.99129784529478e-05, + "loss": 5.8316, + "step": 4468 + }, + { + "epoch": 0.026578408982776667, + "grad_norm": 2.361238956451416, + "learning_rate": 4.991293950925446e-05, + "loss": 5.8358, + "step": 4469 + }, + { + "epoch": 0.026584356266057665, + "grad_norm": 2.3268609046936035, + "learning_rate": 4.991290055686426e-05, + "loss": 5.732, + "step": 4470 + }, + { + "epoch": 0.02659030354933866, + "grad_norm": 2.1456172466278076, + "learning_rate": 4.9912861595777226e-05, + "loss": 5.9, + "step": 4471 + }, + { + "epoch": 0.02659625083261966, + "grad_norm": 2.114696979522705, + "learning_rate": 4.991282262599337e-05, + "loss": 5.4464, + "step": 4472 + }, + { + "epoch": 0.026602198115900658, + "grad_norm": 1.7981528043746948, + "learning_rate": 4.9912783647512705e-05, + "loss": 5.5053, + "step": 4473 + }, + { + "epoch": 0.026608145399181653, + "grad_norm": 1.9743404388427734, + "learning_rate": 4.9912744660335245e-05, + "loss": 5.5877, + "step": 4474 + }, + { + "epoch": 0.02661409268246265, + "grad_norm": 2.052358865737915, + "learning_rate": 4.991270566446101e-05, + "loss": 5.5891, + "step": 4475 + }, + { + "epoch": 0.02662003996574365, + "grad_norm": 2.1602041721343994, + "learning_rate": 4.991266665989e-05, + "loss": 5.581, + "step": 4476 + }, + { + "epoch": 0.026625987249024645, + "grad_norm": 2.241586685180664, + "learning_rate": 4.9912627646622236e-05, + "loss": 5.5375, + "step": 4477 + }, + { + "epoch": 0.026631934532305643, + "grad_norm": 1.7952601909637451, + "learning_rate": 4.991258862465773e-05, + "loss": 5.5273, + "step": 4478 + }, + { + "epoch": 0.02663788181558664, + "grad_norm": 1.9767752885818481, + "learning_rate": 4.991254959399649e-05, + "loss": 5.4476, + "step": 4479 + }, + { + "epoch": 0.026643829098867637, + "grad_norm": 1.7997682094573975, + "learning_rate": 4.991251055463855e-05, + "loss": 5.5666, + "step": 4480 + }, + { + "epoch": 0.026649776382148636, + "grad_norm": 2.3247575759887695, + "learning_rate": 4.9912471506583905e-05, + "loss": 5.5247, + "step": 4481 + }, + { + "epoch": 0.02665572366542963, + "grad_norm": 2.165900230407715, + "learning_rate": 4.991243244983257e-05, + "loss": 5.6807, + "step": 4482 + }, + { + "epoch": 0.02666167094871063, + "grad_norm": 2.598257303237915, + "learning_rate": 4.991239338438456e-05, + "loss": 5.6609, + "step": 4483 + }, + { + "epoch": 0.026667618231991628, + "grad_norm": 2.2752041816711426, + "learning_rate": 4.991235431023989e-05, + "loss": 5.5199, + "step": 4484 + }, + { + "epoch": 0.026673565515272623, + "grad_norm": 2.3482842445373535, + "learning_rate": 4.9912315227398586e-05, + "loss": 5.6438, + "step": 4485 + }, + { + "epoch": 0.02667951279855362, + "grad_norm": 2.034403085708618, + "learning_rate": 4.991227613586065e-05, + "loss": 5.6191, + "step": 4486 + }, + { + "epoch": 0.026685460081834617, + "grad_norm": 1.9002971649169922, + "learning_rate": 4.9912237035626085e-05, + "loss": 5.6627, + "step": 4487 + }, + { + "epoch": 0.026691407365115615, + "grad_norm": 2.0305564403533936, + "learning_rate": 4.9912197926694924e-05, + "loss": 5.7009, + "step": 4488 + }, + { + "epoch": 0.026697354648396614, + "grad_norm": 2.029777765274048, + "learning_rate": 4.991215880906717e-05, + "loss": 5.5201, + "step": 4489 + }, + { + "epoch": 0.02670330193167761, + "grad_norm": 1.8889492750167847, + "learning_rate": 4.991211968274283e-05, + "loss": 5.602, + "step": 4490 + }, + { + "epoch": 0.026709249214958607, + "grad_norm": 1.9616930484771729, + "learning_rate": 4.9912080547721934e-05, + "loss": 5.5352, + "step": 4491 + }, + { + "epoch": 0.026715196498239602, + "grad_norm": 2.449345827102661, + "learning_rate": 4.9912041404004485e-05, + "loss": 5.7103, + "step": 4492 + }, + { + "epoch": 0.0267211437815206, + "grad_norm": 2.5550389289855957, + "learning_rate": 4.991200225159051e-05, + "loss": 5.5593, + "step": 4493 + }, + { + "epoch": 0.0267270910648016, + "grad_norm": 2.2512362003326416, + "learning_rate": 4.9911963090479996e-05, + "loss": 5.6329, + "step": 4494 + }, + { + "epoch": 0.026733038348082595, + "grad_norm": 2.0346968173980713, + "learning_rate": 4.9911923920672984e-05, + "loss": 5.5966, + "step": 4495 + }, + { + "epoch": 0.026738985631363593, + "grad_norm": 2.013648271560669, + "learning_rate": 4.991188474216947e-05, + "loss": 5.6532, + "step": 4496 + }, + { + "epoch": 0.026744932914644592, + "grad_norm": 1.8361715078353882, + "learning_rate": 4.9911845554969484e-05, + "loss": 5.519, + "step": 4497 + }, + { + "epoch": 0.026750880197925587, + "grad_norm": 2.1487016677856445, + "learning_rate": 4.991180635907302e-05, + "loss": 5.436, + "step": 4498 + }, + { + "epoch": 0.026756827481206585, + "grad_norm": 2.277714967727661, + "learning_rate": 4.991176715448011e-05, + "loss": 5.3574, + "step": 4499 + }, + { + "epoch": 0.02676277476448758, + "grad_norm": 2.3313565254211426, + "learning_rate": 4.9911727941190755e-05, + "loss": 5.5408, + "step": 4500 + }, + { + "epoch": 0.02676872204776858, + "grad_norm": 2.105825662612915, + "learning_rate": 4.9911688719204975e-05, + "loss": 5.4801, + "step": 4501 + }, + { + "epoch": 0.026774669331049578, + "grad_norm": 2.122138261795044, + "learning_rate": 4.991164948852278e-05, + "loss": 5.4645, + "step": 4502 + }, + { + "epoch": 0.026780616614330573, + "grad_norm": 1.8742777109146118, + "learning_rate": 4.991161024914419e-05, + "loss": 5.5646, + "step": 4503 + }, + { + "epoch": 0.02678656389761157, + "grad_norm": 1.762276291847229, + "learning_rate": 4.991157100106921e-05, + "loss": 5.5672, + "step": 4504 + }, + { + "epoch": 0.02679251118089257, + "grad_norm": 1.9174740314483643, + "learning_rate": 4.9911531744297855e-05, + "loss": 5.4296, + "step": 4505 + }, + { + "epoch": 0.026798458464173565, + "grad_norm": 2.0585875511169434, + "learning_rate": 4.991149247883015e-05, + "loss": 5.5685, + "step": 4506 + }, + { + "epoch": 0.026804405747454563, + "grad_norm": 1.8675988912582397, + "learning_rate": 4.9911453204666094e-05, + "loss": 5.4757, + "step": 4507 + }, + { + "epoch": 0.02681035303073556, + "grad_norm": 2.3117783069610596, + "learning_rate": 4.99114139218057e-05, + "loss": 5.7057, + "step": 4508 + }, + { + "epoch": 0.026816300314016557, + "grad_norm": 2.5439465045928955, + "learning_rate": 4.9911374630249007e-05, + "loss": 5.7393, + "step": 4509 + }, + { + "epoch": 0.026822247597297556, + "grad_norm": 2.4611666202545166, + "learning_rate": 4.9911335329996e-05, + "loss": 5.7215, + "step": 4510 + }, + { + "epoch": 0.02682819488057855, + "grad_norm": 2.1540768146514893, + "learning_rate": 4.99112960210467e-05, + "loss": 5.7059, + "step": 4511 + }, + { + "epoch": 0.02683414216385955, + "grad_norm": 2.1183645725250244, + "learning_rate": 4.9911256703401134e-05, + "loss": 5.4454, + "step": 4512 + }, + { + "epoch": 0.026840089447140548, + "grad_norm": 2.1757540702819824, + "learning_rate": 4.9911217377059295e-05, + "loss": 5.6851, + "step": 4513 + }, + { + "epoch": 0.026846036730421543, + "grad_norm": 2.2770378589630127, + "learning_rate": 4.9911178042021214e-05, + "loss": 5.5957, + "step": 4514 + }, + { + "epoch": 0.02685198401370254, + "grad_norm": 2.320993185043335, + "learning_rate": 4.9911138698286895e-05, + "loss": 5.4674, + "step": 4515 + }, + { + "epoch": 0.026857931296983537, + "grad_norm": 2.2340428829193115, + "learning_rate": 4.991109934585636e-05, + "loss": 5.4514, + "step": 4516 + }, + { + "epoch": 0.026863878580264535, + "grad_norm": 2.1531431674957275, + "learning_rate": 4.991105998472962e-05, + "loss": 5.4386, + "step": 4517 + }, + { + "epoch": 0.026869825863545534, + "grad_norm": 2.1567044258117676, + "learning_rate": 4.991102061490667e-05, + "loss": 5.422, + "step": 4518 + }, + { + "epoch": 0.02687577314682653, + "grad_norm": 2.1181681156158447, + "learning_rate": 4.9910981236387554e-05, + "loss": 5.7214, + "step": 4519 + }, + { + "epoch": 0.026881720430107527, + "grad_norm": 2.3410873413085938, + "learning_rate": 4.9910941849172263e-05, + "loss": 5.8603, + "step": 4520 + }, + { + "epoch": 0.026887667713388526, + "grad_norm": 2.4943840503692627, + "learning_rate": 4.9910902453260824e-05, + "loss": 5.7084, + "step": 4521 + }, + { + "epoch": 0.02689361499666952, + "grad_norm": 2.1420044898986816, + "learning_rate": 4.991086304865325e-05, + "loss": 5.528, + "step": 4522 + }, + { + "epoch": 0.02689956227995052, + "grad_norm": 2.3257980346679688, + "learning_rate": 4.991082363534955e-05, + "loss": 5.6791, + "step": 4523 + }, + { + "epoch": 0.026905509563231515, + "grad_norm": 2.335049867630005, + "learning_rate": 4.991078421334974e-05, + "loss": 5.6184, + "step": 4524 + }, + { + "epoch": 0.026911456846512513, + "grad_norm": 3.7381551265716553, + "learning_rate": 4.9910744782653825e-05, + "loss": 5.954, + "step": 4525 + }, + { + "epoch": 0.02691740412979351, + "grad_norm": 3.1807587146759033, + "learning_rate": 4.991070534326183e-05, + "loss": 6.5662, + "step": 4526 + }, + { + "epoch": 0.026923351413074507, + "grad_norm": 2.378366708755493, + "learning_rate": 4.991066589517376e-05, + "loss": 6.2312, + "step": 4527 + }, + { + "epoch": 0.026929298696355505, + "grad_norm": 2.5797109603881836, + "learning_rate": 4.991062643838964e-05, + "loss": 5.9969, + "step": 4528 + }, + { + "epoch": 0.0269352459796365, + "grad_norm": 2.522815704345703, + "learning_rate": 4.991058697290948e-05, + "loss": 5.919, + "step": 4529 + }, + { + "epoch": 0.0269411932629175, + "grad_norm": 2.5215437412261963, + "learning_rate": 4.991054749873329e-05, + "loss": 5.8812, + "step": 4530 + }, + { + "epoch": 0.026947140546198498, + "grad_norm": 2.1608335971832275, + "learning_rate": 4.991050801586108e-05, + "loss": 5.8381, + "step": 4531 + }, + { + "epoch": 0.026953087829479493, + "grad_norm": 2.37752366065979, + "learning_rate": 4.991046852429288e-05, + "loss": 5.7612, + "step": 4532 + }, + { + "epoch": 0.02695903511276049, + "grad_norm": 2.117534875869751, + "learning_rate": 4.991042902402868e-05, + "loss": 5.6762, + "step": 4533 + }, + { + "epoch": 0.02696498239604149, + "grad_norm": 2.595797061920166, + "learning_rate": 4.991038951506851e-05, + "loss": 6.19, + "step": 4534 + }, + { + "epoch": 0.026970929679322485, + "grad_norm": 2.2216086387634277, + "learning_rate": 4.991034999741239e-05, + "loss": 6.1612, + "step": 4535 + }, + { + "epoch": 0.026976876962603483, + "grad_norm": 2.829735279083252, + "learning_rate": 4.991031047106032e-05, + "loss": 5.6955, + "step": 4536 + }, + { + "epoch": 0.02698282424588448, + "grad_norm": 2.5018115043640137, + "learning_rate": 4.991027093601231e-05, + "loss": 5.4966, + "step": 4537 + }, + { + "epoch": 0.026988771529165477, + "grad_norm": 2.334052085876465, + "learning_rate": 4.9910231392268385e-05, + "loss": 6.1603, + "step": 4538 + }, + { + "epoch": 0.026994718812446476, + "grad_norm": 2.497351884841919, + "learning_rate": 4.991019183982856e-05, + "loss": 6.0128, + "step": 4539 + }, + { + "epoch": 0.02700066609572747, + "grad_norm": 2.2976267337799072, + "learning_rate": 4.991015227869284e-05, + "loss": 5.6696, + "step": 4540 + }, + { + "epoch": 0.02700661337900847, + "grad_norm": 2.6851742267608643, + "learning_rate": 4.991011270886125e-05, + "loss": 5.7996, + "step": 4541 + }, + { + "epoch": 0.027012560662289468, + "grad_norm": 2.531029224395752, + "learning_rate": 4.991007313033379e-05, + "loss": 5.6671, + "step": 4542 + }, + { + "epoch": 0.027018507945570463, + "grad_norm": 2.195552110671997, + "learning_rate": 4.991003354311048e-05, + "loss": 6.3213, + "step": 4543 + }, + { + "epoch": 0.02702445522885146, + "grad_norm": 2.2973361015319824, + "learning_rate": 4.9909993947191336e-05, + "loss": 6.1523, + "step": 4544 + }, + { + "epoch": 0.027030402512132456, + "grad_norm": 2.4766385555267334, + "learning_rate": 4.990995434257637e-05, + "loss": 5.7894, + "step": 4545 + }, + { + "epoch": 0.027036349795413455, + "grad_norm": 2.486384630203247, + "learning_rate": 4.9909914729265606e-05, + "loss": 6.2814, + "step": 4546 + }, + { + "epoch": 0.027042297078694454, + "grad_norm": 2.5054233074188232, + "learning_rate": 4.9909875107259036e-05, + "loss": 6.2859, + "step": 4547 + }, + { + "epoch": 0.02704824436197545, + "grad_norm": 2.70576548576355, + "learning_rate": 4.990983547655669e-05, + "loss": 6.2424, + "step": 4548 + }, + { + "epoch": 0.027054191645256447, + "grad_norm": 3.0937716960906982, + "learning_rate": 4.990979583715858e-05, + "loss": 6.4392, + "step": 4549 + }, + { + "epoch": 0.027060138928537446, + "grad_norm": 2.6290581226348877, + "learning_rate": 4.9909756189064714e-05, + "loss": 6.3565, + "step": 4550 + }, + { + "epoch": 0.02706608621181844, + "grad_norm": 2.5180583000183105, + "learning_rate": 4.990971653227511e-05, + "loss": 6.1482, + "step": 4551 + }, + { + "epoch": 0.02707203349509944, + "grad_norm": 2.6096208095550537, + "learning_rate": 4.990967686678978e-05, + "loss": 5.7724, + "step": 4552 + }, + { + "epoch": 0.027077980778380435, + "grad_norm": 3.187276840209961, + "learning_rate": 4.990963719260874e-05, + "loss": 5.682, + "step": 4553 + }, + { + "epoch": 0.027083928061661433, + "grad_norm": 2.3522419929504395, + "learning_rate": 4.9909597509732006e-05, + "loss": 6.7045, + "step": 4554 + }, + { + "epoch": 0.02708987534494243, + "grad_norm": 2.6016366481781006, + "learning_rate": 4.990955781815959e-05, + "loss": 6.0653, + "step": 4555 + }, + { + "epoch": 0.027095822628223427, + "grad_norm": 2.5409183502197266, + "learning_rate": 4.99095181178915e-05, + "loss": 5.861, + "step": 4556 + }, + { + "epoch": 0.027101769911504425, + "grad_norm": 2.5297863483428955, + "learning_rate": 4.9909478408927754e-05, + "loss": 5.5301, + "step": 4557 + }, + { + "epoch": 0.02710771719478542, + "grad_norm": 2.4822275638580322, + "learning_rate": 4.990943869126837e-05, + "loss": 5.6919, + "step": 4558 + }, + { + "epoch": 0.02711366447806642, + "grad_norm": 2.3832650184631348, + "learning_rate": 4.9909398964913365e-05, + "loss": 5.9589, + "step": 4559 + }, + { + "epoch": 0.027119611761347417, + "grad_norm": 2.0038483142852783, + "learning_rate": 4.9909359229862734e-05, + "loss": 6.1847, + "step": 4560 + }, + { + "epoch": 0.027125559044628413, + "grad_norm": 2.3678700923919678, + "learning_rate": 4.990931948611651e-05, + "loss": 6.4794, + "step": 4561 + }, + { + "epoch": 0.02713150632790941, + "grad_norm": 2.7433204650878906, + "learning_rate": 4.990927973367469e-05, + "loss": 6.6997, + "step": 4562 + }, + { + "epoch": 0.02713745361119041, + "grad_norm": 3.5579798221588135, + "learning_rate": 4.990923997253731e-05, + "loss": 6.1809, + "step": 4563 + }, + { + "epoch": 0.027143400894471405, + "grad_norm": 3.254093647003174, + "learning_rate": 4.990920020270436e-05, + "loss": 6.1446, + "step": 4564 + }, + { + "epoch": 0.027149348177752403, + "grad_norm": 3.0661215782165527, + "learning_rate": 4.990916042417588e-05, + "loss": 6.6702, + "step": 4565 + }, + { + "epoch": 0.0271552954610334, + "grad_norm": 2.641291618347168, + "learning_rate": 4.9909120636951864e-05, + "loss": 6.4951, + "step": 4566 + }, + { + "epoch": 0.027161242744314397, + "grad_norm": 2.050675868988037, + "learning_rate": 4.990908084103233e-05, + "loss": 6.3365, + "step": 4567 + }, + { + "epoch": 0.027167190027595396, + "grad_norm": 2.081108331680298, + "learning_rate": 4.990904103641729e-05, + "loss": 6.1874, + "step": 4568 + }, + { + "epoch": 0.02717313731087639, + "grad_norm": 2.5833899974823, + "learning_rate": 4.9909001223106766e-05, + "loss": 6.0892, + "step": 4569 + }, + { + "epoch": 0.02717908459415739, + "grad_norm": 2.7387397289276123, + "learning_rate": 4.990896140110076e-05, + "loss": 6.1036, + "step": 4570 + }, + { + "epoch": 0.027185031877438388, + "grad_norm": 2.5665578842163086, + "learning_rate": 4.99089215703993e-05, + "loss": 5.9577, + "step": 4571 + }, + { + "epoch": 0.027190979160719383, + "grad_norm": 2.3825178146362305, + "learning_rate": 4.990888173100239e-05, + "loss": 5.9654, + "step": 4572 + }, + { + "epoch": 0.02719692644400038, + "grad_norm": 2.562509059906006, + "learning_rate": 4.990884188291005e-05, + "loss": 6.009, + "step": 4573 + }, + { + "epoch": 0.027202873727281376, + "grad_norm": 2.141941785812378, + "learning_rate": 4.9908802026122284e-05, + "loss": 5.8315, + "step": 4574 + }, + { + "epoch": 0.027208821010562375, + "grad_norm": 2.5348474979400635, + "learning_rate": 4.990876216063912e-05, + "loss": 6.3763, + "step": 4575 + }, + { + "epoch": 0.027214768293843374, + "grad_norm": 2.751520872116089, + "learning_rate": 4.990872228646056e-05, + "loss": 6.5684, + "step": 4576 + }, + { + "epoch": 0.02722071557712437, + "grad_norm": 4.626354694366455, + "learning_rate": 4.990868240358662e-05, + "loss": 6.115, + "step": 4577 + }, + { + "epoch": 0.027226662860405367, + "grad_norm": 2.648479700088501, + "learning_rate": 4.990864251201732e-05, + "loss": 6.0879, + "step": 4578 + }, + { + "epoch": 0.027232610143686366, + "grad_norm": 2.21056866645813, + "learning_rate": 4.990860261175268e-05, + "loss": 6.2923, + "step": 4579 + }, + { + "epoch": 0.02723855742696736, + "grad_norm": 2.3460421562194824, + "learning_rate": 4.9908562702792684e-05, + "loss": 6.4044, + "step": 4580 + }, + { + "epoch": 0.02724450471024836, + "grad_norm": 2.6087262630462646, + "learning_rate": 4.990852278513738e-05, + "loss": 6.5131, + "step": 4581 + }, + { + "epoch": 0.027250451993529354, + "grad_norm": 2.6969377994537354, + "learning_rate": 4.9908482858786765e-05, + "loss": 6.3483, + "step": 4582 + }, + { + "epoch": 0.027256399276810353, + "grad_norm": 2.64043927192688, + "learning_rate": 4.990844292374085e-05, + "loss": 5.8712, + "step": 4583 + }, + { + "epoch": 0.02726234656009135, + "grad_norm": 2.5738205909729004, + "learning_rate": 4.9908402979999654e-05, + "loss": 5.9165, + "step": 4584 + }, + { + "epoch": 0.027268293843372347, + "grad_norm": 2.2725625038146973, + "learning_rate": 4.99083630275632e-05, + "loss": 5.8454, + "step": 4585 + }, + { + "epoch": 0.027274241126653345, + "grad_norm": 2.5911824703216553, + "learning_rate": 4.9908323066431494e-05, + "loss": 5.6729, + "step": 4586 + }, + { + "epoch": 0.02728018840993434, + "grad_norm": 2.6691668033599854, + "learning_rate": 4.9908283096604546e-05, + "loss": 5.7726, + "step": 4587 + }, + { + "epoch": 0.02728613569321534, + "grad_norm": 2.6512796878814697, + "learning_rate": 4.990824311808238e-05, + "loss": 6.1295, + "step": 4588 + }, + { + "epoch": 0.027292082976496337, + "grad_norm": 2.816943645477295, + "learning_rate": 4.9908203130865e-05, + "loss": 5.5172, + "step": 4589 + }, + { + "epoch": 0.027298030259777332, + "grad_norm": 2.6252098083496094, + "learning_rate": 4.990816313495242e-05, + "loss": 5.5955, + "step": 4590 + }, + { + "epoch": 0.02730397754305833, + "grad_norm": 2.3711740970611572, + "learning_rate": 4.990812313034466e-05, + "loss": 5.3348, + "step": 4591 + }, + { + "epoch": 0.02730992482633933, + "grad_norm": 2.355436086654663, + "learning_rate": 4.990808311704173e-05, + "loss": 5.6171, + "step": 4592 + }, + { + "epoch": 0.027315872109620325, + "grad_norm": 2.3344695568084717, + "learning_rate": 4.990804309504365e-05, + "loss": 5.46, + "step": 4593 + }, + { + "epoch": 0.027321819392901323, + "grad_norm": 2.3890786170959473, + "learning_rate": 4.990800306435043e-05, + "loss": 5.5658, + "step": 4594 + }, + { + "epoch": 0.02732776667618232, + "grad_norm": 2.5606987476348877, + "learning_rate": 4.990796302496208e-05, + "loss": 5.4778, + "step": 4595 + }, + { + "epoch": 0.027333713959463317, + "grad_norm": 2.2443172931671143, + "learning_rate": 4.9907922976878616e-05, + "loss": 5.486, + "step": 4596 + }, + { + "epoch": 0.027339661242744315, + "grad_norm": 2.3428351879119873, + "learning_rate": 4.990788292010005e-05, + "loss": 5.3332, + "step": 4597 + }, + { + "epoch": 0.02734560852602531, + "grad_norm": 2.6336300373077393, + "learning_rate": 4.9907842854626406e-05, + "loss": 5.4606, + "step": 4598 + }, + { + "epoch": 0.02735155580930631, + "grad_norm": 2.3052382469177246, + "learning_rate": 4.990780278045769e-05, + "loss": 5.4028, + "step": 4599 + }, + { + "epoch": 0.027357503092587308, + "grad_norm": 2.4661340713500977, + "learning_rate": 4.990776269759392e-05, + "loss": 5.6011, + "step": 4600 + }, + { + "epoch": 0.027363450375868303, + "grad_norm": 2.400527238845825, + "learning_rate": 4.99077226060351e-05, + "loss": 5.5952, + "step": 4601 + }, + { + "epoch": 0.0273693976591493, + "grad_norm": 2.364900827407837, + "learning_rate": 4.9907682505781256e-05, + "loss": 5.2125, + "step": 4602 + }, + { + "epoch": 0.027375344942430296, + "grad_norm": 2.383680820465088, + "learning_rate": 4.99076423968324e-05, + "loss": 5.4253, + "step": 4603 + }, + { + "epoch": 0.027381292225711295, + "grad_norm": 2.681903839111328, + "learning_rate": 4.990760227918854e-05, + "loss": 5.3741, + "step": 4604 + }, + { + "epoch": 0.027387239508992293, + "grad_norm": 2.3454341888427734, + "learning_rate": 4.990756215284969e-05, + "loss": 5.3032, + "step": 4605 + }, + { + "epoch": 0.02739318679227329, + "grad_norm": 2.439807653427124, + "learning_rate": 4.990752201781587e-05, + "loss": 5.3368, + "step": 4606 + }, + { + "epoch": 0.027399134075554287, + "grad_norm": 2.938976764678955, + "learning_rate": 4.990748187408709e-05, + "loss": 6.1251, + "step": 4607 + }, + { + "epoch": 0.027405081358835286, + "grad_norm": 3.353973865509033, + "learning_rate": 4.990744172166337e-05, + "loss": 6.72, + "step": 4608 + }, + { + "epoch": 0.02741102864211628, + "grad_norm": 2.4661834239959717, + "learning_rate": 4.990740156054472e-05, + "loss": 5.7156, + "step": 4609 + }, + { + "epoch": 0.02741697592539728, + "grad_norm": 2.303976058959961, + "learning_rate": 4.990736139073116e-05, + "loss": 5.3493, + "step": 4610 + }, + { + "epoch": 0.027422923208678274, + "grad_norm": 2.4225149154663086, + "learning_rate": 4.990732121222268e-05, + "loss": 5.4831, + "step": 4611 + }, + { + "epoch": 0.027428870491959273, + "grad_norm": 2.5566627979278564, + "learning_rate": 4.990728102501932e-05, + "loss": 5.9159, + "step": 4612 + }, + { + "epoch": 0.02743481777524027, + "grad_norm": 2.64258074760437, + "learning_rate": 4.9907240829121085e-05, + "loss": 6.7137, + "step": 4613 + }, + { + "epoch": 0.027440765058521267, + "grad_norm": 2.967501640319824, + "learning_rate": 4.9907200624527986e-05, + "loss": 6.3333, + "step": 4614 + }, + { + "epoch": 0.027446712341802265, + "grad_norm": 2.6084952354431152, + "learning_rate": 4.990716041124005e-05, + "loss": 6.1201, + "step": 4615 + }, + { + "epoch": 0.02745265962508326, + "grad_norm": 3.0721616744995117, + "learning_rate": 4.990712018925727e-05, + "loss": 6.396, + "step": 4616 + }, + { + "epoch": 0.02745860690836426, + "grad_norm": 2.888263463973999, + "learning_rate": 4.990707995857968e-05, + "loss": 6.0773, + "step": 4617 + }, + { + "epoch": 0.027464554191645257, + "grad_norm": 2.7506093978881836, + "learning_rate": 4.990703971920728e-05, + "loss": 5.9909, + "step": 4618 + }, + { + "epoch": 0.027470501474926252, + "grad_norm": 2.8273298740386963, + "learning_rate": 4.99069994711401e-05, + "loss": 5.9591, + "step": 4619 + }, + { + "epoch": 0.02747644875820725, + "grad_norm": 2.451011896133423, + "learning_rate": 4.990695921437813e-05, + "loss": 6.1596, + "step": 4620 + }, + { + "epoch": 0.02748239604148825, + "grad_norm": 2.762265920639038, + "learning_rate": 4.990691894892141e-05, + "loss": 6.6233, + "step": 4621 + }, + { + "epoch": 0.027488343324769245, + "grad_norm": 2.4570846557617188, + "learning_rate": 4.990687867476994e-05, + "loss": 6.5025, + "step": 4622 + }, + { + "epoch": 0.027494290608050243, + "grad_norm": 3.108992576599121, + "learning_rate": 4.990683839192373e-05, + "loss": 5.921, + "step": 4623 + }, + { + "epoch": 0.02750023789133124, + "grad_norm": 2.887580156326294, + "learning_rate": 4.99067981003828e-05, + "loss": 5.9266, + "step": 4624 + }, + { + "epoch": 0.027506185174612237, + "grad_norm": 3.083556890487671, + "learning_rate": 4.990675780014718e-05, + "loss": 5.765, + "step": 4625 + }, + { + "epoch": 0.027512132457893235, + "grad_norm": 2.710231304168701, + "learning_rate": 4.990671749121685e-05, + "loss": 5.7674, + "step": 4626 + }, + { + "epoch": 0.02751807974117423, + "grad_norm": 2.738926410675049, + "learning_rate": 4.9906677173591845e-05, + "loss": 5.801, + "step": 4627 + }, + { + "epoch": 0.02752402702445523, + "grad_norm": 2.6737735271453857, + "learning_rate": 4.9906636847272176e-05, + "loss": 6.2581, + "step": 4628 + }, + { + "epoch": 0.027529974307736228, + "grad_norm": 2.623969554901123, + "learning_rate": 4.990659651225786e-05, + "loss": 5.5044, + "step": 4629 + }, + { + "epoch": 0.027535921591017223, + "grad_norm": 3.069460153579712, + "learning_rate": 4.990655616854891e-05, + "loss": 5.9639, + "step": 4630 + }, + { + "epoch": 0.02754186887429822, + "grad_norm": 2.6889147758483887, + "learning_rate": 4.990651581614534e-05, + "loss": 6.3032, + "step": 4631 + }, + { + "epoch": 0.027547816157579216, + "grad_norm": 3.5284838676452637, + "learning_rate": 4.990647545504716e-05, + "loss": 6.4104, + "step": 4632 + }, + { + "epoch": 0.027553763440860215, + "grad_norm": 2.326162338256836, + "learning_rate": 4.9906435085254384e-05, + "loss": 6.2593, + "step": 4633 + }, + { + "epoch": 0.027559710724141213, + "grad_norm": 1.946542739868164, + "learning_rate": 4.990639470676703e-05, + "loss": 6.1522, + "step": 4634 + }, + { + "epoch": 0.02756565800742221, + "grad_norm": 2.26143741607666, + "learning_rate": 4.990635431958511e-05, + "loss": 6.0189, + "step": 4635 + }, + { + "epoch": 0.027571605290703207, + "grad_norm": 2.8332626819610596, + "learning_rate": 4.990631392370865e-05, + "loss": 5.6226, + "step": 4636 + }, + { + "epoch": 0.027577552573984206, + "grad_norm": 3.919443130493164, + "learning_rate": 4.9906273519137636e-05, + "loss": 6.2147, + "step": 4637 + }, + { + "epoch": 0.0275834998572652, + "grad_norm": 2.4030275344848633, + "learning_rate": 4.9906233105872115e-05, + "loss": 5.6589, + "step": 4638 + }, + { + "epoch": 0.0275894471405462, + "grad_norm": 2.7806994915008545, + "learning_rate": 4.990619268391207e-05, + "loss": 5.4349, + "step": 4639 + }, + { + "epoch": 0.027595394423827194, + "grad_norm": 2.5759501457214355, + "learning_rate": 4.990615225325754e-05, + "loss": 6.1171, + "step": 4640 + }, + { + "epoch": 0.027601341707108193, + "grad_norm": 2.337517023086548, + "learning_rate": 4.990611181390853e-05, + "loss": 5.5514, + "step": 4641 + }, + { + "epoch": 0.02760728899038919, + "grad_norm": 2.6464250087738037, + "learning_rate": 4.990607136586505e-05, + "loss": 6.1852, + "step": 4642 + }, + { + "epoch": 0.027613236273670187, + "grad_norm": 2.030210256576538, + "learning_rate": 4.9906030909127125e-05, + "loss": 6.0919, + "step": 4643 + }, + { + "epoch": 0.027619183556951185, + "grad_norm": 2.4546520709991455, + "learning_rate": 4.990599044369475e-05, + "loss": 6.3018, + "step": 4644 + }, + { + "epoch": 0.027625130840232184, + "grad_norm": 2.508500337600708, + "learning_rate": 4.990594996956796e-05, + "loss": 5.7933, + "step": 4645 + }, + { + "epoch": 0.02763107812351318, + "grad_norm": 2.3363263607025146, + "learning_rate": 4.990590948674676e-05, + "loss": 6.4252, + "step": 4646 + }, + { + "epoch": 0.027637025406794177, + "grad_norm": 2.794673442840576, + "learning_rate": 4.990586899523116e-05, + "loss": 5.3554, + "step": 4647 + }, + { + "epoch": 0.027642972690075172, + "grad_norm": 2.5396835803985596, + "learning_rate": 4.990582849502118e-05, + "loss": 5.2352, + "step": 4648 + }, + { + "epoch": 0.02764891997335617, + "grad_norm": 2.6878976821899414, + "learning_rate": 4.990578798611684e-05, + "loss": 4.9262, + "step": 4649 + }, + { + "epoch": 0.02765486725663717, + "grad_norm": 2.2143187522888184, + "learning_rate": 4.9905747468518136e-05, + "loss": 6.0785, + "step": 4650 + }, + { + "epoch": 0.027660814539918165, + "grad_norm": 2.6812448501586914, + "learning_rate": 4.9905706942225094e-05, + "loss": 5.1692, + "step": 4651 + }, + { + "epoch": 0.027666761823199163, + "grad_norm": 2.5155227184295654, + "learning_rate": 4.9905666407237726e-05, + "loss": 5.0194, + "step": 4652 + }, + { + "epoch": 0.027672709106480158, + "grad_norm": 2.406834363937378, + "learning_rate": 4.9905625863556047e-05, + "loss": 5.1249, + "step": 4653 + }, + { + "epoch": 0.027678656389761157, + "grad_norm": 3.3666698932647705, + "learning_rate": 4.990558531118008e-05, + "loss": 5.9619, + "step": 4654 + }, + { + "epoch": 0.027684603673042155, + "grad_norm": 2.6557607650756836, + "learning_rate": 4.9905544750109826e-05, + "loss": 5.9118, + "step": 4655 + }, + { + "epoch": 0.02769055095632315, + "grad_norm": 2.60469651222229, + "learning_rate": 4.9905504180345304e-05, + "loss": 6.3746, + "step": 4656 + }, + { + "epoch": 0.02769649823960415, + "grad_norm": 2.5417349338531494, + "learning_rate": 4.9905463601886526e-05, + "loss": 5.6975, + "step": 4657 + }, + { + "epoch": 0.027702445522885148, + "grad_norm": 2.723829984664917, + "learning_rate": 4.990542301473351e-05, + "loss": 5.6189, + "step": 4658 + }, + { + "epoch": 0.027708392806166143, + "grad_norm": 3.0544204711914062, + "learning_rate": 4.990538241888627e-05, + "loss": 5.4999, + "step": 4659 + }, + { + "epoch": 0.02771434008944714, + "grad_norm": 3.0536513328552246, + "learning_rate": 4.990534181434481e-05, + "loss": 6.0636, + "step": 4660 + }, + { + "epoch": 0.027720287372728136, + "grad_norm": 3.0618786811828613, + "learning_rate": 4.990530120110916e-05, + "loss": 6.0856, + "step": 4661 + }, + { + "epoch": 0.027726234656009135, + "grad_norm": 2.6602306365966797, + "learning_rate": 4.9905260579179325e-05, + "loss": 5.8341, + "step": 4662 + }, + { + "epoch": 0.027732181939290133, + "grad_norm": 2.729137420654297, + "learning_rate": 4.990521994855532e-05, + "loss": 6.7052, + "step": 4663 + }, + { + "epoch": 0.02773812922257113, + "grad_norm": 3.0878489017486572, + "learning_rate": 4.990517930923716e-05, + "loss": 6.1308, + "step": 4664 + }, + { + "epoch": 0.027744076505852127, + "grad_norm": 2.524418354034424, + "learning_rate": 4.990513866122486e-05, + "loss": 6.2547, + "step": 4665 + }, + { + "epoch": 0.027750023789133126, + "grad_norm": 2.457075595855713, + "learning_rate": 4.990509800451844e-05, + "loss": 6.6615, + "step": 4666 + }, + { + "epoch": 0.02775597107241412, + "grad_norm": 2.474487543106079, + "learning_rate": 4.9905057339117894e-05, + "loss": 6.63, + "step": 4667 + }, + { + "epoch": 0.02776191835569512, + "grad_norm": 2.611098289489746, + "learning_rate": 4.9905016665023254e-05, + "loss": 5.8232, + "step": 4668 + }, + { + "epoch": 0.027767865638976114, + "grad_norm": 2.8012242317199707, + "learning_rate": 4.990497598223454e-05, + "loss": 5.8478, + "step": 4669 + }, + { + "epoch": 0.027773812922257113, + "grad_norm": 2.706725597381592, + "learning_rate": 4.990493529075174e-05, + "loss": 5.8585, + "step": 4670 + }, + { + "epoch": 0.02777976020553811, + "grad_norm": 2.490032196044922, + "learning_rate": 4.99048945905749e-05, + "loss": 6.2181, + "step": 4671 + }, + { + "epoch": 0.027785707488819106, + "grad_norm": 2.4735357761383057, + "learning_rate": 4.990485388170401e-05, + "loss": 6.2153, + "step": 4672 + }, + { + "epoch": 0.027791654772100105, + "grad_norm": 2.7573068141937256, + "learning_rate": 4.9904813164139094e-05, + "loss": 6.217, + "step": 4673 + }, + { + "epoch": 0.027797602055381104, + "grad_norm": 2.4663283824920654, + "learning_rate": 4.990477243788017e-05, + "loss": 6.4153, + "step": 4674 + }, + { + "epoch": 0.0278035493386621, + "grad_norm": 2.737656831741333, + "learning_rate": 4.9904731702927234e-05, + "loss": 6.5209, + "step": 4675 + }, + { + "epoch": 0.027809496621943097, + "grad_norm": 2.5112721920013428, + "learning_rate": 4.990469095928032e-05, + "loss": 5.979, + "step": 4676 + }, + { + "epoch": 0.027815443905224092, + "grad_norm": 2.6602795124053955, + "learning_rate": 4.990465020693944e-05, + "loss": 5.9206, + "step": 4677 + }, + { + "epoch": 0.02782139118850509, + "grad_norm": 2.460538625717163, + "learning_rate": 4.9904609445904606e-05, + "loss": 5.9855, + "step": 4678 + }, + { + "epoch": 0.02782733847178609, + "grad_norm": 2.750138998031616, + "learning_rate": 4.990456867617582e-05, + "loss": 5.8425, + "step": 4679 + }, + { + "epoch": 0.027833285755067085, + "grad_norm": 2.9843833446502686, + "learning_rate": 4.9904527897753114e-05, + "loss": 6.1385, + "step": 4680 + }, + { + "epoch": 0.027839233038348083, + "grad_norm": 2.586923360824585, + "learning_rate": 4.99044871106365e-05, + "loss": 5.6278, + "step": 4681 + }, + { + "epoch": 0.027845180321629078, + "grad_norm": 3.114211082458496, + "learning_rate": 4.990444631482597e-05, + "loss": 6.1259, + "step": 4682 + }, + { + "epoch": 0.027851127604910077, + "grad_norm": 2.3222453594207764, + "learning_rate": 4.990440551032157e-05, + "loss": 6.3048, + "step": 4683 + }, + { + "epoch": 0.027857074888191075, + "grad_norm": 2.15678334236145, + "learning_rate": 4.99043646971233e-05, + "loss": 5.9082, + "step": 4684 + }, + { + "epoch": 0.02786302217147207, + "grad_norm": 3.946350574493408, + "learning_rate": 4.990432387523116e-05, + "loss": 5.6907, + "step": 4685 + }, + { + "epoch": 0.02786896945475307, + "grad_norm": 2.9612419605255127, + "learning_rate": 4.9904283044645185e-05, + "loss": 5.3894, + "step": 4686 + }, + { + "epoch": 0.027874916738034067, + "grad_norm": 2.3602261543273926, + "learning_rate": 4.990424220536538e-05, + "loss": 6.0716, + "step": 4687 + }, + { + "epoch": 0.027880864021315063, + "grad_norm": 2.822300672531128, + "learning_rate": 4.990420135739177e-05, + "loss": 5.9788, + "step": 4688 + }, + { + "epoch": 0.02788681130459606, + "grad_norm": 2.766280174255371, + "learning_rate": 4.990416050072435e-05, + "loss": 5.9945, + "step": 4689 + }, + { + "epoch": 0.027892758587877056, + "grad_norm": 2.810359239578247, + "learning_rate": 4.990411963536315e-05, + "loss": 6.0598, + "step": 4690 + }, + { + "epoch": 0.027898705871158055, + "grad_norm": 2.510014295578003, + "learning_rate": 4.990407876130818e-05, + "loss": 6.1793, + "step": 4691 + }, + { + "epoch": 0.027904653154439053, + "grad_norm": 2.5394086837768555, + "learning_rate": 4.990403787855945e-05, + "loss": 6.1309, + "step": 4692 + }, + { + "epoch": 0.02791060043772005, + "grad_norm": 2.922084093093872, + "learning_rate": 4.990399698711698e-05, + "loss": 6.1956, + "step": 4693 + }, + { + "epoch": 0.027916547721001047, + "grad_norm": 3.6614181995391846, + "learning_rate": 4.9903956086980785e-05, + "loss": 6.535, + "step": 4694 + }, + { + "epoch": 0.027922495004282046, + "grad_norm": 3.3680684566497803, + "learning_rate": 4.990391517815087e-05, + "loss": 6.5729, + "step": 4695 + }, + { + "epoch": 0.02792844228756304, + "grad_norm": 2.522193431854248, + "learning_rate": 4.990387426062726e-05, + "loss": 5.9406, + "step": 4696 + }, + { + "epoch": 0.02793438957084404, + "grad_norm": 2.9665534496307373, + "learning_rate": 4.990383333440996e-05, + "loss": 6.0281, + "step": 4697 + }, + { + "epoch": 0.027940336854125034, + "grad_norm": 2.643218755722046, + "learning_rate": 4.9903792399498996e-05, + "loss": 5.8965, + "step": 4698 + }, + { + "epoch": 0.027946284137406033, + "grad_norm": 2.498765230178833, + "learning_rate": 4.990375145589436e-05, + "loss": 6.0975, + "step": 4699 + }, + { + "epoch": 0.02795223142068703, + "grad_norm": 4.380255699157715, + "learning_rate": 4.99037105035961e-05, + "loss": 6.6298, + "step": 4700 + }, + { + "epoch": 0.027958178703968026, + "grad_norm": 3.925454616546631, + "learning_rate": 4.990366954260421e-05, + "loss": 6.5742, + "step": 4701 + }, + { + "epoch": 0.027964125987249025, + "grad_norm": 2.5388591289520264, + "learning_rate": 4.99036285729187e-05, + "loss": 6.6102, + "step": 4702 + }, + { + "epoch": 0.027970073270530024, + "grad_norm": 2.6793510913848877, + "learning_rate": 4.9903587594539594e-05, + "loss": 6.4265, + "step": 4703 + }, + { + "epoch": 0.02797602055381102, + "grad_norm": 2.8652729988098145, + "learning_rate": 4.9903546607466903e-05, + "loss": 6.4567, + "step": 4704 + }, + { + "epoch": 0.027981967837092017, + "grad_norm": 2.936021089553833, + "learning_rate": 4.990350561170063e-05, + "loss": 6.404, + "step": 4705 + }, + { + "epoch": 0.027987915120373012, + "grad_norm": 3.256253719329834, + "learning_rate": 4.9903464607240816e-05, + "loss": 6.2291, + "step": 4706 + }, + { + "epoch": 0.02799386240365401, + "grad_norm": 2.8268187046051025, + "learning_rate": 4.990342359408745e-05, + "loss": 6.2582, + "step": 4707 + }, + { + "epoch": 0.02799980968693501, + "grad_norm": 2.5889041423797607, + "learning_rate": 4.9903382572240556e-05, + "loss": 6.3325, + "step": 4708 + }, + { + "epoch": 0.028005756970216004, + "grad_norm": 2.635388135910034, + "learning_rate": 4.9903341541700154e-05, + "loss": 6.1256, + "step": 4709 + }, + { + "epoch": 0.028011704253497003, + "grad_norm": 2.562976360321045, + "learning_rate": 4.990330050246625e-05, + "loss": 5.9333, + "step": 4710 + }, + { + "epoch": 0.028017651536777998, + "grad_norm": 3.488809585571289, + "learning_rate": 4.990325945453887e-05, + "loss": 6.3651, + "step": 4711 + }, + { + "epoch": 0.028023598820058997, + "grad_norm": 2.963324546813965, + "learning_rate": 4.9903218397918e-05, + "loss": 6.718, + "step": 4712 + }, + { + "epoch": 0.028029546103339995, + "grad_norm": 2.4070823192596436, + "learning_rate": 4.990317733260369e-05, + "loss": 6.2502, + "step": 4713 + }, + { + "epoch": 0.02803549338662099, + "grad_norm": 2.711190938949585, + "learning_rate": 4.9903136258595925e-05, + "loss": 6.0397, + "step": 4714 + }, + { + "epoch": 0.02804144066990199, + "grad_norm": 2.466150999069214, + "learning_rate": 4.9903095175894746e-05, + "loss": 5.9344, + "step": 4715 + }, + { + "epoch": 0.028047387953182987, + "grad_norm": 2.4558048248291016, + "learning_rate": 4.990305408450014e-05, + "loss": 6.1121, + "step": 4716 + }, + { + "epoch": 0.028053335236463982, + "grad_norm": 2.4023051261901855, + "learning_rate": 4.990301298441215e-05, + "loss": 6.0202, + "step": 4717 + }, + { + "epoch": 0.02805928251974498, + "grad_norm": 3.118098258972168, + "learning_rate": 4.9902971875630765e-05, + "loss": 6.5365, + "step": 4718 + }, + { + "epoch": 0.028065229803025976, + "grad_norm": 2.3716087341308594, + "learning_rate": 4.990293075815602e-05, + "loss": 6.1382, + "step": 4719 + }, + { + "epoch": 0.028071177086306975, + "grad_norm": 2.4663496017456055, + "learning_rate": 4.990288963198791e-05, + "loss": 5.9804, + "step": 4720 + }, + { + "epoch": 0.028077124369587973, + "grad_norm": 2.2623326778411865, + "learning_rate": 4.9902848497126466e-05, + "loss": 5.9666, + "step": 4721 + }, + { + "epoch": 0.02808307165286897, + "grad_norm": 2.4884161949157715, + "learning_rate": 4.990280735357168e-05, + "loss": 6.0203, + "step": 4722 + }, + { + "epoch": 0.028089018936149967, + "grad_norm": 2.6154520511627197, + "learning_rate": 4.990276620132359e-05, + "loss": 5.9191, + "step": 4723 + }, + { + "epoch": 0.028094966219430965, + "grad_norm": 2.692396879196167, + "learning_rate": 4.990272504038221e-05, + "loss": 6.5314, + "step": 4724 + }, + { + "epoch": 0.02810091350271196, + "grad_norm": 2.483306407928467, + "learning_rate": 4.990268387074754e-05, + "loss": 6.6522, + "step": 4725 + }, + { + "epoch": 0.02810686078599296, + "grad_norm": 3.2098593711853027, + "learning_rate": 4.99026426924196e-05, + "loss": 5.8712, + "step": 4726 + }, + { + "epoch": 0.028112808069273954, + "grad_norm": 2.7335867881774902, + "learning_rate": 4.99026015053984e-05, + "loss": 5.7678, + "step": 4727 + }, + { + "epoch": 0.028118755352554953, + "grad_norm": 2.7587473392486572, + "learning_rate": 4.990256030968396e-05, + "loss": 6.4233, + "step": 4728 + }, + { + "epoch": 0.02812470263583595, + "grad_norm": 2.7686030864715576, + "learning_rate": 4.99025191052763e-05, + "loss": 6.4572, + "step": 4729 + }, + { + "epoch": 0.028130649919116946, + "grad_norm": 2.755916118621826, + "learning_rate": 4.990247789217543e-05, + "loss": 5.9858, + "step": 4730 + }, + { + "epoch": 0.028136597202397945, + "grad_norm": 2.614316463470459, + "learning_rate": 4.990243667038135e-05, + "loss": 6.2315, + "step": 4731 + }, + { + "epoch": 0.028142544485678943, + "grad_norm": 2.0796027183532715, + "learning_rate": 4.990239543989409e-05, + "loss": 6.236, + "step": 4732 + }, + { + "epoch": 0.02814849176895994, + "grad_norm": 2.623412847518921, + "learning_rate": 4.9902354200713665e-05, + "loss": 6.3962, + "step": 4733 + }, + { + "epoch": 0.028154439052240937, + "grad_norm": 2.2746191024780273, + "learning_rate": 4.9902312952840086e-05, + "loss": 5.9101, + "step": 4734 + }, + { + "epoch": 0.028160386335521932, + "grad_norm": 2.102444887161255, + "learning_rate": 4.990227169627336e-05, + "loss": 6.4652, + "step": 4735 + }, + { + "epoch": 0.02816633361880293, + "grad_norm": 2.7720580101013184, + "learning_rate": 4.990223043101352e-05, + "loss": 5.8981, + "step": 4736 + }, + { + "epoch": 0.02817228090208393, + "grad_norm": 2.4479453563690186, + "learning_rate": 4.9902189157060564e-05, + "loss": 6.3554, + "step": 4737 + }, + { + "epoch": 0.028178228185364924, + "grad_norm": 2.7894740104675293, + "learning_rate": 4.990214787441451e-05, + "loss": 6.0017, + "step": 4738 + }, + { + "epoch": 0.028184175468645923, + "grad_norm": 2.869884490966797, + "learning_rate": 4.990210658307537e-05, + "loss": 5.9419, + "step": 4739 + }, + { + "epoch": 0.028190122751926918, + "grad_norm": 2.262723207473755, + "learning_rate": 4.990206528304316e-05, + "loss": 6.172, + "step": 4740 + }, + { + "epoch": 0.028196070035207917, + "grad_norm": 2.179358720779419, + "learning_rate": 4.99020239743179e-05, + "loss": 6.5204, + "step": 4741 + }, + { + "epoch": 0.028202017318488915, + "grad_norm": 2.085179328918457, + "learning_rate": 4.9901982656899606e-05, + "loss": 6.3972, + "step": 4742 + }, + { + "epoch": 0.02820796460176991, + "grad_norm": 1.657567024230957, + "learning_rate": 4.990194133078828e-05, + "loss": 6.4199, + "step": 4743 + }, + { + "epoch": 0.02821391188505091, + "grad_norm": 1.8054349422454834, + "learning_rate": 4.990189999598395e-05, + "loss": 6.3768, + "step": 4744 + }, + { + "epoch": 0.028219859168331907, + "grad_norm": 2.0365710258483887, + "learning_rate": 4.990185865248662e-05, + "loss": 6.3228, + "step": 4745 + }, + { + "epoch": 0.028225806451612902, + "grad_norm": 2.069211006164551, + "learning_rate": 4.9901817300296304e-05, + "loss": 5.9874, + "step": 4746 + }, + { + "epoch": 0.0282317537348939, + "grad_norm": 2.3339149951934814, + "learning_rate": 4.9901775939413026e-05, + "loss": 6.1526, + "step": 4747 + }, + { + "epoch": 0.028237701018174896, + "grad_norm": 2.0425326824188232, + "learning_rate": 4.99017345698368e-05, + "loss": 6.2157, + "step": 4748 + }, + { + "epoch": 0.028243648301455895, + "grad_norm": 2.1598799228668213, + "learning_rate": 4.9901693191567625e-05, + "loss": 6.2653, + "step": 4749 + }, + { + "epoch": 0.028249595584736893, + "grad_norm": 2.066566228866577, + "learning_rate": 4.990165180460553e-05, + "loss": 6.3788, + "step": 4750 + }, + { + "epoch": 0.02825554286801789, + "grad_norm": 2.2870383262634277, + "learning_rate": 4.9901610408950527e-05, + "loss": 6.2608, + "step": 4751 + }, + { + "epoch": 0.028261490151298887, + "grad_norm": 2.3180785179138184, + "learning_rate": 4.990156900460263e-05, + "loss": 6.3545, + "step": 4752 + }, + { + "epoch": 0.028267437434579885, + "grad_norm": 2.55261492729187, + "learning_rate": 4.990152759156185e-05, + "loss": 6.3888, + "step": 4753 + }, + { + "epoch": 0.02827338471786088, + "grad_norm": 2.087925910949707, + "learning_rate": 4.990148616982821e-05, + "loss": 6.3585, + "step": 4754 + }, + { + "epoch": 0.02827933200114188, + "grad_norm": 2.2446579933166504, + "learning_rate": 4.9901444739401714e-05, + "loss": 6.4655, + "step": 4755 + }, + { + "epoch": 0.028285279284422874, + "grad_norm": 2.2980077266693115, + "learning_rate": 4.990140330028238e-05, + "loss": 6.3776, + "step": 4756 + }, + { + "epoch": 0.028291226567703873, + "grad_norm": 2.0658226013183594, + "learning_rate": 4.9901361852470224e-05, + "loss": 6.0412, + "step": 4757 + }, + { + "epoch": 0.02829717385098487, + "grad_norm": 2.8402137756347656, + "learning_rate": 4.990132039596526e-05, + "loss": 6.0017, + "step": 4758 + }, + { + "epoch": 0.028303121134265866, + "grad_norm": 2.4620237350463867, + "learning_rate": 4.99012789307675e-05, + "loss": 5.9235, + "step": 4759 + }, + { + "epoch": 0.028309068417546865, + "grad_norm": 2.3318607807159424, + "learning_rate": 4.990123745687697e-05, + "loss": 6.2464, + "step": 4760 + }, + { + "epoch": 0.028315015700827863, + "grad_norm": 2.4998981952667236, + "learning_rate": 4.9901195974293666e-05, + "loss": 6.2731, + "step": 4761 + }, + { + "epoch": 0.02832096298410886, + "grad_norm": 2.4374287128448486, + "learning_rate": 4.9901154483017614e-05, + "loss": 6.362, + "step": 4762 + }, + { + "epoch": 0.028326910267389857, + "grad_norm": 2.6257424354553223, + "learning_rate": 4.990111298304882e-05, + "loss": 6.1456, + "step": 4763 + }, + { + "epoch": 0.028332857550670852, + "grad_norm": 2.74934458732605, + "learning_rate": 4.990107147438732e-05, + "loss": 6.0121, + "step": 4764 + }, + { + "epoch": 0.02833880483395185, + "grad_norm": 2.33137583732605, + "learning_rate": 4.9901029957033106e-05, + "loss": 6.0207, + "step": 4765 + }, + { + "epoch": 0.02834475211723285, + "grad_norm": 1.9006321430206299, + "learning_rate": 4.9900988430986196e-05, + "loss": 5.8946, + "step": 4766 + }, + { + "epoch": 0.028350699400513844, + "grad_norm": 1.9786534309387207, + "learning_rate": 4.990094689624661e-05, + "loss": 5.7782, + "step": 4767 + }, + { + "epoch": 0.028356646683794843, + "grad_norm": 2.1215951442718506, + "learning_rate": 4.9900905352814365e-05, + "loss": 5.8129, + "step": 4768 + }, + { + "epoch": 0.02836259396707584, + "grad_norm": 2.9569597244262695, + "learning_rate": 4.9900863800689465e-05, + "loss": 5.7882, + "step": 4769 + }, + { + "epoch": 0.028368541250356837, + "grad_norm": 2.720447540283203, + "learning_rate": 4.990082223987193e-05, + "loss": 5.9075, + "step": 4770 + }, + { + "epoch": 0.028374488533637835, + "grad_norm": 2.8727002143859863, + "learning_rate": 4.990078067036178e-05, + "loss": 6.1571, + "step": 4771 + }, + { + "epoch": 0.02838043581691883, + "grad_norm": 2.2992594242095947, + "learning_rate": 4.990073909215902e-05, + "loss": 6.0195, + "step": 4772 + }, + { + "epoch": 0.02838638310019983, + "grad_norm": 2.0323293209075928, + "learning_rate": 4.990069750526368e-05, + "loss": 5.8049, + "step": 4773 + }, + { + "epoch": 0.028392330383480827, + "grad_norm": 2.938795328140259, + "learning_rate": 4.9900655909675755e-05, + "loss": 6.9215, + "step": 4774 + }, + { + "epoch": 0.028398277666761822, + "grad_norm": 2.6333048343658447, + "learning_rate": 4.990061430539527e-05, + "loss": 5.868, + "step": 4775 + }, + { + "epoch": 0.02840422495004282, + "grad_norm": 2.8569674491882324, + "learning_rate": 4.990057269242223e-05, + "loss": 5.8782, + "step": 4776 + }, + { + "epoch": 0.028410172233323816, + "grad_norm": 2.62206768989563, + "learning_rate": 4.9900531070756666e-05, + "loss": 5.7751, + "step": 4777 + }, + { + "epoch": 0.028416119516604815, + "grad_norm": 2.2112414836883545, + "learning_rate": 4.990048944039858e-05, + "loss": 5.7985, + "step": 4778 + }, + { + "epoch": 0.028422066799885813, + "grad_norm": 2.1571342945098877, + "learning_rate": 4.990044780134799e-05, + "loss": 5.9089, + "step": 4779 + }, + { + "epoch": 0.028428014083166808, + "grad_norm": 2.4310410022735596, + "learning_rate": 4.9900406153604916e-05, + "loss": 5.6728, + "step": 4780 + }, + { + "epoch": 0.028433961366447807, + "grad_norm": 2.25822377204895, + "learning_rate": 4.990036449716937e-05, + "loss": 5.5808, + "step": 4781 + }, + { + "epoch": 0.028439908649728805, + "grad_norm": 2.3068299293518066, + "learning_rate": 4.990032283204136e-05, + "loss": 5.729, + "step": 4782 + }, + { + "epoch": 0.0284458559330098, + "grad_norm": 2.0582191944122314, + "learning_rate": 4.9900281158220905e-05, + "loss": 5.6877, + "step": 4783 + }, + { + "epoch": 0.0284518032162908, + "grad_norm": 2.572824239730835, + "learning_rate": 4.9900239475708015e-05, + "loss": 5.9522, + "step": 4784 + }, + { + "epoch": 0.028457750499571794, + "grad_norm": 2.299001693725586, + "learning_rate": 4.990019778450271e-05, + "loss": 5.7579, + "step": 4785 + }, + { + "epoch": 0.028463697782852793, + "grad_norm": 2.231381893157959, + "learning_rate": 4.990015608460501e-05, + "loss": 5.756, + "step": 4786 + }, + { + "epoch": 0.02846964506613379, + "grad_norm": 1.7982486486434937, + "learning_rate": 4.990011437601492e-05, + "loss": 5.8076, + "step": 4787 + }, + { + "epoch": 0.028475592349414786, + "grad_norm": 1.8788951635360718, + "learning_rate": 4.990007265873245e-05, + "loss": 5.8798, + "step": 4788 + }, + { + "epoch": 0.028481539632695785, + "grad_norm": 1.6190022230148315, + "learning_rate": 4.9900030932757623e-05, + "loss": 5.5695, + "step": 4789 + }, + { + "epoch": 0.028487486915976783, + "grad_norm": 1.9226019382476807, + "learning_rate": 4.9899989198090455e-05, + "loss": 5.671, + "step": 4790 + }, + { + "epoch": 0.02849343419925778, + "grad_norm": 1.7437139749526978, + "learning_rate": 4.989994745473097e-05, + "loss": 5.6728, + "step": 4791 + }, + { + "epoch": 0.028499381482538777, + "grad_norm": 1.624126672744751, + "learning_rate": 4.989990570267915e-05, + "loss": 5.6209, + "step": 4792 + }, + { + "epoch": 0.028505328765819772, + "grad_norm": 2.1894004344940186, + "learning_rate": 4.9899863941935046e-05, + "loss": 5.6669, + "step": 4793 + }, + { + "epoch": 0.02851127604910077, + "grad_norm": 2.2243428230285645, + "learning_rate": 4.9899822172498646e-05, + "loss": 5.4557, + "step": 4794 + }, + { + "epoch": 0.02851722333238177, + "grad_norm": 2.032611608505249, + "learning_rate": 4.989978039436998e-05, + "loss": 5.7883, + "step": 4795 + }, + { + "epoch": 0.028523170615662764, + "grad_norm": 1.8496538400650024, + "learning_rate": 4.989973860754906e-05, + "loss": 5.6329, + "step": 4796 + }, + { + "epoch": 0.028529117898943763, + "grad_norm": 1.7072707414627075, + "learning_rate": 4.989969681203589e-05, + "loss": 5.7242, + "step": 4797 + }, + { + "epoch": 0.02853506518222476, + "grad_norm": 1.7351912260055542, + "learning_rate": 4.9899655007830504e-05, + "loss": 5.648, + "step": 4798 + }, + { + "epoch": 0.028541012465505756, + "grad_norm": 2.514162302017212, + "learning_rate": 4.9899613194932904e-05, + "loss": 5.556, + "step": 4799 + }, + { + "epoch": 0.028546959748786755, + "grad_norm": 10.245063781738281, + "learning_rate": 4.98995713733431e-05, + "loss": 5.5922, + "step": 4800 + }, + { + "epoch": 0.02855290703206775, + "grad_norm": 2.012106418609619, + "learning_rate": 4.989952954306112e-05, + "loss": 5.5092, + "step": 4801 + }, + { + "epoch": 0.02855885431534875, + "grad_norm": 1.8654139041900635, + "learning_rate": 4.9899487704086966e-05, + "loss": 5.4164, + "step": 4802 + }, + { + "epoch": 0.028564801598629747, + "grad_norm": 1.778798222541809, + "learning_rate": 4.9899445856420656e-05, + "loss": 5.5537, + "step": 4803 + }, + { + "epoch": 0.028570748881910742, + "grad_norm": 2.205038547515869, + "learning_rate": 4.989940400006221e-05, + "loss": 5.9338, + "step": 4804 + }, + { + "epoch": 0.02857669616519174, + "grad_norm": 2.3908839225769043, + "learning_rate": 4.989936213501164e-05, + "loss": 5.8962, + "step": 4805 + }, + { + "epoch": 0.028582643448472736, + "grad_norm": 2.3438172340393066, + "learning_rate": 4.9899320261268966e-05, + "loss": 5.8133, + "step": 4806 + }, + { + "epoch": 0.028588590731753735, + "grad_norm": 2.4021737575531006, + "learning_rate": 4.989927837883419e-05, + "loss": 5.8366, + "step": 4807 + }, + { + "epoch": 0.028594538015034733, + "grad_norm": 1.9976004362106323, + "learning_rate": 4.989923648770734e-05, + "loss": 5.6976, + "step": 4808 + }, + { + "epoch": 0.028600485298315728, + "grad_norm": 2.2234697341918945, + "learning_rate": 4.989919458788841e-05, + "loss": 5.7871, + "step": 4809 + }, + { + "epoch": 0.028606432581596727, + "grad_norm": 2.203223705291748, + "learning_rate": 4.989915267937744e-05, + "loss": 5.5799, + "step": 4810 + }, + { + "epoch": 0.028612379864877725, + "grad_norm": 2.2155261039733887, + "learning_rate": 4.989911076217442e-05, + "loss": 5.6022, + "step": 4811 + }, + { + "epoch": 0.02861832714815872, + "grad_norm": 1.9379621744155884, + "learning_rate": 4.989906883627939e-05, + "loss": 5.8647, + "step": 4812 + }, + { + "epoch": 0.02862427443143972, + "grad_norm": 2.0589749813079834, + "learning_rate": 4.9899026901692345e-05, + "loss": 5.6048, + "step": 4813 + }, + { + "epoch": 0.028630221714720714, + "grad_norm": 2.3813774585723877, + "learning_rate": 4.9898984958413315e-05, + "loss": 5.6726, + "step": 4814 + }, + { + "epoch": 0.028636168998001713, + "grad_norm": 2.06425142288208, + "learning_rate": 4.98989430064423e-05, + "loss": 5.8505, + "step": 4815 + }, + { + "epoch": 0.02864211628128271, + "grad_norm": 2.199697494506836, + "learning_rate": 4.9898901045779326e-05, + "loss": 5.6114, + "step": 4816 + }, + { + "epoch": 0.028648063564563706, + "grad_norm": 2.136411428451538, + "learning_rate": 4.98988590764244e-05, + "loss": 5.3987, + "step": 4817 + }, + { + "epoch": 0.028654010847844705, + "grad_norm": 1.914929986000061, + "learning_rate": 4.9898817098377534e-05, + "loss": 5.702, + "step": 4818 + }, + { + "epoch": 0.028659958131125703, + "grad_norm": 2.316027879714966, + "learning_rate": 4.989877511163876e-05, + "loss": 5.5886, + "step": 4819 + }, + { + "epoch": 0.0286659054144067, + "grad_norm": 3.2775018215179443, + "learning_rate": 4.9898733116208076e-05, + "loss": 5.5337, + "step": 4820 + }, + { + "epoch": 0.028671852697687697, + "grad_norm": 2.16430926322937, + "learning_rate": 4.989869111208549e-05, + "loss": 5.7189, + "step": 4821 + }, + { + "epoch": 0.028677799980968692, + "grad_norm": 2.1936638355255127, + "learning_rate": 4.9898649099271046e-05, + "loss": 5.2942, + "step": 4822 + }, + { + "epoch": 0.02868374726424969, + "grad_norm": 2.262485980987549, + "learning_rate": 4.9898607077764736e-05, + "loss": 5.4284, + "step": 4823 + }, + { + "epoch": 0.02868969454753069, + "grad_norm": 1.7890170812606812, + "learning_rate": 4.989856504756657e-05, + "loss": 5.6021, + "step": 4824 + }, + { + "epoch": 0.028695641830811684, + "grad_norm": 1.747862696647644, + "learning_rate": 4.9898523008676585e-05, + "loss": 5.72, + "step": 4825 + }, + { + "epoch": 0.028701589114092683, + "grad_norm": 1.9750064611434937, + "learning_rate": 4.989848096109477e-05, + "loss": 5.8923, + "step": 4826 + }, + { + "epoch": 0.02870753639737368, + "grad_norm": 2.0249626636505127, + "learning_rate": 4.989843890482117e-05, + "loss": 5.4866, + "step": 4827 + }, + { + "epoch": 0.028713483680654676, + "grad_norm": 2.2737395763397217, + "learning_rate": 4.9898396839855765e-05, + "loss": 5.5498, + "step": 4828 + }, + { + "epoch": 0.028719430963935675, + "grad_norm": 2.2852187156677246, + "learning_rate": 4.98983547661986e-05, + "loss": 5.672, + "step": 4829 + }, + { + "epoch": 0.02872537824721667, + "grad_norm": 1.9441994428634644, + "learning_rate": 4.989831268384967e-05, + "loss": 5.4933, + "step": 4830 + }, + { + "epoch": 0.02873132553049767, + "grad_norm": 1.9561070203781128, + "learning_rate": 4.989827059280899e-05, + "loss": 5.7465, + "step": 4831 + }, + { + "epoch": 0.028737272813778667, + "grad_norm": 2.482849597930908, + "learning_rate": 4.9898228493076594e-05, + "loss": 5.4338, + "step": 4832 + }, + { + "epoch": 0.028743220097059662, + "grad_norm": 1.8582524061203003, + "learning_rate": 4.989818638465247e-05, + "loss": 5.5378, + "step": 4833 + }, + { + "epoch": 0.02874916738034066, + "grad_norm": 2.119783639907837, + "learning_rate": 4.9898144267536654e-05, + "loss": 5.6012, + "step": 4834 + }, + { + "epoch": 0.028755114663621656, + "grad_norm": 2.333965301513672, + "learning_rate": 4.989810214172915e-05, + "loss": 5.7376, + "step": 4835 + }, + { + "epoch": 0.028761061946902654, + "grad_norm": 2.600861072540283, + "learning_rate": 4.989806000722999e-05, + "loss": 6.2747, + "step": 4836 + }, + { + "epoch": 0.028767009230183653, + "grad_norm": 2.3250534534454346, + "learning_rate": 4.989801786403916e-05, + "loss": 5.5993, + "step": 4837 + }, + { + "epoch": 0.028772956513464648, + "grad_norm": 2.507377862930298, + "learning_rate": 4.9897975712156686e-05, + "loss": 5.3919, + "step": 4838 + }, + { + "epoch": 0.028778903796745647, + "grad_norm": 1.9882018566131592, + "learning_rate": 4.9897933551582596e-05, + "loss": 5.5939, + "step": 4839 + }, + { + "epoch": 0.028784851080026645, + "grad_norm": 2.235269784927368, + "learning_rate": 4.989789138231688e-05, + "loss": 5.4036, + "step": 4840 + }, + { + "epoch": 0.02879079836330764, + "grad_norm": 1.895071029663086, + "learning_rate": 4.989784920435959e-05, + "loss": 5.7259, + "step": 4841 + }, + { + "epoch": 0.02879674564658864, + "grad_norm": 2.0197908878326416, + "learning_rate": 4.989780701771071e-05, + "loss": 5.5114, + "step": 4842 + }, + { + "epoch": 0.028802692929869634, + "grad_norm": 1.9679557085037231, + "learning_rate": 4.989776482237025e-05, + "loss": 5.5798, + "step": 4843 + }, + { + "epoch": 0.028808640213150633, + "grad_norm": 1.980610728263855, + "learning_rate": 4.989772261833825e-05, + "loss": 5.5509, + "step": 4844 + }, + { + "epoch": 0.02881458749643163, + "grad_norm": 2.4565272331237793, + "learning_rate": 4.989768040561471e-05, + "loss": 5.4723, + "step": 4845 + }, + { + "epoch": 0.028820534779712626, + "grad_norm": 2.0567848682403564, + "learning_rate": 4.989763818419964e-05, + "loss": 5.546, + "step": 4846 + }, + { + "epoch": 0.028826482062993625, + "grad_norm": 2.0259108543395996, + "learning_rate": 4.989759595409307e-05, + "loss": 5.4138, + "step": 4847 + }, + { + "epoch": 0.028832429346274623, + "grad_norm": 1.9334442615509033, + "learning_rate": 4.9897553715295003e-05, + "loss": 5.7036, + "step": 4848 + }, + { + "epoch": 0.02883837662955562, + "grad_norm": 1.8335916996002197, + "learning_rate": 4.989751146780546e-05, + "loss": 5.6399, + "step": 4849 + }, + { + "epoch": 0.028844323912836617, + "grad_norm": 2.129821538925171, + "learning_rate": 4.989746921162445e-05, + "loss": 5.7108, + "step": 4850 + }, + { + "epoch": 0.028850271196117612, + "grad_norm": 2.4127001762390137, + "learning_rate": 4.9897426946751994e-05, + "loss": 5.3901, + "step": 4851 + }, + { + "epoch": 0.02885621847939861, + "grad_norm": 1.9506126642227173, + "learning_rate": 4.98973846731881e-05, + "loss": 5.7781, + "step": 4852 + }, + { + "epoch": 0.02886216576267961, + "grad_norm": 1.6746875047683716, + "learning_rate": 4.9897342390932786e-05, + "loss": 5.7408, + "step": 4853 + }, + { + "epoch": 0.028868113045960604, + "grad_norm": 1.95681893825531, + "learning_rate": 4.989730009998607e-05, + "loss": 5.7181, + "step": 4854 + }, + { + "epoch": 0.028874060329241603, + "grad_norm": 1.782030701637268, + "learning_rate": 4.9897257800347964e-05, + "loss": 5.5901, + "step": 4855 + }, + { + "epoch": 0.0288800076125226, + "grad_norm": 1.7590057849884033, + "learning_rate": 4.9897215492018476e-05, + "loss": 5.4566, + "step": 4856 + }, + { + "epoch": 0.028885954895803596, + "grad_norm": 2.4675025939941406, + "learning_rate": 4.989717317499764e-05, + "loss": 5.7738, + "step": 4857 + }, + { + "epoch": 0.028891902179084595, + "grad_norm": 2.221975326538086, + "learning_rate": 4.989713084928545e-05, + "loss": 5.591, + "step": 4858 + }, + { + "epoch": 0.02889784946236559, + "grad_norm": 2.21158504486084, + "learning_rate": 4.989708851488192e-05, + "loss": 5.7755, + "step": 4859 + }, + { + "epoch": 0.02890379674564659, + "grad_norm": 2.2253987789154053, + "learning_rate": 4.989704617178709e-05, + "loss": 5.8653, + "step": 4860 + }, + { + "epoch": 0.028909744028927587, + "grad_norm": 2.3298027515411377, + "learning_rate": 4.989700382000094e-05, + "loss": 5.3371, + "step": 4861 + }, + { + "epoch": 0.028915691312208582, + "grad_norm": 2.1918935775756836, + "learning_rate": 4.989696145952352e-05, + "loss": 5.4893, + "step": 4862 + }, + { + "epoch": 0.02892163859548958, + "grad_norm": 2.422117233276367, + "learning_rate": 4.989691909035482e-05, + "loss": 5.8775, + "step": 4863 + }, + { + "epoch": 0.02892758587877058, + "grad_norm": 2.4346981048583984, + "learning_rate": 4.989687671249487e-05, + "loss": 6.3671, + "step": 4864 + }, + { + "epoch": 0.028933533162051574, + "grad_norm": 2.094780921936035, + "learning_rate": 4.989683432594367e-05, + "loss": 5.7814, + "step": 4865 + }, + { + "epoch": 0.028939480445332573, + "grad_norm": 2.240318775177002, + "learning_rate": 4.9896791930701244e-05, + "loss": 5.6606, + "step": 4866 + }, + { + "epoch": 0.028945427728613568, + "grad_norm": 2.102381706237793, + "learning_rate": 4.989674952676761e-05, + "loss": 5.8477, + "step": 4867 + }, + { + "epoch": 0.028951375011894567, + "grad_norm": 2.2786238193511963, + "learning_rate": 4.989670711414277e-05, + "loss": 5.8786, + "step": 4868 + }, + { + "epoch": 0.028957322295175565, + "grad_norm": 2.079899549484253, + "learning_rate": 4.989666469282675e-05, + "loss": 6.2171, + "step": 4869 + }, + { + "epoch": 0.02896326957845656, + "grad_norm": 2.024061679840088, + "learning_rate": 4.989662226281956e-05, + "loss": 6.2889, + "step": 4870 + }, + { + "epoch": 0.02896921686173756, + "grad_norm": 2.1397578716278076, + "learning_rate": 4.989657982412122e-05, + "loss": 6.2477, + "step": 4871 + }, + { + "epoch": 0.028975164145018554, + "grad_norm": 2.1303393840789795, + "learning_rate": 4.989653737673174e-05, + "loss": 6.3005, + "step": 4872 + }, + { + "epoch": 0.028981111428299552, + "grad_norm": 2.4091451168060303, + "learning_rate": 4.989649492065114e-05, + "loss": 5.997, + "step": 4873 + }, + { + "epoch": 0.02898705871158055, + "grad_norm": 2.2236886024475098, + "learning_rate": 4.989645245587942e-05, + "loss": 5.7886, + "step": 4874 + }, + { + "epoch": 0.028993005994861546, + "grad_norm": 2.6160736083984375, + "learning_rate": 4.989640998241661e-05, + "loss": 6.1542, + "step": 4875 + }, + { + "epoch": 0.028998953278142545, + "grad_norm": 2.4163296222686768, + "learning_rate": 4.989636750026273e-05, + "loss": 6.392, + "step": 4876 + }, + { + "epoch": 0.029004900561423543, + "grad_norm": 2.079172372817993, + "learning_rate": 4.989632500941778e-05, + "loss": 6.2886, + "step": 4877 + }, + { + "epoch": 0.02901084784470454, + "grad_norm": 2.628694772720337, + "learning_rate": 4.989628250988178e-05, + "loss": 6.0359, + "step": 4878 + }, + { + "epoch": 0.029016795127985537, + "grad_norm": 2.2080392837524414, + "learning_rate": 4.989624000165474e-05, + "loss": 5.9916, + "step": 4879 + }, + { + "epoch": 0.029022742411266532, + "grad_norm": 2.4130380153656006, + "learning_rate": 4.9896197484736685e-05, + "loss": 6.3835, + "step": 4880 + }, + { + "epoch": 0.02902868969454753, + "grad_norm": 2.328511953353882, + "learning_rate": 4.989615495912762e-05, + "loss": 5.838, + "step": 4881 + }, + { + "epoch": 0.02903463697782853, + "grad_norm": 2.273345470428467, + "learning_rate": 4.989611242482757e-05, + "loss": 5.8764, + "step": 4882 + }, + { + "epoch": 0.029040584261109524, + "grad_norm": 2.1498537063598633, + "learning_rate": 4.9896069881836535e-05, + "loss": 6.1562, + "step": 4883 + }, + { + "epoch": 0.029046531544390523, + "grad_norm": 2.497267723083496, + "learning_rate": 4.989602733015455e-05, + "loss": 5.6708, + "step": 4884 + }, + { + "epoch": 0.02905247882767152, + "grad_norm": 2.232802152633667, + "learning_rate": 4.989598476978161e-05, + "loss": 5.6854, + "step": 4885 + }, + { + "epoch": 0.029058426110952516, + "grad_norm": 2.0582375526428223, + "learning_rate": 4.989594220071775e-05, + "loss": 6.5288, + "step": 4886 + }, + { + "epoch": 0.029064373394233515, + "grad_norm": 3.2556731700897217, + "learning_rate": 4.989589962296296e-05, + "loss": 5.9985, + "step": 4887 + }, + { + "epoch": 0.02907032067751451, + "grad_norm": 2.2807655334472656, + "learning_rate": 4.989585703651728e-05, + "loss": 6.1802, + "step": 4888 + }, + { + "epoch": 0.02907626796079551, + "grad_norm": 2.379136085510254, + "learning_rate": 4.989581444138071e-05, + "loss": 6.3531, + "step": 4889 + }, + { + "epoch": 0.029082215244076507, + "grad_norm": 2.9518685340881348, + "learning_rate": 4.989577183755327e-05, + "loss": 6.0689, + "step": 4890 + }, + { + "epoch": 0.029088162527357502, + "grad_norm": 2.823340654373169, + "learning_rate": 4.9895729225034973e-05, + "loss": 6.3405, + "step": 4891 + }, + { + "epoch": 0.0290941098106385, + "grad_norm": 2.4327731132507324, + "learning_rate": 4.989568660382583e-05, + "loss": 6.4928, + "step": 4892 + }, + { + "epoch": 0.0291000570939195, + "grad_norm": 2.0744240283966064, + "learning_rate": 4.9895643973925864e-05, + "loss": 6.2664, + "step": 4893 + }, + { + "epoch": 0.029106004377200494, + "grad_norm": 2.373710870742798, + "learning_rate": 4.9895601335335085e-05, + "loss": 5.9738, + "step": 4894 + }, + { + "epoch": 0.029111951660481493, + "grad_norm": 2.2934412956237793, + "learning_rate": 4.9895558688053505e-05, + "loss": 6.1353, + "step": 4895 + }, + { + "epoch": 0.029117898943762488, + "grad_norm": 2.4360926151275635, + "learning_rate": 4.989551603208114e-05, + "loss": 5.4768, + "step": 4896 + }, + { + "epoch": 0.029123846227043487, + "grad_norm": 2.8072469234466553, + "learning_rate": 4.989547336741802e-05, + "loss": 5.977, + "step": 4897 + }, + { + "epoch": 0.029129793510324485, + "grad_norm": 2.7759921550750732, + "learning_rate": 4.9895430694064135e-05, + "loss": 6.3918, + "step": 4898 + }, + { + "epoch": 0.02913574079360548, + "grad_norm": 2.4547574520111084, + "learning_rate": 4.989538801201953e-05, + "loss": 6.0461, + "step": 4899 + }, + { + "epoch": 0.02914168807688648, + "grad_norm": 2.6097168922424316, + "learning_rate": 4.9895345321284184e-05, + "loss": 5.88, + "step": 4900 + }, + { + "epoch": 0.029147635360167474, + "grad_norm": 2.8312575817108154, + "learning_rate": 4.989530262185814e-05, + "loss": 6.0314, + "step": 4901 + }, + { + "epoch": 0.029153582643448472, + "grad_norm": 2.928974151611328, + "learning_rate": 4.98952599137414e-05, + "loss": 6.3698, + "step": 4902 + }, + { + "epoch": 0.02915952992672947, + "grad_norm": 2.527578115463257, + "learning_rate": 4.989521719693398e-05, + "loss": 6.4301, + "step": 4903 + }, + { + "epoch": 0.029165477210010466, + "grad_norm": 2.392106771469116, + "learning_rate": 4.9895174471435904e-05, + "loss": 6.3515, + "step": 4904 + }, + { + "epoch": 0.029171424493291465, + "grad_norm": 1.9899437427520752, + "learning_rate": 4.989513173724717e-05, + "loss": 6.3265, + "step": 4905 + }, + { + "epoch": 0.029177371776572463, + "grad_norm": 2.057600736618042, + "learning_rate": 4.9895088994367806e-05, + "loss": 6.2402, + "step": 4906 + }, + { + "epoch": 0.029183319059853458, + "grad_norm": 2.8310391902923584, + "learning_rate": 4.989504624279783e-05, + "loss": 5.9056, + "step": 4907 + }, + { + "epoch": 0.029189266343134457, + "grad_norm": 2.904785394668579, + "learning_rate": 4.989500348253724e-05, + "loss": 5.8847, + "step": 4908 + }, + { + "epoch": 0.029195213626415452, + "grad_norm": 2.7728030681610107, + "learning_rate": 4.989496071358607e-05, + "loss": 5.8997, + "step": 4909 + }, + { + "epoch": 0.02920116090969645, + "grad_norm": 2.768862009048462, + "learning_rate": 4.989491793594432e-05, + "loss": 6.1267, + "step": 4910 + }, + { + "epoch": 0.02920710819297745, + "grad_norm": 2.4353668689727783, + "learning_rate": 4.989487514961201e-05, + "loss": 5.9087, + "step": 4911 + }, + { + "epoch": 0.029213055476258444, + "grad_norm": 2.5170469284057617, + "learning_rate": 4.9894832354589164e-05, + "loss": 6.0971, + "step": 4912 + }, + { + "epoch": 0.029219002759539443, + "grad_norm": 2.345998764038086, + "learning_rate": 4.9894789550875784e-05, + "loss": 6.2518, + "step": 4913 + }, + { + "epoch": 0.02922495004282044, + "grad_norm": 2.429123878479004, + "learning_rate": 4.98947467384719e-05, + "loss": 6.238, + "step": 4914 + }, + { + "epoch": 0.029230897326101436, + "grad_norm": 2.531514883041382, + "learning_rate": 4.9894703917377506e-05, + "loss": 6.0177, + "step": 4915 + }, + { + "epoch": 0.029236844609382435, + "grad_norm": 2.833874464035034, + "learning_rate": 4.9894661087592634e-05, + "loss": 6.2018, + "step": 4916 + }, + { + "epoch": 0.02924279189266343, + "grad_norm": 2.521381378173828, + "learning_rate": 4.9894618249117287e-05, + "loss": 6.1777, + "step": 4917 + }, + { + "epoch": 0.02924873917594443, + "grad_norm": 2.731703758239746, + "learning_rate": 4.989457540195149e-05, + "loss": 6.0237, + "step": 4918 + }, + { + "epoch": 0.029254686459225427, + "grad_norm": 2.918398141860962, + "learning_rate": 4.989453254609525e-05, + "loss": 6.5688, + "step": 4919 + }, + { + "epoch": 0.029260633742506422, + "grad_norm": 2.407552480697632, + "learning_rate": 4.989448968154859e-05, + "loss": 5.9751, + "step": 4920 + }, + { + "epoch": 0.02926658102578742, + "grad_norm": 2.575258731842041, + "learning_rate": 4.989444680831152e-05, + "loss": 5.7587, + "step": 4921 + }, + { + "epoch": 0.02927252830906842, + "grad_norm": 2.6550750732421875, + "learning_rate": 4.989440392638406e-05, + "loss": 6.6404, + "step": 4922 + }, + { + "epoch": 0.029278475592349414, + "grad_norm": 2.569438934326172, + "learning_rate": 4.989436103576621e-05, + "loss": 5.8615, + "step": 4923 + }, + { + "epoch": 0.029284422875630413, + "grad_norm": 2.4601991176605225, + "learning_rate": 4.989431813645801e-05, + "loss": 5.8969, + "step": 4924 + }, + { + "epoch": 0.029290370158911408, + "grad_norm": 3.579819917678833, + "learning_rate": 4.989427522845945e-05, + "loss": 5.8832, + "step": 4925 + }, + { + "epoch": 0.029296317442192406, + "grad_norm": 2.5762264728546143, + "learning_rate": 4.9894232311770556e-05, + "loss": 5.4841, + "step": 4926 + }, + { + "epoch": 0.029302264725473405, + "grad_norm": 3.352381706237793, + "learning_rate": 4.989418938639134e-05, + "loss": 5.8936, + "step": 4927 + }, + { + "epoch": 0.0293082120087544, + "grad_norm": 2.824322462081909, + "learning_rate": 4.9894146452321835e-05, + "loss": 5.8291, + "step": 4928 + }, + { + "epoch": 0.0293141592920354, + "grad_norm": 2.6431384086608887, + "learning_rate": 4.9894103509562026e-05, + "loss": 6.2519, + "step": 4929 + }, + { + "epoch": 0.029320106575316394, + "grad_norm": 3.0580949783325195, + "learning_rate": 4.989406055811195e-05, + "loss": 6.4141, + "step": 4930 + }, + { + "epoch": 0.029326053858597392, + "grad_norm": 2.757420778274536, + "learning_rate": 4.989401759797161e-05, + "loss": 6.1427, + "step": 4931 + }, + { + "epoch": 0.02933200114187839, + "grad_norm": 2.713111639022827, + "learning_rate": 4.989397462914103e-05, + "loss": 6.4107, + "step": 4932 + }, + { + "epoch": 0.029337948425159386, + "grad_norm": 2.7954351902008057, + "learning_rate": 4.9893931651620215e-05, + "loss": 5.7657, + "step": 4933 + }, + { + "epoch": 0.029343895708440385, + "grad_norm": 2.3637917041778564, + "learning_rate": 4.9893888665409196e-05, + "loss": 5.8209, + "step": 4934 + }, + { + "epoch": 0.029349842991721383, + "grad_norm": 2.938631296157837, + "learning_rate": 4.9893845670507964e-05, + "loss": 6.0502, + "step": 4935 + }, + { + "epoch": 0.029355790275002378, + "grad_norm": 2.8911824226379395, + "learning_rate": 4.989380266691655e-05, + "loss": 5.9736, + "step": 4936 + }, + { + "epoch": 0.029361737558283377, + "grad_norm": 2.9410245418548584, + "learning_rate": 4.989375965463498e-05, + "loss": 5.2824, + "step": 4937 + }, + { + "epoch": 0.029367684841564372, + "grad_norm": 2.4925217628479004, + "learning_rate": 4.9893716633663244e-05, + "loss": 5.5829, + "step": 4938 + }, + { + "epoch": 0.02937363212484537, + "grad_norm": 2.485349178314209, + "learning_rate": 4.9893673604001366e-05, + "loss": 5.8812, + "step": 4939 + }, + { + "epoch": 0.02937957940812637, + "grad_norm": 2.3950133323669434, + "learning_rate": 4.9893630565649376e-05, + "loss": 5.9314, + "step": 4940 + }, + { + "epoch": 0.029385526691407364, + "grad_norm": 2.28104829788208, + "learning_rate": 4.989358751860726e-05, + "loss": 6.1768, + "step": 4941 + }, + { + "epoch": 0.029391473974688363, + "grad_norm": 2.4479010105133057, + "learning_rate": 4.989354446287507e-05, + "loss": 6.1645, + "step": 4942 + }, + { + "epoch": 0.02939742125796936, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.989350139845279e-05, + "loss": 5.7145, + "step": 4943 + }, + { + "epoch": 0.029403368541250356, + "grad_norm": 2.4120032787323, + "learning_rate": 4.989345832534045e-05, + "loss": 5.695, + "step": 4944 + }, + { + "epoch": 0.029409315824531355, + "grad_norm": 2.6345109939575195, + "learning_rate": 4.989341524353805e-05, + "loss": 5.4805, + "step": 4945 + }, + { + "epoch": 0.02941526310781235, + "grad_norm": 2.8750240802764893, + "learning_rate": 4.989337215304563e-05, + "loss": 5.0352, + "step": 4946 + }, + { + "epoch": 0.02942121039109335, + "grad_norm": 2.7220489978790283, + "learning_rate": 4.989332905386318e-05, + "loss": 5.1646, + "step": 4947 + }, + { + "epoch": 0.029427157674374347, + "grad_norm": 2.464871883392334, + "learning_rate": 4.9893285945990734e-05, + "loss": 4.9989, + "step": 4948 + }, + { + "epoch": 0.029433104957655342, + "grad_norm": 2.261049270629883, + "learning_rate": 4.989324282942829e-05, + "loss": 6.2217, + "step": 4949 + }, + { + "epoch": 0.02943905224093634, + "grad_norm": 2.224818468093872, + "learning_rate": 4.9893199704175876e-05, + "loss": 6.3964, + "step": 4950 + }, + { + "epoch": 0.02944499952421734, + "grad_norm": 2.366520643234253, + "learning_rate": 4.989315657023351e-05, + "loss": 6.3572, + "step": 4951 + }, + { + "epoch": 0.029450946807498334, + "grad_norm": 2.4811010360717773, + "learning_rate": 4.989311342760119e-05, + "loss": 5.7867, + "step": 4952 + }, + { + "epoch": 0.029456894090779333, + "grad_norm": 2.246730089187622, + "learning_rate": 4.989307027627895e-05, + "loss": 6.0865, + "step": 4953 + }, + { + "epoch": 0.029462841374060328, + "grad_norm": 2.297379493713379, + "learning_rate": 4.989302711626679e-05, + "loss": 5.9257, + "step": 4954 + }, + { + "epoch": 0.029468788657341326, + "grad_norm": 2.5890488624572754, + "learning_rate": 4.989298394756473e-05, + "loss": 5.7631, + "step": 4955 + }, + { + "epoch": 0.029474735940622325, + "grad_norm": 3.3777449131011963, + "learning_rate": 4.989294077017279e-05, + "loss": 5.4014, + "step": 4956 + }, + { + "epoch": 0.02948068322390332, + "grad_norm": 2.0395402908325195, + "learning_rate": 4.9892897584090986e-05, + "loss": 6.2429, + "step": 4957 + }, + { + "epoch": 0.02948663050718432, + "grad_norm": 2.0414693355560303, + "learning_rate": 4.989285438931932e-05, + "loss": 6.4685, + "step": 4958 + }, + { + "epoch": 0.029492577790465314, + "grad_norm": 2.2383265495300293, + "learning_rate": 4.989281118585783e-05, + "loss": 6.1651, + "step": 4959 + }, + { + "epoch": 0.029498525073746312, + "grad_norm": 2.559720754623413, + "learning_rate": 4.98927679737065e-05, + "loss": 6.3822, + "step": 4960 + }, + { + "epoch": 0.02950447235702731, + "grad_norm": 2.810699939727783, + "learning_rate": 4.989272475286537e-05, + "loss": 6.2076, + "step": 4961 + }, + { + "epoch": 0.029510419640308306, + "grad_norm": 2.9151525497436523, + "learning_rate": 4.989268152333445e-05, + "loss": 5.9892, + "step": 4962 + }, + { + "epoch": 0.029516366923589304, + "grad_norm": 2.295197010040283, + "learning_rate": 4.9892638285113744e-05, + "loss": 6.1392, + "step": 4963 + }, + { + "epoch": 0.029522314206870303, + "grad_norm": 2.271088123321533, + "learning_rate": 4.989259503820328e-05, + "loss": 6.6991, + "step": 4964 + }, + { + "epoch": 0.029528261490151298, + "grad_norm": 2.338074207305908, + "learning_rate": 4.9892551782603064e-05, + "loss": 5.9615, + "step": 4965 + }, + { + "epoch": 0.029534208773432297, + "grad_norm": 2.3510494232177734, + "learning_rate": 4.989250851831312e-05, + "loss": 5.8894, + "step": 4966 + }, + { + "epoch": 0.029540156056713292, + "grad_norm": 2.1170454025268555, + "learning_rate": 4.989246524533345e-05, + "loss": 5.6921, + "step": 4967 + }, + { + "epoch": 0.02954610333999429, + "grad_norm": 3.289508104324341, + "learning_rate": 4.989242196366409e-05, + "loss": 6.1689, + "step": 4968 + }, + { + "epoch": 0.02955205062327529, + "grad_norm": 2.068229913711548, + "learning_rate": 4.989237867330504e-05, + "loss": 6.3342, + "step": 4969 + }, + { + "epoch": 0.029557997906556284, + "grad_norm": 2.198928117752075, + "learning_rate": 4.9892335374256316e-05, + "loss": 6.5125, + "step": 4970 + }, + { + "epoch": 0.029563945189837283, + "grad_norm": 2.3634228706359863, + "learning_rate": 4.989229206651793e-05, + "loss": 5.8328, + "step": 4971 + }, + { + "epoch": 0.02956989247311828, + "grad_norm": 2.1632115840911865, + "learning_rate": 4.989224875008991e-05, + "loss": 6.0702, + "step": 4972 + }, + { + "epoch": 0.029575839756399276, + "grad_norm": 2.461888313293457, + "learning_rate": 4.989220542497226e-05, + "loss": 6.01, + "step": 4973 + }, + { + "epoch": 0.029581787039680275, + "grad_norm": 2.668333053588867, + "learning_rate": 4.9892162091164997e-05, + "loss": 6.0369, + "step": 4974 + }, + { + "epoch": 0.02958773432296127, + "grad_norm": 3.0210723876953125, + "learning_rate": 4.9892118748668135e-05, + "loss": 6.0652, + "step": 4975 + }, + { + "epoch": 0.02959368160624227, + "grad_norm": 2.937350034713745, + "learning_rate": 4.98920753974817e-05, + "loss": 6.0205, + "step": 4976 + }, + { + "epoch": 0.029599628889523267, + "grad_norm": 2.904499053955078, + "learning_rate": 4.9892032037605685e-05, + "loss": 5.9561, + "step": 4977 + }, + { + "epoch": 0.029605576172804262, + "grad_norm": 2.218867778778076, + "learning_rate": 4.989198866904013e-05, + "loss": 5.4173, + "step": 4978 + }, + { + "epoch": 0.02961152345608526, + "grad_norm": 3.009920835494995, + "learning_rate": 4.9891945291785034e-05, + "loss": 5.5577, + "step": 4979 + }, + { + "epoch": 0.02961747073936626, + "grad_norm": 2.731687545776367, + "learning_rate": 4.9891901905840424e-05, + "loss": 5.6591, + "step": 4980 + }, + { + "epoch": 0.029623418022647254, + "grad_norm": 2.244101047515869, + "learning_rate": 4.98918585112063e-05, + "loss": 6.1434, + "step": 4981 + }, + { + "epoch": 0.029629365305928253, + "grad_norm": 2.3366870880126953, + "learning_rate": 4.989181510788269e-05, + "loss": 6.0132, + "step": 4982 + }, + { + "epoch": 0.029635312589209248, + "grad_norm": 3.2757890224456787, + "learning_rate": 4.98917716958696e-05, + "loss": 5.7486, + "step": 4983 + }, + { + "epoch": 0.029641259872490246, + "grad_norm": 2.361041784286499, + "learning_rate": 4.989172827516705e-05, + "loss": 5.8192, + "step": 4984 + }, + { + "epoch": 0.029647207155771245, + "grad_norm": 3.3433775901794434, + "learning_rate": 4.9891684845775054e-05, + "loss": 5.8688, + "step": 4985 + }, + { + "epoch": 0.02965315443905224, + "grad_norm": 2.6427462100982666, + "learning_rate": 4.9891641407693635e-05, + "loss": 5.9459, + "step": 4986 + }, + { + "epoch": 0.02965910172233324, + "grad_norm": 3.0931055545806885, + "learning_rate": 4.9891597960922795e-05, + "loss": 6.4822, + "step": 4987 + }, + { + "epoch": 0.029665049005614237, + "grad_norm": 2.598477840423584, + "learning_rate": 4.989155450546256e-05, + "loss": 6.0362, + "step": 4988 + }, + { + "epoch": 0.029670996288895232, + "grad_norm": 2.460313081741333, + "learning_rate": 4.989151104131294e-05, + "loss": 5.6209, + "step": 4989 + }, + { + "epoch": 0.02967694357217623, + "grad_norm": 2.4712390899658203, + "learning_rate": 4.989146756847395e-05, + "loss": 6.3849, + "step": 4990 + }, + { + "epoch": 0.029682890855457226, + "grad_norm": 2.365860939025879, + "learning_rate": 4.98914240869456e-05, + "loss": 6.2791, + "step": 4991 + }, + { + "epoch": 0.029688838138738224, + "grad_norm": 2.6213366985321045, + "learning_rate": 4.9891380596727915e-05, + "loss": 6.2888, + "step": 4992 + }, + { + "epoch": 0.029694785422019223, + "grad_norm": 2.742213487625122, + "learning_rate": 4.989133709782091e-05, + "loss": 6.3522, + "step": 4993 + }, + { + "epoch": 0.029700732705300218, + "grad_norm": 2.2428665161132812, + "learning_rate": 4.9891293590224594e-05, + "loss": 6.6735, + "step": 4994 + }, + { + "epoch": 0.029706679988581217, + "grad_norm": 2.4242279529571533, + "learning_rate": 4.989125007393898e-05, + "loss": 6.2283, + "step": 4995 + }, + { + "epoch": 0.02971262727186221, + "grad_norm": 2.422177314758301, + "learning_rate": 4.989120654896409e-05, + "loss": 6.0273, + "step": 4996 + }, + { + "epoch": 0.02971857455514321, + "grad_norm": 2.4325926303863525, + "learning_rate": 4.989116301529994e-05, + "loss": 5.9504, + "step": 4997 + }, + { + "epoch": 0.02972452183842421, + "grad_norm": 2.42901873588562, + "learning_rate": 4.9891119472946544e-05, + "loss": 5.8156, + "step": 4998 + }, + { + "epoch": 0.029730469121705204, + "grad_norm": 2.4361307621002197, + "learning_rate": 4.989107592190391e-05, + "loss": 5.9025, + "step": 4999 + }, + { + "epoch": 0.029736416404986202, + "grad_norm": 2.9486470222473145, + "learning_rate": 4.9891032362172065e-05, + "loss": 6.3204, + "step": 5000 + }, + { + "epoch": 0.0297423636882672, + "grad_norm": 2.456681966781616, + "learning_rate": 4.989098879375101e-05, + "loss": 5.8203, + "step": 5001 + }, + { + "epoch": 0.029748310971548196, + "grad_norm": 2.5065391063690186, + "learning_rate": 4.9890945216640775e-05, + "loss": 6.452, + "step": 5002 + }, + { + "epoch": 0.029754258254829195, + "grad_norm": 2.386488199234009, + "learning_rate": 4.989090163084136e-05, + "loss": 5.9195, + "step": 5003 + }, + { + "epoch": 0.02976020553811019, + "grad_norm": 2.1387040615081787, + "learning_rate": 4.9890858036352796e-05, + "loss": 6.2127, + "step": 5004 + }, + { + "epoch": 0.02976615282139119, + "grad_norm": 2.518099784851074, + "learning_rate": 4.989081443317508e-05, + "loss": 6.1099, + "step": 5005 + }, + { + "epoch": 0.029772100104672187, + "grad_norm": 3.2108826637268066, + "learning_rate": 4.989077082130825e-05, + "loss": 5.9808, + "step": 5006 + }, + { + "epoch": 0.029778047387953182, + "grad_norm": 2.176065444946289, + "learning_rate": 4.9890727200752304e-05, + "loss": 6.0825, + "step": 5007 + }, + { + "epoch": 0.02978399467123418, + "grad_norm": 2.2961249351501465, + "learning_rate": 4.9890683571507265e-05, + "loss": 5.968, + "step": 5008 + }, + { + "epoch": 0.02978994195451518, + "grad_norm": 2.1954386234283447, + "learning_rate": 4.9890639933573144e-05, + "loss": 6.0799, + "step": 5009 + }, + { + "epoch": 0.029795889237796174, + "grad_norm": 2.256039619445801, + "learning_rate": 4.989059628694995e-05, + "loss": 5.9503, + "step": 5010 + }, + { + "epoch": 0.029801836521077173, + "grad_norm": 2.4350922107696533, + "learning_rate": 4.9890552631637715e-05, + "loss": 5.6741, + "step": 5011 + }, + { + "epoch": 0.029807783804358168, + "grad_norm": 2.68904447555542, + "learning_rate": 4.989050896763645e-05, + "loss": 5.5872, + "step": 5012 + }, + { + "epoch": 0.029813731087639166, + "grad_norm": 2.2877871990203857, + "learning_rate": 4.989046529494615e-05, + "loss": 6.1273, + "step": 5013 + }, + { + "epoch": 0.029819678370920165, + "grad_norm": 2.350348711013794, + "learning_rate": 4.989042161356686e-05, + "loss": 6.1113, + "step": 5014 + }, + { + "epoch": 0.02982562565420116, + "grad_norm": 2.295382499694824, + "learning_rate": 4.989037792349858e-05, + "loss": 6.036, + "step": 5015 + }, + { + "epoch": 0.02983157293748216, + "grad_norm": 2.317863941192627, + "learning_rate": 4.989033422474131e-05, + "loss": 5.961, + "step": 5016 + }, + { + "epoch": 0.029837520220763157, + "grad_norm": 2.286289930343628, + "learning_rate": 4.9890290517295095e-05, + "loss": 5.8163, + "step": 5017 + }, + { + "epoch": 0.029843467504044152, + "grad_norm": 2.246863842010498, + "learning_rate": 4.989024680115993e-05, + "loss": 5.9689, + "step": 5018 + }, + { + "epoch": 0.02984941478732515, + "grad_norm": 1.8732661008834839, + "learning_rate": 4.989020307633585e-05, + "loss": 5.9046, + "step": 5019 + }, + { + "epoch": 0.029855362070606146, + "grad_norm": 2.0211753845214844, + "learning_rate": 4.989015934282285e-05, + "loss": 5.95, + "step": 5020 + }, + { + "epoch": 0.029861309353887144, + "grad_norm": 2.014890193939209, + "learning_rate": 4.9890115600620946e-05, + "loss": 5.7312, + "step": 5021 + }, + { + "epoch": 0.029867256637168143, + "grad_norm": 2.2749524116516113, + "learning_rate": 4.989007184973017e-05, + "loss": 6.2573, + "step": 5022 + }, + { + "epoch": 0.029873203920449138, + "grad_norm": 2.080747604370117, + "learning_rate": 4.989002809015052e-05, + "loss": 5.7607, + "step": 5023 + }, + { + "epoch": 0.029879151203730137, + "grad_norm": 2.3403279781341553, + "learning_rate": 4.988998432188202e-05, + "loss": 5.7876, + "step": 5024 + }, + { + "epoch": 0.02988509848701113, + "grad_norm": 2.573802947998047, + "learning_rate": 4.988994054492468e-05, + "loss": 5.9036, + "step": 5025 + }, + { + "epoch": 0.02989104577029213, + "grad_norm": 2.267409324645996, + "learning_rate": 4.988989675927853e-05, + "loss": 5.7433, + "step": 5026 + }, + { + "epoch": 0.02989699305357313, + "grad_norm": 2.8241517543792725, + "learning_rate": 4.9889852964943566e-05, + "loss": 6.2338, + "step": 5027 + }, + { + "epoch": 0.029902940336854124, + "grad_norm": 2.338927745819092, + "learning_rate": 4.988980916191982e-05, + "loss": 6.0226, + "step": 5028 + }, + { + "epoch": 0.029908887620135122, + "grad_norm": 2.0798492431640625, + "learning_rate": 4.9889765350207285e-05, + "loss": 5.6919, + "step": 5029 + }, + { + "epoch": 0.02991483490341612, + "grad_norm": 2.3199923038482666, + "learning_rate": 4.9889721529806e-05, + "loss": 5.7533, + "step": 5030 + }, + { + "epoch": 0.029920782186697116, + "grad_norm": 2.1074399948120117, + "learning_rate": 4.988967770071596e-05, + "loss": 5.7486, + "step": 5031 + }, + { + "epoch": 0.029926729469978115, + "grad_norm": 2.2539381980895996, + "learning_rate": 4.9889633862937205e-05, + "loss": 5.6816, + "step": 5032 + }, + { + "epoch": 0.02993267675325911, + "grad_norm": 2.1393015384674072, + "learning_rate": 4.9889590016469726e-05, + "loss": 5.6635, + "step": 5033 + }, + { + "epoch": 0.029938624036540108, + "grad_norm": 2.6661975383758545, + "learning_rate": 4.988954616131355e-05, + "loss": 6.0218, + "step": 5034 + }, + { + "epoch": 0.029944571319821107, + "grad_norm": 2.6529600620269775, + "learning_rate": 4.988950229746869e-05, + "loss": 5.8847, + "step": 5035 + }, + { + "epoch": 0.029950518603102102, + "grad_norm": 2.510859966278076, + "learning_rate": 4.988945842493517e-05, + "loss": 5.7154, + "step": 5036 + }, + { + "epoch": 0.0299564658863831, + "grad_norm": 2.875394105911255, + "learning_rate": 4.9889414543712985e-05, + "loss": 5.6304, + "step": 5037 + }, + { + "epoch": 0.0299624131696641, + "grad_norm": 2.718808650970459, + "learning_rate": 4.988937065380217e-05, + "loss": 5.6562, + "step": 5038 + }, + { + "epoch": 0.029968360452945094, + "grad_norm": 2.702265501022339, + "learning_rate": 4.988932675520273e-05, + "loss": 5.6484, + "step": 5039 + }, + { + "epoch": 0.029974307736226093, + "grad_norm": 2.765209436416626, + "learning_rate": 4.988928284791469e-05, + "loss": 5.793, + "step": 5040 + }, + { + "epoch": 0.029980255019507088, + "grad_norm": 3.386352062225342, + "learning_rate": 4.9889238931938047e-05, + "loss": 5.5392, + "step": 5041 + }, + { + "epoch": 0.029986202302788086, + "grad_norm": 2.1632583141326904, + "learning_rate": 4.988919500727284e-05, + "loss": 5.8032, + "step": 5042 + }, + { + "epoch": 0.029992149586069085, + "grad_norm": 2.4121060371398926, + "learning_rate": 4.9889151073919064e-05, + "loss": 5.9793, + "step": 5043 + }, + { + "epoch": 0.02999809686935008, + "grad_norm": 2.2160584926605225, + "learning_rate": 4.988910713187674e-05, + "loss": 5.8802, + "step": 5044 + }, + { + "epoch": 0.03000404415263108, + "grad_norm": 3.120509386062622, + "learning_rate": 4.988906318114589e-05, + "loss": 5.5691, + "step": 5045 + }, + { + "epoch": 0.030009991435912077, + "grad_norm": 3.0660078525543213, + "learning_rate": 4.988901922172652e-05, + "loss": 5.3687, + "step": 5046 + }, + { + "epoch": 0.030015938719193072, + "grad_norm": 1.939757227897644, + "learning_rate": 4.988897525361867e-05, + "loss": 5.526, + "step": 5047 + }, + { + "epoch": 0.03002188600247407, + "grad_norm": 2.2970168590545654, + "learning_rate": 4.9888931276822315e-05, + "loss": 5.6334, + "step": 5048 + }, + { + "epoch": 0.030027833285755066, + "grad_norm": 2.162632942199707, + "learning_rate": 4.988888729133749e-05, + "loss": 5.8887, + "step": 5049 + }, + { + "epoch": 0.030033780569036064, + "grad_norm": 2.027017831802368, + "learning_rate": 4.9888843297164223e-05, + "loss": 5.9237, + "step": 5050 + }, + { + "epoch": 0.030039727852317063, + "grad_norm": 1.9226456880569458, + "learning_rate": 4.988879929430251e-05, + "loss": 5.6833, + "step": 5051 + }, + { + "epoch": 0.030045675135598058, + "grad_norm": 1.6490615606307983, + "learning_rate": 4.9888755282752384e-05, + "loss": 5.5738, + "step": 5052 + }, + { + "epoch": 0.030051622418879056, + "grad_norm": 2.456385850906372, + "learning_rate": 4.9888711262513846e-05, + "loss": 5.3771, + "step": 5053 + }, + { + "epoch": 0.03005756970216005, + "grad_norm": 2.480044364929199, + "learning_rate": 4.988866723358692e-05, + "loss": 5.2456, + "step": 5054 + }, + { + "epoch": 0.03006351698544105, + "grad_norm": 2.4033162593841553, + "learning_rate": 4.988862319597161e-05, + "loss": 5.1629, + "step": 5055 + }, + { + "epoch": 0.03006946426872205, + "grad_norm": 2.7228541374206543, + "learning_rate": 4.9888579149667935e-05, + "loss": 5.0195, + "step": 5056 + }, + { + "epoch": 0.030075411552003044, + "grad_norm": 2.4641635417938232, + "learning_rate": 4.9888535094675926e-05, + "loss": 5.3259, + "step": 5057 + }, + { + "epoch": 0.030081358835284042, + "grad_norm": 2.443666458129883, + "learning_rate": 4.9888491030995575e-05, + "loss": 5.4212, + "step": 5058 + }, + { + "epoch": 0.03008730611856504, + "grad_norm": 2.3267531394958496, + "learning_rate": 4.988844695862692e-05, + "loss": 5.6517, + "step": 5059 + }, + { + "epoch": 0.030093253401846036, + "grad_norm": 1.9090640544891357, + "learning_rate": 4.988840287756996e-05, + "loss": 5.7946, + "step": 5060 + }, + { + "epoch": 0.030099200685127035, + "grad_norm": 1.6169202327728271, + "learning_rate": 4.988835878782472e-05, + "loss": 5.7332, + "step": 5061 + }, + { + "epoch": 0.03010514796840803, + "grad_norm": 1.9369432926177979, + "learning_rate": 4.9888314689391205e-05, + "loss": 5.5954, + "step": 5062 + }, + { + "epoch": 0.030111095251689028, + "grad_norm": 2.0444133281707764, + "learning_rate": 4.9888270582269434e-05, + "loss": 5.5332, + "step": 5063 + }, + { + "epoch": 0.030117042534970027, + "grad_norm": 1.949061632156372, + "learning_rate": 4.988822646645943e-05, + "loss": 5.6064, + "step": 5064 + }, + { + "epoch": 0.030122989818251022, + "grad_norm": 1.5208648443222046, + "learning_rate": 4.988818234196121e-05, + "loss": 5.6615, + "step": 5065 + }, + { + "epoch": 0.03012893710153202, + "grad_norm": 1.8466709852218628, + "learning_rate": 4.988813820877477e-05, + "loss": 5.79, + "step": 5066 + }, + { + "epoch": 0.03013488438481302, + "grad_norm": 1.7094037532806396, + "learning_rate": 4.988809406690015e-05, + "loss": 5.8194, + "step": 5067 + }, + { + "epoch": 0.030140831668094014, + "grad_norm": 1.5698916912078857, + "learning_rate": 4.988804991633734e-05, + "loss": 5.5981, + "step": 5068 + }, + { + "epoch": 0.030146778951375013, + "grad_norm": 2.032156467437744, + "learning_rate": 4.988800575708638e-05, + "loss": 5.6729, + "step": 5069 + }, + { + "epoch": 0.030152726234656008, + "grad_norm": 1.9716484546661377, + "learning_rate": 4.988796158914727e-05, + "loss": 5.5227, + "step": 5070 + }, + { + "epoch": 0.030158673517937006, + "grad_norm": 1.8809682130813599, + "learning_rate": 4.988791741252002e-05, + "loss": 5.6231, + "step": 5071 + }, + { + "epoch": 0.030164620801218005, + "grad_norm": 1.8293371200561523, + "learning_rate": 4.9887873227204675e-05, + "loss": 5.5067, + "step": 5072 + }, + { + "epoch": 0.030170568084499, + "grad_norm": 2.225281000137329, + "learning_rate": 4.988782903320122e-05, + "loss": 5.3056, + "step": 5073 + }, + { + "epoch": 0.03017651536778, + "grad_norm": 2.0776474475860596, + "learning_rate": 4.988778483050968e-05, + "loss": 5.206, + "step": 5074 + }, + { + "epoch": 0.030182462651060997, + "grad_norm": 2.068323850631714, + "learning_rate": 4.9887740619130076e-05, + "loss": 5.5975, + "step": 5075 + }, + { + "epoch": 0.030188409934341992, + "grad_norm": 2.077782392501831, + "learning_rate": 4.988769639906241e-05, + "loss": 5.6967, + "step": 5076 + }, + { + "epoch": 0.03019435721762299, + "grad_norm": 1.9837195873260498, + "learning_rate": 4.988765217030672e-05, + "loss": 5.7834, + "step": 5077 + }, + { + "epoch": 0.030200304500903986, + "grad_norm": 1.9612236022949219, + "learning_rate": 4.9887607932863e-05, + "loss": 5.5472, + "step": 5078 + }, + { + "epoch": 0.030206251784184984, + "grad_norm": 2.022251605987549, + "learning_rate": 4.988756368673127e-05, + "loss": 5.704, + "step": 5079 + }, + { + "epoch": 0.030212199067465983, + "grad_norm": 2.02227783203125, + "learning_rate": 4.988751943191156e-05, + "loss": 5.4125, + "step": 5080 + }, + { + "epoch": 0.030218146350746978, + "grad_norm": 2.0527732372283936, + "learning_rate": 4.9887475168403856e-05, + "loss": 5.464, + "step": 5081 + }, + { + "epoch": 0.030224093634027976, + "grad_norm": 2.1465423107147217, + "learning_rate": 4.9887430896208205e-05, + "loss": 5.3415, + "step": 5082 + }, + { + "epoch": 0.03023004091730897, + "grad_norm": 1.9170550107955933, + "learning_rate": 4.9887386615324606e-05, + "loss": 5.5762, + "step": 5083 + }, + { + "epoch": 0.03023598820058997, + "grad_norm": 3.367650032043457, + "learning_rate": 4.988734232575307e-05, + "loss": 6.26, + "step": 5084 + }, + { + "epoch": 0.03024193548387097, + "grad_norm": 2.0784621238708496, + "learning_rate": 4.988729802749363e-05, + "loss": 5.5316, + "step": 5085 + }, + { + "epoch": 0.030247882767151964, + "grad_norm": 1.9531089067459106, + "learning_rate": 4.988725372054629e-05, + "loss": 5.5901, + "step": 5086 + }, + { + "epoch": 0.030253830050432962, + "grad_norm": 1.9677239656448364, + "learning_rate": 4.988720940491106e-05, + "loss": 5.4963, + "step": 5087 + }, + { + "epoch": 0.03025977733371396, + "grad_norm": 1.9835426807403564, + "learning_rate": 4.988716508058797e-05, + "loss": 5.6355, + "step": 5088 + }, + { + "epoch": 0.030265724616994956, + "grad_norm": 1.908250331878662, + "learning_rate": 4.988712074757703e-05, + "loss": 5.165, + "step": 5089 + }, + { + "epoch": 0.030271671900275954, + "grad_norm": 1.9852073192596436, + "learning_rate": 4.9887076405878246e-05, + "loss": 5.6623, + "step": 5090 + }, + { + "epoch": 0.03027761918355695, + "grad_norm": 1.9073505401611328, + "learning_rate": 4.988703205549164e-05, + "loss": 5.6685, + "step": 5091 + }, + { + "epoch": 0.030283566466837948, + "grad_norm": 1.744931697845459, + "learning_rate": 4.988698769641724e-05, + "loss": 5.4004, + "step": 5092 + }, + { + "epoch": 0.030289513750118947, + "grad_norm": 2.0623345375061035, + "learning_rate": 4.9886943328655034e-05, + "loss": 5.3846, + "step": 5093 + }, + { + "epoch": 0.030295461033399942, + "grad_norm": 1.647375226020813, + "learning_rate": 4.9886898952205064e-05, + "loss": 5.5823, + "step": 5094 + }, + { + "epoch": 0.03030140831668094, + "grad_norm": 2.2364108562469482, + "learning_rate": 4.9886854567067334e-05, + "loss": 5.5959, + "step": 5095 + }, + { + "epoch": 0.03030735559996194, + "grad_norm": 2.059187650680542, + "learning_rate": 4.988681017324185e-05, + "loss": 5.6043, + "step": 5096 + }, + { + "epoch": 0.030313302883242934, + "grad_norm": 1.8996437788009644, + "learning_rate": 4.988676577072865e-05, + "loss": 5.4366, + "step": 5097 + }, + { + "epoch": 0.030319250166523933, + "grad_norm": 2.0983266830444336, + "learning_rate": 4.988672135952773e-05, + "loss": 5.5568, + "step": 5098 + }, + { + "epoch": 0.030325197449804928, + "grad_norm": 2.065119743347168, + "learning_rate": 4.988667693963911e-05, + "loss": 5.4239, + "step": 5099 + }, + { + "epoch": 0.030331144733085926, + "grad_norm": 1.9394044876098633, + "learning_rate": 4.988663251106282e-05, + "loss": 5.573, + "step": 5100 + }, + { + "epoch": 0.030337092016366925, + "grad_norm": 2.225097417831421, + "learning_rate": 4.9886588073798855e-05, + "loss": 5.5877, + "step": 5101 + }, + { + "epoch": 0.03034303929964792, + "grad_norm": 2.185018539428711, + "learning_rate": 4.9886543627847236e-05, + "loss": 5.6884, + "step": 5102 + }, + { + "epoch": 0.03034898658292892, + "grad_norm": 1.9751871824264526, + "learning_rate": 4.988649917320799e-05, + "loss": 5.4836, + "step": 5103 + }, + { + "epoch": 0.030354933866209917, + "grad_norm": 1.8753101825714111, + "learning_rate": 4.988645470988113e-05, + "loss": 5.4049, + "step": 5104 + }, + { + "epoch": 0.030360881149490912, + "grad_norm": 2.12246036529541, + "learning_rate": 4.988641023786665e-05, + "loss": 5.5365, + "step": 5105 + }, + { + "epoch": 0.03036682843277191, + "grad_norm": 2.1078991889953613, + "learning_rate": 4.988636575716459e-05, + "loss": 5.5269, + "step": 5106 + }, + { + "epoch": 0.030372775716052906, + "grad_norm": 1.9127923250198364, + "learning_rate": 4.9886321267774946e-05, + "loss": 5.48, + "step": 5107 + }, + { + "epoch": 0.030378722999333904, + "grad_norm": 1.8971906900405884, + "learning_rate": 4.988627676969776e-05, + "loss": 5.5202, + "step": 5108 + }, + { + "epoch": 0.030384670282614903, + "grad_norm": 2.162097454071045, + "learning_rate": 4.9886232262933024e-05, + "loss": 5.5229, + "step": 5109 + }, + { + "epoch": 0.030390617565895898, + "grad_norm": 2.21211838722229, + "learning_rate": 4.988618774748076e-05, + "loss": 5.3648, + "step": 5110 + }, + { + "epoch": 0.030396564849176896, + "grad_norm": 1.8907619714736938, + "learning_rate": 4.988614322334099e-05, + "loss": 5.4338, + "step": 5111 + }, + { + "epoch": 0.030402512132457895, + "grad_norm": 2.0131993293762207, + "learning_rate": 4.9886098690513725e-05, + "loss": 5.4005, + "step": 5112 + }, + { + "epoch": 0.03040845941573889, + "grad_norm": 1.9474748373031616, + "learning_rate": 4.9886054148998975e-05, + "loss": 5.5544, + "step": 5113 + }, + { + "epoch": 0.03041440669901989, + "grad_norm": 1.9809894561767578, + "learning_rate": 4.988600959879676e-05, + "loss": 5.6204, + "step": 5114 + }, + { + "epoch": 0.030420353982300884, + "grad_norm": 2.1792514324188232, + "learning_rate": 4.9885965039907104e-05, + "loss": 5.5368, + "step": 5115 + }, + { + "epoch": 0.030426301265581882, + "grad_norm": 2.050903081893921, + "learning_rate": 4.9885920472330004e-05, + "loss": 5.4717, + "step": 5116 + }, + { + "epoch": 0.03043224854886288, + "grad_norm": 1.9938042163848877, + "learning_rate": 4.988587589606549e-05, + "loss": 5.5373, + "step": 5117 + }, + { + "epoch": 0.030438195832143876, + "grad_norm": 1.7375110387802124, + "learning_rate": 4.988583131111358e-05, + "loss": 5.5621, + "step": 5118 + }, + { + "epoch": 0.030444143115424874, + "grad_norm": 2.077605962753296, + "learning_rate": 4.988578671747428e-05, + "loss": 5.5451, + "step": 5119 + }, + { + "epoch": 0.03045009039870587, + "grad_norm": 2.071706771850586, + "learning_rate": 4.988574211514761e-05, + "loss": 5.327, + "step": 5120 + }, + { + "epoch": 0.030456037681986868, + "grad_norm": 1.8317911624908447, + "learning_rate": 4.9885697504133574e-05, + "loss": 5.4123, + "step": 5121 + }, + { + "epoch": 0.030461984965267867, + "grad_norm": 2.1231188774108887, + "learning_rate": 4.988565288443221e-05, + "loss": 5.3789, + "step": 5122 + }, + { + "epoch": 0.03046793224854886, + "grad_norm": 2.1298999786376953, + "learning_rate": 4.988560825604352e-05, + "loss": 5.4382, + "step": 5123 + }, + { + "epoch": 0.03047387953182986, + "grad_norm": 1.791053056716919, + "learning_rate": 4.9885563618967525e-05, + "loss": 5.3918, + "step": 5124 + }, + { + "epoch": 0.03047982681511086, + "grad_norm": 1.9610999822616577, + "learning_rate": 4.988551897320423e-05, + "loss": 5.3232, + "step": 5125 + }, + { + "epoch": 0.030485774098391854, + "grad_norm": 1.9926520586013794, + "learning_rate": 4.9885474318753654e-05, + "loss": 5.4316, + "step": 5126 + }, + { + "epoch": 0.030491721381672852, + "grad_norm": 1.8942431211471558, + "learning_rate": 4.988542965561582e-05, + "loss": 5.4055, + "step": 5127 + }, + { + "epoch": 0.030497668664953848, + "grad_norm": 1.7872856855392456, + "learning_rate": 4.988538498379074e-05, + "loss": 5.5117, + "step": 5128 + }, + { + "epoch": 0.030503615948234846, + "grad_norm": 2.040205478668213, + "learning_rate": 4.988534030327843e-05, + "loss": 5.4068, + "step": 5129 + }, + { + "epoch": 0.030509563231515845, + "grad_norm": 2.0108931064605713, + "learning_rate": 4.988529561407891e-05, + "loss": 5.3636, + "step": 5130 + }, + { + "epoch": 0.03051551051479684, + "grad_norm": 2.0339555740356445, + "learning_rate": 4.988525091619218e-05, + "loss": 5.2811, + "step": 5131 + }, + { + "epoch": 0.03052145779807784, + "grad_norm": 1.7631195783615112, + "learning_rate": 4.988520620961828e-05, + "loss": 5.3407, + "step": 5132 + }, + { + "epoch": 0.030527405081358837, + "grad_norm": 1.6906533241271973, + "learning_rate": 4.988516149435719e-05, + "loss": 5.3121, + "step": 5133 + }, + { + "epoch": 0.030533352364639832, + "grad_norm": 2.0753448009490967, + "learning_rate": 4.988511677040897e-05, + "loss": 5.4532, + "step": 5134 + }, + { + "epoch": 0.03053929964792083, + "grad_norm": 1.9836634397506714, + "learning_rate": 4.9885072037773595e-05, + "loss": 5.4345, + "step": 5135 + }, + { + "epoch": 0.030545246931201826, + "grad_norm": 1.8526780605316162, + "learning_rate": 4.988502729645111e-05, + "loss": 5.446, + "step": 5136 + }, + { + "epoch": 0.030551194214482824, + "grad_norm": 2.126626968383789, + "learning_rate": 4.988498254644152e-05, + "loss": 5.703, + "step": 5137 + }, + { + "epoch": 0.030557141497763823, + "grad_norm": 1.9711220264434814, + "learning_rate": 4.988493778774483e-05, + "loss": 5.5872, + "step": 5138 + }, + { + "epoch": 0.030563088781044818, + "grad_norm": 2.070727586746216, + "learning_rate": 4.988489302036107e-05, + "loss": 5.4407, + "step": 5139 + }, + { + "epoch": 0.030569036064325816, + "grad_norm": 2.1414859294891357, + "learning_rate": 4.988484824429025e-05, + "loss": 5.5291, + "step": 5140 + }, + { + "epoch": 0.030574983347606815, + "grad_norm": 2.01366925239563, + "learning_rate": 4.9884803459532384e-05, + "loss": 5.3561, + "step": 5141 + }, + { + "epoch": 0.03058093063088781, + "grad_norm": 1.851836085319519, + "learning_rate": 4.988475866608749e-05, + "loss": 5.679, + "step": 5142 + }, + { + "epoch": 0.03058687791416881, + "grad_norm": 1.6984909772872925, + "learning_rate": 4.988471386395559e-05, + "loss": 5.6075, + "step": 5143 + }, + { + "epoch": 0.030592825197449804, + "grad_norm": 1.9371756315231323, + "learning_rate": 4.9884669053136696e-05, + "loss": 5.7062, + "step": 5144 + }, + { + "epoch": 0.030598772480730802, + "grad_norm": 1.9286617040634155, + "learning_rate": 4.9884624233630815e-05, + "loss": 5.573, + "step": 5145 + }, + { + "epoch": 0.0306047197640118, + "grad_norm": 2.7633650302886963, + "learning_rate": 4.988457940543797e-05, + "loss": 6.2082, + "step": 5146 + }, + { + "epoch": 0.030610667047292796, + "grad_norm": 2.6948676109313965, + "learning_rate": 4.9884534568558173e-05, + "loss": 5.7475, + "step": 5147 + }, + { + "epoch": 0.030616614330573794, + "grad_norm": 2.1618316173553467, + "learning_rate": 4.988448972299145e-05, + "loss": 5.4049, + "step": 5148 + }, + { + "epoch": 0.03062256161385479, + "grad_norm": 2.417043685913086, + "learning_rate": 4.98844448687378e-05, + "loss": 5.3663, + "step": 5149 + }, + { + "epoch": 0.030628508897135788, + "grad_norm": 1.9748867750167847, + "learning_rate": 4.988440000579725e-05, + "loss": 5.1876, + "step": 5150 + }, + { + "epoch": 0.030634456180416787, + "grad_norm": 2.0534770488739014, + "learning_rate": 4.988435513416981e-05, + "loss": 5.4519, + "step": 5151 + }, + { + "epoch": 0.03064040346369778, + "grad_norm": 1.9772714376449585, + "learning_rate": 4.98843102538555e-05, + "loss": 5.5241, + "step": 5152 + }, + { + "epoch": 0.03064635074697878, + "grad_norm": 2.4160993099212646, + "learning_rate": 4.988426536485434e-05, + "loss": 5.6535, + "step": 5153 + }, + { + "epoch": 0.03065229803025978, + "grad_norm": 1.9931175708770752, + "learning_rate": 4.9884220467166345e-05, + "loss": 5.6693, + "step": 5154 + }, + { + "epoch": 0.030658245313540774, + "grad_norm": 1.9071956872940063, + "learning_rate": 4.9884175560791516e-05, + "loss": 5.5533, + "step": 5155 + }, + { + "epoch": 0.030664192596821772, + "grad_norm": 1.8562983274459839, + "learning_rate": 4.9884130645729876e-05, + "loss": 5.5621, + "step": 5156 + }, + { + "epoch": 0.030670139880102767, + "grad_norm": 2.087606430053711, + "learning_rate": 4.9884085721981446e-05, + "loss": 5.5256, + "step": 5157 + }, + { + "epoch": 0.030676087163383766, + "grad_norm": 2.3242955207824707, + "learning_rate": 4.988404078954624e-05, + "loss": 5.3906, + "step": 5158 + }, + { + "epoch": 0.030682034446664765, + "grad_norm": 2.221330404281616, + "learning_rate": 4.988399584842427e-05, + "loss": 5.5719, + "step": 5159 + }, + { + "epoch": 0.03068798172994576, + "grad_norm": 1.7819960117340088, + "learning_rate": 4.988395089861556e-05, + "loss": 5.5823, + "step": 5160 + }, + { + "epoch": 0.030693929013226758, + "grad_norm": 1.781802773475647, + "learning_rate": 4.988390594012011e-05, + "loss": 5.6087, + "step": 5161 + }, + { + "epoch": 0.030699876296507757, + "grad_norm": 2.0003581047058105, + "learning_rate": 4.988386097293796e-05, + "loss": 5.5695, + "step": 5162 + }, + { + "epoch": 0.030705823579788752, + "grad_norm": 1.9411736726760864, + "learning_rate": 4.98838159970691e-05, + "loss": 5.441, + "step": 5163 + }, + { + "epoch": 0.03071177086306975, + "grad_norm": 2.159541368484497, + "learning_rate": 4.9883771012513556e-05, + "loss": 5.6191, + "step": 5164 + }, + { + "epoch": 0.030717718146350746, + "grad_norm": 2.1045689582824707, + "learning_rate": 4.988372601927135e-05, + "loss": 5.3261, + "step": 5165 + }, + { + "epoch": 0.030723665429631744, + "grad_norm": 2.004770040512085, + "learning_rate": 4.988368101734249e-05, + "loss": 5.3392, + "step": 5166 + }, + { + "epoch": 0.030729612712912743, + "grad_norm": 2.1851232051849365, + "learning_rate": 4.9883636006726996e-05, + "loss": 5.3048, + "step": 5167 + }, + { + "epoch": 0.030735559996193738, + "grad_norm": 2.1333882808685303, + "learning_rate": 4.988359098742488e-05, + "loss": 5.336, + "step": 5168 + }, + { + "epoch": 0.030741507279474736, + "grad_norm": 2.1911604404449463, + "learning_rate": 4.9883545959436165e-05, + "loss": 5.757, + "step": 5169 + }, + { + "epoch": 0.030747454562755735, + "grad_norm": 2.0385994911193848, + "learning_rate": 4.988350092276085e-05, + "loss": 5.7889, + "step": 5170 + }, + { + "epoch": 0.03075340184603673, + "grad_norm": 2.2300381660461426, + "learning_rate": 4.988345587739897e-05, + "loss": 5.3812, + "step": 5171 + }, + { + "epoch": 0.03075934912931773, + "grad_norm": 2.4643938541412354, + "learning_rate": 4.988341082335053e-05, + "loss": 5.2503, + "step": 5172 + }, + { + "epoch": 0.030765296412598724, + "grad_norm": 2.0791194438934326, + "learning_rate": 4.988336576061555e-05, + "loss": 5.2958, + "step": 5173 + }, + { + "epoch": 0.030771243695879722, + "grad_norm": 2.1123111248016357, + "learning_rate": 4.988332068919405e-05, + "loss": 5.3656, + "step": 5174 + }, + { + "epoch": 0.03077719097916072, + "grad_norm": 2.199747323989868, + "learning_rate": 4.9883275609086026e-05, + "loss": 5.7015, + "step": 5175 + }, + { + "epoch": 0.030783138262441716, + "grad_norm": 2.0083510875701904, + "learning_rate": 4.988323052029151e-05, + "loss": 5.7068, + "step": 5176 + }, + { + "epoch": 0.030789085545722714, + "grad_norm": 2.1027777194976807, + "learning_rate": 4.988318542281053e-05, + "loss": 5.6986, + "step": 5177 + }, + { + "epoch": 0.03079503282900371, + "grad_norm": 1.8593190908432007, + "learning_rate": 4.9883140316643074e-05, + "loss": 5.7194, + "step": 5178 + }, + { + "epoch": 0.030800980112284708, + "grad_norm": 1.9712544679641724, + "learning_rate": 4.988309520178918e-05, + "loss": 5.6472, + "step": 5179 + }, + { + "epoch": 0.030806927395565707, + "grad_norm": 2.1114501953125, + "learning_rate": 4.9883050078248836e-05, + "loss": 5.6767, + "step": 5180 + }, + { + "epoch": 0.0308128746788467, + "grad_norm": 3.0505895614624023, + "learning_rate": 4.988300494602209e-05, + "loss": 5.3705, + "step": 5181 + }, + { + "epoch": 0.0308188219621277, + "grad_norm": 2.648364782333374, + "learning_rate": 4.988295980510895e-05, + "loss": 5.3072, + "step": 5182 + }, + { + "epoch": 0.0308247692454087, + "grad_norm": 2.2162837982177734, + "learning_rate": 4.9882914655509414e-05, + "loss": 5.3359, + "step": 5183 + }, + { + "epoch": 0.030830716528689694, + "grad_norm": 2.16666316986084, + "learning_rate": 4.988286949722352e-05, + "loss": 5.3446, + "step": 5184 + }, + { + "epoch": 0.030836663811970692, + "grad_norm": 2.951157569885254, + "learning_rate": 4.988282433025126e-05, + "loss": 5.7776, + "step": 5185 + }, + { + "epoch": 0.030842611095251687, + "grad_norm": 2.9967124462127686, + "learning_rate": 4.988277915459267e-05, + "loss": 5.6004, + "step": 5186 + }, + { + "epoch": 0.030848558378532686, + "grad_norm": 2.3998372554779053, + "learning_rate": 4.988273397024777e-05, + "loss": 5.3562, + "step": 5187 + }, + { + "epoch": 0.030854505661813685, + "grad_norm": 2.290592670440674, + "learning_rate": 4.9882688777216544e-05, + "loss": 5.3211, + "step": 5188 + }, + { + "epoch": 0.03086045294509468, + "grad_norm": 2.0349433422088623, + "learning_rate": 4.988264357549904e-05, + "loss": 5.2917, + "step": 5189 + }, + { + "epoch": 0.030866400228375678, + "grad_norm": 1.922006607055664, + "learning_rate": 4.988259836509526e-05, + "loss": 5.2297, + "step": 5190 + }, + { + "epoch": 0.030872347511656677, + "grad_norm": 1.9518259763717651, + "learning_rate": 4.9882553146005225e-05, + "loss": 5.2232, + "step": 5191 + }, + { + "epoch": 0.030878294794937672, + "grad_norm": 2.1054210662841797, + "learning_rate": 4.988250791822894e-05, + "loss": 5.3705, + "step": 5192 + }, + { + "epoch": 0.03088424207821867, + "grad_norm": 2.0954079627990723, + "learning_rate": 4.988246268176644e-05, + "loss": 5.2522, + "step": 5193 + }, + { + "epoch": 0.030890189361499665, + "grad_norm": 1.8628660440444946, + "learning_rate": 4.9882417436617724e-05, + "loss": 5.3856, + "step": 5194 + }, + { + "epoch": 0.030896136644780664, + "grad_norm": 2.2788021564483643, + "learning_rate": 4.988237218278281e-05, + "loss": 5.4399, + "step": 5195 + }, + { + "epoch": 0.030902083928061663, + "grad_norm": 1.981086015701294, + "learning_rate": 4.9882326920261717e-05, + "loss": 5.2853, + "step": 5196 + }, + { + "epoch": 0.030908031211342658, + "grad_norm": 1.9278241395950317, + "learning_rate": 4.988228164905446e-05, + "loss": 5.3997, + "step": 5197 + }, + { + "epoch": 0.030913978494623656, + "grad_norm": 1.842748999595642, + "learning_rate": 4.988223636916106e-05, + "loss": 5.3215, + "step": 5198 + }, + { + "epoch": 0.030919925777904655, + "grad_norm": 1.9974339008331299, + "learning_rate": 4.988219108058153e-05, + "loss": 5.4851, + "step": 5199 + }, + { + "epoch": 0.03092587306118565, + "grad_norm": 2.015939474105835, + "learning_rate": 4.988214578331588e-05, + "loss": 5.322, + "step": 5200 + }, + { + "epoch": 0.03093182034446665, + "grad_norm": 2.035209894180298, + "learning_rate": 4.9882100477364135e-05, + "loss": 5.3896, + "step": 5201 + }, + { + "epoch": 0.030937767627747643, + "grad_norm": 1.9803009033203125, + "learning_rate": 4.9882055162726296e-05, + "loss": 5.2624, + "step": 5202 + }, + { + "epoch": 0.030943714911028642, + "grad_norm": 1.9504352807998657, + "learning_rate": 4.98820098394024e-05, + "loss": 5.2333, + "step": 5203 + }, + { + "epoch": 0.03094966219430964, + "grad_norm": 1.850542664527893, + "learning_rate": 4.9881964507392443e-05, + "loss": 5.5632, + "step": 5204 + }, + { + "epoch": 0.030955609477590636, + "grad_norm": 1.8594067096710205, + "learning_rate": 4.9881919166696456e-05, + "loss": 5.3775, + "step": 5205 + }, + { + "epoch": 0.030961556760871634, + "grad_norm": 2.019274950027466, + "learning_rate": 4.988187381731444e-05, + "loss": 5.4565, + "step": 5206 + }, + { + "epoch": 0.030967504044152633, + "grad_norm": 1.7151249647140503, + "learning_rate": 4.988182845924643e-05, + "loss": 5.5984, + "step": 5207 + }, + { + "epoch": 0.030973451327433628, + "grad_norm": 2.5127339363098145, + "learning_rate": 4.988178309249242e-05, + "loss": 6.2724, + "step": 5208 + }, + { + "epoch": 0.030979398610714626, + "grad_norm": 1.869344711303711, + "learning_rate": 4.9881737717052436e-05, + "loss": 5.5408, + "step": 5209 + }, + { + "epoch": 0.03098534589399562, + "grad_norm": 2.035419225692749, + "learning_rate": 4.98816923329265e-05, + "loss": 5.4154, + "step": 5210 + }, + { + "epoch": 0.03099129317727662, + "grad_norm": 1.7084250450134277, + "learning_rate": 4.9881646940114624e-05, + "loss": 5.6327, + "step": 5211 + }, + { + "epoch": 0.03099724046055762, + "grad_norm": 2.1035211086273193, + "learning_rate": 4.9881601538616816e-05, + "loss": 5.5041, + "step": 5212 + }, + { + "epoch": 0.031003187743838614, + "grad_norm": 1.920366883277893, + "learning_rate": 4.9881556128433105e-05, + "loss": 5.5919, + "step": 5213 + }, + { + "epoch": 0.031009135027119612, + "grad_norm": 2.000555992126465, + "learning_rate": 4.988151070956349e-05, + "loss": 5.5078, + "step": 5214 + }, + { + "epoch": 0.031015082310400607, + "grad_norm": 1.9930146932601929, + "learning_rate": 4.9881465282008e-05, + "loss": 5.5002, + "step": 5215 + }, + { + "epoch": 0.031021029593681606, + "grad_norm": 2.163329839706421, + "learning_rate": 4.988141984576665e-05, + "loss": 5.3504, + "step": 5216 + }, + { + "epoch": 0.031026976876962604, + "grad_norm": 1.766228437423706, + "learning_rate": 4.988137440083946e-05, + "loss": 5.5304, + "step": 5217 + }, + { + "epoch": 0.0310329241602436, + "grad_norm": 2.1399648189544678, + "learning_rate": 4.988132894722644e-05, + "loss": 5.4757, + "step": 5218 + }, + { + "epoch": 0.031038871443524598, + "grad_norm": 2.2287001609802246, + "learning_rate": 4.988128348492759e-05, + "loss": 5.4902, + "step": 5219 + }, + { + "epoch": 0.031044818726805597, + "grad_norm": 2.095080852508545, + "learning_rate": 4.988123801394295e-05, + "loss": 5.3462, + "step": 5220 + }, + { + "epoch": 0.031050766010086592, + "grad_norm": 2.0873003005981445, + "learning_rate": 4.988119253427253e-05, + "loss": 5.2825, + "step": 5221 + }, + { + "epoch": 0.03105671329336759, + "grad_norm": 2.0918655395507812, + "learning_rate": 4.988114704591633e-05, + "loss": 5.2859, + "step": 5222 + }, + { + "epoch": 0.031062660576648585, + "grad_norm": 1.9637762308120728, + "learning_rate": 4.9881101548874384e-05, + "loss": 5.4687, + "step": 5223 + }, + { + "epoch": 0.031068607859929584, + "grad_norm": 2.046672821044922, + "learning_rate": 4.988105604314671e-05, + "loss": 5.5095, + "step": 5224 + }, + { + "epoch": 0.031074555143210583, + "grad_norm": 2.0264053344726562, + "learning_rate": 4.988101052873332e-05, + "loss": 5.4221, + "step": 5225 + }, + { + "epoch": 0.031080502426491578, + "grad_norm": 1.9367676973342896, + "learning_rate": 4.9880965005634216e-05, + "loss": 5.1881, + "step": 5226 + }, + { + "epoch": 0.031086449709772576, + "grad_norm": 2.0398001670837402, + "learning_rate": 4.9880919473849425e-05, + "loss": 5.4938, + "step": 5227 + }, + { + "epoch": 0.031092396993053575, + "grad_norm": 2.037411689758301, + "learning_rate": 4.988087393337896e-05, + "loss": 5.0893, + "step": 5228 + }, + { + "epoch": 0.03109834427633457, + "grad_norm": 2.1337075233459473, + "learning_rate": 4.988082838422285e-05, + "loss": 4.9822, + "step": 5229 + }, + { + "epoch": 0.03110429155961557, + "grad_norm": 1.9911794662475586, + "learning_rate": 4.988078282638109e-05, + "loss": 5.2472, + "step": 5230 + }, + { + "epoch": 0.031110238842896563, + "grad_norm": 2.1050829887390137, + "learning_rate": 4.98807372598537e-05, + "loss": 5.3478, + "step": 5231 + }, + { + "epoch": 0.031116186126177562, + "grad_norm": 1.9364343881607056, + "learning_rate": 4.988069168464071e-05, + "loss": 5.2551, + "step": 5232 + }, + { + "epoch": 0.03112213340945856, + "grad_norm": 1.9834885597229004, + "learning_rate": 4.988064610074213e-05, + "loss": 5.2147, + "step": 5233 + }, + { + "epoch": 0.031128080692739556, + "grad_norm": 2.0815906524658203, + "learning_rate": 4.9880600508157974e-05, + "loss": 5.1607, + "step": 5234 + }, + { + "epoch": 0.031134027976020554, + "grad_norm": 1.9558357000350952, + "learning_rate": 4.988055490688825e-05, + "loss": 5.4, + "step": 5235 + }, + { + "epoch": 0.031139975259301553, + "grad_norm": 1.9036076068878174, + "learning_rate": 4.9880509296932986e-05, + "loss": 5.4953, + "step": 5236 + }, + { + "epoch": 0.031145922542582548, + "grad_norm": 2.4709548950195312, + "learning_rate": 4.98804636782922e-05, + "loss": 5.2628, + "step": 5237 + }, + { + "epoch": 0.031151869825863546, + "grad_norm": 2.2380030155181885, + "learning_rate": 4.988041805096589e-05, + "loss": 5.2423, + "step": 5238 + }, + { + "epoch": 0.03115781710914454, + "grad_norm": 2.348639726638794, + "learning_rate": 4.988037241495409e-05, + "loss": 5.1966, + "step": 5239 + }, + { + "epoch": 0.03116376439242554, + "grad_norm": 1.9384468793869019, + "learning_rate": 4.9880326770256805e-05, + "loss": 5.47, + "step": 5240 + }, + { + "epoch": 0.03116971167570654, + "grad_norm": 2.2664244174957275, + "learning_rate": 4.988028111687406e-05, + "loss": 5.5511, + "step": 5241 + }, + { + "epoch": 0.031175658958987534, + "grad_norm": 2.1356422901153564, + "learning_rate": 4.988023545480586e-05, + "loss": 5.6462, + "step": 5242 + }, + { + "epoch": 0.031181606242268532, + "grad_norm": 2.240190267562866, + "learning_rate": 4.9880189784052226e-05, + "loss": 5.3494, + "step": 5243 + }, + { + "epoch": 0.031187553525549527, + "grad_norm": 1.8032485246658325, + "learning_rate": 4.988014410461318e-05, + "loss": 5.2305, + "step": 5244 + }, + { + "epoch": 0.031193500808830526, + "grad_norm": 2.177501678466797, + "learning_rate": 4.988009841648873e-05, + "loss": 5.1891, + "step": 5245 + }, + { + "epoch": 0.031199448092111524, + "grad_norm": 2.157317876815796, + "learning_rate": 4.988005271967889e-05, + "loss": 5.1038, + "step": 5246 + }, + { + "epoch": 0.03120539537539252, + "grad_norm": 1.9995821714401245, + "learning_rate": 4.988000701418369e-05, + "loss": 5.1098, + "step": 5247 + }, + { + "epoch": 0.031211342658673518, + "grad_norm": 2.201558828353882, + "learning_rate": 4.987996130000313e-05, + "loss": 5.0702, + "step": 5248 + }, + { + "epoch": 0.031217289941954517, + "grad_norm": 2.065645933151245, + "learning_rate": 4.987991557713724e-05, + "loss": 5.2012, + "step": 5249 + }, + { + "epoch": 0.03122323722523551, + "grad_norm": 1.908347487449646, + "learning_rate": 4.9879869845586024e-05, + "loss": 5.0913, + "step": 5250 + }, + { + "epoch": 0.03122918450851651, + "grad_norm": 1.913979411125183, + "learning_rate": 4.98798241053495e-05, + "loss": 5.0036, + "step": 5251 + }, + { + "epoch": 0.031235131791797505, + "grad_norm": 2.217616558074951, + "learning_rate": 4.9879778356427686e-05, + "loss": 5.0621, + "step": 5252 + }, + { + "epoch": 0.031241079075078504, + "grad_norm": 2.419713258743286, + "learning_rate": 4.9879732598820605e-05, + "loss": 5.1264, + "step": 5253 + }, + { + "epoch": 0.031247026358359502, + "grad_norm": 2.298295497894287, + "learning_rate": 4.987968683252826e-05, + "loss": 5.0576, + "step": 5254 + }, + { + "epoch": 0.0312529736416405, + "grad_norm": 2.120589256286621, + "learning_rate": 4.987964105755067e-05, + "loss": 5.175, + "step": 5255 + }, + { + "epoch": 0.031258920924921496, + "grad_norm": 2.3129806518554688, + "learning_rate": 4.987959527388787e-05, + "loss": 5.1827, + "step": 5256 + }, + { + "epoch": 0.03126486820820249, + "grad_norm": 2.251680612564087, + "learning_rate": 4.9879549481539846e-05, + "loss": 5.0473, + "step": 5257 + }, + { + "epoch": 0.03127081549148349, + "grad_norm": 2.101229429244995, + "learning_rate": 4.987950368050663e-05, + "loss": 5.0453, + "step": 5258 + }, + { + "epoch": 0.03127676277476449, + "grad_norm": 2.189565420150757, + "learning_rate": 4.987945787078824e-05, + "loss": 5.087, + "step": 5259 + }, + { + "epoch": 0.03128271005804548, + "grad_norm": 2.05485463142395, + "learning_rate": 4.9879412052384687e-05, + "loss": 5.0192, + "step": 5260 + }, + { + "epoch": 0.031288657341326485, + "grad_norm": 1.8166489601135254, + "learning_rate": 4.9879366225295994e-05, + "loss": 5.0456, + "step": 5261 + }, + { + "epoch": 0.03129460462460748, + "grad_norm": 2.1403279304504395, + "learning_rate": 4.9879320389522165e-05, + "loss": 4.9455, + "step": 5262 + }, + { + "epoch": 0.031300551907888476, + "grad_norm": 1.8833802938461304, + "learning_rate": 4.9879274545063226e-05, + "loss": 5.0891, + "step": 5263 + }, + { + "epoch": 0.03130649919116947, + "grad_norm": 2.000692367553711, + "learning_rate": 4.987922869191918e-05, + "loss": 5.1125, + "step": 5264 + }, + { + "epoch": 0.03131244647445047, + "grad_norm": 1.947544813156128, + "learning_rate": 4.9879182830090065e-05, + "loss": 4.9139, + "step": 5265 + }, + { + "epoch": 0.03131839375773147, + "grad_norm": 1.8827823400497437, + "learning_rate": 4.987913695957588e-05, + "loss": 5.0154, + "step": 5266 + }, + { + "epoch": 0.03132434104101246, + "grad_norm": 2.268115997314453, + "learning_rate": 4.987909108037664e-05, + "loss": 5.0379, + "step": 5267 + }, + { + "epoch": 0.031330288324293465, + "grad_norm": 1.85139000415802, + "learning_rate": 4.987904519249237e-05, + "loss": 4.9428, + "step": 5268 + }, + { + "epoch": 0.03133623560757446, + "grad_norm": 2.208338737487793, + "learning_rate": 4.987899929592308e-05, + "loss": 4.9366, + "step": 5269 + }, + { + "epoch": 0.031342182890855455, + "grad_norm": 3.5571236610412598, + "learning_rate": 4.987895339066879e-05, + "loss": 6.8471, + "step": 5270 + }, + { + "epoch": 0.03134813017413646, + "grad_norm": 2.000157594680786, + "learning_rate": 4.9878907476729516e-05, + "loss": 5.025, + "step": 5271 + }, + { + "epoch": 0.03135407745741745, + "grad_norm": 2.0588366985321045, + "learning_rate": 4.987886155410527e-05, + "loss": 4.8955, + "step": 5272 + }, + { + "epoch": 0.03136002474069845, + "grad_norm": 2.217839241027832, + "learning_rate": 4.9878815622796074e-05, + "loss": 4.9889, + "step": 5273 + }, + { + "epoch": 0.03136597202397945, + "grad_norm": 2.2453126907348633, + "learning_rate": 4.987876968280194e-05, + "loss": 5.3774, + "step": 5274 + }, + { + "epoch": 0.031371919307260444, + "grad_norm": 1.9839471578598022, + "learning_rate": 4.9878723734122876e-05, + "loss": 4.993, + "step": 5275 + }, + { + "epoch": 0.03137786659054144, + "grad_norm": 1.9534602165222168, + "learning_rate": 4.987867777675892e-05, + "loss": 4.9079, + "step": 5276 + }, + { + "epoch": 0.031383813873822435, + "grad_norm": 1.96163809299469, + "learning_rate": 4.9878631810710066e-05, + "loss": 4.9829, + "step": 5277 + }, + { + "epoch": 0.03138976115710344, + "grad_norm": 2.0814366340637207, + "learning_rate": 4.987858583597634e-05, + "loss": 4.8731, + "step": 5278 + }, + { + "epoch": 0.03139570844038443, + "grad_norm": 1.9846211671829224, + "learning_rate": 4.987853985255776e-05, + "loss": 4.9495, + "step": 5279 + }, + { + "epoch": 0.03140165572366543, + "grad_norm": 2.1237289905548096, + "learning_rate": 4.9878493860454335e-05, + "loss": 5.3887, + "step": 5280 + }, + { + "epoch": 0.03140760300694643, + "grad_norm": 2.1526784896850586, + "learning_rate": 4.9878447859666086e-05, + "loss": 5.3603, + "step": 5281 + }, + { + "epoch": 0.031413550290227424, + "grad_norm": 2.0563082695007324, + "learning_rate": 4.987840185019303e-05, + "loss": 5.4104, + "step": 5282 + }, + { + "epoch": 0.03141949757350842, + "grad_norm": 2.0586647987365723, + "learning_rate": 4.9878355832035175e-05, + "loss": 5.517, + "step": 5283 + }, + { + "epoch": 0.03142544485678942, + "grad_norm": 1.8817695379257202, + "learning_rate": 4.9878309805192546e-05, + "loss": 5.3616, + "step": 5284 + }, + { + "epoch": 0.031431392140070416, + "grad_norm": 2.0987086296081543, + "learning_rate": 4.987826376966516e-05, + "loss": 5.3237, + "step": 5285 + }, + { + "epoch": 0.03143733942335141, + "grad_norm": 2.3505301475524902, + "learning_rate": 4.987821772545302e-05, + "loss": 5.5165, + "step": 5286 + }, + { + "epoch": 0.03144328670663241, + "grad_norm": 2.1199939250946045, + "learning_rate": 4.987817167255616e-05, + "loss": 5.3029, + "step": 5287 + }, + { + "epoch": 0.03144923398991341, + "grad_norm": 1.7463518381118774, + "learning_rate": 4.987812561097458e-05, + "loss": 5.3589, + "step": 5288 + }, + { + "epoch": 0.0314551812731944, + "grad_norm": 1.9957356452941895, + "learning_rate": 4.987807954070831e-05, + "loss": 5.2459, + "step": 5289 + }, + { + "epoch": 0.031461128556475405, + "grad_norm": 1.7865337133407593, + "learning_rate": 4.987803346175736e-05, + "loss": 5.3041, + "step": 5290 + }, + { + "epoch": 0.0314670758397564, + "grad_norm": 1.82949960231781, + "learning_rate": 4.9877987374121744e-05, + "loss": 5.5761, + "step": 5291 + }, + { + "epoch": 0.031473023123037396, + "grad_norm": 1.974692940711975, + "learning_rate": 4.9877941277801475e-05, + "loss": 5.5033, + "step": 5292 + }, + { + "epoch": 0.03147897040631839, + "grad_norm": 2.1808922290802, + "learning_rate": 4.9877895172796577e-05, + "loss": 5.6739, + "step": 5293 + }, + { + "epoch": 0.03148491768959939, + "grad_norm": 2.7555716037750244, + "learning_rate": 4.987784905910706e-05, + "loss": 5.2489, + "step": 5294 + }, + { + "epoch": 0.03149086497288039, + "grad_norm": 2.475541353225708, + "learning_rate": 4.9877802936732955e-05, + "loss": 5.2304, + "step": 5295 + }, + { + "epoch": 0.03149681225616138, + "grad_norm": 1.945482611656189, + "learning_rate": 4.987775680567425e-05, + "loss": 5.4085, + "step": 5296 + }, + { + "epoch": 0.031502759539442385, + "grad_norm": 1.9879848957061768, + "learning_rate": 4.987771066593099e-05, + "loss": 5.5372, + "step": 5297 + }, + { + "epoch": 0.03150870682272338, + "grad_norm": 2.0529556274414062, + "learning_rate": 4.987766451750317e-05, + "loss": 5.578, + "step": 5298 + }, + { + "epoch": 0.031514654106004375, + "grad_norm": 1.7769572734832764, + "learning_rate": 4.9877618360390816e-05, + "loss": 5.5348, + "step": 5299 + }, + { + "epoch": 0.03152060138928538, + "grad_norm": 1.9111005067825317, + "learning_rate": 4.987757219459395e-05, + "loss": 5.4267, + "step": 5300 + }, + { + "epoch": 0.03152654867256637, + "grad_norm": 1.9047571420669556, + "learning_rate": 4.987752602011256e-05, + "loss": 5.433, + "step": 5301 + }, + { + "epoch": 0.03153249595584737, + "grad_norm": 1.9031875133514404, + "learning_rate": 4.98774798369467e-05, + "loss": 5.4929, + "step": 5302 + }, + { + "epoch": 0.03153844323912837, + "grad_norm": 1.858656883239746, + "learning_rate": 4.987743364509637e-05, + "loss": 5.3583, + "step": 5303 + }, + { + "epoch": 0.031544390522409364, + "grad_norm": 1.9254835844039917, + "learning_rate": 4.987738744456158e-05, + "loss": 5.4885, + "step": 5304 + }, + { + "epoch": 0.03155033780569036, + "grad_norm": 1.96173095703125, + "learning_rate": 4.987734123534235e-05, + "loss": 5.4869, + "step": 5305 + }, + { + "epoch": 0.031556285088971354, + "grad_norm": 1.7857433557510376, + "learning_rate": 4.98772950174387e-05, + "loss": 5.3845, + "step": 5306 + }, + { + "epoch": 0.031562232372252357, + "grad_norm": 1.9360556602478027, + "learning_rate": 4.9877248790850636e-05, + "loss": 5.3809, + "step": 5307 + }, + { + "epoch": 0.03156817965553335, + "grad_norm": 2.2044126987457275, + "learning_rate": 4.9877202555578197e-05, + "loss": 5.2413, + "step": 5308 + }, + { + "epoch": 0.03157412693881435, + "grad_norm": 1.8200992345809937, + "learning_rate": 4.9877156311621365e-05, + "loss": 5.6241, + "step": 5309 + }, + { + "epoch": 0.03158007422209535, + "grad_norm": 2.0771358013153076, + "learning_rate": 4.987711005898019e-05, + "loss": 5.6854, + "step": 5310 + }, + { + "epoch": 0.031586021505376344, + "grad_norm": 1.8330012559890747, + "learning_rate": 4.987706379765466e-05, + "loss": 5.712, + "step": 5311 + }, + { + "epoch": 0.03159196878865734, + "grad_norm": 1.941501498222351, + "learning_rate": 4.987701752764481e-05, + "loss": 5.4131, + "step": 5312 + }, + { + "epoch": 0.03159791607193834, + "grad_norm": 1.8688616752624512, + "learning_rate": 4.987697124895065e-05, + "loss": 5.3719, + "step": 5313 + }, + { + "epoch": 0.031603863355219336, + "grad_norm": 1.8723224401474, + "learning_rate": 4.98769249615722e-05, + "loss": 5.665, + "step": 5314 + }, + { + "epoch": 0.03160981063850033, + "grad_norm": 1.9460058212280273, + "learning_rate": 4.9876878665509474e-05, + "loss": 5.7048, + "step": 5315 + }, + { + "epoch": 0.03161575792178133, + "grad_norm": 1.9752602577209473, + "learning_rate": 4.987683236076248e-05, + "loss": 5.7098, + "step": 5316 + }, + { + "epoch": 0.03162170520506233, + "grad_norm": 1.8122695684432983, + "learning_rate": 4.9876786047331244e-05, + "loss": 5.2717, + "step": 5317 + }, + { + "epoch": 0.03162765248834332, + "grad_norm": 1.961983323097229, + "learning_rate": 4.9876739725215775e-05, + "loss": 5.5593, + "step": 5318 + }, + { + "epoch": 0.031633599771624325, + "grad_norm": 1.7362732887268066, + "learning_rate": 4.98766933944161e-05, + "loss": 5.5002, + "step": 5319 + }, + { + "epoch": 0.03163954705490532, + "grad_norm": 2.084033489227295, + "learning_rate": 4.9876647054932226e-05, + "loss": 5.5398, + "step": 5320 + }, + { + "epoch": 0.031645494338186315, + "grad_norm": 1.869452953338623, + "learning_rate": 4.9876600706764165e-05, + "loss": 5.5985, + "step": 5321 + }, + { + "epoch": 0.03165144162146731, + "grad_norm": 3.597667694091797, + "learning_rate": 4.9876554349911943e-05, + "loss": 5.4143, + "step": 5322 + }, + { + "epoch": 0.03165738890474831, + "grad_norm": 2.2364773750305176, + "learning_rate": 4.9876507984375574e-05, + "loss": 5.3756, + "step": 5323 + }, + { + "epoch": 0.03166333618802931, + "grad_norm": 2.0204551219940186, + "learning_rate": 4.987646161015508e-05, + "loss": 5.4964, + "step": 5324 + }, + { + "epoch": 0.0316692834713103, + "grad_norm": 1.7375823259353638, + "learning_rate": 4.987641522725046e-05, + "loss": 5.5249, + "step": 5325 + }, + { + "epoch": 0.031675230754591305, + "grad_norm": 1.661597728729248, + "learning_rate": 4.987636883566175e-05, + "loss": 5.4828, + "step": 5326 + }, + { + "epoch": 0.0316811780378723, + "grad_norm": 1.8612693548202515, + "learning_rate": 4.9876322435388944e-05, + "loss": 5.4711, + "step": 5327 + }, + { + "epoch": 0.031687125321153295, + "grad_norm": 1.8282328844070435, + "learning_rate": 4.987627602643208e-05, + "loss": 5.5234, + "step": 5328 + }, + { + "epoch": 0.0316930726044343, + "grad_norm": 1.951170802116394, + "learning_rate": 4.987622960879116e-05, + "loss": 5.4117, + "step": 5329 + }, + { + "epoch": 0.03169901988771529, + "grad_norm": 1.819174885749817, + "learning_rate": 4.9876183182466207e-05, + "loss": 5.3446, + "step": 5330 + }, + { + "epoch": 0.03170496717099629, + "grad_norm": 1.8710874319076538, + "learning_rate": 4.9876136747457245e-05, + "loss": 5.3755, + "step": 5331 + }, + { + "epoch": 0.03171091445427729, + "grad_norm": 2.1957387924194336, + "learning_rate": 4.9876090303764264e-05, + "loss": 6.3036, + "step": 5332 + }, + { + "epoch": 0.031716861737558284, + "grad_norm": 1.774741530418396, + "learning_rate": 4.987604385138731e-05, + "loss": 5.3822, + "step": 5333 + }, + { + "epoch": 0.03172280902083928, + "grad_norm": 1.793230414390564, + "learning_rate": 4.987599739032638e-05, + "loss": 5.4224, + "step": 5334 + }, + { + "epoch": 0.031728756304120274, + "grad_norm": 1.7986340522766113, + "learning_rate": 4.98759509205815e-05, + "loss": 5.3939, + "step": 5335 + }, + { + "epoch": 0.031734703587401276, + "grad_norm": 1.7775462865829468, + "learning_rate": 4.9875904442152675e-05, + "loss": 5.4356, + "step": 5336 + }, + { + "epoch": 0.03174065087068227, + "grad_norm": 1.882104516029358, + "learning_rate": 4.987585795503994e-05, + "loss": 5.2852, + "step": 5337 + }, + { + "epoch": 0.03174659815396327, + "grad_norm": 1.9842430353164673, + "learning_rate": 4.987581145924329e-05, + "loss": 5.4089, + "step": 5338 + }, + { + "epoch": 0.03175254543724427, + "grad_norm": 1.7098103761672974, + "learning_rate": 4.9875764954762754e-05, + "loss": 5.2442, + "step": 5339 + }, + { + "epoch": 0.031758492720525264, + "grad_norm": 1.8304857015609741, + "learning_rate": 4.9875718441598354e-05, + "loss": 5.5403, + "step": 5340 + }, + { + "epoch": 0.03176444000380626, + "grad_norm": 2.0763137340545654, + "learning_rate": 4.987567191975009e-05, + "loss": 5.8295, + "step": 5341 + }, + { + "epoch": 0.03177038728708726, + "grad_norm": 1.907271385192871, + "learning_rate": 4.9875625389217984e-05, + "loss": 5.6979, + "step": 5342 + }, + { + "epoch": 0.031776334570368256, + "grad_norm": 2.1263620853424072, + "learning_rate": 4.9875578850002056e-05, + "loss": 5.7713, + "step": 5343 + }, + { + "epoch": 0.03178228185364925, + "grad_norm": 2.038358211517334, + "learning_rate": 4.987553230210232e-05, + "loss": 6.0019, + "step": 5344 + }, + { + "epoch": 0.03178822913693025, + "grad_norm": 1.5671371221542358, + "learning_rate": 4.987548574551879e-05, + "loss": 5.9237, + "step": 5345 + }, + { + "epoch": 0.03179417642021125, + "grad_norm": 1.9159321784973145, + "learning_rate": 4.987543918025149e-05, + "loss": 6.0363, + "step": 5346 + }, + { + "epoch": 0.03180012370349224, + "grad_norm": 1.8012747764587402, + "learning_rate": 4.987539260630043e-05, + "loss": 5.901, + "step": 5347 + }, + { + "epoch": 0.031806070986773245, + "grad_norm": 2.154933214187622, + "learning_rate": 4.9875346023665625e-05, + "loss": 5.6379, + "step": 5348 + }, + { + "epoch": 0.03181201827005424, + "grad_norm": 2.191539764404297, + "learning_rate": 4.98752994323471e-05, + "loss": 5.5322, + "step": 5349 + }, + { + "epoch": 0.031817965553335235, + "grad_norm": 2.0007123947143555, + "learning_rate": 4.9875252832344856e-05, + "loss": 5.7398, + "step": 5350 + }, + { + "epoch": 0.03182391283661623, + "grad_norm": 1.7119163274765015, + "learning_rate": 4.9875206223658924e-05, + "loss": 5.8507, + "step": 5351 + }, + { + "epoch": 0.03182986011989723, + "grad_norm": 1.8882098197937012, + "learning_rate": 4.987515960628931e-05, + "loss": 5.8668, + "step": 5352 + }, + { + "epoch": 0.03183580740317823, + "grad_norm": 2.005493402481079, + "learning_rate": 4.987511298023604e-05, + "loss": 5.9672, + "step": 5353 + }, + { + "epoch": 0.03184175468645922, + "grad_norm": 1.858807921409607, + "learning_rate": 4.987506634549912e-05, + "loss": 5.9344, + "step": 5354 + }, + { + "epoch": 0.031847701969740225, + "grad_norm": 2.2698724269866943, + "learning_rate": 4.987501970207858e-05, + "loss": 5.6553, + "step": 5355 + }, + { + "epoch": 0.03185364925302122, + "grad_norm": 1.7690725326538086, + "learning_rate": 4.987497304997442e-05, + "loss": 5.6255, + "step": 5356 + }, + { + "epoch": 0.031859596536302215, + "grad_norm": 2.008002758026123, + "learning_rate": 4.987492638918667e-05, + "loss": 5.5578, + "step": 5357 + }, + { + "epoch": 0.03186554381958322, + "grad_norm": 1.6483304500579834, + "learning_rate": 4.987487971971533e-05, + "loss": 5.4786, + "step": 5358 + }, + { + "epoch": 0.03187149110286421, + "grad_norm": 1.9136204719543457, + "learning_rate": 4.987483304156044e-05, + "loss": 5.6043, + "step": 5359 + }, + { + "epoch": 0.03187743838614521, + "grad_norm": 1.9811625480651855, + "learning_rate": 4.987478635472199e-05, + "loss": 5.6172, + "step": 5360 + }, + { + "epoch": 0.03188338566942621, + "grad_norm": 2.012134075164795, + "learning_rate": 4.987473965920002e-05, + "loss": 5.6715, + "step": 5361 + }, + { + "epoch": 0.031889332952707204, + "grad_norm": 1.930550217628479, + "learning_rate": 4.987469295499453e-05, + "loss": 5.516, + "step": 5362 + }, + { + "epoch": 0.0318952802359882, + "grad_norm": 2.1190578937530518, + "learning_rate": 4.987464624210554e-05, + "loss": 5.5176, + "step": 5363 + }, + { + "epoch": 0.031901227519269194, + "grad_norm": 2.428710699081421, + "learning_rate": 4.987459952053307e-05, + "loss": 5.4088, + "step": 5364 + }, + { + "epoch": 0.031907174802550196, + "grad_norm": 1.8820819854736328, + "learning_rate": 4.987455279027713e-05, + "loss": 5.3753, + "step": 5365 + }, + { + "epoch": 0.03191312208583119, + "grad_norm": 1.6506859064102173, + "learning_rate": 4.987450605133775e-05, + "loss": 5.6018, + "step": 5366 + }, + { + "epoch": 0.03191906936911219, + "grad_norm": 2.060772657394409, + "learning_rate": 4.9874459303714925e-05, + "loss": 5.3587, + "step": 5367 + }, + { + "epoch": 0.03192501665239319, + "grad_norm": 2.3591532707214355, + "learning_rate": 4.9874412547408694e-05, + "loss": 5.7685, + "step": 5368 + }, + { + "epoch": 0.031930963935674184, + "grad_norm": 2.140322685241699, + "learning_rate": 4.987436578241906e-05, + "loss": 5.9015, + "step": 5369 + }, + { + "epoch": 0.03193691121895518, + "grad_norm": 2.2479233741760254, + "learning_rate": 4.987431900874604e-05, + "loss": 5.6079, + "step": 5370 + }, + { + "epoch": 0.03194285850223618, + "grad_norm": 2.0334317684173584, + "learning_rate": 4.987427222638965e-05, + "loss": 5.6364, + "step": 5371 + }, + { + "epoch": 0.031948805785517176, + "grad_norm": 2.0599231719970703, + "learning_rate": 4.987422543534991e-05, + "loss": 5.6578, + "step": 5372 + }, + { + "epoch": 0.03195475306879817, + "grad_norm": 2.237504720687866, + "learning_rate": 4.9874178635626836e-05, + "loss": 5.5784, + "step": 5373 + }, + { + "epoch": 0.03196070035207917, + "grad_norm": 2.013193130493164, + "learning_rate": 4.987413182722044e-05, + "loss": 5.4874, + "step": 5374 + }, + { + "epoch": 0.03196664763536017, + "grad_norm": 1.9806950092315674, + "learning_rate": 4.987408501013075e-05, + "loss": 5.41, + "step": 5375 + }, + { + "epoch": 0.03197259491864116, + "grad_norm": 1.7534204721450806, + "learning_rate": 4.9874038184357766e-05, + "loss": 5.4596, + "step": 5376 + }, + { + "epoch": 0.031978542201922165, + "grad_norm": 1.5722386837005615, + "learning_rate": 4.987399134990152e-05, + "loss": 5.508, + "step": 5377 + }, + { + "epoch": 0.03198448948520316, + "grad_norm": 7.868972301483154, + "learning_rate": 4.987394450676201e-05, + "loss": 5.1734, + "step": 5378 + }, + { + "epoch": 0.031990436768484155, + "grad_norm": 2.2103798389434814, + "learning_rate": 4.9873897654939274e-05, + "loss": 5.6766, + "step": 5379 + }, + { + "epoch": 0.03199638405176515, + "grad_norm": 1.9590017795562744, + "learning_rate": 4.9873850794433306e-05, + "loss": 5.7764, + "step": 5380 + }, + { + "epoch": 0.03200233133504615, + "grad_norm": 1.96006441116333, + "learning_rate": 4.9873803925244146e-05, + "loss": 5.7933, + "step": 5381 + }, + { + "epoch": 0.03200827861832715, + "grad_norm": 1.7377163171768188, + "learning_rate": 4.987375704737178e-05, + "loss": 5.692, + "step": 5382 + }, + { + "epoch": 0.03201422590160814, + "grad_norm": 2.0734782218933105, + "learning_rate": 4.9873710160816256e-05, + "loss": 5.5466, + "step": 5383 + }, + { + "epoch": 0.032020173184889145, + "grad_norm": 2.4700942039489746, + "learning_rate": 4.9873663265577574e-05, + "loss": 5.5837, + "step": 5384 + }, + { + "epoch": 0.03202612046817014, + "grad_norm": 2.067009925842285, + "learning_rate": 4.987361636165576e-05, + "loss": 5.4777, + "step": 5385 + }, + { + "epoch": 0.032032067751451135, + "grad_norm": 1.9585732221603394, + "learning_rate": 4.9873569449050815e-05, + "loss": 5.62, + "step": 5386 + }, + { + "epoch": 0.03203801503473214, + "grad_norm": 2.0210976600646973, + "learning_rate": 4.9873522527762766e-05, + "loss": 5.3554, + "step": 5387 + }, + { + "epoch": 0.03204396231801313, + "grad_norm": 2.0345299243927, + "learning_rate": 4.987347559779163e-05, + "loss": 5.3912, + "step": 5388 + }, + { + "epoch": 0.03204990960129413, + "grad_norm": 2.0960853099823, + "learning_rate": 4.987342865913742e-05, + "loss": 5.3497, + "step": 5389 + }, + { + "epoch": 0.03205585688457513, + "grad_norm": 2.0156044960021973, + "learning_rate": 4.987338171180015e-05, + "loss": 5.2769, + "step": 5390 + }, + { + "epoch": 0.032061804167856124, + "grad_norm": 2.0021722316741943, + "learning_rate": 4.987333475577984e-05, + "loss": 5.2338, + "step": 5391 + }, + { + "epoch": 0.03206775145113712, + "grad_norm": 1.8502025604248047, + "learning_rate": 4.987328779107651e-05, + "loss": 5.4231, + "step": 5392 + }, + { + "epoch": 0.03207369873441812, + "grad_norm": 2.0788064002990723, + "learning_rate": 4.987324081769016e-05, + "loss": 5.3989, + "step": 5393 + }, + { + "epoch": 0.032079646017699116, + "grad_norm": 5.172029495239258, + "learning_rate": 4.987319383562083e-05, + "loss": 6.5943, + "step": 5394 + }, + { + "epoch": 0.03208559330098011, + "grad_norm": 1.8732082843780518, + "learning_rate": 4.987314684486852e-05, + "loss": 5.3085, + "step": 5395 + }, + { + "epoch": 0.032091540584261107, + "grad_norm": 2.0511786937713623, + "learning_rate": 4.987309984543326e-05, + "loss": 5.1598, + "step": 5396 + }, + { + "epoch": 0.03209748786754211, + "grad_norm": 2.1821703910827637, + "learning_rate": 4.987305283731505e-05, + "loss": 5.3575, + "step": 5397 + }, + { + "epoch": 0.032103435150823104, + "grad_norm": 2.1190478801727295, + "learning_rate": 4.9873005820513906e-05, + "loss": 5.2371, + "step": 5398 + }, + { + "epoch": 0.0321093824341041, + "grad_norm": 2.1476964950561523, + "learning_rate": 4.987295879502987e-05, + "loss": 5.1378, + "step": 5399 + }, + { + "epoch": 0.0321153297173851, + "grad_norm": 2.3466129302978516, + "learning_rate": 4.987291176086293e-05, + "loss": 5.0642, + "step": 5400 + }, + { + "epoch": 0.032121277000666096, + "grad_norm": 2.267949104309082, + "learning_rate": 4.9872864718013115e-05, + "loss": 5.6835, + "step": 5401 + }, + { + "epoch": 0.03212722428394709, + "grad_norm": 3.1235604286193848, + "learning_rate": 4.987281766648044e-05, + "loss": 6.2094, + "step": 5402 + }, + { + "epoch": 0.03213317156722809, + "grad_norm": 2.494929790496826, + "learning_rate": 4.987277060626493e-05, + "loss": 6.2387, + "step": 5403 + }, + { + "epoch": 0.03213911885050909, + "grad_norm": 2.554422616958618, + "learning_rate": 4.987272353736658e-05, + "loss": 5.9655, + "step": 5404 + }, + { + "epoch": 0.03214506613379008, + "grad_norm": 3.688295841217041, + "learning_rate": 4.987267645978543e-05, + "loss": 6.3994, + "step": 5405 + }, + { + "epoch": 0.032151013417071085, + "grad_norm": 2.773847818374634, + "learning_rate": 4.987262937352147e-05, + "loss": 5.515, + "step": 5406 + }, + { + "epoch": 0.03215696070035208, + "grad_norm": 3.067812204360962, + "learning_rate": 4.987258227857475e-05, + "loss": 5.7388, + "step": 5407 + }, + { + "epoch": 0.032162907983633075, + "grad_norm": 3.0557258129119873, + "learning_rate": 4.987253517494525e-05, + "loss": 6.0334, + "step": 5408 + }, + { + "epoch": 0.03216885526691407, + "grad_norm": 2.2864489555358887, + "learning_rate": 4.9872488062633026e-05, + "loss": 6.2805, + "step": 5409 + }, + { + "epoch": 0.03217480255019507, + "grad_norm": 3.2848916053771973, + "learning_rate": 4.987244094163807e-05, + "loss": 6.4782, + "step": 5410 + }, + { + "epoch": 0.03218074983347607, + "grad_norm": 3.7147631645202637, + "learning_rate": 4.987239381196039e-05, + "loss": 6.6618, + "step": 5411 + }, + { + "epoch": 0.03218669711675706, + "grad_norm": 2.740705966949463, + "learning_rate": 4.9872346673600017e-05, + "loss": 6.0261, + "step": 5412 + }, + { + "epoch": 0.032192644400038065, + "grad_norm": 2.6408498287200928, + "learning_rate": 4.9872299526556965e-05, + "loss": 5.8645, + "step": 5413 + }, + { + "epoch": 0.03219859168331906, + "grad_norm": 2.8298256397247314, + "learning_rate": 4.987225237083125e-05, + "loss": 5.9263, + "step": 5414 + }, + { + "epoch": 0.032204538966600055, + "grad_norm": 2.9417197704315186, + "learning_rate": 4.987220520642289e-05, + "loss": 5.8018, + "step": 5415 + }, + { + "epoch": 0.03221048624988106, + "grad_norm": 3.2862906455993652, + "learning_rate": 4.9872158033331904e-05, + "loss": 5.8429, + "step": 5416 + }, + { + "epoch": 0.03221643353316205, + "grad_norm": 2.7724359035491943, + "learning_rate": 4.9872110851558306e-05, + "loss": 5.9504, + "step": 5417 + }, + { + "epoch": 0.03222238081644305, + "grad_norm": 2.2753829956054688, + "learning_rate": 4.9872063661102106e-05, + "loss": 5.6443, + "step": 5418 + }, + { + "epoch": 0.03222832809972405, + "grad_norm": 2.597649097442627, + "learning_rate": 4.987201646196332e-05, + "loss": 6.4441, + "step": 5419 + }, + { + "epoch": 0.032234275383005044, + "grad_norm": 2.7298800945281982, + "learning_rate": 4.987196925414198e-05, + "loss": 6.2988, + "step": 5420 + }, + { + "epoch": 0.03224022266628604, + "grad_norm": 3.2329537868499756, + "learning_rate": 4.987192203763809e-05, + "loss": 5.8743, + "step": 5421 + }, + { + "epoch": 0.03224616994956704, + "grad_norm": 3.033226251602173, + "learning_rate": 4.987187481245167e-05, + "loss": 5.4863, + "step": 5422 + }, + { + "epoch": 0.032252117232848036, + "grad_norm": 2.7728521823883057, + "learning_rate": 4.987182757858273e-05, + "loss": 5.5722, + "step": 5423 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 2.6083309650421143, + "learning_rate": 4.98717803360313e-05, + "loss": 6.5257, + "step": 5424 + }, + { + "epoch": 0.032264011799410026, + "grad_norm": 2.5422329902648926, + "learning_rate": 4.987173308479738e-05, + "loss": 6.5582, + "step": 5425 + }, + { + "epoch": 0.03226995908269103, + "grad_norm": 2.7634811401367188, + "learning_rate": 4.9871685824881e-05, + "loss": 6.0987, + "step": 5426 + }, + { + "epoch": 0.032275906365972024, + "grad_norm": 3.631476640701294, + "learning_rate": 4.987163855628217e-05, + "loss": 5.8506, + "step": 5427 + }, + { + "epoch": 0.03228185364925302, + "grad_norm": 2.9783661365509033, + "learning_rate": 4.9871591279000904e-05, + "loss": 5.9387, + "step": 5428 + }, + { + "epoch": 0.03228780093253402, + "grad_norm": 2.369645357131958, + "learning_rate": 4.9871543993037225e-05, + "loss": 5.8097, + "step": 5429 + }, + { + "epoch": 0.032293748215815016, + "grad_norm": 2.782055616378784, + "learning_rate": 4.9871496698391155e-05, + "loss": 5.5301, + "step": 5430 + }, + { + "epoch": 0.03229969549909601, + "grad_norm": 2.408205270767212, + "learning_rate": 4.98714493950627e-05, + "loss": 5.6514, + "step": 5431 + }, + { + "epoch": 0.03230564278237701, + "grad_norm": 2.0641589164733887, + "learning_rate": 4.987140208305187e-05, + "loss": 5.6168, + "step": 5432 + }, + { + "epoch": 0.03231159006565801, + "grad_norm": 2.109773874282837, + "learning_rate": 4.987135476235869e-05, + "loss": 5.6678, + "step": 5433 + }, + { + "epoch": 0.032317537348939, + "grad_norm": 2.9809730052948, + "learning_rate": 4.987130743298318e-05, + "loss": 6.0531, + "step": 5434 + }, + { + "epoch": 0.032323484632220005, + "grad_norm": 2.5728509426116943, + "learning_rate": 4.9871260094925365e-05, + "loss": 6.05, + "step": 5435 + }, + { + "epoch": 0.032329431915501, + "grad_norm": 2.477074146270752, + "learning_rate": 4.9871212748185236e-05, + "loss": 6.351, + "step": 5436 + }, + { + "epoch": 0.032335379198781995, + "grad_norm": 2.3485517501831055, + "learning_rate": 4.987116539276283e-05, + "loss": 6.3033, + "step": 5437 + }, + { + "epoch": 0.03234132648206299, + "grad_norm": 2.4214296340942383, + "learning_rate": 4.987111802865816e-05, + "loss": 6.1152, + "step": 5438 + }, + { + "epoch": 0.03234727376534399, + "grad_norm": 3.5628256797790527, + "learning_rate": 4.9871070655871234e-05, + "loss": 5.6502, + "step": 5439 + }, + { + "epoch": 0.03235322104862499, + "grad_norm": 3.190075159072876, + "learning_rate": 4.987102327440208e-05, + "loss": 5.4164, + "step": 5440 + }, + { + "epoch": 0.03235916833190598, + "grad_norm": 2.402754306793213, + "learning_rate": 4.9870975884250696e-05, + "loss": 5.7116, + "step": 5441 + }, + { + "epoch": 0.032365115615186985, + "grad_norm": 2.846653938293457, + "learning_rate": 4.987092848541712e-05, + "loss": 6.1456, + "step": 5442 + }, + { + "epoch": 0.03237106289846798, + "grad_norm": 2.6700549125671387, + "learning_rate": 4.987088107790136e-05, + "loss": 5.9777, + "step": 5443 + }, + { + "epoch": 0.032377010181748975, + "grad_norm": 2.8929460048675537, + "learning_rate": 4.987083366170343e-05, + "loss": 6.1459, + "step": 5444 + }, + { + "epoch": 0.03238295746502998, + "grad_norm": 2.524376153945923, + "learning_rate": 4.987078623682335e-05, + "loss": 6.4341, + "step": 5445 + }, + { + "epoch": 0.03238890474831097, + "grad_norm": 2.0901076793670654, + "learning_rate": 4.987073880326114e-05, + "loss": 6.3968, + "step": 5446 + }, + { + "epoch": 0.03239485203159197, + "grad_norm": 3.0033867359161377, + "learning_rate": 4.9870691361016805e-05, + "loss": 5.8656, + "step": 5447 + }, + { + "epoch": 0.03240079931487297, + "grad_norm": 2.7715492248535156, + "learning_rate": 4.987064391009038e-05, + "loss": 6.1634, + "step": 5448 + }, + { + "epoch": 0.032406746598153964, + "grad_norm": 2.6102347373962402, + "learning_rate": 4.9870596450481855e-05, + "loss": 6.2521, + "step": 5449 + }, + { + "epoch": 0.03241269388143496, + "grad_norm": 2.326253890991211, + "learning_rate": 4.9870548982191265e-05, + "loss": 6.2517, + "step": 5450 + }, + { + "epoch": 0.03241864116471596, + "grad_norm": 2.3012197017669678, + "learning_rate": 4.987050150521863e-05, + "loss": 6.2261, + "step": 5451 + }, + { + "epoch": 0.032424588447996956, + "grad_norm": 2.100337505340576, + "learning_rate": 4.987045401956396e-05, + "loss": 5.6291, + "step": 5452 + }, + { + "epoch": 0.03243053573127795, + "grad_norm": 3.094754219055176, + "learning_rate": 4.987040652522727e-05, + "loss": 5.897, + "step": 5453 + }, + { + "epoch": 0.032436483014558946, + "grad_norm": 2.7406179904937744, + "learning_rate": 4.987035902220857e-05, + "loss": 6.0083, + "step": 5454 + }, + { + "epoch": 0.03244243029783995, + "grad_norm": 2.4106287956237793, + "learning_rate": 4.9870311510507895e-05, + "loss": 5.8538, + "step": 5455 + }, + { + "epoch": 0.032448377581120944, + "grad_norm": 2.7335946559906006, + "learning_rate": 4.987026399012525e-05, + "loss": 5.9181, + "step": 5456 + }, + { + "epoch": 0.03245432486440194, + "grad_norm": 2.796175003051758, + "learning_rate": 4.987021646106064e-05, + "loss": 5.6461, + "step": 5457 + }, + { + "epoch": 0.03246027214768294, + "grad_norm": 3.086470127105713, + "learning_rate": 4.987016892331411e-05, + "loss": 5.6692, + "step": 5458 + }, + { + "epoch": 0.032466219430963936, + "grad_norm": 2.394465923309326, + "learning_rate": 4.9870121376885656e-05, + "loss": 6.3046, + "step": 5459 + }, + { + "epoch": 0.03247216671424493, + "grad_norm": 2.0745291709899902, + "learning_rate": 4.98700738217753e-05, + "loss": 6.0491, + "step": 5460 + }, + { + "epoch": 0.03247811399752593, + "grad_norm": 2.66359281539917, + "learning_rate": 4.987002625798305e-05, + "loss": 5.6468, + "step": 5461 + }, + { + "epoch": 0.03248406128080693, + "grad_norm": 2.392833948135376, + "learning_rate": 4.9869978685508936e-05, + "loss": 5.8421, + "step": 5462 + }, + { + "epoch": 0.03249000856408792, + "grad_norm": 2.671710252761841, + "learning_rate": 4.9869931104352975e-05, + "loss": 5.6892, + "step": 5463 + }, + { + "epoch": 0.032495955847368925, + "grad_norm": 2.7013144493103027, + "learning_rate": 4.986988351451517e-05, + "loss": 5.7911, + "step": 5464 + }, + { + "epoch": 0.03250190313064992, + "grad_norm": 1.926703929901123, + "learning_rate": 4.9869835915995555e-05, + "loss": 5.5492, + "step": 5465 + }, + { + "epoch": 0.032507850413930915, + "grad_norm": 2.5668530464172363, + "learning_rate": 4.986978830879413e-05, + "loss": 5.8949, + "step": 5466 + }, + { + "epoch": 0.03251379769721191, + "grad_norm": 2.555305004119873, + "learning_rate": 4.986974069291092e-05, + "loss": 5.7408, + "step": 5467 + }, + { + "epoch": 0.03251974498049291, + "grad_norm": 2.551226854324341, + "learning_rate": 4.986969306834594e-05, + "loss": 5.7738, + "step": 5468 + }, + { + "epoch": 0.03252569226377391, + "grad_norm": 2.3194847106933594, + "learning_rate": 4.986964543509921e-05, + "loss": 6.2837, + "step": 5469 + }, + { + "epoch": 0.0325316395470549, + "grad_norm": 1.9618690013885498, + "learning_rate": 4.986959779317074e-05, + "loss": 5.9236, + "step": 5470 + }, + { + "epoch": 0.032537586830335904, + "grad_norm": 2.351971387863159, + "learning_rate": 4.986955014256055e-05, + "loss": 5.591, + "step": 5471 + }, + { + "epoch": 0.0325435341136169, + "grad_norm": 2.3772034645080566, + "learning_rate": 4.986950248326866e-05, + "loss": 5.6785, + "step": 5472 + }, + { + "epoch": 0.032549481396897895, + "grad_norm": 2.5764195919036865, + "learning_rate": 4.9869454815295085e-05, + "loss": 5.525, + "step": 5473 + }, + { + "epoch": 0.0325554286801789, + "grad_norm": 2.231048107147217, + "learning_rate": 4.986940713863984e-05, + "loss": 5.6789, + "step": 5474 + }, + { + "epoch": 0.03256137596345989, + "grad_norm": 2.8053946495056152, + "learning_rate": 4.986935945330294e-05, + "loss": 5.6319, + "step": 5475 + }, + { + "epoch": 0.03256732324674089, + "grad_norm": 3.4610519409179688, + "learning_rate": 4.98693117592844e-05, + "loss": 5.9855, + "step": 5476 + }, + { + "epoch": 0.03257327053002189, + "grad_norm": 2.5019664764404297, + "learning_rate": 4.986926405658425e-05, + "loss": 5.9997, + "step": 5477 + }, + { + "epoch": 0.032579217813302884, + "grad_norm": 2.6583313941955566, + "learning_rate": 4.986921634520249e-05, + "loss": 6.3755, + "step": 5478 + }, + { + "epoch": 0.03258516509658388, + "grad_norm": 2.990699291229248, + "learning_rate": 4.986916862513914e-05, + "loss": 5.8932, + "step": 5479 + }, + { + "epoch": 0.03259111237986488, + "grad_norm": 3.282546043395996, + "learning_rate": 4.986912089639423e-05, + "loss": 5.5508, + "step": 5480 + }, + { + "epoch": 0.032597059663145876, + "grad_norm": 3.1012487411499023, + "learning_rate": 4.9869073158967755e-05, + "loss": 5.5567, + "step": 5481 + }, + { + "epoch": 0.03260300694642687, + "grad_norm": 2.141892433166504, + "learning_rate": 4.986902541285975e-05, + "loss": 5.6195, + "step": 5482 + }, + { + "epoch": 0.032608954229707866, + "grad_norm": 2.173670530319214, + "learning_rate": 4.986897765807023e-05, + "loss": 5.6913, + "step": 5483 + }, + { + "epoch": 0.03261490151298887, + "grad_norm": 2.4076435565948486, + "learning_rate": 4.98689298945992e-05, + "loss": 5.8324, + "step": 5484 + }, + { + "epoch": 0.03262084879626986, + "grad_norm": 2.8968818187713623, + "learning_rate": 4.986888212244668e-05, + "loss": 6.0086, + "step": 5485 + }, + { + "epoch": 0.03262679607955086, + "grad_norm": 2.2434191703796387, + "learning_rate": 4.9868834341612696e-05, + "loss": 5.9645, + "step": 5486 + }, + { + "epoch": 0.03263274336283186, + "grad_norm": 1.9683157205581665, + "learning_rate": 4.9868786552097255e-05, + "loss": 5.9173, + "step": 5487 + }, + { + "epoch": 0.032638690646112856, + "grad_norm": 2.369816303253174, + "learning_rate": 4.9868738753900384e-05, + "loss": 6.2728, + "step": 5488 + }, + { + "epoch": 0.03264463792939385, + "grad_norm": 2.1152775287628174, + "learning_rate": 4.986869094702209e-05, + "loss": 6.0474, + "step": 5489 + }, + { + "epoch": 0.03265058521267485, + "grad_norm": 2.3219857215881348, + "learning_rate": 4.9868643131462397e-05, + "loss": 5.7451, + "step": 5490 + }, + { + "epoch": 0.03265653249595585, + "grad_norm": 2.236046075820923, + "learning_rate": 4.986859530722131e-05, + "loss": 5.7775, + "step": 5491 + }, + { + "epoch": 0.03266247977923684, + "grad_norm": 2.3334364891052246, + "learning_rate": 4.986854747429886e-05, + "loss": 5.7429, + "step": 5492 + }, + { + "epoch": 0.032668427062517845, + "grad_norm": 2.5464704036712646, + "learning_rate": 4.986849963269505e-05, + "loss": 5.5781, + "step": 5493 + }, + { + "epoch": 0.03267437434579884, + "grad_norm": 2.104419469833374, + "learning_rate": 4.986845178240991e-05, + "loss": 5.6378, + "step": 5494 + }, + { + "epoch": 0.032680321629079835, + "grad_norm": 2.3115224838256836, + "learning_rate": 4.9868403923443444e-05, + "loss": 5.7617, + "step": 5495 + }, + { + "epoch": 0.03268626891236083, + "grad_norm": 2.3370540142059326, + "learning_rate": 4.9868356055795685e-05, + "loss": 6.1278, + "step": 5496 + }, + { + "epoch": 0.03269221619564183, + "grad_norm": 2.8618736267089844, + "learning_rate": 4.986830817946663e-05, + "loss": 6.0879, + "step": 5497 + }, + { + "epoch": 0.03269816347892283, + "grad_norm": 2.3229949474334717, + "learning_rate": 4.986826029445631e-05, + "loss": 6.0915, + "step": 5498 + }, + { + "epoch": 0.03270411076220382, + "grad_norm": 2.549914598464966, + "learning_rate": 4.986821240076473e-05, + "loss": 6.2375, + "step": 5499 + }, + { + "epoch": 0.032710058045484824, + "grad_norm": 2.595916271209717, + "learning_rate": 4.986816449839192e-05, + "loss": 6.095, + "step": 5500 + }, + { + "epoch": 0.03271600532876582, + "grad_norm": 2.4409420490264893, + "learning_rate": 4.98681165873379e-05, + "loss": 5.353, + "step": 5501 + }, + { + "epoch": 0.032721952612046815, + "grad_norm": 2.550156593322754, + "learning_rate": 4.986806866760266e-05, + "loss": 5.558, + "step": 5502 + }, + { + "epoch": 0.03272789989532782, + "grad_norm": 2.7811737060546875, + "learning_rate": 4.986802073918625e-05, + "loss": 5.7174, + "step": 5503 + }, + { + "epoch": 0.03273384717860881, + "grad_norm": 2.8430123329162598, + "learning_rate": 4.986797280208866e-05, + "loss": 5.5644, + "step": 5504 + }, + { + "epoch": 0.03273979446188981, + "grad_norm": 3.021040201187134, + "learning_rate": 4.986792485630992e-05, + "loss": 5.9451, + "step": 5505 + }, + { + "epoch": 0.03274574174517081, + "grad_norm": 2.69866681098938, + "learning_rate": 4.986787690185005e-05, + "loss": 5.9934, + "step": 5506 + }, + { + "epoch": 0.032751689028451804, + "grad_norm": 2.7202444076538086, + "learning_rate": 4.986782893870906e-05, + "loss": 6.1298, + "step": 5507 + }, + { + "epoch": 0.0327576363117328, + "grad_norm": 2.223405122756958, + "learning_rate": 4.986778096688696e-05, + "loss": 5.8968, + "step": 5508 + }, + { + "epoch": 0.0327635835950138, + "grad_norm": 2.5733680725097656, + "learning_rate": 4.986773298638378e-05, + "loss": 6.0928, + "step": 5509 + }, + { + "epoch": 0.032769530878294796, + "grad_norm": 2.584397554397583, + "learning_rate": 4.986768499719953e-05, + "loss": 5.7879, + "step": 5510 + }, + { + "epoch": 0.03277547816157579, + "grad_norm": 3.160489797592163, + "learning_rate": 4.986763699933423e-05, + "loss": 5.6413, + "step": 5511 + }, + { + "epoch": 0.032781425444856786, + "grad_norm": 2.8224406242370605, + "learning_rate": 4.9867588992787894e-05, + "loss": 6.1476, + "step": 5512 + }, + { + "epoch": 0.03278737272813779, + "grad_norm": 2.2565996646881104, + "learning_rate": 4.986754097756054e-05, + "loss": 6.208, + "step": 5513 + }, + { + "epoch": 0.03279332001141878, + "grad_norm": 2.5425479412078857, + "learning_rate": 4.9867492953652184e-05, + "loss": 5.934, + "step": 5514 + }, + { + "epoch": 0.03279926729469978, + "grad_norm": 2.6598689556121826, + "learning_rate": 4.986744492106284e-05, + "loss": 5.7433, + "step": 5515 + }, + { + "epoch": 0.03280521457798078, + "grad_norm": 2.419388771057129, + "learning_rate": 4.986739687979253e-05, + "loss": 5.378, + "step": 5516 + }, + { + "epoch": 0.032811161861261776, + "grad_norm": 2.72784161567688, + "learning_rate": 4.986734882984127e-05, + "loss": 5.4089, + "step": 5517 + }, + { + "epoch": 0.03281710914454277, + "grad_norm": 3.0592923164367676, + "learning_rate": 4.9867300771209075e-05, + "loss": 5.9573, + "step": 5518 + }, + { + "epoch": 0.03282305642782377, + "grad_norm": 2.7681832313537598, + "learning_rate": 4.9867252703895965e-05, + "loss": 5.5325, + "step": 5519 + }, + { + "epoch": 0.03282900371110477, + "grad_norm": 2.6752777099609375, + "learning_rate": 4.9867204627901946e-05, + "loss": 5.7543, + "step": 5520 + }, + { + "epoch": 0.03283495099438576, + "grad_norm": 2.481203317642212, + "learning_rate": 4.9867156543227046e-05, + "loss": 5.575, + "step": 5521 + }, + { + "epoch": 0.032840898277666765, + "grad_norm": 2.6403908729553223, + "learning_rate": 4.986710844987128e-05, + "loss": 5.4381, + "step": 5522 + }, + { + "epoch": 0.03284684556094776, + "grad_norm": 2.6146085262298584, + "learning_rate": 4.986706034783466e-05, + "loss": 5.8672, + "step": 5523 + }, + { + "epoch": 0.032852792844228755, + "grad_norm": 3.453666925430298, + "learning_rate": 4.986701223711722e-05, + "loss": 5.8353, + "step": 5524 + }, + { + "epoch": 0.03285874012750975, + "grad_norm": 2.511216640472412, + "learning_rate": 4.986696411771895e-05, + "loss": 5.9567, + "step": 5525 + }, + { + "epoch": 0.03286468741079075, + "grad_norm": 2.57395601272583, + "learning_rate": 4.986691598963988e-05, + "loss": 5.6396, + "step": 5526 + }, + { + "epoch": 0.03287063469407175, + "grad_norm": 2.778801441192627, + "learning_rate": 4.986686785288003e-05, + "loss": 6.0237, + "step": 5527 + }, + { + "epoch": 0.03287658197735274, + "grad_norm": 2.5216047763824463, + "learning_rate": 4.986681970743941e-05, + "loss": 6.1305, + "step": 5528 + }, + { + "epoch": 0.032882529260633744, + "grad_norm": 2.5105085372924805, + "learning_rate": 4.986677155331804e-05, + "loss": 6.4951, + "step": 5529 + }, + { + "epoch": 0.03288847654391474, + "grad_norm": 2.4105372428894043, + "learning_rate": 4.9866723390515946e-05, + "loss": 6.291, + "step": 5530 + }, + { + "epoch": 0.032894423827195735, + "grad_norm": 2.740095853805542, + "learning_rate": 4.9866675219033125e-05, + "loss": 5.762, + "step": 5531 + }, + { + "epoch": 0.03290037111047674, + "grad_norm": 2.327892541885376, + "learning_rate": 4.9866627038869605e-05, + "loss": 6.1023, + "step": 5532 + }, + { + "epoch": 0.03290631839375773, + "grad_norm": 2.71732497215271, + "learning_rate": 4.9866578850025414e-05, + "loss": 6.0739, + "step": 5533 + }, + { + "epoch": 0.03291226567703873, + "grad_norm": 2.1895039081573486, + "learning_rate": 4.9866530652500545e-05, + "loss": 5.801, + "step": 5534 + }, + { + "epoch": 0.03291821296031973, + "grad_norm": 2.39670729637146, + "learning_rate": 4.986648244629503e-05, + "loss": 6.0105, + "step": 5535 + }, + { + "epoch": 0.032924160243600724, + "grad_norm": 2.14630126953125, + "learning_rate": 4.986643423140889e-05, + "loss": 5.8457, + "step": 5536 + }, + { + "epoch": 0.03293010752688172, + "grad_norm": 2.111196994781494, + "learning_rate": 4.9866386007842125e-05, + "loss": 6.0804, + "step": 5537 + }, + { + "epoch": 0.03293605481016272, + "grad_norm": 2.8245434761047363, + "learning_rate": 4.986633777559476e-05, + "loss": 6.3152, + "step": 5538 + }, + { + "epoch": 0.032942002093443716, + "grad_norm": 2.3561060428619385, + "learning_rate": 4.9866289534666824e-05, + "loss": 6.286, + "step": 5539 + }, + { + "epoch": 0.03294794937672471, + "grad_norm": 3.21701979637146, + "learning_rate": 4.986624128505832e-05, + "loss": 5.9775, + "step": 5540 + }, + { + "epoch": 0.032953896660005706, + "grad_norm": 3.9414072036743164, + "learning_rate": 4.9866193026769265e-05, + "loss": 5.9413, + "step": 5541 + }, + { + "epoch": 0.03295984394328671, + "grad_norm": 2.7801051139831543, + "learning_rate": 4.986614475979968e-05, + "loss": 5.8642, + "step": 5542 + }, + { + "epoch": 0.0329657912265677, + "grad_norm": 2.7095935344696045, + "learning_rate": 4.986609648414958e-05, + "loss": 5.6952, + "step": 5543 + }, + { + "epoch": 0.0329717385098487, + "grad_norm": 2.5800812244415283, + "learning_rate": 4.986604819981898e-05, + "loss": 6.0285, + "step": 5544 + }, + { + "epoch": 0.0329776857931297, + "grad_norm": 2.6105730533599854, + "learning_rate": 4.9865999906807904e-05, + "loss": 5.6683, + "step": 5545 + }, + { + "epoch": 0.032983633076410696, + "grad_norm": 2.635570764541626, + "learning_rate": 4.9865951605116366e-05, + "loss": 5.9092, + "step": 5546 + }, + { + "epoch": 0.03298958035969169, + "grad_norm": 2.3708200454711914, + "learning_rate": 4.9865903294744373e-05, + "loss": 6.0034, + "step": 5547 + }, + { + "epoch": 0.03299552764297269, + "grad_norm": 2.437201499938965, + "learning_rate": 4.986585497569196e-05, + "loss": 6.2587, + "step": 5548 + }, + { + "epoch": 0.03300147492625369, + "grad_norm": 2.076016426086426, + "learning_rate": 4.9865806647959126e-05, + "loss": 6.358, + "step": 5549 + }, + { + "epoch": 0.03300742220953468, + "grad_norm": 1.8261257410049438, + "learning_rate": 4.98657583115459e-05, + "loss": 6.0431, + "step": 5550 + }, + { + "epoch": 0.033013369492815685, + "grad_norm": 2.8339858055114746, + "learning_rate": 4.98657099664523e-05, + "loss": 5.7956, + "step": 5551 + }, + { + "epoch": 0.03301931677609668, + "grad_norm": 2.7288596630096436, + "learning_rate": 4.986566161267833e-05, + "loss": 5.7092, + "step": 5552 + }, + { + "epoch": 0.033025264059377675, + "grad_norm": 2.7197329998016357, + "learning_rate": 4.986561325022402e-05, + "loss": 5.649, + "step": 5553 + }, + { + "epoch": 0.03303121134265867, + "grad_norm": 2.6161739826202393, + "learning_rate": 4.986556487908937e-05, + "loss": 5.6935, + "step": 5554 + }, + { + "epoch": 0.03303715862593967, + "grad_norm": 2.695068597793579, + "learning_rate": 4.986551649927441e-05, + "loss": 5.6901, + "step": 5555 + }, + { + "epoch": 0.03304310590922067, + "grad_norm": 3.0315186977386475, + "learning_rate": 4.986546811077917e-05, + "loss": 5.6317, + "step": 5556 + }, + { + "epoch": 0.03304905319250166, + "grad_norm": 2.3597543239593506, + "learning_rate": 4.986541971360364e-05, + "loss": 5.8129, + "step": 5557 + }, + { + "epoch": 0.033055000475782664, + "grad_norm": 2.8090550899505615, + "learning_rate": 4.986537130774785e-05, + "loss": 6.4427, + "step": 5558 + }, + { + "epoch": 0.03306094775906366, + "grad_norm": 3.4232771396636963, + "learning_rate": 4.986532289321182e-05, + "loss": 6.5737, + "step": 5559 + }, + { + "epoch": 0.033066895042344654, + "grad_norm": 2.1425294876098633, + "learning_rate": 4.986527446999556e-05, + "loss": 6.2395, + "step": 5560 + }, + { + "epoch": 0.033072842325625657, + "grad_norm": 2.5348880290985107, + "learning_rate": 4.986522603809909e-05, + "loss": 6.0425, + "step": 5561 + }, + { + "epoch": 0.03307878960890665, + "grad_norm": 3.0824179649353027, + "learning_rate": 4.986517759752242e-05, + "loss": 5.8785, + "step": 5562 + }, + { + "epoch": 0.03308473689218765, + "grad_norm": 2.297706365585327, + "learning_rate": 4.986512914826558e-05, + "loss": 5.8989, + "step": 5563 + }, + { + "epoch": 0.03309068417546865, + "grad_norm": 2.866257667541504, + "learning_rate": 4.986508069032858e-05, + "loss": 5.8905, + "step": 5564 + }, + { + "epoch": 0.033096631458749644, + "grad_norm": 2.2450008392333984, + "learning_rate": 4.9865032223711436e-05, + "loss": 6.3302, + "step": 5565 + }, + { + "epoch": 0.03310257874203064, + "grad_norm": 2.235558271408081, + "learning_rate": 4.9864983748414166e-05, + "loss": 6.4235, + "step": 5566 + }, + { + "epoch": 0.03310852602531164, + "grad_norm": 2.5197713375091553, + "learning_rate": 4.986493526443679e-05, + "loss": 6.3999, + "step": 5567 + }, + { + "epoch": 0.033114473308592636, + "grad_norm": 2.5716195106506348, + "learning_rate": 4.986488677177932e-05, + "loss": 6.0258, + "step": 5568 + }, + { + "epoch": 0.03312042059187363, + "grad_norm": 2.468663454055786, + "learning_rate": 4.986483827044177e-05, + "loss": 6.7553, + "step": 5569 + }, + { + "epoch": 0.033126367875154626, + "grad_norm": 2.4334170818328857, + "learning_rate": 4.986478976042417e-05, + "loss": 6.4722, + "step": 5570 + }, + { + "epoch": 0.03313231515843563, + "grad_norm": 2.234487533569336, + "learning_rate": 4.986474124172652e-05, + "loss": 5.7158, + "step": 5571 + }, + { + "epoch": 0.03313826244171662, + "grad_norm": 2.8017537593841553, + "learning_rate": 4.9864692714348857e-05, + "loss": 5.9552, + "step": 5572 + }, + { + "epoch": 0.03314420972499762, + "grad_norm": 3.171354055404663, + "learning_rate": 4.986464417829118e-05, + "loss": 6.027, + "step": 5573 + }, + { + "epoch": 0.03315015700827862, + "grad_norm": 2.890169620513916, + "learning_rate": 4.9864595633553516e-05, + "loss": 6.2768, + "step": 5574 + }, + { + "epoch": 0.033156104291559615, + "grad_norm": 3.010934829711914, + "learning_rate": 4.986454708013587e-05, + "loss": 6.4054, + "step": 5575 + }, + { + "epoch": 0.03316205157484061, + "grad_norm": 2.143833875656128, + "learning_rate": 4.9864498518038274e-05, + "loss": 6.3771, + "step": 5576 + }, + { + "epoch": 0.03316799885812161, + "grad_norm": 2.2067418098449707, + "learning_rate": 4.986444994726074e-05, + "loss": 6.0158, + "step": 5577 + }, + { + "epoch": 0.03317394614140261, + "grad_norm": 2.3396403789520264, + "learning_rate": 4.986440136780328e-05, + "loss": 6.4286, + "step": 5578 + }, + { + "epoch": 0.0331798934246836, + "grad_norm": 2.8305866718292236, + "learning_rate": 4.9864352779665915e-05, + "loss": 5.7804, + "step": 5579 + }, + { + "epoch": 0.033185840707964605, + "grad_norm": 2.748194456100464, + "learning_rate": 4.9864304182848664e-05, + "loss": 6.1711, + "step": 5580 + }, + { + "epoch": 0.0331917879912456, + "grad_norm": 2.329761505126953, + "learning_rate": 4.9864255577351534e-05, + "loss": 6.2722, + "step": 5581 + }, + { + "epoch": 0.033197735274526595, + "grad_norm": 2.4633524417877197, + "learning_rate": 4.986420696317457e-05, + "loss": 6.1349, + "step": 5582 + }, + { + "epoch": 0.03320368255780759, + "grad_norm": 1.8909802436828613, + "learning_rate": 4.986415834031775e-05, + "loss": 6.2181, + "step": 5583 + }, + { + "epoch": 0.03320962984108859, + "grad_norm": 2.1794517040252686, + "learning_rate": 4.9864109708781104e-05, + "loss": 6.2808, + "step": 5584 + }, + { + "epoch": 0.03321557712436959, + "grad_norm": 2.1766669750213623, + "learning_rate": 4.986406106856466e-05, + "loss": 6.3004, + "step": 5585 + }, + { + "epoch": 0.03322152440765058, + "grad_norm": 2.27526593208313, + "learning_rate": 4.986401241966844e-05, + "loss": 5.9225, + "step": 5586 + }, + { + "epoch": 0.033227471690931584, + "grad_norm": 3.2843096256256104, + "learning_rate": 4.986396376209244e-05, + "loss": 5.8364, + "step": 5587 + }, + { + "epoch": 0.03323341897421258, + "grad_norm": 2.509831666946411, + "learning_rate": 4.9863915095836685e-05, + "loss": 5.6958, + "step": 5588 + }, + { + "epoch": 0.033239366257493574, + "grad_norm": 2.5235815048217773, + "learning_rate": 4.98638664209012e-05, + "loss": 5.4937, + "step": 5589 + }, + { + "epoch": 0.033245313540774576, + "grad_norm": 2.918334484100342, + "learning_rate": 4.986381773728599e-05, + "loss": 5.8284, + "step": 5590 + }, + { + "epoch": 0.03325126082405557, + "grad_norm": 2.8091490268707275, + "learning_rate": 4.986376904499108e-05, + "loss": 5.8126, + "step": 5591 + }, + { + "epoch": 0.03325720810733657, + "grad_norm": 2.555173635482788, + "learning_rate": 4.986372034401649e-05, + "loss": 5.6393, + "step": 5592 + }, + { + "epoch": 0.03326315539061757, + "grad_norm": 2.6366164684295654, + "learning_rate": 4.986367163436223e-05, + "loss": 6.6675, + "step": 5593 + }, + { + "epoch": 0.033269102673898564, + "grad_norm": 2.5691051483154297, + "learning_rate": 4.9863622916028316e-05, + "loss": 6.5808, + "step": 5594 + }, + { + "epoch": 0.03327504995717956, + "grad_norm": 2.239384889602661, + "learning_rate": 4.986357418901477e-05, + "loss": 6.0191, + "step": 5595 + }, + { + "epoch": 0.03328099724046056, + "grad_norm": 2.3877806663513184, + "learning_rate": 4.9863525453321614e-05, + "loss": 5.7429, + "step": 5596 + }, + { + "epoch": 0.033286944523741556, + "grad_norm": 2.559633731842041, + "learning_rate": 4.9863476708948846e-05, + "loss": 5.4866, + "step": 5597 + }, + { + "epoch": 0.03329289180702255, + "grad_norm": 3.7681171894073486, + "learning_rate": 4.98634279558965e-05, + "loss": 5.6139, + "step": 5598 + }, + { + "epoch": 0.033298839090303546, + "grad_norm": 3.999264717102051, + "learning_rate": 4.9863379194164594e-05, + "loss": 5.6031, + "step": 5599 + }, + { + "epoch": 0.03330478637358455, + "grad_norm": 3.1031601428985596, + "learning_rate": 4.986333042375313e-05, + "loss": 5.5397, + "step": 5600 + }, + { + "epoch": 0.03331073365686554, + "grad_norm": 3.104998826980591, + "learning_rate": 4.986328164466214e-05, + "loss": 5.4274, + "step": 5601 + }, + { + "epoch": 0.03331668094014654, + "grad_norm": 2.9426207542419434, + "learning_rate": 4.986323285689163e-05, + "loss": 5.5859, + "step": 5602 + }, + { + "epoch": 0.03332262822342754, + "grad_norm": 2.6912827491760254, + "learning_rate": 4.986318406044163e-05, + "loss": 5.7375, + "step": 5603 + }, + { + "epoch": 0.033328575506708535, + "grad_norm": 4.394237041473389, + "learning_rate": 4.9863135255312145e-05, + "loss": 5.8246, + "step": 5604 + }, + { + "epoch": 0.03333452278998953, + "grad_norm": 2.812197685241699, + "learning_rate": 4.986308644150319e-05, + "loss": 5.6263, + "step": 5605 + }, + { + "epoch": 0.03334047007327053, + "grad_norm": 3.1969878673553467, + "learning_rate": 4.98630376190148e-05, + "loss": 5.4174, + "step": 5606 + }, + { + "epoch": 0.03334641735655153, + "grad_norm": 2.6018595695495605, + "learning_rate": 4.9862988787846975e-05, + "loss": 5.3917, + "step": 5607 + }, + { + "epoch": 0.03335236463983252, + "grad_norm": 2.5274007320404053, + "learning_rate": 4.986293994799974e-05, + "loss": 5.4252, + "step": 5608 + }, + { + "epoch": 0.033358311923113525, + "grad_norm": 2.57043194770813, + "learning_rate": 4.9862891099473105e-05, + "loss": 5.5321, + "step": 5609 + }, + { + "epoch": 0.03336425920639452, + "grad_norm": 3.4353785514831543, + "learning_rate": 4.986284224226709e-05, + "loss": 5.6599, + "step": 5610 + }, + { + "epoch": 0.033370206489675515, + "grad_norm": 3.308945894241333, + "learning_rate": 4.986279337638172e-05, + "loss": 5.8668, + "step": 5611 + }, + { + "epoch": 0.03337615377295652, + "grad_norm": 2.789703607559204, + "learning_rate": 4.9862744501817006e-05, + "loss": 5.8352, + "step": 5612 + }, + { + "epoch": 0.03338210105623751, + "grad_norm": 1.9887118339538574, + "learning_rate": 4.986269561857296e-05, + "loss": 5.7527, + "step": 5613 + }, + { + "epoch": 0.03338804833951851, + "grad_norm": 2.5447990894317627, + "learning_rate": 4.986264672664961e-05, + "loss": 5.5539, + "step": 5614 + }, + { + "epoch": 0.0333939956227995, + "grad_norm": 2.2903668880462646, + "learning_rate": 4.9862597826046965e-05, + "loss": 5.4555, + "step": 5615 + }, + { + "epoch": 0.033399942906080504, + "grad_norm": 3.1669414043426514, + "learning_rate": 4.986254891676504e-05, + "loss": 5.6852, + "step": 5616 + }, + { + "epoch": 0.0334058901893615, + "grad_norm": 3.7491395473480225, + "learning_rate": 4.986249999880386e-05, + "loss": 5.682, + "step": 5617 + }, + { + "epoch": 0.033411837472642494, + "grad_norm": 3.0548582077026367, + "learning_rate": 4.986245107216343e-05, + "loss": 5.7844, + "step": 5618 + }, + { + "epoch": 0.033417784755923496, + "grad_norm": 2.628957509994507, + "learning_rate": 4.986240213684378e-05, + "loss": 5.5646, + "step": 5619 + }, + { + "epoch": 0.03342373203920449, + "grad_norm": 2.050936460494995, + "learning_rate": 4.986235319284492e-05, + "loss": 5.7187, + "step": 5620 + }, + { + "epoch": 0.03342967932248549, + "grad_norm": 2.2839999198913574, + "learning_rate": 4.986230424016688e-05, + "loss": 5.6613, + "step": 5621 + }, + { + "epoch": 0.03343562660576649, + "grad_norm": 2.177778959274292, + "learning_rate": 4.986225527880966e-05, + "loss": 5.7205, + "step": 5622 + }, + { + "epoch": 0.033441573889047484, + "grad_norm": 2.1690266132354736, + "learning_rate": 4.9862206308773286e-05, + "loss": 5.4344, + "step": 5623 + }, + { + "epoch": 0.03344752117232848, + "grad_norm": 2.0134127140045166, + "learning_rate": 4.9862157330057766e-05, + "loss": 5.7872, + "step": 5624 + }, + { + "epoch": 0.03345346845560948, + "grad_norm": 2.0246710777282715, + "learning_rate": 4.986210834266313e-05, + "loss": 5.3291, + "step": 5625 + }, + { + "epoch": 0.033459415738890476, + "grad_norm": 2.020939350128174, + "learning_rate": 4.986205934658939e-05, + "loss": 5.3966, + "step": 5626 + }, + { + "epoch": 0.03346536302217147, + "grad_norm": 2.3261308670043945, + "learning_rate": 4.986201034183655e-05, + "loss": 5.4667, + "step": 5627 + }, + { + "epoch": 0.033471310305452466, + "grad_norm": 2.135641574859619, + "learning_rate": 4.9861961328404646e-05, + "loss": 5.4925, + "step": 5628 + }, + { + "epoch": 0.03347725758873347, + "grad_norm": 2.3122894763946533, + "learning_rate": 4.986191230629369e-05, + "loss": 5.6665, + "step": 5629 + }, + { + "epoch": 0.03348320487201446, + "grad_norm": 2.4461214542388916, + "learning_rate": 4.98618632755037e-05, + "loss": 5.8442, + "step": 5630 + }, + { + "epoch": 0.03348915215529546, + "grad_norm": 2.189009189605713, + "learning_rate": 4.9861814236034685e-05, + "loss": 5.5793, + "step": 5631 + }, + { + "epoch": 0.03349509943857646, + "grad_norm": 2.1961586475372314, + "learning_rate": 4.986176518788667e-05, + "loss": 5.5364, + "step": 5632 + }, + { + "epoch": 0.033501046721857455, + "grad_norm": 2.120177745819092, + "learning_rate": 4.986171613105967e-05, + "loss": 5.4042, + "step": 5633 + }, + { + "epoch": 0.03350699400513845, + "grad_norm": 1.9021252393722534, + "learning_rate": 4.9861667065553696e-05, + "loss": 5.2665, + "step": 5634 + }, + { + "epoch": 0.03351294128841945, + "grad_norm": 1.8944766521453857, + "learning_rate": 4.986161799136878e-05, + "loss": 5.3853, + "step": 5635 + }, + { + "epoch": 0.03351888857170045, + "grad_norm": 2.059847354888916, + "learning_rate": 4.9861568908504916e-05, + "loss": 5.3046, + "step": 5636 + }, + { + "epoch": 0.03352483585498144, + "grad_norm": 2.1350111961364746, + "learning_rate": 4.9861519816962155e-05, + "loss": 5.3684, + "step": 5637 + }, + { + "epoch": 0.033530783138262445, + "grad_norm": 2.0733792781829834, + "learning_rate": 4.986147071674048e-05, + "loss": 5.4581, + "step": 5638 + }, + { + "epoch": 0.03353673042154344, + "grad_norm": 2.0736827850341797, + "learning_rate": 4.986142160783993e-05, + "loss": 5.7019, + "step": 5639 + }, + { + "epoch": 0.033542677704824435, + "grad_norm": 2.1903107166290283, + "learning_rate": 4.986137249026051e-05, + "loss": 5.4353, + "step": 5640 + }, + { + "epoch": 0.03354862498810544, + "grad_norm": 2.2678940296173096, + "learning_rate": 4.9861323364002244e-05, + "loss": 5.4951, + "step": 5641 + }, + { + "epoch": 0.03355457227138643, + "grad_norm": 3.590702772140503, + "learning_rate": 4.9861274229065145e-05, + "loss": 6.1522, + "step": 5642 + }, + { + "epoch": 0.03356051955466743, + "grad_norm": 2.0955893993377686, + "learning_rate": 4.9861225085449224e-05, + "loss": 5.3544, + "step": 5643 + }, + { + "epoch": 0.03356646683794842, + "grad_norm": 1.9370301961898804, + "learning_rate": 4.986117593315452e-05, + "loss": 5.4732, + "step": 5644 + }, + { + "epoch": 0.033572414121229424, + "grad_norm": 2.141752243041992, + "learning_rate": 4.986112677218103e-05, + "loss": 5.5768, + "step": 5645 + }, + { + "epoch": 0.03357836140451042, + "grad_norm": 1.9236360788345337, + "learning_rate": 4.986107760252878e-05, + "loss": 5.7641, + "step": 5646 + }, + { + "epoch": 0.033584308687791414, + "grad_norm": 1.8353725671768188, + "learning_rate": 4.9861028424197785e-05, + "loss": 5.8011, + "step": 5647 + }, + { + "epoch": 0.033590255971072416, + "grad_norm": 2.0918078422546387, + "learning_rate": 4.9860979237188055e-05, + "loss": 5.6862, + "step": 5648 + }, + { + "epoch": 0.03359620325435341, + "grad_norm": 2.2244462966918945, + "learning_rate": 4.986093004149962e-05, + "loss": 5.472, + "step": 5649 + }, + { + "epoch": 0.033602150537634407, + "grad_norm": 2.1517422199249268, + "learning_rate": 4.9860880837132495e-05, + "loss": 5.3655, + "step": 5650 + }, + { + "epoch": 0.03360809782091541, + "grad_norm": 2.241863489151001, + "learning_rate": 4.986083162408669e-05, + "loss": 5.5385, + "step": 5651 + }, + { + "epoch": 0.033614045104196404, + "grad_norm": 2.458171844482422, + "learning_rate": 4.986078240236222e-05, + "loss": 5.5531, + "step": 5652 + }, + { + "epoch": 0.0336199923874774, + "grad_norm": 2.2601864337921143, + "learning_rate": 4.986073317195911e-05, + "loss": 5.9313, + "step": 5653 + }, + { + "epoch": 0.0336259396707584, + "grad_norm": 2.243647575378418, + "learning_rate": 4.986068393287738e-05, + "loss": 5.4064, + "step": 5654 + }, + { + "epoch": 0.033631886954039396, + "grad_norm": 2.283515453338623, + "learning_rate": 4.986063468511704e-05, + "loss": 5.295, + "step": 5655 + }, + { + "epoch": 0.03363783423732039, + "grad_norm": 2.701770305633545, + "learning_rate": 4.986058542867811e-05, + "loss": 5.8548, + "step": 5656 + }, + { + "epoch": 0.033643781520601386, + "grad_norm": 2.8186864852905273, + "learning_rate": 4.98605361635606e-05, + "loss": 5.378, + "step": 5657 + }, + { + "epoch": 0.03364972880388239, + "grad_norm": 2.6508500576019287, + "learning_rate": 4.9860486889764536e-05, + "loss": 5.469, + "step": 5658 + }, + { + "epoch": 0.03365567608716338, + "grad_norm": 2.3984878063201904, + "learning_rate": 4.986043760728994e-05, + "loss": 5.3978, + "step": 5659 + }, + { + "epoch": 0.03366162337044438, + "grad_norm": 3.64663028717041, + "learning_rate": 4.9860388316136814e-05, + "loss": 5.502, + "step": 5660 + }, + { + "epoch": 0.03366757065372538, + "grad_norm": 3.1112046241760254, + "learning_rate": 4.986033901630519e-05, + "loss": 5.7347, + "step": 5661 + }, + { + "epoch": 0.033673517937006375, + "grad_norm": 2.619877338409424, + "learning_rate": 4.9860289707795074e-05, + "loss": 6.2099, + "step": 5662 + }, + { + "epoch": 0.03367946522028737, + "grad_norm": 2.0318470001220703, + "learning_rate": 4.986024039060648e-05, + "loss": 6.246, + "step": 5663 + }, + { + "epoch": 0.03368541250356837, + "grad_norm": 2.1484673023223877, + "learning_rate": 4.986019106473945e-05, + "loss": 6.1689, + "step": 5664 + }, + { + "epoch": 0.03369135978684937, + "grad_norm": 2.6159844398498535, + "learning_rate": 4.9860141730193974e-05, + "loss": 5.8217, + "step": 5665 + }, + { + "epoch": 0.03369730707013036, + "grad_norm": 2.5019965171813965, + "learning_rate": 4.9860092386970084e-05, + "loss": 6.1138, + "step": 5666 + }, + { + "epoch": 0.033703254353411365, + "grad_norm": 2.962315797805786, + "learning_rate": 4.9860043035067785e-05, + "loss": 5.7057, + "step": 5667 + }, + { + "epoch": 0.03370920163669236, + "grad_norm": 2.455721139907837, + "learning_rate": 4.9859993674487106e-05, + "loss": 5.6203, + "step": 5668 + }, + { + "epoch": 0.033715148919973355, + "grad_norm": 2.432368278503418, + "learning_rate": 4.9859944305228066e-05, + "loss": 6.2337, + "step": 5669 + }, + { + "epoch": 0.03372109620325436, + "grad_norm": 2.3222782611846924, + "learning_rate": 4.985989492729067e-05, + "loss": 6.2845, + "step": 5670 + }, + { + "epoch": 0.03372704348653535, + "grad_norm": 2.107440948486328, + "learning_rate": 4.985984554067494e-05, + "loss": 6.2404, + "step": 5671 + }, + { + "epoch": 0.03373299076981635, + "grad_norm": 1.9450268745422363, + "learning_rate": 4.98597961453809e-05, + "loss": 6.1679, + "step": 5672 + }, + { + "epoch": 0.03373893805309734, + "grad_norm": 1.7591795921325684, + "learning_rate": 4.9859746741408554e-05, + "loss": 6.3425, + "step": 5673 + }, + { + "epoch": 0.033744885336378344, + "grad_norm": 2.009420871734619, + "learning_rate": 4.985969732875794e-05, + "loss": 6.3607, + "step": 5674 + }, + { + "epoch": 0.03375083261965934, + "grad_norm": 2.097215175628662, + "learning_rate": 4.9859647907429054e-05, + "loss": 6.2009, + "step": 5675 + }, + { + "epoch": 0.033756779902940334, + "grad_norm": 1.7670379877090454, + "learning_rate": 4.985959847742192e-05, + "loss": 5.935, + "step": 5676 + }, + { + "epoch": 0.033762727186221336, + "grad_norm": 2.052022695541382, + "learning_rate": 4.985954903873656e-05, + "loss": 5.4054, + "step": 5677 + }, + { + "epoch": 0.03376867446950233, + "grad_norm": 1.9225167036056519, + "learning_rate": 4.985949959137298e-05, + "loss": 5.6905, + "step": 5678 + }, + { + "epoch": 0.033774621752783326, + "grad_norm": 2.4080653190612793, + "learning_rate": 4.985945013533122e-05, + "loss": 6.5566, + "step": 5679 + }, + { + "epoch": 0.03378056903606433, + "grad_norm": 2.8340251445770264, + "learning_rate": 4.985940067061128e-05, + "loss": 6.3556, + "step": 5680 + }, + { + "epoch": 0.033786516319345324, + "grad_norm": 2.2872672080993652, + "learning_rate": 4.985935119721317e-05, + "loss": 6.1806, + "step": 5681 + }, + { + "epoch": 0.03379246360262632, + "grad_norm": 3.309203863143921, + "learning_rate": 4.985930171513692e-05, + "loss": 6.1766, + "step": 5682 + }, + { + "epoch": 0.03379841088590732, + "grad_norm": 2.936709403991699, + "learning_rate": 4.985925222438255e-05, + "loss": 5.907, + "step": 5683 + }, + { + "epoch": 0.033804358169188316, + "grad_norm": 2.3226964473724365, + "learning_rate": 4.985920272495007e-05, + "loss": 5.5734, + "step": 5684 + }, + { + "epoch": 0.03381030545246931, + "grad_norm": 2.3053154945373535, + "learning_rate": 4.98591532168395e-05, + "loss": 6.5688, + "step": 5685 + }, + { + "epoch": 0.033816252735750306, + "grad_norm": 2.2494077682495117, + "learning_rate": 4.985910370005086e-05, + "loss": 6.3539, + "step": 5686 + }, + { + "epoch": 0.03382220001903131, + "grad_norm": 1.9559924602508545, + "learning_rate": 4.9859054174584155e-05, + "loss": 6.2015, + "step": 5687 + }, + { + "epoch": 0.0338281473023123, + "grad_norm": 2.7915425300598145, + "learning_rate": 4.985900464043942e-05, + "loss": 5.7426, + "step": 5688 + }, + { + "epoch": 0.0338340945855933, + "grad_norm": 2.448496103286743, + "learning_rate": 4.985895509761665e-05, + "loss": 6.2697, + "step": 5689 + }, + { + "epoch": 0.0338400418688743, + "grad_norm": 1.7736696004867554, + "learning_rate": 4.9858905546115885e-05, + "loss": 6.5513, + "step": 5690 + }, + { + "epoch": 0.033845989152155295, + "grad_norm": 1.668285608291626, + "learning_rate": 4.9858855985937136e-05, + "loss": 6.0179, + "step": 5691 + }, + { + "epoch": 0.03385193643543629, + "grad_norm": 2.157799243927002, + "learning_rate": 4.985880641708042e-05, + "loss": 6.1863, + "step": 5692 + }, + { + "epoch": 0.03385788371871729, + "grad_norm": 2.2437758445739746, + "learning_rate": 4.985875683954574e-05, + "loss": 6.128, + "step": 5693 + }, + { + "epoch": 0.03386383100199829, + "grad_norm": 2.8323628902435303, + "learning_rate": 4.9858707253333124e-05, + "loss": 6.2746, + "step": 5694 + }, + { + "epoch": 0.03386977828527928, + "grad_norm": 2.270587205886841, + "learning_rate": 4.98586576584426e-05, + "loss": 6.1002, + "step": 5695 + }, + { + "epoch": 0.033875725568560285, + "grad_norm": 1.9165533781051636, + "learning_rate": 4.985860805487417e-05, + "loss": 5.7016, + "step": 5696 + }, + { + "epoch": 0.03388167285184128, + "grad_norm": 2.230407953262329, + "learning_rate": 4.985855844262786e-05, + "loss": 5.9649, + "step": 5697 + }, + { + "epoch": 0.033887620135122275, + "grad_norm": 2.5094211101531982, + "learning_rate": 4.985850882170368e-05, + "loss": 6.0184, + "step": 5698 + }, + { + "epoch": 0.03389356741840328, + "grad_norm": 2.6195943355560303, + "learning_rate": 4.9858459192101656e-05, + "loss": 5.8501, + "step": 5699 + }, + { + "epoch": 0.03389951470168427, + "grad_norm": 2.747486114501953, + "learning_rate": 4.9858409553821794e-05, + "loss": 5.7066, + "step": 5700 + }, + { + "epoch": 0.03390546198496527, + "grad_norm": 2.154109001159668, + "learning_rate": 4.985835990686413e-05, + "loss": 6.1072, + "step": 5701 + }, + { + "epoch": 0.03391140926824626, + "grad_norm": 2.4329216480255127, + "learning_rate": 4.9858310251228655e-05, + "loss": 5.9552, + "step": 5702 + }, + { + "epoch": 0.033917356551527264, + "grad_norm": 2.4760935306549072, + "learning_rate": 4.9858260586915405e-05, + "loss": 5.9023, + "step": 5703 + }, + { + "epoch": 0.03392330383480826, + "grad_norm": 2.400474786758423, + "learning_rate": 4.9858210913924397e-05, + "loss": 6.1688, + "step": 5704 + }, + { + "epoch": 0.033929251118089254, + "grad_norm": 2.402930498123169, + "learning_rate": 4.9858161232255644e-05, + "loss": 6.0776, + "step": 5705 + }, + { + "epoch": 0.033935198401370256, + "grad_norm": 2.0408313274383545, + "learning_rate": 4.985811154190916e-05, + "loss": 6.1841, + "step": 5706 + }, + { + "epoch": 0.03394114568465125, + "grad_norm": 1.889190912246704, + "learning_rate": 4.9858061842884976e-05, + "loss": 5.9689, + "step": 5707 + }, + { + "epoch": 0.033947092967932246, + "grad_norm": 2.2231624126434326, + "learning_rate": 4.9858012135183086e-05, + "loss": 6.0009, + "step": 5708 + }, + { + "epoch": 0.03395304025121325, + "grad_norm": 2.0229554176330566, + "learning_rate": 4.985796241880353e-05, + "loss": 6.3237, + "step": 5709 + }, + { + "epoch": 0.033958987534494244, + "grad_norm": 2.0570971965789795, + "learning_rate": 4.985791269374631e-05, + "loss": 6.3104, + "step": 5710 + }, + { + "epoch": 0.03396493481777524, + "grad_norm": 2.584663152694702, + "learning_rate": 4.9857862960011454e-05, + "loss": 5.8493, + "step": 5711 + }, + { + "epoch": 0.03397088210105624, + "grad_norm": 1.7870328426361084, + "learning_rate": 4.985781321759897e-05, + "loss": 6.2321, + "step": 5712 + }, + { + "epoch": 0.033976829384337236, + "grad_norm": 2.201756000518799, + "learning_rate": 4.9857763466508886e-05, + "loss": 6.1936, + "step": 5713 + }, + { + "epoch": 0.03398277666761823, + "grad_norm": 2.4489476680755615, + "learning_rate": 4.9857713706741216e-05, + "loss": 6.11, + "step": 5714 + }, + { + "epoch": 0.033988723950899226, + "grad_norm": 2.007643461227417, + "learning_rate": 4.9857663938295964e-05, + "loss": 6.288, + "step": 5715 + }, + { + "epoch": 0.03399467123418023, + "grad_norm": 1.8299764394760132, + "learning_rate": 4.9857614161173165e-05, + "loss": 6.0719, + "step": 5716 + }, + { + "epoch": 0.03400061851746122, + "grad_norm": 1.7619884014129639, + "learning_rate": 4.985756437537283e-05, + "loss": 6.1418, + "step": 5717 + }, + { + "epoch": 0.03400656580074222, + "grad_norm": 1.9445360898971558, + "learning_rate": 4.985751458089498e-05, + "loss": 6.1223, + "step": 5718 + }, + { + "epoch": 0.03401251308402322, + "grad_norm": 2.2320010662078857, + "learning_rate": 4.985746477773962e-05, + "loss": 5.5239, + "step": 5719 + }, + { + "epoch": 0.034018460367304215, + "grad_norm": 2.631765365600586, + "learning_rate": 4.985741496590678e-05, + "loss": 5.6348, + "step": 5720 + }, + { + "epoch": 0.03402440765058521, + "grad_norm": 2.4715576171875, + "learning_rate": 4.985736514539647e-05, + "loss": 5.9608, + "step": 5721 + }, + { + "epoch": 0.03403035493386621, + "grad_norm": 2.633188009262085, + "learning_rate": 4.985731531620871e-05, + "loss": 5.602, + "step": 5722 + }, + { + "epoch": 0.03403630221714721, + "grad_norm": 2.4303035736083984, + "learning_rate": 4.9857265478343526e-05, + "loss": 5.495, + "step": 5723 + }, + { + "epoch": 0.0340422495004282, + "grad_norm": 2.463447332382202, + "learning_rate": 4.985721563180092e-05, + "loss": 5.4633, + "step": 5724 + }, + { + "epoch": 0.034048196783709204, + "grad_norm": 2.349965810775757, + "learning_rate": 4.985716577658092e-05, + "loss": 6.0067, + "step": 5725 + }, + { + "epoch": 0.0340541440669902, + "grad_norm": 1.8741793632507324, + "learning_rate": 4.985711591268354e-05, + "loss": 5.8658, + "step": 5726 + }, + { + "epoch": 0.034060091350271195, + "grad_norm": 1.957612156867981, + "learning_rate": 4.98570660401088e-05, + "loss": 6.2016, + "step": 5727 + }, + { + "epoch": 0.0340660386335522, + "grad_norm": 2.4883556365966797, + "learning_rate": 4.985701615885671e-05, + "loss": 6.3056, + "step": 5728 + }, + { + "epoch": 0.03407198591683319, + "grad_norm": 2.6959800720214844, + "learning_rate": 4.98569662689273e-05, + "loss": 5.7267, + "step": 5729 + }, + { + "epoch": 0.03407793320011419, + "grad_norm": 2.579802989959717, + "learning_rate": 4.985691637032057e-05, + "loss": 5.2467, + "step": 5730 + }, + { + "epoch": 0.03408388048339518, + "grad_norm": 2.136262893676758, + "learning_rate": 4.985686646303656e-05, + "loss": 5.7071, + "step": 5731 + }, + { + "epoch": 0.034089827766676184, + "grad_norm": 2.1442244052886963, + "learning_rate": 4.985681654707526e-05, + "loss": 6.3961, + "step": 5732 + }, + { + "epoch": 0.03409577504995718, + "grad_norm": 2.164340019226074, + "learning_rate": 4.9856766622436714e-05, + "loss": 6.2455, + "step": 5733 + }, + { + "epoch": 0.034101722333238174, + "grad_norm": 2.199791193008423, + "learning_rate": 4.985671668912092e-05, + "loss": 5.8804, + "step": 5734 + }, + { + "epoch": 0.034107669616519176, + "grad_norm": 2.0359933376312256, + "learning_rate": 4.9856666747127905e-05, + "loss": 6.359, + "step": 5735 + }, + { + "epoch": 0.03411361689980017, + "grad_norm": 2.17069935798645, + "learning_rate": 4.985661679645769e-05, + "loss": 6.6736, + "step": 5736 + }, + { + "epoch": 0.034119564183081166, + "grad_norm": 1.9114634990692139, + "learning_rate": 4.9856566837110275e-05, + "loss": 5.9629, + "step": 5737 + }, + { + "epoch": 0.03412551146636217, + "grad_norm": 2.2872474193573, + "learning_rate": 4.9856516869085704e-05, + "loss": 5.5856, + "step": 5738 + }, + { + "epoch": 0.03413145874964316, + "grad_norm": 2.0800466537475586, + "learning_rate": 4.9856466892383965e-05, + "loss": 5.7732, + "step": 5739 + }, + { + "epoch": 0.03413740603292416, + "grad_norm": 2.37117338180542, + "learning_rate": 4.98564169070051e-05, + "loss": 5.667, + "step": 5740 + }, + { + "epoch": 0.03414335331620516, + "grad_norm": 2.0559768676757812, + "learning_rate": 4.985636691294911e-05, + "loss": 5.4874, + "step": 5741 + }, + { + "epoch": 0.034149300599486156, + "grad_norm": 2.0097250938415527, + "learning_rate": 4.9856316910216024e-05, + "loss": 5.5469, + "step": 5742 + }, + { + "epoch": 0.03415524788276715, + "grad_norm": 2.430954933166504, + "learning_rate": 4.985626689880586e-05, + "loss": 5.7635, + "step": 5743 + }, + { + "epoch": 0.034161195166048146, + "grad_norm": 2.1000874042510986, + "learning_rate": 4.985621687871862e-05, + "loss": 5.7102, + "step": 5744 + }, + { + "epoch": 0.03416714244932915, + "grad_norm": 2.2048611640930176, + "learning_rate": 4.9856166849954336e-05, + "loss": 5.8156, + "step": 5745 + }, + { + "epoch": 0.03417308973261014, + "grad_norm": 2.145538330078125, + "learning_rate": 4.985611681251302e-05, + "loss": 5.9101, + "step": 5746 + }, + { + "epoch": 0.03417903701589114, + "grad_norm": 2.86169695854187, + "learning_rate": 4.9856066766394685e-05, + "loss": 5.7358, + "step": 5747 + }, + { + "epoch": 0.03418498429917214, + "grad_norm": 2.0648229122161865, + "learning_rate": 4.985601671159936e-05, + "loss": 6.0529, + "step": 5748 + }, + { + "epoch": 0.034190931582453135, + "grad_norm": 2.191251039505005, + "learning_rate": 4.985596664812706e-05, + "loss": 6.1999, + "step": 5749 + }, + { + "epoch": 0.03419687886573413, + "grad_norm": 2.556640148162842, + "learning_rate": 4.985591657597779e-05, + "loss": 6.0671, + "step": 5750 + }, + { + "epoch": 0.03420282614901513, + "grad_norm": 2.1796281337738037, + "learning_rate": 4.985586649515158e-05, + "loss": 6.1537, + "step": 5751 + }, + { + "epoch": 0.03420877343229613, + "grad_norm": 2.1884169578552246, + "learning_rate": 4.985581640564845e-05, + "loss": 5.7667, + "step": 5752 + }, + { + "epoch": 0.03421472071557712, + "grad_norm": 2.3836331367492676, + "learning_rate": 4.9855766307468404e-05, + "loss": 5.6608, + "step": 5753 + }, + { + "epoch": 0.034220667998858124, + "grad_norm": 2.0464322566986084, + "learning_rate": 4.985571620061147e-05, + "loss": 5.5317, + "step": 5754 + }, + { + "epoch": 0.03422661528213912, + "grad_norm": 2.3275644779205322, + "learning_rate": 4.9855666085077654e-05, + "loss": 5.8611, + "step": 5755 + }, + { + "epoch": 0.034232562565420115, + "grad_norm": 2.7268338203430176, + "learning_rate": 4.9855615960867e-05, + "loss": 5.6323, + "step": 5756 + }, + { + "epoch": 0.03423850984870112, + "grad_norm": 2.578986406326294, + "learning_rate": 4.985556582797949e-05, + "loss": 5.6108, + "step": 5757 + }, + { + "epoch": 0.03424445713198211, + "grad_norm": 2.4127955436706543, + "learning_rate": 4.985551568641516e-05, + "loss": 5.7054, + "step": 5758 + }, + { + "epoch": 0.03425040441526311, + "grad_norm": 2.1954357624053955, + "learning_rate": 4.985546553617404e-05, + "loss": 6.194, + "step": 5759 + }, + { + "epoch": 0.0342563516985441, + "grad_norm": 2.43851900100708, + "learning_rate": 4.985541537725612e-05, + "loss": 5.9067, + "step": 5760 + }, + { + "epoch": 0.034262298981825104, + "grad_norm": 2.0910801887512207, + "learning_rate": 4.9855365209661445e-05, + "loss": 6.1017, + "step": 5761 + }, + { + "epoch": 0.0342682462651061, + "grad_norm": 1.9936187267303467, + "learning_rate": 4.985531503339e-05, + "loss": 6.1239, + "step": 5762 + }, + { + "epoch": 0.034274193548387094, + "grad_norm": 2.0663299560546875, + "learning_rate": 4.985526484844183e-05, + "loss": 6.0514, + "step": 5763 + }, + { + "epoch": 0.034280140831668096, + "grad_norm": 2.4357266426086426, + "learning_rate": 4.985521465481695e-05, + "loss": 5.3695, + "step": 5764 + }, + { + "epoch": 0.03428608811494909, + "grad_norm": 2.12214994430542, + "learning_rate": 4.985516445251537e-05, + "loss": 5.5531, + "step": 5765 + }, + { + "epoch": 0.034292035398230086, + "grad_norm": 2.731661319732666, + "learning_rate": 4.9855114241537105e-05, + "loss": 6.2403, + "step": 5766 + }, + { + "epoch": 0.03429798268151109, + "grad_norm": 2.0668931007385254, + "learning_rate": 4.985506402188217e-05, + "loss": 6.0873, + "step": 5767 + }, + { + "epoch": 0.03430392996479208, + "grad_norm": 2.3165833950042725, + "learning_rate": 4.98550137935506e-05, + "loss": 5.9365, + "step": 5768 + }, + { + "epoch": 0.03430987724807308, + "grad_norm": 1.8637720346450806, + "learning_rate": 4.98549635565424e-05, + "loss": 6.0837, + "step": 5769 + }, + { + "epoch": 0.03431582453135408, + "grad_norm": 2.1689205169677734, + "learning_rate": 4.985491331085758e-05, + "loss": 5.703, + "step": 5770 + }, + { + "epoch": 0.034321771814635076, + "grad_norm": 2.245283365249634, + "learning_rate": 4.985486305649618e-05, + "loss": 6.0134, + "step": 5771 + }, + { + "epoch": 0.03432771909791607, + "grad_norm": 2.2685303688049316, + "learning_rate": 4.98548127934582e-05, + "loss": 5.279, + "step": 5772 + }, + { + "epoch": 0.034333666381197066, + "grad_norm": 2.376253128051758, + "learning_rate": 4.985476252174365e-05, + "loss": 5.5812, + "step": 5773 + }, + { + "epoch": 0.03433961366447807, + "grad_norm": 2.2636559009552, + "learning_rate": 4.985471224135257e-05, + "loss": 5.6906, + "step": 5774 + }, + { + "epoch": 0.03434556094775906, + "grad_norm": 2.22103214263916, + "learning_rate": 4.9854661952284965e-05, + "loss": 6.2066, + "step": 5775 + }, + { + "epoch": 0.03435150823104006, + "grad_norm": 2.308610439300537, + "learning_rate": 4.985461165454085e-05, + "loss": 6.1582, + "step": 5776 + }, + { + "epoch": 0.03435745551432106, + "grad_norm": 1.9191935062408447, + "learning_rate": 4.985456134812026e-05, + "loss": 5.4587, + "step": 5777 + }, + { + "epoch": 0.034363402797602055, + "grad_norm": 2.3127100467681885, + "learning_rate": 4.9854511033023184e-05, + "loss": 5.3375, + "step": 5778 + }, + { + "epoch": 0.03436935008088305, + "grad_norm": 2.4817371368408203, + "learning_rate": 4.985446070924966e-05, + "loss": 5.4961, + "step": 5779 + }, + { + "epoch": 0.03437529736416405, + "grad_norm": 2.0995922088623047, + "learning_rate": 4.9854410376799695e-05, + "loss": 5.7676, + "step": 5780 + }, + { + "epoch": 0.03438124464744505, + "grad_norm": 2.261229991912842, + "learning_rate": 4.985436003567332e-05, + "loss": 5.4446, + "step": 5781 + }, + { + "epoch": 0.03438719193072604, + "grad_norm": 2.275536060333252, + "learning_rate": 4.985430968587055e-05, + "loss": 5.4297, + "step": 5782 + }, + { + "epoch": 0.034393139214007044, + "grad_norm": 2.3733773231506348, + "learning_rate": 4.985425932739138e-05, + "loss": 5.7658, + "step": 5783 + }, + { + "epoch": 0.03439908649728804, + "grad_norm": 2.201716184616089, + "learning_rate": 4.985420896023586e-05, + "loss": 5.5502, + "step": 5784 + }, + { + "epoch": 0.034405033780569035, + "grad_norm": 2.1012730598449707, + "learning_rate": 4.9854158584403985e-05, + "loss": 5.7199, + "step": 5785 + }, + { + "epoch": 0.03441098106385004, + "grad_norm": 2.065568685531616, + "learning_rate": 4.985410819989579e-05, + "loss": 6.1547, + "step": 5786 + }, + { + "epoch": 0.03441692834713103, + "grad_norm": 1.9217867851257324, + "learning_rate": 4.9854057806711275e-05, + "loss": 6.2556, + "step": 5787 + }, + { + "epoch": 0.03442287563041203, + "grad_norm": 2.028602123260498, + "learning_rate": 4.985400740485047e-05, + "loss": 5.9347, + "step": 5788 + }, + { + "epoch": 0.03442882291369302, + "grad_norm": 2.002855062484741, + "learning_rate": 4.9853956994313376e-05, + "loss": 5.3966, + "step": 5789 + }, + { + "epoch": 0.034434770196974024, + "grad_norm": 2.3740642070770264, + "learning_rate": 4.985390657510003e-05, + "loss": 5.7801, + "step": 5790 + }, + { + "epoch": 0.03444071748025502, + "grad_norm": 2.1149635314941406, + "learning_rate": 4.9853856147210444e-05, + "loss": 5.6504, + "step": 5791 + }, + { + "epoch": 0.034446664763536014, + "grad_norm": 2.3519630432128906, + "learning_rate": 4.985380571064463e-05, + "loss": 5.9172, + "step": 5792 + }, + { + "epoch": 0.034452612046817016, + "grad_norm": 2.38930082321167, + "learning_rate": 4.985375526540261e-05, + "loss": 5.6196, + "step": 5793 + }, + { + "epoch": 0.03445855933009801, + "grad_norm": 2.245596408843994, + "learning_rate": 4.98537048114844e-05, + "loss": 5.5034, + "step": 5794 + }, + { + "epoch": 0.034464506613379006, + "grad_norm": 2.272158622741699, + "learning_rate": 4.985365434889002e-05, + "loss": 5.5867, + "step": 5795 + }, + { + "epoch": 0.03447045389666001, + "grad_norm": 2.2090094089508057, + "learning_rate": 4.9853603877619485e-05, + "loss": 5.68, + "step": 5796 + }, + { + "epoch": 0.034476401179941, + "grad_norm": 2.0545220375061035, + "learning_rate": 4.985355339767281e-05, + "loss": 5.8382, + "step": 5797 + }, + { + "epoch": 0.034482348463222, + "grad_norm": 2.143134593963623, + "learning_rate": 4.985350290905003e-05, + "loss": 5.5753, + "step": 5798 + }, + { + "epoch": 0.034488295746503, + "grad_norm": 2.3938257694244385, + "learning_rate": 4.985345241175114e-05, + "loss": 5.7545, + "step": 5799 + }, + { + "epoch": 0.034494243029783996, + "grad_norm": 2.132998466491699, + "learning_rate": 4.985340190577616e-05, + "loss": 5.5477, + "step": 5800 + }, + { + "epoch": 0.03450019031306499, + "grad_norm": 3.141417980194092, + "learning_rate": 4.9853351391125126e-05, + "loss": 5.3509, + "step": 5801 + }, + { + "epoch": 0.034506137596345986, + "grad_norm": 2.4776933193206787, + "learning_rate": 4.9853300867798034e-05, + "loss": 6.1052, + "step": 5802 + }, + { + "epoch": 0.03451208487962699, + "grad_norm": 2.1782073974609375, + "learning_rate": 4.985325033579492e-05, + "loss": 5.9599, + "step": 5803 + }, + { + "epoch": 0.03451803216290798, + "grad_norm": 2.2631704807281494, + "learning_rate": 4.9853199795115794e-05, + "loss": 5.534, + "step": 5804 + }, + { + "epoch": 0.03452397944618898, + "grad_norm": 2.140612840652466, + "learning_rate": 4.985314924576066e-05, + "loss": 5.7479, + "step": 5805 + }, + { + "epoch": 0.03452992672946998, + "grad_norm": 2.726651668548584, + "learning_rate": 4.9853098687729563e-05, + "loss": 5.4639, + "step": 5806 + }, + { + "epoch": 0.034535874012750975, + "grad_norm": 1.852423071861267, + "learning_rate": 4.985304812102249e-05, + "loss": 5.4209, + "step": 5807 + }, + { + "epoch": 0.03454182129603197, + "grad_norm": 2.5236833095550537, + "learning_rate": 4.9852997545639485e-05, + "loss": 5.9653, + "step": 5808 + }, + { + "epoch": 0.03454776857931297, + "grad_norm": 2.2740652561187744, + "learning_rate": 4.985294696158056e-05, + "loss": 5.9457, + "step": 5809 + }, + { + "epoch": 0.03455371586259397, + "grad_norm": 2.931777000427246, + "learning_rate": 4.9852896368845715e-05, + "loss": 5.6709, + "step": 5810 + }, + { + "epoch": 0.03455966314587496, + "grad_norm": 2.6981759071350098, + "learning_rate": 4.9852845767434986e-05, + "loss": 5.1747, + "step": 5811 + }, + { + "epoch": 0.034565610429155964, + "grad_norm": 2.2675211429595947, + "learning_rate": 4.985279515734839e-05, + "loss": 5.2393, + "step": 5812 + }, + { + "epoch": 0.03457155771243696, + "grad_norm": 2.535473346710205, + "learning_rate": 4.985274453858594e-05, + "loss": 6.2184, + "step": 5813 + }, + { + "epoch": 0.034577504995717954, + "grad_norm": 2.8692495822906494, + "learning_rate": 4.985269391114765e-05, + "loss": 5.2557, + "step": 5814 + }, + { + "epoch": 0.034583452278998957, + "grad_norm": 2.908472776412964, + "learning_rate": 4.985264327503354e-05, + "loss": 5.1559, + "step": 5815 + }, + { + "epoch": 0.03458939956227995, + "grad_norm": 2.3630192279815674, + "learning_rate": 4.985259263024363e-05, + "loss": 5.3159, + "step": 5816 + }, + { + "epoch": 0.03459534684556095, + "grad_norm": 2.1287102699279785, + "learning_rate": 4.9852541976777933e-05, + "loss": 5.2069, + "step": 5817 + }, + { + "epoch": 0.03460129412884194, + "grad_norm": 2.751567840576172, + "learning_rate": 4.985249131463647e-05, + "loss": 5.6561, + "step": 5818 + }, + { + "epoch": 0.034607241412122944, + "grad_norm": 2.505608081817627, + "learning_rate": 4.985244064381927e-05, + "loss": 5.9708, + "step": 5819 + }, + { + "epoch": 0.03461318869540394, + "grad_norm": 2.351593255996704, + "learning_rate": 4.9852389964326337e-05, + "loss": 5.9046, + "step": 5820 + }, + { + "epoch": 0.034619135978684934, + "grad_norm": 2.3037939071655273, + "learning_rate": 4.985233927615769e-05, + "loss": 6.0069, + "step": 5821 + }, + { + "epoch": 0.034625083261965936, + "grad_norm": 2.2482705116271973, + "learning_rate": 4.985228857931334e-05, + "loss": 5.9492, + "step": 5822 + }, + { + "epoch": 0.03463103054524693, + "grad_norm": 2.23640513420105, + "learning_rate": 4.985223787379332e-05, + "loss": 5.6631, + "step": 5823 + }, + { + "epoch": 0.034636977828527926, + "grad_norm": 2.710275411605835, + "learning_rate": 4.985218715959764e-05, + "loss": 5.5961, + "step": 5824 + }, + { + "epoch": 0.03464292511180893, + "grad_norm": 2.7220160961151123, + "learning_rate": 4.9852136436726313e-05, + "loss": 5.6922, + "step": 5825 + }, + { + "epoch": 0.03464887239508992, + "grad_norm": 2.4542758464813232, + "learning_rate": 4.985208570517937e-05, + "loss": 5.4742, + "step": 5826 + }, + { + "epoch": 0.03465481967837092, + "grad_norm": 2.7492685317993164, + "learning_rate": 4.9852034964956816e-05, + "loss": 5.4598, + "step": 5827 + }, + { + "epoch": 0.03466076696165192, + "grad_norm": 2.757937431335449, + "learning_rate": 4.9851984216058677e-05, + "loss": 6.1865, + "step": 5828 + }, + { + "epoch": 0.034666714244932915, + "grad_norm": 2.835890531539917, + "learning_rate": 4.985193345848497e-05, + "loss": 5.3368, + "step": 5829 + }, + { + "epoch": 0.03467266152821391, + "grad_norm": 2.694884777069092, + "learning_rate": 4.98518826922357e-05, + "loss": 5.3654, + "step": 5830 + }, + { + "epoch": 0.03467860881149491, + "grad_norm": 2.443784236907959, + "learning_rate": 4.98518319173109e-05, + "loss": 5.7879, + "step": 5831 + }, + { + "epoch": 0.03468455609477591, + "grad_norm": 2.0198488235473633, + "learning_rate": 4.985178113371058e-05, + "loss": 5.766, + "step": 5832 + }, + { + "epoch": 0.0346905033780569, + "grad_norm": 2.8718788623809814, + "learning_rate": 4.985173034143476e-05, + "loss": 5.5506, + "step": 5833 + }, + { + "epoch": 0.0346964506613379, + "grad_norm": 2.4353652000427246, + "learning_rate": 4.9851679540483455e-05, + "loss": 5.7139, + "step": 5834 + }, + { + "epoch": 0.0347023979446189, + "grad_norm": 1.9376598596572876, + "learning_rate": 4.985162873085669e-05, + "loss": 6.2326, + "step": 5835 + }, + { + "epoch": 0.034708345227899895, + "grad_norm": 2.2225289344787598, + "learning_rate": 4.985157791255448e-05, + "loss": 5.5997, + "step": 5836 + }, + { + "epoch": 0.03471429251118089, + "grad_norm": 2.011493682861328, + "learning_rate": 4.985152708557684e-05, + "loss": 5.6882, + "step": 5837 + }, + { + "epoch": 0.03472023979446189, + "grad_norm": 1.8679020404815674, + "learning_rate": 4.985147624992378e-05, + "loss": 5.5427, + "step": 5838 + }, + { + "epoch": 0.03472618707774289, + "grad_norm": 1.9470884799957275, + "learning_rate": 4.9851425405595334e-05, + "loss": 5.5957, + "step": 5839 + }, + { + "epoch": 0.03473213436102388, + "grad_norm": 2.0765669345855713, + "learning_rate": 4.985137455259151e-05, + "loss": 5.4416, + "step": 5840 + }, + { + "epoch": 0.034738081644304884, + "grad_norm": 2.0521979331970215, + "learning_rate": 4.985132369091233e-05, + "loss": 5.4641, + "step": 5841 + }, + { + "epoch": 0.03474402892758588, + "grad_norm": 1.7439172267913818, + "learning_rate": 4.985127282055781e-05, + "loss": 5.1998, + "step": 5842 + }, + { + "epoch": 0.034749976210866874, + "grad_norm": 1.7347313165664673, + "learning_rate": 4.985122194152797e-05, + "loss": 5.2392, + "step": 5843 + }, + { + "epoch": 0.034755923494147876, + "grad_norm": 1.7362169027328491, + "learning_rate": 4.985117105382282e-05, + "loss": 5.1769, + "step": 5844 + }, + { + "epoch": 0.03476187077742887, + "grad_norm": 1.7468090057373047, + "learning_rate": 4.985112015744239e-05, + "loss": 5.3915, + "step": 5845 + }, + { + "epoch": 0.03476781806070987, + "grad_norm": 1.8685250282287598, + "learning_rate": 4.985106925238668e-05, + "loss": 5.6119, + "step": 5846 + }, + { + "epoch": 0.03477376534399086, + "grad_norm": 1.9595715999603271, + "learning_rate": 4.985101833865572e-05, + "loss": 5.5536, + "step": 5847 + }, + { + "epoch": 0.034779712627271864, + "grad_norm": 1.8454965353012085, + "learning_rate": 4.985096741624953e-05, + "loss": 5.8127, + "step": 5848 + }, + { + "epoch": 0.03478565991055286, + "grad_norm": 1.9182006120681763, + "learning_rate": 4.985091648516813e-05, + "loss": 5.8807, + "step": 5849 + }, + { + "epoch": 0.034791607193833854, + "grad_norm": 2.042923927307129, + "learning_rate": 4.9850865545411526e-05, + "loss": 5.9013, + "step": 5850 + }, + { + "epoch": 0.034797554477114856, + "grad_norm": 2.341055393218994, + "learning_rate": 4.985081459697974e-05, + "loss": 6.214, + "step": 5851 + }, + { + "epoch": 0.03480350176039585, + "grad_norm": 2.026190996170044, + "learning_rate": 4.985076363987279e-05, + "loss": 5.3693, + "step": 5852 + }, + { + "epoch": 0.034809449043676846, + "grad_norm": 2.045264482498169, + "learning_rate": 4.98507126740907e-05, + "loss": 5.6325, + "step": 5853 + }, + { + "epoch": 0.03481539632695785, + "grad_norm": 2.2710580825805664, + "learning_rate": 4.985066169963348e-05, + "loss": 5.8355, + "step": 5854 + }, + { + "epoch": 0.03482134361023884, + "grad_norm": 1.8813494443893433, + "learning_rate": 4.985061071650115e-05, + "loss": 5.5849, + "step": 5855 + }, + { + "epoch": 0.03482729089351984, + "grad_norm": 2.2177746295928955, + "learning_rate": 4.985055972469373e-05, + "loss": 5.5518, + "step": 5856 + }, + { + "epoch": 0.03483323817680084, + "grad_norm": 1.897653341293335, + "learning_rate": 4.9850508724211234e-05, + "loss": 5.6035, + "step": 5857 + }, + { + "epoch": 0.034839185460081835, + "grad_norm": 2.349821090698242, + "learning_rate": 4.985045771505369e-05, + "loss": 5.8181, + "step": 5858 + }, + { + "epoch": 0.03484513274336283, + "grad_norm": 1.900538682937622, + "learning_rate": 4.98504066972211e-05, + "loss": 5.2751, + "step": 5859 + }, + { + "epoch": 0.03485108002664383, + "grad_norm": 2.1902174949645996, + "learning_rate": 4.985035567071349e-05, + "loss": 5.2709, + "step": 5860 + }, + { + "epoch": 0.03485702730992483, + "grad_norm": 1.7833307981491089, + "learning_rate": 4.9850304635530884e-05, + "loss": 5.2104, + "step": 5861 + }, + { + "epoch": 0.03486297459320582, + "grad_norm": 2.017603874206543, + "learning_rate": 4.985025359167329e-05, + "loss": 5.2257, + "step": 5862 + }, + { + "epoch": 0.03486892187648682, + "grad_norm": 1.9828181266784668, + "learning_rate": 4.9850202539140724e-05, + "loss": 5.2303, + "step": 5863 + }, + { + "epoch": 0.03487486915976782, + "grad_norm": 2.0273706912994385, + "learning_rate": 4.9850151477933216e-05, + "loss": 5.1743, + "step": 5864 + }, + { + "epoch": 0.034880816443048815, + "grad_norm": 1.9634721279144287, + "learning_rate": 4.985010040805077e-05, + "loss": 5.1541, + "step": 5865 + }, + { + "epoch": 0.03488676372632981, + "grad_norm": 2.2766621112823486, + "learning_rate": 4.985004932949342e-05, + "loss": 5.1372, + "step": 5866 + }, + { + "epoch": 0.03489271100961081, + "grad_norm": 2.0768795013427734, + "learning_rate": 4.984999824226117e-05, + "loss": 5.2567, + "step": 5867 + }, + { + "epoch": 0.03489865829289181, + "grad_norm": 1.8665590286254883, + "learning_rate": 4.984994714635404e-05, + "loss": 5.1356, + "step": 5868 + }, + { + "epoch": 0.0349046055761728, + "grad_norm": 2.056450843811035, + "learning_rate": 4.984989604177205e-05, + "loss": 5.1667, + "step": 5869 + }, + { + "epoch": 0.034910552859453804, + "grad_norm": 2.1191976070404053, + "learning_rate": 4.984984492851522e-05, + "loss": 5.1898, + "step": 5870 + }, + { + "epoch": 0.0349165001427348, + "grad_norm": 2.049450397491455, + "learning_rate": 4.9849793806583566e-05, + "loss": 5.1568, + "step": 5871 + }, + { + "epoch": 0.034922447426015794, + "grad_norm": 1.79837167263031, + "learning_rate": 4.984974267597711e-05, + "loss": 5.1288, + "step": 5872 + }, + { + "epoch": 0.034928394709296796, + "grad_norm": 1.959088683128357, + "learning_rate": 4.984969153669585e-05, + "loss": 5.1063, + "step": 5873 + }, + { + "epoch": 0.03493434199257779, + "grad_norm": 1.9193873405456543, + "learning_rate": 4.9849640388739836e-05, + "loss": 5.1608, + "step": 5874 + }, + { + "epoch": 0.03494028927585879, + "grad_norm": 1.6684316396713257, + "learning_rate": 4.9849589232109065e-05, + "loss": 5.0926, + "step": 5875 + }, + { + "epoch": 0.03494623655913978, + "grad_norm": 1.8383700847625732, + "learning_rate": 4.984953806680356e-05, + "loss": 5.0474, + "step": 5876 + }, + { + "epoch": 0.034952183842420784, + "grad_norm": 2.233779191970825, + "learning_rate": 4.984948689282333e-05, + "loss": 5.5046, + "step": 5877 + }, + { + "epoch": 0.03495813112570178, + "grad_norm": 2.2267282009124756, + "learning_rate": 4.9849435710168415e-05, + "loss": 5.6235, + "step": 5878 + }, + { + "epoch": 0.034964078408982774, + "grad_norm": 1.7933586835861206, + "learning_rate": 4.9849384518838804e-05, + "loss": 5.0968, + "step": 5879 + }, + { + "epoch": 0.034970025692263776, + "grad_norm": 2.0050230026245117, + "learning_rate": 4.984933331883453e-05, + "loss": 4.9789, + "step": 5880 + }, + { + "epoch": 0.03497597297554477, + "grad_norm": 1.7422970533370972, + "learning_rate": 4.9849282110155627e-05, + "loss": 5.1556, + "step": 5881 + }, + { + "epoch": 0.034981920258825766, + "grad_norm": 2.1242151260375977, + "learning_rate": 4.984923089280209e-05, + "loss": 5.7039, + "step": 5882 + }, + { + "epoch": 0.03498786754210677, + "grad_norm": 1.8656666278839111, + "learning_rate": 4.9849179666773934e-05, + "loss": 5.7185, + "step": 5883 + }, + { + "epoch": 0.03499381482538776, + "grad_norm": 1.6954991817474365, + "learning_rate": 4.984912843207119e-05, + "loss": 5.5686, + "step": 5884 + }, + { + "epoch": 0.03499976210866876, + "grad_norm": 1.7692710161209106, + "learning_rate": 4.984907718869387e-05, + "loss": 5.4058, + "step": 5885 + }, + { + "epoch": 0.03500570939194976, + "grad_norm": 1.8496350049972534, + "learning_rate": 4.9849025936642004e-05, + "loss": 5.5037, + "step": 5886 + }, + { + "epoch": 0.035011656675230755, + "grad_norm": 2.0124640464782715, + "learning_rate": 4.984897467591559e-05, + "loss": 5.6146, + "step": 5887 + }, + { + "epoch": 0.03501760395851175, + "grad_norm": 2.5522549152374268, + "learning_rate": 4.984892340651466e-05, + "loss": 5.6403, + "step": 5888 + }, + { + "epoch": 0.03502355124179275, + "grad_norm": 2.2127344608306885, + "learning_rate": 4.9848872128439224e-05, + "loss": 5.6277, + "step": 5889 + }, + { + "epoch": 0.03502949852507375, + "grad_norm": 2.578322172164917, + "learning_rate": 4.9848820841689305e-05, + "loss": 5.849, + "step": 5890 + }, + { + "epoch": 0.03503544580835474, + "grad_norm": 1.8083957433700562, + "learning_rate": 4.9848769546264915e-05, + "loss": 5.4407, + "step": 5891 + }, + { + "epoch": 0.03504139309163574, + "grad_norm": 1.885387897491455, + "learning_rate": 4.984871824216609e-05, + "loss": 5.4486, + "step": 5892 + }, + { + "epoch": 0.03504734037491674, + "grad_norm": 1.9450737237930298, + "learning_rate": 4.9848666929392817e-05, + "loss": 5.4196, + "step": 5893 + }, + { + "epoch": 0.035053287658197735, + "grad_norm": 1.9072003364562988, + "learning_rate": 4.984861560794514e-05, + "loss": 5.6293, + "step": 5894 + }, + { + "epoch": 0.03505923494147873, + "grad_norm": 2.064192056655884, + "learning_rate": 4.984856427782307e-05, + "loss": 5.7105, + "step": 5895 + }, + { + "epoch": 0.03506518222475973, + "grad_norm": 2.0101802349090576, + "learning_rate": 4.984851293902663e-05, + "loss": 5.5623, + "step": 5896 + }, + { + "epoch": 0.03507112950804073, + "grad_norm": 1.9813642501831055, + "learning_rate": 4.984846159155581e-05, + "loss": 5.653, + "step": 5897 + }, + { + "epoch": 0.03507707679132172, + "grad_norm": 1.9213227033615112, + "learning_rate": 4.9848410235410666e-05, + "loss": 5.5194, + "step": 5898 + }, + { + "epoch": 0.035083024074602724, + "grad_norm": 1.803076982498169, + "learning_rate": 4.984835887059119e-05, + "loss": 5.4101, + "step": 5899 + }, + { + "epoch": 0.03508897135788372, + "grad_norm": 1.8419232368469238, + "learning_rate": 4.9848307497097414e-05, + "loss": 5.7329, + "step": 5900 + }, + { + "epoch": 0.035094918641164714, + "grad_norm": 1.9258531332015991, + "learning_rate": 4.984825611492935e-05, + "loss": 5.559, + "step": 5901 + }, + { + "epoch": 0.035100865924445716, + "grad_norm": 1.869529366493225, + "learning_rate": 4.984820472408701e-05, + "loss": 5.5682, + "step": 5902 + }, + { + "epoch": 0.03510681320772671, + "grad_norm": 1.753365159034729, + "learning_rate": 4.984815332457042e-05, + "loss": 5.6241, + "step": 5903 + }, + { + "epoch": 0.035112760491007707, + "grad_norm": 1.6581326723098755, + "learning_rate": 4.98481019163796e-05, + "loss": 5.4752, + "step": 5904 + }, + { + "epoch": 0.0351187077742887, + "grad_norm": 1.9120882749557495, + "learning_rate": 4.9848050499514565e-05, + "loss": 5.5678, + "step": 5905 + }, + { + "epoch": 0.035124655057569704, + "grad_norm": 1.9840329885482788, + "learning_rate": 4.984799907397533e-05, + "loss": 5.5369, + "step": 5906 + }, + { + "epoch": 0.0351306023408507, + "grad_norm": 1.7970712184906006, + "learning_rate": 4.9847947639761914e-05, + "loss": 5.5857, + "step": 5907 + }, + { + "epoch": 0.035136549624131694, + "grad_norm": 1.7219270467758179, + "learning_rate": 4.984789619687435e-05, + "loss": 5.609, + "step": 5908 + }, + { + "epoch": 0.035142496907412696, + "grad_norm": 1.8945105075836182, + "learning_rate": 4.984784474531262e-05, + "loss": 5.5893, + "step": 5909 + }, + { + "epoch": 0.03514844419069369, + "grad_norm": 1.8570127487182617, + "learning_rate": 4.984779328507678e-05, + "loss": 5.4556, + "step": 5910 + }, + { + "epoch": 0.035154391473974686, + "grad_norm": 1.9291017055511475, + "learning_rate": 4.984774181616683e-05, + "loss": 5.476, + "step": 5911 + }, + { + "epoch": 0.03516033875725569, + "grad_norm": 1.9138598442077637, + "learning_rate": 4.984769033858278e-05, + "loss": 5.6329, + "step": 5912 + }, + { + "epoch": 0.03516628604053668, + "grad_norm": 1.9484977722167969, + "learning_rate": 4.9847638852324665e-05, + "loss": 5.5305, + "step": 5913 + }, + { + "epoch": 0.03517223332381768, + "grad_norm": 1.7338584661483765, + "learning_rate": 4.984758735739249e-05, + "loss": 5.4842, + "step": 5914 + }, + { + "epoch": 0.03517818060709868, + "grad_norm": 1.8625437021255493, + "learning_rate": 4.984753585378629e-05, + "loss": 5.3696, + "step": 5915 + }, + { + "epoch": 0.035184127890379675, + "grad_norm": 1.798782229423523, + "learning_rate": 4.984748434150607e-05, + "loss": 5.5803, + "step": 5916 + }, + { + "epoch": 0.03519007517366067, + "grad_norm": 2.0596888065338135, + "learning_rate": 4.9847432820551845e-05, + "loss": 5.3274, + "step": 5917 + }, + { + "epoch": 0.03519602245694167, + "grad_norm": 2.0848498344421387, + "learning_rate": 4.984738129092364e-05, + "loss": 5.3334, + "step": 5918 + }, + { + "epoch": 0.03520196974022267, + "grad_norm": 2.000460386276245, + "learning_rate": 4.984732975262147e-05, + "loss": 5.4411, + "step": 5919 + }, + { + "epoch": 0.03520791702350366, + "grad_norm": 1.676957607269287, + "learning_rate": 4.9847278205645355e-05, + "loss": 5.47, + "step": 5920 + }, + { + "epoch": 0.03521386430678466, + "grad_norm": 1.911482334136963, + "learning_rate": 4.984722664999531e-05, + "loss": 5.5736, + "step": 5921 + }, + { + "epoch": 0.03521981159006566, + "grad_norm": 1.9573029279708862, + "learning_rate": 4.9847175085671356e-05, + "loss": 5.5509, + "step": 5922 + }, + { + "epoch": 0.035225758873346655, + "grad_norm": 1.8878334760665894, + "learning_rate": 4.984712351267351e-05, + "loss": 5.6437, + "step": 5923 + }, + { + "epoch": 0.03523170615662765, + "grad_norm": 1.9107712507247925, + "learning_rate": 4.984707193100179e-05, + "loss": 5.4471, + "step": 5924 + }, + { + "epoch": 0.03523765343990865, + "grad_norm": 1.7408612966537476, + "learning_rate": 4.9847020340656215e-05, + "loss": 5.3706, + "step": 5925 + }, + { + "epoch": 0.03524360072318965, + "grad_norm": 1.9594995975494385, + "learning_rate": 4.98469687416368e-05, + "loss": 5.4113, + "step": 5926 + }, + { + "epoch": 0.03524954800647064, + "grad_norm": 1.8772166967391968, + "learning_rate": 4.984691713394356e-05, + "loss": 5.368, + "step": 5927 + }, + { + "epoch": 0.035255495289751644, + "grad_norm": 2.1143953800201416, + "learning_rate": 4.9846865517576524e-05, + "loss": 5.3829, + "step": 5928 + }, + { + "epoch": 0.03526144257303264, + "grad_norm": 2.0923383235931396, + "learning_rate": 4.984681389253571e-05, + "loss": 5.9834, + "step": 5929 + }, + { + "epoch": 0.035267389856313634, + "grad_norm": 2.016749620437622, + "learning_rate": 4.984676225882112e-05, + "loss": 5.68, + "step": 5930 + }, + { + "epoch": 0.035273337139594636, + "grad_norm": 1.6040265560150146, + "learning_rate": 4.984671061643279e-05, + "loss": 5.7406, + "step": 5931 + }, + { + "epoch": 0.03527928442287563, + "grad_norm": 2.100774049758911, + "learning_rate": 4.984665896537072e-05, + "loss": 5.5545, + "step": 5932 + }, + { + "epoch": 0.035285231706156626, + "grad_norm": 2.008575439453125, + "learning_rate": 4.984660730563494e-05, + "loss": 5.3769, + "step": 5933 + }, + { + "epoch": 0.03529117898943762, + "grad_norm": 1.9622136354446411, + "learning_rate": 4.984655563722547e-05, + "loss": 5.5792, + "step": 5934 + }, + { + "epoch": 0.035297126272718624, + "grad_norm": 1.764647364616394, + "learning_rate": 4.9846503960142325e-05, + "loss": 5.6543, + "step": 5935 + }, + { + "epoch": 0.03530307355599962, + "grad_norm": 1.6166809797286987, + "learning_rate": 4.984645227438552e-05, + "loss": 5.7948, + "step": 5936 + }, + { + "epoch": 0.035309020839280614, + "grad_norm": 1.7368977069854736, + "learning_rate": 4.9846400579955074e-05, + "loss": 5.6288, + "step": 5937 + }, + { + "epoch": 0.035314968122561616, + "grad_norm": 1.649059772491455, + "learning_rate": 4.984634887685101e-05, + "loss": 5.8538, + "step": 5938 + }, + { + "epoch": 0.03532091540584261, + "grad_norm": 1.6092652082443237, + "learning_rate": 4.984629716507334e-05, + "loss": 5.7077, + "step": 5939 + }, + { + "epoch": 0.035326862689123606, + "grad_norm": 1.76821768283844, + "learning_rate": 4.984624544462209e-05, + "loss": 5.4206, + "step": 5940 + }, + { + "epoch": 0.03533280997240461, + "grad_norm": 1.5885004997253418, + "learning_rate": 4.984619371549727e-05, + "loss": 5.3997, + "step": 5941 + }, + { + "epoch": 0.0353387572556856, + "grad_norm": 1.6730574369430542, + "learning_rate": 4.984614197769889e-05, + "loss": 5.4952, + "step": 5942 + }, + { + "epoch": 0.0353447045389666, + "grad_norm": 1.9951595067977905, + "learning_rate": 4.984609023122699e-05, + "loss": 5.5658, + "step": 5943 + }, + { + "epoch": 0.0353506518222476, + "grad_norm": 1.8277794122695923, + "learning_rate": 4.984603847608157e-05, + "loss": 5.5313, + "step": 5944 + }, + { + "epoch": 0.035356599105528595, + "grad_norm": 1.5988150835037231, + "learning_rate": 4.984598671226266e-05, + "loss": 5.4661, + "step": 5945 + }, + { + "epoch": 0.03536254638880959, + "grad_norm": 1.8313721418380737, + "learning_rate": 4.9845934939770264e-05, + "loss": 5.3005, + "step": 5946 + }, + { + "epoch": 0.03536849367209059, + "grad_norm": 1.8441407680511475, + "learning_rate": 4.984588315860442e-05, + "loss": 5.4564, + "step": 5947 + }, + { + "epoch": 0.03537444095537159, + "grad_norm": 2.8165388107299805, + "learning_rate": 4.9845831368765126e-05, + "loss": 5.4582, + "step": 5948 + }, + { + "epoch": 0.03538038823865258, + "grad_norm": 1.8860023021697998, + "learning_rate": 4.9845779570252415e-05, + "loss": 5.4952, + "step": 5949 + }, + { + "epoch": 0.03538633552193358, + "grad_norm": 1.7752633094787598, + "learning_rate": 4.98457277630663e-05, + "loss": 5.4301, + "step": 5950 + }, + { + "epoch": 0.03539228280521458, + "grad_norm": 1.9038548469543457, + "learning_rate": 4.984567594720679e-05, + "loss": 5.2591, + "step": 5951 + }, + { + "epoch": 0.035398230088495575, + "grad_norm": 2.6449787616729736, + "learning_rate": 4.984562412267392e-05, + "loss": 5.9317, + "step": 5952 + }, + { + "epoch": 0.03540417737177657, + "grad_norm": 1.95949125289917, + "learning_rate": 4.98455722894677e-05, + "loss": 5.4686, + "step": 5953 + }, + { + "epoch": 0.03541012465505757, + "grad_norm": 2.0208640098571777, + "learning_rate": 4.984552044758814e-05, + "loss": 5.6361, + "step": 5954 + }, + { + "epoch": 0.03541607193833857, + "grad_norm": 2.2328197956085205, + "learning_rate": 4.9845468597035274e-05, + "loss": 5.455, + "step": 5955 + }, + { + "epoch": 0.03542201922161956, + "grad_norm": 2.115952968597412, + "learning_rate": 4.9845416737809105e-05, + "loss": 5.3275, + "step": 5956 + }, + { + "epoch": 0.035427966504900564, + "grad_norm": 2.023791790008545, + "learning_rate": 4.984536486990966e-05, + "loss": 5.3135, + "step": 5957 + }, + { + "epoch": 0.03543391378818156, + "grad_norm": 1.9721077680587769, + "learning_rate": 4.9845312993336945e-05, + "loss": 5.3429, + "step": 5958 + }, + { + "epoch": 0.035439861071462554, + "grad_norm": 2.047588586807251, + "learning_rate": 4.9845261108091e-05, + "loss": 5.4027, + "step": 5959 + }, + { + "epoch": 0.035445808354743556, + "grad_norm": 1.9019498825073242, + "learning_rate": 4.9845209214171826e-05, + "loss": 5.3867, + "step": 5960 + }, + { + "epoch": 0.03545175563802455, + "grad_norm": 1.9442843198776245, + "learning_rate": 4.984515731157945e-05, + "loss": 5.3189, + "step": 5961 + }, + { + "epoch": 0.035457702921305546, + "grad_norm": 2.051422357559204, + "learning_rate": 4.9845105400313885e-05, + "loss": 5.5713, + "step": 5962 + }, + { + "epoch": 0.03546365020458654, + "grad_norm": 1.811908483505249, + "learning_rate": 4.9845053480375145e-05, + "loss": 5.6221, + "step": 5963 + }, + { + "epoch": 0.035469597487867544, + "grad_norm": 2.017991542816162, + "learning_rate": 4.984500155176326e-05, + "loss": 5.2774, + "step": 5964 + }, + { + "epoch": 0.03547554477114854, + "grad_norm": 1.972644329071045, + "learning_rate": 4.9844949614478244e-05, + "loss": 5.3208, + "step": 5965 + }, + { + "epoch": 0.035481492054429534, + "grad_norm": 1.9937026500701904, + "learning_rate": 4.984489766852011e-05, + "loss": 5.455, + "step": 5966 + }, + { + "epoch": 0.035487439337710536, + "grad_norm": 1.7297019958496094, + "learning_rate": 4.984484571388887e-05, + "loss": 5.3829, + "step": 5967 + }, + { + "epoch": 0.03549338662099153, + "grad_norm": 1.6428204774856567, + "learning_rate": 4.984479375058456e-05, + "loss": 5.3638, + "step": 5968 + }, + { + "epoch": 0.035499333904272526, + "grad_norm": 1.9522719383239746, + "learning_rate": 4.9844741778607186e-05, + "loss": 5.3379, + "step": 5969 + }, + { + "epoch": 0.03550528118755353, + "grad_norm": 2.0280921459198, + "learning_rate": 4.984468979795677e-05, + "loss": 5.4366, + "step": 5970 + }, + { + "epoch": 0.03551122847083452, + "grad_norm": 2.0396251678466797, + "learning_rate": 4.9844637808633334e-05, + "loss": 5.5681, + "step": 5971 + }, + { + "epoch": 0.03551717575411552, + "grad_norm": 1.5256271362304688, + "learning_rate": 4.984458581063689e-05, + "loss": 5.602, + "step": 5972 + }, + { + "epoch": 0.03552312303739652, + "grad_norm": 1.8829892873764038, + "learning_rate": 4.984453380396745e-05, + "loss": 5.3851, + "step": 5973 + }, + { + "epoch": 0.035529070320677515, + "grad_norm": 2.047106981277466, + "learning_rate": 4.984448178862505e-05, + "loss": 5.3724, + "step": 5974 + }, + { + "epoch": 0.03553501760395851, + "grad_norm": 2.066572904586792, + "learning_rate": 4.984442976460969e-05, + "loss": 5.3352, + "step": 5975 + }, + { + "epoch": 0.03554096488723951, + "grad_norm": 1.9785430431365967, + "learning_rate": 4.98443777319214e-05, + "loss": 5.2641, + "step": 5976 + }, + { + "epoch": 0.03554691217052051, + "grad_norm": 1.8999443054199219, + "learning_rate": 4.98443256905602e-05, + "loss": 5.3402, + "step": 5977 + }, + { + "epoch": 0.0355528594538015, + "grad_norm": 1.8599263429641724, + "learning_rate": 4.98442736405261e-05, + "loss": 5.2612, + "step": 5978 + }, + { + "epoch": 0.0355588067370825, + "grad_norm": 1.7216875553131104, + "learning_rate": 4.984422158181911e-05, + "loss": 5.4041, + "step": 5979 + }, + { + "epoch": 0.0355647540203635, + "grad_norm": 2.0259687900543213, + "learning_rate": 4.984416951443926e-05, + "loss": 5.4895, + "step": 5980 + }, + { + "epoch": 0.035570701303644495, + "grad_norm": 1.705736756324768, + "learning_rate": 4.9844117438386583e-05, + "loss": 5.5845, + "step": 5981 + }, + { + "epoch": 0.03557664858692549, + "grad_norm": 1.9546462297439575, + "learning_rate": 4.9844065353661074e-05, + "loss": 5.6803, + "step": 5982 + }, + { + "epoch": 0.03558259587020649, + "grad_norm": 1.829689383506775, + "learning_rate": 4.984401326026275e-05, + "loss": 5.5816, + "step": 5983 + }, + { + "epoch": 0.03558854315348749, + "grad_norm": 1.6464663743972778, + "learning_rate": 4.984396115819164e-05, + "loss": 5.5738, + "step": 5984 + }, + { + "epoch": 0.03559449043676848, + "grad_norm": 1.7786076068878174, + "learning_rate": 4.984390904744777e-05, + "loss": 5.3667, + "step": 5985 + }, + { + "epoch": 0.035600437720049484, + "grad_norm": 2.210754871368408, + "learning_rate": 4.984385692803114e-05, + "loss": 5.5259, + "step": 5986 + }, + { + "epoch": 0.03560638500333048, + "grad_norm": 1.7361842393875122, + "learning_rate": 4.984380479994179e-05, + "loss": 5.6108, + "step": 5987 + }, + { + "epoch": 0.035612332286611474, + "grad_norm": 1.926477313041687, + "learning_rate": 4.9843752663179703e-05, + "loss": 5.593, + "step": 5988 + }, + { + "epoch": 0.035618279569892476, + "grad_norm": 1.6683733463287354, + "learning_rate": 4.984370051774493e-05, + "loss": 5.6305, + "step": 5989 + }, + { + "epoch": 0.03562422685317347, + "grad_norm": 1.790499210357666, + "learning_rate": 4.9843648363637475e-05, + "loss": 5.596, + "step": 5990 + }, + { + "epoch": 0.035630174136454466, + "grad_norm": 1.8355207443237305, + "learning_rate": 4.984359620085736e-05, + "loss": 5.5818, + "step": 5991 + }, + { + "epoch": 0.03563612141973546, + "grad_norm": 1.9352680444717407, + "learning_rate": 4.98435440294046e-05, + "loss": 5.187, + "step": 5992 + }, + { + "epoch": 0.03564206870301646, + "grad_norm": 2.063159465789795, + "learning_rate": 4.9843491849279225e-05, + "loss": 5.3245, + "step": 5993 + }, + { + "epoch": 0.03564801598629746, + "grad_norm": 1.6848958730697632, + "learning_rate": 4.984343966048123e-05, + "loss": 5.4454, + "step": 5994 + }, + { + "epoch": 0.035653963269578454, + "grad_norm": 2.1244423389434814, + "learning_rate": 4.9843387463010654e-05, + "loss": 5.5018, + "step": 5995 + }, + { + "epoch": 0.035659910552859456, + "grad_norm": 1.9100427627563477, + "learning_rate": 4.9843335256867505e-05, + "loss": 5.5597, + "step": 5996 + }, + { + "epoch": 0.03566585783614045, + "grad_norm": 1.9130252599716187, + "learning_rate": 4.984328304205181e-05, + "loss": 5.4538, + "step": 5997 + }, + { + "epoch": 0.035671805119421446, + "grad_norm": 1.6285213232040405, + "learning_rate": 4.984323081856358e-05, + "loss": 5.7361, + "step": 5998 + }, + { + "epoch": 0.03567775240270245, + "grad_norm": 1.6690980195999146, + "learning_rate": 4.984317858640283e-05, + "loss": 5.7537, + "step": 5999 + }, + { + "epoch": 0.03568369968598344, + "grad_norm": 1.5258572101593018, + "learning_rate": 4.984312634556959e-05, + "loss": 5.7419, + "step": 6000 + }, + { + "epoch": 0.03568964696926444, + "grad_norm": 1.9586881399154663, + "learning_rate": 4.984307409606386e-05, + "loss": 5.4449, + "step": 6001 + }, + { + "epoch": 0.03569559425254544, + "grad_norm": 2.1795685291290283, + "learning_rate": 4.9843021837885684e-05, + "loss": 5.3833, + "step": 6002 + }, + { + "epoch": 0.035701541535826435, + "grad_norm": 2.1241326332092285, + "learning_rate": 4.984296957103506e-05, + "loss": 5.3064, + "step": 6003 + }, + { + "epoch": 0.03570748881910743, + "grad_norm": 1.9621204137802124, + "learning_rate": 4.9842917295512004e-05, + "loss": 5.3002, + "step": 6004 + }, + { + "epoch": 0.03571343610238843, + "grad_norm": 2.041503429412842, + "learning_rate": 4.984286501131655e-05, + "loss": 5.2885, + "step": 6005 + }, + { + "epoch": 0.03571938338566943, + "grad_norm": 2.1099791526794434, + "learning_rate": 4.984281271844871e-05, + "loss": 5.3038, + "step": 6006 + }, + { + "epoch": 0.03572533066895042, + "grad_norm": 2.0209009647369385, + "learning_rate": 4.98427604169085e-05, + "loss": 5.8373, + "step": 6007 + }, + { + "epoch": 0.03573127795223142, + "grad_norm": 1.7534282207489014, + "learning_rate": 4.9842708106695934e-05, + "loss": 5.6522, + "step": 6008 + }, + { + "epoch": 0.03573722523551242, + "grad_norm": 2.3014237880706787, + "learning_rate": 4.984265578781104e-05, + "loss": 5.462, + "step": 6009 + }, + { + "epoch": 0.035743172518793415, + "grad_norm": 2.123767614364624, + "learning_rate": 4.984260346025382e-05, + "loss": 5.3901, + "step": 6010 + }, + { + "epoch": 0.03574911980207441, + "grad_norm": 2.4190175533294678, + "learning_rate": 4.9842551124024315e-05, + "loss": 5.1526, + "step": 6011 + }, + { + "epoch": 0.03575506708535541, + "grad_norm": 1.9972834587097168, + "learning_rate": 4.984249877912254e-05, + "loss": 5.2987, + "step": 6012 + }, + { + "epoch": 0.03576101436863641, + "grad_norm": 2.002969980239868, + "learning_rate": 4.9842446425548494e-05, + "loss": 5.5244, + "step": 6013 + }, + { + "epoch": 0.0357669616519174, + "grad_norm": 2.8208391666412354, + "learning_rate": 4.984239406330221e-05, + "loss": 5.834, + "step": 6014 + }, + { + "epoch": 0.035772908935198404, + "grad_norm": 2.409303665161133, + "learning_rate": 4.98423416923837e-05, + "loss": 5.1709, + "step": 6015 + }, + { + "epoch": 0.0357788562184794, + "grad_norm": 2.215888500213623, + "learning_rate": 4.984228931279298e-05, + "loss": 5.38, + "step": 6016 + }, + { + "epoch": 0.035784803501760394, + "grad_norm": 1.9130421876907349, + "learning_rate": 4.9842236924530086e-05, + "loss": 5.4551, + "step": 6017 + }, + { + "epoch": 0.035790750785041396, + "grad_norm": 1.8963314294815063, + "learning_rate": 4.9842184527595015e-05, + "loss": 5.3512, + "step": 6018 + }, + { + "epoch": 0.03579669806832239, + "grad_norm": 2.0085666179656982, + "learning_rate": 4.98421321219878e-05, + "loss": 5.3013, + "step": 6019 + }, + { + "epoch": 0.035802645351603386, + "grad_norm": 2.1059834957122803, + "learning_rate": 4.9842079707708446e-05, + "loss": 5.4052, + "step": 6020 + }, + { + "epoch": 0.03580859263488438, + "grad_norm": 1.965694785118103, + "learning_rate": 4.984202728475699e-05, + "loss": 5.5392, + "step": 6021 + }, + { + "epoch": 0.03581453991816538, + "grad_norm": 1.9495680332183838, + "learning_rate": 4.9841974853133425e-05, + "loss": 5.309, + "step": 6022 + }, + { + "epoch": 0.03582048720144638, + "grad_norm": 1.9762555360794067, + "learning_rate": 4.9841922412837795e-05, + "loss": 5.3979, + "step": 6023 + }, + { + "epoch": 0.035826434484727374, + "grad_norm": 1.7825839519500732, + "learning_rate": 4.98418699638701e-05, + "loss": 5.3502, + "step": 6024 + }, + { + "epoch": 0.035832381768008376, + "grad_norm": 1.9636192321777344, + "learning_rate": 4.984181750623037e-05, + "loss": 5.6341, + "step": 6025 + }, + { + "epoch": 0.03583832905128937, + "grad_norm": 1.833883285522461, + "learning_rate": 4.984176503991861e-05, + "loss": 5.5861, + "step": 6026 + }, + { + "epoch": 0.035844276334570366, + "grad_norm": 1.91568124294281, + "learning_rate": 4.984171256493485e-05, + "loss": 5.591, + "step": 6027 + }, + { + "epoch": 0.03585022361785137, + "grad_norm": 2.153472423553467, + "learning_rate": 4.9841660081279105e-05, + "loss": 5.3463, + "step": 6028 + }, + { + "epoch": 0.03585617090113236, + "grad_norm": 1.8164830207824707, + "learning_rate": 4.984160758895139e-05, + "loss": 5.4886, + "step": 6029 + }, + { + "epoch": 0.03586211818441336, + "grad_norm": 2.0216922760009766, + "learning_rate": 4.984155508795174e-05, + "loss": 5.5777, + "step": 6030 + }, + { + "epoch": 0.03586806546769436, + "grad_norm": 1.966779351234436, + "learning_rate": 4.984150257828014e-05, + "loss": 5.1867, + "step": 6031 + }, + { + "epoch": 0.035874012750975355, + "grad_norm": 2.091109275817871, + "learning_rate": 4.9841450059936645e-05, + "loss": 5.5302, + "step": 6032 + }, + { + "epoch": 0.03587996003425635, + "grad_norm": 1.8772802352905273, + "learning_rate": 4.984139753292125e-05, + "loss": 5.2904, + "step": 6033 + }, + { + "epoch": 0.03588590731753735, + "grad_norm": 2.049431800842285, + "learning_rate": 4.984134499723397e-05, + "loss": 5.293, + "step": 6034 + }, + { + "epoch": 0.03589185460081835, + "grad_norm": 2.0902609825134277, + "learning_rate": 4.984129245287485e-05, + "loss": 5.2689, + "step": 6035 + }, + { + "epoch": 0.03589780188409934, + "grad_norm": 1.91702139377594, + "learning_rate": 4.9841239899843886e-05, + "loss": 5.255, + "step": 6036 + }, + { + "epoch": 0.03590374916738034, + "grad_norm": 1.7073708772659302, + "learning_rate": 4.984118733814109e-05, + "loss": 5.3272, + "step": 6037 + }, + { + "epoch": 0.03590969645066134, + "grad_norm": 1.625712275505066, + "learning_rate": 4.9841134767766506e-05, + "loss": 5.5366, + "step": 6038 + }, + { + "epoch": 0.035915643733942335, + "grad_norm": 1.8465087413787842, + "learning_rate": 4.984108218872014e-05, + "loss": 5.3373, + "step": 6039 + }, + { + "epoch": 0.03592159101722333, + "grad_norm": 2.2392280101776123, + "learning_rate": 4.9841029601002e-05, + "loss": 5.5898, + "step": 6040 + }, + { + "epoch": 0.03592753830050433, + "grad_norm": 2.6571459770202637, + "learning_rate": 4.984097700461212e-05, + "loss": 5.963, + "step": 6041 + }, + { + "epoch": 0.03593348558378533, + "grad_norm": 2.7220845222473145, + "learning_rate": 4.98409243995505e-05, + "loss": 5.6997, + "step": 6042 + }, + { + "epoch": 0.03593943286706632, + "grad_norm": 2.430968999862671, + "learning_rate": 4.9840871785817185e-05, + "loss": 5.2949, + "step": 6043 + }, + { + "epoch": 0.035945380150347324, + "grad_norm": 2.3006606101989746, + "learning_rate": 4.984081916341217e-05, + "loss": 5.2045, + "step": 6044 + }, + { + "epoch": 0.03595132743362832, + "grad_norm": 2.2382659912109375, + "learning_rate": 4.984076653233548e-05, + "loss": 5.417, + "step": 6045 + }, + { + "epoch": 0.035957274716909314, + "grad_norm": 2.1896233558654785, + "learning_rate": 4.9840713892587146e-05, + "loss": 5.7215, + "step": 6046 + }, + { + "epoch": 0.035963222000190316, + "grad_norm": 1.8175956010818481, + "learning_rate": 4.9840661244167166e-05, + "loss": 5.569, + "step": 6047 + }, + { + "epoch": 0.03596916928347131, + "grad_norm": 2.066828727722168, + "learning_rate": 4.984060858707557e-05, + "loss": 5.6285, + "step": 6048 + }, + { + "epoch": 0.035975116566752306, + "grad_norm": 2.246291160583496, + "learning_rate": 4.984055592131237e-05, + "loss": 5.5583, + "step": 6049 + }, + { + "epoch": 0.0359810638500333, + "grad_norm": 2.2394871711730957, + "learning_rate": 4.984050324687759e-05, + "loss": 5.3917, + "step": 6050 + }, + { + "epoch": 0.0359870111333143, + "grad_norm": 2.5051162242889404, + "learning_rate": 4.984045056377125e-05, + "loss": 5.6955, + "step": 6051 + }, + { + "epoch": 0.0359929584165953, + "grad_norm": 2.1360414028167725, + "learning_rate": 4.984039787199336e-05, + "loss": 5.5451, + "step": 6052 + }, + { + "epoch": 0.035998905699876294, + "grad_norm": 2.0267562866210938, + "learning_rate": 4.984034517154395e-05, + "loss": 5.4559, + "step": 6053 + }, + { + "epoch": 0.036004852983157296, + "grad_norm": 1.7683112621307373, + "learning_rate": 4.984029246242303e-05, + "loss": 5.4663, + "step": 6054 + }, + { + "epoch": 0.03601080026643829, + "grad_norm": 2.0600638389587402, + "learning_rate": 4.9840239744630626e-05, + "loss": 5.5081, + "step": 6055 + }, + { + "epoch": 0.036016747549719286, + "grad_norm": 2.093698740005493, + "learning_rate": 4.984018701816674e-05, + "loss": 5.5435, + "step": 6056 + }, + { + "epoch": 0.03602269483300029, + "grad_norm": 2.217721462249756, + "learning_rate": 4.984013428303141e-05, + "loss": 5.7482, + "step": 6057 + }, + { + "epoch": 0.03602864211628128, + "grad_norm": 1.9680962562561035, + "learning_rate": 4.9840081539224636e-05, + "loss": 5.9722, + "step": 6058 + }, + { + "epoch": 0.03603458939956228, + "grad_norm": 1.8606425523757935, + "learning_rate": 4.9840028786746455e-05, + "loss": 5.8379, + "step": 6059 + }, + { + "epoch": 0.03604053668284328, + "grad_norm": 2.0129475593566895, + "learning_rate": 4.983997602559688e-05, + "loss": 5.7199, + "step": 6060 + }, + { + "epoch": 0.036046483966124275, + "grad_norm": 1.9370187520980835, + "learning_rate": 4.9839923255775917e-05, + "loss": 5.3563, + "step": 6061 + }, + { + "epoch": 0.03605243124940527, + "grad_norm": 1.775894284248352, + "learning_rate": 4.983987047728359e-05, + "loss": 5.5201, + "step": 6062 + }, + { + "epoch": 0.03605837853268627, + "grad_norm": 1.9943023920059204, + "learning_rate": 4.9839817690119934e-05, + "loss": 5.4034, + "step": 6063 + }, + { + "epoch": 0.03606432581596727, + "grad_norm": 1.9605768918991089, + "learning_rate": 4.983976489428494e-05, + "loss": 5.5314, + "step": 6064 + }, + { + "epoch": 0.03607027309924826, + "grad_norm": 1.7820254564285278, + "learning_rate": 4.983971208977866e-05, + "loss": 5.6131, + "step": 6065 + }, + { + "epoch": 0.03607622038252926, + "grad_norm": 2.010796070098877, + "learning_rate": 4.983965927660108e-05, + "loss": 5.5114, + "step": 6066 + }, + { + "epoch": 0.03608216766581026, + "grad_norm": 1.8461687564849854, + "learning_rate": 4.983960645475223e-05, + "loss": 5.4752, + "step": 6067 + }, + { + "epoch": 0.036088114949091255, + "grad_norm": 2.048119068145752, + "learning_rate": 4.983955362423214e-05, + "loss": 5.3325, + "step": 6068 + }, + { + "epoch": 0.03609406223237225, + "grad_norm": 2.021646499633789, + "learning_rate": 4.9839500785040804e-05, + "loss": 5.2238, + "step": 6069 + }, + { + "epoch": 0.03610000951565325, + "grad_norm": 1.9979503154754639, + "learning_rate": 4.9839447937178264e-05, + "loss": 5.4054, + "step": 6070 + }, + { + "epoch": 0.03610595679893425, + "grad_norm": 1.980776071548462, + "learning_rate": 4.983939508064453e-05, + "loss": 5.4094, + "step": 6071 + }, + { + "epoch": 0.03611190408221524, + "grad_norm": 1.8364293575286865, + "learning_rate": 4.9839342215439615e-05, + "loss": 5.4372, + "step": 6072 + }, + { + "epoch": 0.036117851365496244, + "grad_norm": 1.8870443105697632, + "learning_rate": 4.983928934156354e-05, + "loss": 5.4075, + "step": 6073 + }, + { + "epoch": 0.03612379864877724, + "grad_norm": 2.176180124282837, + "learning_rate": 4.9839236459016337e-05, + "loss": 5.4302, + "step": 6074 + }, + { + "epoch": 0.036129745932058234, + "grad_norm": 2.054960012435913, + "learning_rate": 4.983918356779801e-05, + "loss": 5.3796, + "step": 6075 + }, + { + "epoch": 0.036135693215339236, + "grad_norm": 2.2146401405334473, + "learning_rate": 4.9839130667908576e-05, + "loss": 5.651, + "step": 6076 + }, + { + "epoch": 0.03614164049862023, + "grad_norm": 1.908640742301941, + "learning_rate": 4.983907775934806e-05, + "loss": 5.3002, + "step": 6077 + }, + { + "epoch": 0.036147587781901226, + "grad_norm": 1.9364973306655884, + "learning_rate": 4.983902484211648e-05, + "loss": 5.2299, + "step": 6078 + }, + { + "epoch": 0.03615353506518223, + "grad_norm": 1.7405542135238647, + "learning_rate": 4.983897191621385e-05, + "loss": 5.268, + "step": 6079 + }, + { + "epoch": 0.03615948234846322, + "grad_norm": 2.0347912311553955, + "learning_rate": 4.9838918981640195e-05, + "loss": 5.4887, + "step": 6080 + }, + { + "epoch": 0.03616542963174422, + "grad_norm": 2.0755162239074707, + "learning_rate": 4.9838866038395524e-05, + "loss": 5.2208, + "step": 6081 + }, + { + "epoch": 0.03617137691502521, + "grad_norm": 1.9119634628295898, + "learning_rate": 4.9838813086479865e-05, + "loss": 5.2659, + "step": 6082 + }, + { + "epoch": 0.036177324198306215, + "grad_norm": 1.9172658920288086, + "learning_rate": 4.983876012589324e-05, + "loss": 5.4098, + "step": 6083 + }, + { + "epoch": 0.03618327148158721, + "grad_norm": 2.09004545211792, + "learning_rate": 4.983870715663565e-05, + "loss": 5.5866, + "step": 6084 + }, + { + "epoch": 0.036189218764868206, + "grad_norm": 2.0952436923980713, + "learning_rate": 4.983865417870712e-05, + "loss": 5.5288, + "step": 6085 + }, + { + "epoch": 0.03619516604814921, + "grad_norm": 1.8599412441253662, + "learning_rate": 4.9838601192107686e-05, + "loss": 5.7538, + "step": 6086 + }, + { + "epoch": 0.0362011133314302, + "grad_norm": 1.8318936824798584, + "learning_rate": 4.983854819683735e-05, + "loss": 5.9613, + "step": 6087 + }, + { + "epoch": 0.0362070606147112, + "grad_norm": 1.8312503099441528, + "learning_rate": 4.983849519289613e-05, + "loss": 5.2749, + "step": 6088 + }, + { + "epoch": 0.0362130078979922, + "grad_norm": 2.157576560974121, + "learning_rate": 4.983844218028405e-05, + "loss": 5.2826, + "step": 6089 + }, + { + "epoch": 0.036218955181273195, + "grad_norm": 2.1377198696136475, + "learning_rate": 4.983838915900112e-05, + "loss": 5.2843, + "step": 6090 + }, + { + "epoch": 0.03622490246455419, + "grad_norm": 2.0167126655578613, + "learning_rate": 4.983833612904737e-05, + "loss": 5.4713, + "step": 6091 + }, + { + "epoch": 0.03623084974783519, + "grad_norm": 1.748759388923645, + "learning_rate": 4.9838283090422814e-05, + "loss": 5.3685, + "step": 6092 + }, + { + "epoch": 0.03623679703111619, + "grad_norm": 2.0344316959381104, + "learning_rate": 4.983823004312747e-05, + "loss": 5.1093, + "step": 6093 + }, + { + "epoch": 0.03624274431439718, + "grad_norm": 1.9061161279678345, + "learning_rate": 4.9838176987161356e-05, + "loss": 5.2035, + "step": 6094 + }, + { + "epoch": 0.03624869159767818, + "grad_norm": 1.9090344905853271, + "learning_rate": 4.983812392252449e-05, + "loss": 5.3863, + "step": 6095 + }, + { + "epoch": 0.03625463888095918, + "grad_norm": 1.9536118507385254, + "learning_rate": 4.9838070849216894e-05, + "loss": 5.5349, + "step": 6096 + }, + { + "epoch": 0.036260586164240174, + "grad_norm": 1.89446222782135, + "learning_rate": 4.983801776723858e-05, + "loss": 5.7098, + "step": 6097 + }, + { + "epoch": 0.03626653344752117, + "grad_norm": 1.6403870582580566, + "learning_rate": 4.983796467658958e-05, + "loss": 5.6726, + "step": 6098 + }, + { + "epoch": 0.03627248073080217, + "grad_norm": 1.7792481184005737, + "learning_rate": 4.983791157726989e-05, + "loss": 5.6761, + "step": 6099 + }, + { + "epoch": 0.03627842801408317, + "grad_norm": 1.5190175771713257, + "learning_rate": 4.9837858469279554e-05, + "loss": 5.6576, + "step": 6100 + }, + { + "epoch": 0.03628437529736416, + "grad_norm": 1.9885895252227783, + "learning_rate": 4.983780535261857e-05, + "loss": 5.5944, + "step": 6101 + }, + { + "epoch": 0.036290322580645164, + "grad_norm": 1.771620750427246, + "learning_rate": 4.983775222728697e-05, + "loss": 5.7949, + "step": 6102 + }, + { + "epoch": 0.03629626986392616, + "grad_norm": 1.684471845626831, + "learning_rate": 4.9837699093284765e-05, + "loss": 5.5435, + "step": 6103 + }, + { + "epoch": 0.036302217147207154, + "grad_norm": 1.8454065322875977, + "learning_rate": 4.9837645950611966e-05, + "loss": 5.4526, + "step": 6104 + }, + { + "epoch": 0.036308164430488156, + "grad_norm": 1.6522735357284546, + "learning_rate": 4.983759279926862e-05, + "loss": 5.7302, + "step": 6105 + }, + { + "epoch": 0.03631411171376915, + "grad_norm": 1.8691065311431885, + "learning_rate": 4.9837539639254713e-05, + "loss": 5.6494, + "step": 6106 + }, + { + "epoch": 0.036320058997050146, + "grad_norm": 1.9420015811920166, + "learning_rate": 4.9837486470570286e-05, + "loss": 5.77, + "step": 6107 + }, + { + "epoch": 0.03632600628033115, + "grad_norm": 1.8399784564971924, + "learning_rate": 4.9837433293215344e-05, + "loss": 5.6669, + "step": 6108 + }, + { + "epoch": 0.03633195356361214, + "grad_norm": 1.799460530281067, + "learning_rate": 4.983738010718991e-05, + "loss": 5.5557, + "step": 6109 + }, + { + "epoch": 0.03633790084689314, + "grad_norm": 1.8826879262924194, + "learning_rate": 4.9837326912494e-05, + "loss": 5.4865, + "step": 6110 + }, + { + "epoch": 0.03634384813017413, + "grad_norm": 1.9582240581512451, + "learning_rate": 4.983727370912764e-05, + "loss": 5.5882, + "step": 6111 + }, + { + "epoch": 0.036349795413455135, + "grad_norm": 2.011892795562744, + "learning_rate": 4.9837220497090846e-05, + "loss": 5.4932, + "step": 6112 + }, + { + "epoch": 0.03635574269673613, + "grad_norm": 1.7751367092132568, + "learning_rate": 4.983716727638363e-05, + "loss": 5.4981, + "step": 6113 + }, + { + "epoch": 0.036361689980017126, + "grad_norm": 1.984121322631836, + "learning_rate": 4.983711404700603e-05, + "loss": 5.4801, + "step": 6114 + }, + { + "epoch": 0.03636763726329813, + "grad_norm": 1.9601882696151733, + "learning_rate": 4.983706080895804e-05, + "loss": 5.218, + "step": 6115 + }, + { + "epoch": 0.03637358454657912, + "grad_norm": 1.800227165222168, + "learning_rate": 4.9837007562239684e-05, + "loss": 5.5178, + "step": 6116 + }, + { + "epoch": 0.03637953182986012, + "grad_norm": 1.9257889986038208, + "learning_rate": 4.983695430685099e-05, + "loss": 5.6695, + "step": 6117 + }, + { + "epoch": 0.03638547911314112, + "grad_norm": 1.8011913299560547, + "learning_rate": 4.9836901042791976e-05, + "loss": 5.7478, + "step": 6118 + }, + { + "epoch": 0.036391426396422115, + "grad_norm": 1.8668690919876099, + "learning_rate": 4.983684777006264e-05, + "loss": 5.7027, + "step": 6119 + }, + { + "epoch": 0.03639737367970311, + "grad_norm": 1.898126244544983, + "learning_rate": 4.983679448866304e-05, + "loss": 5.5206, + "step": 6120 + }, + { + "epoch": 0.03640332096298411, + "grad_norm": 1.8264409303665161, + "learning_rate": 4.983674119859316e-05, + "loss": 5.4686, + "step": 6121 + }, + { + "epoch": 0.03640926824626511, + "grad_norm": 1.8090230226516724, + "learning_rate": 4.983668789985303e-05, + "loss": 5.4761, + "step": 6122 + }, + { + "epoch": 0.0364152155295461, + "grad_norm": 1.8193403482437134, + "learning_rate": 4.983663459244266e-05, + "loss": 5.3443, + "step": 6123 + }, + { + "epoch": 0.0364211628128271, + "grad_norm": 1.8199255466461182, + "learning_rate": 4.9836581276362095e-05, + "loss": 5.427, + "step": 6124 + }, + { + "epoch": 0.0364271100961081, + "grad_norm": 1.72145414352417, + "learning_rate": 4.9836527951611325e-05, + "loss": 5.4372, + "step": 6125 + }, + { + "epoch": 0.036433057379389094, + "grad_norm": 1.8164423704147339, + "learning_rate": 4.9836474618190386e-05, + "loss": 5.4702, + "step": 6126 + }, + { + "epoch": 0.03643900466267009, + "grad_norm": 1.897775650024414, + "learning_rate": 4.9836421276099287e-05, + "loss": 5.4259, + "step": 6127 + }, + { + "epoch": 0.03644495194595109, + "grad_norm": 1.851101279258728, + "learning_rate": 4.9836367925338046e-05, + "loss": 5.3837, + "step": 6128 + }, + { + "epoch": 0.03645089922923209, + "grad_norm": 1.749374508857727, + "learning_rate": 4.98363145659067e-05, + "loss": 5.3232, + "step": 6129 + }, + { + "epoch": 0.03645684651251308, + "grad_norm": 1.95986008644104, + "learning_rate": 4.9836261197805235e-05, + "loss": 5.2692, + "step": 6130 + }, + { + "epoch": 0.036462793795794084, + "grad_norm": 1.7947750091552734, + "learning_rate": 4.98362078210337e-05, + "loss": 5.409, + "step": 6131 + }, + { + "epoch": 0.03646874107907508, + "grad_norm": 2.119044303894043, + "learning_rate": 4.983615443559209e-05, + "loss": 5.5924, + "step": 6132 + }, + { + "epoch": 0.036474688362356074, + "grad_norm": 1.7285267114639282, + "learning_rate": 4.983610104148044e-05, + "loss": 5.6955, + "step": 6133 + }, + { + "epoch": 0.036480635645637076, + "grad_norm": 2.1711652278900146, + "learning_rate": 4.983604763869877e-05, + "loss": 5.1941, + "step": 6134 + }, + { + "epoch": 0.03648658292891807, + "grad_norm": 2.060039758682251, + "learning_rate": 4.983599422724709e-05, + "loss": 5.5131, + "step": 6135 + }, + { + "epoch": 0.036492530212199066, + "grad_norm": 1.6212393045425415, + "learning_rate": 4.9835940807125415e-05, + "loss": 5.4856, + "step": 6136 + }, + { + "epoch": 0.03649847749548007, + "grad_norm": 1.7602918148040771, + "learning_rate": 4.983588737833378e-05, + "loss": 5.4177, + "step": 6137 + }, + { + "epoch": 0.03650442477876106, + "grad_norm": 2.660930633544922, + "learning_rate": 4.983583394087218e-05, + "loss": 5.5879, + "step": 6138 + }, + { + "epoch": 0.03651037206204206, + "grad_norm": 2.3608336448669434, + "learning_rate": 4.9835780494740655e-05, + "loss": 5.3894, + "step": 6139 + }, + { + "epoch": 0.03651631934532305, + "grad_norm": 2.071632146835327, + "learning_rate": 4.983572703993922e-05, + "loss": 5.6185, + "step": 6140 + }, + { + "epoch": 0.036522266628604055, + "grad_norm": 1.7023842334747314, + "learning_rate": 4.983567357646788e-05, + "loss": 5.5648, + "step": 6141 + }, + { + "epoch": 0.03652821391188505, + "grad_norm": 2.2168798446655273, + "learning_rate": 4.983562010432667e-05, + "loss": 5.4578, + "step": 6142 + }, + { + "epoch": 0.036534161195166046, + "grad_norm": 2.0916104316711426, + "learning_rate": 4.98355666235156e-05, + "loss": 5.4977, + "step": 6143 + }, + { + "epoch": 0.03654010847844705, + "grad_norm": 1.7101606130599976, + "learning_rate": 4.9835513134034686e-05, + "loss": 5.4081, + "step": 6144 + }, + { + "epoch": 0.03654605576172804, + "grad_norm": 1.9058302640914917, + "learning_rate": 4.983545963588395e-05, + "loss": 5.2145, + "step": 6145 + }, + { + "epoch": 0.03655200304500904, + "grad_norm": 2.319023847579956, + "learning_rate": 4.9835406129063424e-05, + "loss": 5.3023, + "step": 6146 + }, + { + "epoch": 0.03655795032829004, + "grad_norm": 2.1135916709899902, + "learning_rate": 4.98353526135731e-05, + "loss": 5.4796, + "step": 6147 + }, + { + "epoch": 0.036563897611571035, + "grad_norm": 2.409088373184204, + "learning_rate": 4.983529908941302e-05, + "loss": 5.3124, + "step": 6148 + }, + { + "epoch": 0.03656984489485203, + "grad_norm": 1.8679871559143066, + "learning_rate": 4.9835245556583185e-05, + "loss": 5.3741, + "step": 6149 + }, + { + "epoch": 0.03657579217813303, + "grad_norm": 1.9335602521896362, + "learning_rate": 4.983519201508363e-05, + "loss": 5.3231, + "step": 6150 + }, + { + "epoch": 0.03658173946141403, + "grad_norm": 2.0352535247802734, + "learning_rate": 4.9835138464914366e-05, + "loss": 5.4643, + "step": 6151 + }, + { + "epoch": 0.03658768674469502, + "grad_norm": 2.4156594276428223, + "learning_rate": 4.983508490607541e-05, + "loss": 5.4092, + "step": 6152 + }, + { + "epoch": 0.03659363402797602, + "grad_norm": 2.1936473846435547, + "learning_rate": 4.983503133856678e-05, + "loss": 5.5093, + "step": 6153 + }, + { + "epoch": 0.03659958131125702, + "grad_norm": 1.6346958875656128, + "learning_rate": 4.98349777623885e-05, + "loss": 5.512, + "step": 6154 + }, + { + "epoch": 0.036605528594538014, + "grad_norm": 1.9810141324996948, + "learning_rate": 4.9834924177540584e-05, + "loss": 5.4981, + "step": 6155 + }, + { + "epoch": 0.03661147587781901, + "grad_norm": 2.1253950595855713, + "learning_rate": 4.9834870584023055e-05, + "loss": 5.4022, + "step": 6156 + }, + { + "epoch": 0.03661742316110001, + "grad_norm": 2.011754274368286, + "learning_rate": 4.9834816981835926e-05, + "loss": 5.6107, + "step": 6157 + }, + { + "epoch": 0.036623370444381007, + "grad_norm": 2.210934638977051, + "learning_rate": 4.983476337097922e-05, + "loss": 5.4348, + "step": 6158 + }, + { + "epoch": 0.036629317727662, + "grad_norm": 2.1351871490478516, + "learning_rate": 4.983470975145296e-05, + "loss": 5.2022, + "step": 6159 + }, + { + "epoch": 0.036635265010943004, + "grad_norm": 2.1564714908599854, + "learning_rate": 4.983465612325715e-05, + "loss": 5.3583, + "step": 6160 + }, + { + "epoch": 0.036641212294224, + "grad_norm": 1.9411755800247192, + "learning_rate": 4.983460248639182e-05, + "loss": 5.4643, + "step": 6161 + }, + { + "epoch": 0.036647159577504994, + "grad_norm": 2.129741907119751, + "learning_rate": 4.983454884085699e-05, + "loss": 5.3834, + "step": 6162 + }, + { + "epoch": 0.036653106860785996, + "grad_norm": 2.12172269821167, + "learning_rate": 4.983449518665268e-05, + "loss": 5.4418, + "step": 6163 + }, + { + "epoch": 0.03665905414406699, + "grad_norm": 2.097452163696289, + "learning_rate": 4.9834441523778893e-05, + "loss": 5.3741, + "step": 6164 + }, + { + "epoch": 0.036665001427347986, + "grad_norm": 2.0458765029907227, + "learning_rate": 4.983438785223567e-05, + "loss": 5.373, + "step": 6165 + }, + { + "epoch": 0.03667094871062899, + "grad_norm": 1.9431376457214355, + "learning_rate": 4.983433417202301e-05, + "loss": 5.4003, + "step": 6166 + }, + { + "epoch": 0.03667689599390998, + "grad_norm": 2.136819362640381, + "learning_rate": 4.983428048314095e-05, + "loss": 5.503, + "step": 6167 + }, + { + "epoch": 0.03668284327719098, + "grad_norm": 1.863153338432312, + "learning_rate": 4.983422678558949e-05, + "loss": 5.4357, + "step": 6168 + }, + { + "epoch": 0.03668879056047197, + "grad_norm": 1.9198437929153442, + "learning_rate": 4.9834173079368665e-05, + "loss": 5.4304, + "step": 6169 + }, + { + "epoch": 0.036694737843752975, + "grad_norm": 1.9080480337142944, + "learning_rate": 4.9834119364478484e-05, + "loss": 5.4329, + "step": 6170 + }, + { + "epoch": 0.03670068512703397, + "grad_norm": 1.9116952419281006, + "learning_rate": 4.983406564091897e-05, + "loss": 5.3248, + "step": 6171 + }, + { + "epoch": 0.036706632410314965, + "grad_norm": 2.007685661315918, + "learning_rate": 4.983401190869014e-05, + "loss": 5.3554, + "step": 6172 + }, + { + "epoch": 0.03671257969359597, + "grad_norm": 1.8134535551071167, + "learning_rate": 4.983395816779201e-05, + "loss": 5.2907, + "step": 6173 + }, + { + "epoch": 0.03671852697687696, + "grad_norm": 2.093061685562134, + "learning_rate": 4.9833904418224606e-05, + "loss": 5.4055, + "step": 6174 + }, + { + "epoch": 0.03672447426015796, + "grad_norm": 2.1263599395751953, + "learning_rate": 4.9833850659987934e-05, + "loss": 5.2758, + "step": 6175 + }, + { + "epoch": 0.03673042154343896, + "grad_norm": 1.9442895650863647, + "learning_rate": 4.983379689308203e-05, + "loss": 5.4183, + "step": 6176 + }, + { + "epoch": 0.036736368826719955, + "grad_norm": 1.9587830305099487, + "learning_rate": 4.98337431175069e-05, + "loss": 5.3624, + "step": 6177 + }, + { + "epoch": 0.03674231611000095, + "grad_norm": 1.9845789670944214, + "learning_rate": 4.9833689333262565e-05, + "loss": 5.3933, + "step": 6178 + }, + { + "epoch": 0.03674826339328195, + "grad_norm": 1.9748643636703491, + "learning_rate": 4.9833635540349055e-05, + "loss": 5.5221, + "step": 6179 + }, + { + "epoch": 0.03675421067656295, + "grad_norm": 1.8139559030532837, + "learning_rate": 4.983358173876638e-05, + "loss": 5.5524, + "step": 6180 + }, + { + "epoch": 0.03676015795984394, + "grad_norm": 1.93784499168396, + "learning_rate": 4.9833527928514546e-05, + "loss": 5.7145, + "step": 6181 + }, + { + "epoch": 0.03676610524312494, + "grad_norm": 1.9064222574234009, + "learning_rate": 4.9833474109593594e-05, + "loss": 5.5283, + "step": 6182 + }, + { + "epoch": 0.03677205252640594, + "grad_norm": 1.7044670581817627, + "learning_rate": 4.9833420282003524e-05, + "loss": 5.2877, + "step": 6183 + }, + { + "epoch": 0.036777999809686934, + "grad_norm": 1.8328427076339722, + "learning_rate": 4.983336644574437e-05, + "loss": 5.5019, + "step": 6184 + }, + { + "epoch": 0.03678394709296793, + "grad_norm": 1.600780725479126, + "learning_rate": 4.983331260081614e-05, + "loss": 5.5347, + "step": 6185 + }, + { + "epoch": 0.03678989437624893, + "grad_norm": 1.8333978652954102, + "learning_rate": 4.983325874721886e-05, + "loss": 5.5127, + "step": 6186 + }, + { + "epoch": 0.036795841659529926, + "grad_norm": 1.8825682401657104, + "learning_rate": 4.9833204884952546e-05, + "loss": 5.5338, + "step": 6187 + }, + { + "epoch": 0.03680178894281092, + "grad_norm": 1.6875951290130615, + "learning_rate": 4.983315101401721e-05, + "loss": 5.2465, + "step": 6188 + }, + { + "epoch": 0.036807736226091924, + "grad_norm": 1.6224017143249512, + "learning_rate": 4.983309713441289e-05, + "loss": 5.4741, + "step": 6189 + }, + { + "epoch": 0.03681368350937292, + "grad_norm": 1.991721272468567, + "learning_rate": 4.983304324613958e-05, + "loss": 5.4547, + "step": 6190 + }, + { + "epoch": 0.036819630792653914, + "grad_norm": 1.843961238861084, + "learning_rate": 4.983298934919732e-05, + "loss": 5.3262, + "step": 6191 + }, + { + "epoch": 0.036825578075934916, + "grad_norm": 1.8342533111572266, + "learning_rate": 4.983293544358612e-05, + "loss": 5.6808, + "step": 6192 + }, + { + "epoch": 0.03683152535921591, + "grad_norm": 1.8796159029006958, + "learning_rate": 4.983288152930599e-05, + "loss": 5.5454, + "step": 6193 + }, + { + "epoch": 0.036837472642496906, + "grad_norm": 1.9033316373825073, + "learning_rate": 4.983282760635696e-05, + "loss": 5.3566, + "step": 6194 + }, + { + "epoch": 0.03684341992577791, + "grad_norm": 1.915873408317566, + "learning_rate": 4.9832773674739054e-05, + "loss": 5.4555, + "step": 6195 + }, + { + "epoch": 0.0368493672090589, + "grad_norm": 1.8510993719100952, + "learning_rate": 4.983271973445228e-05, + "loss": 5.5042, + "step": 6196 + }, + { + "epoch": 0.0368553144923399, + "grad_norm": 1.7180782556533813, + "learning_rate": 4.983266578549666e-05, + "loss": 5.4671, + "step": 6197 + }, + { + "epoch": 0.03686126177562089, + "grad_norm": 1.7828874588012695, + "learning_rate": 4.983261182787221e-05, + "loss": 5.4943, + "step": 6198 + }, + { + "epoch": 0.036867209058901895, + "grad_norm": 1.5032141208648682, + "learning_rate": 4.983255786157895e-05, + "loss": 5.3881, + "step": 6199 + }, + { + "epoch": 0.03687315634218289, + "grad_norm": 2.530954599380493, + "learning_rate": 4.983250388661691e-05, + "loss": 5.4449, + "step": 6200 + }, + { + "epoch": 0.036879103625463885, + "grad_norm": 2.011044979095459, + "learning_rate": 4.983244990298609e-05, + "loss": 5.2722, + "step": 6201 + }, + { + "epoch": 0.03688505090874489, + "grad_norm": 2.2209532260894775, + "learning_rate": 4.9832395910686525e-05, + "loss": 5.0932, + "step": 6202 + }, + { + "epoch": 0.03689099819202588, + "grad_norm": 1.8695623874664307, + "learning_rate": 4.983234190971823e-05, + "loss": 5.2891, + "step": 6203 + }, + { + "epoch": 0.03689694547530688, + "grad_norm": 2.172349691390991, + "learning_rate": 4.983228790008121e-05, + "loss": 5.578, + "step": 6204 + }, + { + "epoch": 0.03690289275858788, + "grad_norm": 2.1099209785461426, + "learning_rate": 4.9832233881775505e-05, + "loss": 5.3708, + "step": 6205 + }, + { + "epoch": 0.036908840041868875, + "grad_norm": 2.16737961769104, + "learning_rate": 4.9832179854801116e-05, + "loss": 5.303, + "step": 6206 + }, + { + "epoch": 0.03691478732514987, + "grad_norm": 2.248220682144165, + "learning_rate": 4.983212581915807e-05, + "loss": 5.362, + "step": 6207 + }, + { + "epoch": 0.03692073460843087, + "grad_norm": 2.0701045989990234, + "learning_rate": 4.983207177484639e-05, + "loss": 5.4528, + "step": 6208 + }, + { + "epoch": 0.03692668189171187, + "grad_norm": 1.9989019632339478, + "learning_rate": 4.983201772186609e-05, + "loss": 5.786, + "step": 6209 + }, + { + "epoch": 0.03693262917499286, + "grad_norm": 1.9126088619232178, + "learning_rate": 4.983196366021719e-05, + "loss": 5.2312, + "step": 6210 + }, + { + "epoch": 0.03693857645827386, + "grad_norm": 2.1317548751831055, + "learning_rate": 4.9831909589899695e-05, + "loss": 5.3028, + "step": 6211 + }, + { + "epoch": 0.03694452374155486, + "grad_norm": 2.164898157119751, + "learning_rate": 4.983185551091365e-05, + "loss": 5.3186, + "step": 6212 + }, + { + "epoch": 0.036950471024835854, + "grad_norm": 2.1085855960845947, + "learning_rate": 4.983180142325906e-05, + "loss": 5.3026, + "step": 6213 + }, + { + "epoch": 0.03695641830811685, + "grad_norm": 1.8321222066879272, + "learning_rate": 4.983174732693594e-05, + "loss": 5.6632, + "step": 6214 + }, + { + "epoch": 0.03696236559139785, + "grad_norm": 2.0537941455841064, + "learning_rate": 4.983169322194432e-05, + "loss": 5.2269, + "step": 6215 + }, + { + "epoch": 0.036968312874678846, + "grad_norm": 1.9598063230514526, + "learning_rate": 4.98316391082842e-05, + "loss": 5.4974, + "step": 6216 + }, + { + "epoch": 0.03697426015795984, + "grad_norm": 2.3764376640319824, + "learning_rate": 4.983158498595563e-05, + "loss": 5.7715, + "step": 6217 + }, + { + "epoch": 0.036980207441240844, + "grad_norm": 1.8938835859298706, + "learning_rate": 4.9831530854958595e-05, + "loss": 5.5577, + "step": 6218 + }, + { + "epoch": 0.03698615472452184, + "grad_norm": 2.2023189067840576, + "learning_rate": 4.9831476715293134e-05, + "loss": 5.2596, + "step": 6219 + }, + { + "epoch": 0.036992102007802834, + "grad_norm": 1.9010800123214722, + "learning_rate": 4.9831422566959266e-05, + "loss": 5.3313, + "step": 6220 + }, + { + "epoch": 0.036998049291083836, + "grad_norm": 1.9679474830627441, + "learning_rate": 4.9831368409957e-05, + "loss": 5.2701, + "step": 6221 + }, + { + "epoch": 0.03700399657436483, + "grad_norm": 1.903558373451233, + "learning_rate": 4.983131424428635e-05, + "loss": 5.2821, + "step": 6222 + }, + { + "epoch": 0.037009943857645826, + "grad_norm": 1.976114273071289, + "learning_rate": 4.983126006994736e-05, + "loss": 5.374, + "step": 6223 + }, + { + "epoch": 0.03701589114092683, + "grad_norm": 2.9803311824798584, + "learning_rate": 4.983120588694003e-05, + "loss": 5.3576, + "step": 6224 + }, + { + "epoch": 0.03702183842420782, + "grad_norm": 1.5921218395233154, + "learning_rate": 4.983115169526438e-05, + "loss": 5.1654, + "step": 6225 + }, + { + "epoch": 0.03702778570748882, + "grad_norm": 1.7458349466323853, + "learning_rate": 4.983109749492043e-05, + "loss": 5.1038, + "step": 6226 + }, + { + "epoch": 0.03703373299076981, + "grad_norm": 1.9425132274627686, + "learning_rate": 4.983104328590821e-05, + "loss": 5.3815, + "step": 6227 + }, + { + "epoch": 0.037039680274050815, + "grad_norm": 1.9506715536117554, + "learning_rate": 4.983098906822772e-05, + "loss": 5.2215, + "step": 6228 + }, + { + "epoch": 0.03704562755733181, + "grad_norm": 1.8596410751342773, + "learning_rate": 4.983093484187899e-05, + "loss": 5.2058, + "step": 6229 + }, + { + "epoch": 0.037051574840612805, + "grad_norm": 1.720473289489746, + "learning_rate": 4.9830880606862043e-05, + "loss": 5.2701, + "step": 6230 + }, + { + "epoch": 0.03705752212389381, + "grad_norm": 1.7786411046981812, + "learning_rate": 4.983082636317688e-05, + "loss": 5.3216, + "step": 6231 + }, + { + "epoch": 0.0370634694071748, + "grad_norm": 3.6291537284851074, + "learning_rate": 4.983077211082354e-05, + "loss": 5.2282, + "step": 6232 + }, + { + "epoch": 0.0370694166904558, + "grad_norm": 1.7453030347824097, + "learning_rate": 4.983071784980203e-05, + "loss": 5.2667, + "step": 6233 + }, + { + "epoch": 0.0370753639737368, + "grad_norm": 1.7036694288253784, + "learning_rate": 4.983066358011238e-05, + "loss": 5.3023, + "step": 6234 + }, + { + "epoch": 0.037081311257017795, + "grad_norm": 1.7196505069732666, + "learning_rate": 4.9830609301754595e-05, + "loss": 5.2211, + "step": 6235 + }, + { + "epoch": 0.03708725854029879, + "grad_norm": 3.4630305767059326, + "learning_rate": 4.983055501472871e-05, + "loss": 5.6159, + "step": 6236 + }, + { + "epoch": 0.03709320582357979, + "grad_norm": 2.9739367961883545, + "learning_rate": 4.9830500719034726e-05, + "loss": 5.4477, + "step": 6237 + }, + { + "epoch": 0.03709915310686079, + "grad_norm": 2.760664463043213, + "learning_rate": 4.983044641467267e-05, + "loss": 5.0879, + "step": 6238 + }, + { + "epoch": 0.03710510039014178, + "grad_norm": 2.166203022003174, + "learning_rate": 4.9830392101642566e-05, + "loss": 5.5635, + "step": 6239 + }, + { + "epoch": 0.03711104767342278, + "grad_norm": 2.3798410892486572, + "learning_rate": 4.9830337779944425e-05, + "loss": 5.0676, + "step": 6240 + }, + { + "epoch": 0.03711699495670378, + "grad_norm": 2.3990557193756104, + "learning_rate": 4.983028344957827e-05, + "loss": 5.2788, + "step": 6241 + }, + { + "epoch": 0.037122942239984774, + "grad_norm": 2.487978458404541, + "learning_rate": 4.9830229110544124e-05, + "loss": 5.852, + "step": 6242 + }, + { + "epoch": 0.03712888952326577, + "grad_norm": 2.304749011993408, + "learning_rate": 4.9830174762842e-05, + "loss": 6.0886, + "step": 6243 + }, + { + "epoch": 0.03713483680654677, + "grad_norm": 2.169614791870117, + "learning_rate": 4.983012040647191e-05, + "loss": 6.1178, + "step": 6244 + }, + { + "epoch": 0.037140784089827766, + "grad_norm": 2.119131326675415, + "learning_rate": 4.98300660414339e-05, + "loss": 6.25, + "step": 6245 + }, + { + "epoch": 0.03714673137310876, + "grad_norm": 2.3797547817230225, + "learning_rate": 4.9830011667727964e-05, + "loss": 5.879, + "step": 6246 + }, + { + "epoch": 0.03715267865638976, + "grad_norm": 2.303718328475952, + "learning_rate": 4.982995728535411e-05, + "loss": 6.0015, + "step": 6247 + }, + { + "epoch": 0.03715862593967076, + "grad_norm": 2.867103099822998, + "learning_rate": 4.9829902894312396e-05, + "loss": 5.8726, + "step": 6248 + }, + { + "epoch": 0.037164573222951754, + "grad_norm": 2.4248557090759277, + "learning_rate": 4.9829848494602806e-05, + "loss": 5.6579, + "step": 6249 + }, + { + "epoch": 0.037170520506232756, + "grad_norm": 2.2622148990631104, + "learning_rate": 4.982979408622538e-05, + "loss": 5.7677, + "step": 6250 + }, + { + "epoch": 0.03717646778951375, + "grad_norm": 2.320502996444702, + "learning_rate": 4.9829739669180126e-05, + "loss": 5.7362, + "step": 6251 + }, + { + "epoch": 0.037182415072794746, + "grad_norm": 2.2096636295318604, + "learning_rate": 4.9829685243467065e-05, + "loss": 5.9069, + "step": 6252 + }, + { + "epoch": 0.03718836235607575, + "grad_norm": 2.620361089706421, + "learning_rate": 4.982963080908623e-05, + "loss": 5.9419, + "step": 6253 + }, + { + "epoch": 0.03719430963935674, + "grad_norm": 2.478158950805664, + "learning_rate": 4.982957636603761e-05, + "loss": 6.4776, + "step": 6254 + }, + { + "epoch": 0.03720025692263774, + "grad_norm": 2.5912528038024902, + "learning_rate": 4.982952191432125e-05, + "loss": 5.7176, + "step": 6255 + }, + { + "epoch": 0.03720620420591873, + "grad_norm": 2.57177734375, + "learning_rate": 4.982946745393716e-05, + "loss": 5.4271, + "step": 6256 + }, + { + "epoch": 0.037212151489199735, + "grad_norm": 2.424567699432373, + "learning_rate": 4.982941298488535e-05, + "loss": 5.82, + "step": 6257 + }, + { + "epoch": 0.03721809877248073, + "grad_norm": 2.477827548980713, + "learning_rate": 4.9829358507165856e-05, + "loss": 5.7961, + "step": 6258 + }, + { + "epoch": 0.037224046055761725, + "grad_norm": 2.0598270893096924, + "learning_rate": 4.982930402077869e-05, + "loss": 5.9264, + "step": 6259 + }, + { + "epoch": 0.03722999333904273, + "grad_norm": 2.0599095821380615, + "learning_rate": 4.9829249525723875e-05, + "loss": 6.0518, + "step": 6260 + }, + { + "epoch": 0.03723594062232372, + "grad_norm": 2.110170841217041, + "learning_rate": 4.982919502200142e-05, + "loss": 5.8631, + "step": 6261 + }, + { + "epoch": 0.03724188790560472, + "grad_norm": 2.333972930908203, + "learning_rate": 4.982914050961135e-05, + "loss": 5.5361, + "step": 6262 + }, + { + "epoch": 0.03724783518888572, + "grad_norm": 2.2322769165039062, + "learning_rate": 4.982908598855369e-05, + "loss": 5.8002, + "step": 6263 + }, + { + "epoch": 0.037253782472166715, + "grad_norm": 1.9915717840194702, + "learning_rate": 4.982903145882845e-05, + "loss": 5.7096, + "step": 6264 + }, + { + "epoch": 0.03725972975544771, + "grad_norm": 2.2031619548797607, + "learning_rate": 4.9828976920435645e-05, + "loss": 5.5716, + "step": 6265 + }, + { + "epoch": 0.03726567703872871, + "grad_norm": 2.9422314167022705, + "learning_rate": 4.9828922373375295e-05, + "loss": 5.929, + "step": 6266 + }, + { + "epoch": 0.03727162432200971, + "grad_norm": 3.264784336090088, + "learning_rate": 4.982886781764744e-05, + "loss": 5.9801, + "step": 6267 + }, + { + "epoch": 0.0372775716052907, + "grad_norm": 2.8314197063446045, + "learning_rate": 4.982881325325208e-05, + "loss": 6.0173, + "step": 6268 + }, + { + "epoch": 0.0372835188885717, + "grad_norm": 2.9550328254699707, + "learning_rate": 4.9828758680189234e-05, + "loss": 5.9838, + "step": 6269 + }, + { + "epoch": 0.0372894661718527, + "grad_norm": 2.6827526092529297, + "learning_rate": 4.9828704098458924e-05, + "loss": 6.0235, + "step": 6270 + }, + { + "epoch": 0.037295413455133694, + "grad_norm": 2.7174222469329834, + "learning_rate": 4.982864950806118e-05, + "loss": 5.8315, + "step": 6271 + }, + { + "epoch": 0.03730136073841469, + "grad_norm": 2.6177315711975098, + "learning_rate": 4.9828594908996e-05, + "loss": 5.8577, + "step": 6272 + }, + { + "epoch": 0.03730730802169569, + "grad_norm": 2.449669361114502, + "learning_rate": 4.982854030126342e-05, + "loss": 5.9591, + "step": 6273 + }, + { + "epoch": 0.037313255304976686, + "grad_norm": 2.5328989028930664, + "learning_rate": 4.9828485684863446e-05, + "loss": 5.7764, + "step": 6274 + }, + { + "epoch": 0.03731920258825768, + "grad_norm": 2.2581989765167236, + "learning_rate": 4.982843105979611e-05, + "loss": 5.9524, + "step": 6275 + }, + { + "epoch": 0.03732514987153868, + "grad_norm": 2.261212110519409, + "learning_rate": 4.982837642606142e-05, + "loss": 5.5814, + "step": 6276 + }, + { + "epoch": 0.03733109715481968, + "grad_norm": 2.2957348823547363, + "learning_rate": 4.98283217836594e-05, + "loss": 5.6967, + "step": 6277 + }, + { + "epoch": 0.037337044438100674, + "grad_norm": 2.814037322998047, + "learning_rate": 4.982826713259008e-05, + "loss": 5.8787, + "step": 6278 + }, + { + "epoch": 0.037342991721381676, + "grad_norm": 2.678133249282837, + "learning_rate": 4.9828212472853464e-05, + "loss": 5.94, + "step": 6279 + }, + { + "epoch": 0.03734893900466267, + "grad_norm": 2.2949652671813965, + "learning_rate": 4.982815780444957e-05, + "loss": 5.7263, + "step": 6280 + }, + { + "epoch": 0.037354886287943666, + "grad_norm": 2.4542131423950195, + "learning_rate": 4.982810312737842e-05, + "loss": 5.8317, + "step": 6281 + }, + { + "epoch": 0.03736083357122467, + "grad_norm": 2.7850544452667236, + "learning_rate": 4.982804844164005e-05, + "loss": 5.5631, + "step": 6282 + }, + { + "epoch": 0.03736678085450566, + "grad_norm": 2.6285061836242676, + "learning_rate": 4.9827993747234454e-05, + "loss": 5.6212, + "step": 6283 + }, + { + "epoch": 0.03737272813778666, + "grad_norm": 2.602590799331665, + "learning_rate": 4.9827939044161666e-05, + "loss": 5.5529, + "step": 6284 + }, + { + "epoch": 0.03737867542106765, + "grad_norm": 2.6196670532226562, + "learning_rate": 4.98278843324217e-05, + "loss": 5.6915, + "step": 6285 + }, + { + "epoch": 0.037384622704348655, + "grad_norm": 2.7072317600250244, + "learning_rate": 4.982782961201457e-05, + "loss": 5.7535, + "step": 6286 + }, + { + "epoch": 0.03739056998762965, + "grad_norm": 2.626033067703247, + "learning_rate": 4.982777488294031e-05, + "loss": 5.6053, + "step": 6287 + }, + { + "epoch": 0.037396517270910645, + "grad_norm": 1.8426648378372192, + "learning_rate": 4.982772014519892e-05, + "loss": 5.6167, + "step": 6288 + }, + { + "epoch": 0.03740246455419165, + "grad_norm": 2.5587830543518066, + "learning_rate": 4.9827665398790445e-05, + "loss": 5.6442, + "step": 6289 + }, + { + "epoch": 0.03740841183747264, + "grad_norm": 2.6163039207458496, + "learning_rate": 4.9827610643714877e-05, + "loss": 5.699, + "step": 6290 + }, + { + "epoch": 0.03741435912075364, + "grad_norm": 2.5752358436584473, + "learning_rate": 4.982755587997225e-05, + "loss": 5.666, + "step": 6291 + }, + { + "epoch": 0.03742030640403464, + "grad_norm": 2.6609575748443604, + "learning_rate": 4.982750110756258e-05, + "loss": 5.5634, + "step": 6292 + }, + { + "epoch": 0.037426253687315635, + "grad_norm": 2.724731683731079, + "learning_rate": 4.9827446326485884e-05, + "loss": 5.6259, + "step": 6293 + }, + { + "epoch": 0.03743220097059663, + "grad_norm": 2.5849807262420654, + "learning_rate": 4.9827391536742185e-05, + "loss": 5.6182, + "step": 6294 + }, + { + "epoch": 0.03743814825387763, + "grad_norm": 2.6737449169158936, + "learning_rate": 4.9827336738331496e-05, + "loss": 5.5426, + "step": 6295 + }, + { + "epoch": 0.03744409553715863, + "grad_norm": 2.5739669799804688, + "learning_rate": 4.9827281931253844e-05, + "loss": 5.6283, + "step": 6296 + }, + { + "epoch": 0.03745004282043962, + "grad_norm": 2.652730703353882, + "learning_rate": 4.982722711550924e-05, + "loss": 5.5241, + "step": 6297 + }, + { + "epoch": 0.037455990103720624, + "grad_norm": 2.7140653133392334, + "learning_rate": 4.982717229109772e-05, + "loss": 5.7052, + "step": 6298 + }, + { + "epoch": 0.03746193738700162, + "grad_norm": 2.1617860794067383, + "learning_rate": 4.982711745801928e-05, + "loss": 5.6224, + "step": 6299 + }, + { + "epoch": 0.037467884670282614, + "grad_norm": 2.1400585174560547, + "learning_rate": 4.982706261627395e-05, + "loss": 5.5753, + "step": 6300 + }, + { + "epoch": 0.03747383195356361, + "grad_norm": 2.4439101219177246, + "learning_rate": 4.9827007765861754e-05, + "loss": 5.6219, + "step": 6301 + }, + { + "epoch": 0.03747977923684461, + "grad_norm": 2.507141351699829, + "learning_rate": 4.9826952906782697e-05, + "loss": 5.6666, + "step": 6302 + }, + { + "epoch": 0.037485726520125606, + "grad_norm": 2.2664029598236084, + "learning_rate": 4.982689803903682e-05, + "loss": 5.7792, + "step": 6303 + }, + { + "epoch": 0.0374916738034066, + "grad_norm": 2.49678635597229, + "learning_rate": 4.982684316262411e-05, + "loss": 5.5899, + "step": 6304 + }, + { + "epoch": 0.0374976210866876, + "grad_norm": 2.244603395462036, + "learning_rate": 4.9826788277544625e-05, + "loss": 5.4624, + "step": 6305 + }, + { + "epoch": 0.0375035683699686, + "grad_norm": 2.144343376159668, + "learning_rate": 4.9826733383798366e-05, + "loss": 5.3428, + "step": 6306 + }, + { + "epoch": 0.037509515653249594, + "grad_norm": 1.7709565162658691, + "learning_rate": 4.982667848138534e-05, + "loss": 5.3596, + "step": 6307 + }, + { + "epoch": 0.037515462936530596, + "grad_norm": 2.0245232582092285, + "learning_rate": 4.9826623570305574e-05, + "loss": 5.4005, + "step": 6308 + }, + { + "epoch": 0.03752141021981159, + "grad_norm": 2.5346829891204834, + "learning_rate": 4.9826568650559095e-05, + "loss": 5.5089, + "step": 6309 + }, + { + "epoch": 0.037527357503092586, + "grad_norm": 2.638684034347534, + "learning_rate": 4.982651372214592e-05, + "loss": 5.6847, + "step": 6310 + }, + { + "epoch": 0.03753330478637359, + "grad_norm": 2.024423122406006, + "learning_rate": 4.982645878506606e-05, + "loss": 5.3633, + "step": 6311 + }, + { + "epoch": 0.03753925206965458, + "grad_norm": 1.983167290687561, + "learning_rate": 4.982640383931955e-05, + "loss": 5.2086, + "step": 6312 + }, + { + "epoch": 0.03754519935293558, + "grad_norm": 1.8388524055480957, + "learning_rate": 4.982634888490639e-05, + "loss": 5.1904, + "step": 6313 + }, + { + "epoch": 0.03755114663621657, + "grad_norm": 1.8280584812164307, + "learning_rate": 4.982629392182661e-05, + "loss": 5.3072, + "step": 6314 + }, + { + "epoch": 0.037557093919497575, + "grad_norm": 1.6278408765792847, + "learning_rate": 4.982623895008023e-05, + "loss": 5.3003, + "step": 6315 + }, + { + "epoch": 0.03756304120277857, + "grad_norm": 2.0519096851348877, + "learning_rate": 4.982618396966726e-05, + "loss": 5.3494, + "step": 6316 + }, + { + "epoch": 0.037568988486059565, + "grad_norm": 1.935744285583496, + "learning_rate": 4.982612898058773e-05, + "loss": 5.6993, + "step": 6317 + }, + { + "epoch": 0.03757493576934057, + "grad_norm": 1.882163166999817, + "learning_rate": 4.9826073982841656e-05, + "loss": 5.758, + "step": 6318 + }, + { + "epoch": 0.03758088305262156, + "grad_norm": 1.7747882604599, + "learning_rate": 4.982601897642906e-05, + "loss": 5.1501, + "step": 6319 + }, + { + "epoch": 0.03758683033590256, + "grad_norm": 2.044093370437622, + "learning_rate": 4.982596396134995e-05, + "loss": 5.2801, + "step": 6320 + }, + { + "epoch": 0.03759277761918356, + "grad_norm": 1.739441990852356, + "learning_rate": 4.9825908937604346e-05, + "loss": 5.1619, + "step": 6321 + }, + { + "epoch": 0.037598724902464555, + "grad_norm": 2.0353312492370605, + "learning_rate": 4.982585390519229e-05, + "loss": 5.6796, + "step": 6322 + }, + { + "epoch": 0.03760467218574555, + "grad_norm": 2.076667308807373, + "learning_rate": 4.9825798864113774e-05, + "loss": 6.2522, + "step": 6323 + }, + { + "epoch": 0.03761061946902655, + "grad_norm": 2.773676633834839, + "learning_rate": 4.982574381436883e-05, + "loss": 5.879, + "step": 6324 + }, + { + "epoch": 0.03761656675230755, + "grad_norm": 2.2013933658599854, + "learning_rate": 4.982568875595748e-05, + "loss": 6.0341, + "step": 6325 + }, + { + "epoch": 0.03762251403558854, + "grad_norm": 2.288806915283203, + "learning_rate": 4.9825633688879736e-05, + "loss": 6.219, + "step": 6326 + }, + { + "epoch": 0.037628461318869544, + "grad_norm": 2.874372720718384, + "learning_rate": 4.982557861313561e-05, + "loss": 5.7616, + "step": 6327 + }, + { + "epoch": 0.03763440860215054, + "grad_norm": 2.7471537590026855, + "learning_rate": 4.982552352872515e-05, + "loss": 5.7214, + "step": 6328 + }, + { + "epoch": 0.037640355885431534, + "grad_norm": 2.475513458251953, + "learning_rate": 4.982546843564834e-05, + "loss": 6.0039, + "step": 6329 + }, + { + "epoch": 0.03764630316871253, + "grad_norm": 2.5376412868499756, + "learning_rate": 4.982541333390523e-05, + "loss": 6.3042, + "step": 6330 + }, + { + "epoch": 0.03765225045199353, + "grad_norm": 2.599989414215088, + "learning_rate": 4.9825358223495814e-05, + "loss": 6.488, + "step": 6331 + }, + { + "epoch": 0.037658197735274526, + "grad_norm": 2.2657089233398438, + "learning_rate": 4.9825303104420115e-05, + "loss": 6.2743, + "step": 6332 + }, + { + "epoch": 0.03766414501855552, + "grad_norm": 2.303926467895508, + "learning_rate": 4.982524797667818e-05, + "loss": 6.3888, + "step": 6333 + }, + { + "epoch": 0.03767009230183652, + "grad_norm": 2.771775007247925, + "learning_rate": 4.982519284026999e-05, + "loss": 6.0911, + "step": 6334 + }, + { + "epoch": 0.03767603958511752, + "grad_norm": 2.492748260498047, + "learning_rate": 4.982513769519559e-05, + "loss": 5.9905, + "step": 6335 + }, + { + "epoch": 0.03768198686839851, + "grad_norm": 2.294985771179199, + "learning_rate": 4.982508254145498e-05, + "loss": 6.4574, + "step": 6336 + }, + { + "epoch": 0.037687934151679515, + "grad_norm": 2.6514554023742676, + "learning_rate": 4.9825027379048205e-05, + "loss": 6.1541, + "step": 6337 + }, + { + "epoch": 0.03769388143496051, + "grad_norm": 2.0114963054656982, + "learning_rate": 4.982497220797526e-05, + "loss": 6.0602, + "step": 6338 + }, + { + "epoch": 0.037699828718241506, + "grad_norm": 2.6345295906066895, + "learning_rate": 4.982491702823618e-05, + "loss": 6.024, + "step": 6339 + }, + { + "epoch": 0.03770577600152251, + "grad_norm": 2.619980573654175, + "learning_rate": 4.982486183983097e-05, + "loss": 6.0642, + "step": 6340 + }, + { + "epoch": 0.0377117232848035, + "grad_norm": 2.491279125213623, + "learning_rate": 4.9824806642759664e-05, + "loss": 5.8517, + "step": 6341 + }, + { + "epoch": 0.0377176705680845, + "grad_norm": 2.5161385536193848, + "learning_rate": 4.982475143702227e-05, + "loss": 5.7467, + "step": 6342 + }, + { + "epoch": 0.03772361785136549, + "grad_norm": 2.3237602710723877, + "learning_rate": 4.982469622261882e-05, + "loss": 5.801, + "step": 6343 + }, + { + "epoch": 0.037729565134646495, + "grad_norm": 2.21382999420166, + "learning_rate": 4.9824640999549314e-05, + "loss": 5.968, + "step": 6344 + }, + { + "epoch": 0.03773551241792749, + "grad_norm": 2.1770498752593994, + "learning_rate": 4.9824585767813794e-05, + "loss": 6.2998, + "step": 6345 + }, + { + "epoch": 0.037741459701208485, + "grad_norm": 2.321563720703125, + "learning_rate": 4.982453052741225e-05, + "loss": 5.631, + "step": 6346 + }, + { + "epoch": 0.03774740698448949, + "grad_norm": 3.2769439220428467, + "learning_rate": 4.982447527834473e-05, + "loss": 5.4845, + "step": 6347 + }, + { + "epoch": 0.03775335426777048, + "grad_norm": 2.954331874847412, + "learning_rate": 4.9824420020611244e-05, + "loss": 5.2, + "step": 6348 + }, + { + "epoch": 0.03775930155105148, + "grad_norm": 2.735182523727417, + "learning_rate": 4.98243647542118e-05, + "loss": 5.1907, + "step": 6349 + }, + { + "epoch": 0.03776524883433248, + "grad_norm": 2.872142791748047, + "learning_rate": 4.982430947914644e-05, + "loss": 5.5159, + "step": 6350 + }, + { + "epoch": 0.037771196117613474, + "grad_norm": 3.14219331741333, + "learning_rate": 4.982425419541517e-05, + "loss": 5.0843, + "step": 6351 + }, + { + "epoch": 0.03777714340089447, + "grad_norm": 2.2689874172210693, + "learning_rate": 4.9824198903018e-05, + "loss": 6.0446, + "step": 6352 + }, + { + "epoch": 0.03778309068417547, + "grad_norm": 2.3468856811523438, + "learning_rate": 4.982414360195496e-05, + "loss": 5.952, + "step": 6353 + }, + { + "epoch": 0.03778903796745647, + "grad_norm": 2.944509983062744, + "learning_rate": 4.9824088292226065e-05, + "loss": 5.4918, + "step": 6354 + }, + { + "epoch": 0.03779498525073746, + "grad_norm": 2.8139286041259766, + "learning_rate": 4.982403297383135e-05, + "loss": 5.3296, + "step": 6355 + }, + { + "epoch": 0.037800932534018464, + "grad_norm": 2.540224552154541, + "learning_rate": 4.982397764677081e-05, + "loss": 5.3464, + "step": 6356 + }, + { + "epoch": 0.03780687981729946, + "grad_norm": 2.56709885597229, + "learning_rate": 4.982392231104448e-05, + "loss": 5.2313, + "step": 6357 + }, + { + "epoch": 0.037812827100580454, + "grad_norm": 2.2051165103912354, + "learning_rate": 4.982386696665238e-05, + "loss": 5.7783, + "step": 6358 + }, + { + "epoch": 0.03781877438386145, + "grad_norm": 2.5773870944976807, + "learning_rate": 4.9823811613594515e-05, + "loss": 5.6691, + "step": 6359 + }, + { + "epoch": 0.03782472166714245, + "grad_norm": 2.5163073539733887, + "learning_rate": 4.982375625187092e-05, + "loss": 5.7936, + "step": 6360 + }, + { + "epoch": 0.037830668950423446, + "grad_norm": 2.4268851280212402, + "learning_rate": 4.98237008814816e-05, + "loss": 5.8116, + "step": 6361 + }, + { + "epoch": 0.03783661623370444, + "grad_norm": 2.397402286529541, + "learning_rate": 4.9823645502426597e-05, + "loss": 5.9895, + "step": 6362 + }, + { + "epoch": 0.03784256351698544, + "grad_norm": 2.590672731399536, + "learning_rate": 4.98235901147059e-05, + "loss": 5.9022, + "step": 6363 + }, + { + "epoch": 0.03784851080026644, + "grad_norm": 2.268540859222412, + "learning_rate": 4.9823534718319557e-05, + "loss": 5.8958, + "step": 6364 + }, + { + "epoch": 0.03785445808354743, + "grad_norm": 2.1419460773468018, + "learning_rate": 4.982347931326757e-05, + "loss": 5.8446, + "step": 6365 + }, + { + "epoch": 0.037860405366828435, + "grad_norm": 2.3988053798675537, + "learning_rate": 4.9823423899549957e-05, + "loss": 6.2267, + "step": 6366 + }, + { + "epoch": 0.03786635265010943, + "grad_norm": 2.120121955871582, + "learning_rate": 4.9823368477166755e-05, + "loss": 6.1352, + "step": 6367 + }, + { + "epoch": 0.037872299933390426, + "grad_norm": 2.274610996246338, + "learning_rate": 4.982331304611796e-05, + "loss": 6.1342, + "step": 6368 + }, + { + "epoch": 0.03787824721667143, + "grad_norm": 1.6934765577316284, + "learning_rate": 4.98232576064036e-05, + "loss": 5.7969, + "step": 6369 + }, + { + "epoch": 0.03788419449995242, + "grad_norm": 2.62416672706604, + "learning_rate": 4.982320215802371e-05, + "loss": 5.9669, + "step": 6370 + }, + { + "epoch": 0.03789014178323342, + "grad_norm": 2.416639804840088, + "learning_rate": 4.98231467009783e-05, + "loss": 5.9628, + "step": 6371 + }, + { + "epoch": 0.03789608906651441, + "grad_norm": 2.049412965774536, + "learning_rate": 4.9823091235267375e-05, + "loss": 5.658, + "step": 6372 + }, + { + "epoch": 0.037902036349795415, + "grad_norm": 2.0502147674560547, + "learning_rate": 4.982303576089097e-05, + "loss": 5.9114, + "step": 6373 + }, + { + "epoch": 0.03790798363307641, + "grad_norm": 2.1566948890686035, + "learning_rate": 4.982298027784909e-05, + "loss": 5.6932, + "step": 6374 + }, + { + "epoch": 0.037913930916357405, + "grad_norm": 2.394083261489868, + "learning_rate": 4.9822924786141774e-05, + "loss": 6.3041, + "step": 6375 + }, + { + "epoch": 0.03791987819963841, + "grad_norm": 2.545910120010376, + "learning_rate": 4.9822869285769024e-05, + "loss": 6.2125, + "step": 6376 + }, + { + "epoch": 0.0379258254829194, + "grad_norm": 2.271461248397827, + "learning_rate": 4.9822813776730875e-05, + "loss": 6.2322, + "step": 6377 + }, + { + "epoch": 0.0379317727662004, + "grad_norm": 2.3840630054473877, + "learning_rate": 4.9822758259027336e-05, + "loss": 6.0167, + "step": 6378 + }, + { + "epoch": 0.0379377200494814, + "grad_norm": 2.600618600845337, + "learning_rate": 4.9822702732658426e-05, + "loss": 5.6722, + "step": 6379 + }, + { + "epoch": 0.037943667332762394, + "grad_norm": 2.0911965370178223, + "learning_rate": 4.982264719762417e-05, + "loss": 5.579, + "step": 6380 + }, + { + "epoch": 0.03794961461604339, + "grad_norm": 2.015505075454712, + "learning_rate": 4.9822591653924575e-05, + "loss": 5.9747, + "step": 6381 + }, + { + "epoch": 0.03795556189932439, + "grad_norm": 2.237262010574341, + "learning_rate": 4.982253610155968e-05, + "loss": 6.3792, + "step": 6382 + }, + { + "epoch": 0.03796150918260539, + "grad_norm": 2.1448137760162354, + "learning_rate": 4.982248054052949e-05, + "loss": 6.1049, + "step": 6383 + }, + { + "epoch": 0.03796745646588638, + "grad_norm": 2.2597758769989014, + "learning_rate": 4.9822424970834034e-05, + "loss": 5.8428, + "step": 6384 + }, + { + "epoch": 0.037973403749167384, + "grad_norm": 1.9935969114303589, + "learning_rate": 4.982236939247332e-05, + "loss": 6.0032, + "step": 6385 + }, + { + "epoch": 0.03797935103244838, + "grad_norm": 2.506916046142578, + "learning_rate": 4.982231380544737e-05, + "loss": 5.9221, + "step": 6386 + }, + { + "epoch": 0.037985298315729374, + "grad_norm": 2.083393096923828, + "learning_rate": 4.9822258209756214e-05, + "loss": 5.8862, + "step": 6387 + }, + { + "epoch": 0.03799124559901037, + "grad_norm": 2.631091594696045, + "learning_rate": 4.982220260539987e-05, + "loss": 5.6593, + "step": 6388 + }, + { + "epoch": 0.03799719288229137, + "grad_norm": 2.5732531547546387, + "learning_rate": 4.982214699237834e-05, + "loss": 5.5084, + "step": 6389 + }, + { + "epoch": 0.038003140165572366, + "grad_norm": 2.7797791957855225, + "learning_rate": 4.982209137069166e-05, + "loss": 5.6792, + "step": 6390 + }, + { + "epoch": 0.03800908744885336, + "grad_norm": 2.2800772190093994, + "learning_rate": 4.982203574033984e-05, + "loss": 5.6299, + "step": 6391 + }, + { + "epoch": 0.03801503473213436, + "grad_norm": 2.4182863235473633, + "learning_rate": 4.9821980101322905e-05, + "loss": 5.71, + "step": 6392 + }, + { + "epoch": 0.03802098201541536, + "grad_norm": 2.2968835830688477, + "learning_rate": 4.982192445364088e-05, + "loss": 5.6112, + "step": 6393 + }, + { + "epoch": 0.03802692929869635, + "grad_norm": 2.3713324069976807, + "learning_rate": 4.982186879729377e-05, + "loss": 5.423, + "step": 6394 + }, + { + "epoch": 0.038032876581977355, + "grad_norm": 2.745352268218994, + "learning_rate": 4.98218131322816e-05, + "loss": 5.5145, + "step": 6395 + }, + { + "epoch": 0.03803882386525835, + "grad_norm": 2.755211353302002, + "learning_rate": 4.98217574586044e-05, + "loss": 5.4399, + "step": 6396 + }, + { + "epoch": 0.038044771148539346, + "grad_norm": 2.5452096462249756, + "learning_rate": 4.982170177626217e-05, + "loss": 5.5691, + "step": 6397 + }, + { + "epoch": 0.03805071843182035, + "grad_norm": 2.6195876598358154, + "learning_rate": 4.9821646085254954e-05, + "loss": 5.4512, + "step": 6398 + }, + { + "epoch": 0.03805666571510134, + "grad_norm": 2.4931671619415283, + "learning_rate": 4.982159038558275e-05, + "loss": 6.0505, + "step": 6399 + }, + { + "epoch": 0.03806261299838234, + "grad_norm": 2.45062255859375, + "learning_rate": 4.982153467724558e-05, + "loss": 6.2367, + "step": 6400 + }, + { + "epoch": 0.03806856028166333, + "grad_norm": 2.688624620437622, + "learning_rate": 4.982147896024348e-05, + "loss": 6.0522, + "step": 6401 + }, + { + "epoch": 0.038074507564944335, + "grad_norm": 2.421660900115967, + "learning_rate": 4.982142323457645e-05, + "loss": 5.8166, + "step": 6402 + }, + { + "epoch": 0.03808045484822533, + "grad_norm": 2.594134569168091, + "learning_rate": 4.982136750024452e-05, + "loss": 5.5476, + "step": 6403 + }, + { + "epoch": 0.038086402131506325, + "grad_norm": 2.4492971897125244, + "learning_rate": 4.982131175724771e-05, + "loss": 5.2302, + "step": 6404 + }, + { + "epoch": 0.03809234941478733, + "grad_norm": 2.4200360774993896, + "learning_rate": 4.9821256005586036e-05, + "loss": 6.1404, + "step": 6405 + }, + { + "epoch": 0.03809829669806832, + "grad_norm": 2.1949775218963623, + "learning_rate": 4.982120024525951e-05, + "loss": 5.9589, + "step": 6406 + }, + { + "epoch": 0.03810424398134932, + "grad_norm": 2.3570375442504883, + "learning_rate": 4.9821144476268164e-05, + "loss": 5.9022, + "step": 6407 + }, + { + "epoch": 0.03811019126463032, + "grad_norm": 2.16460919380188, + "learning_rate": 4.9821088698612016e-05, + "loss": 5.8535, + "step": 6408 + }, + { + "epoch": 0.038116138547911314, + "grad_norm": 1.8189443349838257, + "learning_rate": 4.982103291229108e-05, + "loss": 5.9345, + "step": 6409 + }, + { + "epoch": 0.03812208583119231, + "grad_norm": 2.553919792175293, + "learning_rate": 4.9820977117305376e-05, + "loss": 5.31, + "step": 6410 + }, + { + "epoch": 0.03812803311447331, + "grad_norm": 2.8085403442382812, + "learning_rate": 4.982092131365493e-05, + "loss": 4.9902, + "step": 6411 + }, + { + "epoch": 0.03813398039775431, + "grad_norm": 2.3698999881744385, + "learning_rate": 4.982086550133976e-05, + "loss": 5.4982, + "step": 6412 + }, + { + "epoch": 0.0381399276810353, + "grad_norm": 1.996026873588562, + "learning_rate": 4.9820809680359876e-05, + "loss": 5.6556, + "step": 6413 + }, + { + "epoch": 0.038145874964316304, + "grad_norm": 2.0816900730133057, + "learning_rate": 4.9820753850715305e-05, + "loss": 5.8823, + "step": 6414 + }, + { + "epoch": 0.0381518222475973, + "grad_norm": 2.282745122909546, + "learning_rate": 4.982069801240606e-05, + "loss": 5.1641, + "step": 6415 + }, + { + "epoch": 0.038157769530878294, + "grad_norm": 2.043991804122925, + "learning_rate": 4.982064216543217e-05, + "loss": 5.7569, + "step": 6416 + }, + { + "epoch": 0.03816371681415929, + "grad_norm": 2.086071014404297, + "learning_rate": 4.982058630979365e-05, + "loss": 5.9586, + "step": 6417 + }, + { + "epoch": 0.03816966409744029, + "grad_norm": 2.295060873031616, + "learning_rate": 4.9820530445490525e-05, + "loss": 5.3733, + "step": 6418 + }, + { + "epoch": 0.038175611380721286, + "grad_norm": 2.512267827987671, + "learning_rate": 4.98204745725228e-05, + "loss": 5.0399, + "step": 6419 + }, + { + "epoch": 0.03818155866400228, + "grad_norm": 2.5434467792510986, + "learning_rate": 4.982041869089051e-05, + "loss": 4.7907, + "step": 6420 + }, + { + "epoch": 0.03818750594728328, + "grad_norm": 2.4192142486572266, + "learning_rate": 4.9820362800593666e-05, + "loss": 4.9116, + "step": 6421 + }, + { + "epoch": 0.03819345323056428, + "grad_norm": 2.867542028427124, + "learning_rate": 4.9820306901632296e-05, + "loss": 5.9905, + "step": 6422 + }, + { + "epoch": 0.03819940051384527, + "grad_norm": 2.3099327087402344, + "learning_rate": 4.982025099400641e-05, + "loss": 5.9319, + "step": 6423 + }, + { + "epoch": 0.038205347797126275, + "grad_norm": 2.28169584274292, + "learning_rate": 4.9820195077716026e-05, + "loss": 6.2533, + "step": 6424 + }, + { + "epoch": 0.03821129508040727, + "grad_norm": 2.1065595149993896, + "learning_rate": 4.9820139152761167e-05, + "loss": 5.7123, + "step": 6425 + }, + { + "epoch": 0.038217242363688265, + "grad_norm": 2.0210213661193848, + "learning_rate": 4.9820083219141865e-05, + "loss": 5.7758, + "step": 6426 + }, + { + "epoch": 0.03822318964696927, + "grad_norm": 1.6545369625091553, + "learning_rate": 4.9820027276858114e-05, + "loss": 5.6792, + "step": 6427 + }, + { + "epoch": 0.03822913693025026, + "grad_norm": 2.177621841430664, + "learning_rate": 4.981997132590996e-05, + "loss": 6.0167, + "step": 6428 + }, + { + "epoch": 0.03823508421353126, + "grad_norm": 2.3910553455352783, + "learning_rate": 4.981991536629741e-05, + "loss": 6.1161, + "step": 6429 + }, + { + "epoch": 0.03824103149681225, + "grad_norm": 2.4915859699249268, + "learning_rate": 4.981985939802047e-05, + "loss": 5.6449, + "step": 6430 + }, + { + "epoch": 0.038246978780093255, + "grad_norm": 2.0343215465545654, + "learning_rate": 4.981980342107919e-05, + "loss": 5.967, + "step": 6431 + }, + { + "epoch": 0.03825292606337425, + "grad_norm": 1.8326199054718018, + "learning_rate": 4.9819747435473565e-05, + "loss": 5.9183, + "step": 6432 + }, + { + "epoch": 0.038258873346655245, + "grad_norm": 2.1482350826263428, + "learning_rate": 4.981969144120362e-05, + "loss": 5.794, + "step": 6433 + }, + { + "epoch": 0.03826482062993625, + "grad_norm": 2.346355438232422, + "learning_rate": 4.9819635438269384e-05, + "loss": 5.6775, + "step": 6434 + }, + { + "epoch": 0.03827076791321724, + "grad_norm": 2.252150774002075, + "learning_rate": 4.981957942667087e-05, + "loss": 5.9383, + "step": 6435 + }, + { + "epoch": 0.03827671519649824, + "grad_norm": 2.1851654052734375, + "learning_rate": 4.981952340640809e-05, + "loss": 6.0555, + "step": 6436 + }, + { + "epoch": 0.03828266247977924, + "grad_norm": 2.0609381198883057, + "learning_rate": 4.9819467377481076e-05, + "loss": 6.3209, + "step": 6437 + }, + { + "epoch": 0.038288609763060234, + "grad_norm": 2.4882800579071045, + "learning_rate": 4.981941133988984e-05, + "loss": 6.2411, + "step": 6438 + }, + { + "epoch": 0.03829455704634123, + "grad_norm": 1.8794118165969849, + "learning_rate": 4.981935529363441e-05, + "loss": 5.5696, + "step": 6439 + }, + { + "epoch": 0.03830050432962223, + "grad_norm": 2.542656660079956, + "learning_rate": 4.981929923871479e-05, + "loss": 5.8106, + "step": 6440 + }, + { + "epoch": 0.038306451612903226, + "grad_norm": 2.3871288299560547, + "learning_rate": 4.981924317513101e-05, + "loss": 5.6354, + "step": 6441 + }, + { + "epoch": 0.03831239889618422, + "grad_norm": 2.4628939628601074, + "learning_rate": 4.981918710288309e-05, + "loss": 5.9695, + "step": 6442 + }, + { + "epoch": 0.038318346179465224, + "grad_norm": 2.908543586730957, + "learning_rate": 4.9819131021971056e-05, + "loss": 5.2742, + "step": 6443 + }, + { + "epoch": 0.03832429346274622, + "grad_norm": 3.353813886642456, + "learning_rate": 4.9819074932394916e-05, + "loss": 5.3823, + "step": 6444 + }, + { + "epoch": 0.038330240746027214, + "grad_norm": 2.5253870487213135, + "learning_rate": 4.981901883415469e-05, + "loss": 5.7, + "step": 6445 + }, + { + "epoch": 0.03833618802930821, + "grad_norm": 2.3375632762908936, + "learning_rate": 4.98189627272504e-05, + "loss": 5.2862, + "step": 6446 + }, + { + "epoch": 0.03834213531258921, + "grad_norm": 2.534599542617798, + "learning_rate": 4.981890661168207e-05, + "loss": 5.3961, + "step": 6447 + }, + { + "epoch": 0.038348082595870206, + "grad_norm": 2.383511781692505, + "learning_rate": 4.9818850487449716e-05, + "loss": 6.4658, + "step": 6448 + }, + { + "epoch": 0.0383540298791512, + "grad_norm": 2.2824161052703857, + "learning_rate": 4.981879435455336e-05, + "loss": 5.5221, + "step": 6449 + }, + { + "epoch": 0.0383599771624322, + "grad_norm": 2.355271100997925, + "learning_rate": 4.981873821299301e-05, + "loss": 5.5054, + "step": 6450 + }, + { + "epoch": 0.0383659244457132, + "grad_norm": 2.0071253776550293, + "learning_rate": 4.981868206276871e-05, + "loss": 5.5911, + "step": 6451 + }, + { + "epoch": 0.03837187172899419, + "grad_norm": 2.2770705223083496, + "learning_rate": 4.9818625903880445e-05, + "loss": 5.8978, + "step": 6452 + }, + { + "epoch": 0.038377819012275195, + "grad_norm": 2.2425332069396973, + "learning_rate": 4.981856973632827e-05, + "loss": 6.3189, + "step": 6453 + }, + { + "epoch": 0.03838376629555619, + "grad_norm": 2.300560235977173, + "learning_rate": 4.981851356011218e-05, + "loss": 5.745, + "step": 6454 + }, + { + "epoch": 0.038389713578837185, + "grad_norm": 2.4516983032226562, + "learning_rate": 4.981845737523221e-05, + "loss": 5.8978, + "step": 6455 + }, + { + "epoch": 0.03839566086211819, + "grad_norm": 2.3463354110717773, + "learning_rate": 4.981840118168837e-05, + "loss": 5.668, + "step": 6456 + }, + { + "epoch": 0.03840160814539918, + "grad_norm": 2.623608112335205, + "learning_rate": 4.981834497948068e-05, + "loss": 5.471, + "step": 6457 + }, + { + "epoch": 0.03840755542868018, + "grad_norm": 2.441089391708374, + "learning_rate": 4.9818288768609166e-05, + "loss": 5.0986, + "step": 6458 + }, + { + "epoch": 0.03841350271196117, + "grad_norm": 2.597635507583618, + "learning_rate": 4.981823254907384e-05, + "loss": 5.1046, + "step": 6459 + }, + { + "epoch": 0.038419449995242175, + "grad_norm": 2.344855785369873, + "learning_rate": 4.9818176320874727e-05, + "loss": 5.8878, + "step": 6460 + }, + { + "epoch": 0.03842539727852317, + "grad_norm": 2.2569222450256348, + "learning_rate": 4.981812008401184e-05, + "loss": 5.342, + "step": 6461 + }, + { + "epoch": 0.038431344561804165, + "grad_norm": 2.276780843734741, + "learning_rate": 4.981806383848522e-05, + "loss": 5.566, + "step": 6462 + }, + { + "epoch": 0.03843729184508517, + "grad_norm": 2.1354174613952637, + "learning_rate": 4.9818007584294856e-05, + "loss": 5.8678, + "step": 6463 + }, + { + "epoch": 0.03844323912836616, + "grad_norm": 2.164092779159546, + "learning_rate": 4.981795132144078e-05, + "loss": 5.7937, + "step": 6464 + }, + { + "epoch": 0.03844918641164716, + "grad_norm": 2.3034324645996094, + "learning_rate": 4.981789504992303e-05, + "loss": 5.843, + "step": 6465 + }, + { + "epoch": 0.03845513369492816, + "grad_norm": 1.9616999626159668, + "learning_rate": 4.9817838769741584e-05, + "loss": 6.0563, + "step": 6466 + }, + { + "epoch": 0.038461080978209154, + "grad_norm": 2.2784626483917236, + "learning_rate": 4.9817782480896505e-05, + "loss": 6.4152, + "step": 6467 + }, + { + "epoch": 0.03846702826149015, + "grad_norm": 1.8581526279449463, + "learning_rate": 4.981772618338779e-05, + "loss": 5.9833, + "step": 6468 + }, + { + "epoch": 0.03847297554477115, + "grad_norm": 2.2493395805358887, + "learning_rate": 4.9817669877215466e-05, + "loss": 6.2985, + "step": 6469 + }, + { + "epoch": 0.038478922828052146, + "grad_norm": 2.289125919342041, + "learning_rate": 4.981761356237955e-05, + "loss": 5.8555, + "step": 6470 + }, + { + "epoch": 0.03848487011133314, + "grad_norm": 2.11012601852417, + "learning_rate": 4.981755723888006e-05, + "loss": 6.6137, + "step": 6471 + }, + { + "epoch": 0.038490817394614144, + "grad_norm": 2.1793103218078613, + "learning_rate": 4.981750090671702e-05, + "loss": 6.0117, + "step": 6472 + }, + { + "epoch": 0.03849676467789514, + "grad_norm": 2.1857750415802, + "learning_rate": 4.9817444565890436e-05, + "loss": 5.9877, + "step": 6473 + }, + { + "epoch": 0.038502711961176134, + "grad_norm": 1.7430874109268188, + "learning_rate": 4.981738821640035e-05, + "loss": 5.829, + "step": 6474 + }, + { + "epoch": 0.03850865924445713, + "grad_norm": 1.8017771244049072, + "learning_rate": 4.981733185824676e-05, + "loss": 6.3853, + "step": 6475 + }, + { + "epoch": 0.03851460652773813, + "grad_norm": 2.1420724391937256, + "learning_rate": 4.9817275491429705e-05, + "loss": 5.982, + "step": 6476 + }, + { + "epoch": 0.038520553811019126, + "grad_norm": 2.441521167755127, + "learning_rate": 4.9817219115949195e-05, + "loss": 6.1159, + "step": 6477 + }, + { + "epoch": 0.03852650109430012, + "grad_norm": 2.158682346343994, + "learning_rate": 4.9817162731805246e-05, + "loss": 6.1306, + "step": 6478 + }, + { + "epoch": 0.03853244837758112, + "grad_norm": 2.154538869857788, + "learning_rate": 4.9817106338997884e-05, + "loss": 6.0745, + "step": 6479 + }, + { + "epoch": 0.03853839566086212, + "grad_norm": 2.077674388885498, + "learning_rate": 4.981704993752713e-05, + "loss": 6.2171, + "step": 6480 + }, + { + "epoch": 0.03854434294414311, + "grad_norm": 2.181500196456909, + "learning_rate": 4.981699352739299e-05, + "loss": 6.228, + "step": 6481 + }, + { + "epoch": 0.038550290227424115, + "grad_norm": 2.678189992904663, + "learning_rate": 4.98169371085955e-05, + "loss": 5.965, + "step": 6482 + }, + { + "epoch": 0.03855623751070511, + "grad_norm": 2.713480234146118, + "learning_rate": 4.981688068113467e-05, + "loss": 5.9078, + "step": 6483 + }, + { + "epoch": 0.038562184793986105, + "grad_norm": 2.4872853755950928, + "learning_rate": 4.981682424501053e-05, + "loss": 5.7525, + "step": 6484 + }, + { + "epoch": 0.03856813207726711, + "grad_norm": 2.274711847305298, + "learning_rate": 4.98167678002231e-05, + "loss": 5.9193, + "step": 6485 + }, + { + "epoch": 0.0385740793605481, + "grad_norm": 2.4730162620544434, + "learning_rate": 4.981671134677238e-05, + "loss": 6.2961, + "step": 6486 + }, + { + "epoch": 0.0385800266438291, + "grad_norm": 1.7856062650680542, + "learning_rate": 4.9816654884658396e-05, + "loss": 5.9005, + "step": 6487 + }, + { + "epoch": 0.03858597392711009, + "grad_norm": 1.8812140226364136, + "learning_rate": 4.981659841388119e-05, + "loss": 5.9428, + "step": 6488 + }, + { + "epoch": 0.038591921210391095, + "grad_norm": 1.9963254928588867, + "learning_rate": 4.9816541934440756e-05, + "loss": 6.0136, + "step": 6489 + }, + { + "epoch": 0.03859786849367209, + "grad_norm": 2.741892099380493, + "learning_rate": 4.981648544633713e-05, + "loss": 6.5065, + "step": 6490 + }, + { + "epoch": 0.038603815776953085, + "grad_norm": 2.226672410964966, + "learning_rate": 4.981642894957032e-05, + "loss": 5.9705, + "step": 6491 + }, + { + "epoch": 0.03860976306023409, + "grad_norm": 2.015429973602295, + "learning_rate": 4.981637244414036e-05, + "loss": 6.1418, + "step": 6492 + }, + { + "epoch": 0.03861571034351508, + "grad_norm": 2.032304286956787, + "learning_rate": 4.981631593004725e-05, + "loss": 6.2104, + "step": 6493 + }, + { + "epoch": 0.03862165762679608, + "grad_norm": 2.0174217224121094, + "learning_rate": 4.981625940729102e-05, + "loss": 5.9861, + "step": 6494 + }, + { + "epoch": 0.03862760491007708, + "grad_norm": 1.9466323852539062, + "learning_rate": 4.98162028758717e-05, + "loss": 6.0958, + "step": 6495 + }, + { + "epoch": 0.038633552193358074, + "grad_norm": 1.6796106100082397, + "learning_rate": 4.9816146335789296e-05, + "loss": 6.0708, + "step": 6496 + }, + { + "epoch": 0.03863949947663907, + "grad_norm": 2.0496580600738525, + "learning_rate": 4.9816089787043826e-05, + "loss": 6.0137, + "step": 6497 + }, + { + "epoch": 0.03864544675992007, + "grad_norm": 2.5402488708496094, + "learning_rate": 4.9816033229635324e-05, + "loss": 6.1389, + "step": 6498 + }, + { + "epoch": 0.038651394043201066, + "grad_norm": 2.2701938152313232, + "learning_rate": 4.9815976663563795e-05, + "loss": 6.1277, + "step": 6499 + }, + { + "epoch": 0.03865734132648206, + "grad_norm": 2.328554630279541, + "learning_rate": 4.9815920088829273e-05, + "loss": 6.0402, + "step": 6500 + }, + { + "epoch": 0.038663288609763063, + "grad_norm": 2.1817965507507324, + "learning_rate": 4.981586350543176e-05, + "loss": 6.2732, + "step": 6501 + }, + { + "epoch": 0.03866923589304406, + "grad_norm": 2.4273757934570312, + "learning_rate": 4.981580691337129e-05, + "loss": 6.1842, + "step": 6502 + }, + { + "epoch": 0.038675183176325054, + "grad_norm": 2.1365530490875244, + "learning_rate": 4.981575031264787e-05, + "loss": 6.1527, + "step": 6503 + }, + { + "epoch": 0.03868113045960605, + "grad_norm": 2.2198991775512695, + "learning_rate": 4.981569370326154e-05, + "loss": 6.0841, + "step": 6504 + }, + { + "epoch": 0.03868707774288705, + "grad_norm": 2.0078141689300537, + "learning_rate": 4.98156370852123e-05, + "loss": 6.0401, + "step": 6505 + }, + { + "epoch": 0.038693025026168046, + "grad_norm": 2.0243566036224365, + "learning_rate": 4.9815580458500184e-05, + "loss": 5.9111, + "step": 6506 + }, + { + "epoch": 0.03869897230944904, + "grad_norm": 2.3084707260131836, + "learning_rate": 4.98155238231252e-05, + "loss": 5.9865, + "step": 6507 + }, + { + "epoch": 0.03870491959273004, + "grad_norm": 1.8110517263412476, + "learning_rate": 4.981546717908738e-05, + "loss": 5.9132, + "step": 6508 + }, + { + "epoch": 0.03871086687601104, + "grad_norm": 2.2639706134796143, + "learning_rate": 4.981541052638673e-05, + "loss": 5.8195, + "step": 6509 + }, + { + "epoch": 0.03871681415929203, + "grad_norm": 2.2684152126312256, + "learning_rate": 4.981535386502327e-05, + "loss": 6.4894, + "step": 6510 + }, + { + "epoch": 0.038722761442573035, + "grad_norm": 2.363118886947632, + "learning_rate": 4.981529719499704e-05, + "loss": 6.1888, + "step": 6511 + }, + { + "epoch": 0.03872870872585403, + "grad_norm": 2.2158865928649902, + "learning_rate": 4.9815240516308045e-05, + "loss": 6.3361, + "step": 6512 + }, + { + "epoch": 0.038734656009135025, + "grad_norm": 2.096928834915161, + "learning_rate": 4.98151838289563e-05, + "loss": 5.8554, + "step": 6513 + }, + { + "epoch": 0.03874060329241603, + "grad_norm": 2.2228331565856934, + "learning_rate": 4.981512713294183e-05, + "loss": 5.9961, + "step": 6514 + }, + { + "epoch": 0.03874655057569702, + "grad_norm": 1.8646903038024902, + "learning_rate": 4.981507042826466e-05, + "loss": 6.1471, + "step": 6515 + }, + { + "epoch": 0.03875249785897802, + "grad_norm": 2.227267265319824, + "learning_rate": 4.98150137149248e-05, + "loss": 5.9655, + "step": 6516 + }, + { + "epoch": 0.03875844514225902, + "grad_norm": 2.6884701251983643, + "learning_rate": 4.981495699292228e-05, + "loss": 5.7958, + "step": 6517 + }, + { + "epoch": 0.038764392425540015, + "grad_norm": 2.953523635864258, + "learning_rate": 4.981490026225711e-05, + "loss": 5.8305, + "step": 6518 + }, + { + "epoch": 0.03877033970882101, + "grad_norm": 2.5009984970092773, + "learning_rate": 4.981484352292932e-05, + "loss": 5.7838, + "step": 6519 + }, + { + "epoch": 0.038776286992102005, + "grad_norm": 2.2291715145111084, + "learning_rate": 4.981478677493892e-05, + "loss": 5.7622, + "step": 6520 + }, + { + "epoch": 0.03878223427538301, + "grad_norm": 2.1492466926574707, + "learning_rate": 4.9814730018285935e-05, + "loss": 5.5379, + "step": 6521 + }, + { + "epoch": 0.038788181558664, + "grad_norm": 1.8914062976837158, + "learning_rate": 4.981467325297039e-05, + "loss": 5.8368, + "step": 6522 + }, + { + "epoch": 0.038794128841945, + "grad_norm": 2.301670789718628, + "learning_rate": 4.981461647899229e-05, + "loss": 5.9019, + "step": 6523 + }, + { + "epoch": 0.038800076125226, + "grad_norm": 2.2850520610809326, + "learning_rate": 4.981455969635167e-05, + "loss": 5.6616, + "step": 6524 + }, + { + "epoch": 0.038806023408506994, + "grad_norm": 2.4155313968658447, + "learning_rate": 4.9814502905048546e-05, + "loss": 5.7842, + "step": 6525 + }, + { + "epoch": 0.03881197069178799, + "grad_norm": 2.0731799602508545, + "learning_rate": 4.981444610508293e-05, + "loss": 6.084, + "step": 6526 + }, + { + "epoch": 0.03881791797506899, + "grad_norm": 2.990232229232788, + "learning_rate": 4.981438929645484e-05, + "loss": 5.2556, + "step": 6527 + }, + { + "epoch": 0.038823865258349986, + "grad_norm": 3.0814263820648193, + "learning_rate": 4.981433247916432e-05, + "loss": 5.1895, + "step": 6528 + }, + { + "epoch": 0.03882981254163098, + "grad_norm": 3.197000503540039, + "learning_rate": 4.9814275653211365e-05, + "loss": 4.9539, + "step": 6529 + }, + { + "epoch": 0.03883575982491198, + "grad_norm": 3.062098979949951, + "learning_rate": 4.9814218818596e-05, + "loss": 4.8417, + "step": 6530 + }, + { + "epoch": 0.03884170710819298, + "grad_norm": 3.092667579650879, + "learning_rate": 4.981416197531825e-05, + "loss": 5.0479, + "step": 6531 + }, + { + "epoch": 0.038847654391473974, + "grad_norm": 3.00508713722229, + "learning_rate": 4.981410512337813e-05, + "loss": 5.864, + "step": 6532 + }, + { + "epoch": 0.03885360167475497, + "grad_norm": 3.3760926723480225, + "learning_rate": 4.981404826277567e-05, + "loss": 6.5745, + "step": 6533 + }, + { + "epoch": 0.03885954895803597, + "grad_norm": 2.6170921325683594, + "learning_rate": 4.981399139351087e-05, + "loss": 5.7959, + "step": 6534 + }, + { + "epoch": 0.038865496241316966, + "grad_norm": 2.9855849742889404, + "learning_rate": 4.981393451558377e-05, + "loss": 4.9118, + "step": 6535 + }, + { + "epoch": 0.03887144352459796, + "grad_norm": 2.885373830795288, + "learning_rate": 4.981387762899438e-05, + "loss": 4.8342, + "step": 6536 + }, + { + "epoch": 0.03887739080787896, + "grad_norm": 2.6936960220336914, + "learning_rate": 4.981382073374272e-05, + "loss": 4.7323, + "step": 6537 + }, + { + "epoch": 0.03888333809115996, + "grad_norm": 2.7214853763580322, + "learning_rate": 4.981376382982882e-05, + "loss": 5.5414, + "step": 6538 + }, + { + "epoch": 0.03888928537444095, + "grad_norm": 2.449828863143921, + "learning_rate": 4.981370691725269e-05, + "loss": 5.6385, + "step": 6539 + }, + { + "epoch": 0.038895232657721955, + "grad_norm": 2.551046133041382, + "learning_rate": 4.981364999601434e-05, + "loss": 5.4699, + "step": 6540 + }, + { + "epoch": 0.03890117994100295, + "grad_norm": 2.1208136081695557, + "learning_rate": 4.981359306611381e-05, + "loss": 5.6674, + "step": 6541 + }, + { + "epoch": 0.038907127224283945, + "grad_norm": 2.4039392471313477, + "learning_rate": 4.9813536127551105e-05, + "loss": 6.1872, + "step": 6542 + }, + { + "epoch": 0.03891307450756495, + "grad_norm": 2.0119946002960205, + "learning_rate": 4.9813479180326256e-05, + "loss": 6.0917, + "step": 6543 + }, + { + "epoch": 0.03891902179084594, + "grad_norm": 3.2959303855895996, + "learning_rate": 4.9813422224439275e-05, + "loss": 5.5646, + "step": 6544 + }, + { + "epoch": 0.03892496907412694, + "grad_norm": 2.9011316299438477, + "learning_rate": 4.981336525989019e-05, + "loss": 5.5324, + "step": 6545 + }, + { + "epoch": 0.03893091635740794, + "grad_norm": 2.2984118461608887, + "learning_rate": 4.981330828667901e-05, + "loss": 5.4961, + "step": 6546 + }, + { + "epoch": 0.038936863640688935, + "grad_norm": 2.1745059490203857, + "learning_rate": 4.981325130480576e-05, + "loss": 5.6631, + "step": 6547 + }, + { + "epoch": 0.03894281092396993, + "grad_norm": 2.3001794815063477, + "learning_rate": 4.981319431427046e-05, + "loss": 5.5897, + "step": 6548 + }, + { + "epoch": 0.038948758207250925, + "grad_norm": 2.329446315765381, + "learning_rate": 4.9813137315073136e-05, + "loss": 5.4599, + "step": 6549 + }, + { + "epoch": 0.03895470549053193, + "grad_norm": 2.4700307846069336, + "learning_rate": 4.98130803072138e-05, + "loss": 5.2788, + "step": 6550 + }, + { + "epoch": 0.03896065277381292, + "grad_norm": 2.309767484664917, + "learning_rate": 4.9813023290692467e-05, + "loss": 5.3828, + "step": 6551 + }, + { + "epoch": 0.03896660005709392, + "grad_norm": 2.1923089027404785, + "learning_rate": 4.981296626550917e-05, + "loss": 5.225, + "step": 6552 + }, + { + "epoch": 0.03897254734037492, + "grad_norm": 2.424954652786255, + "learning_rate": 4.981290923166392e-05, + "loss": 5.2007, + "step": 6553 + }, + { + "epoch": 0.038978494623655914, + "grad_norm": 2.53446102142334, + "learning_rate": 4.981285218915674e-05, + "loss": 5.142, + "step": 6554 + }, + { + "epoch": 0.03898444190693691, + "grad_norm": 2.492788791656494, + "learning_rate": 4.9812795137987655e-05, + "loss": 5.5755, + "step": 6555 + }, + { + "epoch": 0.03899038919021791, + "grad_norm": 2.8081278800964355, + "learning_rate": 4.9812738078156674e-05, + "loss": 4.9815, + "step": 6556 + }, + { + "epoch": 0.038996336473498906, + "grad_norm": 2.535109758377075, + "learning_rate": 4.981268100966383e-05, + "loss": 5.3678, + "step": 6557 + }, + { + "epoch": 0.0390022837567799, + "grad_norm": 2.36004900932312, + "learning_rate": 4.981262393250913e-05, + "loss": 5.0422, + "step": 6558 + }, + { + "epoch": 0.0390082310400609, + "grad_norm": 2.2315657138824463, + "learning_rate": 4.98125668466926e-05, + "loss": 5.0345, + "step": 6559 + }, + { + "epoch": 0.0390141783233419, + "grad_norm": 2.293947696685791, + "learning_rate": 4.981250975221425e-05, + "loss": 4.9308, + "step": 6560 + }, + { + "epoch": 0.039020125606622894, + "grad_norm": 2.239915132522583, + "learning_rate": 4.9812452649074124e-05, + "loss": 5.3504, + "step": 6561 + }, + { + "epoch": 0.03902607288990389, + "grad_norm": 1.8740140199661255, + "learning_rate": 4.981239553727222e-05, + "loss": 5.9432, + "step": 6562 + }, + { + "epoch": 0.03903202017318489, + "grad_norm": 1.7221744060516357, + "learning_rate": 4.981233841680857e-05, + "loss": 5.8387, + "step": 6563 + }, + { + "epoch": 0.039037967456465886, + "grad_norm": 1.9648221731185913, + "learning_rate": 4.981228128768318e-05, + "loss": 5.7836, + "step": 6564 + }, + { + "epoch": 0.03904391473974688, + "grad_norm": 1.7790826559066772, + "learning_rate": 4.981222414989608e-05, + "loss": 5.842, + "step": 6565 + }, + { + "epoch": 0.03904986202302788, + "grad_norm": 2.039483070373535, + "learning_rate": 4.9812167003447296e-05, + "loss": 5.6509, + "step": 6566 + }, + { + "epoch": 0.03905580930630888, + "grad_norm": 2.1241865158081055, + "learning_rate": 4.981210984833684e-05, + "loss": 5.5626, + "step": 6567 + }, + { + "epoch": 0.03906175658958987, + "grad_norm": 2.1290524005889893, + "learning_rate": 4.981205268456473e-05, + "loss": 5.5114, + "step": 6568 + }, + { + "epoch": 0.039067703872870875, + "grad_norm": 2.181558132171631, + "learning_rate": 4.981199551213099e-05, + "loss": 5.5356, + "step": 6569 + }, + { + "epoch": 0.03907365115615187, + "grad_norm": 2.1696360111236572, + "learning_rate": 4.9811938331035635e-05, + "loss": 5.5684, + "step": 6570 + }, + { + "epoch": 0.039079598439432865, + "grad_norm": 1.8040674924850464, + "learning_rate": 4.98118811412787e-05, + "loss": 5.605, + "step": 6571 + }, + { + "epoch": 0.03908554572271387, + "grad_norm": 2.4475252628326416, + "learning_rate": 4.981182394286018e-05, + "loss": 6.4733, + "step": 6572 + }, + { + "epoch": 0.03909149300599486, + "grad_norm": 2.0800678730010986, + "learning_rate": 4.981176673578011e-05, + "loss": 5.5613, + "step": 6573 + }, + { + "epoch": 0.03909744028927586, + "grad_norm": 1.7632306814193726, + "learning_rate": 4.981170952003852e-05, + "loss": 5.5971, + "step": 6574 + }, + { + "epoch": 0.03910338757255686, + "grad_norm": 1.6671072244644165, + "learning_rate": 4.981165229563541e-05, + "loss": 5.4462, + "step": 6575 + }, + { + "epoch": 0.039109334855837855, + "grad_norm": 1.8972923755645752, + "learning_rate": 4.981159506257081e-05, + "loss": 5.7747, + "step": 6576 + }, + { + "epoch": 0.03911528213911885, + "grad_norm": 1.8343021869659424, + "learning_rate": 4.981153782084473e-05, + "loss": 5.7542, + "step": 6577 + }, + { + "epoch": 0.039121229422399845, + "grad_norm": 1.669877529144287, + "learning_rate": 4.9811480570457216e-05, + "loss": 5.6736, + "step": 6578 + }, + { + "epoch": 0.03912717670568085, + "grad_norm": 1.9555165767669678, + "learning_rate": 4.981142331140825e-05, + "loss": 5.2997, + "step": 6579 + }, + { + "epoch": 0.03913312398896184, + "grad_norm": 2.5131587982177734, + "learning_rate": 4.981136604369789e-05, + "loss": 5.2093, + "step": 6580 + }, + { + "epoch": 0.03913907127224284, + "grad_norm": 2.0637567043304443, + "learning_rate": 4.9811308767326134e-05, + "loss": 5.1671, + "step": 6581 + }, + { + "epoch": 0.03914501855552384, + "grad_norm": 2.140839099884033, + "learning_rate": 4.9811251482293e-05, + "loss": 5.3237, + "step": 6582 + }, + { + "epoch": 0.039150965838804834, + "grad_norm": 1.968489408493042, + "learning_rate": 4.981119418859852e-05, + "loss": 5.6015, + "step": 6583 + }, + { + "epoch": 0.03915691312208583, + "grad_norm": 1.873827338218689, + "learning_rate": 4.9811136886242705e-05, + "loss": 5.3316, + "step": 6584 + }, + { + "epoch": 0.03916286040536683, + "grad_norm": 1.9897359609603882, + "learning_rate": 4.981107957522558e-05, + "loss": 5.1548, + "step": 6585 + }, + { + "epoch": 0.039168807688647826, + "grad_norm": 2.004457950592041, + "learning_rate": 4.9811022255547165e-05, + "loss": 5.1977, + "step": 6586 + }, + { + "epoch": 0.03917475497192882, + "grad_norm": 2.1058437824249268, + "learning_rate": 4.9810964927207485e-05, + "loss": 5.0217, + "step": 6587 + }, + { + "epoch": 0.03918070225520982, + "grad_norm": 1.9846851825714111, + "learning_rate": 4.981090759020654e-05, + "loss": 5.1123, + "step": 6588 + }, + { + "epoch": 0.03918664953849082, + "grad_norm": 2.018026828765869, + "learning_rate": 4.981085024454437e-05, + "loss": 5.0516, + "step": 6589 + }, + { + "epoch": 0.039192596821771813, + "grad_norm": 1.7792260646820068, + "learning_rate": 4.9810792890220995e-05, + "loss": 5.5266, + "step": 6590 + }, + { + "epoch": 0.03919854410505281, + "grad_norm": 2.0855109691619873, + "learning_rate": 4.981073552723642e-05, + "loss": 5.5504, + "step": 6591 + }, + { + "epoch": 0.03920449138833381, + "grad_norm": 1.9998018741607666, + "learning_rate": 4.9810678155590676e-05, + "loss": 5.3447, + "step": 6592 + }, + { + "epoch": 0.039210438671614806, + "grad_norm": 2.332714557647705, + "learning_rate": 4.981062077528377e-05, + "loss": 5.6166, + "step": 6593 + }, + { + "epoch": 0.0392163859548958, + "grad_norm": 1.9647892713546753, + "learning_rate": 4.981056338631575e-05, + "loss": 5.0113, + "step": 6594 + }, + { + "epoch": 0.0392223332381768, + "grad_norm": 1.9961154460906982, + "learning_rate": 4.9810505988686604e-05, + "loss": 5.0143, + "step": 6595 + }, + { + "epoch": 0.0392282805214578, + "grad_norm": 1.9039133787155151, + "learning_rate": 4.981044858239637e-05, + "loss": 5.3602, + "step": 6596 + }, + { + "epoch": 0.03923422780473879, + "grad_norm": 1.9076604843139648, + "learning_rate": 4.981039116744507e-05, + "loss": 5.4165, + "step": 6597 + }, + { + "epoch": 0.039240175088019795, + "grad_norm": 1.6676216125488281, + "learning_rate": 4.981033374383272e-05, + "loss": 5.4018, + "step": 6598 + }, + { + "epoch": 0.03924612237130079, + "grad_norm": 1.7158783674240112, + "learning_rate": 4.981027631155933e-05, + "loss": 5.3233, + "step": 6599 + }, + { + "epoch": 0.039252069654581785, + "grad_norm": 1.6659481525421143, + "learning_rate": 4.9810218870624945e-05, + "loss": 5.4671, + "step": 6600 + }, + { + "epoch": 0.03925801693786279, + "grad_norm": 2.008171319961548, + "learning_rate": 4.981016142102956e-05, + "loss": 5.6424, + "step": 6601 + }, + { + "epoch": 0.03926396422114378, + "grad_norm": 2.213045835494995, + "learning_rate": 4.9810103962773204e-05, + "loss": 5.419, + "step": 6602 + }, + { + "epoch": 0.03926991150442478, + "grad_norm": 2.0159718990325928, + "learning_rate": 4.981004649585589e-05, + "loss": 5.4301, + "step": 6603 + }, + { + "epoch": 0.03927585878770578, + "grad_norm": 1.982701063156128, + "learning_rate": 4.9809989020277646e-05, + "loss": 5.6001, + "step": 6604 + }, + { + "epoch": 0.039281806070986774, + "grad_norm": 2.1933834552764893, + "learning_rate": 4.98099315360385e-05, + "loss": 5.6756, + "step": 6605 + }, + { + "epoch": 0.03928775335426777, + "grad_norm": 1.858798623085022, + "learning_rate": 4.980987404313846e-05, + "loss": 5.43, + "step": 6606 + }, + { + "epoch": 0.039293700637548765, + "grad_norm": 1.8233433961868286, + "learning_rate": 4.980981654157755e-05, + "loss": 5.4638, + "step": 6607 + }, + { + "epoch": 0.03929964792082977, + "grad_norm": 2.0368216037750244, + "learning_rate": 4.9809759031355784e-05, + "loss": 5.71, + "step": 6608 + }, + { + "epoch": 0.03930559520411076, + "grad_norm": 1.9923310279846191, + "learning_rate": 4.9809701512473196e-05, + "loss": 5.6443, + "step": 6609 + }, + { + "epoch": 0.03931154248739176, + "grad_norm": 2.391463279724121, + "learning_rate": 4.9809643984929785e-05, + "loss": 5.4701, + "step": 6610 + }, + { + "epoch": 0.03931748977067276, + "grad_norm": 1.8456658124923706, + "learning_rate": 4.98095864487256e-05, + "loss": 5.4346, + "step": 6611 + }, + { + "epoch": 0.039323437053953754, + "grad_norm": 1.7941107749938965, + "learning_rate": 4.980952890386063e-05, + "loss": 5.4198, + "step": 6612 + }, + { + "epoch": 0.03932938433723475, + "grad_norm": 1.8455369472503662, + "learning_rate": 4.980947135033492e-05, + "loss": 5.3915, + "step": 6613 + }, + { + "epoch": 0.03933533162051575, + "grad_norm": 1.8710846900939941, + "learning_rate": 4.980941378814847e-05, + "loss": 5.2744, + "step": 6614 + }, + { + "epoch": 0.039341278903796746, + "grad_norm": 2.203129768371582, + "learning_rate": 4.980935621730132e-05, + "loss": 5.4409, + "step": 6615 + }, + { + "epoch": 0.03934722618707774, + "grad_norm": 1.8944141864776611, + "learning_rate": 4.980929863779348e-05, + "loss": 5.4661, + "step": 6616 + }, + { + "epoch": 0.03935317347035874, + "grad_norm": 1.8268091678619385, + "learning_rate": 4.9809241049624966e-05, + "loss": 5.4088, + "step": 6617 + }, + { + "epoch": 0.03935912075363974, + "grad_norm": 1.838927984237671, + "learning_rate": 4.98091834527958e-05, + "loss": 5.5335, + "step": 6618 + }, + { + "epoch": 0.03936506803692073, + "grad_norm": 1.8441804647445679, + "learning_rate": 4.9809125847306e-05, + "loss": 5.4639, + "step": 6619 + }, + { + "epoch": 0.03937101532020173, + "grad_norm": 2.012754440307617, + "learning_rate": 4.980906823315561e-05, + "loss": 5.5606, + "step": 6620 + }, + { + "epoch": 0.03937696260348273, + "grad_norm": 1.8358973264694214, + "learning_rate": 4.980901061034461e-05, + "loss": 5.4217, + "step": 6621 + }, + { + "epoch": 0.039382909886763726, + "grad_norm": 2.0668959617614746, + "learning_rate": 4.980895297887305e-05, + "loss": 5.5164, + "step": 6622 + }, + { + "epoch": 0.03938885717004472, + "grad_norm": 2.032320976257324, + "learning_rate": 4.9808895338740934e-05, + "loss": 5.4914, + "step": 6623 + }, + { + "epoch": 0.03939480445332572, + "grad_norm": 1.8650145530700684, + "learning_rate": 4.980883768994829e-05, + "loss": 5.3718, + "step": 6624 + }, + { + "epoch": 0.03940075173660672, + "grad_norm": 4.494358539581299, + "learning_rate": 4.980878003249515e-05, + "loss": 5.5253, + "step": 6625 + }, + { + "epoch": 0.03940669901988771, + "grad_norm": 1.9295374155044556, + "learning_rate": 4.980872236638151e-05, + "loss": 5.3187, + "step": 6626 + }, + { + "epoch": 0.039412646303168715, + "grad_norm": 2.089717388153076, + "learning_rate": 4.980866469160741e-05, + "loss": 5.5311, + "step": 6627 + }, + { + "epoch": 0.03941859358644971, + "grad_norm": 1.701429843902588, + "learning_rate": 4.980860700817285e-05, + "loss": 5.4529, + "step": 6628 + }, + { + "epoch": 0.039424540869730705, + "grad_norm": 1.8336073160171509, + "learning_rate": 4.980854931607787e-05, + "loss": 5.2987, + "step": 6629 + }, + { + "epoch": 0.03943048815301171, + "grad_norm": 2.7922565937042236, + "learning_rate": 4.9808491615322475e-05, + "loss": 5.3492, + "step": 6630 + }, + { + "epoch": 0.0394364354362927, + "grad_norm": 1.8253742456436157, + "learning_rate": 4.980843390590669e-05, + "loss": 5.3928, + "step": 6631 + }, + { + "epoch": 0.0394423827195737, + "grad_norm": 2.646916151046753, + "learning_rate": 4.980837618783055e-05, + "loss": 5.4329, + "step": 6632 + }, + { + "epoch": 0.0394483300028547, + "grad_norm": 2.1956236362457275, + "learning_rate": 4.980831846109405e-05, + "loss": 5.4794, + "step": 6633 + }, + { + "epoch": 0.039454277286135694, + "grad_norm": 2.7274577617645264, + "learning_rate": 4.980826072569723e-05, + "loss": 5.9666, + "step": 6634 + }, + { + "epoch": 0.03946022456941669, + "grad_norm": 1.9890350103378296, + "learning_rate": 4.98082029816401e-05, + "loss": 5.5518, + "step": 6635 + }, + { + "epoch": 0.039466171852697685, + "grad_norm": 2.7760517597198486, + "learning_rate": 4.980814522892268e-05, + "loss": 5.2777, + "step": 6636 + }, + { + "epoch": 0.03947211913597869, + "grad_norm": 2.035254716873169, + "learning_rate": 4.9808087467544995e-05, + "loss": 5.5872, + "step": 6637 + }, + { + "epoch": 0.03947806641925968, + "grad_norm": 1.9728864431381226, + "learning_rate": 4.980802969750706e-05, + "loss": 5.3357, + "step": 6638 + }, + { + "epoch": 0.03948401370254068, + "grad_norm": 1.795480489730835, + "learning_rate": 4.98079719188089e-05, + "loss": 5.6414, + "step": 6639 + }, + { + "epoch": 0.03948996098582168, + "grad_norm": 1.7882109880447388, + "learning_rate": 4.980791413145054e-05, + "loss": 5.3499, + "step": 6640 + }, + { + "epoch": 0.039495908269102674, + "grad_norm": 1.8416422605514526, + "learning_rate": 4.9807856335431994e-05, + "loss": 5.3292, + "step": 6641 + }, + { + "epoch": 0.03950185555238367, + "grad_norm": 1.9525254964828491, + "learning_rate": 4.9807798530753266e-05, + "loss": 5.2782, + "step": 6642 + }, + { + "epoch": 0.03950780283566467, + "grad_norm": 1.5100830793380737, + "learning_rate": 4.9807740717414406e-05, + "loss": 5.2807, + "step": 6643 + }, + { + "epoch": 0.039513750118945666, + "grad_norm": 2.029430866241455, + "learning_rate": 4.9807682895415406e-05, + "loss": 5.4496, + "step": 6644 + }, + { + "epoch": 0.03951969740222666, + "grad_norm": 1.7976901531219482, + "learning_rate": 4.9807625064756315e-05, + "loss": 5.1021, + "step": 6645 + }, + { + "epoch": 0.03952564468550766, + "grad_norm": 1.5770336389541626, + "learning_rate": 4.980756722543714e-05, + "loss": 5.3946, + "step": 6646 + }, + { + "epoch": 0.03953159196878866, + "grad_norm": 1.8289496898651123, + "learning_rate": 4.980750937745788e-05, + "loss": 5.4821, + "step": 6647 + }, + { + "epoch": 0.03953753925206965, + "grad_norm": 1.7413506507873535, + "learning_rate": 4.980745152081859e-05, + "loss": 5.4827, + "step": 6648 + }, + { + "epoch": 0.03954348653535065, + "grad_norm": 2.048400402069092, + "learning_rate": 4.980739365551927e-05, + "loss": 5.2359, + "step": 6649 + }, + { + "epoch": 0.03954943381863165, + "grad_norm": 2.331897735595703, + "learning_rate": 4.980733578155995e-05, + "loss": 5.2988, + "step": 6650 + }, + { + "epoch": 0.039555381101912646, + "grad_norm": 2.1224608421325684, + "learning_rate": 4.980727789894065e-05, + "loss": 5.1228, + "step": 6651 + }, + { + "epoch": 0.03956132838519364, + "grad_norm": 1.5331578254699707, + "learning_rate": 4.9807220007661374e-05, + "loss": 5.184, + "step": 6652 + }, + { + "epoch": 0.03956727566847464, + "grad_norm": 1.773489236831665, + "learning_rate": 4.980716210772216e-05, + "loss": 5.1883, + "step": 6653 + }, + { + "epoch": 0.03957322295175564, + "grad_norm": 2.119302749633789, + "learning_rate": 4.9807104199123016e-05, + "loss": 5.5437, + "step": 6654 + }, + { + "epoch": 0.03957917023503663, + "grad_norm": 2.0695033073425293, + "learning_rate": 4.9807046281863974e-05, + "loss": 5.5951, + "step": 6655 + }, + { + "epoch": 0.039585117518317635, + "grad_norm": 2.0522243976593018, + "learning_rate": 4.980698835594505e-05, + "loss": 5.2736, + "step": 6656 + }, + { + "epoch": 0.03959106480159863, + "grad_norm": 2.3200113773345947, + "learning_rate": 4.980693042136626e-05, + "loss": 5.5701, + "step": 6657 + }, + { + "epoch": 0.039597012084879625, + "grad_norm": 1.8731193542480469, + "learning_rate": 4.980687247812762e-05, + "loss": 5.3929, + "step": 6658 + }, + { + "epoch": 0.03960295936816063, + "grad_norm": 1.8390223979949951, + "learning_rate": 4.980681452622916e-05, + "loss": 5.1684, + "step": 6659 + }, + { + "epoch": 0.03960890665144162, + "grad_norm": 2.24766206741333, + "learning_rate": 4.980675656567091e-05, + "loss": 5.0232, + "step": 6660 + }, + { + "epoch": 0.03961485393472262, + "grad_norm": 2.2592451572418213, + "learning_rate": 4.980669859645286e-05, + "loss": 4.9878, + "step": 6661 + }, + { + "epoch": 0.03962080121800362, + "grad_norm": 2.14709734916687, + "learning_rate": 4.9806640618575064e-05, + "loss": 5.1036, + "step": 6662 + }, + { + "epoch": 0.039626748501284614, + "grad_norm": 2.133910655975342, + "learning_rate": 4.9806582632037516e-05, + "loss": 5.0356, + "step": 6663 + }, + { + "epoch": 0.03963269578456561, + "grad_norm": 2.2513222694396973, + "learning_rate": 4.980652463684025e-05, + "loss": 5.2357, + "step": 6664 + }, + { + "epoch": 0.039638643067846605, + "grad_norm": 2.078355312347412, + "learning_rate": 4.980646663298328e-05, + "loss": 5.3857, + "step": 6665 + }, + { + "epoch": 0.03964459035112761, + "grad_norm": 2.3798105716705322, + "learning_rate": 4.980640862046663e-05, + "loss": 5.0888, + "step": 6666 + }, + { + "epoch": 0.0396505376344086, + "grad_norm": 2.241868019104004, + "learning_rate": 4.980635059929032e-05, + "loss": 5.1397, + "step": 6667 + }, + { + "epoch": 0.0396564849176896, + "grad_norm": 2.2053534984588623, + "learning_rate": 4.9806292569454365e-05, + "loss": 4.799, + "step": 6668 + }, + { + "epoch": 0.0396624322009706, + "grad_norm": 2.2996716499328613, + "learning_rate": 4.980623453095879e-05, + "loss": 4.9597, + "step": 6669 + }, + { + "epoch": 0.039668379484251594, + "grad_norm": 1.9892657995224, + "learning_rate": 4.9806176483803615e-05, + "loss": 5.0784, + "step": 6670 + }, + { + "epoch": 0.03967432676753259, + "grad_norm": 2.2087242603302, + "learning_rate": 4.980611842798887e-05, + "loss": 5.4099, + "step": 6671 + }, + { + "epoch": 0.03968027405081359, + "grad_norm": 2.215728521347046, + "learning_rate": 4.980606036351455e-05, + "loss": 5.2889, + "step": 6672 + }, + { + "epoch": 0.039686221334094586, + "grad_norm": 2.228073835372925, + "learning_rate": 4.9806002290380705e-05, + "loss": 5.3816, + "step": 6673 + }, + { + "epoch": 0.03969216861737558, + "grad_norm": 2.209808826446533, + "learning_rate": 4.980594420858733e-05, + "loss": 5.6233, + "step": 6674 + }, + { + "epoch": 0.03969811590065658, + "grad_norm": 1.8294177055358887, + "learning_rate": 4.980588611813446e-05, + "loss": 5.5756, + "step": 6675 + }, + { + "epoch": 0.03970406318393758, + "grad_norm": 2.236435890197754, + "learning_rate": 4.980582801902212e-05, + "loss": 5.4807, + "step": 6676 + }, + { + "epoch": 0.03971001046721857, + "grad_norm": 2.528804063796997, + "learning_rate": 4.980576991125031e-05, + "loss": 5.6503, + "step": 6677 + }, + { + "epoch": 0.03971595775049957, + "grad_norm": 2.312063217163086, + "learning_rate": 4.9805711794819065e-05, + "loss": 5.5517, + "step": 6678 + }, + { + "epoch": 0.03972190503378057, + "grad_norm": 2.336134672164917, + "learning_rate": 4.98056536697284e-05, + "loss": 5.5708, + "step": 6679 + }, + { + "epoch": 0.039727852317061566, + "grad_norm": 2.2809929847717285, + "learning_rate": 4.980559553597834e-05, + "loss": 5.453, + "step": 6680 + }, + { + "epoch": 0.03973379960034256, + "grad_norm": 2.0603368282318115, + "learning_rate": 4.98055373935689e-05, + "loss": 5.3482, + "step": 6681 + }, + { + "epoch": 0.03973974688362356, + "grad_norm": 1.9654933214187622, + "learning_rate": 4.980547924250011e-05, + "loss": 5.29, + "step": 6682 + }, + { + "epoch": 0.03974569416690456, + "grad_norm": 2.4211983680725098, + "learning_rate": 4.9805421082771985e-05, + "loss": 5.4261, + "step": 6683 + }, + { + "epoch": 0.03975164145018555, + "grad_norm": 2.129987955093384, + "learning_rate": 4.9805362914384533e-05, + "loss": 5.3551, + "step": 6684 + }, + { + "epoch": 0.039757588733466555, + "grad_norm": 2.127936601638794, + "learning_rate": 4.9805304737337796e-05, + "loss": 5.4647, + "step": 6685 + }, + { + "epoch": 0.03976353601674755, + "grad_norm": 2.303382158279419, + "learning_rate": 4.980524655163178e-05, + "loss": 5.1699, + "step": 6686 + }, + { + "epoch": 0.039769483300028545, + "grad_norm": 2.6889941692352295, + "learning_rate": 4.98051883572665e-05, + "loss": 5.2031, + "step": 6687 + }, + { + "epoch": 0.03977543058330955, + "grad_norm": 3.321950674057007, + "learning_rate": 4.9805130154242e-05, + "loss": 4.9815, + "step": 6688 + }, + { + "epoch": 0.03978137786659054, + "grad_norm": 3.1951568126678467, + "learning_rate": 4.980507194255827e-05, + "loss": 4.8946, + "step": 6689 + }, + { + "epoch": 0.03978732514987154, + "grad_norm": 2.355271816253662, + "learning_rate": 4.9805013722215355e-05, + "loss": 5.9223, + "step": 6690 + }, + { + "epoch": 0.03979327243315254, + "grad_norm": 2.3401644229888916, + "learning_rate": 4.9804955493213264e-05, + "loss": 6.1826, + "step": 6691 + }, + { + "epoch": 0.039799219716433534, + "grad_norm": 2.191997766494751, + "learning_rate": 4.980489725555202e-05, + "loss": 5.5617, + "step": 6692 + }, + { + "epoch": 0.03980516699971453, + "grad_norm": 2.377803087234497, + "learning_rate": 4.9804839009231644e-05, + "loss": 5.684, + "step": 6693 + }, + { + "epoch": 0.039811114282995524, + "grad_norm": 1.9084972143173218, + "learning_rate": 4.980478075425215e-05, + "loss": 6.0291, + "step": 6694 + }, + { + "epoch": 0.039817061566276526, + "grad_norm": 2.185628890991211, + "learning_rate": 4.9804722490613566e-05, + "loss": 5.5808, + "step": 6695 + }, + { + "epoch": 0.03982300884955752, + "grad_norm": 2.3253934383392334, + "learning_rate": 4.980466421831591e-05, + "loss": 5.7076, + "step": 6696 + }, + { + "epoch": 0.03982895613283852, + "grad_norm": 2.1599392890930176, + "learning_rate": 4.98046059373592e-05, + "loss": 5.9607, + "step": 6697 + }, + { + "epoch": 0.03983490341611952, + "grad_norm": 2.093137741088867, + "learning_rate": 4.980454764774346e-05, + "loss": 6.0014, + "step": 6698 + }, + { + "epoch": 0.039840850699400514, + "grad_norm": 2.4242093563079834, + "learning_rate": 4.980448934946871e-05, + "loss": 5.6255, + "step": 6699 + }, + { + "epoch": 0.03984679798268151, + "grad_norm": 2.523277521133423, + "learning_rate": 4.980443104253497e-05, + "loss": 5.5302, + "step": 6700 + }, + { + "epoch": 0.03985274526596251, + "grad_norm": 1.7926498651504517, + "learning_rate": 4.980437272694225e-05, + "loss": 5.6467, + "step": 6701 + }, + { + "epoch": 0.039858692549243506, + "grad_norm": 1.7630435228347778, + "learning_rate": 4.980431440269059e-05, + "loss": 5.9615, + "step": 6702 + }, + { + "epoch": 0.0398646398325245, + "grad_norm": 1.8051058053970337, + "learning_rate": 4.980425606978e-05, + "loss": 6.13, + "step": 6703 + }, + { + "epoch": 0.0398705871158055, + "grad_norm": 2.104901075363159, + "learning_rate": 4.98041977282105e-05, + "loss": 6.142, + "step": 6704 + }, + { + "epoch": 0.0398765343990865, + "grad_norm": 1.7022942304611206, + "learning_rate": 4.98041393779821e-05, + "loss": 5.6764, + "step": 6705 + }, + { + "epoch": 0.03988248168236749, + "grad_norm": 2.140230178833008, + "learning_rate": 4.980408101909485e-05, + "loss": 5.9796, + "step": 6706 + }, + { + "epoch": 0.03988842896564849, + "grad_norm": 1.9564754962921143, + "learning_rate": 4.9804022651548734e-05, + "loss": 6.005, + "step": 6707 + }, + { + "epoch": 0.03989437624892949, + "grad_norm": 1.9460588693618774, + "learning_rate": 4.9803964275343795e-05, + "loss": 5.9784, + "step": 6708 + }, + { + "epoch": 0.039900323532210485, + "grad_norm": 1.7314271926879883, + "learning_rate": 4.980390589048005e-05, + "loss": 5.7766, + "step": 6709 + }, + { + "epoch": 0.03990627081549148, + "grad_norm": 2.0168917179107666, + "learning_rate": 4.9803847496957524e-05, + "loss": 5.7386, + "step": 6710 + }, + { + "epoch": 0.03991221809877248, + "grad_norm": 2.3194711208343506, + "learning_rate": 4.980378909477622e-05, + "loss": 6.1324, + "step": 6711 + }, + { + "epoch": 0.03991816538205348, + "grad_norm": 2.3532958030700684, + "learning_rate": 4.980373068393618e-05, + "loss": 6.027, + "step": 6712 + }, + { + "epoch": 0.03992411266533447, + "grad_norm": 2.5944385528564453, + "learning_rate": 4.980367226443741e-05, + "loss": 6.2892, + "step": 6713 + }, + { + "epoch": 0.039930059948615475, + "grad_norm": 1.5707015991210938, + "learning_rate": 4.9803613836279926e-05, + "loss": 5.6525, + "step": 6714 + }, + { + "epoch": 0.03993600723189647, + "grad_norm": 2.022613286972046, + "learning_rate": 4.980355539946376e-05, + "loss": 5.8943, + "step": 6715 + }, + { + "epoch": 0.039941954515177465, + "grad_norm": 1.7783907651901245, + "learning_rate": 4.980349695398894e-05, + "loss": 5.6451, + "step": 6716 + }, + { + "epoch": 0.03994790179845847, + "grad_norm": 2.098841428756714, + "learning_rate": 4.980343849985547e-05, + "loss": 6.1143, + "step": 6717 + }, + { + "epoch": 0.03995384908173946, + "grad_norm": 2.045955181121826, + "learning_rate": 4.9803380037063374e-05, + "loss": 6.1802, + "step": 6718 + }, + { + "epoch": 0.03995979636502046, + "grad_norm": 1.7324507236480713, + "learning_rate": 4.980332156561267e-05, + "loss": 6.081, + "step": 6719 + }, + { + "epoch": 0.03996574364830146, + "grad_norm": 1.795184850692749, + "learning_rate": 4.9803263085503385e-05, + "loss": 5.6075, + "step": 6720 + }, + { + "epoch": 0.039971690931582454, + "grad_norm": 2.1466586589813232, + "learning_rate": 4.980320459673554e-05, + "loss": 6.045, + "step": 6721 + }, + { + "epoch": 0.03997763821486345, + "grad_norm": 2.1261258125305176, + "learning_rate": 4.980314609930915e-05, + "loss": 6.0589, + "step": 6722 + }, + { + "epoch": 0.039983585498144444, + "grad_norm": 2.559584617614746, + "learning_rate": 4.980308759322424e-05, + "loss": 6.3894, + "step": 6723 + }, + { + "epoch": 0.039989532781425446, + "grad_norm": 2.4580929279327393, + "learning_rate": 4.980302907848083e-05, + "loss": 6.3979, + "step": 6724 + }, + { + "epoch": 0.03999548006470644, + "grad_norm": 1.8877859115600586, + "learning_rate": 4.9802970555078934e-05, + "loss": 5.5076, + "step": 6725 + }, + { + "epoch": 0.04000142734798744, + "grad_norm": 2.145123243331909, + "learning_rate": 4.9802912023018585e-05, + "loss": 6.1913, + "step": 6726 + }, + { + "epoch": 0.04000737463126844, + "grad_norm": 1.9321368932724, + "learning_rate": 4.980285348229979e-05, + "loss": 5.9614, + "step": 6727 + }, + { + "epoch": 0.040013321914549434, + "grad_norm": 1.883589506149292, + "learning_rate": 4.9802794932922577e-05, + "loss": 5.4293, + "step": 6728 + }, + { + "epoch": 0.04001926919783043, + "grad_norm": 1.9066367149353027, + "learning_rate": 4.980273637488696e-05, + "loss": 5.4299, + "step": 6729 + }, + { + "epoch": 0.04002521648111143, + "grad_norm": 1.845290184020996, + "learning_rate": 4.9802677808192963e-05, + "loss": 5.596, + "step": 6730 + }, + { + "epoch": 0.040031163764392426, + "grad_norm": 2.3295016288757324, + "learning_rate": 4.980261923284062e-05, + "loss": 6.1266, + "step": 6731 + }, + { + "epoch": 0.04003711104767342, + "grad_norm": 2.451676368713379, + "learning_rate": 4.980256064882993e-05, + "loss": 6.0578, + "step": 6732 + }, + { + "epoch": 0.04004305833095442, + "grad_norm": 2.1317830085754395, + "learning_rate": 4.9802502056160915e-05, + "loss": 6.2627, + "step": 6733 + }, + { + "epoch": 0.04004900561423542, + "grad_norm": 2.223085641860962, + "learning_rate": 4.980244345483361e-05, + "loss": 5.5751, + "step": 6734 + }, + { + "epoch": 0.04005495289751641, + "grad_norm": 2.508385181427002, + "learning_rate": 4.9802384844848035e-05, + "loss": 5.572, + "step": 6735 + }, + { + "epoch": 0.04006090018079741, + "grad_norm": 2.5150837898254395, + "learning_rate": 4.98023262262042e-05, + "loss": 5.3443, + "step": 6736 + }, + { + "epoch": 0.04006684746407841, + "grad_norm": 2.293503761291504, + "learning_rate": 4.980226759890212e-05, + "loss": 5.37, + "step": 6737 + }, + { + "epoch": 0.040072794747359405, + "grad_norm": 1.8764920234680176, + "learning_rate": 4.9802208962941834e-05, + "loss": 5.3804, + "step": 6738 + }, + { + "epoch": 0.0400787420306404, + "grad_norm": 1.8443305492401123, + "learning_rate": 4.980215031832335e-05, + "loss": 5.7787, + "step": 6739 + }, + { + "epoch": 0.0400846893139214, + "grad_norm": 2.6707816123962402, + "learning_rate": 4.980209166504669e-05, + "loss": 6.2858, + "step": 6740 + }, + { + "epoch": 0.0400906365972024, + "grad_norm": 2.3520665168762207, + "learning_rate": 4.980203300311188e-05, + "loss": 5.8069, + "step": 6741 + }, + { + "epoch": 0.04009658388048339, + "grad_norm": 2.0564348697662354, + "learning_rate": 4.980197433251893e-05, + "loss": 6.1698, + "step": 6742 + }, + { + "epoch": 0.040102531163764395, + "grad_norm": 2.205469846725464, + "learning_rate": 4.9801915653267875e-05, + "loss": 5.8401, + "step": 6743 + }, + { + "epoch": 0.04010847844704539, + "grad_norm": 2.042363405227661, + "learning_rate": 4.980185696535873e-05, + "loss": 5.9673, + "step": 6744 + }, + { + "epoch": 0.040114425730326385, + "grad_norm": 1.7575644254684448, + "learning_rate": 4.98017982687915e-05, + "loss": 5.7852, + "step": 6745 + }, + { + "epoch": 0.04012037301360739, + "grad_norm": 1.968548059463501, + "learning_rate": 4.980173956356623e-05, + "loss": 6.2085, + "step": 6746 + }, + { + "epoch": 0.04012632029688838, + "grad_norm": 2.0365097522735596, + "learning_rate": 4.980168084968292e-05, + "loss": 6.4235, + "step": 6747 + }, + { + "epoch": 0.04013226758016938, + "grad_norm": 2.7265079021453857, + "learning_rate": 4.9801622127141605e-05, + "loss": 6.0804, + "step": 6748 + }, + { + "epoch": 0.04013821486345038, + "grad_norm": 2.1604299545288086, + "learning_rate": 4.98015633959423e-05, + "loss": 5.942, + "step": 6749 + }, + { + "epoch": 0.040144162146731374, + "grad_norm": 2.4122307300567627, + "learning_rate": 4.980150465608502e-05, + "loss": 6.2877, + "step": 6750 + }, + { + "epoch": 0.04015010943001237, + "grad_norm": 2.040780782699585, + "learning_rate": 4.98014459075698e-05, + "loss": 5.645, + "step": 6751 + }, + { + "epoch": 0.040156056713293364, + "grad_norm": 2.3660147190093994, + "learning_rate": 4.980138715039665e-05, + "loss": 5.975, + "step": 6752 + }, + { + "epoch": 0.040162003996574366, + "grad_norm": 2.2332143783569336, + "learning_rate": 4.980132838456558e-05, + "loss": 6.1383, + "step": 6753 + }, + { + "epoch": 0.04016795127985536, + "grad_norm": 2.7028262615203857, + "learning_rate": 4.9801269610076635e-05, + "loss": 6.3817, + "step": 6754 + }, + { + "epoch": 0.04017389856313636, + "grad_norm": 2.4653360843658447, + "learning_rate": 4.980121082692982e-05, + "loss": 6.3079, + "step": 6755 + }, + { + "epoch": 0.04017984584641736, + "grad_norm": 2.1470963954925537, + "learning_rate": 4.980115203512515e-05, + "loss": 6.063, + "step": 6756 + }, + { + "epoch": 0.040185793129698354, + "grad_norm": 2.3440990447998047, + "learning_rate": 4.9801093234662666e-05, + "loss": 5.818, + "step": 6757 + }, + { + "epoch": 0.04019174041297935, + "grad_norm": 2.120245933532715, + "learning_rate": 4.980103442554237e-05, + "loss": 5.5867, + "step": 6758 + }, + { + "epoch": 0.04019768769626035, + "grad_norm": 3.196829080581665, + "learning_rate": 4.980097560776429e-05, + "loss": 6.0369, + "step": 6759 + }, + { + "epoch": 0.040203634979541346, + "grad_norm": 2.247997522354126, + "learning_rate": 4.9800916781328456e-05, + "loss": 5.8383, + "step": 6760 + }, + { + "epoch": 0.04020958226282234, + "grad_norm": 2.26254940032959, + "learning_rate": 4.9800857946234866e-05, + "loss": 5.8477, + "step": 6761 + }, + { + "epoch": 0.04021552954610334, + "grad_norm": 2.200495958328247, + "learning_rate": 4.9800799102483556e-05, + "loss": 5.681, + "step": 6762 + }, + { + "epoch": 0.04022147682938434, + "grad_norm": 2.136009454727173, + "learning_rate": 4.980074025007454e-05, + "loss": 5.6453, + "step": 6763 + }, + { + "epoch": 0.04022742411266533, + "grad_norm": 2.3510351181030273, + "learning_rate": 4.980068138900785e-05, + "loss": 5.5735, + "step": 6764 + }, + { + "epoch": 0.040233371395946335, + "grad_norm": 2.249199628829956, + "learning_rate": 4.980062251928349e-05, + "loss": 5.9883, + "step": 6765 + }, + { + "epoch": 0.04023931867922733, + "grad_norm": 2.426816463470459, + "learning_rate": 4.9800563640901494e-05, + "loss": 6.1658, + "step": 6766 + }, + { + "epoch": 0.040245265962508325, + "grad_norm": 2.1044836044311523, + "learning_rate": 4.9800504753861874e-05, + "loss": 5.8627, + "step": 6767 + }, + { + "epoch": 0.04025121324578932, + "grad_norm": 1.9563783407211304, + "learning_rate": 4.9800445858164656e-05, + "loss": 5.9642, + "step": 6768 + }, + { + "epoch": 0.04025716052907032, + "grad_norm": 2.3810997009277344, + "learning_rate": 4.980038695380986e-05, + "loss": 5.2938, + "step": 6769 + }, + { + "epoch": 0.04026310781235132, + "grad_norm": 2.3180932998657227, + "learning_rate": 4.98003280407975e-05, + "loss": 5.7682, + "step": 6770 + }, + { + "epoch": 0.04026905509563231, + "grad_norm": 2.420954704284668, + "learning_rate": 4.980026911912761e-05, + "loss": 5.5724, + "step": 6771 + }, + { + "epoch": 0.040275002378913315, + "grad_norm": 2.447460651397705, + "learning_rate": 4.9800210188800193e-05, + "loss": 5.4844, + "step": 6772 + }, + { + "epoch": 0.04028094966219431, + "grad_norm": 2.4059863090515137, + "learning_rate": 4.980015124981529e-05, + "loss": 5.604, + "step": 6773 + }, + { + "epoch": 0.040286896945475305, + "grad_norm": 2.251492977142334, + "learning_rate": 4.9800092302172894e-05, + "loss": 5.4565, + "step": 6774 + }, + { + "epoch": 0.04029284422875631, + "grad_norm": 2.478682279586792, + "learning_rate": 4.980003334587305e-05, + "loss": 5.9416, + "step": 6775 + }, + { + "epoch": 0.0402987915120373, + "grad_norm": 2.2685835361480713, + "learning_rate": 4.9799974380915785e-05, + "loss": 5.9659, + "step": 6776 + }, + { + "epoch": 0.0403047387953183, + "grad_norm": 2.833101987838745, + "learning_rate": 4.979991540730108e-05, + "loss": 5.3406, + "step": 6777 + }, + { + "epoch": 0.0403106860785993, + "grad_norm": 3.0967416763305664, + "learning_rate": 4.9799856425029e-05, + "loss": 5.5848, + "step": 6778 + }, + { + "epoch": 0.040316633361880294, + "grad_norm": 2.3081796169281006, + "learning_rate": 4.9799797434099536e-05, + "loss": 5.5964, + "step": 6779 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 2.359531879425049, + "learning_rate": 4.9799738434512724e-05, + "loss": 5.6614, + "step": 6780 + }, + { + "epoch": 0.040328527928442284, + "grad_norm": 2.1566221714019775, + "learning_rate": 4.979967942626858e-05, + "loss": 6.0517, + "step": 6781 + }, + { + "epoch": 0.040334475211723286, + "grad_norm": 2.3964991569519043, + "learning_rate": 4.979962040936712e-05, + "loss": 5.9516, + "step": 6782 + }, + { + "epoch": 0.04034042249500428, + "grad_norm": 1.9913266897201538, + "learning_rate": 4.9799561383808365e-05, + "loss": 5.9144, + "step": 6783 + }, + { + "epoch": 0.040346369778285276, + "grad_norm": 1.7329169511795044, + "learning_rate": 4.979950234959235e-05, + "loss": 6.0393, + "step": 6784 + }, + { + "epoch": 0.04035231706156628, + "grad_norm": 1.8278034925460815, + "learning_rate": 4.979944330671908e-05, + "loss": 5.9318, + "step": 6785 + }, + { + "epoch": 0.040358264344847274, + "grad_norm": 2.089806318283081, + "learning_rate": 4.979938425518858e-05, + "loss": 5.5726, + "step": 6786 + }, + { + "epoch": 0.04036421162812827, + "grad_norm": 2.03664231300354, + "learning_rate": 4.9799325195000874e-05, + "loss": 5.8265, + "step": 6787 + }, + { + "epoch": 0.04037015891140927, + "grad_norm": 1.8801567554473877, + "learning_rate": 4.979926612615597e-05, + "loss": 5.7575, + "step": 6788 + }, + { + "epoch": 0.040376106194690266, + "grad_norm": 1.814959168434143, + "learning_rate": 4.979920704865391e-05, + "loss": 5.8737, + "step": 6789 + }, + { + "epoch": 0.04038205347797126, + "grad_norm": 1.7018035650253296, + "learning_rate": 4.97991479624947e-05, + "loss": 5.6768, + "step": 6790 + }, + { + "epoch": 0.04038800076125226, + "grad_norm": 2.21545147895813, + "learning_rate": 4.979908886767837e-05, + "loss": 5.4206, + "step": 6791 + }, + { + "epoch": 0.04039394804453326, + "grad_norm": 2.6184499263763428, + "learning_rate": 4.979902976420492e-05, + "loss": 5.0255, + "step": 6792 + }, + { + "epoch": 0.04039989532781425, + "grad_norm": 2.3914453983306885, + "learning_rate": 4.9798970652074396e-05, + "loss": 4.884, + "step": 6793 + }, + { + "epoch": 0.040405842611095255, + "grad_norm": 2.4367334842681885, + "learning_rate": 4.97989115312868e-05, + "loss": 4.7445, + "step": 6794 + }, + { + "epoch": 0.04041178989437625, + "grad_norm": 2.794490337371826, + "learning_rate": 4.9798852401842165e-05, + "loss": 4.9686, + "step": 6795 + }, + { + "epoch": 0.040417737177657245, + "grad_norm": 2.665395736694336, + "learning_rate": 4.979879326374051e-05, + "loss": 4.854, + "step": 6796 + }, + { + "epoch": 0.04042368446093824, + "grad_norm": 2.0832581520080566, + "learning_rate": 4.979873411698184e-05, + "loss": 5.0371, + "step": 6797 + }, + { + "epoch": 0.04042963174421924, + "grad_norm": 2.4604554176330566, + "learning_rate": 4.979867496156619e-05, + "loss": 4.7524, + "step": 6798 + }, + { + "epoch": 0.04043557902750024, + "grad_norm": 2.3760480880737305, + "learning_rate": 4.979861579749359e-05, + "loss": 4.7645, + "step": 6799 + }, + { + "epoch": 0.04044152631078123, + "grad_norm": 2.468043088912964, + "learning_rate": 4.979855662476405e-05, + "loss": 4.7791, + "step": 6800 + }, + { + "epoch": 0.040447473594062235, + "grad_norm": 2.516026258468628, + "learning_rate": 4.979849744337758e-05, + "loss": 4.7978, + "step": 6801 + }, + { + "epoch": 0.04045342087734323, + "grad_norm": 2.1882307529449463, + "learning_rate": 4.979843825333421e-05, + "loss": 5.002, + "step": 6802 + }, + { + "epoch": 0.040459368160624225, + "grad_norm": 2.423140525817871, + "learning_rate": 4.979837905463397e-05, + "loss": 5.0161, + "step": 6803 + }, + { + "epoch": 0.04046531544390523, + "grad_norm": 2.485739231109619, + "learning_rate": 4.979831984727687e-05, + "loss": 4.7613, + "step": 6804 + }, + { + "epoch": 0.04047126272718622, + "grad_norm": 2.267744302749634, + "learning_rate": 4.979826063126293e-05, + "loss": 4.7496, + "step": 6805 + }, + { + "epoch": 0.04047721001046722, + "grad_norm": 2.3172249794006348, + "learning_rate": 4.9798201406592176e-05, + "loss": 4.8153, + "step": 6806 + }, + { + "epoch": 0.04048315729374822, + "grad_norm": 2.309471607208252, + "learning_rate": 4.979814217326463e-05, + "loss": 4.9874, + "step": 6807 + }, + { + "epoch": 0.040489104577029214, + "grad_norm": 1.989372968673706, + "learning_rate": 4.97980829312803e-05, + "loss": 5.1254, + "step": 6808 + }, + { + "epoch": 0.04049505186031021, + "grad_norm": 2.4409830570220947, + "learning_rate": 4.9798023680639216e-05, + "loss": 4.6476, + "step": 6809 + }, + { + "epoch": 0.040500999143591204, + "grad_norm": 2.5192453861236572, + "learning_rate": 4.97979644213414e-05, + "loss": 4.6933, + "step": 6810 + }, + { + "epoch": 0.040506946426872206, + "grad_norm": 2.294718027114868, + "learning_rate": 4.979790515338688e-05, + "loss": 4.8266, + "step": 6811 + }, + { + "epoch": 0.0405128937101532, + "grad_norm": 2.294550657272339, + "learning_rate": 4.979784587677565e-05, + "loss": 4.6691, + "step": 6812 + }, + { + "epoch": 0.040518840993434196, + "grad_norm": 2.332326889038086, + "learning_rate": 4.979778659150776e-05, + "loss": 4.8366, + "step": 6813 + }, + { + "epoch": 0.0405247882767152, + "grad_norm": 2.325439929962158, + "learning_rate": 4.979772729758322e-05, + "loss": 4.8149, + "step": 6814 + }, + { + "epoch": 0.040530735559996194, + "grad_norm": 2.165926456451416, + "learning_rate": 4.979766799500204e-05, + "loss": 4.7309, + "step": 6815 + }, + { + "epoch": 0.04053668284327719, + "grad_norm": 2.3184943199157715, + "learning_rate": 4.9797608683764264e-05, + "loss": 4.7163, + "step": 6816 + }, + { + "epoch": 0.04054263012655819, + "grad_norm": 2.2161147594451904, + "learning_rate": 4.979754936386989e-05, + "loss": 4.5549, + "step": 6817 + }, + { + "epoch": 0.040548577409839186, + "grad_norm": 2.415496587753296, + "learning_rate": 4.979749003531895e-05, + "loss": 4.7676, + "step": 6818 + }, + { + "epoch": 0.04055452469312018, + "grad_norm": 2.1700618267059326, + "learning_rate": 4.979743069811146e-05, + "loss": 4.8448, + "step": 6819 + }, + { + "epoch": 0.04056047197640118, + "grad_norm": 2.4978747367858887, + "learning_rate": 4.9797371352247446e-05, + "loss": 6.363, + "step": 6820 + }, + { + "epoch": 0.04056641925968218, + "grad_norm": 1.9293922185897827, + "learning_rate": 4.979731199772693e-05, + "loss": 5.6502, + "step": 6821 + }, + { + "epoch": 0.04057236654296317, + "grad_norm": 2.5583136081695557, + "learning_rate": 4.9797252634549915e-05, + "loss": 4.874, + "step": 6822 + }, + { + "epoch": 0.040578313826244175, + "grad_norm": 2.263460159301758, + "learning_rate": 4.979719326271645e-05, + "loss": 5.8457, + "step": 6823 + }, + { + "epoch": 0.04058426110952517, + "grad_norm": 2.5630266666412354, + "learning_rate": 4.979713388222653e-05, + "loss": 4.8668, + "step": 6824 + }, + { + "epoch": 0.040590208392806165, + "grad_norm": 2.2965216636657715, + "learning_rate": 4.9797074493080186e-05, + "loss": 5.0049, + "step": 6825 + }, + { + "epoch": 0.04059615567608716, + "grad_norm": 2.222405433654785, + "learning_rate": 4.979701509527745e-05, + "loss": 5.0204, + "step": 6826 + }, + { + "epoch": 0.04060210295936816, + "grad_norm": 2.4425504207611084, + "learning_rate": 4.979695568881833e-05, + "loss": 5.687, + "step": 6827 + }, + { + "epoch": 0.04060805024264916, + "grad_norm": 2.329901933670044, + "learning_rate": 4.979689627370284e-05, + "loss": 5.9447, + "step": 6828 + }, + { + "epoch": 0.04061399752593015, + "grad_norm": 2.3041510581970215, + "learning_rate": 4.9796836849931015e-05, + "loss": 5.9277, + "step": 6829 + }, + { + "epoch": 0.040619944809211155, + "grad_norm": 2.3020026683807373, + "learning_rate": 4.979677741750287e-05, + "loss": 5.9675, + "step": 6830 + }, + { + "epoch": 0.04062589209249215, + "grad_norm": 2.1861371994018555, + "learning_rate": 4.9796717976418426e-05, + "loss": 6.1312, + "step": 6831 + }, + { + "epoch": 0.040631839375773145, + "grad_norm": 1.9544565677642822, + "learning_rate": 4.979665852667771e-05, + "loss": 5.9218, + "step": 6832 + }, + { + "epoch": 0.04063778665905415, + "grad_norm": 2.346431016921997, + "learning_rate": 4.979659906828073e-05, + "loss": 6.1668, + "step": 6833 + }, + { + "epoch": 0.04064373394233514, + "grad_norm": 2.0405263900756836, + "learning_rate": 4.979653960122751e-05, + "loss": 6.0501, + "step": 6834 + }, + { + "epoch": 0.04064968122561614, + "grad_norm": 1.7645004987716675, + "learning_rate": 4.979648012551809e-05, + "loss": 6.0299, + "step": 6835 + }, + { + "epoch": 0.04065562850889714, + "grad_norm": 2.284703016281128, + "learning_rate": 4.979642064115246e-05, + "loss": 5.5501, + "step": 6836 + }, + { + "epoch": 0.040661575792178134, + "grad_norm": 1.7246543169021606, + "learning_rate": 4.979636114813066e-05, + "loss": 5.5733, + "step": 6837 + }, + { + "epoch": 0.04066752307545913, + "grad_norm": 2.0958921909332275, + "learning_rate": 4.9796301646452705e-05, + "loss": 5.8998, + "step": 6838 + }, + { + "epoch": 0.040673470358740124, + "grad_norm": 2.2123169898986816, + "learning_rate": 4.979624213611862e-05, + "loss": 6.0322, + "step": 6839 + }, + { + "epoch": 0.040679417642021126, + "grad_norm": 1.9541656970977783, + "learning_rate": 4.9796182617128426e-05, + "loss": 5.9255, + "step": 6840 + }, + { + "epoch": 0.04068536492530212, + "grad_norm": 2.077601909637451, + "learning_rate": 4.979612308948213e-05, + "loss": 5.6975, + "step": 6841 + }, + { + "epoch": 0.040691312208583116, + "grad_norm": 2.0595803260803223, + "learning_rate": 4.979606355317977e-05, + "loss": 6.0696, + "step": 6842 + }, + { + "epoch": 0.04069725949186412, + "grad_norm": 1.9800641536712646, + "learning_rate": 4.979600400822136e-05, + "loss": 5.7357, + "step": 6843 + }, + { + "epoch": 0.040703206775145113, + "grad_norm": 2.26238751411438, + "learning_rate": 4.979594445460692e-05, + "loss": 5.9119, + "step": 6844 + }, + { + "epoch": 0.04070915405842611, + "grad_norm": 2.0941457748413086, + "learning_rate": 4.979588489233648e-05, + "loss": 5.945, + "step": 6845 + }, + { + "epoch": 0.04071510134170711, + "grad_norm": 2.1995291709899902, + "learning_rate": 4.979582532141005e-05, + "loss": 5.8406, + "step": 6846 + }, + { + "epoch": 0.040721048624988106, + "grad_norm": 2.0138349533081055, + "learning_rate": 4.9795765741827646e-05, + "loss": 5.7984, + "step": 6847 + }, + { + "epoch": 0.0407269959082691, + "grad_norm": 1.9314415454864502, + "learning_rate": 4.9795706153589304e-05, + "loss": 5.8686, + "step": 6848 + }, + { + "epoch": 0.0407329431915501, + "grad_norm": 2.1324212551116943, + "learning_rate": 4.979564655669503e-05, + "loss": 5.8477, + "step": 6849 + }, + { + "epoch": 0.0407388904748311, + "grad_norm": 1.9601761102676392, + "learning_rate": 4.979558695114486e-05, + "loss": 5.9078, + "step": 6850 + }, + { + "epoch": 0.04074483775811209, + "grad_norm": 2.004333734512329, + "learning_rate": 4.97955273369388e-05, + "loss": 5.9852, + "step": 6851 + }, + { + "epoch": 0.040750785041393095, + "grad_norm": 1.9015164375305176, + "learning_rate": 4.979546771407688e-05, + "loss": 5.6286, + "step": 6852 + }, + { + "epoch": 0.04075673232467409, + "grad_norm": 1.9674208164215088, + "learning_rate": 4.979540808255911e-05, + "loss": 5.8715, + "step": 6853 + }, + { + "epoch": 0.040762679607955085, + "grad_norm": 2.0473713874816895, + "learning_rate": 4.9795348442385534e-05, + "loss": 5.7488, + "step": 6854 + }, + { + "epoch": 0.04076862689123608, + "grad_norm": 1.9536950588226318, + "learning_rate": 4.979528879355615e-05, + "loss": 5.6755, + "step": 6855 + }, + { + "epoch": 0.04077457417451708, + "grad_norm": 2.189659595489502, + "learning_rate": 4.979522913607099e-05, + "loss": 5.7934, + "step": 6856 + }, + { + "epoch": 0.04078052145779808, + "grad_norm": 1.999742031097412, + "learning_rate": 4.9795169469930067e-05, + "loss": 5.7341, + "step": 6857 + }, + { + "epoch": 0.04078646874107907, + "grad_norm": 2.1212494373321533, + "learning_rate": 4.9795109795133414e-05, + "loss": 5.8465, + "step": 6858 + }, + { + "epoch": 0.040792416024360074, + "grad_norm": 1.966467261314392, + "learning_rate": 4.979505011168104e-05, + "loss": 5.8699, + "step": 6859 + }, + { + "epoch": 0.04079836330764107, + "grad_norm": 2.290205955505371, + "learning_rate": 4.979499041957297e-05, + "loss": 6.387, + "step": 6860 + }, + { + "epoch": 0.040804310590922065, + "grad_norm": 2.41827130317688, + "learning_rate": 4.979493071880923e-05, + "loss": 6.893, + "step": 6861 + }, + { + "epoch": 0.04081025787420307, + "grad_norm": 2.0652520656585693, + "learning_rate": 4.979487100938983e-05, + "loss": 6.6435, + "step": 6862 + }, + { + "epoch": 0.04081620515748406, + "grad_norm": 1.8594858646392822, + "learning_rate": 4.979481129131479e-05, + "loss": 5.7441, + "step": 6863 + }, + { + "epoch": 0.04082215244076506, + "grad_norm": 2.269240617752075, + "learning_rate": 4.979475156458415e-05, + "loss": 5.8468, + "step": 6864 + }, + { + "epoch": 0.04082809972404606, + "grad_norm": 2.2355518341064453, + "learning_rate": 4.979469182919792e-05, + "loss": 5.8717, + "step": 6865 + }, + { + "epoch": 0.040834047007327054, + "grad_norm": 1.9578050374984741, + "learning_rate": 4.9794632085156105e-05, + "loss": 5.6777, + "step": 6866 + }, + { + "epoch": 0.04083999429060805, + "grad_norm": 2.354609727859497, + "learning_rate": 4.979457233245875e-05, + "loss": 5.7993, + "step": 6867 + }, + { + "epoch": 0.040845941573889044, + "grad_norm": 1.978289008140564, + "learning_rate": 4.9794512571105865e-05, + "loss": 5.7429, + "step": 6868 + }, + { + "epoch": 0.040851888857170046, + "grad_norm": 1.9695252180099487, + "learning_rate": 4.979445280109747e-05, + "loss": 6.1322, + "step": 6869 + }, + { + "epoch": 0.04085783614045104, + "grad_norm": 2.172510862350464, + "learning_rate": 4.9794393022433586e-05, + "loss": 5.9443, + "step": 6870 + }, + { + "epoch": 0.040863783423732036, + "grad_norm": 2.1992416381835938, + "learning_rate": 4.9794333235114244e-05, + "loss": 6.4094, + "step": 6871 + }, + { + "epoch": 0.04086973070701304, + "grad_norm": 2.1804773807525635, + "learning_rate": 4.979427343913945e-05, + "loss": 6.3871, + "step": 6872 + }, + { + "epoch": 0.04087567799029403, + "grad_norm": 2.2877554893493652, + "learning_rate": 4.979421363450923e-05, + "loss": 6.2509, + "step": 6873 + }, + { + "epoch": 0.04088162527357503, + "grad_norm": 2.0697927474975586, + "learning_rate": 4.979415382122361e-05, + "loss": 5.9008, + "step": 6874 + }, + { + "epoch": 0.04088757255685603, + "grad_norm": 2.2907917499542236, + "learning_rate": 4.97940939992826e-05, + "loss": 5.6137, + "step": 6875 + }, + { + "epoch": 0.040893519840137026, + "grad_norm": 1.9960983991622925, + "learning_rate": 4.979403416868623e-05, + "loss": 5.7283, + "step": 6876 + }, + { + "epoch": 0.04089946712341802, + "grad_norm": 2.2767558097839355, + "learning_rate": 4.9793974329434525e-05, + "loss": 5.3632, + "step": 6877 + }, + { + "epoch": 0.04090541440669902, + "grad_norm": 2.295635461807251, + "learning_rate": 4.97939144815275e-05, + "loss": 5.4524, + "step": 6878 + }, + { + "epoch": 0.04091136168998002, + "grad_norm": 2.247194766998291, + "learning_rate": 4.9793854624965166e-05, + "loss": 5.7846, + "step": 6879 + }, + { + "epoch": 0.04091730897326101, + "grad_norm": 2.2641420364379883, + "learning_rate": 4.9793794759747565e-05, + "loss": 5.7479, + "step": 6880 + }, + { + "epoch": 0.040923256256542015, + "grad_norm": 2.002126455307007, + "learning_rate": 4.97937348858747e-05, + "loss": 5.2694, + "step": 6881 + }, + { + "epoch": 0.04092920353982301, + "grad_norm": 2.079157590866089, + "learning_rate": 4.9793675003346596e-05, + "loss": 6.2711, + "step": 6882 + }, + { + "epoch": 0.040935150823104005, + "grad_norm": 1.9030524492263794, + "learning_rate": 4.979361511216328e-05, + "loss": 5.7259, + "step": 6883 + }, + { + "epoch": 0.040941098106385, + "grad_norm": 1.9157373905181885, + "learning_rate": 4.9793555212324774e-05, + "loss": 6.086, + "step": 6884 + }, + { + "epoch": 0.040947045389666, + "grad_norm": 1.8622015714645386, + "learning_rate": 4.979349530383108e-05, + "loss": 6.1318, + "step": 6885 + }, + { + "epoch": 0.040952992672947, + "grad_norm": 2.3341257572174072, + "learning_rate": 4.9793435386682256e-05, + "loss": 5.9421, + "step": 6886 + }, + { + "epoch": 0.04095893995622799, + "grad_norm": 2.6894209384918213, + "learning_rate": 4.979337546087828e-05, + "loss": 5.5351, + "step": 6887 + }, + { + "epoch": 0.040964887239508994, + "grad_norm": 2.5316739082336426, + "learning_rate": 4.979331552641919e-05, + "loss": 5.5056, + "step": 6888 + }, + { + "epoch": 0.04097083452278999, + "grad_norm": 2.5129077434539795, + "learning_rate": 4.979325558330502e-05, + "loss": 5.3091, + "step": 6889 + }, + { + "epoch": 0.040976781806070985, + "grad_norm": 2.275536298751831, + "learning_rate": 4.979319563153578e-05, + "loss": 5.494, + "step": 6890 + }, + { + "epoch": 0.04098272908935199, + "grad_norm": 2.749375104904175, + "learning_rate": 4.9793135671111494e-05, + "loss": 6.0139, + "step": 6891 + }, + { + "epoch": 0.04098867637263298, + "grad_norm": 2.419163227081299, + "learning_rate": 4.9793075702032177e-05, + "loss": 6.1102, + "step": 6892 + }, + { + "epoch": 0.04099462365591398, + "grad_norm": 2.311450958251953, + "learning_rate": 4.9793015724297856e-05, + "loss": 5.9798, + "step": 6893 + }, + { + "epoch": 0.04100057093919498, + "grad_norm": 2.0522212982177734, + "learning_rate": 4.979295573790854e-05, + "loss": 5.9247, + "step": 6894 + }, + { + "epoch": 0.041006518222475974, + "grad_norm": 2.1928513050079346, + "learning_rate": 4.979289574286427e-05, + "loss": 5.8001, + "step": 6895 + }, + { + "epoch": 0.04101246550575697, + "grad_norm": 2.1945207118988037, + "learning_rate": 4.979283573916505e-05, + "loss": 5.9975, + "step": 6896 + }, + { + "epoch": 0.041018412789037964, + "grad_norm": 2.274843454360962, + "learning_rate": 4.979277572681091e-05, + "loss": 5.693, + "step": 6897 + }, + { + "epoch": 0.041024360072318966, + "grad_norm": 2.2715282440185547, + "learning_rate": 4.979271570580186e-05, + "loss": 5.9952, + "step": 6898 + }, + { + "epoch": 0.04103030735559996, + "grad_norm": 2.4459903240203857, + "learning_rate": 4.9792655676137943e-05, + "loss": 6.0305, + "step": 6899 + }, + { + "epoch": 0.041036254638880956, + "grad_norm": 2.8737339973449707, + "learning_rate": 4.9792595637819165e-05, + "loss": 6.0982, + "step": 6900 + }, + { + "epoch": 0.04104220192216196, + "grad_norm": 2.382143974304199, + "learning_rate": 4.979253559084553e-05, + "loss": 5.6122, + "step": 6901 + }, + { + "epoch": 0.04104814920544295, + "grad_norm": 2.4127237796783447, + "learning_rate": 4.97924755352171e-05, + "loss": 5.7723, + "step": 6902 + }, + { + "epoch": 0.04105409648872395, + "grad_norm": 2.3108956813812256, + "learning_rate": 4.979241547093386e-05, + "loss": 6.1655, + "step": 6903 + }, + { + "epoch": 0.04106004377200495, + "grad_norm": 2.250555992126465, + "learning_rate": 4.979235539799584e-05, + "loss": 6.0627, + "step": 6904 + }, + { + "epoch": 0.041065991055285946, + "grad_norm": 2.187957525253296, + "learning_rate": 4.979229531640307e-05, + "loss": 6.1438, + "step": 6905 + }, + { + "epoch": 0.04107193833856694, + "grad_norm": 1.9089539051055908, + "learning_rate": 4.979223522615557e-05, + "loss": 6.1431, + "step": 6906 + }, + { + "epoch": 0.04107788562184794, + "grad_norm": 2.343569040298462, + "learning_rate": 4.979217512725336e-05, + "loss": 5.9774, + "step": 6907 + }, + { + "epoch": 0.04108383290512894, + "grad_norm": 2.759631633758545, + "learning_rate": 4.979211501969645e-05, + "loss": 5.7982, + "step": 6908 + }, + { + "epoch": 0.04108978018840993, + "grad_norm": 2.295811414718628, + "learning_rate": 4.979205490348487e-05, + "loss": 6.0843, + "step": 6909 + }, + { + "epoch": 0.041095727471690935, + "grad_norm": 2.6259605884552, + "learning_rate": 4.979199477861864e-05, + "loss": 5.6498, + "step": 6910 + }, + { + "epoch": 0.04110167475497193, + "grad_norm": 2.396895408630371, + "learning_rate": 4.9791934645097785e-05, + "loss": 5.9936, + "step": 6911 + }, + { + "epoch": 0.041107622038252925, + "grad_norm": 2.020845651626587, + "learning_rate": 4.979187450292231e-05, + "loss": 5.4867, + "step": 6912 + }, + { + "epoch": 0.04111356932153392, + "grad_norm": 2.6473753452301025, + "learning_rate": 4.979181435209226e-05, + "loss": 5.3556, + "step": 6913 + }, + { + "epoch": 0.04111951660481492, + "grad_norm": 2.353158712387085, + "learning_rate": 4.9791754192607636e-05, + "loss": 6.3122, + "step": 6914 + }, + { + "epoch": 0.04112546388809592, + "grad_norm": 2.499817132949829, + "learning_rate": 4.9791694024468474e-05, + "loss": 5.816, + "step": 6915 + }, + { + "epoch": 0.04113141117137691, + "grad_norm": 2.009239673614502, + "learning_rate": 4.979163384767478e-05, + "loss": 5.5982, + "step": 6916 + }, + { + "epoch": 0.041137358454657914, + "grad_norm": 2.3885819911956787, + "learning_rate": 4.9791573662226586e-05, + "loss": 5.7403, + "step": 6917 + }, + { + "epoch": 0.04114330573793891, + "grad_norm": 2.3135135173797607, + "learning_rate": 4.979151346812391e-05, + "loss": 5.3151, + "step": 6918 + }, + { + "epoch": 0.041149253021219905, + "grad_norm": 1.9801241159439087, + "learning_rate": 4.979145326536677e-05, + "loss": 5.5148, + "step": 6919 + }, + { + "epoch": 0.04115520030450091, + "grad_norm": 2.0724904537200928, + "learning_rate": 4.979139305395519e-05, + "loss": 5.5355, + "step": 6920 + }, + { + "epoch": 0.0411611475877819, + "grad_norm": 1.8104170560836792, + "learning_rate": 4.97913328338892e-05, + "loss": 5.4861, + "step": 6921 + }, + { + "epoch": 0.0411670948710629, + "grad_norm": 1.81072998046875, + "learning_rate": 4.9791272605168804e-05, + "loss": 5.5075, + "step": 6922 + }, + { + "epoch": 0.0411730421543439, + "grad_norm": 1.709191083908081, + "learning_rate": 4.979121236779403e-05, + "loss": 6.1353, + "step": 6923 + }, + { + "epoch": 0.041178989437624894, + "grad_norm": 2.004974126815796, + "learning_rate": 4.9791152121764903e-05, + "loss": 5.478, + "step": 6924 + }, + { + "epoch": 0.04118493672090589, + "grad_norm": 1.937933325767517, + "learning_rate": 4.979109186708144e-05, + "loss": 5.4022, + "step": 6925 + }, + { + "epoch": 0.041190884004186884, + "grad_norm": 1.9453305006027222, + "learning_rate": 4.979103160374367e-05, + "loss": 5.243, + "step": 6926 + }, + { + "epoch": 0.041196831287467886, + "grad_norm": 1.8552072048187256, + "learning_rate": 4.979097133175159e-05, + "loss": 5.3104, + "step": 6927 + }, + { + "epoch": 0.04120277857074888, + "grad_norm": 1.9148203134536743, + "learning_rate": 4.9790911051105246e-05, + "loss": 5.5538, + "step": 6928 + }, + { + "epoch": 0.041208725854029876, + "grad_norm": 1.9658032655715942, + "learning_rate": 4.979085076180466e-05, + "loss": 5.5285, + "step": 6929 + }, + { + "epoch": 0.04121467313731088, + "grad_norm": 1.7332781553268433, + "learning_rate": 4.9790790463849835e-05, + "loss": 5.1959, + "step": 6930 + }, + { + "epoch": 0.04122062042059187, + "grad_norm": 1.5762557983398438, + "learning_rate": 4.9790730157240804e-05, + "loss": 5.3672, + "step": 6931 + }, + { + "epoch": 0.04122656770387287, + "grad_norm": 1.7899656295776367, + "learning_rate": 4.979066984197759e-05, + "loss": 5.3588, + "step": 6932 + }, + { + "epoch": 0.04123251498715387, + "grad_norm": 1.5992622375488281, + "learning_rate": 4.97906095180602e-05, + "loss": 5.275, + "step": 6933 + }, + { + "epoch": 0.041238462270434866, + "grad_norm": 1.875116229057312, + "learning_rate": 4.9790549185488666e-05, + "loss": 5.3428, + "step": 6934 + }, + { + "epoch": 0.04124440955371586, + "grad_norm": 1.8110510110855103, + "learning_rate": 4.979048884426301e-05, + "loss": 5.2416, + "step": 6935 + }, + { + "epoch": 0.04125035683699686, + "grad_norm": 1.5512267351150513, + "learning_rate": 4.979042849438325e-05, + "loss": 5.3643, + "step": 6936 + }, + { + "epoch": 0.04125630412027786, + "grad_norm": 1.8929630517959595, + "learning_rate": 4.979036813584941e-05, + "loss": 5.4232, + "step": 6937 + }, + { + "epoch": 0.04126225140355885, + "grad_norm": 1.8569291830062866, + "learning_rate": 4.9790307768661504e-05, + "loss": 5.2949, + "step": 6938 + }, + { + "epoch": 0.041268198686839855, + "grad_norm": 1.6058611869812012, + "learning_rate": 4.9790247392819564e-05, + "loss": 5.3736, + "step": 6939 + }, + { + "epoch": 0.04127414597012085, + "grad_norm": 1.8455227613449097, + "learning_rate": 4.97901870083236e-05, + "loss": 5.2768, + "step": 6940 + }, + { + "epoch": 0.041280093253401845, + "grad_norm": 1.9346935749053955, + "learning_rate": 4.979012661517364e-05, + "loss": 5.4316, + "step": 6941 + }, + { + "epoch": 0.04128604053668284, + "grad_norm": 1.8085594177246094, + "learning_rate": 4.97900662133697e-05, + "loss": 5.365, + "step": 6942 + }, + { + "epoch": 0.04129198781996384, + "grad_norm": 1.73456871509552, + "learning_rate": 4.9790005802911804e-05, + "loss": 5.2726, + "step": 6943 + }, + { + "epoch": 0.04129793510324484, + "grad_norm": 2.1071617603302, + "learning_rate": 4.978994538379997e-05, + "loss": 6.2313, + "step": 6944 + }, + { + "epoch": 0.04130388238652583, + "grad_norm": 1.7098963260650635, + "learning_rate": 4.978988495603423e-05, + "loss": 5.3162, + "step": 6945 + }, + { + "epoch": 0.041309829669806834, + "grad_norm": 1.8131905794143677, + "learning_rate": 4.978982451961459e-05, + "loss": 5.2486, + "step": 6946 + }, + { + "epoch": 0.04131577695308783, + "grad_norm": 1.8162381649017334, + "learning_rate": 4.978976407454109e-05, + "loss": 5.2806, + "step": 6947 + }, + { + "epoch": 0.041321724236368824, + "grad_norm": 1.9250297546386719, + "learning_rate": 4.9789703620813734e-05, + "loss": 5.1742, + "step": 6948 + }, + { + "epoch": 0.041327671519649826, + "grad_norm": 1.8263678550720215, + "learning_rate": 4.978964315843254e-05, + "loss": 5.1786, + "step": 6949 + }, + { + "epoch": 0.04133361880293082, + "grad_norm": 1.6751807928085327, + "learning_rate": 4.9789582687397546e-05, + "loss": 5.4798, + "step": 6950 + }, + { + "epoch": 0.04133956608621182, + "grad_norm": 1.7842947244644165, + "learning_rate": 4.9789522207708764e-05, + "loss": 5.201, + "step": 6951 + }, + { + "epoch": 0.04134551336949282, + "grad_norm": 1.6785067319869995, + "learning_rate": 4.978946171936621e-05, + "loss": 5.3852, + "step": 6952 + }, + { + "epoch": 0.041351460652773814, + "grad_norm": 1.5475291013717651, + "learning_rate": 4.978940122236992e-05, + "loss": 5.4083, + "step": 6953 + }, + { + "epoch": 0.04135740793605481, + "grad_norm": 1.7445106506347656, + "learning_rate": 4.97893407167199e-05, + "loss": 5.3125, + "step": 6954 + }, + { + "epoch": 0.041363355219335804, + "grad_norm": 1.7334082126617432, + "learning_rate": 4.9789280202416175e-05, + "loss": 5.5388, + "step": 6955 + }, + { + "epoch": 0.041369302502616806, + "grad_norm": 1.7267119884490967, + "learning_rate": 4.9789219679458774e-05, + "loss": 5.5175, + "step": 6956 + }, + { + "epoch": 0.0413752497858978, + "grad_norm": 1.8033246994018555, + "learning_rate": 4.978915914784771e-05, + "loss": 5.3523, + "step": 6957 + }, + { + "epoch": 0.041381197069178796, + "grad_norm": 1.9836528301239014, + "learning_rate": 4.978909860758301e-05, + "loss": 5.3808, + "step": 6958 + }, + { + "epoch": 0.0413871443524598, + "grad_norm": 1.6260416507720947, + "learning_rate": 4.978903805866469e-05, + "loss": 5.4642, + "step": 6959 + }, + { + "epoch": 0.04139309163574079, + "grad_norm": 1.7260626554489136, + "learning_rate": 4.978897750109277e-05, + "loss": 5.4975, + "step": 6960 + }, + { + "epoch": 0.04139903891902179, + "grad_norm": 1.6948668956756592, + "learning_rate": 4.978891693486728e-05, + "loss": 5.5768, + "step": 6961 + }, + { + "epoch": 0.04140498620230279, + "grad_norm": 1.7885476350784302, + "learning_rate": 4.978885635998824e-05, + "loss": 5.4156, + "step": 6962 + }, + { + "epoch": 0.041410933485583785, + "grad_norm": 1.8626813888549805, + "learning_rate": 4.978879577645565e-05, + "loss": 5.354, + "step": 6963 + }, + { + "epoch": 0.04141688076886478, + "grad_norm": 1.867090106010437, + "learning_rate": 4.9788735184269553e-05, + "loss": 5.2934, + "step": 6964 + }, + { + "epoch": 0.04142282805214578, + "grad_norm": 1.7208340167999268, + "learning_rate": 4.9788674583429974e-05, + "loss": 5.2116, + "step": 6965 + }, + { + "epoch": 0.04142877533542678, + "grad_norm": 1.934480905532837, + "learning_rate": 4.9788613973936916e-05, + "loss": 5.5801, + "step": 6966 + }, + { + "epoch": 0.04143472261870777, + "grad_norm": 1.6263724565505981, + "learning_rate": 4.978855335579041e-05, + "loss": 5.3835, + "step": 6967 + }, + { + "epoch": 0.041440669901988775, + "grad_norm": 1.743996262550354, + "learning_rate": 4.9788492728990474e-05, + "loss": 5.3281, + "step": 6968 + }, + { + "epoch": 0.04144661718526977, + "grad_norm": 1.5556843280792236, + "learning_rate": 4.978843209353714e-05, + "loss": 5.442, + "step": 6969 + }, + { + "epoch": 0.041452564468550765, + "grad_norm": 1.5540435314178467, + "learning_rate": 4.978837144943041e-05, + "loss": 5.3621, + "step": 6970 + }, + { + "epoch": 0.04145851175183176, + "grad_norm": 1.7884414196014404, + "learning_rate": 4.9788310796670326e-05, + "loss": 5.571, + "step": 6971 + }, + { + "epoch": 0.04146445903511276, + "grad_norm": 1.7550957202911377, + "learning_rate": 4.9788250135256886e-05, + "loss": 5.61, + "step": 6972 + }, + { + "epoch": 0.04147040631839376, + "grad_norm": 1.9336804151535034, + "learning_rate": 4.978818946519013e-05, + "loss": 5.6142, + "step": 6973 + }, + { + "epoch": 0.04147635360167475, + "grad_norm": 1.8888505697250366, + "learning_rate": 4.978812878647008e-05, + "loss": 5.4908, + "step": 6974 + }, + { + "epoch": 0.041482300884955754, + "grad_norm": 1.940371036529541, + "learning_rate": 4.978806809909674e-05, + "loss": 5.5407, + "step": 6975 + }, + { + "epoch": 0.04148824816823675, + "grad_norm": 2.0182151794433594, + "learning_rate": 4.9788007403070146e-05, + "loss": 5.3643, + "step": 6976 + }, + { + "epoch": 0.041494195451517744, + "grad_norm": 1.7960541248321533, + "learning_rate": 4.978794669839032e-05, + "loss": 5.4994, + "step": 6977 + }, + { + "epoch": 0.041500142734798746, + "grad_norm": 1.8403207063674927, + "learning_rate": 4.978788598505727e-05, + "loss": 5.4501, + "step": 6978 + }, + { + "epoch": 0.04150609001807974, + "grad_norm": 1.7232698202133179, + "learning_rate": 4.978782526307103e-05, + "loss": 5.5406, + "step": 6979 + }, + { + "epoch": 0.04151203730136074, + "grad_norm": 1.7003169059753418, + "learning_rate": 4.9787764532431615e-05, + "loss": 5.3427, + "step": 6980 + }, + { + "epoch": 0.04151798458464174, + "grad_norm": 2.041384696960449, + "learning_rate": 4.978770379313904e-05, + "loss": 5.5121, + "step": 6981 + }, + { + "epoch": 0.041523931867922734, + "grad_norm": 1.5773900747299194, + "learning_rate": 4.978764304519334e-05, + "loss": 5.4604, + "step": 6982 + }, + { + "epoch": 0.04152987915120373, + "grad_norm": 1.8834172487258911, + "learning_rate": 4.9787582288594535e-05, + "loss": 5.5141, + "step": 6983 + }, + { + "epoch": 0.04153582643448473, + "grad_norm": 1.7956576347351074, + "learning_rate": 4.978752152334264e-05, + "loss": 5.5664, + "step": 6984 + }, + { + "epoch": 0.041541773717765726, + "grad_norm": 1.8676495552062988, + "learning_rate": 4.978746074943767e-05, + "loss": 5.2846, + "step": 6985 + }, + { + "epoch": 0.04154772100104672, + "grad_norm": 1.7709665298461914, + "learning_rate": 4.9787399966879654e-05, + "loss": 5.3375, + "step": 6986 + }, + { + "epoch": 0.041553668284327716, + "grad_norm": 2.012941837310791, + "learning_rate": 4.978733917566862e-05, + "loss": 5.6973, + "step": 6987 + }, + { + "epoch": 0.04155961556760872, + "grad_norm": 1.8220570087432861, + "learning_rate": 4.978727837580458e-05, + "loss": 5.191, + "step": 6988 + }, + { + "epoch": 0.04156556285088971, + "grad_norm": 1.6511586904525757, + "learning_rate": 4.978721756728755e-05, + "loss": 5.2787, + "step": 6989 + }, + { + "epoch": 0.04157151013417071, + "grad_norm": 1.9026141166687012, + "learning_rate": 4.978715675011757e-05, + "loss": 5.4456, + "step": 6990 + }, + { + "epoch": 0.04157745741745171, + "grad_norm": 1.8649898767471313, + "learning_rate": 4.9787095924294633e-05, + "loss": 5.5013, + "step": 6991 + }, + { + "epoch": 0.041583404700732705, + "grad_norm": 1.8720741271972656, + "learning_rate": 4.978703508981879e-05, + "loss": 5.3952, + "step": 6992 + }, + { + "epoch": 0.0415893519840137, + "grad_norm": 1.817356824874878, + "learning_rate": 4.978697424669005e-05, + "loss": 5.4719, + "step": 6993 + }, + { + "epoch": 0.0415952992672947, + "grad_norm": 1.740702509880066, + "learning_rate": 4.978691339490843e-05, + "loss": 5.6484, + "step": 6994 + }, + { + "epoch": 0.0416012465505757, + "grad_norm": 1.8752427101135254, + "learning_rate": 4.978685253447395e-05, + "loss": 5.6394, + "step": 6995 + }, + { + "epoch": 0.04160719383385669, + "grad_norm": 1.8180509805679321, + "learning_rate": 4.978679166538665e-05, + "loss": 5.3401, + "step": 6996 + }, + { + "epoch": 0.041613141117137695, + "grad_norm": 1.9002251625061035, + "learning_rate": 4.9786730787646516e-05, + "loss": 5.3237, + "step": 6997 + }, + { + "epoch": 0.04161908840041869, + "grad_norm": 1.741176724433899, + "learning_rate": 4.978666990125361e-05, + "loss": 5.2311, + "step": 6998 + }, + { + "epoch": 0.041625035683699685, + "grad_norm": 2.0994246006011963, + "learning_rate": 4.9786609006207925e-05, + "loss": 5.3549, + "step": 6999 + }, + { + "epoch": 0.04163098296698068, + "grad_norm": 1.8438987731933594, + "learning_rate": 4.978654810250949e-05, + "loss": 5.4322, + "step": 7000 + }, + { + "epoch": 0.04163693025026168, + "grad_norm": 1.7411181926727295, + "learning_rate": 4.978648719015833e-05, + "loss": 5.455, + "step": 7001 + }, + { + "epoch": 0.04164287753354268, + "grad_norm": 1.6879174709320068, + "learning_rate": 4.978642626915446e-05, + "loss": 5.3676, + "step": 7002 + }, + { + "epoch": 0.04164882481682367, + "grad_norm": 1.8912461996078491, + "learning_rate": 4.9786365339497906e-05, + "loss": 5.6181, + "step": 7003 + }, + { + "epoch": 0.041654772100104674, + "grad_norm": 1.9234617948532104, + "learning_rate": 4.978630440118869e-05, + "loss": 5.5388, + "step": 7004 + }, + { + "epoch": 0.04166071938338567, + "grad_norm": 2.1059048175811768, + "learning_rate": 4.9786243454226824e-05, + "loss": 5.6856, + "step": 7005 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 2.1900687217712402, + "learning_rate": 4.9786182498612347e-05, + "loss": 6.2426, + "step": 7006 + }, + { + "epoch": 0.041672613949947666, + "grad_norm": 1.7580265998840332, + "learning_rate": 4.9786121534345265e-05, + "loss": 5.2342, + "step": 7007 + }, + { + "epoch": 0.04167856123322866, + "grad_norm": 1.4747200012207031, + "learning_rate": 4.97860605614256e-05, + "loss": 5.1977, + "step": 7008 + }, + { + "epoch": 0.04168450851650966, + "grad_norm": 1.8164165019989014, + "learning_rate": 4.978599957985338e-05, + "loss": 5.1362, + "step": 7009 + }, + { + "epoch": 0.04169045579979066, + "grad_norm": 1.468550443649292, + "learning_rate": 4.978593858962863e-05, + "loss": 5.1265, + "step": 7010 + }, + { + "epoch": 0.041696403083071654, + "grad_norm": 1.584343433380127, + "learning_rate": 4.9785877590751356e-05, + "loss": 5.2611, + "step": 7011 + }, + { + "epoch": 0.04170235036635265, + "grad_norm": 1.7864785194396973, + "learning_rate": 4.978581658322159e-05, + "loss": 5.5214, + "step": 7012 + }, + { + "epoch": 0.04170829764963365, + "grad_norm": 1.8359016180038452, + "learning_rate": 4.978575556703936e-05, + "loss": 5.3808, + "step": 7013 + }, + { + "epoch": 0.041714244932914646, + "grad_norm": 1.8298325538635254, + "learning_rate": 4.978569454220467e-05, + "loss": 5.5606, + "step": 7014 + }, + { + "epoch": 0.04172019221619564, + "grad_norm": 2.1555540561676025, + "learning_rate": 4.978563350871755e-05, + "loss": 5.6592, + "step": 7015 + }, + { + "epoch": 0.041726139499476636, + "grad_norm": 2.5251846313476562, + "learning_rate": 4.9785572466578026e-05, + "loss": 5.5771, + "step": 7016 + }, + { + "epoch": 0.04173208678275764, + "grad_norm": 1.7765661478042603, + "learning_rate": 4.9785511415786115e-05, + "loss": 5.5558, + "step": 7017 + }, + { + "epoch": 0.04173803406603863, + "grad_norm": 1.9711554050445557, + "learning_rate": 4.978545035634183e-05, + "loss": 5.5565, + "step": 7018 + }, + { + "epoch": 0.04174398134931963, + "grad_norm": 1.8080202341079712, + "learning_rate": 4.978538928824521e-05, + "loss": 5.5037, + "step": 7019 + }, + { + "epoch": 0.04174992863260063, + "grad_norm": 1.7506872415542603, + "learning_rate": 4.978532821149626e-05, + "loss": 5.3362, + "step": 7020 + }, + { + "epoch": 0.041755875915881625, + "grad_norm": 1.5606149435043335, + "learning_rate": 4.978526712609501e-05, + "loss": 5.3541, + "step": 7021 + }, + { + "epoch": 0.04176182319916262, + "grad_norm": 1.8840737342834473, + "learning_rate": 4.9785206032041476e-05, + "loss": 5.2315, + "step": 7022 + }, + { + "epoch": 0.04176777048244362, + "grad_norm": 2.118178606033325, + "learning_rate": 4.978514492933569e-05, + "loss": 5.6174, + "step": 7023 + }, + { + "epoch": 0.04177371776572462, + "grad_norm": 2.043907403945923, + "learning_rate": 4.978508381797766e-05, + "loss": 5.6272, + "step": 7024 + }, + { + "epoch": 0.04177966504900561, + "grad_norm": 1.764411211013794, + "learning_rate": 4.978502269796742e-05, + "loss": 5.6153, + "step": 7025 + }, + { + "epoch": 0.041785612332286615, + "grad_norm": 1.5760626792907715, + "learning_rate": 4.978496156930498e-05, + "loss": 5.5734, + "step": 7026 + }, + { + "epoch": 0.04179155961556761, + "grad_norm": 1.8857802152633667, + "learning_rate": 4.9784900431990366e-05, + "loss": 5.5295, + "step": 7027 + }, + { + "epoch": 0.041797506898848605, + "grad_norm": 1.7287275791168213, + "learning_rate": 4.97848392860236e-05, + "loss": 5.3175, + "step": 7028 + }, + { + "epoch": 0.0418034541821296, + "grad_norm": 1.915263295173645, + "learning_rate": 4.97847781314047e-05, + "loss": 5.4838, + "step": 7029 + }, + { + "epoch": 0.0418094014654106, + "grad_norm": 2.049435615539551, + "learning_rate": 4.97847169681337e-05, + "loss": 5.5508, + "step": 7030 + }, + { + "epoch": 0.0418153487486916, + "grad_norm": 1.8955415487289429, + "learning_rate": 4.97846557962106e-05, + "loss": 5.4618, + "step": 7031 + }, + { + "epoch": 0.04182129603197259, + "grad_norm": 1.8957183361053467, + "learning_rate": 4.978459461563543e-05, + "loss": 5.5293, + "step": 7032 + }, + { + "epoch": 0.041827243315253594, + "grad_norm": 2.050734043121338, + "learning_rate": 4.978453342640822e-05, + "loss": 5.8002, + "step": 7033 + }, + { + "epoch": 0.04183319059853459, + "grad_norm": 1.9867476224899292, + "learning_rate": 4.978447222852899e-05, + "loss": 5.466, + "step": 7034 + }, + { + "epoch": 0.041839137881815584, + "grad_norm": 1.7928507328033447, + "learning_rate": 4.978441102199775e-05, + "loss": 5.3312, + "step": 7035 + }, + { + "epoch": 0.041845085165096586, + "grad_norm": 1.7984018325805664, + "learning_rate": 4.978434980681453e-05, + "loss": 5.2936, + "step": 7036 + }, + { + "epoch": 0.04185103244837758, + "grad_norm": 1.8011672496795654, + "learning_rate": 4.9784288582979355e-05, + "loss": 5.484, + "step": 7037 + }, + { + "epoch": 0.041856979731658576, + "grad_norm": 1.9439928531646729, + "learning_rate": 4.9784227350492236e-05, + "loss": 5.4563, + "step": 7038 + }, + { + "epoch": 0.04186292701493958, + "grad_norm": 1.71321439743042, + "learning_rate": 4.97841661093532e-05, + "loss": 5.3909, + "step": 7039 + }, + { + "epoch": 0.041868874298220574, + "grad_norm": 1.629333734512329, + "learning_rate": 4.9784104859562266e-05, + "loss": 5.3112, + "step": 7040 + }, + { + "epoch": 0.04187482158150157, + "grad_norm": 1.5248417854309082, + "learning_rate": 4.9784043601119456e-05, + "loss": 5.3724, + "step": 7041 + }, + { + "epoch": 0.04188076886478257, + "grad_norm": 1.8886220455169678, + "learning_rate": 4.97839823340248e-05, + "loss": 5.443, + "step": 7042 + }, + { + "epoch": 0.041886716148063566, + "grad_norm": 1.5902595520019531, + "learning_rate": 4.9783921058278307e-05, + "loss": 5.4249, + "step": 7043 + }, + { + "epoch": 0.04189266343134456, + "grad_norm": 1.837579369544983, + "learning_rate": 4.978385977388e-05, + "loss": 5.3767, + "step": 7044 + }, + { + "epoch": 0.041898610714625556, + "grad_norm": 1.8306061029434204, + "learning_rate": 4.9783798480829905e-05, + "loss": 5.4206, + "step": 7045 + }, + { + "epoch": 0.04190455799790656, + "grad_norm": 1.6887965202331543, + "learning_rate": 4.9783737179128044e-05, + "loss": 5.5327, + "step": 7046 + }, + { + "epoch": 0.04191050528118755, + "grad_norm": 1.8081728219985962, + "learning_rate": 4.978367586877444e-05, + "loss": 5.4547, + "step": 7047 + }, + { + "epoch": 0.04191645256446855, + "grad_norm": 1.8341114521026611, + "learning_rate": 4.97836145497691e-05, + "loss": 5.4175, + "step": 7048 + }, + { + "epoch": 0.04192239984774955, + "grad_norm": 1.965240240097046, + "learning_rate": 4.978355322211207e-05, + "loss": 5.4253, + "step": 7049 + }, + { + "epoch": 0.041928347131030545, + "grad_norm": 1.7060484886169434, + "learning_rate": 4.9783491885803343e-05, + "loss": 5.3493, + "step": 7050 + }, + { + "epoch": 0.04193429441431154, + "grad_norm": 1.8203076124191284, + "learning_rate": 4.978343054084297e-05, + "loss": 5.4601, + "step": 7051 + }, + { + "epoch": 0.04194024169759254, + "grad_norm": 1.919954538345337, + "learning_rate": 4.9783369187230945e-05, + "loss": 5.4921, + "step": 7052 + }, + { + "epoch": 0.04194618898087354, + "grad_norm": 1.4519730806350708, + "learning_rate": 4.9783307824967306e-05, + "loss": 5.4922, + "step": 7053 + }, + { + "epoch": 0.04195213626415453, + "grad_norm": 1.8431898355484009, + "learning_rate": 4.9783246454052066e-05, + "loss": 5.384, + "step": 7054 + }, + { + "epoch": 0.041958083547435535, + "grad_norm": 1.5493370294570923, + "learning_rate": 4.978318507448526e-05, + "loss": 5.5294, + "step": 7055 + }, + { + "epoch": 0.04196403083071653, + "grad_norm": 1.6405844688415527, + "learning_rate": 4.97831236862669e-05, + "loss": 5.492, + "step": 7056 + }, + { + "epoch": 0.041969978113997525, + "grad_norm": 1.7830392122268677, + "learning_rate": 4.9783062289396996e-05, + "loss": 5.2977, + "step": 7057 + }, + { + "epoch": 0.04197592539727852, + "grad_norm": 1.8268102407455444, + "learning_rate": 4.9783000883875595e-05, + "loss": 5.3396, + "step": 7058 + }, + { + "epoch": 0.04198187268055952, + "grad_norm": 1.942901849746704, + "learning_rate": 4.9782939469702694e-05, + "loss": 5.3338, + "step": 7059 + }, + { + "epoch": 0.04198781996384052, + "grad_norm": 1.5793414115905762, + "learning_rate": 4.9782878046878334e-05, + "loss": 5.3286, + "step": 7060 + }, + { + "epoch": 0.04199376724712151, + "grad_norm": 1.5777463912963867, + "learning_rate": 4.9782816615402515e-05, + "loss": 5.2942, + "step": 7061 + }, + { + "epoch": 0.041999714530402514, + "grad_norm": 1.6393412351608276, + "learning_rate": 4.978275517527528e-05, + "loss": 5.2557, + "step": 7062 + }, + { + "epoch": 0.04200566181368351, + "grad_norm": 1.9657515287399292, + "learning_rate": 4.978269372649664e-05, + "loss": 5.3875, + "step": 7063 + }, + { + "epoch": 0.042011609096964504, + "grad_norm": 2.1419737339019775, + "learning_rate": 4.9782632269066623e-05, + "loss": 5.2014, + "step": 7064 + }, + { + "epoch": 0.042017556380245506, + "grad_norm": 2.0425620079040527, + "learning_rate": 4.978257080298523e-05, + "loss": 5.194, + "step": 7065 + }, + { + "epoch": 0.0420235036635265, + "grad_norm": 1.7248409986495972, + "learning_rate": 4.978250932825251e-05, + "loss": 5.1922, + "step": 7066 + }, + { + "epoch": 0.042029450946807496, + "grad_norm": 1.8265177011489868, + "learning_rate": 4.978244784486847e-05, + "loss": 5.4474, + "step": 7067 + }, + { + "epoch": 0.0420353982300885, + "grad_norm": 1.803701400756836, + "learning_rate": 4.9782386352833134e-05, + "loss": 6.2155, + "step": 7068 + }, + { + "epoch": 0.042041345513369494, + "grad_norm": 1.9970064163208008, + "learning_rate": 4.978232485214652e-05, + "loss": 5.3622, + "step": 7069 + }, + { + "epoch": 0.04204729279665049, + "grad_norm": 1.7449073791503906, + "learning_rate": 4.978226334280865e-05, + "loss": 5.3146, + "step": 7070 + }, + { + "epoch": 0.04205324007993149, + "grad_norm": 2.0284547805786133, + "learning_rate": 4.978220182481955e-05, + "loss": 5.0169, + "step": 7071 + }, + { + "epoch": 0.042059187363212486, + "grad_norm": 1.6801714897155762, + "learning_rate": 4.978214029817924e-05, + "loss": 5.1294, + "step": 7072 + }, + { + "epoch": 0.04206513464649348, + "grad_norm": 2.160585641860962, + "learning_rate": 4.978207876288774e-05, + "loss": 5.072, + "step": 7073 + }, + { + "epoch": 0.042071081929774476, + "grad_norm": 2.07739520072937, + "learning_rate": 4.978201721894508e-05, + "loss": 5.2065, + "step": 7074 + }, + { + "epoch": 0.04207702921305548, + "grad_norm": 2.1396286487579346, + "learning_rate": 4.978195566635127e-05, + "loss": 5.1066, + "step": 7075 + }, + { + "epoch": 0.04208297649633647, + "grad_norm": 1.883280634880066, + "learning_rate": 4.978189410510633e-05, + "loss": 5.2842, + "step": 7076 + }, + { + "epoch": 0.04208892377961747, + "grad_norm": 1.9917101860046387, + "learning_rate": 4.978183253521029e-05, + "loss": 5.0799, + "step": 7077 + }, + { + "epoch": 0.04209487106289847, + "grad_norm": 1.9387022256851196, + "learning_rate": 4.9781770956663164e-05, + "loss": 5.1898, + "step": 7078 + }, + { + "epoch": 0.042100818346179465, + "grad_norm": 1.9767060279846191, + "learning_rate": 4.978170936946498e-05, + "loss": 5.0692, + "step": 7079 + }, + { + "epoch": 0.04210676562946046, + "grad_norm": 2.0076138973236084, + "learning_rate": 4.978164777361576e-05, + "loss": 5.0255, + "step": 7080 + }, + { + "epoch": 0.04211271291274146, + "grad_norm": 1.8253445625305176, + "learning_rate": 4.978158616911552e-05, + "loss": 5.0111, + "step": 7081 + }, + { + "epoch": 0.04211866019602246, + "grad_norm": 1.6551930904388428, + "learning_rate": 4.978152455596429e-05, + "loss": 4.9849, + "step": 7082 + }, + { + "epoch": 0.04212460747930345, + "grad_norm": 1.8462406396865845, + "learning_rate": 4.9781462934162084e-05, + "loss": 5.0862, + "step": 7083 + }, + { + "epoch": 0.042130554762584455, + "grad_norm": 2.0828206539154053, + "learning_rate": 4.978140130370892e-05, + "loss": 5.031, + "step": 7084 + }, + { + "epoch": 0.04213650204586545, + "grad_norm": 1.7917357683181763, + "learning_rate": 4.978133966460483e-05, + "loss": 5.0028, + "step": 7085 + }, + { + "epoch": 0.042142449329146445, + "grad_norm": 1.7324126958847046, + "learning_rate": 4.9781278016849834e-05, + "loss": 4.9759, + "step": 7086 + }, + { + "epoch": 0.04214839661242744, + "grad_norm": 1.8673282861709595, + "learning_rate": 4.978121636044394e-05, + "loss": 5.3631, + "step": 7087 + }, + { + "epoch": 0.04215434389570844, + "grad_norm": 1.7723935842514038, + "learning_rate": 4.9781154695387186e-05, + "loss": 5.3427, + "step": 7088 + }, + { + "epoch": 0.04216029117898944, + "grad_norm": 1.4671146869659424, + "learning_rate": 4.978109302167958e-05, + "loss": 5.3003, + "step": 7089 + }, + { + "epoch": 0.04216623846227043, + "grad_norm": 1.9667481184005737, + "learning_rate": 4.9781031339321156e-05, + "loss": 5.0957, + "step": 7090 + }, + { + "epoch": 0.042172185745551434, + "grad_norm": 1.8162986040115356, + "learning_rate": 4.978096964831193e-05, + "loss": 5.1472, + "step": 7091 + }, + { + "epoch": 0.04217813302883243, + "grad_norm": 1.7793545722961426, + "learning_rate": 4.9780907948651926e-05, + "loss": 5.1771, + "step": 7092 + }, + { + "epoch": 0.042184080312113424, + "grad_norm": 1.8093308210372925, + "learning_rate": 4.9780846240341156e-05, + "loss": 5.1611, + "step": 7093 + }, + { + "epoch": 0.042190027595394426, + "grad_norm": 1.7010010480880737, + "learning_rate": 4.978078452337965e-05, + "loss": 5.4478, + "step": 7094 + }, + { + "epoch": 0.04219597487867542, + "grad_norm": 1.7978744506835938, + "learning_rate": 4.9780722797767434e-05, + "loss": 5.4443, + "step": 7095 + }, + { + "epoch": 0.042201922161956416, + "grad_norm": 1.4861794710159302, + "learning_rate": 4.9780661063504516e-05, + "loss": 5.3773, + "step": 7096 + }, + { + "epoch": 0.04220786944523742, + "grad_norm": 1.7805769443511963, + "learning_rate": 4.978059932059093e-05, + "loss": 5.0896, + "step": 7097 + }, + { + "epoch": 0.042213816728518413, + "grad_norm": 1.7392783164978027, + "learning_rate": 4.9780537569026695e-05, + "loss": 5.0602, + "step": 7098 + }, + { + "epoch": 0.04221976401179941, + "grad_norm": 1.8742554187774658, + "learning_rate": 4.978047580881182e-05, + "loss": 5.2595, + "step": 7099 + }, + { + "epoch": 0.04222571129508041, + "grad_norm": 1.6077641248703003, + "learning_rate": 4.978041403994635e-05, + "loss": 5.0925, + "step": 7100 + }, + { + "epoch": 0.042231658578361406, + "grad_norm": 1.7536481618881226, + "learning_rate": 4.9780352262430286e-05, + "loss": 5.2546, + "step": 7101 + }, + { + "epoch": 0.0422376058616424, + "grad_norm": 1.6404869556427002, + "learning_rate": 4.9780290476263656e-05, + "loss": 5.1349, + "step": 7102 + }, + { + "epoch": 0.042243553144923396, + "grad_norm": 1.7223635911941528, + "learning_rate": 4.978022868144649e-05, + "loss": 5.2894, + "step": 7103 + }, + { + "epoch": 0.0422495004282044, + "grad_norm": 1.7856663465499878, + "learning_rate": 4.9780166877978796e-05, + "loss": 5.384, + "step": 7104 + }, + { + "epoch": 0.04225544771148539, + "grad_norm": 1.6434816122055054, + "learning_rate": 4.978010506586061e-05, + "loss": 5.257, + "step": 7105 + }, + { + "epoch": 0.04226139499476639, + "grad_norm": 1.668371558189392, + "learning_rate": 4.9780043245091936e-05, + "loss": 5.2698, + "step": 7106 + }, + { + "epoch": 0.04226734227804739, + "grad_norm": 1.7553619146347046, + "learning_rate": 4.97799814156728e-05, + "loss": 5.1591, + "step": 7107 + }, + { + "epoch": 0.042273289561328385, + "grad_norm": 1.6918652057647705, + "learning_rate": 4.977991957760324e-05, + "loss": 5.2727, + "step": 7108 + }, + { + "epoch": 0.04227923684460938, + "grad_norm": 1.6634269952774048, + "learning_rate": 4.977985773088326e-05, + "loss": 5.3099, + "step": 7109 + }, + { + "epoch": 0.04228518412789038, + "grad_norm": 2.131647825241089, + "learning_rate": 4.977979587551289e-05, + "loss": 5.0885, + "step": 7110 + }, + { + "epoch": 0.04229113141117138, + "grad_norm": 1.6632722616195679, + "learning_rate": 4.977973401149215e-05, + "loss": 5.1546, + "step": 7111 + }, + { + "epoch": 0.04229707869445237, + "grad_norm": 1.762418270111084, + "learning_rate": 4.977967213882107e-05, + "loss": 5.0884, + "step": 7112 + }, + { + "epoch": 0.042303025977733374, + "grad_norm": 1.9325755834579468, + "learning_rate": 4.977961025749964e-05, + "loss": 5.1857, + "step": 7113 + }, + { + "epoch": 0.04230897326101437, + "grad_norm": 1.8359284400939941, + "learning_rate": 4.9779548367527926e-05, + "loss": 5.165, + "step": 7114 + }, + { + "epoch": 0.042314920544295365, + "grad_norm": 1.8305978775024414, + "learning_rate": 4.977948646890591e-05, + "loss": 5.1347, + "step": 7115 + }, + { + "epoch": 0.04232086782757636, + "grad_norm": 1.7374697923660278, + "learning_rate": 4.9779424561633644e-05, + "loss": 5.5219, + "step": 7116 + }, + { + "epoch": 0.04232681511085736, + "grad_norm": 1.9947689771652222, + "learning_rate": 4.9779362645711135e-05, + "loss": 5.4445, + "step": 7117 + }, + { + "epoch": 0.04233276239413836, + "grad_norm": 1.6639795303344727, + "learning_rate": 4.97793007211384e-05, + "loss": 5.3798, + "step": 7118 + }, + { + "epoch": 0.04233870967741935, + "grad_norm": 1.6983096599578857, + "learning_rate": 4.977923878791547e-05, + "loss": 5.2847, + "step": 7119 + }, + { + "epoch": 0.042344656960700354, + "grad_norm": 1.7397092580795288, + "learning_rate": 4.9779176846042366e-05, + "loss": 5.3175, + "step": 7120 + }, + { + "epoch": 0.04235060424398135, + "grad_norm": 1.5255639553070068, + "learning_rate": 4.977911489551911e-05, + "loss": 5.2735, + "step": 7121 + }, + { + "epoch": 0.042356551527262344, + "grad_norm": 1.5646785497665405, + "learning_rate": 4.9779052936345715e-05, + "loss": 5.3892, + "step": 7122 + }, + { + "epoch": 0.042362498810543346, + "grad_norm": 1.7479640245437622, + "learning_rate": 4.977899096852221e-05, + "loss": 5.4341, + "step": 7123 + }, + { + "epoch": 0.04236844609382434, + "grad_norm": 1.6275604963302612, + "learning_rate": 4.9778928992048615e-05, + "loss": 5.5209, + "step": 7124 + }, + { + "epoch": 0.042374393377105336, + "grad_norm": 1.6917749643325806, + "learning_rate": 4.977886700692496e-05, + "loss": 5.5779, + "step": 7125 + }, + { + "epoch": 0.04238034066038634, + "grad_norm": 1.683716058731079, + "learning_rate": 4.977880501315125e-05, + "loss": 5.475, + "step": 7126 + }, + { + "epoch": 0.04238628794366733, + "grad_norm": 1.7665706872940063, + "learning_rate": 4.977874301072751e-05, + "loss": 5.3666, + "step": 7127 + }, + { + "epoch": 0.04239223522694833, + "grad_norm": 1.715329885482788, + "learning_rate": 4.977868099965377e-05, + "loss": 5.407, + "step": 7128 + }, + { + "epoch": 0.04239818251022933, + "grad_norm": 1.8468618392944336, + "learning_rate": 4.977861897993006e-05, + "loss": 5.328, + "step": 7129 + }, + { + "epoch": 0.042404129793510326, + "grad_norm": 1.59178626537323, + "learning_rate": 4.977855695155638e-05, + "loss": 5.7797, + "step": 7130 + }, + { + "epoch": 0.04241007707679132, + "grad_norm": 1.4733757972717285, + "learning_rate": 4.977849491453277e-05, + "loss": 5.3019, + "step": 7131 + }, + { + "epoch": 0.042416024360072316, + "grad_norm": 1.4632091522216797, + "learning_rate": 4.977843286885923e-05, + "loss": 5.1754, + "step": 7132 + }, + { + "epoch": 0.04242197164335332, + "grad_norm": 1.530564308166504, + "learning_rate": 4.97783708145358e-05, + "loss": 5.3613, + "step": 7133 + }, + { + "epoch": 0.04242791892663431, + "grad_norm": 1.954219102859497, + "learning_rate": 4.97783087515625e-05, + "loss": 5.4013, + "step": 7134 + }, + { + "epoch": 0.04243386620991531, + "grad_norm": 1.8276890516281128, + "learning_rate": 4.977824667993935e-05, + "loss": 5.3611, + "step": 7135 + }, + { + "epoch": 0.04243981349319631, + "grad_norm": 2.1430561542510986, + "learning_rate": 4.977818459966637e-05, + "loss": 5.1501, + "step": 7136 + }, + { + "epoch": 0.042445760776477305, + "grad_norm": 1.9150115251541138, + "learning_rate": 4.977812251074357e-05, + "loss": 5.1778, + "step": 7137 + }, + { + "epoch": 0.0424517080597583, + "grad_norm": 1.6958523988723755, + "learning_rate": 4.9778060413171004e-05, + "loss": 5.5029, + "step": 7138 + }, + { + "epoch": 0.0424576553430393, + "grad_norm": 1.7183772325515747, + "learning_rate": 4.977799830694866e-05, + "loss": 5.4323, + "step": 7139 + }, + { + "epoch": 0.0424636026263203, + "grad_norm": 1.717731237411499, + "learning_rate": 4.977793619207657e-05, + "loss": 5.3418, + "step": 7140 + }, + { + "epoch": 0.04246954990960129, + "grad_norm": 1.8155564069747925, + "learning_rate": 4.9777874068554766e-05, + "loss": 5.2865, + "step": 7141 + }, + { + "epoch": 0.042475497192882294, + "grad_norm": 1.9890762567520142, + "learning_rate": 4.9777811936383254e-05, + "loss": 5.4101, + "step": 7142 + }, + { + "epoch": 0.04248144447616329, + "grad_norm": 1.8181748390197754, + "learning_rate": 4.977774979556207e-05, + "loss": 5.2719, + "step": 7143 + }, + { + "epoch": 0.042487391759444285, + "grad_norm": 1.7353019714355469, + "learning_rate": 4.9777687646091234e-05, + "loss": 5.4202, + "step": 7144 + }, + { + "epoch": 0.04249333904272528, + "grad_norm": 1.6121984720230103, + "learning_rate": 4.977762548797076e-05, + "loss": 5.3174, + "step": 7145 + }, + { + "epoch": 0.04249928632600628, + "grad_norm": 1.9579551219940186, + "learning_rate": 4.977756332120067e-05, + "loss": 5.135, + "step": 7146 + }, + { + "epoch": 0.04250523360928728, + "grad_norm": 1.9396319389343262, + "learning_rate": 4.977750114578099e-05, + "loss": 5.7521, + "step": 7147 + }, + { + "epoch": 0.04251118089256827, + "grad_norm": 1.8567198514938354, + "learning_rate": 4.977743896171173e-05, + "loss": 5.7521, + "step": 7148 + }, + { + "epoch": 0.042517128175849274, + "grad_norm": 2.139861583709717, + "learning_rate": 4.977737676899293e-05, + "loss": 5.472, + "step": 7149 + }, + { + "epoch": 0.04252307545913027, + "grad_norm": 1.6526445150375366, + "learning_rate": 4.977731456762461e-05, + "loss": 5.5557, + "step": 7150 + }, + { + "epoch": 0.042529022742411264, + "grad_norm": 1.7761725187301636, + "learning_rate": 4.9777252357606784e-05, + "loss": 5.1922, + "step": 7151 + }, + { + "epoch": 0.042534970025692266, + "grad_norm": 2.0894482135772705, + "learning_rate": 4.977719013893947e-05, + "loss": 5.5067, + "step": 7152 + }, + { + "epoch": 0.04254091730897326, + "grad_norm": 1.746470332145691, + "learning_rate": 4.97771279116227e-05, + "loss": 5.28, + "step": 7153 + }, + { + "epoch": 0.042546864592254256, + "grad_norm": 1.9258379936218262, + "learning_rate": 4.9777065675656484e-05, + "loss": 5.7223, + "step": 7154 + }, + { + "epoch": 0.04255281187553526, + "grad_norm": 1.9928748607635498, + "learning_rate": 4.977700343104086e-05, + "loss": 5.727, + "step": 7155 + }, + { + "epoch": 0.04255875915881625, + "grad_norm": 1.7435163259506226, + "learning_rate": 4.9776941177775824e-05, + "loss": 5.6636, + "step": 7156 + }, + { + "epoch": 0.04256470644209725, + "grad_norm": 1.6818004846572876, + "learning_rate": 4.977687891586143e-05, + "loss": 5.6589, + "step": 7157 + }, + { + "epoch": 0.04257065372537825, + "grad_norm": 1.812779426574707, + "learning_rate": 4.9776816645297676e-05, + "loss": 5.2705, + "step": 7158 + }, + { + "epoch": 0.042576601008659246, + "grad_norm": 1.7637232542037964, + "learning_rate": 4.977675436608459e-05, + "loss": 5.2872, + "step": 7159 + }, + { + "epoch": 0.04258254829194024, + "grad_norm": 1.9504014253616333, + "learning_rate": 4.97766920782222e-05, + "loss": 5.1324, + "step": 7160 + }, + { + "epoch": 0.042588495575221236, + "grad_norm": 1.7741994857788086, + "learning_rate": 4.9776629781710525e-05, + "loss": 5.4164, + "step": 7161 + }, + { + "epoch": 0.04259444285850224, + "grad_norm": 2.0005195140838623, + "learning_rate": 4.9776567476549576e-05, + "loss": 5.4667, + "step": 7162 + }, + { + "epoch": 0.04260039014178323, + "grad_norm": 2.256420612335205, + "learning_rate": 4.977650516273939e-05, + "loss": 5.1116, + "step": 7163 + }, + { + "epoch": 0.04260633742506423, + "grad_norm": 2.0806920528411865, + "learning_rate": 4.977644284027998e-05, + "loss": 5.2333, + "step": 7164 + }, + { + "epoch": 0.04261228470834523, + "grad_norm": 1.898760199546814, + "learning_rate": 4.9776380509171364e-05, + "loss": 5.4761, + "step": 7165 + }, + { + "epoch": 0.042618231991626225, + "grad_norm": 1.7251659631729126, + "learning_rate": 4.977631816941358e-05, + "loss": 5.5584, + "step": 7166 + }, + { + "epoch": 0.04262417927490722, + "grad_norm": 1.741645336151123, + "learning_rate": 4.977625582100664e-05, + "loss": 5.4133, + "step": 7167 + }, + { + "epoch": 0.04263012655818822, + "grad_norm": 1.921617031097412, + "learning_rate": 4.977619346395055e-05, + "loss": 5.1829, + "step": 7168 + }, + { + "epoch": 0.04263607384146922, + "grad_norm": 1.7597262859344482, + "learning_rate": 4.977613109824536e-05, + "loss": 5.1743, + "step": 7169 + }, + { + "epoch": 0.04264202112475021, + "grad_norm": 1.8069764375686646, + "learning_rate": 4.977606872389107e-05, + "loss": 5.4004, + "step": 7170 + }, + { + "epoch": 0.042647968408031214, + "grad_norm": 1.7694367170333862, + "learning_rate": 4.9776006340887714e-05, + "loss": 5.2018, + "step": 7171 + }, + { + "epoch": 0.04265391569131221, + "grad_norm": 1.8260759115219116, + "learning_rate": 4.9775943949235316e-05, + "loss": 5.4115, + "step": 7172 + }, + { + "epoch": 0.042659862974593205, + "grad_norm": 1.71034574508667, + "learning_rate": 4.9775881548933884e-05, + "loss": 5.2781, + "step": 7173 + }, + { + "epoch": 0.0426658102578742, + "grad_norm": 1.7208900451660156, + "learning_rate": 4.977581913998345e-05, + "loss": 5.4686, + "step": 7174 + }, + { + "epoch": 0.0426717575411552, + "grad_norm": 1.8545277118682861, + "learning_rate": 4.977575672238404e-05, + "loss": 5.4545, + "step": 7175 + }, + { + "epoch": 0.0426777048244362, + "grad_norm": 1.7892229557037354, + "learning_rate": 4.9775694296135656e-05, + "loss": 5.6612, + "step": 7176 + }, + { + "epoch": 0.04268365210771719, + "grad_norm": 1.8321889638900757, + "learning_rate": 4.9775631861238343e-05, + "loss": 5.5889, + "step": 7177 + }, + { + "epoch": 0.042689599390998194, + "grad_norm": 1.7925626039505005, + "learning_rate": 4.977556941769211e-05, + "loss": 5.6218, + "step": 7178 + }, + { + "epoch": 0.04269554667427919, + "grad_norm": 1.9650121927261353, + "learning_rate": 4.9775506965496984e-05, + "loss": 5.5228, + "step": 7179 + }, + { + "epoch": 0.042701493957560184, + "grad_norm": 1.9050647020339966, + "learning_rate": 4.977544450465298e-05, + "loss": 5.5547, + "step": 7180 + }, + { + "epoch": 0.042707441240841186, + "grad_norm": 1.8334670066833496, + "learning_rate": 4.977538203516013e-05, + "loss": 5.3895, + "step": 7181 + }, + { + "epoch": 0.04271338852412218, + "grad_norm": 1.803544521331787, + "learning_rate": 4.9775319557018444e-05, + "loss": 5.6288, + "step": 7182 + }, + { + "epoch": 0.042719335807403176, + "grad_norm": 1.823440432548523, + "learning_rate": 4.9775257070227956e-05, + "loss": 5.4996, + "step": 7183 + }, + { + "epoch": 0.04272528309068418, + "grad_norm": 1.9730159044265747, + "learning_rate": 4.977519457478868e-05, + "loss": 5.5004, + "step": 7184 + }, + { + "epoch": 0.04273123037396517, + "grad_norm": 1.9566004276275635, + "learning_rate": 4.977513207070064e-05, + "loss": 5.5496, + "step": 7185 + }, + { + "epoch": 0.04273717765724617, + "grad_norm": 2.0958995819091797, + "learning_rate": 4.977506955796385e-05, + "loss": 5.5256, + "step": 7186 + }, + { + "epoch": 0.04274312494052717, + "grad_norm": 1.8957890272140503, + "learning_rate": 4.977500703657835e-05, + "loss": 5.3337, + "step": 7187 + }, + { + "epoch": 0.042749072223808166, + "grad_norm": 1.8224141597747803, + "learning_rate": 4.977494450654414e-05, + "loss": 5.1362, + "step": 7188 + }, + { + "epoch": 0.04275501950708916, + "grad_norm": 1.648296594619751, + "learning_rate": 4.977488196786126e-05, + "loss": 5.3398, + "step": 7189 + }, + { + "epoch": 0.042760966790370156, + "grad_norm": 1.6238311529159546, + "learning_rate": 4.977481942052972e-05, + "loss": 5.2083, + "step": 7190 + }, + { + "epoch": 0.04276691407365116, + "grad_norm": 1.7399996519088745, + "learning_rate": 4.977475686454956e-05, + "loss": 5.2403, + "step": 7191 + }, + { + "epoch": 0.04277286135693215, + "grad_norm": 1.7260342836380005, + "learning_rate": 4.977469429992077e-05, + "loss": 5.2282, + "step": 7192 + }, + { + "epoch": 0.04277880864021315, + "grad_norm": 4.4954447746276855, + "learning_rate": 4.9774631726643396e-05, + "loss": 5.1044, + "step": 7193 + }, + { + "epoch": 0.04278475592349415, + "grad_norm": 1.879869818687439, + "learning_rate": 4.977456914471746e-05, + "loss": 5.3431, + "step": 7194 + }, + { + "epoch": 0.042790703206775145, + "grad_norm": 1.8826582431793213, + "learning_rate": 4.977450655414297e-05, + "loss": 5.2951, + "step": 7195 + }, + { + "epoch": 0.04279665049005614, + "grad_norm": 1.8973712921142578, + "learning_rate": 4.977444395491996e-05, + "loss": 5.343, + "step": 7196 + }, + { + "epoch": 0.04280259777333714, + "grad_norm": 1.6125551462173462, + "learning_rate": 4.977438134704845e-05, + "loss": 5.2849, + "step": 7197 + }, + { + "epoch": 0.04280854505661814, + "grad_norm": 1.441159963607788, + "learning_rate": 4.9774318730528456e-05, + "loss": 5.2955, + "step": 7198 + }, + { + "epoch": 0.04281449233989913, + "grad_norm": 1.9655884504318237, + "learning_rate": 4.9774256105360004e-05, + "loss": 5.2093, + "step": 7199 + }, + { + "epoch": 0.042820439623180134, + "grad_norm": 1.7824043035507202, + "learning_rate": 4.9774193471543116e-05, + "loss": 5.2105, + "step": 7200 + }, + { + "epoch": 0.04282638690646113, + "grad_norm": 1.8331031799316406, + "learning_rate": 4.977413082907781e-05, + "loss": 5.3359, + "step": 7201 + }, + { + "epoch": 0.042832334189742124, + "grad_norm": 1.8695242404937744, + "learning_rate": 4.977406817796412e-05, + "loss": 5.3686, + "step": 7202 + }, + { + "epoch": 0.042838281473023126, + "grad_norm": 1.70205557346344, + "learning_rate": 4.977400551820205e-05, + "loss": 5.2689, + "step": 7203 + }, + { + "epoch": 0.04284422875630412, + "grad_norm": 1.700307846069336, + "learning_rate": 4.9773942849791635e-05, + "loss": 5.3946, + "step": 7204 + }, + { + "epoch": 0.04285017603958512, + "grad_norm": 1.625637173652649, + "learning_rate": 4.977388017273288e-05, + "loss": 5.095, + "step": 7205 + }, + { + "epoch": 0.04285612332286611, + "grad_norm": 1.7689390182495117, + "learning_rate": 4.977381748702583e-05, + "loss": 5.0097, + "step": 7206 + }, + { + "epoch": 0.042862070606147114, + "grad_norm": 1.856493353843689, + "learning_rate": 4.97737547926705e-05, + "loss": 5.0551, + "step": 7207 + }, + { + "epoch": 0.04286801788942811, + "grad_norm": 1.6497242450714111, + "learning_rate": 4.97736920896669e-05, + "loss": 5.031, + "step": 7208 + }, + { + "epoch": 0.042873965172709104, + "grad_norm": 1.5884608030319214, + "learning_rate": 4.977362937801506e-05, + "loss": 5.0758, + "step": 7209 + }, + { + "epoch": 0.042879912455990106, + "grad_norm": 1.5206499099731445, + "learning_rate": 4.9773566657715006e-05, + "loss": 5.049, + "step": 7210 + }, + { + "epoch": 0.0428858597392711, + "grad_norm": 1.7026933431625366, + "learning_rate": 4.977350392876676e-05, + "loss": 5.001, + "step": 7211 + }, + { + "epoch": 0.042891807022552096, + "grad_norm": 1.4197289943695068, + "learning_rate": 4.977344119117034e-05, + "loss": 5.0446, + "step": 7212 + }, + { + "epoch": 0.0428977543058331, + "grad_norm": 1.498713731765747, + "learning_rate": 4.977337844492576e-05, + "loss": 5.0574, + "step": 7213 + }, + { + "epoch": 0.04290370158911409, + "grad_norm": 1.7583528757095337, + "learning_rate": 4.9773315690033054e-05, + "loss": 4.994, + "step": 7214 + }, + { + "epoch": 0.04290964887239509, + "grad_norm": 1.8511004447937012, + "learning_rate": 4.9773252926492236e-05, + "loss": 4.9888, + "step": 7215 + }, + { + "epoch": 0.04291559615567609, + "grad_norm": 1.5799078941345215, + "learning_rate": 4.9773190154303334e-05, + "loss": 5.0028, + "step": 7216 + }, + { + "epoch": 0.042921543438957085, + "grad_norm": 1.6737205982208252, + "learning_rate": 4.977312737346637e-05, + "loss": 5.0701, + "step": 7217 + }, + { + "epoch": 0.04292749072223808, + "grad_norm": 1.537049412727356, + "learning_rate": 4.977306458398136e-05, + "loss": 5.0747, + "step": 7218 + }, + { + "epoch": 0.042933438005519076, + "grad_norm": 1.7501899003982544, + "learning_rate": 4.977300178584833e-05, + "loss": 5.0172, + "step": 7219 + }, + { + "epoch": 0.04293938528880008, + "grad_norm": 1.5130890607833862, + "learning_rate": 4.9772938979067294e-05, + "loss": 5.0196, + "step": 7220 + }, + { + "epoch": 0.04294533257208107, + "grad_norm": 1.628053903579712, + "learning_rate": 4.977287616363829e-05, + "loss": 5.0526, + "step": 7221 + }, + { + "epoch": 0.04295127985536207, + "grad_norm": 1.6736811399459839, + "learning_rate": 4.977281333956133e-05, + "loss": 5.0093, + "step": 7222 + }, + { + "epoch": 0.04295722713864307, + "grad_norm": 1.6157552003860474, + "learning_rate": 4.977275050683643e-05, + "loss": 4.9562, + "step": 7223 + }, + { + "epoch": 0.042963174421924065, + "grad_norm": 1.6699459552764893, + "learning_rate": 4.9772687665463625e-05, + "loss": 4.9603, + "step": 7224 + }, + { + "epoch": 0.04296912170520506, + "grad_norm": 1.4698256254196167, + "learning_rate": 4.9772624815442925e-05, + "loss": 4.9908, + "step": 7225 + }, + { + "epoch": 0.04297506898848606, + "grad_norm": 1.5310906171798706, + "learning_rate": 4.9772561956774365e-05, + "loss": 5.0081, + "step": 7226 + }, + { + "epoch": 0.04298101627176706, + "grad_norm": 1.6135941743850708, + "learning_rate": 4.977249908945795e-05, + "loss": 5.1394, + "step": 7227 + }, + { + "epoch": 0.04298696355504805, + "grad_norm": 1.7632607221603394, + "learning_rate": 4.977243621349372e-05, + "loss": 4.9992, + "step": 7228 + }, + { + "epoch": 0.042992910838329054, + "grad_norm": 1.574826955795288, + "learning_rate": 4.977237332888168e-05, + "loss": 4.9361, + "step": 7229 + }, + { + "epoch": 0.04299885812161005, + "grad_norm": 1.6633859872817993, + "learning_rate": 4.9772310435621874e-05, + "loss": 4.9085, + "step": 7230 + }, + { + "epoch": 0.043004805404891044, + "grad_norm": 1.6180634498596191, + "learning_rate": 4.97722475337143e-05, + "loss": 4.939, + "step": 7231 + }, + { + "epoch": 0.043010752688172046, + "grad_norm": 1.959694266319275, + "learning_rate": 4.9772184623158996e-05, + "loss": 5.231, + "step": 7232 + }, + { + "epoch": 0.04301669997145304, + "grad_norm": 1.6264785528182983, + "learning_rate": 4.977212170395598e-05, + "loss": 5.3228, + "step": 7233 + }, + { + "epoch": 0.04302264725473404, + "grad_norm": 2.109292507171631, + "learning_rate": 4.9772058776105264e-05, + "loss": 5.4579, + "step": 7234 + }, + { + "epoch": 0.04302859453801503, + "grad_norm": 1.991877555847168, + "learning_rate": 4.977199583960688e-05, + "loss": 5.355, + "step": 7235 + }, + { + "epoch": 0.043034541821296034, + "grad_norm": 2.23330020904541, + "learning_rate": 4.977193289446085e-05, + "loss": 5.3233, + "step": 7236 + }, + { + "epoch": 0.04304048910457703, + "grad_norm": 2.077359914779663, + "learning_rate": 4.9771869940667194e-05, + "loss": 5.2003, + "step": 7237 + }, + { + "epoch": 0.043046436387858024, + "grad_norm": 1.652498722076416, + "learning_rate": 4.977180697822593e-05, + "loss": 5.0232, + "step": 7238 + }, + { + "epoch": 0.043052383671139026, + "grad_norm": 1.9277194738388062, + "learning_rate": 4.977174400713709e-05, + "loss": 5.3826, + "step": 7239 + }, + { + "epoch": 0.04305833095442002, + "grad_norm": 1.9263273477554321, + "learning_rate": 4.9771681027400694e-05, + "loss": 5.5258, + "step": 7240 + }, + { + "epoch": 0.043064278237701016, + "grad_norm": 2.066934108734131, + "learning_rate": 4.9771618039016756e-05, + "loss": 5.6398, + "step": 7241 + }, + { + "epoch": 0.04307022552098202, + "grad_norm": 1.7810741662979126, + "learning_rate": 4.9771555041985295e-05, + "loss": 5.3716, + "step": 7242 + }, + { + "epoch": 0.04307617280426301, + "grad_norm": 1.7068313360214233, + "learning_rate": 4.977149203630635e-05, + "loss": 5.4042, + "step": 7243 + }, + { + "epoch": 0.04308212008754401, + "grad_norm": 1.8587994575500488, + "learning_rate": 4.977142902197992e-05, + "loss": 5.3635, + "step": 7244 + }, + { + "epoch": 0.04308806737082501, + "grad_norm": 2.101649284362793, + "learning_rate": 4.9771365999006054e-05, + "loss": 5.5292, + "step": 7245 + }, + { + "epoch": 0.043094014654106005, + "grad_norm": 1.8571972846984863, + "learning_rate": 4.9771302967384756e-05, + "loss": 5.4577, + "step": 7246 + }, + { + "epoch": 0.043099961937387, + "grad_norm": 1.9837383031845093, + "learning_rate": 4.9771239927116045e-05, + "loss": 5.4976, + "step": 7247 + }, + { + "epoch": 0.043105909220667996, + "grad_norm": 1.7688343524932861, + "learning_rate": 4.977117687819996e-05, + "loss": 5.448, + "step": 7248 + }, + { + "epoch": 0.043111856503949, + "grad_norm": 1.923824429512024, + "learning_rate": 4.9771113820636505e-05, + "loss": 5.3436, + "step": 7249 + }, + { + "epoch": 0.04311780378722999, + "grad_norm": 1.4405949115753174, + "learning_rate": 4.9771050754425715e-05, + "loss": 5.2751, + "step": 7250 + }, + { + "epoch": 0.04312375107051099, + "grad_norm": 1.7337450981140137, + "learning_rate": 4.977098767956761e-05, + "loss": 5.4693, + "step": 7251 + }, + { + "epoch": 0.04312969835379199, + "grad_norm": 2.063887119293213, + "learning_rate": 4.977092459606221e-05, + "loss": 5.4576, + "step": 7252 + }, + { + "epoch": 0.043135645637072985, + "grad_norm": 1.576517105102539, + "learning_rate": 4.9770861503909524e-05, + "loss": 5.4052, + "step": 7253 + }, + { + "epoch": 0.04314159292035398, + "grad_norm": 1.8137834072113037, + "learning_rate": 4.9770798403109596e-05, + "loss": 5.5732, + "step": 7254 + }, + { + "epoch": 0.04314754020363498, + "grad_norm": 1.7954564094543457, + "learning_rate": 4.977073529366244e-05, + "loss": 5.4213, + "step": 7255 + }, + { + "epoch": 0.04315348748691598, + "grad_norm": 1.993961215019226, + "learning_rate": 4.977067217556807e-05, + "loss": 5.2909, + "step": 7256 + }, + { + "epoch": 0.04315943477019697, + "grad_norm": 1.6993632316589355, + "learning_rate": 4.977060904882651e-05, + "loss": 5.4523, + "step": 7257 + }, + { + "epoch": 0.043165382053477974, + "grad_norm": 1.8541932106018066, + "learning_rate": 4.977054591343779e-05, + "loss": 5.3182, + "step": 7258 + }, + { + "epoch": 0.04317132933675897, + "grad_norm": 1.7425625324249268, + "learning_rate": 4.9770482769401935e-05, + "loss": 5.2527, + "step": 7259 + }, + { + "epoch": 0.043177276620039964, + "grad_norm": 1.7028024196624756, + "learning_rate": 4.9770419616718955e-05, + "loss": 5.1305, + "step": 7260 + }, + { + "epoch": 0.043183223903320966, + "grad_norm": 1.745316982269287, + "learning_rate": 4.977035645538888e-05, + "loss": 5.0368, + "step": 7261 + }, + { + "epoch": 0.04318917118660196, + "grad_norm": 1.8373509645462036, + "learning_rate": 4.977029328541173e-05, + "loss": 5.353, + "step": 7262 + }, + { + "epoch": 0.04319511846988296, + "grad_norm": 1.9976449012756348, + "learning_rate": 4.9770230106787526e-05, + "loss": 5.363, + "step": 7263 + }, + { + "epoch": 0.04320106575316395, + "grad_norm": 1.7109822034835815, + "learning_rate": 4.977016691951629e-05, + "loss": 5.3462, + "step": 7264 + }, + { + "epoch": 0.043207013036444954, + "grad_norm": 1.8688478469848633, + "learning_rate": 4.9770103723598036e-05, + "loss": 5.3564, + "step": 7265 + }, + { + "epoch": 0.04321296031972595, + "grad_norm": 1.8680217266082764, + "learning_rate": 4.9770040519032804e-05, + "loss": 5.2713, + "step": 7266 + }, + { + "epoch": 0.043218907603006944, + "grad_norm": 1.8022522926330566, + "learning_rate": 4.976997730582061e-05, + "loss": 5.153, + "step": 7267 + }, + { + "epoch": 0.043224854886287946, + "grad_norm": 1.7128162384033203, + "learning_rate": 4.976991408396147e-05, + "loss": 5.3107, + "step": 7268 + }, + { + "epoch": 0.04323080216956894, + "grad_norm": 1.8222606182098389, + "learning_rate": 4.9769850853455404e-05, + "loss": 5.3599, + "step": 7269 + }, + { + "epoch": 0.043236749452849936, + "grad_norm": 1.829373836517334, + "learning_rate": 4.976978761430244e-05, + "loss": 5.3991, + "step": 7270 + }, + { + "epoch": 0.04324269673613094, + "grad_norm": 1.8270717859268188, + "learning_rate": 4.97697243665026e-05, + "loss": 5.2434, + "step": 7271 + }, + { + "epoch": 0.04324864401941193, + "grad_norm": 1.9759695529937744, + "learning_rate": 4.976966111005591e-05, + "loss": 5.4585, + "step": 7272 + }, + { + "epoch": 0.04325459130269293, + "grad_norm": 2.0235564708709717, + "learning_rate": 4.9769597844962376e-05, + "loss": 5.3996, + "step": 7273 + }, + { + "epoch": 0.04326053858597393, + "grad_norm": 1.9220880270004272, + "learning_rate": 4.976953457122204e-05, + "loss": 5.344, + "step": 7274 + }, + { + "epoch": 0.043266485869254925, + "grad_norm": 1.6257338523864746, + "learning_rate": 4.976947128883492e-05, + "loss": 5.4012, + "step": 7275 + }, + { + "epoch": 0.04327243315253592, + "grad_norm": 1.6390771865844727, + "learning_rate": 4.976940799780103e-05, + "loss": 5.3693, + "step": 7276 + }, + { + "epoch": 0.043278380435816916, + "grad_norm": 1.5769712924957275, + "learning_rate": 4.976934469812039e-05, + "loss": 5.3214, + "step": 7277 + }, + { + "epoch": 0.04328432771909792, + "grad_norm": 1.539920687675476, + "learning_rate": 4.9769281389793035e-05, + "loss": 5.2784, + "step": 7278 + }, + { + "epoch": 0.04329027500237891, + "grad_norm": 1.662835717201233, + "learning_rate": 4.976921807281897e-05, + "loss": 5.2717, + "step": 7279 + }, + { + "epoch": 0.04329622228565991, + "grad_norm": 1.3613345623016357, + "learning_rate": 4.9769154747198234e-05, + "loss": 5.4241, + "step": 7280 + }, + { + "epoch": 0.04330216956894091, + "grad_norm": 1.5267658233642578, + "learning_rate": 4.976909141293084e-05, + "loss": 5.454, + "step": 7281 + }, + { + "epoch": 0.043308116852221905, + "grad_norm": 1.5050435066223145, + "learning_rate": 4.976902807001681e-05, + "loss": 5.4975, + "step": 7282 + }, + { + "epoch": 0.0433140641355029, + "grad_norm": 1.292698621749878, + "learning_rate": 4.976896471845617e-05, + "loss": 5.4071, + "step": 7283 + }, + { + "epoch": 0.0433200114187839, + "grad_norm": 1.6818265914916992, + "learning_rate": 4.9768901358248946e-05, + "loss": 5.3561, + "step": 7284 + }, + { + "epoch": 0.0433259587020649, + "grad_norm": 1.5995383262634277, + "learning_rate": 4.976883798939515e-05, + "loss": 5.2623, + "step": 7285 + }, + { + "epoch": 0.04333190598534589, + "grad_norm": 1.6959342956542969, + "learning_rate": 4.976877461189481e-05, + "loss": 5.3193, + "step": 7286 + }, + { + "epoch": 0.043337853268626894, + "grad_norm": 1.6978071928024292, + "learning_rate": 4.976871122574794e-05, + "loss": 5.5653, + "step": 7287 + }, + { + "epoch": 0.04334380055190789, + "grad_norm": 1.7587183713912964, + "learning_rate": 4.976864783095457e-05, + "loss": 5.545, + "step": 7288 + }, + { + "epoch": 0.043349747835188884, + "grad_norm": 1.6225430965423584, + "learning_rate": 4.976858442751473e-05, + "loss": 5.5804, + "step": 7289 + }, + { + "epoch": 0.043355695118469886, + "grad_norm": 1.5895410776138306, + "learning_rate": 4.976852101542843e-05, + "loss": 5.4798, + "step": 7290 + }, + { + "epoch": 0.04336164240175088, + "grad_norm": 1.759022831916809, + "learning_rate": 4.976845759469569e-05, + "loss": 5.4794, + "step": 7291 + }, + { + "epoch": 0.043367589685031877, + "grad_norm": 1.483383059501648, + "learning_rate": 4.976839416531654e-05, + "loss": 5.2547, + "step": 7292 + }, + { + "epoch": 0.04337353696831287, + "grad_norm": 2.136172294616699, + "learning_rate": 4.9768330727291e-05, + "loss": 5.1655, + "step": 7293 + }, + { + "epoch": 0.043379484251593874, + "grad_norm": 1.9202553033828735, + "learning_rate": 4.9768267280619094e-05, + "loss": 5.1945, + "step": 7294 + }, + { + "epoch": 0.04338543153487487, + "grad_norm": 1.7927708625793457, + "learning_rate": 4.976820382530084e-05, + "loss": 5.4936, + "step": 7295 + }, + { + "epoch": 0.043391378818155864, + "grad_norm": 1.597887396812439, + "learning_rate": 4.976814036133626e-05, + "loss": 5.5516, + "step": 7296 + }, + { + "epoch": 0.043397326101436866, + "grad_norm": 1.493356466293335, + "learning_rate": 4.9768076888725376e-05, + "loss": 5.552, + "step": 7297 + }, + { + "epoch": 0.04340327338471786, + "grad_norm": 1.6748720407485962, + "learning_rate": 4.976801340746822e-05, + "loss": 5.3957, + "step": 7298 + }, + { + "epoch": 0.043409220667998856, + "grad_norm": 1.541945457458496, + "learning_rate": 4.9767949917564794e-05, + "loss": 5.5558, + "step": 7299 + }, + { + "epoch": 0.04341516795127986, + "grad_norm": 1.6436586380004883, + "learning_rate": 4.976788641901514e-05, + "loss": 5.4918, + "step": 7300 + }, + { + "epoch": 0.04342111523456085, + "grad_norm": 1.69910728931427, + "learning_rate": 4.9767822911819274e-05, + "loss": 5.4688, + "step": 7301 + }, + { + "epoch": 0.04342706251784185, + "grad_norm": 1.8294274806976318, + "learning_rate": 4.976775939597721e-05, + "loss": 5.505, + "step": 7302 + }, + { + "epoch": 0.04343300980112285, + "grad_norm": 1.720880389213562, + "learning_rate": 4.976769587148899e-05, + "loss": 5.3509, + "step": 7303 + }, + { + "epoch": 0.043438957084403845, + "grad_norm": 1.5898194313049316, + "learning_rate": 4.976763233835461e-05, + "loss": 5.2955, + "step": 7304 + }, + { + "epoch": 0.04344490436768484, + "grad_norm": 1.569218397140503, + "learning_rate": 4.976756879657412e-05, + "loss": 5.5695, + "step": 7305 + }, + { + "epoch": 0.043450851650965835, + "grad_norm": 1.5551841259002686, + "learning_rate": 4.976750524614752e-05, + "loss": 5.5313, + "step": 7306 + }, + { + "epoch": 0.04345679893424684, + "grad_norm": 1.5870057344436646, + "learning_rate": 4.9767441687074834e-05, + "loss": 5.7525, + "step": 7307 + }, + { + "epoch": 0.04346274621752783, + "grad_norm": 1.5421022176742554, + "learning_rate": 4.97673781193561e-05, + "loss": 5.6176, + "step": 7308 + }, + { + "epoch": 0.04346869350080883, + "grad_norm": 1.9368326663970947, + "learning_rate": 4.976731454299132e-05, + "loss": 5.4239, + "step": 7309 + }, + { + "epoch": 0.04347464078408983, + "grad_norm": 1.719084620475769, + "learning_rate": 4.976725095798053e-05, + "loss": 5.3526, + "step": 7310 + }, + { + "epoch": 0.043480588067370825, + "grad_norm": 1.8004268407821655, + "learning_rate": 4.9767187364323756e-05, + "loss": 5.7112, + "step": 7311 + }, + { + "epoch": 0.04348653535065182, + "grad_norm": 1.9922735691070557, + "learning_rate": 4.9767123762021003e-05, + "loss": 5.4993, + "step": 7312 + }, + { + "epoch": 0.04349248263393282, + "grad_norm": 1.6768959760665894, + "learning_rate": 4.976706015107231e-05, + "loss": 5.4713, + "step": 7313 + }, + { + "epoch": 0.04349842991721382, + "grad_norm": 1.6070122718811035, + "learning_rate": 4.976699653147768e-05, + "loss": 5.4695, + "step": 7314 + }, + { + "epoch": 0.04350437720049481, + "grad_norm": 1.5641200542449951, + "learning_rate": 4.976693290323716e-05, + "loss": 5.3596, + "step": 7315 + }, + { + "epoch": 0.043510324483775814, + "grad_norm": 3.0344419479370117, + "learning_rate": 4.976686926635076e-05, + "loss": 5.7371, + "step": 7316 + }, + { + "epoch": 0.04351627176705681, + "grad_norm": 1.8784242868423462, + "learning_rate": 4.9766805620818494e-05, + "loss": 5.5142, + "step": 7317 + }, + { + "epoch": 0.043522219050337804, + "grad_norm": 2.0644166469573975, + "learning_rate": 4.9766741966640394e-05, + "loss": 5.276, + "step": 7318 + }, + { + "epoch": 0.043528166333618806, + "grad_norm": 1.8128771781921387, + "learning_rate": 4.976667830381649e-05, + "loss": 5.3515, + "step": 7319 + }, + { + "epoch": 0.0435341136168998, + "grad_norm": 1.8899081945419312, + "learning_rate": 4.9766614632346786e-05, + "loss": 5.3981, + "step": 7320 + }, + { + "epoch": 0.043540060900180796, + "grad_norm": 1.89181649684906, + "learning_rate": 4.976655095223131e-05, + "loss": 5.4378, + "step": 7321 + }, + { + "epoch": 0.04354600818346179, + "grad_norm": 1.6332184076309204, + "learning_rate": 4.976648726347009e-05, + "loss": 5.4023, + "step": 7322 + }, + { + "epoch": 0.043551955466742794, + "grad_norm": 2.3459293842315674, + "learning_rate": 4.976642356606315e-05, + "loss": 5.8375, + "step": 7323 + }, + { + "epoch": 0.04355790275002379, + "grad_norm": 2.029244899749756, + "learning_rate": 4.97663598600105e-05, + "loss": 5.5617, + "step": 7324 + }, + { + "epoch": 0.043563850033304784, + "grad_norm": 2.138946056365967, + "learning_rate": 4.9766296145312175e-05, + "loss": 5.5076, + "step": 7325 + }, + { + "epoch": 0.043569797316585786, + "grad_norm": 1.8702884912490845, + "learning_rate": 4.9766232421968184e-05, + "loss": 5.123, + "step": 7326 + }, + { + "epoch": 0.04357574459986678, + "grad_norm": 1.8917137384414673, + "learning_rate": 4.976616868997856e-05, + "loss": 5.4809, + "step": 7327 + }, + { + "epoch": 0.043581691883147776, + "grad_norm": 2.2203474044799805, + "learning_rate": 4.976610494934333e-05, + "loss": 5.6359, + "step": 7328 + }, + { + "epoch": 0.04358763916642878, + "grad_norm": 2.4505302906036377, + "learning_rate": 4.976604120006251e-05, + "loss": 6.1423, + "step": 7329 + }, + { + "epoch": 0.04359358644970977, + "grad_norm": 2.4601128101348877, + "learning_rate": 4.976597744213611e-05, + "loss": 6.0908, + "step": 7330 + }, + { + "epoch": 0.04359953373299077, + "grad_norm": 1.9502687454223633, + "learning_rate": 4.976591367556417e-05, + "loss": 5.918, + "step": 7331 + }, + { + "epoch": 0.04360548101627177, + "grad_norm": 2.180250644683838, + "learning_rate": 4.9765849900346696e-05, + "loss": 5.7203, + "step": 7332 + }, + { + "epoch": 0.043611428299552765, + "grad_norm": 2.125669002532959, + "learning_rate": 4.9765786116483726e-05, + "loss": 5.7875, + "step": 7333 + }, + { + "epoch": 0.04361737558283376, + "grad_norm": 2.0372321605682373, + "learning_rate": 4.9765722323975286e-05, + "loss": 5.6777, + "step": 7334 + }, + { + "epoch": 0.043623322866114755, + "grad_norm": 2.5857362747192383, + "learning_rate": 4.976565852282137e-05, + "loss": 5.2989, + "step": 7335 + }, + { + "epoch": 0.04362927014939576, + "grad_norm": 2.5774800777435303, + "learning_rate": 4.976559471302203e-05, + "loss": 6.0479, + "step": 7336 + }, + { + "epoch": 0.04363521743267675, + "grad_norm": 2.0820937156677246, + "learning_rate": 4.976553089457727e-05, + "loss": 5.7636, + "step": 7337 + }, + { + "epoch": 0.04364116471595775, + "grad_norm": 2.287719964981079, + "learning_rate": 4.9765467067487126e-05, + "loss": 5.7706, + "step": 7338 + }, + { + "epoch": 0.04364711199923875, + "grad_norm": 2.6578378677368164, + "learning_rate": 4.9765403231751614e-05, + "loss": 6.1506, + "step": 7339 + }, + { + "epoch": 0.043653059282519745, + "grad_norm": 2.503955841064453, + "learning_rate": 4.976533938737075e-05, + "loss": 6.0658, + "step": 7340 + }, + { + "epoch": 0.04365900656580074, + "grad_norm": 2.28857684135437, + "learning_rate": 4.976527553434456e-05, + "loss": 5.833, + "step": 7341 + }, + { + "epoch": 0.04366495384908174, + "grad_norm": 2.327331781387329, + "learning_rate": 4.976521167267307e-05, + "loss": 5.934, + "step": 7342 + }, + { + "epoch": 0.04367090113236274, + "grad_norm": 1.7726761102676392, + "learning_rate": 4.976514780235631e-05, + "loss": 6.034, + "step": 7343 + }, + { + "epoch": 0.04367684841564373, + "grad_norm": 2.180790662765503, + "learning_rate": 4.9765083923394285e-05, + "loss": 6.1377, + "step": 7344 + }, + { + "epoch": 0.043682795698924734, + "grad_norm": 2.031378984451294, + "learning_rate": 4.9765020035787024e-05, + "loss": 5.7203, + "step": 7345 + }, + { + "epoch": 0.04368874298220573, + "grad_norm": 2.453611135482788, + "learning_rate": 4.9764956139534545e-05, + "loss": 5.9798, + "step": 7346 + }, + { + "epoch": 0.043694690265486724, + "grad_norm": 2.3802528381347656, + "learning_rate": 4.976489223463688e-05, + "loss": 5.9343, + "step": 7347 + }, + { + "epoch": 0.043700637548767726, + "grad_norm": 2.771704912185669, + "learning_rate": 4.976482832109406e-05, + "loss": 6.5202, + "step": 7348 + }, + { + "epoch": 0.04370658483204872, + "grad_norm": 1.9455180168151855, + "learning_rate": 4.9764764398906084e-05, + "loss": 6.1159, + "step": 7349 + }, + { + "epoch": 0.043712532115329716, + "grad_norm": 1.9527102708816528, + "learning_rate": 4.9764700468072976e-05, + "loss": 5.7773, + "step": 7350 + }, + { + "epoch": 0.04371847939861071, + "grad_norm": 1.9531358480453491, + "learning_rate": 4.976463652859478e-05, + "loss": 5.9918, + "step": 7351 + }, + { + "epoch": 0.043724426681891713, + "grad_norm": 2.375239849090576, + "learning_rate": 4.97645725804715e-05, + "loss": 5.5054, + "step": 7352 + }, + { + "epoch": 0.04373037396517271, + "grad_norm": 2.156553030014038, + "learning_rate": 4.9764508623703166e-05, + "loss": 5.664, + "step": 7353 + }, + { + "epoch": 0.043736321248453704, + "grad_norm": 2.317331075668335, + "learning_rate": 4.9764444658289796e-05, + "loss": 5.4473, + "step": 7354 + }, + { + "epoch": 0.043742268531734706, + "grad_norm": 2.1958348751068115, + "learning_rate": 4.976438068423141e-05, + "loss": 5.3584, + "step": 7355 + }, + { + "epoch": 0.0437482158150157, + "grad_norm": 2.152045249938965, + "learning_rate": 4.976431670152803e-05, + "loss": 5.4388, + "step": 7356 + }, + { + "epoch": 0.043754163098296696, + "grad_norm": 2.0661544799804688, + "learning_rate": 4.976425271017971e-05, + "loss": 5.3866, + "step": 7357 + }, + { + "epoch": 0.0437601103815777, + "grad_norm": 2.106480598449707, + "learning_rate": 4.976418871018642e-05, + "loss": 5.5928, + "step": 7358 + }, + { + "epoch": 0.04376605766485869, + "grad_norm": 2.5921759605407715, + "learning_rate": 4.976412470154821e-05, + "loss": 6.0133, + "step": 7359 + }, + { + "epoch": 0.04377200494813969, + "grad_norm": 2.4117794036865234, + "learning_rate": 4.97640606842651e-05, + "loss": 6.0988, + "step": 7360 + }, + { + "epoch": 0.04377795223142069, + "grad_norm": 1.9839050769805908, + "learning_rate": 4.976399665833712e-05, + "loss": 5.9568, + "step": 7361 + }, + { + "epoch": 0.043783899514701685, + "grad_norm": 2.166215419769287, + "learning_rate": 4.9763932623764285e-05, + "loss": 5.9205, + "step": 7362 + }, + { + "epoch": 0.04378984679798268, + "grad_norm": 2.8216545581817627, + "learning_rate": 4.9763868580546616e-05, + "loss": 5.792, + "step": 7363 + }, + { + "epoch": 0.043795794081263675, + "grad_norm": 2.907707929611206, + "learning_rate": 4.976380452868413e-05, + "loss": 5.5824, + "step": 7364 + }, + { + "epoch": 0.04380174136454468, + "grad_norm": 2.173025369644165, + "learning_rate": 4.976374046817686e-05, + "loss": 6.2752, + "step": 7365 + }, + { + "epoch": 0.04380768864782567, + "grad_norm": 2.1098685264587402, + "learning_rate": 4.9763676399024814e-05, + "loss": 5.8052, + "step": 7366 + }, + { + "epoch": 0.04381363593110667, + "grad_norm": 2.1980762481689453, + "learning_rate": 4.9763612321228035e-05, + "loss": 5.3456, + "step": 7367 + }, + { + "epoch": 0.04381958321438767, + "grad_norm": 2.091327667236328, + "learning_rate": 4.976354823478654e-05, + "loss": 5.211, + "step": 7368 + }, + { + "epoch": 0.043825530497668665, + "grad_norm": 2.37920880317688, + "learning_rate": 4.976348413970033e-05, + "loss": 5.8652, + "step": 7369 + }, + { + "epoch": 0.04383147778094966, + "grad_norm": 2.454202175140381, + "learning_rate": 4.976342003596946e-05, + "loss": 5.9654, + "step": 7370 + }, + { + "epoch": 0.04383742506423066, + "grad_norm": 2.04577898979187, + "learning_rate": 4.9763355923593927e-05, + "loss": 6.3042, + "step": 7371 + }, + { + "epoch": 0.04384337234751166, + "grad_norm": 2.358250141143799, + "learning_rate": 4.976329180257376e-05, + "loss": 6.1403, + "step": 7372 + }, + { + "epoch": 0.04384931963079265, + "grad_norm": 2.177819013595581, + "learning_rate": 4.9763227672909e-05, + "loss": 5.8993, + "step": 7373 + }, + { + "epoch": 0.043855266914073654, + "grad_norm": 2.24910569190979, + "learning_rate": 4.976316353459963e-05, + "loss": 5.9763, + "step": 7374 + }, + { + "epoch": 0.04386121419735465, + "grad_norm": 2.3985965251922607, + "learning_rate": 4.976309938764571e-05, + "loss": 6.2288, + "step": 7375 + }, + { + "epoch": 0.043867161480635644, + "grad_norm": 2.1250808238983154, + "learning_rate": 4.9763035232047244e-05, + "loss": 6.1588, + "step": 7376 + }, + { + "epoch": 0.043873108763916646, + "grad_norm": 1.9815669059753418, + "learning_rate": 4.976297106780426e-05, + "loss": 6.3202, + "step": 7377 + }, + { + "epoch": 0.04387905604719764, + "grad_norm": 2.181999683380127, + "learning_rate": 4.976290689491677e-05, + "loss": 5.9125, + "step": 7378 + }, + { + "epoch": 0.043885003330478636, + "grad_norm": 2.365546703338623, + "learning_rate": 4.9762842713384815e-05, + "loss": 6.0991, + "step": 7379 + }, + { + "epoch": 0.04389095061375963, + "grad_norm": 2.0843441486358643, + "learning_rate": 4.9762778523208406e-05, + "loss": 5.9675, + "step": 7380 + }, + { + "epoch": 0.04389689789704063, + "grad_norm": 2.271576404571533, + "learning_rate": 4.9762714324387566e-05, + "loss": 5.5703, + "step": 7381 + }, + { + "epoch": 0.04390284518032163, + "grad_norm": 2.244211435317993, + "learning_rate": 4.9762650116922314e-05, + "loss": 5.4674, + "step": 7382 + }, + { + "epoch": 0.043908792463602624, + "grad_norm": 1.728034257888794, + "learning_rate": 4.9762585900812684e-05, + "loss": 5.6264, + "step": 7383 + }, + { + "epoch": 0.043914739746883626, + "grad_norm": 2.400587320327759, + "learning_rate": 4.976252167605869e-05, + "loss": 6.052, + "step": 7384 + }, + { + "epoch": 0.04392068703016462, + "grad_norm": 1.9865821599960327, + "learning_rate": 4.9762457442660346e-05, + "loss": 5.8544, + "step": 7385 + }, + { + "epoch": 0.043926634313445616, + "grad_norm": 2.236527681350708, + "learning_rate": 4.97623932006177e-05, + "loss": 5.5033, + "step": 7386 + }, + { + "epoch": 0.04393258159672662, + "grad_norm": 2.0424020290374756, + "learning_rate": 4.9762328949930746e-05, + "loss": 5.4088, + "step": 7387 + }, + { + "epoch": 0.04393852888000761, + "grad_norm": 2.0601999759674072, + "learning_rate": 4.976226469059952e-05, + "loss": 5.8599, + "step": 7388 + }, + { + "epoch": 0.04394447616328861, + "grad_norm": 2.5052783489227295, + "learning_rate": 4.976220042262404e-05, + "loss": 5.8202, + "step": 7389 + }, + { + "epoch": 0.04395042344656961, + "grad_norm": 2.178549289703369, + "learning_rate": 4.9762136146004344e-05, + "loss": 5.4554, + "step": 7390 + }, + { + "epoch": 0.043956370729850605, + "grad_norm": 1.9407802820205688, + "learning_rate": 4.976207186074043e-05, + "loss": 5.4062, + "step": 7391 + }, + { + "epoch": 0.0439623180131316, + "grad_norm": 1.4814093112945557, + "learning_rate": 4.9762007566832336e-05, + "loss": 5.4662, + "step": 7392 + }, + { + "epoch": 0.043968265296412595, + "grad_norm": 1.8808835744857788, + "learning_rate": 4.9761943264280086e-05, + "loss": 6.1617, + "step": 7393 + }, + { + "epoch": 0.0439742125796936, + "grad_norm": 1.9318643808364868, + "learning_rate": 4.97618789530837e-05, + "loss": 6.1357, + "step": 7394 + }, + { + "epoch": 0.04398015986297459, + "grad_norm": 2.2515900135040283, + "learning_rate": 4.976181463324319e-05, + "loss": 6.11, + "step": 7395 + }, + { + "epoch": 0.04398610714625559, + "grad_norm": 2.375298023223877, + "learning_rate": 4.9761750304758584e-05, + "loss": 6.1121, + "step": 7396 + }, + { + "epoch": 0.04399205442953659, + "grad_norm": 2.2254321575164795, + "learning_rate": 4.9761685967629914e-05, + "loss": 6.0136, + "step": 7397 + }, + { + "epoch": 0.043998001712817585, + "grad_norm": 2.146164894104004, + "learning_rate": 4.976162162185719e-05, + "loss": 5.8391, + "step": 7398 + }, + { + "epoch": 0.04400394899609858, + "grad_norm": 2.3237650394439697, + "learning_rate": 4.976155726744044e-05, + "loss": 5.461, + "step": 7399 + }, + { + "epoch": 0.04400989627937958, + "grad_norm": 2.2263002395629883, + "learning_rate": 4.976149290437969e-05, + "loss": 5.5885, + "step": 7400 + }, + { + "epoch": 0.04401584356266058, + "grad_norm": 1.9597729444503784, + "learning_rate": 4.9761428532674956e-05, + "loss": 5.348, + "step": 7401 + }, + { + "epoch": 0.04402179084594157, + "grad_norm": 2.2215018272399902, + "learning_rate": 4.976136415232626e-05, + "loss": 5.933, + "step": 7402 + }, + { + "epoch": 0.044027738129222574, + "grad_norm": 2.258618116378784, + "learning_rate": 4.9761299763333635e-05, + "loss": 6.0685, + "step": 7403 + }, + { + "epoch": 0.04403368541250357, + "grad_norm": 2.3045873641967773, + "learning_rate": 4.976123536569709e-05, + "loss": 5.7277, + "step": 7404 + }, + { + "epoch": 0.044039632695784564, + "grad_norm": 2.546252489089966, + "learning_rate": 4.976117095941666e-05, + "loss": 5.8839, + "step": 7405 + }, + { + "epoch": 0.044045579979065566, + "grad_norm": 1.8963768482208252, + "learning_rate": 4.976110654449235e-05, + "loss": 6.1247, + "step": 7406 + }, + { + "epoch": 0.04405152726234656, + "grad_norm": 2.6287784576416016, + "learning_rate": 4.976104212092421e-05, + "loss": 5.9712, + "step": 7407 + }, + { + "epoch": 0.044057474545627556, + "grad_norm": 2.562612295150757, + "learning_rate": 4.976097768871223e-05, + "loss": 6.1226, + "step": 7408 + }, + { + "epoch": 0.04406342182890855, + "grad_norm": 2.2308688163757324, + "learning_rate": 4.976091324785645e-05, + "loss": 6.3235, + "step": 7409 + }, + { + "epoch": 0.04406936911218955, + "grad_norm": 2.4595553874969482, + "learning_rate": 4.976084879835691e-05, + "loss": 5.8164, + "step": 7410 + }, + { + "epoch": 0.04407531639547055, + "grad_norm": 2.3693978786468506, + "learning_rate": 4.97607843402136e-05, + "loss": 5.7727, + "step": 7411 + }, + { + "epoch": 0.044081263678751544, + "grad_norm": 4.144592761993408, + "learning_rate": 4.9760719873426546e-05, + "loss": 5.6382, + "step": 7412 + }, + { + "epoch": 0.044087210962032546, + "grad_norm": 2.5423779487609863, + "learning_rate": 4.9760655397995794e-05, + "loss": 5.7526, + "step": 7413 + }, + { + "epoch": 0.04409315824531354, + "grad_norm": 2.119281053543091, + "learning_rate": 4.976059091392135e-05, + "loss": 5.7246, + "step": 7414 + }, + { + "epoch": 0.044099105528594536, + "grad_norm": 2.177074432373047, + "learning_rate": 4.976052642120324e-05, + "loss": 5.7296, + "step": 7415 + }, + { + "epoch": 0.04410505281187554, + "grad_norm": 1.8897806406021118, + "learning_rate": 4.9760461919841486e-05, + "loss": 5.6349, + "step": 7416 + }, + { + "epoch": 0.04411100009515653, + "grad_norm": 2.445082187652588, + "learning_rate": 4.97603974098361e-05, + "loss": 5.7414, + "step": 7417 + }, + { + "epoch": 0.04411694737843753, + "grad_norm": 2.2564280033111572, + "learning_rate": 4.976033289118713e-05, + "loss": 5.6709, + "step": 7418 + }, + { + "epoch": 0.04412289466171853, + "grad_norm": 2.1907529830932617, + "learning_rate": 4.976026836389458e-05, + "loss": 5.6067, + "step": 7419 + }, + { + "epoch": 0.044128841944999525, + "grad_norm": 2.1872594356536865, + "learning_rate": 4.976020382795848e-05, + "loss": 5.5166, + "step": 7420 + }, + { + "epoch": 0.04413478922828052, + "grad_norm": 1.7740691900253296, + "learning_rate": 4.9760139283378835e-05, + "loss": 5.5833, + "step": 7421 + }, + { + "epoch": 0.044140736511561515, + "grad_norm": 2.128389358520508, + "learning_rate": 4.976007473015569e-05, + "loss": 5.6403, + "step": 7422 + }, + { + "epoch": 0.04414668379484252, + "grad_norm": 2.6193220615386963, + "learning_rate": 4.9760010168289053e-05, + "loss": 5.8139, + "step": 7423 + }, + { + "epoch": 0.04415263107812351, + "grad_norm": 2.727902412414551, + "learning_rate": 4.9759945597778955e-05, + "loss": 5.3286, + "step": 7424 + }, + { + "epoch": 0.04415857836140451, + "grad_norm": 2.4500436782836914, + "learning_rate": 4.975988101862542e-05, + "loss": 5.2647, + "step": 7425 + }, + { + "epoch": 0.04416452564468551, + "grad_norm": 2.1040356159210205, + "learning_rate": 4.975981643082846e-05, + "loss": 6.0935, + "step": 7426 + }, + { + "epoch": 0.044170472927966505, + "grad_norm": 1.9168792963027954, + "learning_rate": 4.975975183438811e-05, + "loss": 5.5147, + "step": 7427 + }, + { + "epoch": 0.0441764202112475, + "grad_norm": 2.0156469345092773, + "learning_rate": 4.9759687229304384e-05, + "loss": 6.2896, + "step": 7428 + }, + { + "epoch": 0.0441823674945285, + "grad_norm": 2.362933874130249, + "learning_rate": 4.975962261557731e-05, + "loss": 5.9514, + "step": 7429 + }, + { + "epoch": 0.0441883147778095, + "grad_norm": 2.2892727851867676, + "learning_rate": 4.9759557993206906e-05, + "loss": 5.5646, + "step": 7430 + }, + { + "epoch": 0.04419426206109049, + "grad_norm": 2.287722587585449, + "learning_rate": 4.97594933621932e-05, + "loss": 5.364, + "step": 7431 + }, + { + "epoch": 0.044200209344371494, + "grad_norm": 2.0421855449676514, + "learning_rate": 4.9759428722536194e-05, + "loss": 5.6838, + "step": 7432 + }, + { + "epoch": 0.04420615662765249, + "grad_norm": 2.2392499446868896, + "learning_rate": 4.9759364074235944e-05, + "loss": 6.0727, + "step": 7433 + }, + { + "epoch": 0.044212103910933484, + "grad_norm": 2.084768295288086, + "learning_rate": 4.975929941729245e-05, + "loss": 6.1208, + "step": 7434 + }, + { + "epoch": 0.044218051194214486, + "grad_norm": 1.817015528678894, + "learning_rate": 4.975923475170574e-05, + "loss": 6.3405, + "step": 7435 + }, + { + "epoch": 0.04422399847749548, + "grad_norm": 1.974926233291626, + "learning_rate": 4.9759170077475834e-05, + "loss": 5.9607, + "step": 7436 + }, + { + "epoch": 0.044229945760776476, + "grad_norm": 2.1244025230407715, + "learning_rate": 4.975910539460277e-05, + "loss": 6.2579, + "step": 7437 + }, + { + "epoch": 0.04423589304405747, + "grad_norm": 1.9459706544876099, + "learning_rate": 4.975904070308655e-05, + "loss": 5.5877, + "step": 7438 + }, + { + "epoch": 0.04424184032733847, + "grad_norm": 2.1891977787017822, + "learning_rate": 4.97589760029272e-05, + "loss": 5.9913, + "step": 7439 + }, + { + "epoch": 0.04424778761061947, + "grad_norm": 2.0368902683258057, + "learning_rate": 4.9758911294124756e-05, + "loss": 5.9478, + "step": 7440 + }, + { + "epoch": 0.044253734893900463, + "grad_norm": 2.2937796115875244, + "learning_rate": 4.975884657667922e-05, + "loss": 6.1529, + "step": 7441 + }, + { + "epoch": 0.044259682177181466, + "grad_norm": 2.601637125015259, + "learning_rate": 4.975878185059064e-05, + "loss": 5.4446, + "step": 7442 + }, + { + "epoch": 0.04426562946046246, + "grad_norm": 2.2025954723358154, + "learning_rate": 4.975871711585902e-05, + "loss": 5.8911, + "step": 7443 + }, + { + "epoch": 0.044271576743743456, + "grad_norm": 2.0498836040496826, + "learning_rate": 4.975865237248438e-05, + "loss": 6.0604, + "step": 7444 + }, + { + "epoch": 0.04427752402702446, + "grad_norm": 2.308239459991455, + "learning_rate": 4.975858762046676e-05, + "loss": 5.9599, + "step": 7445 + }, + { + "epoch": 0.04428347131030545, + "grad_norm": 2.286747455596924, + "learning_rate": 4.9758522859806165e-05, + "loss": 6.3528, + "step": 7446 + }, + { + "epoch": 0.04428941859358645, + "grad_norm": 2.2376902103424072, + "learning_rate": 4.975845809050264e-05, + "loss": 6.205, + "step": 7447 + }, + { + "epoch": 0.04429536587686745, + "grad_norm": 1.8052057027816772, + "learning_rate": 4.9758393312556176e-05, + "loss": 6.2188, + "step": 7448 + }, + { + "epoch": 0.044301313160148445, + "grad_norm": 1.9839476346969604, + "learning_rate": 4.975832852596682e-05, + "loss": 6.1479, + "step": 7449 + }, + { + "epoch": 0.04430726044342944, + "grad_norm": 1.8890517950057983, + "learning_rate": 4.975826373073459e-05, + "loss": 6.2524, + "step": 7450 + }, + { + "epoch": 0.04431320772671044, + "grad_norm": 2.049192428588867, + "learning_rate": 4.97581989268595e-05, + "loss": 5.5486, + "step": 7451 + }, + { + "epoch": 0.04431915500999144, + "grad_norm": 2.8271291255950928, + "learning_rate": 4.975813411434158e-05, + "loss": 5.1916, + "step": 7452 + }, + { + "epoch": 0.04432510229327243, + "grad_norm": 1.94833505153656, + "learning_rate": 4.975806929318085e-05, + "loss": 5.6747, + "step": 7453 + }, + { + "epoch": 0.04433104957655343, + "grad_norm": 2.14536190032959, + "learning_rate": 4.975800446337734e-05, + "loss": 5.4066, + "step": 7454 + }, + { + "epoch": 0.04433699685983443, + "grad_norm": 2.5557188987731934, + "learning_rate": 4.975793962493106e-05, + "loss": 5.2257, + "step": 7455 + }, + { + "epoch": 0.044342944143115424, + "grad_norm": 2.4718832969665527, + "learning_rate": 4.975787477784205e-05, + "loss": 6.0248, + "step": 7456 + }, + { + "epoch": 0.04434889142639642, + "grad_norm": 2.8627419471740723, + "learning_rate": 4.975780992211031e-05, + "loss": 5.3245, + "step": 7457 + }, + { + "epoch": 0.04435483870967742, + "grad_norm": 2.932990789413452, + "learning_rate": 4.9757745057735876e-05, + "loss": 4.8914, + "step": 7458 + }, + { + "epoch": 0.04436078599295842, + "grad_norm": 2.6231770515441895, + "learning_rate": 4.975768018471877e-05, + "loss": 5.3323, + "step": 7459 + }, + { + "epoch": 0.04436673327623941, + "grad_norm": 2.5591986179351807, + "learning_rate": 4.975761530305901e-05, + "loss": 5.4972, + "step": 7460 + }, + { + "epoch": 0.044372680559520414, + "grad_norm": 2.4060492515563965, + "learning_rate": 4.975755041275664e-05, + "loss": 5.5988, + "step": 7461 + }, + { + "epoch": 0.04437862784280141, + "grad_norm": 2.377260446548462, + "learning_rate": 4.975748551381164e-05, + "loss": 5.2137, + "step": 7462 + }, + { + "epoch": 0.044384575126082404, + "grad_norm": 2.171934127807617, + "learning_rate": 4.9757420606224076e-05, + "loss": 5.6313, + "step": 7463 + }, + { + "epoch": 0.044390522409363406, + "grad_norm": 2.1225788593292236, + "learning_rate": 4.975735568999394e-05, + "loss": 5.839, + "step": 7464 + }, + { + "epoch": 0.0443964696926444, + "grad_norm": 2.271127939224243, + "learning_rate": 4.975729076512128e-05, + "loss": 5.7111, + "step": 7465 + }, + { + "epoch": 0.044402416975925396, + "grad_norm": 2.7138264179229736, + "learning_rate": 4.975722583160609e-05, + "loss": 5.3169, + "step": 7466 + }, + { + "epoch": 0.04440836425920639, + "grad_norm": 2.8181982040405273, + "learning_rate": 4.9757160889448416e-05, + "loss": 5.3323, + "step": 7467 + }, + { + "epoch": 0.04441431154248739, + "grad_norm": 2.680816411972046, + "learning_rate": 4.975709593864828e-05, + "loss": 5.6924, + "step": 7468 + }, + { + "epoch": 0.04442025882576839, + "grad_norm": 2.3682074546813965, + "learning_rate": 4.975703097920569e-05, + "loss": 6.0049, + "step": 7469 + }, + { + "epoch": 0.04442620610904938, + "grad_norm": 2.3080508708953857, + "learning_rate": 4.9756966011120674e-05, + "loss": 6.4438, + "step": 7470 + }, + { + "epoch": 0.044432153392330385, + "grad_norm": 2.2631113529205322, + "learning_rate": 4.9756901034393265e-05, + "loss": 5.9296, + "step": 7471 + }, + { + "epoch": 0.04443810067561138, + "grad_norm": 2.283712148666382, + "learning_rate": 4.975683604902347e-05, + "loss": 5.831, + "step": 7472 + }, + { + "epoch": 0.044444047958892376, + "grad_norm": 2.2130608558654785, + "learning_rate": 4.975677105501132e-05, + "loss": 5.8757, + "step": 7473 + }, + { + "epoch": 0.04444999524217338, + "grad_norm": 1.9392763376235962, + "learning_rate": 4.975670605235684e-05, + "loss": 5.5836, + "step": 7474 + }, + { + "epoch": 0.04445594252545437, + "grad_norm": 2.097076416015625, + "learning_rate": 4.975664104106005e-05, + "loss": 6.0782, + "step": 7475 + }, + { + "epoch": 0.04446188980873537, + "grad_norm": 2.063021183013916, + "learning_rate": 4.975657602112097e-05, + "loss": 6.2171, + "step": 7476 + }, + { + "epoch": 0.04446783709201637, + "grad_norm": 2.4466049671173096, + "learning_rate": 4.9756510992539626e-05, + "loss": 5.8649, + "step": 7477 + }, + { + "epoch": 0.044473784375297365, + "grad_norm": 2.2160751819610596, + "learning_rate": 4.975644595531605e-05, + "loss": 5.9297, + "step": 7478 + }, + { + "epoch": 0.04447973165857836, + "grad_norm": 2.69352650642395, + "learning_rate": 4.975638090945024e-05, + "loss": 6.1062, + "step": 7479 + }, + { + "epoch": 0.04448567894185936, + "grad_norm": 2.2830610275268555, + "learning_rate": 4.975631585494224e-05, + "loss": 6.1663, + "step": 7480 + }, + { + "epoch": 0.04449162622514036, + "grad_norm": 2.936842203140259, + "learning_rate": 4.975625079179206e-05, + "loss": 5.9952, + "step": 7481 + }, + { + "epoch": 0.04449757350842135, + "grad_norm": 2.1398322582244873, + "learning_rate": 4.9756185719999725e-05, + "loss": 6.0005, + "step": 7482 + }, + { + "epoch": 0.04450352079170235, + "grad_norm": 2.2835536003112793, + "learning_rate": 4.9756120639565275e-05, + "loss": 5.7155, + "step": 7483 + }, + { + "epoch": 0.04450946807498335, + "grad_norm": 2.22917103767395, + "learning_rate": 4.975605555048871e-05, + "loss": 5.7134, + "step": 7484 + }, + { + "epoch": 0.044515415358264344, + "grad_norm": 2.0195605754852295, + "learning_rate": 4.975599045277006e-05, + "loss": 5.6369, + "step": 7485 + }, + { + "epoch": 0.04452136264154534, + "grad_norm": 1.8495477437973022, + "learning_rate": 4.975592534640936e-05, + "loss": 5.9035, + "step": 7486 + }, + { + "epoch": 0.04452730992482634, + "grad_norm": 2.4814226627349854, + "learning_rate": 4.9755860231406616e-05, + "loss": 6.1024, + "step": 7487 + }, + { + "epoch": 0.04453325720810734, + "grad_norm": 2.221820831298828, + "learning_rate": 4.975579510776186e-05, + "loss": 6.1193, + "step": 7488 + }, + { + "epoch": 0.04453920449138833, + "grad_norm": 1.935722827911377, + "learning_rate": 4.975572997547511e-05, + "loss": 6.1088, + "step": 7489 + }, + { + "epoch": 0.044545151774669334, + "grad_norm": 2.1287481784820557, + "learning_rate": 4.975566483454638e-05, + "loss": 6.1064, + "step": 7490 + }, + { + "epoch": 0.04455109905795033, + "grad_norm": 2.1914093494415283, + "learning_rate": 4.9755599684975716e-05, + "loss": 6.072, + "step": 7491 + }, + { + "epoch": 0.044557046341231324, + "grad_norm": 2.1979966163635254, + "learning_rate": 4.975553452676312e-05, + "loss": 6.1447, + "step": 7492 + }, + { + "epoch": 0.044562993624512326, + "grad_norm": 2.108259916305542, + "learning_rate": 4.975546935990863e-05, + "loss": 6.0109, + "step": 7493 + }, + { + "epoch": 0.04456894090779332, + "grad_norm": 2.2454450130462646, + "learning_rate": 4.975540418441226e-05, + "loss": 5.8627, + "step": 7494 + }, + { + "epoch": 0.044574888191074316, + "grad_norm": 2.151130437850952, + "learning_rate": 4.9755339000274027e-05, + "loss": 6.0241, + "step": 7495 + }, + { + "epoch": 0.04458083547435531, + "grad_norm": 1.9150489568710327, + "learning_rate": 4.975527380749397e-05, + "loss": 6.0179, + "step": 7496 + }, + { + "epoch": 0.04458678275763631, + "grad_norm": 1.9065133333206177, + "learning_rate": 4.97552086060721e-05, + "loss": 5.9991, + "step": 7497 + }, + { + "epoch": 0.04459273004091731, + "grad_norm": 1.9627622365951538, + "learning_rate": 4.975514339600844e-05, + "loss": 5.9633, + "step": 7498 + }, + { + "epoch": 0.0445986773241983, + "grad_norm": 1.7777502536773682, + "learning_rate": 4.975507817730302e-05, + "loss": 5.9426, + "step": 7499 + }, + { + "epoch": 0.044604624607479305, + "grad_norm": 1.6735023260116577, + "learning_rate": 4.9755012949955846e-05, + "loss": 5.9432, + "step": 7500 + }, + { + "epoch": 0.0446105718907603, + "grad_norm": 2.1570491790771484, + "learning_rate": 4.975494771396697e-05, + "loss": 6.2032, + "step": 7501 + }, + { + "epoch": 0.044616519174041296, + "grad_norm": 2.286522150039673, + "learning_rate": 4.9754882469336387e-05, + "loss": 5.7226, + "step": 7502 + }, + { + "epoch": 0.0446224664573223, + "grad_norm": 2.1940622329711914, + "learning_rate": 4.975481721606413e-05, + "loss": 6.2215, + "step": 7503 + }, + { + "epoch": 0.04462841374060329, + "grad_norm": 2.329263210296631, + "learning_rate": 4.9754751954150224e-05, + "loss": 5.5403, + "step": 7504 + }, + { + "epoch": 0.04463436102388429, + "grad_norm": 2.112712860107422, + "learning_rate": 4.975468668359469e-05, + "loss": 5.7581, + "step": 7505 + }, + { + "epoch": 0.04464030830716529, + "grad_norm": 2.2875239849090576, + "learning_rate": 4.975462140439755e-05, + "loss": 5.9593, + "step": 7506 + }, + { + "epoch": 0.044646255590446285, + "grad_norm": 2.282121419906616, + "learning_rate": 4.975455611655883e-05, + "loss": 5.8684, + "step": 7507 + }, + { + "epoch": 0.04465220287372728, + "grad_norm": 1.8482197523117065, + "learning_rate": 4.975449082007855e-05, + "loss": 5.753, + "step": 7508 + }, + { + "epoch": 0.04465815015700828, + "grad_norm": 2.6635684967041016, + "learning_rate": 4.9754425514956724e-05, + "loss": 5.0732, + "step": 7509 + }, + { + "epoch": 0.04466409744028928, + "grad_norm": 2.6632800102233887, + "learning_rate": 4.9754360201193395e-05, + "loss": 5.1644, + "step": 7510 + }, + { + "epoch": 0.04467004472357027, + "grad_norm": 2.630445718765259, + "learning_rate": 4.9754294878788574e-05, + "loss": 5.0322, + "step": 7511 + }, + { + "epoch": 0.04467599200685127, + "grad_norm": 2.4036223888397217, + "learning_rate": 4.975422954774228e-05, + "loss": 4.8949, + "step": 7512 + }, + { + "epoch": 0.04468193929013227, + "grad_norm": 2.381810426712036, + "learning_rate": 4.9754164208054535e-05, + "loss": 5.7921, + "step": 7513 + }, + { + "epoch": 0.044687886573413264, + "grad_norm": 2.570949077606201, + "learning_rate": 4.9754098859725377e-05, + "loss": 5.9612, + "step": 7514 + }, + { + "epoch": 0.04469383385669426, + "grad_norm": 2.510998010635376, + "learning_rate": 4.9754033502754815e-05, + "loss": 5.7273, + "step": 7515 + }, + { + "epoch": 0.04469978113997526, + "grad_norm": 2.6216115951538086, + "learning_rate": 4.975396813714288e-05, + "loss": 5.7601, + "step": 7516 + }, + { + "epoch": 0.04470572842325626, + "grad_norm": 2.5298542976379395, + "learning_rate": 4.975390276288958e-05, + "loss": 5.8007, + "step": 7517 + }, + { + "epoch": 0.04471167570653725, + "grad_norm": 2.6195290088653564, + "learning_rate": 4.975383737999496e-05, + "loss": 5.6071, + "step": 7518 + }, + { + "epoch": 0.044717622989818254, + "grad_norm": 2.5432629585266113, + "learning_rate": 4.975377198845902e-05, + "loss": 6.0224, + "step": 7519 + }, + { + "epoch": 0.04472357027309925, + "grad_norm": 2.2290337085723877, + "learning_rate": 4.97537065882818e-05, + "loss": 5.7141, + "step": 7520 + }, + { + "epoch": 0.044729517556380244, + "grad_norm": 2.627206802368164, + "learning_rate": 4.975364117946332e-05, + "loss": 6.2518, + "step": 7521 + }, + { + "epoch": 0.044735464839661246, + "grad_norm": 2.386993169784546, + "learning_rate": 4.975357576200359e-05, + "loss": 6.0494, + "step": 7522 + }, + { + "epoch": 0.04474141212294224, + "grad_norm": 2.20511794090271, + "learning_rate": 4.9753510335902656e-05, + "loss": 6.2563, + "step": 7523 + }, + { + "epoch": 0.044747359406223236, + "grad_norm": 2.5564749240875244, + "learning_rate": 4.975344490116052e-05, + "loss": 6.2498, + "step": 7524 + }, + { + "epoch": 0.04475330668950423, + "grad_norm": 2.6001932621002197, + "learning_rate": 4.975337945777721e-05, + "loss": 5.6721, + "step": 7525 + }, + { + "epoch": 0.04475925397278523, + "grad_norm": 2.6677772998809814, + "learning_rate": 4.975331400575275e-05, + "loss": 5.88, + "step": 7526 + }, + { + "epoch": 0.04476520125606623, + "grad_norm": 3.616734027862549, + "learning_rate": 4.975324854508716e-05, + "loss": 5.4835, + "step": 7527 + }, + { + "epoch": 0.04477114853934722, + "grad_norm": 3.0301461219787598, + "learning_rate": 4.975318307578048e-05, + "loss": 5.326, + "step": 7528 + }, + { + "epoch": 0.044777095822628225, + "grad_norm": 2.029836893081665, + "learning_rate": 4.975311759783271e-05, + "loss": 5.3516, + "step": 7529 + }, + { + "epoch": 0.04478304310590922, + "grad_norm": 1.9886969327926636, + "learning_rate": 4.9753052111243885e-05, + "loss": 5.3442, + "step": 7530 + }, + { + "epoch": 0.044788990389190216, + "grad_norm": 2.4227612018585205, + "learning_rate": 4.975298661601403e-05, + "loss": 5.4273, + "step": 7531 + }, + { + "epoch": 0.04479493767247122, + "grad_norm": 2.8426849842071533, + "learning_rate": 4.975292111214316e-05, + "loss": 5.6604, + "step": 7532 + }, + { + "epoch": 0.04480088495575221, + "grad_norm": 2.4818854331970215, + "learning_rate": 4.97528555996313e-05, + "loss": 6.4941, + "step": 7533 + }, + { + "epoch": 0.04480683223903321, + "grad_norm": 2.291642904281616, + "learning_rate": 4.9752790078478465e-05, + "loss": 6.404, + "step": 7534 + }, + { + "epoch": 0.04481277952231421, + "grad_norm": 2.4973669052124023, + "learning_rate": 4.9752724548684695e-05, + "loss": 5.6068, + "step": 7535 + }, + { + "epoch": 0.044818726805595205, + "grad_norm": 2.273130416870117, + "learning_rate": 4.975265901025001e-05, + "loss": 6.1689, + "step": 7536 + }, + { + "epoch": 0.0448246740888762, + "grad_norm": 3.362520456314087, + "learning_rate": 4.9752593463174424e-05, + "loss": 5.5346, + "step": 7537 + }, + { + "epoch": 0.0448306213721572, + "grad_norm": 5.170871257781982, + "learning_rate": 4.9752527907457956e-05, + "loss": 5.3831, + "step": 7538 + }, + { + "epoch": 0.0448365686554382, + "grad_norm": 4.224242687225342, + "learning_rate": 4.975246234310064e-05, + "loss": 5.2511, + "step": 7539 + }, + { + "epoch": 0.04484251593871919, + "grad_norm": 3.1753036975860596, + "learning_rate": 4.97523967701025e-05, + "loss": 5.06, + "step": 7540 + }, + { + "epoch": 0.04484846322200019, + "grad_norm": 2.4226467609405518, + "learning_rate": 4.975233118846355e-05, + "loss": 5.5225, + "step": 7541 + }, + { + "epoch": 0.04485441050528119, + "grad_norm": 2.5356781482696533, + "learning_rate": 4.9752265598183814e-05, + "loss": 5.5865, + "step": 7542 + }, + { + "epoch": 0.044860357788562184, + "grad_norm": 2.1505908966064453, + "learning_rate": 4.9752199999263326e-05, + "loss": 5.7436, + "step": 7543 + }, + { + "epoch": 0.04486630507184318, + "grad_norm": 2.675703763961792, + "learning_rate": 4.97521343917021e-05, + "loss": 5.3693, + "step": 7544 + }, + { + "epoch": 0.04487225235512418, + "grad_norm": 3.5228023529052734, + "learning_rate": 4.975206877550015e-05, + "loss": 4.8527, + "step": 7545 + }, + { + "epoch": 0.044878199638405177, + "grad_norm": 3.1165566444396973, + "learning_rate": 4.975200315065752e-05, + "loss": 4.7971, + "step": 7546 + }, + { + "epoch": 0.04488414692168617, + "grad_norm": 2.6216177940368652, + "learning_rate": 4.975193751717421e-05, + "loss": 4.9328, + "step": 7547 + }, + { + "epoch": 0.044890094204967174, + "grad_norm": 2.352031707763672, + "learning_rate": 4.975187187505026e-05, + "loss": 5.0021, + "step": 7548 + }, + { + "epoch": 0.04489604148824817, + "grad_norm": 1.8147127628326416, + "learning_rate": 4.975180622428569e-05, + "loss": 5.7009, + "step": 7549 + }, + { + "epoch": 0.044901988771529164, + "grad_norm": 2.1674726009368896, + "learning_rate": 4.9751740564880516e-05, + "loss": 5.2545, + "step": 7550 + }, + { + "epoch": 0.044907936054810166, + "grad_norm": 2.2935330867767334, + "learning_rate": 4.975167489683477e-05, + "loss": 5.2351, + "step": 7551 + }, + { + "epoch": 0.04491388333809116, + "grad_norm": 2.2964932918548584, + "learning_rate": 4.975160922014846e-05, + "loss": 5.483, + "step": 7552 + }, + { + "epoch": 0.044919830621372156, + "grad_norm": 1.8180936574935913, + "learning_rate": 4.9751543534821635e-05, + "loss": 5.668, + "step": 7553 + }, + { + "epoch": 0.04492577790465315, + "grad_norm": 1.906435251235962, + "learning_rate": 4.9751477840854286e-05, + "loss": 5.6664, + "step": 7554 + }, + { + "epoch": 0.04493172518793415, + "grad_norm": 2.459702253341675, + "learning_rate": 4.9751412138246455e-05, + "loss": 5.5272, + "step": 7555 + }, + { + "epoch": 0.04493767247121515, + "grad_norm": 2.1219170093536377, + "learning_rate": 4.975134642699817e-05, + "loss": 5.638, + "step": 7556 + }, + { + "epoch": 0.04494361975449614, + "grad_norm": 2.1492953300476074, + "learning_rate": 4.975128070710944e-05, + "loss": 5.9422, + "step": 7557 + }, + { + "epoch": 0.044949567037777145, + "grad_norm": 1.813988208770752, + "learning_rate": 4.97512149785803e-05, + "loss": 5.9875, + "step": 7558 + }, + { + "epoch": 0.04495551432105814, + "grad_norm": 1.6336817741394043, + "learning_rate": 4.975114924141075e-05, + "loss": 5.9245, + "step": 7559 + }, + { + "epoch": 0.044961461604339135, + "grad_norm": 1.9339455366134644, + "learning_rate": 4.9751083495600847e-05, + "loss": 5.3263, + "step": 7560 + }, + { + "epoch": 0.04496740888762014, + "grad_norm": 2.3459293842315674, + "learning_rate": 4.975101774115059e-05, + "loss": 5.4625, + "step": 7561 + }, + { + "epoch": 0.04497335617090113, + "grad_norm": 2.2994346618652344, + "learning_rate": 4.9750951978060004e-05, + "loss": 5.6327, + "step": 7562 + }, + { + "epoch": 0.04497930345418213, + "grad_norm": 2.1627299785614014, + "learning_rate": 4.975088620632912e-05, + "loss": 5.4882, + "step": 7563 + }, + { + "epoch": 0.04498525073746313, + "grad_norm": 2.763397693634033, + "learning_rate": 4.9750820425957954e-05, + "loss": 5.727, + "step": 7564 + }, + { + "epoch": 0.044991198020744125, + "grad_norm": 2.0107216835021973, + "learning_rate": 4.975075463694654e-05, + "loss": 5.3852, + "step": 7565 + }, + { + "epoch": 0.04499714530402512, + "grad_norm": 1.8424763679504395, + "learning_rate": 4.975068883929489e-05, + "loss": 5.3072, + "step": 7566 + }, + { + "epoch": 0.04500309258730612, + "grad_norm": 1.946702003479004, + "learning_rate": 4.975062303300303e-05, + "loss": 5.3184, + "step": 7567 + }, + { + "epoch": 0.04500903987058712, + "grad_norm": 2.1091182231903076, + "learning_rate": 4.9750557218070984e-05, + "loss": 5.0689, + "step": 7568 + }, + { + "epoch": 0.04501498715386811, + "grad_norm": 2.0064187049865723, + "learning_rate": 4.975049139449877e-05, + "loss": 4.8495, + "step": 7569 + }, + { + "epoch": 0.04502093443714911, + "grad_norm": 1.7544279098510742, + "learning_rate": 4.9750425562286416e-05, + "loss": 4.9524, + "step": 7570 + }, + { + "epoch": 0.04502688172043011, + "grad_norm": 2.0814568996429443, + "learning_rate": 4.9750359721433945e-05, + "loss": 4.798, + "step": 7571 + }, + { + "epoch": 0.045032829003711104, + "grad_norm": 2.1185543537139893, + "learning_rate": 4.975029387194139e-05, + "loss": 4.9313, + "step": 7572 + }, + { + "epoch": 0.0450387762869921, + "grad_norm": 2.3774518966674805, + "learning_rate": 4.975022801380875e-05, + "loss": 5.5954, + "step": 7573 + }, + { + "epoch": 0.0450447235702731, + "grad_norm": 2.261306047439575, + "learning_rate": 4.975016214703606e-05, + "loss": 5.5598, + "step": 7574 + }, + { + "epoch": 0.045050670853554096, + "grad_norm": 2.128244161605835, + "learning_rate": 4.975009627162335e-05, + "loss": 5.359, + "step": 7575 + }, + { + "epoch": 0.04505661813683509, + "grad_norm": 2.0767438411712646, + "learning_rate": 4.975003038757064e-05, + "loss": 5.6855, + "step": 7576 + }, + { + "epoch": 0.045062565420116094, + "grad_norm": 1.9789010286331177, + "learning_rate": 4.974996449487794e-05, + "loss": 5.1807, + "step": 7577 + }, + { + "epoch": 0.04506851270339709, + "grad_norm": 1.9136112928390503, + "learning_rate": 4.97498985935453e-05, + "loss": 5.3811, + "step": 7578 + }, + { + "epoch": 0.045074459986678084, + "grad_norm": 2.150641441345215, + "learning_rate": 4.974983268357271e-05, + "loss": 5.3281, + "step": 7579 + }, + { + "epoch": 0.045080407269959086, + "grad_norm": 1.9636656045913696, + "learning_rate": 4.9749766764960215e-05, + "loss": 5.5003, + "step": 7580 + }, + { + "epoch": 0.04508635455324008, + "grad_norm": 1.826335072517395, + "learning_rate": 4.974970083770783e-05, + "loss": 5.4687, + "step": 7581 + }, + { + "epoch": 0.045092301836521076, + "grad_norm": 1.9246041774749756, + "learning_rate": 4.974963490181558e-05, + "loss": 5.5373, + "step": 7582 + }, + { + "epoch": 0.04509824911980207, + "grad_norm": 1.8421686887741089, + "learning_rate": 4.974956895728349e-05, + "loss": 5.386, + "step": 7583 + }, + { + "epoch": 0.04510419640308307, + "grad_norm": 1.8685556650161743, + "learning_rate": 4.974950300411158e-05, + "loss": 5.5857, + "step": 7584 + }, + { + "epoch": 0.04511014368636407, + "grad_norm": 1.7022168636322021, + "learning_rate": 4.974943704229987e-05, + "loss": 5.2562, + "step": 7585 + }, + { + "epoch": 0.04511609096964506, + "grad_norm": 1.876855731010437, + "learning_rate": 4.97493710718484e-05, + "loss": 5.1359, + "step": 7586 + }, + { + "epoch": 0.045122038252926065, + "grad_norm": 1.8728361129760742, + "learning_rate": 4.974930509275717e-05, + "loss": 5.3124, + "step": 7587 + }, + { + "epoch": 0.04512798553620706, + "grad_norm": 1.930086612701416, + "learning_rate": 4.974923910502622e-05, + "loss": 5.3261, + "step": 7588 + }, + { + "epoch": 0.045133932819488055, + "grad_norm": 2.0309081077575684, + "learning_rate": 4.9749173108655564e-05, + "loss": 5.1138, + "step": 7589 + }, + { + "epoch": 0.04513988010276906, + "grad_norm": 2.042174816131592, + "learning_rate": 4.974910710364522e-05, + "loss": 5.3521, + "step": 7590 + }, + { + "epoch": 0.04514582738605005, + "grad_norm": 1.5278770923614502, + "learning_rate": 4.9749041089995224e-05, + "loss": 5.4075, + "step": 7591 + }, + { + "epoch": 0.04515177466933105, + "grad_norm": 1.7624976634979248, + "learning_rate": 4.974897506770559e-05, + "loss": 5.1698, + "step": 7592 + }, + { + "epoch": 0.04515772195261205, + "grad_norm": 1.9077380895614624, + "learning_rate": 4.974890903677635e-05, + "loss": 5.3973, + "step": 7593 + }, + { + "epoch": 0.045163669235893045, + "grad_norm": 1.5724380016326904, + "learning_rate": 4.974884299720752e-05, + "loss": 5.6325, + "step": 7594 + }, + { + "epoch": 0.04516961651917404, + "grad_norm": 1.9702832698822021, + "learning_rate": 4.974877694899913e-05, + "loss": 5.247, + "step": 7595 + }, + { + "epoch": 0.04517556380245504, + "grad_norm": 1.9913853406906128, + "learning_rate": 4.974871089215118e-05, + "loss": 5.6393, + "step": 7596 + }, + { + "epoch": 0.04518151108573604, + "grad_norm": 1.806470274925232, + "learning_rate": 4.974864482666372e-05, + "loss": 5.302, + "step": 7597 + }, + { + "epoch": 0.04518745836901703, + "grad_norm": 1.7056912183761597, + "learning_rate": 4.974857875253678e-05, + "loss": 5.4066, + "step": 7598 + }, + { + "epoch": 0.04519340565229803, + "grad_norm": 1.5990647077560425, + "learning_rate": 4.974851266977035e-05, + "loss": 5.4087, + "step": 7599 + }, + { + "epoch": 0.04519935293557903, + "grad_norm": 1.9233685731887817, + "learning_rate": 4.974844657836447e-05, + "loss": 5.4891, + "step": 7600 + }, + { + "epoch": 0.045205300218860024, + "grad_norm": 1.8654414415359497, + "learning_rate": 4.9748380478319165e-05, + "loss": 5.4955, + "step": 7601 + }, + { + "epoch": 0.04521124750214102, + "grad_norm": 1.7592424154281616, + "learning_rate": 4.974831436963446e-05, + "loss": 5.2298, + "step": 7602 + }, + { + "epoch": 0.04521719478542202, + "grad_norm": 1.8132792711257935, + "learning_rate": 4.974824825231037e-05, + "loss": 5.3487, + "step": 7603 + }, + { + "epoch": 0.045223142068703016, + "grad_norm": 1.8109947443008423, + "learning_rate": 4.974818212634692e-05, + "loss": 5.4511, + "step": 7604 + }, + { + "epoch": 0.04522908935198401, + "grad_norm": 1.96711266040802, + "learning_rate": 4.974811599174414e-05, + "loss": 5.3249, + "step": 7605 + }, + { + "epoch": 0.045235036635265014, + "grad_norm": 1.9123655557632446, + "learning_rate": 4.9748049848502054e-05, + "loss": 5.3681, + "step": 7606 + }, + { + "epoch": 0.04524098391854601, + "grad_norm": 1.7210376262664795, + "learning_rate": 4.974798369662067e-05, + "loss": 5.3441, + "step": 7607 + }, + { + "epoch": 0.045246931201827004, + "grad_norm": 1.590617060661316, + "learning_rate": 4.974791753610002e-05, + "loss": 5.5619, + "step": 7608 + }, + { + "epoch": 0.045252878485108006, + "grad_norm": 1.77785062789917, + "learning_rate": 4.974785136694013e-05, + "loss": 5.4717, + "step": 7609 + }, + { + "epoch": 0.045258825768389, + "grad_norm": 1.66475510597229, + "learning_rate": 4.9747785189141025e-05, + "loss": 5.3501, + "step": 7610 + }, + { + "epoch": 0.045264773051669996, + "grad_norm": 1.9176442623138428, + "learning_rate": 4.974771900270272e-05, + "loss": 5.1197, + "step": 7611 + }, + { + "epoch": 0.04527072033495099, + "grad_norm": 1.8143234252929688, + "learning_rate": 4.974765280762525e-05, + "loss": 5.3103, + "step": 7612 + }, + { + "epoch": 0.04527666761823199, + "grad_norm": 1.8954168558120728, + "learning_rate": 4.974758660390861e-05, + "loss": 5.2009, + "step": 7613 + }, + { + "epoch": 0.04528261490151299, + "grad_norm": 1.7779622077941895, + "learning_rate": 4.974752039155286e-05, + "loss": 5.519, + "step": 7614 + }, + { + "epoch": 0.04528856218479398, + "grad_norm": 1.8181761503219604, + "learning_rate": 4.9747454170558e-05, + "loss": 5.4967, + "step": 7615 + }, + { + "epoch": 0.045294509468074985, + "grad_norm": 1.657665491104126, + "learning_rate": 4.9747387940924064e-05, + "loss": 5.6437, + "step": 7616 + }, + { + "epoch": 0.04530045675135598, + "grad_norm": 1.7993237972259521, + "learning_rate": 4.974732170265107e-05, + "loss": 5.3094, + "step": 7617 + }, + { + "epoch": 0.045306404034636975, + "grad_norm": 1.8798805475234985, + "learning_rate": 4.974725545573904e-05, + "loss": 5.3268, + "step": 7618 + }, + { + "epoch": 0.04531235131791798, + "grad_norm": 1.9271420240402222, + "learning_rate": 4.974718920018799e-05, + "loss": 5.3405, + "step": 7619 + }, + { + "epoch": 0.04531829860119897, + "grad_norm": 1.9256294965744019, + "learning_rate": 4.9747122935997967e-05, + "loss": 5.3118, + "step": 7620 + }, + { + "epoch": 0.04532424588447997, + "grad_norm": 2.3345041275024414, + "learning_rate": 4.9747056663168965e-05, + "loss": 4.9813, + "step": 7621 + }, + { + "epoch": 0.04533019316776097, + "grad_norm": 1.7056258916854858, + "learning_rate": 4.974699038170103e-05, + "loss": 5.4725, + "step": 7622 + }, + { + "epoch": 0.045336140451041965, + "grad_norm": 2.075711250305176, + "learning_rate": 4.9746924091594174e-05, + "loss": 5.2215, + "step": 7623 + }, + { + "epoch": 0.04534208773432296, + "grad_norm": 1.818048357963562, + "learning_rate": 4.974685779284843e-05, + "loss": 5.0463, + "step": 7624 + }, + { + "epoch": 0.04534803501760396, + "grad_norm": 1.6590908765792847, + "learning_rate": 4.9746791485463806e-05, + "loss": 5.2476, + "step": 7625 + }, + { + "epoch": 0.04535398230088496, + "grad_norm": 2.2024991512298584, + "learning_rate": 4.974672516944033e-05, + "loss": 5.6437, + "step": 7626 + }, + { + "epoch": 0.04535992958416595, + "grad_norm": 1.71639883518219, + "learning_rate": 4.974665884477803e-05, + "loss": 5.2418, + "step": 7627 + }, + { + "epoch": 0.04536587686744695, + "grad_norm": 1.75436270236969, + "learning_rate": 4.974659251147693e-05, + "loss": 5.2209, + "step": 7628 + }, + { + "epoch": 0.04537182415072795, + "grad_norm": 2.577916383743286, + "learning_rate": 4.974652616953705e-05, + "loss": 5.2385, + "step": 7629 + }, + { + "epoch": 0.045377771434008944, + "grad_norm": 1.9784717559814453, + "learning_rate": 4.9746459818958416e-05, + "loss": 5.265, + "step": 7630 + }, + { + "epoch": 0.04538371871728994, + "grad_norm": 1.971383810043335, + "learning_rate": 4.974639345974104e-05, + "loss": 5.0548, + "step": 7631 + }, + { + "epoch": 0.04538966600057094, + "grad_norm": 2.096876621246338, + "learning_rate": 4.974632709188496e-05, + "loss": 5.1491, + "step": 7632 + }, + { + "epoch": 0.045395613283851936, + "grad_norm": 1.6079102754592896, + "learning_rate": 4.974626071539019e-05, + "loss": 5.1959, + "step": 7633 + }, + { + "epoch": 0.04540156056713293, + "grad_norm": 1.6881030797958374, + "learning_rate": 4.9746194330256755e-05, + "loss": 5.1772, + "step": 7634 + }, + { + "epoch": 0.04540750785041393, + "grad_norm": 1.7459675073623657, + "learning_rate": 4.974612793648469e-05, + "loss": 5.1885, + "step": 7635 + }, + { + "epoch": 0.04541345513369493, + "grad_norm": 1.739272117614746, + "learning_rate": 4.9746061534073993e-05, + "loss": 5.318, + "step": 7636 + }, + { + "epoch": 0.045419402416975924, + "grad_norm": 1.7761027812957764, + "learning_rate": 4.974599512302471e-05, + "loss": 5.1525, + "step": 7637 + }, + { + "epoch": 0.045425349700256926, + "grad_norm": 1.8695855140686035, + "learning_rate": 4.9745928703336854e-05, + "loss": 5.5754, + "step": 7638 + }, + { + "epoch": 0.04543129698353792, + "grad_norm": 1.8737404346466064, + "learning_rate": 4.9745862275010446e-05, + "loss": 5.2908, + "step": 7639 + }, + { + "epoch": 0.045437244266818916, + "grad_norm": 1.731676459312439, + "learning_rate": 4.9745795838045515e-05, + "loss": 5.2671, + "step": 7640 + }, + { + "epoch": 0.04544319155009991, + "grad_norm": 1.6687474250793457, + "learning_rate": 4.974572939244209e-05, + "loss": 5.1629, + "step": 7641 + }, + { + "epoch": 0.04544913883338091, + "grad_norm": 2.1376633644104004, + "learning_rate": 4.974566293820018e-05, + "loss": 5.2853, + "step": 7642 + }, + { + "epoch": 0.04545508611666191, + "grad_norm": 2.0989861488342285, + "learning_rate": 4.974559647531981e-05, + "loss": 5.1311, + "step": 7643 + }, + { + "epoch": 0.0454610333999429, + "grad_norm": 2.3433620929718018, + "learning_rate": 4.974553000380102e-05, + "loss": 4.9854, + "step": 7644 + }, + { + "epoch": 0.045466980683223905, + "grad_norm": 2.306170701980591, + "learning_rate": 4.974546352364381e-05, + "loss": 5.3152, + "step": 7645 + }, + { + "epoch": 0.0454729279665049, + "grad_norm": 1.9588537216186523, + "learning_rate": 4.974539703484822e-05, + "loss": 5.3903, + "step": 7646 + }, + { + "epoch": 0.045478875249785895, + "grad_norm": 1.7994736433029175, + "learning_rate": 4.9745330537414265e-05, + "loss": 5.2505, + "step": 7647 + }, + { + "epoch": 0.0454848225330669, + "grad_norm": 1.983175277709961, + "learning_rate": 4.974526403134197e-05, + "loss": 5.2607, + "step": 7648 + }, + { + "epoch": 0.04549076981634789, + "grad_norm": 1.8853832483291626, + "learning_rate": 4.974519751663136e-05, + "loss": 5.1475, + "step": 7649 + }, + { + "epoch": 0.04549671709962889, + "grad_norm": 1.9374700784683228, + "learning_rate": 4.9745130993282464e-05, + "loss": 5.2039, + "step": 7650 + }, + { + "epoch": 0.04550266438290989, + "grad_norm": 1.8200404644012451, + "learning_rate": 4.974506446129529e-05, + "loss": 5.2794, + "step": 7651 + }, + { + "epoch": 0.045508611666190885, + "grad_norm": 1.8375320434570312, + "learning_rate": 4.974499792066987e-05, + "loss": 5.1149, + "step": 7652 + }, + { + "epoch": 0.04551455894947188, + "grad_norm": 1.7842520475387573, + "learning_rate": 4.974493137140623e-05, + "loss": 5.0332, + "step": 7653 + }, + { + "epoch": 0.04552050623275288, + "grad_norm": 2.0220818519592285, + "learning_rate": 4.974486481350439e-05, + "loss": 5.0277, + "step": 7654 + }, + { + "epoch": 0.04552645351603388, + "grad_norm": 2.0787746906280518, + "learning_rate": 4.9744798246964375e-05, + "loss": 5.0587, + "step": 7655 + }, + { + "epoch": 0.04553240079931487, + "grad_norm": 1.7024985551834106, + "learning_rate": 4.97447316717862e-05, + "loss": 5.0184, + "step": 7656 + }, + { + "epoch": 0.04553834808259587, + "grad_norm": 1.9057540893554688, + "learning_rate": 4.97446650879699e-05, + "loss": 5.3945, + "step": 7657 + }, + { + "epoch": 0.04554429536587687, + "grad_norm": 1.7963287830352783, + "learning_rate": 4.974459849551549e-05, + "loss": 4.9869, + "step": 7658 + }, + { + "epoch": 0.045550242649157864, + "grad_norm": 2.027353286743164, + "learning_rate": 4.974453189442299e-05, + "loss": 5.1389, + "step": 7659 + }, + { + "epoch": 0.04555618993243886, + "grad_norm": 1.7137126922607422, + "learning_rate": 4.9744465284692445e-05, + "loss": 5.058, + "step": 7660 + }, + { + "epoch": 0.04556213721571986, + "grad_norm": 2.0363876819610596, + "learning_rate": 4.9744398666323854e-05, + "loss": 4.9174, + "step": 7661 + }, + { + "epoch": 0.045568084499000856, + "grad_norm": 2.1440837383270264, + "learning_rate": 4.9744332039317255e-05, + "loss": 4.8894, + "step": 7662 + }, + { + "epoch": 0.04557403178228185, + "grad_norm": 1.9582308530807495, + "learning_rate": 4.9744265403672655e-05, + "loss": 5.0666, + "step": 7663 + }, + { + "epoch": 0.04557997906556285, + "grad_norm": 1.9997116327285767, + "learning_rate": 4.97441987593901e-05, + "loss": 5.0804, + "step": 7664 + }, + { + "epoch": 0.04558592634884385, + "grad_norm": 2.067361831665039, + "learning_rate": 4.9744132106469586e-05, + "loss": 4.8655, + "step": 7665 + }, + { + "epoch": 0.045591873632124844, + "grad_norm": 1.7066930532455444, + "learning_rate": 4.9744065444911165e-05, + "loss": 4.792, + "step": 7666 + }, + { + "epoch": 0.045597820915405846, + "grad_norm": 1.8526182174682617, + "learning_rate": 4.974399877471484e-05, + "loss": 4.755, + "step": 7667 + }, + { + "epoch": 0.04560376819868684, + "grad_norm": 1.8744564056396484, + "learning_rate": 4.9743932095880644e-05, + "loss": 4.7732, + "step": 7668 + }, + { + "epoch": 0.045609715481967836, + "grad_norm": 1.849574327468872, + "learning_rate": 4.97438654084086e-05, + "loss": 4.7743, + "step": 7669 + }, + { + "epoch": 0.04561566276524884, + "grad_norm": 1.87284255027771, + "learning_rate": 4.9743798712298714e-05, + "loss": 5.0582, + "step": 7670 + }, + { + "epoch": 0.04562161004852983, + "grad_norm": 2.206273078918457, + "learning_rate": 4.974373200755104e-05, + "loss": 5.4683, + "step": 7671 + }, + { + "epoch": 0.04562755733181083, + "grad_norm": 1.9849058389663696, + "learning_rate": 4.974366529416557e-05, + "loss": 5.4087, + "step": 7672 + }, + { + "epoch": 0.04563350461509182, + "grad_norm": 1.9440083503723145, + "learning_rate": 4.974359857214235e-05, + "loss": 4.9607, + "step": 7673 + }, + { + "epoch": 0.045639451898372825, + "grad_norm": 1.7112319469451904, + "learning_rate": 4.974353184148139e-05, + "loss": 5.6589, + "step": 7674 + }, + { + "epoch": 0.04564539918165382, + "grad_norm": 1.921215295791626, + "learning_rate": 4.974346510218273e-05, + "loss": 5.4495, + "step": 7675 + }, + { + "epoch": 0.045651346464934815, + "grad_norm": 1.9582061767578125, + "learning_rate": 4.974339835424637e-05, + "loss": 5.2459, + "step": 7676 + }, + { + "epoch": 0.04565729374821582, + "grad_norm": 1.9781824350357056, + "learning_rate": 4.974333159767235e-05, + "loss": 5.3424, + "step": 7677 + }, + { + "epoch": 0.04566324103149681, + "grad_norm": 1.7183479070663452, + "learning_rate": 4.974326483246069e-05, + "loss": 5.3741, + "step": 7678 + }, + { + "epoch": 0.04566918831477781, + "grad_norm": 1.7942447662353516, + "learning_rate": 4.974319805861141e-05, + "loss": 5.4008, + "step": 7679 + }, + { + "epoch": 0.04567513559805881, + "grad_norm": 1.8255115747451782, + "learning_rate": 4.974313127612454e-05, + "loss": 5.1849, + "step": 7680 + }, + { + "epoch": 0.045681082881339805, + "grad_norm": 1.7907564640045166, + "learning_rate": 4.974306448500009e-05, + "loss": 5.1757, + "step": 7681 + }, + { + "epoch": 0.0456870301646208, + "grad_norm": 2.911489486694336, + "learning_rate": 4.97429976852381e-05, + "loss": 4.8909, + "step": 7682 + }, + { + "epoch": 0.0456929774479018, + "grad_norm": 2.849125623703003, + "learning_rate": 4.9742930876838576e-05, + "loss": 4.7733, + "step": 7683 + }, + { + "epoch": 0.0456989247311828, + "grad_norm": 2.4196949005126953, + "learning_rate": 4.9742864059801565e-05, + "loss": 4.8571, + "step": 7684 + }, + { + "epoch": 0.04570487201446379, + "grad_norm": 1.9430558681488037, + "learning_rate": 4.974279723412706e-05, + "loss": 5.1338, + "step": 7685 + }, + { + "epoch": 0.04571081929774479, + "grad_norm": 1.7538554668426514, + "learning_rate": 4.9742730399815105e-05, + "loss": 5.5524, + "step": 7686 + }, + { + "epoch": 0.04571676658102579, + "grad_norm": 2.006115198135376, + "learning_rate": 4.9742663556865724e-05, + "loss": 5.3343, + "step": 7687 + }, + { + "epoch": 0.045722713864306784, + "grad_norm": 2.554234027862549, + "learning_rate": 4.974259670527893e-05, + "loss": 5.8426, + "step": 7688 + }, + { + "epoch": 0.04572866114758778, + "grad_norm": 2.656747579574585, + "learning_rate": 4.974252984505475e-05, + "loss": 5.1578, + "step": 7689 + }, + { + "epoch": 0.04573460843086878, + "grad_norm": 2.800208568572998, + "learning_rate": 4.9742462976193216e-05, + "loss": 4.8019, + "step": 7690 + }, + { + "epoch": 0.045740555714149776, + "grad_norm": 2.674938201904297, + "learning_rate": 4.974239609869433e-05, + "loss": 4.7177, + "step": 7691 + }, + { + "epoch": 0.04574650299743077, + "grad_norm": 2.751533269882202, + "learning_rate": 4.974232921255815e-05, + "loss": 4.7568, + "step": 7692 + }, + { + "epoch": 0.04575245028071177, + "grad_norm": 2.623917818069458, + "learning_rate": 4.974226231778466e-05, + "loss": 4.5908, + "step": 7693 + }, + { + "epoch": 0.04575839756399277, + "grad_norm": 2.2248899936676025, + "learning_rate": 4.9742195414373904e-05, + "loss": 5.4066, + "step": 7694 + }, + { + "epoch": 0.045764344847273764, + "grad_norm": 1.7959388494491577, + "learning_rate": 4.974212850232591e-05, + "loss": 6.1414, + "step": 7695 + }, + { + "epoch": 0.045770292130554766, + "grad_norm": 2.0049352645874023, + "learning_rate": 4.974206158164069e-05, + "loss": 6.0106, + "step": 7696 + }, + { + "epoch": 0.04577623941383576, + "grad_norm": 2.4794270992279053, + "learning_rate": 4.9741994652318276e-05, + "loss": 5.8647, + "step": 7697 + }, + { + "epoch": 0.045782186697116756, + "grad_norm": 3.9380109310150146, + "learning_rate": 4.974192771435868e-05, + "loss": 5.719, + "step": 7698 + }, + { + "epoch": 0.04578813398039776, + "grad_norm": 2.564023017883301, + "learning_rate": 4.974186076776194e-05, + "loss": 4.7294, + "step": 7699 + }, + { + "epoch": 0.04579408126367875, + "grad_norm": 3.7082693576812744, + "learning_rate": 4.974179381252807e-05, + "loss": 5.1975, + "step": 7700 + }, + { + "epoch": 0.04580002854695975, + "grad_norm": 4.0067524909973145, + "learning_rate": 4.97417268486571e-05, + "loss": 5.4047, + "step": 7701 + }, + { + "epoch": 0.04580597583024074, + "grad_norm": 3.978787660598755, + "learning_rate": 4.974165987614904e-05, + "loss": 5.7023, + "step": 7702 + }, + { + "epoch": 0.045811923113521745, + "grad_norm": 4.597605228424072, + "learning_rate": 4.974159289500392e-05, + "loss": 6.5186, + "step": 7703 + }, + { + "epoch": 0.04581787039680274, + "grad_norm": 2.8793985843658447, + "learning_rate": 4.974152590522177e-05, + "loss": 6.1476, + "step": 7704 + }, + { + "epoch": 0.045823817680083735, + "grad_norm": 2.466089963912964, + "learning_rate": 4.974145890680262e-05, + "loss": 5.5154, + "step": 7705 + }, + { + "epoch": 0.04582976496336474, + "grad_norm": 2.937228202819824, + "learning_rate": 4.974139189974647e-05, + "loss": 5.5146, + "step": 7706 + }, + { + "epoch": 0.04583571224664573, + "grad_norm": 2.4580399990081787, + "learning_rate": 4.974132488405336e-05, + "loss": 6.214, + "step": 7707 + }, + { + "epoch": 0.04584165952992673, + "grad_norm": 4.910717010498047, + "learning_rate": 4.97412578597233e-05, + "loss": 5.819, + "step": 7708 + }, + { + "epoch": 0.04584760681320773, + "grad_norm": 5.372139930725098, + "learning_rate": 4.974119082675634e-05, + "loss": 5.3242, + "step": 7709 + }, + { + "epoch": 0.045853554096488724, + "grad_norm": 2.050492525100708, + "learning_rate": 4.9741123785152474e-05, + "loss": 6.0468, + "step": 7710 + }, + { + "epoch": 0.04585950137976972, + "grad_norm": 1.7090541124343872, + "learning_rate": 4.974105673491174e-05, + "loss": 5.7652, + "step": 7711 + }, + { + "epoch": 0.04586544866305072, + "grad_norm": 2.512538194656372, + "learning_rate": 4.974098967603415e-05, + "loss": 5.3184, + "step": 7712 + }, + { + "epoch": 0.04587139594633172, + "grad_norm": 3.311289072036743, + "learning_rate": 4.974092260851975e-05, + "loss": 5.5379, + "step": 7713 + }, + { + "epoch": 0.04587734322961271, + "grad_norm": 3.3318710327148438, + "learning_rate": 4.974085553236854e-05, + "loss": 5.5543, + "step": 7714 + }, + { + "epoch": 0.04588329051289371, + "grad_norm": 2.6384379863739014, + "learning_rate": 4.9740788447580555e-05, + "loss": 6.3475, + "step": 7715 + }, + { + "epoch": 0.04588923779617471, + "grad_norm": 2.0066304206848145, + "learning_rate": 4.974072135415582e-05, + "loss": 6.3685, + "step": 7716 + }, + { + "epoch": 0.045895185079455704, + "grad_norm": 2.4189116954803467, + "learning_rate": 4.9740654252094356e-05, + "loss": 5.4128, + "step": 7717 + }, + { + "epoch": 0.0459011323627367, + "grad_norm": 2.431011438369751, + "learning_rate": 4.974058714139618e-05, + "loss": 5.34, + "step": 7718 + }, + { + "epoch": 0.0459070796460177, + "grad_norm": 2.1997156143188477, + "learning_rate": 4.974052002206132e-05, + "loss": 5.4223, + "step": 7719 + }, + { + "epoch": 0.045913026929298696, + "grad_norm": 2.0700082778930664, + "learning_rate": 4.9740452894089806e-05, + "loss": 5.4255, + "step": 7720 + }, + { + "epoch": 0.04591897421257969, + "grad_norm": 2.3476040363311768, + "learning_rate": 4.974038575748165e-05, + "loss": 5.5055, + "step": 7721 + }, + { + "epoch": 0.04592492149586069, + "grad_norm": 4.2995524406433105, + "learning_rate": 4.974031861223688e-05, + "loss": 5.8869, + "step": 7722 + }, + { + "epoch": 0.04593086877914169, + "grad_norm": 4.690639495849609, + "learning_rate": 4.974025145835552e-05, + "loss": 6.0808, + "step": 7723 + }, + { + "epoch": 0.04593681606242268, + "grad_norm": 3.9823479652404785, + "learning_rate": 4.97401842958376e-05, + "loss": 6.0844, + "step": 7724 + }, + { + "epoch": 0.045942763345703685, + "grad_norm": 3.69808030128479, + "learning_rate": 4.9740117124683136e-05, + "loss": 5.9611, + "step": 7725 + }, + { + "epoch": 0.04594871062898468, + "grad_norm": 2.5912535190582275, + "learning_rate": 4.974004994489215e-05, + "loss": 5.9669, + "step": 7726 + }, + { + "epoch": 0.045954657912265676, + "grad_norm": 2.0894482135772705, + "learning_rate": 4.973998275646467e-05, + "loss": 5.6717, + "step": 7727 + }, + { + "epoch": 0.04596060519554668, + "grad_norm": 2.179302930831909, + "learning_rate": 4.973991555940072e-05, + "loss": 5.4077, + "step": 7728 + }, + { + "epoch": 0.04596655247882767, + "grad_norm": 2.4919214248657227, + "learning_rate": 4.973984835370031e-05, + "loss": 6.118, + "step": 7729 + }, + { + "epoch": 0.04597249976210867, + "grad_norm": 3.5036723613739014, + "learning_rate": 4.9739781139363485e-05, + "loss": 5.436, + "step": 7730 + }, + { + "epoch": 0.04597844704538966, + "grad_norm": 4.129561424255371, + "learning_rate": 4.973971391639026e-05, + "loss": 4.8414, + "step": 7731 + }, + { + "epoch": 0.045984394328670665, + "grad_norm": 2.867039203643799, + "learning_rate": 4.973964668478065e-05, + "loss": 4.7385, + "step": 7732 + }, + { + "epoch": 0.04599034161195166, + "grad_norm": 2.754023313522339, + "learning_rate": 4.973957944453469e-05, + "loss": 4.6063, + "step": 7733 + }, + { + "epoch": 0.045996288895232655, + "grad_norm": 2.1025235652923584, + "learning_rate": 4.973951219565239e-05, + "loss": 5.3233, + "step": 7734 + }, + { + "epoch": 0.04600223617851366, + "grad_norm": 2.352883815765381, + "learning_rate": 4.973944493813379e-05, + "loss": 5.5648, + "step": 7735 + }, + { + "epoch": 0.04600818346179465, + "grad_norm": 2.049377679824829, + "learning_rate": 4.97393776719789e-05, + "loss": 6.1241, + "step": 7736 + }, + { + "epoch": 0.04601413074507565, + "grad_norm": 1.7124110460281372, + "learning_rate": 4.9739310397187756e-05, + "loss": 6.1258, + "step": 7737 + }, + { + "epoch": 0.04602007802835665, + "grad_norm": 2.2592861652374268, + "learning_rate": 4.9739243113760364e-05, + "loss": 6.1972, + "step": 7738 + }, + { + "epoch": 0.046026025311637644, + "grad_norm": 2.3926188945770264, + "learning_rate": 4.973917582169677e-05, + "loss": 6.1681, + "step": 7739 + }, + { + "epoch": 0.04603197259491864, + "grad_norm": 1.9956084489822388, + "learning_rate": 4.973910852099698e-05, + "loss": 6.2068, + "step": 7740 + }, + { + "epoch": 0.04603791987819964, + "grad_norm": 1.924467921257019, + "learning_rate": 4.973904121166102e-05, + "loss": 6.4391, + "step": 7741 + }, + { + "epoch": 0.04604386716148064, + "grad_norm": 1.9410041570663452, + "learning_rate": 4.973897389368891e-05, + "loss": 5.9378, + "step": 7742 + }, + { + "epoch": 0.04604981444476163, + "grad_norm": 2.0418617725372314, + "learning_rate": 4.9738906567080686e-05, + "loss": 5.8823, + "step": 7743 + }, + { + "epoch": 0.04605576172804263, + "grad_norm": 2.696143627166748, + "learning_rate": 4.973883923183637e-05, + "loss": 5.8551, + "step": 7744 + }, + { + "epoch": 0.04606170901132363, + "grad_norm": 2.482703447341919, + "learning_rate": 4.973877188795598e-05, + "loss": 5.5752, + "step": 7745 + }, + { + "epoch": 0.046067656294604624, + "grad_norm": 2.520437240600586, + "learning_rate": 4.973870453543954e-05, + "loss": 5.571, + "step": 7746 + }, + { + "epoch": 0.04607360357788562, + "grad_norm": 2.568150758743286, + "learning_rate": 4.973863717428707e-05, + "loss": 5.9145, + "step": 7747 + }, + { + "epoch": 0.04607955086116662, + "grad_norm": 2.6373183727264404, + "learning_rate": 4.9738569804498605e-05, + "loss": 5.9414, + "step": 7748 + }, + { + "epoch": 0.046085498144447616, + "grad_norm": 2.1663565635681152, + "learning_rate": 4.973850242607415e-05, + "loss": 6.2316, + "step": 7749 + }, + { + "epoch": 0.04609144542772861, + "grad_norm": 2.044316053390503, + "learning_rate": 4.973843503901374e-05, + "loss": 5.7232, + "step": 7750 + }, + { + "epoch": 0.04609739271100961, + "grad_norm": 2.1740782260894775, + "learning_rate": 4.9738367643317405e-05, + "loss": 6.0388, + "step": 7751 + }, + { + "epoch": 0.04610333999429061, + "grad_norm": 2.0643458366394043, + "learning_rate": 4.973830023898516e-05, + "loss": 5.8201, + "step": 7752 + }, + { + "epoch": 0.0461092872775716, + "grad_norm": 1.7433217763900757, + "learning_rate": 4.973823282601703e-05, + "loss": 6.0464, + "step": 7753 + }, + { + "epoch": 0.046115234560852605, + "grad_norm": 2.657677412033081, + "learning_rate": 4.9738165404413037e-05, + "loss": 5.2849, + "step": 7754 + }, + { + "epoch": 0.0461211818441336, + "grad_norm": 1.7317034006118774, + "learning_rate": 4.9738097974173205e-05, + "loss": 6.0619, + "step": 7755 + }, + { + "epoch": 0.046127129127414596, + "grad_norm": 1.6109949350357056, + "learning_rate": 4.973803053529756e-05, + "loss": 5.7832, + "step": 7756 + }, + { + "epoch": 0.0461330764106956, + "grad_norm": 2.2980475425720215, + "learning_rate": 4.9737963087786125e-05, + "loss": 5.4346, + "step": 7757 + }, + { + "epoch": 0.04613902369397659, + "grad_norm": 2.5162737369537354, + "learning_rate": 4.973789563163892e-05, + "loss": 5.3723, + "step": 7758 + }, + { + "epoch": 0.04614497097725759, + "grad_norm": 2.3493261337280273, + "learning_rate": 4.973782816685597e-05, + "loss": 5.7474, + "step": 7759 + }, + { + "epoch": 0.04615091826053858, + "grad_norm": 2.1428544521331787, + "learning_rate": 4.9737760693437306e-05, + "loss": 5.6318, + "step": 7760 + }, + { + "epoch": 0.046156865543819585, + "grad_norm": 2.11627197265625, + "learning_rate": 4.973769321138294e-05, + "loss": 5.38, + "step": 7761 + }, + { + "epoch": 0.04616281282710058, + "grad_norm": 2.411957263946533, + "learning_rate": 4.9737625720692906e-05, + "loss": 5.1822, + "step": 7762 + }, + { + "epoch": 0.046168760110381575, + "grad_norm": 2.3566222190856934, + "learning_rate": 4.973755822136722e-05, + "loss": 5.0405, + "step": 7763 + }, + { + "epoch": 0.04617470739366258, + "grad_norm": 2.2235679626464844, + "learning_rate": 4.973749071340591e-05, + "loss": 5.4746, + "step": 7764 + }, + { + "epoch": 0.04618065467694357, + "grad_norm": 2.4175586700439453, + "learning_rate": 4.973742319680899e-05, + "loss": 5.7519, + "step": 7765 + }, + { + "epoch": 0.04618660196022457, + "grad_norm": 2.3386452198028564, + "learning_rate": 4.9737355671576496e-05, + "loss": 6.1765, + "step": 7766 + }, + { + "epoch": 0.04619254924350557, + "grad_norm": 2.084333658218384, + "learning_rate": 4.973728813770845e-05, + "loss": 6.1439, + "step": 7767 + }, + { + "epoch": 0.046198496526786564, + "grad_norm": 2.0523531436920166, + "learning_rate": 4.973722059520487e-05, + "loss": 6.294, + "step": 7768 + }, + { + "epoch": 0.04620444381006756, + "grad_norm": 2.1187572479248047, + "learning_rate": 4.973715304406578e-05, + "loss": 5.3679, + "step": 7769 + }, + { + "epoch": 0.04621039109334856, + "grad_norm": 2.5249836444854736, + "learning_rate": 4.9737085484291204e-05, + "loss": 5.9086, + "step": 7770 + }, + { + "epoch": 0.04621633837662956, + "grad_norm": 2.35662841796875, + "learning_rate": 4.973701791588117e-05, + "loss": 6.3135, + "step": 7771 + }, + { + "epoch": 0.04622228565991055, + "grad_norm": 2.070955276489258, + "learning_rate": 4.9736950338835695e-05, + "loss": 5.8748, + "step": 7772 + }, + { + "epoch": 0.04622823294319155, + "grad_norm": 2.151587963104248, + "learning_rate": 4.9736882753154814e-05, + "loss": 6.2053, + "step": 7773 + }, + { + "epoch": 0.04623418022647255, + "grad_norm": 2.2187843322753906, + "learning_rate": 4.9736815158838534e-05, + "loss": 5.762, + "step": 7774 + }, + { + "epoch": 0.046240127509753544, + "grad_norm": 1.8676223754882812, + "learning_rate": 4.973674755588689e-05, + "loss": 6.06, + "step": 7775 + }, + { + "epoch": 0.04624607479303454, + "grad_norm": 2.2110252380371094, + "learning_rate": 4.9736679944299906e-05, + "loss": 5.6474, + "step": 7776 + }, + { + "epoch": 0.04625202207631554, + "grad_norm": 2.0635151863098145, + "learning_rate": 4.9736612324077605e-05, + "loss": 5.5579, + "step": 7777 + }, + { + "epoch": 0.046257969359596536, + "grad_norm": 2.1654598712921143, + "learning_rate": 4.973654469522e-05, + "loss": 5.5388, + "step": 7778 + }, + { + "epoch": 0.04626391664287753, + "grad_norm": 2.3735673427581787, + "learning_rate": 4.973647705772713e-05, + "loss": 5.4383, + "step": 7779 + }, + { + "epoch": 0.04626986392615853, + "grad_norm": 2.344160318374634, + "learning_rate": 4.9736409411599e-05, + "loss": 5.6501, + "step": 7780 + }, + { + "epoch": 0.04627581120943953, + "grad_norm": 3.023350477218628, + "learning_rate": 4.973634175683566e-05, + "loss": 5.2688, + "step": 7781 + }, + { + "epoch": 0.04628175849272052, + "grad_norm": 2.8814494609832764, + "learning_rate": 4.973627409343711e-05, + "loss": 5.08, + "step": 7782 + }, + { + "epoch": 0.046287705776001525, + "grad_norm": 2.475191831588745, + "learning_rate": 4.973620642140339e-05, + "loss": 5.0761, + "step": 7783 + }, + { + "epoch": 0.04629365305928252, + "grad_norm": 2.5567755699157715, + "learning_rate": 4.9736138740734504e-05, + "loss": 5.46, + "step": 7784 + }, + { + "epoch": 0.046299600342563516, + "grad_norm": 2.9225175380706787, + "learning_rate": 4.973607105143049e-05, + "loss": 5.5219, + "step": 7785 + }, + { + "epoch": 0.04630554762584452, + "grad_norm": 2.3112781047821045, + "learning_rate": 4.973600335349138e-05, + "loss": 6.4204, + "step": 7786 + }, + { + "epoch": 0.04631149490912551, + "grad_norm": 2.228182554244995, + "learning_rate": 4.973593564691717e-05, + "loss": 6.3299, + "step": 7787 + }, + { + "epoch": 0.04631744219240651, + "grad_norm": 1.8612277507781982, + "learning_rate": 4.973586793170792e-05, + "loss": 5.994, + "step": 7788 + }, + { + "epoch": 0.0463233894756875, + "grad_norm": 1.9788155555725098, + "learning_rate": 4.9735800207863626e-05, + "loss": 6.1676, + "step": 7789 + }, + { + "epoch": 0.046329336758968505, + "grad_norm": 2.2335264682769775, + "learning_rate": 4.973573247538431e-05, + "loss": 6.3112, + "step": 7790 + }, + { + "epoch": 0.0463352840422495, + "grad_norm": 2.168656349182129, + "learning_rate": 4.973566473427001e-05, + "loss": 5.8326, + "step": 7791 + }, + { + "epoch": 0.046341231325530495, + "grad_norm": 1.9187591075897217, + "learning_rate": 4.9735596984520755e-05, + "loss": 5.8734, + "step": 7792 + }, + { + "epoch": 0.0463471786088115, + "grad_norm": 2.195242166519165, + "learning_rate": 4.973552922613655e-05, + "loss": 6.1325, + "step": 7793 + }, + { + "epoch": 0.04635312589209249, + "grad_norm": 1.9698888063430786, + "learning_rate": 4.973546145911743e-05, + "loss": 5.8586, + "step": 7794 + }, + { + "epoch": 0.04635907317537349, + "grad_norm": 2.2149972915649414, + "learning_rate": 4.973539368346342e-05, + "loss": 5.4087, + "step": 7795 + }, + { + "epoch": 0.04636502045865449, + "grad_norm": 1.8587820529937744, + "learning_rate": 4.973532589917453e-05, + "loss": 5.9956, + "step": 7796 + }, + { + "epoch": 0.046370967741935484, + "grad_norm": 2.022866725921631, + "learning_rate": 4.97352581062508e-05, + "loss": 6.0905, + "step": 7797 + }, + { + "epoch": 0.04637691502521648, + "grad_norm": 2.0257678031921387, + "learning_rate": 4.973519030469225e-05, + "loss": 6.02, + "step": 7798 + }, + { + "epoch": 0.04638286230849748, + "grad_norm": 1.6909089088439941, + "learning_rate": 4.973512249449889e-05, + "loss": 5.727, + "step": 7799 + }, + { + "epoch": 0.046388809591778477, + "grad_norm": 1.8882997035980225, + "learning_rate": 4.9735054675670754e-05, + "loss": 5.655, + "step": 7800 + }, + { + "epoch": 0.04639475687505947, + "grad_norm": 2.1775193214416504, + "learning_rate": 4.9734986848207876e-05, + "loss": 5.8067, + "step": 7801 + }, + { + "epoch": 0.04640070415834047, + "grad_norm": 2.136690139770508, + "learning_rate": 4.973491901211027e-05, + "loss": 5.5515, + "step": 7802 + }, + { + "epoch": 0.04640665144162147, + "grad_norm": 1.8036144971847534, + "learning_rate": 4.973485116737795e-05, + "loss": 5.8404, + "step": 7803 + }, + { + "epoch": 0.046412598724902464, + "grad_norm": 2.1350481510162354, + "learning_rate": 4.973478331401096e-05, + "loss": 6.1635, + "step": 7804 + }, + { + "epoch": 0.04641854600818346, + "grad_norm": 2.4152462482452393, + "learning_rate": 4.97347154520093e-05, + "loss": 5.9882, + "step": 7805 + }, + { + "epoch": 0.04642449329146446, + "grad_norm": 2.166402578353882, + "learning_rate": 4.9734647581373015e-05, + "loss": 5.8982, + "step": 7806 + }, + { + "epoch": 0.046430440574745456, + "grad_norm": 1.8684437274932861, + "learning_rate": 4.973457970210211e-05, + "loss": 5.9501, + "step": 7807 + }, + { + "epoch": 0.04643638785802645, + "grad_norm": 1.775829792022705, + "learning_rate": 4.973451181419663e-05, + "loss": 5.83, + "step": 7808 + }, + { + "epoch": 0.04644233514130745, + "grad_norm": 1.7500759363174438, + "learning_rate": 4.973444391765659e-05, + "loss": 6.0084, + "step": 7809 + }, + { + "epoch": 0.04644828242458845, + "grad_norm": 2.3920938968658447, + "learning_rate": 4.9734376012482e-05, + "loss": 5.559, + "step": 7810 + }, + { + "epoch": 0.04645422970786944, + "grad_norm": 2.7680983543395996, + "learning_rate": 4.97343080986729e-05, + "loss": 5.3521, + "step": 7811 + }, + { + "epoch": 0.046460176991150445, + "grad_norm": 2.6618781089782715, + "learning_rate": 4.9734240176229316e-05, + "loss": 5.6917, + "step": 7812 + }, + { + "epoch": 0.04646612427443144, + "grad_norm": 2.086775541305542, + "learning_rate": 4.9734172245151256e-05, + "loss": 5.582, + "step": 7813 + }, + { + "epoch": 0.046472071557712435, + "grad_norm": 2.190012216567993, + "learning_rate": 4.973410430543875e-05, + "loss": 5.9132, + "step": 7814 + }, + { + "epoch": 0.04647801884099344, + "grad_norm": 2.317610740661621, + "learning_rate": 4.973403635709183e-05, + "loss": 5.7055, + "step": 7815 + }, + { + "epoch": 0.04648396612427443, + "grad_norm": 2.1291167736053467, + "learning_rate": 4.973396840011051e-05, + "loss": 5.6711, + "step": 7816 + }, + { + "epoch": 0.04648991340755543, + "grad_norm": 1.5421113967895508, + "learning_rate": 4.9733900434494815e-05, + "loss": 5.6433, + "step": 7817 + }, + { + "epoch": 0.04649586069083642, + "grad_norm": 2.222355604171753, + "learning_rate": 4.973383246024477e-05, + "loss": 5.3685, + "step": 7818 + }, + { + "epoch": 0.046501807974117425, + "grad_norm": 2.097116708755493, + "learning_rate": 4.97337644773604e-05, + "loss": 5.6528, + "step": 7819 + }, + { + "epoch": 0.04650775525739842, + "grad_norm": 2.0224382877349854, + "learning_rate": 4.973369648584174e-05, + "loss": 5.8849, + "step": 7820 + }, + { + "epoch": 0.046513702540679415, + "grad_norm": 2.1581428050994873, + "learning_rate": 4.973362848568879e-05, + "loss": 5.985, + "step": 7821 + }, + { + "epoch": 0.04651964982396042, + "grad_norm": 2.43945574760437, + "learning_rate": 4.9733560476901584e-05, + "loss": 5.5682, + "step": 7822 + }, + { + "epoch": 0.04652559710724141, + "grad_norm": 3.174143075942993, + "learning_rate": 4.9733492459480157e-05, + "loss": 4.832, + "step": 7823 + }, + { + "epoch": 0.04653154439052241, + "grad_norm": 2.269339084625244, + "learning_rate": 4.973342443342452e-05, + "loss": 5.5804, + "step": 7824 + }, + { + "epoch": 0.04653749167380341, + "grad_norm": 2.3775289058685303, + "learning_rate": 4.9733356398734695e-05, + "loss": 5.8299, + "step": 7825 + }, + { + "epoch": 0.046543438957084404, + "grad_norm": 2.065579414367676, + "learning_rate": 4.9733288355410716e-05, + "loss": 5.6985, + "step": 7826 + }, + { + "epoch": 0.0465493862403654, + "grad_norm": 1.9699875116348267, + "learning_rate": 4.9733220303452604e-05, + "loss": 6.0161, + "step": 7827 + }, + { + "epoch": 0.0465553335236464, + "grad_norm": 2.1414806842803955, + "learning_rate": 4.9733152242860374e-05, + "loss": 6.2534, + "step": 7828 + }, + { + "epoch": 0.046561280806927396, + "grad_norm": 2.414738416671753, + "learning_rate": 4.973308417363406e-05, + "loss": 5.8402, + "step": 7829 + }, + { + "epoch": 0.04656722809020839, + "grad_norm": 2.4105031490325928, + "learning_rate": 4.973301609577368e-05, + "loss": 5.8728, + "step": 7830 + }, + { + "epoch": 0.04657317537348939, + "grad_norm": 2.7718660831451416, + "learning_rate": 4.9732948009279264e-05, + "loss": 5.637, + "step": 7831 + }, + { + "epoch": 0.04657912265677039, + "grad_norm": 2.205103874206543, + "learning_rate": 4.9732879914150824e-05, + "loss": 5.4119, + "step": 7832 + }, + { + "epoch": 0.046585069940051384, + "grad_norm": 1.9080390930175781, + "learning_rate": 4.9732811810388394e-05, + "loss": 5.3387, + "step": 7833 + }, + { + "epoch": 0.04659101722333238, + "grad_norm": 1.6600725650787354, + "learning_rate": 4.9732743697992e-05, + "loss": 5.3192, + "step": 7834 + }, + { + "epoch": 0.04659696450661338, + "grad_norm": 1.9428787231445312, + "learning_rate": 4.973267557696165e-05, + "loss": 5.3127, + "step": 7835 + }, + { + "epoch": 0.046602911789894376, + "grad_norm": 2.174811840057373, + "learning_rate": 4.973260744729738e-05, + "loss": 5.7181, + "step": 7836 + }, + { + "epoch": 0.04660885907317537, + "grad_norm": 2.5420422554016113, + "learning_rate": 4.9732539308999224e-05, + "loss": 5.934, + "step": 7837 + }, + { + "epoch": 0.04661480635645637, + "grad_norm": 2.079343795776367, + "learning_rate": 4.973247116206719e-05, + "loss": 5.236, + "step": 7838 + }, + { + "epoch": 0.04662075363973737, + "grad_norm": 1.7748003005981445, + "learning_rate": 4.97324030065013e-05, + "loss": 5.2929, + "step": 7839 + }, + { + "epoch": 0.04662670092301836, + "grad_norm": 2.2746875286102295, + "learning_rate": 4.973233484230159e-05, + "loss": 5.182, + "step": 7840 + }, + { + "epoch": 0.046632648206299365, + "grad_norm": 1.7846394777297974, + "learning_rate": 4.9732266669468074e-05, + "loss": 5.2682, + "step": 7841 + }, + { + "epoch": 0.04663859548958036, + "grad_norm": 2.078132152557373, + "learning_rate": 4.973219848800078e-05, + "loss": 5.3245, + "step": 7842 + }, + { + "epoch": 0.046644542772861355, + "grad_norm": 1.7784876823425293, + "learning_rate": 4.9732130297899726e-05, + "loss": 5.4582, + "step": 7843 + }, + { + "epoch": 0.04665049005614236, + "grad_norm": 1.8421920537948608, + "learning_rate": 4.973206209916495e-05, + "loss": 5.3504, + "step": 7844 + }, + { + "epoch": 0.04665643733942335, + "grad_norm": 1.9958820343017578, + "learning_rate": 4.9731993891796455e-05, + "loss": 5.2914, + "step": 7845 + }, + { + "epoch": 0.04666238462270435, + "grad_norm": 2.0615813732147217, + "learning_rate": 4.9731925675794286e-05, + "loss": 5.3318, + "step": 7846 + }, + { + "epoch": 0.04666833190598534, + "grad_norm": 1.7690422534942627, + "learning_rate": 4.973185745115846e-05, + "loss": 5.3169, + "step": 7847 + }, + { + "epoch": 0.046674279189266345, + "grad_norm": 1.7990578413009644, + "learning_rate": 4.9731789217888994e-05, + "loss": 5.3136, + "step": 7848 + }, + { + "epoch": 0.04668022647254734, + "grad_norm": 2.0028672218322754, + "learning_rate": 4.9731720975985905e-05, + "loss": 5.2115, + "step": 7849 + }, + { + "epoch": 0.046686173755828335, + "grad_norm": 2.0703940391540527, + "learning_rate": 4.973165272544924e-05, + "loss": 5.2439, + "step": 7850 + }, + { + "epoch": 0.04669212103910934, + "grad_norm": 2.1105704307556152, + "learning_rate": 4.973158446627901e-05, + "loss": 5.5812, + "step": 7851 + }, + { + "epoch": 0.04669806832239033, + "grad_norm": 1.7391036748886108, + "learning_rate": 4.9731516198475236e-05, + "loss": 5.229, + "step": 7852 + }, + { + "epoch": 0.04670401560567133, + "grad_norm": 1.6907505989074707, + "learning_rate": 4.973144792203795e-05, + "loss": 5.2674, + "step": 7853 + }, + { + "epoch": 0.04670996288895233, + "grad_norm": 1.608168125152588, + "learning_rate": 4.973137963696717e-05, + "loss": 5.389, + "step": 7854 + }, + { + "epoch": 0.046715910172233324, + "grad_norm": 1.7521610260009766, + "learning_rate": 4.9731311343262913e-05, + "loss": 5.2436, + "step": 7855 + }, + { + "epoch": 0.04672185745551432, + "grad_norm": 2.0182595252990723, + "learning_rate": 4.973124304092522e-05, + "loss": 5.2746, + "step": 7856 + }, + { + "epoch": 0.04672780473879532, + "grad_norm": 1.7990871667861938, + "learning_rate": 4.97311747299541e-05, + "loss": 5.4241, + "step": 7857 + }, + { + "epoch": 0.046733752022076316, + "grad_norm": 2.124717950820923, + "learning_rate": 4.973110641034958e-05, + "loss": 5.5133, + "step": 7858 + }, + { + "epoch": 0.04673969930535731, + "grad_norm": 2.066869020462036, + "learning_rate": 4.973103808211169e-05, + "loss": 5.252, + "step": 7859 + }, + { + "epoch": 0.04674564658863831, + "grad_norm": 1.8004878759384155, + "learning_rate": 4.9730969745240455e-05, + "loss": 5.483, + "step": 7860 + }, + { + "epoch": 0.04675159387191931, + "grad_norm": 1.6822713613510132, + "learning_rate": 4.9730901399735886e-05, + "loss": 5.3916, + "step": 7861 + }, + { + "epoch": 0.046757541155200304, + "grad_norm": 1.7024493217468262, + "learning_rate": 4.973083304559802e-05, + "loss": 5.3504, + "step": 7862 + }, + { + "epoch": 0.0467634884384813, + "grad_norm": 1.5939997434616089, + "learning_rate": 4.973076468282687e-05, + "loss": 5.4151, + "step": 7863 + }, + { + "epoch": 0.0467694357217623, + "grad_norm": 1.7603535652160645, + "learning_rate": 4.9730696311422475e-05, + "loss": 5.351, + "step": 7864 + }, + { + "epoch": 0.046775383005043296, + "grad_norm": 1.737897276878357, + "learning_rate": 4.973062793138484e-05, + "loss": 5.0834, + "step": 7865 + }, + { + "epoch": 0.04678133028832429, + "grad_norm": 2.4130520820617676, + "learning_rate": 4.973055954271401e-05, + "loss": 4.833, + "step": 7866 + }, + { + "epoch": 0.04678727757160529, + "grad_norm": 1.9712201356887817, + "learning_rate": 4.9730491145409987e-05, + "loss": 5.0048, + "step": 7867 + }, + { + "epoch": 0.04679322485488629, + "grad_norm": 1.808608055114746, + "learning_rate": 4.97304227394728e-05, + "loss": 5.3134, + "step": 7868 + }, + { + "epoch": 0.04679917213816728, + "grad_norm": 1.8121775388717651, + "learning_rate": 4.973035432490249e-05, + "loss": 5.2594, + "step": 7869 + }, + { + "epoch": 0.046805119421448285, + "grad_norm": 1.7191296815872192, + "learning_rate": 4.9730285901699064e-05, + "loss": 5.206, + "step": 7870 + }, + { + "epoch": 0.04681106670472928, + "grad_norm": 1.931894063949585, + "learning_rate": 4.973021746986255e-05, + "loss": 5.3349, + "step": 7871 + }, + { + "epoch": 0.046817013988010275, + "grad_norm": 2.5420172214508057, + "learning_rate": 4.973014902939297e-05, + "loss": 5.2894, + "step": 7872 + }, + { + "epoch": 0.04682296127129128, + "grad_norm": 2.5522336959838867, + "learning_rate": 4.973008058029036e-05, + "loss": 5.2144, + "step": 7873 + }, + { + "epoch": 0.04682890855457227, + "grad_norm": 3.1389801502227783, + "learning_rate": 4.973001212255472e-05, + "loss": 5.7229, + "step": 7874 + }, + { + "epoch": 0.04683485583785327, + "grad_norm": 1.8687554597854614, + "learning_rate": 4.97299436561861e-05, + "loss": 5.483, + "step": 7875 + }, + { + "epoch": 0.04684080312113426, + "grad_norm": 2.2526602745056152, + "learning_rate": 4.972987518118451e-05, + "loss": 5.4562, + "step": 7876 + }, + { + "epoch": 0.046846750404415265, + "grad_norm": 2.108677625656128, + "learning_rate": 4.972980669754997e-05, + "loss": 5.2005, + "step": 7877 + }, + { + "epoch": 0.04685269768769626, + "grad_norm": 2.023118019104004, + "learning_rate": 4.972973820528252e-05, + "loss": 5.3674, + "step": 7878 + }, + { + "epoch": 0.046858644970977255, + "grad_norm": 1.6553964614868164, + "learning_rate": 4.9729669704382165e-05, + "loss": 5.3256, + "step": 7879 + }, + { + "epoch": 0.04686459225425826, + "grad_norm": 1.8197314739227295, + "learning_rate": 4.972960119484894e-05, + "loss": 5.1738, + "step": 7880 + }, + { + "epoch": 0.04687053953753925, + "grad_norm": 1.6142289638519287, + "learning_rate": 4.972953267668287e-05, + "loss": 5.245, + "step": 7881 + }, + { + "epoch": 0.04687648682082025, + "grad_norm": 1.4962797164916992, + "learning_rate": 4.972946414988398e-05, + "loss": 5.3121, + "step": 7882 + }, + { + "epoch": 0.04688243410410125, + "grad_norm": 1.487801432609558, + "learning_rate": 4.972939561445228e-05, + "loss": 5.1828, + "step": 7883 + }, + { + "epoch": 0.046888381387382244, + "grad_norm": 1.9139772653579712, + "learning_rate": 4.972932707038781e-05, + "loss": 5.2432, + "step": 7884 + }, + { + "epoch": 0.04689432867066324, + "grad_norm": 1.7533615827560425, + "learning_rate": 4.972925851769058e-05, + "loss": 5.6451, + "step": 7885 + }, + { + "epoch": 0.04690027595394424, + "grad_norm": 1.8561608791351318, + "learning_rate": 4.972918995636062e-05, + "loss": 5.4293, + "step": 7886 + }, + { + "epoch": 0.046906223237225236, + "grad_norm": 1.6891844272613525, + "learning_rate": 4.972912138639797e-05, + "loss": 5.2736, + "step": 7887 + }, + { + "epoch": 0.04691217052050623, + "grad_norm": 1.9279890060424805, + "learning_rate": 4.972905280780262e-05, + "loss": 5.5733, + "step": 7888 + }, + { + "epoch": 0.04691811780378723, + "grad_norm": 1.7810181379318237, + "learning_rate": 4.9728984220574624e-05, + "loss": 5.2036, + "step": 7889 + }, + { + "epoch": 0.04692406508706823, + "grad_norm": 1.6455233097076416, + "learning_rate": 4.9728915624714004e-05, + "loss": 5.3493, + "step": 7890 + }, + { + "epoch": 0.046930012370349224, + "grad_norm": 1.5345048904418945, + "learning_rate": 4.9728847020220756e-05, + "loss": 5.2528, + "step": 7891 + }, + { + "epoch": 0.04693595965363022, + "grad_norm": 1.455165982246399, + "learning_rate": 4.9728778407094935e-05, + "loss": 5.2769, + "step": 7892 + }, + { + "epoch": 0.04694190693691122, + "grad_norm": 1.577910304069519, + "learning_rate": 4.972870978533655e-05, + "loss": 5.2182, + "step": 7893 + }, + { + "epoch": 0.046947854220192216, + "grad_norm": 1.728143334388733, + "learning_rate": 4.972864115494563e-05, + "loss": 5.3446, + "step": 7894 + }, + { + "epoch": 0.04695380150347321, + "grad_norm": 1.6157398223876953, + "learning_rate": 4.972857251592219e-05, + "loss": 5.4866, + "step": 7895 + }, + { + "epoch": 0.04695974878675421, + "grad_norm": 1.5386699438095093, + "learning_rate": 4.9728503868266266e-05, + "loss": 5.4626, + "step": 7896 + }, + { + "epoch": 0.04696569607003521, + "grad_norm": 1.874915599822998, + "learning_rate": 4.972843521197788e-05, + "loss": 5.4152, + "step": 7897 + }, + { + "epoch": 0.0469716433533162, + "grad_norm": 1.7093253135681152, + "learning_rate": 4.9728366547057046e-05, + "loss": 5.2852, + "step": 7898 + }, + { + "epoch": 0.046977590636597205, + "grad_norm": 1.6435173749923706, + "learning_rate": 4.9728297873503806e-05, + "loss": 5.3985, + "step": 7899 + }, + { + "epoch": 0.0469835379198782, + "grad_norm": 1.5776588916778564, + "learning_rate": 4.972822919131816e-05, + "loss": 5.2914, + "step": 7900 + }, + { + "epoch": 0.046989485203159195, + "grad_norm": 2.051072835922241, + "learning_rate": 4.972816050050015e-05, + "loss": 5.343, + "step": 7901 + }, + { + "epoch": 0.0469954324864402, + "grad_norm": 2.003816604614258, + "learning_rate": 4.972809180104979e-05, + "loss": 5.3577, + "step": 7902 + }, + { + "epoch": 0.04700137976972119, + "grad_norm": 1.9092657566070557, + "learning_rate": 4.9728023092967116e-05, + "loss": 5.551, + "step": 7903 + }, + { + "epoch": 0.04700732705300219, + "grad_norm": 1.763007640838623, + "learning_rate": 4.972795437625214e-05, + "loss": 5.5611, + "step": 7904 + }, + { + "epoch": 0.04701327433628318, + "grad_norm": 2.637850046157837, + "learning_rate": 4.9727885650904895e-05, + "loss": 5.937, + "step": 7905 + }, + { + "epoch": 0.047019221619564185, + "grad_norm": 1.6650307178497314, + "learning_rate": 4.9727816916925395e-05, + "loss": 5.6418, + "step": 7906 + }, + { + "epoch": 0.04702516890284518, + "grad_norm": 1.6943029165267944, + "learning_rate": 4.972774817431367e-05, + "loss": 5.4826, + "step": 7907 + }, + { + "epoch": 0.047031116186126175, + "grad_norm": 1.4689685106277466, + "learning_rate": 4.972767942306975e-05, + "loss": 5.4849, + "step": 7908 + }, + { + "epoch": 0.04703706346940718, + "grad_norm": 1.759244441986084, + "learning_rate": 4.9727610663193644e-05, + "loss": 5.3496, + "step": 7909 + }, + { + "epoch": 0.04704301075268817, + "grad_norm": 1.8706889152526855, + "learning_rate": 4.9727541894685395e-05, + "loss": 5.2836, + "step": 7910 + }, + { + "epoch": 0.04704895803596917, + "grad_norm": 1.486164927482605, + "learning_rate": 4.972747311754501e-05, + "loss": 5.4125, + "step": 7911 + }, + { + "epoch": 0.04705490531925017, + "grad_norm": 1.6479889154434204, + "learning_rate": 4.972740433177252e-05, + "loss": 5.1986, + "step": 7912 + }, + { + "epoch": 0.047060852602531164, + "grad_norm": 1.5741796493530273, + "learning_rate": 4.9727335537367944e-05, + "loss": 5.4761, + "step": 7913 + }, + { + "epoch": 0.04706679988581216, + "grad_norm": 1.5001682043075562, + "learning_rate": 4.972726673433131e-05, + "loss": 5.6267, + "step": 7914 + }, + { + "epoch": 0.04707274716909316, + "grad_norm": 1.774282455444336, + "learning_rate": 4.972719792266265e-05, + "loss": 5.5944, + "step": 7915 + }, + { + "epoch": 0.047078694452374156, + "grad_norm": 1.6656653881072998, + "learning_rate": 4.972712910236198e-05, + "loss": 5.4159, + "step": 7916 + }, + { + "epoch": 0.04708464173565515, + "grad_norm": 1.7174065113067627, + "learning_rate": 4.972706027342933e-05, + "loss": 5.4239, + "step": 7917 + }, + { + "epoch": 0.04709058901893615, + "grad_norm": 1.607878565788269, + "learning_rate": 4.9726991435864705e-05, + "loss": 5.4517, + "step": 7918 + }, + { + "epoch": 0.04709653630221715, + "grad_norm": 1.9639167785644531, + "learning_rate": 4.972692258966815e-05, + "loss": 5.5371, + "step": 7919 + }, + { + "epoch": 0.047102483585498144, + "grad_norm": 1.5418875217437744, + "learning_rate": 4.9726853734839684e-05, + "loss": 5.4798, + "step": 7920 + }, + { + "epoch": 0.04710843086877914, + "grad_norm": 1.54796302318573, + "learning_rate": 4.9726784871379326e-05, + "loss": 5.5329, + "step": 7921 + }, + { + "epoch": 0.04711437815206014, + "grad_norm": 1.8075921535491943, + "learning_rate": 4.97267159992871e-05, + "loss": 5.6049, + "step": 7922 + }, + { + "epoch": 0.047120325435341136, + "grad_norm": 1.4973857402801514, + "learning_rate": 4.972664711856304e-05, + "loss": 5.27, + "step": 7923 + }, + { + "epoch": 0.04712627271862213, + "grad_norm": 2.1028542518615723, + "learning_rate": 4.9726578229207155e-05, + "loss": 5.3626, + "step": 7924 + }, + { + "epoch": 0.04713222000190313, + "grad_norm": 2.2057480812072754, + "learning_rate": 4.9726509331219485e-05, + "loss": 5.1767, + "step": 7925 + }, + { + "epoch": 0.04713816728518413, + "grad_norm": 2.0549347400665283, + "learning_rate": 4.972644042460004e-05, + "loss": 5.3362, + "step": 7926 + }, + { + "epoch": 0.04714411456846512, + "grad_norm": 2.0960693359375, + "learning_rate": 4.972637150934885e-05, + "loss": 5.5162, + "step": 7927 + }, + { + "epoch": 0.047150061851746125, + "grad_norm": 2.2022509574890137, + "learning_rate": 4.9726302585465945e-05, + "loss": 5.3263, + "step": 7928 + }, + { + "epoch": 0.04715600913502712, + "grad_norm": 1.7065988779067993, + "learning_rate": 4.9726233652951335e-05, + "loss": 5.4349, + "step": 7929 + }, + { + "epoch": 0.047161956418308115, + "grad_norm": 1.742591142654419, + "learning_rate": 4.972616471180506e-05, + "loss": 5.2396, + "step": 7930 + }, + { + "epoch": 0.04716790370158912, + "grad_norm": 1.888846755027771, + "learning_rate": 4.972609576202713e-05, + "loss": 5.3453, + "step": 7931 + }, + { + "epoch": 0.04717385098487011, + "grad_norm": 1.6499360799789429, + "learning_rate": 4.972602680361758e-05, + "loss": 5.2819, + "step": 7932 + }, + { + "epoch": 0.04717979826815111, + "grad_norm": 1.8801236152648926, + "learning_rate": 4.9725957836576434e-05, + "loss": 5.2456, + "step": 7933 + }, + { + "epoch": 0.0471857455514321, + "grad_norm": 2.050522565841675, + "learning_rate": 4.97258888609037e-05, + "loss": 5.2069, + "step": 7934 + }, + { + "epoch": 0.047191692834713105, + "grad_norm": 2.0722391605377197, + "learning_rate": 4.972581987659942e-05, + "loss": 5.5057, + "step": 7935 + }, + { + "epoch": 0.0471976401179941, + "grad_norm": 2.728468179702759, + "learning_rate": 4.972575088366361e-05, + "loss": 5.5485, + "step": 7936 + }, + { + "epoch": 0.047203587401275095, + "grad_norm": 2.0293211936950684, + "learning_rate": 4.9725681882096295e-05, + "loss": 5.7126, + "step": 7937 + }, + { + "epoch": 0.0472095346845561, + "grad_norm": 2.1351194381713867, + "learning_rate": 4.97256128718975e-05, + "loss": 5.7313, + "step": 7938 + }, + { + "epoch": 0.04721548196783709, + "grad_norm": 1.9040015935897827, + "learning_rate": 4.972554385306726e-05, + "loss": 5.696, + "step": 7939 + }, + { + "epoch": 0.04722142925111809, + "grad_norm": 1.640110731124878, + "learning_rate": 4.9725474825605574e-05, + "loss": 5.2626, + "step": 7940 + }, + { + "epoch": 0.04722737653439909, + "grad_norm": 1.887408971786499, + "learning_rate": 4.972540578951249e-05, + "loss": 5.2734, + "step": 7941 + }, + { + "epoch": 0.047233323817680084, + "grad_norm": 1.8867583274841309, + "learning_rate": 4.972533674478801e-05, + "loss": 5.6811, + "step": 7942 + }, + { + "epoch": 0.04723927110096108, + "grad_norm": 1.811104655265808, + "learning_rate": 4.9725267691432174e-05, + "loss": 5.575, + "step": 7943 + }, + { + "epoch": 0.04724521838424208, + "grad_norm": 1.8644812107086182, + "learning_rate": 4.9725198629445014e-05, + "loss": 5.5718, + "step": 7944 + }, + { + "epoch": 0.047251165667523076, + "grad_norm": 1.693788766860962, + "learning_rate": 4.972512955882653e-05, + "loss": 5.5924, + "step": 7945 + }, + { + "epoch": 0.04725711295080407, + "grad_norm": 1.8305641412734985, + "learning_rate": 4.9725060479576766e-05, + "loss": 5.6529, + "step": 7946 + }, + { + "epoch": 0.04726306023408507, + "grad_norm": 1.7662039995193481, + "learning_rate": 4.9724991391695734e-05, + "loss": 5.6709, + "step": 7947 + }, + { + "epoch": 0.04726900751736607, + "grad_norm": 2.1799724102020264, + "learning_rate": 4.972492229518347e-05, + "loss": 5.6266, + "step": 7948 + }, + { + "epoch": 0.047274954800647064, + "grad_norm": 1.9300130605697632, + "learning_rate": 4.972485319003998e-05, + "loss": 5.6494, + "step": 7949 + }, + { + "epoch": 0.04728090208392806, + "grad_norm": 1.9196375608444214, + "learning_rate": 4.9724784076265307e-05, + "loss": 5.571, + "step": 7950 + }, + { + "epoch": 0.04728684936720906, + "grad_norm": 1.906616449356079, + "learning_rate": 4.972471495385947e-05, + "loss": 5.6537, + "step": 7951 + }, + { + "epoch": 0.047292796650490056, + "grad_norm": 1.826536774635315, + "learning_rate": 4.972464582282249e-05, + "loss": 5.6251, + "step": 7952 + }, + { + "epoch": 0.04729874393377105, + "grad_norm": 1.7790716886520386, + "learning_rate": 4.972457668315438e-05, + "loss": 5.3488, + "step": 7953 + }, + { + "epoch": 0.04730469121705205, + "grad_norm": 1.8892159461975098, + "learning_rate": 4.972450753485519e-05, + "loss": 5.4794, + "step": 7954 + }, + { + "epoch": 0.04731063850033305, + "grad_norm": 1.9409239292144775, + "learning_rate": 4.972443837792492e-05, + "loss": 5.6058, + "step": 7955 + }, + { + "epoch": 0.04731658578361404, + "grad_norm": 1.9935575723648071, + "learning_rate": 4.972436921236361e-05, + "loss": 5.6481, + "step": 7956 + }, + { + "epoch": 0.047322533066895045, + "grad_norm": 1.8507076501846313, + "learning_rate": 4.9724300038171276e-05, + "loss": 5.4723, + "step": 7957 + }, + { + "epoch": 0.04732848035017604, + "grad_norm": 1.9355841875076294, + "learning_rate": 4.972423085534794e-05, + "loss": 5.3843, + "step": 7958 + }, + { + "epoch": 0.047334427633457035, + "grad_norm": 1.9815531969070435, + "learning_rate": 4.972416166389363e-05, + "loss": 5.5635, + "step": 7959 + }, + { + "epoch": 0.04734037491673804, + "grad_norm": 1.7955007553100586, + "learning_rate": 4.972409246380838e-05, + "loss": 5.6002, + "step": 7960 + }, + { + "epoch": 0.04734632220001903, + "grad_norm": 2.0184547901153564, + "learning_rate": 4.97240232550922e-05, + "loss": 5.5458, + "step": 7961 + }, + { + "epoch": 0.04735226948330003, + "grad_norm": 1.7418156862258911, + "learning_rate": 4.972395403774512e-05, + "loss": 5.6443, + "step": 7962 + }, + { + "epoch": 0.04735821676658102, + "grad_norm": 1.9832762479782104, + "learning_rate": 4.972388481176716e-05, + "loss": 5.3799, + "step": 7963 + }, + { + "epoch": 0.047364164049862024, + "grad_norm": 1.8777718544006348, + "learning_rate": 4.972381557715835e-05, + "loss": 5.4349, + "step": 7964 + }, + { + "epoch": 0.04737011133314302, + "grad_norm": 1.519038438796997, + "learning_rate": 4.972374633391871e-05, + "loss": 5.2418, + "step": 7965 + }, + { + "epoch": 0.047376058616424015, + "grad_norm": 1.6425752639770508, + "learning_rate": 4.972367708204826e-05, + "loss": 5.1648, + "step": 7966 + }, + { + "epoch": 0.04738200589970502, + "grad_norm": 1.7461836338043213, + "learning_rate": 4.972360782154704e-05, + "loss": 5.1745, + "step": 7967 + }, + { + "epoch": 0.04738795318298601, + "grad_norm": 1.7991663217544556, + "learning_rate": 4.9723538552415064e-05, + "loss": 5.2268, + "step": 7968 + }, + { + "epoch": 0.04739390046626701, + "grad_norm": 1.9127873182296753, + "learning_rate": 4.9723469274652345e-05, + "loss": 5.5205, + "step": 7969 + }, + { + "epoch": 0.04739984774954801, + "grad_norm": 1.8836725950241089, + "learning_rate": 4.972339998825893e-05, + "loss": 5.3803, + "step": 7970 + }, + { + "epoch": 0.047405795032829004, + "grad_norm": 1.8391705751419067, + "learning_rate": 4.9723330693234825e-05, + "loss": 5.3084, + "step": 7971 + }, + { + "epoch": 0.04741174231611, + "grad_norm": 1.6707972288131714, + "learning_rate": 4.9723261389580063e-05, + "loss": 5.3275, + "step": 7972 + }, + { + "epoch": 0.047417689599391, + "grad_norm": 1.8807258605957031, + "learning_rate": 4.972319207729467e-05, + "loss": 5.0766, + "step": 7973 + }, + { + "epoch": 0.047423636882671996, + "grad_norm": 1.8980032205581665, + "learning_rate": 4.9723122756378655e-05, + "loss": 5.185, + "step": 7974 + }, + { + "epoch": 0.04742958416595299, + "grad_norm": 1.9011166095733643, + "learning_rate": 4.9723053426832055e-05, + "loss": 5.2494, + "step": 7975 + }, + { + "epoch": 0.04743553144923399, + "grad_norm": 1.6457782983779907, + "learning_rate": 4.97229840886549e-05, + "loss": 5.4205, + "step": 7976 + }, + { + "epoch": 0.04744147873251499, + "grad_norm": 1.558515191078186, + "learning_rate": 4.9722914741847206e-05, + "loss": 5.2111, + "step": 7977 + }, + { + "epoch": 0.04744742601579598, + "grad_norm": 1.4780910015106201, + "learning_rate": 4.9722845386409e-05, + "loss": 5.3365, + "step": 7978 + }, + { + "epoch": 0.04745337329907698, + "grad_norm": 1.529249668121338, + "learning_rate": 4.9722776022340296e-05, + "loss": 5.1323, + "step": 7979 + }, + { + "epoch": 0.04745932058235798, + "grad_norm": 1.66848886013031, + "learning_rate": 4.972270664964113e-05, + "loss": 5.2057, + "step": 7980 + }, + { + "epoch": 0.047465267865638976, + "grad_norm": 1.5645034313201904, + "learning_rate": 4.972263726831152e-05, + "loss": 5.1537, + "step": 7981 + }, + { + "epoch": 0.04747121514891997, + "grad_norm": 1.8793894052505493, + "learning_rate": 4.9722567878351496e-05, + "loss": 5.4403, + "step": 7982 + }, + { + "epoch": 0.04747716243220097, + "grad_norm": 1.7316640615463257, + "learning_rate": 4.972249847976108e-05, + "loss": 5.3642, + "step": 7983 + }, + { + "epoch": 0.04748310971548197, + "grad_norm": 1.7195171117782593, + "learning_rate": 4.972242907254029e-05, + "loss": 5.2603, + "step": 7984 + }, + { + "epoch": 0.04748905699876296, + "grad_norm": 1.6860026121139526, + "learning_rate": 4.972235965668916e-05, + "loss": 5.356, + "step": 7985 + }, + { + "epoch": 0.047495004282043965, + "grad_norm": 1.5396910905838013, + "learning_rate": 4.972229023220771e-05, + "loss": 5.2566, + "step": 7986 + }, + { + "epoch": 0.04750095156532496, + "grad_norm": 1.694547176361084, + "learning_rate": 4.9722220799095956e-05, + "loss": 5.0897, + "step": 7987 + }, + { + "epoch": 0.047506898848605955, + "grad_norm": 1.7608548402786255, + "learning_rate": 4.972215135735394e-05, + "loss": 5.4084, + "step": 7988 + }, + { + "epoch": 0.04751284613188696, + "grad_norm": 1.697198748588562, + "learning_rate": 4.9722081906981675e-05, + "loss": 5.4133, + "step": 7989 + }, + { + "epoch": 0.04751879341516795, + "grad_norm": 1.6107436418533325, + "learning_rate": 4.972201244797918e-05, + "loss": 5.2839, + "step": 7990 + }, + { + "epoch": 0.04752474069844895, + "grad_norm": 1.8178008794784546, + "learning_rate": 4.972194298034649e-05, + "loss": 5.3722, + "step": 7991 + }, + { + "epoch": 0.04753068798172994, + "grad_norm": 1.6542725563049316, + "learning_rate": 4.972187350408363e-05, + "loss": 5.3434, + "step": 7992 + }, + { + "epoch": 0.047536635265010944, + "grad_norm": 1.8194152116775513, + "learning_rate": 4.972180401919061e-05, + "loss": 5.3763, + "step": 7993 + }, + { + "epoch": 0.04754258254829194, + "grad_norm": 1.890317678451538, + "learning_rate": 4.9721734525667476e-05, + "loss": 5.529, + "step": 7994 + }, + { + "epoch": 0.047548529831572935, + "grad_norm": 1.813226342201233, + "learning_rate": 4.972166502351423e-05, + "loss": 5.0826, + "step": 7995 + }, + { + "epoch": 0.04755447711485394, + "grad_norm": 1.7679328918457031, + "learning_rate": 4.9721595512730905e-05, + "loss": 5.3589, + "step": 7996 + }, + { + "epoch": 0.04756042439813493, + "grad_norm": 1.8390278816223145, + "learning_rate": 4.972152599331753e-05, + "loss": 5.1568, + "step": 7997 + }, + { + "epoch": 0.04756637168141593, + "grad_norm": 2.9323909282684326, + "learning_rate": 4.972145646527413e-05, + "loss": 5.6457, + "step": 7998 + }, + { + "epoch": 0.04757231896469693, + "grad_norm": 1.8839350938796997, + "learning_rate": 4.972138692860072e-05, + "loss": 5.1204, + "step": 7999 + }, + { + "epoch": 0.047578266247977924, + "grad_norm": 1.9047685861587524, + "learning_rate": 4.972131738329733e-05, + "loss": 5.2741, + "step": 8000 + }, + { + "epoch": 0.04758421353125892, + "grad_norm": 2.39807391166687, + "learning_rate": 4.972124782936398e-05, + "loss": 5.0134, + "step": 8001 + }, + { + "epoch": 0.04759016081453992, + "grad_norm": 2.197404146194458, + "learning_rate": 4.972117826680071e-05, + "loss": 5.3012, + "step": 8002 + }, + { + "epoch": 0.047596108097820916, + "grad_norm": 2.2648651599884033, + "learning_rate": 4.9721108695607515e-05, + "loss": 5.7196, + "step": 8003 + }, + { + "epoch": 0.04760205538110191, + "grad_norm": 1.7686847448349, + "learning_rate": 4.972103911578444e-05, + "loss": 5.4261, + "step": 8004 + }, + { + "epoch": 0.04760800266438291, + "grad_norm": 1.726653814315796, + "learning_rate": 4.972096952733152e-05, + "loss": 5.33, + "step": 8005 + }, + { + "epoch": 0.04761394994766391, + "grad_norm": 1.6855807304382324, + "learning_rate": 4.972089993024875e-05, + "loss": 5.2382, + "step": 8006 + }, + { + "epoch": 0.0476198972309449, + "grad_norm": 1.644954800605774, + "learning_rate": 4.972083032453617e-05, + "loss": 5.3309, + "step": 8007 + }, + { + "epoch": 0.0476258445142259, + "grad_norm": 1.8630400896072388, + "learning_rate": 4.9720760710193816e-05, + "loss": 5.282, + "step": 8008 + }, + { + "epoch": 0.0476317917975069, + "grad_norm": 1.862716555595398, + "learning_rate": 4.972069108722168e-05, + "loss": 5.3307, + "step": 8009 + }, + { + "epoch": 0.047637739080787896, + "grad_norm": 1.8025259971618652, + "learning_rate": 4.972062145561982e-05, + "loss": 5.2236, + "step": 8010 + }, + { + "epoch": 0.04764368636406889, + "grad_norm": 1.7213356494903564, + "learning_rate": 4.972055181538825e-05, + "loss": 5.0635, + "step": 8011 + }, + { + "epoch": 0.04764963364734989, + "grad_norm": 1.5237104892730713, + "learning_rate": 4.9720482166526986e-05, + "loss": 5.3089, + "step": 8012 + }, + { + "epoch": 0.04765558093063089, + "grad_norm": 1.628957748413086, + "learning_rate": 4.972041250903605e-05, + "loss": 5.2299, + "step": 8013 + }, + { + "epoch": 0.04766152821391188, + "grad_norm": 1.9217725992202759, + "learning_rate": 4.972034284291548e-05, + "loss": 5.2504, + "step": 8014 + }, + { + "epoch": 0.047667475497192885, + "grad_norm": 2.114549160003662, + "learning_rate": 4.97202731681653e-05, + "loss": 5.219, + "step": 8015 + }, + { + "epoch": 0.04767342278047388, + "grad_norm": 1.9268896579742432, + "learning_rate": 4.9720203484785525e-05, + "loss": 5.145, + "step": 8016 + }, + { + "epoch": 0.047679370063754875, + "grad_norm": 2.04050874710083, + "learning_rate": 4.9720133792776166e-05, + "loss": 5.354, + "step": 8017 + }, + { + "epoch": 0.04768531734703588, + "grad_norm": 1.8002599477767944, + "learning_rate": 4.972006409213728e-05, + "loss": 5.0547, + "step": 8018 + }, + { + "epoch": 0.04769126463031687, + "grad_norm": 1.9655365943908691, + "learning_rate": 4.9719994382868876e-05, + "loss": 5.2188, + "step": 8019 + }, + { + "epoch": 0.04769721191359787, + "grad_norm": 1.7188535928726196, + "learning_rate": 4.971992466497097e-05, + "loss": 5.1792, + "step": 8020 + }, + { + "epoch": 0.04770315919687886, + "grad_norm": 1.582184910774231, + "learning_rate": 4.97198549384436e-05, + "loss": 5.2295, + "step": 8021 + }, + { + "epoch": 0.047709106480159864, + "grad_norm": 1.4490164518356323, + "learning_rate": 4.971978520328677e-05, + "loss": 5.1677, + "step": 8022 + }, + { + "epoch": 0.04771505376344086, + "grad_norm": 1.472896695137024, + "learning_rate": 4.971971545950054e-05, + "loss": 4.9954, + "step": 8023 + }, + { + "epoch": 0.047721001046721855, + "grad_norm": 1.5845187902450562, + "learning_rate": 4.97196457070849e-05, + "loss": 5.1273, + "step": 8024 + }, + { + "epoch": 0.04772694833000286, + "grad_norm": 1.6418551206588745, + "learning_rate": 4.9719575946039887e-05, + "loss": 5.0835, + "step": 8025 + }, + { + "epoch": 0.04773289561328385, + "grad_norm": 1.379805088043213, + "learning_rate": 4.971950617636553e-05, + "loss": 5.1058, + "step": 8026 + }, + { + "epoch": 0.04773884289656485, + "grad_norm": 1.7939400672912598, + "learning_rate": 4.9719436398061835e-05, + "loss": 5.0105, + "step": 8027 + }, + { + "epoch": 0.04774479017984585, + "grad_norm": 1.5610185861587524, + "learning_rate": 4.971936661112886e-05, + "loss": 5.032, + "step": 8028 + }, + { + "epoch": 0.047750737463126844, + "grad_norm": 1.524402379989624, + "learning_rate": 4.9719296815566594e-05, + "loss": 5.1376, + "step": 8029 + }, + { + "epoch": 0.04775668474640784, + "grad_norm": 1.7448087930679321, + "learning_rate": 4.971922701137509e-05, + "loss": 4.9496, + "step": 8030 + }, + { + "epoch": 0.04776263202968884, + "grad_norm": 1.7382763624191284, + "learning_rate": 4.971915719855435e-05, + "loss": 4.9755, + "step": 8031 + }, + { + "epoch": 0.047768579312969836, + "grad_norm": 1.6728250980377197, + "learning_rate": 4.971908737710441e-05, + "loss": 5.1436, + "step": 8032 + }, + { + "epoch": 0.04777452659625083, + "grad_norm": 1.4256306886672974, + "learning_rate": 4.971901754702529e-05, + "loss": 4.9739, + "step": 8033 + }, + { + "epoch": 0.04778047387953183, + "grad_norm": 1.660714864730835, + "learning_rate": 4.971894770831702e-05, + "loss": 5.1337, + "step": 8034 + }, + { + "epoch": 0.04778642116281283, + "grad_norm": 1.5240182876586914, + "learning_rate": 4.9718877860979615e-05, + "loss": 5.1143, + "step": 8035 + }, + { + "epoch": 0.04779236844609382, + "grad_norm": 1.478852391242981, + "learning_rate": 4.971880800501311e-05, + "loss": 4.968, + "step": 8036 + }, + { + "epoch": 0.04779831572937482, + "grad_norm": 1.5343812704086304, + "learning_rate": 4.971873814041752e-05, + "loss": 4.9393, + "step": 8037 + }, + { + "epoch": 0.04780426301265582, + "grad_norm": 1.6728276014328003, + "learning_rate": 4.971866826719288e-05, + "loss": 5.0535, + "step": 8038 + }, + { + "epoch": 0.047810210295936816, + "grad_norm": 1.4831758737564087, + "learning_rate": 4.971859838533921e-05, + "loss": 5.0705, + "step": 8039 + }, + { + "epoch": 0.04781615757921781, + "grad_norm": 1.7412161827087402, + "learning_rate": 4.971852849485653e-05, + "loss": 4.9338, + "step": 8040 + }, + { + "epoch": 0.04782210486249881, + "grad_norm": 1.4696041345596313, + "learning_rate": 4.971845859574487e-05, + "loss": 5.0643, + "step": 8041 + }, + { + "epoch": 0.04782805214577981, + "grad_norm": 1.4190481901168823, + "learning_rate": 4.9718388688004235e-05, + "loss": 5.0743, + "step": 8042 + }, + { + "epoch": 0.0478339994290608, + "grad_norm": 1.513454556465149, + "learning_rate": 4.9718318771634686e-05, + "loss": 4.8832, + "step": 8043 + }, + { + "epoch": 0.047839946712341805, + "grad_norm": 1.7310774326324463, + "learning_rate": 4.9718248846636216e-05, + "loss": 4.957, + "step": 8044 + }, + { + "epoch": 0.0478458939956228, + "grad_norm": 1.4895838499069214, + "learning_rate": 4.971817891300886e-05, + "loss": 4.9121, + "step": 8045 + }, + { + "epoch": 0.047851841278903795, + "grad_norm": 1.6848632097244263, + "learning_rate": 4.9718108970752656e-05, + "loss": 5.1337, + "step": 8046 + }, + { + "epoch": 0.0478577885621848, + "grad_norm": 1.7145766019821167, + "learning_rate": 4.97180390198676e-05, + "loss": 5.1827, + "step": 8047 + }, + { + "epoch": 0.04786373584546579, + "grad_norm": 1.668140172958374, + "learning_rate": 4.971796906035374e-05, + "loss": 5.4071, + "step": 8048 + }, + { + "epoch": 0.04786968312874679, + "grad_norm": 1.6927748918533325, + "learning_rate": 4.9717899092211094e-05, + "loss": 5.4319, + "step": 8049 + }, + { + "epoch": 0.04787563041202778, + "grad_norm": 1.6696170568466187, + "learning_rate": 4.971782911543968e-05, + "loss": 5.4137, + "step": 8050 + }, + { + "epoch": 0.047881577695308784, + "grad_norm": 1.9299427270889282, + "learning_rate": 4.971775913003953e-05, + "loss": 5.6676, + "step": 8051 + }, + { + "epoch": 0.04788752497858978, + "grad_norm": 1.7163755893707275, + "learning_rate": 4.971768913601066e-05, + "loss": 5.2916, + "step": 8052 + }, + { + "epoch": 0.047893472261870774, + "grad_norm": 1.7822209596633911, + "learning_rate": 4.971761913335311e-05, + "loss": 5.6364, + "step": 8053 + }, + { + "epoch": 0.047899419545151777, + "grad_norm": 1.725375771522522, + "learning_rate": 4.971754912206689e-05, + "loss": 5.045, + "step": 8054 + }, + { + "epoch": 0.04790536682843277, + "grad_norm": 1.5243995189666748, + "learning_rate": 4.9717479102152027e-05, + "loss": 5.4691, + "step": 8055 + }, + { + "epoch": 0.04791131411171377, + "grad_norm": 1.6673872470855713, + "learning_rate": 4.971740907360854e-05, + "loss": 5.4851, + "step": 8056 + }, + { + "epoch": 0.04791726139499477, + "grad_norm": 1.6378693580627441, + "learning_rate": 4.971733903643647e-05, + "loss": 5.2574, + "step": 8057 + }, + { + "epoch": 0.047923208678275764, + "grad_norm": 1.484250545501709, + "learning_rate": 4.9717268990635835e-05, + "loss": 5.2988, + "step": 8058 + }, + { + "epoch": 0.04792915596155676, + "grad_norm": 1.626955270767212, + "learning_rate": 4.971719893620665e-05, + "loss": 5.3502, + "step": 8059 + }, + { + "epoch": 0.04793510324483776, + "grad_norm": 2.1421375274658203, + "learning_rate": 4.9717128873148954e-05, + "loss": 5.3006, + "step": 8060 + }, + { + "epoch": 0.047941050528118756, + "grad_norm": 1.5175740718841553, + "learning_rate": 4.971705880146276e-05, + "loss": 5.4144, + "step": 8061 + }, + { + "epoch": 0.04794699781139975, + "grad_norm": 1.6170361042022705, + "learning_rate": 4.9716988721148095e-05, + "loss": 5.3635, + "step": 8062 + }, + { + "epoch": 0.04795294509468075, + "grad_norm": 1.7269384860992432, + "learning_rate": 4.971691863220499e-05, + "loss": 5.2813, + "step": 8063 + }, + { + "epoch": 0.04795889237796175, + "grad_norm": 1.5144844055175781, + "learning_rate": 4.971684853463345e-05, + "loss": 5.3242, + "step": 8064 + }, + { + "epoch": 0.04796483966124274, + "grad_norm": 1.7125827074050903, + "learning_rate": 4.971677842843353e-05, + "loss": 5.2968, + "step": 8065 + }, + { + "epoch": 0.04797078694452374, + "grad_norm": 1.6067146062850952, + "learning_rate": 4.9716708313605234e-05, + "loss": 5.4446, + "step": 8066 + }, + { + "epoch": 0.04797673422780474, + "grad_norm": 1.8911150693893433, + "learning_rate": 4.9716638190148585e-05, + "loss": 5.1875, + "step": 8067 + }, + { + "epoch": 0.047982681511085735, + "grad_norm": 1.6865830421447754, + "learning_rate": 4.971656805806362e-05, + "loss": 5.1909, + "step": 8068 + }, + { + "epoch": 0.04798862879436673, + "grad_norm": 2.009566068649292, + "learning_rate": 4.9716497917350345e-05, + "loss": 4.9392, + "step": 8069 + }, + { + "epoch": 0.04799457607764773, + "grad_norm": 1.8578897714614868, + "learning_rate": 4.97164277680088e-05, + "loss": 5.3101, + "step": 8070 + }, + { + "epoch": 0.04800052336092873, + "grad_norm": 1.8935741186141968, + "learning_rate": 4.971635761003901e-05, + "loss": 5.3952, + "step": 8071 + }, + { + "epoch": 0.04800647064420972, + "grad_norm": 2.0030407905578613, + "learning_rate": 4.9716287443440994e-05, + "loss": 5.1685, + "step": 8072 + }, + { + "epoch": 0.048012417927490725, + "grad_norm": 2.0079195499420166, + "learning_rate": 4.9716217268214775e-05, + "loss": 5.4942, + "step": 8073 + }, + { + "epoch": 0.04801836521077172, + "grad_norm": 1.7105878591537476, + "learning_rate": 4.971614708436038e-05, + "loss": 5.4124, + "step": 8074 + }, + { + "epoch": 0.048024312494052715, + "grad_norm": 1.7642161846160889, + "learning_rate": 4.971607689187784e-05, + "loss": 5.3187, + "step": 8075 + }, + { + "epoch": 0.04803025977733372, + "grad_norm": 1.7304610013961792, + "learning_rate": 4.9716006690767165e-05, + "loss": 5.308, + "step": 8076 + }, + { + "epoch": 0.04803620706061471, + "grad_norm": 1.6714746952056885, + "learning_rate": 4.971593648102839e-05, + "loss": 5.4581, + "step": 8077 + }, + { + "epoch": 0.04804215434389571, + "grad_norm": 1.8008997440338135, + "learning_rate": 4.971586626266154e-05, + "loss": 5.3266, + "step": 8078 + }, + { + "epoch": 0.0480481016271767, + "grad_norm": 1.8691446781158447, + "learning_rate": 4.971579603566663e-05, + "loss": 5.2847, + "step": 8079 + }, + { + "epoch": 0.048054048910457704, + "grad_norm": 1.7805777788162231, + "learning_rate": 4.97157258000437e-05, + "loss": 5.446, + "step": 8080 + }, + { + "epoch": 0.0480599961937387, + "grad_norm": 1.4973244667053223, + "learning_rate": 4.971565555579275e-05, + "loss": 5.412, + "step": 8081 + }, + { + "epoch": 0.048065943477019694, + "grad_norm": 1.5994775295257568, + "learning_rate": 4.971558530291384e-05, + "loss": 5.3285, + "step": 8082 + }, + { + "epoch": 0.048071890760300696, + "grad_norm": 1.7743935585021973, + "learning_rate": 4.971551504140696e-05, + "loss": 5.326, + "step": 8083 + }, + { + "epoch": 0.04807783804358169, + "grad_norm": 1.5922112464904785, + "learning_rate": 4.9715444771272154e-05, + "loss": 5.3338, + "step": 8084 + }, + { + "epoch": 0.04808378532686269, + "grad_norm": 1.5587191581726074, + "learning_rate": 4.971537449250944e-05, + "loss": 5.2437, + "step": 8085 + }, + { + "epoch": 0.04808973261014369, + "grad_norm": 1.4972636699676514, + "learning_rate": 4.971530420511884e-05, + "loss": 5.2271, + "step": 8086 + }, + { + "epoch": 0.048095679893424684, + "grad_norm": 1.6221843957901, + "learning_rate": 4.971523390910039e-05, + "loss": 5.3225, + "step": 8087 + }, + { + "epoch": 0.04810162717670568, + "grad_norm": 1.5826990604400635, + "learning_rate": 4.971516360445411e-05, + "loss": 5.2955, + "step": 8088 + }, + { + "epoch": 0.04810757445998668, + "grad_norm": 1.729963779449463, + "learning_rate": 4.971509329118001e-05, + "loss": 5.3263, + "step": 8089 + }, + { + "epoch": 0.048113521743267676, + "grad_norm": 1.680851697921753, + "learning_rate": 4.971502296927813e-05, + "loss": 5.3579, + "step": 8090 + }, + { + "epoch": 0.04811946902654867, + "grad_norm": 2.028024673461914, + "learning_rate": 4.9714952638748504e-05, + "loss": 5.3632, + "step": 8091 + }, + { + "epoch": 0.04812541630982967, + "grad_norm": 1.6236159801483154, + "learning_rate": 4.9714882299591127e-05, + "loss": 5.222, + "step": 8092 + }, + { + "epoch": 0.04813136359311067, + "grad_norm": 1.7522811889648438, + "learning_rate": 4.971481195180605e-05, + "loss": 5.3752, + "step": 8093 + }, + { + "epoch": 0.04813731087639166, + "grad_norm": 1.7108362913131714, + "learning_rate": 4.9714741595393274e-05, + "loss": 5.2994, + "step": 8094 + }, + { + "epoch": 0.04814325815967266, + "grad_norm": 1.7863954305648804, + "learning_rate": 4.971467123035285e-05, + "loss": 5.2386, + "step": 8095 + }, + { + "epoch": 0.04814920544295366, + "grad_norm": 2.0054473876953125, + "learning_rate": 4.971460085668479e-05, + "loss": 5.3565, + "step": 8096 + }, + { + "epoch": 0.048155152726234655, + "grad_norm": 1.6878743171691895, + "learning_rate": 4.971453047438911e-05, + "loss": 5.3448, + "step": 8097 + }, + { + "epoch": 0.04816110000951565, + "grad_norm": 1.8534557819366455, + "learning_rate": 4.971446008346585e-05, + "loss": 5.1446, + "step": 8098 + }, + { + "epoch": 0.04816704729279665, + "grad_norm": 1.8549425601959229, + "learning_rate": 4.9714389683915025e-05, + "loss": 5.2433, + "step": 8099 + }, + { + "epoch": 0.04817299457607765, + "grad_norm": 1.5624927282333374, + "learning_rate": 4.9714319275736666e-05, + "loss": 5.0645, + "step": 8100 + }, + { + "epoch": 0.04817894185935864, + "grad_norm": 1.670462965965271, + "learning_rate": 4.971424885893078e-05, + "loss": 5.1213, + "step": 8101 + }, + { + "epoch": 0.048184889142639645, + "grad_norm": 2.039595603942871, + "learning_rate": 4.9714178433497414e-05, + "loss": 5.1797, + "step": 8102 + }, + { + "epoch": 0.04819083642592064, + "grad_norm": 1.9546380043029785, + "learning_rate": 4.971410799943659e-05, + "loss": 5.2432, + "step": 8103 + }, + { + "epoch": 0.048196783709201635, + "grad_norm": 1.892397403717041, + "learning_rate": 4.971403755674832e-05, + "loss": 5.1775, + "step": 8104 + }, + { + "epoch": 0.04820273099248264, + "grad_norm": 1.7021955251693726, + "learning_rate": 4.971396710543263e-05, + "loss": 5.2242, + "step": 8105 + }, + { + "epoch": 0.04820867827576363, + "grad_norm": 1.7652686834335327, + "learning_rate": 4.9713896645489556e-05, + "loss": 5.1419, + "step": 8106 + }, + { + "epoch": 0.04821462555904463, + "grad_norm": 1.8669620752334595, + "learning_rate": 4.971382617691911e-05, + "loss": 5.1392, + "step": 8107 + }, + { + "epoch": 0.04822057284232562, + "grad_norm": 1.8774491548538208, + "learning_rate": 4.971375569972133e-05, + "loss": 5.1853, + "step": 8108 + }, + { + "epoch": 0.048226520125606624, + "grad_norm": 1.6108628511428833, + "learning_rate": 4.971368521389623e-05, + "loss": 5.4858, + "step": 8109 + }, + { + "epoch": 0.04823246740888762, + "grad_norm": 1.6839191913604736, + "learning_rate": 4.9713614719443835e-05, + "loss": 5.4217, + "step": 8110 + }, + { + "epoch": 0.048238414692168614, + "grad_norm": 1.9300925731658936, + "learning_rate": 4.9713544216364176e-05, + "loss": 5.2259, + "step": 8111 + }, + { + "epoch": 0.048244361975449616, + "grad_norm": 1.9142355918884277, + "learning_rate": 4.971347370465728e-05, + "loss": 5.2, + "step": 8112 + }, + { + "epoch": 0.04825030925873061, + "grad_norm": 1.8046603202819824, + "learning_rate": 4.971340318432315e-05, + "loss": 5.0951, + "step": 8113 + }, + { + "epoch": 0.04825625654201161, + "grad_norm": 1.9129396677017212, + "learning_rate": 4.971333265536184e-05, + "loss": 5.0376, + "step": 8114 + }, + { + "epoch": 0.04826220382529261, + "grad_norm": 1.6774524450302124, + "learning_rate": 4.971326211777335e-05, + "loss": 5.4313, + "step": 8115 + }, + { + "epoch": 0.048268151108573604, + "grad_norm": 1.8156472444534302, + "learning_rate": 4.971319157155773e-05, + "loss": 5.4336, + "step": 8116 + }, + { + "epoch": 0.0482740983918546, + "grad_norm": 1.5704171657562256, + "learning_rate": 4.9713121016714976e-05, + "loss": 5.6878, + "step": 8117 + }, + { + "epoch": 0.0482800456751356, + "grad_norm": 1.585528016090393, + "learning_rate": 4.9713050453245135e-05, + "loss": 5.6208, + "step": 8118 + }, + { + "epoch": 0.048285992958416596, + "grad_norm": 1.3975930213928223, + "learning_rate": 4.9712979881148215e-05, + "loss": 5.8001, + "step": 8119 + }, + { + "epoch": 0.04829194024169759, + "grad_norm": 1.8124761581420898, + "learning_rate": 4.971290930042426e-05, + "loss": 5.6006, + "step": 8120 + }, + { + "epoch": 0.04829788752497859, + "grad_norm": 1.8448232412338257, + "learning_rate": 4.971283871107327e-05, + "loss": 5.4324, + "step": 8121 + }, + { + "epoch": 0.04830383480825959, + "grad_norm": 1.772218108177185, + "learning_rate": 4.97127681130953e-05, + "loss": 6.0943, + "step": 8122 + }, + { + "epoch": 0.04830978209154058, + "grad_norm": 2.038703441619873, + "learning_rate": 4.9712697506490345e-05, + "loss": 5.4224, + "step": 8123 + }, + { + "epoch": 0.04831572937482158, + "grad_norm": 1.576430320739746, + "learning_rate": 4.971262689125845e-05, + "loss": 5.351, + "step": 8124 + }, + { + "epoch": 0.04832167665810258, + "grad_norm": 1.857021450996399, + "learning_rate": 4.971255626739963e-05, + "loss": 5.258, + "step": 8125 + }, + { + "epoch": 0.048327623941383575, + "grad_norm": 1.7989404201507568, + "learning_rate": 4.971248563491391e-05, + "loss": 5.3925, + "step": 8126 + }, + { + "epoch": 0.04833357122466457, + "grad_norm": 1.8104023933410645, + "learning_rate": 4.9712414993801314e-05, + "loss": 5.4326, + "step": 8127 + }, + { + "epoch": 0.04833951850794557, + "grad_norm": 1.898054838180542, + "learning_rate": 4.971234434406188e-05, + "loss": 5.2094, + "step": 8128 + }, + { + "epoch": 0.04834546579122657, + "grad_norm": 1.436633586883545, + "learning_rate": 4.971227368569561e-05, + "loss": 5.2994, + "step": 8129 + }, + { + "epoch": 0.04835141307450756, + "grad_norm": 1.4576120376586914, + "learning_rate": 4.971220301870255e-05, + "loss": 5.3504, + "step": 8130 + }, + { + "epoch": 0.048357360357788565, + "grad_norm": 1.7260229587554932, + "learning_rate": 4.971213234308271e-05, + "loss": 5.1083, + "step": 8131 + }, + { + "epoch": 0.04836330764106956, + "grad_norm": 1.8110415935516357, + "learning_rate": 4.971206165883612e-05, + "loss": 5.1298, + "step": 8132 + }, + { + "epoch": 0.048369254924350555, + "grad_norm": 2.1696786880493164, + "learning_rate": 4.9711990965962804e-05, + "loss": 5.8155, + "step": 8133 + }, + { + "epoch": 0.04837520220763156, + "grad_norm": 1.9905856847763062, + "learning_rate": 4.971192026446279e-05, + "loss": 5.5814, + "step": 8134 + }, + { + "epoch": 0.04838114949091255, + "grad_norm": 1.7459521293640137, + "learning_rate": 4.97118495543361e-05, + "loss": 5.4358, + "step": 8135 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 1.8495198488235474, + "learning_rate": 4.9711778835582756e-05, + "loss": 5.3652, + "step": 8136 + }, + { + "epoch": 0.04839304405747455, + "grad_norm": 1.782850742340088, + "learning_rate": 4.971170810820279e-05, + "loss": 5.2361, + "step": 8137 + }, + { + "epoch": 0.048398991340755544, + "grad_norm": 1.7327016592025757, + "learning_rate": 4.971163737219622e-05, + "loss": 5.0802, + "step": 8138 + }, + { + "epoch": 0.04840493862403654, + "grad_norm": 1.663620114326477, + "learning_rate": 4.9711566627563066e-05, + "loss": 5.1566, + "step": 8139 + }, + { + "epoch": 0.048410885907317534, + "grad_norm": 1.5109026432037354, + "learning_rate": 4.971149587430336e-05, + "loss": 5.1499, + "step": 8140 + }, + { + "epoch": 0.048416833190598536, + "grad_norm": 1.3494226932525635, + "learning_rate": 4.971142511241714e-05, + "loss": 5.1684, + "step": 8141 + }, + { + "epoch": 0.04842278047387953, + "grad_norm": 1.721880555152893, + "learning_rate": 4.97113543419044e-05, + "loss": 5.0199, + "step": 8142 + }, + { + "epoch": 0.048428727757160527, + "grad_norm": 1.7465516328811646, + "learning_rate": 4.971128356276519e-05, + "loss": 5.1181, + "step": 8143 + }, + { + "epoch": 0.04843467504044153, + "grad_norm": 1.8127025365829468, + "learning_rate": 4.971121277499953e-05, + "loss": 5.6514, + "step": 8144 + }, + { + "epoch": 0.048440622323722524, + "grad_norm": 1.6027450561523438, + "learning_rate": 4.971114197860743e-05, + "loss": 5.3408, + "step": 8145 + }, + { + "epoch": 0.04844656960700352, + "grad_norm": 1.6985208988189697, + "learning_rate": 4.971107117358894e-05, + "loss": 5.2002, + "step": 8146 + }, + { + "epoch": 0.04845251689028452, + "grad_norm": 1.681305170059204, + "learning_rate": 4.971100035994406e-05, + "loss": 5.1389, + "step": 8147 + }, + { + "epoch": 0.048458464173565516, + "grad_norm": 1.6053674221038818, + "learning_rate": 4.971092953767282e-05, + "loss": 5.0665, + "step": 8148 + }, + { + "epoch": 0.04846441145684651, + "grad_norm": 1.743134617805481, + "learning_rate": 4.9710858706775266e-05, + "loss": 5.1427, + "step": 8149 + }, + { + "epoch": 0.04847035874012751, + "grad_norm": 1.4901342391967773, + "learning_rate": 4.9710787867251396e-05, + "loss": 5.1957, + "step": 8150 + }, + { + "epoch": 0.04847630602340851, + "grad_norm": 1.6003857851028442, + "learning_rate": 4.971071701910125e-05, + "loss": 5.0658, + "step": 8151 + }, + { + "epoch": 0.0484822533066895, + "grad_norm": 1.7036428451538086, + "learning_rate": 4.971064616232484e-05, + "loss": 5.0823, + "step": 8152 + }, + { + "epoch": 0.0484882005899705, + "grad_norm": 1.5894789695739746, + "learning_rate": 4.97105752969222e-05, + "loss": 5.093, + "step": 8153 + }, + { + "epoch": 0.0484941478732515, + "grad_norm": 1.487648367881775, + "learning_rate": 4.9710504422893364e-05, + "loss": 5.0089, + "step": 8154 + }, + { + "epoch": 0.048500095156532495, + "grad_norm": 2.0251479148864746, + "learning_rate": 4.971043354023834e-05, + "loss": 5.0552, + "step": 8155 + }, + { + "epoch": 0.04850604243981349, + "grad_norm": 1.7097325325012207, + "learning_rate": 4.971036264895715e-05, + "loss": 5.2737, + "step": 8156 + }, + { + "epoch": 0.04851198972309449, + "grad_norm": 1.784836769104004, + "learning_rate": 4.971029174904984e-05, + "loss": 5.2863, + "step": 8157 + }, + { + "epoch": 0.04851793700637549, + "grad_norm": 1.4765781164169312, + "learning_rate": 4.9710220840516416e-05, + "loss": 5.4057, + "step": 8158 + }, + { + "epoch": 0.04852388428965648, + "grad_norm": 1.4173041582107544, + "learning_rate": 4.9710149923356915e-05, + "loss": 5.187, + "step": 8159 + }, + { + "epoch": 0.048529831572937485, + "grad_norm": 1.488173007965088, + "learning_rate": 4.971007899757135e-05, + "loss": 4.975, + "step": 8160 + }, + { + "epoch": 0.04853577885621848, + "grad_norm": 1.391435980796814, + "learning_rate": 4.9710008063159756e-05, + "loss": 5.0782, + "step": 8161 + }, + { + "epoch": 0.048541726139499475, + "grad_norm": 1.7100436687469482, + "learning_rate": 4.970993712012215e-05, + "loss": 5.4953, + "step": 8162 + }, + { + "epoch": 0.04854767342278048, + "grad_norm": 1.8748459815979004, + "learning_rate": 4.970986616845856e-05, + "loss": 5.4535, + "step": 8163 + }, + { + "epoch": 0.04855362070606147, + "grad_norm": 1.901802897453308, + "learning_rate": 4.970979520816902e-05, + "loss": 5.3619, + "step": 8164 + }, + { + "epoch": 0.04855956798934247, + "grad_norm": 1.9850586652755737, + "learning_rate": 4.970972423925354e-05, + "loss": 5.039, + "step": 8165 + }, + { + "epoch": 0.04856551527262347, + "grad_norm": 1.5195177793502808, + "learning_rate": 4.970965326171214e-05, + "loss": 5.1721, + "step": 8166 + }, + { + "epoch": 0.048571462555904464, + "grad_norm": 1.4180214405059814, + "learning_rate": 4.9709582275544866e-05, + "loss": 5.2319, + "step": 8167 + }, + { + "epoch": 0.04857740983918546, + "grad_norm": 1.3797354698181152, + "learning_rate": 4.970951128075173e-05, + "loss": 5.1813, + "step": 8168 + }, + { + "epoch": 0.048583357122466454, + "grad_norm": 1.6448336839675903, + "learning_rate": 4.970944027733276e-05, + "loss": 5.1968, + "step": 8169 + }, + { + "epoch": 0.048589304405747456, + "grad_norm": 1.6626337766647339, + "learning_rate": 4.9709369265287986e-05, + "loss": 5.1303, + "step": 8170 + }, + { + "epoch": 0.04859525168902845, + "grad_norm": 1.5715514421463013, + "learning_rate": 4.970929824461742e-05, + "loss": 5.1609, + "step": 8171 + }, + { + "epoch": 0.048601198972309446, + "grad_norm": 1.5971697568893433, + "learning_rate": 4.970922721532108e-05, + "loss": 5.1489, + "step": 8172 + }, + { + "epoch": 0.04860714625559045, + "grad_norm": 1.6784114837646484, + "learning_rate": 4.970915617739903e-05, + "loss": 5.2778, + "step": 8173 + }, + { + "epoch": 0.048613093538871444, + "grad_norm": 1.7507476806640625, + "learning_rate": 4.970908513085125e-05, + "loss": 5.5719, + "step": 8174 + }, + { + "epoch": 0.04861904082215244, + "grad_norm": 1.7017735242843628, + "learning_rate": 4.970901407567779e-05, + "loss": 5.5197, + "step": 8175 + }, + { + "epoch": 0.04862498810543344, + "grad_norm": 1.8569817543029785, + "learning_rate": 4.9708943011878674e-05, + "loss": 5.3823, + "step": 8176 + }, + { + "epoch": 0.048630935388714436, + "grad_norm": 1.5183817148208618, + "learning_rate": 4.970887193945391e-05, + "loss": 5.5518, + "step": 8177 + }, + { + "epoch": 0.04863688267199543, + "grad_norm": 1.4175498485565186, + "learning_rate": 4.970880085840354e-05, + "loss": 5.4526, + "step": 8178 + }, + { + "epoch": 0.04864282995527643, + "grad_norm": 1.7228561639785767, + "learning_rate": 4.970872976872758e-05, + "loss": 5.5162, + "step": 8179 + }, + { + "epoch": 0.04864877723855743, + "grad_norm": 2.043182849884033, + "learning_rate": 4.970865867042606e-05, + "loss": 5.4212, + "step": 8180 + }, + { + "epoch": 0.04865472452183842, + "grad_norm": 1.377565622329712, + "learning_rate": 4.970858756349901e-05, + "loss": 5.2817, + "step": 8181 + }, + { + "epoch": 0.04866067180511942, + "grad_norm": 1.6977208852767944, + "learning_rate": 4.970851644794643e-05, + "loss": 5.4081, + "step": 8182 + }, + { + "epoch": 0.04866661908840042, + "grad_norm": 1.3136184215545654, + "learning_rate": 4.970844532376838e-05, + "loss": 5.4272, + "step": 8183 + }, + { + "epoch": 0.048672566371681415, + "grad_norm": 1.8863121271133423, + "learning_rate": 4.9708374190964854e-05, + "loss": 5.441, + "step": 8184 + }, + { + "epoch": 0.04867851365496241, + "grad_norm": 1.6755374670028687, + "learning_rate": 4.97083030495359e-05, + "loss": 5.5045, + "step": 8185 + }, + { + "epoch": 0.04868446093824341, + "grad_norm": 1.8439961671829224, + "learning_rate": 4.970823189948153e-05, + "loss": 5.5252, + "step": 8186 + }, + { + "epoch": 0.04869040822152441, + "grad_norm": 1.9662889242172241, + "learning_rate": 4.9708160740801765e-05, + "loss": 5.4379, + "step": 8187 + }, + { + "epoch": 0.0486963555048054, + "grad_norm": 1.691857099533081, + "learning_rate": 4.970808957349664e-05, + "loss": 5.3652, + "step": 8188 + }, + { + "epoch": 0.048702302788086405, + "grad_norm": 1.7482357025146484, + "learning_rate": 4.970801839756618e-05, + "loss": 5.1436, + "step": 8189 + }, + { + "epoch": 0.0487082500713674, + "grad_norm": 1.9221199750900269, + "learning_rate": 4.9707947213010396e-05, + "loss": 5.1936, + "step": 8190 + }, + { + "epoch": 0.048714197354648395, + "grad_norm": 1.9124062061309814, + "learning_rate": 4.970787601982933e-05, + "loss": 5.28, + "step": 8191 + }, + { + "epoch": 0.0487201446379294, + "grad_norm": 1.8999123573303223, + "learning_rate": 4.9707804818023e-05, + "loss": 5.3262, + "step": 8192 + }, + { + "epoch": 0.04872609192121039, + "grad_norm": 1.7711995840072632, + "learning_rate": 4.970773360759143e-05, + "loss": 5.1764, + "step": 8193 + }, + { + "epoch": 0.04873203920449139, + "grad_norm": 2.122689962387085, + "learning_rate": 4.970766238853465e-05, + "loss": 5.4345, + "step": 8194 + }, + { + "epoch": 0.04873798648777239, + "grad_norm": 2.1027848720550537, + "learning_rate": 4.9707591160852675e-05, + "loss": 5.4547, + "step": 8195 + }, + { + "epoch": 0.048743933771053384, + "grad_norm": 1.6944631338119507, + "learning_rate": 4.970751992454553e-05, + "loss": 5.3638, + "step": 8196 + }, + { + "epoch": 0.04874988105433438, + "grad_norm": 1.7444918155670166, + "learning_rate": 4.9707448679613256e-05, + "loss": 5.2378, + "step": 8197 + }, + { + "epoch": 0.048755828337615374, + "grad_norm": 1.8864104747772217, + "learning_rate": 4.970737742605586e-05, + "loss": 5.3142, + "step": 8198 + }, + { + "epoch": 0.048761775620896376, + "grad_norm": 1.968748927116394, + "learning_rate": 4.970730616387338e-05, + "loss": 5.0824, + "step": 8199 + }, + { + "epoch": 0.04876772290417737, + "grad_norm": 2.166405439376831, + "learning_rate": 4.9707234893065824e-05, + "loss": 5.0999, + "step": 8200 + }, + { + "epoch": 0.048773670187458366, + "grad_norm": 1.9185746908187866, + "learning_rate": 4.970716361363323e-05, + "loss": 5.1465, + "step": 8201 + }, + { + "epoch": 0.04877961747073937, + "grad_norm": 1.9191651344299316, + "learning_rate": 4.9707092325575635e-05, + "loss": 5.0713, + "step": 8202 + }, + { + "epoch": 0.048785564754020364, + "grad_norm": 1.6470153331756592, + "learning_rate": 4.9707021028893034e-05, + "loss": 5.0816, + "step": 8203 + }, + { + "epoch": 0.04879151203730136, + "grad_norm": 1.6995042562484741, + "learning_rate": 4.9706949723585475e-05, + "loss": 5.0207, + "step": 8204 + }, + { + "epoch": 0.04879745932058236, + "grad_norm": 1.8208703994750977, + "learning_rate": 4.970687840965297e-05, + "loss": 4.9789, + "step": 8205 + }, + { + "epoch": 0.048803406603863356, + "grad_norm": 1.8558207750320435, + "learning_rate": 4.9706807087095555e-05, + "loss": 5.0655, + "step": 8206 + }, + { + "epoch": 0.04880935388714435, + "grad_norm": 1.6349478960037231, + "learning_rate": 4.9706735755913234e-05, + "loss": 5.2657, + "step": 8207 + }, + { + "epoch": 0.04881530117042535, + "grad_norm": 1.587143063545227, + "learning_rate": 4.9706664416106065e-05, + "loss": 5.0765, + "step": 8208 + }, + { + "epoch": 0.04882124845370635, + "grad_norm": 1.8467018604278564, + "learning_rate": 4.9706593067674047e-05, + "loss": 5.1458, + "step": 8209 + }, + { + "epoch": 0.04882719573698734, + "grad_norm": 1.8066186904907227, + "learning_rate": 4.9706521710617214e-05, + "loss": 5.0656, + "step": 8210 + }, + { + "epoch": 0.04883314302026834, + "grad_norm": 1.7981528043746948, + "learning_rate": 4.9706450344935586e-05, + "loss": 5.1448, + "step": 8211 + }, + { + "epoch": 0.04883909030354934, + "grad_norm": 1.8924201726913452, + "learning_rate": 4.97063789706292e-05, + "loss": 4.748, + "step": 8212 + }, + { + "epoch": 0.048845037586830335, + "grad_norm": 2.091324806213379, + "learning_rate": 4.9706307587698064e-05, + "loss": 5.6537, + "step": 8213 + }, + { + "epoch": 0.04885098487011133, + "grad_norm": 3.1737043857574463, + "learning_rate": 4.970623619614221e-05, + "loss": 5.6898, + "step": 8214 + }, + { + "epoch": 0.04885693215339233, + "grad_norm": 2.194577932357788, + "learning_rate": 4.970616479596167e-05, + "loss": 5.4958, + "step": 8215 + }, + { + "epoch": 0.04886287943667333, + "grad_norm": 2.2362759113311768, + "learning_rate": 4.970609338715646e-05, + "loss": 4.9919, + "step": 8216 + }, + { + "epoch": 0.04886882671995432, + "grad_norm": 1.703684687614441, + "learning_rate": 4.970602196972661e-05, + "loss": 4.8733, + "step": 8217 + }, + { + "epoch": 0.048874774003235325, + "grad_norm": 2.0205307006835938, + "learning_rate": 4.970595054367214e-05, + "loss": 5.1177, + "step": 8218 + }, + { + "epoch": 0.04888072128651632, + "grad_norm": 2.1270928382873535, + "learning_rate": 4.970587910899308e-05, + "loss": 5.6208, + "step": 8219 + }, + { + "epoch": 0.048886668569797315, + "grad_norm": 1.8992488384246826, + "learning_rate": 4.9705807665689455e-05, + "loss": 5.7754, + "step": 8220 + }, + { + "epoch": 0.04889261585307832, + "grad_norm": 2.279099225997925, + "learning_rate": 4.9705736213761286e-05, + "loss": 5.5924, + "step": 8221 + }, + { + "epoch": 0.04889856313635931, + "grad_norm": 1.9186346530914307, + "learning_rate": 4.9705664753208594e-05, + "loss": 5.9424, + "step": 8222 + }, + { + "epoch": 0.04890451041964031, + "grad_norm": 2.0286009311676025, + "learning_rate": 4.970559328403141e-05, + "loss": 5.8461, + "step": 8223 + }, + { + "epoch": 0.04891045770292131, + "grad_norm": 1.797555685043335, + "learning_rate": 4.970552180622977e-05, + "loss": 5.4929, + "step": 8224 + }, + { + "epoch": 0.048916404986202304, + "grad_norm": 2.4879684448242188, + "learning_rate": 4.970545031980368e-05, + "loss": 5.5253, + "step": 8225 + }, + { + "epoch": 0.0489223522694833, + "grad_norm": 2.749763011932373, + "learning_rate": 4.970537882475318e-05, + "loss": 5.6001, + "step": 8226 + }, + { + "epoch": 0.048928299552764294, + "grad_norm": 2.2076292037963867, + "learning_rate": 4.970530732107827e-05, + "loss": 5.5876, + "step": 8227 + }, + { + "epoch": 0.048934246836045296, + "grad_norm": 2.6566662788391113, + "learning_rate": 4.970523580877901e-05, + "loss": 5.7151, + "step": 8228 + }, + { + "epoch": 0.04894019411932629, + "grad_norm": 2.4873850345611572, + "learning_rate": 4.97051642878554e-05, + "loss": 5.7124, + "step": 8229 + }, + { + "epoch": 0.048946141402607286, + "grad_norm": 1.8365200757980347, + "learning_rate": 4.970509275830748e-05, + "loss": 5.292, + "step": 8230 + }, + { + "epoch": 0.04895208868588829, + "grad_norm": 2.064730644226074, + "learning_rate": 4.9705021220135254e-05, + "loss": 5.2854, + "step": 8231 + }, + { + "epoch": 0.04895803596916928, + "grad_norm": 1.969298005104065, + "learning_rate": 4.970494967333877e-05, + "loss": 5.2113, + "step": 8232 + }, + { + "epoch": 0.04896398325245028, + "grad_norm": 1.8438071012496948, + "learning_rate": 4.9704878117918044e-05, + "loss": 5.2281, + "step": 8233 + }, + { + "epoch": 0.04896993053573128, + "grad_norm": 1.9163525104522705, + "learning_rate": 4.97048065538731e-05, + "loss": 5.043, + "step": 8234 + }, + { + "epoch": 0.048975877819012276, + "grad_norm": 1.802356243133545, + "learning_rate": 4.970473498120395e-05, + "loss": 5.2079, + "step": 8235 + }, + { + "epoch": 0.04898182510229327, + "grad_norm": 1.7572704553604126, + "learning_rate": 4.9704663399910645e-05, + "loss": 5.1119, + "step": 8236 + }, + { + "epoch": 0.04898777238557427, + "grad_norm": 1.848747730255127, + "learning_rate": 4.970459180999319e-05, + "loss": 5.0233, + "step": 8237 + }, + { + "epoch": 0.04899371966885527, + "grad_norm": 2.023036003112793, + "learning_rate": 4.9704520211451624e-05, + "loss": 5.2793, + "step": 8238 + }, + { + "epoch": 0.04899966695213626, + "grad_norm": 1.6738852262496948, + "learning_rate": 4.9704448604285965e-05, + "loss": 5.5255, + "step": 8239 + }, + { + "epoch": 0.04900561423541726, + "grad_norm": 1.6676057577133179, + "learning_rate": 4.970437698849624e-05, + "loss": 5.4287, + "step": 8240 + }, + { + "epoch": 0.04901156151869826, + "grad_norm": 1.9960590600967407, + "learning_rate": 4.970430536408247e-05, + "loss": 5.2939, + "step": 8241 + }, + { + "epoch": 0.049017508801979255, + "grad_norm": 2.7218708992004395, + "learning_rate": 4.9704233731044675e-05, + "loss": 5.9019, + "step": 8242 + }, + { + "epoch": 0.04902345608526025, + "grad_norm": 2.385664224624634, + "learning_rate": 4.970416208938289e-05, + "loss": 5.9146, + "step": 8243 + }, + { + "epoch": 0.04902940336854125, + "grad_norm": 2.2598092555999756, + "learning_rate": 4.970409043909714e-05, + "loss": 5.7451, + "step": 8244 + }, + { + "epoch": 0.04903535065182225, + "grad_norm": 2.3063299655914307, + "learning_rate": 4.970401878018745e-05, + "loss": 5.8675, + "step": 8245 + }, + { + "epoch": 0.04904129793510324, + "grad_norm": 2.1543853282928467, + "learning_rate": 4.9703947112653836e-05, + "loss": 5.9136, + "step": 8246 + }, + { + "epoch": 0.049047245218384244, + "grad_norm": 2.267531633377075, + "learning_rate": 4.970387543649634e-05, + "loss": 5.6834, + "step": 8247 + }, + { + "epoch": 0.04905319250166524, + "grad_norm": 2.047351121902466, + "learning_rate": 4.970380375171496e-05, + "loss": 5.5754, + "step": 8248 + }, + { + "epoch": 0.049059139784946235, + "grad_norm": 2.2565114498138428, + "learning_rate": 4.9703732058309745e-05, + "loss": 5.7067, + "step": 8249 + }, + { + "epoch": 0.04906508706822724, + "grad_norm": 1.7584022283554077, + "learning_rate": 4.970366035628073e-05, + "loss": 5.3926, + "step": 8250 + }, + { + "epoch": 0.04907103435150823, + "grad_norm": 1.9898183345794678, + "learning_rate": 4.9703588645627896e-05, + "loss": 5.7163, + "step": 8251 + }, + { + "epoch": 0.04907698163478923, + "grad_norm": 2.4134786128997803, + "learning_rate": 4.970351692635131e-05, + "loss": 5.672, + "step": 8252 + }, + { + "epoch": 0.04908292891807023, + "grad_norm": 2.1059436798095703, + "learning_rate": 4.970344519845097e-05, + "loss": 5.7719, + "step": 8253 + }, + { + "epoch": 0.049088876201351224, + "grad_norm": 2.0731539726257324, + "learning_rate": 4.970337346192692e-05, + "loss": 5.7104, + "step": 8254 + }, + { + "epoch": 0.04909482348463222, + "grad_norm": 2.3058536052703857, + "learning_rate": 4.970330171677918e-05, + "loss": 5.7435, + "step": 8255 + }, + { + "epoch": 0.049100770767913214, + "grad_norm": 2.051424980163574, + "learning_rate": 4.970322996300777e-05, + "loss": 5.7371, + "step": 8256 + }, + { + "epoch": 0.049106718051194216, + "grad_norm": 2.1715517044067383, + "learning_rate": 4.970315820061271e-05, + "loss": 5.5805, + "step": 8257 + }, + { + "epoch": 0.04911266533447521, + "grad_norm": 2.136617422103882, + "learning_rate": 4.9703086429594034e-05, + "loss": 5.8689, + "step": 8258 + }, + { + "epoch": 0.049118612617756206, + "grad_norm": 1.7089059352874756, + "learning_rate": 4.970301464995178e-05, + "loss": 6.0614, + "step": 8259 + }, + { + "epoch": 0.04912455990103721, + "grad_norm": 2.410067319869995, + "learning_rate": 4.970294286168595e-05, + "loss": 5.8762, + "step": 8260 + }, + { + "epoch": 0.0491305071843182, + "grad_norm": 2.2186291217803955, + "learning_rate": 4.970287106479657e-05, + "loss": 5.4903, + "step": 8261 + }, + { + "epoch": 0.0491364544675992, + "grad_norm": 2.312793016433716, + "learning_rate": 4.970279925928368e-05, + "loss": 6.2488, + "step": 8262 + }, + { + "epoch": 0.0491424017508802, + "grad_norm": 2.127859354019165, + "learning_rate": 4.9702727445147305e-05, + "loss": 5.9976, + "step": 8263 + }, + { + "epoch": 0.049148349034161196, + "grad_norm": 2.604367733001709, + "learning_rate": 4.9702655622387454e-05, + "loss": 5.4153, + "step": 8264 + }, + { + "epoch": 0.04915429631744219, + "grad_norm": 1.7832142114639282, + "learning_rate": 4.9702583791004165e-05, + "loss": 5.4024, + "step": 8265 + }, + { + "epoch": 0.04916024360072319, + "grad_norm": 2.04298734664917, + "learning_rate": 4.970251195099746e-05, + "loss": 5.7034, + "step": 8266 + }, + { + "epoch": 0.04916619088400419, + "grad_norm": 2.1806769371032715, + "learning_rate": 4.970244010236736e-05, + "loss": 6.1212, + "step": 8267 + }, + { + "epoch": 0.04917213816728518, + "grad_norm": 1.8740427494049072, + "learning_rate": 4.970236824511389e-05, + "loss": 5.7562, + "step": 8268 + }, + { + "epoch": 0.04917808545056618, + "grad_norm": 1.7718658447265625, + "learning_rate": 4.970229637923709e-05, + "loss": 5.5126, + "step": 8269 + }, + { + "epoch": 0.04918403273384718, + "grad_norm": 1.4966565370559692, + "learning_rate": 4.970222450473696e-05, + "loss": 5.5422, + "step": 8270 + }, + { + "epoch": 0.049189980017128175, + "grad_norm": 1.8283390998840332, + "learning_rate": 4.970215262161355e-05, + "loss": 5.9333, + "step": 8271 + }, + { + "epoch": 0.04919592730040917, + "grad_norm": 2.087460517883301, + "learning_rate": 4.970208072986687e-05, + "loss": 5.5413, + "step": 8272 + }, + { + "epoch": 0.04920187458369017, + "grad_norm": 2.2952873706817627, + "learning_rate": 4.970200882949694e-05, + "loss": 5.7848, + "step": 8273 + }, + { + "epoch": 0.04920782186697117, + "grad_norm": 1.9511842727661133, + "learning_rate": 4.9701936920503804e-05, + "loss": 5.6172, + "step": 8274 + }, + { + "epoch": 0.04921376915025216, + "grad_norm": 1.992211937904358, + "learning_rate": 4.970186500288748e-05, + "loss": 5.48, + "step": 8275 + }, + { + "epoch": 0.049219716433533164, + "grad_norm": 1.739013910293579, + "learning_rate": 4.9701793076647984e-05, + "loss": 5.6351, + "step": 8276 + }, + { + "epoch": 0.04922566371681416, + "grad_norm": 2.150797128677368, + "learning_rate": 4.970172114178534e-05, + "loss": 5.5957, + "step": 8277 + }, + { + "epoch": 0.049231611000095155, + "grad_norm": 2.074070930480957, + "learning_rate": 4.9701649198299594e-05, + "loss": 5.4751, + "step": 8278 + }, + { + "epoch": 0.04923755828337616, + "grad_norm": 2.2276322841644287, + "learning_rate": 4.970157724619075e-05, + "loss": 5.4434, + "step": 8279 + }, + { + "epoch": 0.04924350556665715, + "grad_norm": 1.9707896709442139, + "learning_rate": 4.970150528545884e-05, + "loss": 5.6935, + "step": 8280 + }, + { + "epoch": 0.04924945284993815, + "grad_norm": 2.07774019241333, + "learning_rate": 4.9701433316103895e-05, + "loss": 6.0455, + "step": 8281 + }, + { + "epoch": 0.04925540013321915, + "grad_norm": 2.3262722492218018, + "learning_rate": 4.970136133812593e-05, + "loss": 5.6039, + "step": 8282 + }, + { + "epoch": 0.049261347416500144, + "grad_norm": 2.4353108406066895, + "learning_rate": 4.970128935152498e-05, + "loss": 5.3823, + "step": 8283 + }, + { + "epoch": 0.04926729469978114, + "grad_norm": 2.7383084297180176, + "learning_rate": 4.970121735630106e-05, + "loss": 5.4039, + "step": 8284 + }, + { + "epoch": 0.049273241983062134, + "grad_norm": 2.9022698402404785, + "learning_rate": 4.9701145352454205e-05, + "loss": 5.3571, + "step": 8285 + }, + { + "epoch": 0.049279189266343136, + "grad_norm": 2.314373731613159, + "learning_rate": 4.970107333998443e-05, + "loss": 5.4877, + "step": 8286 + }, + { + "epoch": 0.04928513654962413, + "grad_norm": 1.9494023323059082, + "learning_rate": 4.970100131889177e-05, + "loss": 5.5171, + "step": 8287 + }, + { + "epoch": 0.049291083832905126, + "grad_norm": 2.7892074584960938, + "learning_rate": 4.9700929289176245e-05, + "loss": 5.5347, + "step": 8288 + }, + { + "epoch": 0.04929703111618613, + "grad_norm": 2.305204391479492, + "learning_rate": 4.970085725083788e-05, + "loss": 5.8689, + "step": 8289 + }, + { + "epoch": 0.04930297839946712, + "grad_norm": 2.4212634563446045, + "learning_rate": 4.97007852038767e-05, + "loss": 5.8982, + "step": 8290 + }, + { + "epoch": 0.04930892568274812, + "grad_norm": 3.584625482559204, + "learning_rate": 4.9700713148292734e-05, + "loss": 5.2341, + "step": 8291 + }, + { + "epoch": 0.04931487296602912, + "grad_norm": 2.874703884124756, + "learning_rate": 4.9700641084086e-05, + "loss": 5.2312, + "step": 8292 + }, + { + "epoch": 0.049320820249310116, + "grad_norm": 2.113234519958496, + "learning_rate": 4.9700569011256524e-05, + "loss": 5.5779, + "step": 8293 + }, + { + "epoch": 0.04932676753259111, + "grad_norm": 3.027318000793457, + "learning_rate": 4.970049692980434e-05, + "loss": 5.3899, + "step": 8294 + }, + { + "epoch": 0.04933271481587211, + "grad_norm": 2.779520273208618, + "learning_rate": 4.970042483972947e-05, + "loss": 5.4023, + "step": 8295 + }, + { + "epoch": 0.04933866209915311, + "grad_norm": 2.4358251094818115, + "learning_rate": 4.970035274103193e-05, + "loss": 5.4932, + "step": 8296 + }, + { + "epoch": 0.0493446093824341, + "grad_norm": 1.926193118095398, + "learning_rate": 4.970028063371176e-05, + "loss": 5.4058, + "step": 8297 + }, + { + "epoch": 0.0493505566657151, + "grad_norm": 1.7216569185256958, + "learning_rate": 4.970020851776898e-05, + "loss": 5.3265, + "step": 8298 + }, + { + "epoch": 0.0493565039489961, + "grad_norm": 1.9850976467132568, + "learning_rate": 4.97001363932036e-05, + "loss": 5.1626, + "step": 8299 + }, + { + "epoch": 0.049362451232277095, + "grad_norm": 2.1380982398986816, + "learning_rate": 4.9700064260015666e-05, + "loss": 5.3285, + "step": 8300 + }, + { + "epoch": 0.04936839851555809, + "grad_norm": 2.118781566619873, + "learning_rate": 4.969999211820518e-05, + "loss": 5.3544, + "step": 8301 + }, + { + "epoch": 0.04937434579883909, + "grad_norm": 2.0255584716796875, + "learning_rate": 4.96999199677722e-05, + "loss": 5.4256, + "step": 8302 + }, + { + "epoch": 0.04938029308212009, + "grad_norm": 2.0269806385040283, + "learning_rate": 4.9699847808716724e-05, + "loss": 5.9744, + "step": 8303 + }, + { + "epoch": 0.04938624036540108, + "grad_norm": 2.60446834564209, + "learning_rate": 4.969977564103879e-05, + "loss": 5.3926, + "step": 8304 + }, + { + "epoch": 0.049392187648682084, + "grad_norm": 2.1011881828308105, + "learning_rate": 4.9699703464738426e-05, + "loss": 5.4278, + "step": 8305 + }, + { + "epoch": 0.04939813493196308, + "grad_norm": 1.9267319440841675, + "learning_rate": 4.969963127981564e-05, + "loss": 5.6232, + "step": 8306 + }, + { + "epoch": 0.049404082215244075, + "grad_norm": 2.1958322525024414, + "learning_rate": 4.969955908627048e-05, + "loss": 5.8577, + "step": 8307 + }, + { + "epoch": 0.049410029498525077, + "grad_norm": 2.392241954803467, + "learning_rate": 4.969948688410294e-05, + "loss": 5.8013, + "step": 8308 + }, + { + "epoch": 0.04941597678180607, + "grad_norm": 2.8284695148468018, + "learning_rate": 4.969941467331308e-05, + "loss": 6.1246, + "step": 8309 + }, + { + "epoch": 0.04942192406508707, + "grad_norm": 2.8590078353881836, + "learning_rate": 4.96993424539009e-05, + "loss": 6.1068, + "step": 8310 + }, + { + "epoch": 0.04942787134836807, + "grad_norm": 1.876207709312439, + "learning_rate": 4.969927022586644e-05, + "loss": 5.5493, + "step": 8311 + }, + { + "epoch": 0.049433818631649064, + "grad_norm": 1.988061547279358, + "learning_rate": 4.969919798920972e-05, + "loss": 5.7059, + "step": 8312 + }, + { + "epoch": 0.04943976591493006, + "grad_norm": 2.8230605125427246, + "learning_rate": 4.969912574393077e-05, + "loss": 5.9381, + "step": 8313 + }, + { + "epoch": 0.049445713198211054, + "grad_norm": 2.4622697830200195, + "learning_rate": 4.96990534900296e-05, + "loss": 6.0935, + "step": 8314 + }, + { + "epoch": 0.049451660481492056, + "grad_norm": 2.0811798572540283, + "learning_rate": 4.9698981227506254e-05, + "loss": 6.3475, + "step": 8315 + }, + { + "epoch": 0.04945760776477305, + "grad_norm": 2.099489212036133, + "learning_rate": 4.9698908956360745e-05, + "loss": 5.7266, + "step": 8316 + }, + { + "epoch": 0.049463555048054046, + "grad_norm": 2.1711854934692383, + "learning_rate": 4.9698836676593104e-05, + "loss": 5.6067, + "step": 8317 + }, + { + "epoch": 0.04946950233133505, + "grad_norm": 2.195296287536621, + "learning_rate": 4.969876438820335e-05, + "loss": 5.3896, + "step": 8318 + }, + { + "epoch": 0.04947544961461604, + "grad_norm": 2.114830255508423, + "learning_rate": 4.969869209119151e-05, + "loss": 5.6922, + "step": 8319 + }, + { + "epoch": 0.04948139689789704, + "grad_norm": 2.1534018516540527, + "learning_rate": 4.969861978555762e-05, + "loss": 6.1372, + "step": 8320 + }, + { + "epoch": 0.04948734418117804, + "grad_norm": 2.151495933532715, + "learning_rate": 4.9698547471301696e-05, + "loss": 6.0915, + "step": 8321 + }, + { + "epoch": 0.049493291464459035, + "grad_norm": 1.8232096433639526, + "learning_rate": 4.9698475148423764e-05, + "loss": 6.1492, + "step": 8322 + }, + { + "epoch": 0.04949923874774003, + "grad_norm": 2.1538467407226562, + "learning_rate": 4.9698402816923844e-05, + "loss": 5.6253, + "step": 8323 + }, + { + "epoch": 0.04950518603102103, + "grad_norm": 2.278797149658203, + "learning_rate": 4.969833047680197e-05, + "loss": 6.0055, + "step": 8324 + }, + { + "epoch": 0.04951113331430203, + "grad_norm": 2.479342460632324, + "learning_rate": 4.9698258128058164e-05, + "loss": 5.7909, + "step": 8325 + }, + { + "epoch": 0.04951708059758302, + "grad_norm": 2.2959346771240234, + "learning_rate": 4.969818577069245e-05, + "loss": 5.6888, + "step": 8326 + }, + { + "epoch": 0.04952302788086402, + "grad_norm": 1.841544270515442, + "learning_rate": 4.969811340470486e-05, + "loss": 5.5091, + "step": 8327 + }, + { + "epoch": 0.04952897516414502, + "grad_norm": 2.4512903690338135, + "learning_rate": 4.969804103009541e-05, + "loss": 5.7271, + "step": 8328 + }, + { + "epoch": 0.049534922447426015, + "grad_norm": 2.035473585128784, + "learning_rate": 4.969796864686413e-05, + "loss": 5.3056, + "step": 8329 + }, + { + "epoch": 0.04954086973070701, + "grad_norm": 2.030576705932617, + "learning_rate": 4.9697896255011046e-05, + "loss": 5.2765, + "step": 8330 + }, + { + "epoch": 0.04954681701398801, + "grad_norm": 1.680253505706787, + "learning_rate": 4.9697823854536175e-05, + "loss": 5.1968, + "step": 8331 + }, + { + "epoch": 0.04955276429726901, + "grad_norm": 1.962259292602539, + "learning_rate": 4.969775144543955e-05, + "loss": 5.0743, + "step": 8332 + }, + { + "epoch": 0.04955871158055, + "grad_norm": 2.499044895172119, + "learning_rate": 4.96976790277212e-05, + "loss": 5.5204, + "step": 8333 + }, + { + "epoch": 0.049564658863831004, + "grad_norm": 2.004849672317505, + "learning_rate": 4.969760660138114e-05, + "loss": 5.5714, + "step": 8334 + }, + { + "epoch": 0.049570606147112, + "grad_norm": 2.255171775817871, + "learning_rate": 4.9697534166419405e-05, + "loss": 5.0766, + "step": 8335 + }, + { + "epoch": 0.049576553430392994, + "grad_norm": 2.1219112873077393, + "learning_rate": 4.969746172283601e-05, + "loss": 5.0613, + "step": 8336 + }, + { + "epoch": 0.049582500713673996, + "grad_norm": 1.9718400239944458, + "learning_rate": 4.9697389270631004e-05, + "loss": 5.0007, + "step": 8337 + }, + { + "epoch": 0.04958844799695499, + "grad_norm": 1.87917160987854, + "learning_rate": 4.969731680980437e-05, + "loss": 4.9533, + "step": 8338 + }, + { + "epoch": 0.04959439528023599, + "grad_norm": 1.9610000848770142, + "learning_rate": 4.969724434035618e-05, + "loss": 4.9761, + "step": 8339 + }, + { + "epoch": 0.04960034256351699, + "grad_norm": 1.859434723854065, + "learning_rate": 4.969717186228642e-05, + "loss": 5.2373, + "step": 8340 + }, + { + "epoch": 0.049606289846797984, + "grad_norm": 1.9905357360839844, + "learning_rate": 4.9697099375595144e-05, + "loss": 4.8858, + "step": 8341 + }, + { + "epoch": 0.04961223713007898, + "grad_norm": 1.995355486869812, + "learning_rate": 4.969702688028236e-05, + "loss": 4.9468, + "step": 8342 + }, + { + "epoch": 0.049618184413359974, + "grad_norm": 1.9970706701278687, + "learning_rate": 4.96969543763481e-05, + "loss": 4.8891, + "step": 8343 + }, + { + "epoch": 0.049624131696640976, + "grad_norm": 1.9036997556686401, + "learning_rate": 4.9696881863792385e-05, + "loss": 4.7622, + "step": 8344 + }, + { + "epoch": 0.04963007897992197, + "grad_norm": 1.9532603025436401, + "learning_rate": 4.9696809342615245e-05, + "loss": 4.7832, + "step": 8345 + }, + { + "epoch": 0.049636026263202966, + "grad_norm": 1.9032143354415894, + "learning_rate": 4.969673681281671e-05, + "loss": 4.7569, + "step": 8346 + }, + { + "epoch": 0.04964197354648397, + "grad_norm": 3.4294323921203613, + "learning_rate": 4.96966642743968e-05, + "loss": 5.9381, + "step": 8347 + }, + { + "epoch": 0.04964792082976496, + "grad_norm": 4.137698173522949, + "learning_rate": 4.969659172735554e-05, + "loss": 6.4081, + "step": 8348 + }, + { + "epoch": 0.04965386811304596, + "grad_norm": 2.774838447570801, + "learning_rate": 4.969651917169295e-05, + "loss": 5.9888, + "step": 8349 + }, + { + "epoch": 0.04965981539632696, + "grad_norm": 2.4056432247161865, + "learning_rate": 4.9696446607409054e-05, + "loss": 6.1239, + "step": 8350 + }, + { + "epoch": 0.049665762679607955, + "grad_norm": 2.098475456237793, + "learning_rate": 4.969637403450389e-05, + "loss": 6.4226, + "step": 8351 + }, + { + "epoch": 0.04967170996288895, + "grad_norm": 2.1402597427368164, + "learning_rate": 4.9696301452977475e-05, + "loss": 5.8836, + "step": 8352 + }, + { + "epoch": 0.04967765724616995, + "grad_norm": 2.8023130893707275, + "learning_rate": 4.9696228862829844e-05, + "loss": 6.2452, + "step": 8353 + }, + { + "epoch": 0.04968360452945095, + "grad_norm": 2.7669503688812256, + "learning_rate": 4.9696156264061e-05, + "loss": 6.0093, + "step": 8354 + }, + { + "epoch": 0.04968955181273194, + "grad_norm": 2.2357375621795654, + "learning_rate": 4.9696083656671e-05, + "loss": 6.0614, + "step": 8355 + }, + { + "epoch": 0.049695499096012945, + "grad_norm": 2.1435539722442627, + "learning_rate": 4.969601104065984e-05, + "loss": 6.0718, + "step": 8356 + }, + { + "epoch": 0.04970144637929394, + "grad_norm": 2.6372897624969482, + "learning_rate": 4.969593841602757e-05, + "loss": 5.4878, + "step": 8357 + }, + { + "epoch": 0.049707393662574935, + "grad_norm": 1.9730110168457031, + "learning_rate": 4.9695865782774186e-05, + "loss": 5.8913, + "step": 8358 + }, + { + "epoch": 0.04971334094585593, + "grad_norm": 2.262437105178833, + "learning_rate": 4.9695793140899737e-05, + "loss": 5.0382, + "step": 8359 + }, + { + "epoch": 0.04971928822913693, + "grad_norm": 1.794268250465393, + "learning_rate": 4.9695720490404254e-05, + "loss": 5.784, + "step": 8360 + }, + { + "epoch": 0.04972523551241793, + "grad_norm": 1.9568414688110352, + "learning_rate": 4.969564783128773e-05, + "loss": 5.8939, + "step": 8361 + }, + { + "epoch": 0.04973118279569892, + "grad_norm": 2.0560479164123535, + "learning_rate": 4.969557516355022e-05, + "loss": 5.8806, + "step": 8362 + }, + { + "epoch": 0.049737130078979924, + "grad_norm": 1.9009175300598145, + "learning_rate": 4.9695502487191746e-05, + "loss": 5.5568, + "step": 8363 + }, + { + "epoch": 0.04974307736226092, + "grad_norm": 2.1240882873535156, + "learning_rate": 4.9695429802212325e-05, + "loss": 5.4514, + "step": 8364 + }, + { + "epoch": 0.049749024645541914, + "grad_norm": 2.0803675651550293, + "learning_rate": 4.969535710861198e-05, + "loss": 5.7679, + "step": 8365 + }, + { + "epoch": 0.049754971928822916, + "grad_norm": 1.9357428550720215, + "learning_rate": 4.969528440639074e-05, + "loss": 6.1658, + "step": 8366 + }, + { + "epoch": 0.04976091921210391, + "grad_norm": 1.89462411403656, + "learning_rate": 4.9695211695548635e-05, + "loss": 6.0559, + "step": 8367 + }, + { + "epoch": 0.04976686649538491, + "grad_norm": 1.5986123085021973, + "learning_rate": 4.969513897608569e-05, + "loss": 5.7787, + "step": 8368 + }, + { + "epoch": 0.04977281377866591, + "grad_norm": 2.0391738414764404, + "learning_rate": 4.969506624800192e-05, + "loss": 5.5559, + "step": 8369 + }, + { + "epoch": 0.049778761061946904, + "grad_norm": 2.1463794708251953, + "learning_rate": 4.969499351129736e-05, + "loss": 5.5734, + "step": 8370 + }, + { + "epoch": 0.0497847083452279, + "grad_norm": 2.1488826274871826, + "learning_rate": 4.969492076597203e-05, + "loss": 5.7502, + "step": 8371 + }, + { + "epoch": 0.049790655628508894, + "grad_norm": 2.214439868927002, + "learning_rate": 4.9694848012025966e-05, + "loss": 5.8829, + "step": 8372 + }, + { + "epoch": 0.049796602911789896, + "grad_norm": 2.366196632385254, + "learning_rate": 4.969477524945918e-05, + "loss": 5.3428, + "step": 8373 + }, + { + "epoch": 0.04980255019507089, + "grad_norm": 2.239044189453125, + "learning_rate": 4.96947024782717e-05, + "loss": 5.7258, + "step": 8374 + }, + { + "epoch": 0.049808497478351886, + "grad_norm": 2.315492868423462, + "learning_rate": 4.9694629698463554e-05, + "loss": 5.6542, + "step": 8375 + }, + { + "epoch": 0.04981444476163289, + "grad_norm": 2.340740919113159, + "learning_rate": 4.969455691003478e-05, + "loss": 5.0699, + "step": 8376 + }, + { + "epoch": 0.04982039204491388, + "grad_norm": 2.644800901412964, + "learning_rate": 4.9694484112985386e-05, + "loss": 5.3808, + "step": 8377 + }, + { + "epoch": 0.04982633932819488, + "grad_norm": 2.7073781490325928, + "learning_rate": 4.96944113073154e-05, + "loss": 5.5233, + "step": 8378 + }, + { + "epoch": 0.04983228661147588, + "grad_norm": 2.5480713844299316, + "learning_rate": 4.969433849302485e-05, + "loss": 5.3908, + "step": 8379 + }, + { + "epoch": 0.049838233894756875, + "grad_norm": 2.494356155395508, + "learning_rate": 4.969426567011376e-05, + "loss": 5.3528, + "step": 8380 + }, + { + "epoch": 0.04984418117803787, + "grad_norm": 2.4249942302703857, + "learning_rate": 4.9694192838582155e-05, + "loss": 5.2995, + "step": 8381 + }, + { + "epoch": 0.04985012846131887, + "grad_norm": 2.5930840969085693, + "learning_rate": 4.9694119998430066e-05, + "loss": 6.0202, + "step": 8382 + }, + { + "epoch": 0.04985607574459987, + "grad_norm": 2.391972541809082, + "learning_rate": 4.969404714965752e-05, + "loss": 6.0247, + "step": 8383 + }, + { + "epoch": 0.04986202302788086, + "grad_norm": 2.2849159240722656, + "learning_rate": 4.9693974292264535e-05, + "loss": 5.892, + "step": 8384 + }, + { + "epoch": 0.049867970311161865, + "grad_norm": 2.1887097358703613, + "learning_rate": 4.9693901426251134e-05, + "loss": 6.0196, + "step": 8385 + }, + { + "epoch": 0.04987391759444286, + "grad_norm": 2.3988685607910156, + "learning_rate": 4.969382855161735e-05, + "loss": 5.5596, + "step": 8386 + }, + { + "epoch": 0.049879864877723855, + "grad_norm": 2.675144910812378, + "learning_rate": 4.9693755668363204e-05, + "loss": 5.3495, + "step": 8387 + }, + { + "epoch": 0.04988581216100485, + "grad_norm": 2.3753585815429688, + "learning_rate": 4.969368277648873e-05, + "loss": 5.8823, + "step": 8388 + }, + { + "epoch": 0.04989175944428585, + "grad_norm": 2.3168766498565674, + "learning_rate": 4.969360987599394e-05, + "loss": 5.9768, + "step": 8389 + }, + { + "epoch": 0.04989770672756685, + "grad_norm": 2.427138566970825, + "learning_rate": 4.969353696687886e-05, + "loss": 6.1823, + "step": 8390 + }, + { + "epoch": 0.04990365401084784, + "grad_norm": 2.304731845855713, + "learning_rate": 4.9693464049143526e-05, + "loss": 5.8697, + "step": 8391 + }, + { + "epoch": 0.049909601294128844, + "grad_norm": 2.2139687538146973, + "learning_rate": 4.9693391122787966e-05, + "loss": 6.0274, + "step": 8392 + }, + { + "epoch": 0.04991554857740984, + "grad_norm": 2.1165316104888916, + "learning_rate": 4.9693318187812185e-05, + "loss": 5.2499, + "step": 8393 + }, + { + "epoch": 0.049921495860690834, + "grad_norm": 2.5213639736175537, + "learning_rate": 4.969324524421624e-05, + "loss": 4.9105, + "step": 8394 + }, + { + "epoch": 0.049927443143971836, + "grad_norm": 2.2188315391540527, + "learning_rate": 4.9693172292000125e-05, + "loss": 4.8652, + "step": 8395 + }, + { + "epoch": 0.04993339042725283, + "grad_norm": 2.393179416656494, + "learning_rate": 4.9693099331163886e-05, + "loss": 4.924, + "step": 8396 + }, + { + "epoch": 0.04993933771053383, + "grad_norm": 2.150264024734497, + "learning_rate": 4.969302636170753e-05, + "loss": 4.9168, + "step": 8397 + }, + { + "epoch": 0.04994528499381483, + "grad_norm": 2.252499580383301, + "learning_rate": 4.96929533836311e-05, + "loss": 4.7822, + "step": 8398 + }, + { + "epoch": 0.049951232277095824, + "grad_norm": 2.342132806777954, + "learning_rate": 4.969288039693461e-05, + "loss": 5.3691, + "step": 8399 + }, + { + "epoch": 0.04995717956037682, + "grad_norm": 2.3533523082733154, + "learning_rate": 4.96928074016181e-05, + "loss": 5.9989, + "step": 8400 + }, + { + "epoch": 0.049963126843657814, + "grad_norm": 2.185727834701538, + "learning_rate": 4.969273439768158e-05, + "loss": 5.6101, + "step": 8401 + }, + { + "epoch": 0.049969074126938816, + "grad_norm": 2.3396189212799072, + "learning_rate": 4.969266138512509e-05, + "loss": 5.845, + "step": 8402 + }, + { + "epoch": 0.04997502141021981, + "grad_norm": 2.2145371437072754, + "learning_rate": 4.969258836394864e-05, + "loss": 5.6657, + "step": 8403 + }, + { + "epoch": 0.049980968693500806, + "grad_norm": 2.2084364891052246, + "learning_rate": 4.969251533415226e-05, + "loss": 5.8823, + "step": 8404 + }, + { + "epoch": 0.04998691597678181, + "grad_norm": 1.7423903942108154, + "learning_rate": 4.9692442295735984e-05, + "loss": 5.8209, + "step": 8405 + }, + { + "epoch": 0.0499928632600628, + "grad_norm": 2.3057217597961426, + "learning_rate": 4.9692369248699824e-05, + "loss": 5.8352, + "step": 8406 + }, + { + "epoch": 0.0499988105433438, + "grad_norm": 2.1800148487091064, + "learning_rate": 4.969229619304382e-05, + "loss": 5.783, + "step": 8407 + }, + { + "epoch": 0.0500047578266248, + "grad_norm": 1.8594306707382202, + "learning_rate": 4.969222312876799e-05, + "loss": 6.01, + "step": 8408 + }, + { + "epoch": 0.050010705109905795, + "grad_norm": 2.119917392730713, + "learning_rate": 4.9692150055872355e-05, + "loss": 5.7282, + "step": 8409 + }, + { + "epoch": 0.05001665239318679, + "grad_norm": 2.5282747745513916, + "learning_rate": 4.969207697435695e-05, + "loss": 5.0853, + "step": 8410 + }, + { + "epoch": 0.05002259967646779, + "grad_norm": 2.5683388710021973, + "learning_rate": 4.969200388422179e-05, + "loss": 4.9841, + "step": 8411 + }, + { + "epoch": 0.05002854695974879, + "grad_norm": 2.649918794631958, + "learning_rate": 4.969193078546692e-05, + "loss": 5.6365, + "step": 8412 + }, + { + "epoch": 0.05003449424302978, + "grad_norm": 2.3040120601654053, + "learning_rate": 4.969185767809234e-05, + "loss": 5.8272, + "step": 8413 + }, + { + "epoch": 0.050040441526310785, + "grad_norm": 2.033600330352783, + "learning_rate": 4.9691784562098084e-05, + "loss": 5.9779, + "step": 8414 + }, + { + "epoch": 0.05004638880959178, + "grad_norm": 2.1903419494628906, + "learning_rate": 4.96917114374842e-05, + "loss": 5.8651, + "step": 8415 + }, + { + "epoch": 0.050052336092872775, + "grad_norm": 2.4431047439575195, + "learning_rate": 4.969163830425068e-05, + "loss": 4.7787, + "step": 8416 + }, + { + "epoch": 0.05005828337615377, + "grad_norm": 2.6652824878692627, + "learning_rate": 4.969156516239756e-05, + "loss": 4.7133, + "step": 8417 + }, + { + "epoch": 0.05006423065943477, + "grad_norm": 2.4090182781219482, + "learning_rate": 4.969149201192488e-05, + "loss": 4.4506, + "step": 8418 + }, + { + "epoch": 0.05007017794271577, + "grad_norm": 2.5310218334198, + "learning_rate": 4.969141885283265e-05, + "loss": 4.5286, + "step": 8419 + }, + { + "epoch": 0.05007612522599676, + "grad_norm": 2.5333101749420166, + "learning_rate": 4.9691345685120905e-05, + "loss": 4.6012, + "step": 8420 + }, + { + "epoch": 0.050082072509277764, + "grad_norm": 2.172724485397339, + "learning_rate": 4.9691272508789665e-05, + "loss": 4.9161, + "step": 8421 + }, + { + "epoch": 0.05008801979255876, + "grad_norm": 2.034684181213379, + "learning_rate": 4.969119932383896e-05, + "loss": 5.3105, + "step": 8422 + }, + { + "epoch": 0.050093967075839754, + "grad_norm": 1.9046155214309692, + "learning_rate": 4.969112613026881e-05, + "loss": 5.4308, + "step": 8423 + }, + { + "epoch": 0.050099914359120756, + "grad_norm": 1.7256773710250854, + "learning_rate": 4.9691052928079226e-05, + "loss": 5.2232, + "step": 8424 + }, + { + "epoch": 0.05010586164240175, + "grad_norm": 2.0075321197509766, + "learning_rate": 4.969097971727027e-05, + "loss": 6.1764, + "step": 8425 + }, + { + "epoch": 0.050111808925682746, + "grad_norm": 2.1523852348327637, + "learning_rate": 4.9690906497841946e-05, + "loss": 5.8419, + "step": 8426 + }, + { + "epoch": 0.05011775620896375, + "grad_norm": 1.9675406217575073, + "learning_rate": 4.969083326979428e-05, + "loss": 5.7919, + "step": 8427 + }, + { + "epoch": 0.050123703492244744, + "grad_norm": 2.0327789783477783, + "learning_rate": 4.9690760033127295e-05, + "loss": 5.0232, + "step": 8428 + }, + { + "epoch": 0.05012965077552574, + "grad_norm": 1.677471399307251, + "learning_rate": 4.969068678784102e-05, + "loss": 5.1106, + "step": 8429 + }, + { + "epoch": 0.050135598058806734, + "grad_norm": 1.727847933769226, + "learning_rate": 4.9690613533935496e-05, + "loss": 5.1589, + "step": 8430 + }, + { + "epoch": 0.050141545342087736, + "grad_norm": 1.8167927265167236, + "learning_rate": 4.9690540271410726e-05, + "loss": 5.1207, + "step": 8431 + }, + { + "epoch": 0.05014749262536873, + "grad_norm": 2.277425527572632, + "learning_rate": 4.969046700026674e-05, + "loss": 5.6614, + "step": 8432 + }, + { + "epoch": 0.050153439908649726, + "grad_norm": 1.6471065282821655, + "learning_rate": 4.969039372050356e-05, + "loss": 5.2065, + "step": 8433 + }, + { + "epoch": 0.05015938719193073, + "grad_norm": 1.9049899578094482, + "learning_rate": 4.9690320432121226e-05, + "loss": 5.7453, + "step": 8434 + }, + { + "epoch": 0.05016533447521172, + "grad_norm": 1.9145495891571045, + "learning_rate": 4.969024713511976e-05, + "loss": 6.2207, + "step": 8435 + }, + { + "epoch": 0.05017128175849272, + "grad_norm": 1.6634061336517334, + "learning_rate": 4.969017382949918e-05, + "loss": 6.1694, + "step": 8436 + }, + { + "epoch": 0.05017722904177372, + "grad_norm": 1.9804925918579102, + "learning_rate": 4.969010051525952e-05, + "loss": 6.2917, + "step": 8437 + }, + { + "epoch": 0.050183176325054715, + "grad_norm": 1.9674698114395142, + "learning_rate": 4.969002719240079e-05, + "loss": 6.3105, + "step": 8438 + }, + { + "epoch": 0.05018912360833571, + "grad_norm": 2.1540520191192627, + "learning_rate": 4.968995386092303e-05, + "loss": 5.964, + "step": 8439 + }, + { + "epoch": 0.05019507089161671, + "grad_norm": 1.8545453548431396, + "learning_rate": 4.9689880520826274e-05, + "loss": 5.8744, + "step": 8440 + }, + { + "epoch": 0.05020101817489771, + "grad_norm": 1.8022514581680298, + "learning_rate": 4.968980717211053e-05, + "loss": 6.1547, + "step": 8441 + }, + { + "epoch": 0.0502069654581787, + "grad_norm": 1.6297475099563599, + "learning_rate": 4.968973381477582e-05, + "loss": 6.1397, + "step": 8442 + }, + { + "epoch": 0.050212912741459705, + "grad_norm": 1.6256400346755981, + "learning_rate": 4.968966044882219e-05, + "loss": 6.0529, + "step": 8443 + }, + { + "epoch": 0.0502188600247407, + "grad_norm": 1.5988365411758423, + "learning_rate": 4.968958707424965e-05, + "loss": 6.0653, + "step": 8444 + }, + { + "epoch": 0.050224807308021695, + "grad_norm": 1.7062568664550781, + "learning_rate": 4.968951369105823e-05, + "loss": 5.6761, + "step": 8445 + }, + { + "epoch": 0.05023075459130269, + "grad_norm": 2.6108970642089844, + "learning_rate": 4.968944029924796e-05, + "loss": 5.7222, + "step": 8446 + }, + { + "epoch": 0.05023670187458369, + "grad_norm": 2.2341887950897217, + "learning_rate": 4.9689366898818854e-05, + "loss": 6.057, + "step": 8447 + }, + { + "epoch": 0.05024264915786469, + "grad_norm": 2.1819159984588623, + "learning_rate": 4.968929348977095e-05, + "loss": 6.0386, + "step": 8448 + }, + { + "epoch": 0.05024859644114568, + "grad_norm": 1.9941349029541016, + "learning_rate": 4.968922007210427e-05, + "loss": 6.132, + "step": 8449 + }, + { + "epoch": 0.050254543724426684, + "grad_norm": 1.7330418825149536, + "learning_rate": 4.968914664581883e-05, + "loss": 6.0834, + "step": 8450 + }, + { + "epoch": 0.05026049100770768, + "grad_norm": 1.8946608304977417, + "learning_rate": 4.968907321091467e-05, + "loss": 5.9147, + "step": 8451 + }, + { + "epoch": 0.050266438290988674, + "grad_norm": 2.314767599105835, + "learning_rate": 4.9688999767391815e-05, + "loss": 5.7087, + "step": 8452 + }, + { + "epoch": 0.050272385574269676, + "grad_norm": 2.604673147201538, + "learning_rate": 4.968892631525028e-05, + "loss": 5.7348, + "step": 8453 + }, + { + "epoch": 0.05027833285755067, + "grad_norm": 2.3386125564575195, + "learning_rate": 4.9688852854490097e-05, + "loss": 5.7509, + "step": 8454 + }, + { + "epoch": 0.050284280140831666, + "grad_norm": 2.3919529914855957, + "learning_rate": 4.968877938511129e-05, + "loss": 5.5851, + "step": 8455 + }, + { + "epoch": 0.05029022742411267, + "grad_norm": 2.0978026390075684, + "learning_rate": 4.9688705907113886e-05, + "loss": 5.3663, + "step": 8456 + }, + { + "epoch": 0.050296174707393664, + "grad_norm": 2.1700327396392822, + "learning_rate": 4.9688632420497904e-05, + "loss": 6.0197, + "step": 8457 + }, + { + "epoch": 0.05030212199067466, + "grad_norm": 2.1657676696777344, + "learning_rate": 4.968855892526338e-05, + "loss": 6.1721, + "step": 8458 + }, + { + "epoch": 0.050308069273955654, + "grad_norm": 2.434732437133789, + "learning_rate": 4.968848542141033e-05, + "loss": 6.0217, + "step": 8459 + }, + { + "epoch": 0.050314016557236656, + "grad_norm": 1.8453216552734375, + "learning_rate": 4.96884119089388e-05, + "loss": 6.4071, + "step": 8460 + }, + { + "epoch": 0.05031996384051765, + "grad_norm": 1.930168628692627, + "learning_rate": 4.9688338387848784e-05, + "loss": 6.5024, + "step": 8461 + }, + { + "epoch": 0.050325911123798646, + "grad_norm": 2.1785950660705566, + "learning_rate": 4.968826485814033e-05, + "loss": 5.803, + "step": 8462 + }, + { + "epoch": 0.05033185840707965, + "grad_norm": 2.003187894821167, + "learning_rate": 4.968819131981346e-05, + "loss": 6.2269, + "step": 8463 + }, + { + "epoch": 0.05033780569036064, + "grad_norm": 2.9522452354431152, + "learning_rate": 4.9688117772868195e-05, + "loss": 5.5603, + "step": 8464 + }, + { + "epoch": 0.05034375297364164, + "grad_norm": 1.9813052415847778, + "learning_rate": 4.968804421730457e-05, + "loss": 6.0101, + "step": 8465 + }, + { + "epoch": 0.05034970025692264, + "grad_norm": 2.370225667953491, + "learning_rate": 4.9687970653122596e-05, + "loss": 6.3236, + "step": 8466 + }, + { + "epoch": 0.050355647540203635, + "grad_norm": 1.9233943223953247, + "learning_rate": 4.968789708032231e-05, + "loss": 6.2962, + "step": 8467 + }, + { + "epoch": 0.05036159482348463, + "grad_norm": 1.8740222454071045, + "learning_rate": 4.968782349890373e-05, + "loss": 5.5454, + "step": 8468 + }, + { + "epoch": 0.05036754210676563, + "grad_norm": 1.8627724647521973, + "learning_rate": 4.968774990886689e-05, + "loss": 5.9242, + "step": 8469 + }, + { + "epoch": 0.05037348939004663, + "grad_norm": 1.7016552686691284, + "learning_rate": 4.968767631021181e-05, + "loss": 6.3302, + "step": 8470 + }, + { + "epoch": 0.05037943667332762, + "grad_norm": 1.8826018571853638, + "learning_rate": 4.9687602702938515e-05, + "loss": 6.3308, + "step": 8471 + }, + { + "epoch": 0.050385383956608625, + "grad_norm": 1.777480959892273, + "learning_rate": 4.9687529087047036e-05, + "loss": 6.3948, + "step": 8472 + }, + { + "epoch": 0.05039133123988962, + "grad_norm": 2.10075306892395, + "learning_rate": 4.9687455462537396e-05, + "loss": 6.1615, + "step": 8473 + }, + { + "epoch": 0.050397278523170615, + "grad_norm": 2.3484537601470947, + "learning_rate": 4.9687381829409616e-05, + "loss": 5.8286, + "step": 8474 + }, + { + "epoch": 0.05040322580645161, + "grad_norm": 1.8243837356567383, + "learning_rate": 4.968730818766373e-05, + "loss": 6.014, + "step": 8475 + }, + { + "epoch": 0.05040917308973261, + "grad_norm": 1.8149470090866089, + "learning_rate": 4.9687234537299765e-05, + "loss": 5.9723, + "step": 8476 + }, + { + "epoch": 0.05041512037301361, + "grad_norm": 2.400754451751709, + "learning_rate": 4.968716087831773e-05, + "loss": 5.237, + "step": 8477 + }, + { + "epoch": 0.0504210676562946, + "grad_norm": 2.4394338130950928, + "learning_rate": 4.968708721071767e-05, + "loss": 5.1106, + "step": 8478 + }, + { + "epoch": 0.050427014939575604, + "grad_norm": 2.210686445236206, + "learning_rate": 4.96870135344996e-05, + "loss": 5.0002, + "step": 8479 + }, + { + "epoch": 0.0504329622228566, + "grad_norm": 2.302997589111328, + "learning_rate": 4.968693984966355e-05, + "loss": 5.689, + "step": 8480 + }, + { + "epoch": 0.050438909506137594, + "grad_norm": 2.0761525630950928, + "learning_rate": 4.9686866156209546e-05, + "loss": 5.4452, + "step": 8481 + }, + { + "epoch": 0.050444856789418596, + "grad_norm": 2.3239383697509766, + "learning_rate": 4.968679245413761e-05, + "loss": 5.4427, + "step": 8482 + }, + { + "epoch": 0.05045080407269959, + "grad_norm": 3.2064802646636963, + "learning_rate": 4.9686718743447766e-05, + "loss": 5.2947, + "step": 8483 + }, + { + "epoch": 0.050456751355980586, + "grad_norm": 2.680786371231079, + "learning_rate": 4.968664502414004e-05, + "loss": 5.4776, + "step": 8484 + }, + { + "epoch": 0.05046269863926159, + "grad_norm": 2.107583522796631, + "learning_rate": 4.9686571296214476e-05, + "loss": 5.5172, + "step": 8485 + }, + { + "epoch": 0.050468645922542583, + "grad_norm": 1.939788579940796, + "learning_rate": 4.9686497559671075e-05, + "loss": 5.6056, + "step": 8486 + }, + { + "epoch": 0.05047459320582358, + "grad_norm": 1.883991003036499, + "learning_rate": 4.968642381450987e-05, + "loss": 5.6511, + "step": 8487 + }, + { + "epoch": 0.050480540489104574, + "grad_norm": 1.8518444299697876, + "learning_rate": 4.96863500607309e-05, + "loss": 5.5897, + "step": 8488 + }, + { + "epoch": 0.050486487772385576, + "grad_norm": 1.6704350709915161, + "learning_rate": 4.968627629833418e-05, + "loss": 5.5002, + "step": 8489 + }, + { + "epoch": 0.05049243505566657, + "grad_norm": 1.755231261253357, + "learning_rate": 4.968620252731972e-05, + "loss": 5.6012, + "step": 8490 + }, + { + "epoch": 0.050498382338947566, + "grad_norm": 1.8532077074050903, + "learning_rate": 4.968612874768758e-05, + "loss": 5.4443, + "step": 8491 + }, + { + "epoch": 0.05050432962222857, + "grad_norm": 1.787781000137329, + "learning_rate": 4.9686054959437756e-05, + "loss": 5.5623, + "step": 8492 + }, + { + "epoch": 0.05051027690550956, + "grad_norm": 1.6963365077972412, + "learning_rate": 4.9685981162570295e-05, + "loss": 5.5349, + "step": 8493 + }, + { + "epoch": 0.05051622418879056, + "grad_norm": 4.328898906707764, + "learning_rate": 4.96859073570852e-05, + "loss": 5.8026, + "step": 8494 + }, + { + "epoch": 0.05052217147207156, + "grad_norm": 1.6906582117080688, + "learning_rate": 4.968583354298252e-05, + "loss": 5.4804, + "step": 8495 + }, + { + "epoch": 0.050528118755352555, + "grad_norm": 1.5316333770751953, + "learning_rate": 4.968575972026227e-05, + "loss": 5.6005, + "step": 8496 + }, + { + "epoch": 0.05053406603863355, + "grad_norm": 1.6029349565505981, + "learning_rate": 4.968568588892447e-05, + "loss": 5.5991, + "step": 8497 + }, + { + "epoch": 0.05054001332191455, + "grad_norm": 2.246537685394287, + "learning_rate": 4.968561204896916e-05, + "loss": 5.8537, + "step": 8498 + }, + { + "epoch": 0.05054596060519555, + "grad_norm": 2.0347564220428467, + "learning_rate": 4.9685538200396355e-05, + "loss": 5.7968, + "step": 8499 + }, + { + "epoch": 0.05055190788847654, + "grad_norm": 1.7635436058044434, + "learning_rate": 4.968546434320608e-05, + "loss": 5.6324, + "step": 8500 + }, + { + "epoch": 0.050557855171757544, + "grad_norm": 2.415397882461548, + "learning_rate": 4.9685390477398363e-05, + "loss": 5.3795, + "step": 8501 + }, + { + "epoch": 0.05056380245503854, + "grad_norm": 2.1499149799346924, + "learning_rate": 4.9685316602973245e-05, + "loss": 5.5638, + "step": 8502 + }, + { + "epoch": 0.050569749738319535, + "grad_norm": 2.0479557514190674, + "learning_rate": 4.9685242719930725e-05, + "loss": 5.3902, + "step": 8503 + }, + { + "epoch": 0.05057569702160053, + "grad_norm": 1.874993085861206, + "learning_rate": 4.9685168828270845e-05, + "loss": 5.4607, + "step": 8504 + }, + { + "epoch": 0.05058164430488153, + "grad_norm": 1.6361217498779297, + "learning_rate": 4.9685094927993623e-05, + "loss": 5.4378, + "step": 8505 + }, + { + "epoch": 0.05058759158816253, + "grad_norm": 1.598026990890503, + "learning_rate": 4.9685021019099096e-05, + "loss": 5.4336, + "step": 8506 + }, + { + "epoch": 0.05059353887144352, + "grad_norm": 1.7636823654174805, + "learning_rate": 4.968494710158728e-05, + "loss": 5.4757, + "step": 8507 + }, + { + "epoch": 0.050599486154724524, + "grad_norm": 1.7823325395584106, + "learning_rate": 4.968487317545821e-05, + "loss": 5.4872, + "step": 8508 + }, + { + "epoch": 0.05060543343800552, + "grad_norm": 2.39149808883667, + "learning_rate": 4.9684799240711896e-05, + "loss": 5.039, + "step": 8509 + }, + { + "epoch": 0.050611380721286514, + "grad_norm": 2.0295841693878174, + "learning_rate": 4.968472529734838e-05, + "loss": 5.1086, + "step": 8510 + }, + { + "epoch": 0.050617328004567516, + "grad_norm": 2.6830973625183105, + "learning_rate": 4.9684651345367684e-05, + "loss": 4.8889, + "step": 8511 + }, + { + "epoch": 0.05062327528784851, + "grad_norm": 2.3600027561187744, + "learning_rate": 4.9684577384769825e-05, + "loss": 5.5305, + "step": 8512 + }, + { + "epoch": 0.050629222571129506, + "grad_norm": 2.1680233478546143, + "learning_rate": 4.968450341555484e-05, + "loss": 5.8196, + "step": 8513 + }, + { + "epoch": 0.05063516985441051, + "grad_norm": 1.800645351409912, + "learning_rate": 4.968442943772275e-05, + "loss": 5.2689, + "step": 8514 + }, + { + "epoch": 0.0506411171376915, + "grad_norm": 1.983245849609375, + "learning_rate": 4.9684355451273566e-05, + "loss": 4.7782, + "step": 8515 + }, + { + "epoch": 0.0506470644209725, + "grad_norm": 2.12082576751709, + "learning_rate": 4.968428145620735e-05, + "loss": 4.7946, + "step": 8516 + }, + { + "epoch": 0.050653011704253494, + "grad_norm": 1.7249135971069336, + "learning_rate": 4.968420745252409e-05, + "loss": 4.7055, + "step": 8517 + }, + { + "epoch": 0.050658958987534496, + "grad_norm": 1.971240758895874, + "learning_rate": 4.968413344022384e-05, + "loss": 4.7343, + "step": 8518 + }, + { + "epoch": 0.05066490627081549, + "grad_norm": 1.780387282371521, + "learning_rate": 4.968405941930661e-05, + "loss": 4.7502, + "step": 8519 + }, + { + "epoch": 0.050670853554096486, + "grad_norm": 1.772007942199707, + "learning_rate": 4.968398538977242e-05, + "loss": 4.7439, + "step": 8520 + }, + { + "epoch": 0.05067680083737749, + "grad_norm": 1.9167592525482178, + "learning_rate": 4.9683911351621324e-05, + "loss": 4.6393, + "step": 8521 + }, + { + "epoch": 0.05068274812065848, + "grad_norm": 2.0527031421661377, + "learning_rate": 4.968383730485331e-05, + "loss": 4.6379, + "step": 8522 + }, + { + "epoch": 0.05068869540393948, + "grad_norm": 2.0608508586883545, + "learning_rate": 4.968376324946844e-05, + "loss": 4.6128, + "step": 8523 + }, + { + "epoch": 0.05069464268722048, + "grad_norm": 1.984731674194336, + "learning_rate": 4.968368918546672e-05, + "loss": 4.5969, + "step": 8524 + }, + { + "epoch": 0.050700589970501475, + "grad_norm": 1.7904438972473145, + "learning_rate": 4.968361511284817e-05, + "loss": 4.6853, + "step": 8525 + }, + { + "epoch": 0.05070653725378247, + "grad_norm": 1.8095389604568481, + "learning_rate": 4.968354103161283e-05, + "loss": 4.5748, + "step": 8526 + }, + { + "epoch": 0.05071248453706347, + "grad_norm": 1.8565012216567993, + "learning_rate": 4.968346694176073e-05, + "loss": 4.5249, + "step": 8527 + }, + { + "epoch": 0.05071843182034447, + "grad_norm": 1.7721836566925049, + "learning_rate": 4.968339284329188e-05, + "loss": 4.6593, + "step": 8528 + }, + { + "epoch": 0.05072437910362546, + "grad_norm": 1.9470161199569702, + "learning_rate": 4.968331873620631e-05, + "loss": 4.5432, + "step": 8529 + }, + { + "epoch": 0.050730326386906464, + "grad_norm": 1.8639118671417236, + "learning_rate": 4.968324462050404e-05, + "loss": 4.4464, + "step": 8530 + }, + { + "epoch": 0.05073627367018746, + "grad_norm": 1.9226467609405518, + "learning_rate": 4.9683170496185114e-05, + "loss": 4.4364, + "step": 8531 + }, + { + "epoch": 0.050742220953468455, + "grad_norm": 1.988198161125183, + "learning_rate": 4.9683096363249545e-05, + "loss": 4.6614, + "step": 8532 + }, + { + "epoch": 0.05074816823674945, + "grad_norm": 1.903645396232605, + "learning_rate": 4.9683022221697374e-05, + "loss": 4.5168, + "step": 8533 + }, + { + "epoch": 0.05075411552003045, + "grad_norm": 1.903448224067688, + "learning_rate": 4.96829480715286e-05, + "loss": 4.5899, + "step": 8534 + }, + { + "epoch": 0.05076006280331145, + "grad_norm": 1.864522099494934, + "learning_rate": 4.9682873912743274e-05, + "loss": 4.5896, + "step": 8535 + }, + { + "epoch": 0.05076601008659244, + "grad_norm": 1.8760302066802979, + "learning_rate": 4.9682799745341406e-05, + "loss": 4.593, + "step": 8536 + }, + { + "epoch": 0.050771957369873444, + "grad_norm": 1.9024009704589844, + "learning_rate": 4.968272556932303e-05, + "loss": 4.9861, + "step": 8537 + }, + { + "epoch": 0.05077790465315444, + "grad_norm": 2.190634250640869, + "learning_rate": 4.9682651384688176e-05, + "loss": 5.6755, + "step": 8538 + }, + { + "epoch": 0.050783851936435434, + "grad_norm": 1.758934736251831, + "learning_rate": 4.9682577191436854e-05, + "loss": 5.4334, + "step": 8539 + }, + { + "epoch": 0.050789799219716436, + "grad_norm": 2.3531200885772705, + "learning_rate": 4.968250298956909e-05, + "loss": 4.9819, + "step": 8540 + }, + { + "epoch": 0.05079574650299743, + "grad_norm": 1.901681661605835, + "learning_rate": 4.968242877908494e-05, + "loss": 5.1642, + "step": 8541 + }, + { + "epoch": 0.050801693786278426, + "grad_norm": 1.7250633239746094, + "learning_rate": 4.96823545599844e-05, + "loss": 5.4847, + "step": 8542 + }, + { + "epoch": 0.05080764106955943, + "grad_norm": 1.7400966882705688, + "learning_rate": 4.968228033226751e-05, + "loss": 5.5902, + "step": 8543 + }, + { + "epoch": 0.05081358835284042, + "grad_norm": 1.5469578504562378, + "learning_rate": 4.968220609593428e-05, + "loss": 5.6432, + "step": 8544 + }, + { + "epoch": 0.05081953563612142, + "grad_norm": 1.8277182579040527, + "learning_rate": 4.968213185098475e-05, + "loss": 5.3296, + "step": 8545 + }, + { + "epoch": 0.050825482919402414, + "grad_norm": 2.0535261631011963, + "learning_rate": 4.9682057597418943e-05, + "loss": 5.5278, + "step": 8546 + }, + { + "epoch": 0.050831430202683416, + "grad_norm": 1.8631746768951416, + "learning_rate": 4.9681983335236894e-05, + "loss": 5.556, + "step": 8547 + }, + { + "epoch": 0.05083737748596441, + "grad_norm": 1.6663711071014404, + "learning_rate": 4.968190906443861e-05, + "loss": 5.4321, + "step": 8548 + }, + { + "epoch": 0.050843324769245406, + "grad_norm": 1.8302260637283325, + "learning_rate": 4.968183478502413e-05, + "loss": 5.4746, + "step": 8549 + }, + { + "epoch": 0.05084927205252641, + "grad_norm": 1.9203182458877563, + "learning_rate": 4.968176049699347e-05, + "loss": 5.4334, + "step": 8550 + }, + { + "epoch": 0.0508552193358074, + "grad_norm": 2.0406670570373535, + "learning_rate": 4.9681686200346674e-05, + "loss": 5.6509, + "step": 8551 + }, + { + "epoch": 0.0508611666190884, + "grad_norm": 2.3438572883605957, + "learning_rate": 4.968161189508374e-05, + "loss": 5.8662, + "step": 8552 + }, + { + "epoch": 0.0508671139023694, + "grad_norm": 1.9612985849380493, + "learning_rate": 4.968153758120473e-05, + "loss": 5.6813, + "step": 8553 + }, + { + "epoch": 0.050873061185650395, + "grad_norm": 1.4175993204116821, + "learning_rate": 4.968146325870964e-05, + "loss": 5.4593, + "step": 8554 + }, + { + "epoch": 0.05087900846893139, + "grad_norm": 1.3445212841033936, + "learning_rate": 4.96813889275985e-05, + "loss": 5.4195, + "step": 8555 + }, + { + "epoch": 0.05088495575221239, + "grad_norm": 1.9938427209854126, + "learning_rate": 4.968131458787135e-05, + "loss": 5.8791, + "step": 8556 + }, + { + "epoch": 0.05089090303549339, + "grad_norm": 1.7449276447296143, + "learning_rate": 4.9681240239528216e-05, + "loss": 5.3574, + "step": 8557 + }, + { + "epoch": 0.05089685031877438, + "grad_norm": 2.0117087364196777, + "learning_rate": 4.96811658825691e-05, + "loss": 5.3548, + "step": 8558 + }, + { + "epoch": 0.050902797602055384, + "grad_norm": 1.97372567653656, + "learning_rate": 4.968109151699406e-05, + "loss": 5.5281, + "step": 8559 + }, + { + "epoch": 0.05090874488533638, + "grad_norm": 1.8815237283706665, + "learning_rate": 4.9681017142803095e-05, + "loss": 5.4849, + "step": 8560 + }, + { + "epoch": 0.050914692168617375, + "grad_norm": 1.627252221107483, + "learning_rate": 4.968094275999624e-05, + "loss": 5.2125, + "step": 8561 + }, + { + "epoch": 0.05092063945189837, + "grad_norm": 1.4768601655960083, + "learning_rate": 4.968086836857353e-05, + "loss": 5.0817, + "step": 8562 + }, + { + "epoch": 0.05092658673517937, + "grad_norm": 2.0249485969543457, + "learning_rate": 4.968079396853498e-05, + "loss": 5.4025, + "step": 8563 + }, + { + "epoch": 0.05093253401846037, + "grad_norm": 2.0904550552368164, + "learning_rate": 4.968071955988062e-05, + "loss": 5.4404, + "step": 8564 + }, + { + "epoch": 0.05093848130174136, + "grad_norm": 1.935063123703003, + "learning_rate": 4.9680645142610475e-05, + "loss": 5.4961, + "step": 8565 + }, + { + "epoch": 0.050944428585022364, + "grad_norm": 1.9836292266845703, + "learning_rate": 4.968057071672457e-05, + "loss": 5.2469, + "step": 8566 + }, + { + "epoch": 0.05095037586830336, + "grad_norm": 1.8337205648422241, + "learning_rate": 4.9680496282222944e-05, + "loss": 5.4432, + "step": 8567 + }, + { + "epoch": 0.050956323151584354, + "grad_norm": 1.9169154167175293, + "learning_rate": 4.9680421839105604e-05, + "loss": 5.2606, + "step": 8568 + }, + { + "epoch": 0.050962270434865356, + "grad_norm": 1.5869332551956177, + "learning_rate": 4.968034738737258e-05, + "loss": 5.006, + "step": 8569 + }, + { + "epoch": 0.05096821771814635, + "grad_norm": 1.5824979543685913, + "learning_rate": 4.968027292702391e-05, + "loss": 5.2078, + "step": 8570 + }, + { + "epoch": 0.050974165001427346, + "grad_norm": 1.7121458053588867, + "learning_rate": 4.96801984580596e-05, + "loss": 5.3913, + "step": 8571 + }, + { + "epoch": 0.05098011228470835, + "grad_norm": 1.7111082077026367, + "learning_rate": 4.96801239804797e-05, + "loss": 5.3957, + "step": 8572 + }, + { + "epoch": 0.05098605956798934, + "grad_norm": 1.834083080291748, + "learning_rate": 4.968004949428421e-05, + "loss": 5.501, + "step": 8573 + }, + { + "epoch": 0.05099200685127034, + "grad_norm": 1.773421287536621, + "learning_rate": 4.967997499947318e-05, + "loss": 5.429, + "step": 8574 + }, + { + "epoch": 0.05099795413455134, + "grad_norm": 1.7471132278442383, + "learning_rate": 4.967990049604663e-05, + "loss": 5.4853, + "step": 8575 + }, + { + "epoch": 0.051003901417832335, + "grad_norm": 1.7264289855957031, + "learning_rate": 4.967982598400457e-05, + "loss": 5.4415, + "step": 8576 + }, + { + "epoch": 0.05100984870111333, + "grad_norm": 1.750982403755188, + "learning_rate": 4.9679751463347044e-05, + "loss": 5.1731, + "step": 8577 + }, + { + "epoch": 0.051015795984394326, + "grad_norm": 1.6106518507003784, + "learning_rate": 4.967967693407407e-05, + "loss": 5.2692, + "step": 8578 + }, + { + "epoch": 0.05102174326767533, + "grad_norm": 1.8728212118148804, + "learning_rate": 4.967960239618568e-05, + "loss": 5.2416, + "step": 8579 + }, + { + "epoch": 0.05102769055095632, + "grad_norm": 1.6410562992095947, + "learning_rate": 4.967952784968189e-05, + "loss": 5.1824, + "step": 8580 + }, + { + "epoch": 0.05103363783423732, + "grad_norm": 1.7119427919387817, + "learning_rate": 4.967945329456274e-05, + "loss": 5.2316, + "step": 8581 + }, + { + "epoch": 0.05103958511751832, + "grad_norm": 1.667602300643921, + "learning_rate": 4.967937873082824e-05, + "loss": 4.9599, + "step": 8582 + }, + { + "epoch": 0.051045532400799315, + "grad_norm": 1.9595974683761597, + "learning_rate": 4.967930415847842e-05, + "loss": 4.9613, + "step": 8583 + }, + { + "epoch": 0.05105147968408031, + "grad_norm": 1.70210862159729, + "learning_rate": 4.967922957751332e-05, + "loss": 5.3587, + "step": 8584 + }, + { + "epoch": 0.05105742696736131, + "grad_norm": 2.101145029067993, + "learning_rate": 4.967915498793295e-05, + "loss": 5.2782, + "step": 8585 + }, + { + "epoch": 0.05106337425064231, + "grad_norm": 1.8836926221847534, + "learning_rate": 4.9679080389737344e-05, + "loss": 5.3128, + "step": 8586 + }, + { + "epoch": 0.0510693215339233, + "grad_norm": 1.7542184591293335, + "learning_rate": 4.967900578292652e-05, + "loss": 5.2236, + "step": 8587 + }, + { + "epoch": 0.051075268817204304, + "grad_norm": 1.8415964841842651, + "learning_rate": 4.967893116750052e-05, + "loss": 5.1267, + "step": 8588 + }, + { + "epoch": 0.0510812161004853, + "grad_norm": 1.7702316045761108, + "learning_rate": 4.967885654345936e-05, + "loss": 5.6495, + "step": 8589 + }, + { + "epoch": 0.051087163383766294, + "grad_norm": 1.7790406942367554, + "learning_rate": 4.967878191080306e-05, + "loss": 5.2561, + "step": 8590 + }, + { + "epoch": 0.05109311066704729, + "grad_norm": 1.7282217741012573, + "learning_rate": 4.967870726953165e-05, + "loss": 5.2589, + "step": 8591 + }, + { + "epoch": 0.05109905795032829, + "grad_norm": 1.6590560674667358, + "learning_rate": 4.967863261964517e-05, + "loss": 5.1952, + "step": 8592 + }, + { + "epoch": 0.05110500523360929, + "grad_norm": 1.5948386192321777, + "learning_rate": 4.9678557961143625e-05, + "loss": 5.297, + "step": 8593 + }, + { + "epoch": 0.05111095251689028, + "grad_norm": 1.8219022750854492, + "learning_rate": 4.9678483294027046e-05, + "loss": 5.3391, + "step": 8594 + }, + { + "epoch": 0.051116899800171284, + "grad_norm": 1.547616720199585, + "learning_rate": 4.967840861829547e-05, + "loss": 5.4224, + "step": 8595 + }, + { + "epoch": 0.05112284708345228, + "grad_norm": 1.7924590110778809, + "learning_rate": 4.9678333933948914e-05, + "loss": 5.2371, + "step": 8596 + }, + { + "epoch": 0.051128794366733274, + "grad_norm": 1.7630747556686401, + "learning_rate": 4.9678259240987416e-05, + "loss": 5.4849, + "step": 8597 + }, + { + "epoch": 0.051134741650014276, + "grad_norm": 1.7853891849517822, + "learning_rate": 4.967818453941098e-05, + "loss": 5.1753, + "step": 8598 + }, + { + "epoch": 0.05114068893329527, + "grad_norm": 1.6572301387786865, + "learning_rate": 4.9678109829219654e-05, + "loss": 5.3747, + "step": 8599 + }, + { + "epoch": 0.051146636216576266, + "grad_norm": 1.6574329137802124, + "learning_rate": 4.9678035110413445e-05, + "loss": 5.417, + "step": 8600 + }, + { + "epoch": 0.05115258349985727, + "grad_norm": 1.7093894481658936, + "learning_rate": 4.9677960382992396e-05, + "loss": 5.4605, + "step": 8601 + }, + { + "epoch": 0.05115853078313826, + "grad_norm": 1.6304559707641602, + "learning_rate": 4.967788564695652e-05, + "loss": 5.6186, + "step": 8602 + }, + { + "epoch": 0.05116447806641926, + "grad_norm": 1.6134929656982422, + "learning_rate": 4.967781090230586e-05, + "loss": 5.5084, + "step": 8603 + }, + { + "epoch": 0.05117042534970026, + "grad_norm": 1.7007251977920532, + "learning_rate": 4.9677736149040426e-05, + "loss": 5.2542, + "step": 8604 + }, + { + "epoch": 0.051176372632981255, + "grad_norm": 1.6648818254470825, + "learning_rate": 4.967766138716025e-05, + "loss": 5.4136, + "step": 8605 + }, + { + "epoch": 0.05118231991626225, + "grad_norm": 1.5595816373825073, + "learning_rate": 4.967758661666535e-05, + "loss": 5.181, + "step": 8606 + }, + { + "epoch": 0.051188267199543246, + "grad_norm": 1.7358763217926025, + "learning_rate": 4.967751183755577e-05, + "loss": 5.3509, + "step": 8607 + }, + { + "epoch": 0.05119421448282425, + "grad_norm": 1.6836191415786743, + "learning_rate": 4.967743704983152e-05, + "loss": 5.4656, + "step": 8608 + }, + { + "epoch": 0.05120016176610524, + "grad_norm": 1.4641087055206299, + "learning_rate": 4.967736225349263e-05, + "loss": 5.5304, + "step": 8609 + }, + { + "epoch": 0.05120610904938624, + "grad_norm": 1.6273541450500488, + "learning_rate": 4.967728744853913e-05, + "loss": 5.4029, + "step": 8610 + }, + { + "epoch": 0.05121205633266724, + "grad_norm": 1.6471314430236816, + "learning_rate": 4.967721263497105e-05, + "loss": 5.4333, + "step": 8611 + }, + { + "epoch": 0.051218003615948235, + "grad_norm": 1.798155665397644, + "learning_rate": 4.96771378127884e-05, + "loss": 5.5214, + "step": 8612 + }, + { + "epoch": 0.05122395089922923, + "grad_norm": 1.8606700897216797, + "learning_rate": 4.967706298199122e-05, + "loss": 4.8808, + "step": 8613 + }, + { + "epoch": 0.05122989818251023, + "grad_norm": 1.7144849300384521, + "learning_rate": 4.967698814257953e-05, + "loss": 4.9451, + "step": 8614 + }, + { + "epoch": 0.05123584546579123, + "grad_norm": 1.7411640882492065, + "learning_rate": 4.9676913294553364e-05, + "loss": 4.9771, + "step": 8615 + }, + { + "epoch": 0.05124179274907222, + "grad_norm": 1.7012072801589966, + "learning_rate": 4.9676838437912736e-05, + "loss": 4.9028, + "step": 8616 + }, + { + "epoch": 0.051247740032353224, + "grad_norm": 1.8154243230819702, + "learning_rate": 4.967676357265768e-05, + "loss": 5.4115, + "step": 8617 + }, + { + "epoch": 0.05125368731563422, + "grad_norm": 2.7746822834014893, + "learning_rate": 4.967668869878823e-05, + "loss": 5.5487, + "step": 8618 + }, + { + "epoch": 0.051259634598915214, + "grad_norm": 1.8362152576446533, + "learning_rate": 4.9676613816304395e-05, + "loss": 5.486, + "step": 8619 + }, + { + "epoch": 0.05126558188219621, + "grad_norm": 1.975853681564331, + "learning_rate": 4.967653892520621e-05, + "loss": 5.4348, + "step": 8620 + }, + { + "epoch": 0.05127152916547721, + "grad_norm": 1.8126581907272339, + "learning_rate": 4.96764640254937e-05, + "loss": 5.4558, + "step": 8621 + }, + { + "epoch": 0.05127747644875821, + "grad_norm": 1.6068531274795532, + "learning_rate": 4.967638911716689e-05, + "loss": 5.4672, + "step": 8622 + }, + { + "epoch": 0.0512834237320392, + "grad_norm": 1.6384878158569336, + "learning_rate": 4.9676314200225804e-05, + "loss": 5.1591, + "step": 8623 + }, + { + "epoch": 0.051289371015320204, + "grad_norm": 2.0413742065429688, + "learning_rate": 4.9676239274670474e-05, + "loss": 4.8992, + "step": 8624 + }, + { + "epoch": 0.0512953182986012, + "grad_norm": 1.7591389417648315, + "learning_rate": 4.967616434050093e-05, + "loss": 5.3629, + "step": 8625 + }, + { + "epoch": 0.051301265581882194, + "grad_norm": 1.9222301244735718, + "learning_rate": 4.967608939771719e-05, + "loss": 5.5082, + "step": 8626 + }, + { + "epoch": 0.051307212865163196, + "grad_norm": 1.8040579557418823, + "learning_rate": 4.967601444631928e-05, + "loss": 5.4019, + "step": 8627 + }, + { + "epoch": 0.05131316014844419, + "grad_norm": 2.0685603618621826, + "learning_rate": 4.967593948630723e-05, + "loss": 5.1959, + "step": 8628 + }, + { + "epoch": 0.051319107431725186, + "grad_norm": 1.446341872215271, + "learning_rate": 4.967586451768106e-05, + "loss": 5.4233, + "step": 8629 + }, + { + "epoch": 0.05132505471500619, + "grad_norm": 1.4487289190292358, + "learning_rate": 4.9675789540440806e-05, + "loss": 5.4065, + "step": 8630 + }, + { + "epoch": 0.05133100199828718, + "grad_norm": 2.367469310760498, + "learning_rate": 4.967571455458648e-05, + "loss": 5.3512, + "step": 8631 + }, + { + "epoch": 0.05133694928156818, + "grad_norm": 2.7115249633789062, + "learning_rate": 4.967563956011812e-05, + "loss": 5.4494, + "step": 8632 + }, + { + "epoch": 0.05134289656484918, + "grad_norm": 2.6692097187042236, + "learning_rate": 4.967556455703576e-05, + "loss": 5.2747, + "step": 8633 + }, + { + "epoch": 0.051348843848130175, + "grad_norm": 2.516005754470825, + "learning_rate": 4.967548954533941e-05, + "loss": 5.2305, + "step": 8634 + }, + { + "epoch": 0.05135479113141117, + "grad_norm": 1.6234782934188843, + "learning_rate": 4.96754145250291e-05, + "loss": 5.5192, + "step": 8635 + }, + { + "epoch": 0.051360738414692166, + "grad_norm": 1.9273806810379028, + "learning_rate": 4.9675339496104855e-05, + "loss": 5.4479, + "step": 8636 + }, + { + "epoch": 0.05136668569797317, + "grad_norm": 2.510847568511963, + "learning_rate": 4.967526445856671e-05, + "loss": 4.9858, + "step": 8637 + }, + { + "epoch": 0.05137263298125416, + "grad_norm": 2.3722991943359375, + "learning_rate": 4.967518941241468e-05, + "loss": 5.2287, + "step": 8638 + }, + { + "epoch": 0.05137858026453516, + "grad_norm": 2.286569118499756, + "learning_rate": 4.96751143576488e-05, + "loss": 5.2643, + "step": 8639 + }, + { + "epoch": 0.05138452754781616, + "grad_norm": 2.493534803390503, + "learning_rate": 4.9675039294269086e-05, + "loss": 5.1207, + "step": 8640 + }, + { + "epoch": 0.051390474831097155, + "grad_norm": 2.622694969177246, + "learning_rate": 4.967496422227558e-05, + "loss": 4.9735, + "step": 8641 + }, + { + "epoch": 0.05139642211437815, + "grad_norm": 1.7518365383148193, + "learning_rate": 4.967488914166829e-05, + "loss": 5.8818, + "step": 8642 + }, + { + "epoch": 0.05140236939765915, + "grad_norm": 2.0281870365142822, + "learning_rate": 4.9674814052447256e-05, + "loss": 6.3773, + "step": 8643 + }, + { + "epoch": 0.05140831668094015, + "grad_norm": 1.880083441734314, + "learning_rate": 4.96747389546125e-05, + "loss": 5.831, + "step": 8644 + }, + { + "epoch": 0.05141426396422114, + "grad_norm": 2.0792593955993652, + "learning_rate": 4.967466384816404e-05, + "loss": 5.8799, + "step": 8645 + }, + { + "epoch": 0.051420211247502144, + "grad_norm": 2.4550280570983887, + "learning_rate": 4.967458873310192e-05, + "loss": 5.2983, + "step": 8646 + }, + { + "epoch": 0.05142615853078314, + "grad_norm": 2.5590765476226807, + "learning_rate": 4.967451360942615e-05, + "loss": 5.1157, + "step": 8647 + }, + { + "epoch": 0.051432105814064134, + "grad_norm": 2.2328450679779053, + "learning_rate": 4.967443847713677e-05, + "loss": 5.047, + "step": 8648 + }, + { + "epoch": 0.05143805309734513, + "grad_norm": 2.0624022483825684, + "learning_rate": 4.9674363336233786e-05, + "loss": 5.6819, + "step": 8649 + }, + { + "epoch": 0.05144400038062613, + "grad_norm": 2.075239658355713, + "learning_rate": 4.9674288186717246e-05, + "loss": 5.895, + "step": 8650 + }, + { + "epoch": 0.05144994766390713, + "grad_norm": 1.7228562831878662, + "learning_rate": 4.967421302858716e-05, + "loss": 5.9199, + "step": 8651 + }, + { + "epoch": 0.05145589494718812, + "grad_norm": 2.235020637512207, + "learning_rate": 4.967413786184356e-05, + "loss": 5.0644, + "step": 8652 + }, + { + "epoch": 0.051461842230469124, + "grad_norm": 1.8620972633361816, + "learning_rate": 4.967406268648648e-05, + "loss": 5.7956, + "step": 8653 + }, + { + "epoch": 0.05146778951375012, + "grad_norm": 1.7914378643035889, + "learning_rate": 4.967398750251594e-05, + "loss": 5.742, + "step": 8654 + }, + { + "epoch": 0.051473736797031114, + "grad_norm": 2.0010504722595215, + "learning_rate": 4.967391230993196e-05, + "loss": 5.7808, + "step": 8655 + }, + { + "epoch": 0.051479684080312116, + "grad_norm": 2.1851212978363037, + "learning_rate": 4.9673837108734575e-05, + "loss": 5.4217, + "step": 8656 + }, + { + "epoch": 0.05148563136359311, + "grad_norm": 1.6896641254425049, + "learning_rate": 4.967376189892382e-05, + "loss": 6.321, + "step": 8657 + }, + { + "epoch": 0.051491578646874106, + "grad_norm": 1.7083675861358643, + "learning_rate": 4.967368668049969e-05, + "loss": 5.495, + "step": 8658 + }, + { + "epoch": 0.05149752593015511, + "grad_norm": 2.537256956100464, + "learning_rate": 4.967361145346224e-05, + "loss": 5.4096, + "step": 8659 + }, + { + "epoch": 0.0515034732134361, + "grad_norm": 2.3463892936706543, + "learning_rate": 4.967353621781149e-05, + "loss": 6.2461, + "step": 8660 + }, + { + "epoch": 0.0515094204967171, + "grad_norm": 1.6834701299667358, + "learning_rate": 4.967346097354746e-05, + "loss": 6.1007, + "step": 8661 + }, + { + "epoch": 0.0515153677799981, + "grad_norm": 2.140557289123535, + "learning_rate": 4.9673385720670184e-05, + "loss": 5.9908, + "step": 8662 + }, + { + "epoch": 0.051521315063279095, + "grad_norm": 2.211639165878296, + "learning_rate": 4.9673310459179676e-05, + "loss": 6.4192, + "step": 8663 + }, + { + "epoch": 0.05152726234656009, + "grad_norm": 1.8421399593353271, + "learning_rate": 4.9673235189075975e-05, + "loss": 6.099, + "step": 8664 + }, + { + "epoch": 0.051533209629841085, + "grad_norm": 1.7775965929031372, + "learning_rate": 4.96731599103591e-05, + "loss": 5.9572, + "step": 8665 + }, + { + "epoch": 0.05153915691312209, + "grad_norm": 1.7500132322311401, + "learning_rate": 4.967308462302909e-05, + "loss": 6.0987, + "step": 8666 + }, + { + "epoch": 0.05154510419640308, + "grad_norm": 1.7952892780303955, + "learning_rate": 4.967300932708595e-05, + "loss": 6.0235, + "step": 8667 + }, + { + "epoch": 0.05155105147968408, + "grad_norm": 1.7696008682250977, + "learning_rate": 4.967293402252972e-05, + "loss": 5.8253, + "step": 8668 + }, + { + "epoch": 0.05155699876296508, + "grad_norm": 1.848975419998169, + "learning_rate": 4.967285870936042e-05, + "loss": 6.0942, + "step": 8669 + }, + { + "epoch": 0.051562946046246075, + "grad_norm": 2.412909507751465, + "learning_rate": 4.967278338757808e-05, + "loss": 5.5752, + "step": 8670 + }, + { + "epoch": 0.05156889332952707, + "grad_norm": 2.0214738845825195, + "learning_rate": 4.967270805718273e-05, + "loss": 5.5721, + "step": 8671 + }, + { + "epoch": 0.05157484061280807, + "grad_norm": 2.3830201625823975, + "learning_rate": 4.967263271817439e-05, + "loss": 6.034, + "step": 8672 + }, + { + "epoch": 0.05158078789608907, + "grad_norm": 2.213979959487915, + "learning_rate": 4.9672557370553094e-05, + "loss": 6.0169, + "step": 8673 + }, + { + "epoch": 0.05158673517937006, + "grad_norm": 1.9657354354858398, + "learning_rate": 4.967248201431887e-05, + "loss": 6.0159, + "step": 8674 + }, + { + "epoch": 0.051592682462651064, + "grad_norm": 2.0882673263549805, + "learning_rate": 4.967240664947172e-05, + "loss": 6.1088, + "step": 8675 + }, + { + "epoch": 0.05159862974593206, + "grad_norm": 2.291152000427246, + "learning_rate": 4.96723312760117e-05, + "loss": 5.4534, + "step": 8676 + }, + { + "epoch": 0.051604577029213054, + "grad_norm": 2.3495421409606934, + "learning_rate": 4.967225589393881e-05, + "loss": 5.5524, + "step": 8677 + }, + { + "epoch": 0.05161052431249405, + "grad_norm": 2.2665255069732666, + "learning_rate": 4.9672180503253106e-05, + "loss": 5.5208, + "step": 8678 + }, + { + "epoch": 0.05161647159577505, + "grad_norm": 2.1587207317352295, + "learning_rate": 4.9672105103954594e-05, + "loss": 5.7016, + "step": 8679 + }, + { + "epoch": 0.051622418879056046, + "grad_norm": 2.2260420322418213, + "learning_rate": 4.96720296960433e-05, + "loss": 5.6179, + "step": 8680 + }, + { + "epoch": 0.05162836616233704, + "grad_norm": 3.1678147315979004, + "learning_rate": 4.967195427951926e-05, + "loss": 5.4655, + "step": 8681 + }, + { + "epoch": 0.051634313445618044, + "grad_norm": 3.0126166343688965, + "learning_rate": 4.967187885438249e-05, + "loss": 5.5663, + "step": 8682 + }, + { + "epoch": 0.05164026072889904, + "grad_norm": 2.290069341659546, + "learning_rate": 4.9671803420633034e-05, + "loss": 5.7462, + "step": 8683 + }, + { + "epoch": 0.051646208012180034, + "grad_norm": 2.1958532333374023, + "learning_rate": 4.96717279782709e-05, + "loss": 5.8359, + "step": 8684 + }, + { + "epoch": 0.051652155295461036, + "grad_norm": 2.063312530517578, + "learning_rate": 4.967165252729611e-05, + "loss": 5.847, + "step": 8685 + }, + { + "epoch": 0.05165810257874203, + "grad_norm": 1.8041539192199707, + "learning_rate": 4.967157706770872e-05, + "loss": 5.9408, + "step": 8686 + }, + { + "epoch": 0.051664049862023026, + "grad_norm": 1.684831976890564, + "learning_rate": 4.967150159950873e-05, + "loss": 6.019, + "step": 8687 + }, + { + "epoch": 0.05166999714530403, + "grad_norm": 2.4915740489959717, + "learning_rate": 4.967142612269616e-05, + "loss": 5.357, + "step": 8688 + }, + { + "epoch": 0.05167594442858502, + "grad_norm": 2.2621138095855713, + "learning_rate": 4.967135063727106e-05, + "loss": 5.7726, + "step": 8689 + }, + { + "epoch": 0.05168189171186602, + "grad_norm": 1.9304747581481934, + "learning_rate": 4.967127514323345e-05, + "loss": 6.0958, + "step": 8690 + }, + { + "epoch": 0.05168783899514702, + "grad_norm": 1.7657890319824219, + "learning_rate": 4.9671199640583354e-05, + "loss": 6.1036, + "step": 8691 + }, + { + "epoch": 0.051693786278428015, + "grad_norm": 1.7449486255645752, + "learning_rate": 4.9671124129320794e-05, + "loss": 6.0843, + "step": 8692 + }, + { + "epoch": 0.05169973356170901, + "grad_norm": 2.0155117511749268, + "learning_rate": 4.96710486094458e-05, + "loss": 5.9626, + "step": 8693 + }, + { + "epoch": 0.051705680844990005, + "grad_norm": 2.1015188694000244, + "learning_rate": 4.967097308095839e-05, + "loss": 5.6053, + "step": 8694 + }, + { + "epoch": 0.05171162812827101, + "grad_norm": 1.9602909088134766, + "learning_rate": 4.967089754385861e-05, + "loss": 5.1988, + "step": 8695 + }, + { + "epoch": 0.051717575411552, + "grad_norm": 2.141657590866089, + "learning_rate": 4.9670821998146474e-05, + "loss": 5.2994, + "step": 8696 + }, + { + "epoch": 0.051723522694833, + "grad_norm": 2.1301774978637695, + "learning_rate": 4.9670746443822006e-05, + "loss": 5.7935, + "step": 8697 + }, + { + "epoch": 0.051729469978114, + "grad_norm": 1.9465678930282593, + "learning_rate": 4.9670670880885225e-05, + "loss": 5.1861, + "step": 8698 + }, + { + "epoch": 0.051735417261394995, + "grad_norm": 2.177234411239624, + "learning_rate": 4.967059530933618e-05, + "loss": 5.1114, + "step": 8699 + }, + { + "epoch": 0.05174136454467599, + "grad_norm": 2.0886077880859375, + "learning_rate": 4.967051972917488e-05, + "loss": 5.2905, + "step": 8700 + }, + { + "epoch": 0.05174731182795699, + "grad_norm": 1.8517125844955444, + "learning_rate": 4.967044414040136e-05, + "loss": 5.1672, + "step": 8701 + }, + { + "epoch": 0.05175325911123799, + "grad_norm": 1.7342808246612549, + "learning_rate": 4.967036854301564e-05, + "loss": 5.2767, + "step": 8702 + }, + { + "epoch": 0.05175920639451898, + "grad_norm": 1.7315362691879272, + "learning_rate": 4.9670292937017746e-05, + "loss": 5.2897, + "step": 8703 + }, + { + "epoch": 0.051765153677799984, + "grad_norm": 1.8794540166854858, + "learning_rate": 4.967021732240772e-05, + "loss": 5.3808, + "step": 8704 + }, + { + "epoch": 0.05177110096108098, + "grad_norm": 1.8047478199005127, + "learning_rate": 4.9670141699185565e-05, + "loss": 5.1074, + "step": 8705 + }, + { + "epoch": 0.051777048244361974, + "grad_norm": 1.699475884437561, + "learning_rate": 4.967006606735132e-05, + "loss": 5.8162, + "step": 8706 + }, + { + "epoch": 0.05178299552764297, + "grad_norm": 2.008352518081665, + "learning_rate": 4.966999042690501e-05, + "loss": 6.3593, + "step": 8707 + }, + { + "epoch": 0.05178894281092397, + "grad_norm": 1.8776370286941528, + "learning_rate": 4.966991477784667e-05, + "loss": 6.3419, + "step": 8708 + }, + { + "epoch": 0.051794890094204966, + "grad_norm": 2.018157720565796, + "learning_rate": 4.9669839120176306e-05, + "loss": 6.1927, + "step": 8709 + }, + { + "epoch": 0.05180083737748596, + "grad_norm": 1.833764910697937, + "learning_rate": 4.966976345389396e-05, + "loss": 5.0803, + "step": 8710 + }, + { + "epoch": 0.051806784660766964, + "grad_norm": 1.7809339761734009, + "learning_rate": 4.9669687778999655e-05, + "loss": 5.3891, + "step": 8711 + }, + { + "epoch": 0.05181273194404796, + "grad_norm": 1.9905017614364624, + "learning_rate": 4.966961209549341e-05, + "loss": 6.247, + "step": 8712 + }, + { + "epoch": 0.051818679227328954, + "grad_norm": 2.1396658420562744, + "learning_rate": 4.966953640337527e-05, + "loss": 6.2506, + "step": 8713 + }, + { + "epoch": 0.051824626510609956, + "grad_norm": 1.778996467590332, + "learning_rate": 4.9669460702645244e-05, + "loss": 6.1333, + "step": 8714 + }, + { + "epoch": 0.05183057379389095, + "grad_norm": 1.9936842918395996, + "learning_rate": 4.9669384993303366e-05, + "loss": 5.6486, + "step": 8715 + }, + { + "epoch": 0.051836521077171946, + "grad_norm": 1.8064475059509277, + "learning_rate": 4.9669309275349656e-05, + "loss": 6.1217, + "step": 8716 + }, + { + "epoch": 0.05184246836045295, + "grad_norm": 1.9532819986343384, + "learning_rate": 4.966923354878414e-05, + "loss": 5.5402, + "step": 8717 + }, + { + "epoch": 0.05184841564373394, + "grad_norm": 2.4843015670776367, + "learning_rate": 4.966915781360686e-05, + "loss": 4.7674, + "step": 8718 + }, + { + "epoch": 0.05185436292701494, + "grad_norm": 2.7453129291534424, + "learning_rate": 4.9669082069817835e-05, + "loss": 4.4489, + "step": 8719 + }, + { + "epoch": 0.05186031021029594, + "grad_norm": 3.0180628299713135, + "learning_rate": 4.9669006317417084e-05, + "loss": 4.1401, + "step": 8720 + }, + { + "epoch": 0.051866257493576935, + "grad_norm": 2.44638991355896, + "learning_rate": 4.966893055640464e-05, + "loss": 4.7241, + "step": 8721 + }, + { + "epoch": 0.05187220477685793, + "grad_norm": 2.0131804943084717, + "learning_rate": 4.9668854786780514e-05, + "loss": 5.6495, + "step": 8722 + }, + { + "epoch": 0.051878152060138925, + "grad_norm": 2.0331337451934814, + "learning_rate": 4.966877900854476e-05, + "loss": 5.6812, + "step": 8723 + }, + { + "epoch": 0.05188409934341993, + "grad_norm": 2.5784926414489746, + "learning_rate": 4.9668703221697385e-05, + "loss": 5.3617, + "step": 8724 + }, + { + "epoch": 0.05189004662670092, + "grad_norm": 2.599321126937866, + "learning_rate": 4.9668627426238425e-05, + "loss": 5.6273, + "step": 8725 + }, + { + "epoch": 0.05189599390998192, + "grad_norm": 2.53541898727417, + "learning_rate": 4.966855162216789e-05, + "loss": 5.2916, + "step": 8726 + }, + { + "epoch": 0.05190194119326292, + "grad_norm": 2.165160655975342, + "learning_rate": 4.9668475809485825e-05, + "loss": 5.6152, + "step": 8727 + }, + { + "epoch": 0.051907888476543915, + "grad_norm": 2.4488654136657715, + "learning_rate": 4.966839998819225e-05, + "loss": 5.4163, + "step": 8728 + }, + { + "epoch": 0.05191383575982491, + "grad_norm": 2.2756056785583496, + "learning_rate": 4.96683241582872e-05, + "loss": 5.9449, + "step": 8729 + }, + { + "epoch": 0.05191978304310591, + "grad_norm": 2.7889063358306885, + "learning_rate": 4.9668248319770683e-05, + "loss": 5.9502, + "step": 8730 + }, + { + "epoch": 0.05192573032638691, + "grad_norm": 2.620378255844116, + "learning_rate": 4.9668172472642735e-05, + "loss": 4.8344, + "step": 8731 + }, + { + "epoch": 0.0519316776096679, + "grad_norm": 2.2405688762664795, + "learning_rate": 4.9668096616903395e-05, + "loss": 5.598, + "step": 8732 + }, + { + "epoch": 0.051937624892948904, + "grad_norm": 2.3559701442718506, + "learning_rate": 4.9668020752552664e-05, + "loss": 5.7951, + "step": 8733 + }, + { + "epoch": 0.0519435721762299, + "grad_norm": 1.9856364727020264, + "learning_rate": 4.966794487959058e-05, + "loss": 5.3907, + "step": 8734 + }, + { + "epoch": 0.051949519459510894, + "grad_norm": 2.345541000366211, + "learning_rate": 4.966786899801718e-05, + "loss": 5.9875, + "step": 8735 + }, + { + "epoch": 0.05195546674279189, + "grad_norm": 2.4069056510925293, + "learning_rate": 4.9667793107832485e-05, + "loss": 6.0062, + "step": 8736 + }, + { + "epoch": 0.05196141402607289, + "grad_norm": 1.9191378355026245, + "learning_rate": 4.966771720903651e-05, + "loss": 6.1341, + "step": 8737 + }, + { + "epoch": 0.051967361309353886, + "grad_norm": 2.135986089706421, + "learning_rate": 4.9667641301629284e-05, + "loss": 5.6993, + "step": 8738 + }, + { + "epoch": 0.05197330859263488, + "grad_norm": 2.0774824619293213, + "learning_rate": 4.966756538561085e-05, + "loss": 5.9791, + "step": 8739 + }, + { + "epoch": 0.051979255875915883, + "grad_norm": 2.1451659202575684, + "learning_rate": 4.9667489460981224e-05, + "loss": 5.8181, + "step": 8740 + }, + { + "epoch": 0.05198520315919688, + "grad_norm": 2.2769901752471924, + "learning_rate": 4.966741352774043e-05, + "loss": 5.6799, + "step": 8741 + }, + { + "epoch": 0.051991150442477874, + "grad_norm": 2.22038197517395, + "learning_rate": 4.9667337585888494e-05, + "loss": 5.8781, + "step": 8742 + }, + { + "epoch": 0.051997097725758876, + "grad_norm": 2.417508125305176, + "learning_rate": 4.9667261635425446e-05, + "loss": 5.3458, + "step": 8743 + }, + { + "epoch": 0.05200304500903987, + "grad_norm": 2.0334360599517822, + "learning_rate": 4.966718567635131e-05, + "loss": 5.5241, + "step": 8744 + }, + { + "epoch": 0.052008992292320866, + "grad_norm": 2.3476316928863525, + "learning_rate": 4.9667109708666126e-05, + "loss": 5.8786, + "step": 8745 + }, + { + "epoch": 0.05201493957560187, + "grad_norm": 2.160106897354126, + "learning_rate": 4.96670337323699e-05, + "loss": 5.616, + "step": 8746 + }, + { + "epoch": 0.05202088685888286, + "grad_norm": 2.0048086643218994, + "learning_rate": 4.9666957747462665e-05, + "loss": 5.5787, + "step": 8747 + }, + { + "epoch": 0.05202683414216386, + "grad_norm": 2.9226925373077393, + "learning_rate": 4.966688175394446e-05, + "loss": 5.3708, + "step": 8748 + }, + { + "epoch": 0.05203278142544486, + "grad_norm": 1.9020568132400513, + "learning_rate": 4.9666805751815294e-05, + "loss": 5.6037, + "step": 8749 + }, + { + "epoch": 0.052038728708725855, + "grad_norm": 2.218637466430664, + "learning_rate": 4.966672974107519e-05, + "loss": 5.2983, + "step": 8750 + }, + { + "epoch": 0.05204467599200685, + "grad_norm": 2.906625270843506, + "learning_rate": 4.96666537217242e-05, + "loss": 5.1234, + "step": 8751 + }, + { + "epoch": 0.052050623275287845, + "grad_norm": 2.0095551013946533, + "learning_rate": 4.966657769376234e-05, + "loss": 5.2695, + "step": 8752 + }, + { + "epoch": 0.05205657055856885, + "grad_norm": 2.1369643211364746, + "learning_rate": 4.966650165718963e-05, + "loss": 5.5426, + "step": 8753 + }, + { + "epoch": 0.05206251784184984, + "grad_norm": 2.4762122631073, + "learning_rate": 4.966642561200608e-05, + "loss": 5.5595, + "step": 8754 + }, + { + "epoch": 0.05206846512513084, + "grad_norm": 2.199430227279663, + "learning_rate": 4.966634955821176e-05, + "loss": 5.5155, + "step": 8755 + }, + { + "epoch": 0.05207441240841184, + "grad_norm": 2.132460355758667, + "learning_rate": 4.966627349580666e-05, + "loss": 5.5344, + "step": 8756 + }, + { + "epoch": 0.052080359691692835, + "grad_norm": 2.4437100887298584, + "learning_rate": 4.966619742479082e-05, + "loss": 5.0135, + "step": 8757 + }, + { + "epoch": 0.05208630697497383, + "grad_norm": 1.5223499536514282, + "learning_rate": 4.9666121345164265e-05, + "loss": 5.5467, + "step": 8758 + }, + { + "epoch": 0.05209225425825483, + "grad_norm": 2.101797580718994, + "learning_rate": 4.966604525692702e-05, + "loss": 5.9493, + "step": 8759 + }, + { + "epoch": 0.05209820154153583, + "grad_norm": 1.9338927268981934, + "learning_rate": 4.966596916007912e-05, + "loss": 5.6625, + "step": 8760 + }, + { + "epoch": 0.05210414882481682, + "grad_norm": 2.1328654289245605, + "learning_rate": 4.966589305462058e-05, + "loss": 6.3202, + "step": 8761 + }, + { + "epoch": 0.052110096108097824, + "grad_norm": 1.963287115097046, + "learning_rate": 4.9665816940551434e-05, + "loss": 5.8885, + "step": 8762 + }, + { + "epoch": 0.05211604339137882, + "grad_norm": 2.124155282974243, + "learning_rate": 4.96657408178717e-05, + "loss": 5.6015, + "step": 8763 + }, + { + "epoch": 0.052121990674659814, + "grad_norm": 2.1011505126953125, + "learning_rate": 4.966566468658142e-05, + "loss": 5.7786, + "step": 8764 + }, + { + "epoch": 0.05212793795794081, + "grad_norm": 1.769573450088501, + "learning_rate": 4.966558854668061e-05, + "loss": 5.8229, + "step": 8765 + }, + { + "epoch": 0.05213388524122181, + "grad_norm": 1.7712751626968384, + "learning_rate": 4.966551239816929e-05, + "loss": 5.733, + "step": 8766 + }, + { + "epoch": 0.052139832524502806, + "grad_norm": 1.68185555934906, + "learning_rate": 4.9665436241047503e-05, + "loss": 6.015, + "step": 8767 + }, + { + "epoch": 0.0521457798077838, + "grad_norm": 1.8619519472122192, + "learning_rate": 4.966536007531526e-05, + "loss": 5.9545, + "step": 8768 + }, + { + "epoch": 0.0521517270910648, + "grad_norm": 1.6538097858428955, + "learning_rate": 4.96652839009726e-05, + "loss": 5.6138, + "step": 8769 + }, + { + "epoch": 0.0521576743743458, + "grad_norm": 1.721737027168274, + "learning_rate": 4.966520771801955e-05, + "loss": 6.0001, + "step": 8770 + }, + { + "epoch": 0.052163621657626794, + "grad_norm": 1.8449060916900635, + "learning_rate": 4.966513152645612e-05, + "loss": 5.6811, + "step": 8771 + }, + { + "epoch": 0.052169568940907796, + "grad_norm": 2.3810017108917236, + "learning_rate": 4.966505532628235e-05, + "loss": 5.4662, + "step": 8772 + }, + { + "epoch": 0.05217551622418879, + "grad_norm": 2.9262144565582275, + "learning_rate": 4.9664979117498265e-05, + "loss": 5.3555, + "step": 8773 + }, + { + "epoch": 0.052181463507469786, + "grad_norm": 2.1560001373291016, + "learning_rate": 4.966490290010389e-05, + "loss": 5.988, + "step": 8774 + }, + { + "epoch": 0.05218741079075079, + "grad_norm": 1.8220587968826294, + "learning_rate": 4.966482667409925e-05, + "loss": 5.8334, + "step": 8775 + }, + { + "epoch": 0.05219335807403178, + "grad_norm": 2.393651008605957, + "learning_rate": 4.9664750439484375e-05, + "loss": 5.5866, + "step": 8776 + }, + { + "epoch": 0.05219930535731278, + "grad_norm": 2.193864583969116, + "learning_rate": 4.966467419625929e-05, + "loss": 5.6642, + "step": 8777 + }, + { + "epoch": 0.05220525264059378, + "grad_norm": 2.24094820022583, + "learning_rate": 4.966459794442403e-05, + "loss": 5.7149, + "step": 8778 + }, + { + "epoch": 0.052211199923874775, + "grad_norm": 2.447439670562744, + "learning_rate": 4.9664521683978606e-05, + "loss": 5.4759, + "step": 8779 + }, + { + "epoch": 0.05221714720715577, + "grad_norm": 1.9538700580596924, + "learning_rate": 4.9664445414923055e-05, + "loss": 5.7, + "step": 8780 + }, + { + "epoch": 0.052223094490436765, + "grad_norm": 1.8960500955581665, + "learning_rate": 4.966436913725739e-05, + "loss": 5.7852, + "step": 8781 + }, + { + "epoch": 0.05222904177371777, + "grad_norm": 1.9234421253204346, + "learning_rate": 4.966429285098166e-05, + "loss": 5.9842, + "step": 8782 + }, + { + "epoch": 0.05223498905699876, + "grad_norm": 2.2879858016967773, + "learning_rate": 4.966421655609588e-05, + "loss": 5.6572, + "step": 8783 + }, + { + "epoch": 0.05224093634027976, + "grad_norm": 2.287932872772217, + "learning_rate": 4.966414025260008e-05, + "loss": 6.0675, + "step": 8784 + }, + { + "epoch": 0.05224688362356076, + "grad_norm": 1.6395118236541748, + "learning_rate": 4.9664063940494275e-05, + "loss": 5.6846, + "step": 8785 + }, + { + "epoch": 0.052252830906841755, + "grad_norm": 1.7121644020080566, + "learning_rate": 4.966398761977851e-05, + "loss": 5.7014, + "step": 8786 + }, + { + "epoch": 0.05225877819012275, + "grad_norm": 1.6225544214248657, + "learning_rate": 4.966391129045279e-05, + "loss": 5.6152, + "step": 8787 + }, + { + "epoch": 0.05226472547340375, + "grad_norm": 1.8484382629394531, + "learning_rate": 4.966383495251716e-05, + "loss": 5.8109, + "step": 8788 + }, + { + "epoch": 0.05227067275668475, + "grad_norm": 1.8225692510604858, + "learning_rate": 4.966375860597164e-05, + "loss": 6.0587, + "step": 8789 + }, + { + "epoch": 0.05227662003996574, + "grad_norm": 2.0333876609802246, + "learning_rate": 4.9663682250816255e-05, + "loss": 6.1406, + "step": 8790 + }, + { + "epoch": 0.052282567323246744, + "grad_norm": 2.0004124641418457, + "learning_rate": 4.9663605887051036e-05, + "loss": 5.6227, + "step": 8791 + }, + { + "epoch": 0.05228851460652774, + "grad_norm": 1.723655343055725, + "learning_rate": 4.9663529514676005e-05, + "loss": 5.5013, + "step": 8792 + }, + { + "epoch": 0.052294461889808734, + "grad_norm": 1.8351995944976807, + "learning_rate": 4.966345313369119e-05, + "loss": 5.3327, + "step": 8793 + }, + { + "epoch": 0.05230040917308973, + "grad_norm": 1.7514569759368896, + "learning_rate": 4.9663376744096615e-05, + "loss": 5.235, + "step": 8794 + }, + { + "epoch": 0.05230635645637073, + "grad_norm": 1.6678166389465332, + "learning_rate": 4.966330034589232e-05, + "loss": 5.2269, + "step": 8795 + }, + { + "epoch": 0.052312303739651726, + "grad_norm": 1.82132887840271, + "learning_rate": 4.9663223939078315e-05, + "loss": 5.0288, + "step": 8796 + }, + { + "epoch": 0.05231825102293272, + "grad_norm": 1.7815704345703125, + "learning_rate": 4.966314752365463e-05, + "loss": 5.4489, + "step": 8797 + }, + { + "epoch": 0.05232419830621372, + "grad_norm": 2.5268197059631348, + "learning_rate": 4.96630710996213e-05, + "loss": 5.0321, + "step": 8798 + }, + { + "epoch": 0.05233014558949472, + "grad_norm": 2.921208620071411, + "learning_rate": 4.9662994666978346e-05, + "loss": 5.0826, + "step": 8799 + }, + { + "epoch": 0.052336092872775714, + "grad_norm": 2.83243727684021, + "learning_rate": 4.9662918225725794e-05, + "loss": 4.9754, + "step": 8800 + }, + { + "epoch": 0.052342040156056716, + "grad_norm": 2.960346221923828, + "learning_rate": 4.966284177586368e-05, + "loss": 5.5808, + "step": 8801 + }, + { + "epoch": 0.05234798743933771, + "grad_norm": 2.479055643081665, + "learning_rate": 4.966276531739201e-05, + "loss": 5.3779, + "step": 8802 + }, + { + "epoch": 0.052353934722618706, + "grad_norm": 2.8753128051757812, + "learning_rate": 4.966268885031083e-05, + "loss": 5.4023, + "step": 8803 + }, + { + "epoch": 0.05235988200589971, + "grad_norm": 2.1152822971343994, + "learning_rate": 4.966261237462016e-05, + "loss": 6.1181, + "step": 8804 + }, + { + "epoch": 0.0523658292891807, + "grad_norm": 2.7178313732147217, + "learning_rate": 4.966253589032003e-05, + "loss": 5.1597, + "step": 8805 + }, + { + "epoch": 0.0523717765724617, + "grad_norm": 2.6567695140838623, + "learning_rate": 4.966245939741045e-05, + "loss": 5.0582, + "step": 8806 + }, + { + "epoch": 0.0523777238557427, + "grad_norm": 3.0211431980133057, + "learning_rate": 4.966238289589147e-05, + "loss": 4.8331, + "step": 8807 + }, + { + "epoch": 0.052383671139023695, + "grad_norm": 2.9341561794281006, + "learning_rate": 4.9662306385763114e-05, + "loss": 4.8482, + "step": 8808 + }, + { + "epoch": 0.05238961842230469, + "grad_norm": 2.781118631362915, + "learning_rate": 4.966222986702539e-05, + "loss": 4.9199, + "step": 8809 + }, + { + "epoch": 0.052395565705585685, + "grad_norm": 2.459233283996582, + "learning_rate": 4.9662153339678344e-05, + "loss": 5.4156, + "step": 8810 + }, + { + "epoch": 0.05240151298886669, + "grad_norm": 1.9862231016159058, + "learning_rate": 4.966207680372199e-05, + "loss": 5.3937, + "step": 8811 + }, + { + "epoch": 0.05240746027214768, + "grad_norm": 3.3698437213897705, + "learning_rate": 4.966200025915636e-05, + "loss": 4.6231, + "step": 8812 + }, + { + "epoch": 0.05241340755542868, + "grad_norm": 2.9254424571990967, + "learning_rate": 4.9661923705981486e-05, + "loss": 4.5612, + "step": 8813 + }, + { + "epoch": 0.05241935483870968, + "grad_norm": 2.684386968612671, + "learning_rate": 4.966184714419738e-05, + "loss": 4.8646, + "step": 8814 + }, + { + "epoch": 0.052425302121990675, + "grad_norm": 2.812406539916992, + "learning_rate": 4.966177057380409e-05, + "loss": 4.5116, + "step": 8815 + }, + { + "epoch": 0.05243124940527167, + "grad_norm": 2.1739046573638916, + "learning_rate": 4.966169399480162e-05, + "loss": 5.3369, + "step": 8816 + }, + { + "epoch": 0.05243719668855267, + "grad_norm": 2.408341407775879, + "learning_rate": 4.966161740719001e-05, + "loss": 5.0368, + "step": 8817 + }, + { + "epoch": 0.05244314397183367, + "grad_norm": 2.2844927310943604, + "learning_rate": 4.966154081096929e-05, + "loss": 5.0657, + "step": 8818 + }, + { + "epoch": 0.05244909125511466, + "grad_norm": 2.5329723358154297, + "learning_rate": 4.9661464206139475e-05, + "loss": 5.2006, + "step": 8819 + }, + { + "epoch": 0.052455038538395664, + "grad_norm": 2.154224395751953, + "learning_rate": 4.9661387592700595e-05, + "loss": 5.238, + "step": 8820 + }, + { + "epoch": 0.05246098582167666, + "grad_norm": 2.1069657802581787, + "learning_rate": 4.966131097065269e-05, + "loss": 5.0894, + "step": 8821 + }, + { + "epoch": 0.052466933104957654, + "grad_norm": 2.165954351425171, + "learning_rate": 4.9661234339995763e-05, + "loss": 5.1148, + "step": 8822 + }, + { + "epoch": 0.052472880388238656, + "grad_norm": 1.8859459161758423, + "learning_rate": 4.9661157700729866e-05, + "loss": 5.1703, + "step": 8823 + }, + { + "epoch": 0.05247882767151965, + "grad_norm": 1.9739452600479126, + "learning_rate": 4.9661081052855004e-05, + "loss": 5.3978, + "step": 8824 + }, + { + "epoch": 0.052484774954800646, + "grad_norm": 1.95566987991333, + "learning_rate": 4.966100439637122e-05, + "loss": 5.3592, + "step": 8825 + }, + { + "epoch": 0.05249072223808164, + "grad_norm": 1.8613550662994385, + "learning_rate": 4.966092773127853e-05, + "loss": 5.3746, + "step": 8826 + }, + { + "epoch": 0.05249666952136264, + "grad_norm": 2.001701831817627, + "learning_rate": 4.9660851057576966e-05, + "loss": 5.3269, + "step": 8827 + }, + { + "epoch": 0.05250261680464364, + "grad_norm": 1.8846383094787598, + "learning_rate": 4.9660774375266556e-05, + "loss": 5.7906, + "step": 8828 + }, + { + "epoch": 0.052508564087924633, + "grad_norm": 1.982998251914978, + "learning_rate": 4.966069768434732e-05, + "loss": 5.6609, + "step": 8829 + }, + { + "epoch": 0.052514511371205636, + "grad_norm": 2.3036038875579834, + "learning_rate": 4.9660620984819294e-05, + "loss": 5.6172, + "step": 8830 + }, + { + "epoch": 0.05252045865448663, + "grad_norm": 1.9227113723754883, + "learning_rate": 4.9660544276682496e-05, + "loss": 5.4734, + "step": 8831 + }, + { + "epoch": 0.052526405937767626, + "grad_norm": 2.038203716278076, + "learning_rate": 4.9660467559936964e-05, + "loss": 5.6484, + "step": 8832 + }, + { + "epoch": 0.05253235322104863, + "grad_norm": 2.217108964920044, + "learning_rate": 4.9660390834582704e-05, + "loss": 5.4064, + "step": 8833 + }, + { + "epoch": 0.05253830050432962, + "grad_norm": 2.4458765983581543, + "learning_rate": 4.966031410061976e-05, + "loss": 5.605, + "step": 8834 + }, + { + "epoch": 0.05254424778761062, + "grad_norm": 2.2767014503479004, + "learning_rate": 4.966023735804817e-05, + "loss": 5.4258, + "step": 8835 + }, + { + "epoch": 0.05255019507089162, + "grad_norm": 2.3594579696655273, + "learning_rate": 4.9660160606867936e-05, + "loss": 5.5138, + "step": 8836 + }, + { + "epoch": 0.052556142354172615, + "grad_norm": 1.8961461782455444, + "learning_rate": 4.966008384707909e-05, + "loss": 5.9879, + "step": 8837 + }, + { + "epoch": 0.05256208963745361, + "grad_norm": 1.824751615524292, + "learning_rate": 4.966000707868167e-05, + "loss": 5.4558, + "step": 8838 + }, + { + "epoch": 0.052568036920734605, + "grad_norm": 2.005291223526001, + "learning_rate": 4.9659930301675694e-05, + "loss": 5.821, + "step": 8839 + }, + { + "epoch": 0.05257398420401561, + "grad_norm": 2.0951414108276367, + "learning_rate": 4.965985351606119e-05, + "loss": 5.2816, + "step": 8840 + }, + { + "epoch": 0.0525799314872966, + "grad_norm": 2.236849069595337, + "learning_rate": 4.9659776721838194e-05, + "loss": 5.4734, + "step": 8841 + }, + { + "epoch": 0.0525858787705776, + "grad_norm": 1.8877390623092651, + "learning_rate": 4.965969991900671e-05, + "loss": 5.2445, + "step": 8842 + }, + { + "epoch": 0.0525918260538586, + "grad_norm": 2.726071834564209, + "learning_rate": 4.9659623107566785e-05, + "loss": 5.6059, + "step": 8843 + }, + { + "epoch": 0.052597773337139594, + "grad_norm": 2.279759168624878, + "learning_rate": 4.965954628751844e-05, + "loss": 5.6755, + "step": 8844 + }, + { + "epoch": 0.05260372062042059, + "grad_norm": 1.9941623210906982, + "learning_rate": 4.965946945886171e-05, + "loss": 5.5222, + "step": 8845 + }, + { + "epoch": 0.05260966790370159, + "grad_norm": 2.0556750297546387, + "learning_rate": 4.965939262159661e-05, + "loss": 5.6064, + "step": 8846 + }, + { + "epoch": 0.05261561518698259, + "grad_norm": 1.9260958433151245, + "learning_rate": 4.965931577572317e-05, + "loss": 5.6264, + "step": 8847 + }, + { + "epoch": 0.05262156247026358, + "grad_norm": 2.1252758502960205, + "learning_rate": 4.9659238921241413e-05, + "loss": 5.9832, + "step": 8848 + }, + { + "epoch": 0.052627509753544584, + "grad_norm": 1.8081480264663696, + "learning_rate": 4.9659162058151377e-05, + "loss": 5.4391, + "step": 8849 + }, + { + "epoch": 0.05263345703682558, + "grad_norm": 1.8439849615097046, + "learning_rate": 4.965908518645308e-05, + "loss": 5.5351, + "step": 8850 + }, + { + "epoch": 0.052639404320106574, + "grad_norm": 2.1782681941986084, + "learning_rate": 4.9659008306146556e-05, + "loss": 5.9692, + "step": 8851 + }, + { + "epoch": 0.052645351603387576, + "grad_norm": 2.0206944942474365, + "learning_rate": 4.965893141723182e-05, + "loss": 5.4736, + "step": 8852 + }, + { + "epoch": 0.05265129888666857, + "grad_norm": 2.283517360687256, + "learning_rate": 4.965885451970891e-05, + "loss": 5.4504, + "step": 8853 + }, + { + "epoch": 0.052657246169949566, + "grad_norm": 2.701608180999756, + "learning_rate": 4.965877761357784e-05, + "loss": 5.318, + "step": 8854 + }, + { + "epoch": 0.05266319345323056, + "grad_norm": 2.8494722843170166, + "learning_rate": 4.965870069883866e-05, + "loss": 4.9835, + "step": 8855 + }, + { + "epoch": 0.05266914073651156, + "grad_norm": 2.0555408000946045, + "learning_rate": 4.965862377549137e-05, + "loss": 5.7587, + "step": 8856 + }, + { + "epoch": 0.05267508801979256, + "grad_norm": 2.3476004600524902, + "learning_rate": 4.9658546843536014e-05, + "loss": 5.8775, + "step": 8857 + }, + { + "epoch": 0.05268103530307355, + "grad_norm": 1.8152700662612915, + "learning_rate": 4.965846990297262e-05, + "loss": 5.6274, + "step": 8858 + }, + { + "epoch": 0.052686982586354555, + "grad_norm": 2.1541671752929688, + "learning_rate": 4.965839295380119e-05, + "loss": 5.6786, + "step": 8859 + }, + { + "epoch": 0.05269292986963555, + "grad_norm": 2.1708984375, + "learning_rate": 4.965831599602179e-05, + "loss": 5.8817, + "step": 8860 + }, + { + "epoch": 0.052698877152916546, + "grad_norm": 1.6558966636657715, + "learning_rate": 4.9658239029634415e-05, + "loss": 5.5375, + "step": 8861 + }, + { + "epoch": 0.05270482443619755, + "grad_norm": 2.1165130138397217, + "learning_rate": 4.9658162054639115e-05, + "loss": 5.5936, + "step": 8862 + }, + { + "epoch": 0.05271077171947854, + "grad_norm": 2.4143176078796387, + "learning_rate": 4.9658085071035893e-05, + "loss": 5.71, + "step": 8863 + }, + { + "epoch": 0.05271671900275954, + "grad_norm": 1.9471622705459595, + "learning_rate": 4.965800807882479e-05, + "loss": 5.7588, + "step": 8864 + }, + { + "epoch": 0.05272266628604054, + "grad_norm": 2.2014408111572266, + "learning_rate": 4.9657931078005835e-05, + "loss": 5.7699, + "step": 8865 + }, + { + "epoch": 0.052728613569321535, + "grad_norm": 1.7588191032409668, + "learning_rate": 4.965785406857905e-05, + "loss": 5.3921, + "step": 8866 + }, + { + "epoch": 0.05273456085260253, + "grad_norm": 1.835635781288147, + "learning_rate": 4.965777705054446e-05, + "loss": 5.1531, + "step": 8867 + }, + { + "epoch": 0.052740508135883525, + "grad_norm": 2.3071937561035156, + "learning_rate": 4.96577000239021e-05, + "loss": 5.5926, + "step": 8868 + }, + { + "epoch": 0.05274645541916453, + "grad_norm": 2.195712089538574, + "learning_rate": 4.9657622988651995e-05, + "loss": 5.4579, + "step": 8869 + }, + { + "epoch": 0.05275240270244552, + "grad_norm": 2.273738145828247, + "learning_rate": 4.9657545944794156e-05, + "loss": 5.6138, + "step": 8870 + }, + { + "epoch": 0.05275834998572652, + "grad_norm": 2.208343982696533, + "learning_rate": 4.9657468892328626e-05, + "loss": 5.5508, + "step": 8871 + }, + { + "epoch": 0.05276429726900752, + "grad_norm": 2.2111566066741943, + "learning_rate": 4.965739183125544e-05, + "loss": 5.7044, + "step": 8872 + }, + { + "epoch": 0.052770244552288514, + "grad_norm": 1.7516666650772095, + "learning_rate": 4.96573147615746e-05, + "loss": 5.4357, + "step": 8873 + }, + { + "epoch": 0.05277619183556951, + "grad_norm": 2.0703322887420654, + "learning_rate": 4.9657237683286155e-05, + "loss": 5.5383, + "step": 8874 + }, + { + "epoch": 0.05278213911885051, + "grad_norm": 1.796243667602539, + "learning_rate": 4.965716059639012e-05, + "loss": 5.5024, + "step": 8875 + }, + { + "epoch": 0.05278808640213151, + "grad_norm": 2.322397232055664, + "learning_rate": 4.9657083500886526e-05, + "loss": 5.8814, + "step": 8876 + }, + { + "epoch": 0.0527940336854125, + "grad_norm": 2.6743311882019043, + "learning_rate": 4.96570063967754e-05, + "loss": 5.4989, + "step": 8877 + }, + { + "epoch": 0.052799980968693504, + "grad_norm": 2.4381649494171143, + "learning_rate": 4.965692928405676e-05, + "loss": 5.5807, + "step": 8878 + }, + { + "epoch": 0.0528059282519745, + "grad_norm": 2.3703296184539795, + "learning_rate": 4.9656852162730646e-05, + "loss": 5.5586, + "step": 8879 + }, + { + "epoch": 0.052811875535255494, + "grad_norm": 1.7828437089920044, + "learning_rate": 4.9656775032797075e-05, + "loss": 5.2553, + "step": 8880 + }, + { + "epoch": 0.052817822818536496, + "grad_norm": 1.730290412902832, + "learning_rate": 4.9656697894256085e-05, + "loss": 5.3558, + "step": 8881 + }, + { + "epoch": 0.05282377010181749, + "grad_norm": 1.6909739971160889, + "learning_rate": 4.9656620747107694e-05, + "loss": 5.4397, + "step": 8882 + }, + { + "epoch": 0.052829717385098486, + "grad_norm": 1.9772145748138428, + "learning_rate": 4.965654359135193e-05, + "loss": 5.5786, + "step": 8883 + }, + { + "epoch": 0.05283566466837948, + "grad_norm": 1.8624964952468872, + "learning_rate": 4.965646642698883e-05, + "loss": 5.5466, + "step": 8884 + }, + { + "epoch": 0.05284161195166048, + "grad_norm": 1.7061936855316162, + "learning_rate": 4.96563892540184e-05, + "loss": 5.3439, + "step": 8885 + }, + { + "epoch": 0.05284755923494148, + "grad_norm": 1.715483546257019, + "learning_rate": 4.965631207244069e-05, + "loss": 5.2732, + "step": 8886 + }, + { + "epoch": 0.05285350651822247, + "grad_norm": 1.7801883220672607, + "learning_rate": 4.965623488225571e-05, + "loss": 5.2427, + "step": 8887 + }, + { + "epoch": 0.052859453801503475, + "grad_norm": 1.5122452974319458, + "learning_rate": 4.9656157683463495e-05, + "loss": 5.2812, + "step": 8888 + }, + { + "epoch": 0.05286540108478447, + "grad_norm": 1.878077507019043, + "learning_rate": 4.965608047606407e-05, + "loss": 5.6385, + "step": 8889 + }, + { + "epoch": 0.052871348368065466, + "grad_norm": 2.0781304836273193, + "learning_rate": 4.965600326005746e-05, + "loss": 5.3345, + "step": 8890 + }, + { + "epoch": 0.05287729565134647, + "grad_norm": 1.953302264213562, + "learning_rate": 4.965592603544369e-05, + "loss": 5.2694, + "step": 8891 + }, + { + "epoch": 0.05288324293462746, + "grad_norm": 1.9993265867233276, + "learning_rate": 4.96558488022228e-05, + "loss": 5.3323, + "step": 8892 + }, + { + "epoch": 0.05288919021790846, + "grad_norm": 1.7653480768203735, + "learning_rate": 4.96557715603948e-05, + "loss": 5.389, + "step": 8893 + }, + { + "epoch": 0.05289513750118946, + "grad_norm": 1.8843438625335693, + "learning_rate": 4.965569430995973e-05, + "loss": 5.3334, + "step": 8894 + }, + { + "epoch": 0.052901084784470455, + "grad_norm": 1.6673407554626465, + "learning_rate": 4.9655617050917616e-05, + "loss": 5.4469, + "step": 8895 + }, + { + "epoch": 0.05290703206775145, + "grad_norm": 1.8208844661712646, + "learning_rate": 4.9655539783268476e-05, + "loss": 5.6288, + "step": 8896 + }, + { + "epoch": 0.052912979351032445, + "grad_norm": 1.755162000656128, + "learning_rate": 4.965546250701234e-05, + "loss": 5.4388, + "step": 8897 + }, + { + "epoch": 0.05291892663431345, + "grad_norm": 1.9435405731201172, + "learning_rate": 4.965538522214924e-05, + "loss": 5.5877, + "step": 8898 + }, + { + "epoch": 0.05292487391759444, + "grad_norm": 1.8579509258270264, + "learning_rate": 4.9655307928679196e-05, + "loss": 5.4405, + "step": 8899 + }, + { + "epoch": 0.05293082120087544, + "grad_norm": 1.8897236585617065, + "learning_rate": 4.9655230626602246e-05, + "loss": 5.2931, + "step": 8900 + }, + { + "epoch": 0.05293676848415644, + "grad_norm": 1.928133487701416, + "learning_rate": 4.9655153315918403e-05, + "loss": 5.2345, + "step": 8901 + }, + { + "epoch": 0.052942715767437434, + "grad_norm": 1.8830339908599854, + "learning_rate": 4.96550759966277e-05, + "loss": 5.3288, + "step": 8902 + }, + { + "epoch": 0.05294866305071843, + "grad_norm": 1.6774102449417114, + "learning_rate": 4.9654998668730167e-05, + "loss": 5.2939, + "step": 8903 + }, + { + "epoch": 0.05295461033399943, + "grad_norm": 1.7440418004989624, + "learning_rate": 4.9654921332225826e-05, + "loss": 5.4663, + "step": 8904 + }, + { + "epoch": 0.05296055761728043, + "grad_norm": 1.92295241355896, + "learning_rate": 4.965484398711471e-05, + "loss": 5.556, + "step": 8905 + }, + { + "epoch": 0.05296650490056142, + "grad_norm": 1.5319017171859741, + "learning_rate": 4.965476663339684e-05, + "loss": 5.5267, + "step": 8906 + }, + { + "epoch": 0.052972452183842424, + "grad_norm": 1.7626374959945679, + "learning_rate": 4.9654689271072255e-05, + "loss": 5.3774, + "step": 8907 + }, + { + "epoch": 0.05297839946712342, + "grad_norm": 1.745743989944458, + "learning_rate": 4.965461190014096e-05, + "loss": 5.4877, + "step": 8908 + }, + { + "epoch": 0.052984346750404414, + "grad_norm": 1.6091177463531494, + "learning_rate": 4.9654534520603e-05, + "loss": 5.2969, + "step": 8909 + }, + { + "epoch": 0.052990294033685416, + "grad_norm": 1.7392489910125732, + "learning_rate": 4.96544571324584e-05, + "loss": 5.4247, + "step": 8910 + }, + { + "epoch": 0.05299624131696641, + "grad_norm": 1.9275293350219727, + "learning_rate": 4.965437973570718e-05, + "loss": 5.2184, + "step": 8911 + }, + { + "epoch": 0.053002188600247406, + "grad_norm": 1.6901222467422485, + "learning_rate": 4.965430233034937e-05, + "loss": 5.1459, + "step": 8912 + }, + { + "epoch": 0.0530081358835284, + "grad_norm": 1.9212596416473389, + "learning_rate": 4.965422491638499e-05, + "loss": 5.2439, + "step": 8913 + }, + { + "epoch": 0.0530140831668094, + "grad_norm": 1.814706802368164, + "learning_rate": 4.965414749381409e-05, + "loss": 5.5608, + "step": 8914 + }, + { + "epoch": 0.0530200304500904, + "grad_norm": 1.7997081279754639, + "learning_rate": 4.965407006263668e-05, + "loss": 5.6099, + "step": 8915 + }, + { + "epoch": 0.05302597773337139, + "grad_norm": 1.8545546531677246, + "learning_rate": 4.9653992622852777e-05, + "loss": 5.5844, + "step": 8916 + }, + { + "epoch": 0.053031925016652395, + "grad_norm": 1.665958285331726, + "learning_rate": 4.965391517446243e-05, + "loss": 5.4967, + "step": 8917 + }, + { + "epoch": 0.05303787229993339, + "grad_norm": 1.6157240867614746, + "learning_rate": 4.9653837717465655e-05, + "loss": 5.2523, + "step": 8918 + }, + { + "epoch": 0.053043819583214386, + "grad_norm": 1.9782540798187256, + "learning_rate": 4.965376025186248e-05, + "loss": 5.2384, + "step": 8919 + }, + { + "epoch": 0.05304976686649539, + "grad_norm": 2.0229971408843994, + "learning_rate": 4.9653682777652925e-05, + "loss": 5.1703, + "step": 8920 + }, + { + "epoch": 0.05305571414977638, + "grad_norm": 1.8299061059951782, + "learning_rate": 4.965360529483703e-05, + "loss": 5.0257, + "step": 8921 + }, + { + "epoch": 0.05306166143305738, + "grad_norm": 1.9080857038497925, + "learning_rate": 4.965352780341482e-05, + "loss": 5.2516, + "step": 8922 + }, + { + "epoch": 0.05306760871633838, + "grad_norm": 1.9998538494110107, + "learning_rate": 4.965345030338631e-05, + "loss": 5.1991, + "step": 8923 + }, + { + "epoch": 0.053073555999619375, + "grad_norm": 1.7606618404388428, + "learning_rate": 4.965337279475154e-05, + "loss": 5.2194, + "step": 8924 + }, + { + "epoch": 0.05307950328290037, + "grad_norm": 1.9633625745773315, + "learning_rate": 4.9653295277510525e-05, + "loss": 5.2463, + "step": 8925 + }, + { + "epoch": 0.053085450566181365, + "grad_norm": 1.9879587888717651, + "learning_rate": 4.9653217751663306e-05, + "loss": 5.2737, + "step": 8926 + }, + { + "epoch": 0.05309139784946237, + "grad_norm": 1.836289405822754, + "learning_rate": 4.965314021720991e-05, + "loss": 5.1157, + "step": 8927 + }, + { + "epoch": 0.05309734513274336, + "grad_norm": 1.8526496887207031, + "learning_rate": 4.965306267415035e-05, + "loss": 5.6541, + "step": 8928 + }, + { + "epoch": 0.05310329241602436, + "grad_norm": 1.9928539991378784, + "learning_rate": 4.965298512248466e-05, + "loss": 5.194, + "step": 8929 + }, + { + "epoch": 0.05310923969930536, + "grad_norm": 1.601536512374878, + "learning_rate": 4.9652907562212867e-05, + "loss": 5.285, + "step": 8930 + }, + { + "epoch": 0.053115186982586354, + "grad_norm": 1.8940081596374512, + "learning_rate": 4.9652829993335e-05, + "loss": 5.1791, + "step": 8931 + }, + { + "epoch": 0.05312113426586735, + "grad_norm": 1.7984519004821777, + "learning_rate": 4.9652752415851085e-05, + "loss": 5.2225, + "step": 8932 + }, + { + "epoch": 0.05312708154914835, + "grad_norm": 1.7474113702774048, + "learning_rate": 4.965267482976115e-05, + "loss": 5.0099, + "step": 8933 + }, + { + "epoch": 0.053133028832429346, + "grad_norm": 1.7044427394866943, + "learning_rate": 4.9652597235065214e-05, + "loss": 5.1456, + "step": 8934 + }, + { + "epoch": 0.05313897611571034, + "grad_norm": 1.5422965288162231, + "learning_rate": 4.9652519631763316e-05, + "loss": 5.0714, + "step": 8935 + }, + { + "epoch": 0.053144923398991344, + "grad_norm": 1.6831375360488892, + "learning_rate": 4.965244201985548e-05, + "loss": 5.0742, + "step": 8936 + }, + { + "epoch": 0.05315087068227234, + "grad_norm": 1.7648097276687622, + "learning_rate": 4.9652364399341734e-05, + "loss": 5.1108, + "step": 8937 + }, + { + "epoch": 0.053156817965553334, + "grad_norm": 1.669393539428711, + "learning_rate": 4.965228677022209e-05, + "loss": 5.1801, + "step": 8938 + }, + { + "epoch": 0.053162765248834336, + "grad_norm": 2.0252909660339355, + "learning_rate": 4.96522091324966e-05, + "loss": 5.3955, + "step": 8939 + }, + { + "epoch": 0.05316871253211533, + "grad_norm": 1.686355710029602, + "learning_rate": 4.965213148616527e-05, + "loss": 5.2626, + "step": 8940 + }, + { + "epoch": 0.053174659815396326, + "grad_norm": 1.7601011991500854, + "learning_rate": 4.965205383122814e-05, + "loss": 5.1603, + "step": 8941 + }, + { + "epoch": 0.05318060709867732, + "grad_norm": 1.7249791622161865, + "learning_rate": 4.9651976167685235e-05, + "loss": 5.4245, + "step": 8942 + }, + { + "epoch": 0.05318655438195832, + "grad_norm": 1.869367003440857, + "learning_rate": 4.9651898495536574e-05, + "loss": 5.2269, + "step": 8943 + }, + { + "epoch": 0.05319250166523932, + "grad_norm": 1.8296380043029785, + "learning_rate": 4.965182081478219e-05, + "loss": 5.3236, + "step": 8944 + }, + { + "epoch": 0.05319844894852031, + "grad_norm": 1.8211008310317993, + "learning_rate": 4.9651743125422115e-05, + "loss": 5.269, + "step": 8945 + }, + { + "epoch": 0.053204396231801315, + "grad_norm": 1.868295431137085, + "learning_rate": 4.965166542745637e-05, + "loss": 5.2733, + "step": 8946 + }, + { + "epoch": 0.05321034351508231, + "grad_norm": 1.6603426933288574, + "learning_rate": 4.965158772088498e-05, + "loss": 5.2685, + "step": 8947 + }, + { + "epoch": 0.053216290798363305, + "grad_norm": 1.680565357208252, + "learning_rate": 4.965151000570798e-05, + "loss": 5.4452, + "step": 8948 + }, + { + "epoch": 0.05322223808164431, + "grad_norm": 1.6473147869110107, + "learning_rate": 4.9651432281925394e-05, + "loss": 5.4476, + "step": 8949 + }, + { + "epoch": 0.0532281853649253, + "grad_norm": 1.5291423797607422, + "learning_rate": 4.965135454953724e-05, + "loss": 5.4617, + "step": 8950 + }, + { + "epoch": 0.0532341326482063, + "grad_norm": 1.4708455801010132, + "learning_rate": 4.965127680854356e-05, + "loss": 5.5431, + "step": 8951 + }, + { + "epoch": 0.0532400799314873, + "grad_norm": 1.4297362565994263, + "learning_rate": 4.9651199058944366e-05, + "loss": 5.431, + "step": 8952 + }, + { + "epoch": 0.053246027214768295, + "grad_norm": 1.726123571395874, + "learning_rate": 4.96511213007397e-05, + "loss": 5.2801, + "step": 8953 + }, + { + "epoch": 0.05325197449804929, + "grad_norm": 1.7977174520492554, + "learning_rate": 4.9651043533929584e-05, + "loss": 5.3273, + "step": 8954 + }, + { + "epoch": 0.053257921781330285, + "grad_norm": 1.8125461339950562, + "learning_rate": 4.9650965758514034e-05, + "loss": 5.3135, + "step": 8955 + }, + { + "epoch": 0.05326386906461129, + "grad_norm": 1.4925352334976196, + "learning_rate": 4.965088797449309e-05, + "loss": 5.1454, + "step": 8956 + }, + { + "epoch": 0.05326981634789228, + "grad_norm": 1.6977181434631348, + "learning_rate": 4.965081018186678e-05, + "loss": 5.3207, + "step": 8957 + }, + { + "epoch": 0.05327576363117328, + "grad_norm": 1.7767595052719116, + "learning_rate": 4.965073238063512e-05, + "loss": 5.203, + "step": 8958 + }, + { + "epoch": 0.05328171091445428, + "grad_norm": 1.53665292263031, + "learning_rate": 4.965065457079815e-05, + "loss": 5.3088, + "step": 8959 + }, + { + "epoch": 0.053287658197735274, + "grad_norm": 1.724476933479309, + "learning_rate": 4.965057675235589e-05, + "loss": 5.2628, + "step": 8960 + }, + { + "epoch": 0.05329360548101627, + "grad_norm": 1.7339463233947754, + "learning_rate": 4.965049892530837e-05, + "loss": 5.3174, + "step": 8961 + }, + { + "epoch": 0.05329955276429727, + "grad_norm": 1.8414005041122437, + "learning_rate": 4.965042108965561e-05, + "loss": 5.2121, + "step": 8962 + }, + { + "epoch": 0.053305500047578266, + "grad_norm": 1.7969903945922852, + "learning_rate": 4.9650343245397655e-05, + "loss": 5.0947, + "step": 8963 + }, + { + "epoch": 0.05331144733085926, + "grad_norm": 1.573320746421814, + "learning_rate": 4.965026539253451e-05, + "loss": 5.0624, + "step": 8964 + }, + { + "epoch": 0.053317394614140264, + "grad_norm": 1.7296351194381714, + "learning_rate": 4.9650187531066204e-05, + "loss": 5.5497, + "step": 8965 + }, + { + "epoch": 0.05332334189742126, + "grad_norm": 1.931847095489502, + "learning_rate": 4.9650109660992784e-05, + "loss": 5.537, + "step": 8966 + }, + { + "epoch": 0.053329289180702254, + "grad_norm": 1.8911564350128174, + "learning_rate": 4.965003178231427e-05, + "loss": 5.4891, + "step": 8967 + }, + { + "epoch": 0.053335236463983256, + "grad_norm": 1.933401107788086, + "learning_rate": 4.964995389503067e-05, + "loss": 5.3157, + "step": 8968 + }, + { + "epoch": 0.05334118374726425, + "grad_norm": 1.8299031257629395, + "learning_rate": 4.964987599914204e-05, + "loss": 5.2955, + "step": 8969 + }, + { + "epoch": 0.053347131030545246, + "grad_norm": 1.5823233127593994, + "learning_rate": 4.964979809464838e-05, + "loss": 5.2708, + "step": 8970 + }, + { + "epoch": 0.05335307831382624, + "grad_norm": 1.602689504623413, + "learning_rate": 4.9649720181549737e-05, + "loss": 5.3646, + "step": 8971 + }, + { + "epoch": 0.05335902559710724, + "grad_norm": 2.2379884719848633, + "learning_rate": 4.964964225984613e-05, + "loss": 5.5453, + "step": 8972 + }, + { + "epoch": 0.05336497288038824, + "grad_norm": 2.2210440635681152, + "learning_rate": 4.964956432953759e-05, + "loss": 5.2123, + "step": 8973 + }, + { + "epoch": 0.05337092016366923, + "grad_norm": 2.4450249671936035, + "learning_rate": 4.964948639062413e-05, + "loss": 5.172, + "step": 8974 + }, + { + "epoch": 0.053376867446950235, + "grad_norm": 1.7727516889572144, + "learning_rate": 4.9649408443105806e-05, + "loss": 5.3447, + "step": 8975 + }, + { + "epoch": 0.05338281473023123, + "grad_norm": 1.8239831924438477, + "learning_rate": 4.964933048698262e-05, + "loss": 5.3628, + "step": 8976 + }, + { + "epoch": 0.053388762013512225, + "grad_norm": 1.9517360925674438, + "learning_rate": 4.964925252225461e-05, + "loss": 5.6118, + "step": 8977 + }, + { + "epoch": 0.05339470929679323, + "grad_norm": 2.1735262870788574, + "learning_rate": 4.9649174548921796e-05, + "loss": 5.7332, + "step": 8978 + }, + { + "epoch": 0.05340065658007422, + "grad_norm": 1.4132062196731567, + "learning_rate": 4.964909656698421e-05, + "loss": 5.8078, + "step": 8979 + }, + { + "epoch": 0.05340660386335522, + "grad_norm": 1.5568846464157104, + "learning_rate": 4.964901857644188e-05, + "loss": 5.6328, + "step": 8980 + }, + { + "epoch": 0.05341255114663622, + "grad_norm": 1.6015586853027344, + "learning_rate": 4.964894057729484e-05, + "loss": 5.3738, + "step": 8981 + }, + { + "epoch": 0.053418498429917215, + "grad_norm": 1.492748737335205, + "learning_rate": 4.9648862569543105e-05, + "loss": 5.4336, + "step": 8982 + }, + { + "epoch": 0.05342444571319821, + "grad_norm": 1.9008845090866089, + "learning_rate": 4.96487845531867e-05, + "loss": 5.455, + "step": 8983 + }, + { + "epoch": 0.053430392996479205, + "grad_norm": 1.9590948820114136, + "learning_rate": 4.9648706528225664e-05, + "loss": 5.3308, + "step": 8984 + }, + { + "epoch": 0.05343634027976021, + "grad_norm": 1.9980428218841553, + "learning_rate": 4.964862849466002e-05, + "loss": 5.3777, + "step": 8985 + }, + { + "epoch": 0.0534422875630412, + "grad_norm": 1.769711971282959, + "learning_rate": 4.964855045248979e-05, + "loss": 5.4451, + "step": 8986 + }, + { + "epoch": 0.0534482348463222, + "grad_norm": 1.769977331161499, + "learning_rate": 4.964847240171502e-05, + "loss": 5.277, + "step": 8987 + }, + { + "epoch": 0.0534541821296032, + "grad_norm": 1.6647396087646484, + "learning_rate": 4.9648394342335705e-05, + "loss": 5.4655, + "step": 8988 + }, + { + "epoch": 0.053460129412884194, + "grad_norm": 1.861554503440857, + "learning_rate": 4.9648316274351906e-05, + "loss": 5.308, + "step": 8989 + }, + { + "epoch": 0.05346607669616519, + "grad_norm": 1.9457745552062988, + "learning_rate": 4.964823819776362e-05, + "loss": 6.2361, + "step": 8990 + }, + { + "epoch": 0.05347202397944619, + "grad_norm": 1.7702157497406006, + "learning_rate": 4.9648160112570896e-05, + "loss": 5.366, + "step": 8991 + }, + { + "epoch": 0.053477971262727186, + "grad_norm": 2.0074565410614014, + "learning_rate": 4.964808201877375e-05, + "loss": 5.3598, + "step": 8992 + }, + { + "epoch": 0.05348391854600818, + "grad_norm": 1.8686721324920654, + "learning_rate": 4.964800391637222e-05, + "loss": 5.4607, + "step": 8993 + }, + { + "epoch": 0.053489865829289183, + "grad_norm": 1.9749736785888672, + "learning_rate": 4.964792580536632e-05, + "loss": 5.3734, + "step": 8994 + }, + { + "epoch": 0.05349581311257018, + "grad_norm": 1.8435015678405762, + "learning_rate": 4.964784768575609e-05, + "loss": 5.3815, + "step": 8995 + }, + { + "epoch": 0.053501760395851174, + "grad_norm": 2.01983380317688, + "learning_rate": 4.9647769557541546e-05, + "loss": 5.4089, + "step": 8996 + }, + { + "epoch": 0.053507707679132176, + "grad_norm": 2.014798402786255, + "learning_rate": 4.964769142072272e-05, + "loss": 5.3906, + "step": 8997 + }, + { + "epoch": 0.05351365496241317, + "grad_norm": 1.8822753429412842, + "learning_rate": 4.9647613275299644e-05, + "loss": 5.3598, + "step": 8998 + }, + { + "epoch": 0.053519602245694166, + "grad_norm": 1.6534459590911865, + "learning_rate": 4.9647535121272334e-05, + "loss": 5.4577, + "step": 8999 + }, + { + "epoch": 0.05352554952897516, + "grad_norm": 1.6497015953063965, + "learning_rate": 4.964745695864083e-05, + "loss": 5.3915, + "step": 9000 + }, + { + "epoch": 0.05353149681225616, + "grad_norm": 1.5535780191421509, + "learning_rate": 4.964737878740515e-05, + "loss": 5.2444, + "step": 9001 + }, + { + "epoch": 0.05353744409553716, + "grad_norm": 1.6840674877166748, + "learning_rate": 4.964730060756533e-05, + "loss": 5.3439, + "step": 9002 + }, + { + "epoch": 0.05354339137881815, + "grad_norm": 1.7857226133346558, + "learning_rate": 4.9647222419121384e-05, + "loss": 5.3231, + "step": 9003 + }, + { + "epoch": 0.053549338662099155, + "grad_norm": 1.6067994832992554, + "learning_rate": 4.964714422207335e-05, + "loss": 5.4019, + "step": 9004 + }, + { + "epoch": 0.05355528594538015, + "grad_norm": 1.7026724815368652, + "learning_rate": 4.964706601642125e-05, + "loss": 5.2716, + "step": 9005 + }, + { + "epoch": 0.053561233228661145, + "grad_norm": 1.632804036140442, + "learning_rate": 4.964698780216512e-05, + "loss": 5.4132, + "step": 9006 + }, + { + "epoch": 0.05356718051194215, + "grad_norm": 1.6569499969482422, + "learning_rate": 4.964690957930498e-05, + "loss": 5.294, + "step": 9007 + }, + { + "epoch": 0.05357312779522314, + "grad_norm": 1.8141810894012451, + "learning_rate": 4.964683134784086e-05, + "loss": 5.3365, + "step": 9008 + }, + { + "epoch": 0.05357907507850414, + "grad_norm": 1.6555678844451904, + "learning_rate": 4.964675310777278e-05, + "loss": 5.3488, + "step": 9009 + }, + { + "epoch": 0.05358502236178514, + "grad_norm": 1.8363603353500366, + "learning_rate": 4.964667485910078e-05, + "loss": 5.3679, + "step": 9010 + }, + { + "epoch": 0.053590969645066135, + "grad_norm": 1.7839024066925049, + "learning_rate": 4.9646596601824874e-05, + "loss": 5.2514, + "step": 9011 + }, + { + "epoch": 0.05359691692834713, + "grad_norm": 1.8712091445922852, + "learning_rate": 4.96465183359451e-05, + "loss": 5.4313, + "step": 9012 + }, + { + "epoch": 0.053602864211628125, + "grad_norm": 1.9677501916885376, + "learning_rate": 4.964644006146148e-05, + "loss": 5.2442, + "step": 9013 + }, + { + "epoch": 0.05360881149490913, + "grad_norm": 1.8567090034484863, + "learning_rate": 4.964636177837404e-05, + "loss": 5.105, + "step": 9014 + }, + { + "epoch": 0.05361475877819012, + "grad_norm": 1.7319908142089844, + "learning_rate": 4.964628348668281e-05, + "loss": 5.2962, + "step": 9015 + }, + { + "epoch": 0.05362070606147112, + "grad_norm": 1.6412272453308105, + "learning_rate": 4.9646205186387824e-05, + "loss": 5.2302, + "step": 9016 + }, + { + "epoch": 0.05362665334475212, + "grad_norm": 1.9401088953018188, + "learning_rate": 4.96461268774891e-05, + "loss": 5.4425, + "step": 9017 + }, + { + "epoch": 0.053632600628033114, + "grad_norm": 1.7045506238937378, + "learning_rate": 4.964604855998666e-05, + "loss": 5.2325, + "step": 9018 + }, + { + "epoch": 0.05363854791131411, + "grad_norm": 1.8232519626617432, + "learning_rate": 4.9645970233880545e-05, + "loss": 5.5047, + "step": 9019 + }, + { + "epoch": 0.05364449519459511, + "grad_norm": 1.718833327293396, + "learning_rate": 4.964589189917077e-05, + "loss": 5.3323, + "step": 9020 + }, + { + "epoch": 0.053650442477876106, + "grad_norm": 1.608774185180664, + "learning_rate": 4.9645813555857376e-05, + "loss": 5.2374, + "step": 9021 + }, + { + "epoch": 0.0536563897611571, + "grad_norm": 1.6789363622665405, + "learning_rate": 4.964573520394039e-05, + "loss": 5.3291, + "step": 9022 + }, + { + "epoch": 0.0536623370444381, + "grad_norm": 1.6596689224243164, + "learning_rate": 4.964565684341982e-05, + "loss": 5.308, + "step": 9023 + }, + { + "epoch": 0.0536682843277191, + "grad_norm": 1.8141522407531738, + "learning_rate": 4.9645578474295703e-05, + "loss": 5.2033, + "step": 9024 + }, + { + "epoch": 0.053674231611000094, + "grad_norm": 1.428606390953064, + "learning_rate": 4.964550009656808e-05, + "loss": 5.2441, + "step": 9025 + }, + { + "epoch": 0.053680178894281096, + "grad_norm": 1.5033652782440186, + "learning_rate": 4.9645421710236965e-05, + "loss": 5.2132, + "step": 9026 + }, + { + "epoch": 0.05368612617756209, + "grad_norm": 1.7123147249221802, + "learning_rate": 4.9645343315302385e-05, + "loss": 5.3145, + "step": 9027 + }, + { + "epoch": 0.053692073460843086, + "grad_norm": 1.5851943492889404, + "learning_rate": 4.9645264911764376e-05, + "loss": 5.353, + "step": 9028 + }, + { + "epoch": 0.05369802074412408, + "grad_norm": 1.6627084016799927, + "learning_rate": 4.964518649962295e-05, + "loss": 5.1049, + "step": 9029 + }, + { + "epoch": 0.05370396802740508, + "grad_norm": 1.51585853099823, + "learning_rate": 4.964510807887815e-05, + "loss": 4.9433, + "step": 9030 + }, + { + "epoch": 0.05370991531068608, + "grad_norm": 1.7350785732269287, + "learning_rate": 4.964502964952999e-05, + "loss": 5.1761, + "step": 9031 + }, + { + "epoch": 0.05371586259396707, + "grad_norm": 1.925410509109497, + "learning_rate": 4.964495121157852e-05, + "loss": 5.0528, + "step": 9032 + }, + { + "epoch": 0.053721809877248075, + "grad_norm": 1.794162631034851, + "learning_rate": 4.964487276502374e-05, + "loss": 5.2009, + "step": 9033 + }, + { + "epoch": 0.05372775716052907, + "grad_norm": 1.6729109287261963, + "learning_rate": 4.964479430986569e-05, + "loss": 5.16, + "step": 9034 + }, + { + "epoch": 0.053733704443810065, + "grad_norm": 1.8543394804000854, + "learning_rate": 4.9644715846104406e-05, + "loss": 5.3545, + "step": 9035 + }, + { + "epoch": 0.05373965172709107, + "grad_norm": 1.6876883506774902, + "learning_rate": 4.96446373737399e-05, + "loss": 5.2074, + "step": 9036 + }, + { + "epoch": 0.05374559901037206, + "grad_norm": 1.816701054573059, + "learning_rate": 4.9644558892772205e-05, + "loss": 5.154, + "step": 9037 + }, + { + "epoch": 0.05375154629365306, + "grad_norm": 1.471283197402954, + "learning_rate": 4.964448040320135e-05, + "loss": 5.2577, + "step": 9038 + }, + { + "epoch": 0.05375749357693406, + "grad_norm": 1.5764297246932983, + "learning_rate": 4.964440190502736e-05, + "loss": 5.0115, + "step": 9039 + }, + { + "epoch": 0.053763440860215055, + "grad_norm": 1.6854795217514038, + "learning_rate": 4.964432339825027e-05, + "loss": 5.1957, + "step": 9040 + }, + { + "epoch": 0.05376938814349605, + "grad_norm": 1.889570951461792, + "learning_rate": 4.964424488287009e-05, + "loss": 5.1229, + "step": 9041 + }, + { + "epoch": 0.05377533542677705, + "grad_norm": 1.7528218030929565, + "learning_rate": 4.964416635888687e-05, + "loss": 5.0002, + "step": 9042 + }, + { + "epoch": 0.05378128271005805, + "grad_norm": 1.68081796169281, + "learning_rate": 4.964408782630062e-05, + "loss": 5.0567, + "step": 9043 + }, + { + "epoch": 0.05378722999333904, + "grad_norm": 1.6083979606628418, + "learning_rate": 4.9644009285111384e-05, + "loss": 5.0775, + "step": 9044 + }, + { + "epoch": 0.05379317727662004, + "grad_norm": 1.676720380783081, + "learning_rate": 4.9643930735319164e-05, + "loss": 5.0446, + "step": 9045 + }, + { + "epoch": 0.05379912455990104, + "grad_norm": 1.6502453088760376, + "learning_rate": 4.964385217692401e-05, + "loss": 5.3751, + "step": 9046 + }, + { + "epoch": 0.053805071843182034, + "grad_norm": 1.9226343631744385, + "learning_rate": 4.9643773609925935e-05, + "loss": 5.2442, + "step": 9047 + }, + { + "epoch": 0.05381101912646303, + "grad_norm": 1.8054014444351196, + "learning_rate": 4.964369503432498e-05, + "loss": 5.4844, + "step": 9048 + }, + { + "epoch": 0.05381696640974403, + "grad_norm": 1.5151008367538452, + "learning_rate": 4.9643616450121166e-05, + "loss": 5.2834, + "step": 9049 + }, + { + "epoch": 0.053822913693025026, + "grad_norm": 2.0237820148468018, + "learning_rate": 4.964353785731452e-05, + "loss": 5.3166, + "step": 9050 + }, + { + "epoch": 0.05382886097630602, + "grad_norm": 2.145364999771118, + "learning_rate": 4.964345925590507e-05, + "loss": 5.3803, + "step": 9051 + }, + { + "epoch": 0.05383480825958702, + "grad_norm": 1.747369408607483, + "learning_rate": 4.964338064589284e-05, + "loss": 6.1041, + "step": 9052 + }, + { + "epoch": 0.05384075554286802, + "grad_norm": 1.9964301586151123, + "learning_rate": 4.964330202727786e-05, + "loss": 5.1707, + "step": 9053 + }, + { + "epoch": 0.053846702826149014, + "grad_norm": 1.630233645439148, + "learning_rate": 4.9643223400060155e-05, + "loss": 4.9385, + "step": 9054 + }, + { + "epoch": 0.053852650109430016, + "grad_norm": 1.5782960653305054, + "learning_rate": 4.9643144764239765e-05, + "loss": 4.9953, + "step": 9055 + }, + { + "epoch": 0.05385859739271101, + "grad_norm": 2.1511783599853516, + "learning_rate": 4.9643066119816706e-05, + "loss": 5.4329, + "step": 9056 + }, + { + "epoch": 0.053864544675992006, + "grad_norm": 2.2133493423461914, + "learning_rate": 4.9642987466791004e-05, + "loss": 5.7347, + "step": 9057 + }, + { + "epoch": 0.053870491959273, + "grad_norm": 1.7669782638549805, + "learning_rate": 4.9642908805162686e-05, + "loss": 5.4129, + "step": 9058 + }, + { + "epoch": 0.053876439242554, + "grad_norm": 1.8005794286727905, + "learning_rate": 4.9642830134931787e-05, + "loss": 5.2397, + "step": 9059 + }, + { + "epoch": 0.053882386525835, + "grad_norm": 1.697607398033142, + "learning_rate": 4.9642751456098325e-05, + "loss": 5.3388, + "step": 9060 + }, + { + "epoch": 0.05388833380911599, + "grad_norm": 1.4916869401931763, + "learning_rate": 4.9642672768662344e-05, + "loss": 5.2574, + "step": 9061 + }, + { + "epoch": 0.053894281092396995, + "grad_norm": 1.7112784385681152, + "learning_rate": 4.964259407262385e-05, + "loss": 4.9881, + "step": 9062 + }, + { + "epoch": 0.05390022837567799, + "grad_norm": 1.4831846952438354, + "learning_rate": 4.964251536798289e-05, + "loss": 5.3976, + "step": 9063 + }, + { + "epoch": 0.053906175658958985, + "grad_norm": 1.626370906829834, + "learning_rate": 4.9642436654739476e-05, + "loss": 5.2409, + "step": 9064 + }, + { + "epoch": 0.05391212294223999, + "grad_norm": 1.7369413375854492, + "learning_rate": 4.964235793289365e-05, + "loss": 5.2732, + "step": 9065 + }, + { + "epoch": 0.05391807022552098, + "grad_norm": 1.7028629779815674, + "learning_rate": 4.964227920244542e-05, + "loss": 5.3161, + "step": 9066 + }, + { + "epoch": 0.05392401750880198, + "grad_norm": 1.9031678438186646, + "learning_rate": 4.964220046339483e-05, + "loss": 5.2517, + "step": 9067 + }, + { + "epoch": 0.05392996479208298, + "grad_norm": 1.8210735321044922, + "learning_rate": 4.96421217157419e-05, + "loss": 5.2819, + "step": 9068 + }, + { + "epoch": 0.053935912075363975, + "grad_norm": 1.7334645986557007, + "learning_rate": 4.9642042959486666e-05, + "loss": 5.4296, + "step": 9069 + }, + { + "epoch": 0.05394185935864497, + "grad_norm": 1.732790231704712, + "learning_rate": 4.964196419462914e-05, + "loss": 5.3589, + "step": 9070 + }, + { + "epoch": 0.05394780664192597, + "grad_norm": 1.417751669883728, + "learning_rate": 4.964188542116937e-05, + "loss": 5.0958, + "step": 9071 + }, + { + "epoch": 0.05395375392520697, + "grad_norm": 1.8562361001968384, + "learning_rate": 4.964180663910737e-05, + "loss": 5.2622, + "step": 9072 + }, + { + "epoch": 0.05395970120848796, + "grad_norm": 1.7366154193878174, + "learning_rate": 4.9641727848443166e-05, + "loss": 5.2329, + "step": 9073 + }, + { + "epoch": 0.05396564849176896, + "grad_norm": 1.8587182760238647, + "learning_rate": 4.9641649049176785e-05, + "loss": 4.9392, + "step": 9074 + }, + { + "epoch": 0.05397159577504996, + "grad_norm": 1.6152398586273193, + "learning_rate": 4.964157024130827e-05, + "loss": 5.473, + "step": 9075 + }, + { + "epoch": 0.053977543058330954, + "grad_norm": 1.5967273712158203, + "learning_rate": 4.9641491424837626e-05, + "loss": 5.2877, + "step": 9076 + }, + { + "epoch": 0.05398349034161195, + "grad_norm": 1.4986391067504883, + "learning_rate": 4.96414125997649e-05, + "loss": 5.2163, + "step": 9077 + }, + { + "epoch": 0.05398943762489295, + "grad_norm": 1.563905119895935, + "learning_rate": 4.964133376609011e-05, + "loss": 5.2043, + "step": 9078 + }, + { + "epoch": 0.053995384908173946, + "grad_norm": 1.5690317153930664, + "learning_rate": 4.964125492381329e-05, + "loss": 5.2226, + "step": 9079 + }, + { + "epoch": 0.05400133219145494, + "grad_norm": 1.7732517719268799, + "learning_rate": 4.9641176072934446e-05, + "loss": 5.3123, + "step": 9080 + }, + { + "epoch": 0.05400727947473594, + "grad_norm": 1.7045226097106934, + "learning_rate": 4.964109721345364e-05, + "loss": 5.0872, + "step": 9081 + }, + { + "epoch": 0.05401322675801694, + "grad_norm": 1.6405664682388306, + "learning_rate": 4.964101834537087e-05, + "loss": 5.3863, + "step": 9082 + }, + { + "epoch": 0.054019174041297933, + "grad_norm": 1.7410979270935059, + "learning_rate": 4.964093946868618e-05, + "loss": 5.0952, + "step": 9083 + }, + { + "epoch": 0.054025121324578936, + "grad_norm": 2.0102951526641846, + "learning_rate": 4.964086058339959e-05, + "loss": 4.9484, + "step": 9084 + }, + { + "epoch": 0.05403106860785993, + "grad_norm": 1.8228510618209839, + "learning_rate": 4.9640781689511133e-05, + "loss": 5.1141, + "step": 9085 + }, + { + "epoch": 0.054037015891140926, + "grad_norm": 1.7363582849502563, + "learning_rate": 4.964070278702083e-05, + "loss": 5.1164, + "step": 9086 + }, + { + "epoch": 0.05404296317442192, + "grad_norm": 1.6060153245925903, + "learning_rate": 4.9640623875928714e-05, + "loss": 5.1746, + "step": 9087 + }, + { + "epoch": 0.05404891045770292, + "grad_norm": 1.6690374612808228, + "learning_rate": 4.9640544956234814e-05, + "loss": 5.0931, + "step": 9088 + }, + { + "epoch": 0.05405485774098392, + "grad_norm": 1.613527774810791, + "learning_rate": 4.964046602793916e-05, + "loss": 5.2224, + "step": 9089 + }, + { + "epoch": 0.05406080502426491, + "grad_norm": 1.6461642980575562, + "learning_rate": 4.964038709104176e-05, + "loss": 5.3175, + "step": 9090 + }, + { + "epoch": 0.054066752307545915, + "grad_norm": 1.839709758758545, + "learning_rate": 4.9640308145542664e-05, + "loss": 5.3247, + "step": 9091 + }, + { + "epoch": 0.05407269959082691, + "grad_norm": 1.8977348804473877, + "learning_rate": 4.9640229191441886e-05, + "loss": 5.4256, + "step": 9092 + }, + { + "epoch": 0.054078646874107905, + "grad_norm": 1.9805532693862915, + "learning_rate": 4.9640150228739454e-05, + "loss": 4.9413, + "step": 9093 + }, + { + "epoch": 0.05408459415738891, + "grad_norm": 2.0237114429473877, + "learning_rate": 4.964007125743542e-05, + "loss": 4.8808, + "step": 9094 + }, + { + "epoch": 0.0540905414406699, + "grad_norm": 1.9848511219024658, + "learning_rate": 4.963999227752977e-05, + "loss": 5.0295, + "step": 9095 + }, + { + "epoch": 0.0540964887239509, + "grad_norm": 1.925876498222351, + "learning_rate": 4.9639913289022564e-05, + "loss": 5.0129, + "step": 9096 + }, + { + "epoch": 0.0541024360072319, + "grad_norm": 1.4887725114822388, + "learning_rate": 4.963983429191382e-05, + "loss": 4.9706, + "step": 9097 + }, + { + "epoch": 0.054108383290512894, + "grad_norm": 1.615160584449768, + "learning_rate": 4.963975528620356e-05, + "loss": 5.0066, + "step": 9098 + }, + { + "epoch": 0.05411433057379389, + "grad_norm": 1.969086766242981, + "learning_rate": 4.9639676271891816e-05, + "loss": 4.9539, + "step": 9099 + }, + { + "epoch": 0.05412027785707489, + "grad_norm": 1.8290555477142334, + "learning_rate": 4.963959724897862e-05, + "loss": 5.2467, + "step": 9100 + }, + { + "epoch": 0.05412622514035589, + "grad_norm": 2.004157066345215, + "learning_rate": 4.963951821746399e-05, + "loss": 4.8, + "step": 9101 + }, + { + "epoch": 0.05413217242363688, + "grad_norm": 1.9732778072357178, + "learning_rate": 4.9639439177347955e-05, + "loss": 4.8828, + "step": 9102 + }, + { + "epoch": 0.05413811970691788, + "grad_norm": 1.8653557300567627, + "learning_rate": 4.963936012863056e-05, + "loss": 5.0591, + "step": 9103 + }, + { + "epoch": 0.05414406699019888, + "grad_norm": 1.7854375839233398, + "learning_rate": 4.9639281071311804e-05, + "loss": 5.0914, + "step": 9104 + }, + { + "epoch": 0.054150014273479874, + "grad_norm": 1.7956377267837524, + "learning_rate": 4.963920200539174e-05, + "loss": 5.3484, + "step": 9105 + }, + { + "epoch": 0.05415596155676087, + "grad_norm": 1.7851346731185913, + "learning_rate": 4.963912293087039e-05, + "loss": 5.3146, + "step": 9106 + }, + { + "epoch": 0.05416190884004187, + "grad_norm": 1.72859787940979, + "learning_rate": 4.9639043847747756e-05, + "loss": 5.1611, + "step": 9107 + }, + { + "epoch": 0.054167856123322866, + "grad_norm": 1.5961265563964844, + "learning_rate": 4.9638964756023904e-05, + "loss": 5.247, + "step": 9108 + }, + { + "epoch": 0.05417380340660386, + "grad_norm": 1.7507922649383545, + "learning_rate": 4.963888565569884e-05, + "loss": 5.2011, + "step": 9109 + }, + { + "epoch": 0.05417975068988486, + "grad_norm": 1.8338440656661987, + "learning_rate": 4.9638806546772594e-05, + "loss": 5.2413, + "step": 9110 + }, + { + "epoch": 0.05418569797316586, + "grad_norm": 1.8935306072235107, + "learning_rate": 4.963872742924519e-05, + "loss": 5.1042, + "step": 9111 + }, + { + "epoch": 0.05419164525644685, + "grad_norm": 1.6512808799743652, + "learning_rate": 4.963864830311667e-05, + "loss": 5.2437, + "step": 9112 + }, + { + "epoch": 0.054197592539727855, + "grad_norm": 1.6099332571029663, + "learning_rate": 4.963856916838705e-05, + "loss": 5.2828, + "step": 9113 + }, + { + "epoch": 0.05420353982300885, + "grad_norm": 2.114581823348999, + "learning_rate": 4.9638490025056355e-05, + "loss": 6.1534, + "step": 9114 + }, + { + "epoch": 0.054209487106289846, + "grad_norm": 1.762335181236267, + "learning_rate": 4.963841087312462e-05, + "loss": 5.1504, + "step": 9115 + }, + { + "epoch": 0.05421543438957084, + "grad_norm": 1.7669222354888916, + "learning_rate": 4.963833171259187e-05, + "loss": 5.0365, + "step": 9116 + }, + { + "epoch": 0.05422138167285184, + "grad_norm": 1.7319819927215576, + "learning_rate": 4.963825254345814e-05, + "loss": 5.0724, + "step": 9117 + }, + { + "epoch": 0.05422732895613284, + "grad_norm": 1.618116021156311, + "learning_rate": 4.9638173365723444e-05, + "loss": 5.0964, + "step": 9118 + }, + { + "epoch": 0.05423327623941383, + "grad_norm": 1.6506006717681885, + "learning_rate": 4.9638094179387814e-05, + "loss": 5.1189, + "step": 9119 + }, + { + "epoch": 0.054239223522694835, + "grad_norm": 1.7512328624725342, + "learning_rate": 4.963801498445129e-05, + "loss": 5.2732, + "step": 9120 + }, + { + "epoch": 0.05424517080597583, + "grad_norm": 1.5639985799789429, + "learning_rate": 4.963793578091388e-05, + "loss": 5.0718, + "step": 9121 + }, + { + "epoch": 0.054251118089256825, + "grad_norm": 1.7059093713760376, + "learning_rate": 4.963785656877562e-05, + "loss": 5.0744, + "step": 9122 + }, + { + "epoch": 0.05425706537253783, + "grad_norm": 1.574802279472351, + "learning_rate": 4.9637777348036546e-05, + "loss": 5.2663, + "step": 9123 + }, + { + "epoch": 0.05426301265581882, + "grad_norm": 1.7343204021453857, + "learning_rate": 4.9637698118696674e-05, + "loss": 5.0805, + "step": 9124 + }, + { + "epoch": 0.05426895993909982, + "grad_norm": 1.6154165267944336, + "learning_rate": 4.963761888075604e-05, + "loss": 5.1402, + "step": 9125 + }, + { + "epoch": 0.05427490722238082, + "grad_norm": 1.6474148035049438, + "learning_rate": 4.9637539634214666e-05, + "loss": 5.0601, + "step": 9126 + }, + { + "epoch": 0.054280854505661814, + "grad_norm": 1.7573519945144653, + "learning_rate": 4.963746037907258e-05, + "loss": 5.1846, + "step": 9127 + }, + { + "epoch": 0.05428680178894281, + "grad_norm": 1.4558652639389038, + "learning_rate": 4.963738111532981e-05, + "loss": 5.3132, + "step": 9128 + }, + { + "epoch": 0.05429274907222381, + "grad_norm": 1.6261000633239746, + "learning_rate": 4.963730184298639e-05, + "loss": 5.2843, + "step": 9129 + }, + { + "epoch": 0.05429869635550481, + "grad_norm": 1.4502191543579102, + "learning_rate": 4.963722256204234e-05, + "loss": 5.14, + "step": 9130 + }, + { + "epoch": 0.0543046436387858, + "grad_norm": 1.6366747617721558, + "learning_rate": 4.9637143272497686e-05, + "loss": 5.1496, + "step": 9131 + }, + { + "epoch": 0.0543105909220668, + "grad_norm": 1.603745698928833, + "learning_rate": 4.963706397435246e-05, + "loss": 5.0644, + "step": 9132 + }, + { + "epoch": 0.0543165382053478, + "grad_norm": 1.419536828994751, + "learning_rate": 4.963698466760669e-05, + "loss": 5.3182, + "step": 9133 + }, + { + "epoch": 0.054322485488628794, + "grad_norm": 1.511765480041504, + "learning_rate": 4.963690535226041e-05, + "loss": 5.2808, + "step": 9134 + }, + { + "epoch": 0.05432843277190979, + "grad_norm": 1.4999688863754272, + "learning_rate": 4.963682602831364e-05, + "loss": 4.9235, + "step": 9135 + }, + { + "epoch": 0.05433438005519079, + "grad_norm": 1.5918420553207397, + "learning_rate": 4.96367466957664e-05, + "loss": 4.9293, + "step": 9136 + }, + { + "epoch": 0.054340327338471786, + "grad_norm": 1.502748727798462, + "learning_rate": 4.963666735461874e-05, + "loss": 5.2692, + "step": 9137 + }, + { + "epoch": 0.05434627462175278, + "grad_norm": 1.6474169492721558, + "learning_rate": 4.963658800487066e-05, + "loss": 5.1638, + "step": 9138 + }, + { + "epoch": 0.05435222190503378, + "grad_norm": 2.0195884704589844, + "learning_rate": 4.9636508646522204e-05, + "loss": 5.1085, + "step": 9139 + }, + { + "epoch": 0.05435816918831478, + "grad_norm": 1.7266180515289307, + "learning_rate": 4.9636429279573406e-05, + "loss": 5.0747, + "step": 9140 + }, + { + "epoch": 0.05436411647159577, + "grad_norm": 1.6965065002441406, + "learning_rate": 4.963634990402428e-05, + "loss": 5.1246, + "step": 9141 + }, + { + "epoch": 0.054370063754876775, + "grad_norm": 1.7629759311676025, + "learning_rate": 4.9636270519874856e-05, + "loss": 5.274, + "step": 9142 + }, + { + "epoch": 0.05437601103815777, + "grad_norm": 1.6365042924880981, + "learning_rate": 4.9636191127125164e-05, + "loss": 5.2469, + "step": 9143 + }, + { + "epoch": 0.054381958321438766, + "grad_norm": 1.6777831315994263, + "learning_rate": 4.9636111725775235e-05, + "loss": 5.3041, + "step": 9144 + }, + { + "epoch": 0.05438790560471976, + "grad_norm": 1.5354039669036865, + "learning_rate": 4.9636032315825096e-05, + "loss": 5.1799, + "step": 9145 + }, + { + "epoch": 0.05439385288800076, + "grad_norm": 1.508083701133728, + "learning_rate": 4.9635952897274773e-05, + "loss": 5.0822, + "step": 9146 + }, + { + "epoch": 0.05439980017128176, + "grad_norm": 1.5960441827774048, + "learning_rate": 4.963587347012429e-05, + "loss": 5.1618, + "step": 9147 + }, + { + "epoch": 0.05440574745456275, + "grad_norm": 1.4927520751953125, + "learning_rate": 4.9635794034373675e-05, + "loss": 5.1464, + "step": 9148 + }, + { + "epoch": 0.054411694737843755, + "grad_norm": 1.7420401573181152, + "learning_rate": 4.9635714590022966e-05, + "loss": 5.2866, + "step": 9149 + }, + { + "epoch": 0.05441764202112475, + "grad_norm": 1.7907800674438477, + "learning_rate": 4.9635635137072176e-05, + "loss": 5.1042, + "step": 9150 + }, + { + "epoch": 0.054423589304405745, + "grad_norm": 1.7073547840118408, + "learning_rate": 4.963555567552135e-05, + "loss": 5.1986, + "step": 9151 + }, + { + "epoch": 0.05442953658768675, + "grad_norm": 1.894405484199524, + "learning_rate": 4.96354762053705e-05, + "loss": 5.225, + "step": 9152 + }, + { + "epoch": 0.05443548387096774, + "grad_norm": 1.5830878019332886, + "learning_rate": 4.9635396726619656e-05, + "loss": 5.2902, + "step": 9153 + }, + { + "epoch": 0.05444143115424874, + "grad_norm": 1.5435214042663574, + "learning_rate": 4.963531723926885e-05, + "loss": 5.0773, + "step": 9154 + }, + { + "epoch": 0.05444737843752974, + "grad_norm": 1.4262596368789673, + "learning_rate": 4.9635237743318117e-05, + "loss": 5.129, + "step": 9155 + }, + { + "epoch": 0.054453325720810734, + "grad_norm": 1.5793390274047852, + "learning_rate": 4.9635158238767475e-05, + "loss": 5.1693, + "step": 9156 + }, + { + "epoch": 0.05445927300409173, + "grad_norm": 1.767318606376648, + "learning_rate": 4.963507872561695e-05, + "loss": 5.2541, + "step": 9157 + }, + { + "epoch": 0.05446522028737273, + "grad_norm": 1.5084065198898315, + "learning_rate": 4.963499920386658e-05, + "loss": 5.2531, + "step": 9158 + }, + { + "epoch": 0.05447116757065373, + "grad_norm": 1.797877311706543, + "learning_rate": 4.963491967351638e-05, + "loss": 5.2278, + "step": 9159 + }, + { + "epoch": 0.05447711485393472, + "grad_norm": 1.7463361024856567, + "learning_rate": 4.963484013456639e-05, + "loss": 5.1005, + "step": 9160 + }, + { + "epoch": 0.05448306213721572, + "grad_norm": 1.8208277225494385, + "learning_rate": 4.9634760587016626e-05, + "loss": 5.1437, + "step": 9161 + }, + { + "epoch": 0.05448900942049672, + "grad_norm": 1.9020015001296997, + "learning_rate": 4.9634681030867116e-05, + "loss": 5.1554, + "step": 9162 + }, + { + "epoch": 0.054494956703777714, + "grad_norm": 1.8370200395584106, + "learning_rate": 4.9634601466117904e-05, + "loss": 5.2418, + "step": 9163 + }, + { + "epoch": 0.05450090398705871, + "grad_norm": 1.785875678062439, + "learning_rate": 4.9634521892769004e-05, + "loss": 5.1916, + "step": 9164 + }, + { + "epoch": 0.05450685127033971, + "grad_norm": 1.7501643896102905, + "learning_rate": 4.963444231082045e-05, + "loss": 5.0887, + "step": 9165 + }, + { + "epoch": 0.054512798553620706, + "grad_norm": 1.6924220323562622, + "learning_rate": 4.963436272027227e-05, + "loss": 5.2458, + "step": 9166 + }, + { + "epoch": 0.0545187458369017, + "grad_norm": 1.895605206489563, + "learning_rate": 4.963428312112447e-05, + "loss": 5.1286, + "step": 9167 + }, + { + "epoch": 0.0545246931201827, + "grad_norm": 1.842207908630371, + "learning_rate": 4.963420351337711e-05, + "loss": 5.1177, + "step": 9168 + }, + { + "epoch": 0.0545306404034637, + "grad_norm": 1.7467048168182373, + "learning_rate": 4.963412389703021e-05, + "loss": 5.1616, + "step": 9169 + }, + { + "epoch": 0.05453658768674469, + "grad_norm": 1.8047499656677246, + "learning_rate": 4.963404427208378e-05, + "loss": 5.0543, + "step": 9170 + }, + { + "epoch": 0.054542534970025695, + "grad_norm": 1.5830637216567993, + "learning_rate": 4.963396463853786e-05, + "loss": 5.0989, + "step": 9171 + }, + { + "epoch": 0.05454848225330669, + "grad_norm": 1.7481937408447266, + "learning_rate": 4.9633884996392485e-05, + "loss": 5.1686, + "step": 9172 + }, + { + "epoch": 0.054554429536587686, + "grad_norm": 1.7132925987243652, + "learning_rate": 4.9633805345647664e-05, + "loss": 4.9683, + "step": 9173 + }, + { + "epoch": 0.05456037681986868, + "grad_norm": 1.8369117975234985, + "learning_rate": 4.9633725686303445e-05, + "loss": 5.154, + "step": 9174 + }, + { + "epoch": 0.05456632410314968, + "grad_norm": 1.615011215209961, + "learning_rate": 4.963364601835985e-05, + "loss": 5.0982, + "step": 9175 + }, + { + "epoch": 0.05457227138643068, + "grad_norm": 1.853742003440857, + "learning_rate": 4.963356634181689e-05, + "loss": 6.0599, + "step": 9176 + }, + { + "epoch": 0.05457821866971167, + "grad_norm": 1.5529752969741821, + "learning_rate": 4.963348665667462e-05, + "loss": 5.1355, + "step": 9177 + }, + { + "epoch": 0.054584165952992675, + "grad_norm": 1.5113881826400757, + "learning_rate": 4.963340696293305e-05, + "loss": 5.1947, + "step": 9178 + }, + { + "epoch": 0.05459011323627367, + "grad_norm": 1.6840931177139282, + "learning_rate": 4.963332726059221e-05, + "loss": 5.2163, + "step": 9179 + }, + { + "epoch": 0.054596060519554665, + "grad_norm": 1.7720422744750977, + "learning_rate": 4.963324754965214e-05, + "loss": 5.4737, + "step": 9180 + }, + { + "epoch": 0.05460200780283567, + "grad_norm": 1.632574200630188, + "learning_rate": 4.963316783011285e-05, + "loss": 5.2274, + "step": 9181 + }, + { + "epoch": 0.05460795508611666, + "grad_norm": 1.5859557390213013, + "learning_rate": 4.963308810197437e-05, + "loss": 5.3503, + "step": 9182 + }, + { + "epoch": 0.05461390236939766, + "grad_norm": 1.8342604637145996, + "learning_rate": 4.963300836523674e-05, + "loss": 5.1967, + "step": 9183 + }, + { + "epoch": 0.05461984965267866, + "grad_norm": 1.7443957328796387, + "learning_rate": 4.963292861989998e-05, + "loss": 5.0935, + "step": 9184 + }, + { + "epoch": 0.054625796935959654, + "grad_norm": 1.9289584159851074, + "learning_rate": 4.963284886596412e-05, + "loss": 5.1817, + "step": 9185 + }, + { + "epoch": 0.05463174421924065, + "grad_norm": 1.8695822954177856, + "learning_rate": 4.9632769103429186e-05, + "loss": 5.4304, + "step": 9186 + }, + { + "epoch": 0.05463769150252165, + "grad_norm": 1.6274856328964233, + "learning_rate": 4.9632689332295206e-05, + "loss": 5.3924, + "step": 9187 + }, + { + "epoch": 0.054643638785802646, + "grad_norm": 1.6061500310897827, + "learning_rate": 4.963260955256221e-05, + "loss": 5.2309, + "step": 9188 + }, + { + "epoch": 0.05464958606908364, + "grad_norm": 1.5478893518447876, + "learning_rate": 4.963252976423022e-05, + "loss": 5.2615, + "step": 9189 + }, + { + "epoch": 0.05465553335236464, + "grad_norm": 1.4304052591323853, + "learning_rate": 4.9632449967299276e-05, + "loss": 5.2116, + "step": 9190 + }, + { + "epoch": 0.05466148063564564, + "grad_norm": 1.5438693761825562, + "learning_rate": 4.9632370161769395e-05, + "loss": 5.1176, + "step": 9191 + }, + { + "epoch": 0.054667427918926634, + "grad_norm": 1.6602065563201904, + "learning_rate": 4.9632290347640606e-05, + "loss": 5.1521, + "step": 9192 + }, + { + "epoch": 0.05467337520220763, + "grad_norm": 1.530038595199585, + "learning_rate": 4.9632210524912934e-05, + "loss": 5.1437, + "step": 9193 + }, + { + "epoch": 0.05467932248548863, + "grad_norm": 1.617691159248352, + "learning_rate": 4.963213069358643e-05, + "loss": 5.0601, + "step": 9194 + }, + { + "epoch": 0.054685269768769626, + "grad_norm": 1.722401738166809, + "learning_rate": 4.963205085366108e-05, + "loss": 5.2664, + "step": 9195 + }, + { + "epoch": 0.05469121705205062, + "grad_norm": 1.803673267364502, + "learning_rate": 4.963197100513696e-05, + "loss": 5.4164, + "step": 9196 + }, + { + "epoch": 0.05469716433533162, + "grad_norm": 1.8565739393234253, + "learning_rate": 4.963189114801405e-05, + "loss": 5.225, + "step": 9197 + }, + { + "epoch": 0.05470311161861262, + "grad_norm": 1.780698299407959, + "learning_rate": 4.963181128229242e-05, + "loss": 5.1694, + "step": 9198 + }, + { + "epoch": 0.05470905890189361, + "grad_norm": 1.820416808128357, + "learning_rate": 4.963173140797207e-05, + "loss": 5.3305, + "step": 9199 + }, + { + "epoch": 0.054715006185174615, + "grad_norm": 1.471983551979065, + "learning_rate": 4.963165152505304e-05, + "loss": 5.3217, + "step": 9200 + }, + { + "epoch": 0.05472095346845561, + "grad_norm": 1.504616141319275, + "learning_rate": 4.9631571633535354e-05, + "loss": 5.3349, + "step": 9201 + }, + { + "epoch": 0.054726900751736605, + "grad_norm": 1.5888862609863281, + "learning_rate": 4.963149173341903e-05, + "loss": 5.3431, + "step": 9202 + }, + { + "epoch": 0.0547328480350176, + "grad_norm": 1.6633155345916748, + "learning_rate": 4.963141182470412e-05, + "loss": 5.2678, + "step": 9203 + }, + { + "epoch": 0.0547387953182986, + "grad_norm": 1.7259690761566162, + "learning_rate": 4.9631331907390636e-05, + "loss": 5.348, + "step": 9204 + }, + { + "epoch": 0.0547447426015796, + "grad_norm": 1.703925371170044, + "learning_rate": 4.963125198147861e-05, + "loss": 5.4123, + "step": 9205 + }, + { + "epoch": 0.05475068988486059, + "grad_norm": 1.6619760990142822, + "learning_rate": 4.963117204696807e-05, + "loss": 5.1732, + "step": 9206 + }, + { + "epoch": 0.054756637168141595, + "grad_norm": 1.7368190288543701, + "learning_rate": 4.963109210385903e-05, + "loss": 5.0843, + "step": 9207 + }, + { + "epoch": 0.05476258445142259, + "grad_norm": 1.781179666519165, + "learning_rate": 4.9631012152151545e-05, + "loss": 5.1343, + "step": 9208 + }, + { + "epoch": 0.054768531734703585, + "grad_norm": 1.674793004989624, + "learning_rate": 4.9630932191845624e-05, + "loss": 5.4079, + "step": 9209 + }, + { + "epoch": 0.05477447901798459, + "grad_norm": 1.7708344459533691, + "learning_rate": 4.9630852222941296e-05, + "loss": 4.9702, + "step": 9210 + }, + { + "epoch": 0.05478042630126558, + "grad_norm": 1.684725046157837, + "learning_rate": 4.9630772245438594e-05, + "loss": 5.263, + "step": 9211 + }, + { + "epoch": 0.05478637358454658, + "grad_norm": 1.6064784526824951, + "learning_rate": 4.963069225933754e-05, + "loss": 5.3402, + "step": 9212 + }, + { + "epoch": 0.05479232086782758, + "grad_norm": 1.5189318656921387, + "learning_rate": 4.963061226463816e-05, + "loss": 5.1928, + "step": 9213 + }, + { + "epoch": 0.054798268151108574, + "grad_norm": 1.8095827102661133, + "learning_rate": 4.96305322613405e-05, + "loss": 5.262, + "step": 9214 + }, + { + "epoch": 0.05480421543438957, + "grad_norm": 1.8325434923171997, + "learning_rate": 4.963045224944458e-05, + "loss": 5.4975, + "step": 9215 + }, + { + "epoch": 0.05481016271767057, + "grad_norm": 1.6597868204116821, + "learning_rate": 4.963037222895042e-05, + "loss": 5.6232, + "step": 9216 + }, + { + "epoch": 0.054816110000951566, + "grad_norm": 1.6402417421340942, + "learning_rate": 4.9630292199858044e-05, + "loss": 5.5358, + "step": 9217 + }, + { + "epoch": 0.05482205728423256, + "grad_norm": 1.3956371545791626, + "learning_rate": 4.963021216216749e-05, + "loss": 5.2563, + "step": 9218 + }, + { + "epoch": 0.05482800456751356, + "grad_norm": 1.5958374738693237, + "learning_rate": 4.963013211587878e-05, + "loss": 5.1539, + "step": 9219 + }, + { + "epoch": 0.05483395185079456, + "grad_norm": 1.6152080297470093, + "learning_rate": 4.963005206099195e-05, + "loss": 5.4025, + "step": 9220 + }, + { + "epoch": 0.054839899134075554, + "grad_norm": 1.392427921295166, + "learning_rate": 4.962997199750702e-05, + "loss": 5.4149, + "step": 9221 + }, + { + "epoch": 0.05484584641735655, + "grad_norm": 1.5625338554382324, + "learning_rate": 4.962989192542403e-05, + "loss": 5.5837, + "step": 9222 + }, + { + "epoch": 0.05485179370063755, + "grad_norm": 1.6465163230895996, + "learning_rate": 4.962981184474299e-05, + "loss": 5.2934, + "step": 9223 + }, + { + "epoch": 0.054857740983918546, + "grad_norm": 1.5344611406326294, + "learning_rate": 4.962973175546394e-05, + "loss": 5.4734, + "step": 9224 + }, + { + "epoch": 0.05486368826719954, + "grad_norm": 1.2378648519515991, + "learning_rate": 4.962965165758691e-05, + "loss": 5.3368, + "step": 9225 + }, + { + "epoch": 0.05486963555048054, + "grad_norm": 1.396785020828247, + "learning_rate": 4.9629571551111915e-05, + "loss": 5.3163, + "step": 9226 + }, + { + "epoch": 0.05487558283376154, + "grad_norm": 1.639452338218689, + "learning_rate": 4.9629491436038994e-05, + "loss": 5.3933, + "step": 9227 + }, + { + "epoch": 0.05488153011704253, + "grad_norm": 1.5648834705352783, + "learning_rate": 4.9629411312368166e-05, + "loss": 5.3717, + "step": 9228 + }, + { + "epoch": 0.054887477400323535, + "grad_norm": 1.4774922132492065, + "learning_rate": 4.962933118009947e-05, + "loss": 5.1318, + "step": 9229 + }, + { + "epoch": 0.05489342468360453, + "grad_norm": 1.4987083673477173, + "learning_rate": 4.9629251039232935e-05, + "loss": 5.1436, + "step": 9230 + }, + { + "epoch": 0.054899371966885525, + "grad_norm": 1.660605788230896, + "learning_rate": 4.9629170889768586e-05, + "loss": 5.1841, + "step": 9231 + }, + { + "epoch": 0.05490531925016652, + "grad_norm": 1.4441273212432861, + "learning_rate": 4.962909073170643e-05, + "loss": 5.3108, + "step": 9232 + }, + { + "epoch": 0.05491126653344752, + "grad_norm": 1.3297922611236572, + "learning_rate": 4.962901056504653e-05, + "loss": 5.1441, + "step": 9233 + }, + { + "epoch": 0.05491721381672852, + "grad_norm": 1.2989814281463623, + "learning_rate": 4.9628930389788886e-05, + "loss": 5.5146, + "step": 9234 + }, + { + "epoch": 0.05492316110000951, + "grad_norm": 1.350948452949524, + "learning_rate": 4.962885020593354e-05, + "loss": 5.2832, + "step": 9235 + }, + { + "epoch": 0.054929108383290515, + "grad_norm": 1.5801438093185425, + "learning_rate": 4.962877001348052e-05, + "loss": 5.4251, + "step": 9236 + }, + { + "epoch": 0.05493505566657151, + "grad_norm": 1.4355653524398804, + "learning_rate": 4.9628689812429854e-05, + "loss": 5.4092, + "step": 9237 + }, + { + "epoch": 0.054941002949852505, + "grad_norm": 1.692746639251709, + "learning_rate": 4.962860960278156e-05, + "loss": 5.3858, + "step": 9238 + }, + { + "epoch": 0.05494695023313351, + "grad_norm": 1.5125641822814941, + "learning_rate": 4.962852938453567e-05, + "loss": 5.6584, + "step": 9239 + }, + { + "epoch": 0.0549528975164145, + "grad_norm": 1.4158848524093628, + "learning_rate": 4.962844915769221e-05, + "loss": 5.652, + "step": 9240 + }, + { + "epoch": 0.0549588447996955, + "grad_norm": 1.314286231994629, + "learning_rate": 4.9628368922251235e-05, + "loss": 5.501, + "step": 9241 + }, + { + "epoch": 0.0549647920829765, + "grad_norm": 1.4003247022628784, + "learning_rate": 4.962828867821273e-05, + "loss": 5.448, + "step": 9242 + }, + { + "epoch": 0.054970739366257494, + "grad_norm": 1.7670220136642456, + "learning_rate": 4.962820842557675e-05, + "loss": 5.4854, + "step": 9243 + }, + { + "epoch": 0.05497668664953849, + "grad_norm": 1.9435075521469116, + "learning_rate": 4.962812816434332e-05, + "loss": 5.3824, + "step": 9244 + }, + { + "epoch": 0.05498263393281949, + "grad_norm": 2.1733458042144775, + "learning_rate": 4.9628047894512466e-05, + "loss": 5.6771, + "step": 9245 + }, + { + "epoch": 0.054988581216100486, + "grad_norm": 1.5455420017242432, + "learning_rate": 4.962796761608421e-05, + "loss": 5.4634, + "step": 9246 + }, + { + "epoch": 0.05499452849938148, + "grad_norm": 1.623382806777954, + "learning_rate": 4.962788732905859e-05, + "loss": 5.8441, + "step": 9247 + }, + { + "epoch": 0.05500047578266248, + "grad_norm": 1.928788423538208, + "learning_rate": 4.962780703343563e-05, + "loss": 5.6553, + "step": 9248 + }, + { + "epoch": 0.05500642306594348, + "grad_norm": 1.660984992980957, + "learning_rate": 4.962772672921535e-05, + "loss": 5.5953, + "step": 9249 + }, + { + "epoch": 0.055012370349224474, + "grad_norm": 2.081026792526245, + "learning_rate": 4.962764641639779e-05, + "loss": 5.7065, + "step": 9250 + }, + { + "epoch": 0.05501831763250547, + "grad_norm": 1.8750234842300415, + "learning_rate": 4.962756609498297e-05, + "loss": 5.8814, + "step": 9251 + }, + { + "epoch": 0.05502426491578647, + "grad_norm": 1.9573127031326294, + "learning_rate": 4.9627485764970916e-05, + "loss": 5.7415, + "step": 9252 + }, + { + "epoch": 0.055030212199067466, + "grad_norm": 1.7536600828170776, + "learning_rate": 4.962740542636167e-05, + "loss": 5.5638, + "step": 9253 + }, + { + "epoch": 0.05503615948234846, + "grad_norm": 1.692557692527771, + "learning_rate": 4.962732507915525e-05, + "loss": 5.5362, + "step": 9254 + }, + { + "epoch": 0.05504210676562946, + "grad_norm": 1.9066821336746216, + "learning_rate": 4.962724472335168e-05, + "loss": 5.3094, + "step": 9255 + }, + { + "epoch": 0.05504805404891046, + "grad_norm": 2.069007158279419, + "learning_rate": 4.9627164358951e-05, + "loss": 5.766, + "step": 9256 + }, + { + "epoch": 0.05505400133219145, + "grad_norm": 2.0293545722961426, + "learning_rate": 4.9627083985953227e-05, + "loss": 5.7769, + "step": 9257 + }, + { + "epoch": 0.055059948615472455, + "grad_norm": 1.7953507900238037, + "learning_rate": 4.962700360435839e-05, + "loss": 5.8435, + "step": 9258 + }, + { + "epoch": 0.05506589589875345, + "grad_norm": 1.9281821250915527, + "learning_rate": 4.9626923214166535e-05, + "loss": 5.8342, + "step": 9259 + }, + { + "epoch": 0.055071843182034445, + "grad_norm": 1.4612617492675781, + "learning_rate": 4.962684281537766e-05, + "loss": 5.8273, + "step": 9260 + }, + { + "epoch": 0.05507779046531545, + "grad_norm": 1.8589900732040405, + "learning_rate": 4.9626762407991817e-05, + "loss": 5.7607, + "step": 9261 + }, + { + "epoch": 0.05508373774859644, + "grad_norm": 1.9395030736923218, + "learning_rate": 4.9626681992009025e-05, + "loss": 5.7573, + "step": 9262 + }, + { + "epoch": 0.05508968503187744, + "grad_norm": 1.7344708442687988, + "learning_rate": 4.962660156742931e-05, + "loss": 5.7999, + "step": 9263 + }, + { + "epoch": 0.05509563231515843, + "grad_norm": 1.7719827890396118, + "learning_rate": 4.9626521134252704e-05, + "loss": 5.7882, + "step": 9264 + }, + { + "epoch": 0.055101579598439435, + "grad_norm": 1.4955536127090454, + "learning_rate": 4.9626440692479236e-05, + "loss": 5.639, + "step": 9265 + }, + { + "epoch": 0.05510752688172043, + "grad_norm": 2.0087990760803223, + "learning_rate": 4.9626360242108925e-05, + "loss": 5.841, + "step": 9266 + }, + { + "epoch": 0.055113474165001425, + "grad_norm": 1.7334564924240112, + "learning_rate": 4.962627978314181e-05, + "loss": 5.4267, + "step": 9267 + }, + { + "epoch": 0.05511942144828243, + "grad_norm": 2.1204535961151123, + "learning_rate": 4.962619931557792e-05, + "loss": 5.4451, + "step": 9268 + }, + { + "epoch": 0.05512536873156342, + "grad_norm": 2.2374279499053955, + "learning_rate": 4.962611883941727e-05, + "loss": 5.5095, + "step": 9269 + }, + { + "epoch": 0.05513131601484442, + "grad_norm": 1.735070824623108, + "learning_rate": 4.9626038354659904e-05, + "loss": 5.3609, + "step": 9270 + }, + { + "epoch": 0.05513726329812542, + "grad_norm": 1.9748501777648926, + "learning_rate": 4.9625957861305837e-05, + "loss": 5.3366, + "step": 9271 + }, + { + "epoch": 0.055143210581406414, + "grad_norm": 1.8736618757247925, + "learning_rate": 4.96258773593551e-05, + "loss": 5.4706, + "step": 9272 + }, + { + "epoch": 0.05514915786468741, + "grad_norm": 2.571755886077881, + "learning_rate": 4.9625796848807736e-05, + "loss": 5.0393, + "step": 9273 + }, + { + "epoch": 0.05515510514796841, + "grad_norm": 2.1467013359069824, + "learning_rate": 4.962571632966375e-05, + "loss": 5.5798, + "step": 9274 + }, + { + "epoch": 0.055161052431249406, + "grad_norm": 2.4553916454315186, + "learning_rate": 4.962563580192319e-05, + "loss": 5.4323, + "step": 9275 + }, + { + "epoch": 0.0551669997145304, + "grad_norm": 2.4478797912597656, + "learning_rate": 4.962555526558607e-05, + "loss": 5.2591, + "step": 9276 + }, + { + "epoch": 0.055172946997811396, + "grad_norm": 2.2164270877838135, + "learning_rate": 4.9625474720652416e-05, + "loss": 5.3404, + "step": 9277 + }, + { + "epoch": 0.0551788942810924, + "grad_norm": 1.9161698818206787, + "learning_rate": 4.962539416712227e-05, + "loss": 5.2591, + "step": 9278 + }, + { + "epoch": 0.055184841564373394, + "grad_norm": 2.348734140396118, + "learning_rate": 4.962531360499565e-05, + "loss": 5.8132, + "step": 9279 + }, + { + "epoch": 0.05519078884765439, + "grad_norm": 2.400090456008911, + "learning_rate": 4.962523303427259e-05, + "loss": 5.7786, + "step": 9280 + }, + { + "epoch": 0.05519673613093539, + "grad_norm": 2.1626594066619873, + "learning_rate": 4.9625152454953115e-05, + "loss": 5.8488, + "step": 9281 + }, + { + "epoch": 0.055202683414216386, + "grad_norm": 1.7470853328704834, + "learning_rate": 4.962507186703725e-05, + "loss": 5.72, + "step": 9282 + }, + { + "epoch": 0.05520863069749738, + "grad_norm": 1.9191921949386597, + "learning_rate": 4.962499127052503e-05, + "loss": 5.6321, + "step": 9283 + }, + { + "epoch": 0.05521457798077838, + "grad_norm": 2.1550769805908203, + "learning_rate": 4.962491066541649e-05, + "loss": 5.4521, + "step": 9284 + }, + { + "epoch": 0.05522052526405938, + "grad_norm": 2.0529074668884277, + "learning_rate": 4.9624830051711634e-05, + "loss": 5.4108, + "step": 9285 + }, + { + "epoch": 0.05522647254734037, + "grad_norm": 1.7673834562301636, + "learning_rate": 4.962474942941051e-05, + "loss": 5.5955, + "step": 9286 + }, + { + "epoch": 0.055232419830621375, + "grad_norm": 1.9575849771499634, + "learning_rate": 4.9624668798513143e-05, + "loss": 5.6295, + "step": 9287 + }, + { + "epoch": 0.05523836711390237, + "grad_norm": 1.8054029941558838, + "learning_rate": 4.9624588159019546e-05, + "loss": 5.3372, + "step": 9288 + }, + { + "epoch": 0.055244314397183365, + "grad_norm": 1.8002424240112305, + "learning_rate": 4.962450751092978e-05, + "loss": 5.4404, + "step": 9289 + }, + { + "epoch": 0.05525026168046437, + "grad_norm": 2.052530527114868, + "learning_rate": 4.962442685424383e-05, + "loss": 5.4921, + "step": 9290 + }, + { + "epoch": 0.05525620896374536, + "grad_norm": 1.8559443950653076, + "learning_rate": 4.962434618896176e-05, + "loss": 5.5776, + "step": 9291 + }, + { + "epoch": 0.05526215624702636, + "grad_norm": 1.8794355392456055, + "learning_rate": 4.962426551508359e-05, + "loss": 5.5818, + "step": 9292 + }, + { + "epoch": 0.05526810353030735, + "grad_norm": 1.8995412588119507, + "learning_rate": 4.962418483260933e-05, + "loss": 5.6274, + "step": 9293 + }, + { + "epoch": 0.055274050813588355, + "grad_norm": 1.8608371019363403, + "learning_rate": 4.962410414153903e-05, + "loss": 5.4655, + "step": 9294 + }, + { + "epoch": 0.05527999809686935, + "grad_norm": 2.0378072261810303, + "learning_rate": 4.9624023441872715e-05, + "loss": 5.5579, + "step": 9295 + }, + { + "epoch": 0.055285945380150345, + "grad_norm": 2.2037017345428467, + "learning_rate": 4.9623942733610397e-05, + "loss": 5.6663, + "step": 9296 + }, + { + "epoch": 0.05529189266343135, + "grad_norm": 2.4487335681915283, + "learning_rate": 4.962386201675212e-05, + "loss": 5.6792, + "step": 9297 + }, + { + "epoch": 0.05529783994671234, + "grad_norm": 2.0460383892059326, + "learning_rate": 4.96237812912979e-05, + "loss": 5.917, + "step": 9298 + }, + { + "epoch": 0.05530378722999334, + "grad_norm": 2.4838030338287354, + "learning_rate": 4.962370055724778e-05, + "loss": 5.1067, + "step": 9299 + }, + { + "epoch": 0.05530973451327434, + "grad_norm": 1.9340513944625854, + "learning_rate": 4.962361981460178e-05, + "loss": 5.2529, + "step": 9300 + }, + { + "epoch": 0.055315681796555334, + "grad_norm": 2.201068878173828, + "learning_rate": 4.9623539063359925e-05, + "loss": 5.6055, + "step": 9301 + }, + { + "epoch": 0.05532162907983633, + "grad_norm": 2.0552330017089844, + "learning_rate": 4.962345830352225e-05, + "loss": 5.3531, + "step": 9302 + }, + { + "epoch": 0.05532757636311733, + "grad_norm": 2.611407995223999, + "learning_rate": 4.9623377535088785e-05, + "loss": 5.5829, + "step": 9303 + }, + { + "epoch": 0.055333523646398326, + "grad_norm": 2.2239346504211426, + "learning_rate": 4.962329675805955e-05, + "loss": 5.3558, + "step": 9304 + }, + { + "epoch": 0.05533947092967932, + "grad_norm": 2.3899872303009033, + "learning_rate": 4.9623215972434566e-05, + "loss": 5.7277, + "step": 9305 + }, + { + "epoch": 0.055345418212960316, + "grad_norm": 2.8471267223358154, + "learning_rate": 4.962313517821389e-05, + "loss": 6.1046, + "step": 9306 + }, + { + "epoch": 0.05535136549624132, + "grad_norm": 2.426400661468506, + "learning_rate": 4.962305437539752e-05, + "loss": 5.8942, + "step": 9307 + }, + { + "epoch": 0.055357312779522314, + "grad_norm": 2.3548812866210938, + "learning_rate": 4.962297356398549e-05, + "loss": 6.0552, + "step": 9308 + }, + { + "epoch": 0.05536326006280331, + "grad_norm": 1.8423515558242798, + "learning_rate": 4.9622892743977844e-05, + "loss": 5.9377, + "step": 9309 + }, + { + "epoch": 0.05536920734608431, + "grad_norm": 2.1509203910827637, + "learning_rate": 4.96228119153746e-05, + "loss": 5.7195, + "step": 9310 + }, + { + "epoch": 0.055375154629365306, + "grad_norm": 2.3096275329589844, + "learning_rate": 4.962273107817579e-05, + "loss": 5.3461, + "step": 9311 + }, + { + "epoch": 0.0553811019126463, + "grad_norm": 1.980205774307251, + "learning_rate": 4.962265023238143e-05, + "loss": 5.8851, + "step": 9312 + }, + { + "epoch": 0.0553870491959273, + "grad_norm": 1.8162591457366943, + "learning_rate": 4.962256937799156e-05, + "loss": 5.7092, + "step": 9313 + }, + { + "epoch": 0.0553929964792083, + "grad_norm": 1.873853087425232, + "learning_rate": 4.962248851500621e-05, + "loss": 5.8939, + "step": 9314 + }, + { + "epoch": 0.05539894376248929, + "grad_norm": 1.8039345741271973, + "learning_rate": 4.96224076434254e-05, + "loss": 5.9289, + "step": 9315 + }, + { + "epoch": 0.055404891045770295, + "grad_norm": 2.3106470108032227, + "learning_rate": 4.962232676324916e-05, + "loss": 5.9103, + "step": 9316 + }, + { + "epoch": 0.05541083832905129, + "grad_norm": 2.2209455966949463, + "learning_rate": 4.962224587447752e-05, + "loss": 6.0053, + "step": 9317 + }, + { + "epoch": 0.055416785612332285, + "grad_norm": 2.0624780654907227, + "learning_rate": 4.962216497711052e-05, + "loss": 5.9258, + "step": 9318 + }, + { + "epoch": 0.05542273289561329, + "grad_norm": 2.371662139892578, + "learning_rate": 4.962208407114817e-05, + "loss": 6.4127, + "step": 9319 + }, + { + "epoch": 0.05542868017889428, + "grad_norm": 2.7035610675811768, + "learning_rate": 4.96220031565905e-05, + "loss": 5.9742, + "step": 9320 + }, + { + "epoch": 0.05543462746217528, + "grad_norm": 2.060577392578125, + "learning_rate": 4.9621922233437544e-05, + "loss": 5.9729, + "step": 9321 + }, + { + "epoch": 0.05544057474545627, + "grad_norm": 1.7935984134674072, + "learning_rate": 4.962184130168933e-05, + "loss": 5.4077, + "step": 9322 + }, + { + "epoch": 0.055446522028737275, + "grad_norm": 1.8716622591018677, + "learning_rate": 4.9621760361345885e-05, + "loss": 5.4554, + "step": 9323 + }, + { + "epoch": 0.05545246931201827, + "grad_norm": 1.9150923490524292, + "learning_rate": 4.962167941240724e-05, + "loss": 5.8121, + "step": 9324 + }, + { + "epoch": 0.055458416595299265, + "grad_norm": 1.9207059144973755, + "learning_rate": 4.962159845487342e-05, + "loss": 5.8593, + "step": 9325 + }, + { + "epoch": 0.05546436387858027, + "grad_norm": 1.962039589881897, + "learning_rate": 4.9621517488744454e-05, + "loss": 6.0174, + "step": 9326 + }, + { + "epoch": 0.05547031116186126, + "grad_norm": 2.0445704460144043, + "learning_rate": 4.9621436514020376e-05, + "loss": 5.5782, + "step": 9327 + }, + { + "epoch": 0.05547625844514226, + "grad_norm": 2.0861823558807373, + "learning_rate": 4.9621355530701204e-05, + "loss": 5.6102, + "step": 9328 + }, + { + "epoch": 0.05548220572842326, + "grad_norm": 2.0184309482574463, + "learning_rate": 4.962127453878697e-05, + "loss": 5.8072, + "step": 9329 + }, + { + "epoch": 0.055488153011704254, + "grad_norm": 1.899994134902954, + "learning_rate": 4.962119353827771e-05, + "loss": 5.7361, + "step": 9330 + }, + { + "epoch": 0.05549410029498525, + "grad_norm": 1.8874105215072632, + "learning_rate": 4.962111252917344e-05, + "loss": 5.7988, + "step": 9331 + }, + { + "epoch": 0.05550004757826625, + "grad_norm": 2.046682119369507, + "learning_rate": 4.9621031511474194e-05, + "loss": 5.7037, + "step": 9332 + }, + { + "epoch": 0.055505994861547246, + "grad_norm": 2.2552926540374756, + "learning_rate": 4.962095048517999e-05, + "loss": 5.7556, + "step": 9333 + }, + { + "epoch": 0.05551194214482824, + "grad_norm": 2.1904358863830566, + "learning_rate": 4.962086945029089e-05, + "loss": 5.6529, + "step": 9334 + }, + { + "epoch": 0.055517889428109236, + "grad_norm": 2.03745698928833, + "learning_rate": 4.9620788406806883e-05, + "loss": 5.8504, + "step": 9335 + }, + { + "epoch": 0.05552383671139024, + "grad_norm": 1.81668221950531, + "learning_rate": 4.9620707354728017e-05, + "loss": 5.3275, + "step": 9336 + }, + { + "epoch": 0.055529783994671233, + "grad_norm": 2.570976734161377, + "learning_rate": 4.962062629405432e-05, + "loss": 5.666, + "step": 9337 + }, + { + "epoch": 0.05553573127795223, + "grad_norm": 2.6855766773223877, + "learning_rate": 4.962054522478581e-05, + "loss": 5.7798, + "step": 9338 + }, + { + "epoch": 0.05554167856123323, + "grad_norm": 2.329690933227539, + "learning_rate": 4.962046414692252e-05, + "loss": 5.9334, + "step": 9339 + }, + { + "epoch": 0.055547625844514226, + "grad_norm": 1.6809495687484741, + "learning_rate": 4.962038306046449e-05, + "loss": 5.8506, + "step": 9340 + }, + { + "epoch": 0.05555357312779522, + "grad_norm": 1.7170113325119019, + "learning_rate": 4.962030196541173e-05, + "loss": 6.0863, + "step": 9341 + }, + { + "epoch": 0.05555952041107622, + "grad_norm": 2.247680902481079, + "learning_rate": 4.962022086176428e-05, + "loss": 5.2188, + "step": 9342 + }, + { + "epoch": 0.05556546769435722, + "grad_norm": 2.680091381072998, + "learning_rate": 4.9620139749522165e-05, + "loss": 4.8506, + "step": 9343 + }, + { + "epoch": 0.05557141497763821, + "grad_norm": 2.1886465549468994, + "learning_rate": 4.962005862868542e-05, + "loss": 5.5164, + "step": 9344 + }, + { + "epoch": 0.055577362260919215, + "grad_norm": 2.061368227005005, + "learning_rate": 4.961997749925405e-05, + "loss": 5.4491, + "step": 9345 + }, + { + "epoch": 0.05558330954420021, + "grad_norm": 2.368156909942627, + "learning_rate": 4.961989636122812e-05, + "loss": 5.9053, + "step": 9346 + }, + { + "epoch": 0.055589256827481205, + "grad_norm": 2.562565803527832, + "learning_rate": 4.961981521460763e-05, + "loss": 5.7683, + "step": 9347 + }, + { + "epoch": 0.05559520411076221, + "grad_norm": 2.388779640197754, + "learning_rate": 4.961973405939262e-05, + "loss": 5.1235, + "step": 9348 + }, + { + "epoch": 0.0556011513940432, + "grad_norm": 2.546994686126709, + "learning_rate": 4.9619652895583104e-05, + "loss": 4.7793, + "step": 9349 + }, + { + "epoch": 0.0556070986773242, + "grad_norm": 2.379549026489258, + "learning_rate": 4.9619571723179135e-05, + "loss": 4.8949, + "step": 9350 + }, + { + "epoch": 0.05561304596060519, + "grad_norm": 2.1621344089508057, + "learning_rate": 4.961949054218072e-05, + "loss": 4.6824, + "step": 9351 + }, + { + "epoch": 0.055618993243886194, + "grad_norm": 2.136289119720459, + "learning_rate": 4.96194093525879e-05, + "loss": 4.834, + "step": 9352 + }, + { + "epoch": 0.05562494052716719, + "grad_norm": 2.3572680950164795, + "learning_rate": 4.9619328154400694e-05, + "loss": 4.9755, + "step": 9353 + }, + { + "epoch": 0.055630887810448185, + "grad_norm": 2.2439966201782227, + "learning_rate": 4.961924694761913e-05, + "loss": 5.7662, + "step": 9354 + }, + { + "epoch": 0.05563683509372919, + "grad_norm": 2.287597894668579, + "learning_rate": 4.961916573224326e-05, + "loss": 4.6108, + "step": 9355 + }, + { + "epoch": 0.05564278237701018, + "grad_norm": 2.1382369995117188, + "learning_rate": 4.961908450827308e-05, + "loss": 4.5993, + "step": 9356 + }, + { + "epoch": 0.05564872966029118, + "grad_norm": 2.112348794937134, + "learning_rate": 4.961900327570863e-05, + "loss": 4.6798, + "step": 9357 + }, + { + "epoch": 0.05565467694357218, + "grad_norm": 2.0453972816467285, + "learning_rate": 4.9618922034549946e-05, + "loss": 4.5424, + "step": 9358 + }, + { + "epoch": 0.055660624226853174, + "grad_norm": 2.0547754764556885, + "learning_rate": 4.961884078479705e-05, + "loss": 5.0661, + "step": 9359 + }, + { + "epoch": 0.05566657151013417, + "grad_norm": 2.5003650188446045, + "learning_rate": 4.9618759526449965e-05, + "loss": 5.3388, + "step": 9360 + }, + { + "epoch": 0.05567251879341517, + "grad_norm": 2.0582423210144043, + "learning_rate": 4.9618678259508736e-05, + "loss": 5.8437, + "step": 9361 + }, + { + "epoch": 0.055678466076696166, + "grad_norm": 1.7867279052734375, + "learning_rate": 4.9618596983973376e-05, + "loss": 5.369, + "step": 9362 + }, + { + "epoch": 0.05568441335997716, + "grad_norm": 2.03729248046875, + "learning_rate": 4.961851569984392e-05, + "loss": 5.9932, + "step": 9363 + }, + { + "epoch": 0.055690360643258156, + "grad_norm": 2.2527456283569336, + "learning_rate": 4.961843440712038e-05, + "loss": 5.893, + "step": 9364 + }, + { + "epoch": 0.05569630792653916, + "grad_norm": 2.0027201175689697, + "learning_rate": 4.9618353105802815e-05, + "loss": 5.8216, + "step": 9365 + }, + { + "epoch": 0.05570225520982015, + "grad_norm": 2.236548662185669, + "learning_rate": 4.961827179589124e-05, + "loss": 5.5371, + "step": 9366 + }, + { + "epoch": 0.05570820249310115, + "grad_norm": 2.4477334022521973, + "learning_rate": 4.9618190477385666e-05, + "loss": 5.6552, + "step": 9367 + }, + { + "epoch": 0.05571414977638215, + "grad_norm": 2.504549026489258, + "learning_rate": 4.9618109150286145e-05, + "loss": 5.5732, + "step": 9368 + }, + { + "epoch": 0.055720097059663146, + "grad_norm": 2.1413187980651855, + "learning_rate": 4.9618027814592695e-05, + "loss": 5.1792, + "step": 9369 + }, + { + "epoch": 0.05572604434294414, + "grad_norm": 2.1714866161346436, + "learning_rate": 4.9617946470305344e-05, + "loss": 5.3444, + "step": 9370 + }, + { + "epoch": 0.05573199162622514, + "grad_norm": 1.7478383779525757, + "learning_rate": 4.9617865117424126e-05, + "loss": 5.7151, + "step": 9371 + }, + { + "epoch": 0.05573793890950614, + "grad_norm": 2.0415220260620117, + "learning_rate": 4.9617783755949067e-05, + "loss": 5.8765, + "step": 9372 + }, + { + "epoch": 0.05574388619278713, + "grad_norm": 1.917108416557312, + "learning_rate": 4.961770238588019e-05, + "loss": 6.0797, + "step": 9373 + }, + { + "epoch": 0.055749833476068135, + "grad_norm": 1.9404850006103516, + "learning_rate": 4.961762100721753e-05, + "loss": 6.1376, + "step": 9374 + }, + { + "epoch": 0.05575578075934913, + "grad_norm": 1.7101916074752808, + "learning_rate": 4.9617539619961104e-05, + "loss": 5.9375, + "step": 9375 + }, + { + "epoch": 0.055761728042630125, + "grad_norm": 2.591960906982422, + "learning_rate": 4.9617458224110954e-05, + "loss": 5.3716, + "step": 9376 + }, + { + "epoch": 0.05576767532591113, + "grad_norm": 2.070600986480713, + "learning_rate": 4.961737681966711e-05, + "loss": 5.3822, + "step": 9377 + }, + { + "epoch": 0.05577362260919212, + "grad_norm": 2.100820302963257, + "learning_rate": 4.9617295406629594e-05, + "loss": 5.7703, + "step": 9378 + }, + { + "epoch": 0.05577956989247312, + "grad_norm": 2.2413878440856934, + "learning_rate": 4.961721398499843e-05, + "loss": 4.9197, + "step": 9379 + }, + { + "epoch": 0.05578551717575411, + "grad_norm": 1.9762401580810547, + "learning_rate": 4.961713255477365e-05, + "loss": 5.6705, + "step": 9380 + }, + { + "epoch": 0.055791464459035114, + "grad_norm": 2.22676420211792, + "learning_rate": 4.961705111595528e-05, + "loss": 5.0196, + "step": 9381 + }, + { + "epoch": 0.05579741174231611, + "grad_norm": 2.0652241706848145, + "learning_rate": 4.9616969668543364e-05, + "loss": 5.3894, + "step": 9382 + }, + { + "epoch": 0.055803359025597105, + "grad_norm": 2.156890630722046, + "learning_rate": 4.96168882125379e-05, + "loss": 5.3063, + "step": 9383 + }, + { + "epoch": 0.05580930630887811, + "grad_norm": 2.131964683532715, + "learning_rate": 4.961680674793895e-05, + "loss": 5.9304, + "step": 9384 + }, + { + "epoch": 0.0558152535921591, + "grad_norm": 2.2117621898651123, + "learning_rate": 4.9616725274746525e-05, + "loss": 5.9553, + "step": 9385 + }, + { + "epoch": 0.0558212008754401, + "grad_norm": 2.3511440753936768, + "learning_rate": 4.9616643792960654e-05, + "loss": 5.9911, + "step": 9386 + }, + { + "epoch": 0.0558271481587211, + "grad_norm": 1.7709077596664429, + "learning_rate": 4.961656230258136e-05, + "loss": 5.6291, + "step": 9387 + }, + { + "epoch": 0.055833095442002094, + "grad_norm": 1.838767170906067, + "learning_rate": 4.961648080360869e-05, + "loss": 6.0152, + "step": 9388 + }, + { + "epoch": 0.05583904272528309, + "grad_norm": 2.117058515548706, + "learning_rate": 4.9616399296042656e-05, + "loss": 4.8079, + "step": 9389 + }, + { + "epoch": 0.05584499000856409, + "grad_norm": 2.147491693496704, + "learning_rate": 4.9616317779883293e-05, + "loss": 4.6489, + "step": 9390 + }, + { + "epoch": 0.055850937291845086, + "grad_norm": 2.1025705337524414, + "learning_rate": 4.961623625513062e-05, + "loss": 4.4984, + "step": 9391 + }, + { + "epoch": 0.05585688457512608, + "grad_norm": 1.799986720085144, + "learning_rate": 4.961615472178468e-05, + "loss": 5.1008, + "step": 9392 + }, + { + "epoch": 0.055862831858407076, + "grad_norm": 2.2975053787231445, + "learning_rate": 4.961607317984549e-05, + "loss": 5.9754, + "step": 9393 + }, + { + "epoch": 0.05586877914168808, + "grad_norm": 1.9996155500411987, + "learning_rate": 4.961599162931309e-05, + "loss": 5.9255, + "step": 9394 + }, + { + "epoch": 0.05587472642496907, + "grad_norm": 1.7344794273376465, + "learning_rate": 4.9615910070187496e-05, + "loss": 6.0873, + "step": 9395 + }, + { + "epoch": 0.05588067370825007, + "grad_norm": 2.260706901550293, + "learning_rate": 4.961582850246875e-05, + "loss": 5.9454, + "step": 9396 + }, + { + "epoch": 0.05588662099153107, + "grad_norm": 2.1810765266418457, + "learning_rate": 4.961574692615686e-05, + "loss": 5.7548, + "step": 9397 + }, + { + "epoch": 0.055892568274812066, + "grad_norm": 2.0940003395080566, + "learning_rate": 4.961566534125188e-05, + "loss": 5.8184, + "step": 9398 + }, + { + "epoch": 0.05589851555809306, + "grad_norm": 2.066464900970459, + "learning_rate": 4.961558374775382e-05, + "loss": 5.7867, + "step": 9399 + }, + { + "epoch": 0.05590446284137406, + "grad_norm": 1.7197705507278442, + "learning_rate": 4.961550214566271e-05, + "loss": 5.9211, + "step": 9400 + }, + { + "epoch": 0.05591041012465506, + "grad_norm": 2.3055293560028076, + "learning_rate": 4.9615420534978583e-05, + "loss": 5.9531, + "step": 9401 + }, + { + "epoch": 0.05591635740793605, + "grad_norm": 2.0974669456481934, + "learning_rate": 4.961533891570147e-05, + "loss": 5.9347, + "step": 9402 + }, + { + "epoch": 0.055922304691217055, + "grad_norm": 2.5196354389190674, + "learning_rate": 4.96152572878314e-05, + "loss": 5.0729, + "step": 9403 + }, + { + "epoch": 0.05592825197449805, + "grad_norm": 2.157181978225708, + "learning_rate": 4.9615175651368395e-05, + "loss": 5.9513, + "step": 9404 + }, + { + "epoch": 0.055934199257779045, + "grad_norm": 1.94083833694458, + "learning_rate": 4.9615094006312485e-05, + "loss": 5.9239, + "step": 9405 + }, + { + "epoch": 0.05594014654106005, + "grad_norm": 2.2118191719055176, + "learning_rate": 4.9615012352663704e-05, + "loss": 5.6936, + "step": 9406 + }, + { + "epoch": 0.05594609382434104, + "grad_norm": 2.2255051136016846, + "learning_rate": 4.9614930690422065e-05, + "loss": 5.7475, + "step": 9407 + }, + { + "epoch": 0.05595204110762204, + "grad_norm": 2.1640844345092773, + "learning_rate": 4.961484901958762e-05, + "loss": 5.8138, + "step": 9408 + }, + { + "epoch": 0.05595798839090303, + "grad_norm": 2.2722928524017334, + "learning_rate": 4.961476734016038e-05, + "loss": 5.5784, + "step": 9409 + }, + { + "epoch": 0.055963935674184034, + "grad_norm": 2.0541749000549316, + "learning_rate": 4.961468565214039e-05, + "loss": 5.6871, + "step": 9410 + }, + { + "epoch": 0.05596988295746503, + "grad_norm": 2.3496010303497314, + "learning_rate": 4.9614603955527655e-05, + "loss": 5.4195, + "step": 9411 + }, + { + "epoch": 0.055975830240746025, + "grad_norm": 2.333435297012329, + "learning_rate": 4.9614522250322215e-05, + "loss": 5.4257, + "step": 9412 + }, + { + "epoch": 0.05598177752402703, + "grad_norm": 2.339057445526123, + "learning_rate": 4.9614440536524106e-05, + "loss": 5.4158, + "step": 9413 + }, + { + "epoch": 0.05598772480730802, + "grad_norm": 2.4383058547973633, + "learning_rate": 4.961435881413335e-05, + "loss": 5.4569, + "step": 9414 + }, + { + "epoch": 0.05599367209058902, + "grad_norm": 2.1405389308929443, + "learning_rate": 4.961427708314997e-05, + "loss": 5.6178, + "step": 9415 + }, + { + "epoch": 0.05599961937387002, + "grad_norm": 2.2082836627960205, + "learning_rate": 4.961419534357401e-05, + "loss": 5.386, + "step": 9416 + }, + { + "epoch": 0.056005566657151014, + "grad_norm": 2.0305027961730957, + "learning_rate": 4.961411359540548e-05, + "loss": 5.2822, + "step": 9417 + }, + { + "epoch": 0.05601151394043201, + "grad_norm": 2.606452226638794, + "learning_rate": 4.961403183864442e-05, + "loss": 5.2691, + "step": 9418 + }, + { + "epoch": 0.05601746122371301, + "grad_norm": 2.3506669998168945, + "learning_rate": 4.961395007329086e-05, + "loss": 5.3307, + "step": 9419 + }, + { + "epoch": 0.056023408506994006, + "grad_norm": 2.3472225666046143, + "learning_rate": 4.961386829934482e-05, + "loss": 5.2247, + "step": 9420 + }, + { + "epoch": 0.056029355790275, + "grad_norm": 2.1121721267700195, + "learning_rate": 4.961378651680633e-05, + "loss": 5.2857, + "step": 9421 + }, + { + "epoch": 0.056035303073555996, + "grad_norm": 2.4357142448425293, + "learning_rate": 4.9613704725675427e-05, + "loss": 5.3398, + "step": 9422 + }, + { + "epoch": 0.056041250356837, + "grad_norm": 2.639418125152588, + "learning_rate": 4.961362292595213e-05, + "loss": 5.3008, + "step": 9423 + }, + { + "epoch": 0.05604719764011799, + "grad_norm": 3.297189712524414, + "learning_rate": 4.961354111763647e-05, + "loss": 5.5908, + "step": 9424 + }, + { + "epoch": 0.05605314492339899, + "grad_norm": 2.095613718032837, + "learning_rate": 4.961345930072848e-05, + "loss": 5.2389, + "step": 9425 + }, + { + "epoch": 0.05605909220667999, + "grad_norm": 2.2495081424713135, + "learning_rate": 4.9613377475228186e-05, + "loss": 5.474, + "step": 9426 + }, + { + "epoch": 0.056065039489960986, + "grad_norm": 2.282697916030884, + "learning_rate": 4.961329564113562e-05, + "loss": 5.3253, + "step": 9427 + }, + { + "epoch": 0.05607098677324198, + "grad_norm": 2.515075206756592, + "learning_rate": 4.96132137984508e-05, + "loss": 5.238, + "step": 9428 + }, + { + "epoch": 0.05607693405652298, + "grad_norm": 2.072274684906006, + "learning_rate": 4.961313194717376e-05, + "loss": 5.3627, + "step": 9429 + }, + { + "epoch": 0.05608288133980398, + "grad_norm": 2.4552547931671143, + "learning_rate": 4.961305008730454e-05, + "loss": 6.1799, + "step": 9430 + }, + { + "epoch": 0.05608882862308497, + "grad_norm": 2.2289538383483887, + "learning_rate": 4.9612968218843146e-05, + "loss": 5.5477, + "step": 9431 + }, + { + "epoch": 0.056094775906365975, + "grad_norm": 2.6174185276031494, + "learning_rate": 4.9612886341789635e-05, + "loss": 5.1779, + "step": 9432 + }, + { + "epoch": 0.05610072318964697, + "grad_norm": 2.4489150047302246, + "learning_rate": 4.9612804456144005e-05, + "loss": 5.2067, + "step": 9433 + }, + { + "epoch": 0.056106670472927965, + "grad_norm": 2.2651829719543457, + "learning_rate": 4.96127225619063e-05, + "loss": 5.3582, + "step": 9434 + }, + { + "epoch": 0.05611261775620897, + "grad_norm": 2.1985251903533936, + "learning_rate": 4.9612640659076556e-05, + "loss": 5.2034, + "step": 9435 + }, + { + "epoch": 0.05611856503948996, + "grad_norm": 1.9510128498077393, + "learning_rate": 4.961255874765479e-05, + "loss": 5.2263, + "step": 9436 + }, + { + "epoch": 0.05612451232277096, + "grad_norm": 2.338815212249756, + "learning_rate": 4.961247682764104e-05, + "loss": 5.9091, + "step": 9437 + }, + { + "epoch": 0.05613045960605195, + "grad_norm": 2.097111225128174, + "learning_rate": 4.961239489903532e-05, + "loss": 6.3285, + "step": 9438 + }, + { + "epoch": 0.056136406889332954, + "grad_norm": 1.9965720176696777, + "learning_rate": 4.961231296183767e-05, + "loss": 6.3141, + "step": 9439 + }, + { + "epoch": 0.05614235417261395, + "grad_norm": 2.2406206130981445, + "learning_rate": 4.9612231016048114e-05, + "loss": 5.7335, + "step": 9440 + }, + { + "epoch": 0.056148301455894944, + "grad_norm": 2.2798993587493896, + "learning_rate": 4.961214906166668e-05, + "loss": 4.9959, + "step": 9441 + }, + { + "epoch": 0.056154248739175947, + "grad_norm": 2.482706069946289, + "learning_rate": 4.96120670986934e-05, + "loss": 5.295, + "step": 9442 + }, + { + "epoch": 0.05616019602245694, + "grad_norm": 2.398867607116699, + "learning_rate": 4.961198512712831e-05, + "loss": 4.9592, + "step": 9443 + }, + { + "epoch": 0.05616614330573794, + "grad_norm": 2.1979055404663086, + "learning_rate": 4.961190314697143e-05, + "loss": 5.1003, + "step": 9444 + }, + { + "epoch": 0.05617209058901894, + "grad_norm": 2.3249244689941406, + "learning_rate": 4.961182115822278e-05, + "loss": 5.1408, + "step": 9445 + }, + { + "epoch": 0.056178037872299934, + "grad_norm": 2.3679821491241455, + "learning_rate": 4.96117391608824e-05, + "loss": 5.4006, + "step": 9446 + }, + { + "epoch": 0.05618398515558093, + "grad_norm": 1.8706363439559937, + "learning_rate": 4.961165715495032e-05, + "loss": 6.1741, + "step": 9447 + }, + { + "epoch": 0.05618993243886193, + "grad_norm": 2.1825344562530518, + "learning_rate": 4.961157514042656e-05, + "loss": 6.0869, + "step": 9448 + }, + { + "epoch": 0.056195879722142926, + "grad_norm": 1.85076904296875, + "learning_rate": 4.961149311731116e-05, + "loss": 5.9252, + "step": 9449 + }, + { + "epoch": 0.05620182700542392, + "grad_norm": 1.9433631896972656, + "learning_rate": 4.961141108560413e-05, + "loss": 5.968, + "step": 9450 + }, + { + "epoch": 0.056207774288704916, + "grad_norm": 2.5718259811401367, + "learning_rate": 4.961132904530552e-05, + "loss": 5.4274, + "step": 9451 + }, + { + "epoch": 0.05621372157198592, + "grad_norm": 1.919552206993103, + "learning_rate": 4.961124699641535e-05, + "loss": 5.1943, + "step": 9452 + }, + { + "epoch": 0.05621966885526691, + "grad_norm": 2.1371817588806152, + "learning_rate": 4.961116493893364e-05, + "loss": 5.9949, + "step": 9453 + }, + { + "epoch": 0.05622561613854791, + "grad_norm": 2.5715489387512207, + "learning_rate": 4.961108287286044e-05, + "loss": 6.2061, + "step": 9454 + }, + { + "epoch": 0.05623156342182891, + "grad_norm": 2.1871471405029297, + "learning_rate": 4.961100079819575e-05, + "loss": 5.7872, + "step": 9455 + }, + { + "epoch": 0.056237510705109905, + "grad_norm": 2.011925220489502, + "learning_rate": 4.961091871493962e-05, + "loss": 5.7992, + "step": 9456 + }, + { + "epoch": 0.0562434579883909, + "grad_norm": 2.516580820083618, + "learning_rate": 4.9610836623092074e-05, + "loss": 5.9154, + "step": 9457 + }, + { + "epoch": 0.0562494052716719, + "grad_norm": 1.9336326122283936, + "learning_rate": 4.961075452265314e-05, + "loss": 5.7933, + "step": 9458 + }, + { + "epoch": 0.0562553525549529, + "grad_norm": 1.8404059410095215, + "learning_rate": 4.961067241362285e-05, + "loss": 6.1897, + "step": 9459 + }, + { + "epoch": 0.05626129983823389, + "grad_norm": 1.9757578372955322, + "learning_rate": 4.961059029600122e-05, + "loss": 6.0909, + "step": 9460 + }, + { + "epoch": 0.056267247121514895, + "grad_norm": 1.9767241477966309, + "learning_rate": 4.9610508169788294e-05, + "loss": 6.2212, + "step": 9461 + }, + { + "epoch": 0.05627319440479589, + "grad_norm": 1.9890403747558594, + "learning_rate": 4.961042603498409e-05, + "loss": 6.5071, + "step": 9462 + }, + { + "epoch": 0.056279141688076885, + "grad_norm": 1.9011937379837036, + "learning_rate": 4.961034389158864e-05, + "loss": 5.8098, + "step": 9463 + }, + { + "epoch": 0.05628508897135789, + "grad_norm": 2.236356735229492, + "learning_rate": 4.961026173960197e-05, + "loss": 4.8901, + "step": 9464 + }, + { + "epoch": 0.05629103625463888, + "grad_norm": 1.9147372245788574, + "learning_rate": 4.961017957902412e-05, + "loss": 5.1372, + "step": 9465 + }, + { + "epoch": 0.05629698353791988, + "grad_norm": 1.9628163576126099, + "learning_rate": 4.9610097409855106e-05, + "loss": 5.1161, + "step": 9466 + }, + { + "epoch": 0.05630293082120087, + "grad_norm": 2.0323991775512695, + "learning_rate": 4.961001523209496e-05, + "loss": 5.1493, + "step": 9467 + }, + { + "epoch": 0.056308878104481874, + "grad_norm": 1.7026360034942627, + "learning_rate": 4.9609933045743714e-05, + "loss": 5.2349, + "step": 9468 + }, + { + "epoch": 0.05631482538776287, + "grad_norm": 1.7758761644363403, + "learning_rate": 4.9609850850801394e-05, + "loss": 5.231, + "step": 9469 + }, + { + "epoch": 0.056320772671043864, + "grad_norm": 2.3305037021636963, + "learning_rate": 4.9609768647268026e-05, + "loss": 5.9209, + "step": 9470 + }, + { + "epoch": 0.056326719954324866, + "grad_norm": 2.2628681659698486, + "learning_rate": 4.960968643514365e-05, + "loss": 5.4753, + "step": 9471 + }, + { + "epoch": 0.05633266723760586, + "grad_norm": 2.4022347927093506, + "learning_rate": 4.9609604214428286e-05, + "loss": 4.8414, + "step": 9472 + }, + { + "epoch": 0.05633861452088686, + "grad_norm": 2.2767343521118164, + "learning_rate": 4.9609521985121955e-05, + "loss": 4.7178, + "step": 9473 + }, + { + "epoch": 0.05634456180416786, + "grad_norm": 2.547600507736206, + "learning_rate": 4.96094397472247e-05, + "loss": 4.7365, + "step": 9474 + }, + { + "epoch": 0.056350509087448854, + "grad_norm": 2.3546998500823975, + "learning_rate": 4.960935750073654e-05, + "loss": 5.4846, + "step": 9475 + }, + { + "epoch": 0.05635645637072985, + "grad_norm": 2.9641268253326416, + "learning_rate": 4.960927524565751e-05, + "loss": 5.7409, + "step": 9476 + }, + { + "epoch": 0.05636240365401085, + "grad_norm": 3.1727824211120605, + "learning_rate": 4.960919298198764e-05, + "loss": 5.8456, + "step": 9477 + }, + { + "epoch": 0.056368350937291846, + "grad_norm": 2.620507001876831, + "learning_rate": 4.960911070972695e-05, + "loss": 5.6295, + "step": 9478 + }, + { + "epoch": 0.05637429822057284, + "grad_norm": 2.6132571697235107, + "learning_rate": 4.960902842887548e-05, + "loss": 5.697, + "step": 9479 + }, + { + "epoch": 0.056380245503853836, + "grad_norm": 2.2931299209594727, + "learning_rate": 4.960894613943324e-05, + "loss": 5.4723, + "step": 9480 + }, + { + "epoch": 0.05638619278713484, + "grad_norm": 2.176729202270508, + "learning_rate": 4.9608863841400284e-05, + "loss": 5.7403, + "step": 9481 + }, + { + "epoch": 0.05639214007041583, + "grad_norm": 1.932180404663086, + "learning_rate": 4.9608781534776616e-05, + "loss": 5.9256, + "step": 9482 + }, + { + "epoch": 0.05639808735369683, + "grad_norm": 1.7315243482589722, + "learning_rate": 4.9608699219562286e-05, + "loss": 5.9176, + "step": 9483 + }, + { + "epoch": 0.05640403463697783, + "grad_norm": 1.6548408269882202, + "learning_rate": 4.9608616895757306e-05, + "loss": 5.7495, + "step": 9484 + }, + { + "epoch": 0.056409981920258825, + "grad_norm": 1.8549202680587769, + "learning_rate": 4.960853456336172e-05, + "loss": 5.5261, + "step": 9485 + }, + { + "epoch": 0.05641592920353982, + "grad_norm": 2.5990993976593018, + "learning_rate": 4.9608452222375544e-05, + "loss": 5.5934, + "step": 9486 + }, + { + "epoch": 0.05642187648682082, + "grad_norm": 1.705051302909851, + "learning_rate": 4.9608369872798815e-05, + "loss": 5.3613, + "step": 9487 + }, + { + "epoch": 0.05642782377010182, + "grad_norm": 1.6170406341552734, + "learning_rate": 4.960828751463156e-05, + "loss": 5.2743, + "step": 9488 + }, + { + "epoch": 0.05643377105338281, + "grad_norm": 1.6247482299804688, + "learning_rate": 4.9608205147873796e-05, + "loss": 5.2772, + "step": 9489 + }, + { + "epoch": 0.056439718336663815, + "grad_norm": 1.7574137449264526, + "learning_rate": 4.9608122772525575e-05, + "loss": 5.3464, + "step": 9490 + }, + { + "epoch": 0.05644566561994481, + "grad_norm": 1.8814537525177002, + "learning_rate": 4.960804038858691e-05, + "loss": 5.3092, + "step": 9491 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 2.0222842693328857, + "learning_rate": 4.9607957996057816e-05, + "loss": 4.8234, + "step": 9492 + }, + { + "epoch": 0.05645756018650681, + "grad_norm": 1.6224759817123413, + "learning_rate": 4.960787559493836e-05, + "loss": 5.3962, + "step": 9493 + }, + { + "epoch": 0.0564635074697878, + "grad_norm": 1.4097533226013184, + "learning_rate": 4.960779318522853e-05, + "loss": 5.8302, + "step": 9494 + }, + { + "epoch": 0.0564694547530688, + "grad_norm": 1.7296205759048462, + "learning_rate": 4.960771076692839e-05, + "loss": 5.5679, + "step": 9495 + }, + { + "epoch": 0.05647540203634979, + "grad_norm": 1.6300212144851685, + "learning_rate": 4.960762834003794e-05, + "loss": 5.4315, + "step": 9496 + }, + { + "epoch": 0.056481349319630794, + "grad_norm": 1.8587864637374878, + "learning_rate": 4.960754590455723e-05, + "loss": 5.5492, + "step": 9497 + }, + { + "epoch": 0.05648729660291179, + "grad_norm": 1.8136985301971436, + "learning_rate": 4.960746346048628e-05, + "loss": 5.6363, + "step": 9498 + }, + { + "epoch": 0.056493243886192784, + "grad_norm": 2.1277284622192383, + "learning_rate": 4.960738100782511e-05, + "loss": 5.593, + "step": 9499 + }, + { + "epoch": 0.056499191169473786, + "grad_norm": 2.0262863636016846, + "learning_rate": 4.960729854657377e-05, + "loss": 5.6396, + "step": 9500 + }, + { + "epoch": 0.05650513845275478, + "grad_norm": 1.7870309352874756, + "learning_rate": 4.9607216076732266e-05, + "loss": 5.6523, + "step": 9501 + }, + { + "epoch": 0.05651108573603578, + "grad_norm": 1.734782099723816, + "learning_rate": 4.9607133598300636e-05, + "loss": 5.5313, + "step": 9502 + }, + { + "epoch": 0.05651703301931678, + "grad_norm": 2.2485032081604004, + "learning_rate": 4.9607051111278914e-05, + "loss": 5.3814, + "step": 9503 + }, + { + "epoch": 0.056522980302597774, + "grad_norm": 1.5091774463653564, + "learning_rate": 4.9606968615667125e-05, + "loss": 5.5277, + "step": 9504 + }, + { + "epoch": 0.05652892758587877, + "grad_norm": 1.7117774486541748, + "learning_rate": 4.9606886111465303e-05, + "loss": 5.2649, + "step": 9505 + }, + { + "epoch": 0.05653487486915977, + "grad_norm": 1.7309353351593018, + "learning_rate": 4.960680359867346e-05, + "loss": 5.2276, + "step": 9506 + }, + { + "epoch": 0.056540822152440766, + "grad_norm": 1.7058963775634766, + "learning_rate": 4.960672107729164e-05, + "loss": 5.1848, + "step": 9507 + }, + { + "epoch": 0.05654676943572176, + "grad_norm": 1.7862296104431152, + "learning_rate": 4.960663854731987e-05, + "loss": 5.2424, + "step": 9508 + }, + { + "epoch": 0.05655271671900276, + "grad_norm": 1.8900794982910156, + "learning_rate": 4.960655600875818e-05, + "loss": 5.283, + "step": 9509 + }, + { + "epoch": 0.05655866400228376, + "grad_norm": 1.9991587400436401, + "learning_rate": 4.960647346160658e-05, + "loss": 5.3525, + "step": 9510 + }, + { + "epoch": 0.05656461128556475, + "grad_norm": 1.6889851093292236, + "learning_rate": 4.960639090586513e-05, + "loss": 5.0592, + "step": 9511 + }, + { + "epoch": 0.05657055856884575, + "grad_norm": 1.6314234733581543, + "learning_rate": 4.9606308341533844e-05, + "loss": 5.1733, + "step": 9512 + }, + { + "epoch": 0.05657650585212675, + "grad_norm": 1.7801847457885742, + "learning_rate": 4.960622576861275e-05, + "loss": 5.2358, + "step": 9513 + }, + { + "epoch": 0.056582453135407745, + "grad_norm": 1.6572017669677734, + "learning_rate": 4.9606143187101864e-05, + "loss": 5.2429, + "step": 9514 + }, + { + "epoch": 0.05658840041868874, + "grad_norm": 1.7574421167373657, + "learning_rate": 4.960606059700124e-05, + "loss": 5.0717, + "step": 9515 + }, + { + "epoch": 0.05659434770196974, + "grad_norm": 1.8162970542907715, + "learning_rate": 4.960597799831088e-05, + "loss": 5.1513, + "step": 9516 + }, + { + "epoch": 0.05660029498525074, + "grad_norm": 1.9231795072555542, + "learning_rate": 4.960589539103084e-05, + "loss": 5.1539, + "step": 9517 + }, + { + "epoch": 0.05660624226853173, + "grad_norm": 1.624566674232483, + "learning_rate": 4.9605812775161136e-05, + "loss": 5.0999, + "step": 9518 + }, + { + "epoch": 0.056612189551812735, + "grad_norm": 1.4293668270111084, + "learning_rate": 4.960573015070179e-05, + "loss": 5.2365, + "step": 9519 + }, + { + "epoch": 0.05661813683509373, + "grad_norm": 1.789515495300293, + "learning_rate": 4.960564751765284e-05, + "loss": 5.2233, + "step": 9520 + }, + { + "epoch": 0.056624084118374725, + "grad_norm": 1.7212306261062622, + "learning_rate": 4.960556487601432e-05, + "loss": 5.1902, + "step": 9521 + }, + { + "epoch": 0.05663003140165573, + "grad_norm": 1.7691519260406494, + "learning_rate": 4.960548222578625e-05, + "loss": 5.2136, + "step": 9522 + }, + { + "epoch": 0.05663597868493672, + "grad_norm": 1.5925794839859009, + "learning_rate": 4.960539956696866e-05, + "loss": 5.4808, + "step": 9523 + }, + { + "epoch": 0.05664192596821772, + "grad_norm": 1.7014095783233643, + "learning_rate": 4.960531689956157e-05, + "loss": 5.1934, + "step": 9524 + }, + { + "epoch": 0.05664787325149871, + "grad_norm": 1.3620802164077759, + "learning_rate": 4.960523422356502e-05, + "loss": 5.0169, + "step": 9525 + }, + { + "epoch": 0.056653820534779714, + "grad_norm": 1.4778205156326294, + "learning_rate": 4.960515153897904e-05, + "loss": 5.1535, + "step": 9526 + }, + { + "epoch": 0.05665976781806071, + "grad_norm": 1.6393300294876099, + "learning_rate": 4.960506884580366e-05, + "loss": 5.2494, + "step": 9527 + }, + { + "epoch": 0.056665715101341704, + "grad_norm": 1.6070711612701416, + "learning_rate": 4.96049861440389e-05, + "loss": 5.3117, + "step": 9528 + }, + { + "epoch": 0.056671662384622706, + "grad_norm": 1.6023461818695068, + "learning_rate": 4.96049034336848e-05, + "loss": 5.1554, + "step": 9529 + }, + { + "epoch": 0.0566776096679037, + "grad_norm": 1.6061514616012573, + "learning_rate": 4.9604820714741374e-05, + "loss": 5.4123, + "step": 9530 + }, + { + "epoch": 0.056683556951184697, + "grad_norm": 1.8043792247772217, + "learning_rate": 4.960473798720866e-05, + "loss": 5.2582, + "step": 9531 + }, + { + "epoch": 0.0566895042344657, + "grad_norm": 1.6002432107925415, + "learning_rate": 4.960465525108669e-05, + "loss": 5.211, + "step": 9532 + }, + { + "epoch": 0.056695451517746694, + "grad_norm": 1.851266622543335, + "learning_rate": 4.960457250637549e-05, + "loss": 5.0949, + "step": 9533 + }, + { + "epoch": 0.05670139880102769, + "grad_norm": 1.7806520462036133, + "learning_rate": 4.9604489753075085e-05, + "loss": 5.1178, + "step": 9534 + }, + { + "epoch": 0.05670734608430869, + "grad_norm": 1.9938620328903198, + "learning_rate": 4.9604406991185506e-05, + "loss": 5.098, + "step": 9535 + }, + { + "epoch": 0.056713293367589686, + "grad_norm": 1.7983622550964355, + "learning_rate": 4.960432422070679e-05, + "loss": 4.98, + "step": 9536 + }, + { + "epoch": 0.05671924065087068, + "grad_norm": 1.845821499824524, + "learning_rate": 4.960424144163895e-05, + "loss": 4.951, + "step": 9537 + }, + { + "epoch": 0.05672518793415168, + "grad_norm": 1.8922109603881836, + "learning_rate": 4.960415865398202e-05, + "loss": 5.0327, + "step": 9538 + }, + { + "epoch": 0.05673113521743268, + "grad_norm": 2.159832239151001, + "learning_rate": 4.960407585773604e-05, + "loss": 5.5287, + "step": 9539 + }, + { + "epoch": 0.05673708250071367, + "grad_norm": 1.9966739416122437, + "learning_rate": 4.960399305290103e-05, + "loss": 5.7114, + "step": 9540 + }, + { + "epoch": 0.05674302978399467, + "grad_norm": 1.8796072006225586, + "learning_rate": 4.9603910239477026e-05, + "loss": 5.4673, + "step": 9541 + }, + { + "epoch": 0.05674897706727567, + "grad_norm": 1.6589174270629883, + "learning_rate": 4.9603827417464045e-05, + "loss": 5.3755, + "step": 9542 + }, + { + "epoch": 0.056754924350556665, + "grad_norm": 1.975807547569275, + "learning_rate": 4.960374458686212e-05, + "loss": 5.0648, + "step": 9543 + }, + { + "epoch": 0.05676087163383766, + "grad_norm": 1.7437241077423096, + "learning_rate": 4.960366174767128e-05, + "loss": 5.2338, + "step": 9544 + }, + { + "epoch": 0.05676681891711866, + "grad_norm": 1.8508884906768799, + "learning_rate": 4.9603578899891564e-05, + "loss": 5.3432, + "step": 9545 + }, + { + "epoch": 0.05677276620039966, + "grad_norm": 2.2117562294006348, + "learning_rate": 4.960349604352299e-05, + "loss": 5.0623, + "step": 9546 + }, + { + "epoch": 0.05677871348368065, + "grad_norm": 1.7681034803390503, + "learning_rate": 4.9603413178565586e-05, + "loss": 5.1998, + "step": 9547 + }, + { + "epoch": 0.056784660766961655, + "grad_norm": 2.4477179050445557, + "learning_rate": 4.960333030501939e-05, + "loss": 5.3317, + "step": 9548 + }, + { + "epoch": 0.05679060805024265, + "grad_norm": 1.8297652006149292, + "learning_rate": 4.9603247422884426e-05, + "loss": 5.3608, + "step": 9549 + }, + { + "epoch": 0.056796555333523645, + "grad_norm": 1.8361153602600098, + "learning_rate": 4.9603164532160715e-05, + "loss": 5.3914, + "step": 9550 + }, + { + "epoch": 0.05680250261680465, + "grad_norm": 1.748226523399353, + "learning_rate": 4.96030816328483e-05, + "loss": 5.3436, + "step": 9551 + }, + { + "epoch": 0.05680844990008564, + "grad_norm": 1.744964599609375, + "learning_rate": 4.96029987249472e-05, + "loss": 5.4287, + "step": 9552 + }, + { + "epoch": 0.05681439718336664, + "grad_norm": 1.9512866735458374, + "learning_rate": 4.9602915808457454e-05, + "loss": 5.3601, + "step": 9553 + }, + { + "epoch": 0.05682034446664763, + "grad_norm": 1.5863629579544067, + "learning_rate": 4.9602832883379077e-05, + "loss": 5.5491, + "step": 9554 + }, + { + "epoch": 0.056826291749928634, + "grad_norm": 1.967677354812622, + "learning_rate": 4.96027499497121e-05, + "loss": 5.2402, + "step": 9555 + }, + { + "epoch": 0.05683223903320963, + "grad_norm": 2.277714252471924, + "learning_rate": 4.960266700745657e-05, + "loss": 5.5155, + "step": 9556 + }, + { + "epoch": 0.056838186316490624, + "grad_norm": 1.8371034860610962, + "learning_rate": 4.96025840566125e-05, + "loss": 5.2694, + "step": 9557 + }, + { + "epoch": 0.056844133599771626, + "grad_norm": 1.723008155822754, + "learning_rate": 4.9602501097179915e-05, + "loss": 5.4983, + "step": 9558 + }, + { + "epoch": 0.05685008088305262, + "grad_norm": 1.6955413818359375, + "learning_rate": 4.960241812915886e-05, + "loss": 5.6888, + "step": 9559 + }, + { + "epoch": 0.056856028166333616, + "grad_norm": 1.5899012088775635, + "learning_rate": 4.960233515254935e-05, + "loss": 5.4241, + "step": 9560 + }, + { + "epoch": 0.05686197544961462, + "grad_norm": 1.493268370628357, + "learning_rate": 4.9602252167351416e-05, + "loss": 5.1889, + "step": 9561 + }, + { + "epoch": 0.056867922732895614, + "grad_norm": 1.8037081956863403, + "learning_rate": 4.9602169173565094e-05, + "loss": 5.1785, + "step": 9562 + }, + { + "epoch": 0.05687387001617661, + "grad_norm": 1.6377664804458618, + "learning_rate": 4.960208617119041e-05, + "loss": 5.2593, + "step": 9563 + }, + { + "epoch": 0.05687981729945761, + "grad_norm": 2.077209234237671, + "learning_rate": 4.960200316022739e-05, + "loss": 5.1012, + "step": 9564 + }, + { + "epoch": 0.056885764582738606, + "grad_norm": 2.3584885597229004, + "learning_rate": 4.9601920140676064e-05, + "loss": 5.1141, + "step": 9565 + }, + { + "epoch": 0.0568917118660196, + "grad_norm": 1.990319013595581, + "learning_rate": 4.960183711253646e-05, + "loss": 4.9336, + "step": 9566 + }, + { + "epoch": 0.0568976591493006, + "grad_norm": 2.037742853164673, + "learning_rate": 4.960175407580861e-05, + "loss": 4.8494, + "step": 9567 + }, + { + "epoch": 0.0569036064325816, + "grad_norm": 1.8493839502334595, + "learning_rate": 4.9601671030492546e-05, + "loss": 5.337, + "step": 9568 + }, + { + "epoch": 0.05690955371586259, + "grad_norm": 1.9864604473114014, + "learning_rate": 4.960158797658829e-05, + "loss": 5.5684, + "step": 9569 + }, + { + "epoch": 0.05691550099914359, + "grad_norm": 1.9740629196166992, + "learning_rate": 4.960150491409587e-05, + "loss": 5.444, + "step": 9570 + }, + { + "epoch": 0.05692144828242459, + "grad_norm": 1.9429807662963867, + "learning_rate": 4.960142184301533e-05, + "loss": 5.277, + "step": 9571 + }, + { + "epoch": 0.056927395565705585, + "grad_norm": 1.8953512907028198, + "learning_rate": 4.960133876334668e-05, + "loss": 5.1694, + "step": 9572 + }, + { + "epoch": 0.05693334284898658, + "grad_norm": 1.7716888189315796, + "learning_rate": 4.960125567508996e-05, + "loss": 5.1383, + "step": 9573 + }, + { + "epoch": 0.05693929013226758, + "grad_norm": 1.8266246318817139, + "learning_rate": 4.9601172578245194e-05, + "loss": 5.4019, + "step": 9574 + }, + { + "epoch": 0.05694523741554858, + "grad_norm": 1.8929648399353027, + "learning_rate": 4.9601089472812414e-05, + "loss": 5.3948, + "step": 9575 + }, + { + "epoch": 0.05695118469882957, + "grad_norm": 1.9918208122253418, + "learning_rate": 4.960100635879165e-05, + "loss": 5.3195, + "step": 9576 + }, + { + "epoch": 0.056957131982110575, + "grad_norm": 1.4987989664077759, + "learning_rate": 4.960092323618292e-05, + "loss": 5.5292, + "step": 9577 + }, + { + "epoch": 0.05696307926539157, + "grad_norm": 1.683800220489502, + "learning_rate": 4.960084010498627e-05, + "loss": 5.5069, + "step": 9578 + }, + { + "epoch": 0.056969026548672565, + "grad_norm": 1.767561435699463, + "learning_rate": 4.960075696520171e-05, + "loss": 5.4134, + "step": 9579 + }, + { + "epoch": 0.05697497383195357, + "grad_norm": 2.077564239501953, + "learning_rate": 4.960067381682929e-05, + "loss": 5.3362, + "step": 9580 + }, + { + "epoch": 0.05698092111523456, + "grad_norm": 2.0167109966278076, + "learning_rate": 4.960059065986903e-05, + "loss": 5.4235, + "step": 9581 + }, + { + "epoch": 0.05698686839851556, + "grad_norm": 1.647669792175293, + "learning_rate": 4.9600507494320953e-05, + "loss": 5.3273, + "step": 9582 + }, + { + "epoch": 0.05699281568179655, + "grad_norm": 1.6051719188690186, + "learning_rate": 4.960042432018509e-05, + "loss": 5.2486, + "step": 9583 + }, + { + "epoch": 0.056998762965077554, + "grad_norm": 1.9283394813537598, + "learning_rate": 4.960034113746148e-05, + "loss": 5.233, + "step": 9584 + }, + { + "epoch": 0.05700471024835855, + "grad_norm": 1.6215802431106567, + "learning_rate": 4.960025794615014e-05, + "loss": 5.2322, + "step": 9585 + }, + { + "epoch": 0.057010657531639544, + "grad_norm": 1.8902918100357056, + "learning_rate": 4.960017474625111e-05, + "loss": 5.063, + "step": 9586 + }, + { + "epoch": 0.057016604814920546, + "grad_norm": 2.4694666862487793, + "learning_rate": 4.9600091537764415e-05, + "loss": 4.498, + "step": 9587 + }, + { + "epoch": 0.05702255209820154, + "grad_norm": 1.98915433883667, + "learning_rate": 4.960000832069007e-05, + "loss": 4.8781, + "step": 9588 + }, + { + "epoch": 0.057028499381482536, + "grad_norm": 2.0424818992614746, + "learning_rate": 4.9599925095028126e-05, + "loss": 5.5803, + "step": 9589 + }, + { + "epoch": 0.05703444666476354, + "grad_norm": 1.471275806427002, + "learning_rate": 4.95998418607786e-05, + "loss": 5.5604, + "step": 9590 + }, + { + "epoch": 0.057040393948044534, + "grad_norm": 1.6512761116027832, + "learning_rate": 4.959975861794152e-05, + "loss": 5.2147, + "step": 9591 + }, + { + "epoch": 0.05704634123132553, + "grad_norm": 1.6902865171432495, + "learning_rate": 4.959967536651693e-05, + "loss": 5.2654, + "step": 9592 + }, + { + "epoch": 0.05705228851460653, + "grad_norm": 1.5656665563583374, + "learning_rate": 4.9599592106504835e-05, + "loss": 5.1106, + "step": 9593 + }, + { + "epoch": 0.057058235797887526, + "grad_norm": 1.760901927947998, + "learning_rate": 4.959950883790528e-05, + "loss": 5.1833, + "step": 9594 + }, + { + "epoch": 0.05706418308116852, + "grad_norm": 1.5585325956344604, + "learning_rate": 4.9599425560718294e-05, + "loss": 5.202, + "step": 9595 + }, + { + "epoch": 0.05707013036444952, + "grad_norm": 1.5477479696273804, + "learning_rate": 4.959934227494389e-05, + "loss": 5.121, + "step": 9596 + }, + { + "epoch": 0.05707607764773052, + "grad_norm": 1.9299825429916382, + "learning_rate": 4.959925898058213e-05, + "loss": 5.0026, + "step": 9597 + }, + { + "epoch": 0.05708202493101151, + "grad_norm": 1.866237759590149, + "learning_rate": 4.959917567763301e-05, + "loss": 4.999, + "step": 9598 + }, + { + "epoch": 0.05708797221429251, + "grad_norm": 1.6670162677764893, + "learning_rate": 4.959909236609657e-05, + "loss": 5.4047, + "step": 9599 + }, + { + "epoch": 0.05709391949757351, + "grad_norm": 1.4666836261749268, + "learning_rate": 4.9599009045972844e-05, + "loss": 5.3598, + "step": 9600 + }, + { + "epoch": 0.057099866780854505, + "grad_norm": 1.928645372390747, + "learning_rate": 4.959892571726186e-05, + "loss": 5.7015, + "step": 9601 + }, + { + "epoch": 0.0571058140641355, + "grad_norm": 1.9761322736740112, + "learning_rate": 4.959884237996365e-05, + "loss": 4.8682, + "step": 9602 + }, + { + "epoch": 0.0571117613474165, + "grad_norm": 1.9823036193847656, + "learning_rate": 4.959875903407823e-05, + "loss": 4.8752, + "step": 9603 + }, + { + "epoch": 0.0571177086306975, + "grad_norm": 1.9242253303527832, + "learning_rate": 4.959867567960564e-05, + "loss": 4.9314, + "step": 9604 + }, + { + "epoch": 0.05712365591397849, + "grad_norm": 1.740980625152588, + "learning_rate": 4.9598592316545904e-05, + "loss": 4.9843, + "step": 9605 + }, + { + "epoch": 0.057129603197259494, + "grad_norm": 2.0768508911132812, + "learning_rate": 4.959850894489906e-05, + "loss": 4.8528, + "step": 9606 + }, + { + "epoch": 0.05713555048054049, + "grad_norm": 1.7417833805084229, + "learning_rate": 4.959842556466513e-05, + "loss": 5.1374, + "step": 9607 + }, + { + "epoch": 0.057141497763821485, + "grad_norm": 1.933691382408142, + "learning_rate": 4.959834217584414e-05, + "loss": 5.349, + "step": 9608 + }, + { + "epoch": 0.05714744504710249, + "grad_norm": 1.8035194873809814, + "learning_rate": 4.959825877843612e-05, + "loss": 5.0212, + "step": 9609 + }, + { + "epoch": 0.05715339233038348, + "grad_norm": 2.323709487915039, + "learning_rate": 4.9598175372441106e-05, + "loss": 5.5346, + "step": 9610 + }, + { + "epoch": 0.05715933961366448, + "grad_norm": 1.755983591079712, + "learning_rate": 4.959809195785912e-05, + "loss": 4.8425, + "step": 9611 + }, + { + "epoch": 0.05716528689694547, + "grad_norm": 1.6614432334899902, + "learning_rate": 4.95980085346902e-05, + "loss": 4.912, + "step": 9612 + }, + { + "epoch": 0.057171234180226474, + "grad_norm": 1.8319662809371948, + "learning_rate": 4.959792510293436e-05, + "loss": 5.0125, + "step": 9613 + }, + { + "epoch": 0.05717718146350747, + "grad_norm": 1.8528090715408325, + "learning_rate": 4.959784166259165e-05, + "loss": 4.898, + "step": 9614 + }, + { + "epoch": 0.057183128746788464, + "grad_norm": 2.163757562637329, + "learning_rate": 4.959775821366208e-05, + "loss": 5.2041, + "step": 9615 + }, + { + "epoch": 0.057189076030069466, + "grad_norm": 1.939430832862854, + "learning_rate": 4.959767475614569e-05, + "loss": 5.3337, + "step": 9616 + }, + { + "epoch": 0.05719502331335046, + "grad_norm": 1.7198511362075806, + "learning_rate": 4.959759129004251e-05, + "loss": 5.2682, + "step": 9617 + }, + { + "epoch": 0.057200970596631456, + "grad_norm": 1.7674570083618164, + "learning_rate": 4.959750781535255e-05, + "loss": 5.4188, + "step": 9618 + }, + { + "epoch": 0.05720691787991246, + "grad_norm": 1.7197433710098267, + "learning_rate": 4.959742433207587e-05, + "loss": 5.1725, + "step": 9619 + }, + { + "epoch": 0.05721286516319345, + "grad_norm": 1.6682969331741333, + "learning_rate": 4.959734084021248e-05, + "loss": 5.1349, + "step": 9620 + }, + { + "epoch": 0.05721881244647445, + "grad_norm": 1.3784568309783936, + "learning_rate": 4.959725733976241e-05, + "loss": 5.2408, + "step": 9621 + }, + { + "epoch": 0.05722475972975545, + "grad_norm": 1.690483808517456, + "learning_rate": 4.9597173830725686e-05, + "loss": 5.2616, + "step": 9622 + }, + { + "epoch": 0.057230707013036446, + "grad_norm": 1.5313903093338013, + "learning_rate": 4.959709031310235e-05, + "loss": 5.1481, + "step": 9623 + }, + { + "epoch": 0.05723665429631744, + "grad_norm": 1.6266121864318848, + "learning_rate": 4.959700678689242e-05, + "loss": 5.0192, + "step": 9624 + }, + { + "epoch": 0.05724260157959844, + "grad_norm": 2.3125410079956055, + "learning_rate": 4.959692325209593e-05, + "loss": 4.5513, + "step": 9625 + }, + { + "epoch": 0.05724854886287944, + "grad_norm": 1.6884924173355103, + "learning_rate": 4.9596839708712913e-05, + "loss": 5.1917, + "step": 9626 + }, + { + "epoch": 0.05725449614616043, + "grad_norm": 1.5797723531723022, + "learning_rate": 4.9596756156743385e-05, + "loss": 5.5674, + "step": 9627 + }, + { + "epoch": 0.05726044342944143, + "grad_norm": 1.6152269840240479, + "learning_rate": 4.959667259618739e-05, + "loss": 5.4566, + "step": 9628 + }, + { + "epoch": 0.05726639071272243, + "grad_norm": 1.611608624458313, + "learning_rate": 4.959658902704495e-05, + "loss": 5.3678, + "step": 9629 + }, + { + "epoch": 0.057272337996003425, + "grad_norm": 1.774327278137207, + "learning_rate": 4.9596505449316086e-05, + "loss": 5.2438, + "step": 9630 + }, + { + "epoch": 0.05727828527928442, + "grad_norm": 1.7961443662643433, + "learning_rate": 4.9596421863000856e-05, + "loss": 5.3061, + "step": 9631 + }, + { + "epoch": 0.05728423256256542, + "grad_norm": 1.709675669670105, + "learning_rate": 4.959633826809925e-05, + "loss": 5.0095, + "step": 9632 + }, + { + "epoch": 0.05729017984584642, + "grad_norm": 1.7140734195709229, + "learning_rate": 4.959625466461132e-05, + "loss": 5.313, + "step": 9633 + }, + { + "epoch": 0.05729612712912741, + "grad_norm": 1.8302016258239746, + "learning_rate": 4.95961710525371e-05, + "loss": 5.4008, + "step": 9634 + }, + { + "epoch": 0.057302074412408414, + "grad_norm": 1.8570395708084106, + "learning_rate": 4.95960874318766e-05, + "loss": 5.513, + "step": 9635 + }, + { + "epoch": 0.05730802169568941, + "grad_norm": 1.6907027959823608, + "learning_rate": 4.959600380262987e-05, + "loss": 5.1933, + "step": 9636 + }, + { + "epoch": 0.057313968978970405, + "grad_norm": 1.6505299806594849, + "learning_rate": 4.9595920164796926e-05, + "loss": 5.1537, + "step": 9637 + }, + { + "epoch": 0.05731991626225141, + "grad_norm": 1.5248258113861084, + "learning_rate": 4.95958365183778e-05, + "loss": 5.4232, + "step": 9638 + }, + { + "epoch": 0.0573258635455324, + "grad_norm": 1.4630048274993896, + "learning_rate": 4.9595752863372524e-05, + "loss": 5.565, + "step": 9639 + }, + { + "epoch": 0.0573318108288134, + "grad_norm": 1.5858573913574219, + "learning_rate": 4.959566919978112e-05, + "loss": 5.4364, + "step": 9640 + }, + { + "epoch": 0.05733775811209439, + "grad_norm": 1.7803694009780884, + "learning_rate": 4.9595585527603625e-05, + "loss": 5.1727, + "step": 9641 + }, + { + "epoch": 0.057343705395375394, + "grad_norm": 1.639163851737976, + "learning_rate": 4.959550184684007e-05, + "loss": 5.5538, + "step": 9642 + }, + { + "epoch": 0.05734965267865639, + "grad_norm": 1.5917890071868896, + "learning_rate": 4.959541815749046e-05, + "loss": 5.6788, + "step": 9643 + }, + { + "epoch": 0.057355599961937384, + "grad_norm": 1.5524990558624268, + "learning_rate": 4.959533445955487e-05, + "loss": 5.7832, + "step": 9644 + }, + { + "epoch": 0.057361547245218386, + "grad_norm": 1.7229019403457642, + "learning_rate": 4.959525075303328e-05, + "loss": 5.4417, + "step": 9645 + }, + { + "epoch": 0.05736749452849938, + "grad_norm": 1.5434623956680298, + "learning_rate": 4.959516703792575e-05, + "loss": 5.3629, + "step": 9646 + }, + { + "epoch": 0.057373441811780376, + "grad_norm": 1.4929866790771484, + "learning_rate": 4.9595083314232306e-05, + "loss": 5.8586, + "step": 9647 + }, + { + "epoch": 0.05737938909506138, + "grad_norm": 1.209796667098999, + "learning_rate": 4.959499958195297e-05, + "loss": 5.5001, + "step": 9648 + }, + { + "epoch": 0.05738533637834237, + "grad_norm": 2.703871488571167, + "learning_rate": 4.9594915841087775e-05, + "loss": 5.6564, + "step": 9649 + }, + { + "epoch": 0.05739128366162337, + "grad_norm": 1.9408828020095825, + "learning_rate": 4.959483209163674e-05, + "loss": 5.6683, + "step": 9650 + }, + { + "epoch": 0.05739723094490437, + "grad_norm": 1.8055803775787354, + "learning_rate": 4.9594748333599914e-05, + "loss": 5.3046, + "step": 9651 + }, + { + "epoch": 0.057403178228185366, + "grad_norm": 2.3453104496002197, + "learning_rate": 4.959466456697731e-05, + "loss": 6.1944, + "step": 9652 + }, + { + "epoch": 0.05740912551146636, + "grad_norm": 2.3799800872802734, + "learning_rate": 4.959458079176897e-05, + "loss": 5.6706, + "step": 9653 + }, + { + "epoch": 0.05741507279474736, + "grad_norm": 2.111069440841675, + "learning_rate": 4.959449700797491e-05, + "loss": 5.1808, + "step": 9654 + }, + { + "epoch": 0.05742102007802836, + "grad_norm": 2.237873077392578, + "learning_rate": 4.9594413215595164e-05, + "loss": 5.0609, + "step": 9655 + }, + { + "epoch": 0.05742696736130935, + "grad_norm": 1.956520438194275, + "learning_rate": 4.959432941462977e-05, + "loss": 5.1431, + "step": 9656 + }, + { + "epoch": 0.05743291464459035, + "grad_norm": 2.3761603832244873, + "learning_rate": 4.9594245605078735e-05, + "loss": 4.8722, + "step": 9657 + }, + { + "epoch": 0.05743886192787135, + "grad_norm": 1.820745825767517, + "learning_rate": 4.959416178694212e-05, + "loss": 5.0149, + "step": 9658 + }, + { + "epoch": 0.057444809211152345, + "grad_norm": 2.0804755687713623, + "learning_rate": 4.9594077960219924e-05, + "loss": 5.7698, + "step": 9659 + }, + { + "epoch": 0.05745075649443334, + "grad_norm": 1.9319117069244385, + "learning_rate": 4.9593994124912196e-05, + "loss": 5.3054, + "step": 9660 + }, + { + "epoch": 0.05745670377771434, + "grad_norm": 2.386338472366333, + "learning_rate": 4.959391028101896e-05, + "loss": 5.2093, + "step": 9661 + }, + { + "epoch": 0.05746265106099534, + "grad_norm": 1.852386474609375, + "learning_rate": 4.9593826428540244e-05, + "loss": 5.1943, + "step": 9662 + }, + { + "epoch": 0.05746859834427633, + "grad_norm": 1.9619694948196411, + "learning_rate": 4.959374256747607e-05, + "loss": 4.8275, + "step": 9663 + }, + { + "epoch": 0.057474545627557334, + "grad_norm": 2.4797024726867676, + "learning_rate": 4.9593658697826485e-05, + "loss": 5.5257, + "step": 9664 + }, + { + "epoch": 0.05748049291083833, + "grad_norm": 2.1713874340057373, + "learning_rate": 4.959357481959149e-05, + "loss": 5.4486, + "step": 9665 + }, + { + "epoch": 0.057486440194119325, + "grad_norm": 1.9605398178100586, + "learning_rate": 4.9593490932771145e-05, + "loss": 5.1512, + "step": 9666 + }, + { + "epoch": 0.05749238747740033, + "grad_norm": 1.9853549003601074, + "learning_rate": 4.959340703736547e-05, + "loss": 5.665, + "step": 9667 + }, + { + "epoch": 0.05749833476068132, + "grad_norm": 1.984279990196228, + "learning_rate": 4.9593323133374494e-05, + "loss": 5.7797, + "step": 9668 + }, + { + "epoch": 0.05750428204396232, + "grad_norm": 1.8343236446380615, + "learning_rate": 4.9593239220798225e-05, + "loss": 5.0261, + "step": 9669 + }, + { + "epoch": 0.05751022932724331, + "grad_norm": 1.8675687313079834, + "learning_rate": 4.959315529963673e-05, + "loss": 4.8754, + "step": 9670 + }, + { + "epoch": 0.057516176610524314, + "grad_norm": 1.9129834175109863, + "learning_rate": 4.959307136989e-05, + "loss": 5.1056, + "step": 9671 + }, + { + "epoch": 0.05752212389380531, + "grad_norm": 3.142893075942993, + "learning_rate": 4.95929874315581e-05, + "loss": 5.6029, + "step": 9672 + }, + { + "epoch": 0.057528071177086304, + "grad_norm": 1.80843985080719, + "learning_rate": 4.9592903484641026e-05, + "loss": 5.57, + "step": 9673 + }, + { + "epoch": 0.057534018460367306, + "grad_norm": 1.9195841550827026, + "learning_rate": 4.9592819529138835e-05, + "loss": 5.6964, + "step": 9674 + }, + { + "epoch": 0.0575399657436483, + "grad_norm": 2.026477813720703, + "learning_rate": 4.959273556505154e-05, + "loss": 5.8544, + "step": 9675 + }, + { + "epoch": 0.057545913026929296, + "grad_norm": 2.111274003982544, + "learning_rate": 4.959265159237918e-05, + "loss": 5.8014, + "step": 9676 + }, + { + "epoch": 0.0575518603102103, + "grad_norm": 1.9789505004882812, + "learning_rate": 4.9592567611121776e-05, + "loss": 5.7646, + "step": 9677 + }, + { + "epoch": 0.05755780759349129, + "grad_norm": 1.8776015043258667, + "learning_rate": 4.9592483621279365e-05, + "loss": 6.1603, + "step": 9678 + }, + { + "epoch": 0.05756375487677229, + "grad_norm": 2.135849714279175, + "learning_rate": 4.9592399622851956e-05, + "loss": 5.6372, + "step": 9679 + }, + { + "epoch": 0.05756970216005329, + "grad_norm": 2.3335585594177246, + "learning_rate": 4.959231561583961e-05, + "loss": 5.5515, + "step": 9680 + }, + { + "epoch": 0.057575649443334286, + "grad_norm": 1.9315869808197021, + "learning_rate": 4.9592231600242337e-05, + "loss": 5.9287, + "step": 9681 + }, + { + "epoch": 0.05758159672661528, + "grad_norm": 2.4559311866760254, + "learning_rate": 4.959214757606017e-05, + "loss": 5.6079, + "step": 9682 + }, + { + "epoch": 0.05758754400989628, + "grad_norm": 2.6558609008789062, + "learning_rate": 4.959206354329314e-05, + "loss": 5.5728, + "step": 9683 + }, + { + "epoch": 0.05759349129317728, + "grad_norm": 2.2376396656036377, + "learning_rate": 4.9591979501941274e-05, + "loss": 5.5318, + "step": 9684 + }, + { + "epoch": 0.05759943857645827, + "grad_norm": 1.8506240844726562, + "learning_rate": 4.95918954520046e-05, + "loss": 5.7957, + "step": 9685 + }, + { + "epoch": 0.05760538585973927, + "grad_norm": 2.2428138256073, + "learning_rate": 4.9591811393483144e-05, + "loss": 5.7223, + "step": 9686 + }, + { + "epoch": 0.05761133314302027, + "grad_norm": 2.5734875202178955, + "learning_rate": 4.9591727326376955e-05, + "loss": 5.3401, + "step": 9687 + }, + { + "epoch": 0.057617280426301265, + "grad_norm": 2.567263126373291, + "learning_rate": 4.959164325068604e-05, + "loss": 5.4853, + "step": 9688 + }, + { + "epoch": 0.05762322770958226, + "grad_norm": 2.4430556297302246, + "learning_rate": 4.959155916641043e-05, + "loss": 5.9845, + "step": 9689 + }, + { + "epoch": 0.05762917499286326, + "grad_norm": 2.039846181869507, + "learning_rate": 4.959147507355017e-05, + "loss": 6.0689, + "step": 9690 + }, + { + "epoch": 0.05763512227614426, + "grad_norm": 2.207920551300049, + "learning_rate": 4.959139097210528e-05, + "loss": 5.6658, + "step": 9691 + }, + { + "epoch": 0.05764106955942525, + "grad_norm": 1.7421616315841675, + "learning_rate": 4.959130686207578e-05, + "loss": 6.0915, + "step": 9692 + }, + { + "epoch": 0.057647016842706254, + "grad_norm": 1.7738968133926392, + "learning_rate": 4.9591222743461716e-05, + "loss": 6.2092, + "step": 9693 + }, + { + "epoch": 0.05765296412598725, + "grad_norm": 1.8665943145751953, + "learning_rate": 4.959113861626311e-05, + "loss": 6.0922, + "step": 9694 + }, + { + "epoch": 0.057658911409268244, + "grad_norm": 2.0272347927093506, + "learning_rate": 4.959105448047999e-05, + "loss": 5.8291, + "step": 9695 + }, + { + "epoch": 0.057664858692549247, + "grad_norm": 2.8527796268463135, + "learning_rate": 4.9590970336112395e-05, + "loss": 5.428, + "step": 9696 + }, + { + "epoch": 0.05767080597583024, + "grad_norm": 1.8518950939178467, + "learning_rate": 4.959088618316033e-05, + "loss": 5.4199, + "step": 9697 + }, + { + "epoch": 0.05767675325911124, + "grad_norm": 2.38712739944458, + "learning_rate": 4.959080202162386e-05, + "loss": 5.1627, + "step": 9698 + }, + { + "epoch": 0.05768270054239223, + "grad_norm": 1.8407059907913208, + "learning_rate": 4.959071785150298e-05, + "loss": 5.1827, + "step": 9699 + }, + { + "epoch": 0.057688647825673234, + "grad_norm": 2.431151866912842, + "learning_rate": 4.9590633672797744e-05, + "loss": 6.1722, + "step": 9700 + }, + { + "epoch": 0.05769459510895423, + "grad_norm": 2.498046398162842, + "learning_rate": 4.9590549485508165e-05, + "loss": 6.2321, + "step": 9701 + }, + { + "epoch": 0.057700542392235224, + "grad_norm": 1.8793575763702393, + "learning_rate": 4.959046528963428e-05, + "loss": 5.4019, + "step": 9702 + }, + { + "epoch": 0.057706489675516226, + "grad_norm": 2.137622117996216, + "learning_rate": 4.9590381085176115e-05, + "loss": 5.9118, + "step": 9703 + }, + { + "epoch": 0.05771243695879722, + "grad_norm": 1.9514268636703491, + "learning_rate": 4.959029687213371e-05, + "loss": 5.6651, + "step": 9704 + }, + { + "epoch": 0.057718384242078216, + "grad_norm": 2.3678367137908936, + "learning_rate": 4.9590212650507085e-05, + "loss": 5.2054, + "step": 9705 + }, + { + "epoch": 0.05772433152535922, + "grad_norm": 2.8808276653289795, + "learning_rate": 4.9590128420296266e-05, + "loss": 5.3066, + "step": 9706 + }, + { + "epoch": 0.05773027880864021, + "grad_norm": 2.2405474185943604, + "learning_rate": 4.9590044181501297e-05, + "loss": 5.2904, + "step": 9707 + }, + { + "epoch": 0.05773622609192121, + "grad_norm": 2.3762283325195312, + "learning_rate": 4.958995993412219e-05, + "loss": 5.5847, + "step": 9708 + }, + { + "epoch": 0.05774217337520221, + "grad_norm": 2.5258681774139404, + "learning_rate": 4.958987567815898e-05, + "loss": 5.4852, + "step": 9709 + }, + { + "epoch": 0.057748120658483205, + "grad_norm": 2.31478214263916, + "learning_rate": 4.9589791413611704e-05, + "loss": 5.5658, + "step": 9710 + }, + { + "epoch": 0.0577540679417642, + "grad_norm": 1.735771894454956, + "learning_rate": 4.958970714048038e-05, + "loss": 6.0311, + "step": 9711 + }, + { + "epoch": 0.0577600152250452, + "grad_norm": 2.2843849658966064, + "learning_rate": 4.958962285876505e-05, + "loss": 5.9535, + "step": 9712 + }, + { + "epoch": 0.0577659625083262, + "grad_norm": 2.3449392318725586, + "learning_rate": 4.958953856846573e-05, + "loss": 5.9835, + "step": 9713 + }, + { + "epoch": 0.05777190979160719, + "grad_norm": 2.319952964782715, + "learning_rate": 4.9589454269582456e-05, + "loss": 5.5318, + "step": 9714 + }, + { + "epoch": 0.05777785707488819, + "grad_norm": 2.6801493167877197, + "learning_rate": 4.958936996211526e-05, + "loss": 4.8672, + "step": 9715 + }, + { + "epoch": 0.05778380435816919, + "grad_norm": 2.622528553009033, + "learning_rate": 4.958928564606418e-05, + "loss": 6.0755, + "step": 9716 + }, + { + "epoch": 0.057789751641450185, + "grad_norm": 1.973480224609375, + "learning_rate": 4.9589201321429216e-05, + "loss": 5.8197, + "step": 9717 + }, + { + "epoch": 0.05779569892473118, + "grad_norm": 2.060497760772705, + "learning_rate": 4.958911698821043e-05, + "loss": 5.2838, + "step": 9718 + }, + { + "epoch": 0.05780164620801218, + "grad_norm": 2.068103551864624, + "learning_rate": 4.958903264640783e-05, + "loss": 5.4917, + "step": 9719 + }, + { + "epoch": 0.05780759349129318, + "grad_norm": 2.5899293422698975, + "learning_rate": 4.958894829602145e-05, + "loss": 5.1312, + "step": 9720 + }, + { + "epoch": 0.05781354077457417, + "grad_norm": 3.2153897285461426, + "learning_rate": 4.958886393705132e-05, + "loss": 4.7502, + "step": 9721 + }, + { + "epoch": 0.057819488057855174, + "grad_norm": 2.805802345275879, + "learning_rate": 4.9588779569497484e-05, + "loss": 4.6876, + "step": 9722 + }, + { + "epoch": 0.05782543534113617, + "grad_norm": 2.3670101165771484, + "learning_rate": 4.958869519335995e-05, + "loss": 4.6025, + "step": 9723 + }, + { + "epoch": 0.057831382624417164, + "grad_norm": 1.992903709411621, + "learning_rate": 4.9588610808638755e-05, + "loss": 5.3602, + "step": 9724 + }, + { + "epoch": 0.057837329907698166, + "grad_norm": 2.249572277069092, + "learning_rate": 4.958852641533394e-05, + "loss": 4.9574, + "step": 9725 + }, + { + "epoch": 0.05784327719097916, + "grad_norm": 2.500433921813965, + "learning_rate": 4.958844201344552e-05, + "loss": 5.3656, + "step": 9726 + }, + { + "epoch": 0.05784922447426016, + "grad_norm": 2.0277605056762695, + "learning_rate": 4.9588357602973526e-05, + "loss": 5.6467, + "step": 9727 + }, + { + "epoch": 0.05785517175754116, + "grad_norm": 2.1196112632751465, + "learning_rate": 4.958827318391799e-05, + "loss": 5.6257, + "step": 9728 + }, + { + "epoch": 0.057861119040822154, + "grad_norm": 3.160593271255493, + "learning_rate": 4.9588188756278945e-05, + "loss": 4.9618, + "step": 9729 + }, + { + "epoch": 0.05786706632410315, + "grad_norm": 1.90407395362854, + "learning_rate": 4.958810432005642e-05, + "loss": 5.4551, + "step": 9730 + }, + { + "epoch": 0.057873013607384144, + "grad_norm": 2.0096004009246826, + "learning_rate": 4.958801987525043e-05, + "loss": 5.6562, + "step": 9731 + }, + { + "epoch": 0.057878960890665146, + "grad_norm": 2.617847442626953, + "learning_rate": 4.958793542186103e-05, + "loss": 5.747, + "step": 9732 + }, + { + "epoch": 0.05788490817394614, + "grad_norm": 2.3982057571411133, + "learning_rate": 4.9587850959888226e-05, + "loss": 5.6146, + "step": 9733 + }, + { + "epoch": 0.057890855457227136, + "grad_norm": 2.0222113132476807, + "learning_rate": 4.9587766489332065e-05, + "loss": 6.0204, + "step": 9734 + }, + { + "epoch": 0.05789680274050814, + "grad_norm": 2.1110177040100098, + "learning_rate": 4.958768201019257e-05, + "loss": 5.2957, + "step": 9735 + }, + { + "epoch": 0.05790275002378913, + "grad_norm": 1.8278865814208984, + "learning_rate": 4.958759752246977e-05, + "loss": 5.9902, + "step": 9736 + }, + { + "epoch": 0.05790869730707013, + "grad_norm": 2.2461514472961426, + "learning_rate": 4.958751302616368e-05, + "loss": 5.8572, + "step": 9737 + }, + { + "epoch": 0.05791464459035113, + "grad_norm": 1.7453250885009766, + "learning_rate": 4.958742852127435e-05, + "loss": 5.6658, + "step": 9738 + }, + { + "epoch": 0.057920591873632125, + "grad_norm": 2.480726718902588, + "learning_rate": 4.95873440078018e-05, + "loss": 5.4231, + "step": 9739 + }, + { + "epoch": 0.05792653915691312, + "grad_norm": 2.2310776710510254, + "learning_rate": 4.958725948574607e-05, + "loss": 5.4768, + "step": 9740 + }, + { + "epoch": 0.05793248644019412, + "grad_norm": 1.9454891681671143, + "learning_rate": 4.958717495510718e-05, + "loss": 5.4503, + "step": 9741 + }, + { + "epoch": 0.05793843372347512, + "grad_norm": 2.196054458618164, + "learning_rate": 4.958709041588516e-05, + "loss": 5.1987, + "step": 9742 + }, + { + "epoch": 0.05794438100675611, + "grad_norm": 2.385000228881836, + "learning_rate": 4.958700586808004e-05, + "loss": 5.8413, + "step": 9743 + }, + { + "epoch": 0.05795032829003711, + "grad_norm": 2.0967705249786377, + "learning_rate": 4.958692131169185e-05, + "loss": 5.8531, + "step": 9744 + }, + { + "epoch": 0.05795627557331811, + "grad_norm": 2.186253309249878, + "learning_rate": 4.958683674672062e-05, + "loss": 5.8241, + "step": 9745 + }, + { + "epoch": 0.057962222856599105, + "grad_norm": 1.8932995796203613, + "learning_rate": 4.958675217316638e-05, + "loss": 5.8724, + "step": 9746 + }, + { + "epoch": 0.0579681701398801, + "grad_norm": 1.9706943035125732, + "learning_rate": 4.958666759102916e-05, + "loss": 5.6565, + "step": 9747 + }, + { + "epoch": 0.0579741174231611, + "grad_norm": 1.7686703205108643, + "learning_rate": 4.958658300030898e-05, + "loss": 5.6299, + "step": 9748 + }, + { + "epoch": 0.0579800647064421, + "grad_norm": 2.309403419494629, + "learning_rate": 4.958649840100589e-05, + "loss": 4.6907, + "step": 9749 + }, + { + "epoch": 0.05798601198972309, + "grad_norm": 2.139760971069336, + "learning_rate": 4.95864137931199e-05, + "loss": 4.7311, + "step": 9750 + }, + { + "epoch": 0.057991959273004094, + "grad_norm": 1.960402011871338, + "learning_rate": 4.958632917665105e-05, + "loss": 5.598, + "step": 9751 + }, + { + "epoch": 0.05799790655628509, + "grad_norm": 1.721853256225586, + "learning_rate": 4.958624455159936e-05, + "loss": 6.0519, + "step": 9752 + }, + { + "epoch": 0.058003853839566084, + "grad_norm": 1.8527748584747314, + "learning_rate": 4.958615991796487e-05, + "loss": 5.3347, + "step": 9753 + }, + { + "epoch": 0.058009801122847086, + "grad_norm": 2.070084810256958, + "learning_rate": 4.958607527574761e-05, + "loss": 4.6653, + "step": 9754 + }, + { + "epoch": 0.05801574840612808, + "grad_norm": 2.143115997314453, + "learning_rate": 4.9585990624947605e-05, + "loss": 4.6522, + "step": 9755 + }, + { + "epoch": 0.05802169568940908, + "grad_norm": 2.2870991230010986, + "learning_rate": 4.9585905965564884e-05, + "loss": 4.7037, + "step": 9756 + }, + { + "epoch": 0.05802764297269008, + "grad_norm": 2.0633544921875, + "learning_rate": 4.958582129759947e-05, + "loss": 4.689, + "step": 9757 + }, + { + "epoch": 0.058033590255971074, + "grad_norm": 1.8845857381820679, + "learning_rate": 4.95857366210514e-05, + "loss": 4.8077, + "step": 9758 + }, + { + "epoch": 0.05803953753925207, + "grad_norm": 1.7319310903549194, + "learning_rate": 4.9585651935920715e-05, + "loss": 5.3528, + "step": 9759 + }, + { + "epoch": 0.058045484822533064, + "grad_norm": 2.2369909286499023, + "learning_rate": 4.958556724220742e-05, + "loss": 4.6549, + "step": 9760 + }, + { + "epoch": 0.058051432105814066, + "grad_norm": 2.076901912689209, + "learning_rate": 4.9585482539911566e-05, + "loss": 4.4642, + "step": 9761 + }, + { + "epoch": 0.05805737938909506, + "grad_norm": 2.0487091541290283, + "learning_rate": 4.958539782903318e-05, + "loss": 4.6575, + "step": 9762 + }, + { + "epoch": 0.058063326672376056, + "grad_norm": 2.2116169929504395, + "learning_rate": 4.9585313109572274e-05, + "loss": 4.4866, + "step": 9763 + }, + { + "epoch": 0.05806927395565706, + "grad_norm": 1.9818168878555298, + "learning_rate": 4.958522838152889e-05, + "loss": 4.7502, + "step": 9764 + }, + { + "epoch": 0.05807522123893805, + "grad_norm": 2.1484010219573975, + "learning_rate": 4.958514364490306e-05, + "loss": 5.7809, + "step": 9765 + }, + { + "epoch": 0.05808116852221905, + "grad_norm": 2.4087398052215576, + "learning_rate": 4.958505889969481e-05, + "loss": 5.5236, + "step": 9766 + }, + { + "epoch": 0.05808711580550005, + "grad_norm": 2.000459909439087, + "learning_rate": 4.9584974145904165e-05, + "loss": 4.7356, + "step": 9767 + }, + { + "epoch": 0.058093063088781045, + "grad_norm": 2.3958399295806885, + "learning_rate": 4.958488938353116e-05, + "loss": 4.3695, + "step": 9768 + }, + { + "epoch": 0.05809901037206204, + "grad_norm": 2.039053440093994, + "learning_rate": 4.958480461257584e-05, + "loss": 4.6128, + "step": 9769 + }, + { + "epoch": 0.05810495765534304, + "grad_norm": 1.7663822174072266, + "learning_rate": 4.95847198330382e-05, + "loss": 4.8533, + "step": 9770 + }, + { + "epoch": 0.05811090493862404, + "grad_norm": 2.594289779663086, + "learning_rate": 4.9584635044918295e-05, + "loss": 5.3048, + "step": 9771 + }, + { + "epoch": 0.05811685222190503, + "grad_norm": 2.712372303009033, + "learning_rate": 4.958455024821615e-05, + "loss": 5.4435, + "step": 9772 + }, + { + "epoch": 0.05812279950518603, + "grad_norm": 2.4295241832733154, + "learning_rate": 4.9584465442931794e-05, + "loss": 5.2665, + "step": 9773 + }, + { + "epoch": 0.05812874678846703, + "grad_norm": 2.5820906162261963, + "learning_rate": 4.9584380629065245e-05, + "loss": 5.6227, + "step": 9774 + }, + { + "epoch": 0.058134694071748025, + "grad_norm": 2.140291213989258, + "learning_rate": 4.958429580661655e-05, + "loss": 5.1792, + "step": 9775 + }, + { + "epoch": 0.05814064135502902, + "grad_norm": 2.111551523208618, + "learning_rate": 4.9584210975585734e-05, + "loss": 5.7262, + "step": 9776 + }, + { + "epoch": 0.05814658863831002, + "grad_norm": 2.5887086391448975, + "learning_rate": 4.958412613597282e-05, + "loss": 5.1613, + "step": 9777 + }, + { + "epoch": 0.05815253592159102, + "grad_norm": 1.9678863286972046, + "learning_rate": 4.9584041287777835e-05, + "loss": 5.7693, + "step": 9778 + }, + { + "epoch": 0.05815848320487201, + "grad_norm": 2.000265121459961, + "learning_rate": 4.958395643100083e-05, + "loss": 5.654, + "step": 9779 + }, + { + "epoch": 0.058164430488153014, + "grad_norm": 1.8926239013671875, + "learning_rate": 4.958387156564181e-05, + "loss": 5.3004, + "step": 9780 + }, + { + "epoch": 0.05817037777143401, + "grad_norm": 2.3557002544403076, + "learning_rate": 4.958378669170082e-05, + "loss": 5.5437, + "step": 9781 + }, + { + "epoch": 0.058176325054715004, + "grad_norm": 1.9434150457382202, + "learning_rate": 4.958370180917787e-05, + "loss": 5.8442, + "step": 9782 + }, + { + "epoch": 0.058182272337996006, + "grad_norm": 1.875900387763977, + "learning_rate": 4.9583616918073026e-05, + "loss": 5.9312, + "step": 9783 + }, + { + "epoch": 0.058188219621277, + "grad_norm": 1.8945306539535522, + "learning_rate": 4.958353201838628e-05, + "loss": 5.7166, + "step": 9784 + }, + { + "epoch": 0.058194166904557997, + "grad_norm": 1.7081416845321655, + "learning_rate": 4.9583447110117684e-05, + "loss": 6.0803, + "step": 9785 + }, + { + "epoch": 0.058200114187839, + "grad_norm": 1.6520098447799683, + "learning_rate": 4.958336219326725e-05, + "loss": 6.0181, + "step": 9786 + }, + { + "epoch": 0.058206061471119994, + "grad_norm": 1.90665602684021, + "learning_rate": 4.9583277267835024e-05, + "loss": 5.586, + "step": 9787 + }, + { + "epoch": 0.05821200875440099, + "grad_norm": 1.8179740905761719, + "learning_rate": 4.958319233382104e-05, + "loss": 5.8637, + "step": 9788 + }, + { + "epoch": 0.058217956037681984, + "grad_norm": 1.8228380680084229, + "learning_rate": 4.95831073912253e-05, + "loss": 5.7406, + "step": 9789 + }, + { + "epoch": 0.058223903320962986, + "grad_norm": 1.691999912261963, + "learning_rate": 4.958302244004786e-05, + "loss": 5.8021, + "step": 9790 + }, + { + "epoch": 0.05822985060424398, + "grad_norm": 1.8590795993804932, + "learning_rate": 4.958293748028875e-05, + "loss": 5.5897, + "step": 9791 + }, + { + "epoch": 0.058235797887524976, + "grad_norm": 1.5923960208892822, + "learning_rate": 4.958285251194797e-05, + "loss": 5.7424, + "step": 9792 + }, + { + "epoch": 0.05824174517080598, + "grad_norm": 1.6928486824035645, + "learning_rate": 4.958276753502559e-05, + "loss": 5.905, + "step": 9793 + }, + { + "epoch": 0.05824769245408697, + "grad_norm": 2.120725393295288, + "learning_rate": 4.958268254952161e-05, + "loss": 5.9974, + "step": 9794 + }, + { + "epoch": 0.05825363973736797, + "grad_norm": 1.850441813468933, + "learning_rate": 4.9582597555436075e-05, + "loss": 5.7171, + "step": 9795 + }, + { + "epoch": 0.05825958702064897, + "grad_norm": 2.196037530899048, + "learning_rate": 4.9582512552769e-05, + "loss": 6.1243, + "step": 9796 + }, + { + "epoch": 0.058265534303929965, + "grad_norm": 1.9170193672180176, + "learning_rate": 4.9582427541520423e-05, + "loss": 5.8087, + "step": 9797 + }, + { + "epoch": 0.05827148158721096, + "grad_norm": 1.974478006362915, + "learning_rate": 4.958234252169039e-05, + "loss": 5.794, + "step": 9798 + }, + { + "epoch": 0.05827742887049196, + "grad_norm": 1.824965476989746, + "learning_rate": 4.9582257493278904e-05, + "loss": 5.6904, + "step": 9799 + }, + { + "epoch": 0.05828337615377296, + "grad_norm": 1.828037142753601, + "learning_rate": 4.9582172456286e-05, + "loss": 5.6793, + "step": 9800 + }, + { + "epoch": 0.05828932343705395, + "grad_norm": 1.8949617147445679, + "learning_rate": 4.9582087410711726e-05, + "loss": 5.6685, + "step": 9801 + }, + { + "epoch": 0.05829527072033495, + "grad_norm": 1.8183050155639648, + "learning_rate": 4.958200235655609e-05, + "loss": 5.7754, + "step": 9802 + }, + { + "epoch": 0.05830121800361595, + "grad_norm": 1.6816062927246094, + "learning_rate": 4.9581917293819135e-05, + "loss": 5.6931, + "step": 9803 + }, + { + "epoch": 0.058307165286896945, + "grad_norm": 1.875659465789795, + "learning_rate": 4.958183222250089e-05, + "loss": 5.7568, + "step": 9804 + }, + { + "epoch": 0.05831311257017794, + "grad_norm": 2.162404775619507, + "learning_rate": 4.958174714260137e-05, + "loss": 5.7969, + "step": 9805 + }, + { + "epoch": 0.05831905985345894, + "grad_norm": 2.2122790813446045, + "learning_rate": 4.958166205412064e-05, + "loss": 5.7301, + "step": 9806 + }, + { + "epoch": 0.05832500713673994, + "grad_norm": 1.8822424411773682, + "learning_rate": 4.9581576957058686e-05, + "loss": 5.7034, + "step": 9807 + }, + { + "epoch": 0.05833095442002093, + "grad_norm": 1.8780319690704346, + "learning_rate": 4.958149185141556e-05, + "loss": 5.6573, + "step": 9808 + }, + { + "epoch": 0.058336901703301934, + "grad_norm": 1.9177708625793457, + "learning_rate": 4.958140673719129e-05, + "loss": 5.6619, + "step": 9809 + }, + { + "epoch": 0.05834284898658293, + "grad_norm": 1.8662844896316528, + "learning_rate": 4.95813216143859e-05, + "loss": 5.5857, + "step": 9810 + }, + { + "epoch": 0.058348796269863924, + "grad_norm": 2.1798834800720215, + "learning_rate": 4.958123648299944e-05, + "loss": 5.5811, + "step": 9811 + }, + { + "epoch": 0.058354743553144926, + "grad_norm": 2.1575138568878174, + "learning_rate": 4.958115134303191e-05, + "loss": 5.6761, + "step": 9812 + }, + { + "epoch": 0.05836069083642592, + "grad_norm": 2.055314302444458, + "learning_rate": 4.958106619448336e-05, + "loss": 5.721, + "step": 9813 + }, + { + "epoch": 0.058366638119706916, + "grad_norm": 1.8962149620056152, + "learning_rate": 4.958098103735381e-05, + "loss": 5.6132, + "step": 9814 + }, + { + "epoch": 0.05837258540298792, + "grad_norm": 1.7715760469436646, + "learning_rate": 4.95808958716433e-05, + "loss": 5.6461, + "step": 9815 + }, + { + "epoch": 0.058378532686268914, + "grad_norm": 1.9166070222854614, + "learning_rate": 4.958081069735184e-05, + "loss": 5.5628, + "step": 9816 + }, + { + "epoch": 0.05838447996954991, + "grad_norm": 1.8872902393341064, + "learning_rate": 4.9580725514479484e-05, + "loss": 5.6476, + "step": 9817 + }, + { + "epoch": 0.058390427252830904, + "grad_norm": 1.8257521390914917, + "learning_rate": 4.9580640323026254e-05, + "loss": 5.6175, + "step": 9818 + }, + { + "epoch": 0.058396374536111906, + "grad_norm": 1.919291377067566, + "learning_rate": 4.958055512299217e-05, + "loss": 5.5954, + "step": 9819 + }, + { + "epoch": 0.0584023218193929, + "grad_norm": 1.8318076133728027, + "learning_rate": 4.958046991437726e-05, + "loss": 5.6255, + "step": 9820 + }, + { + "epoch": 0.058408269102673896, + "grad_norm": 1.9153858423233032, + "learning_rate": 4.958038469718158e-05, + "loss": 5.6787, + "step": 9821 + }, + { + "epoch": 0.0584142163859549, + "grad_norm": 1.967021107673645, + "learning_rate": 4.958029947140513e-05, + "loss": 5.6714, + "step": 9822 + }, + { + "epoch": 0.05842016366923589, + "grad_norm": 1.654997706413269, + "learning_rate": 4.958021423704795e-05, + "loss": 5.4809, + "step": 9823 + }, + { + "epoch": 0.05842611095251689, + "grad_norm": 1.8183335065841675, + "learning_rate": 4.9580128994110074e-05, + "loss": 5.5223, + "step": 9824 + }, + { + "epoch": 0.05843205823579789, + "grad_norm": 1.7665660381317139, + "learning_rate": 4.958004374259153e-05, + "loss": 5.5639, + "step": 9825 + }, + { + "epoch": 0.058438005519078885, + "grad_norm": 1.8233551979064941, + "learning_rate": 4.957995848249235e-05, + "loss": 5.6358, + "step": 9826 + }, + { + "epoch": 0.05844395280235988, + "grad_norm": 1.721301555633545, + "learning_rate": 4.957987321381256e-05, + "loss": 5.4989, + "step": 9827 + }, + { + "epoch": 0.05844990008564088, + "grad_norm": 1.6921659708023071, + "learning_rate": 4.957978793655218e-05, + "loss": 5.448, + "step": 9828 + }, + { + "epoch": 0.05845584736892188, + "grad_norm": 1.810354232788086, + "learning_rate": 4.957970265071126e-05, + "loss": 5.4501, + "step": 9829 + }, + { + "epoch": 0.05846179465220287, + "grad_norm": 1.7205116748809814, + "learning_rate": 4.957961735628982e-05, + "loss": 5.5222, + "step": 9830 + }, + { + "epoch": 0.05846774193548387, + "grad_norm": 1.9636965990066528, + "learning_rate": 4.957953205328788e-05, + "loss": 5.5894, + "step": 9831 + }, + { + "epoch": 0.05847368921876487, + "grad_norm": 1.9312820434570312, + "learning_rate": 4.9579446741705485e-05, + "loss": 5.6543, + "step": 9832 + }, + { + "epoch": 0.058479636502045865, + "grad_norm": 1.870448112487793, + "learning_rate": 4.9579361421542665e-05, + "loss": 5.6707, + "step": 9833 + }, + { + "epoch": 0.05848558378532686, + "grad_norm": 1.5943735837936401, + "learning_rate": 4.9579276092799435e-05, + "loss": 5.5184, + "step": 9834 + }, + { + "epoch": 0.05849153106860786, + "grad_norm": 1.6929852962493896, + "learning_rate": 4.957919075547584e-05, + "loss": 5.5188, + "step": 9835 + }, + { + "epoch": 0.05849747835188886, + "grad_norm": 2.0268075466156006, + "learning_rate": 4.95791054095719e-05, + "loss": 5.4909, + "step": 9836 + }, + { + "epoch": 0.05850342563516985, + "grad_norm": 2.047982931137085, + "learning_rate": 4.957902005508765e-05, + "loss": 5.6459, + "step": 9837 + }, + { + "epoch": 0.058509372918450854, + "grad_norm": 1.7938467264175415, + "learning_rate": 4.957893469202311e-05, + "loss": 5.4805, + "step": 9838 + }, + { + "epoch": 0.05851532020173185, + "grad_norm": 1.803093433380127, + "learning_rate": 4.957884932037833e-05, + "loss": 5.4092, + "step": 9839 + }, + { + "epoch": 0.058521267485012844, + "grad_norm": 1.8001232147216797, + "learning_rate": 4.957876394015333e-05, + "loss": 5.9168, + "step": 9840 + }, + { + "epoch": 0.058527214768293846, + "grad_norm": 1.9442622661590576, + "learning_rate": 4.9578678551348125e-05, + "loss": 6.0317, + "step": 9841 + }, + { + "epoch": 0.05853316205157484, + "grad_norm": 2.013845205307007, + "learning_rate": 4.957859315396276e-05, + "loss": 5.6855, + "step": 9842 + }, + { + "epoch": 0.058539109334855836, + "grad_norm": 2.7557523250579834, + "learning_rate": 4.9578507747997264e-05, + "loss": 5.3782, + "step": 9843 + }, + { + "epoch": 0.05854505661813684, + "grad_norm": 1.9822032451629639, + "learning_rate": 4.957842233345167e-05, + "loss": 6.22, + "step": 9844 + }, + { + "epoch": 0.058551003901417834, + "grad_norm": 1.7408699989318848, + "learning_rate": 4.9578336910326e-05, + "loss": 5.2347, + "step": 9845 + }, + { + "epoch": 0.05855695118469883, + "grad_norm": 3.2186660766601562, + "learning_rate": 4.957825147862028e-05, + "loss": 5.3282, + "step": 9846 + }, + { + "epoch": 0.058562898467979824, + "grad_norm": 3.3589892387390137, + "learning_rate": 4.957816603833455e-05, + "loss": 5.5689, + "step": 9847 + }, + { + "epoch": 0.058568845751260826, + "grad_norm": 3.4228861331939697, + "learning_rate": 4.957808058946883e-05, + "loss": 5.5797, + "step": 9848 + }, + { + "epoch": 0.05857479303454182, + "grad_norm": 2.420506238937378, + "learning_rate": 4.957799513202317e-05, + "loss": 5.735, + "step": 9849 + }, + { + "epoch": 0.058580740317822816, + "grad_norm": 1.8269212245941162, + "learning_rate": 4.957790966599758e-05, + "loss": 5.7571, + "step": 9850 + }, + { + "epoch": 0.05858668760110382, + "grad_norm": 2.011110305786133, + "learning_rate": 4.957782419139209e-05, + "loss": 5.9786, + "step": 9851 + }, + { + "epoch": 0.05859263488438481, + "grad_norm": 2.3139355182647705, + "learning_rate": 4.957773870820674e-05, + "loss": 5.8356, + "step": 9852 + }, + { + "epoch": 0.05859858216766581, + "grad_norm": 2.3406572341918945, + "learning_rate": 4.957765321644155e-05, + "loss": 5.8426, + "step": 9853 + }, + { + "epoch": 0.05860452945094681, + "grad_norm": 2.1194591522216797, + "learning_rate": 4.957756771609657e-05, + "loss": 5.6152, + "step": 9854 + }, + { + "epoch": 0.058610476734227805, + "grad_norm": 1.9966599941253662, + "learning_rate": 4.95774822071718e-05, + "loss": 5.8189, + "step": 9855 + }, + { + "epoch": 0.0586164240175088, + "grad_norm": 1.8953092098236084, + "learning_rate": 4.95773966896673e-05, + "loss": 5.8185, + "step": 9856 + }, + { + "epoch": 0.0586223713007898, + "grad_norm": 1.9035093784332275, + "learning_rate": 4.957731116358307e-05, + "loss": 5.6554, + "step": 9857 + }, + { + "epoch": 0.0586283185840708, + "grad_norm": 3.507546901702881, + "learning_rate": 4.9577225628919157e-05, + "loss": 5.8906, + "step": 9858 + }, + { + "epoch": 0.05863426586735179, + "grad_norm": 2.1840403079986572, + "learning_rate": 4.9577140085675586e-05, + "loss": 5.6084, + "step": 9859 + }, + { + "epoch": 0.05864021315063279, + "grad_norm": 2.008424758911133, + "learning_rate": 4.95770545338524e-05, + "loss": 5.8435, + "step": 9860 + }, + { + "epoch": 0.05864616043391379, + "grad_norm": 1.9004656076431274, + "learning_rate": 4.957696897344961e-05, + "loss": 5.5906, + "step": 9861 + }, + { + "epoch": 0.058652107717194785, + "grad_norm": 1.8043147325515747, + "learning_rate": 4.9576883404467255e-05, + "loss": 5.6057, + "step": 9862 + }, + { + "epoch": 0.05865805500047578, + "grad_norm": 1.6765285730361938, + "learning_rate": 4.957679782690537e-05, + "loss": 5.7246, + "step": 9863 + }, + { + "epoch": 0.05866400228375678, + "grad_norm": 2.0207018852233887, + "learning_rate": 4.9576712240763974e-05, + "loss": 5.8459, + "step": 9864 + }, + { + "epoch": 0.05866994956703778, + "grad_norm": 1.975874423980713, + "learning_rate": 4.95766266460431e-05, + "loss": 5.7313, + "step": 9865 + }, + { + "epoch": 0.05867589685031877, + "grad_norm": 2.085277557373047, + "learning_rate": 4.957654104274279e-05, + "loss": 5.1359, + "step": 9866 + }, + { + "epoch": 0.058681844133599774, + "grad_norm": 2.039437770843506, + "learning_rate": 4.957645543086305e-05, + "loss": 5.5673, + "step": 9867 + }, + { + "epoch": 0.05868779141688077, + "grad_norm": 2.0692098140716553, + "learning_rate": 4.9576369810403926e-05, + "loss": 5.6326, + "step": 9868 + }, + { + "epoch": 0.058693738700161764, + "grad_norm": 2.3873767852783203, + "learning_rate": 4.957628418136545e-05, + "loss": 5.5133, + "step": 9869 + }, + { + "epoch": 0.058699685983442766, + "grad_norm": 2.9347658157348633, + "learning_rate": 4.957619854374764e-05, + "loss": 5.5444, + "step": 9870 + }, + { + "epoch": 0.05870563326672376, + "grad_norm": 2.955348014831543, + "learning_rate": 4.957611289755054e-05, + "loss": 5.4883, + "step": 9871 + }, + { + "epoch": 0.058711580550004756, + "grad_norm": 2.147033214569092, + "learning_rate": 4.957602724277417e-05, + "loss": 5.4554, + "step": 9872 + }, + { + "epoch": 0.05871752783328576, + "grad_norm": 2.1422510147094727, + "learning_rate": 4.957594157941856e-05, + "loss": 5.56, + "step": 9873 + }, + { + "epoch": 0.05872347511656675, + "grad_norm": 2.018935203552246, + "learning_rate": 4.957585590748375e-05, + "loss": 5.5176, + "step": 9874 + }, + { + "epoch": 0.05872942239984775, + "grad_norm": 3.0146446228027344, + "learning_rate": 4.957577022696976e-05, + "loss": 5.2623, + "step": 9875 + }, + { + "epoch": 0.058735369683128744, + "grad_norm": 2.923011064529419, + "learning_rate": 4.957568453787662e-05, + "loss": 5.1828, + "step": 9876 + }, + { + "epoch": 0.058741316966409746, + "grad_norm": 2.7203526496887207, + "learning_rate": 4.9575598840204366e-05, + "loss": 5.1565, + "step": 9877 + }, + { + "epoch": 0.05874726424969074, + "grad_norm": 2.056260108947754, + "learning_rate": 4.9575513133953025e-05, + "loss": 5.1345, + "step": 9878 + }, + { + "epoch": 0.058753211532971736, + "grad_norm": 2.3120932579040527, + "learning_rate": 4.9575427419122616e-05, + "loss": 5.1792, + "step": 9879 + }, + { + "epoch": 0.05875915881625274, + "grad_norm": 2.1298701763153076, + "learning_rate": 4.9575341695713186e-05, + "loss": 5.1447, + "step": 9880 + }, + { + "epoch": 0.05876510609953373, + "grad_norm": 2.393869638442993, + "learning_rate": 4.9575255963724756e-05, + "loss": 5.2938, + "step": 9881 + }, + { + "epoch": 0.05877105338281473, + "grad_norm": 2.324061155319214, + "learning_rate": 4.9575170223157366e-05, + "loss": 5.1488, + "step": 9882 + }, + { + "epoch": 0.05877700066609573, + "grad_norm": 2.1416141986846924, + "learning_rate": 4.957508447401103e-05, + "loss": 5.0551, + "step": 9883 + }, + { + "epoch": 0.058782947949376725, + "grad_norm": 2.127350091934204, + "learning_rate": 4.9574998716285795e-05, + "loss": 5.03, + "step": 9884 + }, + { + "epoch": 0.05878889523265772, + "grad_norm": 2.317267417907715, + "learning_rate": 4.957491294998167e-05, + "loss": 5.049, + "step": 9885 + }, + { + "epoch": 0.05879484251593872, + "grad_norm": 2.3667004108428955, + "learning_rate": 4.9574827175098704e-05, + "loss": 5.009, + "step": 9886 + }, + { + "epoch": 0.05880078979921972, + "grad_norm": 2.4034934043884277, + "learning_rate": 4.9574741391636915e-05, + "loss": 4.9419, + "step": 9887 + }, + { + "epoch": 0.05880673708250071, + "grad_norm": 2.3792901039123535, + "learning_rate": 4.957465559959634e-05, + "loss": 4.8517, + "step": 9888 + }, + { + "epoch": 0.05881268436578171, + "grad_norm": 2.139249086380005, + "learning_rate": 4.957456979897701e-05, + "loss": 5.0767, + "step": 9889 + }, + { + "epoch": 0.05881863164906271, + "grad_norm": 2.5370614528656006, + "learning_rate": 4.957448398977894e-05, + "loss": 5.0243, + "step": 9890 + }, + { + "epoch": 0.058824578932343705, + "grad_norm": 2.0474746227264404, + "learning_rate": 4.957439817200218e-05, + "loss": 4.988, + "step": 9891 + }, + { + "epoch": 0.0588305262156247, + "grad_norm": 2.1323394775390625, + "learning_rate": 4.957431234564675e-05, + "loss": 5.7499, + "step": 9892 + }, + { + "epoch": 0.0588364734989057, + "grad_norm": 2.135988473892212, + "learning_rate": 4.957422651071269e-05, + "loss": 6.0197, + "step": 9893 + }, + { + "epoch": 0.0588424207821867, + "grad_norm": 2.4457356929779053, + "learning_rate": 4.957414066720001e-05, + "loss": 5.4461, + "step": 9894 + }, + { + "epoch": 0.05884836806546769, + "grad_norm": 2.3973019123077393, + "learning_rate": 4.957405481510876e-05, + "loss": 5.0372, + "step": 9895 + }, + { + "epoch": 0.058854315348748694, + "grad_norm": 2.5532052516937256, + "learning_rate": 4.957396895443896e-05, + "loss": 5.1462, + "step": 9896 + }, + { + "epoch": 0.05886026263202969, + "grad_norm": 2.3662166595458984, + "learning_rate": 4.9573883085190633e-05, + "loss": 5.1894, + "step": 9897 + }, + { + "epoch": 0.058866209915310684, + "grad_norm": 2.153883695602417, + "learning_rate": 4.9573797207363825e-05, + "loss": 5.6859, + "step": 9898 + }, + { + "epoch": 0.058872157198591686, + "grad_norm": 1.9541380405426025, + "learning_rate": 4.957371132095856e-05, + "loss": 5.5487, + "step": 9899 + }, + { + "epoch": 0.05887810448187268, + "grad_norm": 1.7920335531234741, + "learning_rate": 4.957362542597486e-05, + "loss": 5.4021, + "step": 9900 + }, + { + "epoch": 0.058884051765153676, + "grad_norm": 2.351090431213379, + "learning_rate": 4.9573539522412756e-05, + "loss": 4.9377, + "step": 9901 + }, + { + "epoch": 0.05888999904843468, + "grad_norm": 2.4780900478363037, + "learning_rate": 4.95734536102723e-05, + "loss": 5.04, + "step": 9902 + }, + { + "epoch": 0.05889594633171567, + "grad_norm": 1.7211192846298218, + "learning_rate": 4.957336768955349e-05, + "loss": 5.2959, + "step": 9903 + }, + { + "epoch": 0.05890189361499667, + "grad_norm": 1.9051212072372437, + "learning_rate": 4.957328176025638e-05, + "loss": 5.5587, + "step": 9904 + }, + { + "epoch": 0.058907840898277664, + "grad_norm": 2.009725332260132, + "learning_rate": 4.957319582238099e-05, + "loss": 5.5366, + "step": 9905 + }, + { + "epoch": 0.058913788181558666, + "grad_norm": 1.835423231124878, + "learning_rate": 4.957310987592735e-05, + "loss": 5.2522, + "step": 9906 + }, + { + "epoch": 0.05891973546483966, + "grad_norm": 1.6150819063186646, + "learning_rate": 4.957302392089549e-05, + "loss": 5.3935, + "step": 9907 + }, + { + "epoch": 0.058925682748120656, + "grad_norm": 1.825942873954773, + "learning_rate": 4.9572937957285435e-05, + "loss": 5.5435, + "step": 9908 + }, + { + "epoch": 0.05893163003140166, + "grad_norm": 1.5434985160827637, + "learning_rate": 4.957285198509724e-05, + "loss": 5.2508, + "step": 9909 + }, + { + "epoch": 0.05893757731468265, + "grad_norm": 1.7675530910491943, + "learning_rate": 4.9572766004330894e-05, + "loss": 5.2811, + "step": 9910 + }, + { + "epoch": 0.05894352459796365, + "grad_norm": 1.5196996927261353, + "learning_rate": 4.957268001498646e-05, + "loss": 5.1829, + "step": 9911 + }, + { + "epoch": 0.05894947188124465, + "grad_norm": 1.5598126649856567, + "learning_rate": 4.9572594017063964e-05, + "loss": 5.2067, + "step": 9912 + }, + { + "epoch": 0.058955419164525645, + "grad_norm": 1.6600217819213867, + "learning_rate": 4.957250801056342e-05, + "loss": 5.1591, + "step": 9913 + }, + { + "epoch": 0.05896136644780664, + "grad_norm": 2.040682315826416, + "learning_rate": 4.957242199548487e-05, + "loss": 4.8792, + "step": 9914 + }, + { + "epoch": 0.05896731373108764, + "grad_norm": 2.0122241973876953, + "learning_rate": 4.9572335971828346e-05, + "loss": 5.9489, + "step": 9915 + }, + { + "epoch": 0.05897326101436864, + "grad_norm": 2.4522452354431152, + "learning_rate": 4.957224993959386e-05, + "loss": 5.943, + "step": 9916 + }, + { + "epoch": 0.05897920829764963, + "grad_norm": 1.9101065397262573, + "learning_rate": 4.957216389878147e-05, + "loss": 5.858, + "step": 9917 + }, + { + "epoch": 0.05898515558093063, + "grad_norm": 1.6488839387893677, + "learning_rate": 4.957207784939118e-05, + "loss": 5.4935, + "step": 9918 + }, + { + "epoch": 0.05899110286421163, + "grad_norm": 1.7620775699615479, + "learning_rate": 4.957199179142303e-05, + "loss": 5.6067, + "step": 9919 + }, + { + "epoch": 0.058997050147492625, + "grad_norm": 2.6018314361572266, + "learning_rate": 4.957190572487707e-05, + "loss": 5.5249, + "step": 9920 + }, + { + "epoch": 0.05900299743077362, + "grad_norm": 1.810274600982666, + "learning_rate": 4.957181964975329e-05, + "loss": 5.4063, + "step": 9921 + }, + { + "epoch": 0.05900894471405462, + "grad_norm": 1.7467454671859741, + "learning_rate": 4.957173356605176e-05, + "loss": 5.4476, + "step": 9922 + }, + { + "epoch": 0.05901489199733562, + "grad_norm": 1.9074509143829346, + "learning_rate": 4.9571647473772483e-05, + "loss": 5.8014, + "step": 9923 + }, + { + "epoch": 0.05902083928061661, + "grad_norm": 1.6376137733459473, + "learning_rate": 4.9571561372915496e-05, + "loss": 5.6813, + "step": 9924 + }, + { + "epoch": 0.059026786563897614, + "grad_norm": 1.9984129667282104, + "learning_rate": 4.957147526348083e-05, + "loss": 5.9534, + "step": 9925 + }, + { + "epoch": 0.05903273384717861, + "grad_norm": 2.38493013381958, + "learning_rate": 4.957138914546852e-05, + "loss": 5.6903, + "step": 9926 + }, + { + "epoch": 0.059038681130459604, + "grad_norm": 1.86250901222229, + "learning_rate": 4.957130301887859e-05, + "loss": 5.1777, + "step": 9927 + }, + { + "epoch": 0.059044628413740606, + "grad_norm": 1.6241644620895386, + "learning_rate": 4.957121688371107e-05, + "loss": 5.1693, + "step": 9928 + }, + { + "epoch": 0.0590505756970216, + "grad_norm": 1.5627753734588623, + "learning_rate": 4.9571130739965996e-05, + "loss": 5.0313, + "step": 9929 + }, + { + "epoch": 0.059056522980302596, + "grad_norm": 1.6763062477111816, + "learning_rate": 4.957104458764339e-05, + "loss": 4.9973, + "step": 9930 + }, + { + "epoch": 0.0590624702635836, + "grad_norm": 1.6215085983276367, + "learning_rate": 4.957095842674329e-05, + "loss": 5.2216, + "step": 9931 + }, + { + "epoch": 0.05906841754686459, + "grad_norm": 1.5599844455718994, + "learning_rate": 4.957087225726572e-05, + "loss": 5.4525, + "step": 9932 + }, + { + "epoch": 0.05907436483014559, + "grad_norm": 1.3916441202163696, + "learning_rate": 4.957078607921072e-05, + "loss": 5.4434, + "step": 9933 + }, + { + "epoch": 0.059080312113426584, + "grad_norm": 1.524478554725647, + "learning_rate": 4.9570699892578295e-05, + "loss": 5.3979, + "step": 9934 + }, + { + "epoch": 0.059086259396707586, + "grad_norm": 1.264108657836914, + "learning_rate": 4.9570613697368505e-05, + "loss": 5.2892, + "step": 9935 + }, + { + "epoch": 0.05909220667998858, + "grad_norm": 1.7481588125228882, + "learning_rate": 4.957052749358137e-05, + "loss": 4.8539, + "step": 9936 + }, + { + "epoch": 0.059098153963269576, + "grad_norm": 1.675515055656433, + "learning_rate": 4.957044128121692e-05, + "loss": 5.4645, + "step": 9937 + }, + { + "epoch": 0.05910410124655058, + "grad_norm": 1.6560577154159546, + "learning_rate": 4.957035506027517e-05, + "loss": 4.9354, + "step": 9938 + }, + { + "epoch": 0.05911004852983157, + "grad_norm": 1.5030722618103027, + "learning_rate": 4.9570268830756174e-05, + "loss": 5.206, + "step": 9939 + }, + { + "epoch": 0.05911599581311257, + "grad_norm": 1.65435791015625, + "learning_rate": 4.957018259265994e-05, + "loss": 5.2132, + "step": 9940 + }, + { + "epoch": 0.05912194309639357, + "grad_norm": 1.6701000928878784, + "learning_rate": 4.9570096345986515e-05, + "loss": 5.2313, + "step": 9941 + }, + { + "epoch": 0.059127890379674565, + "grad_norm": 1.412954330444336, + "learning_rate": 4.957001009073593e-05, + "loss": 5.2511, + "step": 9942 + }, + { + "epoch": 0.05913383766295556, + "grad_norm": 1.4719784259796143, + "learning_rate": 4.95699238269082e-05, + "loss": 5.3646, + "step": 9943 + }, + { + "epoch": 0.05913978494623656, + "grad_norm": 1.6969150304794312, + "learning_rate": 4.9569837554503365e-05, + "loss": 5.3001, + "step": 9944 + }, + { + "epoch": 0.05914573222951756, + "grad_norm": 1.8579715490341187, + "learning_rate": 4.9569751273521454e-05, + "loss": 5.0944, + "step": 9945 + }, + { + "epoch": 0.05915167951279855, + "grad_norm": 1.6907633543014526, + "learning_rate": 4.956966498396249e-05, + "loss": 5.1447, + "step": 9946 + }, + { + "epoch": 0.059157626796079554, + "grad_norm": 1.7581912279129028, + "learning_rate": 4.9569578685826525e-05, + "loss": 5.2065, + "step": 9947 + }, + { + "epoch": 0.05916357407936055, + "grad_norm": 1.4447051286697388, + "learning_rate": 4.9569492379113555e-05, + "loss": 5.081, + "step": 9948 + }, + { + "epoch": 0.059169521362641544, + "grad_norm": 1.731697916984558, + "learning_rate": 4.9569406063823644e-05, + "loss": 5.241, + "step": 9949 + }, + { + "epoch": 0.05917546864592254, + "grad_norm": 1.6483672857284546, + "learning_rate": 4.956931973995681e-05, + "loss": 5.306, + "step": 9950 + }, + { + "epoch": 0.05918141592920354, + "grad_norm": 2.2123141288757324, + "learning_rate": 4.956923340751306e-05, + "loss": 5.6134, + "step": 9951 + }, + { + "epoch": 0.05918736321248454, + "grad_norm": 1.8569937944412231, + "learning_rate": 4.956914706649246e-05, + "loss": 5.4819, + "step": 9952 + }, + { + "epoch": 0.05919331049576553, + "grad_norm": 1.8417435884475708, + "learning_rate": 4.956906071689502e-05, + "loss": 5.4116, + "step": 9953 + }, + { + "epoch": 0.059199257779046534, + "grad_norm": 1.7050427198410034, + "learning_rate": 4.956897435872078e-05, + "loss": 5.238, + "step": 9954 + }, + { + "epoch": 0.05920520506232753, + "grad_norm": 1.6636401414871216, + "learning_rate": 4.956888799196976e-05, + "loss": 5.0962, + "step": 9955 + }, + { + "epoch": 0.059211152345608524, + "grad_norm": 1.9194599390029907, + "learning_rate": 4.9568801616642e-05, + "loss": 5.2078, + "step": 9956 + }, + { + "epoch": 0.059217099628889526, + "grad_norm": 1.6154237985610962, + "learning_rate": 4.956871523273752e-05, + "loss": 5.3562, + "step": 9957 + }, + { + "epoch": 0.05922304691217052, + "grad_norm": 1.4500404596328735, + "learning_rate": 4.956862884025636e-05, + "loss": 5.2061, + "step": 9958 + }, + { + "epoch": 0.059228994195451516, + "grad_norm": 1.6681636571884155, + "learning_rate": 4.956854243919854e-05, + "loss": 5.3455, + "step": 9959 + }, + { + "epoch": 0.05923494147873252, + "grad_norm": 1.7175511121749878, + "learning_rate": 4.9568456029564104e-05, + "loss": 5.2967, + "step": 9960 + }, + { + "epoch": 0.05924088876201351, + "grad_norm": 1.5013905763626099, + "learning_rate": 4.956836961135306e-05, + "loss": 4.9836, + "step": 9961 + }, + { + "epoch": 0.05924683604529451, + "grad_norm": 1.6521363258361816, + "learning_rate": 4.956828318456546e-05, + "loss": 5.0295, + "step": 9962 + }, + { + "epoch": 0.0592527833285755, + "grad_norm": 1.5945814847946167, + "learning_rate": 4.9568196749201326e-05, + "loss": 4.9511, + "step": 9963 + }, + { + "epoch": 0.059258730611856505, + "grad_norm": 1.508301854133606, + "learning_rate": 4.95681103052607e-05, + "loss": 4.9469, + "step": 9964 + }, + { + "epoch": 0.0592646778951375, + "grad_norm": 1.5902310609817505, + "learning_rate": 4.956802385274358e-05, + "loss": 4.9761, + "step": 9965 + }, + { + "epoch": 0.059270625178418496, + "grad_norm": 1.739424467086792, + "learning_rate": 4.956793739165003e-05, + "loss": 5.2443, + "step": 9966 + }, + { + "epoch": 0.0592765724616995, + "grad_norm": 1.8317997455596924, + "learning_rate": 4.9567850921980056e-05, + "loss": 5.0046, + "step": 9967 + }, + { + "epoch": 0.05928251974498049, + "grad_norm": 1.8073506355285645, + "learning_rate": 4.956776444373371e-05, + "loss": 5.1779, + "step": 9968 + }, + { + "epoch": 0.05928846702826149, + "grad_norm": 1.8806017637252808, + "learning_rate": 4.956767795691101e-05, + "loss": 5.2956, + "step": 9969 + }, + { + "epoch": 0.05929441431154249, + "grad_norm": 1.8397493362426758, + "learning_rate": 4.956759146151198e-05, + "loss": 5.1775, + "step": 9970 + }, + { + "epoch": 0.059300361594823485, + "grad_norm": 2.001387119293213, + "learning_rate": 4.9567504957536656e-05, + "loss": 5.2149, + "step": 9971 + }, + { + "epoch": 0.05930630887810448, + "grad_norm": 2.011504650115967, + "learning_rate": 4.956741844498508e-05, + "loss": 5.2384, + "step": 9972 + }, + { + "epoch": 0.05931225616138548, + "grad_norm": 1.7936465740203857, + "learning_rate": 4.956733192385727e-05, + "loss": 5.2297, + "step": 9973 + }, + { + "epoch": 0.05931820344466648, + "grad_norm": 1.7336666584014893, + "learning_rate": 4.9567245394153255e-05, + "loss": 5.1637, + "step": 9974 + }, + { + "epoch": 0.05932415072794747, + "grad_norm": 1.7429137229919434, + "learning_rate": 4.956715885587307e-05, + "loss": 5.1315, + "step": 9975 + }, + { + "epoch": 0.059330098011228474, + "grad_norm": 1.6609208583831787, + "learning_rate": 4.956707230901674e-05, + "loss": 5.1554, + "step": 9976 + }, + { + "epoch": 0.05933604529450947, + "grad_norm": 1.630026936531067, + "learning_rate": 4.95669857535843e-05, + "loss": 5.1569, + "step": 9977 + }, + { + "epoch": 0.059341992577790464, + "grad_norm": 1.6968966722488403, + "learning_rate": 4.956689918957579e-05, + "loss": 5.06, + "step": 9978 + }, + { + "epoch": 0.05934793986107146, + "grad_norm": 1.6973050832748413, + "learning_rate": 4.9566812616991214e-05, + "loss": 5.2044, + "step": 9979 + }, + { + "epoch": 0.05935388714435246, + "grad_norm": 1.436073899269104, + "learning_rate": 4.9566726035830624e-05, + "loss": 5.2638, + "step": 9980 + }, + { + "epoch": 0.05935983442763346, + "grad_norm": 1.7667059898376465, + "learning_rate": 4.956663944609404e-05, + "loss": 5.0912, + "step": 9981 + }, + { + "epoch": 0.05936578171091445, + "grad_norm": 2.277327060699463, + "learning_rate": 4.9566552847781504e-05, + "loss": 5.6089, + "step": 9982 + }, + { + "epoch": 0.059371728994195454, + "grad_norm": 1.521134376525879, + "learning_rate": 4.956646624089304e-05, + "loss": 5.0213, + "step": 9983 + }, + { + "epoch": 0.05937767627747645, + "grad_norm": 1.556511402130127, + "learning_rate": 4.956637962542867e-05, + "loss": 5.1126, + "step": 9984 + }, + { + "epoch": 0.059383623560757444, + "grad_norm": 1.6691070795059204, + "learning_rate": 4.9566293001388423e-05, + "loss": 5.1351, + "step": 9985 + }, + { + "epoch": 0.059389570844038446, + "grad_norm": 1.5213310718536377, + "learning_rate": 4.956620636877235e-05, + "loss": 5.2402, + "step": 9986 + }, + { + "epoch": 0.05939551812731944, + "grad_norm": 1.5169057846069336, + "learning_rate": 4.956611972758046e-05, + "loss": 5.214, + "step": 9987 + }, + { + "epoch": 0.059401465410600436, + "grad_norm": 1.6076115369796753, + "learning_rate": 4.956603307781279e-05, + "loss": 5.1081, + "step": 9988 + }, + { + "epoch": 0.05940741269388144, + "grad_norm": 1.7340706586837769, + "learning_rate": 4.9565946419469376e-05, + "loss": 5.1582, + "step": 9989 + }, + { + "epoch": 0.05941335997716243, + "grad_norm": 1.5118008852005005, + "learning_rate": 4.956585975255025e-05, + "loss": 5.0515, + "step": 9990 + }, + { + "epoch": 0.05941930726044343, + "grad_norm": 1.8852020502090454, + "learning_rate": 4.956577307705543e-05, + "loss": 5.3811, + "step": 9991 + }, + { + "epoch": 0.05942525454372442, + "grad_norm": 1.7066764831542969, + "learning_rate": 4.9565686392984955e-05, + "loss": 5.4599, + "step": 9992 + }, + { + "epoch": 0.059431201827005425, + "grad_norm": 1.5517010688781738, + "learning_rate": 4.956559970033885e-05, + "loss": 5.0728, + "step": 9993 + }, + { + "epoch": 0.05943714911028642, + "grad_norm": 1.508901596069336, + "learning_rate": 4.956551299911715e-05, + "loss": 5.1857, + "step": 9994 + }, + { + "epoch": 0.059443096393567416, + "grad_norm": 1.8867852687835693, + "learning_rate": 4.9565426289319874e-05, + "loss": 5.2223, + "step": 9995 + }, + { + "epoch": 0.05944904367684842, + "grad_norm": 1.4767159223556519, + "learning_rate": 4.9565339570947076e-05, + "loss": 5.1404, + "step": 9996 + }, + { + "epoch": 0.05945499096012941, + "grad_norm": 1.6351869106292725, + "learning_rate": 4.956525284399876e-05, + "loss": 5.3235, + "step": 9997 + }, + { + "epoch": 0.05946093824341041, + "grad_norm": 1.543565273284912, + "learning_rate": 4.956516610847497e-05, + "loss": 5.3365, + "step": 9998 + }, + { + "epoch": 0.05946688552669141, + "grad_norm": 1.4907768964767456, + "learning_rate": 4.9565079364375746e-05, + "loss": 5.4215, + "step": 9999 + }, + { + "epoch": 0.059472832809972405, + "grad_norm": 1.5810034275054932, + "learning_rate": 4.956499261170109e-05, + "loss": 5.3899, + "step": 10000 + } + ], + "logging_steps": 1, + "max_steps": 168144, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 4.187663442260787e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-100000/config.json b/checkpoint-100000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-100000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-100000/generation_config.json b/checkpoint-100000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-100000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-100000/model.safetensors.index.json b/checkpoint-100000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-100000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-100000/rng_state_0.pth b/checkpoint-100000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-100000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-100000/rng_state_1.pth b/checkpoint-100000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-100000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-100000/rng_state_2.pth b/checkpoint-100000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-100000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-100000/rng_state_4.pth b/checkpoint-100000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-100000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-100000/rng_state_5.pth b/checkpoint-100000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-100000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-100000/rng_state_7.pth b/checkpoint-100000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-100000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-100000/scheduler.pt b/checkpoint-100000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..1051c44882ed9c79226b461dd254ac2c053dfe71 --- /dev/null +++ b/checkpoint-100000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37e82e505e64780230a537a8bb6263160979833dfdd633f4e134b05446fdfc1f +size 1064 diff --git a/checkpoint-110000/config.json b/checkpoint-110000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-110000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-110000/generation_config.json b/checkpoint-110000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-110000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-110000/model.safetensors.index.json b/checkpoint-110000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-110000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-110000/rng_state_0.pth b/checkpoint-110000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-110000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-110000/rng_state_1.pth b/checkpoint-110000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-110000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-110000/rng_state_2.pth b/checkpoint-110000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-110000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-110000/rng_state_3.pth b/checkpoint-110000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-110000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-110000/rng_state_4.pth b/checkpoint-110000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-110000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-110000/rng_state_5.pth b/checkpoint-110000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-110000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-110000/rng_state_6.pth b/checkpoint-110000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-110000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-110000/scheduler.pt b/checkpoint-110000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..53811fb2cdd2ff4a32bede48e1c8db3c5ba6ef7b --- /dev/null +++ b/checkpoint-110000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d835cc1984eb23fcc1b97603ff98d453c83e84bf60ba41d856196a295df151f +size 1064 diff --git a/checkpoint-120000/config.json b/checkpoint-120000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-120000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-120000/generation_config.json b/checkpoint-120000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-120000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-120000/model.safetensors.index.json b/checkpoint-120000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-120000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-120000/rng_state_2.pth b/checkpoint-120000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-120000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-120000/rng_state_3.pth b/checkpoint-120000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-120000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-120000/rng_state_4.pth b/checkpoint-120000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-120000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-120000/rng_state_5.pth b/checkpoint-120000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-120000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-130000/config.json b/checkpoint-130000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-130000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-130000/generation_config.json b/checkpoint-130000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-130000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-130000/model.safetensors.index.json b/checkpoint-130000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-130000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-130000/rng_state_0.pth b/checkpoint-130000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-130000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-130000/rng_state_2.pth b/checkpoint-130000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-130000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-130000/rng_state_4.pth b/checkpoint-130000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-130000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-130000/rng_state_6.pth b/checkpoint-130000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-130000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-130000/scheduler.pt b/checkpoint-130000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..b9bd8f4e7801101eb62ebfd4f78f7a4d28f0cadd --- /dev/null +++ b/checkpoint-130000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:740724d3e98f52177e21aeade072557d2b41d0d314e877fe394d8630b71c4dec +size 1064 diff --git a/checkpoint-140000/config.json b/checkpoint-140000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-140000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-140000/generation_config.json b/checkpoint-140000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-140000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-140000/model.safetensors.index.json b/checkpoint-140000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-140000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-140000/rng_state_0.pth b/checkpoint-140000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-140000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-140000/rng_state_1.pth b/checkpoint-140000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-140000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-140000/rng_state_3.pth b/checkpoint-140000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-140000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-140000/rng_state_5.pth b/checkpoint-140000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-140000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-140000/rng_state_6.pth b/checkpoint-140000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-140000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-150000/config.json b/checkpoint-150000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-150000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-150000/generation_config.json b/checkpoint-150000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-150000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-150000/model.safetensors.index.json b/checkpoint-150000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-150000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-150000/rng_state_0.pth b/checkpoint-150000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-150000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-150000/rng_state_2.pth b/checkpoint-150000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-150000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-150000/rng_state_5.pth b/checkpoint-150000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-150000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-150000/rng_state_6.pth b/checkpoint-150000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-150000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-150000/rng_state_7.pth b/checkpoint-150000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-150000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-160000/config.json b/checkpoint-160000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-160000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-160000/generation_config.json b/checkpoint-160000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-160000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-160000/model.safetensors.index.json b/checkpoint-160000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-160000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-160000/rng_state_6.pth b/checkpoint-160000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-160000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-168144/config.json b/checkpoint-168144/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-168144/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-168144/generation_config.json b/checkpoint-168144/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-168144/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-168144/model.safetensors.index.json b/checkpoint-168144/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-168144/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-168144/rng_state_0.pth b/checkpoint-168144/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-168144/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-20000/config.json b/checkpoint-20000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-20000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-20000/generation_config.json b/checkpoint-20000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-20000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-20000/model.safetensors.index.json b/checkpoint-20000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-20000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-20000/rng_state_2.pth b/checkpoint-20000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-20000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-20000/rng_state_4.pth b/checkpoint-20000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-20000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-20000/rng_state_5.pth b/checkpoint-20000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-20000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-20000/rng_state_6.pth b/checkpoint-20000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-20000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-20000/rng_state_7.pth b/checkpoint-20000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-20000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-20000/trainer_state.json b/checkpoint-20000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..b0cf85452a9c1e65fd95b170f307af4290820773 --- /dev/null +++ b/checkpoint-20000/trainer_state.json @@ -0,0 +1,140034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.11894566561994481, + "eval_steps": 500, + "global_step": 20000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.947283280997241e-06, + "grad_norm": 179.1047821044922, + "learning_rate": 5e-05, + "loss": 14.5158, + "step": 1 + }, + { + "epoch": 1.1894566561994482e-05, + "grad_norm": 40.39401626586914, + "learning_rate": 4.999999999563638e-05, + "loss": 14.152, + "step": 2 + }, + { + "epoch": 1.7841849842991722e-05, + "grad_norm": 137.05079650878906, + "learning_rate": 4.999999998254552e-05, + "loss": 14.6334, + "step": 3 + }, + { + "epoch": 2.3789133123988963e-05, + "grad_norm": 23.315088272094727, + "learning_rate": 4.9999999960727415e-05, + "loss": 12.6458, + "step": 4 + }, + { + "epoch": 2.97364164049862e-05, + "grad_norm": 7.943603992462158, + "learning_rate": 4.9999999930182065e-05, + "loss": 11.8435, + "step": 5 + }, + { + "epoch": 3.5683699685983445e-05, + "grad_norm": 6.374181270599365, + "learning_rate": 4.999999989090948e-05, + "loss": 11.4544, + "step": 6 + }, + { + "epoch": 4.1630982966980686e-05, + "grad_norm": 8.948652267456055, + "learning_rate": 4.999999984290965e-05, + "loss": 11.3516, + "step": 7 + }, + { + "epoch": 4.7578266247977927e-05, + "grad_norm": 3.2318713665008545, + "learning_rate": 4.999999978618258e-05, + "loss": 11.1021, + "step": 8 + }, + { + "epoch": 5.352554952897517e-05, + "grad_norm": 5.6542534828186035, + "learning_rate": 4.9999999720728266e-05, + "loss": 11.0132, + "step": 9 + }, + { + "epoch": 5.94728328099724e-05, + "grad_norm": 3.623577356338501, + "learning_rate": 4.999999964654671e-05, + "loss": 10.8896, + "step": 10 + }, + { + "epoch": 6.542011609096965e-05, + "grad_norm": 3.3209445476531982, + "learning_rate": 4.9999999563637915e-05, + "loss": 10.7339, + "step": 11 + }, + { + "epoch": 7.136739937196689e-05, + "grad_norm": 3.4527082443237305, + "learning_rate": 4.999999947200188e-05, + "loss": 10.5472, + "step": 12 + }, + { + "epoch": 7.731468265296413e-05, + "grad_norm": 3.784444570541382, + "learning_rate": 4.99999993716386e-05, + "loss": 10.4353, + "step": 13 + }, + { + "epoch": 8.326196593396137e-05, + "grad_norm": 4.304569244384766, + "learning_rate": 4.999999926254808e-05, + "loss": 10.4652, + "step": 14 + }, + { + "epoch": 8.920924921495861e-05, + "grad_norm": 3.5867838859558105, + "learning_rate": 4.999999914473032e-05, + "loss": 10.5746, + "step": 15 + }, + { + "epoch": 9.515653249595585e-05, + "grad_norm": 6.1308207511901855, + "learning_rate": 4.9999999018185316e-05, + "loss": 10.4129, + "step": 16 + }, + { + "epoch": 0.0001011038157769531, + "grad_norm": 3.4687230587005615, + "learning_rate": 4.999999888291307e-05, + "loss": 10.2246, + "step": 17 + }, + { + "epoch": 0.00010705109905795033, + "grad_norm": 4.041895866394043, + "learning_rate": 4.9999998738913586e-05, + "loss": 10.0852, + "step": 18 + }, + { + "epoch": 0.00011299838233894758, + "grad_norm": 4.437602519989014, + "learning_rate": 4.999999858618686e-05, + "loss": 9.8841, + "step": 19 + }, + { + "epoch": 0.0001189456656199448, + "grad_norm": 3.9608142375946045, + "learning_rate": 4.9999998424732884e-05, + "loss": 10.0537, + "step": 20 + }, + { + "epoch": 0.00012489294890094204, + "grad_norm": 3.799363613128662, + "learning_rate": 4.999999825455168e-05, + "loss": 9.8487, + "step": 21 + }, + { + "epoch": 0.0001308402321819393, + "grad_norm": 3.626058340072632, + "learning_rate": 4.999999807564323e-05, + "loss": 9.8048, + "step": 22 + }, + { + "epoch": 0.00013678751546293653, + "grad_norm": 4.21406364440918, + "learning_rate": 4.999999788800754e-05, + "loss": 9.6091, + "step": 23 + }, + { + "epoch": 0.00014273479874393378, + "grad_norm": 5.26548957824707, + "learning_rate": 4.9999997691644605e-05, + "loss": 9.3935, + "step": 24 + }, + { + "epoch": 0.000148682082024931, + "grad_norm": 6.5113396644592285, + "learning_rate": 4.999999748655443e-05, + "loss": 9.2602, + "step": 25 + }, + { + "epoch": 0.00015462936530592826, + "grad_norm": 4.6141133308410645, + "learning_rate": 4.9999997272737014e-05, + "loss": 9.1492, + "step": 26 + }, + { + "epoch": 0.0001605766485869255, + "grad_norm": 4.645262241363525, + "learning_rate": 4.999999705019236e-05, + "loss": 9.2238, + "step": 27 + }, + { + "epoch": 0.00016652393186792274, + "grad_norm": 4.599213123321533, + "learning_rate": 4.9999996818920464e-05, + "loss": 9.1673, + "step": 28 + }, + { + "epoch": 0.00017247121514891997, + "grad_norm": 4.820634365081787, + "learning_rate": 4.999999657892133e-05, + "loss": 9.0044, + "step": 29 + }, + { + "epoch": 0.00017841849842991722, + "grad_norm": 4.57854700088501, + "learning_rate": 4.9999996330194956e-05, + "loss": 8.8746, + "step": 30 + }, + { + "epoch": 0.00018436578171091445, + "grad_norm": 4.567880153656006, + "learning_rate": 4.999999607274133e-05, + "loss": 8.7224, + "step": 31 + }, + { + "epoch": 0.0001903130649919117, + "grad_norm": 4.545701503753662, + "learning_rate": 4.9999995806560475e-05, + "loss": 8.6979, + "step": 32 + }, + { + "epoch": 0.00019626034827290893, + "grad_norm": 4.098274230957031, + "learning_rate": 4.9999995531652374e-05, + "loss": 8.5787, + "step": 33 + }, + { + "epoch": 0.0002022076315539062, + "grad_norm": 4.341195106506348, + "learning_rate": 4.999999524801704e-05, + "loss": 8.4452, + "step": 34 + }, + { + "epoch": 0.00020815491483490341, + "grad_norm": 4.651747703552246, + "learning_rate": 4.999999495565446e-05, + "loss": 8.4383, + "step": 35 + }, + { + "epoch": 0.00021410219811590067, + "grad_norm": 4.187220573425293, + "learning_rate": 4.999999465456464e-05, + "loss": 8.2441, + "step": 36 + }, + { + "epoch": 0.0002200494813968979, + "grad_norm": 4.094058990478516, + "learning_rate": 4.999999434474758e-05, + "loss": 8.2784, + "step": 37 + }, + { + "epoch": 0.00022599676467789515, + "grad_norm": 4.6094794273376465, + "learning_rate": 4.999999402620329e-05, + "loss": 8.3893, + "step": 38 + }, + { + "epoch": 0.00023194404795889238, + "grad_norm": 5.391327381134033, + "learning_rate": 4.999999369893175e-05, + "loss": 8.6491, + "step": 39 + }, + { + "epoch": 0.0002378913312398896, + "grad_norm": 5.03748893737793, + "learning_rate": 4.9999993362932974e-05, + "loss": 8.5279, + "step": 40 + }, + { + "epoch": 0.00024383861452088686, + "grad_norm": 5.306002616882324, + "learning_rate": 4.9999993018206956e-05, + "loss": 9.9965, + "step": 41 + }, + { + "epoch": 0.0002497858978018841, + "grad_norm": 5.5374274253845215, + "learning_rate": 4.99999926647537e-05, + "loss": 10.5594, + "step": 42 + }, + { + "epoch": 0.00025573318108288134, + "grad_norm": 3.8107693195343018, + "learning_rate": 4.999999230257321e-05, + "loss": 10.5451, + "step": 43 + }, + { + "epoch": 0.0002616804643638786, + "grad_norm": 3.922286033630371, + "learning_rate": 4.999999193166547e-05, + "loss": 10.4123, + "step": 44 + }, + { + "epoch": 0.0002676277476448758, + "grad_norm": 3.2090535163879395, + "learning_rate": 4.99999915520305e-05, + "loss": 10.0646, + "step": 45 + }, + { + "epoch": 0.00027357503092587305, + "grad_norm": 3.153404474258423, + "learning_rate": 4.9999991163668285e-05, + "loss": 10.237, + "step": 46 + }, + { + "epoch": 0.0002795223142068703, + "grad_norm": 4.83523416519165, + "learning_rate": 4.999999076657884e-05, + "loss": 8.9392, + "step": 47 + }, + { + "epoch": 0.00028546959748786756, + "grad_norm": 3.954632043838501, + "learning_rate": 4.999999036076215e-05, + "loss": 8.8562, + "step": 48 + }, + { + "epoch": 0.00029141688076886476, + "grad_norm": 4.452631950378418, + "learning_rate": 4.999998994621822e-05, + "loss": 9.8819, + "step": 49 + }, + { + "epoch": 0.000297364164049862, + "grad_norm": 4.71603536605835, + "learning_rate": 4.9999989522947055e-05, + "loss": 9.8503, + "step": 50 + }, + { + "epoch": 0.00030331144733085927, + "grad_norm": 3.8008105754852295, + "learning_rate": 4.999998909094865e-05, + "loss": 9.8072, + "step": 51 + }, + { + "epoch": 0.0003092587306118565, + "grad_norm": 3.9906716346740723, + "learning_rate": 4.999998865022301e-05, + "loss": 9.168, + "step": 52 + }, + { + "epoch": 0.0003152060138928537, + "grad_norm": 3.9425785541534424, + "learning_rate": 4.999998820077013e-05, + "loss": 9.8441, + "step": 53 + }, + { + "epoch": 0.000321153297173851, + "grad_norm": 3.6698031425476074, + "learning_rate": 4.999998774259002e-05, + "loss": 10.036, + "step": 54 + }, + { + "epoch": 0.00032710058045484823, + "grad_norm": 3.3027005195617676, + "learning_rate": 4.999998727568266e-05, + "loss": 9.8701, + "step": 55 + }, + { + "epoch": 0.0003330478637358455, + "grad_norm": 3.312570333480835, + "learning_rate": 4.999998680004807e-05, + "loss": 9.3354, + "step": 56 + }, + { + "epoch": 0.0003389951470168427, + "grad_norm": 3.323969602584839, + "learning_rate": 4.999998631568624e-05, + "loss": 9.2899, + "step": 57 + }, + { + "epoch": 0.00034494243029783994, + "grad_norm": 3.1319313049316406, + "learning_rate": 4.999998582259717e-05, + "loss": 9.1033, + "step": 58 + }, + { + "epoch": 0.0003508897135788372, + "grad_norm": 3.655060291290283, + "learning_rate": 4.999998532078087e-05, + "loss": 9.1574, + "step": 59 + }, + { + "epoch": 0.00035683699685983445, + "grad_norm": 3.2051918506622314, + "learning_rate": 4.999998481023733e-05, + "loss": 9.564, + "step": 60 + }, + { + "epoch": 0.00036278428014083165, + "grad_norm": 3.223015308380127, + "learning_rate": 4.999998429096656e-05, + "loss": 9.46, + "step": 61 + }, + { + "epoch": 0.0003687315634218289, + "grad_norm": 4.121186256408691, + "learning_rate": 4.999998376296855e-05, + "loss": 8.4136, + "step": 62 + }, + { + "epoch": 0.00037467884670282616, + "grad_norm": 3.5580086708068848, + "learning_rate": 4.9999983226243296e-05, + "loss": 9.3504, + "step": 63 + }, + { + "epoch": 0.0003806261299838234, + "grad_norm": 3.664219379425049, + "learning_rate": 4.999998268079081e-05, + "loss": 9.2889, + "step": 64 + }, + { + "epoch": 0.0003865734132648206, + "grad_norm": 2.955582618713379, + "learning_rate": 4.99999821266111e-05, + "loss": 8.9193, + "step": 65 + }, + { + "epoch": 0.00039252069654581787, + "grad_norm": 3.0592539310455322, + "learning_rate": 4.9999981563704144e-05, + "loss": 9.6739, + "step": 66 + }, + { + "epoch": 0.0003984679798268151, + "grad_norm": 3.32024884223938, + "learning_rate": 4.999998099206995e-05, + "loss": 9.3648, + "step": 67 + }, + { + "epoch": 0.0004044152631078124, + "grad_norm": 3.2716033458709717, + "learning_rate": 4.9999980411708524e-05, + "loss": 9.3652, + "step": 68 + }, + { + "epoch": 0.0004103625463888096, + "grad_norm": 3.1926631927490234, + "learning_rate": 4.999997982261987e-05, + "loss": 9.2924, + "step": 69 + }, + { + "epoch": 0.00041630982966980683, + "grad_norm": 3.589841604232788, + "learning_rate": 4.999997922480397e-05, + "loss": 9.2185, + "step": 70 + }, + { + "epoch": 0.0004222571129508041, + "grad_norm": 2.902132034301758, + "learning_rate": 4.999997861826084e-05, + "loss": 9.1047, + "step": 71 + }, + { + "epoch": 0.00042820439623180134, + "grad_norm": 3.2352359294891357, + "learning_rate": 4.999997800299048e-05, + "loss": 9.0309, + "step": 72 + }, + { + "epoch": 0.00043415167951279854, + "grad_norm": 2.683664560317993, + "learning_rate": 4.9999977378992884e-05, + "loss": 8.9977, + "step": 73 + }, + { + "epoch": 0.0004400989627937958, + "grad_norm": 3.0073423385620117, + "learning_rate": 4.9999976746268055e-05, + "loss": 9.0967, + "step": 74 + }, + { + "epoch": 0.00044604624607479305, + "grad_norm": 3.364819288253784, + "learning_rate": 4.9999976104815994e-05, + "loss": 8.9401, + "step": 75 + }, + { + "epoch": 0.0004519935293557903, + "grad_norm": 3.478936195373535, + "learning_rate": 4.9999975454636695e-05, + "loss": 8.8173, + "step": 76 + }, + { + "epoch": 0.0004579408126367875, + "grad_norm": 3.059669017791748, + "learning_rate": 4.9999974795730165e-05, + "loss": 9.2588, + "step": 77 + }, + { + "epoch": 0.00046388809591778476, + "grad_norm": 3.1980936527252197, + "learning_rate": 4.999997412809639e-05, + "loss": 9.3374, + "step": 78 + }, + { + "epoch": 0.000469835379198782, + "grad_norm": 2.859935998916626, + "learning_rate": 4.9999973451735405e-05, + "loss": 8.8996, + "step": 79 + }, + { + "epoch": 0.0004757826624797792, + "grad_norm": 3.6268489360809326, + "learning_rate": 4.9999972766647175e-05, + "loss": 8.7878, + "step": 80 + }, + { + "epoch": 0.00048172994576077647, + "grad_norm": 3.0187010765075684, + "learning_rate": 4.9999972072831714e-05, + "loss": 8.9177, + "step": 81 + }, + { + "epoch": 0.0004876772290417737, + "grad_norm": 3.304633378982544, + "learning_rate": 4.9999971370289014e-05, + "loss": 8.8098, + "step": 82 + }, + { + "epoch": 0.0004936245123227709, + "grad_norm": 3.678696870803833, + "learning_rate": 4.999997065901909e-05, + "loss": 8.9408, + "step": 83 + }, + { + "epoch": 0.0004995717956037682, + "grad_norm": 3.485488176345825, + "learning_rate": 4.9999969939021936e-05, + "loss": 8.7374, + "step": 84 + }, + { + "epoch": 0.0005055190788847654, + "grad_norm": 3.276916265487671, + "learning_rate": 4.999996921029755e-05, + "loss": 8.7177, + "step": 85 + }, + { + "epoch": 0.0005114663621657627, + "grad_norm": 3.060227632522583, + "learning_rate": 4.9999968472845926e-05, + "loss": 8.9673, + "step": 86 + }, + { + "epoch": 0.0005174136454467599, + "grad_norm": 3.359055995941162, + "learning_rate": 4.999996772666708e-05, + "loss": 8.8029, + "step": 87 + }, + { + "epoch": 0.0005233609287277572, + "grad_norm": 3.8916943073272705, + "learning_rate": 4.9999966971761004e-05, + "loss": 8.8363, + "step": 88 + }, + { + "epoch": 0.0005293082120087544, + "grad_norm": 3.825075387954712, + "learning_rate": 4.9999966208127694e-05, + "loss": 8.5683, + "step": 89 + }, + { + "epoch": 0.0005352554952897516, + "grad_norm": 3.475759267807007, + "learning_rate": 4.999996543576715e-05, + "loss": 8.5723, + "step": 90 + }, + { + "epoch": 0.0005412027785707488, + "grad_norm": 3.609776020050049, + "learning_rate": 4.9999964654679385e-05, + "loss": 8.6123, + "step": 91 + }, + { + "epoch": 0.0005471500618517461, + "grad_norm": 3.3749685287475586, + "learning_rate": 4.999996386486439e-05, + "loss": 8.4887, + "step": 92 + }, + { + "epoch": 0.0005530973451327434, + "grad_norm": 3.3853306770324707, + "learning_rate": 4.999996306632215e-05, + "loss": 8.56, + "step": 93 + }, + { + "epoch": 0.0005590446284137406, + "grad_norm": 3.9347422122955322, + "learning_rate": 4.99999622590527e-05, + "loss": 8.5053, + "step": 94 + }, + { + "epoch": 0.0005649919116947379, + "grad_norm": 3.6037611961364746, + "learning_rate": 4.999996144305601e-05, + "loss": 8.3367, + "step": 95 + }, + { + "epoch": 0.0005709391949757351, + "grad_norm": 3.4608941078186035, + "learning_rate": 4.99999606183321e-05, + "loss": 8.0674, + "step": 96 + }, + { + "epoch": 0.0005768864782567324, + "grad_norm": 3.4882898330688477, + "learning_rate": 4.999995978488096e-05, + "loss": 8.1728, + "step": 97 + }, + { + "epoch": 0.0005828337615377295, + "grad_norm": 3.6789562702178955, + "learning_rate": 4.999995894270258e-05, + "loss": 7.9535, + "step": 98 + }, + { + "epoch": 0.0005887810448187268, + "grad_norm": 3.57328200340271, + "learning_rate": 4.9999958091796986e-05, + "loss": 8.2048, + "step": 99 + }, + { + "epoch": 0.000594728328099724, + "grad_norm": 3.803468942642212, + "learning_rate": 4.999995723216416e-05, + "loss": 7.8073, + "step": 100 + }, + { + "epoch": 0.0006006756113807213, + "grad_norm": 3.8187785148620605, + "learning_rate": 4.9999956363804116e-05, + "loss": 7.6325, + "step": 101 + }, + { + "epoch": 0.0006066228946617185, + "grad_norm": 3.8681981563568115, + "learning_rate": 4.999995548671684e-05, + "loss": 7.7104, + "step": 102 + }, + { + "epoch": 0.0006125701779427158, + "grad_norm": 3.869074583053589, + "learning_rate": 4.9999954600902334e-05, + "loss": 7.8445, + "step": 103 + }, + { + "epoch": 0.000618517461223713, + "grad_norm": 3.852057695388794, + "learning_rate": 4.99999537063606e-05, + "loss": 7.872, + "step": 104 + }, + { + "epoch": 0.0006244647445047103, + "grad_norm": 4.784586429595947, + "learning_rate": 4.9999952803091654e-05, + "loss": 9.2218, + "step": 105 + }, + { + "epoch": 0.0006304120277857074, + "grad_norm": 4.296675682067871, + "learning_rate": 4.9999951891095474e-05, + "loss": 9.0957, + "step": 106 + }, + { + "epoch": 0.0006363593110667047, + "grad_norm": 3.9155995845794678, + "learning_rate": 4.999995097037207e-05, + "loss": 8.9829, + "step": 107 + }, + { + "epoch": 0.000642306594347702, + "grad_norm": 3.8967478275299072, + "learning_rate": 4.999995004092144e-05, + "loss": 8.2017, + "step": 108 + }, + { + "epoch": 0.0006482538776286992, + "grad_norm": 5.238500595092773, + "learning_rate": 4.999994910274358e-05, + "loss": 7.7976, + "step": 109 + }, + { + "epoch": 0.0006542011609096965, + "grad_norm": 3.7043144702911377, + "learning_rate": 4.9999948155838504e-05, + "loss": 8.3116, + "step": 110 + }, + { + "epoch": 0.0006601484441906937, + "grad_norm": 2.9745211601257324, + "learning_rate": 4.99999472002062e-05, + "loss": 8.69, + "step": 111 + }, + { + "epoch": 0.000666095727471691, + "grad_norm": 3.172652006149292, + "learning_rate": 4.999994623584668e-05, + "loss": 8.6244, + "step": 112 + }, + { + "epoch": 0.0006720430107526882, + "grad_norm": 3.224888801574707, + "learning_rate": 4.999994526275993e-05, + "loss": 8.6823, + "step": 113 + }, + { + "epoch": 0.0006779902940336854, + "grad_norm": 3.53104305267334, + "learning_rate": 4.9999944280945964e-05, + "loss": 8.495, + "step": 114 + }, + { + "epoch": 0.0006839375773146826, + "grad_norm": 3.013505697250366, + "learning_rate": 4.999994329040477e-05, + "loss": 8.4807, + "step": 115 + }, + { + "epoch": 0.0006898848605956799, + "grad_norm": 4.4741339683532715, + "learning_rate": 4.999994229113636e-05, + "loss": 8.94, + "step": 116 + }, + { + "epoch": 0.0006958321438766771, + "grad_norm": 4.78712272644043, + "learning_rate": 4.999994128314072e-05, + "loss": 8.9367, + "step": 117 + }, + { + "epoch": 0.0007017794271576744, + "grad_norm": 3.6983933448791504, + "learning_rate": 4.999994026641787e-05, + "loss": 8.7524, + "step": 118 + }, + { + "epoch": 0.0007077267104386716, + "grad_norm": 3.74997615814209, + "learning_rate": 4.9999939240967784e-05, + "loss": 8.3417, + "step": 119 + }, + { + "epoch": 0.0007136739937196689, + "grad_norm": 3.614593982696533, + "learning_rate": 4.999993820679049e-05, + "loss": 8.4848, + "step": 120 + }, + { + "epoch": 0.000719621277000666, + "grad_norm": 2.903045654296875, + "learning_rate": 4.999993716388597e-05, + "loss": 8.5519, + "step": 121 + }, + { + "epoch": 0.0007255685602816633, + "grad_norm": 3.402444839477539, + "learning_rate": 4.999993611225423e-05, + "loss": 8.2905, + "step": 122 + }, + { + "epoch": 0.0007315158435626606, + "grad_norm": 3.663893938064575, + "learning_rate": 4.9999935051895274e-05, + "loss": 8.4842, + "step": 123 + }, + { + "epoch": 0.0007374631268436578, + "grad_norm": 3.7535622119903564, + "learning_rate": 4.99999339828091e-05, + "loss": 8.4766, + "step": 124 + }, + { + "epoch": 0.0007434104101246551, + "grad_norm": 3.1285574436187744, + "learning_rate": 4.99999329049957e-05, + "loss": 8.3716, + "step": 125 + }, + { + "epoch": 0.0007493576934056523, + "grad_norm": 3.648869752883911, + "learning_rate": 4.9999931818455086e-05, + "loss": 8.3413, + "step": 126 + }, + { + "epoch": 0.0007553049766866496, + "grad_norm": 3.253399133682251, + "learning_rate": 4.9999930723187255e-05, + "loss": 8.0412, + "step": 127 + }, + { + "epoch": 0.0007612522599676468, + "grad_norm": 3.5694124698638916, + "learning_rate": 4.999992961919221e-05, + "loss": 8.0895, + "step": 128 + }, + { + "epoch": 0.000767199543248644, + "grad_norm": 4.106658458709717, + "learning_rate": 4.999992850646994e-05, + "loss": 8.3654, + "step": 129 + }, + { + "epoch": 0.0007731468265296412, + "grad_norm": 4.082829475402832, + "learning_rate": 4.9999927385020455e-05, + "loss": 8.2663, + "step": 130 + }, + { + "epoch": 0.0007790941098106385, + "grad_norm": 4.349386215209961, + "learning_rate": 4.9999926254843753e-05, + "loss": 8.2435, + "step": 131 + }, + { + "epoch": 0.0007850413930916357, + "grad_norm": 3.375697135925293, + "learning_rate": 4.999992511593984e-05, + "loss": 8.0827, + "step": 132 + }, + { + "epoch": 0.000790988676372633, + "grad_norm": 3.2566957473754883, + "learning_rate": 4.999992396830871e-05, + "loss": 8.4891, + "step": 133 + }, + { + "epoch": 0.0007969359596536302, + "grad_norm": 3.791579008102417, + "learning_rate": 4.999992281195036e-05, + "loss": 8.1567, + "step": 134 + }, + { + "epoch": 0.0008028832429346275, + "grad_norm": 3.8741838932037354, + "learning_rate": 4.99999216468648e-05, + "loss": 8.4033, + "step": 135 + }, + { + "epoch": 0.0008088305262156248, + "grad_norm": 4.229452133178711, + "learning_rate": 4.999992047305203e-05, + "loss": 8.3897, + "step": 136 + }, + { + "epoch": 0.0008147778094966219, + "grad_norm": 3.2732088565826416, + "learning_rate": 4.9999919290512034e-05, + "loss": 8.1758, + "step": 137 + }, + { + "epoch": 0.0008207250927776192, + "grad_norm": 3.2048966884613037, + "learning_rate": 4.9999918099244836e-05, + "loss": 8.1459, + "step": 138 + }, + { + "epoch": 0.0008266723760586164, + "grad_norm": 3.8639938831329346, + "learning_rate": 4.999991689925042e-05, + "loss": 7.9437, + "step": 139 + }, + { + "epoch": 0.0008326196593396137, + "grad_norm": 3.297252655029297, + "learning_rate": 4.9999915690528794e-05, + "loss": 8.1751, + "step": 140 + }, + { + "epoch": 0.0008385669426206109, + "grad_norm": 3.878218173980713, + "learning_rate": 4.999991447307995e-05, + "loss": 8.0572, + "step": 141 + }, + { + "epoch": 0.0008445142259016082, + "grad_norm": 3.6870739459991455, + "learning_rate": 4.9999913246903895e-05, + "loss": 8.0958, + "step": 142 + }, + { + "epoch": 0.0008504615091826054, + "grad_norm": 3.1817922592163086, + "learning_rate": 4.9999912012000636e-05, + "loss": 8.2683, + "step": 143 + }, + { + "epoch": 0.0008564087924636027, + "grad_norm": 3.4008772373199463, + "learning_rate": 4.999991076837016e-05, + "loss": 8.4171, + "step": 144 + }, + { + "epoch": 0.0008623560757445998, + "grad_norm": 3.002333641052246, + "learning_rate": 4.999990951601247e-05, + "loss": 8.1149, + "step": 145 + }, + { + "epoch": 0.0008683033590255971, + "grad_norm": 3.51910662651062, + "learning_rate": 4.999990825492757e-05, + "loss": 8.5284, + "step": 146 + }, + { + "epoch": 0.0008742506423065943, + "grad_norm": 2.978875160217285, + "learning_rate": 4.999990698511548e-05, + "loss": 8.4855, + "step": 147 + }, + { + "epoch": 0.0008801979255875916, + "grad_norm": 3.4708774089813232, + "learning_rate": 4.999990570657616e-05, + "loss": 8.333, + "step": 148 + }, + { + "epoch": 0.0008861452088685888, + "grad_norm": 2.994084596633911, + "learning_rate": 4.999990441930963e-05, + "loss": 8.3456, + "step": 149 + }, + { + "epoch": 0.0008920924921495861, + "grad_norm": 3.1295697689056396, + "learning_rate": 4.99999031233159e-05, + "loss": 8.2204, + "step": 150 + }, + { + "epoch": 0.0008980397754305833, + "grad_norm": 3.349720001220703, + "learning_rate": 4.9999901818594966e-05, + "loss": 8.2739, + "step": 151 + }, + { + "epoch": 0.0009039870587115806, + "grad_norm": 3.852964401245117, + "learning_rate": 4.999990050514681e-05, + "loss": 8.4225, + "step": 152 + }, + { + "epoch": 0.0009099343419925777, + "grad_norm": 3.92203950881958, + "learning_rate": 4.9999899182971456e-05, + "loss": 8.2882, + "step": 153 + }, + { + "epoch": 0.000915881625273575, + "grad_norm": 3.9960269927978516, + "learning_rate": 4.99998978520689e-05, + "loss": 8.2091, + "step": 154 + }, + { + "epoch": 0.0009218289085545723, + "grad_norm": 3.952327251434326, + "learning_rate": 4.999989651243913e-05, + "loss": 8.1726, + "step": 155 + }, + { + "epoch": 0.0009277761918355695, + "grad_norm": 3.9594647884368896, + "learning_rate": 4.9999895164082156e-05, + "loss": 8.0241, + "step": 156 + }, + { + "epoch": 0.0009337234751165668, + "grad_norm": 3.1129961013793945, + "learning_rate": 4.999989380699798e-05, + "loss": 8.14, + "step": 157 + }, + { + "epoch": 0.000939670758397564, + "grad_norm": 4.7737860679626465, + "learning_rate": 4.9999892441186604e-05, + "loss": 7.869, + "step": 158 + }, + { + "epoch": 0.0009456180416785613, + "grad_norm": 3.351327657699585, + "learning_rate": 4.9999891066648006e-05, + "loss": 8.1831, + "step": 159 + }, + { + "epoch": 0.0009515653249595584, + "grad_norm": 3.0245375633239746, + "learning_rate": 4.999988968338222e-05, + "loss": 8.3871, + "step": 160 + }, + { + "epoch": 0.0009575126082405557, + "grad_norm": 4.766855716705322, + "learning_rate": 4.999988829138923e-05, + "loss": 8.0078, + "step": 161 + }, + { + "epoch": 0.0009634598915215529, + "grad_norm": 3.975804090499878, + "learning_rate": 4.999988689066903e-05, + "loss": 7.6923, + "step": 162 + }, + { + "epoch": 0.0009694071748025502, + "grad_norm": 4.024605751037598, + "learning_rate": 4.999988548122163e-05, + "loss": 8.2986, + "step": 163 + }, + { + "epoch": 0.0009753544580835474, + "grad_norm": 4.230019569396973, + "learning_rate": 4.999988406304703e-05, + "loss": 8.2903, + "step": 164 + }, + { + "epoch": 0.0009813017413645446, + "grad_norm": 3.972825050354004, + "learning_rate": 4.9999882636145236e-05, + "loss": 8.3589, + "step": 165 + }, + { + "epoch": 0.0009872490246455418, + "grad_norm": 3.6381688117980957, + "learning_rate": 4.999988120051623e-05, + "loss": 8.2648, + "step": 166 + }, + { + "epoch": 0.000993196307926539, + "grad_norm": 4.203462600708008, + "learning_rate": 4.9999879756160025e-05, + "loss": 8.363, + "step": 167 + }, + { + "epoch": 0.0009991435912075363, + "grad_norm": 2.944103479385376, + "learning_rate": 4.9999878303076624e-05, + "loss": 7.9752, + "step": 168 + }, + { + "epoch": 0.0010050908744885336, + "grad_norm": 3.4115283489227295, + "learning_rate": 4.9999876841266025e-05, + "loss": 8.1044, + "step": 169 + }, + { + "epoch": 0.0010110381577695309, + "grad_norm": 4.185582160949707, + "learning_rate": 4.999987537072822e-05, + "loss": 8.0347, + "step": 170 + }, + { + "epoch": 0.0010169854410505281, + "grad_norm": 3.333649158477783, + "learning_rate": 4.999987389146323e-05, + "loss": 8.0545, + "step": 171 + }, + { + "epoch": 0.0010229327243315254, + "grad_norm": 3.7702765464782715, + "learning_rate": 4.999987240347103e-05, + "loss": 7.8936, + "step": 172 + }, + { + "epoch": 0.0010288800076125226, + "grad_norm": 4.113167762756348, + "learning_rate": 4.9999870906751636e-05, + "loss": 7.9447, + "step": 173 + }, + { + "epoch": 0.0010348272908935199, + "grad_norm": 3.370821714401245, + "learning_rate": 4.999986940130505e-05, + "loss": 7.9745, + "step": 174 + }, + { + "epoch": 0.0010407745741745171, + "grad_norm": 3.552391767501831, + "learning_rate": 4.999986788713126e-05, + "loss": 7.8882, + "step": 175 + }, + { + "epoch": 0.0010467218574555144, + "grad_norm": 3.3497536182403564, + "learning_rate": 4.999986636423028e-05, + "loss": 7.8601, + "step": 176 + }, + { + "epoch": 0.0010526691407365116, + "grad_norm": 3.256685733795166, + "learning_rate": 4.9999864832602105e-05, + "loss": 7.8341, + "step": 177 + }, + { + "epoch": 0.001058616424017509, + "grad_norm": 3.028108835220337, + "learning_rate": 4.999986329224674e-05, + "loss": 7.884, + "step": 178 + }, + { + "epoch": 0.0010645637072985061, + "grad_norm": 2.9583778381347656, + "learning_rate": 4.9999861743164165e-05, + "loss": 7.7875, + "step": 179 + }, + { + "epoch": 0.0010705109905795032, + "grad_norm": 3.109215497970581, + "learning_rate": 4.999986018535441e-05, + "loss": 8.4081, + "step": 180 + }, + { + "epoch": 0.0010764582738605004, + "grad_norm": 3.8907759189605713, + "learning_rate": 4.999985861881746e-05, + "loss": 8.0971, + "step": 181 + }, + { + "epoch": 0.0010824055571414977, + "grad_norm": 4.20400857925415, + "learning_rate": 4.9999857043553314e-05, + "loss": 7.9077, + "step": 182 + }, + { + "epoch": 0.001088352840422495, + "grad_norm": 3.580486297607422, + "learning_rate": 4.999985545956198e-05, + "loss": 7.8935, + "step": 183 + }, + { + "epoch": 0.0010943001237034922, + "grad_norm": 3.3833847045898438, + "learning_rate": 4.999985386684345e-05, + "loss": 7.9956, + "step": 184 + }, + { + "epoch": 0.0011002474069844895, + "grad_norm": 2.8848624229431152, + "learning_rate": 4.9999852265397734e-05, + "loss": 8.0718, + "step": 185 + }, + { + "epoch": 0.0011061946902654867, + "grad_norm": 3.8933818340301514, + "learning_rate": 4.999985065522483e-05, + "loss": 8.0517, + "step": 186 + }, + { + "epoch": 0.001112141973546484, + "grad_norm": 3.6559605598449707, + "learning_rate": 4.999984903632473e-05, + "loss": 8.3664, + "step": 187 + }, + { + "epoch": 0.0011180892568274812, + "grad_norm": 3.4633536338806152, + "learning_rate": 4.999984740869744e-05, + "loss": 8.3481, + "step": 188 + }, + { + "epoch": 0.0011240365401084785, + "grad_norm": 3.483020305633545, + "learning_rate": 4.999984577234297e-05, + "loss": 8.3407, + "step": 189 + }, + { + "epoch": 0.0011299838233894757, + "grad_norm": 2.772434711456299, + "learning_rate": 4.999984412726131e-05, + "loss": 8.4524, + "step": 190 + }, + { + "epoch": 0.001135931106670473, + "grad_norm": 3.3341007232666016, + "learning_rate": 4.999984247345246e-05, + "loss": 8.1063, + "step": 191 + }, + { + "epoch": 0.0011418783899514702, + "grad_norm": 3.0063467025756836, + "learning_rate": 4.999984081091642e-05, + "loss": 8.0077, + "step": 192 + }, + { + "epoch": 0.0011478256732324675, + "grad_norm": 2.9670779705047607, + "learning_rate": 4.99998391396532e-05, + "loss": 8.2338, + "step": 193 + }, + { + "epoch": 0.0011537729565134647, + "grad_norm": 3.024505138397217, + "learning_rate": 4.999983745966279e-05, + "loss": 8.1794, + "step": 194 + }, + { + "epoch": 0.0011597202397944618, + "grad_norm": 2.834131956100464, + "learning_rate": 4.9999835770945195e-05, + "loss": 8.2078, + "step": 195 + }, + { + "epoch": 0.001165667523075459, + "grad_norm": 3.555525064468384, + "learning_rate": 4.999983407350042e-05, + "loss": 8.0838, + "step": 196 + }, + { + "epoch": 0.0011716148063564563, + "grad_norm": 3.5013587474823, + "learning_rate": 4.999983236732846e-05, + "loss": 8.092, + "step": 197 + }, + { + "epoch": 0.0011775620896374535, + "grad_norm": 3.3721518516540527, + "learning_rate": 4.9999830652429314e-05, + "loss": 8.1137, + "step": 198 + }, + { + "epoch": 0.0011835093729184508, + "grad_norm": 3.364952564239502, + "learning_rate": 4.9999828928802986e-05, + "loss": 8.1197, + "step": 199 + }, + { + "epoch": 0.001189456656199448, + "grad_norm": 3.691249132156372, + "learning_rate": 4.999982719644948e-05, + "loss": 8.0922, + "step": 200 + }, + { + "epoch": 0.0011954039394804453, + "grad_norm": 6.919185161590576, + "learning_rate": 4.9999825455368785e-05, + "loss": 7.9215, + "step": 201 + }, + { + "epoch": 0.0012013512227614426, + "grad_norm": 3.3332598209381104, + "learning_rate": 4.999982370556091e-05, + "loss": 7.7605, + "step": 202 + }, + { + "epoch": 0.0012072985060424398, + "grad_norm": 2.842517375946045, + "learning_rate": 4.999982194702586e-05, + "loss": 8.0527, + "step": 203 + }, + { + "epoch": 0.001213245789323437, + "grad_norm": 3.086371660232544, + "learning_rate": 4.999982017976364e-05, + "loss": 8.2637, + "step": 204 + }, + { + "epoch": 0.0012191930726044343, + "grad_norm": 3.0870208740234375, + "learning_rate": 4.999981840377422e-05, + "loss": 8.3538, + "step": 205 + }, + { + "epoch": 0.0012251403558854316, + "grad_norm": 3.1244094371795654, + "learning_rate": 4.9999816619057633e-05, + "loss": 8.4604, + "step": 206 + }, + { + "epoch": 0.0012310876391664288, + "grad_norm": 2.7808034420013428, + "learning_rate": 4.999981482561387e-05, + "loss": 8.3227, + "step": 207 + }, + { + "epoch": 0.001237034922447426, + "grad_norm": 2.791182518005371, + "learning_rate": 4.999981302344292e-05, + "loss": 8.1481, + "step": 208 + }, + { + "epoch": 0.0012429822057284233, + "grad_norm": 3.045971632003784, + "learning_rate": 4.99998112125448e-05, + "loss": 7.7842, + "step": 209 + }, + { + "epoch": 0.0012489294890094206, + "grad_norm": 3.2548067569732666, + "learning_rate": 4.99998093929195e-05, + "loss": 7.9935, + "step": 210 + }, + { + "epoch": 0.0012548767722904176, + "grad_norm": 3.5448713302612305, + "learning_rate": 4.999980756456704e-05, + "loss": 8.0323, + "step": 211 + }, + { + "epoch": 0.0012608240555714149, + "grad_norm": 3.717900514602661, + "learning_rate": 4.9999805727487395e-05, + "loss": 8.0532, + "step": 212 + }, + { + "epoch": 0.0012667713388524121, + "grad_norm": 3.2943921089172363, + "learning_rate": 4.9999803881680576e-05, + "loss": 8.0326, + "step": 213 + }, + { + "epoch": 0.0012727186221334094, + "grad_norm": 3.4586269855499268, + "learning_rate": 4.999980202714658e-05, + "loss": 7.8765, + "step": 214 + }, + { + "epoch": 0.0012786659054144067, + "grad_norm": 3.1898810863494873, + "learning_rate": 4.9999800163885414e-05, + "loss": 7.8859, + "step": 215 + }, + { + "epoch": 0.001284613188695404, + "grad_norm": 2.977229595184326, + "learning_rate": 4.9999798291897084e-05, + "loss": 7.8841, + "step": 216 + }, + { + "epoch": 0.0012905604719764012, + "grad_norm": 3.368680000305176, + "learning_rate": 4.999979641118157e-05, + "loss": 7.8055, + "step": 217 + }, + { + "epoch": 0.0012965077552573984, + "grad_norm": 4.295344352722168, + "learning_rate": 4.9999794521738894e-05, + "loss": 7.6456, + "step": 218 + }, + { + "epoch": 0.0013024550385383957, + "grad_norm": 3.985480546951294, + "learning_rate": 4.999979262356904e-05, + "loss": 7.6987, + "step": 219 + }, + { + "epoch": 0.001308402321819393, + "grad_norm": 3.8719842433929443, + "learning_rate": 4.999979071667202e-05, + "loss": 7.6994, + "step": 220 + }, + { + "epoch": 0.0013143496051003902, + "grad_norm": 4.699835300445557, + "learning_rate": 4.999978880104784e-05, + "loss": 8.1815, + "step": 221 + }, + { + "epoch": 0.0013202968883813874, + "grad_norm": 3.9221127033233643, + "learning_rate": 4.9999786876696485e-05, + "loss": 7.8765, + "step": 222 + }, + { + "epoch": 0.0013262441716623847, + "grad_norm": 4.4223504066467285, + "learning_rate": 4.9999784943617964e-05, + "loss": 7.7244, + "step": 223 + }, + { + "epoch": 0.001332191454943382, + "grad_norm": 3.4598348140716553, + "learning_rate": 4.999978300181227e-05, + "loss": 7.7072, + "step": 224 + }, + { + "epoch": 0.0013381387382243792, + "grad_norm": 3.536752223968506, + "learning_rate": 4.999978105127941e-05, + "loss": 7.6337, + "step": 225 + }, + { + "epoch": 0.0013440860215053765, + "grad_norm": 3.6432204246520996, + "learning_rate": 4.99997790920194e-05, + "loss": 7.8078, + "step": 226 + }, + { + "epoch": 0.0013500333047863735, + "grad_norm": 4.8305768966674805, + "learning_rate": 4.999977712403221e-05, + "loss": 7.9003, + "step": 227 + }, + { + "epoch": 0.0013559805880673707, + "grad_norm": 3.773876428604126, + "learning_rate": 4.999977514731786e-05, + "loss": 8.0513, + "step": 228 + }, + { + "epoch": 0.001361927871348368, + "grad_norm": 4.465645790100098, + "learning_rate": 4.999977316187635e-05, + "loss": 7.9847, + "step": 229 + }, + { + "epoch": 0.0013678751546293653, + "grad_norm": 3.9466493129730225, + "learning_rate": 4.9999771167707674e-05, + "loss": 7.9902, + "step": 230 + }, + { + "epoch": 0.0013738224379103625, + "grad_norm": 4.432138919830322, + "learning_rate": 4.9999769164811846e-05, + "loss": 7.8929, + "step": 231 + }, + { + "epoch": 0.0013797697211913598, + "grad_norm": 3.5211949348449707, + "learning_rate": 4.999976715318885e-05, + "loss": 8.1838, + "step": 232 + }, + { + "epoch": 0.001385717004472357, + "grad_norm": 3.0819287300109863, + "learning_rate": 4.9999765132838686e-05, + "loss": 8.2823, + "step": 233 + }, + { + "epoch": 0.0013916642877533543, + "grad_norm": 3.436112880706787, + "learning_rate": 4.9999763103761374e-05, + "loss": 7.7796, + "step": 234 + }, + { + "epoch": 0.0013976115710343515, + "grad_norm": 3.6699061393737793, + "learning_rate": 4.99997610659569e-05, + "loss": 7.5792, + "step": 235 + }, + { + "epoch": 0.0014035588543153488, + "grad_norm": 3.814182758331299, + "learning_rate": 4.999975901942526e-05, + "loss": 7.5631, + "step": 236 + }, + { + "epoch": 0.001409506137596346, + "grad_norm": 3.84110164642334, + "learning_rate": 4.9999756964166465e-05, + "loss": 7.4244, + "step": 237 + }, + { + "epoch": 0.0014154534208773433, + "grad_norm": 3.278045415878296, + "learning_rate": 4.999975490018052e-05, + "loss": 7.9049, + "step": 238 + }, + { + "epoch": 0.0014214007041583405, + "grad_norm": 3.5502712726593018, + "learning_rate": 4.999975282746742e-05, + "loss": 8.0021, + "step": 239 + }, + { + "epoch": 0.0014273479874393378, + "grad_norm": 2.7919108867645264, + "learning_rate": 4.9999750746027153e-05, + "loss": 8.2854, + "step": 240 + }, + { + "epoch": 0.001433295270720335, + "grad_norm": 3.1689581871032715, + "learning_rate": 4.999974865585973e-05, + "loss": 8.3177, + "step": 241 + }, + { + "epoch": 0.001439242554001332, + "grad_norm": 2.728679656982422, + "learning_rate": 4.999974655696517e-05, + "loss": 8.3181, + "step": 242 + }, + { + "epoch": 0.0014451898372823293, + "grad_norm": 3.5175108909606934, + "learning_rate": 4.9999744449343445e-05, + "loss": 8.03, + "step": 243 + }, + { + "epoch": 0.0014511371205633266, + "grad_norm": 3.714219808578491, + "learning_rate": 4.999974233299457e-05, + "loss": 8.0824, + "step": 244 + }, + { + "epoch": 0.0014570844038443239, + "grad_norm": 3.42090106010437, + "learning_rate": 4.9999740207918546e-05, + "loss": 8.0455, + "step": 245 + }, + { + "epoch": 0.001463031687125321, + "grad_norm": 3.035047769546509, + "learning_rate": 4.999973807411537e-05, + "loss": 8.0117, + "step": 246 + }, + { + "epoch": 0.0014689789704063184, + "grad_norm": 3.4878122806549072, + "learning_rate": 4.9999735931585034e-05, + "loss": 8.1368, + "step": 247 + }, + { + "epoch": 0.0014749262536873156, + "grad_norm": 3.648115873336792, + "learning_rate": 4.999973378032756e-05, + "loss": 7.9987, + "step": 248 + }, + { + "epoch": 0.0014808735369683129, + "grad_norm": 3.171255588531494, + "learning_rate": 4.9999731620342936e-05, + "loss": 7.9733, + "step": 249 + }, + { + "epoch": 0.0014868208202493101, + "grad_norm": 3.157804250717163, + "learning_rate": 4.999972945163116e-05, + "loss": 7.8511, + "step": 250 + }, + { + "epoch": 0.0014927681035303074, + "grad_norm": 3.4346978664398193, + "learning_rate": 4.999972727419224e-05, + "loss": 7.9075, + "step": 251 + }, + { + "epoch": 0.0014987153868113046, + "grad_norm": 3.281135082244873, + "learning_rate": 4.9999725088026175e-05, + "loss": 7.876, + "step": 252 + }, + { + "epoch": 0.0015046626700923019, + "grad_norm": 3.1481714248657227, + "learning_rate": 4.9999722893132954e-05, + "loss": 8.1458, + "step": 253 + }, + { + "epoch": 0.0015106099533732991, + "grad_norm": 2.821460247039795, + "learning_rate": 4.99997206895126e-05, + "loss": 7.9141, + "step": 254 + }, + { + "epoch": 0.0015165572366542964, + "grad_norm": 2.887997627258301, + "learning_rate": 4.999971847716509e-05, + "loss": 8.2246, + "step": 255 + }, + { + "epoch": 0.0015225045199352936, + "grad_norm": 2.8097078800201416, + "learning_rate": 4.999971625609044e-05, + "loss": 7.8576, + "step": 256 + }, + { + "epoch": 0.001528451803216291, + "grad_norm": 2.9272890090942383, + "learning_rate": 4.999971402628866e-05, + "loss": 7.6856, + "step": 257 + }, + { + "epoch": 0.001534399086497288, + "grad_norm": 3.487027168273926, + "learning_rate": 4.999971178775973e-05, + "loss": 7.8179, + "step": 258 + }, + { + "epoch": 0.0015403463697782852, + "grad_norm": 3.575681209564209, + "learning_rate": 4.9999709540503656e-05, + "loss": 7.8115, + "step": 259 + }, + { + "epoch": 0.0015462936530592824, + "grad_norm": 3.457756757736206, + "learning_rate": 4.9999707284520435e-05, + "loss": 7.7985, + "step": 260 + }, + { + "epoch": 0.0015522409363402797, + "grad_norm": 3.732728958129883, + "learning_rate": 4.999970501981009e-05, + "loss": 7.8369, + "step": 261 + }, + { + "epoch": 0.001558188219621277, + "grad_norm": 4.1466898918151855, + "learning_rate": 4.99997027463726e-05, + "loss": 8.2435, + "step": 262 + }, + { + "epoch": 0.0015641355029022742, + "grad_norm": 4.028534889221191, + "learning_rate": 4.9999700464207965e-05, + "loss": 8.2338, + "step": 263 + }, + { + "epoch": 0.0015700827861832715, + "grad_norm": 3.7445273399353027, + "learning_rate": 4.99996981733162e-05, + "loss": 8.1182, + "step": 264 + }, + { + "epoch": 0.0015760300694642687, + "grad_norm": 3.455228567123413, + "learning_rate": 4.99996958736973e-05, + "loss": 8.1932, + "step": 265 + }, + { + "epoch": 0.001581977352745266, + "grad_norm": 3.1530332565307617, + "learning_rate": 4.9999693565351256e-05, + "loss": 7.8304, + "step": 266 + }, + { + "epoch": 0.0015879246360262632, + "grad_norm": 3.113161325454712, + "learning_rate": 4.999969124827809e-05, + "loss": 7.6625, + "step": 267 + }, + { + "epoch": 0.0015938719193072605, + "grad_norm": 3.621076822280884, + "learning_rate": 4.999968892247778e-05, + "loss": 8.0983, + "step": 268 + }, + { + "epoch": 0.0015998192025882577, + "grad_norm": 3.533395767211914, + "learning_rate": 4.9999686587950346e-05, + "loss": 7.9564, + "step": 269 + }, + { + "epoch": 0.001605766485869255, + "grad_norm": 3.6486849784851074, + "learning_rate": 4.999968424469577e-05, + "loss": 7.9864, + "step": 270 + }, + { + "epoch": 0.0016117137691502522, + "grad_norm": 3.223167657852173, + "learning_rate": 4.999968189271407e-05, + "loss": 7.8516, + "step": 271 + }, + { + "epoch": 0.0016176610524312495, + "grad_norm": 3.282062530517578, + "learning_rate": 4.999967953200523e-05, + "loss": 7.9247, + "step": 272 + }, + { + "epoch": 0.0016236083357122465, + "grad_norm": 2.8589930534362793, + "learning_rate": 4.999967716256927e-05, + "loss": 7.8871, + "step": 273 + }, + { + "epoch": 0.0016295556189932438, + "grad_norm": 3.136882781982422, + "learning_rate": 4.9999674784406174e-05, + "loss": 7.8793, + "step": 274 + }, + { + "epoch": 0.001635502902274241, + "grad_norm": 3.9103915691375732, + "learning_rate": 4.999967239751595e-05, + "loss": 7.9005, + "step": 275 + }, + { + "epoch": 0.0016414501855552383, + "grad_norm": 4.40267276763916, + "learning_rate": 4.99996700018986e-05, + "loss": 7.9247, + "step": 276 + }, + { + "epoch": 0.0016473974688362356, + "grad_norm": 3.6620242595672607, + "learning_rate": 4.9999667597554136e-05, + "loss": 8.0719, + "step": 277 + }, + { + "epoch": 0.0016533447521172328, + "grad_norm": 3.1278858184814453, + "learning_rate": 4.999966518448253e-05, + "loss": 8.0822, + "step": 278 + }, + { + "epoch": 0.00165929203539823, + "grad_norm": 3.321831464767456, + "learning_rate": 4.9999662762683805e-05, + "loss": 8.1266, + "step": 279 + }, + { + "epoch": 0.0016652393186792273, + "grad_norm": 3.4116811752319336, + "learning_rate": 4.999966033215795e-05, + "loss": 8.2159, + "step": 280 + }, + { + "epoch": 0.0016711866019602246, + "grad_norm": 3.58381724357605, + "learning_rate": 4.999965789290498e-05, + "loss": 8.0275, + "step": 281 + }, + { + "epoch": 0.0016771338852412218, + "grad_norm": 3.0357518196105957, + "learning_rate": 4.9999655444924884e-05, + "loss": 8.1171, + "step": 282 + }, + { + "epoch": 0.001683081168522219, + "grad_norm": 3.237764596939087, + "learning_rate": 4.999965298821767e-05, + "loss": 7.822, + "step": 283 + }, + { + "epoch": 0.0016890284518032163, + "grad_norm": 3.0861873626708984, + "learning_rate": 4.999965052278334e-05, + "loss": 7.7991, + "step": 284 + }, + { + "epoch": 0.0016949757350842136, + "grad_norm": 2.8045542240142822, + "learning_rate": 4.999964804862187e-05, + "loss": 7.9659, + "step": 285 + }, + { + "epoch": 0.0017009230183652108, + "grad_norm": 3.1282641887664795, + "learning_rate": 4.9999645565733297e-05, + "loss": 7.8354, + "step": 286 + }, + { + "epoch": 0.001706870301646208, + "grad_norm": 2.980001211166382, + "learning_rate": 4.999964307411761e-05, + "loss": 7.806, + "step": 287 + }, + { + "epoch": 0.0017128175849272054, + "grad_norm": 3.114238977432251, + "learning_rate": 4.99996405737748e-05, + "loss": 7.6173, + "step": 288 + }, + { + "epoch": 0.0017187648682082024, + "grad_norm": 2.6732640266418457, + "learning_rate": 4.9999638064704866e-05, + "loss": 7.5944, + "step": 289 + }, + { + "epoch": 0.0017247121514891996, + "grad_norm": 3.2139906883239746, + "learning_rate": 4.999963554690783e-05, + "loss": 7.5738, + "step": 290 + }, + { + "epoch": 0.001730659434770197, + "grad_norm": 3.0964555740356445, + "learning_rate": 4.999963302038368e-05, + "loss": 7.4431, + "step": 291 + }, + { + "epoch": 0.0017366067180511942, + "grad_norm": 3.0611374378204346, + "learning_rate": 4.99996304851324e-05, + "loss": 7.3748, + "step": 292 + }, + { + "epoch": 0.0017425540013321914, + "grad_norm": 2.88114333152771, + "learning_rate": 4.999962794115402e-05, + "loss": 7.3554, + "step": 293 + }, + { + "epoch": 0.0017485012846131887, + "grad_norm": 2.895141363143921, + "learning_rate": 4.999962538844852e-05, + "loss": 7.2801, + "step": 294 + }, + { + "epoch": 0.001754448567894186, + "grad_norm": 3.0645008087158203, + "learning_rate": 4.9999622827015914e-05, + "loss": 7.1753, + "step": 295 + }, + { + "epoch": 0.0017603958511751832, + "grad_norm": 3.0750465393066406, + "learning_rate": 4.99996202568562e-05, + "loss": 7.1905, + "step": 296 + }, + { + "epoch": 0.0017663431344561804, + "grad_norm": 3.1322436332702637, + "learning_rate": 4.9999617677969374e-05, + "loss": 7.0851, + "step": 297 + }, + { + "epoch": 0.0017722904177371777, + "grad_norm": 3.8287153244018555, + "learning_rate": 4.999961509035544e-05, + "loss": 7.0842, + "step": 298 + }, + { + "epoch": 0.001778237701018175, + "grad_norm": 2.874312162399292, + "learning_rate": 4.9999612494014403e-05, + "loss": 6.9588, + "step": 299 + }, + { + "epoch": 0.0017841849842991722, + "grad_norm": 2.916250705718994, + "learning_rate": 4.999960988894625e-05, + "loss": 7.1342, + "step": 300 + }, + { + "epoch": 0.0017901322675801694, + "grad_norm": 2.71624755859375, + "learning_rate": 4.9999607275151e-05, + "loss": 7.0418, + "step": 301 + }, + { + "epoch": 0.0017960795508611667, + "grad_norm": 2.655630350112915, + "learning_rate": 4.999960465262864e-05, + "loss": 6.937, + "step": 302 + }, + { + "epoch": 0.001802026834142164, + "grad_norm": 2.8819122314453125, + "learning_rate": 4.999960202137918e-05, + "loss": 7.0116, + "step": 303 + }, + { + "epoch": 0.0018079741174231612, + "grad_norm": 2.909701108932495, + "learning_rate": 4.999959938140262e-05, + "loss": 6.9588, + "step": 304 + }, + { + "epoch": 0.0018139214007041582, + "grad_norm": 3.276395797729492, + "learning_rate": 4.999959673269895e-05, + "loss": 6.9066, + "step": 305 + }, + { + "epoch": 0.0018198686839851555, + "grad_norm": 2.8774867057800293, + "learning_rate": 4.9999594075268186e-05, + "loss": 7.0112, + "step": 306 + }, + { + "epoch": 0.0018258159672661528, + "grad_norm": 2.9667818546295166, + "learning_rate": 4.999959140911032e-05, + "loss": 7.1467, + "step": 307 + }, + { + "epoch": 0.00183176325054715, + "grad_norm": 6.6612958908081055, + "learning_rate": 4.999958873422536e-05, + "loss": 8.4457, + "step": 308 + }, + { + "epoch": 0.0018377105338281473, + "grad_norm": 4.234557628631592, + "learning_rate": 4.999958605061329e-05, + "loss": 8.904, + "step": 309 + }, + { + "epoch": 0.0018436578171091445, + "grad_norm": 4.049502372741699, + "learning_rate": 4.999958335827413e-05, + "loss": 7.5174, + "step": 310 + }, + { + "epoch": 0.0018496051003901418, + "grad_norm": 3.574474334716797, + "learning_rate": 4.999958065720787e-05, + "loss": 8.6537, + "step": 311 + }, + { + "epoch": 0.001855552383671139, + "grad_norm": 3.6154026985168457, + "learning_rate": 4.9999577947414515e-05, + "loss": 8.5833, + "step": 312 + }, + { + "epoch": 0.0018614996669521363, + "grad_norm": 2.9204158782958984, + "learning_rate": 4.999957522889407e-05, + "loss": 8.5486, + "step": 313 + }, + { + "epoch": 0.0018674469502331335, + "grad_norm": 3.095310688018799, + "learning_rate": 4.999957250164653e-05, + "loss": 8.3855, + "step": 314 + }, + { + "epoch": 0.0018733942335141308, + "grad_norm": 3.872267723083496, + "learning_rate": 4.999956976567189e-05, + "loss": 8.2715, + "step": 315 + }, + { + "epoch": 0.001879341516795128, + "grad_norm": 3.5560686588287354, + "learning_rate": 4.9999567020970175e-05, + "loss": 8.1571, + "step": 316 + }, + { + "epoch": 0.0018852888000761253, + "grad_norm": 2.6759164333343506, + "learning_rate": 4.9999564267541356e-05, + "loss": 8.4072, + "step": 317 + }, + { + "epoch": 0.0018912360833571226, + "grad_norm": 4.034712791442871, + "learning_rate": 4.999956150538545e-05, + "loss": 7.7622, + "step": 318 + }, + { + "epoch": 0.0018971833666381198, + "grad_norm": 3.8927831649780273, + "learning_rate": 4.999955873450246e-05, + "loss": 7.5012, + "step": 319 + }, + { + "epoch": 0.0019031306499191168, + "grad_norm": 3.4422812461853027, + "learning_rate": 4.999955595489237e-05, + "loss": 7.6894, + "step": 320 + }, + { + "epoch": 0.001909077933200114, + "grad_norm": 3.0367283821105957, + "learning_rate": 4.999955316655521e-05, + "loss": 7.8151, + "step": 321 + }, + { + "epoch": 0.0019150252164811114, + "grad_norm": 3.7553489208221436, + "learning_rate": 4.9999550369490955e-05, + "loss": 8.0462, + "step": 322 + }, + { + "epoch": 0.0019209724997621086, + "grad_norm": 3.432591438293457, + "learning_rate": 4.999954756369962e-05, + "loss": 7.8782, + "step": 323 + }, + { + "epoch": 0.0019269197830431059, + "grad_norm": 2.7325966358184814, + "learning_rate": 4.9999544749181196e-05, + "loss": 7.9045, + "step": 324 + }, + { + "epoch": 0.0019328670663241031, + "grad_norm": 4.31963586807251, + "learning_rate": 4.9999541925935686e-05, + "loss": 7.7791, + "step": 325 + }, + { + "epoch": 0.0019388143496051004, + "grad_norm": 2.840189218521118, + "learning_rate": 4.999953909396311e-05, + "loss": 7.8334, + "step": 326 + }, + { + "epoch": 0.0019447616328860976, + "grad_norm": 3.2388041019439697, + "learning_rate": 4.9999536253263434e-05, + "loss": 7.6756, + "step": 327 + }, + { + "epoch": 0.0019507089161670949, + "grad_norm": 3.6291563510894775, + "learning_rate": 4.999953340383669e-05, + "loss": 7.6511, + "step": 328 + }, + { + "epoch": 0.001956656199448092, + "grad_norm": 3.35703706741333, + "learning_rate": 4.999953054568287e-05, + "loss": 7.6382, + "step": 329 + }, + { + "epoch": 0.001962603482729089, + "grad_norm": 3.117281198501587, + "learning_rate": 4.999952767880196e-05, + "loss": 7.6233, + "step": 330 + }, + { + "epoch": 0.0019685507660100864, + "grad_norm": 2.8385257720947266, + "learning_rate": 4.999952480319398e-05, + "loss": 7.6594, + "step": 331 + }, + { + "epoch": 0.0019744980492910837, + "grad_norm": 2.5914418697357178, + "learning_rate": 4.999952191885893e-05, + "loss": 8.2647, + "step": 332 + }, + { + "epoch": 0.001980445332572081, + "grad_norm": 2.5847742557525635, + "learning_rate": 4.9999519025796795e-05, + "loss": 8.339, + "step": 333 + }, + { + "epoch": 0.001986392615853078, + "grad_norm": 2.7022132873535156, + "learning_rate": 4.999951612400759e-05, + "loss": 7.9114, + "step": 334 + }, + { + "epoch": 0.0019923398991340754, + "grad_norm": 3.0290884971618652, + "learning_rate": 4.999951321349131e-05, + "loss": 7.4531, + "step": 335 + }, + { + "epoch": 0.0019982871824150727, + "grad_norm": 2.8910324573516846, + "learning_rate": 4.999951029424796e-05, + "loss": 7.398, + "step": 336 + }, + { + "epoch": 0.00200423446569607, + "grad_norm": 2.8917605876922607, + "learning_rate": 4.9999507366277545e-05, + "loss": 7.48, + "step": 337 + }, + { + "epoch": 0.002010181748977067, + "grad_norm": 2.8957982063293457, + "learning_rate": 4.999950442958005e-05, + "loss": 7.8662, + "step": 338 + }, + { + "epoch": 0.0020161290322580645, + "grad_norm": 3.562232255935669, + "learning_rate": 4.9999501484155485e-05, + "loss": 7.8388, + "step": 339 + }, + { + "epoch": 0.0020220763155390617, + "grad_norm": 2.51676607131958, + "learning_rate": 4.9999498530003866e-05, + "loss": 8.2834, + "step": 340 + }, + { + "epoch": 0.002028023598820059, + "grad_norm": 2.326110363006592, + "learning_rate": 4.999949556712517e-05, + "loss": 8.2528, + "step": 341 + }, + { + "epoch": 0.0020339708821010562, + "grad_norm": 2.7621335983276367, + "learning_rate": 4.999949259551941e-05, + "loss": 7.9791, + "step": 342 + }, + { + "epoch": 0.0020399181653820535, + "grad_norm": 3.045431137084961, + "learning_rate": 4.999948961518659e-05, + "loss": 7.8575, + "step": 343 + }, + { + "epoch": 0.0020458654486630507, + "grad_norm": 3.1940131187438965, + "learning_rate": 4.9999486626126703e-05, + "loss": 7.8581, + "step": 344 + }, + { + "epoch": 0.002051812731944048, + "grad_norm": 2.964136838912964, + "learning_rate": 4.999948362833975e-05, + "loss": 7.9656, + "step": 345 + }, + { + "epoch": 0.0020577600152250452, + "grad_norm": 3.167573928833008, + "learning_rate": 4.999948062182574e-05, + "loss": 7.7448, + "step": 346 + }, + { + "epoch": 0.0020637072985060425, + "grad_norm": 3.062666177749634, + "learning_rate": 4.9999477606584666e-05, + "loss": 7.7655, + "step": 347 + }, + { + "epoch": 0.0020696545817870397, + "grad_norm": 3.1097402572631836, + "learning_rate": 4.999947458261653e-05, + "loss": 7.643, + "step": 348 + }, + { + "epoch": 0.002075601865068037, + "grad_norm": 3.1663928031921387, + "learning_rate": 4.999947154992135e-05, + "loss": 7.8348, + "step": 349 + }, + { + "epoch": 0.0020815491483490343, + "grad_norm": 2.8295886516571045, + "learning_rate": 4.99994685084991e-05, + "loss": 7.7752, + "step": 350 + }, + { + "epoch": 0.0020874964316300315, + "grad_norm": 2.7384233474731445, + "learning_rate": 4.99994654583498e-05, + "loss": 7.7644, + "step": 351 + }, + { + "epoch": 0.0020934437149110288, + "grad_norm": 2.6654486656188965, + "learning_rate": 4.999946239947344e-05, + "loss": 7.7489, + "step": 352 + }, + { + "epoch": 0.002099390998192026, + "grad_norm": 2.8949942588806152, + "learning_rate": 4.999945933187003e-05, + "loss": 7.7105, + "step": 353 + }, + { + "epoch": 0.0021053382814730233, + "grad_norm": 2.590036630630493, + "learning_rate": 4.999945625553957e-05, + "loss": 7.6821, + "step": 354 + }, + { + "epoch": 0.0021112855647540205, + "grad_norm": 3.4601457118988037, + "learning_rate": 4.999945317048205e-05, + "loss": 7.3552, + "step": 355 + }, + { + "epoch": 0.002117232848035018, + "grad_norm": 4.022705078125, + "learning_rate": 4.999945007669748e-05, + "loss": 7.0281, + "step": 356 + }, + { + "epoch": 0.002123180131316015, + "grad_norm": 3.249699592590332, + "learning_rate": 4.999944697418587e-05, + "loss": 7.9279, + "step": 357 + }, + { + "epoch": 0.0021291274145970123, + "grad_norm": 2.8424601554870605, + "learning_rate": 4.99994438629472e-05, + "loss": 8.1485, + "step": 358 + }, + { + "epoch": 0.002135074697878009, + "grad_norm": 3.0473172664642334, + "learning_rate": 4.9999440742981486e-05, + "loss": 8.0877, + "step": 359 + }, + { + "epoch": 0.0021410219811590064, + "grad_norm": 3.0614171028137207, + "learning_rate": 4.9999437614288726e-05, + "loss": 7.7817, + "step": 360 + }, + { + "epoch": 0.0021469692644400036, + "grad_norm": 3.309464931488037, + "learning_rate": 4.9999434476868925e-05, + "loss": 7.857, + "step": 361 + }, + { + "epoch": 0.002152916547721001, + "grad_norm": 3.031921148300171, + "learning_rate": 4.999943133072207e-05, + "loss": 7.6393, + "step": 362 + }, + { + "epoch": 0.002158863831001998, + "grad_norm": 3.3756978511810303, + "learning_rate": 4.999942817584818e-05, + "loss": 7.7422, + "step": 363 + }, + { + "epoch": 0.0021648111142829954, + "grad_norm": 3.53362774848938, + "learning_rate": 4.999942501224724e-05, + "loss": 7.9388, + "step": 364 + }, + { + "epoch": 0.0021707583975639926, + "grad_norm": 3.4082882404327393, + "learning_rate": 4.999942183991927e-05, + "loss": 7.3578, + "step": 365 + }, + { + "epoch": 0.00217670568084499, + "grad_norm": 4.035211086273193, + "learning_rate": 4.999941865886425e-05, + "loss": 7.7833, + "step": 366 + }, + { + "epoch": 0.002182652964125987, + "grad_norm": 3.0394630432128906, + "learning_rate": 4.99994154690822e-05, + "loss": 7.9392, + "step": 367 + }, + { + "epoch": 0.0021886002474069844, + "grad_norm": 3.088926076889038, + "learning_rate": 4.99994122705731e-05, + "loss": 7.8149, + "step": 368 + }, + { + "epoch": 0.0021945475306879817, + "grad_norm": 2.3173277378082275, + "learning_rate": 4.9999409063336976e-05, + "loss": 8.2211, + "step": 369 + }, + { + "epoch": 0.002200494813968979, + "grad_norm": 2.9960854053497314, + "learning_rate": 4.9999405847373815e-05, + "loss": 7.6764, + "step": 370 + }, + { + "epoch": 0.002206442097249976, + "grad_norm": 2.841848134994507, + "learning_rate": 4.999940262268361e-05, + "loss": 7.9418, + "step": 371 + }, + { + "epoch": 0.0022123893805309734, + "grad_norm": 3.748779058456421, + "learning_rate": 4.999939938926638e-05, + "loss": 7.7843, + "step": 372 + }, + { + "epoch": 0.0022183366638119707, + "grad_norm": 2.8345019817352295, + "learning_rate": 4.999939614712212e-05, + "loss": 7.592, + "step": 373 + }, + { + "epoch": 0.002224283947092968, + "grad_norm": 3.12503719329834, + "learning_rate": 4.9999392896250826e-05, + "loss": 7.9543, + "step": 374 + }, + { + "epoch": 0.002230231230373965, + "grad_norm": 2.7812912464141846, + "learning_rate": 4.99993896366525e-05, + "loss": 7.8738, + "step": 375 + }, + { + "epoch": 0.0022361785136549624, + "grad_norm": 2.9477410316467285, + "learning_rate": 4.9999386368327144e-05, + "loss": 7.7738, + "step": 376 + }, + { + "epoch": 0.0022421257969359597, + "grad_norm": 2.305204391479492, + "learning_rate": 4.999938309127477e-05, + "loss": 7.9123, + "step": 377 + }, + { + "epoch": 0.002248073080216957, + "grad_norm": 3.3839781284332275, + "learning_rate": 4.999937980549536e-05, + "loss": 7.8542, + "step": 378 + }, + { + "epoch": 0.002254020363497954, + "grad_norm": 3.6973462104797363, + "learning_rate": 4.9999376510988924e-05, + "loss": 7.6953, + "step": 379 + }, + { + "epoch": 0.0022599676467789515, + "grad_norm": 3.8176333904266357, + "learning_rate": 4.999937320775547e-05, + "loss": 7.6548, + "step": 380 + }, + { + "epoch": 0.0022659149300599487, + "grad_norm": 3.0237386226654053, + "learning_rate": 4.999936989579499e-05, + "loss": 7.7843, + "step": 381 + }, + { + "epoch": 0.002271862213340946, + "grad_norm": 2.699695348739624, + "learning_rate": 4.999936657510749e-05, + "loss": 7.8841, + "step": 382 + }, + { + "epoch": 0.0022778094966219432, + "grad_norm": 3.7468206882476807, + "learning_rate": 4.9999363245692965e-05, + "loss": 7.8069, + "step": 383 + }, + { + "epoch": 0.0022837567799029405, + "grad_norm": 3.1074821949005127, + "learning_rate": 4.999935990755142e-05, + "loss": 7.8392, + "step": 384 + }, + { + "epoch": 0.0022897040631839377, + "grad_norm": 2.420884609222412, + "learning_rate": 4.999935656068287e-05, + "loss": 7.9238, + "step": 385 + }, + { + "epoch": 0.002295651346464935, + "grad_norm": 3.1354825496673584, + "learning_rate": 4.9999353205087296e-05, + "loss": 7.9766, + "step": 386 + }, + { + "epoch": 0.0023015986297459322, + "grad_norm": 2.7911901473999023, + "learning_rate": 4.9999349840764695e-05, + "loss": 7.9118, + "step": 387 + }, + { + "epoch": 0.0023075459130269295, + "grad_norm": 2.59529447555542, + "learning_rate": 4.999934646771509e-05, + "loss": 7.8839, + "step": 388 + }, + { + "epoch": 0.0023134931963079267, + "grad_norm": 4.121276378631592, + "learning_rate": 4.999934308593848e-05, + "loss": 7.8406, + "step": 389 + }, + { + "epoch": 0.0023194404795889236, + "grad_norm": 2.9091265201568604, + "learning_rate": 4.999933969543485e-05, + "loss": 7.86, + "step": 390 + }, + { + "epoch": 0.002325387762869921, + "grad_norm": 3.0700483322143555, + "learning_rate": 4.9999336296204195e-05, + "loss": 7.8214, + "step": 391 + }, + { + "epoch": 0.002331335046150918, + "grad_norm": 3.3008790016174316, + "learning_rate": 4.999933288824654e-05, + "loss": 7.5863, + "step": 392 + }, + { + "epoch": 0.0023372823294319153, + "grad_norm": 3.1414108276367188, + "learning_rate": 4.999932947156188e-05, + "loss": 7.5815, + "step": 393 + }, + { + "epoch": 0.0023432296127129126, + "grad_norm": 2.6881701946258545, + "learning_rate": 4.999932604615021e-05, + "loss": 7.959, + "step": 394 + }, + { + "epoch": 0.00234917689599391, + "grad_norm": 2.45609712600708, + "learning_rate": 4.9999322612011534e-05, + "loss": 7.9668, + "step": 395 + }, + { + "epoch": 0.002355124179274907, + "grad_norm": 3.1126747131347656, + "learning_rate": 4.999931916914585e-05, + "loss": 7.774, + "step": 396 + }, + { + "epoch": 0.0023610714625559043, + "grad_norm": 2.806708574295044, + "learning_rate": 4.999931571755316e-05, + "loss": 7.6297, + "step": 397 + }, + { + "epoch": 0.0023670187458369016, + "grad_norm": 3.220013380050659, + "learning_rate": 4.999931225723348e-05, + "loss": 7.3856, + "step": 398 + }, + { + "epoch": 0.002372966029117899, + "grad_norm": 3.0159943103790283, + "learning_rate": 4.9999308788186786e-05, + "loss": 7.3822, + "step": 399 + }, + { + "epoch": 0.002378913312398896, + "grad_norm": 3.1066205501556396, + "learning_rate": 4.9999305310413094e-05, + "loss": 7.3905, + "step": 400 + }, + { + "epoch": 0.0023848605956798934, + "grad_norm": 2.8004367351531982, + "learning_rate": 4.99993018239124e-05, + "loss": 7.8548, + "step": 401 + }, + { + "epoch": 0.0023908078789608906, + "grad_norm": 3.004378318786621, + "learning_rate": 4.999929832868471e-05, + "loss": 7.7846, + "step": 402 + }, + { + "epoch": 0.002396755162241888, + "grad_norm": 3.42901349067688, + "learning_rate": 4.9999294824730025e-05, + "loss": 7.9188, + "step": 403 + }, + { + "epoch": 0.002402702445522885, + "grad_norm": 3.7258527278900146, + "learning_rate": 4.9999291312048343e-05, + "loss": 7.7302, + "step": 404 + }, + { + "epoch": 0.0024086497288038824, + "grad_norm": 4.215145111083984, + "learning_rate": 4.999928779063967e-05, + "loss": 7.6597, + "step": 405 + }, + { + "epoch": 0.0024145970120848796, + "grad_norm": 3.157273769378662, + "learning_rate": 4.9999284260504004e-05, + "loss": 7.7262, + "step": 406 + }, + { + "epoch": 0.002420544295365877, + "grad_norm": 2.9977381229400635, + "learning_rate": 4.999928072164135e-05, + "loss": 7.72, + "step": 407 + }, + { + "epoch": 0.002426491578646874, + "grad_norm": 2.791682720184326, + "learning_rate": 4.9999277174051696e-05, + "loss": 7.8022, + "step": 408 + }, + { + "epoch": 0.0024324388619278714, + "grad_norm": 3.4143035411834717, + "learning_rate": 4.999927361773506e-05, + "loss": 7.5116, + "step": 409 + }, + { + "epoch": 0.0024383861452088687, + "grad_norm": 3.3458821773529053, + "learning_rate": 4.9999270052691425e-05, + "loss": 7.4337, + "step": 410 + }, + { + "epoch": 0.002444333428489866, + "grad_norm": 3.3339595794677734, + "learning_rate": 4.999926647892081e-05, + "loss": 7.7345, + "step": 411 + }, + { + "epoch": 0.002450280711770863, + "grad_norm": 4.285780429840088, + "learning_rate": 4.999926289642321e-05, + "loss": 7.9388, + "step": 412 + }, + { + "epoch": 0.0024562279950518604, + "grad_norm": 3.9473414421081543, + "learning_rate": 4.9999259305198624e-05, + "loss": 7.6038, + "step": 413 + }, + { + "epoch": 0.0024621752783328577, + "grad_norm": 3.504227638244629, + "learning_rate": 4.999925570524706e-05, + "loss": 7.4818, + "step": 414 + }, + { + "epoch": 0.002468122561613855, + "grad_norm": 3.2182157039642334, + "learning_rate": 4.999925209656851e-05, + "loss": 7.3493, + "step": 415 + }, + { + "epoch": 0.002474069844894852, + "grad_norm": 3.1944262981414795, + "learning_rate": 4.999924847916297e-05, + "loss": 7.3646, + "step": 416 + }, + { + "epoch": 0.0024800171281758494, + "grad_norm": 2.957244634628296, + "learning_rate": 4.999924485303047e-05, + "loss": 7.4403, + "step": 417 + }, + { + "epoch": 0.0024859644114568467, + "grad_norm": 2.971285343170166, + "learning_rate": 4.999924121817098e-05, + "loss": 7.7266, + "step": 418 + }, + { + "epoch": 0.002491911694737844, + "grad_norm": 4.029009819030762, + "learning_rate": 4.999923757458451e-05, + "loss": 7.3919, + "step": 419 + }, + { + "epoch": 0.002497858978018841, + "grad_norm": 3.9034767150878906, + "learning_rate": 4.999923392227107e-05, + "loss": 7.2349, + "step": 420 + }, + { + "epoch": 0.002503806261299838, + "grad_norm": 3.23218035697937, + "learning_rate": 4.9999230261230656e-05, + "loss": 7.5146, + "step": 421 + }, + { + "epoch": 0.0025097535445808353, + "grad_norm": 3.193225622177124, + "learning_rate": 4.9999226591463265e-05, + "loss": 7.1699, + "step": 422 + }, + { + "epoch": 0.0025157008278618325, + "grad_norm": 2.9796435832977295, + "learning_rate": 4.999922291296891e-05, + "loss": 7.5719, + "step": 423 + }, + { + "epoch": 0.0025216481111428298, + "grad_norm": 2.6746885776519775, + "learning_rate": 4.999921922574758e-05, + "loss": 7.8086, + "step": 424 + }, + { + "epoch": 0.002527595394423827, + "grad_norm": 3.0622920989990234, + "learning_rate": 4.999921552979928e-05, + "loss": 7.3233, + "step": 425 + }, + { + "epoch": 0.0025335426777048243, + "grad_norm": 3.0908501148223877, + "learning_rate": 4.999921182512402e-05, + "loss": 7.2582, + "step": 426 + }, + { + "epoch": 0.0025394899609858215, + "grad_norm": 2.6913537979125977, + "learning_rate": 4.999920811172178e-05, + "loss": 7.6643, + "step": 427 + }, + { + "epoch": 0.002545437244266819, + "grad_norm": 2.7793848514556885, + "learning_rate": 4.999920438959258e-05, + "loss": 7.9445, + "step": 428 + }, + { + "epoch": 0.002551384527547816, + "grad_norm": 2.741617202758789, + "learning_rate": 4.999920065873642e-05, + "loss": 8.0755, + "step": 429 + }, + { + "epoch": 0.0025573318108288133, + "grad_norm": 2.7102227210998535, + "learning_rate": 4.999919691915329e-05, + "loss": 7.8908, + "step": 430 + }, + { + "epoch": 0.0025632790941098106, + "grad_norm": 2.687788248062134, + "learning_rate": 4.9999193170843206e-05, + "loss": 7.9025, + "step": 431 + }, + { + "epoch": 0.002569226377390808, + "grad_norm": 2.923664093017578, + "learning_rate": 4.999918941380616e-05, + "loss": 7.9331, + "step": 432 + }, + { + "epoch": 0.002575173660671805, + "grad_norm": 2.934735059738159, + "learning_rate": 4.999918564804215e-05, + "loss": 7.722, + "step": 433 + }, + { + "epoch": 0.0025811209439528023, + "grad_norm": 3.8156228065490723, + "learning_rate": 4.999918187355119e-05, + "loss": 7.9392, + "step": 434 + }, + { + "epoch": 0.0025870682272337996, + "grad_norm": 2.333798408508301, + "learning_rate": 4.999917809033327e-05, + "loss": 7.9093, + "step": 435 + }, + { + "epoch": 0.002593015510514797, + "grad_norm": 2.078932046890259, + "learning_rate": 4.99991742983884e-05, + "loss": 7.8484, + "step": 436 + }, + { + "epoch": 0.002598962793795794, + "grad_norm": 2.433375835418701, + "learning_rate": 4.999917049771657e-05, + "loss": 7.9124, + "step": 437 + }, + { + "epoch": 0.0026049100770767913, + "grad_norm": 3.1881024837493896, + "learning_rate": 4.999916668831779e-05, + "loss": 7.3966, + "step": 438 + }, + { + "epoch": 0.0026108573603577886, + "grad_norm": 2.4724855422973633, + "learning_rate": 4.9999162870192065e-05, + "loss": 7.535, + "step": 439 + }, + { + "epoch": 0.002616804643638786, + "grad_norm": 2.8757777214050293, + "learning_rate": 4.999915904333938e-05, + "loss": 7.6728, + "step": 440 + }, + { + "epoch": 0.002622751926919783, + "grad_norm": 3.5439565181732178, + "learning_rate": 4.999915520775975e-05, + "loss": 7.5308, + "step": 441 + }, + { + "epoch": 0.0026286992102007804, + "grad_norm": 2.8345577716827393, + "learning_rate": 4.999915136345318e-05, + "loss": 7.7083, + "step": 442 + }, + { + "epoch": 0.0026346464934817776, + "grad_norm": 3.0842509269714355, + "learning_rate": 4.999914751041965e-05, + "loss": 7.9281, + "step": 443 + }, + { + "epoch": 0.002640593776762775, + "grad_norm": 3.0017757415771484, + "learning_rate": 4.999914364865919e-05, + "loss": 7.4727, + "step": 444 + }, + { + "epoch": 0.002646541060043772, + "grad_norm": 2.637838125228882, + "learning_rate": 4.9999139778171785e-05, + "loss": 7.5284, + "step": 445 + }, + { + "epoch": 0.0026524883433247694, + "grad_norm": 2.7749550342559814, + "learning_rate": 4.999913589895743e-05, + "loss": 7.7006, + "step": 446 + }, + { + "epoch": 0.0026584356266057666, + "grad_norm": 3.1636059284210205, + "learning_rate": 4.9999132011016146e-05, + "loss": 7.6441, + "step": 447 + }, + { + "epoch": 0.002664382909886764, + "grad_norm": 2.623776435852051, + "learning_rate": 4.9999128114347913e-05, + "loss": 7.8027, + "step": 448 + }, + { + "epoch": 0.002670330193167761, + "grad_norm": 2.803612232208252, + "learning_rate": 4.9999124208952755e-05, + "loss": 7.553, + "step": 449 + }, + { + "epoch": 0.0026762774764487584, + "grad_norm": 3.3169047832489014, + "learning_rate": 4.9999120294830656e-05, + "loss": 8.0965, + "step": 450 + }, + { + "epoch": 0.0026822247597297556, + "grad_norm": 3.9928581714630127, + "learning_rate": 4.999911637198161e-05, + "loss": 7.8152, + "step": 451 + }, + { + "epoch": 0.002688172043010753, + "grad_norm": 2.8126320838928223, + "learning_rate": 4.9999112440405646e-05, + "loss": 7.4843, + "step": 452 + }, + { + "epoch": 0.0026941193262917497, + "grad_norm": 2.773427963256836, + "learning_rate": 4.999910850010275e-05, + "loss": 7.7074, + "step": 453 + }, + { + "epoch": 0.002700066609572747, + "grad_norm": 2.8877642154693604, + "learning_rate": 4.999910455107292e-05, + "loss": 7.7764, + "step": 454 + }, + { + "epoch": 0.0027060138928537442, + "grad_norm": 2.6323535442352295, + "learning_rate": 4.9999100593316155e-05, + "loss": 7.7336, + "step": 455 + }, + { + "epoch": 0.0027119611761347415, + "grad_norm": 2.939509153366089, + "learning_rate": 4.9999096626832465e-05, + "loss": 7.8184, + "step": 456 + }, + { + "epoch": 0.0027179084594157387, + "grad_norm": 2.6926229000091553, + "learning_rate": 4.9999092651621855e-05, + "loss": 7.5027, + "step": 457 + }, + { + "epoch": 0.002723855742696736, + "grad_norm": 2.889389991760254, + "learning_rate": 4.999908866768431e-05, + "loss": 7.1138, + "step": 458 + }, + { + "epoch": 0.0027298030259777332, + "grad_norm": 2.951796531677246, + "learning_rate": 4.999908467501985e-05, + "loss": 7.7549, + "step": 459 + }, + { + "epoch": 0.0027357503092587305, + "grad_norm": 2.9076783657073975, + "learning_rate": 4.999908067362847e-05, + "loss": 7.6577, + "step": 460 + }, + { + "epoch": 0.0027416975925397278, + "grad_norm": 3.010636806488037, + "learning_rate": 4.9999076663510155e-05, + "loss": 7.6467, + "step": 461 + }, + { + "epoch": 0.002747644875820725, + "grad_norm": 2.7591371536254883, + "learning_rate": 4.9999072644664935e-05, + "loss": 7.5825, + "step": 462 + }, + { + "epoch": 0.0027535921591017223, + "grad_norm": 2.503632068634033, + "learning_rate": 4.9999068617092795e-05, + "loss": 7.711, + "step": 463 + }, + { + "epoch": 0.0027595394423827195, + "grad_norm": 2.6518661975860596, + "learning_rate": 4.999906458079373e-05, + "loss": 7.557, + "step": 464 + }, + { + "epoch": 0.0027654867256637168, + "grad_norm": 2.6865615844726562, + "learning_rate": 4.9999060535767764e-05, + "loss": 7.5788, + "step": 465 + }, + { + "epoch": 0.002771434008944714, + "grad_norm": 2.715190887451172, + "learning_rate": 4.999905648201487e-05, + "loss": 7.517, + "step": 466 + }, + { + "epoch": 0.0027773812922257113, + "grad_norm": 3.1603381633758545, + "learning_rate": 4.999905241953506e-05, + "loss": 7.6176, + "step": 467 + }, + { + "epoch": 0.0027833285755067085, + "grad_norm": 3.1451528072357178, + "learning_rate": 4.999904834832836e-05, + "loss": 7.6051, + "step": 468 + }, + { + "epoch": 0.002789275858787706, + "grad_norm": 2.5310862064361572, + "learning_rate": 4.9999044268394736e-05, + "loss": 7.6075, + "step": 469 + }, + { + "epoch": 0.002795223142068703, + "grad_norm": 2.9285359382629395, + "learning_rate": 4.99990401797342e-05, + "loss": 7.5399, + "step": 470 + }, + { + "epoch": 0.0028011704253497003, + "grad_norm": 3.2180614471435547, + "learning_rate": 4.9999036082346766e-05, + "loss": 7.6952, + "step": 471 + }, + { + "epoch": 0.0028071177086306976, + "grad_norm": 4.041499614715576, + "learning_rate": 4.9999031976232426e-05, + "loss": 7.841, + "step": 472 + }, + { + "epoch": 0.002813064991911695, + "grad_norm": 3.233492612838745, + "learning_rate": 4.999902786139118e-05, + "loss": 7.5267, + "step": 473 + }, + { + "epoch": 0.002819012275192692, + "grad_norm": 2.7749760150909424, + "learning_rate": 4.9999023737823034e-05, + "loss": 7.3703, + "step": 474 + }, + { + "epoch": 0.0028249595584736893, + "grad_norm": 2.9886162281036377, + "learning_rate": 4.999901960552798e-05, + "loss": 7.4684, + "step": 475 + }, + { + "epoch": 0.0028309068417546866, + "grad_norm": 2.934190511703491, + "learning_rate": 4.999901546450604e-05, + "loss": 7.4432, + "step": 476 + }, + { + "epoch": 0.002836854125035684, + "grad_norm": 3.696247100830078, + "learning_rate": 4.9999011314757196e-05, + "loss": 7.4944, + "step": 477 + }, + { + "epoch": 0.002842801408316681, + "grad_norm": 3.6706700325012207, + "learning_rate": 4.9999007156281454e-05, + "loss": 7.3726, + "step": 478 + }, + { + "epoch": 0.0028487486915976783, + "grad_norm": 3.8638553619384766, + "learning_rate": 4.999900298907881e-05, + "loss": 7.072, + "step": 479 + }, + { + "epoch": 0.0028546959748786756, + "grad_norm": 4.307566165924072, + "learning_rate": 4.999899881314928e-05, + "loss": 6.9371, + "step": 480 + }, + { + "epoch": 0.002860643258159673, + "grad_norm": 3.337372064590454, + "learning_rate": 4.9998994628492854e-05, + "loss": 7.7299, + "step": 481 + }, + { + "epoch": 0.00286659054144067, + "grad_norm": 3.1284921169281006, + "learning_rate": 4.9998990435109535e-05, + "loss": 7.5629, + "step": 482 + }, + { + "epoch": 0.0028725378247216674, + "grad_norm": 3.06904935836792, + "learning_rate": 4.999898623299933e-05, + "loss": 7.5332, + "step": 483 + }, + { + "epoch": 0.002878485108002664, + "grad_norm": 2.985121011734009, + "learning_rate": 4.999898202216224e-05, + "loss": 7.5972, + "step": 484 + }, + { + "epoch": 0.0028844323912836614, + "grad_norm": 2.9188039302825928, + "learning_rate": 4.999897780259827e-05, + "loss": 7.6242, + "step": 485 + }, + { + "epoch": 0.0028903796745646587, + "grad_norm": 3.2263259887695312, + "learning_rate": 4.9998973574307406e-05, + "loss": 7.5746, + "step": 486 + }, + { + "epoch": 0.002896326957845656, + "grad_norm": 2.645188331604004, + "learning_rate": 4.999896933728966e-05, + "loss": 7.6122, + "step": 487 + }, + { + "epoch": 0.002902274241126653, + "grad_norm": 2.89583158493042, + "learning_rate": 4.9998965091545035e-05, + "loss": 7.6157, + "step": 488 + }, + { + "epoch": 0.0029082215244076504, + "grad_norm": 3.6182286739349365, + "learning_rate": 4.9998960837073524e-05, + "loss": 7.4056, + "step": 489 + }, + { + "epoch": 0.0029141688076886477, + "grad_norm": 3.377560615539551, + "learning_rate": 4.9998956573875135e-05, + "loss": 7.4408, + "step": 490 + }, + { + "epoch": 0.002920116090969645, + "grad_norm": 3.0581517219543457, + "learning_rate": 4.9998952301949874e-05, + "loss": 7.5776, + "step": 491 + }, + { + "epoch": 0.002926063374250642, + "grad_norm": 3.5199148654937744, + "learning_rate": 4.999894802129773e-05, + "loss": 7.4747, + "step": 492 + }, + { + "epoch": 0.0029320106575316395, + "grad_norm": 3.866055727005005, + "learning_rate": 4.9998943731918714e-05, + "loss": 7.5985, + "step": 493 + }, + { + "epoch": 0.0029379579408126367, + "grad_norm": 2.856255054473877, + "learning_rate": 4.999893943381283e-05, + "loss": 7.9698, + "step": 494 + }, + { + "epoch": 0.002943905224093634, + "grad_norm": 3.0758626461029053, + "learning_rate": 4.999893512698007e-05, + "loss": 7.6311, + "step": 495 + }, + { + "epoch": 0.0029498525073746312, + "grad_norm": 3.739844560623169, + "learning_rate": 4.999893081142044e-05, + "loss": 7.6829, + "step": 496 + }, + { + "epoch": 0.0029557997906556285, + "grad_norm": 4.025709629058838, + "learning_rate": 4.999892648713394e-05, + "loss": 7.2717, + "step": 497 + }, + { + "epoch": 0.0029617470739366257, + "grad_norm": 3.6604738235473633, + "learning_rate": 4.999892215412057e-05, + "loss": 7.2985, + "step": 498 + }, + { + "epoch": 0.002967694357217623, + "grad_norm": 3.230109930038452, + "learning_rate": 4.999891781238034e-05, + "loss": 8.1041, + "step": 499 + }, + { + "epoch": 0.0029736416404986202, + "grad_norm": 2.5046725273132324, + "learning_rate": 4.999891346191325e-05, + "loss": 8.0888, + "step": 500 + }, + { + "epoch": 0.0029795889237796175, + "grad_norm": 2.916459798812866, + "learning_rate": 4.999890910271929e-05, + "loss": 7.8675, + "step": 501 + }, + { + "epoch": 0.0029855362070606148, + "grad_norm": 2.7806055545806885, + "learning_rate": 4.999890473479848e-05, + "loss": 7.8903, + "step": 502 + }, + { + "epoch": 0.002991483490341612, + "grad_norm": 2.9877662658691406, + "learning_rate": 4.99989003581508e-05, + "loss": 7.473, + "step": 503 + }, + { + "epoch": 0.0029974307736226093, + "grad_norm": 3.1581692695617676, + "learning_rate": 4.999889597277626e-05, + "loss": 7.5654, + "step": 504 + }, + { + "epoch": 0.0030033780569036065, + "grad_norm": 3.102539539337158, + "learning_rate": 4.9998891578674866e-05, + "loss": 7.8865, + "step": 505 + }, + { + "epoch": 0.0030093253401846038, + "grad_norm": 3.0357863903045654, + "learning_rate": 4.999888717584662e-05, + "loss": 7.291, + "step": 506 + }, + { + "epoch": 0.003015272623465601, + "grad_norm": 2.604048252105713, + "learning_rate": 4.999888276429152e-05, + "loss": 7.4892, + "step": 507 + }, + { + "epoch": 0.0030212199067465983, + "grad_norm": 2.734354257583618, + "learning_rate": 4.999887834400957e-05, + "loss": 7.1182, + "step": 508 + }, + { + "epoch": 0.0030271671900275955, + "grad_norm": 2.5255348682403564, + "learning_rate": 4.9998873915000775e-05, + "loss": 7.449, + "step": 509 + }, + { + "epoch": 0.003033114473308593, + "grad_norm": 2.864072322845459, + "learning_rate": 4.999886947726512e-05, + "loss": 7.3213, + "step": 510 + }, + { + "epoch": 0.00303906175658959, + "grad_norm": 2.764187812805176, + "learning_rate": 4.999886503080262e-05, + "loss": 7.337, + "step": 511 + }, + { + "epoch": 0.0030450090398705873, + "grad_norm": 3.5725066661834717, + "learning_rate": 4.9998860575613285e-05, + "loss": 7.8398, + "step": 512 + }, + { + "epoch": 0.0030509563231515846, + "grad_norm": 3.8559648990631104, + "learning_rate": 4.9998856111697096e-05, + "loss": 7.395, + "step": 513 + }, + { + "epoch": 0.003056903606432582, + "grad_norm": 2.9047908782958984, + "learning_rate": 4.999885163905407e-05, + "loss": 7.7016, + "step": 514 + }, + { + "epoch": 0.0030628508897135786, + "grad_norm": 3.1485037803649902, + "learning_rate": 4.99988471576842e-05, + "loss": 6.9411, + "step": 515 + }, + { + "epoch": 0.003068798172994576, + "grad_norm": 3.2763617038726807, + "learning_rate": 4.999884266758749e-05, + "loss": 6.4778, + "step": 516 + }, + { + "epoch": 0.003074745456275573, + "grad_norm": 2.7609500885009766, + "learning_rate": 4.999883816876394e-05, + "loss": 7.0576, + "step": 517 + }, + { + "epoch": 0.0030806927395565704, + "grad_norm": 3.7407751083374023, + "learning_rate": 4.999883366121356e-05, + "loss": 7.7389, + "step": 518 + }, + { + "epoch": 0.0030866400228375676, + "grad_norm": 3.3356568813323975, + "learning_rate": 4.999882914493634e-05, + "loss": 7.7, + "step": 519 + }, + { + "epoch": 0.003092587306118565, + "grad_norm": 2.635594129562378, + "learning_rate": 4.999882461993229e-05, + "loss": 7.6103, + "step": 520 + }, + { + "epoch": 0.003098534589399562, + "grad_norm": 3.7604281902313232, + "learning_rate": 4.9998820086201406e-05, + "loss": 7.6814, + "step": 521 + }, + { + "epoch": 0.0031044818726805594, + "grad_norm": 3.6567211151123047, + "learning_rate": 4.99988155437437e-05, + "loss": 7.6729, + "step": 522 + }, + { + "epoch": 0.0031104291559615567, + "grad_norm": 3.605442523956299, + "learning_rate": 4.999881099255916e-05, + "loss": 7.7464, + "step": 523 + }, + { + "epoch": 0.003116376439242554, + "grad_norm": 3.015500783920288, + "learning_rate": 4.99988064326478e-05, + "loss": 7.5168, + "step": 524 + }, + { + "epoch": 0.003122323722523551, + "grad_norm": 2.9037563800811768, + "learning_rate": 4.9998801864009604e-05, + "loss": 7.7059, + "step": 525 + }, + { + "epoch": 0.0031282710058045484, + "grad_norm": 2.812509059906006, + "learning_rate": 4.999879728664458e-05, + "loss": 7.4178, + "step": 526 + }, + { + "epoch": 0.0031342182890855457, + "grad_norm": 3.340226888656616, + "learning_rate": 4.9998792700552746e-05, + "loss": 7.7872, + "step": 527 + }, + { + "epoch": 0.003140165572366543, + "grad_norm": 3.0951550006866455, + "learning_rate": 4.999878810573409e-05, + "loss": 8.0153, + "step": 528 + }, + { + "epoch": 0.00314611285564754, + "grad_norm": 3.1077651977539062, + "learning_rate": 4.9998783502188616e-05, + "loss": 7.7053, + "step": 529 + }, + { + "epoch": 0.0031520601389285374, + "grad_norm": 3.442451000213623, + "learning_rate": 4.999877888991632e-05, + "loss": 7.5149, + "step": 530 + }, + { + "epoch": 0.0031580074222095347, + "grad_norm": 3.7479207515716553, + "learning_rate": 4.9998774268917215e-05, + "loss": 7.3448, + "step": 531 + }, + { + "epoch": 0.003163954705490532, + "grad_norm": 2.660789966583252, + "learning_rate": 4.999876963919129e-05, + "loss": 7.8348, + "step": 532 + }, + { + "epoch": 0.003169901988771529, + "grad_norm": 2.6255943775177, + "learning_rate": 4.9998765000738556e-05, + "loss": 7.542, + "step": 533 + }, + { + "epoch": 0.0031758492720525265, + "grad_norm": 3.121521472930908, + "learning_rate": 4.9998760353559017e-05, + "loss": 7.46, + "step": 534 + }, + { + "epoch": 0.0031817965553335237, + "grad_norm": 2.958880662918091, + "learning_rate": 4.999875569765266e-05, + "loss": 7.5385, + "step": 535 + }, + { + "epoch": 0.003187743838614521, + "grad_norm": 3.4153661727905273, + "learning_rate": 4.99987510330195e-05, + "loss": 7.4989, + "step": 536 + }, + { + "epoch": 0.0031936911218955182, + "grad_norm": 3.0877597332000732, + "learning_rate": 4.999874635965953e-05, + "loss": 7.5512, + "step": 537 + }, + { + "epoch": 0.0031996384051765155, + "grad_norm": 3.109522581100464, + "learning_rate": 4.9998741677572756e-05, + "loss": 7.4679, + "step": 538 + }, + { + "epoch": 0.0032055856884575127, + "grad_norm": 3.4434239864349365, + "learning_rate": 4.999873698675919e-05, + "loss": 7.0599, + "step": 539 + }, + { + "epoch": 0.00321153297173851, + "grad_norm": 3.83335018157959, + "learning_rate": 4.999873228721882e-05, + "loss": 7.5355, + "step": 540 + }, + { + "epoch": 0.0032174802550195072, + "grad_norm": 3.0679752826690674, + "learning_rate": 4.999872757895164e-05, + "loss": 7.7231, + "step": 541 + }, + { + "epoch": 0.0032234275383005045, + "grad_norm": 3.272196054458618, + "learning_rate": 4.999872286195767e-05, + "loss": 7.6674, + "step": 542 + }, + { + "epoch": 0.0032293748215815017, + "grad_norm": 2.8453965187072754, + "learning_rate": 4.9998718136236897e-05, + "loss": 7.4451, + "step": 543 + }, + { + "epoch": 0.003235322104862499, + "grad_norm": 3.074399709701538, + "learning_rate": 4.999871340178934e-05, + "loss": 7.6011, + "step": 544 + }, + { + "epoch": 0.0032412693881434963, + "grad_norm": 3.173004150390625, + "learning_rate": 4.999870865861499e-05, + "loss": 7.5268, + "step": 545 + }, + { + "epoch": 0.003247216671424493, + "grad_norm": 2.820848226547241, + "learning_rate": 4.999870390671384e-05, + "loss": 7.9872, + "step": 546 + }, + { + "epoch": 0.0032531639547054903, + "grad_norm": 2.692702293395996, + "learning_rate": 4.9998699146085906e-05, + "loss": 7.4676, + "step": 547 + }, + { + "epoch": 0.0032591112379864876, + "grad_norm": 2.2766902446746826, + "learning_rate": 4.999869437673119e-05, + "loss": 7.3826, + "step": 548 + }, + { + "epoch": 0.003265058521267485, + "grad_norm": 2.1190011501312256, + "learning_rate": 4.9998689598649686e-05, + "loss": 7.4767, + "step": 549 + }, + { + "epoch": 0.003271005804548482, + "grad_norm": 2.687633514404297, + "learning_rate": 4.999868481184139e-05, + "loss": 7.9922, + "step": 550 + }, + { + "epoch": 0.0032769530878294794, + "grad_norm": 3.403298854827881, + "learning_rate": 4.999868001630632e-05, + "loss": 7.8035, + "step": 551 + }, + { + "epoch": 0.0032829003711104766, + "grad_norm": 3.074881076812744, + "learning_rate": 4.999867521204446e-05, + "loss": 7.7106, + "step": 552 + }, + { + "epoch": 0.003288847654391474, + "grad_norm": 3.28725004196167, + "learning_rate": 4.9998670399055827e-05, + "loss": 7.4661, + "step": 553 + }, + { + "epoch": 0.003294794937672471, + "grad_norm": 3.8624775409698486, + "learning_rate": 4.999866557734041e-05, + "loss": 7.7156, + "step": 554 + }, + { + "epoch": 0.0033007422209534684, + "grad_norm": 2.53586745262146, + "learning_rate": 4.999866074689823e-05, + "loss": 7.945, + "step": 555 + }, + { + "epoch": 0.0033066895042344656, + "grad_norm": 3.8261072635650635, + "learning_rate": 4.9998655907729265e-05, + "loss": 8.0446, + "step": 556 + }, + { + "epoch": 0.003312636787515463, + "grad_norm": 2.7173407077789307, + "learning_rate": 4.999865105983353e-05, + "loss": 7.8363, + "step": 557 + }, + { + "epoch": 0.00331858407079646, + "grad_norm": 4.68424654006958, + "learning_rate": 4.999864620321102e-05, + "loss": 7.667, + "step": 558 + }, + { + "epoch": 0.0033245313540774574, + "grad_norm": 2.8763632774353027, + "learning_rate": 4.999864133786175e-05, + "loss": 7.6133, + "step": 559 + }, + { + "epoch": 0.0033304786373584546, + "grad_norm": 3.0986382961273193, + "learning_rate": 4.9998636463785705e-05, + "loss": 7.6257, + "step": 560 + }, + { + "epoch": 0.003336425920639452, + "grad_norm": 2.6826348304748535, + "learning_rate": 4.9998631580982905e-05, + "loss": 7.5187, + "step": 561 + }, + { + "epoch": 0.003342373203920449, + "grad_norm": 2.2172515392303467, + "learning_rate": 4.9998626689453334e-05, + "loss": 7.961, + "step": 562 + }, + { + "epoch": 0.0033483204872014464, + "grad_norm": 2.6083858013153076, + "learning_rate": 4.9998621789197e-05, + "loss": 7.7887, + "step": 563 + }, + { + "epoch": 0.0033542677704824437, + "grad_norm": 3.6838009357452393, + "learning_rate": 4.99986168802139e-05, + "loss": 7.4945, + "step": 564 + }, + { + "epoch": 0.003360215053763441, + "grad_norm": 3.2091991901397705, + "learning_rate": 4.999861196250405e-05, + "loss": 7.4243, + "step": 565 + }, + { + "epoch": 0.003366162337044438, + "grad_norm": 3.142982244491577, + "learning_rate": 4.9998607036067434e-05, + "loss": 7.4684, + "step": 566 + }, + { + "epoch": 0.0033721096203254354, + "grad_norm": 3.7751007080078125, + "learning_rate": 4.9998602100904065e-05, + "loss": 7.3722, + "step": 567 + }, + { + "epoch": 0.0033780569036064327, + "grad_norm": 3.276843547821045, + "learning_rate": 4.9998597157013946e-05, + "loss": 7.4012, + "step": 568 + }, + { + "epoch": 0.00338400418688743, + "grad_norm": 2.840106725692749, + "learning_rate": 4.999859220439708e-05, + "loss": 7.4013, + "step": 569 + }, + { + "epoch": 0.003389951470168427, + "grad_norm": 2.7816810607910156, + "learning_rate": 4.999858724305346e-05, + "loss": 7.3136, + "step": 570 + }, + { + "epoch": 0.0033958987534494244, + "grad_norm": 4.523340225219727, + "learning_rate": 4.999858227298308e-05, + "loss": 7.0553, + "step": 571 + }, + { + "epoch": 0.0034018460367304217, + "grad_norm": 3.9653191566467285, + "learning_rate": 4.9998577294185964e-05, + "loss": 7.1907, + "step": 572 + }, + { + "epoch": 0.003407793320011419, + "grad_norm": 3.243089199066162, + "learning_rate": 4.999857230666211e-05, + "loss": 7.0749, + "step": 573 + }, + { + "epoch": 0.003413740603292416, + "grad_norm": 3.3622777462005615, + "learning_rate": 4.99985673104115e-05, + "loss": 7.0005, + "step": 574 + }, + { + "epoch": 0.0034196878865734135, + "grad_norm": 2.561732292175293, + "learning_rate": 4.9998562305434154e-05, + "loss": 7.271, + "step": 575 + }, + { + "epoch": 0.0034256351698544107, + "grad_norm": 3.1846745014190674, + "learning_rate": 4.999855729173006e-05, + "loss": 7.7333, + "step": 576 + }, + { + "epoch": 0.0034315824531354075, + "grad_norm": 3.0318918228149414, + "learning_rate": 4.999855226929924e-05, + "loss": 7.5535, + "step": 577 + }, + { + "epoch": 0.003437529736416405, + "grad_norm": 2.993086099624634, + "learning_rate": 4.999854723814168e-05, + "loss": 7.6272, + "step": 578 + }, + { + "epoch": 0.003443477019697402, + "grad_norm": 2.8511712551116943, + "learning_rate": 4.999854219825738e-05, + "loss": 7.6619, + "step": 579 + }, + { + "epoch": 0.0034494243029783993, + "grad_norm": 2.6181185245513916, + "learning_rate": 4.9998537149646355e-05, + "loss": 7.7452, + "step": 580 + }, + { + "epoch": 0.0034553715862593965, + "grad_norm": 2.9932363033294678, + "learning_rate": 4.9998532092308593e-05, + "loss": 7.7475, + "step": 581 + }, + { + "epoch": 0.003461318869540394, + "grad_norm": 3.541944742202759, + "learning_rate": 4.99985270262441e-05, + "loss": 7.5808, + "step": 582 + }, + { + "epoch": 0.003467266152821391, + "grad_norm": 2.780372381210327, + "learning_rate": 4.9998521951452895e-05, + "loss": 7.8167, + "step": 583 + }, + { + "epoch": 0.0034732134361023883, + "grad_norm": 2.9156363010406494, + "learning_rate": 4.9998516867934945e-05, + "loss": 7.74, + "step": 584 + }, + { + "epoch": 0.0034791607193833856, + "grad_norm": 3.9492485523223877, + "learning_rate": 4.9998511775690285e-05, + "loss": 7.1128, + "step": 585 + }, + { + "epoch": 0.003485108002664383, + "grad_norm": 2.8288252353668213, + "learning_rate": 4.9998506674718896e-05, + "loss": 7.4884, + "step": 586 + }, + { + "epoch": 0.00349105528594538, + "grad_norm": 2.8906798362731934, + "learning_rate": 4.999850156502078e-05, + "loss": 7.6378, + "step": 587 + }, + { + "epoch": 0.0034970025692263773, + "grad_norm": 2.8806405067443848, + "learning_rate": 4.9998496446595955e-05, + "loss": 7.4641, + "step": 588 + }, + { + "epoch": 0.0035029498525073746, + "grad_norm": 3.1794772148132324, + "learning_rate": 4.999849131944441e-05, + "loss": 7.1633, + "step": 589 + }, + { + "epoch": 0.003508897135788372, + "grad_norm": 2.886009454727173, + "learning_rate": 4.999848618356615e-05, + "loss": 7.1793, + "step": 590 + }, + { + "epoch": 0.003514844419069369, + "grad_norm": 2.76184344291687, + "learning_rate": 4.999848103896118e-05, + "loss": 7.1377, + "step": 591 + }, + { + "epoch": 0.0035207917023503663, + "grad_norm": 3.127793788909912, + "learning_rate": 4.999847588562949e-05, + "loss": 7.2793, + "step": 592 + }, + { + "epoch": 0.0035267389856313636, + "grad_norm": 3.7768073081970215, + "learning_rate": 4.99984707235711e-05, + "loss": 7.8203, + "step": 593 + }, + { + "epoch": 0.003532686268912361, + "grad_norm": 3.1750540733337402, + "learning_rate": 4.9998465552786e-05, + "loss": 7.7078, + "step": 594 + }, + { + "epoch": 0.003538633552193358, + "grad_norm": 2.8884522914886475, + "learning_rate": 4.999846037327419e-05, + "loss": 7.6864, + "step": 595 + }, + { + "epoch": 0.0035445808354743554, + "grad_norm": 2.783928394317627, + "learning_rate": 4.999845518503568e-05, + "loss": 7.7329, + "step": 596 + }, + { + "epoch": 0.0035505281187553526, + "grad_norm": 2.8093652725219727, + "learning_rate": 4.9998449988070465e-05, + "loss": 7.7157, + "step": 597 + }, + { + "epoch": 0.00355647540203635, + "grad_norm": 2.54380464553833, + "learning_rate": 4.999844478237855e-05, + "loss": 7.6353, + "step": 598 + }, + { + "epoch": 0.003562422685317347, + "grad_norm": 3.478878974914551, + "learning_rate": 4.999843956795993e-05, + "loss": 7.4221, + "step": 599 + }, + { + "epoch": 0.0035683699685983444, + "grad_norm": 3.882807493209839, + "learning_rate": 4.999843434481463e-05, + "loss": 7.4857, + "step": 600 + }, + { + "epoch": 0.0035743172518793416, + "grad_norm": 3.0975584983825684, + "learning_rate": 4.999842911294261e-05, + "loss": 7.5121, + "step": 601 + }, + { + "epoch": 0.003580264535160339, + "grad_norm": 3.1857712268829346, + "learning_rate": 4.999842387234391e-05, + "loss": 7.4469, + "step": 602 + }, + { + "epoch": 0.003586211818441336, + "grad_norm": 2.892927885055542, + "learning_rate": 4.999841862301853e-05, + "loss": 7.4047, + "step": 603 + }, + { + "epoch": 0.0035921591017223334, + "grad_norm": 4.186185359954834, + "learning_rate": 4.999841336496645e-05, + "loss": 7.5146, + "step": 604 + }, + { + "epoch": 0.0035981063850033307, + "grad_norm": 3.27422833442688, + "learning_rate": 4.9998408098187674e-05, + "loss": 7.3347, + "step": 605 + }, + { + "epoch": 0.003604053668284328, + "grad_norm": 4.817208290100098, + "learning_rate": 4.9998402822682225e-05, + "loss": 7.9883, + "step": 606 + }, + { + "epoch": 0.003610000951565325, + "grad_norm": 5.903015613555908, + "learning_rate": 4.999839753845008e-05, + "loss": 7.9043, + "step": 607 + }, + { + "epoch": 0.0036159482348463224, + "grad_norm": 4.720086574554443, + "learning_rate": 4.999839224549127e-05, + "loss": 7.8456, + "step": 608 + }, + { + "epoch": 0.0036218955181273192, + "grad_norm": 4.518443584442139, + "learning_rate": 4.9998386943805764e-05, + "loss": 7.3659, + "step": 609 + }, + { + "epoch": 0.0036278428014083165, + "grad_norm": 2.621833086013794, + "learning_rate": 4.999838163339358e-05, + "loss": 8.0512, + "step": 610 + }, + { + "epoch": 0.0036337900846893137, + "grad_norm": 4.015076160430908, + "learning_rate": 4.9998376314254726e-05, + "loss": 7.8581, + "step": 611 + }, + { + "epoch": 0.003639737367970311, + "grad_norm": 3.8145275115966797, + "learning_rate": 4.999837098638919e-05, + "loss": 7.4288, + "step": 612 + }, + { + "epoch": 0.0036456846512513083, + "grad_norm": 3.396488904953003, + "learning_rate": 4.9998365649796985e-05, + "loss": 7.7812, + "step": 613 + }, + { + "epoch": 0.0036516319345323055, + "grad_norm": 2.931187391281128, + "learning_rate": 4.999836030447811e-05, + "loss": 7.5898, + "step": 614 + }, + { + "epoch": 0.0036575792178133028, + "grad_norm": 2.6349267959594727, + "learning_rate": 4.999835495043257e-05, + "loss": 7.5345, + "step": 615 + }, + { + "epoch": 0.0036635265010943, + "grad_norm": 3.014085531234741, + "learning_rate": 4.999834958766035e-05, + "loss": 7.5985, + "step": 616 + }, + { + "epoch": 0.0036694737843752973, + "grad_norm": 2.971475124359131, + "learning_rate": 4.999834421616147e-05, + "loss": 7.589, + "step": 617 + }, + { + "epoch": 0.0036754210676562945, + "grad_norm": 3.867366075515747, + "learning_rate": 4.999833883593593e-05, + "loss": 7.4026, + "step": 618 + }, + { + "epoch": 0.0036813683509372918, + "grad_norm": 2.3917908668518066, + "learning_rate": 4.9998333446983734e-05, + "loss": 7.4361, + "step": 619 + }, + { + "epoch": 0.003687315634218289, + "grad_norm": 4.583080768585205, + "learning_rate": 4.999832804930487e-05, + "loss": 7.5525, + "step": 620 + }, + { + "epoch": 0.0036932629174992863, + "grad_norm": 2.6039721965789795, + "learning_rate": 4.999832264289934e-05, + "loss": 7.636, + "step": 621 + }, + { + "epoch": 0.0036992102007802835, + "grad_norm": 4.123409748077393, + "learning_rate": 4.9998317227767165e-05, + "loss": 7.7803, + "step": 622 + }, + { + "epoch": 0.003705157484061281, + "grad_norm": 4.220766544342041, + "learning_rate": 4.999831180390834e-05, + "loss": 7.8086, + "step": 623 + }, + { + "epoch": 0.003711104767342278, + "grad_norm": 3.0759594440460205, + "learning_rate": 4.999830637132285e-05, + "loss": 7.4815, + "step": 624 + }, + { + "epoch": 0.0037170520506232753, + "grad_norm": 2.7870442867279053, + "learning_rate": 4.999830093001071e-05, + "loss": 7.3925, + "step": 625 + }, + { + "epoch": 0.0037229993339042726, + "grad_norm": 2.5292582511901855, + "learning_rate": 4.999829547997193e-05, + "loss": 7.2049, + "step": 626 + }, + { + "epoch": 0.00372894661718527, + "grad_norm": 2.5836963653564453, + "learning_rate": 4.99982900212065e-05, + "loss": 7.2858, + "step": 627 + }, + { + "epoch": 0.003734893900466267, + "grad_norm": 2.6433279514312744, + "learning_rate": 4.9998284553714425e-05, + "loss": 7.5894, + "step": 628 + }, + { + "epoch": 0.0037408411837472643, + "grad_norm": 3.1093215942382812, + "learning_rate": 4.999827907749571e-05, + "loss": 7.2859, + "step": 629 + }, + { + "epoch": 0.0037467884670282616, + "grad_norm": 2.313305616378784, + "learning_rate": 4.9998273592550346e-05, + "loss": 7.6275, + "step": 630 + }, + { + "epoch": 0.003752735750309259, + "grad_norm": 3.7002785205841064, + "learning_rate": 4.9998268098878355e-05, + "loss": 7.7068, + "step": 631 + }, + { + "epoch": 0.003758683033590256, + "grad_norm": 3.090707778930664, + "learning_rate": 4.9998262596479715e-05, + "loss": 7.7304, + "step": 632 + }, + { + "epoch": 0.0037646303168712533, + "grad_norm": 2.425614833831787, + "learning_rate": 4.999825708535445e-05, + "loss": 7.927, + "step": 633 + }, + { + "epoch": 0.0037705776001522506, + "grad_norm": 2.1477420330047607, + "learning_rate": 4.999825156550254e-05, + "loss": 8.1082, + "step": 634 + }, + { + "epoch": 0.003776524883433248, + "grad_norm": 2.434638738632202, + "learning_rate": 4.999824603692401e-05, + "loss": 7.8808, + "step": 635 + }, + { + "epoch": 0.003782472166714245, + "grad_norm": 2.563283681869507, + "learning_rate": 4.999824049961884e-05, + "loss": 7.8515, + "step": 636 + }, + { + "epoch": 0.0037884194499952424, + "grad_norm": 2.6878623962402344, + "learning_rate": 4.9998234953587054e-05, + "loss": 7.6393, + "step": 637 + }, + { + "epoch": 0.0037943667332762396, + "grad_norm": 2.6270666122436523, + "learning_rate": 4.999822939882863e-05, + "loss": 7.8246, + "step": 638 + }, + { + "epoch": 0.003800314016557237, + "grad_norm": 3.300494909286499, + "learning_rate": 4.9998223835343596e-05, + "loss": 7.4991, + "step": 639 + }, + { + "epoch": 0.0038062612998382337, + "grad_norm": 2.726902723312378, + "learning_rate": 4.9998218263131925e-05, + "loss": 7.6663, + "step": 640 + }, + { + "epoch": 0.003812208583119231, + "grad_norm": 2.8147871494293213, + "learning_rate": 4.9998212682193645e-05, + "loss": 7.5272, + "step": 641 + }, + { + "epoch": 0.003818155866400228, + "grad_norm": 2.324422597885132, + "learning_rate": 4.9998207092528745e-05, + "loss": 7.6577, + "step": 642 + }, + { + "epoch": 0.0038241031496812255, + "grad_norm": 2.4525058269500732, + "learning_rate": 4.999820149413723e-05, + "loss": 7.6793, + "step": 643 + }, + { + "epoch": 0.0038300504329622227, + "grad_norm": 2.4011337757110596, + "learning_rate": 4.9998195887019094e-05, + "loss": 7.4869, + "step": 644 + }, + { + "epoch": 0.00383599771624322, + "grad_norm": 2.3403005599975586, + "learning_rate": 4.9998190271174364e-05, + "loss": 7.9552, + "step": 645 + }, + { + "epoch": 0.003841944999524217, + "grad_norm": 2.1421074867248535, + "learning_rate": 4.9998184646603005e-05, + "loss": 7.4021, + "step": 646 + }, + { + "epoch": 0.0038478922828052145, + "grad_norm": 2.4157450199127197, + "learning_rate": 4.9998179013305046e-05, + "loss": 7.6666, + "step": 647 + }, + { + "epoch": 0.0038538395660862117, + "grad_norm": 2.737692356109619, + "learning_rate": 4.999817337128048e-05, + "loss": 7.7441, + "step": 648 + }, + { + "epoch": 0.003859786849367209, + "grad_norm": 3.2240428924560547, + "learning_rate": 4.999816772052931e-05, + "loss": 7.5691, + "step": 649 + }, + { + "epoch": 0.0038657341326482062, + "grad_norm": 2.8538997173309326, + "learning_rate": 4.9998162061051534e-05, + "loss": 7.4994, + "step": 650 + }, + { + "epoch": 0.0038716814159292035, + "grad_norm": 2.6562373638153076, + "learning_rate": 4.9998156392847164e-05, + "loss": 7.5156, + "step": 651 + }, + { + "epoch": 0.0038776286992102007, + "grad_norm": 2.5513811111450195, + "learning_rate": 4.999815071591619e-05, + "loss": 7.6503, + "step": 652 + }, + { + "epoch": 0.003883575982491198, + "grad_norm": 2.4196572303771973, + "learning_rate": 4.999814503025863e-05, + "loss": 7.9868, + "step": 653 + }, + { + "epoch": 0.0038895232657721952, + "grad_norm": 3.0201921463012695, + "learning_rate": 4.999813933587447e-05, + "loss": 7.5405, + "step": 654 + }, + { + "epoch": 0.0038954705490531925, + "grad_norm": 2.352625846862793, + "learning_rate": 4.9998133632763714e-05, + "loss": 7.5461, + "step": 655 + }, + { + "epoch": 0.0039014178323341898, + "grad_norm": 2.5318710803985596, + "learning_rate": 4.999812792092637e-05, + "loss": 7.5596, + "step": 656 + }, + { + "epoch": 0.003907365115615187, + "grad_norm": 2.710785388946533, + "learning_rate": 4.9998122200362444e-05, + "loss": 7.4828, + "step": 657 + }, + { + "epoch": 0.003913312398896184, + "grad_norm": 2.7441353797912598, + "learning_rate": 4.999811647107192e-05, + "loss": 7.2496, + "step": 658 + }, + { + "epoch": 0.0039192596821771815, + "grad_norm": 2.4602885246276855, + "learning_rate": 4.9998110733054824e-05, + "loss": 7.6134, + "step": 659 + }, + { + "epoch": 0.003925206965458178, + "grad_norm": 2.6842973232269287, + "learning_rate": 4.999810498631114e-05, + "loss": 7.3544, + "step": 660 + }, + { + "epoch": 0.003931154248739176, + "grad_norm": 2.8062961101531982, + "learning_rate": 4.9998099230840875e-05, + "loss": 7.5162, + "step": 661 + }, + { + "epoch": 0.003937101532020173, + "grad_norm": 4.0753679275512695, + "learning_rate": 4.9998093466644036e-05, + "loss": 7.5241, + "step": 662 + }, + { + "epoch": 0.0039430488153011705, + "grad_norm": 3.0165748596191406, + "learning_rate": 4.999808769372061e-05, + "loss": 7.5313, + "step": 663 + }, + { + "epoch": 0.003948996098582167, + "grad_norm": 2.73825740814209, + "learning_rate": 4.9998081912070623e-05, + "loss": 7.4433, + "step": 664 + }, + { + "epoch": 0.003954943381863165, + "grad_norm": 2.6649749279022217, + "learning_rate": 4.9998076121694056e-05, + "loss": 7.4852, + "step": 665 + }, + { + "epoch": 0.003960890665144162, + "grad_norm": 2.609389066696167, + "learning_rate": 4.999807032259092e-05, + "loss": 7.4127, + "step": 666 + }, + { + "epoch": 0.0039668379484251596, + "grad_norm": 2.50502610206604, + "learning_rate": 4.999806451476122e-05, + "loss": 7.3113, + "step": 667 + }, + { + "epoch": 0.003972785231706156, + "grad_norm": 2.565142869949341, + "learning_rate": 4.999805869820495e-05, + "loss": 7.1875, + "step": 668 + }, + { + "epoch": 0.003978732514987154, + "grad_norm": 2.582742214202881, + "learning_rate": 4.9998052872922117e-05, + "loss": 7.3251, + "step": 669 + }, + { + "epoch": 0.003984679798268151, + "grad_norm": 2.718780279159546, + "learning_rate": 4.999804703891272e-05, + "loss": 7.3599, + "step": 670 + }, + { + "epoch": 0.003990627081549149, + "grad_norm": 2.5971410274505615, + "learning_rate": 4.999804119617677e-05, + "loss": 7.2304, + "step": 671 + }, + { + "epoch": 0.003996574364830145, + "grad_norm": 2.5905725955963135, + "learning_rate": 4.9998035344714255e-05, + "loss": 7.3664, + "step": 672 + }, + { + "epoch": 0.004002521648111143, + "grad_norm": 2.659102439880371, + "learning_rate": 4.999802948452519e-05, + "loss": 7.4296, + "step": 673 + }, + { + "epoch": 0.00400846893139214, + "grad_norm": 2.5933544635772705, + "learning_rate": 4.999802361560957e-05, + "loss": 7.4605, + "step": 674 + }, + { + "epoch": 0.004014416214673138, + "grad_norm": 3.3860044479370117, + "learning_rate": 4.999801773796739e-05, + "loss": 7.5159, + "step": 675 + }, + { + "epoch": 0.004020363497954134, + "grad_norm": 3.742635726928711, + "learning_rate": 4.9998011851598666e-05, + "loss": 7.4988, + "step": 676 + }, + { + "epoch": 0.004026310781235132, + "grad_norm": 3.5960240364074707, + "learning_rate": 4.999800595650339e-05, + "loss": 7.4607, + "step": 677 + }, + { + "epoch": 0.004032258064516129, + "grad_norm": 2.654444694519043, + "learning_rate": 4.9998000052681585e-05, + "loss": 7.2166, + "step": 678 + }, + { + "epoch": 0.004038205347797127, + "grad_norm": 2.4538326263427734, + "learning_rate": 4.999799414013322e-05, + "loss": 7.2334, + "step": 679 + }, + { + "epoch": 0.004044152631078123, + "grad_norm": 2.5899672508239746, + "learning_rate": 4.9997988218858316e-05, + "loss": 7.2754, + "step": 680 + }, + { + "epoch": 0.004050099914359121, + "grad_norm": 2.721224069595337, + "learning_rate": 4.999798228885687e-05, + "loss": 7.188, + "step": 681 + }, + { + "epoch": 0.004056047197640118, + "grad_norm": 6.5863189697265625, + "learning_rate": 4.9997976350128894e-05, + "loss": 7.369, + "step": 682 + }, + { + "epoch": 0.004061994480921116, + "grad_norm": 2.6562674045562744, + "learning_rate": 4.999797040267438e-05, + "loss": 7.176, + "step": 683 + }, + { + "epoch": 0.0040679417642021124, + "grad_norm": 2.503666877746582, + "learning_rate": 4.9997964446493326e-05, + "loss": 7.2765, + "step": 684 + }, + { + "epoch": 0.00407388904748311, + "grad_norm": 9.070426940917969, + "learning_rate": 4.9997958481585756e-05, + "loss": 7.5187, + "step": 685 + }, + { + "epoch": 0.004079836330764107, + "grad_norm": 2.7480480670928955, + "learning_rate": 4.9997952507951645e-05, + "loss": 7.5244, + "step": 686 + }, + { + "epoch": 0.004085783614045104, + "grad_norm": 3.8338348865509033, + "learning_rate": 4.999794652559101e-05, + "loss": 7.6672, + "step": 687 + }, + { + "epoch": 0.0040917308973261015, + "grad_norm": 3.1132454872131348, + "learning_rate": 4.999794053450385e-05, + "loss": 7.9594, + "step": 688 + }, + { + "epoch": 0.004097678180607098, + "grad_norm": 2.6279757022857666, + "learning_rate": 4.999793453469017e-05, + "loss": 7.4737, + "step": 689 + }, + { + "epoch": 0.004103625463888096, + "grad_norm": 3.440145492553711, + "learning_rate": 4.9997928526149966e-05, + "loss": 7.2968, + "step": 690 + }, + { + "epoch": 0.004109572747169093, + "grad_norm": 2.3300867080688477, + "learning_rate": 4.9997922508883244e-05, + "loss": 7.3693, + "step": 691 + }, + { + "epoch": 0.0041155200304500905, + "grad_norm": 2.9034078121185303, + "learning_rate": 4.999791648289001e-05, + "loss": 7.7227, + "step": 692 + }, + { + "epoch": 0.004121467313731087, + "grad_norm": 2.5685503482818604, + "learning_rate": 4.9997910448170254e-05, + "loss": 7.9706, + "step": 693 + }, + { + "epoch": 0.004127414597012085, + "grad_norm": 3.260779619216919, + "learning_rate": 4.9997904404723986e-05, + "loss": 7.7231, + "step": 694 + }, + { + "epoch": 0.004133361880293082, + "grad_norm": 2.668193817138672, + "learning_rate": 4.999789835255121e-05, + "loss": 7.7677, + "step": 695 + }, + { + "epoch": 0.0041393091635740795, + "grad_norm": 2.545276641845703, + "learning_rate": 4.999789229165193e-05, + "loss": 7.9297, + "step": 696 + }, + { + "epoch": 0.004145256446855076, + "grad_norm": 3.2137503623962402, + "learning_rate": 4.9997886222026146e-05, + "loss": 7.697, + "step": 697 + }, + { + "epoch": 0.004151203730136074, + "grad_norm": 2.7501730918884277, + "learning_rate": 4.999788014367385e-05, + "loss": 7.3686, + "step": 698 + }, + { + "epoch": 0.004157151013417071, + "grad_norm": 2.2456486225128174, + "learning_rate": 4.9997874056595055e-05, + "loss": 7.7238, + "step": 699 + }, + { + "epoch": 0.0041630982966980685, + "grad_norm": 2.3958070278167725, + "learning_rate": 4.9997867960789764e-05, + "loss": 7.8349, + "step": 700 + }, + { + "epoch": 0.004169045579979065, + "grad_norm": 2.509744644165039, + "learning_rate": 4.9997861856257974e-05, + "loss": 7.5884, + "step": 701 + }, + { + "epoch": 0.004174992863260063, + "grad_norm": 3.6095783710479736, + "learning_rate": 4.9997855742999684e-05, + "loss": 7.4726, + "step": 702 + }, + { + "epoch": 0.00418094014654106, + "grad_norm": 3.3515326976776123, + "learning_rate": 4.99978496210149e-05, + "loss": 7.5214, + "step": 703 + }, + { + "epoch": 0.0041868874298220575, + "grad_norm": 4.7553791999816895, + "learning_rate": 4.999784349030363e-05, + "loss": 7.4577, + "step": 704 + }, + { + "epoch": 0.004192834713103054, + "grad_norm": 5.959117412567139, + "learning_rate": 4.9997837350865874e-05, + "loss": 7.2559, + "step": 705 + }, + { + "epoch": 0.004198781996384052, + "grad_norm": 2.9650065898895264, + "learning_rate": 4.999783120270163e-05, + "loss": 7.3712, + "step": 706 + }, + { + "epoch": 0.004204729279665049, + "grad_norm": 3.4171416759490967, + "learning_rate": 4.9997825045810895e-05, + "loss": 7.5014, + "step": 707 + }, + { + "epoch": 0.0042106765629460466, + "grad_norm": 3.297393798828125, + "learning_rate": 4.9997818880193684e-05, + "loss": 7.4553, + "step": 708 + }, + { + "epoch": 0.004216623846227043, + "grad_norm": 3.193859338760376, + "learning_rate": 4.999781270584999e-05, + "loss": 7.3414, + "step": 709 + }, + { + "epoch": 0.004222571129508041, + "grad_norm": 2.5028324127197266, + "learning_rate": 4.999780652277982e-05, + "loss": 7.4615, + "step": 710 + }, + { + "epoch": 0.004228518412789038, + "grad_norm": 3.43390154838562, + "learning_rate": 4.999780033098317e-05, + "loss": 7.3801, + "step": 711 + }, + { + "epoch": 0.004234465696070036, + "grad_norm": 3.3093984127044678, + "learning_rate": 4.999779413046004e-05, + "loss": 7.2938, + "step": 712 + }, + { + "epoch": 0.004240412979351032, + "grad_norm": 2.6643831729888916, + "learning_rate": 4.999778792121046e-05, + "loss": 7.3916, + "step": 713 + }, + { + "epoch": 0.00424636026263203, + "grad_norm": 2.779407501220703, + "learning_rate": 4.999778170323439e-05, + "loss": 7.5783, + "step": 714 + }, + { + "epoch": 0.004252307545913027, + "grad_norm": 2.959345817565918, + "learning_rate": 4.999777547653186e-05, + "loss": 7.9854, + "step": 715 + }, + { + "epoch": 0.004258254829194025, + "grad_norm": 2.9909780025482178, + "learning_rate": 4.9997769241102866e-05, + "loss": 7.997, + "step": 716 + }, + { + "epoch": 0.004264202112475021, + "grad_norm": 3.081831932067871, + "learning_rate": 4.9997762996947405e-05, + "loss": 7.9393, + "step": 717 + }, + { + "epoch": 0.004270149395756018, + "grad_norm": 2.8901429176330566, + "learning_rate": 4.9997756744065485e-05, + "loss": 7.8152, + "step": 718 + }, + { + "epoch": 0.004276096679037016, + "grad_norm": 3.3065547943115234, + "learning_rate": 4.9997750482457106e-05, + "loss": 7.1176, + "step": 719 + }, + { + "epoch": 0.004282043962318013, + "grad_norm": 3.1083710193634033, + "learning_rate": 4.9997744212122276e-05, + "loss": 7.6215, + "step": 720 + }, + { + "epoch": 0.00428799124559901, + "grad_norm": 4.010551452636719, + "learning_rate": 4.9997737933060987e-05, + "loss": 7.7665, + "step": 721 + }, + { + "epoch": 0.004293938528880007, + "grad_norm": 3.9287984371185303, + "learning_rate": 4.9997731645273245e-05, + "loss": 7.7185, + "step": 722 + }, + { + "epoch": 0.004299885812161005, + "grad_norm": 2.7739338874816895, + "learning_rate": 4.999772534875905e-05, + "loss": 7.7226, + "step": 723 + }, + { + "epoch": 0.004305833095442002, + "grad_norm": 2.675567865371704, + "learning_rate": 4.9997719043518414e-05, + "loss": 7.686, + "step": 724 + }, + { + "epoch": 0.0043117803787229994, + "grad_norm": 3.8513898849487305, + "learning_rate": 4.999771272955133e-05, + "loss": 7.6584, + "step": 725 + }, + { + "epoch": 0.004317727662003996, + "grad_norm": 10.309504508972168, + "learning_rate": 4.99977064068578e-05, + "loss": 7.4006, + "step": 726 + }, + { + "epoch": 0.004323674945284994, + "grad_norm": 2.712939977645874, + "learning_rate": 4.9997700075437836e-05, + "loss": 7.6275, + "step": 727 + }, + { + "epoch": 0.004329622228565991, + "grad_norm": 2.7880115509033203, + "learning_rate": 4.999769373529143e-05, + "loss": 7.4154, + "step": 728 + }, + { + "epoch": 0.0043355695118469885, + "grad_norm": 3.2352819442749023, + "learning_rate": 4.999768738641859e-05, + "loss": 7.4827, + "step": 729 + }, + { + "epoch": 0.004341516795127985, + "grad_norm": 3.5176644325256348, + "learning_rate": 4.999768102881931e-05, + "loss": 7.4748, + "step": 730 + }, + { + "epoch": 0.004347464078408983, + "grad_norm": 2.996829032897949, + "learning_rate": 4.99976746624936e-05, + "loss": 7.445, + "step": 731 + }, + { + "epoch": 0.00435341136168998, + "grad_norm": 4.5892534255981445, + "learning_rate": 4.9997668287441454e-05, + "loss": 7.6464, + "step": 732 + }, + { + "epoch": 0.0043593586449709775, + "grad_norm": 3.689419984817505, + "learning_rate": 4.999766190366289e-05, + "loss": 7.4215, + "step": 733 + }, + { + "epoch": 0.004365305928251974, + "grad_norm": 2.9146885871887207, + "learning_rate": 4.9997655511157896e-05, + "loss": 7.4852, + "step": 734 + }, + { + "epoch": 0.004371253211532972, + "grad_norm": 3.8503024578094482, + "learning_rate": 4.9997649109926484e-05, + "loss": 7.4779, + "step": 735 + }, + { + "epoch": 0.004377200494813969, + "grad_norm": 3.929422616958618, + "learning_rate": 4.9997642699968646e-05, + "loss": 7.3526, + "step": 736 + }, + { + "epoch": 0.0043831477780949665, + "grad_norm": 3.3365838527679443, + "learning_rate": 4.999763628128439e-05, + "loss": 7.3895, + "step": 737 + }, + { + "epoch": 0.004389095061375963, + "grad_norm": 3.147660970687866, + "learning_rate": 4.999762985387372e-05, + "loss": 7.1885, + "step": 738 + }, + { + "epoch": 0.004395042344656961, + "grad_norm": 3.3230104446411133, + "learning_rate": 4.9997623417736626e-05, + "loss": 7.5839, + "step": 739 + }, + { + "epoch": 0.004400989627937958, + "grad_norm": 3.285144090652466, + "learning_rate": 4.999761697287313e-05, + "loss": 7.4859, + "step": 740 + }, + { + "epoch": 0.0044069369112189555, + "grad_norm": 3.3811442852020264, + "learning_rate": 4.9997610519283216e-05, + "loss": 7.4871, + "step": 741 + }, + { + "epoch": 0.004412884194499952, + "grad_norm": 2.9662907123565674, + "learning_rate": 4.9997604056966904e-05, + "loss": 7.2546, + "step": 742 + }, + { + "epoch": 0.00441883147778095, + "grad_norm": 3.1432855129241943, + "learning_rate": 4.999759758592418e-05, + "loss": 7.5273, + "step": 743 + }, + { + "epoch": 0.004424778761061947, + "grad_norm": 3.0559749603271484, + "learning_rate": 4.9997591106155054e-05, + "loss": 7.0754, + "step": 744 + }, + { + "epoch": 0.0044307260443429445, + "grad_norm": 2.6778409481048584, + "learning_rate": 4.999758461765953e-05, + "loss": 7.1723, + "step": 745 + }, + { + "epoch": 0.004436673327623941, + "grad_norm": 2.592228412628174, + "learning_rate": 4.9997578120437606e-05, + "loss": 7.2671, + "step": 746 + }, + { + "epoch": 0.004442620610904939, + "grad_norm": 2.5546112060546875, + "learning_rate": 4.999757161448928e-05, + "loss": 7.2571, + "step": 747 + }, + { + "epoch": 0.004448567894185936, + "grad_norm": 2.745755672454834, + "learning_rate": 4.999756509981457e-05, + "loss": 7.3895, + "step": 748 + }, + { + "epoch": 0.004454515177466933, + "grad_norm": 2.9785144329071045, + "learning_rate": 4.999755857641346e-05, + "loss": 7.2431, + "step": 749 + }, + { + "epoch": 0.00446046246074793, + "grad_norm": 2.918891191482544, + "learning_rate": 4.9997552044285965e-05, + "loss": 7.3805, + "step": 750 + }, + { + "epoch": 0.004466409744028927, + "grad_norm": 2.7858519554138184, + "learning_rate": 4.999754550343209e-05, + "loss": 7.5942, + "step": 751 + }, + { + "epoch": 0.004472357027309925, + "grad_norm": 2.7758638858795166, + "learning_rate": 4.999753895385181e-05, + "loss": 7.5896, + "step": 752 + }, + { + "epoch": 0.004478304310590922, + "grad_norm": 2.7125916481018066, + "learning_rate": 4.999753239554517e-05, + "loss": 7.4341, + "step": 753 + }, + { + "epoch": 0.004484251593871919, + "grad_norm": 4.241726875305176, + "learning_rate": 4.999752582851214e-05, + "loss": 7.0517, + "step": 754 + }, + { + "epoch": 0.004490198877152916, + "grad_norm": 2.9547781944274902, + "learning_rate": 4.999751925275272e-05, + "loss": 7.2616, + "step": 755 + }, + { + "epoch": 0.004496146160433914, + "grad_norm": 4.2594122886657715, + "learning_rate": 4.9997512668266945e-05, + "loss": 7.3069, + "step": 756 + }, + { + "epoch": 0.004502093443714911, + "grad_norm": 4.1758246421813965, + "learning_rate": 4.9997506075054776e-05, + "loss": 7.3417, + "step": 757 + }, + { + "epoch": 0.004508040726995908, + "grad_norm": 2.8398962020874023, + "learning_rate": 4.999749947311625e-05, + "loss": 7.107, + "step": 758 + }, + { + "epoch": 0.004513988010276905, + "grad_norm": 3.487478017807007, + "learning_rate": 4.9997492862451354e-05, + "loss": 7.0014, + "step": 759 + }, + { + "epoch": 0.004519935293557903, + "grad_norm": 2.883409261703491, + "learning_rate": 4.999748624306009e-05, + "loss": 7.4691, + "step": 760 + }, + { + "epoch": 0.0045258825768389, + "grad_norm": 3.0092155933380127, + "learning_rate": 4.999747961494246e-05, + "loss": 7.3771, + "step": 761 + }, + { + "epoch": 0.004531829860119897, + "grad_norm": 2.9571943283081055, + "learning_rate": 4.999747297809847e-05, + "loss": 7.4664, + "step": 762 + }, + { + "epoch": 0.004537777143400894, + "grad_norm": 2.7476816177368164, + "learning_rate": 4.999746633252812e-05, + "loss": 7.2943, + "step": 763 + }, + { + "epoch": 0.004543724426681892, + "grad_norm": 4.903059959411621, + "learning_rate": 4.9997459678231415e-05, + "loss": 7.3467, + "step": 764 + }, + { + "epoch": 0.004549671709962889, + "grad_norm": 3.8205373287200928, + "learning_rate": 4.999745301520835e-05, + "loss": 7.2807, + "step": 765 + }, + { + "epoch": 0.0045556189932438864, + "grad_norm": 2.6003127098083496, + "learning_rate": 4.9997446343458934e-05, + "loss": 7.2736, + "step": 766 + }, + { + "epoch": 0.004561566276524883, + "grad_norm": 3.288313627243042, + "learning_rate": 4.999743966298317e-05, + "loss": 7.3832, + "step": 767 + }, + { + "epoch": 0.004567513559805881, + "grad_norm": 3.4839234352111816, + "learning_rate": 4.999743297378106e-05, + "loss": 7.2932, + "step": 768 + }, + { + "epoch": 0.004573460843086878, + "grad_norm": 3.2667462825775146, + "learning_rate": 4.99974262758526e-05, + "loss": 7.4855, + "step": 769 + }, + { + "epoch": 0.0045794081263678755, + "grad_norm": 3.3637850284576416, + "learning_rate": 4.99974195691978e-05, + "loss": 7.4864, + "step": 770 + }, + { + "epoch": 0.004585355409648872, + "grad_norm": 4.691596508026123, + "learning_rate": 4.999741285381666e-05, + "loss": 7.4751, + "step": 771 + }, + { + "epoch": 0.00459130269292987, + "grad_norm": 3.8831942081451416, + "learning_rate": 4.999740612970918e-05, + "loss": 7.4554, + "step": 772 + }, + { + "epoch": 0.004597249976210867, + "grad_norm": 2.9129562377929688, + "learning_rate": 4.999739939687536e-05, + "loss": 7.7096, + "step": 773 + }, + { + "epoch": 0.0046031972594918645, + "grad_norm": 3.928882598876953, + "learning_rate": 4.9997392655315207e-05, + "loss": 7.6453, + "step": 774 + }, + { + "epoch": 0.004609144542772861, + "grad_norm": 4.19191312789917, + "learning_rate": 4.9997385905028726e-05, + "loss": 7.6038, + "step": 775 + }, + { + "epoch": 0.004615091826053859, + "grad_norm": 2.4585883617401123, + "learning_rate": 4.999737914601591e-05, + "loss": 7.5734, + "step": 776 + }, + { + "epoch": 0.004621039109334856, + "grad_norm": 3.500932455062866, + "learning_rate": 4.9997372378276776e-05, + "loss": 7.6535, + "step": 777 + }, + { + "epoch": 0.0046269863926158535, + "grad_norm": 3.1256210803985596, + "learning_rate": 4.9997365601811306e-05, + "loss": 7.4844, + "step": 778 + }, + { + "epoch": 0.00463293367589685, + "grad_norm": 2.083902597427368, + "learning_rate": 4.999735881661952e-05, + "loss": 7.646, + "step": 779 + }, + { + "epoch": 0.004638880959177847, + "grad_norm": 2.2990450859069824, + "learning_rate": 4.999735202270142e-05, + "loss": 7.5756, + "step": 780 + }, + { + "epoch": 0.004644828242458845, + "grad_norm": 2.782463550567627, + "learning_rate": 4.9997345220057004e-05, + "loss": 7.6191, + "step": 781 + }, + { + "epoch": 0.004650775525739842, + "grad_norm": 4.157378673553467, + "learning_rate": 4.9997338408686255e-05, + "loss": 7.5265, + "step": 782 + }, + { + "epoch": 0.004656722809020839, + "grad_norm": 2.850106716156006, + "learning_rate": 4.999733158858921e-05, + "loss": 7.4562, + "step": 783 + }, + { + "epoch": 0.004662670092301836, + "grad_norm": 2.8073840141296387, + "learning_rate": 4.999732475976585e-05, + "loss": 7.3913, + "step": 784 + }, + { + "epoch": 0.004668617375582834, + "grad_norm": 2.85048770904541, + "learning_rate": 4.999731792221618e-05, + "loss": 7.3945, + "step": 785 + }, + { + "epoch": 0.004674564658863831, + "grad_norm": 2.760990619659424, + "learning_rate": 4.999731107594021e-05, + "loss": 7.6088, + "step": 786 + }, + { + "epoch": 0.004680511942144828, + "grad_norm": 2.4395666122436523, + "learning_rate": 4.9997304220937933e-05, + "loss": 7.6996, + "step": 787 + }, + { + "epoch": 0.004686459225425825, + "grad_norm": 2.5826008319854736, + "learning_rate": 4.9997297357209354e-05, + "loss": 7.5888, + "step": 788 + }, + { + "epoch": 0.004692406508706823, + "grad_norm": 3.434957981109619, + "learning_rate": 4.999729048475448e-05, + "loss": 7.4659, + "step": 789 + }, + { + "epoch": 0.00469835379198782, + "grad_norm": 4.103111743927002, + "learning_rate": 4.9997283603573306e-05, + "loss": 7.6704, + "step": 790 + }, + { + "epoch": 0.004704301075268817, + "grad_norm": 3.7879343032836914, + "learning_rate": 4.999727671366584e-05, + "loss": 7.5387, + "step": 791 + }, + { + "epoch": 0.004710248358549814, + "grad_norm": 3.706599235534668, + "learning_rate": 4.999726981503209e-05, + "loss": 7.3413, + "step": 792 + }, + { + "epoch": 0.004716195641830812, + "grad_norm": 2.1999869346618652, + "learning_rate": 4.999726290767204e-05, + "loss": 7.1809, + "step": 793 + }, + { + "epoch": 0.004722142925111809, + "grad_norm": 2.8561251163482666, + "learning_rate": 4.999725599158571e-05, + "loss": 7.3496, + "step": 794 + }, + { + "epoch": 0.004728090208392806, + "grad_norm": 3.0696613788604736, + "learning_rate": 4.99972490667731e-05, + "loss": 7.542, + "step": 795 + }, + { + "epoch": 0.004734037491673803, + "grad_norm": 2.706404685974121, + "learning_rate": 4.99972421332342e-05, + "loss": 7.4233, + "step": 796 + }, + { + "epoch": 0.004739984774954801, + "grad_norm": 2.388360023498535, + "learning_rate": 4.9997235190969025e-05, + "loss": 7.5754, + "step": 797 + }, + { + "epoch": 0.004745932058235798, + "grad_norm": 2.3414177894592285, + "learning_rate": 4.999722823997758e-05, + "loss": 7.438, + "step": 798 + }, + { + "epoch": 0.004751879341516795, + "grad_norm": 2.46012544631958, + "learning_rate": 4.999722128025985e-05, + "loss": 6.9522, + "step": 799 + }, + { + "epoch": 0.004757826624797792, + "grad_norm": 2.5721335411071777, + "learning_rate": 4.9997214311815855e-05, + "loss": 6.9632, + "step": 800 + }, + { + "epoch": 0.00476377390807879, + "grad_norm": 2.4028279781341553, + "learning_rate": 4.999720733464559e-05, + "loss": 7.3834, + "step": 801 + }, + { + "epoch": 0.004769721191359787, + "grad_norm": 2.378971576690674, + "learning_rate": 4.9997200348749055e-05, + "loss": 7.7919, + "step": 802 + }, + { + "epoch": 0.004775668474640784, + "grad_norm": 2.1871516704559326, + "learning_rate": 4.999719335412626e-05, + "loss": 7.6832, + "step": 803 + }, + { + "epoch": 0.004781615757921781, + "grad_norm": 2.4183239936828613, + "learning_rate": 4.9997186350777206e-05, + "loss": 7.5013, + "step": 804 + }, + { + "epoch": 0.004787563041202779, + "grad_norm": 2.2322120666503906, + "learning_rate": 4.9997179338701884e-05, + "loss": 7.4224, + "step": 805 + }, + { + "epoch": 0.004793510324483776, + "grad_norm": 3.2633447647094727, + "learning_rate": 4.99971723179003e-05, + "loss": 7.1966, + "step": 806 + }, + { + "epoch": 0.004799457607764773, + "grad_norm": 3.1195995807647705, + "learning_rate": 4.999716528837247e-05, + "loss": 7.4057, + "step": 807 + }, + { + "epoch": 0.00480540489104577, + "grad_norm": 2.6904098987579346, + "learning_rate": 4.9997158250118395e-05, + "loss": 7.4585, + "step": 808 + }, + { + "epoch": 0.004811352174326768, + "grad_norm": 2.6955599784851074, + "learning_rate": 4.999715120313806e-05, + "loss": 7.6053, + "step": 809 + }, + { + "epoch": 0.004817299457607765, + "grad_norm": 3.569037675857544, + "learning_rate": 4.999714414743148e-05, + "loss": 7.5085, + "step": 810 + }, + { + "epoch": 0.004823246740888762, + "grad_norm": 3.5231528282165527, + "learning_rate": 4.9997137082998655e-05, + "loss": 7.4554, + "step": 811 + }, + { + "epoch": 0.004829194024169759, + "grad_norm": 2.7118120193481445, + "learning_rate": 4.999713000983959e-05, + "loss": 7.4323, + "step": 812 + }, + { + "epoch": 0.004835141307450756, + "grad_norm": 3.229548931121826, + "learning_rate": 4.9997122927954284e-05, + "loss": 7.3098, + "step": 813 + }, + { + "epoch": 0.004841088590731754, + "grad_norm": 2.4224696159362793, + "learning_rate": 4.999711583734273e-05, + "loss": 7.3488, + "step": 814 + }, + { + "epoch": 0.004847035874012751, + "grad_norm": 2.627565383911133, + "learning_rate": 4.999710873800496e-05, + "loss": 7.457, + "step": 815 + }, + { + "epoch": 0.004852983157293748, + "grad_norm": 2.5339515209198, + "learning_rate": 4.999710162994094e-05, + "loss": 7.6602, + "step": 816 + }, + { + "epoch": 0.004858930440574745, + "grad_norm": 2.663694143295288, + "learning_rate": 4.9997094513150706e-05, + "loss": 7.1064, + "step": 817 + }, + { + "epoch": 0.004864877723855743, + "grad_norm": 2.372504472732544, + "learning_rate": 4.9997087387634234e-05, + "loss": 7.341, + "step": 818 + }, + { + "epoch": 0.00487082500713674, + "grad_norm": 2.145191192626953, + "learning_rate": 4.999708025339154e-05, + "loss": 7.3216, + "step": 819 + }, + { + "epoch": 0.004876772290417737, + "grad_norm": 2.39685320854187, + "learning_rate": 4.9997073110422626e-05, + "loss": 7.3463, + "step": 820 + }, + { + "epoch": 0.004882719573698734, + "grad_norm": 2.2227275371551514, + "learning_rate": 4.999706595872749e-05, + "loss": 7.2517, + "step": 821 + }, + { + "epoch": 0.004888666856979732, + "grad_norm": 2.7770352363586426, + "learning_rate": 4.999705879830614e-05, + "loss": 7.3117, + "step": 822 + }, + { + "epoch": 0.004894614140260729, + "grad_norm": 2.448026180267334, + "learning_rate": 4.999705162915857e-05, + "loss": 6.9883, + "step": 823 + }, + { + "epoch": 0.004900561423541726, + "grad_norm": 2.2304437160491943, + "learning_rate": 4.999704445128479e-05, + "loss": 7.2644, + "step": 824 + }, + { + "epoch": 0.004906508706822723, + "grad_norm": 2.351707696914673, + "learning_rate": 4.9997037264684796e-05, + "loss": 7.1984, + "step": 825 + }, + { + "epoch": 0.004912455990103721, + "grad_norm": 2.7631921768188477, + "learning_rate": 4.99970300693586e-05, + "loss": 7.3774, + "step": 826 + }, + { + "epoch": 0.004918403273384718, + "grad_norm": 2.4636785984039307, + "learning_rate": 4.9997022865306195e-05, + "loss": 7.3778, + "step": 827 + }, + { + "epoch": 0.004924350556665715, + "grad_norm": 3.5510878562927246, + "learning_rate": 4.999701565252759e-05, + "loss": 7.166, + "step": 828 + }, + { + "epoch": 0.004930297839946712, + "grad_norm": 3.2581429481506348, + "learning_rate": 4.999700843102278e-05, + "loss": 7.286, + "step": 829 + }, + { + "epoch": 0.00493624512322771, + "grad_norm": 2.4304182529449463, + "learning_rate": 4.999700120079178e-05, + "loss": 7.5076, + "step": 830 + }, + { + "epoch": 0.004942192406508707, + "grad_norm": 2.428854465484619, + "learning_rate": 4.999699396183458e-05, + "loss": 7.405, + "step": 831 + }, + { + "epoch": 0.004948139689789704, + "grad_norm": 2.7680416107177734, + "learning_rate": 4.9996986714151195e-05, + "loss": 7.4944, + "step": 832 + }, + { + "epoch": 0.004954086973070701, + "grad_norm": 2.6787109375, + "learning_rate": 4.999697945774161e-05, + "loss": 7.5946, + "step": 833 + }, + { + "epoch": 0.004960034256351699, + "grad_norm": 2.6396615505218506, + "learning_rate": 4.9996972192605845e-05, + "loss": 7.5405, + "step": 834 + }, + { + "epoch": 0.004965981539632696, + "grad_norm": 2.89387583732605, + "learning_rate": 4.999696491874389e-05, + "loss": 7.3809, + "step": 835 + }, + { + "epoch": 0.004971928822913693, + "grad_norm": 2.332838535308838, + "learning_rate": 4.999695763615576e-05, + "loss": 7.3638, + "step": 836 + }, + { + "epoch": 0.00497787610619469, + "grad_norm": 2.2880585193634033, + "learning_rate": 4.9996950344841444e-05, + "loss": 7.3557, + "step": 837 + }, + { + "epoch": 0.004983823389475688, + "grad_norm": 2.7478256225585938, + "learning_rate": 4.999694304480096e-05, + "loss": 7.4, + "step": 838 + }, + { + "epoch": 0.004989770672756685, + "grad_norm": 3.4789531230926514, + "learning_rate": 4.999693573603429e-05, + "loss": 7.4438, + "step": 839 + }, + { + "epoch": 0.004995717956037682, + "grad_norm": 2.7377078533172607, + "learning_rate": 4.9996928418541455e-05, + "loss": 7.4074, + "step": 840 + }, + { + "epoch": 0.005001665239318679, + "grad_norm": 3.04420804977417, + "learning_rate": 4.9996921092322444e-05, + "loss": 7.3834, + "step": 841 + }, + { + "epoch": 0.005007612522599676, + "grad_norm": 2.759244203567505, + "learning_rate": 4.999691375737727e-05, + "loss": 7.4492, + "step": 842 + }, + { + "epoch": 0.005013559805880674, + "grad_norm": 2.5327556133270264, + "learning_rate": 4.9996906413705933e-05, + "loss": 7.4403, + "step": 843 + }, + { + "epoch": 0.0050195070891616705, + "grad_norm": 2.8170409202575684, + "learning_rate": 4.9996899061308434e-05, + "loss": 7.623, + "step": 844 + }, + { + "epoch": 0.005025454372442668, + "grad_norm": 3.8642547130584717, + "learning_rate": 4.9996891700184774e-05, + "loss": 7.6099, + "step": 845 + }, + { + "epoch": 0.005031401655723665, + "grad_norm": 4.704552173614502, + "learning_rate": 4.999688433033496e-05, + "loss": 7.6755, + "step": 846 + }, + { + "epoch": 0.005037348939004663, + "grad_norm": 4.128530979156494, + "learning_rate": 4.9996876951758986e-05, + "loss": 7.5246, + "step": 847 + }, + { + "epoch": 0.0050432962222856596, + "grad_norm": 2.233447551727295, + "learning_rate": 4.9996869564456865e-05, + "loss": 7.1139, + "step": 848 + }, + { + "epoch": 0.005049243505566657, + "grad_norm": 5.96085262298584, + "learning_rate": 4.999686216842859e-05, + "loss": 7.4114, + "step": 849 + }, + { + "epoch": 0.005055190788847654, + "grad_norm": 4.828244686126709, + "learning_rate": 4.9996854763674175e-05, + "loss": 7.6743, + "step": 850 + }, + { + "epoch": 0.005061138072128652, + "grad_norm": 3.0259342193603516, + "learning_rate": 4.999684735019362e-05, + "loss": 7.7537, + "step": 851 + }, + { + "epoch": 0.005067085355409649, + "grad_norm": 2.807244062423706, + "learning_rate": 4.999683992798692e-05, + "loss": 7.7744, + "step": 852 + }, + { + "epoch": 0.005073032638690646, + "grad_norm": 2.81384015083313, + "learning_rate": 4.999683249705408e-05, + "loss": 7.2922, + "step": 853 + }, + { + "epoch": 0.005078979921971643, + "grad_norm": 2.582836627960205, + "learning_rate": 4.9996825057395105e-05, + "loss": 7.3421, + "step": 854 + }, + { + "epoch": 0.005084927205252641, + "grad_norm": 2.190634250640869, + "learning_rate": 4.9996817609009996e-05, + "loss": 7.6249, + "step": 855 + }, + { + "epoch": 0.005090874488533638, + "grad_norm": 2.3322219848632812, + "learning_rate": 4.999681015189875e-05, + "loss": 7.4695, + "step": 856 + }, + { + "epoch": 0.005096821771814635, + "grad_norm": 2.5582947731018066, + "learning_rate": 4.9996802686061384e-05, + "loss": 7.2747, + "step": 857 + }, + { + "epoch": 0.005102769055095632, + "grad_norm": 3.192093849182129, + "learning_rate": 4.999679521149789e-05, + "loss": 7.504, + "step": 858 + }, + { + "epoch": 0.00510871633837663, + "grad_norm": 4.1585588455200195, + "learning_rate": 4.999678772820827e-05, + "loss": 7.5966, + "step": 859 + }, + { + "epoch": 0.005114663621657627, + "grad_norm": 5.052750587463379, + "learning_rate": 4.999678023619253e-05, + "loss": 7.3243, + "step": 860 + }, + { + "epoch": 0.005120610904938624, + "grad_norm": 2.395909070968628, + "learning_rate": 4.999677273545068e-05, + "loss": 7.4477, + "step": 861 + }, + { + "epoch": 0.005126558188219621, + "grad_norm": 2.487334966659546, + "learning_rate": 4.999676522598271e-05, + "loss": 7.591, + "step": 862 + }, + { + "epoch": 0.005132505471500619, + "grad_norm": 3.7094171047210693, + "learning_rate": 4.999675770778863e-05, + "loss": 7.5387, + "step": 863 + }, + { + "epoch": 0.005138452754781616, + "grad_norm": 4.468298435211182, + "learning_rate": 4.9996750180868435e-05, + "loss": 7.5754, + "step": 864 + }, + { + "epoch": 0.005144400038062613, + "grad_norm": 3.2769386768341064, + "learning_rate": 4.999674264522213e-05, + "loss": 7.459, + "step": 865 + }, + { + "epoch": 0.00515034732134361, + "grad_norm": 2.7162864208221436, + "learning_rate": 4.9996735100849726e-05, + "loss": 7.3473, + "step": 866 + }, + { + "epoch": 0.005156294604624608, + "grad_norm": 3.646401882171631, + "learning_rate": 4.999672754775122e-05, + "loss": 7.4446, + "step": 867 + }, + { + "epoch": 0.005162241887905605, + "grad_norm": 8.917684555053711, + "learning_rate": 4.999671998592662e-05, + "loss": 7.2016, + "step": 868 + }, + { + "epoch": 0.005168189171186602, + "grad_norm": 2.949993133544922, + "learning_rate": 4.999671241537591e-05, + "loss": 7.3081, + "step": 869 + }, + { + "epoch": 0.005174136454467599, + "grad_norm": 2.4531025886535645, + "learning_rate": 4.999670483609912e-05, + "loss": 7.402, + "step": 870 + }, + { + "epoch": 0.005180083737748597, + "grad_norm": 3.1903798580169678, + "learning_rate": 4.999669724809623e-05, + "loss": 7.2514, + "step": 871 + }, + { + "epoch": 0.005186031021029594, + "grad_norm": 3.461353302001953, + "learning_rate": 4.999668965136726e-05, + "loss": 7.1637, + "step": 872 + }, + { + "epoch": 0.005191978304310591, + "grad_norm": 2.623075246810913, + "learning_rate": 4.9996682045912194e-05, + "loss": 7.5482, + "step": 873 + }, + { + "epoch": 0.005197925587591588, + "grad_norm": 2.9072840213775635, + "learning_rate": 4.9996674431731044e-05, + "loss": 7.484, + "step": 874 + }, + { + "epoch": 0.005203872870872585, + "grad_norm": 3.0219666957855225, + "learning_rate": 4.999666680882382e-05, + "loss": 7.5223, + "step": 875 + }, + { + "epoch": 0.005209820154153583, + "grad_norm": 2.9892475605010986, + "learning_rate": 4.9996659177190514e-05, + "loss": 7.3843, + "step": 876 + }, + { + "epoch": 0.0052157674374345795, + "grad_norm": 2.6199591159820557, + "learning_rate": 4.9996651536831126e-05, + "loss": 7.2728, + "step": 877 + }, + { + "epoch": 0.005221714720715577, + "grad_norm": 2.6897647380828857, + "learning_rate": 4.999664388774567e-05, + "loss": 7.5323, + "step": 878 + }, + { + "epoch": 0.005227662003996574, + "grad_norm": 3.5945560932159424, + "learning_rate": 4.9996636229934155e-05, + "loss": 7.5001, + "step": 879 + }, + { + "epoch": 0.005233609287277572, + "grad_norm": 2.9064812660217285, + "learning_rate": 4.9996628563396563e-05, + "loss": 7.5463, + "step": 880 + }, + { + "epoch": 0.0052395565705585685, + "grad_norm": 3.6150660514831543, + "learning_rate": 4.999662088813291e-05, + "loss": 7.6596, + "step": 881 + }, + { + "epoch": 0.005245503853839566, + "grad_norm": 2.729684591293335, + "learning_rate": 4.99966132041432e-05, + "loss": 7.5342, + "step": 882 + }, + { + "epoch": 0.005251451137120563, + "grad_norm": 2.6782853603363037, + "learning_rate": 4.9996605511427416e-05, + "loss": 7.5837, + "step": 883 + }, + { + "epoch": 0.005257398420401561, + "grad_norm": 4.171568393707275, + "learning_rate": 4.9996597809985576e-05, + "loss": 7.3626, + "step": 884 + }, + { + "epoch": 0.0052633457036825575, + "grad_norm": 2.189725637435913, + "learning_rate": 4.999659009981769e-05, + "loss": 7.5431, + "step": 885 + }, + { + "epoch": 0.005269292986963555, + "grad_norm": 2.2473320960998535, + "learning_rate": 4.999658238092375e-05, + "loss": 7.4731, + "step": 886 + }, + { + "epoch": 0.005275240270244552, + "grad_norm": 3.4393012523651123, + "learning_rate": 4.999657465330376e-05, + "loss": 7.6839, + "step": 887 + }, + { + "epoch": 0.00528118755352555, + "grad_norm": 2.717742919921875, + "learning_rate": 4.9996566916957735e-05, + "loss": 7.6812, + "step": 888 + }, + { + "epoch": 0.0052871348368065466, + "grad_norm": 3.829698085784912, + "learning_rate": 4.9996559171885655e-05, + "loss": 7.4525, + "step": 889 + }, + { + "epoch": 0.005293082120087544, + "grad_norm": 2.764598846435547, + "learning_rate": 4.9996551418087536e-05, + "loss": 7.5379, + "step": 890 + }, + { + "epoch": 0.005299029403368541, + "grad_norm": 2.4230268001556396, + "learning_rate": 4.999654365556338e-05, + "loss": 7.454, + "step": 891 + }, + { + "epoch": 0.005304976686649539, + "grad_norm": 2.31870436668396, + "learning_rate": 4.999653588431319e-05, + "loss": 7.5306, + "step": 892 + }, + { + "epoch": 0.005310923969930536, + "grad_norm": 2.332259178161621, + "learning_rate": 4.999652810433697e-05, + "loss": 7.4008, + "step": 893 + }, + { + "epoch": 0.005316871253211533, + "grad_norm": 2.630568504333496, + "learning_rate": 4.999652031563471e-05, + "loss": 7.4046, + "step": 894 + }, + { + "epoch": 0.00532281853649253, + "grad_norm": 3.327211856842041, + "learning_rate": 4.999651251820643e-05, + "loss": 7.2901, + "step": 895 + }, + { + "epoch": 0.005328765819773528, + "grad_norm": 2.2383713722229004, + "learning_rate": 4.999650471205213e-05, + "loss": 7.5116, + "step": 896 + }, + { + "epoch": 0.005334713103054525, + "grad_norm": 2.972820997238159, + "learning_rate": 4.99964968971718e-05, + "loss": 7.4013, + "step": 897 + }, + { + "epoch": 0.005340660386335522, + "grad_norm": 2.7254672050476074, + "learning_rate": 4.999648907356545e-05, + "loss": 7.3174, + "step": 898 + }, + { + "epoch": 0.005346607669616519, + "grad_norm": 2.6943607330322266, + "learning_rate": 4.9996481241233096e-05, + "loss": 7.386, + "step": 899 + }, + { + "epoch": 0.005352554952897517, + "grad_norm": 2.9217519760131836, + "learning_rate": 4.999647340017473e-05, + "loss": 7.5398, + "step": 900 + }, + { + "epoch": 0.005358502236178514, + "grad_norm": 2.7950780391693115, + "learning_rate": 4.999646555039034e-05, + "loss": 7.6336, + "step": 901 + }, + { + "epoch": 0.005364449519459511, + "grad_norm": 2.763364553451538, + "learning_rate": 4.999645769187995e-05, + "loss": 7.5161, + "step": 902 + }, + { + "epoch": 0.005370396802740508, + "grad_norm": 2.3095102310180664, + "learning_rate": 4.999644982464355e-05, + "loss": 7.5859, + "step": 903 + }, + { + "epoch": 0.005376344086021506, + "grad_norm": 2.7287917137145996, + "learning_rate": 4.999644194868115e-05, + "loss": 7.3983, + "step": 904 + }, + { + "epoch": 0.005382291369302503, + "grad_norm": 2.6175942420959473, + "learning_rate": 4.999643406399275e-05, + "loss": 7.4278, + "step": 905 + }, + { + "epoch": 0.0053882386525834994, + "grad_norm": 2.3898375034332275, + "learning_rate": 4.999642617057835e-05, + "loss": 7.4537, + "step": 906 + }, + { + "epoch": 0.005394185935864497, + "grad_norm": 2.964381694793701, + "learning_rate": 4.999641826843796e-05, + "loss": 7.3258, + "step": 907 + }, + { + "epoch": 0.005400133219145494, + "grad_norm": 3.1146717071533203, + "learning_rate": 4.999641035757158e-05, + "loss": 7.5412, + "step": 908 + }, + { + "epoch": 0.005406080502426492, + "grad_norm": 3.4733238220214844, + "learning_rate": 4.999640243797921e-05, + "loss": 7.423, + "step": 909 + }, + { + "epoch": 0.0054120277857074885, + "grad_norm": 3.621044158935547, + "learning_rate": 4.999639450966085e-05, + "loss": 7.5885, + "step": 910 + }, + { + "epoch": 0.005417975068988486, + "grad_norm": 2.4800662994384766, + "learning_rate": 4.999638657261651e-05, + "loss": 7.5231, + "step": 911 + }, + { + "epoch": 0.005423922352269483, + "grad_norm": 3.3247363567352295, + "learning_rate": 4.999637862684619e-05, + "loss": 7.2367, + "step": 912 + }, + { + "epoch": 0.005429869635550481, + "grad_norm": 4.293686866760254, + "learning_rate": 4.999637067234989e-05, + "loss": 6.8423, + "step": 913 + }, + { + "epoch": 0.0054358169188314775, + "grad_norm": 2.6713979244232178, + "learning_rate": 4.999636270912762e-05, + "loss": 6.7962, + "step": 914 + }, + { + "epoch": 0.005441764202112475, + "grad_norm": 2.9386653900146484, + "learning_rate": 4.9996354737179376e-05, + "loss": 6.7582, + "step": 915 + }, + { + "epoch": 0.005447711485393472, + "grad_norm": 2.8030481338500977, + "learning_rate": 4.999634675650516e-05, + "loss": 6.6516, + "step": 916 + }, + { + "epoch": 0.00545365876867447, + "grad_norm": 2.7315666675567627, + "learning_rate": 4.9996338767104985e-05, + "loss": 6.6159, + "step": 917 + }, + { + "epoch": 0.0054596060519554665, + "grad_norm": 3.116098403930664, + "learning_rate": 4.999633076897884e-05, + "loss": 7.2121, + "step": 918 + }, + { + "epoch": 0.005465553335236464, + "grad_norm": 2.867687940597534, + "learning_rate": 4.999632276212673e-05, + "loss": 7.5124, + "step": 919 + }, + { + "epoch": 0.005471500618517461, + "grad_norm": 2.9864203929901123, + "learning_rate": 4.9996314746548676e-05, + "loss": 7.5168, + "step": 920 + }, + { + "epoch": 0.005477447901798459, + "grad_norm": 2.9083375930786133, + "learning_rate": 4.9996306722244656e-05, + "loss": 7.5027, + "step": 921 + }, + { + "epoch": 0.0054833951850794555, + "grad_norm": 2.5569801330566406, + "learning_rate": 4.9996298689214686e-05, + "loss": 7.2988, + "step": 922 + }, + { + "epoch": 0.005489342468360453, + "grad_norm": 3.7101242542266846, + "learning_rate": 4.9996290647458765e-05, + "loss": 7.33, + "step": 923 + }, + { + "epoch": 0.00549528975164145, + "grad_norm": 2.848881244659424, + "learning_rate": 4.99962825969769e-05, + "loss": 7.4534, + "step": 924 + }, + { + "epoch": 0.005501237034922448, + "grad_norm": 3.072282075881958, + "learning_rate": 4.999627453776909e-05, + "loss": 7.4398, + "step": 925 + }, + { + "epoch": 0.0055071843182034445, + "grad_norm": 2.8132996559143066, + "learning_rate": 4.999626646983534e-05, + "loss": 7.5617, + "step": 926 + }, + { + "epoch": 0.005513131601484442, + "grad_norm": 2.2710142135620117, + "learning_rate": 4.999625839317565e-05, + "loss": 7.5975, + "step": 927 + }, + { + "epoch": 0.005519078884765439, + "grad_norm": 2.745007276535034, + "learning_rate": 4.9996250307790026e-05, + "loss": 7.4599, + "step": 928 + }, + { + "epoch": 0.005525026168046437, + "grad_norm": 3.2031302452087402, + "learning_rate": 4.999624221367847e-05, + "loss": 7.3528, + "step": 929 + }, + { + "epoch": 0.0055309734513274336, + "grad_norm": 6.417830467224121, + "learning_rate": 4.999623411084098e-05, + "loss": 7.5118, + "step": 930 + }, + { + "epoch": 0.005536920734608431, + "grad_norm": 2.7960314750671387, + "learning_rate": 4.999622599927756e-05, + "loss": 6.5016, + "step": 931 + }, + { + "epoch": 0.005542868017889428, + "grad_norm": 2.959507703781128, + "learning_rate": 4.999621787898822e-05, + "loss": 7.6521, + "step": 932 + }, + { + "epoch": 0.005548815301170426, + "grad_norm": 3.328834056854248, + "learning_rate": 4.999620974997296e-05, + "loss": 7.6267, + "step": 933 + }, + { + "epoch": 0.005554762584451423, + "grad_norm": 2.5232200622558594, + "learning_rate": 4.9996201612231786e-05, + "loss": 7.471, + "step": 934 + }, + { + "epoch": 0.00556070986773242, + "grad_norm": 2.2766942977905273, + "learning_rate": 4.999619346576468e-05, + "loss": 7.4204, + "step": 935 + }, + { + "epoch": 0.005566657151013417, + "grad_norm": 2.584068536758423, + "learning_rate": 4.999618531057168e-05, + "loss": 7.4384, + "step": 936 + }, + { + "epoch": 0.005572604434294414, + "grad_norm": 3.004523277282715, + "learning_rate": 4.999617714665276e-05, + "loss": 7.5681, + "step": 937 + }, + { + "epoch": 0.005578551717575412, + "grad_norm": 4.102936267852783, + "learning_rate": 4.999616897400794e-05, + "loss": 7.4571, + "step": 938 + }, + { + "epoch": 0.005584499000856408, + "grad_norm": 2.745293378829956, + "learning_rate": 4.99961607926372e-05, + "loss": 7.588, + "step": 939 + }, + { + "epoch": 0.005590446284137406, + "grad_norm": 2.9720282554626465, + "learning_rate": 4.9996152602540576e-05, + "loss": 7.4761, + "step": 940 + }, + { + "epoch": 0.005596393567418403, + "grad_norm": 3.150047540664673, + "learning_rate": 4.999614440371805e-05, + "loss": 7.4525, + "step": 941 + }, + { + "epoch": 0.005602340850699401, + "grad_norm": 2.6735856533050537, + "learning_rate": 4.999613619616962e-05, + "loss": 7.2754, + "step": 942 + }, + { + "epoch": 0.005608288133980397, + "grad_norm": 2.6451661586761475, + "learning_rate": 4.9996127979895304e-05, + "loss": 7.5742, + "step": 943 + }, + { + "epoch": 0.005614235417261395, + "grad_norm": 2.7551536560058594, + "learning_rate": 4.9996119754895095e-05, + "loss": 7.4981, + "step": 944 + }, + { + "epoch": 0.005620182700542392, + "grad_norm": 2.7445640563964844, + "learning_rate": 4.9996111521168995e-05, + "loss": 7.4761, + "step": 945 + }, + { + "epoch": 0.00562612998382339, + "grad_norm": 2.537924289703369, + "learning_rate": 4.9996103278717013e-05, + "loss": 7.5483, + "step": 946 + }, + { + "epoch": 0.0056320772671043864, + "grad_norm": 3.503661632537842, + "learning_rate": 4.9996095027539156e-05, + "loss": 7.3074, + "step": 947 + }, + { + "epoch": 0.005638024550385384, + "grad_norm": 2.8088479042053223, + "learning_rate": 4.999608676763542e-05, + "loss": 7.5675, + "step": 948 + }, + { + "epoch": 0.005643971833666381, + "grad_norm": 2.6219863891601562, + "learning_rate": 4.99960784990058e-05, + "loss": 7.6037, + "step": 949 + }, + { + "epoch": 0.005649919116947379, + "grad_norm": 2.88737416267395, + "learning_rate": 4.999607022165031e-05, + "loss": 7.4815, + "step": 950 + }, + { + "epoch": 0.0056558664002283755, + "grad_norm": 2.455707550048828, + "learning_rate": 4.999606193556895e-05, + "loss": 7.553, + "step": 951 + }, + { + "epoch": 0.005661813683509373, + "grad_norm": 2.2502405643463135, + "learning_rate": 4.999605364076173e-05, + "loss": 7.387, + "step": 952 + }, + { + "epoch": 0.00566776096679037, + "grad_norm": 2.754972457885742, + "learning_rate": 4.9996045337228635e-05, + "loss": 7.3088, + "step": 953 + }, + { + "epoch": 0.005673708250071368, + "grad_norm": 3.111553192138672, + "learning_rate": 4.9996037024969686e-05, + "loss": 7.5063, + "step": 954 + }, + { + "epoch": 0.0056796555333523645, + "grad_norm": 2.4000720977783203, + "learning_rate": 4.9996028703984875e-05, + "loss": 7.5705, + "step": 955 + }, + { + "epoch": 0.005685602816633362, + "grad_norm": 2.495659351348877, + "learning_rate": 4.9996020374274215e-05, + "loss": 7.5421, + "step": 956 + }, + { + "epoch": 0.005691550099914359, + "grad_norm": 3.025509834289551, + "learning_rate": 4.99960120358377e-05, + "loss": 7.5406, + "step": 957 + }, + { + "epoch": 0.005697497383195357, + "grad_norm": 2.224342107772827, + "learning_rate": 4.999600368867533e-05, + "loss": 7.4323, + "step": 958 + }, + { + "epoch": 0.0057034446664763535, + "grad_norm": 2.661423683166504, + "learning_rate": 4.999599533278712e-05, + "loss": 7.565, + "step": 959 + }, + { + "epoch": 0.005709391949757351, + "grad_norm": 2.503293037414551, + "learning_rate": 4.999598696817307e-05, + "loss": 7.3552, + "step": 960 + }, + { + "epoch": 0.005715339233038348, + "grad_norm": 2.2878923416137695, + "learning_rate": 4.999597859483316e-05, + "loss": 7.4542, + "step": 961 + }, + { + "epoch": 0.005721286516319346, + "grad_norm": 2.759594678878784, + "learning_rate": 4.999597021276743e-05, + "loss": 7.2349, + "step": 962 + }, + { + "epoch": 0.0057272337996003425, + "grad_norm": 4.5453314781188965, + "learning_rate": 4.999596182197586e-05, + "loss": 7.4728, + "step": 963 + }, + { + "epoch": 0.00573318108288134, + "grad_norm": 2.4369568824768066, + "learning_rate": 4.999595342245846e-05, + "loss": 7.4396, + "step": 964 + }, + { + "epoch": 0.005739128366162337, + "grad_norm": 2.4081692695617676, + "learning_rate": 4.999594501421523e-05, + "loss": 7.536, + "step": 965 + }, + { + "epoch": 0.005745075649443335, + "grad_norm": 3.0494678020477295, + "learning_rate": 4.9995936597246176e-05, + "loss": 7.4061, + "step": 966 + }, + { + "epoch": 0.0057510229327243315, + "grad_norm": 3.3492188453674316, + "learning_rate": 4.999592817155129e-05, + "loss": 7.5419, + "step": 967 + }, + { + "epoch": 0.005756970216005328, + "grad_norm": 2.254714012145996, + "learning_rate": 4.999591973713059e-05, + "loss": 7.4568, + "step": 968 + }, + { + "epoch": 0.005762917499286326, + "grad_norm": 2.3336634635925293, + "learning_rate": 4.999591129398407e-05, + "loss": 7.4386, + "step": 969 + }, + { + "epoch": 0.005768864782567323, + "grad_norm": 2.545154094696045, + "learning_rate": 4.999590284211174e-05, + "loss": 7.226, + "step": 970 + }, + { + "epoch": 0.0057748120658483205, + "grad_norm": 2.891068458557129, + "learning_rate": 4.99958943815136e-05, + "loss": 7.4235, + "step": 971 + }, + { + "epoch": 0.005780759349129317, + "grad_norm": 3.0321712493896484, + "learning_rate": 4.999588591218964e-05, + "loss": 7.2918, + "step": 972 + }, + { + "epoch": 0.005786706632410315, + "grad_norm": 2.935490846633911, + "learning_rate": 4.9995877434139884e-05, + "loss": 7.4172, + "step": 973 + }, + { + "epoch": 0.005792653915691312, + "grad_norm": 3.0021424293518066, + "learning_rate": 4.9995868947364324e-05, + "loss": 7.521, + "step": 974 + }, + { + "epoch": 0.0057986011989723096, + "grad_norm": 2.2784783840179443, + "learning_rate": 4.9995860451862964e-05, + "loss": 7.5716, + "step": 975 + }, + { + "epoch": 0.005804548482253306, + "grad_norm": 2.9321484565734863, + "learning_rate": 4.999585194763581e-05, + "loss": 7.0965, + "step": 976 + }, + { + "epoch": 0.005810495765534304, + "grad_norm": 2.284874439239502, + "learning_rate": 4.999584343468285e-05, + "loss": 7.4376, + "step": 977 + }, + { + "epoch": 0.005816443048815301, + "grad_norm": 2.2066683769226074, + "learning_rate": 4.9995834913004115e-05, + "loss": 7.4478, + "step": 978 + }, + { + "epoch": 0.005822390332096299, + "grad_norm": 2.286323070526123, + "learning_rate": 4.999582638259959e-05, + "loss": 7.4139, + "step": 979 + }, + { + "epoch": 0.005828337615377295, + "grad_norm": 2.5052928924560547, + "learning_rate": 4.999581784346927e-05, + "loss": 7.4278, + "step": 980 + }, + { + "epoch": 0.005834284898658293, + "grad_norm": 2.273698091506958, + "learning_rate": 4.9995809295613175e-05, + "loss": 7.4019, + "step": 981 + }, + { + "epoch": 0.00584023218193929, + "grad_norm": 2.729466676712036, + "learning_rate": 4.999580073903129e-05, + "loss": 7.4716, + "step": 982 + }, + { + "epoch": 0.005846179465220288, + "grad_norm": 2.5776185989379883, + "learning_rate": 4.999579217372365e-05, + "loss": 7.4708, + "step": 983 + }, + { + "epoch": 0.005852126748501284, + "grad_norm": 2.4125893115997314, + "learning_rate": 4.9995783599690226e-05, + "loss": 7.4505, + "step": 984 + }, + { + "epoch": 0.005858074031782282, + "grad_norm": 2.975911855697632, + "learning_rate": 4.9995775016931035e-05, + "loss": 7.4095, + "step": 985 + }, + { + "epoch": 0.005864021315063279, + "grad_norm": 2.4155962467193604, + "learning_rate": 4.9995766425446076e-05, + "loss": 7.3084, + "step": 986 + }, + { + "epoch": 0.005869968598344277, + "grad_norm": 2.436950922012329, + "learning_rate": 4.999575782523535e-05, + "loss": 7.2782, + "step": 987 + }, + { + "epoch": 0.0058759158816252734, + "grad_norm": 2.2371575832366943, + "learning_rate": 4.999574921629887e-05, + "loss": 7.3879, + "step": 988 + }, + { + "epoch": 0.005881863164906271, + "grad_norm": 2.3079733848571777, + "learning_rate": 4.999574059863663e-05, + "loss": 7.5117, + "step": 989 + }, + { + "epoch": 0.005887810448187268, + "grad_norm": 2.4018514156341553, + "learning_rate": 4.9995731972248626e-05, + "loss": 7.4486, + "step": 990 + }, + { + "epoch": 0.005893757731468266, + "grad_norm": 2.3437294960021973, + "learning_rate": 4.9995723337134884e-05, + "loss": 7.461, + "step": 991 + }, + { + "epoch": 0.0058997050147492625, + "grad_norm": 3.15254545211792, + "learning_rate": 4.999571469329538e-05, + "loss": 7.014, + "step": 992 + }, + { + "epoch": 0.00590565229803026, + "grad_norm": 2.4809768199920654, + "learning_rate": 4.999570604073014e-05, + "loss": 7.4339, + "step": 993 + }, + { + "epoch": 0.005911599581311257, + "grad_norm": 3.4286630153656006, + "learning_rate": 4.9995697379439154e-05, + "loss": 7.3086, + "step": 994 + }, + { + "epoch": 0.005917546864592255, + "grad_norm": 3.9362127780914307, + "learning_rate": 4.999568870942243e-05, + "loss": 7.2635, + "step": 995 + }, + { + "epoch": 0.0059234941478732515, + "grad_norm": 2.6632091999053955, + "learning_rate": 4.9995680030679965e-05, + "loss": 7.2779, + "step": 996 + }, + { + "epoch": 0.005929441431154249, + "grad_norm": 5.218096733093262, + "learning_rate": 4.999567134321177e-05, + "loss": 7.4285, + "step": 997 + }, + { + "epoch": 0.005935388714435246, + "grad_norm": 3.441894769668579, + "learning_rate": 4.9995662647017835e-05, + "loss": 7.5576, + "step": 998 + }, + { + "epoch": 0.005941335997716243, + "grad_norm": 2.560178279876709, + "learning_rate": 4.9995653942098184e-05, + "loss": 7.5692, + "step": 999 + }, + { + "epoch": 0.0059472832809972405, + "grad_norm": 2.458313226699829, + "learning_rate": 4.999564522845281e-05, + "loss": 7.0495, + "step": 1000 + }, + { + "epoch": 0.005953230564278237, + "grad_norm": 2.539314031600952, + "learning_rate": 4.999563650608171e-05, + "loss": 7.1919, + "step": 1001 + }, + { + "epoch": 0.005959177847559235, + "grad_norm": 3.6134390830993652, + "learning_rate": 4.999562777498489e-05, + "loss": 7.0725, + "step": 1002 + }, + { + "epoch": 0.005965125130840232, + "grad_norm": 2.6582295894622803, + "learning_rate": 4.9995619035162355e-05, + "loss": 7.3008, + "step": 1003 + }, + { + "epoch": 0.0059710724141212295, + "grad_norm": 2.4968035221099854, + "learning_rate": 4.999561028661411e-05, + "loss": 7.2862, + "step": 1004 + }, + { + "epoch": 0.005977019697402226, + "grad_norm": 3.002840042114258, + "learning_rate": 4.999560152934015e-05, + "loss": 7.1721, + "step": 1005 + }, + { + "epoch": 0.005982966980683224, + "grad_norm": 3.4327914714813232, + "learning_rate": 4.999559276334049e-05, + "loss": 7.242, + "step": 1006 + }, + { + "epoch": 0.005988914263964221, + "grad_norm": 2.4082493782043457, + "learning_rate": 4.999558398861513e-05, + "loss": 7.1588, + "step": 1007 + }, + { + "epoch": 0.0059948615472452185, + "grad_norm": 2.39475417137146, + "learning_rate": 4.9995575205164056e-05, + "loss": 7.1713, + "step": 1008 + }, + { + "epoch": 0.006000808830526215, + "grad_norm": 2.946331024169922, + "learning_rate": 4.99955664129873e-05, + "loss": 7.1553, + "step": 1009 + }, + { + "epoch": 0.006006756113807213, + "grad_norm": 2.4334871768951416, + "learning_rate": 4.999555761208484e-05, + "loss": 7.1898, + "step": 1010 + }, + { + "epoch": 0.00601270339708821, + "grad_norm": 2.3159971237182617, + "learning_rate": 4.999554880245669e-05, + "loss": 7.0642, + "step": 1011 + }, + { + "epoch": 0.0060186506803692075, + "grad_norm": 2.9773905277252197, + "learning_rate": 4.9995539984102854e-05, + "loss": 7.3285, + "step": 1012 + }, + { + "epoch": 0.006024597963650204, + "grad_norm": 3.444267749786377, + "learning_rate": 4.999553115702334e-05, + "loss": 7.1263, + "step": 1013 + }, + { + "epoch": 0.006030545246931202, + "grad_norm": 2.6518173217773438, + "learning_rate": 4.9995522321218136e-05, + "loss": 7.3915, + "step": 1014 + }, + { + "epoch": 0.006036492530212199, + "grad_norm": 2.46230149269104, + "learning_rate": 4.9995513476687254e-05, + "loss": 7.1808, + "step": 1015 + }, + { + "epoch": 0.0060424398134931966, + "grad_norm": 2.2243192195892334, + "learning_rate": 4.99955046234307e-05, + "loss": 7.4262, + "step": 1016 + }, + { + "epoch": 0.006048387096774193, + "grad_norm": 3.0834670066833496, + "learning_rate": 4.999549576144847e-05, + "loss": 7.4028, + "step": 1017 + }, + { + "epoch": 0.006054334380055191, + "grad_norm": 3.2453930377960205, + "learning_rate": 4.9995486890740573e-05, + "loss": 7.5537, + "step": 1018 + }, + { + "epoch": 0.006060281663336188, + "grad_norm": 2.7142229080200195, + "learning_rate": 4.9995478011307015e-05, + "loss": 7.4131, + "step": 1019 + }, + { + "epoch": 0.006066228946617186, + "grad_norm": 2.9567463397979736, + "learning_rate": 4.9995469123147784e-05, + "loss": 7.5969, + "step": 1020 + }, + { + "epoch": 0.006072176229898182, + "grad_norm": 2.5698695182800293, + "learning_rate": 4.99954602262629e-05, + "loss": 7.2721, + "step": 1021 + }, + { + "epoch": 0.00607812351317918, + "grad_norm": 2.3958864212036133, + "learning_rate": 4.999545132065235e-05, + "loss": 7.3414, + "step": 1022 + }, + { + "epoch": 0.006084070796460177, + "grad_norm": 2.528024911880493, + "learning_rate": 4.9995442406316156e-05, + "loss": 7.2821, + "step": 1023 + }, + { + "epoch": 0.006090018079741175, + "grad_norm": 2.6904075145721436, + "learning_rate": 4.999543348325431e-05, + "loss": 7.3726, + "step": 1024 + }, + { + "epoch": 0.006095965363022171, + "grad_norm": 2.8618202209472656, + "learning_rate": 4.999542455146681e-05, + "loss": 7.4232, + "step": 1025 + }, + { + "epoch": 0.006101912646303169, + "grad_norm": 1.978455662727356, + "learning_rate": 4.999541561095367e-05, + "loss": 7.5949, + "step": 1026 + }, + { + "epoch": 0.006107859929584166, + "grad_norm": 2.882568836212158, + "learning_rate": 4.999540666171489e-05, + "loss": 7.4868, + "step": 1027 + }, + { + "epoch": 0.006113807212865164, + "grad_norm": 2.9586474895477295, + "learning_rate": 4.999539770375047e-05, + "loss": 7.1556, + "step": 1028 + }, + { + "epoch": 0.00611975449614616, + "grad_norm": 2.5675363540649414, + "learning_rate": 4.999538873706041e-05, + "loss": 7.3306, + "step": 1029 + }, + { + "epoch": 0.006125701779427157, + "grad_norm": 3.440857410430908, + "learning_rate": 4.999537976164472e-05, + "loss": 7.3654, + "step": 1030 + }, + { + "epoch": 0.006131649062708155, + "grad_norm": 3.7741217613220215, + "learning_rate": 4.999537077750341e-05, + "loss": 6.8088, + "step": 1031 + }, + { + "epoch": 0.006137596345989152, + "grad_norm": 3.801609754562378, + "learning_rate": 4.999536178463647e-05, + "loss": 6.989, + "step": 1032 + }, + { + "epoch": 0.0061435436292701495, + "grad_norm": 2.627225875854492, + "learning_rate": 4.9995352783043905e-05, + "loss": 7.4066, + "step": 1033 + }, + { + "epoch": 0.006149490912551146, + "grad_norm": 3.3529040813446045, + "learning_rate": 4.9995343772725725e-05, + "loss": 7.0403, + "step": 1034 + }, + { + "epoch": 0.006155438195832144, + "grad_norm": 3.248558521270752, + "learning_rate": 4.999533475368192e-05, + "loss": 7.2664, + "step": 1035 + }, + { + "epoch": 0.006161385479113141, + "grad_norm": 3.1260814666748047, + "learning_rate": 4.9995325725912515e-05, + "loss": 7.3257, + "step": 1036 + }, + { + "epoch": 0.0061673327623941385, + "grad_norm": 2.379659414291382, + "learning_rate": 4.999531668941748e-05, + "loss": 7.4448, + "step": 1037 + }, + { + "epoch": 0.006173280045675135, + "grad_norm": 2.8478498458862305, + "learning_rate": 4.999530764419685e-05, + "loss": 7.3892, + "step": 1038 + }, + { + "epoch": 0.006179227328956133, + "grad_norm": 4.104954719543457, + "learning_rate": 4.999529859025062e-05, + "loss": 7.5172, + "step": 1039 + }, + { + "epoch": 0.00618517461223713, + "grad_norm": 2.50160813331604, + "learning_rate": 4.999528952757879e-05, + "loss": 7.1894, + "step": 1040 + }, + { + "epoch": 0.0061911218955181275, + "grad_norm": 2.5545871257781982, + "learning_rate": 4.999528045618136e-05, + "loss": 7.3892, + "step": 1041 + }, + { + "epoch": 0.006197069178799124, + "grad_norm": 2.9980626106262207, + "learning_rate": 4.999527137605833e-05, + "loss": 7.3517, + "step": 1042 + }, + { + "epoch": 0.006203016462080122, + "grad_norm": 2.5920562744140625, + "learning_rate": 4.999526228720971e-05, + "loss": 7.1716, + "step": 1043 + }, + { + "epoch": 0.006208963745361119, + "grad_norm": 2.5224244594573975, + "learning_rate": 4.999525318963551e-05, + "loss": 7.1892, + "step": 1044 + }, + { + "epoch": 0.0062149110286421165, + "grad_norm": 2.7092106342315674, + "learning_rate": 4.999524408333572e-05, + "loss": 7.178, + "step": 1045 + }, + { + "epoch": 0.006220858311923113, + "grad_norm": 2.523320198059082, + "learning_rate": 4.999523496831035e-05, + "loss": 7.1486, + "step": 1046 + }, + { + "epoch": 0.006226805595204111, + "grad_norm": 2.4491217136383057, + "learning_rate": 4.99952258445594e-05, + "loss": 7.121, + "step": 1047 + }, + { + "epoch": 0.006232752878485108, + "grad_norm": 2.29109263420105, + "learning_rate": 4.9995216712082875e-05, + "loss": 7.4323, + "step": 1048 + }, + { + "epoch": 0.0062387001617661055, + "grad_norm": 2.5234057903289795, + "learning_rate": 4.9995207570880783e-05, + "loss": 7.1552, + "step": 1049 + }, + { + "epoch": 0.006244647445047102, + "grad_norm": 2.301316499710083, + "learning_rate": 4.9995198420953115e-05, + "loss": 7.3625, + "step": 1050 + }, + { + "epoch": 0.0062505947283281, + "grad_norm": 2.4358527660369873, + "learning_rate": 4.999518926229989e-05, + "loss": 7.2462, + "step": 1051 + }, + { + "epoch": 0.006256542011609097, + "grad_norm": 2.3915181159973145, + "learning_rate": 4.999518009492109e-05, + "loss": 7.173, + "step": 1052 + }, + { + "epoch": 0.0062624892948900945, + "grad_norm": 2.5529091358184814, + "learning_rate": 4.999517091881674e-05, + "loss": 7.2463, + "step": 1053 + }, + { + "epoch": 0.006268436578171091, + "grad_norm": 3.235435724258423, + "learning_rate": 4.999516173398683e-05, + "loss": 7.1149, + "step": 1054 + }, + { + "epoch": 0.006274383861452089, + "grad_norm": 2.692140817642212, + "learning_rate": 4.9995152540431375e-05, + "loss": 7.3554, + "step": 1055 + }, + { + "epoch": 0.006280331144733086, + "grad_norm": 2.910116195678711, + "learning_rate": 4.999514333815036e-05, + "loss": 7.4424, + "step": 1056 + }, + { + "epoch": 0.0062862784280140836, + "grad_norm": 2.897463798522949, + "learning_rate": 4.9995134127143804e-05, + "loss": 7.2345, + "step": 1057 + }, + { + "epoch": 0.00629222571129508, + "grad_norm": 2.5925514698028564, + "learning_rate": 4.999512490741171e-05, + "loss": 7.1539, + "step": 1058 + }, + { + "epoch": 0.006298172994576078, + "grad_norm": 2.693816900253296, + "learning_rate": 4.999511567895407e-05, + "loss": 7.0905, + "step": 1059 + }, + { + "epoch": 0.006304120277857075, + "grad_norm": 3.3717474937438965, + "learning_rate": 4.9995106441770896e-05, + "loss": 7.1407, + "step": 1060 + }, + { + "epoch": 0.006310067561138072, + "grad_norm": 2.6128973960876465, + "learning_rate": 4.999509719586218e-05, + "loss": 7.2748, + "step": 1061 + }, + { + "epoch": 0.006316014844419069, + "grad_norm": 2.24324369430542, + "learning_rate": 4.999508794122795e-05, + "loss": 7.2553, + "step": 1062 + }, + { + "epoch": 0.006321962127700066, + "grad_norm": 2.7593698501586914, + "learning_rate": 4.999507867786818e-05, + "loss": 7.1039, + "step": 1063 + }, + { + "epoch": 0.006327909410981064, + "grad_norm": 2.6210618019104004, + "learning_rate": 4.999506940578289e-05, + "loss": 7.0247, + "step": 1064 + }, + { + "epoch": 0.006333856694262061, + "grad_norm": 2.410187244415283, + "learning_rate": 4.9995060124972084e-05, + "loss": 7.3931, + "step": 1065 + }, + { + "epoch": 0.006339803977543058, + "grad_norm": 2.795302391052246, + "learning_rate": 4.999505083543575e-05, + "loss": 7.3168, + "step": 1066 + }, + { + "epoch": 0.006345751260824055, + "grad_norm": 2.3720662593841553, + "learning_rate": 4.999504153717391e-05, + "loss": 7.3719, + "step": 1067 + }, + { + "epoch": 0.006351698544105053, + "grad_norm": 2.721585988998413, + "learning_rate": 4.9995032230186556e-05, + "loss": 7.3847, + "step": 1068 + }, + { + "epoch": 0.00635764582738605, + "grad_norm": 2.967153549194336, + "learning_rate": 4.99950229144737e-05, + "loss": 7.3224, + "step": 1069 + }, + { + "epoch": 0.006363593110667047, + "grad_norm": 3.8144783973693848, + "learning_rate": 4.999501359003533e-05, + "loss": 7.0767, + "step": 1070 + }, + { + "epoch": 0.006369540393948044, + "grad_norm": 3.7694199085235596, + "learning_rate": 4.999500425687147e-05, + "loss": 7.4486, + "step": 1071 + }, + { + "epoch": 0.006375487677229042, + "grad_norm": 2.9668312072753906, + "learning_rate": 4.999499491498211e-05, + "loss": 7.3415, + "step": 1072 + }, + { + "epoch": 0.006381434960510039, + "grad_norm": 4.196050643920898, + "learning_rate": 4.999498556436725e-05, + "loss": 7.3784, + "step": 1073 + }, + { + "epoch": 0.0063873822437910364, + "grad_norm": 4.676602363586426, + "learning_rate": 4.99949762050269e-05, + "loss": 7.3773, + "step": 1074 + }, + { + "epoch": 0.006393329527072033, + "grad_norm": 2.8828656673431396, + "learning_rate": 4.999496683696107e-05, + "loss": 7.2359, + "step": 1075 + }, + { + "epoch": 0.006399276810353031, + "grad_norm": 2.7532308101654053, + "learning_rate": 4.9994957460169745e-05, + "loss": 7.356, + "step": 1076 + }, + { + "epoch": 0.006405224093634028, + "grad_norm": 5.535451412200928, + "learning_rate": 4.999494807465293e-05, + "loss": 7.261, + "step": 1077 + }, + { + "epoch": 0.0064111713769150255, + "grad_norm": 3.6439530849456787, + "learning_rate": 4.999493868041066e-05, + "loss": 7.4664, + "step": 1078 + }, + { + "epoch": 0.006417118660196022, + "grad_norm": 3.563948154449463, + "learning_rate": 4.99949292774429e-05, + "loss": 7.0427, + "step": 1079 + }, + { + "epoch": 0.00642306594347702, + "grad_norm": 3.6243784427642822, + "learning_rate": 4.9994919865749675e-05, + "loss": 7.3292, + "step": 1080 + }, + { + "epoch": 0.006429013226758017, + "grad_norm": 5.1197590827941895, + "learning_rate": 4.999491044533098e-05, + "loss": 7.3717, + "step": 1081 + }, + { + "epoch": 0.0064349605100390145, + "grad_norm": 4.3969902992248535, + "learning_rate": 4.999490101618682e-05, + "loss": 7.2875, + "step": 1082 + }, + { + "epoch": 0.006440907793320011, + "grad_norm": 2.6302945613861084, + "learning_rate": 4.999489157831719e-05, + "loss": 7.1958, + "step": 1083 + }, + { + "epoch": 0.006446855076601009, + "grad_norm": 3.782078504562378, + "learning_rate": 4.9994882131722116e-05, + "loss": 7.2951, + "step": 1084 + }, + { + "epoch": 0.006452802359882006, + "grad_norm": 3.432082414627075, + "learning_rate": 4.999487267640158e-05, + "loss": 7.0974, + "step": 1085 + }, + { + "epoch": 0.0064587496431630035, + "grad_norm": 3.364793300628662, + "learning_rate": 4.999486321235559e-05, + "loss": 7.0847, + "step": 1086 + }, + { + "epoch": 0.006464696926444, + "grad_norm": 2.7063019275665283, + "learning_rate": 4.999485373958416e-05, + "loss": 7.1421, + "step": 1087 + }, + { + "epoch": 0.006470644209724998, + "grad_norm": 3.0648648738861084, + "learning_rate": 4.999484425808727e-05, + "loss": 7.2723, + "step": 1088 + }, + { + "epoch": 0.006476591493005995, + "grad_norm": 3.3968300819396973, + "learning_rate": 4.999483476786495e-05, + "loss": 7.1438, + "step": 1089 + }, + { + "epoch": 0.0064825387762869925, + "grad_norm": 2.864647150039673, + "learning_rate": 4.999482526891719e-05, + "loss": 7.1512, + "step": 1090 + }, + { + "epoch": 0.006488486059567989, + "grad_norm": 2.577043056488037, + "learning_rate": 4.999481576124399e-05, + "loss": 6.8914, + "step": 1091 + }, + { + "epoch": 0.006494433342848986, + "grad_norm": 2.83754563331604, + "learning_rate": 4.999480624484536e-05, + "loss": 6.9999, + "step": 1092 + }, + { + "epoch": 0.006500380626129984, + "grad_norm": 3.5623857975006104, + "learning_rate": 4.999479671972131e-05, + "loss": 7.0567, + "step": 1093 + }, + { + "epoch": 0.006506327909410981, + "grad_norm": 2.35555362701416, + "learning_rate": 4.9994787185871814e-05, + "loss": 7.3075, + "step": 1094 + }, + { + "epoch": 0.006512275192691978, + "grad_norm": 3.8677117824554443, + "learning_rate": 4.9994777643296914e-05, + "loss": 7.3608, + "step": 1095 + }, + { + "epoch": 0.006518222475972975, + "grad_norm": 3.8163843154907227, + "learning_rate": 4.999476809199659e-05, + "loss": 7.4368, + "step": 1096 + }, + { + "epoch": 0.006524169759253973, + "grad_norm": 2.5424652099609375, + "learning_rate": 4.999475853197085e-05, + "loss": 7.4968, + "step": 1097 + }, + { + "epoch": 0.00653011704253497, + "grad_norm": 2.876898765563965, + "learning_rate": 4.99947489632197e-05, + "loss": 6.9948, + "step": 1098 + }, + { + "epoch": 0.006536064325815967, + "grad_norm": 3.3934860229492188, + "learning_rate": 4.999473938574314e-05, + "loss": 6.9588, + "step": 1099 + }, + { + "epoch": 0.006542011609096964, + "grad_norm": 2.1184024810791016, + "learning_rate": 4.9994729799541176e-05, + "loss": 7.1933, + "step": 1100 + }, + { + "epoch": 0.006547958892377962, + "grad_norm": 2.2882895469665527, + "learning_rate": 4.999472020461381e-05, + "loss": 7.0796, + "step": 1101 + }, + { + "epoch": 0.006553906175658959, + "grad_norm": 3.239429235458374, + "learning_rate": 4.9994710600961045e-05, + "loss": 6.9535, + "step": 1102 + }, + { + "epoch": 0.006559853458939956, + "grad_norm": 2.4653263092041016, + "learning_rate": 4.9994700988582884e-05, + "loss": 6.9316, + "step": 1103 + }, + { + "epoch": 0.006565800742220953, + "grad_norm": 2.511516571044922, + "learning_rate": 4.999469136747933e-05, + "loss": 6.9844, + "step": 1104 + }, + { + "epoch": 0.006571748025501951, + "grad_norm": 2.9725844860076904, + "learning_rate": 4.9994681737650384e-05, + "loss": 7.1955, + "step": 1105 + }, + { + "epoch": 0.006577695308782948, + "grad_norm": 3.04697585105896, + "learning_rate": 4.9994672099096066e-05, + "loss": 7.1044, + "step": 1106 + }, + { + "epoch": 0.006583642592063945, + "grad_norm": 3.395076274871826, + "learning_rate": 4.999466245181635e-05, + "loss": 7.1968, + "step": 1107 + }, + { + "epoch": 0.006589589875344942, + "grad_norm": 2.362884044647217, + "learning_rate": 4.999465279581127e-05, + "loss": 7.3114, + "step": 1108 + }, + { + "epoch": 0.00659553715862594, + "grad_norm": 2.730980396270752, + "learning_rate": 4.99946431310808e-05, + "loss": 7.1978, + "step": 1109 + }, + { + "epoch": 0.006601484441906937, + "grad_norm": 3.288687229156494, + "learning_rate": 4.9994633457624974e-05, + "loss": 7.4397, + "step": 1110 + }, + { + "epoch": 0.006607431725187934, + "grad_norm": 3.3060662746429443, + "learning_rate": 4.999462377544377e-05, + "loss": 7.1638, + "step": 1111 + }, + { + "epoch": 0.006613379008468931, + "grad_norm": 2.2697036266326904, + "learning_rate": 4.9994614084537204e-05, + "loss": 7.2654, + "step": 1112 + }, + { + "epoch": 0.006619326291749929, + "grad_norm": 2.330495595932007, + "learning_rate": 4.999460438490528e-05, + "loss": 7.2132, + "step": 1113 + }, + { + "epoch": 0.006625273575030926, + "grad_norm": 2.8239340782165527, + "learning_rate": 4.999459467654799e-05, + "loss": 7.3477, + "step": 1114 + }, + { + "epoch": 0.0066312208583119234, + "grad_norm": 2.591614246368408, + "learning_rate": 4.999458495946535e-05, + "loss": 7.0377, + "step": 1115 + }, + { + "epoch": 0.00663716814159292, + "grad_norm": 4.554818630218506, + "learning_rate": 4.999457523365736e-05, + "loss": 7.1266, + "step": 1116 + }, + { + "epoch": 0.006643115424873918, + "grad_norm": 2.21018123626709, + "learning_rate": 4.999456549912401e-05, + "loss": 7.1433, + "step": 1117 + }, + { + "epoch": 0.006649062708154915, + "grad_norm": 2.0298593044281006, + "learning_rate": 4.999455575586533e-05, + "loss": 7.257, + "step": 1118 + }, + { + "epoch": 0.0066550099914359125, + "grad_norm": 2.4532642364501953, + "learning_rate": 4.9994546003881305e-05, + "loss": 7.0618, + "step": 1119 + }, + { + "epoch": 0.006660957274716909, + "grad_norm": 2.428380012512207, + "learning_rate": 4.999453624317194e-05, + "loss": 7.2039, + "step": 1120 + }, + { + "epoch": 0.006666904557997907, + "grad_norm": 2.5572609901428223, + "learning_rate": 4.999452647373724e-05, + "loss": 7.0991, + "step": 1121 + }, + { + "epoch": 0.006672851841278904, + "grad_norm": 2.379640817642212, + "learning_rate": 4.999451669557721e-05, + "loss": 7.1424, + "step": 1122 + }, + { + "epoch": 0.006678799124559901, + "grad_norm": 2.5764007568359375, + "learning_rate": 4.999450690869185e-05, + "loss": 7.1218, + "step": 1123 + }, + { + "epoch": 0.006684746407840898, + "grad_norm": 2.6560606956481934, + "learning_rate": 4.999449711308117e-05, + "loss": 7.2994, + "step": 1124 + }, + { + "epoch": 0.006690693691121895, + "grad_norm": 2.4687581062316895, + "learning_rate": 4.999448730874518e-05, + "loss": 7.4169, + "step": 1125 + }, + { + "epoch": 0.006696640974402893, + "grad_norm": 2.8232173919677734, + "learning_rate": 4.999447749568386e-05, + "loss": 7.291, + "step": 1126 + }, + { + "epoch": 0.00670258825768389, + "grad_norm": 2.6960325241088867, + "learning_rate": 4.9994467673897224e-05, + "loss": 7.3162, + "step": 1127 + }, + { + "epoch": 0.006708535540964887, + "grad_norm": 2.222391366958618, + "learning_rate": 4.999445784338528e-05, + "loss": 7.221, + "step": 1128 + }, + { + "epoch": 0.006714482824245884, + "grad_norm": 2.334995985031128, + "learning_rate": 4.9994448004148024e-05, + "loss": 7.4813, + "step": 1129 + }, + { + "epoch": 0.006720430107526882, + "grad_norm": 2.653491497039795, + "learning_rate": 4.999443815618548e-05, + "loss": 7.3515, + "step": 1130 + }, + { + "epoch": 0.006726377390807879, + "grad_norm": 2.6943631172180176, + "learning_rate": 4.999442829949762e-05, + "loss": 7.2674, + "step": 1131 + }, + { + "epoch": 0.006732324674088876, + "grad_norm": 2.395573377609253, + "learning_rate": 4.999441843408447e-05, + "loss": 7.483, + "step": 1132 + }, + { + "epoch": 0.006738271957369873, + "grad_norm": 2.3801541328430176, + "learning_rate": 4.999440855994603e-05, + "loss": 7.3355, + "step": 1133 + }, + { + "epoch": 0.006744219240650871, + "grad_norm": 2.8566555976867676, + "learning_rate": 4.999439867708229e-05, + "loss": 6.8323, + "step": 1134 + }, + { + "epoch": 0.006750166523931868, + "grad_norm": 2.5987985134124756, + "learning_rate": 4.999438878549327e-05, + "loss": 6.957, + "step": 1135 + }, + { + "epoch": 0.006756113807212865, + "grad_norm": 2.4411563873291016, + "learning_rate": 4.9994378885178964e-05, + "loss": 6.9935, + "step": 1136 + }, + { + "epoch": 0.006762061090493862, + "grad_norm": 2.4227802753448486, + "learning_rate": 4.9994368976139386e-05, + "loss": 7.2856, + "step": 1137 + }, + { + "epoch": 0.00676800837377486, + "grad_norm": 2.55317759513855, + "learning_rate": 4.999435905837453e-05, + "loss": 7.1741, + "step": 1138 + }, + { + "epoch": 0.006773955657055857, + "grad_norm": 2.3329968452453613, + "learning_rate": 4.9994349131884396e-05, + "loss": 7.2007, + "step": 1139 + }, + { + "epoch": 0.006779902940336854, + "grad_norm": 2.538499593734741, + "learning_rate": 4.999433919666899e-05, + "loss": 7.1755, + "step": 1140 + }, + { + "epoch": 0.006785850223617851, + "grad_norm": 2.3580374717712402, + "learning_rate": 4.999432925272833e-05, + "loss": 7.2249, + "step": 1141 + }, + { + "epoch": 0.006791797506898849, + "grad_norm": 2.2783255577087402, + "learning_rate": 4.99943193000624e-05, + "loss": 7.3627, + "step": 1142 + }, + { + "epoch": 0.006797744790179846, + "grad_norm": 3.0798208713531494, + "learning_rate": 4.999430933867122e-05, + "loss": 7.2718, + "step": 1143 + }, + { + "epoch": 0.006803692073460843, + "grad_norm": 2.703232526779175, + "learning_rate": 4.9994299368554776e-05, + "loss": 7.116, + "step": 1144 + }, + { + "epoch": 0.00680963935674184, + "grad_norm": 2.480327606201172, + "learning_rate": 4.9994289389713076e-05, + "loss": 6.9743, + "step": 1145 + }, + { + "epoch": 0.006815586640022838, + "grad_norm": 2.2707130908966064, + "learning_rate": 4.9994279402146137e-05, + "loss": 6.9919, + "step": 1146 + }, + { + "epoch": 0.006821533923303835, + "grad_norm": 2.0424580574035645, + "learning_rate": 4.999426940585396e-05, + "loss": 7.0366, + "step": 1147 + }, + { + "epoch": 0.006827481206584832, + "grad_norm": 1.9720054864883423, + "learning_rate": 4.999425940083653e-05, + "loss": 6.8622, + "step": 1148 + }, + { + "epoch": 0.006833428489865829, + "grad_norm": 2.7109742164611816, + "learning_rate": 4.9994249387093864e-05, + "loss": 7.5375, + "step": 1149 + }, + { + "epoch": 0.006839375773146827, + "grad_norm": 2.267328977584839, + "learning_rate": 4.999423936462596e-05, + "loss": 7.5606, + "step": 1150 + }, + { + "epoch": 0.006845323056427824, + "grad_norm": 2.958360433578491, + "learning_rate": 4.999422933343283e-05, + "loss": 7.3503, + "step": 1151 + }, + { + "epoch": 0.006851270339708821, + "grad_norm": 2.2681283950805664, + "learning_rate": 4.9994219293514475e-05, + "loss": 6.9278, + "step": 1152 + }, + { + "epoch": 0.006857217622989818, + "grad_norm": 2.4755337238311768, + "learning_rate": 4.999420924487089e-05, + "loss": 7.1385, + "step": 1153 + }, + { + "epoch": 0.006863164906270815, + "grad_norm": 2.283277988433838, + "learning_rate": 4.999419918750209e-05, + "loss": 6.9287, + "step": 1154 + }, + { + "epoch": 0.006869112189551813, + "grad_norm": 2.3692893981933594, + "learning_rate": 4.999418912140808e-05, + "loss": 7.0648, + "step": 1155 + }, + { + "epoch": 0.00687505947283281, + "grad_norm": 2.2676453590393066, + "learning_rate": 4.999417904658884e-05, + "loss": 6.9754, + "step": 1156 + }, + { + "epoch": 0.006881006756113807, + "grad_norm": 2.4106669425964355, + "learning_rate": 4.9994168963044405e-05, + "loss": 7.033, + "step": 1157 + }, + { + "epoch": 0.006886954039394804, + "grad_norm": 2.947758913040161, + "learning_rate": 4.9994158870774754e-05, + "loss": 7.0821, + "step": 1158 + }, + { + "epoch": 0.006892901322675802, + "grad_norm": 2.5338058471679688, + "learning_rate": 4.9994148769779905e-05, + "loss": 6.9426, + "step": 1159 + }, + { + "epoch": 0.006898848605956799, + "grad_norm": 2.4848148822784424, + "learning_rate": 4.999413866005985e-05, + "loss": 7.2488, + "step": 1160 + }, + { + "epoch": 0.006904795889237796, + "grad_norm": 2.444077730178833, + "learning_rate": 4.999412854161461e-05, + "loss": 6.871, + "step": 1161 + }, + { + "epoch": 0.006910743172518793, + "grad_norm": 2.376962661743164, + "learning_rate": 4.9994118414444174e-05, + "loss": 7.0258, + "step": 1162 + }, + { + "epoch": 0.006916690455799791, + "grad_norm": 3.502023458480835, + "learning_rate": 4.9994108278548545e-05, + "loss": 7.4869, + "step": 1163 + }, + { + "epoch": 0.006922637739080788, + "grad_norm": 3.117741584777832, + "learning_rate": 4.999409813392774e-05, + "loss": 7.4437, + "step": 1164 + }, + { + "epoch": 0.006928585022361785, + "grad_norm": 3.805560827255249, + "learning_rate": 4.999408798058175e-05, + "loss": 7.3796, + "step": 1165 + }, + { + "epoch": 0.006934532305642782, + "grad_norm": 3.67065167427063, + "learning_rate": 4.9994077818510576e-05, + "loss": 7.2304, + "step": 1166 + }, + { + "epoch": 0.00694047958892378, + "grad_norm": 2.5749545097351074, + "learning_rate": 4.9994067647714236e-05, + "loss": 7.0943, + "step": 1167 + }, + { + "epoch": 0.006946426872204777, + "grad_norm": 2.561405897140503, + "learning_rate": 4.9994057468192724e-05, + "loss": 6.9496, + "step": 1168 + }, + { + "epoch": 0.006952374155485774, + "grad_norm": 2.477344512939453, + "learning_rate": 4.999404727994604e-05, + "loss": 7.3494, + "step": 1169 + }, + { + "epoch": 0.006958321438766771, + "grad_norm": 2.897580146789551, + "learning_rate": 4.999403708297419e-05, + "loss": 7.6081, + "step": 1170 + }, + { + "epoch": 0.006964268722047769, + "grad_norm": 3.899249792098999, + "learning_rate": 4.999402687727719e-05, + "loss": 7.4448, + "step": 1171 + }, + { + "epoch": 0.006970216005328766, + "grad_norm": 3.0791561603546143, + "learning_rate": 4.9994016662855025e-05, + "loss": 7.1616, + "step": 1172 + }, + { + "epoch": 0.006976163288609763, + "grad_norm": 2.8212931156158447, + "learning_rate": 4.999400643970771e-05, + "loss": 7.1824, + "step": 1173 + }, + { + "epoch": 0.00698211057189076, + "grad_norm": 4.33271598815918, + "learning_rate": 4.9993996207835246e-05, + "loss": 7.2432, + "step": 1174 + }, + { + "epoch": 0.006988057855171758, + "grad_norm": 2.985125780105591, + "learning_rate": 4.999398596723764e-05, + "loss": 7.6521, + "step": 1175 + }, + { + "epoch": 0.006994005138452755, + "grad_norm": 3.1069905757904053, + "learning_rate": 4.9993975717914885e-05, + "loss": 7.0071, + "step": 1176 + }, + { + "epoch": 0.006999952421733752, + "grad_norm": 2.915214776992798, + "learning_rate": 4.9993965459866995e-05, + "loss": 7.6192, + "step": 1177 + }, + { + "epoch": 0.007005899705014749, + "grad_norm": 5.314033031463623, + "learning_rate": 4.999395519309397e-05, + "loss": 6.9447, + "step": 1178 + }, + { + "epoch": 0.007011846988295747, + "grad_norm": 2.2723114490509033, + "learning_rate": 4.999394491759581e-05, + "loss": 7.1228, + "step": 1179 + }, + { + "epoch": 0.007017794271576744, + "grad_norm": 2.936365842819214, + "learning_rate": 4.999393463337253e-05, + "loss": 7.136, + "step": 1180 + }, + { + "epoch": 0.007023741554857741, + "grad_norm": 2.864250898361206, + "learning_rate": 4.9993924340424115e-05, + "loss": 7.026, + "step": 1181 + }, + { + "epoch": 0.007029688838138738, + "grad_norm": 3.299370050430298, + "learning_rate": 4.9993914038750586e-05, + "loss": 7.1114, + "step": 1182 + }, + { + "epoch": 0.007035636121419736, + "grad_norm": 3.0609943866729736, + "learning_rate": 4.999390372835193e-05, + "loss": 7.3052, + "step": 1183 + }, + { + "epoch": 0.007041583404700733, + "grad_norm": 3.54488468170166, + "learning_rate": 4.9993893409228176e-05, + "loss": 7.4845, + "step": 1184 + }, + { + "epoch": 0.0070475306879817295, + "grad_norm": 2.5196385383605957, + "learning_rate": 4.99938830813793e-05, + "loss": 7.312, + "step": 1185 + }, + { + "epoch": 0.007053477971262727, + "grad_norm": 3.570802927017212, + "learning_rate": 4.9993872744805326e-05, + "loss": 7.0038, + "step": 1186 + }, + { + "epoch": 0.007059425254543724, + "grad_norm": 2.631058931350708, + "learning_rate": 4.999386239950624e-05, + "loss": 7.5574, + "step": 1187 + }, + { + "epoch": 0.007065372537824722, + "grad_norm": 3.027251958847046, + "learning_rate": 4.999385204548206e-05, + "loss": 6.9837, + "step": 1188 + }, + { + "epoch": 0.0070713198211057185, + "grad_norm": 3.00128173828125, + "learning_rate": 4.999384168273279e-05, + "loss": 7.4479, + "step": 1189 + }, + { + "epoch": 0.007077267104386716, + "grad_norm": 2.127028226852417, + "learning_rate": 4.999383131125842e-05, + "loss": 7.3609, + "step": 1190 + }, + { + "epoch": 0.007083214387667713, + "grad_norm": 2.375511646270752, + "learning_rate": 4.9993820931058965e-05, + "loss": 7.3695, + "step": 1191 + }, + { + "epoch": 0.007089161670948711, + "grad_norm": 2.527743101119995, + "learning_rate": 4.999381054213442e-05, + "loss": 7.1478, + "step": 1192 + }, + { + "epoch": 0.0070951089542297075, + "grad_norm": 2.1600632667541504, + "learning_rate": 4.99938001444848e-05, + "loss": 7.7111, + "step": 1193 + }, + { + "epoch": 0.007101056237510705, + "grad_norm": 2.3242850303649902, + "learning_rate": 4.99937897381101e-05, + "loss": 7.6751, + "step": 1194 + }, + { + "epoch": 0.007107003520791702, + "grad_norm": 3.4553158283233643, + "learning_rate": 4.9993779323010334e-05, + "loss": 7.775, + "step": 1195 + }, + { + "epoch": 0.0071129508040727, + "grad_norm": 2.4339516162872314, + "learning_rate": 4.999376889918549e-05, + "loss": 7.099, + "step": 1196 + }, + { + "epoch": 0.0071188980873536966, + "grad_norm": 2.531851291656494, + "learning_rate": 4.9993758466635574e-05, + "loss": 7.5222, + "step": 1197 + }, + { + "epoch": 0.007124845370634694, + "grad_norm": 2.6549220085144043, + "learning_rate": 4.999374802536061e-05, + "loss": 7.4917, + "step": 1198 + }, + { + "epoch": 0.007130792653915691, + "grad_norm": 2.9149320125579834, + "learning_rate": 4.999373757536058e-05, + "loss": 7.0438, + "step": 1199 + }, + { + "epoch": 0.007136739937196689, + "grad_norm": 3.0234971046447754, + "learning_rate": 4.999372711663549e-05, + "loss": 7.6838, + "step": 1200 + }, + { + "epoch": 0.007142687220477686, + "grad_norm": 2.4006800651550293, + "learning_rate": 4.999371664918535e-05, + "loss": 7.6607, + "step": 1201 + }, + { + "epoch": 0.007148634503758683, + "grad_norm": 2.6191699504852295, + "learning_rate": 4.9993706173010164e-05, + "loss": 7.4727, + "step": 1202 + }, + { + "epoch": 0.00715458178703968, + "grad_norm": 3.040844440460205, + "learning_rate": 4.999369568810993e-05, + "loss": 7.1459, + "step": 1203 + }, + { + "epoch": 0.007160529070320678, + "grad_norm": 2.8474466800689697, + "learning_rate": 4.9993685194484654e-05, + "loss": 7.4615, + "step": 1204 + }, + { + "epoch": 0.007166476353601675, + "grad_norm": 1.928662657737732, + "learning_rate": 4.999367469213435e-05, + "loss": 7.4259, + "step": 1205 + }, + { + "epoch": 0.007172423636882672, + "grad_norm": 2.369540214538574, + "learning_rate": 4.999366418105901e-05, + "loss": 6.9342, + "step": 1206 + }, + { + "epoch": 0.007178370920163669, + "grad_norm": 4.003239154815674, + "learning_rate": 4.999365366125863e-05, + "loss": 7.3289, + "step": 1207 + }, + { + "epoch": 0.007184318203444667, + "grad_norm": 4.491976261138916, + "learning_rate": 4.9993643132733234e-05, + "loss": 7.3479, + "step": 1208 + }, + { + "epoch": 0.007190265486725664, + "grad_norm": 2.3678557872772217, + "learning_rate": 4.9993632595482806e-05, + "loss": 7.3091, + "step": 1209 + }, + { + "epoch": 0.007196212770006661, + "grad_norm": 2.9310050010681152, + "learning_rate": 4.999362204950737e-05, + "loss": 7.1996, + "step": 1210 + }, + { + "epoch": 0.007202160053287658, + "grad_norm": 3.6861345767974854, + "learning_rate": 4.999361149480691e-05, + "loss": 7.43, + "step": 1211 + }, + { + "epoch": 0.007208107336568656, + "grad_norm": 2.657515287399292, + "learning_rate": 4.9993600931381446e-05, + "loss": 6.9888, + "step": 1212 + }, + { + "epoch": 0.007214054619849653, + "grad_norm": 2.8346996307373047, + "learning_rate": 4.999359035923097e-05, + "loss": 7.0366, + "step": 1213 + }, + { + "epoch": 0.00722000190313065, + "grad_norm": 3.494162082672119, + "learning_rate": 4.9993579778355487e-05, + "loss": 7.499, + "step": 1214 + }, + { + "epoch": 0.007225949186411647, + "grad_norm": 2.9848556518554688, + "learning_rate": 4.999356918875501e-05, + "loss": 7.2064, + "step": 1215 + }, + { + "epoch": 0.007231896469692645, + "grad_norm": 2.391390562057495, + "learning_rate": 4.999355859042953e-05, + "loss": 7.2752, + "step": 1216 + }, + { + "epoch": 0.007237843752973642, + "grad_norm": 2.872891902923584, + "learning_rate": 4.9993547983379065e-05, + "loss": 6.9865, + "step": 1217 + }, + { + "epoch": 0.0072437910362546385, + "grad_norm": 2.760213613510132, + "learning_rate": 4.99935373676036e-05, + "loss": 7.0211, + "step": 1218 + }, + { + "epoch": 0.007249738319535636, + "grad_norm": 2.8857531547546387, + "learning_rate": 4.9993526743103156e-05, + "loss": 6.9162, + "step": 1219 + }, + { + "epoch": 0.007255685602816633, + "grad_norm": 3.150836229324341, + "learning_rate": 4.999351610987772e-05, + "loss": 7.2929, + "step": 1220 + }, + { + "epoch": 0.007261632886097631, + "grad_norm": 2.2004289627075195, + "learning_rate": 4.999350546792732e-05, + "loss": 7.4729, + "step": 1221 + }, + { + "epoch": 0.0072675801693786275, + "grad_norm": 2.5004026889801025, + "learning_rate": 4.999349481725194e-05, + "loss": 7.5235, + "step": 1222 + }, + { + "epoch": 0.007273527452659625, + "grad_norm": 2.8355395793914795, + "learning_rate": 4.999348415785159e-05, + "loss": 7.3535, + "step": 1223 + }, + { + "epoch": 0.007279474735940622, + "grad_norm": 2.559330701828003, + "learning_rate": 4.9993473489726276e-05, + "loss": 6.9634, + "step": 1224 + }, + { + "epoch": 0.00728542201922162, + "grad_norm": 2.3559181690216064, + "learning_rate": 4.999346281287599e-05, + "loss": 6.9246, + "step": 1225 + }, + { + "epoch": 0.0072913693025026165, + "grad_norm": 2.3852717876434326, + "learning_rate": 4.999345212730075e-05, + "loss": 6.6417, + "step": 1226 + }, + { + "epoch": 0.007297316585783614, + "grad_norm": 2.2604117393493652, + "learning_rate": 4.999344143300055e-05, + "loss": 7.4182, + "step": 1227 + }, + { + "epoch": 0.007303263869064611, + "grad_norm": 2.57983660697937, + "learning_rate": 4.9993430729975396e-05, + "loss": 7.4841, + "step": 1228 + }, + { + "epoch": 0.007309211152345609, + "grad_norm": 2.653935670852661, + "learning_rate": 4.99934200182253e-05, + "loss": 7.5477, + "step": 1229 + }, + { + "epoch": 0.0073151584356266055, + "grad_norm": 2.0740158557891846, + "learning_rate": 4.999340929775026e-05, + "loss": 7.4359, + "step": 1230 + }, + { + "epoch": 0.007321105718907603, + "grad_norm": 2.62064528465271, + "learning_rate": 4.9993398568550275e-05, + "loss": 7.1817, + "step": 1231 + }, + { + "epoch": 0.0073270530021886, + "grad_norm": 2.318244457244873, + "learning_rate": 4.999338783062536e-05, + "loss": 7.1663, + "step": 1232 + }, + { + "epoch": 0.007333000285469598, + "grad_norm": 3.0533225536346436, + "learning_rate": 4.99933770839755e-05, + "loss": 7.3051, + "step": 1233 + }, + { + "epoch": 0.0073389475687505945, + "grad_norm": 4.821422100067139, + "learning_rate": 4.999336632860072e-05, + "loss": 7.3435, + "step": 1234 + }, + { + "epoch": 0.007344894852031592, + "grad_norm": 2.680873155593872, + "learning_rate": 4.999335556450101e-05, + "loss": 7.3447, + "step": 1235 + }, + { + "epoch": 0.007350842135312589, + "grad_norm": 3.287454605102539, + "learning_rate": 4.999334479167638e-05, + "loss": 7.1957, + "step": 1236 + }, + { + "epoch": 0.007356789418593587, + "grad_norm": 3.7452759742736816, + "learning_rate": 4.999333401012682e-05, + "loss": 7.2093, + "step": 1237 + }, + { + "epoch": 0.0073627367018745836, + "grad_norm": 3.363443374633789, + "learning_rate": 4.999332321985236e-05, + "loss": 7.297, + "step": 1238 + }, + { + "epoch": 0.007368683985155581, + "grad_norm": 3.070962905883789, + "learning_rate": 4.999331242085299e-05, + "loss": 7.0831, + "step": 1239 + }, + { + "epoch": 0.007374631268436578, + "grad_norm": 3.635183095932007, + "learning_rate": 4.9993301613128706e-05, + "loss": 7.3116, + "step": 1240 + }, + { + "epoch": 0.007380578551717576, + "grad_norm": 2.532179594039917, + "learning_rate": 4.9993290796679516e-05, + "loss": 7.5238, + "step": 1241 + }, + { + "epoch": 0.007386525834998573, + "grad_norm": 2.1147687435150146, + "learning_rate": 4.999327997150543e-05, + "loss": 7.2279, + "step": 1242 + }, + { + "epoch": 0.00739247311827957, + "grad_norm": 2.1221182346343994, + "learning_rate": 4.999326913760645e-05, + "loss": 7.6575, + "step": 1243 + }, + { + "epoch": 0.007398420401560567, + "grad_norm": 2.2920000553131104, + "learning_rate": 4.999325829498257e-05, + "loss": 7.5652, + "step": 1244 + }, + { + "epoch": 0.007404367684841565, + "grad_norm": 2.3444230556488037, + "learning_rate": 4.9993247443633814e-05, + "loss": 7.3992, + "step": 1245 + }, + { + "epoch": 0.007410314968122562, + "grad_norm": 2.2778663635253906, + "learning_rate": 4.9993236583560164e-05, + "loss": 7.1212, + "step": 1246 + }, + { + "epoch": 0.007416262251403559, + "grad_norm": 2.38369083404541, + "learning_rate": 4.999322571476164e-05, + "loss": 7.4605, + "step": 1247 + }, + { + "epoch": 0.007422209534684556, + "grad_norm": 3.578537702560425, + "learning_rate": 4.999321483723823e-05, + "loss": 7.1446, + "step": 1248 + }, + { + "epoch": 0.007428156817965553, + "grad_norm": 5.227176666259766, + "learning_rate": 4.9993203950989954e-05, + "loss": 7.2308, + "step": 1249 + }, + { + "epoch": 0.007434104101246551, + "grad_norm": 2.665844440460205, + "learning_rate": 4.9993193056016805e-05, + "loss": 7.102, + "step": 1250 + }, + { + "epoch": 0.007440051384527547, + "grad_norm": 4.462922096252441, + "learning_rate": 4.9993182152318796e-05, + "loss": 7.003, + "step": 1251 + }, + { + "epoch": 0.007445998667808545, + "grad_norm": 4.9459099769592285, + "learning_rate": 4.999317123989592e-05, + "loss": 7.1338, + "step": 1252 + }, + { + "epoch": 0.007451945951089542, + "grad_norm": 3.127427339553833, + "learning_rate": 4.9993160318748186e-05, + "loss": 7.045, + "step": 1253 + }, + { + "epoch": 0.00745789323437054, + "grad_norm": 3.03910231590271, + "learning_rate": 4.9993149388875606e-05, + "loss": 6.8523, + "step": 1254 + }, + { + "epoch": 0.0074638405176515365, + "grad_norm": 2.931033134460449, + "learning_rate": 4.9993138450278166e-05, + "loss": 7.3065, + "step": 1255 + }, + { + "epoch": 0.007469787800932534, + "grad_norm": 4.60735559463501, + "learning_rate": 4.999312750295588e-05, + "loss": 7.5384, + "step": 1256 + }, + { + "epoch": 0.007475735084213531, + "grad_norm": 3.0745065212249756, + "learning_rate": 4.9993116546908755e-05, + "loss": 7.6279, + "step": 1257 + }, + { + "epoch": 0.007481682367494529, + "grad_norm": 2.7158751487731934, + "learning_rate": 4.9993105582136804e-05, + "loss": 7.1885, + "step": 1258 + }, + { + "epoch": 0.0074876296507755255, + "grad_norm": 3.5049819946289062, + "learning_rate": 4.999309460864e-05, + "loss": 6.6833, + "step": 1259 + }, + { + "epoch": 0.007493576934056523, + "grad_norm": 3.229778289794922, + "learning_rate": 4.999308362641837e-05, + "loss": 6.784, + "step": 1260 + }, + { + "epoch": 0.00749952421733752, + "grad_norm": 2.7032854557037354, + "learning_rate": 4.999307263547191e-05, + "loss": 6.8003, + "step": 1261 + }, + { + "epoch": 0.007505471500618518, + "grad_norm": 5.892059326171875, + "learning_rate": 4.999306163580063e-05, + "loss": 7.2365, + "step": 1262 + }, + { + "epoch": 0.0075114187838995145, + "grad_norm": 5.8021135330200195, + "learning_rate": 4.999305062740453e-05, + "loss": 7.3822, + "step": 1263 + }, + { + "epoch": 0.007517366067180512, + "grad_norm": 5.1242899894714355, + "learning_rate": 4.9993039610283614e-05, + "loss": 7.2192, + "step": 1264 + }, + { + "epoch": 0.007523313350461509, + "grad_norm": 3.102980375289917, + "learning_rate": 4.9993028584437884e-05, + "loss": 7.4895, + "step": 1265 + }, + { + "epoch": 0.007529260633742507, + "grad_norm": 4.993838310241699, + "learning_rate": 4.999301754986735e-05, + "loss": 7.4771, + "step": 1266 + }, + { + "epoch": 0.0075352079170235035, + "grad_norm": 4.003589630126953, + "learning_rate": 4.999300650657201e-05, + "loss": 7.3591, + "step": 1267 + }, + { + "epoch": 0.007541155200304501, + "grad_norm": 3.6125710010528564, + "learning_rate": 4.999299545455187e-05, + "loss": 7.262, + "step": 1268 + }, + { + "epoch": 0.007547102483585498, + "grad_norm": 3.182196617126465, + "learning_rate": 4.999298439380693e-05, + "loss": 7.2689, + "step": 1269 + }, + { + "epoch": 0.007553049766866496, + "grad_norm": 2.428313732147217, + "learning_rate": 4.99929733243372e-05, + "loss": 7.2364, + "step": 1270 + }, + { + "epoch": 0.0075589970501474925, + "grad_norm": 2.673356771469116, + "learning_rate": 4.999296224614268e-05, + "loss": 7.2356, + "step": 1271 + }, + { + "epoch": 0.00756494433342849, + "grad_norm": 2.508026361465454, + "learning_rate": 4.9992951159223376e-05, + "loss": 7.1052, + "step": 1272 + }, + { + "epoch": 0.007570891616709487, + "grad_norm": 2.7501845359802246, + "learning_rate": 4.99929400635793e-05, + "loss": 7.5041, + "step": 1273 + }, + { + "epoch": 0.007576838899990485, + "grad_norm": 2.4604434967041016, + "learning_rate": 4.999292895921044e-05, + "loss": 7.5042, + "step": 1274 + }, + { + "epoch": 0.0075827861832714815, + "grad_norm": 2.4926865100860596, + "learning_rate": 4.99929178461168e-05, + "loss": 7.2104, + "step": 1275 + }, + { + "epoch": 0.007588733466552479, + "grad_norm": 2.631985664367676, + "learning_rate": 4.999290672429839e-05, + "loss": 6.8608, + "step": 1276 + }, + { + "epoch": 0.007594680749833476, + "grad_norm": 2.5684268474578857, + "learning_rate": 4.999289559375523e-05, + "loss": 7.1199, + "step": 1277 + }, + { + "epoch": 0.007600628033114474, + "grad_norm": 2.4312644004821777, + "learning_rate": 4.99928844544873e-05, + "loss": 7.1814, + "step": 1278 + }, + { + "epoch": 0.0076065753163954706, + "grad_norm": 2.794407367706299, + "learning_rate": 4.99928733064946e-05, + "loss": 7.2909, + "step": 1279 + }, + { + "epoch": 0.007612522599676467, + "grad_norm": 2.5903992652893066, + "learning_rate": 4.9992862149777166e-05, + "loss": 7.354, + "step": 1280 + }, + { + "epoch": 0.007618469882957465, + "grad_norm": 2.266364336013794, + "learning_rate": 4.999285098433497e-05, + "loss": 7.5697, + "step": 1281 + }, + { + "epoch": 0.007624417166238462, + "grad_norm": 3.1871070861816406, + "learning_rate": 4.999283981016803e-05, + "loss": 7.4393, + "step": 1282 + }, + { + "epoch": 0.00763036444951946, + "grad_norm": 2.137981653213501, + "learning_rate": 4.999282862727635e-05, + "loss": 7.3591, + "step": 1283 + }, + { + "epoch": 0.007636311732800456, + "grad_norm": 2.3166019916534424, + "learning_rate": 4.999281743565993e-05, + "loss": 7.4307, + "step": 1284 + }, + { + "epoch": 0.007642259016081454, + "grad_norm": 2.331110954284668, + "learning_rate": 4.999280623531878e-05, + "loss": 7.3214, + "step": 1285 + }, + { + "epoch": 0.007648206299362451, + "grad_norm": 2.7417728900909424, + "learning_rate": 4.999279502625289e-05, + "loss": 7.3593, + "step": 1286 + }, + { + "epoch": 0.007654153582643449, + "grad_norm": 3.089448928833008, + "learning_rate": 4.999278380846228e-05, + "loss": 7.3347, + "step": 1287 + }, + { + "epoch": 0.007660100865924445, + "grad_norm": 2.9446022510528564, + "learning_rate": 4.999277258194694e-05, + "loss": 7.3109, + "step": 1288 + }, + { + "epoch": 0.007666048149205443, + "grad_norm": 2.713355302810669, + "learning_rate": 4.9992761346706896e-05, + "loss": 7.2962, + "step": 1289 + }, + { + "epoch": 0.00767199543248644, + "grad_norm": 2.9480702877044678, + "learning_rate": 4.9992750102742125e-05, + "loss": 7.2081, + "step": 1290 + }, + { + "epoch": 0.007677942715767438, + "grad_norm": 2.737271785736084, + "learning_rate": 4.999273885005265e-05, + "loss": 7.2251, + "step": 1291 + }, + { + "epoch": 0.007683889999048434, + "grad_norm": 2.6954190731048584, + "learning_rate": 4.9992727588638466e-05, + "loss": 7.3437, + "step": 1292 + }, + { + "epoch": 0.007689837282329432, + "grad_norm": 3.0270752906799316, + "learning_rate": 4.999271631849958e-05, + "loss": 7.2516, + "step": 1293 + }, + { + "epoch": 0.007695784565610429, + "grad_norm": 2.824052333831787, + "learning_rate": 4.999270503963599e-05, + "loss": 7.2706, + "step": 1294 + }, + { + "epoch": 0.007701731848891427, + "grad_norm": 2.800713300704956, + "learning_rate": 4.999269375204771e-05, + "loss": 7.2497, + "step": 1295 + }, + { + "epoch": 0.0077076791321724234, + "grad_norm": 3.2510271072387695, + "learning_rate": 4.999268245573474e-05, + "loss": 7.025, + "step": 1296 + }, + { + "epoch": 0.007713626415453421, + "grad_norm": 3.095862627029419, + "learning_rate": 4.999267115069708e-05, + "loss": 7.1815, + "step": 1297 + }, + { + "epoch": 0.007719573698734418, + "grad_norm": 3.2238826751708984, + "learning_rate": 4.999265983693473e-05, + "loss": 7.2268, + "step": 1298 + }, + { + "epoch": 0.007725520982015416, + "grad_norm": 3.18687105178833, + "learning_rate": 4.999264851444771e-05, + "loss": 7.2076, + "step": 1299 + }, + { + "epoch": 0.0077314682652964125, + "grad_norm": 3.1385931968688965, + "learning_rate": 4.9992637183236016e-05, + "loss": 7.2323, + "step": 1300 + }, + { + "epoch": 0.00773741554857741, + "grad_norm": 2.3172361850738525, + "learning_rate": 4.999262584329964e-05, + "loss": 7.1225, + "step": 1301 + }, + { + "epoch": 0.007743362831858407, + "grad_norm": 3.3223013877868652, + "learning_rate": 4.99926144946386e-05, + "loss": 7.2108, + "step": 1302 + }, + { + "epoch": 0.007749310115139405, + "grad_norm": 3.197218894958496, + "learning_rate": 4.99926031372529e-05, + "loss": 7.5123, + "step": 1303 + }, + { + "epoch": 0.0077552573984204015, + "grad_norm": 2.8411800861358643, + "learning_rate": 4.999259177114254e-05, + "loss": 7.3047, + "step": 1304 + }, + { + "epoch": 0.007761204681701399, + "grad_norm": 2.7549736499786377, + "learning_rate": 4.9992580396307524e-05, + "loss": 7.3478, + "step": 1305 + }, + { + "epoch": 0.007767151964982396, + "grad_norm": 2.8829352855682373, + "learning_rate": 4.999256901274786e-05, + "loss": 7.1871, + "step": 1306 + }, + { + "epoch": 0.007773099248263394, + "grad_norm": 2.710076332092285, + "learning_rate": 4.999255762046354e-05, + "loss": 7.0891, + "step": 1307 + }, + { + "epoch": 0.0077790465315443905, + "grad_norm": 2.6598877906799316, + "learning_rate": 4.999254621945458e-05, + "loss": 7.6178, + "step": 1308 + }, + { + "epoch": 0.007784993814825388, + "grad_norm": 2.4012649059295654, + "learning_rate": 4.999253480972099e-05, + "loss": 7.5925, + "step": 1309 + }, + { + "epoch": 0.007790941098106385, + "grad_norm": 2.1501622200012207, + "learning_rate": 4.999252339126275e-05, + "loss": 7.6471, + "step": 1310 + }, + { + "epoch": 0.007796888381387382, + "grad_norm": 3.2150895595550537, + "learning_rate": 4.9992511964079886e-05, + "loss": 7.3995, + "step": 1311 + }, + { + "epoch": 0.0078028356646683795, + "grad_norm": 2.450465440750122, + "learning_rate": 4.9992500528172395e-05, + "loss": 7.219, + "step": 1312 + }, + { + "epoch": 0.007808782947949376, + "grad_norm": 2.714510679244995, + "learning_rate": 4.9992489083540274e-05, + "loss": 7.2023, + "step": 1313 + }, + { + "epoch": 0.007814730231230374, + "grad_norm": 2.660019636154175, + "learning_rate": 4.999247763018354e-05, + "loss": 6.8686, + "step": 1314 + }, + { + "epoch": 0.00782067751451137, + "grad_norm": 2.1031477451324463, + "learning_rate": 4.999246616810218e-05, + "loss": 7.305, + "step": 1315 + }, + { + "epoch": 0.007826624797792368, + "grad_norm": 3.0037856101989746, + "learning_rate": 4.999245469729622e-05, + "loss": 6.9788, + "step": 1316 + }, + { + "epoch": 0.007832572081073366, + "grad_norm": 3.1931207180023193, + "learning_rate": 4.999244321776565e-05, + "loss": 6.9312, + "step": 1317 + }, + { + "epoch": 0.007838519364354363, + "grad_norm": 2.7419891357421875, + "learning_rate": 4.999243172951047e-05, + "loss": 6.7732, + "step": 1318 + }, + { + "epoch": 0.00784446664763536, + "grad_norm": 2.772061824798584, + "learning_rate": 4.99924202325307e-05, + "loss": 6.9576, + "step": 1319 + }, + { + "epoch": 0.007850413930916357, + "grad_norm": 2.9300522804260254, + "learning_rate": 4.999240872682632e-05, + "loss": 6.8366, + "step": 1320 + }, + { + "epoch": 0.007856361214197355, + "grad_norm": 3.4697458744049072, + "learning_rate": 4.9992397212397365e-05, + "loss": 6.9234, + "step": 1321 + }, + { + "epoch": 0.007862308497478352, + "grad_norm": 3.044647693634033, + "learning_rate": 4.999238568924381e-05, + "loss": 6.8406, + "step": 1322 + }, + { + "epoch": 0.007868255780759349, + "grad_norm": 2.4429051876068115, + "learning_rate": 4.999237415736567e-05, + "loss": 6.9815, + "step": 1323 + }, + { + "epoch": 0.007874203064040346, + "grad_norm": 2.6193530559539795, + "learning_rate": 4.999236261676296e-05, + "loss": 7.3867, + "step": 1324 + }, + { + "epoch": 0.007880150347321344, + "grad_norm": 3.9543204307556152, + "learning_rate": 4.999235106743567e-05, + "loss": 7.2391, + "step": 1325 + }, + { + "epoch": 0.007886097630602341, + "grad_norm": 3.12777042388916, + "learning_rate": 4.9992339509383814e-05, + "loss": 7.0976, + "step": 1326 + }, + { + "epoch": 0.007892044913883338, + "grad_norm": 2.4543895721435547, + "learning_rate": 4.999232794260739e-05, + "loss": 7.1865, + "step": 1327 + }, + { + "epoch": 0.007897992197164335, + "grad_norm": 4.254832744598389, + "learning_rate": 4.999231636710639e-05, + "loss": 6.777, + "step": 1328 + }, + { + "epoch": 0.007903939480445333, + "grad_norm": 2.7835497856140137, + "learning_rate": 4.999230478288084e-05, + "loss": 6.8508, + "step": 1329 + }, + { + "epoch": 0.00790988676372633, + "grad_norm": 3.2724666595458984, + "learning_rate": 4.999229318993073e-05, + "loss": 6.7636, + "step": 1330 + }, + { + "epoch": 0.007915834047007327, + "grad_norm": 4.657248020172119, + "learning_rate": 4.9992281588256075e-05, + "loss": 7.3677, + "step": 1331 + }, + { + "epoch": 0.007921781330288324, + "grad_norm": 6.201416492462158, + "learning_rate": 4.999226997785686e-05, + "loss": 7.5804, + "step": 1332 + }, + { + "epoch": 0.007927728613569322, + "grad_norm": 4.955161094665527, + "learning_rate": 4.999225835873312e-05, + "loss": 7.1867, + "step": 1333 + }, + { + "epoch": 0.007933675896850319, + "grad_norm": 3.4105887413024902, + "learning_rate": 4.9992246730884826e-05, + "loss": 7.0948, + "step": 1334 + }, + { + "epoch": 0.007939623180131316, + "grad_norm": 2.514570951461792, + "learning_rate": 4.999223509431201e-05, + "loss": 6.9367, + "step": 1335 + }, + { + "epoch": 0.007945570463412313, + "grad_norm": 3.7689249515533447, + "learning_rate": 4.9992223449014654e-05, + "loss": 7.2209, + "step": 1336 + }, + { + "epoch": 0.007951517746693311, + "grad_norm": 4.997833728790283, + "learning_rate": 4.999221179499277e-05, + "loss": 7.3336, + "step": 1337 + }, + { + "epoch": 0.007957465029974308, + "grad_norm": 5.1314287185668945, + "learning_rate": 4.999220013224637e-05, + "loss": 6.933, + "step": 1338 + }, + { + "epoch": 0.007963412313255305, + "grad_norm": 3.708528518676758, + "learning_rate": 4.9992188460775447e-05, + "loss": 6.9598, + "step": 1339 + }, + { + "epoch": 0.007969359596536302, + "grad_norm": 3.029602289199829, + "learning_rate": 4.999217678058001e-05, + "loss": 7.3674, + "step": 1340 + }, + { + "epoch": 0.007975306879817299, + "grad_norm": 3.000312089920044, + "learning_rate": 4.999216509166006e-05, + "loss": 7.2705, + "step": 1341 + }, + { + "epoch": 0.007981254163098297, + "grad_norm": 4.852355480194092, + "learning_rate": 4.999215339401561e-05, + "loss": 7.1842, + "step": 1342 + }, + { + "epoch": 0.007987201446379294, + "grad_norm": 3.0430521965026855, + "learning_rate": 4.999214168764664e-05, + "loss": 7.5616, + "step": 1343 + }, + { + "epoch": 0.00799314872966029, + "grad_norm": 2.793760061264038, + "learning_rate": 4.999212997255319e-05, + "loss": 7.4867, + "step": 1344 + }, + { + "epoch": 0.007999096012941288, + "grad_norm": 3.516545295715332, + "learning_rate": 4.9992118248735245e-05, + "loss": 7.5857, + "step": 1345 + }, + { + "epoch": 0.008005043296222286, + "grad_norm": 4.272013187408447, + "learning_rate": 4.9992106516192796e-05, + "loss": 7.5686, + "step": 1346 + }, + { + "epoch": 0.008010990579503283, + "grad_norm": 3.176974058151245, + "learning_rate": 4.999209477492587e-05, + "loss": 7.1826, + "step": 1347 + }, + { + "epoch": 0.00801693786278428, + "grad_norm": 3.2615413665771484, + "learning_rate": 4.999208302493447e-05, + "loss": 7.3933, + "step": 1348 + }, + { + "epoch": 0.008022885146065277, + "grad_norm": 2.9548113346099854, + "learning_rate": 4.999207126621858e-05, + "loss": 7.339, + "step": 1349 + }, + { + "epoch": 0.008028832429346275, + "grad_norm": 3.445829153060913, + "learning_rate": 4.999205949877822e-05, + "loss": 7.4223, + "step": 1350 + }, + { + "epoch": 0.008034779712627272, + "grad_norm": 3.471991777420044, + "learning_rate": 4.999204772261338e-05, + "loss": 7.4192, + "step": 1351 + }, + { + "epoch": 0.008040726995908269, + "grad_norm": 3.1682589054107666, + "learning_rate": 4.999203593772409e-05, + "loss": 7.3433, + "step": 1352 + }, + { + "epoch": 0.008046674279189266, + "grad_norm": 4.693798065185547, + "learning_rate": 4.999202414411033e-05, + "loss": 7.1479, + "step": 1353 + }, + { + "epoch": 0.008052621562470264, + "grad_norm": 3.0599937438964844, + "learning_rate": 4.9992012341772114e-05, + "loss": 7.3137, + "step": 1354 + }, + { + "epoch": 0.008058568845751261, + "grad_norm": 2.9557557106018066, + "learning_rate": 4.999200053070945e-05, + "loss": 7.4466, + "step": 1355 + }, + { + "epoch": 0.008064516129032258, + "grad_norm": 2.5595791339874268, + "learning_rate": 4.999198871092233e-05, + "loss": 7.4716, + "step": 1356 + }, + { + "epoch": 0.008070463412313255, + "grad_norm": 2.919729709625244, + "learning_rate": 4.999197688241076e-05, + "loss": 7.0754, + "step": 1357 + }, + { + "epoch": 0.008076410695594253, + "grad_norm": 2.5880625247955322, + "learning_rate": 4.9991965045174763e-05, + "loss": 7.2794, + "step": 1358 + }, + { + "epoch": 0.00808235797887525, + "grad_norm": 2.9933066368103027, + "learning_rate": 4.999195319921432e-05, + "loss": 7.3547, + "step": 1359 + }, + { + "epoch": 0.008088305262156247, + "grad_norm": 5.097862243652344, + "learning_rate": 4.999194134452945e-05, + "loss": 7.1922, + "step": 1360 + }, + { + "epoch": 0.008094252545437244, + "grad_norm": 4.1795830726623535, + "learning_rate": 4.9991929481120146e-05, + "loss": 7.0437, + "step": 1361 + }, + { + "epoch": 0.008100199828718242, + "grad_norm": 3.292961835861206, + "learning_rate": 4.999191760898642e-05, + "loss": 6.8637, + "step": 1362 + }, + { + "epoch": 0.008106147111999239, + "grad_norm": 3.052610397338867, + "learning_rate": 4.999190572812828e-05, + "loss": 7.1675, + "step": 1363 + }, + { + "epoch": 0.008112094395280236, + "grad_norm": 2.975646734237671, + "learning_rate": 4.999189383854571e-05, + "loss": 7.1309, + "step": 1364 + }, + { + "epoch": 0.008118041678561233, + "grad_norm": 2.71195912361145, + "learning_rate": 4.999188194023874e-05, + "loss": 7.2247, + "step": 1365 + }, + { + "epoch": 0.008123988961842231, + "grad_norm": 2.751002311706543, + "learning_rate": 4.9991870033207354e-05, + "loss": 6.8553, + "step": 1366 + }, + { + "epoch": 0.008129936245123228, + "grad_norm": 3.4521234035491943, + "learning_rate": 4.999185811745157e-05, + "loss": 6.8373, + "step": 1367 + }, + { + "epoch": 0.008135883528404225, + "grad_norm": 3.054330348968506, + "learning_rate": 4.999184619297138e-05, + "loss": 6.6982, + "step": 1368 + }, + { + "epoch": 0.008141830811685222, + "grad_norm": 3.513794183731079, + "learning_rate": 4.99918342597668e-05, + "loss": 6.5567, + "step": 1369 + }, + { + "epoch": 0.00814777809496622, + "grad_norm": 3.681838274002075, + "learning_rate": 4.9991822317837836e-05, + "loss": 6.6335, + "step": 1370 + }, + { + "epoch": 0.008153725378247217, + "grad_norm": 4.144393444061279, + "learning_rate": 4.999181036718447e-05, + "loss": 6.5361, + "step": 1371 + }, + { + "epoch": 0.008159672661528214, + "grad_norm": 2.9771196842193604, + "learning_rate": 4.9991798407806736e-05, + "loss": 7.0085, + "step": 1372 + }, + { + "epoch": 0.00816561994480921, + "grad_norm": 3.114884376525879, + "learning_rate": 4.9991786439704615e-05, + "loss": 7.1498, + "step": 1373 + }, + { + "epoch": 0.008171567228090208, + "grad_norm": 2.76042103767395, + "learning_rate": 4.9991774462878115e-05, + "loss": 6.8462, + "step": 1374 + }, + { + "epoch": 0.008177514511371206, + "grad_norm": 3.257528066635132, + "learning_rate": 4.999176247732725e-05, + "loss": 6.4595, + "step": 1375 + }, + { + "epoch": 0.008183461794652203, + "grad_norm": 3.377774238586426, + "learning_rate": 4.999175048305202e-05, + "loss": 6.3131, + "step": 1376 + }, + { + "epoch": 0.0081894090779332, + "grad_norm": 3.029477834701538, + "learning_rate": 4.999173848005243e-05, + "loss": 6.7182, + "step": 1377 + }, + { + "epoch": 0.008195356361214197, + "grad_norm": 3.0353076457977295, + "learning_rate": 4.9991726468328476e-05, + "loss": 7.009, + "step": 1378 + }, + { + "epoch": 0.008201303644495195, + "grad_norm": 2.465014934539795, + "learning_rate": 4.999171444788017e-05, + "loss": 7.6277, + "step": 1379 + }, + { + "epoch": 0.008207250927776192, + "grad_norm": 3.025954484939575, + "learning_rate": 4.999170241870752e-05, + "loss": 7.2815, + "step": 1380 + }, + { + "epoch": 0.008213198211057189, + "grad_norm": 3.8414018154144287, + "learning_rate": 4.999169038081052e-05, + "loss": 7.2238, + "step": 1381 + }, + { + "epoch": 0.008219145494338186, + "grad_norm": 3.2927470207214355, + "learning_rate": 4.999167833418918e-05, + "loss": 7.1505, + "step": 1382 + }, + { + "epoch": 0.008225092777619184, + "grad_norm": 2.6132330894470215, + "learning_rate": 4.999166627884351e-05, + "loss": 7.2499, + "step": 1383 + }, + { + "epoch": 0.008231040060900181, + "grad_norm": 2.523366689682007, + "learning_rate": 4.9991654214773497e-05, + "loss": 6.9812, + "step": 1384 + }, + { + "epoch": 0.008236987344181178, + "grad_norm": 3.977471351623535, + "learning_rate": 4.9991642141979154e-05, + "loss": 7.3196, + "step": 1385 + }, + { + "epoch": 0.008242934627462175, + "grad_norm": 2.731952428817749, + "learning_rate": 4.99916300604605e-05, + "loss": 7.1014, + "step": 1386 + }, + { + "epoch": 0.008248881910743173, + "grad_norm": 2.6128756999969482, + "learning_rate": 4.999161797021752e-05, + "loss": 7.0235, + "step": 1387 + }, + { + "epoch": 0.00825482919402417, + "grad_norm": 2.263430595397949, + "learning_rate": 4.999160587125023e-05, + "loss": 7.0183, + "step": 1388 + }, + { + "epoch": 0.008260776477305167, + "grad_norm": 2.799994707107544, + "learning_rate": 4.9991593763558614e-05, + "loss": 6.9553, + "step": 1389 + }, + { + "epoch": 0.008266723760586164, + "grad_norm": 2.5443058013916016, + "learning_rate": 4.99915816471427e-05, + "loss": 7.2302, + "step": 1390 + }, + { + "epoch": 0.008272671043867162, + "grad_norm": 2.304185152053833, + "learning_rate": 4.999156952200248e-05, + "loss": 7.2589, + "step": 1391 + }, + { + "epoch": 0.008278618327148159, + "grad_norm": 2.1639649868011475, + "learning_rate": 4.999155738813797e-05, + "loss": 7.0067, + "step": 1392 + }, + { + "epoch": 0.008284565610429156, + "grad_norm": 2.276514768600464, + "learning_rate": 4.999154524554915e-05, + "loss": 7.2721, + "step": 1393 + }, + { + "epoch": 0.008290512893710153, + "grad_norm": 2.212200880050659, + "learning_rate": 4.9991533094236055e-05, + "loss": 7.1183, + "step": 1394 + }, + { + "epoch": 0.008296460176991151, + "grad_norm": 2.5289459228515625, + "learning_rate": 4.999152093419867e-05, + "loss": 7.0289, + "step": 1395 + }, + { + "epoch": 0.008302407460272148, + "grad_norm": 2.5915603637695312, + "learning_rate": 4.999150876543699e-05, + "loss": 6.7497, + "step": 1396 + }, + { + "epoch": 0.008308354743553145, + "grad_norm": 2.680513858795166, + "learning_rate": 4.999149658795105e-05, + "loss": 6.7139, + "step": 1397 + }, + { + "epoch": 0.008314302026834142, + "grad_norm": 2.65744948387146, + "learning_rate": 4.999148440174083e-05, + "loss": 6.6151, + "step": 1398 + }, + { + "epoch": 0.00832024931011514, + "grad_norm": 3.8028745651245117, + "learning_rate": 4.9991472206806334e-05, + "loss": 7.1992, + "step": 1399 + }, + { + "epoch": 0.008326196593396137, + "grad_norm": 2.8436119556427, + "learning_rate": 4.999146000314758e-05, + "loss": 7.165, + "step": 1400 + }, + { + "epoch": 0.008332143876677134, + "grad_norm": 2.6658496856689453, + "learning_rate": 4.999144779076457e-05, + "loss": 7.5945, + "step": 1401 + }, + { + "epoch": 0.00833809115995813, + "grad_norm": 2.909703016281128, + "learning_rate": 4.99914355696573e-05, + "loss": 7.6378, + "step": 1402 + }, + { + "epoch": 0.00834403844323913, + "grad_norm": 2.5827598571777344, + "learning_rate": 4.9991423339825776e-05, + "loss": 7.5441, + "step": 1403 + }, + { + "epoch": 0.008349985726520126, + "grad_norm": 3.0283706188201904, + "learning_rate": 4.999141110127e-05, + "loss": 7.1162, + "step": 1404 + }, + { + "epoch": 0.008355933009801123, + "grad_norm": 3.11690354347229, + "learning_rate": 4.999139885398999e-05, + "loss": 6.5123, + "step": 1405 + }, + { + "epoch": 0.00836188029308212, + "grad_norm": 2.6188690662384033, + "learning_rate": 4.999138659798574e-05, + "loss": 7.6384, + "step": 1406 + }, + { + "epoch": 0.008367827576363117, + "grad_norm": 3.4412481784820557, + "learning_rate": 4.999137433325725e-05, + "loss": 7.4067, + "step": 1407 + }, + { + "epoch": 0.008373774859644115, + "grad_norm": 3.1690893173217773, + "learning_rate": 4.999136205980454e-05, + "loss": 7.3937, + "step": 1408 + }, + { + "epoch": 0.008379722142925112, + "grad_norm": 2.1589877605438232, + "learning_rate": 4.999134977762759e-05, + "loss": 7.454, + "step": 1409 + }, + { + "epoch": 0.008385669426206109, + "grad_norm": 2.485901117324829, + "learning_rate": 4.999133748672642e-05, + "loss": 7.3421, + "step": 1410 + }, + { + "epoch": 0.008391616709487106, + "grad_norm": 2.543128252029419, + "learning_rate": 4.999132518710104e-05, + "loss": 7.3162, + "step": 1411 + }, + { + "epoch": 0.008397563992768104, + "grad_norm": 2.8048489093780518, + "learning_rate": 4.999131287875144e-05, + "loss": 7.297, + "step": 1412 + }, + { + "epoch": 0.008403511276049101, + "grad_norm": 3.0391035079956055, + "learning_rate": 4.9991300561677634e-05, + "loss": 7.2409, + "step": 1413 + }, + { + "epoch": 0.008409458559330098, + "grad_norm": 2.3196053504943848, + "learning_rate": 4.999128823587962e-05, + "loss": 7.1358, + "step": 1414 + }, + { + "epoch": 0.008415405842611095, + "grad_norm": 3.1876983642578125, + "learning_rate": 4.999127590135741e-05, + "loss": 7.1501, + "step": 1415 + }, + { + "epoch": 0.008421353125892093, + "grad_norm": 3.6832327842712402, + "learning_rate": 4.9991263558111e-05, + "loss": 7.181, + "step": 1416 + }, + { + "epoch": 0.00842730040917309, + "grad_norm": 3.7491936683654785, + "learning_rate": 4.99912512061404e-05, + "loss": 6.9669, + "step": 1417 + }, + { + "epoch": 0.008433247692454087, + "grad_norm": 3.1583478450775146, + "learning_rate": 4.9991238845445615e-05, + "loss": 7.2155, + "step": 1418 + }, + { + "epoch": 0.008439194975735084, + "grad_norm": 3.11611008644104, + "learning_rate": 4.999122647602664e-05, + "loss": 7.164, + "step": 1419 + }, + { + "epoch": 0.008445142259016082, + "grad_norm": 6.127118110656738, + "learning_rate": 4.9991214097883495e-05, + "loss": 7.232, + "step": 1420 + }, + { + "epoch": 0.008451089542297079, + "grad_norm": 4.736495494842529, + "learning_rate": 4.9991201711016166e-05, + "loss": 7.3685, + "step": 1421 + }, + { + "epoch": 0.008457036825578076, + "grad_norm": 2.9656684398651123, + "learning_rate": 4.999118931542467e-05, + "loss": 7.2658, + "step": 1422 + }, + { + "epoch": 0.008462984108859073, + "grad_norm": 2.5959243774414062, + "learning_rate": 4.999117691110901e-05, + "loss": 7.0908, + "step": 1423 + }, + { + "epoch": 0.008468931392140071, + "grad_norm": 4.546379089355469, + "learning_rate": 4.999116449806919e-05, + "loss": 7.1343, + "step": 1424 + }, + { + "epoch": 0.008474878675421068, + "grad_norm": 3.6856796741485596, + "learning_rate": 4.9991152076305206e-05, + "loss": 6.9205, + "step": 1425 + }, + { + "epoch": 0.008480825958702065, + "grad_norm": 3.293973922729492, + "learning_rate": 4.9991139645817075e-05, + "loss": 6.9954, + "step": 1426 + }, + { + "epoch": 0.008486773241983062, + "grad_norm": 3.2511162757873535, + "learning_rate": 4.999112720660479e-05, + "loss": 6.7661, + "step": 1427 + }, + { + "epoch": 0.00849272052526406, + "grad_norm": 3.990840196609497, + "learning_rate": 4.9991114758668364e-05, + "loss": 6.7402, + "step": 1428 + }, + { + "epoch": 0.008498667808545057, + "grad_norm": 3.306809186935425, + "learning_rate": 4.9991102302007804e-05, + "loss": 6.6801, + "step": 1429 + }, + { + "epoch": 0.008504615091826054, + "grad_norm": 5.208675384521484, + "learning_rate": 4.99910898366231e-05, + "loss": 7.0128, + "step": 1430 + }, + { + "epoch": 0.00851056237510705, + "grad_norm": 4.131346225738525, + "learning_rate": 4.9991077362514266e-05, + "loss": 7.0992, + "step": 1431 + }, + { + "epoch": 0.00851650965838805, + "grad_norm": 2.60927152633667, + "learning_rate": 4.99910648796813e-05, + "loss": 7.2731, + "step": 1432 + }, + { + "epoch": 0.008522456941669046, + "grad_norm": 5.654631614685059, + "learning_rate": 4.9991052388124224e-05, + "loss": 6.6105, + "step": 1433 + }, + { + "epoch": 0.008528404224950043, + "grad_norm": 6.108455657958984, + "learning_rate": 4.9991039887843025e-05, + "loss": 6.3548, + "step": 1434 + }, + { + "epoch": 0.00853435150823104, + "grad_norm": 3.758371591567993, + "learning_rate": 4.9991027378837705e-05, + "loss": 6.6171, + "step": 1435 + }, + { + "epoch": 0.008540298791512036, + "grad_norm": 2.1995320320129395, + "learning_rate": 4.9991014861108285e-05, + "loss": 6.5987, + "step": 1436 + }, + { + "epoch": 0.008546246074793035, + "grad_norm": 2.3778254985809326, + "learning_rate": 4.999100233465476e-05, + "loss": 6.8067, + "step": 1437 + }, + { + "epoch": 0.008552193358074032, + "grad_norm": 2.521928310394287, + "learning_rate": 4.999098979947713e-05, + "loss": 6.7756, + "step": 1438 + }, + { + "epoch": 0.008558140641355029, + "grad_norm": 2.109605073928833, + "learning_rate": 4.99909772555754e-05, + "loss": 6.7091, + "step": 1439 + }, + { + "epoch": 0.008564087924636025, + "grad_norm": 2.55838680267334, + "learning_rate": 4.9990964702949585e-05, + "loss": 6.8989, + "step": 1440 + }, + { + "epoch": 0.008570035207917024, + "grad_norm": 2.4499685764312744, + "learning_rate": 4.9990952141599675e-05, + "loss": 6.6241, + "step": 1441 + }, + { + "epoch": 0.00857598249119802, + "grad_norm": 2.265371322631836, + "learning_rate": 4.9990939571525685e-05, + "loss": 7.6681, + "step": 1442 + }, + { + "epoch": 0.008581929774479018, + "grad_norm": 2.4496965408325195, + "learning_rate": 4.999092699272762e-05, + "loss": 6.8177, + "step": 1443 + }, + { + "epoch": 0.008587877057760014, + "grad_norm": 2.5555005073547363, + "learning_rate": 4.999091440520548e-05, + "loss": 6.6402, + "step": 1444 + }, + { + "epoch": 0.008593824341041013, + "grad_norm": 2.042592763900757, + "learning_rate": 4.999090180895927e-05, + "loss": 6.6114, + "step": 1445 + }, + { + "epoch": 0.00859977162432201, + "grad_norm": 2.3100671768188477, + "learning_rate": 4.9990889203988986e-05, + "loss": 6.712, + "step": 1446 + }, + { + "epoch": 0.008605718907603007, + "grad_norm": 2.7600841522216797, + "learning_rate": 4.999087659029465e-05, + "loss": 6.6531, + "step": 1447 + }, + { + "epoch": 0.008611666190884004, + "grad_norm": 3.292684316635132, + "learning_rate": 4.999086396787625e-05, + "loss": 6.9896, + "step": 1448 + }, + { + "epoch": 0.008617613474165002, + "grad_norm": 2.7579386234283447, + "learning_rate": 4.999085133673381e-05, + "loss": 7.1559, + "step": 1449 + }, + { + "epoch": 0.008623560757445999, + "grad_norm": 2.7898707389831543, + "learning_rate": 4.999083869686731e-05, + "loss": 6.9861, + "step": 1450 + }, + { + "epoch": 0.008629508040726996, + "grad_norm": 3.439809799194336, + "learning_rate": 4.999082604827677e-05, + "loss": 6.759, + "step": 1451 + }, + { + "epoch": 0.008635455324007993, + "grad_norm": 2.924859046936035, + "learning_rate": 4.999081339096219e-05, + "loss": 6.5438, + "step": 1452 + }, + { + "epoch": 0.008641402607288991, + "grad_norm": 3.363886594772339, + "learning_rate": 4.999080072492358e-05, + "loss": 7.0477, + "step": 1453 + }, + { + "epoch": 0.008647349890569988, + "grad_norm": 2.924988031387329, + "learning_rate": 4.999078805016093e-05, + "loss": 6.9228, + "step": 1454 + }, + { + "epoch": 0.008653297173850985, + "grad_norm": 3.2283847332000732, + "learning_rate": 4.999077536667426e-05, + "loss": 6.8763, + "step": 1455 + }, + { + "epoch": 0.008659244457131982, + "grad_norm": 2.635744094848633, + "learning_rate": 4.999076267446357e-05, + "loss": 6.6438, + "step": 1456 + }, + { + "epoch": 0.00866519174041298, + "grad_norm": 2.829801559448242, + "learning_rate": 4.9990749973528864e-05, + "loss": 6.9466, + "step": 1457 + }, + { + "epoch": 0.008671139023693977, + "grad_norm": 3.3631057739257812, + "learning_rate": 4.999073726387014e-05, + "loss": 7.2652, + "step": 1458 + }, + { + "epoch": 0.008677086306974974, + "grad_norm": 3.9970719814300537, + "learning_rate": 4.999072454548741e-05, + "loss": 7.053, + "step": 1459 + }, + { + "epoch": 0.00868303359025597, + "grad_norm": 3.322787046432495, + "learning_rate": 4.9990711818380674e-05, + "loss": 7.0272, + "step": 1460 + }, + { + "epoch": 0.008688980873536969, + "grad_norm": 2.7370798587799072, + "learning_rate": 4.999069908254995e-05, + "loss": 6.8545, + "step": 1461 + }, + { + "epoch": 0.008694928156817966, + "grad_norm": 2.845191240310669, + "learning_rate": 4.999068633799522e-05, + "loss": 6.9393, + "step": 1462 + }, + { + "epoch": 0.008700875440098963, + "grad_norm": 3.064960241317749, + "learning_rate": 4.99906735847165e-05, + "loss": 6.7734, + "step": 1463 + }, + { + "epoch": 0.00870682272337996, + "grad_norm": 7.113090515136719, + "learning_rate": 4.99906608227138e-05, + "loss": 7.0532, + "step": 1464 + }, + { + "epoch": 0.008712770006660958, + "grad_norm": 5.90821647644043, + "learning_rate": 4.999064805198711e-05, + "loss": 7.1494, + "step": 1465 + }, + { + "epoch": 0.008718717289941955, + "grad_norm": 3.9366238117218018, + "learning_rate": 4.9990635272536454e-05, + "loss": 7.623, + "step": 1466 + }, + { + "epoch": 0.008724664573222952, + "grad_norm": 3.1239330768585205, + "learning_rate": 4.9990622484361814e-05, + "loss": 7.4938, + "step": 1467 + }, + { + "epoch": 0.008730611856503949, + "grad_norm": 2.6688928604125977, + "learning_rate": 4.9990609687463216e-05, + "loss": 7.3445, + "step": 1468 + }, + { + "epoch": 0.008736559139784945, + "grad_norm": 3.047154664993286, + "learning_rate": 4.9990596881840646e-05, + "loss": 7.158, + "step": 1469 + }, + { + "epoch": 0.008742506423065944, + "grad_norm": 2.5230467319488525, + "learning_rate": 4.999058406749412e-05, + "loss": 7.1368, + "step": 1470 + }, + { + "epoch": 0.00874845370634694, + "grad_norm": 2.729705333709717, + "learning_rate": 4.999057124442364e-05, + "loss": 7.0144, + "step": 1471 + }, + { + "epoch": 0.008754400989627938, + "grad_norm": 2.5796756744384766, + "learning_rate": 4.999055841262921e-05, + "loss": 7.2157, + "step": 1472 + }, + { + "epoch": 0.008760348272908934, + "grad_norm": 3.458691358566284, + "learning_rate": 4.999054557211084e-05, + "loss": 6.7631, + "step": 1473 + }, + { + "epoch": 0.008766295556189933, + "grad_norm": 2.7262747287750244, + "learning_rate": 4.999053272286851e-05, + "loss": 6.9784, + "step": 1474 + }, + { + "epoch": 0.00877224283947093, + "grad_norm": 2.6003808975219727, + "learning_rate": 4.9990519864902267e-05, + "loss": 7.1369, + "step": 1475 + }, + { + "epoch": 0.008778190122751927, + "grad_norm": 3.4032137393951416, + "learning_rate": 4.999050699821207e-05, + "loss": 6.9569, + "step": 1476 + }, + { + "epoch": 0.008784137406032923, + "grad_norm": 4.099828243255615, + "learning_rate": 4.9990494122797957e-05, + "loss": 6.9977, + "step": 1477 + }, + { + "epoch": 0.008790084689313922, + "grad_norm": 3.1837944984436035, + "learning_rate": 4.999048123865992e-05, + "loss": 7.1331, + "step": 1478 + }, + { + "epoch": 0.008796031972594919, + "grad_norm": 2.618847131729126, + "learning_rate": 4.999046834579796e-05, + "loss": 7.0043, + "step": 1479 + }, + { + "epoch": 0.008801979255875916, + "grad_norm": 3.0132501125335693, + "learning_rate": 4.999045544421209e-05, + "loss": 6.7836, + "step": 1480 + }, + { + "epoch": 0.008807926539156912, + "grad_norm": 2.4608371257781982, + "learning_rate": 4.999044253390231e-05, + "loss": 7.0721, + "step": 1481 + }, + { + "epoch": 0.008813873822437911, + "grad_norm": 3.280649423599243, + "learning_rate": 4.999042961486863e-05, + "loss": 7.959, + "step": 1482 + }, + { + "epoch": 0.008819821105718908, + "grad_norm": 2.7038395404815674, + "learning_rate": 4.999041668711104e-05, + "loss": 7.1256, + "step": 1483 + }, + { + "epoch": 0.008825768388999905, + "grad_norm": 2.1451892852783203, + "learning_rate": 4.9990403750629556e-05, + "loss": 7.2219, + "step": 1484 + }, + { + "epoch": 0.008831715672280901, + "grad_norm": 2.3731601238250732, + "learning_rate": 4.999039080542418e-05, + "loss": 7.2023, + "step": 1485 + }, + { + "epoch": 0.0088376629555619, + "grad_norm": 2.444089651107788, + "learning_rate": 4.999037785149492e-05, + "loss": 7.0988, + "step": 1486 + }, + { + "epoch": 0.008843610238842897, + "grad_norm": 2.644712448120117, + "learning_rate": 4.999036488884177e-05, + "loss": 7.1916, + "step": 1487 + }, + { + "epoch": 0.008849557522123894, + "grad_norm": 5.477145671844482, + "learning_rate": 4.999035191746475e-05, + "loss": 6.7256, + "step": 1488 + }, + { + "epoch": 0.00885550480540489, + "grad_norm": 2.2691709995269775, + "learning_rate": 4.999033893736386e-05, + "loss": 7.2505, + "step": 1489 + }, + { + "epoch": 0.008861452088685889, + "grad_norm": 2.5880343914031982, + "learning_rate": 4.999032594853909e-05, + "loss": 6.9549, + "step": 1490 + }, + { + "epoch": 0.008867399371966886, + "grad_norm": 2.2748520374298096, + "learning_rate": 4.999031295099046e-05, + "loss": 6.8269, + "step": 1491 + }, + { + "epoch": 0.008873346655247883, + "grad_norm": 2.262706995010376, + "learning_rate": 4.999029994471797e-05, + "loss": 6.8876, + "step": 1492 + }, + { + "epoch": 0.00887929393852888, + "grad_norm": 2.264256238937378, + "learning_rate": 4.999028692972162e-05, + "loss": 7.1545, + "step": 1493 + }, + { + "epoch": 0.008885241221809878, + "grad_norm": 2.489259719848633, + "learning_rate": 4.9990273906001424e-05, + "loss": 7.194, + "step": 1494 + }, + { + "epoch": 0.008891188505090875, + "grad_norm": 2.7545981407165527, + "learning_rate": 4.999026087355738e-05, + "loss": 7.0148, + "step": 1495 + }, + { + "epoch": 0.008897135788371872, + "grad_norm": 2.6869328022003174, + "learning_rate": 4.999024783238949e-05, + "loss": 7.2535, + "step": 1496 + }, + { + "epoch": 0.008903083071652869, + "grad_norm": 2.5216503143310547, + "learning_rate": 4.999023478249777e-05, + "loss": 6.4351, + "step": 1497 + }, + { + "epoch": 0.008909030354933865, + "grad_norm": 2.5090575218200684, + "learning_rate": 4.9990221723882216e-05, + "loss": 7.3068, + "step": 1498 + }, + { + "epoch": 0.008914977638214864, + "grad_norm": 2.5026490688323975, + "learning_rate": 4.999020865654283e-05, + "loss": 7.1274, + "step": 1499 + }, + { + "epoch": 0.00892092492149586, + "grad_norm": 2.8030898571014404, + "learning_rate": 4.999019558047963e-05, + "loss": 7.0016, + "step": 1500 + }, + { + "epoch": 0.008926872204776858, + "grad_norm": 2.533383846282959, + "learning_rate": 4.99901824956926e-05, + "loss": 6.8991, + "step": 1501 + }, + { + "epoch": 0.008932819488057854, + "grad_norm": 2.5584118366241455, + "learning_rate": 4.999016940218175e-05, + "loss": 6.9237, + "step": 1502 + }, + { + "epoch": 0.008938766771338853, + "grad_norm": 2.778592586517334, + "learning_rate": 4.99901562999471e-05, + "loss": 7.0941, + "step": 1503 + }, + { + "epoch": 0.00894471405461985, + "grad_norm": 4.023860931396484, + "learning_rate": 4.999014318898865e-05, + "loss": 6.5188, + "step": 1504 + }, + { + "epoch": 0.008950661337900847, + "grad_norm": 3.018118143081665, + "learning_rate": 4.999013006930639e-05, + "loss": 7.0557, + "step": 1505 + }, + { + "epoch": 0.008956608621181843, + "grad_norm": 2.802061080932617, + "learning_rate": 4.999011694090033e-05, + "loss": 7.2645, + "step": 1506 + }, + { + "epoch": 0.008962555904462842, + "grad_norm": 2.3782076835632324, + "learning_rate": 4.999010380377049e-05, + "loss": 7.3707, + "step": 1507 + }, + { + "epoch": 0.008968503187743839, + "grad_norm": 2.451878309249878, + "learning_rate": 4.999009065791686e-05, + "loss": 7.2783, + "step": 1508 + }, + { + "epoch": 0.008974450471024836, + "grad_norm": 3.85514235496521, + "learning_rate": 4.999007750333945e-05, + "loss": 6.3543, + "step": 1509 + }, + { + "epoch": 0.008980397754305832, + "grad_norm": 2.617177963256836, + "learning_rate": 4.999006434003825e-05, + "loss": 7.0175, + "step": 1510 + }, + { + "epoch": 0.008986345037586831, + "grad_norm": 2.6909587383270264, + "learning_rate": 4.999005116801329e-05, + "loss": 7.3282, + "step": 1511 + }, + { + "epoch": 0.008992292320867828, + "grad_norm": 2.332165241241455, + "learning_rate": 4.9990037987264546e-05, + "loss": 7.0993, + "step": 1512 + }, + { + "epoch": 0.008998239604148825, + "grad_norm": 2.5398497581481934, + "learning_rate": 4.9990024797792055e-05, + "loss": 7.2867, + "step": 1513 + }, + { + "epoch": 0.009004186887429821, + "grad_norm": 2.432264566421509, + "learning_rate": 4.9990011599595796e-05, + "loss": 7.1619, + "step": 1514 + }, + { + "epoch": 0.00901013417071082, + "grad_norm": 2.2937278747558594, + "learning_rate": 4.998999839267578e-05, + "loss": 7.1138, + "step": 1515 + }, + { + "epoch": 0.009016081453991817, + "grad_norm": 2.3305680751800537, + "learning_rate": 4.998998517703202e-05, + "loss": 7.0569, + "step": 1516 + }, + { + "epoch": 0.009022028737272814, + "grad_norm": 3.0785884857177734, + "learning_rate": 4.998997195266451e-05, + "loss": 7.0922, + "step": 1517 + }, + { + "epoch": 0.00902797602055381, + "grad_norm": 2.354283571243286, + "learning_rate": 4.998995871957326e-05, + "loss": 7.0024, + "step": 1518 + }, + { + "epoch": 0.009033923303834809, + "grad_norm": 2.488194465637207, + "learning_rate": 4.998994547775827e-05, + "loss": 7.0045, + "step": 1519 + }, + { + "epoch": 0.009039870587115806, + "grad_norm": 2.6196579933166504, + "learning_rate": 4.998993222721956e-05, + "loss": 6.9416, + "step": 1520 + }, + { + "epoch": 0.009045817870396803, + "grad_norm": 2.6524155139923096, + "learning_rate": 4.998991896795711e-05, + "loss": 6.9562, + "step": 1521 + }, + { + "epoch": 0.0090517651536778, + "grad_norm": 3.308661460876465, + "learning_rate": 4.998990569997094e-05, + "loss": 6.8602, + "step": 1522 + }, + { + "epoch": 0.009057712436958798, + "grad_norm": 2.7995994091033936, + "learning_rate": 4.9989892423261055e-05, + "loss": 7.7049, + "step": 1523 + }, + { + "epoch": 0.009063659720239795, + "grad_norm": 2.547189235687256, + "learning_rate": 4.9989879137827456e-05, + "loss": 7.0254, + "step": 1524 + }, + { + "epoch": 0.009069607003520792, + "grad_norm": 2.796393871307373, + "learning_rate": 4.998986584367015e-05, + "loss": 7.0124, + "step": 1525 + }, + { + "epoch": 0.009075554286801788, + "grad_norm": 2.9441823959350586, + "learning_rate": 4.9989852540789136e-05, + "loss": 7.0174, + "step": 1526 + }, + { + "epoch": 0.009081501570082787, + "grad_norm": 2.509150743484497, + "learning_rate": 4.998983922918443e-05, + "loss": 6.9405, + "step": 1527 + }, + { + "epoch": 0.009087448853363784, + "grad_norm": 2.3686184883117676, + "learning_rate": 4.998982590885603e-05, + "loss": 6.794, + "step": 1528 + }, + { + "epoch": 0.00909339613664478, + "grad_norm": 2.937530755996704, + "learning_rate": 4.998981257980393e-05, + "loss": 6.9716, + "step": 1529 + }, + { + "epoch": 0.009099343419925777, + "grad_norm": 2.493178606033325, + "learning_rate": 4.998979924202814e-05, + "loss": 6.5986, + "step": 1530 + }, + { + "epoch": 0.009105290703206774, + "grad_norm": 2.071356773376465, + "learning_rate": 4.9989785895528686e-05, + "loss": 6.536, + "step": 1531 + }, + { + "epoch": 0.009111237986487773, + "grad_norm": 1.9372920989990234, + "learning_rate": 4.998977254030554e-05, + "loss": 6.4036, + "step": 1532 + }, + { + "epoch": 0.00911718526976877, + "grad_norm": 2.3329098224639893, + "learning_rate": 4.998975917635873e-05, + "loss": 6.4861, + "step": 1533 + }, + { + "epoch": 0.009123132553049767, + "grad_norm": 2.9681191444396973, + "learning_rate": 4.998974580368826e-05, + "loss": 6.939, + "step": 1534 + }, + { + "epoch": 0.009129079836330763, + "grad_norm": 2.5993690490722656, + "learning_rate": 4.9989732422294125e-05, + "loss": 7.0809, + "step": 1535 + }, + { + "epoch": 0.009135027119611762, + "grad_norm": 2.827244997024536, + "learning_rate": 4.998971903217633e-05, + "loss": 7.597, + "step": 1536 + }, + { + "epoch": 0.009140974402892759, + "grad_norm": 2.712247848510742, + "learning_rate": 4.9989705633334884e-05, + "loss": 7.3695, + "step": 1537 + }, + { + "epoch": 0.009146921686173756, + "grad_norm": 1.7997468709945679, + "learning_rate": 4.998969222576978e-05, + "loss": 7.6497, + "step": 1538 + }, + { + "epoch": 0.009152868969454752, + "grad_norm": 2.234931230545044, + "learning_rate": 4.998967880948104e-05, + "loss": 7.1636, + "step": 1539 + }, + { + "epoch": 0.009158816252735751, + "grad_norm": 2.150766611099243, + "learning_rate": 4.9989665384468666e-05, + "loss": 6.8621, + "step": 1540 + }, + { + "epoch": 0.009164763536016748, + "grad_norm": 2.9628021717071533, + "learning_rate": 4.998965195073265e-05, + "loss": 6.5059, + "step": 1541 + }, + { + "epoch": 0.009170710819297745, + "grad_norm": 2.720155715942383, + "learning_rate": 4.998963850827301e-05, + "loss": 7.0129, + "step": 1542 + }, + { + "epoch": 0.009176658102578741, + "grad_norm": 2.994684934616089, + "learning_rate": 4.9989625057089744e-05, + "loss": 7.3621, + "step": 1543 + }, + { + "epoch": 0.00918260538585974, + "grad_norm": 2.5991618633270264, + "learning_rate": 4.998961159718286e-05, + "loss": 6.7278, + "step": 1544 + }, + { + "epoch": 0.009188552669140737, + "grad_norm": 2.406353712081909, + "learning_rate": 4.9989598128552355e-05, + "loss": 7.5987, + "step": 1545 + }, + { + "epoch": 0.009194499952421734, + "grad_norm": 3.1308467388153076, + "learning_rate": 4.998958465119824e-05, + "loss": 7.1947, + "step": 1546 + }, + { + "epoch": 0.00920044723570273, + "grad_norm": 2.5381908416748047, + "learning_rate": 4.998957116512053e-05, + "loss": 6.8415, + "step": 1547 + }, + { + "epoch": 0.009206394518983729, + "grad_norm": 2.666410446166992, + "learning_rate": 4.998955767031921e-05, + "loss": 6.9052, + "step": 1548 + }, + { + "epoch": 0.009212341802264726, + "grad_norm": 2.156036138534546, + "learning_rate": 4.9989544166794286e-05, + "loss": 7.6604, + "step": 1549 + }, + { + "epoch": 0.009218289085545723, + "grad_norm": 2.620114803314209, + "learning_rate": 4.998953065454578e-05, + "loss": 6.5475, + "step": 1550 + }, + { + "epoch": 0.00922423636882672, + "grad_norm": 3.2780802249908447, + "learning_rate": 4.9989517133573694e-05, + "loss": 7.0572, + "step": 1551 + }, + { + "epoch": 0.009230183652107718, + "grad_norm": 3.6108100414276123, + "learning_rate": 4.998950360387802e-05, + "loss": 7.0149, + "step": 1552 + }, + { + "epoch": 0.009236130935388715, + "grad_norm": 3.4336259365081787, + "learning_rate": 4.998949006545876e-05, + "loss": 7.2436, + "step": 1553 + }, + { + "epoch": 0.009242078218669712, + "grad_norm": 3.271630048751831, + "learning_rate": 4.9989476518315934e-05, + "loss": 7.3807, + "step": 1554 + }, + { + "epoch": 0.009248025501950708, + "grad_norm": 3.0718438625335693, + "learning_rate": 4.998946296244954e-05, + "loss": 7.2313, + "step": 1555 + }, + { + "epoch": 0.009253972785231707, + "grad_norm": 2.2010579109191895, + "learning_rate": 4.9989449397859575e-05, + "loss": 7.4269, + "step": 1556 + }, + { + "epoch": 0.009259920068512704, + "grad_norm": 2.9805495738983154, + "learning_rate": 4.998943582454607e-05, + "loss": 7.2107, + "step": 1557 + }, + { + "epoch": 0.0092658673517937, + "grad_norm": 2.8313159942626953, + "learning_rate": 4.9989422242508995e-05, + "loss": 7.0453, + "step": 1558 + }, + { + "epoch": 0.009271814635074697, + "grad_norm": 2.7660701274871826, + "learning_rate": 4.998940865174837e-05, + "loss": 7.2205, + "step": 1559 + }, + { + "epoch": 0.009277761918355694, + "grad_norm": 3.808122396469116, + "learning_rate": 4.998939505226421e-05, + "loss": 6.9966, + "step": 1560 + }, + { + "epoch": 0.009283709201636693, + "grad_norm": 3.188976526260376, + "learning_rate": 4.99893814440565e-05, + "loss": 7.0049, + "step": 1561 + }, + { + "epoch": 0.00928965648491769, + "grad_norm": 2.5491533279418945, + "learning_rate": 4.998936782712526e-05, + "loss": 7.0451, + "step": 1562 + }, + { + "epoch": 0.009295603768198686, + "grad_norm": 3.4607698917388916, + "learning_rate": 4.99893542014705e-05, + "loss": 7.0304, + "step": 1563 + }, + { + "epoch": 0.009301551051479683, + "grad_norm": 3.4761910438537598, + "learning_rate": 4.99893405670922e-05, + "loss": 6.9787, + "step": 1564 + }, + { + "epoch": 0.009307498334760682, + "grad_norm": 3.15938138961792, + "learning_rate": 4.998932692399039e-05, + "loss": 7.0203, + "step": 1565 + }, + { + "epoch": 0.009313445618041679, + "grad_norm": 2.600304126739502, + "learning_rate": 4.9989313272165064e-05, + "loss": 7.0782, + "step": 1566 + }, + { + "epoch": 0.009319392901322675, + "grad_norm": 2.54158616065979, + "learning_rate": 4.9989299611616216e-05, + "loss": 6.8354, + "step": 1567 + }, + { + "epoch": 0.009325340184603672, + "grad_norm": 3.4649429321289062, + "learning_rate": 4.9989285942343864e-05, + "loss": 6.8238, + "step": 1568 + }, + { + "epoch": 0.00933128746788467, + "grad_norm": 2.522388458251953, + "learning_rate": 4.998927226434802e-05, + "loss": 6.9544, + "step": 1569 + }, + { + "epoch": 0.009337234751165668, + "grad_norm": 4.074129581451416, + "learning_rate": 4.9989258577628675e-05, + "loss": 6.7229, + "step": 1570 + }, + { + "epoch": 0.009343182034446664, + "grad_norm": 3.395894765853882, + "learning_rate": 4.998924488218584e-05, + "loss": 7.1372, + "step": 1571 + }, + { + "epoch": 0.009349129317727661, + "grad_norm": 2.9850378036499023, + "learning_rate": 4.9989231178019516e-05, + "loss": 6.8966, + "step": 1572 + }, + { + "epoch": 0.00935507660100866, + "grad_norm": 3.1391544342041016, + "learning_rate": 4.9989217465129704e-05, + "loss": 6.6744, + "step": 1573 + }, + { + "epoch": 0.009361023884289657, + "grad_norm": 3.8727803230285645, + "learning_rate": 4.9989203743516414e-05, + "loss": 6.9359, + "step": 1574 + }, + { + "epoch": 0.009366971167570654, + "grad_norm": 3.466169595718384, + "learning_rate": 4.998919001317966e-05, + "loss": 6.979, + "step": 1575 + }, + { + "epoch": 0.00937291845085165, + "grad_norm": 3.3481826782226562, + "learning_rate": 4.998917627411943e-05, + "loss": 6.7749, + "step": 1576 + }, + { + "epoch": 0.009378865734132649, + "grad_norm": 2.425971031188965, + "learning_rate": 4.9989162526335745e-05, + "loss": 7.0127, + "step": 1577 + }, + { + "epoch": 0.009384813017413646, + "grad_norm": 2.8379313945770264, + "learning_rate": 4.9989148769828595e-05, + "loss": 6.5782, + "step": 1578 + }, + { + "epoch": 0.009390760300694643, + "grad_norm": 3.0456466674804688, + "learning_rate": 4.9989135004597994e-05, + "loss": 6.9832, + "step": 1579 + }, + { + "epoch": 0.00939670758397564, + "grad_norm": 2.690138101577759, + "learning_rate": 4.9989121230643944e-05, + "loss": 7.0079, + "step": 1580 + }, + { + "epoch": 0.009402654867256638, + "grad_norm": 3.683105945587158, + "learning_rate": 4.9989107447966444e-05, + "loss": 7.2734, + "step": 1581 + }, + { + "epoch": 0.009408602150537635, + "grad_norm": 2.3310985565185547, + "learning_rate": 4.9989093656565513e-05, + "loss": 7.2388, + "step": 1582 + }, + { + "epoch": 0.009414549433818632, + "grad_norm": 2.353322982788086, + "learning_rate": 4.998907985644115e-05, + "loss": 7.0612, + "step": 1583 + }, + { + "epoch": 0.009420496717099628, + "grad_norm": 2.8458571434020996, + "learning_rate": 4.9989066047593344e-05, + "loss": 7.3093, + "step": 1584 + }, + { + "epoch": 0.009426444000380627, + "grad_norm": 2.3322811126708984, + "learning_rate": 4.9989052230022125e-05, + "loss": 6.983, + "step": 1585 + }, + { + "epoch": 0.009432391283661624, + "grad_norm": 2.7431764602661133, + "learning_rate": 4.998903840372748e-05, + "loss": 6.9694, + "step": 1586 + }, + { + "epoch": 0.00943833856694262, + "grad_norm": 2.7704508304595947, + "learning_rate": 4.998902456870942e-05, + "loss": 6.7727, + "step": 1587 + }, + { + "epoch": 0.009444285850223617, + "grad_norm": 2.4920814037323, + "learning_rate": 4.998901072496796e-05, + "loss": 7.0612, + "step": 1588 + }, + { + "epoch": 0.009450233133504616, + "grad_norm": 2.5911498069763184, + "learning_rate": 4.998899687250308e-05, + "loss": 6.8774, + "step": 1589 + }, + { + "epoch": 0.009456180416785613, + "grad_norm": 2.7269680500030518, + "learning_rate": 4.998898301131481e-05, + "loss": 7.0782, + "step": 1590 + }, + { + "epoch": 0.00946212770006661, + "grad_norm": 2.9707436561584473, + "learning_rate": 4.998896914140314e-05, + "loss": 7.307, + "step": 1591 + }, + { + "epoch": 0.009468074983347606, + "grad_norm": 3.064683675765991, + "learning_rate": 4.998895526276808e-05, + "loss": 7.3708, + "step": 1592 + }, + { + "epoch": 0.009474022266628603, + "grad_norm": 2.4465317726135254, + "learning_rate": 4.998894137540963e-05, + "loss": 7.0085, + "step": 1593 + }, + { + "epoch": 0.009479969549909602, + "grad_norm": 3.3061211109161377, + "learning_rate": 4.99889274793278e-05, + "loss": 6.8353, + "step": 1594 + }, + { + "epoch": 0.009485916833190599, + "grad_norm": 3.283397912979126, + "learning_rate": 4.9988913574522594e-05, + "loss": 6.6848, + "step": 1595 + }, + { + "epoch": 0.009491864116471595, + "grad_norm": 2.770745277404785, + "learning_rate": 4.9988899660994014e-05, + "loss": 7.1742, + "step": 1596 + }, + { + "epoch": 0.009497811399752592, + "grad_norm": 2.7975432872772217, + "learning_rate": 4.998888573874207e-05, + "loss": 6.7329, + "step": 1597 + }, + { + "epoch": 0.00950375868303359, + "grad_norm": 2.545919418334961, + "learning_rate": 4.998887180776677e-05, + "loss": 6.7203, + "step": 1598 + }, + { + "epoch": 0.009509705966314588, + "grad_norm": 2.7961528301239014, + "learning_rate": 4.99888578680681e-05, + "loss": 7.384, + "step": 1599 + }, + { + "epoch": 0.009515653249595584, + "grad_norm": 2.570570230484009, + "learning_rate": 4.9988843919646096e-05, + "loss": 7.0246, + "step": 1600 + }, + { + "epoch": 0.009521600532876581, + "grad_norm": 2.5365843772888184, + "learning_rate": 4.9988829962500734e-05, + "loss": 6.8801, + "step": 1601 + }, + { + "epoch": 0.00952754781615758, + "grad_norm": 2.4713737964630127, + "learning_rate": 4.998881599663203e-05, + "loss": 7.1974, + "step": 1602 + }, + { + "epoch": 0.009533495099438577, + "grad_norm": 2.5286331176757812, + "learning_rate": 4.998880202203999e-05, + "loss": 7.26, + "step": 1603 + }, + { + "epoch": 0.009539442382719573, + "grad_norm": 2.2333719730377197, + "learning_rate": 4.998878803872461e-05, + "loss": 7.3254, + "step": 1604 + }, + { + "epoch": 0.00954538966600057, + "grad_norm": 2.544095277786255, + "learning_rate": 4.9988774046685915e-05, + "loss": 7.407, + "step": 1605 + }, + { + "epoch": 0.009551336949281569, + "grad_norm": 3.057140588760376, + "learning_rate": 4.9988760045923886e-05, + "loss": 6.5303, + "step": 1606 + }, + { + "epoch": 0.009557284232562566, + "grad_norm": 3.0190670490264893, + "learning_rate": 4.998874603643854e-05, + "loss": 6.3276, + "step": 1607 + }, + { + "epoch": 0.009563231515843562, + "grad_norm": 2.208249568939209, + "learning_rate": 4.998873201822989e-05, + "loss": 6.856, + "step": 1608 + }, + { + "epoch": 0.00956917879912456, + "grad_norm": 2.3519229888916016, + "learning_rate": 4.998871799129793e-05, + "loss": 6.9854, + "step": 1609 + }, + { + "epoch": 0.009575126082405558, + "grad_norm": 2.604816198348999, + "learning_rate": 4.9988703955642655e-05, + "loss": 7.3127, + "step": 1610 + }, + { + "epoch": 0.009581073365686555, + "grad_norm": 2.320030927658081, + "learning_rate": 4.9988689911264094e-05, + "loss": 7.216, + "step": 1611 + }, + { + "epoch": 0.009587020648967551, + "grad_norm": 2.8475282192230225, + "learning_rate": 4.998867585816224e-05, + "loss": 6.6743, + "step": 1612 + }, + { + "epoch": 0.009592967932248548, + "grad_norm": 2.518707036972046, + "learning_rate": 4.998866179633709e-05, + "loss": 7.0257, + "step": 1613 + }, + { + "epoch": 0.009598915215529547, + "grad_norm": 2.7348618507385254, + "learning_rate": 4.998864772578866e-05, + "loss": 7.1933, + "step": 1614 + }, + { + "epoch": 0.009604862498810544, + "grad_norm": 2.5701184272766113, + "learning_rate": 4.9988633646516946e-05, + "loss": 7.1071, + "step": 1615 + }, + { + "epoch": 0.00961080978209154, + "grad_norm": 2.916544198989868, + "learning_rate": 4.998861955852197e-05, + "loss": 7.1331, + "step": 1616 + }, + { + "epoch": 0.009616757065372537, + "grad_norm": 2.390934944152832, + "learning_rate": 4.998860546180371e-05, + "loss": 7.3252, + "step": 1617 + }, + { + "epoch": 0.009622704348653536, + "grad_norm": 2.6720097064971924, + "learning_rate": 4.998859135636219e-05, + "loss": 7.0105, + "step": 1618 + }, + { + "epoch": 0.009628651631934533, + "grad_norm": 2.3859329223632812, + "learning_rate": 4.998857724219742e-05, + "loss": 7.023, + "step": 1619 + }, + { + "epoch": 0.00963459891521553, + "grad_norm": 2.9713187217712402, + "learning_rate": 4.998856311930939e-05, + "loss": 7.0338, + "step": 1620 + }, + { + "epoch": 0.009640546198496526, + "grad_norm": 2.33858060836792, + "learning_rate": 4.998854898769811e-05, + "loss": 7.0103, + "step": 1621 + }, + { + "epoch": 0.009646493481777523, + "grad_norm": 2.8897042274475098, + "learning_rate": 4.9988534847363585e-05, + "loss": 7.1225, + "step": 1622 + }, + { + "epoch": 0.009652440765058522, + "grad_norm": 2.354513645172119, + "learning_rate": 4.9988520698305826e-05, + "loss": 6.9272, + "step": 1623 + }, + { + "epoch": 0.009658388048339519, + "grad_norm": 2.5571863651275635, + "learning_rate": 4.9988506540524826e-05, + "loss": 6.3418, + "step": 1624 + }, + { + "epoch": 0.009664335331620515, + "grad_norm": 2.342381238937378, + "learning_rate": 4.99884923740206e-05, + "loss": 6.4265, + "step": 1625 + }, + { + "epoch": 0.009670282614901512, + "grad_norm": 2.5594370365142822, + "learning_rate": 4.998847819879315e-05, + "loss": 6.9801, + "step": 1626 + }, + { + "epoch": 0.00967622989818251, + "grad_norm": 3.6932148933410645, + "learning_rate": 4.9988464014842476e-05, + "loss": 7.0231, + "step": 1627 + }, + { + "epoch": 0.009682177181463508, + "grad_norm": 2.713508367538452, + "learning_rate": 4.998844982216859e-05, + "loss": 6.9041, + "step": 1628 + }, + { + "epoch": 0.009688124464744504, + "grad_norm": 2.703103542327881, + "learning_rate": 4.99884356207715e-05, + "loss": 6.9272, + "step": 1629 + }, + { + "epoch": 0.009694071748025501, + "grad_norm": 3.228708267211914, + "learning_rate": 4.9988421410651197e-05, + "loss": 6.9242, + "step": 1630 + }, + { + "epoch": 0.0097000190313065, + "grad_norm": 3.3407063484191895, + "learning_rate": 4.9988407191807694e-05, + "loss": 6.8871, + "step": 1631 + }, + { + "epoch": 0.009705966314587497, + "grad_norm": 2.3833165168762207, + "learning_rate": 4.9988392964241005e-05, + "loss": 6.9667, + "step": 1632 + }, + { + "epoch": 0.009711913597868493, + "grad_norm": 3.607023239135742, + "learning_rate": 4.9988378727951123e-05, + "loss": 6.93, + "step": 1633 + }, + { + "epoch": 0.00971786088114949, + "grad_norm": 3.797107219696045, + "learning_rate": 4.9988364482938056e-05, + "loss": 6.8115, + "step": 1634 + }, + { + "epoch": 0.009723808164430489, + "grad_norm": 2.5586941242218018, + "learning_rate": 4.998835022920181e-05, + "loss": 6.7322, + "step": 1635 + }, + { + "epoch": 0.009729755447711486, + "grad_norm": 2.377680540084839, + "learning_rate": 4.9988335966742385e-05, + "loss": 6.7127, + "step": 1636 + }, + { + "epoch": 0.009735702730992482, + "grad_norm": 2.510584592819214, + "learning_rate": 4.998832169555979e-05, + "loss": 6.836, + "step": 1637 + }, + { + "epoch": 0.00974165001427348, + "grad_norm": 2.8817014694213867, + "learning_rate": 4.9988307415654025e-05, + "loss": 6.7812, + "step": 1638 + }, + { + "epoch": 0.009747597297554478, + "grad_norm": 2.878535509109497, + "learning_rate": 4.998829312702511e-05, + "loss": 6.7852, + "step": 1639 + }, + { + "epoch": 0.009753544580835475, + "grad_norm": 2.5870323181152344, + "learning_rate": 4.998827882967304e-05, + "loss": 6.8569, + "step": 1640 + }, + { + "epoch": 0.009759491864116471, + "grad_norm": 2.7275760173797607, + "learning_rate": 4.998826452359782e-05, + "loss": 6.8304, + "step": 1641 + }, + { + "epoch": 0.009765439147397468, + "grad_norm": 2.24550461769104, + "learning_rate": 4.998825020879945e-05, + "loss": 6.7609, + "step": 1642 + }, + { + "epoch": 0.009771386430678467, + "grad_norm": 2.2101621627807617, + "learning_rate": 4.9988235885277934e-05, + "loss": 6.7548, + "step": 1643 + }, + { + "epoch": 0.009777333713959464, + "grad_norm": 2.289870023727417, + "learning_rate": 4.9988221553033294e-05, + "loss": 6.8899, + "step": 1644 + }, + { + "epoch": 0.00978328099724046, + "grad_norm": 2.6337740421295166, + "learning_rate": 4.9988207212065516e-05, + "loss": 6.7605, + "step": 1645 + }, + { + "epoch": 0.009789228280521457, + "grad_norm": 2.442605972290039, + "learning_rate": 4.998819286237462e-05, + "loss": 6.6299, + "step": 1646 + }, + { + "epoch": 0.009795175563802456, + "grad_norm": 2.6570451259613037, + "learning_rate": 4.9988178503960606e-05, + "loss": 6.6933, + "step": 1647 + }, + { + "epoch": 0.009801122847083453, + "grad_norm": 2.597043752670288, + "learning_rate": 4.9988164136823467e-05, + "loss": 6.7667, + "step": 1648 + }, + { + "epoch": 0.00980707013036445, + "grad_norm": 3.2576608657836914, + "learning_rate": 4.998814976096323e-05, + "loss": 7.1774, + "step": 1649 + }, + { + "epoch": 0.009813017413645446, + "grad_norm": 3.110119342803955, + "learning_rate": 4.998813537637988e-05, + "loss": 7.2139, + "step": 1650 + }, + { + "epoch": 0.009818964696926445, + "grad_norm": 3.038086414337158, + "learning_rate": 4.998812098307343e-05, + "loss": 7.2752, + "step": 1651 + }, + { + "epoch": 0.009824911980207442, + "grad_norm": 2.965916872024536, + "learning_rate": 4.998810658104389e-05, + "loss": 7.1151, + "step": 1652 + }, + { + "epoch": 0.009830859263488438, + "grad_norm": 3.011476755142212, + "learning_rate": 4.998809217029126e-05, + "loss": 7.1335, + "step": 1653 + }, + { + "epoch": 0.009836806546769435, + "grad_norm": 3.8196349143981934, + "learning_rate": 4.9988077750815534e-05, + "loss": 7.0865, + "step": 1654 + }, + { + "epoch": 0.009842753830050432, + "grad_norm": 3.2577872276306152, + "learning_rate": 4.998806332261674e-05, + "loss": 7.4285, + "step": 1655 + }, + { + "epoch": 0.00984870111333143, + "grad_norm": 2.847039222717285, + "learning_rate": 4.998804888569487e-05, + "loss": 7.3251, + "step": 1656 + }, + { + "epoch": 0.009854648396612428, + "grad_norm": 3.4066355228424072, + "learning_rate": 4.998803444004992e-05, + "loss": 7.3137, + "step": 1657 + }, + { + "epoch": 0.009860595679893424, + "grad_norm": 3.6774044036865234, + "learning_rate": 4.998801998568192e-05, + "loss": 7.0772, + "step": 1658 + }, + { + "epoch": 0.009866542963174421, + "grad_norm": 3.1404600143432617, + "learning_rate": 4.998800552259085e-05, + "loss": 7.1143, + "step": 1659 + }, + { + "epoch": 0.00987249024645542, + "grad_norm": 3.6337625980377197, + "learning_rate": 4.998799105077674e-05, + "loss": 7.1296, + "step": 1660 + }, + { + "epoch": 0.009878437529736417, + "grad_norm": 4.551114082336426, + "learning_rate": 4.9987976570239566e-05, + "loss": 7.1343, + "step": 1661 + }, + { + "epoch": 0.009884384813017413, + "grad_norm": 3.2305374145507812, + "learning_rate": 4.998796208097935e-05, + "loss": 7.0852, + "step": 1662 + }, + { + "epoch": 0.00989033209629841, + "grad_norm": 2.5174615383148193, + "learning_rate": 4.99879475829961e-05, + "loss": 7.2315, + "step": 1663 + }, + { + "epoch": 0.009896279379579409, + "grad_norm": 3.623525381088257, + "learning_rate": 4.9987933076289804e-05, + "loss": 7.4222, + "step": 1664 + }, + { + "epoch": 0.009902226662860406, + "grad_norm": 4.217465877532959, + "learning_rate": 4.998791856086049e-05, + "loss": 7.4003, + "step": 1665 + }, + { + "epoch": 0.009908173946141402, + "grad_norm": 2.42301344871521, + "learning_rate": 4.998790403670815e-05, + "loss": 7.3295, + "step": 1666 + }, + { + "epoch": 0.0099141212294224, + "grad_norm": 2.3003029823303223, + "learning_rate": 4.998788950383279e-05, + "loss": 7.2072, + "step": 1667 + }, + { + "epoch": 0.009920068512703398, + "grad_norm": 3.3792307376861572, + "learning_rate": 4.9987874962234414e-05, + "loss": 7.2882, + "step": 1668 + }, + { + "epoch": 0.009926015795984395, + "grad_norm": 3.42130184173584, + "learning_rate": 4.998786041191303e-05, + "loss": 7.1231, + "step": 1669 + }, + { + "epoch": 0.009931963079265391, + "grad_norm": 3.496676445007324, + "learning_rate": 4.9987845852868644e-05, + "loss": 7.2535, + "step": 1670 + }, + { + "epoch": 0.009937910362546388, + "grad_norm": 2.695780038833618, + "learning_rate": 4.9987831285101255e-05, + "loss": 7.3784, + "step": 1671 + }, + { + "epoch": 0.009943857645827387, + "grad_norm": 2.2745561599731445, + "learning_rate": 4.998781670861088e-05, + "loss": 7.1184, + "step": 1672 + }, + { + "epoch": 0.009949804929108384, + "grad_norm": 3.8487844467163086, + "learning_rate": 4.99878021233975e-05, + "loss": 7.277, + "step": 1673 + }, + { + "epoch": 0.00995575221238938, + "grad_norm": 2.6628305912017822, + "learning_rate": 4.998778752946115e-05, + "loss": 6.8204, + "step": 1674 + }, + { + "epoch": 0.009961699495670377, + "grad_norm": 3.6330301761627197, + "learning_rate": 4.998777292680182e-05, + "loss": 7.3003, + "step": 1675 + }, + { + "epoch": 0.009967646778951376, + "grad_norm": 2.644237995147705, + "learning_rate": 4.998775831541952e-05, + "loss": 7.1492, + "step": 1676 + }, + { + "epoch": 0.009973594062232373, + "grad_norm": 2.895193099975586, + "learning_rate": 4.998774369531424e-05, + "loss": 7.3986, + "step": 1677 + }, + { + "epoch": 0.00997954134551337, + "grad_norm": 3.2180328369140625, + "learning_rate": 4.998772906648601e-05, + "loss": 7.1085, + "step": 1678 + }, + { + "epoch": 0.009985488628794366, + "grad_norm": 3.5874838829040527, + "learning_rate": 4.9987714428934815e-05, + "loss": 6.9554, + "step": 1679 + }, + { + "epoch": 0.009991435912075365, + "grad_norm": 2.419516086578369, + "learning_rate": 4.9987699782660666e-05, + "loss": 6.6222, + "step": 1680 + }, + { + "epoch": 0.009997383195356362, + "grad_norm": 2.715808153152466, + "learning_rate": 4.9987685127663574e-05, + "loss": 6.8417, + "step": 1681 + }, + { + "epoch": 0.010003330478637358, + "grad_norm": 2.2847111225128174, + "learning_rate": 4.9987670463943534e-05, + "loss": 7.1649, + "step": 1682 + }, + { + "epoch": 0.010009277761918355, + "grad_norm": 2.402684450149536, + "learning_rate": 4.998765579150056e-05, + "loss": 7.6113, + "step": 1683 + }, + { + "epoch": 0.010015225045199352, + "grad_norm": 2.54388689994812, + "learning_rate": 4.998764111033465e-05, + "loss": 7.1261, + "step": 1684 + }, + { + "epoch": 0.01002117232848035, + "grad_norm": 2.8077542781829834, + "learning_rate": 4.9987626420445823e-05, + "loss": 7.1349, + "step": 1685 + }, + { + "epoch": 0.010027119611761347, + "grad_norm": 2.228707790374756, + "learning_rate": 4.9987611721834063e-05, + "loss": 7.1123, + "step": 1686 + }, + { + "epoch": 0.010033066895042344, + "grad_norm": 2.648607015609741, + "learning_rate": 4.998759701449939e-05, + "loss": 7.0263, + "step": 1687 + }, + { + "epoch": 0.010039014178323341, + "grad_norm": 3.0278162956237793, + "learning_rate": 4.99875822984418e-05, + "loss": 6.6463, + "step": 1688 + }, + { + "epoch": 0.01004496146160434, + "grad_norm": 3.1550052165985107, + "learning_rate": 4.998756757366131e-05, + "loss": 6.8773, + "step": 1689 + }, + { + "epoch": 0.010050908744885336, + "grad_norm": 3.3911843299865723, + "learning_rate": 4.998755284015792e-05, + "loss": 7.5045, + "step": 1690 + }, + { + "epoch": 0.010056856028166333, + "grad_norm": 2.668861150741577, + "learning_rate": 4.998753809793162e-05, + "loss": 7.5545, + "step": 1691 + }, + { + "epoch": 0.01006280331144733, + "grad_norm": 2.182792901992798, + "learning_rate": 4.998752334698244e-05, + "loss": 7.2315, + "step": 1692 + }, + { + "epoch": 0.010068750594728329, + "grad_norm": 2.981476068496704, + "learning_rate": 4.998750858731037e-05, + "loss": 7.3455, + "step": 1693 + }, + { + "epoch": 0.010074697878009325, + "grad_norm": 3.1855525970458984, + "learning_rate": 4.998749381891542e-05, + "loss": 7.3408, + "step": 1694 + }, + { + "epoch": 0.010080645161290322, + "grad_norm": 2.5677361488342285, + "learning_rate": 4.998747904179759e-05, + "loss": 6.7591, + "step": 1695 + }, + { + "epoch": 0.010086592444571319, + "grad_norm": 2.7397539615631104, + "learning_rate": 4.9987464255956894e-05, + "loss": 7.3976, + "step": 1696 + }, + { + "epoch": 0.010092539727852318, + "grad_norm": 2.1141586303710938, + "learning_rate": 4.998744946139333e-05, + "loss": 7.4287, + "step": 1697 + }, + { + "epoch": 0.010098487011133314, + "grad_norm": 2.1999096870422363, + "learning_rate": 4.998743465810691e-05, + "loss": 7.4804, + "step": 1698 + }, + { + "epoch": 0.010104434294414311, + "grad_norm": 2.4150960445404053, + "learning_rate": 4.9987419846097634e-05, + "loss": 7.1743, + "step": 1699 + }, + { + "epoch": 0.010110381577695308, + "grad_norm": 2.564270496368408, + "learning_rate": 4.998740502536551e-05, + "loss": 7.262, + "step": 1700 + }, + { + "epoch": 0.010116328860976307, + "grad_norm": 3.045964241027832, + "learning_rate": 4.9987390195910536e-05, + "loss": 7.0778, + "step": 1701 + }, + { + "epoch": 0.010122276144257304, + "grad_norm": 3.2720210552215576, + "learning_rate": 4.998737535773272e-05, + "loss": 7.2188, + "step": 1702 + }, + { + "epoch": 0.0101282234275383, + "grad_norm": 2.54496693611145, + "learning_rate": 4.998736051083207e-05, + "loss": 6.9985, + "step": 1703 + }, + { + "epoch": 0.010134170710819297, + "grad_norm": 3.6252541542053223, + "learning_rate": 4.998734565520859e-05, + "loss": 7.3502, + "step": 1704 + }, + { + "epoch": 0.010140117994100296, + "grad_norm": 3.468963146209717, + "learning_rate": 4.99873307908623e-05, + "loss": 6.9642, + "step": 1705 + }, + { + "epoch": 0.010146065277381293, + "grad_norm": 2.8778045177459717, + "learning_rate": 4.9987315917793174e-05, + "loss": 6.8675, + "step": 1706 + }, + { + "epoch": 0.01015201256066229, + "grad_norm": 2.4492053985595703, + "learning_rate": 4.9987301036001236e-05, + "loss": 7.3484, + "step": 1707 + }, + { + "epoch": 0.010157959843943286, + "grad_norm": 2.5170838832855225, + "learning_rate": 4.99872861454865e-05, + "loss": 7.6004, + "step": 1708 + }, + { + "epoch": 0.010163907127224285, + "grad_norm": 2.3539648056030273, + "learning_rate": 4.998727124624895e-05, + "loss": 7.3304, + "step": 1709 + }, + { + "epoch": 0.010169854410505282, + "grad_norm": 2.6097705364227295, + "learning_rate": 4.998725633828861e-05, + "loss": 7.3227, + "step": 1710 + }, + { + "epoch": 0.010175801693786278, + "grad_norm": 2.5909392833709717, + "learning_rate": 4.9987241421605466e-05, + "loss": 7.3797, + "step": 1711 + }, + { + "epoch": 0.010181748977067275, + "grad_norm": 3.143157958984375, + "learning_rate": 4.998722649619954e-05, + "loss": 7.1236, + "step": 1712 + }, + { + "epoch": 0.010187696260348274, + "grad_norm": 2.0621843338012695, + "learning_rate": 4.9987211562070835e-05, + "loss": 7.5322, + "step": 1713 + }, + { + "epoch": 0.01019364354362927, + "grad_norm": 1.7781084775924683, + "learning_rate": 4.9987196619219354e-05, + "loss": 7.428, + "step": 1714 + }, + { + "epoch": 0.010199590826910267, + "grad_norm": 2.3108980655670166, + "learning_rate": 4.9987181667645094e-05, + "loss": 7.3814, + "step": 1715 + }, + { + "epoch": 0.010205538110191264, + "grad_norm": 2.5184621810913086, + "learning_rate": 4.998716670734807e-05, + "loss": 7.374, + "step": 1716 + }, + { + "epoch": 0.010211485393472261, + "grad_norm": 1.9185826778411865, + "learning_rate": 4.9987151738328284e-05, + "loss": 7.3352, + "step": 1717 + }, + { + "epoch": 0.01021743267675326, + "grad_norm": 2.794224262237549, + "learning_rate": 4.998713676058574e-05, + "loss": 7.0293, + "step": 1718 + }, + { + "epoch": 0.010223379960034256, + "grad_norm": 3.601804733276367, + "learning_rate": 4.998712177412045e-05, + "loss": 7.0277, + "step": 1719 + }, + { + "epoch": 0.010229327243315253, + "grad_norm": 3.3258707523345947, + "learning_rate": 4.998710677893241e-05, + "loss": 6.9478, + "step": 1720 + }, + { + "epoch": 0.01023527452659625, + "grad_norm": 3.147439956665039, + "learning_rate": 4.9987091775021625e-05, + "loss": 6.7295, + "step": 1721 + }, + { + "epoch": 0.010241221809877249, + "grad_norm": 2.7821006774902344, + "learning_rate": 4.998707676238811e-05, + "loss": 6.7587, + "step": 1722 + }, + { + "epoch": 0.010247169093158245, + "grad_norm": 2.580597400665283, + "learning_rate": 4.998706174103186e-05, + "loss": 6.9091, + "step": 1723 + }, + { + "epoch": 0.010253116376439242, + "grad_norm": 2.5501208305358887, + "learning_rate": 4.998704671095289e-05, + "loss": 7.3262, + "step": 1724 + }, + { + "epoch": 0.010259063659720239, + "grad_norm": 2.5460124015808105, + "learning_rate": 4.99870316721512e-05, + "loss": 7.278, + "step": 1725 + }, + { + "epoch": 0.010265010943001238, + "grad_norm": 2.0253796577453613, + "learning_rate": 4.998701662462679e-05, + "loss": 7.1757, + "step": 1726 + }, + { + "epoch": 0.010270958226282234, + "grad_norm": 2.3127388954162598, + "learning_rate": 4.998700156837968e-05, + "loss": 7.1057, + "step": 1727 + }, + { + "epoch": 0.010276905509563231, + "grad_norm": 2.931878089904785, + "learning_rate": 4.998698650340986e-05, + "loss": 6.9993, + "step": 1728 + }, + { + "epoch": 0.010282852792844228, + "grad_norm": 3.239272356033325, + "learning_rate": 4.998697142971734e-05, + "loss": 6.7754, + "step": 1729 + }, + { + "epoch": 0.010288800076125227, + "grad_norm": 2.388212203979492, + "learning_rate": 4.998695634730213e-05, + "loss": 7.2794, + "step": 1730 + }, + { + "epoch": 0.010294747359406223, + "grad_norm": 2.7766799926757812, + "learning_rate": 4.998694125616423e-05, + "loss": 7.4636, + "step": 1731 + }, + { + "epoch": 0.01030069464268722, + "grad_norm": 2.543757915496826, + "learning_rate": 4.9986926156303646e-05, + "loss": 6.8801, + "step": 1732 + }, + { + "epoch": 0.010306641925968217, + "grad_norm": 1.8907097578048706, + "learning_rate": 4.9986911047720384e-05, + "loss": 7.0353, + "step": 1733 + }, + { + "epoch": 0.010312589209249216, + "grad_norm": 1.9585598707199097, + "learning_rate": 4.9986895930414444e-05, + "loss": 7.0469, + "step": 1734 + }, + { + "epoch": 0.010318536492530212, + "grad_norm": 2.5191497802734375, + "learning_rate": 4.998688080438585e-05, + "loss": 7.1469, + "step": 1735 + }, + { + "epoch": 0.01032448377581121, + "grad_norm": 3.5709545612335205, + "learning_rate": 4.998686566963459e-05, + "loss": 7.0499, + "step": 1736 + }, + { + "epoch": 0.010330431059092206, + "grad_norm": 2.3778624534606934, + "learning_rate": 4.998685052616067e-05, + "loss": 7.5897, + "step": 1737 + }, + { + "epoch": 0.010336378342373205, + "grad_norm": 2.0795674324035645, + "learning_rate": 4.9986835373964094e-05, + "loss": 6.8778, + "step": 1738 + }, + { + "epoch": 0.010342325625654201, + "grad_norm": 2.7674901485443115, + "learning_rate": 4.9986820213044875e-05, + "loss": 6.4428, + "step": 1739 + }, + { + "epoch": 0.010348272908935198, + "grad_norm": 2.7203595638275146, + "learning_rate": 4.998680504340302e-05, + "loss": 7.4668, + "step": 1740 + }, + { + "epoch": 0.010354220192216195, + "grad_norm": 2.840240955352783, + "learning_rate": 4.998678986503853e-05, + "loss": 7.2219, + "step": 1741 + }, + { + "epoch": 0.010360167475497194, + "grad_norm": 2.7803452014923096, + "learning_rate": 4.9986774677951404e-05, + "loss": 6.5674, + "step": 1742 + }, + { + "epoch": 0.01036611475877819, + "grad_norm": 2.467574119567871, + "learning_rate": 4.998675948214165e-05, + "loss": 6.9621, + "step": 1743 + }, + { + "epoch": 0.010372062042059187, + "grad_norm": 2.1437904834747314, + "learning_rate": 4.998674427760929e-05, + "loss": 7.1564, + "step": 1744 + }, + { + "epoch": 0.010378009325340184, + "grad_norm": 2.504685163497925, + "learning_rate": 4.9986729064354304e-05, + "loss": 6.8836, + "step": 1745 + }, + { + "epoch": 0.010383956608621183, + "grad_norm": 2.401296615600586, + "learning_rate": 4.998671384237671e-05, + "loss": 7.2906, + "step": 1746 + }, + { + "epoch": 0.01038990389190218, + "grad_norm": 2.233701705932617, + "learning_rate": 4.9986698611676516e-05, + "loss": 6.6854, + "step": 1747 + }, + { + "epoch": 0.010395851175183176, + "grad_norm": 2.9597983360290527, + "learning_rate": 4.998668337225373e-05, + "loss": 6.8859, + "step": 1748 + }, + { + "epoch": 0.010401798458464173, + "grad_norm": 3.2164804935455322, + "learning_rate": 4.998666812410834e-05, + "loss": 6.8255, + "step": 1749 + }, + { + "epoch": 0.01040774574174517, + "grad_norm": 3.010002374649048, + "learning_rate": 4.9986652867240364e-05, + "loss": 6.7092, + "step": 1750 + }, + { + "epoch": 0.010413693025026169, + "grad_norm": 2.8442068099975586, + "learning_rate": 4.998663760164981e-05, + "loss": 6.7231, + "step": 1751 + }, + { + "epoch": 0.010419640308307165, + "grad_norm": 3.127922773361206, + "learning_rate": 4.9986622327336676e-05, + "loss": 6.6072, + "step": 1752 + }, + { + "epoch": 0.010425587591588162, + "grad_norm": 2.7306833267211914, + "learning_rate": 4.998660704430097e-05, + "loss": 6.696, + "step": 1753 + }, + { + "epoch": 0.010431534874869159, + "grad_norm": 2.9005799293518066, + "learning_rate": 4.99865917525427e-05, + "loss": 6.6598, + "step": 1754 + }, + { + "epoch": 0.010437482158150158, + "grad_norm": 3.17934513092041, + "learning_rate": 4.9986576452061865e-05, + "loss": 6.5887, + "step": 1755 + }, + { + "epoch": 0.010443429441431154, + "grad_norm": 2.9390244483947754, + "learning_rate": 4.9986561142858476e-05, + "loss": 6.5375, + "step": 1756 + }, + { + "epoch": 0.010449376724712151, + "grad_norm": 2.5547196865081787, + "learning_rate": 4.998654582493254e-05, + "loss": 6.7484, + "step": 1757 + }, + { + "epoch": 0.010455324007993148, + "grad_norm": 2.9969568252563477, + "learning_rate": 4.9986530498284054e-05, + "loss": 6.6496, + "step": 1758 + }, + { + "epoch": 0.010461271291274147, + "grad_norm": 2.843932867050171, + "learning_rate": 4.998651516291303e-05, + "loss": 6.5713, + "step": 1759 + }, + { + "epoch": 0.010467218574555143, + "grad_norm": 2.9114811420440674, + "learning_rate": 4.9986499818819476e-05, + "loss": 7.5248, + "step": 1760 + }, + { + "epoch": 0.01047316585783614, + "grad_norm": 3.0292229652404785, + "learning_rate": 4.998648446600339e-05, + "loss": 7.2346, + "step": 1761 + }, + { + "epoch": 0.010479113141117137, + "grad_norm": 2.553088426589966, + "learning_rate": 4.998646910446478e-05, + "loss": 7.1531, + "step": 1762 + }, + { + "epoch": 0.010485060424398136, + "grad_norm": 2.9838356971740723, + "learning_rate": 4.998645373420365e-05, + "loss": 6.6561, + "step": 1763 + }, + { + "epoch": 0.010491007707679132, + "grad_norm": 2.8948864936828613, + "learning_rate": 4.9986438355220014e-05, + "loss": 6.463, + "step": 1764 + }, + { + "epoch": 0.01049695499096013, + "grad_norm": 2.805084228515625, + "learning_rate": 4.9986422967513856e-05, + "loss": 6.701, + "step": 1765 + }, + { + "epoch": 0.010502902274241126, + "grad_norm": 2.748077869415283, + "learning_rate": 4.998640757108522e-05, + "loss": 7.3223, + "step": 1766 + }, + { + "epoch": 0.010508849557522125, + "grad_norm": 3.0048258304595947, + "learning_rate": 4.998639216593406e-05, + "loss": 7.2582, + "step": 1767 + }, + { + "epoch": 0.010514796840803121, + "grad_norm": 2.538522958755493, + "learning_rate": 4.998637675206043e-05, + "loss": 7.1208, + "step": 1768 + }, + { + "epoch": 0.010520744124084118, + "grad_norm": 2.2091188430786133, + "learning_rate": 4.99863613294643e-05, + "loss": 7.0577, + "step": 1769 + }, + { + "epoch": 0.010526691407365115, + "grad_norm": 2.8454909324645996, + "learning_rate": 4.998634589814569e-05, + "loss": 7.1296, + "step": 1770 + }, + { + "epoch": 0.010532638690646114, + "grad_norm": 3.4139351844787598, + "learning_rate": 4.998633045810461e-05, + "loss": 6.9565, + "step": 1771 + }, + { + "epoch": 0.01053858597392711, + "grad_norm": 2.3192107677459717, + "learning_rate": 4.9986315009341066e-05, + "loss": 6.6027, + "step": 1772 + }, + { + "epoch": 0.010544533257208107, + "grad_norm": 2.309290647506714, + "learning_rate": 4.998629955185505e-05, + "loss": 7.0417, + "step": 1773 + }, + { + "epoch": 0.010550480540489104, + "grad_norm": 3.2046520709991455, + "learning_rate": 4.998628408564657e-05, + "loss": 7.0368, + "step": 1774 + }, + { + "epoch": 0.010556427823770103, + "grad_norm": 2.459064483642578, + "learning_rate": 4.9986268610715646e-05, + "loss": 7.2726, + "step": 1775 + }, + { + "epoch": 0.0105623751070511, + "grad_norm": 2.602522134780884, + "learning_rate": 4.998625312706227e-05, + "loss": 7.3377, + "step": 1776 + }, + { + "epoch": 0.010568322390332096, + "grad_norm": 3.9599175453186035, + "learning_rate": 4.998623763468645e-05, + "loss": 6.9146, + "step": 1777 + }, + { + "epoch": 0.010574269673613093, + "grad_norm": 3.312527894973755, + "learning_rate": 4.99862221335882e-05, + "loss": 6.7457, + "step": 1778 + }, + { + "epoch": 0.01058021695689409, + "grad_norm": 2.5287606716156006, + "learning_rate": 4.9986206623767506e-05, + "loss": 7.2651, + "step": 1779 + }, + { + "epoch": 0.010586164240175088, + "grad_norm": 2.4065616130828857, + "learning_rate": 4.99861911052244e-05, + "loss": 7.1135, + "step": 1780 + }, + { + "epoch": 0.010592111523456085, + "grad_norm": 2.321385383605957, + "learning_rate": 4.998617557795886e-05, + "loss": 7.1985, + "step": 1781 + }, + { + "epoch": 0.010598058806737082, + "grad_norm": 2.118995189666748, + "learning_rate": 4.9986160041970906e-05, + "loss": 7.2832, + "step": 1782 + }, + { + "epoch": 0.010604006090018079, + "grad_norm": 2.2536606788635254, + "learning_rate": 4.9986144497260544e-05, + "loss": 7.191, + "step": 1783 + }, + { + "epoch": 0.010609953373299078, + "grad_norm": 2.2956738471984863, + "learning_rate": 4.998612894382778e-05, + "loss": 7.0496, + "step": 1784 + }, + { + "epoch": 0.010615900656580074, + "grad_norm": 2.4258289337158203, + "learning_rate": 4.9986113381672614e-05, + "loss": 7.2767, + "step": 1785 + }, + { + "epoch": 0.010621847939861071, + "grad_norm": 2.4731507301330566, + "learning_rate": 4.998609781079505e-05, + "loss": 6.8805, + "step": 1786 + }, + { + "epoch": 0.010627795223142068, + "grad_norm": 2.3245391845703125, + "learning_rate": 4.9986082231195105e-05, + "loss": 6.8921, + "step": 1787 + }, + { + "epoch": 0.010633742506423067, + "grad_norm": 2.6239898204803467, + "learning_rate": 4.998606664287278e-05, + "loss": 6.9353, + "step": 1788 + }, + { + "epoch": 0.010639689789704063, + "grad_norm": 2.186162233352661, + "learning_rate": 4.9986051045828065e-05, + "loss": 6.8466, + "step": 1789 + }, + { + "epoch": 0.01064563707298506, + "grad_norm": 2.2362232208251953, + "learning_rate": 4.998603544006098e-05, + "loss": 6.82, + "step": 1790 + }, + { + "epoch": 0.010651584356266057, + "grad_norm": 2.2302427291870117, + "learning_rate": 4.998601982557153e-05, + "loss": 6.7034, + "step": 1791 + }, + { + "epoch": 0.010657531639547056, + "grad_norm": 2.0393195152282715, + "learning_rate": 4.998600420235972e-05, + "loss": 6.6646, + "step": 1792 + }, + { + "epoch": 0.010663478922828052, + "grad_norm": 1.976536512374878, + "learning_rate": 4.9985988570425556e-05, + "loss": 6.4994, + "step": 1793 + }, + { + "epoch": 0.01066942620610905, + "grad_norm": 2.4167046546936035, + "learning_rate": 4.998597292976904e-05, + "loss": 6.7849, + "step": 1794 + }, + { + "epoch": 0.010675373489390046, + "grad_norm": 2.3077776432037354, + "learning_rate": 4.998595728039018e-05, + "loss": 6.8356, + "step": 1795 + }, + { + "epoch": 0.010681320772671045, + "grad_norm": 2.5263309478759766, + "learning_rate": 4.998594162228898e-05, + "loss": 6.6351, + "step": 1796 + }, + { + "epoch": 0.010687268055952041, + "grad_norm": 2.153365135192871, + "learning_rate": 4.9985925955465443e-05, + "loss": 6.7911, + "step": 1797 + }, + { + "epoch": 0.010693215339233038, + "grad_norm": 3.3034393787384033, + "learning_rate": 4.998591027991958e-05, + "loss": 6.7589, + "step": 1798 + }, + { + "epoch": 0.010699162622514035, + "grad_norm": 2.2177388668060303, + "learning_rate": 4.998589459565139e-05, + "loss": 6.571, + "step": 1799 + }, + { + "epoch": 0.010705109905795034, + "grad_norm": 2.3165230751037598, + "learning_rate": 4.9985878902660886e-05, + "loss": 6.9124, + "step": 1800 + }, + { + "epoch": 0.01071105718907603, + "grad_norm": 2.270045757293701, + "learning_rate": 4.998586320094807e-05, + "loss": 6.4442, + "step": 1801 + }, + { + "epoch": 0.010717004472357027, + "grad_norm": 2.1198744773864746, + "learning_rate": 4.9985847490512945e-05, + "loss": 6.555, + "step": 1802 + }, + { + "epoch": 0.010722951755638024, + "grad_norm": 2.5428359508514404, + "learning_rate": 4.998583177135552e-05, + "loss": 6.8991, + "step": 1803 + }, + { + "epoch": 0.010728899038919023, + "grad_norm": 1.983817219734192, + "learning_rate": 4.99858160434758e-05, + "loss": 6.6428, + "step": 1804 + }, + { + "epoch": 0.01073484632220002, + "grad_norm": 2.2749712467193604, + "learning_rate": 4.998580030687379e-05, + "loss": 6.7294, + "step": 1805 + }, + { + "epoch": 0.010740793605481016, + "grad_norm": 1.914762258529663, + "learning_rate": 4.998578456154949e-05, + "loss": 7.0395, + "step": 1806 + }, + { + "epoch": 0.010746740888762013, + "grad_norm": 1.6850765943527222, + "learning_rate": 4.998576880750292e-05, + "loss": 6.862, + "step": 1807 + }, + { + "epoch": 0.010752688172043012, + "grad_norm": 2.2930233478546143, + "learning_rate": 4.9985753044734076e-05, + "loss": 6.8213, + "step": 1808 + }, + { + "epoch": 0.010758635455324008, + "grad_norm": 2.193464756011963, + "learning_rate": 4.998573727324295e-05, + "loss": 6.9303, + "step": 1809 + }, + { + "epoch": 0.010764582738605005, + "grad_norm": 2.2451658248901367, + "learning_rate": 4.9985721493029576e-05, + "loss": 6.8061, + "step": 1810 + }, + { + "epoch": 0.010770530021886002, + "grad_norm": 2.164214849472046, + "learning_rate": 4.998570570409394e-05, + "loss": 6.6485, + "step": 1811 + }, + { + "epoch": 0.010776477305166999, + "grad_norm": 2.3530375957489014, + "learning_rate": 4.9985689906436054e-05, + "loss": 6.6826, + "step": 1812 + }, + { + "epoch": 0.010782424588447997, + "grad_norm": 3.007641553878784, + "learning_rate": 4.998567410005591e-05, + "loss": 6.0781, + "step": 1813 + }, + { + "epoch": 0.010788371871728994, + "grad_norm": 2.500411033630371, + "learning_rate": 4.998565828495354e-05, + "loss": 7.0544, + "step": 1814 + }, + { + "epoch": 0.010794319155009991, + "grad_norm": 2.329221725463867, + "learning_rate": 4.998564246112893e-05, + "loss": 7.2505, + "step": 1815 + }, + { + "epoch": 0.010800266438290988, + "grad_norm": 2.05120849609375, + "learning_rate": 4.998562662858209e-05, + "loss": 7.3094, + "step": 1816 + }, + { + "epoch": 0.010806213721571986, + "grad_norm": 1.83049738407135, + "learning_rate": 4.9985610787313023e-05, + "loss": 6.7752, + "step": 1817 + }, + { + "epoch": 0.010812161004852983, + "grad_norm": 2.2754576206207275, + "learning_rate": 4.998559493732174e-05, + "loss": 6.9396, + "step": 1818 + }, + { + "epoch": 0.01081810828813398, + "grad_norm": 2.104849338531494, + "learning_rate": 4.998557907860825e-05, + "loss": 7.2624, + "step": 1819 + }, + { + "epoch": 0.010824055571414977, + "grad_norm": 3.152069568634033, + "learning_rate": 4.998556321117254e-05, + "loss": 6.6763, + "step": 1820 + }, + { + "epoch": 0.010830002854695975, + "grad_norm": 3.4046475887298584, + "learning_rate": 4.9985547335014636e-05, + "loss": 6.7145, + "step": 1821 + }, + { + "epoch": 0.010835950137976972, + "grad_norm": 1.9208084344863892, + "learning_rate": 4.9985531450134534e-05, + "loss": 6.8985, + "step": 1822 + }, + { + "epoch": 0.010841897421257969, + "grad_norm": 2.4949824810028076, + "learning_rate": 4.998551555653224e-05, + "loss": 6.8196, + "step": 1823 + }, + { + "epoch": 0.010847844704538966, + "grad_norm": 2.613175392150879, + "learning_rate": 4.998549965420776e-05, + "loss": 6.7918, + "step": 1824 + }, + { + "epoch": 0.010853791987819965, + "grad_norm": 2.3322529792785645, + "learning_rate": 4.9985483743161105e-05, + "loss": 6.6133, + "step": 1825 + }, + { + "epoch": 0.010859739271100961, + "grad_norm": 3.116680860519409, + "learning_rate": 4.998546782339227e-05, + "loss": 7.4026, + "step": 1826 + }, + { + "epoch": 0.010865686554381958, + "grad_norm": 2.673938274383545, + "learning_rate": 4.998545189490127e-05, + "loss": 6.9181, + "step": 1827 + }, + { + "epoch": 0.010871633837662955, + "grad_norm": 2.135727643966675, + "learning_rate": 4.998543595768811e-05, + "loss": 6.9514, + "step": 1828 + }, + { + "epoch": 0.010877581120943954, + "grad_norm": 2.241696357727051, + "learning_rate": 4.9985420011752784e-05, + "loss": 7.126, + "step": 1829 + }, + { + "epoch": 0.01088352840422495, + "grad_norm": 2.316342830657959, + "learning_rate": 4.9985404057095315e-05, + "loss": 6.9752, + "step": 1830 + }, + { + "epoch": 0.010889475687505947, + "grad_norm": 2.591611623764038, + "learning_rate": 4.998538809371569e-05, + "loss": 6.8721, + "step": 1831 + }, + { + "epoch": 0.010895422970786944, + "grad_norm": 2.2846317291259766, + "learning_rate": 4.9985372121613935e-05, + "loss": 6.9468, + "step": 1832 + }, + { + "epoch": 0.010901370254067943, + "grad_norm": 2.0799343585968018, + "learning_rate": 4.998535614079004e-05, + "loss": 7.0839, + "step": 1833 + }, + { + "epoch": 0.01090731753734894, + "grad_norm": 2.1908833980560303, + "learning_rate": 4.998534015124401e-05, + "loss": 6.7228, + "step": 1834 + }, + { + "epoch": 0.010913264820629936, + "grad_norm": 2.329401969909668, + "learning_rate": 4.998532415297587e-05, + "loss": 6.715, + "step": 1835 + }, + { + "epoch": 0.010919212103910933, + "grad_norm": 1.9492794275283813, + "learning_rate": 4.998530814598559e-05, + "loss": 6.6762, + "step": 1836 + }, + { + "epoch": 0.010925159387191932, + "grad_norm": 1.9564979076385498, + "learning_rate": 4.998529213027321e-05, + "loss": 6.8545, + "step": 1837 + }, + { + "epoch": 0.010931106670472928, + "grad_norm": 1.8424931764602661, + "learning_rate": 4.998527610583872e-05, + "loss": 6.8505, + "step": 1838 + }, + { + "epoch": 0.010937053953753925, + "grad_norm": 1.9743967056274414, + "learning_rate": 4.998526007268213e-05, + "loss": 6.8413, + "step": 1839 + }, + { + "epoch": 0.010943001237034922, + "grad_norm": 2.31296968460083, + "learning_rate": 4.998524403080345e-05, + "loss": 6.7327, + "step": 1840 + }, + { + "epoch": 0.010948948520315919, + "grad_norm": 2.049689292907715, + "learning_rate": 4.9985227980202665e-05, + "loss": 7.0029, + "step": 1841 + }, + { + "epoch": 0.010954895803596917, + "grad_norm": 2.1640658378601074, + "learning_rate": 4.99852119208798e-05, + "loss": 7.0749, + "step": 1842 + }, + { + "epoch": 0.010960843086877914, + "grad_norm": 1.8896230459213257, + "learning_rate": 4.998519585283486e-05, + "loss": 6.7249, + "step": 1843 + }, + { + "epoch": 0.010966790370158911, + "grad_norm": 2.4835314750671387, + "learning_rate": 4.998517977606785e-05, + "loss": 6.5605, + "step": 1844 + }, + { + "epoch": 0.010972737653439908, + "grad_norm": 2.2472622394561768, + "learning_rate": 4.998516369057876e-05, + "loss": 6.8291, + "step": 1845 + }, + { + "epoch": 0.010978684936720906, + "grad_norm": 2.499096155166626, + "learning_rate": 4.998514759636762e-05, + "loss": 6.6921, + "step": 1846 + }, + { + "epoch": 0.010984632220001903, + "grad_norm": 2.296786308288574, + "learning_rate": 4.998513149343442e-05, + "loss": 7.0475, + "step": 1847 + }, + { + "epoch": 0.0109905795032829, + "grad_norm": 2.2896368503570557, + "learning_rate": 4.998511538177916e-05, + "loss": 6.775, + "step": 1848 + }, + { + "epoch": 0.010996526786563897, + "grad_norm": 2.025575637817383, + "learning_rate": 4.998509926140186e-05, + "loss": 6.9538, + "step": 1849 + }, + { + "epoch": 0.011002474069844895, + "grad_norm": 2.23502779006958, + "learning_rate": 4.9985083132302525e-05, + "loss": 7.0595, + "step": 1850 + }, + { + "epoch": 0.011008421353125892, + "grad_norm": 2.7158777713775635, + "learning_rate": 4.998506699448115e-05, + "loss": 7.0086, + "step": 1851 + }, + { + "epoch": 0.011014368636406889, + "grad_norm": 2.2707183361053467, + "learning_rate": 4.998505084793775e-05, + "loss": 6.6396, + "step": 1852 + }, + { + "epoch": 0.011020315919687886, + "grad_norm": 3.196085214614868, + "learning_rate": 4.998503469267232e-05, + "loss": 6.6026, + "step": 1853 + }, + { + "epoch": 0.011026263202968884, + "grad_norm": 2.4472603797912598, + "learning_rate": 4.9985018528684876e-05, + "loss": 7.1332, + "step": 1854 + }, + { + "epoch": 0.011032210486249881, + "grad_norm": 2.7070915699005127, + "learning_rate": 4.998500235597542e-05, + "loss": 6.9669, + "step": 1855 + }, + { + "epoch": 0.011038157769530878, + "grad_norm": 2.127729654312134, + "learning_rate": 4.998498617454396e-05, + "loss": 6.9589, + "step": 1856 + }, + { + "epoch": 0.011044105052811875, + "grad_norm": 2.2897160053253174, + "learning_rate": 4.99849699843905e-05, + "loss": 7.0402, + "step": 1857 + }, + { + "epoch": 0.011050052336092873, + "grad_norm": 1.888961672782898, + "learning_rate": 4.998495378551504e-05, + "loss": 6.9406, + "step": 1858 + }, + { + "epoch": 0.01105599961937387, + "grad_norm": 1.9889254570007324, + "learning_rate": 4.9984937577917594e-05, + "loss": 6.8392, + "step": 1859 + }, + { + "epoch": 0.011061946902654867, + "grad_norm": 3.042891025543213, + "learning_rate": 4.998492136159817e-05, + "loss": 6.7743, + "step": 1860 + }, + { + "epoch": 0.011067894185935864, + "grad_norm": 2.423988103866577, + "learning_rate": 4.998490513655676e-05, + "loss": 6.9802, + "step": 1861 + }, + { + "epoch": 0.011073841469216862, + "grad_norm": 2.6415674686431885, + "learning_rate": 4.998488890279338e-05, + "loss": 6.7104, + "step": 1862 + }, + { + "epoch": 0.01107978875249786, + "grad_norm": 2.686969518661499, + "learning_rate": 4.998487266030804e-05, + "loss": 7.0539, + "step": 1863 + }, + { + "epoch": 0.011085736035778856, + "grad_norm": 2.6695480346679688, + "learning_rate": 4.998485640910072e-05, + "loss": 6.9812, + "step": 1864 + }, + { + "epoch": 0.011091683319059853, + "grad_norm": 2.6251392364501953, + "learning_rate": 4.9984840149171466e-05, + "loss": 6.9954, + "step": 1865 + }, + { + "epoch": 0.011097630602340851, + "grad_norm": 2.487593650817871, + "learning_rate": 4.998482388052025e-05, + "loss": 7.0847, + "step": 1866 + }, + { + "epoch": 0.011103577885621848, + "grad_norm": 2.3249282836914062, + "learning_rate": 4.998480760314709e-05, + "loss": 6.9936, + "step": 1867 + }, + { + "epoch": 0.011109525168902845, + "grad_norm": 2.170452833175659, + "learning_rate": 4.9984791317052e-05, + "loss": 6.9155, + "step": 1868 + }, + { + "epoch": 0.011115472452183842, + "grad_norm": 3.331779718399048, + "learning_rate": 4.9984775022234975e-05, + "loss": 6.9128, + "step": 1869 + }, + { + "epoch": 0.01112141973546484, + "grad_norm": 2.7665064334869385, + "learning_rate": 4.9984758718696026e-05, + "loss": 6.9002, + "step": 1870 + }, + { + "epoch": 0.011127367018745837, + "grad_norm": 2.2872116565704346, + "learning_rate": 4.998474240643515e-05, + "loss": 6.9058, + "step": 1871 + }, + { + "epoch": 0.011133314302026834, + "grad_norm": 2.2125210762023926, + "learning_rate": 4.998472608545236e-05, + "loss": 6.932, + "step": 1872 + }, + { + "epoch": 0.011139261585307831, + "grad_norm": 2.1135666370391846, + "learning_rate": 4.998470975574766e-05, + "loss": 7.0018, + "step": 1873 + }, + { + "epoch": 0.011145208868588828, + "grad_norm": 2.0649492740631104, + "learning_rate": 4.998469341732105e-05, + "loss": 7.0132, + "step": 1874 + }, + { + "epoch": 0.011151156151869826, + "grad_norm": 4.0558576583862305, + "learning_rate": 4.9984677070172546e-05, + "loss": 6.8826, + "step": 1875 + }, + { + "epoch": 0.011157103435150823, + "grad_norm": 2.5675904750823975, + "learning_rate": 4.998466071430216e-05, + "loss": 7.0314, + "step": 1876 + }, + { + "epoch": 0.01116305071843182, + "grad_norm": 2.9773342609405518, + "learning_rate": 4.998464434970987e-05, + "loss": 6.8608, + "step": 1877 + }, + { + "epoch": 0.011168998001712817, + "grad_norm": 2.804995059967041, + "learning_rate": 4.9984627976395705e-05, + "loss": 6.6857, + "step": 1878 + }, + { + "epoch": 0.011174945284993815, + "grad_norm": 3.758509874343872, + "learning_rate": 4.9984611594359664e-05, + "loss": 6.9995, + "step": 1879 + }, + { + "epoch": 0.011180892568274812, + "grad_norm": 2.583061933517456, + "learning_rate": 4.998459520360176e-05, + "loss": 6.5844, + "step": 1880 + }, + { + "epoch": 0.011186839851555809, + "grad_norm": 2.357642889022827, + "learning_rate": 4.998457880412198e-05, + "loss": 6.6435, + "step": 1881 + }, + { + "epoch": 0.011192787134836806, + "grad_norm": 2.181558609008789, + "learning_rate": 4.9984562395920356e-05, + "loss": 7.045, + "step": 1882 + }, + { + "epoch": 0.011198734418117804, + "grad_norm": 2.4768264293670654, + "learning_rate": 4.998454597899688e-05, + "loss": 7.2053, + "step": 1883 + }, + { + "epoch": 0.011204681701398801, + "grad_norm": 2.4422380924224854, + "learning_rate": 4.998452955335154e-05, + "loss": 6.8038, + "step": 1884 + }, + { + "epoch": 0.011210628984679798, + "grad_norm": 3.3173701763153076, + "learning_rate": 4.998451311898437e-05, + "loss": 6.8619, + "step": 1885 + }, + { + "epoch": 0.011216576267960795, + "grad_norm": 2.4492833614349365, + "learning_rate": 4.9984496675895366e-05, + "loss": 6.6681, + "step": 1886 + }, + { + "epoch": 0.011222523551241793, + "grad_norm": 3.065016031265259, + "learning_rate": 4.998448022408453e-05, + "loss": 6.7439, + "step": 1887 + }, + { + "epoch": 0.01122847083452279, + "grad_norm": 3.327730655670166, + "learning_rate": 4.998446376355187e-05, + "loss": 6.735, + "step": 1888 + }, + { + "epoch": 0.011234418117803787, + "grad_norm": 3.428292751312256, + "learning_rate": 4.998444729429739e-05, + "loss": 6.5277, + "step": 1889 + }, + { + "epoch": 0.011240365401084784, + "grad_norm": 2.4982972145080566, + "learning_rate": 4.9984430816321095e-05, + "loss": 6.8228, + "step": 1890 + }, + { + "epoch": 0.011246312684365782, + "grad_norm": 2.568232297897339, + "learning_rate": 4.9984414329623e-05, + "loss": 7.0772, + "step": 1891 + }, + { + "epoch": 0.01125225996764678, + "grad_norm": 2.534109115600586, + "learning_rate": 4.99843978342031e-05, + "loss": 7.0259, + "step": 1892 + }, + { + "epoch": 0.011258207250927776, + "grad_norm": 2.6394994258880615, + "learning_rate": 4.998438133006141e-05, + "loss": 6.8692, + "step": 1893 + }, + { + "epoch": 0.011264154534208773, + "grad_norm": 2.4049339294433594, + "learning_rate": 4.998436481719792e-05, + "loss": 6.8653, + "step": 1894 + }, + { + "epoch": 0.011270101817489771, + "grad_norm": 2.661191701889038, + "learning_rate": 4.998434829561266e-05, + "loss": 6.628, + "step": 1895 + }, + { + "epoch": 0.011276049100770768, + "grad_norm": 2.395829916000366, + "learning_rate": 4.998433176530561e-05, + "loss": 6.9876, + "step": 1896 + }, + { + "epoch": 0.011281996384051765, + "grad_norm": 2.547858715057373, + "learning_rate": 4.99843152262768e-05, + "loss": 7.3832, + "step": 1897 + }, + { + "epoch": 0.011287943667332762, + "grad_norm": 2.364246368408203, + "learning_rate": 4.998429867852621e-05, + "loss": 7.3771, + "step": 1898 + }, + { + "epoch": 0.01129389095061376, + "grad_norm": 2.3385260105133057, + "learning_rate": 4.998428212205387e-05, + "loss": 6.971, + "step": 1899 + }, + { + "epoch": 0.011299838233894757, + "grad_norm": 2.253760576248169, + "learning_rate": 4.998426555685977e-05, + "loss": 7.0588, + "step": 1900 + }, + { + "epoch": 0.011305785517175754, + "grad_norm": 2.4103500843048096, + "learning_rate": 4.998424898294392e-05, + "loss": 6.8731, + "step": 1901 + }, + { + "epoch": 0.011311732800456751, + "grad_norm": 2.4819014072418213, + "learning_rate": 4.998423240030633e-05, + "loss": 6.9502, + "step": 1902 + }, + { + "epoch": 0.011317680083737748, + "grad_norm": 2.503901243209839, + "learning_rate": 4.998421580894701e-05, + "loss": 7.017, + "step": 1903 + }, + { + "epoch": 0.011323627367018746, + "grad_norm": 2.2224137783050537, + "learning_rate": 4.9984199208865943e-05, + "loss": 7.1938, + "step": 1904 + }, + { + "epoch": 0.011329574650299743, + "grad_norm": 2.1291286945343018, + "learning_rate": 4.998418260006316e-05, + "loss": 7.1152, + "step": 1905 + }, + { + "epoch": 0.01133552193358074, + "grad_norm": 2.4611241817474365, + "learning_rate": 4.9984165982538655e-05, + "loss": 7.0316, + "step": 1906 + }, + { + "epoch": 0.011341469216861737, + "grad_norm": 2.329432487487793, + "learning_rate": 4.998414935629243e-05, + "loss": 7.0032, + "step": 1907 + }, + { + "epoch": 0.011347416500142735, + "grad_norm": 2.0618371963500977, + "learning_rate": 4.9984132721324505e-05, + "loss": 7.2566, + "step": 1908 + }, + { + "epoch": 0.011353363783423732, + "grad_norm": 2.063511371612549, + "learning_rate": 4.998411607763487e-05, + "loss": 7.0144, + "step": 1909 + }, + { + "epoch": 0.011359311066704729, + "grad_norm": 2.188871145248413, + "learning_rate": 4.998409942522355e-05, + "loss": 6.9652, + "step": 1910 + }, + { + "epoch": 0.011365258349985726, + "grad_norm": 2.499746322631836, + "learning_rate": 4.998408276409053e-05, + "loss": 6.9173, + "step": 1911 + }, + { + "epoch": 0.011371205633266724, + "grad_norm": 2.2809276580810547, + "learning_rate": 4.9984066094235826e-05, + "loss": 6.9202, + "step": 1912 + }, + { + "epoch": 0.011377152916547721, + "grad_norm": 1.7967042922973633, + "learning_rate": 4.998404941565944e-05, + "loss": 7.0652, + "step": 1913 + }, + { + "epoch": 0.011383100199828718, + "grad_norm": 2.339747667312622, + "learning_rate": 4.9984032728361384e-05, + "loss": 6.943, + "step": 1914 + }, + { + "epoch": 0.011389047483109715, + "grad_norm": 2.65795636177063, + "learning_rate": 4.998401603234166e-05, + "loss": 6.7197, + "step": 1915 + }, + { + "epoch": 0.011394994766390713, + "grad_norm": 2.181105852127075, + "learning_rate": 4.998399932760027e-05, + "loss": 6.7358, + "step": 1916 + }, + { + "epoch": 0.01140094204967171, + "grad_norm": 2.4130990505218506, + "learning_rate": 4.998398261413723e-05, + "loss": 6.8653, + "step": 1917 + }, + { + "epoch": 0.011406889332952707, + "grad_norm": 2.23822021484375, + "learning_rate": 4.998396589195254e-05, + "loss": 7.2125, + "step": 1918 + }, + { + "epoch": 0.011412836616233704, + "grad_norm": 2.176309823989868, + "learning_rate": 4.9983949161046207e-05, + "loss": 7.1077, + "step": 1919 + }, + { + "epoch": 0.011418783899514702, + "grad_norm": 2.2468202114105225, + "learning_rate": 4.9983932421418226e-05, + "loss": 7.1411, + "step": 1920 + }, + { + "epoch": 0.0114247311827957, + "grad_norm": 2.0748138427734375, + "learning_rate": 4.998391567306862e-05, + "loss": 7.0605, + "step": 1921 + }, + { + "epoch": 0.011430678466076696, + "grad_norm": 2.93007230758667, + "learning_rate": 4.998389891599738e-05, + "loss": 6.5832, + "step": 1922 + }, + { + "epoch": 0.011436625749357693, + "grad_norm": 2.125582218170166, + "learning_rate": 4.9983882150204534e-05, + "loss": 7.0761, + "step": 1923 + }, + { + "epoch": 0.011442573032638691, + "grad_norm": 2.3291571140289307, + "learning_rate": 4.998386537569005e-05, + "loss": 6.8781, + "step": 1924 + }, + { + "epoch": 0.011448520315919688, + "grad_norm": 2.8930649757385254, + "learning_rate": 4.9983848592453975e-05, + "loss": 7.1694, + "step": 1925 + }, + { + "epoch": 0.011454467599200685, + "grad_norm": 2.8450441360473633, + "learning_rate": 4.998383180049629e-05, + "loss": 7.1474, + "step": 1926 + }, + { + "epoch": 0.011460414882481682, + "grad_norm": 2.5900778770446777, + "learning_rate": 4.9983814999817016e-05, + "loss": 7.0423, + "step": 1927 + }, + { + "epoch": 0.01146636216576268, + "grad_norm": 2.289428949356079, + "learning_rate": 4.998379819041614e-05, + "loss": 6.9777, + "step": 1928 + }, + { + "epoch": 0.011472309449043677, + "grad_norm": 2.609384059906006, + "learning_rate": 4.998378137229368e-05, + "loss": 7.0488, + "step": 1929 + }, + { + "epoch": 0.011478256732324674, + "grad_norm": 2.1039459705352783, + "learning_rate": 4.998376454544964e-05, + "loss": 6.9308, + "step": 1930 + }, + { + "epoch": 0.01148420401560567, + "grad_norm": 2.1776134967803955, + "learning_rate": 4.9983747709884024e-05, + "loss": 6.9951, + "step": 1931 + }, + { + "epoch": 0.01149015129888667, + "grad_norm": 2.3150827884674072, + "learning_rate": 4.998373086559684e-05, + "loss": 6.9165, + "step": 1932 + }, + { + "epoch": 0.011496098582167666, + "grad_norm": 2.308370590209961, + "learning_rate": 4.99837140125881e-05, + "loss": 7.0155, + "step": 1933 + }, + { + "epoch": 0.011502045865448663, + "grad_norm": 2.234208106994629, + "learning_rate": 4.99836971508578e-05, + "loss": 6.9901, + "step": 1934 + }, + { + "epoch": 0.01150799314872966, + "grad_norm": 2.2340307235717773, + "learning_rate": 4.9983680280405953e-05, + "loss": 7.004, + "step": 1935 + }, + { + "epoch": 0.011513940432010657, + "grad_norm": 2.9458208084106445, + "learning_rate": 4.998366340123256e-05, + "loss": 7.3797, + "step": 1936 + }, + { + "epoch": 0.011519887715291655, + "grad_norm": 2.8516271114349365, + "learning_rate": 4.998364651333762e-05, + "loss": 7.3503, + "step": 1937 + }, + { + "epoch": 0.011525834998572652, + "grad_norm": 1.974025845527649, + "learning_rate": 4.998362961672116e-05, + "loss": 7.21, + "step": 1938 + }, + { + "epoch": 0.011531782281853649, + "grad_norm": 2.110117197036743, + "learning_rate": 4.998361271138317e-05, + "loss": 6.9494, + "step": 1939 + }, + { + "epoch": 0.011537729565134646, + "grad_norm": 2.2003207206726074, + "learning_rate": 4.9983595797323646e-05, + "loss": 6.8858, + "step": 1940 + }, + { + "epoch": 0.011543676848415644, + "grad_norm": 2.200982093811035, + "learning_rate": 4.998357887454262e-05, + "loss": 6.9512, + "step": 1941 + }, + { + "epoch": 0.011549624131696641, + "grad_norm": 2.303903102874756, + "learning_rate": 4.998356194304008e-05, + "loss": 7.2823, + "step": 1942 + }, + { + "epoch": 0.011555571414977638, + "grad_norm": 2.1376724243164062, + "learning_rate": 4.9983545002816035e-05, + "loss": 7.0321, + "step": 1943 + }, + { + "epoch": 0.011561518698258635, + "grad_norm": 2.3128151893615723, + "learning_rate": 4.99835280538705e-05, + "loss": 6.9714, + "step": 1944 + }, + { + "epoch": 0.011567465981539633, + "grad_norm": 2.359212636947632, + "learning_rate": 4.9983511096203465e-05, + "loss": 7.0496, + "step": 1945 + }, + { + "epoch": 0.01157341326482063, + "grad_norm": 2.346946954727173, + "learning_rate": 4.9983494129814945e-05, + "loss": 6.9865, + "step": 1946 + }, + { + "epoch": 0.011579360548101627, + "grad_norm": 2.447598934173584, + "learning_rate": 4.998347715470495e-05, + "loss": 6.9609, + "step": 1947 + }, + { + "epoch": 0.011585307831382624, + "grad_norm": 2.355300188064575, + "learning_rate": 4.998346017087348e-05, + "loss": 7.03, + "step": 1948 + }, + { + "epoch": 0.011591255114663622, + "grad_norm": 2.3207437992095947, + "learning_rate": 4.9983443178320545e-05, + "loss": 6.8181, + "step": 1949 + }, + { + "epoch": 0.011597202397944619, + "grad_norm": 2.359839677810669, + "learning_rate": 4.998342617704615e-05, + "loss": 6.8828, + "step": 1950 + }, + { + "epoch": 0.011603149681225616, + "grad_norm": 2.264890432357788, + "learning_rate": 4.9983409167050284e-05, + "loss": 7.3467, + "step": 1951 + }, + { + "epoch": 0.011609096964506613, + "grad_norm": 2.2720789909362793, + "learning_rate": 4.998339214833298e-05, + "loss": 7.3912, + "step": 1952 + }, + { + "epoch": 0.011615044247787611, + "grad_norm": 2.414433240890503, + "learning_rate": 4.9983375120894226e-05, + "loss": 7.1505, + "step": 1953 + }, + { + "epoch": 0.011620991531068608, + "grad_norm": 2.095290422439575, + "learning_rate": 4.998335808473404e-05, + "loss": 7.1642, + "step": 1954 + }, + { + "epoch": 0.011626938814349605, + "grad_norm": 2.118901252746582, + "learning_rate": 4.998334103985242e-05, + "loss": 7.0528, + "step": 1955 + }, + { + "epoch": 0.011632886097630602, + "grad_norm": 2.4361472129821777, + "learning_rate": 4.998332398624937e-05, + "loss": 7.3064, + "step": 1956 + }, + { + "epoch": 0.0116388333809116, + "grad_norm": 2.0978667736053467, + "learning_rate": 4.99833069239249e-05, + "loss": 7.0041, + "step": 1957 + }, + { + "epoch": 0.011644780664192597, + "grad_norm": 3.156329393386841, + "learning_rate": 4.998328985287902e-05, + "loss": 6.9169, + "step": 1958 + }, + { + "epoch": 0.011650727947473594, + "grad_norm": 2.311004400253296, + "learning_rate": 4.9983272773111735e-05, + "loss": 7.1128, + "step": 1959 + }, + { + "epoch": 0.01165667523075459, + "grad_norm": 2.406993865966797, + "learning_rate": 4.9983255684623036e-05, + "loss": 7.1403, + "step": 1960 + }, + { + "epoch": 0.01166262251403559, + "grad_norm": 2.0262861251831055, + "learning_rate": 4.998323858741295e-05, + "loss": 7.1014, + "step": 1961 + }, + { + "epoch": 0.011668569797316586, + "grad_norm": 2.369420051574707, + "learning_rate": 4.998322148148147e-05, + "loss": 7.1422, + "step": 1962 + }, + { + "epoch": 0.011674517080597583, + "grad_norm": 2.156019687652588, + "learning_rate": 4.998320436682861e-05, + "loss": 6.8405, + "step": 1963 + }, + { + "epoch": 0.01168046436387858, + "grad_norm": 2.35737681388855, + "learning_rate": 4.998318724345436e-05, + "loss": 6.8004, + "step": 1964 + }, + { + "epoch": 0.011686411647159577, + "grad_norm": 2.443676233291626, + "learning_rate": 4.998317011135875e-05, + "loss": 7.1959, + "step": 1965 + }, + { + "epoch": 0.011692358930440575, + "grad_norm": 2.1023004055023193, + "learning_rate": 4.998315297054177e-05, + "loss": 7.0684, + "step": 1966 + }, + { + "epoch": 0.011698306213721572, + "grad_norm": 2.5166187286376953, + "learning_rate": 4.998313582100342e-05, + "loss": 6.5876, + "step": 1967 + }, + { + "epoch": 0.011704253497002569, + "grad_norm": 2.1868557929992676, + "learning_rate": 4.9983118662743726e-05, + "loss": 6.6097, + "step": 1968 + }, + { + "epoch": 0.011710200780283566, + "grad_norm": 2.196786880493164, + "learning_rate": 4.998310149576269e-05, + "loss": 6.9798, + "step": 1969 + }, + { + "epoch": 0.011716148063564564, + "grad_norm": 2.361915111541748, + "learning_rate": 4.998308432006029e-05, + "loss": 6.8441, + "step": 1970 + }, + { + "epoch": 0.011722095346845561, + "grad_norm": 2.3234047889709473, + "learning_rate": 4.998306713563657e-05, + "loss": 6.9481, + "step": 1971 + }, + { + "epoch": 0.011728042630126558, + "grad_norm": 2.4995763301849365, + "learning_rate": 4.9983049942491514e-05, + "loss": 6.9903, + "step": 1972 + }, + { + "epoch": 0.011733989913407555, + "grad_norm": 2.21274995803833, + "learning_rate": 4.998303274062514e-05, + "loss": 7.1484, + "step": 1973 + }, + { + "epoch": 0.011739937196688553, + "grad_norm": 2.4777519702911377, + "learning_rate": 4.998301553003743e-05, + "loss": 7.144, + "step": 1974 + }, + { + "epoch": 0.01174588447996955, + "grad_norm": 2.089796304702759, + "learning_rate": 4.9982998310728426e-05, + "loss": 6.6765, + "step": 1975 + }, + { + "epoch": 0.011751831763250547, + "grad_norm": 3.012753963470459, + "learning_rate": 4.998298108269811e-05, + "loss": 6.8501, + "step": 1976 + }, + { + "epoch": 0.011757779046531544, + "grad_norm": 2.5427911281585693, + "learning_rate": 4.9982963845946486e-05, + "loss": 7.0171, + "step": 1977 + }, + { + "epoch": 0.011763726329812542, + "grad_norm": 2.8591670989990234, + "learning_rate": 4.998294660047358e-05, + "loss": 6.9881, + "step": 1978 + }, + { + "epoch": 0.011769673613093539, + "grad_norm": 2.952085256576538, + "learning_rate": 4.998292934627937e-05, + "loss": 6.9459, + "step": 1979 + }, + { + "epoch": 0.011775620896374536, + "grad_norm": 2.451958656311035, + "learning_rate": 4.998291208336388e-05, + "loss": 6.9515, + "step": 1980 + }, + { + "epoch": 0.011781568179655533, + "grad_norm": 2.448319435119629, + "learning_rate": 4.998289481172713e-05, + "loss": 6.8618, + "step": 1981 + }, + { + "epoch": 0.011787515462936531, + "grad_norm": 3.1797080039978027, + "learning_rate": 4.99828775313691e-05, + "loss": 6.7528, + "step": 1982 + }, + { + "epoch": 0.011793462746217528, + "grad_norm": 2.841120719909668, + "learning_rate": 4.99828602422898e-05, + "loss": 6.8, + "step": 1983 + }, + { + "epoch": 0.011799410029498525, + "grad_norm": 3.128098726272583, + "learning_rate": 4.998284294448925e-05, + "loss": 6.7574, + "step": 1984 + }, + { + "epoch": 0.011805357312779522, + "grad_norm": 2.7724568843841553, + "learning_rate": 4.998282563796744e-05, + "loss": 6.6119, + "step": 1985 + }, + { + "epoch": 0.01181130459606052, + "grad_norm": 2.8025269508361816, + "learning_rate": 4.998280832272439e-05, + "loss": 6.4676, + "step": 1986 + }, + { + "epoch": 0.011817251879341517, + "grad_norm": 2.5756618976593018, + "learning_rate": 4.99827909987601e-05, + "loss": 6.5421, + "step": 1987 + }, + { + "epoch": 0.011823199162622514, + "grad_norm": 2.9116249084472656, + "learning_rate": 4.998277366607457e-05, + "loss": 6.5446, + "step": 1988 + }, + { + "epoch": 0.01182914644590351, + "grad_norm": 2.571019411087036, + "learning_rate": 4.9982756324667815e-05, + "loss": 6.7898, + "step": 1989 + }, + { + "epoch": 0.01183509372918451, + "grad_norm": 2.818885326385498, + "learning_rate": 4.998273897453984e-05, + "loss": 6.6604, + "step": 1990 + }, + { + "epoch": 0.011841041012465506, + "grad_norm": 2.8561007976531982, + "learning_rate": 4.998272161569064e-05, + "loss": 6.5473, + "step": 1991 + }, + { + "epoch": 0.011846988295746503, + "grad_norm": 2.5539605617523193, + "learning_rate": 4.998270424812024e-05, + "loss": 6.5492, + "step": 1992 + }, + { + "epoch": 0.0118529355790275, + "grad_norm": 2.3242900371551514, + "learning_rate": 4.998268687182863e-05, + "loss": 6.4577, + "step": 1993 + }, + { + "epoch": 0.011858882862308498, + "grad_norm": 2.874807596206665, + "learning_rate": 4.998266948681582e-05, + "loss": 6.6071, + "step": 1994 + }, + { + "epoch": 0.011864830145589495, + "grad_norm": 2.9014296531677246, + "learning_rate": 4.9982652093081827e-05, + "loss": 7.2221, + "step": 1995 + }, + { + "epoch": 0.011870777428870492, + "grad_norm": 2.5874252319335938, + "learning_rate": 4.998263469062665e-05, + "loss": 6.593, + "step": 1996 + }, + { + "epoch": 0.011876724712151489, + "grad_norm": 2.4252052307128906, + "learning_rate": 4.998261727945028e-05, + "loss": 7.0138, + "step": 1997 + }, + { + "epoch": 0.011882671995432486, + "grad_norm": 2.3569211959838867, + "learning_rate": 4.998259985955275e-05, + "loss": 6.8743, + "step": 1998 + }, + { + "epoch": 0.011888619278713484, + "grad_norm": 2.560659408569336, + "learning_rate": 4.9982582430934045e-05, + "loss": 6.8926, + "step": 1999 + }, + { + "epoch": 0.011894566561994481, + "grad_norm": 2.0855636596679688, + "learning_rate": 4.9982564993594184e-05, + "loss": 7.1691, + "step": 2000 + }, + { + "epoch": 0.011900513845275478, + "grad_norm": 2.024829387664795, + "learning_rate": 4.998254754753316e-05, + "loss": 7.1797, + "step": 2001 + }, + { + "epoch": 0.011906461128556475, + "grad_norm": 2.093733549118042, + "learning_rate": 4.998253009275099e-05, + "loss": 6.9706, + "step": 2002 + }, + { + "epoch": 0.011912408411837473, + "grad_norm": 1.9211688041687012, + "learning_rate": 4.998251262924768e-05, + "loss": 7.018, + "step": 2003 + }, + { + "epoch": 0.01191835569511847, + "grad_norm": 2.3146321773529053, + "learning_rate": 4.998249515702323e-05, + "loss": 6.9384, + "step": 2004 + }, + { + "epoch": 0.011924302978399467, + "grad_norm": 2.346309185028076, + "learning_rate": 4.998247767607765e-05, + "loss": 6.5674, + "step": 2005 + }, + { + "epoch": 0.011930250261680464, + "grad_norm": 2.39471697807312, + "learning_rate": 4.998246018641094e-05, + "loss": 6.769, + "step": 2006 + }, + { + "epoch": 0.011936197544961462, + "grad_norm": 2.1689298152923584, + "learning_rate": 4.998244268802312e-05, + "loss": 7.0945, + "step": 2007 + }, + { + "epoch": 0.011942144828242459, + "grad_norm": 2.4209859371185303, + "learning_rate": 4.998242518091418e-05, + "loss": 6.98, + "step": 2008 + }, + { + "epoch": 0.011948092111523456, + "grad_norm": 2.6378684043884277, + "learning_rate": 4.998240766508414e-05, + "loss": 6.6833, + "step": 2009 + }, + { + "epoch": 0.011954039394804453, + "grad_norm": 2.2804839611053467, + "learning_rate": 4.9982390140532995e-05, + "loss": 6.7129, + "step": 2010 + }, + { + "epoch": 0.011959986678085451, + "grad_norm": 2.1788251399993896, + "learning_rate": 4.998237260726075e-05, + "loss": 7.0175, + "step": 2011 + }, + { + "epoch": 0.011965933961366448, + "grad_norm": 1.8988546133041382, + "learning_rate": 4.998235506526743e-05, + "loss": 7.0857, + "step": 2012 + }, + { + "epoch": 0.011971881244647445, + "grad_norm": 2.560107469558716, + "learning_rate": 4.9982337514553026e-05, + "loss": 7.0771, + "step": 2013 + }, + { + "epoch": 0.011977828527928442, + "grad_norm": 2.1771798133850098, + "learning_rate": 4.998231995511754e-05, + "loss": 7.071, + "step": 2014 + }, + { + "epoch": 0.01198377581120944, + "grad_norm": 1.9619860649108887, + "learning_rate": 4.998230238696098e-05, + "loss": 6.9109, + "step": 2015 + }, + { + "epoch": 0.011989723094490437, + "grad_norm": 2.16719126701355, + "learning_rate": 4.998228481008337e-05, + "loss": 6.903, + "step": 2016 + }, + { + "epoch": 0.011995670377771434, + "grad_norm": 2.4643077850341797, + "learning_rate": 4.998226722448469e-05, + "loss": 6.5301, + "step": 2017 + }, + { + "epoch": 0.01200161766105243, + "grad_norm": 2.5153393745422363, + "learning_rate": 4.9982249630164965e-05, + "loss": 7.107, + "step": 2018 + }, + { + "epoch": 0.01200756494433343, + "grad_norm": 2.6180920600891113, + "learning_rate": 4.998223202712419e-05, + "loss": 6.9905, + "step": 2019 + }, + { + "epoch": 0.012013512227614426, + "grad_norm": 2.333186149597168, + "learning_rate": 4.998221441536238e-05, + "loss": 7.074, + "step": 2020 + }, + { + "epoch": 0.012019459510895423, + "grad_norm": 2.138176918029785, + "learning_rate": 4.998219679487953e-05, + "loss": 7.0211, + "step": 2021 + }, + { + "epoch": 0.01202540679417642, + "grad_norm": 2.9845499992370605, + "learning_rate": 4.998217916567567e-05, + "loss": 6.7341, + "step": 2022 + }, + { + "epoch": 0.012031354077457418, + "grad_norm": 3.1216208934783936, + "learning_rate": 4.998216152775077e-05, + "loss": 7.1569, + "step": 2023 + }, + { + "epoch": 0.012037301360738415, + "grad_norm": 2.4693727493286133, + "learning_rate": 4.998214388110487e-05, + "loss": 6.6427, + "step": 2024 + }, + { + "epoch": 0.012043248644019412, + "grad_norm": 2.784562349319458, + "learning_rate": 4.9982126225737955e-05, + "loss": 6.6898, + "step": 2025 + }, + { + "epoch": 0.012049195927300409, + "grad_norm": 3.0549166202545166, + "learning_rate": 4.9982108561650036e-05, + "loss": 6.6004, + "step": 2026 + }, + { + "epoch": 0.012055143210581406, + "grad_norm": 2.565505266189575, + "learning_rate": 4.998209088884113e-05, + "loss": 6.5981, + "step": 2027 + }, + { + "epoch": 0.012061090493862404, + "grad_norm": 2.862548828125, + "learning_rate": 4.998207320731122e-05, + "loss": 6.4329, + "step": 2028 + }, + { + "epoch": 0.012067037777143401, + "grad_norm": 2.835280179977417, + "learning_rate": 4.998205551706033e-05, + "loss": 6.6854, + "step": 2029 + }, + { + "epoch": 0.012072985060424398, + "grad_norm": 2.4550364017486572, + "learning_rate": 4.9982037818088474e-05, + "loss": 6.7115, + "step": 2030 + }, + { + "epoch": 0.012078932343705395, + "grad_norm": 2.9977426528930664, + "learning_rate": 4.998202011039564e-05, + "loss": 6.341, + "step": 2031 + }, + { + "epoch": 0.012084879626986393, + "grad_norm": 2.258370876312256, + "learning_rate": 4.998200239398184e-05, + "loss": 6.7094, + "step": 2032 + }, + { + "epoch": 0.01209082691026739, + "grad_norm": 2.4484050273895264, + "learning_rate": 4.9981984668847085e-05, + "loss": 7.1115, + "step": 2033 + }, + { + "epoch": 0.012096774193548387, + "grad_norm": 2.4668514728546143, + "learning_rate": 4.9981966934991366e-05, + "loss": 6.9411, + "step": 2034 + }, + { + "epoch": 0.012102721476829384, + "grad_norm": 2.218479871749878, + "learning_rate": 4.998194919241471e-05, + "loss": 6.7175, + "step": 2035 + }, + { + "epoch": 0.012108668760110382, + "grad_norm": 2.201815366744995, + "learning_rate": 4.9981931441117115e-05, + "loss": 6.8684, + "step": 2036 + }, + { + "epoch": 0.012114616043391379, + "grad_norm": 2.4610331058502197, + "learning_rate": 4.998191368109858e-05, + "loss": 6.7214, + "step": 2037 + }, + { + "epoch": 0.012120563326672376, + "grad_norm": 2.7274906635284424, + "learning_rate": 4.998189591235912e-05, + "loss": 6.7611, + "step": 2038 + }, + { + "epoch": 0.012126510609953373, + "grad_norm": 2.7716658115386963, + "learning_rate": 4.9981878134898735e-05, + "loss": 6.7679, + "step": 2039 + }, + { + "epoch": 0.012132457893234371, + "grad_norm": 3.3206236362457275, + "learning_rate": 4.9981860348717434e-05, + "loss": 6.6283, + "step": 2040 + }, + { + "epoch": 0.012138405176515368, + "grad_norm": 2.511906862258911, + "learning_rate": 4.9981842553815225e-05, + "loss": 6.9537, + "step": 2041 + }, + { + "epoch": 0.012144352459796365, + "grad_norm": 2.7797024250030518, + "learning_rate": 4.998182475019212e-05, + "loss": 7.0488, + "step": 2042 + }, + { + "epoch": 0.012150299743077362, + "grad_norm": 3.523092031478882, + "learning_rate": 4.998180693784811e-05, + "loss": 6.9249, + "step": 2043 + }, + { + "epoch": 0.01215624702635836, + "grad_norm": 3.1001851558685303, + "learning_rate": 4.998178911678322e-05, + "loss": 7.0998, + "step": 2044 + }, + { + "epoch": 0.012162194309639357, + "grad_norm": 2.5291028022766113, + "learning_rate": 4.998177128699743e-05, + "loss": 6.8381, + "step": 2045 + }, + { + "epoch": 0.012168141592920354, + "grad_norm": 3.308398723602295, + "learning_rate": 4.998175344849077e-05, + "loss": 6.6849, + "step": 2046 + }, + { + "epoch": 0.01217408887620135, + "grad_norm": 3.4255475997924805, + "learning_rate": 4.998173560126323e-05, + "loss": 6.7816, + "step": 2047 + }, + { + "epoch": 0.01218003615948235, + "grad_norm": 3.4510817527770996, + "learning_rate": 4.998171774531484e-05, + "loss": 6.7961, + "step": 2048 + }, + { + "epoch": 0.012185983442763346, + "grad_norm": 3.15468168258667, + "learning_rate": 4.998169988064558e-05, + "loss": 6.9409, + "step": 2049 + }, + { + "epoch": 0.012191930726044343, + "grad_norm": 2.5568132400512695, + "learning_rate": 4.998168200725547e-05, + "loss": 6.8573, + "step": 2050 + }, + { + "epoch": 0.01219787800932534, + "grad_norm": 1.9745045900344849, + "learning_rate": 4.9981664125144515e-05, + "loss": 6.7948, + "step": 2051 + }, + { + "epoch": 0.012203825292606338, + "grad_norm": 2.2304463386535645, + "learning_rate": 4.9981646234312714e-05, + "loss": 6.6896, + "step": 2052 + }, + { + "epoch": 0.012209772575887335, + "grad_norm": 2.4391567707061768, + "learning_rate": 4.998162833476008e-05, + "loss": 6.7129, + "step": 2053 + }, + { + "epoch": 0.012215719859168332, + "grad_norm": 3.243905544281006, + "learning_rate": 4.9981610426486615e-05, + "loss": 7.0744, + "step": 2054 + }, + { + "epoch": 0.012221667142449329, + "grad_norm": 3.2596933841705322, + "learning_rate": 4.998159250949233e-05, + "loss": 6.9361, + "step": 2055 + }, + { + "epoch": 0.012227614425730327, + "grad_norm": 2.554436445236206, + "learning_rate": 4.998157458377723e-05, + "loss": 6.9354, + "step": 2056 + }, + { + "epoch": 0.012233561709011324, + "grad_norm": 2.3636975288391113, + "learning_rate": 4.998155664934132e-05, + "loss": 6.849, + "step": 2057 + }, + { + "epoch": 0.01223950899229232, + "grad_norm": 2.224684953689575, + "learning_rate": 4.99815387061846e-05, + "loss": 6.7011, + "step": 2058 + }, + { + "epoch": 0.012245456275573318, + "grad_norm": 2.6892964839935303, + "learning_rate": 4.9981520754307096e-05, + "loss": 6.753, + "step": 2059 + }, + { + "epoch": 0.012251403558854315, + "grad_norm": 2.7645084857940674, + "learning_rate": 4.9981502793708796e-05, + "loss": 6.5437, + "step": 2060 + }, + { + "epoch": 0.012257350842135313, + "grad_norm": 2.1315746307373047, + "learning_rate": 4.9981484824389716e-05, + "loss": 6.8843, + "step": 2061 + }, + { + "epoch": 0.01226329812541631, + "grad_norm": 2.6275408267974854, + "learning_rate": 4.998146684634984e-05, + "loss": 6.7275, + "step": 2062 + }, + { + "epoch": 0.012269245408697307, + "grad_norm": 2.530688762664795, + "learning_rate": 4.998144885958921e-05, + "loss": 6.6089, + "step": 2063 + }, + { + "epoch": 0.012275192691978304, + "grad_norm": 2.0959835052490234, + "learning_rate": 4.998143086410781e-05, + "loss": 6.7425, + "step": 2064 + }, + { + "epoch": 0.012281139975259302, + "grad_norm": 2.887242078781128, + "learning_rate": 4.998141285990565e-05, + "loss": 6.6867, + "step": 2065 + }, + { + "epoch": 0.012287087258540299, + "grad_norm": 2.430122137069702, + "learning_rate": 4.9981394846982734e-05, + "loss": 6.6636, + "step": 2066 + }, + { + "epoch": 0.012293034541821296, + "grad_norm": 2.269162654876709, + "learning_rate": 4.998137682533907e-05, + "loss": 7.1165, + "step": 2067 + }, + { + "epoch": 0.012298981825102293, + "grad_norm": 2.6741089820861816, + "learning_rate": 4.998135879497467e-05, + "loss": 6.6678, + "step": 2068 + }, + { + "epoch": 0.012304929108383291, + "grad_norm": 2.3362507820129395, + "learning_rate": 4.998134075588953e-05, + "loss": 7.0103, + "step": 2069 + }, + { + "epoch": 0.012310876391664288, + "grad_norm": 2.310638189315796, + "learning_rate": 4.9981322708083666e-05, + "loss": 6.9235, + "step": 2070 + }, + { + "epoch": 0.012316823674945285, + "grad_norm": 2.161853790283203, + "learning_rate": 4.998130465155708e-05, + "loss": 6.9392, + "step": 2071 + }, + { + "epoch": 0.012322770958226282, + "grad_norm": 2.2609059810638428, + "learning_rate": 4.9981286586309786e-05, + "loss": 6.888, + "step": 2072 + }, + { + "epoch": 0.01232871824150728, + "grad_norm": 2.6072967052459717, + "learning_rate": 4.998126851234177e-05, + "loss": 6.7739, + "step": 2073 + }, + { + "epoch": 0.012334665524788277, + "grad_norm": 3.092834711074829, + "learning_rate": 4.9981250429653056e-05, + "loss": 6.5529, + "step": 2074 + }, + { + "epoch": 0.012340612808069274, + "grad_norm": 2.303149461746216, + "learning_rate": 4.998123233824366e-05, + "loss": 6.618, + "step": 2075 + }, + { + "epoch": 0.01234656009135027, + "grad_norm": 2.888063907623291, + "learning_rate": 4.998121423811355e-05, + "loss": 6.9224, + "step": 2076 + }, + { + "epoch": 0.012352507374631269, + "grad_norm": 2.990727424621582, + "learning_rate": 4.998119612926277e-05, + "loss": 6.94, + "step": 2077 + }, + { + "epoch": 0.012358454657912266, + "grad_norm": 3.016002893447876, + "learning_rate": 4.998117801169131e-05, + "loss": 6.6231, + "step": 2078 + }, + { + "epoch": 0.012364401941193263, + "grad_norm": 2.057124614715576, + "learning_rate": 4.998115988539918e-05, + "loss": 6.803, + "step": 2079 + }, + { + "epoch": 0.01237034922447426, + "grad_norm": 2.371136426925659, + "learning_rate": 4.998114175038639e-05, + "loss": 6.8244, + "step": 2080 + }, + { + "epoch": 0.012376296507755258, + "grad_norm": 2.804365873336792, + "learning_rate": 4.998112360665292e-05, + "loss": 6.8787, + "step": 2081 + }, + { + "epoch": 0.012382243791036255, + "grad_norm": 3.4987633228302, + "learning_rate": 4.998110545419882e-05, + "loss": 6.6946, + "step": 2082 + }, + { + "epoch": 0.012388191074317252, + "grad_norm": 2.950608968734741, + "learning_rate": 4.998108729302407e-05, + "loss": 6.7915, + "step": 2083 + }, + { + "epoch": 0.012394138357598249, + "grad_norm": 2.4327776432037354, + "learning_rate": 4.998106912312868e-05, + "loss": 6.727, + "step": 2084 + }, + { + "epoch": 0.012400085640879247, + "grad_norm": 2.46014142036438, + "learning_rate": 4.998105094451265e-05, + "loss": 6.6797, + "step": 2085 + }, + { + "epoch": 0.012406032924160244, + "grad_norm": 2.947566270828247, + "learning_rate": 4.9981032757175995e-05, + "loss": 6.6401, + "step": 2086 + }, + { + "epoch": 0.01241198020744124, + "grad_norm": 2.5999064445495605, + "learning_rate": 4.9981014561118724e-05, + "loss": 6.58, + "step": 2087 + }, + { + "epoch": 0.012417927490722238, + "grad_norm": 2.9761807918548584, + "learning_rate": 4.9980996356340836e-05, + "loss": 6.8538, + "step": 2088 + }, + { + "epoch": 0.012423874774003236, + "grad_norm": 2.690925121307373, + "learning_rate": 4.9980978142842336e-05, + "loss": 6.9087, + "step": 2089 + }, + { + "epoch": 0.012429822057284233, + "grad_norm": 2.218524217605591, + "learning_rate": 4.998095992062325e-05, + "loss": 6.7221, + "step": 2090 + }, + { + "epoch": 0.01243576934056523, + "grad_norm": 2.630094051361084, + "learning_rate": 4.998094168968355e-05, + "loss": 6.7346, + "step": 2091 + }, + { + "epoch": 0.012441716623846227, + "grad_norm": 2.7839179039001465, + "learning_rate": 4.9980923450023276e-05, + "loss": 6.8668, + "step": 2092 + }, + { + "epoch": 0.012447663907127223, + "grad_norm": 2.422914743423462, + "learning_rate": 4.9980905201642415e-05, + "loss": 6.7953, + "step": 2093 + }, + { + "epoch": 0.012453611190408222, + "grad_norm": 2.525883674621582, + "learning_rate": 4.998088694454097e-05, + "loss": 6.6322, + "step": 2094 + }, + { + "epoch": 0.012459558473689219, + "grad_norm": 2.515536308288574, + "learning_rate": 4.998086867871896e-05, + "loss": 7.4297, + "step": 2095 + }, + { + "epoch": 0.012465505756970216, + "grad_norm": 2.689542055130005, + "learning_rate": 4.998085040417639e-05, + "loss": 7.4316, + "step": 2096 + }, + { + "epoch": 0.012471453040251212, + "grad_norm": 2.4374492168426514, + "learning_rate": 4.998083212091327e-05, + "loss": 6.8035, + "step": 2097 + }, + { + "epoch": 0.012477400323532211, + "grad_norm": 2.284153699874878, + "learning_rate": 4.998081382892959e-05, + "loss": 6.6644, + "step": 2098 + }, + { + "epoch": 0.012483347606813208, + "grad_norm": 2.113539218902588, + "learning_rate": 4.9980795528225366e-05, + "loss": 6.5201, + "step": 2099 + }, + { + "epoch": 0.012489294890094205, + "grad_norm": 2.2590157985687256, + "learning_rate": 4.998077721880061e-05, + "loss": 6.8074, + "step": 2100 + }, + { + "epoch": 0.012495242173375202, + "grad_norm": 2.077986717224121, + "learning_rate": 4.9980758900655316e-05, + "loss": 6.6986, + "step": 2101 + }, + { + "epoch": 0.0125011894566562, + "grad_norm": 2.495882987976074, + "learning_rate": 4.99807405737895e-05, + "loss": 6.6949, + "step": 2102 + }, + { + "epoch": 0.012507136739937197, + "grad_norm": 2.224621295928955, + "learning_rate": 4.998072223820317e-05, + "loss": 6.5723, + "step": 2103 + }, + { + "epoch": 0.012513084023218194, + "grad_norm": 2.515867233276367, + "learning_rate": 4.998070389389632e-05, + "loss": 6.4327, + "step": 2104 + }, + { + "epoch": 0.01251903130649919, + "grad_norm": 2.3134326934814453, + "learning_rate": 4.998068554086897e-05, + "loss": 6.2818, + "step": 2105 + }, + { + "epoch": 0.012524978589780189, + "grad_norm": 2.7688093185424805, + "learning_rate": 4.998066717912112e-05, + "loss": 6.4585, + "step": 2106 + }, + { + "epoch": 0.012530925873061186, + "grad_norm": 3.211790084838867, + "learning_rate": 4.998064880865277e-05, + "loss": 6.5227, + "step": 2107 + }, + { + "epoch": 0.012536873156342183, + "grad_norm": 2.9701578617095947, + "learning_rate": 4.998063042946395e-05, + "loss": 6.5674, + "step": 2108 + }, + { + "epoch": 0.01254282043962318, + "grad_norm": 2.1295664310455322, + "learning_rate": 4.998061204155463e-05, + "loss": 6.5697, + "step": 2109 + }, + { + "epoch": 0.012548767722904178, + "grad_norm": 2.841683864593506, + "learning_rate": 4.998059364492485e-05, + "loss": 6.453, + "step": 2110 + }, + { + "epoch": 0.012554715006185175, + "grad_norm": 2.481001615524292, + "learning_rate": 4.99805752395746e-05, + "loss": 6.555, + "step": 2111 + }, + { + "epoch": 0.012560662289466172, + "grad_norm": 2.357745885848999, + "learning_rate": 4.998055682550389e-05, + "loss": 6.7916, + "step": 2112 + }, + { + "epoch": 0.012566609572747169, + "grad_norm": 2.349417209625244, + "learning_rate": 4.9980538402712725e-05, + "loss": 6.7257, + "step": 2113 + }, + { + "epoch": 0.012572556856028167, + "grad_norm": 2.846930742263794, + "learning_rate": 4.998051997120111e-05, + "loss": 6.7095, + "step": 2114 + }, + { + "epoch": 0.012578504139309164, + "grad_norm": 2.362506628036499, + "learning_rate": 4.998050153096906e-05, + "loss": 6.675, + "step": 2115 + }, + { + "epoch": 0.01258445142259016, + "grad_norm": 2.3275344371795654, + "learning_rate": 4.998048308201656e-05, + "loss": 6.9031, + "step": 2116 + }, + { + "epoch": 0.012590398705871158, + "grad_norm": 2.194359540939331, + "learning_rate": 4.9980464624343644e-05, + "loss": 6.8258, + "step": 2117 + }, + { + "epoch": 0.012596345989152156, + "grad_norm": 2.3926312923431396, + "learning_rate": 4.99804461579503e-05, + "loss": 6.7136, + "step": 2118 + }, + { + "epoch": 0.012602293272433153, + "grad_norm": 2.7430222034454346, + "learning_rate": 4.9980427682836546e-05, + "loss": 6.5475, + "step": 2119 + }, + { + "epoch": 0.01260824055571415, + "grad_norm": 2.1563844680786133, + "learning_rate": 4.998040919900237e-05, + "loss": 6.7105, + "step": 2120 + }, + { + "epoch": 0.012614187838995147, + "grad_norm": 2.1061437129974365, + "learning_rate": 4.998039070644781e-05, + "loss": 6.6411, + "step": 2121 + }, + { + "epoch": 0.012620135122276143, + "grad_norm": 2.6192378997802734, + "learning_rate": 4.9980372205172844e-05, + "loss": 6.6831, + "step": 2122 + }, + { + "epoch": 0.012626082405557142, + "grad_norm": 2.794616222381592, + "learning_rate": 4.9980353695177495e-05, + "loss": 6.8128, + "step": 2123 + }, + { + "epoch": 0.012632029688838139, + "grad_norm": 2.3656489849090576, + "learning_rate": 4.998033517646176e-05, + "loss": 6.8109, + "step": 2124 + }, + { + "epoch": 0.012637976972119136, + "grad_norm": 2.658433437347412, + "learning_rate": 4.998031664902564e-05, + "loss": 6.7979, + "step": 2125 + }, + { + "epoch": 0.012643924255400132, + "grad_norm": 2.889954090118408, + "learning_rate": 4.9980298112869154e-05, + "loss": 6.6745, + "step": 2126 + }, + { + "epoch": 0.012649871538681131, + "grad_norm": 2.469790458679199, + "learning_rate": 4.9980279567992304e-05, + "loss": 6.7056, + "step": 2127 + }, + { + "epoch": 0.012655818821962128, + "grad_norm": 2.4310262203216553, + "learning_rate": 4.9980261014395094e-05, + "loss": 6.8809, + "step": 2128 + }, + { + "epoch": 0.012661766105243125, + "grad_norm": 2.772359609603882, + "learning_rate": 4.998024245207754e-05, + "loss": 7.0383, + "step": 2129 + }, + { + "epoch": 0.012667713388524121, + "grad_norm": 2.292144775390625, + "learning_rate": 4.9980223881039635e-05, + "loss": 6.9062, + "step": 2130 + }, + { + "epoch": 0.01267366067180512, + "grad_norm": 2.590363025665283, + "learning_rate": 4.998020530128139e-05, + "loss": 6.5803, + "step": 2131 + }, + { + "epoch": 0.012679607955086117, + "grad_norm": 2.78432035446167, + "learning_rate": 4.9980186712802824e-05, + "loss": 6.788, + "step": 2132 + }, + { + "epoch": 0.012685555238367114, + "grad_norm": 2.6188290119171143, + "learning_rate": 4.998016811560392e-05, + "loss": 6.5827, + "step": 2133 + }, + { + "epoch": 0.01269150252164811, + "grad_norm": 2.868215560913086, + "learning_rate": 4.99801495096847e-05, + "loss": 6.5845, + "step": 2134 + }, + { + "epoch": 0.012697449804929109, + "grad_norm": 2.4738945960998535, + "learning_rate": 4.998013089504518e-05, + "loss": 6.5019, + "step": 2135 + }, + { + "epoch": 0.012703397088210106, + "grad_norm": 2.5315287113189697, + "learning_rate": 4.998011227168534e-05, + "loss": 6.6765, + "step": 2136 + }, + { + "epoch": 0.012709344371491103, + "grad_norm": 2.7871086597442627, + "learning_rate": 4.998009363960521e-05, + "loss": 6.64, + "step": 2137 + }, + { + "epoch": 0.0127152916547721, + "grad_norm": 2.267502784729004, + "learning_rate": 4.998007499880479e-05, + "loss": 6.8665, + "step": 2138 + }, + { + "epoch": 0.012721238938053098, + "grad_norm": 2.5014212131500244, + "learning_rate": 4.998005634928408e-05, + "loss": 6.6757, + "step": 2139 + }, + { + "epoch": 0.012727186221334095, + "grad_norm": 2.3600070476531982, + "learning_rate": 4.998003769104308e-05, + "loss": 6.5425, + "step": 2140 + }, + { + "epoch": 0.012733133504615092, + "grad_norm": 2.32123064994812, + "learning_rate": 4.998001902408182e-05, + "loss": 6.5192, + "step": 2141 + }, + { + "epoch": 0.012739080787896088, + "grad_norm": 2.5059258937835693, + "learning_rate": 4.998000034840029e-05, + "loss": 6.6315, + "step": 2142 + }, + { + "epoch": 0.012745028071177087, + "grad_norm": 2.2143092155456543, + "learning_rate": 4.99799816639985e-05, + "loss": 6.6058, + "step": 2143 + }, + { + "epoch": 0.012750975354458084, + "grad_norm": 2.3660342693328857, + "learning_rate": 4.997996297087645e-05, + "loss": 6.554, + "step": 2144 + }, + { + "epoch": 0.01275692263773908, + "grad_norm": 2.4286036491394043, + "learning_rate": 4.9979944269034164e-05, + "loss": 6.4857, + "step": 2145 + }, + { + "epoch": 0.012762869921020078, + "grad_norm": 2.4002180099487305, + "learning_rate": 4.997992555847163e-05, + "loss": 6.5083, + "step": 2146 + }, + { + "epoch": 0.012768817204301076, + "grad_norm": 2.418942451477051, + "learning_rate": 4.997990683918886e-05, + "loss": 6.5471, + "step": 2147 + }, + { + "epoch": 0.012774764487582073, + "grad_norm": 2.535654067993164, + "learning_rate": 4.997988811118587e-05, + "loss": 6.5999, + "step": 2148 + }, + { + "epoch": 0.01278071177086307, + "grad_norm": 2.581505298614502, + "learning_rate": 4.9979869374462655e-05, + "loss": 6.2525, + "step": 2149 + }, + { + "epoch": 0.012786659054144067, + "grad_norm": 2.681297779083252, + "learning_rate": 4.997985062901923e-05, + "loss": 6.1463, + "step": 2150 + }, + { + "epoch": 0.012792606337425065, + "grad_norm": 2.3542990684509277, + "learning_rate": 4.997983187485559e-05, + "loss": 6.433, + "step": 2151 + }, + { + "epoch": 0.012798553620706062, + "grad_norm": 2.2994048595428467, + "learning_rate": 4.997981311197175e-05, + "loss": 6.5952, + "step": 2152 + }, + { + "epoch": 0.012804500903987059, + "grad_norm": 2.4703454971313477, + "learning_rate": 4.9979794340367724e-05, + "loss": 6.5581, + "step": 2153 + }, + { + "epoch": 0.012810448187268056, + "grad_norm": 2.511383533477783, + "learning_rate": 4.9979775560043504e-05, + "loss": 6.577, + "step": 2154 + }, + { + "epoch": 0.012816395470549052, + "grad_norm": 2.3300156593322754, + "learning_rate": 4.99797567709991e-05, + "loss": 6.4349, + "step": 2155 + }, + { + "epoch": 0.012822342753830051, + "grad_norm": 2.523878574371338, + "learning_rate": 4.997973797323452e-05, + "loss": 6.5044, + "step": 2156 + }, + { + "epoch": 0.012828290037111048, + "grad_norm": 2.4185073375701904, + "learning_rate": 4.9979719166749776e-05, + "loss": 6.537, + "step": 2157 + }, + { + "epoch": 0.012834237320392045, + "grad_norm": 2.324090003967285, + "learning_rate": 4.997970035154487e-05, + "loss": 6.803, + "step": 2158 + }, + { + "epoch": 0.012840184603673041, + "grad_norm": 2.468872547149658, + "learning_rate": 4.9979681527619804e-05, + "loss": 7.0837, + "step": 2159 + }, + { + "epoch": 0.01284613188695404, + "grad_norm": 2.1467936038970947, + "learning_rate": 4.99796626949746e-05, + "loss": 6.7373, + "step": 2160 + }, + { + "epoch": 0.012852079170235037, + "grad_norm": 2.3208062648773193, + "learning_rate": 4.9979643853609246e-05, + "loss": 6.5483, + "step": 2161 + }, + { + "epoch": 0.012858026453516034, + "grad_norm": 2.2797584533691406, + "learning_rate": 4.997962500352376e-05, + "loss": 6.5857, + "step": 2162 + }, + { + "epoch": 0.01286397373679703, + "grad_norm": 2.3447721004486084, + "learning_rate": 4.9979606144718135e-05, + "loss": 6.8511, + "step": 2163 + }, + { + "epoch": 0.012869921020078029, + "grad_norm": 2.6456334590911865, + "learning_rate": 4.9979587277192395e-05, + "loss": 6.9457, + "step": 2164 + }, + { + "epoch": 0.012875868303359026, + "grad_norm": 3.2567737102508545, + "learning_rate": 4.997956840094654e-05, + "loss": 6.6405, + "step": 2165 + }, + { + "epoch": 0.012881815586640023, + "grad_norm": 2.847371816635132, + "learning_rate": 4.9979549515980574e-05, + "loss": 6.751, + "step": 2166 + }, + { + "epoch": 0.01288776286992102, + "grad_norm": 2.999779462814331, + "learning_rate": 4.99795306222945e-05, + "loss": 6.7437, + "step": 2167 + }, + { + "epoch": 0.012893710153202018, + "grad_norm": 2.3793458938598633, + "learning_rate": 4.9979511719888336e-05, + "loss": 6.6864, + "step": 2168 + }, + { + "epoch": 0.012899657436483015, + "grad_norm": 2.284724473953247, + "learning_rate": 4.9979492808762084e-05, + "loss": 6.4237, + "step": 2169 + }, + { + "epoch": 0.012905604719764012, + "grad_norm": 2.560758352279663, + "learning_rate": 4.997947388891575e-05, + "loss": 6.5964, + "step": 2170 + }, + { + "epoch": 0.012911552003045008, + "grad_norm": 2.7461421489715576, + "learning_rate": 4.997945496034934e-05, + "loss": 6.5354, + "step": 2171 + }, + { + "epoch": 0.012917499286326007, + "grad_norm": 3.0868208408355713, + "learning_rate": 4.9979436023062854e-05, + "loss": 6.6445, + "step": 2172 + }, + { + "epoch": 0.012923446569607004, + "grad_norm": 2.565009593963623, + "learning_rate": 4.997941707705631e-05, + "loss": 6.6015, + "step": 2173 + }, + { + "epoch": 0.012929393852888, + "grad_norm": 2.9424686431884766, + "learning_rate": 4.997939812232971e-05, + "loss": 6.4887, + "step": 2174 + }, + { + "epoch": 0.012935341136168997, + "grad_norm": 3.0674476623535156, + "learning_rate": 4.997937915888305e-05, + "loss": 6.4728, + "step": 2175 + }, + { + "epoch": 0.012941288419449996, + "grad_norm": 3.040189266204834, + "learning_rate": 4.997936018671636e-05, + "loss": 6.3788, + "step": 2176 + }, + { + "epoch": 0.012947235702730993, + "grad_norm": 2.756211042404175, + "learning_rate": 4.9979341205829626e-05, + "loss": 6.4167, + "step": 2177 + }, + { + "epoch": 0.01295318298601199, + "grad_norm": 2.6333322525024414, + "learning_rate": 4.997932221622287e-05, + "loss": 6.6392, + "step": 2178 + }, + { + "epoch": 0.012959130269292986, + "grad_norm": 2.6951076984405518, + "learning_rate": 4.997930321789608e-05, + "loss": 6.3299, + "step": 2179 + }, + { + "epoch": 0.012965077552573985, + "grad_norm": 2.5388028621673584, + "learning_rate": 4.997928421084928e-05, + "loss": 6.2646, + "step": 2180 + }, + { + "epoch": 0.012971024835854982, + "grad_norm": 3.312171459197998, + "learning_rate": 4.997926519508247e-05, + "loss": 6.6331, + "step": 2181 + }, + { + "epoch": 0.012976972119135979, + "grad_norm": 3.437025547027588, + "learning_rate": 4.997924617059565e-05, + "loss": 5.5981, + "step": 2182 + }, + { + "epoch": 0.012982919402416975, + "grad_norm": 2.74035906791687, + "learning_rate": 4.997922713738884e-05, + "loss": 5.1641, + "step": 2183 + }, + { + "epoch": 0.012988866685697972, + "grad_norm": 2.618525505065918, + "learning_rate": 4.9979208095462036e-05, + "loss": 5.9978, + "step": 2184 + }, + { + "epoch": 0.012994813968978971, + "grad_norm": 2.633692502975464, + "learning_rate": 4.9979189044815254e-05, + "loss": 6.2812, + "step": 2185 + }, + { + "epoch": 0.013000761252259968, + "grad_norm": 2.087557792663574, + "learning_rate": 4.997916998544849e-05, + "loss": 6.2864, + "step": 2186 + }, + { + "epoch": 0.013006708535540965, + "grad_norm": 3.365112066268921, + "learning_rate": 4.997915091736176e-05, + "loss": 5.3517, + "step": 2187 + }, + { + "epoch": 0.013012655818821961, + "grad_norm": 2.7561593055725098, + "learning_rate": 4.997913184055506e-05, + "loss": 6.3667, + "step": 2188 + }, + { + "epoch": 0.01301860310210296, + "grad_norm": 2.630976676940918, + "learning_rate": 4.9979112755028415e-05, + "loss": 6.5858, + "step": 2189 + }, + { + "epoch": 0.013024550385383957, + "grad_norm": 2.56007981300354, + "learning_rate": 4.9979093660781805e-05, + "loss": 6.6862, + "step": 2190 + }, + { + "epoch": 0.013030497668664954, + "grad_norm": 2.509631633758545, + "learning_rate": 4.997907455781526e-05, + "loss": 6.4699, + "step": 2191 + }, + { + "epoch": 0.01303644495194595, + "grad_norm": 2.442028522491455, + "learning_rate": 4.997905544612878e-05, + "loss": 6.5755, + "step": 2192 + }, + { + "epoch": 0.013042392235226949, + "grad_norm": 2.561016321182251, + "learning_rate": 4.997903632572236e-05, + "loss": 6.4529, + "step": 2193 + }, + { + "epoch": 0.013048339518507946, + "grad_norm": 2.585753917694092, + "learning_rate": 4.9979017196596025e-05, + "loss": 6.188, + "step": 2194 + }, + { + "epoch": 0.013054286801788943, + "grad_norm": 2.3657655715942383, + "learning_rate": 4.997899805874977e-05, + "loss": 6.1414, + "step": 2195 + }, + { + "epoch": 0.01306023408506994, + "grad_norm": 2.818251609802246, + "learning_rate": 4.997897891218361e-05, + "loss": 6.5276, + "step": 2196 + }, + { + "epoch": 0.013066181368350938, + "grad_norm": 2.9687695503234863, + "learning_rate": 4.997895975689754e-05, + "loss": 6.131, + "step": 2197 + }, + { + "epoch": 0.013072128651631935, + "grad_norm": 2.8505353927612305, + "learning_rate": 4.997894059289157e-05, + "loss": 6.5269, + "step": 2198 + }, + { + "epoch": 0.013078075934912932, + "grad_norm": 2.331573486328125, + "learning_rate": 4.997892142016573e-05, + "loss": 6.1101, + "step": 2199 + }, + { + "epoch": 0.013084023218193928, + "grad_norm": 2.3241569995880127, + "learning_rate": 4.997890223871998e-05, + "loss": 6.5081, + "step": 2200 + }, + { + "epoch": 0.013089970501474927, + "grad_norm": 2.658834218978882, + "learning_rate": 4.997888304855437e-05, + "loss": 6.554, + "step": 2201 + }, + { + "epoch": 0.013095917784755924, + "grad_norm": 2.703911304473877, + "learning_rate": 4.997886384966889e-05, + "loss": 6.337, + "step": 2202 + }, + { + "epoch": 0.01310186506803692, + "grad_norm": 3.020775318145752, + "learning_rate": 4.997884464206354e-05, + "loss": 6.4375, + "step": 2203 + }, + { + "epoch": 0.013107812351317917, + "grad_norm": 3.324218273162842, + "learning_rate": 4.9978825425738334e-05, + "loss": 6.4871, + "step": 2204 + }, + { + "epoch": 0.013113759634598916, + "grad_norm": 3.822019577026367, + "learning_rate": 4.9978806200693276e-05, + "loss": 6.6372, + "step": 2205 + }, + { + "epoch": 0.013119706917879913, + "grad_norm": 3.3639512062072754, + "learning_rate": 4.997878696692838e-05, + "loss": 6.1826, + "step": 2206 + }, + { + "epoch": 0.01312565420116091, + "grad_norm": 3.580603837966919, + "learning_rate": 4.997876772444365e-05, + "loss": 6.793, + "step": 2207 + }, + { + "epoch": 0.013131601484441906, + "grad_norm": 2.472733497619629, + "learning_rate": 4.9978748473239084e-05, + "loss": 6.9054, + "step": 2208 + }, + { + "epoch": 0.013137548767722905, + "grad_norm": 3.327461004257202, + "learning_rate": 4.99787292133147e-05, + "loss": 6.6735, + "step": 2209 + }, + { + "epoch": 0.013143496051003902, + "grad_norm": 3.493234157562256, + "learning_rate": 4.99787099446705e-05, + "loss": 6.9702, + "step": 2210 + }, + { + "epoch": 0.013149443334284899, + "grad_norm": 2.2516424655914307, + "learning_rate": 4.9978690667306483e-05, + "loss": 7.196, + "step": 2211 + }, + { + "epoch": 0.013155390617565895, + "grad_norm": 1.8846355676651, + "learning_rate": 4.9978671381222665e-05, + "loss": 7.0373, + "step": 2212 + }, + { + "epoch": 0.013161337900846894, + "grad_norm": 2.9334232807159424, + "learning_rate": 4.997865208641906e-05, + "loss": 6.2065, + "step": 2213 + }, + { + "epoch": 0.01316728518412789, + "grad_norm": 2.713006019592285, + "learning_rate": 4.997863278289565e-05, + "loss": 6.788, + "step": 2214 + }, + { + "epoch": 0.013173232467408888, + "grad_norm": 2.6246018409729004, + "learning_rate": 4.9978613470652466e-05, + "loss": 6.7979, + "step": 2215 + }, + { + "epoch": 0.013179179750689884, + "grad_norm": 2.2770373821258545, + "learning_rate": 4.997859414968951e-05, + "loss": 6.8307, + "step": 2216 + }, + { + "epoch": 0.013185127033970881, + "grad_norm": 2.6244993209838867, + "learning_rate": 4.997857482000679e-05, + "loss": 6.3176, + "step": 2217 + }, + { + "epoch": 0.01319107431725188, + "grad_norm": 3.4668054580688477, + "learning_rate": 4.997855548160429e-05, + "loss": 6.8962, + "step": 2218 + }, + { + "epoch": 0.013197021600532877, + "grad_norm": 2.711785078048706, + "learning_rate": 4.9978536134482047e-05, + "loss": 6.7111, + "step": 2219 + }, + { + "epoch": 0.013202968883813873, + "grad_norm": 2.6757078170776367, + "learning_rate": 4.997851677864005e-05, + "loss": 6.5501, + "step": 2220 + }, + { + "epoch": 0.01320891616709487, + "grad_norm": 2.150338888168335, + "learning_rate": 4.997849741407831e-05, + "loss": 6.43, + "step": 2221 + }, + { + "epoch": 0.013214863450375869, + "grad_norm": 3.115309953689575, + "learning_rate": 4.9978478040796836e-05, + "loss": 6.4074, + "step": 2222 + }, + { + "epoch": 0.013220810733656866, + "grad_norm": 2.8754189014434814, + "learning_rate": 4.997845865879564e-05, + "loss": 6.2663, + "step": 2223 + }, + { + "epoch": 0.013226758016937862, + "grad_norm": 2.6169707775115967, + "learning_rate": 4.9978439268074716e-05, + "loss": 6.5987, + "step": 2224 + }, + { + "epoch": 0.01323270530021886, + "grad_norm": 2.3814637660980225, + "learning_rate": 4.997841986863408e-05, + "loss": 6.8124, + "step": 2225 + }, + { + "epoch": 0.013238652583499858, + "grad_norm": 2.0276811122894287, + "learning_rate": 4.997840046047373e-05, + "loss": 6.6632, + "step": 2226 + }, + { + "epoch": 0.013244599866780855, + "grad_norm": 2.7943263053894043, + "learning_rate": 4.997838104359368e-05, + "loss": 6.5452, + "step": 2227 + }, + { + "epoch": 0.013250547150061852, + "grad_norm": 2.4058234691619873, + "learning_rate": 4.997836161799393e-05, + "loss": 6.4697, + "step": 2228 + }, + { + "epoch": 0.013256494433342848, + "grad_norm": 2.2487008571624756, + "learning_rate": 4.9978342183674504e-05, + "loss": 6.3361, + "step": 2229 + }, + { + "epoch": 0.013262441716623847, + "grad_norm": 2.3470170497894287, + "learning_rate": 4.997832274063539e-05, + "loss": 6.4024, + "step": 2230 + }, + { + "epoch": 0.013268388999904844, + "grad_norm": 2.589695692062378, + "learning_rate": 4.9978303288876606e-05, + "loss": 6.4184, + "step": 2231 + }, + { + "epoch": 0.01327433628318584, + "grad_norm": 2.691371440887451, + "learning_rate": 4.997828382839815e-05, + "loss": 6.4225, + "step": 2232 + }, + { + "epoch": 0.013280283566466837, + "grad_norm": 3.110410213470459, + "learning_rate": 4.997826435920003e-05, + "loss": 6.5307, + "step": 2233 + }, + { + "epoch": 0.013286230849747836, + "grad_norm": 2.688519239425659, + "learning_rate": 4.9978244881282266e-05, + "loss": 6.568, + "step": 2234 + }, + { + "epoch": 0.013292178133028833, + "grad_norm": 2.3346059322357178, + "learning_rate": 4.997822539464485e-05, + "loss": 6.8837, + "step": 2235 + }, + { + "epoch": 0.01329812541630983, + "grad_norm": 2.679826021194458, + "learning_rate": 4.997820589928779e-05, + "loss": 6.3961, + "step": 2236 + }, + { + "epoch": 0.013304072699590826, + "grad_norm": 2.388120412826538, + "learning_rate": 4.99781863952111e-05, + "loss": 6.4363, + "step": 2237 + }, + { + "epoch": 0.013310019982871825, + "grad_norm": 2.834341049194336, + "learning_rate": 4.997816688241478e-05, + "loss": 6.4855, + "step": 2238 + }, + { + "epoch": 0.013315967266152822, + "grad_norm": 2.8623831272125244, + "learning_rate": 4.997814736089885e-05, + "loss": 6.8607, + "step": 2239 + }, + { + "epoch": 0.013321914549433819, + "grad_norm": 3.001241683959961, + "learning_rate": 4.99781278306633e-05, + "loss": 6.9777, + "step": 2240 + }, + { + "epoch": 0.013327861832714815, + "grad_norm": 2.9721016883850098, + "learning_rate": 4.9978108291708135e-05, + "loss": 6.9821, + "step": 2241 + }, + { + "epoch": 0.013333809115995814, + "grad_norm": 2.798360824584961, + "learning_rate": 4.997808874403338e-05, + "loss": 7.0096, + "step": 2242 + }, + { + "epoch": 0.01333975639927681, + "grad_norm": 3.2242093086242676, + "learning_rate": 4.997806918763903e-05, + "loss": 6.9091, + "step": 2243 + }, + { + "epoch": 0.013345703682557808, + "grad_norm": 2.681920289993286, + "learning_rate": 4.99780496225251e-05, + "loss": 6.7769, + "step": 2244 + }, + { + "epoch": 0.013351650965838804, + "grad_norm": 3.199514865875244, + "learning_rate": 4.9978030048691584e-05, + "loss": 6.6202, + "step": 2245 + }, + { + "epoch": 0.013357598249119801, + "grad_norm": 2.89886474609375, + "learning_rate": 4.9978010466138496e-05, + "loss": 6.7075, + "step": 2246 + }, + { + "epoch": 0.0133635455324008, + "grad_norm": 2.7091262340545654, + "learning_rate": 4.997799087486584e-05, + "loss": 6.9129, + "step": 2247 + }, + { + "epoch": 0.013369492815681797, + "grad_norm": 2.2538888454437256, + "learning_rate": 4.997797127487364e-05, + "loss": 6.6412, + "step": 2248 + }, + { + "epoch": 0.013375440098962793, + "grad_norm": 2.668286085128784, + "learning_rate": 4.997795166616187e-05, + "loss": 6.8506, + "step": 2249 + }, + { + "epoch": 0.01338138738224379, + "grad_norm": 3.915975570678711, + "learning_rate": 4.997793204873057e-05, + "loss": 6.567, + "step": 2250 + }, + { + "epoch": 0.013387334665524789, + "grad_norm": 2.5549614429473877, + "learning_rate": 4.997791242257972e-05, + "loss": 6.7971, + "step": 2251 + }, + { + "epoch": 0.013393281948805786, + "grad_norm": 2.511810064315796, + "learning_rate": 4.997789278770935e-05, + "loss": 7.1949, + "step": 2252 + }, + { + "epoch": 0.013399229232086782, + "grad_norm": 2.026937484741211, + "learning_rate": 4.9977873144119445e-05, + "loss": 7.2067, + "step": 2253 + }, + { + "epoch": 0.01340517651536778, + "grad_norm": 3.6016058921813965, + "learning_rate": 4.997785349181002e-05, + "loss": 6.549, + "step": 2254 + }, + { + "epoch": 0.013411123798648778, + "grad_norm": 2.867418050765991, + "learning_rate": 4.9977833830781094e-05, + "loss": 6.5562, + "step": 2255 + }, + { + "epoch": 0.013417071081929775, + "grad_norm": 2.2168800830841064, + "learning_rate": 4.9977814161032665e-05, + "loss": 7.1798, + "step": 2256 + }, + { + "epoch": 0.013423018365210771, + "grad_norm": 2.728299856185913, + "learning_rate": 4.997779448256473e-05, + "loss": 6.9314, + "step": 2257 + }, + { + "epoch": 0.013428965648491768, + "grad_norm": 2.7336437702178955, + "learning_rate": 4.997777479537732e-05, + "loss": 7.0643, + "step": 2258 + }, + { + "epoch": 0.013434912931772767, + "grad_norm": 3.1546053886413574, + "learning_rate": 4.997775509947041e-05, + "loss": 6.8853, + "step": 2259 + }, + { + "epoch": 0.013440860215053764, + "grad_norm": 3.037036180496216, + "learning_rate": 4.997773539484404e-05, + "loss": 6.6892, + "step": 2260 + }, + { + "epoch": 0.01344680749833476, + "grad_norm": 2.8779382705688477, + "learning_rate": 4.997771568149818e-05, + "loss": 6.4991, + "step": 2261 + }, + { + "epoch": 0.013452754781615757, + "grad_norm": 3.1105282306671143, + "learning_rate": 4.997769595943288e-05, + "loss": 6.4253, + "step": 2262 + }, + { + "epoch": 0.013458702064896756, + "grad_norm": 4.604808330535889, + "learning_rate": 4.997767622864811e-05, + "loss": 6.504, + "step": 2263 + }, + { + "epoch": 0.013464649348177753, + "grad_norm": 4.345273017883301, + "learning_rate": 4.9977656489143896e-05, + "loss": 6.2, + "step": 2264 + }, + { + "epoch": 0.01347059663145875, + "grad_norm": 2.9744133949279785, + "learning_rate": 4.9977636740920243e-05, + "loss": 6.5458, + "step": 2265 + }, + { + "epoch": 0.013476543914739746, + "grad_norm": 3.3981447219848633, + "learning_rate": 4.9977616983977146e-05, + "loss": 6.9791, + "step": 2266 + }, + { + "epoch": 0.013482491198020745, + "grad_norm": 2.5855109691619873, + "learning_rate": 4.997759721831463e-05, + "loss": 6.7425, + "step": 2267 + }, + { + "epoch": 0.013488438481301742, + "grad_norm": 3.961195707321167, + "learning_rate": 4.997757744393269e-05, + "loss": 6.4042, + "step": 2268 + }, + { + "epoch": 0.013494385764582739, + "grad_norm": 3.8216230869293213, + "learning_rate": 4.997755766083133e-05, + "loss": 6.4962, + "step": 2269 + }, + { + "epoch": 0.013500333047863735, + "grad_norm": 3.077279567718506, + "learning_rate": 4.9977537869010574e-05, + "loss": 6.4298, + "step": 2270 + }, + { + "epoch": 0.013506280331144734, + "grad_norm": 2.56152081489563, + "learning_rate": 4.9977518068470406e-05, + "loss": 6.35, + "step": 2271 + }, + { + "epoch": 0.01351222761442573, + "grad_norm": 2.4069855213165283, + "learning_rate": 4.9977498259210854e-05, + "loss": 6.2923, + "step": 2272 + }, + { + "epoch": 0.013518174897706728, + "grad_norm": 2.9591124057769775, + "learning_rate": 4.9977478441231904e-05, + "loss": 6.2477, + "step": 2273 + }, + { + "epoch": 0.013524122180987724, + "grad_norm": 2.627110481262207, + "learning_rate": 4.997745861453359e-05, + "loss": 6.1012, + "step": 2274 + }, + { + "epoch": 0.013530069464268723, + "grad_norm": 2.3042867183685303, + "learning_rate": 4.997743877911589e-05, + "loss": 6.1155, + "step": 2275 + }, + { + "epoch": 0.01353601674754972, + "grad_norm": 2.709324359893799, + "learning_rate": 4.997741893497882e-05, + "loss": 6.0103, + "step": 2276 + }, + { + "epoch": 0.013541964030830717, + "grad_norm": 2.7087934017181396, + "learning_rate": 4.997739908212241e-05, + "loss": 6.0709, + "step": 2277 + }, + { + "epoch": 0.013547911314111713, + "grad_norm": 3.560149669647217, + "learning_rate": 4.997737922054664e-05, + "loss": 6.1775, + "step": 2278 + }, + { + "epoch": 0.01355385859739271, + "grad_norm": 4.623898506164551, + "learning_rate": 4.997735935025152e-05, + "loss": 6.1993, + "step": 2279 + }, + { + "epoch": 0.013559805880673709, + "grad_norm": 2.9960882663726807, + "learning_rate": 4.997733947123707e-05, + "loss": 6.4211, + "step": 2280 + }, + { + "epoch": 0.013565753163954706, + "grad_norm": 3.8918421268463135, + "learning_rate": 4.9977319583503276e-05, + "loss": 6.0194, + "step": 2281 + }, + { + "epoch": 0.013571700447235702, + "grad_norm": 3.4164741039276123, + "learning_rate": 4.997729968705017e-05, + "loss": 5.9824, + "step": 2282 + }, + { + "epoch": 0.0135776477305167, + "grad_norm": 2.4005794525146484, + "learning_rate": 4.997727978187774e-05, + "loss": 5.9727, + "step": 2283 + }, + { + "epoch": 0.013583595013797698, + "grad_norm": 2.4654550552368164, + "learning_rate": 4.9977259867986e-05, + "loss": 6.2681, + "step": 2284 + }, + { + "epoch": 0.013589542297078695, + "grad_norm": 3.193905830383301, + "learning_rate": 4.997723994537496e-05, + "loss": 6.4996, + "step": 2285 + }, + { + "epoch": 0.013595489580359691, + "grad_norm": 2.4845757484436035, + "learning_rate": 4.997722001404462e-05, + "loss": 7.0464, + "step": 2286 + }, + { + "epoch": 0.013601436863640688, + "grad_norm": 3.170182466506958, + "learning_rate": 4.9977200073995e-05, + "loss": 6.1071, + "step": 2287 + }, + { + "epoch": 0.013607384146921687, + "grad_norm": 2.2331149578094482, + "learning_rate": 4.997718012522609e-05, + "loss": 6.6823, + "step": 2288 + }, + { + "epoch": 0.013613331430202684, + "grad_norm": 2.4146671295166016, + "learning_rate": 4.9977160167737904e-05, + "loss": 6.4398, + "step": 2289 + }, + { + "epoch": 0.01361927871348368, + "grad_norm": 3.23956561088562, + "learning_rate": 4.9977140201530445e-05, + "loss": 6.9295, + "step": 2290 + }, + { + "epoch": 0.013625225996764677, + "grad_norm": 3.402979850769043, + "learning_rate": 4.997712022660374e-05, + "loss": 6.7116, + "step": 2291 + }, + { + "epoch": 0.013631173280045676, + "grad_norm": 3.241320848464966, + "learning_rate": 4.997710024295777e-05, + "loss": 6.8871, + "step": 2292 + }, + { + "epoch": 0.013637120563326673, + "grad_norm": 2.5378634929656982, + "learning_rate": 4.997708025059255e-05, + "loss": 6.9548, + "step": 2293 + }, + { + "epoch": 0.01364306784660767, + "grad_norm": 3.1968839168548584, + "learning_rate": 4.9977060249508087e-05, + "loss": 6.6388, + "step": 2294 + }, + { + "epoch": 0.013649015129888666, + "grad_norm": 2.6951656341552734, + "learning_rate": 4.99770402397044e-05, + "loss": 6.9654, + "step": 2295 + }, + { + "epoch": 0.013654962413169665, + "grad_norm": 2.4168484210968018, + "learning_rate": 4.997702022118147e-05, + "loss": 6.6666, + "step": 2296 + }, + { + "epoch": 0.013660909696450662, + "grad_norm": 3.1395177841186523, + "learning_rate": 4.997700019393934e-05, + "loss": 6.4957, + "step": 2297 + }, + { + "epoch": 0.013666856979731658, + "grad_norm": 3.1591687202453613, + "learning_rate": 4.9976980157977985e-05, + "loss": 6.4392, + "step": 2298 + }, + { + "epoch": 0.013672804263012655, + "grad_norm": 2.2415151596069336, + "learning_rate": 4.9976960113297436e-05, + "loss": 6.4543, + "step": 2299 + }, + { + "epoch": 0.013678751546293654, + "grad_norm": 3.9113616943359375, + "learning_rate": 4.997694005989767e-05, + "loss": 6.7088, + "step": 2300 + }, + { + "epoch": 0.01368469882957465, + "grad_norm": 4.218390941619873, + "learning_rate": 4.997691999777873e-05, + "loss": 6.7199, + "step": 2301 + }, + { + "epoch": 0.013690646112855647, + "grad_norm": 4.200760841369629, + "learning_rate": 4.997689992694059e-05, + "loss": 6.6343, + "step": 2302 + }, + { + "epoch": 0.013696593396136644, + "grad_norm": 3.7164547443389893, + "learning_rate": 4.997687984738328e-05, + "loss": 6.772, + "step": 2303 + }, + { + "epoch": 0.013702540679417643, + "grad_norm": 2.1898231506347656, + "learning_rate": 4.99768597591068e-05, + "loss": 6.6165, + "step": 2304 + }, + { + "epoch": 0.01370848796269864, + "grad_norm": 2.72632098197937, + "learning_rate": 4.9976839662111166e-05, + "loss": 6.6474, + "step": 2305 + }, + { + "epoch": 0.013714435245979636, + "grad_norm": 3.64900279045105, + "learning_rate": 4.997681955639636e-05, + "loss": 6.4322, + "step": 2306 + }, + { + "epoch": 0.013720382529260633, + "grad_norm": 3.978445053100586, + "learning_rate": 4.997679944196241e-05, + "loss": 6.5434, + "step": 2307 + }, + { + "epoch": 0.01372632981254163, + "grad_norm": 5.709702491760254, + "learning_rate": 4.997677931880931e-05, + "loss": 6.5234, + "step": 2308 + }, + { + "epoch": 0.013732277095822629, + "grad_norm": 3.0389838218688965, + "learning_rate": 4.997675918693708e-05, + "loss": 6.4163, + "step": 2309 + }, + { + "epoch": 0.013738224379103625, + "grad_norm": 2.695113182067871, + "learning_rate": 4.9976739046345725e-05, + "loss": 6.6956, + "step": 2310 + }, + { + "epoch": 0.013744171662384622, + "grad_norm": 2.9768142700195312, + "learning_rate": 4.997671889703525e-05, + "loss": 6.5315, + "step": 2311 + }, + { + "epoch": 0.01375011894566562, + "grad_norm": 3.750454902648926, + "learning_rate": 4.997669873900566e-05, + "loss": 6.5568, + "step": 2312 + }, + { + "epoch": 0.013756066228946618, + "grad_norm": 3.390232801437378, + "learning_rate": 4.9976678572256955e-05, + "loss": 6.4916, + "step": 2313 + }, + { + "epoch": 0.013762013512227615, + "grad_norm": 3.1487748622894287, + "learning_rate": 4.997665839678915e-05, + "loss": 6.6378, + "step": 2314 + }, + { + "epoch": 0.013767960795508611, + "grad_norm": 2.5654940605163574, + "learning_rate": 4.997663821260226e-05, + "loss": 6.5817, + "step": 2315 + }, + { + "epoch": 0.013773908078789608, + "grad_norm": 2.7092552185058594, + "learning_rate": 4.9976618019696275e-05, + "loss": 6.982, + "step": 2316 + }, + { + "epoch": 0.013779855362070607, + "grad_norm": 3.642826557159424, + "learning_rate": 4.9976597818071214e-05, + "loss": 6.7951, + "step": 2317 + }, + { + "epoch": 0.013785802645351604, + "grad_norm": 3.4288947582244873, + "learning_rate": 4.997657760772708e-05, + "loss": 6.4366, + "step": 2318 + }, + { + "epoch": 0.0137917499286326, + "grad_norm": 2.7620253562927246, + "learning_rate": 4.997655738866389e-05, + "loss": 6.6588, + "step": 2319 + }, + { + "epoch": 0.013797697211913597, + "grad_norm": 2.4266698360443115, + "learning_rate": 4.997653716088163e-05, + "loss": 6.697, + "step": 2320 + }, + { + "epoch": 0.013803644495194596, + "grad_norm": 2.289365768432617, + "learning_rate": 4.9976516924380325e-05, + "loss": 6.7583, + "step": 2321 + }, + { + "epoch": 0.013809591778475593, + "grad_norm": 2.4238948822021484, + "learning_rate": 4.9976496679159976e-05, + "loss": 6.7949, + "step": 2322 + }, + { + "epoch": 0.01381553906175659, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.997647642522059e-05, + "loss": 6.5914, + "step": 2323 + }, + { + "epoch": 0.013821486345037586, + "grad_norm": 2.961089849472046, + "learning_rate": 4.997645616256217e-05, + "loss": 6.3513, + "step": 2324 + }, + { + "epoch": 0.013827433628318585, + "grad_norm": 2.437685251235962, + "learning_rate": 4.997643589118472e-05, + "loss": 6.4626, + "step": 2325 + }, + { + "epoch": 0.013833380911599582, + "grad_norm": 2.769731044769287, + "learning_rate": 4.9976415611088267e-05, + "loss": 6.2801, + "step": 2326 + }, + { + "epoch": 0.013839328194880578, + "grad_norm": 2.700697183609009, + "learning_rate": 4.9976395322272805e-05, + "loss": 6.1969, + "step": 2327 + }, + { + "epoch": 0.013845275478161575, + "grad_norm": 3.8049886226654053, + "learning_rate": 4.997637502473834e-05, + "loss": 6.769, + "step": 2328 + }, + { + "epoch": 0.013851222761442574, + "grad_norm": 3.748903512954712, + "learning_rate": 4.9976354718484875e-05, + "loss": 6.6486, + "step": 2329 + }, + { + "epoch": 0.01385717004472357, + "grad_norm": 3.7807834148406982, + "learning_rate": 4.9976334403512426e-05, + "loss": 6.6251, + "step": 2330 + }, + { + "epoch": 0.013863117328004567, + "grad_norm": 2.5358874797821045, + "learning_rate": 4.997631407982099e-05, + "loss": 6.4425, + "step": 2331 + }, + { + "epoch": 0.013869064611285564, + "grad_norm": 2.4619522094726562, + "learning_rate": 4.9976293747410596e-05, + "loss": 7.2166, + "step": 2332 + }, + { + "epoch": 0.013875011894566563, + "grad_norm": 2.740412473678589, + "learning_rate": 4.997627340628123e-05, + "loss": 6.8907, + "step": 2333 + }, + { + "epoch": 0.01388095917784756, + "grad_norm": 2.872852087020874, + "learning_rate": 4.9976253056432895e-05, + "loss": 6.6142, + "step": 2334 + }, + { + "epoch": 0.013886906461128556, + "grad_norm": 2.01629900932312, + "learning_rate": 4.997623269786562e-05, + "loss": 6.398, + "step": 2335 + }, + { + "epoch": 0.013892853744409553, + "grad_norm": 2.4405698776245117, + "learning_rate": 4.99762123305794e-05, + "loss": 6.9282, + "step": 2336 + }, + { + "epoch": 0.013898801027690552, + "grad_norm": 2.2520413398742676, + "learning_rate": 4.9976191954574235e-05, + "loss": 6.5565, + "step": 2337 + }, + { + "epoch": 0.013904748310971549, + "grad_norm": 2.314852476119995, + "learning_rate": 4.997617156985014e-05, + "loss": 6.3055, + "step": 2338 + }, + { + "epoch": 0.013910695594252545, + "grad_norm": 2.9049081802368164, + "learning_rate": 4.9976151176407124e-05, + "loss": 7.1806, + "step": 2339 + }, + { + "epoch": 0.013916642877533542, + "grad_norm": 2.7533769607543945, + "learning_rate": 4.9976130774245197e-05, + "loss": 7.0047, + "step": 2340 + }, + { + "epoch": 0.013922590160814539, + "grad_norm": 2.124826431274414, + "learning_rate": 4.997611036336435e-05, + "loss": 7.1897, + "step": 2341 + }, + { + "epoch": 0.013928537444095538, + "grad_norm": 2.5205366611480713, + "learning_rate": 4.997608994376461e-05, + "loss": 6.8592, + "step": 2342 + }, + { + "epoch": 0.013934484727376534, + "grad_norm": 2.8026719093322754, + "learning_rate": 4.9976069515445975e-05, + "loss": 6.6622, + "step": 2343 + }, + { + "epoch": 0.013940432010657531, + "grad_norm": 3.045438051223755, + "learning_rate": 4.997604907840845e-05, + "loss": 6.6176, + "step": 2344 + }, + { + "epoch": 0.013946379293938528, + "grad_norm": 2.820199489593506, + "learning_rate": 4.997602863265204e-05, + "loss": 6.4489, + "step": 2345 + }, + { + "epoch": 0.013952326577219527, + "grad_norm": 2.997990369796753, + "learning_rate": 4.997600817817676e-05, + "loss": 7.0989, + "step": 2346 + }, + { + "epoch": 0.013958273860500523, + "grad_norm": 3.316575050354004, + "learning_rate": 4.9975987714982606e-05, + "loss": 6.9042, + "step": 2347 + }, + { + "epoch": 0.01396422114378152, + "grad_norm": 2.3339803218841553, + "learning_rate": 4.99759672430696e-05, + "loss": 6.8831, + "step": 2348 + }, + { + "epoch": 0.013970168427062517, + "grad_norm": 2.510274648666382, + "learning_rate": 4.997594676243775e-05, + "loss": 7.1093, + "step": 2349 + }, + { + "epoch": 0.013976115710343516, + "grad_norm": 2.893909215927124, + "learning_rate": 4.997592627308705e-05, + "loss": 6.5477, + "step": 2350 + }, + { + "epoch": 0.013982062993624512, + "grad_norm": 3.6036674976348877, + "learning_rate": 4.9975905775017505e-05, + "loss": 6.3278, + "step": 2351 + }, + { + "epoch": 0.01398801027690551, + "grad_norm": 2.1260125637054443, + "learning_rate": 4.9975885268229127e-05, + "loss": 6.7883, + "step": 2352 + }, + { + "epoch": 0.013993957560186506, + "grad_norm": 2.328247308731079, + "learning_rate": 4.997586475272193e-05, + "loss": 6.4832, + "step": 2353 + }, + { + "epoch": 0.013999904843467505, + "grad_norm": 2.8075780868530273, + "learning_rate": 4.997584422849593e-05, + "loss": 6.9333, + "step": 2354 + }, + { + "epoch": 0.014005852126748502, + "grad_norm": 1.9339990615844727, + "learning_rate": 4.9975823695551106e-05, + "loss": 6.6856, + "step": 2355 + }, + { + "epoch": 0.014011799410029498, + "grad_norm": 2.842968225479126, + "learning_rate": 4.997580315388748e-05, + "loss": 6.48, + "step": 2356 + }, + { + "epoch": 0.014017746693310495, + "grad_norm": 1.8715558052062988, + "learning_rate": 4.997578260350506e-05, + "loss": 6.8702, + "step": 2357 + }, + { + "epoch": 0.014023693976591494, + "grad_norm": 2.4310202598571777, + "learning_rate": 4.9975762044403865e-05, + "loss": 7.0112, + "step": 2358 + }, + { + "epoch": 0.01402964125987249, + "grad_norm": 2.292121648788452, + "learning_rate": 4.997574147658387e-05, + "loss": 6.6505, + "step": 2359 + }, + { + "epoch": 0.014035588543153487, + "grad_norm": 2.374007225036621, + "learning_rate": 4.997572090004511e-05, + "loss": 6.7332, + "step": 2360 + }, + { + "epoch": 0.014041535826434484, + "grad_norm": 2.198131561279297, + "learning_rate": 4.997570031478759e-05, + "loss": 6.6358, + "step": 2361 + }, + { + "epoch": 0.014047483109715483, + "grad_norm": 2.3109302520751953, + "learning_rate": 4.997567972081131e-05, + "loss": 6.6194, + "step": 2362 + }, + { + "epoch": 0.01405343039299648, + "grad_norm": 2.49338698387146, + "learning_rate": 4.997565911811627e-05, + "loss": 6.5036, + "step": 2363 + }, + { + "epoch": 0.014059377676277476, + "grad_norm": 2.6462419033050537, + "learning_rate": 4.997563850670249e-05, + "loss": 6.4294, + "step": 2364 + }, + { + "epoch": 0.014065324959558473, + "grad_norm": 3.0072524547576904, + "learning_rate": 4.997561788656997e-05, + "loss": 6.8814, + "step": 2365 + }, + { + "epoch": 0.014071272242839472, + "grad_norm": 2.435209035873413, + "learning_rate": 4.997559725771872e-05, + "loss": 6.4684, + "step": 2366 + }, + { + "epoch": 0.014077219526120469, + "grad_norm": 2.8023672103881836, + "learning_rate": 4.997557662014875e-05, + "loss": 6.7922, + "step": 2367 + }, + { + "epoch": 0.014083166809401465, + "grad_norm": 2.6129658222198486, + "learning_rate": 4.9975555973860065e-05, + "loss": 6.4539, + "step": 2368 + }, + { + "epoch": 0.014089114092682462, + "grad_norm": 2.559117317199707, + "learning_rate": 4.997553531885267e-05, + "loss": 6.4713, + "step": 2369 + }, + { + "epoch": 0.014095061375963459, + "grad_norm": 2.4535956382751465, + "learning_rate": 4.9975514655126575e-05, + "loss": 6.963, + "step": 2370 + }, + { + "epoch": 0.014101008659244458, + "grad_norm": 2.3025150299072266, + "learning_rate": 4.997549398268178e-05, + "loss": 6.9299, + "step": 2371 + }, + { + "epoch": 0.014106955942525454, + "grad_norm": 2.834411382675171, + "learning_rate": 4.997547330151831e-05, + "loss": 6.299, + "step": 2372 + }, + { + "epoch": 0.014112903225806451, + "grad_norm": 2.8046083450317383, + "learning_rate": 4.997545261163615e-05, + "loss": 5.7691, + "step": 2373 + }, + { + "epoch": 0.014118850509087448, + "grad_norm": 2.663776159286499, + "learning_rate": 4.997543191303532e-05, + "loss": 5.969, + "step": 2374 + }, + { + "epoch": 0.014124797792368447, + "grad_norm": 2.725154161453247, + "learning_rate": 4.997541120571582e-05, + "loss": 5.7473, + "step": 2375 + }, + { + "epoch": 0.014130745075649443, + "grad_norm": 2.9021074771881104, + "learning_rate": 4.9975390489677663e-05, + "loss": 6.3177, + "step": 2376 + }, + { + "epoch": 0.01413669235893044, + "grad_norm": 2.4043307304382324, + "learning_rate": 4.9975369764920866e-05, + "loss": 6.358, + "step": 2377 + }, + { + "epoch": 0.014142639642211437, + "grad_norm": 2.4163010120391846, + "learning_rate": 4.997534903144542e-05, + "loss": 6.6807, + "step": 2378 + }, + { + "epoch": 0.014148586925492436, + "grad_norm": 3.0710666179656982, + "learning_rate": 4.9975328289251335e-05, + "loss": 6.2416, + "step": 2379 + }, + { + "epoch": 0.014154534208773432, + "grad_norm": 2.159627676010132, + "learning_rate": 4.997530753833862e-05, + "loss": 7.1434, + "step": 2380 + }, + { + "epoch": 0.01416048149205443, + "grad_norm": 2.308382034301758, + "learning_rate": 4.997528677870729e-05, + "loss": 7.1243, + "step": 2381 + }, + { + "epoch": 0.014166428775335426, + "grad_norm": 2.7461323738098145, + "learning_rate": 4.997526601035734e-05, + "loss": 6.3066, + "step": 2382 + }, + { + "epoch": 0.014172376058616425, + "grad_norm": 2.8835322856903076, + "learning_rate": 4.997524523328878e-05, + "loss": 6.28, + "step": 2383 + }, + { + "epoch": 0.014178323341897421, + "grad_norm": 2.5195534229278564, + "learning_rate": 4.997522444750162e-05, + "loss": 6.9561, + "step": 2384 + }, + { + "epoch": 0.014184270625178418, + "grad_norm": 3.1697885990142822, + "learning_rate": 4.997520365299587e-05, + "loss": 6.7432, + "step": 2385 + }, + { + "epoch": 0.014190217908459415, + "grad_norm": 3.6300339698791504, + "learning_rate": 4.997518284977154e-05, + "loss": 6.3676, + "step": 2386 + }, + { + "epoch": 0.014196165191740414, + "grad_norm": 3.261981964111328, + "learning_rate": 4.9975162037828625e-05, + "loss": 6.0991, + "step": 2387 + }, + { + "epoch": 0.01420211247502141, + "grad_norm": 3.6291120052337646, + "learning_rate": 4.9975141217167146e-05, + "loss": 6.1239, + "step": 2388 + }, + { + "epoch": 0.014208059758302407, + "grad_norm": 3.192958116531372, + "learning_rate": 4.997512038778709e-05, + "loss": 6.4455, + "step": 2389 + }, + { + "epoch": 0.014214007041583404, + "grad_norm": 2.8887948989868164, + "learning_rate": 4.997509954968849e-05, + "loss": 6.9441, + "step": 2390 + }, + { + "epoch": 0.014219954324864403, + "grad_norm": 2.3568248748779297, + "learning_rate": 4.9975078702871336e-05, + "loss": 7.0207, + "step": 2391 + }, + { + "epoch": 0.0142259016081454, + "grad_norm": 2.2629294395446777, + "learning_rate": 4.997505784733564e-05, + "loss": 6.9575, + "step": 2392 + }, + { + "epoch": 0.014231848891426396, + "grad_norm": 2.5458898544311523, + "learning_rate": 4.99750369830814e-05, + "loss": 6.8533, + "step": 2393 + }, + { + "epoch": 0.014237796174707393, + "grad_norm": 2.5125060081481934, + "learning_rate": 4.997501611010865e-05, + "loss": 6.8615, + "step": 2394 + }, + { + "epoch": 0.014243743457988392, + "grad_norm": 2.9903738498687744, + "learning_rate": 4.997499522841737e-05, + "loss": 6.6927, + "step": 2395 + }, + { + "epoch": 0.014249690741269389, + "grad_norm": 2.7536470890045166, + "learning_rate": 4.997497433800758e-05, + "loss": 6.6454, + "step": 2396 + }, + { + "epoch": 0.014255638024550385, + "grad_norm": 3.5041043758392334, + "learning_rate": 4.997495343887928e-05, + "loss": 6.485, + "step": 2397 + }, + { + "epoch": 0.014261585307831382, + "grad_norm": 3.8025100231170654, + "learning_rate": 4.997493253103249e-05, + "loss": 6.3731, + "step": 2398 + }, + { + "epoch": 0.01426753259111238, + "grad_norm": 3.2657718658447266, + "learning_rate": 4.99749116144672e-05, + "loss": 6.23, + "step": 2399 + }, + { + "epoch": 0.014273479874393378, + "grad_norm": 2.721632719039917, + "learning_rate": 4.997489068918343e-05, + "loss": 6.7292, + "step": 2400 + }, + { + "epoch": 0.014279427157674374, + "grad_norm": 2.3483569622039795, + "learning_rate": 4.9974869755181186e-05, + "loss": 6.4842, + "step": 2401 + }, + { + "epoch": 0.014285374440955371, + "grad_norm": 2.4931676387786865, + "learning_rate": 4.997484881246047e-05, + "loss": 7.0529, + "step": 2402 + }, + { + "epoch": 0.014291321724236368, + "grad_norm": 2.4944825172424316, + "learning_rate": 4.99748278610213e-05, + "loss": 7.0185, + "step": 2403 + }, + { + "epoch": 0.014297269007517367, + "grad_norm": 2.9124202728271484, + "learning_rate": 4.997480690086367e-05, + "loss": 6.9847, + "step": 2404 + }, + { + "epoch": 0.014303216290798363, + "grad_norm": 2.5802674293518066, + "learning_rate": 4.997478593198759e-05, + "loss": 7.0389, + "step": 2405 + }, + { + "epoch": 0.01430916357407936, + "grad_norm": 2.636709451675415, + "learning_rate": 4.9974764954393075e-05, + "loss": 6.7281, + "step": 2406 + }, + { + "epoch": 0.014315110857360357, + "grad_norm": 3.801760196685791, + "learning_rate": 4.997474396808012e-05, + "loss": 5.9962, + "step": 2407 + }, + { + "epoch": 0.014321058140641356, + "grad_norm": 3.7983996868133545, + "learning_rate": 4.997472297304875e-05, + "loss": 6.3821, + "step": 2408 + }, + { + "epoch": 0.014327005423922352, + "grad_norm": 2.863408088684082, + "learning_rate": 4.997470196929895e-05, + "loss": 6.2206, + "step": 2409 + }, + { + "epoch": 0.01433295270720335, + "grad_norm": 2.6187095642089844, + "learning_rate": 4.997468095683076e-05, + "loss": 6.2205, + "step": 2410 + }, + { + "epoch": 0.014338899990484346, + "grad_norm": 3.202986240386963, + "learning_rate": 4.997465993564414e-05, + "loss": 6.259, + "step": 2411 + }, + { + "epoch": 0.014344847273765345, + "grad_norm": 2.9131264686584473, + "learning_rate": 4.9974638905739146e-05, + "loss": 6.4159, + "step": 2412 + }, + { + "epoch": 0.014350794557046341, + "grad_norm": 2.384477376937866, + "learning_rate": 4.9974617867115754e-05, + "loss": 6.6669, + "step": 2413 + }, + { + "epoch": 0.014356741840327338, + "grad_norm": 2.448495626449585, + "learning_rate": 4.997459681977398e-05, + "loss": 6.5679, + "step": 2414 + }, + { + "epoch": 0.014362689123608335, + "grad_norm": 2.1945343017578125, + "learning_rate": 4.997457576371384e-05, + "loss": 6.3856, + "step": 2415 + }, + { + "epoch": 0.014368636406889334, + "grad_norm": 1.867848515510559, + "learning_rate": 4.997455469893533e-05, + "loss": 6.3127, + "step": 2416 + }, + { + "epoch": 0.01437458369017033, + "grad_norm": 2.560976266860962, + "learning_rate": 4.997453362543846e-05, + "loss": 6.4619, + "step": 2417 + }, + { + "epoch": 0.014380530973451327, + "grad_norm": 3.2440431118011475, + "learning_rate": 4.997451254322323e-05, + "loss": 6.399, + "step": 2418 + }, + { + "epoch": 0.014386478256732324, + "grad_norm": 3.0021307468414307, + "learning_rate": 4.9974491452289664e-05, + "loss": 6.174, + "step": 2419 + }, + { + "epoch": 0.014392425540013323, + "grad_norm": 2.6046524047851562, + "learning_rate": 4.997447035263776e-05, + "loss": 6.8284, + "step": 2420 + }, + { + "epoch": 0.01439837282329432, + "grad_norm": 3.1395344734191895, + "learning_rate": 4.997444924426753e-05, + "loss": 6.3395, + "step": 2421 + }, + { + "epoch": 0.014404320106575316, + "grad_norm": 3.056152582168579, + "learning_rate": 4.997442812717897e-05, + "loss": 6.3468, + "step": 2422 + }, + { + "epoch": 0.014410267389856313, + "grad_norm": 2.2532267570495605, + "learning_rate": 4.9974407001372105e-05, + "loss": 6.5187, + "step": 2423 + }, + { + "epoch": 0.014416214673137312, + "grad_norm": 2.0228383541107178, + "learning_rate": 4.997438586684693e-05, + "loss": 6.4452, + "step": 2424 + }, + { + "epoch": 0.014422161956418308, + "grad_norm": 3.2889909744262695, + "learning_rate": 4.997436472360345e-05, + "loss": 6.6466, + "step": 2425 + }, + { + "epoch": 0.014428109239699305, + "grad_norm": 2.957916498184204, + "learning_rate": 4.9974343571641677e-05, + "loss": 6.9617, + "step": 2426 + }, + { + "epoch": 0.014434056522980302, + "grad_norm": 2.7629241943359375, + "learning_rate": 4.997432241096162e-05, + "loss": 6.1687, + "step": 2427 + }, + { + "epoch": 0.0144400038062613, + "grad_norm": 2.849297285079956, + "learning_rate": 4.997430124156329e-05, + "loss": 6.4647, + "step": 2428 + }, + { + "epoch": 0.014445951089542297, + "grad_norm": 2.2432122230529785, + "learning_rate": 4.997428006344669e-05, + "loss": 7.1739, + "step": 2429 + }, + { + "epoch": 0.014451898372823294, + "grad_norm": 2.814807891845703, + "learning_rate": 4.997425887661181e-05, + "loss": 5.945, + "step": 2430 + }, + { + "epoch": 0.014457845656104291, + "grad_norm": 3.140153646469116, + "learning_rate": 4.997423768105869e-05, + "loss": 6.5948, + "step": 2431 + }, + { + "epoch": 0.01446379293938529, + "grad_norm": 2.5276620388031006, + "learning_rate": 4.997421647678732e-05, + "loss": 6.9813, + "step": 2432 + }, + { + "epoch": 0.014469740222666286, + "grad_norm": 2.462204694747925, + "learning_rate": 4.9974195263797705e-05, + "loss": 6.8987, + "step": 2433 + }, + { + "epoch": 0.014475687505947283, + "grad_norm": 3.117255210876465, + "learning_rate": 4.997417404208986e-05, + "loss": 5.883, + "step": 2434 + }, + { + "epoch": 0.01448163478922828, + "grad_norm": 2.6207518577575684, + "learning_rate": 4.997415281166379e-05, + "loss": 6.8065, + "step": 2435 + }, + { + "epoch": 0.014487582072509277, + "grad_norm": 2.996624231338501, + "learning_rate": 4.99741315725195e-05, + "loss": 6.5162, + "step": 2436 + }, + { + "epoch": 0.014493529355790276, + "grad_norm": 2.1946496963500977, + "learning_rate": 4.9974110324656996e-05, + "loss": 6.9521, + "step": 2437 + }, + { + "epoch": 0.014499476639071272, + "grad_norm": 2.273017406463623, + "learning_rate": 4.997408906807629e-05, + "loss": 7.0144, + "step": 2438 + }, + { + "epoch": 0.01450542392235227, + "grad_norm": 2.516509771347046, + "learning_rate": 4.997406780277739e-05, + "loss": 7.013, + "step": 2439 + }, + { + "epoch": 0.014511371205633266, + "grad_norm": 3.0296435356140137, + "learning_rate": 4.9974046528760296e-05, + "loss": 6.934, + "step": 2440 + }, + { + "epoch": 0.014517318488914265, + "grad_norm": 2.6135010719299316, + "learning_rate": 4.9974025246025024e-05, + "loss": 6.7151, + "step": 2441 + }, + { + "epoch": 0.014523265772195261, + "grad_norm": 2.6850788593292236, + "learning_rate": 4.997400395457158e-05, + "loss": 6.5223, + "step": 2442 + }, + { + "epoch": 0.014529213055476258, + "grad_norm": 3.0401692390441895, + "learning_rate": 4.9973982654399966e-05, + "loss": 7.2006, + "step": 2443 + }, + { + "epoch": 0.014535160338757255, + "grad_norm": 3.016805410385132, + "learning_rate": 4.997396134551019e-05, + "loss": 7.0633, + "step": 2444 + }, + { + "epoch": 0.014541107622038254, + "grad_norm": 3.107154130935669, + "learning_rate": 4.9973940027902264e-05, + "loss": 6.9096, + "step": 2445 + }, + { + "epoch": 0.01454705490531925, + "grad_norm": 2.720054864883423, + "learning_rate": 4.9973918701576196e-05, + "loss": 6.7061, + "step": 2446 + }, + { + "epoch": 0.014553002188600247, + "grad_norm": 2.386401414871216, + "learning_rate": 4.9973897366531984e-05, + "loss": 6.5877, + "step": 2447 + }, + { + "epoch": 0.014558949471881244, + "grad_norm": 2.488243579864502, + "learning_rate": 4.997387602276965e-05, + "loss": 6.7792, + "step": 2448 + }, + { + "epoch": 0.014564896755162243, + "grad_norm": 2.7504360675811768, + "learning_rate": 4.9973854670289196e-05, + "loss": 6.6164, + "step": 2449 + }, + { + "epoch": 0.01457084403844324, + "grad_norm": 3.001441240310669, + "learning_rate": 4.9973833309090626e-05, + "loss": 6.5933, + "step": 2450 + }, + { + "epoch": 0.014576791321724236, + "grad_norm": 2.6449999809265137, + "learning_rate": 4.997381193917394e-05, + "loss": 6.5323, + "step": 2451 + }, + { + "epoch": 0.014582738605005233, + "grad_norm": 2.81846022605896, + "learning_rate": 4.9973790560539156e-05, + "loss": 6.5146, + "step": 2452 + }, + { + "epoch": 0.014588685888286232, + "grad_norm": 2.662916421890259, + "learning_rate": 4.997376917318629e-05, + "loss": 6.161, + "step": 2453 + }, + { + "epoch": 0.014594633171567228, + "grad_norm": 2.689601421356201, + "learning_rate": 4.997374777711533e-05, + "loss": 6.2008, + "step": 2454 + }, + { + "epoch": 0.014600580454848225, + "grad_norm": 2.6690561771392822, + "learning_rate": 4.99737263723263e-05, + "loss": 6.4418, + "step": 2455 + }, + { + "epoch": 0.014606527738129222, + "grad_norm": 2.897270917892456, + "learning_rate": 4.997370495881919e-05, + "loss": 6.3968, + "step": 2456 + }, + { + "epoch": 0.01461247502141022, + "grad_norm": 2.9327831268310547, + "learning_rate": 4.997368353659402e-05, + "loss": 6.4665, + "step": 2457 + }, + { + "epoch": 0.014618422304691217, + "grad_norm": 2.658013343811035, + "learning_rate": 4.99736621056508e-05, + "loss": 6.399, + "step": 2458 + }, + { + "epoch": 0.014624369587972214, + "grad_norm": 2.6055238246917725, + "learning_rate": 4.997364066598953e-05, + "loss": 6.4679, + "step": 2459 + }, + { + "epoch": 0.014630316871253211, + "grad_norm": 3.0595951080322266, + "learning_rate": 4.997361921761022e-05, + "loss": 5.8797, + "step": 2460 + }, + { + "epoch": 0.01463626415453421, + "grad_norm": 2.994694471359253, + "learning_rate": 4.997359776051288e-05, + "loss": 5.704, + "step": 2461 + }, + { + "epoch": 0.014642211437815206, + "grad_norm": 2.78153657913208, + "learning_rate": 4.9973576294697514e-05, + "loss": 5.7289, + "step": 2462 + }, + { + "epoch": 0.014648158721096203, + "grad_norm": 2.5119385719299316, + "learning_rate": 4.997355482016414e-05, + "loss": 5.5494, + "step": 2463 + }, + { + "epoch": 0.0146541060043772, + "grad_norm": 2.7880990505218506, + "learning_rate": 4.997353333691274e-05, + "loss": 5.5905, + "step": 2464 + }, + { + "epoch": 0.014660053287658197, + "grad_norm": 2.827352523803711, + "learning_rate": 4.9973511844943346e-05, + "loss": 6.4429, + "step": 2465 + }, + { + "epoch": 0.014666000570939195, + "grad_norm": 2.4297358989715576, + "learning_rate": 4.997349034425595e-05, + "loss": 6.8647, + "step": 2466 + }, + { + "epoch": 0.014671947854220192, + "grad_norm": 2.649064064025879, + "learning_rate": 4.997346883485057e-05, + "loss": 6.5568, + "step": 2467 + }, + { + "epoch": 0.014677895137501189, + "grad_norm": 3.2215452194213867, + "learning_rate": 4.9973447316727215e-05, + "loss": 5.5684, + "step": 2468 + }, + { + "epoch": 0.014683842420782186, + "grad_norm": 2.8760056495666504, + "learning_rate": 4.9973425789885884e-05, + "loss": 5.6395, + "step": 2469 + }, + { + "epoch": 0.014689789704063184, + "grad_norm": 2.4002890586853027, + "learning_rate": 4.9973404254326585e-05, + "loss": 5.9525, + "step": 2470 + }, + { + "epoch": 0.014695736987344181, + "grad_norm": 2.32314395904541, + "learning_rate": 4.997338271004933e-05, + "loss": 6.9675, + "step": 2471 + }, + { + "epoch": 0.014701684270625178, + "grad_norm": 2.262680768966675, + "learning_rate": 4.997336115705413e-05, + "loss": 7.1361, + "step": 2472 + }, + { + "epoch": 0.014707631553906175, + "grad_norm": 2.2855215072631836, + "learning_rate": 4.997333959534098e-05, + "loss": 7.1141, + "step": 2473 + }, + { + "epoch": 0.014713578837187173, + "grad_norm": 2.5461738109588623, + "learning_rate": 4.99733180249099e-05, + "loss": 7.0492, + "step": 2474 + }, + { + "epoch": 0.01471952612046817, + "grad_norm": 2.455561399459839, + "learning_rate": 4.99732964457609e-05, + "loss": 6.9303, + "step": 2475 + }, + { + "epoch": 0.014725473403749167, + "grad_norm": 3.3767740726470947, + "learning_rate": 4.997327485789397e-05, + "loss": 6.8531, + "step": 2476 + }, + { + "epoch": 0.014731420687030164, + "grad_norm": 2.9320104122161865, + "learning_rate": 4.9973253261309125e-05, + "loss": 6.9258, + "step": 2477 + }, + { + "epoch": 0.014737367970311162, + "grad_norm": 2.380960464477539, + "learning_rate": 4.997323165600638e-05, + "loss": 6.8581, + "step": 2478 + }, + { + "epoch": 0.01474331525359216, + "grad_norm": 2.727154016494751, + "learning_rate": 4.997321004198574e-05, + "loss": 7.3814, + "step": 2479 + }, + { + "epoch": 0.014749262536873156, + "grad_norm": 2.8693020343780518, + "learning_rate": 4.997318841924721e-05, + "loss": 6.3793, + "step": 2480 + }, + { + "epoch": 0.014755209820154153, + "grad_norm": 2.941622734069824, + "learning_rate": 4.997316678779079e-05, + "loss": 7.3567, + "step": 2481 + }, + { + "epoch": 0.014761157103435152, + "grad_norm": 3.0310213565826416, + "learning_rate": 4.9973145147616505e-05, + "loss": 6.8832, + "step": 2482 + }, + { + "epoch": 0.014767104386716148, + "grad_norm": 1.9184696674346924, + "learning_rate": 4.9973123498724353e-05, + "loss": 6.7369, + "step": 2483 + }, + { + "epoch": 0.014773051669997145, + "grad_norm": 2.3090195655822754, + "learning_rate": 4.9973101841114335e-05, + "loss": 6.8927, + "step": 2484 + }, + { + "epoch": 0.014778998953278142, + "grad_norm": 2.2947685718536377, + "learning_rate": 4.997308017478647e-05, + "loss": 6.9441, + "step": 2485 + }, + { + "epoch": 0.01478494623655914, + "grad_norm": 2.363690137863159, + "learning_rate": 4.997305849974076e-05, + "loss": 6.9397, + "step": 2486 + }, + { + "epoch": 0.014790893519840137, + "grad_norm": 1.7546948194503784, + "learning_rate": 4.997303681597721e-05, + "loss": 6.7888, + "step": 2487 + }, + { + "epoch": 0.014796840803121134, + "grad_norm": 1.8824211359024048, + "learning_rate": 4.997301512349584e-05, + "loss": 6.6486, + "step": 2488 + }, + { + "epoch": 0.014802788086402131, + "grad_norm": 3.68865704536438, + "learning_rate": 4.9972993422296636e-05, + "loss": 7.0318, + "step": 2489 + }, + { + "epoch": 0.01480873536968313, + "grad_norm": 3.0788486003875732, + "learning_rate": 4.997297171237962e-05, + "loss": 6.814, + "step": 2490 + }, + { + "epoch": 0.014814682652964126, + "grad_norm": 2.6903607845306396, + "learning_rate": 4.997294999374481e-05, + "loss": 6.9752, + "step": 2491 + }, + { + "epoch": 0.014820629936245123, + "grad_norm": 2.6673712730407715, + "learning_rate": 4.9972928266392194e-05, + "loss": 6.9083, + "step": 2492 + }, + { + "epoch": 0.01482657721952612, + "grad_norm": 2.335632801055908, + "learning_rate": 4.9972906530321786e-05, + "loss": 7.027, + "step": 2493 + }, + { + "epoch": 0.014832524502807119, + "grad_norm": 3.2885966300964355, + "learning_rate": 4.997288478553359e-05, + "loss": 6.6551, + "step": 2494 + }, + { + "epoch": 0.014838471786088115, + "grad_norm": 2.7297918796539307, + "learning_rate": 4.997286303202762e-05, + "loss": 6.7345, + "step": 2495 + }, + { + "epoch": 0.014844419069369112, + "grad_norm": 2.640814781188965, + "learning_rate": 4.997284126980388e-05, + "loss": 6.743, + "step": 2496 + }, + { + "epoch": 0.014850366352650109, + "grad_norm": 2.699632167816162, + "learning_rate": 4.997281949886239e-05, + "loss": 6.4633, + "step": 2497 + }, + { + "epoch": 0.014856313635931106, + "grad_norm": 2.5185790061950684, + "learning_rate": 4.9972797719203135e-05, + "loss": 6.5496, + "step": 2498 + }, + { + "epoch": 0.014862260919212104, + "grad_norm": 2.659393548965454, + "learning_rate": 4.9972775930826144e-05, + "loss": 6.5066, + "step": 2499 + }, + { + "epoch": 0.014868208202493101, + "grad_norm": 2.160808563232422, + "learning_rate": 4.99727541337314e-05, + "loss": 6.9851, + "step": 2500 + }, + { + "epoch": 0.014874155485774098, + "grad_norm": 2.656506299972534, + "learning_rate": 4.997273232791894e-05, + "loss": 7.5696, + "step": 2501 + }, + { + "epoch": 0.014880102769055095, + "grad_norm": 2.490612506866455, + "learning_rate": 4.9972710513388754e-05, + "loss": 7.2623, + "step": 2502 + }, + { + "epoch": 0.014886050052336093, + "grad_norm": 2.1744866371154785, + "learning_rate": 4.997268869014085e-05, + "loss": 6.5208, + "step": 2503 + }, + { + "epoch": 0.01489199733561709, + "grad_norm": 2.8058252334594727, + "learning_rate": 4.9972666858175236e-05, + "loss": 6.1527, + "step": 2504 + }, + { + "epoch": 0.014897944618898087, + "grad_norm": 2.418827533721924, + "learning_rate": 4.997264501749193e-05, + "loss": 6.2244, + "step": 2505 + }, + { + "epoch": 0.014903891902179084, + "grad_norm": 2.499648332595825, + "learning_rate": 4.997262316809092e-05, + "loss": 6.8904, + "step": 2506 + }, + { + "epoch": 0.014909839185460082, + "grad_norm": 2.3598594665527344, + "learning_rate": 4.9972601309972235e-05, + "loss": 7.0794, + "step": 2507 + }, + { + "epoch": 0.01491578646874108, + "grad_norm": 2.2443082332611084, + "learning_rate": 4.997257944313587e-05, + "loss": 7.3078, + "step": 2508 + }, + { + "epoch": 0.014921733752022076, + "grad_norm": 2.407501459121704, + "learning_rate": 4.9972557567581835e-05, + "loss": 7.0677, + "step": 2509 + }, + { + "epoch": 0.014927681035303073, + "grad_norm": 2.060865640640259, + "learning_rate": 4.997253568331014e-05, + "loss": 6.7128, + "step": 2510 + }, + { + "epoch": 0.014933628318584071, + "grad_norm": 2.3876516819000244, + "learning_rate": 4.997251379032078e-05, + "loss": 6.7562, + "step": 2511 + }, + { + "epoch": 0.014939575601865068, + "grad_norm": 2.387176990509033, + "learning_rate": 4.997249188861379e-05, + "loss": 6.8237, + "step": 2512 + }, + { + "epoch": 0.014945522885146065, + "grad_norm": 2.7324886322021484, + "learning_rate": 4.997246997818915e-05, + "loss": 6.8963, + "step": 2513 + }, + { + "epoch": 0.014951470168427062, + "grad_norm": 2.3832128047943115, + "learning_rate": 4.997244805904689e-05, + "loss": 6.9467, + "step": 2514 + }, + { + "epoch": 0.01495741745170806, + "grad_norm": 1.8594162464141846, + "learning_rate": 4.9972426131187e-05, + "loss": 7.0712, + "step": 2515 + }, + { + "epoch": 0.014963364734989057, + "grad_norm": 2.322068691253662, + "learning_rate": 4.997240419460949e-05, + "loss": 6.8898, + "step": 2516 + }, + { + "epoch": 0.014969312018270054, + "grad_norm": 2.4850032329559326, + "learning_rate": 4.997238224931438e-05, + "loss": 6.5439, + "step": 2517 + }, + { + "epoch": 0.014975259301551051, + "grad_norm": 2.919579029083252, + "learning_rate": 4.997236029530166e-05, + "loss": 6.3987, + "step": 2518 + }, + { + "epoch": 0.01498120658483205, + "grad_norm": 2.651900053024292, + "learning_rate": 4.997233833257135e-05, + "loss": 6.2735, + "step": 2519 + }, + { + "epoch": 0.014987153868113046, + "grad_norm": 2.7912142276763916, + "learning_rate": 4.997231636112346e-05, + "loss": 6.9835, + "step": 2520 + }, + { + "epoch": 0.014993101151394043, + "grad_norm": 2.5735433101654053, + "learning_rate": 4.997229438095799e-05, + "loss": 7.1218, + "step": 2521 + }, + { + "epoch": 0.01499904843467504, + "grad_norm": 2.483186721801758, + "learning_rate": 4.997227239207494e-05, + "loss": 7.0343, + "step": 2522 + }, + { + "epoch": 0.015004995717956039, + "grad_norm": 2.9296681880950928, + "learning_rate": 4.997225039447434e-05, + "loss": 6.5455, + "step": 2523 + }, + { + "epoch": 0.015010943001237035, + "grad_norm": 2.5536422729492188, + "learning_rate": 4.997222838815618e-05, + "loss": 6.7173, + "step": 2524 + }, + { + "epoch": 0.015016890284518032, + "grad_norm": 6.365324020385742, + "learning_rate": 4.997220637312047e-05, + "loss": 6.0909, + "step": 2525 + }, + { + "epoch": 0.015022837567799029, + "grad_norm": 3.7258150577545166, + "learning_rate": 4.997218434936723e-05, + "loss": 5.9019, + "step": 2526 + }, + { + "epoch": 0.015028784851080026, + "grad_norm": 2.9021997451782227, + "learning_rate": 4.997216231689645e-05, + "loss": 5.8601, + "step": 2527 + }, + { + "epoch": 0.015034732134361024, + "grad_norm": 2.570988416671753, + "learning_rate": 4.997214027570815e-05, + "loss": 6.1513, + "step": 2528 + }, + { + "epoch": 0.015040679417642021, + "grad_norm": 3.013540029525757, + "learning_rate": 4.997211822580233e-05, + "loss": 6.6471, + "step": 2529 + }, + { + "epoch": 0.015046626700923018, + "grad_norm": 2.612210750579834, + "learning_rate": 4.997209616717901e-05, + "loss": 6.5523, + "step": 2530 + }, + { + "epoch": 0.015052573984204015, + "grad_norm": 2.93513822555542, + "learning_rate": 4.9972074099838186e-05, + "loss": 6.1845, + "step": 2531 + }, + { + "epoch": 0.015058521267485013, + "grad_norm": 3.569002389907837, + "learning_rate": 4.9972052023779865e-05, + "loss": 6.7383, + "step": 2532 + }, + { + "epoch": 0.01506446855076601, + "grad_norm": 2.560023784637451, + "learning_rate": 4.9972029939004064e-05, + "loss": 6.4978, + "step": 2533 + }, + { + "epoch": 0.015070415834047007, + "grad_norm": 2.304612398147583, + "learning_rate": 4.997200784551078e-05, + "loss": 6.3316, + "step": 2534 + }, + { + "epoch": 0.015076363117328004, + "grad_norm": 2.4442996978759766, + "learning_rate": 4.997198574330003e-05, + "loss": 6.4245, + "step": 2535 + }, + { + "epoch": 0.015082310400609002, + "grad_norm": 2.764831304550171, + "learning_rate": 4.997196363237181e-05, + "loss": 6.2251, + "step": 2536 + }, + { + "epoch": 0.01508825768389, + "grad_norm": 2.6534347534179688, + "learning_rate": 4.997194151272615e-05, + "loss": 6.6674, + "step": 2537 + }, + { + "epoch": 0.015094204967170996, + "grad_norm": 2.5901331901550293, + "learning_rate": 4.997191938436303e-05, + "loss": 6.5724, + "step": 2538 + }, + { + "epoch": 0.015100152250451993, + "grad_norm": 2.6827733516693115, + "learning_rate": 4.9971897247282474e-05, + "loss": 6.4774, + "step": 2539 + }, + { + "epoch": 0.015106099533732991, + "grad_norm": 2.087397813796997, + "learning_rate": 4.997187510148449e-05, + "loss": 6.5011, + "step": 2540 + }, + { + "epoch": 0.015112046817013988, + "grad_norm": 2.157935619354248, + "learning_rate": 4.9971852946969076e-05, + "loss": 6.3258, + "step": 2541 + }, + { + "epoch": 0.015117994100294985, + "grad_norm": 2.680481195449829, + "learning_rate": 4.997183078373625e-05, + "loss": 6.5631, + "step": 2542 + }, + { + "epoch": 0.015123941383575982, + "grad_norm": 2.897608995437622, + "learning_rate": 4.997180861178602e-05, + "loss": 6.7913, + "step": 2543 + }, + { + "epoch": 0.01512988866685698, + "grad_norm": 2.5714452266693115, + "learning_rate": 4.997178643111838e-05, + "loss": 6.767, + "step": 2544 + }, + { + "epoch": 0.015135835950137977, + "grad_norm": 2.096376419067383, + "learning_rate": 4.997176424173336e-05, + "loss": 6.7365, + "step": 2545 + }, + { + "epoch": 0.015141783233418974, + "grad_norm": 2.083101987838745, + "learning_rate": 4.9971742043630955e-05, + "loss": 6.4693, + "step": 2546 + }, + { + "epoch": 0.015147730516699971, + "grad_norm": 3.509512186050415, + "learning_rate": 4.997171983681116e-05, + "loss": 6.4068, + "step": 2547 + }, + { + "epoch": 0.01515367779998097, + "grad_norm": 3.055772304534912, + "learning_rate": 4.997169762127401e-05, + "loss": 6.3411, + "step": 2548 + }, + { + "epoch": 0.015159625083261966, + "grad_norm": 2.627429485321045, + "learning_rate": 4.997167539701949e-05, + "loss": 6.3788, + "step": 2549 + }, + { + "epoch": 0.015165572366542963, + "grad_norm": 2.408599853515625, + "learning_rate": 4.997165316404761e-05, + "loss": 6.2822, + "step": 2550 + }, + { + "epoch": 0.01517151964982396, + "grad_norm": 2.906006336212158, + "learning_rate": 4.997163092235839e-05, + "loss": 6.2615, + "step": 2551 + }, + { + "epoch": 0.015177466933104958, + "grad_norm": 2.4585347175598145, + "learning_rate": 4.997160867195183e-05, + "loss": 6.4076, + "step": 2552 + }, + { + "epoch": 0.015183414216385955, + "grad_norm": 2.495539665222168, + "learning_rate": 4.9971586412827944e-05, + "loss": 6.4893, + "step": 2553 + }, + { + "epoch": 0.015189361499666952, + "grad_norm": 2.719583034515381, + "learning_rate": 4.9971564144986734e-05, + "loss": 6.276, + "step": 2554 + }, + { + "epoch": 0.015195308782947949, + "grad_norm": 2.464207887649536, + "learning_rate": 4.9971541868428206e-05, + "loss": 6.2713, + "step": 2555 + }, + { + "epoch": 0.015201256066228947, + "grad_norm": 2.3604822158813477, + "learning_rate": 4.997151958315237e-05, + "loss": 6.2648, + "step": 2556 + }, + { + "epoch": 0.015207203349509944, + "grad_norm": 2.729820966720581, + "learning_rate": 4.997149728915924e-05, + "loss": 6.2985, + "step": 2557 + }, + { + "epoch": 0.015213150632790941, + "grad_norm": 2.565760612487793, + "learning_rate": 4.997147498644882e-05, + "loss": 6.401, + "step": 2558 + }, + { + "epoch": 0.015219097916071938, + "grad_norm": 3.091628074645996, + "learning_rate": 4.9971452675021104e-05, + "loss": 6.1774, + "step": 2559 + }, + { + "epoch": 0.015225045199352935, + "grad_norm": 2.452453851699829, + "learning_rate": 4.9971430354876125e-05, + "loss": 6.4669, + "step": 2560 + }, + { + "epoch": 0.015230992482633933, + "grad_norm": 2.4285218715667725, + "learning_rate": 4.997140802601387e-05, + "loss": 6.4086, + "step": 2561 + }, + { + "epoch": 0.01523693976591493, + "grad_norm": 2.094043254852295, + "learning_rate": 4.9971385688434356e-05, + "loss": 6.2502, + "step": 2562 + }, + { + "epoch": 0.015242887049195927, + "grad_norm": 2.5989573001861572, + "learning_rate": 4.9971363342137586e-05, + "loss": 6.2948, + "step": 2563 + }, + { + "epoch": 0.015248834332476924, + "grad_norm": 2.5372314453125, + "learning_rate": 4.9971340987123574e-05, + "loss": 6.5643, + "step": 2564 + }, + { + "epoch": 0.015254781615757922, + "grad_norm": 2.3666064739227295, + "learning_rate": 4.9971318623392325e-05, + "loss": 6.4807, + "step": 2565 + }, + { + "epoch": 0.01526072889903892, + "grad_norm": 2.3216497898101807, + "learning_rate": 4.997129625094385e-05, + "loss": 6.448, + "step": 2566 + }, + { + "epoch": 0.015266676182319916, + "grad_norm": 2.202665090560913, + "learning_rate": 4.9971273869778153e-05, + "loss": 6.3766, + "step": 2567 + }, + { + "epoch": 0.015272623465600913, + "grad_norm": 2.5678982734680176, + "learning_rate": 4.997125147989524e-05, + "loss": 6.0799, + "step": 2568 + }, + { + "epoch": 0.015278570748881911, + "grad_norm": 2.7904717922210693, + "learning_rate": 4.997122908129512e-05, + "loss": 6.3446, + "step": 2569 + }, + { + "epoch": 0.015284518032162908, + "grad_norm": 2.383120059967041, + "learning_rate": 4.99712066739778e-05, + "loss": 6.2398, + "step": 2570 + }, + { + "epoch": 0.015290465315443905, + "grad_norm": 2.4302077293395996, + "learning_rate": 4.9971184257943294e-05, + "loss": 6.2678, + "step": 2571 + }, + { + "epoch": 0.015296412598724902, + "grad_norm": 2.2923178672790527, + "learning_rate": 4.99711618331916e-05, + "loss": 6.4742, + "step": 2572 + }, + { + "epoch": 0.0153023598820059, + "grad_norm": 2.582810878753662, + "learning_rate": 4.9971139399722735e-05, + "loss": 6.4679, + "step": 2573 + }, + { + "epoch": 0.015308307165286897, + "grad_norm": 2.718228578567505, + "learning_rate": 4.997111695753671e-05, + "loss": 6.2475, + "step": 2574 + }, + { + "epoch": 0.015314254448567894, + "grad_norm": 2.4639811515808105, + "learning_rate": 4.997109450663352e-05, + "loss": 6.463, + "step": 2575 + }, + { + "epoch": 0.01532020173184889, + "grad_norm": 2.6998252868652344, + "learning_rate": 4.997107204701318e-05, + "loss": 6.2885, + "step": 2576 + }, + { + "epoch": 0.01532614901512989, + "grad_norm": 2.831291437149048, + "learning_rate": 4.997104957867569e-05, + "loss": 6.2056, + "step": 2577 + }, + { + "epoch": 0.015332096298410886, + "grad_norm": 2.9070980548858643, + "learning_rate": 4.997102710162107e-05, + "loss": 6.3247, + "step": 2578 + }, + { + "epoch": 0.015338043581691883, + "grad_norm": 2.2583134174346924, + "learning_rate": 4.997100461584933e-05, + "loss": 6.3241, + "step": 2579 + }, + { + "epoch": 0.01534399086497288, + "grad_norm": 2.1661887168884277, + "learning_rate": 4.997098212136045e-05, + "loss": 6.173, + "step": 2580 + }, + { + "epoch": 0.015349938148253878, + "grad_norm": 2.146256446838379, + "learning_rate": 4.997095961815448e-05, + "loss": 6.2267, + "step": 2581 + }, + { + "epoch": 0.015355885431534875, + "grad_norm": 2.5691211223602295, + "learning_rate": 4.997093710623139e-05, + "loss": 6.3302, + "step": 2582 + }, + { + "epoch": 0.015361832714815872, + "grad_norm": 2.5439505577087402, + "learning_rate": 4.997091458559121e-05, + "loss": 6.2111, + "step": 2583 + }, + { + "epoch": 0.015367779998096869, + "grad_norm": 2.451582670211792, + "learning_rate": 4.997089205623394e-05, + "loss": 6.2369, + "step": 2584 + }, + { + "epoch": 0.015373727281377867, + "grad_norm": 2.6275687217712402, + "learning_rate": 4.99708695181596e-05, + "loss": 6.1104, + "step": 2585 + }, + { + "epoch": 0.015379674564658864, + "grad_norm": 2.7068562507629395, + "learning_rate": 4.997084697136818e-05, + "loss": 6.1646, + "step": 2586 + }, + { + "epoch": 0.015385621847939861, + "grad_norm": 2.7819957733154297, + "learning_rate": 4.9970824415859694e-05, + "loss": 6.4203, + "step": 2587 + }, + { + "epoch": 0.015391569131220858, + "grad_norm": 2.7021708488464355, + "learning_rate": 4.9970801851634154e-05, + "loss": 6.1535, + "step": 2588 + }, + { + "epoch": 0.015397516414501855, + "grad_norm": 2.50740909576416, + "learning_rate": 4.997077927869156e-05, + "loss": 6.0139, + "step": 2589 + }, + { + "epoch": 0.015403463697782853, + "grad_norm": 2.5769078731536865, + "learning_rate": 4.997075669703193e-05, + "loss": 6.129, + "step": 2590 + }, + { + "epoch": 0.01540941098106385, + "grad_norm": 2.7379090785980225, + "learning_rate": 4.997073410665526e-05, + "loss": 6.4168, + "step": 2591 + }, + { + "epoch": 0.015415358264344847, + "grad_norm": 2.3530659675598145, + "learning_rate": 4.9970711507561565e-05, + "loss": 6.3114, + "step": 2592 + }, + { + "epoch": 0.015421305547625844, + "grad_norm": 2.6025893688201904, + "learning_rate": 4.997068889975086e-05, + "loss": 6.2506, + "step": 2593 + }, + { + "epoch": 0.015427252830906842, + "grad_norm": 2.311833143234253, + "learning_rate": 4.9970666283223145e-05, + "loss": 6.3372, + "step": 2594 + }, + { + "epoch": 0.015433200114187839, + "grad_norm": 2.339947462081909, + "learning_rate": 4.997064365797842e-05, + "loss": 6.2987, + "step": 2595 + }, + { + "epoch": 0.015439147397468836, + "grad_norm": 2.2132725715637207, + "learning_rate": 4.9970621024016714e-05, + "loss": 6.2473, + "step": 2596 + }, + { + "epoch": 0.015445094680749833, + "grad_norm": 2.7063987255096436, + "learning_rate": 4.9970598381338014e-05, + "loss": 6.1702, + "step": 2597 + }, + { + "epoch": 0.015451041964030831, + "grad_norm": 2.4952430725097656, + "learning_rate": 4.9970575729942335e-05, + "loss": 6.3301, + "step": 2598 + }, + { + "epoch": 0.015456989247311828, + "grad_norm": 2.7442502975463867, + "learning_rate": 4.997055306982969e-05, + "loss": 6.1922, + "step": 2599 + }, + { + "epoch": 0.015462936530592825, + "grad_norm": 2.860058069229126, + "learning_rate": 4.997053040100008e-05, + "loss": 6.0674, + "step": 2600 + }, + { + "epoch": 0.015468883813873822, + "grad_norm": 2.821620464324951, + "learning_rate": 4.997050772345352e-05, + "loss": 6.0445, + "step": 2601 + }, + { + "epoch": 0.01547483109715482, + "grad_norm": 2.369174003601074, + "learning_rate": 4.997048503719001e-05, + "loss": 5.8641, + "step": 2602 + }, + { + "epoch": 0.015480778380435817, + "grad_norm": 2.2836029529571533, + "learning_rate": 4.997046234220956e-05, + "loss": 5.7629, + "step": 2603 + }, + { + "epoch": 0.015486725663716814, + "grad_norm": 3.13094162940979, + "learning_rate": 4.997043963851218e-05, + "loss": 6.7871, + "step": 2604 + }, + { + "epoch": 0.01549267294699781, + "grad_norm": 2.884119749069214, + "learning_rate": 4.9970416926097885e-05, + "loss": 6.1079, + "step": 2605 + }, + { + "epoch": 0.01549862023027881, + "grad_norm": 3.0921716690063477, + "learning_rate": 4.997039420496666e-05, + "loss": 5.9221, + "step": 2606 + }, + { + "epoch": 0.015504567513559806, + "grad_norm": 2.6903741359710693, + "learning_rate": 4.997037147511855e-05, + "loss": 5.7377, + "step": 2607 + }, + { + "epoch": 0.015510514796840803, + "grad_norm": 2.177030086517334, + "learning_rate": 4.997034873655352e-05, + "loss": 5.7272, + "step": 2608 + }, + { + "epoch": 0.0155164620801218, + "grad_norm": 2.41406512260437, + "learning_rate": 4.997032598927162e-05, + "loss": 5.6456, + "step": 2609 + }, + { + "epoch": 0.015522409363402798, + "grad_norm": 2.6853182315826416, + "learning_rate": 4.997030323327282e-05, + "loss": 6.1634, + "step": 2610 + }, + { + "epoch": 0.015528356646683795, + "grad_norm": 2.734081983566284, + "learning_rate": 4.997028046855715e-05, + "loss": 6.1366, + "step": 2611 + }, + { + "epoch": 0.015534303929964792, + "grad_norm": 2.234046459197998, + "learning_rate": 4.997025769512461e-05, + "loss": 5.6773, + "step": 2612 + }, + { + "epoch": 0.015540251213245789, + "grad_norm": 2.467381715774536, + "learning_rate": 4.9970234912975226e-05, + "loss": 5.6409, + "step": 2613 + }, + { + "epoch": 0.015546198496526787, + "grad_norm": 2.4890551567077637, + "learning_rate": 4.997021212210897e-05, + "loss": 5.5961, + "step": 2614 + }, + { + "epoch": 0.015552145779807784, + "grad_norm": 2.254138708114624, + "learning_rate": 4.997018932252588e-05, + "loss": 5.6039, + "step": 2615 + }, + { + "epoch": 0.015558093063088781, + "grad_norm": 2.5773816108703613, + "learning_rate": 4.9970166514225955e-05, + "loss": 5.9935, + "step": 2616 + }, + { + "epoch": 0.015564040346369778, + "grad_norm": 2.308300733566284, + "learning_rate": 4.997014369720921e-05, + "loss": 5.8307, + "step": 2617 + }, + { + "epoch": 0.015569987629650776, + "grad_norm": 2.3276724815368652, + "learning_rate": 4.9970120871475634e-05, + "loss": 5.5819, + "step": 2618 + }, + { + "epoch": 0.015575934912931773, + "grad_norm": 2.7989203929901123, + "learning_rate": 4.997009803702526e-05, + "loss": 6.0816, + "step": 2619 + }, + { + "epoch": 0.01558188219621277, + "grad_norm": 2.5614469051361084, + "learning_rate": 4.997007519385807e-05, + "loss": 5.6677, + "step": 2620 + }, + { + "epoch": 0.015587829479493767, + "grad_norm": 2.4494402408599854, + "learning_rate": 4.9970052341974096e-05, + "loss": 5.7754, + "step": 2621 + }, + { + "epoch": 0.015593776762774764, + "grad_norm": 2.214578151702881, + "learning_rate": 4.997002948137333e-05, + "loss": 6.4244, + "step": 2622 + }, + { + "epoch": 0.015599724046055762, + "grad_norm": 2.8115196228027344, + "learning_rate": 4.9970006612055776e-05, + "loss": 5.9822, + "step": 2623 + }, + { + "epoch": 0.015605671329336759, + "grad_norm": 2.4020626544952393, + "learning_rate": 4.996998373402146e-05, + "loss": 6.0481, + "step": 2624 + }, + { + "epoch": 0.015611618612617756, + "grad_norm": 2.3936421871185303, + "learning_rate": 4.996996084727038e-05, + "loss": 6.0663, + "step": 2625 + }, + { + "epoch": 0.015617565895898753, + "grad_norm": 2.2710554599761963, + "learning_rate": 4.996993795180254e-05, + "loss": 6.0668, + "step": 2626 + }, + { + "epoch": 0.015623513179179751, + "grad_norm": 2.141789436340332, + "learning_rate": 4.9969915047617955e-05, + "loss": 6.2159, + "step": 2627 + }, + { + "epoch": 0.015629460462460748, + "grad_norm": 2.557889461517334, + "learning_rate": 4.9969892134716635e-05, + "loss": 6.262, + "step": 2628 + }, + { + "epoch": 0.015635407745741747, + "grad_norm": 2.3966641426086426, + "learning_rate": 4.9969869213098574e-05, + "loss": 6.0412, + "step": 2629 + }, + { + "epoch": 0.01564135502902274, + "grad_norm": 2.301426410675049, + "learning_rate": 4.99698462827638e-05, + "loss": 6.0798, + "step": 2630 + }, + { + "epoch": 0.01564730231230374, + "grad_norm": 2.4315614700317383, + "learning_rate": 4.996982334371231e-05, + "loss": 5.8736, + "step": 2631 + }, + { + "epoch": 0.015653249595584735, + "grad_norm": 2.5549440383911133, + "learning_rate": 4.9969800395944105e-05, + "loss": 5.7858, + "step": 2632 + }, + { + "epoch": 0.015659196878865734, + "grad_norm": 2.480375289916992, + "learning_rate": 4.99697774394592e-05, + "loss": 6.3261, + "step": 2633 + }, + { + "epoch": 0.015665144162146732, + "grad_norm": 2.42866849899292, + "learning_rate": 4.9969754474257614e-05, + "loss": 6.1729, + "step": 2634 + }, + { + "epoch": 0.015671091445427728, + "grad_norm": 2.32722544670105, + "learning_rate": 4.9969731500339335e-05, + "loss": 5.7746, + "step": 2635 + }, + { + "epoch": 0.015677038728708726, + "grad_norm": 2.6797266006469727, + "learning_rate": 4.996970851770438e-05, + "loss": 6.1657, + "step": 2636 + }, + { + "epoch": 0.015682986011989725, + "grad_norm": 2.87758731842041, + "learning_rate": 4.9969685526352775e-05, + "loss": 6.1475, + "step": 2637 + }, + { + "epoch": 0.01568893329527072, + "grad_norm": 2.898663282394409, + "learning_rate": 4.996966252628449e-05, + "loss": 6.2942, + "step": 2638 + }, + { + "epoch": 0.01569488057855172, + "grad_norm": 3.3087987899780273, + "learning_rate": 4.996963951749957e-05, + "loss": 5.9962, + "step": 2639 + }, + { + "epoch": 0.015700827861832713, + "grad_norm": 2.4418020248413086, + "learning_rate": 4.996961649999799e-05, + "loss": 6.1065, + "step": 2640 + }, + { + "epoch": 0.015706775145113712, + "grad_norm": 2.5839014053344727, + "learning_rate": 4.9969593473779786e-05, + "loss": 6.2303, + "step": 2641 + }, + { + "epoch": 0.01571272242839471, + "grad_norm": 2.683163642883301, + "learning_rate": 4.996957043884495e-05, + "loss": 5.7194, + "step": 2642 + }, + { + "epoch": 0.015718669711675706, + "grad_norm": 2.628574848175049, + "learning_rate": 4.99695473951935e-05, + "loss": 5.6239, + "step": 2643 + }, + { + "epoch": 0.015724616994956704, + "grad_norm": 3.0716800689697266, + "learning_rate": 4.9969524342825434e-05, + "loss": 6.1957, + "step": 2644 + }, + { + "epoch": 0.015730564278237703, + "grad_norm": 2.415626287460327, + "learning_rate": 4.996950128174077e-05, + "loss": 6.2953, + "step": 2645 + }, + { + "epoch": 0.015736511561518698, + "grad_norm": 2.6836612224578857, + "learning_rate": 4.996947821193951e-05, + "loss": 6.103, + "step": 2646 + }, + { + "epoch": 0.015742458844799696, + "grad_norm": 2.2673206329345703, + "learning_rate": 4.996945513342166e-05, + "loss": 6.2628, + "step": 2647 + }, + { + "epoch": 0.01574840612808069, + "grad_norm": 2.629955530166626, + "learning_rate": 4.996943204618724e-05, + "loss": 6.2444, + "step": 2648 + }, + { + "epoch": 0.01575435341136169, + "grad_norm": 2.6730127334594727, + "learning_rate": 4.996940895023623e-05, + "loss": 6.0595, + "step": 2649 + }, + { + "epoch": 0.01576030069464269, + "grad_norm": 2.607389450073242, + "learning_rate": 4.996938584556867e-05, + "loss": 6.0253, + "step": 2650 + }, + { + "epoch": 0.015766247977923684, + "grad_norm": 2.264345407485962, + "learning_rate": 4.996936273218456e-05, + "loss": 6.1011, + "step": 2651 + }, + { + "epoch": 0.015772195261204682, + "grad_norm": 2.218766450881958, + "learning_rate": 4.99693396100839e-05, + "loss": 6.0545, + "step": 2652 + }, + { + "epoch": 0.015778142544485677, + "grad_norm": 2.435213088989258, + "learning_rate": 4.99693164792667e-05, + "loss": 6.0679, + "step": 2653 + }, + { + "epoch": 0.015784089827766676, + "grad_norm": 2.2278120517730713, + "learning_rate": 4.996929333973297e-05, + "loss": 6.0864, + "step": 2654 + }, + { + "epoch": 0.015790037111047674, + "grad_norm": 1.983554482460022, + "learning_rate": 4.9969270191482715e-05, + "loss": 6.124, + "step": 2655 + }, + { + "epoch": 0.01579598439432867, + "grad_norm": 1.9382312297821045, + "learning_rate": 4.996924703451594e-05, + "loss": 6.392, + "step": 2656 + }, + { + "epoch": 0.015801931677609668, + "grad_norm": 2.8142831325531006, + "learning_rate": 4.9969223868832674e-05, + "loss": 6.017, + "step": 2657 + }, + { + "epoch": 0.015807878960890667, + "grad_norm": 2.3466787338256836, + "learning_rate": 4.9969200694432904e-05, + "loss": 5.9588, + "step": 2658 + }, + { + "epoch": 0.01581382624417166, + "grad_norm": 2.0172243118286133, + "learning_rate": 4.996917751131664e-05, + "loss": 5.9513, + "step": 2659 + }, + { + "epoch": 0.01581977352745266, + "grad_norm": 2.3778223991394043, + "learning_rate": 4.99691543194839e-05, + "loss": 6.2205, + "step": 2660 + }, + { + "epoch": 0.015825720810733655, + "grad_norm": 2.4351084232330322, + "learning_rate": 4.9969131118934675e-05, + "loss": 6.0916, + "step": 2661 + }, + { + "epoch": 0.015831668094014654, + "grad_norm": 2.22328519821167, + "learning_rate": 4.9969107909669e-05, + "loss": 6.5546, + "step": 2662 + }, + { + "epoch": 0.015837615377295652, + "grad_norm": 2.4626407623291016, + "learning_rate": 4.996908469168685e-05, + "loss": 6.522, + "step": 2663 + }, + { + "epoch": 0.015843562660576647, + "grad_norm": 2.1032283306121826, + "learning_rate": 4.9969061464988266e-05, + "loss": 6.3372, + "step": 2664 + }, + { + "epoch": 0.015849509943857646, + "grad_norm": 2.1436524391174316, + "learning_rate": 4.9969038229573236e-05, + "loss": 6.3792, + "step": 2665 + }, + { + "epoch": 0.015855457227138645, + "grad_norm": 2.42084002494812, + "learning_rate": 4.996901498544176e-05, + "loss": 6.701, + "step": 2666 + }, + { + "epoch": 0.01586140451041964, + "grad_norm": 2.854630947113037, + "learning_rate": 4.996899173259388e-05, + "loss": 6.3273, + "step": 2667 + }, + { + "epoch": 0.015867351793700638, + "grad_norm": 2.2480521202087402, + "learning_rate": 4.996896847102957e-05, + "loss": 6.4314, + "step": 2668 + }, + { + "epoch": 0.015873299076981633, + "grad_norm": 3.7074203491210938, + "learning_rate": 4.996894520074886e-05, + "loss": 5.9438, + "step": 2669 + }, + { + "epoch": 0.015879246360262632, + "grad_norm": 3.1037209033966064, + "learning_rate": 4.9968921921751735e-05, + "loss": 5.7915, + "step": 2670 + }, + { + "epoch": 0.01588519364354363, + "grad_norm": 2.8338170051574707, + "learning_rate": 4.996889863403823e-05, + "loss": 6.7765, + "step": 2671 + }, + { + "epoch": 0.015891140926824626, + "grad_norm": 2.6366934776306152, + "learning_rate": 4.996887533760833e-05, + "loss": 6.8019, + "step": 2672 + }, + { + "epoch": 0.015897088210105624, + "grad_norm": 2.3954126834869385, + "learning_rate": 4.996885203246207e-05, + "loss": 6.3946, + "step": 2673 + }, + { + "epoch": 0.015903035493386623, + "grad_norm": 2.5771238803863525, + "learning_rate": 4.996882871859943e-05, + "loss": 6.3767, + "step": 2674 + }, + { + "epoch": 0.015908982776667618, + "grad_norm": 3.8544304370880127, + "learning_rate": 4.9968805396020424e-05, + "loss": 7.0813, + "step": 2675 + }, + { + "epoch": 0.015914930059948616, + "grad_norm": 3.4221606254577637, + "learning_rate": 4.996878206472507e-05, + "loss": 6.4782, + "step": 2676 + }, + { + "epoch": 0.01592087734322961, + "grad_norm": 3.6425843238830566, + "learning_rate": 4.996875872471338e-05, + "loss": 5.8685, + "step": 2677 + }, + { + "epoch": 0.01592682462651061, + "grad_norm": 3.255345344543457, + "learning_rate": 4.996873537598535e-05, + "loss": 5.7099, + "step": 2678 + }, + { + "epoch": 0.01593277190979161, + "grad_norm": 2.5217175483703613, + "learning_rate": 4.9968712018540997e-05, + "loss": 5.8978, + "step": 2679 + }, + { + "epoch": 0.015938719193072604, + "grad_norm": 2.2415871620178223, + "learning_rate": 4.996868865238031e-05, + "loss": 6.8186, + "step": 2680 + }, + { + "epoch": 0.015944666476353602, + "grad_norm": 2.1412270069122314, + "learning_rate": 4.996866527750332e-05, + "loss": 6.8056, + "step": 2681 + }, + { + "epoch": 0.015950613759634597, + "grad_norm": 2.423093557357788, + "learning_rate": 4.996864189391004e-05, + "loss": 7.0769, + "step": 2682 + }, + { + "epoch": 0.015956561042915596, + "grad_norm": 2.2334039211273193, + "learning_rate": 4.9968618501600454e-05, + "loss": 6.9954, + "step": 2683 + }, + { + "epoch": 0.015962508326196594, + "grad_norm": 2.4311838150024414, + "learning_rate": 4.996859510057458e-05, + "loss": 6.8375, + "step": 2684 + }, + { + "epoch": 0.01596845560947759, + "grad_norm": 4.861137866973877, + "learning_rate": 4.996857169083242e-05, + "loss": 6.2628, + "step": 2685 + }, + { + "epoch": 0.015974402892758588, + "grad_norm": 3.064213991165161, + "learning_rate": 4.996854827237401e-05, + "loss": 6.4316, + "step": 2686 + }, + { + "epoch": 0.015980350176039586, + "grad_norm": 2.307011365890503, + "learning_rate": 4.996852484519932e-05, + "loss": 6.6212, + "step": 2687 + }, + { + "epoch": 0.01598629745932058, + "grad_norm": 2.5157034397125244, + "learning_rate": 4.9968501409308374e-05, + "loss": 7.153, + "step": 2688 + }, + { + "epoch": 0.01599224474260158, + "grad_norm": 2.4122424125671387, + "learning_rate": 4.996847796470119e-05, + "loss": 7.2244, + "step": 2689 + }, + { + "epoch": 0.015998192025882575, + "grad_norm": 2.305055618286133, + "learning_rate": 4.9968454511377773e-05, + "loss": 7.4751, + "step": 2690 + }, + { + "epoch": 0.016004139309163574, + "grad_norm": 3.068027973175049, + "learning_rate": 4.9968431049338116e-05, + "loss": 6.5709, + "step": 2691 + }, + { + "epoch": 0.016010086592444572, + "grad_norm": 2.09893798828125, + "learning_rate": 4.9968407578582246e-05, + "loss": 6.7212, + "step": 2692 + }, + { + "epoch": 0.016016033875725567, + "grad_norm": 2.3161933422088623, + "learning_rate": 4.9968384099110163e-05, + "loss": 6.6243, + "step": 2693 + }, + { + "epoch": 0.016021981159006566, + "grad_norm": 2.913304090499878, + "learning_rate": 4.9968360610921874e-05, + "loss": 6.1946, + "step": 2694 + }, + { + "epoch": 0.016027928442287565, + "grad_norm": 2.746368408203125, + "learning_rate": 4.9968337114017386e-05, + "loss": 6.3783, + "step": 2695 + }, + { + "epoch": 0.01603387572556856, + "grad_norm": 2.40331768989563, + "learning_rate": 4.9968313608396705e-05, + "loss": 6.9898, + "step": 2696 + }, + { + "epoch": 0.016039823008849558, + "grad_norm": 2.214869976043701, + "learning_rate": 4.9968290094059844e-05, + "loss": 6.4497, + "step": 2697 + }, + { + "epoch": 0.016045770292130553, + "grad_norm": 2.050436019897461, + "learning_rate": 4.996826657100682e-05, + "loss": 6.8897, + "step": 2698 + }, + { + "epoch": 0.016051717575411552, + "grad_norm": 2.294149398803711, + "learning_rate": 4.996824303923763e-05, + "loss": 6.5583, + "step": 2699 + }, + { + "epoch": 0.01605766485869255, + "grad_norm": 2.26918625831604, + "learning_rate": 4.996821949875228e-05, + "loss": 6.7411, + "step": 2700 + }, + { + "epoch": 0.016063612141973545, + "grad_norm": 2.1330158710479736, + "learning_rate": 4.9968195949550775e-05, + "loss": 6.8068, + "step": 2701 + }, + { + "epoch": 0.016069559425254544, + "grad_norm": 1.8605769872665405, + "learning_rate": 4.996817239163315e-05, + "loss": 6.4833, + "step": 2702 + }, + { + "epoch": 0.016075506708535543, + "grad_norm": 3.132803440093994, + "learning_rate": 4.996814882499938e-05, + "loss": 5.8281, + "step": 2703 + }, + { + "epoch": 0.016081453991816538, + "grad_norm": 3.1079390048980713, + "learning_rate": 4.996812524964949e-05, + "loss": 5.6894, + "step": 2704 + }, + { + "epoch": 0.016087401275097536, + "grad_norm": 2.2877023220062256, + "learning_rate": 4.996810166558349e-05, + "loss": 7.0128, + "step": 2705 + }, + { + "epoch": 0.01609334855837853, + "grad_norm": 2.415696859359741, + "learning_rate": 4.996807807280138e-05, + "loss": 6.8098, + "step": 2706 + }, + { + "epoch": 0.01609929584165953, + "grad_norm": 2.342111110687256, + "learning_rate": 4.996805447130317e-05, + "loss": 7.2452, + "step": 2707 + }, + { + "epoch": 0.01610524312494053, + "grad_norm": 2.6504852771759033, + "learning_rate": 4.996803086108887e-05, + "loss": 6.6731, + "step": 2708 + }, + { + "epoch": 0.016111190408221523, + "grad_norm": 2.6157166957855225, + "learning_rate": 4.996800724215849e-05, + "loss": 6.9377, + "step": 2709 + }, + { + "epoch": 0.016117137691502522, + "grad_norm": 2.6289443969726562, + "learning_rate": 4.9967983614512036e-05, + "loss": 6.639, + "step": 2710 + }, + { + "epoch": 0.01612308497478352, + "grad_norm": 2.966489791870117, + "learning_rate": 4.996795997814952e-05, + "loss": 6.3681, + "step": 2711 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 3.7333364486694336, + "learning_rate": 4.9967936333070944e-05, + "loss": 5.6015, + "step": 2712 + }, + { + "epoch": 0.016134979541345514, + "grad_norm": 2.942728281021118, + "learning_rate": 4.9967912679276316e-05, + "loss": 5.6548, + "step": 2713 + }, + { + "epoch": 0.01614092682462651, + "grad_norm": 2.394622802734375, + "learning_rate": 4.996788901676566e-05, + "loss": 6.5119, + "step": 2714 + }, + { + "epoch": 0.016146874107907508, + "grad_norm": 2.8388447761535645, + "learning_rate": 4.9967865345538963e-05, + "loss": 6.4424, + "step": 2715 + }, + { + "epoch": 0.016152821391188506, + "grad_norm": 2.7682905197143555, + "learning_rate": 4.9967841665596245e-05, + "loss": 6.4688, + "step": 2716 + }, + { + "epoch": 0.0161587686744695, + "grad_norm": 3.0281460285186768, + "learning_rate": 4.996781797693751e-05, + "loss": 6.52, + "step": 2717 + }, + { + "epoch": 0.0161647159577505, + "grad_norm": 2.9734318256378174, + "learning_rate": 4.996779427956276e-05, + "loss": 6.4307, + "step": 2718 + }, + { + "epoch": 0.016170663241031495, + "grad_norm": 2.7653586864471436, + "learning_rate": 4.996777057347202e-05, + "loss": 6.1783, + "step": 2719 + }, + { + "epoch": 0.016176610524312494, + "grad_norm": 2.9418516159057617, + "learning_rate": 4.996774685866529e-05, + "loss": 6.5466, + "step": 2720 + }, + { + "epoch": 0.016182557807593492, + "grad_norm": 2.789217233657837, + "learning_rate": 4.996772313514258e-05, + "loss": 6.9296, + "step": 2721 + }, + { + "epoch": 0.016188505090874487, + "grad_norm": 2.8092539310455322, + "learning_rate": 4.996769940290389e-05, + "loss": 6.6186, + "step": 2722 + }, + { + "epoch": 0.016194452374155486, + "grad_norm": 2.696572780609131, + "learning_rate": 4.996767566194923e-05, + "loss": 6.5361, + "step": 2723 + }, + { + "epoch": 0.016200399657436484, + "grad_norm": 2.5987300872802734, + "learning_rate": 4.996765191227862e-05, + "loss": 6.4029, + "step": 2724 + }, + { + "epoch": 0.01620634694071748, + "grad_norm": 2.083057165145874, + "learning_rate": 4.996762815389205e-05, + "loss": 6.4747, + "step": 2725 + }, + { + "epoch": 0.016212294223998478, + "grad_norm": 2.912338972091675, + "learning_rate": 4.9967604386789555e-05, + "loss": 6.8869, + "step": 2726 + }, + { + "epoch": 0.016218241507279473, + "grad_norm": 2.642224073410034, + "learning_rate": 4.9967580610971124e-05, + "loss": 6.6701, + "step": 2727 + }, + { + "epoch": 0.016224188790560472, + "grad_norm": 2.673652410507202, + "learning_rate": 4.996755682643676e-05, + "loss": 6.8624, + "step": 2728 + }, + { + "epoch": 0.01623013607384147, + "grad_norm": 2.5223872661590576, + "learning_rate": 4.996753303318648e-05, + "loss": 6.8247, + "step": 2729 + }, + { + "epoch": 0.016236083357122465, + "grad_norm": 2.252037525177002, + "learning_rate": 4.99675092312203e-05, + "loss": 6.7924, + "step": 2730 + }, + { + "epoch": 0.016242030640403464, + "grad_norm": 2.2854461669921875, + "learning_rate": 4.9967485420538216e-05, + "loss": 6.4761, + "step": 2731 + }, + { + "epoch": 0.016247977923684463, + "grad_norm": 2.426912546157837, + "learning_rate": 4.9967461601140244e-05, + "loss": 6.6028, + "step": 2732 + }, + { + "epoch": 0.016253925206965458, + "grad_norm": 2.7375681400299072, + "learning_rate": 4.9967437773026384e-05, + "loss": 6.5283, + "step": 2733 + }, + { + "epoch": 0.016259872490246456, + "grad_norm": 2.7669689655303955, + "learning_rate": 4.996741393619665e-05, + "loss": 6.4382, + "step": 2734 + }, + { + "epoch": 0.01626581977352745, + "grad_norm": 2.294597864151001, + "learning_rate": 4.996739009065105e-05, + "loss": 6.7479, + "step": 2735 + }, + { + "epoch": 0.01627176705680845, + "grad_norm": 2.4791014194488525, + "learning_rate": 4.996736623638959e-05, + "loss": 6.7043, + "step": 2736 + }, + { + "epoch": 0.01627771434008945, + "grad_norm": 2.4080021381378174, + "learning_rate": 4.9967342373412286e-05, + "loss": 6.6046, + "step": 2737 + }, + { + "epoch": 0.016283661623370443, + "grad_norm": 2.463109254837036, + "learning_rate": 4.996731850171914e-05, + "loss": 6.3895, + "step": 2738 + }, + { + "epoch": 0.016289608906651442, + "grad_norm": 2.665908098220825, + "learning_rate": 4.9967294621310155e-05, + "loss": 6.6482, + "step": 2739 + }, + { + "epoch": 0.01629555618993244, + "grad_norm": 2.399526357650757, + "learning_rate": 4.996727073218536e-05, + "loss": 6.7098, + "step": 2740 + }, + { + "epoch": 0.016301503473213436, + "grad_norm": 2.678091287612915, + "learning_rate": 4.996724683434473e-05, + "loss": 6.419, + "step": 2741 + }, + { + "epoch": 0.016307450756494434, + "grad_norm": 2.5573642253875732, + "learning_rate": 4.99672229277883e-05, + "loss": 6.4703, + "step": 2742 + }, + { + "epoch": 0.01631339803977543, + "grad_norm": 2.644097089767456, + "learning_rate": 4.996719901251607e-05, + "loss": 5.9854, + "step": 2743 + }, + { + "epoch": 0.016319345323056428, + "grad_norm": 2.6165592670440674, + "learning_rate": 4.996717508852805e-05, + "loss": 6.1776, + "step": 2744 + }, + { + "epoch": 0.016325292606337426, + "grad_norm": 2.175647020339966, + "learning_rate": 4.996715115582426e-05, + "loss": 6.5533, + "step": 2745 + }, + { + "epoch": 0.01633123988961842, + "grad_norm": 2.112217664718628, + "learning_rate": 4.996712721440467e-05, + "loss": 6.5572, + "step": 2746 + }, + { + "epoch": 0.01633718717289942, + "grad_norm": 2.165111541748047, + "learning_rate": 4.996710326426933e-05, + "loss": 6.2798, + "step": 2747 + }, + { + "epoch": 0.016343134456180415, + "grad_norm": 2.5812315940856934, + "learning_rate": 4.996707930541823e-05, + "loss": 6.0831, + "step": 2748 + }, + { + "epoch": 0.016349081739461414, + "grad_norm": 2.2306227684020996, + "learning_rate": 4.996705533785138e-05, + "loss": 6.5833, + "step": 2749 + }, + { + "epoch": 0.016355029022742412, + "grad_norm": 1.999974250793457, + "learning_rate": 4.996703136156878e-05, + "loss": 6.2461, + "step": 2750 + }, + { + "epoch": 0.016360976306023407, + "grad_norm": 2.0521416664123535, + "learning_rate": 4.996700737657046e-05, + "loss": 6.4606, + "step": 2751 + }, + { + "epoch": 0.016366923589304406, + "grad_norm": 1.8630053997039795, + "learning_rate": 4.996698338285642e-05, + "loss": 6.1375, + "step": 2752 + }, + { + "epoch": 0.016372870872585404, + "grad_norm": 1.7525913715362549, + "learning_rate": 4.9966959380426646e-05, + "loss": 6.1769, + "step": 2753 + }, + { + "epoch": 0.0163788181558664, + "grad_norm": 2.8151230812072754, + "learning_rate": 4.996693536928118e-05, + "loss": 5.9066, + "step": 2754 + }, + { + "epoch": 0.016384765439147398, + "grad_norm": 2.503230571746826, + "learning_rate": 4.9966911349420004e-05, + "loss": 6.3725, + "step": 2755 + }, + { + "epoch": 0.016390712722428393, + "grad_norm": 2.676284074783325, + "learning_rate": 4.996688732084314e-05, + "loss": 6.9086, + "step": 2756 + }, + { + "epoch": 0.01639666000570939, + "grad_norm": 2.3367252349853516, + "learning_rate": 4.99668632835506e-05, + "loss": 6.1323, + "step": 2757 + }, + { + "epoch": 0.01640260728899039, + "grad_norm": 3.3071084022521973, + "learning_rate": 4.996683923754237e-05, + "loss": 6.162, + "step": 2758 + }, + { + "epoch": 0.016408554572271385, + "grad_norm": 2.64388370513916, + "learning_rate": 4.9966815182818494e-05, + "loss": 6.171, + "step": 2759 + }, + { + "epoch": 0.016414501855552384, + "grad_norm": 2.2378199100494385, + "learning_rate": 4.996679111937895e-05, + "loss": 6.4466, + "step": 2760 + }, + { + "epoch": 0.016420449138833382, + "grad_norm": 2.5944395065307617, + "learning_rate": 4.996676704722376e-05, + "loss": 6.7034, + "step": 2761 + }, + { + "epoch": 0.016426396422114378, + "grad_norm": 2.768211841583252, + "learning_rate": 4.996674296635293e-05, + "loss": 6.7551, + "step": 2762 + }, + { + "epoch": 0.016432343705395376, + "grad_norm": 2.80188250541687, + "learning_rate": 4.9966718876766467e-05, + "loss": 6.8437, + "step": 2763 + }, + { + "epoch": 0.01643829098867637, + "grad_norm": 2.2422847747802734, + "learning_rate": 4.996669477846438e-05, + "loss": 6.5365, + "step": 2764 + }, + { + "epoch": 0.01644423827195737, + "grad_norm": 2.526724100112915, + "learning_rate": 4.996667067144668e-05, + "loss": 6.3735, + "step": 2765 + }, + { + "epoch": 0.01645018555523837, + "grad_norm": 3.2267372608184814, + "learning_rate": 4.996664655571337e-05, + "loss": 6.0508, + "step": 2766 + }, + { + "epoch": 0.016456132838519363, + "grad_norm": 3.393270969390869, + "learning_rate": 4.996662243126446e-05, + "loss": 6.5543, + "step": 2767 + }, + { + "epoch": 0.016462080121800362, + "grad_norm": 2.7712342739105225, + "learning_rate": 4.996659829809996e-05, + "loss": 6.5891, + "step": 2768 + }, + { + "epoch": 0.01646802740508136, + "grad_norm": 2.5687179565429688, + "learning_rate": 4.996657415621988e-05, + "loss": 6.464, + "step": 2769 + }, + { + "epoch": 0.016473974688362356, + "grad_norm": 3.059953451156616, + "learning_rate": 4.996655000562424e-05, + "loss": 6.4286, + "step": 2770 + }, + { + "epoch": 0.016479921971643354, + "grad_norm": 3.3729803562164307, + "learning_rate": 4.9966525846313015e-05, + "loss": 6.5937, + "step": 2771 + }, + { + "epoch": 0.01648586925492435, + "grad_norm": 2.907397985458374, + "learning_rate": 4.996650167828624e-05, + "loss": 6.2559, + "step": 2772 + }, + { + "epoch": 0.016491816538205348, + "grad_norm": 3.5011706352233887, + "learning_rate": 4.996647750154392e-05, + "loss": 5.7897, + "step": 2773 + }, + { + "epoch": 0.016497763821486346, + "grad_norm": 2.5495986938476562, + "learning_rate": 4.996645331608607e-05, + "loss": 6.688, + "step": 2774 + }, + { + "epoch": 0.01650371110476734, + "grad_norm": 2.486416816711426, + "learning_rate": 4.9966429121912675e-05, + "loss": 6.8169, + "step": 2775 + }, + { + "epoch": 0.01650965838804834, + "grad_norm": 2.272162437438965, + "learning_rate": 4.9966404919023755e-05, + "loss": 6.696, + "step": 2776 + }, + { + "epoch": 0.016515605671329335, + "grad_norm": 2.9408323764801025, + "learning_rate": 4.9966380707419334e-05, + "loss": 6.1711, + "step": 2777 + }, + { + "epoch": 0.016521552954610334, + "grad_norm": 3.361907958984375, + "learning_rate": 4.99663564870994e-05, + "loss": 5.6029, + "step": 2778 + }, + { + "epoch": 0.016527500237891332, + "grad_norm": 3.06835675239563, + "learning_rate": 4.996633225806397e-05, + "loss": 5.332, + "step": 2779 + }, + { + "epoch": 0.016533447521172327, + "grad_norm": 3.058638572692871, + "learning_rate": 4.9966308020313054e-05, + "loss": 6.3345, + "step": 2780 + }, + { + "epoch": 0.016539394804453326, + "grad_norm": 2.8265507221221924, + "learning_rate": 4.9966283773846654e-05, + "loss": 5.4231, + "step": 2781 + }, + { + "epoch": 0.016545342087734324, + "grad_norm": 3.128094434738159, + "learning_rate": 4.996625951866478e-05, + "loss": 5.4144, + "step": 2782 + }, + { + "epoch": 0.01655128937101532, + "grad_norm": 2.6830554008483887, + "learning_rate": 4.9966235254767445e-05, + "loss": 6.0084, + "step": 2783 + }, + { + "epoch": 0.016557236654296318, + "grad_norm": 2.7146122455596924, + "learning_rate": 4.996621098215466e-05, + "loss": 6.7104, + "step": 2784 + }, + { + "epoch": 0.016563183937577313, + "grad_norm": 3.518169403076172, + "learning_rate": 4.9966186700826425e-05, + "loss": 5.4509, + "step": 2785 + }, + { + "epoch": 0.01656913122085831, + "grad_norm": 2.7607035636901855, + "learning_rate": 4.9966162410782755e-05, + "loss": 6.2149, + "step": 2786 + }, + { + "epoch": 0.01657507850413931, + "grad_norm": 2.897862195968628, + "learning_rate": 4.996613811202365e-05, + "loss": 6.4713, + "step": 2787 + }, + { + "epoch": 0.016581025787420305, + "grad_norm": 2.6984574794769287, + "learning_rate": 4.9966113804549134e-05, + "loss": 6.2298, + "step": 2788 + }, + { + "epoch": 0.016586973070701304, + "grad_norm": 2.7281908988952637, + "learning_rate": 4.996608948835919e-05, + "loss": 6.0244, + "step": 2789 + }, + { + "epoch": 0.016592920353982302, + "grad_norm": 2.314769983291626, + "learning_rate": 4.996606516345386e-05, + "loss": 6.8523, + "step": 2790 + }, + { + "epoch": 0.016598867637263297, + "grad_norm": 2.887943744659424, + "learning_rate": 4.9966040829833115e-05, + "loss": 6.8407, + "step": 2791 + }, + { + "epoch": 0.016604814920544296, + "grad_norm": 3.4924309253692627, + "learning_rate": 4.9966016487497e-05, + "loss": 6.3646, + "step": 2792 + }, + { + "epoch": 0.01661076220382529, + "grad_norm": 2.3095340728759766, + "learning_rate": 4.9965992136445495e-05, + "loss": 6.407, + "step": 2793 + }, + { + "epoch": 0.01661670948710629, + "grad_norm": 3.771980047225952, + "learning_rate": 4.9965967776678627e-05, + "loss": 6.0596, + "step": 2794 + }, + { + "epoch": 0.016622656770387288, + "grad_norm": 3.452252149581909, + "learning_rate": 4.99659434081964e-05, + "loss": 6.1351, + "step": 2795 + }, + { + "epoch": 0.016628604053668283, + "grad_norm": 2.4391021728515625, + "learning_rate": 4.996591903099881e-05, + "loss": 6.3304, + "step": 2796 + }, + { + "epoch": 0.016634551336949282, + "grad_norm": 2.7057220935821533, + "learning_rate": 4.9965894645085885e-05, + "loss": 6.8328, + "step": 2797 + }, + { + "epoch": 0.01664049862023028, + "grad_norm": 2.392627716064453, + "learning_rate": 4.996587025045762e-05, + "loss": 6.8491, + "step": 2798 + }, + { + "epoch": 0.016646445903511276, + "grad_norm": 2.47928786277771, + "learning_rate": 4.9965845847114024e-05, + "loss": 6.6323, + "step": 2799 + }, + { + "epoch": 0.016652393186792274, + "grad_norm": 2.438870668411255, + "learning_rate": 4.9965821435055115e-05, + "loss": 6.3832, + "step": 2800 + }, + { + "epoch": 0.01665834047007327, + "grad_norm": 2.6875247955322266, + "learning_rate": 4.9965797014280895e-05, + "loss": 6.6994, + "step": 2801 + }, + { + "epoch": 0.016664287753354268, + "grad_norm": 2.71785044670105, + "learning_rate": 4.996577258479137e-05, + "loss": 6.2505, + "step": 2802 + }, + { + "epoch": 0.016670235036635266, + "grad_norm": 2.32853102684021, + "learning_rate": 4.996574814658655e-05, + "loss": 6.4409, + "step": 2803 + }, + { + "epoch": 0.01667618231991626, + "grad_norm": 2.271027088165283, + "learning_rate": 4.996572369966646e-05, + "loss": 6.4928, + "step": 2804 + }, + { + "epoch": 0.01668212960319726, + "grad_norm": 2.621448278427124, + "learning_rate": 4.996569924403108e-05, + "loss": 6.7248, + "step": 2805 + }, + { + "epoch": 0.01668807688647826, + "grad_norm": 3.621654748916626, + "learning_rate": 4.9965674779680435e-05, + "loss": 6.7268, + "step": 2806 + }, + { + "epoch": 0.016694024169759254, + "grad_norm": 2.2045094966888428, + "learning_rate": 4.9965650306614534e-05, + "loss": 6.6406, + "step": 2807 + }, + { + "epoch": 0.016699971453040252, + "grad_norm": 2.4885873794555664, + "learning_rate": 4.9965625824833376e-05, + "loss": 6.611, + "step": 2808 + }, + { + "epoch": 0.016705918736321247, + "grad_norm": 2.796971082687378, + "learning_rate": 4.996560133433697e-05, + "loss": 6.455, + "step": 2809 + }, + { + "epoch": 0.016711866019602246, + "grad_norm": 2.539395570755005, + "learning_rate": 4.996557683512535e-05, + "loss": 6.8169, + "step": 2810 + }, + { + "epoch": 0.016717813302883244, + "grad_norm": 2.322824239730835, + "learning_rate": 4.99655523271985e-05, + "loss": 6.3217, + "step": 2811 + }, + { + "epoch": 0.01672376058616424, + "grad_norm": 2.4404520988464355, + "learning_rate": 4.9965527810556424e-05, + "loss": 6.5026, + "step": 2812 + }, + { + "epoch": 0.016729707869445238, + "grad_norm": 2.287362575531006, + "learning_rate": 4.996550328519915e-05, + "loss": 6.9183, + "step": 2813 + }, + { + "epoch": 0.016735655152726233, + "grad_norm": 2.369877815246582, + "learning_rate": 4.996547875112667e-05, + "loss": 6.7488, + "step": 2814 + }, + { + "epoch": 0.01674160243600723, + "grad_norm": 2.323082685470581, + "learning_rate": 4.996545420833899e-05, + "loss": 6.6177, + "step": 2815 + }, + { + "epoch": 0.01674754971928823, + "grad_norm": 2.221214532852173, + "learning_rate": 4.9965429656836145e-05, + "loss": 6.6844, + "step": 2816 + }, + { + "epoch": 0.016753497002569225, + "grad_norm": 2.246819496154785, + "learning_rate": 4.9965405096618116e-05, + "loss": 6.5631, + "step": 2817 + }, + { + "epoch": 0.016759444285850224, + "grad_norm": 2.411806583404541, + "learning_rate": 4.996538052768493e-05, + "loss": 6.4037, + "step": 2818 + }, + { + "epoch": 0.016765391569131222, + "grad_norm": 1.941197395324707, + "learning_rate": 4.996535595003658e-05, + "loss": 6.5232, + "step": 2819 + }, + { + "epoch": 0.016771338852412217, + "grad_norm": 2.149991750717163, + "learning_rate": 4.996533136367309e-05, + "loss": 6.4166, + "step": 2820 + }, + { + "epoch": 0.016777286135693216, + "grad_norm": 2.5388433933258057, + "learning_rate": 4.9965306768594454e-05, + "loss": 6.5733, + "step": 2821 + }, + { + "epoch": 0.01678323341897421, + "grad_norm": 2.1857333183288574, + "learning_rate": 4.9965282164800694e-05, + "loss": 6.5558, + "step": 2822 + }, + { + "epoch": 0.01678918070225521, + "grad_norm": 2.1090164184570312, + "learning_rate": 4.9965257552291804e-05, + "loss": 6.6916, + "step": 2823 + }, + { + "epoch": 0.016795127985536208, + "grad_norm": 2.1102349758148193, + "learning_rate": 4.9965232931067806e-05, + "loss": 6.5852, + "step": 2824 + }, + { + "epoch": 0.016801075268817203, + "grad_norm": 2.384660005569458, + "learning_rate": 4.99652083011287e-05, + "loss": 6.5033, + "step": 2825 + }, + { + "epoch": 0.016807022552098202, + "grad_norm": 2.314896821975708, + "learning_rate": 4.9965183662474504e-05, + "loss": 6.4108, + "step": 2826 + }, + { + "epoch": 0.0168129698353792, + "grad_norm": 2.4358227252960205, + "learning_rate": 4.9965159015105215e-05, + "loss": 6.5309, + "step": 2827 + }, + { + "epoch": 0.016818917118660195, + "grad_norm": 2.179905652999878, + "learning_rate": 4.9965134359020844e-05, + "loss": 6.4593, + "step": 2828 + }, + { + "epoch": 0.016824864401941194, + "grad_norm": 2.2742464542388916, + "learning_rate": 4.99651096942214e-05, + "loss": 6.6654, + "step": 2829 + }, + { + "epoch": 0.01683081168522219, + "grad_norm": 2.211026668548584, + "learning_rate": 4.9965085020706906e-05, + "loss": 6.4527, + "step": 2830 + }, + { + "epoch": 0.016836758968503188, + "grad_norm": 2.552072763442993, + "learning_rate": 4.996506033847735e-05, + "loss": 6.5338, + "step": 2831 + }, + { + "epoch": 0.016842706251784186, + "grad_norm": 2.3208038806915283, + "learning_rate": 4.996503564753276e-05, + "loss": 6.473, + "step": 2832 + }, + { + "epoch": 0.01684865353506518, + "grad_norm": 2.3756048679351807, + "learning_rate": 4.996501094787312e-05, + "loss": 6.4223, + "step": 2833 + }, + { + "epoch": 0.01685460081834618, + "grad_norm": 2.386152982711792, + "learning_rate": 4.996498623949846e-05, + "loss": 6.317, + "step": 2834 + }, + { + "epoch": 0.01686054810162718, + "grad_norm": 2.144510507583618, + "learning_rate": 4.996496152240878e-05, + "loss": 6.4039, + "step": 2835 + }, + { + "epoch": 0.016866495384908173, + "grad_norm": 2.3362607955932617, + "learning_rate": 4.996493679660409e-05, + "loss": 6.5411, + "step": 2836 + }, + { + "epoch": 0.016872442668189172, + "grad_norm": 2.156428337097168, + "learning_rate": 4.9964912062084404e-05, + "loss": 6.3399, + "step": 2837 + }, + { + "epoch": 0.016878389951470167, + "grad_norm": 2.3429903984069824, + "learning_rate": 4.9964887318849715e-05, + "loss": 6.5159, + "step": 2838 + }, + { + "epoch": 0.016884337234751166, + "grad_norm": 2.1888442039489746, + "learning_rate": 4.9964862566900045e-05, + "loss": 6.3906, + "step": 2839 + }, + { + "epoch": 0.016890284518032164, + "grad_norm": 2.3973047733306885, + "learning_rate": 4.9964837806235396e-05, + "loss": 6.3452, + "step": 2840 + }, + { + "epoch": 0.01689623180131316, + "grad_norm": 2.232057809829712, + "learning_rate": 4.996481303685578e-05, + "loss": 6.5203, + "step": 2841 + }, + { + "epoch": 0.016902179084594158, + "grad_norm": 2.672342300415039, + "learning_rate": 4.996478825876122e-05, + "loss": 6.8615, + "step": 2842 + }, + { + "epoch": 0.016908126367875153, + "grad_norm": 2.603943347930908, + "learning_rate": 4.996476347195171e-05, + "loss": 7.1632, + "step": 2843 + }, + { + "epoch": 0.01691407365115615, + "grad_norm": 2.684616804122925, + "learning_rate": 4.9964738676427234e-05, + "loss": 6.5546, + "step": 2844 + }, + { + "epoch": 0.01692002093443715, + "grad_norm": 2.1103904247283936, + "learning_rate": 4.996471387218785e-05, + "loss": 6.4666, + "step": 2845 + }, + { + "epoch": 0.016925968217718145, + "grad_norm": 2.8278937339782715, + "learning_rate": 4.9964689059233525e-05, + "loss": 6.3685, + "step": 2846 + }, + { + "epoch": 0.016931915500999144, + "grad_norm": 3.2611489295959473, + "learning_rate": 4.9964664237564296e-05, + "loss": 6.5537, + "step": 2847 + }, + { + "epoch": 0.016937862784280142, + "grad_norm": 3.029353141784668, + "learning_rate": 4.9964639407180155e-05, + "loss": 6.6097, + "step": 2848 + }, + { + "epoch": 0.016943810067561137, + "grad_norm": 2.6735312938690186, + "learning_rate": 4.996461456808112e-05, + "loss": 6.5854, + "step": 2849 + }, + { + "epoch": 0.016949757350842136, + "grad_norm": 2.7619409561157227, + "learning_rate": 4.99645897202672e-05, + "loss": 6.5944, + "step": 2850 + }, + { + "epoch": 0.01695570463412313, + "grad_norm": 3.0398738384246826, + "learning_rate": 4.9964564863738396e-05, + "loss": 6.3804, + "step": 2851 + }, + { + "epoch": 0.01696165191740413, + "grad_norm": 3.5388784408569336, + "learning_rate": 4.996453999849472e-05, + "loss": 7.0993, + "step": 2852 + }, + { + "epoch": 0.016967599200685128, + "grad_norm": 2.3602113723754883, + "learning_rate": 4.9964515124536185e-05, + "loss": 6.4981, + "step": 2853 + }, + { + "epoch": 0.016973546483966123, + "grad_norm": 2.346632957458496, + "learning_rate": 4.996449024186278e-05, + "loss": 6.4892, + "step": 2854 + }, + { + "epoch": 0.016979493767247122, + "grad_norm": 2.9653544425964355, + "learning_rate": 4.996446535047454e-05, + "loss": 6.2772, + "step": 2855 + }, + { + "epoch": 0.01698544105052812, + "grad_norm": 3.1064538955688477, + "learning_rate": 4.996444045037147e-05, + "loss": 6.238, + "step": 2856 + }, + { + "epoch": 0.016991388333809115, + "grad_norm": 2.9617815017700195, + "learning_rate": 4.9964415541553564e-05, + "loss": 6.2991, + "step": 2857 + }, + { + "epoch": 0.016997335617090114, + "grad_norm": 2.5993905067443848, + "learning_rate": 4.996439062402084e-05, + "loss": 6.5482, + "step": 2858 + }, + { + "epoch": 0.01700328290037111, + "grad_norm": 2.5469226837158203, + "learning_rate": 4.996436569777331e-05, + "loss": 6.437, + "step": 2859 + }, + { + "epoch": 0.017009230183652108, + "grad_norm": 2.709184408187866, + "learning_rate": 4.9964340762810965e-05, + "loss": 6.1362, + "step": 2860 + }, + { + "epoch": 0.017015177466933106, + "grad_norm": 2.843942880630493, + "learning_rate": 4.9964315819133837e-05, + "loss": 6.2443, + "step": 2861 + }, + { + "epoch": 0.0170211247502141, + "grad_norm": 3.022735357284546, + "learning_rate": 4.9964290866741925e-05, + "loss": 6.3161, + "step": 2862 + }, + { + "epoch": 0.0170270720334951, + "grad_norm": 2.487271308898926, + "learning_rate": 4.996426590563523e-05, + "loss": 6.3352, + "step": 2863 + }, + { + "epoch": 0.0170330193167761, + "grad_norm": 2.624000072479248, + "learning_rate": 4.996424093581377e-05, + "loss": 6.3575, + "step": 2864 + }, + { + "epoch": 0.017038966600057093, + "grad_norm": 2.378368854522705, + "learning_rate": 4.996421595727756e-05, + "loss": 6.3284, + "step": 2865 + }, + { + "epoch": 0.017044913883338092, + "grad_norm": 2.6903984546661377, + "learning_rate": 4.996419097002659e-05, + "loss": 6.271, + "step": 2866 + }, + { + "epoch": 0.017050861166619087, + "grad_norm": 2.536391019821167, + "learning_rate": 4.9964165974060875e-05, + "loss": 6.1276, + "step": 2867 + }, + { + "epoch": 0.017056808449900086, + "grad_norm": 2.470395803451538, + "learning_rate": 4.9964140969380434e-05, + "loss": 6.1032, + "step": 2868 + }, + { + "epoch": 0.017062755733181084, + "grad_norm": 2.929818630218506, + "learning_rate": 4.996411595598528e-05, + "loss": 6.0994, + "step": 2869 + }, + { + "epoch": 0.01706870301646208, + "grad_norm": 2.548701763153076, + "learning_rate": 4.99640909338754e-05, + "loss": 6.2227, + "step": 2870 + }, + { + "epoch": 0.017074650299743078, + "grad_norm": 2.6044397354125977, + "learning_rate": 4.99640659030508e-05, + "loss": 6.0778, + "step": 2871 + }, + { + "epoch": 0.017080597583024073, + "grad_norm": 2.687392473220825, + "learning_rate": 4.996404086351153e-05, + "loss": 6.2975, + "step": 2872 + }, + { + "epoch": 0.01708654486630507, + "grad_norm": 2.740201711654663, + "learning_rate": 4.9964015815257556e-05, + "loss": 6.5955, + "step": 2873 + }, + { + "epoch": 0.01709249214958607, + "grad_norm": 2.605958938598633, + "learning_rate": 4.99639907582889e-05, + "loss": 6.2112, + "step": 2874 + }, + { + "epoch": 0.017098439432867065, + "grad_norm": 2.9691529273986816, + "learning_rate": 4.996396569260558e-05, + "loss": 6.1435, + "step": 2875 + }, + { + "epoch": 0.017104386716148064, + "grad_norm": 2.822201728820801, + "learning_rate": 4.9963940618207593e-05, + "loss": 6.1949, + "step": 2876 + }, + { + "epoch": 0.017110333999429062, + "grad_norm": 2.6231529712677, + "learning_rate": 4.996391553509495e-05, + "loss": 6.5082, + "step": 2877 + }, + { + "epoch": 0.017116281282710057, + "grad_norm": 2.6511785984039307, + "learning_rate": 4.9963890443267666e-05, + "loss": 6.4461, + "step": 2878 + }, + { + "epoch": 0.017122228565991056, + "grad_norm": 2.4790167808532715, + "learning_rate": 4.996386534272575e-05, + "loss": 6.4642, + "step": 2879 + }, + { + "epoch": 0.01712817584927205, + "grad_norm": 3.6982533931732178, + "learning_rate": 4.99638402334692e-05, + "loss": 6.2957, + "step": 2880 + }, + { + "epoch": 0.01713412313255305, + "grad_norm": 2.380385160446167, + "learning_rate": 4.996381511549804e-05, + "loss": 6.3174, + "step": 2881 + }, + { + "epoch": 0.017140070415834048, + "grad_norm": 2.425537347793579, + "learning_rate": 4.996378998881226e-05, + "loss": 6.2055, + "step": 2882 + }, + { + "epoch": 0.017146017699115043, + "grad_norm": 2.4667842388153076, + "learning_rate": 4.996376485341188e-05, + "loss": 6.245, + "step": 2883 + }, + { + "epoch": 0.01715196498239604, + "grad_norm": 2.6306424140930176, + "learning_rate": 4.996373970929691e-05, + "loss": 6.1162, + "step": 2884 + }, + { + "epoch": 0.01715791226567704, + "grad_norm": 4.439255714416504, + "learning_rate": 4.996371455646736e-05, + "loss": 5.9868, + "step": 2885 + }, + { + "epoch": 0.017163859548958035, + "grad_norm": 3.3248472213745117, + "learning_rate": 4.9963689394923224e-05, + "loss": 5.861, + "step": 2886 + }, + { + "epoch": 0.017169806832239034, + "grad_norm": 2.45271897315979, + "learning_rate": 4.996366422466453e-05, + "loss": 6.1588, + "step": 2887 + }, + { + "epoch": 0.01717575411552003, + "grad_norm": 3.1748130321502686, + "learning_rate": 4.996363904569128e-05, + "loss": 6.3607, + "step": 2888 + }, + { + "epoch": 0.017181701398801028, + "grad_norm": 3.300736427307129, + "learning_rate": 4.996361385800348e-05, + "loss": 6.0709, + "step": 2889 + }, + { + "epoch": 0.017187648682082026, + "grad_norm": 2.720550060272217, + "learning_rate": 4.9963588661601136e-05, + "loss": 6.0496, + "step": 2890 + }, + { + "epoch": 0.01719359596536302, + "grad_norm": 2.251845121383667, + "learning_rate": 4.9963563456484266e-05, + "loss": 6.0088, + "step": 2891 + }, + { + "epoch": 0.01719954324864402, + "grad_norm": 2.7863035202026367, + "learning_rate": 4.996353824265288e-05, + "loss": 5.9478, + "step": 2892 + }, + { + "epoch": 0.01720549053192502, + "grad_norm": 2.831744432449341, + "learning_rate": 4.996351302010697e-05, + "loss": 6.1629, + "step": 2893 + }, + { + "epoch": 0.017211437815206013, + "grad_norm": 4.583891868591309, + "learning_rate": 4.9963487788846556e-05, + "loss": 6.7936, + "step": 2894 + }, + { + "epoch": 0.017217385098487012, + "grad_norm": 2.4525468349456787, + "learning_rate": 4.996346254887165e-05, + "loss": 6.3188, + "step": 2895 + }, + { + "epoch": 0.017223332381768007, + "grad_norm": 3.0866281986236572, + "learning_rate": 4.9963437300182254e-05, + "loss": 6.0207, + "step": 2896 + }, + { + "epoch": 0.017229279665049006, + "grad_norm": 3.1188113689422607, + "learning_rate": 4.996341204277838e-05, + "loss": 5.9873, + "step": 2897 + }, + { + "epoch": 0.017235226948330004, + "grad_norm": 2.4119350910186768, + "learning_rate": 4.996338677666004e-05, + "loss": 5.8104, + "step": 2898 + }, + { + "epoch": 0.017241174231611, + "grad_norm": 1.9601647853851318, + "learning_rate": 4.996336150182724e-05, + "loss": 6.2166, + "step": 2899 + }, + { + "epoch": 0.017247121514891998, + "grad_norm": 3.428379535675049, + "learning_rate": 4.9963336218279986e-05, + "loss": 6.4284, + "step": 2900 + }, + { + "epoch": 0.017253068798172993, + "grad_norm": 2.629446506500244, + "learning_rate": 4.996331092601829e-05, + "loss": 6.4916, + "step": 2901 + }, + { + "epoch": 0.01725901608145399, + "grad_norm": 2.3860316276550293, + "learning_rate": 4.996328562504216e-05, + "loss": 6.5035, + "step": 2902 + }, + { + "epoch": 0.01726496336473499, + "grad_norm": 2.6754682064056396, + "learning_rate": 4.996326031535161e-05, + "loss": 6.6374, + "step": 2903 + }, + { + "epoch": 0.017270910648015985, + "grad_norm": 2.737901210784912, + "learning_rate": 4.9963234996946635e-05, + "loss": 6.5023, + "step": 2904 + }, + { + "epoch": 0.017276857931296984, + "grad_norm": 2.481691837310791, + "learning_rate": 4.996320966982726e-05, + "loss": 6.5211, + "step": 2905 + }, + { + "epoch": 0.017282805214577982, + "grad_norm": 3.3993568420410156, + "learning_rate": 4.996318433399348e-05, + "loss": 6.4239, + "step": 2906 + }, + { + "epoch": 0.017288752497858977, + "grad_norm": 3.9149057865142822, + "learning_rate": 4.9963158989445316e-05, + "loss": 6.3874, + "step": 2907 + }, + { + "epoch": 0.017294699781139976, + "grad_norm": 2.3808562755584717, + "learning_rate": 4.996313363618276e-05, + "loss": 6.2887, + "step": 2908 + }, + { + "epoch": 0.01730064706442097, + "grad_norm": 2.6186649799346924, + "learning_rate": 4.996310827420585e-05, + "loss": 6.2944, + "step": 2909 + }, + { + "epoch": 0.01730659434770197, + "grad_norm": 2.5251142978668213, + "learning_rate": 4.9963082903514554e-05, + "loss": 6.0944, + "step": 2910 + }, + { + "epoch": 0.017312541630982968, + "grad_norm": 2.8212270736694336, + "learning_rate": 4.9963057524108926e-05, + "loss": 6.6621, + "step": 2911 + }, + { + "epoch": 0.017318488914263963, + "grad_norm": 2.477485418319702, + "learning_rate": 4.996303213598894e-05, + "loss": 6.3941, + "step": 2912 + }, + { + "epoch": 0.01732443619754496, + "grad_norm": 3.6508305072784424, + "learning_rate": 4.996300673915462e-05, + "loss": 6.3234, + "step": 2913 + }, + { + "epoch": 0.01733038348082596, + "grad_norm": 2.1635468006134033, + "learning_rate": 4.996298133360598e-05, + "loss": 6.2877, + "step": 2914 + }, + { + "epoch": 0.017336330764106955, + "grad_norm": 3.431082010269165, + "learning_rate": 4.9962955919343004e-05, + "loss": 6.2627, + "step": 2915 + }, + { + "epoch": 0.017342278047387954, + "grad_norm": 3.272376775741577, + "learning_rate": 4.9962930496365736e-05, + "loss": 6.1458, + "step": 2916 + }, + { + "epoch": 0.01734822533066895, + "grad_norm": 3.5927000045776367, + "learning_rate": 4.996290506467415e-05, + "loss": 5.9828, + "step": 2917 + }, + { + "epoch": 0.017354172613949947, + "grad_norm": 3.569641351699829, + "learning_rate": 4.996287962426829e-05, + "loss": 6.5957, + "step": 2918 + }, + { + "epoch": 0.017360119897230946, + "grad_norm": 3.281855344772339, + "learning_rate": 4.9962854175148134e-05, + "loss": 6.3393, + "step": 2919 + }, + { + "epoch": 0.01736606718051194, + "grad_norm": 2.6009061336517334, + "learning_rate": 4.9962828717313706e-05, + "loss": 6.3537, + "step": 2920 + }, + { + "epoch": 0.01737201446379294, + "grad_norm": 3.964467763900757, + "learning_rate": 4.996280325076501e-05, + "loss": 6.0281, + "step": 2921 + }, + { + "epoch": 0.017377961747073938, + "grad_norm": 3.9164865016937256, + "learning_rate": 4.9962777775502064e-05, + "loss": 6.5255, + "step": 2922 + }, + { + "epoch": 0.017383909030354933, + "grad_norm": 2.349709987640381, + "learning_rate": 4.996275229152486e-05, + "loss": 6.2459, + "step": 2923 + }, + { + "epoch": 0.017389856313635932, + "grad_norm": 2.5735161304473877, + "learning_rate": 4.9962726798833425e-05, + "loss": 6.0463, + "step": 2924 + }, + { + "epoch": 0.017395803596916927, + "grad_norm": 2.228271961212158, + "learning_rate": 4.9962701297427764e-05, + "loss": 6.1147, + "step": 2925 + }, + { + "epoch": 0.017401750880197926, + "grad_norm": 2.4587175846099854, + "learning_rate": 4.9962675787307875e-05, + "loss": 7.0868, + "step": 2926 + }, + { + "epoch": 0.017407698163478924, + "grad_norm": 2.2712674140930176, + "learning_rate": 4.996265026847378e-05, + "loss": 6.175, + "step": 2927 + }, + { + "epoch": 0.01741364544675992, + "grad_norm": 3.0724384784698486, + "learning_rate": 4.996262474092547e-05, + "loss": 6.5354, + "step": 2928 + }, + { + "epoch": 0.017419592730040918, + "grad_norm": 4.872220039367676, + "learning_rate": 4.996259920466297e-05, + "loss": 6.1938, + "step": 2929 + }, + { + "epoch": 0.017425540013321916, + "grad_norm": 4.508706569671631, + "learning_rate": 4.996257365968629e-05, + "loss": 6.1813, + "step": 2930 + }, + { + "epoch": 0.01743148729660291, + "grad_norm": 3.0419485569000244, + "learning_rate": 4.996254810599543e-05, + "loss": 5.9529, + "step": 2931 + }, + { + "epoch": 0.01743743457988391, + "grad_norm": 2.8372066020965576, + "learning_rate": 4.996252254359041e-05, + "loss": 5.9422, + "step": 2932 + }, + { + "epoch": 0.017443381863164905, + "grad_norm": 4.554285526275635, + "learning_rate": 4.996249697247122e-05, + "loss": 6.9073, + "step": 2933 + }, + { + "epoch": 0.017449329146445904, + "grad_norm": 3.121094226837158, + "learning_rate": 4.996247139263788e-05, + "loss": 6.2827, + "step": 2934 + }, + { + "epoch": 0.017455276429726902, + "grad_norm": 3.936596632003784, + "learning_rate": 4.996244580409041e-05, + "loss": 6.7863, + "step": 2935 + }, + { + "epoch": 0.017461223713007897, + "grad_norm": 3.5771539211273193, + "learning_rate": 4.99624202068288e-05, + "loss": 7.0691, + "step": 2936 + }, + { + "epoch": 0.017467170996288896, + "grad_norm": 2.0674471855163574, + "learning_rate": 4.996239460085307e-05, + "loss": 6.9768, + "step": 2937 + }, + { + "epoch": 0.01747311827956989, + "grad_norm": 2.600167989730835, + "learning_rate": 4.996236898616322e-05, + "loss": 6.4235, + "step": 2938 + }, + { + "epoch": 0.01747906556285089, + "grad_norm": 2.9444847106933594, + "learning_rate": 4.9962343362759267e-05, + "loss": 6.7305, + "step": 2939 + }, + { + "epoch": 0.017485012846131888, + "grad_norm": 3.721101999282837, + "learning_rate": 4.996231773064122e-05, + "loss": 6.5147, + "step": 2940 + }, + { + "epoch": 0.017490960129412883, + "grad_norm": 5.715269565582275, + "learning_rate": 4.9962292089809086e-05, + "loss": 6.1433, + "step": 2941 + }, + { + "epoch": 0.01749690741269388, + "grad_norm": 4.245530128479004, + "learning_rate": 4.996226644026287e-05, + "loss": 6.2163, + "step": 2942 + }, + { + "epoch": 0.01750285469597488, + "grad_norm": 2.7717039585113525, + "learning_rate": 4.996224078200259e-05, + "loss": 5.877, + "step": 2943 + }, + { + "epoch": 0.017508801979255875, + "grad_norm": 3.4189441204071045, + "learning_rate": 4.9962215115028255e-05, + "loss": 5.9575, + "step": 2944 + }, + { + "epoch": 0.017514749262536874, + "grad_norm": 3.754513740539551, + "learning_rate": 4.996218943933986e-05, + "loss": 5.7512, + "step": 2945 + }, + { + "epoch": 0.01752069654581787, + "grad_norm": 3.4231228828430176, + "learning_rate": 4.9962163754937426e-05, + "loss": 6.4566, + "step": 2946 + }, + { + "epoch": 0.017526643829098867, + "grad_norm": 2.7481472492218018, + "learning_rate": 4.996213806182095e-05, + "loss": 6.1385, + "step": 2947 + }, + { + "epoch": 0.017532591112379866, + "grad_norm": 2.802342414855957, + "learning_rate": 4.996211235999046e-05, + "loss": 5.6656, + "step": 2948 + }, + { + "epoch": 0.01753853839566086, + "grad_norm": 2.60530686378479, + "learning_rate": 4.996208664944595e-05, + "loss": 5.7339, + "step": 2949 + }, + { + "epoch": 0.01754448567894186, + "grad_norm": 2.476100206375122, + "learning_rate": 4.996206093018744e-05, + "loss": 6.0447, + "step": 2950 + }, + { + "epoch": 0.017550432962222858, + "grad_norm": 2.3516924381256104, + "learning_rate": 4.9962035202214916e-05, + "loss": 6.2046, + "step": 2951 + }, + { + "epoch": 0.017556380245503853, + "grad_norm": 2.447519302368164, + "learning_rate": 4.996200946552842e-05, + "loss": 6.0279, + "step": 2952 + }, + { + "epoch": 0.017562327528784852, + "grad_norm": 2.679766893386841, + "learning_rate": 4.996198372012794e-05, + "loss": 5.9072, + "step": 2953 + }, + { + "epoch": 0.017568274812065847, + "grad_norm": 2.3413944244384766, + "learning_rate": 4.9961957966013486e-05, + "loss": 5.9214, + "step": 2954 + }, + { + "epoch": 0.017574222095346845, + "grad_norm": 2.273725986480713, + "learning_rate": 4.996193220318507e-05, + "loss": 6.2107, + "step": 2955 + }, + { + "epoch": 0.017580169378627844, + "grad_norm": 2.9424052238464355, + "learning_rate": 4.99619064316427e-05, + "loss": 5.8618, + "step": 2956 + }, + { + "epoch": 0.01758611666190884, + "grad_norm": 2.40987229347229, + "learning_rate": 4.9961880651386394e-05, + "loss": 6.1306, + "step": 2957 + }, + { + "epoch": 0.017592063945189838, + "grad_norm": 2.542084217071533, + "learning_rate": 4.9961854862416144e-05, + "loss": 6.2225, + "step": 2958 + }, + { + "epoch": 0.017598011228470836, + "grad_norm": 2.06935977935791, + "learning_rate": 4.996182906473198e-05, + "loss": 5.9899, + "step": 2959 + }, + { + "epoch": 0.01760395851175183, + "grad_norm": 2.1998584270477295, + "learning_rate": 4.99618032583339e-05, + "loss": 6.2268, + "step": 2960 + }, + { + "epoch": 0.01760990579503283, + "grad_norm": 2.5595617294311523, + "learning_rate": 4.99617774432219e-05, + "loss": 6.2856, + "step": 2961 + }, + { + "epoch": 0.017615853078313825, + "grad_norm": 2.9262382984161377, + "learning_rate": 4.9961751619396e-05, + "loss": 6.2747, + "step": 2962 + }, + { + "epoch": 0.017621800361594823, + "grad_norm": 2.3705809116363525, + "learning_rate": 4.996172578685622e-05, + "loss": 6.1376, + "step": 2963 + }, + { + "epoch": 0.017627747644875822, + "grad_norm": 2.20991849899292, + "learning_rate": 4.996169994560256e-05, + "loss": 6.0118, + "step": 2964 + }, + { + "epoch": 0.017633694928156817, + "grad_norm": 2.2801706790924072, + "learning_rate": 4.996167409563502e-05, + "loss": 6.0924, + "step": 2965 + }, + { + "epoch": 0.017639642211437816, + "grad_norm": 2.5618062019348145, + "learning_rate": 4.996164823695362e-05, + "loss": 6.0931, + "step": 2966 + }, + { + "epoch": 0.01764558949471881, + "grad_norm": 2.2933573722839355, + "learning_rate": 4.996162236955837e-05, + "loss": 6.1584, + "step": 2967 + }, + { + "epoch": 0.01765153677799981, + "grad_norm": 2.2387471199035645, + "learning_rate": 4.996159649344928e-05, + "loss": 6.1224, + "step": 2968 + }, + { + "epoch": 0.017657484061280808, + "grad_norm": 2.425929069519043, + "learning_rate": 4.9961570608626347e-05, + "loss": 6.2419, + "step": 2969 + }, + { + "epoch": 0.017663431344561803, + "grad_norm": 3.0279812812805176, + "learning_rate": 4.996154471508959e-05, + "loss": 6.0478, + "step": 2970 + }, + { + "epoch": 0.0176693786278428, + "grad_norm": 2.8950276374816895, + "learning_rate": 4.9961518812839015e-05, + "loss": 5.9663, + "step": 2971 + }, + { + "epoch": 0.0176753259111238, + "grad_norm": 2.9908859729766846, + "learning_rate": 4.996149290187463e-05, + "loss": 5.8101, + "step": 2972 + }, + { + "epoch": 0.017681273194404795, + "grad_norm": 2.900987148284912, + "learning_rate": 4.996146698219645e-05, + "loss": 6.133, + "step": 2973 + }, + { + "epoch": 0.017687220477685794, + "grad_norm": 3.3194754123687744, + "learning_rate": 4.996144105380447e-05, + "loss": 5.9763, + "step": 2974 + }, + { + "epoch": 0.01769316776096679, + "grad_norm": 2.4997923374176025, + "learning_rate": 4.996141511669872e-05, + "loss": 6.1062, + "step": 2975 + }, + { + "epoch": 0.017699115044247787, + "grad_norm": 2.3048369884490967, + "learning_rate": 4.996138917087919e-05, + "loss": 6.138, + "step": 2976 + }, + { + "epoch": 0.017705062327528786, + "grad_norm": 2.3391027450561523, + "learning_rate": 4.99613632163459e-05, + "loss": 6.0612, + "step": 2977 + }, + { + "epoch": 0.01771100961080978, + "grad_norm": 2.6164605617523193, + "learning_rate": 4.996133725309886e-05, + "loss": 6.0402, + "step": 2978 + }, + { + "epoch": 0.01771695689409078, + "grad_norm": 2.6534295082092285, + "learning_rate": 4.996131128113807e-05, + "loss": 5.9027, + "step": 2979 + }, + { + "epoch": 0.017722904177371778, + "grad_norm": 2.1807172298431396, + "learning_rate": 4.996128530046354e-05, + "loss": 5.7083, + "step": 2980 + }, + { + "epoch": 0.017728851460652773, + "grad_norm": 2.433762550354004, + "learning_rate": 4.9961259311075296e-05, + "loss": 6.1587, + "step": 2981 + }, + { + "epoch": 0.017734798743933772, + "grad_norm": 2.4656107425689697, + "learning_rate": 4.996123331297333e-05, + "loss": 5.9831, + "step": 2982 + }, + { + "epoch": 0.017740746027214767, + "grad_norm": 2.536060333251953, + "learning_rate": 4.996120730615765e-05, + "loss": 5.9083, + "step": 2983 + }, + { + "epoch": 0.017746693310495765, + "grad_norm": 2.2993409633636475, + "learning_rate": 4.996118129062828e-05, + "loss": 6.0156, + "step": 2984 + }, + { + "epoch": 0.017752640593776764, + "grad_norm": 2.0221481323242188, + "learning_rate": 4.996115526638521e-05, + "loss": 5.9836, + "step": 2985 + }, + { + "epoch": 0.01775858787705776, + "grad_norm": 2.401350498199463, + "learning_rate": 4.996112923342846e-05, + "loss": 5.8071, + "step": 2986 + }, + { + "epoch": 0.017764535160338758, + "grad_norm": 2.469214677810669, + "learning_rate": 4.996110319175804e-05, + "loss": 5.8784, + "step": 2987 + }, + { + "epoch": 0.017770482443619756, + "grad_norm": 2.454481601715088, + "learning_rate": 4.9961077141373955e-05, + "loss": 5.9168, + "step": 2988 + }, + { + "epoch": 0.01777642972690075, + "grad_norm": 2.3173487186431885, + "learning_rate": 4.996105108227621e-05, + "loss": 5.8797, + "step": 2989 + }, + { + "epoch": 0.01778237701018175, + "grad_norm": 2.1967554092407227, + "learning_rate": 4.996102501446483e-05, + "loss": 5.972, + "step": 2990 + }, + { + "epoch": 0.017788324293462745, + "grad_norm": 2.1263201236724854, + "learning_rate": 4.996099893793981e-05, + "loss": 5.9301, + "step": 2991 + }, + { + "epoch": 0.017794271576743743, + "grad_norm": 2.1959195137023926, + "learning_rate": 4.9960972852701165e-05, + "loss": 6.0422, + "step": 2992 + }, + { + "epoch": 0.017800218860024742, + "grad_norm": 2.3290374279022217, + "learning_rate": 4.99609467587489e-05, + "loss": 6.1926, + "step": 2993 + }, + { + "epoch": 0.017806166143305737, + "grad_norm": 2.3518059253692627, + "learning_rate": 4.996092065608303e-05, + "loss": 5.8583, + "step": 2994 + }, + { + "epoch": 0.017812113426586736, + "grad_norm": 2.4263339042663574, + "learning_rate": 4.996089454470355e-05, + "loss": 5.8149, + "step": 2995 + }, + { + "epoch": 0.01781806070986773, + "grad_norm": 2.0764389038085938, + "learning_rate": 4.99608684246105e-05, + "loss": 5.8782, + "step": 2996 + }, + { + "epoch": 0.01782400799314873, + "grad_norm": 2.086904764175415, + "learning_rate": 4.996084229580385e-05, + "loss": 5.7885, + "step": 2997 + }, + { + "epoch": 0.017829955276429728, + "grad_norm": 2.1907291412353516, + "learning_rate": 4.996081615828363e-05, + "loss": 5.9246, + "step": 2998 + }, + { + "epoch": 0.017835902559710723, + "grad_norm": 2.4596495628356934, + "learning_rate": 4.9960790012049854e-05, + "loss": 5.7786, + "step": 2999 + }, + { + "epoch": 0.01784184984299172, + "grad_norm": 2.0762453079223633, + "learning_rate": 4.996076385710252e-05, + "loss": 5.9901, + "step": 3000 + }, + { + "epoch": 0.01784779712627272, + "grad_norm": 2.068714141845703, + "learning_rate": 4.996073769344164e-05, + "loss": 5.9437, + "step": 3001 + }, + { + "epoch": 0.017853744409553715, + "grad_norm": 2.4760496616363525, + "learning_rate": 4.9960711521067226e-05, + "loss": 5.8633, + "step": 3002 + }, + { + "epoch": 0.017859691692834714, + "grad_norm": 2.395643949508667, + "learning_rate": 4.996068533997928e-05, + "loss": 5.8024, + "step": 3003 + }, + { + "epoch": 0.01786563897611571, + "grad_norm": 2.120586633682251, + "learning_rate": 4.996065915017783e-05, + "loss": 6.0712, + "step": 3004 + }, + { + "epoch": 0.017871586259396707, + "grad_norm": 2.384794235229492, + "learning_rate": 4.9960632951662866e-05, + "loss": 5.9089, + "step": 3005 + }, + { + "epoch": 0.017877533542677706, + "grad_norm": 2.24297833442688, + "learning_rate": 4.99606067444344e-05, + "loss": 6.0263, + "step": 3006 + }, + { + "epoch": 0.0178834808259587, + "grad_norm": 1.983299732208252, + "learning_rate": 4.996058052849245e-05, + "loss": 5.8706, + "step": 3007 + }, + { + "epoch": 0.0178894281092397, + "grad_norm": 2.2866950035095215, + "learning_rate": 4.996055430383701e-05, + "loss": 5.9031, + "step": 3008 + }, + { + "epoch": 0.017895375392520698, + "grad_norm": 2.3343560695648193, + "learning_rate": 4.996052807046811e-05, + "loss": 5.9155, + "step": 3009 + }, + { + "epoch": 0.017901322675801693, + "grad_norm": 2.079763650894165, + "learning_rate": 4.9960501828385734e-05, + "loss": 5.8102, + "step": 3010 + }, + { + "epoch": 0.01790726995908269, + "grad_norm": 2.0398895740509033, + "learning_rate": 4.996047557758991e-05, + "loss": 5.773, + "step": 3011 + }, + { + "epoch": 0.017913217242363687, + "grad_norm": 2.2478318214416504, + "learning_rate": 4.996044931808064e-05, + "loss": 5.8584, + "step": 3012 + }, + { + "epoch": 0.017919164525644685, + "grad_norm": 2.301398992538452, + "learning_rate": 4.996042304985794e-05, + "loss": 5.9053, + "step": 3013 + }, + { + "epoch": 0.017925111808925684, + "grad_norm": 2.0428216457366943, + "learning_rate": 4.996039677292181e-05, + "loss": 5.9571, + "step": 3014 + }, + { + "epoch": 0.01793105909220668, + "grad_norm": 2.049572467803955, + "learning_rate": 4.9960370487272266e-05, + "loss": 5.9464, + "step": 3015 + }, + { + "epoch": 0.017937006375487678, + "grad_norm": 2.1681618690490723, + "learning_rate": 4.996034419290931e-05, + "loss": 5.9969, + "step": 3016 + }, + { + "epoch": 0.017942953658768676, + "grad_norm": 2.3879425525665283, + "learning_rate": 4.996031788983296e-05, + "loss": 5.7962, + "step": 3017 + }, + { + "epoch": 0.01794890094204967, + "grad_norm": 2.232508420944214, + "learning_rate": 4.996029157804323e-05, + "loss": 5.8479, + "step": 3018 + }, + { + "epoch": 0.01795484822533067, + "grad_norm": 2.222257137298584, + "learning_rate": 4.9960265257540104e-05, + "loss": 5.952, + "step": 3019 + }, + { + "epoch": 0.017960795508611665, + "grad_norm": 2.213777542114258, + "learning_rate": 4.996023892832362e-05, + "loss": 5.9891, + "step": 3020 + }, + { + "epoch": 0.017966742791892663, + "grad_norm": 2.286097764968872, + "learning_rate": 4.996021259039377e-05, + "loss": 5.8995, + "step": 3021 + }, + { + "epoch": 0.017972690075173662, + "grad_norm": 2.1588432788848877, + "learning_rate": 4.996018624375056e-05, + "loss": 5.988, + "step": 3022 + }, + { + "epoch": 0.017978637358454657, + "grad_norm": 2.2468602657318115, + "learning_rate": 4.996015988839402e-05, + "loss": 5.9303, + "step": 3023 + }, + { + "epoch": 0.017984584641735656, + "grad_norm": 2.1732120513916016, + "learning_rate": 4.9960133524324135e-05, + "loss": 5.8696, + "step": 3024 + }, + { + "epoch": 0.01799053192501665, + "grad_norm": 2.2985105514526367, + "learning_rate": 4.996010715154093e-05, + "loss": 5.9251, + "step": 3025 + }, + { + "epoch": 0.01799647920829765, + "grad_norm": 2.1920788288116455, + "learning_rate": 4.996008077004441e-05, + "loss": 5.8023, + "step": 3026 + }, + { + "epoch": 0.018002426491578648, + "grad_norm": 1.9393725395202637, + "learning_rate": 4.996005437983458e-05, + "loss": 5.9576, + "step": 3027 + }, + { + "epoch": 0.018008373774859643, + "grad_norm": 2.115035057067871, + "learning_rate": 4.9960027980911455e-05, + "loss": 5.9105, + "step": 3028 + }, + { + "epoch": 0.01801432105814064, + "grad_norm": 2.143432855606079, + "learning_rate": 4.996000157327504e-05, + "loss": 5.9951, + "step": 3029 + }, + { + "epoch": 0.01802026834142164, + "grad_norm": 2.4353296756744385, + "learning_rate": 4.995997515692536e-05, + "loss": 5.9761, + "step": 3030 + }, + { + "epoch": 0.018026215624702635, + "grad_norm": 1.999054193496704, + "learning_rate": 4.995994873186239e-05, + "loss": 6.028, + "step": 3031 + }, + { + "epoch": 0.018032162907983634, + "grad_norm": 2.05645751953125, + "learning_rate": 4.995992229808617e-05, + "loss": 5.9778, + "step": 3032 + }, + { + "epoch": 0.01803811019126463, + "grad_norm": 1.948923110961914, + "learning_rate": 4.99598958555967e-05, + "loss": 5.8735, + "step": 3033 + }, + { + "epoch": 0.018044057474545627, + "grad_norm": 2.1208486557006836, + "learning_rate": 4.995986940439399e-05, + "loss": 5.7913, + "step": 3034 + }, + { + "epoch": 0.018050004757826626, + "grad_norm": 2.051079750061035, + "learning_rate": 4.995984294447804e-05, + "loss": 5.8097, + "step": 3035 + }, + { + "epoch": 0.01805595204110762, + "grad_norm": 2.021207571029663, + "learning_rate": 4.995981647584887e-05, + "loss": 5.8425, + "step": 3036 + }, + { + "epoch": 0.01806189932438862, + "grad_norm": 2.471315622329712, + "learning_rate": 4.995978999850649e-05, + "loss": 5.7735, + "step": 3037 + }, + { + "epoch": 0.018067846607669618, + "grad_norm": 2.604836940765381, + "learning_rate": 4.9959763512450896e-05, + "loss": 6.4525, + "step": 3038 + }, + { + "epoch": 0.018073793890950613, + "grad_norm": 2.375361919403076, + "learning_rate": 4.995973701768212e-05, + "loss": 5.8072, + "step": 3039 + }, + { + "epoch": 0.01807974117423161, + "grad_norm": 2.354280471801758, + "learning_rate": 4.995971051420014e-05, + "loss": 5.9434, + "step": 3040 + }, + { + "epoch": 0.018085688457512607, + "grad_norm": 2.7335755825042725, + "learning_rate": 4.9959684002005e-05, + "loss": 5.5899, + "step": 3041 + }, + { + "epoch": 0.018091635740793605, + "grad_norm": 2.244917869567871, + "learning_rate": 4.995965748109668e-05, + "loss": 5.799, + "step": 3042 + }, + { + "epoch": 0.018097583024074604, + "grad_norm": 2.2413697242736816, + "learning_rate": 4.995963095147521e-05, + "loss": 5.8635, + "step": 3043 + }, + { + "epoch": 0.0181035303073556, + "grad_norm": 2.122586488723755, + "learning_rate": 4.9959604413140584e-05, + "loss": 5.8098, + "step": 3044 + }, + { + "epoch": 0.018109477590636597, + "grad_norm": 2.407517910003662, + "learning_rate": 4.995957786609282e-05, + "loss": 6.0319, + "step": 3045 + }, + { + "epoch": 0.018115424873917596, + "grad_norm": 2.5628743171691895, + "learning_rate": 4.9959551310331934e-05, + "loss": 5.9561, + "step": 3046 + }, + { + "epoch": 0.01812137215719859, + "grad_norm": 2.335650682449341, + "learning_rate": 4.995952474585791e-05, + "loss": 6.1168, + "step": 3047 + }, + { + "epoch": 0.01812731944047959, + "grad_norm": 2.169771432876587, + "learning_rate": 4.995949817267078e-05, + "loss": 6.0555, + "step": 3048 + }, + { + "epoch": 0.018133266723760585, + "grad_norm": 2.2245211601257324, + "learning_rate": 4.995947159077056e-05, + "loss": 5.9084, + "step": 3049 + }, + { + "epoch": 0.018139214007041583, + "grad_norm": 2.2296931743621826, + "learning_rate": 4.995944500015723e-05, + "loss": 5.8878, + "step": 3050 + }, + { + "epoch": 0.018145161290322582, + "grad_norm": 2.2372493743896484, + "learning_rate": 4.995941840083082e-05, + "loss": 5.9521, + "step": 3051 + }, + { + "epoch": 0.018151108573603577, + "grad_norm": 2.1773006916046143, + "learning_rate": 4.995939179279134e-05, + "loss": 5.899, + "step": 3052 + }, + { + "epoch": 0.018157055856884576, + "grad_norm": 2.218245267868042, + "learning_rate": 4.995936517603879e-05, + "loss": 6.0311, + "step": 3053 + }, + { + "epoch": 0.018163003140165574, + "grad_norm": 2.2877273559570312, + "learning_rate": 4.995933855057318e-05, + "loss": 6.0052, + "step": 3054 + }, + { + "epoch": 0.01816895042344657, + "grad_norm": 2.225764751434326, + "learning_rate": 4.995931191639453e-05, + "loss": 6.0373, + "step": 3055 + }, + { + "epoch": 0.018174897706727568, + "grad_norm": 2.5069313049316406, + "learning_rate": 4.995928527350284e-05, + "loss": 5.8729, + "step": 3056 + }, + { + "epoch": 0.018180844990008563, + "grad_norm": 2.089759588241577, + "learning_rate": 4.995925862189812e-05, + "loss": 5.9462, + "step": 3057 + }, + { + "epoch": 0.01818679227328956, + "grad_norm": 2.0159049034118652, + "learning_rate": 4.9959231961580376e-05, + "loss": 5.9276, + "step": 3058 + }, + { + "epoch": 0.01819273955657056, + "grad_norm": 2.207636594772339, + "learning_rate": 4.995920529254963e-05, + "loss": 5.9921, + "step": 3059 + }, + { + "epoch": 0.018198686839851555, + "grad_norm": 2.380232810974121, + "learning_rate": 4.995917861480588e-05, + "loss": 5.9092, + "step": 3060 + }, + { + "epoch": 0.018204634123132554, + "grad_norm": 2.073237895965576, + "learning_rate": 4.9959151928349134e-05, + "loss": 5.8472, + "step": 3061 + }, + { + "epoch": 0.01821058140641355, + "grad_norm": 1.824062705039978, + "learning_rate": 4.995912523317942e-05, + "loss": 5.7958, + "step": 3062 + }, + { + "epoch": 0.018216528689694547, + "grad_norm": 2.3961215019226074, + "learning_rate": 4.995909852929672e-05, + "loss": 6.1388, + "step": 3063 + }, + { + "epoch": 0.018222475972975546, + "grad_norm": 2.8391239643096924, + "learning_rate": 4.9959071816701065e-05, + "loss": 5.7564, + "step": 3064 + }, + { + "epoch": 0.01822842325625654, + "grad_norm": 2.4684112071990967, + "learning_rate": 4.995904509539244e-05, + "loss": 5.8372, + "step": 3065 + }, + { + "epoch": 0.01823437053953754, + "grad_norm": 2.419983386993408, + "learning_rate": 4.995901836537089e-05, + "loss": 5.9332, + "step": 3066 + }, + { + "epoch": 0.018240317822818538, + "grad_norm": 2.500227928161621, + "learning_rate": 4.99589916266364e-05, + "loss": 6.0848, + "step": 3067 + }, + { + "epoch": 0.018246265106099533, + "grad_norm": 2.1683971881866455, + "learning_rate": 4.9958964879188976e-05, + "loss": 6.0911, + "step": 3068 + }, + { + "epoch": 0.01825221238938053, + "grad_norm": 2.2345223426818848, + "learning_rate": 4.995893812302864e-05, + "loss": 6.016, + "step": 3069 + }, + { + "epoch": 0.018258159672661527, + "grad_norm": 2.318321466445923, + "learning_rate": 4.995891135815539e-05, + "loss": 5.9622, + "step": 3070 + }, + { + "epoch": 0.018264106955942525, + "grad_norm": 2.294602155685425, + "learning_rate": 4.9958884584569255e-05, + "loss": 5.8908, + "step": 3071 + }, + { + "epoch": 0.018270054239223524, + "grad_norm": 2.5472419261932373, + "learning_rate": 4.995885780227022e-05, + "loss": 5.7906, + "step": 3072 + }, + { + "epoch": 0.01827600152250452, + "grad_norm": 2.319101095199585, + "learning_rate": 4.995883101125831e-05, + "loss": 6.3366, + "step": 3073 + }, + { + "epoch": 0.018281948805785517, + "grad_norm": 2.3564186096191406, + "learning_rate": 4.995880421153353e-05, + "loss": 5.9863, + "step": 3074 + }, + { + "epoch": 0.018287896089066516, + "grad_norm": 2.434756278991699, + "learning_rate": 4.995877740309589e-05, + "loss": 5.885, + "step": 3075 + }, + { + "epoch": 0.01829384337234751, + "grad_norm": 2.062861442565918, + "learning_rate": 4.99587505859454e-05, + "loss": 6.0813, + "step": 3076 + }, + { + "epoch": 0.01829979065562851, + "grad_norm": 2.127049684524536, + "learning_rate": 4.995872376008206e-05, + "loss": 6.1226, + "step": 3077 + }, + { + "epoch": 0.018305737938909505, + "grad_norm": 2.288405656814575, + "learning_rate": 4.995869692550589e-05, + "loss": 5.9625, + "step": 3078 + }, + { + "epoch": 0.018311685222190503, + "grad_norm": 2.2387006282806396, + "learning_rate": 4.9958670082216905e-05, + "loss": 5.9479, + "step": 3079 + }, + { + "epoch": 0.018317632505471502, + "grad_norm": 2.18864107131958, + "learning_rate": 4.9958643230215096e-05, + "loss": 5.9223, + "step": 3080 + }, + { + "epoch": 0.018323579788752497, + "grad_norm": 2.3457415103912354, + "learning_rate": 4.995861636950049e-05, + "loss": 5.7857, + "step": 3081 + }, + { + "epoch": 0.018329527072033495, + "grad_norm": 2.6946494579315186, + "learning_rate": 4.995858950007309e-05, + "loss": 5.5546, + "step": 3082 + }, + { + "epoch": 0.018335474355314494, + "grad_norm": 2.5135412216186523, + "learning_rate": 4.99585626219329e-05, + "loss": 5.5624, + "step": 3083 + }, + { + "epoch": 0.01834142163859549, + "grad_norm": 2.6617767810821533, + "learning_rate": 4.9958535735079934e-05, + "loss": 5.8789, + "step": 3084 + }, + { + "epoch": 0.018347368921876488, + "grad_norm": 2.099261522293091, + "learning_rate": 4.9958508839514196e-05, + "loss": 5.9365, + "step": 3085 + }, + { + "epoch": 0.018353316205157483, + "grad_norm": 2.5267064571380615, + "learning_rate": 4.9958481935235715e-05, + "loss": 6.0935, + "step": 3086 + }, + { + "epoch": 0.01835926348843848, + "grad_norm": 2.3353283405303955, + "learning_rate": 4.995845502224447e-05, + "loss": 5.909, + "step": 3087 + }, + { + "epoch": 0.01836521077171948, + "grad_norm": 2.396430492401123, + "learning_rate": 4.9958428100540496e-05, + "loss": 6.0272, + "step": 3088 + }, + { + "epoch": 0.018371158055000475, + "grad_norm": 2.095308303833008, + "learning_rate": 4.9958401170123784e-05, + "loss": 5.9791, + "step": 3089 + }, + { + "epoch": 0.018377105338281473, + "grad_norm": 2.7606077194213867, + "learning_rate": 4.9958374230994357e-05, + "loss": 5.9716, + "step": 3090 + }, + { + "epoch": 0.01838305262156247, + "grad_norm": 2.4490914344787598, + "learning_rate": 4.995834728315222e-05, + "loss": 5.8763, + "step": 3091 + }, + { + "epoch": 0.018388999904843467, + "grad_norm": 2.709092855453491, + "learning_rate": 4.9958320326597385e-05, + "loss": 5.74, + "step": 3092 + }, + { + "epoch": 0.018394947188124466, + "grad_norm": 2.8829305171966553, + "learning_rate": 4.9958293361329856e-05, + "loss": 5.8469, + "step": 3093 + }, + { + "epoch": 0.01840089447140546, + "grad_norm": 2.6500396728515625, + "learning_rate": 4.995826638734964e-05, + "loss": 5.8578, + "step": 3094 + }, + { + "epoch": 0.01840684175468646, + "grad_norm": 2.0665056705474854, + "learning_rate": 4.9958239404656755e-05, + "loss": 5.9662, + "step": 3095 + }, + { + "epoch": 0.018412789037967458, + "grad_norm": 2.3198931217193604, + "learning_rate": 4.9958212413251205e-05, + "loss": 6.0663, + "step": 3096 + }, + { + "epoch": 0.018418736321248453, + "grad_norm": 2.9056031703948975, + "learning_rate": 4.9958185413133e-05, + "loss": 5.8015, + "step": 3097 + }, + { + "epoch": 0.01842468360452945, + "grad_norm": 2.446164131164551, + "learning_rate": 4.995815840430216e-05, + "loss": 5.6878, + "step": 3098 + }, + { + "epoch": 0.018430630887810447, + "grad_norm": 2.797506093978882, + "learning_rate": 4.995813138675867e-05, + "loss": 5.7675, + "step": 3099 + }, + { + "epoch": 0.018436578171091445, + "grad_norm": 3.2914962768554688, + "learning_rate": 4.995810436050256e-05, + "loss": 6.3661, + "step": 3100 + }, + { + "epoch": 0.018442525454372444, + "grad_norm": 2.444363594055176, + "learning_rate": 4.995807732553384e-05, + "loss": 5.9251, + "step": 3101 + }, + { + "epoch": 0.01844847273765344, + "grad_norm": 2.526951551437378, + "learning_rate": 4.9958050281852505e-05, + "loss": 5.8202, + "step": 3102 + }, + { + "epoch": 0.018454420020934437, + "grad_norm": 2.2046117782592773, + "learning_rate": 4.995802322945857e-05, + "loss": 6.0572, + "step": 3103 + }, + { + "epoch": 0.018460367304215436, + "grad_norm": 2.5484018325805664, + "learning_rate": 4.9957996168352055e-05, + "loss": 6.1215, + "step": 3104 + }, + { + "epoch": 0.01846631458749643, + "grad_norm": 2.4785003662109375, + "learning_rate": 4.9957969098532965e-05, + "loss": 5.9524, + "step": 3105 + }, + { + "epoch": 0.01847226187077743, + "grad_norm": 2.9028711318969727, + "learning_rate": 4.9957942020001294e-05, + "loss": 6.1175, + "step": 3106 + }, + { + "epoch": 0.018478209154058425, + "grad_norm": 2.1766602993011475, + "learning_rate": 4.995791493275707e-05, + "loss": 5.9746, + "step": 3107 + }, + { + "epoch": 0.018484156437339423, + "grad_norm": 2.079423189163208, + "learning_rate": 4.995788783680029e-05, + "loss": 5.9463, + "step": 3108 + }, + { + "epoch": 0.018490103720620422, + "grad_norm": 2.285184144973755, + "learning_rate": 4.995786073213098e-05, + "loss": 5.5174, + "step": 3109 + }, + { + "epoch": 0.018496051003901417, + "grad_norm": 2.170018196105957, + "learning_rate": 4.9957833618749126e-05, + "loss": 5.7948, + "step": 3110 + }, + { + "epoch": 0.018501998287182415, + "grad_norm": 2.284517526626587, + "learning_rate": 4.9957806496654754e-05, + "loss": 5.9455, + "step": 3111 + }, + { + "epoch": 0.018507945570463414, + "grad_norm": 2.5539982318878174, + "learning_rate": 4.9957779365847876e-05, + "loss": 5.9791, + "step": 3112 + }, + { + "epoch": 0.01851389285374441, + "grad_norm": 2.1735522747039795, + "learning_rate": 4.995775222632849e-05, + "loss": 5.9549, + "step": 3113 + }, + { + "epoch": 0.018519840137025408, + "grad_norm": 2.2272653579711914, + "learning_rate": 4.995772507809662e-05, + "loss": 5.8618, + "step": 3114 + }, + { + "epoch": 0.018525787420306403, + "grad_norm": 1.9390417337417603, + "learning_rate": 4.995769792115225e-05, + "loss": 5.9617, + "step": 3115 + }, + { + "epoch": 0.0185317347035874, + "grad_norm": 2.6526312828063965, + "learning_rate": 4.9957670755495414e-05, + "loss": 5.9296, + "step": 3116 + }, + { + "epoch": 0.0185376819868684, + "grad_norm": 2.533996105194092, + "learning_rate": 4.995764358112611e-05, + "loss": 6.0045, + "step": 3117 + }, + { + "epoch": 0.018543629270149395, + "grad_norm": 2.183347225189209, + "learning_rate": 4.995761639804436e-05, + "loss": 5.9254, + "step": 3118 + }, + { + "epoch": 0.018549576553430393, + "grad_norm": 1.9411321878433228, + "learning_rate": 4.995758920625015e-05, + "loss": 5.9404, + "step": 3119 + }, + { + "epoch": 0.01855552383671139, + "grad_norm": 4.914453029632568, + "learning_rate": 4.9957562005743514e-05, + "loss": 5.8139, + "step": 3120 + }, + { + "epoch": 0.018561471119992387, + "grad_norm": 2.3052754402160645, + "learning_rate": 4.9957534796524444e-05, + "loss": 5.6525, + "step": 3121 + }, + { + "epoch": 0.018567418403273386, + "grad_norm": 2.424464464187622, + "learning_rate": 4.995750757859296e-05, + "loss": 5.9599, + "step": 3122 + }, + { + "epoch": 0.01857336568655438, + "grad_norm": 2.1392033100128174, + "learning_rate": 4.995748035194907e-05, + "loss": 5.9558, + "step": 3123 + }, + { + "epoch": 0.01857931296983538, + "grad_norm": 4.67656135559082, + "learning_rate": 4.995745311659278e-05, + "loss": 5.7606, + "step": 3124 + }, + { + "epoch": 0.018585260253116378, + "grad_norm": 2.0772082805633545, + "learning_rate": 4.99574258725241e-05, + "loss": 5.9328, + "step": 3125 + }, + { + "epoch": 0.018591207536397373, + "grad_norm": 2.0255486965179443, + "learning_rate": 4.995739861974303e-05, + "loss": 5.9395, + "step": 3126 + }, + { + "epoch": 0.01859715481967837, + "grad_norm": 2.3629064559936523, + "learning_rate": 4.995737135824961e-05, + "loss": 5.9663, + "step": 3127 + }, + { + "epoch": 0.018603102102959367, + "grad_norm": 1.9924237728118896, + "learning_rate": 4.9957344088043814e-05, + "loss": 5.8998, + "step": 3128 + }, + { + "epoch": 0.018609049386240365, + "grad_norm": 2.096774101257324, + "learning_rate": 4.9957316809125676e-05, + "loss": 5.7178, + "step": 3129 + }, + { + "epoch": 0.018614996669521364, + "grad_norm": 2.2288100719451904, + "learning_rate": 4.9957289521495194e-05, + "loss": 5.9096, + "step": 3130 + }, + { + "epoch": 0.01862094395280236, + "grad_norm": 2.456099033355713, + "learning_rate": 4.995726222515238e-05, + "loss": 5.7738, + "step": 3131 + }, + { + "epoch": 0.018626891236083357, + "grad_norm": 2.238218069076538, + "learning_rate": 4.995723492009724e-05, + "loss": 5.6929, + "step": 3132 + }, + { + "epoch": 0.018632838519364356, + "grad_norm": 1.8309845924377441, + "learning_rate": 4.9957207606329795e-05, + "loss": 5.9339, + "step": 3133 + }, + { + "epoch": 0.01863878580264535, + "grad_norm": 1.9269503355026245, + "learning_rate": 4.995718028385003e-05, + "loss": 5.9704, + "step": 3134 + }, + { + "epoch": 0.01864473308592635, + "grad_norm": 2.0929813385009766, + "learning_rate": 4.9957152952657995e-05, + "loss": 5.7598, + "step": 3135 + }, + { + "epoch": 0.018650680369207345, + "grad_norm": 2.2813265323638916, + "learning_rate": 4.995712561275366e-05, + "loss": 5.7986, + "step": 3136 + }, + { + "epoch": 0.018656627652488343, + "grad_norm": 2.1189653873443604, + "learning_rate": 4.995709826413705e-05, + "loss": 5.6603, + "step": 3137 + }, + { + "epoch": 0.01866257493576934, + "grad_norm": 2.1439480781555176, + "learning_rate": 4.9957070906808185e-05, + "loss": 5.6952, + "step": 3138 + }, + { + "epoch": 0.018668522219050337, + "grad_norm": 2.4345993995666504, + "learning_rate": 4.995704354076706e-05, + "loss": 5.7531, + "step": 3139 + }, + { + "epoch": 0.018674469502331335, + "grad_norm": 2.5551047325134277, + "learning_rate": 4.995701616601368e-05, + "loss": 5.544, + "step": 3140 + }, + { + "epoch": 0.018680416785612334, + "grad_norm": 2.333603620529175, + "learning_rate": 4.9956988782548075e-05, + "loss": 5.5732, + "step": 3141 + }, + { + "epoch": 0.01868636406889333, + "grad_norm": 2.2983827590942383, + "learning_rate": 4.995696139037024e-05, + "loss": 5.8779, + "step": 3142 + }, + { + "epoch": 0.018692311352174328, + "grad_norm": 2.7525672912597656, + "learning_rate": 4.995693398948018e-05, + "loss": 5.5998, + "step": 3143 + }, + { + "epoch": 0.018698258635455323, + "grad_norm": 2.3622052669525146, + "learning_rate": 4.995690657987793e-05, + "loss": 5.8851, + "step": 3144 + }, + { + "epoch": 0.01870420591873632, + "grad_norm": 2.4975669384002686, + "learning_rate": 4.995687916156346e-05, + "loss": 5.6388, + "step": 3145 + }, + { + "epoch": 0.01871015320201732, + "grad_norm": 2.5763049125671387, + "learning_rate": 4.9956851734536816e-05, + "loss": 5.4931, + "step": 3146 + }, + { + "epoch": 0.018716100485298315, + "grad_norm": 2.7156779766082764, + "learning_rate": 4.995682429879799e-05, + "loss": 5.8035, + "step": 3147 + }, + { + "epoch": 0.018722047768579313, + "grad_norm": 2.259134292602539, + "learning_rate": 4.995679685434699e-05, + "loss": 5.9519, + "step": 3148 + }, + { + "epoch": 0.018727995051860312, + "grad_norm": 2.544829845428467, + "learning_rate": 4.995676940118383e-05, + "loss": 5.7373, + "step": 3149 + }, + { + "epoch": 0.018733942335141307, + "grad_norm": 2.326660633087158, + "learning_rate": 4.995674193930853e-05, + "loss": 5.7719, + "step": 3150 + }, + { + "epoch": 0.018739889618422306, + "grad_norm": 2.25370192527771, + "learning_rate": 4.995671446872108e-05, + "loss": 5.813, + "step": 3151 + }, + { + "epoch": 0.0187458369017033, + "grad_norm": 2.1467692852020264, + "learning_rate": 4.99566869894215e-05, + "loss": 5.5836, + "step": 3152 + }, + { + "epoch": 0.0187517841849843, + "grad_norm": 2.30096697807312, + "learning_rate": 4.9956659501409796e-05, + "loss": 5.8249, + "step": 3153 + }, + { + "epoch": 0.018757731468265298, + "grad_norm": 2.3050386905670166, + "learning_rate": 4.9956632004685986e-05, + "loss": 5.6806, + "step": 3154 + }, + { + "epoch": 0.018763678751546293, + "grad_norm": 2.473008632659912, + "learning_rate": 4.995660449925007e-05, + "loss": 5.4512, + "step": 3155 + }, + { + "epoch": 0.01876962603482729, + "grad_norm": 2.0691702365875244, + "learning_rate": 4.995657698510206e-05, + "loss": 5.6582, + "step": 3156 + }, + { + "epoch": 0.018775573318108287, + "grad_norm": 2.332423686981201, + "learning_rate": 4.995654946224197e-05, + "loss": 5.6017, + "step": 3157 + }, + { + "epoch": 0.018781520601389285, + "grad_norm": 2.6423730850219727, + "learning_rate": 4.9956521930669806e-05, + "loss": 5.619, + "step": 3158 + }, + { + "epoch": 0.018787467884670284, + "grad_norm": 3.0884950160980225, + "learning_rate": 4.995649439038558e-05, + "loss": 5.7813, + "step": 3159 + }, + { + "epoch": 0.01879341516795128, + "grad_norm": 2.4923598766326904, + "learning_rate": 4.995646684138929e-05, + "loss": 5.8089, + "step": 3160 + }, + { + "epoch": 0.018799362451232277, + "grad_norm": 2.5505683422088623, + "learning_rate": 4.9956439283680965e-05, + "loss": 5.8171, + "step": 3161 + }, + { + "epoch": 0.018805309734513276, + "grad_norm": 2.7343056201934814, + "learning_rate": 4.99564117172606e-05, + "loss": 6.3472, + "step": 3162 + }, + { + "epoch": 0.01881125701779427, + "grad_norm": 2.9170796871185303, + "learning_rate": 4.995638414212821e-05, + "loss": 5.7478, + "step": 3163 + }, + { + "epoch": 0.01881720430107527, + "grad_norm": 2.392648696899414, + "learning_rate": 4.9956356558283815e-05, + "loss": 5.8105, + "step": 3164 + }, + { + "epoch": 0.018823151584356265, + "grad_norm": 2.532207727432251, + "learning_rate": 4.9956328965727394e-05, + "loss": 5.9285, + "step": 3165 + }, + { + "epoch": 0.018829098867637263, + "grad_norm": 2.6717050075531006, + "learning_rate": 4.995630136445899e-05, + "loss": 6.0344, + "step": 3166 + }, + { + "epoch": 0.01883504615091826, + "grad_norm": 2.1829564571380615, + "learning_rate": 4.99562737544786e-05, + "loss": 6.0078, + "step": 3167 + }, + { + "epoch": 0.018840993434199257, + "grad_norm": 2.2728323936462402, + "learning_rate": 4.995624613578622e-05, + "loss": 5.8211, + "step": 3168 + }, + { + "epoch": 0.018846940717480255, + "grad_norm": 2.046717882156372, + "learning_rate": 4.995621850838189e-05, + "loss": 5.9685, + "step": 3169 + }, + { + "epoch": 0.018852888000761254, + "grad_norm": 2.737494945526123, + "learning_rate": 4.995619087226559e-05, + "loss": 5.649, + "step": 3170 + }, + { + "epoch": 0.01885883528404225, + "grad_norm": 2.276503801345825, + "learning_rate": 4.9956163227437345e-05, + "loss": 5.8137, + "step": 3171 + }, + { + "epoch": 0.018864782567323247, + "grad_norm": 2.2799227237701416, + "learning_rate": 4.9956135573897155e-05, + "loss": 5.8277, + "step": 3172 + }, + { + "epoch": 0.018870729850604243, + "grad_norm": 2.131425619125366, + "learning_rate": 4.995610791164505e-05, + "loss": 5.8909, + "step": 3173 + }, + { + "epoch": 0.01887667713388524, + "grad_norm": 2.2295737266540527, + "learning_rate": 4.995608024068102e-05, + "loss": 5.8236, + "step": 3174 + }, + { + "epoch": 0.01888262441716624, + "grad_norm": 2.30082631111145, + "learning_rate": 4.9956052561005076e-05, + "loss": 5.7331, + "step": 3175 + }, + { + "epoch": 0.018888571700447235, + "grad_norm": 2.751847505569458, + "learning_rate": 4.9956024872617225e-05, + "loss": 5.8673, + "step": 3176 + }, + { + "epoch": 0.018894518983728233, + "grad_norm": 2.4597535133361816, + "learning_rate": 4.995599717551749e-05, + "loss": 5.7561, + "step": 3177 + }, + { + "epoch": 0.018900466267009232, + "grad_norm": 2.1418228149414062, + "learning_rate": 4.9955969469705874e-05, + "loss": 5.7112, + "step": 3178 + }, + { + "epoch": 0.018906413550290227, + "grad_norm": 2.0560619831085205, + "learning_rate": 4.9955941755182395e-05, + "loss": 5.7764, + "step": 3179 + }, + { + "epoch": 0.018912360833571226, + "grad_norm": 2.268781900405884, + "learning_rate": 4.9955914031947046e-05, + "loss": 5.7319, + "step": 3180 + }, + { + "epoch": 0.01891830811685222, + "grad_norm": 2.6272811889648438, + "learning_rate": 4.995588629999985e-05, + "loss": 6.0601, + "step": 3181 + }, + { + "epoch": 0.01892425540013322, + "grad_norm": 2.1991870403289795, + "learning_rate": 4.995585855934081e-05, + "loss": 5.602, + "step": 3182 + }, + { + "epoch": 0.018930202683414218, + "grad_norm": 2.0521514415740967, + "learning_rate": 4.995583080996994e-05, + "loss": 5.8075, + "step": 3183 + }, + { + "epoch": 0.018936149966695213, + "grad_norm": 2.153473138809204, + "learning_rate": 4.995580305188724e-05, + "loss": 5.8219, + "step": 3184 + }, + { + "epoch": 0.01894209724997621, + "grad_norm": 2.0663251876831055, + "learning_rate": 4.9955775285092735e-05, + "loss": 5.836, + "step": 3185 + }, + { + "epoch": 0.018948044533257206, + "grad_norm": 1.8808318376541138, + "learning_rate": 4.995574750958642e-05, + "loss": 5.7938, + "step": 3186 + }, + { + "epoch": 0.018953991816538205, + "grad_norm": 2.256012201309204, + "learning_rate": 4.995571972536831e-05, + "loss": 5.6404, + "step": 3187 + }, + { + "epoch": 0.018959939099819204, + "grad_norm": 2.29636287689209, + "learning_rate": 4.995569193243843e-05, + "loss": 5.7161, + "step": 3188 + }, + { + "epoch": 0.0189658863831002, + "grad_norm": 2.728804588317871, + "learning_rate": 4.995566413079676e-05, + "loss": 5.8165, + "step": 3189 + }, + { + "epoch": 0.018971833666381197, + "grad_norm": 2.3115599155426025, + "learning_rate": 4.995563632044333e-05, + "loss": 5.7004, + "step": 3190 + }, + { + "epoch": 0.018977780949662196, + "grad_norm": 2.1607725620269775, + "learning_rate": 4.995560850137815e-05, + "loss": 5.7788, + "step": 3191 + }, + { + "epoch": 0.01898372823294319, + "grad_norm": 2.322132110595703, + "learning_rate": 4.995558067360122e-05, + "loss": 5.5677, + "step": 3192 + }, + { + "epoch": 0.01898967551622419, + "grad_norm": 2.148022174835205, + "learning_rate": 4.995555283711256e-05, + "loss": 5.7708, + "step": 3193 + }, + { + "epoch": 0.018995622799505184, + "grad_norm": 2.339812994003296, + "learning_rate": 4.9955524991912165e-05, + "loss": 5.7945, + "step": 3194 + }, + { + "epoch": 0.019001570082786183, + "grad_norm": 1.9469980001449585, + "learning_rate": 4.995549713800006e-05, + "loss": 5.695, + "step": 3195 + }, + { + "epoch": 0.01900751736606718, + "grad_norm": 2.1744890213012695, + "learning_rate": 4.9955469275376254e-05, + "loss": 5.7544, + "step": 3196 + }, + { + "epoch": 0.019013464649348177, + "grad_norm": 2.175123691558838, + "learning_rate": 4.9955441404040745e-05, + "loss": 5.598, + "step": 3197 + }, + { + "epoch": 0.019019411932629175, + "grad_norm": 2.3011369705200195, + "learning_rate": 4.995541352399355e-05, + "loss": 5.7069, + "step": 3198 + }, + { + "epoch": 0.019025359215910174, + "grad_norm": 2.2227025032043457, + "learning_rate": 4.9955385635234675e-05, + "loss": 5.6854, + "step": 3199 + }, + { + "epoch": 0.01903130649919117, + "grad_norm": 2.5465073585510254, + "learning_rate": 4.995535773776414e-05, + "loss": 5.9085, + "step": 3200 + }, + { + "epoch": 0.019037253782472167, + "grad_norm": 2.936612844467163, + "learning_rate": 4.995532983158194e-05, + "loss": 6.0519, + "step": 3201 + }, + { + "epoch": 0.019043201065753163, + "grad_norm": 2.8298418521881104, + "learning_rate": 4.9955301916688094e-05, + "loss": 5.9473, + "step": 3202 + }, + { + "epoch": 0.01904914834903416, + "grad_norm": 2.2295944690704346, + "learning_rate": 4.9955273993082615e-05, + "loss": 5.9652, + "step": 3203 + }, + { + "epoch": 0.01905509563231516, + "grad_norm": 2.7771801948547363, + "learning_rate": 4.9955246060765505e-05, + "loss": 5.9291, + "step": 3204 + }, + { + "epoch": 0.019061042915596155, + "grad_norm": 3.0721678733825684, + "learning_rate": 4.9955218119736776e-05, + "loss": 6.2319, + "step": 3205 + }, + { + "epoch": 0.019066990198877153, + "grad_norm": 2.7866547107696533, + "learning_rate": 4.9955190169996434e-05, + "loss": 6.0412, + "step": 3206 + }, + { + "epoch": 0.019072937482158152, + "grad_norm": 2.287216901779175, + "learning_rate": 4.99551622115445e-05, + "loss": 5.6435, + "step": 3207 + }, + { + "epoch": 0.019078884765439147, + "grad_norm": 2.3618898391723633, + "learning_rate": 4.995513424438098e-05, + "loss": 5.7711, + "step": 3208 + }, + { + "epoch": 0.019084832048720145, + "grad_norm": 2.192997932434082, + "learning_rate": 4.995510626850587e-05, + "loss": 5.8351, + "step": 3209 + }, + { + "epoch": 0.01909077933200114, + "grad_norm": 2.252722978591919, + "learning_rate": 4.995507828391919e-05, + "loss": 5.5989, + "step": 3210 + }, + { + "epoch": 0.01909672661528214, + "grad_norm": 2.451167106628418, + "learning_rate": 4.995505029062095e-05, + "loss": 5.8533, + "step": 3211 + }, + { + "epoch": 0.019102673898563138, + "grad_norm": 2.1897904872894287, + "learning_rate": 4.995502228861116e-05, + "loss": 6.2807, + "step": 3212 + }, + { + "epoch": 0.019108621181844133, + "grad_norm": 2.196805715560913, + "learning_rate": 4.995499427788984e-05, + "loss": 5.9418, + "step": 3213 + }, + { + "epoch": 0.01911456846512513, + "grad_norm": 1.9791160821914673, + "learning_rate": 4.995496625845698e-05, + "loss": 5.9909, + "step": 3214 + }, + { + "epoch": 0.019120515748406126, + "grad_norm": 2.3592171669006348, + "learning_rate": 4.995493823031261e-05, + "loss": 5.807, + "step": 3215 + }, + { + "epoch": 0.019126463031687125, + "grad_norm": 2.8238747119903564, + "learning_rate": 4.9954910193456713e-05, + "loss": 5.7587, + "step": 3216 + }, + { + "epoch": 0.019132410314968123, + "grad_norm": 2.4695584774017334, + "learning_rate": 4.9954882147889326e-05, + "loss": 5.746, + "step": 3217 + }, + { + "epoch": 0.01913835759824912, + "grad_norm": 2.3983800411224365, + "learning_rate": 4.995485409361044e-05, + "loss": 5.9364, + "step": 3218 + }, + { + "epoch": 0.019144304881530117, + "grad_norm": 2.1279618740081787, + "learning_rate": 4.995482603062008e-05, + "loss": 5.9383, + "step": 3219 + }, + { + "epoch": 0.019150252164811116, + "grad_norm": 18.583581924438477, + "learning_rate": 4.9954797958918244e-05, + "loss": 5.8596, + "step": 3220 + }, + { + "epoch": 0.01915619944809211, + "grad_norm": 2.1420741081237793, + "learning_rate": 4.995476987850495e-05, + "loss": 5.9311, + "step": 3221 + }, + { + "epoch": 0.01916214673137311, + "grad_norm": 2.314380645751953, + "learning_rate": 4.99547417893802e-05, + "loss": 5.8229, + "step": 3222 + }, + { + "epoch": 0.019168094014654104, + "grad_norm": 2.3818936347961426, + "learning_rate": 4.9954713691544004e-05, + "loss": 6.1124, + "step": 3223 + }, + { + "epoch": 0.019174041297935103, + "grad_norm": 2.521789789199829, + "learning_rate": 4.9954685584996377e-05, + "loss": 5.8939, + "step": 3224 + }, + { + "epoch": 0.0191799885812161, + "grad_norm": 1.9583165645599365, + "learning_rate": 4.9954657469737334e-05, + "loss": 6.0005, + "step": 3225 + }, + { + "epoch": 0.019185935864497097, + "grad_norm": 2.349581241607666, + "learning_rate": 4.995462934576687e-05, + "loss": 5.8467, + "step": 3226 + }, + { + "epoch": 0.019191883147778095, + "grad_norm": 2.081836223602295, + "learning_rate": 4.9954601213085e-05, + "loss": 6.1001, + "step": 3227 + }, + { + "epoch": 0.019197830431059094, + "grad_norm": 2.3207972049713135, + "learning_rate": 4.995457307169175e-05, + "loss": 5.794, + "step": 3228 + }, + { + "epoch": 0.01920377771434009, + "grad_norm": 1.8516380786895752, + "learning_rate": 4.99545449215871e-05, + "loss": 5.785, + "step": 3229 + }, + { + "epoch": 0.019209724997621087, + "grad_norm": 2.3822309970855713, + "learning_rate": 4.995451676277109e-05, + "loss": 5.7861, + "step": 3230 + }, + { + "epoch": 0.019215672280902082, + "grad_norm": 2.857161283493042, + "learning_rate": 4.995448859524371e-05, + "loss": 5.8333, + "step": 3231 + }, + { + "epoch": 0.01922161956418308, + "grad_norm": 2.201551914215088, + "learning_rate": 4.9954460419004974e-05, + "loss": 5.8653, + "step": 3232 + }, + { + "epoch": 0.01922756684746408, + "grad_norm": 2.1707022190093994, + "learning_rate": 4.995443223405489e-05, + "loss": 5.772, + "step": 3233 + }, + { + "epoch": 0.019233514130745075, + "grad_norm": 2.1242458820343018, + "learning_rate": 4.995440404039348e-05, + "loss": 5.8806, + "step": 3234 + }, + { + "epoch": 0.019239461414026073, + "grad_norm": 2.106945514678955, + "learning_rate": 4.995437583802074e-05, + "loss": 5.6746, + "step": 3235 + }, + { + "epoch": 0.019245408697307072, + "grad_norm": 2.083181858062744, + "learning_rate": 4.995434762693669e-05, + "loss": 5.9332, + "step": 3236 + }, + { + "epoch": 0.019251355980588067, + "grad_norm": 2.1857783794403076, + "learning_rate": 4.995431940714134e-05, + "loss": 5.6663, + "step": 3237 + }, + { + "epoch": 0.019257303263869065, + "grad_norm": 2.031041145324707, + "learning_rate": 4.995429117863468e-05, + "loss": 5.6734, + "step": 3238 + }, + { + "epoch": 0.01926325054715006, + "grad_norm": 2.31980037689209, + "learning_rate": 4.995426294141674e-05, + "loss": 5.8851, + "step": 3239 + }, + { + "epoch": 0.01926919783043106, + "grad_norm": 2.102965831756592, + "learning_rate": 4.9954234695487535e-05, + "loss": 5.7092, + "step": 3240 + }, + { + "epoch": 0.019275145113712058, + "grad_norm": 2.031169891357422, + "learning_rate": 4.995420644084705e-05, + "loss": 5.9755, + "step": 3241 + }, + { + "epoch": 0.019281092396993053, + "grad_norm": 2.2460241317749023, + "learning_rate": 4.995417817749532e-05, + "loss": 5.8895, + "step": 3242 + }, + { + "epoch": 0.01928703968027405, + "grad_norm": 2.618539571762085, + "learning_rate": 4.9954149905432336e-05, + "loss": 5.6964, + "step": 3243 + }, + { + "epoch": 0.019292986963555046, + "grad_norm": 2.1615748405456543, + "learning_rate": 4.995412162465812e-05, + "loss": 5.7162, + "step": 3244 + }, + { + "epoch": 0.019298934246836045, + "grad_norm": 2.363663673400879, + "learning_rate": 4.995409333517268e-05, + "loss": 5.7957, + "step": 3245 + }, + { + "epoch": 0.019304881530117043, + "grad_norm": 2.131084680557251, + "learning_rate": 4.9954065036976025e-05, + "loss": 5.7925, + "step": 3246 + }, + { + "epoch": 0.01931082881339804, + "grad_norm": 2.4043118953704834, + "learning_rate": 4.9954036730068155e-05, + "loss": 5.7895, + "step": 3247 + }, + { + "epoch": 0.019316776096679037, + "grad_norm": 2.521756887435913, + "learning_rate": 4.995400841444909e-05, + "loss": 5.6279, + "step": 3248 + }, + { + "epoch": 0.019322723379960036, + "grad_norm": 2.1791021823883057, + "learning_rate": 4.9953980090118846e-05, + "loss": 5.717, + "step": 3249 + }, + { + "epoch": 0.01932867066324103, + "grad_norm": 2.6562376022338867, + "learning_rate": 4.995395175707742e-05, + "loss": 5.7407, + "step": 3250 + }, + { + "epoch": 0.01933461794652203, + "grad_norm": 2.4377942085266113, + "learning_rate": 4.995392341532483e-05, + "loss": 5.539, + "step": 3251 + }, + { + "epoch": 0.019340565229803024, + "grad_norm": 2.3716847896575928, + "learning_rate": 4.995389506486109e-05, + "loss": 5.7251, + "step": 3252 + }, + { + "epoch": 0.019346512513084023, + "grad_norm": 2.2509348392486572, + "learning_rate": 4.995386670568619e-05, + "loss": 5.8749, + "step": 3253 + }, + { + "epoch": 0.01935245979636502, + "grad_norm": 2.265608072280884, + "learning_rate": 4.995383833780016e-05, + "loss": 5.8236, + "step": 3254 + }, + { + "epoch": 0.019358407079646017, + "grad_norm": 1.972179651260376, + "learning_rate": 4.9953809961203e-05, + "loss": 5.9235, + "step": 3255 + }, + { + "epoch": 0.019364354362927015, + "grad_norm": 2.314030170440674, + "learning_rate": 4.9953781575894723e-05, + "loss": 5.7355, + "step": 3256 + }, + { + "epoch": 0.019370301646208014, + "grad_norm": 2.3061349391937256, + "learning_rate": 4.995375318187534e-05, + "loss": 5.7337, + "step": 3257 + }, + { + "epoch": 0.01937624892948901, + "grad_norm": 1.9106477499008179, + "learning_rate": 4.9953724779144864e-05, + "loss": 5.8342, + "step": 3258 + }, + { + "epoch": 0.019382196212770007, + "grad_norm": 2.313750982284546, + "learning_rate": 4.9953696367703296e-05, + "loss": 5.7981, + "step": 3259 + }, + { + "epoch": 0.019388143496051002, + "grad_norm": 2.4477834701538086, + "learning_rate": 4.9953667947550644e-05, + "loss": 5.8212, + "step": 3260 + }, + { + "epoch": 0.019394090779332, + "grad_norm": 2.072659730911255, + "learning_rate": 4.9953639518686936e-05, + "loss": 5.7335, + "step": 3261 + }, + { + "epoch": 0.019400038062613, + "grad_norm": 2.0848984718322754, + "learning_rate": 4.995361108111216e-05, + "loss": 5.7427, + "step": 3262 + }, + { + "epoch": 0.019405985345893995, + "grad_norm": 1.938265323638916, + "learning_rate": 4.9953582634826345e-05, + "loss": 5.7946, + "step": 3263 + }, + { + "epoch": 0.019411932629174993, + "grad_norm": 2.227194309234619, + "learning_rate": 4.995355417982949e-05, + "loss": 5.9095, + "step": 3264 + }, + { + "epoch": 0.01941787991245599, + "grad_norm": 2.3245849609375, + "learning_rate": 4.9953525716121604e-05, + "loss": 5.802, + "step": 3265 + }, + { + "epoch": 0.019423827195736987, + "grad_norm": 2.08950138092041, + "learning_rate": 4.9953497243702696e-05, + "loss": 5.9001, + "step": 3266 + }, + { + "epoch": 0.019429774479017985, + "grad_norm": 1.93153715133667, + "learning_rate": 4.9953468762572786e-05, + "loss": 5.9042, + "step": 3267 + }, + { + "epoch": 0.01943572176229898, + "grad_norm": 2.4099066257476807, + "learning_rate": 4.9953440272731874e-05, + "loss": 5.8181, + "step": 3268 + }, + { + "epoch": 0.01944166904557998, + "grad_norm": 2.078752279281616, + "learning_rate": 4.995341177417998e-05, + "loss": 5.8771, + "step": 3269 + }, + { + "epoch": 0.019447616328860978, + "grad_norm": 2.012592077255249, + "learning_rate": 4.9953383266917106e-05, + "loss": 5.8135, + "step": 3270 + }, + { + "epoch": 0.019453563612141973, + "grad_norm": 2.0364151000976562, + "learning_rate": 4.995335475094326e-05, + "loss": 5.8767, + "step": 3271 + }, + { + "epoch": 0.01945951089542297, + "grad_norm": 2.0447049140930176, + "learning_rate": 4.995332622625846e-05, + "loss": 5.8236, + "step": 3272 + }, + { + "epoch": 0.01946545817870397, + "grad_norm": 2.2354300022125244, + "learning_rate": 4.995329769286271e-05, + "loss": 5.7794, + "step": 3273 + }, + { + "epoch": 0.019471405461984965, + "grad_norm": 2.031331777572632, + "learning_rate": 4.995326915075602e-05, + "loss": 5.87, + "step": 3274 + }, + { + "epoch": 0.019477352745265963, + "grad_norm": 2.2116496562957764, + "learning_rate": 4.99532405999384e-05, + "loss": 5.885, + "step": 3275 + }, + { + "epoch": 0.01948330002854696, + "grad_norm": 1.9008034467697144, + "learning_rate": 4.995321204040987e-05, + "loss": 5.8646, + "step": 3276 + }, + { + "epoch": 0.019489247311827957, + "grad_norm": 2.1743087768554688, + "learning_rate": 4.995318347217042e-05, + "loss": 5.9742, + "step": 3277 + }, + { + "epoch": 0.019495194595108956, + "grad_norm": 2.09171724319458, + "learning_rate": 4.995315489522008e-05, + "loss": 5.882, + "step": 3278 + }, + { + "epoch": 0.01950114187838995, + "grad_norm": 1.816938042640686, + "learning_rate": 4.995312630955885e-05, + "loss": 5.9164, + "step": 3279 + }, + { + "epoch": 0.01950708916167095, + "grad_norm": 2.065207004547119, + "learning_rate": 4.995309771518674e-05, + "loss": 5.9273, + "step": 3280 + }, + { + "epoch": 0.019513036444951944, + "grad_norm": 2.1037240028381348, + "learning_rate": 4.9953069112103757e-05, + "loss": 5.863, + "step": 3281 + }, + { + "epoch": 0.019518983728232943, + "grad_norm": 2.011705160140991, + "learning_rate": 4.995304050030992e-05, + "loss": 5.712, + "step": 3282 + }, + { + "epoch": 0.01952493101151394, + "grad_norm": 2.2053868770599365, + "learning_rate": 4.995301187980523e-05, + "loss": 5.6988, + "step": 3283 + }, + { + "epoch": 0.019530878294794937, + "grad_norm": 2.0522396564483643, + "learning_rate": 4.995298325058971e-05, + "loss": 5.6831, + "step": 3284 + }, + { + "epoch": 0.019536825578075935, + "grad_norm": 1.9751875400543213, + "learning_rate": 4.995295461266336e-05, + "loss": 6.0187, + "step": 3285 + }, + { + "epoch": 0.019542772861356934, + "grad_norm": 2.79711651802063, + "learning_rate": 4.9952925966026185e-05, + "loss": 6.4995, + "step": 3286 + }, + { + "epoch": 0.01954872014463793, + "grad_norm": 2.1059019565582275, + "learning_rate": 4.9952897310678206e-05, + "loss": 5.9603, + "step": 3287 + }, + { + "epoch": 0.019554667427918927, + "grad_norm": 2.169428825378418, + "learning_rate": 4.995286864661942e-05, + "loss": 5.7973, + "step": 3288 + }, + { + "epoch": 0.019560614711199922, + "grad_norm": 2.165508985519409, + "learning_rate": 4.995283997384985e-05, + "loss": 5.9132, + "step": 3289 + }, + { + "epoch": 0.01956656199448092, + "grad_norm": 2.248450994491577, + "learning_rate": 4.9952811292369506e-05, + "loss": 5.8202, + "step": 3290 + }, + { + "epoch": 0.01957250927776192, + "grad_norm": 2.3068084716796875, + "learning_rate": 4.9952782602178394e-05, + "loss": 5.8223, + "step": 3291 + }, + { + "epoch": 0.019578456561042915, + "grad_norm": 2.0434954166412354, + "learning_rate": 4.9952753903276516e-05, + "loss": 5.6231, + "step": 3292 + }, + { + "epoch": 0.019584403844323913, + "grad_norm": 2.136564254760742, + "learning_rate": 4.9952725195663895e-05, + "loss": 5.9859, + "step": 3293 + }, + { + "epoch": 0.01959035112760491, + "grad_norm": 2.6265337467193604, + "learning_rate": 4.9952696479340535e-05, + "loss": 5.9126, + "step": 3294 + }, + { + "epoch": 0.019596298410885907, + "grad_norm": 2.442678928375244, + "learning_rate": 4.9952667754306445e-05, + "loss": 5.9361, + "step": 3295 + }, + { + "epoch": 0.019602245694166905, + "grad_norm": 2.0740134716033936, + "learning_rate": 4.9952639020561644e-05, + "loss": 5.913, + "step": 3296 + }, + { + "epoch": 0.0196081929774479, + "grad_norm": 2.4088518619537354, + "learning_rate": 4.995261027810612e-05, + "loss": 5.8297, + "step": 3297 + }, + { + "epoch": 0.0196141402607289, + "grad_norm": 2.1514804363250732, + "learning_rate": 4.995258152693991e-05, + "loss": 5.8256, + "step": 3298 + }, + { + "epoch": 0.019620087544009897, + "grad_norm": 2.921570062637329, + "learning_rate": 4.9952552767063e-05, + "loss": 6.0243, + "step": 3299 + }, + { + "epoch": 0.019626034827290893, + "grad_norm": 2.398749828338623, + "learning_rate": 4.995252399847542e-05, + "loss": 6.004, + "step": 3300 + }, + { + "epoch": 0.01963198211057189, + "grad_norm": 2.2024805545806885, + "learning_rate": 4.995249522117717e-05, + "loss": 5.9201, + "step": 3301 + }, + { + "epoch": 0.01963792939385289, + "grad_norm": 2.112269401550293, + "learning_rate": 4.9952466435168266e-05, + "loss": 5.8488, + "step": 3302 + }, + { + "epoch": 0.019643876677133885, + "grad_norm": 2.04632568359375, + "learning_rate": 4.99524376404487e-05, + "loss": 5.8054, + "step": 3303 + }, + { + "epoch": 0.019649823960414883, + "grad_norm": 2.6293606758117676, + "learning_rate": 4.995240883701851e-05, + "loss": 5.6799, + "step": 3304 + }, + { + "epoch": 0.01965577124369588, + "grad_norm": 2.5172793865203857, + "learning_rate": 4.995238002487769e-05, + "loss": 5.712, + "step": 3305 + }, + { + "epoch": 0.019661718526976877, + "grad_norm": 2.549194097518921, + "learning_rate": 4.995235120402625e-05, + "loss": 5.7208, + "step": 3306 + }, + { + "epoch": 0.019667665810257876, + "grad_norm": 2.2993295192718506, + "learning_rate": 4.99523223744642e-05, + "loss": 5.7952, + "step": 3307 + }, + { + "epoch": 0.01967361309353887, + "grad_norm": 2.1270902156829834, + "learning_rate": 4.9952293536191555e-05, + "loss": 5.6988, + "step": 3308 + }, + { + "epoch": 0.01967956037681987, + "grad_norm": 2.349858283996582, + "learning_rate": 4.9952264689208315e-05, + "loss": 5.623, + "step": 3309 + }, + { + "epoch": 0.019685507660100864, + "grad_norm": 2.1501529216766357, + "learning_rate": 4.9952235833514506e-05, + "loss": 5.6498, + "step": 3310 + }, + { + "epoch": 0.019691454943381863, + "grad_norm": 2.0577821731567383, + "learning_rate": 4.995220696911012e-05, + "loss": 5.6863, + "step": 3311 + }, + { + "epoch": 0.01969740222666286, + "grad_norm": 2.0787386894226074, + "learning_rate": 4.9952178095995185e-05, + "loss": 5.6314, + "step": 3312 + }, + { + "epoch": 0.019703349509943856, + "grad_norm": 2.4042680263519287, + "learning_rate": 4.99521492141697e-05, + "loss": 5.6152, + "step": 3313 + }, + { + "epoch": 0.019709296793224855, + "grad_norm": 2.444410800933838, + "learning_rate": 4.995212032363368e-05, + "loss": 5.5375, + "step": 3314 + }, + { + "epoch": 0.019715244076505854, + "grad_norm": 2.1678028106689453, + "learning_rate": 4.995209142438712e-05, + "loss": 5.6239, + "step": 3315 + }, + { + "epoch": 0.01972119135978685, + "grad_norm": 2.5436410903930664, + "learning_rate": 4.9952062516430054e-05, + "loss": 5.4234, + "step": 3316 + }, + { + "epoch": 0.019727138643067847, + "grad_norm": 2.454561471939087, + "learning_rate": 4.9952033599762484e-05, + "loss": 5.4198, + "step": 3317 + }, + { + "epoch": 0.019733085926348842, + "grad_norm": 2.388125419616699, + "learning_rate": 4.9952004674384413e-05, + "loss": 5.5073, + "step": 3318 + }, + { + "epoch": 0.01973903320962984, + "grad_norm": 2.1900579929351807, + "learning_rate": 4.995197574029585e-05, + "loss": 5.3463, + "step": 3319 + }, + { + "epoch": 0.01974498049291084, + "grad_norm": 2.5625739097595215, + "learning_rate": 4.995194679749681e-05, + "loss": 5.4291, + "step": 3320 + }, + { + "epoch": 0.019750927776191834, + "grad_norm": 2.52402400970459, + "learning_rate": 4.995191784598731e-05, + "loss": 5.3826, + "step": 3321 + }, + { + "epoch": 0.019756875059472833, + "grad_norm": 2.5888168811798096, + "learning_rate": 4.995188888576735e-05, + "loss": 5.381, + "step": 3322 + }, + { + "epoch": 0.01976282234275383, + "grad_norm": 2.637080669403076, + "learning_rate": 4.995185991683694e-05, + "loss": 5.3321, + "step": 3323 + }, + { + "epoch": 0.019768769626034827, + "grad_norm": 2.46553111076355, + "learning_rate": 4.9951830939196095e-05, + "loss": 5.3663, + "step": 3324 + }, + { + "epoch": 0.019774716909315825, + "grad_norm": 2.2397992610931396, + "learning_rate": 4.9951801952844826e-05, + "loss": 5.3237, + "step": 3325 + }, + { + "epoch": 0.01978066419259682, + "grad_norm": 2.3519208431243896, + "learning_rate": 4.9951772957783144e-05, + "loss": 5.4166, + "step": 3326 + }, + { + "epoch": 0.01978661147587782, + "grad_norm": 2.6235291957855225, + "learning_rate": 4.9951743954011056e-05, + "loss": 5.8094, + "step": 3327 + }, + { + "epoch": 0.019792558759158817, + "grad_norm": 2.162285327911377, + "learning_rate": 4.995171494152856e-05, + "loss": 5.6491, + "step": 3328 + }, + { + "epoch": 0.019798506042439813, + "grad_norm": 2.231853485107422, + "learning_rate": 4.995168592033569e-05, + "loss": 5.69, + "step": 3329 + }, + { + "epoch": 0.01980445332572081, + "grad_norm": 2.7305827140808105, + "learning_rate": 4.995165689043244e-05, + "loss": 5.5028, + "step": 3330 + }, + { + "epoch": 0.01981040060900181, + "grad_norm": 2.9917726516723633, + "learning_rate": 4.9951627851818824e-05, + "loss": 5.3227, + "step": 3331 + }, + { + "epoch": 0.019816347892282805, + "grad_norm": 3.0039985179901123, + "learning_rate": 4.995159880449486e-05, + "loss": 5.5965, + "step": 3332 + }, + { + "epoch": 0.019822295175563803, + "grad_norm": 3.081099510192871, + "learning_rate": 4.995156974846054e-05, + "loss": 5.6945, + "step": 3333 + }, + { + "epoch": 0.0198282424588448, + "grad_norm": 2.042445182800293, + "learning_rate": 4.995154068371589e-05, + "loss": 5.693, + "step": 3334 + }, + { + "epoch": 0.019834189742125797, + "grad_norm": 2.8875865936279297, + "learning_rate": 4.995151161026091e-05, + "loss": 5.5981, + "step": 3335 + }, + { + "epoch": 0.019840137025406795, + "grad_norm": 2.4203453063964844, + "learning_rate": 4.9951482528095615e-05, + "loss": 5.6269, + "step": 3336 + }, + { + "epoch": 0.01984608430868779, + "grad_norm": 2.332151174545288, + "learning_rate": 4.995145343722002e-05, + "loss": 5.6002, + "step": 3337 + }, + { + "epoch": 0.01985203159196879, + "grad_norm": 2.556549310684204, + "learning_rate": 4.995142433763413e-05, + "loss": 5.7715, + "step": 3338 + }, + { + "epoch": 0.019857978875249784, + "grad_norm": 2.453113079071045, + "learning_rate": 4.995139522933796e-05, + "loss": 5.8958, + "step": 3339 + }, + { + "epoch": 0.019863926158530783, + "grad_norm": 1.9842414855957031, + "learning_rate": 4.995136611233151e-05, + "loss": 5.9781, + "step": 3340 + }, + { + "epoch": 0.01986987344181178, + "grad_norm": 2.3725521564483643, + "learning_rate": 4.995133698661479e-05, + "loss": 5.9902, + "step": 3341 + }, + { + "epoch": 0.019875820725092776, + "grad_norm": 2.679001808166504, + "learning_rate": 4.9951307852187824e-05, + "loss": 5.9526, + "step": 3342 + }, + { + "epoch": 0.019881768008373775, + "grad_norm": 2.272595167160034, + "learning_rate": 4.995127870905061e-05, + "loss": 5.9685, + "step": 3343 + }, + { + "epoch": 0.019887715291654774, + "grad_norm": 2.0300357341766357, + "learning_rate": 4.995124955720317e-05, + "loss": 5.7702, + "step": 3344 + }, + { + "epoch": 0.01989366257493577, + "grad_norm": 2.5023481845855713, + "learning_rate": 4.9951220396645504e-05, + "loss": 5.6612, + "step": 3345 + }, + { + "epoch": 0.019899609858216767, + "grad_norm": 2.426457166671753, + "learning_rate": 4.995119122737762e-05, + "loss": 5.767, + "step": 3346 + }, + { + "epoch": 0.019905557141497762, + "grad_norm": 2.4919028282165527, + "learning_rate": 4.995116204939954e-05, + "loss": 6.0578, + "step": 3347 + }, + { + "epoch": 0.01991150442477876, + "grad_norm": 3.099792957305908, + "learning_rate": 4.995113286271126e-05, + "loss": 7.053, + "step": 3348 + }, + { + "epoch": 0.01991745170805976, + "grad_norm": 2.597169876098633, + "learning_rate": 4.9951103667312795e-05, + "loss": 5.8467, + "step": 3349 + }, + { + "epoch": 0.019923398991340754, + "grad_norm": 2.1132469177246094, + "learning_rate": 4.995107446320416e-05, + "loss": 5.7296, + "step": 3350 + }, + { + "epoch": 0.019929346274621753, + "grad_norm": 2.4141721725463867, + "learning_rate": 4.995104525038537e-05, + "loss": 5.8705, + "step": 3351 + }, + { + "epoch": 0.01993529355790275, + "grad_norm": 1.9012199640274048, + "learning_rate": 4.995101602885642e-05, + "loss": 5.8759, + "step": 3352 + }, + { + "epoch": 0.019941240841183747, + "grad_norm": 2.168673038482666, + "learning_rate": 4.9950986798617335e-05, + "loss": 5.8161, + "step": 3353 + }, + { + "epoch": 0.019947188124464745, + "grad_norm": 2.1579155921936035, + "learning_rate": 4.995095755966811e-05, + "loss": 5.8699, + "step": 3354 + }, + { + "epoch": 0.01995313540774574, + "grad_norm": 2.1460800170898438, + "learning_rate": 4.9950928312008774e-05, + "loss": 5.9144, + "step": 3355 + }, + { + "epoch": 0.01995908269102674, + "grad_norm": 2.402167558670044, + "learning_rate": 4.995089905563932e-05, + "loss": 5.8857, + "step": 3356 + }, + { + "epoch": 0.019965029974307737, + "grad_norm": 2.6381726264953613, + "learning_rate": 4.995086979055976e-05, + "loss": 6.0021, + "step": 3357 + }, + { + "epoch": 0.019970977257588732, + "grad_norm": 2.5577943325042725, + "learning_rate": 4.995084051677012e-05, + "loss": 5.9425, + "step": 3358 + }, + { + "epoch": 0.01997692454086973, + "grad_norm": 2.188215494155884, + "learning_rate": 4.995081123427039e-05, + "loss": 6.0656, + "step": 3359 + }, + { + "epoch": 0.01998287182415073, + "grad_norm": 1.8278366327285767, + "learning_rate": 4.9950781943060596e-05, + "loss": 5.8229, + "step": 3360 + }, + { + "epoch": 0.019988819107431725, + "grad_norm": 1.9054077863693237, + "learning_rate": 4.995075264314074e-05, + "loss": 5.8158, + "step": 3361 + }, + { + "epoch": 0.019994766390712723, + "grad_norm": 2.1255416870117188, + "learning_rate": 4.9950723334510826e-05, + "loss": 5.8816, + "step": 3362 + }, + { + "epoch": 0.02000071367399372, + "grad_norm": 2.026923656463623, + "learning_rate": 4.995069401717088e-05, + "loss": 5.7463, + "step": 3363 + }, + { + "epoch": 0.020006660957274717, + "grad_norm": 2.015178680419922, + "learning_rate": 4.9950664691120905e-05, + "loss": 5.6689, + "step": 3364 + }, + { + "epoch": 0.020012608240555715, + "grad_norm": 1.7729417085647583, + "learning_rate": 4.995063535636091e-05, + "loss": 5.701, + "step": 3365 + }, + { + "epoch": 0.02001855552383671, + "grad_norm": 1.9893600940704346, + "learning_rate": 4.9950606012890905e-05, + "loss": 5.7502, + "step": 3366 + }, + { + "epoch": 0.02002450280711771, + "grad_norm": 1.8950870037078857, + "learning_rate": 4.99505766607109e-05, + "loss": 5.6094, + "step": 3367 + }, + { + "epoch": 0.020030450090398704, + "grad_norm": 2.4140830039978027, + "learning_rate": 4.995054729982091e-05, + "loss": 5.8387, + "step": 3368 + }, + { + "epoch": 0.020036397373679703, + "grad_norm": 2.1887669563293457, + "learning_rate": 4.995051793022094e-05, + "loss": 5.7348, + "step": 3369 + }, + { + "epoch": 0.0200423446569607, + "grad_norm": 1.9632731676101685, + "learning_rate": 4.9950488551911e-05, + "loss": 5.5568, + "step": 3370 + }, + { + "epoch": 0.020048291940241696, + "grad_norm": 2.116834878921509, + "learning_rate": 4.995045916489111e-05, + "loss": 5.461, + "step": 3371 + }, + { + "epoch": 0.020054239223522695, + "grad_norm": 2.021256923675537, + "learning_rate": 4.9950429769161266e-05, + "loss": 5.6601, + "step": 3372 + }, + { + "epoch": 0.020060186506803693, + "grad_norm": 2.1648659706115723, + "learning_rate": 4.9950400364721486e-05, + "loss": 5.5364, + "step": 3373 + }, + { + "epoch": 0.02006613379008469, + "grad_norm": 2.043499231338501, + "learning_rate": 4.9950370951571775e-05, + "loss": 5.7273, + "step": 3374 + }, + { + "epoch": 0.020072081073365687, + "grad_norm": 2.296121597290039, + "learning_rate": 4.995034152971215e-05, + "loss": 5.8494, + "step": 3375 + }, + { + "epoch": 0.020078028356646682, + "grad_norm": 2.401031494140625, + "learning_rate": 4.995031209914261e-05, + "loss": 5.719, + "step": 3376 + }, + { + "epoch": 0.02008397563992768, + "grad_norm": 2.3130364418029785, + "learning_rate": 4.995028265986319e-05, + "loss": 5.7998, + "step": 3377 + }, + { + "epoch": 0.02008992292320868, + "grad_norm": 2.3820009231567383, + "learning_rate": 4.9950253211873874e-05, + "loss": 6.0632, + "step": 3378 + }, + { + "epoch": 0.020095870206489674, + "grad_norm": 2.1970956325531006, + "learning_rate": 4.995022375517469e-05, + "loss": 5.9776, + "step": 3379 + }, + { + "epoch": 0.020101817489770673, + "grad_norm": 1.912102460861206, + "learning_rate": 4.995019428976564e-05, + "loss": 5.7194, + "step": 3380 + }, + { + "epoch": 0.02010776477305167, + "grad_norm": 2.3187389373779297, + "learning_rate": 4.995016481564673e-05, + "loss": 6.0225, + "step": 3381 + }, + { + "epoch": 0.020113712056332667, + "grad_norm": 1.959000587463379, + "learning_rate": 4.995013533281797e-05, + "loss": 5.8453, + "step": 3382 + }, + { + "epoch": 0.020119659339613665, + "grad_norm": 2.0283286571502686, + "learning_rate": 4.995010584127938e-05, + "loss": 5.6837, + "step": 3383 + }, + { + "epoch": 0.02012560662289466, + "grad_norm": 2.410351037979126, + "learning_rate": 4.995007634103097e-05, + "loss": 5.8172, + "step": 3384 + }, + { + "epoch": 0.02013155390617566, + "grad_norm": 2.2864298820495605, + "learning_rate": 4.995004683207275e-05, + "loss": 5.8995, + "step": 3385 + }, + { + "epoch": 0.020137501189456657, + "grad_norm": 2.830883026123047, + "learning_rate": 4.995001731440472e-05, + "loss": 5.7273, + "step": 3386 + }, + { + "epoch": 0.020143448472737652, + "grad_norm": 2.486783981323242, + "learning_rate": 4.9949987788026896e-05, + "loss": 5.88, + "step": 3387 + }, + { + "epoch": 0.02014939575601865, + "grad_norm": 2.109975576400757, + "learning_rate": 4.994995825293929e-05, + "loss": 5.8618, + "step": 3388 + }, + { + "epoch": 0.02015534303929965, + "grad_norm": 2.249293327331543, + "learning_rate": 4.994992870914191e-05, + "loss": 5.8511, + "step": 3389 + }, + { + "epoch": 0.020161290322580645, + "grad_norm": 2.5433366298675537, + "learning_rate": 4.9949899156634774e-05, + "loss": 5.7375, + "step": 3390 + }, + { + "epoch": 0.020167237605861643, + "grad_norm": 2.7013652324676514, + "learning_rate": 4.9949869595417876e-05, + "loss": 5.8886, + "step": 3391 + }, + { + "epoch": 0.020173184889142638, + "grad_norm": 2.536972761154175, + "learning_rate": 4.994984002549124e-05, + "loss": 5.4203, + "step": 3392 + }, + { + "epoch": 0.020179132172423637, + "grad_norm": 2.596230983734131, + "learning_rate": 4.9949810446854876e-05, + "loss": 5.7882, + "step": 3393 + }, + { + "epoch": 0.020185079455704635, + "grad_norm": 2.6889936923980713, + "learning_rate": 4.9949780859508786e-05, + "loss": 5.6822, + "step": 3394 + }, + { + "epoch": 0.02019102673898563, + "grad_norm": 2.541027069091797, + "learning_rate": 4.994975126345299e-05, + "loss": 5.7394, + "step": 3395 + }, + { + "epoch": 0.02019697402226663, + "grad_norm": 2.2267251014709473, + "learning_rate": 4.9949721658687485e-05, + "loss": 5.7847, + "step": 3396 + }, + { + "epoch": 0.020202921305547628, + "grad_norm": 2.439689874649048, + "learning_rate": 4.994969204521231e-05, + "loss": 5.6222, + "step": 3397 + }, + { + "epoch": 0.020208868588828623, + "grad_norm": 2.9407742023468018, + "learning_rate": 4.9949662423027434e-05, + "loss": 5.6629, + "step": 3398 + }, + { + "epoch": 0.02021481587210962, + "grad_norm": 2.42802357673645, + "learning_rate": 4.9949632792132894e-05, + "loss": 5.3369, + "step": 3399 + }, + { + "epoch": 0.020220763155390616, + "grad_norm": 2.465508222579956, + "learning_rate": 4.99496031525287e-05, + "loss": 5.3365, + "step": 3400 + }, + { + "epoch": 0.020226710438671615, + "grad_norm": 2.408794403076172, + "learning_rate": 4.9949573504214854e-05, + "loss": 5.3156, + "step": 3401 + }, + { + "epoch": 0.020232657721952613, + "grad_norm": 2.229372978210449, + "learning_rate": 4.9949543847191374e-05, + "loss": 5.9194, + "step": 3402 + }, + { + "epoch": 0.02023860500523361, + "grad_norm": 4.567020416259766, + "learning_rate": 4.9949514181458254e-05, + "loss": 6.3379, + "step": 3403 + }, + { + "epoch": 0.020244552288514607, + "grad_norm": 3.9927520751953125, + "learning_rate": 4.9949484507015534e-05, + "loss": 6.3351, + "step": 3404 + }, + { + "epoch": 0.020250499571795602, + "grad_norm": 2.4830081462860107, + "learning_rate": 4.9949454823863195e-05, + "loss": 6.4046, + "step": 3405 + }, + { + "epoch": 0.0202564468550766, + "grad_norm": 2.282722234725952, + "learning_rate": 4.994942513200126e-05, + "loss": 6.5473, + "step": 3406 + }, + { + "epoch": 0.0202623941383576, + "grad_norm": 2.411367416381836, + "learning_rate": 4.994939543142973e-05, + "loss": 5.7898, + "step": 3407 + }, + { + "epoch": 0.020268341421638594, + "grad_norm": 3.2052342891693115, + "learning_rate": 4.994936572214864e-05, + "loss": 5.6695, + "step": 3408 + }, + { + "epoch": 0.020274288704919593, + "grad_norm": 4.142974853515625, + "learning_rate": 4.994933600415798e-05, + "loss": 6.2037, + "step": 3409 + }, + { + "epoch": 0.02028023598820059, + "grad_norm": 2.839066982269287, + "learning_rate": 4.994930627745776e-05, + "loss": 6.7308, + "step": 3410 + }, + { + "epoch": 0.020286183271481587, + "grad_norm": 3.3138885498046875, + "learning_rate": 4.9949276542048e-05, + "loss": 5.8873, + "step": 3411 + }, + { + "epoch": 0.020292130554762585, + "grad_norm": 2.6651928424835205, + "learning_rate": 4.9949246797928704e-05, + "loss": 6.6325, + "step": 3412 + }, + { + "epoch": 0.02029807783804358, + "grad_norm": 2.919436454772949, + "learning_rate": 4.994921704509988e-05, + "loss": 6.3239, + "step": 3413 + }, + { + "epoch": 0.02030402512132458, + "grad_norm": 2.6901097297668457, + "learning_rate": 4.994918728356155e-05, + "loss": 6.1712, + "step": 3414 + }, + { + "epoch": 0.020309972404605577, + "grad_norm": 2.573249340057373, + "learning_rate": 4.9949157513313704e-05, + "loss": 5.8194, + "step": 3415 + }, + { + "epoch": 0.020315919687886572, + "grad_norm": 3.0603950023651123, + "learning_rate": 4.994912773435637e-05, + "loss": 6.3881, + "step": 3416 + }, + { + "epoch": 0.02032186697116757, + "grad_norm": 3.1800057888031006, + "learning_rate": 4.994909794668956e-05, + "loss": 5.9486, + "step": 3417 + }, + { + "epoch": 0.02032781425444857, + "grad_norm": 2.537182092666626, + "learning_rate": 4.994906815031327e-05, + "loss": 6.5454, + "step": 3418 + }, + { + "epoch": 0.020333761537729565, + "grad_norm": 2.474705457687378, + "learning_rate": 4.9949038345227525e-05, + "loss": 6.5356, + "step": 3419 + }, + { + "epoch": 0.020339708821010563, + "grad_norm": 3.054689645767212, + "learning_rate": 4.994900853143232e-05, + "loss": 6.4526, + "step": 3420 + }, + { + "epoch": 0.020345656104291558, + "grad_norm": 2.587644100189209, + "learning_rate": 4.994897870892769e-05, + "loss": 6.2811, + "step": 3421 + }, + { + "epoch": 0.020351603387572557, + "grad_norm": 2.110041618347168, + "learning_rate": 4.994894887771361e-05, + "loss": 6.0428, + "step": 3422 + }, + { + "epoch": 0.020357550670853555, + "grad_norm": 2.4931492805480957, + "learning_rate": 4.9948919037790115e-05, + "loss": 6.3683, + "step": 3423 + }, + { + "epoch": 0.02036349795413455, + "grad_norm": 2.7169463634490967, + "learning_rate": 4.994888918915721e-05, + "loss": 6.5335, + "step": 3424 + }, + { + "epoch": 0.02036944523741555, + "grad_norm": 2.164363145828247, + "learning_rate": 4.994885933181491e-05, + "loss": 6.0409, + "step": 3425 + }, + { + "epoch": 0.020375392520696547, + "grad_norm": 2.480468273162842, + "learning_rate": 4.994882946576322e-05, + "loss": 5.8816, + "step": 3426 + }, + { + "epoch": 0.020381339803977543, + "grad_norm": 2.928361415863037, + "learning_rate": 4.994879959100215e-05, + "loss": 6.1706, + "step": 3427 + }, + { + "epoch": 0.02038728708725854, + "grad_norm": 2.1536660194396973, + "learning_rate": 4.994876970753171e-05, + "loss": 6.0559, + "step": 3428 + }, + { + "epoch": 0.020393234370539536, + "grad_norm": 2.6913530826568604, + "learning_rate": 4.994873981535192e-05, + "loss": 6.7411, + "step": 3429 + }, + { + "epoch": 0.020399181653820535, + "grad_norm": 2.647124767303467, + "learning_rate": 4.994870991446278e-05, + "loss": 6.5251, + "step": 3430 + }, + { + "epoch": 0.020405128937101533, + "grad_norm": 2.621612310409546, + "learning_rate": 4.994868000486429e-05, + "loss": 6.7029, + "step": 3431 + }, + { + "epoch": 0.02041107622038253, + "grad_norm": 2.1986844539642334, + "learning_rate": 4.994865008655649e-05, + "loss": 6.4561, + "step": 3432 + }, + { + "epoch": 0.020417023503663527, + "grad_norm": 2.706897735595703, + "learning_rate": 4.994862015953936e-05, + "loss": 6.3125, + "step": 3433 + }, + { + "epoch": 0.020422970786944522, + "grad_norm": 2.403346300125122, + "learning_rate": 4.994859022381294e-05, + "loss": 6.0808, + "step": 3434 + }, + { + "epoch": 0.02042891807022552, + "grad_norm": 2.367835521697998, + "learning_rate": 4.994856027937722e-05, + "loss": 6.2634, + "step": 3435 + }, + { + "epoch": 0.02043486535350652, + "grad_norm": 2.8564250469207764, + "learning_rate": 4.9948530326232205e-05, + "loss": 6.579, + "step": 3436 + }, + { + "epoch": 0.020440812636787514, + "grad_norm": 2.9472100734710693, + "learning_rate": 4.9948500364377925e-05, + "loss": 6.3873, + "step": 3437 + }, + { + "epoch": 0.020446759920068513, + "grad_norm": 2.3005917072296143, + "learning_rate": 4.994847039381438e-05, + "loss": 6.2316, + "step": 3438 + }, + { + "epoch": 0.02045270720334951, + "grad_norm": 2.0548787117004395, + "learning_rate": 4.9948440414541584e-05, + "loss": 6.5022, + "step": 3439 + }, + { + "epoch": 0.020458654486630506, + "grad_norm": 2.1332197189331055, + "learning_rate": 4.9948410426559536e-05, + "loss": 6.1486, + "step": 3440 + }, + { + "epoch": 0.020464601769911505, + "grad_norm": 2.112738847732544, + "learning_rate": 4.994838042986827e-05, + "loss": 5.9125, + "step": 3441 + }, + { + "epoch": 0.0204705490531925, + "grad_norm": 2.714627981185913, + "learning_rate": 4.9948350424467774e-05, + "loss": 6.1164, + "step": 3442 + }, + { + "epoch": 0.0204764963364735, + "grad_norm": 2.337571382522583, + "learning_rate": 4.994832041035806e-05, + "loss": 6.0567, + "step": 3443 + }, + { + "epoch": 0.020482443619754497, + "grad_norm": 2.354389190673828, + "learning_rate": 4.994829038753915e-05, + "loss": 5.5922, + "step": 3444 + }, + { + "epoch": 0.020488390903035492, + "grad_norm": 2.3885531425476074, + "learning_rate": 4.994826035601106e-05, + "loss": 6.4178, + "step": 3445 + }, + { + "epoch": 0.02049433818631649, + "grad_norm": 2.931328058242798, + "learning_rate": 4.994823031577378e-05, + "loss": 6.356, + "step": 3446 + }, + { + "epoch": 0.02050028546959749, + "grad_norm": 2.4858877658843994, + "learning_rate": 4.994820026682733e-05, + "loss": 6.0601, + "step": 3447 + }, + { + "epoch": 0.020506232752878484, + "grad_norm": 2.626811981201172, + "learning_rate": 4.9948170209171725e-05, + "loss": 6.4372, + "step": 3448 + }, + { + "epoch": 0.020512180036159483, + "grad_norm": 2.2917356491088867, + "learning_rate": 4.994814014280696e-05, + "loss": 5.9828, + "step": 3449 + }, + { + "epoch": 0.020518127319440478, + "grad_norm": 2.174531936645508, + "learning_rate": 4.9948110067733075e-05, + "loss": 6.3382, + "step": 3450 + }, + { + "epoch": 0.020524074602721477, + "grad_norm": 2.9880006313323975, + "learning_rate": 4.994807998395005e-05, + "loss": 6.7493, + "step": 3451 + }, + { + "epoch": 0.020530021886002475, + "grad_norm": 2.6577212810516357, + "learning_rate": 4.994804989145792e-05, + "loss": 6.853, + "step": 3452 + }, + { + "epoch": 0.02053596916928347, + "grad_norm": 2.8832437992095947, + "learning_rate": 4.994801979025667e-05, + "loss": 6.5829, + "step": 3453 + }, + { + "epoch": 0.02054191645256447, + "grad_norm": 2.473177194595337, + "learning_rate": 4.994798968034633e-05, + "loss": 6.2879, + "step": 3454 + }, + { + "epoch": 0.020547863735845467, + "grad_norm": 2.7484633922576904, + "learning_rate": 4.994795956172691e-05, + "loss": 6.2037, + "step": 3455 + }, + { + "epoch": 0.020553811019126463, + "grad_norm": 1.6647555828094482, + "learning_rate": 4.9947929434398403e-05, + "loss": 6.5639, + "step": 3456 + }, + { + "epoch": 0.02055975830240746, + "grad_norm": 3.71087908744812, + "learning_rate": 4.994789929836084e-05, + "loss": 6.8464, + "step": 3457 + }, + { + "epoch": 0.020565705585688456, + "grad_norm": 2.705892324447632, + "learning_rate": 4.994786915361422e-05, + "loss": 6.8316, + "step": 3458 + }, + { + "epoch": 0.020571652868969455, + "grad_norm": 2.3619437217712402, + "learning_rate": 4.994783900015856e-05, + "loss": 6.3441, + "step": 3459 + }, + { + "epoch": 0.020577600152250453, + "grad_norm": 2.490499258041382, + "learning_rate": 4.9947808837993864e-05, + "loss": 6.1467, + "step": 3460 + }, + { + "epoch": 0.02058354743553145, + "grad_norm": 2.546614170074463, + "learning_rate": 4.994777866712015e-05, + "loss": 5.6677, + "step": 3461 + }, + { + "epoch": 0.020589494718812447, + "grad_norm": 2.473695755004883, + "learning_rate": 4.994774848753741e-05, + "loss": 5.7815, + "step": 3462 + }, + { + "epoch": 0.020595442002093442, + "grad_norm": 2.0494625568389893, + "learning_rate": 4.994771829924569e-05, + "loss": 5.674, + "step": 3463 + }, + { + "epoch": 0.02060138928537444, + "grad_norm": 2.1504273414611816, + "learning_rate": 4.9947688102244964e-05, + "loss": 5.5299, + "step": 3464 + }, + { + "epoch": 0.02060733656865544, + "grad_norm": 2.908170700073242, + "learning_rate": 4.994765789653526e-05, + "loss": 5.8448, + "step": 3465 + }, + { + "epoch": 0.020613283851936434, + "grad_norm": 3.1434714794158936, + "learning_rate": 4.994762768211659e-05, + "loss": 5.8413, + "step": 3466 + }, + { + "epoch": 0.020619231135217433, + "grad_norm": 2.4688189029693604, + "learning_rate": 4.994759745898896e-05, + "loss": 5.6458, + "step": 3467 + }, + { + "epoch": 0.02062517841849843, + "grad_norm": 2.172083854675293, + "learning_rate": 4.994756722715238e-05, + "loss": 5.723, + "step": 3468 + }, + { + "epoch": 0.020631125701779426, + "grad_norm": 2.0702707767486572, + "learning_rate": 4.994753698660687e-05, + "loss": 5.6199, + "step": 3469 + }, + { + "epoch": 0.020637072985060425, + "grad_norm": 2.2142136096954346, + "learning_rate": 4.9947506737352425e-05, + "loss": 5.5476, + "step": 3470 + }, + { + "epoch": 0.02064302026834142, + "grad_norm": 2.156874179840088, + "learning_rate": 4.994747647938907e-05, + "loss": 5.4773, + "step": 3471 + }, + { + "epoch": 0.02064896755162242, + "grad_norm": 3.3683371543884277, + "learning_rate": 4.9947446212716795e-05, + "loss": 6.4804, + "step": 3472 + }, + { + "epoch": 0.020654914834903417, + "grad_norm": 2.2435977458953857, + "learning_rate": 4.9947415937335635e-05, + "loss": 6.0622, + "step": 3473 + }, + { + "epoch": 0.020660862118184412, + "grad_norm": 3.0824263095855713, + "learning_rate": 4.994738565324558e-05, + "loss": 6.8809, + "step": 3474 + }, + { + "epoch": 0.02066680940146541, + "grad_norm": 2.6978909969329834, + "learning_rate": 4.9947355360446664e-05, + "loss": 6.823, + "step": 3475 + }, + { + "epoch": 0.02067275668474641, + "grad_norm": 3.041680097579956, + "learning_rate": 4.9947325058938874e-05, + "loss": 6.4268, + "step": 3476 + }, + { + "epoch": 0.020678703968027404, + "grad_norm": 3.5326781272888184, + "learning_rate": 4.9947294748722237e-05, + "loss": 6.3516, + "step": 3477 + }, + { + "epoch": 0.020684651251308403, + "grad_norm": 2.7611732482910156, + "learning_rate": 4.994726442979675e-05, + "loss": 6.2206, + "step": 3478 + }, + { + "epoch": 0.020690598534589398, + "grad_norm": 3.8533458709716797, + "learning_rate": 4.994723410216244e-05, + "loss": 6.7907, + "step": 3479 + }, + { + "epoch": 0.020696545817870397, + "grad_norm": 2.8091351985931396, + "learning_rate": 4.99472037658193e-05, + "loss": 6.7468, + "step": 3480 + }, + { + "epoch": 0.020702493101151395, + "grad_norm": 2.4317073822021484, + "learning_rate": 4.994717342076736e-05, + "loss": 6.4682, + "step": 3481 + }, + { + "epoch": 0.02070844038443239, + "grad_norm": 2.5132029056549072, + "learning_rate": 4.994714306700661e-05, + "loss": 6.1966, + "step": 3482 + }, + { + "epoch": 0.02071438766771339, + "grad_norm": 2.8161535263061523, + "learning_rate": 4.994711270453707e-05, + "loss": 5.6045, + "step": 3483 + }, + { + "epoch": 0.020720334950994387, + "grad_norm": 2.654115915298462, + "learning_rate": 4.994708233335875e-05, + "loss": 5.8983, + "step": 3484 + }, + { + "epoch": 0.020726282234275382, + "grad_norm": 2.5971553325653076, + "learning_rate": 4.9947051953471664e-05, + "loss": 5.4422, + "step": 3485 + }, + { + "epoch": 0.02073222951755638, + "grad_norm": 2.5758557319641113, + "learning_rate": 4.9947021564875816e-05, + "loss": 5.5921, + "step": 3486 + }, + { + "epoch": 0.020738176800837376, + "grad_norm": 2.635345458984375, + "learning_rate": 4.994699116757122e-05, + "loss": 6.2316, + "step": 3487 + }, + { + "epoch": 0.020744124084118375, + "grad_norm": 2.573514938354492, + "learning_rate": 4.9946960761557896e-05, + "loss": 6.5069, + "step": 3488 + }, + { + "epoch": 0.020750071367399373, + "grad_norm": 2.587735176086426, + "learning_rate": 4.994693034683584e-05, + "loss": 5.9114, + "step": 3489 + }, + { + "epoch": 0.02075601865068037, + "grad_norm": 2.4980244636535645, + "learning_rate": 4.9946899923405075e-05, + "loss": 6.1805, + "step": 3490 + }, + { + "epoch": 0.020761965933961367, + "grad_norm": 2.614003896713257, + "learning_rate": 4.9946869491265594e-05, + "loss": 6.2294, + "step": 3491 + }, + { + "epoch": 0.020767913217242365, + "grad_norm": 3.3819997310638428, + "learning_rate": 4.994683905041743e-05, + "loss": 5.4716, + "step": 3492 + }, + { + "epoch": 0.02077386050052336, + "grad_norm": 3.168170213699341, + "learning_rate": 4.994680860086057e-05, + "loss": 5.4041, + "step": 3493 + }, + { + "epoch": 0.02077980778380436, + "grad_norm": 3.05253267288208, + "learning_rate": 4.994677814259504e-05, + "loss": 5.4958, + "step": 3494 + }, + { + "epoch": 0.020785755067085354, + "grad_norm": 2.8560431003570557, + "learning_rate": 4.994674767562085e-05, + "loss": 5.4153, + "step": 3495 + }, + { + "epoch": 0.020791702350366353, + "grad_norm": 2.790382146835327, + "learning_rate": 4.994671719993801e-05, + "loss": 6.3581, + "step": 3496 + }, + { + "epoch": 0.02079764963364735, + "grad_norm": 2.9860496520996094, + "learning_rate": 4.9946686715546535e-05, + "loss": 6.5779, + "step": 3497 + }, + { + "epoch": 0.020803596916928346, + "grad_norm": 2.744859457015991, + "learning_rate": 4.994665622244642e-05, + "loss": 6.5748, + "step": 3498 + }, + { + "epoch": 0.020809544200209345, + "grad_norm": 2.7951292991638184, + "learning_rate": 4.9946625720637683e-05, + "loss": 6.1954, + "step": 3499 + }, + { + "epoch": 0.02081549148349034, + "grad_norm": 3.2961854934692383, + "learning_rate": 4.994659521012034e-05, + "loss": 6.243, + "step": 3500 + }, + { + "epoch": 0.02082143876677134, + "grad_norm": 2.934246301651001, + "learning_rate": 4.99465646908944e-05, + "loss": 6.1307, + "step": 3501 + }, + { + "epoch": 0.020827386050052337, + "grad_norm": 3.9152729511260986, + "learning_rate": 4.994653416295987e-05, + "loss": 6.0167, + "step": 3502 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 4.510169506072998, + "learning_rate": 4.994650362631676e-05, + "loss": 6.533, + "step": 3503 + }, + { + "epoch": 0.02083928061661433, + "grad_norm": 3.415665864944458, + "learning_rate": 4.994647308096509e-05, + "loss": 6.4978, + "step": 3504 + }, + { + "epoch": 0.02084522789989533, + "grad_norm": 2.6515185832977295, + "learning_rate": 4.9946442526904856e-05, + "loss": 6.3859, + "step": 3505 + }, + { + "epoch": 0.020851175183176324, + "grad_norm": 2.8215248584747314, + "learning_rate": 4.994641196413609e-05, + "loss": 6.243, + "step": 3506 + }, + { + "epoch": 0.020857122466457323, + "grad_norm": 2.644529104232788, + "learning_rate": 4.9946381392658773e-05, + "loss": 6.2954, + "step": 3507 + }, + { + "epoch": 0.020863069749738318, + "grad_norm": 3.349699020385742, + "learning_rate": 4.994635081247294e-05, + "loss": 6.5617, + "step": 3508 + }, + { + "epoch": 0.020869017033019317, + "grad_norm": 3.3669090270996094, + "learning_rate": 4.9946320223578596e-05, + "loss": 6.6458, + "step": 3509 + }, + { + "epoch": 0.020874964316300315, + "grad_norm": 2.5562078952789307, + "learning_rate": 4.994628962597575e-05, + "loss": 5.5041, + "step": 3510 + }, + { + "epoch": 0.02088091159958131, + "grad_norm": 2.851809501647949, + "learning_rate": 4.994625901966441e-05, + "loss": 5.4607, + "step": 3511 + }, + { + "epoch": 0.02088685888286231, + "grad_norm": 3.2769458293914795, + "learning_rate": 4.994622840464458e-05, + "loss": 5.3115, + "step": 3512 + }, + { + "epoch": 0.020892806166143307, + "grad_norm": 2.5495102405548096, + "learning_rate": 4.994619778091629e-05, + "loss": 5.9997, + "step": 3513 + }, + { + "epoch": 0.020898753449424302, + "grad_norm": 2.609463930130005, + "learning_rate": 4.994616714847954e-05, + "loss": 6.562, + "step": 3514 + }, + { + "epoch": 0.0209047007327053, + "grad_norm": 2.5731685161590576, + "learning_rate": 4.994613650733433e-05, + "loss": 6.5341, + "step": 3515 + }, + { + "epoch": 0.020910648015986296, + "grad_norm": 2.481297254562378, + "learning_rate": 4.99461058574807e-05, + "loss": 6.5878, + "step": 3516 + }, + { + "epoch": 0.020916595299267295, + "grad_norm": 2.4096593856811523, + "learning_rate": 4.9946075198918624e-05, + "loss": 6.5054, + "step": 3517 + }, + { + "epoch": 0.020922542582548293, + "grad_norm": 2.4417459964752197, + "learning_rate": 4.994604453164814e-05, + "loss": 6.3292, + "step": 3518 + }, + { + "epoch": 0.020928489865829288, + "grad_norm": 2.7062435150146484, + "learning_rate": 4.994601385566925e-05, + "loss": 5.564, + "step": 3519 + }, + { + "epoch": 0.020934437149110287, + "grad_norm": 2.613614559173584, + "learning_rate": 4.9945983170981955e-05, + "loss": 5.3929, + "step": 3520 + }, + { + "epoch": 0.020940384432391285, + "grad_norm": 2.4933719635009766, + "learning_rate": 4.994595247758629e-05, + "loss": 6.1841, + "step": 3521 + }, + { + "epoch": 0.02094633171567228, + "grad_norm": 2.251507043838501, + "learning_rate": 4.994592177548224e-05, + "loss": 6.3109, + "step": 3522 + }, + { + "epoch": 0.02095227899895328, + "grad_norm": 2.3830223083496094, + "learning_rate": 4.994589106466983e-05, + "loss": 5.9421, + "step": 3523 + }, + { + "epoch": 0.020958226282234274, + "grad_norm": 2.2940196990966797, + "learning_rate": 4.994586034514906e-05, + "loss": 6.0858, + "step": 3524 + }, + { + "epoch": 0.020964173565515273, + "grad_norm": 2.916836977005005, + "learning_rate": 4.994582961691996e-05, + "loss": 5.166, + "step": 3525 + }, + { + "epoch": 0.02097012084879627, + "grad_norm": 2.7183029651641846, + "learning_rate": 4.994579887998252e-05, + "loss": 6.9732, + "step": 3526 + }, + { + "epoch": 0.020976068132077266, + "grad_norm": 2.70143985748291, + "learning_rate": 4.994576813433676e-05, + "loss": 5.917, + "step": 3527 + }, + { + "epoch": 0.020982015415358265, + "grad_norm": 2.7375986576080322, + "learning_rate": 4.994573737998269e-05, + "loss": 5.3025, + "step": 3528 + }, + { + "epoch": 0.02098796269863926, + "grad_norm": 2.656982183456421, + "learning_rate": 4.994570661692033e-05, + "loss": 5.2383, + "step": 3529 + }, + { + "epoch": 0.02099390998192026, + "grad_norm": 2.2119734287261963, + "learning_rate": 4.994567584514968e-05, + "loss": 6.0456, + "step": 3530 + }, + { + "epoch": 0.020999857265201257, + "grad_norm": 2.9191582202911377, + "learning_rate": 4.9945645064670737e-05, + "loss": 6.3808, + "step": 3531 + }, + { + "epoch": 0.021005804548482252, + "grad_norm": 3.124101400375366, + "learning_rate": 4.994561427548354e-05, + "loss": 5.3631, + "step": 3532 + }, + { + "epoch": 0.02101175183176325, + "grad_norm": 2.803938150405884, + "learning_rate": 4.994558347758808e-05, + "loss": 5.3172, + "step": 3533 + }, + { + "epoch": 0.02101769911504425, + "grad_norm": 2.6231577396392822, + "learning_rate": 4.994555267098438e-05, + "loss": 6.4466, + "step": 3534 + }, + { + "epoch": 0.021023646398325244, + "grad_norm": 2.735590696334839, + "learning_rate": 4.994552185567244e-05, + "loss": 5.3115, + "step": 3535 + }, + { + "epoch": 0.021029593681606243, + "grad_norm": 2.730459690093994, + "learning_rate": 4.994549103165228e-05, + "loss": 5.2311, + "step": 3536 + }, + { + "epoch": 0.021035540964887238, + "grad_norm": 2.1241424083709717, + "learning_rate": 4.994546019892391e-05, + "loss": 5.6599, + "step": 3537 + }, + { + "epoch": 0.021041488248168237, + "grad_norm": 2.607807159423828, + "learning_rate": 4.994542935748733e-05, + "loss": 6.1182, + "step": 3538 + }, + { + "epoch": 0.021047435531449235, + "grad_norm": 2.6896564960479736, + "learning_rate": 4.9945398507342567e-05, + "loss": 6.2827, + "step": 3539 + }, + { + "epoch": 0.02105338281473023, + "grad_norm": 2.9237961769104004, + "learning_rate": 4.994536764848962e-05, + "loss": 5.9629, + "step": 3540 + }, + { + "epoch": 0.02105933009801123, + "grad_norm": 2.7576143741607666, + "learning_rate": 4.99453367809285e-05, + "loss": 5.7612, + "step": 3541 + }, + { + "epoch": 0.021065277381292227, + "grad_norm": 3.1622097492218018, + "learning_rate": 4.9945305904659226e-05, + "loss": 6.0415, + "step": 3542 + }, + { + "epoch": 0.021071224664573222, + "grad_norm": 2.471127510070801, + "learning_rate": 4.994527501968179e-05, + "loss": 6.1264, + "step": 3543 + }, + { + "epoch": 0.02107717194785422, + "grad_norm": 2.797504425048828, + "learning_rate": 4.994524412599623e-05, + "loss": 6.3515, + "step": 3544 + }, + { + "epoch": 0.021083119231135216, + "grad_norm": 2.4932103157043457, + "learning_rate": 4.9945213223602535e-05, + "loss": 6.4327, + "step": 3545 + }, + { + "epoch": 0.021089066514416215, + "grad_norm": 2.5194599628448486, + "learning_rate": 4.9945182312500725e-05, + "loss": 6.4003, + "step": 3546 + }, + { + "epoch": 0.021095013797697213, + "grad_norm": 2.287858247756958, + "learning_rate": 4.9945151392690814e-05, + "loss": 6.3287, + "step": 3547 + }, + { + "epoch": 0.021100961080978208, + "grad_norm": 2.941619873046875, + "learning_rate": 4.994512046417281e-05, + "loss": 6.1364, + "step": 3548 + }, + { + "epoch": 0.021106908364259207, + "grad_norm": 3.1448967456817627, + "learning_rate": 4.994508952694672e-05, + "loss": 5.8638, + "step": 3549 + }, + { + "epoch": 0.021112855647540205, + "grad_norm": 2.869966983795166, + "learning_rate": 4.994505858101255e-05, + "loss": 6.0122, + "step": 3550 + }, + { + "epoch": 0.0211188029308212, + "grad_norm": 2.421264886856079, + "learning_rate": 4.9945027626370325e-05, + "loss": 6.1243, + "step": 3551 + }, + { + "epoch": 0.0211247502141022, + "grad_norm": 2.599456310272217, + "learning_rate": 4.9944996663020047e-05, + "loss": 5.9484, + "step": 3552 + }, + { + "epoch": 0.021130697497383194, + "grad_norm": 3.1029574871063232, + "learning_rate": 4.994496569096173e-05, + "loss": 5.9347, + "step": 3553 + }, + { + "epoch": 0.021136644780664193, + "grad_norm": 3.02494478225708, + "learning_rate": 4.994493471019538e-05, + "loss": 5.814, + "step": 3554 + }, + { + "epoch": 0.02114259206394519, + "grad_norm": 2.359682559967041, + "learning_rate": 4.994490372072101e-05, + "loss": 5.8533, + "step": 3555 + }, + { + "epoch": 0.021148539347226186, + "grad_norm": 2.7072582244873047, + "learning_rate": 4.994487272253864e-05, + "loss": 5.855, + "step": 3556 + }, + { + "epoch": 0.021154486630507185, + "grad_norm": 2.3102664947509766, + "learning_rate": 4.994484171564826e-05, + "loss": 5.6701, + "step": 3557 + }, + { + "epoch": 0.02116043391378818, + "grad_norm": 2.3804259300231934, + "learning_rate": 4.9944810700049906e-05, + "loss": 5.5096, + "step": 3558 + }, + { + "epoch": 0.02116638119706918, + "grad_norm": 2.463280439376831, + "learning_rate": 4.994477967574357e-05, + "loss": 5.5178, + "step": 3559 + }, + { + "epoch": 0.021172328480350177, + "grad_norm": 2.884152412414551, + "learning_rate": 4.9944748642729265e-05, + "loss": 6.1013, + "step": 3560 + }, + { + "epoch": 0.021178275763631172, + "grad_norm": 3.009460210800171, + "learning_rate": 4.9944717601007006e-05, + "loss": 6.2725, + "step": 3561 + }, + { + "epoch": 0.02118422304691217, + "grad_norm": 2.5930371284484863, + "learning_rate": 4.9944686550576814e-05, + "loss": 6.1138, + "step": 3562 + }, + { + "epoch": 0.02119017033019317, + "grad_norm": 2.8212878704071045, + "learning_rate": 4.9944655491438684e-05, + "loss": 5.6209, + "step": 3563 + }, + { + "epoch": 0.021196117613474164, + "grad_norm": 2.9814743995666504, + "learning_rate": 4.9944624423592634e-05, + "loss": 5.8912, + "step": 3564 + }, + { + "epoch": 0.021202064896755163, + "grad_norm": 3.1456093788146973, + "learning_rate": 4.994459334703867e-05, + "loss": 5.961, + "step": 3565 + }, + { + "epoch": 0.021208012180036158, + "grad_norm": 2.9300050735473633, + "learning_rate": 4.9944562261776805e-05, + "loss": 6.773, + "step": 3566 + }, + { + "epoch": 0.021213959463317156, + "grad_norm": 2.570685625076294, + "learning_rate": 4.994453116780705e-05, + "loss": 6.3575, + "step": 3567 + }, + { + "epoch": 0.021219906746598155, + "grad_norm": 2.7060914039611816, + "learning_rate": 4.994450006512943e-05, + "loss": 6.249, + "step": 3568 + }, + { + "epoch": 0.02122585402987915, + "grad_norm": 3.0027518272399902, + "learning_rate": 4.994446895374393e-05, + "loss": 5.8243, + "step": 3569 + }, + { + "epoch": 0.02123180131316015, + "grad_norm": 2.785888195037842, + "learning_rate": 4.994443783365058e-05, + "loss": 5.9836, + "step": 3570 + }, + { + "epoch": 0.021237748596441147, + "grad_norm": 2.5480010509490967, + "learning_rate": 4.994440670484938e-05, + "loss": 6.4237, + "step": 3571 + }, + { + "epoch": 0.021243695879722142, + "grad_norm": 2.687121629714966, + "learning_rate": 4.9944375567340345e-05, + "loss": 6.4497, + "step": 3572 + }, + { + "epoch": 0.02124964316300314, + "grad_norm": 2.6066362857818604, + "learning_rate": 4.994434442112349e-05, + "loss": 6.3853, + "step": 3573 + }, + { + "epoch": 0.021255590446284136, + "grad_norm": 2.880352020263672, + "learning_rate": 4.994431326619882e-05, + "loss": 6.382, + "step": 3574 + }, + { + "epoch": 0.021261537729565134, + "grad_norm": 3.0415213108062744, + "learning_rate": 4.9944282102566345e-05, + "loss": 6.4472, + "step": 3575 + }, + { + "epoch": 0.021267485012846133, + "grad_norm": 2.4917140007019043, + "learning_rate": 4.994425093022609e-05, + "loss": 6.2546, + "step": 3576 + }, + { + "epoch": 0.021273432296127128, + "grad_norm": 2.53648042678833, + "learning_rate": 4.9944219749178044e-05, + "loss": 6.37, + "step": 3577 + }, + { + "epoch": 0.021279379579408127, + "grad_norm": 2.796342134475708, + "learning_rate": 4.994418855942223e-05, + "loss": 6.1691, + "step": 3578 + }, + { + "epoch": 0.021285326862689125, + "grad_norm": 2.9148125648498535, + "learning_rate": 4.9944157360958656e-05, + "loss": 6.2552, + "step": 3579 + }, + { + "epoch": 0.02129127414597012, + "grad_norm": 3.0777838230133057, + "learning_rate": 4.994412615378734e-05, + "loss": 6.2359, + "step": 3580 + }, + { + "epoch": 0.02129722142925112, + "grad_norm": 2.5878093242645264, + "learning_rate": 4.994409493790828e-05, + "loss": 6.0746, + "step": 3581 + }, + { + "epoch": 0.021303168712532114, + "grad_norm": 3.2084906101226807, + "learning_rate": 4.99440637133215e-05, + "loss": 6.1357, + "step": 3582 + }, + { + "epoch": 0.021309115995813113, + "grad_norm": 3.7210965156555176, + "learning_rate": 4.9944032480027004e-05, + "loss": 6.5117, + "step": 3583 + }, + { + "epoch": 0.02131506327909411, + "grad_norm": 2.8332109451293945, + "learning_rate": 4.994400123802481e-05, + "loss": 6.0908, + "step": 3584 + }, + { + "epoch": 0.021321010562375106, + "grad_norm": 2.83854341506958, + "learning_rate": 4.994396998731491e-05, + "loss": 6.1522, + "step": 3585 + }, + { + "epoch": 0.021326957845656105, + "grad_norm": 2.5171611309051514, + "learning_rate": 4.9943938727897335e-05, + "loss": 6.2253, + "step": 3586 + }, + { + "epoch": 0.0213329051289371, + "grad_norm": 2.2111763954162598, + "learning_rate": 4.9943907459772086e-05, + "loss": 5.7673, + "step": 3587 + }, + { + "epoch": 0.0213388524122181, + "grad_norm": 2.5147926807403564, + "learning_rate": 4.994387618293918e-05, + "loss": 6.8327, + "step": 3588 + }, + { + "epoch": 0.021344799695499097, + "grad_norm": 2.969285488128662, + "learning_rate": 4.9943844897398626e-05, + "loss": 6.9995, + "step": 3589 + }, + { + "epoch": 0.021350746978780092, + "grad_norm": 4.00917911529541, + "learning_rate": 4.994381360315043e-05, + "loss": 6.6377, + "step": 3590 + }, + { + "epoch": 0.02135669426206109, + "grad_norm": 3.899319887161255, + "learning_rate": 4.994378230019461e-05, + "loss": 6.162, + "step": 3591 + }, + { + "epoch": 0.02136264154534209, + "grad_norm": 2.9522764682769775, + "learning_rate": 4.994375098853117e-05, + "loss": 6.4405, + "step": 3592 + }, + { + "epoch": 0.021368588828623084, + "grad_norm": 3.0569825172424316, + "learning_rate": 4.994371966816012e-05, + "loss": 6.2631, + "step": 3593 + }, + { + "epoch": 0.021374536111904083, + "grad_norm": 2.9470009803771973, + "learning_rate": 4.994368833908148e-05, + "loss": 6.4785, + "step": 3594 + }, + { + "epoch": 0.021380483395185078, + "grad_norm": 2.913940668106079, + "learning_rate": 4.994365700129525e-05, + "loss": 6.6566, + "step": 3595 + }, + { + "epoch": 0.021386430678466076, + "grad_norm": 2.6037404537200928, + "learning_rate": 4.9943625654801465e-05, + "loss": 6.2535, + "step": 3596 + }, + { + "epoch": 0.021392377961747075, + "grad_norm": 2.998276948928833, + "learning_rate": 4.99435942996001e-05, + "loss": 6.8851, + "step": 3597 + }, + { + "epoch": 0.02139832524502807, + "grad_norm": 2.2189996242523193, + "learning_rate": 4.994356293569119e-05, + "loss": 6.8707, + "step": 3598 + }, + { + "epoch": 0.02140427252830907, + "grad_norm": 2.4528486728668213, + "learning_rate": 4.994353156307474e-05, + "loss": 6.9166, + "step": 3599 + }, + { + "epoch": 0.021410219811590067, + "grad_norm": 3.0538241863250732, + "learning_rate": 4.994350018175076e-05, + "loss": 6.3258, + "step": 3600 + }, + { + "epoch": 0.021416167094871062, + "grad_norm": 3.789745569229126, + "learning_rate": 4.994346879171926e-05, + "loss": 6.1962, + "step": 3601 + }, + { + "epoch": 0.02142211437815206, + "grad_norm": 3.2789254188537598, + "learning_rate": 4.994343739298025e-05, + "loss": 6.2126, + "step": 3602 + }, + { + "epoch": 0.021428061661433056, + "grad_norm": 3.0887696743011475, + "learning_rate": 4.994340598553375e-05, + "loss": 6.2395, + "step": 3603 + }, + { + "epoch": 0.021434008944714054, + "grad_norm": 2.9189252853393555, + "learning_rate": 4.994337456937977e-05, + "loss": 6.193, + "step": 3604 + }, + { + "epoch": 0.021439956227995053, + "grad_norm": 2.8582170009613037, + "learning_rate": 4.9943343144518306e-05, + "loss": 6.1077, + "step": 3605 + }, + { + "epoch": 0.021445903511276048, + "grad_norm": 3.076979160308838, + "learning_rate": 4.994331171094938e-05, + "loss": 6.0474, + "step": 3606 + }, + { + "epoch": 0.021451850794557047, + "grad_norm": 3.482161045074463, + "learning_rate": 4.994328026867301e-05, + "loss": 6.0551, + "step": 3607 + }, + { + "epoch": 0.021457798077838045, + "grad_norm": 3.001046895980835, + "learning_rate": 4.994324881768919e-05, + "loss": 6.0393, + "step": 3608 + }, + { + "epoch": 0.02146374536111904, + "grad_norm": 2.8006365299224854, + "learning_rate": 4.994321735799794e-05, + "loss": 6.0042, + "step": 3609 + }, + { + "epoch": 0.02146969264440004, + "grad_norm": 3.10727858543396, + "learning_rate": 4.994318588959927e-05, + "loss": 5.8981, + "step": 3610 + }, + { + "epoch": 0.021475639927681034, + "grad_norm": 2.660557985305786, + "learning_rate": 4.9943154412493194e-05, + "loss": 6.0426, + "step": 3611 + }, + { + "epoch": 0.021481587210962032, + "grad_norm": 2.8504562377929688, + "learning_rate": 4.994312292667972e-05, + "loss": 6.9774, + "step": 3612 + }, + { + "epoch": 0.02148753449424303, + "grad_norm": 3.0076539516448975, + "learning_rate": 4.994309143215886e-05, + "loss": 6.3238, + "step": 3613 + }, + { + "epoch": 0.021493481777524026, + "grad_norm": 2.2966883182525635, + "learning_rate": 4.9943059928930626e-05, + "loss": 7.0015, + "step": 3614 + }, + { + "epoch": 0.021499429060805025, + "grad_norm": 2.5054080486297607, + "learning_rate": 4.994302841699502e-05, + "loss": 6.9226, + "step": 3615 + }, + { + "epoch": 0.021505376344086023, + "grad_norm": 2.856278657913208, + "learning_rate": 4.9942996896352066e-05, + "loss": 6.7836, + "step": 3616 + }, + { + "epoch": 0.02151132362736702, + "grad_norm": 2.4902377128601074, + "learning_rate": 4.994296536700177e-05, + "loss": 6.7077, + "step": 3617 + }, + { + "epoch": 0.021517270910648017, + "grad_norm": 2.477932929992676, + "learning_rate": 4.994293382894414e-05, + "loss": 6.8284, + "step": 3618 + }, + { + "epoch": 0.021523218193929012, + "grad_norm": 2.3034260272979736, + "learning_rate": 4.994290228217919e-05, + "loss": 6.8012, + "step": 3619 + }, + { + "epoch": 0.02152916547721001, + "grad_norm": 2.3850560188293457, + "learning_rate": 4.9942870726706934e-05, + "loss": 6.6208, + "step": 3620 + }, + { + "epoch": 0.02153511276049101, + "grad_norm": 2.4397644996643066, + "learning_rate": 4.994283916252738e-05, + "loss": 6.7522, + "step": 3621 + }, + { + "epoch": 0.021541060043772004, + "grad_norm": 2.400846242904663, + "learning_rate": 4.994280758964053e-05, + "loss": 6.7529, + "step": 3622 + }, + { + "epoch": 0.021547007327053003, + "grad_norm": 2.358290195465088, + "learning_rate": 4.994277600804641e-05, + "loss": 6.6812, + "step": 3623 + }, + { + "epoch": 0.021552954610333998, + "grad_norm": 2.7409300804138184, + "learning_rate": 4.994274441774503e-05, + "loss": 6.668, + "step": 3624 + }, + { + "epoch": 0.021558901893614996, + "grad_norm": 2.6890954971313477, + "learning_rate": 4.994271281873639e-05, + "loss": 6.5537, + "step": 3625 + }, + { + "epoch": 0.021564849176895995, + "grad_norm": 2.8959596157073975, + "learning_rate": 4.9942681211020505e-05, + "loss": 6.4492, + "step": 3626 + }, + { + "epoch": 0.02157079646017699, + "grad_norm": 2.4325244426727295, + "learning_rate": 4.994264959459738e-05, + "loss": 6.9819, + "step": 3627 + }, + { + "epoch": 0.02157674374345799, + "grad_norm": 2.92891263961792, + "learning_rate": 4.9942617969467045e-05, + "loss": 6.9266, + "step": 3628 + }, + { + "epoch": 0.021582691026738987, + "grad_norm": 2.4398467540740967, + "learning_rate": 4.994258633562951e-05, + "loss": 6.514, + "step": 3629 + }, + { + "epoch": 0.021588638310019982, + "grad_norm": 2.577467203140259, + "learning_rate": 4.9942554693084756e-05, + "loss": 6.7248, + "step": 3630 + }, + { + "epoch": 0.02159458559330098, + "grad_norm": 2.3682591915130615, + "learning_rate": 4.9942523041832824e-05, + "loss": 6.7798, + "step": 3631 + }, + { + "epoch": 0.021600532876581976, + "grad_norm": 2.1863434314727783, + "learning_rate": 4.9942491381873705e-05, + "loss": 6.6636, + "step": 3632 + }, + { + "epoch": 0.021606480159862974, + "grad_norm": 2.0172441005706787, + "learning_rate": 4.9942459713207426e-05, + "loss": 6.6772, + "step": 3633 + }, + { + "epoch": 0.021612427443143973, + "grad_norm": 1.8671952486038208, + "learning_rate": 4.9942428035834e-05, + "loss": 6.3648, + "step": 3634 + }, + { + "epoch": 0.021618374726424968, + "grad_norm": 3.226900815963745, + "learning_rate": 4.9942396349753416e-05, + "loss": 6.4127, + "step": 3635 + }, + { + "epoch": 0.021624322009705967, + "grad_norm": 2.7766973972320557, + "learning_rate": 4.994236465496571e-05, + "loss": 6.4476, + "step": 3636 + }, + { + "epoch": 0.021630269292986965, + "grad_norm": 2.157118082046509, + "learning_rate": 4.9942332951470875e-05, + "loss": 6.5876, + "step": 3637 + }, + { + "epoch": 0.02163621657626796, + "grad_norm": 2.3870396614074707, + "learning_rate": 4.994230123926893e-05, + "loss": 6.5861, + "step": 3638 + }, + { + "epoch": 0.02164216385954896, + "grad_norm": 2.8139939308166504, + "learning_rate": 4.994226951835989e-05, + "loss": 6.4845, + "step": 3639 + }, + { + "epoch": 0.021648111142829954, + "grad_norm": 2.856207847595215, + "learning_rate": 4.9942237788743764e-05, + "loss": 6.1514, + "step": 3640 + }, + { + "epoch": 0.021654058426110952, + "grad_norm": 3.523162603378296, + "learning_rate": 4.9942206050420545e-05, + "loss": 5.8114, + "step": 3641 + }, + { + "epoch": 0.02166000570939195, + "grad_norm": 2.746587038040161, + "learning_rate": 4.9942174303390274e-05, + "loss": 5.7397, + "step": 3642 + }, + { + "epoch": 0.021665952992672946, + "grad_norm": 2.902067184448242, + "learning_rate": 4.9942142547652946e-05, + "loss": 6.4353, + "step": 3643 + }, + { + "epoch": 0.021671900275953945, + "grad_norm": 2.981391191482544, + "learning_rate": 4.994211078320857e-05, + "loss": 6.2153, + "step": 3644 + }, + { + "epoch": 0.021677847559234943, + "grad_norm": 2.6004254817962646, + "learning_rate": 4.994207901005716e-05, + "loss": 6.2365, + "step": 3645 + }, + { + "epoch": 0.021683794842515938, + "grad_norm": 2.748678684234619, + "learning_rate": 4.994204722819873e-05, + "loss": 5.8126, + "step": 3646 + }, + { + "epoch": 0.021689742125796937, + "grad_norm": 2.675466299057007, + "learning_rate": 4.994201543763329e-05, + "loss": 6.3032, + "step": 3647 + }, + { + "epoch": 0.021695689409077932, + "grad_norm": 2.681823253631592, + "learning_rate": 4.9941983638360855e-05, + "loss": 6.2706, + "step": 3648 + }, + { + "epoch": 0.02170163669235893, + "grad_norm": 2.481586217880249, + "learning_rate": 4.994195183038142e-05, + "loss": 6.1792, + "step": 3649 + }, + { + "epoch": 0.02170758397563993, + "grad_norm": 2.3379831314086914, + "learning_rate": 4.9941920013695024e-05, + "loss": 6.2689, + "step": 3650 + }, + { + "epoch": 0.021713531258920924, + "grad_norm": 2.5885238647460938, + "learning_rate": 4.994188818830164e-05, + "loss": 6.3018, + "step": 3651 + }, + { + "epoch": 0.021719478542201923, + "grad_norm": 2.341939687728882, + "learning_rate": 4.994185635420131e-05, + "loss": 5.6178, + "step": 3652 + }, + { + "epoch": 0.021725425825482918, + "grad_norm": 2.4126031398773193, + "learning_rate": 4.9941824511394044e-05, + "loss": 5.4044, + "step": 3653 + }, + { + "epoch": 0.021731373108763916, + "grad_norm": 2.2289719581604004, + "learning_rate": 4.994179265987983e-05, + "loss": 5.4134, + "step": 3654 + }, + { + "epoch": 0.021737320392044915, + "grad_norm": 2.5151331424713135, + "learning_rate": 4.994176079965871e-05, + "loss": 5.3321, + "step": 3655 + }, + { + "epoch": 0.02174326767532591, + "grad_norm": 2.0761523246765137, + "learning_rate": 4.9941728930730665e-05, + "loss": 5.3363, + "step": 3656 + }, + { + "epoch": 0.02174921495860691, + "grad_norm": 2.272510051727295, + "learning_rate": 4.994169705309573e-05, + "loss": 6.0208, + "step": 3657 + }, + { + "epoch": 0.021755162241887907, + "grad_norm": 2.6145198345184326, + "learning_rate": 4.994166516675389e-05, + "loss": 6.299, + "step": 3658 + }, + { + "epoch": 0.021761109525168902, + "grad_norm": 2.978618621826172, + "learning_rate": 4.994163327170519e-05, + "loss": 5.1248, + "step": 3659 + }, + { + "epoch": 0.0217670568084499, + "grad_norm": 2.398813247680664, + "learning_rate": 4.994160136794962e-05, + "loss": 5.1217, + "step": 3660 + }, + { + "epoch": 0.021773004091730896, + "grad_norm": 2.1145291328430176, + "learning_rate": 4.994156945548719e-05, + "loss": 5.2676, + "step": 3661 + }, + { + "epoch": 0.021778951375011894, + "grad_norm": 2.045334577560425, + "learning_rate": 4.9941537534317915e-05, + "loss": 5.2088, + "step": 3662 + }, + { + "epoch": 0.021784898658292893, + "grad_norm": 2.0598506927490234, + "learning_rate": 4.9941505604441806e-05, + "loss": 5.363, + "step": 3663 + }, + { + "epoch": 0.021790845941573888, + "grad_norm": 2.189143657684326, + "learning_rate": 4.9941473665858884e-05, + "loss": 6.0592, + "step": 3664 + }, + { + "epoch": 0.021796793224854887, + "grad_norm": 6.8580780029296875, + "learning_rate": 4.994144171856915e-05, + "loss": 6.0323, + "step": 3665 + }, + { + "epoch": 0.021802740508135885, + "grad_norm": 2.0607001781463623, + "learning_rate": 4.994140976257261e-05, + "loss": 6.0883, + "step": 3666 + }, + { + "epoch": 0.02180868779141688, + "grad_norm": 2.1669631004333496, + "learning_rate": 4.9941377797869284e-05, + "loss": 6.0546, + "step": 3667 + }, + { + "epoch": 0.02181463507469788, + "grad_norm": 2.912822961807251, + "learning_rate": 4.994134582445917e-05, + "loss": 6.0285, + "step": 3668 + }, + { + "epoch": 0.021820582357978874, + "grad_norm": 2.3223111629486084, + "learning_rate": 4.994131384234231e-05, + "loss": 6.0948, + "step": 3669 + }, + { + "epoch": 0.021826529641259872, + "grad_norm": 2.067002296447754, + "learning_rate": 4.994128185151868e-05, + "loss": 6.2908, + "step": 3670 + }, + { + "epoch": 0.02183247692454087, + "grad_norm": 2.593642473220825, + "learning_rate": 4.9941249851988317e-05, + "loss": 6.2878, + "step": 3671 + }, + { + "epoch": 0.021838424207821866, + "grad_norm": 2.6345975399017334, + "learning_rate": 4.994121784375121e-05, + "loss": 6.0796, + "step": 3672 + }, + { + "epoch": 0.021844371491102865, + "grad_norm": 2.398861885070801, + "learning_rate": 4.994118582680739e-05, + "loss": 6.096, + "step": 3673 + }, + { + "epoch": 0.021850318774383863, + "grad_norm": 2.102933883666992, + "learning_rate": 4.994115380115686e-05, + "loss": 6.1347, + "step": 3674 + }, + { + "epoch": 0.021856266057664858, + "grad_norm": 2.43632435798645, + "learning_rate": 4.994112176679963e-05, + "loss": 6.074, + "step": 3675 + }, + { + "epoch": 0.021862213340945857, + "grad_norm": 2.304213523864746, + "learning_rate": 4.9941089723735706e-05, + "loss": 5.8897, + "step": 3676 + }, + { + "epoch": 0.021868160624226852, + "grad_norm": 2.6283092498779297, + "learning_rate": 4.9941057671965106e-05, + "loss": 5.9605, + "step": 3677 + }, + { + "epoch": 0.02187410790750785, + "grad_norm": 2.0781428813934326, + "learning_rate": 4.994102561148785e-05, + "loss": 6.0645, + "step": 3678 + }, + { + "epoch": 0.02188005519078885, + "grad_norm": 2.229210376739502, + "learning_rate": 4.994099354230393e-05, + "loss": 6.223, + "step": 3679 + }, + { + "epoch": 0.021886002474069844, + "grad_norm": 2.4410789012908936, + "learning_rate": 4.9940961464413374e-05, + "loss": 6.1115, + "step": 3680 + }, + { + "epoch": 0.021891949757350843, + "grad_norm": 2.99076771736145, + "learning_rate": 4.994092937781618e-05, + "loss": 5.9028, + "step": 3681 + }, + { + "epoch": 0.021897897040631838, + "grad_norm": 2.8403074741363525, + "learning_rate": 4.994089728251237e-05, + "loss": 5.7286, + "step": 3682 + }, + { + "epoch": 0.021903844323912836, + "grad_norm": 2.0928149223327637, + "learning_rate": 4.994086517850195e-05, + "loss": 5.849, + "step": 3683 + }, + { + "epoch": 0.021909791607193835, + "grad_norm": 2.320279836654663, + "learning_rate": 4.994083306578492e-05, + "loss": 5.6767, + "step": 3684 + }, + { + "epoch": 0.02191573889047483, + "grad_norm": 3.0701658725738525, + "learning_rate": 4.994080094436132e-05, + "loss": 5.9555, + "step": 3685 + }, + { + "epoch": 0.02192168617375583, + "grad_norm": 2.1042048931121826, + "learning_rate": 4.994076881423113e-05, + "loss": 5.7651, + "step": 3686 + }, + { + "epoch": 0.021927633457036827, + "grad_norm": 2.35819673538208, + "learning_rate": 4.9940736675394385e-05, + "loss": 6.0203, + "step": 3687 + }, + { + "epoch": 0.021933580740317822, + "grad_norm": 2.659224510192871, + "learning_rate": 4.994070452785108e-05, + "loss": 5.9935, + "step": 3688 + }, + { + "epoch": 0.02193952802359882, + "grad_norm": 2.4628207683563232, + "learning_rate": 4.994067237160124e-05, + "loss": 5.9135, + "step": 3689 + }, + { + "epoch": 0.021945475306879816, + "grad_norm": 3.7227911949157715, + "learning_rate": 4.9940640206644865e-05, + "loss": 5.8365, + "step": 3690 + }, + { + "epoch": 0.021951422590160814, + "grad_norm": 3.5226151943206787, + "learning_rate": 4.994060803298197e-05, + "loss": 5.7807, + "step": 3691 + }, + { + "epoch": 0.021957369873441813, + "grad_norm": 2.3665735721588135, + "learning_rate": 4.994057585061256e-05, + "loss": 5.9632, + "step": 3692 + }, + { + "epoch": 0.021963317156722808, + "grad_norm": 2.877263069152832, + "learning_rate": 4.9940543659536666e-05, + "loss": 5.6425, + "step": 3693 + }, + { + "epoch": 0.021969264440003806, + "grad_norm": 2.5431532859802246, + "learning_rate": 4.994051145975428e-05, + "loss": 5.6531, + "step": 3694 + }, + { + "epoch": 0.021975211723284805, + "grad_norm": 2.7033538818359375, + "learning_rate": 4.9940479251265415e-05, + "loss": 5.6907, + "step": 3695 + }, + { + "epoch": 0.0219811590065658, + "grad_norm": 3.6627206802368164, + "learning_rate": 4.9940447034070093e-05, + "loss": 5.9118, + "step": 3696 + }, + { + "epoch": 0.0219871062898468, + "grad_norm": 3.896959066390991, + "learning_rate": 4.994041480816831e-05, + "loss": 5.9926, + "step": 3697 + }, + { + "epoch": 0.021993053573127794, + "grad_norm": 3.37575626373291, + "learning_rate": 4.994038257356009e-05, + "loss": 5.9768, + "step": 3698 + }, + { + "epoch": 0.021999000856408792, + "grad_norm": 2.7694313526153564, + "learning_rate": 4.9940350330245444e-05, + "loss": 5.8486, + "step": 3699 + }, + { + "epoch": 0.02200494813968979, + "grad_norm": 2.3815293312072754, + "learning_rate": 4.9940318078224376e-05, + "loss": 6.0663, + "step": 3700 + }, + { + "epoch": 0.022010895422970786, + "grad_norm": 2.3171627521514893, + "learning_rate": 4.99402858174969e-05, + "loss": 5.8543, + "step": 3701 + }, + { + "epoch": 0.022016842706251784, + "grad_norm": 2.5090551376342773, + "learning_rate": 4.994025354806303e-05, + "loss": 5.7005, + "step": 3702 + }, + { + "epoch": 0.022022789989532783, + "grad_norm": 2.7024855613708496, + "learning_rate": 4.9940221269922774e-05, + "loss": 5.7375, + "step": 3703 + }, + { + "epoch": 0.022028737272813778, + "grad_norm": 2.7900679111480713, + "learning_rate": 4.994018898307614e-05, + "loss": 6.0094, + "step": 3704 + }, + { + "epoch": 0.022034684556094777, + "grad_norm": 2.3678438663482666, + "learning_rate": 4.994015668752315e-05, + "loss": 5.822, + "step": 3705 + }, + { + "epoch": 0.022040631839375772, + "grad_norm": 2.5406653881073, + "learning_rate": 4.9940124383263807e-05, + "loss": 5.8984, + "step": 3706 + }, + { + "epoch": 0.02204657912265677, + "grad_norm": 2.371800422668457, + "learning_rate": 4.994009207029813e-05, + "loss": 5.9821, + "step": 3707 + }, + { + "epoch": 0.02205252640593777, + "grad_norm": 2.004669666290283, + "learning_rate": 4.994005974862612e-05, + "loss": 5.8801, + "step": 3708 + }, + { + "epoch": 0.022058473689218764, + "grad_norm": 2.777472972869873, + "learning_rate": 4.9940027418247787e-05, + "loss": 5.8821, + "step": 3709 + }, + { + "epoch": 0.022064420972499763, + "grad_norm": 2.599883556365967, + "learning_rate": 4.9939995079163156e-05, + "loss": 5.8716, + "step": 3710 + }, + { + "epoch": 0.022070368255780758, + "grad_norm": 2.5891127586364746, + "learning_rate": 4.993996273137223e-05, + "loss": 5.7607, + "step": 3711 + }, + { + "epoch": 0.022076315539061756, + "grad_norm": 2.3737518787384033, + "learning_rate": 4.993993037487501e-05, + "loss": 5.7825, + "step": 3712 + }, + { + "epoch": 0.022082262822342755, + "grad_norm": 2.421785831451416, + "learning_rate": 4.9939898009671524e-05, + "loss": 5.7143, + "step": 3713 + }, + { + "epoch": 0.02208821010562375, + "grad_norm": 2.4267804622650146, + "learning_rate": 4.9939865635761785e-05, + "loss": 5.8031, + "step": 3714 + }, + { + "epoch": 0.02209415738890475, + "grad_norm": 2.390333414077759, + "learning_rate": 4.993983325314579e-05, + "loss": 5.7985, + "step": 3715 + }, + { + "epoch": 0.022100104672185747, + "grad_norm": 2.2265970706939697, + "learning_rate": 4.993980086182356e-05, + "loss": 5.6261, + "step": 3716 + }, + { + "epoch": 0.022106051955466742, + "grad_norm": 2.3872458934783936, + "learning_rate": 4.99397684617951e-05, + "loss": 5.8185, + "step": 3717 + }, + { + "epoch": 0.02211199923874774, + "grad_norm": 2.077075958251953, + "learning_rate": 4.9939736053060425e-05, + "loss": 5.6252, + "step": 3718 + }, + { + "epoch": 0.022117946522028736, + "grad_norm": 2.0642287731170654, + "learning_rate": 4.993970363561954e-05, + "loss": 5.8034, + "step": 3719 + }, + { + "epoch": 0.022123893805309734, + "grad_norm": 3.5353951454162598, + "learning_rate": 4.9939671209472474e-05, + "loss": 6.7808, + "step": 3720 + }, + { + "epoch": 0.022129841088590733, + "grad_norm": 2.910531520843506, + "learning_rate": 4.9939638774619216e-05, + "loss": 5.9323, + "step": 3721 + }, + { + "epoch": 0.022135788371871728, + "grad_norm": 2.7450106143951416, + "learning_rate": 4.9939606331059794e-05, + "loss": 5.9926, + "step": 3722 + }, + { + "epoch": 0.022141735655152726, + "grad_norm": 2.7628188133239746, + "learning_rate": 4.993957387879421e-05, + "loss": 5.9129, + "step": 3723 + }, + { + "epoch": 0.022147682938433725, + "grad_norm": 2.6644890308380127, + "learning_rate": 4.9939541417822485e-05, + "loss": 5.7038, + "step": 3724 + }, + { + "epoch": 0.02215363022171472, + "grad_norm": 2.143744707107544, + "learning_rate": 4.993950894814461e-05, + "loss": 5.5821, + "step": 3725 + }, + { + "epoch": 0.02215957750499572, + "grad_norm": 2.1691160202026367, + "learning_rate": 4.993947646976063e-05, + "loss": 5.5929, + "step": 3726 + }, + { + "epoch": 0.022165524788276714, + "grad_norm": 2.1479709148406982, + "learning_rate": 4.993944398267052e-05, + "loss": 5.6653, + "step": 3727 + }, + { + "epoch": 0.022171472071557712, + "grad_norm": 2.7749600410461426, + "learning_rate": 4.993941148687431e-05, + "loss": 5.5682, + "step": 3728 + }, + { + "epoch": 0.02217741935483871, + "grad_norm": 2.668672561645508, + "learning_rate": 4.993937898237201e-05, + "loss": 5.5968, + "step": 3729 + }, + { + "epoch": 0.022183366638119706, + "grad_norm": 2.3903374671936035, + "learning_rate": 4.993934646916364e-05, + "loss": 5.7541, + "step": 3730 + }, + { + "epoch": 0.022189313921400704, + "grad_norm": 1.8555344343185425, + "learning_rate": 4.993931394724919e-05, + "loss": 5.5449, + "step": 3731 + }, + { + "epoch": 0.022195261204681703, + "grad_norm": 2.1140637397766113, + "learning_rate": 4.993928141662869e-05, + "loss": 5.8201, + "step": 3732 + }, + { + "epoch": 0.022201208487962698, + "grad_norm": 2.221573829650879, + "learning_rate": 4.993924887730213e-05, + "loss": 5.7583, + "step": 3733 + }, + { + "epoch": 0.022207155771243697, + "grad_norm": 2.0801634788513184, + "learning_rate": 4.993921632926956e-05, + "loss": 5.7083, + "step": 3734 + }, + { + "epoch": 0.02221310305452469, + "grad_norm": 2.0167016983032227, + "learning_rate": 4.993918377253095e-05, + "loss": 5.7798, + "step": 3735 + }, + { + "epoch": 0.02221905033780569, + "grad_norm": 2.104529619216919, + "learning_rate": 4.993915120708634e-05, + "loss": 5.7346, + "step": 3736 + }, + { + "epoch": 0.02222499762108669, + "grad_norm": 2.0807201862335205, + "learning_rate": 4.993911863293572e-05, + "loss": 5.7663, + "step": 3737 + }, + { + "epoch": 0.022230944904367684, + "grad_norm": 1.9223891496658325, + "learning_rate": 4.9939086050079115e-05, + "loss": 5.648, + "step": 3738 + }, + { + "epoch": 0.022236892187648682, + "grad_norm": 2.3831584453582764, + "learning_rate": 4.9939053458516535e-05, + "loss": 5.7988, + "step": 3739 + }, + { + "epoch": 0.02224283947092968, + "grad_norm": 2.433318853378296, + "learning_rate": 4.993902085824799e-05, + "loss": 5.7794, + "step": 3740 + }, + { + "epoch": 0.022248786754210676, + "grad_norm": 2.2488365173339844, + "learning_rate": 4.993898824927348e-05, + "loss": 5.7332, + "step": 3741 + }, + { + "epoch": 0.022254734037491675, + "grad_norm": 2.2924392223358154, + "learning_rate": 4.993895563159303e-05, + "loss": 5.8977, + "step": 3742 + }, + { + "epoch": 0.02226068132077267, + "grad_norm": 2.1601176261901855, + "learning_rate": 4.9938923005206664e-05, + "loss": 5.8588, + "step": 3743 + }, + { + "epoch": 0.02226662860405367, + "grad_norm": 2.256439447402954, + "learning_rate": 4.993889037011436e-05, + "loss": 5.6111, + "step": 3744 + }, + { + "epoch": 0.022272575887334667, + "grad_norm": 2.184950828552246, + "learning_rate": 4.993885772631615e-05, + "loss": 5.7544, + "step": 3745 + }, + { + "epoch": 0.022278523170615662, + "grad_norm": 2.250422716140747, + "learning_rate": 4.993882507381205e-05, + "loss": 5.6534, + "step": 3746 + }, + { + "epoch": 0.02228447045389666, + "grad_norm": 2.473811626434326, + "learning_rate": 4.9938792412602056e-05, + "loss": 5.5699, + "step": 3747 + }, + { + "epoch": 0.022290417737177656, + "grad_norm": 2.2859978675842285, + "learning_rate": 4.993875974268619e-05, + "loss": 5.8712, + "step": 3748 + }, + { + "epoch": 0.022296365020458654, + "grad_norm": 2.4002318382263184, + "learning_rate": 4.993872706406446e-05, + "loss": 5.8121, + "step": 3749 + }, + { + "epoch": 0.022302312303739653, + "grad_norm": 2.2692153453826904, + "learning_rate": 4.9938694376736884e-05, + "loss": 5.5516, + "step": 3750 + }, + { + "epoch": 0.022308259587020648, + "grad_norm": 2.1874892711639404, + "learning_rate": 4.9938661680703456e-05, + "loss": 5.8264, + "step": 3751 + }, + { + "epoch": 0.022314206870301646, + "grad_norm": 2.3802871704101562, + "learning_rate": 4.993862897596421e-05, + "loss": 5.6523, + "step": 3752 + }, + { + "epoch": 0.022320154153582645, + "grad_norm": 2.514646530151367, + "learning_rate": 4.9938596262519145e-05, + "loss": 5.5193, + "step": 3753 + }, + { + "epoch": 0.02232610143686364, + "grad_norm": 2.3175413608551025, + "learning_rate": 4.993856354036827e-05, + "loss": 5.5372, + "step": 3754 + }, + { + "epoch": 0.02233204872014464, + "grad_norm": 2.2071855068206787, + "learning_rate": 4.9938530809511595e-05, + "loss": 5.5002, + "step": 3755 + }, + { + "epoch": 0.022337996003425634, + "grad_norm": 2.046440839767456, + "learning_rate": 4.9938498069949144e-05, + "loss": 5.585, + "step": 3756 + }, + { + "epoch": 0.022343943286706632, + "grad_norm": 2.3971145153045654, + "learning_rate": 4.9938465321680915e-05, + "loss": 5.7858, + "step": 3757 + }, + { + "epoch": 0.02234989056998763, + "grad_norm": 2.462597131729126, + "learning_rate": 4.9938432564706936e-05, + "loss": 5.5606, + "step": 3758 + }, + { + "epoch": 0.022355837853268626, + "grad_norm": 2.3134138584136963, + "learning_rate": 4.99383997990272e-05, + "loss": 5.4587, + "step": 3759 + }, + { + "epoch": 0.022361785136549624, + "grad_norm": 2.137929916381836, + "learning_rate": 4.993836702464173e-05, + "loss": 5.4768, + "step": 3760 + }, + { + "epoch": 0.022367732419830623, + "grad_norm": 2.647691011428833, + "learning_rate": 4.993833424155053e-05, + "loss": 5.7902, + "step": 3761 + }, + { + "epoch": 0.022373679703111618, + "grad_norm": 2.535640239715576, + "learning_rate": 4.993830144975361e-05, + "loss": 5.8263, + "step": 3762 + }, + { + "epoch": 0.022379626986392617, + "grad_norm": 2.422997236251831, + "learning_rate": 4.9938268649251e-05, + "loss": 5.7751, + "step": 3763 + }, + { + "epoch": 0.02238557426967361, + "grad_norm": 2.6906728744506836, + "learning_rate": 4.9938235840042694e-05, + "loss": 5.5974, + "step": 3764 + }, + { + "epoch": 0.02239152155295461, + "grad_norm": 2.0284483432769775, + "learning_rate": 4.99382030221287e-05, + "loss": 5.6816, + "step": 3765 + }, + { + "epoch": 0.02239746883623561, + "grad_norm": 2.6392064094543457, + "learning_rate": 4.9938170195509035e-05, + "loss": 5.9052, + "step": 3766 + }, + { + "epoch": 0.022403416119516604, + "grad_norm": 2.6770617961883545, + "learning_rate": 4.993813736018372e-05, + "loss": 5.9041, + "step": 3767 + }, + { + "epoch": 0.022409363402797602, + "grad_norm": 2.5972392559051514, + "learning_rate": 4.993810451615276e-05, + "loss": 5.7834, + "step": 3768 + }, + { + "epoch": 0.0224153106860786, + "grad_norm": 2.0095736980438232, + "learning_rate": 4.993807166341616e-05, + "loss": 5.6074, + "step": 3769 + }, + { + "epoch": 0.022421257969359596, + "grad_norm": 2.412578582763672, + "learning_rate": 4.9938038801973945e-05, + "loss": 5.742, + "step": 3770 + }, + { + "epoch": 0.022427205252640595, + "grad_norm": 2.1285388469696045, + "learning_rate": 4.993800593182612e-05, + "loss": 5.7665, + "step": 3771 + }, + { + "epoch": 0.02243315253592159, + "grad_norm": 2.091252326965332, + "learning_rate": 4.993797305297268e-05, + "loss": 5.7165, + "step": 3772 + }, + { + "epoch": 0.022439099819202588, + "grad_norm": 2.5366342067718506, + "learning_rate": 4.993794016541367e-05, + "loss": 6.259, + "step": 3773 + }, + { + "epoch": 0.022445047102483587, + "grad_norm": 2.2637953758239746, + "learning_rate": 4.9937907269149063e-05, + "loss": 6.2132, + "step": 3774 + }, + { + "epoch": 0.022450994385764582, + "grad_norm": 2.570979595184326, + "learning_rate": 4.99378743641789e-05, + "loss": 5.9656, + "step": 3775 + }, + { + "epoch": 0.02245694166904558, + "grad_norm": 2.0587873458862305, + "learning_rate": 4.993784145050319e-05, + "loss": 5.7096, + "step": 3776 + }, + { + "epoch": 0.022462888952326576, + "grad_norm": 2.396812677383423, + "learning_rate": 4.993780852812192e-05, + "loss": 5.7258, + "step": 3777 + }, + { + "epoch": 0.022468836235607574, + "grad_norm": 2.081541061401367, + "learning_rate": 4.993777559703513e-05, + "loss": 5.6777, + "step": 3778 + }, + { + "epoch": 0.022474783518888573, + "grad_norm": 2.5242559909820557, + "learning_rate": 4.993774265724281e-05, + "loss": 5.961, + "step": 3779 + }, + { + "epoch": 0.022480730802169568, + "grad_norm": 2.4249329566955566, + "learning_rate": 4.993770970874499e-05, + "loss": 6.0494, + "step": 3780 + }, + { + "epoch": 0.022486678085450566, + "grad_norm": 2.7482552528381348, + "learning_rate": 4.993767675154169e-05, + "loss": 5.7579, + "step": 3781 + }, + { + "epoch": 0.022492625368731565, + "grad_norm": 4.115204811096191, + "learning_rate": 4.993764378563288e-05, + "loss": 6.3891, + "step": 3782 + }, + { + "epoch": 0.02249857265201256, + "grad_norm": 2.51346755027771, + "learning_rate": 4.99376108110186e-05, + "loss": 5.7982, + "step": 3783 + }, + { + "epoch": 0.02250451993529356, + "grad_norm": 2.2737278938293457, + "learning_rate": 4.993757782769887e-05, + "loss": 5.7576, + "step": 3784 + }, + { + "epoch": 0.022510467218574554, + "grad_norm": 2.2068402767181396, + "learning_rate": 4.9937544835673674e-05, + "loss": 5.9801, + "step": 3785 + }, + { + "epoch": 0.022516414501855552, + "grad_norm": 1.8548356294631958, + "learning_rate": 4.993751183494305e-05, + "loss": 6.2054, + "step": 3786 + }, + { + "epoch": 0.02252236178513655, + "grad_norm": 2.3499045372009277, + "learning_rate": 4.993747882550699e-05, + "loss": 6.0694, + "step": 3787 + }, + { + "epoch": 0.022528309068417546, + "grad_norm": 2.2253386974334717, + "learning_rate": 4.993744580736552e-05, + "loss": 5.709, + "step": 3788 + }, + { + "epoch": 0.022534256351698544, + "grad_norm": 2.1136696338653564, + "learning_rate": 4.993741278051864e-05, + "loss": 5.9546, + "step": 3789 + }, + { + "epoch": 0.022540203634979543, + "grad_norm": 1.8777605295181274, + "learning_rate": 4.9937379744966375e-05, + "loss": 5.7587, + "step": 3790 + }, + { + "epoch": 0.022546150918260538, + "grad_norm": 2.527571201324463, + "learning_rate": 4.9937346700708723e-05, + "loss": 5.0992, + "step": 3791 + }, + { + "epoch": 0.022552098201541537, + "grad_norm": 2.515805244445801, + "learning_rate": 4.99373136477457e-05, + "loss": 4.9766, + "step": 3792 + }, + { + "epoch": 0.02255804548482253, + "grad_norm": 2.442979574203491, + "learning_rate": 4.9937280586077315e-05, + "loss": 5.0981, + "step": 3793 + }, + { + "epoch": 0.02256399276810353, + "grad_norm": 2.575383424758911, + "learning_rate": 4.993724751570359e-05, + "loss": 5.0809, + "step": 3794 + }, + { + "epoch": 0.02256994005138453, + "grad_norm": 2.0855023860931396, + "learning_rate": 4.9937214436624524e-05, + "loss": 5.5744, + "step": 3795 + }, + { + "epoch": 0.022575887334665524, + "grad_norm": 2.237565040588379, + "learning_rate": 4.993718134884013e-05, + "loss": 5.6796, + "step": 3796 + }, + { + "epoch": 0.022581834617946522, + "grad_norm": 2.5895159244537354, + "learning_rate": 4.993714825235044e-05, + "loss": 5.2068, + "step": 3797 + }, + { + "epoch": 0.02258778190122752, + "grad_norm": 2.1277096271514893, + "learning_rate": 4.993711514715544e-05, + "loss": 5.5588, + "step": 3798 + }, + { + "epoch": 0.022593729184508516, + "grad_norm": 2.7074246406555176, + "learning_rate": 4.993708203325515e-05, + "loss": 5.0104, + "step": 3799 + }, + { + "epoch": 0.022599676467789515, + "grad_norm": 2.114569664001465, + "learning_rate": 4.993704891064958e-05, + "loss": 5.0453, + "step": 3800 + }, + { + "epoch": 0.02260562375107051, + "grad_norm": 2.4222404956817627, + "learning_rate": 4.9937015779338746e-05, + "loss": 5.3799, + "step": 3801 + }, + { + "epoch": 0.022611571034351508, + "grad_norm": 2.238755941390991, + "learning_rate": 4.993698263932266e-05, + "loss": 5.0075, + "step": 3802 + }, + { + "epoch": 0.022617518317632507, + "grad_norm": 2.0748255252838135, + "learning_rate": 4.993694949060133e-05, + "loss": 5.0007, + "step": 3803 + }, + { + "epoch": 0.022623465600913502, + "grad_norm": 2.1528635025024414, + "learning_rate": 4.993691633317477e-05, + "loss": 5.1048, + "step": 3804 + }, + { + "epoch": 0.0226294128841945, + "grad_norm": 2.0237200260162354, + "learning_rate": 4.993688316704298e-05, + "loss": 5.1465, + "step": 3805 + }, + { + "epoch": 0.022635360167475495, + "grad_norm": 2.2698304653167725, + "learning_rate": 4.993684999220599e-05, + "loss": 4.9642, + "step": 3806 + }, + { + "epoch": 0.022641307450756494, + "grad_norm": 2.7863757610321045, + "learning_rate": 4.993681680866381e-05, + "loss": 5.6277, + "step": 3807 + }, + { + "epoch": 0.022647254734037493, + "grad_norm": 2.394087553024292, + "learning_rate": 4.9936783616416436e-05, + "loss": 6.0895, + "step": 3808 + }, + { + "epoch": 0.022653202017318488, + "grad_norm": 2.8036317825317383, + "learning_rate": 4.993675041546389e-05, + "loss": 6.2002, + "step": 3809 + }, + { + "epoch": 0.022659149300599486, + "grad_norm": 2.4970054626464844, + "learning_rate": 4.993671720580618e-05, + "loss": 5.5114, + "step": 3810 + }, + { + "epoch": 0.022665096583880485, + "grad_norm": 3.2434241771698, + "learning_rate": 4.993668398744332e-05, + "loss": 5.0366, + "step": 3811 + }, + { + "epoch": 0.02267104386716148, + "grad_norm": 2.707104206085205, + "learning_rate": 4.9936650760375326e-05, + "loss": 5.5132, + "step": 3812 + }, + { + "epoch": 0.02267699115044248, + "grad_norm": 2.540231466293335, + "learning_rate": 4.9936617524602204e-05, + "loss": 5.8026, + "step": 3813 + }, + { + "epoch": 0.022682938433723474, + "grad_norm": 2.8549184799194336, + "learning_rate": 4.993658428012397e-05, + "loss": 6.0854, + "step": 3814 + }, + { + "epoch": 0.022688885717004472, + "grad_norm": 2.5972952842712402, + "learning_rate": 4.993655102694062e-05, + "loss": 5.8055, + "step": 3815 + }, + { + "epoch": 0.02269483300028547, + "grad_norm": 3.1625113487243652, + "learning_rate": 4.9936517765052184e-05, + "loss": 5.9683, + "step": 3816 + }, + { + "epoch": 0.022700780283566466, + "grad_norm": 3.239820718765259, + "learning_rate": 4.993648449445867e-05, + "loss": 5.9725, + "step": 3817 + }, + { + "epoch": 0.022706727566847464, + "grad_norm": 2.9632809162139893, + "learning_rate": 4.993645121516008e-05, + "loss": 5.9767, + "step": 3818 + }, + { + "epoch": 0.022712674850128463, + "grad_norm": 2.7486021518707275, + "learning_rate": 4.9936417927156435e-05, + "loss": 6.3471, + "step": 3819 + }, + { + "epoch": 0.022718622133409458, + "grad_norm": 3.8044490814208984, + "learning_rate": 4.993638463044775e-05, + "loss": 6.1275, + "step": 3820 + }, + { + "epoch": 0.022724569416690456, + "grad_norm": 4.851193428039551, + "learning_rate": 4.9936351325034024e-05, + "loss": 5.6658, + "step": 3821 + }, + { + "epoch": 0.02273051669997145, + "grad_norm": 3.1302716732025146, + "learning_rate": 4.993631801091528e-05, + "loss": 5.5256, + "step": 3822 + }, + { + "epoch": 0.02273646398325245, + "grad_norm": 5.310885906219482, + "learning_rate": 4.9936284688091526e-05, + "loss": 5.4771, + "step": 3823 + }, + { + "epoch": 0.02274241126653345, + "grad_norm": 5.493198394775391, + "learning_rate": 4.9936251356562765e-05, + "loss": 6.0993, + "step": 3824 + }, + { + "epoch": 0.022748358549814444, + "grad_norm": 3.5346286296844482, + "learning_rate": 4.993621801632902e-05, + "loss": 6.6862, + "step": 3825 + }, + { + "epoch": 0.022754305833095442, + "grad_norm": 4.550736904144287, + "learning_rate": 4.9936184667390304e-05, + "loss": 6.5658, + "step": 3826 + }, + { + "epoch": 0.02276025311637644, + "grad_norm": 3.3957576751708984, + "learning_rate": 4.993615130974662e-05, + "loss": 6.0596, + "step": 3827 + }, + { + "epoch": 0.022766200399657436, + "grad_norm": 2.614089012145996, + "learning_rate": 4.993611794339798e-05, + "loss": 6.77, + "step": 3828 + }, + { + "epoch": 0.022772147682938434, + "grad_norm": 3.712106704711914, + "learning_rate": 4.99360845683444e-05, + "loss": 6.4084, + "step": 3829 + }, + { + "epoch": 0.02277809496621943, + "grad_norm": 3.7331995964050293, + "learning_rate": 4.99360511845859e-05, + "loss": 6.2627, + "step": 3830 + }, + { + "epoch": 0.022784042249500428, + "grad_norm": 3.8898067474365234, + "learning_rate": 4.993601779212247e-05, + "loss": 6.6476, + "step": 3831 + }, + { + "epoch": 0.022789989532781427, + "grad_norm": 2.829078435897827, + "learning_rate": 4.9935984390954136e-05, + "loss": 6.2307, + "step": 3832 + }, + { + "epoch": 0.022795936816062422, + "grad_norm": 3.467954635620117, + "learning_rate": 4.9935950981080906e-05, + "loss": 6.5283, + "step": 3833 + }, + { + "epoch": 0.02280188409934342, + "grad_norm": 2.317840099334717, + "learning_rate": 4.99359175625028e-05, + "loss": 6.4549, + "step": 3834 + }, + { + "epoch": 0.02280783138262442, + "grad_norm": 2.7261998653411865, + "learning_rate": 4.9935884135219825e-05, + "loss": 6.2049, + "step": 3835 + }, + { + "epoch": 0.022813778665905414, + "grad_norm": 2.623098373413086, + "learning_rate": 4.993585069923198e-05, + "loss": 6.3847, + "step": 3836 + }, + { + "epoch": 0.022819725949186413, + "grad_norm": 2.4825377464294434, + "learning_rate": 4.993581725453929e-05, + "loss": 6.3532, + "step": 3837 + }, + { + "epoch": 0.022825673232467408, + "grad_norm": 2.278151750564575, + "learning_rate": 4.993578380114176e-05, + "loss": 5.8885, + "step": 3838 + }, + { + "epoch": 0.022831620515748406, + "grad_norm": 2.045839548110962, + "learning_rate": 4.9935750339039425e-05, + "loss": 6.6852, + "step": 3839 + }, + { + "epoch": 0.022837567799029405, + "grad_norm": 2.4009597301483154, + "learning_rate": 4.993571686823226e-05, + "loss": 6.1676, + "step": 3840 + }, + { + "epoch": 0.0228435150823104, + "grad_norm": 2.759819507598877, + "learning_rate": 4.9935683388720296e-05, + "loss": 6.3913, + "step": 3841 + }, + { + "epoch": 0.0228494623655914, + "grad_norm": 2.798785924911499, + "learning_rate": 4.9935649900503546e-05, + "loss": 6.8169, + "step": 3842 + }, + { + "epoch": 0.022855409648872393, + "grad_norm": 2.389890432357788, + "learning_rate": 4.9935616403582015e-05, + "loss": 6.7506, + "step": 3843 + }, + { + "epoch": 0.022861356932153392, + "grad_norm": 2.882474184036255, + "learning_rate": 4.9935582897955715e-05, + "loss": 6.2458, + "step": 3844 + }, + { + "epoch": 0.02286730421543439, + "grad_norm": 2.2487478256225586, + "learning_rate": 4.993554938362467e-05, + "loss": 6.7296, + "step": 3845 + }, + { + "epoch": 0.022873251498715386, + "grad_norm": 1.9563521146774292, + "learning_rate": 4.993551586058888e-05, + "loss": 6.6878, + "step": 3846 + }, + { + "epoch": 0.022879198781996384, + "grad_norm": 7.555780410766602, + "learning_rate": 4.993548232884835e-05, + "loss": 6.3309, + "step": 3847 + }, + { + "epoch": 0.022885146065277383, + "grad_norm": 2.2573931217193604, + "learning_rate": 4.99354487884031e-05, + "loss": 6.3384, + "step": 3848 + }, + { + "epoch": 0.022891093348558378, + "grad_norm": 2.063267946243286, + "learning_rate": 4.993541523925316e-05, + "loss": 6.2342, + "step": 3849 + }, + { + "epoch": 0.022897040631839376, + "grad_norm": 2.1032445430755615, + "learning_rate": 4.9935381681398505e-05, + "loss": 6.5458, + "step": 3850 + }, + { + "epoch": 0.02290298791512037, + "grad_norm": 2.233400583267212, + "learning_rate": 4.9935348114839176e-05, + "loss": 6.46, + "step": 3851 + }, + { + "epoch": 0.02290893519840137, + "grad_norm": 2.069182872772217, + "learning_rate": 4.9935314539575174e-05, + "loss": 6.4829, + "step": 3852 + }, + { + "epoch": 0.02291488248168237, + "grad_norm": 1.9986059665679932, + "learning_rate": 4.993528095560651e-05, + "loss": 6.4651, + "step": 3853 + }, + { + "epoch": 0.022920829764963364, + "grad_norm": 2.0529284477233887, + "learning_rate": 4.99352473629332e-05, + "loss": 6.1151, + "step": 3854 + }, + { + "epoch": 0.022926777048244362, + "grad_norm": 1.9643630981445312, + "learning_rate": 4.993521376155525e-05, + "loss": 5.991, + "step": 3855 + }, + { + "epoch": 0.02293272433152536, + "grad_norm": 2.2183501720428467, + "learning_rate": 4.9935180151472674e-05, + "loss": 6.8568, + "step": 3856 + }, + { + "epoch": 0.022938671614806356, + "grad_norm": 2.2095682621002197, + "learning_rate": 4.993514653268548e-05, + "loss": 6.8145, + "step": 3857 + }, + { + "epoch": 0.022944618898087354, + "grad_norm": 2.194451332092285, + "learning_rate": 4.9935112905193694e-05, + "loss": 6.4781, + "step": 3858 + }, + { + "epoch": 0.02295056618136835, + "grad_norm": 2.2242066860198975, + "learning_rate": 4.9935079268997306e-05, + "loss": 6.0535, + "step": 3859 + }, + { + "epoch": 0.022956513464649348, + "grad_norm": 2.336190938949585, + "learning_rate": 4.9935045624096354e-05, + "loss": 6.2453, + "step": 3860 + }, + { + "epoch": 0.022962460747930347, + "grad_norm": 1.9997279644012451, + "learning_rate": 4.9935011970490824e-05, + "loss": 6.3852, + "step": 3861 + }, + { + "epoch": 0.02296840803121134, + "grad_norm": 2.9107778072357178, + "learning_rate": 4.993497830818074e-05, + "loss": 6.0891, + "step": 3862 + }, + { + "epoch": 0.02297435531449234, + "grad_norm": 2.1357171535491943, + "learning_rate": 4.993494463716612e-05, + "loss": 6.5111, + "step": 3863 + }, + { + "epoch": 0.02298030259777334, + "grad_norm": 2.0228497982025146, + "learning_rate": 4.9934910957446954e-05, + "loss": 6.6009, + "step": 3864 + }, + { + "epoch": 0.022986249881054334, + "grad_norm": 2.8057942390441895, + "learning_rate": 4.993487726902328e-05, + "loss": 6.414, + "step": 3865 + }, + { + "epoch": 0.022992197164335332, + "grad_norm": 3.0660998821258545, + "learning_rate": 4.99348435718951e-05, + "loss": 6.3673, + "step": 3866 + }, + { + "epoch": 0.022998144447616328, + "grad_norm": 2.2440497875213623, + "learning_rate": 4.9934809866062416e-05, + "loss": 6.1793, + "step": 3867 + }, + { + "epoch": 0.023004091730897326, + "grad_norm": 2.342358350753784, + "learning_rate": 4.993477615152525e-05, + "loss": 6.5279, + "step": 3868 + }, + { + "epoch": 0.023010039014178325, + "grad_norm": 1.9231956005096436, + "learning_rate": 4.993474242828361e-05, + "loss": 6.4975, + "step": 3869 + }, + { + "epoch": 0.02301598629745932, + "grad_norm": 2.503028631210327, + "learning_rate": 4.9934708696337516e-05, + "loss": 6.5261, + "step": 3870 + }, + { + "epoch": 0.02302193358074032, + "grad_norm": 2.2343928813934326, + "learning_rate": 4.993467495568697e-05, + "loss": 6.0525, + "step": 3871 + }, + { + "epoch": 0.023027880864021313, + "grad_norm": 2.851964235305786, + "learning_rate": 4.993464120633198e-05, + "loss": 6.1271, + "step": 3872 + }, + { + "epoch": 0.023033828147302312, + "grad_norm": 2.580017328262329, + "learning_rate": 4.993460744827257e-05, + "loss": 6.2018, + "step": 3873 + }, + { + "epoch": 0.02303977543058331, + "grad_norm": 2.227879047393799, + "learning_rate": 4.9934573681508744e-05, + "loss": 6.0177, + "step": 3874 + }, + { + "epoch": 0.023045722713864306, + "grad_norm": 2.696531295776367, + "learning_rate": 4.993453990604051e-05, + "loss": 6.627, + "step": 3875 + }, + { + "epoch": 0.023051669997145304, + "grad_norm": 2.3439393043518066, + "learning_rate": 4.99345061218679e-05, + "loss": 6.5388, + "step": 3876 + }, + { + "epoch": 0.023057617280426303, + "grad_norm": 2.5400748252868652, + "learning_rate": 4.99344723289909e-05, + "loss": 5.9162, + "step": 3877 + }, + { + "epoch": 0.023063564563707298, + "grad_norm": 2.658193588256836, + "learning_rate": 4.9934438527409535e-05, + "loss": 5.6645, + "step": 3878 + }, + { + "epoch": 0.023069511846988296, + "grad_norm": 2.3102848529815674, + "learning_rate": 4.9934404717123814e-05, + "loss": 5.9969, + "step": 3879 + }, + { + "epoch": 0.02307545913026929, + "grad_norm": 2.6107916831970215, + "learning_rate": 4.993437089813376e-05, + "loss": 6.1776, + "step": 3880 + }, + { + "epoch": 0.02308140641355029, + "grad_norm": 2.6275434494018555, + "learning_rate": 4.993433707043937e-05, + "loss": 6.2563, + "step": 3881 + }, + { + "epoch": 0.02308735369683129, + "grad_norm": 2.8595218658447266, + "learning_rate": 4.993430323404066e-05, + "loss": 5.9371, + "step": 3882 + }, + { + "epoch": 0.023093300980112284, + "grad_norm": 2.2947659492492676, + "learning_rate": 4.993426938893764e-05, + "loss": 5.7263, + "step": 3883 + }, + { + "epoch": 0.023099248263393282, + "grad_norm": 3.3769729137420654, + "learning_rate": 4.9934235535130326e-05, + "loss": 6.2706, + "step": 3884 + }, + { + "epoch": 0.02310519554667428, + "grad_norm": 2.792043447494507, + "learning_rate": 4.9934201672618716e-05, + "loss": 5.9264, + "step": 3885 + }, + { + "epoch": 0.023111142829955276, + "grad_norm": 2.592167615890503, + "learning_rate": 4.993416780140285e-05, + "loss": 6.4031, + "step": 3886 + }, + { + "epoch": 0.023117090113236274, + "grad_norm": 2.429898977279663, + "learning_rate": 4.9934133921482716e-05, + "loss": 6.4609, + "step": 3887 + }, + { + "epoch": 0.02312303739651727, + "grad_norm": 2.1771554946899414, + "learning_rate": 4.993410003285834e-05, + "loss": 6.2873, + "step": 3888 + }, + { + "epoch": 0.023128984679798268, + "grad_norm": 2.7799339294433594, + "learning_rate": 4.9934066135529724e-05, + "loss": 5.7405, + "step": 3889 + }, + { + "epoch": 0.023134931963079267, + "grad_norm": 2.626492977142334, + "learning_rate": 4.993403222949688e-05, + "loss": 5.783, + "step": 3890 + }, + { + "epoch": 0.02314087924636026, + "grad_norm": 2.837663412094116, + "learning_rate": 4.993399831475982e-05, + "loss": 5.8039, + "step": 3891 + }, + { + "epoch": 0.02314682652964126, + "grad_norm": 2.68230938911438, + "learning_rate": 4.9933964391318564e-05, + "loss": 5.6587, + "step": 3892 + }, + { + "epoch": 0.02315277381292226, + "grad_norm": 3.2064061164855957, + "learning_rate": 4.993393045917312e-05, + "loss": 5.9516, + "step": 3893 + }, + { + "epoch": 0.023158721096203254, + "grad_norm": 3.5179402828216553, + "learning_rate": 4.99338965183235e-05, + "loss": 5.7925, + "step": 3894 + }, + { + "epoch": 0.023164668379484252, + "grad_norm": 2.9261434078216553, + "learning_rate": 4.993386256876971e-05, + "loss": 5.8677, + "step": 3895 + }, + { + "epoch": 0.023170615662765248, + "grad_norm": 3.092033624649048, + "learning_rate": 4.9933828610511766e-05, + "loss": 5.6248, + "step": 3896 + }, + { + "epoch": 0.023176562946046246, + "grad_norm": 2.7650182247161865, + "learning_rate": 4.9933794643549683e-05, + "loss": 5.7371, + "step": 3897 + }, + { + "epoch": 0.023182510229327245, + "grad_norm": 2.402839422225952, + "learning_rate": 4.993376066788347e-05, + "loss": 5.4802, + "step": 3898 + }, + { + "epoch": 0.02318845751260824, + "grad_norm": 2.606062889099121, + "learning_rate": 4.993372668351314e-05, + "loss": 5.5766, + "step": 3899 + }, + { + "epoch": 0.023194404795889238, + "grad_norm": 2.2177329063415527, + "learning_rate": 4.99336926904387e-05, + "loss": 5.5744, + "step": 3900 + }, + { + "epoch": 0.023200352079170233, + "grad_norm": 2.6953063011169434, + "learning_rate": 4.9933658688660166e-05, + "loss": 5.6414, + "step": 3901 + }, + { + "epoch": 0.023206299362451232, + "grad_norm": 2.90512752532959, + "learning_rate": 4.993362467817755e-05, + "loss": 5.5445, + "step": 3902 + }, + { + "epoch": 0.02321224664573223, + "grad_norm": 3.724168062210083, + "learning_rate": 4.993359065899086e-05, + "loss": 5.7733, + "step": 3903 + }, + { + "epoch": 0.023218193929013226, + "grad_norm": 2.9355592727661133, + "learning_rate": 4.993355663110012e-05, + "loss": 5.579, + "step": 3904 + }, + { + "epoch": 0.023224141212294224, + "grad_norm": 2.7822163105010986, + "learning_rate": 4.993352259450532e-05, + "loss": 5.5105, + "step": 3905 + }, + { + "epoch": 0.023230088495575223, + "grad_norm": 3.672539710998535, + "learning_rate": 4.99334885492065e-05, + "loss": 6.3865, + "step": 3906 + }, + { + "epoch": 0.023236035778856218, + "grad_norm": 2.26755952835083, + "learning_rate": 4.993345449520364e-05, + "loss": 5.5472, + "step": 3907 + }, + { + "epoch": 0.023241983062137216, + "grad_norm": 2.8935770988464355, + "learning_rate": 4.993342043249678e-05, + "loss": 5.5948, + "step": 3908 + }, + { + "epoch": 0.02324793034541821, + "grad_norm": 3.077798366546631, + "learning_rate": 4.9933386361085924e-05, + "loss": 5.288, + "step": 3909 + }, + { + "epoch": 0.02325387762869921, + "grad_norm": 2.479198694229126, + "learning_rate": 4.993335228097107e-05, + "loss": 5.3743, + "step": 3910 + }, + { + "epoch": 0.02325982491198021, + "grad_norm": 2.429049015045166, + "learning_rate": 4.9933318192152244e-05, + "loss": 5.6709, + "step": 3911 + }, + { + "epoch": 0.023265772195261204, + "grad_norm": 2.4515016078948975, + "learning_rate": 4.993328409462945e-05, + "loss": 5.4946, + "step": 3912 + }, + { + "epoch": 0.023271719478542202, + "grad_norm": 2.3859386444091797, + "learning_rate": 4.993324998840271e-05, + "loss": 5.5947, + "step": 3913 + }, + { + "epoch": 0.0232776667618232, + "grad_norm": 2.746438503265381, + "learning_rate": 4.993321587347203e-05, + "loss": 5.6743, + "step": 3914 + }, + { + "epoch": 0.023283614045104196, + "grad_norm": 2.416118621826172, + "learning_rate": 4.993318174983742e-05, + "loss": 5.7073, + "step": 3915 + }, + { + "epoch": 0.023289561328385194, + "grad_norm": 2.3427727222442627, + "learning_rate": 4.99331476174989e-05, + "loss": 5.5933, + "step": 3916 + }, + { + "epoch": 0.02329550861166619, + "grad_norm": 2.2179009914398193, + "learning_rate": 4.993311347645647e-05, + "loss": 5.7726, + "step": 3917 + }, + { + "epoch": 0.023301455894947188, + "grad_norm": 2.732923984527588, + "learning_rate": 4.993307932671014e-05, + "loss": 5.5783, + "step": 3918 + }, + { + "epoch": 0.023307403178228187, + "grad_norm": 2.5090553760528564, + "learning_rate": 4.993304516825994e-05, + "loss": 5.6598, + "step": 3919 + }, + { + "epoch": 0.02331335046150918, + "grad_norm": 2.690276622772217, + "learning_rate": 4.993301100110587e-05, + "loss": 5.9688, + "step": 3920 + }, + { + "epoch": 0.02331929774479018, + "grad_norm": 2.559215784072876, + "learning_rate": 4.993297682524794e-05, + "loss": 6.3315, + "step": 3921 + }, + { + "epoch": 0.02332524502807118, + "grad_norm": 2.2800240516662598, + "learning_rate": 4.993294264068617e-05, + "loss": 6.2787, + "step": 3922 + }, + { + "epoch": 0.023331192311352174, + "grad_norm": 2.478898525238037, + "learning_rate": 4.993290844742057e-05, + "loss": 6.1145, + "step": 3923 + }, + { + "epoch": 0.023337139594633172, + "grad_norm": 2.4902184009552, + "learning_rate": 4.993287424545115e-05, + "loss": 6.0665, + "step": 3924 + }, + { + "epoch": 0.023343086877914167, + "grad_norm": 2.4157116413116455, + "learning_rate": 4.9932840034777906e-05, + "loss": 6.1697, + "step": 3925 + }, + { + "epoch": 0.023349034161195166, + "grad_norm": 2.340575933456421, + "learning_rate": 4.993280581540087e-05, + "loss": 6.1121, + "step": 3926 + }, + { + "epoch": 0.023354981444476165, + "grad_norm": 2.586881160736084, + "learning_rate": 4.993277158732006e-05, + "loss": 6.1792, + "step": 3927 + }, + { + "epoch": 0.02336092872775716, + "grad_norm": 2.448880910873413, + "learning_rate": 4.9932737350535476e-05, + "loss": 6.084, + "step": 3928 + }, + { + "epoch": 0.023366876011038158, + "grad_norm": 2.525082588195801, + "learning_rate": 4.993270310504712e-05, + "loss": 5.6726, + "step": 3929 + }, + { + "epoch": 0.023372823294319153, + "grad_norm": 2.310445547103882, + "learning_rate": 4.993266885085503e-05, + "loss": 5.9496, + "step": 3930 + }, + { + "epoch": 0.023378770577600152, + "grad_norm": 2.275416612625122, + "learning_rate": 4.993263458795918e-05, + "loss": 6.0042, + "step": 3931 + }, + { + "epoch": 0.02338471786088115, + "grad_norm": 2.481973648071289, + "learning_rate": 4.993260031635963e-05, + "loss": 5.6177, + "step": 3932 + }, + { + "epoch": 0.023390665144162145, + "grad_norm": 2.439544677734375, + "learning_rate": 4.993256603605635e-05, + "loss": 5.9745, + "step": 3933 + }, + { + "epoch": 0.023396612427443144, + "grad_norm": 2.1909360885620117, + "learning_rate": 4.993253174704937e-05, + "loss": 5.9966, + "step": 3934 + }, + { + "epoch": 0.023402559710724143, + "grad_norm": 2.1893911361694336, + "learning_rate": 4.993249744933871e-05, + "loss": 6.0643, + "step": 3935 + }, + { + "epoch": 0.023408506994005138, + "grad_norm": 3.2023842334747314, + "learning_rate": 4.993246314292437e-05, + "loss": 6.2284, + "step": 3936 + }, + { + "epoch": 0.023414454277286136, + "grad_norm": 2.980842113494873, + "learning_rate": 4.9932428827806356e-05, + "loss": 6.2359, + "step": 3937 + }, + { + "epoch": 0.02342040156056713, + "grad_norm": 2.6659433841705322, + "learning_rate": 4.99323945039847e-05, + "loss": 6.2901, + "step": 3938 + }, + { + "epoch": 0.02342634884384813, + "grad_norm": 2.2173492908477783, + "learning_rate": 4.993236017145939e-05, + "loss": 5.8157, + "step": 3939 + }, + { + "epoch": 0.02343229612712913, + "grad_norm": 2.592771530151367, + "learning_rate": 4.993232583023046e-05, + "loss": 5.7747, + "step": 3940 + }, + { + "epoch": 0.023438243410410124, + "grad_norm": 2.328951835632324, + "learning_rate": 4.9932291480297915e-05, + "loss": 5.7367, + "step": 3941 + }, + { + "epoch": 0.023444190693691122, + "grad_norm": 2.3135616779327393, + "learning_rate": 4.993225712166176e-05, + "loss": 6.0592, + "step": 3942 + }, + { + "epoch": 0.02345013797697212, + "grad_norm": 2.49661922454834, + "learning_rate": 4.993222275432201e-05, + "loss": 5.9737, + "step": 3943 + }, + { + "epoch": 0.023456085260253116, + "grad_norm": 2.6462106704711914, + "learning_rate": 4.9932188378278683e-05, + "loss": 5.7053, + "step": 3944 + }, + { + "epoch": 0.023462032543534114, + "grad_norm": 2.102663516998291, + "learning_rate": 4.993215399353178e-05, + "loss": 5.9006, + "step": 3945 + }, + { + "epoch": 0.02346797982681511, + "grad_norm": 2.474500894546509, + "learning_rate": 4.9932119600081326e-05, + "loss": 6.092, + "step": 3946 + }, + { + "epoch": 0.023473927110096108, + "grad_norm": 2.6023428440093994, + "learning_rate": 4.993208519792732e-05, + "loss": 5.9045, + "step": 3947 + }, + { + "epoch": 0.023479874393377106, + "grad_norm": 2.76432466506958, + "learning_rate": 4.99320507870698e-05, + "loss": 5.8178, + "step": 3948 + }, + { + "epoch": 0.0234858216766581, + "grad_norm": 2.250816822052002, + "learning_rate": 4.993201636750874e-05, + "loss": 5.9091, + "step": 3949 + }, + { + "epoch": 0.0234917689599391, + "grad_norm": 2.1984071731567383, + "learning_rate": 4.993198193924417e-05, + "loss": 5.8804, + "step": 3950 + }, + { + "epoch": 0.0234977162432201, + "grad_norm": 2.5217959880828857, + "learning_rate": 4.993194750227611e-05, + "loss": 5.9879, + "step": 3951 + }, + { + "epoch": 0.023503663526501094, + "grad_norm": 2.080110788345337, + "learning_rate": 4.993191305660456e-05, + "loss": 5.6352, + "step": 3952 + }, + { + "epoch": 0.023509610809782092, + "grad_norm": 2.637500286102295, + "learning_rate": 4.9931878602229545e-05, + "loss": 5.7924, + "step": 3953 + }, + { + "epoch": 0.023515558093063087, + "grad_norm": 2.660531759262085, + "learning_rate": 4.9931844139151056e-05, + "loss": 6.1936, + "step": 3954 + }, + { + "epoch": 0.023521505376344086, + "grad_norm": 2.423699378967285, + "learning_rate": 4.993180966736913e-05, + "loss": 5.8974, + "step": 3955 + }, + { + "epoch": 0.023527452659625085, + "grad_norm": 2.581876277923584, + "learning_rate": 4.993177518688375e-05, + "loss": 5.833, + "step": 3956 + }, + { + "epoch": 0.02353339994290608, + "grad_norm": 2.586538076400757, + "learning_rate": 4.9931740697694965e-05, + "loss": 5.9649, + "step": 3957 + }, + { + "epoch": 0.023539347226187078, + "grad_norm": 2.5123441219329834, + "learning_rate": 4.993170619980276e-05, + "loss": 6.1251, + "step": 3958 + }, + { + "epoch": 0.023545294509468077, + "grad_norm": 3.076904535293579, + "learning_rate": 4.993167169320715e-05, + "loss": 5.9559, + "step": 3959 + }, + { + "epoch": 0.023551241792749072, + "grad_norm": 2.572312593460083, + "learning_rate": 4.9931637177908153e-05, + "loss": 6.0291, + "step": 3960 + }, + { + "epoch": 0.02355718907603007, + "grad_norm": 1.9910492897033691, + "learning_rate": 4.9931602653905776e-05, + "loss": 5.8413, + "step": 3961 + }, + { + "epoch": 0.023563136359311065, + "grad_norm": 2.530710458755493, + "learning_rate": 4.993156812120004e-05, + "loss": 6.1217, + "step": 3962 + }, + { + "epoch": 0.023569083642592064, + "grad_norm": 2.3089046478271484, + "learning_rate": 4.993153357979095e-05, + "loss": 5.822, + "step": 3963 + }, + { + "epoch": 0.023575030925873063, + "grad_norm": 2.8980624675750732, + "learning_rate": 4.993149902967852e-05, + "loss": 6.3906, + "step": 3964 + }, + { + "epoch": 0.023580978209154058, + "grad_norm": 2.2176012992858887, + "learning_rate": 4.993146447086275e-05, + "loss": 5.9259, + "step": 3965 + }, + { + "epoch": 0.023586925492435056, + "grad_norm": 2.01096773147583, + "learning_rate": 4.993142990334367e-05, + "loss": 6.3141, + "step": 3966 + }, + { + "epoch": 0.02359287277571605, + "grad_norm": 3.4096288681030273, + "learning_rate": 4.993139532712129e-05, + "loss": 6.3165, + "step": 3967 + }, + { + "epoch": 0.02359882005899705, + "grad_norm": 2.20595645904541, + "learning_rate": 4.9931360742195623e-05, + "loss": 6.016, + "step": 3968 + }, + { + "epoch": 0.02360476734227805, + "grad_norm": 3.543301820755005, + "learning_rate": 4.993132614856666e-05, + "loss": 5.722, + "step": 3969 + }, + { + "epoch": 0.023610714625559043, + "grad_norm": 2.82092547416687, + "learning_rate": 4.993129154623444e-05, + "loss": 5.8217, + "step": 3970 + }, + { + "epoch": 0.023616661908840042, + "grad_norm": 2.4585440158843994, + "learning_rate": 4.9931256935198954e-05, + "loss": 6.3298, + "step": 3971 + }, + { + "epoch": 0.02362260919212104, + "grad_norm": 2.104340076446533, + "learning_rate": 4.993122231546024e-05, + "loss": 5.9174, + "step": 3972 + }, + { + "epoch": 0.023628556475402036, + "grad_norm": 2.5130183696746826, + "learning_rate": 4.993118768701828e-05, + "loss": 6.3075, + "step": 3973 + }, + { + "epoch": 0.023634503758683034, + "grad_norm": 2.4567196369171143, + "learning_rate": 4.99311530498731e-05, + "loss": 6.0088, + "step": 3974 + }, + { + "epoch": 0.02364045104196403, + "grad_norm": 2.5174858570098877, + "learning_rate": 4.993111840402471e-05, + "loss": 6.6739, + "step": 3975 + }, + { + "epoch": 0.023646398325245028, + "grad_norm": 2.0032241344451904, + "learning_rate": 4.9931083749473136e-05, + "loss": 5.7052, + "step": 3976 + }, + { + "epoch": 0.023652345608526026, + "grad_norm": 2.9536757469177246, + "learning_rate": 4.993104908621837e-05, + "loss": 5.415, + "step": 3977 + }, + { + "epoch": 0.02365829289180702, + "grad_norm": 2.6650888919830322, + "learning_rate": 4.9931014414260435e-05, + "loss": 5.4333, + "step": 3978 + }, + { + "epoch": 0.02366424017508802, + "grad_norm": 2.3574490547180176, + "learning_rate": 4.9930979733599334e-05, + "loss": 5.5802, + "step": 3979 + }, + { + "epoch": 0.02367018745836902, + "grad_norm": 2.855534791946411, + "learning_rate": 4.99309450442351e-05, + "loss": 5.5131, + "step": 3980 + }, + { + "epoch": 0.023676134741650014, + "grad_norm": 2.430943727493286, + "learning_rate": 4.993091034616772e-05, + "loss": 6.2497, + "step": 3981 + }, + { + "epoch": 0.023682082024931012, + "grad_norm": 2.1671106815338135, + "learning_rate": 4.993087563939722e-05, + "loss": 5.9994, + "step": 3982 + }, + { + "epoch": 0.023688029308212007, + "grad_norm": 2.3268723487854004, + "learning_rate": 4.9930840923923606e-05, + "loss": 5.4779, + "step": 3983 + }, + { + "epoch": 0.023693976591493006, + "grad_norm": 2.3953616619110107, + "learning_rate": 4.993080619974689e-05, + "loss": 5.4044, + "step": 3984 + }, + { + "epoch": 0.023699923874774004, + "grad_norm": 2.043724775314331, + "learning_rate": 4.993077146686709e-05, + "loss": 5.6252, + "step": 3985 + }, + { + "epoch": 0.023705871158055, + "grad_norm": 2.5629520416259766, + "learning_rate": 4.9930736725284224e-05, + "loss": 5.1765, + "step": 3986 + }, + { + "epoch": 0.023711818441335998, + "grad_norm": 2.2148349285125732, + "learning_rate": 4.993070197499828e-05, + "loss": 5.5452, + "step": 3987 + }, + { + "epoch": 0.023717765724616997, + "grad_norm": 2.3913650512695312, + "learning_rate": 4.9930667216009295e-05, + "loss": 6.0882, + "step": 3988 + }, + { + "epoch": 0.02372371300789799, + "grad_norm": 2.619607925415039, + "learning_rate": 4.993063244831727e-05, + "loss": 6.4482, + "step": 3989 + }, + { + "epoch": 0.02372966029117899, + "grad_norm": 2.0585055351257324, + "learning_rate": 4.993059767192222e-05, + "loss": 6.0467, + "step": 3990 + }, + { + "epoch": 0.023735607574459985, + "grad_norm": 2.3380227088928223, + "learning_rate": 4.993056288682416e-05, + "loss": 5.9382, + "step": 3991 + }, + { + "epoch": 0.023741554857740984, + "grad_norm": 2.7252683639526367, + "learning_rate": 4.9930528093023085e-05, + "loss": 6.0444, + "step": 3992 + }, + { + "epoch": 0.023747502141021982, + "grad_norm": 2.333296060562134, + "learning_rate": 4.993049329051903e-05, + "loss": 5.6614, + "step": 3993 + }, + { + "epoch": 0.023753449424302978, + "grad_norm": 2.3571507930755615, + "learning_rate": 4.9930458479312e-05, + "loss": 6.328, + "step": 3994 + }, + { + "epoch": 0.023759396707583976, + "grad_norm": 2.7106499671936035, + "learning_rate": 4.9930423659402005e-05, + "loss": 6.0347, + "step": 3995 + }, + { + "epoch": 0.02376534399086497, + "grad_norm": 3.000009298324585, + "learning_rate": 4.9930388830789043e-05, + "loss": 5.5511, + "step": 3996 + }, + { + "epoch": 0.02377129127414597, + "grad_norm": 2.787912130355835, + "learning_rate": 4.993035399347316e-05, + "loss": 5.2059, + "step": 3997 + }, + { + "epoch": 0.02377723855742697, + "grad_norm": 2.7351326942443848, + "learning_rate": 4.993031914745433e-05, + "loss": 5.2997, + "step": 3998 + }, + { + "epoch": 0.023783185840707963, + "grad_norm": 2.770566701889038, + "learning_rate": 4.993028429273259e-05, + "loss": 5.8871, + "step": 3999 + }, + { + "epoch": 0.023789133123988962, + "grad_norm": 2.9528706073760986, + "learning_rate": 4.993024942930794e-05, + "loss": 5.8177, + "step": 4000 + }, + { + "epoch": 0.02379508040726996, + "grad_norm": 2.543329954147339, + "learning_rate": 4.993021455718041e-05, + "loss": 5.6446, + "step": 4001 + }, + { + "epoch": 0.023801027690550956, + "grad_norm": 2.7284936904907227, + "learning_rate": 4.993017967634999e-05, + "loss": 5.8404, + "step": 4002 + }, + { + "epoch": 0.023806974973831954, + "grad_norm": 2.752187728881836, + "learning_rate": 4.99301447868167e-05, + "loss": 5.6959, + "step": 4003 + }, + { + "epoch": 0.02381292225711295, + "grad_norm": 2.86651611328125, + "learning_rate": 4.993010988858056e-05, + "loss": 5.6329, + "step": 4004 + }, + { + "epoch": 0.023818869540393948, + "grad_norm": 3.9363176822662354, + "learning_rate": 4.9930074981641574e-05, + "loss": 5.31, + "step": 4005 + }, + { + "epoch": 0.023824816823674946, + "grad_norm": 3.41188907623291, + "learning_rate": 4.9930040065999764e-05, + "loss": 5.9905, + "step": 4006 + }, + { + "epoch": 0.02383076410695594, + "grad_norm": 3.4761459827423096, + "learning_rate": 4.9930005141655125e-05, + "loss": 6.0575, + "step": 4007 + }, + { + "epoch": 0.02383671139023694, + "grad_norm": 3.1562440395355225, + "learning_rate": 4.992997020860768e-05, + "loss": 5.9915, + "step": 4008 + }, + { + "epoch": 0.02384265867351794, + "grad_norm": 2.884049415588379, + "learning_rate": 4.992993526685744e-05, + "loss": 5.8051, + "step": 4009 + }, + { + "epoch": 0.023848605956798934, + "grad_norm": 3.3188138008117676, + "learning_rate": 4.992990031640442e-05, + "loss": 5.9637, + "step": 4010 + }, + { + "epoch": 0.023854553240079932, + "grad_norm": 3.2048282623291016, + "learning_rate": 4.992986535724862e-05, + "loss": 6.631, + "step": 4011 + }, + { + "epoch": 0.023860500523360927, + "grad_norm": 2.80204701423645, + "learning_rate": 4.992983038939008e-05, + "loss": 6.0063, + "step": 4012 + }, + { + "epoch": 0.023866447806641926, + "grad_norm": 2.993398427963257, + "learning_rate": 4.992979541282877e-05, + "loss": 5.9778, + "step": 4013 + }, + { + "epoch": 0.023872395089922924, + "grad_norm": 2.7519168853759766, + "learning_rate": 4.9929760427564744e-05, + "loss": 6.4272, + "step": 4014 + }, + { + "epoch": 0.02387834237320392, + "grad_norm": 2.9606168270111084, + "learning_rate": 4.992972543359799e-05, + "loss": 5.5372, + "step": 4015 + }, + { + "epoch": 0.023884289656484918, + "grad_norm": 2.1724514961242676, + "learning_rate": 4.992969043092853e-05, + "loss": 6.3115, + "step": 4016 + }, + { + "epoch": 0.023890236939765917, + "grad_norm": 2.1742191314697266, + "learning_rate": 4.9929655419556365e-05, + "loss": 6.5097, + "step": 4017 + }, + { + "epoch": 0.02389618422304691, + "grad_norm": 1.9729878902435303, + "learning_rate": 4.9929620399481526e-05, + "loss": 6.7061, + "step": 4018 + }, + { + "epoch": 0.02390213150632791, + "grad_norm": 2.6273725032806396, + "learning_rate": 4.9929585370704e-05, + "loss": 6.2838, + "step": 4019 + }, + { + "epoch": 0.023908078789608905, + "grad_norm": 2.5495283603668213, + "learning_rate": 4.9929550333223826e-05, + "loss": 6.1175, + "step": 4020 + }, + { + "epoch": 0.023914026072889904, + "grad_norm": 2.50193452835083, + "learning_rate": 4.9929515287041e-05, + "loss": 5.7689, + "step": 4021 + }, + { + "epoch": 0.023919973356170902, + "grad_norm": 2.402991771697998, + "learning_rate": 4.992948023215553e-05, + "loss": 6.4222, + "step": 4022 + }, + { + "epoch": 0.023925920639451898, + "grad_norm": 2.1722981929779053, + "learning_rate": 4.9929445168567444e-05, + "loss": 6.2335, + "step": 4023 + }, + { + "epoch": 0.023931867922732896, + "grad_norm": 1.6895688772201538, + "learning_rate": 4.992941009627675e-05, + "loss": 6.163, + "step": 4024 + }, + { + "epoch": 0.02393781520601389, + "grad_norm": 1.9944639205932617, + "learning_rate": 4.992937501528345e-05, + "loss": 6.2622, + "step": 4025 + }, + { + "epoch": 0.02394376248929489, + "grad_norm": 2.6157150268554688, + "learning_rate": 4.9929339925587565e-05, + "loss": 6.4582, + "step": 4026 + }, + { + "epoch": 0.023949709772575888, + "grad_norm": 2.021772623062134, + "learning_rate": 4.992930482718911e-05, + "loss": 6.2921, + "step": 4027 + }, + { + "epoch": 0.023955657055856883, + "grad_norm": 2.465402603149414, + "learning_rate": 4.992926972008808e-05, + "loss": 6.6426, + "step": 4028 + }, + { + "epoch": 0.023961604339137882, + "grad_norm": 2.337763547897339, + "learning_rate": 4.99292346042845e-05, + "loss": 6.4988, + "step": 4029 + }, + { + "epoch": 0.02396755162241888, + "grad_norm": 2.400064706802368, + "learning_rate": 4.9929199479778394e-05, + "loss": 6.6666, + "step": 4030 + }, + { + "epoch": 0.023973498905699876, + "grad_norm": 2.4205784797668457, + "learning_rate": 4.9929164346569756e-05, + "loss": 5.8805, + "step": 4031 + }, + { + "epoch": 0.023979446188980874, + "grad_norm": 2.312434673309326, + "learning_rate": 4.9929129204658605e-05, + "loss": 6.5161, + "step": 4032 + }, + { + "epoch": 0.02398539347226187, + "grad_norm": 2.02748966217041, + "learning_rate": 4.9929094054044944e-05, + "loss": 6.1272, + "step": 4033 + }, + { + "epoch": 0.023991340755542868, + "grad_norm": 2.280242443084717, + "learning_rate": 4.992905889472881e-05, + "loss": 5.7217, + "step": 4034 + }, + { + "epoch": 0.023997288038823866, + "grad_norm": 2.3911778926849365, + "learning_rate": 4.992902372671019e-05, + "loss": 5.7441, + "step": 4035 + }, + { + "epoch": 0.02400323532210486, + "grad_norm": 2.1767921447753906, + "learning_rate": 4.99289885499891e-05, + "loss": 5.7212, + "step": 4036 + }, + { + "epoch": 0.02400918260538586, + "grad_norm": 2.3067142963409424, + "learning_rate": 4.992895336456557e-05, + "loss": 5.6689, + "step": 4037 + }, + { + "epoch": 0.02401512988866686, + "grad_norm": 2.1564273834228516, + "learning_rate": 4.992891817043959e-05, + "loss": 6.1445, + "step": 4038 + }, + { + "epoch": 0.024021077171947854, + "grad_norm": 2.4852945804595947, + "learning_rate": 4.9928882967611184e-05, + "loss": 6.1883, + "step": 4039 + }, + { + "epoch": 0.024027024455228852, + "grad_norm": 2.9280812740325928, + "learning_rate": 4.992884775608036e-05, + "loss": 6.097, + "step": 4040 + }, + { + "epoch": 0.024032971738509847, + "grad_norm": 2.3219356536865234, + "learning_rate": 4.992881253584714e-05, + "loss": 6.3163, + "step": 4041 + }, + { + "epoch": 0.024038919021790846, + "grad_norm": 2.672386884689331, + "learning_rate": 4.9928777306911525e-05, + "loss": 5.9615, + "step": 4042 + }, + { + "epoch": 0.024044866305071844, + "grad_norm": 2.5886473655700684, + "learning_rate": 4.992874206927353e-05, + "loss": 6.0114, + "step": 4043 + }, + { + "epoch": 0.02405081358835284, + "grad_norm": 2.991230010986328, + "learning_rate": 4.992870682293318e-05, + "loss": 5.6805, + "step": 4044 + }, + { + "epoch": 0.024056760871633838, + "grad_norm": 2.3270034790039062, + "learning_rate": 4.9928671567890464e-05, + "loss": 5.7503, + "step": 4045 + }, + { + "epoch": 0.024062708154914837, + "grad_norm": 2.591627359390259, + "learning_rate": 4.99286363041454e-05, + "loss": 5.5707, + "step": 4046 + }, + { + "epoch": 0.02406865543819583, + "grad_norm": 2.1936891078948975, + "learning_rate": 4.992860103169802e-05, + "loss": 5.6503, + "step": 4047 + }, + { + "epoch": 0.02407460272147683, + "grad_norm": 2.2928214073181152, + "learning_rate": 4.992856575054832e-05, + "loss": 5.6067, + "step": 4048 + }, + { + "epoch": 0.024080550004757825, + "grad_norm": 2.4503591060638428, + "learning_rate": 4.992853046069632e-05, + "loss": 6.0067, + "step": 4049 + }, + { + "epoch": 0.024086497288038824, + "grad_norm": 2.84260630607605, + "learning_rate": 4.992849516214202e-05, + "loss": 6.4533, + "step": 4050 + }, + { + "epoch": 0.024092444571319822, + "grad_norm": 2.7172651290893555, + "learning_rate": 4.992845985488543e-05, + "loss": 6.4901, + "step": 4051 + }, + { + "epoch": 0.024098391854600817, + "grad_norm": 2.2101316452026367, + "learning_rate": 4.992842453892659e-05, + "loss": 6.3481, + "step": 4052 + }, + { + "epoch": 0.024104339137881816, + "grad_norm": 2.488199234008789, + "learning_rate": 4.992838921426549e-05, + "loss": 6.4893, + "step": 4053 + }, + { + "epoch": 0.02411028642116281, + "grad_norm": 2.3767058849334717, + "learning_rate": 4.992835388090215e-05, + "loss": 5.9828, + "step": 4054 + }, + { + "epoch": 0.02411623370444381, + "grad_norm": 2.3979814052581787, + "learning_rate": 4.992831853883657e-05, + "loss": 5.7607, + "step": 4055 + }, + { + "epoch": 0.024122180987724808, + "grad_norm": 2.766644239425659, + "learning_rate": 4.992828318806877e-05, + "loss": 5.523, + "step": 4056 + }, + { + "epoch": 0.024128128271005803, + "grad_norm": 3.3954427242279053, + "learning_rate": 4.9928247828598775e-05, + "loss": 6.1247, + "step": 4057 + }, + { + "epoch": 0.024134075554286802, + "grad_norm": 3.5597097873687744, + "learning_rate": 4.9928212460426585e-05, + "loss": 6.0877, + "step": 4058 + }, + { + "epoch": 0.0241400228375678, + "grad_norm": 2.8089418411254883, + "learning_rate": 4.992817708355221e-05, + "loss": 5.324, + "step": 4059 + }, + { + "epoch": 0.024145970120848795, + "grad_norm": 2.6756842136383057, + "learning_rate": 4.992814169797566e-05, + "loss": 5.5516, + "step": 4060 + }, + { + "epoch": 0.024151917404129794, + "grad_norm": 2.1218929290771484, + "learning_rate": 4.992810630369696e-05, + "loss": 6.102, + "step": 4061 + }, + { + "epoch": 0.02415786468741079, + "grad_norm": 2.7189652919769287, + "learning_rate": 4.992807090071611e-05, + "loss": 6.4258, + "step": 4062 + }, + { + "epoch": 0.024163811970691788, + "grad_norm": 2.4340744018554688, + "learning_rate": 4.992803548903313e-05, + "loss": 5.8059, + "step": 4063 + }, + { + "epoch": 0.024169759253972786, + "grad_norm": 2.46604323387146, + "learning_rate": 4.992800006864804e-05, + "loss": 5.8963, + "step": 4064 + }, + { + "epoch": 0.02417570653725378, + "grad_norm": 2.1969218254089355, + "learning_rate": 4.9927964639560835e-05, + "loss": 5.7835, + "step": 4065 + }, + { + "epoch": 0.02418165382053478, + "grad_norm": 2.4529223442077637, + "learning_rate": 4.9927929201771535e-05, + "loss": 6.3405, + "step": 4066 + }, + { + "epoch": 0.02418760110381578, + "grad_norm": 2.145331859588623, + "learning_rate": 4.992789375528015e-05, + "loss": 6.14, + "step": 4067 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 2.212646961212158, + "learning_rate": 4.99278583000867e-05, + "loss": 5.8793, + "step": 4068 + }, + { + "epoch": 0.024199495670377772, + "grad_norm": 2.3249876499176025, + "learning_rate": 4.992782283619118e-05, + "loss": 5.8702, + "step": 4069 + }, + { + "epoch": 0.024205442953658767, + "grad_norm": 2.180964946746826, + "learning_rate": 4.9927787363593634e-05, + "loss": 6.216, + "step": 4070 + }, + { + "epoch": 0.024211390236939766, + "grad_norm": 2.5633153915405273, + "learning_rate": 4.992775188229405e-05, + "loss": 6.031, + "step": 4071 + }, + { + "epoch": 0.024217337520220764, + "grad_norm": 2.867342233657837, + "learning_rate": 4.992771639229244e-05, + "loss": 5.9853, + "step": 4072 + }, + { + "epoch": 0.02422328480350176, + "grad_norm": 2.111253023147583, + "learning_rate": 4.992768089358882e-05, + "loss": 5.8404, + "step": 4073 + }, + { + "epoch": 0.024229232086782758, + "grad_norm": 1.9325549602508545, + "learning_rate": 4.992764538618321e-05, + "loss": 6.0175, + "step": 4074 + }, + { + "epoch": 0.024235179370063756, + "grad_norm": 2.721740484237671, + "learning_rate": 4.992760987007561e-05, + "loss": 5.9274, + "step": 4075 + }, + { + "epoch": 0.02424112665334475, + "grad_norm": 3.5240588188171387, + "learning_rate": 4.992757434526604e-05, + "loss": 5.3593, + "step": 4076 + }, + { + "epoch": 0.02424707393662575, + "grad_norm": 2.744248867034912, + "learning_rate": 4.9927538811754516e-05, + "loss": 5.8938, + "step": 4077 + }, + { + "epoch": 0.024253021219906745, + "grad_norm": 2.545384645462036, + "learning_rate": 4.992750326954104e-05, + "loss": 6.2127, + "step": 4078 + }, + { + "epoch": 0.024258968503187744, + "grad_norm": 2.7550806999206543, + "learning_rate": 4.992746771862563e-05, + "loss": 6.0784, + "step": 4079 + }, + { + "epoch": 0.024264915786468742, + "grad_norm": 2.408040761947632, + "learning_rate": 4.9927432159008305e-05, + "loss": 5.5908, + "step": 4080 + }, + { + "epoch": 0.024270863069749737, + "grad_norm": 2.581378698348999, + "learning_rate": 4.9927396590689066e-05, + "loss": 5.4438, + "step": 4081 + }, + { + "epoch": 0.024276810353030736, + "grad_norm": 2.4320218563079834, + "learning_rate": 4.992736101366794e-05, + "loss": 5.6239, + "step": 4082 + }, + { + "epoch": 0.024282757636311735, + "grad_norm": 2.4725472927093506, + "learning_rate": 4.992732542794492e-05, + "loss": 6.237, + "step": 4083 + }, + { + "epoch": 0.02428870491959273, + "grad_norm": 2.3081839084625244, + "learning_rate": 4.992728983352003e-05, + "loss": 5.9917, + "step": 4084 + }, + { + "epoch": 0.024294652202873728, + "grad_norm": 1.9090701341629028, + "learning_rate": 4.9927254230393287e-05, + "loss": 5.9125, + "step": 4085 + }, + { + "epoch": 0.024300599486154723, + "grad_norm": 2.3943240642547607, + "learning_rate": 4.992721861856468e-05, + "loss": 5.3431, + "step": 4086 + }, + { + "epoch": 0.024306546769435722, + "grad_norm": 2.226968765258789, + "learning_rate": 4.992718299803425e-05, + "loss": 5.4328, + "step": 4087 + }, + { + "epoch": 0.02431249405271672, + "grad_norm": 2.238218307495117, + "learning_rate": 4.9927147368801994e-05, + "loss": 5.4877, + "step": 4088 + }, + { + "epoch": 0.024318441335997715, + "grad_norm": 2.216540575027466, + "learning_rate": 4.992711173086794e-05, + "loss": 5.4037, + "step": 4089 + }, + { + "epoch": 0.024324388619278714, + "grad_norm": 2.3136301040649414, + "learning_rate": 4.992707608423208e-05, + "loss": 5.4576, + "step": 4090 + }, + { + "epoch": 0.02433033590255971, + "grad_norm": 2.0434980392456055, + "learning_rate": 4.9927040428894436e-05, + "loss": 5.8044, + "step": 4091 + }, + { + "epoch": 0.024336283185840708, + "grad_norm": 2.7837064266204834, + "learning_rate": 4.992700476485502e-05, + "loss": 6.4183, + "step": 4092 + }, + { + "epoch": 0.024342230469121706, + "grad_norm": 2.580411195755005, + "learning_rate": 4.992696909211384e-05, + "loss": 5.4545, + "step": 4093 + }, + { + "epoch": 0.0243481777524027, + "grad_norm": 2.1215696334838867, + "learning_rate": 4.9926933410670916e-05, + "loss": 5.5629, + "step": 4094 + }, + { + "epoch": 0.0243541250356837, + "grad_norm": 1.9621074199676514, + "learning_rate": 4.992689772052626e-05, + "loss": 5.5248, + "step": 4095 + }, + { + "epoch": 0.0243600723189647, + "grad_norm": 2.1773006916046143, + "learning_rate": 4.992686202167988e-05, + "loss": 5.3285, + "step": 4096 + }, + { + "epoch": 0.024366019602245693, + "grad_norm": 1.9506359100341797, + "learning_rate": 4.992682631413179e-05, + "loss": 5.7989, + "step": 4097 + }, + { + "epoch": 0.024371966885526692, + "grad_norm": 1.9154741764068604, + "learning_rate": 4.9926790597882e-05, + "loss": 5.6029, + "step": 4098 + }, + { + "epoch": 0.024377914168807687, + "grad_norm": 2.2147481441497803, + "learning_rate": 4.9926754872930524e-05, + "loss": 5.5406, + "step": 4099 + }, + { + "epoch": 0.024383861452088686, + "grad_norm": 2.1268460750579834, + "learning_rate": 4.992671913927738e-05, + "loss": 5.6434, + "step": 4100 + }, + { + "epoch": 0.024389808735369684, + "grad_norm": 2.1212456226348877, + "learning_rate": 4.992668339692258e-05, + "loss": 5.6888, + "step": 4101 + }, + { + "epoch": 0.02439575601865068, + "grad_norm": 2.2292001247406006, + "learning_rate": 4.992664764586612e-05, + "loss": 5.3982, + "step": 4102 + }, + { + "epoch": 0.024401703301931678, + "grad_norm": 2.2713210582733154, + "learning_rate": 4.9926611886108035e-05, + "loss": 5.3521, + "step": 4103 + }, + { + "epoch": 0.024407650585212676, + "grad_norm": 2.273437738418579, + "learning_rate": 4.9926576117648314e-05, + "loss": 5.474, + "step": 4104 + }, + { + "epoch": 0.02441359786849367, + "grad_norm": 2.2879083156585693, + "learning_rate": 4.9926540340487e-05, + "loss": 5.4474, + "step": 4105 + }, + { + "epoch": 0.02441954515177467, + "grad_norm": 2.2517430782318115, + "learning_rate": 4.992650455462408e-05, + "loss": 5.5013, + "step": 4106 + }, + { + "epoch": 0.024425492435055665, + "grad_norm": 2.1391677856445312, + "learning_rate": 4.992646876005957e-05, + "loss": 5.3899, + "step": 4107 + }, + { + "epoch": 0.024431439718336664, + "grad_norm": 2.2989962100982666, + "learning_rate": 4.9926432956793494e-05, + "loss": 5.7995, + "step": 4108 + }, + { + "epoch": 0.024437387001617662, + "grad_norm": 2.550706386566162, + "learning_rate": 4.992639714482586e-05, + "loss": 5.6599, + "step": 4109 + }, + { + "epoch": 0.024443334284898657, + "grad_norm": 2.321398973464966, + "learning_rate": 4.992636132415667e-05, + "loss": 5.6852, + "step": 4110 + }, + { + "epoch": 0.024449281568179656, + "grad_norm": 2.300795555114746, + "learning_rate": 4.992632549478595e-05, + "loss": 5.7318, + "step": 4111 + }, + { + "epoch": 0.024455228851460654, + "grad_norm": 2.229156970977783, + "learning_rate": 4.992628965671371e-05, + "loss": 5.6617, + "step": 4112 + }, + { + "epoch": 0.02446117613474165, + "grad_norm": 2.253934144973755, + "learning_rate": 4.992625380993995e-05, + "loss": 5.5762, + "step": 4113 + }, + { + "epoch": 0.024467123418022648, + "grad_norm": 2.0932998657226562, + "learning_rate": 4.992621795446471e-05, + "loss": 5.568, + "step": 4114 + }, + { + "epoch": 0.024473070701303643, + "grad_norm": 2.5969886779785156, + "learning_rate": 4.9926182090287966e-05, + "loss": 5.6626, + "step": 4115 + }, + { + "epoch": 0.02447901798458464, + "grad_norm": 2.5260698795318604, + "learning_rate": 4.992614621740976e-05, + "loss": 5.6333, + "step": 4116 + }, + { + "epoch": 0.02448496526786564, + "grad_norm": 2.0017902851104736, + "learning_rate": 4.992611033583009e-05, + "loss": 5.793, + "step": 4117 + }, + { + "epoch": 0.024490912551146635, + "grad_norm": 2.1847705841064453, + "learning_rate": 4.992607444554898e-05, + "loss": 5.8348, + "step": 4118 + }, + { + "epoch": 0.024496859834427634, + "grad_norm": 2.141007900238037, + "learning_rate": 4.992603854656642e-05, + "loss": 5.7835, + "step": 4119 + }, + { + "epoch": 0.02450280711770863, + "grad_norm": 2.294605255126953, + "learning_rate": 4.992600263888245e-05, + "loss": 5.6615, + "step": 4120 + }, + { + "epoch": 0.024508754400989628, + "grad_norm": 2.433936357498169, + "learning_rate": 4.9925966722497064e-05, + "loss": 5.6479, + "step": 4121 + }, + { + "epoch": 0.024514701684270626, + "grad_norm": 2.1522979736328125, + "learning_rate": 4.992593079741028e-05, + "loss": 5.5761, + "step": 4122 + }, + { + "epoch": 0.02452064896755162, + "grad_norm": 2.141065835952759, + "learning_rate": 4.9925894863622114e-05, + "loss": 5.602, + "step": 4123 + }, + { + "epoch": 0.02452659625083262, + "grad_norm": 2.187838554382324, + "learning_rate": 4.9925858921132576e-05, + "loss": 5.6337, + "step": 4124 + }, + { + "epoch": 0.02453254353411362, + "grad_norm": 2.303027629852295, + "learning_rate": 4.992582296994167e-05, + "loss": 5.6126, + "step": 4125 + }, + { + "epoch": 0.024538490817394613, + "grad_norm": 1.9233589172363281, + "learning_rate": 4.992578701004943e-05, + "loss": 5.5852, + "step": 4126 + }, + { + "epoch": 0.024544438100675612, + "grad_norm": 2.0383386611938477, + "learning_rate": 4.992575104145585e-05, + "loss": 5.6477, + "step": 4127 + }, + { + "epoch": 0.024550385383956607, + "grad_norm": 2.2752933502197266, + "learning_rate": 4.9925715064160946e-05, + "loss": 5.6263, + "step": 4128 + }, + { + "epoch": 0.024556332667237606, + "grad_norm": 2.400083541870117, + "learning_rate": 4.9925679078164734e-05, + "loss": 5.5249, + "step": 4129 + }, + { + "epoch": 0.024562279950518604, + "grad_norm": 2.167365312576294, + "learning_rate": 4.992564308346722e-05, + "loss": 5.7299, + "step": 4130 + }, + { + "epoch": 0.0245682272337996, + "grad_norm": 1.9696096181869507, + "learning_rate": 4.9925607080068426e-05, + "loss": 5.7961, + "step": 4131 + }, + { + "epoch": 0.024574174517080598, + "grad_norm": 2.1817007064819336, + "learning_rate": 4.992557106796836e-05, + "loss": 5.7973, + "step": 4132 + }, + { + "epoch": 0.024580121800361596, + "grad_norm": 2.4329075813293457, + "learning_rate": 4.992553504716704e-05, + "loss": 6.2428, + "step": 4133 + }, + { + "epoch": 0.02458606908364259, + "grad_norm": 2.159193754196167, + "learning_rate": 4.9925499017664464e-05, + "loss": 5.5784, + "step": 4134 + }, + { + "epoch": 0.02459201636692359, + "grad_norm": 2.2614853382110596, + "learning_rate": 4.992546297946066e-05, + "loss": 5.7572, + "step": 4135 + }, + { + "epoch": 0.024597963650204585, + "grad_norm": 2.2874412536621094, + "learning_rate": 4.992542693255563e-05, + "loss": 5.5726, + "step": 4136 + }, + { + "epoch": 0.024603910933485584, + "grad_norm": 2.1634466648101807, + "learning_rate": 4.992539087694939e-05, + "loss": 5.5112, + "step": 4137 + }, + { + "epoch": 0.024609858216766582, + "grad_norm": 2.195528507232666, + "learning_rate": 4.9925354812641955e-05, + "loss": 5.6073, + "step": 4138 + }, + { + "epoch": 0.024615805500047577, + "grad_norm": 2.0328054428100586, + "learning_rate": 4.992531873963334e-05, + "loss": 5.5686, + "step": 4139 + }, + { + "epoch": 0.024621752783328576, + "grad_norm": 2.244218349456787, + "learning_rate": 4.992528265792355e-05, + "loss": 5.6871, + "step": 4140 + }, + { + "epoch": 0.024627700066609574, + "grad_norm": 2.081721544265747, + "learning_rate": 4.992524656751261e-05, + "loss": 5.5327, + "step": 4141 + }, + { + "epoch": 0.02463364734989057, + "grad_norm": 1.9305940866470337, + "learning_rate": 4.992521046840051e-05, + "loss": 5.5265, + "step": 4142 + }, + { + "epoch": 0.024639594633171568, + "grad_norm": 2.624286651611328, + "learning_rate": 4.992517436058728e-05, + "loss": 5.3881, + "step": 4143 + }, + { + "epoch": 0.024645541916452563, + "grad_norm": 2.204803705215454, + "learning_rate": 4.9925138244072935e-05, + "loss": 5.6686, + "step": 4144 + }, + { + "epoch": 0.02465148919973356, + "grad_norm": 2.4664852619171143, + "learning_rate": 4.992510211885748e-05, + "loss": 5.3152, + "step": 4145 + }, + { + "epoch": 0.02465743648301456, + "grad_norm": 2.3428542613983154, + "learning_rate": 4.992506598494093e-05, + "loss": 5.5875, + "step": 4146 + }, + { + "epoch": 0.024663383766295555, + "grad_norm": 2.1902847290039062, + "learning_rate": 4.992502984232329e-05, + "loss": 5.4826, + "step": 4147 + }, + { + "epoch": 0.024669331049576554, + "grad_norm": 2.0401039123535156, + "learning_rate": 4.992499369100459e-05, + "loss": 5.518, + "step": 4148 + }, + { + "epoch": 0.02467527833285755, + "grad_norm": 2.5250306129455566, + "learning_rate": 4.9924957530984825e-05, + "loss": 5.5744, + "step": 4149 + }, + { + "epoch": 0.024681225616138548, + "grad_norm": 1.9975959062576294, + "learning_rate": 4.9924921362264016e-05, + "loss": 5.6834, + "step": 4150 + }, + { + "epoch": 0.024687172899419546, + "grad_norm": 2.047011375427246, + "learning_rate": 4.992488518484217e-05, + "loss": 5.6703, + "step": 4151 + }, + { + "epoch": 0.02469312018270054, + "grad_norm": 2.142411470413208, + "learning_rate": 4.9924848998719314e-05, + "loss": 5.781, + "step": 4152 + }, + { + "epoch": 0.02469906746598154, + "grad_norm": 2.1012768745422363, + "learning_rate": 4.992481280389545e-05, + "loss": 5.618, + "step": 4153 + }, + { + "epoch": 0.024705014749262538, + "grad_norm": 2.4698173999786377, + "learning_rate": 4.9924776600370584e-05, + "loss": 6.4773, + "step": 4154 + }, + { + "epoch": 0.024710962032543533, + "grad_norm": 2.4975368976593018, + "learning_rate": 4.992474038814474e-05, + "loss": 5.2568, + "step": 4155 + }, + { + "epoch": 0.024716909315824532, + "grad_norm": 1.8329259157180786, + "learning_rate": 4.992470416721793e-05, + "loss": 5.775, + "step": 4156 + }, + { + "epoch": 0.024722856599105527, + "grad_norm": 1.9757754802703857, + "learning_rate": 4.992466793759015e-05, + "loss": 5.5408, + "step": 4157 + }, + { + "epoch": 0.024728803882386526, + "grad_norm": 1.8300005197525024, + "learning_rate": 4.9924631699261434e-05, + "loss": 5.5356, + "step": 4158 + }, + { + "epoch": 0.024734751165667524, + "grad_norm": 2.099102735519409, + "learning_rate": 4.992459545223179e-05, + "loss": 5.6811, + "step": 4159 + }, + { + "epoch": 0.02474069844894852, + "grad_norm": 2.000169277191162, + "learning_rate": 4.992455919650123e-05, + "loss": 5.511, + "step": 4160 + }, + { + "epoch": 0.024746645732229518, + "grad_norm": 2.0555150508880615, + "learning_rate": 4.992452293206976e-05, + "loss": 5.7553, + "step": 4161 + }, + { + "epoch": 0.024752593015510516, + "grad_norm": 2.0416486263275146, + "learning_rate": 4.99244866589374e-05, + "loss": 5.6965, + "step": 4162 + }, + { + "epoch": 0.02475854029879151, + "grad_norm": 2.0028059482574463, + "learning_rate": 4.9924450377104146e-05, + "loss": 5.7211, + "step": 4163 + }, + { + "epoch": 0.02476448758207251, + "grad_norm": 2.22377872467041, + "learning_rate": 4.992441408657004e-05, + "loss": 5.6384, + "step": 4164 + }, + { + "epoch": 0.024770434865353505, + "grad_norm": 2.038804531097412, + "learning_rate": 4.9924377787335064e-05, + "loss": 5.6351, + "step": 4165 + }, + { + "epoch": 0.024776382148634504, + "grad_norm": 2.357773542404175, + "learning_rate": 4.992434147939925e-05, + "loss": 5.2791, + "step": 4166 + }, + { + "epoch": 0.024782329431915502, + "grad_norm": 2.1949357986450195, + "learning_rate": 4.992430516276261e-05, + "loss": 5.7389, + "step": 4167 + }, + { + "epoch": 0.024788276715196497, + "grad_norm": 2.1015608310699463, + "learning_rate": 4.992426883742516e-05, + "loss": 5.632, + "step": 4168 + }, + { + "epoch": 0.024794223998477496, + "grad_norm": 2.166201591491699, + "learning_rate": 4.992423250338689e-05, + "loss": 5.5701, + "step": 4169 + }, + { + "epoch": 0.024800171281758494, + "grad_norm": 2.0805492401123047, + "learning_rate": 4.9924196160647836e-05, + "loss": 5.5955, + "step": 4170 + }, + { + "epoch": 0.02480611856503949, + "grad_norm": 1.803229570388794, + "learning_rate": 4.9924159809208e-05, + "loss": 5.6267, + "step": 4171 + }, + { + "epoch": 0.024812065848320488, + "grad_norm": 2.008639335632324, + "learning_rate": 4.9924123449067393e-05, + "loss": 5.6667, + "step": 4172 + }, + { + "epoch": 0.024818013131601483, + "grad_norm": 1.9843655824661255, + "learning_rate": 4.9924087080226044e-05, + "loss": 5.5981, + "step": 4173 + }, + { + "epoch": 0.02482396041488248, + "grad_norm": 2.10270357131958, + "learning_rate": 4.9924050702683946e-05, + "loss": 5.5293, + "step": 4174 + }, + { + "epoch": 0.02482990769816348, + "grad_norm": 2.315976142883301, + "learning_rate": 4.992401431644112e-05, + "loss": 5.6046, + "step": 4175 + }, + { + "epoch": 0.024835854981444475, + "grad_norm": 2.168473482131958, + "learning_rate": 4.992397792149758e-05, + "loss": 5.4271, + "step": 4176 + }, + { + "epoch": 0.024841802264725474, + "grad_norm": 2.1870200634002686, + "learning_rate": 4.9923941517853335e-05, + "loss": 5.6399, + "step": 4177 + }, + { + "epoch": 0.024847749548006472, + "grad_norm": 2.2944717407226562, + "learning_rate": 4.9923905105508394e-05, + "loss": 5.4483, + "step": 4178 + }, + { + "epoch": 0.024853696831287467, + "grad_norm": 2.1662731170654297, + "learning_rate": 4.9923868684462785e-05, + "loss": 5.6773, + "step": 4179 + }, + { + "epoch": 0.024859644114568466, + "grad_norm": 1.7448937892913818, + "learning_rate": 4.992383225471651e-05, + "loss": 5.6097, + "step": 4180 + }, + { + "epoch": 0.02486559139784946, + "grad_norm": 2.3577585220336914, + "learning_rate": 4.9923795816269576e-05, + "loss": 5.5003, + "step": 4181 + }, + { + "epoch": 0.02487153868113046, + "grad_norm": 2.4175360202789307, + "learning_rate": 4.9923759369122e-05, + "loss": 5.4925, + "step": 4182 + }, + { + "epoch": 0.024877485964411458, + "grad_norm": 2.199329137802124, + "learning_rate": 4.992372291327381e-05, + "loss": 5.6239, + "step": 4183 + }, + { + "epoch": 0.024883433247692453, + "grad_norm": 2.054450511932373, + "learning_rate": 4.9923686448724994e-05, + "loss": 5.59, + "step": 4184 + }, + { + "epoch": 0.024889380530973452, + "grad_norm": 2.0354533195495605, + "learning_rate": 4.9923649975475585e-05, + "loss": 5.6092, + "step": 4185 + }, + { + "epoch": 0.024895327814254447, + "grad_norm": 2.0409371852874756, + "learning_rate": 4.9923613493525576e-05, + "loss": 5.5009, + "step": 4186 + }, + { + "epoch": 0.024901275097535445, + "grad_norm": 2.3314719200134277, + "learning_rate": 4.992357700287501e-05, + "loss": 5.5077, + "step": 4187 + }, + { + "epoch": 0.024907222380816444, + "grad_norm": 2.050706386566162, + "learning_rate": 4.9923540503523865e-05, + "loss": 5.5857, + "step": 4188 + }, + { + "epoch": 0.02491316966409744, + "grad_norm": 2.3477721214294434, + "learning_rate": 4.992350399547218e-05, + "loss": 5.5119, + "step": 4189 + }, + { + "epoch": 0.024919116947378438, + "grad_norm": 2.365171194076538, + "learning_rate": 4.992346747871994e-05, + "loss": 5.583, + "step": 4190 + }, + { + "epoch": 0.024925064230659436, + "grad_norm": 1.9642738103866577, + "learning_rate": 4.992343095326719e-05, + "loss": 5.3527, + "step": 4191 + }, + { + "epoch": 0.02493101151394043, + "grad_norm": 2.25437593460083, + "learning_rate": 4.992339441911392e-05, + "loss": 5.4751, + "step": 4192 + }, + { + "epoch": 0.02493695879722143, + "grad_norm": 2.0476715564727783, + "learning_rate": 4.992335787626016e-05, + "loss": 5.5808, + "step": 4193 + }, + { + "epoch": 0.024942906080502425, + "grad_norm": 2.248382329940796, + "learning_rate": 4.992332132470591e-05, + "loss": 5.5771, + "step": 4194 + }, + { + "epoch": 0.024948853363783424, + "grad_norm": 2.279232978820801, + "learning_rate": 4.992328476445118e-05, + "loss": 5.3803, + "step": 4195 + }, + { + "epoch": 0.024954800647064422, + "grad_norm": 2.0171918869018555, + "learning_rate": 4.992324819549599e-05, + "loss": 5.662, + "step": 4196 + }, + { + "epoch": 0.024960747930345417, + "grad_norm": 2.14736008644104, + "learning_rate": 4.992321161784036e-05, + "loss": 5.6422, + "step": 4197 + }, + { + "epoch": 0.024966695213626416, + "grad_norm": 2.1694438457489014, + "learning_rate": 4.9923175031484284e-05, + "loss": 5.4377, + "step": 4198 + }, + { + "epoch": 0.024972642496907414, + "grad_norm": 1.9280356168746948, + "learning_rate": 4.9923138436427784e-05, + "loss": 5.5499, + "step": 4199 + }, + { + "epoch": 0.02497858978018841, + "grad_norm": 2.185974359512329, + "learning_rate": 4.992310183267088e-05, + "loss": 5.6404, + "step": 4200 + }, + { + "epoch": 0.024984537063469408, + "grad_norm": 2.102681875228882, + "learning_rate": 4.9923065220213585e-05, + "loss": 5.5888, + "step": 4201 + }, + { + "epoch": 0.024990484346750403, + "grad_norm": 2.07100772857666, + "learning_rate": 4.99230285990559e-05, + "loss": 5.6473, + "step": 4202 + }, + { + "epoch": 0.0249964316300314, + "grad_norm": 2.088634967803955, + "learning_rate": 4.992299196919784e-05, + "loss": 5.4993, + "step": 4203 + }, + { + "epoch": 0.0250023789133124, + "grad_norm": 2.2086873054504395, + "learning_rate": 4.992295533063942e-05, + "loss": 5.5797, + "step": 4204 + }, + { + "epoch": 0.025008326196593395, + "grad_norm": 2.250753164291382, + "learning_rate": 4.992291868338066e-05, + "loss": 5.5666, + "step": 4205 + }, + { + "epoch": 0.025014273479874394, + "grad_norm": 2.132636785507202, + "learning_rate": 4.992288202742156e-05, + "loss": 5.6715, + "step": 4206 + }, + { + "epoch": 0.025020220763155392, + "grad_norm": 2.8332200050354004, + "learning_rate": 4.992284536276214e-05, + "loss": 4.9687, + "step": 4207 + }, + { + "epoch": 0.025026168046436387, + "grad_norm": 2.345991849899292, + "learning_rate": 4.992280868940241e-05, + "loss": 5.2181, + "step": 4208 + }, + { + "epoch": 0.025032115329717386, + "grad_norm": 2.149568557739258, + "learning_rate": 4.992277200734239e-05, + "loss": 5.5336, + "step": 4209 + }, + { + "epoch": 0.02503806261299838, + "grad_norm": 2.031353235244751, + "learning_rate": 4.992273531658209e-05, + "loss": 5.5779, + "step": 4210 + }, + { + "epoch": 0.02504400989627938, + "grad_norm": 2.217374086380005, + "learning_rate": 4.9922698617121524e-05, + "loss": 5.782, + "step": 4211 + }, + { + "epoch": 0.025049957179560378, + "grad_norm": 2.3629000186920166, + "learning_rate": 4.992266190896069e-05, + "loss": 5.7916, + "step": 4212 + }, + { + "epoch": 0.025055904462841373, + "grad_norm": 2.2439091205596924, + "learning_rate": 4.9922625192099616e-05, + "loss": 5.8002, + "step": 4213 + }, + { + "epoch": 0.025061851746122372, + "grad_norm": 2.1707634925842285, + "learning_rate": 4.992258846653831e-05, + "loss": 6.5789, + "step": 4214 + }, + { + "epoch": 0.025067799029403367, + "grad_norm": 3.1655468940734863, + "learning_rate": 4.992255173227679e-05, + "loss": 6.3867, + "step": 4215 + }, + { + "epoch": 0.025073746312684365, + "grad_norm": 3.1309874057769775, + "learning_rate": 4.992251498931506e-05, + "loss": 6.2682, + "step": 4216 + }, + { + "epoch": 0.025079693595965364, + "grad_norm": 3.2077460289001465, + "learning_rate": 4.992247823765315e-05, + "loss": 5.8593, + "step": 4217 + }, + { + "epoch": 0.02508564087924636, + "grad_norm": 2.2944962978363037, + "learning_rate": 4.992244147729105e-05, + "loss": 5.7994, + "step": 4218 + }, + { + "epoch": 0.025091588162527358, + "grad_norm": 2.2380926609039307, + "learning_rate": 4.9922404708228776e-05, + "loss": 5.7606, + "step": 4219 + }, + { + "epoch": 0.025097535445808356, + "grad_norm": 2.601795196533203, + "learning_rate": 4.992236793046636e-05, + "loss": 5.7585, + "step": 4220 + }, + { + "epoch": 0.02510348272908935, + "grad_norm": 2.494765520095825, + "learning_rate": 4.99223311440038e-05, + "loss": 5.8102, + "step": 4221 + }, + { + "epoch": 0.02510943001237035, + "grad_norm": 2.4690544605255127, + "learning_rate": 4.992229434884111e-05, + "loss": 5.8682, + "step": 4222 + }, + { + "epoch": 0.025115377295651345, + "grad_norm": 2.1011085510253906, + "learning_rate": 4.99222575449783e-05, + "loss": 5.6982, + "step": 4223 + }, + { + "epoch": 0.025121324578932343, + "grad_norm": 2.2298128604888916, + "learning_rate": 4.992222073241539e-05, + "loss": 5.7606, + "step": 4224 + }, + { + "epoch": 0.025127271862213342, + "grad_norm": 1.93464994430542, + "learning_rate": 4.99221839111524e-05, + "loss": 5.7097, + "step": 4225 + }, + { + "epoch": 0.025133219145494337, + "grad_norm": 2.15191650390625, + "learning_rate": 4.9922147081189324e-05, + "loss": 5.5852, + "step": 4226 + }, + { + "epoch": 0.025139166428775336, + "grad_norm": 2.086954355239868, + "learning_rate": 4.992211024252619e-05, + "loss": 5.5871, + "step": 4227 + }, + { + "epoch": 0.025145113712056334, + "grad_norm": 2.212296724319458, + "learning_rate": 4.9922073395162995e-05, + "loss": 5.562, + "step": 4228 + }, + { + "epoch": 0.02515106099533733, + "grad_norm": 2.0786778926849365, + "learning_rate": 4.992203653909977e-05, + "loss": 5.6599, + "step": 4229 + }, + { + "epoch": 0.025157008278618328, + "grad_norm": 2.3243489265441895, + "learning_rate": 4.9921999674336514e-05, + "loss": 5.9791, + "step": 4230 + }, + { + "epoch": 0.025162955561899323, + "grad_norm": 2.1922898292541504, + "learning_rate": 4.9921962800873247e-05, + "loss": 5.7352, + "step": 4231 + }, + { + "epoch": 0.02516890284518032, + "grad_norm": 2.1154398918151855, + "learning_rate": 4.992192591870998e-05, + "loss": 5.6408, + "step": 4232 + }, + { + "epoch": 0.02517485012846132, + "grad_norm": 2.3520143032073975, + "learning_rate": 4.992188902784673e-05, + "loss": 5.6318, + "step": 4233 + }, + { + "epoch": 0.025180797411742315, + "grad_norm": 2.16597580909729, + "learning_rate": 4.99218521282835e-05, + "loss": 5.4978, + "step": 4234 + }, + { + "epoch": 0.025186744695023314, + "grad_norm": 2.2510032653808594, + "learning_rate": 4.992181522002032e-05, + "loss": 5.4863, + "step": 4235 + }, + { + "epoch": 0.025192691978304312, + "grad_norm": 1.9984945058822632, + "learning_rate": 4.9921778303057174e-05, + "loss": 5.7514, + "step": 4236 + }, + { + "epoch": 0.025198639261585307, + "grad_norm": 2.019435167312622, + "learning_rate": 4.9921741377394106e-05, + "loss": 5.6481, + "step": 4237 + }, + { + "epoch": 0.025204586544866306, + "grad_norm": 1.8546136617660522, + "learning_rate": 4.9921704443031114e-05, + "loss": 5.5907, + "step": 4238 + }, + { + "epoch": 0.0252105338281473, + "grad_norm": 2.012821912765503, + "learning_rate": 4.9921667499968214e-05, + "loss": 5.6942, + "step": 4239 + }, + { + "epoch": 0.0252164811114283, + "grad_norm": 2.215322971343994, + "learning_rate": 4.992163054820541e-05, + "loss": 5.6248, + "step": 4240 + }, + { + "epoch": 0.025222428394709298, + "grad_norm": 2.1009631156921387, + "learning_rate": 4.9921593587742726e-05, + "loss": 5.7769, + "step": 4241 + }, + { + "epoch": 0.025228375677990293, + "grad_norm": 2.280970335006714, + "learning_rate": 4.992155661858017e-05, + "loss": 5.4233, + "step": 4242 + }, + { + "epoch": 0.025234322961271292, + "grad_norm": 2.324589729309082, + "learning_rate": 4.992151964071776e-05, + "loss": 5.7138, + "step": 4243 + }, + { + "epoch": 0.025240270244552287, + "grad_norm": 2.01705002784729, + "learning_rate": 4.9921482654155506e-05, + "loss": 5.6946, + "step": 4244 + }, + { + "epoch": 0.025246217527833285, + "grad_norm": 2.0912036895751953, + "learning_rate": 4.9921445658893414e-05, + "loss": 5.8085, + "step": 4245 + }, + { + "epoch": 0.025252164811114284, + "grad_norm": 2.03450870513916, + "learning_rate": 4.99214086549315e-05, + "loss": 5.9129, + "step": 4246 + }, + { + "epoch": 0.02525811209439528, + "grad_norm": 2.1532092094421387, + "learning_rate": 4.9921371642269786e-05, + "loss": 5.708, + "step": 4247 + }, + { + "epoch": 0.025264059377676278, + "grad_norm": 2.2842540740966797, + "learning_rate": 4.992133462090828e-05, + "loss": 5.6693, + "step": 4248 + }, + { + "epoch": 0.025270006660957276, + "grad_norm": 2.0693325996398926, + "learning_rate": 4.9921297590846997e-05, + "loss": 5.7278, + "step": 4249 + }, + { + "epoch": 0.02527595394423827, + "grad_norm": 2.0139124393463135, + "learning_rate": 4.9921260552085934e-05, + "loss": 5.5897, + "step": 4250 + }, + { + "epoch": 0.02528190122751927, + "grad_norm": 2.4587321281433105, + "learning_rate": 4.9921223504625125e-05, + "loss": 5.6884, + "step": 4251 + }, + { + "epoch": 0.025287848510800265, + "grad_norm": 2.062640428543091, + "learning_rate": 4.992118644846457e-05, + "loss": 5.6189, + "step": 4252 + }, + { + "epoch": 0.025293795794081263, + "grad_norm": 1.9889299869537354, + "learning_rate": 4.992114938360429e-05, + "loss": 5.7326, + "step": 4253 + }, + { + "epoch": 0.025299743077362262, + "grad_norm": 2.001913547515869, + "learning_rate": 4.992111231004429e-05, + "loss": 5.6765, + "step": 4254 + }, + { + "epoch": 0.025305690360643257, + "grad_norm": 2.0345358848571777, + "learning_rate": 4.992107522778459e-05, + "loss": 5.5783, + "step": 4255 + }, + { + "epoch": 0.025311637643924256, + "grad_norm": 2.277817487716675, + "learning_rate": 4.9921038136825205e-05, + "loss": 5.6672, + "step": 4256 + }, + { + "epoch": 0.025317584927205254, + "grad_norm": 1.8992491960525513, + "learning_rate": 4.992100103716614e-05, + "loss": 5.532, + "step": 4257 + }, + { + "epoch": 0.02532353221048625, + "grad_norm": 2.202746629714966, + "learning_rate": 4.992096392880741e-05, + "loss": 5.697, + "step": 4258 + }, + { + "epoch": 0.025329479493767248, + "grad_norm": 2.020514488220215, + "learning_rate": 4.992092681174903e-05, + "loss": 5.9102, + "step": 4259 + }, + { + "epoch": 0.025335426777048243, + "grad_norm": 2.0697989463806152, + "learning_rate": 4.9920889685991e-05, + "loss": 5.5165, + "step": 4260 + }, + { + "epoch": 0.02534137406032924, + "grad_norm": 2.619258165359497, + "learning_rate": 4.992085255153336e-05, + "loss": 5.6577, + "step": 4261 + }, + { + "epoch": 0.02534732134361024, + "grad_norm": 2.1612637042999268, + "learning_rate": 4.99208154083761e-05, + "loss": 5.8193, + "step": 4262 + }, + { + "epoch": 0.025353268626891235, + "grad_norm": 1.9237465858459473, + "learning_rate": 4.9920778256519244e-05, + "loss": 5.6533, + "step": 4263 + }, + { + "epoch": 0.025359215910172234, + "grad_norm": 2.164339065551758, + "learning_rate": 4.99207410959628e-05, + "loss": 5.5566, + "step": 4264 + }, + { + "epoch": 0.025365163193453232, + "grad_norm": 2.0753626823425293, + "learning_rate": 4.992070392670678e-05, + "loss": 5.8444, + "step": 4265 + }, + { + "epoch": 0.025371110476734227, + "grad_norm": 1.977522850036621, + "learning_rate": 4.992066674875121e-05, + "loss": 5.6615, + "step": 4266 + }, + { + "epoch": 0.025377057760015226, + "grad_norm": 1.9911431074142456, + "learning_rate": 4.992062956209608e-05, + "loss": 5.6366, + "step": 4267 + }, + { + "epoch": 0.02538300504329622, + "grad_norm": 2.0334808826446533, + "learning_rate": 4.992059236674142e-05, + "loss": 5.8399, + "step": 4268 + }, + { + "epoch": 0.02538895232657722, + "grad_norm": 2.2869162559509277, + "learning_rate": 4.992055516268724e-05, + "loss": 5.7302, + "step": 4269 + }, + { + "epoch": 0.025394899609858218, + "grad_norm": 2.0845389366149902, + "learning_rate": 4.9920517949933556e-05, + "loss": 5.619, + "step": 4270 + }, + { + "epoch": 0.025400846893139213, + "grad_norm": 2.290881633758545, + "learning_rate": 4.9920480728480376e-05, + "loss": 5.5629, + "step": 4271 + }, + { + "epoch": 0.02540679417642021, + "grad_norm": 2.0897767543792725, + "learning_rate": 4.9920443498327706e-05, + "loss": 5.7009, + "step": 4272 + }, + { + "epoch": 0.025412741459701207, + "grad_norm": 1.8389668464660645, + "learning_rate": 4.9920406259475574e-05, + "loss": 5.6359, + "step": 4273 + }, + { + "epoch": 0.025418688742982205, + "grad_norm": 2.0262937545776367, + "learning_rate": 4.992036901192399e-05, + "loss": 5.6707, + "step": 4274 + }, + { + "epoch": 0.025424636026263204, + "grad_norm": 2.04280686378479, + "learning_rate": 4.992033175567295e-05, + "loss": 5.7917, + "step": 4275 + }, + { + "epoch": 0.0254305833095442, + "grad_norm": 2.0945205688476562, + "learning_rate": 4.992029449072249e-05, + "loss": 5.7208, + "step": 4276 + }, + { + "epoch": 0.025436530592825198, + "grad_norm": 1.9662036895751953, + "learning_rate": 4.992025721707261e-05, + "loss": 5.7141, + "step": 4277 + }, + { + "epoch": 0.025442477876106196, + "grad_norm": 2.582284450531006, + "learning_rate": 4.9920219934723316e-05, + "loss": 5.9514, + "step": 4278 + }, + { + "epoch": 0.02544842515938719, + "grad_norm": 1.9792051315307617, + "learning_rate": 4.992018264367464e-05, + "loss": 5.3867, + "step": 4279 + }, + { + "epoch": 0.02545437244266819, + "grad_norm": 2.0107717514038086, + "learning_rate": 4.992014534392658e-05, + "loss": 5.5985, + "step": 4280 + }, + { + "epoch": 0.025460319725949185, + "grad_norm": 2.2035727500915527, + "learning_rate": 4.9920108035479166e-05, + "loss": 5.6356, + "step": 4281 + }, + { + "epoch": 0.025466267009230183, + "grad_norm": 2.1973958015441895, + "learning_rate": 4.992007071833239e-05, + "loss": 5.3557, + "step": 4282 + }, + { + "epoch": 0.025472214292511182, + "grad_norm": 2.031371831893921, + "learning_rate": 4.9920033392486275e-05, + "loss": 5.484, + "step": 4283 + }, + { + "epoch": 0.025478161575792177, + "grad_norm": 1.9966185092926025, + "learning_rate": 4.991999605794084e-05, + "loss": 5.4137, + "step": 4284 + }, + { + "epoch": 0.025484108859073176, + "grad_norm": 1.699460506439209, + "learning_rate": 4.9919958714696085e-05, + "loss": 5.7099, + "step": 4285 + }, + { + "epoch": 0.025490056142354174, + "grad_norm": 2.270535945892334, + "learning_rate": 4.991992136275203e-05, + "loss": 5.6654, + "step": 4286 + }, + { + "epoch": 0.02549600342563517, + "grad_norm": 2.0636515617370605, + "learning_rate": 4.99198840021087e-05, + "loss": 5.6996, + "step": 4287 + }, + { + "epoch": 0.025501950708916168, + "grad_norm": 2.217365026473999, + "learning_rate": 4.991984663276608e-05, + "loss": 5.6148, + "step": 4288 + }, + { + "epoch": 0.025507897992197163, + "grad_norm": 2.182109832763672, + "learning_rate": 4.99198092547242e-05, + "loss": 5.6469, + "step": 4289 + }, + { + "epoch": 0.02551384527547816, + "grad_norm": 1.995924472808838, + "learning_rate": 4.9919771867983084e-05, + "loss": 5.7607, + "step": 4290 + }, + { + "epoch": 0.02551979255875916, + "grad_norm": 1.9308382272720337, + "learning_rate": 4.991973447254272e-05, + "loss": 5.7219, + "step": 4291 + }, + { + "epoch": 0.025525739842040155, + "grad_norm": 2.2675700187683105, + "learning_rate": 4.991969706840315e-05, + "loss": 5.7348, + "step": 4292 + }, + { + "epoch": 0.025531687125321154, + "grad_norm": 2.0441880226135254, + "learning_rate": 4.991965965556435e-05, + "loss": 5.5827, + "step": 4293 + }, + { + "epoch": 0.025537634408602152, + "grad_norm": 2.0111331939697266, + "learning_rate": 4.9919622234026376e-05, + "loss": 5.5355, + "step": 4294 + }, + { + "epoch": 0.025543581691883147, + "grad_norm": 2.214946985244751, + "learning_rate": 4.991958480378921e-05, + "loss": 5.5327, + "step": 4295 + }, + { + "epoch": 0.025549528975164146, + "grad_norm": 1.9673919677734375, + "learning_rate": 4.991954736485287e-05, + "loss": 5.5744, + "step": 4296 + }, + { + "epoch": 0.02555547625844514, + "grad_norm": 2.0662097930908203, + "learning_rate": 4.991950991721738e-05, + "loss": 5.5301, + "step": 4297 + }, + { + "epoch": 0.02556142354172614, + "grad_norm": 2.1912949085235596, + "learning_rate": 4.991947246088274e-05, + "loss": 5.6505, + "step": 4298 + }, + { + "epoch": 0.025567370825007138, + "grad_norm": 2.1073548793792725, + "learning_rate": 4.991943499584898e-05, + "loss": 5.7429, + "step": 4299 + }, + { + "epoch": 0.025573318108288133, + "grad_norm": 2.4015331268310547, + "learning_rate": 4.9919397522116096e-05, + "loss": 5.9959, + "step": 4300 + }, + { + "epoch": 0.02557926539156913, + "grad_norm": 2.5571470260620117, + "learning_rate": 4.99193600396841e-05, + "loss": 5.9058, + "step": 4301 + }, + { + "epoch": 0.02558521267485013, + "grad_norm": 2.148449182510376, + "learning_rate": 4.9919322548553026e-05, + "loss": 5.6298, + "step": 4302 + }, + { + "epoch": 0.025591159958131125, + "grad_norm": 2.3006222248077393, + "learning_rate": 4.991928504872287e-05, + "loss": 5.4854, + "step": 4303 + }, + { + "epoch": 0.025597107241412124, + "grad_norm": 2.2384679317474365, + "learning_rate": 4.9919247540193646e-05, + "loss": 5.7089, + "step": 4304 + }, + { + "epoch": 0.02560305452469312, + "grad_norm": 2.195736885070801, + "learning_rate": 4.9919210022965376e-05, + "loss": 5.986, + "step": 4305 + }, + { + "epoch": 0.025609001807974117, + "grad_norm": 2.3446342945098877, + "learning_rate": 4.991917249703806e-05, + "loss": 5.88, + "step": 4306 + }, + { + "epoch": 0.025614949091255116, + "grad_norm": 2.3800623416900635, + "learning_rate": 4.9919134962411724e-05, + "loss": 5.6897, + "step": 4307 + }, + { + "epoch": 0.02562089637453611, + "grad_norm": 1.8407396078109741, + "learning_rate": 4.991909741908637e-05, + "loss": 5.7359, + "step": 4308 + }, + { + "epoch": 0.02562684365781711, + "grad_norm": 2.3566956520080566, + "learning_rate": 4.9919059867062026e-05, + "loss": 5.5606, + "step": 4309 + }, + { + "epoch": 0.025632790941098105, + "grad_norm": 2.149317741394043, + "learning_rate": 4.991902230633869e-05, + "loss": 5.6966, + "step": 4310 + }, + { + "epoch": 0.025638738224379103, + "grad_norm": 2.3567728996276855, + "learning_rate": 4.991898473691638e-05, + "loss": 5.4694, + "step": 4311 + }, + { + "epoch": 0.025644685507660102, + "grad_norm": 1.9388068914413452, + "learning_rate": 4.9918947158795106e-05, + "loss": 5.5947, + "step": 4312 + }, + { + "epoch": 0.025650632790941097, + "grad_norm": 1.844419002532959, + "learning_rate": 4.9918909571974893e-05, + "loss": 5.6159, + "step": 4313 + }, + { + "epoch": 0.025656580074222095, + "grad_norm": 1.8664250373840332, + "learning_rate": 4.991887197645574e-05, + "loss": 5.7211, + "step": 4314 + }, + { + "epoch": 0.025662527357503094, + "grad_norm": 2.073004961013794, + "learning_rate": 4.991883437223767e-05, + "loss": 5.8873, + "step": 4315 + }, + { + "epoch": 0.02566847464078409, + "grad_norm": 2.316938877105713, + "learning_rate": 4.991879675932068e-05, + "loss": 5.4372, + "step": 4316 + }, + { + "epoch": 0.025674421924065088, + "grad_norm": 2.2646546363830566, + "learning_rate": 4.991875913770481e-05, + "loss": 5.5486, + "step": 4317 + }, + { + "epoch": 0.025680369207346083, + "grad_norm": 2.2417361736297607, + "learning_rate": 4.991872150739005e-05, + "loss": 5.2264, + "step": 4318 + }, + { + "epoch": 0.02568631649062708, + "grad_norm": 2.271566867828369, + "learning_rate": 4.9918683868376437e-05, + "loss": 5.1546, + "step": 4319 + }, + { + "epoch": 0.02569226377390808, + "grad_norm": 2.211650848388672, + "learning_rate": 4.9918646220663954e-05, + "loss": 5.382, + "step": 4320 + }, + { + "epoch": 0.025698211057189075, + "grad_norm": 2.3627288341522217, + "learning_rate": 4.991860856425263e-05, + "loss": 5.6099, + "step": 4321 + }, + { + "epoch": 0.025704158340470074, + "grad_norm": 2.3968141078948975, + "learning_rate": 4.991857089914249e-05, + "loss": 5.3689, + "step": 4322 + }, + { + "epoch": 0.025710105623751072, + "grad_norm": 2.3576786518096924, + "learning_rate": 4.991853322533352e-05, + "loss": 5.4441, + "step": 4323 + }, + { + "epoch": 0.025716052907032067, + "grad_norm": 2.0814530849456787, + "learning_rate": 4.991849554282575e-05, + "loss": 5.6137, + "step": 4324 + }, + { + "epoch": 0.025722000190313066, + "grad_norm": 2.103505849838257, + "learning_rate": 4.991845785161919e-05, + "loss": 5.5518, + "step": 4325 + }, + { + "epoch": 0.02572794747359406, + "grad_norm": 2.188350200653076, + "learning_rate": 4.991842015171386e-05, + "loss": 5.5958, + "step": 4326 + }, + { + "epoch": 0.02573389475687506, + "grad_norm": 2.124088764190674, + "learning_rate": 4.9918382443109766e-05, + "loss": 5.3851, + "step": 4327 + }, + { + "epoch": 0.025739842040156058, + "grad_norm": 2.181466579437256, + "learning_rate": 4.991834472580692e-05, + "loss": 5.4629, + "step": 4328 + }, + { + "epoch": 0.025745789323437053, + "grad_norm": 1.9634013175964355, + "learning_rate": 4.9918306999805344e-05, + "loss": 5.4768, + "step": 4329 + }, + { + "epoch": 0.02575173660671805, + "grad_norm": 2.2046115398406982, + "learning_rate": 4.991826926510503e-05, + "loss": 5.3977, + "step": 4330 + }, + { + "epoch": 0.02575768388999905, + "grad_norm": 1.8660465478897095, + "learning_rate": 4.9918231521706014e-05, + "loss": 5.4837, + "step": 4331 + }, + { + "epoch": 0.025763631173280045, + "grad_norm": 1.9825572967529297, + "learning_rate": 4.99181937696083e-05, + "loss": 5.5158, + "step": 4332 + }, + { + "epoch": 0.025769578456561044, + "grad_norm": 1.9114030599594116, + "learning_rate": 4.9918156008811906e-05, + "loss": 5.3291, + "step": 4333 + }, + { + "epoch": 0.02577552573984204, + "grad_norm": 2.008059024810791, + "learning_rate": 4.9918118239316835e-05, + "loss": 5.2993, + "step": 4334 + }, + { + "epoch": 0.025781473023123037, + "grad_norm": 2.0090153217315674, + "learning_rate": 4.991808046112311e-05, + "loss": 5.2951, + "step": 4335 + }, + { + "epoch": 0.025787420306404036, + "grad_norm": 2.013878345489502, + "learning_rate": 4.991804267423074e-05, + "loss": 5.3491, + "step": 4336 + }, + { + "epoch": 0.02579336758968503, + "grad_norm": 2.1889898777008057, + "learning_rate": 4.9918004878639734e-05, + "loss": 5.2744, + "step": 4337 + }, + { + "epoch": 0.02579931487296603, + "grad_norm": 1.9945006370544434, + "learning_rate": 4.991796707435012e-05, + "loss": 5.5176, + "step": 4338 + }, + { + "epoch": 0.025805262156247025, + "grad_norm": 2.1205811500549316, + "learning_rate": 4.9917929261361894e-05, + "loss": 5.6534, + "step": 4339 + }, + { + "epoch": 0.025811209439528023, + "grad_norm": 2.6607353687286377, + "learning_rate": 4.991789143967508e-05, + "loss": 6.343, + "step": 4340 + }, + { + "epoch": 0.025817156722809022, + "grad_norm": 2.241818904876709, + "learning_rate": 4.991785360928968e-05, + "loss": 5.6774, + "step": 4341 + }, + { + "epoch": 0.025823104006090017, + "grad_norm": 1.9817326068878174, + "learning_rate": 4.9917815770205723e-05, + "loss": 5.7686, + "step": 4342 + }, + { + "epoch": 0.025829051289371015, + "grad_norm": 2.323802947998047, + "learning_rate": 4.991777792242321e-05, + "loss": 5.9564, + "step": 4343 + }, + { + "epoch": 0.025834998572652014, + "grad_norm": 2.3318228721618652, + "learning_rate": 4.991774006594216e-05, + "loss": 5.9057, + "step": 4344 + }, + { + "epoch": 0.02584094585593301, + "grad_norm": 2.032776355743408, + "learning_rate": 4.991770220076258e-05, + "loss": 5.9753, + "step": 4345 + }, + { + "epoch": 0.025846893139214008, + "grad_norm": 2.116837739944458, + "learning_rate": 4.9917664326884495e-05, + "loss": 5.8458, + "step": 4346 + }, + { + "epoch": 0.025852840422495003, + "grad_norm": 2.312878370285034, + "learning_rate": 4.991762644430791e-05, + "loss": 5.5128, + "step": 4347 + }, + { + "epoch": 0.025858787705776, + "grad_norm": 2.3003859519958496, + "learning_rate": 4.991758855303283e-05, + "loss": 5.7192, + "step": 4348 + }, + { + "epoch": 0.025864734989057, + "grad_norm": 1.898258924484253, + "learning_rate": 4.9917550653059286e-05, + "loss": 5.6422, + "step": 4349 + }, + { + "epoch": 0.025870682272337995, + "grad_norm": 1.9477754831314087, + "learning_rate": 4.9917512744387276e-05, + "loss": 5.7885, + "step": 4350 + }, + { + "epoch": 0.025876629555618993, + "grad_norm": 2.479979991912842, + "learning_rate": 4.991747482701683e-05, + "loss": 5.4692, + "step": 4351 + }, + { + "epoch": 0.025882576838899992, + "grad_norm": 2.324336290359497, + "learning_rate": 4.991743690094794e-05, + "loss": 5.4186, + "step": 4352 + }, + { + "epoch": 0.025888524122180987, + "grad_norm": 2.076723337173462, + "learning_rate": 4.9917398966180625e-05, + "loss": 5.4363, + "step": 4353 + }, + { + "epoch": 0.025894471405461986, + "grad_norm": 1.9004534482955933, + "learning_rate": 4.991736102271492e-05, + "loss": 5.6451, + "step": 4354 + }, + { + "epoch": 0.02590041868874298, + "grad_norm": 1.8098558187484741, + "learning_rate": 4.991732307055082e-05, + "loss": 5.8666, + "step": 4355 + }, + { + "epoch": 0.02590636597202398, + "grad_norm": 2.1158571243286133, + "learning_rate": 4.991728510968833e-05, + "loss": 5.5421, + "step": 4356 + }, + { + "epoch": 0.025912313255304978, + "grad_norm": 2.1235690116882324, + "learning_rate": 4.991724714012748e-05, + "loss": 5.9947, + "step": 4357 + }, + { + "epoch": 0.025918260538585973, + "grad_norm": 2.1306662559509277, + "learning_rate": 4.9917209161868276e-05, + "loss": 5.4648, + "step": 4358 + }, + { + "epoch": 0.02592420782186697, + "grad_norm": 1.7927355766296387, + "learning_rate": 4.991717117491073e-05, + "loss": 5.4339, + "step": 4359 + }, + { + "epoch": 0.02593015510514797, + "grad_norm": 2.314069986343384, + "learning_rate": 4.991713317925485e-05, + "loss": 5.5534, + "step": 4360 + }, + { + "epoch": 0.025936102388428965, + "grad_norm": 2.2628493309020996, + "learning_rate": 4.9917095174900665e-05, + "loss": 5.5996, + "step": 4361 + }, + { + "epoch": 0.025942049671709964, + "grad_norm": 2.1669869422912598, + "learning_rate": 4.991705716184818e-05, + "loss": 5.704, + "step": 4362 + }, + { + "epoch": 0.02594799695499096, + "grad_norm": 2.2048137187957764, + "learning_rate": 4.99170191400974e-05, + "loss": 5.6576, + "step": 4363 + }, + { + "epoch": 0.025953944238271957, + "grad_norm": 2.172398328781128, + "learning_rate": 4.991698110964835e-05, + "loss": 5.7254, + "step": 4364 + }, + { + "epoch": 0.025959891521552956, + "grad_norm": 1.9689068794250488, + "learning_rate": 4.9916943070501047e-05, + "loss": 5.7303, + "step": 4365 + }, + { + "epoch": 0.02596583880483395, + "grad_norm": 1.7037044763565063, + "learning_rate": 4.991690502265549e-05, + "loss": 5.6542, + "step": 4366 + }, + { + "epoch": 0.02597178608811495, + "grad_norm": 1.7666655778884888, + "learning_rate": 4.9916866966111695e-05, + "loss": 5.7833, + "step": 4367 + }, + { + "epoch": 0.025977733371395945, + "grad_norm": 2.0178141593933105, + "learning_rate": 4.991682890086968e-05, + "loss": 5.7759, + "step": 4368 + }, + { + "epoch": 0.025983680654676943, + "grad_norm": 1.7989983558654785, + "learning_rate": 4.991679082692946e-05, + "loss": 5.8772, + "step": 4369 + }, + { + "epoch": 0.025989627937957942, + "grad_norm": 1.8004199266433716, + "learning_rate": 4.9916752744291054e-05, + "loss": 5.6145, + "step": 4370 + }, + { + "epoch": 0.025995575221238937, + "grad_norm": 1.837074637413025, + "learning_rate": 4.991671465295446e-05, + "loss": 5.4874, + "step": 4371 + }, + { + "epoch": 0.026001522504519935, + "grad_norm": 1.7436491250991821, + "learning_rate": 4.991667655291969e-05, + "loss": 5.7212, + "step": 4372 + }, + { + "epoch": 0.026007469787800934, + "grad_norm": 1.7802095413208008, + "learning_rate": 4.991663844418678e-05, + "loss": 5.7004, + "step": 4373 + }, + { + "epoch": 0.02601341707108193, + "grad_norm": 2.112487316131592, + "learning_rate": 4.991660032675572e-05, + "loss": 5.5579, + "step": 4374 + }, + { + "epoch": 0.026019364354362928, + "grad_norm": 2.0917413234710693, + "learning_rate": 4.9916562200626535e-05, + "loss": 5.7825, + "step": 4375 + }, + { + "epoch": 0.026025311637643923, + "grad_norm": 1.8323053121566772, + "learning_rate": 4.991652406579924e-05, + "loss": 5.7699, + "step": 4376 + }, + { + "epoch": 0.02603125892092492, + "grad_norm": 1.9480723142623901, + "learning_rate": 4.9916485922273835e-05, + "loss": 5.6591, + "step": 4377 + }, + { + "epoch": 0.02603720620420592, + "grad_norm": 2.000739812850952, + "learning_rate": 4.991644777005035e-05, + "loss": 5.8919, + "step": 4378 + }, + { + "epoch": 0.026043153487486915, + "grad_norm": 2.093573808670044, + "learning_rate": 4.991640960912879e-05, + "loss": 5.7357, + "step": 4379 + }, + { + "epoch": 0.026049100770767913, + "grad_norm": 1.932019591331482, + "learning_rate": 4.991637143950916e-05, + "loss": 5.7268, + "step": 4380 + }, + { + "epoch": 0.026055048054048912, + "grad_norm": 1.820102572441101, + "learning_rate": 4.991633326119149e-05, + "loss": 5.8733, + "step": 4381 + }, + { + "epoch": 0.026060995337329907, + "grad_norm": 1.9091769456863403, + "learning_rate": 4.991629507417578e-05, + "loss": 5.5532, + "step": 4382 + }, + { + "epoch": 0.026066942620610906, + "grad_norm": 2.0037779808044434, + "learning_rate": 4.991625687846205e-05, + "loss": 5.7841, + "step": 4383 + }, + { + "epoch": 0.0260728899038919, + "grad_norm": 1.7106568813323975, + "learning_rate": 4.991621867405032e-05, + "loss": 5.4486, + "step": 4384 + }, + { + "epoch": 0.0260788371871729, + "grad_norm": 1.7802643775939941, + "learning_rate": 4.9916180460940585e-05, + "loss": 5.7494, + "step": 4385 + }, + { + "epoch": 0.026084784470453898, + "grad_norm": 2.089503288269043, + "learning_rate": 4.991614223913288e-05, + "loss": 5.6044, + "step": 4386 + }, + { + "epoch": 0.026090731753734893, + "grad_norm": 2.3315577507019043, + "learning_rate": 4.99161040086272e-05, + "loss": 5.9552, + "step": 4387 + }, + { + "epoch": 0.02609667903701589, + "grad_norm": 2.1202025413513184, + "learning_rate": 4.9916065769423566e-05, + "loss": 5.778, + "step": 4388 + }, + { + "epoch": 0.02610262632029689, + "grad_norm": 2.3448777198791504, + "learning_rate": 4.991602752152199e-05, + "loss": 5.8014, + "step": 4389 + }, + { + "epoch": 0.026108573603577885, + "grad_norm": 2.1613330841064453, + "learning_rate": 4.9915989264922495e-05, + "loss": 5.731, + "step": 4390 + }, + { + "epoch": 0.026114520886858884, + "grad_norm": 2.0314743518829346, + "learning_rate": 4.991595099962507e-05, + "loss": 5.8181, + "step": 4391 + }, + { + "epoch": 0.02612046817013988, + "grad_norm": 2.053994655609131, + "learning_rate": 4.9915912725629755e-05, + "loss": 5.7264, + "step": 4392 + }, + { + "epoch": 0.026126415453420877, + "grad_norm": 1.8720483779907227, + "learning_rate": 4.991587444293655e-05, + "loss": 5.5229, + "step": 4393 + }, + { + "epoch": 0.026132362736701876, + "grad_norm": 1.8745067119598389, + "learning_rate": 4.991583615154547e-05, + "loss": 5.612, + "step": 4394 + }, + { + "epoch": 0.02613831001998287, + "grad_norm": 2.124157428741455, + "learning_rate": 4.9915797851456525e-05, + "loss": 5.7276, + "step": 4395 + }, + { + "epoch": 0.02614425730326387, + "grad_norm": 2.2587873935699463, + "learning_rate": 4.991575954266974e-05, + "loss": 5.7994, + "step": 4396 + }, + { + "epoch": 0.026150204586544865, + "grad_norm": 1.9030078649520874, + "learning_rate": 4.9915721225185116e-05, + "loss": 5.7491, + "step": 4397 + }, + { + "epoch": 0.026156151869825863, + "grad_norm": 2.2278738021850586, + "learning_rate": 4.991568289900267e-05, + "loss": 5.4701, + "step": 4398 + }, + { + "epoch": 0.02616209915310686, + "grad_norm": 2.190974473953247, + "learning_rate": 4.991564456412242e-05, + "loss": 5.6731, + "step": 4399 + }, + { + "epoch": 0.026168046436387857, + "grad_norm": 2.3491454124450684, + "learning_rate": 4.991560622054438e-05, + "loss": 5.4041, + "step": 4400 + }, + { + "epoch": 0.026173993719668855, + "grad_norm": 2.2767796516418457, + "learning_rate": 4.991556786826854e-05, + "loss": 5.9005, + "step": 4401 + }, + { + "epoch": 0.026179941002949854, + "grad_norm": 2.3645145893096924, + "learning_rate": 4.991552950729496e-05, + "loss": 6.3108, + "step": 4402 + }, + { + "epoch": 0.02618588828623085, + "grad_norm": 2.1715476512908936, + "learning_rate": 4.9915491137623605e-05, + "loss": 5.8186, + "step": 4403 + }, + { + "epoch": 0.026191835569511848, + "grad_norm": 2.195758581161499, + "learning_rate": 4.991545275925452e-05, + "loss": 5.692, + "step": 4404 + }, + { + "epoch": 0.026197782852792843, + "grad_norm": 2.1124489307403564, + "learning_rate": 4.9915414372187705e-05, + "loss": 5.6582, + "step": 4405 + }, + { + "epoch": 0.02620373013607384, + "grad_norm": 1.9873831272125244, + "learning_rate": 4.991537597642317e-05, + "loss": 5.6309, + "step": 4406 + }, + { + "epoch": 0.02620967741935484, + "grad_norm": 1.9675770998001099, + "learning_rate": 4.991533757196094e-05, + "loss": 5.7095, + "step": 4407 + }, + { + "epoch": 0.026215624702635835, + "grad_norm": 1.9072648286819458, + "learning_rate": 4.991529915880103e-05, + "loss": 5.6449, + "step": 4408 + }, + { + "epoch": 0.026221571985916833, + "grad_norm": 2.3060495853424072, + "learning_rate": 4.9915260736943435e-05, + "loss": 5.6712, + "step": 4409 + }, + { + "epoch": 0.026227519269197832, + "grad_norm": 2.4438107013702393, + "learning_rate": 4.991522230638819e-05, + "loss": 5.2384, + "step": 4410 + }, + { + "epoch": 0.026233466552478827, + "grad_norm": 1.8102613687515259, + "learning_rate": 4.991518386713529e-05, + "loss": 5.5508, + "step": 4411 + }, + { + "epoch": 0.026239413835759826, + "grad_norm": 2.0226693153381348, + "learning_rate": 4.991514541918476e-05, + "loss": 5.4049, + "step": 4412 + }, + { + "epoch": 0.02624536111904082, + "grad_norm": 2.261418104171753, + "learning_rate": 4.991510696253661e-05, + "loss": 5.3324, + "step": 4413 + }, + { + "epoch": 0.02625130840232182, + "grad_norm": 2.232844352722168, + "learning_rate": 4.9915068497190856e-05, + "loss": 5.2601, + "step": 4414 + }, + { + "epoch": 0.026257255685602818, + "grad_norm": 2.2306487560272217, + "learning_rate": 4.99150300231475e-05, + "loss": 5.3329, + "step": 4415 + }, + { + "epoch": 0.026263202968883813, + "grad_norm": 2.1368730068206787, + "learning_rate": 4.9914991540406574e-05, + "loss": 5.573, + "step": 4416 + }, + { + "epoch": 0.02626915025216481, + "grad_norm": 1.984078288078308, + "learning_rate": 4.991495304896808e-05, + "loss": 5.6518, + "step": 4417 + }, + { + "epoch": 0.02627509753544581, + "grad_norm": 2.0585875511169434, + "learning_rate": 4.9914914548832034e-05, + "loss": 5.7076, + "step": 4418 + }, + { + "epoch": 0.026281044818726805, + "grad_norm": 1.9880858659744263, + "learning_rate": 4.991487603999845e-05, + "loss": 5.6533, + "step": 4419 + }, + { + "epoch": 0.026286992102007804, + "grad_norm": 2.0475687980651855, + "learning_rate": 4.991483752246734e-05, + "loss": 5.6311, + "step": 4420 + }, + { + "epoch": 0.0262929393852888, + "grad_norm": 2.2796714305877686, + "learning_rate": 4.991479899623871e-05, + "loss": 5.364, + "step": 4421 + }, + { + "epoch": 0.026298886668569797, + "grad_norm": 1.8535730838775635, + "learning_rate": 4.991476046131259e-05, + "loss": 5.6153, + "step": 4422 + }, + { + "epoch": 0.026304833951850796, + "grad_norm": 1.97511887550354, + "learning_rate": 4.9914721917688976e-05, + "loss": 5.5682, + "step": 4423 + }, + { + "epoch": 0.02631078123513179, + "grad_norm": 1.9052705764770508, + "learning_rate": 4.99146833653679e-05, + "loss": 5.5609, + "step": 4424 + }, + { + "epoch": 0.02631672851841279, + "grad_norm": 1.9997434616088867, + "learning_rate": 4.9914644804349356e-05, + "loss": 5.6196, + "step": 4425 + }, + { + "epoch": 0.026322675801693788, + "grad_norm": 1.6116957664489746, + "learning_rate": 4.991460623463337e-05, + "loss": 5.5003, + "step": 4426 + }, + { + "epoch": 0.026328623084974783, + "grad_norm": 1.8156583309173584, + "learning_rate": 4.991456765621996e-05, + "loss": 5.5875, + "step": 4427 + }, + { + "epoch": 0.02633457036825578, + "grad_norm": 2.0364272594451904, + "learning_rate": 4.991452906910912e-05, + "loss": 5.6541, + "step": 4428 + }, + { + "epoch": 0.026340517651536777, + "grad_norm": 1.8430767059326172, + "learning_rate": 4.991449047330088e-05, + "loss": 5.5408, + "step": 4429 + }, + { + "epoch": 0.026346464934817775, + "grad_norm": 2.049476385116577, + "learning_rate": 4.991445186879525e-05, + "loss": 5.5644, + "step": 4430 + }, + { + "epoch": 0.026352412218098774, + "grad_norm": 1.9186240434646606, + "learning_rate": 4.991441325559224e-05, + "loss": 5.5977, + "step": 4431 + }, + { + "epoch": 0.02635835950137977, + "grad_norm": 1.80244779586792, + "learning_rate": 4.991437463369186e-05, + "loss": 5.5114, + "step": 4432 + }, + { + "epoch": 0.026364306784660767, + "grad_norm": 2.2580177783966064, + "learning_rate": 4.991433600309414e-05, + "loss": 5.4132, + "step": 4433 + }, + { + "epoch": 0.026370254067941763, + "grad_norm": 2.0970637798309326, + "learning_rate": 4.991429736379908e-05, + "loss": 5.6211, + "step": 4434 + }, + { + "epoch": 0.02637620135122276, + "grad_norm": 2.0690932273864746, + "learning_rate": 4.9914258715806696e-05, + "loss": 5.6511, + "step": 4435 + }, + { + "epoch": 0.02638214863450376, + "grad_norm": 2.063052177429199, + "learning_rate": 4.9914220059117e-05, + "loss": 5.5169, + "step": 4436 + }, + { + "epoch": 0.026388095917784755, + "grad_norm": 1.990708827972412, + "learning_rate": 4.991418139373001e-05, + "loss": 5.5018, + "step": 4437 + }, + { + "epoch": 0.026394043201065753, + "grad_norm": 2.1311633586883545, + "learning_rate": 4.9914142719645736e-05, + "loss": 5.4714, + "step": 4438 + }, + { + "epoch": 0.026399990484346752, + "grad_norm": 1.7688508033752441, + "learning_rate": 4.991410403686419e-05, + "loss": 5.5208, + "step": 4439 + }, + { + "epoch": 0.026405937767627747, + "grad_norm": 2.3486130237579346, + "learning_rate": 4.9914065345385383e-05, + "loss": 5.4524, + "step": 4440 + }, + { + "epoch": 0.026411885050908745, + "grad_norm": 2.0333707332611084, + "learning_rate": 4.9914026645209344e-05, + "loss": 5.6747, + "step": 4441 + }, + { + "epoch": 0.02641783233418974, + "grad_norm": 1.8731845617294312, + "learning_rate": 4.991398793633607e-05, + "loss": 5.6436, + "step": 4442 + }, + { + "epoch": 0.02642377961747074, + "grad_norm": 2.003361225128174, + "learning_rate": 4.991394921876558e-05, + "loss": 5.4628, + "step": 4443 + }, + { + "epoch": 0.026429726900751738, + "grad_norm": 2.1195411682128906, + "learning_rate": 4.991391049249789e-05, + "loss": 5.4096, + "step": 4444 + }, + { + "epoch": 0.026435674184032733, + "grad_norm": 1.857364535331726, + "learning_rate": 4.991387175753301e-05, + "loss": 5.3928, + "step": 4445 + }, + { + "epoch": 0.02644162146731373, + "grad_norm": 1.8932915925979614, + "learning_rate": 4.991383301387095e-05, + "loss": 5.4917, + "step": 4446 + }, + { + "epoch": 0.02644756875059473, + "grad_norm": 1.8743010759353638, + "learning_rate": 4.991379426151174e-05, + "loss": 5.6766, + "step": 4447 + }, + { + "epoch": 0.026453516033875725, + "grad_norm": 1.910796046257019, + "learning_rate": 4.991375550045537e-05, + "loss": 5.4347, + "step": 4448 + }, + { + "epoch": 0.026459463317156724, + "grad_norm": 1.7901744842529297, + "learning_rate": 4.991371673070187e-05, + "loss": 5.5339, + "step": 4449 + }, + { + "epoch": 0.02646541060043772, + "grad_norm": 1.86943519115448, + "learning_rate": 4.9913677952251244e-05, + "loss": 5.4867, + "step": 4450 + }, + { + "epoch": 0.026471357883718717, + "grad_norm": 1.8662208318710327, + "learning_rate": 4.991363916510352e-05, + "loss": 5.4992, + "step": 4451 + }, + { + "epoch": 0.026477305166999716, + "grad_norm": 1.7465355396270752, + "learning_rate": 4.99136003692587e-05, + "loss": 5.5243, + "step": 4452 + }, + { + "epoch": 0.02648325245028071, + "grad_norm": 1.9097687005996704, + "learning_rate": 4.9913561564716794e-05, + "loss": 5.5096, + "step": 4453 + }, + { + "epoch": 0.02648919973356171, + "grad_norm": 2.1472127437591553, + "learning_rate": 4.991352275147783e-05, + "loss": 5.4462, + "step": 4454 + }, + { + "epoch": 0.026495147016842708, + "grad_norm": 2.3966939449310303, + "learning_rate": 4.9913483929541806e-05, + "loss": 5.2938, + "step": 4455 + }, + { + "epoch": 0.026501094300123703, + "grad_norm": 2.1738977432250977, + "learning_rate": 4.991344509890874e-05, + "loss": 5.317, + "step": 4456 + }, + { + "epoch": 0.0265070415834047, + "grad_norm": 1.963944435119629, + "learning_rate": 4.9913406259578646e-05, + "loss": 5.3827, + "step": 4457 + }, + { + "epoch": 0.026512988866685697, + "grad_norm": 2.1755871772766113, + "learning_rate": 4.991336741155155e-05, + "loss": 5.2941, + "step": 4458 + }, + { + "epoch": 0.026518936149966695, + "grad_norm": 2.2461934089660645, + "learning_rate": 4.991332855482744e-05, + "loss": 5.3503, + "step": 4459 + }, + { + "epoch": 0.026524883433247694, + "grad_norm": 2.2270491123199463, + "learning_rate": 4.9913289689406355e-05, + "loss": 5.417, + "step": 4460 + }, + { + "epoch": 0.02653083071652869, + "grad_norm": 2.437074661254883, + "learning_rate": 4.991325081528829e-05, + "loss": 5.1938, + "step": 4461 + }, + { + "epoch": 0.026536777999809687, + "grad_norm": 2.159170150756836, + "learning_rate": 4.991321193247328e-05, + "loss": 5.2088, + "step": 4462 + }, + { + "epoch": 0.026542725283090682, + "grad_norm": 2.08797287940979, + "learning_rate": 4.9913173040961315e-05, + "loss": 5.1829, + "step": 4463 + }, + { + "epoch": 0.02654867256637168, + "grad_norm": 2.805191993713379, + "learning_rate": 4.991313414075242e-05, + "loss": 6.3049, + "step": 4464 + }, + { + "epoch": 0.02655461984965268, + "grad_norm": 2.3204843997955322, + "learning_rate": 4.991309523184661e-05, + "loss": 5.3831, + "step": 4465 + }, + { + "epoch": 0.026560567132933675, + "grad_norm": 2.217212200164795, + "learning_rate": 4.991305631424389e-05, + "loss": 5.4647, + "step": 4466 + }, + { + "epoch": 0.026566514416214673, + "grad_norm": 2.1094207763671875, + "learning_rate": 4.991301738794429e-05, + "loss": 5.5837, + "step": 4467 + }, + { + "epoch": 0.026572461699495672, + "grad_norm": 2.225660562515259, + "learning_rate": 4.99129784529478e-05, + "loss": 5.8316, + "step": 4468 + }, + { + "epoch": 0.026578408982776667, + "grad_norm": 2.361238956451416, + "learning_rate": 4.991293950925446e-05, + "loss": 5.8358, + "step": 4469 + }, + { + "epoch": 0.026584356266057665, + "grad_norm": 2.3268609046936035, + "learning_rate": 4.991290055686426e-05, + "loss": 5.732, + "step": 4470 + }, + { + "epoch": 0.02659030354933866, + "grad_norm": 2.1456172466278076, + "learning_rate": 4.9912861595777226e-05, + "loss": 5.9, + "step": 4471 + }, + { + "epoch": 0.02659625083261966, + "grad_norm": 2.114696979522705, + "learning_rate": 4.991282262599337e-05, + "loss": 5.4464, + "step": 4472 + }, + { + "epoch": 0.026602198115900658, + "grad_norm": 1.7981528043746948, + "learning_rate": 4.9912783647512705e-05, + "loss": 5.5053, + "step": 4473 + }, + { + "epoch": 0.026608145399181653, + "grad_norm": 1.9743404388427734, + "learning_rate": 4.9912744660335245e-05, + "loss": 5.5877, + "step": 4474 + }, + { + "epoch": 0.02661409268246265, + "grad_norm": 2.052358865737915, + "learning_rate": 4.991270566446101e-05, + "loss": 5.5891, + "step": 4475 + }, + { + "epoch": 0.02662003996574365, + "grad_norm": 2.1602041721343994, + "learning_rate": 4.991266665989e-05, + "loss": 5.581, + "step": 4476 + }, + { + "epoch": 0.026625987249024645, + "grad_norm": 2.241586685180664, + "learning_rate": 4.9912627646622236e-05, + "loss": 5.5375, + "step": 4477 + }, + { + "epoch": 0.026631934532305643, + "grad_norm": 1.7952601909637451, + "learning_rate": 4.991258862465773e-05, + "loss": 5.5273, + "step": 4478 + }, + { + "epoch": 0.02663788181558664, + "grad_norm": 1.9767752885818481, + "learning_rate": 4.991254959399649e-05, + "loss": 5.4476, + "step": 4479 + }, + { + "epoch": 0.026643829098867637, + "grad_norm": 1.7997682094573975, + "learning_rate": 4.991251055463855e-05, + "loss": 5.5666, + "step": 4480 + }, + { + "epoch": 0.026649776382148636, + "grad_norm": 2.3247575759887695, + "learning_rate": 4.9912471506583905e-05, + "loss": 5.5247, + "step": 4481 + }, + { + "epoch": 0.02665572366542963, + "grad_norm": 2.165900230407715, + "learning_rate": 4.991243244983257e-05, + "loss": 5.6807, + "step": 4482 + }, + { + "epoch": 0.02666167094871063, + "grad_norm": 2.598257303237915, + "learning_rate": 4.991239338438456e-05, + "loss": 5.6609, + "step": 4483 + }, + { + "epoch": 0.026667618231991628, + "grad_norm": 2.2752041816711426, + "learning_rate": 4.991235431023989e-05, + "loss": 5.5199, + "step": 4484 + }, + { + "epoch": 0.026673565515272623, + "grad_norm": 2.3482842445373535, + "learning_rate": 4.9912315227398586e-05, + "loss": 5.6438, + "step": 4485 + }, + { + "epoch": 0.02667951279855362, + "grad_norm": 2.034403085708618, + "learning_rate": 4.991227613586065e-05, + "loss": 5.6191, + "step": 4486 + }, + { + "epoch": 0.026685460081834617, + "grad_norm": 1.9002971649169922, + "learning_rate": 4.9912237035626085e-05, + "loss": 5.6627, + "step": 4487 + }, + { + "epoch": 0.026691407365115615, + "grad_norm": 2.0305564403533936, + "learning_rate": 4.9912197926694924e-05, + "loss": 5.7009, + "step": 4488 + }, + { + "epoch": 0.026697354648396614, + "grad_norm": 2.029777765274048, + "learning_rate": 4.991215880906717e-05, + "loss": 5.5201, + "step": 4489 + }, + { + "epoch": 0.02670330193167761, + "grad_norm": 1.8889492750167847, + "learning_rate": 4.991211968274283e-05, + "loss": 5.602, + "step": 4490 + }, + { + "epoch": 0.026709249214958607, + "grad_norm": 1.9616930484771729, + "learning_rate": 4.9912080547721934e-05, + "loss": 5.5352, + "step": 4491 + }, + { + "epoch": 0.026715196498239602, + "grad_norm": 2.449345827102661, + "learning_rate": 4.9912041404004485e-05, + "loss": 5.7103, + "step": 4492 + }, + { + "epoch": 0.0267211437815206, + "grad_norm": 2.5550389289855957, + "learning_rate": 4.991200225159051e-05, + "loss": 5.5593, + "step": 4493 + }, + { + "epoch": 0.0267270910648016, + "grad_norm": 2.2512362003326416, + "learning_rate": 4.9911963090479996e-05, + "loss": 5.6329, + "step": 4494 + }, + { + "epoch": 0.026733038348082595, + "grad_norm": 2.0346968173980713, + "learning_rate": 4.9911923920672984e-05, + "loss": 5.5966, + "step": 4495 + }, + { + "epoch": 0.026738985631363593, + "grad_norm": 2.013648271560669, + "learning_rate": 4.991188474216947e-05, + "loss": 5.6532, + "step": 4496 + }, + { + "epoch": 0.026744932914644592, + "grad_norm": 1.8361715078353882, + "learning_rate": 4.9911845554969484e-05, + "loss": 5.519, + "step": 4497 + }, + { + "epoch": 0.026750880197925587, + "grad_norm": 2.1487016677856445, + "learning_rate": 4.991180635907302e-05, + "loss": 5.436, + "step": 4498 + }, + { + "epoch": 0.026756827481206585, + "grad_norm": 2.277714967727661, + "learning_rate": 4.991176715448011e-05, + "loss": 5.3574, + "step": 4499 + }, + { + "epoch": 0.02676277476448758, + "grad_norm": 2.3313565254211426, + "learning_rate": 4.9911727941190755e-05, + "loss": 5.5408, + "step": 4500 + }, + { + "epoch": 0.02676872204776858, + "grad_norm": 2.105825662612915, + "learning_rate": 4.9911688719204975e-05, + "loss": 5.4801, + "step": 4501 + }, + { + "epoch": 0.026774669331049578, + "grad_norm": 2.122138261795044, + "learning_rate": 4.991164948852278e-05, + "loss": 5.4645, + "step": 4502 + }, + { + "epoch": 0.026780616614330573, + "grad_norm": 1.8742777109146118, + "learning_rate": 4.991161024914419e-05, + "loss": 5.5646, + "step": 4503 + }, + { + "epoch": 0.02678656389761157, + "grad_norm": 1.762276291847229, + "learning_rate": 4.991157100106921e-05, + "loss": 5.5672, + "step": 4504 + }, + { + "epoch": 0.02679251118089257, + "grad_norm": 1.9174740314483643, + "learning_rate": 4.9911531744297855e-05, + "loss": 5.4296, + "step": 4505 + }, + { + "epoch": 0.026798458464173565, + "grad_norm": 2.0585875511169434, + "learning_rate": 4.991149247883015e-05, + "loss": 5.5685, + "step": 4506 + }, + { + "epoch": 0.026804405747454563, + "grad_norm": 1.8675988912582397, + "learning_rate": 4.9911453204666094e-05, + "loss": 5.4757, + "step": 4507 + }, + { + "epoch": 0.02681035303073556, + "grad_norm": 2.3117783069610596, + "learning_rate": 4.99114139218057e-05, + "loss": 5.7057, + "step": 4508 + }, + { + "epoch": 0.026816300314016557, + "grad_norm": 2.5439465045928955, + "learning_rate": 4.9911374630249007e-05, + "loss": 5.7393, + "step": 4509 + }, + { + "epoch": 0.026822247597297556, + "grad_norm": 2.4611666202545166, + "learning_rate": 4.9911335329996e-05, + "loss": 5.7215, + "step": 4510 + }, + { + "epoch": 0.02682819488057855, + "grad_norm": 2.1540768146514893, + "learning_rate": 4.99112960210467e-05, + "loss": 5.7059, + "step": 4511 + }, + { + "epoch": 0.02683414216385955, + "grad_norm": 2.1183645725250244, + "learning_rate": 4.9911256703401134e-05, + "loss": 5.4454, + "step": 4512 + }, + { + "epoch": 0.026840089447140548, + "grad_norm": 2.1757540702819824, + "learning_rate": 4.9911217377059295e-05, + "loss": 5.6851, + "step": 4513 + }, + { + "epoch": 0.026846036730421543, + "grad_norm": 2.2770378589630127, + "learning_rate": 4.9911178042021214e-05, + "loss": 5.5957, + "step": 4514 + }, + { + "epoch": 0.02685198401370254, + "grad_norm": 2.320993185043335, + "learning_rate": 4.9911138698286895e-05, + "loss": 5.4674, + "step": 4515 + }, + { + "epoch": 0.026857931296983537, + "grad_norm": 2.2340428829193115, + "learning_rate": 4.991109934585636e-05, + "loss": 5.4514, + "step": 4516 + }, + { + "epoch": 0.026863878580264535, + "grad_norm": 2.1531431674957275, + "learning_rate": 4.991105998472962e-05, + "loss": 5.4386, + "step": 4517 + }, + { + "epoch": 0.026869825863545534, + "grad_norm": 2.1567044258117676, + "learning_rate": 4.991102061490667e-05, + "loss": 5.422, + "step": 4518 + }, + { + "epoch": 0.02687577314682653, + "grad_norm": 2.1181681156158447, + "learning_rate": 4.9910981236387554e-05, + "loss": 5.7214, + "step": 4519 + }, + { + "epoch": 0.026881720430107527, + "grad_norm": 2.3410873413085938, + "learning_rate": 4.9910941849172263e-05, + "loss": 5.8603, + "step": 4520 + }, + { + "epoch": 0.026887667713388526, + "grad_norm": 2.4943840503692627, + "learning_rate": 4.9910902453260824e-05, + "loss": 5.7084, + "step": 4521 + }, + { + "epoch": 0.02689361499666952, + "grad_norm": 2.1420044898986816, + "learning_rate": 4.991086304865325e-05, + "loss": 5.528, + "step": 4522 + }, + { + "epoch": 0.02689956227995052, + "grad_norm": 2.3257980346679688, + "learning_rate": 4.991082363534955e-05, + "loss": 5.6791, + "step": 4523 + }, + { + "epoch": 0.026905509563231515, + "grad_norm": 2.335049867630005, + "learning_rate": 4.991078421334974e-05, + "loss": 5.6184, + "step": 4524 + }, + { + "epoch": 0.026911456846512513, + "grad_norm": 3.7381551265716553, + "learning_rate": 4.9910744782653825e-05, + "loss": 5.954, + "step": 4525 + }, + { + "epoch": 0.02691740412979351, + "grad_norm": 3.1807587146759033, + "learning_rate": 4.991070534326183e-05, + "loss": 6.5662, + "step": 4526 + }, + { + "epoch": 0.026923351413074507, + "grad_norm": 2.378366708755493, + "learning_rate": 4.991066589517376e-05, + "loss": 6.2312, + "step": 4527 + }, + { + "epoch": 0.026929298696355505, + "grad_norm": 2.5797109603881836, + "learning_rate": 4.991062643838964e-05, + "loss": 5.9969, + "step": 4528 + }, + { + "epoch": 0.0269352459796365, + "grad_norm": 2.522815704345703, + "learning_rate": 4.991058697290948e-05, + "loss": 5.919, + "step": 4529 + }, + { + "epoch": 0.0269411932629175, + "grad_norm": 2.5215437412261963, + "learning_rate": 4.991054749873329e-05, + "loss": 5.8812, + "step": 4530 + }, + { + "epoch": 0.026947140546198498, + "grad_norm": 2.1608335971832275, + "learning_rate": 4.991050801586108e-05, + "loss": 5.8381, + "step": 4531 + }, + { + "epoch": 0.026953087829479493, + "grad_norm": 2.37752366065979, + "learning_rate": 4.991046852429288e-05, + "loss": 5.7612, + "step": 4532 + }, + { + "epoch": 0.02695903511276049, + "grad_norm": 2.117534875869751, + "learning_rate": 4.991042902402868e-05, + "loss": 5.6762, + "step": 4533 + }, + { + "epoch": 0.02696498239604149, + "grad_norm": 2.595797061920166, + "learning_rate": 4.991038951506851e-05, + "loss": 6.19, + "step": 4534 + }, + { + "epoch": 0.026970929679322485, + "grad_norm": 2.2216086387634277, + "learning_rate": 4.991034999741239e-05, + "loss": 6.1612, + "step": 4535 + }, + { + "epoch": 0.026976876962603483, + "grad_norm": 2.829735279083252, + "learning_rate": 4.991031047106032e-05, + "loss": 5.6955, + "step": 4536 + }, + { + "epoch": 0.02698282424588448, + "grad_norm": 2.5018115043640137, + "learning_rate": 4.991027093601231e-05, + "loss": 5.4966, + "step": 4537 + }, + { + "epoch": 0.026988771529165477, + "grad_norm": 2.334052085876465, + "learning_rate": 4.9910231392268385e-05, + "loss": 6.1603, + "step": 4538 + }, + { + "epoch": 0.026994718812446476, + "grad_norm": 2.497351884841919, + "learning_rate": 4.991019183982856e-05, + "loss": 6.0128, + "step": 4539 + }, + { + "epoch": 0.02700066609572747, + "grad_norm": 2.2976267337799072, + "learning_rate": 4.991015227869284e-05, + "loss": 5.6696, + "step": 4540 + }, + { + "epoch": 0.02700661337900847, + "grad_norm": 2.6851742267608643, + "learning_rate": 4.991011270886125e-05, + "loss": 5.7996, + "step": 4541 + }, + { + "epoch": 0.027012560662289468, + "grad_norm": 2.531029224395752, + "learning_rate": 4.991007313033379e-05, + "loss": 5.6671, + "step": 4542 + }, + { + "epoch": 0.027018507945570463, + "grad_norm": 2.195552110671997, + "learning_rate": 4.991003354311048e-05, + "loss": 6.3213, + "step": 4543 + }, + { + "epoch": 0.02702445522885146, + "grad_norm": 2.2973361015319824, + "learning_rate": 4.9909993947191336e-05, + "loss": 6.1523, + "step": 4544 + }, + { + "epoch": 0.027030402512132456, + "grad_norm": 2.4766385555267334, + "learning_rate": 4.990995434257637e-05, + "loss": 5.7894, + "step": 4545 + }, + { + "epoch": 0.027036349795413455, + "grad_norm": 2.486384630203247, + "learning_rate": 4.9909914729265606e-05, + "loss": 6.2814, + "step": 4546 + }, + { + "epoch": 0.027042297078694454, + "grad_norm": 2.5054233074188232, + "learning_rate": 4.9909875107259036e-05, + "loss": 6.2859, + "step": 4547 + }, + { + "epoch": 0.02704824436197545, + "grad_norm": 2.70576548576355, + "learning_rate": 4.990983547655669e-05, + "loss": 6.2424, + "step": 4548 + }, + { + "epoch": 0.027054191645256447, + "grad_norm": 3.0937716960906982, + "learning_rate": 4.990979583715858e-05, + "loss": 6.4392, + "step": 4549 + }, + { + "epoch": 0.027060138928537446, + "grad_norm": 2.6290581226348877, + "learning_rate": 4.9909756189064714e-05, + "loss": 6.3565, + "step": 4550 + }, + { + "epoch": 0.02706608621181844, + "grad_norm": 2.5180583000183105, + "learning_rate": 4.990971653227511e-05, + "loss": 6.1482, + "step": 4551 + }, + { + "epoch": 0.02707203349509944, + "grad_norm": 2.6096208095550537, + "learning_rate": 4.990967686678978e-05, + "loss": 5.7724, + "step": 4552 + }, + { + "epoch": 0.027077980778380435, + "grad_norm": 3.187276840209961, + "learning_rate": 4.990963719260874e-05, + "loss": 5.682, + "step": 4553 + }, + { + "epoch": 0.027083928061661433, + "grad_norm": 2.3522419929504395, + "learning_rate": 4.9909597509732006e-05, + "loss": 6.7045, + "step": 4554 + }, + { + "epoch": 0.02708987534494243, + "grad_norm": 2.6016366481781006, + "learning_rate": 4.990955781815959e-05, + "loss": 6.0653, + "step": 4555 + }, + { + "epoch": 0.027095822628223427, + "grad_norm": 2.5409183502197266, + "learning_rate": 4.99095181178915e-05, + "loss": 5.861, + "step": 4556 + }, + { + "epoch": 0.027101769911504425, + "grad_norm": 2.5297863483428955, + "learning_rate": 4.9909478408927754e-05, + "loss": 5.5301, + "step": 4557 + }, + { + "epoch": 0.02710771719478542, + "grad_norm": 2.4822275638580322, + "learning_rate": 4.990943869126837e-05, + "loss": 5.6919, + "step": 4558 + }, + { + "epoch": 0.02711366447806642, + "grad_norm": 2.3832650184631348, + "learning_rate": 4.9909398964913365e-05, + "loss": 5.9589, + "step": 4559 + }, + { + "epoch": 0.027119611761347417, + "grad_norm": 2.0038483142852783, + "learning_rate": 4.9909359229862734e-05, + "loss": 6.1847, + "step": 4560 + }, + { + "epoch": 0.027125559044628413, + "grad_norm": 2.3678700923919678, + "learning_rate": 4.990931948611651e-05, + "loss": 6.4794, + "step": 4561 + }, + { + "epoch": 0.02713150632790941, + "grad_norm": 2.7433204650878906, + "learning_rate": 4.990927973367469e-05, + "loss": 6.6997, + "step": 4562 + }, + { + "epoch": 0.02713745361119041, + "grad_norm": 3.5579798221588135, + "learning_rate": 4.990923997253731e-05, + "loss": 6.1809, + "step": 4563 + }, + { + "epoch": 0.027143400894471405, + "grad_norm": 3.254093647003174, + "learning_rate": 4.990920020270436e-05, + "loss": 6.1446, + "step": 4564 + }, + { + "epoch": 0.027149348177752403, + "grad_norm": 3.0661215782165527, + "learning_rate": 4.990916042417588e-05, + "loss": 6.6702, + "step": 4565 + }, + { + "epoch": 0.0271552954610334, + "grad_norm": 2.641291618347168, + "learning_rate": 4.9909120636951864e-05, + "loss": 6.4951, + "step": 4566 + }, + { + "epoch": 0.027161242744314397, + "grad_norm": 2.050675868988037, + "learning_rate": 4.990908084103233e-05, + "loss": 6.3365, + "step": 4567 + }, + { + "epoch": 0.027167190027595396, + "grad_norm": 2.081108331680298, + "learning_rate": 4.990904103641729e-05, + "loss": 6.1874, + "step": 4568 + }, + { + "epoch": 0.02717313731087639, + "grad_norm": 2.5833899974823, + "learning_rate": 4.9909001223106766e-05, + "loss": 6.0892, + "step": 4569 + }, + { + "epoch": 0.02717908459415739, + "grad_norm": 2.7387397289276123, + "learning_rate": 4.990896140110076e-05, + "loss": 6.1036, + "step": 4570 + }, + { + "epoch": 0.027185031877438388, + "grad_norm": 2.5665578842163086, + "learning_rate": 4.99089215703993e-05, + "loss": 5.9577, + "step": 4571 + }, + { + "epoch": 0.027190979160719383, + "grad_norm": 2.3825178146362305, + "learning_rate": 4.990888173100239e-05, + "loss": 5.9654, + "step": 4572 + }, + { + "epoch": 0.02719692644400038, + "grad_norm": 2.562509059906006, + "learning_rate": 4.990884188291005e-05, + "loss": 6.009, + "step": 4573 + }, + { + "epoch": 0.027202873727281376, + "grad_norm": 2.141941785812378, + "learning_rate": 4.9908802026122284e-05, + "loss": 5.8315, + "step": 4574 + }, + { + "epoch": 0.027208821010562375, + "grad_norm": 2.5348474979400635, + "learning_rate": 4.990876216063912e-05, + "loss": 6.3763, + "step": 4575 + }, + { + "epoch": 0.027214768293843374, + "grad_norm": 2.751520872116089, + "learning_rate": 4.990872228646056e-05, + "loss": 6.5684, + "step": 4576 + }, + { + "epoch": 0.02722071557712437, + "grad_norm": 4.626354694366455, + "learning_rate": 4.990868240358662e-05, + "loss": 6.115, + "step": 4577 + }, + { + "epoch": 0.027226662860405367, + "grad_norm": 2.648479700088501, + "learning_rate": 4.990864251201732e-05, + "loss": 6.0879, + "step": 4578 + }, + { + "epoch": 0.027232610143686366, + "grad_norm": 2.21056866645813, + "learning_rate": 4.990860261175268e-05, + "loss": 6.2923, + "step": 4579 + }, + { + "epoch": 0.02723855742696736, + "grad_norm": 2.3460421562194824, + "learning_rate": 4.9908562702792684e-05, + "loss": 6.4044, + "step": 4580 + }, + { + "epoch": 0.02724450471024836, + "grad_norm": 2.6087262630462646, + "learning_rate": 4.990852278513738e-05, + "loss": 6.5131, + "step": 4581 + }, + { + "epoch": 0.027250451993529354, + "grad_norm": 2.6969377994537354, + "learning_rate": 4.9908482858786765e-05, + "loss": 6.3483, + "step": 4582 + }, + { + "epoch": 0.027256399276810353, + "grad_norm": 2.64043927192688, + "learning_rate": 4.990844292374085e-05, + "loss": 5.8712, + "step": 4583 + }, + { + "epoch": 0.02726234656009135, + "grad_norm": 2.5738205909729004, + "learning_rate": 4.9908402979999654e-05, + "loss": 5.9165, + "step": 4584 + }, + { + "epoch": 0.027268293843372347, + "grad_norm": 2.2725625038146973, + "learning_rate": 4.99083630275632e-05, + "loss": 5.8454, + "step": 4585 + }, + { + "epoch": 0.027274241126653345, + "grad_norm": 2.5911824703216553, + "learning_rate": 4.9908323066431494e-05, + "loss": 5.6729, + "step": 4586 + }, + { + "epoch": 0.02728018840993434, + "grad_norm": 2.6691668033599854, + "learning_rate": 4.9908283096604546e-05, + "loss": 5.7726, + "step": 4587 + }, + { + "epoch": 0.02728613569321534, + "grad_norm": 2.6512796878814697, + "learning_rate": 4.990824311808238e-05, + "loss": 6.1295, + "step": 4588 + }, + { + "epoch": 0.027292082976496337, + "grad_norm": 2.816943645477295, + "learning_rate": 4.9908203130865e-05, + "loss": 5.5172, + "step": 4589 + }, + { + "epoch": 0.027298030259777332, + "grad_norm": 2.6252098083496094, + "learning_rate": 4.990816313495242e-05, + "loss": 5.5955, + "step": 4590 + }, + { + "epoch": 0.02730397754305833, + "grad_norm": 2.3711740970611572, + "learning_rate": 4.990812313034466e-05, + "loss": 5.3348, + "step": 4591 + }, + { + "epoch": 0.02730992482633933, + "grad_norm": 2.355436086654663, + "learning_rate": 4.990808311704173e-05, + "loss": 5.6171, + "step": 4592 + }, + { + "epoch": 0.027315872109620325, + "grad_norm": 2.3344695568084717, + "learning_rate": 4.990804309504365e-05, + "loss": 5.46, + "step": 4593 + }, + { + "epoch": 0.027321819392901323, + "grad_norm": 2.3890786170959473, + "learning_rate": 4.990800306435043e-05, + "loss": 5.5658, + "step": 4594 + }, + { + "epoch": 0.02732776667618232, + "grad_norm": 2.5606987476348877, + "learning_rate": 4.990796302496208e-05, + "loss": 5.4778, + "step": 4595 + }, + { + "epoch": 0.027333713959463317, + "grad_norm": 2.2443172931671143, + "learning_rate": 4.9907922976878616e-05, + "loss": 5.486, + "step": 4596 + }, + { + "epoch": 0.027339661242744315, + "grad_norm": 2.3428351879119873, + "learning_rate": 4.990788292010005e-05, + "loss": 5.3332, + "step": 4597 + }, + { + "epoch": 0.02734560852602531, + "grad_norm": 2.6336300373077393, + "learning_rate": 4.9907842854626406e-05, + "loss": 5.4606, + "step": 4598 + }, + { + "epoch": 0.02735155580930631, + "grad_norm": 2.3052382469177246, + "learning_rate": 4.990780278045769e-05, + "loss": 5.4028, + "step": 4599 + }, + { + "epoch": 0.027357503092587308, + "grad_norm": 2.4661340713500977, + "learning_rate": 4.990776269759392e-05, + "loss": 5.6011, + "step": 4600 + }, + { + "epoch": 0.027363450375868303, + "grad_norm": 2.400527238845825, + "learning_rate": 4.99077226060351e-05, + "loss": 5.5952, + "step": 4601 + }, + { + "epoch": 0.0273693976591493, + "grad_norm": 2.364900827407837, + "learning_rate": 4.9907682505781256e-05, + "loss": 5.2125, + "step": 4602 + }, + { + "epoch": 0.027375344942430296, + "grad_norm": 2.383680820465088, + "learning_rate": 4.99076423968324e-05, + "loss": 5.4253, + "step": 4603 + }, + { + "epoch": 0.027381292225711295, + "grad_norm": 2.681903839111328, + "learning_rate": 4.990760227918854e-05, + "loss": 5.3741, + "step": 4604 + }, + { + "epoch": 0.027387239508992293, + "grad_norm": 2.3454341888427734, + "learning_rate": 4.990756215284969e-05, + "loss": 5.3032, + "step": 4605 + }, + { + "epoch": 0.02739318679227329, + "grad_norm": 2.439807653427124, + "learning_rate": 4.990752201781587e-05, + "loss": 5.3368, + "step": 4606 + }, + { + "epoch": 0.027399134075554287, + "grad_norm": 2.938976764678955, + "learning_rate": 4.990748187408709e-05, + "loss": 6.1251, + "step": 4607 + }, + { + "epoch": 0.027405081358835286, + "grad_norm": 3.353973865509033, + "learning_rate": 4.990744172166337e-05, + "loss": 6.72, + "step": 4608 + }, + { + "epoch": 0.02741102864211628, + "grad_norm": 2.4661834239959717, + "learning_rate": 4.990740156054472e-05, + "loss": 5.7156, + "step": 4609 + }, + { + "epoch": 0.02741697592539728, + "grad_norm": 2.303976058959961, + "learning_rate": 4.990736139073116e-05, + "loss": 5.3493, + "step": 4610 + }, + { + "epoch": 0.027422923208678274, + "grad_norm": 2.4225149154663086, + "learning_rate": 4.990732121222268e-05, + "loss": 5.4831, + "step": 4611 + }, + { + "epoch": 0.027428870491959273, + "grad_norm": 2.5566627979278564, + "learning_rate": 4.990728102501932e-05, + "loss": 5.9159, + "step": 4612 + }, + { + "epoch": 0.02743481777524027, + "grad_norm": 2.64258074760437, + "learning_rate": 4.9907240829121085e-05, + "loss": 6.7137, + "step": 4613 + }, + { + "epoch": 0.027440765058521267, + "grad_norm": 2.967501640319824, + "learning_rate": 4.9907200624527986e-05, + "loss": 6.3333, + "step": 4614 + }, + { + "epoch": 0.027446712341802265, + "grad_norm": 2.6084952354431152, + "learning_rate": 4.990716041124005e-05, + "loss": 6.1201, + "step": 4615 + }, + { + "epoch": 0.02745265962508326, + "grad_norm": 3.0721616744995117, + "learning_rate": 4.990712018925727e-05, + "loss": 6.396, + "step": 4616 + }, + { + "epoch": 0.02745860690836426, + "grad_norm": 2.888263463973999, + "learning_rate": 4.990707995857968e-05, + "loss": 6.0773, + "step": 4617 + }, + { + "epoch": 0.027464554191645257, + "grad_norm": 2.7506093978881836, + "learning_rate": 4.990703971920728e-05, + "loss": 5.9909, + "step": 4618 + }, + { + "epoch": 0.027470501474926252, + "grad_norm": 2.8273298740386963, + "learning_rate": 4.99069994711401e-05, + "loss": 5.9591, + "step": 4619 + }, + { + "epoch": 0.02747644875820725, + "grad_norm": 2.451011896133423, + "learning_rate": 4.990695921437813e-05, + "loss": 6.1596, + "step": 4620 + }, + { + "epoch": 0.02748239604148825, + "grad_norm": 2.762265920639038, + "learning_rate": 4.990691894892141e-05, + "loss": 6.6233, + "step": 4621 + }, + { + "epoch": 0.027488343324769245, + "grad_norm": 2.4570846557617188, + "learning_rate": 4.990687867476994e-05, + "loss": 6.5025, + "step": 4622 + }, + { + "epoch": 0.027494290608050243, + "grad_norm": 3.108992576599121, + "learning_rate": 4.990683839192373e-05, + "loss": 5.921, + "step": 4623 + }, + { + "epoch": 0.02750023789133124, + "grad_norm": 2.887580156326294, + "learning_rate": 4.99067981003828e-05, + "loss": 5.9266, + "step": 4624 + }, + { + "epoch": 0.027506185174612237, + "grad_norm": 3.083556890487671, + "learning_rate": 4.990675780014718e-05, + "loss": 5.765, + "step": 4625 + }, + { + "epoch": 0.027512132457893235, + "grad_norm": 2.710231304168701, + "learning_rate": 4.990671749121685e-05, + "loss": 5.7674, + "step": 4626 + }, + { + "epoch": 0.02751807974117423, + "grad_norm": 2.738926410675049, + "learning_rate": 4.9906677173591845e-05, + "loss": 5.801, + "step": 4627 + }, + { + "epoch": 0.02752402702445523, + "grad_norm": 2.6737735271453857, + "learning_rate": 4.9906636847272176e-05, + "loss": 6.2581, + "step": 4628 + }, + { + "epoch": 0.027529974307736228, + "grad_norm": 2.623969554901123, + "learning_rate": 4.990659651225786e-05, + "loss": 5.5044, + "step": 4629 + }, + { + "epoch": 0.027535921591017223, + "grad_norm": 3.069460153579712, + "learning_rate": 4.990655616854891e-05, + "loss": 5.9639, + "step": 4630 + }, + { + "epoch": 0.02754186887429822, + "grad_norm": 2.6889147758483887, + "learning_rate": 4.990651581614534e-05, + "loss": 6.3032, + "step": 4631 + }, + { + "epoch": 0.027547816157579216, + "grad_norm": 3.5284838676452637, + "learning_rate": 4.990647545504716e-05, + "loss": 6.4104, + "step": 4632 + }, + { + "epoch": 0.027553763440860215, + "grad_norm": 2.326162338256836, + "learning_rate": 4.9906435085254384e-05, + "loss": 6.2593, + "step": 4633 + }, + { + "epoch": 0.027559710724141213, + "grad_norm": 1.946542739868164, + "learning_rate": 4.990639470676703e-05, + "loss": 6.1522, + "step": 4634 + }, + { + "epoch": 0.02756565800742221, + "grad_norm": 2.26143741607666, + "learning_rate": 4.990635431958511e-05, + "loss": 6.0189, + "step": 4635 + }, + { + "epoch": 0.027571605290703207, + "grad_norm": 2.8332626819610596, + "learning_rate": 4.990631392370865e-05, + "loss": 5.6226, + "step": 4636 + }, + { + "epoch": 0.027577552573984206, + "grad_norm": 3.919443130493164, + "learning_rate": 4.9906273519137636e-05, + "loss": 6.2147, + "step": 4637 + }, + { + "epoch": 0.0275834998572652, + "grad_norm": 2.4030275344848633, + "learning_rate": 4.9906233105872115e-05, + "loss": 5.6589, + "step": 4638 + }, + { + "epoch": 0.0275894471405462, + "grad_norm": 2.7806994915008545, + "learning_rate": 4.990619268391207e-05, + "loss": 5.4349, + "step": 4639 + }, + { + "epoch": 0.027595394423827194, + "grad_norm": 2.5759501457214355, + "learning_rate": 4.990615225325754e-05, + "loss": 6.1171, + "step": 4640 + }, + { + "epoch": 0.027601341707108193, + "grad_norm": 2.337517023086548, + "learning_rate": 4.990611181390853e-05, + "loss": 5.5514, + "step": 4641 + }, + { + "epoch": 0.02760728899038919, + "grad_norm": 2.6464250087738037, + "learning_rate": 4.990607136586505e-05, + "loss": 6.1852, + "step": 4642 + }, + { + "epoch": 0.027613236273670187, + "grad_norm": 2.030210256576538, + "learning_rate": 4.9906030909127125e-05, + "loss": 6.0919, + "step": 4643 + }, + { + "epoch": 0.027619183556951185, + "grad_norm": 2.4546520709991455, + "learning_rate": 4.990599044369475e-05, + "loss": 6.3018, + "step": 4644 + }, + { + "epoch": 0.027625130840232184, + "grad_norm": 2.508500337600708, + "learning_rate": 4.990594996956796e-05, + "loss": 5.7933, + "step": 4645 + }, + { + "epoch": 0.02763107812351318, + "grad_norm": 2.3363263607025146, + "learning_rate": 4.990590948674676e-05, + "loss": 6.4252, + "step": 4646 + }, + { + "epoch": 0.027637025406794177, + "grad_norm": 2.794673442840576, + "learning_rate": 4.990586899523116e-05, + "loss": 5.3554, + "step": 4647 + }, + { + "epoch": 0.027642972690075172, + "grad_norm": 2.5396835803985596, + "learning_rate": 4.990582849502118e-05, + "loss": 5.2352, + "step": 4648 + }, + { + "epoch": 0.02764891997335617, + "grad_norm": 2.6878976821899414, + "learning_rate": 4.990578798611684e-05, + "loss": 4.9262, + "step": 4649 + }, + { + "epoch": 0.02765486725663717, + "grad_norm": 2.2143187522888184, + "learning_rate": 4.9905747468518136e-05, + "loss": 6.0785, + "step": 4650 + }, + { + "epoch": 0.027660814539918165, + "grad_norm": 2.6812448501586914, + "learning_rate": 4.9905706942225094e-05, + "loss": 5.1692, + "step": 4651 + }, + { + "epoch": 0.027666761823199163, + "grad_norm": 2.5155227184295654, + "learning_rate": 4.9905666407237726e-05, + "loss": 5.0194, + "step": 4652 + }, + { + "epoch": 0.027672709106480158, + "grad_norm": 2.406834363937378, + "learning_rate": 4.9905625863556047e-05, + "loss": 5.1249, + "step": 4653 + }, + { + "epoch": 0.027678656389761157, + "grad_norm": 3.3666698932647705, + "learning_rate": 4.990558531118008e-05, + "loss": 5.9619, + "step": 4654 + }, + { + "epoch": 0.027684603673042155, + "grad_norm": 2.6557607650756836, + "learning_rate": 4.9905544750109826e-05, + "loss": 5.9118, + "step": 4655 + }, + { + "epoch": 0.02769055095632315, + "grad_norm": 2.60469651222229, + "learning_rate": 4.9905504180345304e-05, + "loss": 6.3746, + "step": 4656 + }, + { + "epoch": 0.02769649823960415, + "grad_norm": 2.5417349338531494, + "learning_rate": 4.9905463601886526e-05, + "loss": 5.6975, + "step": 4657 + }, + { + "epoch": 0.027702445522885148, + "grad_norm": 2.723829984664917, + "learning_rate": 4.990542301473351e-05, + "loss": 5.6189, + "step": 4658 + }, + { + "epoch": 0.027708392806166143, + "grad_norm": 3.0544204711914062, + "learning_rate": 4.990538241888627e-05, + "loss": 5.4999, + "step": 4659 + }, + { + "epoch": 0.02771434008944714, + "grad_norm": 3.0536513328552246, + "learning_rate": 4.990534181434481e-05, + "loss": 6.0636, + "step": 4660 + }, + { + "epoch": 0.027720287372728136, + "grad_norm": 3.0618786811828613, + "learning_rate": 4.990530120110916e-05, + "loss": 6.0856, + "step": 4661 + }, + { + "epoch": 0.027726234656009135, + "grad_norm": 2.6602306365966797, + "learning_rate": 4.9905260579179325e-05, + "loss": 5.8341, + "step": 4662 + }, + { + "epoch": 0.027732181939290133, + "grad_norm": 2.729137420654297, + "learning_rate": 4.990521994855532e-05, + "loss": 6.7052, + "step": 4663 + }, + { + "epoch": 0.02773812922257113, + "grad_norm": 3.0878489017486572, + "learning_rate": 4.990517930923716e-05, + "loss": 6.1308, + "step": 4664 + }, + { + "epoch": 0.027744076505852127, + "grad_norm": 2.524418354034424, + "learning_rate": 4.990513866122486e-05, + "loss": 6.2547, + "step": 4665 + }, + { + "epoch": 0.027750023789133126, + "grad_norm": 2.457075595855713, + "learning_rate": 4.990509800451844e-05, + "loss": 6.6615, + "step": 4666 + }, + { + "epoch": 0.02775597107241412, + "grad_norm": 2.474487543106079, + "learning_rate": 4.9905057339117894e-05, + "loss": 6.63, + "step": 4667 + }, + { + "epoch": 0.02776191835569512, + "grad_norm": 2.611098289489746, + "learning_rate": 4.9905016665023254e-05, + "loss": 5.8232, + "step": 4668 + }, + { + "epoch": 0.027767865638976114, + "grad_norm": 2.8012242317199707, + "learning_rate": 4.990497598223454e-05, + "loss": 5.8478, + "step": 4669 + }, + { + "epoch": 0.027773812922257113, + "grad_norm": 2.706725597381592, + "learning_rate": 4.990493529075174e-05, + "loss": 5.8585, + "step": 4670 + }, + { + "epoch": 0.02777976020553811, + "grad_norm": 2.490032196044922, + "learning_rate": 4.99048945905749e-05, + "loss": 6.2181, + "step": 4671 + }, + { + "epoch": 0.027785707488819106, + "grad_norm": 2.4735357761383057, + "learning_rate": 4.990485388170401e-05, + "loss": 6.2153, + "step": 4672 + }, + { + "epoch": 0.027791654772100105, + "grad_norm": 2.7573068141937256, + "learning_rate": 4.9904813164139094e-05, + "loss": 6.217, + "step": 4673 + }, + { + "epoch": 0.027797602055381104, + "grad_norm": 2.4663283824920654, + "learning_rate": 4.990477243788017e-05, + "loss": 6.4153, + "step": 4674 + }, + { + "epoch": 0.0278035493386621, + "grad_norm": 2.737656831741333, + "learning_rate": 4.9904731702927234e-05, + "loss": 6.5209, + "step": 4675 + }, + { + "epoch": 0.027809496621943097, + "grad_norm": 2.5112721920013428, + "learning_rate": 4.990469095928032e-05, + "loss": 5.979, + "step": 4676 + }, + { + "epoch": 0.027815443905224092, + "grad_norm": 2.6602795124053955, + "learning_rate": 4.990465020693944e-05, + "loss": 5.9206, + "step": 4677 + }, + { + "epoch": 0.02782139118850509, + "grad_norm": 2.460538625717163, + "learning_rate": 4.9904609445904606e-05, + "loss": 5.9855, + "step": 4678 + }, + { + "epoch": 0.02782733847178609, + "grad_norm": 2.750138998031616, + "learning_rate": 4.990456867617582e-05, + "loss": 5.8425, + "step": 4679 + }, + { + "epoch": 0.027833285755067085, + "grad_norm": 2.9843833446502686, + "learning_rate": 4.9904527897753114e-05, + "loss": 6.1385, + "step": 4680 + }, + { + "epoch": 0.027839233038348083, + "grad_norm": 2.586923360824585, + "learning_rate": 4.99044871106365e-05, + "loss": 5.6278, + "step": 4681 + }, + { + "epoch": 0.027845180321629078, + "grad_norm": 3.114211082458496, + "learning_rate": 4.990444631482597e-05, + "loss": 6.1259, + "step": 4682 + }, + { + "epoch": 0.027851127604910077, + "grad_norm": 2.3222453594207764, + "learning_rate": 4.990440551032157e-05, + "loss": 6.3048, + "step": 4683 + }, + { + "epoch": 0.027857074888191075, + "grad_norm": 2.15678334236145, + "learning_rate": 4.99043646971233e-05, + "loss": 5.9082, + "step": 4684 + }, + { + "epoch": 0.02786302217147207, + "grad_norm": 3.946350574493408, + "learning_rate": 4.990432387523116e-05, + "loss": 5.6907, + "step": 4685 + }, + { + "epoch": 0.02786896945475307, + "grad_norm": 2.9612419605255127, + "learning_rate": 4.9904283044645185e-05, + "loss": 5.3894, + "step": 4686 + }, + { + "epoch": 0.027874916738034067, + "grad_norm": 2.3602261543273926, + "learning_rate": 4.990424220536538e-05, + "loss": 6.0716, + "step": 4687 + }, + { + "epoch": 0.027880864021315063, + "grad_norm": 2.822300672531128, + "learning_rate": 4.990420135739177e-05, + "loss": 5.9788, + "step": 4688 + }, + { + "epoch": 0.02788681130459606, + "grad_norm": 2.766280174255371, + "learning_rate": 4.990416050072435e-05, + "loss": 5.9945, + "step": 4689 + }, + { + "epoch": 0.027892758587877056, + "grad_norm": 2.810359239578247, + "learning_rate": 4.990411963536315e-05, + "loss": 6.0598, + "step": 4690 + }, + { + "epoch": 0.027898705871158055, + "grad_norm": 2.510014295578003, + "learning_rate": 4.990407876130818e-05, + "loss": 6.1793, + "step": 4691 + }, + { + "epoch": 0.027904653154439053, + "grad_norm": 2.5394086837768555, + "learning_rate": 4.990403787855945e-05, + "loss": 6.1309, + "step": 4692 + }, + { + "epoch": 0.02791060043772005, + "grad_norm": 2.922084093093872, + "learning_rate": 4.990399698711698e-05, + "loss": 6.1956, + "step": 4693 + }, + { + "epoch": 0.027916547721001047, + "grad_norm": 3.6614181995391846, + "learning_rate": 4.9903956086980785e-05, + "loss": 6.535, + "step": 4694 + }, + { + "epoch": 0.027922495004282046, + "grad_norm": 3.3680684566497803, + "learning_rate": 4.990391517815087e-05, + "loss": 6.5729, + "step": 4695 + }, + { + "epoch": 0.02792844228756304, + "grad_norm": 2.522193431854248, + "learning_rate": 4.990387426062726e-05, + "loss": 5.9406, + "step": 4696 + }, + { + "epoch": 0.02793438957084404, + "grad_norm": 2.9665534496307373, + "learning_rate": 4.990383333440996e-05, + "loss": 6.0281, + "step": 4697 + }, + { + "epoch": 0.027940336854125034, + "grad_norm": 2.643218755722046, + "learning_rate": 4.9903792399498996e-05, + "loss": 5.8965, + "step": 4698 + }, + { + "epoch": 0.027946284137406033, + "grad_norm": 2.498765230178833, + "learning_rate": 4.990375145589436e-05, + "loss": 6.0975, + "step": 4699 + }, + { + "epoch": 0.02795223142068703, + "grad_norm": 4.380255699157715, + "learning_rate": 4.99037105035961e-05, + "loss": 6.6298, + "step": 4700 + }, + { + "epoch": 0.027958178703968026, + "grad_norm": 3.925454616546631, + "learning_rate": 4.990366954260421e-05, + "loss": 6.5742, + "step": 4701 + }, + { + "epoch": 0.027964125987249025, + "grad_norm": 2.5388591289520264, + "learning_rate": 4.99036285729187e-05, + "loss": 6.6102, + "step": 4702 + }, + { + "epoch": 0.027970073270530024, + "grad_norm": 2.6793510913848877, + "learning_rate": 4.9903587594539594e-05, + "loss": 6.4265, + "step": 4703 + }, + { + "epoch": 0.02797602055381102, + "grad_norm": 2.8652729988098145, + "learning_rate": 4.9903546607466903e-05, + "loss": 6.4567, + "step": 4704 + }, + { + "epoch": 0.027981967837092017, + "grad_norm": 2.936021089553833, + "learning_rate": 4.990350561170063e-05, + "loss": 6.404, + "step": 4705 + }, + { + "epoch": 0.027987915120373012, + "grad_norm": 3.256253719329834, + "learning_rate": 4.9903464607240816e-05, + "loss": 6.2291, + "step": 4706 + }, + { + "epoch": 0.02799386240365401, + "grad_norm": 2.8268187046051025, + "learning_rate": 4.990342359408745e-05, + "loss": 6.2582, + "step": 4707 + }, + { + "epoch": 0.02799980968693501, + "grad_norm": 2.5889041423797607, + "learning_rate": 4.9903382572240556e-05, + "loss": 6.3325, + "step": 4708 + }, + { + "epoch": 0.028005756970216004, + "grad_norm": 2.635388135910034, + "learning_rate": 4.9903341541700154e-05, + "loss": 6.1256, + "step": 4709 + }, + { + "epoch": 0.028011704253497003, + "grad_norm": 2.562976360321045, + "learning_rate": 4.990330050246625e-05, + "loss": 5.9333, + "step": 4710 + }, + { + "epoch": 0.028017651536777998, + "grad_norm": 3.488809585571289, + "learning_rate": 4.990325945453887e-05, + "loss": 6.3651, + "step": 4711 + }, + { + "epoch": 0.028023598820058997, + "grad_norm": 2.963324546813965, + "learning_rate": 4.9903218397918e-05, + "loss": 6.718, + "step": 4712 + }, + { + "epoch": 0.028029546103339995, + "grad_norm": 2.4070823192596436, + "learning_rate": 4.990317733260369e-05, + "loss": 6.2502, + "step": 4713 + }, + { + "epoch": 0.02803549338662099, + "grad_norm": 2.711190938949585, + "learning_rate": 4.9903136258595925e-05, + "loss": 6.0397, + "step": 4714 + }, + { + "epoch": 0.02804144066990199, + "grad_norm": 2.466150999069214, + "learning_rate": 4.9903095175894746e-05, + "loss": 5.9344, + "step": 4715 + }, + { + "epoch": 0.028047387953182987, + "grad_norm": 2.4558048248291016, + "learning_rate": 4.990305408450014e-05, + "loss": 6.1121, + "step": 4716 + }, + { + "epoch": 0.028053335236463982, + "grad_norm": 2.4023051261901855, + "learning_rate": 4.990301298441215e-05, + "loss": 6.0202, + "step": 4717 + }, + { + "epoch": 0.02805928251974498, + "grad_norm": 3.118098258972168, + "learning_rate": 4.9902971875630765e-05, + "loss": 6.5365, + "step": 4718 + }, + { + "epoch": 0.028065229803025976, + "grad_norm": 2.3716087341308594, + "learning_rate": 4.990293075815602e-05, + "loss": 6.1382, + "step": 4719 + }, + { + "epoch": 0.028071177086306975, + "grad_norm": 2.4663496017456055, + "learning_rate": 4.990288963198791e-05, + "loss": 5.9804, + "step": 4720 + }, + { + "epoch": 0.028077124369587973, + "grad_norm": 2.2623326778411865, + "learning_rate": 4.9902848497126466e-05, + "loss": 5.9666, + "step": 4721 + }, + { + "epoch": 0.02808307165286897, + "grad_norm": 2.4884161949157715, + "learning_rate": 4.990280735357168e-05, + "loss": 6.0203, + "step": 4722 + }, + { + "epoch": 0.028089018936149967, + "grad_norm": 2.6154520511627197, + "learning_rate": 4.990276620132359e-05, + "loss": 5.9191, + "step": 4723 + }, + { + "epoch": 0.028094966219430965, + "grad_norm": 2.692396879196167, + "learning_rate": 4.990272504038221e-05, + "loss": 6.5314, + "step": 4724 + }, + { + "epoch": 0.02810091350271196, + "grad_norm": 2.483306407928467, + "learning_rate": 4.990268387074754e-05, + "loss": 6.6522, + "step": 4725 + }, + { + "epoch": 0.02810686078599296, + "grad_norm": 3.2098593711853027, + "learning_rate": 4.99026426924196e-05, + "loss": 5.8712, + "step": 4726 + }, + { + "epoch": 0.028112808069273954, + "grad_norm": 2.7335867881774902, + "learning_rate": 4.99026015053984e-05, + "loss": 5.7678, + "step": 4727 + }, + { + "epoch": 0.028118755352554953, + "grad_norm": 2.7587473392486572, + "learning_rate": 4.990256030968396e-05, + "loss": 6.4233, + "step": 4728 + }, + { + "epoch": 0.02812470263583595, + "grad_norm": 2.7686030864715576, + "learning_rate": 4.99025191052763e-05, + "loss": 6.4572, + "step": 4729 + }, + { + "epoch": 0.028130649919116946, + "grad_norm": 2.755916118621826, + "learning_rate": 4.990247789217543e-05, + "loss": 5.9858, + "step": 4730 + }, + { + "epoch": 0.028136597202397945, + "grad_norm": 2.614316463470459, + "learning_rate": 4.990243667038135e-05, + "loss": 6.2315, + "step": 4731 + }, + { + "epoch": 0.028142544485678943, + "grad_norm": 2.0796027183532715, + "learning_rate": 4.990239543989409e-05, + "loss": 6.236, + "step": 4732 + }, + { + "epoch": 0.02814849176895994, + "grad_norm": 2.623412847518921, + "learning_rate": 4.9902354200713665e-05, + "loss": 6.3962, + "step": 4733 + }, + { + "epoch": 0.028154439052240937, + "grad_norm": 2.2746191024780273, + "learning_rate": 4.9902312952840086e-05, + "loss": 5.9101, + "step": 4734 + }, + { + "epoch": 0.028160386335521932, + "grad_norm": 2.102444887161255, + "learning_rate": 4.990227169627336e-05, + "loss": 6.4652, + "step": 4735 + }, + { + "epoch": 0.02816633361880293, + "grad_norm": 2.7720580101013184, + "learning_rate": 4.990223043101352e-05, + "loss": 5.8981, + "step": 4736 + }, + { + "epoch": 0.02817228090208393, + "grad_norm": 2.4479453563690186, + "learning_rate": 4.9902189157060564e-05, + "loss": 6.3554, + "step": 4737 + }, + { + "epoch": 0.028178228185364924, + "grad_norm": 2.7894740104675293, + "learning_rate": 4.990214787441451e-05, + "loss": 6.0017, + "step": 4738 + }, + { + "epoch": 0.028184175468645923, + "grad_norm": 2.869884490966797, + "learning_rate": 4.990210658307537e-05, + "loss": 5.9419, + "step": 4739 + }, + { + "epoch": 0.028190122751926918, + "grad_norm": 2.262723207473755, + "learning_rate": 4.990206528304316e-05, + "loss": 6.172, + "step": 4740 + }, + { + "epoch": 0.028196070035207917, + "grad_norm": 2.179358720779419, + "learning_rate": 4.99020239743179e-05, + "loss": 6.5204, + "step": 4741 + }, + { + "epoch": 0.028202017318488915, + "grad_norm": 2.085179328918457, + "learning_rate": 4.9901982656899606e-05, + "loss": 6.3972, + "step": 4742 + }, + { + "epoch": 0.02820796460176991, + "grad_norm": 1.657567024230957, + "learning_rate": 4.990194133078828e-05, + "loss": 6.4199, + "step": 4743 + }, + { + "epoch": 0.02821391188505091, + "grad_norm": 1.8054349422454834, + "learning_rate": 4.990189999598395e-05, + "loss": 6.3768, + "step": 4744 + }, + { + "epoch": 0.028219859168331907, + "grad_norm": 2.0365710258483887, + "learning_rate": 4.990185865248662e-05, + "loss": 6.3228, + "step": 4745 + }, + { + "epoch": 0.028225806451612902, + "grad_norm": 2.069211006164551, + "learning_rate": 4.9901817300296304e-05, + "loss": 5.9874, + "step": 4746 + }, + { + "epoch": 0.0282317537348939, + "grad_norm": 2.3339149951934814, + "learning_rate": 4.9901775939413026e-05, + "loss": 6.1526, + "step": 4747 + }, + { + "epoch": 0.028237701018174896, + "grad_norm": 2.0425326824188232, + "learning_rate": 4.99017345698368e-05, + "loss": 6.2157, + "step": 4748 + }, + { + "epoch": 0.028243648301455895, + "grad_norm": 2.1598799228668213, + "learning_rate": 4.9901693191567625e-05, + "loss": 6.2653, + "step": 4749 + }, + { + "epoch": 0.028249595584736893, + "grad_norm": 2.066566228866577, + "learning_rate": 4.990165180460553e-05, + "loss": 6.3788, + "step": 4750 + }, + { + "epoch": 0.02825554286801789, + "grad_norm": 2.2870383262634277, + "learning_rate": 4.9901610408950527e-05, + "loss": 6.2608, + "step": 4751 + }, + { + "epoch": 0.028261490151298887, + "grad_norm": 2.3180785179138184, + "learning_rate": 4.990156900460263e-05, + "loss": 6.3545, + "step": 4752 + }, + { + "epoch": 0.028267437434579885, + "grad_norm": 2.55261492729187, + "learning_rate": 4.990152759156185e-05, + "loss": 6.3888, + "step": 4753 + }, + { + "epoch": 0.02827338471786088, + "grad_norm": 2.087925910949707, + "learning_rate": 4.990148616982821e-05, + "loss": 6.3585, + "step": 4754 + }, + { + "epoch": 0.02827933200114188, + "grad_norm": 2.2446579933166504, + "learning_rate": 4.9901444739401714e-05, + "loss": 6.4655, + "step": 4755 + }, + { + "epoch": 0.028285279284422874, + "grad_norm": 2.2980077266693115, + "learning_rate": 4.990140330028238e-05, + "loss": 6.3776, + "step": 4756 + }, + { + "epoch": 0.028291226567703873, + "grad_norm": 2.0658226013183594, + "learning_rate": 4.9901361852470224e-05, + "loss": 6.0412, + "step": 4757 + }, + { + "epoch": 0.02829717385098487, + "grad_norm": 2.8402137756347656, + "learning_rate": 4.990132039596526e-05, + "loss": 6.0017, + "step": 4758 + }, + { + "epoch": 0.028303121134265866, + "grad_norm": 2.4620237350463867, + "learning_rate": 4.99012789307675e-05, + "loss": 5.9235, + "step": 4759 + }, + { + "epoch": 0.028309068417546865, + "grad_norm": 2.3318607807159424, + "learning_rate": 4.990123745687697e-05, + "loss": 6.2464, + "step": 4760 + }, + { + "epoch": 0.028315015700827863, + "grad_norm": 2.4998981952667236, + "learning_rate": 4.9901195974293666e-05, + "loss": 6.2731, + "step": 4761 + }, + { + "epoch": 0.02832096298410886, + "grad_norm": 2.4374287128448486, + "learning_rate": 4.9901154483017614e-05, + "loss": 6.362, + "step": 4762 + }, + { + "epoch": 0.028326910267389857, + "grad_norm": 2.6257424354553223, + "learning_rate": 4.990111298304882e-05, + "loss": 6.1456, + "step": 4763 + }, + { + "epoch": 0.028332857550670852, + "grad_norm": 2.74934458732605, + "learning_rate": 4.990107147438732e-05, + "loss": 6.0121, + "step": 4764 + }, + { + "epoch": 0.02833880483395185, + "grad_norm": 2.33137583732605, + "learning_rate": 4.9901029957033106e-05, + "loss": 6.0207, + "step": 4765 + }, + { + "epoch": 0.02834475211723285, + "grad_norm": 1.9006321430206299, + "learning_rate": 4.9900988430986196e-05, + "loss": 5.8946, + "step": 4766 + }, + { + "epoch": 0.028350699400513844, + "grad_norm": 1.9786534309387207, + "learning_rate": 4.990094689624661e-05, + "loss": 5.7782, + "step": 4767 + }, + { + "epoch": 0.028356646683794843, + "grad_norm": 2.1215951442718506, + "learning_rate": 4.9900905352814365e-05, + "loss": 5.8129, + "step": 4768 + }, + { + "epoch": 0.02836259396707584, + "grad_norm": 2.9569597244262695, + "learning_rate": 4.9900863800689465e-05, + "loss": 5.7882, + "step": 4769 + }, + { + "epoch": 0.028368541250356837, + "grad_norm": 2.720447540283203, + "learning_rate": 4.990082223987193e-05, + "loss": 5.9075, + "step": 4770 + }, + { + "epoch": 0.028374488533637835, + "grad_norm": 2.8727002143859863, + "learning_rate": 4.990078067036178e-05, + "loss": 6.1571, + "step": 4771 + }, + { + "epoch": 0.02838043581691883, + "grad_norm": 2.2992594242095947, + "learning_rate": 4.990073909215902e-05, + "loss": 6.0195, + "step": 4772 + }, + { + "epoch": 0.02838638310019983, + "grad_norm": 2.0323293209075928, + "learning_rate": 4.990069750526368e-05, + "loss": 5.8049, + "step": 4773 + }, + { + "epoch": 0.028392330383480827, + "grad_norm": 2.938795328140259, + "learning_rate": 4.9900655909675755e-05, + "loss": 6.9215, + "step": 4774 + }, + { + "epoch": 0.028398277666761822, + "grad_norm": 2.6333048343658447, + "learning_rate": 4.990061430539527e-05, + "loss": 5.868, + "step": 4775 + }, + { + "epoch": 0.02840422495004282, + "grad_norm": 2.8569674491882324, + "learning_rate": 4.990057269242223e-05, + "loss": 5.8782, + "step": 4776 + }, + { + "epoch": 0.028410172233323816, + "grad_norm": 2.62206768989563, + "learning_rate": 4.9900531070756666e-05, + "loss": 5.7751, + "step": 4777 + }, + { + "epoch": 0.028416119516604815, + "grad_norm": 2.2112414836883545, + "learning_rate": 4.990048944039858e-05, + "loss": 5.7985, + "step": 4778 + }, + { + "epoch": 0.028422066799885813, + "grad_norm": 2.1571342945098877, + "learning_rate": 4.990044780134799e-05, + "loss": 5.9089, + "step": 4779 + }, + { + "epoch": 0.028428014083166808, + "grad_norm": 2.4310410022735596, + "learning_rate": 4.9900406153604916e-05, + "loss": 5.6728, + "step": 4780 + }, + { + "epoch": 0.028433961366447807, + "grad_norm": 2.25822377204895, + "learning_rate": 4.990036449716937e-05, + "loss": 5.5808, + "step": 4781 + }, + { + "epoch": 0.028439908649728805, + "grad_norm": 2.3068299293518066, + "learning_rate": 4.990032283204136e-05, + "loss": 5.729, + "step": 4782 + }, + { + "epoch": 0.0284458559330098, + "grad_norm": 2.0582191944122314, + "learning_rate": 4.9900281158220905e-05, + "loss": 5.6877, + "step": 4783 + }, + { + "epoch": 0.0284518032162908, + "grad_norm": 2.572824239730835, + "learning_rate": 4.9900239475708015e-05, + "loss": 5.9522, + "step": 4784 + }, + { + "epoch": 0.028457750499571794, + "grad_norm": 2.299001693725586, + "learning_rate": 4.990019778450271e-05, + "loss": 5.7579, + "step": 4785 + }, + { + "epoch": 0.028463697782852793, + "grad_norm": 2.231381893157959, + "learning_rate": 4.990015608460501e-05, + "loss": 5.756, + "step": 4786 + }, + { + "epoch": 0.02846964506613379, + "grad_norm": 1.7982486486434937, + "learning_rate": 4.990011437601492e-05, + "loss": 5.8076, + "step": 4787 + }, + { + "epoch": 0.028475592349414786, + "grad_norm": 1.8788951635360718, + "learning_rate": 4.990007265873245e-05, + "loss": 5.8798, + "step": 4788 + }, + { + "epoch": 0.028481539632695785, + "grad_norm": 1.6190022230148315, + "learning_rate": 4.9900030932757623e-05, + "loss": 5.5695, + "step": 4789 + }, + { + "epoch": 0.028487486915976783, + "grad_norm": 1.9226019382476807, + "learning_rate": 4.9899989198090455e-05, + "loss": 5.671, + "step": 4790 + }, + { + "epoch": 0.02849343419925778, + "grad_norm": 1.7437139749526978, + "learning_rate": 4.989994745473097e-05, + "loss": 5.6728, + "step": 4791 + }, + { + "epoch": 0.028499381482538777, + "grad_norm": 1.624126672744751, + "learning_rate": 4.989990570267915e-05, + "loss": 5.6209, + "step": 4792 + }, + { + "epoch": 0.028505328765819772, + "grad_norm": 2.1894004344940186, + "learning_rate": 4.9899863941935046e-05, + "loss": 5.6669, + "step": 4793 + }, + { + "epoch": 0.02851127604910077, + "grad_norm": 2.2243428230285645, + "learning_rate": 4.9899822172498646e-05, + "loss": 5.4557, + "step": 4794 + }, + { + "epoch": 0.02851722333238177, + "grad_norm": 2.032611608505249, + "learning_rate": 4.989978039436998e-05, + "loss": 5.7883, + "step": 4795 + }, + { + "epoch": 0.028523170615662764, + "grad_norm": 1.8496538400650024, + "learning_rate": 4.989973860754906e-05, + "loss": 5.6329, + "step": 4796 + }, + { + "epoch": 0.028529117898943763, + "grad_norm": 1.7072707414627075, + "learning_rate": 4.989969681203589e-05, + "loss": 5.7242, + "step": 4797 + }, + { + "epoch": 0.02853506518222476, + "grad_norm": 1.7351912260055542, + "learning_rate": 4.9899655007830504e-05, + "loss": 5.648, + "step": 4798 + }, + { + "epoch": 0.028541012465505756, + "grad_norm": 2.514162302017212, + "learning_rate": 4.9899613194932904e-05, + "loss": 5.556, + "step": 4799 + }, + { + "epoch": 0.028546959748786755, + "grad_norm": 10.245063781738281, + "learning_rate": 4.98995713733431e-05, + "loss": 5.5922, + "step": 4800 + }, + { + "epoch": 0.02855290703206775, + "grad_norm": 2.012106418609619, + "learning_rate": 4.989952954306112e-05, + "loss": 5.5092, + "step": 4801 + }, + { + "epoch": 0.02855885431534875, + "grad_norm": 1.8654139041900635, + "learning_rate": 4.9899487704086966e-05, + "loss": 5.4164, + "step": 4802 + }, + { + "epoch": 0.028564801598629747, + "grad_norm": 1.778798222541809, + "learning_rate": 4.9899445856420656e-05, + "loss": 5.5537, + "step": 4803 + }, + { + "epoch": 0.028570748881910742, + "grad_norm": 2.205038547515869, + "learning_rate": 4.989940400006221e-05, + "loss": 5.9338, + "step": 4804 + }, + { + "epoch": 0.02857669616519174, + "grad_norm": 2.3908839225769043, + "learning_rate": 4.989936213501164e-05, + "loss": 5.8962, + "step": 4805 + }, + { + "epoch": 0.028582643448472736, + "grad_norm": 2.3438172340393066, + "learning_rate": 4.9899320261268966e-05, + "loss": 5.8133, + "step": 4806 + }, + { + "epoch": 0.028588590731753735, + "grad_norm": 2.4021737575531006, + "learning_rate": 4.989927837883419e-05, + "loss": 5.8366, + "step": 4807 + }, + { + "epoch": 0.028594538015034733, + "grad_norm": 1.9976004362106323, + "learning_rate": 4.989923648770734e-05, + "loss": 5.6976, + "step": 4808 + }, + { + "epoch": 0.028600485298315728, + "grad_norm": 2.2234697341918945, + "learning_rate": 4.989919458788841e-05, + "loss": 5.7871, + "step": 4809 + }, + { + "epoch": 0.028606432581596727, + "grad_norm": 2.203223705291748, + "learning_rate": 4.989915267937744e-05, + "loss": 5.5799, + "step": 4810 + }, + { + "epoch": 0.028612379864877725, + "grad_norm": 2.2155261039733887, + "learning_rate": 4.989911076217442e-05, + "loss": 5.6022, + "step": 4811 + }, + { + "epoch": 0.02861832714815872, + "grad_norm": 1.9379621744155884, + "learning_rate": 4.989906883627939e-05, + "loss": 5.8647, + "step": 4812 + }, + { + "epoch": 0.02862427443143972, + "grad_norm": 2.0589749813079834, + "learning_rate": 4.9899026901692345e-05, + "loss": 5.6048, + "step": 4813 + }, + { + "epoch": 0.028630221714720714, + "grad_norm": 2.3813774585723877, + "learning_rate": 4.9898984958413315e-05, + "loss": 5.6726, + "step": 4814 + }, + { + "epoch": 0.028636168998001713, + "grad_norm": 2.06425142288208, + "learning_rate": 4.98989430064423e-05, + "loss": 5.8505, + "step": 4815 + }, + { + "epoch": 0.02864211628128271, + "grad_norm": 2.199697494506836, + "learning_rate": 4.9898901045779326e-05, + "loss": 5.6114, + "step": 4816 + }, + { + "epoch": 0.028648063564563706, + "grad_norm": 2.136411428451538, + "learning_rate": 4.98988590764244e-05, + "loss": 5.3987, + "step": 4817 + }, + { + "epoch": 0.028654010847844705, + "grad_norm": 1.914929986000061, + "learning_rate": 4.9898817098377534e-05, + "loss": 5.702, + "step": 4818 + }, + { + "epoch": 0.028659958131125703, + "grad_norm": 2.316027879714966, + "learning_rate": 4.989877511163876e-05, + "loss": 5.5886, + "step": 4819 + }, + { + "epoch": 0.0286659054144067, + "grad_norm": 3.2775018215179443, + "learning_rate": 4.9898733116208076e-05, + "loss": 5.5337, + "step": 4820 + }, + { + "epoch": 0.028671852697687697, + "grad_norm": 2.16430926322937, + "learning_rate": 4.989869111208549e-05, + "loss": 5.7189, + "step": 4821 + }, + { + "epoch": 0.028677799980968692, + "grad_norm": 2.1936638355255127, + "learning_rate": 4.9898649099271046e-05, + "loss": 5.2942, + "step": 4822 + }, + { + "epoch": 0.02868374726424969, + "grad_norm": 2.262485980987549, + "learning_rate": 4.9898607077764736e-05, + "loss": 5.4284, + "step": 4823 + }, + { + "epoch": 0.02868969454753069, + "grad_norm": 1.7890170812606812, + "learning_rate": 4.989856504756657e-05, + "loss": 5.6021, + "step": 4824 + }, + { + "epoch": 0.028695641830811684, + "grad_norm": 1.747862696647644, + "learning_rate": 4.9898523008676585e-05, + "loss": 5.72, + "step": 4825 + }, + { + "epoch": 0.028701589114092683, + "grad_norm": 1.9750064611434937, + "learning_rate": 4.989848096109477e-05, + "loss": 5.8923, + "step": 4826 + }, + { + "epoch": 0.02870753639737368, + "grad_norm": 2.0249626636505127, + "learning_rate": 4.989843890482117e-05, + "loss": 5.4866, + "step": 4827 + }, + { + "epoch": 0.028713483680654676, + "grad_norm": 2.2737395763397217, + "learning_rate": 4.9898396839855765e-05, + "loss": 5.5498, + "step": 4828 + }, + { + "epoch": 0.028719430963935675, + "grad_norm": 2.2852187156677246, + "learning_rate": 4.98983547661986e-05, + "loss": 5.672, + "step": 4829 + }, + { + "epoch": 0.02872537824721667, + "grad_norm": 1.9441994428634644, + "learning_rate": 4.989831268384967e-05, + "loss": 5.4933, + "step": 4830 + }, + { + "epoch": 0.02873132553049767, + "grad_norm": 1.9561070203781128, + "learning_rate": 4.989827059280899e-05, + "loss": 5.7465, + "step": 4831 + }, + { + "epoch": 0.028737272813778667, + "grad_norm": 2.482849597930908, + "learning_rate": 4.9898228493076594e-05, + "loss": 5.4338, + "step": 4832 + }, + { + "epoch": 0.028743220097059662, + "grad_norm": 1.8582524061203003, + "learning_rate": 4.989818638465247e-05, + "loss": 5.5378, + "step": 4833 + }, + { + "epoch": 0.02874916738034066, + "grad_norm": 2.119783639907837, + "learning_rate": 4.9898144267536654e-05, + "loss": 5.6012, + "step": 4834 + }, + { + "epoch": 0.028755114663621656, + "grad_norm": 2.333965301513672, + "learning_rate": 4.989810214172915e-05, + "loss": 5.7376, + "step": 4835 + }, + { + "epoch": 0.028761061946902654, + "grad_norm": 2.600861072540283, + "learning_rate": 4.989806000722999e-05, + "loss": 6.2747, + "step": 4836 + }, + { + "epoch": 0.028767009230183653, + "grad_norm": 2.3250534534454346, + "learning_rate": 4.989801786403916e-05, + "loss": 5.5993, + "step": 4837 + }, + { + "epoch": 0.028772956513464648, + "grad_norm": 2.507377862930298, + "learning_rate": 4.9897975712156686e-05, + "loss": 5.3919, + "step": 4838 + }, + { + "epoch": 0.028778903796745647, + "grad_norm": 1.9882018566131592, + "learning_rate": 4.9897933551582596e-05, + "loss": 5.5939, + "step": 4839 + }, + { + "epoch": 0.028784851080026645, + "grad_norm": 2.235269784927368, + "learning_rate": 4.989789138231688e-05, + "loss": 5.4036, + "step": 4840 + }, + { + "epoch": 0.02879079836330764, + "grad_norm": 1.895071029663086, + "learning_rate": 4.989784920435959e-05, + "loss": 5.7259, + "step": 4841 + }, + { + "epoch": 0.02879674564658864, + "grad_norm": 2.0197908878326416, + "learning_rate": 4.989780701771071e-05, + "loss": 5.5114, + "step": 4842 + }, + { + "epoch": 0.028802692929869634, + "grad_norm": 1.9679557085037231, + "learning_rate": 4.989776482237025e-05, + "loss": 5.5798, + "step": 4843 + }, + { + "epoch": 0.028808640213150633, + "grad_norm": 1.980610728263855, + "learning_rate": 4.989772261833825e-05, + "loss": 5.5509, + "step": 4844 + }, + { + "epoch": 0.02881458749643163, + "grad_norm": 2.4565272331237793, + "learning_rate": 4.989768040561471e-05, + "loss": 5.4723, + "step": 4845 + }, + { + "epoch": 0.028820534779712626, + "grad_norm": 2.0567848682403564, + "learning_rate": 4.989763818419964e-05, + "loss": 5.546, + "step": 4846 + }, + { + "epoch": 0.028826482062993625, + "grad_norm": 2.0259108543395996, + "learning_rate": 4.989759595409307e-05, + "loss": 5.4138, + "step": 4847 + }, + { + "epoch": 0.028832429346274623, + "grad_norm": 1.9334442615509033, + "learning_rate": 4.9897553715295003e-05, + "loss": 5.7036, + "step": 4848 + }, + { + "epoch": 0.02883837662955562, + "grad_norm": 1.8335916996002197, + "learning_rate": 4.989751146780546e-05, + "loss": 5.6399, + "step": 4849 + }, + { + "epoch": 0.028844323912836617, + "grad_norm": 2.129821538925171, + "learning_rate": 4.989746921162445e-05, + "loss": 5.7108, + "step": 4850 + }, + { + "epoch": 0.028850271196117612, + "grad_norm": 2.4127001762390137, + "learning_rate": 4.9897426946751994e-05, + "loss": 5.3901, + "step": 4851 + }, + { + "epoch": 0.02885621847939861, + "grad_norm": 1.9506126642227173, + "learning_rate": 4.98973846731881e-05, + "loss": 5.7781, + "step": 4852 + }, + { + "epoch": 0.02886216576267961, + "grad_norm": 1.6746875047683716, + "learning_rate": 4.9897342390932786e-05, + "loss": 5.7408, + "step": 4853 + }, + { + "epoch": 0.028868113045960604, + "grad_norm": 1.95681893825531, + "learning_rate": 4.989730009998607e-05, + "loss": 5.7181, + "step": 4854 + }, + { + "epoch": 0.028874060329241603, + "grad_norm": 1.782030701637268, + "learning_rate": 4.9897257800347964e-05, + "loss": 5.5901, + "step": 4855 + }, + { + "epoch": 0.0288800076125226, + "grad_norm": 1.7590057849884033, + "learning_rate": 4.9897215492018476e-05, + "loss": 5.4566, + "step": 4856 + }, + { + "epoch": 0.028885954895803596, + "grad_norm": 2.4675025939941406, + "learning_rate": 4.989717317499764e-05, + "loss": 5.7738, + "step": 4857 + }, + { + "epoch": 0.028891902179084595, + "grad_norm": 2.221975326538086, + "learning_rate": 4.989713084928545e-05, + "loss": 5.591, + "step": 4858 + }, + { + "epoch": 0.02889784946236559, + "grad_norm": 2.21158504486084, + "learning_rate": 4.989708851488192e-05, + "loss": 5.7755, + "step": 4859 + }, + { + "epoch": 0.02890379674564659, + "grad_norm": 2.2253987789154053, + "learning_rate": 4.989704617178709e-05, + "loss": 5.8653, + "step": 4860 + }, + { + "epoch": 0.028909744028927587, + "grad_norm": 2.3298027515411377, + "learning_rate": 4.989700382000094e-05, + "loss": 5.3371, + "step": 4861 + }, + { + "epoch": 0.028915691312208582, + "grad_norm": 2.1918935775756836, + "learning_rate": 4.989696145952352e-05, + "loss": 5.4893, + "step": 4862 + }, + { + "epoch": 0.02892163859548958, + "grad_norm": 2.422117233276367, + "learning_rate": 4.989691909035482e-05, + "loss": 5.8775, + "step": 4863 + }, + { + "epoch": 0.02892758587877058, + "grad_norm": 2.4346981048583984, + "learning_rate": 4.989687671249487e-05, + "loss": 6.3671, + "step": 4864 + }, + { + "epoch": 0.028933533162051574, + "grad_norm": 2.094780921936035, + "learning_rate": 4.989683432594367e-05, + "loss": 5.7814, + "step": 4865 + }, + { + "epoch": 0.028939480445332573, + "grad_norm": 2.240318775177002, + "learning_rate": 4.9896791930701244e-05, + "loss": 5.6606, + "step": 4866 + }, + { + "epoch": 0.028945427728613568, + "grad_norm": 2.102381706237793, + "learning_rate": 4.989674952676761e-05, + "loss": 5.8477, + "step": 4867 + }, + { + "epoch": 0.028951375011894567, + "grad_norm": 2.2786238193511963, + "learning_rate": 4.989670711414277e-05, + "loss": 5.8786, + "step": 4868 + }, + { + "epoch": 0.028957322295175565, + "grad_norm": 2.079899549484253, + "learning_rate": 4.989666469282675e-05, + "loss": 6.2171, + "step": 4869 + }, + { + "epoch": 0.02896326957845656, + "grad_norm": 2.024061679840088, + "learning_rate": 4.989662226281956e-05, + "loss": 6.2889, + "step": 4870 + }, + { + "epoch": 0.02896921686173756, + "grad_norm": 2.1397578716278076, + "learning_rate": 4.989657982412122e-05, + "loss": 6.2477, + "step": 4871 + }, + { + "epoch": 0.028975164145018554, + "grad_norm": 2.1303393840789795, + "learning_rate": 4.989653737673174e-05, + "loss": 6.3005, + "step": 4872 + }, + { + "epoch": 0.028981111428299552, + "grad_norm": 2.4091451168060303, + "learning_rate": 4.989649492065114e-05, + "loss": 5.997, + "step": 4873 + }, + { + "epoch": 0.02898705871158055, + "grad_norm": 2.2236886024475098, + "learning_rate": 4.989645245587942e-05, + "loss": 5.7886, + "step": 4874 + }, + { + "epoch": 0.028993005994861546, + "grad_norm": 2.6160736083984375, + "learning_rate": 4.989640998241661e-05, + "loss": 6.1542, + "step": 4875 + }, + { + "epoch": 0.028998953278142545, + "grad_norm": 2.4163296222686768, + "learning_rate": 4.989636750026273e-05, + "loss": 6.392, + "step": 4876 + }, + { + "epoch": 0.029004900561423543, + "grad_norm": 2.079172372817993, + "learning_rate": 4.989632500941778e-05, + "loss": 6.2886, + "step": 4877 + }, + { + "epoch": 0.02901084784470454, + "grad_norm": 2.628694772720337, + "learning_rate": 4.989628250988178e-05, + "loss": 6.0359, + "step": 4878 + }, + { + "epoch": 0.029016795127985537, + "grad_norm": 2.2080392837524414, + "learning_rate": 4.989624000165474e-05, + "loss": 5.9916, + "step": 4879 + }, + { + "epoch": 0.029022742411266532, + "grad_norm": 2.4130380153656006, + "learning_rate": 4.9896197484736685e-05, + "loss": 6.3835, + "step": 4880 + }, + { + "epoch": 0.02902868969454753, + "grad_norm": 2.328511953353882, + "learning_rate": 4.989615495912762e-05, + "loss": 5.838, + "step": 4881 + }, + { + "epoch": 0.02903463697782853, + "grad_norm": 2.273345470428467, + "learning_rate": 4.989611242482757e-05, + "loss": 5.8764, + "step": 4882 + }, + { + "epoch": 0.029040584261109524, + "grad_norm": 2.1498537063598633, + "learning_rate": 4.9896069881836535e-05, + "loss": 6.1562, + "step": 4883 + }, + { + "epoch": 0.029046531544390523, + "grad_norm": 2.497267723083496, + "learning_rate": 4.989602733015455e-05, + "loss": 5.6708, + "step": 4884 + }, + { + "epoch": 0.02905247882767152, + "grad_norm": 2.232802152633667, + "learning_rate": 4.989598476978161e-05, + "loss": 5.6854, + "step": 4885 + }, + { + "epoch": 0.029058426110952516, + "grad_norm": 2.0582375526428223, + "learning_rate": 4.989594220071775e-05, + "loss": 6.5288, + "step": 4886 + }, + { + "epoch": 0.029064373394233515, + "grad_norm": 3.2556731700897217, + "learning_rate": 4.989589962296296e-05, + "loss": 5.9985, + "step": 4887 + }, + { + "epoch": 0.02907032067751451, + "grad_norm": 2.2807655334472656, + "learning_rate": 4.989585703651728e-05, + "loss": 6.1802, + "step": 4888 + }, + { + "epoch": 0.02907626796079551, + "grad_norm": 2.379136085510254, + "learning_rate": 4.989581444138071e-05, + "loss": 6.3531, + "step": 4889 + }, + { + "epoch": 0.029082215244076507, + "grad_norm": 2.9518685340881348, + "learning_rate": 4.989577183755327e-05, + "loss": 6.0689, + "step": 4890 + }, + { + "epoch": 0.029088162527357502, + "grad_norm": 2.823340654373169, + "learning_rate": 4.9895729225034973e-05, + "loss": 6.3405, + "step": 4891 + }, + { + "epoch": 0.0290941098106385, + "grad_norm": 2.4327731132507324, + "learning_rate": 4.989568660382583e-05, + "loss": 6.4928, + "step": 4892 + }, + { + "epoch": 0.0291000570939195, + "grad_norm": 2.0744240283966064, + "learning_rate": 4.9895643973925864e-05, + "loss": 6.2664, + "step": 4893 + }, + { + "epoch": 0.029106004377200494, + "grad_norm": 2.373710870742798, + "learning_rate": 4.9895601335335085e-05, + "loss": 5.9738, + "step": 4894 + }, + { + "epoch": 0.029111951660481493, + "grad_norm": 2.2934412956237793, + "learning_rate": 4.9895558688053505e-05, + "loss": 6.1353, + "step": 4895 + }, + { + "epoch": 0.029117898943762488, + "grad_norm": 2.4360926151275635, + "learning_rate": 4.989551603208114e-05, + "loss": 5.4768, + "step": 4896 + }, + { + "epoch": 0.029123846227043487, + "grad_norm": 2.8072469234466553, + "learning_rate": 4.989547336741802e-05, + "loss": 5.977, + "step": 4897 + }, + { + "epoch": 0.029129793510324485, + "grad_norm": 2.7759921550750732, + "learning_rate": 4.9895430694064135e-05, + "loss": 6.3918, + "step": 4898 + }, + { + "epoch": 0.02913574079360548, + "grad_norm": 2.4547574520111084, + "learning_rate": 4.989538801201953e-05, + "loss": 6.0461, + "step": 4899 + }, + { + "epoch": 0.02914168807688648, + "grad_norm": 2.6097168922424316, + "learning_rate": 4.9895345321284184e-05, + "loss": 5.88, + "step": 4900 + }, + { + "epoch": 0.029147635360167474, + "grad_norm": 2.8312575817108154, + "learning_rate": 4.989530262185814e-05, + "loss": 6.0314, + "step": 4901 + }, + { + "epoch": 0.029153582643448472, + "grad_norm": 2.928974151611328, + "learning_rate": 4.98952599137414e-05, + "loss": 6.3698, + "step": 4902 + }, + { + "epoch": 0.02915952992672947, + "grad_norm": 2.527578115463257, + "learning_rate": 4.989521719693398e-05, + "loss": 6.4301, + "step": 4903 + }, + { + "epoch": 0.029165477210010466, + "grad_norm": 2.392106771469116, + "learning_rate": 4.9895174471435904e-05, + "loss": 6.3515, + "step": 4904 + }, + { + "epoch": 0.029171424493291465, + "grad_norm": 1.9899437427520752, + "learning_rate": 4.989513173724717e-05, + "loss": 6.3265, + "step": 4905 + }, + { + "epoch": 0.029177371776572463, + "grad_norm": 2.057600736618042, + "learning_rate": 4.9895088994367806e-05, + "loss": 6.2402, + "step": 4906 + }, + { + "epoch": 0.029183319059853458, + "grad_norm": 2.8310391902923584, + "learning_rate": 4.989504624279783e-05, + "loss": 5.9056, + "step": 4907 + }, + { + "epoch": 0.029189266343134457, + "grad_norm": 2.904785394668579, + "learning_rate": 4.989500348253724e-05, + "loss": 5.8847, + "step": 4908 + }, + { + "epoch": 0.029195213626415452, + "grad_norm": 2.7728030681610107, + "learning_rate": 4.989496071358607e-05, + "loss": 5.8997, + "step": 4909 + }, + { + "epoch": 0.02920116090969645, + "grad_norm": 2.768862009048462, + "learning_rate": 4.989491793594432e-05, + "loss": 6.1267, + "step": 4910 + }, + { + "epoch": 0.02920710819297745, + "grad_norm": 2.4353668689727783, + "learning_rate": 4.989487514961201e-05, + "loss": 5.9087, + "step": 4911 + }, + { + "epoch": 0.029213055476258444, + "grad_norm": 2.5170469284057617, + "learning_rate": 4.9894832354589164e-05, + "loss": 6.0971, + "step": 4912 + }, + { + "epoch": 0.029219002759539443, + "grad_norm": 2.345998764038086, + "learning_rate": 4.9894789550875784e-05, + "loss": 6.2518, + "step": 4913 + }, + { + "epoch": 0.02922495004282044, + "grad_norm": 2.429123878479004, + "learning_rate": 4.98947467384719e-05, + "loss": 6.238, + "step": 4914 + }, + { + "epoch": 0.029230897326101436, + "grad_norm": 2.531514883041382, + "learning_rate": 4.9894703917377506e-05, + "loss": 6.0177, + "step": 4915 + }, + { + "epoch": 0.029236844609382435, + "grad_norm": 2.833874464035034, + "learning_rate": 4.9894661087592634e-05, + "loss": 6.2018, + "step": 4916 + }, + { + "epoch": 0.02924279189266343, + "grad_norm": 2.521381378173828, + "learning_rate": 4.9894618249117287e-05, + "loss": 6.1777, + "step": 4917 + }, + { + "epoch": 0.02924873917594443, + "grad_norm": 2.731703758239746, + "learning_rate": 4.989457540195149e-05, + "loss": 6.0237, + "step": 4918 + }, + { + "epoch": 0.029254686459225427, + "grad_norm": 2.918398141860962, + "learning_rate": 4.989453254609525e-05, + "loss": 6.5688, + "step": 4919 + }, + { + "epoch": 0.029260633742506422, + "grad_norm": 2.407552480697632, + "learning_rate": 4.989448968154859e-05, + "loss": 5.9751, + "step": 4920 + }, + { + "epoch": 0.02926658102578742, + "grad_norm": 2.575258731842041, + "learning_rate": 4.989444680831152e-05, + "loss": 5.7587, + "step": 4921 + }, + { + "epoch": 0.02927252830906842, + "grad_norm": 2.6550750732421875, + "learning_rate": 4.989440392638406e-05, + "loss": 6.6404, + "step": 4922 + }, + { + "epoch": 0.029278475592349414, + "grad_norm": 2.569438934326172, + "learning_rate": 4.989436103576621e-05, + "loss": 5.8615, + "step": 4923 + }, + { + "epoch": 0.029284422875630413, + "grad_norm": 2.4601991176605225, + "learning_rate": 4.989431813645801e-05, + "loss": 5.8969, + "step": 4924 + }, + { + "epoch": 0.029290370158911408, + "grad_norm": 3.579819917678833, + "learning_rate": 4.989427522845945e-05, + "loss": 5.8832, + "step": 4925 + }, + { + "epoch": 0.029296317442192406, + "grad_norm": 2.5762264728546143, + "learning_rate": 4.9894232311770556e-05, + "loss": 5.4841, + "step": 4926 + }, + { + "epoch": 0.029302264725473405, + "grad_norm": 3.352381706237793, + "learning_rate": 4.989418938639134e-05, + "loss": 5.8936, + "step": 4927 + }, + { + "epoch": 0.0293082120087544, + "grad_norm": 2.824322462081909, + "learning_rate": 4.9894146452321835e-05, + "loss": 5.8291, + "step": 4928 + }, + { + "epoch": 0.0293141592920354, + "grad_norm": 2.6431384086608887, + "learning_rate": 4.9894103509562026e-05, + "loss": 6.2519, + "step": 4929 + }, + { + "epoch": 0.029320106575316394, + "grad_norm": 3.0580949783325195, + "learning_rate": 4.989406055811195e-05, + "loss": 6.4141, + "step": 4930 + }, + { + "epoch": 0.029326053858597392, + "grad_norm": 2.757420778274536, + "learning_rate": 4.989401759797161e-05, + "loss": 6.1427, + "step": 4931 + }, + { + "epoch": 0.02933200114187839, + "grad_norm": 2.713111639022827, + "learning_rate": 4.989397462914103e-05, + "loss": 6.4107, + "step": 4932 + }, + { + "epoch": 0.029337948425159386, + "grad_norm": 2.7954351902008057, + "learning_rate": 4.9893931651620215e-05, + "loss": 5.7657, + "step": 4933 + }, + { + "epoch": 0.029343895708440385, + "grad_norm": 2.3637917041778564, + "learning_rate": 4.9893888665409196e-05, + "loss": 5.8209, + "step": 4934 + }, + { + "epoch": 0.029349842991721383, + "grad_norm": 2.938631296157837, + "learning_rate": 4.9893845670507964e-05, + "loss": 6.0502, + "step": 4935 + }, + { + "epoch": 0.029355790275002378, + "grad_norm": 2.8911824226379395, + "learning_rate": 4.989380266691655e-05, + "loss": 5.9736, + "step": 4936 + }, + { + "epoch": 0.029361737558283377, + "grad_norm": 2.9410245418548584, + "learning_rate": 4.989375965463498e-05, + "loss": 5.2824, + "step": 4937 + }, + { + "epoch": 0.029367684841564372, + "grad_norm": 2.4925217628479004, + "learning_rate": 4.9893716633663244e-05, + "loss": 5.5829, + "step": 4938 + }, + { + "epoch": 0.02937363212484537, + "grad_norm": 2.485349178314209, + "learning_rate": 4.9893673604001366e-05, + "loss": 5.8812, + "step": 4939 + }, + { + "epoch": 0.02937957940812637, + "grad_norm": 2.3950133323669434, + "learning_rate": 4.9893630565649376e-05, + "loss": 5.9314, + "step": 4940 + }, + { + "epoch": 0.029385526691407364, + "grad_norm": 2.28104829788208, + "learning_rate": 4.989358751860726e-05, + "loss": 6.1768, + "step": 4941 + }, + { + "epoch": 0.029391473974688363, + "grad_norm": 2.4479010105133057, + "learning_rate": 4.989354446287507e-05, + "loss": 6.1645, + "step": 4942 + }, + { + "epoch": 0.02939742125796936, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.989350139845279e-05, + "loss": 5.7145, + "step": 4943 + }, + { + "epoch": 0.029403368541250356, + "grad_norm": 2.4120032787323, + "learning_rate": 4.989345832534045e-05, + "loss": 5.695, + "step": 4944 + }, + { + "epoch": 0.029409315824531355, + "grad_norm": 2.6345109939575195, + "learning_rate": 4.989341524353805e-05, + "loss": 5.4805, + "step": 4945 + }, + { + "epoch": 0.02941526310781235, + "grad_norm": 2.8750240802764893, + "learning_rate": 4.989337215304563e-05, + "loss": 5.0352, + "step": 4946 + }, + { + "epoch": 0.02942121039109335, + "grad_norm": 2.7220489978790283, + "learning_rate": 4.989332905386318e-05, + "loss": 5.1646, + "step": 4947 + }, + { + "epoch": 0.029427157674374347, + "grad_norm": 2.464871883392334, + "learning_rate": 4.9893285945990734e-05, + "loss": 4.9989, + "step": 4948 + }, + { + "epoch": 0.029433104957655342, + "grad_norm": 2.261049270629883, + "learning_rate": 4.989324282942829e-05, + "loss": 6.2217, + "step": 4949 + }, + { + "epoch": 0.02943905224093634, + "grad_norm": 2.224818468093872, + "learning_rate": 4.9893199704175876e-05, + "loss": 6.3964, + "step": 4950 + }, + { + "epoch": 0.02944499952421734, + "grad_norm": 2.366520643234253, + "learning_rate": 4.989315657023351e-05, + "loss": 6.3572, + "step": 4951 + }, + { + "epoch": 0.029450946807498334, + "grad_norm": 2.4811010360717773, + "learning_rate": 4.989311342760119e-05, + "loss": 5.7867, + "step": 4952 + }, + { + "epoch": 0.029456894090779333, + "grad_norm": 2.246730089187622, + "learning_rate": 4.989307027627895e-05, + "loss": 6.0865, + "step": 4953 + }, + { + "epoch": 0.029462841374060328, + "grad_norm": 2.297379493713379, + "learning_rate": 4.989302711626679e-05, + "loss": 5.9257, + "step": 4954 + }, + { + "epoch": 0.029468788657341326, + "grad_norm": 2.5890488624572754, + "learning_rate": 4.989298394756473e-05, + "loss": 5.7631, + "step": 4955 + }, + { + "epoch": 0.029474735940622325, + "grad_norm": 3.3777449131011963, + "learning_rate": 4.989294077017279e-05, + "loss": 5.4014, + "step": 4956 + }, + { + "epoch": 0.02948068322390332, + "grad_norm": 2.0395402908325195, + "learning_rate": 4.9892897584090986e-05, + "loss": 6.2429, + "step": 4957 + }, + { + "epoch": 0.02948663050718432, + "grad_norm": 2.0414693355560303, + "learning_rate": 4.989285438931932e-05, + "loss": 6.4685, + "step": 4958 + }, + { + "epoch": 0.029492577790465314, + "grad_norm": 2.2383265495300293, + "learning_rate": 4.989281118585783e-05, + "loss": 6.1651, + "step": 4959 + }, + { + "epoch": 0.029498525073746312, + "grad_norm": 2.559720754623413, + "learning_rate": 4.98927679737065e-05, + "loss": 6.3822, + "step": 4960 + }, + { + "epoch": 0.02950447235702731, + "grad_norm": 2.810699939727783, + "learning_rate": 4.989272475286537e-05, + "loss": 6.2076, + "step": 4961 + }, + { + "epoch": 0.029510419640308306, + "grad_norm": 2.9151525497436523, + "learning_rate": 4.989268152333445e-05, + "loss": 5.9892, + "step": 4962 + }, + { + "epoch": 0.029516366923589304, + "grad_norm": 2.295197010040283, + "learning_rate": 4.9892638285113744e-05, + "loss": 6.1392, + "step": 4963 + }, + { + "epoch": 0.029522314206870303, + "grad_norm": 2.271088123321533, + "learning_rate": 4.989259503820328e-05, + "loss": 6.6991, + "step": 4964 + }, + { + "epoch": 0.029528261490151298, + "grad_norm": 2.338074207305908, + "learning_rate": 4.9892551782603064e-05, + "loss": 5.9615, + "step": 4965 + }, + { + "epoch": 0.029534208773432297, + "grad_norm": 2.3510494232177734, + "learning_rate": 4.989250851831312e-05, + "loss": 5.8894, + "step": 4966 + }, + { + "epoch": 0.029540156056713292, + "grad_norm": 2.1170454025268555, + "learning_rate": 4.989246524533345e-05, + "loss": 5.6921, + "step": 4967 + }, + { + "epoch": 0.02954610333999429, + "grad_norm": 3.289508104324341, + "learning_rate": 4.989242196366409e-05, + "loss": 6.1689, + "step": 4968 + }, + { + "epoch": 0.02955205062327529, + "grad_norm": 2.068229913711548, + "learning_rate": 4.989237867330504e-05, + "loss": 6.3342, + "step": 4969 + }, + { + "epoch": 0.029557997906556284, + "grad_norm": 2.198928117752075, + "learning_rate": 4.9892335374256316e-05, + "loss": 6.5125, + "step": 4970 + }, + { + "epoch": 0.029563945189837283, + "grad_norm": 2.3634228706359863, + "learning_rate": 4.989229206651793e-05, + "loss": 5.8328, + "step": 4971 + }, + { + "epoch": 0.02956989247311828, + "grad_norm": 2.1632115840911865, + "learning_rate": 4.989224875008991e-05, + "loss": 6.0702, + "step": 4972 + }, + { + "epoch": 0.029575839756399276, + "grad_norm": 2.461888313293457, + "learning_rate": 4.989220542497226e-05, + "loss": 6.01, + "step": 4973 + }, + { + "epoch": 0.029581787039680275, + "grad_norm": 2.668333053588867, + "learning_rate": 4.9892162091164997e-05, + "loss": 6.0369, + "step": 4974 + }, + { + "epoch": 0.02958773432296127, + "grad_norm": 3.0210723876953125, + "learning_rate": 4.9892118748668135e-05, + "loss": 6.0652, + "step": 4975 + }, + { + "epoch": 0.02959368160624227, + "grad_norm": 2.937350034713745, + "learning_rate": 4.98920753974817e-05, + "loss": 6.0205, + "step": 4976 + }, + { + "epoch": 0.029599628889523267, + "grad_norm": 2.904499053955078, + "learning_rate": 4.9892032037605685e-05, + "loss": 5.9561, + "step": 4977 + }, + { + "epoch": 0.029605576172804262, + "grad_norm": 2.218867778778076, + "learning_rate": 4.989198866904013e-05, + "loss": 5.4173, + "step": 4978 + }, + { + "epoch": 0.02961152345608526, + "grad_norm": 3.009920835494995, + "learning_rate": 4.9891945291785034e-05, + "loss": 5.5577, + "step": 4979 + }, + { + "epoch": 0.02961747073936626, + "grad_norm": 2.731687545776367, + "learning_rate": 4.9891901905840424e-05, + "loss": 5.6591, + "step": 4980 + }, + { + "epoch": 0.029623418022647254, + "grad_norm": 2.244101047515869, + "learning_rate": 4.98918585112063e-05, + "loss": 6.1434, + "step": 4981 + }, + { + "epoch": 0.029629365305928253, + "grad_norm": 2.3366870880126953, + "learning_rate": 4.989181510788269e-05, + "loss": 6.0132, + "step": 4982 + }, + { + "epoch": 0.029635312589209248, + "grad_norm": 3.2757890224456787, + "learning_rate": 4.98917716958696e-05, + "loss": 5.7486, + "step": 4983 + }, + { + "epoch": 0.029641259872490246, + "grad_norm": 2.361041784286499, + "learning_rate": 4.989172827516705e-05, + "loss": 5.8192, + "step": 4984 + }, + { + "epoch": 0.029647207155771245, + "grad_norm": 3.3433775901794434, + "learning_rate": 4.9891684845775054e-05, + "loss": 5.8688, + "step": 4985 + }, + { + "epoch": 0.02965315443905224, + "grad_norm": 2.6427462100982666, + "learning_rate": 4.9891641407693635e-05, + "loss": 5.9459, + "step": 4986 + }, + { + "epoch": 0.02965910172233324, + "grad_norm": 3.0931055545806885, + "learning_rate": 4.9891597960922795e-05, + "loss": 6.4822, + "step": 4987 + }, + { + "epoch": 0.029665049005614237, + "grad_norm": 2.598477840423584, + "learning_rate": 4.989155450546256e-05, + "loss": 6.0362, + "step": 4988 + }, + { + "epoch": 0.029670996288895232, + "grad_norm": 2.460313081741333, + "learning_rate": 4.989151104131294e-05, + "loss": 5.6209, + "step": 4989 + }, + { + "epoch": 0.02967694357217623, + "grad_norm": 2.4712390899658203, + "learning_rate": 4.989146756847395e-05, + "loss": 6.3849, + "step": 4990 + }, + { + "epoch": 0.029682890855457226, + "grad_norm": 2.365860939025879, + "learning_rate": 4.98914240869456e-05, + "loss": 6.2791, + "step": 4991 + }, + { + "epoch": 0.029688838138738224, + "grad_norm": 2.6213366985321045, + "learning_rate": 4.9891380596727915e-05, + "loss": 6.2888, + "step": 4992 + }, + { + "epoch": 0.029694785422019223, + "grad_norm": 2.742213487625122, + "learning_rate": 4.989133709782091e-05, + "loss": 6.3522, + "step": 4993 + }, + { + "epoch": 0.029700732705300218, + "grad_norm": 2.2428665161132812, + "learning_rate": 4.9891293590224594e-05, + "loss": 6.6735, + "step": 4994 + }, + { + "epoch": 0.029706679988581217, + "grad_norm": 2.4242279529571533, + "learning_rate": 4.989125007393898e-05, + "loss": 6.2283, + "step": 4995 + }, + { + "epoch": 0.02971262727186221, + "grad_norm": 2.422177314758301, + "learning_rate": 4.989120654896409e-05, + "loss": 6.0273, + "step": 4996 + }, + { + "epoch": 0.02971857455514321, + "grad_norm": 2.4325926303863525, + "learning_rate": 4.989116301529994e-05, + "loss": 5.9504, + "step": 4997 + }, + { + "epoch": 0.02972452183842421, + "grad_norm": 2.42901873588562, + "learning_rate": 4.9891119472946544e-05, + "loss": 5.8156, + "step": 4998 + }, + { + "epoch": 0.029730469121705204, + "grad_norm": 2.4361307621002197, + "learning_rate": 4.989107592190391e-05, + "loss": 5.9025, + "step": 4999 + }, + { + "epoch": 0.029736416404986202, + "grad_norm": 2.9486470222473145, + "learning_rate": 4.9891032362172065e-05, + "loss": 6.3204, + "step": 5000 + }, + { + "epoch": 0.0297423636882672, + "grad_norm": 2.456681966781616, + "learning_rate": 4.989098879375101e-05, + "loss": 5.8203, + "step": 5001 + }, + { + "epoch": 0.029748310971548196, + "grad_norm": 2.5065391063690186, + "learning_rate": 4.9890945216640775e-05, + "loss": 6.452, + "step": 5002 + }, + { + "epoch": 0.029754258254829195, + "grad_norm": 2.386488199234009, + "learning_rate": 4.989090163084136e-05, + "loss": 5.9195, + "step": 5003 + }, + { + "epoch": 0.02976020553811019, + "grad_norm": 2.1387040615081787, + "learning_rate": 4.9890858036352796e-05, + "loss": 6.2127, + "step": 5004 + }, + { + "epoch": 0.02976615282139119, + "grad_norm": 2.518099784851074, + "learning_rate": 4.989081443317508e-05, + "loss": 6.1099, + "step": 5005 + }, + { + "epoch": 0.029772100104672187, + "grad_norm": 3.2108826637268066, + "learning_rate": 4.989077082130825e-05, + "loss": 5.9808, + "step": 5006 + }, + { + "epoch": 0.029778047387953182, + "grad_norm": 2.176065444946289, + "learning_rate": 4.9890727200752304e-05, + "loss": 6.0825, + "step": 5007 + }, + { + "epoch": 0.02978399467123418, + "grad_norm": 2.2961249351501465, + "learning_rate": 4.9890683571507265e-05, + "loss": 5.968, + "step": 5008 + }, + { + "epoch": 0.02978994195451518, + "grad_norm": 2.1954386234283447, + "learning_rate": 4.9890639933573144e-05, + "loss": 6.0799, + "step": 5009 + }, + { + "epoch": 0.029795889237796174, + "grad_norm": 2.256039619445801, + "learning_rate": 4.989059628694995e-05, + "loss": 5.9503, + "step": 5010 + }, + { + "epoch": 0.029801836521077173, + "grad_norm": 2.4350922107696533, + "learning_rate": 4.9890552631637715e-05, + "loss": 5.6741, + "step": 5011 + }, + { + "epoch": 0.029807783804358168, + "grad_norm": 2.68904447555542, + "learning_rate": 4.989050896763645e-05, + "loss": 5.5872, + "step": 5012 + }, + { + "epoch": 0.029813731087639166, + "grad_norm": 2.2877871990203857, + "learning_rate": 4.989046529494615e-05, + "loss": 6.1273, + "step": 5013 + }, + { + "epoch": 0.029819678370920165, + "grad_norm": 2.350348711013794, + "learning_rate": 4.989042161356686e-05, + "loss": 6.1113, + "step": 5014 + }, + { + "epoch": 0.02982562565420116, + "grad_norm": 2.295382499694824, + "learning_rate": 4.989037792349858e-05, + "loss": 6.036, + "step": 5015 + }, + { + "epoch": 0.02983157293748216, + "grad_norm": 2.317863941192627, + "learning_rate": 4.989033422474131e-05, + "loss": 5.961, + "step": 5016 + }, + { + "epoch": 0.029837520220763157, + "grad_norm": 2.286289930343628, + "learning_rate": 4.9890290517295095e-05, + "loss": 5.8163, + "step": 5017 + }, + { + "epoch": 0.029843467504044152, + "grad_norm": 2.246863842010498, + "learning_rate": 4.989024680115993e-05, + "loss": 5.9689, + "step": 5018 + }, + { + "epoch": 0.02984941478732515, + "grad_norm": 1.8732661008834839, + "learning_rate": 4.989020307633585e-05, + "loss": 5.9046, + "step": 5019 + }, + { + "epoch": 0.029855362070606146, + "grad_norm": 2.0211753845214844, + "learning_rate": 4.989015934282285e-05, + "loss": 5.95, + "step": 5020 + }, + { + "epoch": 0.029861309353887144, + "grad_norm": 2.014890193939209, + "learning_rate": 4.9890115600620946e-05, + "loss": 5.7312, + "step": 5021 + }, + { + "epoch": 0.029867256637168143, + "grad_norm": 2.2749524116516113, + "learning_rate": 4.989007184973017e-05, + "loss": 6.2573, + "step": 5022 + }, + { + "epoch": 0.029873203920449138, + "grad_norm": 2.080747604370117, + "learning_rate": 4.989002809015052e-05, + "loss": 5.7607, + "step": 5023 + }, + { + "epoch": 0.029879151203730137, + "grad_norm": 2.3403279781341553, + "learning_rate": 4.988998432188202e-05, + "loss": 5.7876, + "step": 5024 + }, + { + "epoch": 0.02988509848701113, + "grad_norm": 2.573802947998047, + "learning_rate": 4.988994054492468e-05, + "loss": 5.9036, + "step": 5025 + }, + { + "epoch": 0.02989104577029213, + "grad_norm": 2.267409324645996, + "learning_rate": 4.988989675927853e-05, + "loss": 5.7433, + "step": 5026 + }, + { + "epoch": 0.02989699305357313, + "grad_norm": 2.8241517543792725, + "learning_rate": 4.9889852964943566e-05, + "loss": 6.2338, + "step": 5027 + }, + { + "epoch": 0.029902940336854124, + "grad_norm": 2.338927745819092, + "learning_rate": 4.988980916191982e-05, + "loss": 6.0226, + "step": 5028 + }, + { + "epoch": 0.029908887620135122, + "grad_norm": 2.0798492431640625, + "learning_rate": 4.9889765350207285e-05, + "loss": 5.6919, + "step": 5029 + }, + { + "epoch": 0.02991483490341612, + "grad_norm": 2.3199923038482666, + "learning_rate": 4.9889721529806e-05, + "loss": 5.7533, + "step": 5030 + }, + { + "epoch": 0.029920782186697116, + "grad_norm": 2.1074399948120117, + "learning_rate": 4.988967770071596e-05, + "loss": 5.7486, + "step": 5031 + }, + { + "epoch": 0.029926729469978115, + "grad_norm": 2.2539381980895996, + "learning_rate": 4.9889633862937205e-05, + "loss": 5.6816, + "step": 5032 + }, + { + "epoch": 0.02993267675325911, + "grad_norm": 2.1393015384674072, + "learning_rate": 4.9889590016469726e-05, + "loss": 5.6635, + "step": 5033 + }, + { + "epoch": 0.029938624036540108, + "grad_norm": 2.6661975383758545, + "learning_rate": 4.988954616131355e-05, + "loss": 6.0218, + "step": 5034 + }, + { + "epoch": 0.029944571319821107, + "grad_norm": 2.6529600620269775, + "learning_rate": 4.988950229746869e-05, + "loss": 5.8847, + "step": 5035 + }, + { + "epoch": 0.029950518603102102, + "grad_norm": 2.510859966278076, + "learning_rate": 4.988945842493517e-05, + "loss": 5.7154, + "step": 5036 + }, + { + "epoch": 0.0299564658863831, + "grad_norm": 2.875394105911255, + "learning_rate": 4.9889414543712985e-05, + "loss": 5.6304, + "step": 5037 + }, + { + "epoch": 0.0299624131696641, + "grad_norm": 2.718808650970459, + "learning_rate": 4.988937065380217e-05, + "loss": 5.6562, + "step": 5038 + }, + { + "epoch": 0.029968360452945094, + "grad_norm": 2.702265501022339, + "learning_rate": 4.988932675520273e-05, + "loss": 5.6484, + "step": 5039 + }, + { + "epoch": 0.029974307736226093, + "grad_norm": 2.765209436416626, + "learning_rate": 4.988928284791469e-05, + "loss": 5.793, + "step": 5040 + }, + { + "epoch": 0.029980255019507088, + "grad_norm": 3.386352062225342, + "learning_rate": 4.9889238931938047e-05, + "loss": 5.5392, + "step": 5041 + }, + { + "epoch": 0.029986202302788086, + "grad_norm": 2.1632583141326904, + "learning_rate": 4.988919500727284e-05, + "loss": 5.8032, + "step": 5042 + }, + { + "epoch": 0.029992149586069085, + "grad_norm": 2.4121060371398926, + "learning_rate": 4.9889151073919064e-05, + "loss": 5.9793, + "step": 5043 + }, + { + "epoch": 0.02999809686935008, + "grad_norm": 2.2160584926605225, + "learning_rate": 4.988910713187674e-05, + "loss": 5.8802, + "step": 5044 + }, + { + "epoch": 0.03000404415263108, + "grad_norm": 3.120509386062622, + "learning_rate": 4.988906318114589e-05, + "loss": 5.5691, + "step": 5045 + }, + { + "epoch": 0.030009991435912077, + "grad_norm": 3.0660078525543213, + "learning_rate": 4.988901922172652e-05, + "loss": 5.3687, + "step": 5046 + }, + { + "epoch": 0.030015938719193072, + "grad_norm": 1.939757227897644, + "learning_rate": 4.988897525361867e-05, + "loss": 5.526, + "step": 5047 + }, + { + "epoch": 0.03002188600247407, + "grad_norm": 2.2970168590545654, + "learning_rate": 4.9888931276822315e-05, + "loss": 5.6334, + "step": 5048 + }, + { + "epoch": 0.030027833285755066, + "grad_norm": 2.162632942199707, + "learning_rate": 4.988888729133749e-05, + "loss": 5.8887, + "step": 5049 + }, + { + "epoch": 0.030033780569036064, + "grad_norm": 2.027017831802368, + "learning_rate": 4.9888843297164223e-05, + "loss": 5.9237, + "step": 5050 + }, + { + "epoch": 0.030039727852317063, + "grad_norm": 1.9226456880569458, + "learning_rate": 4.988879929430251e-05, + "loss": 5.6833, + "step": 5051 + }, + { + "epoch": 0.030045675135598058, + "grad_norm": 1.6490615606307983, + "learning_rate": 4.9888755282752384e-05, + "loss": 5.5738, + "step": 5052 + }, + { + "epoch": 0.030051622418879056, + "grad_norm": 2.456385850906372, + "learning_rate": 4.9888711262513846e-05, + "loss": 5.3771, + "step": 5053 + }, + { + "epoch": 0.03005756970216005, + "grad_norm": 2.480044364929199, + "learning_rate": 4.988866723358692e-05, + "loss": 5.2456, + "step": 5054 + }, + { + "epoch": 0.03006351698544105, + "grad_norm": 2.4033162593841553, + "learning_rate": 4.988862319597161e-05, + "loss": 5.1629, + "step": 5055 + }, + { + "epoch": 0.03006946426872205, + "grad_norm": 2.7228541374206543, + "learning_rate": 4.9888579149667935e-05, + "loss": 5.0195, + "step": 5056 + }, + { + "epoch": 0.030075411552003044, + "grad_norm": 2.4641635417938232, + "learning_rate": 4.9888535094675926e-05, + "loss": 5.3259, + "step": 5057 + }, + { + "epoch": 0.030081358835284042, + "grad_norm": 2.443666458129883, + "learning_rate": 4.9888491030995575e-05, + "loss": 5.4212, + "step": 5058 + }, + { + "epoch": 0.03008730611856504, + "grad_norm": 2.3267531394958496, + "learning_rate": 4.988844695862692e-05, + "loss": 5.6517, + "step": 5059 + }, + { + "epoch": 0.030093253401846036, + "grad_norm": 1.9090640544891357, + "learning_rate": 4.988840287756996e-05, + "loss": 5.7946, + "step": 5060 + }, + { + "epoch": 0.030099200685127035, + "grad_norm": 1.6169202327728271, + "learning_rate": 4.988835878782472e-05, + "loss": 5.7332, + "step": 5061 + }, + { + "epoch": 0.03010514796840803, + "grad_norm": 1.9369432926177979, + "learning_rate": 4.9888314689391205e-05, + "loss": 5.5954, + "step": 5062 + }, + { + "epoch": 0.030111095251689028, + "grad_norm": 2.0444133281707764, + "learning_rate": 4.9888270582269434e-05, + "loss": 5.5332, + "step": 5063 + }, + { + "epoch": 0.030117042534970027, + "grad_norm": 1.949061632156372, + "learning_rate": 4.988822646645943e-05, + "loss": 5.6064, + "step": 5064 + }, + { + "epoch": 0.030122989818251022, + "grad_norm": 1.5208648443222046, + "learning_rate": 4.988818234196121e-05, + "loss": 5.6615, + "step": 5065 + }, + { + "epoch": 0.03012893710153202, + "grad_norm": 1.8466709852218628, + "learning_rate": 4.988813820877477e-05, + "loss": 5.79, + "step": 5066 + }, + { + "epoch": 0.03013488438481302, + "grad_norm": 1.7094037532806396, + "learning_rate": 4.988809406690015e-05, + "loss": 5.8194, + "step": 5067 + }, + { + "epoch": 0.030140831668094014, + "grad_norm": 1.5698916912078857, + "learning_rate": 4.988804991633734e-05, + "loss": 5.5981, + "step": 5068 + }, + { + "epoch": 0.030146778951375013, + "grad_norm": 2.032156467437744, + "learning_rate": 4.988800575708638e-05, + "loss": 5.6729, + "step": 5069 + }, + { + "epoch": 0.030152726234656008, + "grad_norm": 1.9716484546661377, + "learning_rate": 4.988796158914727e-05, + "loss": 5.5227, + "step": 5070 + }, + { + "epoch": 0.030158673517937006, + "grad_norm": 1.8809682130813599, + "learning_rate": 4.988791741252002e-05, + "loss": 5.6231, + "step": 5071 + }, + { + "epoch": 0.030164620801218005, + "grad_norm": 1.8293371200561523, + "learning_rate": 4.9887873227204675e-05, + "loss": 5.5067, + "step": 5072 + }, + { + "epoch": 0.030170568084499, + "grad_norm": 2.225281000137329, + "learning_rate": 4.988782903320122e-05, + "loss": 5.3056, + "step": 5073 + }, + { + "epoch": 0.03017651536778, + "grad_norm": 2.0776474475860596, + "learning_rate": 4.988778483050968e-05, + "loss": 5.206, + "step": 5074 + }, + { + "epoch": 0.030182462651060997, + "grad_norm": 2.068323850631714, + "learning_rate": 4.9887740619130076e-05, + "loss": 5.5975, + "step": 5075 + }, + { + "epoch": 0.030188409934341992, + "grad_norm": 2.077782392501831, + "learning_rate": 4.988769639906241e-05, + "loss": 5.6967, + "step": 5076 + }, + { + "epoch": 0.03019435721762299, + "grad_norm": 1.9837195873260498, + "learning_rate": 4.988765217030672e-05, + "loss": 5.7834, + "step": 5077 + }, + { + "epoch": 0.030200304500903986, + "grad_norm": 1.9612236022949219, + "learning_rate": 4.9887607932863e-05, + "loss": 5.5472, + "step": 5078 + }, + { + "epoch": 0.030206251784184984, + "grad_norm": 2.022251605987549, + "learning_rate": 4.988756368673127e-05, + "loss": 5.704, + "step": 5079 + }, + { + "epoch": 0.030212199067465983, + "grad_norm": 2.02227783203125, + "learning_rate": 4.988751943191156e-05, + "loss": 5.4125, + "step": 5080 + }, + { + "epoch": 0.030218146350746978, + "grad_norm": 2.0527732372283936, + "learning_rate": 4.9887475168403856e-05, + "loss": 5.464, + "step": 5081 + }, + { + "epoch": 0.030224093634027976, + "grad_norm": 2.1465423107147217, + "learning_rate": 4.9887430896208205e-05, + "loss": 5.3415, + "step": 5082 + }, + { + "epoch": 0.03023004091730897, + "grad_norm": 1.9170550107955933, + "learning_rate": 4.9887386615324606e-05, + "loss": 5.5762, + "step": 5083 + }, + { + "epoch": 0.03023598820058997, + "grad_norm": 3.367650032043457, + "learning_rate": 4.988734232575307e-05, + "loss": 6.26, + "step": 5084 + }, + { + "epoch": 0.03024193548387097, + "grad_norm": 2.0784621238708496, + "learning_rate": 4.988729802749363e-05, + "loss": 5.5316, + "step": 5085 + }, + { + "epoch": 0.030247882767151964, + "grad_norm": 1.9531089067459106, + "learning_rate": 4.988725372054629e-05, + "loss": 5.5901, + "step": 5086 + }, + { + "epoch": 0.030253830050432962, + "grad_norm": 1.9677239656448364, + "learning_rate": 4.988720940491106e-05, + "loss": 5.4963, + "step": 5087 + }, + { + "epoch": 0.03025977733371396, + "grad_norm": 1.9835426807403564, + "learning_rate": 4.988716508058797e-05, + "loss": 5.6355, + "step": 5088 + }, + { + "epoch": 0.030265724616994956, + "grad_norm": 1.908250331878662, + "learning_rate": 4.988712074757703e-05, + "loss": 5.165, + "step": 5089 + }, + { + "epoch": 0.030271671900275954, + "grad_norm": 1.9852073192596436, + "learning_rate": 4.9887076405878246e-05, + "loss": 5.6623, + "step": 5090 + }, + { + "epoch": 0.03027761918355695, + "grad_norm": 1.9073505401611328, + "learning_rate": 4.988703205549164e-05, + "loss": 5.6685, + "step": 5091 + }, + { + "epoch": 0.030283566466837948, + "grad_norm": 1.744931697845459, + "learning_rate": 4.988698769641724e-05, + "loss": 5.4004, + "step": 5092 + }, + { + "epoch": 0.030289513750118947, + "grad_norm": 2.0623345375061035, + "learning_rate": 4.9886943328655034e-05, + "loss": 5.3846, + "step": 5093 + }, + { + "epoch": 0.030295461033399942, + "grad_norm": 1.647375226020813, + "learning_rate": 4.9886898952205064e-05, + "loss": 5.5823, + "step": 5094 + }, + { + "epoch": 0.03030140831668094, + "grad_norm": 2.2364108562469482, + "learning_rate": 4.9886854567067334e-05, + "loss": 5.5959, + "step": 5095 + }, + { + "epoch": 0.03030735559996194, + "grad_norm": 2.059187650680542, + "learning_rate": 4.988681017324185e-05, + "loss": 5.6043, + "step": 5096 + }, + { + "epoch": 0.030313302883242934, + "grad_norm": 1.8996437788009644, + "learning_rate": 4.988676577072865e-05, + "loss": 5.4366, + "step": 5097 + }, + { + "epoch": 0.030319250166523933, + "grad_norm": 2.0983266830444336, + "learning_rate": 4.988672135952773e-05, + "loss": 5.5568, + "step": 5098 + }, + { + "epoch": 0.030325197449804928, + "grad_norm": 2.065119743347168, + "learning_rate": 4.988667693963911e-05, + "loss": 5.4239, + "step": 5099 + }, + { + "epoch": 0.030331144733085926, + "grad_norm": 1.9394044876098633, + "learning_rate": 4.988663251106282e-05, + "loss": 5.573, + "step": 5100 + }, + { + "epoch": 0.030337092016366925, + "grad_norm": 2.225097417831421, + "learning_rate": 4.9886588073798855e-05, + "loss": 5.5877, + "step": 5101 + }, + { + "epoch": 0.03034303929964792, + "grad_norm": 2.185018539428711, + "learning_rate": 4.9886543627847236e-05, + "loss": 5.6884, + "step": 5102 + }, + { + "epoch": 0.03034898658292892, + "grad_norm": 1.9751871824264526, + "learning_rate": 4.988649917320799e-05, + "loss": 5.4836, + "step": 5103 + }, + { + "epoch": 0.030354933866209917, + "grad_norm": 1.8753101825714111, + "learning_rate": 4.988645470988113e-05, + "loss": 5.4049, + "step": 5104 + }, + { + "epoch": 0.030360881149490912, + "grad_norm": 2.12246036529541, + "learning_rate": 4.988641023786665e-05, + "loss": 5.5365, + "step": 5105 + }, + { + "epoch": 0.03036682843277191, + "grad_norm": 2.1078991889953613, + "learning_rate": 4.988636575716459e-05, + "loss": 5.5269, + "step": 5106 + }, + { + "epoch": 0.030372775716052906, + "grad_norm": 1.9127923250198364, + "learning_rate": 4.9886321267774946e-05, + "loss": 5.48, + "step": 5107 + }, + { + "epoch": 0.030378722999333904, + "grad_norm": 1.8971906900405884, + "learning_rate": 4.988627676969776e-05, + "loss": 5.5202, + "step": 5108 + }, + { + "epoch": 0.030384670282614903, + "grad_norm": 2.162097454071045, + "learning_rate": 4.9886232262933024e-05, + "loss": 5.5229, + "step": 5109 + }, + { + "epoch": 0.030390617565895898, + "grad_norm": 2.21211838722229, + "learning_rate": 4.988618774748076e-05, + "loss": 5.3648, + "step": 5110 + }, + { + "epoch": 0.030396564849176896, + "grad_norm": 1.8907619714736938, + "learning_rate": 4.988614322334099e-05, + "loss": 5.4338, + "step": 5111 + }, + { + "epoch": 0.030402512132457895, + "grad_norm": 2.0131993293762207, + "learning_rate": 4.9886098690513725e-05, + "loss": 5.4005, + "step": 5112 + }, + { + "epoch": 0.03040845941573889, + "grad_norm": 1.9474748373031616, + "learning_rate": 4.9886054148998975e-05, + "loss": 5.5544, + "step": 5113 + }, + { + "epoch": 0.03041440669901989, + "grad_norm": 1.9809894561767578, + "learning_rate": 4.988600959879676e-05, + "loss": 5.6204, + "step": 5114 + }, + { + "epoch": 0.030420353982300884, + "grad_norm": 2.1792514324188232, + "learning_rate": 4.9885965039907104e-05, + "loss": 5.5368, + "step": 5115 + }, + { + "epoch": 0.030426301265581882, + "grad_norm": 2.050903081893921, + "learning_rate": 4.9885920472330004e-05, + "loss": 5.4717, + "step": 5116 + }, + { + "epoch": 0.03043224854886288, + "grad_norm": 1.9938042163848877, + "learning_rate": 4.988587589606549e-05, + "loss": 5.5373, + "step": 5117 + }, + { + "epoch": 0.030438195832143876, + "grad_norm": 1.7375110387802124, + "learning_rate": 4.988583131111358e-05, + "loss": 5.5621, + "step": 5118 + }, + { + "epoch": 0.030444143115424874, + "grad_norm": 2.077605962753296, + "learning_rate": 4.988578671747428e-05, + "loss": 5.5451, + "step": 5119 + }, + { + "epoch": 0.03045009039870587, + "grad_norm": 2.071706771850586, + "learning_rate": 4.988574211514761e-05, + "loss": 5.327, + "step": 5120 + }, + { + "epoch": 0.030456037681986868, + "grad_norm": 1.8317911624908447, + "learning_rate": 4.9885697504133574e-05, + "loss": 5.4123, + "step": 5121 + }, + { + "epoch": 0.030461984965267867, + "grad_norm": 2.1231188774108887, + "learning_rate": 4.988565288443221e-05, + "loss": 5.3789, + "step": 5122 + }, + { + "epoch": 0.03046793224854886, + "grad_norm": 2.1298999786376953, + "learning_rate": 4.988560825604352e-05, + "loss": 5.4382, + "step": 5123 + }, + { + "epoch": 0.03047387953182986, + "grad_norm": 1.791053056716919, + "learning_rate": 4.9885563618967525e-05, + "loss": 5.3918, + "step": 5124 + }, + { + "epoch": 0.03047982681511086, + "grad_norm": 1.9610999822616577, + "learning_rate": 4.988551897320423e-05, + "loss": 5.3232, + "step": 5125 + }, + { + "epoch": 0.030485774098391854, + "grad_norm": 1.9926520586013794, + "learning_rate": 4.9885474318753654e-05, + "loss": 5.4316, + "step": 5126 + }, + { + "epoch": 0.030491721381672852, + "grad_norm": 1.8942431211471558, + "learning_rate": 4.988542965561582e-05, + "loss": 5.4055, + "step": 5127 + }, + { + "epoch": 0.030497668664953848, + "grad_norm": 1.7872856855392456, + "learning_rate": 4.988538498379074e-05, + "loss": 5.5117, + "step": 5128 + }, + { + "epoch": 0.030503615948234846, + "grad_norm": 2.040205478668213, + "learning_rate": 4.988534030327843e-05, + "loss": 5.4068, + "step": 5129 + }, + { + "epoch": 0.030509563231515845, + "grad_norm": 2.0108931064605713, + "learning_rate": 4.988529561407891e-05, + "loss": 5.3636, + "step": 5130 + }, + { + "epoch": 0.03051551051479684, + "grad_norm": 2.0339555740356445, + "learning_rate": 4.988525091619218e-05, + "loss": 5.2811, + "step": 5131 + }, + { + "epoch": 0.03052145779807784, + "grad_norm": 1.7631195783615112, + "learning_rate": 4.988520620961828e-05, + "loss": 5.3407, + "step": 5132 + }, + { + "epoch": 0.030527405081358837, + "grad_norm": 1.6906533241271973, + "learning_rate": 4.988516149435719e-05, + "loss": 5.3121, + "step": 5133 + }, + { + "epoch": 0.030533352364639832, + "grad_norm": 2.0753448009490967, + "learning_rate": 4.988511677040897e-05, + "loss": 5.4532, + "step": 5134 + }, + { + "epoch": 0.03053929964792083, + "grad_norm": 1.9836634397506714, + "learning_rate": 4.9885072037773595e-05, + "loss": 5.4345, + "step": 5135 + }, + { + "epoch": 0.030545246931201826, + "grad_norm": 1.8526780605316162, + "learning_rate": 4.988502729645111e-05, + "loss": 5.446, + "step": 5136 + }, + { + "epoch": 0.030551194214482824, + "grad_norm": 2.126626968383789, + "learning_rate": 4.988498254644152e-05, + "loss": 5.703, + "step": 5137 + }, + { + "epoch": 0.030557141497763823, + "grad_norm": 1.9711220264434814, + "learning_rate": 4.988493778774483e-05, + "loss": 5.5872, + "step": 5138 + }, + { + "epoch": 0.030563088781044818, + "grad_norm": 2.070727586746216, + "learning_rate": 4.988489302036107e-05, + "loss": 5.4407, + "step": 5139 + }, + { + "epoch": 0.030569036064325816, + "grad_norm": 2.1414859294891357, + "learning_rate": 4.988484824429025e-05, + "loss": 5.5291, + "step": 5140 + }, + { + "epoch": 0.030574983347606815, + "grad_norm": 2.01366925239563, + "learning_rate": 4.9884803459532384e-05, + "loss": 5.3561, + "step": 5141 + }, + { + "epoch": 0.03058093063088781, + "grad_norm": 1.851836085319519, + "learning_rate": 4.988475866608749e-05, + "loss": 5.679, + "step": 5142 + }, + { + "epoch": 0.03058687791416881, + "grad_norm": 1.6984909772872925, + "learning_rate": 4.988471386395559e-05, + "loss": 5.6075, + "step": 5143 + }, + { + "epoch": 0.030592825197449804, + "grad_norm": 1.9371756315231323, + "learning_rate": 4.9884669053136696e-05, + "loss": 5.7062, + "step": 5144 + }, + { + "epoch": 0.030598772480730802, + "grad_norm": 1.9286617040634155, + "learning_rate": 4.9884624233630815e-05, + "loss": 5.573, + "step": 5145 + }, + { + "epoch": 0.0306047197640118, + "grad_norm": 2.7633650302886963, + "learning_rate": 4.988457940543797e-05, + "loss": 6.2082, + "step": 5146 + }, + { + "epoch": 0.030610667047292796, + "grad_norm": 2.6948676109313965, + "learning_rate": 4.9884534568558173e-05, + "loss": 5.7475, + "step": 5147 + }, + { + "epoch": 0.030616614330573794, + "grad_norm": 2.1618316173553467, + "learning_rate": 4.988448972299145e-05, + "loss": 5.4049, + "step": 5148 + }, + { + "epoch": 0.03062256161385479, + "grad_norm": 2.417043685913086, + "learning_rate": 4.98844448687378e-05, + "loss": 5.3663, + "step": 5149 + }, + { + "epoch": 0.030628508897135788, + "grad_norm": 1.9748867750167847, + "learning_rate": 4.988440000579725e-05, + "loss": 5.1876, + "step": 5150 + }, + { + "epoch": 0.030634456180416787, + "grad_norm": 2.0534770488739014, + "learning_rate": 4.988435513416981e-05, + "loss": 5.4519, + "step": 5151 + }, + { + "epoch": 0.03064040346369778, + "grad_norm": 1.9772714376449585, + "learning_rate": 4.98843102538555e-05, + "loss": 5.5241, + "step": 5152 + }, + { + "epoch": 0.03064635074697878, + "grad_norm": 2.4160993099212646, + "learning_rate": 4.988426536485434e-05, + "loss": 5.6535, + "step": 5153 + }, + { + "epoch": 0.03065229803025978, + "grad_norm": 1.9931175708770752, + "learning_rate": 4.9884220467166345e-05, + "loss": 5.6693, + "step": 5154 + }, + { + "epoch": 0.030658245313540774, + "grad_norm": 1.9071956872940063, + "learning_rate": 4.9884175560791516e-05, + "loss": 5.5533, + "step": 5155 + }, + { + "epoch": 0.030664192596821772, + "grad_norm": 1.8562983274459839, + "learning_rate": 4.9884130645729876e-05, + "loss": 5.5621, + "step": 5156 + }, + { + "epoch": 0.030670139880102767, + "grad_norm": 2.087606430053711, + "learning_rate": 4.9884085721981446e-05, + "loss": 5.5256, + "step": 5157 + }, + { + "epoch": 0.030676087163383766, + "grad_norm": 2.3242955207824707, + "learning_rate": 4.988404078954624e-05, + "loss": 5.3906, + "step": 5158 + }, + { + "epoch": 0.030682034446664765, + "grad_norm": 2.221330404281616, + "learning_rate": 4.988399584842427e-05, + "loss": 5.5719, + "step": 5159 + }, + { + "epoch": 0.03068798172994576, + "grad_norm": 1.7819960117340088, + "learning_rate": 4.988395089861556e-05, + "loss": 5.5823, + "step": 5160 + }, + { + "epoch": 0.030693929013226758, + "grad_norm": 1.781802773475647, + "learning_rate": 4.988390594012011e-05, + "loss": 5.6087, + "step": 5161 + }, + { + "epoch": 0.030699876296507757, + "grad_norm": 2.0003581047058105, + "learning_rate": 4.988386097293796e-05, + "loss": 5.5695, + "step": 5162 + }, + { + "epoch": 0.030705823579788752, + "grad_norm": 1.9411736726760864, + "learning_rate": 4.98838159970691e-05, + "loss": 5.441, + "step": 5163 + }, + { + "epoch": 0.03071177086306975, + "grad_norm": 2.159541368484497, + "learning_rate": 4.9883771012513556e-05, + "loss": 5.6191, + "step": 5164 + }, + { + "epoch": 0.030717718146350746, + "grad_norm": 2.1045689582824707, + "learning_rate": 4.988372601927135e-05, + "loss": 5.3261, + "step": 5165 + }, + { + "epoch": 0.030723665429631744, + "grad_norm": 2.004770040512085, + "learning_rate": 4.988368101734249e-05, + "loss": 5.3392, + "step": 5166 + }, + { + "epoch": 0.030729612712912743, + "grad_norm": 2.1851232051849365, + "learning_rate": 4.9883636006726996e-05, + "loss": 5.3048, + "step": 5167 + }, + { + "epoch": 0.030735559996193738, + "grad_norm": 2.1333882808685303, + "learning_rate": 4.988359098742488e-05, + "loss": 5.336, + "step": 5168 + }, + { + "epoch": 0.030741507279474736, + "grad_norm": 2.1911604404449463, + "learning_rate": 4.9883545959436165e-05, + "loss": 5.757, + "step": 5169 + }, + { + "epoch": 0.030747454562755735, + "grad_norm": 2.0385994911193848, + "learning_rate": 4.988350092276085e-05, + "loss": 5.7889, + "step": 5170 + }, + { + "epoch": 0.03075340184603673, + "grad_norm": 2.2300381660461426, + "learning_rate": 4.988345587739897e-05, + "loss": 5.3812, + "step": 5171 + }, + { + "epoch": 0.03075934912931773, + "grad_norm": 2.4643938541412354, + "learning_rate": 4.988341082335053e-05, + "loss": 5.2503, + "step": 5172 + }, + { + "epoch": 0.030765296412598724, + "grad_norm": 2.0791194438934326, + "learning_rate": 4.988336576061555e-05, + "loss": 5.2958, + "step": 5173 + }, + { + "epoch": 0.030771243695879722, + "grad_norm": 2.1123111248016357, + "learning_rate": 4.988332068919405e-05, + "loss": 5.3656, + "step": 5174 + }, + { + "epoch": 0.03077719097916072, + "grad_norm": 2.199747323989868, + "learning_rate": 4.9883275609086026e-05, + "loss": 5.7015, + "step": 5175 + }, + { + "epoch": 0.030783138262441716, + "grad_norm": 2.0083510875701904, + "learning_rate": 4.988323052029151e-05, + "loss": 5.7068, + "step": 5176 + }, + { + "epoch": 0.030789085545722714, + "grad_norm": 2.1027777194976807, + "learning_rate": 4.988318542281053e-05, + "loss": 5.6986, + "step": 5177 + }, + { + "epoch": 0.03079503282900371, + "grad_norm": 1.8593190908432007, + "learning_rate": 4.9883140316643074e-05, + "loss": 5.7194, + "step": 5178 + }, + { + "epoch": 0.030800980112284708, + "grad_norm": 1.9712544679641724, + "learning_rate": 4.988309520178918e-05, + "loss": 5.6472, + "step": 5179 + }, + { + "epoch": 0.030806927395565707, + "grad_norm": 2.1114501953125, + "learning_rate": 4.9883050078248836e-05, + "loss": 5.6767, + "step": 5180 + }, + { + "epoch": 0.0308128746788467, + "grad_norm": 3.0505895614624023, + "learning_rate": 4.988300494602209e-05, + "loss": 5.3705, + "step": 5181 + }, + { + "epoch": 0.0308188219621277, + "grad_norm": 2.648364782333374, + "learning_rate": 4.988295980510895e-05, + "loss": 5.3072, + "step": 5182 + }, + { + "epoch": 0.0308247692454087, + "grad_norm": 2.2162837982177734, + "learning_rate": 4.9882914655509414e-05, + "loss": 5.3359, + "step": 5183 + }, + { + "epoch": 0.030830716528689694, + "grad_norm": 2.16666316986084, + "learning_rate": 4.988286949722352e-05, + "loss": 5.3446, + "step": 5184 + }, + { + "epoch": 0.030836663811970692, + "grad_norm": 2.951157569885254, + "learning_rate": 4.988282433025126e-05, + "loss": 5.7776, + "step": 5185 + }, + { + "epoch": 0.030842611095251687, + "grad_norm": 2.9967124462127686, + "learning_rate": 4.988277915459267e-05, + "loss": 5.6004, + "step": 5186 + }, + { + "epoch": 0.030848558378532686, + "grad_norm": 2.3998372554779053, + "learning_rate": 4.988273397024777e-05, + "loss": 5.3562, + "step": 5187 + }, + { + "epoch": 0.030854505661813685, + "grad_norm": 2.290592670440674, + "learning_rate": 4.9882688777216544e-05, + "loss": 5.3211, + "step": 5188 + }, + { + "epoch": 0.03086045294509468, + "grad_norm": 2.0349433422088623, + "learning_rate": 4.988264357549904e-05, + "loss": 5.2917, + "step": 5189 + }, + { + "epoch": 0.030866400228375678, + "grad_norm": 1.922006607055664, + "learning_rate": 4.988259836509526e-05, + "loss": 5.2297, + "step": 5190 + }, + { + "epoch": 0.030872347511656677, + "grad_norm": 1.9518259763717651, + "learning_rate": 4.9882553146005225e-05, + "loss": 5.2232, + "step": 5191 + }, + { + "epoch": 0.030878294794937672, + "grad_norm": 2.1054210662841797, + "learning_rate": 4.988250791822894e-05, + "loss": 5.3705, + "step": 5192 + }, + { + "epoch": 0.03088424207821867, + "grad_norm": 2.0954079627990723, + "learning_rate": 4.988246268176644e-05, + "loss": 5.2522, + "step": 5193 + }, + { + "epoch": 0.030890189361499665, + "grad_norm": 1.8628660440444946, + "learning_rate": 4.9882417436617724e-05, + "loss": 5.3856, + "step": 5194 + }, + { + "epoch": 0.030896136644780664, + "grad_norm": 2.2788021564483643, + "learning_rate": 4.988237218278281e-05, + "loss": 5.4399, + "step": 5195 + }, + { + "epoch": 0.030902083928061663, + "grad_norm": 1.981086015701294, + "learning_rate": 4.9882326920261717e-05, + "loss": 5.2853, + "step": 5196 + }, + { + "epoch": 0.030908031211342658, + "grad_norm": 1.9278241395950317, + "learning_rate": 4.988228164905446e-05, + "loss": 5.3997, + "step": 5197 + }, + { + "epoch": 0.030913978494623656, + "grad_norm": 1.842748999595642, + "learning_rate": 4.988223636916106e-05, + "loss": 5.3215, + "step": 5198 + }, + { + "epoch": 0.030919925777904655, + "grad_norm": 1.9974339008331299, + "learning_rate": 4.988219108058153e-05, + "loss": 5.4851, + "step": 5199 + }, + { + "epoch": 0.03092587306118565, + "grad_norm": 2.015939474105835, + "learning_rate": 4.988214578331588e-05, + "loss": 5.322, + "step": 5200 + }, + { + "epoch": 0.03093182034446665, + "grad_norm": 2.035209894180298, + "learning_rate": 4.9882100477364135e-05, + "loss": 5.3896, + "step": 5201 + }, + { + "epoch": 0.030937767627747643, + "grad_norm": 1.9803009033203125, + "learning_rate": 4.9882055162726296e-05, + "loss": 5.2624, + "step": 5202 + }, + { + "epoch": 0.030943714911028642, + "grad_norm": 1.9504352807998657, + "learning_rate": 4.98820098394024e-05, + "loss": 5.2333, + "step": 5203 + }, + { + "epoch": 0.03094966219430964, + "grad_norm": 1.850542664527893, + "learning_rate": 4.9881964507392443e-05, + "loss": 5.5632, + "step": 5204 + }, + { + "epoch": 0.030955609477590636, + "grad_norm": 1.8594067096710205, + "learning_rate": 4.9881919166696456e-05, + "loss": 5.3775, + "step": 5205 + }, + { + "epoch": 0.030961556760871634, + "grad_norm": 2.019274950027466, + "learning_rate": 4.988187381731444e-05, + "loss": 5.4565, + "step": 5206 + }, + { + "epoch": 0.030967504044152633, + "grad_norm": 1.7151249647140503, + "learning_rate": 4.988182845924643e-05, + "loss": 5.5984, + "step": 5207 + }, + { + "epoch": 0.030973451327433628, + "grad_norm": 2.5127339363098145, + "learning_rate": 4.988178309249242e-05, + "loss": 6.2724, + "step": 5208 + }, + { + "epoch": 0.030979398610714626, + "grad_norm": 1.869344711303711, + "learning_rate": 4.9881737717052436e-05, + "loss": 5.5408, + "step": 5209 + }, + { + "epoch": 0.03098534589399562, + "grad_norm": 2.035419225692749, + "learning_rate": 4.98816923329265e-05, + "loss": 5.4154, + "step": 5210 + }, + { + "epoch": 0.03099129317727662, + "grad_norm": 1.7084250450134277, + "learning_rate": 4.9881646940114624e-05, + "loss": 5.6327, + "step": 5211 + }, + { + "epoch": 0.03099724046055762, + "grad_norm": 2.1035211086273193, + "learning_rate": 4.9881601538616816e-05, + "loss": 5.5041, + "step": 5212 + }, + { + "epoch": 0.031003187743838614, + "grad_norm": 1.920366883277893, + "learning_rate": 4.9881556128433105e-05, + "loss": 5.5919, + "step": 5213 + }, + { + "epoch": 0.031009135027119612, + "grad_norm": 2.000555992126465, + "learning_rate": 4.988151070956349e-05, + "loss": 5.5078, + "step": 5214 + }, + { + "epoch": 0.031015082310400607, + "grad_norm": 1.9930146932601929, + "learning_rate": 4.9881465282008e-05, + "loss": 5.5002, + "step": 5215 + }, + { + "epoch": 0.031021029593681606, + "grad_norm": 2.163329839706421, + "learning_rate": 4.988141984576665e-05, + "loss": 5.3504, + "step": 5216 + }, + { + "epoch": 0.031026976876962604, + "grad_norm": 1.766228437423706, + "learning_rate": 4.988137440083946e-05, + "loss": 5.5304, + "step": 5217 + }, + { + "epoch": 0.0310329241602436, + "grad_norm": 2.1399648189544678, + "learning_rate": 4.988132894722644e-05, + "loss": 5.4757, + "step": 5218 + }, + { + "epoch": 0.031038871443524598, + "grad_norm": 2.2287001609802246, + "learning_rate": 4.988128348492759e-05, + "loss": 5.4902, + "step": 5219 + }, + { + "epoch": 0.031044818726805597, + "grad_norm": 2.095080852508545, + "learning_rate": 4.988123801394295e-05, + "loss": 5.3462, + "step": 5220 + }, + { + "epoch": 0.031050766010086592, + "grad_norm": 2.0873003005981445, + "learning_rate": 4.988119253427253e-05, + "loss": 5.2825, + "step": 5221 + }, + { + "epoch": 0.03105671329336759, + "grad_norm": 2.0918655395507812, + "learning_rate": 4.988114704591633e-05, + "loss": 5.2859, + "step": 5222 + }, + { + "epoch": 0.031062660576648585, + "grad_norm": 1.9637762308120728, + "learning_rate": 4.9881101548874384e-05, + "loss": 5.4687, + "step": 5223 + }, + { + "epoch": 0.031068607859929584, + "grad_norm": 2.046672821044922, + "learning_rate": 4.988105604314671e-05, + "loss": 5.5095, + "step": 5224 + }, + { + "epoch": 0.031074555143210583, + "grad_norm": 2.0264053344726562, + "learning_rate": 4.988101052873332e-05, + "loss": 5.4221, + "step": 5225 + }, + { + "epoch": 0.031080502426491578, + "grad_norm": 1.9367676973342896, + "learning_rate": 4.9880965005634216e-05, + "loss": 5.1881, + "step": 5226 + }, + { + "epoch": 0.031086449709772576, + "grad_norm": 2.0398001670837402, + "learning_rate": 4.9880919473849425e-05, + "loss": 5.4938, + "step": 5227 + }, + { + "epoch": 0.031092396993053575, + "grad_norm": 2.037411689758301, + "learning_rate": 4.988087393337896e-05, + "loss": 5.0893, + "step": 5228 + }, + { + "epoch": 0.03109834427633457, + "grad_norm": 2.1337075233459473, + "learning_rate": 4.988082838422285e-05, + "loss": 4.9822, + "step": 5229 + }, + { + "epoch": 0.03110429155961557, + "grad_norm": 1.9911794662475586, + "learning_rate": 4.988078282638109e-05, + "loss": 5.2472, + "step": 5230 + }, + { + "epoch": 0.031110238842896563, + "grad_norm": 2.1050829887390137, + "learning_rate": 4.98807372598537e-05, + "loss": 5.3478, + "step": 5231 + }, + { + "epoch": 0.031116186126177562, + "grad_norm": 1.9364343881607056, + "learning_rate": 4.988069168464071e-05, + "loss": 5.2551, + "step": 5232 + }, + { + "epoch": 0.03112213340945856, + "grad_norm": 1.9834885597229004, + "learning_rate": 4.988064610074213e-05, + "loss": 5.2147, + "step": 5233 + }, + { + "epoch": 0.031128080692739556, + "grad_norm": 2.0815906524658203, + "learning_rate": 4.9880600508157974e-05, + "loss": 5.1607, + "step": 5234 + }, + { + "epoch": 0.031134027976020554, + "grad_norm": 1.9558357000350952, + "learning_rate": 4.988055490688825e-05, + "loss": 5.4, + "step": 5235 + }, + { + "epoch": 0.031139975259301553, + "grad_norm": 1.9036076068878174, + "learning_rate": 4.9880509296932986e-05, + "loss": 5.4953, + "step": 5236 + }, + { + "epoch": 0.031145922542582548, + "grad_norm": 2.4709548950195312, + "learning_rate": 4.98804636782922e-05, + "loss": 5.2628, + "step": 5237 + }, + { + "epoch": 0.031151869825863546, + "grad_norm": 2.2380030155181885, + "learning_rate": 4.988041805096589e-05, + "loss": 5.2423, + "step": 5238 + }, + { + "epoch": 0.03115781710914454, + "grad_norm": 2.348639726638794, + "learning_rate": 4.988037241495409e-05, + "loss": 5.1966, + "step": 5239 + }, + { + "epoch": 0.03116376439242554, + "grad_norm": 1.9384468793869019, + "learning_rate": 4.9880326770256805e-05, + "loss": 5.47, + "step": 5240 + }, + { + "epoch": 0.03116971167570654, + "grad_norm": 2.2664244174957275, + "learning_rate": 4.988028111687406e-05, + "loss": 5.5511, + "step": 5241 + }, + { + "epoch": 0.031175658958987534, + "grad_norm": 2.1356422901153564, + "learning_rate": 4.988023545480586e-05, + "loss": 5.6462, + "step": 5242 + }, + { + "epoch": 0.031181606242268532, + "grad_norm": 2.240190267562866, + "learning_rate": 4.9880189784052226e-05, + "loss": 5.3494, + "step": 5243 + }, + { + "epoch": 0.031187553525549527, + "grad_norm": 1.8032485246658325, + "learning_rate": 4.988014410461318e-05, + "loss": 5.2305, + "step": 5244 + }, + { + "epoch": 0.031193500808830526, + "grad_norm": 2.177501678466797, + "learning_rate": 4.988009841648873e-05, + "loss": 5.1891, + "step": 5245 + }, + { + "epoch": 0.031199448092111524, + "grad_norm": 2.157317876815796, + "learning_rate": 4.988005271967889e-05, + "loss": 5.1038, + "step": 5246 + }, + { + "epoch": 0.03120539537539252, + "grad_norm": 1.9995821714401245, + "learning_rate": 4.988000701418369e-05, + "loss": 5.1098, + "step": 5247 + }, + { + "epoch": 0.031211342658673518, + "grad_norm": 2.201558828353882, + "learning_rate": 4.987996130000313e-05, + "loss": 5.0702, + "step": 5248 + }, + { + "epoch": 0.031217289941954517, + "grad_norm": 2.065645933151245, + "learning_rate": 4.987991557713724e-05, + "loss": 5.2012, + "step": 5249 + }, + { + "epoch": 0.03122323722523551, + "grad_norm": 1.908347487449646, + "learning_rate": 4.9879869845586024e-05, + "loss": 5.0913, + "step": 5250 + }, + { + "epoch": 0.03122918450851651, + "grad_norm": 1.913979411125183, + "learning_rate": 4.98798241053495e-05, + "loss": 5.0036, + "step": 5251 + }, + { + "epoch": 0.031235131791797505, + "grad_norm": 2.217616558074951, + "learning_rate": 4.9879778356427686e-05, + "loss": 5.0621, + "step": 5252 + }, + { + "epoch": 0.031241079075078504, + "grad_norm": 2.419713258743286, + "learning_rate": 4.9879732598820605e-05, + "loss": 5.1264, + "step": 5253 + }, + { + "epoch": 0.031247026358359502, + "grad_norm": 2.298295497894287, + "learning_rate": 4.987968683252826e-05, + "loss": 5.0576, + "step": 5254 + }, + { + "epoch": 0.0312529736416405, + "grad_norm": 2.120589256286621, + "learning_rate": 4.987964105755067e-05, + "loss": 5.175, + "step": 5255 + }, + { + "epoch": 0.031258920924921496, + "grad_norm": 2.3129806518554688, + "learning_rate": 4.987959527388787e-05, + "loss": 5.1827, + "step": 5256 + }, + { + "epoch": 0.03126486820820249, + "grad_norm": 2.251680612564087, + "learning_rate": 4.9879549481539846e-05, + "loss": 5.0473, + "step": 5257 + }, + { + "epoch": 0.03127081549148349, + "grad_norm": 2.101229429244995, + "learning_rate": 4.987950368050663e-05, + "loss": 5.0453, + "step": 5258 + }, + { + "epoch": 0.03127676277476449, + "grad_norm": 2.189565420150757, + "learning_rate": 4.987945787078824e-05, + "loss": 5.087, + "step": 5259 + }, + { + "epoch": 0.03128271005804548, + "grad_norm": 2.05485463142395, + "learning_rate": 4.9879412052384687e-05, + "loss": 5.0192, + "step": 5260 + }, + { + "epoch": 0.031288657341326485, + "grad_norm": 1.8166489601135254, + "learning_rate": 4.9879366225295994e-05, + "loss": 5.0456, + "step": 5261 + }, + { + "epoch": 0.03129460462460748, + "grad_norm": 2.1403279304504395, + "learning_rate": 4.9879320389522165e-05, + "loss": 4.9455, + "step": 5262 + }, + { + "epoch": 0.031300551907888476, + "grad_norm": 1.8833802938461304, + "learning_rate": 4.9879274545063226e-05, + "loss": 5.0891, + "step": 5263 + }, + { + "epoch": 0.03130649919116947, + "grad_norm": 2.000692367553711, + "learning_rate": 4.987922869191918e-05, + "loss": 5.1125, + "step": 5264 + }, + { + "epoch": 0.03131244647445047, + "grad_norm": 1.947544813156128, + "learning_rate": 4.9879182830090065e-05, + "loss": 4.9139, + "step": 5265 + }, + { + "epoch": 0.03131839375773147, + "grad_norm": 1.8827823400497437, + "learning_rate": 4.987913695957588e-05, + "loss": 5.0154, + "step": 5266 + }, + { + "epoch": 0.03132434104101246, + "grad_norm": 2.268115997314453, + "learning_rate": 4.987909108037664e-05, + "loss": 5.0379, + "step": 5267 + }, + { + "epoch": 0.031330288324293465, + "grad_norm": 1.85139000415802, + "learning_rate": 4.987904519249237e-05, + "loss": 4.9428, + "step": 5268 + }, + { + "epoch": 0.03133623560757446, + "grad_norm": 2.208338737487793, + "learning_rate": 4.987899929592308e-05, + "loss": 4.9366, + "step": 5269 + }, + { + "epoch": 0.031342182890855455, + "grad_norm": 3.5571236610412598, + "learning_rate": 4.987895339066879e-05, + "loss": 6.8471, + "step": 5270 + }, + { + "epoch": 0.03134813017413646, + "grad_norm": 2.000157594680786, + "learning_rate": 4.9878907476729516e-05, + "loss": 5.025, + "step": 5271 + }, + { + "epoch": 0.03135407745741745, + "grad_norm": 2.0588366985321045, + "learning_rate": 4.987886155410527e-05, + "loss": 4.8955, + "step": 5272 + }, + { + "epoch": 0.03136002474069845, + "grad_norm": 2.217839241027832, + "learning_rate": 4.9878815622796074e-05, + "loss": 4.9889, + "step": 5273 + }, + { + "epoch": 0.03136597202397945, + "grad_norm": 2.2453126907348633, + "learning_rate": 4.987876968280194e-05, + "loss": 5.3774, + "step": 5274 + }, + { + "epoch": 0.031371919307260444, + "grad_norm": 1.9839471578598022, + "learning_rate": 4.9878723734122876e-05, + "loss": 4.993, + "step": 5275 + }, + { + "epoch": 0.03137786659054144, + "grad_norm": 1.9534602165222168, + "learning_rate": 4.987867777675892e-05, + "loss": 4.9079, + "step": 5276 + }, + { + "epoch": 0.031383813873822435, + "grad_norm": 1.96163809299469, + "learning_rate": 4.9878631810710066e-05, + "loss": 4.9829, + "step": 5277 + }, + { + "epoch": 0.03138976115710344, + "grad_norm": 2.0814366340637207, + "learning_rate": 4.987858583597634e-05, + "loss": 4.8731, + "step": 5278 + }, + { + "epoch": 0.03139570844038443, + "grad_norm": 1.9846211671829224, + "learning_rate": 4.987853985255776e-05, + "loss": 4.9495, + "step": 5279 + }, + { + "epoch": 0.03140165572366543, + "grad_norm": 2.1237289905548096, + "learning_rate": 4.9878493860454335e-05, + "loss": 5.3887, + "step": 5280 + }, + { + "epoch": 0.03140760300694643, + "grad_norm": 2.1526784896850586, + "learning_rate": 4.9878447859666086e-05, + "loss": 5.3603, + "step": 5281 + }, + { + "epoch": 0.031413550290227424, + "grad_norm": 2.0563082695007324, + "learning_rate": 4.987840185019303e-05, + "loss": 5.4104, + "step": 5282 + }, + { + "epoch": 0.03141949757350842, + "grad_norm": 2.0586647987365723, + "learning_rate": 4.9878355832035175e-05, + "loss": 5.517, + "step": 5283 + }, + { + "epoch": 0.03142544485678942, + "grad_norm": 1.8817695379257202, + "learning_rate": 4.9878309805192546e-05, + "loss": 5.3616, + "step": 5284 + }, + { + "epoch": 0.031431392140070416, + "grad_norm": 2.0987086296081543, + "learning_rate": 4.987826376966516e-05, + "loss": 5.3237, + "step": 5285 + }, + { + "epoch": 0.03143733942335141, + "grad_norm": 2.3505301475524902, + "learning_rate": 4.987821772545302e-05, + "loss": 5.5165, + "step": 5286 + }, + { + "epoch": 0.03144328670663241, + "grad_norm": 2.1199939250946045, + "learning_rate": 4.987817167255616e-05, + "loss": 5.3029, + "step": 5287 + }, + { + "epoch": 0.03144923398991341, + "grad_norm": 1.7463518381118774, + "learning_rate": 4.987812561097458e-05, + "loss": 5.3589, + "step": 5288 + }, + { + "epoch": 0.0314551812731944, + "grad_norm": 1.9957356452941895, + "learning_rate": 4.987807954070831e-05, + "loss": 5.2459, + "step": 5289 + }, + { + "epoch": 0.031461128556475405, + "grad_norm": 1.7865337133407593, + "learning_rate": 4.987803346175736e-05, + "loss": 5.3041, + "step": 5290 + }, + { + "epoch": 0.0314670758397564, + "grad_norm": 1.82949960231781, + "learning_rate": 4.9877987374121744e-05, + "loss": 5.5761, + "step": 5291 + }, + { + "epoch": 0.031473023123037396, + "grad_norm": 1.974692940711975, + "learning_rate": 4.9877941277801475e-05, + "loss": 5.5033, + "step": 5292 + }, + { + "epoch": 0.03147897040631839, + "grad_norm": 2.1808922290802, + "learning_rate": 4.9877895172796577e-05, + "loss": 5.6739, + "step": 5293 + }, + { + "epoch": 0.03148491768959939, + "grad_norm": 2.7555716037750244, + "learning_rate": 4.987784905910706e-05, + "loss": 5.2489, + "step": 5294 + }, + { + "epoch": 0.03149086497288039, + "grad_norm": 2.475541353225708, + "learning_rate": 4.9877802936732955e-05, + "loss": 5.2304, + "step": 5295 + }, + { + "epoch": 0.03149681225616138, + "grad_norm": 1.945482611656189, + "learning_rate": 4.987775680567425e-05, + "loss": 5.4085, + "step": 5296 + }, + { + "epoch": 0.031502759539442385, + "grad_norm": 1.9879848957061768, + "learning_rate": 4.987771066593099e-05, + "loss": 5.5372, + "step": 5297 + }, + { + "epoch": 0.03150870682272338, + "grad_norm": 2.0529556274414062, + "learning_rate": 4.987766451750317e-05, + "loss": 5.578, + "step": 5298 + }, + { + "epoch": 0.031514654106004375, + "grad_norm": 1.7769572734832764, + "learning_rate": 4.9877618360390816e-05, + "loss": 5.5348, + "step": 5299 + }, + { + "epoch": 0.03152060138928538, + "grad_norm": 1.9111005067825317, + "learning_rate": 4.987757219459395e-05, + "loss": 5.4267, + "step": 5300 + }, + { + "epoch": 0.03152654867256637, + "grad_norm": 1.9047571420669556, + "learning_rate": 4.987752602011256e-05, + "loss": 5.433, + "step": 5301 + }, + { + "epoch": 0.03153249595584737, + "grad_norm": 1.9031875133514404, + "learning_rate": 4.98774798369467e-05, + "loss": 5.4929, + "step": 5302 + }, + { + "epoch": 0.03153844323912837, + "grad_norm": 1.858656883239746, + "learning_rate": 4.987743364509637e-05, + "loss": 5.3583, + "step": 5303 + }, + { + "epoch": 0.031544390522409364, + "grad_norm": 1.9254835844039917, + "learning_rate": 4.987738744456158e-05, + "loss": 5.4885, + "step": 5304 + }, + { + "epoch": 0.03155033780569036, + "grad_norm": 1.96173095703125, + "learning_rate": 4.987734123534235e-05, + "loss": 5.4869, + "step": 5305 + }, + { + "epoch": 0.031556285088971354, + "grad_norm": 1.7857433557510376, + "learning_rate": 4.98772950174387e-05, + "loss": 5.3845, + "step": 5306 + }, + { + "epoch": 0.031562232372252357, + "grad_norm": 1.9360556602478027, + "learning_rate": 4.9877248790850636e-05, + "loss": 5.3809, + "step": 5307 + }, + { + "epoch": 0.03156817965553335, + "grad_norm": 2.2044126987457275, + "learning_rate": 4.9877202555578197e-05, + "loss": 5.2413, + "step": 5308 + }, + { + "epoch": 0.03157412693881435, + "grad_norm": 1.8200992345809937, + "learning_rate": 4.9877156311621365e-05, + "loss": 5.6241, + "step": 5309 + }, + { + "epoch": 0.03158007422209535, + "grad_norm": 2.0771358013153076, + "learning_rate": 4.987711005898019e-05, + "loss": 5.6854, + "step": 5310 + }, + { + "epoch": 0.031586021505376344, + "grad_norm": 1.8330012559890747, + "learning_rate": 4.987706379765466e-05, + "loss": 5.712, + "step": 5311 + }, + { + "epoch": 0.03159196878865734, + "grad_norm": 1.941501498222351, + "learning_rate": 4.987701752764481e-05, + "loss": 5.4131, + "step": 5312 + }, + { + "epoch": 0.03159791607193834, + "grad_norm": 1.8688616752624512, + "learning_rate": 4.987697124895065e-05, + "loss": 5.3719, + "step": 5313 + }, + { + "epoch": 0.031603863355219336, + "grad_norm": 1.8723224401474, + "learning_rate": 4.98769249615722e-05, + "loss": 5.665, + "step": 5314 + }, + { + "epoch": 0.03160981063850033, + "grad_norm": 1.9460058212280273, + "learning_rate": 4.9876878665509474e-05, + "loss": 5.7048, + "step": 5315 + }, + { + "epoch": 0.03161575792178133, + "grad_norm": 1.9752602577209473, + "learning_rate": 4.987683236076248e-05, + "loss": 5.7098, + "step": 5316 + }, + { + "epoch": 0.03162170520506233, + "grad_norm": 1.8122695684432983, + "learning_rate": 4.9876786047331244e-05, + "loss": 5.2717, + "step": 5317 + }, + { + "epoch": 0.03162765248834332, + "grad_norm": 1.961983323097229, + "learning_rate": 4.9876739725215775e-05, + "loss": 5.5593, + "step": 5318 + }, + { + "epoch": 0.031633599771624325, + "grad_norm": 1.7362732887268066, + "learning_rate": 4.98766933944161e-05, + "loss": 5.5002, + "step": 5319 + }, + { + "epoch": 0.03163954705490532, + "grad_norm": 2.084033489227295, + "learning_rate": 4.9876647054932226e-05, + "loss": 5.5398, + "step": 5320 + }, + { + "epoch": 0.031645494338186315, + "grad_norm": 1.869452953338623, + "learning_rate": 4.9876600706764165e-05, + "loss": 5.5985, + "step": 5321 + }, + { + "epoch": 0.03165144162146731, + "grad_norm": 3.597667694091797, + "learning_rate": 4.9876554349911943e-05, + "loss": 5.4143, + "step": 5322 + }, + { + "epoch": 0.03165738890474831, + "grad_norm": 2.2364773750305176, + "learning_rate": 4.9876507984375574e-05, + "loss": 5.3756, + "step": 5323 + }, + { + "epoch": 0.03166333618802931, + "grad_norm": 2.0204551219940186, + "learning_rate": 4.987646161015508e-05, + "loss": 5.4964, + "step": 5324 + }, + { + "epoch": 0.0316692834713103, + "grad_norm": 1.7375823259353638, + "learning_rate": 4.987641522725046e-05, + "loss": 5.5249, + "step": 5325 + }, + { + "epoch": 0.031675230754591305, + "grad_norm": 1.661597728729248, + "learning_rate": 4.987636883566175e-05, + "loss": 5.4828, + "step": 5326 + }, + { + "epoch": 0.0316811780378723, + "grad_norm": 1.8612693548202515, + "learning_rate": 4.9876322435388944e-05, + "loss": 5.4711, + "step": 5327 + }, + { + "epoch": 0.031687125321153295, + "grad_norm": 1.8282328844070435, + "learning_rate": 4.987627602643208e-05, + "loss": 5.5234, + "step": 5328 + }, + { + "epoch": 0.0316930726044343, + "grad_norm": 1.951170802116394, + "learning_rate": 4.987622960879116e-05, + "loss": 5.4117, + "step": 5329 + }, + { + "epoch": 0.03169901988771529, + "grad_norm": 1.819174885749817, + "learning_rate": 4.9876183182466207e-05, + "loss": 5.3446, + "step": 5330 + }, + { + "epoch": 0.03170496717099629, + "grad_norm": 1.8710874319076538, + "learning_rate": 4.9876136747457245e-05, + "loss": 5.3755, + "step": 5331 + }, + { + "epoch": 0.03171091445427729, + "grad_norm": 2.1957387924194336, + "learning_rate": 4.9876090303764264e-05, + "loss": 6.3036, + "step": 5332 + }, + { + "epoch": 0.031716861737558284, + "grad_norm": 1.774741530418396, + "learning_rate": 4.987604385138731e-05, + "loss": 5.3822, + "step": 5333 + }, + { + "epoch": 0.03172280902083928, + "grad_norm": 1.793230414390564, + "learning_rate": 4.987599739032638e-05, + "loss": 5.4224, + "step": 5334 + }, + { + "epoch": 0.031728756304120274, + "grad_norm": 1.7986340522766113, + "learning_rate": 4.98759509205815e-05, + "loss": 5.3939, + "step": 5335 + }, + { + "epoch": 0.031734703587401276, + "grad_norm": 1.7775462865829468, + "learning_rate": 4.9875904442152675e-05, + "loss": 5.4356, + "step": 5336 + }, + { + "epoch": 0.03174065087068227, + "grad_norm": 1.882104516029358, + "learning_rate": 4.987585795503994e-05, + "loss": 5.2852, + "step": 5337 + }, + { + "epoch": 0.03174659815396327, + "grad_norm": 1.9842430353164673, + "learning_rate": 4.987581145924329e-05, + "loss": 5.4089, + "step": 5338 + }, + { + "epoch": 0.03175254543724427, + "grad_norm": 1.7098103761672974, + "learning_rate": 4.9875764954762754e-05, + "loss": 5.2442, + "step": 5339 + }, + { + "epoch": 0.031758492720525264, + "grad_norm": 1.8304857015609741, + "learning_rate": 4.9875718441598354e-05, + "loss": 5.5403, + "step": 5340 + }, + { + "epoch": 0.03176444000380626, + "grad_norm": 2.0763137340545654, + "learning_rate": 4.987567191975009e-05, + "loss": 5.8295, + "step": 5341 + }, + { + "epoch": 0.03177038728708726, + "grad_norm": 1.907271385192871, + "learning_rate": 4.9875625389217984e-05, + "loss": 5.6979, + "step": 5342 + }, + { + "epoch": 0.031776334570368256, + "grad_norm": 2.1263620853424072, + "learning_rate": 4.9875578850002056e-05, + "loss": 5.7713, + "step": 5343 + }, + { + "epoch": 0.03178228185364925, + "grad_norm": 2.038358211517334, + "learning_rate": 4.987553230210232e-05, + "loss": 6.0019, + "step": 5344 + }, + { + "epoch": 0.03178822913693025, + "grad_norm": 1.5671371221542358, + "learning_rate": 4.987548574551879e-05, + "loss": 5.9237, + "step": 5345 + }, + { + "epoch": 0.03179417642021125, + "grad_norm": 1.9159321784973145, + "learning_rate": 4.987543918025149e-05, + "loss": 6.0363, + "step": 5346 + }, + { + "epoch": 0.03180012370349224, + "grad_norm": 1.8012747764587402, + "learning_rate": 4.987539260630043e-05, + "loss": 5.901, + "step": 5347 + }, + { + "epoch": 0.031806070986773245, + "grad_norm": 2.154933214187622, + "learning_rate": 4.9875346023665625e-05, + "loss": 5.6379, + "step": 5348 + }, + { + "epoch": 0.03181201827005424, + "grad_norm": 2.191539764404297, + "learning_rate": 4.98752994323471e-05, + "loss": 5.5322, + "step": 5349 + }, + { + "epoch": 0.031817965553335235, + "grad_norm": 2.0007123947143555, + "learning_rate": 4.9875252832344856e-05, + "loss": 5.7398, + "step": 5350 + }, + { + "epoch": 0.03182391283661623, + "grad_norm": 1.7119163274765015, + "learning_rate": 4.9875206223658924e-05, + "loss": 5.8507, + "step": 5351 + }, + { + "epoch": 0.03182986011989723, + "grad_norm": 1.8882098197937012, + "learning_rate": 4.987515960628931e-05, + "loss": 5.8668, + "step": 5352 + }, + { + "epoch": 0.03183580740317823, + "grad_norm": 2.005493402481079, + "learning_rate": 4.987511298023604e-05, + "loss": 5.9672, + "step": 5353 + }, + { + "epoch": 0.03184175468645922, + "grad_norm": 1.858807921409607, + "learning_rate": 4.987506634549912e-05, + "loss": 5.9344, + "step": 5354 + }, + { + "epoch": 0.031847701969740225, + "grad_norm": 2.2698724269866943, + "learning_rate": 4.987501970207858e-05, + "loss": 5.6553, + "step": 5355 + }, + { + "epoch": 0.03185364925302122, + "grad_norm": 1.7690725326538086, + "learning_rate": 4.987497304997442e-05, + "loss": 5.6255, + "step": 5356 + }, + { + "epoch": 0.031859596536302215, + "grad_norm": 2.008002758026123, + "learning_rate": 4.987492638918667e-05, + "loss": 5.5578, + "step": 5357 + }, + { + "epoch": 0.03186554381958322, + "grad_norm": 1.6483304500579834, + "learning_rate": 4.987487971971533e-05, + "loss": 5.4786, + "step": 5358 + }, + { + "epoch": 0.03187149110286421, + "grad_norm": 1.9136204719543457, + "learning_rate": 4.987483304156044e-05, + "loss": 5.6043, + "step": 5359 + }, + { + "epoch": 0.03187743838614521, + "grad_norm": 1.9811625480651855, + "learning_rate": 4.987478635472199e-05, + "loss": 5.6172, + "step": 5360 + }, + { + "epoch": 0.03188338566942621, + "grad_norm": 2.012134075164795, + "learning_rate": 4.987473965920002e-05, + "loss": 5.6715, + "step": 5361 + }, + { + "epoch": 0.031889332952707204, + "grad_norm": 1.930550217628479, + "learning_rate": 4.987469295499453e-05, + "loss": 5.516, + "step": 5362 + }, + { + "epoch": 0.0318952802359882, + "grad_norm": 2.1190578937530518, + "learning_rate": 4.987464624210554e-05, + "loss": 5.5176, + "step": 5363 + }, + { + "epoch": 0.031901227519269194, + "grad_norm": 2.428710699081421, + "learning_rate": 4.987459952053307e-05, + "loss": 5.4088, + "step": 5364 + }, + { + "epoch": 0.031907174802550196, + "grad_norm": 1.8820819854736328, + "learning_rate": 4.987455279027713e-05, + "loss": 5.3753, + "step": 5365 + }, + { + "epoch": 0.03191312208583119, + "grad_norm": 1.6506859064102173, + "learning_rate": 4.987450605133775e-05, + "loss": 5.6018, + "step": 5366 + }, + { + "epoch": 0.03191906936911219, + "grad_norm": 2.060772657394409, + "learning_rate": 4.9874459303714925e-05, + "loss": 5.3587, + "step": 5367 + }, + { + "epoch": 0.03192501665239319, + "grad_norm": 2.3591532707214355, + "learning_rate": 4.9874412547408694e-05, + "loss": 5.7685, + "step": 5368 + }, + { + "epoch": 0.031930963935674184, + "grad_norm": 2.140322685241699, + "learning_rate": 4.987436578241906e-05, + "loss": 5.9015, + "step": 5369 + }, + { + "epoch": 0.03193691121895518, + "grad_norm": 2.2479233741760254, + "learning_rate": 4.987431900874604e-05, + "loss": 5.6079, + "step": 5370 + }, + { + "epoch": 0.03194285850223618, + "grad_norm": 2.0334317684173584, + "learning_rate": 4.987427222638965e-05, + "loss": 5.6364, + "step": 5371 + }, + { + "epoch": 0.031948805785517176, + "grad_norm": 2.0599231719970703, + "learning_rate": 4.987422543534991e-05, + "loss": 5.6578, + "step": 5372 + }, + { + "epoch": 0.03195475306879817, + "grad_norm": 2.237504720687866, + "learning_rate": 4.9874178635626836e-05, + "loss": 5.5784, + "step": 5373 + }, + { + "epoch": 0.03196070035207917, + "grad_norm": 2.013193130493164, + "learning_rate": 4.987413182722044e-05, + "loss": 5.4874, + "step": 5374 + }, + { + "epoch": 0.03196664763536017, + "grad_norm": 1.9806950092315674, + "learning_rate": 4.987408501013075e-05, + "loss": 5.41, + "step": 5375 + }, + { + "epoch": 0.03197259491864116, + "grad_norm": 1.7534204721450806, + "learning_rate": 4.9874038184357766e-05, + "loss": 5.4596, + "step": 5376 + }, + { + "epoch": 0.031978542201922165, + "grad_norm": 1.5722386837005615, + "learning_rate": 4.987399134990152e-05, + "loss": 5.508, + "step": 5377 + }, + { + "epoch": 0.03198448948520316, + "grad_norm": 7.868972301483154, + "learning_rate": 4.987394450676201e-05, + "loss": 5.1734, + "step": 5378 + }, + { + "epoch": 0.031990436768484155, + "grad_norm": 2.2103798389434814, + "learning_rate": 4.9873897654939274e-05, + "loss": 5.6766, + "step": 5379 + }, + { + "epoch": 0.03199638405176515, + "grad_norm": 1.9590017795562744, + "learning_rate": 4.9873850794433306e-05, + "loss": 5.7764, + "step": 5380 + }, + { + "epoch": 0.03200233133504615, + "grad_norm": 1.96006441116333, + "learning_rate": 4.9873803925244146e-05, + "loss": 5.7933, + "step": 5381 + }, + { + "epoch": 0.03200827861832715, + "grad_norm": 1.7377163171768188, + "learning_rate": 4.987375704737178e-05, + "loss": 5.692, + "step": 5382 + }, + { + "epoch": 0.03201422590160814, + "grad_norm": 2.0734782218933105, + "learning_rate": 4.9873710160816256e-05, + "loss": 5.5466, + "step": 5383 + }, + { + "epoch": 0.032020173184889145, + "grad_norm": 2.4700942039489746, + "learning_rate": 4.9873663265577574e-05, + "loss": 5.5837, + "step": 5384 + }, + { + "epoch": 0.03202612046817014, + "grad_norm": 2.067009925842285, + "learning_rate": 4.987361636165576e-05, + "loss": 5.4777, + "step": 5385 + }, + { + "epoch": 0.032032067751451135, + "grad_norm": 1.9585732221603394, + "learning_rate": 4.9873569449050815e-05, + "loss": 5.62, + "step": 5386 + }, + { + "epoch": 0.03203801503473214, + "grad_norm": 2.0210976600646973, + "learning_rate": 4.9873522527762766e-05, + "loss": 5.3554, + "step": 5387 + }, + { + "epoch": 0.03204396231801313, + "grad_norm": 2.0345299243927, + "learning_rate": 4.987347559779163e-05, + "loss": 5.3912, + "step": 5388 + }, + { + "epoch": 0.03204990960129413, + "grad_norm": 2.0960853099823, + "learning_rate": 4.987342865913742e-05, + "loss": 5.3497, + "step": 5389 + }, + { + "epoch": 0.03205585688457513, + "grad_norm": 2.0156044960021973, + "learning_rate": 4.987338171180015e-05, + "loss": 5.2769, + "step": 5390 + }, + { + "epoch": 0.032061804167856124, + "grad_norm": 2.0021722316741943, + "learning_rate": 4.987333475577984e-05, + "loss": 5.2338, + "step": 5391 + }, + { + "epoch": 0.03206775145113712, + "grad_norm": 1.8502025604248047, + "learning_rate": 4.987328779107651e-05, + "loss": 5.4231, + "step": 5392 + }, + { + "epoch": 0.03207369873441812, + "grad_norm": 2.0788064002990723, + "learning_rate": 4.987324081769016e-05, + "loss": 5.3989, + "step": 5393 + }, + { + "epoch": 0.032079646017699116, + "grad_norm": 5.172029495239258, + "learning_rate": 4.987319383562083e-05, + "loss": 6.5943, + "step": 5394 + }, + { + "epoch": 0.03208559330098011, + "grad_norm": 1.8732082843780518, + "learning_rate": 4.987314684486852e-05, + "loss": 5.3085, + "step": 5395 + }, + { + "epoch": 0.032091540584261107, + "grad_norm": 2.0511786937713623, + "learning_rate": 4.987309984543326e-05, + "loss": 5.1598, + "step": 5396 + }, + { + "epoch": 0.03209748786754211, + "grad_norm": 2.1821703910827637, + "learning_rate": 4.987305283731505e-05, + "loss": 5.3575, + "step": 5397 + }, + { + "epoch": 0.032103435150823104, + "grad_norm": 2.1190478801727295, + "learning_rate": 4.9873005820513906e-05, + "loss": 5.2371, + "step": 5398 + }, + { + "epoch": 0.0321093824341041, + "grad_norm": 2.1476964950561523, + "learning_rate": 4.987295879502987e-05, + "loss": 5.1378, + "step": 5399 + }, + { + "epoch": 0.0321153297173851, + "grad_norm": 2.3466129302978516, + "learning_rate": 4.987291176086293e-05, + "loss": 5.0642, + "step": 5400 + }, + { + "epoch": 0.032121277000666096, + "grad_norm": 2.267949104309082, + "learning_rate": 4.9872864718013115e-05, + "loss": 5.6835, + "step": 5401 + }, + { + "epoch": 0.03212722428394709, + "grad_norm": 3.1235604286193848, + "learning_rate": 4.987281766648044e-05, + "loss": 6.2094, + "step": 5402 + }, + { + "epoch": 0.03213317156722809, + "grad_norm": 2.494929790496826, + "learning_rate": 4.987277060626493e-05, + "loss": 6.2387, + "step": 5403 + }, + { + "epoch": 0.03213911885050909, + "grad_norm": 2.554422616958618, + "learning_rate": 4.987272353736658e-05, + "loss": 5.9655, + "step": 5404 + }, + { + "epoch": 0.03214506613379008, + "grad_norm": 3.688295841217041, + "learning_rate": 4.987267645978543e-05, + "loss": 6.3994, + "step": 5405 + }, + { + "epoch": 0.032151013417071085, + "grad_norm": 2.773847818374634, + "learning_rate": 4.987262937352147e-05, + "loss": 5.515, + "step": 5406 + }, + { + "epoch": 0.03215696070035208, + "grad_norm": 3.067812204360962, + "learning_rate": 4.987258227857475e-05, + "loss": 5.7388, + "step": 5407 + }, + { + "epoch": 0.032162907983633075, + "grad_norm": 3.0557258129119873, + "learning_rate": 4.987253517494525e-05, + "loss": 6.0334, + "step": 5408 + }, + { + "epoch": 0.03216885526691407, + "grad_norm": 2.2864489555358887, + "learning_rate": 4.9872488062633026e-05, + "loss": 6.2805, + "step": 5409 + }, + { + "epoch": 0.03217480255019507, + "grad_norm": 3.2848916053771973, + "learning_rate": 4.987244094163807e-05, + "loss": 6.4782, + "step": 5410 + }, + { + "epoch": 0.03218074983347607, + "grad_norm": 3.7147631645202637, + "learning_rate": 4.987239381196039e-05, + "loss": 6.6618, + "step": 5411 + }, + { + "epoch": 0.03218669711675706, + "grad_norm": 2.740705966949463, + "learning_rate": 4.9872346673600017e-05, + "loss": 6.0261, + "step": 5412 + }, + { + "epoch": 0.032192644400038065, + "grad_norm": 2.6408498287200928, + "learning_rate": 4.9872299526556965e-05, + "loss": 5.8645, + "step": 5413 + }, + { + "epoch": 0.03219859168331906, + "grad_norm": 2.8298256397247314, + "learning_rate": 4.987225237083125e-05, + "loss": 5.9263, + "step": 5414 + }, + { + "epoch": 0.032204538966600055, + "grad_norm": 2.9417197704315186, + "learning_rate": 4.987220520642289e-05, + "loss": 5.8018, + "step": 5415 + }, + { + "epoch": 0.03221048624988106, + "grad_norm": 3.2862906455993652, + "learning_rate": 4.9872158033331904e-05, + "loss": 5.8429, + "step": 5416 + }, + { + "epoch": 0.03221643353316205, + "grad_norm": 2.7724359035491943, + "learning_rate": 4.9872110851558306e-05, + "loss": 5.9504, + "step": 5417 + }, + { + "epoch": 0.03222238081644305, + "grad_norm": 2.2753829956054688, + "learning_rate": 4.9872063661102106e-05, + "loss": 5.6443, + "step": 5418 + }, + { + "epoch": 0.03222832809972405, + "grad_norm": 2.597649097442627, + "learning_rate": 4.987201646196332e-05, + "loss": 6.4441, + "step": 5419 + }, + { + "epoch": 0.032234275383005044, + "grad_norm": 2.7298800945281982, + "learning_rate": 4.987196925414198e-05, + "loss": 6.2988, + "step": 5420 + }, + { + "epoch": 0.03224022266628604, + "grad_norm": 3.2329537868499756, + "learning_rate": 4.987192203763809e-05, + "loss": 5.8743, + "step": 5421 + }, + { + "epoch": 0.03224616994956704, + "grad_norm": 3.033226251602173, + "learning_rate": 4.987187481245167e-05, + "loss": 5.4863, + "step": 5422 + }, + { + "epoch": 0.032252117232848036, + "grad_norm": 2.7728521823883057, + "learning_rate": 4.987182757858273e-05, + "loss": 5.5722, + "step": 5423 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 2.6083309650421143, + "learning_rate": 4.98717803360313e-05, + "loss": 6.5257, + "step": 5424 + }, + { + "epoch": 0.032264011799410026, + "grad_norm": 2.5422329902648926, + "learning_rate": 4.987173308479738e-05, + "loss": 6.5582, + "step": 5425 + }, + { + "epoch": 0.03226995908269103, + "grad_norm": 2.7634811401367188, + "learning_rate": 4.9871685824881e-05, + "loss": 6.0987, + "step": 5426 + }, + { + "epoch": 0.032275906365972024, + "grad_norm": 3.631476640701294, + "learning_rate": 4.987163855628217e-05, + "loss": 5.8506, + "step": 5427 + }, + { + "epoch": 0.03228185364925302, + "grad_norm": 2.9783661365509033, + "learning_rate": 4.9871591279000904e-05, + "loss": 5.9387, + "step": 5428 + }, + { + "epoch": 0.03228780093253402, + "grad_norm": 2.369645357131958, + "learning_rate": 4.9871543993037225e-05, + "loss": 5.8097, + "step": 5429 + }, + { + "epoch": 0.032293748215815016, + "grad_norm": 2.782055616378784, + "learning_rate": 4.9871496698391155e-05, + "loss": 5.5301, + "step": 5430 + }, + { + "epoch": 0.03229969549909601, + "grad_norm": 2.408205270767212, + "learning_rate": 4.98714493950627e-05, + "loss": 5.6514, + "step": 5431 + }, + { + "epoch": 0.03230564278237701, + "grad_norm": 2.0641589164733887, + "learning_rate": 4.987140208305187e-05, + "loss": 5.6168, + "step": 5432 + }, + { + "epoch": 0.03231159006565801, + "grad_norm": 2.109773874282837, + "learning_rate": 4.987135476235869e-05, + "loss": 5.6678, + "step": 5433 + }, + { + "epoch": 0.032317537348939, + "grad_norm": 2.9809730052948, + "learning_rate": 4.987130743298318e-05, + "loss": 6.0531, + "step": 5434 + }, + { + "epoch": 0.032323484632220005, + "grad_norm": 2.5728509426116943, + "learning_rate": 4.9871260094925365e-05, + "loss": 6.05, + "step": 5435 + }, + { + "epoch": 0.032329431915501, + "grad_norm": 2.477074146270752, + "learning_rate": 4.9871212748185236e-05, + "loss": 6.351, + "step": 5436 + }, + { + "epoch": 0.032335379198781995, + "grad_norm": 2.3485517501831055, + "learning_rate": 4.987116539276283e-05, + "loss": 6.3033, + "step": 5437 + }, + { + "epoch": 0.03234132648206299, + "grad_norm": 2.4214296340942383, + "learning_rate": 4.987111802865816e-05, + "loss": 6.1152, + "step": 5438 + }, + { + "epoch": 0.03234727376534399, + "grad_norm": 3.5628256797790527, + "learning_rate": 4.9871070655871234e-05, + "loss": 5.6502, + "step": 5439 + }, + { + "epoch": 0.03235322104862499, + "grad_norm": 3.190075159072876, + "learning_rate": 4.987102327440208e-05, + "loss": 5.4164, + "step": 5440 + }, + { + "epoch": 0.03235916833190598, + "grad_norm": 2.402754306793213, + "learning_rate": 4.9870975884250696e-05, + "loss": 5.7116, + "step": 5441 + }, + { + "epoch": 0.032365115615186985, + "grad_norm": 2.846653938293457, + "learning_rate": 4.987092848541712e-05, + "loss": 6.1456, + "step": 5442 + }, + { + "epoch": 0.03237106289846798, + "grad_norm": 2.6700549125671387, + "learning_rate": 4.987088107790136e-05, + "loss": 5.9777, + "step": 5443 + }, + { + "epoch": 0.032377010181748975, + "grad_norm": 2.8929460048675537, + "learning_rate": 4.987083366170343e-05, + "loss": 6.1459, + "step": 5444 + }, + { + "epoch": 0.03238295746502998, + "grad_norm": 2.524376153945923, + "learning_rate": 4.987078623682335e-05, + "loss": 6.4341, + "step": 5445 + }, + { + "epoch": 0.03238890474831097, + "grad_norm": 2.0901076793670654, + "learning_rate": 4.987073880326114e-05, + "loss": 6.3968, + "step": 5446 + }, + { + "epoch": 0.03239485203159197, + "grad_norm": 3.0033867359161377, + "learning_rate": 4.9870691361016805e-05, + "loss": 5.8656, + "step": 5447 + }, + { + "epoch": 0.03240079931487297, + "grad_norm": 2.7715492248535156, + "learning_rate": 4.987064391009038e-05, + "loss": 6.1634, + "step": 5448 + }, + { + "epoch": 0.032406746598153964, + "grad_norm": 2.6102347373962402, + "learning_rate": 4.9870596450481855e-05, + "loss": 6.2521, + "step": 5449 + }, + { + "epoch": 0.03241269388143496, + "grad_norm": 2.326253890991211, + "learning_rate": 4.9870548982191265e-05, + "loss": 6.2517, + "step": 5450 + }, + { + "epoch": 0.03241864116471596, + "grad_norm": 2.3012197017669678, + "learning_rate": 4.987050150521863e-05, + "loss": 6.2261, + "step": 5451 + }, + { + "epoch": 0.032424588447996956, + "grad_norm": 2.100337505340576, + "learning_rate": 4.987045401956396e-05, + "loss": 5.6291, + "step": 5452 + }, + { + "epoch": 0.03243053573127795, + "grad_norm": 3.094754219055176, + "learning_rate": 4.987040652522727e-05, + "loss": 5.897, + "step": 5453 + }, + { + "epoch": 0.032436483014558946, + "grad_norm": 2.7406179904937744, + "learning_rate": 4.987035902220857e-05, + "loss": 6.0083, + "step": 5454 + }, + { + "epoch": 0.03244243029783995, + "grad_norm": 2.4106287956237793, + "learning_rate": 4.9870311510507895e-05, + "loss": 5.8538, + "step": 5455 + }, + { + "epoch": 0.032448377581120944, + "grad_norm": 2.7335946559906006, + "learning_rate": 4.987026399012525e-05, + "loss": 5.9181, + "step": 5456 + }, + { + "epoch": 0.03245432486440194, + "grad_norm": 2.796175003051758, + "learning_rate": 4.987021646106064e-05, + "loss": 5.6461, + "step": 5457 + }, + { + "epoch": 0.03246027214768294, + "grad_norm": 3.086470127105713, + "learning_rate": 4.987016892331411e-05, + "loss": 5.6692, + "step": 5458 + }, + { + "epoch": 0.032466219430963936, + "grad_norm": 2.394465923309326, + "learning_rate": 4.9870121376885656e-05, + "loss": 6.3046, + "step": 5459 + }, + { + "epoch": 0.03247216671424493, + "grad_norm": 2.0745291709899902, + "learning_rate": 4.98700738217753e-05, + "loss": 6.0491, + "step": 5460 + }, + { + "epoch": 0.03247811399752593, + "grad_norm": 2.66359281539917, + "learning_rate": 4.987002625798305e-05, + "loss": 5.6468, + "step": 5461 + }, + { + "epoch": 0.03248406128080693, + "grad_norm": 2.392833948135376, + "learning_rate": 4.9869978685508936e-05, + "loss": 5.8421, + "step": 5462 + }, + { + "epoch": 0.03249000856408792, + "grad_norm": 2.671710252761841, + "learning_rate": 4.9869931104352975e-05, + "loss": 5.6892, + "step": 5463 + }, + { + "epoch": 0.032495955847368925, + "grad_norm": 2.7013144493103027, + "learning_rate": 4.986988351451517e-05, + "loss": 5.7911, + "step": 5464 + }, + { + "epoch": 0.03250190313064992, + "grad_norm": 1.926703929901123, + "learning_rate": 4.9869835915995555e-05, + "loss": 5.5492, + "step": 5465 + }, + { + "epoch": 0.032507850413930915, + "grad_norm": 2.5668530464172363, + "learning_rate": 4.986978830879413e-05, + "loss": 5.8949, + "step": 5466 + }, + { + "epoch": 0.03251379769721191, + "grad_norm": 2.555305004119873, + "learning_rate": 4.986974069291092e-05, + "loss": 5.7408, + "step": 5467 + }, + { + "epoch": 0.03251974498049291, + "grad_norm": 2.551226854324341, + "learning_rate": 4.986969306834594e-05, + "loss": 5.7738, + "step": 5468 + }, + { + "epoch": 0.03252569226377391, + "grad_norm": 2.3194847106933594, + "learning_rate": 4.986964543509921e-05, + "loss": 6.2837, + "step": 5469 + }, + { + "epoch": 0.0325316395470549, + "grad_norm": 1.9618690013885498, + "learning_rate": 4.986959779317074e-05, + "loss": 5.9236, + "step": 5470 + }, + { + "epoch": 0.032537586830335904, + "grad_norm": 2.351971387863159, + "learning_rate": 4.986955014256055e-05, + "loss": 5.591, + "step": 5471 + }, + { + "epoch": 0.0325435341136169, + "grad_norm": 2.3772034645080566, + "learning_rate": 4.986950248326866e-05, + "loss": 5.6785, + "step": 5472 + }, + { + "epoch": 0.032549481396897895, + "grad_norm": 2.5764195919036865, + "learning_rate": 4.9869454815295085e-05, + "loss": 5.525, + "step": 5473 + }, + { + "epoch": 0.0325554286801789, + "grad_norm": 2.231048107147217, + "learning_rate": 4.986940713863984e-05, + "loss": 5.6789, + "step": 5474 + }, + { + "epoch": 0.03256137596345989, + "grad_norm": 2.8053946495056152, + "learning_rate": 4.986935945330294e-05, + "loss": 5.6319, + "step": 5475 + }, + { + "epoch": 0.03256732324674089, + "grad_norm": 3.4610519409179688, + "learning_rate": 4.98693117592844e-05, + "loss": 5.9855, + "step": 5476 + }, + { + "epoch": 0.03257327053002189, + "grad_norm": 2.5019664764404297, + "learning_rate": 4.986926405658425e-05, + "loss": 5.9997, + "step": 5477 + }, + { + "epoch": 0.032579217813302884, + "grad_norm": 2.6583313941955566, + "learning_rate": 4.986921634520249e-05, + "loss": 6.3755, + "step": 5478 + }, + { + "epoch": 0.03258516509658388, + "grad_norm": 2.990699291229248, + "learning_rate": 4.986916862513914e-05, + "loss": 5.8932, + "step": 5479 + }, + { + "epoch": 0.03259111237986488, + "grad_norm": 3.282546043395996, + "learning_rate": 4.986912089639423e-05, + "loss": 5.5508, + "step": 5480 + }, + { + "epoch": 0.032597059663145876, + "grad_norm": 3.1012487411499023, + "learning_rate": 4.9869073158967755e-05, + "loss": 5.5567, + "step": 5481 + }, + { + "epoch": 0.03260300694642687, + "grad_norm": 2.141892433166504, + "learning_rate": 4.986902541285975e-05, + "loss": 5.6195, + "step": 5482 + }, + { + "epoch": 0.032608954229707866, + "grad_norm": 2.173670530319214, + "learning_rate": 4.986897765807023e-05, + "loss": 5.6913, + "step": 5483 + }, + { + "epoch": 0.03261490151298887, + "grad_norm": 2.4076435565948486, + "learning_rate": 4.98689298945992e-05, + "loss": 5.8324, + "step": 5484 + }, + { + "epoch": 0.03262084879626986, + "grad_norm": 2.8968818187713623, + "learning_rate": 4.986888212244668e-05, + "loss": 6.0086, + "step": 5485 + }, + { + "epoch": 0.03262679607955086, + "grad_norm": 2.2434191703796387, + "learning_rate": 4.9868834341612696e-05, + "loss": 5.9645, + "step": 5486 + }, + { + "epoch": 0.03263274336283186, + "grad_norm": 1.9683157205581665, + "learning_rate": 4.9868786552097255e-05, + "loss": 5.9173, + "step": 5487 + }, + { + "epoch": 0.032638690646112856, + "grad_norm": 2.369816303253174, + "learning_rate": 4.9868738753900384e-05, + "loss": 6.2728, + "step": 5488 + }, + { + "epoch": 0.03264463792939385, + "grad_norm": 2.1152775287628174, + "learning_rate": 4.986869094702209e-05, + "loss": 6.0474, + "step": 5489 + }, + { + "epoch": 0.03265058521267485, + "grad_norm": 2.3219857215881348, + "learning_rate": 4.9868643131462397e-05, + "loss": 5.7451, + "step": 5490 + }, + { + "epoch": 0.03265653249595585, + "grad_norm": 2.236046075820923, + "learning_rate": 4.986859530722131e-05, + "loss": 5.7775, + "step": 5491 + }, + { + "epoch": 0.03266247977923684, + "grad_norm": 2.3334364891052246, + "learning_rate": 4.986854747429886e-05, + "loss": 5.7429, + "step": 5492 + }, + { + "epoch": 0.032668427062517845, + "grad_norm": 2.5464704036712646, + "learning_rate": 4.986849963269505e-05, + "loss": 5.5781, + "step": 5493 + }, + { + "epoch": 0.03267437434579884, + "grad_norm": 2.104419469833374, + "learning_rate": 4.986845178240991e-05, + "loss": 5.6378, + "step": 5494 + }, + { + "epoch": 0.032680321629079835, + "grad_norm": 2.3115224838256836, + "learning_rate": 4.9868403923443444e-05, + "loss": 5.7617, + "step": 5495 + }, + { + "epoch": 0.03268626891236083, + "grad_norm": 2.3370540142059326, + "learning_rate": 4.9868356055795685e-05, + "loss": 6.1278, + "step": 5496 + }, + { + "epoch": 0.03269221619564183, + "grad_norm": 2.8618736267089844, + "learning_rate": 4.986830817946663e-05, + "loss": 6.0879, + "step": 5497 + }, + { + "epoch": 0.03269816347892283, + "grad_norm": 2.3229949474334717, + "learning_rate": 4.986826029445631e-05, + "loss": 6.0915, + "step": 5498 + }, + { + "epoch": 0.03270411076220382, + "grad_norm": 2.549914598464966, + "learning_rate": 4.986821240076473e-05, + "loss": 6.2375, + "step": 5499 + }, + { + "epoch": 0.032710058045484824, + "grad_norm": 2.595916271209717, + "learning_rate": 4.986816449839192e-05, + "loss": 6.095, + "step": 5500 + }, + { + "epoch": 0.03271600532876582, + "grad_norm": 2.4409420490264893, + "learning_rate": 4.98681165873379e-05, + "loss": 5.353, + "step": 5501 + }, + { + "epoch": 0.032721952612046815, + "grad_norm": 2.550156593322754, + "learning_rate": 4.986806866760266e-05, + "loss": 5.558, + "step": 5502 + }, + { + "epoch": 0.03272789989532782, + "grad_norm": 2.7811737060546875, + "learning_rate": 4.986802073918625e-05, + "loss": 5.7174, + "step": 5503 + }, + { + "epoch": 0.03273384717860881, + "grad_norm": 2.8430123329162598, + "learning_rate": 4.986797280208866e-05, + "loss": 5.5644, + "step": 5504 + }, + { + "epoch": 0.03273979446188981, + "grad_norm": 3.021040201187134, + "learning_rate": 4.986792485630992e-05, + "loss": 5.9451, + "step": 5505 + }, + { + "epoch": 0.03274574174517081, + "grad_norm": 2.69866681098938, + "learning_rate": 4.986787690185005e-05, + "loss": 5.9934, + "step": 5506 + }, + { + "epoch": 0.032751689028451804, + "grad_norm": 2.7202444076538086, + "learning_rate": 4.986782893870906e-05, + "loss": 6.1298, + "step": 5507 + }, + { + "epoch": 0.0327576363117328, + "grad_norm": 2.223405122756958, + "learning_rate": 4.986778096688696e-05, + "loss": 5.8968, + "step": 5508 + }, + { + "epoch": 0.0327635835950138, + "grad_norm": 2.5733680725097656, + "learning_rate": 4.986773298638378e-05, + "loss": 6.0928, + "step": 5509 + }, + { + "epoch": 0.032769530878294796, + "grad_norm": 2.584397554397583, + "learning_rate": 4.986768499719953e-05, + "loss": 5.7879, + "step": 5510 + }, + { + "epoch": 0.03277547816157579, + "grad_norm": 3.160489797592163, + "learning_rate": 4.986763699933423e-05, + "loss": 5.6413, + "step": 5511 + }, + { + "epoch": 0.032781425444856786, + "grad_norm": 2.8224406242370605, + "learning_rate": 4.9867588992787894e-05, + "loss": 6.1476, + "step": 5512 + }, + { + "epoch": 0.03278737272813779, + "grad_norm": 2.2565996646881104, + "learning_rate": 4.986754097756054e-05, + "loss": 6.208, + "step": 5513 + }, + { + "epoch": 0.03279332001141878, + "grad_norm": 2.5425479412078857, + "learning_rate": 4.9867492953652184e-05, + "loss": 5.934, + "step": 5514 + }, + { + "epoch": 0.03279926729469978, + "grad_norm": 2.6598689556121826, + "learning_rate": 4.986744492106284e-05, + "loss": 5.7433, + "step": 5515 + }, + { + "epoch": 0.03280521457798078, + "grad_norm": 2.419388771057129, + "learning_rate": 4.986739687979253e-05, + "loss": 5.378, + "step": 5516 + }, + { + "epoch": 0.032811161861261776, + "grad_norm": 2.72784161567688, + "learning_rate": 4.986734882984127e-05, + "loss": 5.4089, + "step": 5517 + }, + { + "epoch": 0.03281710914454277, + "grad_norm": 3.0592923164367676, + "learning_rate": 4.9867300771209075e-05, + "loss": 5.9573, + "step": 5518 + }, + { + "epoch": 0.03282305642782377, + "grad_norm": 2.7681832313537598, + "learning_rate": 4.9867252703895965e-05, + "loss": 5.5325, + "step": 5519 + }, + { + "epoch": 0.03282900371110477, + "grad_norm": 2.6752777099609375, + "learning_rate": 4.9867204627901946e-05, + "loss": 5.7543, + "step": 5520 + }, + { + "epoch": 0.03283495099438576, + "grad_norm": 2.481203317642212, + "learning_rate": 4.9867156543227046e-05, + "loss": 5.575, + "step": 5521 + }, + { + "epoch": 0.032840898277666765, + "grad_norm": 2.6403908729553223, + "learning_rate": 4.986710844987128e-05, + "loss": 5.4381, + "step": 5522 + }, + { + "epoch": 0.03284684556094776, + "grad_norm": 2.6146085262298584, + "learning_rate": 4.986706034783466e-05, + "loss": 5.8672, + "step": 5523 + }, + { + "epoch": 0.032852792844228755, + "grad_norm": 3.453666925430298, + "learning_rate": 4.986701223711722e-05, + "loss": 5.8353, + "step": 5524 + }, + { + "epoch": 0.03285874012750975, + "grad_norm": 2.511216640472412, + "learning_rate": 4.986696411771895e-05, + "loss": 5.9567, + "step": 5525 + }, + { + "epoch": 0.03286468741079075, + "grad_norm": 2.57395601272583, + "learning_rate": 4.986691598963988e-05, + "loss": 5.6396, + "step": 5526 + }, + { + "epoch": 0.03287063469407175, + "grad_norm": 2.778801441192627, + "learning_rate": 4.986686785288003e-05, + "loss": 6.0237, + "step": 5527 + }, + { + "epoch": 0.03287658197735274, + "grad_norm": 2.5216047763824463, + "learning_rate": 4.986681970743941e-05, + "loss": 6.1305, + "step": 5528 + }, + { + "epoch": 0.032882529260633744, + "grad_norm": 2.5105085372924805, + "learning_rate": 4.986677155331804e-05, + "loss": 6.4951, + "step": 5529 + }, + { + "epoch": 0.03288847654391474, + "grad_norm": 2.4105372428894043, + "learning_rate": 4.9866723390515946e-05, + "loss": 6.291, + "step": 5530 + }, + { + "epoch": 0.032894423827195735, + "grad_norm": 2.740095853805542, + "learning_rate": 4.9866675219033125e-05, + "loss": 5.762, + "step": 5531 + }, + { + "epoch": 0.03290037111047674, + "grad_norm": 2.327892541885376, + "learning_rate": 4.9866627038869605e-05, + "loss": 6.1023, + "step": 5532 + }, + { + "epoch": 0.03290631839375773, + "grad_norm": 2.71732497215271, + "learning_rate": 4.9866578850025414e-05, + "loss": 6.0739, + "step": 5533 + }, + { + "epoch": 0.03291226567703873, + "grad_norm": 2.1895039081573486, + "learning_rate": 4.9866530652500545e-05, + "loss": 5.801, + "step": 5534 + }, + { + "epoch": 0.03291821296031973, + "grad_norm": 2.39670729637146, + "learning_rate": 4.986648244629503e-05, + "loss": 6.0105, + "step": 5535 + }, + { + "epoch": 0.032924160243600724, + "grad_norm": 2.14630126953125, + "learning_rate": 4.986643423140889e-05, + "loss": 5.8457, + "step": 5536 + }, + { + "epoch": 0.03293010752688172, + "grad_norm": 2.111196994781494, + "learning_rate": 4.9866386007842125e-05, + "loss": 6.0804, + "step": 5537 + }, + { + "epoch": 0.03293605481016272, + "grad_norm": 2.8245434761047363, + "learning_rate": 4.986633777559476e-05, + "loss": 6.3152, + "step": 5538 + }, + { + "epoch": 0.032942002093443716, + "grad_norm": 2.3561060428619385, + "learning_rate": 4.9866289534666824e-05, + "loss": 6.286, + "step": 5539 + }, + { + "epoch": 0.03294794937672471, + "grad_norm": 3.21701979637146, + "learning_rate": 4.986624128505832e-05, + "loss": 5.9775, + "step": 5540 + }, + { + "epoch": 0.032953896660005706, + "grad_norm": 3.9414072036743164, + "learning_rate": 4.9866193026769265e-05, + "loss": 5.9413, + "step": 5541 + }, + { + "epoch": 0.03295984394328671, + "grad_norm": 2.7801051139831543, + "learning_rate": 4.986614475979968e-05, + "loss": 5.8642, + "step": 5542 + }, + { + "epoch": 0.0329657912265677, + "grad_norm": 2.7095935344696045, + "learning_rate": 4.986609648414958e-05, + "loss": 5.6952, + "step": 5543 + }, + { + "epoch": 0.0329717385098487, + "grad_norm": 2.5800812244415283, + "learning_rate": 4.986604819981898e-05, + "loss": 6.0285, + "step": 5544 + }, + { + "epoch": 0.0329776857931297, + "grad_norm": 2.6105730533599854, + "learning_rate": 4.9865999906807904e-05, + "loss": 5.6683, + "step": 5545 + }, + { + "epoch": 0.032983633076410696, + "grad_norm": 2.635570764541626, + "learning_rate": 4.9865951605116366e-05, + "loss": 5.9092, + "step": 5546 + }, + { + "epoch": 0.03298958035969169, + "grad_norm": 2.3708200454711914, + "learning_rate": 4.9865903294744373e-05, + "loss": 6.0034, + "step": 5547 + }, + { + "epoch": 0.03299552764297269, + "grad_norm": 2.437201499938965, + "learning_rate": 4.986585497569196e-05, + "loss": 6.2587, + "step": 5548 + }, + { + "epoch": 0.03300147492625369, + "grad_norm": 2.076016426086426, + "learning_rate": 4.9865806647959126e-05, + "loss": 6.358, + "step": 5549 + }, + { + "epoch": 0.03300742220953468, + "grad_norm": 1.8261257410049438, + "learning_rate": 4.98657583115459e-05, + "loss": 6.0431, + "step": 5550 + }, + { + "epoch": 0.033013369492815685, + "grad_norm": 2.8339858055114746, + "learning_rate": 4.98657099664523e-05, + "loss": 5.7956, + "step": 5551 + }, + { + "epoch": 0.03301931677609668, + "grad_norm": 2.7288596630096436, + "learning_rate": 4.986566161267833e-05, + "loss": 5.7092, + "step": 5552 + }, + { + "epoch": 0.033025264059377675, + "grad_norm": 2.7197329998016357, + "learning_rate": 4.986561325022402e-05, + "loss": 5.649, + "step": 5553 + }, + { + "epoch": 0.03303121134265867, + "grad_norm": 2.6161739826202393, + "learning_rate": 4.986556487908937e-05, + "loss": 5.6935, + "step": 5554 + }, + { + "epoch": 0.03303715862593967, + "grad_norm": 2.695068597793579, + "learning_rate": 4.986551649927441e-05, + "loss": 5.6901, + "step": 5555 + }, + { + "epoch": 0.03304310590922067, + "grad_norm": 3.0315186977386475, + "learning_rate": 4.986546811077917e-05, + "loss": 5.6317, + "step": 5556 + }, + { + "epoch": 0.03304905319250166, + "grad_norm": 2.3597543239593506, + "learning_rate": 4.986541971360364e-05, + "loss": 5.8129, + "step": 5557 + }, + { + "epoch": 0.033055000475782664, + "grad_norm": 2.8090550899505615, + "learning_rate": 4.986537130774785e-05, + "loss": 6.4427, + "step": 5558 + }, + { + "epoch": 0.03306094775906366, + "grad_norm": 3.4232771396636963, + "learning_rate": 4.986532289321182e-05, + "loss": 6.5737, + "step": 5559 + }, + { + "epoch": 0.033066895042344654, + "grad_norm": 2.1425294876098633, + "learning_rate": 4.986527446999556e-05, + "loss": 6.2395, + "step": 5560 + }, + { + "epoch": 0.033072842325625657, + "grad_norm": 2.5348880290985107, + "learning_rate": 4.986522603809909e-05, + "loss": 6.0425, + "step": 5561 + }, + { + "epoch": 0.03307878960890665, + "grad_norm": 3.0824179649353027, + "learning_rate": 4.986517759752242e-05, + "loss": 5.8785, + "step": 5562 + }, + { + "epoch": 0.03308473689218765, + "grad_norm": 2.297706365585327, + "learning_rate": 4.986512914826558e-05, + "loss": 5.8989, + "step": 5563 + }, + { + "epoch": 0.03309068417546865, + "grad_norm": 2.866257667541504, + "learning_rate": 4.986508069032858e-05, + "loss": 5.8905, + "step": 5564 + }, + { + "epoch": 0.033096631458749644, + "grad_norm": 2.2450008392333984, + "learning_rate": 4.9865032223711436e-05, + "loss": 6.3302, + "step": 5565 + }, + { + "epoch": 0.03310257874203064, + "grad_norm": 2.235558271408081, + "learning_rate": 4.9864983748414166e-05, + "loss": 6.4235, + "step": 5566 + }, + { + "epoch": 0.03310852602531164, + "grad_norm": 2.5197713375091553, + "learning_rate": 4.986493526443679e-05, + "loss": 6.3999, + "step": 5567 + }, + { + "epoch": 0.033114473308592636, + "grad_norm": 2.5716195106506348, + "learning_rate": 4.986488677177932e-05, + "loss": 6.0258, + "step": 5568 + }, + { + "epoch": 0.03312042059187363, + "grad_norm": 2.468663454055786, + "learning_rate": 4.986483827044177e-05, + "loss": 6.7553, + "step": 5569 + }, + { + "epoch": 0.033126367875154626, + "grad_norm": 2.4334170818328857, + "learning_rate": 4.986478976042417e-05, + "loss": 6.4722, + "step": 5570 + }, + { + "epoch": 0.03313231515843563, + "grad_norm": 2.234487533569336, + "learning_rate": 4.986474124172652e-05, + "loss": 5.7158, + "step": 5571 + }, + { + "epoch": 0.03313826244171662, + "grad_norm": 2.8017537593841553, + "learning_rate": 4.9864692714348857e-05, + "loss": 5.9552, + "step": 5572 + }, + { + "epoch": 0.03314420972499762, + "grad_norm": 3.171354055404663, + "learning_rate": 4.986464417829118e-05, + "loss": 6.027, + "step": 5573 + }, + { + "epoch": 0.03315015700827862, + "grad_norm": 2.890169620513916, + "learning_rate": 4.9864595633553516e-05, + "loss": 6.2768, + "step": 5574 + }, + { + "epoch": 0.033156104291559615, + "grad_norm": 3.010934829711914, + "learning_rate": 4.986454708013587e-05, + "loss": 6.4054, + "step": 5575 + }, + { + "epoch": 0.03316205157484061, + "grad_norm": 2.143833875656128, + "learning_rate": 4.9864498518038274e-05, + "loss": 6.3771, + "step": 5576 + }, + { + "epoch": 0.03316799885812161, + "grad_norm": 2.2067418098449707, + "learning_rate": 4.986444994726074e-05, + "loss": 6.0158, + "step": 5577 + }, + { + "epoch": 0.03317394614140261, + "grad_norm": 2.3396403789520264, + "learning_rate": 4.986440136780328e-05, + "loss": 6.4286, + "step": 5578 + }, + { + "epoch": 0.0331798934246836, + "grad_norm": 2.8305866718292236, + "learning_rate": 4.9864352779665915e-05, + "loss": 5.7804, + "step": 5579 + }, + { + "epoch": 0.033185840707964605, + "grad_norm": 2.748194456100464, + "learning_rate": 4.9864304182848664e-05, + "loss": 6.1711, + "step": 5580 + }, + { + "epoch": 0.0331917879912456, + "grad_norm": 2.329761505126953, + "learning_rate": 4.9864255577351534e-05, + "loss": 6.2722, + "step": 5581 + }, + { + "epoch": 0.033197735274526595, + "grad_norm": 2.4633524417877197, + "learning_rate": 4.986420696317457e-05, + "loss": 6.1349, + "step": 5582 + }, + { + "epoch": 0.03320368255780759, + "grad_norm": 1.8909802436828613, + "learning_rate": 4.986415834031775e-05, + "loss": 6.2181, + "step": 5583 + }, + { + "epoch": 0.03320962984108859, + "grad_norm": 2.1794517040252686, + "learning_rate": 4.9864109708781104e-05, + "loss": 6.2808, + "step": 5584 + }, + { + "epoch": 0.03321557712436959, + "grad_norm": 2.1766669750213623, + "learning_rate": 4.986406106856466e-05, + "loss": 6.3004, + "step": 5585 + }, + { + "epoch": 0.03322152440765058, + "grad_norm": 2.27526593208313, + "learning_rate": 4.986401241966844e-05, + "loss": 5.9225, + "step": 5586 + }, + { + "epoch": 0.033227471690931584, + "grad_norm": 3.2843096256256104, + "learning_rate": 4.986396376209244e-05, + "loss": 5.8364, + "step": 5587 + }, + { + "epoch": 0.03323341897421258, + "grad_norm": 2.509831666946411, + "learning_rate": 4.9863915095836685e-05, + "loss": 5.6958, + "step": 5588 + }, + { + "epoch": 0.033239366257493574, + "grad_norm": 2.5235815048217773, + "learning_rate": 4.98638664209012e-05, + "loss": 5.4937, + "step": 5589 + }, + { + "epoch": 0.033245313540774576, + "grad_norm": 2.918334484100342, + "learning_rate": 4.986381773728599e-05, + "loss": 5.8284, + "step": 5590 + }, + { + "epoch": 0.03325126082405557, + "grad_norm": 2.8091490268707275, + "learning_rate": 4.986376904499108e-05, + "loss": 5.8126, + "step": 5591 + }, + { + "epoch": 0.03325720810733657, + "grad_norm": 2.555173635482788, + "learning_rate": 4.986372034401649e-05, + "loss": 5.6393, + "step": 5592 + }, + { + "epoch": 0.03326315539061757, + "grad_norm": 2.6366164684295654, + "learning_rate": 4.986367163436223e-05, + "loss": 6.6675, + "step": 5593 + }, + { + "epoch": 0.033269102673898564, + "grad_norm": 2.5691051483154297, + "learning_rate": 4.9863622916028316e-05, + "loss": 6.5808, + "step": 5594 + }, + { + "epoch": 0.03327504995717956, + "grad_norm": 2.239384889602661, + "learning_rate": 4.986357418901477e-05, + "loss": 6.0191, + "step": 5595 + }, + { + "epoch": 0.03328099724046056, + "grad_norm": 2.3877806663513184, + "learning_rate": 4.9863525453321614e-05, + "loss": 5.7429, + "step": 5596 + }, + { + "epoch": 0.033286944523741556, + "grad_norm": 2.559633731842041, + "learning_rate": 4.9863476708948846e-05, + "loss": 5.4866, + "step": 5597 + }, + { + "epoch": 0.03329289180702255, + "grad_norm": 3.7681171894073486, + "learning_rate": 4.98634279558965e-05, + "loss": 5.6139, + "step": 5598 + }, + { + "epoch": 0.033298839090303546, + "grad_norm": 3.999264717102051, + "learning_rate": 4.9863379194164594e-05, + "loss": 5.6031, + "step": 5599 + }, + { + "epoch": 0.03330478637358455, + "grad_norm": 3.1031601428985596, + "learning_rate": 4.986333042375313e-05, + "loss": 5.5397, + "step": 5600 + }, + { + "epoch": 0.03331073365686554, + "grad_norm": 3.104998826980591, + "learning_rate": 4.986328164466214e-05, + "loss": 5.4274, + "step": 5601 + }, + { + "epoch": 0.03331668094014654, + "grad_norm": 2.9426207542419434, + "learning_rate": 4.986323285689163e-05, + "loss": 5.5859, + "step": 5602 + }, + { + "epoch": 0.03332262822342754, + "grad_norm": 2.6912827491760254, + "learning_rate": 4.986318406044163e-05, + "loss": 5.7375, + "step": 5603 + }, + { + "epoch": 0.033328575506708535, + "grad_norm": 4.394237041473389, + "learning_rate": 4.9863135255312145e-05, + "loss": 5.8246, + "step": 5604 + }, + { + "epoch": 0.03333452278998953, + "grad_norm": 2.812197685241699, + "learning_rate": 4.986308644150319e-05, + "loss": 5.6263, + "step": 5605 + }, + { + "epoch": 0.03334047007327053, + "grad_norm": 3.1969878673553467, + "learning_rate": 4.98630376190148e-05, + "loss": 5.4174, + "step": 5606 + }, + { + "epoch": 0.03334641735655153, + "grad_norm": 2.6018595695495605, + "learning_rate": 4.9862988787846975e-05, + "loss": 5.3917, + "step": 5607 + }, + { + "epoch": 0.03335236463983252, + "grad_norm": 2.5274007320404053, + "learning_rate": 4.986293994799974e-05, + "loss": 5.4252, + "step": 5608 + }, + { + "epoch": 0.033358311923113525, + "grad_norm": 2.57043194770813, + "learning_rate": 4.9862891099473105e-05, + "loss": 5.5321, + "step": 5609 + }, + { + "epoch": 0.03336425920639452, + "grad_norm": 3.4353785514831543, + "learning_rate": 4.986284224226709e-05, + "loss": 5.6599, + "step": 5610 + }, + { + "epoch": 0.033370206489675515, + "grad_norm": 3.308945894241333, + "learning_rate": 4.986279337638172e-05, + "loss": 5.8668, + "step": 5611 + }, + { + "epoch": 0.03337615377295652, + "grad_norm": 2.789703607559204, + "learning_rate": 4.9862744501817006e-05, + "loss": 5.8352, + "step": 5612 + }, + { + "epoch": 0.03338210105623751, + "grad_norm": 1.9887118339538574, + "learning_rate": 4.986269561857296e-05, + "loss": 5.7527, + "step": 5613 + }, + { + "epoch": 0.03338804833951851, + "grad_norm": 2.5447990894317627, + "learning_rate": 4.986264672664961e-05, + "loss": 5.5539, + "step": 5614 + }, + { + "epoch": 0.0333939956227995, + "grad_norm": 2.2903668880462646, + "learning_rate": 4.9862597826046965e-05, + "loss": 5.4555, + "step": 5615 + }, + { + "epoch": 0.033399942906080504, + "grad_norm": 3.1669414043426514, + "learning_rate": 4.986254891676504e-05, + "loss": 5.6852, + "step": 5616 + }, + { + "epoch": 0.0334058901893615, + "grad_norm": 3.7491395473480225, + "learning_rate": 4.986249999880386e-05, + "loss": 5.682, + "step": 5617 + }, + { + "epoch": 0.033411837472642494, + "grad_norm": 3.0548582077026367, + "learning_rate": 4.986245107216343e-05, + "loss": 5.7844, + "step": 5618 + }, + { + "epoch": 0.033417784755923496, + "grad_norm": 2.628957509994507, + "learning_rate": 4.986240213684378e-05, + "loss": 5.5646, + "step": 5619 + }, + { + "epoch": 0.03342373203920449, + "grad_norm": 2.050936460494995, + "learning_rate": 4.986235319284492e-05, + "loss": 5.7187, + "step": 5620 + }, + { + "epoch": 0.03342967932248549, + "grad_norm": 2.2839999198913574, + "learning_rate": 4.986230424016688e-05, + "loss": 5.6613, + "step": 5621 + }, + { + "epoch": 0.03343562660576649, + "grad_norm": 2.177778959274292, + "learning_rate": 4.986225527880966e-05, + "loss": 5.7205, + "step": 5622 + }, + { + "epoch": 0.033441573889047484, + "grad_norm": 2.1690266132354736, + "learning_rate": 4.9862206308773286e-05, + "loss": 5.4344, + "step": 5623 + }, + { + "epoch": 0.03344752117232848, + "grad_norm": 2.0134127140045166, + "learning_rate": 4.9862157330057766e-05, + "loss": 5.7872, + "step": 5624 + }, + { + "epoch": 0.03345346845560948, + "grad_norm": 2.0246710777282715, + "learning_rate": 4.986210834266313e-05, + "loss": 5.3291, + "step": 5625 + }, + { + "epoch": 0.033459415738890476, + "grad_norm": 2.020939350128174, + "learning_rate": 4.986205934658939e-05, + "loss": 5.3966, + "step": 5626 + }, + { + "epoch": 0.03346536302217147, + "grad_norm": 2.3261308670043945, + "learning_rate": 4.986201034183655e-05, + "loss": 5.4667, + "step": 5627 + }, + { + "epoch": 0.033471310305452466, + "grad_norm": 2.135641574859619, + "learning_rate": 4.9861961328404646e-05, + "loss": 5.4925, + "step": 5628 + }, + { + "epoch": 0.03347725758873347, + "grad_norm": 2.3122894763946533, + "learning_rate": 4.986191230629369e-05, + "loss": 5.6665, + "step": 5629 + }, + { + "epoch": 0.03348320487201446, + "grad_norm": 2.4461214542388916, + "learning_rate": 4.98618632755037e-05, + "loss": 5.8442, + "step": 5630 + }, + { + "epoch": 0.03348915215529546, + "grad_norm": 2.189009189605713, + "learning_rate": 4.9861814236034685e-05, + "loss": 5.5793, + "step": 5631 + }, + { + "epoch": 0.03349509943857646, + "grad_norm": 2.1961586475372314, + "learning_rate": 4.986176518788667e-05, + "loss": 5.5364, + "step": 5632 + }, + { + "epoch": 0.033501046721857455, + "grad_norm": 2.120177745819092, + "learning_rate": 4.986171613105967e-05, + "loss": 5.4042, + "step": 5633 + }, + { + "epoch": 0.03350699400513845, + "grad_norm": 1.9021252393722534, + "learning_rate": 4.9861667065553696e-05, + "loss": 5.2665, + "step": 5634 + }, + { + "epoch": 0.03351294128841945, + "grad_norm": 1.8944766521453857, + "learning_rate": 4.986161799136878e-05, + "loss": 5.3853, + "step": 5635 + }, + { + "epoch": 0.03351888857170045, + "grad_norm": 2.059847354888916, + "learning_rate": 4.9861568908504916e-05, + "loss": 5.3046, + "step": 5636 + }, + { + "epoch": 0.03352483585498144, + "grad_norm": 2.1350111961364746, + "learning_rate": 4.9861519816962155e-05, + "loss": 5.3684, + "step": 5637 + }, + { + "epoch": 0.033530783138262445, + "grad_norm": 2.0733792781829834, + "learning_rate": 4.986147071674048e-05, + "loss": 5.4581, + "step": 5638 + }, + { + "epoch": 0.03353673042154344, + "grad_norm": 2.0736827850341797, + "learning_rate": 4.986142160783993e-05, + "loss": 5.7019, + "step": 5639 + }, + { + "epoch": 0.033542677704824435, + "grad_norm": 2.1903107166290283, + "learning_rate": 4.986137249026051e-05, + "loss": 5.4353, + "step": 5640 + }, + { + "epoch": 0.03354862498810544, + "grad_norm": 2.2678940296173096, + "learning_rate": 4.9861323364002244e-05, + "loss": 5.4951, + "step": 5641 + }, + { + "epoch": 0.03355457227138643, + "grad_norm": 3.590702772140503, + "learning_rate": 4.9861274229065145e-05, + "loss": 6.1522, + "step": 5642 + }, + { + "epoch": 0.03356051955466743, + "grad_norm": 2.0955893993377686, + "learning_rate": 4.9861225085449224e-05, + "loss": 5.3544, + "step": 5643 + }, + { + "epoch": 0.03356646683794842, + "grad_norm": 1.9370301961898804, + "learning_rate": 4.986117593315452e-05, + "loss": 5.4732, + "step": 5644 + }, + { + "epoch": 0.033572414121229424, + "grad_norm": 2.141752243041992, + "learning_rate": 4.986112677218103e-05, + "loss": 5.5768, + "step": 5645 + }, + { + "epoch": 0.03357836140451042, + "grad_norm": 1.9236360788345337, + "learning_rate": 4.986107760252878e-05, + "loss": 5.7641, + "step": 5646 + }, + { + "epoch": 0.033584308687791414, + "grad_norm": 1.8353725671768188, + "learning_rate": 4.9861028424197785e-05, + "loss": 5.8011, + "step": 5647 + }, + { + "epoch": 0.033590255971072416, + "grad_norm": 2.0918078422546387, + "learning_rate": 4.9860979237188055e-05, + "loss": 5.6862, + "step": 5648 + }, + { + "epoch": 0.03359620325435341, + "grad_norm": 2.2244462966918945, + "learning_rate": 4.986093004149962e-05, + "loss": 5.472, + "step": 5649 + }, + { + "epoch": 0.033602150537634407, + "grad_norm": 2.1517422199249268, + "learning_rate": 4.9860880837132495e-05, + "loss": 5.3655, + "step": 5650 + }, + { + "epoch": 0.03360809782091541, + "grad_norm": 2.241863489151001, + "learning_rate": 4.986083162408669e-05, + "loss": 5.5385, + "step": 5651 + }, + { + "epoch": 0.033614045104196404, + "grad_norm": 2.458171844482422, + "learning_rate": 4.986078240236222e-05, + "loss": 5.5531, + "step": 5652 + }, + { + "epoch": 0.0336199923874774, + "grad_norm": 2.2601864337921143, + "learning_rate": 4.986073317195911e-05, + "loss": 5.9313, + "step": 5653 + }, + { + "epoch": 0.0336259396707584, + "grad_norm": 2.243647575378418, + "learning_rate": 4.986068393287738e-05, + "loss": 5.4064, + "step": 5654 + }, + { + "epoch": 0.033631886954039396, + "grad_norm": 2.283515453338623, + "learning_rate": 4.986063468511704e-05, + "loss": 5.295, + "step": 5655 + }, + { + "epoch": 0.03363783423732039, + "grad_norm": 2.701770305633545, + "learning_rate": 4.986058542867811e-05, + "loss": 5.8548, + "step": 5656 + }, + { + "epoch": 0.033643781520601386, + "grad_norm": 2.8186864852905273, + "learning_rate": 4.98605361635606e-05, + "loss": 5.378, + "step": 5657 + }, + { + "epoch": 0.03364972880388239, + "grad_norm": 2.6508500576019287, + "learning_rate": 4.9860486889764536e-05, + "loss": 5.469, + "step": 5658 + }, + { + "epoch": 0.03365567608716338, + "grad_norm": 2.3984878063201904, + "learning_rate": 4.986043760728994e-05, + "loss": 5.3978, + "step": 5659 + }, + { + "epoch": 0.03366162337044438, + "grad_norm": 3.64663028717041, + "learning_rate": 4.9860388316136814e-05, + "loss": 5.502, + "step": 5660 + }, + { + "epoch": 0.03366757065372538, + "grad_norm": 3.1112046241760254, + "learning_rate": 4.986033901630519e-05, + "loss": 5.7347, + "step": 5661 + }, + { + "epoch": 0.033673517937006375, + "grad_norm": 2.619877338409424, + "learning_rate": 4.9860289707795074e-05, + "loss": 6.2099, + "step": 5662 + }, + { + "epoch": 0.03367946522028737, + "grad_norm": 2.0318470001220703, + "learning_rate": 4.986024039060648e-05, + "loss": 6.246, + "step": 5663 + }, + { + "epoch": 0.03368541250356837, + "grad_norm": 2.1484673023223877, + "learning_rate": 4.986019106473945e-05, + "loss": 6.1689, + "step": 5664 + }, + { + "epoch": 0.03369135978684937, + "grad_norm": 2.6159844398498535, + "learning_rate": 4.9860141730193974e-05, + "loss": 5.8217, + "step": 5665 + }, + { + "epoch": 0.03369730707013036, + "grad_norm": 2.5019965171813965, + "learning_rate": 4.9860092386970084e-05, + "loss": 6.1138, + "step": 5666 + }, + { + "epoch": 0.033703254353411365, + "grad_norm": 2.962315797805786, + "learning_rate": 4.9860043035067785e-05, + "loss": 5.7057, + "step": 5667 + }, + { + "epoch": 0.03370920163669236, + "grad_norm": 2.455721139907837, + "learning_rate": 4.9859993674487106e-05, + "loss": 5.6203, + "step": 5668 + }, + { + "epoch": 0.033715148919973355, + "grad_norm": 2.432368278503418, + "learning_rate": 4.9859944305228066e-05, + "loss": 6.2337, + "step": 5669 + }, + { + "epoch": 0.03372109620325436, + "grad_norm": 2.3222782611846924, + "learning_rate": 4.985989492729067e-05, + "loss": 6.2845, + "step": 5670 + }, + { + "epoch": 0.03372704348653535, + "grad_norm": 2.107440948486328, + "learning_rate": 4.985984554067494e-05, + "loss": 6.2404, + "step": 5671 + }, + { + "epoch": 0.03373299076981635, + "grad_norm": 1.9450268745422363, + "learning_rate": 4.98597961453809e-05, + "loss": 6.1679, + "step": 5672 + }, + { + "epoch": 0.03373893805309734, + "grad_norm": 1.7591795921325684, + "learning_rate": 4.9859746741408554e-05, + "loss": 6.3425, + "step": 5673 + }, + { + "epoch": 0.033744885336378344, + "grad_norm": 2.009420871734619, + "learning_rate": 4.985969732875794e-05, + "loss": 6.3607, + "step": 5674 + }, + { + "epoch": 0.03375083261965934, + "grad_norm": 2.097215175628662, + "learning_rate": 4.9859647907429054e-05, + "loss": 6.2009, + "step": 5675 + }, + { + "epoch": 0.033756779902940334, + "grad_norm": 1.7670379877090454, + "learning_rate": 4.985959847742192e-05, + "loss": 5.935, + "step": 5676 + }, + { + "epoch": 0.033762727186221336, + "grad_norm": 2.052022695541382, + "learning_rate": 4.985954903873656e-05, + "loss": 5.4054, + "step": 5677 + }, + { + "epoch": 0.03376867446950233, + "grad_norm": 1.9225167036056519, + "learning_rate": 4.985949959137298e-05, + "loss": 5.6905, + "step": 5678 + }, + { + "epoch": 0.033774621752783326, + "grad_norm": 2.4080653190612793, + "learning_rate": 4.985945013533122e-05, + "loss": 6.5566, + "step": 5679 + }, + { + "epoch": 0.03378056903606433, + "grad_norm": 2.8340251445770264, + "learning_rate": 4.985940067061128e-05, + "loss": 6.3556, + "step": 5680 + }, + { + "epoch": 0.033786516319345324, + "grad_norm": 2.2872672080993652, + "learning_rate": 4.985935119721317e-05, + "loss": 6.1806, + "step": 5681 + }, + { + "epoch": 0.03379246360262632, + "grad_norm": 3.309203863143921, + "learning_rate": 4.985930171513692e-05, + "loss": 6.1766, + "step": 5682 + }, + { + "epoch": 0.03379841088590732, + "grad_norm": 2.936709403991699, + "learning_rate": 4.985925222438255e-05, + "loss": 5.907, + "step": 5683 + }, + { + "epoch": 0.033804358169188316, + "grad_norm": 2.3226964473724365, + "learning_rate": 4.985920272495007e-05, + "loss": 5.5734, + "step": 5684 + }, + { + "epoch": 0.03381030545246931, + "grad_norm": 2.3053154945373535, + "learning_rate": 4.98591532168395e-05, + "loss": 6.5688, + "step": 5685 + }, + { + "epoch": 0.033816252735750306, + "grad_norm": 2.2494077682495117, + "learning_rate": 4.985910370005086e-05, + "loss": 6.3539, + "step": 5686 + }, + { + "epoch": 0.03382220001903131, + "grad_norm": 1.9559924602508545, + "learning_rate": 4.9859054174584155e-05, + "loss": 6.2015, + "step": 5687 + }, + { + "epoch": 0.0338281473023123, + "grad_norm": 2.7915425300598145, + "learning_rate": 4.985900464043942e-05, + "loss": 5.7426, + "step": 5688 + }, + { + "epoch": 0.0338340945855933, + "grad_norm": 2.448496103286743, + "learning_rate": 4.985895509761665e-05, + "loss": 6.2697, + "step": 5689 + }, + { + "epoch": 0.0338400418688743, + "grad_norm": 1.7736696004867554, + "learning_rate": 4.9858905546115885e-05, + "loss": 6.5513, + "step": 5690 + }, + { + "epoch": 0.033845989152155295, + "grad_norm": 1.668285608291626, + "learning_rate": 4.9858855985937136e-05, + "loss": 6.0179, + "step": 5691 + }, + { + "epoch": 0.03385193643543629, + "grad_norm": 2.157799243927002, + "learning_rate": 4.985880641708042e-05, + "loss": 6.1863, + "step": 5692 + }, + { + "epoch": 0.03385788371871729, + "grad_norm": 2.2437758445739746, + "learning_rate": 4.985875683954574e-05, + "loss": 6.128, + "step": 5693 + }, + { + "epoch": 0.03386383100199829, + "grad_norm": 2.8323628902435303, + "learning_rate": 4.9858707253333124e-05, + "loss": 6.2746, + "step": 5694 + }, + { + "epoch": 0.03386977828527928, + "grad_norm": 2.270587205886841, + "learning_rate": 4.98586576584426e-05, + "loss": 6.1002, + "step": 5695 + }, + { + "epoch": 0.033875725568560285, + "grad_norm": 1.9165533781051636, + "learning_rate": 4.985860805487417e-05, + "loss": 5.7016, + "step": 5696 + }, + { + "epoch": 0.03388167285184128, + "grad_norm": 2.230407953262329, + "learning_rate": 4.985855844262786e-05, + "loss": 5.9649, + "step": 5697 + }, + { + "epoch": 0.033887620135122275, + "grad_norm": 2.5094211101531982, + "learning_rate": 4.985850882170368e-05, + "loss": 6.0184, + "step": 5698 + }, + { + "epoch": 0.03389356741840328, + "grad_norm": 2.6195943355560303, + "learning_rate": 4.9858459192101656e-05, + "loss": 5.8501, + "step": 5699 + }, + { + "epoch": 0.03389951470168427, + "grad_norm": 2.747486114501953, + "learning_rate": 4.9858409553821794e-05, + "loss": 5.7066, + "step": 5700 + }, + { + "epoch": 0.03390546198496527, + "grad_norm": 2.154109001159668, + "learning_rate": 4.985835990686413e-05, + "loss": 6.1072, + "step": 5701 + }, + { + "epoch": 0.03391140926824626, + "grad_norm": 2.4329216480255127, + "learning_rate": 4.9858310251228655e-05, + "loss": 5.9552, + "step": 5702 + }, + { + "epoch": 0.033917356551527264, + "grad_norm": 2.4760935306549072, + "learning_rate": 4.9858260586915405e-05, + "loss": 5.9023, + "step": 5703 + }, + { + "epoch": 0.03392330383480826, + "grad_norm": 2.400474786758423, + "learning_rate": 4.9858210913924397e-05, + "loss": 6.1688, + "step": 5704 + }, + { + "epoch": 0.033929251118089254, + "grad_norm": 2.402930498123169, + "learning_rate": 4.9858161232255644e-05, + "loss": 6.0776, + "step": 5705 + }, + { + "epoch": 0.033935198401370256, + "grad_norm": 2.0408313274383545, + "learning_rate": 4.985811154190916e-05, + "loss": 6.1841, + "step": 5706 + }, + { + "epoch": 0.03394114568465125, + "grad_norm": 1.889190912246704, + "learning_rate": 4.9858061842884976e-05, + "loss": 5.9689, + "step": 5707 + }, + { + "epoch": 0.033947092967932246, + "grad_norm": 2.2231624126434326, + "learning_rate": 4.9858012135183086e-05, + "loss": 6.0009, + "step": 5708 + }, + { + "epoch": 0.03395304025121325, + "grad_norm": 2.0229554176330566, + "learning_rate": 4.985796241880353e-05, + "loss": 6.3237, + "step": 5709 + }, + { + "epoch": 0.033958987534494244, + "grad_norm": 2.0570971965789795, + "learning_rate": 4.985791269374631e-05, + "loss": 6.3104, + "step": 5710 + }, + { + "epoch": 0.03396493481777524, + "grad_norm": 2.584663152694702, + "learning_rate": 4.9857862960011454e-05, + "loss": 5.8493, + "step": 5711 + }, + { + "epoch": 0.03397088210105624, + "grad_norm": 1.7870328426361084, + "learning_rate": 4.985781321759897e-05, + "loss": 6.2321, + "step": 5712 + }, + { + "epoch": 0.033976829384337236, + "grad_norm": 2.201756000518799, + "learning_rate": 4.9857763466508886e-05, + "loss": 6.1936, + "step": 5713 + }, + { + "epoch": 0.03398277666761823, + "grad_norm": 2.4489476680755615, + "learning_rate": 4.9857713706741216e-05, + "loss": 6.11, + "step": 5714 + }, + { + "epoch": 0.033988723950899226, + "grad_norm": 2.007643461227417, + "learning_rate": 4.9857663938295964e-05, + "loss": 6.288, + "step": 5715 + }, + { + "epoch": 0.03399467123418023, + "grad_norm": 1.8299764394760132, + "learning_rate": 4.9857614161173165e-05, + "loss": 6.0719, + "step": 5716 + }, + { + "epoch": 0.03400061851746122, + "grad_norm": 1.7619884014129639, + "learning_rate": 4.985756437537283e-05, + "loss": 6.1418, + "step": 5717 + }, + { + "epoch": 0.03400656580074222, + "grad_norm": 1.9445360898971558, + "learning_rate": 4.985751458089498e-05, + "loss": 6.1223, + "step": 5718 + }, + { + "epoch": 0.03401251308402322, + "grad_norm": 2.2320010662078857, + "learning_rate": 4.985746477773962e-05, + "loss": 5.5239, + "step": 5719 + }, + { + "epoch": 0.034018460367304215, + "grad_norm": 2.631765365600586, + "learning_rate": 4.985741496590678e-05, + "loss": 5.6348, + "step": 5720 + }, + { + "epoch": 0.03402440765058521, + "grad_norm": 2.4715576171875, + "learning_rate": 4.985736514539647e-05, + "loss": 5.9608, + "step": 5721 + }, + { + "epoch": 0.03403035493386621, + "grad_norm": 2.633188009262085, + "learning_rate": 4.985731531620871e-05, + "loss": 5.602, + "step": 5722 + }, + { + "epoch": 0.03403630221714721, + "grad_norm": 2.4303035736083984, + "learning_rate": 4.9857265478343526e-05, + "loss": 5.495, + "step": 5723 + }, + { + "epoch": 0.0340422495004282, + "grad_norm": 2.463447332382202, + "learning_rate": 4.985721563180092e-05, + "loss": 5.4633, + "step": 5724 + }, + { + "epoch": 0.034048196783709204, + "grad_norm": 2.349965810775757, + "learning_rate": 4.985716577658092e-05, + "loss": 6.0067, + "step": 5725 + }, + { + "epoch": 0.0340541440669902, + "grad_norm": 1.8741793632507324, + "learning_rate": 4.985711591268354e-05, + "loss": 5.8658, + "step": 5726 + }, + { + "epoch": 0.034060091350271195, + "grad_norm": 1.957612156867981, + "learning_rate": 4.98570660401088e-05, + "loss": 6.2016, + "step": 5727 + }, + { + "epoch": 0.0340660386335522, + "grad_norm": 2.4883556365966797, + "learning_rate": 4.985701615885671e-05, + "loss": 6.3056, + "step": 5728 + }, + { + "epoch": 0.03407198591683319, + "grad_norm": 2.6959800720214844, + "learning_rate": 4.98569662689273e-05, + "loss": 5.7267, + "step": 5729 + }, + { + "epoch": 0.03407793320011419, + "grad_norm": 2.579802989959717, + "learning_rate": 4.985691637032057e-05, + "loss": 5.2467, + "step": 5730 + }, + { + "epoch": 0.03408388048339518, + "grad_norm": 2.136262893676758, + "learning_rate": 4.985686646303656e-05, + "loss": 5.7071, + "step": 5731 + }, + { + "epoch": 0.034089827766676184, + "grad_norm": 2.1442244052886963, + "learning_rate": 4.985681654707526e-05, + "loss": 6.3961, + "step": 5732 + }, + { + "epoch": 0.03409577504995718, + "grad_norm": 2.164340019226074, + "learning_rate": 4.9856766622436714e-05, + "loss": 6.2455, + "step": 5733 + }, + { + "epoch": 0.034101722333238174, + "grad_norm": 2.199791193008423, + "learning_rate": 4.985671668912092e-05, + "loss": 5.8804, + "step": 5734 + }, + { + "epoch": 0.034107669616519176, + "grad_norm": 2.0359933376312256, + "learning_rate": 4.9856666747127905e-05, + "loss": 6.359, + "step": 5735 + }, + { + "epoch": 0.03411361689980017, + "grad_norm": 2.17069935798645, + "learning_rate": 4.985661679645769e-05, + "loss": 6.6736, + "step": 5736 + }, + { + "epoch": 0.034119564183081166, + "grad_norm": 1.9114634990692139, + "learning_rate": 4.9856566837110275e-05, + "loss": 5.9629, + "step": 5737 + }, + { + "epoch": 0.03412551146636217, + "grad_norm": 2.2872474193573, + "learning_rate": 4.9856516869085704e-05, + "loss": 5.5856, + "step": 5738 + }, + { + "epoch": 0.03413145874964316, + "grad_norm": 2.0800466537475586, + "learning_rate": 4.9856466892383965e-05, + "loss": 5.7732, + "step": 5739 + }, + { + "epoch": 0.03413740603292416, + "grad_norm": 2.37117338180542, + "learning_rate": 4.98564169070051e-05, + "loss": 5.667, + "step": 5740 + }, + { + "epoch": 0.03414335331620516, + "grad_norm": 2.0559768676757812, + "learning_rate": 4.985636691294911e-05, + "loss": 5.4874, + "step": 5741 + }, + { + "epoch": 0.034149300599486156, + "grad_norm": 2.0097250938415527, + "learning_rate": 4.9856316910216024e-05, + "loss": 5.5469, + "step": 5742 + }, + { + "epoch": 0.03415524788276715, + "grad_norm": 2.430954933166504, + "learning_rate": 4.985626689880586e-05, + "loss": 5.7635, + "step": 5743 + }, + { + "epoch": 0.034161195166048146, + "grad_norm": 2.1000874042510986, + "learning_rate": 4.985621687871862e-05, + "loss": 5.7102, + "step": 5744 + }, + { + "epoch": 0.03416714244932915, + "grad_norm": 2.2048611640930176, + "learning_rate": 4.9856166849954336e-05, + "loss": 5.8156, + "step": 5745 + }, + { + "epoch": 0.03417308973261014, + "grad_norm": 2.145538330078125, + "learning_rate": 4.985611681251302e-05, + "loss": 5.9101, + "step": 5746 + }, + { + "epoch": 0.03417903701589114, + "grad_norm": 2.86169695854187, + "learning_rate": 4.9856066766394685e-05, + "loss": 5.7358, + "step": 5747 + }, + { + "epoch": 0.03418498429917214, + "grad_norm": 2.0648229122161865, + "learning_rate": 4.985601671159936e-05, + "loss": 6.0529, + "step": 5748 + }, + { + "epoch": 0.034190931582453135, + "grad_norm": 2.191251039505005, + "learning_rate": 4.985596664812706e-05, + "loss": 6.1999, + "step": 5749 + }, + { + "epoch": 0.03419687886573413, + "grad_norm": 2.556640148162842, + "learning_rate": 4.985591657597779e-05, + "loss": 6.0671, + "step": 5750 + }, + { + "epoch": 0.03420282614901513, + "grad_norm": 2.1796281337738037, + "learning_rate": 4.985586649515158e-05, + "loss": 6.1537, + "step": 5751 + }, + { + "epoch": 0.03420877343229613, + "grad_norm": 2.1884169578552246, + "learning_rate": 4.985581640564845e-05, + "loss": 5.7667, + "step": 5752 + }, + { + "epoch": 0.03421472071557712, + "grad_norm": 2.3836331367492676, + "learning_rate": 4.9855766307468404e-05, + "loss": 5.6608, + "step": 5753 + }, + { + "epoch": 0.034220667998858124, + "grad_norm": 2.0464322566986084, + "learning_rate": 4.985571620061147e-05, + "loss": 5.5317, + "step": 5754 + }, + { + "epoch": 0.03422661528213912, + "grad_norm": 2.3275644779205322, + "learning_rate": 4.9855666085077654e-05, + "loss": 5.8611, + "step": 5755 + }, + { + "epoch": 0.034232562565420115, + "grad_norm": 2.7268338203430176, + "learning_rate": 4.9855615960867e-05, + "loss": 5.6323, + "step": 5756 + }, + { + "epoch": 0.03423850984870112, + "grad_norm": 2.578986406326294, + "learning_rate": 4.985556582797949e-05, + "loss": 5.6108, + "step": 5757 + }, + { + "epoch": 0.03424445713198211, + "grad_norm": 2.4127955436706543, + "learning_rate": 4.985551568641516e-05, + "loss": 5.7054, + "step": 5758 + }, + { + "epoch": 0.03425040441526311, + "grad_norm": 2.1954357624053955, + "learning_rate": 4.985546553617404e-05, + "loss": 6.194, + "step": 5759 + }, + { + "epoch": 0.0342563516985441, + "grad_norm": 2.43851900100708, + "learning_rate": 4.985541537725612e-05, + "loss": 5.9067, + "step": 5760 + }, + { + "epoch": 0.034262298981825104, + "grad_norm": 2.0910801887512207, + "learning_rate": 4.9855365209661445e-05, + "loss": 6.1017, + "step": 5761 + }, + { + "epoch": 0.0342682462651061, + "grad_norm": 1.9936187267303467, + "learning_rate": 4.985531503339e-05, + "loss": 6.1239, + "step": 5762 + }, + { + "epoch": 0.034274193548387094, + "grad_norm": 2.0663299560546875, + "learning_rate": 4.985526484844183e-05, + "loss": 6.0514, + "step": 5763 + }, + { + "epoch": 0.034280140831668096, + "grad_norm": 2.4357266426086426, + "learning_rate": 4.985521465481695e-05, + "loss": 5.3695, + "step": 5764 + }, + { + "epoch": 0.03428608811494909, + "grad_norm": 2.12214994430542, + "learning_rate": 4.985516445251537e-05, + "loss": 5.5531, + "step": 5765 + }, + { + "epoch": 0.034292035398230086, + "grad_norm": 2.731661319732666, + "learning_rate": 4.9855114241537105e-05, + "loss": 6.2403, + "step": 5766 + }, + { + "epoch": 0.03429798268151109, + "grad_norm": 2.0668931007385254, + "learning_rate": 4.985506402188217e-05, + "loss": 6.0873, + "step": 5767 + }, + { + "epoch": 0.03430392996479208, + "grad_norm": 2.3165833950042725, + "learning_rate": 4.98550137935506e-05, + "loss": 5.9365, + "step": 5768 + }, + { + "epoch": 0.03430987724807308, + "grad_norm": 1.8637720346450806, + "learning_rate": 4.98549635565424e-05, + "loss": 6.0837, + "step": 5769 + }, + { + "epoch": 0.03431582453135408, + "grad_norm": 2.1689205169677734, + "learning_rate": 4.985491331085758e-05, + "loss": 5.703, + "step": 5770 + }, + { + "epoch": 0.034321771814635076, + "grad_norm": 2.245283365249634, + "learning_rate": 4.985486305649618e-05, + "loss": 6.0134, + "step": 5771 + }, + { + "epoch": 0.03432771909791607, + "grad_norm": 2.2685303688049316, + "learning_rate": 4.98548127934582e-05, + "loss": 5.279, + "step": 5772 + }, + { + "epoch": 0.034333666381197066, + "grad_norm": 2.376253128051758, + "learning_rate": 4.985476252174365e-05, + "loss": 5.5812, + "step": 5773 + }, + { + "epoch": 0.03433961366447807, + "grad_norm": 2.2636559009552, + "learning_rate": 4.985471224135257e-05, + "loss": 5.6906, + "step": 5774 + }, + { + "epoch": 0.03434556094775906, + "grad_norm": 2.22103214263916, + "learning_rate": 4.9854661952284965e-05, + "loss": 6.2066, + "step": 5775 + }, + { + "epoch": 0.03435150823104006, + "grad_norm": 2.308610439300537, + "learning_rate": 4.985461165454085e-05, + "loss": 6.1582, + "step": 5776 + }, + { + "epoch": 0.03435745551432106, + "grad_norm": 1.9191935062408447, + "learning_rate": 4.985456134812026e-05, + "loss": 5.4587, + "step": 5777 + }, + { + "epoch": 0.034363402797602055, + "grad_norm": 2.3127100467681885, + "learning_rate": 4.9854511033023184e-05, + "loss": 5.3375, + "step": 5778 + }, + { + "epoch": 0.03436935008088305, + "grad_norm": 2.4817371368408203, + "learning_rate": 4.985446070924966e-05, + "loss": 5.4961, + "step": 5779 + }, + { + "epoch": 0.03437529736416405, + "grad_norm": 2.0995922088623047, + "learning_rate": 4.9854410376799695e-05, + "loss": 5.7676, + "step": 5780 + }, + { + "epoch": 0.03438124464744505, + "grad_norm": 2.261229991912842, + "learning_rate": 4.985436003567332e-05, + "loss": 5.4446, + "step": 5781 + }, + { + "epoch": 0.03438719193072604, + "grad_norm": 2.275536060333252, + "learning_rate": 4.985430968587055e-05, + "loss": 5.4297, + "step": 5782 + }, + { + "epoch": 0.034393139214007044, + "grad_norm": 2.3733773231506348, + "learning_rate": 4.985425932739138e-05, + "loss": 5.7658, + "step": 5783 + }, + { + "epoch": 0.03439908649728804, + "grad_norm": 2.201716184616089, + "learning_rate": 4.985420896023586e-05, + "loss": 5.5502, + "step": 5784 + }, + { + "epoch": 0.034405033780569035, + "grad_norm": 2.1012730598449707, + "learning_rate": 4.9854158584403985e-05, + "loss": 5.7199, + "step": 5785 + }, + { + "epoch": 0.03441098106385004, + "grad_norm": 2.065568685531616, + "learning_rate": 4.985410819989579e-05, + "loss": 6.1547, + "step": 5786 + }, + { + "epoch": 0.03441692834713103, + "grad_norm": 1.9217867851257324, + "learning_rate": 4.9854057806711275e-05, + "loss": 6.2556, + "step": 5787 + }, + { + "epoch": 0.03442287563041203, + "grad_norm": 2.028602123260498, + "learning_rate": 4.985400740485047e-05, + "loss": 5.9347, + "step": 5788 + }, + { + "epoch": 0.03442882291369302, + "grad_norm": 2.002855062484741, + "learning_rate": 4.9853956994313376e-05, + "loss": 5.3966, + "step": 5789 + }, + { + "epoch": 0.034434770196974024, + "grad_norm": 2.3740642070770264, + "learning_rate": 4.985390657510003e-05, + "loss": 5.7801, + "step": 5790 + }, + { + "epoch": 0.03444071748025502, + "grad_norm": 2.1149635314941406, + "learning_rate": 4.9853856147210444e-05, + "loss": 5.6504, + "step": 5791 + }, + { + "epoch": 0.034446664763536014, + "grad_norm": 2.3519630432128906, + "learning_rate": 4.985380571064463e-05, + "loss": 5.9172, + "step": 5792 + }, + { + "epoch": 0.034452612046817016, + "grad_norm": 2.38930082321167, + "learning_rate": 4.985375526540261e-05, + "loss": 5.6196, + "step": 5793 + }, + { + "epoch": 0.03445855933009801, + "grad_norm": 2.245596408843994, + "learning_rate": 4.98537048114844e-05, + "loss": 5.5034, + "step": 5794 + }, + { + "epoch": 0.034464506613379006, + "grad_norm": 2.272158622741699, + "learning_rate": 4.985365434889002e-05, + "loss": 5.5867, + "step": 5795 + }, + { + "epoch": 0.03447045389666001, + "grad_norm": 2.2090094089508057, + "learning_rate": 4.9853603877619485e-05, + "loss": 5.68, + "step": 5796 + }, + { + "epoch": 0.034476401179941, + "grad_norm": 2.0545220375061035, + "learning_rate": 4.985355339767281e-05, + "loss": 5.8382, + "step": 5797 + }, + { + "epoch": 0.034482348463222, + "grad_norm": 2.143134593963623, + "learning_rate": 4.985350290905003e-05, + "loss": 5.5753, + "step": 5798 + }, + { + "epoch": 0.034488295746503, + "grad_norm": 2.3938257694244385, + "learning_rate": 4.985345241175114e-05, + "loss": 5.7545, + "step": 5799 + }, + { + "epoch": 0.034494243029783996, + "grad_norm": 2.132998466491699, + "learning_rate": 4.985340190577616e-05, + "loss": 5.5477, + "step": 5800 + }, + { + "epoch": 0.03450019031306499, + "grad_norm": 3.141417980194092, + "learning_rate": 4.9853351391125126e-05, + "loss": 5.3509, + "step": 5801 + }, + { + "epoch": 0.034506137596345986, + "grad_norm": 2.4776933193206787, + "learning_rate": 4.9853300867798034e-05, + "loss": 6.1052, + "step": 5802 + }, + { + "epoch": 0.03451208487962699, + "grad_norm": 2.1782073974609375, + "learning_rate": 4.985325033579492e-05, + "loss": 5.9599, + "step": 5803 + }, + { + "epoch": 0.03451803216290798, + "grad_norm": 2.2631704807281494, + "learning_rate": 4.9853199795115794e-05, + "loss": 5.534, + "step": 5804 + }, + { + "epoch": 0.03452397944618898, + "grad_norm": 2.140612840652466, + "learning_rate": 4.985314924576066e-05, + "loss": 5.7479, + "step": 5805 + }, + { + "epoch": 0.03452992672946998, + "grad_norm": 2.726651668548584, + "learning_rate": 4.9853098687729563e-05, + "loss": 5.4639, + "step": 5806 + }, + { + "epoch": 0.034535874012750975, + "grad_norm": 1.852423071861267, + "learning_rate": 4.985304812102249e-05, + "loss": 5.4209, + "step": 5807 + }, + { + "epoch": 0.03454182129603197, + "grad_norm": 2.5236833095550537, + "learning_rate": 4.9852997545639485e-05, + "loss": 5.9653, + "step": 5808 + }, + { + "epoch": 0.03454776857931297, + "grad_norm": 2.2740652561187744, + "learning_rate": 4.985294696158056e-05, + "loss": 5.9457, + "step": 5809 + }, + { + "epoch": 0.03455371586259397, + "grad_norm": 2.931777000427246, + "learning_rate": 4.9852896368845715e-05, + "loss": 5.6709, + "step": 5810 + }, + { + "epoch": 0.03455966314587496, + "grad_norm": 2.6981759071350098, + "learning_rate": 4.9852845767434986e-05, + "loss": 5.1747, + "step": 5811 + }, + { + "epoch": 0.034565610429155964, + "grad_norm": 2.2675211429595947, + "learning_rate": 4.985279515734839e-05, + "loss": 5.2393, + "step": 5812 + }, + { + "epoch": 0.03457155771243696, + "grad_norm": 2.535473346710205, + "learning_rate": 4.985274453858594e-05, + "loss": 6.2184, + "step": 5813 + }, + { + "epoch": 0.034577504995717954, + "grad_norm": 2.8692495822906494, + "learning_rate": 4.985269391114765e-05, + "loss": 5.2557, + "step": 5814 + }, + { + "epoch": 0.034583452278998957, + "grad_norm": 2.908472776412964, + "learning_rate": 4.985264327503354e-05, + "loss": 5.1559, + "step": 5815 + }, + { + "epoch": 0.03458939956227995, + "grad_norm": 2.3630192279815674, + "learning_rate": 4.985259263024363e-05, + "loss": 5.3159, + "step": 5816 + }, + { + "epoch": 0.03459534684556095, + "grad_norm": 2.1287102699279785, + "learning_rate": 4.9852541976777933e-05, + "loss": 5.2069, + "step": 5817 + }, + { + "epoch": 0.03460129412884194, + "grad_norm": 2.751567840576172, + "learning_rate": 4.985249131463647e-05, + "loss": 5.6561, + "step": 5818 + }, + { + "epoch": 0.034607241412122944, + "grad_norm": 2.505608081817627, + "learning_rate": 4.985244064381927e-05, + "loss": 5.9708, + "step": 5819 + }, + { + "epoch": 0.03461318869540394, + "grad_norm": 2.351593255996704, + "learning_rate": 4.9852389964326337e-05, + "loss": 5.9046, + "step": 5820 + }, + { + "epoch": 0.034619135978684934, + "grad_norm": 2.3037939071655273, + "learning_rate": 4.985233927615769e-05, + "loss": 6.0069, + "step": 5821 + }, + { + "epoch": 0.034625083261965936, + "grad_norm": 2.2482705116271973, + "learning_rate": 4.985228857931334e-05, + "loss": 5.9492, + "step": 5822 + }, + { + "epoch": 0.03463103054524693, + "grad_norm": 2.23640513420105, + "learning_rate": 4.985223787379332e-05, + "loss": 5.6631, + "step": 5823 + }, + { + "epoch": 0.034636977828527926, + "grad_norm": 2.710275411605835, + "learning_rate": 4.985218715959764e-05, + "loss": 5.5961, + "step": 5824 + }, + { + "epoch": 0.03464292511180893, + "grad_norm": 2.7220160961151123, + "learning_rate": 4.9852136436726313e-05, + "loss": 5.6922, + "step": 5825 + }, + { + "epoch": 0.03464887239508992, + "grad_norm": 2.4542758464813232, + "learning_rate": 4.985208570517937e-05, + "loss": 5.4742, + "step": 5826 + }, + { + "epoch": 0.03465481967837092, + "grad_norm": 2.7492685317993164, + "learning_rate": 4.9852034964956816e-05, + "loss": 5.4598, + "step": 5827 + }, + { + "epoch": 0.03466076696165192, + "grad_norm": 2.757937431335449, + "learning_rate": 4.9851984216058677e-05, + "loss": 6.1865, + "step": 5828 + }, + { + "epoch": 0.034666714244932915, + "grad_norm": 2.835890531539917, + "learning_rate": 4.985193345848497e-05, + "loss": 5.3368, + "step": 5829 + }, + { + "epoch": 0.03467266152821391, + "grad_norm": 2.694884777069092, + "learning_rate": 4.98518826922357e-05, + "loss": 5.3654, + "step": 5830 + }, + { + "epoch": 0.03467860881149491, + "grad_norm": 2.443784236907959, + "learning_rate": 4.98518319173109e-05, + "loss": 5.7879, + "step": 5831 + }, + { + "epoch": 0.03468455609477591, + "grad_norm": 2.0198488235473633, + "learning_rate": 4.985178113371058e-05, + "loss": 5.766, + "step": 5832 + }, + { + "epoch": 0.0346905033780569, + "grad_norm": 2.8718788623809814, + "learning_rate": 4.985173034143476e-05, + "loss": 5.5506, + "step": 5833 + }, + { + "epoch": 0.0346964506613379, + "grad_norm": 2.4353652000427246, + "learning_rate": 4.9851679540483455e-05, + "loss": 5.7139, + "step": 5834 + }, + { + "epoch": 0.0347023979446189, + "grad_norm": 1.9376598596572876, + "learning_rate": 4.985162873085669e-05, + "loss": 6.2326, + "step": 5835 + }, + { + "epoch": 0.034708345227899895, + "grad_norm": 2.2225289344787598, + "learning_rate": 4.985157791255448e-05, + "loss": 5.5997, + "step": 5836 + }, + { + "epoch": 0.03471429251118089, + "grad_norm": 2.011493682861328, + "learning_rate": 4.985152708557684e-05, + "loss": 5.6882, + "step": 5837 + }, + { + "epoch": 0.03472023979446189, + "grad_norm": 1.8679020404815674, + "learning_rate": 4.985147624992378e-05, + "loss": 5.5427, + "step": 5838 + }, + { + "epoch": 0.03472618707774289, + "grad_norm": 1.9470884799957275, + "learning_rate": 4.9851425405595334e-05, + "loss": 5.5957, + "step": 5839 + }, + { + "epoch": 0.03473213436102388, + "grad_norm": 2.0765669345855713, + "learning_rate": 4.985137455259151e-05, + "loss": 5.4416, + "step": 5840 + }, + { + "epoch": 0.034738081644304884, + "grad_norm": 2.0521979331970215, + "learning_rate": 4.985132369091233e-05, + "loss": 5.4641, + "step": 5841 + }, + { + "epoch": 0.03474402892758588, + "grad_norm": 1.7439172267913818, + "learning_rate": 4.985127282055781e-05, + "loss": 5.1998, + "step": 5842 + }, + { + "epoch": 0.034749976210866874, + "grad_norm": 1.7347313165664673, + "learning_rate": 4.985122194152797e-05, + "loss": 5.2392, + "step": 5843 + }, + { + "epoch": 0.034755923494147876, + "grad_norm": 1.7362169027328491, + "learning_rate": 4.985117105382282e-05, + "loss": 5.1769, + "step": 5844 + }, + { + "epoch": 0.03476187077742887, + "grad_norm": 1.7468090057373047, + "learning_rate": 4.985112015744239e-05, + "loss": 5.3915, + "step": 5845 + }, + { + "epoch": 0.03476781806070987, + "grad_norm": 1.8685250282287598, + "learning_rate": 4.985106925238668e-05, + "loss": 5.6119, + "step": 5846 + }, + { + "epoch": 0.03477376534399086, + "grad_norm": 1.9595715999603271, + "learning_rate": 4.985101833865572e-05, + "loss": 5.5536, + "step": 5847 + }, + { + "epoch": 0.034779712627271864, + "grad_norm": 1.8454965353012085, + "learning_rate": 4.985096741624953e-05, + "loss": 5.8127, + "step": 5848 + }, + { + "epoch": 0.03478565991055286, + "grad_norm": 1.9182006120681763, + "learning_rate": 4.985091648516813e-05, + "loss": 5.8807, + "step": 5849 + }, + { + "epoch": 0.034791607193833854, + "grad_norm": 2.042923927307129, + "learning_rate": 4.9850865545411526e-05, + "loss": 5.9013, + "step": 5850 + }, + { + "epoch": 0.034797554477114856, + "grad_norm": 2.341055393218994, + "learning_rate": 4.985081459697974e-05, + "loss": 6.214, + "step": 5851 + }, + { + "epoch": 0.03480350176039585, + "grad_norm": 2.026190996170044, + "learning_rate": 4.985076363987279e-05, + "loss": 5.3693, + "step": 5852 + }, + { + "epoch": 0.034809449043676846, + "grad_norm": 2.045264482498169, + "learning_rate": 4.98507126740907e-05, + "loss": 5.6325, + "step": 5853 + }, + { + "epoch": 0.03481539632695785, + "grad_norm": 2.2710580825805664, + "learning_rate": 4.985066169963348e-05, + "loss": 5.8355, + "step": 5854 + }, + { + "epoch": 0.03482134361023884, + "grad_norm": 1.8813494443893433, + "learning_rate": 4.985061071650115e-05, + "loss": 5.5849, + "step": 5855 + }, + { + "epoch": 0.03482729089351984, + "grad_norm": 2.2177746295928955, + "learning_rate": 4.985055972469373e-05, + "loss": 5.5518, + "step": 5856 + }, + { + "epoch": 0.03483323817680084, + "grad_norm": 1.897653341293335, + "learning_rate": 4.9850508724211234e-05, + "loss": 5.6035, + "step": 5857 + }, + { + "epoch": 0.034839185460081835, + "grad_norm": 2.349821090698242, + "learning_rate": 4.985045771505369e-05, + "loss": 5.8181, + "step": 5858 + }, + { + "epoch": 0.03484513274336283, + "grad_norm": 1.900538682937622, + "learning_rate": 4.98504066972211e-05, + "loss": 5.2751, + "step": 5859 + }, + { + "epoch": 0.03485108002664383, + "grad_norm": 2.1902174949645996, + "learning_rate": 4.985035567071349e-05, + "loss": 5.2709, + "step": 5860 + }, + { + "epoch": 0.03485702730992483, + "grad_norm": 1.7833307981491089, + "learning_rate": 4.9850304635530884e-05, + "loss": 5.2104, + "step": 5861 + }, + { + "epoch": 0.03486297459320582, + "grad_norm": 2.017603874206543, + "learning_rate": 4.985025359167329e-05, + "loss": 5.2257, + "step": 5862 + }, + { + "epoch": 0.03486892187648682, + "grad_norm": 1.9828181266784668, + "learning_rate": 4.9850202539140724e-05, + "loss": 5.2303, + "step": 5863 + }, + { + "epoch": 0.03487486915976782, + "grad_norm": 2.0273706912994385, + "learning_rate": 4.9850151477933216e-05, + "loss": 5.1743, + "step": 5864 + }, + { + "epoch": 0.034880816443048815, + "grad_norm": 1.9634721279144287, + "learning_rate": 4.985010040805077e-05, + "loss": 5.1541, + "step": 5865 + }, + { + "epoch": 0.03488676372632981, + "grad_norm": 2.2766621112823486, + "learning_rate": 4.985004932949342e-05, + "loss": 5.1372, + "step": 5866 + }, + { + "epoch": 0.03489271100961081, + "grad_norm": 2.0768795013427734, + "learning_rate": 4.984999824226117e-05, + "loss": 5.2567, + "step": 5867 + }, + { + "epoch": 0.03489865829289181, + "grad_norm": 1.8665590286254883, + "learning_rate": 4.984994714635404e-05, + "loss": 5.1356, + "step": 5868 + }, + { + "epoch": 0.0349046055761728, + "grad_norm": 2.056450843811035, + "learning_rate": 4.984989604177205e-05, + "loss": 5.1667, + "step": 5869 + }, + { + "epoch": 0.034910552859453804, + "grad_norm": 2.1191976070404053, + "learning_rate": 4.984984492851522e-05, + "loss": 5.1898, + "step": 5870 + }, + { + "epoch": 0.0349165001427348, + "grad_norm": 2.049450397491455, + "learning_rate": 4.9849793806583566e-05, + "loss": 5.1568, + "step": 5871 + }, + { + "epoch": 0.034922447426015794, + "grad_norm": 1.79837167263031, + "learning_rate": 4.984974267597711e-05, + "loss": 5.1288, + "step": 5872 + }, + { + "epoch": 0.034928394709296796, + "grad_norm": 1.959088683128357, + "learning_rate": 4.984969153669585e-05, + "loss": 5.1063, + "step": 5873 + }, + { + "epoch": 0.03493434199257779, + "grad_norm": 1.9193873405456543, + "learning_rate": 4.9849640388739836e-05, + "loss": 5.1608, + "step": 5874 + }, + { + "epoch": 0.03494028927585879, + "grad_norm": 1.6684316396713257, + "learning_rate": 4.9849589232109065e-05, + "loss": 5.0926, + "step": 5875 + }, + { + "epoch": 0.03494623655913978, + "grad_norm": 1.8383700847625732, + "learning_rate": 4.984953806680356e-05, + "loss": 5.0474, + "step": 5876 + }, + { + "epoch": 0.034952183842420784, + "grad_norm": 2.233779191970825, + "learning_rate": 4.984948689282333e-05, + "loss": 5.5046, + "step": 5877 + }, + { + "epoch": 0.03495813112570178, + "grad_norm": 2.2267282009124756, + "learning_rate": 4.9849435710168415e-05, + "loss": 5.6235, + "step": 5878 + }, + { + "epoch": 0.034964078408982774, + "grad_norm": 1.7933586835861206, + "learning_rate": 4.9849384518838804e-05, + "loss": 5.0968, + "step": 5879 + }, + { + "epoch": 0.034970025692263776, + "grad_norm": 2.0050230026245117, + "learning_rate": 4.984933331883453e-05, + "loss": 4.9789, + "step": 5880 + }, + { + "epoch": 0.03497597297554477, + "grad_norm": 1.7422970533370972, + "learning_rate": 4.9849282110155627e-05, + "loss": 5.1556, + "step": 5881 + }, + { + "epoch": 0.034981920258825766, + "grad_norm": 2.1242151260375977, + "learning_rate": 4.984923089280209e-05, + "loss": 5.7039, + "step": 5882 + }, + { + "epoch": 0.03498786754210677, + "grad_norm": 1.8656666278839111, + "learning_rate": 4.9849179666773934e-05, + "loss": 5.7185, + "step": 5883 + }, + { + "epoch": 0.03499381482538776, + "grad_norm": 1.6954991817474365, + "learning_rate": 4.984912843207119e-05, + "loss": 5.5686, + "step": 5884 + }, + { + "epoch": 0.03499976210866876, + "grad_norm": 1.7692710161209106, + "learning_rate": 4.984907718869387e-05, + "loss": 5.4058, + "step": 5885 + }, + { + "epoch": 0.03500570939194976, + "grad_norm": 1.8496350049972534, + "learning_rate": 4.9849025936642004e-05, + "loss": 5.5037, + "step": 5886 + }, + { + "epoch": 0.035011656675230755, + "grad_norm": 2.0124640464782715, + "learning_rate": 4.984897467591559e-05, + "loss": 5.6146, + "step": 5887 + }, + { + "epoch": 0.03501760395851175, + "grad_norm": 2.5522549152374268, + "learning_rate": 4.984892340651466e-05, + "loss": 5.6403, + "step": 5888 + }, + { + "epoch": 0.03502355124179275, + "grad_norm": 2.2127344608306885, + "learning_rate": 4.9848872128439224e-05, + "loss": 5.6277, + "step": 5889 + }, + { + "epoch": 0.03502949852507375, + "grad_norm": 2.578322172164917, + "learning_rate": 4.9848820841689305e-05, + "loss": 5.849, + "step": 5890 + }, + { + "epoch": 0.03503544580835474, + "grad_norm": 1.8083957433700562, + "learning_rate": 4.9848769546264915e-05, + "loss": 5.4407, + "step": 5891 + }, + { + "epoch": 0.03504139309163574, + "grad_norm": 1.885387897491455, + "learning_rate": 4.984871824216609e-05, + "loss": 5.4486, + "step": 5892 + }, + { + "epoch": 0.03504734037491674, + "grad_norm": 1.9450737237930298, + "learning_rate": 4.9848666929392817e-05, + "loss": 5.4196, + "step": 5893 + }, + { + "epoch": 0.035053287658197735, + "grad_norm": 1.9072003364562988, + "learning_rate": 4.984861560794514e-05, + "loss": 5.6293, + "step": 5894 + }, + { + "epoch": 0.03505923494147873, + "grad_norm": 2.064192056655884, + "learning_rate": 4.984856427782307e-05, + "loss": 5.7105, + "step": 5895 + }, + { + "epoch": 0.03506518222475973, + "grad_norm": 2.0101802349090576, + "learning_rate": 4.984851293902663e-05, + "loss": 5.5623, + "step": 5896 + }, + { + "epoch": 0.03507112950804073, + "grad_norm": 1.9813642501831055, + "learning_rate": 4.984846159155581e-05, + "loss": 5.653, + "step": 5897 + }, + { + "epoch": 0.03507707679132172, + "grad_norm": 1.9213227033615112, + "learning_rate": 4.9848410235410666e-05, + "loss": 5.5194, + "step": 5898 + }, + { + "epoch": 0.035083024074602724, + "grad_norm": 1.803076982498169, + "learning_rate": 4.984835887059119e-05, + "loss": 5.4101, + "step": 5899 + }, + { + "epoch": 0.03508897135788372, + "grad_norm": 1.8419232368469238, + "learning_rate": 4.9848307497097414e-05, + "loss": 5.7329, + "step": 5900 + }, + { + "epoch": 0.035094918641164714, + "grad_norm": 1.9258531332015991, + "learning_rate": 4.984825611492935e-05, + "loss": 5.559, + "step": 5901 + }, + { + "epoch": 0.035100865924445716, + "grad_norm": 1.869529366493225, + "learning_rate": 4.984820472408701e-05, + "loss": 5.5682, + "step": 5902 + }, + { + "epoch": 0.03510681320772671, + "grad_norm": 1.753365159034729, + "learning_rate": 4.984815332457042e-05, + "loss": 5.6241, + "step": 5903 + }, + { + "epoch": 0.035112760491007707, + "grad_norm": 1.6581326723098755, + "learning_rate": 4.98481019163796e-05, + "loss": 5.4752, + "step": 5904 + }, + { + "epoch": 0.0351187077742887, + "grad_norm": 1.9120882749557495, + "learning_rate": 4.9848050499514565e-05, + "loss": 5.5678, + "step": 5905 + }, + { + "epoch": 0.035124655057569704, + "grad_norm": 1.9840329885482788, + "learning_rate": 4.984799907397533e-05, + "loss": 5.5369, + "step": 5906 + }, + { + "epoch": 0.0351306023408507, + "grad_norm": 1.7970712184906006, + "learning_rate": 4.9847947639761914e-05, + "loss": 5.5857, + "step": 5907 + }, + { + "epoch": 0.035136549624131694, + "grad_norm": 1.7219270467758179, + "learning_rate": 4.984789619687435e-05, + "loss": 5.609, + "step": 5908 + }, + { + "epoch": 0.035142496907412696, + "grad_norm": 1.8945105075836182, + "learning_rate": 4.984784474531262e-05, + "loss": 5.5893, + "step": 5909 + }, + { + "epoch": 0.03514844419069369, + "grad_norm": 1.8570127487182617, + "learning_rate": 4.984779328507678e-05, + "loss": 5.4556, + "step": 5910 + }, + { + "epoch": 0.035154391473974686, + "grad_norm": 1.9291017055511475, + "learning_rate": 4.984774181616683e-05, + "loss": 5.476, + "step": 5911 + }, + { + "epoch": 0.03516033875725569, + "grad_norm": 1.9138598442077637, + "learning_rate": 4.984769033858278e-05, + "loss": 5.6329, + "step": 5912 + }, + { + "epoch": 0.03516628604053668, + "grad_norm": 1.9484977722167969, + "learning_rate": 4.9847638852324665e-05, + "loss": 5.5305, + "step": 5913 + }, + { + "epoch": 0.03517223332381768, + "grad_norm": 1.7338584661483765, + "learning_rate": 4.984758735739249e-05, + "loss": 5.4842, + "step": 5914 + }, + { + "epoch": 0.03517818060709868, + "grad_norm": 1.8625437021255493, + "learning_rate": 4.984753585378629e-05, + "loss": 5.3696, + "step": 5915 + }, + { + "epoch": 0.035184127890379675, + "grad_norm": 1.798782229423523, + "learning_rate": 4.984748434150607e-05, + "loss": 5.5803, + "step": 5916 + }, + { + "epoch": 0.03519007517366067, + "grad_norm": 2.0596888065338135, + "learning_rate": 4.9847432820551845e-05, + "loss": 5.3274, + "step": 5917 + }, + { + "epoch": 0.03519602245694167, + "grad_norm": 2.0848498344421387, + "learning_rate": 4.984738129092364e-05, + "loss": 5.3334, + "step": 5918 + }, + { + "epoch": 0.03520196974022267, + "grad_norm": 2.000460386276245, + "learning_rate": 4.984732975262147e-05, + "loss": 5.4411, + "step": 5919 + }, + { + "epoch": 0.03520791702350366, + "grad_norm": 1.676957607269287, + "learning_rate": 4.9847278205645355e-05, + "loss": 5.47, + "step": 5920 + }, + { + "epoch": 0.03521386430678466, + "grad_norm": 1.911482334136963, + "learning_rate": 4.984722664999531e-05, + "loss": 5.5736, + "step": 5921 + }, + { + "epoch": 0.03521981159006566, + "grad_norm": 1.9573029279708862, + "learning_rate": 4.9847175085671356e-05, + "loss": 5.5509, + "step": 5922 + }, + { + "epoch": 0.035225758873346655, + "grad_norm": 1.8878334760665894, + "learning_rate": 4.984712351267351e-05, + "loss": 5.6437, + "step": 5923 + }, + { + "epoch": 0.03523170615662765, + "grad_norm": 1.9107712507247925, + "learning_rate": 4.984707193100179e-05, + "loss": 5.4471, + "step": 5924 + }, + { + "epoch": 0.03523765343990865, + "grad_norm": 1.7408612966537476, + "learning_rate": 4.9847020340656215e-05, + "loss": 5.3706, + "step": 5925 + }, + { + "epoch": 0.03524360072318965, + "grad_norm": 1.9594995975494385, + "learning_rate": 4.98469687416368e-05, + "loss": 5.4113, + "step": 5926 + }, + { + "epoch": 0.03524954800647064, + "grad_norm": 1.8772166967391968, + "learning_rate": 4.984691713394356e-05, + "loss": 5.368, + "step": 5927 + }, + { + "epoch": 0.035255495289751644, + "grad_norm": 2.1143953800201416, + "learning_rate": 4.9846865517576524e-05, + "loss": 5.3829, + "step": 5928 + }, + { + "epoch": 0.03526144257303264, + "grad_norm": 2.0923383235931396, + "learning_rate": 4.984681389253571e-05, + "loss": 5.9834, + "step": 5929 + }, + { + "epoch": 0.035267389856313634, + "grad_norm": 2.016749620437622, + "learning_rate": 4.984676225882112e-05, + "loss": 5.68, + "step": 5930 + }, + { + "epoch": 0.035273337139594636, + "grad_norm": 1.6040265560150146, + "learning_rate": 4.984671061643279e-05, + "loss": 5.7406, + "step": 5931 + }, + { + "epoch": 0.03527928442287563, + "grad_norm": 2.100774049758911, + "learning_rate": 4.984665896537072e-05, + "loss": 5.5545, + "step": 5932 + }, + { + "epoch": 0.035285231706156626, + "grad_norm": 2.008575439453125, + "learning_rate": 4.984660730563494e-05, + "loss": 5.3769, + "step": 5933 + }, + { + "epoch": 0.03529117898943762, + "grad_norm": 1.9622136354446411, + "learning_rate": 4.984655563722547e-05, + "loss": 5.5792, + "step": 5934 + }, + { + "epoch": 0.035297126272718624, + "grad_norm": 1.764647364616394, + "learning_rate": 4.9846503960142325e-05, + "loss": 5.6543, + "step": 5935 + }, + { + "epoch": 0.03530307355599962, + "grad_norm": 1.6166809797286987, + "learning_rate": 4.984645227438552e-05, + "loss": 5.7948, + "step": 5936 + }, + { + "epoch": 0.035309020839280614, + "grad_norm": 1.7368977069854736, + "learning_rate": 4.9846400579955074e-05, + "loss": 5.6288, + "step": 5937 + }, + { + "epoch": 0.035314968122561616, + "grad_norm": 1.649059772491455, + "learning_rate": 4.984634887685101e-05, + "loss": 5.8538, + "step": 5938 + }, + { + "epoch": 0.03532091540584261, + "grad_norm": 1.6092652082443237, + "learning_rate": 4.984629716507334e-05, + "loss": 5.7077, + "step": 5939 + }, + { + "epoch": 0.035326862689123606, + "grad_norm": 1.76821768283844, + "learning_rate": 4.984624544462209e-05, + "loss": 5.4206, + "step": 5940 + }, + { + "epoch": 0.03533280997240461, + "grad_norm": 1.5885004997253418, + "learning_rate": 4.984619371549727e-05, + "loss": 5.3997, + "step": 5941 + }, + { + "epoch": 0.0353387572556856, + "grad_norm": 1.6730574369430542, + "learning_rate": 4.984614197769889e-05, + "loss": 5.4952, + "step": 5942 + }, + { + "epoch": 0.0353447045389666, + "grad_norm": 1.9951595067977905, + "learning_rate": 4.984609023122699e-05, + "loss": 5.5658, + "step": 5943 + }, + { + "epoch": 0.0353506518222476, + "grad_norm": 1.8277794122695923, + "learning_rate": 4.984603847608157e-05, + "loss": 5.5313, + "step": 5944 + }, + { + "epoch": 0.035356599105528595, + "grad_norm": 1.5988150835037231, + "learning_rate": 4.984598671226266e-05, + "loss": 5.4661, + "step": 5945 + }, + { + "epoch": 0.03536254638880959, + "grad_norm": 1.8313721418380737, + "learning_rate": 4.9845934939770264e-05, + "loss": 5.3005, + "step": 5946 + }, + { + "epoch": 0.03536849367209059, + "grad_norm": 1.8441407680511475, + "learning_rate": 4.984588315860442e-05, + "loss": 5.4564, + "step": 5947 + }, + { + "epoch": 0.03537444095537159, + "grad_norm": 2.8165388107299805, + "learning_rate": 4.9845831368765126e-05, + "loss": 5.4582, + "step": 5948 + }, + { + "epoch": 0.03538038823865258, + "grad_norm": 1.8860023021697998, + "learning_rate": 4.9845779570252415e-05, + "loss": 5.4952, + "step": 5949 + }, + { + "epoch": 0.03538633552193358, + "grad_norm": 1.7752633094787598, + "learning_rate": 4.98457277630663e-05, + "loss": 5.4301, + "step": 5950 + }, + { + "epoch": 0.03539228280521458, + "grad_norm": 1.9038548469543457, + "learning_rate": 4.984567594720679e-05, + "loss": 5.2591, + "step": 5951 + }, + { + "epoch": 0.035398230088495575, + "grad_norm": 2.6449787616729736, + "learning_rate": 4.984562412267392e-05, + "loss": 5.9317, + "step": 5952 + }, + { + "epoch": 0.03540417737177657, + "grad_norm": 1.95949125289917, + "learning_rate": 4.98455722894677e-05, + "loss": 5.4686, + "step": 5953 + }, + { + "epoch": 0.03541012465505757, + "grad_norm": 2.0208640098571777, + "learning_rate": 4.984552044758814e-05, + "loss": 5.6361, + "step": 5954 + }, + { + "epoch": 0.03541607193833857, + "grad_norm": 2.2328197956085205, + "learning_rate": 4.9845468597035274e-05, + "loss": 5.455, + "step": 5955 + }, + { + "epoch": 0.03542201922161956, + "grad_norm": 2.115952968597412, + "learning_rate": 4.9845416737809105e-05, + "loss": 5.3275, + "step": 5956 + }, + { + "epoch": 0.035427966504900564, + "grad_norm": 2.023791790008545, + "learning_rate": 4.984536486990966e-05, + "loss": 5.3135, + "step": 5957 + }, + { + "epoch": 0.03543391378818156, + "grad_norm": 1.9721077680587769, + "learning_rate": 4.9845312993336945e-05, + "loss": 5.3429, + "step": 5958 + }, + { + "epoch": 0.035439861071462554, + "grad_norm": 2.047588586807251, + "learning_rate": 4.9845261108091e-05, + "loss": 5.4027, + "step": 5959 + }, + { + "epoch": 0.035445808354743556, + "grad_norm": 1.9019498825073242, + "learning_rate": 4.9845209214171826e-05, + "loss": 5.3867, + "step": 5960 + }, + { + "epoch": 0.03545175563802455, + "grad_norm": 1.9442843198776245, + "learning_rate": 4.984515731157945e-05, + "loss": 5.3189, + "step": 5961 + }, + { + "epoch": 0.035457702921305546, + "grad_norm": 2.051422357559204, + "learning_rate": 4.9845105400313885e-05, + "loss": 5.5713, + "step": 5962 + }, + { + "epoch": 0.03546365020458654, + "grad_norm": 1.811908483505249, + "learning_rate": 4.9845053480375145e-05, + "loss": 5.6221, + "step": 5963 + }, + { + "epoch": 0.035469597487867544, + "grad_norm": 2.017991542816162, + "learning_rate": 4.984500155176326e-05, + "loss": 5.2774, + "step": 5964 + }, + { + "epoch": 0.03547554477114854, + "grad_norm": 1.972644329071045, + "learning_rate": 4.9844949614478244e-05, + "loss": 5.3208, + "step": 5965 + }, + { + "epoch": 0.035481492054429534, + "grad_norm": 1.9937026500701904, + "learning_rate": 4.984489766852011e-05, + "loss": 5.455, + "step": 5966 + }, + { + "epoch": 0.035487439337710536, + "grad_norm": 1.7297019958496094, + "learning_rate": 4.984484571388887e-05, + "loss": 5.3829, + "step": 5967 + }, + { + "epoch": 0.03549338662099153, + "grad_norm": 1.6428204774856567, + "learning_rate": 4.984479375058456e-05, + "loss": 5.3638, + "step": 5968 + }, + { + "epoch": 0.035499333904272526, + "grad_norm": 1.9522719383239746, + "learning_rate": 4.9844741778607186e-05, + "loss": 5.3379, + "step": 5969 + }, + { + "epoch": 0.03550528118755353, + "grad_norm": 2.0280921459198, + "learning_rate": 4.984468979795677e-05, + "loss": 5.4366, + "step": 5970 + }, + { + "epoch": 0.03551122847083452, + "grad_norm": 2.0396251678466797, + "learning_rate": 4.9844637808633334e-05, + "loss": 5.5681, + "step": 5971 + }, + { + "epoch": 0.03551717575411552, + "grad_norm": 1.5256271362304688, + "learning_rate": 4.984458581063689e-05, + "loss": 5.602, + "step": 5972 + }, + { + "epoch": 0.03552312303739652, + "grad_norm": 1.8829892873764038, + "learning_rate": 4.984453380396745e-05, + "loss": 5.3851, + "step": 5973 + }, + { + "epoch": 0.035529070320677515, + "grad_norm": 2.047106981277466, + "learning_rate": 4.984448178862505e-05, + "loss": 5.3724, + "step": 5974 + }, + { + "epoch": 0.03553501760395851, + "grad_norm": 2.066572904586792, + "learning_rate": 4.984442976460969e-05, + "loss": 5.3352, + "step": 5975 + }, + { + "epoch": 0.03554096488723951, + "grad_norm": 1.9785430431365967, + "learning_rate": 4.98443777319214e-05, + "loss": 5.2641, + "step": 5976 + }, + { + "epoch": 0.03554691217052051, + "grad_norm": 1.8999443054199219, + "learning_rate": 4.98443256905602e-05, + "loss": 5.3402, + "step": 5977 + }, + { + "epoch": 0.0355528594538015, + "grad_norm": 1.8599263429641724, + "learning_rate": 4.98442736405261e-05, + "loss": 5.2612, + "step": 5978 + }, + { + "epoch": 0.0355588067370825, + "grad_norm": 1.7216875553131104, + "learning_rate": 4.984422158181911e-05, + "loss": 5.4041, + "step": 5979 + }, + { + "epoch": 0.0355647540203635, + "grad_norm": 2.0259687900543213, + "learning_rate": 4.984416951443926e-05, + "loss": 5.4895, + "step": 5980 + }, + { + "epoch": 0.035570701303644495, + "grad_norm": 1.705736756324768, + "learning_rate": 4.9844117438386583e-05, + "loss": 5.5845, + "step": 5981 + }, + { + "epoch": 0.03557664858692549, + "grad_norm": 1.9546462297439575, + "learning_rate": 4.9844065353661074e-05, + "loss": 5.6803, + "step": 5982 + }, + { + "epoch": 0.03558259587020649, + "grad_norm": 1.829689383506775, + "learning_rate": 4.984401326026275e-05, + "loss": 5.5816, + "step": 5983 + }, + { + "epoch": 0.03558854315348749, + "grad_norm": 1.6464663743972778, + "learning_rate": 4.984396115819164e-05, + "loss": 5.5738, + "step": 5984 + }, + { + "epoch": 0.03559449043676848, + "grad_norm": 1.7786076068878174, + "learning_rate": 4.984390904744777e-05, + "loss": 5.3667, + "step": 5985 + }, + { + "epoch": 0.035600437720049484, + "grad_norm": 2.210754871368408, + "learning_rate": 4.984385692803114e-05, + "loss": 5.5259, + "step": 5986 + }, + { + "epoch": 0.03560638500333048, + "grad_norm": 1.7361842393875122, + "learning_rate": 4.984380479994179e-05, + "loss": 5.6108, + "step": 5987 + }, + { + "epoch": 0.035612332286611474, + "grad_norm": 1.926477313041687, + "learning_rate": 4.9843752663179703e-05, + "loss": 5.593, + "step": 5988 + }, + { + "epoch": 0.035618279569892476, + "grad_norm": 1.6683733463287354, + "learning_rate": 4.984370051774493e-05, + "loss": 5.6305, + "step": 5989 + }, + { + "epoch": 0.03562422685317347, + "grad_norm": 1.790499210357666, + "learning_rate": 4.9843648363637475e-05, + "loss": 5.596, + "step": 5990 + }, + { + "epoch": 0.035630174136454466, + "grad_norm": 1.8355207443237305, + "learning_rate": 4.984359620085736e-05, + "loss": 5.5818, + "step": 5991 + }, + { + "epoch": 0.03563612141973546, + "grad_norm": 1.9352680444717407, + "learning_rate": 4.98435440294046e-05, + "loss": 5.187, + "step": 5992 + }, + { + "epoch": 0.03564206870301646, + "grad_norm": 2.063159465789795, + "learning_rate": 4.9843491849279225e-05, + "loss": 5.3245, + "step": 5993 + }, + { + "epoch": 0.03564801598629746, + "grad_norm": 1.6848958730697632, + "learning_rate": 4.984343966048123e-05, + "loss": 5.4454, + "step": 5994 + }, + { + "epoch": 0.035653963269578454, + "grad_norm": 2.1244423389434814, + "learning_rate": 4.9843387463010654e-05, + "loss": 5.5018, + "step": 5995 + }, + { + "epoch": 0.035659910552859456, + "grad_norm": 1.9100427627563477, + "learning_rate": 4.9843335256867505e-05, + "loss": 5.5597, + "step": 5996 + }, + { + "epoch": 0.03566585783614045, + "grad_norm": 1.9130252599716187, + "learning_rate": 4.984328304205181e-05, + "loss": 5.4538, + "step": 5997 + }, + { + "epoch": 0.035671805119421446, + "grad_norm": 1.6285213232040405, + "learning_rate": 4.984323081856358e-05, + "loss": 5.7361, + "step": 5998 + }, + { + "epoch": 0.03567775240270245, + "grad_norm": 1.6690980195999146, + "learning_rate": 4.984317858640283e-05, + "loss": 5.7537, + "step": 5999 + }, + { + "epoch": 0.03568369968598344, + "grad_norm": 1.5258572101593018, + "learning_rate": 4.984312634556959e-05, + "loss": 5.7419, + "step": 6000 + }, + { + "epoch": 0.03568964696926444, + "grad_norm": 1.9586881399154663, + "learning_rate": 4.984307409606386e-05, + "loss": 5.4449, + "step": 6001 + }, + { + "epoch": 0.03569559425254544, + "grad_norm": 2.1795685291290283, + "learning_rate": 4.9843021837885684e-05, + "loss": 5.3833, + "step": 6002 + }, + { + "epoch": 0.035701541535826435, + "grad_norm": 2.1241326332092285, + "learning_rate": 4.984296957103506e-05, + "loss": 5.3064, + "step": 6003 + }, + { + "epoch": 0.03570748881910743, + "grad_norm": 1.9621204137802124, + "learning_rate": 4.9842917295512004e-05, + "loss": 5.3002, + "step": 6004 + }, + { + "epoch": 0.03571343610238843, + "grad_norm": 2.041503429412842, + "learning_rate": 4.984286501131655e-05, + "loss": 5.2885, + "step": 6005 + }, + { + "epoch": 0.03571938338566943, + "grad_norm": 2.1099791526794434, + "learning_rate": 4.984281271844871e-05, + "loss": 5.3038, + "step": 6006 + }, + { + "epoch": 0.03572533066895042, + "grad_norm": 2.0209009647369385, + "learning_rate": 4.98427604169085e-05, + "loss": 5.8373, + "step": 6007 + }, + { + "epoch": 0.03573127795223142, + "grad_norm": 1.7534282207489014, + "learning_rate": 4.9842708106695934e-05, + "loss": 5.6522, + "step": 6008 + }, + { + "epoch": 0.03573722523551242, + "grad_norm": 2.3014237880706787, + "learning_rate": 4.984265578781104e-05, + "loss": 5.462, + "step": 6009 + }, + { + "epoch": 0.035743172518793415, + "grad_norm": 2.123767614364624, + "learning_rate": 4.984260346025382e-05, + "loss": 5.3901, + "step": 6010 + }, + { + "epoch": 0.03574911980207441, + "grad_norm": 2.4190175533294678, + "learning_rate": 4.9842551124024315e-05, + "loss": 5.1526, + "step": 6011 + }, + { + "epoch": 0.03575506708535541, + "grad_norm": 1.9972834587097168, + "learning_rate": 4.984249877912254e-05, + "loss": 5.2987, + "step": 6012 + }, + { + "epoch": 0.03576101436863641, + "grad_norm": 2.002969980239868, + "learning_rate": 4.9842446425548494e-05, + "loss": 5.5244, + "step": 6013 + }, + { + "epoch": 0.0357669616519174, + "grad_norm": 2.8208391666412354, + "learning_rate": 4.984239406330221e-05, + "loss": 5.834, + "step": 6014 + }, + { + "epoch": 0.035772908935198404, + "grad_norm": 2.409303665161133, + "learning_rate": 4.98423416923837e-05, + "loss": 5.1709, + "step": 6015 + }, + { + "epoch": 0.0357788562184794, + "grad_norm": 2.215888500213623, + "learning_rate": 4.984228931279298e-05, + "loss": 5.38, + "step": 6016 + }, + { + "epoch": 0.035784803501760394, + "grad_norm": 1.9130421876907349, + "learning_rate": 4.9842236924530086e-05, + "loss": 5.4551, + "step": 6017 + }, + { + "epoch": 0.035790750785041396, + "grad_norm": 1.8963314294815063, + "learning_rate": 4.9842184527595015e-05, + "loss": 5.3512, + "step": 6018 + }, + { + "epoch": 0.03579669806832239, + "grad_norm": 2.0085666179656982, + "learning_rate": 4.98421321219878e-05, + "loss": 5.3013, + "step": 6019 + }, + { + "epoch": 0.035802645351603386, + "grad_norm": 2.1059834957122803, + "learning_rate": 4.9842079707708446e-05, + "loss": 5.4052, + "step": 6020 + }, + { + "epoch": 0.03580859263488438, + "grad_norm": 1.965694785118103, + "learning_rate": 4.984202728475699e-05, + "loss": 5.5392, + "step": 6021 + }, + { + "epoch": 0.03581453991816538, + "grad_norm": 1.9495680332183838, + "learning_rate": 4.9841974853133425e-05, + "loss": 5.309, + "step": 6022 + }, + { + "epoch": 0.03582048720144638, + "grad_norm": 1.9762555360794067, + "learning_rate": 4.9841922412837795e-05, + "loss": 5.3979, + "step": 6023 + }, + { + "epoch": 0.035826434484727374, + "grad_norm": 1.7825839519500732, + "learning_rate": 4.98418699638701e-05, + "loss": 5.3502, + "step": 6024 + }, + { + "epoch": 0.035832381768008376, + "grad_norm": 1.9636192321777344, + "learning_rate": 4.984181750623037e-05, + "loss": 5.6341, + "step": 6025 + }, + { + "epoch": 0.03583832905128937, + "grad_norm": 1.833883285522461, + "learning_rate": 4.984176503991861e-05, + "loss": 5.5861, + "step": 6026 + }, + { + "epoch": 0.035844276334570366, + "grad_norm": 1.91568124294281, + "learning_rate": 4.984171256493485e-05, + "loss": 5.591, + "step": 6027 + }, + { + "epoch": 0.03585022361785137, + "grad_norm": 2.153472423553467, + "learning_rate": 4.9841660081279105e-05, + "loss": 5.3463, + "step": 6028 + }, + { + "epoch": 0.03585617090113236, + "grad_norm": 1.8164830207824707, + "learning_rate": 4.984160758895139e-05, + "loss": 5.4886, + "step": 6029 + }, + { + "epoch": 0.03586211818441336, + "grad_norm": 2.0216922760009766, + "learning_rate": 4.984155508795174e-05, + "loss": 5.5777, + "step": 6030 + }, + { + "epoch": 0.03586806546769436, + "grad_norm": 1.966779351234436, + "learning_rate": 4.984150257828014e-05, + "loss": 5.1867, + "step": 6031 + }, + { + "epoch": 0.035874012750975355, + "grad_norm": 2.091109275817871, + "learning_rate": 4.9841450059936645e-05, + "loss": 5.5302, + "step": 6032 + }, + { + "epoch": 0.03587996003425635, + "grad_norm": 1.8772802352905273, + "learning_rate": 4.984139753292125e-05, + "loss": 5.2904, + "step": 6033 + }, + { + "epoch": 0.03588590731753735, + "grad_norm": 2.049431800842285, + "learning_rate": 4.984134499723397e-05, + "loss": 5.293, + "step": 6034 + }, + { + "epoch": 0.03589185460081835, + "grad_norm": 2.0902609825134277, + "learning_rate": 4.984129245287485e-05, + "loss": 5.2689, + "step": 6035 + }, + { + "epoch": 0.03589780188409934, + "grad_norm": 1.91702139377594, + "learning_rate": 4.9841239899843886e-05, + "loss": 5.255, + "step": 6036 + }, + { + "epoch": 0.03590374916738034, + "grad_norm": 1.7073708772659302, + "learning_rate": 4.984118733814109e-05, + "loss": 5.3272, + "step": 6037 + }, + { + "epoch": 0.03590969645066134, + "grad_norm": 1.625712275505066, + "learning_rate": 4.9841134767766506e-05, + "loss": 5.5366, + "step": 6038 + }, + { + "epoch": 0.035915643733942335, + "grad_norm": 1.8465087413787842, + "learning_rate": 4.984108218872014e-05, + "loss": 5.3373, + "step": 6039 + }, + { + "epoch": 0.03592159101722333, + "grad_norm": 2.2392280101776123, + "learning_rate": 4.9841029601002e-05, + "loss": 5.5898, + "step": 6040 + }, + { + "epoch": 0.03592753830050433, + "grad_norm": 2.6571459770202637, + "learning_rate": 4.984097700461212e-05, + "loss": 5.963, + "step": 6041 + }, + { + "epoch": 0.03593348558378533, + "grad_norm": 2.7220845222473145, + "learning_rate": 4.98409243995505e-05, + "loss": 5.6997, + "step": 6042 + }, + { + "epoch": 0.03593943286706632, + "grad_norm": 2.430968999862671, + "learning_rate": 4.9840871785817185e-05, + "loss": 5.2949, + "step": 6043 + }, + { + "epoch": 0.035945380150347324, + "grad_norm": 2.3006606101989746, + "learning_rate": 4.984081916341217e-05, + "loss": 5.2045, + "step": 6044 + }, + { + "epoch": 0.03595132743362832, + "grad_norm": 2.2382659912109375, + "learning_rate": 4.984076653233548e-05, + "loss": 5.417, + "step": 6045 + }, + { + "epoch": 0.035957274716909314, + "grad_norm": 2.1896233558654785, + "learning_rate": 4.9840713892587146e-05, + "loss": 5.7215, + "step": 6046 + }, + { + "epoch": 0.035963222000190316, + "grad_norm": 1.8175956010818481, + "learning_rate": 4.9840661244167166e-05, + "loss": 5.569, + "step": 6047 + }, + { + "epoch": 0.03596916928347131, + "grad_norm": 2.066828727722168, + "learning_rate": 4.984060858707557e-05, + "loss": 5.6285, + "step": 6048 + }, + { + "epoch": 0.035975116566752306, + "grad_norm": 2.246291160583496, + "learning_rate": 4.984055592131237e-05, + "loss": 5.5583, + "step": 6049 + }, + { + "epoch": 0.0359810638500333, + "grad_norm": 2.2394871711730957, + "learning_rate": 4.984050324687759e-05, + "loss": 5.3917, + "step": 6050 + }, + { + "epoch": 0.0359870111333143, + "grad_norm": 2.5051162242889404, + "learning_rate": 4.984045056377125e-05, + "loss": 5.6955, + "step": 6051 + }, + { + "epoch": 0.0359929584165953, + "grad_norm": 2.1360414028167725, + "learning_rate": 4.984039787199336e-05, + "loss": 5.5451, + "step": 6052 + }, + { + "epoch": 0.035998905699876294, + "grad_norm": 2.0267562866210938, + "learning_rate": 4.984034517154395e-05, + "loss": 5.4559, + "step": 6053 + }, + { + "epoch": 0.036004852983157296, + "grad_norm": 1.7683112621307373, + "learning_rate": 4.984029246242303e-05, + "loss": 5.4663, + "step": 6054 + }, + { + "epoch": 0.03601080026643829, + "grad_norm": 2.0600638389587402, + "learning_rate": 4.9840239744630626e-05, + "loss": 5.5081, + "step": 6055 + }, + { + "epoch": 0.036016747549719286, + "grad_norm": 2.093698740005493, + "learning_rate": 4.984018701816674e-05, + "loss": 5.5435, + "step": 6056 + }, + { + "epoch": 0.03602269483300029, + "grad_norm": 2.217721462249756, + "learning_rate": 4.984013428303141e-05, + "loss": 5.7482, + "step": 6057 + }, + { + "epoch": 0.03602864211628128, + "grad_norm": 1.9680962562561035, + "learning_rate": 4.9840081539224636e-05, + "loss": 5.9722, + "step": 6058 + }, + { + "epoch": 0.03603458939956228, + "grad_norm": 1.8606425523757935, + "learning_rate": 4.9840028786746455e-05, + "loss": 5.8379, + "step": 6059 + }, + { + "epoch": 0.03604053668284328, + "grad_norm": 2.0129475593566895, + "learning_rate": 4.983997602559688e-05, + "loss": 5.7199, + "step": 6060 + }, + { + "epoch": 0.036046483966124275, + "grad_norm": 1.9370187520980835, + "learning_rate": 4.9839923255775917e-05, + "loss": 5.3563, + "step": 6061 + }, + { + "epoch": 0.03605243124940527, + "grad_norm": 1.775894284248352, + "learning_rate": 4.983987047728359e-05, + "loss": 5.5201, + "step": 6062 + }, + { + "epoch": 0.03605837853268627, + "grad_norm": 1.9943023920059204, + "learning_rate": 4.9839817690119934e-05, + "loss": 5.4034, + "step": 6063 + }, + { + "epoch": 0.03606432581596727, + "grad_norm": 1.9605768918991089, + "learning_rate": 4.983976489428494e-05, + "loss": 5.5314, + "step": 6064 + }, + { + "epoch": 0.03607027309924826, + "grad_norm": 1.7820254564285278, + "learning_rate": 4.983971208977866e-05, + "loss": 5.6131, + "step": 6065 + }, + { + "epoch": 0.03607622038252926, + "grad_norm": 2.010796070098877, + "learning_rate": 4.983965927660108e-05, + "loss": 5.5114, + "step": 6066 + }, + { + "epoch": 0.03608216766581026, + "grad_norm": 1.8461687564849854, + "learning_rate": 4.983960645475223e-05, + "loss": 5.4752, + "step": 6067 + }, + { + "epoch": 0.036088114949091255, + "grad_norm": 2.048119068145752, + "learning_rate": 4.983955362423214e-05, + "loss": 5.3325, + "step": 6068 + }, + { + "epoch": 0.03609406223237225, + "grad_norm": 2.021646499633789, + "learning_rate": 4.9839500785040804e-05, + "loss": 5.2238, + "step": 6069 + }, + { + "epoch": 0.03610000951565325, + "grad_norm": 1.9979503154754639, + "learning_rate": 4.9839447937178264e-05, + "loss": 5.4054, + "step": 6070 + }, + { + "epoch": 0.03610595679893425, + "grad_norm": 1.980776071548462, + "learning_rate": 4.983939508064453e-05, + "loss": 5.4094, + "step": 6071 + }, + { + "epoch": 0.03611190408221524, + "grad_norm": 1.8364293575286865, + "learning_rate": 4.9839342215439615e-05, + "loss": 5.4372, + "step": 6072 + }, + { + "epoch": 0.036117851365496244, + "grad_norm": 1.8870443105697632, + "learning_rate": 4.983928934156354e-05, + "loss": 5.4075, + "step": 6073 + }, + { + "epoch": 0.03612379864877724, + "grad_norm": 2.176180124282837, + "learning_rate": 4.9839236459016337e-05, + "loss": 5.4302, + "step": 6074 + }, + { + "epoch": 0.036129745932058234, + "grad_norm": 2.054960012435913, + "learning_rate": 4.983918356779801e-05, + "loss": 5.3796, + "step": 6075 + }, + { + "epoch": 0.036135693215339236, + "grad_norm": 2.2146401405334473, + "learning_rate": 4.9839130667908576e-05, + "loss": 5.651, + "step": 6076 + }, + { + "epoch": 0.03614164049862023, + "grad_norm": 1.908640742301941, + "learning_rate": 4.983907775934806e-05, + "loss": 5.3002, + "step": 6077 + }, + { + "epoch": 0.036147587781901226, + "grad_norm": 1.9364973306655884, + "learning_rate": 4.983902484211648e-05, + "loss": 5.2299, + "step": 6078 + }, + { + "epoch": 0.03615353506518223, + "grad_norm": 1.7405542135238647, + "learning_rate": 4.983897191621385e-05, + "loss": 5.268, + "step": 6079 + }, + { + "epoch": 0.03615948234846322, + "grad_norm": 2.0347912311553955, + "learning_rate": 4.9838918981640195e-05, + "loss": 5.4887, + "step": 6080 + }, + { + "epoch": 0.03616542963174422, + "grad_norm": 2.0755162239074707, + "learning_rate": 4.9838866038395524e-05, + "loss": 5.2208, + "step": 6081 + }, + { + "epoch": 0.03617137691502521, + "grad_norm": 1.9119634628295898, + "learning_rate": 4.9838813086479865e-05, + "loss": 5.2659, + "step": 6082 + }, + { + "epoch": 0.036177324198306215, + "grad_norm": 1.9172658920288086, + "learning_rate": 4.983876012589324e-05, + "loss": 5.4098, + "step": 6083 + }, + { + "epoch": 0.03618327148158721, + "grad_norm": 2.09004545211792, + "learning_rate": 4.983870715663565e-05, + "loss": 5.5866, + "step": 6084 + }, + { + "epoch": 0.036189218764868206, + "grad_norm": 2.0952436923980713, + "learning_rate": 4.983865417870712e-05, + "loss": 5.5288, + "step": 6085 + }, + { + "epoch": 0.03619516604814921, + "grad_norm": 1.8599412441253662, + "learning_rate": 4.9838601192107686e-05, + "loss": 5.7538, + "step": 6086 + }, + { + "epoch": 0.0362011133314302, + "grad_norm": 1.8318936824798584, + "learning_rate": 4.983854819683735e-05, + "loss": 5.9613, + "step": 6087 + }, + { + "epoch": 0.0362070606147112, + "grad_norm": 1.8312503099441528, + "learning_rate": 4.983849519289613e-05, + "loss": 5.2749, + "step": 6088 + }, + { + "epoch": 0.0362130078979922, + "grad_norm": 2.157576560974121, + "learning_rate": 4.983844218028405e-05, + "loss": 5.2826, + "step": 6089 + }, + { + "epoch": 0.036218955181273195, + "grad_norm": 2.1377198696136475, + "learning_rate": 4.983838915900112e-05, + "loss": 5.2843, + "step": 6090 + }, + { + "epoch": 0.03622490246455419, + "grad_norm": 2.0167126655578613, + "learning_rate": 4.983833612904737e-05, + "loss": 5.4713, + "step": 6091 + }, + { + "epoch": 0.03623084974783519, + "grad_norm": 1.748759388923645, + "learning_rate": 4.9838283090422814e-05, + "loss": 5.3685, + "step": 6092 + }, + { + "epoch": 0.03623679703111619, + "grad_norm": 2.0344316959381104, + "learning_rate": 4.983823004312747e-05, + "loss": 5.1093, + "step": 6093 + }, + { + "epoch": 0.03624274431439718, + "grad_norm": 1.9061161279678345, + "learning_rate": 4.9838176987161356e-05, + "loss": 5.2035, + "step": 6094 + }, + { + "epoch": 0.03624869159767818, + "grad_norm": 1.9090344905853271, + "learning_rate": 4.983812392252449e-05, + "loss": 5.3863, + "step": 6095 + }, + { + "epoch": 0.03625463888095918, + "grad_norm": 1.9536118507385254, + "learning_rate": 4.9838070849216894e-05, + "loss": 5.5349, + "step": 6096 + }, + { + "epoch": 0.036260586164240174, + "grad_norm": 1.89446222782135, + "learning_rate": 4.983801776723858e-05, + "loss": 5.7098, + "step": 6097 + }, + { + "epoch": 0.03626653344752117, + "grad_norm": 1.6403870582580566, + "learning_rate": 4.983796467658958e-05, + "loss": 5.6726, + "step": 6098 + }, + { + "epoch": 0.03627248073080217, + "grad_norm": 1.7792481184005737, + "learning_rate": 4.983791157726989e-05, + "loss": 5.6761, + "step": 6099 + }, + { + "epoch": 0.03627842801408317, + "grad_norm": 1.5190175771713257, + "learning_rate": 4.9837858469279554e-05, + "loss": 5.6576, + "step": 6100 + }, + { + "epoch": 0.03628437529736416, + "grad_norm": 1.9885895252227783, + "learning_rate": 4.983780535261857e-05, + "loss": 5.5944, + "step": 6101 + }, + { + "epoch": 0.036290322580645164, + "grad_norm": 1.771620750427246, + "learning_rate": 4.983775222728697e-05, + "loss": 5.7949, + "step": 6102 + }, + { + "epoch": 0.03629626986392616, + "grad_norm": 1.684471845626831, + "learning_rate": 4.9837699093284765e-05, + "loss": 5.5435, + "step": 6103 + }, + { + "epoch": 0.036302217147207154, + "grad_norm": 1.8454065322875977, + "learning_rate": 4.9837645950611966e-05, + "loss": 5.4526, + "step": 6104 + }, + { + "epoch": 0.036308164430488156, + "grad_norm": 1.6522735357284546, + "learning_rate": 4.983759279926862e-05, + "loss": 5.7302, + "step": 6105 + }, + { + "epoch": 0.03631411171376915, + "grad_norm": 1.8691065311431885, + "learning_rate": 4.9837539639254713e-05, + "loss": 5.6494, + "step": 6106 + }, + { + "epoch": 0.036320058997050146, + "grad_norm": 1.9420015811920166, + "learning_rate": 4.9837486470570286e-05, + "loss": 5.77, + "step": 6107 + }, + { + "epoch": 0.03632600628033115, + "grad_norm": 1.8399784564971924, + "learning_rate": 4.9837433293215344e-05, + "loss": 5.6669, + "step": 6108 + }, + { + "epoch": 0.03633195356361214, + "grad_norm": 1.799460530281067, + "learning_rate": 4.983738010718991e-05, + "loss": 5.5557, + "step": 6109 + }, + { + "epoch": 0.03633790084689314, + "grad_norm": 1.8826879262924194, + "learning_rate": 4.9837326912494e-05, + "loss": 5.4865, + "step": 6110 + }, + { + "epoch": 0.03634384813017413, + "grad_norm": 1.9582240581512451, + "learning_rate": 4.983727370912764e-05, + "loss": 5.5882, + "step": 6111 + }, + { + "epoch": 0.036349795413455135, + "grad_norm": 2.011892795562744, + "learning_rate": 4.9837220497090846e-05, + "loss": 5.4932, + "step": 6112 + }, + { + "epoch": 0.03635574269673613, + "grad_norm": 1.7751367092132568, + "learning_rate": 4.983716727638363e-05, + "loss": 5.4981, + "step": 6113 + }, + { + "epoch": 0.036361689980017126, + "grad_norm": 1.984121322631836, + "learning_rate": 4.983711404700603e-05, + "loss": 5.4801, + "step": 6114 + }, + { + "epoch": 0.03636763726329813, + "grad_norm": 1.9601882696151733, + "learning_rate": 4.983706080895804e-05, + "loss": 5.218, + "step": 6115 + }, + { + "epoch": 0.03637358454657912, + "grad_norm": 1.800227165222168, + "learning_rate": 4.9837007562239684e-05, + "loss": 5.5178, + "step": 6116 + }, + { + "epoch": 0.03637953182986012, + "grad_norm": 1.9257889986038208, + "learning_rate": 4.983695430685099e-05, + "loss": 5.6695, + "step": 6117 + }, + { + "epoch": 0.03638547911314112, + "grad_norm": 1.8011913299560547, + "learning_rate": 4.9836901042791976e-05, + "loss": 5.7478, + "step": 6118 + }, + { + "epoch": 0.036391426396422115, + "grad_norm": 1.8668690919876099, + "learning_rate": 4.983684777006264e-05, + "loss": 5.7027, + "step": 6119 + }, + { + "epoch": 0.03639737367970311, + "grad_norm": 1.898126244544983, + "learning_rate": 4.983679448866304e-05, + "loss": 5.5206, + "step": 6120 + }, + { + "epoch": 0.03640332096298411, + "grad_norm": 1.8264409303665161, + "learning_rate": 4.983674119859316e-05, + "loss": 5.4686, + "step": 6121 + }, + { + "epoch": 0.03640926824626511, + "grad_norm": 1.8090230226516724, + "learning_rate": 4.983668789985303e-05, + "loss": 5.4761, + "step": 6122 + }, + { + "epoch": 0.0364152155295461, + "grad_norm": 1.8193403482437134, + "learning_rate": 4.983663459244266e-05, + "loss": 5.3443, + "step": 6123 + }, + { + "epoch": 0.0364211628128271, + "grad_norm": 1.8199255466461182, + "learning_rate": 4.9836581276362095e-05, + "loss": 5.427, + "step": 6124 + }, + { + "epoch": 0.0364271100961081, + "grad_norm": 1.72145414352417, + "learning_rate": 4.9836527951611325e-05, + "loss": 5.4372, + "step": 6125 + }, + { + "epoch": 0.036433057379389094, + "grad_norm": 1.8164423704147339, + "learning_rate": 4.9836474618190386e-05, + "loss": 5.4702, + "step": 6126 + }, + { + "epoch": 0.03643900466267009, + "grad_norm": 1.897775650024414, + "learning_rate": 4.9836421276099287e-05, + "loss": 5.4259, + "step": 6127 + }, + { + "epoch": 0.03644495194595109, + "grad_norm": 1.851101279258728, + "learning_rate": 4.9836367925338046e-05, + "loss": 5.3837, + "step": 6128 + }, + { + "epoch": 0.03645089922923209, + "grad_norm": 1.749374508857727, + "learning_rate": 4.98363145659067e-05, + "loss": 5.3232, + "step": 6129 + }, + { + "epoch": 0.03645684651251308, + "grad_norm": 1.95986008644104, + "learning_rate": 4.9836261197805235e-05, + "loss": 5.2692, + "step": 6130 + }, + { + "epoch": 0.036462793795794084, + "grad_norm": 1.7947750091552734, + "learning_rate": 4.98362078210337e-05, + "loss": 5.409, + "step": 6131 + }, + { + "epoch": 0.03646874107907508, + "grad_norm": 2.119044303894043, + "learning_rate": 4.983615443559209e-05, + "loss": 5.5924, + "step": 6132 + }, + { + "epoch": 0.036474688362356074, + "grad_norm": 1.7285267114639282, + "learning_rate": 4.983610104148044e-05, + "loss": 5.6955, + "step": 6133 + }, + { + "epoch": 0.036480635645637076, + "grad_norm": 2.1711652278900146, + "learning_rate": 4.983604763869877e-05, + "loss": 5.1941, + "step": 6134 + }, + { + "epoch": 0.03648658292891807, + "grad_norm": 2.060039758682251, + "learning_rate": 4.983599422724709e-05, + "loss": 5.5131, + "step": 6135 + }, + { + "epoch": 0.036492530212199066, + "grad_norm": 1.6212393045425415, + "learning_rate": 4.9835940807125415e-05, + "loss": 5.4856, + "step": 6136 + }, + { + "epoch": 0.03649847749548007, + "grad_norm": 1.7602918148040771, + "learning_rate": 4.983588737833378e-05, + "loss": 5.4177, + "step": 6137 + }, + { + "epoch": 0.03650442477876106, + "grad_norm": 2.660930633544922, + "learning_rate": 4.983583394087218e-05, + "loss": 5.5879, + "step": 6138 + }, + { + "epoch": 0.03651037206204206, + "grad_norm": 2.3608336448669434, + "learning_rate": 4.9835780494740655e-05, + "loss": 5.3894, + "step": 6139 + }, + { + "epoch": 0.03651631934532305, + "grad_norm": 2.071632146835327, + "learning_rate": 4.983572703993922e-05, + "loss": 5.6185, + "step": 6140 + }, + { + "epoch": 0.036522266628604055, + "grad_norm": 1.7023842334747314, + "learning_rate": 4.983567357646788e-05, + "loss": 5.5648, + "step": 6141 + }, + { + "epoch": 0.03652821391188505, + "grad_norm": 2.2168798446655273, + "learning_rate": 4.983562010432667e-05, + "loss": 5.4578, + "step": 6142 + }, + { + "epoch": 0.036534161195166046, + "grad_norm": 2.0916104316711426, + "learning_rate": 4.98355666235156e-05, + "loss": 5.4977, + "step": 6143 + }, + { + "epoch": 0.03654010847844705, + "grad_norm": 1.7101606130599976, + "learning_rate": 4.9835513134034686e-05, + "loss": 5.4081, + "step": 6144 + }, + { + "epoch": 0.03654605576172804, + "grad_norm": 1.9058302640914917, + "learning_rate": 4.983545963588395e-05, + "loss": 5.2145, + "step": 6145 + }, + { + "epoch": 0.03655200304500904, + "grad_norm": 2.319023847579956, + "learning_rate": 4.9835406129063424e-05, + "loss": 5.3023, + "step": 6146 + }, + { + "epoch": 0.03655795032829004, + "grad_norm": 2.1135916709899902, + "learning_rate": 4.98353526135731e-05, + "loss": 5.4796, + "step": 6147 + }, + { + "epoch": 0.036563897611571035, + "grad_norm": 2.409088373184204, + "learning_rate": 4.983529908941302e-05, + "loss": 5.3124, + "step": 6148 + }, + { + "epoch": 0.03656984489485203, + "grad_norm": 1.8679871559143066, + "learning_rate": 4.9835245556583185e-05, + "loss": 5.3741, + "step": 6149 + }, + { + "epoch": 0.03657579217813303, + "grad_norm": 1.9335602521896362, + "learning_rate": 4.983519201508363e-05, + "loss": 5.3231, + "step": 6150 + }, + { + "epoch": 0.03658173946141403, + "grad_norm": 2.0352535247802734, + "learning_rate": 4.9835138464914366e-05, + "loss": 5.4643, + "step": 6151 + }, + { + "epoch": 0.03658768674469502, + "grad_norm": 2.4156594276428223, + "learning_rate": 4.983508490607541e-05, + "loss": 5.4092, + "step": 6152 + }, + { + "epoch": 0.03659363402797602, + "grad_norm": 2.1936473846435547, + "learning_rate": 4.983503133856678e-05, + "loss": 5.5093, + "step": 6153 + }, + { + "epoch": 0.03659958131125702, + "grad_norm": 1.6346958875656128, + "learning_rate": 4.98349777623885e-05, + "loss": 5.512, + "step": 6154 + }, + { + "epoch": 0.036605528594538014, + "grad_norm": 1.9810141324996948, + "learning_rate": 4.9834924177540584e-05, + "loss": 5.4981, + "step": 6155 + }, + { + "epoch": 0.03661147587781901, + "grad_norm": 2.1253950595855713, + "learning_rate": 4.9834870584023055e-05, + "loss": 5.4022, + "step": 6156 + }, + { + "epoch": 0.03661742316110001, + "grad_norm": 2.011754274368286, + "learning_rate": 4.9834816981835926e-05, + "loss": 5.6107, + "step": 6157 + }, + { + "epoch": 0.036623370444381007, + "grad_norm": 2.210934638977051, + "learning_rate": 4.983476337097922e-05, + "loss": 5.4348, + "step": 6158 + }, + { + "epoch": 0.036629317727662, + "grad_norm": 2.1351871490478516, + "learning_rate": 4.983470975145296e-05, + "loss": 5.2022, + "step": 6159 + }, + { + "epoch": 0.036635265010943004, + "grad_norm": 2.1564714908599854, + "learning_rate": 4.983465612325715e-05, + "loss": 5.3583, + "step": 6160 + }, + { + "epoch": 0.036641212294224, + "grad_norm": 1.9411755800247192, + "learning_rate": 4.983460248639182e-05, + "loss": 5.4643, + "step": 6161 + }, + { + "epoch": 0.036647159577504994, + "grad_norm": 2.129741907119751, + "learning_rate": 4.983454884085699e-05, + "loss": 5.3834, + "step": 6162 + }, + { + "epoch": 0.036653106860785996, + "grad_norm": 2.12172269821167, + "learning_rate": 4.983449518665268e-05, + "loss": 5.4418, + "step": 6163 + }, + { + "epoch": 0.03665905414406699, + "grad_norm": 2.097452163696289, + "learning_rate": 4.9834441523778893e-05, + "loss": 5.3741, + "step": 6164 + }, + { + "epoch": 0.036665001427347986, + "grad_norm": 2.0458765029907227, + "learning_rate": 4.983438785223567e-05, + "loss": 5.373, + "step": 6165 + }, + { + "epoch": 0.03667094871062899, + "grad_norm": 1.9431376457214355, + "learning_rate": 4.983433417202301e-05, + "loss": 5.4003, + "step": 6166 + }, + { + "epoch": 0.03667689599390998, + "grad_norm": 2.136819362640381, + "learning_rate": 4.983428048314095e-05, + "loss": 5.503, + "step": 6167 + }, + { + "epoch": 0.03668284327719098, + "grad_norm": 1.863153338432312, + "learning_rate": 4.983422678558949e-05, + "loss": 5.4357, + "step": 6168 + }, + { + "epoch": 0.03668879056047197, + "grad_norm": 1.9198437929153442, + "learning_rate": 4.9834173079368665e-05, + "loss": 5.4304, + "step": 6169 + }, + { + "epoch": 0.036694737843752975, + "grad_norm": 1.9080480337142944, + "learning_rate": 4.9834119364478484e-05, + "loss": 5.4329, + "step": 6170 + }, + { + "epoch": 0.03670068512703397, + "grad_norm": 1.9116952419281006, + "learning_rate": 4.983406564091897e-05, + "loss": 5.3248, + "step": 6171 + }, + { + "epoch": 0.036706632410314965, + "grad_norm": 2.007685661315918, + "learning_rate": 4.983401190869014e-05, + "loss": 5.3554, + "step": 6172 + }, + { + "epoch": 0.03671257969359597, + "grad_norm": 1.8134535551071167, + "learning_rate": 4.983395816779201e-05, + "loss": 5.2907, + "step": 6173 + }, + { + "epoch": 0.03671852697687696, + "grad_norm": 2.093061685562134, + "learning_rate": 4.9833904418224606e-05, + "loss": 5.4055, + "step": 6174 + }, + { + "epoch": 0.03672447426015796, + "grad_norm": 2.1263599395751953, + "learning_rate": 4.9833850659987934e-05, + "loss": 5.2758, + "step": 6175 + }, + { + "epoch": 0.03673042154343896, + "grad_norm": 1.9442895650863647, + "learning_rate": 4.983379689308203e-05, + "loss": 5.4183, + "step": 6176 + }, + { + "epoch": 0.036736368826719955, + "grad_norm": 1.9587830305099487, + "learning_rate": 4.98337431175069e-05, + "loss": 5.3624, + "step": 6177 + }, + { + "epoch": 0.03674231611000095, + "grad_norm": 1.9845789670944214, + "learning_rate": 4.9833689333262565e-05, + "loss": 5.3933, + "step": 6178 + }, + { + "epoch": 0.03674826339328195, + "grad_norm": 1.9748643636703491, + "learning_rate": 4.9833635540349055e-05, + "loss": 5.5221, + "step": 6179 + }, + { + "epoch": 0.03675421067656295, + "grad_norm": 1.8139559030532837, + "learning_rate": 4.983358173876638e-05, + "loss": 5.5524, + "step": 6180 + }, + { + "epoch": 0.03676015795984394, + "grad_norm": 1.93784499168396, + "learning_rate": 4.9833527928514546e-05, + "loss": 5.7145, + "step": 6181 + }, + { + "epoch": 0.03676610524312494, + "grad_norm": 1.9064222574234009, + "learning_rate": 4.9833474109593594e-05, + "loss": 5.5283, + "step": 6182 + }, + { + "epoch": 0.03677205252640594, + "grad_norm": 1.7044670581817627, + "learning_rate": 4.9833420282003524e-05, + "loss": 5.2877, + "step": 6183 + }, + { + "epoch": 0.036777999809686934, + "grad_norm": 1.8328427076339722, + "learning_rate": 4.983336644574437e-05, + "loss": 5.5019, + "step": 6184 + }, + { + "epoch": 0.03678394709296793, + "grad_norm": 1.600780725479126, + "learning_rate": 4.983331260081614e-05, + "loss": 5.5347, + "step": 6185 + }, + { + "epoch": 0.03678989437624893, + "grad_norm": 1.8333978652954102, + "learning_rate": 4.983325874721886e-05, + "loss": 5.5127, + "step": 6186 + }, + { + "epoch": 0.036795841659529926, + "grad_norm": 1.8825682401657104, + "learning_rate": 4.9833204884952546e-05, + "loss": 5.5338, + "step": 6187 + }, + { + "epoch": 0.03680178894281092, + "grad_norm": 1.6875951290130615, + "learning_rate": 4.983315101401721e-05, + "loss": 5.2465, + "step": 6188 + }, + { + "epoch": 0.036807736226091924, + "grad_norm": 1.6224017143249512, + "learning_rate": 4.983309713441289e-05, + "loss": 5.4741, + "step": 6189 + }, + { + "epoch": 0.03681368350937292, + "grad_norm": 1.991721272468567, + "learning_rate": 4.983304324613958e-05, + "loss": 5.4547, + "step": 6190 + }, + { + "epoch": 0.036819630792653914, + "grad_norm": 1.843961238861084, + "learning_rate": 4.983298934919732e-05, + "loss": 5.3262, + "step": 6191 + }, + { + "epoch": 0.036825578075934916, + "grad_norm": 1.8342533111572266, + "learning_rate": 4.983293544358612e-05, + "loss": 5.6808, + "step": 6192 + }, + { + "epoch": 0.03683152535921591, + "grad_norm": 1.8796159029006958, + "learning_rate": 4.983288152930599e-05, + "loss": 5.5454, + "step": 6193 + }, + { + "epoch": 0.036837472642496906, + "grad_norm": 1.9033316373825073, + "learning_rate": 4.983282760635696e-05, + "loss": 5.3566, + "step": 6194 + }, + { + "epoch": 0.03684341992577791, + "grad_norm": 1.915873408317566, + "learning_rate": 4.9832773674739054e-05, + "loss": 5.4555, + "step": 6195 + }, + { + "epoch": 0.0368493672090589, + "grad_norm": 1.8510993719100952, + "learning_rate": 4.983271973445228e-05, + "loss": 5.5042, + "step": 6196 + }, + { + "epoch": 0.0368553144923399, + "grad_norm": 1.7180782556533813, + "learning_rate": 4.983266578549666e-05, + "loss": 5.4671, + "step": 6197 + }, + { + "epoch": 0.03686126177562089, + "grad_norm": 1.7828874588012695, + "learning_rate": 4.983261182787221e-05, + "loss": 5.4943, + "step": 6198 + }, + { + "epoch": 0.036867209058901895, + "grad_norm": 1.5032141208648682, + "learning_rate": 4.983255786157895e-05, + "loss": 5.3881, + "step": 6199 + }, + { + "epoch": 0.03687315634218289, + "grad_norm": 2.530954599380493, + "learning_rate": 4.983250388661691e-05, + "loss": 5.4449, + "step": 6200 + }, + { + "epoch": 0.036879103625463885, + "grad_norm": 2.011044979095459, + "learning_rate": 4.983244990298609e-05, + "loss": 5.2722, + "step": 6201 + }, + { + "epoch": 0.03688505090874489, + "grad_norm": 2.2209532260894775, + "learning_rate": 4.9832395910686525e-05, + "loss": 5.0932, + "step": 6202 + }, + { + "epoch": 0.03689099819202588, + "grad_norm": 1.8695623874664307, + "learning_rate": 4.983234190971823e-05, + "loss": 5.2891, + "step": 6203 + }, + { + "epoch": 0.03689694547530688, + "grad_norm": 2.172349691390991, + "learning_rate": 4.983228790008121e-05, + "loss": 5.578, + "step": 6204 + }, + { + "epoch": 0.03690289275858788, + "grad_norm": 2.1099209785461426, + "learning_rate": 4.9832233881775505e-05, + "loss": 5.3708, + "step": 6205 + }, + { + "epoch": 0.036908840041868875, + "grad_norm": 2.16737961769104, + "learning_rate": 4.9832179854801116e-05, + "loss": 5.303, + "step": 6206 + }, + { + "epoch": 0.03691478732514987, + "grad_norm": 2.248220682144165, + "learning_rate": 4.983212581915807e-05, + "loss": 5.362, + "step": 6207 + }, + { + "epoch": 0.03692073460843087, + "grad_norm": 2.0701045989990234, + "learning_rate": 4.983207177484639e-05, + "loss": 5.4528, + "step": 6208 + }, + { + "epoch": 0.03692668189171187, + "grad_norm": 1.9989019632339478, + "learning_rate": 4.983201772186609e-05, + "loss": 5.786, + "step": 6209 + }, + { + "epoch": 0.03693262917499286, + "grad_norm": 1.9126088619232178, + "learning_rate": 4.983196366021719e-05, + "loss": 5.2312, + "step": 6210 + }, + { + "epoch": 0.03693857645827386, + "grad_norm": 2.1317548751831055, + "learning_rate": 4.9831909589899695e-05, + "loss": 5.3028, + "step": 6211 + }, + { + "epoch": 0.03694452374155486, + "grad_norm": 2.164898157119751, + "learning_rate": 4.983185551091365e-05, + "loss": 5.3186, + "step": 6212 + }, + { + "epoch": 0.036950471024835854, + "grad_norm": 2.1085855960845947, + "learning_rate": 4.983180142325906e-05, + "loss": 5.3026, + "step": 6213 + }, + { + "epoch": 0.03695641830811685, + "grad_norm": 1.8321222066879272, + "learning_rate": 4.983174732693594e-05, + "loss": 5.6632, + "step": 6214 + }, + { + "epoch": 0.03696236559139785, + "grad_norm": 2.0537941455841064, + "learning_rate": 4.983169322194432e-05, + "loss": 5.2269, + "step": 6215 + }, + { + "epoch": 0.036968312874678846, + "grad_norm": 1.9598063230514526, + "learning_rate": 4.98316391082842e-05, + "loss": 5.4974, + "step": 6216 + }, + { + "epoch": 0.03697426015795984, + "grad_norm": 2.3764376640319824, + "learning_rate": 4.983158498595563e-05, + "loss": 5.7715, + "step": 6217 + }, + { + "epoch": 0.036980207441240844, + "grad_norm": 1.8938835859298706, + "learning_rate": 4.9831530854958595e-05, + "loss": 5.5577, + "step": 6218 + }, + { + "epoch": 0.03698615472452184, + "grad_norm": 2.2023189067840576, + "learning_rate": 4.9831476715293134e-05, + "loss": 5.2596, + "step": 6219 + }, + { + "epoch": 0.036992102007802834, + "grad_norm": 1.9010800123214722, + "learning_rate": 4.9831422566959266e-05, + "loss": 5.3313, + "step": 6220 + }, + { + "epoch": 0.036998049291083836, + "grad_norm": 1.9679474830627441, + "learning_rate": 4.9831368409957e-05, + "loss": 5.2701, + "step": 6221 + }, + { + "epoch": 0.03700399657436483, + "grad_norm": 1.903558373451233, + "learning_rate": 4.983131424428635e-05, + "loss": 5.2821, + "step": 6222 + }, + { + "epoch": 0.037009943857645826, + "grad_norm": 1.976114273071289, + "learning_rate": 4.983126006994736e-05, + "loss": 5.374, + "step": 6223 + }, + { + "epoch": 0.03701589114092683, + "grad_norm": 2.9803311824798584, + "learning_rate": 4.983120588694003e-05, + "loss": 5.3576, + "step": 6224 + }, + { + "epoch": 0.03702183842420782, + "grad_norm": 1.5921218395233154, + "learning_rate": 4.983115169526438e-05, + "loss": 5.1654, + "step": 6225 + }, + { + "epoch": 0.03702778570748882, + "grad_norm": 1.7458349466323853, + "learning_rate": 4.983109749492043e-05, + "loss": 5.1038, + "step": 6226 + }, + { + "epoch": 0.03703373299076981, + "grad_norm": 1.9425132274627686, + "learning_rate": 4.983104328590821e-05, + "loss": 5.3815, + "step": 6227 + }, + { + "epoch": 0.037039680274050815, + "grad_norm": 1.9506715536117554, + "learning_rate": 4.983098906822772e-05, + "loss": 5.2215, + "step": 6228 + }, + { + "epoch": 0.03704562755733181, + "grad_norm": 1.8596410751342773, + "learning_rate": 4.983093484187899e-05, + "loss": 5.2058, + "step": 6229 + }, + { + "epoch": 0.037051574840612805, + "grad_norm": 1.720473289489746, + "learning_rate": 4.9830880606862043e-05, + "loss": 5.2701, + "step": 6230 + }, + { + "epoch": 0.03705752212389381, + "grad_norm": 1.7786411046981812, + "learning_rate": 4.983082636317688e-05, + "loss": 5.3216, + "step": 6231 + }, + { + "epoch": 0.0370634694071748, + "grad_norm": 3.6291537284851074, + "learning_rate": 4.983077211082354e-05, + "loss": 5.2282, + "step": 6232 + }, + { + "epoch": 0.0370694166904558, + "grad_norm": 1.7453030347824097, + "learning_rate": 4.983071784980203e-05, + "loss": 5.2667, + "step": 6233 + }, + { + "epoch": 0.0370753639737368, + "grad_norm": 1.7036694288253784, + "learning_rate": 4.983066358011238e-05, + "loss": 5.3023, + "step": 6234 + }, + { + "epoch": 0.037081311257017795, + "grad_norm": 1.7196505069732666, + "learning_rate": 4.9830609301754595e-05, + "loss": 5.2211, + "step": 6235 + }, + { + "epoch": 0.03708725854029879, + "grad_norm": 3.4630305767059326, + "learning_rate": 4.983055501472871e-05, + "loss": 5.6159, + "step": 6236 + }, + { + "epoch": 0.03709320582357979, + "grad_norm": 2.9739367961883545, + "learning_rate": 4.9830500719034726e-05, + "loss": 5.4477, + "step": 6237 + }, + { + "epoch": 0.03709915310686079, + "grad_norm": 2.760664463043213, + "learning_rate": 4.983044641467267e-05, + "loss": 5.0879, + "step": 6238 + }, + { + "epoch": 0.03710510039014178, + "grad_norm": 2.166203022003174, + "learning_rate": 4.9830392101642566e-05, + "loss": 5.5635, + "step": 6239 + }, + { + "epoch": 0.03711104767342278, + "grad_norm": 2.3798410892486572, + "learning_rate": 4.9830337779944425e-05, + "loss": 5.0676, + "step": 6240 + }, + { + "epoch": 0.03711699495670378, + "grad_norm": 2.3990557193756104, + "learning_rate": 4.983028344957827e-05, + "loss": 5.2788, + "step": 6241 + }, + { + "epoch": 0.037122942239984774, + "grad_norm": 2.487978458404541, + "learning_rate": 4.9830229110544124e-05, + "loss": 5.852, + "step": 6242 + }, + { + "epoch": 0.03712888952326577, + "grad_norm": 2.304749011993408, + "learning_rate": 4.9830174762842e-05, + "loss": 6.0886, + "step": 6243 + }, + { + "epoch": 0.03713483680654677, + "grad_norm": 2.169614791870117, + "learning_rate": 4.983012040647191e-05, + "loss": 6.1178, + "step": 6244 + }, + { + "epoch": 0.037140784089827766, + "grad_norm": 2.119131326675415, + "learning_rate": 4.98300660414339e-05, + "loss": 6.25, + "step": 6245 + }, + { + "epoch": 0.03714673137310876, + "grad_norm": 2.3797547817230225, + "learning_rate": 4.9830011667727964e-05, + "loss": 5.879, + "step": 6246 + }, + { + "epoch": 0.03715267865638976, + "grad_norm": 2.303718328475952, + "learning_rate": 4.982995728535411e-05, + "loss": 6.0015, + "step": 6247 + }, + { + "epoch": 0.03715862593967076, + "grad_norm": 2.867103099822998, + "learning_rate": 4.9829902894312396e-05, + "loss": 5.8726, + "step": 6248 + }, + { + "epoch": 0.037164573222951754, + "grad_norm": 2.4248557090759277, + "learning_rate": 4.9829848494602806e-05, + "loss": 5.6579, + "step": 6249 + }, + { + "epoch": 0.037170520506232756, + "grad_norm": 2.2622148990631104, + "learning_rate": 4.982979408622538e-05, + "loss": 5.7677, + "step": 6250 + }, + { + "epoch": 0.03717646778951375, + "grad_norm": 2.320502996444702, + "learning_rate": 4.9829739669180126e-05, + "loss": 5.7362, + "step": 6251 + }, + { + "epoch": 0.037182415072794746, + "grad_norm": 2.2096636295318604, + "learning_rate": 4.9829685243467065e-05, + "loss": 5.9069, + "step": 6252 + }, + { + "epoch": 0.03718836235607575, + "grad_norm": 2.620361089706421, + "learning_rate": 4.982963080908623e-05, + "loss": 5.9419, + "step": 6253 + }, + { + "epoch": 0.03719430963935674, + "grad_norm": 2.478158950805664, + "learning_rate": 4.982957636603761e-05, + "loss": 6.4776, + "step": 6254 + }, + { + "epoch": 0.03720025692263774, + "grad_norm": 2.5912528038024902, + "learning_rate": 4.982952191432125e-05, + "loss": 5.7176, + "step": 6255 + }, + { + "epoch": 0.03720620420591873, + "grad_norm": 2.57177734375, + "learning_rate": 4.982946745393716e-05, + "loss": 5.4271, + "step": 6256 + }, + { + "epoch": 0.037212151489199735, + "grad_norm": 2.424567699432373, + "learning_rate": 4.982941298488535e-05, + "loss": 5.82, + "step": 6257 + }, + { + "epoch": 0.03721809877248073, + "grad_norm": 2.477827548980713, + "learning_rate": 4.9829358507165856e-05, + "loss": 5.7961, + "step": 6258 + }, + { + "epoch": 0.037224046055761725, + "grad_norm": 2.0598270893096924, + "learning_rate": 4.982930402077869e-05, + "loss": 5.9264, + "step": 6259 + }, + { + "epoch": 0.03722999333904273, + "grad_norm": 2.0599095821380615, + "learning_rate": 4.9829249525723875e-05, + "loss": 6.0518, + "step": 6260 + }, + { + "epoch": 0.03723594062232372, + "grad_norm": 2.110170841217041, + "learning_rate": 4.982919502200142e-05, + "loss": 5.8631, + "step": 6261 + }, + { + "epoch": 0.03724188790560472, + "grad_norm": 2.333972930908203, + "learning_rate": 4.982914050961135e-05, + "loss": 5.5361, + "step": 6262 + }, + { + "epoch": 0.03724783518888572, + "grad_norm": 2.2322769165039062, + "learning_rate": 4.982908598855369e-05, + "loss": 5.8002, + "step": 6263 + }, + { + "epoch": 0.037253782472166715, + "grad_norm": 1.9915717840194702, + "learning_rate": 4.982903145882845e-05, + "loss": 5.7096, + "step": 6264 + }, + { + "epoch": 0.03725972975544771, + "grad_norm": 2.2031619548797607, + "learning_rate": 4.9828976920435645e-05, + "loss": 5.5716, + "step": 6265 + }, + { + "epoch": 0.03726567703872871, + "grad_norm": 2.9422314167022705, + "learning_rate": 4.9828922373375295e-05, + "loss": 5.929, + "step": 6266 + }, + { + "epoch": 0.03727162432200971, + "grad_norm": 3.264784336090088, + "learning_rate": 4.982886781764744e-05, + "loss": 5.9801, + "step": 6267 + }, + { + "epoch": 0.0372775716052907, + "grad_norm": 2.8314197063446045, + "learning_rate": 4.982881325325208e-05, + "loss": 6.0173, + "step": 6268 + }, + { + "epoch": 0.0372835188885717, + "grad_norm": 2.9550328254699707, + "learning_rate": 4.9828758680189234e-05, + "loss": 5.9838, + "step": 6269 + }, + { + "epoch": 0.0372894661718527, + "grad_norm": 2.6827526092529297, + "learning_rate": 4.9828704098458924e-05, + "loss": 6.0235, + "step": 6270 + }, + { + "epoch": 0.037295413455133694, + "grad_norm": 2.7174222469329834, + "learning_rate": 4.982864950806118e-05, + "loss": 5.8315, + "step": 6271 + }, + { + "epoch": 0.03730136073841469, + "grad_norm": 2.6177315711975098, + "learning_rate": 4.9828594908996e-05, + "loss": 5.8577, + "step": 6272 + }, + { + "epoch": 0.03730730802169569, + "grad_norm": 2.449669361114502, + "learning_rate": 4.982854030126342e-05, + "loss": 5.9591, + "step": 6273 + }, + { + "epoch": 0.037313255304976686, + "grad_norm": 2.5328989028930664, + "learning_rate": 4.9828485684863446e-05, + "loss": 5.7764, + "step": 6274 + }, + { + "epoch": 0.03731920258825768, + "grad_norm": 2.2581989765167236, + "learning_rate": 4.982843105979611e-05, + "loss": 5.9524, + "step": 6275 + }, + { + "epoch": 0.03732514987153868, + "grad_norm": 2.261212110519409, + "learning_rate": 4.982837642606142e-05, + "loss": 5.5814, + "step": 6276 + }, + { + "epoch": 0.03733109715481968, + "grad_norm": 2.2957348823547363, + "learning_rate": 4.98283217836594e-05, + "loss": 5.6967, + "step": 6277 + }, + { + "epoch": 0.037337044438100674, + "grad_norm": 2.814037322998047, + "learning_rate": 4.982826713259008e-05, + "loss": 5.8787, + "step": 6278 + }, + { + "epoch": 0.037342991721381676, + "grad_norm": 2.678133249282837, + "learning_rate": 4.9828212472853464e-05, + "loss": 5.94, + "step": 6279 + }, + { + "epoch": 0.03734893900466267, + "grad_norm": 2.2949652671813965, + "learning_rate": 4.982815780444957e-05, + "loss": 5.7263, + "step": 6280 + }, + { + "epoch": 0.037354886287943666, + "grad_norm": 2.4542131423950195, + "learning_rate": 4.982810312737842e-05, + "loss": 5.8317, + "step": 6281 + }, + { + "epoch": 0.03736083357122467, + "grad_norm": 2.7850544452667236, + "learning_rate": 4.982804844164005e-05, + "loss": 5.5631, + "step": 6282 + }, + { + "epoch": 0.03736678085450566, + "grad_norm": 2.6285061836242676, + "learning_rate": 4.9827993747234454e-05, + "loss": 5.6212, + "step": 6283 + }, + { + "epoch": 0.03737272813778666, + "grad_norm": 2.602590799331665, + "learning_rate": 4.9827939044161666e-05, + "loss": 5.5529, + "step": 6284 + }, + { + "epoch": 0.03737867542106765, + "grad_norm": 2.6196670532226562, + "learning_rate": 4.98278843324217e-05, + "loss": 5.6915, + "step": 6285 + }, + { + "epoch": 0.037384622704348655, + "grad_norm": 2.7072317600250244, + "learning_rate": 4.982782961201457e-05, + "loss": 5.7535, + "step": 6286 + }, + { + "epoch": 0.03739056998762965, + "grad_norm": 2.626033067703247, + "learning_rate": 4.982777488294031e-05, + "loss": 5.6053, + "step": 6287 + }, + { + "epoch": 0.037396517270910645, + "grad_norm": 1.8426648378372192, + "learning_rate": 4.982772014519892e-05, + "loss": 5.6167, + "step": 6288 + }, + { + "epoch": 0.03740246455419165, + "grad_norm": 2.5587830543518066, + "learning_rate": 4.9827665398790445e-05, + "loss": 5.6442, + "step": 6289 + }, + { + "epoch": 0.03740841183747264, + "grad_norm": 2.6163039207458496, + "learning_rate": 4.9827610643714877e-05, + "loss": 5.699, + "step": 6290 + }, + { + "epoch": 0.03741435912075364, + "grad_norm": 2.5752358436584473, + "learning_rate": 4.982755587997225e-05, + "loss": 5.666, + "step": 6291 + }, + { + "epoch": 0.03742030640403464, + "grad_norm": 2.6609575748443604, + "learning_rate": 4.982750110756258e-05, + "loss": 5.5634, + "step": 6292 + }, + { + "epoch": 0.037426253687315635, + "grad_norm": 2.724731683731079, + "learning_rate": 4.9827446326485884e-05, + "loss": 5.6259, + "step": 6293 + }, + { + "epoch": 0.03743220097059663, + "grad_norm": 2.5849807262420654, + "learning_rate": 4.9827391536742185e-05, + "loss": 5.6182, + "step": 6294 + }, + { + "epoch": 0.03743814825387763, + "grad_norm": 2.6737449169158936, + "learning_rate": 4.9827336738331496e-05, + "loss": 5.5426, + "step": 6295 + }, + { + "epoch": 0.03744409553715863, + "grad_norm": 2.5739669799804688, + "learning_rate": 4.9827281931253844e-05, + "loss": 5.6283, + "step": 6296 + }, + { + "epoch": 0.03745004282043962, + "grad_norm": 2.652730703353882, + "learning_rate": 4.982722711550924e-05, + "loss": 5.5241, + "step": 6297 + }, + { + "epoch": 0.037455990103720624, + "grad_norm": 2.7140653133392334, + "learning_rate": 4.982717229109772e-05, + "loss": 5.7052, + "step": 6298 + }, + { + "epoch": 0.03746193738700162, + "grad_norm": 2.1617860794067383, + "learning_rate": 4.982711745801928e-05, + "loss": 5.6224, + "step": 6299 + }, + { + "epoch": 0.037467884670282614, + "grad_norm": 2.1400585174560547, + "learning_rate": 4.982706261627395e-05, + "loss": 5.5753, + "step": 6300 + }, + { + "epoch": 0.03747383195356361, + "grad_norm": 2.4439101219177246, + "learning_rate": 4.9827007765861754e-05, + "loss": 5.6219, + "step": 6301 + }, + { + "epoch": 0.03747977923684461, + "grad_norm": 2.507141351699829, + "learning_rate": 4.9826952906782697e-05, + "loss": 5.6666, + "step": 6302 + }, + { + "epoch": 0.037485726520125606, + "grad_norm": 2.2664029598236084, + "learning_rate": 4.982689803903682e-05, + "loss": 5.7792, + "step": 6303 + }, + { + "epoch": 0.0374916738034066, + "grad_norm": 2.49678635597229, + "learning_rate": 4.982684316262411e-05, + "loss": 5.5899, + "step": 6304 + }, + { + "epoch": 0.0374976210866876, + "grad_norm": 2.244603395462036, + "learning_rate": 4.9826788277544625e-05, + "loss": 5.4624, + "step": 6305 + }, + { + "epoch": 0.0375035683699686, + "grad_norm": 2.144343376159668, + "learning_rate": 4.9826733383798366e-05, + "loss": 5.3428, + "step": 6306 + }, + { + "epoch": 0.037509515653249594, + "grad_norm": 1.7709565162658691, + "learning_rate": 4.982667848138534e-05, + "loss": 5.3596, + "step": 6307 + }, + { + "epoch": 0.037515462936530596, + "grad_norm": 2.0245232582092285, + "learning_rate": 4.9826623570305574e-05, + "loss": 5.4005, + "step": 6308 + }, + { + "epoch": 0.03752141021981159, + "grad_norm": 2.5346829891204834, + "learning_rate": 4.9826568650559095e-05, + "loss": 5.5089, + "step": 6309 + }, + { + "epoch": 0.037527357503092586, + "grad_norm": 2.638684034347534, + "learning_rate": 4.982651372214592e-05, + "loss": 5.6847, + "step": 6310 + }, + { + "epoch": 0.03753330478637359, + "grad_norm": 2.024423122406006, + "learning_rate": 4.982645878506606e-05, + "loss": 5.3633, + "step": 6311 + }, + { + "epoch": 0.03753925206965458, + "grad_norm": 1.983167290687561, + "learning_rate": 4.982640383931955e-05, + "loss": 5.2086, + "step": 6312 + }, + { + "epoch": 0.03754519935293558, + "grad_norm": 1.8388524055480957, + "learning_rate": 4.982634888490639e-05, + "loss": 5.1904, + "step": 6313 + }, + { + "epoch": 0.03755114663621657, + "grad_norm": 1.8280584812164307, + "learning_rate": 4.982629392182661e-05, + "loss": 5.3072, + "step": 6314 + }, + { + "epoch": 0.037557093919497575, + "grad_norm": 1.6278408765792847, + "learning_rate": 4.982623895008023e-05, + "loss": 5.3003, + "step": 6315 + }, + { + "epoch": 0.03756304120277857, + "grad_norm": 2.0519096851348877, + "learning_rate": 4.982618396966726e-05, + "loss": 5.3494, + "step": 6316 + }, + { + "epoch": 0.037568988486059565, + "grad_norm": 1.935744285583496, + "learning_rate": 4.982612898058773e-05, + "loss": 5.6993, + "step": 6317 + }, + { + "epoch": 0.03757493576934057, + "grad_norm": 1.882163166999817, + "learning_rate": 4.9826073982841656e-05, + "loss": 5.758, + "step": 6318 + }, + { + "epoch": 0.03758088305262156, + "grad_norm": 1.7747882604599, + "learning_rate": 4.982601897642906e-05, + "loss": 5.1501, + "step": 6319 + }, + { + "epoch": 0.03758683033590256, + "grad_norm": 2.044093370437622, + "learning_rate": 4.982596396134995e-05, + "loss": 5.2801, + "step": 6320 + }, + { + "epoch": 0.03759277761918356, + "grad_norm": 1.739441990852356, + "learning_rate": 4.9825908937604346e-05, + "loss": 5.1619, + "step": 6321 + }, + { + "epoch": 0.037598724902464555, + "grad_norm": 2.0353312492370605, + "learning_rate": 4.982585390519229e-05, + "loss": 5.6796, + "step": 6322 + }, + { + "epoch": 0.03760467218574555, + "grad_norm": 2.076667308807373, + "learning_rate": 4.9825798864113774e-05, + "loss": 6.2522, + "step": 6323 + }, + { + "epoch": 0.03761061946902655, + "grad_norm": 2.773676633834839, + "learning_rate": 4.982574381436883e-05, + "loss": 5.879, + "step": 6324 + }, + { + "epoch": 0.03761656675230755, + "grad_norm": 2.2013933658599854, + "learning_rate": 4.982568875595748e-05, + "loss": 6.0341, + "step": 6325 + }, + { + "epoch": 0.03762251403558854, + "grad_norm": 2.288806915283203, + "learning_rate": 4.9825633688879736e-05, + "loss": 6.219, + "step": 6326 + }, + { + "epoch": 0.037628461318869544, + "grad_norm": 2.874372720718384, + "learning_rate": 4.982557861313561e-05, + "loss": 5.7616, + "step": 6327 + }, + { + "epoch": 0.03763440860215054, + "grad_norm": 2.7471537590026855, + "learning_rate": 4.982552352872515e-05, + "loss": 5.7214, + "step": 6328 + }, + { + "epoch": 0.037640355885431534, + "grad_norm": 2.475513458251953, + "learning_rate": 4.982546843564834e-05, + "loss": 6.0039, + "step": 6329 + }, + { + "epoch": 0.03764630316871253, + "grad_norm": 2.5376412868499756, + "learning_rate": 4.982541333390523e-05, + "loss": 6.3042, + "step": 6330 + }, + { + "epoch": 0.03765225045199353, + "grad_norm": 2.599989414215088, + "learning_rate": 4.9825358223495814e-05, + "loss": 6.488, + "step": 6331 + }, + { + "epoch": 0.037658197735274526, + "grad_norm": 2.2657089233398438, + "learning_rate": 4.9825303104420115e-05, + "loss": 6.2743, + "step": 6332 + }, + { + "epoch": 0.03766414501855552, + "grad_norm": 2.303926467895508, + "learning_rate": 4.982524797667818e-05, + "loss": 6.3888, + "step": 6333 + }, + { + "epoch": 0.03767009230183652, + "grad_norm": 2.771775007247925, + "learning_rate": 4.982519284026999e-05, + "loss": 6.0911, + "step": 6334 + }, + { + "epoch": 0.03767603958511752, + "grad_norm": 2.492748260498047, + "learning_rate": 4.982513769519559e-05, + "loss": 5.9905, + "step": 6335 + }, + { + "epoch": 0.03768198686839851, + "grad_norm": 2.294985771179199, + "learning_rate": 4.982508254145498e-05, + "loss": 6.4574, + "step": 6336 + }, + { + "epoch": 0.037687934151679515, + "grad_norm": 2.6514554023742676, + "learning_rate": 4.9825027379048205e-05, + "loss": 6.1541, + "step": 6337 + }, + { + "epoch": 0.03769388143496051, + "grad_norm": 2.0114963054656982, + "learning_rate": 4.982497220797526e-05, + "loss": 6.0602, + "step": 6338 + }, + { + "epoch": 0.037699828718241506, + "grad_norm": 2.6345295906066895, + "learning_rate": 4.982491702823618e-05, + "loss": 6.024, + "step": 6339 + }, + { + "epoch": 0.03770577600152251, + "grad_norm": 2.619980573654175, + "learning_rate": 4.982486183983097e-05, + "loss": 6.0642, + "step": 6340 + }, + { + "epoch": 0.0377117232848035, + "grad_norm": 2.491279125213623, + "learning_rate": 4.9824806642759664e-05, + "loss": 5.8517, + "step": 6341 + }, + { + "epoch": 0.0377176705680845, + "grad_norm": 2.5161385536193848, + "learning_rate": 4.982475143702227e-05, + "loss": 5.7467, + "step": 6342 + }, + { + "epoch": 0.03772361785136549, + "grad_norm": 2.3237602710723877, + "learning_rate": 4.982469622261882e-05, + "loss": 5.801, + "step": 6343 + }, + { + "epoch": 0.037729565134646495, + "grad_norm": 2.21382999420166, + "learning_rate": 4.9824640999549314e-05, + "loss": 5.968, + "step": 6344 + }, + { + "epoch": 0.03773551241792749, + "grad_norm": 2.1770498752593994, + "learning_rate": 4.9824585767813794e-05, + "loss": 6.2998, + "step": 6345 + }, + { + "epoch": 0.037741459701208485, + "grad_norm": 2.321563720703125, + "learning_rate": 4.982453052741225e-05, + "loss": 5.631, + "step": 6346 + }, + { + "epoch": 0.03774740698448949, + "grad_norm": 3.2769439220428467, + "learning_rate": 4.982447527834473e-05, + "loss": 5.4845, + "step": 6347 + }, + { + "epoch": 0.03775335426777048, + "grad_norm": 2.954331874847412, + "learning_rate": 4.9824420020611244e-05, + "loss": 5.2, + "step": 6348 + }, + { + "epoch": 0.03775930155105148, + "grad_norm": 2.735182523727417, + "learning_rate": 4.98243647542118e-05, + "loss": 5.1907, + "step": 6349 + }, + { + "epoch": 0.03776524883433248, + "grad_norm": 2.872142791748047, + "learning_rate": 4.982430947914644e-05, + "loss": 5.5159, + "step": 6350 + }, + { + "epoch": 0.037771196117613474, + "grad_norm": 3.14219331741333, + "learning_rate": 4.982425419541517e-05, + "loss": 5.0843, + "step": 6351 + }, + { + "epoch": 0.03777714340089447, + "grad_norm": 2.2689874172210693, + "learning_rate": 4.9824198903018e-05, + "loss": 6.0446, + "step": 6352 + }, + { + "epoch": 0.03778309068417547, + "grad_norm": 2.3468856811523438, + "learning_rate": 4.982414360195496e-05, + "loss": 5.952, + "step": 6353 + }, + { + "epoch": 0.03778903796745647, + "grad_norm": 2.944509983062744, + "learning_rate": 4.9824088292226065e-05, + "loss": 5.4918, + "step": 6354 + }, + { + "epoch": 0.03779498525073746, + "grad_norm": 2.8139286041259766, + "learning_rate": 4.982403297383135e-05, + "loss": 5.3296, + "step": 6355 + }, + { + "epoch": 0.037800932534018464, + "grad_norm": 2.540224552154541, + "learning_rate": 4.982397764677081e-05, + "loss": 5.3464, + "step": 6356 + }, + { + "epoch": 0.03780687981729946, + "grad_norm": 2.56709885597229, + "learning_rate": 4.982392231104448e-05, + "loss": 5.2313, + "step": 6357 + }, + { + "epoch": 0.037812827100580454, + "grad_norm": 2.2051165103912354, + "learning_rate": 4.982386696665238e-05, + "loss": 5.7783, + "step": 6358 + }, + { + "epoch": 0.03781877438386145, + "grad_norm": 2.5773870944976807, + "learning_rate": 4.9823811613594515e-05, + "loss": 5.6691, + "step": 6359 + }, + { + "epoch": 0.03782472166714245, + "grad_norm": 2.5163073539733887, + "learning_rate": 4.982375625187092e-05, + "loss": 5.7936, + "step": 6360 + }, + { + "epoch": 0.037830668950423446, + "grad_norm": 2.4268851280212402, + "learning_rate": 4.98237008814816e-05, + "loss": 5.8116, + "step": 6361 + }, + { + "epoch": 0.03783661623370444, + "grad_norm": 2.397402286529541, + "learning_rate": 4.9823645502426597e-05, + "loss": 5.9895, + "step": 6362 + }, + { + "epoch": 0.03784256351698544, + "grad_norm": 2.590672731399536, + "learning_rate": 4.98235901147059e-05, + "loss": 5.9022, + "step": 6363 + }, + { + "epoch": 0.03784851080026644, + "grad_norm": 2.268540859222412, + "learning_rate": 4.9823534718319557e-05, + "loss": 5.8958, + "step": 6364 + }, + { + "epoch": 0.03785445808354743, + "grad_norm": 2.1419460773468018, + "learning_rate": 4.982347931326757e-05, + "loss": 5.8446, + "step": 6365 + }, + { + "epoch": 0.037860405366828435, + "grad_norm": 2.3988053798675537, + "learning_rate": 4.9823423899549957e-05, + "loss": 6.2267, + "step": 6366 + }, + { + "epoch": 0.03786635265010943, + "grad_norm": 2.120121955871582, + "learning_rate": 4.9823368477166755e-05, + "loss": 6.1352, + "step": 6367 + }, + { + "epoch": 0.037872299933390426, + "grad_norm": 2.274610996246338, + "learning_rate": 4.982331304611796e-05, + "loss": 6.1342, + "step": 6368 + }, + { + "epoch": 0.03787824721667143, + "grad_norm": 1.6934765577316284, + "learning_rate": 4.98232576064036e-05, + "loss": 5.7969, + "step": 6369 + }, + { + "epoch": 0.03788419449995242, + "grad_norm": 2.62416672706604, + "learning_rate": 4.982320215802371e-05, + "loss": 5.9669, + "step": 6370 + }, + { + "epoch": 0.03789014178323342, + "grad_norm": 2.416639804840088, + "learning_rate": 4.98231467009783e-05, + "loss": 5.9628, + "step": 6371 + }, + { + "epoch": 0.03789608906651441, + "grad_norm": 2.049412965774536, + "learning_rate": 4.9823091235267375e-05, + "loss": 5.658, + "step": 6372 + }, + { + "epoch": 0.037902036349795415, + "grad_norm": 2.0502147674560547, + "learning_rate": 4.982303576089097e-05, + "loss": 5.9114, + "step": 6373 + }, + { + "epoch": 0.03790798363307641, + "grad_norm": 2.1566948890686035, + "learning_rate": 4.982298027784909e-05, + "loss": 5.6932, + "step": 6374 + }, + { + "epoch": 0.037913930916357405, + "grad_norm": 2.394083261489868, + "learning_rate": 4.9822924786141774e-05, + "loss": 6.3041, + "step": 6375 + }, + { + "epoch": 0.03791987819963841, + "grad_norm": 2.545910120010376, + "learning_rate": 4.9822869285769024e-05, + "loss": 6.2125, + "step": 6376 + }, + { + "epoch": 0.0379258254829194, + "grad_norm": 2.271461248397827, + "learning_rate": 4.9822813776730875e-05, + "loss": 6.2322, + "step": 6377 + }, + { + "epoch": 0.0379317727662004, + "grad_norm": 2.3840630054473877, + "learning_rate": 4.9822758259027336e-05, + "loss": 6.0167, + "step": 6378 + }, + { + "epoch": 0.0379377200494814, + "grad_norm": 2.600618600845337, + "learning_rate": 4.9822702732658426e-05, + "loss": 5.6722, + "step": 6379 + }, + { + "epoch": 0.037943667332762394, + "grad_norm": 2.0911965370178223, + "learning_rate": 4.982264719762417e-05, + "loss": 5.579, + "step": 6380 + }, + { + "epoch": 0.03794961461604339, + "grad_norm": 2.015505075454712, + "learning_rate": 4.9822591653924575e-05, + "loss": 5.9747, + "step": 6381 + }, + { + "epoch": 0.03795556189932439, + "grad_norm": 2.237262010574341, + "learning_rate": 4.982253610155968e-05, + "loss": 6.3792, + "step": 6382 + }, + { + "epoch": 0.03796150918260539, + "grad_norm": 2.1448137760162354, + "learning_rate": 4.982248054052949e-05, + "loss": 6.1049, + "step": 6383 + }, + { + "epoch": 0.03796745646588638, + "grad_norm": 2.2597758769989014, + "learning_rate": 4.9822424970834034e-05, + "loss": 5.8428, + "step": 6384 + }, + { + "epoch": 0.037973403749167384, + "grad_norm": 1.9935969114303589, + "learning_rate": 4.982236939247332e-05, + "loss": 6.0032, + "step": 6385 + }, + { + "epoch": 0.03797935103244838, + "grad_norm": 2.506916046142578, + "learning_rate": 4.982231380544737e-05, + "loss": 5.9221, + "step": 6386 + }, + { + "epoch": 0.037985298315729374, + "grad_norm": 2.083393096923828, + "learning_rate": 4.9822258209756214e-05, + "loss": 5.8862, + "step": 6387 + }, + { + "epoch": 0.03799124559901037, + "grad_norm": 2.631091594696045, + "learning_rate": 4.982220260539987e-05, + "loss": 5.6593, + "step": 6388 + }, + { + "epoch": 0.03799719288229137, + "grad_norm": 2.5732531547546387, + "learning_rate": 4.982214699237834e-05, + "loss": 5.5084, + "step": 6389 + }, + { + "epoch": 0.038003140165572366, + "grad_norm": 2.7797791957855225, + "learning_rate": 4.982209137069166e-05, + "loss": 5.6792, + "step": 6390 + }, + { + "epoch": 0.03800908744885336, + "grad_norm": 2.2800772190093994, + "learning_rate": 4.982203574033984e-05, + "loss": 5.6299, + "step": 6391 + }, + { + "epoch": 0.03801503473213436, + "grad_norm": 2.4182863235473633, + "learning_rate": 4.9821980101322905e-05, + "loss": 5.71, + "step": 6392 + }, + { + "epoch": 0.03802098201541536, + "grad_norm": 2.2968835830688477, + "learning_rate": 4.982192445364088e-05, + "loss": 5.6112, + "step": 6393 + }, + { + "epoch": 0.03802692929869635, + "grad_norm": 2.3713324069976807, + "learning_rate": 4.982186879729377e-05, + "loss": 5.423, + "step": 6394 + }, + { + "epoch": 0.038032876581977355, + "grad_norm": 2.745352268218994, + "learning_rate": 4.98218131322816e-05, + "loss": 5.5145, + "step": 6395 + }, + { + "epoch": 0.03803882386525835, + "grad_norm": 2.755211353302002, + "learning_rate": 4.98217574586044e-05, + "loss": 5.4399, + "step": 6396 + }, + { + "epoch": 0.038044771148539346, + "grad_norm": 2.5452096462249756, + "learning_rate": 4.982170177626217e-05, + "loss": 5.5691, + "step": 6397 + }, + { + "epoch": 0.03805071843182035, + "grad_norm": 2.6195876598358154, + "learning_rate": 4.9821646085254954e-05, + "loss": 5.4512, + "step": 6398 + }, + { + "epoch": 0.03805666571510134, + "grad_norm": 2.4931671619415283, + "learning_rate": 4.982159038558275e-05, + "loss": 6.0505, + "step": 6399 + }, + { + "epoch": 0.03806261299838234, + "grad_norm": 2.45062255859375, + "learning_rate": 4.982153467724558e-05, + "loss": 6.2367, + "step": 6400 + }, + { + "epoch": 0.03806856028166333, + "grad_norm": 2.688624620437622, + "learning_rate": 4.982147896024348e-05, + "loss": 6.0522, + "step": 6401 + }, + { + "epoch": 0.038074507564944335, + "grad_norm": 2.421660900115967, + "learning_rate": 4.982142323457645e-05, + "loss": 5.8166, + "step": 6402 + }, + { + "epoch": 0.03808045484822533, + "grad_norm": 2.594134569168091, + "learning_rate": 4.982136750024452e-05, + "loss": 5.5476, + "step": 6403 + }, + { + "epoch": 0.038086402131506325, + "grad_norm": 2.4492971897125244, + "learning_rate": 4.982131175724771e-05, + "loss": 5.2302, + "step": 6404 + }, + { + "epoch": 0.03809234941478733, + "grad_norm": 2.4200360774993896, + "learning_rate": 4.9821256005586036e-05, + "loss": 6.1404, + "step": 6405 + }, + { + "epoch": 0.03809829669806832, + "grad_norm": 2.1949775218963623, + "learning_rate": 4.982120024525951e-05, + "loss": 5.9589, + "step": 6406 + }, + { + "epoch": 0.03810424398134932, + "grad_norm": 2.3570375442504883, + "learning_rate": 4.9821144476268164e-05, + "loss": 5.9022, + "step": 6407 + }, + { + "epoch": 0.03811019126463032, + "grad_norm": 2.16460919380188, + "learning_rate": 4.9821088698612016e-05, + "loss": 5.8535, + "step": 6408 + }, + { + "epoch": 0.038116138547911314, + "grad_norm": 1.8189443349838257, + "learning_rate": 4.982103291229108e-05, + "loss": 5.9345, + "step": 6409 + }, + { + "epoch": 0.03812208583119231, + "grad_norm": 2.553919792175293, + "learning_rate": 4.9820977117305376e-05, + "loss": 5.31, + "step": 6410 + }, + { + "epoch": 0.03812803311447331, + "grad_norm": 2.8085403442382812, + "learning_rate": 4.982092131365493e-05, + "loss": 4.9902, + "step": 6411 + }, + { + "epoch": 0.03813398039775431, + "grad_norm": 2.3698999881744385, + "learning_rate": 4.982086550133976e-05, + "loss": 5.4982, + "step": 6412 + }, + { + "epoch": 0.0381399276810353, + "grad_norm": 1.996026873588562, + "learning_rate": 4.9820809680359876e-05, + "loss": 5.6556, + "step": 6413 + }, + { + "epoch": 0.038145874964316304, + "grad_norm": 2.0816900730133057, + "learning_rate": 4.9820753850715305e-05, + "loss": 5.8823, + "step": 6414 + }, + { + "epoch": 0.0381518222475973, + "grad_norm": 2.282745122909546, + "learning_rate": 4.982069801240606e-05, + "loss": 5.1641, + "step": 6415 + }, + { + "epoch": 0.038157769530878294, + "grad_norm": 2.043991804122925, + "learning_rate": 4.982064216543217e-05, + "loss": 5.7569, + "step": 6416 + }, + { + "epoch": 0.03816371681415929, + "grad_norm": 2.086071014404297, + "learning_rate": 4.982058630979365e-05, + "loss": 5.9586, + "step": 6417 + }, + { + "epoch": 0.03816966409744029, + "grad_norm": 2.295060873031616, + "learning_rate": 4.9820530445490525e-05, + "loss": 5.3733, + "step": 6418 + }, + { + "epoch": 0.038175611380721286, + "grad_norm": 2.512267827987671, + "learning_rate": 4.98204745725228e-05, + "loss": 5.0399, + "step": 6419 + }, + { + "epoch": 0.03818155866400228, + "grad_norm": 2.5434467792510986, + "learning_rate": 4.982041869089051e-05, + "loss": 4.7907, + "step": 6420 + }, + { + "epoch": 0.03818750594728328, + "grad_norm": 2.4192142486572266, + "learning_rate": 4.9820362800593666e-05, + "loss": 4.9116, + "step": 6421 + }, + { + "epoch": 0.03819345323056428, + "grad_norm": 2.867542028427124, + "learning_rate": 4.9820306901632296e-05, + "loss": 5.9905, + "step": 6422 + }, + { + "epoch": 0.03819940051384527, + "grad_norm": 2.3099327087402344, + "learning_rate": 4.982025099400641e-05, + "loss": 5.9319, + "step": 6423 + }, + { + "epoch": 0.038205347797126275, + "grad_norm": 2.28169584274292, + "learning_rate": 4.9820195077716026e-05, + "loss": 6.2533, + "step": 6424 + }, + { + "epoch": 0.03821129508040727, + "grad_norm": 2.1065595149993896, + "learning_rate": 4.9820139152761167e-05, + "loss": 5.7123, + "step": 6425 + }, + { + "epoch": 0.038217242363688265, + "grad_norm": 2.0210213661193848, + "learning_rate": 4.9820083219141865e-05, + "loss": 5.7758, + "step": 6426 + }, + { + "epoch": 0.03822318964696927, + "grad_norm": 1.6545369625091553, + "learning_rate": 4.9820027276858114e-05, + "loss": 5.6792, + "step": 6427 + }, + { + "epoch": 0.03822913693025026, + "grad_norm": 2.177621841430664, + "learning_rate": 4.981997132590996e-05, + "loss": 6.0167, + "step": 6428 + }, + { + "epoch": 0.03823508421353126, + "grad_norm": 2.3910553455352783, + "learning_rate": 4.981991536629741e-05, + "loss": 6.1161, + "step": 6429 + }, + { + "epoch": 0.03824103149681225, + "grad_norm": 2.4915859699249268, + "learning_rate": 4.981985939802047e-05, + "loss": 5.6449, + "step": 6430 + }, + { + "epoch": 0.038246978780093255, + "grad_norm": 2.0343215465545654, + "learning_rate": 4.981980342107919e-05, + "loss": 5.967, + "step": 6431 + }, + { + "epoch": 0.03825292606337425, + "grad_norm": 1.8326199054718018, + "learning_rate": 4.9819747435473565e-05, + "loss": 5.9183, + "step": 6432 + }, + { + "epoch": 0.038258873346655245, + "grad_norm": 2.1482350826263428, + "learning_rate": 4.981969144120362e-05, + "loss": 5.794, + "step": 6433 + }, + { + "epoch": 0.03826482062993625, + "grad_norm": 2.346355438232422, + "learning_rate": 4.9819635438269384e-05, + "loss": 5.6775, + "step": 6434 + }, + { + "epoch": 0.03827076791321724, + "grad_norm": 2.252150774002075, + "learning_rate": 4.981957942667087e-05, + "loss": 5.9383, + "step": 6435 + }, + { + "epoch": 0.03827671519649824, + "grad_norm": 2.1851654052734375, + "learning_rate": 4.981952340640809e-05, + "loss": 6.0555, + "step": 6436 + }, + { + "epoch": 0.03828266247977924, + "grad_norm": 2.0609381198883057, + "learning_rate": 4.9819467377481076e-05, + "loss": 6.3209, + "step": 6437 + }, + { + "epoch": 0.038288609763060234, + "grad_norm": 2.4882800579071045, + "learning_rate": 4.981941133988984e-05, + "loss": 6.2411, + "step": 6438 + }, + { + "epoch": 0.03829455704634123, + "grad_norm": 1.8794118165969849, + "learning_rate": 4.981935529363441e-05, + "loss": 5.5696, + "step": 6439 + }, + { + "epoch": 0.03830050432962223, + "grad_norm": 2.542656660079956, + "learning_rate": 4.981929923871479e-05, + "loss": 5.8106, + "step": 6440 + }, + { + "epoch": 0.038306451612903226, + "grad_norm": 2.3871288299560547, + "learning_rate": 4.981924317513101e-05, + "loss": 5.6354, + "step": 6441 + }, + { + "epoch": 0.03831239889618422, + "grad_norm": 2.4628939628601074, + "learning_rate": 4.981918710288309e-05, + "loss": 5.9695, + "step": 6442 + }, + { + "epoch": 0.038318346179465224, + "grad_norm": 2.908543586730957, + "learning_rate": 4.9819131021971056e-05, + "loss": 5.2742, + "step": 6443 + }, + { + "epoch": 0.03832429346274622, + "grad_norm": 3.353813886642456, + "learning_rate": 4.9819074932394916e-05, + "loss": 5.3823, + "step": 6444 + }, + { + "epoch": 0.038330240746027214, + "grad_norm": 2.5253870487213135, + "learning_rate": 4.981901883415469e-05, + "loss": 5.7, + "step": 6445 + }, + { + "epoch": 0.03833618802930821, + "grad_norm": 2.3375632762908936, + "learning_rate": 4.98189627272504e-05, + "loss": 5.2862, + "step": 6446 + }, + { + "epoch": 0.03834213531258921, + "grad_norm": 2.534599542617798, + "learning_rate": 4.981890661168207e-05, + "loss": 5.3961, + "step": 6447 + }, + { + "epoch": 0.038348082595870206, + "grad_norm": 2.383511781692505, + "learning_rate": 4.9818850487449716e-05, + "loss": 6.4658, + "step": 6448 + }, + { + "epoch": 0.0383540298791512, + "grad_norm": 2.2824161052703857, + "learning_rate": 4.981879435455336e-05, + "loss": 5.5221, + "step": 6449 + }, + { + "epoch": 0.0383599771624322, + "grad_norm": 2.355271100997925, + "learning_rate": 4.981873821299301e-05, + "loss": 5.5054, + "step": 6450 + }, + { + "epoch": 0.0383659244457132, + "grad_norm": 2.0071253776550293, + "learning_rate": 4.981868206276871e-05, + "loss": 5.5911, + "step": 6451 + }, + { + "epoch": 0.03837187172899419, + "grad_norm": 2.2770705223083496, + "learning_rate": 4.9818625903880445e-05, + "loss": 5.8978, + "step": 6452 + }, + { + "epoch": 0.038377819012275195, + "grad_norm": 2.2425332069396973, + "learning_rate": 4.981856973632827e-05, + "loss": 6.3189, + "step": 6453 + }, + { + "epoch": 0.03838376629555619, + "grad_norm": 2.300560235977173, + "learning_rate": 4.981851356011218e-05, + "loss": 5.745, + "step": 6454 + }, + { + "epoch": 0.038389713578837185, + "grad_norm": 2.4516983032226562, + "learning_rate": 4.981845737523221e-05, + "loss": 5.8978, + "step": 6455 + }, + { + "epoch": 0.03839566086211819, + "grad_norm": 2.3463354110717773, + "learning_rate": 4.981840118168837e-05, + "loss": 5.668, + "step": 6456 + }, + { + "epoch": 0.03840160814539918, + "grad_norm": 2.623608112335205, + "learning_rate": 4.981834497948068e-05, + "loss": 5.471, + "step": 6457 + }, + { + "epoch": 0.03840755542868018, + "grad_norm": 2.441089391708374, + "learning_rate": 4.9818288768609166e-05, + "loss": 5.0986, + "step": 6458 + }, + { + "epoch": 0.03841350271196117, + "grad_norm": 2.597635507583618, + "learning_rate": 4.981823254907384e-05, + "loss": 5.1046, + "step": 6459 + }, + { + "epoch": 0.038419449995242175, + "grad_norm": 2.344855785369873, + "learning_rate": 4.9818176320874727e-05, + "loss": 5.8878, + "step": 6460 + }, + { + "epoch": 0.03842539727852317, + "grad_norm": 2.2569222450256348, + "learning_rate": 4.981812008401184e-05, + "loss": 5.342, + "step": 6461 + }, + { + "epoch": 0.038431344561804165, + "grad_norm": 2.276780843734741, + "learning_rate": 4.981806383848522e-05, + "loss": 5.566, + "step": 6462 + }, + { + "epoch": 0.03843729184508517, + "grad_norm": 2.1354174613952637, + "learning_rate": 4.9818007584294856e-05, + "loss": 5.8678, + "step": 6463 + }, + { + "epoch": 0.03844323912836616, + "grad_norm": 2.164092779159546, + "learning_rate": 4.981795132144078e-05, + "loss": 5.7937, + "step": 6464 + }, + { + "epoch": 0.03844918641164716, + "grad_norm": 2.3034324645996094, + "learning_rate": 4.981789504992303e-05, + "loss": 5.843, + "step": 6465 + }, + { + "epoch": 0.03845513369492816, + "grad_norm": 1.9616999626159668, + "learning_rate": 4.9817838769741584e-05, + "loss": 6.0563, + "step": 6466 + }, + { + "epoch": 0.038461080978209154, + "grad_norm": 2.2784626483917236, + "learning_rate": 4.9817782480896505e-05, + "loss": 6.4152, + "step": 6467 + }, + { + "epoch": 0.03846702826149015, + "grad_norm": 1.8581526279449463, + "learning_rate": 4.981772618338779e-05, + "loss": 5.9833, + "step": 6468 + }, + { + "epoch": 0.03847297554477115, + "grad_norm": 2.2493395805358887, + "learning_rate": 4.9817669877215466e-05, + "loss": 6.2985, + "step": 6469 + }, + { + "epoch": 0.038478922828052146, + "grad_norm": 2.289125919342041, + "learning_rate": 4.981761356237955e-05, + "loss": 5.8555, + "step": 6470 + }, + { + "epoch": 0.03848487011133314, + "grad_norm": 2.11012601852417, + "learning_rate": 4.981755723888006e-05, + "loss": 6.6137, + "step": 6471 + }, + { + "epoch": 0.038490817394614144, + "grad_norm": 2.1793103218078613, + "learning_rate": 4.981750090671702e-05, + "loss": 6.0117, + "step": 6472 + }, + { + "epoch": 0.03849676467789514, + "grad_norm": 2.1857750415802, + "learning_rate": 4.9817444565890436e-05, + "loss": 5.9877, + "step": 6473 + }, + { + "epoch": 0.038502711961176134, + "grad_norm": 1.7430874109268188, + "learning_rate": 4.981738821640035e-05, + "loss": 5.829, + "step": 6474 + }, + { + "epoch": 0.03850865924445713, + "grad_norm": 1.8017771244049072, + "learning_rate": 4.981733185824676e-05, + "loss": 6.3853, + "step": 6475 + }, + { + "epoch": 0.03851460652773813, + "grad_norm": 2.1420724391937256, + "learning_rate": 4.9817275491429705e-05, + "loss": 5.982, + "step": 6476 + }, + { + "epoch": 0.038520553811019126, + "grad_norm": 2.441521167755127, + "learning_rate": 4.9817219115949195e-05, + "loss": 6.1159, + "step": 6477 + }, + { + "epoch": 0.03852650109430012, + "grad_norm": 2.158682346343994, + "learning_rate": 4.9817162731805246e-05, + "loss": 6.1306, + "step": 6478 + }, + { + "epoch": 0.03853244837758112, + "grad_norm": 2.154538869857788, + "learning_rate": 4.9817106338997884e-05, + "loss": 6.0745, + "step": 6479 + }, + { + "epoch": 0.03853839566086212, + "grad_norm": 2.077674388885498, + "learning_rate": 4.981704993752713e-05, + "loss": 6.2171, + "step": 6480 + }, + { + "epoch": 0.03854434294414311, + "grad_norm": 2.181500196456909, + "learning_rate": 4.981699352739299e-05, + "loss": 6.228, + "step": 6481 + }, + { + "epoch": 0.038550290227424115, + "grad_norm": 2.678189992904663, + "learning_rate": 4.98169371085955e-05, + "loss": 5.965, + "step": 6482 + }, + { + "epoch": 0.03855623751070511, + "grad_norm": 2.713480234146118, + "learning_rate": 4.981688068113467e-05, + "loss": 5.9078, + "step": 6483 + }, + { + "epoch": 0.038562184793986105, + "grad_norm": 2.4872853755950928, + "learning_rate": 4.981682424501053e-05, + "loss": 5.7525, + "step": 6484 + }, + { + "epoch": 0.03856813207726711, + "grad_norm": 2.274711847305298, + "learning_rate": 4.98167678002231e-05, + "loss": 5.9193, + "step": 6485 + }, + { + "epoch": 0.0385740793605481, + "grad_norm": 2.4730162620544434, + "learning_rate": 4.981671134677238e-05, + "loss": 6.2961, + "step": 6486 + }, + { + "epoch": 0.0385800266438291, + "grad_norm": 1.7856062650680542, + "learning_rate": 4.9816654884658396e-05, + "loss": 5.9005, + "step": 6487 + }, + { + "epoch": 0.03858597392711009, + "grad_norm": 1.8812140226364136, + "learning_rate": 4.981659841388119e-05, + "loss": 5.9428, + "step": 6488 + }, + { + "epoch": 0.038591921210391095, + "grad_norm": 1.9963254928588867, + "learning_rate": 4.9816541934440756e-05, + "loss": 6.0136, + "step": 6489 + }, + { + "epoch": 0.03859786849367209, + "grad_norm": 2.741892099380493, + "learning_rate": 4.981648544633713e-05, + "loss": 6.5065, + "step": 6490 + }, + { + "epoch": 0.038603815776953085, + "grad_norm": 2.226672410964966, + "learning_rate": 4.981642894957032e-05, + "loss": 5.9705, + "step": 6491 + }, + { + "epoch": 0.03860976306023409, + "grad_norm": 2.015429973602295, + "learning_rate": 4.981637244414036e-05, + "loss": 6.1418, + "step": 6492 + }, + { + "epoch": 0.03861571034351508, + "grad_norm": 2.032304286956787, + "learning_rate": 4.981631593004725e-05, + "loss": 6.2104, + "step": 6493 + }, + { + "epoch": 0.03862165762679608, + "grad_norm": 2.0174217224121094, + "learning_rate": 4.981625940729102e-05, + "loss": 5.9861, + "step": 6494 + }, + { + "epoch": 0.03862760491007708, + "grad_norm": 1.9466323852539062, + "learning_rate": 4.98162028758717e-05, + "loss": 6.0958, + "step": 6495 + }, + { + "epoch": 0.038633552193358074, + "grad_norm": 1.6796106100082397, + "learning_rate": 4.9816146335789296e-05, + "loss": 6.0708, + "step": 6496 + }, + { + "epoch": 0.03863949947663907, + "grad_norm": 2.0496580600738525, + "learning_rate": 4.9816089787043826e-05, + "loss": 6.0137, + "step": 6497 + }, + { + "epoch": 0.03864544675992007, + "grad_norm": 2.5402488708496094, + "learning_rate": 4.9816033229635324e-05, + "loss": 6.1389, + "step": 6498 + }, + { + "epoch": 0.038651394043201066, + "grad_norm": 2.2701938152313232, + "learning_rate": 4.9815976663563795e-05, + "loss": 6.1277, + "step": 6499 + }, + { + "epoch": 0.03865734132648206, + "grad_norm": 2.328554630279541, + "learning_rate": 4.9815920088829273e-05, + "loss": 6.0402, + "step": 6500 + }, + { + "epoch": 0.038663288609763063, + "grad_norm": 2.1817965507507324, + "learning_rate": 4.981586350543176e-05, + "loss": 6.2732, + "step": 6501 + }, + { + "epoch": 0.03866923589304406, + "grad_norm": 2.4273757934570312, + "learning_rate": 4.981580691337129e-05, + "loss": 6.1842, + "step": 6502 + }, + { + "epoch": 0.038675183176325054, + "grad_norm": 2.1365530490875244, + "learning_rate": 4.981575031264787e-05, + "loss": 6.1527, + "step": 6503 + }, + { + "epoch": 0.03868113045960605, + "grad_norm": 2.2198991775512695, + "learning_rate": 4.981569370326154e-05, + "loss": 6.0841, + "step": 6504 + }, + { + "epoch": 0.03868707774288705, + "grad_norm": 2.0078141689300537, + "learning_rate": 4.98156370852123e-05, + "loss": 6.0401, + "step": 6505 + }, + { + "epoch": 0.038693025026168046, + "grad_norm": 2.0243566036224365, + "learning_rate": 4.9815580458500184e-05, + "loss": 5.9111, + "step": 6506 + }, + { + "epoch": 0.03869897230944904, + "grad_norm": 2.3084707260131836, + "learning_rate": 4.98155238231252e-05, + "loss": 5.9865, + "step": 6507 + }, + { + "epoch": 0.03870491959273004, + "grad_norm": 1.8110517263412476, + "learning_rate": 4.981546717908738e-05, + "loss": 5.9132, + "step": 6508 + }, + { + "epoch": 0.03871086687601104, + "grad_norm": 2.2639706134796143, + "learning_rate": 4.981541052638673e-05, + "loss": 5.8195, + "step": 6509 + }, + { + "epoch": 0.03871681415929203, + "grad_norm": 2.2684152126312256, + "learning_rate": 4.981535386502327e-05, + "loss": 6.4894, + "step": 6510 + }, + { + "epoch": 0.038722761442573035, + "grad_norm": 2.363118886947632, + "learning_rate": 4.981529719499704e-05, + "loss": 6.1888, + "step": 6511 + }, + { + "epoch": 0.03872870872585403, + "grad_norm": 2.2158865928649902, + "learning_rate": 4.9815240516308045e-05, + "loss": 6.3361, + "step": 6512 + }, + { + "epoch": 0.038734656009135025, + "grad_norm": 2.096928834915161, + "learning_rate": 4.98151838289563e-05, + "loss": 5.8554, + "step": 6513 + }, + { + "epoch": 0.03874060329241603, + "grad_norm": 2.2228331565856934, + "learning_rate": 4.981512713294183e-05, + "loss": 5.9961, + "step": 6514 + }, + { + "epoch": 0.03874655057569702, + "grad_norm": 1.8646903038024902, + "learning_rate": 4.981507042826466e-05, + "loss": 6.1471, + "step": 6515 + }, + { + "epoch": 0.03875249785897802, + "grad_norm": 2.227267265319824, + "learning_rate": 4.98150137149248e-05, + "loss": 5.9655, + "step": 6516 + }, + { + "epoch": 0.03875844514225902, + "grad_norm": 2.6884701251983643, + "learning_rate": 4.981495699292228e-05, + "loss": 5.7958, + "step": 6517 + }, + { + "epoch": 0.038764392425540015, + "grad_norm": 2.953523635864258, + "learning_rate": 4.981490026225711e-05, + "loss": 5.8305, + "step": 6518 + }, + { + "epoch": 0.03877033970882101, + "grad_norm": 2.5009984970092773, + "learning_rate": 4.981484352292932e-05, + "loss": 5.7838, + "step": 6519 + }, + { + "epoch": 0.038776286992102005, + "grad_norm": 2.2291715145111084, + "learning_rate": 4.981478677493892e-05, + "loss": 5.7622, + "step": 6520 + }, + { + "epoch": 0.03878223427538301, + "grad_norm": 2.1492466926574707, + "learning_rate": 4.9814730018285935e-05, + "loss": 5.5379, + "step": 6521 + }, + { + "epoch": 0.038788181558664, + "grad_norm": 1.8914062976837158, + "learning_rate": 4.981467325297039e-05, + "loss": 5.8368, + "step": 6522 + }, + { + "epoch": 0.038794128841945, + "grad_norm": 2.301670789718628, + "learning_rate": 4.981461647899229e-05, + "loss": 5.9019, + "step": 6523 + }, + { + "epoch": 0.038800076125226, + "grad_norm": 2.2850520610809326, + "learning_rate": 4.981455969635167e-05, + "loss": 5.6616, + "step": 6524 + }, + { + "epoch": 0.038806023408506994, + "grad_norm": 2.4155313968658447, + "learning_rate": 4.9814502905048546e-05, + "loss": 5.7842, + "step": 6525 + }, + { + "epoch": 0.03881197069178799, + "grad_norm": 2.0731799602508545, + "learning_rate": 4.981444610508293e-05, + "loss": 6.084, + "step": 6526 + }, + { + "epoch": 0.03881791797506899, + "grad_norm": 2.990232229232788, + "learning_rate": 4.981438929645484e-05, + "loss": 5.2556, + "step": 6527 + }, + { + "epoch": 0.038823865258349986, + "grad_norm": 3.0814263820648193, + "learning_rate": 4.981433247916432e-05, + "loss": 5.1895, + "step": 6528 + }, + { + "epoch": 0.03882981254163098, + "grad_norm": 3.197000503540039, + "learning_rate": 4.9814275653211365e-05, + "loss": 4.9539, + "step": 6529 + }, + { + "epoch": 0.03883575982491198, + "grad_norm": 3.062098979949951, + "learning_rate": 4.9814218818596e-05, + "loss": 4.8417, + "step": 6530 + }, + { + "epoch": 0.03884170710819298, + "grad_norm": 3.092667579650879, + "learning_rate": 4.981416197531825e-05, + "loss": 5.0479, + "step": 6531 + }, + { + "epoch": 0.038847654391473974, + "grad_norm": 3.00508713722229, + "learning_rate": 4.981410512337813e-05, + "loss": 5.864, + "step": 6532 + }, + { + "epoch": 0.03885360167475497, + "grad_norm": 3.3760926723480225, + "learning_rate": 4.981404826277567e-05, + "loss": 6.5745, + "step": 6533 + }, + { + "epoch": 0.03885954895803597, + "grad_norm": 2.6170921325683594, + "learning_rate": 4.981399139351087e-05, + "loss": 5.7959, + "step": 6534 + }, + { + "epoch": 0.038865496241316966, + "grad_norm": 2.9855849742889404, + "learning_rate": 4.981393451558377e-05, + "loss": 4.9118, + "step": 6535 + }, + { + "epoch": 0.03887144352459796, + "grad_norm": 2.885373830795288, + "learning_rate": 4.981387762899438e-05, + "loss": 4.8342, + "step": 6536 + }, + { + "epoch": 0.03887739080787896, + "grad_norm": 2.6936960220336914, + "learning_rate": 4.981382073374272e-05, + "loss": 4.7323, + "step": 6537 + }, + { + "epoch": 0.03888333809115996, + "grad_norm": 2.7214853763580322, + "learning_rate": 4.981376382982882e-05, + "loss": 5.5414, + "step": 6538 + }, + { + "epoch": 0.03888928537444095, + "grad_norm": 2.449828863143921, + "learning_rate": 4.981370691725269e-05, + "loss": 5.6385, + "step": 6539 + }, + { + "epoch": 0.038895232657721955, + "grad_norm": 2.551046133041382, + "learning_rate": 4.981364999601434e-05, + "loss": 5.4699, + "step": 6540 + }, + { + "epoch": 0.03890117994100295, + "grad_norm": 2.1208136081695557, + "learning_rate": 4.981359306611381e-05, + "loss": 5.6674, + "step": 6541 + }, + { + "epoch": 0.038907127224283945, + "grad_norm": 2.4039392471313477, + "learning_rate": 4.9813536127551105e-05, + "loss": 6.1872, + "step": 6542 + }, + { + "epoch": 0.03891307450756495, + "grad_norm": 2.0119946002960205, + "learning_rate": 4.9813479180326256e-05, + "loss": 6.0917, + "step": 6543 + }, + { + "epoch": 0.03891902179084594, + "grad_norm": 3.2959303855895996, + "learning_rate": 4.9813422224439275e-05, + "loss": 5.5646, + "step": 6544 + }, + { + "epoch": 0.03892496907412694, + "grad_norm": 2.9011316299438477, + "learning_rate": 4.981336525989019e-05, + "loss": 5.5324, + "step": 6545 + }, + { + "epoch": 0.03893091635740794, + "grad_norm": 2.2984118461608887, + "learning_rate": 4.981330828667901e-05, + "loss": 5.4961, + "step": 6546 + }, + { + "epoch": 0.038936863640688935, + "grad_norm": 2.1745059490203857, + "learning_rate": 4.981325130480576e-05, + "loss": 5.6631, + "step": 6547 + }, + { + "epoch": 0.03894281092396993, + "grad_norm": 2.3001794815063477, + "learning_rate": 4.981319431427046e-05, + "loss": 5.5897, + "step": 6548 + }, + { + "epoch": 0.038948758207250925, + "grad_norm": 2.329446315765381, + "learning_rate": 4.9813137315073136e-05, + "loss": 5.4599, + "step": 6549 + }, + { + "epoch": 0.03895470549053193, + "grad_norm": 2.4700307846069336, + "learning_rate": 4.98130803072138e-05, + "loss": 5.2788, + "step": 6550 + }, + { + "epoch": 0.03896065277381292, + "grad_norm": 2.309767484664917, + "learning_rate": 4.9813023290692467e-05, + "loss": 5.3828, + "step": 6551 + }, + { + "epoch": 0.03896660005709392, + "grad_norm": 2.1923089027404785, + "learning_rate": 4.981296626550917e-05, + "loss": 5.225, + "step": 6552 + }, + { + "epoch": 0.03897254734037492, + "grad_norm": 2.424954652786255, + "learning_rate": 4.981290923166392e-05, + "loss": 5.2007, + "step": 6553 + }, + { + "epoch": 0.038978494623655914, + "grad_norm": 2.53446102142334, + "learning_rate": 4.981285218915674e-05, + "loss": 5.142, + "step": 6554 + }, + { + "epoch": 0.03898444190693691, + "grad_norm": 2.492788791656494, + "learning_rate": 4.9812795137987655e-05, + "loss": 5.5755, + "step": 6555 + }, + { + "epoch": 0.03899038919021791, + "grad_norm": 2.8081278800964355, + "learning_rate": 4.9812738078156674e-05, + "loss": 4.9815, + "step": 6556 + }, + { + "epoch": 0.038996336473498906, + "grad_norm": 2.535109758377075, + "learning_rate": 4.981268100966383e-05, + "loss": 5.3678, + "step": 6557 + }, + { + "epoch": 0.0390022837567799, + "grad_norm": 2.36004900932312, + "learning_rate": 4.981262393250913e-05, + "loss": 5.0422, + "step": 6558 + }, + { + "epoch": 0.0390082310400609, + "grad_norm": 2.2315657138824463, + "learning_rate": 4.98125668466926e-05, + "loss": 5.0345, + "step": 6559 + }, + { + "epoch": 0.0390141783233419, + "grad_norm": 2.293947696685791, + "learning_rate": 4.981250975221425e-05, + "loss": 4.9308, + "step": 6560 + }, + { + "epoch": 0.039020125606622894, + "grad_norm": 2.239915132522583, + "learning_rate": 4.9812452649074124e-05, + "loss": 5.3504, + "step": 6561 + }, + { + "epoch": 0.03902607288990389, + "grad_norm": 1.8740140199661255, + "learning_rate": 4.981239553727222e-05, + "loss": 5.9432, + "step": 6562 + }, + { + "epoch": 0.03903202017318489, + "grad_norm": 1.7221744060516357, + "learning_rate": 4.981233841680857e-05, + "loss": 5.8387, + "step": 6563 + }, + { + "epoch": 0.039037967456465886, + "grad_norm": 1.9648221731185913, + "learning_rate": 4.981228128768318e-05, + "loss": 5.7836, + "step": 6564 + }, + { + "epoch": 0.03904391473974688, + "grad_norm": 1.7790826559066772, + "learning_rate": 4.981222414989608e-05, + "loss": 5.842, + "step": 6565 + }, + { + "epoch": 0.03904986202302788, + "grad_norm": 2.039483070373535, + "learning_rate": 4.9812167003447296e-05, + "loss": 5.6509, + "step": 6566 + }, + { + "epoch": 0.03905580930630888, + "grad_norm": 2.1241865158081055, + "learning_rate": 4.981210984833684e-05, + "loss": 5.5626, + "step": 6567 + }, + { + "epoch": 0.03906175658958987, + "grad_norm": 2.1290524005889893, + "learning_rate": 4.981205268456473e-05, + "loss": 5.5114, + "step": 6568 + }, + { + "epoch": 0.039067703872870875, + "grad_norm": 2.181558132171631, + "learning_rate": 4.981199551213099e-05, + "loss": 5.5356, + "step": 6569 + }, + { + "epoch": 0.03907365115615187, + "grad_norm": 2.1696360111236572, + "learning_rate": 4.9811938331035635e-05, + "loss": 5.5684, + "step": 6570 + }, + { + "epoch": 0.039079598439432865, + "grad_norm": 1.8040674924850464, + "learning_rate": 4.98118811412787e-05, + "loss": 5.605, + "step": 6571 + }, + { + "epoch": 0.03908554572271387, + "grad_norm": 2.4475252628326416, + "learning_rate": 4.981182394286018e-05, + "loss": 6.4733, + "step": 6572 + }, + { + "epoch": 0.03909149300599486, + "grad_norm": 2.0800678730010986, + "learning_rate": 4.981176673578011e-05, + "loss": 5.5613, + "step": 6573 + }, + { + "epoch": 0.03909744028927586, + "grad_norm": 1.7632306814193726, + "learning_rate": 4.981170952003852e-05, + "loss": 5.5971, + "step": 6574 + }, + { + "epoch": 0.03910338757255686, + "grad_norm": 1.6671072244644165, + "learning_rate": 4.981165229563541e-05, + "loss": 5.4462, + "step": 6575 + }, + { + "epoch": 0.039109334855837855, + "grad_norm": 1.8972923755645752, + "learning_rate": 4.981159506257081e-05, + "loss": 5.7747, + "step": 6576 + }, + { + "epoch": 0.03911528213911885, + "grad_norm": 1.8343021869659424, + "learning_rate": 4.981153782084473e-05, + "loss": 5.7542, + "step": 6577 + }, + { + "epoch": 0.039121229422399845, + "grad_norm": 1.669877529144287, + "learning_rate": 4.9811480570457216e-05, + "loss": 5.6736, + "step": 6578 + }, + { + "epoch": 0.03912717670568085, + "grad_norm": 1.9555165767669678, + "learning_rate": 4.981142331140825e-05, + "loss": 5.2997, + "step": 6579 + }, + { + "epoch": 0.03913312398896184, + "grad_norm": 2.5131587982177734, + "learning_rate": 4.981136604369789e-05, + "loss": 5.2093, + "step": 6580 + }, + { + "epoch": 0.03913907127224284, + "grad_norm": 2.0637567043304443, + "learning_rate": 4.9811308767326134e-05, + "loss": 5.1671, + "step": 6581 + }, + { + "epoch": 0.03914501855552384, + "grad_norm": 2.140839099884033, + "learning_rate": 4.9811251482293e-05, + "loss": 5.3237, + "step": 6582 + }, + { + "epoch": 0.039150965838804834, + "grad_norm": 1.968489408493042, + "learning_rate": 4.981119418859852e-05, + "loss": 5.6015, + "step": 6583 + }, + { + "epoch": 0.03915691312208583, + "grad_norm": 1.873827338218689, + "learning_rate": 4.9811136886242705e-05, + "loss": 5.3316, + "step": 6584 + }, + { + "epoch": 0.03916286040536683, + "grad_norm": 1.9897359609603882, + "learning_rate": 4.981107957522558e-05, + "loss": 5.1548, + "step": 6585 + }, + { + "epoch": 0.039168807688647826, + "grad_norm": 2.004457950592041, + "learning_rate": 4.9811022255547165e-05, + "loss": 5.1977, + "step": 6586 + }, + { + "epoch": 0.03917475497192882, + "grad_norm": 2.1058437824249268, + "learning_rate": 4.9810964927207485e-05, + "loss": 5.0217, + "step": 6587 + }, + { + "epoch": 0.03918070225520982, + "grad_norm": 1.9846851825714111, + "learning_rate": 4.981090759020654e-05, + "loss": 5.1123, + "step": 6588 + }, + { + "epoch": 0.03918664953849082, + "grad_norm": 2.018026828765869, + "learning_rate": 4.981085024454437e-05, + "loss": 5.0516, + "step": 6589 + }, + { + "epoch": 0.039192596821771813, + "grad_norm": 1.7792260646820068, + "learning_rate": 4.9810792890220995e-05, + "loss": 5.5266, + "step": 6590 + }, + { + "epoch": 0.03919854410505281, + "grad_norm": 2.0855109691619873, + "learning_rate": 4.981073552723642e-05, + "loss": 5.5504, + "step": 6591 + }, + { + "epoch": 0.03920449138833381, + "grad_norm": 1.9998018741607666, + "learning_rate": 4.9810678155590676e-05, + "loss": 5.3447, + "step": 6592 + }, + { + "epoch": 0.039210438671614806, + "grad_norm": 2.332714557647705, + "learning_rate": 4.981062077528377e-05, + "loss": 5.6166, + "step": 6593 + }, + { + "epoch": 0.0392163859548958, + "grad_norm": 1.9647892713546753, + "learning_rate": 4.981056338631575e-05, + "loss": 5.0113, + "step": 6594 + }, + { + "epoch": 0.0392223332381768, + "grad_norm": 1.9961154460906982, + "learning_rate": 4.9810505988686604e-05, + "loss": 5.0143, + "step": 6595 + }, + { + "epoch": 0.0392282805214578, + "grad_norm": 1.9039133787155151, + "learning_rate": 4.981044858239637e-05, + "loss": 5.3602, + "step": 6596 + }, + { + "epoch": 0.03923422780473879, + "grad_norm": 1.9076604843139648, + "learning_rate": 4.981039116744507e-05, + "loss": 5.4165, + "step": 6597 + }, + { + "epoch": 0.039240175088019795, + "grad_norm": 1.6676216125488281, + "learning_rate": 4.981033374383272e-05, + "loss": 5.4018, + "step": 6598 + }, + { + "epoch": 0.03924612237130079, + "grad_norm": 1.7158783674240112, + "learning_rate": 4.981027631155933e-05, + "loss": 5.3233, + "step": 6599 + }, + { + "epoch": 0.039252069654581785, + "grad_norm": 1.6659481525421143, + "learning_rate": 4.9810218870624945e-05, + "loss": 5.4671, + "step": 6600 + }, + { + "epoch": 0.03925801693786279, + "grad_norm": 2.008171319961548, + "learning_rate": 4.981016142102956e-05, + "loss": 5.6424, + "step": 6601 + }, + { + "epoch": 0.03926396422114378, + "grad_norm": 2.213045835494995, + "learning_rate": 4.9810103962773204e-05, + "loss": 5.419, + "step": 6602 + }, + { + "epoch": 0.03926991150442478, + "grad_norm": 2.0159718990325928, + "learning_rate": 4.981004649585589e-05, + "loss": 5.4301, + "step": 6603 + }, + { + "epoch": 0.03927585878770578, + "grad_norm": 1.982701063156128, + "learning_rate": 4.9809989020277646e-05, + "loss": 5.6001, + "step": 6604 + }, + { + "epoch": 0.039281806070986774, + "grad_norm": 2.1933834552764893, + "learning_rate": 4.98099315360385e-05, + "loss": 5.6756, + "step": 6605 + }, + { + "epoch": 0.03928775335426777, + "grad_norm": 1.858798623085022, + "learning_rate": 4.980987404313846e-05, + "loss": 5.43, + "step": 6606 + }, + { + "epoch": 0.039293700637548765, + "grad_norm": 1.8233433961868286, + "learning_rate": 4.980981654157755e-05, + "loss": 5.4638, + "step": 6607 + }, + { + "epoch": 0.03929964792082977, + "grad_norm": 2.0368216037750244, + "learning_rate": 4.9809759031355784e-05, + "loss": 5.71, + "step": 6608 + }, + { + "epoch": 0.03930559520411076, + "grad_norm": 1.9923310279846191, + "learning_rate": 4.9809701512473196e-05, + "loss": 5.6443, + "step": 6609 + }, + { + "epoch": 0.03931154248739176, + "grad_norm": 2.391463279724121, + "learning_rate": 4.9809643984929785e-05, + "loss": 5.4701, + "step": 6610 + }, + { + "epoch": 0.03931748977067276, + "grad_norm": 1.8456658124923706, + "learning_rate": 4.98095864487256e-05, + "loss": 5.4346, + "step": 6611 + }, + { + "epoch": 0.039323437053953754, + "grad_norm": 1.7941107749938965, + "learning_rate": 4.980952890386063e-05, + "loss": 5.4198, + "step": 6612 + }, + { + "epoch": 0.03932938433723475, + "grad_norm": 1.8455369472503662, + "learning_rate": 4.980947135033492e-05, + "loss": 5.3915, + "step": 6613 + }, + { + "epoch": 0.03933533162051575, + "grad_norm": 1.8710846900939941, + "learning_rate": 4.980941378814847e-05, + "loss": 5.2744, + "step": 6614 + }, + { + "epoch": 0.039341278903796746, + "grad_norm": 2.203129768371582, + "learning_rate": 4.980935621730132e-05, + "loss": 5.4409, + "step": 6615 + }, + { + "epoch": 0.03934722618707774, + "grad_norm": 1.8944141864776611, + "learning_rate": 4.980929863779348e-05, + "loss": 5.4661, + "step": 6616 + }, + { + "epoch": 0.03935317347035874, + "grad_norm": 1.8268091678619385, + "learning_rate": 4.9809241049624966e-05, + "loss": 5.4088, + "step": 6617 + }, + { + "epoch": 0.03935912075363974, + "grad_norm": 1.838927984237671, + "learning_rate": 4.98091834527958e-05, + "loss": 5.5335, + "step": 6618 + }, + { + "epoch": 0.03936506803692073, + "grad_norm": 1.8441804647445679, + "learning_rate": 4.9809125847306e-05, + "loss": 5.4639, + "step": 6619 + }, + { + "epoch": 0.03937101532020173, + "grad_norm": 2.012754440307617, + "learning_rate": 4.980906823315561e-05, + "loss": 5.5606, + "step": 6620 + }, + { + "epoch": 0.03937696260348273, + "grad_norm": 1.8358973264694214, + "learning_rate": 4.980901061034461e-05, + "loss": 5.4217, + "step": 6621 + }, + { + "epoch": 0.039382909886763726, + "grad_norm": 2.0668959617614746, + "learning_rate": 4.980895297887305e-05, + "loss": 5.5164, + "step": 6622 + }, + { + "epoch": 0.03938885717004472, + "grad_norm": 2.032320976257324, + "learning_rate": 4.9808895338740934e-05, + "loss": 5.4914, + "step": 6623 + }, + { + "epoch": 0.03939480445332572, + "grad_norm": 1.8650145530700684, + "learning_rate": 4.980883768994829e-05, + "loss": 5.3718, + "step": 6624 + }, + { + "epoch": 0.03940075173660672, + "grad_norm": 4.494358539581299, + "learning_rate": 4.980878003249515e-05, + "loss": 5.5253, + "step": 6625 + }, + { + "epoch": 0.03940669901988771, + "grad_norm": 1.9295374155044556, + "learning_rate": 4.980872236638151e-05, + "loss": 5.3187, + "step": 6626 + }, + { + "epoch": 0.039412646303168715, + "grad_norm": 2.089717388153076, + "learning_rate": 4.980866469160741e-05, + "loss": 5.5311, + "step": 6627 + }, + { + "epoch": 0.03941859358644971, + "grad_norm": 1.701429843902588, + "learning_rate": 4.980860700817285e-05, + "loss": 5.4529, + "step": 6628 + }, + { + "epoch": 0.039424540869730705, + "grad_norm": 1.8336073160171509, + "learning_rate": 4.980854931607787e-05, + "loss": 5.2987, + "step": 6629 + }, + { + "epoch": 0.03943048815301171, + "grad_norm": 2.7922565937042236, + "learning_rate": 4.9808491615322475e-05, + "loss": 5.3492, + "step": 6630 + }, + { + "epoch": 0.0394364354362927, + "grad_norm": 1.8253742456436157, + "learning_rate": 4.980843390590669e-05, + "loss": 5.3928, + "step": 6631 + }, + { + "epoch": 0.0394423827195737, + "grad_norm": 2.646916151046753, + "learning_rate": 4.980837618783055e-05, + "loss": 5.4329, + "step": 6632 + }, + { + "epoch": 0.0394483300028547, + "grad_norm": 2.1956236362457275, + "learning_rate": 4.980831846109405e-05, + "loss": 5.4794, + "step": 6633 + }, + { + "epoch": 0.039454277286135694, + "grad_norm": 2.7274577617645264, + "learning_rate": 4.980826072569723e-05, + "loss": 5.9666, + "step": 6634 + }, + { + "epoch": 0.03946022456941669, + "grad_norm": 1.9890350103378296, + "learning_rate": 4.98082029816401e-05, + "loss": 5.5518, + "step": 6635 + }, + { + "epoch": 0.039466171852697685, + "grad_norm": 2.7760517597198486, + "learning_rate": 4.980814522892268e-05, + "loss": 5.2777, + "step": 6636 + }, + { + "epoch": 0.03947211913597869, + "grad_norm": 2.035254716873169, + "learning_rate": 4.9808087467544995e-05, + "loss": 5.5872, + "step": 6637 + }, + { + "epoch": 0.03947806641925968, + "grad_norm": 1.9728864431381226, + "learning_rate": 4.980802969750706e-05, + "loss": 5.3357, + "step": 6638 + }, + { + "epoch": 0.03948401370254068, + "grad_norm": 1.795480489730835, + "learning_rate": 4.98079719188089e-05, + "loss": 5.6414, + "step": 6639 + }, + { + "epoch": 0.03948996098582168, + "grad_norm": 1.7882109880447388, + "learning_rate": 4.980791413145054e-05, + "loss": 5.3499, + "step": 6640 + }, + { + "epoch": 0.039495908269102674, + "grad_norm": 1.8416422605514526, + "learning_rate": 4.9807856335431994e-05, + "loss": 5.3292, + "step": 6641 + }, + { + "epoch": 0.03950185555238367, + "grad_norm": 1.9525254964828491, + "learning_rate": 4.9807798530753266e-05, + "loss": 5.2782, + "step": 6642 + }, + { + "epoch": 0.03950780283566467, + "grad_norm": 1.5100830793380737, + "learning_rate": 4.9807740717414406e-05, + "loss": 5.2807, + "step": 6643 + }, + { + "epoch": 0.039513750118945666, + "grad_norm": 2.029430866241455, + "learning_rate": 4.9807682895415406e-05, + "loss": 5.4496, + "step": 6644 + }, + { + "epoch": 0.03951969740222666, + "grad_norm": 1.7976901531219482, + "learning_rate": 4.9807625064756315e-05, + "loss": 5.1021, + "step": 6645 + }, + { + "epoch": 0.03952564468550766, + "grad_norm": 1.5770336389541626, + "learning_rate": 4.980756722543714e-05, + "loss": 5.3946, + "step": 6646 + }, + { + "epoch": 0.03953159196878866, + "grad_norm": 1.8289496898651123, + "learning_rate": 4.980750937745788e-05, + "loss": 5.4821, + "step": 6647 + }, + { + "epoch": 0.03953753925206965, + "grad_norm": 1.7413506507873535, + "learning_rate": 4.980745152081859e-05, + "loss": 5.4827, + "step": 6648 + }, + { + "epoch": 0.03954348653535065, + "grad_norm": 2.048400402069092, + "learning_rate": 4.980739365551927e-05, + "loss": 5.2359, + "step": 6649 + }, + { + "epoch": 0.03954943381863165, + "grad_norm": 2.331897735595703, + "learning_rate": 4.980733578155995e-05, + "loss": 5.2988, + "step": 6650 + }, + { + "epoch": 0.039555381101912646, + "grad_norm": 2.1224608421325684, + "learning_rate": 4.980727789894065e-05, + "loss": 5.1228, + "step": 6651 + }, + { + "epoch": 0.03956132838519364, + "grad_norm": 1.5331578254699707, + "learning_rate": 4.9807220007661374e-05, + "loss": 5.184, + "step": 6652 + }, + { + "epoch": 0.03956727566847464, + "grad_norm": 1.773489236831665, + "learning_rate": 4.980716210772216e-05, + "loss": 5.1883, + "step": 6653 + }, + { + "epoch": 0.03957322295175564, + "grad_norm": 2.119302749633789, + "learning_rate": 4.9807104199123016e-05, + "loss": 5.5437, + "step": 6654 + }, + { + "epoch": 0.03957917023503663, + "grad_norm": 2.0695033073425293, + "learning_rate": 4.9807046281863974e-05, + "loss": 5.5951, + "step": 6655 + }, + { + "epoch": 0.039585117518317635, + "grad_norm": 2.0522243976593018, + "learning_rate": 4.980698835594505e-05, + "loss": 5.2736, + "step": 6656 + }, + { + "epoch": 0.03959106480159863, + "grad_norm": 2.3200113773345947, + "learning_rate": 4.980693042136626e-05, + "loss": 5.5701, + "step": 6657 + }, + { + "epoch": 0.039597012084879625, + "grad_norm": 1.8731193542480469, + "learning_rate": 4.980687247812762e-05, + "loss": 5.3929, + "step": 6658 + }, + { + "epoch": 0.03960295936816063, + "grad_norm": 1.8390223979949951, + "learning_rate": 4.980681452622916e-05, + "loss": 5.1684, + "step": 6659 + }, + { + "epoch": 0.03960890665144162, + "grad_norm": 2.24766206741333, + "learning_rate": 4.980675656567091e-05, + "loss": 5.0232, + "step": 6660 + }, + { + "epoch": 0.03961485393472262, + "grad_norm": 2.2592451572418213, + "learning_rate": 4.980669859645286e-05, + "loss": 4.9878, + "step": 6661 + }, + { + "epoch": 0.03962080121800362, + "grad_norm": 2.14709734916687, + "learning_rate": 4.9806640618575064e-05, + "loss": 5.1036, + "step": 6662 + }, + { + "epoch": 0.039626748501284614, + "grad_norm": 2.133910655975342, + "learning_rate": 4.9806582632037516e-05, + "loss": 5.0356, + "step": 6663 + }, + { + "epoch": 0.03963269578456561, + "grad_norm": 2.2513222694396973, + "learning_rate": 4.980652463684025e-05, + "loss": 5.2357, + "step": 6664 + }, + { + "epoch": 0.039638643067846605, + "grad_norm": 2.078355312347412, + "learning_rate": 4.980646663298328e-05, + "loss": 5.3857, + "step": 6665 + }, + { + "epoch": 0.03964459035112761, + "grad_norm": 2.3798105716705322, + "learning_rate": 4.980640862046663e-05, + "loss": 5.0888, + "step": 6666 + }, + { + "epoch": 0.0396505376344086, + "grad_norm": 2.241868019104004, + "learning_rate": 4.980635059929032e-05, + "loss": 5.1397, + "step": 6667 + }, + { + "epoch": 0.0396564849176896, + "grad_norm": 2.2053534984588623, + "learning_rate": 4.9806292569454365e-05, + "loss": 4.799, + "step": 6668 + }, + { + "epoch": 0.0396624322009706, + "grad_norm": 2.2996716499328613, + "learning_rate": 4.980623453095879e-05, + "loss": 4.9597, + "step": 6669 + }, + { + "epoch": 0.039668379484251594, + "grad_norm": 1.9892657995224, + "learning_rate": 4.9806176483803615e-05, + "loss": 5.0784, + "step": 6670 + }, + { + "epoch": 0.03967432676753259, + "grad_norm": 2.2087242603302, + "learning_rate": 4.980611842798887e-05, + "loss": 5.4099, + "step": 6671 + }, + { + "epoch": 0.03968027405081359, + "grad_norm": 2.215728521347046, + "learning_rate": 4.980606036351455e-05, + "loss": 5.2889, + "step": 6672 + }, + { + "epoch": 0.039686221334094586, + "grad_norm": 2.228073835372925, + "learning_rate": 4.9806002290380705e-05, + "loss": 5.3816, + "step": 6673 + }, + { + "epoch": 0.03969216861737558, + "grad_norm": 2.209808826446533, + "learning_rate": 4.980594420858733e-05, + "loss": 5.6233, + "step": 6674 + }, + { + "epoch": 0.03969811590065658, + "grad_norm": 1.8294177055358887, + "learning_rate": 4.980588611813446e-05, + "loss": 5.5756, + "step": 6675 + }, + { + "epoch": 0.03970406318393758, + "grad_norm": 2.236435890197754, + "learning_rate": 4.980582801902212e-05, + "loss": 5.4807, + "step": 6676 + }, + { + "epoch": 0.03971001046721857, + "grad_norm": 2.528804063796997, + "learning_rate": 4.980576991125031e-05, + "loss": 5.6503, + "step": 6677 + }, + { + "epoch": 0.03971595775049957, + "grad_norm": 2.312063217163086, + "learning_rate": 4.9805711794819065e-05, + "loss": 5.5517, + "step": 6678 + }, + { + "epoch": 0.03972190503378057, + "grad_norm": 2.336134672164917, + "learning_rate": 4.98056536697284e-05, + "loss": 5.5708, + "step": 6679 + }, + { + "epoch": 0.039727852317061566, + "grad_norm": 2.2809929847717285, + "learning_rate": 4.980559553597834e-05, + "loss": 5.453, + "step": 6680 + }, + { + "epoch": 0.03973379960034256, + "grad_norm": 2.0603368282318115, + "learning_rate": 4.98055373935689e-05, + "loss": 5.3482, + "step": 6681 + }, + { + "epoch": 0.03973974688362356, + "grad_norm": 1.9654933214187622, + "learning_rate": 4.980547924250011e-05, + "loss": 5.29, + "step": 6682 + }, + { + "epoch": 0.03974569416690456, + "grad_norm": 2.4211983680725098, + "learning_rate": 4.9805421082771985e-05, + "loss": 5.4261, + "step": 6683 + }, + { + "epoch": 0.03975164145018555, + "grad_norm": 2.129987955093384, + "learning_rate": 4.9805362914384533e-05, + "loss": 5.3551, + "step": 6684 + }, + { + "epoch": 0.039757588733466555, + "grad_norm": 2.127936601638794, + "learning_rate": 4.9805304737337796e-05, + "loss": 5.4647, + "step": 6685 + }, + { + "epoch": 0.03976353601674755, + "grad_norm": 2.303382158279419, + "learning_rate": 4.980524655163178e-05, + "loss": 5.1699, + "step": 6686 + }, + { + "epoch": 0.039769483300028545, + "grad_norm": 2.6889941692352295, + "learning_rate": 4.98051883572665e-05, + "loss": 5.2031, + "step": 6687 + }, + { + "epoch": 0.03977543058330955, + "grad_norm": 3.321950674057007, + "learning_rate": 4.9805130154242e-05, + "loss": 4.9815, + "step": 6688 + }, + { + "epoch": 0.03978137786659054, + "grad_norm": 3.1951568126678467, + "learning_rate": 4.980507194255827e-05, + "loss": 4.8946, + "step": 6689 + }, + { + "epoch": 0.03978732514987154, + "grad_norm": 2.355271816253662, + "learning_rate": 4.9805013722215355e-05, + "loss": 5.9223, + "step": 6690 + }, + { + "epoch": 0.03979327243315254, + "grad_norm": 2.3401644229888916, + "learning_rate": 4.9804955493213264e-05, + "loss": 6.1826, + "step": 6691 + }, + { + "epoch": 0.039799219716433534, + "grad_norm": 2.191997766494751, + "learning_rate": 4.980489725555202e-05, + "loss": 5.5617, + "step": 6692 + }, + { + "epoch": 0.03980516699971453, + "grad_norm": 2.377803087234497, + "learning_rate": 4.9804839009231644e-05, + "loss": 5.684, + "step": 6693 + }, + { + "epoch": 0.039811114282995524, + "grad_norm": 1.9084972143173218, + "learning_rate": 4.980478075425215e-05, + "loss": 6.0291, + "step": 6694 + }, + { + "epoch": 0.039817061566276526, + "grad_norm": 2.185628890991211, + "learning_rate": 4.9804722490613566e-05, + "loss": 5.5808, + "step": 6695 + }, + { + "epoch": 0.03982300884955752, + "grad_norm": 2.3253934383392334, + "learning_rate": 4.980466421831591e-05, + "loss": 5.7076, + "step": 6696 + }, + { + "epoch": 0.03982895613283852, + "grad_norm": 2.1599392890930176, + "learning_rate": 4.98046059373592e-05, + "loss": 5.9607, + "step": 6697 + }, + { + "epoch": 0.03983490341611952, + "grad_norm": 2.093137741088867, + "learning_rate": 4.980454764774346e-05, + "loss": 6.0014, + "step": 6698 + }, + { + "epoch": 0.039840850699400514, + "grad_norm": 2.4242093563079834, + "learning_rate": 4.980448934946871e-05, + "loss": 5.6255, + "step": 6699 + }, + { + "epoch": 0.03984679798268151, + "grad_norm": 2.523277521133423, + "learning_rate": 4.980443104253497e-05, + "loss": 5.5302, + "step": 6700 + }, + { + "epoch": 0.03985274526596251, + "grad_norm": 1.7926498651504517, + "learning_rate": 4.980437272694225e-05, + "loss": 5.6467, + "step": 6701 + }, + { + "epoch": 0.039858692549243506, + "grad_norm": 1.7630435228347778, + "learning_rate": 4.980431440269059e-05, + "loss": 5.9615, + "step": 6702 + }, + { + "epoch": 0.0398646398325245, + "grad_norm": 1.8051058053970337, + "learning_rate": 4.980425606978e-05, + "loss": 6.13, + "step": 6703 + }, + { + "epoch": 0.0398705871158055, + "grad_norm": 2.104901075363159, + "learning_rate": 4.98041977282105e-05, + "loss": 6.142, + "step": 6704 + }, + { + "epoch": 0.0398765343990865, + "grad_norm": 1.7022942304611206, + "learning_rate": 4.98041393779821e-05, + "loss": 5.6764, + "step": 6705 + }, + { + "epoch": 0.03988248168236749, + "grad_norm": 2.140230178833008, + "learning_rate": 4.980408101909485e-05, + "loss": 5.9796, + "step": 6706 + }, + { + "epoch": 0.03988842896564849, + "grad_norm": 1.9564754962921143, + "learning_rate": 4.9804022651548734e-05, + "loss": 6.005, + "step": 6707 + }, + { + "epoch": 0.03989437624892949, + "grad_norm": 1.9460588693618774, + "learning_rate": 4.9803964275343795e-05, + "loss": 5.9784, + "step": 6708 + }, + { + "epoch": 0.039900323532210485, + "grad_norm": 1.7314271926879883, + "learning_rate": 4.980390589048005e-05, + "loss": 5.7766, + "step": 6709 + }, + { + "epoch": 0.03990627081549148, + "grad_norm": 2.0168917179107666, + "learning_rate": 4.9803847496957524e-05, + "loss": 5.7386, + "step": 6710 + }, + { + "epoch": 0.03991221809877248, + "grad_norm": 2.3194711208343506, + "learning_rate": 4.980378909477622e-05, + "loss": 6.1324, + "step": 6711 + }, + { + "epoch": 0.03991816538205348, + "grad_norm": 2.3532958030700684, + "learning_rate": 4.980373068393618e-05, + "loss": 6.027, + "step": 6712 + }, + { + "epoch": 0.03992411266533447, + "grad_norm": 2.5944385528564453, + "learning_rate": 4.980367226443741e-05, + "loss": 6.2892, + "step": 6713 + }, + { + "epoch": 0.039930059948615475, + "grad_norm": 1.5707015991210938, + "learning_rate": 4.9803613836279926e-05, + "loss": 5.6525, + "step": 6714 + }, + { + "epoch": 0.03993600723189647, + "grad_norm": 2.022613286972046, + "learning_rate": 4.980355539946376e-05, + "loss": 5.8943, + "step": 6715 + }, + { + "epoch": 0.039941954515177465, + "grad_norm": 1.7783907651901245, + "learning_rate": 4.980349695398894e-05, + "loss": 5.6451, + "step": 6716 + }, + { + "epoch": 0.03994790179845847, + "grad_norm": 2.098841428756714, + "learning_rate": 4.980343849985547e-05, + "loss": 6.1143, + "step": 6717 + }, + { + "epoch": 0.03995384908173946, + "grad_norm": 2.045955181121826, + "learning_rate": 4.9803380037063374e-05, + "loss": 6.1802, + "step": 6718 + }, + { + "epoch": 0.03995979636502046, + "grad_norm": 1.7324507236480713, + "learning_rate": 4.980332156561267e-05, + "loss": 6.081, + "step": 6719 + }, + { + "epoch": 0.03996574364830146, + "grad_norm": 1.795184850692749, + "learning_rate": 4.9803263085503385e-05, + "loss": 5.6075, + "step": 6720 + }, + { + "epoch": 0.039971690931582454, + "grad_norm": 2.1466586589813232, + "learning_rate": 4.980320459673554e-05, + "loss": 6.045, + "step": 6721 + }, + { + "epoch": 0.03997763821486345, + "grad_norm": 2.1261258125305176, + "learning_rate": 4.980314609930915e-05, + "loss": 6.0589, + "step": 6722 + }, + { + "epoch": 0.039983585498144444, + "grad_norm": 2.559584617614746, + "learning_rate": 4.980308759322424e-05, + "loss": 6.3894, + "step": 6723 + }, + { + "epoch": 0.039989532781425446, + "grad_norm": 2.4580929279327393, + "learning_rate": 4.980302907848083e-05, + "loss": 6.3979, + "step": 6724 + }, + { + "epoch": 0.03999548006470644, + "grad_norm": 1.8877859115600586, + "learning_rate": 4.9802970555078934e-05, + "loss": 5.5076, + "step": 6725 + }, + { + "epoch": 0.04000142734798744, + "grad_norm": 2.145123243331909, + "learning_rate": 4.9802912023018585e-05, + "loss": 6.1913, + "step": 6726 + }, + { + "epoch": 0.04000737463126844, + "grad_norm": 1.9321368932724, + "learning_rate": 4.980285348229979e-05, + "loss": 5.9614, + "step": 6727 + }, + { + "epoch": 0.040013321914549434, + "grad_norm": 1.883589506149292, + "learning_rate": 4.9802794932922577e-05, + "loss": 5.4293, + "step": 6728 + }, + { + "epoch": 0.04001926919783043, + "grad_norm": 1.9066367149353027, + "learning_rate": 4.980273637488696e-05, + "loss": 5.4299, + "step": 6729 + }, + { + "epoch": 0.04002521648111143, + "grad_norm": 1.845290184020996, + "learning_rate": 4.9802677808192963e-05, + "loss": 5.596, + "step": 6730 + }, + { + "epoch": 0.040031163764392426, + "grad_norm": 2.3295016288757324, + "learning_rate": 4.980261923284062e-05, + "loss": 6.1266, + "step": 6731 + }, + { + "epoch": 0.04003711104767342, + "grad_norm": 2.451676368713379, + "learning_rate": 4.980256064882993e-05, + "loss": 6.0578, + "step": 6732 + }, + { + "epoch": 0.04004305833095442, + "grad_norm": 2.1317830085754395, + "learning_rate": 4.9802502056160915e-05, + "loss": 6.2627, + "step": 6733 + }, + { + "epoch": 0.04004900561423542, + "grad_norm": 2.223085641860962, + "learning_rate": 4.980244345483361e-05, + "loss": 5.5751, + "step": 6734 + }, + { + "epoch": 0.04005495289751641, + "grad_norm": 2.508385181427002, + "learning_rate": 4.9802384844848035e-05, + "loss": 5.572, + "step": 6735 + }, + { + "epoch": 0.04006090018079741, + "grad_norm": 2.5150837898254395, + "learning_rate": 4.98023262262042e-05, + "loss": 5.3443, + "step": 6736 + }, + { + "epoch": 0.04006684746407841, + "grad_norm": 2.293503761291504, + "learning_rate": 4.980226759890212e-05, + "loss": 5.37, + "step": 6737 + }, + { + "epoch": 0.040072794747359405, + "grad_norm": 1.8764920234680176, + "learning_rate": 4.9802208962941834e-05, + "loss": 5.3804, + "step": 6738 + }, + { + "epoch": 0.0400787420306404, + "grad_norm": 1.8443305492401123, + "learning_rate": 4.980215031832335e-05, + "loss": 5.7787, + "step": 6739 + }, + { + "epoch": 0.0400846893139214, + "grad_norm": 2.6707816123962402, + "learning_rate": 4.980209166504669e-05, + "loss": 6.2858, + "step": 6740 + }, + { + "epoch": 0.0400906365972024, + "grad_norm": 2.3520665168762207, + "learning_rate": 4.980203300311188e-05, + "loss": 5.8069, + "step": 6741 + }, + { + "epoch": 0.04009658388048339, + "grad_norm": 2.0564348697662354, + "learning_rate": 4.980197433251893e-05, + "loss": 6.1698, + "step": 6742 + }, + { + "epoch": 0.040102531163764395, + "grad_norm": 2.205469846725464, + "learning_rate": 4.9801915653267875e-05, + "loss": 5.8401, + "step": 6743 + }, + { + "epoch": 0.04010847844704539, + "grad_norm": 2.042363405227661, + "learning_rate": 4.980185696535873e-05, + "loss": 5.9673, + "step": 6744 + }, + { + "epoch": 0.040114425730326385, + "grad_norm": 1.7575644254684448, + "learning_rate": 4.98017982687915e-05, + "loss": 5.7852, + "step": 6745 + }, + { + "epoch": 0.04012037301360739, + "grad_norm": 1.968548059463501, + "learning_rate": 4.980173956356623e-05, + "loss": 6.2085, + "step": 6746 + }, + { + "epoch": 0.04012632029688838, + "grad_norm": 2.0365097522735596, + "learning_rate": 4.980168084968292e-05, + "loss": 6.4235, + "step": 6747 + }, + { + "epoch": 0.04013226758016938, + "grad_norm": 2.7265079021453857, + "learning_rate": 4.9801622127141605e-05, + "loss": 6.0804, + "step": 6748 + }, + { + "epoch": 0.04013821486345038, + "grad_norm": 2.1604299545288086, + "learning_rate": 4.98015633959423e-05, + "loss": 5.942, + "step": 6749 + }, + { + "epoch": 0.040144162146731374, + "grad_norm": 2.4122307300567627, + "learning_rate": 4.980150465608502e-05, + "loss": 6.2877, + "step": 6750 + }, + { + "epoch": 0.04015010943001237, + "grad_norm": 2.040780782699585, + "learning_rate": 4.98014459075698e-05, + "loss": 5.645, + "step": 6751 + }, + { + "epoch": 0.040156056713293364, + "grad_norm": 2.3660147190093994, + "learning_rate": 4.980138715039665e-05, + "loss": 5.975, + "step": 6752 + }, + { + "epoch": 0.040162003996574366, + "grad_norm": 2.2332143783569336, + "learning_rate": 4.980132838456558e-05, + "loss": 6.1383, + "step": 6753 + }, + { + "epoch": 0.04016795127985536, + "grad_norm": 2.7028262615203857, + "learning_rate": 4.9801269610076635e-05, + "loss": 6.3817, + "step": 6754 + }, + { + "epoch": 0.04017389856313636, + "grad_norm": 2.4653360843658447, + "learning_rate": 4.980121082692982e-05, + "loss": 6.3079, + "step": 6755 + }, + { + "epoch": 0.04017984584641736, + "grad_norm": 2.1470963954925537, + "learning_rate": 4.980115203512515e-05, + "loss": 6.063, + "step": 6756 + }, + { + "epoch": 0.040185793129698354, + "grad_norm": 2.3440990447998047, + "learning_rate": 4.9801093234662666e-05, + "loss": 5.818, + "step": 6757 + }, + { + "epoch": 0.04019174041297935, + "grad_norm": 2.120245933532715, + "learning_rate": 4.980103442554237e-05, + "loss": 5.5867, + "step": 6758 + }, + { + "epoch": 0.04019768769626035, + "grad_norm": 3.196829080581665, + "learning_rate": 4.980097560776429e-05, + "loss": 6.0369, + "step": 6759 + }, + { + "epoch": 0.040203634979541346, + "grad_norm": 2.247997522354126, + "learning_rate": 4.9800916781328456e-05, + "loss": 5.8383, + "step": 6760 + }, + { + "epoch": 0.04020958226282234, + "grad_norm": 2.26254940032959, + "learning_rate": 4.9800857946234866e-05, + "loss": 5.8477, + "step": 6761 + }, + { + "epoch": 0.04021552954610334, + "grad_norm": 2.200495958328247, + "learning_rate": 4.9800799102483556e-05, + "loss": 5.681, + "step": 6762 + }, + { + "epoch": 0.04022147682938434, + "grad_norm": 2.136009454727173, + "learning_rate": 4.980074025007454e-05, + "loss": 5.6453, + "step": 6763 + }, + { + "epoch": 0.04022742411266533, + "grad_norm": 2.3510351181030273, + "learning_rate": 4.980068138900785e-05, + "loss": 5.5735, + "step": 6764 + }, + { + "epoch": 0.040233371395946335, + "grad_norm": 2.249199628829956, + "learning_rate": 4.980062251928349e-05, + "loss": 5.9883, + "step": 6765 + }, + { + "epoch": 0.04023931867922733, + "grad_norm": 2.426816463470459, + "learning_rate": 4.9800563640901494e-05, + "loss": 6.1658, + "step": 6766 + }, + { + "epoch": 0.040245265962508325, + "grad_norm": 2.1044836044311523, + "learning_rate": 4.9800504753861874e-05, + "loss": 5.8627, + "step": 6767 + }, + { + "epoch": 0.04025121324578932, + "grad_norm": 1.9563783407211304, + "learning_rate": 4.9800445858164656e-05, + "loss": 5.9642, + "step": 6768 + }, + { + "epoch": 0.04025716052907032, + "grad_norm": 2.3810997009277344, + "learning_rate": 4.980038695380986e-05, + "loss": 5.2938, + "step": 6769 + }, + { + "epoch": 0.04026310781235132, + "grad_norm": 2.3180932998657227, + "learning_rate": 4.98003280407975e-05, + "loss": 5.7682, + "step": 6770 + }, + { + "epoch": 0.04026905509563231, + "grad_norm": 2.420954704284668, + "learning_rate": 4.980026911912761e-05, + "loss": 5.5724, + "step": 6771 + }, + { + "epoch": 0.040275002378913315, + "grad_norm": 2.447460651397705, + "learning_rate": 4.9800210188800193e-05, + "loss": 5.4844, + "step": 6772 + }, + { + "epoch": 0.04028094966219431, + "grad_norm": 2.4059863090515137, + "learning_rate": 4.980015124981529e-05, + "loss": 5.604, + "step": 6773 + }, + { + "epoch": 0.040286896945475305, + "grad_norm": 2.251492977142334, + "learning_rate": 4.9800092302172894e-05, + "loss": 5.4565, + "step": 6774 + }, + { + "epoch": 0.04029284422875631, + "grad_norm": 2.478682279586792, + "learning_rate": 4.980003334587305e-05, + "loss": 5.9416, + "step": 6775 + }, + { + "epoch": 0.0402987915120373, + "grad_norm": 2.2685835361480713, + "learning_rate": 4.9799974380915785e-05, + "loss": 5.9659, + "step": 6776 + }, + { + "epoch": 0.0403047387953183, + "grad_norm": 2.833101987838745, + "learning_rate": 4.979991540730108e-05, + "loss": 5.3406, + "step": 6777 + }, + { + "epoch": 0.0403106860785993, + "grad_norm": 3.0967416763305664, + "learning_rate": 4.9799856425029e-05, + "loss": 5.5848, + "step": 6778 + }, + { + "epoch": 0.040316633361880294, + "grad_norm": 2.3081796169281006, + "learning_rate": 4.9799797434099536e-05, + "loss": 5.5964, + "step": 6779 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 2.359531879425049, + "learning_rate": 4.9799738434512724e-05, + "loss": 5.6614, + "step": 6780 + }, + { + "epoch": 0.040328527928442284, + "grad_norm": 2.1566221714019775, + "learning_rate": 4.979967942626858e-05, + "loss": 6.0517, + "step": 6781 + }, + { + "epoch": 0.040334475211723286, + "grad_norm": 2.3964991569519043, + "learning_rate": 4.979962040936712e-05, + "loss": 5.9516, + "step": 6782 + }, + { + "epoch": 0.04034042249500428, + "grad_norm": 1.9913266897201538, + "learning_rate": 4.9799561383808365e-05, + "loss": 5.9144, + "step": 6783 + }, + { + "epoch": 0.040346369778285276, + "grad_norm": 1.7329169511795044, + "learning_rate": 4.979950234959235e-05, + "loss": 6.0393, + "step": 6784 + }, + { + "epoch": 0.04035231706156628, + "grad_norm": 1.8278034925460815, + "learning_rate": 4.979944330671908e-05, + "loss": 5.9318, + "step": 6785 + }, + { + "epoch": 0.040358264344847274, + "grad_norm": 2.089806318283081, + "learning_rate": 4.979938425518858e-05, + "loss": 5.5726, + "step": 6786 + }, + { + "epoch": 0.04036421162812827, + "grad_norm": 2.03664231300354, + "learning_rate": 4.9799325195000874e-05, + "loss": 5.8265, + "step": 6787 + }, + { + "epoch": 0.04037015891140927, + "grad_norm": 1.8801567554473877, + "learning_rate": 4.979926612615597e-05, + "loss": 5.7575, + "step": 6788 + }, + { + "epoch": 0.040376106194690266, + "grad_norm": 1.814959168434143, + "learning_rate": 4.979920704865391e-05, + "loss": 5.8737, + "step": 6789 + }, + { + "epoch": 0.04038205347797126, + "grad_norm": 1.7018035650253296, + "learning_rate": 4.97991479624947e-05, + "loss": 5.6768, + "step": 6790 + }, + { + "epoch": 0.04038800076125226, + "grad_norm": 2.21545147895813, + "learning_rate": 4.979908886767837e-05, + "loss": 5.4206, + "step": 6791 + }, + { + "epoch": 0.04039394804453326, + "grad_norm": 2.6184499263763428, + "learning_rate": 4.979902976420492e-05, + "loss": 5.0255, + "step": 6792 + }, + { + "epoch": 0.04039989532781425, + "grad_norm": 2.3914453983306885, + "learning_rate": 4.9798970652074396e-05, + "loss": 4.884, + "step": 6793 + }, + { + "epoch": 0.040405842611095255, + "grad_norm": 2.4367334842681885, + "learning_rate": 4.97989115312868e-05, + "loss": 4.7445, + "step": 6794 + }, + { + "epoch": 0.04041178989437625, + "grad_norm": 2.794490337371826, + "learning_rate": 4.9798852401842165e-05, + "loss": 4.9686, + "step": 6795 + }, + { + "epoch": 0.040417737177657245, + "grad_norm": 2.665395736694336, + "learning_rate": 4.979879326374051e-05, + "loss": 4.854, + "step": 6796 + }, + { + "epoch": 0.04042368446093824, + "grad_norm": 2.0832581520080566, + "learning_rate": 4.979873411698184e-05, + "loss": 5.0371, + "step": 6797 + }, + { + "epoch": 0.04042963174421924, + "grad_norm": 2.4604554176330566, + "learning_rate": 4.979867496156619e-05, + "loss": 4.7524, + "step": 6798 + }, + { + "epoch": 0.04043557902750024, + "grad_norm": 2.3760480880737305, + "learning_rate": 4.979861579749359e-05, + "loss": 4.7645, + "step": 6799 + }, + { + "epoch": 0.04044152631078123, + "grad_norm": 2.468043088912964, + "learning_rate": 4.979855662476405e-05, + "loss": 4.7791, + "step": 6800 + }, + { + "epoch": 0.040447473594062235, + "grad_norm": 2.516026258468628, + "learning_rate": 4.979849744337758e-05, + "loss": 4.7978, + "step": 6801 + }, + { + "epoch": 0.04045342087734323, + "grad_norm": 2.1882307529449463, + "learning_rate": 4.979843825333421e-05, + "loss": 5.002, + "step": 6802 + }, + { + "epoch": 0.040459368160624225, + "grad_norm": 2.423140525817871, + "learning_rate": 4.979837905463397e-05, + "loss": 5.0161, + "step": 6803 + }, + { + "epoch": 0.04046531544390523, + "grad_norm": 2.485739231109619, + "learning_rate": 4.979831984727687e-05, + "loss": 4.7613, + "step": 6804 + }, + { + "epoch": 0.04047126272718622, + "grad_norm": 2.267744302749634, + "learning_rate": 4.979826063126293e-05, + "loss": 4.7496, + "step": 6805 + }, + { + "epoch": 0.04047721001046722, + "grad_norm": 2.3172249794006348, + "learning_rate": 4.9798201406592176e-05, + "loss": 4.8153, + "step": 6806 + }, + { + "epoch": 0.04048315729374822, + "grad_norm": 2.309471607208252, + "learning_rate": 4.979814217326463e-05, + "loss": 4.9874, + "step": 6807 + }, + { + "epoch": 0.040489104577029214, + "grad_norm": 1.989372968673706, + "learning_rate": 4.97980829312803e-05, + "loss": 5.1254, + "step": 6808 + }, + { + "epoch": 0.04049505186031021, + "grad_norm": 2.4409830570220947, + "learning_rate": 4.9798023680639216e-05, + "loss": 4.6476, + "step": 6809 + }, + { + "epoch": 0.040500999143591204, + "grad_norm": 2.5192453861236572, + "learning_rate": 4.97979644213414e-05, + "loss": 4.6933, + "step": 6810 + }, + { + "epoch": 0.040506946426872206, + "grad_norm": 2.294718027114868, + "learning_rate": 4.979790515338688e-05, + "loss": 4.8266, + "step": 6811 + }, + { + "epoch": 0.0405128937101532, + "grad_norm": 2.294550657272339, + "learning_rate": 4.979784587677565e-05, + "loss": 4.6691, + "step": 6812 + }, + { + "epoch": 0.040518840993434196, + "grad_norm": 2.332326889038086, + "learning_rate": 4.979778659150776e-05, + "loss": 4.8366, + "step": 6813 + }, + { + "epoch": 0.0405247882767152, + "grad_norm": 2.325439929962158, + "learning_rate": 4.979772729758322e-05, + "loss": 4.8149, + "step": 6814 + }, + { + "epoch": 0.040530735559996194, + "grad_norm": 2.165926456451416, + "learning_rate": 4.979766799500204e-05, + "loss": 4.7309, + "step": 6815 + }, + { + "epoch": 0.04053668284327719, + "grad_norm": 2.3184943199157715, + "learning_rate": 4.9797608683764264e-05, + "loss": 4.7163, + "step": 6816 + }, + { + "epoch": 0.04054263012655819, + "grad_norm": 2.2161147594451904, + "learning_rate": 4.979754936386989e-05, + "loss": 4.5549, + "step": 6817 + }, + { + "epoch": 0.040548577409839186, + "grad_norm": 2.415496587753296, + "learning_rate": 4.979749003531895e-05, + "loss": 4.7676, + "step": 6818 + }, + { + "epoch": 0.04055452469312018, + "grad_norm": 2.1700618267059326, + "learning_rate": 4.979743069811146e-05, + "loss": 4.8448, + "step": 6819 + }, + { + "epoch": 0.04056047197640118, + "grad_norm": 2.4978747367858887, + "learning_rate": 4.9797371352247446e-05, + "loss": 6.363, + "step": 6820 + }, + { + "epoch": 0.04056641925968218, + "grad_norm": 1.9293922185897827, + "learning_rate": 4.979731199772693e-05, + "loss": 5.6502, + "step": 6821 + }, + { + "epoch": 0.04057236654296317, + "grad_norm": 2.5583136081695557, + "learning_rate": 4.9797252634549915e-05, + "loss": 4.874, + "step": 6822 + }, + { + "epoch": 0.040578313826244175, + "grad_norm": 2.263460159301758, + "learning_rate": 4.979719326271645e-05, + "loss": 5.8457, + "step": 6823 + }, + { + "epoch": 0.04058426110952517, + "grad_norm": 2.5630266666412354, + "learning_rate": 4.979713388222653e-05, + "loss": 4.8668, + "step": 6824 + }, + { + "epoch": 0.040590208392806165, + "grad_norm": 2.2965216636657715, + "learning_rate": 4.9797074493080186e-05, + "loss": 5.0049, + "step": 6825 + }, + { + "epoch": 0.04059615567608716, + "grad_norm": 2.222405433654785, + "learning_rate": 4.979701509527745e-05, + "loss": 5.0204, + "step": 6826 + }, + { + "epoch": 0.04060210295936816, + "grad_norm": 2.4425504207611084, + "learning_rate": 4.979695568881833e-05, + "loss": 5.687, + "step": 6827 + }, + { + "epoch": 0.04060805024264916, + "grad_norm": 2.329901933670044, + "learning_rate": 4.979689627370284e-05, + "loss": 5.9447, + "step": 6828 + }, + { + "epoch": 0.04061399752593015, + "grad_norm": 2.3041510581970215, + "learning_rate": 4.9796836849931015e-05, + "loss": 5.9277, + "step": 6829 + }, + { + "epoch": 0.040619944809211155, + "grad_norm": 2.3020026683807373, + "learning_rate": 4.979677741750287e-05, + "loss": 5.9675, + "step": 6830 + }, + { + "epoch": 0.04062589209249215, + "grad_norm": 2.1861371994018555, + "learning_rate": 4.9796717976418426e-05, + "loss": 6.1312, + "step": 6831 + }, + { + "epoch": 0.040631839375773145, + "grad_norm": 1.9544565677642822, + "learning_rate": 4.979665852667771e-05, + "loss": 5.9218, + "step": 6832 + }, + { + "epoch": 0.04063778665905415, + "grad_norm": 2.346431016921997, + "learning_rate": 4.979659906828073e-05, + "loss": 6.1668, + "step": 6833 + }, + { + "epoch": 0.04064373394233514, + "grad_norm": 2.0405263900756836, + "learning_rate": 4.979653960122751e-05, + "loss": 6.0501, + "step": 6834 + }, + { + "epoch": 0.04064968122561614, + "grad_norm": 1.7645004987716675, + "learning_rate": 4.979648012551809e-05, + "loss": 6.0299, + "step": 6835 + }, + { + "epoch": 0.04065562850889714, + "grad_norm": 2.284703016281128, + "learning_rate": 4.979642064115246e-05, + "loss": 5.5501, + "step": 6836 + }, + { + "epoch": 0.040661575792178134, + "grad_norm": 1.7246543169021606, + "learning_rate": 4.979636114813066e-05, + "loss": 5.5733, + "step": 6837 + }, + { + "epoch": 0.04066752307545913, + "grad_norm": 2.0958921909332275, + "learning_rate": 4.9796301646452705e-05, + "loss": 5.8998, + "step": 6838 + }, + { + "epoch": 0.040673470358740124, + "grad_norm": 2.2123169898986816, + "learning_rate": 4.979624213611862e-05, + "loss": 6.0322, + "step": 6839 + }, + { + "epoch": 0.040679417642021126, + "grad_norm": 1.9541656970977783, + "learning_rate": 4.9796182617128426e-05, + "loss": 5.9255, + "step": 6840 + }, + { + "epoch": 0.04068536492530212, + "grad_norm": 2.077601909637451, + "learning_rate": 4.979612308948213e-05, + "loss": 5.6975, + "step": 6841 + }, + { + "epoch": 0.040691312208583116, + "grad_norm": 2.0595803260803223, + "learning_rate": 4.979606355317977e-05, + "loss": 6.0696, + "step": 6842 + }, + { + "epoch": 0.04069725949186412, + "grad_norm": 1.9800641536712646, + "learning_rate": 4.979600400822136e-05, + "loss": 5.7357, + "step": 6843 + }, + { + "epoch": 0.040703206775145113, + "grad_norm": 2.26238751411438, + "learning_rate": 4.979594445460692e-05, + "loss": 5.9119, + "step": 6844 + }, + { + "epoch": 0.04070915405842611, + "grad_norm": 2.0941457748413086, + "learning_rate": 4.979588489233648e-05, + "loss": 5.945, + "step": 6845 + }, + { + "epoch": 0.04071510134170711, + "grad_norm": 2.1995291709899902, + "learning_rate": 4.979582532141005e-05, + "loss": 5.8406, + "step": 6846 + }, + { + "epoch": 0.040721048624988106, + "grad_norm": 2.0138349533081055, + "learning_rate": 4.9795765741827646e-05, + "loss": 5.7984, + "step": 6847 + }, + { + "epoch": 0.0407269959082691, + "grad_norm": 1.9314415454864502, + "learning_rate": 4.9795706153589304e-05, + "loss": 5.8686, + "step": 6848 + }, + { + "epoch": 0.0407329431915501, + "grad_norm": 2.1324212551116943, + "learning_rate": 4.979564655669503e-05, + "loss": 5.8477, + "step": 6849 + }, + { + "epoch": 0.0407388904748311, + "grad_norm": 1.9601761102676392, + "learning_rate": 4.979558695114486e-05, + "loss": 5.9078, + "step": 6850 + }, + { + "epoch": 0.04074483775811209, + "grad_norm": 2.004333734512329, + "learning_rate": 4.97955273369388e-05, + "loss": 5.9852, + "step": 6851 + }, + { + "epoch": 0.040750785041393095, + "grad_norm": 1.9015164375305176, + "learning_rate": 4.979546771407688e-05, + "loss": 5.6286, + "step": 6852 + }, + { + "epoch": 0.04075673232467409, + "grad_norm": 1.9674208164215088, + "learning_rate": 4.979540808255911e-05, + "loss": 5.8715, + "step": 6853 + }, + { + "epoch": 0.040762679607955085, + "grad_norm": 2.0473713874816895, + "learning_rate": 4.9795348442385534e-05, + "loss": 5.7488, + "step": 6854 + }, + { + "epoch": 0.04076862689123608, + "grad_norm": 1.9536950588226318, + "learning_rate": 4.979528879355615e-05, + "loss": 5.6755, + "step": 6855 + }, + { + "epoch": 0.04077457417451708, + "grad_norm": 2.189659595489502, + "learning_rate": 4.979522913607099e-05, + "loss": 5.7934, + "step": 6856 + }, + { + "epoch": 0.04078052145779808, + "grad_norm": 1.999742031097412, + "learning_rate": 4.9795169469930067e-05, + "loss": 5.7341, + "step": 6857 + }, + { + "epoch": 0.04078646874107907, + "grad_norm": 2.1212494373321533, + "learning_rate": 4.9795109795133414e-05, + "loss": 5.8465, + "step": 6858 + }, + { + "epoch": 0.040792416024360074, + "grad_norm": 1.966467261314392, + "learning_rate": 4.979505011168104e-05, + "loss": 5.8699, + "step": 6859 + }, + { + "epoch": 0.04079836330764107, + "grad_norm": 2.290205955505371, + "learning_rate": 4.979499041957297e-05, + "loss": 6.387, + "step": 6860 + }, + { + "epoch": 0.040804310590922065, + "grad_norm": 2.41827130317688, + "learning_rate": 4.979493071880923e-05, + "loss": 6.893, + "step": 6861 + }, + { + "epoch": 0.04081025787420307, + "grad_norm": 2.0652520656585693, + "learning_rate": 4.979487100938983e-05, + "loss": 6.6435, + "step": 6862 + }, + { + "epoch": 0.04081620515748406, + "grad_norm": 1.8594858646392822, + "learning_rate": 4.979481129131479e-05, + "loss": 5.7441, + "step": 6863 + }, + { + "epoch": 0.04082215244076506, + "grad_norm": 2.269240617752075, + "learning_rate": 4.979475156458415e-05, + "loss": 5.8468, + "step": 6864 + }, + { + "epoch": 0.04082809972404606, + "grad_norm": 2.2355518341064453, + "learning_rate": 4.979469182919792e-05, + "loss": 5.8717, + "step": 6865 + }, + { + "epoch": 0.040834047007327054, + "grad_norm": 1.9578050374984741, + "learning_rate": 4.9794632085156105e-05, + "loss": 5.6777, + "step": 6866 + }, + { + "epoch": 0.04083999429060805, + "grad_norm": 2.354609727859497, + "learning_rate": 4.979457233245875e-05, + "loss": 5.7993, + "step": 6867 + }, + { + "epoch": 0.040845941573889044, + "grad_norm": 1.978289008140564, + "learning_rate": 4.9794512571105865e-05, + "loss": 5.7429, + "step": 6868 + }, + { + "epoch": 0.040851888857170046, + "grad_norm": 1.9695252180099487, + "learning_rate": 4.979445280109747e-05, + "loss": 6.1322, + "step": 6869 + }, + { + "epoch": 0.04085783614045104, + "grad_norm": 2.172510862350464, + "learning_rate": 4.9794393022433586e-05, + "loss": 5.9443, + "step": 6870 + }, + { + "epoch": 0.040863783423732036, + "grad_norm": 2.1992416381835938, + "learning_rate": 4.9794333235114244e-05, + "loss": 6.4094, + "step": 6871 + }, + { + "epoch": 0.04086973070701304, + "grad_norm": 2.1804773807525635, + "learning_rate": 4.979427343913945e-05, + "loss": 6.3871, + "step": 6872 + }, + { + "epoch": 0.04087567799029403, + "grad_norm": 2.2877554893493652, + "learning_rate": 4.979421363450923e-05, + "loss": 6.2509, + "step": 6873 + }, + { + "epoch": 0.04088162527357503, + "grad_norm": 2.0697927474975586, + "learning_rate": 4.979415382122361e-05, + "loss": 5.9008, + "step": 6874 + }, + { + "epoch": 0.04088757255685603, + "grad_norm": 2.2907917499542236, + "learning_rate": 4.97940939992826e-05, + "loss": 5.6137, + "step": 6875 + }, + { + "epoch": 0.040893519840137026, + "grad_norm": 1.9960983991622925, + "learning_rate": 4.979403416868623e-05, + "loss": 5.7283, + "step": 6876 + }, + { + "epoch": 0.04089946712341802, + "grad_norm": 2.2767558097839355, + "learning_rate": 4.9793974329434525e-05, + "loss": 5.3632, + "step": 6877 + }, + { + "epoch": 0.04090541440669902, + "grad_norm": 2.295635461807251, + "learning_rate": 4.97939144815275e-05, + "loss": 5.4524, + "step": 6878 + }, + { + "epoch": 0.04091136168998002, + "grad_norm": 2.247194766998291, + "learning_rate": 4.9793854624965166e-05, + "loss": 5.7846, + "step": 6879 + }, + { + "epoch": 0.04091730897326101, + "grad_norm": 2.2641420364379883, + "learning_rate": 4.9793794759747565e-05, + "loss": 5.7479, + "step": 6880 + }, + { + "epoch": 0.040923256256542015, + "grad_norm": 2.002126455307007, + "learning_rate": 4.97937348858747e-05, + "loss": 5.2694, + "step": 6881 + }, + { + "epoch": 0.04092920353982301, + "grad_norm": 2.079157590866089, + "learning_rate": 4.9793675003346596e-05, + "loss": 6.2711, + "step": 6882 + }, + { + "epoch": 0.040935150823104005, + "grad_norm": 1.9030524492263794, + "learning_rate": 4.979361511216328e-05, + "loss": 5.7259, + "step": 6883 + }, + { + "epoch": 0.040941098106385, + "grad_norm": 1.9157373905181885, + "learning_rate": 4.9793555212324774e-05, + "loss": 6.086, + "step": 6884 + }, + { + "epoch": 0.040947045389666, + "grad_norm": 1.8622015714645386, + "learning_rate": 4.979349530383108e-05, + "loss": 6.1318, + "step": 6885 + }, + { + "epoch": 0.040952992672947, + "grad_norm": 2.3341257572174072, + "learning_rate": 4.9793435386682256e-05, + "loss": 5.9421, + "step": 6886 + }, + { + "epoch": 0.04095893995622799, + "grad_norm": 2.6894209384918213, + "learning_rate": 4.979337546087828e-05, + "loss": 5.5351, + "step": 6887 + }, + { + "epoch": 0.040964887239508994, + "grad_norm": 2.5316739082336426, + "learning_rate": 4.979331552641919e-05, + "loss": 5.5056, + "step": 6888 + }, + { + "epoch": 0.04097083452278999, + "grad_norm": 2.5129077434539795, + "learning_rate": 4.979325558330502e-05, + "loss": 5.3091, + "step": 6889 + }, + { + "epoch": 0.040976781806070985, + "grad_norm": 2.275536298751831, + "learning_rate": 4.979319563153578e-05, + "loss": 5.494, + "step": 6890 + }, + { + "epoch": 0.04098272908935199, + "grad_norm": 2.749375104904175, + "learning_rate": 4.9793135671111494e-05, + "loss": 6.0139, + "step": 6891 + }, + { + "epoch": 0.04098867637263298, + "grad_norm": 2.419163227081299, + "learning_rate": 4.9793075702032177e-05, + "loss": 6.1102, + "step": 6892 + }, + { + "epoch": 0.04099462365591398, + "grad_norm": 2.311450958251953, + "learning_rate": 4.9793015724297856e-05, + "loss": 5.9798, + "step": 6893 + }, + { + "epoch": 0.04100057093919498, + "grad_norm": 2.0522212982177734, + "learning_rate": 4.979295573790854e-05, + "loss": 5.9247, + "step": 6894 + }, + { + "epoch": 0.041006518222475974, + "grad_norm": 2.1928513050079346, + "learning_rate": 4.979289574286427e-05, + "loss": 5.8001, + "step": 6895 + }, + { + "epoch": 0.04101246550575697, + "grad_norm": 2.1945207118988037, + "learning_rate": 4.979283573916505e-05, + "loss": 5.9975, + "step": 6896 + }, + { + "epoch": 0.041018412789037964, + "grad_norm": 2.274843454360962, + "learning_rate": 4.979277572681091e-05, + "loss": 5.693, + "step": 6897 + }, + { + "epoch": 0.041024360072318966, + "grad_norm": 2.2715282440185547, + "learning_rate": 4.979271570580186e-05, + "loss": 5.9952, + "step": 6898 + }, + { + "epoch": 0.04103030735559996, + "grad_norm": 2.4459903240203857, + "learning_rate": 4.9792655676137943e-05, + "loss": 6.0305, + "step": 6899 + }, + { + "epoch": 0.041036254638880956, + "grad_norm": 2.8737339973449707, + "learning_rate": 4.9792595637819165e-05, + "loss": 6.0982, + "step": 6900 + }, + { + "epoch": 0.04104220192216196, + "grad_norm": 2.382143974304199, + "learning_rate": 4.979253559084553e-05, + "loss": 5.6122, + "step": 6901 + }, + { + "epoch": 0.04104814920544295, + "grad_norm": 2.4127237796783447, + "learning_rate": 4.97924755352171e-05, + "loss": 5.7723, + "step": 6902 + }, + { + "epoch": 0.04105409648872395, + "grad_norm": 2.3108956813812256, + "learning_rate": 4.979241547093386e-05, + "loss": 6.1655, + "step": 6903 + }, + { + "epoch": 0.04106004377200495, + "grad_norm": 2.250555992126465, + "learning_rate": 4.979235539799584e-05, + "loss": 6.0627, + "step": 6904 + }, + { + "epoch": 0.041065991055285946, + "grad_norm": 2.187957525253296, + "learning_rate": 4.979229531640307e-05, + "loss": 6.1438, + "step": 6905 + }, + { + "epoch": 0.04107193833856694, + "grad_norm": 1.9089539051055908, + "learning_rate": 4.979223522615557e-05, + "loss": 6.1431, + "step": 6906 + }, + { + "epoch": 0.04107788562184794, + "grad_norm": 2.343569040298462, + "learning_rate": 4.979217512725336e-05, + "loss": 5.9774, + "step": 6907 + }, + { + "epoch": 0.04108383290512894, + "grad_norm": 2.759631633758545, + "learning_rate": 4.979211501969645e-05, + "loss": 5.7982, + "step": 6908 + }, + { + "epoch": 0.04108978018840993, + "grad_norm": 2.295811414718628, + "learning_rate": 4.979205490348487e-05, + "loss": 6.0843, + "step": 6909 + }, + { + "epoch": 0.041095727471690935, + "grad_norm": 2.6259605884552, + "learning_rate": 4.979199477861864e-05, + "loss": 5.6498, + "step": 6910 + }, + { + "epoch": 0.04110167475497193, + "grad_norm": 2.396895408630371, + "learning_rate": 4.9791934645097785e-05, + "loss": 5.9936, + "step": 6911 + }, + { + "epoch": 0.041107622038252925, + "grad_norm": 2.020845651626587, + "learning_rate": 4.979187450292231e-05, + "loss": 5.4867, + "step": 6912 + }, + { + "epoch": 0.04111356932153392, + "grad_norm": 2.6473753452301025, + "learning_rate": 4.979181435209226e-05, + "loss": 5.3556, + "step": 6913 + }, + { + "epoch": 0.04111951660481492, + "grad_norm": 2.353158712387085, + "learning_rate": 4.9791754192607636e-05, + "loss": 6.3122, + "step": 6914 + }, + { + "epoch": 0.04112546388809592, + "grad_norm": 2.499817132949829, + "learning_rate": 4.9791694024468474e-05, + "loss": 5.816, + "step": 6915 + }, + { + "epoch": 0.04113141117137691, + "grad_norm": 2.009239673614502, + "learning_rate": 4.979163384767478e-05, + "loss": 5.5982, + "step": 6916 + }, + { + "epoch": 0.041137358454657914, + "grad_norm": 2.3885819911956787, + "learning_rate": 4.9791573662226586e-05, + "loss": 5.7403, + "step": 6917 + }, + { + "epoch": 0.04114330573793891, + "grad_norm": 2.3135135173797607, + "learning_rate": 4.979151346812391e-05, + "loss": 5.3151, + "step": 6918 + }, + { + "epoch": 0.041149253021219905, + "grad_norm": 1.9801241159439087, + "learning_rate": 4.979145326536677e-05, + "loss": 5.5148, + "step": 6919 + }, + { + "epoch": 0.04115520030450091, + "grad_norm": 2.0724904537200928, + "learning_rate": 4.979139305395519e-05, + "loss": 5.5355, + "step": 6920 + }, + { + "epoch": 0.0411611475877819, + "grad_norm": 1.8104170560836792, + "learning_rate": 4.97913328338892e-05, + "loss": 5.4861, + "step": 6921 + }, + { + "epoch": 0.0411670948710629, + "grad_norm": 1.81072998046875, + "learning_rate": 4.9791272605168804e-05, + "loss": 5.5075, + "step": 6922 + }, + { + "epoch": 0.0411730421543439, + "grad_norm": 1.709191083908081, + "learning_rate": 4.979121236779403e-05, + "loss": 6.1353, + "step": 6923 + }, + { + "epoch": 0.041178989437624894, + "grad_norm": 2.004974126815796, + "learning_rate": 4.9791152121764903e-05, + "loss": 5.478, + "step": 6924 + }, + { + "epoch": 0.04118493672090589, + "grad_norm": 1.937933325767517, + "learning_rate": 4.979109186708144e-05, + "loss": 5.4022, + "step": 6925 + }, + { + "epoch": 0.041190884004186884, + "grad_norm": 1.9453305006027222, + "learning_rate": 4.979103160374367e-05, + "loss": 5.243, + "step": 6926 + }, + { + "epoch": 0.041196831287467886, + "grad_norm": 1.8552072048187256, + "learning_rate": 4.979097133175159e-05, + "loss": 5.3104, + "step": 6927 + }, + { + "epoch": 0.04120277857074888, + "grad_norm": 1.9148203134536743, + "learning_rate": 4.9790911051105246e-05, + "loss": 5.5538, + "step": 6928 + }, + { + "epoch": 0.041208725854029876, + "grad_norm": 1.9658032655715942, + "learning_rate": 4.979085076180466e-05, + "loss": 5.5285, + "step": 6929 + }, + { + "epoch": 0.04121467313731088, + "grad_norm": 1.7332781553268433, + "learning_rate": 4.9790790463849835e-05, + "loss": 5.1959, + "step": 6930 + }, + { + "epoch": 0.04122062042059187, + "grad_norm": 1.5762557983398438, + "learning_rate": 4.9790730157240804e-05, + "loss": 5.3672, + "step": 6931 + }, + { + "epoch": 0.04122656770387287, + "grad_norm": 1.7899656295776367, + "learning_rate": 4.979066984197759e-05, + "loss": 5.3588, + "step": 6932 + }, + { + "epoch": 0.04123251498715387, + "grad_norm": 1.5992622375488281, + "learning_rate": 4.97906095180602e-05, + "loss": 5.275, + "step": 6933 + }, + { + "epoch": 0.041238462270434866, + "grad_norm": 1.875116229057312, + "learning_rate": 4.9790549185488666e-05, + "loss": 5.3428, + "step": 6934 + }, + { + "epoch": 0.04124440955371586, + "grad_norm": 1.8110510110855103, + "learning_rate": 4.979048884426301e-05, + "loss": 5.2416, + "step": 6935 + }, + { + "epoch": 0.04125035683699686, + "grad_norm": 1.5512267351150513, + "learning_rate": 4.979042849438325e-05, + "loss": 5.3643, + "step": 6936 + }, + { + "epoch": 0.04125630412027786, + "grad_norm": 1.8929630517959595, + "learning_rate": 4.979036813584941e-05, + "loss": 5.4232, + "step": 6937 + }, + { + "epoch": 0.04126225140355885, + "grad_norm": 1.8569291830062866, + "learning_rate": 4.9790307768661504e-05, + "loss": 5.2949, + "step": 6938 + }, + { + "epoch": 0.041268198686839855, + "grad_norm": 1.6058611869812012, + "learning_rate": 4.9790247392819564e-05, + "loss": 5.3736, + "step": 6939 + }, + { + "epoch": 0.04127414597012085, + "grad_norm": 1.8455227613449097, + "learning_rate": 4.97901870083236e-05, + "loss": 5.2768, + "step": 6940 + }, + { + "epoch": 0.041280093253401845, + "grad_norm": 1.9346935749053955, + "learning_rate": 4.979012661517364e-05, + "loss": 5.4316, + "step": 6941 + }, + { + "epoch": 0.04128604053668284, + "grad_norm": 1.8085594177246094, + "learning_rate": 4.97900662133697e-05, + "loss": 5.365, + "step": 6942 + }, + { + "epoch": 0.04129198781996384, + "grad_norm": 1.73456871509552, + "learning_rate": 4.9790005802911804e-05, + "loss": 5.2726, + "step": 6943 + }, + { + "epoch": 0.04129793510324484, + "grad_norm": 2.1071617603302, + "learning_rate": 4.978994538379997e-05, + "loss": 6.2313, + "step": 6944 + }, + { + "epoch": 0.04130388238652583, + "grad_norm": 1.7098963260650635, + "learning_rate": 4.978988495603423e-05, + "loss": 5.3162, + "step": 6945 + }, + { + "epoch": 0.041309829669806834, + "grad_norm": 1.8131905794143677, + "learning_rate": 4.978982451961459e-05, + "loss": 5.2486, + "step": 6946 + }, + { + "epoch": 0.04131577695308783, + "grad_norm": 1.8162381649017334, + "learning_rate": 4.978976407454109e-05, + "loss": 5.2806, + "step": 6947 + }, + { + "epoch": 0.041321724236368824, + "grad_norm": 1.9250297546386719, + "learning_rate": 4.9789703620813734e-05, + "loss": 5.1742, + "step": 6948 + }, + { + "epoch": 0.041327671519649826, + "grad_norm": 1.8263678550720215, + "learning_rate": 4.978964315843254e-05, + "loss": 5.1786, + "step": 6949 + }, + { + "epoch": 0.04133361880293082, + "grad_norm": 1.6751807928085327, + "learning_rate": 4.9789582687397546e-05, + "loss": 5.4798, + "step": 6950 + }, + { + "epoch": 0.04133956608621182, + "grad_norm": 1.7842947244644165, + "learning_rate": 4.9789522207708764e-05, + "loss": 5.201, + "step": 6951 + }, + { + "epoch": 0.04134551336949282, + "grad_norm": 1.6785067319869995, + "learning_rate": 4.978946171936621e-05, + "loss": 5.3852, + "step": 6952 + }, + { + "epoch": 0.041351460652773814, + "grad_norm": 1.5475291013717651, + "learning_rate": 4.978940122236992e-05, + "loss": 5.4083, + "step": 6953 + }, + { + "epoch": 0.04135740793605481, + "grad_norm": 1.7445106506347656, + "learning_rate": 4.97893407167199e-05, + "loss": 5.3125, + "step": 6954 + }, + { + "epoch": 0.041363355219335804, + "grad_norm": 1.7334082126617432, + "learning_rate": 4.9789280202416175e-05, + "loss": 5.5388, + "step": 6955 + }, + { + "epoch": 0.041369302502616806, + "grad_norm": 1.7267119884490967, + "learning_rate": 4.9789219679458774e-05, + "loss": 5.5175, + "step": 6956 + }, + { + "epoch": 0.0413752497858978, + "grad_norm": 1.8033246994018555, + "learning_rate": 4.978915914784771e-05, + "loss": 5.3523, + "step": 6957 + }, + { + "epoch": 0.041381197069178796, + "grad_norm": 1.9836528301239014, + "learning_rate": 4.978909860758301e-05, + "loss": 5.3808, + "step": 6958 + }, + { + "epoch": 0.0413871443524598, + "grad_norm": 1.6260416507720947, + "learning_rate": 4.978903805866469e-05, + "loss": 5.4642, + "step": 6959 + }, + { + "epoch": 0.04139309163574079, + "grad_norm": 1.7260626554489136, + "learning_rate": 4.978897750109277e-05, + "loss": 5.4975, + "step": 6960 + }, + { + "epoch": 0.04139903891902179, + "grad_norm": 1.6948668956756592, + "learning_rate": 4.978891693486728e-05, + "loss": 5.5768, + "step": 6961 + }, + { + "epoch": 0.04140498620230279, + "grad_norm": 1.7885476350784302, + "learning_rate": 4.978885635998824e-05, + "loss": 5.4156, + "step": 6962 + }, + { + "epoch": 0.041410933485583785, + "grad_norm": 1.8626813888549805, + "learning_rate": 4.978879577645565e-05, + "loss": 5.354, + "step": 6963 + }, + { + "epoch": 0.04141688076886478, + "grad_norm": 1.867090106010437, + "learning_rate": 4.9788735184269553e-05, + "loss": 5.2934, + "step": 6964 + }, + { + "epoch": 0.04142282805214578, + "grad_norm": 1.7208340167999268, + "learning_rate": 4.9788674583429974e-05, + "loss": 5.2116, + "step": 6965 + }, + { + "epoch": 0.04142877533542678, + "grad_norm": 1.934480905532837, + "learning_rate": 4.9788613973936916e-05, + "loss": 5.5801, + "step": 6966 + }, + { + "epoch": 0.04143472261870777, + "grad_norm": 1.6263724565505981, + "learning_rate": 4.978855335579041e-05, + "loss": 5.3835, + "step": 6967 + }, + { + "epoch": 0.041440669901988775, + "grad_norm": 1.743996262550354, + "learning_rate": 4.9788492728990474e-05, + "loss": 5.3281, + "step": 6968 + }, + { + "epoch": 0.04144661718526977, + "grad_norm": 1.5556843280792236, + "learning_rate": 4.978843209353714e-05, + "loss": 5.442, + "step": 6969 + }, + { + "epoch": 0.041452564468550765, + "grad_norm": 1.5540435314178467, + "learning_rate": 4.978837144943041e-05, + "loss": 5.3621, + "step": 6970 + }, + { + "epoch": 0.04145851175183176, + "grad_norm": 1.7884414196014404, + "learning_rate": 4.9788310796670326e-05, + "loss": 5.571, + "step": 6971 + }, + { + "epoch": 0.04146445903511276, + "grad_norm": 1.7550957202911377, + "learning_rate": 4.9788250135256886e-05, + "loss": 5.61, + "step": 6972 + }, + { + "epoch": 0.04147040631839376, + "grad_norm": 1.9336804151535034, + "learning_rate": 4.978818946519013e-05, + "loss": 5.6142, + "step": 6973 + }, + { + "epoch": 0.04147635360167475, + "grad_norm": 1.8888505697250366, + "learning_rate": 4.978812878647008e-05, + "loss": 5.4908, + "step": 6974 + }, + { + "epoch": 0.041482300884955754, + "grad_norm": 1.940371036529541, + "learning_rate": 4.978806809909674e-05, + "loss": 5.5407, + "step": 6975 + }, + { + "epoch": 0.04148824816823675, + "grad_norm": 2.0182151794433594, + "learning_rate": 4.9788007403070146e-05, + "loss": 5.3643, + "step": 6976 + }, + { + "epoch": 0.041494195451517744, + "grad_norm": 1.7960541248321533, + "learning_rate": 4.978794669839032e-05, + "loss": 5.4994, + "step": 6977 + }, + { + "epoch": 0.041500142734798746, + "grad_norm": 1.8403207063674927, + "learning_rate": 4.978788598505727e-05, + "loss": 5.4501, + "step": 6978 + }, + { + "epoch": 0.04150609001807974, + "grad_norm": 1.7232698202133179, + "learning_rate": 4.978782526307103e-05, + "loss": 5.5406, + "step": 6979 + }, + { + "epoch": 0.04151203730136074, + "grad_norm": 1.7003169059753418, + "learning_rate": 4.9787764532431615e-05, + "loss": 5.3427, + "step": 6980 + }, + { + "epoch": 0.04151798458464174, + "grad_norm": 2.041384696960449, + "learning_rate": 4.978770379313904e-05, + "loss": 5.5121, + "step": 6981 + }, + { + "epoch": 0.041523931867922734, + "grad_norm": 1.5773900747299194, + "learning_rate": 4.978764304519334e-05, + "loss": 5.4604, + "step": 6982 + }, + { + "epoch": 0.04152987915120373, + "grad_norm": 1.8834172487258911, + "learning_rate": 4.9787582288594535e-05, + "loss": 5.5141, + "step": 6983 + }, + { + "epoch": 0.04153582643448473, + "grad_norm": 1.7956576347351074, + "learning_rate": 4.978752152334264e-05, + "loss": 5.5664, + "step": 6984 + }, + { + "epoch": 0.041541773717765726, + "grad_norm": 1.8676495552062988, + "learning_rate": 4.978746074943767e-05, + "loss": 5.2846, + "step": 6985 + }, + { + "epoch": 0.04154772100104672, + "grad_norm": 1.7709665298461914, + "learning_rate": 4.9787399966879654e-05, + "loss": 5.3375, + "step": 6986 + }, + { + "epoch": 0.041553668284327716, + "grad_norm": 2.012941837310791, + "learning_rate": 4.978733917566862e-05, + "loss": 5.6973, + "step": 6987 + }, + { + "epoch": 0.04155961556760872, + "grad_norm": 1.8220570087432861, + "learning_rate": 4.978727837580458e-05, + "loss": 5.191, + "step": 6988 + }, + { + "epoch": 0.04156556285088971, + "grad_norm": 1.6511586904525757, + "learning_rate": 4.978721756728755e-05, + "loss": 5.2787, + "step": 6989 + }, + { + "epoch": 0.04157151013417071, + "grad_norm": 1.9026141166687012, + "learning_rate": 4.978715675011757e-05, + "loss": 5.4456, + "step": 6990 + }, + { + "epoch": 0.04157745741745171, + "grad_norm": 1.8649898767471313, + "learning_rate": 4.9787095924294633e-05, + "loss": 5.5013, + "step": 6991 + }, + { + "epoch": 0.041583404700732705, + "grad_norm": 1.8720741271972656, + "learning_rate": 4.978703508981879e-05, + "loss": 5.3952, + "step": 6992 + }, + { + "epoch": 0.0415893519840137, + "grad_norm": 1.817356824874878, + "learning_rate": 4.978697424669005e-05, + "loss": 5.4719, + "step": 6993 + }, + { + "epoch": 0.0415952992672947, + "grad_norm": 1.740702509880066, + "learning_rate": 4.978691339490843e-05, + "loss": 5.6484, + "step": 6994 + }, + { + "epoch": 0.0416012465505757, + "grad_norm": 1.8752427101135254, + "learning_rate": 4.978685253447395e-05, + "loss": 5.6394, + "step": 6995 + }, + { + "epoch": 0.04160719383385669, + "grad_norm": 1.8180509805679321, + "learning_rate": 4.978679166538665e-05, + "loss": 5.3401, + "step": 6996 + }, + { + "epoch": 0.041613141117137695, + "grad_norm": 1.9002251625061035, + "learning_rate": 4.9786730787646516e-05, + "loss": 5.3237, + "step": 6997 + }, + { + "epoch": 0.04161908840041869, + "grad_norm": 1.741176724433899, + "learning_rate": 4.978666990125361e-05, + "loss": 5.2311, + "step": 6998 + }, + { + "epoch": 0.041625035683699685, + "grad_norm": 2.0994246006011963, + "learning_rate": 4.9786609006207925e-05, + "loss": 5.3549, + "step": 6999 + }, + { + "epoch": 0.04163098296698068, + "grad_norm": 1.8438987731933594, + "learning_rate": 4.978654810250949e-05, + "loss": 5.4322, + "step": 7000 + }, + { + "epoch": 0.04163693025026168, + "grad_norm": 1.7411181926727295, + "learning_rate": 4.978648719015833e-05, + "loss": 5.455, + "step": 7001 + }, + { + "epoch": 0.04164287753354268, + "grad_norm": 1.6879174709320068, + "learning_rate": 4.978642626915446e-05, + "loss": 5.3676, + "step": 7002 + }, + { + "epoch": 0.04164882481682367, + "grad_norm": 1.8912461996078491, + "learning_rate": 4.9786365339497906e-05, + "loss": 5.6181, + "step": 7003 + }, + { + "epoch": 0.041654772100104674, + "grad_norm": 1.9234617948532104, + "learning_rate": 4.978630440118869e-05, + "loss": 5.5388, + "step": 7004 + }, + { + "epoch": 0.04166071938338567, + "grad_norm": 2.1059048175811768, + "learning_rate": 4.9786243454226824e-05, + "loss": 5.6856, + "step": 7005 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 2.1900687217712402, + "learning_rate": 4.9786182498612347e-05, + "loss": 6.2426, + "step": 7006 + }, + { + "epoch": 0.041672613949947666, + "grad_norm": 1.7580265998840332, + "learning_rate": 4.9786121534345265e-05, + "loss": 5.2342, + "step": 7007 + }, + { + "epoch": 0.04167856123322866, + "grad_norm": 1.4747200012207031, + "learning_rate": 4.97860605614256e-05, + "loss": 5.1977, + "step": 7008 + }, + { + "epoch": 0.04168450851650966, + "grad_norm": 1.8164165019989014, + "learning_rate": 4.978599957985338e-05, + "loss": 5.1362, + "step": 7009 + }, + { + "epoch": 0.04169045579979066, + "grad_norm": 1.468550443649292, + "learning_rate": 4.978593858962863e-05, + "loss": 5.1265, + "step": 7010 + }, + { + "epoch": 0.041696403083071654, + "grad_norm": 1.584343433380127, + "learning_rate": 4.9785877590751356e-05, + "loss": 5.2611, + "step": 7011 + }, + { + "epoch": 0.04170235036635265, + "grad_norm": 1.7864785194396973, + "learning_rate": 4.978581658322159e-05, + "loss": 5.5214, + "step": 7012 + }, + { + "epoch": 0.04170829764963365, + "grad_norm": 1.8359016180038452, + "learning_rate": 4.978575556703936e-05, + "loss": 5.3808, + "step": 7013 + }, + { + "epoch": 0.041714244932914646, + "grad_norm": 1.8298325538635254, + "learning_rate": 4.978569454220467e-05, + "loss": 5.5606, + "step": 7014 + }, + { + "epoch": 0.04172019221619564, + "grad_norm": 2.1555540561676025, + "learning_rate": 4.978563350871755e-05, + "loss": 5.6592, + "step": 7015 + }, + { + "epoch": 0.041726139499476636, + "grad_norm": 2.5251846313476562, + "learning_rate": 4.9785572466578026e-05, + "loss": 5.5771, + "step": 7016 + }, + { + "epoch": 0.04173208678275764, + "grad_norm": 1.7765661478042603, + "learning_rate": 4.9785511415786115e-05, + "loss": 5.5558, + "step": 7017 + }, + { + "epoch": 0.04173803406603863, + "grad_norm": 1.9711554050445557, + "learning_rate": 4.978545035634183e-05, + "loss": 5.5565, + "step": 7018 + }, + { + "epoch": 0.04174398134931963, + "grad_norm": 1.8080202341079712, + "learning_rate": 4.978538928824521e-05, + "loss": 5.5037, + "step": 7019 + }, + { + "epoch": 0.04174992863260063, + "grad_norm": 1.7506872415542603, + "learning_rate": 4.978532821149626e-05, + "loss": 5.3362, + "step": 7020 + }, + { + "epoch": 0.041755875915881625, + "grad_norm": 1.5606149435043335, + "learning_rate": 4.978526712609501e-05, + "loss": 5.3541, + "step": 7021 + }, + { + "epoch": 0.04176182319916262, + "grad_norm": 1.8840737342834473, + "learning_rate": 4.9785206032041476e-05, + "loss": 5.2315, + "step": 7022 + }, + { + "epoch": 0.04176777048244362, + "grad_norm": 2.118178606033325, + "learning_rate": 4.978514492933569e-05, + "loss": 5.6174, + "step": 7023 + }, + { + "epoch": 0.04177371776572462, + "grad_norm": 2.043907403945923, + "learning_rate": 4.978508381797766e-05, + "loss": 5.6272, + "step": 7024 + }, + { + "epoch": 0.04177966504900561, + "grad_norm": 1.764411211013794, + "learning_rate": 4.978502269796742e-05, + "loss": 5.6153, + "step": 7025 + }, + { + "epoch": 0.041785612332286615, + "grad_norm": 1.5760626792907715, + "learning_rate": 4.978496156930498e-05, + "loss": 5.5734, + "step": 7026 + }, + { + "epoch": 0.04179155961556761, + "grad_norm": 1.8857802152633667, + "learning_rate": 4.9784900431990366e-05, + "loss": 5.5295, + "step": 7027 + }, + { + "epoch": 0.041797506898848605, + "grad_norm": 1.7287275791168213, + "learning_rate": 4.97848392860236e-05, + "loss": 5.3175, + "step": 7028 + }, + { + "epoch": 0.0418034541821296, + "grad_norm": 1.915263295173645, + "learning_rate": 4.97847781314047e-05, + "loss": 5.4838, + "step": 7029 + }, + { + "epoch": 0.0418094014654106, + "grad_norm": 2.049435615539551, + "learning_rate": 4.97847169681337e-05, + "loss": 5.5508, + "step": 7030 + }, + { + "epoch": 0.0418153487486916, + "grad_norm": 1.8955415487289429, + "learning_rate": 4.97846557962106e-05, + "loss": 5.4618, + "step": 7031 + }, + { + "epoch": 0.04182129603197259, + "grad_norm": 1.8957183361053467, + "learning_rate": 4.978459461563543e-05, + "loss": 5.5293, + "step": 7032 + }, + { + "epoch": 0.041827243315253594, + "grad_norm": 2.050734043121338, + "learning_rate": 4.978453342640822e-05, + "loss": 5.8002, + "step": 7033 + }, + { + "epoch": 0.04183319059853459, + "grad_norm": 1.9867476224899292, + "learning_rate": 4.978447222852899e-05, + "loss": 5.466, + "step": 7034 + }, + { + "epoch": 0.041839137881815584, + "grad_norm": 1.7928507328033447, + "learning_rate": 4.978441102199775e-05, + "loss": 5.3312, + "step": 7035 + }, + { + "epoch": 0.041845085165096586, + "grad_norm": 1.7984018325805664, + "learning_rate": 4.978434980681453e-05, + "loss": 5.2936, + "step": 7036 + }, + { + "epoch": 0.04185103244837758, + "grad_norm": 1.8011672496795654, + "learning_rate": 4.9784288582979355e-05, + "loss": 5.484, + "step": 7037 + }, + { + "epoch": 0.041856979731658576, + "grad_norm": 1.9439928531646729, + "learning_rate": 4.9784227350492236e-05, + "loss": 5.4563, + "step": 7038 + }, + { + "epoch": 0.04186292701493958, + "grad_norm": 1.71321439743042, + "learning_rate": 4.97841661093532e-05, + "loss": 5.3909, + "step": 7039 + }, + { + "epoch": 0.041868874298220574, + "grad_norm": 1.629333734512329, + "learning_rate": 4.9784104859562266e-05, + "loss": 5.3112, + "step": 7040 + }, + { + "epoch": 0.04187482158150157, + "grad_norm": 1.5248417854309082, + "learning_rate": 4.9784043601119456e-05, + "loss": 5.3724, + "step": 7041 + }, + { + "epoch": 0.04188076886478257, + "grad_norm": 1.8886220455169678, + "learning_rate": 4.97839823340248e-05, + "loss": 5.443, + "step": 7042 + }, + { + "epoch": 0.041886716148063566, + "grad_norm": 1.5902595520019531, + "learning_rate": 4.9783921058278307e-05, + "loss": 5.4249, + "step": 7043 + }, + { + "epoch": 0.04189266343134456, + "grad_norm": 1.837579369544983, + "learning_rate": 4.978385977388e-05, + "loss": 5.3767, + "step": 7044 + }, + { + "epoch": 0.041898610714625556, + "grad_norm": 1.8306061029434204, + "learning_rate": 4.9783798480829905e-05, + "loss": 5.4206, + "step": 7045 + }, + { + "epoch": 0.04190455799790656, + "grad_norm": 1.6887965202331543, + "learning_rate": 4.9783737179128044e-05, + "loss": 5.5327, + "step": 7046 + }, + { + "epoch": 0.04191050528118755, + "grad_norm": 1.8081728219985962, + "learning_rate": 4.978367586877444e-05, + "loss": 5.4547, + "step": 7047 + }, + { + "epoch": 0.04191645256446855, + "grad_norm": 1.8341114521026611, + "learning_rate": 4.97836145497691e-05, + "loss": 5.4175, + "step": 7048 + }, + { + "epoch": 0.04192239984774955, + "grad_norm": 1.965240240097046, + "learning_rate": 4.978355322211207e-05, + "loss": 5.4253, + "step": 7049 + }, + { + "epoch": 0.041928347131030545, + "grad_norm": 1.7060484886169434, + "learning_rate": 4.9783491885803343e-05, + "loss": 5.3493, + "step": 7050 + }, + { + "epoch": 0.04193429441431154, + "grad_norm": 1.8203076124191284, + "learning_rate": 4.978343054084297e-05, + "loss": 5.4601, + "step": 7051 + }, + { + "epoch": 0.04194024169759254, + "grad_norm": 1.919954538345337, + "learning_rate": 4.9783369187230945e-05, + "loss": 5.4921, + "step": 7052 + }, + { + "epoch": 0.04194618898087354, + "grad_norm": 1.4519730806350708, + "learning_rate": 4.9783307824967306e-05, + "loss": 5.4922, + "step": 7053 + }, + { + "epoch": 0.04195213626415453, + "grad_norm": 1.8431898355484009, + "learning_rate": 4.9783246454052066e-05, + "loss": 5.384, + "step": 7054 + }, + { + "epoch": 0.041958083547435535, + "grad_norm": 1.5493370294570923, + "learning_rate": 4.978318507448526e-05, + "loss": 5.5294, + "step": 7055 + }, + { + "epoch": 0.04196403083071653, + "grad_norm": 1.6405844688415527, + "learning_rate": 4.97831236862669e-05, + "loss": 5.492, + "step": 7056 + }, + { + "epoch": 0.041969978113997525, + "grad_norm": 1.7830392122268677, + "learning_rate": 4.9783062289396996e-05, + "loss": 5.2977, + "step": 7057 + }, + { + "epoch": 0.04197592539727852, + "grad_norm": 1.8268102407455444, + "learning_rate": 4.9783000883875595e-05, + "loss": 5.3396, + "step": 7058 + }, + { + "epoch": 0.04198187268055952, + "grad_norm": 1.942901849746704, + "learning_rate": 4.9782939469702694e-05, + "loss": 5.3338, + "step": 7059 + }, + { + "epoch": 0.04198781996384052, + "grad_norm": 1.5793414115905762, + "learning_rate": 4.9782878046878334e-05, + "loss": 5.3286, + "step": 7060 + }, + { + "epoch": 0.04199376724712151, + "grad_norm": 1.5777463912963867, + "learning_rate": 4.9782816615402515e-05, + "loss": 5.2942, + "step": 7061 + }, + { + "epoch": 0.041999714530402514, + "grad_norm": 1.6393412351608276, + "learning_rate": 4.978275517527528e-05, + "loss": 5.2557, + "step": 7062 + }, + { + "epoch": 0.04200566181368351, + "grad_norm": 1.9657515287399292, + "learning_rate": 4.978269372649664e-05, + "loss": 5.3875, + "step": 7063 + }, + { + "epoch": 0.042011609096964504, + "grad_norm": 2.1419737339019775, + "learning_rate": 4.9782632269066623e-05, + "loss": 5.2014, + "step": 7064 + }, + { + "epoch": 0.042017556380245506, + "grad_norm": 2.0425620079040527, + "learning_rate": 4.978257080298523e-05, + "loss": 5.194, + "step": 7065 + }, + { + "epoch": 0.0420235036635265, + "grad_norm": 1.7248409986495972, + "learning_rate": 4.978250932825251e-05, + "loss": 5.1922, + "step": 7066 + }, + { + "epoch": 0.042029450946807496, + "grad_norm": 1.8265177011489868, + "learning_rate": 4.978244784486847e-05, + "loss": 5.4474, + "step": 7067 + }, + { + "epoch": 0.0420353982300885, + "grad_norm": 1.803701400756836, + "learning_rate": 4.9782386352833134e-05, + "loss": 6.2155, + "step": 7068 + }, + { + "epoch": 0.042041345513369494, + "grad_norm": 1.9970064163208008, + "learning_rate": 4.978232485214652e-05, + "loss": 5.3622, + "step": 7069 + }, + { + "epoch": 0.04204729279665049, + "grad_norm": 1.7449073791503906, + "learning_rate": 4.978226334280865e-05, + "loss": 5.3146, + "step": 7070 + }, + { + "epoch": 0.04205324007993149, + "grad_norm": 2.0284547805786133, + "learning_rate": 4.978220182481955e-05, + "loss": 5.0169, + "step": 7071 + }, + { + "epoch": 0.042059187363212486, + "grad_norm": 1.6801714897155762, + "learning_rate": 4.978214029817924e-05, + "loss": 5.1294, + "step": 7072 + }, + { + "epoch": 0.04206513464649348, + "grad_norm": 2.160585641860962, + "learning_rate": 4.978207876288774e-05, + "loss": 5.072, + "step": 7073 + }, + { + "epoch": 0.042071081929774476, + "grad_norm": 2.07739520072937, + "learning_rate": 4.978201721894508e-05, + "loss": 5.2065, + "step": 7074 + }, + { + "epoch": 0.04207702921305548, + "grad_norm": 2.1396286487579346, + "learning_rate": 4.978195566635127e-05, + "loss": 5.1066, + "step": 7075 + }, + { + "epoch": 0.04208297649633647, + "grad_norm": 1.883280634880066, + "learning_rate": 4.978189410510633e-05, + "loss": 5.2842, + "step": 7076 + }, + { + "epoch": 0.04208892377961747, + "grad_norm": 1.9917101860046387, + "learning_rate": 4.978183253521029e-05, + "loss": 5.0799, + "step": 7077 + }, + { + "epoch": 0.04209487106289847, + "grad_norm": 1.9387022256851196, + "learning_rate": 4.9781770956663164e-05, + "loss": 5.1898, + "step": 7078 + }, + { + "epoch": 0.042100818346179465, + "grad_norm": 1.9767060279846191, + "learning_rate": 4.978170936946498e-05, + "loss": 5.0692, + "step": 7079 + }, + { + "epoch": 0.04210676562946046, + "grad_norm": 2.0076138973236084, + "learning_rate": 4.978164777361576e-05, + "loss": 5.0255, + "step": 7080 + }, + { + "epoch": 0.04211271291274146, + "grad_norm": 1.8253445625305176, + "learning_rate": 4.978158616911552e-05, + "loss": 5.0111, + "step": 7081 + }, + { + "epoch": 0.04211866019602246, + "grad_norm": 1.6551930904388428, + "learning_rate": 4.978152455596429e-05, + "loss": 4.9849, + "step": 7082 + }, + { + "epoch": 0.04212460747930345, + "grad_norm": 1.8462406396865845, + "learning_rate": 4.9781462934162084e-05, + "loss": 5.0862, + "step": 7083 + }, + { + "epoch": 0.042130554762584455, + "grad_norm": 2.0828206539154053, + "learning_rate": 4.978140130370892e-05, + "loss": 5.031, + "step": 7084 + }, + { + "epoch": 0.04213650204586545, + "grad_norm": 1.7917357683181763, + "learning_rate": 4.978133966460483e-05, + "loss": 5.0028, + "step": 7085 + }, + { + "epoch": 0.042142449329146445, + "grad_norm": 1.7324126958847046, + "learning_rate": 4.9781278016849834e-05, + "loss": 4.9759, + "step": 7086 + }, + { + "epoch": 0.04214839661242744, + "grad_norm": 1.8673282861709595, + "learning_rate": 4.978121636044394e-05, + "loss": 5.3631, + "step": 7087 + }, + { + "epoch": 0.04215434389570844, + "grad_norm": 1.7723935842514038, + "learning_rate": 4.9781154695387186e-05, + "loss": 5.3427, + "step": 7088 + }, + { + "epoch": 0.04216029117898944, + "grad_norm": 1.4671146869659424, + "learning_rate": 4.978109302167958e-05, + "loss": 5.3003, + "step": 7089 + }, + { + "epoch": 0.04216623846227043, + "grad_norm": 1.9667481184005737, + "learning_rate": 4.9781031339321156e-05, + "loss": 5.0957, + "step": 7090 + }, + { + "epoch": 0.042172185745551434, + "grad_norm": 1.8162986040115356, + "learning_rate": 4.978096964831193e-05, + "loss": 5.1472, + "step": 7091 + }, + { + "epoch": 0.04217813302883243, + "grad_norm": 1.7793545722961426, + "learning_rate": 4.9780907948651926e-05, + "loss": 5.1771, + "step": 7092 + }, + { + "epoch": 0.042184080312113424, + "grad_norm": 1.8093308210372925, + "learning_rate": 4.9780846240341156e-05, + "loss": 5.1611, + "step": 7093 + }, + { + "epoch": 0.042190027595394426, + "grad_norm": 1.7010010480880737, + "learning_rate": 4.978078452337965e-05, + "loss": 5.4478, + "step": 7094 + }, + { + "epoch": 0.04219597487867542, + "grad_norm": 1.7978744506835938, + "learning_rate": 4.9780722797767434e-05, + "loss": 5.4443, + "step": 7095 + }, + { + "epoch": 0.042201922161956416, + "grad_norm": 1.4861794710159302, + "learning_rate": 4.9780661063504516e-05, + "loss": 5.3773, + "step": 7096 + }, + { + "epoch": 0.04220786944523742, + "grad_norm": 1.7805769443511963, + "learning_rate": 4.978059932059093e-05, + "loss": 5.0896, + "step": 7097 + }, + { + "epoch": 0.042213816728518413, + "grad_norm": 1.7392783164978027, + "learning_rate": 4.9780537569026695e-05, + "loss": 5.0602, + "step": 7098 + }, + { + "epoch": 0.04221976401179941, + "grad_norm": 1.8742554187774658, + "learning_rate": 4.978047580881182e-05, + "loss": 5.2595, + "step": 7099 + }, + { + "epoch": 0.04222571129508041, + "grad_norm": 1.6077641248703003, + "learning_rate": 4.978041403994635e-05, + "loss": 5.0925, + "step": 7100 + }, + { + "epoch": 0.042231658578361406, + "grad_norm": 1.7536481618881226, + "learning_rate": 4.9780352262430286e-05, + "loss": 5.2546, + "step": 7101 + }, + { + "epoch": 0.0422376058616424, + "grad_norm": 1.6404869556427002, + "learning_rate": 4.9780290476263656e-05, + "loss": 5.1349, + "step": 7102 + }, + { + "epoch": 0.042243553144923396, + "grad_norm": 1.7223635911941528, + "learning_rate": 4.978022868144649e-05, + "loss": 5.2894, + "step": 7103 + }, + { + "epoch": 0.0422495004282044, + "grad_norm": 1.7856663465499878, + "learning_rate": 4.9780166877978796e-05, + "loss": 5.384, + "step": 7104 + }, + { + "epoch": 0.04225544771148539, + "grad_norm": 1.6434816122055054, + "learning_rate": 4.978010506586061e-05, + "loss": 5.257, + "step": 7105 + }, + { + "epoch": 0.04226139499476639, + "grad_norm": 1.668371558189392, + "learning_rate": 4.9780043245091936e-05, + "loss": 5.2698, + "step": 7106 + }, + { + "epoch": 0.04226734227804739, + "grad_norm": 1.7553619146347046, + "learning_rate": 4.97799814156728e-05, + "loss": 5.1591, + "step": 7107 + }, + { + "epoch": 0.042273289561328385, + "grad_norm": 1.6918652057647705, + "learning_rate": 4.977991957760324e-05, + "loss": 5.2727, + "step": 7108 + }, + { + "epoch": 0.04227923684460938, + "grad_norm": 1.6634269952774048, + "learning_rate": 4.977985773088326e-05, + "loss": 5.3099, + "step": 7109 + }, + { + "epoch": 0.04228518412789038, + "grad_norm": 2.131647825241089, + "learning_rate": 4.977979587551289e-05, + "loss": 5.0885, + "step": 7110 + }, + { + "epoch": 0.04229113141117138, + "grad_norm": 1.6632722616195679, + "learning_rate": 4.977973401149215e-05, + "loss": 5.1546, + "step": 7111 + }, + { + "epoch": 0.04229707869445237, + "grad_norm": 1.762418270111084, + "learning_rate": 4.977967213882107e-05, + "loss": 5.0884, + "step": 7112 + }, + { + "epoch": 0.042303025977733374, + "grad_norm": 1.9325755834579468, + "learning_rate": 4.977961025749964e-05, + "loss": 5.1857, + "step": 7113 + }, + { + "epoch": 0.04230897326101437, + "grad_norm": 1.8359284400939941, + "learning_rate": 4.9779548367527926e-05, + "loss": 5.165, + "step": 7114 + }, + { + "epoch": 0.042314920544295365, + "grad_norm": 1.8305978775024414, + "learning_rate": 4.977948646890591e-05, + "loss": 5.1347, + "step": 7115 + }, + { + "epoch": 0.04232086782757636, + "grad_norm": 1.7374697923660278, + "learning_rate": 4.9779424561633644e-05, + "loss": 5.5219, + "step": 7116 + }, + { + "epoch": 0.04232681511085736, + "grad_norm": 1.9947689771652222, + "learning_rate": 4.9779362645711135e-05, + "loss": 5.4445, + "step": 7117 + }, + { + "epoch": 0.04233276239413836, + "grad_norm": 1.6639795303344727, + "learning_rate": 4.97793007211384e-05, + "loss": 5.3798, + "step": 7118 + }, + { + "epoch": 0.04233870967741935, + "grad_norm": 1.6983096599578857, + "learning_rate": 4.977923878791547e-05, + "loss": 5.2847, + "step": 7119 + }, + { + "epoch": 0.042344656960700354, + "grad_norm": 1.7397092580795288, + "learning_rate": 4.9779176846042366e-05, + "loss": 5.3175, + "step": 7120 + }, + { + "epoch": 0.04235060424398135, + "grad_norm": 1.5255639553070068, + "learning_rate": 4.977911489551911e-05, + "loss": 5.2735, + "step": 7121 + }, + { + "epoch": 0.042356551527262344, + "grad_norm": 1.5646785497665405, + "learning_rate": 4.9779052936345715e-05, + "loss": 5.3892, + "step": 7122 + }, + { + "epoch": 0.042362498810543346, + "grad_norm": 1.7479640245437622, + "learning_rate": 4.977899096852221e-05, + "loss": 5.4341, + "step": 7123 + }, + { + "epoch": 0.04236844609382434, + "grad_norm": 1.6275604963302612, + "learning_rate": 4.9778928992048615e-05, + "loss": 5.5209, + "step": 7124 + }, + { + "epoch": 0.042374393377105336, + "grad_norm": 1.6917749643325806, + "learning_rate": 4.977886700692496e-05, + "loss": 5.5779, + "step": 7125 + }, + { + "epoch": 0.04238034066038634, + "grad_norm": 1.683716058731079, + "learning_rate": 4.977880501315125e-05, + "loss": 5.475, + "step": 7126 + }, + { + "epoch": 0.04238628794366733, + "grad_norm": 1.7665706872940063, + "learning_rate": 4.977874301072751e-05, + "loss": 5.3666, + "step": 7127 + }, + { + "epoch": 0.04239223522694833, + "grad_norm": 1.715329885482788, + "learning_rate": 4.977868099965377e-05, + "loss": 5.407, + "step": 7128 + }, + { + "epoch": 0.04239818251022933, + "grad_norm": 1.8468618392944336, + "learning_rate": 4.977861897993006e-05, + "loss": 5.328, + "step": 7129 + }, + { + "epoch": 0.042404129793510326, + "grad_norm": 1.59178626537323, + "learning_rate": 4.977855695155638e-05, + "loss": 5.7797, + "step": 7130 + }, + { + "epoch": 0.04241007707679132, + "grad_norm": 1.4733757972717285, + "learning_rate": 4.977849491453277e-05, + "loss": 5.3019, + "step": 7131 + }, + { + "epoch": 0.042416024360072316, + "grad_norm": 1.4632091522216797, + "learning_rate": 4.977843286885923e-05, + "loss": 5.1754, + "step": 7132 + }, + { + "epoch": 0.04242197164335332, + "grad_norm": 1.530564308166504, + "learning_rate": 4.97783708145358e-05, + "loss": 5.3613, + "step": 7133 + }, + { + "epoch": 0.04242791892663431, + "grad_norm": 1.954219102859497, + "learning_rate": 4.97783087515625e-05, + "loss": 5.4013, + "step": 7134 + }, + { + "epoch": 0.04243386620991531, + "grad_norm": 1.8276890516281128, + "learning_rate": 4.977824667993935e-05, + "loss": 5.3611, + "step": 7135 + }, + { + "epoch": 0.04243981349319631, + "grad_norm": 2.1430561542510986, + "learning_rate": 4.977818459966637e-05, + "loss": 5.1501, + "step": 7136 + }, + { + "epoch": 0.042445760776477305, + "grad_norm": 1.9150115251541138, + "learning_rate": 4.977812251074357e-05, + "loss": 5.1778, + "step": 7137 + }, + { + "epoch": 0.0424517080597583, + "grad_norm": 1.6958523988723755, + "learning_rate": 4.9778060413171004e-05, + "loss": 5.5029, + "step": 7138 + }, + { + "epoch": 0.0424576553430393, + "grad_norm": 1.7183772325515747, + "learning_rate": 4.977799830694866e-05, + "loss": 5.4323, + "step": 7139 + }, + { + "epoch": 0.0424636026263203, + "grad_norm": 1.717731237411499, + "learning_rate": 4.977793619207657e-05, + "loss": 5.3418, + "step": 7140 + }, + { + "epoch": 0.04246954990960129, + "grad_norm": 1.8155564069747925, + "learning_rate": 4.9777874068554766e-05, + "loss": 5.2865, + "step": 7141 + }, + { + "epoch": 0.042475497192882294, + "grad_norm": 1.9890762567520142, + "learning_rate": 4.9777811936383254e-05, + "loss": 5.4101, + "step": 7142 + }, + { + "epoch": 0.04248144447616329, + "grad_norm": 1.8181748390197754, + "learning_rate": 4.977774979556207e-05, + "loss": 5.2719, + "step": 7143 + }, + { + "epoch": 0.042487391759444285, + "grad_norm": 1.7353019714355469, + "learning_rate": 4.9777687646091234e-05, + "loss": 5.4202, + "step": 7144 + }, + { + "epoch": 0.04249333904272528, + "grad_norm": 1.6121984720230103, + "learning_rate": 4.977762548797076e-05, + "loss": 5.3174, + "step": 7145 + }, + { + "epoch": 0.04249928632600628, + "grad_norm": 1.9579551219940186, + "learning_rate": 4.977756332120067e-05, + "loss": 5.135, + "step": 7146 + }, + { + "epoch": 0.04250523360928728, + "grad_norm": 1.9396319389343262, + "learning_rate": 4.977750114578099e-05, + "loss": 5.7521, + "step": 7147 + }, + { + "epoch": 0.04251118089256827, + "grad_norm": 1.8567198514938354, + "learning_rate": 4.977743896171173e-05, + "loss": 5.7521, + "step": 7148 + }, + { + "epoch": 0.042517128175849274, + "grad_norm": 2.139861583709717, + "learning_rate": 4.977737676899293e-05, + "loss": 5.472, + "step": 7149 + }, + { + "epoch": 0.04252307545913027, + "grad_norm": 1.6526445150375366, + "learning_rate": 4.977731456762461e-05, + "loss": 5.5557, + "step": 7150 + }, + { + "epoch": 0.042529022742411264, + "grad_norm": 1.7761725187301636, + "learning_rate": 4.9777252357606784e-05, + "loss": 5.1922, + "step": 7151 + }, + { + "epoch": 0.042534970025692266, + "grad_norm": 2.0894482135772705, + "learning_rate": 4.977719013893947e-05, + "loss": 5.5067, + "step": 7152 + }, + { + "epoch": 0.04254091730897326, + "grad_norm": 1.746470332145691, + "learning_rate": 4.97771279116227e-05, + "loss": 5.28, + "step": 7153 + }, + { + "epoch": 0.042546864592254256, + "grad_norm": 1.9258379936218262, + "learning_rate": 4.9777065675656484e-05, + "loss": 5.7223, + "step": 7154 + }, + { + "epoch": 0.04255281187553526, + "grad_norm": 1.9928748607635498, + "learning_rate": 4.977700343104086e-05, + "loss": 5.727, + "step": 7155 + }, + { + "epoch": 0.04255875915881625, + "grad_norm": 1.7435163259506226, + "learning_rate": 4.9776941177775824e-05, + "loss": 5.6636, + "step": 7156 + }, + { + "epoch": 0.04256470644209725, + "grad_norm": 1.6818004846572876, + "learning_rate": 4.977687891586143e-05, + "loss": 5.6589, + "step": 7157 + }, + { + "epoch": 0.04257065372537825, + "grad_norm": 1.812779426574707, + "learning_rate": 4.9776816645297676e-05, + "loss": 5.2705, + "step": 7158 + }, + { + "epoch": 0.042576601008659246, + "grad_norm": 1.7637232542037964, + "learning_rate": 4.977675436608459e-05, + "loss": 5.2872, + "step": 7159 + }, + { + "epoch": 0.04258254829194024, + "grad_norm": 1.9504014253616333, + "learning_rate": 4.97766920782222e-05, + "loss": 5.1324, + "step": 7160 + }, + { + "epoch": 0.042588495575221236, + "grad_norm": 1.7741994857788086, + "learning_rate": 4.9776629781710525e-05, + "loss": 5.4164, + "step": 7161 + }, + { + "epoch": 0.04259444285850224, + "grad_norm": 2.0005195140838623, + "learning_rate": 4.9776567476549576e-05, + "loss": 5.4667, + "step": 7162 + }, + { + "epoch": 0.04260039014178323, + "grad_norm": 2.256420612335205, + "learning_rate": 4.977650516273939e-05, + "loss": 5.1116, + "step": 7163 + }, + { + "epoch": 0.04260633742506423, + "grad_norm": 2.0806920528411865, + "learning_rate": 4.977644284027998e-05, + "loss": 5.2333, + "step": 7164 + }, + { + "epoch": 0.04261228470834523, + "grad_norm": 1.898760199546814, + "learning_rate": 4.9776380509171364e-05, + "loss": 5.4761, + "step": 7165 + }, + { + "epoch": 0.042618231991626225, + "grad_norm": 1.7251659631729126, + "learning_rate": 4.977631816941358e-05, + "loss": 5.5584, + "step": 7166 + }, + { + "epoch": 0.04262417927490722, + "grad_norm": 1.741645336151123, + "learning_rate": 4.977625582100664e-05, + "loss": 5.4133, + "step": 7167 + }, + { + "epoch": 0.04263012655818822, + "grad_norm": 1.921617031097412, + "learning_rate": 4.977619346395055e-05, + "loss": 5.1829, + "step": 7168 + }, + { + "epoch": 0.04263607384146922, + "grad_norm": 1.7597262859344482, + "learning_rate": 4.977613109824536e-05, + "loss": 5.1743, + "step": 7169 + }, + { + "epoch": 0.04264202112475021, + "grad_norm": 1.8069764375686646, + "learning_rate": 4.977606872389107e-05, + "loss": 5.4004, + "step": 7170 + }, + { + "epoch": 0.042647968408031214, + "grad_norm": 1.7694367170333862, + "learning_rate": 4.9776006340887714e-05, + "loss": 5.2018, + "step": 7171 + }, + { + "epoch": 0.04265391569131221, + "grad_norm": 1.8260759115219116, + "learning_rate": 4.9775943949235316e-05, + "loss": 5.4115, + "step": 7172 + }, + { + "epoch": 0.042659862974593205, + "grad_norm": 1.71034574508667, + "learning_rate": 4.9775881548933884e-05, + "loss": 5.2781, + "step": 7173 + }, + { + "epoch": 0.0426658102578742, + "grad_norm": 1.7208900451660156, + "learning_rate": 4.977581913998345e-05, + "loss": 5.4686, + "step": 7174 + }, + { + "epoch": 0.0426717575411552, + "grad_norm": 1.8545277118682861, + "learning_rate": 4.977575672238404e-05, + "loss": 5.4545, + "step": 7175 + }, + { + "epoch": 0.0426777048244362, + "grad_norm": 1.7892229557037354, + "learning_rate": 4.9775694296135656e-05, + "loss": 5.6612, + "step": 7176 + }, + { + "epoch": 0.04268365210771719, + "grad_norm": 1.8321889638900757, + "learning_rate": 4.9775631861238343e-05, + "loss": 5.5889, + "step": 7177 + }, + { + "epoch": 0.042689599390998194, + "grad_norm": 1.7925626039505005, + "learning_rate": 4.977556941769211e-05, + "loss": 5.6218, + "step": 7178 + }, + { + "epoch": 0.04269554667427919, + "grad_norm": 1.9650121927261353, + "learning_rate": 4.9775506965496984e-05, + "loss": 5.5228, + "step": 7179 + }, + { + "epoch": 0.042701493957560184, + "grad_norm": 1.9050647020339966, + "learning_rate": 4.977544450465298e-05, + "loss": 5.5547, + "step": 7180 + }, + { + "epoch": 0.042707441240841186, + "grad_norm": 1.8334670066833496, + "learning_rate": 4.977538203516013e-05, + "loss": 5.3895, + "step": 7181 + }, + { + "epoch": 0.04271338852412218, + "grad_norm": 1.803544521331787, + "learning_rate": 4.9775319557018444e-05, + "loss": 5.6288, + "step": 7182 + }, + { + "epoch": 0.042719335807403176, + "grad_norm": 1.823440432548523, + "learning_rate": 4.9775257070227956e-05, + "loss": 5.4996, + "step": 7183 + }, + { + "epoch": 0.04272528309068418, + "grad_norm": 1.9730159044265747, + "learning_rate": 4.977519457478868e-05, + "loss": 5.5004, + "step": 7184 + }, + { + "epoch": 0.04273123037396517, + "grad_norm": 1.9566004276275635, + "learning_rate": 4.977513207070064e-05, + "loss": 5.5496, + "step": 7185 + }, + { + "epoch": 0.04273717765724617, + "grad_norm": 2.0958995819091797, + "learning_rate": 4.977506955796385e-05, + "loss": 5.5256, + "step": 7186 + }, + { + "epoch": 0.04274312494052717, + "grad_norm": 1.8957890272140503, + "learning_rate": 4.977500703657835e-05, + "loss": 5.3337, + "step": 7187 + }, + { + "epoch": 0.042749072223808166, + "grad_norm": 1.8224141597747803, + "learning_rate": 4.977494450654414e-05, + "loss": 5.1362, + "step": 7188 + }, + { + "epoch": 0.04275501950708916, + "grad_norm": 1.648296594619751, + "learning_rate": 4.977488196786126e-05, + "loss": 5.3398, + "step": 7189 + }, + { + "epoch": 0.042760966790370156, + "grad_norm": 1.6238311529159546, + "learning_rate": 4.977481942052972e-05, + "loss": 5.2083, + "step": 7190 + }, + { + "epoch": 0.04276691407365116, + "grad_norm": 1.7399996519088745, + "learning_rate": 4.977475686454956e-05, + "loss": 5.2403, + "step": 7191 + }, + { + "epoch": 0.04277286135693215, + "grad_norm": 1.7260342836380005, + "learning_rate": 4.977469429992077e-05, + "loss": 5.2282, + "step": 7192 + }, + { + "epoch": 0.04277880864021315, + "grad_norm": 4.4954447746276855, + "learning_rate": 4.9774631726643396e-05, + "loss": 5.1044, + "step": 7193 + }, + { + "epoch": 0.04278475592349415, + "grad_norm": 1.879869818687439, + "learning_rate": 4.977456914471746e-05, + "loss": 5.3431, + "step": 7194 + }, + { + "epoch": 0.042790703206775145, + "grad_norm": 1.8826582431793213, + "learning_rate": 4.977450655414297e-05, + "loss": 5.2951, + "step": 7195 + }, + { + "epoch": 0.04279665049005614, + "grad_norm": 1.8973712921142578, + "learning_rate": 4.977444395491996e-05, + "loss": 5.343, + "step": 7196 + }, + { + "epoch": 0.04280259777333714, + "grad_norm": 1.6125551462173462, + "learning_rate": 4.977438134704845e-05, + "loss": 5.2849, + "step": 7197 + }, + { + "epoch": 0.04280854505661814, + "grad_norm": 1.441159963607788, + "learning_rate": 4.9774318730528456e-05, + "loss": 5.2955, + "step": 7198 + }, + { + "epoch": 0.04281449233989913, + "grad_norm": 1.9655884504318237, + "learning_rate": 4.9774256105360004e-05, + "loss": 5.2093, + "step": 7199 + }, + { + "epoch": 0.042820439623180134, + "grad_norm": 1.7824043035507202, + "learning_rate": 4.9774193471543116e-05, + "loss": 5.2105, + "step": 7200 + }, + { + "epoch": 0.04282638690646113, + "grad_norm": 1.8331031799316406, + "learning_rate": 4.977413082907781e-05, + "loss": 5.3359, + "step": 7201 + }, + { + "epoch": 0.042832334189742124, + "grad_norm": 1.8695242404937744, + "learning_rate": 4.977406817796412e-05, + "loss": 5.3686, + "step": 7202 + }, + { + "epoch": 0.042838281473023126, + "grad_norm": 1.70205557346344, + "learning_rate": 4.977400551820205e-05, + "loss": 5.2689, + "step": 7203 + }, + { + "epoch": 0.04284422875630412, + "grad_norm": 1.700307846069336, + "learning_rate": 4.9773942849791635e-05, + "loss": 5.3946, + "step": 7204 + }, + { + "epoch": 0.04285017603958512, + "grad_norm": 1.625637173652649, + "learning_rate": 4.977388017273288e-05, + "loss": 5.095, + "step": 7205 + }, + { + "epoch": 0.04285612332286611, + "grad_norm": 1.7689390182495117, + "learning_rate": 4.977381748702583e-05, + "loss": 5.0097, + "step": 7206 + }, + { + "epoch": 0.042862070606147114, + "grad_norm": 1.856493353843689, + "learning_rate": 4.97737547926705e-05, + "loss": 5.0551, + "step": 7207 + }, + { + "epoch": 0.04286801788942811, + "grad_norm": 1.6497242450714111, + "learning_rate": 4.97736920896669e-05, + "loss": 5.031, + "step": 7208 + }, + { + "epoch": 0.042873965172709104, + "grad_norm": 1.5884608030319214, + "learning_rate": 4.977362937801506e-05, + "loss": 5.0758, + "step": 7209 + }, + { + "epoch": 0.042879912455990106, + "grad_norm": 1.5206499099731445, + "learning_rate": 4.9773566657715006e-05, + "loss": 5.049, + "step": 7210 + }, + { + "epoch": 0.0428858597392711, + "grad_norm": 1.7026933431625366, + "learning_rate": 4.977350392876676e-05, + "loss": 5.001, + "step": 7211 + }, + { + "epoch": 0.042891807022552096, + "grad_norm": 1.4197289943695068, + "learning_rate": 4.977344119117034e-05, + "loss": 5.0446, + "step": 7212 + }, + { + "epoch": 0.0428977543058331, + "grad_norm": 1.498713731765747, + "learning_rate": 4.977337844492576e-05, + "loss": 5.0574, + "step": 7213 + }, + { + "epoch": 0.04290370158911409, + "grad_norm": 1.7583528757095337, + "learning_rate": 4.9773315690033054e-05, + "loss": 4.994, + "step": 7214 + }, + { + "epoch": 0.04290964887239509, + "grad_norm": 1.8511004447937012, + "learning_rate": 4.9773252926492236e-05, + "loss": 4.9888, + "step": 7215 + }, + { + "epoch": 0.04291559615567609, + "grad_norm": 1.5799078941345215, + "learning_rate": 4.9773190154303334e-05, + "loss": 5.0028, + "step": 7216 + }, + { + "epoch": 0.042921543438957085, + "grad_norm": 1.6737205982208252, + "learning_rate": 4.977312737346637e-05, + "loss": 5.0701, + "step": 7217 + }, + { + "epoch": 0.04292749072223808, + "grad_norm": 1.537049412727356, + "learning_rate": 4.977306458398136e-05, + "loss": 5.0747, + "step": 7218 + }, + { + "epoch": 0.042933438005519076, + "grad_norm": 1.7501899003982544, + "learning_rate": 4.977300178584833e-05, + "loss": 5.0172, + "step": 7219 + }, + { + "epoch": 0.04293938528880008, + "grad_norm": 1.5130890607833862, + "learning_rate": 4.9772938979067294e-05, + "loss": 5.0196, + "step": 7220 + }, + { + "epoch": 0.04294533257208107, + "grad_norm": 1.628053903579712, + "learning_rate": 4.977287616363829e-05, + "loss": 5.0526, + "step": 7221 + }, + { + "epoch": 0.04295127985536207, + "grad_norm": 1.6736811399459839, + "learning_rate": 4.977281333956133e-05, + "loss": 5.0093, + "step": 7222 + }, + { + "epoch": 0.04295722713864307, + "grad_norm": 1.6157552003860474, + "learning_rate": 4.977275050683643e-05, + "loss": 4.9562, + "step": 7223 + }, + { + "epoch": 0.042963174421924065, + "grad_norm": 1.6699459552764893, + "learning_rate": 4.9772687665463625e-05, + "loss": 4.9603, + "step": 7224 + }, + { + "epoch": 0.04296912170520506, + "grad_norm": 1.4698256254196167, + "learning_rate": 4.9772624815442925e-05, + "loss": 4.9908, + "step": 7225 + }, + { + "epoch": 0.04297506898848606, + "grad_norm": 1.5310906171798706, + "learning_rate": 4.9772561956774365e-05, + "loss": 5.0081, + "step": 7226 + }, + { + "epoch": 0.04298101627176706, + "grad_norm": 1.6135941743850708, + "learning_rate": 4.977249908945795e-05, + "loss": 5.1394, + "step": 7227 + }, + { + "epoch": 0.04298696355504805, + "grad_norm": 1.7632607221603394, + "learning_rate": 4.977243621349372e-05, + "loss": 4.9992, + "step": 7228 + }, + { + "epoch": 0.042992910838329054, + "grad_norm": 1.574826955795288, + "learning_rate": 4.977237332888168e-05, + "loss": 4.9361, + "step": 7229 + }, + { + "epoch": 0.04299885812161005, + "grad_norm": 1.6633859872817993, + "learning_rate": 4.9772310435621874e-05, + "loss": 4.9085, + "step": 7230 + }, + { + "epoch": 0.043004805404891044, + "grad_norm": 1.6180634498596191, + "learning_rate": 4.97722475337143e-05, + "loss": 4.939, + "step": 7231 + }, + { + "epoch": 0.043010752688172046, + "grad_norm": 1.959694266319275, + "learning_rate": 4.9772184623158996e-05, + "loss": 5.231, + "step": 7232 + }, + { + "epoch": 0.04301669997145304, + "grad_norm": 1.6264785528182983, + "learning_rate": 4.977212170395598e-05, + "loss": 5.3228, + "step": 7233 + }, + { + "epoch": 0.04302264725473404, + "grad_norm": 2.109292507171631, + "learning_rate": 4.9772058776105264e-05, + "loss": 5.4579, + "step": 7234 + }, + { + "epoch": 0.04302859453801503, + "grad_norm": 1.991877555847168, + "learning_rate": 4.977199583960688e-05, + "loss": 5.355, + "step": 7235 + }, + { + "epoch": 0.043034541821296034, + "grad_norm": 2.23330020904541, + "learning_rate": 4.977193289446085e-05, + "loss": 5.3233, + "step": 7236 + }, + { + "epoch": 0.04304048910457703, + "grad_norm": 2.077359914779663, + "learning_rate": 4.9771869940667194e-05, + "loss": 5.2003, + "step": 7237 + }, + { + "epoch": 0.043046436387858024, + "grad_norm": 1.652498722076416, + "learning_rate": 4.977180697822593e-05, + "loss": 5.0232, + "step": 7238 + }, + { + "epoch": 0.043052383671139026, + "grad_norm": 1.9277194738388062, + "learning_rate": 4.977174400713709e-05, + "loss": 5.3826, + "step": 7239 + }, + { + "epoch": 0.04305833095442002, + "grad_norm": 1.9263273477554321, + "learning_rate": 4.9771681027400694e-05, + "loss": 5.5258, + "step": 7240 + }, + { + "epoch": 0.043064278237701016, + "grad_norm": 2.066934108734131, + "learning_rate": 4.9771618039016756e-05, + "loss": 5.6398, + "step": 7241 + }, + { + "epoch": 0.04307022552098202, + "grad_norm": 1.7810741662979126, + "learning_rate": 4.9771555041985295e-05, + "loss": 5.3716, + "step": 7242 + }, + { + "epoch": 0.04307617280426301, + "grad_norm": 1.7068313360214233, + "learning_rate": 4.977149203630635e-05, + "loss": 5.4042, + "step": 7243 + }, + { + "epoch": 0.04308212008754401, + "grad_norm": 1.8587994575500488, + "learning_rate": 4.977142902197992e-05, + "loss": 5.3635, + "step": 7244 + }, + { + "epoch": 0.04308806737082501, + "grad_norm": 2.101649284362793, + "learning_rate": 4.9771365999006054e-05, + "loss": 5.5292, + "step": 7245 + }, + { + "epoch": 0.043094014654106005, + "grad_norm": 1.8571972846984863, + "learning_rate": 4.9771302967384756e-05, + "loss": 5.4577, + "step": 7246 + }, + { + "epoch": 0.043099961937387, + "grad_norm": 1.9837383031845093, + "learning_rate": 4.9771239927116045e-05, + "loss": 5.4976, + "step": 7247 + }, + { + "epoch": 0.043105909220667996, + "grad_norm": 1.7688343524932861, + "learning_rate": 4.977117687819996e-05, + "loss": 5.448, + "step": 7248 + }, + { + "epoch": 0.043111856503949, + "grad_norm": 1.923824429512024, + "learning_rate": 4.9771113820636505e-05, + "loss": 5.3436, + "step": 7249 + }, + { + "epoch": 0.04311780378722999, + "grad_norm": 1.4405949115753174, + "learning_rate": 4.9771050754425715e-05, + "loss": 5.2751, + "step": 7250 + }, + { + "epoch": 0.04312375107051099, + "grad_norm": 1.7337450981140137, + "learning_rate": 4.977098767956761e-05, + "loss": 5.4693, + "step": 7251 + }, + { + "epoch": 0.04312969835379199, + "grad_norm": 2.063887119293213, + "learning_rate": 4.977092459606221e-05, + "loss": 5.4576, + "step": 7252 + }, + { + "epoch": 0.043135645637072985, + "grad_norm": 1.576517105102539, + "learning_rate": 4.9770861503909524e-05, + "loss": 5.4052, + "step": 7253 + }, + { + "epoch": 0.04314159292035398, + "grad_norm": 1.8137834072113037, + "learning_rate": 4.9770798403109596e-05, + "loss": 5.5732, + "step": 7254 + }, + { + "epoch": 0.04314754020363498, + "grad_norm": 1.7954564094543457, + "learning_rate": 4.977073529366244e-05, + "loss": 5.4213, + "step": 7255 + }, + { + "epoch": 0.04315348748691598, + "grad_norm": 1.993961215019226, + "learning_rate": 4.977067217556807e-05, + "loss": 5.2909, + "step": 7256 + }, + { + "epoch": 0.04315943477019697, + "grad_norm": 1.6993632316589355, + "learning_rate": 4.977060904882651e-05, + "loss": 5.4523, + "step": 7257 + }, + { + "epoch": 0.043165382053477974, + "grad_norm": 1.8541932106018066, + "learning_rate": 4.977054591343779e-05, + "loss": 5.3182, + "step": 7258 + }, + { + "epoch": 0.04317132933675897, + "grad_norm": 1.7425625324249268, + "learning_rate": 4.9770482769401935e-05, + "loss": 5.2527, + "step": 7259 + }, + { + "epoch": 0.043177276620039964, + "grad_norm": 1.7028024196624756, + "learning_rate": 4.9770419616718955e-05, + "loss": 5.1305, + "step": 7260 + }, + { + "epoch": 0.043183223903320966, + "grad_norm": 1.745316982269287, + "learning_rate": 4.977035645538888e-05, + "loss": 5.0368, + "step": 7261 + }, + { + "epoch": 0.04318917118660196, + "grad_norm": 1.8373509645462036, + "learning_rate": 4.977029328541173e-05, + "loss": 5.353, + "step": 7262 + }, + { + "epoch": 0.04319511846988296, + "grad_norm": 1.9976449012756348, + "learning_rate": 4.9770230106787526e-05, + "loss": 5.363, + "step": 7263 + }, + { + "epoch": 0.04320106575316395, + "grad_norm": 1.7109822034835815, + "learning_rate": 4.977016691951629e-05, + "loss": 5.3462, + "step": 7264 + }, + { + "epoch": 0.043207013036444954, + "grad_norm": 1.8688478469848633, + "learning_rate": 4.9770103723598036e-05, + "loss": 5.3564, + "step": 7265 + }, + { + "epoch": 0.04321296031972595, + "grad_norm": 1.8680217266082764, + "learning_rate": 4.9770040519032804e-05, + "loss": 5.2713, + "step": 7266 + }, + { + "epoch": 0.043218907603006944, + "grad_norm": 1.8022522926330566, + "learning_rate": 4.976997730582061e-05, + "loss": 5.153, + "step": 7267 + }, + { + "epoch": 0.043224854886287946, + "grad_norm": 1.7128162384033203, + "learning_rate": 4.976991408396147e-05, + "loss": 5.3107, + "step": 7268 + }, + { + "epoch": 0.04323080216956894, + "grad_norm": 1.8222606182098389, + "learning_rate": 4.9769850853455404e-05, + "loss": 5.3599, + "step": 7269 + }, + { + "epoch": 0.043236749452849936, + "grad_norm": 1.829373836517334, + "learning_rate": 4.976978761430244e-05, + "loss": 5.3991, + "step": 7270 + }, + { + "epoch": 0.04324269673613094, + "grad_norm": 1.8270717859268188, + "learning_rate": 4.97697243665026e-05, + "loss": 5.2434, + "step": 7271 + }, + { + "epoch": 0.04324864401941193, + "grad_norm": 1.9759695529937744, + "learning_rate": 4.976966111005591e-05, + "loss": 5.4585, + "step": 7272 + }, + { + "epoch": 0.04325459130269293, + "grad_norm": 2.0235564708709717, + "learning_rate": 4.9769597844962376e-05, + "loss": 5.3996, + "step": 7273 + }, + { + "epoch": 0.04326053858597393, + "grad_norm": 1.9220880270004272, + "learning_rate": 4.976953457122204e-05, + "loss": 5.344, + "step": 7274 + }, + { + "epoch": 0.043266485869254925, + "grad_norm": 1.6257338523864746, + "learning_rate": 4.976947128883492e-05, + "loss": 5.4012, + "step": 7275 + }, + { + "epoch": 0.04327243315253592, + "grad_norm": 1.6390771865844727, + "learning_rate": 4.976940799780103e-05, + "loss": 5.3693, + "step": 7276 + }, + { + "epoch": 0.043278380435816916, + "grad_norm": 1.5769712924957275, + "learning_rate": 4.976934469812039e-05, + "loss": 5.3214, + "step": 7277 + }, + { + "epoch": 0.04328432771909792, + "grad_norm": 1.539920687675476, + "learning_rate": 4.9769281389793035e-05, + "loss": 5.2784, + "step": 7278 + }, + { + "epoch": 0.04329027500237891, + "grad_norm": 1.662835717201233, + "learning_rate": 4.976921807281897e-05, + "loss": 5.2717, + "step": 7279 + }, + { + "epoch": 0.04329622228565991, + "grad_norm": 1.3613345623016357, + "learning_rate": 4.9769154747198234e-05, + "loss": 5.4241, + "step": 7280 + }, + { + "epoch": 0.04330216956894091, + "grad_norm": 1.5267658233642578, + "learning_rate": 4.976909141293084e-05, + "loss": 5.454, + "step": 7281 + }, + { + "epoch": 0.043308116852221905, + "grad_norm": 1.5050435066223145, + "learning_rate": 4.976902807001681e-05, + "loss": 5.4975, + "step": 7282 + }, + { + "epoch": 0.0433140641355029, + "grad_norm": 1.292698621749878, + "learning_rate": 4.976896471845617e-05, + "loss": 5.4071, + "step": 7283 + }, + { + "epoch": 0.0433200114187839, + "grad_norm": 1.6818265914916992, + "learning_rate": 4.9768901358248946e-05, + "loss": 5.3561, + "step": 7284 + }, + { + "epoch": 0.0433259587020649, + "grad_norm": 1.5995383262634277, + "learning_rate": 4.976883798939515e-05, + "loss": 5.2623, + "step": 7285 + }, + { + "epoch": 0.04333190598534589, + "grad_norm": 1.6959342956542969, + "learning_rate": 4.976877461189481e-05, + "loss": 5.3193, + "step": 7286 + }, + { + "epoch": 0.043337853268626894, + "grad_norm": 1.6978071928024292, + "learning_rate": 4.976871122574794e-05, + "loss": 5.5653, + "step": 7287 + }, + { + "epoch": 0.04334380055190789, + "grad_norm": 1.7587183713912964, + "learning_rate": 4.976864783095457e-05, + "loss": 5.545, + "step": 7288 + }, + { + "epoch": 0.043349747835188884, + "grad_norm": 1.6225430965423584, + "learning_rate": 4.976858442751473e-05, + "loss": 5.5804, + "step": 7289 + }, + { + "epoch": 0.043355695118469886, + "grad_norm": 1.5895410776138306, + "learning_rate": 4.976852101542843e-05, + "loss": 5.4798, + "step": 7290 + }, + { + "epoch": 0.04336164240175088, + "grad_norm": 1.759022831916809, + "learning_rate": 4.976845759469569e-05, + "loss": 5.4794, + "step": 7291 + }, + { + "epoch": 0.043367589685031877, + "grad_norm": 1.483383059501648, + "learning_rate": 4.976839416531654e-05, + "loss": 5.2547, + "step": 7292 + }, + { + "epoch": 0.04337353696831287, + "grad_norm": 2.136172294616699, + "learning_rate": 4.9768330727291e-05, + "loss": 5.1655, + "step": 7293 + }, + { + "epoch": 0.043379484251593874, + "grad_norm": 1.9202553033828735, + "learning_rate": 4.9768267280619094e-05, + "loss": 5.1945, + "step": 7294 + }, + { + "epoch": 0.04338543153487487, + "grad_norm": 1.7927708625793457, + "learning_rate": 4.976820382530084e-05, + "loss": 5.4936, + "step": 7295 + }, + { + "epoch": 0.043391378818155864, + "grad_norm": 1.597887396812439, + "learning_rate": 4.976814036133626e-05, + "loss": 5.5516, + "step": 7296 + }, + { + "epoch": 0.043397326101436866, + "grad_norm": 1.493356466293335, + "learning_rate": 4.9768076888725376e-05, + "loss": 5.552, + "step": 7297 + }, + { + "epoch": 0.04340327338471786, + "grad_norm": 1.6748720407485962, + "learning_rate": 4.976801340746822e-05, + "loss": 5.3957, + "step": 7298 + }, + { + "epoch": 0.043409220667998856, + "grad_norm": 1.541945457458496, + "learning_rate": 4.9767949917564794e-05, + "loss": 5.5558, + "step": 7299 + }, + { + "epoch": 0.04341516795127986, + "grad_norm": 1.6436586380004883, + "learning_rate": 4.976788641901514e-05, + "loss": 5.4918, + "step": 7300 + }, + { + "epoch": 0.04342111523456085, + "grad_norm": 1.69910728931427, + "learning_rate": 4.9767822911819274e-05, + "loss": 5.4688, + "step": 7301 + }, + { + "epoch": 0.04342706251784185, + "grad_norm": 1.8294274806976318, + "learning_rate": 4.976775939597721e-05, + "loss": 5.505, + "step": 7302 + }, + { + "epoch": 0.04343300980112285, + "grad_norm": 1.720880389213562, + "learning_rate": 4.976769587148899e-05, + "loss": 5.3509, + "step": 7303 + }, + { + "epoch": 0.043438957084403845, + "grad_norm": 1.5898194313049316, + "learning_rate": 4.976763233835461e-05, + "loss": 5.2955, + "step": 7304 + }, + { + "epoch": 0.04344490436768484, + "grad_norm": 1.569218397140503, + "learning_rate": 4.976756879657412e-05, + "loss": 5.5695, + "step": 7305 + }, + { + "epoch": 0.043450851650965835, + "grad_norm": 1.5551841259002686, + "learning_rate": 4.976750524614752e-05, + "loss": 5.5313, + "step": 7306 + }, + { + "epoch": 0.04345679893424684, + "grad_norm": 1.5870057344436646, + "learning_rate": 4.9767441687074834e-05, + "loss": 5.7525, + "step": 7307 + }, + { + "epoch": 0.04346274621752783, + "grad_norm": 1.5421022176742554, + "learning_rate": 4.97673781193561e-05, + "loss": 5.6176, + "step": 7308 + }, + { + "epoch": 0.04346869350080883, + "grad_norm": 1.9368326663970947, + "learning_rate": 4.976731454299132e-05, + "loss": 5.4239, + "step": 7309 + }, + { + "epoch": 0.04347464078408983, + "grad_norm": 1.719084620475769, + "learning_rate": 4.976725095798053e-05, + "loss": 5.3526, + "step": 7310 + }, + { + "epoch": 0.043480588067370825, + "grad_norm": 1.8004268407821655, + "learning_rate": 4.9767187364323756e-05, + "loss": 5.7112, + "step": 7311 + }, + { + "epoch": 0.04348653535065182, + "grad_norm": 1.9922735691070557, + "learning_rate": 4.9767123762021003e-05, + "loss": 5.4993, + "step": 7312 + }, + { + "epoch": 0.04349248263393282, + "grad_norm": 1.6768959760665894, + "learning_rate": 4.976706015107231e-05, + "loss": 5.4713, + "step": 7313 + }, + { + "epoch": 0.04349842991721382, + "grad_norm": 1.6070122718811035, + "learning_rate": 4.976699653147768e-05, + "loss": 5.4695, + "step": 7314 + }, + { + "epoch": 0.04350437720049481, + "grad_norm": 1.5641200542449951, + "learning_rate": 4.976693290323716e-05, + "loss": 5.3596, + "step": 7315 + }, + { + "epoch": 0.043510324483775814, + "grad_norm": 3.0344419479370117, + "learning_rate": 4.976686926635076e-05, + "loss": 5.7371, + "step": 7316 + }, + { + "epoch": 0.04351627176705681, + "grad_norm": 1.8784242868423462, + "learning_rate": 4.9766805620818494e-05, + "loss": 5.5142, + "step": 7317 + }, + { + "epoch": 0.043522219050337804, + "grad_norm": 2.0644166469573975, + "learning_rate": 4.9766741966640394e-05, + "loss": 5.276, + "step": 7318 + }, + { + "epoch": 0.043528166333618806, + "grad_norm": 1.8128771781921387, + "learning_rate": 4.976667830381649e-05, + "loss": 5.3515, + "step": 7319 + }, + { + "epoch": 0.0435341136168998, + "grad_norm": 1.8899081945419312, + "learning_rate": 4.9766614632346786e-05, + "loss": 5.3981, + "step": 7320 + }, + { + "epoch": 0.043540060900180796, + "grad_norm": 1.89181649684906, + "learning_rate": 4.976655095223131e-05, + "loss": 5.4378, + "step": 7321 + }, + { + "epoch": 0.04354600818346179, + "grad_norm": 1.6332184076309204, + "learning_rate": 4.976648726347009e-05, + "loss": 5.4023, + "step": 7322 + }, + { + "epoch": 0.043551955466742794, + "grad_norm": 2.3459293842315674, + "learning_rate": 4.976642356606315e-05, + "loss": 5.8375, + "step": 7323 + }, + { + "epoch": 0.04355790275002379, + "grad_norm": 2.029244899749756, + "learning_rate": 4.97663598600105e-05, + "loss": 5.5617, + "step": 7324 + }, + { + "epoch": 0.043563850033304784, + "grad_norm": 2.138946056365967, + "learning_rate": 4.9766296145312175e-05, + "loss": 5.5076, + "step": 7325 + }, + { + "epoch": 0.043569797316585786, + "grad_norm": 1.8702884912490845, + "learning_rate": 4.9766232421968184e-05, + "loss": 5.123, + "step": 7326 + }, + { + "epoch": 0.04357574459986678, + "grad_norm": 1.8917137384414673, + "learning_rate": 4.976616868997856e-05, + "loss": 5.4809, + "step": 7327 + }, + { + "epoch": 0.043581691883147776, + "grad_norm": 2.2203474044799805, + "learning_rate": 4.976610494934333e-05, + "loss": 5.6359, + "step": 7328 + }, + { + "epoch": 0.04358763916642878, + "grad_norm": 2.4505302906036377, + "learning_rate": 4.976604120006251e-05, + "loss": 6.1423, + "step": 7329 + }, + { + "epoch": 0.04359358644970977, + "grad_norm": 2.4601128101348877, + "learning_rate": 4.976597744213611e-05, + "loss": 6.0908, + "step": 7330 + }, + { + "epoch": 0.04359953373299077, + "grad_norm": 1.9502687454223633, + "learning_rate": 4.976591367556417e-05, + "loss": 5.918, + "step": 7331 + }, + { + "epoch": 0.04360548101627177, + "grad_norm": 2.180250644683838, + "learning_rate": 4.9765849900346696e-05, + "loss": 5.7203, + "step": 7332 + }, + { + "epoch": 0.043611428299552765, + "grad_norm": 2.125669002532959, + "learning_rate": 4.9765786116483726e-05, + "loss": 5.7875, + "step": 7333 + }, + { + "epoch": 0.04361737558283376, + "grad_norm": 2.0372321605682373, + "learning_rate": 4.9765722323975286e-05, + "loss": 5.6777, + "step": 7334 + }, + { + "epoch": 0.043623322866114755, + "grad_norm": 2.5857362747192383, + "learning_rate": 4.976565852282137e-05, + "loss": 5.2989, + "step": 7335 + }, + { + "epoch": 0.04362927014939576, + "grad_norm": 2.5774800777435303, + "learning_rate": 4.976559471302203e-05, + "loss": 6.0479, + "step": 7336 + }, + { + "epoch": 0.04363521743267675, + "grad_norm": 2.0820937156677246, + "learning_rate": 4.976553089457727e-05, + "loss": 5.7636, + "step": 7337 + }, + { + "epoch": 0.04364116471595775, + "grad_norm": 2.287719964981079, + "learning_rate": 4.9765467067487126e-05, + "loss": 5.7706, + "step": 7338 + }, + { + "epoch": 0.04364711199923875, + "grad_norm": 2.6578378677368164, + "learning_rate": 4.9765403231751614e-05, + "loss": 6.1506, + "step": 7339 + }, + { + "epoch": 0.043653059282519745, + "grad_norm": 2.503955841064453, + "learning_rate": 4.976533938737075e-05, + "loss": 6.0658, + "step": 7340 + }, + { + "epoch": 0.04365900656580074, + "grad_norm": 2.28857684135437, + "learning_rate": 4.976527553434456e-05, + "loss": 5.833, + "step": 7341 + }, + { + "epoch": 0.04366495384908174, + "grad_norm": 2.327331781387329, + "learning_rate": 4.976521167267307e-05, + "loss": 5.934, + "step": 7342 + }, + { + "epoch": 0.04367090113236274, + "grad_norm": 1.7726761102676392, + "learning_rate": 4.976514780235631e-05, + "loss": 6.034, + "step": 7343 + }, + { + "epoch": 0.04367684841564373, + "grad_norm": 2.180790662765503, + "learning_rate": 4.9765083923394285e-05, + "loss": 6.1377, + "step": 7344 + }, + { + "epoch": 0.043682795698924734, + "grad_norm": 2.031378984451294, + "learning_rate": 4.9765020035787024e-05, + "loss": 5.7203, + "step": 7345 + }, + { + "epoch": 0.04368874298220573, + "grad_norm": 2.453611135482788, + "learning_rate": 4.9764956139534545e-05, + "loss": 5.9798, + "step": 7346 + }, + { + "epoch": 0.043694690265486724, + "grad_norm": 2.3802528381347656, + "learning_rate": 4.976489223463688e-05, + "loss": 5.9343, + "step": 7347 + }, + { + "epoch": 0.043700637548767726, + "grad_norm": 2.771704912185669, + "learning_rate": 4.976482832109406e-05, + "loss": 6.5202, + "step": 7348 + }, + { + "epoch": 0.04370658483204872, + "grad_norm": 1.9455180168151855, + "learning_rate": 4.9764764398906084e-05, + "loss": 6.1159, + "step": 7349 + }, + { + "epoch": 0.043712532115329716, + "grad_norm": 1.9527102708816528, + "learning_rate": 4.9764700468072976e-05, + "loss": 5.7773, + "step": 7350 + }, + { + "epoch": 0.04371847939861071, + "grad_norm": 1.9531358480453491, + "learning_rate": 4.976463652859478e-05, + "loss": 5.9918, + "step": 7351 + }, + { + "epoch": 0.043724426681891713, + "grad_norm": 2.375239849090576, + "learning_rate": 4.97645725804715e-05, + "loss": 5.5054, + "step": 7352 + }, + { + "epoch": 0.04373037396517271, + "grad_norm": 2.156553030014038, + "learning_rate": 4.9764508623703166e-05, + "loss": 5.664, + "step": 7353 + }, + { + "epoch": 0.043736321248453704, + "grad_norm": 2.317331075668335, + "learning_rate": 4.9764444658289796e-05, + "loss": 5.4473, + "step": 7354 + }, + { + "epoch": 0.043742268531734706, + "grad_norm": 2.1958348751068115, + "learning_rate": 4.976438068423141e-05, + "loss": 5.3584, + "step": 7355 + }, + { + "epoch": 0.0437482158150157, + "grad_norm": 2.152045249938965, + "learning_rate": 4.976431670152803e-05, + "loss": 5.4388, + "step": 7356 + }, + { + "epoch": 0.043754163098296696, + "grad_norm": 2.0661544799804688, + "learning_rate": 4.976425271017971e-05, + "loss": 5.3866, + "step": 7357 + }, + { + "epoch": 0.0437601103815777, + "grad_norm": 2.106480598449707, + "learning_rate": 4.976418871018642e-05, + "loss": 5.5928, + "step": 7358 + }, + { + "epoch": 0.04376605766485869, + "grad_norm": 2.5921759605407715, + "learning_rate": 4.976412470154821e-05, + "loss": 6.0133, + "step": 7359 + }, + { + "epoch": 0.04377200494813969, + "grad_norm": 2.4117794036865234, + "learning_rate": 4.97640606842651e-05, + "loss": 6.0988, + "step": 7360 + }, + { + "epoch": 0.04377795223142069, + "grad_norm": 1.9839050769805908, + "learning_rate": 4.976399665833712e-05, + "loss": 5.9568, + "step": 7361 + }, + { + "epoch": 0.043783899514701685, + "grad_norm": 2.166215419769287, + "learning_rate": 4.9763932623764285e-05, + "loss": 5.9205, + "step": 7362 + }, + { + "epoch": 0.04378984679798268, + "grad_norm": 2.8216545581817627, + "learning_rate": 4.9763868580546616e-05, + "loss": 5.792, + "step": 7363 + }, + { + "epoch": 0.043795794081263675, + "grad_norm": 2.907707929611206, + "learning_rate": 4.976380452868413e-05, + "loss": 5.5824, + "step": 7364 + }, + { + "epoch": 0.04380174136454468, + "grad_norm": 2.173025369644165, + "learning_rate": 4.976374046817686e-05, + "loss": 6.2752, + "step": 7365 + }, + { + "epoch": 0.04380768864782567, + "grad_norm": 2.1098685264587402, + "learning_rate": 4.9763676399024814e-05, + "loss": 5.8052, + "step": 7366 + }, + { + "epoch": 0.04381363593110667, + "grad_norm": 2.1980762481689453, + "learning_rate": 4.9763612321228035e-05, + "loss": 5.3456, + "step": 7367 + }, + { + "epoch": 0.04381958321438767, + "grad_norm": 2.091327667236328, + "learning_rate": 4.976354823478654e-05, + "loss": 5.211, + "step": 7368 + }, + { + "epoch": 0.043825530497668665, + "grad_norm": 2.37920880317688, + "learning_rate": 4.976348413970033e-05, + "loss": 5.8652, + "step": 7369 + }, + { + "epoch": 0.04383147778094966, + "grad_norm": 2.454202175140381, + "learning_rate": 4.976342003596946e-05, + "loss": 5.9654, + "step": 7370 + }, + { + "epoch": 0.04383742506423066, + "grad_norm": 2.04577898979187, + "learning_rate": 4.9763355923593927e-05, + "loss": 6.3042, + "step": 7371 + }, + { + "epoch": 0.04384337234751166, + "grad_norm": 2.358250141143799, + "learning_rate": 4.976329180257376e-05, + "loss": 6.1403, + "step": 7372 + }, + { + "epoch": 0.04384931963079265, + "grad_norm": 2.177819013595581, + "learning_rate": 4.9763227672909e-05, + "loss": 5.8993, + "step": 7373 + }, + { + "epoch": 0.043855266914073654, + "grad_norm": 2.24910569190979, + "learning_rate": 4.976316353459963e-05, + "loss": 5.9763, + "step": 7374 + }, + { + "epoch": 0.04386121419735465, + "grad_norm": 2.3985965251922607, + "learning_rate": 4.976309938764571e-05, + "loss": 6.2288, + "step": 7375 + }, + { + "epoch": 0.043867161480635644, + "grad_norm": 2.1250808238983154, + "learning_rate": 4.9763035232047244e-05, + "loss": 6.1588, + "step": 7376 + }, + { + "epoch": 0.043873108763916646, + "grad_norm": 1.9815669059753418, + "learning_rate": 4.976297106780426e-05, + "loss": 6.3202, + "step": 7377 + }, + { + "epoch": 0.04387905604719764, + "grad_norm": 2.181999683380127, + "learning_rate": 4.976290689491677e-05, + "loss": 5.9125, + "step": 7378 + }, + { + "epoch": 0.043885003330478636, + "grad_norm": 2.365546703338623, + "learning_rate": 4.9762842713384815e-05, + "loss": 6.0991, + "step": 7379 + }, + { + "epoch": 0.04389095061375963, + "grad_norm": 2.0843441486358643, + "learning_rate": 4.9762778523208406e-05, + "loss": 5.9675, + "step": 7380 + }, + { + "epoch": 0.04389689789704063, + "grad_norm": 2.271576404571533, + "learning_rate": 4.9762714324387566e-05, + "loss": 5.5703, + "step": 7381 + }, + { + "epoch": 0.04390284518032163, + "grad_norm": 2.244211435317993, + "learning_rate": 4.9762650116922314e-05, + "loss": 5.4674, + "step": 7382 + }, + { + "epoch": 0.043908792463602624, + "grad_norm": 1.728034257888794, + "learning_rate": 4.9762585900812684e-05, + "loss": 5.6264, + "step": 7383 + }, + { + "epoch": 0.043914739746883626, + "grad_norm": 2.400587320327759, + "learning_rate": 4.976252167605869e-05, + "loss": 6.052, + "step": 7384 + }, + { + "epoch": 0.04392068703016462, + "grad_norm": 1.9865821599960327, + "learning_rate": 4.9762457442660346e-05, + "loss": 5.8544, + "step": 7385 + }, + { + "epoch": 0.043926634313445616, + "grad_norm": 2.236527681350708, + "learning_rate": 4.97623932006177e-05, + "loss": 5.5033, + "step": 7386 + }, + { + "epoch": 0.04393258159672662, + "grad_norm": 2.0424020290374756, + "learning_rate": 4.9762328949930746e-05, + "loss": 5.4088, + "step": 7387 + }, + { + "epoch": 0.04393852888000761, + "grad_norm": 2.0601999759674072, + "learning_rate": 4.976226469059952e-05, + "loss": 5.8599, + "step": 7388 + }, + { + "epoch": 0.04394447616328861, + "grad_norm": 2.5052783489227295, + "learning_rate": 4.976220042262404e-05, + "loss": 5.8202, + "step": 7389 + }, + { + "epoch": 0.04395042344656961, + "grad_norm": 2.178549289703369, + "learning_rate": 4.9762136146004344e-05, + "loss": 5.4554, + "step": 7390 + }, + { + "epoch": 0.043956370729850605, + "grad_norm": 1.9407802820205688, + "learning_rate": 4.976207186074043e-05, + "loss": 5.4062, + "step": 7391 + }, + { + "epoch": 0.0439623180131316, + "grad_norm": 1.4814093112945557, + "learning_rate": 4.9762007566832336e-05, + "loss": 5.4662, + "step": 7392 + }, + { + "epoch": 0.043968265296412595, + "grad_norm": 1.8808835744857788, + "learning_rate": 4.9761943264280086e-05, + "loss": 6.1617, + "step": 7393 + }, + { + "epoch": 0.0439742125796936, + "grad_norm": 1.9318643808364868, + "learning_rate": 4.97618789530837e-05, + "loss": 6.1357, + "step": 7394 + }, + { + "epoch": 0.04398015986297459, + "grad_norm": 2.2515900135040283, + "learning_rate": 4.976181463324319e-05, + "loss": 6.11, + "step": 7395 + }, + { + "epoch": 0.04398610714625559, + "grad_norm": 2.375298023223877, + "learning_rate": 4.9761750304758584e-05, + "loss": 6.1121, + "step": 7396 + }, + { + "epoch": 0.04399205442953659, + "grad_norm": 2.2254321575164795, + "learning_rate": 4.9761685967629914e-05, + "loss": 6.0136, + "step": 7397 + }, + { + "epoch": 0.043998001712817585, + "grad_norm": 2.146164894104004, + "learning_rate": 4.976162162185719e-05, + "loss": 5.8391, + "step": 7398 + }, + { + "epoch": 0.04400394899609858, + "grad_norm": 2.3237650394439697, + "learning_rate": 4.976155726744044e-05, + "loss": 5.461, + "step": 7399 + }, + { + "epoch": 0.04400989627937958, + "grad_norm": 2.2263002395629883, + "learning_rate": 4.976149290437969e-05, + "loss": 5.5885, + "step": 7400 + }, + { + "epoch": 0.04401584356266058, + "grad_norm": 1.9597729444503784, + "learning_rate": 4.9761428532674956e-05, + "loss": 5.348, + "step": 7401 + }, + { + "epoch": 0.04402179084594157, + "grad_norm": 2.2215018272399902, + "learning_rate": 4.976136415232626e-05, + "loss": 5.933, + "step": 7402 + }, + { + "epoch": 0.044027738129222574, + "grad_norm": 2.258618116378784, + "learning_rate": 4.9761299763333635e-05, + "loss": 6.0685, + "step": 7403 + }, + { + "epoch": 0.04403368541250357, + "grad_norm": 2.3045873641967773, + "learning_rate": 4.976123536569709e-05, + "loss": 5.7277, + "step": 7404 + }, + { + "epoch": 0.044039632695784564, + "grad_norm": 2.546252489089966, + "learning_rate": 4.976117095941666e-05, + "loss": 5.8839, + "step": 7405 + }, + { + "epoch": 0.044045579979065566, + "grad_norm": 1.8963768482208252, + "learning_rate": 4.976110654449235e-05, + "loss": 6.1247, + "step": 7406 + }, + { + "epoch": 0.04405152726234656, + "grad_norm": 2.6287784576416016, + "learning_rate": 4.976104212092421e-05, + "loss": 5.9712, + "step": 7407 + }, + { + "epoch": 0.044057474545627556, + "grad_norm": 2.562612295150757, + "learning_rate": 4.976097768871223e-05, + "loss": 6.1226, + "step": 7408 + }, + { + "epoch": 0.04406342182890855, + "grad_norm": 2.2308688163757324, + "learning_rate": 4.976091324785645e-05, + "loss": 6.3235, + "step": 7409 + }, + { + "epoch": 0.04406936911218955, + "grad_norm": 2.4595553874969482, + "learning_rate": 4.976084879835691e-05, + "loss": 5.8164, + "step": 7410 + }, + { + "epoch": 0.04407531639547055, + "grad_norm": 2.3693978786468506, + "learning_rate": 4.97607843402136e-05, + "loss": 5.7727, + "step": 7411 + }, + { + "epoch": 0.044081263678751544, + "grad_norm": 4.144592761993408, + "learning_rate": 4.9760719873426546e-05, + "loss": 5.6382, + "step": 7412 + }, + { + "epoch": 0.044087210962032546, + "grad_norm": 2.5423779487609863, + "learning_rate": 4.9760655397995794e-05, + "loss": 5.7526, + "step": 7413 + }, + { + "epoch": 0.04409315824531354, + "grad_norm": 2.119281053543091, + "learning_rate": 4.976059091392135e-05, + "loss": 5.7246, + "step": 7414 + }, + { + "epoch": 0.044099105528594536, + "grad_norm": 2.177074432373047, + "learning_rate": 4.976052642120324e-05, + "loss": 5.7296, + "step": 7415 + }, + { + "epoch": 0.04410505281187554, + "grad_norm": 1.8897806406021118, + "learning_rate": 4.9760461919841486e-05, + "loss": 5.6349, + "step": 7416 + }, + { + "epoch": 0.04411100009515653, + "grad_norm": 2.445082187652588, + "learning_rate": 4.97603974098361e-05, + "loss": 5.7414, + "step": 7417 + }, + { + "epoch": 0.04411694737843753, + "grad_norm": 2.2564280033111572, + "learning_rate": 4.976033289118713e-05, + "loss": 5.6709, + "step": 7418 + }, + { + "epoch": 0.04412289466171853, + "grad_norm": 2.1907529830932617, + "learning_rate": 4.976026836389458e-05, + "loss": 5.6067, + "step": 7419 + }, + { + "epoch": 0.044128841944999525, + "grad_norm": 2.1872594356536865, + "learning_rate": 4.976020382795848e-05, + "loss": 5.5166, + "step": 7420 + }, + { + "epoch": 0.04413478922828052, + "grad_norm": 1.7740691900253296, + "learning_rate": 4.9760139283378835e-05, + "loss": 5.5833, + "step": 7421 + }, + { + "epoch": 0.044140736511561515, + "grad_norm": 2.128389358520508, + "learning_rate": 4.976007473015569e-05, + "loss": 5.6403, + "step": 7422 + }, + { + "epoch": 0.04414668379484252, + "grad_norm": 2.6193220615386963, + "learning_rate": 4.9760010168289053e-05, + "loss": 5.8139, + "step": 7423 + }, + { + "epoch": 0.04415263107812351, + "grad_norm": 2.727902412414551, + "learning_rate": 4.9759945597778955e-05, + "loss": 5.3286, + "step": 7424 + }, + { + "epoch": 0.04415857836140451, + "grad_norm": 2.4500436782836914, + "learning_rate": 4.975988101862542e-05, + "loss": 5.2647, + "step": 7425 + }, + { + "epoch": 0.04416452564468551, + "grad_norm": 2.1040356159210205, + "learning_rate": 4.975981643082846e-05, + "loss": 6.0935, + "step": 7426 + }, + { + "epoch": 0.044170472927966505, + "grad_norm": 1.9168792963027954, + "learning_rate": 4.975975183438811e-05, + "loss": 5.5147, + "step": 7427 + }, + { + "epoch": 0.0441764202112475, + "grad_norm": 2.0156469345092773, + "learning_rate": 4.9759687229304384e-05, + "loss": 6.2896, + "step": 7428 + }, + { + "epoch": 0.0441823674945285, + "grad_norm": 2.362933874130249, + "learning_rate": 4.975962261557731e-05, + "loss": 5.9514, + "step": 7429 + }, + { + "epoch": 0.0441883147778095, + "grad_norm": 2.2892727851867676, + "learning_rate": 4.9759557993206906e-05, + "loss": 5.5646, + "step": 7430 + }, + { + "epoch": 0.04419426206109049, + "grad_norm": 2.287722587585449, + "learning_rate": 4.97594933621932e-05, + "loss": 5.364, + "step": 7431 + }, + { + "epoch": 0.044200209344371494, + "grad_norm": 2.0421855449676514, + "learning_rate": 4.9759428722536194e-05, + "loss": 5.6838, + "step": 7432 + }, + { + "epoch": 0.04420615662765249, + "grad_norm": 2.2392499446868896, + "learning_rate": 4.9759364074235944e-05, + "loss": 6.0727, + "step": 7433 + }, + { + "epoch": 0.044212103910933484, + "grad_norm": 2.084768295288086, + "learning_rate": 4.975929941729245e-05, + "loss": 6.1208, + "step": 7434 + }, + { + "epoch": 0.044218051194214486, + "grad_norm": 1.817015528678894, + "learning_rate": 4.975923475170574e-05, + "loss": 6.3405, + "step": 7435 + }, + { + "epoch": 0.04422399847749548, + "grad_norm": 1.974926233291626, + "learning_rate": 4.9759170077475834e-05, + "loss": 5.9607, + "step": 7436 + }, + { + "epoch": 0.044229945760776476, + "grad_norm": 2.1244025230407715, + "learning_rate": 4.975910539460277e-05, + "loss": 6.2579, + "step": 7437 + }, + { + "epoch": 0.04423589304405747, + "grad_norm": 1.9459706544876099, + "learning_rate": 4.975904070308655e-05, + "loss": 5.5877, + "step": 7438 + }, + { + "epoch": 0.04424184032733847, + "grad_norm": 2.1891977787017822, + "learning_rate": 4.97589760029272e-05, + "loss": 5.9913, + "step": 7439 + }, + { + "epoch": 0.04424778761061947, + "grad_norm": 2.0368902683258057, + "learning_rate": 4.9758911294124756e-05, + "loss": 5.9478, + "step": 7440 + }, + { + "epoch": 0.044253734893900463, + "grad_norm": 2.2937796115875244, + "learning_rate": 4.975884657667922e-05, + "loss": 6.1529, + "step": 7441 + }, + { + "epoch": 0.044259682177181466, + "grad_norm": 2.601637125015259, + "learning_rate": 4.975878185059064e-05, + "loss": 5.4446, + "step": 7442 + }, + { + "epoch": 0.04426562946046246, + "grad_norm": 2.2025954723358154, + "learning_rate": 4.975871711585902e-05, + "loss": 5.8911, + "step": 7443 + }, + { + "epoch": 0.044271576743743456, + "grad_norm": 2.0498836040496826, + "learning_rate": 4.975865237248438e-05, + "loss": 6.0604, + "step": 7444 + }, + { + "epoch": 0.04427752402702446, + "grad_norm": 2.308239459991455, + "learning_rate": 4.975858762046676e-05, + "loss": 5.9599, + "step": 7445 + }, + { + "epoch": 0.04428347131030545, + "grad_norm": 2.286747455596924, + "learning_rate": 4.9758522859806165e-05, + "loss": 6.3528, + "step": 7446 + }, + { + "epoch": 0.04428941859358645, + "grad_norm": 2.2376902103424072, + "learning_rate": 4.975845809050264e-05, + "loss": 6.205, + "step": 7447 + }, + { + "epoch": 0.04429536587686745, + "grad_norm": 1.8052057027816772, + "learning_rate": 4.9758393312556176e-05, + "loss": 6.2188, + "step": 7448 + }, + { + "epoch": 0.044301313160148445, + "grad_norm": 1.9839476346969604, + "learning_rate": 4.975832852596682e-05, + "loss": 6.1479, + "step": 7449 + }, + { + "epoch": 0.04430726044342944, + "grad_norm": 1.8890517950057983, + "learning_rate": 4.975826373073459e-05, + "loss": 6.2524, + "step": 7450 + }, + { + "epoch": 0.04431320772671044, + "grad_norm": 2.049192428588867, + "learning_rate": 4.97581989268595e-05, + "loss": 5.5486, + "step": 7451 + }, + { + "epoch": 0.04431915500999144, + "grad_norm": 2.8271291255950928, + "learning_rate": 4.975813411434158e-05, + "loss": 5.1916, + "step": 7452 + }, + { + "epoch": 0.04432510229327243, + "grad_norm": 1.94833505153656, + "learning_rate": 4.975806929318085e-05, + "loss": 5.6747, + "step": 7453 + }, + { + "epoch": 0.04433104957655343, + "grad_norm": 2.14536190032959, + "learning_rate": 4.975800446337734e-05, + "loss": 5.4066, + "step": 7454 + }, + { + "epoch": 0.04433699685983443, + "grad_norm": 2.5557188987731934, + "learning_rate": 4.975793962493106e-05, + "loss": 5.2257, + "step": 7455 + }, + { + "epoch": 0.044342944143115424, + "grad_norm": 2.4718832969665527, + "learning_rate": 4.975787477784205e-05, + "loss": 6.0248, + "step": 7456 + }, + { + "epoch": 0.04434889142639642, + "grad_norm": 2.8627419471740723, + "learning_rate": 4.975780992211031e-05, + "loss": 5.3245, + "step": 7457 + }, + { + "epoch": 0.04435483870967742, + "grad_norm": 2.932990789413452, + "learning_rate": 4.9757745057735876e-05, + "loss": 4.8914, + "step": 7458 + }, + { + "epoch": 0.04436078599295842, + "grad_norm": 2.6231770515441895, + "learning_rate": 4.975768018471877e-05, + "loss": 5.3323, + "step": 7459 + }, + { + "epoch": 0.04436673327623941, + "grad_norm": 2.5591986179351807, + "learning_rate": 4.975761530305901e-05, + "loss": 5.4972, + "step": 7460 + }, + { + "epoch": 0.044372680559520414, + "grad_norm": 2.4060492515563965, + "learning_rate": 4.975755041275664e-05, + "loss": 5.5988, + "step": 7461 + }, + { + "epoch": 0.04437862784280141, + "grad_norm": 2.377260446548462, + "learning_rate": 4.975748551381164e-05, + "loss": 5.2137, + "step": 7462 + }, + { + "epoch": 0.044384575126082404, + "grad_norm": 2.171934127807617, + "learning_rate": 4.9757420606224076e-05, + "loss": 5.6313, + "step": 7463 + }, + { + "epoch": 0.044390522409363406, + "grad_norm": 2.1225788593292236, + "learning_rate": 4.975735568999394e-05, + "loss": 5.839, + "step": 7464 + }, + { + "epoch": 0.0443964696926444, + "grad_norm": 2.271127939224243, + "learning_rate": 4.975729076512128e-05, + "loss": 5.7111, + "step": 7465 + }, + { + "epoch": 0.044402416975925396, + "grad_norm": 2.7138264179229736, + "learning_rate": 4.975722583160609e-05, + "loss": 5.3169, + "step": 7466 + }, + { + "epoch": 0.04440836425920639, + "grad_norm": 2.8181982040405273, + "learning_rate": 4.9757160889448416e-05, + "loss": 5.3323, + "step": 7467 + }, + { + "epoch": 0.04441431154248739, + "grad_norm": 2.680816411972046, + "learning_rate": 4.975709593864828e-05, + "loss": 5.6924, + "step": 7468 + }, + { + "epoch": 0.04442025882576839, + "grad_norm": 2.3682074546813965, + "learning_rate": 4.975703097920569e-05, + "loss": 6.0049, + "step": 7469 + }, + { + "epoch": 0.04442620610904938, + "grad_norm": 2.3080508708953857, + "learning_rate": 4.9756966011120674e-05, + "loss": 6.4438, + "step": 7470 + }, + { + "epoch": 0.044432153392330385, + "grad_norm": 2.2631113529205322, + "learning_rate": 4.9756901034393265e-05, + "loss": 5.9296, + "step": 7471 + }, + { + "epoch": 0.04443810067561138, + "grad_norm": 2.283712148666382, + "learning_rate": 4.975683604902347e-05, + "loss": 5.831, + "step": 7472 + }, + { + "epoch": 0.044444047958892376, + "grad_norm": 2.2130608558654785, + "learning_rate": 4.975677105501132e-05, + "loss": 5.8757, + "step": 7473 + }, + { + "epoch": 0.04444999524217338, + "grad_norm": 1.9392763376235962, + "learning_rate": 4.975670605235684e-05, + "loss": 5.5836, + "step": 7474 + }, + { + "epoch": 0.04445594252545437, + "grad_norm": 2.097076416015625, + "learning_rate": 4.975664104106005e-05, + "loss": 6.0782, + "step": 7475 + }, + { + "epoch": 0.04446188980873537, + "grad_norm": 2.063021183013916, + "learning_rate": 4.975657602112097e-05, + "loss": 6.2171, + "step": 7476 + }, + { + "epoch": 0.04446783709201637, + "grad_norm": 2.4466049671173096, + "learning_rate": 4.9756510992539626e-05, + "loss": 5.8649, + "step": 7477 + }, + { + "epoch": 0.044473784375297365, + "grad_norm": 2.2160751819610596, + "learning_rate": 4.975644595531605e-05, + "loss": 5.9297, + "step": 7478 + }, + { + "epoch": 0.04447973165857836, + "grad_norm": 2.69352650642395, + "learning_rate": 4.975638090945024e-05, + "loss": 6.1062, + "step": 7479 + }, + { + "epoch": 0.04448567894185936, + "grad_norm": 2.2830610275268555, + "learning_rate": 4.975631585494224e-05, + "loss": 6.1663, + "step": 7480 + }, + { + "epoch": 0.04449162622514036, + "grad_norm": 2.936842203140259, + "learning_rate": 4.975625079179206e-05, + "loss": 5.9952, + "step": 7481 + }, + { + "epoch": 0.04449757350842135, + "grad_norm": 2.1398322582244873, + "learning_rate": 4.9756185719999725e-05, + "loss": 6.0005, + "step": 7482 + }, + { + "epoch": 0.04450352079170235, + "grad_norm": 2.2835536003112793, + "learning_rate": 4.9756120639565275e-05, + "loss": 5.7155, + "step": 7483 + }, + { + "epoch": 0.04450946807498335, + "grad_norm": 2.22917103767395, + "learning_rate": 4.975605555048871e-05, + "loss": 5.7134, + "step": 7484 + }, + { + "epoch": 0.044515415358264344, + "grad_norm": 2.0195605754852295, + "learning_rate": 4.975599045277006e-05, + "loss": 5.6369, + "step": 7485 + }, + { + "epoch": 0.04452136264154534, + "grad_norm": 1.8495477437973022, + "learning_rate": 4.975592534640936e-05, + "loss": 5.9035, + "step": 7486 + }, + { + "epoch": 0.04452730992482634, + "grad_norm": 2.4814226627349854, + "learning_rate": 4.9755860231406616e-05, + "loss": 6.1024, + "step": 7487 + }, + { + "epoch": 0.04453325720810734, + "grad_norm": 2.221820831298828, + "learning_rate": 4.975579510776186e-05, + "loss": 6.1193, + "step": 7488 + }, + { + "epoch": 0.04453920449138833, + "grad_norm": 1.935722827911377, + "learning_rate": 4.975572997547511e-05, + "loss": 6.1088, + "step": 7489 + }, + { + "epoch": 0.044545151774669334, + "grad_norm": 2.1287481784820557, + "learning_rate": 4.975566483454638e-05, + "loss": 6.1064, + "step": 7490 + }, + { + "epoch": 0.04455109905795033, + "grad_norm": 2.1914093494415283, + "learning_rate": 4.9755599684975716e-05, + "loss": 6.072, + "step": 7491 + }, + { + "epoch": 0.044557046341231324, + "grad_norm": 2.1979966163635254, + "learning_rate": 4.975553452676312e-05, + "loss": 6.1447, + "step": 7492 + }, + { + "epoch": 0.044562993624512326, + "grad_norm": 2.108259916305542, + "learning_rate": 4.975546935990863e-05, + "loss": 6.0109, + "step": 7493 + }, + { + "epoch": 0.04456894090779332, + "grad_norm": 2.2454450130462646, + "learning_rate": 4.975540418441226e-05, + "loss": 5.8627, + "step": 7494 + }, + { + "epoch": 0.044574888191074316, + "grad_norm": 2.151130437850952, + "learning_rate": 4.9755339000274027e-05, + "loss": 6.0241, + "step": 7495 + }, + { + "epoch": 0.04458083547435531, + "grad_norm": 1.9150489568710327, + "learning_rate": 4.975527380749397e-05, + "loss": 6.0179, + "step": 7496 + }, + { + "epoch": 0.04458678275763631, + "grad_norm": 1.9065133333206177, + "learning_rate": 4.97552086060721e-05, + "loss": 5.9991, + "step": 7497 + }, + { + "epoch": 0.04459273004091731, + "grad_norm": 1.9627622365951538, + "learning_rate": 4.975514339600844e-05, + "loss": 5.9633, + "step": 7498 + }, + { + "epoch": 0.0445986773241983, + "grad_norm": 1.7777502536773682, + "learning_rate": 4.975507817730302e-05, + "loss": 5.9426, + "step": 7499 + }, + { + "epoch": 0.044604624607479305, + "grad_norm": 1.6735023260116577, + "learning_rate": 4.9755012949955846e-05, + "loss": 5.9432, + "step": 7500 + }, + { + "epoch": 0.0446105718907603, + "grad_norm": 2.1570491790771484, + "learning_rate": 4.975494771396697e-05, + "loss": 6.2032, + "step": 7501 + }, + { + "epoch": 0.044616519174041296, + "grad_norm": 2.286522150039673, + "learning_rate": 4.9754882469336387e-05, + "loss": 5.7226, + "step": 7502 + }, + { + "epoch": 0.0446224664573223, + "grad_norm": 2.1940622329711914, + "learning_rate": 4.975481721606413e-05, + "loss": 6.2215, + "step": 7503 + }, + { + "epoch": 0.04462841374060329, + "grad_norm": 2.329263210296631, + "learning_rate": 4.9754751954150224e-05, + "loss": 5.5403, + "step": 7504 + }, + { + "epoch": 0.04463436102388429, + "grad_norm": 2.112712860107422, + "learning_rate": 4.975468668359469e-05, + "loss": 5.7581, + "step": 7505 + }, + { + "epoch": 0.04464030830716529, + "grad_norm": 2.2875239849090576, + "learning_rate": 4.975462140439755e-05, + "loss": 5.9593, + "step": 7506 + }, + { + "epoch": 0.044646255590446285, + "grad_norm": 2.282121419906616, + "learning_rate": 4.975455611655883e-05, + "loss": 5.8684, + "step": 7507 + }, + { + "epoch": 0.04465220287372728, + "grad_norm": 1.8482197523117065, + "learning_rate": 4.975449082007855e-05, + "loss": 5.753, + "step": 7508 + }, + { + "epoch": 0.04465815015700828, + "grad_norm": 2.6635684967041016, + "learning_rate": 4.9754425514956724e-05, + "loss": 5.0732, + "step": 7509 + }, + { + "epoch": 0.04466409744028928, + "grad_norm": 2.6632800102233887, + "learning_rate": 4.9754360201193395e-05, + "loss": 5.1644, + "step": 7510 + }, + { + "epoch": 0.04467004472357027, + "grad_norm": 2.630445718765259, + "learning_rate": 4.9754294878788574e-05, + "loss": 5.0322, + "step": 7511 + }, + { + "epoch": 0.04467599200685127, + "grad_norm": 2.4036223888397217, + "learning_rate": 4.975422954774228e-05, + "loss": 4.8949, + "step": 7512 + }, + { + "epoch": 0.04468193929013227, + "grad_norm": 2.381810426712036, + "learning_rate": 4.9754164208054535e-05, + "loss": 5.7921, + "step": 7513 + }, + { + "epoch": 0.044687886573413264, + "grad_norm": 2.570949077606201, + "learning_rate": 4.9754098859725377e-05, + "loss": 5.9612, + "step": 7514 + }, + { + "epoch": 0.04469383385669426, + "grad_norm": 2.510998010635376, + "learning_rate": 4.9754033502754815e-05, + "loss": 5.7273, + "step": 7515 + }, + { + "epoch": 0.04469978113997526, + "grad_norm": 2.6216115951538086, + "learning_rate": 4.975396813714288e-05, + "loss": 5.7601, + "step": 7516 + }, + { + "epoch": 0.04470572842325626, + "grad_norm": 2.5298542976379395, + "learning_rate": 4.975390276288958e-05, + "loss": 5.8007, + "step": 7517 + }, + { + "epoch": 0.04471167570653725, + "grad_norm": 2.6195290088653564, + "learning_rate": 4.975383737999496e-05, + "loss": 5.6071, + "step": 7518 + }, + { + "epoch": 0.044717622989818254, + "grad_norm": 2.5432629585266113, + "learning_rate": 4.975377198845902e-05, + "loss": 6.0224, + "step": 7519 + }, + { + "epoch": 0.04472357027309925, + "grad_norm": 2.2290337085723877, + "learning_rate": 4.97537065882818e-05, + "loss": 5.7141, + "step": 7520 + }, + { + "epoch": 0.044729517556380244, + "grad_norm": 2.627206802368164, + "learning_rate": 4.975364117946332e-05, + "loss": 6.2518, + "step": 7521 + }, + { + "epoch": 0.044735464839661246, + "grad_norm": 2.386993169784546, + "learning_rate": 4.975357576200359e-05, + "loss": 6.0494, + "step": 7522 + }, + { + "epoch": 0.04474141212294224, + "grad_norm": 2.20511794090271, + "learning_rate": 4.9753510335902656e-05, + "loss": 6.2563, + "step": 7523 + }, + { + "epoch": 0.044747359406223236, + "grad_norm": 2.5564749240875244, + "learning_rate": 4.975344490116052e-05, + "loss": 6.2498, + "step": 7524 + }, + { + "epoch": 0.04475330668950423, + "grad_norm": 2.6001932621002197, + "learning_rate": 4.975337945777721e-05, + "loss": 5.6721, + "step": 7525 + }, + { + "epoch": 0.04475925397278523, + "grad_norm": 2.6677772998809814, + "learning_rate": 4.975331400575275e-05, + "loss": 5.88, + "step": 7526 + }, + { + "epoch": 0.04476520125606623, + "grad_norm": 3.616734027862549, + "learning_rate": 4.975324854508716e-05, + "loss": 5.4835, + "step": 7527 + }, + { + "epoch": 0.04477114853934722, + "grad_norm": 3.0301461219787598, + "learning_rate": 4.975318307578048e-05, + "loss": 5.326, + "step": 7528 + }, + { + "epoch": 0.044777095822628225, + "grad_norm": 2.029836893081665, + "learning_rate": 4.975311759783271e-05, + "loss": 5.3516, + "step": 7529 + }, + { + "epoch": 0.04478304310590922, + "grad_norm": 1.9886969327926636, + "learning_rate": 4.9753052111243885e-05, + "loss": 5.3442, + "step": 7530 + }, + { + "epoch": 0.044788990389190216, + "grad_norm": 2.4227612018585205, + "learning_rate": 4.975298661601403e-05, + "loss": 5.4273, + "step": 7531 + }, + { + "epoch": 0.04479493767247122, + "grad_norm": 2.8426849842071533, + "learning_rate": 4.975292111214316e-05, + "loss": 5.6604, + "step": 7532 + }, + { + "epoch": 0.04480088495575221, + "grad_norm": 2.4818854331970215, + "learning_rate": 4.97528555996313e-05, + "loss": 6.4941, + "step": 7533 + }, + { + "epoch": 0.04480683223903321, + "grad_norm": 2.291642904281616, + "learning_rate": 4.9752790078478465e-05, + "loss": 6.404, + "step": 7534 + }, + { + "epoch": 0.04481277952231421, + "grad_norm": 2.4973669052124023, + "learning_rate": 4.9752724548684695e-05, + "loss": 5.6068, + "step": 7535 + }, + { + "epoch": 0.044818726805595205, + "grad_norm": 2.273130416870117, + "learning_rate": 4.975265901025001e-05, + "loss": 6.1689, + "step": 7536 + }, + { + "epoch": 0.0448246740888762, + "grad_norm": 3.362520456314087, + "learning_rate": 4.9752593463174424e-05, + "loss": 5.5346, + "step": 7537 + }, + { + "epoch": 0.0448306213721572, + "grad_norm": 5.170871257781982, + "learning_rate": 4.9752527907457956e-05, + "loss": 5.3831, + "step": 7538 + }, + { + "epoch": 0.0448365686554382, + "grad_norm": 4.224242687225342, + "learning_rate": 4.975246234310064e-05, + "loss": 5.2511, + "step": 7539 + }, + { + "epoch": 0.04484251593871919, + "grad_norm": 3.1753036975860596, + "learning_rate": 4.97523967701025e-05, + "loss": 5.06, + "step": 7540 + }, + { + "epoch": 0.04484846322200019, + "grad_norm": 2.4226467609405518, + "learning_rate": 4.975233118846355e-05, + "loss": 5.5225, + "step": 7541 + }, + { + "epoch": 0.04485441050528119, + "grad_norm": 2.5356781482696533, + "learning_rate": 4.9752265598183814e-05, + "loss": 5.5865, + "step": 7542 + }, + { + "epoch": 0.044860357788562184, + "grad_norm": 2.1505908966064453, + "learning_rate": 4.9752199999263326e-05, + "loss": 5.7436, + "step": 7543 + }, + { + "epoch": 0.04486630507184318, + "grad_norm": 2.675703763961792, + "learning_rate": 4.97521343917021e-05, + "loss": 5.3693, + "step": 7544 + }, + { + "epoch": 0.04487225235512418, + "grad_norm": 3.5228023529052734, + "learning_rate": 4.975206877550015e-05, + "loss": 4.8527, + "step": 7545 + }, + { + "epoch": 0.044878199638405177, + "grad_norm": 3.1165566444396973, + "learning_rate": 4.975200315065752e-05, + "loss": 4.7971, + "step": 7546 + }, + { + "epoch": 0.04488414692168617, + "grad_norm": 2.6216177940368652, + "learning_rate": 4.975193751717421e-05, + "loss": 4.9328, + "step": 7547 + }, + { + "epoch": 0.044890094204967174, + "grad_norm": 2.352031707763672, + "learning_rate": 4.975187187505026e-05, + "loss": 5.0021, + "step": 7548 + }, + { + "epoch": 0.04489604148824817, + "grad_norm": 1.8147127628326416, + "learning_rate": 4.975180622428569e-05, + "loss": 5.7009, + "step": 7549 + }, + { + "epoch": 0.044901988771529164, + "grad_norm": 2.1674726009368896, + "learning_rate": 4.9751740564880516e-05, + "loss": 5.2545, + "step": 7550 + }, + { + "epoch": 0.044907936054810166, + "grad_norm": 2.2935330867767334, + "learning_rate": 4.975167489683477e-05, + "loss": 5.2351, + "step": 7551 + }, + { + "epoch": 0.04491388333809116, + "grad_norm": 2.2964932918548584, + "learning_rate": 4.975160922014846e-05, + "loss": 5.483, + "step": 7552 + }, + { + "epoch": 0.044919830621372156, + "grad_norm": 1.8180936574935913, + "learning_rate": 4.9751543534821635e-05, + "loss": 5.668, + "step": 7553 + }, + { + "epoch": 0.04492577790465315, + "grad_norm": 1.906435251235962, + "learning_rate": 4.9751477840854286e-05, + "loss": 5.6664, + "step": 7554 + }, + { + "epoch": 0.04493172518793415, + "grad_norm": 2.459702253341675, + "learning_rate": 4.9751412138246455e-05, + "loss": 5.5272, + "step": 7555 + }, + { + "epoch": 0.04493767247121515, + "grad_norm": 2.1219170093536377, + "learning_rate": 4.975134642699817e-05, + "loss": 5.638, + "step": 7556 + }, + { + "epoch": 0.04494361975449614, + "grad_norm": 2.1492953300476074, + "learning_rate": 4.975128070710944e-05, + "loss": 5.9422, + "step": 7557 + }, + { + "epoch": 0.044949567037777145, + "grad_norm": 1.813988208770752, + "learning_rate": 4.97512149785803e-05, + "loss": 5.9875, + "step": 7558 + }, + { + "epoch": 0.04495551432105814, + "grad_norm": 1.6336817741394043, + "learning_rate": 4.975114924141075e-05, + "loss": 5.9245, + "step": 7559 + }, + { + "epoch": 0.044961461604339135, + "grad_norm": 1.9339455366134644, + "learning_rate": 4.9751083495600847e-05, + "loss": 5.3263, + "step": 7560 + }, + { + "epoch": 0.04496740888762014, + "grad_norm": 2.3459293842315674, + "learning_rate": 4.975101774115059e-05, + "loss": 5.4625, + "step": 7561 + }, + { + "epoch": 0.04497335617090113, + "grad_norm": 2.2994346618652344, + "learning_rate": 4.9750951978060004e-05, + "loss": 5.6327, + "step": 7562 + }, + { + "epoch": 0.04497930345418213, + "grad_norm": 2.1627299785614014, + "learning_rate": 4.975088620632912e-05, + "loss": 5.4882, + "step": 7563 + }, + { + "epoch": 0.04498525073746313, + "grad_norm": 2.763397693634033, + "learning_rate": 4.9750820425957954e-05, + "loss": 5.727, + "step": 7564 + }, + { + "epoch": 0.044991198020744125, + "grad_norm": 2.0107216835021973, + "learning_rate": 4.975075463694654e-05, + "loss": 5.3852, + "step": 7565 + }, + { + "epoch": 0.04499714530402512, + "grad_norm": 1.8424763679504395, + "learning_rate": 4.975068883929489e-05, + "loss": 5.3072, + "step": 7566 + }, + { + "epoch": 0.04500309258730612, + "grad_norm": 1.946702003479004, + "learning_rate": 4.975062303300303e-05, + "loss": 5.3184, + "step": 7567 + }, + { + "epoch": 0.04500903987058712, + "grad_norm": 2.1091182231903076, + "learning_rate": 4.9750557218070984e-05, + "loss": 5.0689, + "step": 7568 + }, + { + "epoch": 0.04501498715386811, + "grad_norm": 2.0064187049865723, + "learning_rate": 4.975049139449877e-05, + "loss": 4.8495, + "step": 7569 + }, + { + "epoch": 0.04502093443714911, + "grad_norm": 1.7544279098510742, + "learning_rate": 4.9750425562286416e-05, + "loss": 4.9524, + "step": 7570 + }, + { + "epoch": 0.04502688172043011, + "grad_norm": 2.0814568996429443, + "learning_rate": 4.9750359721433945e-05, + "loss": 4.798, + "step": 7571 + }, + { + "epoch": 0.045032829003711104, + "grad_norm": 2.1185543537139893, + "learning_rate": 4.975029387194139e-05, + "loss": 4.9313, + "step": 7572 + }, + { + "epoch": 0.0450387762869921, + "grad_norm": 2.3774518966674805, + "learning_rate": 4.975022801380875e-05, + "loss": 5.5954, + "step": 7573 + }, + { + "epoch": 0.0450447235702731, + "grad_norm": 2.261306047439575, + "learning_rate": 4.975016214703606e-05, + "loss": 5.5598, + "step": 7574 + }, + { + "epoch": 0.045050670853554096, + "grad_norm": 2.128244161605835, + "learning_rate": 4.975009627162335e-05, + "loss": 5.359, + "step": 7575 + }, + { + "epoch": 0.04505661813683509, + "grad_norm": 2.0767438411712646, + "learning_rate": 4.975003038757064e-05, + "loss": 5.6855, + "step": 7576 + }, + { + "epoch": 0.045062565420116094, + "grad_norm": 1.9789010286331177, + "learning_rate": 4.974996449487794e-05, + "loss": 5.1807, + "step": 7577 + }, + { + "epoch": 0.04506851270339709, + "grad_norm": 1.9136112928390503, + "learning_rate": 4.97498985935453e-05, + "loss": 5.3811, + "step": 7578 + }, + { + "epoch": 0.045074459986678084, + "grad_norm": 2.150641441345215, + "learning_rate": 4.974983268357271e-05, + "loss": 5.3281, + "step": 7579 + }, + { + "epoch": 0.045080407269959086, + "grad_norm": 1.9636656045913696, + "learning_rate": 4.9749766764960215e-05, + "loss": 5.5003, + "step": 7580 + }, + { + "epoch": 0.04508635455324008, + "grad_norm": 1.826335072517395, + "learning_rate": 4.974970083770783e-05, + "loss": 5.4687, + "step": 7581 + }, + { + "epoch": 0.045092301836521076, + "grad_norm": 1.9246041774749756, + "learning_rate": 4.974963490181558e-05, + "loss": 5.5373, + "step": 7582 + }, + { + "epoch": 0.04509824911980207, + "grad_norm": 1.8421686887741089, + "learning_rate": 4.974956895728349e-05, + "loss": 5.386, + "step": 7583 + }, + { + "epoch": 0.04510419640308307, + "grad_norm": 1.8685556650161743, + "learning_rate": 4.974950300411158e-05, + "loss": 5.5857, + "step": 7584 + }, + { + "epoch": 0.04511014368636407, + "grad_norm": 1.7022168636322021, + "learning_rate": 4.974943704229987e-05, + "loss": 5.2562, + "step": 7585 + }, + { + "epoch": 0.04511609096964506, + "grad_norm": 1.876855731010437, + "learning_rate": 4.97493710718484e-05, + "loss": 5.1359, + "step": 7586 + }, + { + "epoch": 0.045122038252926065, + "grad_norm": 1.8728361129760742, + "learning_rate": 4.974930509275717e-05, + "loss": 5.3124, + "step": 7587 + }, + { + "epoch": 0.04512798553620706, + "grad_norm": 1.930086612701416, + "learning_rate": 4.974923910502622e-05, + "loss": 5.3261, + "step": 7588 + }, + { + "epoch": 0.045133932819488055, + "grad_norm": 2.0309081077575684, + "learning_rate": 4.9749173108655564e-05, + "loss": 5.1138, + "step": 7589 + }, + { + "epoch": 0.04513988010276906, + "grad_norm": 2.042174816131592, + "learning_rate": 4.974910710364522e-05, + "loss": 5.3521, + "step": 7590 + }, + { + "epoch": 0.04514582738605005, + "grad_norm": 1.5278770923614502, + "learning_rate": 4.9749041089995224e-05, + "loss": 5.4075, + "step": 7591 + }, + { + "epoch": 0.04515177466933105, + "grad_norm": 1.7624976634979248, + "learning_rate": 4.974897506770559e-05, + "loss": 5.1698, + "step": 7592 + }, + { + "epoch": 0.04515772195261205, + "grad_norm": 1.9077380895614624, + "learning_rate": 4.974890903677635e-05, + "loss": 5.3973, + "step": 7593 + }, + { + "epoch": 0.045163669235893045, + "grad_norm": 1.5724380016326904, + "learning_rate": 4.974884299720752e-05, + "loss": 5.6325, + "step": 7594 + }, + { + "epoch": 0.04516961651917404, + "grad_norm": 1.9702832698822021, + "learning_rate": 4.974877694899913e-05, + "loss": 5.247, + "step": 7595 + }, + { + "epoch": 0.04517556380245504, + "grad_norm": 1.9913853406906128, + "learning_rate": 4.974871089215118e-05, + "loss": 5.6393, + "step": 7596 + }, + { + "epoch": 0.04518151108573604, + "grad_norm": 1.806470274925232, + "learning_rate": 4.974864482666372e-05, + "loss": 5.302, + "step": 7597 + }, + { + "epoch": 0.04518745836901703, + "grad_norm": 1.7056912183761597, + "learning_rate": 4.974857875253678e-05, + "loss": 5.4066, + "step": 7598 + }, + { + "epoch": 0.04519340565229803, + "grad_norm": 1.5990647077560425, + "learning_rate": 4.974851266977035e-05, + "loss": 5.4087, + "step": 7599 + }, + { + "epoch": 0.04519935293557903, + "grad_norm": 1.9233685731887817, + "learning_rate": 4.974844657836447e-05, + "loss": 5.4891, + "step": 7600 + }, + { + "epoch": 0.045205300218860024, + "grad_norm": 1.8654414415359497, + "learning_rate": 4.9748380478319165e-05, + "loss": 5.4955, + "step": 7601 + }, + { + "epoch": 0.04521124750214102, + "grad_norm": 1.7592424154281616, + "learning_rate": 4.974831436963446e-05, + "loss": 5.2298, + "step": 7602 + }, + { + "epoch": 0.04521719478542202, + "grad_norm": 1.8132792711257935, + "learning_rate": 4.974824825231037e-05, + "loss": 5.3487, + "step": 7603 + }, + { + "epoch": 0.045223142068703016, + "grad_norm": 1.8109947443008423, + "learning_rate": 4.974818212634692e-05, + "loss": 5.4511, + "step": 7604 + }, + { + "epoch": 0.04522908935198401, + "grad_norm": 1.96711266040802, + "learning_rate": 4.974811599174414e-05, + "loss": 5.3249, + "step": 7605 + }, + { + "epoch": 0.045235036635265014, + "grad_norm": 1.9123655557632446, + "learning_rate": 4.9748049848502054e-05, + "loss": 5.3681, + "step": 7606 + }, + { + "epoch": 0.04524098391854601, + "grad_norm": 1.7210376262664795, + "learning_rate": 4.974798369662067e-05, + "loss": 5.3441, + "step": 7607 + }, + { + "epoch": 0.045246931201827004, + "grad_norm": 1.590617060661316, + "learning_rate": 4.974791753610002e-05, + "loss": 5.5619, + "step": 7608 + }, + { + "epoch": 0.045252878485108006, + "grad_norm": 1.77785062789917, + "learning_rate": 4.974785136694013e-05, + "loss": 5.4717, + "step": 7609 + }, + { + "epoch": 0.045258825768389, + "grad_norm": 1.66475510597229, + "learning_rate": 4.9747785189141025e-05, + "loss": 5.3501, + "step": 7610 + }, + { + "epoch": 0.045264773051669996, + "grad_norm": 1.9176442623138428, + "learning_rate": 4.974771900270272e-05, + "loss": 5.1197, + "step": 7611 + }, + { + "epoch": 0.04527072033495099, + "grad_norm": 1.8143234252929688, + "learning_rate": 4.974765280762525e-05, + "loss": 5.3103, + "step": 7612 + }, + { + "epoch": 0.04527666761823199, + "grad_norm": 1.8954168558120728, + "learning_rate": 4.974758660390861e-05, + "loss": 5.2009, + "step": 7613 + }, + { + "epoch": 0.04528261490151299, + "grad_norm": 1.7779622077941895, + "learning_rate": 4.974752039155286e-05, + "loss": 5.519, + "step": 7614 + }, + { + "epoch": 0.04528856218479398, + "grad_norm": 1.8181761503219604, + "learning_rate": 4.9747454170558e-05, + "loss": 5.4967, + "step": 7615 + }, + { + "epoch": 0.045294509468074985, + "grad_norm": 1.657665491104126, + "learning_rate": 4.9747387940924064e-05, + "loss": 5.6437, + "step": 7616 + }, + { + "epoch": 0.04530045675135598, + "grad_norm": 1.7993237972259521, + "learning_rate": 4.974732170265107e-05, + "loss": 5.3094, + "step": 7617 + }, + { + "epoch": 0.045306404034636975, + "grad_norm": 1.8798805475234985, + "learning_rate": 4.974725545573904e-05, + "loss": 5.3268, + "step": 7618 + }, + { + "epoch": 0.04531235131791798, + "grad_norm": 1.9271420240402222, + "learning_rate": 4.974718920018799e-05, + "loss": 5.3405, + "step": 7619 + }, + { + "epoch": 0.04531829860119897, + "grad_norm": 1.9256294965744019, + "learning_rate": 4.9747122935997967e-05, + "loss": 5.3118, + "step": 7620 + }, + { + "epoch": 0.04532424588447997, + "grad_norm": 2.3345041275024414, + "learning_rate": 4.9747056663168965e-05, + "loss": 4.9813, + "step": 7621 + }, + { + "epoch": 0.04533019316776097, + "grad_norm": 1.7056258916854858, + "learning_rate": 4.974699038170103e-05, + "loss": 5.4725, + "step": 7622 + }, + { + "epoch": 0.045336140451041965, + "grad_norm": 2.075711250305176, + "learning_rate": 4.9746924091594174e-05, + "loss": 5.2215, + "step": 7623 + }, + { + "epoch": 0.04534208773432296, + "grad_norm": 1.818048357963562, + "learning_rate": 4.974685779284843e-05, + "loss": 5.0463, + "step": 7624 + }, + { + "epoch": 0.04534803501760396, + "grad_norm": 1.6590908765792847, + "learning_rate": 4.9746791485463806e-05, + "loss": 5.2476, + "step": 7625 + }, + { + "epoch": 0.04535398230088496, + "grad_norm": 2.2024991512298584, + "learning_rate": 4.974672516944033e-05, + "loss": 5.6437, + "step": 7626 + }, + { + "epoch": 0.04535992958416595, + "grad_norm": 1.71639883518219, + "learning_rate": 4.974665884477803e-05, + "loss": 5.2418, + "step": 7627 + }, + { + "epoch": 0.04536587686744695, + "grad_norm": 1.75436270236969, + "learning_rate": 4.974659251147693e-05, + "loss": 5.2209, + "step": 7628 + }, + { + "epoch": 0.04537182415072795, + "grad_norm": 2.577916383743286, + "learning_rate": 4.974652616953705e-05, + "loss": 5.2385, + "step": 7629 + }, + { + "epoch": 0.045377771434008944, + "grad_norm": 1.9784717559814453, + "learning_rate": 4.9746459818958416e-05, + "loss": 5.265, + "step": 7630 + }, + { + "epoch": 0.04538371871728994, + "grad_norm": 1.971383810043335, + "learning_rate": 4.974639345974104e-05, + "loss": 5.0548, + "step": 7631 + }, + { + "epoch": 0.04538966600057094, + "grad_norm": 2.096876621246338, + "learning_rate": 4.974632709188496e-05, + "loss": 5.1491, + "step": 7632 + }, + { + "epoch": 0.045395613283851936, + "grad_norm": 1.6079102754592896, + "learning_rate": 4.974626071539019e-05, + "loss": 5.1959, + "step": 7633 + }, + { + "epoch": 0.04540156056713293, + "grad_norm": 1.6881030797958374, + "learning_rate": 4.9746194330256755e-05, + "loss": 5.1772, + "step": 7634 + }, + { + "epoch": 0.04540750785041393, + "grad_norm": 1.7459675073623657, + "learning_rate": 4.974612793648469e-05, + "loss": 5.1885, + "step": 7635 + }, + { + "epoch": 0.04541345513369493, + "grad_norm": 1.739272117614746, + "learning_rate": 4.9746061534073993e-05, + "loss": 5.318, + "step": 7636 + }, + { + "epoch": 0.045419402416975924, + "grad_norm": 1.7761027812957764, + "learning_rate": 4.974599512302471e-05, + "loss": 5.1525, + "step": 7637 + }, + { + "epoch": 0.045425349700256926, + "grad_norm": 1.8695855140686035, + "learning_rate": 4.9745928703336854e-05, + "loss": 5.5754, + "step": 7638 + }, + { + "epoch": 0.04543129698353792, + "grad_norm": 1.8737404346466064, + "learning_rate": 4.9745862275010446e-05, + "loss": 5.2908, + "step": 7639 + }, + { + "epoch": 0.045437244266818916, + "grad_norm": 1.731676459312439, + "learning_rate": 4.9745795838045515e-05, + "loss": 5.2671, + "step": 7640 + }, + { + "epoch": 0.04544319155009991, + "grad_norm": 1.6687474250793457, + "learning_rate": 4.974572939244209e-05, + "loss": 5.1629, + "step": 7641 + }, + { + "epoch": 0.04544913883338091, + "grad_norm": 2.1376633644104004, + "learning_rate": 4.974566293820018e-05, + "loss": 5.2853, + "step": 7642 + }, + { + "epoch": 0.04545508611666191, + "grad_norm": 2.0989861488342285, + "learning_rate": 4.974559647531981e-05, + "loss": 5.1311, + "step": 7643 + }, + { + "epoch": 0.0454610333999429, + "grad_norm": 2.3433620929718018, + "learning_rate": 4.974553000380102e-05, + "loss": 4.9854, + "step": 7644 + }, + { + "epoch": 0.045466980683223905, + "grad_norm": 2.306170701980591, + "learning_rate": 4.974546352364381e-05, + "loss": 5.3152, + "step": 7645 + }, + { + "epoch": 0.0454729279665049, + "grad_norm": 1.9588537216186523, + "learning_rate": 4.974539703484822e-05, + "loss": 5.3903, + "step": 7646 + }, + { + "epoch": 0.045478875249785895, + "grad_norm": 1.7994736433029175, + "learning_rate": 4.9745330537414265e-05, + "loss": 5.2505, + "step": 7647 + }, + { + "epoch": 0.0454848225330669, + "grad_norm": 1.983175277709961, + "learning_rate": 4.974526403134197e-05, + "loss": 5.2607, + "step": 7648 + }, + { + "epoch": 0.04549076981634789, + "grad_norm": 1.8853832483291626, + "learning_rate": 4.974519751663136e-05, + "loss": 5.1475, + "step": 7649 + }, + { + "epoch": 0.04549671709962889, + "grad_norm": 1.9374700784683228, + "learning_rate": 4.9745130993282464e-05, + "loss": 5.2039, + "step": 7650 + }, + { + "epoch": 0.04550266438290989, + "grad_norm": 1.8200404644012451, + "learning_rate": 4.974506446129529e-05, + "loss": 5.2794, + "step": 7651 + }, + { + "epoch": 0.045508611666190885, + "grad_norm": 1.8375320434570312, + "learning_rate": 4.974499792066987e-05, + "loss": 5.1149, + "step": 7652 + }, + { + "epoch": 0.04551455894947188, + "grad_norm": 1.7842520475387573, + "learning_rate": 4.974493137140623e-05, + "loss": 5.0332, + "step": 7653 + }, + { + "epoch": 0.04552050623275288, + "grad_norm": 2.0220818519592285, + "learning_rate": 4.974486481350439e-05, + "loss": 5.0277, + "step": 7654 + }, + { + "epoch": 0.04552645351603388, + "grad_norm": 2.0787746906280518, + "learning_rate": 4.9744798246964375e-05, + "loss": 5.0587, + "step": 7655 + }, + { + "epoch": 0.04553240079931487, + "grad_norm": 1.7024985551834106, + "learning_rate": 4.97447316717862e-05, + "loss": 5.0184, + "step": 7656 + }, + { + "epoch": 0.04553834808259587, + "grad_norm": 1.9057540893554688, + "learning_rate": 4.97446650879699e-05, + "loss": 5.3945, + "step": 7657 + }, + { + "epoch": 0.04554429536587687, + "grad_norm": 1.7963287830352783, + "learning_rate": 4.974459849551549e-05, + "loss": 4.9869, + "step": 7658 + }, + { + "epoch": 0.045550242649157864, + "grad_norm": 2.027353286743164, + "learning_rate": 4.974453189442299e-05, + "loss": 5.1389, + "step": 7659 + }, + { + "epoch": 0.04555618993243886, + "grad_norm": 1.7137126922607422, + "learning_rate": 4.9744465284692445e-05, + "loss": 5.058, + "step": 7660 + }, + { + "epoch": 0.04556213721571986, + "grad_norm": 2.0363876819610596, + "learning_rate": 4.9744398666323854e-05, + "loss": 4.9174, + "step": 7661 + }, + { + "epoch": 0.045568084499000856, + "grad_norm": 2.1440837383270264, + "learning_rate": 4.9744332039317255e-05, + "loss": 4.8894, + "step": 7662 + }, + { + "epoch": 0.04557403178228185, + "grad_norm": 1.9582308530807495, + "learning_rate": 4.9744265403672655e-05, + "loss": 5.0666, + "step": 7663 + }, + { + "epoch": 0.04557997906556285, + "grad_norm": 1.9997116327285767, + "learning_rate": 4.97441987593901e-05, + "loss": 5.0804, + "step": 7664 + }, + { + "epoch": 0.04558592634884385, + "grad_norm": 2.067361831665039, + "learning_rate": 4.9744132106469586e-05, + "loss": 4.8655, + "step": 7665 + }, + { + "epoch": 0.045591873632124844, + "grad_norm": 1.7066930532455444, + "learning_rate": 4.9744065444911165e-05, + "loss": 4.792, + "step": 7666 + }, + { + "epoch": 0.045597820915405846, + "grad_norm": 1.8526182174682617, + "learning_rate": 4.974399877471484e-05, + "loss": 4.755, + "step": 7667 + }, + { + "epoch": 0.04560376819868684, + "grad_norm": 1.8744564056396484, + "learning_rate": 4.9743932095880644e-05, + "loss": 4.7732, + "step": 7668 + }, + { + "epoch": 0.045609715481967836, + "grad_norm": 1.849574327468872, + "learning_rate": 4.97438654084086e-05, + "loss": 4.7743, + "step": 7669 + }, + { + "epoch": 0.04561566276524884, + "grad_norm": 1.87284255027771, + "learning_rate": 4.9743798712298714e-05, + "loss": 5.0582, + "step": 7670 + }, + { + "epoch": 0.04562161004852983, + "grad_norm": 2.206273078918457, + "learning_rate": 4.974373200755104e-05, + "loss": 5.4683, + "step": 7671 + }, + { + "epoch": 0.04562755733181083, + "grad_norm": 1.9849058389663696, + "learning_rate": 4.974366529416557e-05, + "loss": 5.4087, + "step": 7672 + }, + { + "epoch": 0.04563350461509182, + "grad_norm": 1.9440083503723145, + "learning_rate": 4.974359857214235e-05, + "loss": 4.9607, + "step": 7673 + }, + { + "epoch": 0.045639451898372825, + "grad_norm": 1.7112319469451904, + "learning_rate": 4.974353184148139e-05, + "loss": 5.6589, + "step": 7674 + }, + { + "epoch": 0.04564539918165382, + "grad_norm": 1.921215295791626, + "learning_rate": 4.974346510218273e-05, + "loss": 5.4495, + "step": 7675 + }, + { + "epoch": 0.045651346464934815, + "grad_norm": 1.9582061767578125, + "learning_rate": 4.974339835424637e-05, + "loss": 5.2459, + "step": 7676 + }, + { + "epoch": 0.04565729374821582, + "grad_norm": 1.9781824350357056, + "learning_rate": 4.974333159767235e-05, + "loss": 5.3424, + "step": 7677 + }, + { + "epoch": 0.04566324103149681, + "grad_norm": 1.7183479070663452, + "learning_rate": 4.974326483246069e-05, + "loss": 5.3741, + "step": 7678 + }, + { + "epoch": 0.04566918831477781, + "grad_norm": 1.7942447662353516, + "learning_rate": 4.974319805861141e-05, + "loss": 5.4008, + "step": 7679 + }, + { + "epoch": 0.04567513559805881, + "grad_norm": 1.8255115747451782, + "learning_rate": 4.974313127612454e-05, + "loss": 5.1849, + "step": 7680 + }, + { + "epoch": 0.045681082881339805, + "grad_norm": 1.7907564640045166, + "learning_rate": 4.974306448500009e-05, + "loss": 5.1757, + "step": 7681 + }, + { + "epoch": 0.0456870301646208, + "grad_norm": 2.911489486694336, + "learning_rate": 4.97429976852381e-05, + "loss": 4.8909, + "step": 7682 + }, + { + "epoch": 0.0456929774479018, + "grad_norm": 2.849125623703003, + "learning_rate": 4.9742930876838576e-05, + "loss": 4.7733, + "step": 7683 + }, + { + "epoch": 0.0456989247311828, + "grad_norm": 2.4196949005126953, + "learning_rate": 4.9742864059801565e-05, + "loss": 4.8571, + "step": 7684 + }, + { + "epoch": 0.04570487201446379, + "grad_norm": 1.9430558681488037, + "learning_rate": 4.974279723412706e-05, + "loss": 5.1338, + "step": 7685 + }, + { + "epoch": 0.04571081929774479, + "grad_norm": 1.7538554668426514, + "learning_rate": 4.9742730399815105e-05, + "loss": 5.5524, + "step": 7686 + }, + { + "epoch": 0.04571676658102579, + "grad_norm": 2.006115198135376, + "learning_rate": 4.9742663556865724e-05, + "loss": 5.3343, + "step": 7687 + }, + { + "epoch": 0.045722713864306784, + "grad_norm": 2.554234027862549, + "learning_rate": 4.974259670527893e-05, + "loss": 5.8426, + "step": 7688 + }, + { + "epoch": 0.04572866114758778, + "grad_norm": 2.656747579574585, + "learning_rate": 4.974252984505475e-05, + "loss": 5.1578, + "step": 7689 + }, + { + "epoch": 0.04573460843086878, + "grad_norm": 2.800208568572998, + "learning_rate": 4.9742462976193216e-05, + "loss": 4.8019, + "step": 7690 + }, + { + "epoch": 0.045740555714149776, + "grad_norm": 2.674938201904297, + "learning_rate": 4.974239609869433e-05, + "loss": 4.7177, + "step": 7691 + }, + { + "epoch": 0.04574650299743077, + "grad_norm": 2.751533269882202, + "learning_rate": 4.974232921255815e-05, + "loss": 4.7568, + "step": 7692 + }, + { + "epoch": 0.04575245028071177, + "grad_norm": 2.623917818069458, + "learning_rate": 4.974226231778466e-05, + "loss": 4.5908, + "step": 7693 + }, + { + "epoch": 0.04575839756399277, + "grad_norm": 2.2248899936676025, + "learning_rate": 4.9742195414373904e-05, + "loss": 5.4066, + "step": 7694 + }, + { + "epoch": 0.045764344847273764, + "grad_norm": 1.7959388494491577, + "learning_rate": 4.974212850232591e-05, + "loss": 6.1414, + "step": 7695 + }, + { + "epoch": 0.045770292130554766, + "grad_norm": 2.0049352645874023, + "learning_rate": 4.974206158164069e-05, + "loss": 6.0106, + "step": 7696 + }, + { + "epoch": 0.04577623941383576, + "grad_norm": 2.4794270992279053, + "learning_rate": 4.9741994652318276e-05, + "loss": 5.8647, + "step": 7697 + }, + { + "epoch": 0.045782186697116756, + "grad_norm": 3.9380109310150146, + "learning_rate": 4.974192771435868e-05, + "loss": 5.719, + "step": 7698 + }, + { + "epoch": 0.04578813398039776, + "grad_norm": 2.564023017883301, + "learning_rate": 4.974186076776194e-05, + "loss": 4.7294, + "step": 7699 + }, + { + "epoch": 0.04579408126367875, + "grad_norm": 3.7082693576812744, + "learning_rate": 4.974179381252807e-05, + "loss": 5.1975, + "step": 7700 + }, + { + "epoch": 0.04580002854695975, + "grad_norm": 4.0067524909973145, + "learning_rate": 4.97417268486571e-05, + "loss": 5.4047, + "step": 7701 + }, + { + "epoch": 0.04580597583024074, + "grad_norm": 3.978787660598755, + "learning_rate": 4.974165987614904e-05, + "loss": 5.7023, + "step": 7702 + }, + { + "epoch": 0.045811923113521745, + "grad_norm": 4.597605228424072, + "learning_rate": 4.974159289500392e-05, + "loss": 6.5186, + "step": 7703 + }, + { + "epoch": 0.04581787039680274, + "grad_norm": 2.8793985843658447, + "learning_rate": 4.974152590522177e-05, + "loss": 6.1476, + "step": 7704 + }, + { + "epoch": 0.045823817680083735, + "grad_norm": 2.466089963912964, + "learning_rate": 4.974145890680262e-05, + "loss": 5.5154, + "step": 7705 + }, + { + "epoch": 0.04582976496336474, + "grad_norm": 2.937228202819824, + "learning_rate": 4.974139189974647e-05, + "loss": 5.5146, + "step": 7706 + }, + { + "epoch": 0.04583571224664573, + "grad_norm": 2.4580399990081787, + "learning_rate": 4.974132488405336e-05, + "loss": 6.214, + "step": 7707 + }, + { + "epoch": 0.04584165952992673, + "grad_norm": 4.910717010498047, + "learning_rate": 4.97412578597233e-05, + "loss": 5.819, + "step": 7708 + }, + { + "epoch": 0.04584760681320773, + "grad_norm": 5.372139930725098, + "learning_rate": 4.974119082675634e-05, + "loss": 5.3242, + "step": 7709 + }, + { + "epoch": 0.045853554096488724, + "grad_norm": 2.050492525100708, + "learning_rate": 4.9741123785152474e-05, + "loss": 6.0468, + "step": 7710 + }, + { + "epoch": 0.04585950137976972, + "grad_norm": 1.7090541124343872, + "learning_rate": 4.974105673491174e-05, + "loss": 5.7652, + "step": 7711 + }, + { + "epoch": 0.04586544866305072, + "grad_norm": 2.512538194656372, + "learning_rate": 4.974098967603415e-05, + "loss": 5.3184, + "step": 7712 + }, + { + "epoch": 0.04587139594633172, + "grad_norm": 3.311289072036743, + "learning_rate": 4.974092260851975e-05, + "loss": 5.5379, + "step": 7713 + }, + { + "epoch": 0.04587734322961271, + "grad_norm": 3.3318710327148438, + "learning_rate": 4.974085553236854e-05, + "loss": 5.5543, + "step": 7714 + }, + { + "epoch": 0.04588329051289371, + "grad_norm": 2.6384379863739014, + "learning_rate": 4.9740788447580555e-05, + "loss": 6.3475, + "step": 7715 + }, + { + "epoch": 0.04588923779617471, + "grad_norm": 2.0066304206848145, + "learning_rate": 4.974072135415582e-05, + "loss": 6.3685, + "step": 7716 + }, + { + "epoch": 0.045895185079455704, + "grad_norm": 2.4189116954803467, + "learning_rate": 4.9740654252094356e-05, + "loss": 5.4128, + "step": 7717 + }, + { + "epoch": 0.0459011323627367, + "grad_norm": 2.431011438369751, + "learning_rate": 4.974058714139618e-05, + "loss": 5.34, + "step": 7718 + }, + { + "epoch": 0.0459070796460177, + "grad_norm": 2.1997156143188477, + "learning_rate": 4.974052002206132e-05, + "loss": 5.4223, + "step": 7719 + }, + { + "epoch": 0.045913026929298696, + "grad_norm": 2.0700082778930664, + "learning_rate": 4.9740452894089806e-05, + "loss": 5.4255, + "step": 7720 + }, + { + "epoch": 0.04591897421257969, + "grad_norm": 2.3476040363311768, + "learning_rate": 4.974038575748165e-05, + "loss": 5.5055, + "step": 7721 + }, + { + "epoch": 0.04592492149586069, + "grad_norm": 4.2995524406433105, + "learning_rate": 4.974031861223688e-05, + "loss": 5.8869, + "step": 7722 + }, + { + "epoch": 0.04593086877914169, + "grad_norm": 4.690639495849609, + "learning_rate": 4.974025145835552e-05, + "loss": 6.0808, + "step": 7723 + }, + { + "epoch": 0.04593681606242268, + "grad_norm": 3.9823479652404785, + "learning_rate": 4.97401842958376e-05, + "loss": 6.0844, + "step": 7724 + }, + { + "epoch": 0.045942763345703685, + "grad_norm": 3.69808030128479, + "learning_rate": 4.9740117124683136e-05, + "loss": 5.9611, + "step": 7725 + }, + { + "epoch": 0.04594871062898468, + "grad_norm": 2.5912535190582275, + "learning_rate": 4.974004994489215e-05, + "loss": 5.9669, + "step": 7726 + }, + { + "epoch": 0.045954657912265676, + "grad_norm": 2.0894482135772705, + "learning_rate": 4.973998275646467e-05, + "loss": 5.6717, + "step": 7727 + }, + { + "epoch": 0.04596060519554668, + "grad_norm": 2.179302930831909, + "learning_rate": 4.973991555940072e-05, + "loss": 5.4077, + "step": 7728 + }, + { + "epoch": 0.04596655247882767, + "grad_norm": 2.4919214248657227, + "learning_rate": 4.973984835370031e-05, + "loss": 6.118, + "step": 7729 + }, + { + "epoch": 0.04597249976210867, + "grad_norm": 3.5036723613739014, + "learning_rate": 4.9739781139363485e-05, + "loss": 5.436, + "step": 7730 + }, + { + "epoch": 0.04597844704538966, + "grad_norm": 4.129561424255371, + "learning_rate": 4.973971391639026e-05, + "loss": 4.8414, + "step": 7731 + }, + { + "epoch": 0.045984394328670665, + "grad_norm": 2.867039203643799, + "learning_rate": 4.973964668478065e-05, + "loss": 4.7385, + "step": 7732 + }, + { + "epoch": 0.04599034161195166, + "grad_norm": 2.754023313522339, + "learning_rate": 4.973957944453469e-05, + "loss": 4.6063, + "step": 7733 + }, + { + "epoch": 0.045996288895232655, + "grad_norm": 2.1025235652923584, + "learning_rate": 4.973951219565239e-05, + "loss": 5.3233, + "step": 7734 + }, + { + "epoch": 0.04600223617851366, + "grad_norm": 2.352883815765381, + "learning_rate": 4.973944493813379e-05, + "loss": 5.5648, + "step": 7735 + }, + { + "epoch": 0.04600818346179465, + "grad_norm": 2.049377679824829, + "learning_rate": 4.97393776719789e-05, + "loss": 6.1241, + "step": 7736 + }, + { + "epoch": 0.04601413074507565, + "grad_norm": 1.7124110460281372, + "learning_rate": 4.9739310397187756e-05, + "loss": 6.1258, + "step": 7737 + }, + { + "epoch": 0.04602007802835665, + "grad_norm": 2.2592861652374268, + "learning_rate": 4.9739243113760364e-05, + "loss": 6.1972, + "step": 7738 + }, + { + "epoch": 0.046026025311637644, + "grad_norm": 2.3926188945770264, + "learning_rate": 4.973917582169677e-05, + "loss": 6.1681, + "step": 7739 + }, + { + "epoch": 0.04603197259491864, + "grad_norm": 1.9956084489822388, + "learning_rate": 4.973910852099698e-05, + "loss": 6.2068, + "step": 7740 + }, + { + "epoch": 0.04603791987819964, + "grad_norm": 1.924467921257019, + "learning_rate": 4.973904121166102e-05, + "loss": 6.4391, + "step": 7741 + }, + { + "epoch": 0.04604386716148064, + "grad_norm": 1.9410041570663452, + "learning_rate": 4.973897389368891e-05, + "loss": 5.9378, + "step": 7742 + }, + { + "epoch": 0.04604981444476163, + "grad_norm": 2.0418617725372314, + "learning_rate": 4.9738906567080686e-05, + "loss": 5.8823, + "step": 7743 + }, + { + "epoch": 0.04605576172804263, + "grad_norm": 2.696143627166748, + "learning_rate": 4.973883923183637e-05, + "loss": 5.8551, + "step": 7744 + }, + { + "epoch": 0.04606170901132363, + "grad_norm": 2.482703447341919, + "learning_rate": 4.973877188795598e-05, + "loss": 5.5752, + "step": 7745 + }, + { + "epoch": 0.046067656294604624, + "grad_norm": 2.520437240600586, + "learning_rate": 4.973870453543954e-05, + "loss": 5.571, + "step": 7746 + }, + { + "epoch": 0.04607360357788562, + "grad_norm": 2.568150758743286, + "learning_rate": 4.973863717428707e-05, + "loss": 5.9145, + "step": 7747 + }, + { + "epoch": 0.04607955086116662, + "grad_norm": 2.6373183727264404, + "learning_rate": 4.9738569804498605e-05, + "loss": 5.9414, + "step": 7748 + }, + { + "epoch": 0.046085498144447616, + "grad_norm": 2.1663565635681152, + "learning_rate": 4.973850242607415e-05, + "loss": 6.2316, + "step": 7749 + }, + { + "epoch": 0.04609144542772861, + "grad_norm": 2.044316053390503, + "learning_rate": 4.973843503901374e-05, + "loss": 5.7232, + "step": 7750 + }, + { + "epoch": 0.04609739271100961, + "grad_norm": 2.1740782260894775, + "learning_rate": 4.9738367643317405e-05, + "loss": 6.0388, + "step": 7751 + }, + { + "epoch": 0.04610333999429061, + "grad_norm": 2.0643458366394043, + "learning_rate": 4.973830023898516e-05, + "loss": 5.8201, + "step": 7752 + }, + { + "epoch": 0.0461092872775716, + "grad_norm": 1.7433217763900757, + "learning_rate": 4.973823282601703e-05, + "loss": 6.0464, + "step": 7753 + }, + { + "epoch": 0.046115234560852605, + "grad_norm": 2.657677412033081, + "learning_rate": 4.9738165404413037e-05, + "loss": 5.2849, + "step": 7754 + }, + { + "epoch": 0.0461211818441336, + "grad_norm": 1.7317034006118774, + "learning_rate": 4.9738097974173205e-05, + "loss": 6.0619, + "step": 7755 + }, + { + "epoch": 0.046127129127414596, + "grad_norm": 1.6109949350357056, + "learning_rate": 4.973803053529756e-05, + "loss": 5.7832, + "step": 7756 + }, + { + "epoch": 0.0461330764106956, + "grad_norm": 2.2980475425720215, + "learning_rate": 4.9737963087786125e-05, + "loss": 5.4346, + "step": 7757 + }, + { + "epoch": 0.04613902369397659, + "grad_norm": 2.5162737369537354, + "learning_rate": 4.973789563163892e-05, + "loss": 5.3723, + "step": 7758 + }, + { + "epoch": 0.04614497097725759, + "grad_norm": 2.3493261337280273, + "learning_rate": 4.973782816685597e-05, + "loss": 5.7474, + "step": 7759 + }, + { + "epoch": 0.04615091826053858, + "grad_norm": 2.1428544521331787, + "learning_rate": 4.9737760693437306e-05, + "loss": 5.6318, + "step": 7760 + }, + { + "epoch": 0.046156865543819585, + "grad_norm": 2.11627197265625, + "learning_rate": 4.973769321138294e-05, + "loss": 5.38, + "step": 7761 + }, + { + "epoch": 0.04616281282710058, + "grad_norm": 2.411957263946533, + "learning_rate": 4.9737625720692906e-05, + "loss": 5.1822, + "step": 7762 + }, + { + "epoch": 0.046168760110381575, + "grad_norm": 2.3566222190856934, + "learning_rate": 4.973755822136722e-05, + "loss": 5.0405, + "step": 7763 + }, + { + "epoch": 0.04617470739366258, + "grad_norm": 2.2235679626464844, + "learning_rate": 4.973749071340591e-05, + "loss": 5.4746, + "step": 7764 + }, + { + "epoch": 0.04618065467694357, + "grad_norm": 2.4175586700439453, + "learning_rate": 4.973742319680899e-05, + "loss": 5.7519, + "step": 7765 + }, + { + "epoch": 0.04618660196022457, + "grad_norm": 2.3386452198028564, + "learning_rate": 4.9737355671576496e-05, + "loss": 6.1765, + "step": 7766 + }, + { + "epoch": 0.04619254924350557, + "grad_norm": 2.084333658218384, + "learning_rate": 4.973728813770845e-05, + "loss": 6.1439, + "step": 7767 + }, + { + "epoch": 0.046198496526786564, + "grad_norm": 2.0523531436920166, + "learning_rate": 4.973722059520487e-05, + "loss": 6.294, + "step": 7768 + }, + { + "epoch": 0.04620444381006756, + "grad_norm": 2.1187572479248047, + "learning_rate": 4.973715304406578e-05, + "loss": 5.3679, + "step": 7769 + }, + { + "epoch": 0.04621039109334856, + "grad_norm": 2.5249836444854736, + "learning_rate": 4.9737085484291204e-05, + "loss": 5.9086, + "step": 7770 + }, + { + "epoch": 0.04621633837662956, + "grad_norm": 2.35662841796875, + "learning_rate": 4.973701791588117e-05, + "loss": 6.3135, + "step": 7771 + }, + { + "epoch": 0.04622228565991055, + "grad_norm": 2.070955276489258, + "learning_rate": 4.9736950338835695e-05, + "loss": 5.8748, + "step": 7772 + }, + { + "epoch": 0.04622823294319155, + "grad_norm": 2.151587963104248, + "learning_rate": 4.9736882753154814e-05, + "loss": 6.2053, + "step": 7773 + }, + { + "epoch": 0.04623418022647255, + "grad_norm": 2.2187843322753906, + "learning_rate": 4.9736815158838534e-05, + "loss": 5.762, + "step": 7774 + }, + { + "epoch": 0.046240127509753544, + "grad_norm": 1.8676223754882812, + "learning_rate": 4.973674755588689e-05, + "loss": 6.06, + "step": 7775 + }, + { + "epoch": 0.04624607479303454, + "grad_norm": 2.2110252380371094, + "learning_rate": 4.9736679944299906e-05, + "loss": 5.6474, + "step": 7776 + }, + { + "epoch": 0.04625202207631554, + "grad_norm": 2.0635151863098145, + "learning_rate": 4.9736612324077605e-05, + "loss": 5.5579, + "step": 7777 + }, + { + "epoch": 0.046257969359596536, + "grad_norm": 2.1654598712921143, + "learning_rate": 4.973654469522e-05, + "loss": 5.5388, + "step": 7778 + }, + { + "epoch": 0.04626391664287753, + "grad_norm": 2.3735673427581787, + "learning_rate": 4.973647705772713e-05, + "loss": 5.4383, + "step": 7779 + }, + { + "epoch": 0.04626986392615853, + "grad_norm": 2.344160318374634, + "learning_rate": 4.9736409411599e-05, + "loss": 5.6501, + "step": 7780 + }, + { + "epoch": 0.04627581120943953, + "grad_norm": 3.023350477218628, + "learning_rate": 4.973634175683566e-05, + "loss": 5.2688, + "step": 7781 + }, + { + "epoch": 0.04628175849272052, + "grad_norm": 2.8814494609832764, + "learning_rate": 4.973627409343711e-05, + "loss": 5.08, + "step": 7782 + }, + { + "epoch": 0.046287705776001525, + "grad_norm": 2.475191831588745, + "learning_rate": 4.973620642140339e-05, + "loss": 5.0761, + "step": 7783 + }, + { + "epoch": 0.04629365305928252, + "grad_norm": 2.5567755699157715, + "learning_rate": 4.9736138740734504e-05, + "loss": 5.46, + "step": 7784 + }, + { + "epoch": 0.046299600342563516, + "grad_norm": 2.9225175380706787, + "learning_rate": 4.973607105143049e-05, + "loss": 5.5219, + "step": 7785 + }, + { + "epoch": 0.04630554762584452, + "grad_norm": 2.3112781047821045, + "learning_rate": 4.973600335349138e-05, + "loss": 6.4204, + "step": 7786 + }, + { + "epoch": 0.04631149490912551, + "grad_norm": 2.228182554244995, + "learning_rate": 4.973593564691717e-05, + "loss": 6.3299, + "step": 7787 + }, + { + "epoch": 0.04631744219240651, + "grad_norm": 1.8612277507781982, + "learning_rate": 4.973586793170792e-05, + "loss": 5.994, + "step": 7788 + }, + { + "epoch": 0.0463233894756875, + "grad_norm": 1.9788155555725098, + "learning_rate": 4.9735800207863626e-05, + "loss": 6.1676, + "step": 7789 + }, + { + "epoch": 0.046329336758968505, + "grad_norm": 2.2335264682769775, + "learning_rate": 4.973573247538431e-05, + "loss": 6.3112, + "step": 7790 + }, + { + "epoch": 0.0463352840422495, + "grad_norm": 2.168656349182129, + "learning_rate": 4.973566473427001e-05, + "loss": 5.8326, + "step": 7791 + }, + { + "epoch": 0.046341231325530495, + "grad_norm": 1.9187591075897217, + "learning_rate": 4.9735596984520755e-05, + "loss": 5.8734, + "step": 7792 + }, + { + "epoch": 0.0463471786088115, + "grad_norm": 2.195242166519165, + "learning_rate": 4.973552922613655e-05, + "loss": 6.1325, + "step": 7793 + }, + { + "epoch": 0.04635312589209249, + "grad_norm": 1.9698888063430786, + "learning_rate": 4.973546145911743e-05, + "loss": 5.8586, + "step": 7794 + }, + { + "epoch": 0.04635907317537349, + "grad_norm": 2.2149972915649414, + "learning_rate": 4.973539368346342e-05, + "loss": 5.4087, + "step": 7795 + }, + { + "epoch": 0.04636502045865449, + "grad_norm": 1.8587820529937744, + "learning_rate": 4.973532589917453e-05, + "loss": 5.9956, + "step": 7796 + }, + { + "epoch": 0.046370967741935484, + "grad_norm": 2.022866725921631, + "learning_rate": 4.97352581062508e-05, + "loss": 6.0905, + "step": 7797 + }, + { + "epoch": 0.04637691502521648, + "grad_norm": 2.0257678031921387, + "learning_rate": 4.973519030469225e-05, + "loss": 6.02, + "step": 7798 + }, + { + "epoch": 0.04638286230849748, + "grad_norm": 1.6909089088439941, + "learning_rate": 4.973512249449889e-05, + "loss": 5.727, + "step": 7799 + }, + { + "epoch": 0.046388809591778477, + "grad_norm": 1.8882997035980225, + "learning_rate": 4.9735054675670754e-05, + "loss": 5.655, + "step": 7800 + }, + { + "epoch": 0.04639475687505947, + "grad_norm": 2.1775193214416504, + "learning_rate": 4.9734986848207876e-05, + "loss": 5.8067, + "step": 7801 + }, + { + "epoch": 0.04640070415834047, + "grad_norm": 2.136690139770508, + "learning_rate": 4.973491901211027e-05, + "loss": 5.5515, + "step": 7802 + }, + { + "epoch": 0.04640665144162147, + "grad_norm": 1.8036144971847534, + "learning_rate": 4.973485116737795e-05, + "loss": 5.8404, + "step": 7803 + }, + { + "epoch": 0.046412598724902464, + "grad_norm": 2.1350481510162354, + "learning_rate": 4.973478331401096e-05, + "loss": 6.1635, + "step": 7804 + }, + { + "epoch": 0.04641854600818346, + "grad_norm": 2.4152462482452393, + "learning_rate": 4.97347154520093e-05, + "loss": 5.9882, + "step": 7805 + }, + { + "epoch": 0.04642449329146446, + "grad_norm": 2.166402578353882, + "learning_rate": 4.9734647581373015e-05, + "loss": 5.8982, + "step": 7806 + }, + { + "epoch": 0.046430440574745456, + "grad_norm": 1.8684437274932861, + "learning_rate": 4.973457970210211e-05, + "loss": 5.9501, + "step": 7807 + }, + { + "epoch": 0.04643638785802645, + "grad_norm": 1.775829792022705, + "learning_rate": 4.973451181419663e-05, + "loss": 5.83, + "step": 7808 + }, + { + "epoch": 0.04644233514130745, + "grad_norm": 1.7500759363174438, + "learning_rate": 4.973444391765659e-05, + "loss": 6.0084, + "step": 7809 + }, + { + "epoch": 0.04644828242458845, + "grad_norm": 2.3920938968658447, + "learning_rate": 4.9734376012482e-05, + "loss": 5.559, + "step": 7810 + }, + { + "epoch": 0.04645422970786944, + "grad_norm": 2.7680983543395996, + "learning_rate": 4.97343080986729e-05, + "loss": 5.3521, + "step": 7811 + }, + { + "epoch": 0.046460176991150445, + "grad_norm": 2.6618781089782715, + "learning_rate": 4.9734240176229316e-05, + "loss": 5.6917, + "step": 7812 + }, + { + "epoch": 0.04646612427443144, + "grad_norm": 2.086775541305542, + "learning_rate": 4.9734172245151256e-05, + "loss": 5.582, + "step": 7813 + }, + { + "epoch": 0.046472071557712435, + "grad_norm": 2.190012216567993, + "learning_rate": 4.973410430543875e-05, + "loss": 5.9132, + "step": 7814 + }, + { + "epoch": 0.04647801884099344, + "grad_norm": 2.317610740661621, + "learning_rate": 4.973403635709183e-05, + "loss": 5.7055, + "step": 7815 + }, + { + "epoch": 0.04648396612427443, + "grad_norm": 2.1291167736053467, + "learning_rate": 4.973396840011051e-05, + "loss": 5.6711, + "step": 7816 + }, + { + "epoch": 0.04648991340755543, + "grad_norm": 1.5421113967895508, + "learning_rate": 4.9733900434494815e-05, + "loss": 5.6433, + "step": 7817 + }, + { + "epoch": 0.04649586069083642, + "grad_norm": 2.222355604171753, + "learning_rate": 4.973383246024477e-05, + "loss": 5.3685, + "step": 7818 + }, + { + "epoch": 0.046501807974117425, + "grad_norm": 2.097116708755493, + "learning_rate": 4.97337644773604e-05, + "loss": 5.6528, + "step": 7819 + }, + { + "epoch": 0.04650775525739842, + "grad_norm": 2.0224382877349854, + "learning_rate": 4.973369648584174e-05, + "loss": 5.8849, + "step": 7820 + }, + { + "epoch": 0.046513702540679415, + "grad_norm": 2.1581428050994873, + "learning_rate": 4.973362848568879e-05, + "loss": 5.985, + "step": 7821 + }, + { + "epoch": 0.04651964982396042, + "grad_norm": 2.43945574760437, + "learning_rate": 4.9733560476901584e-05, + "loss": 5.5682, + "step": 7822 + }, + { + "epoch": 0.04652559710724141, + "grad_norm": 3.174143075942993, + "learning_rate": 4.9733492459480157e-05, + "loss": 4.832, + "step": 7823 + }, + { + "epoch": 0.04653154439052241, + "grad_norm": 2.269339084625244, + "learning_rate": 4.973342443342452e-05, + "loss": 5.5804, + "step": 7824 + }, + { + "epoch": 0.04653749167380341, + "grad_norm": 2.3775289058685303, + "learning_rate": 4.9733356398734695e-05, + "loss": 5.8299, + "step": 7825 + }, + { + "epoch": 0.046543438957084404, + "grad_norm": 2.065579414367676, + "learning_rate": 4.9733288355410716e-05, + "loss": 5.6985, + "step": 7826 + }, + { + "epoch": 0.0465493862403654, + "grad_norm": 1.9699875116348267, + "learning_rate": 4.9733220303452604e-05, + "loss": 6.0161, + "step": 7827 + }, + { + "epoch": 0.0465553335236464, + "grad_norm": 2.1414806842803955, + "learning_rate": 4.9733152242860374e-05, + "loss": 6.2534, + "step": 7828 + }, + { + "epoch": 0.046561280806927396, + "grad_norm": 2.414738416671753, + "learning_rate": 4.973308417363406e-05, + "loss": 5.8402, + "step": 7829 + }, + { + "epoch": 0.04656722809020839, + "grad_norm": 2.4105031490325928, + "learning_rate": 4.973301609577368e-05, + "loss": 5.8728, + "step": 7830 + }, + { + "epoch": 0.04657317537348939, + "grad_norm": 2.7718660831451416, + "learning_rate": 4.9732948009279264e-05, + "loss": 5.637, + "step": 7831 + }, + { + "epoch": 0.04657912265677039, + "grad_norm": 2.205103874206543, + "learning_rate": 4.9732879914150824e-05, + "loss": 5.4119, + "step": 7832 + }, + { + "epoch": 0.046585069940051384, + "grad_norm": 1.9080390930175781, + "learning_rate": 4.9732811810388394e-05, + "loss": 5.3387, + "step": 7833 + }, + { + "epoch": 0.04659101722333238, + "grad_norm": 1.6600725650787354, + "learning_rate": 4.9732743697992e-05, + "loss": 5.3192, + "step": 7834 + }, + { + "epoch": 0.04659696450661338, + "grad_norm": 1.9428787231445312, + "learning_rate": 4.973267557696165e-05, + "loss": 5.3127, + "step": 7835 + }, + { + "epoch": 0.046602911789894376, + "grad_norm": 2.174811840057373, + "learning_rate": 4.973260744729738e-05, + "loss": 5.7181, + "step": 7836 + }, + { + "epoch": 0.04660885907317537, + "grad_norm": 2.5420422554016113, + "learning_rate": 4.9732539308999224e-05, + "loss": 5.934, + "step": 7837 + }, + { + "epoch": 0.04661480635645637, + "grad_norm": 2.079343795776367, + "learning_rate": 4.973247116206719e-05, + "loss": 5.236, + "step": 7838 + }, + { + "epoch": 0.04662075363973737, + "grad_norm": 1.7748003005981445, + "learning_rate": 4.97324030065013e-05, + "loss": 5.2929, + "step": 7839 + }, + { + "epoch": 0.04662670092301836, + "grad_norm": 2.2746875286102295, + "learning_rate": 4.973233484230159e-05, + "loss": 5.182, + "step": 7840 + }, + { + "epoch": 0.046632648206299365, + "grad_norm": 1.7846394777297974, + "learning_rate": 4.9732266669468074e-05, + "loss": 5.2682, + "step": 7841 + }, + { + "epoch": 0.04663859548958036, + "grad_norm": 2.078132152557373, + "learning_rate": 4.973219848800078e-05, + "loss": 5.3245, + "step": 7842 + }, + { + "epoch": 0.046644542772861355, + "grad_norm": 1.7784876823425293, + "learning_rate": 4.9732130297899726e-05, + "loss": 5.4582, + "step": 7843 + }, + { + "epoch": 0.04665049005614236, + "grad_norm": 1.8421920537948608, + "learning_rate": 4.973206209916495e-05, + "loss": 5.3504, + "step": 7844 + }, + { + "epoch": 0.04665643733942335, + "grad_norm": 1.9958820343017578, + "learning_rate": 4.9731993891796455e-05, + "loss": 5.2914, + "step": 7845 + }, + { + "epoch": 0.04666238462270435, + "grad_norm": 2.0615813732147217, + "learning_rate": 4.9731925675794286e-05, + "loss": 5.3318, + "step": 7846 + }, + { + "epoch": 0.04666833190598534, + "grad_norm": 1.7690422534942627, + "learning_rate": 4.973185745115846e-05, + "loss": 5.3169, + "step": 7847 + }, + { + "epoch": 0.046674279189266345, + "grad_norm": 1.7990578413009644, + "learning_rate": 4.9731789217888994e-05, + "loss": 5.3136, + "step": 7848 + }, + { + "epoch": 0.04668022647254734, + "grad_norm": 2.0028672218322754, + "learning_rate": 4.9731720975985905e-05, + "loss": 5.2115, + "step": 7849 + }, + { + "epoch": 0.046686173755828335, + "grad_norm": 2.0703940391540527, + "learning_rate": 4.973165272544924e-05, + "loss": 5.2439, + "step": 7850 + }, + { + "epoch": 0.04669212103910934, + "grad_norm": 2.1105704307556152, + "learning_rate": 4.973158446627901e-05, + "loss": 5.5812, + "step": 7851 + }, + { + "epoch": 0.04669806832239033, + "grad_norm": 1.7391036748886108, + "learning_rate": 4.9731516198475236e-05, + "loss": 5.229, + "step": 7852 + }, + { + "epoch": 0.04670401560567133, + "grad_norm": 1.6907505989074707, + "learning_rate": 4.973144792203795e-05, + "loss": 5.2674, + "step": 7853 + }, + { + "epoch": 0.04670996288895233, + "grad_norm": 1.608168125152588, + "learning_rate": 4.973137963696717e-05, + "loss": 5.389, + "step": 7854 + }, + { + "epoch": 0.046715910172233324, + "grad_norm": 1.7521610260009766, + "learning_rate": 4.9731311343262913e-05, + "loss": 5.2436, + "step": 7855 + }, + { + "epoch": 0.04672185745551432, + "grad_norm": 2.0182595252990723, + "learning_rate": 4.973124304092522e-05, + "loss": 5.2746, + "step": 7856 + }, + { + "epoch": 0.04672780473879532, + "grad_norm": 1.7990871667861938, + "learning_rate": 4.97311747299541e-05, + "loss": 5.4241, + "step": 7857 + }, + { + "epoch": 0.046733752022076316, + "grad_norm": 2.124717950820923, + "learning_rate": 4.973110641034958e-05, + "loss": 5.5133, + "step": 7858 + }, + { + "epoch": 0.04673969930535731, + "grad_norm": 2.066869020462036, + "learning_rate": 4.973103808211169e-05, + "loss": 5.252, + "step": 7859 + }, + { + "epoch": 0.04674564658863831, + "grad_norm": 1.8004878759384155, + "learning_rate": 4.9730969745240455e-05, + "loss": 5.483, + "step": 7860 + }, + { + "epoch": 0.04675159387191931, + "grad_norm": 1.6822713613510132, + "learning_rate": 4.9730901399735886e-05, + "loss": 5.3916, + "step": 7861 + }, + { + "epoch": 0.046757541155200304, + "grad_norm": 1.7024493217468262, + "learning_rate": 4.973083304559802e-05, + "loss": 5.3504, + "step": 7862 + }, + { + "epoch": 0.0467634884384813, + "grad_norm": 1.5939997434616089, + "learning_rate": 4.973076468282687e-05, + "loss": 5.4151, + "step": 7863 + }, + { + "epoch": 0.0467694357217623, + "grad_norm": 1.7603535652160645, + "learning_rate": 4.9730696311422475e-05, + "loss": 5.351, + "step": 7864 + }, + { + "epoch": 0.046775383005043296, + "grad_norm": 1.737897276878357, + "learning_rate": 4.973062793138484e-05, + "loss": 5.0834, + "step": 7865 + }, + { + "epoch": 0.04678133028832429, + "grad_norm": 2.4130520820617676, + "learning_rate": 4.973055954271401e-05, + "loss": 4.833, + "step": 7866 + }, + { + "epoch": 0.04678727757160529, + "grad_norm": 1.9712201356887817, + "learning_rate": 4.9730491145409987e-05, + "loss": 5.0048, + "step": 7867 + }, + { + "epoch": 0.04679322485488629, + "grad_norm": 1.808608055114746, + "learning_rate": 4.97304227394728e-05, + "loss": 5.3134, + "step": 7868 + }, + { + "epoch": 0.04679917213816728, + "grad_norm": 1.8121775388717651, + "learning_rate": 4.973035432490249e-05, + "loss": 5.2594, + "step": 7869 + }, + { + "epoch": 0.046805119421448285, + "grad_norm": 1.7191296815872192, + "learning_rate": 4.9730285901699064e-05, + "loss": 5.206, + "step": 7870 + }, + { + "epoch": 0.04681106670472928, + "grad_norm": 1.931894063949585, + "learning_rate": 4.973021746986255e-05, + "loss": 5.3349, + "step": 7871 + }, + { + "epoch": 0.046817013988010275, + "grad_norm": 2.5420172214508057, + "learning_rate": 4.973014902939297e-05, + "loss": 5.2894, + "step": 7872 + }, + { + "epoch": 0.04682296127129128, + "grad_norm": 2.5522336959838867, + "learning_rate": 4.973008058029036e-05, + "loss": 5.2144, + "step": 7873 + }, + { + "epoch": 0.04682890855457227, + "grad_norm": 3.1389801502227783, + "learning_rate": 4.973001212255472e-05, + "loss": 5.7229, + "step": 7874 + }, + { + "epoch": 0.04683485583785327, + "grad_norm": 1.8687554597854614, + "learning_rate": 4.97299436561861e-05, + "loss": 5.483, + "step": 7875 + }, + { + "epoch": 0.04684080312113426, + "grad_norm": 2.2526602745056152, + "learning_rate": 4.972987518118451e-05, + "loss": 5.4562, + "step": 7876 + }, + { + "epoch": 0.046846750404415265, + "grad_norm": 2.108677625656128, + "learning_rate": 4.972980669754997e-05, + "loss": 5.2005, + "step": 7877 + }, + { + "epoch": 0.04685269768769626, + "grad_norm": 2.023118019104004, + "learning_rate": 4.972973820528252e-05, + "loss": 5.3674, + "step": 7878 + }, + { + "epoch": 0.046858644970977255, + "grad_norm": 1.6553964614868164, + "learning_rate": 4.9729669704382165e-05, + "loss": 5.3256, + "step": 7879 + }, + { + "epoch": 0.04686459225425826, + "grad_norm": 1.8197314739227295, + "learning_rate": 4.972960119484894e-05, + "loss": 5.1738, + "step": 7880 + }, + { + "epoch": 0.04687053953753925, + "grad_norm": 1.6142289638519287, + "learning_rate": 4.972953267668287e-05, + "loss": 5.245, + "step": 7881 + }, + { + "epoch": 0.04687648682082025, + "grad_norm": 1.4962797164916992, + "learning_rate": 4.972946414988398e-05, + "loss": 5.3121, + "step": 7882 + }, + { + "epoch": 0.04688243410410125, + "grad_norm": 1.487801432609558, + "learning_rate": 4.972939561445228e-05, + "loss": 5.1828, + "step": 7883 + }, + { + "epoch": 0.046888381387382244, + "grad_norm": 1.9139772653579712, + "learning_rate": 4.972932707038781e-05, + "loss": 5.2432, + "step": 7884 + }, + { + "epoch": 0.04689432867066324, + "grad_norm": 1.7533615827560425, + "learning_rate": 4.972925851769058e-05, + "loss": 5.6451, + "step": 7885 + }, + { + "epoch": 0.04690027595394424, + "grad_norm": 1.8561608791351318, + "learning_rate": 4.972918995636062e-05, + "loss": 5.4293, + "step": 7886 + }, + { + "epoch": 0.046906223237225236, + "grad_norm": 1.6891844272613525, + "learning_rate": 4.972912138639797e-05, + "loss": 5.2736, + "step": 7887 + }, + { + "epoch": 0.04691217052050623, + "grad_norm": 1.9279890060424805, + "learning_rate": 4.972905280780262e-05, + "loss": 5.5733, + "step": 7888 + }, + { + "epoch": 0.04691811780378723, + "grad_norm": 1.7810181379318237, + "learning_rate": 4.9728984220574624e-05, + "loss": 5.2036, + "step": 7889 + }, + { + "epoch": 0.04692406508706823, + "grad_norm": 1.6455233097076416, + "learning_rate": 4.9728915624714004e-05, + "loss": 5.3493, + "step": 7890 + }, + { + "epoch": 0.046930012370349224, + "grad_norm": 1.5345048904418945, + "learning_rate": 4.9728847020220756e-05, + "loss": 5.2528, + "step": 7891 + }, + { + "epoch": 0.04693595965363022, + "grad_norm": 1.455165982246399, + "learning_rate": 4.9728778407094935e-05, + "loss": 5.2769, + "step": 7892 + }, + { + "epoch": 0.04694190693691122, + "grad_norm": 1.577910304069519, + "learning_rate": 4.972870978533655e-05, + "loss": 5.2182, + "step": 7893 + }, + { + "epoch": 0.046947854220192216, + "grad_norm": 1.728143334388733, + "learning_rate": 4.972864115494563e-05, + "loss": 5.3446, + "step": 7894 + }, + { + "epoch": 0.04695380150347321, + "grad_norm": 1.6157398223876953, + "learning_rate": 4.972857251592219e-05, + "loss": 5.4866, + "step": 7895 + }, + { + "epoch": 0.04695974878675421, + "grad_norm": 1.5386699438095093, + "learning_rate": 4.9728503868266266e-05, + "loss": 5.4626, + "step": 7896 + }, + { + "epoch": 0.04696569607003521, + "grad_norm": 1.874915599822998, + "learning_rate": 4.972843521197788e-05, + "loss": 5.4152, + "step": 7897 + }, + { + "epoch": 0.0469716433533162, + "grad_norm": 1.7093253135681152, + "learning_rate": 4.9728366547057046e-05, + "loss": 5.2852, + "step": 7898 + }, + { + "epoch": 0.046977590636597205, + "grad_norm": 1.6435173749923706, + "learning_rate": 4.9728297873503806e-05, + "loss": 5.3985, + "step": 7899 + }, + { + "epoch": 0.0469835379198782, + "grad_norm": 1.5776588916778564, + "learning_rate": 4.972822919131816e-05, + "loss": 5.2914, + "step": 7900 + }, + { + "epoch": 0.046989485203159195, + "grad_norm": 2.051072835922241, + "learning_rate": 4.972816050050015e-05, + "loss": 5.343, + "step": 7901 + }, + { + "epoch": 0.0469954324864402, + "grad_norm": 2.003816604614258, + "learning_rate": 4.972809180104979e-05, + "loss": 5.3577, + "step": 7902 + }, + { + "epoch": 0.04700137976972119, + "grad_norm": 1.9092657566070557, + "learning_rate": 4.9728023092967116e-05, + "loss": 5.551, + "step": 7903 + }, + { + "epoch": 0.04700732705300219, + "grad_norm": 1.763007640838623, + "learning_rate": 4.972795437625214e-05, + "loss": 5.5611, + "step": 7904 + }, + { + "epoch": 0.04701327433628318, + "grad_norm": 2.637850046157837, + "learning_rate": 4.9727885650904895e-05, + "loss": 5.937, + "step": 7905 + }, + { + "epoch": 0.047019221619564185, + "grad_norm": 1.6650307178497314, + "learning_rate": 4.9727816916925395e-05, + "loss": 5.6418, + "step": 7906 + }, + { + "epoch": 0.04702516890284518, + "grad_norm": 1.6943029165267944, + "learning_rate": 4.972774817431367e-05, + "loss": 5.4826, + "step": 7907 + }, + { + "epoch": 0.047031116186126175, + "grad_norm": 1.4689685106277466, + "learning_rate": 4.972767942306975e-05, + "loss": 5.4849, + "step": 7908 + }, + { + "epoch": 0.04703706346940718, + "grad_norm": 1.759244441986084, + "learning_rate": 4.9727610663193644e-05, + "loss": 5.3496, + "step": 7909 + }, + { + "epoch": 0.04704301075268817, + "grad_norm": 1.8706889152526855, + "learning_rate": 4.9727541894685395e-05, + "loss": 5.2836, + "step": 7910 + }, + { + "epoch": 0.04704895803596917, + "grad_norm": 1.486164927482605, + "learning_rate": 4.972747311754501e-05, + "loss": 5.4125, + "step": 7911 + }, + { + "epoch": 0.04705490531925017, + "grad_norm": 1.6479889154434204, + "learning_rate": 4.972740433177252e-05, + "loss": 5.1986, + "step": 7912 + }, + { + "epoch": 0.047060852602531164, + "grad_norm": 1.5741796493530273, + "learning_rate": 4.9727335537367944e-05, + "loss": 5.4761, + "step": 7913 + }, + { + "epoch": 0.04706679988581216, + "grad_norm": 1.5001682043075562, + "learning_rate": 4.972726673433131e-05, + "loss": 5.6267, + "step": 7914 + }, + { + "epoch": 0.04707274716909316, + "grad_norm": 1.774282455444336, + "learning_rate": 4.972719792266265e-05, + "loss": 5.5944, + "step": 7915 + }, + { + "epoch": 0.047078694452374156, + "grad_norm": 1.6656653881072998, + "learning_rate": 4.972712910236198e-05, + "loss": 5.4159, + "step": 7916 + }, + { + "epoch": 0.04708464173565515, + "grad_norm": 1.7174065113067627, + "learning_rate": 4.972706027342933e-05, + "loss": 5.4239, + "step": 7917 + }, + { + "epoch": 0.04709058901893615, + "grad_norm": 1.607878565788269, + "learning_rate": 4.9726991435864705e-05, + "loss": 5.4517, + "step": 7918 + }, + { + "epoch": 0.04709653630221715, + "grad_norm": 1.9639167785644531, + "learning_rate": 4.972692258966815e-05, + "loss": 5.5371, + "step": 7919 + }, + { + "epoch": 0.047102483585498144, + "grad_norm": 1.5418875217437744, + "learning_rate": 4.9726853734839684e-05, + "loss": 5.4798, + "step": 7920 + }, + { + "epoch": 0.04710843086877914, + "grad_norm": 1.54796302318573, + "learning_rate": 4.9726784871379326e-05, + "loss": 5.5329, + "step": 7921 + }, + { + "epoch": 0.04711437815206014, + "grad_norm": 1.8075921535491943, + "learning_rate": 4.97267159992871e-05, + "loss": 5.6049, + "step": 7922 + }, + { + "epoch": 0.047120325435341136, + "grad_norm": 1.4973857402801514, + "learning_rate": 4.972664711856304e-05, + "loss": 5.27, + "step": 7923 + }, + { + "epoch": 0.04712627271862213, + "grad_norm": 2.1028542518615723, + "learning_rate": 4.9726578229207155e-05, + "loss": 5.3626, + "step": 7924 + }, + { + "epoch": 0.04713222000190313, + "grad_norm": 2.2057480812072754, + "learning_rate": 4.9726509331219485e-05, + "loss": 5.1767, + "step": 7925 + }, + { + "epoch": 0.04713816728518413, + "grad_norm": 2.0549347400665283, + "learning_rate": 4.972644042460004e-05, + "loss": 5.3362, + "step": 7926 + }, + { + "epoch": 0.04714411456846512, + "grad_norm": 2.0960693359375, + "learning_rate": 4.972637150934885e-05, + "loss": 5.5162, + "step": 7927 + }, + { + "epoch": 0.047150061851746125, + "grad_norm": 2.2022509574890137, + "learning_rate": 4.9726302585465945e-05, + "loss": 5.3263, + "step": 7928 + }, + { + "epoch": 0.04715600913502712, + "grad_norm": 1.7065988779067993, + "learning_rate": 4.9726233652951335e-05, + "loss": 5.4349, + "step": 7929 + }, + { + "epoch": 0.047161956418308115, + "grad_norm": 1.742591142654419, + "learning_rate": 4.972616471180506e-05, + "loss": 5.2396, + "step": 7930 + }, + { + "epoch": 0.04716790370158912, + "grad_norm": 1.888846755027771, + "learning_rate": 4.972609576202713e-05, + "loss": 5.3453, + "step": 7931 + }, + { + "epoch": 0.04717385098487011, + "grad_norm": 1.6499360799789429, + "learning_rate": 4.972602680361758e-05, + "loss": 5.2819, + "step": 7932 + }, + { + "epoch": 0.04717979826815111, + "grad_norm": 1.8801236152648926, + "learning_rate": 4.9725957836576434e-05, + "loss": 5.2456, + "step": 7933 + }, + { + "epoch": 0.0471857455514321, + "grad_norm": 2.050522565841675, + "learning_rate": 4.97258888609037e-05, + "loss": 5.2069, + "step": 7934 + }, + { + "epoch": 0.047191692834713105, + "grad_norm": 2.0722391605377197, + "learning_rate": 4.972581987659942e-05, + "loss": 5.5057, + "step": 7935 + }, + { + "epoch": 0.0471976401179941, + "grad_norm": 2.728468179702759, + "learning_rate": 4.972575088366361e-05, + "loss": 5.5485, + "step": 7936 + }, + { + "epoch": 0.047203587401275095, + "grad_norm": 2.0293211936950684, + "learning_rate": 4.9725681882096295e-05, + "loss": 5.7126, + "step": 7937 + }, + { + "epoch": 0.0472095346845561, + "grad_norm": 2.1351194381713867, + "learning_rate": 4.97256128718975e-05, + "loss": 5.7313, + "step": 7938 + }, + { + "epoch": 0.04721548196783709, + "grad_norm": 1.9040015935897827, + "learning_rate": 4.972554385306726e-05, + "loss": 5.696, + "step": 7939 + }, + { + "epoch": 0.04722142925111809, + "grad_norm": 1.640110731124878, + "learning_rate": 4.9725474825605574e-05, + "loss": 5.2626, + "step": 7940 + }, + { + "epoch": 0.04722737653439909, + "grad_norm": 1.887408971786499, + "learning_rate": 4.972540578951249e-05, + "loss": 5.2734, + "step": 7941 + }, + { + "epoch": 0.047233323817680084, + "grad_norm": 1.8867583274841309, + "learning_rate": 4.972533674478801e-05, + "loss": 5.6811, + "step": 7942 + }, + { + "epoch": 0.04723927110096108, + "grad_norm": 1.811104655265808, + "learning_rate": 4.9725267691432174e-05, + "loss": 5.575, + "step": 7943 + }, + { + "epoch": 0.04724521838424208, + "grad_norm": 1.8644812107086182, + "learning_rate": 4.9725198629445014e-05, + "loss": 5.5718, + "step": 7944 + }, + { + "epoch": 0.047251165667523076, + "grad_norm": 1.693788766860962, + "learning_rate": 4.972512955882653e-05, + "loss": 5.5924, + "step": 7945 + }, + { + "epoch": 0.04725711295080407, + "grad_norm": 1.8305641412734985, + "learning_rate": 4.9725060479576766e-05, + "loss": 5.6529, + "step": 7946 + }, + { + "epoch": 0.04726306023408507, + "grad_norm": 1.7662039995193481, + "learning_rate": 4.9724991391695734e-05, + "loss": 5.6709, + "step": 7947 + }, + { + "epoch": 0.04726900751736607, + "grad_norm": 2.1799724102020264, + "learning_rate": 4.972492229518347e-05, + "loss": 5.6266, + "step": 7948 + }, + { + "epoch": 0.047274954800647064, + "grad_norm": 1.9300130605697632, + "learning_rate": 4.972485319003998e-05, + "loss": 5.6494, + "step": 7949 + }, + { + "epoch": 0.04728090208392806, + "grad_norm": 1.9196375608444214, + "learning_rate": 4.9724784076265307e-05, + "loss": 5.571, + "step": 7950 + }, + { + "epoch": 0.04728684936720906, + "grad_norm": 1.906616449356079, + "learning_rate": 4.972471495385947e-05, + "loss": 5.6537, + "step": 7951 + }, + { + "epoch": 0.047292796650490056, + "grad_norm": 1.826536774635315, + "learning_rate": 4.972464582282249e-05, + "loss": 5.6251, + "step": 7952 + }, + { + "epoch": 0.04729874393377105, + "grad_norm": 1.7790716886520386, + "learning_rate": 4.972457668315438e-05, + "loss": 5.3488, + "step": 7953 + }, + { + "epoch": 0.04730469121705205, + "grad_norm": 1.8892159461975098, + "learning_rate": 4.972450753485519e-05, + "loss": 5.4794, + "step": 7954 + }, + { + "epoch": 0.04731063850033305, + "grad_norm": 1.9409239292144775, + "learning_rate": 4.972443837792492e-05, + "loss": 5.6058, + "step": 7955 + }, + { + "epoch": 0.04731658578361404, + "grad_norm": 1.9935575723648071, + "learning_rate": 4.972436921236361e-05, + "loss": 5.6481, + "step": 7956 + }, + { + "epoch": 0.047322533066895045, + "grad_norm": 1.8507076501846313, + "learning_rate": 4.9724300038171276e-05, + "loss": 5.4723, + "step": 7957 + }, + { + "epoch": 0.04732848035017604, + "grad_norm": 1.9355841875076294, + "learning_rate": 4.972423085534794e-05, + "loss": 5.3843, + "step": 7958 + }, + { + "epoch": 0.047334427633457035, + "grad_norm": 1.9815531969070435, + "learning_rate": 4.972416166389363e-05, + "loss": 5.5635, + "step": 7959 + }, + { + "epoch": 0.04734037491673804, + "grad_norm": 1.7955007553100586, + "learning_rate": 4.972409246380838e-05, + "loss": 5.6002, + "step": 7960 + }, + { + "epoch": 0.04734632220001903, + "grad_norm": 2.0184547901153564, + "learning_rate": 4.97240232550922e-05, + "loss": 5.5458, + "step": 7961 + }, + { + "epoch": 0.04735226948330003, + "grad_norm": 1.7418156862258911, + "learning_rate": 4.972395403774512e-05, + "loss": 5.6443, + "step": 7962 + }, + { + "epoch": 0.04735821676658102, + "grad_norm": 1.9832762479782104, + "learning_rate": 4.972388481176716e-05, + "loss": 5.3799, + "step": 7963 + }, + { + "epoch": 0.047364164049862024, + "grad_norm": 1.8777718544006348, + "learning_rate": 4.972381557715835e-05, + "loss": 5.4349, + "step": 7964 + }, + { + "epoch": 0.04737011133314302, + "grad_norm": 1.519038438796997, + "learning_rate": 4.972374633391871e-05, + "loss": 5.2418, + "step": 7965 + }, + { + "epoch": 0.047376058616424015, + "grad_norm": 1.6425752639770508, + "learning_rate": 4.972367708204826e-05, + "loss": 5.1648, + "step": 7966 + }, + { + "epoch": 0.04738200589970502, + "grad_norm": 1.7461836338043213, + "learning_rate": 4.972360782154704e-05, + "loss": 5.1745, + "step": 7967 + }, + { + "epoch": 0.04738795318298601, + "grad_norm": 1.7991663217544556, + "learning_rate": 4.9723538552415064e-05, + "loss": 5.2268, + "step": 7968 + }, + { + "epoch": 0.04739390046626701, + "grad_norm": 1.9127873182296753, + "learning_rate": 4.9723469274652345e-05, + "loss": 5.5205, + "step": 7969 + }, + { + "epoch": 0.04739984774954801, + "grad_norm": 1.8836725950241089, + "learning_rate": 4.972339998825893e-05, + "loss": 5.3803, + "step": 7970 + }, + { + "epoch": 0.047405795032829004, + "grad_norm": 1.8391705751419067, + "learning_rate": 4.9723330693234825e-05, + "loss": 5.3084, + "step": 7971 + }, + { + "epoch": 0.04741174231611, + "grad_norm": 1.6707972288131714, + "learning_rate": 4.9723261389580063e-05, + "loss": 5.3275, + "step": 7972 + }, + { + "epoch": 0.047417689599391, + "grad_norm": 1.8807258605957031, + "learning_rate": 4.972319207729467e-05, + "loss": 5.0766, + "step": 7973 + }, + { + "epoch": 0.047423636882671996, + "grad_norm": 1.8980032205581665, + "learning_rate": 4.9723122756378655e-05, + "loss": 5.185, + "step": 7974 + }, + { + "epoch": 0.04742958416595299, + "grad_norm": 1.9011166095733643, + "learning_rate": 4.9723053426832055e-05, + "loss": 5.2494, + "step": 7975 + }, + { + "epoch": 0.04743553144923399, + "grad_norm": 1.6457782983779907, + "learning_rate": 4.97229840886549e-05, + "loss": 5.4205, + "step": 7976 + }, + { + "epoch": 0.04744147873251499, + "grad_norm": 1.558515191078186, + "learning_rate": 4.9722914741847206e-05, + "loss": 5.2111, + "step": 7977 + }, + { + "epoch": 0.04744742601579598, + "grad_norm": 1.4780910015106201, + "learning_rate": 4.9722845386409e-05, + "loss": 5.3365, + "step": 7978 + }, + { + "epoch": 0.04745337329907698, + "grad_norm": 1.529249668121338, + "learning_rate": 4.9722776022340296e-05, + "loss": 5.1323, + "step": 7979 + }, + { + "epoch": 0.04745932058235798, + "grad_norm": 1.66848886013031, + "learning_rate": 4.972270664964113e-05, + "loss": 5.2057, + "step": 7980 + }, + { + "epoch": 0.047465267865638976, + "grad_norm": 1.5645034313201904, + "learning_rate": 4.972263726831152e-05, + "loss": 5.1537, + "step": 7981 + }, + { + "epoch": 0.04747121514891997, + "grad_norm": 1.8793894052505493, + "learning_rate": 4.9722567878351496e-05, + "loss": 5.4403, + "step": 7982 + }, + { + "epoch": 0.04747716243220097, + "grad_norm": 1.7316640615463257, + "learning_rate": 4.972249847976108e-05, + "loss": 5.3642, + "step": 7983 + }, + { + "epoch": 0.04748310971548197, + "grad_norm": 1.7195171117782593, + "learning_rate": 4.972242907254029e-05, + "loss": 5.2603, + "step": 7984 + }, + { + "epoch": 0.04748905699876296, + "grad_norm": 1.6860026121139526, + "learning_rate": 4.972235965668916e-05, + "loss": 5.356, + "step": 7985 + }, + { + "epoch": 0.047495004282043965, + "grad_norm": 1.5396910905838013, + "learning_rate": 4.972229023220771e-05, + "loss": 5.2566, + "step": 7986 + }, + { + "epoch": 0.04750095156532496, + "grad_norm": 1.694547176361084, + "learning_rate": 4.9722220799095956e-05, + "loss": 5.0897, + "step": 7987 + }, + { + "epoch": 0.047506898848605955, + "grad_norm": 1.7608548402786255, + "learning_rate": 4.972215135735394e-05, + "loss": 5.4084, + "step": 7988 + }, + { + "epoch": 0.04751284613188696, + "grad_norm": 1.697198748588562, + "learning_rate": 4.9722081906981675e-05, + "loss": 5.4133, + "step": 7989 + }, + { + "epoch": 0.04751879341516795, + "grad_norm": 1.6107436418533325, + "learning_rate": 4.972201244797918e-05, + "loss": 5.2839, + "step": 7990 + }, + { + "epoch": 0.04752474069844895, + "grad_norm": 1.8178008794784546, + "learning_rate": 4.972194298034649e-05, + "loss": 5.3722, + "step": 7991 + }, + { + "epoch": 0.04753068798172994, + "grad_norm": 1.6542725563049316, + "learning_rate": 4.972187350408363e-05, + "loss": 5.3434, + "step": 7992 + }, + { + "epoch": 0.047536635265010944, + "grad_norm": 1.8194152116775513, + "learning_rate": 4.972180401919061e-05, + "loss": 5.3763, + "step": 7993 + }, + { + "epoch": 0.04754258254829194, + "grad_norm": 1.890317678451538, + "learning_rate": 4.9721734525667476e-05, + "loss": 5.529, + "step": 7994 + }, + { + "epoch": 0.047548529831572935, + "grad_norm": 1.813226342201233, + "learning_rate": 4.972166502351423e-05, + "loss": 5.0826, + "step": 7995 + }, + { + "epoch": 0.04755447711485394, + "grad_norm": 1.7679328918457031, + "learning_rate": 4.9721595512730905e-05, + "loss": 5.3589, + "step": 7996 + }, + { + "epoch": 0.04756042439813493, + "grad_norm": 1.8390278816223145, + "learning_rate": 4.972152599331753e-05, + "loss": 5.1568, + "step": 7997 + }, + { + "epoch": 0.04756637168141593, + "grad_norm": 2.9323909282684326, + "learning_rate": 4.972145646527413e-05, + "loss": 5.6457, + "step": 7998 + }, + { + "epoch": 0.04757231896469693, + "grad_norm": 1.8839350938796997, + "learning_rate": 4.972138692860072e-05, + "loss": 5.1204, + "step": 7999 + }, + { + "epoch": 0.047578266247977924, + "grad_norm": 1.9047685861587524, + "learning_rate": 4.972131738329733e-05, + "loss": 5.2741, + "step": 8000 + }, + { + "epoch": 0.04758421353125892, + "grad_norm": 2.39807391166687, + "learning_rate": 4.972124782936398e-05, + "loss": 5.0134, + "step": 8001 + }, + { + "epoch": 0.04759016081453992, + "grad_norm": 2.197404146194458, + "learning_rate": 4.972117826680071e-05, + "loss": 5.3012, + "step": 8002 + }, + { + "epoch": 0.047596108097820916, + "grad_norm": 2.2648651599884033, + "learning_rate": 4.9721108695607515e-05, + "loss": 5.7196, + "step": 8003 + }, + { + "epoch": 0.04760205538110191, + "grad_norm": 1.7686847448349, + "learning_rate": 4.972103911578444e-05, + "loss": 5.4261, + "step": 8004 + }, + { + "epoch": 0.04760800266438291, + "grad_norm": 1.726653814315796, + "learning_rate": 4.972096952733152e-05, + "loss": 5.33, + "step": 8005 + }, + { + "epoch": 0.04761394994766391, + "grad_norm": 1.6855807304382324, + "learning_rate": 4.972089993024875e-05, + "loss": 5.2382, + "step": 8006 + }, + { + "epoch": 0.0476198972309449, + "grad_norm": 1.644954800605774, + "learning_rate": 4.972083032453617e-05, + "loss": 5.3309, + "step": 8007 + }, + { + "epoch": 0.0476258445142259, + "grad_norm": 1.8630400896072388, + "learning_rate": 4.9720760710193816e-05, + "loss": 5.282, + "step": 8008 + }, + { + "epoch": 0.0476317917975069, + "grad_norm": 1.862716555595398, + "learning_rate": 4.972069108722168e-05, + "loss": 5.3307, + "step": 8009 + }, + { + "epoch": 0.047637739080787896, + "grad_norm": 1.8025259971618652, + "learning_rate": 4.972062145561982e-05, + "loss": 5.2236, + "step": 8010 + }, + { + "epoch": 0.04764368636406889, + "grad_norm": 1.7213356494903564, + "learning_rate": 4.972055181538825e-05, + "loss": 5.0635, + "step": 8011 + }, + { + "epoch": 0.04764963364734989, + "grad_norm": 1.5237104892730713, + "learning_rate": 4.9720482166526986e-05, + "loss": 5.3089, + "step": 8012 + }, + { + "epoch": 0.04765558093063089, + "grad_norm": 1.628957748413086, + "learning_rate": 4.972041250903605e-05, + "loss": 5.2299, + "step": 8013 + }, + { + "epoch": 0.04766152821391188, + "grad_norm": 1.9217725992202759, + "learning_rate": 4.972034284291548e-05, + "loss": 5.2504, + "step": 8014 + }, + { + "epoch": 0.047667475497192885, + "grad_norm": 2.114549160003662, + "learning_rate": 4.97202731681653e-05, + "loss": 5.219, + "step": 8015 + }, + { + "epoch": 0.04767342278047388, + "grad_norm": 1.9268896579742432, + "learning_rate": 4.9720203484785525e-05, + "loss": 5.145, + "step": 8016 + }, + { + "epoch": 0.047679370063754875, + "grad_norm": 2.04050874710083, + "learning_rate": 4.9720133792776166e-05, + "loss": 5.354, + "step": 8017 + }, + { + "epoch": 0.04768531734703588, + "grad_norm": 1.8002599477767944, + "learning_rate": 4.972006409213728e-05, + "loss": 5.0547, + "step": 8018 + }, + { + "epoch": 0.04769126463031687, + "grad_norm": 1.9655365943908691, + "learning_rate": 4.9719994382868876e-05, + "loss": 5.2188, + "step": 8019 + }, + { + "epoch": 0.04769721191359787, + "grad_norm": 1.7188535928726196, + "learning_rate": 4.971992466497097e-05, + "loss": 5.1792, + "step": 8020 + }, + { + "epoch": 0.04770315919687886, + "grad_norm": 1.582184910774231, + "learning_rate": 4.97198549384436e-05, + "loss": 5.2295, + "step": 8021 + }, + { + "epoch": 0.047709106480159864, + "grad_norm": 1.4490164518356323, + "learning_rate": 4.971978520328677e-05, + "loss": 5.1677, + "step": 8022 + }, + { + "epoch": 0.04771505376344086, + "grad_norm": 1.472896695137024, + "learning_rate": 4.971971545950054e-05, + "loss": 4.9954, + "step": 8023 + }, + { + "epoch": 0.047721001046721855, + "grad_norm": 1.5845187902450562, + "learning_rate": 4.97196457070849e-05, + "loss": 5.1273, + "step": 8024 + }, + { + "epoch": 0.04772694833000286, + "grad_norm": 1.6418551206588745, + "learning_rate": 4.9719575946039887e-05, + "loss": 5.0835, + "step": 8025 + }, + { + "epoch": 0.04773289561328385, + "grad_norm": 1.379805088043213, + "learning_rate": 4.971950617636553e-05, + "loss": 5.1058, + "step": 8026 + }, + { + "epoch": 0.04773884289656485, + "grad_norm": 1.7939400672912598, + "learning_rate": 4.9719436398061835e-05, + "loss": 5.0105, + "step": 8027 + }, + { + "epoch": 0.04774479017984585, + "grad_norm": 1.5610185861587524, + "learning_rate": 4.971936661112886e-05, + "loss": 5.032, + "step": 8028 + }, + { + "epoch": 0.047750737463126844, + "grad_norm": 1.524402379989624, + "learning_rate": 4.9719296815566594e-05, + "loss": 5.1376, + "step": 8029 + }, + { + "epoch": 0.04775668474640784, + "grad_norm": 1.7448087930679321, + "learning_rate": 4.971922701137509e-05, + "loss": 4.9496, + "step": 8030 + }, + { + "epoch": 0.04776263202968884, + "grad_norm": 1.7382763624191284, + "learning_rate": 4.971915719855435e-05, + "loss": 4.9755, + "step": 8031 + }, + { + "epoch": 0.047768579312969836, + "grad_norm": 1.6728250980377197, + "learning_rate": 4.971908737710441e-05, + "loss": 5.1436, + "step": 8032 + }, + { + "epoch": 0.04777452659625083, + "grad_norm": 1.4256306886672974, + "learning_rate": 4.971901754702529e-05, + "loss": 4.9739, + "step": 8033 + }, + { + "epoch": 0.04778047387953183, + "grad_norm": 1.660714864730835, + "learning_rate": 4.971894770831702e-05, + "loss": 5.1337, + "step": 8034 + }, + { + "epoch": 0.04778642116281283, + "grad_norm": 1.5240182876586914, + "learning_rate": 4.9718877860979615e-05, + "loss": 5.1143, + "step": 8035 + }, + { + "epoch": 0.04779236844609382, + "grad_norm": 1.478852391242981, + "learning_rate": 4.971880800501311e-05, + "loss": 4.968, + "step": 8036 + }, + { + "epoch": 0.04779831572937482, + "grad_norm": 1.5343812704086304, + "learning_rate": 4.971873814041752e-05, + "loss": 4.9393, + "step": 8037 + }, + { + "epoch": 0.04780426301265582, + "grad_norm": 1.6728276014328003, + "learning_rate": 4.971866826719288e-05, + "loss": 5.0535, + "step": 8038 + }, + { + "epoch": 0.047810210295936816, + "grad_norm": 1.4831758737564087, + "learning_rate": 4.971859838533921e-05, + "loss": 5.0705, + "step": 8039 + }, + { + "epoch": 0.04781615757921781, + "grad_norm": 1.7412161827087402, + "learning_rate": 4.971852849485653e-05, + "loss": 4.9338, + "step": 8040 + }, + { + "epoch": 0.04782210486249881, + "grad_norm": 1.4696041345596313, + "learning_rate": 4.971845859574487e-05, + "loss": 5.0643, + "step": 8041 + }, + { + "epoch": 0.04782805214577981, + "grad_norm": 1.4190481901168823, + "learning_rate": 4.9718388688004235e-05, + "loss": 5.0743, + "step": 8042 + }, + { + "epoch": 0.0478339994290608, + "grad_norm": 1.513454556465149, + "learning_rate": 4.9718318771634686e-05, + "loss": 4.8832, + "step": 8043 + }, + { + "epoch": 0.047839946712341805, + "grad_norm": 1.7310774326324463, + "learning_rate": 4.9718248846636216e-05, + "loss": 4.957, + "step": 8044 + }, + { + "epoch": 0.0478458939956228, + "grad_norm": 1.4895838499069214, + "learning_rate": 4.971817891300886e-05, + "loss": 4.9121, + "step": 8045 + }, + { + "epoch": 0.047851841278903795, + "grad_norm": 1.6848632097244263, + "learning_rate": 4.9718108970752656e-05, + "loss": 5.1337, + "step": 8046 + }, + { + "epoch": 0.0478577885621848, + "grad_norm": 1.7145766019821167, + "learning_rate": 4.97180390198676e-05, + "loss": 5.1827, + "step": 8047 + }, + { + "epoch": 0.04786373584546579, + "grad_norm": 1.668140172958374, + "learning_rate": 4.971796906035374e-05, + "loss": 5.4071, + "step": 8048 + }, + { + "epoch": 0.04786968312874679, + "grad_norm": 1.6927748918533325, + "learning_rate": 4.9717899092211094e-05, + "loss": 5.4319, + "step": 8049 + }, + { + "epoch": 0.04787563041202778, + "grad_norm": 1.6696170568466187, + "learning_rate": 4.971782911543968e-05, + "loss": 5.4137, + "step": 8050 + }, + { + "epoch": 0.047881577695308784, + "grad_norm": 1.9299427270889282, + "learning_rate": 4.971775913003953e-05, + "loss": 5.6676, + "step": 8051 + }, + { + "epoch": 0.04788752497858978, + "grad_norm": 1.7163755893707275, + "learning_rate": 4.971768913601066e-05, + "loss": 5.2916, + "step": 8052 + }, + { + "epoch": 0.047893472261870774, + "grad_norm": 1.7822209596633911, + "learning_rate": 4.971761913335311e-05, + "loss": 5.6364, + "step": 8053 + }, + { + "epoch": 0.047899419545151777, + "grad_norm": 1.725375771522522, + "learning_rate": 4.971754912206689e-05, + "loss": 5.045, + "step": 8054 + }, + { + "epoch": 0.04790536682843277, + "grad_norm": 1.5243995189666748, + "learning_rate": 4.9717479102152027e-05, + "loss": 5.4691, + "step": 8055 + }, + { + "epoch": 0.04791131411171377, + "grad_norm": 1.6673872470855713, + "learning_rate": 4.971740907360854e-05, + "loss": 5.4851, + "step": 8056 + }, + { + "epoch": 0.04791726139499477, + "grad_norm": 1.6378693580627441, + "learning_rate": 4.971733903643647e-05, + "loss": 5.2574, + "step": 8057 + }, + { + "epoch": 0.047923208678275764, + "grad_norm": 1.484250545501709, + "learning_rate": 4.9717268990635835e-05, + "loss": 5.2988, + "step": 8058 + }, + { + "epoch": 0.04792915596155676, + "grad_norm": 1.626955270767212, + "learning_rate": 4.971719893620665e-05, + "loss": 5.3502, + "step": 8059 + }, + { + "epoch": 0.04793510324483776, + "grad_norm": 2.1421375274658203, + "learning_rate": 4.9717128873148954e-05, + "loss": 5.3006, + "step": 8060 + }, + { + "epoch": 0.047941050528118756, + "grad_norm": 1.5175740718841553, + "learning_rate": 4.971705880146276e-05, + "loss": 5.4144, + "step": 8061 + }, + { + "epoch": 0.04794699781139975, + "grad_norm": 1.6170361042022705, + "learning_rate": 4.9716988721148095e-05, + "loss": 5.3635, + "step": 8062 + }, + { + "epoch": 0.04795294509468075, + "grad_norm": 1.7269384860992432, + "learning_rate": 4.971691863220499e-05, + "loss": 5.2813, + "step": 8063 + }, + { + "epoch": 0.04795889237796175, + "grad_norm": 1.5144844055175781, + "learning_rate": 4.971684853463345e-05, + "loss": 5.3242, + "step": 8064 + }, + { + "epoch": 0.04796483966124274, + "grad_norm": 1.7125827074050903, + "learning_rate": 4.971677842843353e-05, + "loss": 5.2968, + "step": 8065 + }, + { + "epoch": 0.04797078694452374, + "grad_norm": 1.6067146062850952, + "learning_rate": 4.9716708313605234e-05, + "loss": 5.4446, + "step": 8066 + }, + { + "epoch": 0.04797673422780474, + "grad_norm": 1.8911150693893433, + "learning_rate": 4.9716638190148585e-05, + "loss": 5.1875, + "step": 8067 + }, + { + "epoch": 0.047982681511085735, + "grad_norm": 1.6865830421447754, + "learning_rate": 4.971656805806362e-05, + "loss": 5.1909, + "step": 8068 + }, + { + "epoch": 0.04798862879436673, + "grad_norm": 2.009566068649292, + "learning_rate": 4.9716497917350345e-05, + "loss": 4.9392, + "step": 8069 + }, + { + "epoch": 0.04799457607764773, + "grad_norm": 1.8578897714614868, + "learning_rate": 4.97164277680088e-05, + "loss": 5.3101, + "step": 8070 + }, + { + "epoch": 0.04800052336092873, + "grad_norm": 1.8935741186141968, + "learning_rate": 4.971635761003901e-05, + "loss": 5.3952, + "step": 8071 + }, + { + "epoch": 0.04800647064420972, + "grad_norm": 2.0030407905578613, + "learning_rate": 4.9716287443440994e-05, + "loss": 5.1685, + "step": 8072 + }, + { + "epoch": 0.048012417927490725, + "grad_norm": 2.0079195499420166, + "learning_rate": 4.9716217268214775e-05, + "loss": 5.4942, + "step": 8073 + }, + { + "epoch": 0.04801836521077172, + "grad_norm": 1.7105878591537476, + "learning_rate": 4.971614708436038e-05, + "loss": 5.4124, + "step": 8074 + }, + { + "epoch": 0.048024312494052715, + "grad_norm": 1.7642161846160889, + "learning_rate": 4.971607689187784e-05, + "loss": 5.3187, + "step": 8075 + }, + { + "epoch": 0.04803025977733372, + "grad_norm": 1.7304610013961792, + "learning_rate": 4.9716006690767165e-05, + "loss": 5.308, + "step": 8076 + }, + { + "epoch": 0.04803620706061471, + "grad_norm": 1.6714746952056885, + "learning_rate": 4.971593648102839e-05, + "loss": 5.4581, + "step": 8077 + }, + { + "epoch": 0.04804215434389571, + "grad_norm": 1.8008997440338135, + "learning_rate": 4.971586626266154e-05, + "loss": 5.3266, + "step": 8078 + }, + { + "epoch": 0.0480481016271767, + "grad_norm": 1.8691446781158447, + "learning_rate": 4.971579603566663e-05, + "loss": 5.2847, + "step": 8079 + }, + { + "epoch": 0.048054048910457704, + "grad_norm": 1.7805777788162231, + "learning_rate": 4.97157258000437e-05, + "loss": 5.446, + "step": 8080 + }, + { + "epoch": 0.0480599961937387, + "grad_norm": 1.4973244667053223, + "learning_rate": 4.971565555579275e-05, + "loss": 5.412, + "step": 8081 + }, + { + "epoch": 0.048065943477019694, + "grad_norm": 1.5994775295257568, + "learning_rate": 4.971558530291384e-05, + "loss": 5.3285, + "step": 8082 + }, + { + "epoch": 0.048071890760300696, + "grad_norm": 1.7743935585021973, + "learning_rate": 4.971551504140696e-05, + "loss": 5.326, + "step": 8083 + }, + { + "epoch": 0.04807783804358169, + "grad_norm": 1.5922112464904785, + "learning_rate": 4.9715444771272154e-05, + "loss": 5.3338, + "step": 8084 + }, + { + "epoch": 0.04808378532686269, + "grad_norm": 1.5587191581726074, + "learning_rate": 4.971537449250944e-05, + "loss": 5.2437, + "step": 8085 + }, + { + "epoch": 0.04808973261014369, + "grad_norm": 1.4972636699676514, + "learning_rate": 4.971530420511884e-05, + "loss": 5.2271, + "step": 8086 + }, + { + "epoch": 0.048095679893424684, + "grad_norm": 1.6221843957901, + "learning_rate": 4.971523390910039e-05, + "loss": 5.3225, + "step": 8087 + }, + { + "epoch": 0.04810162717670568, + "grad_norm": 1.5826990604400635, + "learning_rate": 4.971516360445411e-05, + "loss": 5.2955, + "step": 8088 + }, + { + "epoch": 0.04810757445998668, + "grad_norm": 1.729963779449463, + "learning_rate": 4.971509329118001e-05, + "loss": 5.3263, + "step": 8089 + }, + { + "epoch": 0.048113521743267676, + "grad_norm": 1.680851697921753, + "learning_rate": 4.971502296927813e-05, + "loss": 5.3579, + "step": 8090 + }, + { + "epoch": 0.04811946902654867, + "grad_norm": 2.028024673461914, + "learning_rate": 4.9714952638748504e-05, + "loss": 5.3632, + "step": 8091 + }, + { + "epoch": 0.04812541630982967, + "grad_norm": 1.6236159801483154, + "learning_rate": 4.9714882299591127e-05, + "loss": 5.222, + "step": 8092 + }, + { + "epoch": 0.04813136359311067, + "grad_norm": 1.7522811889648438, + "learning_rate": 4.971481195180605e-05, + "loss": 5.3752, + "step": 8093 + }, + { + "epoch": 0.04813731087639166, + "grad_norm": 1.7108362913131714, + "learning_rate": 4.9714741595393274e-05, + "loss": 5.2994, + "step": 8094 + }, + { + "epoch": 0.04814325815967266, + "grad_norm": 1.7863954305648804, + "learning_rate": 4.971467123035285e-05, + "loss": 5.2386, + "step": 8095 + }, + { + "epoch": 0.04814920544295366, + "grad_norm": 2.0054473876953125, + "learning_rate": 4.971460085668479e-05, + "loss": 5.3565, + "step": 8096 + }, + { + "epoch": 0.048155152726234655, + "grad_norm": 1.6878743171691895, + "learning_rate": 4.971453047438911e-05, + "loss": 5.3448, + "step": 8097 + }, + { + "epoch": 0.04816110000951565, + "grad_norm": 1.8534557819366455, + "learning_rate": 4.971446008346585e-05, + "loss": 5.1446, + "step": 8098 + }, + { + "epoch": 0.04816704729279665, + "grad_norm": 1.8549425601959229, + "learning_rate": 4.9714389683915025e-05, + "loss": 5.2433, + "step": 8099 + }, + { + "epoch": 0.04817299457607765, + "grad_norm": 1.5624927282333374, + "learning_rate": 4.9714319275736666e-05, + "loss": 5.0645, + "step": 8100 + }, + { + "epoch": 0.04817894185935864, + "grad_norm": 1.670462965965271, + "learning_rate": 4.971424885893078e-05, + "loss": 5.1213, + "step": 8101 + }, + { + "epoch": 0.048184889142639645, + "grad_norm": 2.039595603942871, + "learning_rate": 4.9714178433497414e-05, + "loss": 5.1797, + "step": 8102 + }, + { + "epoch": 0.04819083642592064, + "grad_norm": 1.9546380043029785, + "learning_rate": 4.971410799943659e-05, + "loss": 5.2432, + "step": 8103 + }, + { + "epoch": 0.048196783709201635, + "grad_norm": 1.892397403717041, + "learning_rate": 4.971403755674832e-05, + "loss": 5.1775, + "step": 8104 + }, + { + "epoch": 0.04820273099248264, + "grad_norm": 1.7021955251693726, + "learning_rate": 4.971396710543263e-05, + "loss": 5.2242, + "step": 8105 + }, + { + "epoch": 0.04820867827576363, + "grad_norm": 1.7652686834335327, + "learning_rate": 4.9713896645489556e-05, + "loss": 5.1419, + "step": 8106 + }, + { + "epoch": 0.04821462555904463, + "grad_norm": 1.8669620752334595, + "learning_rate": 4.971382617691911e-05, + "loss": 5.1392, + "step": 8107 + }, + { + "epoch": 0.04822057284232562, + "grad_norm": 1.8774491548538208, + "learning_rate": 4.971375569972133e-05, + "loss": 5.1853, + "step": 8108 + }, + { + "epoch": 0.048226520125606624, + "grad_norm": 1.6108628511428833, + "learning_rate": 4.971368521389623e-05, + "loss": 5.4858, + "step": 8109 + }, + { + "epoch": 0.04823246740888762, + "grad_norm": 1.6839191913604736, + "learning_rate": 4.9713614719443835e-05, + "loss": 5.4217, + "step": 8110 + }, + { + "epoch": 0.048238414692168614, + "grad_norm": 1.9300925731658936, + "learning_rate": 4.9713544216364176e-05, + "loss": 5.2259, + "step": 8111 + }, + { + "epoch": 0.048244361975449616, + "grad_norm": 1.9142355918884277, + "learning_rate": 4.971347370465728e-05, + "loss": 5.2, + "step": 8112 + }, + { + "epoch": 0.04825030925873061, + "grad_norm": 1.8046603202819824, + "learning_rate": 4.971340318432315e-05, + "loss": 5.0951, + "step": 8113 + }, + { + "epoch": 0.04825625654201161, + "grad_norm": 1.9129396677017212, + "learning_rate": 4.971333265536184e-05, + "loss": 5.0376, + "step": 8114 + }, + { + "epoch": 0.04826220382529261, + "grad_norm": 1.6774524450302124, + "learning_rate": 4.971326211777335e-05, + "loss": 5.4313, + "step": 8115 + }, + { + "epoch": 0.048268151108573604, + "grad_norm": 1.8156472444534302, + "learning_rate": 4.971319157155773e-05, + "loss": 5.4336, + "step": 8116 + }, + { + "epoch": 0.0482740983918546, + "grad_norm": 1.5704171657562256, + "learning_rate": 4.9713121016714976e-05, + "loss": 5.6878, + "step": 8117 + }, + { + "epoch": 0.0482800456751356, + "grad_norm": 1.585528016090393, + "learning_rate": 4.9713050453245135e-05, + "loss": 5.6208, + "step": 8118 + }, + { + "epoch": 0.048285992958416596, + "grad_norm": 1.3975930213928223, + "learning_rate": 4.9712979881148215e-05, + "loss": 5.8001, + "step": 8119 + }, + { + "epoch": 0.04829194024169759, + "grad_norm": 1.8124761581420898, + "learning_rate": 4.971290930042426e-05, + "loss": 5.6006, + "step": 8120 + }, + { + "epoch": 0.04829788752497859, + "grad_norm": 1.8448232412338257, + "learning_rate": 4.971283871107327e-05, + "loss": 5.4324, + "step": 8121 + }, + { + "epoch": 0.04830383480825959, + "grad_norm": 1.772218108177185, + "learning_rate": 4.97127681130953e-05, + "loss": 6.0943, + "step": 8122 + }, + { + "epoch": 0.04830978209154058, + "grad_norm": 2.038703441619873, + "learning_rate": 4.9712697506490345e-05, + "loss": 5.4224, + "step": 8123 + }, + { + "epoch": 0.04831572937482158, + "grad_norm": 1.576430320739746, + "learning_rate": 4.971262689125845e-05, + "loss": 5.351, + "step": 8124 + }, + { + "epoch": 0.04832167665810258, + "grad_norm": 1.857021450996399, + "learning_rate": 4.971255626739963e-05, + "loss": 5.258, + "step": 8125 + }, + { + "epoch": 0.048327623941383575, + "grad_norm": 1.7989404201507568, + "learning_rate": 4.971248563491391e-05, + "loss": 5.3925, + "step": 8126 + }, + { + "epoch": 0.04833357122466457, + "grad_norm": 1.8104023933410645, + "learning_rate": 4.9712414993801314e-05, + "loss": 5.4326, + "step": 8127 + }, + { + "epoch": 0.04833951850794557, + "grad_norm": 1.898054838180542, + "learning_rate": 4.971234434406188e-05, + "loss": 5.2094, + "step": 8128 + }, + { + "epoch": 0.04834546579122657, + "grad_norm": 1.436633586883545, + "learning_rate": 4.971227368569561e-05, + "loss": 5.2994, + "step": 8129 + }, + { + "epoch": 0.04835141307450756, + "grad_norm": 1.4576120376586914, + "learning_rate": 4.971220301870255e-05, + "loss": 5.3504, + "step": 8130 + }, + { + "epoch": 0.048357360357788565, + "grad_norm": 1.7260229587554932, + "learning_rate": 4.971213234308271e-05, + "loss": 5.1083, + "step": 8131 + }, + { + "epoch": 0.04836330764106956, + "grad_norm": 1.8110415935516357, + "learning_rate": 4.971206165883612e-05, + "loss": 5.1298, + "step": 8132 + }, + { + "epoch": 0.048369254924350555, + "grad_norm": 2.1696786880493164, + "learning_rate": 4.9711990965962804e-05, + "loss": 5.8155, + "step": 8133 + }, + { + "epoch": 0.04837520220763156, + "grad_norm": 1.9905856847763062, + "learning_rate": 4.971192026446279e-05, + "loss": 5.5814, + "step": 8134 + }, + { + "epoch": 0.04838114949091255, + "grad_norm": 1.7459521293640137, + "learning_rate": 4.97118495543361e-05, + "loss": 5.4358, + "step": 8135 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 1.8495198488235474, + "learning_rate": 4.9711778835582756e-05, + "loss": 5.3652, + "step": 8136 + }, + { + "epoch": 0.04839304405747455, + "grad_norm": 1.782850742340088, + "learning_rate": 4.971170810820279e-05, + "loss": 5.2361, + "step": 8137 + }, + { + "epoch": 0.048398991340755544, + "grad_norm": 1.7327016592025757, + "learning_rate": 4.971163737219622e-05, + "loss": 5.0802, + "step": 8138 + }, + { + "epoch": 0.04840493862403654, + "grad_norm": 1.663620114326477, + "learning_rate": 4.9711566627563066e-05, + "loss": 5.1566, + "step": 8139 + }, + { + "epoch": 0.048410885907317534, + "grad_norm": 1.5109026432037354, + "learning_rate": 4.971149587430336e-05, + "loss": 5.1499, + "step": 8140 + }, + { + "epoch": 0.048416833190598536, + "grad_norm": 1.3494226932525635, + "learning_rate": 4.971142511241714e-05, + "loss": 5.1684, + "step": 8141 + }, + { + "epoch": 0.04842278047387953, + "grad_norm": 1.721880555152893, + "learning_rate": 4.97113543419044e-05, + "loss": 5.0199, + "step": 8142 + }, + { + "epoch": 0.048428727757160527, + "grad_norm": 1.7465516328811646, + "learning_rate": 4.971128356276519e-05, + "loss": 5.1181, + "step": 8143 + }, + { + "epoch": 0.04843467504044153, + "grad_norm": 1.8127025365829468, + "learning_rate": 4.971121277499953e-05, + "loss": 5.6514, + "step": 8144 + }, + { + "epoch": 0.048440622323722524, + "grad_norm": 1.6027450561523438, + "learning_rate": 4.971114197860743e-05, + "loss": 5.3408, + "step": 8145 + }, + { + "epoch": 0.04844656960700352, + "grad_norm": 1.6985208988189697, + "learning_rate": 4.971107117358894e-05, + "loss": 5.2002, + "step": 8146 + }, + { + "epoch": 0.04845251689028452, + "grad_norm": 1.681305170059204, + "learning_rate": 4.971100035994406e-05, + "loss": 5.1389, + "step": 8147 + }, + { + "epoch": 0.048458464173565516, + "grad_norm": 1.6053674221038818, + "learning_rate": 4.971092953767282e-05, + "loss": 5.0665, + "step": 8148 + }, + { + "epoch": 0.04846441145684651, + "grad_norm": 1.743134617805481, + "learning_rate": 4.9710858706775266e-05, + "loss": 5.1427, + "step": 8149 + }, + { + "epoch": 0.04847035874012751, + "grad_norm": 1.4901342391967773, + "learning_rate": 4.9710787867251396e-05, + "loss": 5.1957, + "step": 8150 + }, + { + "epoch": 0.04847630602340851, + "grad_norm": 1.6003857851028442, + "learning_rate": 4.971071701910125e-05, + "loss": 5.0658, + "step": 8151 + }, + { + "epoch": 0.0484822533066895, + "grad_norm": 1.7036428451538086, + "learning_rate": 4.971064616232484e-05, + "loss": 5.0823, + "step": 8152 + }, + { + "epoch": 0.0484882005899705, + "grad_norm": 1.5894789695739746, + "learning_rate": 4.97105752969222e-05, + "loss": 5.093, + "step": 8153 + }, + { + "epoch": 0.0484941478732515, + "grad_norm": 1.487648367881775, + "learning_rate": 4.9710504422893364e-05, + "loss": 5.0089, + "step": 8154 + }, + { + "epoch": 0.048500095156532495, + "grad_norm": 2.0251479148864746, + "learning_rate": 4.971043354023834e-05, + "loss": 5.0552, + "step": 8155 + }, + { + "epoch": 0.04850604243981349, + "grad_norm": 1.7097325325012207, + "learning_rate": 4.971036264895715e-05, + "loss": 5.2737, + "step": 8156 + }, + { + "epoch": 0.04851198972309449, + "grad_norm": 1.784836769104004, + "learning_rate": 4.971029174904984e-05, + "loss": 5.2863, + "step": 8157 + }, + { + "epoch": 0.04851793700637549, + "grad_norm": 1.4765781164169312, + "learning_rate": 4.9710220840516416e-05, + "loss": 5.4057, + "step": 8158 + }, + { + "epoch": 0.04852388428965648, + "grad_norm": 1.4173041582107544, + "learning_rate": 4.9710149923356915e-05, + "loss": 5.187, + "step": 8159 + }, + { + "epoch": 0.048529831572937485, + "grad_norm": 1.488173007965088, + "learning_rate": 4.971007899757135e-05, + "loss": 4.975, + "step": 8160 + }, + { + "epoch": 0.04853577885621848, + "grad_norm": 1.391435980796814, + "learning_rate": 4.9710008063159756e-05, + "loss": 5.0782, + "step": 8161 + }, + { + "epoch": 0.048541726139499475, + "grad_norm": 1.7100436687469482, + "learning_rate": 4.970993712012215e-05, + "loss": 5.4953, + "step": 8162 + }, + { + "epoch": 0.04854767342278048, + "grad_norm": 1.8748459815979004, + "learning_rate": 4.970986616845856e-05, + "loss": 5.4535, + "step": 8163 + }, + { + "epoch": 0.04855362070606147, + "grad_norm": 1.901802897453308, + "learning_rate": 4.970979520816902e-05, + "loss": 5.3619, + "step": 8164 + }, + { + "epoch": 0.04855956798934247, + "grad_norm": 1.9850586652755737, + "learning_rate": 4.970972423925354e-05, + "loss": 5.039, + "step": 8165 + }, + { + "epoch": 0.04856551527262347, + "grad_norm": 1.5195177793502808, + "learning_rate": 4.970965326171214e-05, + "loss": 5.1721, + "step": 8166 + }, + { + "epoch": 0.048571462555904464, + "grad_norm": 1.4180214405059814, + "learning_rate": 4.9709582275544866e-05, + "loss": 5.2319, + "step": 8167 + }, + { + "epoch": 0.04857740983918546, + "grad_norm": 1.3797354698181152, + "learning_rate": 4.970951128075173e-05, + "loss": 5.1813, + "step": 8168 + }, + { + "epoch": 0.048583357122466454, + "grad_norm": 1.6448336839675903, + "learning_rate": 4.970944027733276e-05, + "loss": 5.1968, + "step": 8169 + }, + { + "epoch": 0.048589304405747456, + "grad_norm": 1.6626337766647339, + "learning_rate": 4.9709369265287986e-05, + "loss": 5.1303, + "step": 8170 + }, + { + "epoch": 0.04859525168902845, + "grad_norm": 1.5715514421463013, + "learning_rate": 4.970929824461742e-05, + "loss": 5.1609, + "step": 8171 + }, + { + "epoch": 0.048601198972309446, + "grad_norm": 1.5971697568893433, + "learning_rate": 4.970922721532108e-05, + "loss": 5.1489, + "step": 8172 + }, + { + "epoch": 0.04860714625559045, + "grad_norm": 1.6784114837646484, + "learning_rate": 4.970915617739903e-05, + "loss": 5.2778, + "step": 8173 + }, + { + "epoch": 0.048613093538871444, + "grad_norm": 1.7507476806640625, + "learning_rate": 4.970908513085125e-05, + "loss": 5.5719, + "step": 8174 + }, + { + "epoch": 0.04861904082215244, + "grad_norm": 1.7017735242843628, + "learning_rate": 4.970901407567779e-05, + "loss": 5.5197, + "step": 8175 + }, + { + "epoch": 0.04862498810543344, + "grad_norm": 1.8569817543029785, + "learning_rate": 4.9708943011878674e-05, + "loss": 5.3823, + "step": 8176 + }, + { + "epoch": 0.048630935388714436, + "grad_norm": 1.5183817148208618, + "learning_rate": 4.970887193945391e-05, + "loss": 5.5518, + "step": 8177 + }, + { + "epoch": 0.04863688267199543, + "grad_norm": 1.4175498485565186, + "learning_rate": 4.970880085840354e-05, + "loss": 5.4526, + "step": 8178 + }, + { + "epoch": 0.04864282995527643, + "grad_norm": 1.7228561639785767, + "learning_rate": 4.970872976872758e-05, + "loss": 5.5162, + "step": 8179 + }, + { + "epoch": 0.04864877723855743, + "grad_norm": 2.043182849884033, + "learning_rate": 4.970865867042606e-05, + "loss": 5.4212, + "step": 8180 + }, + { + "epoch": 0.04865472452183842, + "grad_norm": 1.377565622329712, + "learning_rate": 4.970858756349901e-05, + "loss": 5.2817, + "step": 8181 + }, + { + "epoch": 0.04866067180511942, + "grad_norm": 1.6977208852767944, + "learning_rate": 4.970851644794643e-05, + "loss": 5.4081, + "step": 8182 + }, + { + "epoch": 0.04866661908840042, + "grad_norm": 1.3136184215545654, + "learning_rate": 4.970844532376838e-05, + "loss": 5.4272, + "step": 8183 + }, + { + "epoch": 0.048672566371681415, + "grad_norm": 1.8863121271133423, + "learning_rate": 4.9708374190964854e-05, + "loss": 5.441, + "step": 8184 + }, + { + "epoch": 0.04867851365496241, + "grad_norm": 1.6755374670028687, + "learning_rate": 4.97083030495359e-05, + "loss": 5.5045, + "step": 8185 + }, + { + "epoch": 0.04868446093824341, + "grad_norm": 1.8439961671829224, + "learning_rate": 4.970823189948153e-05, + "loss": 5.5252, + "step": 8186 + }, + { + "epoch": 0.04869040822152441, + "grad_norm": 1.9662889242172241, + "learning_rate": 4.9708160740801765e-05, + "loss": 5.4379, + "step": 8187 + }, + { + "epoch": 0.0486963555048054, + "grad_norm": 1.691857099533081, + "learning_rate": 4.970808957349664e-05, + "loss": 5.3652, + "step": 8188 + }, + { + "epoch": 0.048702302788086405, + "grad_norm": 1.7482357025146484, + "learning_rate": 4.970801839756618e-05, + "loss": 5.1436, + "step": 8189 + }, + { + "epoch": 0.0487082500713674, + "grad_norm": 1.9221199750900269, + "learning_rate": 4.9707947213010396e-05, + "loss": 5.1936, + "step": 8190 + }, + { + "epoch": 0.048714197354648395, + "grad_norm": 1.9124062061309814, + "learning_rate": 4.970787601982933e-05, + "loss": 5.28, + "step": 8191 + }, + { + "epoch": 0.0487201446379294, + "grad_norm": 1.8999123573303223, + "learning_rate": 4.9707804818023e-05, + "loss": 5.3262, + "step": 8192 + }, + { + "epoch": 0.04872609192121039, + "grad_norm": 1.7711995840072632, + "learning_rate": 4.970773360759143e-05, + "loss": 5.1764, + "step": 8193 + }, + { + "epoch": 0.04873203920449139, + "grad_norm": 2.122689962387085, + "learning_rate": 4.970766238853465e-05, + "loss": 5.4345, + "step": 8194 + }, + { + "epoch": 0.04873798648777239, + "grad_norm": 2.1027848720550537, + "learning_rate": 4.9707591160852675e-05, + "loss": 5.4547, + "step": 8195 + }, + { + "epoch": 0.048743933771053384, + "grad_norm": 1.6944631338119507, + "learning_rate": 4.970751992454553e-05, + "loss": 5.3638, + "step": 8196 + }, + { + "epoch": 0.04874988105433438, + "grad_norm": 1.7444918155670166, + "learning_rate": 4.9707448679613256e-05, + "loss": 5.2378, + "step": 8197 + }, + { + "epoch": 0.048755828337615374, + "grad_norm": 1.8864104747772217, + "learning_rate": 4.970737742605586e-05, + "loss": 5.3142, + "step": 8198 + }, + { + "epoch": 0.048761775620896376, + "grad_norm": 1.968748927116394, + "learning_rate": 4.970730616387338e-05, + "loss": 5.0824, + "step": 8199 + }, + { + "epoch": 0.04876772290417737, + "grad_norm": 2.166405439376831, + "learning_rate": 4.9707234893065824e-05, + "loss": 5.0999, + "step": 8200 + }, + { + "epoch": 0.048773670187458366, + "grad_norm": 1.9185746908187866, + "learning_rate": 4.970716361363323e-05, + "loss": 5.1465, + "step": 8201 + }, + { + "epoch": 0.04877961747073937, + "grad_norm": 1.9191651344299316, + "learning_rate": 4.9707092325575635e-05, + "loss": 5.0713, + "step": 8202 + }, + { + "epoch": 0.048785564754020364, + "grad_norm": 1.6470153331756592, + "learning_rate": 4.9707021028893034e-05, + "loss": 5.0816, + "step": 8203 + }, + { + "epoch": 0.04879151203730136, + "grad_norm": 1.6995042562484741, + "learning_rate": 4.9706949723585475e-05, + "loss": 5.0207, + "step": 8204 + }, + { + "epoch": 0.04879745932058236, + "grad_norm": 1.8208703994750977, + "learning_rate": 4.970687840965297e-05, + "loss": 4.9789, + "step": 8205 + }, + { + "epoch": 0.048803406603863356, + "grad_norm": 1.8558207750320435, + "learning_rate": 4.9706807087095555e-05, + "loss": 5.0655, + "step": 8206 + }, + { + "epoch": 0.04880935388714435, + "grad_norm": 1.6349478960037231, + "learning_rate": 4.9706735755913234e-05, + "loss": 5.2657, + "step": 8207 + }, + { + "epoch": 0.04881530117042535, + "grad_norm": 1.587143063545227, + "learning_rate": 4.9706664416106065e-05, + "loss": 5.0765, + "step": 8208 + }, + { + "epoch": 0.04882124845370635, + "grad_norm": 1.8467018604278564, + "learning_rate": 4.9706593067674047e-05, + "loss": 5.1458, + "step": 8209 + }, + { + "epoch": 0.04882719573698734, + "grad_norm": 1.8066186904907227, + "learning_rate": 4.9706521710617214e-05, + "loss": 5.0656, + "step": 8210 + }, + { + "epoch": 0.04883314302026834, + "grad_norm": 1.7981528043746948, + "learning_rate": 4.9706450344935586e-05, + "loss": 5.1448, + "step": 8211 + }, + { + "epoch": 0.04883909030354934, + "grad_norm": 1.8924201726913452, + "learning_rate": 4.97063789706292e-05, + "loss": 4.748, + "step": 8212 + }, + { + "epoch": 0.048845037586830335, + "grad_norm": 2.091324806213379, + "learning_rate": 4.9706307587698064e-05, + "loss": 5.6537, + "step": 8213 + }, + { + "epoch": 0.04885098487011133, + "grad_norm": 3.1737043857574463, + "learning_rate": 4.970623619614221e-05, + "loss": 5.6898, + "step": 8214 + }, + { + "epoch": 0.04885693215339233, + "grad_norm": 2.194577932357788, + "learning_rate": 4.970616479596167e-05, + "loss": 5.4958, + "step": 8215 + }, + { + "epoch": 0.04886287943667333, + "grad_norm": 2.2362759113311768, + "learning_rate": 4.970609338715646e-05, + "loss": 4.9919, + "step": 8216 + }, + { + "epoch": 0.04886882671995432, + "grad_norm": 1.703684687614441, + "learning_rate": 4.970602196972661e-05, + "loss": 4.8733, + "step": 8217 + }, + { + "epoch": 0.048874774003235325, + "grad_norm": 2.0205307006835938, + "learning_rate": 4.970595054367214e-05, + "loss": 5.1177, + "step": 8218 + }, + { + "epoch": 0.04888072128651632, + "grad_norm": 2.1270928382873535, + "learning_rate": 4.970587910899308e-05, + "loss": 5.6208, + "step": 8219 + }, + { + "epoch": 0.048886668569797315, + "grad_norm": 1.8992488384246826, + "learning_rate": 4.9705807665689455e-05, + "loss": 5.7754, + "step": 8220 + }, + { + "epoch": 0.04889261585307832, + "grad_norm": 2.279099225997925, + "learning_rate": 4.9705736213761286e-05, + "loss": 5.5924, + "step": 8221 + }, + { + "epoch": 0.04889856313635931, + "grad_norm": 1.9186346530914307, + "learning_rate": 4.9705664753208594e-05, + "loss": 5.9424, + "step": 8222 + }, + { + "epoch": 0.04890451041964031, + "grad_norm": 2.0286009311676025, + "learning_rate": 4.970559328403141e-05, + "loss": 5.8461, + "step": 8223 + }, + { + "epoch": 0.04891045770292131, + "grad_norm": 1.797555685043335, + "learning_rate": 4.970552180622977e-05, + "loss": 5.4929, + "step": 8224 + }, + { + "epoch": 0.048916404986202304, + "grad_norm": 2.4879684448242188, + "learning_rate": 4.970545031980368e-05, + "loss": 5.5253, + "step": 8225 + }, + { + "epoch": 0.0489223522694833, + "grad_norm": 2.749763011932373, + "learning_rate": 4.970537882475318e-05, + "loss": 5.6001, + "step": 8226 + }, + { + "epoch": 0.048928299552764294, + "grad_norm": 2.2076292037963867, + "learning_rate": 4.970530732107827e-05, + "loss": 5.5876, + "step": 8227 + }, + { + "epoch": 0.048934246836045296, + "grad_norm": 2.6566662788391113, + "learning_rate": 4.970523580877901e-05, + "loss": 5.7151, + "step": 8228 + }, + { + "epoch": 0.04894019411932629, + "grad_norm": 2.4873850345611572, + "learning_rate": 4.97051642878554e-05, + "loss": 5.7124, + "step": 8229 + }, + { + "epoch": 0.048946141402607286, + "grad_norm": 1.8365200757980347, + "learning_rate": 4.970509275830748e-05, + "loss": 5.292, + "step": 8230 + }, + { + "epoch": 0.04895208868588829, + "grad_norm": 2.064730644226074, + "learning_rate": 4.9705021220135254e-05, + "loss": 5.2854, + "step": 8231 + }, + { + "epoch": 0.04895803596916928, + "grad_norm": 1.969298005104065, + "learning_rate": 4.970494967333877e-05, + "loss": 5.2113, + "step": 8232 + }, + { + "epoch": 0.04896398325245028, + "grad_norm": 1.8438071012496948, + "learning_rate": 4.9704878117918044e-05, + "loss": 5.2281, + "step": 8233 + }, + { + "epoch": 0.04896993053573128, + "grad_norm": 1.9163525104522705, + "learning_rate": 4.97048065538731e-05, + "loss": 5.043, + "step": 8234 + }, + { + "epoch": 0.048975877819012276, + "grad_norm": 1.802356243133545, + "learning_rate": 4.970473498120395e-05, + "loss": 5.2079, + "step": 8235 + }, + { + "epoch": 0.04898182510229327, + "grad_norm": 1.7572704553604126, + "learning_rate": 4.9704663399910645e-05, + "loss": 5.1119, + "step": 8236 + }, + { + "epoch": 0.04898777238557427, + "grad_norm": 1.848747730255127, + "learning_rate": 4.970459180999319e-05, + "loss": 5.0233, + "step": 8237 + }, + { + "epoch": 0.04899371966885527, + "grad_norm": 2.023036003112793, + "learning_rate": 4.9704520211451624e-05, + "loss": 5.2793, + "step": 8238 + }, + { + "epoch": 0.04899966695213626, + "grad_norm": 1.6738852262496948, + "learning_rate": 4.9704448604285965e-05, + "loss": 5.5255, + "step": 8239 + }, + { + "epoch": 0.04900561423541726, + "grad_norm": 1.6676057577133179, + "learning_rate": 4.970437698849624e-05, + "loss": 5.4287, + "step": 8240 + }, + { + "epoch": 0.04901156151869826, + "grad_norm": 1.9960590600967407, + "learning_rate": 4.970430536408247e-05, + "loss": 5.2939, + "step": 8241 + }, + { + "epoch": 0.049017508801979255, + "grad_norm": 2.7218708992004395, + "learning_rate": 4.9704233731044675e-05, + "loss": 5.9019, + "step": 8242 + }, + { + "epoch": 0.04902345608526025, + "grad_norm": 2.385664224624634, + "learning_rate": 4.970416208938289e-05, + "loss": 5.9146, + "step": 8243 + }, + { + "epoch": 0.04902940336854125, + "grad_norm": 2.2598092555999756, + "learning_rate": 4.970409043909714e-05, + "loss": 5.7451, + "step": 8244 + }, + { + "epoch": 0.04903535065182225, + "grad_norm": 2.3063299655914307, + "learning_rate": 4.970401878018745e-05, + "loss": 5.8675, + "step": 8245 + }, + { + "epoch": 0.04904129793510324, + "grad_norm": 2.1543853282928467, + "learning_rate": 4.9703947112653836e-05, + "loss": 5.9136, + "step": 8246 + }, + { + "epoch": 0.049047245218384244, + "grad_norm": 2.267531633377075, + "learning_rate": 4.970387543649634e-05, + "loss": 5.6834, + "step": 8247 + }, + { + "epoch": 0.04905319250166524, + "grad_norm": 2.047351121902466, + "learning_rate": 4.970380375171496e-05, + "loss": 5.5754, + "step": 8248 + }, + { + "epoch": 0.049059139784946235, + "grad_norm": 2.2565114498138428, + "learning_rate": 4.9703732058309745e-05, + "loss": 5.7067, + "step": 8249 + }, + { + "epoch": 0.04906508706822724, + "grad_norm": 1.7584022283554077, + "learning_rate": 4.970366035628073e-05, + "loss": 5.3926, + "step": 8250 + }, + { + "epoch": 0.04907103435150823, + "grad_norm": 1.9898183345794678, + "learning_rate": 4.9703588645627896e-05, + "loss": 5.7163, + "step": 8251 + }, + { + "epoch": 0.04907698163478923, + "grad_norm": 2.4134786128997803, + "learning_rate": 4.970351692635131e-05, + "loss": 5.672, + "step": 8252 + }, + { + "epoch": 0.04908292891807023, + "grad_norm": 2.1059436798095703, + "learning_rate": 4.970344519845097e-05, + "loss": 5.7719, + "step": 8253 + }, + { + "epoch": 0.049088876201351224, + "grad_norm": 2.0731539726257324, + "learning_rate": 4.970337346192692e-05, + "loss": 5.7104, + "step": 8254 + }, + { + "epoch": 0.04909482348463222, + "grad_norm": 2.3058536052703857, + "learning_rate": 4.970330171677918e-05, + "loss": 5.7435, + "step": 8255 + }, + { + "epoch": 0.049100770767913214, + "grad_norm": 2.051424980163574, + "learning_rate": 4.970322996300777e-05, + "loss": 5.7371, + "step": 8256 + }, + { + "epoch": 0.049106718051194216, + "grad_norm": 2.1715517044067383, + "learning_rate": 4.970315820061271e-05, + "loss": 5.5805, + "step": 8257 + }, + { + "epoch": 0.04911266533447521, + "grad_norm": 2.136617422103882, + "learning_rate": 4.9703086429594034e-05, + "loss": 5.8689, + "step": 8258 + }, + { + "epoch": 0.049118612617756206, + "grad_norm": 1.7089059352874756, + "learning_rate": 4.970301464995178e-05, + "loss": 6.0614, + "step": 8259 + }, + { + "epoch": 0.04912455990103721, + "grad_norm": 2.410067319869995, + "learning_rate": 4.970294286168595e-05, + "loss": 5.8762, + "step": 8260 + }, + { + "epoch": 0.0491305071843182, + "grad_norm": 2.2186291217803955, + "learning_rate": 4.970287106479657e-05, + "loss": 5.4903, + "step": 8261 + }, + { + "epoch": 0.0491364544675992, + "grad_norm": 2.312793016433716, + "learning_rate": 4.970279925928368e-05, + "loss": 6.2488, + "step": 8262 + }, + { + "epoch": 0.0491424017508802, + "grad_norm": 2.127859354019165, + "learning_rate": 4.9702727445147305e-05, + "loss": 5.9976, + "step": 8263 + }, + { + "epoch": 0.049148349034161196, + "grad_norm": 2.604367733001709, + "learning_rate": 4.9702655622387454e-05, + "loss": 5.4153, + "step": 8264 + }, + { + "epoch": 0.04915429631744219, + "grad_norm": 1.7832142114639282, + "learning_rate": 4.9702583791004165e-05, + "loss": 5.4024, + "step": 8265 + }, + { + "epoch": 0.04916024360072319, + "grad_norm": 2.04298734664917, + "learning_rate": 4.970251195099746e-05, + "loss": 5.7034, + "step": 8266 + }, + { + "epoch": 0.04916619088400419, + "grad_norm": 2.1806769371032715, + "learning_rate": 4.970244010236736e-05, + "loss": 6.1212, + "step": 8267 + }, + { + "epoch": 0.04917213816728518, + "grad_norm": 1.8740427494049072, + "learning_rate": 4.970236824511389e-05, + "loss": 5.7562, + "step": 8268 + }, + { + "epoch": 0.04917808545056618, + "grad_norm": 1.7718658447265625, + "learning_rate": 4.970229637923709e-05, + "loss": 5.5126, + "step": 8269 + }, + { + "epoch": 0.04918403273384718, + "grad_norm": 1.4966565370559692, + "learning_rate": 4.970222450473696e-05, + "loss": 5.5422, + "step": 8270 + }, + { + "epoch": 0.049189980017128175, + "grad_norm": 1.8283390998840332, + "learning_rate": 4.970215262161355e-05, + "loss": 5.9333, + "step": 8271 + }, + { + "epoch": 0.04919592730040917, + "grad_norm": 2.087460517883301, + "learning_rate": 4.970208072986687e-05, + "loss": 5.5413, + "step": 8272 + }, + { + "epoch": 0.04920187458369017, + "grad_norm": 2.2952873706817627, + "learning_rate": 4.970200882949694e-05, + "loss": 5.7848, + "step": 8273 + }, + { + "epoch": 0.04920782186697117, + "grad_norm": 1.9511842727661133, + "learning_rate": 4.9701936920503804e-05, + "loss": 5.6172, + "step": 8274 + }, + { + "epoch": 0.04921376915025216, + "grad_norm": 1.992211937904358, + "learning_rate": 4.970186500288748e-05, + "loss": 5.48, + "step": 8275 + }, + { + "epoch": 0.049219716433533164, + "grad_norm": 1.739013910293579, + "learning_rate": 4.9701793076647984e-05, + "loss": 5.6351, + "step": 8276 + }, + { + "epoch": 0.04922566371681416, + "grad_norm": 2.150797128677368, + "learning_rate": 4.970172114178534e-05, + "loss": 5.5957, + "step": 8277 + }, + { + "epoch": 0.049231611000095155, + "grad_norm": 2.074070930480957, + "learning_rate": 4.9701649198299594e-05, + "loss": 5.4751, + "step": 8278 + }, + { + "epoch": 0.04923755828337616, + "grad_norm": 2.2276322841644287, + "learning_rate": 4.970157724619075e-05, + "loss": 5.4434, + "step": 8279 + }, + { + "epoch": 0.04924350556665715, + "grad_norm": 1.9707896709442139, + "learning_rate": 4.970150528545884e-05, + "loss": 5.6935, + "step": 8280 + }, + { + "epoch": 0.04924945284993815, + "grad_norm": 2.07774019241333, + "learning_rate": 4.9701433316103895e-05, + "loss": 6.0455, + "step": 8281 + }, + { + "epoch": 0.04925540013321915, + "grad_norm": 2.3262722492218018, + "learning_rate": 4.970136133812593e-05, + "loss": 5.6039, + "step": 8282 + }, + { + "epoch": 0.049261347416500144, + "grad_norm": 2.4353108406066895, + "learning_rate": 4.970128935152498e-05, + "loss": 5.3823, + "step": 8283 + }, + { + "epoch": 0.04926729469978114, + "grad_norm": 2.7383084297180176, + "learning_rate": 4.970121735630106e-05, + "loss": 5.4039, + "step": 8284 + }, + { + "epoch": 0.049273241983062134, + "grad_norm": 2.9022698402404785, + "learning_rate": 4.9701145352454205e-05, + "loss": 5.3571, + "step": 8285 + }, + { + "epoch": 0.049279189266343136, + "grad_norm": 2.314373731613159, + "learning_rate": 4.970107333998443e-05, + "loss": 5.4877, + "step": 8286 + }, + { + "epoch": 0.04928513654962413, + "grad_norm": 1.9494023323059082, + "learning_rate": 4.970100131889177e-05, + "loss": 5.5171, + "step": 8287 + }, + { + "epoch": 0.049291083832905126, + "grad_norm": 2.7892074584960938, + "learning_rate": 4.9700929289176245e-05, + "loss": 5.5347, + "step": 8288 + }, + { + "epoch": 0.04929703111618613, + "grad_norm": 2.305204391479492, + "learning_rate": 4.970085725083788e-05, + "loss": 5.8689, + "step": 8289 + }, + { + "epoch": 0.04930297839946712, + "grad_norm": 2.4212634563446045, + "learning_rate": 4.97007852038767e-05, + "loss": 5.8982, + "step": 8290 + }, + { + "epoch": 0.04930892568274812, + "grad_norm": 3.584625482559204, + "learning_rate": 4.9700713148292734e-05, + "loss": 5.2341, + "step": 8291 + }, + { + "epoch": 0.04931487296602912, + "grad_norm": 2.874703884124756, + "learning_rate": 4.9700641084086e-05, + "loss": 5.2312, + "step": 8292 + }, + { + "epoch": 0.049320820249310116, + "grad_norm": 2.113234519958496, + "learning_rate": 4.9700569011256524e-05, + "loss": 5.5779, + "step": 8293 + }, + { + "epoch": 0.04932676753259111, + "grad_norm": 3.027318000793457, + "learning_rate": 4.970049692980434e-05, + "loss": 5.3899, + "step": 8294 + }, + { + "epoch": 0.04933271481587211, + "grad_norm": 2.779520273208618, + "learning_rate": 4.970042483972947e-05, + "loss": 5.4023, + "step": 8295 + }, + { + "epoch": 0.04933866209915311, + "grad_norm": 2.4358251094818115, + "learning_rate": 4.970035274103193e-05, + "loss": 5.4932, + "step": 8296 + }, + { + "epoch": 0.0493446093824341, + "grad_norm": 1.926193118095398, + "learning_rate": 4.970028063371176e-05, + "loss": 5.4058, + "step": 8297 + }, + { + "epoch": 0.0493505566657151, + "grad_norm": 1.7216569185256958, + "learning_rate": 4.970020851776898e-05, + "loss": 5.3265, + "step": 8298 + }, + { + "epoch": 0.0493565039489961, + "grad_norm": 1.9850976467132568, + "learning_rate": 4.97001363932036e-05, + "loss": 5.1626, + "step": 8299 + }, + { + "epoch": 0.049362451232277095, + "grad_norm": 2.1380982398986816, + "learning_rate": 4.9700064260015666e-05, + "loss": 5.3285, + "step": 8300 + }, + { + "epoch": 0.04936839851555809, + "grad_norm": 2.118781566619873, + "learning_rate": 4.969999211820518e-05, + "loss": 5.3544, + "step": 8301 + }, + { + "epoch": 0.04937434579883909, + "grad_norm": 2.0255584716796875, + "learning_rate": 4.96999199677722e-05, + "loss": 5.4256, + "step": 8302 + }, + { + "epoch": 0.04938029308212009, + "grad_norm": 2.0269806385040283, + "learning_rate": 4.9699847808716724e-05, + "loss": 5.9744, + "step": 8303 + }, + { + "epoch": 0.04938624036540108, + "grad_norm": 2.60446834564209, + "learning_rate": 4.969977564103879e-05, + "loss": 5.3926, + "step": 8304 + }, + { + "epoch": 0.049392187648682084, + "grad_norm": 2.1011881828308105, + "learning_rate": 4.9699703464738426e-05, + "loss": 5.4278, + "step": 8305 + }, + { + "epoch": 0.04939813493196308, + "grad_norm": 1.9267319440841675, + "learning_rate": 4.969963127981564e-05, + "loss": 5.6232, + "step": 8306 + }, + { + "epoch": 0.049404082215244075, + "grad_norm": 2.1958322525024414, + "learning_rate": 4.969955908627048e-05, + "loss": 5.8577, + "step": 8307 + }, + { + "epoch": 0.049410029498525077, + "grad_norm": 2.392241954803467, + "learning_rate": 4.969948688410294e-05, + "loss": 5.8013, + "step": 8308 + }, + { + "epoch": 0.04941597678180607, + "grad_norm": 2.8284695148468018, + "learning_rate": 4.969941467331308e-05, + "loss": 6.1246, + "step": 8309 + }, + { + "epoch": 0.04942192406508707, + "grad_norm": 2.8590078353881836, + "learning_rate": 4.96993424539009e-05, + "loss": 6.1068, + "step": 8310 + }, + { + "epoch": 0.04942787134836807, + "grad_norm": 1.876207709312439, + "learning_rate": 4.969927022586644e-05, + "loss": 5.5493, + "step": 8311 + }, + { + "epoch": 0.049433818631649064, + "grad_norm": 1.988061547279358, + "learning_rate": 4.969919798920972e-05, + "loss": 5.7059, + "step": 8312 + }, + { + "epoch": 0.04943976591493006, + "grad_norm": 2.8230605125427246, + "learning_rate": 4.969912574393077e-05, + "loss": 5.9381, + "step": 8313 + }, + { + "epoch": 0.049445713198211054, + "grad_norm": 2.4622697830200195, + "learning_rate": 4.96990534900296e-05, + "loss": 6.0935, + "step": 8314 + }, + { + "epoch": 0.049451660481492056, + "grad_norm": 2.0811798572540283, + "learning_rate": 4.9698981227506254e-05, + "loss": 6.3475, + "step": 8315 + }, + { + "epoch": 0.04945760776477305, + "grad_norm": 2.099489212036133, + "learning_rate": 4.9698908956360745e-05, + "loss": 5.7266, + "step": 8316 + }, + { + "epoch": 0.049463555048054046, + "grad_norm": 2.1711854934692383, + "learning_rate": 4.9698836676593104e-05, + "loss": 5.6067, + "step": 8317 + }, + { + "epoch": 0.04946950233133505, + "grad_norm": 2.195296287536621, + "learning_rate": 4.969876438820335e-05, + "loss": 5.3896, + "step": 8318 + }, + { + "epoch": 0.04947544961461604, + "grad_norm": 2.114830255508423, + "learning_rate": 4.969869209119151e-05, + "loss": 5.6922, + "step": 8319 + }, + { + "epoch": 0.04948139689789704, + "grad_norm": 2.1534018516540527, + "learning_rate": 4.969861978555762e-05, + "loss": 6.1372, + "step": 8320 + }, + { + "epoch": 0.04948734418117804, + "grad_norm": 2.151495933532715, + "learning_rate": 4.9698547471301696e-05, + "loss": 6.0915, + "step": 8321 + }, + { + "epoch": 0.049493291464459035, + "grad_norm": 1.8232096433639526, + "learning_rate": 4.9698475148423764e-05, + "loss": 6.1492, + "step": 8322 + }, + { + "epoch": 0.04949923874774003, + "grad_norm": 2.1538467407226562, + "learning_rate": 4.9698402816923844e-05, + "loss": 5.6253, + "step": 8323 + }, + { + "epoch": 0.04950518603102103, + "grad_norm": 2.278797149658203, + "learning_rate": 4.969833047680197e-05, + "loss": 6.0055, + "step": 8324 + }, + { + "epoch": 0.04951113331430203, + "grad_norm": 2.479342460632324, + "learning_rate": 4.9698258128058164e-05, + "loss": 5.7909, + "step": 8325 + }, + { + "epoch": 0.04951708059758302, + "grad_norm": 2.2959346771240234, + "learning_rate": 4.969818577069245e-05, + "loss": 5.6888, + "step": 8326 + }, + { + "epoch": 0.04952302788086402, + "grad_norm": 1.841544270515442, + "learning_rate": 4.969811340470486e-05, + "loss": 5.5091, + "step": 8327 + }, + { + "epoch": 0.04952897516414502, + "grad_norm": 2.4512903690338135, + "learning_rate": 4.969804103009541e-05, + "loss": 5.7271, + "step": 8328 + }, + { + "epoch": 0.049534922447426015, + "grad_norm": 2.035473585128784, + "learning_rate": 4.969796864686413e-05, + "loss": 5.3056, + "step": 8329 + }, + { + "epoch": 0.04954086973070701, + "grad_norm": 2.030576705932617, + "learning_rate": 4.9697896255011046e-05, + "loss": 5.2765, + "step": 8330 + }, + { + "epoch": 0.04954681701398801, + "grad_norm": 1.680253505706787, + "learning_rate": 4.9697823854536175e-05, + "loss": 5.1968, + "step": 8331 + }, + { + "epoch": 0.04955276429726901, + "grad_norm": 1.962259292602539, + "learning_rate": 4.969775144543955e-05, + "loss": 5.0743, + "step": 8332 + }, + { + "epoch": 0.04955871158055, + "grad_norm": 2.499044895172119, + "learning_rate": 4.96976790277212e-05, + "loss": 5.5204, + "step": 8333 + }, + { + "epoch": 0.049564658863831004, + "grad_norm": 2.004849672317505, + "learning_rate": 4.969760660138114e-05, + "loss": 5.5714, + "step": 8334 + }, + { + "epoch": 0.049570606147112, + "grad_norm": 2.255171775817871, + "learning_rate": 4.9697534166419405e-05, + "loss": 5.0766, + "step": 8335 + }, + { + "epoch": 0.049576553430392994, + "grad_norm": 2.1219112873077393, + "learning_rate": 4.969746172283601e-05, + "loss": 5.0613, + "step": 8336 + }, + { + "epoch": 0.049582500713673996, + "grad_norm": 1.9718400239944458, + "learning_rate": 4.9697389270631004e-05, + "loss": 5.0007, + "step": 8337 + }, + { + "epoch": 0.04958844799695499, + "grad_norm": 1.87917160987854, + "learning_rate": 4.969731680980437e-05, + "loss": 4.9533, + "step": 8338 + }, + { + "epoch": 0.04959439528023599, + "grad_norm": 1.9610000848770142, + "learning_rate": 4.969724434035618e-05, + "loss": 4.9761, + "step": 8339 + }, + { + "epoch": 0.04960034256351699, + "grad_norm": 1.859434723854065, + "learning_rate": 4.969717186228642e-05, + "loss": 5.2373, + "step": 8340 + }, + { + "epoch": 0.049606289846797984, + "grad_norm": 1.9905357360839844, + "learning_rate": 4.9697099375595144e-05, + "loss": 4.8858, + "step": 8341 + }, + { + "epoch": 0.04961223713007898, + "grad_norm": 1.995355486869812, + "learning_rate": 4.969702688028236e-05, + "loss": 4.9468, + "step": 8342 + }, + { + "epoch": 0.049618184413359974, + "grad_norm": 1.9970706701278687, + "learning_rate": 4.96969543763481e-05, + "loss": 4.8891, + "step": 8343 + }, + { + "epoch": 0.049624131696640976, + "grad_norm": 1.9036997556686401, + "learning_rate": 4.9696881863792385e-05, + "loss": 4.7622, + "step": 8344 + }, + { + "epoch": 0.04963007897992197, + "grad_norm": 1.9532603025436401, + "learning_rate": 4.9696809342615245e-05, + "loss": 4.7832, + "step": 8345 + }, + { + "epoch": 0.049636026263202966, + "grad_norm": 1.9032143354415894, + "learning_rate": 4.969673681281671e-05, + "loss": 4.7569, + "step": 8346 + }, + { + "epoch": 0.04964197354648397, + "grad_norm": 3.4294323921203613, + "learning_rate": 4.96966642743968e-05, + "loss": 5.9381, + "step": 8347 + }, + { + "epoch": 0.04964792082976496, + "grad_norm": 4.137698173522949, + "learning_rate": 4.969659172735554e-05, + "loss": 6.4081, + "step": 8348 + }, + { + "epoch": 0.04965386811304596, + "grad_norm": 2.774838447570801, + "learning_rate": 4.969651917169295e-05, + "loss": 5.9888, + "step": 8349 + }, + { + "epoch": 0.04965981539632696, + "grad_norm": 2.4056432247161865, + "learning_rate": 4.9696446607409054e-05, + "loss": 6.1239, + "step": 8350 + }, + { + "epoch": 0.049665762679607955, + "grad_norm": 2.098475456237793, + "learning_rate": 4.969637403450389e-05, + "loss": 6.4226, + "step": 8351 + }, + { + "epoch": 0.04967170996288895, + "grad_norm": 2.1402597427368164, + "learning_rate": 4.9696301452977475e-05, + "loss": 5.8836, + "step": 8352 + }, + { + "epoch": 0.04967765724616995, + "grad_norm": 2.8023130893707275, + "learning_rate": 4.9696228862829844e-05, + "loss": 6.2452, + "step": 8353 + }, + { + "epoch": 0.04968360452945095, + "grad_norm": 2.7669503688812256, + "learning_rate": 4.9696156264061e-05, + "loss": 6.0093, + "step": 8354 + }, + { + "epoch": 0.04968955181273194, + "grad_norm": 2.2357375621795654, + "learning_rate": 4.9696083656671e-05, + "loss": 6.0614, + "step": 8355 + }, + { + "epoch": 0.049695499096012945, + "grad_norm": 2.1435539722442627, + "learning_rate": 4.969601104065984e-05, + "loss": 6.0718, + "step": 8356 + }, + { + "epoch": 0.04970144637929394, + "grad_norm": 2.6372897624969482, + "learning_rate": 4.969593841602757e-05, + "loss": 5.4878, + "step": 8357 + }, + { + "epoch": 0.049707393662574935, + "grad_norm": 1.9730110168457031, + "learning_rate": 4.9695865782774186e-05, + "loss": 5.8913, + "step": 8358 + }, + { + "epoch": 0.04971334094585593, + "grad_norm": 2.262437105178833, + "learning_rate": 4.9695793140899737e-05, + "loss": 5.0382, + "step": 8359 + }, + { + "epoch": 0.04971928822913693, + "grad_norm": 1.794268250465393, + "learning_rate": 4.9695720490404254e-05, + "loss": 5.784, + "step": 8360 + }, + { + "epoch": 0.04972523551241793, + "grad_norm": 1.9568414688110352, + "learning_rate": 4.969564783128773e-05, + "loss": 5.8939, + "step": 8361 + }, + { + "epoch": 0.04973118279569892, + "grad_norm": 2.0560479164123535, + "learning_rate": 4.969557516355022e-05, + "loss": 5.8806, + "step": 8362 + }, + { + "epoch": 0.049737130078979924, + "grad_norm": 1.9009175300598145, + "learning_rate": 4.9695502487191746e-05, + "loss": 5.5568, + "step": 8363 + }, + { + "epoch": 0.04974307736226092, + "grad_norm": 2.1240882873535156, + "learning_rate": 4.9695429802212325e-05, + "loss": 5.4514, + "step": 8364 + }, + { + "epoch": 0.049749024645541914, + "grad_norm": 2.0803675651550293, + "learning_rate": 4.969535710861198e-05, + "loss": 5.7679, + "step": 8365 + }, + { + "epoch": 0.049754971928822916, + "grad_norm": 1.9357428550720215, + "learning_rate": 4.969528440639074e-05, + "loss": 6.1658, + "step": 8366 + }, + { + "epoch": 0.04976091921210391, + "grad_norm": 1.89462411403656, + "learning_rate": 4.9695211695548635e-05, + "loss": 6.0559, + "step": 8367 + }, + { + "epoch": 0.04976686649538491, + "grad_norm": 1.5986123085021973, + "learning_rate": 4.969513897608569e-05, + "loss": 5.7787, + "step": 8368 + }, + { + "epoch": 0.04977281377866591, + "grad_norm": 2.0391738414764404, + "learning_rate": 4.969506624800192e-05, + "loss": 5.5559, + "step": 8369 + }, + { + "epoch": 0.049778761061946904, + "grad_norm": 2.1463794708251953, + "learning_rate": 4.969499351129736e-05, + "loss": 5.5734, + "step": 8370 + }, + { + "epoch": 0.0497847083452279, + "grad_norm": 2.1488826274871826, + "learning_rate": 4.969492076597203e-05, + "loss": 5.7502, + "step": 8371 + }, + { + "epoch": 0.049790655628508894, + "grad_norm": 2.214439868927002, + "learning_rate": 4.9694848012025966e-05, + "loss": 5.8829, + "step": 8372 + }, + { + "epoch": 0.049796602911789896, + "grad_norm": 2.366196632385254, + "learning_rate": 4.969477524945918e-05, + "loss": 5.3428, + "step": 8373 + }, + { + "epoch": 0.04980255019507089, + "grad_norm": 2.239044189453125, + "learning_rate": 4.96947024782717e-05, + "loss": 5.7258, + "step": 8374 + }, + { + "epoch": 0.049808497478351886, + "grad_norm": 2.315492868423462, + "learning_rate": 4.9694629698463554e-05, + "loss": 5.6542, + "step": 8375 + }, + { + "epoch": 0.04981444476163289, + "grad_norm": 2.340740919113159, + "learning_rate": 4.969455691003478e-05, + "loss": 5.0699, + "step": 8376 + }, + { + "epoch": 0.04982039204491388, + "grad_norm": 2.644800901412964, + "learning_rate": 4.9694484112985386e-05, + "loss": 5.3808, + "step": 8377 + }, + { + "epoch": 0.04982633932819488, + "grad_norm": 2.7073781490325928, + "learning_rate": 4.96944113073154e-05, + "loss": 5.5233, + "step": 8378 + }, + { + "epoch": 0.04983228661147588, + "grad_norm": 2.5480713844299316, + "learning_rate": 4.969433849302485e-05, + "loss": 5.3908, + "step": 8379 + }, + { + "epoch": 0.049838233894756875, + "grad_norm": 2.494356155395508, + "learning_rate": 4.969426567011376e-05, + "loss": 5.3528, + "step": 8380 + }, + { + "epoch": 0.04984418117803787, + "grad_norm": 2.4249942302703857, + "learning_rate": 4.9694192838582155e-05, + "loss": 5.2995, + "step": 8381 + }, + { + "epoch": 0.04985012846131887, + "grad_norm": 2.5930840969085693, + "learning_rate": 4.9694119998430066e-05, + "loss": 6.0202, + "step": 8382 + }, + { + "epoch": 0.04985607574459987, + "grad_norm": 2.391972541809082, + "learning_rate": 4.969404714965752e-05, + "loss": 6.0247, + "step": 8383 + }, + { + "epoch": 0.04986202302788086, + "grad_norm": 2.2849159240722656, + "learning_rate": 4.9693974292264535e-05, + "loss": 5.892, + "step": 8384 + }, + { + "epoch": 0.049867970311161865, + "grad_norm": 2.1887097358703613, + "learning_rate": 4.9693901426251134e-05, + "loss": 6.0196, + "step": 8385 + }, + { + "epoch": 0.04987391759444286, + "grad_norm": 2.3988685607910156, + "learning_rate": 4.969382855161735e-05, + "loss": 5.5596, + "step": 8386 + }, + { + "epoch": 0.049879864877723855, + "grad_norm": 2.675144910812378, + "learning_rate": 4.9693755668363204e-05, + "loss": 5.3495, + "step": 8387 + }, + { + "epoch": 0.04988581216100485, + "grad_norm": 2.3753585815429688, + "learning_rate": 4.969368277648873e-05, + "loss": 5.8823, + "step": 8388 + }, + { + "epoch": 0.04989175944428585, + "grad_norm": 2.3168766498565674, + "learning_rate": 4.969360987599394e-05, + "loss": 5.9768, + "step": 8389 + }, + { + "epoch": 0.04989770672756685, + "grad_norm": 2.427138566970825, + "learning_rate": 4.969353696687886e-05, + "loss": 6.1823, + "step": 8390 + }, + { + "epoch": 0.04990365401084784, + "grad_norm": 2.304731845855713, + "learning_rate": 4.9693464049143526e-05, + "loss": 5.8697, + "step": 8391 + }, + { + "epoch": 0.049909601294128844, + "grad_norm": 2.2139687538146973, + "learning_rate": 4.9693391122787966e-05, + "loss": 6.0274, + "step": 8392 + }, + { + "epoch": 0.04991554857740984, + "grad_norm": 2.1165316104888916, + "learning_rate": 4.9693318187812185e-05, + "loss": 5.2499, + "step": 8393 + }, + { + "epoch": 0.049921495860690834, + "grad_norm": 2.5213639736175537, + "learning_rate": 4.969324524421624e-05, + "loss": 4.9105, + "step": 8394 + }, + { + "epoch": 0.049927443143971836, + "grad_norm": 2.2188315391540527, + "learning_rate": 4.9693172292000125e-05, + "loss": 4.8652, + "step": 8395 + }, + { + "epoch": 0.04993339042725283, + "grad_norm": 2.393179416656494, + "learning_rate": 4.9693099331163886e-05, + "loss": 4.924, + "step": 8396 + }, + { + "epoch": 0.04993933771053383, + "grad_norm": 2.150264024734497, + "learning_rate": 4.969302636170753e-05, + "loss": 4.9168, + "step": 8397 + }, + { + "epoch": 0.04994528499381483, + "grad_norm": 2.252499580383301, + "learning_rate": 4.96929533836311e-05, + "loss": 4.7822, + "step": 8398 + }, + { + "epoch": 0.049951232277095824, + "grad_norm": 2.342132806777954, + "learning_rate": 4.969288039693461e-05, + "loss": 5.3691, + "step": 8399 + }, + { + "epoch": 0.04995717956037682, + "grad_norm": 2.3533523082733154, + "learning_rate": 4.96928074016181e-05, + "loss": 5.9989, + "step": 8400 + }, + { + "epoch": 0.049963126843657814, + "grad_norm": 2.185727834701538, + "learning_rate": 4.969273439768158e-05, + "loss": 5.6101, + "step": 8401 + }, + { + "epoch": 0.049969074126938816, + "grad_norm": 2.3396189212799072, + "learning_rate": 4.969266138512509e-05, + "loss": 5.845, + "step": 8402 + }, + { + "epoch": 0.04997502141021981, + "grad_norm": 2.2145371437072754, + "learning_rate": 4.969258836394864e-05, + "loss": 5.6657, + "step": 8403 + }, + { + "epoch": 0.049980968693500806, + "grad_norm": 2.2084364891052246, + "learning_rate": 4.969251533415226e-05, + "loss": 5.8823, + "step": 8404 + }, + { + "epoch": 0.04998691597678181, + "grad_norm": 1.7423903942108154, + "learning_rate": 4.9692442295735984e-05, + "loss": 5.8209, + "step": 8405 + }, + { + "epoch": 0.0499928632600628, + "grad_norm": 2.3057217597961426, + "learning_rate": 4.9692369248699824e-05, + "loss": 5.8352, + "step": 8406 + }, + { + "epoch": 0.0499988105433438, + "grad_norm": 2.1800148487091064, + "learning_rate": 4.969229619304382e-05, + "loss": 5.783, + "step": 8407 + }, + { + "epoch": 0.0500047578266248, + "grad_norm": 1.8594306707382202, + "learning_rate": 4.969222312876799e-05, + "loss": 6.01, + "step": 8408 + }, + { + "epoch": 0.050010705109905795, + "grad_norm": 2.119917392730713, + "learning_rate": 4.9692150055872355e-05, + "loss": 5.7282, + "step": 8409 + }, + { + "epoch": 0.05001665239318679, + "grad_norm": 2.5282747745513916, + "learning_rate": 4.969207697435695e-05, + "loss": 5.0853, + "step": 8410 + }, + { + "epoch": 0.05002259967646779, + "grad_norm": 2.5683388710021973, + "learning_rate": 4.969200388422179e-05, + "loss": 4.9841, + "step": 8411 + }, + { + "epoch": 0.05002854695974879, + "grad_norm": 2.649918794631958, + "learning_rate": 4.969193078546692e-05, + "loss": 5.6365, + "step": 8412 + }, + { + "epoch": 0.05003449424302978, + "grad_norm": 2.3040120601654053, + "learning_rate": 4.969185767809234e-05, + "loss": 5.8272, + "step": 8413 + }, + { + "epoch": 0.050040441526310785, + "grad_norm": 2.033600330352783, + "learning_rate": 4.9691784562098084e-05, + "loss": 5.9779, + "step": 8414 + }, + { + "epoch": 0.05004638880959178, + "grad_norm": 2.1903419494628906, + "learning_rate": 4.96917114374842e-05, + "loss": 5.8651, + "step": 8415 + }, + { + "epoch": 0.050052336092872775, + "grad_norm": 2.4431047439575195, + "learning_rate": 4.969163830425068e-05, + "loss": 4.7787, + "step": 8416 + }, + { + "epoch": 0.05005828337615377, + "grad_norm": 2.6652824878692627, + "learning_rate": 4.969156516239756e-05, + "loss": 4.7133, + "step": 8417 + }, + { + "epoch": 0.05006423065943477, + "grad_norm": 2.4090182781219482, + "learning_rate": 4.969149201192488e-05, + "loss": 4.4506, + "step": 8418 + }, + { + "epoch": 0.05007017794271577, + "grad_norm": 2.5310218334198, + "learning_rate": 4.969141885283265e-05, + "loss": 4.5286, + "step": 8419 + }, + { + "epoch": 0.05007612522599676, + "grad_norm": 2.5333101749420166, + "learning_rate": 4.9691345685120905e-05, + "loss": 4.6012, + "step": 8420 + }, + { + "epoch": 0.050082072509277764, + "grad_norm": 2.172724485397339, + "learning_rate": 4.9691272508789665e-05, + "loss": 4.9161, + "step": 8421 + }, + { + "epoch": 0.05008801979255876, + "grad_norm": 2.034684181213379, + "learning_rate": 4.969119932383896e-05, + "loss": 5.3105, + "step": 8422 + }, + { + "epoch": 0.050093967075839754, + "grad_norm": 1.9046155214309692, + "learning_rate": 4.969112613026881e-05, + "loss": 5.4308, + "step": 8423 + }, + { + "epoch": 0.050099914359120756, + "grad_norm": 1.7256773710250854, + "learning_rate": 4.9691052928079226e-05, + "loss": 5.2232, + "step": 8424 + }, + { + "epoch": 0.05010586164240175, + "grad_norm": 2.0075321197509766, + "learning_rate": 4.969097971727027e-05, + "loss": 6.1764, + "step": 8425 + }, + { + "epoch": 0.050111808925682746, + "grad_norm": 2.1523852348327637, + "learning_rate": 4.9690906497841946e-05, + "loss": 5.8419, + "step": 8426 + }, + { + "epoch": 0.05011775620896375, + "grad_norm": 1.9675406217575073, + "learning_rate": 4.969083326979428e-05, + "loss": 5.7919, + "step": 8427 + }, + { + "epoch": 0.050123703492244744, + "grad_norm": 2.0327789783477783, + "learning_rate": 4.9690760033127295e-05, + "loss": 5.0232, + "step": 8428 + }, + { + "epoch": 0.05012965077552574, + "grad_norm": 1.677471399307251, + "learning_rate": 4.969068678784102e-05, + "loss": 5.1106, + "step": 8429 + }, + { + "epoch": 0.050135598058806734, + "grad_norm": 1.727847933769226, + "learning_rate": 4.9690613533935496e-05, + "loss": 5.1589, + "step": 8430 + }, + { + "epoch": 0.050141545342087736, + "grad_norm": 1.8167927265167236, + "learning_rate": 4.9690540271410726e-05, + "loss": 5.1207, + "step": 8431 + }, + { + "epoch": 0.05014749262536873, + "grad_norm": 2.277425527572632, + "learning_rate": 4.969046700026674e-05, + "loss": 5.6614, + "step": 8432 + }, + { + "epoch": 0.050153439908649726, + "grad_norm": 1.6471065282821655, + "learning_rate": 4.969039372050356e-05, + "loss": 5.2065, + "step": 8433 + }, + { + "epoch": 0.05015938719193073, + "grad_norm": 1.9049899578094482, + "learning_rate": 4.9690320432121226e-05, + "loss": 5.7453, + "step": 8434 + }, + { + "epoch": 0.05016533447521172, + "grad_norm": 1.9145495891571045, + "learning_rate": 4.969024713511976e-05, + "loss": 6.2207, + "step": 8435 + }, + { + "epoch": 0.05017128175849272, + "grad_norm": 1.6634061336517334, + "learning_rate": 4.969017382949918e-05, + "loss": 6.1694, + "step": 8436 + }, + { + "epoch": 0.05017722904177372, + "grad_norm": 1.9804925918579102, + "learning_rate": 4.969010051525952e-05, + "loss": 6.2917, + "step": 8437 + }, + { + "epoch": 0.050183176325054715, + "grad_norm": 1.9674698114395142, + "learning_rate": 4.969002719240079e-05, + "loss": 6.3105, + "step": 8438 + }, + { + "epoch": 0.05018912360833571, + "grad_norm": 2.1540520191192627, + "learning_rate": 4.968995386092303e-05, + "loss": 5.964, + "step": 8439 + }, + { + "epoch": 0.05019507089161671, + "grad_norm": 1.8545453548431396, + "learning_rate": 4.9689880520826274e-05, + "loss": 5.8744, + "step": 8440 + }, + { + "epoch": 0.05020101817489771, + "grad_norm": 1.8022514581680298, + "learning_rate": 4.968980717211053e-05, + "loss": 6.1547, + "step": 8441 + }, + { + "epoch": 0.0502069654581787, + "grad_norm": 1.6297475099563599, + "learning_rate": 4.968973381477582e-05, + "loss": 6.1397, + "step": 8442 + }, + { + "epoch": 0.050212912741459705, + "grad_norm": 1.6256400346755981, + "learning_rate": 4.968966044882219e-05, + "loss": 6.0529, + "step": 8443 + }, + { + "epoch": 0.0502188600247407, + "grad_norm": 1.5988365411758423, + "learning_rate": 4.968958707424965e-05, + "loss": 6.0653, + "step": 8444 + }, + { + "epoch": 0.050224807308021695, + "grad_norm": 1.7062568664550781, + "learning_rate": 4.968951369105823e-05, + "loss": 5.6761, + "step": 8445 + }, + { + "epoch": 0.05023075459130269, + "grad_norm": 2.6108970642089844, + "learning_rate": 4.968944029924796e-05, + "loss": 5.7222, + "step": 8446 + }, + { + "epoch": 0.05023670187458369, + "grad_norm": 2.2341887950897217, + "learning_rate": 4.9689366898818854e-05, + "loss": 6.057, + "step": 8447 + }, + { + "epoch": 0.05024264915786469, + "grad_norm": 2.1819159984588623, + "learning_rate": 4.968929348977095e-05, + "loss": 6.0386, + "step": 8448 + }, + { + "epoch": 0.05024859644114568, + "grad_norm": 1.9941349029541016, + "learning_rate": 4.968922007210427e-05, + "loss": 6.132, + "step": 8449 + }, + { + "epoch": 0.050254543724426684, + "grad_norm": 1.7330418825149536, + "learning_rate": 4.968914664581883e-05, + "loss": 6.0834, + "step": 8450 + }, + { + "epoch": 0.05026049100770768, + "grad_norm": 1.8946608304977417, + "learning_rate": 4.968907321091467e-05, + "loss": 5.9147, + "step": 8451 + }, + { + "epoch": 0.050266438290988674, + "grad_norm": 2.314767599105835, + "learning_rate": 4.9688999767391815e-05, + "loss": 5.7087, + "step": 8452 + }, + { + "epoch": 0.050272385574269676, + "grad_norm": 2.604673147201538, + "learning_rate": 4.968892631525028e-05, + "loss": 5.7348, + "step": 8453 + }, + { + "epoch": 0.05027833285755067, + "grad_norm": 2.3386125564575195, + "learning_rate": 4.9688852854490097e-05, + "loss": 5.7509, + "step": 8454 + }, + { + "epoch": 0.050284280140831666, + "grad_norm": 2.3919529914855957, + "learning_rate": 4.968877938511129e-05, + "loss": 5.5851, + "step": 8455 + }, + { + "epoch": 0.05029022742411267, + "grad_norm": 2.0978026390075684, + "learning_rate": 4.9688705907113886e-05, + "loss": 5.3663, + "step": 8456 + }, + { + "epoch": 0.050296174707393664, + "grad_norm": 2.1700327396392822, + "learning_rate": 4.9688632420497904e-05, + "loss": 6.0197, + "step": 8457 + }, + { + "epoch": 0.05030212199067466, + "grad_norm": 2.1657676696777344, + "learning_rate": 4.968855892526338e-05, + "loss": 6.1721, + "step": 8458 + }, + { + "epoch": 0.050308069273955654, + "grad_norm": 2.434732437133789, + "learning_rate": 4.968848542141033e-05, + "loss": 6.0217, + "step": 8459 + }, + { + "epoch": 0.050314016557236656, + "grad_norm": 1.8453216552734375, + "learning_rate": 4.96884119089388e-05, + "loss": 6.4071, + "step": 8460 + }, + { + "epoch": 0.05031996384051765, + "grad_norm": 1.930168628692627, + "learning_rate": 4.9688338387848784e-05, + "loss": 6.5024, + "step": 8461 + }, + { + "epoch": 0.050325911123798646, + "grad_norm": 2.1785950660705566, + "learning_rate": 4.968826485814033e-05, + "loss": 5.803, + "step": 8462 + }, + { + "epoch": 0.05033185840707965, + "grad_norm": 2.003187894821167, + "learning_rate": 4.968819131981346e-05, + "loss": 6.2269, + "step": 8463 + }, + { + "epoch": 0.05033780569036064, + "grad_norm": 2.9522452354431152, + "learning_rate": 4.9688117772868195e-05, + "loss": 5.5603, + "step": 8464 + }, + { + "epoch": 0.05034375297364164, + "grad_norm": 1.9813052415847778, + "learning_rate": 4.968804421730457e-05, + "loss": 6.0101, + "step": 8465 + }, + { + "epoch": 0.05034970025692264, + "grad_norm": 2.370225667953491, + "learning_rate": 4.9687970653122596e-05, + "loss": 6.3236, + "step": 8466 + }, + { + "epoch": 0.050355647540203635, + "grad_norm": 1.9233943223953247, + "learning_rate": 4.968789708032231e-05, + "loss": 6.2962, + "step": 8467 + }, + { + "epoch": 0.05036159482348463, + "grad_norm": 1.8740222454071045, + "learning_rate": 4.968782349890373e-05, + "loss": 5.5454, + "step": 8468 + }, + { + "epoch": 0.05036754210676563, + "grad_norm": 1.8627724647521973, + "learning_rate": 4.968774990886689e-05, + "loss": 5.9242, + "step": 8469 + }, + { + "epoch": 0.05037348939004663, + "grad_norm": 1.7016552686691284, + "learning_rate": 4.968767631021181e-05, + "loss": 6.3302, + "step": 8470 + }, + { + "epoch": 0.05037943667332762, + "grad_norm": 1.8826018571853638, + "learning_rate": 4.9687602702938515e-05, + "loss": 6.3308, + "step": 8471 + }, + { + "epoch": 0.050385383956608625, + "grad_norm": 1.777480959892273, + "learning_rate": 4.9687529087047036e-05, + "loss": 6.3948, + "step": 8472 + }, + { + "epoch": 0.05039133123988962, + "grad_norm": 2.10075306892395, + "learning_rate": 4.9687455462537396e-05, + "loss": 6.1615, + "step": 8473 + }, + { + "epoch": 0.050397278523170615, + "grad_norm": 2.3484537601470947, + "learning_rate": 4.9687381829409616e-05, + "loss": 5.8286, + "step": 8474 + }, + { + "epoch": 0.05040322580645161, + "grad_norm": 1.8243837356567383, + "learning_rate": 4.968730818766373e-05, + "loss": 6.014, + "step": 8475 + }, + { + "epoch": 0.05040917308973261, + "grad_norm": 1.8149470090866089, + "learning_rate": 4.9687234537299765e-05, + "loss": 5.9723, + "step": 8476 + }, + { + "epoch": 0.05041512037301361, + "grad_norm": 2.400754451751709, + "learning_rate": 4.968716087831773e-05, + "loss": 5.237, + "step": 8477 + }, + { + "epoch": 0.0504210676562946, + "grad_norm": 2.4394338130950928, + "learning_rate": 4.968708721071767e-05, + "loss": 5.1106, + "step": 8478 + }, + { + "epoch": 0.050427014939575604, + "grad_norm": 2.210686445236206, + "learning_rate": 4.96870135344996e-05, + "loss": 5.0002, + "step": 8479 + }, + { + "epoch": 0.0504329622228566, + "grad_norm": 2.302997589111328, + "learning_rate": 4.968693984966355e-05, + "loss": 5.689, + "step": 8480 + }, + { + "epoch": 0.050438909506137594, + "grad_norm": 2.0761525630950928, + "learning_rate": 4.9686866156209546e-05, + "loss": 5.4452, + "step": 8481 + }, + { + "epoch": 0.050444856789418596, + "grad_norm": 2.3239383697509766, + "learning_rate": 4.968679245413761e-05, + "loss": 5.4427, + "step": 8482 + }, + { + "epoch": 0.05045080407269959, + "grad_norm": 3.2064802646636963, + "learning_rate": 4.9686718743447766e-05, + "loss": 5.2947, + "step": 8483 + }, + { + "epoch": 0.050456751355980586, + "grad_norm": 2.680786371231079, + "learning_rate": 4.968664502414004e-05, + "loss": 5.4776, + "step": 8484 + }, + { + "epoch": 0.05046269863926159, + "grad_norm": 2.107583522796631, + "learning_rate": 4.9686571296214476e-05, + "loss": 5.5172, + "step": 8485 + }, + { + "epoch": 0.050468645922542583, + "grad_norm": 1.939788579940796, + "learning_rate": 4.9686497559671075e-05, + "loss": 5.6056, + "step": 8486 + }, + { + "epoch": 0.05047459320582358, + "grad_norm": 1.883991003036499, + "learning_rate": 4.968642381450987e-05, + "loss": 5.6511, + "step": 8487 + }, + { + "epoch": 0.050480540489104574, + "grad_norm": 1.8518444299697876, + "learning_rate": 4.96863500607309e-05, + "loss": 5.5897, + "step": 8488 + }, + { + "epoch": 0.050486487772385576, + "grad_norm": 1.6704350709915161, + "learning_rate": 4.968627629833418e-05, + "loss": 5.5002, + "step": 8489 + }, + { + "epoch": 0.05049243505566657, + "grad_norm": 1.755231261253357, + "learning_rate": 4.968620252731972e-05, + "loss": 5.6012, + "step": 8490 + }, + { + "epoch": 0.050498382338947566, + "grad_norm": 1.8532077074050903, + "learning_rate": 4.968612874768758e-05, + "loss": 5.4443, + "step": 8491 + }, + { + "epoch": 0.05050432962222857, + "grad_norm": 1.787781000137329, + "learning_rate": 4.9686054959437756e-05, + "loss": 5.5623, + "step": 8492 + }, + { + "epoch": 0.05051027690550956, + "grad_norm": 1.6963365077972412, + "learning_rate": 4.9685981162570295e-05, + "loss": 5.5349, + "step": 8493 + }, + { + "epoch": 0.05051622418879056, + "grad_norm": 4.328898906707764, + "learning_rate": 4.96859073570852e-05, + "loss": 5.8026, + "step": 8494 + }, + { + "epoch": 0.05052217147207156, + "grad_norm": 1.6906582117080688, + "learning_rate": 4.968583354298252e-05, + "loss": 5.4804, + "step": 8495 + }, + { + "epoch": 0.050528118755352555, + "grad_norm": 1.5316333770751953, + "learning_rate": 4.968575972026227e-05, + "loss": 5.6005, + "step": 8496 + }, + { + "epoch": 0.05053406603863355, + "grad_norm": 1.6029349565505981, + "learning_rate": 4.968568588892447e-05, + "loss": 5.5991, + "step": 8497 + }, + { + "epoch": 0.05054001332191455, + "grad_norm": 2.246537685394287, + "learning_rate": 4.968561204896916e-05, + "loss": 5.8537, + "step": 8498 + }, + { + "epoch": 0.05054596060519555, + "grad_norm": 2.0347564220428467, + "learning_rate": 4.9685538200396355e-05, + "loss": 5.7968, + "step": 8499 + }, + { + "epoch": 0.05055190788847654, + "grad_norm": 1.7635436058044434, + "learning_rate": 4.968546434320608e-05, + "loss": 5.6324, + "step": 8500 + }, + { + "epoch": 0.050557855171757544, + "grad_norm": 2.415397882461548, + "learning_rate": 4.9685390477398363e-05, + "loss": 5.3795, + "step": 8501 + }, + { + "epoch": 0.05056380245503854, + "grad_norm": 2.1499149799346924, + "learning_rate": 4.9685316602973245e-05, + "loss": 5.5638, + "step": 8502 + }, + { + "epoch": 0.050569749738319535, + "grad_norm": 2.0479557514190674, + "learning_rate": 4.9685242719930725e-05, + "loss": 5.3902, + "step": 8503 + }, + { + "epoch": 0.05057569702160053, + "grad_norm": 1.874993085861206, + "learning_rate": 4.9685168828270845e-05, + "loss": 5.4607, + "step": 8504 + }, + { + "epoch": 0.05058164430488153, + "grad_norm": 1.6361217498779297, + "learning_rate": 4.9685094927993623e-05, + "loss": 5.4378, + "step": 8505 + }, + { + "epoch": 0.05058759158816253, + "grad_norm": 1.598026990890503, + "learning_rate": 4.9685021019099096e-05, + "loss": 5.4336, + "step": 8506 + }, + { + "epoch": 0.05059353887144352, + "grad_norm": 1.7636823654174805, + "learning_rate": 4.968494710158728e-05, + "loss": 5.4757, + "step": 8507 + }, + { + "epoch": 0.050599486154724524, + "grad_norm": 1.7823325395584106, + "learning_rate": 4.968487317545821e-05, + "loss": 5.4872, + "step": 8508 + }, + { + "epoch": 0.05060543343800552, + "grad_norm": 2.39149808883667, + "learning_rate": 4.9684799240711896e-05, + "loss": 5.039, + "step": 8509 + }, + { + "epoch": 0.050611380721286514, + "grad_norm": 2.0295841693878174, + "learning_rate": 4.968472529734838e-05, + "loss": 5.1086, + "step": 8510 + }, + { + "epoch": 0.050617328004567516, + "grad_norm": 2.6830973625183105, + "learning_rate": 4.9684651345367684e-05, + "loss": 4.8889, + "step": 8511 + }, + { + "epoch": 0.05062327528784851, + "grad_norm": 2.3600027561187744, + "learning_rate": 4.9684577384769825e-05, + "loss": 5.5305, + "step": 8512 + }, + { + "epoch": 0.050629222571129506, + "grad_norm": 2.1680233478546143, + "learning_rate": 4.968450341555484e-05, + "loss": 5.8196, + "step": 8513 + }, + { + "epoch": 0.05063516985441051, + "grad_norm": 1.800645351409912, + "learning_rate": 4.968442943772275e-05, + "loss": 5.2689, + "step": 8514 + }, + { + "epoch": 0.0506411171376915, + "grad_norm": 1.983245849609375, + "learning_rate": 4.9684355451273566e-05, + "loss": 4.7782, + "step": 8515 + }, + { + "epoch": 0.0506470644209725, + "grad_norm": 2.12082576751709, + "learning_rate": 4.968428145620735e-05, + "loss": 4.7946, + "step": 8516 + }, + { + "epoch": 0.050653011704253494, + "grad_norm": 1.7249135971069336, + "learning_rate": 4.968420745252409e-05, + "loss": 4.7055, + "step": 8517 + }, + { + "epoch": 0.050658958987534496, + "grad_norm": 1.971240758895874, + "learning_rate": 4.968413344022384e-05, + "loss": 4.7343, + "step": 8518 + }, + { + "epoch": 0.05066490627081549, + "grad_norm": 1.780387282371521, + "learning_rate": 4.968405941930661e-05, + "loss": 4.7502, + "step": 8519 + }, + { + "epoch": 0.050670853554096486, + "grad_norm": 1.772007942199707, + "learning_rate": 4.968398538977242e-05, + "loss": 4.7439, + "step": 8520 + }, + { + "epoch": 0.05067680083737749, + "grad_norm": 1.9167592525482178, + "learning_rate": 4.9683911351621324e-05, + "loss": 4.6393, + "step": 8521 + }, + { + "epoch": 0.05068274812065848, + "grad_norm": 2.0527031421661377, + "learning_rate": 4.968383730485331e-05, + "loss": 4.6379, + "step": 8522 + }, + { + "epoch": 0.05068869540393948, + "grad_norm": 2.0608508586883545, + "learning_rate": 4.968376324946844e-05, + "loss": 4.6128, + "step": 8523 + }, + { + "epoch": 0.05069464268722048, + "grad_norm": 1.984731674194336, + "learning_rate": 4.968368918546672e-05, + "loss": 4.5969, + "step": 8524 + }, + { + "epoch": 0.050700589970501475, + "grad_norm": 1.7904438972473145, + "learning_rate": 4.968361511284817e-05, + "loss": 4.6853, + "step": 8525 + }, + { + "epoch": 0.05070653725378247, + "grad_norm": 1.8095389604568481, + "learning_rate": 4.968354103161283e-05, + "loss": 4.5748, + "step": 8526 + }, + { + "epoch": 0.05071248453706347, + "grad_norm": 1.8565012216567993, + "learning_rate": 4.968346694176073e-05, + "loss": 4.5249, + "step": 8527 + }, + { + "epoch": 0.05071843182034447, + "grad_norm": 1.7721836566925049, + "learning_rate": 4.968339284329188e-05, + "loss": 4.6593, + "step": 8528 + }, + { + "epoch": 0.05072437910362546, + "grad_norm": 1.9470161199569702, + "learning_rate": 4.968331873620631e-05, + "loss": 4.5432, + "step": 8529 + }, + { + "epoch": 0.050730326386906464, + "grad_norm": 1.8639118671417236, + "learning_rate": 4.968324462050404e-05, + "loss": 4.4464, + "step": 8530 + }, + { + "epoch": 0.05073627367018746, + "grad_norm": 1.9226467609405518, + "learning_rate": 4.9683170496185114e-05, + "loss": 4.4364, + "step": 8531 + }, + { + "epoch": 0.050742220953468455, + "grad_norm": 1.988198161125183, + "learning_rate": 4.9683096363249545e-05, + "loss": 4.6614, + "step": 8532 + }, + { + "epoch": 0.05074816823674945, + "grad_norm": 1.903645396232605, + "learning_rate": 4.9683022221697374e-05, + "loss": 4.5168, + "step": 8533 + }, + { + "epoch": 0.05075411552003045, + "grad_norm": 1.903448224067688, + "learning_rate": 4.96829480715286e-05, + "loss": 4.5899, + "step": 8534 + }, + { + "epoch": 0.05076006280331145, + "grad_norm": 1.864522099494934, + "learning_rate": 4.9682873912743274e-05, + "loss": 4.5896, + "step": 8535 + }, + { + "epoch": 0.05076601008659244, + "grad_norm": 1.8760302066802979, + "learning_rate": 4.9682799745341406e-05, + "loss": 4.593, + "step": 8536 + }, + { + "epoch": 0.050771957369873444, + "grad_norm": 1.9024009704589844, + "learning_rate": 4.968272556932303e-05, + "loss": 4.9861, + "step": 8537 + }, + { + "epoch": 0.05077790465315444, + "grad_norm": 2.190634250640869, + "learning_rate": 4.9682651384688176e-05, + "loss": 5.6755, + "step": 8538 + }, + { + "epoch": 0.050783851936435434, + "grad_norm": 1.758934736251831, + "learning_rate": 4.9682577191436854e-05, + "loss": 5.4334, + "step": 8539 + }, + { + "epoch": 0.050789799219716436, + "grad_norm": 2.3531200885772705, + "learning_rate": 4.968250298956909e-05, + "loss": 4.9819, + "step": 8540 + }, + { + "epoch": 0.05079574650299743, + "grad_norm": 1.901681661605835, + "learning_rate": 4.968242877908494e-05, + "loss": 5.1642, + "step": 8541 + }, + { + "epoch": 0.050801693786278426, + "grad_norm": 1.7250633239746094, + "learning_rate": 4.96823545599844e-05, + "loss": 5.4847, + "step": 8542 + }, + { + "epoch": 0.05080764106955943, + "grad_norm": 1.7400966882705688, + "learning_rate": 4.968228033226751e-05, + "loss": 5.5902, + "step": 8543 + }, + { + "epoch": 0.05081358835284042, + "grad_norm": 1.5469578504562378, + "learning_rate": 4.968220609593428e-05, + "loss": 5.6432, + "step": 8544 + }, + { + "epoch": 0.05081953563612142, + "grad_norm": 1.8277182579040527, + "learning_rate": 4.968213185098475e-05, + "loss": 5.3296, + "step": 8545 + }, + { + "epoch": 0.050825482919402414, + "grad_norm": 2.0535261631011963, + "learning_rate": 4.9682057597418943e-05, + "loss": 5.5278, + "step": 8546 + }, + { + "epoch": 0.050831430202683416, + "grad_norm": 1.8631746768951416, + "learning_rate": 4.9681983335236894e-05, + "loss": 5.556, + "step": 8547 + }, + { + "epoch": 0.05083737748596441, + "grad_norm": 1.6663711071014404, + "learning_rate": 4.968190906443861e-05, + "loss": 5.4321, + "step": 8548 + }, + { + "epoch": 0.050843324769245406, + "grad_norm": 1.8302260637283325, + "learning_rate": 4.968183478502413e-05, + "loss": 5.4746, + "step": 8549 + }, + { + "epoch": 0.05084927205252641, + "grad_norm": 1.9203182458877563, + "learning_rate": 4.968176049699347e-05, + "loss": 5.4334, + "step": 8550 + }, + { + "epoch": 0.0508552193358074, + "grad_norm": 2.0406670570373535, + "learning_rate": 4.9681686200346674e-05, + "loss": 5.6509, + "step": 8551 + }, + { + "epoch": 0.0508611666190884, + "grad_norm": 2.3438572883605957, + "learning_rate": 4.968161189508374e-05, + "loss": 5.8662, + "step": 8552 + }, + { + "epoch": 0.0508671139023694, + "grad_norm": 1.9612985849380493, + "learning_rate": 4.968153758120473e-05, + "loss": 5.6813, + "step": 8553 + }, + { + "epoch": 0.050873061185650395, + "grad_norm": 1.4175993204116821, + "learning_rate": 4.968146325870964e-05, + "loss": 5.4593, + "step": 8554 + }, + { + "epoch": 0.05087900846893139, + "grad_norm": 1.3445212841033936, + "learning_rate": 4.96813889275985e-05, + "loss": 5.4195, + "step": 8555 + }, + { + "epoch": 0.05088495575221239, + "grad_norm": 1.9938427209854126, + "learning_rate": 4.968131458787135e-05, + "loss": 5.8791, + "step": 8556 + }, + { + "epoch": 0.05089090303549339, + "grad_norm": 1.7449276447296143, + "learning_rate": 4.9681240239528216e-05, + "loss": 5.3574, + "step": 8557 + }, + { + "epoch": 0.05089685031877438, + "grad_norm": 2.0117087364196777, + "learning_rate": 4.96811658825691e-05, + "loss": 5.3548, + "step": 8558 + }, + { + "epoch": 0.050902797602055384, + "grad_norm": 1.97372567653656, + "learning_rate": 4.968109151699406e-05, + "loss": 5.5281, + "step": 8559 + }, + { + "epoch": 0.05090874488533638, + "grad_norm": 1.8815237283706665, + "learning_rate": 4.9681017142803095e-05, + "loss": 5.4849, + "step": 8560 + }, + { + "epoch": 0.050914692168617375, + "grad_norm": 1.627252221107483, + "learning_rate": 4.968094275999624e-05, + "loss": 5.2125, + "step": 8561 + }, + { + "epoch": 0.05092063945189837, + "grad_norm": 1.4768601655960083, + "learning_rate": 4.968086836857353e-05, + "loss": 5.0817, + "step": 8562 + }, + { + "epoch": 0.05092658673517937, + "grad_norm": 2.0249485969543457, + "learning_rate": 4.968079396853498e-05, + "loss": 5.4025, + "step": 8563 + }, + { + "epoch": 0.05093253401846037, + "grad_norm": 2.0904550552368164, + "learning_rate": 4.968071955988062e-05, + "loss": 5.4404, + "step": 8564 + }, + { + "epoch": 0.05093848130174136, + "grad_norm": 1.935063123703003, + "learning_rate": 4.9680645142610475e-05, + "loss": 5.4961, + "step": 8565 + }, + { + "epoch": 0.050944428585022364, + "grad_norm": 1.9836292266845703, + "learning_rate": 4.968057071672457e-05, + "loss": 5.2469, + "step": 8566 + }, + { + "epoch": 0.05095037586830336, + "grad_norm": 1.8337205648422241, + "learning_rate": 4.9680496282222944e-05, + "loss": 5.4432, + "step": 8567 + }, + { + "epoch": 0.050956323151584354, + "grad_norm": 1.9169154167175293, + "learning_rate": 4.9680421839105604e-05, + "loss": 5.2606, + "step": 8568 + }, + { + "epoch": 0.050962270434865356, + "grad_norm": 1.5869332551956177, + "learning_rate": 4.968034738737258e-05, + "loss": 5.006, + "step": 8569 + }, + { + "epoch": 0.05096821771814635, + "grad_norm": 1.5824979543685913, + "learning_rate": 4.968027292702391e-05, + "loss": 5.2078, + "step": 8570 + }, + { + "epoch": 0.050974165001427346, + "grad_norm": 1.7121458053588867, + "learning_rate": 4.96801984580596e-05, + "loss": 5.3913, + "step": 8571 + }, + { + "epoch": 0.05098011228470835, + "grad_norm": 1.7111082077026367, + "learning_rate": 4.96801239804797e-05, + "loss": 5.3957, + "step": 8572 + }, + { + "epoch": 0.05098605956798934, + "grad_norm": 1.834083080291748, + "learning_rate": 4.968004949428421e-05, + "loss": 5.501, + "step": 8573 + }, + { + "epoch": 0.05099200685127034, + "grad_norm": 1.773421287536621, + "learning_rate": 4.967997499947318e-05, + "loss": 5.429, + "step": 8574 + }, + { + "epoch": 0.05099795413455134, + "grad_norm": 1.7471132278442383, + "learning_rate": 4.967990049604663e-05, + "loss": 5.4853, + "step": 8575 + }, + { + "epoch": 0.051003901417832335, + "grad_norm": 1.7264289855957031, + "learning_rate": 4.967982598400457e-05, + "loss": 5.4415, + "step": 8576 + }, + { + "epoch": 0.05100984870111333, + "grad_norm": 1.750982403755188, + "learning_rate": 4.9679751463347044e-05, + "loss": 5.1731, + "step": 8577 + }, + { + "epoch": 0.051015795984394326, + "grad_norm": 1.6106518507003784, + "learning_rate": 4.967967693407407e-05, + "loss": 5.2692, + "step": 8578 + }, + { + "epoch": 0.05102174326767533, + "grad_norm": 1.8728212118148804, + "learning_rate": 4.967960239618568e-05, + "loss": 5.2416, + "step": 8579 + }, + { + "epoch": 0.05102769055095632, + "grad_norm": 1.6410562992095947, + "learning_rate": 4.967952784968189e-05, + "loss": 5.1824, + "step": 8580 + }, + { + "epoch": 0.05103363783423732, + "grad_norm": 1.7119427919387817, + "learning_rate": 4.967945329456274e-05, + "loss": 5.2316, + "step": 8581 + }, + { + "epoch": 0.05103958511751832, + "grad_norm": 1.667602300643921, + "learning_rate": 4.967937873082824e-05, + "loss": 4.9599, + "step": 8582 + }, + { + "epoch": 0.051045532400799315, + "grad_norm": 1.9595974683761597, + "learning_rate": 4.967930415847842e-05, + "loss": 4.9613, + "step": 8583 + }, + { + "epoch": 0.05105147968408031, + "grad_norm": 1.70210862159729, + "learning_rate": 4.967922957751332e-05, + "loss": 5.3587, + "step": 8584 + }, + { + "epoch": 0.05105742696736131, + "grad_norm": 2.101145029067993, + "learning_rate": 4.967915498793295e-05, + "loss": 5.2782, + "step": 8585 + }, + { + "epoch": 0.05106337425064231, + "grad_norm": 1.8836926221847534, + "learning_rate": 4.9679080389737344e-05, + "loss": 5.3128, + "step": 8586 + }, + { + "epoch": 0.0510693215339233, + "grad_norm": 1.7542184591293335, + "learning_rate": 4.967900578292652e-05, + "loss": 5.2236, + "step": 8587 + }, + { + "epoch": 0.051075268817204304, + "grad_norm": 1.8415964841842651, + "learning_rate": 4.967893116750052e-05, + "loss": 5.1267, + "step": 8588 + }, + { + "epoch": 0.0510812161004853, + "grad_norm": 1.7702316045761108, + "learning_rate": 4.967885654345936e-05, + "loss": 5.6495, + "step": 8589 + }, + { + "epoch": 0.051087163383766294, + "grad_norm": 1.7790406942367554, + "learning_rate": 4.967878191080306e-05, + "loss": 5.2561, + "step": 8590 + }, + { + "epoch": 0.05109311066704729, + "grad_norm": 1.7282217741012573, + "learning_rate": 4.967870726953165e-05, + "loss": 5.2589, + "step": 8591 + }, + { + "epoch": 0.05109905795032829, + "grad_norm": 1.6590560674667358, + "learning_rate": 4.967863261964517e-05, + "loss": 5.1952, + "step": 8592 + }, + { + "epoch": 0.05110500523360929, + "grad_norm": 1.5948386192321777, + "learning_rate": 4.9678557961143625e-05, + "loss": 5.297, + "step": 8593 + }, + { + "epoch": 0.05111095251689028, + "grad_norm": 1.8219022750854492, + "learning_rate": 4.9678483294027046e-05, + "loss": 5.3391, + "step": 8594 + }, + { + "epoch": 0.051116899800171284, + "grad_norm": 1.547616720199585, + "learning_rate": 4.967840861829547e-05, + "loss": 5.4224, + "step": 8595 + }, + { + "epoch": 0.05112284708345228, + "grad_norm": 1.7924590110778809, + "learning_rate": 4.9678333933948914e-05, + "loss": 5.2371, + "step": 8596 + }, + { + "epoch": 0.051128794366733274, + "grad_norm": 1.7630747556686401, + "learning_rate": 4.9678259240987416e-05, + "loss": 5.4849, + "step": 8597 + }, + { + "epoch": 0.051134741650014276, + "grad_norm": 1.7853891849517822, + "learning_rate": 4.967818453941098e-05, + "loss": 5.1753, + "step": 8598 + }, + { + "epoch": 0.05114068893329527, + "grad_norm": 1.6572301387786865, + "learning_rate": 4.9678109829219654e-05, + "loss": 5.3747, + "step": 8599 + }, + { + "epoch": 0.051146636216576266, + "grad_norm": 1.6574329137802124, + "learning_rate": 4.9678035110413445e-05, + "loss": 5.417, + "step": 8600 + }, + { + "epoch": 0.05115258349985727, + "grad_norm": 1.7093894481658936, + "learning_rate": 4.9677960382992396e-05, + "loss": 5.4605, + "step": 8601 + }, + { + "epoch": 0.05115853078313826, + "grad_norm": 1.6304559707641602, + "learning_rate": 4.967788564695652e-05, + "loss": 5.6186, + "step": 8602 + }, + { + "epoch": 0.05116447806641926, + "grad_norm": 1.6134929656982422, + "learning_rate": 4.967781090230586e-05, + "loss": 5.5084, + "step": 8603 + }, + { + "epoch": 0.05117042534970026, + "grad_norm": 1.7007251977920532, + "learning_rate": 4.9677736149040426e-05, + "loss": 5.2542, + "step": 8604 + }, + { + "epoch": 0.051176372632981255, + "grad_norm": 1.6648818254470825, + "learning_rate": 4.967766138716025e-05, + "loss": 5.4136, + "step": 8605 + }, + { + "epoch": 0.05118231991626225, + "grad_norm": 1.5595816373825073, + "learning_rate": 4.967758661666535e-05, + "loss": 5.181, + "step": 8606 + }, + { + "epoch": 0.051188267199543246, + "grad_norm": 1.7358763217926025, + "learning_rate": 4.967751183755577e-05, + "loss": 5.3509, + "step": 8607 + }, + { + "epoch": 0.05119421448282425, + "grad_norm": 1.6836191415786743, + "learning_rate": 4.967743704983152e-05, + "loss": 5.4656, + "step": 8608 + }, + { + "epoch": 0.05120016176610524, + "grad_norm": 1.4641087055206299, + "learning_rate": 4.967736225349263e-05, + "loss": 5.5304, + "step": 8609 + }, + { + "epoch": 0.05120610904938624, + "grad_norm": 1.6273541450500488, + "learning_rate": 4.967728744853913e-05, + "loss": 5.4029, + "step": 8610 + }, + { + "epoch": 0.05121205633266724, + "grad_norm": 1.6471314430236816, + "learning_rate": 4.967721263497105e-05, + "loss": 5.4333, + "step": 8611 + }, + { + "epoch": 0.051218003615948235, + "grad_norm": 1.798155665397644, + "learning_rate": 4.96771378127884e-05, + "loss": 5.5214, + "step": 8612 + }, + { + "epoch": 0.05122395089922923, + "grad_norm": 1.8606700897216797, + "learning_rate": 4.967706298199122e-05, + "loss": 4.8808, + "step": 8613 + }, + { + "epoch": 0.05122989818251023, + "grad_norm": 1.7144849300384521, + "learning_rate": 4.967698814257953e-05, + "loss": 4.9451, + "step": 8614 + }, + { + "epoch": 0.05123584546579123, + "grad_norm": 1.7411640882492065, + "learning_rate": 4.9676913294553364e-05, + "loss": 4.9771, + "step": 8615 + }, + { + "epoch": 0.05124179274907222, + "grad_norm": 1.7012072801589966, + "learning_rate": 4.9676838437912736e-05, + "loss": 4.9028, + "step": 8616 + }, + { + "epoch": 0.051247740032353224, + "grad_norm": 1.8154243230819702, + "learning_rate": 4.967676357265768e-05, + "loss": 5.4115, + "step": 8617 + }, + { + "epoch": 0.05125368731563422, + "grad_norm": 2.7746822834014893, + "learning_rate": 4.967668869878823e-05, + "loss": 5.5487, + "step": 8618 + }, + { + "epoch": 0.051259634598915214, + "grad_norm": 1.8362152576446533, + "learning_rate": 4.9676613816304395e-05, + "loss": 5.486, + "step": 8619 + }, + { + "epoch": 0.05126558188219621, + "grad_norm": 1.975853681564331, + "learning_rate": 4.967653892520621e-05, + "loss": 5.4348, + "step": 8620 + }, + { + "epoch": 0.05127152916547721, + "grad_norm": 1.8126581907272339, + "learning_rate": 4.96764640254937e-05, + "loss": 5.4558, + "step": 8621 + }, + { + "epoch": 0.05127747644875821, + "grad_norm": 1.6068531274795532, + "learning_rate": 4.967638911716689e-05, + "loss": 5.4672, + "step": 8622 + }, + { + "epoch": 0.0512834237320392, + "grad_norm": 1.6384878158569336, + "learning_rate": 4.9676314200225804e-05, + "loss": 5.1591, + "step": 8623 + }, + { + "epoch": 0.051289371015320204, + "grad_norm": 2.0413742065429688, + "learning_rate": 4.9676239274670474e-05, + "loss": 4.8992, + "step": 8624 + }, + { + "epoch": 0.0512953182986012, + "grad_norm": 1.7591389417648315, + "learning_rate": 4.967616434050093e-05, + "loss": 5.3629, + "step": 8625 + }, + { + "epoch": 0.051301265581882194, + "grad_norm": 1.9222301244735718, + "learning_rate": 4.967608939771719e-05, + "loss": 5.5082, + "step": 8626 + }, + { + "epoch": 0.051307212865163196, + "grad_norm": 1.8040579557418823, + "learning_rate": 4.967601444631928e-05, + "loss": 5.4019, + "step": 8627 + }, + { + "epoch": 0.05131316014844419, + "grad_norm": 2.0685603618621826, + "learning_rate": 4.967593948630723e-05, + "loss": 5.1959, + "step": 8628 + }, + { + "epoch": 0.051319107431725186, + "grad_norm": 1.446341872215271, + "learning_rate": 4.967586451768106e-05, + "loss": 5.4233, + "step": 8629 + }, + { + "epoch": 0.05132505471500619, + "grad_norm": 1.4487289190292358, + "learning_rate": 4.9675789540440806e-05, + "loss": 5.4065, + "step": 8630 + }, + { + "epoch": 0.05133100199828718, + "grad_norm": 2.367469310760498, + "learning_rate": 4.967571455458648e-05, + "loss": 5.3512, + "step": 8631 + }, + { + "epoch": 0.05133694928156818, + "grad_norm": 2.7115249633789062, + "learning_rate": 4.967563956011812e-05, + "loss": 5.4494, + "step": 8632 + }, + { + "epoch": 0.05134289656484918, + "grad_norm": 2.6692097187042236, + "learning_rate": 4.967556455703576e-05, + "loss": 5.2747, + "step": 8633 + }, + { + "epoch": 0.051348843848130175, + "grad_norm": 2.516005754470825, + "learning_rate": 4.967548954533941e-05, + "loss": 5.2305, + "step": 8634 + }, + { + "epoch": 0.05135479113141117, + "grad_norm": 1.6234782934188843, + "learning_rate": 4.96754145250291e-05, + "loss": 5.5192, + "step": 8635 + }, + { + "epoch": 0.051360738414692166, + "grad_norm": 1.9273806810379028, + "learning_rate": 4.9675339496104855e-05, + "loss": 5.4479, + "step": 8636 + }, + { + "epoch": 0.05136668569797317, + "grad_norm": 2.510847568511963, + "learning_rate": 4.967526445856671e-05, + "loss": 4.9858, + "step": 8637 + }, + { + "epoch": 0.05137263298125416, + "grad_norm": 2.3722991943359375, + "learning_rate": 4.967518941241468e-05, + "loss": 5.2287, + "step": 8638 + }, + { + "epoch": 0.05137858026453516, + "grad_norm": 2.286569118499756, + "learning_rate": 4.96751143576488e-05, + "loss": 5.2643, + "step": 8639 + }, + { + "epoch": 0.05138452754781616, + "grad_norm": 2.493534803390503, + "learning_rate": 4.9675039294269086e-05, + "loss": 5.1207, + "step": 8640 + }, + { + "epoch": 0.051390474831097155, + "grad_norm": 2.622694969177246, + "learning_rate": 4.967496422227558e-05, + "loss": 4.9735, + "step": 8641 + }, + { + "epoch": 0.05139642211437815, + "grad_norm": 1.7518365383148193, + "learning_rate": 4.967488914166829e-05, + "loss": 5.8818, + "step": 8642 + }, + { + "epoch": 0.05140236939765915, + "grad_norm": 2.0281870365142822, + "learning_rate": 4.9674814052447256e-05, + "loss": 6.3773, + "step": 8643 + }, + { + "epoch": 0.05140831668094015, + "grad_norm": 1.880083441734314, + "learning_rate": 4.96747389546125e-05, + "loss": 5.831, + "step": 8644 + }, + { + "epoch": 0.05141426396422114, + "grad_norm": 2.0792593955993652, + "learning_rate": 4.967466384816404e-05, + "loss": 5.8799, + "step": 8645 + }, + { + "epoch": 0.051420211247502144, + "grad_norm": 2.4550280570983887, + "learning_rate": 4.967458873310192e-05, + "loss": 5.2983, + "step": 8646 + }, + { + "epoch": 0.05142615853078314, + "grad_norm": 2.5590765476226807, + "learning_rate": 4.967451360942615e-05, + "loss": 5.1157, + "step": 8647 + }, + { + "epoch": 0.051432105814064134, + "grad_norm": 2.2328450679779053, + "learning_rate": 4.967443847713677e-05, + "loss": 5.047, + "step": 8648 + }, + { + "epoch": 0.05143805309734513, + "grad_norm": 2.0624022483825684, + "learning_rate": 4.9674363336233786e-05, + "loss": 5.6819, + "step": 8649 + }, + { + "epoch": 0.05144400038062613, + "grad_norm": 2.075239658355713, + "learning_rate": 4.9674288186717246e-05, + "loss": 5.895, + "step": 8650 + }, + { + "epoch": 0.05144994766390713, + "grad_norm": 1.7228562831878662, + "learning_rate": 4.967421302858716e-05, + "loss": 5.9199, + "step": 8651 + }, + { + "epoch": 0.05145589494718812, + "grad_norm": 2.235020637512207, + "learning_rate": 4.967413786184356e-05, + "loss": 5.0644, + "step": 8652 + }, + { + "epoch": 0.051461842230469124, + "grad_norm": 1.8620972633361816, + "learning_rate": 4.967406268648648e-05, + "loss": 5.7956, + "step": 8653 + }, + { + "epoch": 0.05146778951375012, + "grad_norm": 1.7914378643035889, + "learning_rate": 4.967398750251594e-05, + "loss": 5.742, + "step": 8654 + }, + { + "epoch": 0.051473736797031114, + "grad_norm": 2.0010504722595215, + "learning_rate": 4.967391230993196e-05, + "loss": 5.7808, + "step": 8655 + }, + { + "epoch": 0.051479684080312116, + "grad_norm": 2.1851212978363037, + "learning_rate": 4.9673837108734575e-05, + "loss": 5.4217, + "step": 8656 + }, + { + "epoch": 0.05148563136359311, + "grad_norm": 1.6896641254425049, + "learning_rate": 4.967376189892382e-05, + "loss": 6.321, + "step": 8657 + }, + { + "epoch": 0.051491578646874106, + "grad_norm": 1.7083675861358643, + "learning_rate": 4.967368668049969e-05, + "loss": 5.495, + "step": 8658 + }, + { + "epoch": 0.05149752593015511, + "grad_norm": 2.537256956100464, + "learning_rate": 4.967361145346224e-05, + "loss": 5.4096, + "step": 8659 + }, + { + "epoch": 0.0515034732134361, + "grad_norm": 2.3463892936706543, + "learning_rate": 4.967353621781149e-05, + "loss": 6.2461, + "step": 8660 + }, + { + "epoch": 0.0515094204967171, + "grad_norm": 1.6834701299667358, + "learning_rate": 4.967346097354746e-05, + "loss": 6.1007, + "step": 8661 + }, + { + "epoch": 0.0515153677799981, + "grad_norm": 2.140557289123535, + "learning_rate": 4.9673385720670184e-05, + "loss": 5.9908, + "step": 8662 + }, + { + "epoch": 0.051521315063279095, + "grad_norm": 2.211639165878296, + "learning_rate": 4.9673310459179676e-05, + "loss": 6.4192, + "step": 8663 + }, + { + "epoch": 0.05152726234656009, + "grad_norm": 1.8421399593353271, + "learning_rate": 4.9673235189075975e-05, + "loss": 6.099, + "step": 8664 + }, + { + "epoch": 0.051533209629841085, + "grad_norm": 1.7775965929031372, + "learning_rate": 4.96731599103591e-05, + "loss": 5.9572, + "step": 8665 + }, + { + "epoch": 0.05153915691312209, + "grad_norm": 1.7500132322311401, + "learning_rate": 4.967308462302909e-05, + "loss": 6.0987, + "step": 8666 + }, + { + "epoch": 0.05154510419640308, + "grad_norm": 1.7952892780303955, + "learning_rate": 4.967300932708595e-05, + "loss": 6.0235, + "step": 8667 + }, + { + "epoch": 0.05155105147968408, + "grad_norm": 1.7696008682250977, + "learning_rate": 4.967293402252972e-05, + "loss": 5.8253, + "step": 8668 + }, + { + "epoch": 0.05155699876296508, + "grad_norm": 1.848975419998169, + "learning_rate": 4.967285870936042e-05, + "loss": 6.0942, + "step": 8669 + }, + { + "epoch": 0.051562946046246075, + "grad_norm": 2.412909507751465, + "learning_rate": 4.967278338757808e-05, + "loss": 5.5752, + "step": 8670 + }, + { + "epoch": 0.05156889332952707, + "grad_norm": 2.0214738845825195, + "learning_rate": 4.967270805718273e-05, + "loss": 5.5721, + "step": 8671 + }, + { + "epoch": 0.05157484061280807, + "grad_norm": 2.3830201625823975, + "learning_rate": 4.967263271817439e-05, + "loss": 6.034, + "step": 8672 + }, + { + "epoch": 0.05158078789608907, + "grad_norm": 2.213979959487915, + "learning_rate": 4.9672557370553094e-05, + "loss": 6.0169, + "step": 8673 + }, + { + "epoch": 0.05158673517937006, + "grad_norm": 1.9657354354858398, + "learning_rate": 4.967248201431887e-05, + "loss": 6.0159, + "step": 8674 + }, + { + "epoch": 0.051592682462651064, + "grad_norm": 2.0882673263549805, + "learning_rate": 4.967240664947172e-05, + "loss": 6.1088, + "step": 8675 + }, + { + "epoch": 0.05159862974593206, + "grad_norm": 2.291152000427246, + "learning_rate": 4.96723312760117e-05, + "loss": 5.4534, + "step": 8676 + }, + { + "epoch": 0.051604577029213054, + "grad_norm": 2.3495421409606934, + "learning_rate": 4.967225589393881e-05, + "loss": 5.5524, + "step": 8677 + }, + { + "epoch": 0.05161052431249405, + "grad_norm": 2.2665255069732666, + "learning_rate": 4.9672180503253106e-05, + "loss": 5.5208, + "step": 8678 + }, + { + "epoch": 0.05161647159577505, + "grad_norm": 2.1587207317352295, + "learning_rate": 4.9672105103954594e-05, + "loss": 5.7016, + "step": 8679 + }, + { + "epoch": 0.051622418879056046, + "grad_norm": 2.2260420322418213, + "learning_rate": 4.96720296960433e-05, + "loss": 5.6179, + "step": 8680 + }, + { + "epoch": 0.05162836616233704, + "grad_norm": 3.1678147315979004, + "learning_rate": 4.967195427951926e-05, + "loss": 5.4655, + "step": 8681 + }, + { + "epoch": 0.051634313445618044, + "grad_norm": 3.0126166343688965, + "learning_rate": 4.967187885438249e-05, + "loss": 5.5663, + "step": 8682 + }, + { + "epoch": 0.05164026072889904, + "grad_norm": 2.290069341659546, + "learning_rate": 4.9671803420633034e-05, + "loss": 5.7462, + "step": 8683 + }, + { + "epoch": 0.051646208012180034, + "grad_norm": 2.1958532333374023, + "learning_rate": 4.96717279782709e-05, + "loss": 5.8359, + "step": 8684 + }, + { + "epoch": 0.051652155295461036, + "grad_norm": 2.063312530517578, + "learning_rate": 4.967165252729611e-05, + "loss": 5.847, + "step": 8685 + }, + { + "epoch": 0.05165810257874203, + "grad_norm": 1.8041539192199707, + "learning_rate": 4.967157706770872e-05, + "loss": 5.9408, + "step": 8686 + }, + { + "epoch": 0.051664049862023026, + "grad_norm": 1.684831976890564, + "learning_rate": 4.967150159950873e-05, + "loss": 6.019, + "step": 8687 + }, + { + "epoch": 0.05166999714530403, + "grad_norm": 2.4915740489959717, + "learning_rate": 4.967142612269616e-05, + "loss": 5.357, + "step": 8688 + }, + { + "epoch": 0.05167594442858502, + "grad_norm": 2.2621138095855713, + "learning_rate": 4.967135063727106e-05, + "loss": 5.7726, + "step": 8689 + }, + { + "epoch": 0.05168189171186602, + "grad_norm": 1.9304747581481934, + "learning_rate": 4.967127514323345e-05, + "loss": 6.0958, + "step": 8690 + }, + { + "epoch": 0.05168783899514702, + "grad_norm": 1.7657890319824219, + "learning_rate": 4.9671199640583354e-05, + "loss": 6.1036, + "step": 8691 + }, + { + "epoch": 0.051693786278428015, + "grad_norm": 1.7449486255645752, + "learning_rate": 4.9671124129320794e-05, + "loss": 6.0843, + "step": 8692 + }, + { + "epoch": 0.05169973356170901, + "grad_norm": 2.0155117511749268, + "learning_rate": 4.96710486094458e-05, + "loss": 5.9626, + "step": 8693 + }, + { + "epoch": 0.051705680844990005, + "grad_norm": 2.1015188694000244, + "learning_rate": 4.967097308095839e-05, + "loss": 5.6053, + "step": 8694 + }, + { + "epoch": 0.05171162812827101, + "grad_norm": 1.9602909088134766, + "learning_rate": 4.967089754385861e-05, + "loss": 5.1988, + "step": 8695 + }, + { + "epoch": 0.051717575411552, + "grad_norm": 2.141657590866089, + "learning_rate": 4.9670821998146474e-05, + "loss": 5.2994, + "step": 8696 + }, + { + "epoch": 0.051723522694833, + "grad_norm": 2.1301774978637695, + "learning_rate": 4.9670746443822006e-05, + "loss": 5.7935, + "step": 8697 + }, + { + "epoch": 0.051729469978114, + "grad_norm": 1.9465678930282593, + "learning_rate": 4.9670670880885225e-05, + "loss": 5.1861, + "step": 8698 + }, + { + "epoch": 0.051735417261394995, + "grad_norm": 2.177234411239624, + "learning_rate": 4.967059530933618e-05, + "loss": 5.1114, + "step": 8699 + }, + { + "epoch": 0.05174136454467599, + "grad_norm": 2.0886077880859375, + "learning_rate": 4.967051972917488e-05, + "loss": 5.2905, + "step": 8700 + }, + { + "epoch": 0.05174731182795699, + "grad_norm": 1.8517125844955444, + "learning_rate": 4.967044414040136e-05, + "loss": 5.1672, + "step": 8701 + }, + { + "epoch": 0.05175325911123799, + "grad_norm": 1.7342808246612549, + "learning_rate": 4.967036854301564e-05, + "loss": 5.2767, + "step": 8702 + }, + { + "epoch": 0.05175920639451898, + "grad_norm": 1.7315362691879272, + "learning_rate": 4.9670292937017746e-05, + "loss": 5.2897, + "step": 8703 + }, + { + "epoch": 0.051765153677799984, + "grad_norm": 1.8794540166854858, + "learning_rate": 4.967021732240772e-05, + "loss": 5.3808, + "step": 8704 + }, + { + "epoch": 0.05177110096108098, + "grad_norm": 1.8047478199005127, + "learning_rate": 4.9670141699185565e-05, + "loss": 5.1074, + "step": 8705 + }, + { + "epoch": 0.051777048244361974, + "grad_norm": 1.699475884437561, + "learning_rate": 4.967006606735132e-05, + "loss": 5.8162, + "step": 8706 + }, + { + "epoch": 0.05178299552764297, + "grad_norm": 2.008352518081665, + "learning_rate": 4.966999042690501e-05, + "loss": 6.3593, + "step": 8707 + }, + { + "epoch": 0.05178894281092397, + "grad_norm": 1.8776370286941528, + "learning_rate": 4.966991477784667e-05, + "loss": 6.3419, + "step": 8708 + }, + { + "epoch": 0.051794890094204966, + "grad_norm": 2.018157720565796, + "learning_rate": 4.9669839120176306e-05, + "loss": 6.1927, + "step": 8709 + }, + { + "epoch": 0.05180083737748596, + "grad_norm": 1.833764910697937, + "learning_rate": 4.966976345389396e-05, + "loss": 5.0803, + "step": 8710 + }, + { + "epoch": 0.051806784660766964, + "grad_norm": 1.7809339761734009, + "learning_rate": 4.9669687778999655e-05, + "loss": 5.3891, + "step": 8711 + }, + { + "epoch": 0.05181273194404796, + "grad_norm": 1.9905017614364624, + "learning_rate": 4.966961209549341e-05, + "loss": 6.247, + "step": 8712 + }, + { + "epoch": 0.051818679227328954, + "grad_norm": 2.1396658420562744, + "learning_rate": 4.966953640337527e-05, + "loss": 6.2506, + "step": 8713 + }, + { + "epoch": 0.051824626510609956, + "grad_norm": 1.778996467590332, + "learning_rate": 4.9669460702645244e-05, + "loss": 6.1333, + "step": 8714 + }, + { + "epoch": 0.05183057379389095, + "grad_norm": 1.9936842918395996, + "learning_rate": 4.9669384993303366e-05, + "loss": 5.6486, + "step": 8715 + }, + { + "epoch": 0.051836521077171946, + "grad_norm": 1.8064475059509277, + "learning_rate": 4.9669309275349656e-05, + "loss": 6.1217, + "step": 8716 + }, + { + "epoch": 0.05184246836045295, + "grad_norm": 1.9532819986343384, + "learning_rate": 4.966923354878414e-05, + "loss": 5.5402, + "step": 8717 + }, + { + "epoch": 0.05184841564373394, + "grad_norm": 2.4843015670776367, + "learning_rate": 4.966915781360686e-05, + "loss": 4.7674, + "step": 8718 + }, + { + "epoch": 0.05185436292701494, + "grad_norm": 2.7453129291534424, + "learning_rate": 4.9669082069817835e-05, + "loss": 4.4489, + "step": 8719 + }, + { + "epoch": 0.05186031021029594, + "grad_norm": 3.0180628299713135, + "learning_rate": 4.9669006317417084e-05, + "loss": 4.1401, + "step": 8720 + }, + { + "epoch": 0.051866257493576935, + "grad_norm": 2.44638991355896, + "learning_rate": 4.966893055640464e-05, + "loss": 4.7241, + "step": 8721 + }, + { + "epoch": 0.05187220477685793, + "grad_norm": 2.0131804943084717, + "learning_rate": 4.9668854786780514e-05, + "loss": 5.6495, + "step": 8722 + }, + { + "epoch": 0.051878152060138925, + "grad_norm": 2.0331337451934814, + "learning_rate": 4.966877900854476e-05, + "loss": 5.6812, + "step": 8723 + }, + { + "epoch": 0.05188409934341993, + "grad_norm": 2.5784926414489746, + "learning_rate": 4.9668703221697385e-05, + "loss": 5.3617, + "step": 8724 + }, + { + "epoch": 0.05189004662670092, + "grad_norm": 2.599321126937866, + "learning_rate": 4.9668627426238425e-05, + "loss": 5.6273, + "step": 8725 + }, + { + "epoch": 0.05189599390998192, + "grad_norm": 2.53541898727417, + "learning_rate": 4.966855162216789e-05, + "loss": 5.2916, + "step": 8726 + }, + { + "epoch": 0.05190194119326292, + "grad_norm": 2.165160655975342, + "learning_rate": 4.9668475809485825e-05, + "loss": 5.6152, + "step": 8727 + }, + { + "epoch": 0.051907888476543915, + "grad_norm": 2.4488654136657715, + "learning_rate": 4.966839998819225e-05, + "loss": 5.4163, + "step": 8728 + }, + { + "epoch": 0.05191383575982491, + "grad_norm": 2.2756056785583496, + "learning_rate": 4.96683241582872e-05, + "loss": 5.9449, + "step": 8729 + }, + { + "epoch": 0.05191978304310591, + "grad_norm": 2.7889063358306885, + "learning_rate": 4.9668248319770683e-05, + "loss": 5.9502, + "step": 8730 + }, + { + "epoch": 0.05192573032638691, + "grad_norm": 2.620378255844116, + "learning_rate": 4.9668172472642735e-05, + "loss": 4.8344, + "step": 8731 + }, + { + "epoch": 0.0519316776096679, + "grad_norm": 2.2405688762664795, + "learning_rate": 4.9668096616903395e-05, + "loss": 5.598, + "step": 8732 + }, + { + "epoch": 0.051937624892948904, + "grad_norm": 2.3559701442718506, + "learning_rate": 4.9668020752552664e-05, + "loss": 5.7951, + "step": 8733 + }, + { + "epoch": 0.0519435721762299, + "grad_norm": 1.9856364727020264, + "learning_rate": 4.966794487959058e-05, + "loss": 5.3907, + "step": 8734 + }, + { + "epoch": 0.051949519459510894, + "grad_norm": 2.345541000366211, + "learning_rate": 4.966786899801718e-05, + "loss": 5.9875, + "step": 8735 + }, + { + "epoch": 0.05195546674279189, + "grad_norm": 2.4069056510925293, + "learning_rate": 4.9667793107832485e-05, + "loss": 6.0062, + "step": 8736 + }, + { + "epoch": 0.05196141402607289, + "grad_norm": 1.9191378355026245, + "learning_rate": 4.966771720903651e-05, + "loss": 6.1341, + "step": 8737 + }, + { + "epoch": 0.051967361309353886, + "grad_norm": 2.135986089706421, + "learning_rate": 4.9667641301629284e-05, + "loss": 5.6993, + "step": 8738 + }, + { + "epoch": 0.05197330859263488, + "grad_norm": 2.0774824619293213, + "learning_rate": 4.966756538561085e-05, + "loss": 5.9791, + "step": 8739 + }, + { + "epoch": 0.051979255875915883, + "grad_norm": 2.1451659202575684, + "learning_rate": 4.9667489460981224e-05, + "loss": 5.8181, + "step": 8740 + }, + { + "epoch": 0.05198520315919688, + "grad_norm": 2.2769901752471924, + "learning_rate": 4.966741352774043e-05, + "loss": 5.6799, + "step": 8741 + }, + { + "epoch": 0.051991150442477874, + "grad_norm": 2.22038197517395, + "learning_rate": 4.9667337585888494e-05, + "loss": 5.8781, + "step": 8742 + }, + { + "epoch": 0.051997097725758876, + "grad_norm": 2.417508125305176, + "learning_rate": 4.9667261635425446e-05, + "loss": 5.3458, + "step": 8743 + }, + { + "epoch": 0.05200304500903987, + "grad_norm": 2.0334360599517822, + "learning_rate": 4.966718567635131e-05, + "loss": 5.5241, + "step": 8744 + }, + { + "epoch": 0.052008992292320866, + "grad_norm": 2.3476316928863525, + "learning_rate": 4.9667109708666126e-05, + "loss": 5.8786, + "step": 8745 + }, + { + "epoch": 0.05201493957560187, + "grad_norm": 2.160106897354126, + "learning_rate": 4.96670337323699e-05, + "loss": 5.616, + "step": 8746 + }, + { + "epoch": 0.05202088685888286, + "grad_norm": 2.0048086643218994, + "learning_rate": 4.9666957747462665e-05, + "loss": 5.5787, + "step": 8747 + }, + { + "epoch": 0.05202683414216386, + "grad_norm": 2.9226925373077393, + "learning_rate": 4.966688175394446e-05, + "loss": 5.3708, + "step": 8748 + }, + { + "epoch": 0.05203278142544486, + "grad_norm": 1.9020568132400513, + "learning_rate": 4.9666805751815294e-05, + "loss": 5.6037, + "step": 8749 + }, + { + "epoch": 0.052038728708725855, + "grad_norm": 2.218637466430664, + "learning_rate": 4.966672974107519e-05, + "loss": 5.2983, + "step": 8750 + }, + { + "epoch": 0.05204467599200685, + "grad_norm": 2.906625270843506, + "learning_rate": 4.96666537217242e-05, + "loss": 5.1234, + "step": 8751 + }, + { + "epoch": 0.052050623275287845, + "grad_norm": 2.0095551013946533, + "learning_rate": 4.966657769376234e-05, + "loss": 5.2695, + "step": 8752 + }, + { + "epoch": 0.05205657055856885, + "grad_norm": 2.1369643211364746, + "learning_rate": 4.966650165718963e-05, + "loss": 5.5426, + "step": 8753 + }, + { + "epoch": 0.05206251784184984, + "grad_norm": 2.4762122631073, + "learning_rate": 4.966642561200608e-05, + "loss": 5.5595, + "step": 8754 + }, + { + "epoch": 0.05206846512513084, + "grad_norm": 2.199430227279663, + "learning_rate": 4.966634955821176e-05, + "loss": 5.5155, + "step": 8755 + }, + { + "epoch": 0.05207441240841184, + "grad_norm": 2.132460355758667, + "learning_rate": 4.966627349580666e-05, + "loss": 5.5344, + "step": 8756 + }, + { + "epoch": 0.052080359691692835, + "grad_norm": 2.4437100887298584, + "learning_rate": 4.966619742479082e-05, + "loss": 5.0135, + "step": 8757 + }, + { + "epoch": 0.05208630697497383, + "grad_norm": 1.5223499536514282, + "learning_rate": 4.9666121345164265e-05, + "loss": 5.5467, + "step": 8758 + }, + { + "epoch": 0.05209225425825483, + "grad_norm": 2.101797580718994, + "learning_rate": 4.966604525692702e-05, + "loss": 5.9493, + "step": 8759 + }, + { + "epoch": 0.05209820154153583, + "grad_norm": 1.9338927268981934, + "learning_rate": 4.966596916007912e-05, + "loss": 5.6625, + "step": 8760 + }, + { + "epoch": 0.05210414882481682, + "grad_norm": 2.1328654289245605, + "learning_rate": 4.966589305462058e-05, + "loss": 6.3202, + "step": 8761 + }, + { + "epoch": 0.052110096108097824, + "grad_norm": 1.963287115097046, + "learning_rate": 4.9665816940551434e-05, + "loss": 5.8885, + "step": 8762 + }, + { + "epoch": 0.05211604339137882, + "grad_norm": 2.124155282974243, + "learning_rate": 4.96657408178717e-05, + "loss": 5.6015, + "step": 8763 + }, + { + "epoch": 0.052121990674659814, + "grad_norm": 2.1011505126953125, + "learning_rate": 4.966566468658142e-05, + "loss": 5.7786, + "step": 8764 + }, + { + "epoch": 0.05212793795794081, + "grad_norm": 1.769573450088501, + "learning_rate": 4.966558854668061e-05, + "loss": 5.8229, + "step": 8765 + }, + { + "epoch": 0.05213388524122181, + "grad_norm": 1.7712751626968384, + "learning_rate": 4.966551239816929e-05, + "loss": 5.733, + "step": 8766 + }, + { + "epoch": 0.052139832524502806, + "grad_norm": 1.68185555934906, + "learning_rate": 4.9665436241047503e-05, + "loss": 6.015, + "step": 8767 + }, + { + "epoch": 0.0521457798077838, + "grad_norm": 1.8619519472122192, + "learning_rate": 4.966536007531526e-05, + "loss": 5.9545, + "step": 8768 + }, + { + "epoch": 0.0521517270910648, + "grad_norm": 1.6538097858428955, + "learning_rate": 4.96652839009726e-05, + "loss": 5.6138, + "step": 8769 + }, + { + "epoch": 0.0521576743743458, + "grad_norm": 1.721737027168274, + "learning_rate": 4.966520771801955e-05, + "loss": 6.0001, + "step": 8770 + }, + { + "epoch": 0.052163621657626794, + "grad_norm": 1.8449060916900635, + "learning_rate": 4.966513152645612e-05, + "loss": 5.6811, + "step": 8771 + }, + { + "epoch": 0.052169568940907796, + "grad_norm": 2.3810017108917236, + "learning_rate": 4.966505532628235e-05, + "loss": 5.4662, + "step": 8772 + }, + { + "epoch": 0.05217551622418879, + "grad_norm": 2.9262144565582275, + "learning_rate": 4.9664979117498265e-05, + "loss": 5.3555, + "step": 8773 + }, + { + "epoch": 0.052181463507469786, + "grad_norm": 2.1560001373291016, + "learning_rate": 4.966490290010389e-05, + "loss": 5.988, + "step": 8774 + }, + { + "epoch": 0.05218741079075079, + "grad_norm": 1.8220587968826294, + "learning_rate": 4.966482667409925e-05, + "loss": 5.8334, + "step": 8775 + }, + { + "epoch": 0.05219335807403178, + "grad_norm": 2.393651008605957, + "learning_rate": 4.9664750439484375e-05, + "loss": 5.5866, + "step": 8776 + }, + { + "epoch": 0.05219930535731278, + "grad_norm": 2.193864583969116, + "learning_rate": 4.966467419625929e-05, + "loss": 5.6642, + "step": 8777 + }, + { + "epoch": 0.05220525264059378, + "grad_norm": 2.24094820022583, + "learning_rate": 4.966459794442403e-05, + "loss": 5.7149, + "step": 8778 + }, + { + "epoch": 0.052211199923874775, + "grad_norm": 2.447439670562744, + "learning_rate": 4.9664521683978606e-05, + "loss": 5.4759, + "step": 8779 + }, + { + "epoch": 0.05221714720715577, + "grad_norm": 1.9538700580596924, + "learning_rate": 4.9664445414923055e-05, + "loss": 5.7, + "step": 8780 + }, + { + "epoch": 0.052223094490436765, + "grad_norm": 1.8960500955581665, + "learning_rate": 4.966436913725739e-05, + "loss": 5.7852, + "step": 8781 + }, + { + "epoch": 0.05222904177371777, + "grad_norm": 1.9234421253204346, + "learning_rate": 4.966429285098166e-05, + "loss": 5.9842, + "step": 8782 + }, + { + "epoch": 0.05223498905699876, + "grad_norm": 2.2879858016967773, + "learning_rate": 4.966421655609588e-05, + "loss": 5.6572, + "step": 8783 + }, + { + "epoch": 0.05224093634027976, + "grad_norm": 2.287932872772217, + "learning_rate": 4.966414025260008e-05, + "loss": 6.0675, + "step": 8784 + }, + { + "epoch": 0.05224688362356076, + "grad_norm": 1.6395118236541748, + "learning_rate": 4.9664063940494275e-05, + "loss": 5.6846, + "step": 8785 + }, + { + "epoch": 0.052252830906841755, + "grad_norm": 1.7121644020080566, + "learning_rate": 4.966398761977851e-05, + "loss": 5.7014, + "step": 8786 + }, + { + "epoch": 0.05225877819012275, + "grad_norm": 1.6225544214248657, + "learning_rate": 4.966391129045279e-05, + "loss": 5.6152, + "step": 8787 + }, + { + "epoch": 0.05226472547340375, + "grad_norm": 1.8484382629394531, + "learning_rate": 4.966383495251716e-05, + "loss": 5.8109, + "step": 8788 + }, + { + "epoch": 0.05227067275668475, + "grad_norm": 1.8225692510604858, + "learning_rate": 4.966375860597164e-05, + "loss": 6.0587, + "step": 8789 + }, + { + "epoch": 0.05227662003996574, + "grad_norm": 2.0333876609802246, + "learning_rate": 4.9663682250816255e-05, + "loss": 6.1406, + "step": 8790 + }, + { + "epoch": 0.052282567323246744, + "grad_norm": 2.0004124641418457, + "learning_rate": 4.9663605887051036e-05, + "loss": 5.6227, + "step": 8791 + }, + { + "epoch": 0.05228851460652774, + "grad_norm": 1.723655343055725, + "learning_rate": 4.9663529514676005e-05, + "loss": 5.5013, + "step": 8792 + }, + { + "epoch": 0.052294461889808734, + "grad_norm": 1.8351995944976807, + "learning_rate": 4.966345313369119e-05, + "loss": 5.3327, + "step": 8793 + }, + { + "epoch": 0.05230040917308973, + "grad_norm": 1.7514569759368896, + "learning_rate": 4.9663376744096615e-05, + "loss": 5.235, + "step": 8794 + }, + { + "epoch": 0.05230635645637073, + "grad_norm": 1.6678166389465332, + "learning_rate": 4.966330034589232e-05, + "loss": 5.2269, + "step": 8795 + }, + { + "epoch": 0.052312303739651726, + "grad_norm": 1.82132887840271, + "learning_rate": 4.9663223939078315e-05, + "loss": 5.0288, + "step": 8796 + }, + { + "epoch": 0.05231825102293272, + "grad_norm": 1.7815704345703125, + "learning_rate": 4.966314752365463e-05, + "loss": 5.4489, + "step": 8797 + }, + { + "epoch": 0.05232419830621372, + "grad_norm": 2.5268197059631348, + "learning_rate": 4.96630710996213e-05, + "loss": 5.0321, + "step": 8798 + }, + { + "epoch": 0.05233014558949472, + "grad_norm": 2.921208620071411, + "learning_rate": 4.9662994666978346e-05, + "loss": 5.0826, + "step": 8799 + }, + { + "epoch": 0.052336092872775714, + "grad_norm": 2.83243727684021, + "learning_rate": 4.9662918225725794e-05, + "loss": 4.9754, + "step": 8800 + }, + { + "epoch": 0.052342040156056716, + "grad_norm": 2.960346221923828, + "learning_rate": 4.966284177586368e-05, + "loss": 5.5808, + "step": 8801 + }, + { + "epoch": 0.05234798743933771, + "grad_norm": 2.479055643081665, + "learning_rate": 4.966276531739201e-05, + "loss": 5.3779, + "step": 8802 + }, + { + "epoch": 0.052353934722618706, + "grad_norm": 2.8753128051757812, + "learning_rate": 4.966268885031083e-05, + "loss": 5.4023, + "step": 8803 + }, + { + "epoch": 0.05235988200589971, + "grad_norm": 2.1152822971343994, + "learning_rate": 4.966261237462016e-05, + "loss": 6.1181, + "step": 8804 + }, + { + "epoch": 0.0523658292891807, + "grad_norm": 2.7178313732147217, + "learning_rate": 4.966253589032003e-05, + "loss": 5.1597, + "step": 8805 + }, + { + "epoch": 0.0523717765724617, + "grad_norm": 2.6567695140838623, + "learning_rate": 4.966245939741045e-05, + "loss": 5.0582, + "step": 8806 + }, + { + "epoch": 0.0523777238557427, + "grad_norm": 3.0211431980133057, + "learning_rate": 4.966238289589147e-05, + "loss": 4.8331, + "step": 8807 + }, + { + "epoch": 0.052383671139023695, + "grad_norm": 2.9341561794281006, + "learning_rate": 4.9662306385763114e-05, + "loss": 4.8482, + "step": 8808 + }, + { + "epoch": 0.05238961842230469, + "grad_norm": 2.781118631362915, + "learning_rate": 4.966222986702539e-05, + "loss": 4.9199, + "step": 8809 + }, + { + "epoch": 0.052395565705585685, + "grad_norm": 2.459233283996582, + "learning_rate": 4.9662153339678344e-05, + "loss": 5.4156, + "step": 8810 + }, + { + "epoch": 0.05240151298886669, + "grad_norm": 1.9862231016159058, + "learning_rate": 4.966207680372199e-05, + "loss": 5.3937, + "step": 8811 + }, + { + "epoch": 0.05240746027214768, + "grad_norm": 3.3698437213897705, + "learning_rate": 4.966200025915636e-05, + "loss": 4.6231, + "step": 8812 + }, + { + "epoch": 0.05241340755542868, + "grad_norm": 2.9254424571990967, + "learning_rate": 4.9661923705981486e-05, + "loss": 4.5612, + "step": 8813 + }, + { + "epoch": 0.05241935483870968, + "grad_norm": 2.684386968612671, + "learning_rate": 4.966184714419738e-05, + "loss": 4.8646, + "step": 8814 + }, + { + "epoch": 0.052425302121990675, + "grad_norm": 2.812406539916992, + "learning_rate": 4.966177057380409e-05, + "loss": 4.5116, + "step": 8815 + }, + { + "epoch": 0.05243124940527167, + "grad_norm": 2.1739046573638916, + "learning_rate": 4.966169399480162e-05, + "loss": 5.3369, + "step": 8816 + }, + { + "epoch": 0.05243719668855267, + "grad_norm": 2.408341407775879, + "learning_rate": 4.966161740719001e-05, + "loss": 5.0368, + "step": 8817 + }, + { + "epoch": 0.05244314397183367, + "grad_norm": 2.2844927310943604, + "learning_rate": 4.966154081096929e-05, + "loss": 5.0657, + "step": 8818 + }, + { + "epoch": 0.05244909125511466, + "grad_norm": 2.5329723358154297, + "learning_rate": 4.9661464206139475e-05, + "loss": 5.2006, + "step": 8819 + }, + { + "epoch": 0.052455038538395664, + "grad_norm": 2.154224395751953, + "learning_rate": 4.9661387592700595e-05, + "loss": 5.238, + "step": 8820 + }, + { + "epoch": 0.05246098582167666, + "grad_norm": 2.1069657802581787, + "learning_rate": 4.966131097065269e-05, + "loss": 5.0894, + "step": 8821 + }, + { + "epoch": 0.052466933104957654, + "grad_norm": 2.165954351425171, + "learning_rate": 4.9661234339995763e-05, + "loss": 5.1148, + "step": 8822 + }, + { + "epoch": 0.052472880388238656, + "grad_norm": 1.8859459161758423, + "learning_rate": 4.9661157700729866e-05, + "loss": 5.1703, + "step": 8823 + }, + { + "epoch": 0.05247882767151965, + "grad_norm": 1.9739452600479126, + "learning_rate": 4.9661081052855004e-05, + "loss": 5.3978, + "step": 8824 + }, + { + "epoch": 0.052484774954800646, + "grad_norm": 1.95566987991333, + "learning_rate": 4.966100439637122e-05, + "loss": 5.3592, + "step": 8825 + }, + { + "epoch": 0.05249072223808164, + "grad_norm": 1.8613550662994385, + "learning_rate": 4.966092773127853e-05, + "loss": 5.3746, + "step": 8826 + }, + { + "epoch": 0.05249666952136264, + "grad_norm": 2.001701831817627, + "learning_rate": 4.9660851057576966e-05, + "loss": 5.3269, + "step": 8827 + }, + { + "epoch": 0.05250261680464364, + "grad_norm": 1.8846383094787598, + "learning_rate": 4.9660774375266556e-05, + "loss": 5.7906, + "step": 8828 + }, + { + "epoch": 0.052508564087924633, + "grad_norm": 1.982998251914978, + "learning_rate": 4.966069768434732e-05, + "loss": 5.6609, + "step": 8829 + }, + { + "epoch": 0.052514511371205636, + "grad_norm": 2.3036038875579834, + "learning_rate": 4.9660620984819294e-05, + "loss": 5.6172, + "step": 8830 + }, + { + "epoch": 0.05252045865448663, + "grad_norm": 1.9227113723754883, + "learning_rate": 4.9660544276682496e-05, + "loss": 5.4734, + "step": 8831 + }, + { + "epoch": 0.052526405937767626, + "grad_norm": 2.038203716278076, + "learning_rate": 4.9660467559936964e-05, + "loss": 5.6484, + "step": 8832 + }, + { + "epoch": 0.05253235322104863, + "grad_norm": 2.217108964920044, + "learning_rate": 4.9660390834582704e-05, + "loss": 5.4064, + "step": 8833 + }, + { + "epoch": 0.05253830050432962, + "grad_norm": 2.4458765983581543, + "learning_rate": 4.966031410061976e-05, + "loss": 5.605, + "step": 8834 + }, + { + "epoch": 0.05254424778761062, + "grad_norm": 2.2767014503479004, + "learning_rate": 4.966023735804817e-05, + "loss": 5.4258, + "step": 8835 + }, + { + "epoch": 0.05255019507089162, + "grad_norm": 2.3594579696655273, + "learning_rate": 4.9660160606867936e-05, + "loss": 5.5138, + "step": 8836 + }, + { + "epoch": 0.052556142354172615, + "grad_norm": 1.8961461782455444, + "learning_rate": 4.966008384707909e-05, + "loss": 5.9879, + "step": 8837 + }, + { + "epoch": 0.05256208963745361, + "grad_norm": 1.824751615524292, + "learning_rate": 4.966000707868167e-05, + "loss": 5.4558, + "step": 8838 + }, + { + "epoch": 0.052568036920734605, + "grad_norm": 2.005291223526001, + "learning_rate": 4.9659930301675694e-05, + "loss": 5.821, + "step": 8839 + }, + { + "epoch": 0.05257398420401561, + "grad_norm": 2.0951414108276367, + "learning_rate": 4.965985351606119e-05, + "loss": 5.2816, + "step": 8840 + }, + { + "epoch": 0.0525799314872966, + "grad_norm": 2.236849069595337, + "learning_rate": 4.9659776721838194e-05, + "loss": 5.4734, + "step": 8841 + }, + { + "epoch": 0.0525858787705776, + "grad_norm": 1.8877390623092651, + "learning_rate": 4.965969991900671e-05, + "loss": 5.2445, + "step": 8842 + }, + { + "epoch": 0.0525918260538586, + "grad_norm": 2.726071834564209, + "learning_rate": 4.9659623107566785e-05, + "loss": 5.6059, + "step": 8843 + }, + { + "epoch": 0.052597773337139594, + "grad_norm": 2.279759168624878, + "learning_rate": 4.965954628751844e-05, + "loss": 5.6755, + "step": 8844 + }, + { + "epoch": 0.05260372062042059, + "grad_norm": 1.9941623210906982, + "learning_rate": 4.965946945886171e-05, + "loss": 5.5222, + "step": 8845 + }, + { + "epoch": 0.05260966790370159, + "grad_norm": 2.0556750297546387, + "learning_rate": 4.965939262159661e-05, + "loss": 5.6064, + "step": 8846 + }, + { + "epoch": 0.05261561518698259, + "grad_norm": 1.9260958433151245, + "learning_rate": 4.965931577572317e-05, + "loss": 5.6264, + "step": 8847 + }, + { + "epoch": 0.05262156247026358, + "grad_norm": 2.1252758502960205, + "learning_rate": 4.9659238921241413e-05, + "loss": 5.9832, + "step": 8848 + }, + { + "epoch": 0.052627509753544584, + "grad_norm": 1.8081480264663696, + "learning_rate": 4.9659162058151377e-05, + "loss": 5.4391, + "step": 8849 + }, + { + "epoch": 0.05263345703682558, + "grad_norm": 1.8439849615097046, + "learning_rate": 4.965908518645308e-05, + "loss": 5.5351, + "step": 8850 + }, + { + "epoch": 0.052639404320106574, + "grad_norm": 2.1782681941986084, + "learning_rate": 4.9659008306146556e-05, + "loss": 5.9692, + "step": 8851 + }, + { + "epoch": 0.052645351603387576, + "grad_norm": 2.0206944942474365, + "learning_rate": 4.965893141723182e-05, + "loss": 5.4736, + "step": 8852 + }, + { + "epoch": 0.05265129888666857, + "grad_norm": 2.283517360687256, + "learning_rate": 4.965885451970891e-05, + "loss": 5.4504, + "step": 8853 + }, + { + "epoch": 0.052657246169949566, + "grad_norm": 2.701608180999756, + "learning_rate": 4.965877761357784e-05, + "loss": 5.318, + "step": 8854 + }, + { + "epoch": 0.05266319345323056, + "grad_norm": 2.8494722843170166, + "learning_rate": 4.965870069883866e-05, + "loss": 4.9835, + "step": 8855 + }, + { + "epoch": 0.05266914073651156, + "grad_norm": 2.0555408000946045, + "learning_rate": 4.965862377549137e-05, + "loss": 5.7587, + "step": 8856 + }, + { + "epoch": 0.05267508801979256, + "grad_norm": 2.3476004600524902, + "learning_rate": 4.9658546843536014e-05, + "loss": 5.8775, + "step": 8857 + }, + { + "epoch": 0.05268103530307355, + "grad_norm": 1.8152700662612915, + "learning_rate": 4.965846990297262e-05, + "loss": 5.6274, + "step": 8858 + }, + { + "epoch": 0.052686982586354555, + "grad_norm": 2.1541671752929688, + "learning_rate": 4.965839295380119e-05, + "loss": 5.6786, + "step": 8859 + }, + { + "epoch": 0.05269292986963555, + "grad_norm": 2.1708984375, + "learning_rate": 4.965831599602179e-05, + "loss": 5.8817, + "step": 8860 + }, + { + "epoch": 0.052698877152916546, + "grad_norm": 1.6558966636657715, + "learning_rate": 4.9658239029634415e-05, + "loss": 5.5375, + "step": 8861 + }, + { + "epoch": 0.05270482443619755, + "grad_norm": 2.1165130138397217, + "learning_rate": 4.9658162054639115e-05, + "loss": 5.5936, + "step": 8862 + }, + { + "epoch": 0.05271077171947854, + "grad_norm": 2.4143176078796387, + "learning_rate": 4.9658085071035893e-05, + "loss": 5.71, + "step": 8863 + }, + { + "epoch": 0.05271671900275954, + "grad_norm": 1.9471622705459595, + "learning_rate": 4.965800807882479e-05, + "loss": 5.7588, + "step": 8864 + }, + { + "epoch": 0.05272266628604054, + "grad_norm": 2.2014408111572266, + "learning_rate": 4.9657931078005835e-05, + "loss": 5.7699, + "step": 8865 + }, + { + "epoch": 0.052728613569321535, + "grad_norm": 1.7588191032409668, + "learning_rate": 4.965785406857905e-05, + "loss": 5.3921, + "step": 8866 + }, + { + "epoch": 0.05273456085260253, + "grad_norm": 1.835635781288147, + "learning_rate": 4.965777705054446e-05, + "loss": 5.1531, + "step": 8867 + }, + { + "epoch": 0.052740508135883525, + "grad_norm": 2.3071937561035156, + "learning_rate": 4.96577000239021e-05, + "loss": 5.5926, + "step": 8868 + }, + { + "epoch": 0.05274645541916453, + "grad_norm": 2.195712089538574, + "learning_rate": 4.9657622988651995e-05, + "loss": 5.4579, + "step": 8869 + }, + { + "epoch": 0.05275240270244552, + "grad_norm": 2.273738145828247, + "learning_rate": 4.9657545944794156e-05, + "loss": 5.6138, + "step": 8870 + }, + { + "epoch": 0.05275834998572652, + "grad_norm": 2.208343982696533, + "learning_rate": 4.9657468892328626e-05, + "loss": 5.5508, + "step": 8871 + }, + { + "epoch": 0.05276429726900752, + "grad_norm": 2.2111566066741943, + "learning_rate": 4.965739183125544e-05, + "loss": 5.7044, + "step": 8872 + }, + { + "epoch": 0.052770244552288514, + "grad_norm": 1.7516666650772095, + "learning_rate": 4.96573147615746e-05, + "loss": 5.4357, + "step": 8873 + }, + { + "epoch": 0.05277619183556951, + "grad_norm": 2.0703322887420654, + "learning_rate": 4.9657237683286155e-05, + "loss": 5.5383, + "step": 8874 + }, + { + "epoch": 0.05278213911885051, + "grad_norm": 1.796243667602539, + "learning_rate": 4.965716059639012e-05, + "loss": 5.5024, + "step": 8875 + }, + { + "epoch": 0.05278808640213151, + "grad_norm": 2.322397232055664, + "learning_rate": 4.9657083500886526e-05, + "loss": 5.8814, + "step": 8876 + }, + { + "epoch": 0.0527940336854125, + "grad_norm": 2.6743311882019043, + "learning_rate": 4.96570063967754e-05, + "loss": 5.4989, + "step": 8877 + }, + { + "epoch": 0.052799980968693504, + "grad_norm": 2.4381649494171143, + "learning_rate": 4.965692928405676e-05, + "loss": 5.5807, + "step": 8878 + }, + { + "epoch": 0.0528059282519745, + "grad_norm": 2.3703296184539795, + "learning_rate": 4.9656852162730646e-05, + "loss": 5.5586, + "step": 8879 + }, + { + "epoch": 0.052811875535255494, + "grad_norm": 1.7828437089920044, + "learning_rate": 4.9656775032797075e-05, + "loss": 5.2553, + "step": 8880 + }, + { + "epoch": 0.052817822818536496, + "grad_norm": 1.730290412902832, + "learning_rate": 4.9656697894256085e-05, + "loss": 5.3558, + "step": 8881 + }, + { + "epoch": 0.05282377010181749, + "grad_norm": 1.6909739971160889, + "learning_rate": 4.9656620747107694e-05, + "loss": 5.4397, + "step": 8882 + }, + { + "epoch": 0.052829717385098486, + "grad_norm": 1.9772145748138428, + "learning_rate": 4.965654359135193e-05, + "loss": 5.5786, + "step": 8883 + }, + { + "epoch": 0.05283566466837948, + "grad_norm": 1.8624964952468872, + "learning_rate": 4.965646642698883e-05, + "loss": 5.5466, + "step": 8884 + }, + { + "epoch": 0.05284161195166048, + "grad_norm": 1.7061936855316162, + "learning_rate": 4.96563892540184e-05, + "loss": 5.3439, + "step": 8885 + }, + { + "epoch": 0.05284755923494148, + "grad_norm": 1.715483546257019, + "learning_rate": 4.965631207244069e-05, + "loss": 5.2732, + "step": 8886 + }, + { + "epoch": 0.05285350651822247, + "grad_norm": 1.7801883220672607, + "learning_rate": 4.965623488225571e-05, + "loss": 5.2427, + "step": 8887 + }, + { + "epoch": 0.052859453801503475, + "grad_norm": 1.5122452974319458, + "learning_rate": 4.9656157683463495e-05, + "loss": 5.2812, + "step": 8888 + }, + { + "epoch": 0.05286540108478447, + "grad_norm": 1.878077507019043, + "learning_rate": 4.965608047606407e-05, + "loss": 5.6385, + "step": 8889 + }, + { + "epoch": 0.052871348368065466, + "grad_norm": 2.0781304836273193, + "learning_rate": 4.965600326005746e-05, + "loss": 5.3345, + "step": 8890 + }, + { + "epoch": 0.05287729565134647, + "grad_norm": 1.953302264213562, + "learning_rate": 4.965592603544369e-05, + "loss": 5.2694, + "step": 8891 + }, + { + "epoch": 0.05288324293462746, + "grad_norm": 1.9993265867233276, + "learning_rate": 4.96558488022228e-05, + "loss": 5.3323, + "step": 8892 + }, + { + "epoch": 0.05288919021790846, + "grad_norm": 1.7653480768203735, + "learning_rate": 4.96557715603948e-05, + "loss": 5.389, + "step": 8893 + }, + { + "epoch": 0.05289513750118946, + "grad_norm": 1.8843438625335693, + "learning_rate": 4.965569430995973e-05, + "loss": 5.3334, + "step": 8894 + }, + { + "epoch": 0.052901084784470455, + "grad_norm": 1.6673407554626465, + "learning_rate": 4.9655617050917616e-05, + "loss": 5.4469, + "step": 8895 + }, + { + "epoch": 0.05290703206775145, + "grad_norm": 1.8208844661712646, + "learning_rate": 4.9655539783268476e-05, + "loss": 5.6288, + "step": 8896 + }, + { + "epoch": 0.052912979351032445, + "grad_norm": 1.755162000656128, + "learning_rate": 4.965546250701234e-05, + "loss": 5.4388, + "step": 8897 + }, + { + "epoch": 0.05291892663431345, + "grad_norm": 1.9435405731201172, + "learning_rate": 4.965538522214924e-05, + "loss": 5.5877, + "step": 8898 + }, + { + "epoch": 0.05292487391759444, + "grad_norm": 1.8579509258270264, + "learning_rate": 4.9655307928679196e-05, + "loss": 5.4405, + "step": 8899 + }, + { + "epoch": 0.05293082120087544, + "grad_norm": 1.8897236585617065, + "learning_rate": 4.9655230626602246e-05, + "loss": 5.2931, + "step": 8900 + }, + { + "epoch": 0.05293676848415644, + "grad_norm": 1.928133487701416, + "learning_rate": 4.9655153315918403e-05, + "loss": 5.2345, + "step": 8901 + }, + { + "epoch": 0.052942715767437434, + "grad_norm": 1.8830339908599854, + "learning_rate": 4.96550759966277e-05, + "loss": 5.3288, + "step": 8902 + }, + { + "epoch": 0.05294866305071843, + "grad_norm": 1.6774102449417114, + "learning_rate": 4.9654998668730167e-05, + "loss": 5.2939, + "step": 8903 + }, + { + "epoch": 0.05295461033399943, + "grad_norm": 1.7440418004989624, + "learning_rate": 4.9654921332225826e-05, + "loss": 5.4663, + "step": 8904 + }, + { + "epoch": 0.05296055761728043, + "grad_norm": 1.92295241355896, + "learning_rate": 4.965484398711471e-05, + "loss": 5.556, + "step": 8905 + }, + { + "epoch": 0.05296650490056142, + "grad_norm": 1.5319017171859741, + "learning_rate": 4.965476663339684e-05, + "loss": 5.5267, + "step": 8906 + }, + { + "epoch": 0.052972452183842424, + "grad_norm": 1.7626374959945679, + "learning_rate": 4.9654689271072255e-05, + "loss": 5.3774, + "step": 8907 + }, + { + "epoch": 0.05297839946712342, + "grad_norm": 1.745743989944458, + "learning_rate": 4.965461190014096e-05, + "loss": 5.4877, + "step": 8908 + }, + { + "epoch": 0.052984346750404414, + "grad_norm": 1.6091177463531494, + "learning_rate": 4.9654534520603e-05, + "loss": 5.2969, + "step": 8909 + }, + { + "epoch": 0.052990294033685416, + "grad_norm": 1.7392489910125732, + "learning_rate": 4.96544571324584e-05, + "loss": 5.4247, + "step": 8910 + }, + { + "epoch": 0.05299624131696641, + "grad_norm": 1.9275293350219727, + "learning_rate": 4.965437973570718e-05, + "loss": 5.2184, + "step": 8911 + }, + { + "epoch": 0.053002188600247406, + "grad_norm": 1.6901222467422485, + "learning_rate": 4.965430233034937e-05, + "loss": 5.1459, + "step": 8912 + }, + { + "epoch": 0.0530081358835284, + "grad_norm": 1.9212596416473389, + "learning_rate": 4.965422491638499e-05, + "loss": 5.2439, + "step": 8913 + }, + { + "epoch": 0.0530140831668094, + "grad_norm": 1.814706802368164, + "learning_rate": 4.965414749381409e-05, + "loss": 5.5608, + "step": 8914 + }, + { + "epoch": 0.0530200304500904, + "grad_norm": 1.7997081279754639, + "learning_rate": 4.965407006263668e-05, + "loss": 5.6099, + "step": 8915 + }, + { + "epoch": 0.05302597773337139, + "grad_norm": 1.8545546531677246, + "learning_rate": 4.9653992622852777e-05, + "loss": 5.5844, + "step": 8916 + }, + { + "epoch": 0.053031925016652395, + "grad_norm": 1.665958285331726, + "learning_rate": 4.965391517446243e-05, + "loss": 5.4967, + "step": 8917 + }, + { + "epoch": 0.05303787229993339, + "grad_norm": 1.6157240867614746, + "learning_rate": 4.9653837717465655e-05, + "loss": 5.2523, + "step": 8918 + }, + { + "epoch": 0.053043819583214386, + "grad_norm": 1.9782540798187256, + "learning_rate": 4.965376025186248e-05, + "loss": 5.2384, + "step": 8919 + }, + { + "epoch": 0.05304976686649539, + "grad_norm": 2.0229971408843994, + "learning_rate": 4.9653682777652925e-05, + "loss": 5.1703, + "step": 8920 + }, + { + "epoch": 0.05305571414977638, + "grad_norm": 1.8299061059951782, + "learning_rate": 4.965360529483703e-05, + "loss": 5.0257, + "step": 8921 + }, + { + "epoch": 0.05306166143305738, + "grad_norm": 1.9080857038497925, + "learning_rate": 4.965352780341482e-05, + "loss": 5.2516, + "step": 8922 + }, + { + "epoch": 0.05306760871633838, + "grad_norm": 1.9998538494110107, + "learning_rate": 4.965345030338631e-05, + "loss": 5.1991, + "step": 8923 + }, + { + "epoch": 0.053073555999619375, + "grad_norm": 1.7606618404388428, + "learning_rate": 4.965337279475154e-05, + "loss": 5.2194, + "step": 8924 + }, + { + "epoch": 0.05307950328290037, + "grad_norm": 1.9633625745773315, + "learning_rate": 4.9653295277510525e-05, + "loss": 5.2463, + "step": 8925 + }, + { + "epoch": 0.053085450566181365, + "grad_norm": 1.9879587888717651, + "learning_rate": 4.9653217751663306e-05, + "loss": 5.2737, + "step": 8926 + }, + { + "epoch": 0.05309139784946237, + "grad_norm": 1.836289405822754, + "learning_rate": 4.965314021720991e-05, + "loss": 5.1157, + "step": 8927 + }, + { + "epoch": 0.05309734513274336, + "grad_norm": 1.8526496887207031, + "learning_rate": 4.965306267415035e-05, + "loss": 5.6541, + "step": 8928 + }, + { + "epoch": 0.05310329241602436, + "grad_norm": 1.9928539991378784, + "learning_rate": 4.965298512248466e-05, + "loss": 5.194, + "step": 8929 + }, + { + "epoch": 0.05310923969930536, + "grad_norm": 1.601536512374878, + "learning_rate": 4.9652907562212867e-05, + "loss": 5.285, + "step": 8930 + }, + { + "epoch": 0.053115186982586354, + "grad_norm": 1.8940081596374512, + "learning_rate": 4.9652829993335e-05, + "loss": 5.1791, + "step": 8931 + }, + { + "epoch": 0.05312113426586735, + "grad_norm": 1.7984519004821777, + "learning_rate": 4.9652752415851085e-05, + "loss": 5.2225, + "step": 8932 + }, + { + "epoch": 0.05312708154914835, + "grad_norm": 1.7474113702774048, + "learning_rate": 4.965267482976115e-05, + "loss": 5.0099, + "step": 8933 + }, + { + "epoch": 0.053133028832429346, + "grad_norm": 1.7044427394866943, + "learning_rate": 4.9652597235065214e-05, + "loss": 5.1456, + "step": 8934 + }, + { + "epoch": 0.05313897611571034, + "grad_norm": 1.5422965288162231, + "learning_rate": 4.9652519631763316e-05, + "loss": 5.0714, + "step": 8935 + }, + { + "epoch": 0.053144923398991344, + "grad_norm": 1.6831375360488892, + "learning_rate": 4.965244201985548e-05, + "loss": 5.0742, + "step": 8936 + }, + { + "epoch": 0.05315087068227234, + "grad_norm": 1.7648097276687622, + "learning_rate": 4.9652364399341734e-05, + "loss": 5.1108, + "step": 8937 + }, + { + "epoch": 0.053156817965553334, + "grad_norm": 1.669393539428711, + "learning_rate": 4.965228677022209e-05, + "loss": 5.1801, + "step": 8938 + }, + { + "epoch": 0.053162765248834336, + "grad_norm": 2.0252909660339355, + "learning_rate": 4.96522091324966e-05, + "loss": 5.3955, + "step": 8939 + }, + { + "epoch": 0.05316871253211533, + "grad_norm": 1.686355710029602, + "learning_rate": 4.965213148616527e-05, + "loss": 5.2626, + "step": 8940 + }, + { + "epoch": 0.053174659815396326, + "grad_norm": 1.7601011991500854, + "learning_rate": 4.965205383122814e-05, + "loss": 5.1603, + "step": 8941 + }, + { + "epoch": 0.05318060709867732, + "grad_norm": 1.7249791622161865, + "learning_rate": 4.9651976167685235e-05, + "loss": 5.4245, + "step": 8942 + }, + { + "epoch": 0.05318655438195832, + "grad_norm": 1.869367003440857, + "learning_rate": 4.9651898495536574e-05, + "loss": 5.2269, + "step": 8943 + }, + { + "epoch": 0.05319250166523932, + "grad_norm": 1.8296380043029785, + "learning_rate": 4.965182081478219e-05, + "loss": 5.3236, + "step": 8944 + }, + { + "epoch": 0.05319844894852031, + "grad_norm": 1.8211008310317993, + "learning_rate": 4.9651743125422115e-05, + "loss": 5.269, + "step": 8945 + }, + { + "epoch": 0.053204396231801315, + "grad_norm": 1.868295431137085, + "learning_rate": 4.965166542745637e-05, + "loss": 5.2733, + "step": 8946 + }, + { + "epoch": 0.05321034351508231, + "grad_norm": 1.6603426933288574, + "learning_rate": 4.965158772088498e-05, + "loss": 5.2685, + "step": 8947 + }, + { + "epoch": 0.053216290798363305, + "grad_norm": 1.680565357208252, + "learning_rate": 4.965151000570798e-05, + "loss": 5.4452, + "step": 8948 + }, + { + "epoch": 0.05322223808164431, + "grad_norm": 1.6473147869110107, + "learning_rate": 4.9651432281925394e-05, + "loss": 5.4476, + "step": 8949 + }, + { + "epoch": 0.0532281853649253, + "grad_norm": 1.5291423797607422, + "learning_rate": 4.965135454953724e-05, + "loss": 5.4617, + "step": 8950 + }, + { + "epoch": 0.0532341326482063, + "grad_norm": 1.4708455801010132, + "learning_rate": 4.965127680854356e-05, + "loss": 5.5431, + "step": 8951 + }, + { + "epoch": 0.0532400799314873, + "grad_norm": 1.4297362565994263, + "learning_rate": 4.9651199058944366e-05, + "loss": 5.431, + "step": 8952 + }, + { + "epoch": 0.053246027214768295, + "grad_norm": 1.726123571395874, + "learning_rate": 4.96511213007397e-05, + "loss": 5.2801, + "step": 8953 + }, + { + "epoch": 0.05325197449804929, + "grad_norm": 1.7977174520492554, + "learning_rate": 4.9651043533929584e-05, + "loss": 5.3273, + "step": 8954 + }, + { + "epoch": 0.053257921781330285, + "grad_norm": 1.8125461339950562, + "learning_rate": 4.9650965758514034e-05, + "loss": 5.3135, + "step": 8955 + }, + { + "epoch": 0.05326386906461129, + "grad_norm": 1.4925352334976196, + "learning_rate": 4.965088797449309e-05, + "loss": 5.1454, + "step": 8956 + }, + { + "epoch": 0.05326981634789228, + "grad_norm": 1.6977181434631348, + "learning_rate": 4.965081018186678e-05, + "loss": 5.3207, + "step": 8957 + }, + { + "epoch": 0.05327576363117328, + "grad_norm": 1.7767595052719116, + "learning_rate": 4.965073238063512e-05, + "loss": 5.203, + "step": 8958 + }, + { + "epoch": 0.05328171091445428, + "grad_norm": 1.53665292263031, + "learning_rate": 4.965065457079815e-05, + "loss": 5.3088, + "step": 8959 + }, + { + "epoch": 0.053287658197735274, + "grad_norm": 1.724476933479309, + "learning_rate": 4.965057675235589e-05, + "loss": 5.2628, + "step": 8960 + }, + { + "epoch": 0.05329360548101627, + "grad_norm": 1.7339463233947754, + "learning_rate": 4.965049892530837e-05, + "loss": 5.3174, + "step": 8961 + }, + { + "epoch": 0.05329955276429727, + "grad_norm": 1.8414005041122437, + "learning_rate": 4.965042108965561e-05, + "loss": 5.2121, + "step": 8962 + }, + { + "epoch": 0.053305500047578266, + "grad_norm": 1.7969903945922852, + "learning_rate": 4.9650343245397655e-05, + "loss": 5.0947, + "step": 8963 + }, + { + "epoch": 0.05331144733085926, + "grad_norm": 1.573320746421814, + "learning_rate": 4.965026539253451e-05, + "loss": 5.0624, + "step": 8964 + }, + { + "epoch": 0.053317394614140264, + "grad_norm": 1.7296351194381714, + "learning_rate": 4.9650187531066204e-05, + "loss": 5.5497, + "step": 8965 + }, + { + "epoch": 0.05332334189742126, + "grad_norm": 1.931847095489502, + "learning_rate": 4.9650109660992784e-05, + "loss": 5.537, + "step": 8966 + }, + { + "epoch": 0.053329289180702254, + "grad_norm": 1.8911564350128174, + "learning_rate": 4.965003178231427e-05, + "loss": 5.4891, + "step": 8967 + }, + { + "epoch": 0.053335236463983256, + "grad_norm": 1.933401107788086, + "learning_rate": 4.964995389503067e-05, + "loss": 5.3157, + "step": 8968 + }, + { + "epoch": 0.05334118374726425, + "grad_norm": 1.8299031257629395, + "learning_rate": 4.964987599914204e-05, + "loss": 5.2955, + "step": 8969 + }, + { + "epoch": 0.053347131030545246, + "grad_norm": 1.5823233127593994, + "learning_rate": 4.964979809464838e-05, + "loss": 5.2708, + "step": 8970 + }, + { + "epoch": 0.05335307831382624, + "grad_norm": 1.602689504623413, + "learning_rate": 4.9649720181549737e-05, + "loss": 5.3646, + "step": 8971 + }, + { + "epoch": 0.05335902559710724, + "grad_norm": 2.2379884719848633, + "learning_rate": 4.964964225984613e-05, + "loss": 5.5453, + "step": 8972 + }, + { + "epoch": 0.05336497288038824, + "grad_norm": 2.2210440635681152, + "learning_rate": 4.964956432953759e-05, + "loss": 5.2123, + "step": 8973 + }, + { + "epoch": 0.05337092016366923, + "grad_norm": 2.4450249671936035, + "learning_rate": 4.964948639062413e-05, + "loss": 5.172, + "step": 8974 + }, + { + "epoch": 0.053376867446950235, + "grad_norm": 1.7727516889572144, + "learning_rate": 4.9649408443105806e-05, + "loss": 5.3447, + "step": 8975 + }, + { + "epoch": 0.05338281473023123, + "grad_norm": 1.8239831924438477, + "learning_rate": 4.964933048698262e-05, + "loss": 5.3628, + "step": 8976 + }, + { + "epoch": 0.053388762013512225, + "grad_norm": 1.9517360925674438, + "learning_rate": 4.964925252225461e-05, + "loss": 5.6118, + "step": 8977 + }, + { + "epoch": 0.05339470929679323, + "grad_norm": 2.1735262870788574, + "learning_rate": 4.9649174548921796e-05, + "loss": 5.7332, + "step": 8978 + }, + { + "epoch": 0.05340065658007422, + "grad_norm": 1.4132062196731567, + "learning_rate": 4.964909656698421e-05, + "loss": 5.8078, + "step": 8979 + }, + { + "epoch": 0.05340660386335522, + "grad_norm": 1.5568846464157104, + "learning_rate": 4.964901857644188e-05, + "loss": 5.6328, + "step": 8980 + }, + { + "epoch": 0.05341255114663622, + "grad_norm": 1.6015586853027344, + "learning_rate": 4.964894057729484e-05, + "loss": 5.3738, + "step": 8981 + }, + { + "epoch": 0.053418498429917215, + "grad_norm": 1.492748737335205, + "learning_rate": 4.9648862569543105e-05, + "loss": 5.4336, + "step": 8982 + }, + { + "epoch": 0.05342444571319821, + "grad_norm": 1.9008845090866089, + "learning_rate": 4.96487845531867e-05, + "loss": 5.455, + "step": 8983 + }, + { + "epoch": 0.053430392996479205, + "grad_norm": 1.9590948820114136, + "learning_rate": 4.9648706528225664e-05, + "loss": 5.3308, + "step": 8984 + }, + { + "epoch": 0.05343634027976021, + "grad_norm": 1.9980428218841553, + "learning_rate": 4.964862849466002e-05, + "loss": 5.3777, + "step": 8985 + }, + { + "epoch": 0.0534422875630412, + "grad_norm": 1.769711971282959, + "learning_rate": 4.964855045248979e-05, + "loss": 5.4451, + "step": 8986 + }, + { + "epoch": 0.0534482348463222, + "grad_norm": 1.769977331161499, + "learning_rate": 4.964847240171502e-05, + "loss": 5.277, + "step": 8987 + }, + { + "epoch": 0.0534541821296032, + "grad_norm": 1.6647396087646484, + "learning_rate": 4.9648394342335705e-05, + "loss": 5.4655, + "step": 8988 + }, + { + "epoch": 0.053460129412884194, + "grad_norm": 1.861554503440857, + "learning_rate": 4.9648316274351906e-05, + "loss": 5.308, + "step": 8989 + }, + { + "epoch": 0.05346607669616519, + "grad_norm": 1.9457745552062988, + "learning_rate": 4.964823819776362e-05, + "loss": 6.2361, + "step": 8990 + }, + { + "epoch": 0.05347202397944619, + "grad_norm": 1.7702157497406006, + "learning_rate": 4.9648160112570896e-05, + "loss": 5.366, + "step": 8991 + }, + { + "epoch": 0.053477971262727186, + "grad_norm": 2.0074565410614014, + "learning_rate": 4.964808201877375e-05, + "loss": 5.3598, + "step": 8992 + }, + { + "epoch": 0.05348391854600818, + "grad_norm": 1.8686721324920654, + "learning_rate": 4.964800391637222e-05, + "loss": 5.4607, + "step": 8993 + }, + { + "epoch": 0.053489865829289183, + "grad_norm": 1.9749736785888672, + "learning_rate": 4.964792580536632e-05, + "loss": 5.3734, + "step": 8994 + }, + { + "epoch": 0.05349581311257018, + "grad_norm": 1.8435015678405762, + "learning_rate": 4.964784768575609e-05, + "loss": 5.3815, + "step": 8995 + }, + { + "epoch": 0.053501760395851174, + "grad_norm": 2.01983380317688, + "learning_rate": 4.9647769557541546e-05, + "loss": 5.4089, + "step": 8996 + }, + { + "epoch": 0.053507707679132176, + "grad_norm": 2.014798402786255, + "learning_rate": 4.964769142072272e-05, + "loss": 5.3906, + "step": 8997 + }, + { + "epoch": 0.05351365496241317, + "grad_norm": 1.8822753429412842, + "learning_rate": 4.9647613275299644e-05, + "loss": 5.3598, + "step": 8998 + }, + { + "epoch": 0.053519602245694166, + "grad_norm": 1.6534459590911865, + "learning_rate": 4.9647535121272334e-05, + "loss": 5.4577, + "step": 8999 + }, + { + "epoch": 0.05352554952897516, + "grad_norm": 1.6497015953063965, + "learning_rate": 4.964745695864083e-05, + "loss": 5.3915, + "step": 9000 + }, + { + "epoch": 0.05353149681225616, + "grad_norm": 1.5535780191421509, + "learning_rate": 4.964737878740515e-05, + "loss": 5.2444, + "step": 9001 + }, + { + "epoch": 0.05353744409553716, + "grad_norm": 1.6840674877166748, + "learning_rate": 4.964730060756533e-05, + "loss": 5.3439, + "step": 9002 + }, + { + "epoch": 0.05354339137881815, + "grad_norm": 1.7857226133346558, + "learning_rate": 4.9647222419121384e-05, + "loss": 5.3231, + "step": 9003 + }, + { + "epoch": 0.053549338662099155, + "grad_norm": 1.6067994832992554, + "learning_rate": 4.964714422207335e-05, + "loss": 5.4019, + "step": 9004 + }, + { + "epoch": 0.05355528594538015, + "grad_norm": 1.7026724815368652, + "learning_rate": 4.964706601642125e-05, + "loss": 5.2716, + "step": 9005 + }, + { + "epoch": 0.053561233228661145, + "grad_norm": 1.632804036140442, + "learning_rate": 4.964698780216512e-05, + "loss": 5.4132, + "step": 9006 + }, + { + "epoch": 0.05356718051194215, + "grad_norm": 1.6569499969482422, + "learning_rate": 4.964690957930498e-05, + "loss": 5.294, + "step": 9007 + }, + { + "epoch": 0.05357312779522314, + "grad_norm": 1.8141810894012451, + "learning_rate": 4.964683134784086e-05, + "loss": 5.3365, + "step": 9008 + }, + { + "epoch": 0.05357907507850414, + "grad_norm": 1.6555678844451904, + "learning_rate": 4.964675310777278e-05, + "loss": 5.3488, + "step": 9009 + }, + { + "epoch": 0.05358502236178514, + "grad_norm": 1.8363603353500366, + "learning_rate": 4.964667485910078e-05, + "loss": 5.3679, + "step": 9010 + }, + { + "epoch": 0.053590969645066135, + "grad_norm": 1.7839024066925049, + "learning_rate": 4.9646596601824874e-05, + "loss": 5.2514, + "step": 9011 + }, + { + "epoch": 0.05359691692834713, + "grad_norm": 1.8712091445922852, + "learning_rate": 4.96465183359451e-05, + "loss": 5.4313, + "step": 9012 + }, + { + "epoch": 0.053602864211628125, + "grad_norm": 1.9677501916885376, + "learning_rate": 4.964644006146148e-05, + "loss": 5.2442, + "step": 9013 + }, + { + "epoch": 0.05360881149490913, + "grad_norm": 1.8567090034484863, + "learning_rate": 4.964636177837404e-05, + "loss": 5.105, + "step": 9014 + }, + { + "epoch": 0.05361475877819012, + "grad_norm": 1.7319908142089844, + "learning_rate": 4.964628348668281e-05, + "loss": 5.2962, + "step": 9015 + }, + { + "epoch": 0.05362070606147112, + "grad_norm": 1.6412272453308105, + "learning_rate": 4.9646205186387824e-05, + "loss": 5.2302, + "step": 9016 + }, + { + "epoch": 0.05362665334475212, + "grad_norm": 1.9401088953018188, + "learning_rate": 4.96461268774891e-05, + "loss": 5.4425, + "step": 9017 + }, + { + "epoch": 0.053632600628033114, + "grad_norm": 1.7045506238937378, + "learning_rate": 4.964604855998666e-05, + "loss": 5.2325, + "step": 9018 + }, + { + "epoch": 0.05363854791131411, + "grad_norm": 1.8232519626617432, + "learning_rate": 4.9645970233880545e-05, + "loss": 5.5047, + "step": 9019 + }, + { + "epoch": 0.05364449519459511, + "grad_norm": 1.718833327293396, + "learning_rate": 4.964589189917077e-05, + "loss": 5.3323, + "step": 9020 + }, + { + "epoch": 0.053650442477876106, + "grad_norm": 1.608774185180664, + "learning_rate": 4.9645813555857376e-05, + "loss": 5.2374, + "step": 9021 + }, + { + "epoch": 0.0536563897611571, + "grad_norm": 1.6789363622665405, + "learning_rate": 4.964573520394039e-05, + "loss": 5.3291, + "step": 9022 + }, + { + "epoch": 0.0536623370444381, + "grad_norm": 1.6596689224243164, + "learning_rate": 4.964565684341982e-05, + "loss": 5.308, + "step": 9023 + }, + { + "epoch": 0.0536682843277191, + "grad_norm": 1.8141522407531738, + "learning_rate": 4.9645578474295703e-05, + "loss": 5.2033, + "step": 9024 + }, + { + "epoch": 0.053674231611000094, + "grad_norm": 1.428606390953064, + "learning_rate": 4.964550009656808e-05, + "loss": 5.2441, + "step": 9025 + }, + { + "epoch": 0.053680178894281096, + "grad_norm": 1.5033652782440186, + "learning_rate": 4.9645421710236965e-05, + "loss": 5.2132, + "step": 9026 + }, + { + "epoch": 0.05368612617756209, + "grad_norm": 1.7123147249221802, + "learning_rate": 4.9645343315302385e-05, + "loss": 5.3145, + "step": 9027 + }, + { + "epoch": 0.053692073460843086, + "grad_norm": 1.5851943492889404, + "learning_rate": 4.9645264911764376e-05, + "loss": 5.353, + "step": 9028 + }, + { + "epoch": 0.05369802074412408, + "grad_norm": 1.6627084016799927, + "learning_rate": 4.964518649962295e-05, + "loss": 5.1049, + "step": 9029 + }, + { + "epoch": 0.05370396802740508, + "grad_norm": 1.51585853099823, + "learning_rate": 4.964510807887815e-05, + "loss": 4.9433, + "step": 9030 + }, + { + "epoch": 0.05370991531068608, + "grad_norm": 1.7350785732269287, + "learning_rate": 4.964502964952999e-05, + "loss": 5.1761, + "step": 9031 + }, + { + "epoch": 0.05371586259396707, + "grad_norm": 1.925410509109497, + "learning_rate": 4.964495121157852e-05, + "loss": 5.0528, + "step": 9032 + }, + { + "epoch": 0.053721809877248075, + "grad_norm": 1.794162631034851, + "learning_rate": 4.964487276502374e-05, + "loss": 5.2009, + "step": 9033 + }, + { + "epoch": 0.05372775716052907, + "grad_norm": 1.6729109287261963, + "learning_rate": 4.964479430986569e-05, + "loss": 5.16, + "step": 9034 + }, + { + "epoch": 0.053733704443810065, + "grad_norm": 1.8543394804000854, + "learning_rate": 4.9644715846104406e-05, + "loss": 5.3545, + "step": 9035 + }, + { + "epoch": 0.05373965172709107, + "grad_norm": 1.6876883506774902, + "learning_rate": 4.96446373737399e-05, + "loss": 5.2074, + "step": 9036 + }, + { + "epoch": 0.05374559901037206, + "grad_norm": 1.816701054573059, + "learning_rate": 4.9644558892772205e-05, + "loss": 5.154, + "step": 9037 + }, + { + "epoch": 0.05375154629365306, + "grad_norm": 1.471283197402954, + "learning_rate": 4.964448040320135e-05, + "loss": 5.2577, + "step": 9038 + }, + { + "epoch": 0.05375749357693406, + "grad_norm": 1.5764297246932983, + "learning_rate": 4.964440190502736e-05, + "loss": 5.0115, + "step": 9039 + }, + { + "epoch": 0.053763440860215055, + "grad_norm": 1.6854795217514038, + "learning_rate": 4.964432339825027e-05, + "loss": 5.1957, + "step": 9040 + }, + { + "epoch": 0.05376938814349605, + "grad_norm": 1.889570951461792, + "learning_rate": 4.964424488287009e-05, + "loss": 5.1229, + "step": 9041 + }, + { + "epoch": 0.05377533542677705, + "grad_norm": 1.7528218030929565, + "learning_rate": 4.964416635888687e-05, + "loss": 5.0002, + "step": 9042 + }, + { + "epoch": 0.05378128271005805, + "grad_norm": 1.68081796169281, + "learning_rate": 4.964408782630062e-05, + "loss": 5.0567, + "step": 9043 + }, + { + "epoch": 0.05378722999333904, + "grad_norm": 1.6083979606628418, + "learning_rate": 4.9644009285111384e-05, + "loss": 5.0775, + "step": 9044 + }, + { + "epoch": 0.05379317727662004, + "grad_norm": 1.676720380783081, + "learning_rate": 4.9643930735319164e-05, + "loss": 5.0446, + "step": 9045 + }, + { + "epoch": 0.05379912455990104, + "grad_norm": 1.6502453088760376, + "learning_rate": 4.964385217692401e-05, + "loss": 5.3751, + "step": 9046 + }, + { + "epoch": 0.053805071843182034, + "grad_norm": 1.9226343631744385, + "learning_rate": 4.9643773609925935e-05, + "loss": 5.2442, + "step": 9047 + }, + { + "epoch": 0.05381101912646303, + "grad_norm": 1.8054014444351196, + "learning_rate": 4.964369503432498e-05, + "loss": 5.4844, + "step": 9048 + }, + { + "epoch": 0.05381696640974403, + "grad_norm": 1.5151008367538452, + "learning_rate": 4.9643616450121166e-05, + "loss": 5.2834, + "step": 9049 + }, + { + "epoch": 0.053822913693025026, + "grad_norm": 2.0237820148468018, + "learning_rate": 4.964353785731452e-05, + "loss": 5.3166, + "step": 9050 + }, + { + "epoch": 0.05382886097630602, + "grad_norm": 2.145364999771118, + "learning_rate": 4.964345925590507e-05, + "loss": 5.3803, + "step": 9051 + }, + { + "epoch": 0.05383480825958702, + "grad_norm": 1.747369408607483, + "learning_rate": 4.964338064589284e-05, + "loss": 6.1041, + "step": 9052 + }, + { + "epoch": 0.05384075554286802, + "grad_norm": 1.9964301586151123, + "learning_rate": 4.964330202727786e-05, + "loss": 5.1707, + "step": 9053 + }, + { + "epoch": 0.053846702826149014, + "grad_norm": 1.630233645439148, + "learning_rate": 4.9643223400060155e-05, + "loss": 4.9385, + "step": 9054 + }, + { + "epoch": 0.053852650109430016, + "grad_norm": 1.5782960653305054, + "learning_rate": 4.9643144764239765e-05, + "loss": 4.9953, + "step": 9055 + }, + { + "epoch": 0.05385859739271101, + "grad_norm": 2.1511783599853516, + "learning_rate": 4.9643066119816706e-05, + "loss": 5.4329, + "step": 9056 + }, + { + "epoch": 0.053864544675992006, + "grad_norm": 2.2133493423461914, + "learning_rate": 4.9642987466791004e-05, + "loss": 5.7347, + "step": 9057 + }, + { + "epoch": 0.053870491959273, + "grad_norm": 1.7669782638549805, + "learning_rate": 4.9642908805162686e-05, + "loss": 5.4129, + "step": 9058 + }, + { + "epoch": 0.053876439242554, + "grad_norm": 1.8005794286727905, + "learning_rate": 4.9642830134931787e-05, + "loss": 5.2397, + "step": 9059 + }, + { + "epoch": 0.053882386525835, + "grad_norm": 1.697607398033142, + "learning_rate": 4.9642751456098325e-05, + "loss": 5.3388, + "step": 9060 + }, + { + "epoch": 0.05388833380911599, + "grad_norm": 1.4916869401931763, + "learning_rate": 4.9642672768662344e-05, + "loss": 5.2574, + "step": 9061 + }, + { + "epoch": 0.053894281092396995, + "grad_norm": 1.7112784385681152, + "learning_rate": 4.964259407262385e-05, + "loss": 4.9881, + "step": 9062 + }, + { + "epoch": 0.05390022837567799, + "grad_norm": 1.4831846952438354, + "learning_rate": 4.964251536798289e-05, + "loss": 5.3976, + "step": 9063 + }, + { + "epoch": 0.053906175658958985, + "grad_norm": 1.626370906829834, + "learning_rate": 4.9642436654739476e-05, + "loss": 5.2409, + "step": 9064 + }, + { + "epoch": 0.05391212294223999, + "grad_norm": 1.7369413375854492, + "learning_rate": 4.964235793289365e-05, + "loss": 5.2732, + "step": 9065 + }, + { + "epoch": 0.05391807022552098, + "grad_norm": 1.7028629779815674, + "learning_rate": 4.964227920244542e-05, + "loss": 5.3161, + "step": 9066 + }, + { + "epoch": 0.05392401750880198, + "grad_norm": 1.9031678438186646, + "learning_rate": 4.964220046339483e-05, + "loss": 5.2517, + "step": 9067 + }, + { + "epoch": 0.05392996479208298, + "grad_norm": 1.8210735321044922, + "learning_rate": 4.96421217157419e-05, + "loss": 5.2819, + "step": 9068 + }, + { + "epoch": 0.053935912075363975, + "grad_norm": 1.7334645986557007, + "learning_rate": 4.9642042959486666e-05, + "loss": 5.4296, + "step": 9069 + }, + { + "epoch": 0.05394185935864497, + "grad_norm": 1.732790231704712, + "learning_rate": 4.964196419462914e-05, + "loss": 5.3589, + "step": 9070 + }, + { + "epoch": 0.05394780664192597, + "grad_norm": 1.417751669883728, + "learning_rate": 4.964188542116937e-05, + "loss": 5.0958, + "step": 9071 + }, + { + "epoch": 0.05395375392520697, + "grad_norm": 1.8562361001968384, + "learning_rate": 4.964180663910737e-05, + "loss": 5.2622, + "step": 9072 + }, + { + "epoch": 0.05395970120848796, + "grad_norm": 1.7366154193878174, + "learning_rate": 4.9641727848443166e-05, + "loss": 5.2329, + "step": 9073 + }, + { + "epoch": 0.05396564849176896, + "grad_norm": 1.8587182760238647, + "learning_rate": 4.9641649049176785e-05, + "loss": 4.9392, + "step": 9074 + }, + { + "epoch": 0.05397159577504996, + "grad_norm": 1.6152398586273193, + "learning_rate": 4.964157024130827e-05, + "loss": 5.473, + "step": 9075 + }, + { + "epoch": 0.053977543058330954, + "grad_norm": 1.5967273712158203, + "learning_rate": 4.9641491424837626e-05, + "loss": 5.2877, + "step": 9076 + }, + { + "epoch": 0.05398349034161195, + "grad_norm": 1.4986391067504883, + "learning_rate": 4.96414125997649e-05, + "loss": 5.2163, + "step": 9077 + }, + { + "epoch": 0.05398943762489295, + "grad_norm": 1.563905119895935, + "learning_rate": 4.964133376609011e-05, + "loss": 5.2043, + "step": 9078 + }, + { + "epoch": 0.053995384908173946, + "grad_norm": 1.5690317153930664, + "learning_rate": 4.964125492381329e-05, + "loss": 5.2226, + "step": 9079 + }, + { + "epoch": 0.05400133219145494, + "grad_norm": 1.7732517719268799, + "learning_rate": 4.9641176072934446e-05, + "loss": 5.3123, + "step": 9080 + }, + { + "epoch": 0.05400727947473594, + "grad_norm": 1.7045226097106934, + "learning_rate": 4.964109721345364e-05, + "loss": 5.0872, + "step": 9081 + }, + { + "epoch": 0.05401322675801694, + "grad_norm": 1.6405664682388306, + "learning_rate": 4.964101834537087e-05, + "loss": 5.3863, + "step": 9082 + }, + { + "epoch": 0.054019174041297933, + "grad_norm": 1.7410979270935059, + "learning_rate": 4.964093946868618e-05, + "loss": 5.0952, + "step": 9083 + }, + { + "epoch": 0.054025121324578936, + "grad_norm": 2.0102951526641846, + "learning_rate": 4.964086058339959e-05, + "loss": 4.9484, + "step": 9084 + }, + { + "epoch": 0.05403106860785993, + "grad_norm": 1.8228510618209839, + "learning_rate": 4.9640781689511133e-05, + "loss": 5.1141, + "step": 9085 + }, + { + "epoch": 0.054037015891140926, + "grad_norm": 1.7363582849502563, + "learning_rate": 4.964070278702083e-05, + "loss": 5.1164, + "step": 9086 + }, + { + "epoch": 0.05404296317442192, + "grad_norm": 1.6060153245925903, + "learning_rate": 4.9640623875928714e-05, + "loss": 5.1746, + "step": 9087 + }, + { + "epoch": 0.05404891045770292, + "grad_norm": 1.6690374612808228, + "learning_rate": 4.9640544956234814e-05, + "loss": 5.0931, + "step": 9088 + }, + { + "epoch": 0.05405485774098392, + "grad_norm": 1.613527774810791, + "learning_rate": 4.964046602793916e-05, + "loss": 5.2224, + "step": 9089 + }, + { + "epoch": 0.05406080502426491, + "grad_norm": 1.6461642980575562, + "learning_rate": 4.964038709104176e-05, + "loss": 5.3175, + "step": 9090 + }, + { + "epoch": 0.054066752307545915, + "grad_norm": 1.839709758758545, + "learning_rate": 4.9640308145542664e-05, + "loss": 5.3247, + "step": 9091 + }, + { + "epoch": 0.05407269959082691, + "grad_norm": 1.8977348804473877, + "learning_rate": 4.9640229191441886e-05, + "loss": 5.4256, + "step": 9092 + }, + { + "epoch": 0.054078646874107905, + "grad_norm": 1.9805532693862915, + "learning_rate": 4.9640150228739454e-05, + "loss": 4.9413, + "step": 9093 + }, + { + "epoch": 0.05408459415738891, + "grad_norm": 2.0237114429473877, + "learning_rate": 4.964007125743542e-05, + "loss": 4.8808, + "step": 9094 + }, + { + "epoch": 0.0540905414406699, + "grad_norm": 1.9848511219024658, + "learning_rate": 4.963999227752977e-05, + "loss": 5.0295, + "step": 9095 + }, + { + "epoch": 0.0540964887239509, + "grad_norm": 1.925876498222351, + "learning_rate": 4.9639913289022564e-05, + "loss": 5.0129, + "step": 9096 + }, + { + "epoch": 0.0541024360072319, + "grad_norm": 1.4887725114822388, + "learning_rate": 4.963983429191382e-05, + "loss": 4.9706, + "step": 9097 + }, + { + "epoch": 0.054108383290512894, + "grad_norm": 1.615160584449768, + "learning_rate": 4.963975528620356e-05, + "loss": 5.0066, + "step": 9098 + }, + { + "epoch": 0.05411433057379389, + "grad_norm": 1.969086766242981, + "learning_rate": 4.9639676271891816e-05, + "loss": 4.9539, + "step": 9099 + }, + { + "epoch": 0.05412027785707489, + "grad_norm": 1.8290555477142334, + "learning_rate": 4.963959724897862e-05, + "loss": 5.2467, + "step": 9100 + }, + { + "epoch": 0.05412622514035589, + "grad_norm": 2.004157066345215, + "learning_rate": 4.963951821746399e-05, + "loss": 4.8, + "step": 9101 + }, + { + "epoch": 0.05413217242363688, + "grad_norm": 1.9732778072357178, + "learning_rate": 4.9639439177347955e-05, + "loss": 4.8828, + "step": 9102 + }, + { + "epoch": 0.05413811970691788, + "grad_norm": 1.8653557300567627, + "learning_rate": 4.963936012863056e-05, + "loss": 5.0591, + "step": 9103 + }, + { + "epoch": 0.05414406699019888, + "grad_norm": 1.7854375839233398, + "learning_rate": 4.9639281071311804e-05, + "loss": 5.0914, + "step": 9104 + }, + { + "epoch": 0.054150014273479874, + "grad_norm": 1.7956377267837524, + "learning_rate": 4.963920200539174e-05, + "loss": 5.3484, + "step": 9105 + }, + { + "epoch": 0.05415596155676087, + "grad_norm": 1.7851346731185913, + "learning_rate": 4.963912293087039e-05, + "loss": 5.3146, + "step": 9106 + }, + { + "epoch": 0.05416190884004187, + "grad_norm": 1.72859787940979, + "learning_rate": 4.9639043847747756e-05, + "loss": 5.1611, + "step": 9107 + }, + { + "epoch": 0.054167856123322866, + "grad_norm": 1.5961265563964844, + "learning_rate": 4.9638964756023904e-05, + "loss": 5.247, + "step": 9108 + }, + { + "epoch": 0.05417380340660386, + "grad_norm": 1.7507922649383545, + "learning_rate": 4.963888565569884e-05, + "loss": 5.2011, + "step": 9109 + }, + { + "epoch": 0.05417975068988486, + "grad_norm": 1.8338440656661987, + "learning_rate": 4.9638806546772594e-05, + "loss": 5.2413, + "step": 9110 + }, + { + "epoch": 0.05418569797316586, + "grad_norm": 1.8935306072235107, + "learning_rate": 4.963872742924519e-05, + "loss": 5.1042, + "step": 9111 + }, + { + "epoch": 0.05419164525644685, + "grad_norm": 1.6512808799743652, + "learning_rate": 4.963864830311667e-05, + "loss": 5.2437, + "step": 9112 + }, + { + "epoch": 0.054197592539727855, + "grad_norm": 1.6099332571029663, + "learning_rate": 4.963856916838705e-05, + "loss": 5.2828, + "step": 9113 + }, + { + "epoch": 0.05420353982300885, + "grad_norm": 2.114581823348999, + "learning_rate": 4.9638490025056355e-05, + "loss": 6.1534, + "step": 9114 + }, + { + "epoch": 0.054209487106289846, + "grad_norm": 1.762335181236267, + "learning_rate": 4.963841087312462e-05, + "loss": 5.1504, + "step": 9115 + }, + { + "epoch": 0.05421543438957084, + "grad_norm": 1.7669222354888916, + "learning_rate": 4.963833171259187e-05, + "loss": 5.0365, + "step": 9116 + }, + { + "epoch": 0.05422138167285184, + "grad_norm": 1.7319819927215576, + "learning_rate": 4.963825254345814e-05, + "loss": 5.0724, + "step": 9117 + }, + { + "epoch": 0.05422732895613284, + "grad_norm": 1.618116021156311, + "learning_rate": 4.9638173365723444e-05, + "loss": 5.0964, + "step": 9118 + }, + { + "epoch": 0.05423327623941383, + "grad_norm": 1.6506006717681885, + "learning_rate": 4.9638094179387814e-05, + "loss": 5.1189, + "step": 9119 + }, + { + "epoch": 0.054239223522694835, + "grad_norm": 1.7512328624725342, + "learning_rate": 4.963801498445129e-05, + "loss": 5.2732, + "step": 9120 + }, + { + "epoch": 0.05424517080597583, + "grad_norm": 1.5639985799789429, + "learning_rate": 4.963793578091388e-05, + "loss": 5.0718, + "step": 9121 + }, + { + "epoch": 0.054251118089256825, + "grad_norm": 1.7059093713760376, + "learning_rate": 4.963785656877562e-05, + "loss": 5.0744, + "step": 9122 + }, + { + "epoch": 0.05425706537253783, + "grad_norm": 1.574802279472351, + "learning_rate": 4.9637777348036546e-05, + "loss": 5.2663, + "step": 9123 + }, + { + "epoch": 0.05426301265581882, + "grad_norm": 1.7343204021453857, + "learning_rate": 4.9637698118696674e-05, + "loss": 5.0805, + "step": 9124 + }, + { + "epoch": 0.05426895993909982, + "grad_norm": 1.6154165267944336, + "learning_rate": 4.963761888075604e-05, + "loss": 5.1402, + "step": 9125 + }, + { + "epoch": 0.05427490722238082, + "grad_norm": 1.6474148035049438, + "learning_rate": 4.9637539634214666e-05, + "loss": 5.0601, + "step": 9126 + }, + { + "epoch": 0.054280854505661814, + "grad_norm": 1.7573519945144653, + "learning_rate": 4.963746037907258e-05, + "loss": 5.1846, + "step": 9127 + }, + { + "epoch": 0.05428680178894281, + "grad_norm": 1.4558652639389038, + "learning_rate": 4.963738111532981e-05, + "loss": 5.3132, + "step": 9128 + }, + { + "epoch": 0.05429274907222381, + "grad_norm": 1.6261000633239746, + "learning_rate": 4.963730184298639e-05, + "loss": 5.2843, + "step": 9129 + }, + { + "epoch": 0.05429869635550481, + "grad_norm": 1.4502191543579102, + "learning_rate": 4.963722256204234e-05, + "loss": 5.14, + "step": 9130 + }, + { + "epoch": 0.0543046436387858, + "grad_norm": 1.6366747617721558, + "learning_rate": 4.9637143272497686e-05, + "loss": 5.1496, + "step": 9131 + }, + { + "epoch": 0.0543105909220668, + "grad_norm": 1.603745698928833, + "learning_rate": 4.963706397435246e-05, + "loss": 5.0644, + "step": 9132 + }, + { + "epoch": 0.0543165382053478, + "grad_norm": 1.419536828994751, + "learning_rate": 4.963698466760669e-05, + "loss": 5.3182, + "step": 9133 + }, + { + "epoch": 0.054322485488628794, + "grad_norm": 1.511765480041504, + "learning_rate": 4.963690535226041e-05, + "loss": 5.2808, + "step": 9134 + }, + { + "epoch": 0.05432843277190979, + "grad_norm": 1.4999688863754272, + "learning_rate": 4.963682602831364e-05, + "loss": 4.9235, + "step": 9135 + }, + { + "epoch": 0.05433438005519079, + "grad_norm": 1.5918420553207397, + "learning_rate": 4.96367466957664e-05, + "loss": 4.9293, + "step": 9136 + }, + { + "epoch": 0.054340327338471786, + "grad_norm": 1.502748727798462, + "learning_rate": 4.963666735461874e-05, + "loss": 5.2692, + "step": 9137 + }, + { + "epoch": 0.05434627462175278, + "grad_norm": 1.6474169492721558, + "learning_rate": 4.963658800487066e-05, + "loss": 5.1638, + "step": 9138 + }, + { + "epoch": 0.05435222190503378, + "grad_norm": 2.0195884704589844, + "learning_rate": 4.9636508646522204e-05, + "loss": 5.1085, + "step": 9139 + }, + { + "epoch": 0.05435816918831478, + "grad_norm": 1.7266180515289307, + "learning_rate": 4.9636429279573406e-05, + "loss": 5.0747, + "step": 9140 + }, + { + "epoch": 0.05436411647159577, + "grad_norm": 1.6965065002441406, + "learning_rate": 4.963634990402428e-05, + "loss": 5.1246, + "step": 9141 + }, + { + "epoch": 0.054370063754876775, + "grad_norm": 1.7629759311676025, + "learning_rate": 4.9636270519874856e-05, + "loss": 5.274, + "step": 9142 + }, + { + "epoch": 0.05437601103815777, + "grad_norm": 1.6365042924880981, + "learning_rate": 4.9636191127125164e-05, + "loss": 5.2469, + "step": 9143 + }, + { + "epoch": 0.054381958321438766, + "grad_norm": 1.6777831315994263, + "learning_rate": 4.9636111725775235e-05, + "loss": 5.3041, + "step": 9144 + }, + { + "epoch": 0.05438790560471976, + "grad_norm": 1.5354039669036865, + "learning_rate": 4.9636032315825096e-05, + "loss": 5.1799, + "step": 9145 + }, + { + "epoch": 0.05439385288800076, + "grad_norm": 1.508083701133728, + "learning_rate": 4.9635952897274773e-05, + "loss": 5.0822, + "step": 9146 + }, + { + "epoch": 0.05439980017128176, + "grad_norm": 1.5960441827774048, + "learning_rate": 4.963587347012429e-05, + "loss": 5.1618, + "step": 9147 + }, + { + "epoch": 0.05440574745456275, + "grad_norm": 1.4927520751953125, + "learning_rate": 4.9635794034373675e-05, + "loss": 5.1464, + "step": 9148 + }, + { + "epoch": 0.054411694737843755, + "grad_norm": 1.7420401573181152, + "learning_rate": 4.9635714590022966e-05, + "loss": 5.2866, + "step": 9149 + }, + { + "epoch": 0.05441764202112475, + "grad_norm": 1.7907800674438477, + "learning_rate": 4.9635635137072176e-05, + "loss": 5.1042, + "step": 9150 + }, + { + "epoch": 0.054423589304405745, + "grad_norm": 1.7073547840118408, + "learning_rate": 4.963555567552135e-05, + "loss": 5.1986, + "step": 9151 + }, + { + "epoch": 0.05442953658768675, + "grad_norm": 1.894405484199524, + "learning_rate": 4.96354762053705e-05, + "loss": 5.225, + "step": 9152 + }, + { + "epoch": 0.05443548387096774, + "grad_norm": 1.5830878019332886, + "learning_rate": 4.9635396726619656e-05, + "loss": 5.2902, + "step": 9153 + }, + { + "epoch": 0.05444143115424874, + "grad_norm": 1.5435214042663574, + "learning_rate": 4.963531723926885e-05, + "loss": 5.0773, + "step": 9154 + }, + { + "epoch": 0.05444737843752974, + "grad_norm": 1.4262596368789673, + "learning_rate": 4.9635237743318117e-05, + "loss": 5.129, + "step": 9155 + }, + { + "epoch": 0.054453325720810734, + "grad_norm": 1.5793390274047852, + "learning_rate": 4.9635158238767475e-05, + "loss": 5.1693, + "step": 9156 + }, + { + "epoch": 0.05445927300409173, + "grad_norm": 1.767318606376648, + "learning_rate": 4.963507872561695e-05, + "loss": 5.2541, + "step": 9157 + }, + { + "epoch": 0.05446522028737273, + "grad_norm": 1.5084065198898315, + "learning_rate": 4.963499920386658e-05, + "loss": 5.2531, + "step": 9158 + }, + { + "epoch": 0.05447116757065373, + "grad_norm": 1.797877311706543, + "learning_rate": 4.963491967351638e-05, + "loss": 5.2278, + "step": 9159 + }, + { + "epoch": 0.05447711485393472, + "grad_norm": 1.7463361024856567, + "learning_rate": 4.963484013456639e-05, + "loss": 5.1005, + "step": 9160 + }, + { + "epoch": 0.05448306213721572, + "grad_norm": 1.8208277225494385, + "learning_rate": 4.9634760587016626e-05, + "loss": 5.1437, + "step": 9161 + }, + { + "epoch": 0.05448900942049672, + "grad_norm": 1.9020015001296997, + "learning_rate": 4.9634681030867116e-05, + "loss": 5.1554, + "step": 9162 + }, + { + "epoch": 0.054494956703777714, + "grad_norm": 1.8370200395584106, + "learning_rate": 4.9634601466117904e-05, + "loss": 5.2418, + "step": 9163 + }, + { + "epoch": 0.05450090398705871, + "grad_norm": 1.785875678062439, + "learning_rate": 4.9634521892769004e-05, + "loss": 5.1916, + "step": 9164 + }, + { + "epoch": 0.05450685127033971, + "grad_norm": 1.7501643896102905, + "learning_rate": 4.963444231082045e-05, + "loss": 5.0887, + "step": 9165 + }, + { + "epoch": 0.054512798553620706, + "grad_norm": 1.6924220323562622, + "learning_rate": 4.963436272027227e-05, + "loss": 5.2458, + "step": 9166 + }, + { + "epoch": 0.0545187458369017, + "grad_norm": 1.895605206489563, + "learning_rate": 4.963428312112447e-05, + "loss": 5.1286, + "step": 9167 + }, + { + "epoch": 0.0545246931201827, + "grad_norm": 1.842207908630371, + "learning_rate": 4.963420351337711e-05, + "loss": 5.1177, + "step": 9168 + }, + { + "epoch": 0.0545306404034637, + "grad_norm": 1.7467048168182373, + "learning_rate": 4.963412389703021e-05, + "loss": 5.1616, + "step": 9169 + }, + { + "epoch": 0.05453658768674469, + "grad_norm": 1.8047499656677246, + "learning_rate": 4.963404427208378e-05, + "loss": 5.0543, + "step": 9170 + }, + { + "epoch": 0.054542534970025695, + "grad_norm": 1.5830637216567993, + "learning_rate": 4.963396463853786e-05, + "loss": 5.0989, + "step": 9171 + }, + { + "epoch": 0.05454848225330669, + "grad_norm": 1.7481937408447266, + "learning_rate": 4.9633884996392485e-05, + "loss": 5.1686, + "step": 9172 + }, + { + "epoch": 0.054554429536587686, + "grad_norm": 1.7132925987243652, + "learning_rate": 4.9633805345647664e-05, + "loss": 4.9683, + "step": 9173 + }, + { + "epoch": 0.05456037681986868, + "grad_norm": 1.8369117975234985, + "learning_rate": 4.9633725686303445e-05, + "loss": 5.154, + "step": 9174 + }, + { + "epoch": 0.05456632410314968, + "grad_norm": 1.615011215209961, + "learning_rate": 4.963364601835985e-05, + "loss": 5.0982, + "step": 9175 + }, + { + "epoch": 0.05457227138643068, + "grad_norm": 1.853742003440857, + "learning_rate": 4.963356634181689e-05, + "loss": 6.0599, + "step": 9176 + }, + { + "epoch": 0.05457821866971167, + "grad_norm": 1.5529752969741821, + "learning_rate": 4.963348665667462e-05, + "loss": 5.1355, + "step": 9177 + }, + { + "epoch": 0.054584165952992675, + "grad_norm": 1.5113881826400757, + "learning_rate": 4.963340696293305e-05, + "loss": 5.1947, + "step": 9178 + }, + { + "epoch": 0.05459011323627367, + "grad_norm": 1.6840931177139282, + "learning_rate": 4.963332726059221e-05, + "loss": 5.2163, + "step": 9179 + }, + { + "epoch": 0.054596060519554665, + "grad_norm": 1.7720422744750977, + "learning_rate": 4.963324754965214e-05, + "loss": 5.4737, + "step": 9180 + }, + { + "epoch": 0.05460200780283567, + "grad_norm": 1.632574200630188, + "learning_rate": 4.963316783011285e-05, + "loss": 5.2274, + "step": 9181 + }, + { + "epoch": 0.05460795508611666, + "grad_norm": 1.5859557390213013, + "learning_rate": 4.963308810197437e-05, + "loss": 5.3503, + "step": 9182 + }, + { + "epoch": 0.05461390236939766, + "grad_norm": 1.8342604637145996, + "learning_rate": 4.963300836523674e-05, + "loss": 5.1967, + "step": 9183 + }, + { + "epoch": 0.05461984965267866, + "grad_norm": 1.7443957328796387, + "learning_rate": 4.963292861989998e-05, + "loss": 5.0935, + "step": 9184 + }, + { + "epoch": 0.054625796935959654, + "grad_norm": 1.9289584159851074, + "learning_rate": 4.963284886596412e-05, + "loss": 5.1817, + "step": 9185 + }, + { + "epoch": 0.05463174421924065, + "grad_norm": 1.8695822954177856, + "learning_rate": 4.9632769103429186e-05, + "loss": 5.4304, + "step": 9186 + }, + { + "epoch": 0.05463769150252165, + "grad_norm": 1.6274856328964233, + "learning_rate": 4.9632689332295206e-05, + "loss": 5.3924, + "step": 9187 + }, + { + "epoch": 0.054643638785802646, + "grad_norm": 1.6061500310897827, + "learning_rate": 4.963260955256221e-05, + "loss": 5.2309, + "step": 9188 + }, + { + "epoch": 0.05464958606908364, + "grad_norm": 1.5478893518447876, + "learning_rate": 4.963252976423022e-05, + "loss": 5.2615, + "step": 9189 + }, + { + "epoch": 0.05465553335236464, + "grad_norm": 1.4304052591323853, + "learning_rate": 4.9632449967299276e-05, + "loss": 5.2116, + "step": 9190 + }, + { + "epoch": 0.05466148063564564, + "grad_norm": 1.5438693761825562, + "learning_rate": 4.9632370161769395e-05, + "loss": 5.1176, + "step": 9191 + }, + { + "epoch": 0.054667427918926634, + "grad_norm": 1.6602065563201904, + "learning_rate": 4.9632290347640606e-05, + "loss": 5.1521, + "step": 9192 + }, + { + "epoch": 0.05467337520220763, + "grad_norm": 1.530038595199585, + "learning_rate": 4.9632210524912934e-05, + "loss": 5.1437, + "step": 9193 + }, + { + "epoch": 0.05467932248548863, + "grad_norm": 1.617691159248352, + "learning_rate": 4.963213069358643e-05, + "loss": 5.0601, + "step": 9194 + }, + { + "epoch": 0.054685269768769626, + "grad_norm": 1.722401738166809, + "learning_rate": 4.963205085366108e-05, + "loss": 5.2664, + "step": 9195 + }, + { + "epoch": 0.05469121705205062, + "grad_norm": 1.803673267364502, + "learning_rate": 4.963197100513696e-05, + "loss": 5.4164, + "step": 9196 + }, + { + "epoch": 0.05469716433533162, + "grad_norm": 1.8565739393234253, + "learning_rate": 4.963189114801405e-05, + "loss": 5.225, + "step": 9197 + }, + { + "epoch": 0.05470311161861262, + "grad_norm": 1.780698299407959, + "learning_rate": 4.963181128229242e-05, + "loss": 5.1694, + "step": 9198 + }, + { + "epoch": 0.05470905890189361, + "grad_norm": 1.820416808128357, + "learning_rate": 4.963173140797207e-05, + "loss": 5.3305, + "step": 9199 + }, + { + "epoch": 0.054715006185174615, + "grad_norm": 1.471983551979065, + "learning_rate": 4.963165152505304e-05, + "loss": 5.3217, + "step": 9200 + }, + { + "epoch": 0.05472095346845561, + "grad_norm": 1.504616141319275, + "learning_rate": 4.9631571633535354e-05, + "loss": 5.3349, + "step": 9201 + }, + { + "epoch": 0.054726900751736605, + "grad_norm": 1.5888862609863281, + "learning_rate": 4.963149173341903e-05, + "loss": 5.3431, + "step": 9202 + }, + { + "epoch": 0.0547328480350176, + "grad_norm": 1.6633155345916748, + "learning_rate": 4.963141182470412e-05, + "loss": 5.2678, + "step": 9203 + }, + { + "epoch": 0.0547387953182986, + "grad_norm": 1.7259690761566162, + "learning_rate": 4.9631331907390636e-05, + "loss": 5.348, + "step": 9204 + }, + { + "epoch": 0.0547447426015796, + "grad_norm": 1.703925371170044, + "learning_rate": 4.963125198147861e-05, + "loss": 5.4123, + "step": 9205 + }, + { + "epoch": 0.05475068988486059, + "grad_norm": 1.6619760990142822, + "learning_rate": 4.963117204696807e-05, + "loss": 5.1732, + "step": 9206 + }, + { + "epoch": 0.054756637168141595, + "grad_norm": 1.7368190288543701, + "learning_rate": 4.963109210385903e-05, + "loss": 5.0843, + "step": 9207 + }, + { + "epoch": 0.05476258445142259, + "grad_norm": 1.781179666519165, + "learning_rate": 4.9631012152151545e-05, + "loss": 5.1343, + "step": 9208 + }, + { + "epoch": 0.054768531734703585, + "grad_norm": 1.674793004989624, + "learning_rate": 4.9630932191845624e-05, + "loss": 5.4079, + "step": 9209 + }, + { + "epoch": 0.05477447901798459, + "grad_norm": 1.7708344459533691, + "learning_rate": 4.9630852222941296e-05, + "loss": 4.9702, + "step": 9210 + }, + { + "epoch": 0.05478042630126558, + "grad_norm": 1.684725046157837, + "learning_rate": 4.9630772245438594e-05, + "loss": 5.263, + "step": 9211 + }, + { + "epoch": 0.05478637358454658, + "grad_norm": 1.6064784526824951, + "learning_rate": 4.963069225933754e-05, + "loss": 5.3402, + "step": 9212 + }, + { + "epoch": 0.05479232086782758, + "grad_norm": 1.5189318656921387, + "learning_rate": 4.963061226463816e-05, + "loss": 5.1928, + "step": 9213 + }, + { + "epoch": 0.054798268151108574, + "grad_norm": 1.8095827102661133, + "learning_rate": 4.96305322613405e-05, + "loss": 5.262, + "step": 9214 + }, + { + "epoch": 0.05480421543438957, + "grad_norm": 1.8325434923171997, + "learning_rate": 4.963045224944458e-05, + "loss": 5.4975, + "step": 9215 + }, + { + "epoch": 0.05481016271767057, + "grad_norm": 1.6597868204116821, + "learning_rate": 4.963037222895042e-05, + "loss": 5.6232, + "step": 9216 + }, + { + "epoch": 0.054816110000951566, + "grad_norm": 1.6402417421340942, + "learning_rate": 4.9630292199858044e-05, + "loss": 5.5358, + "step": 9217 + }, + { + "epoch": 0.05482205728423256, + "grad_norm": 1.3956371545791626, + "learning_rate": 4.963021216216749e-05, + "loss": 5.2563, + "step": 9218 + }, + { + "epoch": 0.05482800456751356, + "grad_norm": 1.5958374738693237, + "learning_rate": 4.963013211587878e-05, + "loss": 5.1539, + "step": 9219 + }, + { + "epoch": 0.05483395185079456, + "grad_norm": 1.6152080297470093, + "learning_rate": 4.963005206099195e-05, + "loss": 5.4025, + "step": 9220 + }, + { + "epoch": 0.054839899134075554, + "grad_norm": 1.392427921295166, + "learning_rate": 4.962997199750702e-05, + "loss": 5.4149, + "step": 9221 + }, + { + "epoch": 0.05484584641735655, + "grad_norm": 1.5625338554382324, + "learning_rate": 4.962989192542403e-05, + "loss": 5.5837, + "step": 9222 + }, + { + "epoch": 0.05485179370063755, + "grad_norm": 1.6465163230895996, + "learning_rate": 4.962981184474299e-05, + "loss": 5.2934, + "step": 9223 + }, + { + "epoch": 0.054857740983918546, + "grad_norm": 1.5344611406326294, + "learning_rate": 4.962973175546394e-05, + "loss": 5.4734, + "step": 9224 + }, + { + "epoch": 0.05486368826719954, + "grad_norm": 1.2378648519515991, + "learning_rate": 4.962965165758691e-05, + "loss": 5.3368, + "step": 9225 + }, + { + "epoch": 0.05486963555048054, + "grad_norm": 1.396785020828247, + "learning_rate": 4.9629571551111915e-05, + "loss": 5.3163, + "step": 9226 + }, + { + "epoch": 0.05487558283376154, + "grad_norm": 1.639452338218689, + "learning_rate": 4.9629491436038994e-05, + "loss": 5.3933, + "step": 9227 + }, + { + "epoch": 0.05488153011704253, + "grad_norm": 1.5648834705352783, + "learning_rate": 4.9629411312368166e-05, + "loss": 5.3717, + "step": 9228 + }, + { + "epoch": 0.054887477400323535, + "grad_norm": 1.4774922132492065, + "learning_rate": 4.962933118009947e-05, + "loss": 5.1318, + "step": 9229 + }, + { + "epoch": 0.05489342468360453, + "grad_norm": 1.4987083673477173, + "learning_rate": 4.9629251039232935e-05, + "loss": 5.1436, + "step": 9230 + }, + { + "epoch": 0.054899371966885525, + "grad_norm": 1.660605788230896, + "learning_rate": 4.9629170889768586e-05, + "loss": 5.1841, + "step": 9231 + }, + { + "epoch": 0.05490531925016652, + "grad_norm": 1.4441273212432861, + "learning_rate": 4.962909073170643e-05, + "loss": 5.3108, + "step": 9232 + }, + { + "epoch": 0.05491126653344752, + "grad_norm": 1.3297922611236572, + "learning_rate": 4.962901056504653e-05, + "loss": 5.1441, + "step": 9233 + }, + { + "epoch": 0.05491721381672852, + "grad_norm": 1.2989814281463623, + "learning_rate": 4.9628930389788886e-05, + "loss": 5.5146, + "step": 9234 + }, + { + "epoch": 0.05492316110000951, + "grad_norm": 1.350948452949524, + "learning_rate": 4.962885020593354e-05, + "loss": 5.2832, + "step": 9235 + }, + { + "epoch": 0.054929108383290515, + "grad_norm": 1.5801438093185425, + "learning_rate": 4.962877001348052e-05, + "loss": 5.4251, + "step": 9236 + }, + { + "epoch": 0.05493505566657151, + "grad_norm": 1.4355653524398804, + "learning_rate": 4.9628689812429854e-05, + "loss": 5.4092, + "step": 9237 + }, + { + "epoch": 0.054941002949852505, + "grad_norm": 1.692746639251709, + "learning_rate": 4.962860960278156e-05, + "loss": 5.3858, + "step": 9238 + }, + { + "epoch": 0.05494695023313351, + "grad_norm": 1.5125641822814941, + "learning_rate": 4.962852938453567e-05, + "loss": 5.6584, + "step": 9239 + }, + { + "epoch": 0.0549528975164145, + "grad_norm": 1.4158848524093628, + "learning_rate": 4.962844915769221e-05, + "loss": 5.652, + "step": 9240 + }, + { + "epoch": 0.0549588447996955, + "grad_norm": 1.314286231994629, + "learning_rate": 4.9628368922251235e-05, + "loss": 5.501, + "step": 9241 + }, + { + "epoch": 0.0549647920829765, + "grad_norm": 1.4003247022628784, + "learning_rate": 4.962828867821273e-05, + "loss": 5.448, + "step": 9242 + }, + { + "epoch": 0.054970739366257494, + "grad_norm": 1.7670220136642456, + "learning_rate": 4.962820842557675e-05, + "loss": 5.4854, + "step": 9243 + }, + { + "epoch": 0.05497668664953849, + "grad_norm": 1.9435075521469116, + "learning_rate": 4.962812816434332e-05, + "loss": 5.3824, + "step": 9244 + }, + { + "epoch": 0.05498263393281949, + "grad_norm": 2.1733458042144775, + "learning_rate": 4.9628047894512466e-05, + "loss": 5.6771, + "step": 9245 + }, + { + "epoch": 0.054988581216100486, + "grad_norm": 1.5455420017242432, + "learning_rate": 4.962796761608421e-05, + "loss": 5.4634, + "step": 9246 + }, + { + "epoch": 0.05499452849938148, + "grad_norm": 1.623382806777954, + "learning_rate": 4.962788732905859e-05, + "loss": 5.8441, + "step": 9247 + }, + { + "epoch": 0.05500047578266248, + "grad_norm": 1.928788423538208, + "learning_rate": 4.962780703343563e-05, + "loss": 5.6553, + "step": 9248 + }, + { + "epoch": 0.05500642306594348, + "grad_norm": 1.660984992980957, + "learning_rate": 4.962772672921535e-05, + "loss": 5.5953, + "step": 9249 + }, + { + "epoch": 0.055012370349224474, + "grad_norm": 2.081026792526245, + "learning_rate": 4.962764641639779e-05, + "loss": 5.7065, + "step": 9250 + }, + { + "epoch": 0.05501831763250547, + "grad_norm": 1.8750234842300415, + "learning_rate": 4.962756609498297e-05, + "loss": 5.8814, + "step": 9251 + }, + { + "epoch": 0.05502426491578647, + "grad_norm": 1.9573127031326294, + "learning_rate": 4.9627485764970916e-05, + "loss": 5.7415, + "step": 9252 + }, + { + "epoch": 0.055030212199067466, + "grad_norm": 1.7536600828170776, + "learning_rate": 4.962740542636167e-05, + "loss": 5.5638, + "step": 9253 + }, + { + "epoch": 0.05503615948234846, + "grad_norm": 1.692557692527771, + "learning_rate": 4.962732507915525e-05, + "loss": 5.5362, + "step": 9254 + }, + { + "epoch": 0.05504210676562946, + "grad_norm": 1.9066821336746216, + "learning_rate": 4.962724472335168e-05, + "loss": 5.3094, + "step": 9255 + }, + { + "epoch": 0.05504805404891046, + "grad_norm": 2.069007158279419, + "learning_rate": 4.9627164358951e-05, + "loss": 5.766, + "step": 9256 + }, + { + "epoch": 0.05505400133219145, + "grad_norm": 2.0293545722961426, + "learning_rate": 4.9627083985953227e-05, + "loss": 5.7769, + "step": 9257 + }, + { + "epoch": 0.055059948615472455, + "grad_norm": 1.7953507900238037, + "learning_rate": 4.962700360435839e-05, + "loss": 5.8435, + "step": 9258 + }, + { + "epoch": 0.05506589589875345, + "grad_norm": 1.9281821250915527, + "learning_rate": 4.9626923214166535e-05, + "loss": 5.8342, + "step": 9259 + }, + { + "epoch": 0.055071843182034445, + "grad_norm": 1.4612617492675781, + "learning_rate": 4.962684281537766e-05, + "loss": 5.8273, + "step": 9260 + }, + { + "epoch": 0.05507779046531545, + "grad_norm": 1.8589900732040405, + "learning_rate": 4.9626762407991817e-05, + "loss": 5.7607, + "step": 9261 + }, + { + "epoch": 0.05508373774859644, + "grad_norm": 1.9395030736923218, + "learning_rate": 4.9626681992009025e-05, + "loss": 5.7573, + "step": 9262 + }, + { + "epoch": 0.05508968503187744, + "grad_norm": 1.7344708442687988, + "learning_rate": 4.962660156742931e-05, + "loss": 5.7999, + "step": 9263 + }, + { + "epoch": 0.05509563231515843, + "grad_norm": 1.7719827890396118, + "learning_rate": 4.9626521134252704e-05, + "loss": 5.7882, + "step": 9264 + }, + { + "epoch": 0.055101579598439435, + "grad_norm": 1.4955536127090454, + "learning_rate": 4.9626440692479236e-05, + "loss": 5.639, + "step": 9265 + }, + { + "epoch": 0.05510752688172043, + "grad_norm": 2.0087990760803223, + "learning_rate": 4.9626360242108925e-05, + "loss": 5.841, + "step": 9266 + }, + { + "epoch": 0.055113474165001425, + "grad_norm": 1.7334564924240112, + "learning_rate": 4.962627978314181e-05, + "loss": 5.4267, + "step": 9267 + }, + { + "epoch": 0.05511942144828243, + "grad_norm": 2.1204535961151123, + "learning_rate": 4.962619931557792e-05, + "loss": 5.4451, + "step": 9268 + }, + { + "epoch": 0.05512536873156342, + "grad_norm": 2.2374279499053955, + "learning_rate": 4.962611883941727e-05, + "loss": 5.5095, + "step": 9269 + }, + { + "epoch": 0.05513131601484442, + "grad_norm": 1.735070824623108, + "learning_rate": 4.9626038354659904e-05, + "loss": 5.3609, + "step": 9270 + }, + { + "epoch": 0.05513726329812542, + "grad_norm": 1.9748501777648926, + "learning_rate": 4.9625957861305837e-05, + "loss": 5.3366, + "step": 9271 + }, + { + "epoch": 0.055143210581406414, + "grad_norm": 1.8736618757247925, + "learning_rate": 4.96258773593551e-05, + "loss": 5.4706, + "step": 9272 + }, + { + "epoch": 0.05514915786468741, + "grad_norm": 2.571755886077881, + "learning_rate": 4.9625796848807736e-05, + "loss": 5.0393, + "step": 9273 + }, + { + "epoch": 0.05515510514796841, + "grad_norm": 2.1467013359069824, + "learning_rate": 4.962571632966375e-05, + "loss": 5.5798, + "step": 9274 + }, + { + "epoch": 0.055161052431249406, + "grad_norm": 2.4553916454315186, + "learning_rate": 4.962563580192319e-05, + "loss": 5.4323, + "step": 9275 + }, + { + "epoch": 0.0551669997145304, + "grad_norm": 2.4478797912597656, + "learning_rate": 4.962555526558607e-05, + "loss": 5.2591, + "step": 9276 + }, + { + "epoch": 0.055172946997811396, + "grad_norm": 2.2164270877838135, + "learning_rate": 4.9625474720652416e-05, + "loss": 5.3404, + "step": 9277 + }, + { + "epoch": 0.0551788942810924, + "grad_norm": 1.9161698818206787, + "learning_rate": 4.962539416712227e-05, + "loss": 5.2591, + "step": 9278 + }, + { + "epoch": 0.055184841564373394, + "grad_norm": 2.348734140396118, + "learning_rate": 4.962531360499565e-05, + "loss": 5.8132, + "step": 9279 + }, + { + "epoch": 0.05519078884765439, + "grad_norm": 2.400090456008911, + "learning_rate": 4.962523303427259e-05, + "loss": 5.7786, + "step": 9280 + }, + { + "epoch": 0.05519673613093539, + "grad_norm": 2.1626594066619873, + "learning_rate": 4.9625152454953115e-05, + "loss": 5.8488, + "step": 9281 + }, + { + "epoch": 0.055202683414216386, + "grad_norm": 1.7470853328704834, + "learning_rate": 4.962507186703725e-05, + "loss": 5.72, + "step": 9282 + }, + { + "epoch": 0.05520863069749738, + "grad_norm": 1.9191921949386597, + "learning_rate": 4.962499127052503e-05, + "loss": 5.6321, + "step": 9283 + }, + { + "epoch": 0.05521457798077838, + "grad_norm": 2.1550769805908203, + "learning_rate": 4.962491066541649e-05, + "loss": 5.4521, + "step": 9284 + }, + { + "epoch": 0.05522052526405938, + "grad_norm": 2.0529074668884277, + "learning_rate": 4.9624830051711634e-05, + "loss": 5.4108, + "step": 9285 + }, + { + "epoch": 0.05522647254734037, + "grad_norm": 1.7673834562301636, + "learning_rate": 4.962474942941051e-05, + "loss": 5.5955, + "step": 9286 + }, + { + "epoch": 0.055232419830621375, + "grad_norm": 1.9575849771499634, + "learning_rate": 4.9624668798513143e-05, + "loss": 5.6295, + "step": 9287 + }, + { + "epoch": 0.05523836711390237, + "grad_norm": 1.8054029941558838, + "learning_rate": 4.9624588159019546e-05, + "loss": 5.3372, + "step": 9288 + }, + { + "epoch": 0.055244314397183365, + "grad_norm": 1.8002424240112305, + "learning_rate": 4.962450751092978e-05, + "loss": 5.4404, + "step": 9289 + }, + { + "epoch": 0.05525026168046437, + "grad_norm": 2.052530527114868, + "learning_rate": 4.962442685424383e-05, + "loss": 5.4921, + "step": 9290 + }, + { + "epoch": 0.05525620896374536, + "grad_norm": 1.8559443950653076, + "learning_rate": 4.962434618896176e-05, + "loss": 5.5776, + "step": 9291 + }, + { + "epoch": 0.05526215624702636, + "grad_norm": 1.8794355392456055, + "learning_rate": 4.962426551508359e-05, + "loss": 5.5818, + "step": 9292 + }, + { + "epoch": 0.05526810353030735, + "grad_norm": 1.8995412588119507, + "learning_rate": 4.962418483260933e-05, + "loss": 5.6274, + "step": 9293 + }, + { + "epoch": 0.055274050813588355, + "grad_norm": 1.8608371019363403, + "learning_rate": 4.962410414153903e-05, + "loss": 5.4655, + "step": 9294 + }, + { + "epoch": 0.05527999809686935, + "grad_norm": 2.0378072261810303, + "learning_rate": 4.9624023441872715e-05, + "loss": 5.5579, + "step": 9295 + }, + { + "epoch": 0.055285945380150345, + "grad_norm": 2.2037017345428467, + "learning_rate": 4.9623942733610397e-05, + "loss": 5.6663, + "step": 9296 + }, + { + "epoch": 0.05529189266343135, + "grad_norm": 2.4487335681915283, + "learning_rate": 4.962386201675212e-05, + "loss": 5.6792, + "step": 9297 + }, + { + "epoch": 0.05529783994671234, + "grad_norm": 2.0460383892059326, + "learning_rate": 4.96237812912979e-05, + "loss": 5.917, + "step": 9298 + }, + { + "epoch": 0.05530378722999334, + "grad_norm": 2.4838030338287354, + "learning_rate": 4.962370055724778e-05, + "loss": 5.1067, + "step": 9299 + }, + { + "epoch": 0.05530973451327434, + "grad_norm": 1.9340513944625854, + "learning_rate": 4.962361981460178e-05, + "loss": 5.2529, + "step": 9300 + }, + { + "epoch": 0.055315681796555334, + "grad_norm": 2.201068878173828, + "learning_rate": 4.9623539063359925e-05, + "loss": 5.6055, + "step": 9301 + }, + { + "epoch": 0.05532162907983633, + "grad_norm": 2.0552330017089844, + "learning_rate": 4.962345830352225e-05, + "loss": 5.3531, + "step": 9302 + }, + { + "epoch": 0.05532757636311733, + "grad_norm": 2.611407995223999, + "learning_rate": 4.9623377535088785e-05, + "loss": 5.5829, + "step": 9303 + }, + { + "epoch": 0.055333523646398326, + "grad_norm": 2.2239346504211426, + "learning_rate": 4.962329675805955e-05, + "loss": 5.3558, + "step": 9304 + }, + { + "epoch": 0.05533947092967932, + "grad_norm": 2.3899872303009033, + "learning_rate": 4.9623215972434566e-05, + "loss": 5.7277, + "step": 9305 + }, + { + "epoch": 0.055345418212960316, + "grad_norm": 2.8471267223358154, + "learning_rate": 4.962313517821389e-05, + "loss": 6.1046, + "step": 9306 + }, + { + "epoch": 0.05535136549624132, + "grad_norm": 2.426400661468506, + "learning_rate": 4.962305437539752e-05, + "loss": 5.8942, + "step": 9307 + }, + { + "epoch": 0.055357312779522314, + "grad_norm": 2.3548812866210938, + "learning_rate": 4.962297356398549e-05, + "loss": 6.0552, + "step": 9308 + }, + { + "epoch": 0.05536326006280331, + "grad_norm": 1.8423515558242798, + "learning_rate": 4.9622892743977844e-05, + "loss": 5.9377, + "step": 9309 + }, + { + "epoch": 0.05536920734608431, + "grad_norm": 2.1509203910827637, + "learning_rate": 4.96228119153746e-05, + "loss": 5.7195, + "step": 9310 + }, + { + "epoch": 0.055375154629365306, + "grad_norm": 2.3096275329589844, + "learning_rate": 4.962273107817579e-05, + "loss": 5.3461, + "step": 9311 + }, + { + "epoch": 0.0553811019126463, + "grad_norm": 1.980205774307251, + "learning_rate": 4.962265023238143e-05, + "loss": 5.8851, + "step": 9312 + }, + { + "epoch": 0.0553870491959273, + "grad_norm": 1.8162591457366943, + "learning_rate": 4.962256937799156e-05, + "loss": 5.7092, + "step": 9313 + }, + { + "epoch": 0.0553929964792083, + "grad_norm": 1.873853087425232, + "learning_rate": 4.962248851500621e-05, + "loss": 5.8939, + "step": 9314 + }, + { + "epoch": 0.05539894376248929, + "grad_norm": 1.8039345741271973, + "learning_rate": 4.96224076434254e-05, + "loss": 5.9289, + "step": 9315 + }, + { + "epoch": 0.055404891045770295, + "grad_norm": 2.3106470108032227, + "learning_rate": 4.962232676324916e-05, + "loss": 5.9103, + "step": 9316 + }, + { + "epoch": 0.05541083832905129, + "grad_norm": 2.2209455966949463, + "learning_rate": 4.962224587447752e-05, + "loss": 6.0053, + "step": 9317 + }, + { + "epoch": 0.055416785612332285, + "grad_norm": 2.0624780654907227, + "learning_rate": 4.962216497711052e-05, + "loss": 5.9258, + "step": 9318 + }, + { + "epoch": 0.05542273289561329, + "grad_norm": 2.371662139892578, + "learning_rate": 4.962208407114817e-05, + "loss": 6.4127, + "step": 9319 + }, + { + "epoch": 0.05542868017889428, + "grad_norm": 2.7035610675811768, + "learning_rate": 4.96220031565905e-05, + "loss": 5.9742, + "step": 9320 + }, + { + "epoch": 0.05543462746217528, + "grad_norm": 2.060577392578125, + "learning_rate": 4.9621922233437544e-05, + "loss": 5.9729, + "step": 9321 + }, + { + "epoch": 0.05544057474545627, + "grad_norm": 1.7935984134674072, + "learning_rate": 4.962184130168933e-05, + "loss": 5.4077, + "step": 9322 + }, + { + "epoch": 0.055446522028737275, + "grad_norm": 1.8716622591018677, + "learning_rate": 4.9621760361345885e-05, + "loss": 5.4554, + "step": 9323 + }, + { + "epoch": 0.05545246931201827, + "grad_norm": 1.9150923490524292, + "learning_rate": 4.962167941240724e-05, + "loss": 5.8121, + "step": 9324 + }, + { + "epoch": 0.055458416595299265, + "grad_norm": 1.9207059144973755, + "learning_rate": 4.962159845487342e-05, + "loss": 5.8593, + "step": 9325 + }, + { + "epoch": 0.05546436387858027, + "grad_norm": 1.962039589881897, + "learning_rate": 4.9621517488744454e-05, + "loss": 6.0174, + "step": 9326 + }, + { + "epoch": 0.05547031116186126, + "grad_norm": 2.0445704460144043, + "learning_rate": 4.9621436514020376e-05, + "loss": 5.5782, + "step": 9327 + }, + { + "epoch": 0.05547625844514226, + "grad_norm": 2.0861823558807373, + "learning_rate": 4.9621355530701204e-05, + "loss": 5.6102, + "step": 9328 + }, + { + "epoch": 0.05548220572842326, + "grad_norm": 2.0184309482574463, + "learning_rate": 4.962127453878697e-05, + "loss": 5.8072, + "step": 9329 + }, + { + "epoch": 0.055488153011704254, + "grad_norm": 1.899994134902954, + "learning_rate": 4.962119353827771e-05, + "loss": 5.7361, + "step": 9330 + }, + { + "epoch": 0.05549410029498525, + "grad_norm": 1.8874105215072632, + "learning_rate": 4.962111252917344e-05, + "loss": 5.7988, + "step": 9331 + }, + { + "epoch": 0.05550004757826625, + "grad_norm": 2.046682119369507, + "learning_rate": 4.9621031511474194e-05, + "loss": 5.7037, + "step": 9332 + }, + { + "epoch": 0.055505994861547246, + "grad_norm": 2.2552926540374756, + "learning_rate": 4.962095048517999e-05, + "loss": 5.7556, + "step": 9333 + }, + { + "epoch": 0.05551194214482824, + "grad_norm": 2.1904358863830566, + "learning_rate": 4.962086945029089e-05, + "loss": 5.6529, + "step": 9334 + }, + { + "epoch": 0.055517889428109236, + "grad_norm": 2.03745698928833, + "learning_rate": 4.9620788406806883e-05, + "loss": 5.8504, + "step": 9335 + }, + { + "epoch": 0.05552383671139024, + "grad_norm": 1.81668221950531, + "learning_rate": 4.9620707354728017e-05, + "loss": 5.3275, + "step": 9336 + }, + { + "epoch": 0.055529783994671233, + "grad_norm": 2.570976734161377, + "learning_rate": 4.962062629405432e-05, + "loss": 5.666, + "step": 9337 + }, + { + "epoch": 0.05553573127795223, + "grad_norm": 2.6855766773223877, + "learning_rate": 4.962054522478581e-05, + "loss": 5.7798, + "step": 9338 + }, + { + "epoch": 0.05554167856123323, + "grad_norm": 2.329690933227539, + "learning_rate": 4.962046414692252e-05, + "loss": 5.9334, + "step": 9339 + }, + { + "epoch": 0.055547625844514226, + "grad_norm": 1.6809495687484741, + "learning_rate": 4.962038306046449e-05, + "loss": 5.8506, + "step": 9340 + }, + { + "epoch": 0.05555357312779522, + "grad_norm": 1.7170113325119019, + "learning_rate": 4.962030196541173e-05, + "loss": 6.0863, + "step": 9341 + }, + { + "epoch": 0.05555952041107622, + "grad_norm": 2.247680902481079, + "learning_rate": 4.962022086176428e-05, + "loss": 5.2188, + "step": 9342 + }, + { + "epoch": 0.05556546769435722, + "grad_norm": 2.680091381072998, + "learning_rate": 4.9620139749522165e-05, + "loss": 4.8506, + "step": 9343 + }, + { + "epoch": 0.05557141497763821, + "grad_norm": 2.1886465549468994, + "learning_rate": 4.962005862868542e-05, + "loss": 5.5164, + "step": 9344 + }, + { + "epoch": 0.055577362260919215, + "grad_norm": 2.061368227005005, + "learning_rate": 4.961997749925405e-05, + "loss": 5.4491, + "step": 9345 + }, + { + "epoch": 0.05558330954420021, + "grad_norm": 2.368156909942627, + "learning_rate": 4.961989636122812e-05, + "loss": 5.9053, + "step": 9346 + }, + { + "epoch": 0.055589256827481205, + "grad_norm": 2.562565803527832, + "learning_rate": 4.961981521460763e-05, + "loss": 5.7683, + "step": 9347 + }, + { + "epoch": 0.05559520411076221, + "grad_norm": 2.388779640197754, + "learning_rate": 4.961973405939262e-05, + "loss": 5.1235, + "step": 9348 + }, + { + "epoch": 0.0556011513940432, + "grad_norm": 2.546994686126709, + "learning_rate": 4.9619652895583104e-05, + "loss": 4.7793, + "step": 9349 + }, + { + "epoch": 0.0556070986773242, + "grad_norm": 2.379549026489258, + "learning_rate": 4.9619571723179135e-05, + "loss": 4.8949, + "step": 9350 + }, + { + "epoch": 0.05561304596060519, + "grad_norm": 2.1621344089508057, + "learning_rate": 4.961949054218072e-05, + "loss": 4.6824, + "step": 9351 + }, + { + "epoch": 0.055618993243886194, + "grad_norm": 2.136289119720459, + "learning_rate": 4.96194093525879e-05, + "loss": 4.834, + "step": 9352 + }, + { + "epoch": 0.05562494052716719, + "grad_norm": 2.3572680950164795, + "learning_rate": 4.9619328154400694e-05, + "loss": 4.9755, + "step": 9353 + }, + { + "epoch": 0.055630887810448185, + "grad_norm": 2.2439966201782227, + "learning_rate": 4.961924694761913e-05, + "loss": 5.7662, + "step": 9354 + }, + { + "epoch": 0.05563683509372919, + "grad_norm": 2.287597894668579, + "learning_rate": 4.961916573224326e-05, + "loss": 4.6108, + "step": 9355 + }, + { + "epoch": 0.05564278237701018, + "grad_norm": 2.1382369995117188, + "learning_rate": 4.961908450827308e-05, + "loss": 4.5993, + "step": 9356 + }, + { + "epoch": 0.05564872966029118, + "grad_norm": 2.112348794937134, + "learning_rate": 4.961900327570863e-05, + "loss": 4.6798, + "step": 9357 + }, + { + "epoch": 0.05565467694357218, + "grad_norm": 2.0453972816467285, + "learning_rate": 4.9618922034549946e-05, + "loss": 4.5424, + "step": 9358 + }, + { + "epoch": 0.055660624226853174, + "grad_norm": 2.0547754764556885, + "learning_rate": 4.961884078479705e-05, + "loss": 5.0661, + "step": 9359 + }, + { + "epoch": 0.05566657151013417, + "grad_norm": 2.5003650188446045, + "learning_rate": 4.9618759526449965e-05, + "loss": 5.3388, + "step": 9360 + }, + { + "epoch": 0.05567251879341517, + "grad_norm": 2.0582423210144043, + "learning_rate": 4.9618678259508736e-05, + "loss": 5.8437, + "step": 9361 + }, + { + "epoch": 0.055678466076696166, + "grad_norm": 1.7867279052734375, + "learning_rate": 4.9618596983973376e-05, + "loss": 5.369, + "step": 9362 + }, + { + "epoch": 0.05568441335997716, + "grad_norm": 2.03729248046875, + "learning_rate": 4.961851569984392e-05, + "loss": 5.9932, + "step": 9363 + }, + { + "epoch": 0.055690360643258156, + "grad_norm": 2.2527456283569336, + "learning_rate": 4.961843440712038e-05, + "loss": 5.893, + "step": 9364 + }, + { + "epoch": 0.05569630792653916, + "grad_norm": 2.0027201175689697, + "learning_rate": 4.9618353105802815e-05, + "loss": 5.8216, + "step": 9365 + }, + { + "epoch": 0.05570225520982015, + "grad_norm": 2.236548662185669, + "learning_rate": 4.961827179589124e-05, + "loss": 5.5371, + "step": 9366 + }, + { + "epoch": 0.05570820249310115, + "grad_norm": 2.4477334022521973, + "learning_rate": 4.9618190477385666e-05, + "loss": 5.6552, + "step": 9367 + }, + { + "epoch": 0.05571414977638215, + "grad_norm": 2.504549026489258, + "learning_rate": 4.9618109150286145e-05, + "loss": 5.5732, + "step": 9368 + }, + { + "epoch": 0.055720097059663146, + "grad_norm": 2.1413187980651855, + "learning_rate": 4.9618027814592695e-05, + "loss": 5.1792, + "step": 9369 + }, + { + "epoch": 0.05572604434294414, + "grad_norm": 2.1714866161346436, + "learning_rate": 4.9617946470305344e-05, + "loss": 5.3444, + "step": 9370 + }, + { + "epoch": 0.05573199162622514, + "grad_norm": 1.7478383779525757, + "learning_rate": 4.9617865117424126e-05, + "loss": 5.7151, + "step": 9371 + }, + { + "epoch": 0.05573793890950614, + "grad_norm": 2.0415220260620117, + "learning_rate": 4.9617783755949067e-05, + "loss": 5.8765, + "step": 9372 + }, + { + "epoch": 0.05574388619278713, + "grad_norm": 1.917108416557312, + "learning_rate": 4.961770238588019e-05, + "loss": 6.0797, + "step": 9373 + }, + { + "epoch": 0.055749833476068135, + "grad_norm": 1.9404850006103516, + "learning_rate": 4.961762100721753e-05, + "loss": 6.1376, + "step": 9374 + }, + { + "epoch": 0.05575578075934913, + "grad_norm": 1.7101916074752808, + "learning_rate": 4.9617539619961104e-05, + "loss": 5.9375, + "step": 9375 + }, + { + "epoch": 0.055761728042630125, + "grad_norm": 2.591960906982422, + "learning_rate": 4.9617458224110954e-05, + "loss": 5.3716, + "step": 9376 + }, + { + "epoch": 0.05576767532591113, + "grad_norm": 2.070600986480713, + "learning_rate": 4.961737681966711e-05, + "loss": 5.3822, + "step": 9377 + }, + { + "epoch": 0.05577362260919212, + "grad_norm": 2.100820302963257, + "learning_rate": 4.9617295406629594e-05, + "loss": 5.7703, + "step": 9378 + }, + { + "epoch": 0.05577956989247312, + "grad_norm": 2.2413878440856934, + "learning_rate": 4.961721398499843e-05, + "loss": 4.9197, + "step": 9379 + }, + { + "epoch": 0.05578551717575411, + "grad_norm": 1.9762401580810547, + "learning_rate": 4.961713255477365e-05, + "loss": 5.6705, + "step": 9380 + }, + { + "epoch": 0.055791464459035114, + "grad_norm": 2.22676420211792, + "learning_rate": 4.961705111595528e-05, + "loss": 5.0196, + "step": 9381 + }, + { + "epoch": 0.05579741174231611, + "grad_norm": 2.0652241706848145, + "learning_rate": 4.9616969668543364e-05, + "loss": 5.3894, + "step": 9382 + }, + { + "epoch": 0.055803359025597105, + "grad_norm": 2.156890630722046, + "learning_rate": 4.96168882125379e-05, + "loss": 5.3063, + "step": 9383 + }, + { + "epoch": 0.05580930630887811, + "grad_norm": 2.131964683532715, + "learning_rate": 4.961680674793895e-05, + "loss": 5.9304, + "step": 9384 + }, + { + "epoch": 0.0558152535921591, + "grad_norm": 2.2117621898651123, + "learning_rate": 4.9616725274746525e-05, + "loss": 5.9553, + "step": 9385 + }, + { + "epoch": 0.0558212008754401, + "grad_norm": 2.3511440753936768, + "learning_rate": 4.9616643792960654e-05, + "loss": 5.9911, + "step": 9386 + }, + { + "epoch": 0.0558271481587211, + "grad_norm": 1.7709077596664429, + "learning_rate": 4.961656230258136e-05, + "loss": 5.6291, + "step": 9387 + }, + { + "epoch": 0.055833095442002094, + "grad_norm": 1.838767170906067, + "learning_rate": 4.961648080360869e-05, + "loss": 6.0152, + "step": 9388 + }, + { + "epoch": 0.05583904272528309, + "grad_norm": 2.117058515548706, + "learning_rate": 4.9616399296042656e-05, + "loss": 4.8079, + "step": 9389 + }, + { + "epoch": 0.05584499000856409, + "grad_norm": 2.147491693496704, + "learning_rate": 4.9616317779883293e-05, + "loss": 4.6489, + "step": 9390 + }, + { + "epoch": 0.055850937291845086, + "grad_norm": 2.1025705337524414, + "learning_rate": 4.961623625513062e-05, + "loss": 4.4984, + "step": 9391 + }, + { + "epoch": 0.05585688457512608, + "grad_norm": 1.799986720085144, + "learning_rate": 4.961615472178468e-05, + "loss": 5.1008, + "step": 9392 + }, + { + "epoch": 0.055862831858407076, + "grad_norm": 2.2975053787231445, + "learning_rate": 4.961607317984549e-05, + "loss": 5.9754, + "step": 9393 + }, + { + "epoch": 0.05586877914168808, + "grad_norm": 1.9996155500411987, + "learning_rate": 4.961599162931309e-05, + "loss": 5.9255, + "step": 9394 + }, + { + "epoch": 0.05587472642496907, + "grad_norm": 1.7344794273376465, + "learning_rate": 4.9615910070187496e-05, + "loss": 6.0873, + "step": 9395 + }, + { + "epoch": 0.05588067370825007, + "grad_norm": 2.260706901550293, + "learning_rate": 4.961582850246875e-05, + "loss": 5.9454, + "step": 9396 + }, + { + "epoch": 0.05588662099153107, + "grad_norm": 2.1810765266418457, + "learning_rate": 4.961574692615686e-05, + "loss": 5.7548, + "step": 9397 + }, + { + "epoch": 0.055892568274812066, + "grad_norm": 2.0940003395080566, + "learning_rate": 4.961566534125188e-05, + "loss": 5.8184, + "step": 9398 + }, + { + "epoch": 0.05589851555809306, + "grad_norm": 2.066464900970459, + "learning_rate": 4.961558374775382e-05, + "loss": 5.7867, + "step": 9399 + }, + { + "epoch": 0.05590446284137406, + "grad_norm": 1.7197705507278442, + "learning_rate": 4.961550214566271e-05, + "loss": 5.9211, + "step": 9400 + }, + { + "epoch": 0.05591041012465506, + "grad_norm": 2.3055293560028076, + "learning_rate": 4.9615420534978583e-05, + "loss": 5.9531, + "step": 9401 + }, + { + "epoch": 0.05591635740793605, + "grad_norm": 2.0974669456481934, + "learning_rate": 4.961533891570147e-05, + "loss": 5.9347, + "step": 9402 + }, + { + "epoch": 0.055922304691217055, + "grad_norm": 2.5196354389190674, + "learning_rate": 4.96152572878314e-05, + "loss": 5.0729, + "step": 9403 + }, + { + "epoch": 0.05592825197449805, + "grad_norm": 2.157181978225708, + "learning_rate": 4.9615175651368395e-05, + "loss": 5.9513, + "step": 9404 + }, + { + "epoch": 0.055934199257779045, + "grad_norm": 1.94083833694458, + "learning_rate": 4.9615094006312485e-05, + "loss": 5.9239, + "step": 9405 + }, + { + "epoch": 0.05594014654106005, + "grad_norm": 2.2118191719055176, + "learning_rate": 4.9615012352663704e-05, + "loss": 5.6936, + "step": 9406 + }, + { + "epoch": 0.05594609382434104, + "grad_norm": 2.2255051136016846, + "learning_rate": 4.9614930690422065e-05, + "loss": 5.7475, + "step": 9407 + }, + { + "epoch": 0.05595204110762204, + "grad_norm": 2.1640844345092773, + "learning_rate": 4.961484901958762e-05, + "loss": 5.8138, + "step": 9408 + }, + { + "epoch": 0.05595798839090303, + "grad_norm": 2.2722928524017334, + "learning_rate": 4.961476734016038e-05, + "loss": 5.5784, + "step": 9409 + }, + { + "epoch": 0.055963935674184034, + "grad_norm": 2.0541749000549316, + "learning_rate": 4.961468565214039e-05, + "loss": 5.6871, + "step": 9410 + }, + { + "epoch": 0.05596988295746503, + "grad_norm": 2.3496010303497314, + "learning_rate": 4.9614603955527655e-05, + "loss": 5.4195, + "step": 9411 + }, + { + "epoch": 0.055975830240746025, + "grad_norm": 2.333435297012329, + "learning_rate": 4.9614522250322215e-05, + "loss": 5.4257, + "step": 9412 + }, + { + "epoch": 0.05598177752402703, + "grad_norm": 2.339057445526123, + "learning_rate": 4.9614440536524106e-05, + "loss": 5.4158, + "step": 9413 + }, + { + "epoch": 0.05598772480730802, + "grad_norm": 2.4383058547973633, + "learning_rate": 4.961435881413335e-05, + "loss": 5.4569, + "step": 9414 + }, + { + "epoch": 0.05599367209058902, + "grad_norm": 2.1405389308929443, + "learning_rate": 4.961427708314997e-05, + "loss": 5.6178, + "step": 9415 + }, + { + "epoch": 0.05599961937387002, + "grad_norm": 2.2082836627960205, + "learning_rate": 4.961419534357401e-05, + "loss": 5.386, + "step": 9416 + }, + { + "epoch": 0.056005566657151014, + "grad_norm": 2.0305027961730957, + "learning_rate": 4.961411359540548e-05, + "loss": 5.2822, + "step": 9417 + }, + { + "epoch": 0.05601151394043201, + "grad_norm": 2.606452226638794, + "learning_rate": 4.961403183864442e-05, + "loss": 5.2691, + "step": 9418 + }, + { + "epoch": 0.05601746122371301, + "grad_norm": 2.3506669998168945, + "learning_rate": 4.961395007329086e-05, + "loss": 5.3307, + "step": 9419 + }, + { + "epoch": 0.056023408506994006, + "grad_norm": 2.3472225666046143, + "learning_rate": 4.961386829934482e-05, + "loss": 5.2247, + "step": 9420 + }, + { + "epoch": 0.056029355790275, + "grad_norm": 2.1121721267700195, + "learning_rate": 4.961378651680633e-05, + "loss": 5.2857, + "step": 9421 + }, + { + "epoch": 0.056035303073555996, + "grad_norm": 2.4357142448425293, + "learning_rate": 4.9613704725675427e-05, + "loss": 5.3398, + "step": 9422 + }, + { + "epoch": 0.056041250356837, + "grad_norm": 2.639418125152588, + "learning_rate": 4.961362292595213e-05, + "loss": 5.3008, + "step": 9423 + }, + { + "epoch": 0.05604719764011799, + "grad_norm": 3.297189712524414, + "learning_rate": 4.961354111763647e-05, + "loss": 5.5908, + "step": 9424 + }, + { + "epoch": 0.05605314492339899, + "grad_norm": 2.095613718032837, + "learning_rate": 4.961345930072848e-05, + "loss": 5.2389, + "step": 9425 + }, + { + "epoch": 0.05605909220667999, + "grad_norm": 2.2495081424713135, + "learning_rate": 4.9613377475228186e-05, + "loss": 5.474, + "step": 9426 + }, + { + "epoch": 0.056065039489960986, + "grad_norm": 2.282697916030884, + "learning_rate": 4.961329564113562e-05, + "loss": 5.3253, + "step": 9427 + }, + { + "epoch": 0.05607098677324198, + "grad_norm": 2.515075206756592, + "learning_rate": 4.96132137984508e-05, + "loss": 5.238, + "step": 9428 + }, + { + "epoch": 0.05607693405652298, + "grad_norm": 2.072274684906006, + "learning_rate": 4.961313194717376e-05, + "loss": 5.3627, + "step": 9429 + }, + { + "epoch": 0.05608288133980398, + "grad_norm": 2.4552547931671143, + "learning_rate": 4.961305008730454e-05, + "loss": 6.1799, + "step": 9430 + }, + { + "epoch": 0.05608882862308497, + "grad_norm": 2.2289538383483887, + "learning_rate": 4.9612968218843146e-05, + "loss": 5.5477, + "step": 9431 + }, + { + "epoch": 0.056094775906365975, + "grad_norm": 2.6174185276031494, + "learning_rate": 4.9612886341789635e-05, + "loss": 5.1779, + "step": 9432 + }, + { + "epoch": 0.05610072318964697, + "grad_norm": 2.4489150047302246, + "learning_rate": 4.9612804456144005e-05, + "loss": 5.2067, + "step": 9433 + }, + { + "epoch": 0.056106670472927965, + "grad_norm": 2.2651829719543457, + "learning_rate": 4.96127225619063e-05, + "loss": 5.3582, + "step": 9434 + }, + { + "epoch": 0.05611261775620897, + "grad_norm": 2.1985251903533936, + "learning_rate": 4.9612640659076556e-05, + "loss": 5.2034, + "step": 9435 + }, + { + "epoch": 0.05611856503948996, + "grad_norm": 1.9510128498077393, + "learning_rate": 4.961255874765479e-05, + "loss": 5.2263, + "step": 9436 + }, + { + "epoch": 0.05612451232277096, + "grad_norm": 2.338815212249756, + "learning_rate": 4.961247682764104e-05, + "loss": 5.9091, + "step": 9437 + }, + { + "epoch": 0.05613045960605195, + "grad_norm": 2.097111225128174, + "learning_rate": 4.961239489903532e-05, + "loss": 6.3285, + "step": 9438 + }, + { + "epoch": 0.056136406889332954, + "grad_norm": 1.9965720176696777, + "learning_rate": 4.961231296183767e-05, + "loss": 6.3141, + "step": 9439 + }, + { + "epoch": 0.05614235417261395, + "grad_norm": 2.2406206130981445, + "learning_rate": 4.9612231016048114e-05, + "loss": 5.7335, + "step": 9440 + }, + { + "epoch": 0.056148301455894944, + "grad_norm": 2.2798993587493896, + "learning_rate": 4.961214906166668e-05, + "loss": 4.9959, + "step": 9441 + }, + { + "epoch": 0.056154248739175947, + "grad_norm": 2.482706069946289, + "learning_rate": 4.96120670986934e-05, + "loss": 5.295, + "step": 9442 + }, + { + "epoch": 0.05616019602245694, + "grad_norm": 2.398867607116699, + "learning_rate": 4.961198512712831e-05, + "loss": 4.9592, + "step": 9443 + }, + { + "epoch": 0.05616614330573794, + "grad_norm": 2.1979055404663086, + "learning_rate": 4.961190314697143e-05, + "loss": 5.1003, + "step": 9444 + }, + { + "epoch": 0.05617209058901894, + "grad_norm": 2.3249244689941406, + "learning_rate": 4.961182115822278e-05, + "loss": 5.1408, + "step": 9445 + }, + { + "epoch": 0.056178037872299934, + "grad_norm": 2.3679821491241455, + "learning_rate": 4.96117391608824e-05, + "loss": 5.4006, + "step": 9446 + }, + { + "epoch": 0.05618398515558093, + "grad_norm": 1.8706363439559937, + "learning_rate": 4.961165715495032e-05, + "loss": 6.1741, + "step": 9447 + }, + { + "epoch": 0.05618993243886193, + "grad_norm": 2.1825344562530518, + "learning_rate": 4.961157514042656e-05, + "loss": 6.0869, + "step": 9448 + }, + { + "epoch": 0.056195879722142926, + "grad_norm": 1.85076904296875, + "learning_rate": 4.961149311731116e-05, + "loss": 5.9252, + "step": 9449 + }, + { + "epoch": 0.05620182700542392, + "grad_norm": 1.9433631896972656, + "learning_rate": 4.961141108560413e-05, + "loss": 5.968, + "step": 9450 + }, + { + "epoch": 0.056207774288704916, + "grad_norm": 2.5718259811401367, + "learning_rate": 4.961132904530552e-05, + "loss": 5.4274, + "step": 9451 + }, + { + "epoch": 0.05621372157198592, + "grad_norm": 1.919552206993103, + "learning_rate": 4.961124699641535e-05, + "loss": 5.1943, + "step": 9452 + }, + { + "epoch": 0.05621966885526691, + "grad_norm": 2.1371817588806152, + "learning_rate": 4.961116493893364e-05, + "loss": 5.9949, + "step": 9453 + }, + { + "epoch": 0.05622561613854791, + "grad_norm": 2.5715489387512207, + "learning_rate": 4.961108287286044e-05, + "loss": 6.2061, + "step": 9454 + }, + { + "epoch": 0.05623156342182891, + "grad_norm": 2.1871471405029297, + "learning_rate": 4.961100079819575e-05, + "loss": 5.7872, + "step": 9455 + }, + { + "epoch": 0.056237510705109905, + "grad_norm": 2.011925220489502, + "learning_rate": 4.961091871493962e-05, + "loss": 5.7992, + "step": 9456 + }, + { + "epoch": 0.0562434579883909, + "grad_norm": 2.516580820083618, + "learning_rate": 4.9610836623092074e-05, + "loss": 5.9154, + "step": 9457 + }, + { + "epoch": 0.0562494052716719, + "grad_norm": 1.9336326122283936, + "learning_rate": 4.961075452265314e-05, + "loss": 5.7933, + "step": 9458 + }, + { + "epoch": 0.0562553525549529, + "grad_norm": 1.8404059410095215, + "learning_rate": 4.961067241362285e-05, + "loss": 6.1897, + "step": 9459 + }, + { + "epoch": 0.05626129983823389, + "grad_norm": 1.9757578372955322, + "learning_rate": 4.961059029600122e-05, + "loss": 6.0909, + "step": 9460 + }, + { + "epoch": 0.056267247121514895, + "grad_norm": 1.9767241477966309, + "learning_rate": 4.9610508169788294e-05, + "loss": 6.2212, + "step": 9461 + }, + { + "epoch": 0.05627319440479589, + "grad_norm": 1.9890403747558594, + "learning_rate": 4.961042603498409e-05, + "loss": 6.5071, + "step": 9462 + }, + { + "epoch": 0.056279141688076885, + "grad_norm": 1.9011937379837036, + "learning_rate": 4.961034389158864e-05, + "loss": 5.8098, + "step": 9463 + }, + { + "epoch": 0.05628508897135789, + "grad_norm": 2.236356735229492, + "learning_rate": 4.961026173960197e-05, + "loss": 4.8901, + "step": 9464 + }, + { + "epoch": 0.05629103625463888, + "grad_norm": 1.9147372245788574, + "learning_rate": 4.961017957902412e-05, + "loss": 5.1372, + "step": 9465 + }, + { + "epoch": 0.05629698353791988, + "grad_norm": 1.9628163576126099, + "learning_rate": 4.9610097409855106e-05, + "loss": 5.1161, + "step": 9466 + }, + { + "epoch": 0.05630293082120087, + "grad_norm": 2.0323991775512695, + "learning_rate": 4.961001523209496e-05, + "loss": 5.1493, + "step": 9467 + }, + { + "epoch": 0.056308878104481874, + "grad_norm": 1.7026360034942627, + "learning_rate": 4.9609933045743714e-05, + "loss": 5.2349, + "step": 9468 + }, + { + "epoch": 0.05631482538776287, + "grad_norm": 1.7758761644363403, + "learning_rate": 4.9609850850801394e-05, + "loss": 5.231, + "step": 9469 + }, + { + "epoch": 0.056320772671043864, + "grad_norm": 2.3305037021636963, + "learning_rate": 4.9609768647268026e-05, + "loss": 5.9209, + "step": 9470 + }, + { + "epoch": 0.056326719954324866, + "grad_norm": 2.2628681659698486, + "learning_rate": 4.960968643514365e-05, + "loss": 5.4753, + "step": 9471 + }, + { + "epoch": 0.05633266723760586, + "grad_norm": 2.4022347927093506, + "learning_rate": 4.9609604214428286e-05, + "loss": 4.8414, + "step": 9472 + }, + { + "epoch": 0.05633861452088686, + "grad_norm": 2.2767343521118164, + "learning_rate": 4.9609521985121955e-05, + "loss": 4.7178, + "step": 9473 + }, + { + "epoch": 0.05634456180416786, + "grad_norm": 2.547600507736206, + "learning_rate": 4.96094397472247e-05, + "loss": 4.7365, + "step": 9474 + }, + { + "epoch": 0.056350509087448854, + "grad_norm": 2.3546998500823975, + "learning_rate": 4.960935750073654e-05, + "loss": 5.4846, + "step": 9475 + }, + { + "epoch": 0.05635645637072985, + "grad_norm": 2.9641268253326416, + "learning_rate": 4.960927524565751e-05, + "loss": 5.7409, + "step": 9476 + }, + { + "epoch": 0.05636240365401085, + "grad_norm": 3.1727824211120605, + "learning_rate": 4.960919298198764e-05, + "loss": 5.8456, + "step": 9477 + }, + { + "epoch": 0.056368350937291846, + "grad_norm": 2.620507001876831, + "learning_rate": 4.960911070972695e-05, + "loss": 5.6295, + "step": 9478 + }, + { + "epoch": 0.05637429822057284, + "grad_norm": 2.6132571697235107, + "learning_rate": 4.960902842887548e-05, + "loss": 5.697, + "step": 9479 + }, + { + "epoch": 0.056380245503853836, + "grad_norm": 2.2931299209594727, + "learning_rate": 4.960894613943324e-05, + "loss": 5.4723, + "step": 9480 + }, + { + "epoch": 0.05638619278713484, + "grad_norm": 2.176729202270508, + "learning_rate": 4.9608863841400284e-05, + "loss": 5.7403, + "step": 9481 + }, + { + "epoch": 0.05639214007041583, + "grad_norm": 1.932180404663086, + "learning_rate": 4.9608781534776616e-05, + "loss": 5.9256, + "step": 9482 + }, + { + "epoch": 0.05639808735369683, + "grad_norm": 1.7315243482589722, + "learning_rate": 4.9608699219562286e-05, + "loss": 5.9176, + "step": 9483 + }, + { + "epoch": 0.05640403463697783, + "grad_norm": 1.6548408269882202, + "learning_rate": 4.9608616895757306e-05, + "loss": 5.7495, + "step": 9484 + }, + { + "epoch": 0.056409981920258825, + "grad_norm": 1.8549202680587769, + "learning_rate": 4.960853456336172e-05, + "loss": 5.5261, + "step": 9485 + }, + { + "epoch": 0.05641592920353982, + "grad_norm": 2.5990993976593018, + "learning_rate": 4.9608452222375544e-05, + "loss": 5.5934, + "step": 9486 + }, + { + "epoch": 0.05642187648682082, + "grad_norm": 1.705051302909851, + "learning_rate": 4.9608369872798815e-05, + "loss": 5.3613, + "step": 9487 + }, + { + "epoch": 0.05642782377010182, + "grad_norm": 1.6170406341552734, + "learning_rate": 4.960828751463156e-05, + "loss": 5.2743, + "step": 9488 + }, + { + "epoch": 0.05643377105338281, + "grad_norm": 1.6247482299804688, + "learning_rate": 4.9608205147873796e-05, + "loss": 5.2772, + "step": 9489 + }, + { + "epoch": 0.056439718336663815, + "grad_norm": 1.7574137449264526, + "learning_rate": 4.9608122772525575e-05, + "loss": 5.3464, + "step": 9490 + }, + { + "epoch": 0.05644566561994481, + "grad_norm": 1.8814537525177002, + "learning_rate": 4.960804038858691e-05, + "loss": 5.3092, + "step": 9491 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 2.0222842693328857, + "learning_rate": 4.9607957996057816e-05, + "loss": 4.8234, + "step": 9492 + }, + { + "epoch": 0.05645756018650681, + "grad_norm": 1.6224759817123413, + "learning_rate": 4.960787559493836e-05, + "loss": 5.3962, + "step": 9493 + }, + { + "epoch": 0.0564635074697878, + "grad_norm": 1.4097533226013184, + "learning_rate": 4.960779318522853e-05, + "loss": 5.8302, + "step": 9494 + }, + { + "epoch": 0.0564694547530688, + "grad_norm": 1.7296205759048462, + "learning_rate": 4.960771076692839e-05, + "loss": 5.5679, + "step": 9495 + }, + { + "epoch": 0.05647540203634979, + "grad_norm": 1.6300212144851685, + "learning_rate": 4.960762834003794e-05, + "loss": 5.4315, + "step": 9496 + }, + { + "epoch": 0.056481349319630794, + "grad_norm": 1.8587864637374878, + "learning_rate": 4.960754590455723e-05, + "loss": 5.5492, + "step": 9497 + }, + { + "epoch": 0.05648729660291179, + "grad_norm": 1.8136985301971436, + "learning_rate": 4.960746346048628e-05, + "loss": 5.6363, + "step": 9498 + }, + { + "epoch": 0.056493243886192784, + "grad_norm": 2.1277284622192383, + "learning_rate": 4.960738100782511e-05, + "loss": 5.593, + "step": 9499 + }, + { + "epoch": 0.056499191169473786, + "grad_norm": 2.0262863636016846, + "learning_rate": 4.960729854657377e-05, + "loss": 5.6396, + "step": 9500 + }, + { + "epoch": 0.05650513845275478, + "grad_norm": 1.7870309352874756, + "learning_rate": 4.9607216076732266e-05, + "loss": 5.6523, + "step": 9501 + }, + { + "epoch": 0.05651108573603578, + "grad_norm": 1.734782099723816, + "learning_rate": 4.9607133598300636e-05, + "loss": 5.5313, + "step": 9502 + }, + { + "epoch": 0.05651703301931678, + "grad_norm": 2.2485032081604004, + "learning_rate": 4.9607051111278914e-05, + "loss": 5.3814, + "step": 9503 + }, + { + "epoch": 0.056522980302597774, + "grad_norm": 1.5091774463653564, + "learning_rate": 4.9606968615667125e-05, + "loss": 5.5277, + "step": 9504 + }, + { + "epoch": 0.05652892758587877, + "grad_norm": 1.7117774486541748, + "learning_rate": 4.9606886111465303e-05, + "loss": 5.2649, + "step": 9505 + }, + { + "epoch": 0.05653487486915977, + "grad_norm": 1.7309353351593018, + "learning_rate": 4.960680359867346e-05, + "loss": 5.2276, + "step": 9506 + }, + { + "epoch": 0.056540822152440766, + "grad_norm": 1.7058963775634766, + "learning_rate": 4.960672107729164e-05, + "loss": 5.1848, + "step": 9507 + }, + { + "epoch": 0.05654676943572176, + "grad_norm": 1.7862296104431152, + "learning_rate": 4.960663854731987e-05, + "loss": 5.2424, + "step": 9508 + }, + { + "epoch": 0.05655271671900276, + "grad_norm": 1.8900794982910156, + "learning_rate": 4.960655600875818e-05, + "loss": 5.283, + "step": 9509 + }, + { + "epoch": 0.05655866400228376, + "grad_norm": 1.9991587400436401, + "learning_rate": 4.960647346160658e-05, + "loss": 5.3525, + "step": 9510 + }, + { + "epoch": 0.05656461128556475, + "grad_norm": 1.6889851093292236, + "learning_rate": 4.960639090586513e-05, + "loss": 5.0592, + "step": 9511 + }, + { + "epoch": 0.05657055856884575, + "grad_norm": 1.6314234733581543, + "learning_rate": 4.9606308341533844e-05, + "loss": 5.1733, + "step": 9512 + }, + { + "epoch": 0.05657650585212675, + "grad_norm": 1.7801847457885742, + "learning_rate": 4.960622576861275e-05, + "loss": 5.2358, + "step": 9513 + }, + { + "epoch": 0.056582453135407745, + "grad_norm": 1.6572017669677734, + "learning_rate": 4.9606143187101864e-05, + "loss": 5.2429, + "step": 9514 + }, + { + "epoch": 0.05658840041868874, + "grad_norm": 1.7574421167373657, + "learning_rate": 4.960606059700124e-05, + "loss": 5.0717, + "step": 9515 + }, + { + "epoch": 0.05659434770196974, + "grad_norm": 1.8162970542907715, + "learning_rate": 4.960597799831088e-05, + "loss": 5.1513, + "step": 9516 + }, + { + "epoch": 0.05660029498525074, + "grad_norm": 1.9231795072555542, + "learning_rate": 4.960589539103084e-05, + "loss": 5.1539, + "step": 9517 + }, + { + "epoch": 0.05660624226853173, + "grad_norm": 1.624566674232483, + "learning_rate": 4.9605812775161136e-05, + "loss": 5.0999, + "step": 9518 + }, + { + "epoch": 0.056612189551812735, + "grad_norm": 1.4293668270111084, + "learning_rate": 4.960573015070179e-05, + "loss": 5.2365, + "step": 9519 + }, + { + "epoch": 0.05661813683509373, + "grad_norm": 1.789515495300293, + "learning_rate": 4.960564751765284e-05, + "loss": 5.2233, + "step": 9520 + }, + { + "epoch": 0.056624084118374725, + "grad_norm": 1.7212306261062622, + "learning_rate": 4.960556487601432e-05, + "loss": 5.1902, + "step": 9521 + }, + { + "epoch": 0.05663003140165573, + "grad_norm": 1.7691519260406494, + "learning_rate": 4.960548222578625e-05, + "loss": 5.2136, + "step": 9522 + }, + { + "epoch": 0.05663597868493672, + "grad_norm": 1.5925794839859009, + "learning_rate": 4.960539956696866e-05, + "loss": 5.4808, + "step": 9523 + }, + { + "epoch": 0.05664192596821772, + "grad_norm": 1.7014095783233643, + "learning_rate": 4.960531689956157e-05, + "loss": 5.1934, + "step": 9524 + }, + { + "epoch": 0.05664787325149871, + "grad_norm": 1.3620802164077759, + "learning_rate": 4.960523422356502e-05, + "loss": 5.0169, + "step": 9525 + }, + { + "epoch": 0.056653820534779714, + "grad_norm": 1.4778205156326294, + "learning_rate": 4.960515153897904e-05, + "loss": 5.1535, + "step": 9526 + }, + { + "epoch": 0.05665976781806071, + "grad_norm": 1.6393300294876099, + "learning_rate": 4.960506884580366e-05, + "loss": 5.2494, + "step": 9527 + }, + { + "epoch": 0.056665715101341704, + "grad_norm": 1.6070711612701416, + "learning_rate": 4.96049861440389e-05, + "loss": 5.3117, + "step": 9528 + }, + { + "epoch": 0.056671662384622706, + "grad_norm": 1.6023461818695068, + "learning_rate": 4.96049034336848e-05, + "loss": 5.1554, + "step": 9529 + }, + { + "epoch": 0.0566776096679037, + "grad_norm": 1.6061514616012573, + "learning_rate": 4.9604820714741374e-05, + "loss": 5.4123, + "step": 9530 + }, + { + "epoch": 0.056683556951184697, + "grad_norm": 1.8043792247772217, + "learning_rate": 4.960473798720866e-05, + "loss": 5.2582, + "step": 9531 + }, + { + "epoch": 0.0566895042344657, + "grad_norm": 1.6002432107925415, + "learning_rate": 4.960465525108669e-05, + "loss": 5.211, + "step": 9532 + }, + { + "epoch": 0.056695451517746694, + "grad_norm": 1.851266622543335, + "learning_rate": 4.960457250637549e-05, + "loss": 5.0949, + "step": 9533 + }, + { + "epoch": 0.05670139880102769, + "grad_norm": 1.7806520462036133, + "learning_rate": 4.9604489753075085e-05, + "loss": 5.1178, + "step": 9534 + }, + { + "epoch": 0.05670734608430869, + "grad_norm": 1.9938620328903198, + "learning_rate": 4.9604406991185506e-05, + "loss": 5.098, + "step": 9535 + }, + { + "epoch": 0.056713293367589686, + "grad_norm": 1.7983622550964355, + "learning_rate": 4.960432422070679e-05, + "loss": 4.98, + "step": 9536 + }, + { + "epoch": 0.05671924065087068, + "grad_norm": 1.845821499824524, + "learning_rate": 4.960424144163895e-05, + "loss": 4.951, + "step": 9537 + }, + { + "epoch": 0.05672518793415168, + "grad_norm": 1.8922109603881836, + "learning_rate": 4.960415865398202e-05, + "loss": 5.0327, + "step": 9538 + }, + { + "epoch": 0.05673113521743268, + "grad_norm": 2.159832239151001, + "learning_rate": 4.960407585773604e-05, + "loss": 5.5287, + "step": 9539 + }, + { + "epoch": 0.05673708250071367, + "grad_norm": 1.9966739416122437, + "learning_rate": 4.960399305290103e-05, + "loss": 5.7114, + "step": 9540 + }, + { + "epoch": 0.05674302978399467, + "grad_norm": 1.8796072006225586, + "learning_rate": 4.9603910239477026e-05, + "loss": 5.4673, + "step": 9541 + }, + { + "epoch": 0.05674897706727567, + "grad_norm": 1.6589174270629883, + "learning_rate": 4.9603827417464045e-05, + "loss": 5.3755, + "step": 9542 + }, + { + "epoch": 0.056754924350556665, + "grad_norm": 1.975807547569275, + "learning_rate": 4.960374458686212e-05, + "loss": 5.0648, + "step": 9543 + }, + { + "epoch": 0.05676087163383766, + "grad_norm": 1.7437241077423096, + "learning_rate": 4.960366174767128e-05, + "loss": 5.2338, + "step": 9544 + }, + { + "epoch": 0.05676681891711866, + "grad_norm": 1.8508884906768799, + "learning_rate": 4.9603578899891564e-05, + "loss": 5.3432, + "step": 9545 + }, + { + "epoch": 0.05677276620039966, + "grad_norm": 2.2117562294006348, + "learning_rate": 4.960349604352299e-05, + "loss": 5.0623, + "step": 9546 + }, + { + "epoch": 0.05677871348368065, + "grad_norm": 1.7681034803390503, + "learning_rate": 4.9603413178565586e-05, + "loss": 5.1998, + "step": 9547 + }, + { + "epoch": 0.056784660766961655, + "grad_norm": 2.4477179050445557, + "learning_rate": 4.960333030501939e-05, + "loss": 5.3317, + "step": 9548 + }, + { + "epoch": 0.05679060805024265, + "grad_norm": 1.8297652006149292, + "learning_rate": 4.9603247422884426e-05, + "loss": 5.3608, + "step": 9549 + }, + { + "epoch": 0.056796555333523645, + "grad_norm": 1.8361153602600098, + "learning_rate": 4.9603164532160715e-05, + "loss": 5.3914, + "step": 9550 + }, + { + "epoch": 0.05680250261680465, + "grad_norm": 1.748226523399353, + "learning_rate": 4.96030816328483e-05, + "loss": 5.3436, + "step": 9551 + }, + { + "epoch": 0.05680844990008564, + "grad_norm": 1.744964599609375, + "learning_rate": 4.96029987249472e-05, + "loss": 5.4287, + "step": 9552 + }, + { + "epoch": 0.05681439718336664, + "grad_norm": 1.9512866735458374, + "learning_rate": 4.9602915808457454e-05, + "loss": 5.3601, + "step": 9553 + }, + { + "epoch": 0.05682034446664763, + "grad_norm": 1.5863629579544067, + "learning_rate": 4.9602832883379077e-05, + "loss": 5.5491, + "step": 9554 + }, + { + "epoch": 0.056826291749928634, + "grad_norm": 1.967677354812622, + "learning_rate": 4.96027499497121e-05, + "loss": 5.2402, + "step": 9555 + }, + { + "epoch": 0.05683223903320963, + "grad_norm": 2.277714252471924, + "learning_rate": 4.960266700745657e-05, + "loss": 5.5155, + "step": 9556 + }, + { + "epoch": 0.056838186316490624, + "grad_norm": 1.8371034860610962, + "learning_rate": 4.96025840566125e-05, + "loss": 5.2694, + "step": 9557 + }, + { + "epoch": 0.056844133599771626, + "grad_norm": 1.723008155822754, + "learning_rate": 4.9602501097179915e-05, + "loss": 5.4983, + "step": 9558 + }, + { + "epoch": 0.05685008088305262, + "grad_norm": 1.6955413818359375, + "learning_rate": 4.960241812915886e-05, + "loss": 5.6888, + "step": 9559 + }, + { + "epoch": 0.056856028166333616, + "grad_norm": 1.5899012088775635, + "learning_rate": 4.960233515254935e-05, + "loss": 5.4241, + "step": 9560 + }, + { + "epoch": 0.05686197544961462, + "grad_norm": 1.493268370628357, + "learning_rate": 4.9602252167351416e-05, + "loss": 5.1889, + "step": 9561 + }, + { + "epoch": 0.056867922732895614, + "grad_norm": 1.8037081956863403, + "learning_rate": 4.9602169173565094e-05, + "loss": 5.1785, + "step": 9562 + }, + { + "epoch": 0.05687387001617661, + "grad_norm": 1.6377664804458618, + "learning_rate": 4.960208617119041e-05, + "loss": 5.2593, + "step": 9563 + }, + { + "epoch": 0.05687981729945761, + "grad_norm": 2.077209234237671, + "learning_rate": 4.960200316022739e-05, + "loss": 5.1012, + "step": 9564 + }, + { + "epoch": 0.056885764582738606, + "grad_norm": 2.3584885597229004, + "learning_rate": 4.9601920140676064e-05, + "loss": 5.1141, + "step": 9565 + }, + { + "epoch": 0.0568917118660196, + "grad_norm": 1.990319013595581, + "learning_rate": 4.960183711253646e-05, + "loss": 4.9336, + "step": 9566 + }, + { + "epoch": 0.0568976591493006, + "grad_norm": 2.037742853164673, + "learning_rate": 4.960175407580861e-05, + "loss": 4.8494, + "step": 9567 + }, + { + "epoch": 0.0569036064325816, + "grad_norm": 1.8493839502334595, + "learning_rate": 4.9601671030492546e-05, + "loss": 5.337, + "step": 9568 + }, + { + "epoch": 0.05690955371586259, + "grad_norm": 1.9864604473114014, + "learning_rate": 4.960158797658829e-05, + "loss": 5.5684, + "step": 9569 + }, + { + "epoch": 0.05691550099914359, + "grad_norm": 1.9740629196166992, + "learning_rate": 4.960150491409587e-05, + "loss": 5.444, + "step": 9570 + }, + { + "epoch": 0.05692144828242459, + "grad_norm": 1.9429807662963867, + "learning_rate": 4.960142184301533e-05, + "loss": 5.277, + "step": 9571 + }, + { + "epoch": 0.056927395565705585, + "grad_norm": 1.8953512907028198, + "learning_rate": 4.960133876334668e-05, + "loss": 5.1694, + "step": 9572 + }, + { + "epoch": 0.05693334284898658, + "grad_norm": 1.7716888189315796, + "learning_rate": 4.960125567508996e-05, + "loss": 5.1383, + "step": 9573 + }, + { + "epoch": 0.05693929013226758, + "grad_norm": 1.8266246318817139, + "learning_rate": 4.9601172578245194e-05, + "loss": 5.4019, + "step": 9574 + }, + { + "epoch": 0.05694523741554858, + "grad_norm": 1.8929648399353027, + "learning_rate": 4.9601089472812414e-05, + "loss": 5.3948, + "step": 9575 + }, + { + "epoch": 0.05695118469882957, + "grad_norm": 1.9918208122253418, + "learning_rate": 4.960100635879165e-05, + "loss": 5.3195, + "step": 9576 + }, + { + "epoch": 0.056957131982110575, + "grad_norm": 1.4987989664077759, + "learning_rate": 4.960092323618292e-05, + "loss": 5.5292, + "step": 9577 + }, + { + "epoch": 0.05696307926539157, + "grad_norm": 1.683800220489502, + "learning_rate": 4.960084010498627e-05, + "loss": 5.5069, + "step": 9578 + }, + { + "epoch": 0.056969026548672565, + "grad_norm": 1.767561435699463, + "learning_rate": 4.960075696520171e-05, + "loss": 5.4134, + "step": 9579 + }, + { + "epoch": 0.05697497383195357, + "grad_norm": 2.077564239501953, + "learning_rate": 4.960067381682929e-05, + "loss": 5.3362, + "step": 9580 + }, + { + "epoch": 0.05698092111523456, + "grad_norm": 2.0167109966278076, + "learning_rate": 4.960059065986903e-05, + "loss": 5.4235, + "step": 9581 + }, + { + "epoch": 0.05698686839851556, + "grad_norm": 1.647669792175293, + "learning_rate": 4.9600507494320953e-05, + "loss": 5.3273, + "step": 9582 + }, + { + "epoch": 0.05699281568179655, + "grad_norm": 1.6051719188690186, + "learning_rate": 4.960042432018509e-05, + "loss": 5.2486, + "step": 9583 + }, + { + "epoch": 0.056998762965077554, + "grad_norm": 1.9283394813537598, + "learning_rate": 4.960034113746148e-05, + "loss": 5.233, + "step": 9584 + }, + { + "epoch": 0.05700471024835855, + "grad_norm": 1.6215802431106567, + "learning_rate": 4.960025794615014e-05, + "loss": 5.2322, + "step": 9585 + }, + { + "epoch": 0.057010657531639544, + "grad_norm": 1.8902918100357056, + "learning_rate": 4.960017474625111e-05, + "loss": 5.063, + "step": 9586 + }, + { + "epoch": 0.057016604814920546, + "grad_norm": 2.4694666862487793, + "learning_rate": 4.9600091537764415e-05, + "loss": 4.498, + "step": 9587 + }, + { + "epoch": 0.05702255209820154, + "grad_norm": 1.98915433883667, + "learning_rate": 4.960000832069007e-05, + "loss": 4.8781, + "step": 9588 + }, + { + "epoch": 0.057028499381482536, + "grad_norm": 2.0424818992614746, + "learning_rate": 4.9599925095028126e-05, + "loss": 5.5803, + "step": 9589 + }, + { + "epoch": 0.05703444666476354, + "grad_norm": 1.471275806427002, + "learning_rate": 4.95998418607786e-05, + "loss": 5.5604, + "step": 9590 + }, + { + "epoch": 0.057040393948044534, + "grad_norm": 1.6512761116027832, + "learning_rate": 4.959975861794152e-05, + "loss": 5.2147, + "step": 9591 + }, + { + "epoch": 0.05704634123132553, + "grad_norm": 1.6902865171432495, + "learning_rate": 4.959967536651693e-05, + "loss": 5.2654, + "step": 9592 + }, + { + "epoch": 0.05705228851460653, + "grad_norm": 1.5656665563583374, + "learning_rate": 4.9599592106504835e-05, + "loss": 5.1106, + "step": 9593 + }, + { + "epoch": 0.057058235797887526, + "grad_norm": 1.760901927947998, + "learning_rate": 4.959950883790528e-05, + "loss": 5.1833, + "step": 9594 + }, + { + "epoch": 0.05706418308116852, + "grad_norm": 1.5585325956344604, + "learning_rate": 4.9599425560718294e-05, + "loss": 5.202, + "step": 9595 + }, + { + "epoch": 0.05707013036444952, + "grad_norm": 1.5477479696273804, + "learning_rate": 4.959934227494389e-05, + "loss": 5.121, + "step": 9596 + }, + { + "epoch": 0.05707607764773052, + "grad_norm": 1.9299825429916382, + "learning_rate": 4.959925898058213e-05, + "loss": 5.0026, + "step": 9597 + }, + { + "epoch": 0.05708202493101151, + "grad_norm": 1.866237759590149, + "learning_rate": 4.959917567763301e-05, + "loss": 4.999, + "step": 9598 + }, + { + "epoch": 0.05708797221429251, + "grad_norm": 1.6670162677764893, + "learning_rate": 4.959909236609657e-05, + "loss": 5.4047, + "step": 9599 + }, + { + "epoch": 0.05709391949757351, + "grad_norm": 1.4666836261749268, + "learning_rate": 4.9599009045972844e-05, + "loss": 5.3598, + "step": 9600 + }, + { + "epoch": 0.057099866780854505, + "grad_norm": 1.928645372390747, + "learning_rate": 4.959892571726186e-05, + "loss": 5.7015, + "step": 9601 + }, + { + "epoch": 0.0571058140641355, + "grad_norm": 1.9761322736740112, + "learning_rate": 4.959884237996365e-05, + "loss": 4.8682, + "step": 9602 + }, + { + "epoch": 0.0571117613474165, + "grad_norm": 1.9823036193847656, + "learning_rate": 4.959875903407823e-05, + "loss": 4.8752, + "step": 9603 + }, + { + "epoch": 0.0571177086306975, + "grad_norm": 1.9242253303527832, + "learning_rate": 4.959867567960564e-05, + "loss": 4.9314, + "step": 9604 + }, + { + "epoch": 0.05712365591397849, + "grad_norm": 1.740980625152588, + "learning_rate": 4.9598592316545904e-05, + "loss": 4.9843, + "step": 9605 + }, + { + "epoch": 0.057129603197259494, + "grad_norm": 2.0768508911132812, + "learning_rate": 4.959850894489906e-05, + "loss": 4.8528, + "step": 9606 + }, + { + "epoch": 0.05713555048054049, + "grad_norm": 1.7417833805084229, + "learning_rate": 4.959842556466513e-05, + "loss": 5.1374, + "step": 9607 + }, + { + "epoch": 0.057141497763821485, + "grad_norm": 1.933691382408142, + "learning_rate": 4.959834217584414e-05, + "loss": 5.349, + "step": 9608 + }, + { + "epoch": 0.05714744504710249, + "grad_norm": 1.8035194873809814, + "learning_rate": 4.959825877843612e-05, + "loss": 5.0212, + "step": 9609 + }, + { + "epoch": 0.05715339233038348, + "grad_norm": 2.323709487915039, + "learning_rate": 4.9598175372441106e-05, + "loss": 5.5346, + "step": 9610 + }, + { + "epoch": 0.05715933961366448, + "grad_norm": 1.755983591079712, + "learning_rate": 4.959809195785912e-05, + "loss": 4.8425, + "step": 9611 + }, + { + "epoch": 0.05716528689694547, + "grad_norm": 1.6614432334899902, + "learning_rate": 4.95980085346902e-05, + "loss": 4.912, + "step": 9612 + }, + { + "epoch": 0.057171234180226474, + "grad_norm": 1.8319662809371948, + "learning_rate": 4.959792510293436e-05, + "loss": 5.0125, + "step": 9613 + }, + { + "epoch": 0.05717718146350747, + "grad_norm": 1.8528090715408325, + "learning_rate": 4.959784166259165e-05, + "loss": 4.898, + "step": 9614 + }, + { + "epoch": 0.057183128746788464, + "grad_norm": 2.163757562637329, + "learning_rate": 4.959775821366208e-05, + "loss": 5.2041, + "step": 9615 + }, + { + "epoch": 0.057189076030069466, + "grad_norm": 1.939430832862854, + "learning_rate": 4.959767475614569e-05, + "loss": 5.3337, + "step": 9616 + }, + { + "epoch": 0.05719502331335046, + "grad_norm": 1.7198511362075806, + "learning_rate": 4.959759129004251e-05, + "loss": 5.2682, + "step": 9617 + }, + { + "epoch": 0.057200970596631456, + "grad_norm": 1.7674570083618164, + "learning_rate": 4.959750781535255e-05, + "loss": 5.4188, + "step": 9618 + }, + { + "epoch": 0.05720691787991246, + "grad_norm": 1.7197433710098267, + "learning_rate": 4.959742433207587e-05, + "loss": 5.1725, + "step": 9619 + }, + { + "epoch": 0.05721286516319345, + "grad_norm": 1.6682969331741333, + "learning_rate": 4.959734084021248e-05, + "loss": 5.1349, + "step": 9620 + }, + { + "epoch": 0.05721881244647445, + "grad_norm": 1.3784568309783936, + "learning_rate": 4.959725733976241e-05, + "loss": 5.2408, + "step": 9621 + }, + { + "epoch": 0.05722475972975545, + "grad_norm": 1.690483808517456, + "learning_rate": 4.9597173830725686e-05, + "loss": 5.2616, + "step": 9622 + }, + { + "epoch": 0.057230707013036446, + "grad_norm": 1.5313903093338013, + "learning_rate": 4.959709031310235e-05, + "loss": 5.1481, + "step": 9623 + }, + { + "epoch": 0.05723665429631744, + "grad_norm": 1.6266121864318848, + "learning_rate": 4.959700678689242e-05, + "loss": 5.0192, + "step": 9624 + }, + { + "epoch": 0.05724260157959844, + "grad_norm": 2.3125410079956055, + "learning_rate": 4.959692325209593e-05, + "loss": 4.5513, + "step": 9625 + }, + { + "epoch": 0.05724854886287944, + "grad_norm": 1.6884924173355103, + "learning_rate": 4.9596839708712913e-05, + "loss": 5.1917, + "step": 9626 + }, + { + "epoch": 0.05725449614616043, + "grad_norm": 1.5797723531723022, + "learning_rate": 4.9596756156743385e-05, + "loss": 5.5674, + "step": 9627 + }, + { + "epoch": 0.05726044342944143, + "grad_norm": 1.6152269840240479, + "learning_rate": 4.959667259618739e-05, + "loss": 5.4566, + "step": 9628 + }, + { + "epoch": 0.05726639071272243, + "grad_norm": 1.611608624458313, + "learning_rate": 4.959658902704495e-05, + "loss": 5.3678, + "step": 9629 + }, + { + "epoch": 0.057272337996003425, + "grad_norm": 1.774327278137207, + "learning_rate": 4.9596505449316086e-05, + "loss": 5.2438, + "step": 9630 + }, + { + "epoch": 0.05727828527928442, + "grad_norm": 1.7961443662643433, + "learning_rate": 4.9596421863000856e-05, + "loss": 5.3061, + "step": 9631 + }, + { + "epoch": 0.05728423256256542, + "grad_norm": 1.709675669670105, + "learning_rate": 4.959633826809925e-05, + "loss": 5.0095, + "step": 9632 + }, + { + "epoch": 0.05729017984584642, + "grad_norm": 1.7140734195709229, + "learning_rate": 4.959625466461132e-05, + "loss": 5.313, + "step": 9633 + }, + { + "epoch": 0.05729612712912741, + "grad_norm": 1.8302016258239746, + "learning_rate": 4.95961710525371e-05, + "loss": 5.4008, + "step": 9634 + }, + { + "epoch": 0.057302074412408414, + "grad_norm": 1.8570395708084106, + "learning_rate": 4.95960874318766e-05, + "loss": 5.513, + "step": 9635 + }, + { + "epoch": 0.05730802169568941, + "grad_norm": 1.6907027959823608, + "learning_rate": 4.959600380262987e-05, + "loss": 5.1933, + "step": 9636 + }, + { + "epoch": 0.057313968978970405, + "grad_norm": 1.6505299806594849, + "learning_rate": 4.9595920164796926e-05, + "loss": 5.1537, + "step": 9637 + }, + { + "epoch": 0.05731991626225141, + "grad_norm": 1.5248258113861084, + "learning_rate": 4.95958365183778e-05, + "loss": 5.4232, + "step": 9638 + }, + { + "epoch": 0.0573258635455324, + "grad_norm": 1.4630048274993896, + "learning_rate": 4.9595752863372524e-05, + "loss": 5.565, + "step": 9639 + }, + { + "epoch": 0.0573318108288134, + "grad_norm": 1.5858573913574219, + "learning_rate": 4.959566919978112e-05, + "loss": 5.4364, + "step": 9640 + }, + { + "epoch": 0.05733775811209439, + "grad_norm": 1.7803694009780884, + "learning_rate": 4.9595585527603625e-05, + "loss": 5.1727, + "step": 9641 + }, + { + "epoch": 0.057343705395375394, + "grad_norm": 1.639163851737976, + "learning_rate": 4.959550184684007e-05, + "loss": 5.5538, + "step": 9642 + }, + { + "epoch": 0.05734965267865639, + "grad_norm": 1.5917890071868896, + "learning_rate": 4.959541815749046e-05, + "loss": 5.6788, + "step": 9643 + }, + { + "epoch": 0.057355599961937384, + "grad_norm": 1.5524990558624268, + "learning_rate": 4.959533445955487e-05, + "loss": 5.7832, + "step": 9644 + }, + { + "epoch": 0.057361547245218386, + "grad_norm": 1.7229019403457642, + "learning_rate": 4.959525075303328e-05, + "loss": 5.4417, + "step": 9645 + }, + { + "epoch": 0.05736749452849938, + "grad_norm": 1.5434623956680298, + "learning_rate": 4.959516703792575e-05, + "loss": 5.3629, + "step": 9646 + }, + { + "epoch": 0.057373441811780376, + "grad_norm": 1.4929866790771484, + "learning_rate": 4.9595083314232306e-05, + "loss": 5.8586, + "step": 9647 + }, + { + "epoch": 0.05737938909506138, + "grad_norm": 1.209796667098999, + "learning_rate": 4.959499958195297e-05, + "loss": 5.5001, + "step": 9648 + }, + { + "epoch": 0.05738533637834237, + "grad_norm": 2.703871488571167, + "learning_rate": 4.9594915841087775e-05, + "loss": 5.6564, + "step": 9649 + }, + { + "epoch": 0.05739128366162337, + "grad_norm": 1.9408828020095825, + "learning_rate": 4.959483209163674e-05, + "loss": 5.6683, + "step": 9650 + }, + { + "epoch": 0.05739723094490437, + "grad_norm": 1.8055803775787354, + "learning_rate": 4.9594748333599914e-05, + "loss": 5.3046, + "step": 9651 + }, + { + "epoch": 0.057403178228185366, + "grad_norm": 2.3453104496002197, + "learning_rate": 4.959466456697731e-05, + "loss": 6.1944, + "step": 9652 + }, + { + "epoch": 0.05740912551146636, + "grad_norm": 2.3799800872802734, + "learning_rate": 4.959458079176897e-05, + "loss": 5.6706, + "step": 9653 + }, + { + "epoch": 0.05741507279474736, + "grad_norm": 2.111069440841675, + "learning_rate": 4.959449700797491e-05, + "loss": 5.1808, + "step": 9654 + }, + { + "epoch": 0.05742102007802836, + "grad_norm": 2.237873077392578, + "learning_rate": 4.9594413215595164e-05, + "loss": 5.0609, + "step": 9655 + }, + { + "epoch": 0.05742696736130935, + "grad_norm": 1.956520438194275, + "learning_rate": 4.959432941462977e-05, + "loss": 5.1431, + "step": 9656 + }, + { + "epoch": 0.05743291464459035, + "grad_norm": 2.3761603832244873, + "learning_rate": 4.9594245605078735e-05, + "loss": 4.8722, + "step": 9657 + }, + { + "epoch": 0.05743886192787135, + "grad_norm": 1.820745825767517, + "learning_rate": 4.959416178694212e-05, + "loss": 5.0149, + "step": 9658 + }, + { + "epoch": 0.057444809211152345, + "grad_norm": 2.0804755687713623, + "learning_rate": 4.9594077960219924e-05, + "loss": 5.7698, + "step": 9659 + }, + { + "epoch": 0.05745075649443334, + "grad_norm": 1.9319117069244385, + "learning_rate": 4.9593994124912196e-05, + "loss": 5.3054, + "step": 9660 + }, + { + "epoch": 0.05745670377771434, + "grad_norm": 2.386338472366333, + "learning_rate": 4.959391028101896e-05, + "loss": 5.2093, + "step": 9661 + }, + { + "epoch": 0.05746265106099534, + "grad_norm": 1.852386474609375, + "learning_rate": 4.9593826428540244e-05, + "loss": 5.1943, + "step": 9662 + }, + { + "epoch": 0.05746859834427633, + "grad_norm": 1.9619694948196411, + "learning_rate": 4.959374256747607e-05, + "loss": 4.8275, + "step": 9663 + }, + { + "epoch": 0.057474545627557334, + "grad_norm": 2.4797024726867676, + "learning_rate": 4.9593658697826485e-05, + "loss": 5.5257, + "step": 9664 + }, + { + "epoch": 0.05748049291083833, + "grad_norm": 2.1713874340057373, + "learning_rate": 4.959357481959149e-05, + "loss": 5.4486, + "step": 9665 + }, + { + "epoch": 0.057486440194119325, + "grad_norm": 1.9605398178100586, + "learning_rate": 4.9593490932771145e-05, + "loss": 5.1512, + "step": 9666 + }, + { + "epoch": 0.05749238747740033, + "grad_norm": 1.9853549003601074, + "learning_rate": 4.959340703736547e-05, + "loss": 5.665, + "step": 9667 + }, + { + "epoch": 0.05749833476068132, + "grad_norm": 1.984279990196228, + "learning_rate": 4.9593323133374494e-05, + "loss": 5.7797, + "step": 9668 + }, + { + "epoch": 0.05750428204396232, + "grad_norm": 1.8343236446380615, + "learning_rate": 4.9593239220798225e-05, + "loss": 5.0261, + "step": 9669 + }, + { + "epoch": 0.05751022932724331, + "grad_norm": 1.8675687313079834, + "learning_rate": 4.959315529963673e-05, + "loss": 4.8754, + "step": 9670 + }, + { + "epoch": 0.057516176610524314, + "grad_norm": 1.9129834175109863, + "learning_rate": 4.959307136989e-05, + "loss": 5.1056, + "step": 9671 + }, + { + "epoch": 0.05752212389380531, + "grad_norm": 3.142893075942993, + "learning_rate": 4.95929874315581e-05, + "loss": 5.6029, + "step": 9672 + }, + { + "epoch": 0.057528071177086304, + "grad_norm": 1.80843985080719, + "learning_rate": 4.9592903484641026e-05, + "loss": 5.57, + "step": 9673 + }, + { + "epoch": 0.057534018460367306, + "grad_norm": 1.9195841550827026, + "learning_rate": 4.9592819529138835e-05, + "loss": 5.6964, + "step": 9674 + }, + { + "epoch": 0.0575399657436483, + "grad_norm": 2.026477813720703, + "learning_rate": 4.959273556505154e-05, + "loss": 5.8544, + "step": 9675 + }, + { + "epoch": 0.057545913026929296, + "grad_norm": 2.111274003982544, + "learning_rate": 4.959265159237918e-05, + "loss": 5.8014, + "step": 9676 + }, + { + "epoch": 0.0575518603102103, + "grad_norm": 1.9789505004882812, + "learning_rate": 4.9592567611121776e-05, + "loss": 5.7646, + "step": 9677 + }, + { + "epoch": 0.05755780759349129, + "grad_norm": 1.8776015043258667, + "learning_rate": 4.9592483621279365e-05, + "loss": 6.1603, + "step": 9678 + }, + { + "epoch": 0.05756375487677229, + "grad_norm": 2.135849714279175, + "learning_rate": 4.9592399622851956e-05, + "loss": 5.6372, + "step": 9679 + }, + { + "epoch": 0.05756970216005329, + "grad_norm": 2.3335585594177246, + "learning_rate": 4.959231561583961e-05, + "loss": 5.5515, + "step": 9680 + }, + { + "epoch": 0.057575649443334286, + "grad_norm": 1.9315869808197021, + "learning_rate": 4.9592231600242337e-05, + "loss": 5.9287, + "step": 9681 + }, + { + "epoch": 0.05758159672661528, + "grad_norm": 2.4559311866760254, + "learning_rate": 4.959214757606017e-05, + "loss": 5.6079, + "step": 9682 + }, + { + "epoch": 0.05758754400989628, + "grad_norm": 2.6558609008789062, + "learning_rate": 4.959206354329314e-05, + "loss": 5.5728, + "step": 9683 + }, + { + "epoch": 0.05759349129317728, + "grad_norm": 2.2376396656036377, + "learning_rate": 4.9591979501941274e-05, + "loss": 5.5318, + "step": 9684 + }, + { + "epoch": 0.05759943857645827, + "grad_norm": 1.8506240844726562, + "learning_rate": 4.95918954520046e-05, + "loss": 5.7957, + "step": 9685 + }, + { + "epoch": 0.05760538585973927, + "grad_norm": 2.2428138256073, + "learning_rate": 4.9591811393483144e-05, + "loss": 5.7223, + "step": 9686 + }, + { + "epoch": 0.05761133314302027, + "grad_norm": 2.5734875202178955, + "learning_rate": 4.9591727326376955e-05, + "loss": 5.3401, + "step": 9687 + }, + { + "epoch": 0.057617280426301265, + "grad_norm": 2.567263126373291, + "learning_rate": 4.959164325068604e-05, + "loss": 5.4853, + "step": 9688 + }, + { + "epoch": 0.05762322770958226, + "grad_norm": 2.4430556297302246, + "learning_rate": 4.959155916641043e-05, + "loss": 5.9845, + "step": 9689 + }, + { + "epoch": 0.05762917499286326, + "grad_norm": 2.039846181869507, + "learning_rate": 4.959147507355017e-05, + "loss": 6.0689, + "step": 9690 + }, + { + "epoch": 0.05763512227614426, + "grad_norm": 2.207920551300049, + "learning_rate": 4.959139097210528e-05, + "loss": 5.6658, + "step": 9691 + }, + { + "epoch": 0.05764106955942525, + "grad_norm": 1.7421616315841675, + "learning_rate": 4.959130686207578e-05, + "loss": 6.0915, + "step": 9692 + }, + { + "epoch": 0.057647016842706254, + "grad_norm": 1.7738968133926392, + "learning_rate": 4.9591222743461716e-05, + "loss": 6.2092, + "step": 9693 + }, + { + "epoch": 0.05765296412598725, + "grad_norm": 1.8665943145751953, + "learning_rate": 4.959113861626311e-05, + "loss": 6.0922, + "step": 9694 + }, + { + "epoch": 0.057658911409268244, + "grad_norm": 2.0272347927093506, + "learning_rate": 4.959105448047999e-05, + "loss": 5.8291, + "step": 9695 + }, + { + "epoch": 0.057664858692549247, + "grad_norm": 2.8527796268463135, + "learning_rate": 4.9590970336112395e-05, + "loss": 5.428, + "step": 9696 + }, + { + "epoch": 0.05767080597583024, + "grad_norm": 1.8518950939178467, + "learning_rate": 4.959088618316033e-05, + "loss": 5.4199, + "step": 9697 + }, + { + "epoch": 0.05767675325911124, + "grad_norm": 2.38712739944458, + "learning_rate": 4.959080202162386e-05, + "loss": 5.1627, + "step": 9698 + }, + { + "epoch": 0.05768270054239223, + "grad_norm": 1.8407059907913208, + "learning_rate": 4.959071785150298e-05, + "loss": 5.1827, + "step": 9699 + }, + { + "epoch": 0.057688647825673234, + "grad_norm": 2.431151866912842, + "learning_rate": 4.9590633672797744e-05, + "loss": 6.1722, + "step": 9700 + }, + { + "epoch": 0.05769459510895423, + "grad_norm": 2.498046398162842, + "learning_rate": 4.9590549485508165e-05, + "loss": 6.2321, + "step": 9701 + }, + { + "epoch": 0.057700542392235224, + "grad_norm": 1.8793575763702393, + "learning_rate": 4.959046528963428e-05, + "loss": 5.4019, + "step": 9702 + }, + { + "epoch": 0.057706489675516226, + "grad_norm": 2.137622117996216, + "learning_rate": 4.9590381085176115e-05, + "loss": 5.9118, + "step": 9703 + }, + { + "epoch": 0.05771243695879722, + "grad_norm": 1.9514268636703491, + "learning_rate": 4.959029687213371e-05, + "loss": 5.6651, + "step": 9704 + }, + { + "epoch": 0.057718384242078216, + "grad_norm": 2.3678367137908936, + "learning_rate": 4.9590212650507085e-05, + "loss": 5.2054, + "step": 9705 + }, + { + "epoch": 0.05772433152535922, + "grad_norm": 2.8808276653289795, + "learning_rate": 4.9590128420296266e-05, + "loss": 5.3066, + "step": 9706 + }, + { + "epoch": 0.05773027880864021, + "grad_norm": 2.2405474185943604, + "learning_rate": 4.9590044181501297e-05, + "loss": 5.2904, + "step": 9707 + }, + { + "epoch": 0.05773622609192121, + "grad_norm": 2.3762283325195312, + "learning_rate": 4.958995993412219e-05, + "loss": 5.5847, + "step": 9708 + }, + { + "epoch": 0.05774217337520221, + "grad_norm": 2.5258681774139404, + "learning_rate": 4.958987567815898e-05, + "loss": 5.4852, + "step": 9709 + }, + { + "epoch": 0.057748120658483205, + "grad_norm": 2.31478214263916, + "learning_rate": 4.9589791413611704e-05, + "loss": 5.5658, + "step": 9710 + }, + { + "epoch": 0.0577540679417642, + "grad_norm": 1.735771894454956, + "learning_rate": 4.958970714048038e-05, + "loss": 6.0311, + "step": 9711 + }, + { + "epoch": 0.0577600152250452, + "grad_norm": 2.2843849658966064, + "learning_rate": 4.958962285876505e-05, + "loss": 5.9535, + "step": 9712 + }, + { + "epoch": 0.0577659625083262, + "grad_norm": 2.3449392318725586, + "learning_rate": 4.958953856846573e-05, + "loss": 5.9835, + "step": 9713 + }, + { + "epoch": 0.05777190979160719, + "grad_norm": 2.319952964782715, + "learning_rate": 4.9589454269582456e-05, + "loss": 5.5318, + "step": 9714 + }, + { + "epoch": 0.05777785707488819, + "grad_norm": 2.6801493167877197, + "learning_rate": 4.958936996211526e-05, + "loss": 4.8672, + "step": 9715 + }, + { + "epoch": 0.05778380435816919, + "grad_norm": 2.622528553009033, + "learning_rate": 4.958928564606418e-05, + "loss": 6.0755, + "step": 9716 + }, + { + "epoch": 0.057789751641450185, + "grad_norm": 1.973480224609375, + "learning_rate": 4.9589201321429216e-05, + "loss": 5.8197, + "step": 9717 + }, + { + "epoch": 0.05779569892473118, + "grad_norm": 2.060497760772705, + "learning_rate": 4.958911698821043e-05, + "loss": 5.2838, + "step": 9718 + }, + { + "epoch": 0.05780164620801218, + "grad_norm": 2.068103551864624, + "learning_rate": 4.958903264640783e-05, + "loss": 5.4917, + "step": 9719 + }, + { + "epoch": 0.05780759349129318, + "grad_norm": 2.5899293422698975, + "learning_rate": 4.958894829602145e-05, + "loss": 5.1312, + "step": 9720 + }, + { + "epoch": 0.05781354077457417, + "grad_norm": 3.2153897285461426, + "learning_rate": 4.958886393705132e-05, + "loss": 4.7502, + "step": 9721 + }, + { + "epoch": 0.057819488057855174, + "grad_norm": 2.805802345275879, + "learning_rate": 4.9588779569497484e-05, + "loss": 4.6876, + "step": 9722 + }, + { + "epoch": 0.05782543534113617, + "grad_norm": 2.3670101165771484, + "learning_rate": 4.958869519335995e-05, + "loss": 4.6025, + "step": 9723 + }, + { + "epoch": 0.057831382624417164, + "grad_norm": 1.992903709411621, + "learning_rate": 4.9588610808638755e-05, + "loss": 5.3602, + "step": 9724 + }, + { + "epoch": 0.057837329907698166, + "grad_norm": 2.249572277069092, + "learning_rate": 4.958852641533394e-05, + "loss": 4.9574, + "step": 9725 + }, + { + "epoch": 0.05784327719097916, + "grad_norm": 2.500433921813965, + "learning_rate": 4.958844201344552e-05, + "loss": 5.3656, + "step": 9726 + }, + { + "epoch": 0.05784922447426016, + "grad_norm": 2.0277605056762695, + "learning_rate": 4.9588357602973526e-05, + "loss": 5.6467, + "step": 9727 + }, + { + "epoch": 0.05785517175754116, + "grad_norm": 2.1196112632751465, + "learning_rate": 4.958827318391799e-05, + "loss": 5.6257, + "step": 9728 + }, + { + "epoch": 0.057861119040822154, + "grad_norm": 3.160593271255493, + "learning_rate": 4.9588188756278945e-05, + "loss": 4.9618, + "step": 9729 + }, + { + "epoch": 0.05786706632410315, + "grad_norm": 1.90407395362854, + "learning_rate": 4.958810432005642e-05, + "loss": 5.4551, + "step": 9730 + }, + { + "epoch": 0.057873013607384144, + "grad_norm": 2.0096004009246826, + "learning_rate": 4.958801987525043e-05, + "loss": 5.6562, + "step": 9731 + }, + { + "epoch": 0.057878960890665146, + "grad_norm": 2.617847442626953, + "learning_rate": 4.958793542186103e-05, + "loss": 5.747, + "step": 9732 + }, + { + "epoch": 0.05788490817394614, + "grad_norm": 2.3982057571411133, + "learning_rate": 4.9587850959888226e-05, + "loss": 5.6146, + "step": 9733 + }, + { + "epoch": 0.057890855457227136, + "grad_norm": 2.0222113132476807, + "learning_rate": 4.9587766489332065e-05, + "loss": 6.0204, + "step": 9734 + }, + { + "epoch": 0.05789680274050814, + "grad_norm": 2.1110177040100098, + "learning_rate": 4.958768201019257e-05, + "loss": 5.2957, + "step": 9735 + }, + { + "epoch": 0.05790275002378913, + "grad_norm": 1.8278865814208984, + "learning_rate": 4.958759752246977e-05, + "loss": 5.9902, + "step": 9736 + }, + { + "epoch": 0.05790869730707013, + "grad_norm": 2.2461514472961426, + "learning_rate": 4.958751302616368e-05, + "loss": 5.8572, + "step": 9737 + }, + { + "epoch": 0.05791464459035113, + "grad_norm": 1.7453250885009766, + "learning_rate": 4.958742852127435e-05, + "loss": 5.6658, + "step": 9738 + }, + { + "epoch": 0.057920591873632125, + "grad_norm": 2.480726718902588, + "learning_rate": 4.95873440078018e-05, + "loss": 5.4231, + "step": 9739 + }, + { + "epoch": 0.05792653915691312, + "grad_norm": 2.2310776710510254, + "learning_rate": 4.958725948574607e-05, + "loss": 5.4768, + "step": 9740 + }, + { + "epoch": 0.05793248644019412, + "grad_norm": 1.9454891681671143, + "learning_rate": 4.958717495510718e-05, + "loss": 5.4503, + "step": 9741 + }, + { + "epoch": 0.05793843372347512, + "grad_norm": 2.196054458618164, + "learning_rate": 4.958709041588516e-05, + "loss": 5.1987, + "step": 9742 + }, + { + "epoch": 0.05794438100675611, + "grad_norm": 2.385000228881836, + "learning_rate": 4.958700586808004e-05, + "loss": 5.8413, + "step": 9743 + }, + { + "epoch": 0.05795032829003711, + "grad_norm": 2.0967705249786377, + "learning_rate": 4.958692131169185e-05, + "loss": 5.8531, + "step": 9744 + }, + { + "epoch": 0.05795627557331811, + "grad_norm": 2.186253309249878, + "learning_rate": 4.958683674672062e-05, + "loss": 5.8241, + "step": 9745 + }, + { + "epoch": 0.057962222856599105, + "grad_norm": 1.8932995796203613, + "learning_rate": 4.958675217316638e-05, + "loss": 5.8724, + "step": 9746 + }, + { + "epoch": 0.0579681701398801, + "grad_norm": 1.9706943035125732, + "learning_rate": 4.958666759102916e-05, + "loss": 5.6565, + "step": 9747 + }, + { + "epoch": 0.0579741174231611, + "grad_norm": 1.7686703205108643, + "learning_rate": 4.958658300030898e-05, + "loss": 5.6299, + "step": 9748 + }, + { + "epoch": 0.0579800647064421, + "grad_norm": 2.309403419494629, + "learning_rate": 4.958649840100589e-05, + "loss": 4.6907, + "step": 9749 + }, + { + "epoch": 0.05798601198972309, + "grad_norm": 2.139760971069336, + "learning_rate": 4.95864137931199e-05, + "loss": 4.7311, + "step": 9750 + }, + { + "epoch": 0.057991959273004094, + "grad_norm": 1.960402011871338, + "learning_rate": 4.958632917665105e-05, + "loss": 5.598, + "step": 9751 + }, + { + "epoch": 0.05799790655628509, + "grad_norm": 1.721853256225586, + "learning_rate": 4.958624455159936e-05, + "loss": 6.0519, + "step": 9752 + }, + { + "epoch": 0.058003853839566084, + "grad_norm": 1.8527748584747314, + "learning_rate": 4.958615991796487e-05, + "loss": 5.3347, + "step": 9753 + }, + { + "epoch": 0.058009801122847086, + "grad_norm": 2.070084810256958, + "learning_rate": 4.958607527574761e-05, + "loss": 4.6653, + "step": 9754 + }, + { + "epoch": 0.05801574840612808, + "grad_norm": 2.143115997314453, + "learning_rate": 4.9585990624947605e-05, + "loss": 4.6522, + "step": 9755 + }, + { + "epoch": 0.05802169568940908, + "grad_norm": 2.2870991230010986, + "learning_rate": 4.9585905965564884e-05, + "loss": 4.7037, + "step": 9756 + }, + { + "epoch": 0.05802764297269008, + "grad_norm": 2.0633544921875, + "learning_rate": 4.958582129759947e-05, + "loss": 4.689, + "step": 9757 + }, + { + "epoch": 0.058033590255971074, + "grad_norm": 1.8845857381820679, + "learning_rate": 4.95857366210514e-05, + "loss": 4.8077, + "step": 9758 + }, + { + "epoch": 0.05803953753925207, + "grad_norm": 1.7319310903549194, + "learning_rate": 4.9585651935920715e-05, + "loss": 5.3528, + "step": 9759 + }, + { + "epoch": 0.058045484822533064, + "grad_norm": 2.2369909286499023, + "learning_rate": 4.958556724220742e-05, + "loss": 4.6549, + "step": 9760 + }, + { + "epoch": 0.058051432105814066, + "grad_norm": 2.076901912689209, + "learning_rate": 4.9585482539911566e-05, + "loss": 4.4642, + "step": 9761 + }, + { + "epoch": 0.05805737938909506, + "grad_norm": 2.0487091541290283, + "learning_rate": 4.958539782903318e-05, + "loss": 4.6575, + "step": 9762 + }, + { + "epoch": 0.058063326672376056, + "grad_norm": 2.2116169929504395, + "learning_rate": 4.9585313109572274e-05, + "loss": 4.4866, + "step": 9763 + }, + { + "epoch": 0.05806927395565706, + "grad_norm": 1.9818168878555298, + "learning_rate": 4.958522838152889e-05, + "loss": 4.7502, + "step": 9764 + }, + { + "epoch": 0.05807522123893805, + "grad_norm": 2.1484010219573975, + "learning_rate": 4.958514364490306e-05, + "loss": 5.7809, + "step": 9765 + }, + { + "epoch": 0.05808116852221905, + "grad_norm": 2.4087398052215576, + "learning_rate": 4.958505889969481e-05, + "loss": 5.5236, + "step": 9766 + }, + { + "epoch": 0.05808711580550005, + "grad_norm": 2.000459909439087, + "learning_rate": 4.9584974145904165e-05, + "loss": 4.7356, + "step": 9767 + }, + { + "epoch": 0.058093063088781045, + "grad_norm": 2.3958399295806885, + "learning_rate": 4.958488938353116e-05, + "loss": 4.3695, + "step": 9768 + }, + { + "epoch": 0.05809901037206204, + "grad_norm": 2.039053440093994, + "learning_rate": 4.958480461257584e-05, + "loss": 4.6128, + "step": 9769 + }, + { + "epoch": 0.05810495765534304, + "grad_norm": 1.7663822174072266, + "learning_rate": 4.95847198330382e-05, + "loss": 4.8533, + "step": 9770 + }, + { + "epoch": 0.05811090493862404, + "grad_norm": 2.594289779663086, + "learning_rate": 4.9584635044918295e-05, + "loss": 5.3048, + "step": 9771 + }, + { + "epoch": 0.05811685222190503, + "grad_norm": 2.712372303009033, + "learning_rate": 4.958455024821615e-05, + "loss": 5.4435, + "step": 9772 + }, + { + "epoch": 0.05812279950518603, + "grad_norm": 2.4295241832733154, + "learning_rate": 4.9584465442931794e-05, + "loss": 5.2665, + "step": 9773 + }, + { + "epoch": 0.05812874678846703, + "grad_norm": 2.5820906162261963, + "learning_rate": 4.9584380629065245e-05, + "loss": 5.6227, + "step": 9774 + }, + { + "epoch": 0.058134694071748025, + "grad_norm": 2.140291213989258, + "learning_rate": 4.958429580661655e-05, + "loss": 5.1792, + "step": 9775 + }, + { + "epoch": 0.05814064135502902, + "grad_norm": 2.111551523208618, + "learning_rate": 4.9584210975585734e-05, + "loss": 5.7262, + "step": 9776 + }, + { + "epoch": 0.05814658863831002, + "grad_norm": 2.5887086391448975, + "learning_rate": 4.958412613597282e-05, + "loss": 5.1613, + "step": 9777 + }, + { + "epoch": 0.05815253592159102, + "grad_norm": 1.9678863286972046, + "learning_rate": 4.9584041287777835e-05, + "loss": 5.7693, + "step": 9778 + }, + { + "epoch": 0.05815848320487201, + "grad_norm": 2.000265121459961, + "learning_rate": 4.958395643100083e-05, + "loss": 5.654, + "step": 9779 + }, + { + "epoch": 0.058164430488153014, + "grad_norm": 1.8926239013671875, + "learning_rate": 4.958387156564181e-05, + "loss": 5.3004, + "step": 9780 + }, + { + "epoch": 0.05817037777143401, + "grad_norm": 2.3557002544403076, + "learning_rate": 4.958378669170082e-05, + "loss": 5.5437, + "step": 9781 + }, + { + "epoch": 0.058176325054715004, + "grad_norm": 1.9434150457382202, + "learning_rate": 4.958370180917787e-05, + "loss": 5.8442, + "step": 9782 + }, + { + "epoch": 0.058182272337996006, + "grad_norm": 1.875900387763977, + "learning_rate": 4.9583616918073026e-05, + "loss": 5.9312, + "step": 9783 + }, + { + "epoch": 0.058188219621277, + "grad_norm": 1.8945306539535522, + "learning_rate": 4.958353201838628e-05, + "loss": 5.7166, + "step": 9784 + }, + { + "epoch": 0.058194166904557997, + "grad_norm": 1.7081416845321655, + "learning_rate": 4.9583447110117684e-05, + "loss": 6.0803, + "step": 9785 + }, + { + "epoch": 0.058200114187839, + "grad_norm": 1.6520098447799683, + "learning_rate": 4.958336219326725e-05, + "loss": 6.0181, + "step": 9786 + }, + { + "epoch": 0.058206061471119994, + "grad_norm": 1.90665602684021, + "learning_rate": 4.9583277267835024e-05, + "loss": 5.586, + "step": 9787 + }, + { + "epoch": 0.05821200875440099, + "grad_norm": 1.8179740905761719, + "learning_rate": 4.958319233382104e-05, + "loss": 5.8637, + "step": 9788 + }, + { + "epoch": 0.058217956037681984, + "grad_norm": 1.8228380680084229, + "learning_rate": 4.95831073912253e-05, + "loss": 5.7406, + "step": 9789 + }, + { + "epoch": 0.058223903320962986, + "grad_norm": 1.691999912261963, + "learning_rate": 4.958302244004786e-05, + "loss": 5.8021, + "step": 9790 + }, + { + "epoch": 0.05822985060424398, + "grad_norm": 1.8590795993804932, + "learning_rate": 4.958293748028875e-05, + "loss": 5.5897, + "step": 9791 + }, + { + "epoch": 0.058235797887524976, + "grad_norm": 1.5923960208892822, + "learning_rate": 4.958285251194797e-05, + "loss": 5.7424, + "step": 9792 + }, + { + "epoch": 0.05824174517080598, + "grad_norm": 1.6928486824035645, + "learning_rate": 4.958276753502559e-05, + "loss": 5.905, + "step": 9793 + }, + { + "epoch": 0.05824769245408697, + "grad_norm": 2.120725393295288, + "learning_rate": 4.958268254952161e-05, + "loss": 5.9974, + "step": 9794 + }, + { + "epoch": 0.05825363973736797, + "grad_norm": 1.850441813468933, + "learning_rate": 4.9582597555436075e-05, + "loss": 5.7171, + "step": 9795 + }, + { + "epoch": 0.05825958702064897, + "grad_norm": 2.196037530899048, + "learning_rate": 4.9582512552769e-05, + "loss": 6.1243, + "step": 9796 + }, + { + "epoch": 0.058265534303929965, + "grad_norm": 1.9170193672180176, + "learning_rate": 4.9582427541520423e-05, + "loss": 5.8087, + "step": 9797 + }, + { + "epoch": 0.05827148158721096, + "grad_norm": 1.974478006362915, + "learning_rate": 4.958234252169039e-05, + "loss": 5.794, + "step": 9798 + }, + { + "epoch": 0.05827742887049196, + "grad_norm": 1.824965476989746, + "learning_rate": 4.9582257493278904e-05, + "loss": 5.6904, + "step": 9799 + }, + { + "epoch": 0.05828337615377296, + "grad_norm": 1.828037142753601, + "learning_rate": 4.9582172456286e-05, + "loss": 5.6793, + "step": 9800 + }, + { + "epoch": 0.05828932343705395, + "grad_norm": 1.8949617147445679, + "learning_rate": 4.9582087410711726e-05, + "loss": 5.6685, + "step": 9801 + }, + { + "epoch": 0.05829527072033495, + "grad_norm": 1.8183050155639648, + "learning_rate": 4.958200235655609e-05, + "loss": 5.7754, + "step": 9802 + }, + { + "epoch": 0.05830121800361595, + "grad_norm": 1.6816062927246094, + "learning_rate": 4.9581917293819135e-05, + "loss": 5.6931, + "step": 9803 + }, + { + "epoch": 0.058307165286896945, + "grad_norm": 1.875659465789795, + "learning_rate": 4.958183222250089e-05, + "loss": 5.7568, + "step": 9804 + }, + { + "epoch": 0.05831311257017794, + "grad_norm": 2.162404775619507, + "learning_rate": 4.958174714260137e-05, + "loss": 5.7969, + "step": 9805 + }, + { + "epoch": 0.05831905985345894, + "grad_norm": 2.2122790813446045, + "learning_rate": 4.958166205412064e-05, + "loss": 5.7301, + "step": 9806 + }, + { + "epoch": 0.05832500713673994, + "grad_norm": 1.8822424411773682, + "learning_rate": 4.9581576957058686e-05, + "loss": 5.7034, + "step": 9807 + }, + { + "epoch": 0.05833095442002093, + "grad_norm": 1.8780319690704346, + "learning_rate": 4.958149185141556e-05, + "loss": 5.6573, + "step": 9808 + }, + { + "epoch": 0.058336901703301934, + "grad_norm": 1.9177708625793457, + "learning_rate": 4.958140673719129e-05, + "loss": 5.6619, + "step": 9809 + }, + { + "epoch": 0.05834284898658293, + "grad_norm": 1.8662844896316528, + "learning_rate": 4.95813216143859e-05, + "loss": 5.5857, + "step": 9810 + }, + { + "epoch": 0.058348796269863924, + "grad_norm": 2.1798834800720215, + "learning_rate": 4.958123648299944e-05, + "loss": 5.5811, + "step": 9811 + }, + { + "epoch": 0.058354743553144926, + "grad_norm": 2.1575138568878174, + "learning_rate": 4.958115134303191e-05, + "loss": 5.6761, + "step": 9812 + }, + { + "epoch": 0.05836069083642592, + "grad_norm": 2.055314302444458, + "learning_rate": 4.958106619448336e-05, + "loss": 5.721, + "step": 9813 + }, + { + "epoch": 0.058366638119706916, + "grad_norm": 1.8962149620056152, + "learning_rate": 4.958098103735381e-05, + "loss": 5.6132, + "step": 9814 + }, + { + "epoch": 0.05837258540298792, + "grad_norm": 1.7715760469436646, + "learning_rate": 4.95808958716433e-05, + "loss": 5.6461, + "step": 9815 + }, + { + "epoch": 0.058378532686268914, + "grad_norm": 1.9166070222854614, + "learning_rate": 4.958081069735184e-05, + "loss": 5.5628, + "step": 9816 + }, + { + "epoch": 0.05838447996954991, + "grad_norm": 1.8872902393341064, + "learning_rate": 4.9580725514479484e-05, + "loss": 5.6476, + "step": 9817 + }, + { + "epoch": 0.058390427252830904, + "grad_norm": 1.8257521390914917, + "learning_rate": 4.9580640323026254e-05, + "loss": 5.6175, + "step": 9818 + }, + { + "epoch": 0.058396374536111906, + "grad_norm": 1.919291377067566, + "learning_rate": 4.958055512299217e-05, + "loss": 5.5954, + "step": 9819 + }, + { + "epoch": 0.0584023218193929, + "grad_norm": 1.8318076133728027, + "learning_rate": 4.958046991437726e-05, + "loss": 5.6255, + "step": 9820 + }, + { + "epoch": 0.058408269102673896, + "grad_norm": 1.9153858423233032, + "learning_rate": 4.958038469718158e-05, + "loss": 5.6787, + "step": 9821 + }, + { + "epoch": 0.0584142163859549, + "grad_norm": 1.967021107673645, + "learning_rate": 4.958029947140513e-05, + "loss": 5.6714, + "step": 9822 + }, + { + "epoch": 0.05842016366923589, + "grad_norm": 1.654997706413269, + "learning_rate": 4.958021423704795e-05, + "loss": 5.4809, + "step": 9823 + }, + { + "epoch": 0.05842611095251689, + "grad_norm": 1.8183335065841675, + "learning_rate": 4.9580128994110074e-05, + "loss": 5.5223, + "step": 9824 + }, + { + "epoch": 0.05843205823579789, + "grad_norm": 1.7665660381317139, + "learning_rate": 4.958004374259153e-05, + "loss": 5.5639, + "step": 9825 + }, + { + "epoch": 0.058438005519078885, + "grad_norm": 1.8233551979064941, + "learning_rate": 4.957995848249235e-05, + "loss": 5.6358, + "step": 9826 + }, + { + "epoch": 0.05844395280235988, + "grad_norm": 1.721301555633545, + "learning_rate": 4.957987321381256e-05, + "loss": 5.4989, + "step": 9827 + }, + { + "epoch": 0.05844990008564088, + "grad_norm": 1.6921659708023071, + "learning_rate": 4.957978793655218e-05, + "loss": 5.448, + "step": 9828 + }, + { + "epoch": 0.05845584736892188, + "grad_norm": 1.810354232788086, + "learning_rate": 4.957970265071126e-05, + "loss": 5.4501, + "step": 9829 + }, + { + "epoch": 0.05846179465220287, + "grad_norm": 1.7205116748809814, + "learning_rate": 4.957961735628982e-05, + "loss": 5.5222, + "step": 9830 + }, + { + "epoch": 0.05846774193548387, + "grad_norm": 1.9636965990066528, + "learning_rate": 4.957953205328788e-05, + "loss": 5.5894, + "step": 9831 + }, + { + "epoch": 0.05847368921876487, + "grad_norm": 1.9312820434570312, + "learning_rate": 4.9579446741705485e-05, + "loss": 5.6543, + "step": 9832 + }, + { + "epoch": 0.058479636502045865, + "grad_norm": 1.870448112487793, + "learning_rate": 4.9579361421542665e-05, + "loss": 5.6707, + "step": 9833 + }, + { + "epoch": 0.05848558378532686, + "grad_norm": 1.5943735837936401, + "learning_rate": 4.9579276092799435e-05, + "loss": 5.5184, + "step": 9834 + }, + { + "epoch": 0.05849153106860786, + "grad_norm": 1.6929852962493896, + "learning_rate": 4.957919075547584e-05, + "loss": 5.5188, + "step": 9835 + }, + { + "epoch": 0.05849747835188886, + "grad_norm": 2.0268075466156006, + "learning_rate": 4.95791054095719e-05, + "loss": 5.4909, + "step": 9836 + }, + { + "epoch": 0.05850342563516985, + "grad_norm": 2.047982931137085, + "learning_rate": 4.957902005508765e-05, + "loss": 5.6459, + "step": 9837 + }, + { + "epoch": 0.058509372918450854, + "grad_norm": 1.7938467264175415, + "learning_rate": 4.957893469202311e-05, + "loss": 5.4805, + "step": 9838 + }, + { + "epoch": 0.05851532020173185, + "grad_norm": 1.803093433380127, + "learning_rate": 4.957884932037833e-05, + "loss": 5.4092, + "step": 9839 + }, + { + "epoch": 0.058521267485012844, + "grad_norm": 1.8001232147216797, + "learning_rate": 4.957876394015333e-05, + "loss": 5.9168, + "step": 9840 + }, + { + "epoch": 0.058527214768293846, + "grad_norm": 1.9442622661590576, + "learning_rate": 4.9578678551348125e-05, + "loss": 6.0317, + "step": 9841 + }, + { + "epoch": 0.05853316205157484, + "grad_norm": 2.013845205307007, + "learning_rate": 4.957859315396276e-05, + "loss": 5.6855, + "step": 9842 + }, + { + "epoch": 0.058539109334855836, + "grad_norm": 2.7557523250579834, + "learning_rate": 4.9578507747997264e-05, + "loss": 5.3782, + "step": 9843 + }, + { + "epoch": 0.05854505661813684, + "grad_norm": 1.9822032451629639, + "learning_rate": 4.957842233345167e-05, + "loss": 6.22, + "step": 9844 + }, + { + "epoch": 0.058551003901417834, + "grad_norm": 1.7408699989318848, + "learning_rate": 4.9578336910326e-05, + "loss": 5.2347, + "step": 9845 + }, + { + "epoch": 0.05855695118469883, + "grad_norm": 3.2186660766601562, + "learning_rate": 4.957825147862028e-05, + "loss": 5.3282, + "step": 9846 + }, + { + "epoch": 0.058562898467979824, + "grad_norm": 3.3589892387390137, + "learning_rate": 4.957816603833455e-05, + "loss": 5.5689, + "step": 9847 + }, + { + "epoch": 0.058568845751260826, + "grad_norm": 3.4228861331939697, + "learning_rate": 4.957808058946883e-05, + "loss": 5.5797, + "step": 9848 + }, + { + "epoch": 0.05857479303454182, + "grad_norm": 2.420506238937378, + "learning_rate": 4.957799513202317e-05, + "loss": 5.735, + "step": 9849 + }, + { + "epoch": 0.058580740317822816, + "grad_norm": 1.8269212245941162, + "learning_rate": 4.957790966599758e-05, + "loss": 5.7571, + "step": 9850 + }, + { + "epoch": 0.05858668760110382, + "grad_norm": 2.011110305786133, + "learning_rate": 4.957782419139209e-05, + "loss": 5.9786, + "step": 9851 + }, + { + "epoch": 0.05859263488438481, + "grad_norm": 2.3139355182647705, + "learning_rate": 4.957773870820674e-05, + "loss": 5.8356, + "step": 9852 + }, + { + "epoch": 0.05859858216766581, + "grad_norm": 2.3406572341918945, + "learning_rate": 4.957765321644155e-05, + "loss": 5.8426, + "step": 9853 + }, + { + "epoch": 0.05860452945094681, + "grad_norm": 2.1194591522216797, + "learning_rate": 4.957756771609657e-05, + "loss": 5.6152, + "step": 9854 + }, + { + "epoch": 0.058610476734227805, + "grad_norm": 1.9966599941253662, + "learning_rate": 4.95774822071718e-05, + "loss": 5.8189, + "step": 9855 + }, + { + "epoch": 0.0586164240175088, + "grad_norm": 1.8953092098236084, + "learning_rate": 4.95773966896673e-05, + "loss": 5.8185, + "step": 9856 + }, + { + "epoch": 0.0586223713007898, + "grad_norm": 1.9035093784332275, + "learning_rate": 4.957731116358307e-05, + "loss": 5.6554, + "step": 9857 + }, + { + "epoch": 0.0586283185840708, + "grad_norm": 3.507546901702881, + "learning_rate": 4.9577225628919157e-05, + "loss": 5.8906, + "step": 9858 + }, + { + "epoch": 0.05863426586735179, + "grad_norm": 2.1840403079986572, + "learning_rate": 4.9577140085675586e-05, + "loss": 5.6084, + "step": 9859 + }, + { + "epoch": 0.05864021315063279, + "grad_norm": 2.008424758911133, + "learning_rate": 4.95770545338524e-05, + "loss": 5.8435, + "step": 9860 + }, + { + "epoch": 0.05864616043391379, + "grad_norm": 1.9004656076431274, + "learning_rate": 4.957696897344961e-05, + "loss": 5.5906, + "step": 9861 + }, + { + "epoch": 0.058652107717194785, + "grad_norm": 1.8043147325515747, + "learning_rate": 4.9576883404467255e-05, + "loss": 5.6057, + "step": 9862 + }, + { + "epoch": 0.05865805500047578, + "grad_norm": 1.6765285730361938, + "learning_rate": 4.957679782690537e-05, + "loss": 5.7246, + "step": 9863 + }, + { + "epoch": 0.05866400228375678, + "grad_norm": 2.0207018852233887, + "learning_rate": 4.9576712240763974e-05, + "loss": 5.8459, + "step": 9864 + }, + { + "epoch": 0.05866994956703778, + "grad_norm": 1.975874423980713, + "learning_rate": 4.95766266460431e-05, + "loss": 5.7313, + "step": 9865 + }, + { + "epoch": 0.05867589685031877, + "grad_norm": 2.085277557373047, + "learning_rate": 4.957654104274279e-05, + "loss": 5.1359, + "step": 9866 + }, + { + "epoch": 0.058681844133599774, + "grad_norm": 2.039437770843506, + "learning_rate": 4.957645543086305e-05, + "loss": 5.5673, + "step": 9867 + }, + { + "epoch": 0.05868779141688077, + "grad_norm": 2.0692098140716553, + "learning_rate": 4.9576369810403926e-05, + "loss": 5.6326, + "step": 9868 + }, + { + "epoch": 0.058693738700161764, + "grad_norm": 2.3873767852783203, + "learning_rate": 4.957628418136545e-05, + "loss": 5.5133, + "step": 9869 + }, + { + "epoch": 0.058699685983442766, + "grad_norm": 2.9347658157348633, + "learning_rate": 4.957619854374764e-05, + "loss": 5.5444, + "step": 9870 + }, + { + "epoch": 0.05870563326672376, + "grad_norm": 2.955348014831543, + "learning_rate": 4.957611289755054e-05, + "loss": 5.4883, + "step": 9871 + }, + { + "epoch": 0.058711580550004756, + "grad_norm": 2.147033214569092, + "learning_rate": 4.957602724277417e-05, + "loss": 5.4554, + "step": 9872 + }, + { + "epoch": 0.05871752783328576, + "grad_norm": 2.1422510147094727, + "learning_rate": 4.957594157941856e-05, + "loss": 5.56, + "step": 9873 + }, + { + "epoch": 0.05872347511656675, + "grad_norm": 2.018935203552246, + "learning_rate": 4.957585590748375e-05, + "loss": 5.5176, + "step": 9874 + }, + { + "epoch": 0.05872942239984775, + "grad_norm": 3.0146446228027344, + "learning_rate": 4.957577022696976e-05, + "loss": 5.2623, + "step": 9875 + }, + { + "epoch": 0.058735369683128744, + "grad_norm": 2.923011064529419, + "learning_rate": 4.957568453787662e-05, + "loss": 5.1828, + "step": 9876 + }, + { + "epoch": 0.058741316966409746, + "grad_norm": 2.7203526496887207, + "learning_rate": 4.9575598840204366e-05, + "loss": 5.1565, + "step": 9877 + }, + { + "epoch": 0.05874726424969074, + "grad_norm": 2.056260108947754, + "learning_rate": 4.9575513133953025e-05, + "loss": 5.1345, + "step": 9878 + }, + { + "epoch": 0.058753211532971736, + "grad_norm": 2.3120932579040527, + "learning_rate": 4.9575427419122616e-05, + "loss": 5.1792, + "step": 9879 + }, + { + "epoch": 0.05875915881625274, + "grad_norm": 2.1298701763153076, + "learning_rate": 4.9575341695713186e-05, + "loss": 5.1447, + "step": 9880 + }, + { + "epoch": 0.05876510609953373, + "grad_norm": 2.393869638442993, + "learning_rate": 4.9575255963724756e-05, + "loss": 5.2938, + "step": 9881 + }, + { + "epoch": 0.05877105338281473, + "grad_norm": 2.324061155319214, + "learning_rate": 4.9575170223157366e-05, + "loss": 5.1488, + "step": 9882 + }, + { + "epoch": 0.05877700066609573, + "grad_norm": 2.1416141986846924, + "learning_rate": 4.957508447401103e-05, + "loss": 5.0551, + "step": 9883 + }, + { + "epoch": 0.058782947949376725, + "grad_norm": 2.127350091934204, + "learning_rate": 4.9574998716285795e-05, + "loss": 5.03, + "step": 9884 + }, + { + "epoch": 0.05878889523265772, + "grad_norm": 2.317267417907715, + "learning_rate": 4.957491294998167e-05, + "loss": 5.049, + "step": 9885 + }, + { + "epoch": 0.05879484251593872, + "grad_norm": 2.3667004108428955, + "learning_rate": 4.9574827175098704e-05, + "loss": 5.009, + "step": 9886 + }, + { + "epoch": 0.05880078979921972, + "grad_norm": 2.4034934043884277, + "learning_rate": 4.9574741391636915e-05, + "loss": 4.9419, + "step": 9887 + }, + { + "epoch": 0.05880673708250071, + "grad_norm": 2.3792901039123535, + "learning_rate": 4.957465559959634e-05, + "loss": 4.8517, + "step": 9888 + }, + { + "epoch": 0.05881268436578171, + "grad_norm": 2.139249086380005, + "learning_rate": 4.957456979897701e-05, + "loss": 5.0767, + "step": 9889 + }, + { + "epoch": 0.05881863164906271, + "grad_norm": 2.5370614528656006, + "learning_rate": 4.957448398977894e-05, + "loss": 5.0243, + "step": 9890 + }, + { + "epoch": 0.058824578932343705, + "grad_norm": 2.0474746227264404, + "learning_rate": 4.957439817200218e-05, + "loss": 4.988, + "step": 9891 + }, + { + "epoch": 0.0588305262156247, + "grad_norm": 2.1323394775390625, + "learning_rate": 4.957431234564675e-05, + "loss": 5.7499, + "step": 9892 + }, + { + "epoch": 0.0588364734989057, + "grad_norm": 2.135988473892212, + "learning_rate": 4.957422651071269e-05, + "loss": 6.0197, + "step": 9893 + }, + { + "epoch": 0.0588424207821867, + "grad_norm": 2.4457356929779053, + "learning_rate": 4.957414066720001e-05, + "loss": 5.4461, + "step": 9894 + }, + { + "epoch": 0.05884836806546769, + "grad_norm": 2.3973019123077393, + "learning_rate": 4.957405481510876e-05, + "loss": 5.0372, + "step": 9895 + }, + { + "epoch": 0.058854315348748694, + "grad_norm": 2.5532052516937256, + "learning_rate": 4.957396895443896e-05, + "loss": 5.1462, + "step": 9896 + }, + { + "epoch": 0.05886026263202969, + "grad_norm": 2.3662166595458984, + "learning_rate": 4.9573883085190633e-05, + "loss": 5.1894, + "step": 9897 + }, + { + "epoch": 0.058866209915310684, + "grad_norm": 2.153883695602417, + "learning_rate": 4.9573797207363825e-05, + "loss": 5.6859, + "step": 9898 + }, + { + "epoch": 0.058872157198591686, + "grad_norm": 1.9541380405426025, + "learning_rate": 4.957371132095856e-05, + "loss": 5.5487, + "step": 9899 + }, + { + "epoch": 0.05887810448187268, + "grad_norm": 1.7920335531234741, + "learning_rate": 4.957362542597486e-05, + "loss": 5.4021, + "step": 9900 + }, + { + "epoch": 0.058884051765153676, + "grad_norm": 2.351090431213379, + "learning_rate": 4.9573539522412756e-05, + "loss": 4.9377, + "step": 9901 + }, + { + "epoch": 0.05888999904843468, + "grad_norm": 2.4780900478363037, + "learning_rate": 4.95734536102723e-05, + "loss": 5.04, + "step": 9902 + }, + { + "epoch": 0.05889594633171567, + "grad_norm": 1.7211192846298218, + "learning_rate": 4.957336768955349e-05, + "loss": 5.2959, + "step": 9903 + }, + { + "epoch": 0.05890189361499667, + "grad_norm": 1.9051212072372437, + "learning_rate": 4.957328176025638e-05, + "loss": 5.5587, + "step": 9904 + }, + { + "epoch": 0.058907840898277664, + "grad_norm": 2.009725332260132, + "learning_rate": 4.957319582238099e-05, + "loss": 5.5366, + "step": 9905 + }, + { + "epoch": 0.058913788181558666, + "grad_norm": 1.835423231124878, + "learning_rate": 4.957310987592735e-05, + "loss": 5.2522, + "step": 9906 + }, + { + "epoch": 0.05891973546483966, + "grad_norm": 1.6150819063186646, + "learning_rate": 4.957302392089549e-05, + "loss": 5.3935, + "step": 9907 + }, + { + "epoch": 0.058925682748120656, + "grad_norm": 1.825942873954773, + "learning_rate": 4.9572937957285435e-05, + "loss": 5.5435, + "step": 9908 + }, + { + "epoch": 0.05893163003140166, + "grad_norm": 1.5434985160827637, + "learning_rate": 4.957285198509724e-05, + "loss": 5.2508, + "step": 9909 + }, + { + "epoch": 0.05893757731468265, + "grad_norm": 1.7675530910491943, + "learning_rate": 4.9572766004330894e-05, + "loss": 5.2811, + "step": 9910 + }, + { + "epoch": 0.05894352459796365, + "grad_norm": 1.5196996927261353, + "learning_rate": 4.957268001498646e-05, + "loss": 5.1829, + "step": 9911 + }, + { + "epoch": 0.05894947188124465, + "grad_norm": 1.5598126649856567, + "learning_rate": 4.9572594017063964e-05, + "loss": 5.2067, + "step": 9912 + }, + { + "epoch": 0.058955419164525645, + "grad_norm": 1.6600217819213867, + "learning_rate": 4.957250801056342e-05, + "loss": 5.1591, + "step": 9913 + }, + { + "epoch": 0.05896136644780664, + "grad_norm": 2.040682315826416, + "learning_rate": 4.957242199548487e-05, + "loss": 4.8792, + "step": 9914 + }, + { + "epoch": 0.05896731373108764, + "grad_norm": 2.0122241973876953, + "learning_rate": 4.9572335971828346e-05, + "loss": 5.9489, + "step": 9915 + }, + { + "epoch": 0.05897326101436864, + "grad_norm": 2.4522452354431152, + "learning_rate": 4.957224993959386e-05, + "loss": 5.943, + "step": 9916 + }, + { + "epoch": 0.05897920829764963, + "grad_norm": 1.9101065397262573, + "learning_rate": 4.957216389878147e-05, + "loss": 5.858, + "step": 9917 + }, + { + "epoch": 0.05898515558093063, + "grad_norm": 1.6488839387893677, + "learning_rate": 4.957207784939118e-05, + "loss": 5.4935, + "step": 9918 + }, + { + "epoch": 0.05899110286421163, + "grad_norm": 1.7620775699615479, + "learning_rate": 4.957199179142303e-05, + "loss": 5.6067, + "step": 9919 + }, + { + "epoch": 0.058997050147492625, + "grad_norm": 2.6018314361572266, + "learning_rate": 4.957190572487707e-05, + "loss": 5.5249, + "step": 9920 + }, + { + "epoch": 0.05900299743077362, + "grad_norm": 1.810274600982666, + "learning_rate": 4.957181964975329e-05, + "loss": 5.4063, + "step": 9921 + }, + { + "epoch": 0.05900894471405462, + "grad_norm": 1.7467454671859741, + "learning_rate": 4.957173356605176e-05, + "loss": 5.4476, + "step": 9922 + }, + { + "epoch": 0.05901489199733562, + "grad_norm": 1.9074509143829346, + "learning_rate": 4.9571647473772483e-05, + "loss": 5.8014, + "step": 9923 + }, + { + "epoch": 0.05902083928061661, + "grad_norm": 1.6376137733459473, + "learning_rate": 4.9571561372915496e-05, + "loss": 5.6813, + "step": 9924 + }, + { + "epoch": 0.059026786563897614, + "grad_norm": 1.9984129667282104, + "learning_rate": 4.957147526348083e-05, + "loss": 5.9534, + "step": 9925 + }, + { + "epoch": 0.05903273384717861, + "grad_norm": 2.38493013381958, + "learning_rate": 4.957138914546852e-05, + "loss": 5.6903, + "step": 9926 + }, + { + "epoch": 0.059038681130459604, + "grad_norm": 1.86250901222229, + "learning_rate": 4.957130301887859e-05, + "loss": 5.1777, + "step": 9927 + }, + { + "epoch": 0.059044628413740606, + "grad_norm": 1.6241644620895386, + "learning_rate": 4.957121688371107e-05, + "loss": 5.1693, + "step": 9928 + }, + { + "epoch": 0.0590505756970216, + "grad_norm": 1.5627753734588623, + "learning_rate": 4.9571130739965996e-05, + "loss": 5.0313, + "step": 9929 + }, + { + "epoch": 0.059056522980302596, + "grad_norm": 1.6763062477111816, + "learning_rate": 4.957104458764339e-05, + "loss": 4.9973, + "step": 9930 + }, + { + "epoch": 0.0590624702635836, + "grad_norm": 1.6215085983276367, + "learning_rate": 4.957095842674329e-05, + "loss": 5.2216, + "step": 9931 + }, + { + "epoch": 0.05906841754686459, + "grad_norm": 1.5599844455718994, + "learning_rate": 4.957087225726572e-05, + "loss": 5.4525, + "step": 9932 + }, + { + "epoch": 0.05907436483014559, + "grad_norm": 1.3916441202163696, + "learning_rate": 4.957078607921072e-05, + "loss": 5.4434, + "step": 9933 + }, + { + "epoch": 0.059080312113426584, + "grad_norm": 1.524478554725647, + "learning_rate": 4.9570699892578295e-05, + "loss": 5.3979, + "step": 9934 + }, + { + "epoch": 0.059086259396707586, + "grad_norm": 1.264108657836914, + "learning_rate": 4.9570613697368505e-05, + "loss": 5.2892, + "step": 9935 + }, + { + "epoch": 0.05909220667998858, + "grad_norm": 1.7481588125228882, + "learning_rate": 4.957052749358137e-05, + "loss": 4.8539, + "step": 9936 + }, + { + "epoch": 0.059098153963269576, + "grad_norm": 1.675515055656433, + "learning_rate": 4.957044128121692e-05, + "loss": 5.4645, + "step": 9937 + }, + { + "epoch": 0.05910410124655058, + "grad_norm": 1.6560577154159546, + "learning_rate": 4.957035506027517e-05, + "loss": 4.9354, + "step": 9938 + }, + { + "epoch": 0.05911004852983157, + "grad_norm": 1.5030722618103027, + "learning_rate": 4.9570268830756174e-05, + "loss": 5.206, + "step": 9939 + }, + { + "epoch": 0.05911599581311257, + "grad_norm": 1.65435791015625, + "learning_rate": 4.957018259265994e-05, + "loss": 5.2132, + "step": 9940 + }, + { + "epoch": 0.05912194309639357, + "grad_norm": 1.6701000928878784, + "learning_rate": 4.9570096345986515e-05, + "loss": 5.2313, + "step": 9941 + }, + { + "epoch": 0.059127890379674565, + "grad_norm": 1.412954330444336, + "learning_rate": 4.957001009073593e-05, + "loss": 5.2511, + "step": 9942 + }, + { + "epoch": 0.05913383766295556, + "grad_norm": 1.4719784259796143, + "learning_rate": 4.95699238269082e-05, + "loss": 5.3646, + "step": 9943 + }, + { + "epoch": 0.05913978494623656, + "grad_norm": 1.6969150304794312, + "learning_rate": 4.9569837554503365e-05, + "loss": 5.3001, + "step": 9944 + }, + { + "epoch": 0.05914573222951756, + "grad_norm": 1.8579715490341187, + "learning_rate": 4.9569751273521454e-05, + "loss": 5.0944, + "step": 9945 + }, + { + "epoch": 0.05915167951279855, + "grad_norm": 1.6907633543014526, + "learning_rate": 4.956966498396249e-05, + "loss": 5.1447, + "step": 9946 + }, + { + "epoch": 0.059157626796079554, + "grad_norm": 1.7581912279129028, + "learning_rate": 4.9569578685826525e-05, + "loss": 5.2065, + "step": 9947 + }, + { + "epoch": 0.05916357407936055, + "grad_norm": 1.4447051286697388, + "learning_rate": 4.9569492379113555e-05, + "loss": 5.081, + "step": 9948 + }, + { + "epoch": 0.059169521362641544, + "grad_norm": 1.731697916984558, + "learning_rate": 4.9569406063823644e-05, + "loss": 5.241, + "step": 9949 + }, + { + "epoch": 0.05917546864592254, + "grad_norm": 1.6483672857284546, + "learning_rate": 4.956931973995681e-05, + "loss": 5.306, + "step": 9950 + }, + { + "epoch": 0.05918141592920354, + "grad_norm": 2.2123141288757324, + "learning_rate": 4.956923340751306e-05, + "loss": 5.6134, + "step": 9951 + }, + { + "epoch": 0.05918736321248454, + "grad_norm": 1.8569937944412231, + "learning_rate": 4.956914706649246e-05, + "loss": 5.4819, + "step": 9952 + }, + { + "epoch": 0.05919331049576553, + "grad_norm": 1.8417435884475708, + "learning_rate": 4.956906071689502e-05, + "loss": 5.4116, + "step": 9953 + }, + { + "epoch": 0.059199257779046534, + "grad_norm": 1.7050427198410034, + "learning_rate": 4.956897435872078e-05, + "loss": 5.238, + "step": 9954 + }, + { + "epoch": 0.05920520506232753, + "grad_norm": 1.6636401414871216, + "learning_rate": 4.956888799196976e-05, + "loss": 5.0962, + "step": 9955 + }, + { + "epoch": 0.059211152345608524, + "grad_norm": 1.9194599390029907, + "learning_rate": 4.9568801616642e-05, + "loss": 5.2078, + "step": 9956 + }, + { + "epoch": 0.059217099628889526, + "grad_norm": 1.6154237985610962, + "learning_rate": 4.956871523273752e-05, + "loss": 5.3562, + "step": 9957 + }, + { + "epoch": 0.05922304691217052, + "grad_norm": 1.4500404596328735, + "learning_rate": 4.956862884025636e-05, + "loss": 5.2061, + "step": 9958 + }, + { + "epoch": 0.059228994195451516, + "grad_norm": 1.6681636571884155, + "learning_rate": 4.956854243919854e-05, + "loss": 5.3455, + "step": 9959 + }, + { + "epoch": 0.05923494147873252, + "grad_norm": 1.7175511121749878, + "learning_rate": 4.9568456029564104e-05, + "loss": 5.2967, + "step": 9960 + }, + { + "epoch": 0.05924088876201351, + "grad_norm": 1.5013905763626099, + "learning_rate": 4.956836961135306e-05, + "loss": 4.9836, + "step": 9961 + }, + { + "epoch": 0.05924683604529451, + "grad_norm": 1.6521363258361816, + "learning_rate": 4.956828318456546e-05, + "loss": 5.0295, + "step": 9962 + }, + { + "epoch": 0.0592527833285755, + "grad_norm": 1.5945814847946167, + "learning_rate": 4.9568196749201326e-05, + "loss": 4.9511, + "step": 9963 + }, + { + "epoch": 0.059258730611856505, + "grad_norm": 1.508301854133606, + "learning_rate": 4.95681103052607e-05, + "loss": 4.9469, + "step": 9964 + }, + { + "epoch": 0.0592646778951375, + "grad_norm": 1.5902310609817505, + "learning_rate": 4.956802385274358e-05, + "loss": 4.9761, + "step": 9965 + }, + { + "epoch": 0.059270625178418496, + "grad_norm": 1.739424467086792, + "learning_rate": 4.956793739165003e-05, + "loss": 5.2443, + "step": 9966 + }, + { + "epoch": 0.0592765724616995, + "grad_norm": 1.8317997455596924, + "learning_rate": 4.9567850921980056e-05, + "loss": 5.0046, + "step": 9967 + }, + { + "epoch": 0.05928251974498049, + "grad_norm": 1.8073506355285645, + "learning_rate": 4.956776444373371e-05, + "loss": 5.1779, + "step": 9968 + }, + { + "epoch": 0.05928846702826149, + "grad_norm": 1.8806017637252808, + "learning_rate": 4.956767795691101e-05, + "loss": 5.2956, + "step": 9969 + }, + { + "epoch": 0.05929441431154249, + "grad_norm": 1.8397493362426758, + "learning_rate": 4.956759146151198e-05, + "loss": 5.1775, + "step": 9970 + }, + { + "epoch": 0.059300361594823485, + "grad_norm": 2.001387119293213, + "learning_rate": 4.9567504957536656e-05, + "loss": 5.2149, + "step": 9971 + }, + { + "epoch": 0.05930630887810448, + "grad_norm": 2.011504650115967, + "learning_rate": 4.956741844498508e-05, + "loss": 5.2384, + "step": 9972 + }, + { + "epoch": 0.05931225616138548, + "grad_norm": 1.7936465740203857, + "learning_rate": 4.956733192385727e-05, + "loss": 5.2297, + "step": 9973 + }, + { + "epoch": 0.05931820344466648, + "grad_norm": 1.7336666584014893, + "learning_rate": 4.9567245394153255e-05, + "loss": 5.1637, + "step": 9974 + }, + { + "epoch": 0.05932415072794747, + "grad_norm": 1.7429137229919434, + "learning_rate": 4.956715885587307e-05, + "loss": 5.1315, + "step": 9975 + }, + { + "epoch": 0.059330098011228474, + "grad_norm": 1.6609208583831787, + "learning_rate": 4.956707230901674e-05, + "loss": 5.1554, + "step": 9976 + }, + { + "epoch": 0.05933604529450947, + "grad_norm": 1.630026936531067, + "learning_rate": 4.95669857535843e-05, + "loss": 5.1569, + "step": 9977 + }, + { + "epoch": 0.059341992577790464, + "grad_norm": 1.6968966722488403, + "learning_rate": 4.956689918957579e-05, + "loss": 5.06, + "step": 9978 + }, + { + "epoch": 0.05934793986107146, + "grad_norm": 1.6973050832748413, + "learning_rate": 4.9566812616991214e-05, + "loss": 5.2044, + "step": 9979 + }, + { + "epoch": 0.05935388714435246, + "grad_norm": 1.436073899269104, + "learning_rate": 4.9566726035830624e-05, + "loss": 5.2638, + "step": 9980 + }, + { + "epoch": 0.05935983442763346, + "grad_norm": 1.7667059898376465, + "learning_rate": 4.956663944609404e-05, + "loss": 5.0912, + "step": 9981 + }, + { + "epoch": 0.05936578171091445, + "grad_norm": 2.277327060699463, + "learning_rate": 4.9566552847781504e-05, + "loss": 5.6089, + "step": 9982 + }, + { + "epoch": 0.059371728994195454, + "grad_norm": 1.521134376525879, + "learning_rate": 4.956646624089304e-05, + "loss": 5.0213, + "step": 9983 + }, + { + "epoch": 0.05937767627747645, + "grad_norm": 1.556511402130127, + "learning_rate": 4.956637962542867e-05, + "loss": 5.1126, + "step": 9984 + }, + { + "epoch": 0.059383623560757444, + "grad_norm": 1.6691070795059204, + "learning_rate": 4.9566293001388423e-05, + "loss": 5.1351, + "step": 9985 + }, + { + "epoch": 0.059389570844038446, + "grad_norm": 1.5213310718536377, + "learning_rate": 4.956620636877235e-05, + "loss": 5.2402, + "step": 9986 + }, + { + "epoch": 0.05939551812731944, + "grad_norm": 1.5169057846069336, + "learning_rate": 4.956611972758046e-05, + "loss": 5.214, + "step": 9987 + }, + { + "epoch": 0.059401465410600436, + "grad_norm": 1.6076115369796753, + "learning_rate": 4.956603307781279e-05, + "loss": 5.1081, + "step": 9988 + }, + { + "epoch": 0.05940741269388144, + "grad_norm": 1.7340706586837769, + "learning_rate": 4.9565946419469376e-05, + "loss": 5.1582, + "step": 9989 + }, + { + "epoch": 0.05941335997716243, + "grad_norm": 1.5118008852005005, + "learning_rate": 4.956585975255025e-05, + "loss": 5.0515, + "step": 9990 + }, + { + "epoch": 0.05941930726044343, + "grad_norm": 1.8852020502090454, + "learning_rate": 4.956577307705543e-05, + "loss": 5.3811, + "step": 9991 + }, + { + "epoch": 0.05942525454372442, + "grad_norm": 1.7066764831542969, + "learning_rate": 4.9565686392984955e-05, + "loss": 5.4599, + "step": 9992 + }, + { + "epoch": 0.059431201827005425, + "grad_norm": 1.5517010688781738, + "learning_rate": 4.956559970033885e-05, + "loss": 5.0728, + "step": 9993 + }, + { + "epoch": 0.05943714911028642, + "grad_norm": 1.508901596069336, + "learning_rate": 4.956551299911715e-05, + "loss": 5.1857, + "step": 9994 + }, + { + "epoch": 0.059443096393567416, + "grad_norm": 1.8867852687835693, + "learning_rate": 4.9565426289319874e-05, + "loss": 5.2223, + "step": 9995 + }, + { + "epoch": 0.05944904367684842, + "grad_norm": 1.4767159223556519, + "learning_rate": 4.9565339570947076e-05, + "loss": 5.1404, + "step": 9996 + }, + { + "epoch": 0.05945499096012941, + "grad_norm": 1.6351869106292725, + "learning_rate": 4.956525284399876e-05, + "loss": 5.3235, + "step": 9997 + }, + { + "epoch": 0.05946093824341041, + "grad_norm": 1.543565273284912, + "learning_rate": 4.956516610847497e-05, + "loss": 5.3365, + "step": 9998 + }, + { + "epoch": 0.05946688552669141, + "grad_norm": 1.4907768964767456, + "learning_rate": 4.9565079364375746e-05, + "loss": 5.4215, + "step": 9999 + }, + { + "epoch": 0.059472832809972405, + "grad_norm": 1.5810034275054932, + "learning_rate": 4.956499261170109e-05, + "loss": 5.3899, + "step": 10000 + }, + { + "epoch": 0.0594787800932534, + "grad_norm": 1.6342787742614746, + "learning_rate": 4.956490585045106e-05, + "loss": 5.4278, + "step": 10001 + }, + { + "epoch": 0.0594847273765344, + "grad_norm": 1.5474039316177368, + "learning_rate": 4.956481908062567e-05, + "loss": 5.1232, + "step": 10002 + }, + { + "epoch": 0.0594906746598154, + "grad_norm": 1.5679951906204224, + "learning_rate": 4.956473230222496e-05, + "loss": 5.3245, + "step": 10003 + }, + { + "epoch": 0.05949662194309639, + "grad_norm": 1.4851021766662598, + "learning_rate": 4.9564645515248955e-05, + "loss": 5.1806, + "step": 10004 + }, + { + "epoch": 0.059502569226377394, + "grad_norm": 1.8518844842910767, + "learning_rate": 4.956455871969768e-05, + "loss": 5.2543, + "step": 10005 + }, + { + "epoch": 0.05950851650965839, + "grad_norm": 1.7865514755249023, + "learning_rate": 4.956447191557118e-05, + "loss": 5.405, + "step": 10006 + }, + { + "epoch": 0.059514463792939384, + "grad_norm": 1.9051682949066162, + "learning_rate": 4.956438510286946e-05, + "loss": 5.0509, + "step": 10007 + }, + { + "epoch": 0.05952041107622038, + "grad_norm": 1.5150926113128662, + "learning_rate": 4.956429828159258e-05, + "loss": 5.0065, + "step": 10008 + }, + { + "epoch": 0.05952635835950138, + "grad_norm": 1.6085938215255737, + "learning_rate": 4.956421145174056e-05, + "loss": 5.2295, + "step": 10009 + }, + { + "epoch": 0.05953230564278238, + "grad_norm": 1.6337605714797974, + "learning_rate": 4.9564124613313424e-05, + "loss": 5.1666, + "step": 10010 + }, + { + "epoch": 0.05953825292606337, + "grad_norm": 1.5093178749084473, + "learning_rate": 4.9564037766311205e-05, + "loss": 5.2268, + "step": 10011 + }, + { + "epoch": 0.059544200209344374, + "grad_norm": 1.5047305822372437, + "learning_rate": 4.9563950910733936e-05, + "loss": 5.1065, + "step": 10012 + }, + { + "epoch": 0.05955014749262537, + "grad_norm": 1.6275629997253418, + "learning_rate": 4.9563864046581645e-05, + "loss": 5.2366, + "step": 10013 + }, + { + "epoch": 0.059556094775906364, + "grad_norm": 1.535582184791565, + "learning_rate": 4.956377717385436e-05, + "loss": 5.1799, + "step": 10014 + }, + { + "epoch": 0.059562042059187366, + "grad_norm": 1.448477864265442, + "learning_rate": 4.956369029255211e-05, + "loss": 5.2207, + "step": 10015 + }, + { + "epoch": 0.05956798934246836, + "grad_norm": 1.5288492441177368, + "learning_rate": 4.956360340267494e-05, + "loss": 5.3646, + "step": 10016 + }, + { + "epoch": 0.059573936625749356, + "grad_norm": 1.5746785402297974, + "learning_rate": 4.956351650422287e-05, + "loss": 5.1941, + "step": 10017 + }, + { + "epoch": 0.05957988390903036, + "grad_norm": 1.7088212966918945, + "learning_rate": 4.956342959719592e-05, + "loss": 5.1667, + "step": 10018 + }, + { + "epoch": 0.05958583119231135, + "grad_norm": 1.7666717767715454, + "learning_rate": 4.956334268159414e-05, + "loss": 5.1808, + "step": 10019 + }, + { + "epoch": 0.05959177847559235, + "grad_norm": 1.6472598314285278, + "learning_rate": 4.956325575741755e-05, + "loss": 5.3369, + "step": 10020 + }, + { + "epoch": 0.05959772575887334, + "grad_norm": 1.7340562343597412, + "learning_rate": 4.9563168824666174e-05, + "loss": 5.5623, + "step": 10021 + }, + { + "epoch": 0.059603673042154345, + "grad_norm": 1.9677515029907227, + "learning_rate": 4.9563081883340054e-05, + "loss": 4.7612, + "step": 10022 + }, + { + "epoch": 0.05960962032543534, + "grad_norm": 1.4823256731033325, + "learning_rate": 4.9562994933439215e-05, + "loss": 5.4504, + "step": 10023 + }, + { + "epoch": 0.059615567608716336, + "grad_norm": 1.5346739292144775, + "learning_rate": 4.956290797496369e-05, + "loss": 5.5455, + "step": 10024 + }, + { + "epoch": 0.05962151489199734, + "grad_norm": 1.5420036315917969, + "learning_rate": 4.956282100791351e-05, + "loss": 5.1363, + "step": 10025 + }, + { + "epoch": 0.05962746217527833, + "grad_norm": 1.7927091121673584, + "learning_rate": 4.956273403228869e-05, + "loss": 5.0768, + "step": 10026 + }, + { + "epoch": 0.05963340945855933, + "grad_norm": 1.7139612436294556, + "learning_rate": 4.9562647048089287e-05, + "loss": 5.2046, + "step": 10027 + }, + { + "epoch": 0.05963935674184033, + "grad_norm": 1.627684473991394, + "learning_rate": 4.956256005531531e-05, + "loss": 5.3844, + "step": 10028 + }, + { + "epoch": 0.059645304025121325, + "grad_norm": 1.5006085634231567, + "learning_rate": 4.9562473053966805e-05, + "loss": 5.4948, + "step": 10029 + }, + { + "epoch": 0.05965125130840232, + "grad_norm": 1.5670723915100098, + "learning_rate": 4.956238604404378e-05, + "loss": 5.5465, + "step": 10030 + }, + { + "epoch": 0.05965719859168332, + "grad_norm": 1.5671201944351196, + "learning_rate": 4.95622990255463e-05, + "loss": 5.1969, + "step": 10031 + }, + { + "epoch": 0.05966314587496432, + "grad_norm": 2.1628634929656982, + "learning_rate": 4.956221199847436e-05, + "loss": 5.0244, + "step": 10032 + }, + { + "epoch": 0.05966909315824531, + "grad_norm": 1.5766685009002686, + "learning_rate": 4.956212496282801e-05, + "loss": 5.4698, + "step": 10033 + }, + { + "epoch": 0.059675040441526314, + "grad_norm": 1.625812292098999, + "learning_rate": 4.956203791860728e-05, + "loss": 5.3825, + "step": 10034 + }, + { + "epoch": 0.05968098772480731, + "grad_norm": 1.4307054281234741, + "learning_rate": 4.956195086581219e-05, + "loss": 5.3576, + "step": 10035 + }, + { + "epoch": 0.059686935008088304, + "grad_norm": 1.4459644556045532, + "learning_rate": 4.9561863804442785e-05, + "loss": 5.3478, + "step": 10036 + }, + { + "epoch": 0.0596928822913693, + "grad_norm": 1.8038474321365356, + "learning_rate": 4.9561776734499075e-05, + "loss": 5.4967, + "step": 10037 + }, + { + "epoch": 0.0596988295746503, + "grad_norm": 1.41011381149292, + "learning_rate": 4.9561689655981115e-05, + "loss": 5.4224, + "step": 10038 + }, + { + "epoch": 0.059704776857931297, + "grad_norm": 1.6678937673568726, + "learning_rate": 4.956160256888891e-05, + "loss": 5.27, + "step": 10039 + }, + { + "epoch": 0.05971072414121229, + "grad_norm": 1.794647455215454, + "learning_rate": 4.956151547322251e-05, + "loss": 5.2822, + "step": 10040 + }, + { + "epoch": 0.059716671424493294, + "grad_norm": 1.5010912418365479, + "learning_rate": 4.9561428368981944e-05, + "loss": 5.3778, + "step": 10041 + }, + { + "epoch": 0.05972261870777429, + "grad_norm": 1.785395860671997, + "learning_rate": 4.9561341256167234e-05, + "loss": 5.4213, + "step": 10042 + }, + { + "epoch": 0.059728565991055284, + "grad_norm": 1.889667272567749, + "learning_rate": 4.956125413477841e-05, + "loss": 5.2795, + "step": 10043 + }, + { + "epoch": 0.059734513274336286, + "grad_norm": 2.209780216217041, + "learning_rate": 4.95611670048155e-05, + "loss": 5.6823, + "step": 10044 + }, + { + "epoch": 0.05974046055761728, + "grad_norm": 1.979069471359253, + "learning_rate": 4.956107986627855e-05, + "loss": 5.3437, + "step": 10045 + }, + { + "epoch": 0.059746407840898276, + "grad_norm": 1.8391239643096924, + "learning_rate": 4.9560992719167584e-05, + "loss": 5.2246, + "step": 10046 + }, + { + "epoch": 0.05975235512417928, + "grad_norm": 2.0196359157562256, + "learning_rate": 4.956090556348262e-05, + "loss": 5.3549, + "step": 10047 + }, + { + "epoch": 0.05975830240746027, + "grad_norm": 1.7103056907653809, + "learning_rate": 4.95608183992237e-05, + "loss": 5.4016, + "step": 10048 + }, + { + "epoch": 0.05976424969074127, + "grad_norm": 1.543308138847351, + "learning_rate": 4.956073122639085e-05, + "loss": 5.2628, + "step": 10049 + }, + { + "epoch": 0.05977019697402226, + "grad_norm": 2.0719797611236572, + "learning_rate": 4.956064404498411e-05, + "loss": 5.3149, + "step": 10050 + }, + { + "epoch": 0.059776144257303265, + "grad_norm": 1.9024063348770142, + "learning_rate": 4.95605568550035e-05, + "loss": 5.2804, + "step": 10051 + }, + { + "epoch": 0.05978209154058426, + "grad_norm": 1.6171611547470093, + "learning_rate": 4.9560469656449046e-05, + "loss": 5.2558, + "step": 10052 + }, + { + "epoch": 0.059788038823865255, + "grad_norm": 1.5416970252990723, + "learning_rate": 4.9560382449320795e-05, + "loss": 5.3164, + "step": 10053 + }, + { + "epoch": 0.05979398610714626, + "grad_norm": 1.6956002712249756, + "learning_rate": 4.956029523361877e-05, + "loss": 5.2123, + "step": 10054 + }, + { + "epoch": 0.05979993339042725, + "grad_norm": 1.6414602994918823, + "learning_rate": 4.956020800934299e-05, + "loss": 5.3302, + "step": 10055 + }, + { + "epoch": 0.05980588067370825, + "grad_norm": 1.6868051290512085, + "learning_rate": 4.95601207764935e-05, + "loss": 5.2076, + "step": 10056 + }, + { + "epoch": 0.05981182795698925, + "grad_norm": 1.7299697399139404, + "learning_rate": 4.956003353507033e-05, + "loss": 5.3502, + "step": 10057 + }, + { + "epoch": 0.059817775240270245, + "grad_norm": 1.4923878908157349, + "learning_rate": 4.95599462850735e-05, + "loss": 5.3081, + "step": 10058 + }, + { + "epoch": 0.05982372252355124, + "grad_norm": 1.571413516998291, + "learning_rate": 4.9559859026503045e-05, + "loss": 5.1434, + "step": 10059 + }, + { + "epoch": 0.05982966980683224, + "grad_norm": 1.6265422105789185, + "learning_rate": 4.9559771759359e-05, + "loss": 5.2455, + "step": 10060 + }, + { + "epoch": 0.05983561709011324, + "grad_norm": 1.7889208793640137, + "learning_rate": 4.9559684483641395e-05, + "loss": 5.2429, + "step": 10061 + }, + { + "epoch": 0.05984156437339423, + "grad_norm": 1.5957598686218262, + "learning_rate": 4.955959719935025e-05, + "loss": 5.2299, + "step": 10062 + }, + { + "epoch": 0.059847511656675234, + "grad_norm": 1.6366177797317505, + "learning_rate": 4.955950990648561e-05, + "loss": 5.366, + "step": 10063 + }, + { + "epoch": 0.05985345893995623, + "grad_norm": 1.6712719202041626, + "learning_rate": 4.95594226050475e-05, + "loss": 5.3602, + "step": 10064 + }, + { + "epoch": 0.059859406223237224, + "grad_norm": 1.8273069858551025, + "learning_rate": 4.955933529503595e-05, + "loss": 5.3586, + "step": 10065 + }, + { + "epoch": 0.05986535350651822, + "grad_norm": 1.6638576984405518, + "learning_rate": 4.955924797645098e-05, + "loss": 5.2359, + "step": 10066 + }, + { + "epoch": 0.05987130078979922, + "grad_norm": 1.8127614259719849, + "learning_rate": 4.955916064929264e-05, + "loss": 5.3815, + "step": 10067 + }, + { + "epoch": 0.059877248073080216, + "grad_norm": 1.7204198837280273, + "learning_rate": 4.955907331356095e-05, + "loss": 5.5576, + "step": 10068 + }, + { + "epoch": 0.05988319535636121, + "grad_norm": 1.9153103828430176, + "learning_rate": 4.9558985969255936e-05, + "loss": 5.4363, + "step": 10069 + }, + { + "epoch": 0.059889142639642214, + "grad_norm": 1.6427290439605713, + "learning_rate": 4.9558898616377634e-05, + "loss": 5.4497, + "step": 10070 + }, + { + "epoch": 0.05989508992292321, + "grad_norm": 1.660217046737671, + "learning_rate": 4.955881125492608e-05, + "loss": 5.4988, + "step": 10071 + }, + { + "epoch": 0.059901037206204204, + "grad_norm": 1.7776225805282593, + "learning_rate": 4.955872388490129e-05, + "loss": 5.2714, + "step": 10072 + }, + { + "epoch": 0.059906984489485206, + "grad_norm": 1.5099388360977173, + "learning_rate": 4.9558636506303314e-05, + "loss": 5.4714, + "step": 10073 + }, + { + "epoch": 0.0599129317727662, + "grad_norm": 1.523537039756775, + "learning_rate": 4.955854911913217e-05, + "loss": 5.3528, + "step": 10074 + }, + { + "epoch": 0.059918879056047196, + "grad_norm": 1.3424321413040161, + "learning_rate": 4.9558461723387885e-05, + "loss": 5.3385, + "step": 10075 + }, + { + "epoch": 0.0599248263393282, + "grad_norm": 1.3843169212341309, + "learning_rate": 4.955837431907049e-05, + "loss": 5.383, + "step": 10076 + }, + { + "epoch": 0.05993077362260919, + "grad_norm": 1.4927351474761963, + "learning_rate": 4.955828690618003e-05, + "loss": 5.3536, + "step": 10077 + }, + { + "epoch": 0.05993672090589019, + "grad_norm": 1.5207486152648926, + "learning_rate": 4.955819948471653e-05, + "loss": 5.3557, + "step": 10078 + }, + { + "epoch": 0.05994266818917118, + "grad_norm": 1.5589584112167358, + "learning_rate": 4.9558112054680004e-05, + "loss": 5.3747, + "step": 10079 + }, + { + "epoch": 0.059948615472452185, + "grad_norm": 1.436951756477356, + "learning_rate": 4.9558024616070496e-05, + "loss": 5.2807, + "step": 10080 + }, + { + "epoch": 0.05995456275573318, + "grad_norm": 1.4345866441726685, + "learning_rate": 4.955793716888804e-05, + "loss": 5.4, + "step": 10081 + }, + { + "epoch": 0.059960510039014175, + "grad_norm": 1.2811249494552612, + "learning_rate": 4.955784971313267e-05, + "loss": 5.2531, + "step": 10082 + }, + { + "epoch": 0.05996645732229518, + "grad_norm": 1.5558568239212036, + "learning_rate": 4.955776224880439e-05, + "loss": 5.1136, + "step": 10083 + }, + { + "epoch": 0.05997240460557617, + "grad_norm": 1.3918567895889282, + "learning_rate": 4.955767477590326e-05, + "loss": 5.2748, + "step": 10084 + }, + { + "epoch": 0.05997835188885717, + "grad_norm": 1.3277204036712646, + "learning_rate": 4.9557587294429295e-05, + "loss": 5.2346, + "step": 10085 + }, + { + "epoch": 0.05998429917213817, + "grad_norm": 1.2874623537063599, + "learning_rate": 4.955749980438253e-05, + "loss": 5.2616, + "step": 10086 + }, + { + "epoch": 0.059990246455419165, + "grad_norm": 1.7534229755401611, + "learning_rate": 4.9557412305763004e-05, + "loss": 5.2509, + "step": 10087 + }, + { + "epoch": 0.05999619373870016, + "grad_norm": 1.4560372829437256, + "learning_rate": 4.955732479857072e-05, + "loss": 5.2385, + "step": 10088 + }, + { + "epoch": 0.06000214102198116, + "grad_norm": 1.232779860496521, + "learning_rate": 4.955723728280575e-05, + "loss": 5.2726, + "step": 10089 + }, + { + "epoch": 0.06000808830526216, + "grad_norm": 1.6178683042526245, + "learning_rate": 4.955714975846809e-05, + "loss": 5.3816, + "step": 10090 + }, + { + "epoch": 0.06001403558854315, + "grad_norm": 1.5438450574874878, + "learning_rate": 4.955706222555779e-05, + "loss": 5.2706, + "step": 10091 + }, + { + "epoch": 0.060019982871824154, + "grad_norm": 1.5367876291275024, + "learning_rate": 4.955697468407486e-05, + "loss": 5.1955, + "step": 10092 + }, + { + "epoch": 0.06002593015510515, + "grad_norm": 1.2902512550354004, + "learning_rate": 4.955688713401936e-05, + "loss": 5.166, + "step": 10093 + }, + { + "epoch": 0.060031877438386144, + "grad_norm": 1.5516488552093506, + "learning_rate": 4.95567995753913e-05, + "loss": 5.1256, + "step": 10094 + }, + { + "epoch": 0.06003782472166714, + "grad_norm": 1.3104857206344604, + "learning_rate": 4.9556712008190706e-05, + "loss": 5.1604, + "step": 10095 + }, + { + "epoch": 0.06004377200494814, + "grad_norm": 1.6237741708755493, + "learning_rate": 4.955662443241762e-05, + "loss": 5.2686, + "step": 10096 + }, + { + "epoch": 0.060049719288229136, + "grad_norm": 1.6566027402877808, + "learning_rate": 4.955653684807208e-05, + "loss": 5.3376, + "step": 10097 + }, + { + "epoch": 0.06005566657151013, + "grad_norm": 1.4010981321334839, + "learning_rate": 4.9556449255154106e-05, + "loss": 5.4008, + "step": 10098 + }, + { + "epoch": 0.060061613854791134, + "grad_norm": 1.6399116516113281, + "learning_rate": 4.955636165366372e-05, + "loss": 5.2718, + "step": 10099 + }, + { + "epoch": 0.06006756113807213, + "grad_norm": 1.5371499061584473, + "learning_rate": 4.955627404360096e-05, + "loss": 5.2107, + "step": 10100 + }, + { + "epoch": 0.060073508421353124, + "grad_norm": 1.598186731338501, + "learning_rate": 4.955618642496587e-05, + "loss": 5.3482, + "step": 10101 + }, + { + "epoch": 0.060079455704634126, + "grad_norm": 1.526595115661621, + "learning_rate": 4.955609879775846e-05, + "loss": 5.2335, + "step": 10102 + }, + { + "epoch": 0.06008540298791512, + "grad_norm": 1.509990930557251, + "learning_rate": 4.955601116197877e-05, + "loss": 5.168, + "step": 10103 + }, + { + "epoch": 0.060091350271196116, + "grad_norm": 1.368203043937683, + "learning_rate": 4.9555923517626836e-05, + "loss": 5.2183, + "step": 10104 + }, + { + "epoch": 0.06009729755447712, + "grad_norm": 1.5153454542160034, + "learning_rate": 4.955583586470268e-05, + "loss": 5.2558, + "step": 10105 + }, + { + "epoch": 0.06010324483775811, + "grad_norm": 2.9330217838287354, + "learning_rate": 4.955574820320633e-05, + "loss": 5.6863, + "step": 10106 + }, + { + "epoch": 0.06010919212103911, + "grad_norm": 1.6096080541610718, + "learning_rate": 4.9555660533137825e-05, + "loss": 5.2243, + "step": 10107 + }, + { + "epoch": 0.0601151394043201, + "grad_norm": 1.5425163507461548, + "learning_rate": 4.95555728544972e-05, + "loss": 5.4308, + "step": 10108 + }, + { + "epoch": 0.060121086687601105, + "grad_norm": 1.4898573160171509, + "learning_rate": 4.955548516728447e-05, + "loss": 5.389, + "step": 10109 + }, + { + "epoch": 0.0601270339708821, + "grad_norm": 1.5746946334838867, + "learning_rate": 4.955539747149968e-05, + "loss": 5.1414, + "step": 10110 + }, + { + "epoch": 0.060132981254163095, + "grad_norm": 1.7621461153030396, + "learning_rate": 4.955530976714285e-05, + "loss": 5.4572, + "step": 10111 + }, + { + "epoch": 0.0601389285374441, + "grad_norm": 1.4524224996566772, + "learning_rate": 4.9555222054214015e-05, + "loss": 5.4577, + "step": 10112 + }, + { + "epoch": 0.06014487582072509, + "grad_norm": 1.5630146265029907, + "learning_rate": 4.95551343327132e-05, + "loss": 5.277, + "step": 10113 + }, + { + "epoch": 0.06015082310400609, + "grad_norm": 1.9279972314834595, + "learning_rate": 4.955504660264045e-05, + "loss": 5.1485, + "step": 10114 + }, + { + "epoch": 0.06015677038728709, + "grad_norm": 1.618775725364685, + "learning_rate": 4.9554958863995786e-05, + "loss": 5.1262, + "step": 10115 + }, + { + "epoch": 0.060162717670568085, + "grad_norm": 1.8578898906707764, + "learning_rate": 4.955487111677924e-05, + "loss": 5.3451, + "step": 10116 + }, + { + "epoch": 0.06016866495384908, + "grad_norm": 1.5652815103530884, + "learning_rate": 4.955478336099084e-05, + "loss": 5.2326, + "step": 10117 + }, + { + "epoch": 0.06017461223713008, + "grad_norm": 1.4957774877548218, + "learning_rate": 4.9554695596630616e-05, + "loss": 5.3332, + "step": 10118 + }, + { + "epoch": 0.06018055952041108, + "grad_norm": 1.428112506866455, + "learning_rate": 4.9554607823698606e-05, + "loss": 5.2647, + "step": 10119 + }, + { + "epoch": 0.06018650680369207, + "grad_norm": 1.9383279085159302, + "learning_rate": 4.955452004219484e-05, + "loss": 5.5897, + "step": 10120 + }, + { + "epoch": 0.060192454086973074, + "grad_norm": 1.8523132801055908, + "learning_rate": 4.955443225211934e-05, + "loss": 5.6204, + "step": 10121 + }, + { + "epoch": 0.06019840137025407, + "grad_norm": 1.7980049848556519, + "learning_rate": 4.955434445347214e-05, + "loss": 5.4383, + "step": 10122 + }, + { + "epoch": 0.060204348653535064, + "grad_norm": 1.7927988767623901, + "learning_rate": 4.9554256646253274e-05, + "loss": 5.6066, + "step": 10123 + }, + { + "epoch": 0.06021029593681606, + "grad_norm": 1.8549528121948242, + "learning_rate": 4.955416883046277e-05, + "loss": 5.2963, + "step": 10124 + }, + { + "epoch": 0.06021624322009706, + "grad_norm": 1.7140870094299316, + "learning_rate": 4.955408100610066e-05, + "loss": 5.4636, + "step": 10125 + }, + { + "epoch": 0.060222190503378056, + "grad_norm": 1.3744412660598755, + "learning_rate": 4.955399317316697e-05, + "loss": 5.2985, + "step": 10126 + }, + { + "epoch": 0.06022813778665905, + "grad_norm": 1.572782278060913, + "learning_rate": 4.9553905331661734e-05, + "loss": 5.2598, + "step": 10127 + }, + { + "epoch": 0.06023408506994005, + "grad_norm": 1.6485692262649536, + "learning_rate": 4.955381748158499e-05, + "loss": 5.3764, + "step": 10128 + }, + { + "epoch": 0.06024003235322105, + "grad_norm": 1.5442413091659546, + "learning_rate": 4.955372962293676e-05, + "loss": 5.2504, + "step": 10129 + }, + { + "epoch": 0.060245979636502044, + "grad_norm": 1.807518482208252, + "learning_rate": 4.9553641755717075e-05, + "loss": 5.2853, + "step": 10130 + }, + { + "epoch": 0.060251926919783046, + "grad_norm": 1.5858244895935059, + "learning_rate": 4.9553553879925965e-05, + "loss": 5.2645, + "step": 10131 + }, + { + "epoch": 0.06025787420306404, + "grad_norm": 1.596307396888733, + "learning_rate": 4.955346599556347e-05, + "loss": 5.4094, + "step": 10132 + }, + { + "epoch": 0.060263821486345036, + "grad_norm": 1.4624857902526855, + "learning_rate": 4.955337810262961e-05, + "loss": 5.4366, + "step": 10133 + }, + { + "epoch": 0.06026976876962604, + "grad_norm": 1.426866888999939, + "learning_rate": 4.955329020112442e-05, + "loss": 5.324, + "step": 10134 + }, + { + "epoch": 0.06027571605290703, + "grad_norm": 1.6577516794204712, + "learning_rate": 4.955320229104793e-05, + "loss": 5.2937, + "step": 10135 + }, + { + "epoch": 0.06028166333618803, + "grad_norm": 1.3958433866500854, + "learning_rate": 4.9553114372400166e-05, + "loss": 5.421, + "step": 10136 + }, + { + "epoch": 0.06028761061946902, + "grad_norm": 1.3242517709732056, + "learning_rate": 4.9553026445181173e-05, + "loss": 5.2697, + "step": 10137 + }, + { + "epoch": 0.060293557902750025, + "grad_norm": 1.519018530845642, + "learning_rate": 4.955293850939096e-05, + "loss": 5.1432, + "step": 10138 + }, + { + "epoch": 0.06029950518603102, + "grad_norm": 1.528515338897705, + "learning_rate": 4.955285056502958e-05, + "loss": 5.1388, + "step": 10139 + }, + { + "epoch": 0.060305452469312015, + "grad_norm": 1.4830992221832275, + "learning_rate": 4.955276261209705e-05, + "loss": 5.3222, + "step": 10140 + }, + { + "epoch": 0.06031139975259302, + "grad_norm": 1.4149411916732788, + "learning_rate": 4.95526746505934e-05, + "loss": 5.2706, + "step": 10141 + }, + { + "epoch": 0.06031734703587401, + "grad_norm": 1.4466478824615479, + "learning_rate": 4.9552586680518676e-05, + "loss": 5.2309, + "step": 10142 + }, + { + "epoch": 0.06032329431915501, + "grad_norm": 1.4246203899383545, + "learning_rate": 4.9552498701872884e-05, + "loss": 5.1539, + "step": 10143 + }, + { + "epoch": 0.06032924160243601, + "grad_norm": 1.632572889328003, + "learning_rate": 4.955241071465608e-05, + "loss": 5.3788, + "step": 10144 + }, + { + "epoch": 0.060335188885717005, + "grad_norm": 1.5974568128585815, + "learning_rate": 4.955232271886828e-05, + "loss": 5.3558, + "step": 10145 + }, + { + "epoch": 0.060341136168998, + "grad_norm": 1.6396468877792358, + "learning_rate": 4.9552234714509516e-05, + "loss": 5.2162, + "step": 10146 + }, + { + "epoch": 0.060347083452279, + "grad_norm": 1.5349491834640503, + "learning_rate": 4.9552146701579815e-05, + "loss": 5.212, + "step": 10147 + }, + { + "epoch": 0.06035303073556, + "grad_norm": 1.5236495733261108, + "learning_rate": 4.955205868007922e-05, + "loss": 5.2984, + "step": 10148 + }, + { + "epoch": 0.06035897801884099, + "grad_norm": 1.4593411684036255, + "learning_rate": 4.955197065000775e-05, + "loss": 5.268, + "step": 10149 + }, + { + "epoch": 0.060364925302121994, + "grad_norm": 1.4498536586761475, + "learning_rate": 4.955188261136545e-05, + "loss": 5.1437, + "step": 10150 + }, + { + "epoch": 0.06037087258540299, + "grad_norm": 1.5059176683425903, + "learning_rate": 4.9551794564152334e-05, + "loss": 5.3011, + "step": 10151 + }, + { + "epoch": 0.060376819868683984, + "grad_norm": 1.5773544311523438, + "learning_rate": 4.9551706508368445e-05, + "loss": 5.2066, + "step": 10152 + }, + { + "epoch": 0.06038276715196498, + "grad_norm": 1.4858072996139526, + "learning_rate": 4.95516184440138e-05, + "loss": 5.2757, + "step": 10153 + }, + { + "epoch": 0.06038871443524598, + "grad_norm": 1.486055612564087, + "learning_rate": 4.955153037108845e-05, + "loss": 5.1416, + "step": 10154 + }, + { + "epoch": 0.060394661718526976, + "grad_norm": 1.3411048650741577, + "learning_rate": 4.955144228959241e-05, + "loss": 5.1708, + "step": 10155 + }, + { + "epoch": 0.06040060900180797, + "grad_norm": 1.2979127168655396, + "learning_rate": 4.9551354199525714e-05, + "loss": 5.1421, + "step": 10156 + }, + { + "epoch": 0.06040655628508897, + "grad_norm": 1.4928209781646729, + "learning_rate": 4.9551266100888395e-05, + "loss": 5.2185, + "step": 10157 + }, + { + "epoch": 0.06041250356836997, + "grad_norm": 1.58747398853302, + "learning_rate": 4.955117799368048e-05, + "loss": 5.2587, + "step": 10158 + }, + { + "epoch": 0.060418450851650964, + "grad_norm": 1.1862558126449585, + "learning_rate": 4.9551089877902e-05, + "loss": 5.2405, + "step": 10159 + }, + { + "epoch": 0.060424398134931966, + "grad_norm": 1.5547248125076294, + "learning_rate": 4.955100175355299e-05, + "loss": 5.2326, + "step": 10160 + }, + { + "epoch": 0.06043034541821296, + "grad_norm": 1.6986664533615112, + "learning_rate": 4.955091362063349e-05, + "loss": 5.2261, + "step": 10161 + }, + { + "epoch": 0.060436292701493956, + "grad_norm": 1.531891107559204, + "learning_rate": 4.95508254791435e-05, + "loss": 5.4475, + "step": 10162 + }, + { + "epoch": 0.06044223998477496, + "grad_norm": 1.57411789894104, + "learning_rate": 4.955073732908309e-05, + "loss": 5.1346, + "step": 10163 + }, + { + "epoch": 0.06044818726805595, + "grad_norm": 1.548439383506775, + "learning_rate": 4.9550649170452255e-05, + "loss": 5.1953, + "step": 10164 + }, + { + "epoch": 0.06045413455133695, + "grad_norm": 1.645850419998169, + "learning_rate": 4.955056100325105e-05, + "loss": 5.2728, + "step": 10165 + }, + { + "epoch": 0.06046008183461794, + "grad_norm": 1.6308786869049072, + "learning_rate": 4.95504728274795e-05, + "loss": 5.3134, + "step": 10166 + }, + { + "epoch": 0.060466029117898945, + "grad_norm": 1.4754101037979126, + "learning_rate": 4.955038464313763e-05, + "loss": 5.3938, + "step": 10167 + }, + { + "epoch": 0.06047197640117994, + "grad_norm": 2.408869981765747, + "learning_rate": 4.955029645022548e-05, + "loss": 5.4687, + "step": 10168 + }, + { + "epoch": 0.060477923684460935, + "grad_norm": 1.6601638793945312, + "learning_rate": 4.955020824874307e-05, + "loss": 5.165, + "step": 10169 + }, + { + "epoch": 0.06048387096774194, + "grad_norm": 1.5239113569259644, + "learning_rate": 4.955012003869043e-05, + "loss": 5.133, + "step": 10170 + }, + { + "epoch": 0.06048981825102293, + "grad_norm": 1.6661083698272705, + "learning_rate": 4.955003182006761e-05, + "loss": 5.2033, + "step": 10171 + }, + { + "epoch": 0.06049576553430393, + "grad_norm": 1.4320698976516724, + "learning_rate": 4.9549943592874615e-05, + "loss": 5.1842, + "step": 10172 + }, + { + "epoch": 0.06050171281758493, + "grad_norm": 1.789302110671997, + "learning_rate": 4.95498553571115e-05, + "loss": 5.1052, + "step": 10173 + }, + { + "epoch": 0.060507660100865925, + "grad_norm": 1.598085880279541, + "learning_rate": 4.954976711277828e-05, + "loss": 5.3194, + "step": 10174 + }, + { + "epoch": 0.06051360738414692, + "grad_norm": 1.4569145441055298, + "learning_rate": 4.954967885987498e-05, + "loss": 5.2009, + "step": 10175 + }, + { + "epoch": 0.06051955466742792, + "grad_norm": 1.5980345010757446, + "learning_rate": 4.954959059840165e-05, + "loss": 5.1686, + "step": 10176 + }, + { + "epoch": 0.06052550195070892, + "grad_norm": 1.5382320880889893, + "learning_rate": 4.954950232835831e-05, + "loss": 5.303, + "step": 10177 + }, + { + "epoch": 0.06053144923398991, + "grad_norm": 1.5568296909332275, + "learning_rate": 4.954941404974499e-05, + "loss": 5.2044, + "step": 10178 + }, + { + "epoch": 0.060537396517270914, + "grad_norm": 1.6732075214385986, + "learning_rate": 4.954932576256173e-05, + "loss": 5.3133, + "step": 10179 + }, + { + "epoch": 0.06054334380055191, + "grad_norm": 1.6905434131622314, + "learning_rate": 4.954923746680855e-05, + "loss": 5.3868, + "step": 10180 + }, + { + "epoch": 0.060549291083832904, + "grad_norm": 1.4349027872085571, + "learning_rate": 4.954914916248549e-05, + "loss": 5.2215, + "step": 10181 + }, + { + "epoch": 0.0605552383671139, + "grad_norm": 1.5257092714309692, + "learning_rate": 4.9549060849592566e-05, + "loss": 5.2148, + "step": 10182 + }, + { + "epoch": 0.0605611856503949, + "grad_norm": 1.5402655601501465, + "learning_rate": 4.954897252812982e-05, + "loss": 5.3069, + "step": 10183 + }, + { + "epoch": 0.060567132933675896, + "grad_norm": 1.801798701286316, + "learning_rate": 4.954888419809729e-05, + "loss": 5.0786, + "step": 10184 + }, + { + "epoch": 0.06057308021695689, + "grad_norm": 1.4860090017318726, + "learning_rate": 4.954879585949499e-05, + "loss": 4.8878, + "step": 10185 + }, + { + "epoch": 0.06057902750023789, + "grad_norm": 1.7319056987762451, + "learning_rate": 4.954870751232296e-05, + "loss": 4.9013, + "step": 10186 + }, + { + "epoch": 0.06058497478351889, + "grad_norm": 1.4376243352890015, + "learning_rate": 4.954861915658123e-05, + "loss": 5.37, + "step": 10187 + }, + { + "epoch": 0.060590922066799884, + "grad_norm": 1.2903879880905151, + "learning_rate": 4.954853079226983e-05, + "loss": 5.5355, + "step": 10188 + }, + { + "epoch": 0.060596869350080886, + "grad_norm": 1.5223259925842285, + "learning_rate": 4.95484424193888e-05, + "loss": 5.3451, + "step": 10189 + }, + { + "epoch": 0.06060281663336188, + "grad_norm": 1.283892035484314, + "learning_rate": 4.954835403793815e-05, + "loss": 5.2245, + "step": 10190 + }, + { + "epoch": 0.060608763916642876, + "grad_norm": 1.5581207275390625, + "learning_rate": 4.9548265647917936e-05, + "loss": 5.303, + "step": 10191 + }, + { + "epoch": 0.06061471119992388, + "grad_norm": 1.4258673191070557, + "learning_rate": 4.9548177249328164e-05, + "loss": 5.4569, + "step": 10192 + }, + { + "epoch": 0.06062065848320487, + "grad_norm": 1.4326061010360718, + "learning_rate": 4.9548088842168886e-05, + "loss": 5.2761, + "step": 10193 + }, + { + "epoch": 0.06062660576648587, + "grad_norm": 1.9100563526153564, + "learning_rate": 4.9548000426440114e-05, + "loss": 4.9366, + "step": 10194 + }, + { + "epoch": 0.06063255304976687, + "grad_norm": 1.7059932947158813, + "learning_rate": 4.9547912002141895e-05, + "loss": 4.9135, + "step": 10195 + }, + { + "epoch": 0.060638500333047865, + "grad_norm": 1.6715087890625, + "learning_rate": 4.954782356927425e-05, + "loss": 5.0662, + "step": 10196 + }, + { + "epoch": 0.06064444761632886, + "grad_norm": 1.966430902481079, + "learning_rate": 4.9547735127837223e-05, + "loss": 4.7995, + "step": 10197 + }, + { + "epoch": 0.060650394899609855, + "grad_norm": 1.7138090133666992, + "learning_rate": 4.954764667783083e-05, + "loss": 4.9745, + "step": 10198 + }, + { + "epoch": 0.06065634218289086, + "grad_norm": 1.832889199256897, + "learning_rate": 4.95475582192551e-05, + "loss": 4.9795, + "step": 10199 + }, + { + "epoch": 0.06066228946617185, + "grad_norm": 1.883525013923645, + "learning_rate": 4.954746975211008e-05, + "loss": 4.8523, + "step": 10200 + }, + { + "epoch": 0.06066823674945285, + "grad_norm": 1.747101068496704, + "learning_rate": 4.954738127639579e-05, + "loss": 4.9402, + "step": 10201 + }, + { + "epoch": 0.06067418403273385, + "grad_norm": 1.583900809288025, + "learning_rate": 4.9547292792112256e-05, + "loss": 5.176, + "step": 10202 + }, + { + "epoch": 0.060680131316014845, + "grad_norm": 1.6390752792358398, + "learning_rate": 4.954720429925953e-05, + "loss": 5.1014, + "step": 10203 + }, + { + "epoch": 0.06068607859929584, + "grad_norm": 1.4499305486679077, + "learning_rate": 4.954711579783762e-05, + "loss": 5.1473, + "step": 10204 + }, + { + "epoch": 0.06069202588257684, + "grad_norm": 1.2734607458114624, + "learning_rate": 4.954702728784656e-05, + "loss": 5.0919, + "step": 10205 + }, + { + "epoch": 0.06069797316585784, + "grad_norm": 1.4447498321533203, + "learning_rate": 4.954693876928639e-05, + "loss": 5.0145, + "step": 10206 + }, + { + "epoch": 0.06070392044913883, + "grad_norm": 1.7052301168441772, + "learning_rate": 4.954685024215714e-05, + "loss": 5.109, + "step": 10207 + }, + { + "epoch": 0.060709867732419834, + "grad_norm": 1.6922130584716797, + "learning_rate": 4.9546761706458836e-05, + "loss": 5.2519, + "step": 10208 + }, + { + "epoch": 0.06071581501570083, + "grad_norm": 1.7998334169387817, + "learning_rate": 4.954667316219151e-05, + "loss": 5.2272, + "step": 10209 + }, + { + "epoch": 0.060721762298981824, + "grad_norm": 1.6331555843353271, + "learning_rate": 4.95465846093552e-05, + "loss": 5.1382, + "step": 10210 + }, + { + "epoch": 0.06072770958226282, + "grad_norm": 1.4777888059616089, + "learning_rate": 4.954649604794993e-05, + "loss": 5.0601, + "step": 10211 + }, + { + "epoch": 0.06073365686554382, + "grad_norm": 1.6776998043060303, + "learning_rate": 4.954640747797573e-05, + "loss": 5.0229, + "step": 10212 + }, + { + "epoch": 0.060739604148824816, + "grad_norm": 1.9567780494689941, + "learning_rate": 4.9546318899432634e-05, + "loss": 5.483, + "step": 10213 + }, + { + "epoch": 0.06074555143210581, + "grad_norm": 1.7381116151809692, + "learning_rate": 4.9546230312320664e-05, + "loss": 5.4088, + "step": 10214 + }, + { + "epoch": 0.06075149871538681, + "grad_norm": 2.290041446685791, + "learning_rate": 4.954614171663986e-05, + "loss": 5.0879, + "step": 10215 + }, + { + "epoch": 0.06075744599866781, + "grad_norm": 1.680309534072876, + "learning_rate": 4.9546053112390255e-05, + "loss": 5.1931, + "step": 10216 + }, + { + "epoch": 0.0607633932819488, + "grad_norm": 1.997379183769226, + "learning_rate": 4.9545964499571885e-05, + "loss": 5.0834, + "step": 10217 + }, + { + "epoch": 0.060769340565229805, + "grad_norm": 1.9145865440368652, + "learning_rate": 4.954587587818476e-05, + "loss": 5.3478, + "step": 10218 + }, + { + "epoch": 0.0607752878485108, + "grad_norm": 1.565874457359314, + "learning_rate": 4.954578724822893e-05, + "loss": 5.2579, + "step": 10219 + }, + { + "epoch": 0.060781235131791796, + "grad_norm": 1.5997511148452759, + "learning_rate": 4.9545698609704416e-05, + "loss": 5.233, + "step": 10220 + }, + { + "epoch": 0.0607871824150728, + "grad_norm": 2.205021619796753, + "learning_rate": 4.954560996261125e-05, + "loss": 5.227, + "step": 10221 + }, + { + "epoch": 0.06079312969835379, + "grad_norm": 1.5360487699508667, + "learning_rate": 4.954552130694947e-05, + "loss": 5.182, + "step": 10222 + }, + { + "epoch": 0.06079907698163479, + "grad_norm": 1.5571166276931763, + "learning_rate": 4.95454326427191e-05, + "loss": 5.3671, + "step": 10223 + }, + { + "epoch": 0.06080502426491579, + "grad_norm": 1.7289685010910034, + "learning_rate": 4.9545343969920175e-05, + "loss": 5.1256, + "step": 10224 + }, + { + "epoch": 0.060810971548196785, + "grad_norm": 1.7945314645767212, + "learning_rate": 4.954525528855272e-05, + "loss": 5.0339, + "step": 10225 + }, + { + "epoch": 0.06081691883147778, + "grad_norm": 1.7037841081619263, + "learning_rate": 4.954516659861678e-05, + "loss": 4.9308, + "step": 10226 + }, + { + "epoch": 0.060822866114758775, + "grad_norm": 1.8096303939819336, + "learning_rate": 4.954507790011237e-05, + "loss": 5.1173, + "step": 10227 + }, + { + "epoch": 0.06082881339803978, + "grad_norm": 1.7563896179199219, + "learning_rate": 4.954498919303952e-05, + "loss": 5.1713, + "step": 10228 + }, + { + "epoch": 0.06083476068132077, + "grad_norm": 1.8820421695709229, + "learning_rate": 4.954490047739827e-05, + "loss": 5.2372, + "step": 10229 + }, + { + "epoch": 0.06084070796460177, + "grad_norm": 2.7050085067749023, + "learning_rate": 4.954481175318865e-05, + "loss": 5.6108, + "step": 10230 + }, + { + "epoch": 0.06084665524788277, + "grad_norm": 1.6424611806869507, + "learning_rate": 4.954472302041069e-05, + "loss": 5.1423, + "step": 10231 + }, + { + "epoch": 0.060852602531163764, + "grad_norm": 1.7690013647079468, + "learning_rate": 4.954463427906443e-05, + "loss": 5.0232, + "step": 10232 + }, + { + "epoch": 0.06085854981444476, + "grad_norm": 1.8925920724868774, + "learning_rate": 4.9544545529149874e-05, + "loss": 4.8949, + "step": 10233 + }, + { + "epoch": 0.06086449709772576, + "grad_norm": 1.7629793882369995, + "learning_rate": 4.954445677066709e-05, + "loss": 4.8832, + "step": 10234 + }, + { + "epoch": 0.06087044438100676, + "grad_norm": 1.5553311109542847, + "learning_rate": 4.9544368003616084e-05, + "loss": 4.8787, + "step": 10235 + }, + { + "epoch": 0.06087639166428775, + "grad_norm": 1.6236152648925781, + "learning_rate": 4.9544279227996884e-05, + "loss": 4.8583, + "step": 10236 + }, + { + "epoch": 0.060882338947568754, + "grad_norm": 1.7591924667358398, + "learning_rate": 4.954419044380954e-05, + "loss": 5.1468, + "step": 10237 + }, + { + "epoch": 0.06088828623084975, + "grad_norm": 1.8084702491760254, + "learning_rate": 4.954410165105406e-05, + "loss": 5.3178, + "step": 10238 + }, + { + "epoch": 0.060894233514130744, + "grad_norm": 1.6629832983016968, + "learning_rate": 4.9544012849730495e-05, + "loss": 5.2955, + "step": 10239 + }, + { + "epoch": 0.06090018079741174, + "grad_norm": 1.6681956052780151, + "learning_rate": 4.954392403983887e-05, + "loss": 4.9919, + "step": 10240 + }, + { + "epoch": 0.06090612808069274, + "grad_norm": 1.7849150896072388, + "learning_rate": 4.954383522137922e-05, + "loss": 4.9667, + "step": 10241 + }, + { + "epoch": 0.060912075363973736, + "grad_norm": 1.6313222646713257, + "learning_rate": 4.954374639435157e-05, + "loss": 4.9842, + "step": 10242 + }, + { + "epoch": 0.06091802264725473, + "grad_norm": 1.3376604318618774, + "learning_rate": 4.954365755875594e-05, + "loss": 5.2643, + "step": 10243 + }, + { + "epoch": 0.06092396993053573, + "grad_norm": 1.5971726179122925, + "learning_rate": 4.954356871459238e-05, + "loss": 5.2225, + "step": 10244 + }, + { + "epoch": 0.06092991721381673, + "grad_norm": 1.638786792755127, + "learning_rate": 4.954347986186091e-05, + "loss": 5.2855, + "step": 10245 + }, + { + "epoch": 0.06093586449709772, + "grad_norm": 1.6273027658462524, + "learning_rate": 4.954339100056157e-05, + "loss": 5.3825, + "step": 10246 + }, + { + "epoch": 0.060941811780378725, + "grad_norm": 1.4666591882705688, + "learning_rate": 4.954330213069438e-05, + "loss": 5.3148, + "step": 10247 + }, + { + "epoch": 0.06094775906365972, + "grad_norm": 1.447332501411438, + "learning_rate": 4.954321325225938e-05, + "loss": 5.1907, + "step": 10248 + }, + { + "epoch": 0.060953706346940716, + "grad_norm": 1.7162379026412964, + "learning_rate": 4.95431243652566e-05, + "loss": 5.289, + "step": 10249 + }, + { + "epoch": 0.06095965363022172, + "grad_norm": 1.7236372232437134, + "learning_rate": 4.954303546968606e-05, + "loss": 5.1839, + "step": 10250 + }, + { + "epoch": 0.06096560091350271, + "grad_norm": 1.76384437084198, + "learning_rate": 4.954294656554781e-05, + "loss": 5.1665, + "step": 10251 + }, + { + "epoch": 0.06097154819678371, + "grad_norm": 1.595041275024414, + "learning_rate": 4.954285765284187e-05, + "loss": 5.2667, + "step": 10252 + }, + { + "epoch": 0.06097749548006471, + "grad_norm": 1.6735886335372925, + "learning_rate": 4.954276873156827e-05, + "loss": 5.3367, + "step": 10253 + }, + { + "epoch": 0.060983442763345705, + "grad_norm": 1.656801462173462, + "learning_rate": 4.9542679801727044e-05, + "loss": 5.3188, + "step": 10254 + }, + { + "epoch": 0.0609893900466267, + "grad_norm": 1.7149133682250977, + "learning_rate": 4.9542590863318214e-05, + "loss": 5.0618, + "step": 10255 + }, + { + "epoch": 0.060995337329907695, + "grad_norm": 1.715561032295227, + "learning_rate": 4.954250191634183e-05, + "loss": 5.2589, + "step": 10256 + }, + { + "epoch": 0.0610012846131887, + "grad_norm": 1.4005486965179443, + "learning_rate": 4.95424129607979e-05, + "loss": 5.1061, + "step": 10257 + }, + { + "epoch": 0.06100723189646969, + "grad_norm": 1.6608542203903198, + "learning_rate": 4.954232399668648e-05, + "loss": 5.3779, + "step": 10258 + }, + { + "epoch": 0.06101317917975069, + "grad_norm": 1.5471054315567017, + "learning_rate": 4.954223502400758e-05, + "loss": 5.448, + "step": 10259 + }, + { + "epoch": 0.06101912646303169, + "grad_norm": 1.6794294118881226, + "learning_rate": 4.9542146042761246e-05, + "loss": 5.1452, + "step": 10260 + }, + { + "epoch": 0.061025073746312684, + "grad_norm": 1.5416966676712036, + "learning_rate": 4.95420570529475e-05, + "loss": 5.2192, + "step": 10261 + }, + { + "epoch": 0.06103102102959368, + "grad_norm": 1.6667221784591675, + "learning_rate": 4.954196805456637e-05, + "loss": 5.3682, + "step": 10262 + }, + { + "epoch": 0.06103696831287468, + "grad_norm": 1.3199689388275146, + "learning_rate": 4.95418790476179e-05, + "loss": 5.1038, + "step": 10263 + }, + { + "epoch": 0.06104291559615568, + "grad_norm": 1.5326366424560547, + "learning_rate": 4.954179003210211e-05, + "loss": 5.3002, + "step": 10264 + }, + { + "epoch": 0.06104886287943667, + "grad_norm": 1.529453992843628, + "learning_rate": 4.954170100801904e-05, + "loss": 5.4515, + "step": 10265 + }, + { + "epoch": 0.061054810162717674, + "grad_norm": 1.719894528388977, + "learning_rate": 4.954161197536871e-05, + "loss": 5.4161, + "step": 10266 + }, + { + "epoch": 0.06106075744599867, + "grad_norm": 1.4632771015167236, + "learning_rate": 4.954152293415115e-05, + "loss": 5.4669, + "step": 10267 + }, + { + "epoch": 0.061066704729279664, + "grad_norm": 1.7698414325714111, + "learning_rate": 4.954143388436641e-05, + "loss": 5.4045, + "step": 10268 + }, + { + "epoch": 0.06107265201256066, + "grad_norm": 1.6944139003753662, + "learning_rate": 4.95413448260145e-05, + "loss": 5.3637, + "step": 10269 + }, + { + "epoch": 0.06107859929584166, + "grad_norm": 1.6832401752471924, + "learning_rate": 4.954125575909547e-05, + "loss": 5.2123, + "step": 10270 + }, + { + "epoch": 0.061084546579122656, + "grad_norm": 1.6782628297805786, + "learning_rate": 4.954116668360933e-05, + "loss": 5.3007, + "step": 10271 + }, + { + "epoch": 0.06109049386240365, + "grad_norm": 1.598941683769226, + "learning_rate": 4.954107759955613e-05, + "loss": 5.1452, + "step": 10272 + }, + { + "epoch": 0.06109644114568465, + "grad_norm": 1.4137005805969238, + "learning_rate": 4.954098850693589e-05, + "loss": 5.1348, + "step": 10273 + }, + { + "epoch": 0.06110238842896565, + "grad_norm": 1.388108730316162, + "learning_rate": 4.9540899405748646e-05, + "loss": 5.4108, + "step": 10274 + }, + { + "epoch": 0.06110833571224664, + "grad_norm": 1.5997217893600464, + "learning_rate": 4.954081029599443e-05, + "loss": 5.3727, + "step": 10275 + }, + { + "epoch": 0.061114282995527645, + "grad_norm": 1.5805003643035889, + "learning_rate": 4.954072117767327e-05, + "loss": 5.4151, + "step": 10276 + }, + { + "epoch": 0.06112023027880864, + "grad_norm": 1.402063250541687, + "learning_rate": 4.9540632050785194e-05, + "loss": 5.287, + "step": 10277 + }, + { + "epoch": 0.061126177562089636, + "grad_norm": 1.6100205183029175, + "learning_rate": 4.9540542915330236e-05, + "loss": 5.2047, + "step": 10278 + }, + { + "epoch": 0.06113212484537064, + "grad_norm": 1.6199030876159668, + "learning_rate": 4.9540453771308435e-05, + "loss": 5.2141, + "step": 10279 + }, + { + "epoch": 0.06113807212865163, + "grad_norm": 1.485408067703247, + "learning_rate": 4.95403646187198e-05, + "loss": 5.1893, + "step": 10280 + }, + { + "epoch": 0.06114401941193263, + "grad_norm": 1.5842605829238892, + "learning_rate": 4.9540275457564395e-05, + "loss": 5.1383, + "step": 10281 + }, + { + "epoch": 0.06114996669521363, + "grad_norm": 1.5824682712554932, + "learning_rate": 4.9540186287842225e-05, + "loss": 5.1754, + "step": 10282 + }, + { + "epoch": 0.061155913978494625, + "grad_norm": 1.7714753150939941, + "learning_rate": 4.954009710955333e-05, + "loss": 5.2951, + "step": 10283 + }, + { + "epoch": 0.06116186126177562, + "grad_norm": 1.6528159379959106, + "learning_rate": 4.954000792269774e-05, + "loss": 5.2391, + "step": 10284 + }, + { + "epoch": 0.061167808545056615, + "grad_norm": 1.54135262966156, + "learning_rate": 4.953991872727549e-05, + "loss": 5.3849, + "step": 10285 + }, + { + "epoch": 0.06117375582833762, + "grad_norm": 1.4225090742111206, + "learning_rate": 4.953982952328661e-05, + "loss": 5.2211, + "step": 10286 + }, + { + "epoch": 0.06117970311161861, + "grad_norm": 1.7174444198608398, + "learning_rate": 4.953974031073112e-05, + "loss": 5.2873, + "step": 10287 + }, + { + "epoch": 0.06118565039489961, + "grad_norm": 1.4754962921142578, + "learning_rate": 4.953965108960907e-05, + "loss": 5.3137, + "step": 10288 + }, + { + "epoch": 0.06119159767818061, + "grad_norm": 1.6911029815673828, + "learning_rate": 4.9539561859920475e-05, + "loss": 5.1914, + "step": 10289 + }, + { + "epoch": 0.061197544961461604, + "grad_norm": 1.5569958686828613, + "learning_rate": 4.953947262166537e-05, + "loss": 5.2141, + "step": 10290 + }, + { + "epoch": 0.0612034922447426, + "grad_norm": 1.5939570665359497, + "learning_rate": 4.9539383374843794e-05, + "loss": 5.2059, + "step": 10291 + }, + { + "epoch": 0.0612094395280236, + "grad_norm": 1.7220442295074463, + "learning_rate": 4.953929411945577e-05, + "loss": 5.3399, + "step": 10292 + }, + { + "epoch": 0.061215386811304597, + "grad_norm": 1.7158905267715454, + "learning_rate": 4.953920485550134e-05, + "loss": 5.3392, + "step": 10293 + }, + { + "epoch": 0.06122133409458559, + "grad_norm": 1.5761021375656128, + "learning_rate": 4.9539115582980525e-05, + "loss": 5.1523, + "step": 10294 + }, + { + "epoch": 0.061227281377866594, + "grad_norm": 1.7746198177337646, + "learning_rate": 4.953902630189335e-05, + "loss": 5.1577, + "step": 10295 + }, + { + "epoch": 0.06123322866114759, + "grad_norm": 1.9633466005325317, + "learning_rate": 4.953893701223986e-05, + "loss": 5.448, + "step": 10296 + }, + { + "epoch": 0.061239175944428584, + "grad_norm": 1.7086774110794067, + "learning_rate": 4.953884771402007e-05, + "loss": 5.2624, + "step": 10297 + }, + { + "epoch": 0.06124512322770958, + "grad_norm": 1.5247907638549805, + "learning_rate": 4.953875840723403e-05, + "loss": 5.1644, + "step": 10298 + }, + { + "epoch": 0.06125107051099058, + "grad_norm": 1.7014293670654297, + "learning_rate": 4.953866909188177e-05, + "loss": 5.2118, + "step": 10299 + }, + { + "epoch": 0.061257017794271576, + "grad_norm": 1.390368103981018, + "learning_rate": 4.9538579767963305e-05, + "loss": 5.3159, + "step": 10300 + }, + { + "epoch": 0.06126296507755257, + "grad_norm": 1.4748090505599976, + "learning_rate": 4.953849043547868e-05, + "loss": 5.5283, + "step": 10301 + }, + { + "epoch": 0.06126891236083357, + "grad_norm": 1.6433857679367065, + "learning_rate": 4.953840109442792e-05, + "loss": 5.3388, + "step": 10302 + }, + { + "epoch": 0.06127485964411457, + "grad_norm": 1.6636543273925781, + "learning_rate": 4.9538311744811056e-05, + "loss": 5.4523, + "step": 10303 + }, + { + "epoch": 0.06128080692739556, + "grad_norm": 1.6074668169021606, + "learning_rate": 4.953822238662812e-05, + "loss": 5.2963, + "step": 10304 + }, + { + "epoch": 0.061286754210676565, + "grad_norm": 1.8746674060821533, + "learning_rate": 4.9538133019879155e-05, + "loss": 5.359, + "step": 10305 + }, + { + "epoch": 0.06129270149395756, + "grad_norm": 1.5438963174819946, + "learning_rate": 4.953804364456417e-05, + "loss": 5.2039, + "step": 10306 + }, + { + "epoch": 0.061298648777238555, + "grad_norm": 1.5594170093536377, + "learning_rate": 4.9537954260683205e-05, + "loss": 5.3003, + "step": 10307 + }, + { + "epoch": 0.06130459606051956, + "grad_norm": 1.3331657648086548, + "learning_rate": 4.95378648682363e-05, + "loss": 5.3051, + "step": 10308 + }, + { + "epoch": 0.06131054334380055, + "grad_norm": 1.5514707565307617, + "learning_rate": 4.953777546722348e-05, + "loss": 5.3344, + "step": 10309 + }, + { + "epoch": 0.06131649062708155, + "grad_norm": 1.6396936178207397, + "learning_rate": 4.953768605764477e-05, + "loss": 5.1244, + "step": 10310 + }, + { + "epoch": 0.06132243791036255, + "grad_norm": 1.576407551765442, + "learning_rate": 4.953759663950022e-05, + "loss": 5.1908, + "step": 10311 + }, + { + "epoch": 0.061328385193643545, + "grad_norm": 1.5868182182312012, + "learning_rate": 4.953750721278984e-05, + "loss": 5.2538, + "step": 10312 + }, + { + "epoch": 0.06133433247692454, + "grad_norm": 1.7734450101852417, + "learning_rate": 4.9537417777513664e-05, + "loss": 5.3727, + "step": 10313 + }, + { + "epoch": 0.061340279760205535, + "grad_norm": 1.5105754137039185, + "learning_rate": 4.953732833367174e-05, + "loss": 5.3547, + "step": 10314 + }, + { + "epoch": 0.06134622704348654, + "grad_norm": 1.5607833862304688, + "learning_rate": 4.953723888126408e-05, + "loss": 5.2265, + "step": 10315 + }, + { + "epoch": 0.06135217432676753, + "grad_norm": 1.2882065773010254, + "learning_rate": 4.9537149420290726e-05, + "loss": 4.9719, + "step": 10316 + }, + { + "epoch": 0.06135812161004853, + "grad_norm": 1.4349958896636963, + "learning_rate": 4.953705995075171e-05, + "loss": 5.2773, + "step": 10317 + }, + { + "epoch": 0.06136406889332953, + "grad_norm": 2.3595380783081055, + "learning_rate": 4.953697047264706e-05, + "loss": 5.7403, + "step": 10318 + }, + { + "epoch": 0.061370016176610524, + "grad_norm": 1.6126785278320312, + "learning_rate": 4.9536880985976805e-05, + "loss": 5.5316, + "step": 10319 + }, + { + "epoch": 0.06137596345989152, + "grad_norm": 1.7738999128341675, + "learning_rate": 4.953679149074098e-05, + "loss": 5.602, + "step": 10320 + }, + { + "epoch": 0.06138191074317252, + "grad_norm": 1.9263441562652588, + "learning_rate": 4.953670198693961e-05, + "loss": 5.0669, + "step": 10321 + }, + { + "epoch": 0.061387858026453516, + "grad_norm": 1.6290051937103271, + "learning_rate": 4.953661247457273e-05, + "loss": 5.2163, + "step": 10322 + }, + { + "epoch": 0.06139380530973451, + "grad_norm": 1.6354936361312866, + "learning_rate": 4.9536522953640374e-05, + "loss": 5.1678, + "step": 10323 + }, + { + "epoch": 0.061399752593015514, + "grad_norm": 1.7600759267807007, + "learning_rate": 4.953643342414257e-05, + "loss": 5.946, + "step": 10324 + }, + { + "epoch": 0.06140569987629651, + "grad_norm": 2.0515828132629395, + "learning_rate": 4.9536343886079357e-05, + "loss": 5.463, + "step": 10325 + }, + { + "epoch": 0.061411647159577504, + "grad_norm": 1.9990586042404175, + "learning_rate": 4.9536254339450754e-05, + "loss": 5.3084, + "step": 10326 + }, + { + "epoch": 0.0614175944428585, + "grad_norm": 1.7596598863601685, + "learning_rate": 4.95361647842568e-05, + "loss": 5.9268, + "step": 10327 + }, + { + "epoch": 0.0614235417261395, + "grad_norm": 1.8702850341796875, + "learning_rate": 4.953607522049752e-05, + "loss": 5.4303, + "step": 10328 + }, + { + "epoch": 0.061429489009420496, + "grad_norm": 1.9598991870880127, + "learning_rate": 4.953598564817296e-05, + "loss": 5.1813, + "step": 10329 + }, + { + "epoch": 0.06143543629270149, + "grad_norm": 1.5180566310882568, + "learning_rate": 4.953589606728314e-05, + "loss": 5.6051, + "step": 10330 + }, + { + "epoch": 0.06144138357598249, + "grad_norm": 1.4654324054718018, + "learning_rate": 4.953580647782808e-05, + "loss": 5.7188, + "step": 10331 + }, + { + "epoch": 0.06144733085926349, + "grad_norm": 1.351413607597351, + "learning_rate": 4.9535716879807835e-05, + "loss": 5.6928, + "step": 10332 + }, + { + "epoch": 0.06145327814254448, + "grad_norm": 1.4495320320129395, + "learning_rate": 4.953562727322242e-05, + "loss": 5.5576, + "step": 10333 + }, + { + "epoch": 0.061459225425825485, + "grad_norm": 1.4851731061935425, + "learning_rate": 4.953553765807187e-05, + "loss": 5.31, + "step": 10334 + }, + { + "epoch": 0.06146517270910648, + "grad_norm": 1.9790018796920776, + "learning_rate": 4.953544803435622e-05, + "loss": 5.5375, + "step": 10335 + }, + { + "epoch": 0.061471119992387475, + "grad_norm": 1.6931076049804688, + "learning_rate": 4.953535840207549e-05, + "loss": 5.6863, + "step": 10336 + }, + { + "epoch": 0.06147706727566848, + "grad_norm": 1.7479010820388794, + "learning_rate": 4.9535268761229735e-05, + "loss": 5.571, + "step": 10337 + }, + { + "epoch": 0.06148301455894947, + "grad_norm": 2.0722434520721436, + "learning_rate": 4.953517911181896e-05, + "loss": 5.2462, + "step": 10338 + }, + { + "epoch": 0.06148896184223047, + "grad_norm": 2.125288486480713, + "learning_rate": 4.953508945384322e-05, + "loss": 5.6343, + "step": 10339 + }, + { + "epoch": 0.06149490912551147, + "grad_norm": 2.0187058448791504, + "learning_rate": 4.953499978730252e-05, + "loss": 5.8642, + "step": 10340 + }, + { + "epoch": 0.061500856408792465, + "grad_norm": 1.6849068403244019, + "learning_rate": 4.9534910112196906e-05, + "loss": 5.5534, + "step": 10341 + }, + { + "epoch": 0.06150680369207346, + "grad_norm": 2.008009433746338, + "learning_rate": 4.953482042852641e-05, + "loss": 5.464, + "step": 10342 + }, + { + "epoch": 0.061512750975354455, + "grad_norm": 1.7537699937820435, + "learning_rate": 4.953473073629107e-05, + "loss": 5.9052, + "step": 10343 + }, + { + "epoch": 0.06151869825863546, + "grad_norm": 1.5746090412139893, + "learning_rate": 4.95346410354909e-05, + "loss": 5.6898, + "step": 10344 + }, + { + "epoch": 0.06152464554191645, + "grad_norm": 2.027543783187866, + "learning_rate": 4.9534551326125944e-05, + "loss": 6.0481, + "step": 10345 + }, + { + "epoch": 0.06153059282519745, + "grad_norm": 1.6113003492355347, + "learning_rate": 4.9534461608196224e-05, + "loss": 5.4792, + "step": 10346 + }, + { + "epoch": 0.06153654010847845, + "grad_norm": 1.5709928274154663, + "learning_rate": 4.953437188170178e-05, + "loss": 5.7601, + "step": 10347 + }, + { + "epoch": 0.061542487391759444, + "grad_norm": 1.7116700410842896, + "learning_rate": 4.953428214664265e-05, + "loss": 5.7284, + "step": 10348 + }, + { + "epoch": 0.06154843467504044, + "grad_norm": 2.262103796005249, + "learning_rate": 4.953419240301884e-05, + "loss": 5.7247, + "step": 10349 + }, + { + "epoch": 0.06155438195832144, + "grad_norm": 1.8536508083343506, + "learning_rate": 4.9534102650830406e-05, + "loss": 5.7509, + "step": 10350 + }, + { + "epoch": 0.061560329241602436, + "grad_norm": 2.1372785568237305, + "learning_rate": 4.953401289007737e-05, + "loss": 5.8436, + "step": 10351 + }, + { + "epoch": 0.06156627652488343, + "grad_norm": 2.5555527210235596, + "learning_rate": 4.953392312075976e-05, + "loss": 5.6481, + "step": 10352 + }, + { + "epoch": 0.061572223808164434, + "grad_norm": 2.607111692428589, + "learning_rate": 4.953383334287761e-05, + "loss": 5.4822, + "step": 10353 + }, + { + "epoch": 0.06157817109144543, + "grad_norm": 2.728994369506836, + "learning_rate": 4.953374355643095e-05, + "loss": 5.4327, + "step": 10354 + }, + { + "epoch": 0.061584118374726424, + "grad_norm": 2.3375606536865234, + "learning_rate": 4.953365376141983e-05, + "loss": 5.537, + "step": 10355 + }, + { + "epoch": 0.06159006565800742, + "grad_norm": 2.4509146213531494, + "learning_rate": 4.953356395784425e-05, + "loss": 5.5717, + "step": 10356 + }, + { + "epoch": 0.06159601294128842, + "grad_norm": 2.412198781967163, + "learning_rate": 4.953347414570426e-05, + "loss": 5.5216, + "step": 10357 + }, + { + "epoch": 0.061601960224569416, + "grad_norm": 1.7105822563171387, + "learning_rate": 4.9533384324999886e-05, + "loss": 5.6661, + "step": 10358 + }, + { + "epoch": 0.06160790750785041, + "grad_norm": 2.2394793033599854, + "learning_rate": 4.953329449573116e-05, + "loss": 5.2062, + "step": 10359 + }, + { + "epoch": 0.06161385479113141, + "grad_norm": 2.1791203022003174, + "learning_rate": 4.9533204657898127e-05, + "loss": 5.1961, + "step": 10360 + }, + { + "epoch": 0.06161980207441241, + "grad_norm": 2.0430495738983154, + "learning_rate": 4.953311481150079e-05, + "loss": 5.1492, + "step": 10361 + }, + { + "epoch": 0.0616257493576934, + "grad_norm": 2.157975435256958, + "learning_rate": 4.9533024956539204e-05, + "loss": 4.9354, + "step": 10362 + }, + { + "epoch": 0.061631696640974405, + "grad_norm": 2.101484537124634, + "learning_rate": 4.953293509301339e-05, + "loss": 4.9212, + "step": 10363 + }, + { + "epoch": 0.0616376439242554, + "grad_norm": 1.740793228149414, + "learning_rate": 4.953284522092338e-05, + "loss": 5.1234, + "step": 10364 + }, + { + "epoch": 0.061643591207536395, + "grad_norm": 1.9694514274597168, + "learning_rate": 4.953275534026921e-05, + "loss": 5.3688, + "step": 10365 + }, + { + "epoch": 0.0616495384908174, + "grad_norm": 2.0250589847564697, + "learning_rate": 4.953266545105091e-05, + "loss": 4.7194, + "step": 10366 + }, + { + "epoch": 0.06165548577409839, + "grad_norm": 2.016284942626953, + "learning_rate": 4.95325755532685e-05, + "loss": 4.7397, + "step": 10367 + }, + { + "epoch": 0.06166143305737939, + "grad_norm": 2.3073251247406006, + "learning_rate": 4.9532485646922036e-05, + "loss": 4.59, + "step": 10368 + }, + { + "epoch": 0.06166738034066039, + "grad_norm": 2.265873670578003, + "learning_rate": 4.9532395732011524e-05, + "loss": 4.7713, + "step": 10369 + }, + { + "epoch": 0.061673327623941385, + "grad_norm": 1.8176212310791016, + "learning_rate": 4.953230580853701e-05, + "loss": 5.2288, + "step": 10370 + }, + { + "epoch": 0.06167927490722238, + "grad_norm": 2.3636794090270996, + "learning_rate": 4.953221587649852e-05, + "loss": 5.1683, + "step": 10371 + }, + { + "epoch": 0.061685222190503375, + "grad_norm": 1.8074215650558472, + "learning_rate": 4.953212593589609e-05, + "loss": 6.037, + "step": 10372 + }, + { + "epoch": 0.06169116947378438, + "grad_norm": 2.1368768215179443, + "learning_rate": 4.953203598672975e-05, + "loss": 5.8481, + "step": 10373 + }, + { + "epoch": 0.06169711675706537, + "grad_norm": 2.924474000930786, + "learning_rate": 4.953194602899952e-05, + "loss": 4.327, + "step": 10374 + }, + { + "epoch": 0.06170306404034637, + "grad_norm": 2.412336826324463, + "learning_rate": 4.953185606270545e-05, + "loss": 4.3885, + "step": 10375 + }, + { + "epoch": 0.06170901132362737, + "grad_norm": 1.9676904678344727, + "learning_rate": 4.953176608784756e-05, + "loss": 5.4581, + "step": 10376 + }, + { + "epoch": 0.061714958606908364, + "grad_norm": 2.1357827186584473, + "learning_rate": 4.953167610442588e-05, + "loss": 6.1762, + "step": 10377 + }, + { + "epoch": 0.06172090589018936, + "grad_norm": 1.912763237953186, + "learning_rate": 4.953158611244045e-05, + "loss": 6.3403, + "step": 10378 + }, + { + "epoch": 0.06172685317347036, + "grad_norm": 2.0528855323791504, + "learning_rate": 4.95314961118913e-05, + "loss": 6.1921, + "step": 10379 + }, + { + "epoch": 0.061732800456751356, + "grad_norm": 2.1858723163604736, + "learning_rate": 4.953140610277846e-05, + "loss": 5.1944, + "step": 10380 + }, + { + "epoch": 0.06173874774003235, + "grad_norm": 2.04040265083313, + "learning_rate": 4.9531316085101944e-05, + "loss": 5.1866, + "step": 10381 + }, + { + "epoch": 0.06174469502331335, + "grad_norm": 2.216113567352295, + "learning_rate": 4.953122605886181e-05, + "loss": 5.5625, + "step": 10382 + }, + { + "epoch": 0.06175064230659435, + "grad_norm": 1.7107234001159668, + "learning_rate": 4.9531136024058076e-05, + "loss": 5.917, + "step": 10383 + }, + { + "epoch": 0.061756589589875344, + "grad_norm": 1.983104944229126, + "learning_rate": 4.9531045980690776e-05, + "loss": 6.0113, + "step": 10384 + }, + { + "epoch": 0.06176253687315634, + "grad_norm": 2.0186147689819336, + "learning_rate": 4.9530955928759945e-05, + "loss": 6.5227, + "step": 10385 + }, + { + "epoch": 0.06176848415643734, + "grad_norm": 1.8337477445602417, + "learning_rate": 4.9530865868265605e-05, + "loss": 5.9586, + "step": 10386 + }, + { + "epoch": 0.061774431439718336, + "grad_norm": 1.6523345708847046, + "learning_rate": 4.9530775799207795e-05, + "loss": 5.7073, + "step": 10387 + }, + { + "epoch": 0.06178037872299933, + "grad_norm": 1.617838740348816, + "learning_rate": 4.953068572158654e-05, + "loss": 5.3771, + "step": 10388 + }, + { + "epoch": 0.06178632600628033, + "grad_norm": 1.7327697277069092, + "learning_rate": 4.953059563540189e-05, + "loss": 5.3021, + "step": 10389 + }, + { + "epoch": 0.06179227328956133, + "grad_norm": 2.726762294769287, + "learning_rate": 4.9530505540653856e-05, + "loss": 5.2568, + "step": 10390 + }, + { + "epoch": 0.06179822057284232, + "grad_norm": 2.540090560913086, + "learning_rate": 4.953041543734247e-05, + "loss": 5.114, + "step": 10391 + }, + { + "epoch": 0.061804167856123325, + "grad_norm": 2.26487135887146, + "learning_rate": 4.953032532546777e-05, + "loss": 5.2552, + "step": 10392 + }, + { + "epoch": 0.06181011513940432, + "grad_norm": 1.9986075162887573, + "learning_rate": 4.95302352050298e-05, + "loss": 5.3555, + "step": 10393 + }, + { + "epoch": 0.061816062422685315, + "grad_norm": 2.2121987342834473, + "learning_rate": 4.9530145076028564e-05, + "loss": 5.665, + "step": 10394 + }, + { + "epoch": 0.06182200970596632, + "grad_norm": 1.892927646636963, + "learning_rate": 4.953005493846411e-05, + "loss": 5.2536, + "step": 10395 + }, + { + "epoch": 0.06182795698924731, + "grad_norm": 2.1083126068115234, + "learning_rate": 4.952996479233647e-05, + "loss": 6.1748, + "step": 10396 + }, + { + "epoch": 0.06183390427252831, + "grad_norm": 2.2235448360443115, + "learning_rate": 4.9529874637645675e-05, + "loss": 6.0676, + "step": 10397 + }, + { + "epoch": 0.06183985155580931, + "grad_norm": 2.0888702869415283, + "learning_rate": 4.952978447439175e-05, + "loss": 5.2515, + "step": 10398 + }, + { + "epoch": 0.061845798839090305, + "grad_norm": 1.826622724533081, + "learning_rate": 4.9529694302574736e-05, + "loss": 5.6849, + "step": 10399 + }, + { + "epoch": 0.0618517461223713, + "grad_norm": 1.9772933721542358, + "learning_rate": 4.952960412219465e-05, + "loss": 5.7702, + "step": 10400 + }, + { + "epoch": 0.061857693405652295, + "grad_norm": 2.2230029106140137, + "learning_rate": 4.952951393325154e-05, + "loss": 5.5747, + "step": 10401 + }, + { + "epoch": 0.0618636406889333, + "grad_norm": 1.9372552633285522, + "learning_rate": 4.9529423735745425e-05, + "loss": 5.4728, + "step": 10402 + }, + { + "epoch": 0.06186958797221429, + "grad_norm": 2.2238845825195312, + "learning_rate": 4.952933352967635e-05, + "loss": 5.2462, + "step": 10403 + }, + { + "epoch": 0.06187553525549529, + "grad_norm": 1.7716748714447021, + "learning_rate": 4.952924331504433e-05, + "loss": 5.5651, + "step": 10404 + }, + { + "epoch": 0.06188148253877629, + "grad_norm": 2.2933645248413086, + "learning_rate": 4.9529153091849405e-05, + "loss": 5.8684, + "step": 10405 + }, + { + "epoch": 0.061887429822057284, + "grad_norm": 2.222883939743042, + "learning_rate": 4.9529062860091616e-05, + "loss": 5.8427, + "step": 10406 + }, + { + "epoch": 0.06189337710533828, + "grad_norm": 1.645338773727417, + "learning_rate": 4.9528972619770975e-05, + "loss": 5.7001, + "step": 10407 + }, + { + "epoch": 0.06189932438861928, + "grad_norm": 2.1029653549194336, + "learning_rate": 4.952888237088752e-05, + "loss": 5.728, + "step": 10408 + }, + { + "epoch": 0.061905271671900276, + "grad_norm": 2.2689831256866455, + "learning_rate": 4.952879211344129e-05, + "loss": 5.4678, + "step": 10409 + }, + { + "epoch": 0.06191121895518127, + "grad_norm": 1.908469557762146, + "learning_rate": 4.9528701847432315e-05, + "loss": 6.007, + "step": 10410 + }, + { + "epoch": 0.06191716623846227, + "grad_norm": 1.819381833076477, + "learning_rate": 4.952861157286062e-05, + "loss": 6.2041, + "step": 10411 + }, + { + "epoch": 0.06192311352174327, + "grad_norm": 2.16945743560791, + "learning_rate": 4.952852128972624e-05, + "loss": 5.7757, + "step": 10412 + }, + { + "epoch": 0.061929060805024264, + "grad_norm": 2.1671459674835205, + "learning_rate": 4.952843099802921e-05, + "loss": 5.5212, + "step": 10413 + }, + { + "epoch": 0.061935008088305266, + "grad_norm": 1.730073094367981, + "learning_rate": 4.952834069776956e-05, + "loss": 5.809, + "step": 10414 + }, + { + "epoch": 0.06194095537158626, + "grad_norm": 2.1048457622528076, + "learning_rate": 4.952825038894732e-05, + "loss": 5.7219, + "step": 10415 + }, + { + "epoch": 0.061946902654867256, + "grad_norm": 2.7438642978668213, + "learning_rate": 4.9528160071562516e-05, + "loss": 5.6367, + "step": 10416 + }, + { + "epoch": 0.06195284993814825, + "grad_norm": 2.0103960037231445, + "learning_rate": 4.952806974561518e-05, + "loss": 5.1429, + "step": 10417 + }, + { + "epoch": 0.06195879722142925, + "grad_norm": 2.1754884719848633, + "learning_rate": 4.9527979411105354e-05, + "loss": 5.9337, + "step": 10418 + }, + { + "epoch": 0.06196474450471025, + "grad_norm": 2.553421974182129, + "learning_rate": 4.9527889068033063e-05, + "loss": 5.7076, + "step": 10419 + }, + { + "epoch": 0.06197069178799124, + "grad_norm": 2.0601327419281006, + "learning_rate": 4.952779871639834e-05, + "loss": 5.7855, + "step": 10420 + }, + { + "epoch": 0.061976639071272245, + "grad_norm": 2.0958025455474854, + "learning_rate": 4.952770835620122e-05, + "loss": 5.8621, + "step": 10421 + }, + { + "epoch": 0.06198258635455324, + "grad_norm": 2.2658755779266357, + "learning_rate": 4.952761798744172e-05, + "loss": 5.9306, + "step": 10422 + }, + { + "epoch": 0.061988533637834235, + "grad_norm": 1.933090090751648, + "learning_rate": 4.9527527610119896e-05, + "loss": 5.1557, + "step": 10423 + }, + { + "epoch": 0.06199448092111524, + "grad_norm": 2.5761375427246094, + "learning_rate": 4.952743722423575e-05, + "loss": 5.4438, + "step": 10424 + }, + { + "epoch": 0.06200042820439623, + "grad_norm": 2.0499768257141113, + "learning_rate": 4.9527346829789344e-05, + "loss": 5.4153, + "step": 10425 + }, + { + "epoch": 0.06200637548767723, + "grad_norm": 1.970674991607666, + "learning_rate": 4.952725642678069e-05, + "loss": 5.8678, + "step": 10426 + }, + { + "epoch": 0.06201232277095823, + "grad_norm": 2.4563233852386475, + "learning_rate": 4.9527166015209814e-05, + "loss": 4.926, + "step": 10427 + }, + { + "epoch": 0.062018270054239225, + "grad_norm": 1.8380508422851562, + "learning_rate": 4.9527075595076763e-05, + "loss": 4.9619, + "step": 10428 + }, + { + "epoch": 0.06202421733752022, + "grad_norm": 1.8930846452713013, + "learning_rate": 4.9526985166381565e-05, + "loss": 4.8252, + "step": 10429 + }, + { + "epoch": 0.062030164620801215, + "grad_norm": 2.401026725769043, + "learning_rate": 4.952689472912426e-05, + "loss": 4.5023, + "step": 10430 + }, + { + "epoch": 0.06203611190408222, + "grad_norm": 2.2801949977874756, + "learning_rate": 4.952680428330486e-05, + "loss": 4.6461, + "step": 10431 + }, + { + "epoch": 0.06204205918736321, + "grad_norm": 2.2466189861297607, + "learning_rate": 4.95267138289234e-05, + "loss": 4.5946, + "step": 10432 + }, + { + "epoch": 0.06204800647064421, + "grad_norm": 2.1723902225494385, + "learning_rate": 4.952662336597993e-05, + "loss": 5.6417, + "step": 10433 + }, + { + "epoch": 0.06205395375392521, + "grad_norm": 1.9614545106887817, + "learning_rate": 4.952653289447446e-05, + "loss": 5.0758, + "step": 10434 + }, + { + "epoch": 0.062059901037206204, + "grad_norm": 2.465252637863159, + "learning_rate": 4.9526442414407036e-05, + "loss": 4.6159, + "step": 10435 + }, + { + "epoch": 0.0620658483204872, + "grad_norm": 2.2298080921173096, + "learning_rate": 4.9526351925777684e-05, + "loss": 5.24, + "step": 10436 + }, + { + "epoch": 0.0620717956037682, + "grad_norm": 2.1284472942352295, + "learning_rate": 4.952626142858643e-05, + "loss": 4.5255, + "step": 10437 + }, + { + "epoch": 0.062077742887049196, + "grad_norm": 2.1340067386627197, + "learning_rate": 4.9526170922833314e-05, + "loss": 4.5931, + "step": 10438 + }, + { + "epoch": 0.06208369017033019, + "grad_norm": 2.20354962348938, + "learning_rate": 4.952608040851837e-05, + "loss": 4.7688, + "step": 10439 + }, + { + "epoch": 0.06208963745361119, + "grad_norm": 1.5250015258789062, + "learning_rate": 4.952598988564162e-05, + "loss": 5.3292, + "step": 10440 + }, + { + "epoch": 0.06209558473689219, + "grad_norm": 2.1667168140411377, + "learning_rate": 4.95258993542031e-05, + "loss": 5.6216, + "step": 10441 + }, + { + "epoch": 0.062101532020173184, + "grad_norm": 1.8172663450241089, + "learning_rate": 4.9525808814202846e-05, + "loss": 5.5813, + "step": 10442 + }, + { + "epoch": 0.062107479303454186, + "grad_norm": 1.9832731485366821, + "learning_rate": 4.9525718265640884e-05, + "loss": 5.4444, + "step": 10443 + }, + { + "epoch": 0.06211342658673518, + "grad_norm": 2.051358699798584, + "learning_rate": 4.952562770851724e-05, + "loss": 5.3488, + "step": 10444 + }, + { + "epoch": 0.062119373870016176, + "grad_norm": 2.1487104892730713, + "learning_rate": 4.952553714283196e-05, + "loss": 5.3803, + "step": 10445 + }, + { + "epoch": 0.06212532115329717, + "grad_norm": 2.086853504180908, + "learning_rate": 4.952544656858507e-05, + "loss": 5.4585, + "step": 10446 + }, + { + "epoch": 0.06213126843657817, + "grad_norm": 2.1599764823913574, + "learning_rate": 4.95253559857766e-05, + "loss": 5.3728, + "step": 10447 + }, + { + "epoch": 0.06213721571985917, + "grad_norm": 1.877626657485962, + "learning_rate": 4.9525265394406576e-05, + "loss": 5.433, + "step": 10448 + }, + { + "epoch": 0.06214316300314016, + "grad_norm": 2.022185802459717, + "learning_rate": 4.952517479447504e-05, + "loss": 5.6472, + "step": 10449 + }, + { + "epoch": 0.062149110286421165, + "grad_norm": 2.1667773723602295, + "learning_rate": 4.9525084185982015e-05, + "loss": 5.3174, + "step": 10450 + }, + { + "epoch": 0.06215505756970216, + "grad_norm": 1.6227883100509644, + "learning_rate": 4.952499356892753e-05, + "loss": 5.3747, + "step": 10451 + }, + { + "epoch": 0.062161004852983155, + "grad_norm": 1.935307502746582, + "learning_rate": 4.952490294331164e-05, + "loss": 5.7716, + "step": 10452 + }, + { + "epoch": 0.06216695213626416, + "grad_norm": 2.6584694385528564, + "learning_rate": 4.952481230913435e-05, + "loss": 5.3525, + "step": 10453 + }, + { + "epoch": 0.06217289941954515, + "grad_norm": 2.626344919204712, + "learning_rate": 4.9524721666395705e-05, + "loss": 5.2118, + "step": 10454 + }, + { + "epoch": 0.06217884670282615, + "grad_norm": 2.525580644607544, + "learning_rate": 4.9524631015095735e-05, + "loss": 5.1231, + "step": 10455 + }, + { + "epoch": 0.06218479398610715, + "grad_norm": 2.274801015853882, + "learning_rate": 4.9524540355234464e-05, + "loss": 5.0637, + "step": 10456 + }, + { + "epoch": 0.062190741269388145, + "grad_norm": 1.9937769174575806, + "learning_rate": 4.952444968681193e-05, + "loss": 5.8196, + "step": 10457 + }, + { + "epoch": 0.06219668855266914, + "grad_norm": 2.124290943145752, + "learning_rate": 4.952435900982816e-05, + "loss": 5.5221, + "step": 10458 + }, + { + "epoch": 0.062202635835950135, + "grad_norm": 2.2544684410095215, + "learning_rate": 4.95242683242832e-05, + "loss": 5.6656, + "step": 10459 + }, + { + "epoch": 0.06220858311923114, + "grad_norm": 2.2626397609710693, + "learning_rate": 4.952417763017706e-05, + "loss": 5.5836, + "step": 10460 + }, + { + "epoch": 0.06221453040251213, + "grad_norm": 1.9299595355987549, + "learning_rate": 4.9524086927509796e-05, + "loss": 5.6637, + "step": 10461 + }, + { + "epoch": 0.06222047768579313, + "grad_norm": 1.769463062286377, + "learning_rate": 4.952399621628142e-05, + "loss": 5.4836, + "step": 10462 + }, + { + "epoch": 0.06222642496907413, + "grad_norm": 1.6773936748504639, + "learning_rate": 4.952390549649196e-05, + "loss": 5.2894, + "step": 10463 + }, + { + "epoch": 0.062232372252355124, + "grad_norm": 1.7612723112106323, + "learning_rate": 4.952381476814148e-05, + "loss": 5.5438, + "step": 10464 + }, + { + "epoch": 0.06223831953563612, + "grad_norm": 2.5255069732666016, + "learning_rate": 4.952372403122997e-05, + "loss": 5.7864, + "step": 10465 + }, + { + "epoch": 0.06224426681891712, + "grad_norm": 2.1128363609313965, + "learning_rate": 4.9523633285757486e-05, + "loss": 5.6207, + "step": 10466 + }, + { + "epoch": 0.062250214102198116, + "grad_norm": 1.8612544536590576, + "learning_rate": 4.952354253172407e-05, + "loss": 5.9177, + "step": 10467 + }, + { + "epoch": 0.06225616138547911, + "grad_norm": 2.092707633972168, + "learning_rate": 4.9523451769129715e-05, + "loss": 5.6047, + "step": 10468 + }, + { + "epoch": 0.06226210866876011, + "grad_norm": 2.6695668697357178, + "learning_rate": 4.952336099797449e-05, + "loss": 5.4931, + "step": 10469 + }, + { + "epoch": 0.06226805595204111, + "grad_norm": 2.2714614868164062, + "learning_rate": 4.9523270218258414e-05, + "loss": 5.4481, + "step": 10470 + }, + { + "epoch": 0.0622740032353221, + "grad_norm": 2.035304307937622, + "learning_rate": 4.952317942998151e-05, + "loss": 5.3609, + "step": 10471 + }, + { + "epoch": 0.062279950518603105, + "grad_norm": 2.295647144317627, + "learning_rate": 4.952308863314382e-05, + "loss": 5.5687, + "step": 10472 + }, + { + "epoch": 0.0622858978018841, + "grad_norm": 1.8365178108215332, + "learning_rate": 4.9522997827745375e-05, + "loss": 5.4207, + "step": 10473 + }, + { + "epoch": 0.062291845085165096, + "grad_norm": 1.6130415201187134, + "learning_rate": 4.9522907013786206e-05, + "loss": 5.1894, + "step": 10474 + }, + { + "epoch": 0.06229779236844609, + "grad_norm": 2.01560115814209, + "learning_rate": 4.952281619126634e-05, + "loss": 5.4956, + "step": 10475 + }, + { + "epoch": 0.06230373965172709, + "grad_norm": 2.7854549884796143, + "learning_rate": 4.952272536018582e-05, + "loss": 5.2341, + "step": 10476 + }, + { + "epoch": 0.06230968693500809, + "grad_norm": 2.7532944679260254, + "learning_rate": 4.9522634520544666e-05, + "loss": 5.1863, + "step": 10477 + }, + { + "epoch": 0.06231563421828908, + "grad_norm": 2.193084239959717, + "learning_rate": 4.952254367234291e-05, + "loss": 5.5187, + "step": 10478 + }, + { + "epoch": 0.062321581501570085, + "grad_norm": 2.245664119720459, + "learning_rate": 4.952245281558059e-05, + "loss": 5.1275, + "step": 10479 + }, + { + "epoch": 0.06232752878485108, + "grad_norm": 2.0522654056549072, + "learning_rate": 4.9522361950257734e-05, + "loss": 5.2887, + "step": 10480 + }, + { + "epoch": 0.062333476068132075, + "grad_norm": 2.132280111312866, + "learning_rate": 4.952227107637437e-05, + "loss": 5.8767, + "step": 10481 + }, + { + "epoch": 0.06233942335141308, + "grad_norm": 2.155574083328247, + "learning_rate": 4.952218019393055e-05, + "loss": 5.9499, + "step": 10482 + }, + { + "epoch": 0.06234537063469407, + "grad_norm": 2.3979780673980713, + "learning_rate": 4.952208930292627e-05, + "loss": 5.7622, + "step": 10483 + }, + { + "epoch": 0.06235131791797507, + "grad_norm": 2.444812297821045, + "learning_rate": 4.9521998403361595e-05, + "loss": 5.3332, + "step": 10484 + }, + { + "epoch": 0.06235726520125607, + "grad_norm": 2.369248867034912, + "learning_rate": 4.952190749523654e-05, + "loss": 5.109, + "step": 10485 + }, + { + "epoch": 0.062363212484537064, + "grad_norm": 1.9160844087600708, + "learning_rate": 4.952181657855114e-05, + "loss": 5.1783, + "step": 10486 + }, + { + "epoch": 0.06236915976781806, + "grad_norm": 2.1532788276672363, + "learning_rate": 4.952172565330543e-05, + "loss": 5.913, + "step": 10487 + }, + { + "epoch": 0.062375107051099055, + "grad_norm": 2.132382392883301, + "learning_rate": 4.9521634719499435e-05, + "loss": 5.7748, + "step": 10488 + }, + { + "epoch": 0.06238105433438006, + "grad_norm": 2.22267484664917, + "learning_rate": 4.9521543777133194e-05, + "loss": 5.6464, + "step": 10489 + }, + { + "epoch": 0.06238700161766105, + "grad_norm": 2.0619423389434814, + "learning_rate": 4.952145282620674e-05, + "loss": 5.4881, + "step": 10490 + }, + { + "epoch": 0.06239294890094205, + "grad_norm": 2.9574310779571533, + "learning_rate": 4.952136186672009e-05, + "loss": 5.4401, + "step": 10491 + }, + { + "epoch": 0.06239889618422305, + "grad_norm": 1.7362775802612305, + "learning_rate": 4.952127089867329e-05, + "loss": 6.0755, + "step": 10492 + }, + { + "epoch": 0.062404843467504044, + "grad_norm": 1.8244996070861816, + "learning_rate": 4.952117992206637e-05, + "loss": 6.2588, + "step": 10493 + }, + { + "epoch": 0.06241079075078504, + "grad_norm": 1.8556538820266724, + "learning_rate": 4.952108893689936e-05, + "loss": 6.0827, + "step": 10494 + }, + { + "epoch": 0.06241673803406604, + "grad_norm": 2.2471442222595215, + "learning_rate": 4.9520997943172285e-05, + "loss": 5.98, + "step": 10495 + }, + { + "epoch": 0.062422685317347036, + "grad_norm": 3.0217249393463135, + "learning_rate": 4.9520906940885186e-05, + "loss": 5.5116, + "step": 10496 + }, + { + "epoch": 0.06242863260062803, + "grad_norm": 2.02962064743042, + "learning_rate": 4.9520815930038086e-05, + "loss": 5.9341, + "step": 10497 + }, + { + "epoch": 0.06243457988390903, + "grad_norm": 1.6286019086837769, + "learning_rate": 4.9520724910631034e-05, + "loss": 5.1944, + "step": 10498 + }, + { + "epoch": 0.06244052716719003, + "grad_norm": 1.9963330030441284, + "learning_rate": 4.9520633882664044e-05, + "loss": 6.0584, + "step": 10499 + }, + { + "epoch": 0.06244647445047102, + "grad_norm": 1.884988784790039, + "learning_rate": 4.9520542846137155e-05, + "loss": 6.2744, + "step": 10500 + }, + { + "epoch": 0.062452421733752025, + "grad_norm": 1.9402821063995361, + "learning_rate": 4.95204518010504e-05, + "loss": 5.9201, + "step": 10501 + }, + { + "epoch": 0.06245836901703302, + "grad_norm": 1.9304310083389282, + "learning_rate": 4.9520360747403805e-05, + "loss": 5.7227, + "step": 10502 + }, + { + "epoch": 0.062464316300314016, + "grad_norm": 2.8199663162231445, + "learning_rate": 4.9520269685197405e-05, + "loss": 6.4819, + "step": 10503 + }, + { + "epoch": 0.06247026358359501, + "grad_norm": 1.456852912902832, + "learning_rate": 4.9520178614431236e-05, + "loss": 5.3169, + "step": 10504 + }, + { + "epoch": 0.06247621086687601, + "grad_norm": 2.3753762245178223, + "learning_rate": 4.9520087535105324e-05, + "loss": 5.9817, + "step": 10505 + }, + { + "epoch": 0.06248215815015701, + "grad_norm": 2.329932928085327, + "learning_rate": 4.951999644721971e-05, + "loss": 6.0266, + "step": 10506 + }, + { + "epoch": 0.062488105433438, + "grad_norm": 1.772615671157837, + "learning_rate": 4.951990535077441e-05, + "loss": 5.2548, + "step": 10507 + }, + { + "epoch": 0.062494052716719005, + "grad_norm": 2.1240997314453125, + "learning_rate": 4.951981424576946e-05, + "loss": 5.3991, + "step": 10508 + }, + { + "epoch": 0.0625, + "grad_norm": 1.7283856868743896, + "learning_rate": 4.9519723132204905e-05, + "loss": 5.2065, + "step": 10509 + }, + { + "epoch": 0.062505947283281, + "grad_norm": 2.197404384613037, + "learning_rate": 4.951963201008076e-05, + "loss": 5.7282, + "step": 10510 + }, + { + "epoch": 0.06251189456656199, + "grad_norm": 1.8550727367401123, + "learning_rate": 4.9519540879397075e-05, + "loss": 6.0125, + "step": 10511 + }, + { + "epoch": 0.06251784184984299, + "grad_norm": 1.5998154878616333, + "learning_rate": 4.951944974015387e-05, + "loss": 5.9371, + "step": 10512 + }, + { + "epoch": 0.062523789133124, + "grad_norm": 1.644454836845398, + "learning_rate": 4.951935859235117e-05, + "loss": 5.9315, + "step": 10513 + }, + { + "epoch": 0.06252973641640498, + "grad_norm": 1.9119540452957153, + "learning_rate": 4.951926743598902e-05, + "loss": 5.7104, + "step": 10514 + }, + { + "epoch": 0.06253568369968598, + "grad_norm": 1.8863649368286133, + "learning_rate": 4.951917627106745e-05, + "loss": 5.8639, + "step": 10515 + }, + { + "epoch": 0.06254163098296699, + "grad_norm": 2.1626899242401123, + "learning_rate": 4.951908509758648e-05, + "loss": 5.9727, + "step": 10516 + }, + { + "epoch": 0.06254757826624797, + "grad_norm": 1.9397778511047363, + "learning_rate": 4.9518993915546155e-05, + "loss": 5.9771, + "step": 10517 + }, + { + "epoch": 0.06255352554952898, + "grad_norm": 1.7723463773727417, + "learning_rate": 4.951890272494651e-05, + "loss": 5.8684, + "step": 10518 + }, + { + "epoch": 0.06255947283280998, + "grad_norm": 1.9191977977752686, + "learning_rate": 4.9518811525787565e-05, + "loss": 5.7242, + "step": 10519 + }, + { + "epoch": 0.06256542011609097, + "grad_norm": 1.7599314451217651, + "learning_rate": 4.951872031806935e-05, + "loss": 5.5234, + "step": 10520 + }, + { + "epoch": 0.06257136739937197, + "grad_norm": 1.6560989618301392, + "learning_rate": 4.951862910179191e-05, + "loss": 5.5907, + "step": 10521 + }, + { + "epoch": 0.06257731468265297, + "grad_norm": 1.9756556749343872, + "learning_rate": 4.9518537876955265e-05, + "loss": 6.0013, + "step": 10522 + }, + { + "epoch": 0.06258326196593396, + "grad_norm": 1.9012173414230347, + "learning_rate": 4.9518446643559454e-05, + "loss": 5.8073, + "step": 10523 + }, + { + "epoch": 0.06258920924921496, + "grad_norm": 1.8992196321487427, + "learning_rate": 4.951835540160451e-05, + "loss": 5.8571, + "step": 10524 + }, + { + "epoch": 0.06259515653249595, + "grad_norm": 1.8002395629882812, + "learning_rate": 4.9518264151090455e-05, + "loss": 5.7798, + "step": 10525 + }, + { + "epoch": 0.06260110381577695, + "grad_norm": 1.732063889503479, + "learning_rate": 4.9518172892017335e-05, + "loss": 5.8167, + "step": 10526 + }, + { + "epoch": 0.06260705109905795, + "grad_norm": 1.6961164474487305, + "learning_rate": 4.951808162438517e-05, + "loss": 5.8797, + "step": 10527 + }, + { + "epoch": 0.06261299838233894, + "grad_norm": 1.904102087020874, + "learning_rate": 4.9517990348193996e-05, + "loss": 5.7109, + "step": 10528 + }, + { + "epoch": 0.06261894566561994, + "grad_norm": 1.6908652782440186, + "learning_rate": 4.951789906344384e-05, + "loss": 5.8435, + "step": 10529 + }, + { + "epoch": 0.06262489294890095, + "grad_norm": 1.8550028800964355, + "learning_rate": 4.951780777013475e-05, + "loss": 5.6218, + "step": 10530 + }, + { + "epoch": 0.06263084023218193, + "grad_norm": 1.7106919288635254, + "learning_rate": 4.951771646826674e-05, + "loss": 5.6668, + "step": 10531 + }, + { + "epoch": 0.06263678751546294, + "grad_norm": 1.5522899627685547, + "learning_rate": 4.951762515783984e-05, + "loss": 5.418, + "step": 10532 + }, + { + "epoch": 0.06264273479874394, + "grad_norm": 1.7510137557983398, + "learning_rate": 4.9517533838854104e-05, + "loss": 5.6595, + "step": 10533 + }, + { + "epoch": 0.06264868208202493, + "grad_norm": 2.1222739219665527, + "learning_rate": 4.9517442511309544e-05, + "loss": 6.0008, + "step": 10534 + }, + { + "epoch": 0.06265462936530593, + "grad_norm": 1.977807641029358, + "learning_rate": 4.95173511752062e-05, + "loss": 5.8263, + "step": 10535 + }, + { + "epoch": 0.06266057664858693, + "grad_norm": 1.6423957347869873, + "learning_rate": 4.9517259830544105e-05, + "loss": 6.2078, + "step": 10536 + }, + { + "epoch": 0.06266652393186792, + "grad_norm": 1.9365674257278442, + "learning_rate": 4.9517168477323286e-05, + "loss": 6.0972, + "step": 10537 + }, + { + "epoch": 0.06267247121514892, + "grad_norm": 1.6738137006759644, + "learning_rate": 4.951707711554377e-05, + "loss": 5.7439, + "step": 10538 + }, + { + "epoch": 0.06267841849842992, + "grad_norm": 2.4281718730926514, + "learning_rate": 4.95169857452056e-05, + "loss": 5.4822, + "step": 10539 + }, + { + "epoch": 0.06268436578171091, + "grad_norm": 2.53411602973938, + "learning_rate": 4.951689436630881e-05, + "loss": 5.4883, + "step": 10540 + }, + { + "epoch": 0.06269031306499191, + "grad_norm": 2.116520643234253, + "learning_rate": 4.951680297885342e-05, + "loss": 5.6123, + "step": 10541 + }, + { + "epoch": 0.06269626034827291, + "grad_norm": 1.8546512126922607, + "learning_rate": 4.951671158283946e-05, + "loss": 5.443, + "step": 10542 + }, + { + "epoch": 0.0627022076315539, + "grad_norm": 2.0048365592956543, + "learning_rate": 4.9516620178266975e-05, + "loss": 5.7759, + "step": 10543 + }, + { + "epoch": 0.0627081549148349, + "grad_norm": 1.6800916194915771, + "learning_rate": 4.9516528765136e-05, + "loss": 5.6767, + "step": 10544 + }, + { + "epoch": 0.0627141021981159, + "grad_norm": 1.7444523572921753, + "learning_rate": 4.9516437343446544e-05, + "loss": 5.297, + "step": 10545 + }, + { + "epoch": 0.0627200494813969, + "grad_norm": 1.8653407096862793, + "learning_rate": 4.951634591319866e-05, + "loss": 5.6999, + "step": 10546 + }, + { + "epoch": 0.0627259967646779, + "grad_norm": 1.7988131046295166, + "learning_rate": 4.9516254474392376e-05, + "loss": 5.5244, + "step": 10547 + }, + { + "epoch": 0.0627319440479589, + "grad_norm": 1.7915012836456299, + "learning_rate": 4.951616302702772e-05, + "loss": 5.6766, + "step": 10548 + }, + { + "epoch": 0.06273789133123989, + "grad_norm": 1.8351629972457886, + "learning_rate": 4.951607157110471e-05, + "loss": 5.6332, + "step": 10549 + }, + { + "epoch": 0.06274383861452089, + "grad_norm": 1.6819947957992554, + "learning_rate": 4.951598010662341e-05, + "loss": 5.5773, + "step": 10550 + }, + { + "epoch": 0.06274978589780189, + "grad_norm": 2.2969119548797607, + "learning_rate": 4.951588863358383e-05, + "loss": 5.6847, + "step": 10551 + }, + { + "epoch": 0.06275573318108288, + "grad_norm": 2.346092939376831, + "learning_rate": 4.951579715198601e-05, + "loss": 5.404, + "step": 10552 + }, + { + "epoch": 0.06276168046436388, + "grad_norm": 1.8255709409713745, + "learning_rate": 4.951570566182997e-05, + "loss": 5.9009, + "step": 10553 + }, + { + "epoch": 0.06276762774764487, + "grad_norm": 2.4000492095947266, + "learning_rate": 4.951561416311575e-05, + "loss": 5.4395, + "step": 10554 + }, + { + "epoch": 0.06277357503092587, + "grad_norm": 2.1519010066986084, + "learning_rate": 4.951552265584339e-05, + "loss": 5.6447, + "step": 10555 + }, + { + "epoch": 0.06277952231420687, + "grad_norm": 1.7821810245513916, + "learning_rate": 4.9515431140012915e-05, + "loss": 5.3495, + "step": 10556 + }, + { + "epoch": 0.06278546959748786, + "grad_norm": 1.8359061479568481, + "learning_rate": 4.9515339615624356e-05, + "loss": 5.7258, + "step": 10557 + }, + { + "epoch": 0.06279141688076886, + "grad_norm": 1.899970293045044, + "learning_rate": 4.951524808267774e-05, + "loss": 5.9683, + "step": 10558 + }, + { + "epoch": 0.06279736416404987, + "grad_norm": 1.6407743692398071, + "learning_rate": 4.951515654117311e-05, + "loss": 6.001, + "step": 10559 + }, + { + "epoch": 0.06280331144733085, + "grad_norm": 1.5474567413330078, + "learning_rate": 4.9515064991110485e-05, + "loss": 5.673, + "step": 10560 + }, + { + "epoch": 0.06280925873061186, + "grad_norm": 1.7129321098327637, + "learning_rate": 4.951497343248991e-05, + "loss": 5.7232, + "step": 10561 + }, + { + "epoch": 0.06281520601389286, + "grad_norm": 1.948367953300476, + "learning_rate": 4.95148818653114e-05, + "loss": 5.9378, + "step": 10562 + }, + { + "epoch": 0.06282115329717385, + "grad_norm": 1.788724422454834, + "learning_rate": 4.951479028957501e-05, + "loss": 5.9077, + "step": 10563 + }, + { + "epoch": 0.06282710058045485, + "grad_norm": 1.7036423683166504, + "learning_rate": 4.951469870528076e-05, + "loss": 5.7688, + "step": 10564 + }, + { + "epoch": 0.06283304786373585, + "grad_norm": 1.6055458784103394, + "learning_rate": 4.9514607112428676e-05, + "loss": 5.7234, + "step": 10565 + }, + { + "epoch": 0.06283899514701684, + "grad_norm": 1.9353829622268677, + "learning_rate": 4.95145155110188e-05, + "loss": 6.1046, + "step": 10566 + }, + { + "epoch": 0.06284494243029784, + "grad_norm": 1.6070129871368408, + "learning_rate": 4.9514423901051157e-05, + "loss": 5.7379, + "step": 10567 + }, + { + "epoch": 0.06285088971357884, + "grad_norm": 1.447828769683838, + "learning_rate": 4.951433228252579e-05, + "loss": 5.2944, + "step": 10568 + }, + { + "epoch": 0.06285683699685983, + "grad_norm": 2.5256540775299072, + "learning_rate": 4.951424065544271e-05, + "loss": 5.1358, + "step": 10569 + }, + { + "epoch": 0.06286278428014083, + "grad_norm": 2.29848051071167, + "learning_rate": 4.951414901980197e-05, + "loss": 5.1967, + "step": 10570 + }, + { + "epoch": 0.06286873156342183, + "grad_norm": 1.9477180242538452, + "learning_rate": 4.951405737560359e-05, + "loss": 5.7509, + "step": 10571 + }, + { + "epoch": 0.06287467884670282, + "grad_norm": 1.9303146600723267, + "learning_rate": 4.951396572284761e-05, + "loss": 5.7052, + "step": 10572 + }, + { + "epoch": 0.06288062612998382, + "grad_norm": 1.5632199048995972, + "learning_rate": 4.951387406153405e-05, + "loss": 5.5001, + "step": 10573 + }, + { + "epoch": 0.06288657341326483, + "grad_norm": 1.6798962354660034, + "learning_rate": 4.951378239166296e-05, + "loss": 5.5537, + "step": 10574 + }, + { + "epoch": 0.06289252069654581, + "grad_norm": 1.7395051717758179, + "learning_rate": 4.9513690713234355e-05, + "loss": 5.736, + "step": 10575 + }, + { + "epoch": 0.06289846797982682, + "grad_norm": 1.726020097732544, + "learning_rate": 4.951359902624828e-05, + "loss": 5.6802, + "step": 10576 + }, + { + "epoch": 0.06290441526310782, + "grad_norm": 1.8063993453979492, + "learning_rate": 4.9513507330704755e-05, + "loss": 5.6077, + "step": 10577 + }, + { + "epoch": 0.0629103625463888, + "grad_norm": 1.6284246444702148, + "learning_rate": 4.951341562660382e-05, + "loss": 5.8327, + "step": 10578 + }, + { + "epoch": 0.06291630982966981, + "grad_norm": 2.635869026184082, + "learning_rate": 4.95133239139455e-05, + "loss": 5.8252, + "step": 10579 + }, + { + "epoch": 0.06292225711295081, + "grad_norm": 2.5127367973327637, + "learning_rate": 4.9513232192729845e-05, + "loss": 5.7431, + "step": 10580 + }, + { + "epoch": 0.0629282043962318, + "grad_norm": 2.0740721225738525, + "learning_rate": 4.951314046295686e-05, + "loss": 5.4582, + "step": 10581 + }, + { + "epoch": 0.0629341516795128, + "grad_norm": 2.32232666015625, + "learning_rate": 4.95130487246266e-05, + "loss": 5.2523, + "step": 10582 + }, + { + "epoch": 0.06294009896279379, + "grad_norm": 2.164407730102539, + "learning_rate": 4.951295697773908e-05, + "loss": 5.6436, + "step": 10583 + }, + { + "epoch": 0.06294604624607479, + "grad_norm": 1.7207856178283691, + "learning_rate": 4.951286522229435e-05, + "loss": 5.5333, + "step": 10584 + }, + { + "epoch": 0.0629519935293558, + "grad_norm": 2.025470733642578, + "learning_rate": 4.951277345829242e-05, + "loss": 5.5041, + "step": 10585 + }, + { + "epoch": 0.06295794081263678, + "grad_norm": 1.9415414333343506, + "learning_rate": 4.951268168573334e-05, + "loss": 5.2148, + "step": 10586 + }, + { + "epoch": 0.06296388809591778, + "grad_norm": 1.9229072332382202, + "learning_rate": 4.9512589904617135e-05, + "loss": 5.1461, + "step": 10587 + }, + { + "epoch": 0.06296983537919879, + "grad_norm": 2.414041757583618, + "learning_rate": 4.951249811494384e-05, + "loss": 5.5023, + "step": 10588 + }, + { + "epoch": 0.06297578266247977, + "grad_norm": 2.49826979637146, + "learning_rate": 4.9512406316713486e-05, + "loss": 5.3566, + "step": 10589 + }, + { + "epoch": 0.06298172994576078, + "grad_norm": 1.7222081422805786, + "learning_rate": 4.951231450992611e-05, + "loss": 5.3128, + "step": 10590 + }, + { + "epoch": 0.06298767722904178, + "grad_norm": 1.7181445360183716, + "learning_rate": 4.9512222694581725e-05, + "loss": 5.4598, + "step": 10591 + }, + { + "epoch": 0.06299362451232277, + "grad_norm": 1.547813892364502, + "learning_rate": 4.9512130870680385e-05, + "loss": 5.3997, + "step": 10592 + }, + { + "epoch": 0.06299957179560377, + "grad_norm": 1.6273536682128906, + "learning_rate": 4.95120390382221e-05, + "loss": 5.1668, + "step": 10593 + }, + { + "epoch": 0.06300551907888477, + "grad_norm": 1.6771745681762695, + "learning_rate": 4.9511947197206934e-05, + "loss": 5.2368, + "step": 10594 + }, + { + "epoch": 0.06301146636216576, + "grad_norm": 2.439664125442505, + "learning_rate": 4.951185534763489e-05, + "loss": 5.2178, + "step": 10595 + }, + { + "epoch": 0.06301741364544676, + "grad_norm": 2.194408655166626, + "learning_rate": 4.951176348950601e-05, + "loss": 5.3593, + "step": 10596 + }, + { + "epoch": 0.06302336092872776, + "grad_norm": 1.8977370262145996, + "learning_rate": 4.9511671622820334e-05, + "loss": 6.3141, + "step": 10597 + }, + { + "epoch": 0.06302930821200875, + "grad_norm": 1.9550800323486328, + "learning_rate": 4.951157974757789e-05, + "loss": 5.8944, + "step": 10598 + }, + { + "epoch": 0.06303525549528975, + "grad_norm": 1.764724612236023, + "learning_rate": 4.9511487863778693e-05, + "loss": 5.5796, + "step": 10599 + }, + { + "epoch": 0.06304120277857075, + "grad_norm": 1.7987425327301025, + "learning_rate": 4.951139597142279e-05, + "loss": 5.5231, + "step": 10600 + }, + { + "epoch": 0.06304715006185174, + "grad_norm": 1.495875358581543, + "learning_rate": 4.951130407051022e-05, + "loss": 5.5019, + "step": 10601 + }, + { + "epoch": 0.06305309734513274, + "grad_norm": 2.7586476802825928, + "learning_rate": 4.9511212161041e-05, + "loss": 5.7043, + "step": 10602 + }, + { + "epoch": 0.06305904462841375, + "grad_norm": 2.1746270656585693, + "learning_rate": 4.951112024301517e-05, + "loss": 5.351, + "step": 10603 + }, + { + "epoch": 0.06306499191169473, + "grad_norm": 1.8681105375289917, + "learning_rate": 4.951102831643277e-05, + "loss": 5.4847, + "step": 10604 + }, + { + "epoch": 0.06307093919497574, + "grad_norm": 1.772286057472229, + "learning_rate": 4.951093638129382e-05, + "loss": 5.767, + "step": 10605 + }, + { + "epoch": 0.06307688647825674, + "grad_norm": 1.847748875617981, + "learning_rate": 4.951084443759835e-05, + "loss": 5.7737, + "step": 10606 + }, + { + "epoch": 0.06308283376153773, + "grad_norm": 1.9219080209732056, + "learning_rate": 4.95107524853464e-05, + "loss": 5.9414, + "step": 10607 + }, + { + "epoch": 0.06308878104481873, + "grad_norm": 1.6497199535369873, + "learning_rate": 4.9510660524538e-05, + "loss": 5.7124, + "step": 10608 + }, + { + "epoch": 0.06309472832809973, + "grad_norm": 1.8772788047790527, + "learning_rate": 4.951056855517318e-05, + "loss": 5.6784, + "step": 10609 + }, + { + "epoch": 0.06310067561138072, + "grad_norm": 2.035104990005493, + "learning_rate": 4.951047657725197e-05, + "loss": 5.5975, + "step": 10610 + }, + { + "epoch": 0.06310662289466172, + "grad_norm": 2.000922918319702, + "learning_rate": 4.9510384590774414e-05, + "loss": 5.2133, + "step": 10611 + }, + { + "epoch": 0.06311257017794271, + "grad_norm": 2.2581655979156494, + "learning_rate": 4.9510292595740536e-05, + "loss": 5.468, + "step": 10612 + }, + { + "epoch": 0.06311851746122371, + "grad_norm": 2.0332419872283936, + "learning_rate": 4.9510200592150365e-05, + "loss": 5.4923, + "step": 10613 + }, + { + "epoch": 0.06312446474450471, + "grad_norm": 1.9499238729476929, + "learning_rate": 4.9510108580003934e-05, + "loss": 5.5535, + "step": 10614 + }, + { + "epoch": 0.0631304120277857, + "grad_norm": 2.017491579055786, + "learning_rate": 4.951001655930128e-05, + "loss": 5.3771, + "step": 10615 + }, + { + "epoch": 0.0631363593110667, + "grad_norm": 2.355508804321289, + "learning_rate": 4.950992453004243e-05, + "loss": 5.0035, + "step": 10616 + }, + { + "epoch": 0.0631423065943477, + "grad_norm": 2.0470683574676514, + "learning_rate": 4.9509832492227426e-05, + "loss": 5.6073, + "step": 10617 + }, + { + "epoch": 0.0631482538776287, + "grad_norm": 1.7955858707427979, + "learning_rate": 4.9509740445856284e-05, + "loss": 5.8097, + "step": 10618 + }, + { + "epoch": 0.0631542011609097, + "grad_norm": 2.0126395225524902, + "learning_rate": 4.9509648390929045e-05, + "loss": 5.5989, + "step": 10619 + }, + { + "epoch": 0.0631601484441907, + "grad_norm": 1.8632375001907349, + "learning_rate": 4.950955632744575e-05, + "loss": 5.5585, + "step": 10620 + }, + { + "epoch": 0.06316609572747169, + "grad_norm": 2.2190446853637695, + "learning_rate": 4.950946425540641e-05, + "loss": 5.5182, + "step": 10621 + }, + { + "epoch": 0.06317204301075269, + "grad_norm": 2.082871675491333, + "learning_rate": 4.9509372174811074e-05, + "loss": 5.7849, + "step": 10622 + }, + { + "epoch": 0.06317799029403369, + "grad_norm": 2.17744517326355, + "learning_rate": 4.9509280085659774e-05, + "loss": 5.2332, + "step": 10623 + }, + { + "epoch": 0.06318393757731468, + "grad_norm": 1.7662746906280518, + "learning_rate": 4.950918798795253e-05, + "loss": 5.4136, + "step": 10624 + }, + { + "epoch": 0.06318988486059568, + "grad_norm": 1.6879531145095825, + "learning_rate": 4.950909588168939e-05, + "loss": 5.3747, + "step": 10625 + }, + { + "epoch": 0.06319583214387668, + "grad_norm": 2.0174877643585205, + "learning_rate": 4.950900376687038e-05, + "loss": 5.2927, + "step": 10626 + }, + { + "epoch": 0.06320177942715767, + "grad_norm": 1.9052749872207642, + "learning_rate": 4.950891164349552e-05, + "loss": 5.1492, + "step": 10627 + }, + { + "epoch": 0.06320772671043867, + "grad_norm": 1.7647850513458252, + "learning_rate": 4.950881951156485e-05, + "loss": 5.4182, + "step": 10628 + }, + { + "epoch": 0.06321367399371967, + "grad_norm": 1.9794502258300781, + "learning_rate": 4.950872737107841e-05, + "loss": 5.3838, + "step": 10629 + }, + { + "epoch": 0.06321962127700066, + "grad_norm": 2.3403780460357666, + "learning_rate": 4.950863522203623e-05, + "loss": 5.4542, + "step": 10630 + }, + { + "epoch": 0.06322556856028166, + "grad_norm": 1.8747358322143555, + "learning_rate": 4.9508543064438336e-05, + "loss": 5.4949, + "step": 10631 + }, + { + "epoch": 0.06323151584356267, + "grad_norm": 1.9435046911239624, + "learning_rate": 4.950845089828476e-05, + "loss": 5.6136, + "step": 10632 + }, + { + "epoch": 0.06323746312684365, + "grad_norm": 2.095583438873291, + "learning_rate": 4.9508358723575544e-05, + "loss": 5.2864, + "step": 10633 + }, + { + "epoch": 0.06324341041012466, + "grad_norm": 1.8254145383834839, + "learning_rate": 4.9508266540310705e-05, + "loss": 5.4732, + "step": 10634 + }, + { + "epoch": 0.06324935769340566, + "grad_norm": 2.303638458251953, + "learning_rate": 4.950817434849029e-05, + "loss": 5.1501, + "step": 10635 + }, + { + "epoch": 0.06325530497668665, + "grad_norm": 2.5389420986175537, + "learning_rate": 4.950808214811432e-05, + "loss": 5.0723, + "step": 10636 + }, + { + "epoch": 0.06326125225996765, + "grad_norm": 2.1702539920806885, + "learning_rate": 4.950798993918283e-05, + "loss": 4.8838, + "step": 10637 + }, + { + "epoch": 0.06326719954324865, + "grad_norm": 1.921650767326355, + "learning_rate": 4.9507897721695855e-05, + "loss": 5.9958, + "step": 10638 + }, + { + "epoch": 0.06327314682652964, + "grad_norm": 2.2247352600097656, + "learning_rate": 4.950780549565343e-05, + "loss": 4.9319, + "step": 10639 + }, + { + "epoch": 0.06327909410981064, + "grad_norm": 2.3517649173736572, + "learning_rate": 4.950771326105558e-05, + "loss": 4.6033, + "step": 10640 + }, + { + "epoch": 0.06328504139309163, + "grad_norm": 2.053856134414673, + "learning_rate": 4.950762101790234e-05, + "loss": 4.3799, + "step": 10641 + }, + { + "epoch": 0.06329098867637263, + "grad_norm": 1.8055500984191895, + "learning_rate": 4.9507528766193746e-05, + "loss": 5.244, + "step": 10642 + }, + { + "epoch": 0.06329693595965363, + "grad_norm": 2.0694682598114014, + "learning_rate": 4.950743650592983e-05, + "loss": 5.1965, + "step": 10643 + }, + { + "epoch": 0.06330288324293462, + "grad_norm": 2.027399778366089, + "learning_rate": 4.950734423711061e-05, + "loss": 4.5576, + "step": 10644 + }, + { + "epoch": 0.06330883052621562, + "grad_norm": 2.22308087348938, + "learning_rate": 4.950725195973614e-05, + "loss": 4.4679, + "step": 10645 + }, + { + "epoch": 0.06331477780949663, + "grad_norm": 2.1807515621185303, + "learning_rate": 4.9507159673806436e-05, + "loss": 4.6147, + "step": 10646 + }, + { + "epoch": 0.06332072509277761, + "grad_norm": 2.0173258781433105, + "learning_rate": 4.9507067379321536e-05, + "loss": 4.5657, + "step": 10647 + }, + { + "epoch": 0.06332667237605862, + "grad_norm": 1.832610845565796, + "learning_rate": 4.9506975076281474e-05, + "loss": 4.7433, + "step": 10648 + }, + { + "epoch": 0.06333261965933962, + "grad_norm": 2.027352809906006, + "learning_rate": 4.950688276468628e-05, + "loss": 5.0426, + "step": 10649 + }, + { + "epoch": 0.0633385669426206, + "grad_norm": 1.856307864189148, + "learning_rate": 4.950679044453599e-05, + "loss": 5.2838, + "step": 10650 + }, + { + "epoch": 0.06334451422590161, + "grad_norm": 2.0875375270843506, + "learning_rate": 4.950669811583062e-05, + "loss": 4.5728, + "step": 10651 + }, + { + "epoch": 0.06335046150918261, + "grad_norm": 2.1067941188812256, + "learning_rate": 4.950660577857023e-05, + "loss": 4.5313, + "step": 10652 + }, + { + "epoch": 0.0633564087924636, + "grad_norm": 2.1747500896453857, + "learning_rate": 4.9506513432754825e-05, + "loss": 4.432, + "step": 10653 + }, + { + "epoch": 0.0633623560757446, + "grad_norm": 1.769059181213379, + "learning_rate": 4.950642107838446e-05, + "loss": 5.4667, + "step": 10654 + }, + { + "epoch": 0.0633683033590256, + "grad_norm": 2.2065072059631348, + "learning_rate": 4.9506328715459146e-05, + "loss": 5.9873, + "step": 10655 + }, + { + "epoch": 0.06337425064230659, + "grad_norm": 1.679431438446045, + "learning_rate": 4.950623634397893e-05, + "loss": 5.851, + "step": 10656 + }, + { + "epoch": 0.06338019792558759, + "grad_norm": 1.919668197631836, + "learning_rate": 4.950614396394384e-05, + "loss": 5.8613, + "step": 10657 + }, + { + "epoch": 0.0633861452088686, + "grad_norm": 1.5296612977981567, + "learning_rate": 4.9506051575353915e-05, + "loss": 5.7067, + "step": 10658 + }, + { + "epoch": 0.06339209249214958, + "grad_norm": 2.1283507347106934, + "learning_rate": 4.950595917820917e-05, + "loss": 5.1141, + "step": 10659 + }, + { + "epoch": 0.06339803977543058, + "grad_norm": 1.7011604309082031, + "learning_rate": 4.950586677250966e-05, + "loss": 6.0463, + "step": 10660 + }, + { + "epoch": 0.06340398705871159, + "grad_norm": 1.7479497194290161, + "learning_rate": 4.9505774358255396e-05, + "loss": 5.8942, + "step": 10661 + }, + { + "epoch": 0.06340993434199257, + "grad_norm": 1.939471960067749, + "learning_rate": 4.950568193544642e-05, + "loss": 5.562, + "step": 10662 + }, + { + "epoch": 0.06341588162527358, + "grad_norm": 1.871993899345398, + "learning_rate": 4.9505589504082764e-05, + "loss": 5.746, + "step": 10663 + }, + { + "epoch": 0.06342182890855458, + "grad_norm": 2.173109292984009, + "learning_rate": 4.950549706416446e-05, + "loss": 5.5927, + "step": 10664 + }, + { + "epoch": 0.06342777619183557, + "grad_norm": 1.809971809387207, + "learning_rate": 4.950540461569154e-05, + "loss": 5.8983, + "step": 10665 + }, + { + "epoch": 0.06343372347511657, + "grad_norm": 1.6344120502471924, + "learning_rate": 4.950531215866404e-05, + "loss": 5.5301, + "step": 10666 + }, + { + "epoch": 0.06343967075839757, + "grad_norm": 2.080425500869751, + "learning_rate": 4.9505219693081985e-05, + "loss": 6.0214, + "step": 10667 + }, + { + "epoch": 0.06344561804167856, + "grad_norm": 1.9382790327072144, + "learning_rate": 4.9505127218945415e-05, + "loss": 5.676, + "step": 10668 + }, + { + "epoch": 0.06345156532495956, + "grad_norm": 1.6945782899856567, + "learning_rate": 4.9505034736254354e-05, + "loss": 5.9337, + "step": 10669 + }, + { + "epoch": 0.06345751260824055, + "grad_norm": 1.6129313707351685, + "learning_rate": 4.9504942245008836e-05, + "loss": 5.6561, + "step": 10670 + }, + { + "epoch": 0.06346345989152155, + "grad_norm": 2.002903461456299, + "learning_rate": 4.95048497452089e-05, + "loss": 5.6302, + "step": 10671 + }, + { + "epoch": 0.06346940717480255, + "grad_norm": 1.6016403436660767, + "learning_rate": 4.950475723685457e-05, + "loss": 5.8275, + "step": 10672 + }, + { + "epoch": 0.06347535445808354, + "grad_norm": 1.7645297050476074, + "learning_rate": 4.9504664719945895e-05, + "loss": 5.5541, + "step": 10673 + }, + { + "epoch": 0.06348130174136454, + "grad_norm": 1.9627439975738525, + "learning_rate": 4.950457219448288e-05, + "loss": 5.6425, + "step": 10674 + }, + { + "epoch": 0.06348724902464555, + "grad_norm": 1.6297314167022705, + "learning_rate": 4.950447966046558e-05, + "loss": 5.5735, + "step": 10675 + }, + { + "epoch": 0.06349319630792653, + "grad_norm": 1.7911304235458374, + "learning_rate": 4.9504387117894014e-05, + "loss": 5.7736, + "step": 10676 + }, + { + "epoch": 0.06349914359120754, + "grad_norm": 1.627543330192566, + "learning_rate": 4.950429456676823e-05, + "loss": 5.736, + "step": 10677 + }, + { + "epoch": 0.06350509087448854, + "grad_norm": 1.9574320316314697, + "learning_rate": 4.950420200708824e-05, + "loss": 5.365, + "step": 10678 + }, + { + "epoch": 0.06351103815776953, + "grad_norm": 1.7698450088500977, + "learning_rate": 4.950410943885408e-05, + "loss": 5.5742, + "step": 10679 + }, + { + "epoch": 0.06351698544105053, + "grad_norm": 1.7660366296768188, + "learning_rate": 4.9504016862065806e-05, + "loss": 5.9064, + "step": 10680 + }, + { + "epoch": 0.06352293272433153, + "grad_norm": 2.0279083251953125, + "learning_rate": 4.9503924276723425e-05, + "loss": 5.7938, + "step": 10681 + }, + { + "epoch": 0.06352888000761252, + "grad_norm": 2.101827621459961, + "learning_rate": 4.9503831682826974e-05, + "loss": 5.4898, + "step": 10682 + }, + { + "epoch": 0.06353482729089352, + "grad_norm": 2.04978084564209, + "learning_rate": 4.9503739080376486e-05, + "loss": 5.3753, + "step": 10683 + }, + { + "epoch": 0.06354077457417452, + "grad_norm": 1.8539999723434448, + "learning_rate": 4.950364646937201e-05, + "loss": 5.5575, + "step": 10684 + }, + { + "epoch": 0.06354672185745551, + "grad_norm": 2.077073097229004, + "learning_rate": 4.9503553849813556e-05, + "loss": 5.4628, + "step": 10685 + }, + { + "epoch": 0.06355266914073651, + "grad_norm": 1.8130167722702026, + "learning_rate": 4.950346122170116e-05, + "loss": 5.1648, + "step": 10686 + }, + { + "epoch": 0.06355861642401751, + "grad_norm": 1.810944676399231, + "learning_rate": 4.950336858503486e-05, + "loss": 5.8371, + "step": 10687 + }, + { + "epoch": 0.0635645637072985, + "grad_norm": 2.0081756114959717, + "learning_rate": 4.950327593981469e-05, + "loss": 5.6933, + "step": 10688 + }, + { + "epoch": 0.0635705109905795, + "grad_norm": 1.5824620723724365, + "learning_rate": 4.950318328604068e-05, + "loss": 5.4494, + "step": 10689 + }, + { + "epoch": 0.0635764582738605, + "grad_norm": 1.6470626592636108, + "learning_rate": 4.950309062371286e-05, + "loss": 6.2401, + "step": 10690 + }, + { + "epoch": 0.0635824055571415, + "grad_norm": 1.799074649810791, + "learning_rate": 4.950299795283127e-05, + "loss": 6.1075, + "step": 10691 + }, + { + "epoch": 0.0635883528404225, + "grad_norm": 2.0551035404205322, + "learning_rate": 4.950290527339593e-05, + "loss": 5.6646, + "step": 10692 + }, + { + "epoch": 0.0635943001237035, + "grad_norm": 2.3543875217437744, + "learning_rate": 4.9502812585406875e-05, + "loss": 4.9341, + "step": 10693 + }, + { + "epoch": 0.06360024740698449, + "grad_norm": 2.0479071140289307, + "learning_rate": 4.950271988886415e-05, + "loss": 5.3351, + "step": 10694 + }, + { + "epoch": 0.06360619469026549, + "grad_norm": 1.9331302642822266, + "learning_rate": 4.950262718376778e-05, + "loss": 5.6269, + "step": 10695 + }, + { + "epoch": 0.06361214197354649, + "grad_norm": 1.9922640323638916, + "learning_rate": 4.950253447011779e-05, + "loss": 5.5113, + "step": 10696 + }, + { + "epoch": 0.06361808925682748, + "grad_norm": 1.769916296005249, + "learning_rate": 4.950244174791422e-05, + "loss": 5.5902, + "step": 10697 + }, + { + "epoch": 0.06362403654010848, + "grad_norm": 2.8808071613311768, + "learning_rate": 4.95023490171571e-05, + "loss": 4.9506, + "step": 10698 + }, + { + "epoch": 0.06362998382338947, + "grad_norm": 2.0609331130981445, + "learning_rate": 4.9502256277846466e-05, + "loss": 5.4256, + "step": 10699 + }, + { + "epoch": 0.06363593110667047, + "grad_norm": 2.0112223625183105, + "learning_rate": 4.950216352998234e-05, + "loss": 6.1121, + "step": 10700 + }, + { + "epoch": 0.06364187838995147, + "grad_norm": 1.5665667057037354, + "learning_rate": 4.9502070773564765e-05, + "loss": 5.1959, + "step": 10701 + }, + { + "epoch": 0.06364782567323246, + "grad_norm": 1.9731864929199219, + "learning_rate": 4.9501978008593774e-05, + "loss": 5.2887, + "step": 10702 + }, + { + "epoch": 0.06365377295651346, + "grad_norm": 1.7925242185592651, + "learning_rate": 4.9501885235069404e-05, + "loss": 5.7386, + "step": 10703 + }, + { + "epoch": 0.06365972023979447, + "grad_norm": 1.6686629056930542, + "learning_rate": 4.950179245299166e-05, + "loss": 5.7279, + "step": 10704 + }, + { + "epoch": 0.06366566752307545, + "grad_norm": 2.034392833709717, + "learning_rate": 4.95016996623606e-05, + "loss": 5.6148, + "step": 10705 + }, + { + "epoch": 0.06367161480635646, + "grad_norm": 2.1711995601654053, + "learning_rate": 4.9501606863176254e-05, + "loss": 5.7088, + "step": 10706 + }, + { + "epoch": 0.06367756208963746, + "grad_norm": 2.3276829719543457, + "learning_rate": 4.950151405543865e-05, + "loss": 5.3658, + "step": 10707 + }, + { + "epoch": 0.06368350937291845, + "grad_norm": 2.174130916595459, + "learning_rate": 4.9501421239147824e-05, + "loss": 5.3459, + "step": 10708 + }, + { + "epoch": 0.06368945665619945, + "grad_norm": 1.8721747398376465, + "learning_rate": 4.9501328414303794e-05, + "loss": 5.3375, + "step": 10709 + }, + { + "epoch": 0.06369540393948045, + "grad_norm": 1.8677324056625366, + "learning_rate": 4.9501235580906615e-05, + "loss": 5.8192, + "step": 10710 + }, + { + "epoch": 0.06370135122276144, + "grad_norm": 2.0901246070861816, + "learning_rate": 4.9501142738956294e-05, + "loss": 6.1188, + "step": 10711 + }, + { + "epoch": 0.06370729850604244, + "grad_norm": 1.7860997915267944, + "learning_rate": 4.9501049888452885e-05, + "loss": 5.4011, + "step": 10712 + }, + { + "epoch": 0.06371324578932344, + "grad_norm": 2.000946283340454, + "learning_rate": 4.950095702939642e-05, + "loss": 5.16, + "step": 10713 + }, + { + "epoch": 0.06371919307260443, + "grad_norm": 2.47086501121521, + "learning_rate": 4.950086416178691e-05, + "loss": 5.1543, + "step": 10714 + }, + { + "epoch": 0.06372514035588543, + "grad_norm": 1.8694473505020142, + "learning_rate": 4.9500771285624415e-05, + "loss": 5.3576, + "step": 10715 + }, + { + "epoch": 0.06373108763916643, + "grad_norm": 1.8921676874160767, + "learning_rate": 4.9500678400908946e-05, + "loss": 5.0827, + "step": 10716 + }, + { + "epoch": 0.06373703492244742, + "grad_norm": 1.8423974514007568, + "learning_rate": 4.950058550764054e-05, + "loss": 4.9912, + "step": 10717 + }, + { + "epoch": 0.06374298220572842, + "grad_norm": 1.6893757581710815, + "learning_rate": 4.950049260581924e-05, + "loss": 5.2792, + "step": 10718 + }, + { + "epoch": 0.06374892948900943, + "grad_norm": 1.720799446105957, + "learning_rate": 4.950039969544507e-05, + "loss": 5.4355, + "step": 10719 + }, + { + "epoch": 0.06375487677229041, + "grad_norm": 1.717527151107788, + "learning_rate": 4.9500306776518065e-05, + "loss": 5.2802, + "step": 10720 + }, + { + "epoch": 0.06376082405557142, + "grad_norm": 1.876207947731018, + "learning_rate": 4.950021384903825e-05, + "loss": 5.4667, + "step": 10721 + }, + { + "epoch": 0.06376677133885242, + "grad_norm": 1.7892308235168457, + "learning_rate": 4.9500120913005666e-05, + "loss": 5.6635, + "step": 10722 + }, + { + "epoch": 0.0637727186221334, + "grad_norm": 1.828092336654663, + "learning_rate": 4.950002796842034e-05, + "loss": 5.5301, + "step": 10723 + }, + { + "epoch": 0.06377866590541441, + "grad_norm": 1.5860785245895386, + "learning_rate": 4.949993501528232e-05, + "loss": 5.2337, + "step": 10724 + }, + { + "epoch": 0.06378461318869541, + "grad_norm": 1.731295108795166, + "learning_rate": 4.949984205359161e-05, + "loss": 5.4115, + "step": 10725 + }, + { + "epoch": 0.0637905604719764, + "grad_norm": 2.194288969039917, + "learning_rate": 4.949974908334827e-05, + "loss": 5.4736, + "step": 10726 + }, + { + "epoch": 0.0637965077552574, + "grad_norm": 1.6036415100097656, + "learning_rate": 4.949965610455231e-05, + "loss": 5.4563, + "step": 10727 + }, + { + "epoch": 0.06380245503853839, + "grad_norm": 1.6228232383728027, + "learning_rate": 4.949956311720378e-05, + "loss": 5.4695, + "step": 10728 + }, + { + "epoch": 0.06380840232181939, + "grad_norm": 1.3040069341659546, + "learning_rate": 4.94994701213027e-05, + "loss": 5.0126, + "step": 10729 + }, + { + "epoch": 0.06381434960510039, + "grad_norm": 1.5976930856704712, + "learning_rate": 4.9499377116849116e-05, + "loss": 5.0165, + "step": 10730 + }, + { + "epoch": 0.06382029688838138, + "grad_norm": 1.5877797603607178, + "learning_rate": 4.9499284103843046e-05, + "loss": 5.1634, + "step": 10731 + }, + { + "epoch": 0.06382624417166238, + "grad_norm": 1.6466439962387085, + "learning_rate": 4.949919108228453e-05, + "loss": 5.3954, + "step": 10732 + }, + { + "epoch": 0.06383219145494338, + "grad_norm": 1.5188345909118652, + "learning_rate": 4.949909805217361e-05, + "loss": 5.2876, + "step": 10733 + }, + { + "epoch": 0.06383813873822437, + "grad_norm": 1.836227297782898, + "learning_rate": 4.94990050135103e-05, + "loss": 5.4966, + "step": 10734 + }, + { + "epoch": 0.06384408602150538, + "grad_norm": 1.5542840957641602, + "learning_rate": 4.9498911966294635e-05, + "loss": 5.2188, + "step": 10735 + }, + { + "epoch": 0.06385003330478638, + "grad_norm": 1.3053034543991089, + "learning_rate": 4.9498818910526656e-05, + "loss": 5.3834, + "step": 10736 + }, + { + "epoch": 0.06385598058806737, + "grad_norm": 1.4250247478485107, + "learning_rate": 4.9498725846206395e-05, + "loss": 5.1852, + "step": 10737 + }, + { + "epoch": 0.06386192787134837, + "grad_norm": 1.5885393619537354, + "learning_rate": 4.9498632773333886e-05, + "loss": 5.2518, + "step": 10738 + }, + { + "epoch": 0.06386787515462937, + "grad_norm": 1.5664896965026855, + "learning_rate": 4.949853969190915e-05, + "loss": 5.1186, + "step": 10739 + }, + { + "epoch": 0.06387382243791036, + "grad_norm": 1.5156123638153076, + "learning_rate": 4.949844660193223e-05, + "loss": 5.1111, + "step": 10740 + }, + { + "epoch": 0.06387976972119136, + "grad_norm": 1.5308325290679932, + "learning_rate": 4.949835350340316e-05, + "loss": 5.1577, + "step": 10741 + }, + { + "epoch": 0.06388571700447236, + "grad_norm": 1.3338321447372437, + "learning_rate": 4.949826039632196e-05, + "loss": 5.2386, + "step": 10742 + }, + { + "epoch": 0.06389166428775335, + "grad_norm": 1.5307821035385132, + "learning_rate": 4.9498167280688676e-05, + "loss": 5.1173, + "step": 10743 + }, + { + "epoch": 0.06389761157103435, + "grad_norm": 1.607913613319397, + "learning_rate": 4.9498074156503325e-05, + "loss": 5.3077, + "step": 10744 + }, + { + "epoch": 0.06390355885431535, + "grad_norm": 1.6242469549179077, + "learning_rate": 4.949798102376596e-05, + "loss": 5.3319, + "step": 10745 + }, + { + "epoch": 0.06390950613759634, + "grad_norm": 1.62213134765625, + "learning_rate": 4.9497887882476604e-05, + "loss": 5.3494, + "step": 10746 + }, + { + "epoch": 0.06391545342087734, + "grad_norm": 1.4064897298812866, + "learning_rate": 4.949779473263528e-05, + "loss": 5.207, + "step": 10747 + }, + { + "epoch": 0.06392140070415835, + "grad_norm": 1.7431879043579102, + "learning_rate": 4.949770157424203e-05, + "loss": 5.4068, + "step": 10748 + }, + { + "epoch": 0.06392734798743933, + "grad_norm": 1.5815304517745972, + "learning_rate": 4.949760840729689e-05, + "loss": 5.3917, + "step": 10749 + }, + { + "epoch": 0.06393329527072034, + "grad_norm": 1.576541543006897, + "learning_rate": 4.949751523179988e-05, + "loss": 5.4123, + "step": 10750 + }, + { + "epoch": 0.06393924255400134, + "grad_norm": 1.6717814207077026, + "learning_rate": 4.9497422047751054e-05, + "loss": 5.3028, + "step": 10751 + }, + { + "epoch": 0.06394518983728233, + "grad_norm": 1.4091792106628418, + "learning_rate": 4.9497328855150424e-05, + "loss": 5.2231, + "step": 10752 + }, + { + "epoch": 0.06395113712056333, + "grad_norm": 1.4366726875305176, + "learning_rate": 4.949723565399803e-05, + "loss": 5.2908, + "step": 10753 + }, + { + "epoch": 0.06395708440384433, + "grad_norm": 1.6679248809814453, + "learning_rate": 4.9497142444293906e-05, + "loss": 5.1079, + "step": 10754 + }, + { + "epoch": 0.06396303168712532, + "grad_norm": 1.6619216203689575, + "learning_rate": 4.949704922603808e-05, + "loss": 5.1504, + "step": 10755 + }, + { + "epoch": 0.06396897897040632, + "grad_norm": 1.7149940729141235, + "learning_rate": 4.9496955999230586e-05, + "loss": 5.3031, + "step": 10756 + }, + { + "epoch": 0.06397492625368732, + "grad_norm": 1.711256504058838, + "learning_rate": 4.9496862763871456e-05, + "loss": 5.2146, + "step": 10757 + }, + { + "epoch": 0.06398087353696831, + "grad_norm": 1.654680609703064, + "learning_rate": 4.949676951996073e-05, + "loss": 5.2774, + "step": 10758 + }, + { + "epoch": 0.06398682082024931, + "grad_norm": 1.5115636587142944, + "learning_rate": 4.949667626749843e-05, + "loss": 5.2155, + "step": 10759 + }, + { + "epoch": 0.0639927681035303, + "grad_norm": 1.7153947353363037, + "learning_rate": 4.9496583006484596e-05, + "loss": 5.2711, + "step": 10760 + }, + { + "epoch": 0.0639987153868113, + "grad_norm": 1.8497945070266724, + "learning_rate": 4.949648973691926e-05, + "loss": 5.2864, + "step": 10761 + }, + { + "epoch": 0.0640046626700923, + "grad_norm": 1.5251562595367432, + "learning_rate": 4.9496396458802455e-05, + "loss": 5.2532, + "step": 10762 + }, + { + "epoch": 0.0640106099533733, + "grad_norm": 1.5916621685028076, + "learning_rate": 4.94963031721342e-05, + "loss": 5.2136, + "step": 10763 + }, + { + "epoch": 0.0640165572366543, + "grad_norm": 1.5781627893447876, + "learning_rate": 4.949620987691455e-05, + "loss": 5.3188, + "step": 10764 + }, + { + "epoch": 0.0640225045199353, + "grad_norm": 1.7783690690994263, + "learning_rate": 4.9496116573143515e-05, + "loss": 5.4196, + "step": 10765 + }, + { + "epoch": 0.06402845180321629, + "grad_norm": 1.5746928453445435, + "learning_rate": 4.949602326082115e-05, + "loss": 5.3724, + "step": 10766 + }, + { + "epoch": 0.06403439908649729, + "grad_norm": 1.677771806716919, + "learning_rate": 4.9495929939947475e-05, + "loss": 5.2894, + "step": 10767 + }, + { + "epoch": 0.06404034636977829, + "grad_norm": 1.7747725248336792, + "learning_rate": 4.949583661052252e-05, + "loss": 5.0527, + "step": 10768 + }, + { + "epoch": 0.06404629365305928, + "grad_norm": 1.6927893161773682, + "learning_rate": 4.9495743272546314e-05, + "loss": 5.0999, + "step": 10769 + }, + { + "epoch": 0.06405224093634028, + "grad_norm": 1.6289039850234985, + "learning_rate": 4.949564992601891e-05, + "loss": 5.4197, + "step": 10770 + }, + { + "epoch": 0.06405818821962128, + "grad_norm": 1.742658019065857, + "learning_rate": 4.9495556570940316e-05, + "loss": 5.2927, + "step": 10771 + }, + { + "epoch": 0.06406413550290227, + "grad_norm": 1.6643215417861938, + "learning_rate": 4.949546320731059e-05, + "loss": 5.3262, + "step": 10772 + }, + { + "epoch": 0.06407008278618327, + "grad_norm": 1.6400927305221558, + "learning_rate": 4.949536983512974e-05, + "loss": 5.1072, + "step": 10773 + }, + { + "epoch": 0.06407603006946427, + "grad_norm": 1.7093544006347656, + "learning_rate": 4.949527645439781e-05, + "loss": 5.1849, + "step": 10774 + }, + { + "epoch": 0.06408197735274526, + "grad_norm": 1.6980849504470825, + "learning_rate": 4.949518306511484e-05, + "loss": 5.3661, + "step": 10775 + }, + { + "epoch": 0.06408792463602626, + "grad_norm": 1.7241551876068115, + "learning_rate": 4.949508966728085e-05, + "loss": 5.3315, + "step": 10776 + }, + { + "epoch": 0.06409387191930727, + "grad_norm": 1.8421318531036377, + "learning_rate": 4.9494996260895874e-05, + "loss": 5.3506, + "step": 10777 + }, + { + "epoch": 0.06409981920258825, + "grad_norm": 1.835738182067871, + "learning_rate": 4.949490284595995e-05, + "loss": 5.2087, + "step": 10778 + }, + { + "epoch": 0.06410576648586926, + "grad_norm": 1.6622625589370728, + "learning_rate": 4.949480942247311e-05, + "loss": 5.0072, + "step": 10779 + }, + { + "epoch": 0.06411171376915026, + "grad_norm": 1.5437613725662231, + "learning_rate": 4.949471599043539e-05, + "loss": 5.182, + "step": 10780 + }, + { + "epoch": 0.06411766105243125, + "grad_norm": 1.620758295059204, + "learning_rate": 4.949462254984681e-05, + "loss": 5.2771, + "step": 10781 + }, + { + "epoch": 0.06412360833571225, + "grad_norm": 1.6143954992294312, + "learning_rate": 4.949452910070741e-05, + "loss": 5.1175, + "step": 10782 + }, + { + "epoch": 0.06412955561899325, + "grad_norm": 1.8173086643218994, + "learning_rate": 4.949443564301722e-05, + "loss": 5.175, + "step": 10783 + }, + { + "epoch": 0.06413550290227424, + "grad_norm": 1.75434148311615, + "learning_rate": 4.9494342176776284e-05, + "loss": 5.1133, + "step": 10784 + }, + { + "epoch": 0.06414145018555524, + "grad_norm": 1.7278660535812378, + "learning_rate": 4.949424870198462e-05, + "loss": 5.0704, + "step": 10785 + }, + { + "epoch": 0.06414739746883624, + "grad_norm": 1.793285608291626, + "learning_rate": 4.949415521864228e-05, + "loss": 5.1567, + "step": 10786 + }, + { + "epoch": 0.06415334475211723, + "grad_norm": 1.7892498970031738, + "learning_rate": 4.949406172674927e-05, + "loss": 5.201, + "step": 10787 + }, + { + "epoch": 0.06415929203539823, + "grad_norm": 2.276643991470337, + "learning_rate": 4.9493968226305645e-05, + "loss": 5.5555, + "step": 10788 + }, + { + "epoch": 0.06416523931867922, + "grad_norm": 1.5785993337631226, + "learning_rate": 4.9493874717311416e-05, + "loss": 5.2692, + "step": 10789 + }, + { + "epoch": 0.06417118660196022, + "grad_norm": 1.3982635736465454, + "learning_rate": 4.949378119976664e-05, + "loss": 5.24, + "step": 10790 + }, + { + "epoch": 0.06417713388524122, + "grad_norm": 1.4310967922210693, + "learning_rate": 4.949368767367133e-05, + "loss": 5.2032, + "step": 10791 + }, + { + "epoch": 0.06418308116852221, + "grad_norm": 1.5635451078414917, + "learning_rate": 4.949359413902554e-05, + "loss": 5.2589, + "step": 10792 + }, + { + "epoch": 0.06418902845180322, + "grad_norm": 1.5000566244125366, + "learning_rate": 4.949350059582927e-05, + "loss": 5.147, + "step": 10793 + }, + { + "epoch": 0.06419497573508422, + "grad_norm": 1.7782738208770752, + "learning_rate": 4.9493407044082585e-05, + "loss": 5.1987, + "step": 10794 + }, + { + "epoch": 0.0642009230183652, + "grad_norm": 1.5931564569473267, + "learning_rate": 4.94933134837855e-05, + "loss": 5.2591, + "step": 10795 + }, + { + "epoch": 0.06420687030164621, + "grad_norm": 1.619287371635437, + "learning_rate": 4.9493219914938055e-05, + "loss": 5.1041, + "step": 10796 + }, + { + "epoch": 0.06421281758492721, + "grad_norm": 1.5174281597137451, + "learning_rate": 4.949312633754028e-05, + "loss": 5.1798, + "step": 10797 + }, + { + "epoch": 0.0642187648682082, + "grad_norm": 1.6485828161239624, + "learning_rate": 4.9493032751592205e-05, + "loss": 5.1086, + "step": 10798 + }, + { + "epoch": 0.0642247121514892, + "grad_norm": 1.830984354019165, + "learning_rate": 4.949293915709386e-05, + "loss": 5.2241, + "step": 10799 + }, + { + "epoch": 0.0642306594347702, + "grad_norm": 1.9102944135665894, + "learning_rate": 4.94928455540453e-05, + "loss": 4.9652, + "step": 10800 + }, + { + "epoch": 0.06423660671805119, + "grad_norm": 1.6826778650283813, + "learning_rate": 4.949275194244653e-05, + "loss": 5.0479, + "step": 10801 + }, + { + "epoch": 0.06424255400133219, + "grad_norm": 1.7545628547668457, + "learning_rate": 4.9492658322297595e-05, + "loss": 4.9263, + "step": 10802 + }, + { + "epoch": 0.0642485012846132, + "grad_norm": 1.621121883392334, + "learning_rate": 4.949256469359852e-05, + "loss": 4.9095, + "step": 10803 + }, + { + "epoch": 0.06425444856789418, + "grad_norm": 1.727095603942871, + "learning_rate": 4.9492471056349356e-05, + "loss": 5.1913, + "step": 10804 + }, + { + "epoch": 0.06426039585117518, + "grad_norm": 1.749241590499878, + "learning_rate": 4.949237741055011e-05, + "loss": 5.4284, + "step": 10805 + }, + { + "epoch": 0.06426634313445619, + "grad_norm": 1.627784252166748, + "learning_rate": 4.9492283756200834e-05, + "loss": 5.547, + "step": 10806 + }, + { + "epoch": 0.06427229041773717, + "grad_norm": 1.8133957386016846, + "learning_rate": 4.949219009330155e-05, + "loss": 5.5841, + "step": 10807 + }, + { + "epoch": 0.06427823770101818, + "grad_norm": 1.6667630672454834, + "learning_rate": 4.949209642185231e-05, + "loss": 5.4091, + "step": 10808 + }, + { + "epoch": 0.06428418498429918, + "grad_norm": 1.601288914680481, + "learning_rate": 4.949200274185312e-05, + "loss": 4.9647, + "step": 10809 + }, + { + "epoch": 0.06429013226758017, + "grad_norm": 1.4544743299484253, + "learning_rate": 4.9491909053304025e-05, + "loss": 5.477, + "step": 10810 + }, + { + "epoch": 0.06429607955086117, + "grad_norm": 1.65786874294281, + "learning_rate": 4.949181535620506e-05, + "loss": 5.2401, + "step": 10811 + }, + { + "epoch": 0.06430202683414217, + "grad_norm": 1.561251163482666, + "learning_rate": 4.949172165055625e-05, + "loss": 5.7689, + "step": 10812 + }, + { + "epoch": 0.06430797411742316, + "grad_norm": 1.465378999710083, + "learning_rate": 4.949162793635764e-05, + "loss": 5.4109, + "step": 10813 + }, + { + "epoch": 0.06431392140070416, + "grad_norm": 1.3914259672164917, + "learning_rate": 4.949153421360926e-05, + "loss": 5.5144, + "step": 10814 + }, + { + "epoch": 0.06431986868398516, + "grad_norm": 1.6016005277633667, + "learning_rate": 4.949144048231113e-05, + "loss": 5.2708, + "step": 10815 + }, + { + "epoch": 0.06432581596726615, + "grad_norm": 1.4063479900360107, + "learning_rate": 4.94913467424633e-05, + "loss": 5.0303, + "step": 10816 + }, + { + "epoch": 0.06433176325054715, + "grad_norm": 1.5708017349243164, + "learning_rate": 4.9491252994065785e-05, + "loss": 5.3104, + "step": 10817 + }, + { + "epoch": 0.06433771053382814, + "grad_norm": 1.5542651414871216, + "learning_rate": 4.9491159237118626e-05, + "loss": 5.1308, + "step": 10818 + }, + { + "epoch": 0.06434365781710914, + "grad_norm": 1.3946558237075806, + "learning_rate": 4.9491065471621855e-05, + "loss": 5.243, + "step": 10819 + }, + { + "epoch": 0.06434960510039014, + "grad_norm": 1.3560529947280884, + "learning_rate": 4.9490971697575513e-05, + "loss": 4.9319, + "step": 10820 + }, + { + "epoch": 0.06435555238367113, + "grad_norm": 1.6921281814575195, + "learning_rate": 4.949087791497963e-05, + "loss": 5.2203, + "step": 10821 + }, + { + "epoch": 0.06436149966695213, + "grad_norm": 1.5226655006408691, + "learning_rate": 4.9490784123834225e-05, + "loss": 5.1879, + "step": 10822 + }, + { + "epoch": 0.06436744695023314, + "grad_norm": 1.5012669563293457, + "learning_rate": 4.9490690324139346e-05, + "loss": 5.2373, + "step": 10823 + }, + { + "epoch": 0.06437339423351413, + "grad_norm": 1.8050286769866943, + "learning_rate": 4.949059651589502e-05, + "loss": 5.0441, + "step": 10824 + }, + { + "epoch": 0.06437934151679513, + "grad_norm": 1.6800918579101562, + "learning_rate": 4.9490502699101274e-05, + "loss": 5.0871, + "step": 10825 + }, + { + "epoch": 0.06438528880007613, + "grad_norm": 1.4211550951004028, + "learning_rate": 4.949040887375814e-05, + "loss": 5.118, + "step": 10826 + }, + { + "epoch": 0.06439123608335712, + "grad_norm": 1.7064868211746216, + "learning_rate": 4.949031503986568e-05, + "loss": 5.2285, + "step": 10827 + }, + { + "epoch": 0.06439718336663812, + "grad_norm": 1.862491250038147, + "learning_rate": 4.949022119742388e-05, + "loss": 5.0958, + "step": 10828 + }, + { + "epoch": 0.06440313064991912, + "grad_norm": 1.933610200881958, + "learning_rate": 4.949012734643281e-05, + "loss": 5.1282, + "step": 10829 + }, + { + "epoch": 0.06440907793320011, + "grad_norm": 1.6140058040618896, + "learning_rate": 4.949003348689249e-05, + "loss": 4.9913, + "step": 10830 + }, + { + "epoch": 0.06441502521648111, + "grad_norm": 1.6881496906280518, + "learning_rate": 4.948993961880295e-05, + "loss": 5.1017, + "step": 10831 + }, + { + "epoch": 0.06442097249976211, + "grad_norm": 1.7887358665466309, + "learning_rate": 4.948984574216422e-05, + "loss": 5.1503, + "step": 10832 + }, + { + "epoch": 0.0644269197830431, + "grad_norm": 1.635720133781433, + "learning_rate": 4.948975185697634e-05, + "loss": 5.3381, + "step": 10833 + }, + { + "epoch": 0.0644328670663241, + "grad_norm": 1.6106109619140625, + "learning_rate": 4.9489657963239346e-05, + "loss": 5.0498, + "step": 10834 + }, + { + "epoch": 0.0644388143496051, + "grad_norm": 1.740438461303711, + "learning_rate": 4.9489564060953266e-05, + "loss": 5.0302, + "step": 10835 + }, + { + "epoch": 0.0644447616328861, + "grad_norm": 1.663994312286377, + "learning_rate": 4.9489470150118124e-05, + "loss": 5.1976, + "step": 10836 + }, + { + "epoch": 0.0644507089161671, + "grad_norm": 1.6748932600021362, + "learning_rate": 4.9489376230733965e-05, + "loss": 5.0055, + "step": 10837 + }, + { + "epoch": 0.0644566561994481, + "grad_norm": 1.7139437198638916, + "learning_rate": 4.948928230280082e-05, + "loss": 4.9617, + "step": 10838 + }, + { + "epoch": 0.06446260348272909, + "grad_norm": 1.698791742324829, + "learning_rate": 4.948918836631872e-05, + "loss": 4.9725, + "step": 10839 + }, + { + "epoch": 0.06446855076601009, + "grad_norm": 1.6961768865585327, + "learning_rate": 4.94890944212877e-05, + "loss": 4.9126, + "step": 10840 + }, + { + "epoch": 0.06447449804929109, + "grad_norm": 1.6551483869552612, + "learning_rate": 4.948900046770778e-05, + "loss": 5.0775, + "step": 10841 + }, + { + "epoch": 0.06448044533257208, + "grad_norm": 1.5863447189331055, + "learning_rate": 4.948890650557901e-05, + "loss": 5.0467, + "step": 10842 + }, + { + "epoch": 0.06448639261585308, + "grad_norm": 1.5629637241363525, + "learning_rate": 4.9488812534901414e-05, + "loss": 5.0012, + "step": 10843 + }, + { + "epoch": 0.06449233989913408, + "grad_norm": 1.5247453451156616, + "learning_rate": 4.948871855567503e-05, + "loss": 4.9928, + "step": 10844 + }, + { + "epoch": 0.06449828718241507, + "grad_norm": 1.7595921754837036, + "learning_rate": 4.948862456789988e-05, + "loss": 4.9256, + "step": 10845 + }, + { + "epoch": 0.06450423446569607, + "grad_norm": 1.6370458602905273, + "learning_rate": 4.948853057157601e-05, + "loss": 4.9499, + "step": 10846 + }, + { + "epoch": 0.06451018174897706, + "grad_norm": 1.7747406959533691, + "learning_rate": 4.948843656670345e-05, + "loss": 4.9246, + "step": 10847 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 1.6769739389419556, + "learning_rate": 4.948834255328222e-05, + "loss": 4.9561, + "step": 10848 + }, + { + "epoch": 0.06452207631553906, + "grad_norm": 1.60416841506958, + "learning_rate": 4.948824853131236e-05, + "loss": 5.0318, + "step": 10849 + }, + { + "epoch": 0.06452802359882005, + "grad_norm": 2.1050093173980713, + "learning_rate": 4.948815450079392e-05, + "loss": 5.5308, + "step": 10850 + }, + { + "epoch": 0.06453397088210105, + "grad_norm": 1.7474935054779053, + "learning_rate": 4.948806046172691e-05, + "loss": 5.0752, + "step": 10851 + }, + { + "epoch": 0.06453991816538206, + "grad_norm": 1.8992688655853271, + "learning_rate": 4.948796641411138e-05, + "loss": 5.3704, + "step": 10852 + }, + { + "epoch": 0.06454586544866305, + "grad_norm": 1.9632636308670044, + "learning_rate": 4.948787235794734e-05, + "loss": 5.4173, + "step": 10853 + }, + { + "epoch": 0.06455181273194405, + "grad_norm": 1.9034284353256226, + "learning_rate": 4.948777829323484e-05, + "loss": 5.2655, + "step": 10854 + }, + { + "epoch": 0.06455776001522505, + "grad_norm": 1.716711163520813, + "learning_rate": 4.9487684219973914e-05, + "loss": 5.4192, + "step": 10855 + }, + { + "epoch": 0.06456370729850604, + "grad_norm": 1.7886557579040527, + "learning_rate": 4.948759013816459e-05, + "loss": 5.2828, + "step": 10856 + }, + { + "epoch": 0.06456965458178704, + "grad_norm": 2.004117250442505, + "learning_rate": 4.9487496047806905e-05, + "loss": 4.9521, + "step": 10857 + }, + { + "epoch": 0.06457560186506804, + "grad_norm": 1.627955436706543, + "learning_rate": 4.948740194890088e-05, + "loss": 5.4288, + "step": 10858 + }, + { + "epoch": 0.06458154914834903, + "grad_norm": 2.2537145614624023, + "learning_rate": 4.948730784144656e-05, + "loss": 5.8176, + "step": 10859 + }, + { + "epoch": 0.06458749643163003, + "grad_norm": 2.216066837310791, + "learning_rate": 4.948721372544397e-05, + "loss": 5.4569, + "step": 10860 + }, + { + "epoch": 0.06459344371491103, + "grad_norm": 1.7641898393630981, + "learning_rate": 4.948711960089315e-05, + "loss": 5.659, + "step": 10861 + }, + { + "epoch": 0.06459939099819202, + "grad_norm": 1.9137814044952393, + "learning_rate": 4.948702546779413e-05, + "loss": 5.6275, + "step": 10862 + }, + { + "epoch": 0.06460533828147302, + "grad_norm": 2.2355434894561768, + "learning_rate": 4.948693132614694e-05, + "loss": 5.1712, + "step": 10863 + }, + { + "epoch": 0.06461128556475403, + "grad_norm": 1.780849814414978, + "learning_rate": 4.9486837175951616e-05, + "loss": 5.4521, + "step": 10864 + }, + { + "epoch": 0.06461723284803501, + "grad_norm": 1.8078423738479614, + "learning_rate": 4.948674301720819e-05, + "loss": 5.3609, + "step": 10865 + }, + { + "epoch": 0.06462318013131602, + "grad_norm": 1.590707540512085, + "learning_rate": 4.94866488499167e-05, + "loss": 5.4121, + "step": 10866 + }, + { + "epoch": 0.06462912741459702, + "grad_norm": 1.4369510412216187, + "learning_rate": 4.948655467407717e-05, + "loss": 5.418, + "step": 10867 + }, + { + "epoch": 0.064635074697878, + "grad_norm": 1.5800751447677612, + "learning_rate": 4.9486460489689634e-05, + "loss": 5.3492, + "step": 10868 + }, + { + "epoch": 0.06464102198115901, + "grad_norm": 1.5271484851837158, + "learning_rate": 4.948636629675413e-05, + "loss": 5.2758, + "step": 10869 + }, + { + "epoch": 0.06464696926444001, + "grad_norm": 1.7175722122192383, + "learning_rate": 4.948627209527069e-05, + "loss": 5.2939, + "step": 10870 + }, + { + "epoch": 0.064652916547721, + "grad_norm": 1.568851113319397, + "learning_rate": 4.948617788523935e-05, + "loss": 5.2559, + "step": 10871 + }, + { + "epoch": 0.064658863831002, + "grad_norm": 1.4012210369110107, + "learning_rate": 4.9486083666660135e-05, + "loss": 5.3195, + "step": 10872 + }, + { + "epoch": 0.064664811114283, + "grad_norm": 1.5386475324630737, + "learning_rate": 4.948598943953308e-05, + "loss": 5.293, + "step": 10873 + }, + { + "epoch": 0.06467075839756399, + "grad_norm": 1.4143292903900146, + "learning_rate": 4.948589520385821e-05, + "loss": 5.2181, + "step": 10874 + }, + { + "epoch": 0.06467670568084499, + "grad_norm": 1.392470121383667, + "learning_rate": 4.9485800959635576e-05, + "loss": 5.3074, + "step": 10875 + }, + { + "epoch": 0.06468265296412598, + "grad_norm": 1.7176567316055298, + "learning_rate": 4.94857067068652e-05, + "loss": 5.3024, + "step": 10876 + }, + { + "epoch": 0.06468860024740698, + "grad_norm": 1.5002285242080688, + "learning_rate": 4.9485612445547115e-05, + "loss": 5.1543, + "step": 10877 + }, + { + "epoch": 0.06469454753068798, + "grad_norm": 1.5615242719650269, + "learning_rate": 4.9485518175681364e-05, + "loss": 5.371, + "step": 10878 + }, + { + "epoch": 0.06470049481396897, + "grad_norm": 1.4294706583023071, + "learning_rate": 4.9485423897267966e-05, + "loss": 5.4151, + "step": 10879 + }, + { + "epoch": 0.06470644209724997, + "grad_norm": 2.0147571563720703, + "learning_rate": 4.948532961030695e-05, + "loss": 5.3082, + "step": 10880 + }, + { + "epoch": 0.06471238938053098, + "grad_norm": 1.5661358833312988, + "learning_rate": 4.948523531479837e-05, + "loss": 5.8232, + "step": 10881 + }, + { + "epoch": 0.06471833666381197, + "grad_norm": 1.5608779191970825, + "learning_rate": 4.9485141010742245e-05, + "loss": 5.5648, + "step": 10882 + }, + { + "epoch": 0.06472428394709297, + "grad_norm": 2.3148789405822754, + "learning_rate": 4.948504669813861e-05, + "loss": 4.8802, + "step": 10883 + }, + { + "epoch": 0.06473023123037397, + "grad_norm": 1.9495759010314941, + "learning_rate": 4.9484952376987504e-05, + "loss": 5.1985, + "step": 10884 + }, + { + "epoch": 0.06473617851365496, + "grad_norm": 2.031764268875122, + "learning_rate": 4.9484858047288944e-05, + "loss": 5.0772, + "step": 10885 + }, + { + "epoch": 0.06474212579693596, + "grad_norm": 1.6575301885604858, + "learning_rate": 4.948476370904298e-05, + "loss": 5.2157, + "step": 10886 + }, + { + "epoch": 0.06474807308021696, + "grad_norm": 1.6381278038024902, + "learning_rate": 4.948466936224964e-05, + "loss": 5.1168, + "step": 10887 + }, + { + "epoch": 0.06475402036349795, + "grad_norm": 1.672555923461914, + "learning_rate": 4.9484575006908945e-05, + "loss": 5.2839, + "step": 10888 + }, + { + "epoch": 0.06475996764677895, + "grad_norm": 1.8838026523590088, + "learning_rate": 4.9484480643020944e-05, + "loss": 5.301, + "step": 10889 + }, + { + "epoch": 0.06476591493005995, + "grad_norm": 1.935205101966858, + "learning_rate": 4.9484386270585656e-05, + "loss": 5.2898, + "step": 10890 + }, + { + "epoch": 0.06477186221334094, + "grad_norm": 1.630003809928894, + "learning_rate": 4.9484291889603134e-05, + "loss": 5.181, + "step": 10891 + }, + { + "epoch": 0.06477780949662194, + "grad_norm": 1.5095784664154053, + "learning_rate": 4.948419750007339e-05, + "loss": 5.3159, + "step": 10892 + }, + { + "epoch": 0.06478375677990295, + "grad_norm": 1.7217234373092651, + "learning_rate": 4.948410310199647e-05, + "loss": 5.3395, + "step": 10893 + }, + { + "epoch": 0.06478970406318393, + "grad_norm": 1.727953314781189, + "learning_rate": 4.94840086953724e-05, + "loss": 5.1374, + "step": 10894 + }, + { + "epoch": 0.06479565134646494, + "grad_norm": 1.7891777753829956, + "learning_rate": 4.9483914280201224e-05, + "loss": 5.2145, + "step": 10895 + }, + { + "epoch": 0.06480159862974594, + "grad_norm": 1.7402048110961914, + "learning_rate": 4.9483819856482956e-05, + "loss": 5.1723, + "step": 10896 + }, + { + "epoch": 0.06480754591302693, + "grad_norm": 1.6635658740997314, + "learning_rate": 4.9483725424217644e-05, + "loss": 5.0995, + "step": 10897 + }, + { + "epoch": 0.06481349319630793, + "grad_norm": 1.6190650463104248, + "learning_rate": 4.9483630983405317e-05, + "loss": 5.2062, + "step": 10898 + }, + { + "epoch": 0.06481944047958893, + "grad_norm": 1.6335800886154175, + "learning_rate": 4.9483536534046006e-05, + "loss": 5.4298, + "step": 10899 + }, + { + "epoch": 0.06482538776286992, + "grad_norm": 1.7549209594726562, + "learning_rate": 4.948344207613974e-05, + "loss": 5.1833, + "step": 10900 + }, + { + "epoch": 0.06483133504615092, + "grad_norm": 1.6011431217193604, + "learning_rate": 4.948334760968656e-05, + "loss": 5.2329, + "step": 10901 + }, + { + "epoch": 0.06483728232943192, + "grad_norm": 1.627424955368042, + "learning_rate": 4.9483253134686505e-05, + "loss": 5.3059, + "step": 10902 + }, + { + "epoch": 0.06484322961271291, + "grad_norm": 1.593361258506775, + "learning_rate": 4.948315865113959e-05, + "loss": 5.2711, + "step": 10903 + }, + { + "epoch": 0.06484917689599391, + "grad_norm": 1.5899426937103271, + "learning_rate": 4.9483064159045854e-05, + "loss": 5.2449, + "step": 10904 + }, + { + "epoch": 0.0648551241792749, + "grad_norm": 1.6572548151016235, + "learning_rate": 4.948296965840534e-05, + "loss": 5.18, + "step": 10905 + }, + { + "epoch": 0.0648610714625559, + "grad_norm": 1.649928092956543, + "learning_rate": 4.948287514921808e-05, + "loss": 5.2434, + "step": 10906 + }, + { + "epoch": 0.0648670187458369, + "grad_norm": 1.4546284675598145, + "learning_rate": 4.9482780631484094e-05, + "loss": 5.405, + "step": 10907 + }, + { + "epoch": 0.06487296602911789, + "grad_norm": 1.624617338180542, + "learning_rate": 4.9482686105203425e-05, + "loss": 5.3537, + "step": 10908 + }, + { + "epoch": 0.0648789133123989, + "grad_norm": 1.5108991861343384, + "learning_rate": 4.94825915703761e-05, + "loss": 5.1709, + "step": 10909 + }, + { + "epoch": 0.0648848605956799, + "grad_norm": 1.571028470993042, + "learning_rate": 4.948249702700215e-05, + "loss": 5.1374, + "step": 10910 + }, + { + "epoch": 0.06489080787896088, + "grad_norm": 1.3280094861984253, + "learning_rate": 4.948240247508162e-05, + "loss": 5.3469, + "step": 10911 + }, + { + "epoch": 0.06489675516224189, + "grad_norm": 1.8487119674682617, + "learning_rate": 4.948230791461454e-05, + "loss": 5.4673, + "step": 10912 + }, + { + "epoch": 0.06490270244552289, + "grad_norm": 1.6253544092178345, + "learning_rate": 4.9482213345600936e-05, + "loss": 5.2096, + "step": 10913 + }, + { + "epoch": 0.06490864972880388, + "grad_norm": 1.8487451076507568, + "learning_rate": 4.9482118768040844e-05, + "loss": 5.1452, + "step": 10914 + }, + { + "epoch": 0.06491459701208488, + "grad_norm": 1.6638668775558472, + "learning_rate": 4.948202418193429e-05, + "loss": 5.2382, + "step": 10915 + }, + { + "epoch": 0.06492054429536588, + "grad_norm": 1.662256121635437, + "learning_rate": 4.9481929587281326e-05, + "loss": 5.3125, + "step": 10916 + }, + { + "epoch": 0.06492649157864687, + "grad_norm": 1.5133339166641235, + "learning_rate": 4.948183498408197e-05, + "loss": 5.2494, + "step": 10917 + }, + { + "epoch": 0.06493243886192787, + "grad_norm": 1.5063300132751465, + "learning_rate": 4.9481740372336256e-05, + "loss": 5.1778, + "step": 10918 + }, + { + "epoch": 0.06493838614520887, + "grad_norm": 1.5223631858825684, + "learning_rate": 4.948164575204421e-05, + "loss": 5.1773, + "step": 10919 + }, + { + "epoch": 0.06494433342848986, + "grad_norm": 1.6163926124572754, + "learning_rate": 4.948155112320589e-05, + "loss": 5.2669, + "step": 10920 + }, + { + "epoch": 0.06495028071177086, + "grad_norm": 1.4077887535095215, + "learning_rate": 4.948145648582131e-05, + "loss": 5.1711, + "step": 10921 + }, + { + "epoch": 0.06495622799505187, + "grad_norm": 1.5710374116897583, + "learning_rate": 4.9481361839890505e-05, + "loss": 5.1687, + "step": 10922 + }, + { + "epoch": 0.06496217527833285, + "grad_norm": 1.5444159507751465, + "learning_rate": 4.9481267185413506e-05, + "loss": 5.2681, + "step": 10923 + }, + { + "epoch": 0.06496812256161386, + "grad_norm": 1.4816917181015015, + "learning_rate": 4.948117252239035e-05, + "loss": 5.2897, + "step": 10924 + }, + { + "epoch": 0.06497406984489486, + "grad_norm": 1.3373851776123047, + "learning_rate": 4.9481077850821075e-05, + "loss": 5.1607, + "step": 10925 + }, + { + "epoch": 0.06498001712817585, + "grad_norm": 1.7353702783584595, + "learning_rate": 4.948098317070571e-05, + "loss": 5.2546, + "step": 10926 + }, + { + "epoch": 0.06498596441145685, + "grad_norm": 1.4494054317474365, + "learning_rate": 4.948088848204428e-05, + "loss": 5.2244, + "step": 10927 + }, + { + "epoch": 0.06499191169473785, + "grad_norm": 1.6031813621520996, + "learning_rate": 4.9480793784836825e-05, + "loss": 5.2487, + "step": 10928 + }, + { + "epoch": 0.06499785897801884, + "grad_norm": 1.4134970903396606, + "learning_rate": 4.948069907908338e-05, + "loss": 5.2224, + "step": 10929 + }, + { + "epoch": 0.06500380626129984, + "grad_norm": 1.5790150165557861, + "learning_rate": 4.948060436478398e-05, + "loss": 5.3096, + "step": 10930 + }, + { + "epoch": 0.06500975354458084, + "grad_norm": 1.3925936222076416, + "learning_rate": 4.9480509641938644e-05, + "loss": 5.1823, + "step": 10931 + }, + { + "epoch": 0.06501570082786183, + "grad_norm": 1.40078866481781, + "learning_rate": 4.948041491054742e-05, + "loss": 5.1352, + "step": 10932 + }, + { + "epoch": 0.06502164811114283, + "grad_norm": 1.509726881980896, + "learning_rate": 4.948032017061034e-05, + "loss": 5.199, + "step": 10933 + }, + { + "epoch": 0.06502759539442382, + "grad_norm": 1.5671876668930054, + "learning_rate": 4.948022542212743e-05, + "loss": 5.2323, + "step": 10934 + }, + { + "epoch": 0.06503354267770482, + "grad_norm": 1.5019149780273438, + "learning_rate": 4.948013066509872e-05, + "loss": 5.244, + "step": 10935 + }, + { + "epoch": 0.06503948996098582, + "grad_norm": 1.576842188835144, + "learning_rate": 4.948003589952426e-05, + "loss": 5.153, + "step": 10936 + }, + { + "epoch": 0.06504543724426681, + "grad_norm": 1.4069315195083618, + "learning_rate": 4.9479941125404074e-05, + "loss": 5.3396, + "step": 10937 + }, + { + "epoch": 0.06505138452754781, + "grad_norm": 1.6663076877593994, + "learning_rate": 4.947984634273818e-05, + "loss": 5.223, + "step": 10938 + }, + { + "epoch": 0.06505733181082882, + "grad_norm": 1.5132073163986206, + "learning_rate": 4.947975155152663e-05, + "loss": 5.1335, + "step": 10939 + }, + { + "epoch": 0.0650632790941098, + "grad_norm": 1.59386146068573, + "learning_rate": 4.9479656751769455e-05, + "loss": 5.4893, + "step": 10940 + }, + { + "epoch": 0.06506922637739081, + "grad_norm": 1.3486778736114502, + "learning_rate": 4.9479561943466686e-05, + "loss": 5.2164, + "step": 10941 + }, + { + "epoch": 0.06507517366067181, + "grad_norm": 1.4107574224472046, + "learning_rate": 4.947946712661835e-05, + "loss": 5.2337, + "step": 10942 + }, + { + "epoch": 0.0650811209439528, + "grad_norm": 1.6905080080032349, + "learning_rate": 4.947937230122449e-05, + "loss": 5.1749, + "step": 10943 + }, + { + "epoch": 0.0650870682272338, + "grad_norm": 1.5062333345413208, + "learning_rate": 4.947927746728513e-05, + "loss": 5.2227, + "step": 10944 + }, + { + "epoch": 0.0650930155105148, + "grad_norm": 1.4318712949752808, + "learning_rate": 4.947918262480031e-05, + "loss": 5.1565, + "step": 10945 + }, + { + "epoch": 0.06509896279379579, + "grad_norm": 1.5121338367462158, + "learning_rate": 4.9479087773770055e-05, + "loss": 5.3718, + "step": 10946 + }, + { + "epoch": 0.06510491007707679, + "grad_norm": 1.2901450395584106, + "learning_rate": 4.947899291419441e-05, + "loss": 5.291, + "step": 10947 + }, + { + "epoch": 0.0651108573603578, + "grad_norm": 1.5350853204727173, + "learning_rate": 4.9478898046073394e-05, + "loss": 5.411, + "step": 10948 + }, + { + "epoch": 0.06511680464363878, + "grad_norm": 1.5083260536193848, + "learning_rate": 4.947880316940705e-05, + "loss": 4.9143, + "step": 10949 + }, + { + "epoch": 0.06512275192691978, + "grad_norm": 1.462415099143982, + "learning_rate": 4.947870828419541e-05, + "loss": 5.0059, + "step": 10950 + }, + { + "epoch": 0.06512869921020079, + "grad_norm": 1.9356911182403564, + "learning_rate": 4.947861339043851e-05, + "loss": 5.3886, + "step": 10951 + }, + { + "epoch": 0.06513464649348177, + "grad_norm": 1.4918417930603027, + "learning_rate": 4.947851848813637e-05, + "loss": 5.3456, + "step": 10952 + }, + { + "epoch": 0.06514059377676278, + "grad_norm": 1.8015687465667725, + "learning_rate": 4.9478423577289044e-05, + "loss": 5.4599, + "step": 10953 + }, + { + "epoch": 0.06514654106004378, + "grad_norm": 1.663827657699585, + "learning_rate": 4.947832865789654e-05, + "loss": 5.4448, + "step": 10954 + }, + { + "epoch": 0.06515248834332477, + "grad_norm": 1.7196985483169556, + "learning_rate": 4.947823372995891e-05, + "loss": 5.4799, + "step": 10955 + }, + { + "epoch": 0.06515843562660577, + "grad_norm": 1.341449499130249, + "learning_rate": 4.947813879347619e-05, + "loss": 5.0305, + "step": 10956 + }, + { + "epoch": 0.06516438290988677, + "grad_norm": 1.9917103052139282, + "learning_rate": 4.9478043848448394e-05, + "loss": 4.9911, + "step": 10957 + }, + { + "epoch": 0.06517033019316776, + "grad_norm": 1.8540695905685425, + "learning_rate": 4.947794889487557e-05, + "loss": 4.9725, + "step": 10958 + }, + { + "epoch": 0.06517627747644876, + "grad_norm": 1.6755226850509644, + "learning_rate": 4.9477853932757744e-05, + "loss": 5.1452, + "step": 10959 + }, + { + "epoch": 0.06518222475972976, + "grad_norm": 1.613694667816162, + "learning_rate": 4.9477758962094954e-05, + "loss": 5.1241, + "step": 10960 + }, + { + "epoch": 0.06518817204301075, + "grad_norm": 1.4891341924667358, + "learning_rate": 4.9477663982887235e-05, + "loss": 5.2139, + "step": 10961 + }, + { + "epoch": 0.06519411932629175, + "grad_norm": 1.451180100440979, + "learning_rate": 4.947756899513461e-05, + "loss": 5.216, + "step": 10962 + }, + { + "epoch": 0.06520006660957274, + "grad_norm": 1.7225643396377563, + "learning_rate": 4.947747399883712e-05, + "loss": 4.9342, + "step": 10963 + }, + { + "epoch": 0.06520601389285374, + "grad_norm": 1.5917341709136963, + "learning_rate": 4.94773789939948e-05, + "loss": 4.9196, + "step": 10964 + }, + { + "epoch": 0.06521196117613474, + "grad_norm": 1.3010936975479126, + "learning_rate": 4.947728398060768e-05, + "loss": 4.8165, + "step": 10965 + }, + { + "epoch": 0.06521790845941573, + "grad_norm": 1.6672911643981934, + "learning_rate": 4.947718895867579e-05, + "loss": 5.082, + "step": 10966 + }, + { + "epoch": 0.06522385574269673, + "grad_norm": 1.5662728548049927, + "learning_rate": 4.947709392819916e-05, + "loss": 5.1654, + "step": 10967 + }, + { + "epoch": 0.06522980302597774, + "grad_norm": 1.3455015420913696, + "learning_rate": 4.947699888917784e-05, + "loss": 4.6897, + "step": 10968 + }, + { + "epoch": 0.06523575030925872, + "grad_norm": 1.6042569875717163, + "learning_rate": 4.947690384161185e-05, + "loss": 4.6814, + "step": 10969 + }, + { + "epoch": 0.06524169759253973, + "grad_norm": 1.436345100402832, + "learning_rate": 4.947680878550123e-05, + "loss": 4.6052, + "step": 10970 + }, + { + "epoch": 0.06524764487582073, + "grad_norm": 1.3438220024108887, + "learning_rate": 4.9476713720846e-05, + "loss": 4.6385, + "step": 10971 + }, + { + "epoch": 0.06525359215910172, + "grad_norm": 1.378206729888916, + "learning_rate": 4.94766186476462e-05, + "loss": 4.5546, + "step": 10972 + }, + { + "epoch": 0.06525953944238272, + "grad_norm": 1.5776808261871338, + "learning_rate": 4.9476523565901874e-05, + "loss": 4.7728, + "step": 10973 + }, + { + "epoch": 0.06526548672566372, + "grad_norm": 1.8892265558242798, + "learning_rate": 4.947642847561305e-05, + "loss": 5.3423, + "step": 10974 + }, + { + "epoch": 0.06527143400894471, + "grad_norm": 1.279730200767517, + "learning_rate": 4.9476333376779746e-05, + "loss": 4.649, + "step": 10975 + }, + { + "epoch": 0.06527738129222571, + "grad_norm": 1.6268417835235596, + "learning_rate": 4.947623826940201e-05, + "loss": 4.6534, + "step": 10976 + }, + { + "epoch": 0.06528332857550671, + "grad_norm": 1.4456939697265625, + "learning_rate": 4.947614315347987e-05, + "loss": 4.6636, + "step": 10977 + }, + { + "epoch": 0.0652892758587877, + "grad_norm": 1.4848358631134033, + "learning_rate": 4.947604802901337e-05, + "loss": 4.6823, + "step": 10978 + }, + { + "epoch": 0.0652952231420687, + "grad_norm": 1.4143959283828735, + "learning_rate": 4.947595289600253e-05, + "loss": 4.546, + "step": 10979 + }, + { + "epoch": 0.0653011704253497, + "grad_norm": 1.7399781942367554, + "learning_rate": 4.947585775444739e-05, + "loss": 5.1456, + "step": 10980 + }, + { + "epoch": 0.0653071177086307, + "grad_norm": 1.9160579442977905, + "learning_rate": 4.947576260434797e-05, + "loss": 5.4101, + "step": 10981 + }, + { + "epoch": 0.0653130649919117, + "grad_norm": 1.9356415271759033, + "learning_rate": 4.947566744570433e-05, + "loss": 5.6235, + "step": 10982 + }, + { + "epoch": 0.0653190122751927, + "grad_norm": 1.756996512413025, + "learning_rate": 4.947557227851648e-05, + "loss": 5.6458, + "step": 10983 + }, + { + "epoch": 0.06532495955847369, + "grad_norm": 1.790447473526001, + "learning_rate": 4.947547710278446e-05, + "loss": 5.1529, + "step": 10984 + }, + { + "epoch": 0.06533090684175469, + "grad_norm": 1.8125256299972534, + "learning_rate": 4.94753819185083e-05, + "loss": 4.8824, + "step": 10985 + }, + { + "epoch": 0.06533685412503569, + "grad_norm": 1.72708261013031, + "learning_rate": 4.947528672568804e-05, + "loss": 5.1252, + "step": 10986 + }, + { + "epoch": 0.06534280140831668, + "grad_norm": 1.5867630243301392, + "learning_rate": 4.9475191524323714e-05, + "loss": 5.2007, + "step": 10987 + }, + { + "epoch": 0.06534874869159768, + "grad_norm": 1.8278383016586304, + "learning_rate": 4.9475096314415356e-05, + "loss": 5.1268, + "step": 10988 + }, + { + "epoch": 0.06535469597487868, + "grad_norm": 1.6850647926330566, + "learning_rate": 4.947500109596298e-05, + "loss": 5.0058, + "step": 10989 + }, + { + "epoch": 0.06536064325815967, + "grad_norm": 1.4993211030960083, + "learning_rate": 4.9474905868966645e-05, + "loss": 5.1911, + "step": 10990 + }, + { + "epoch": 0.06536659054144067, + "grad_norm": 1.4816709756851196, + "learning_rate": 4.947481063342637e-05, + "loss": 5.073, + "step": 10991 + }, + { + "epoch": 0.06537253782472166, + "grad_norm": 1.5394763946533203, + "learning_rate": 4.9474715389342194e-05, + "loss": 5.3133, + "step": 10992 + }, + { + "epoch": 0.06537848510800266, + "grad_norm": 1.6095061302185059, + "learning_rate": 4.9474620136714144e-05, + "loss": 5.1657, + "step": 10993 + }, + { + "epoch": 0.06538443239128366, + "grad_norm": 1.707533597946167, + "learning_rate": 4.947452487554226e-05, + "loss": 5.2022, + "step": 10994 + }, + { + "epoch": 0.06539037967456465, + "grad_norm": 1.6304863691329956, + "learning_rate": 4.947442960582657e-05, + "loss": 5.1454, + "step": 10995 + }, + { + "epoch": 0.06539632695784565, + "grad_norm": 1.5767943859100342, + "learning_rate": 4.9474334327567103e-05, + "loss": 5.0317, + "step": 10996 + }, + { + "epoch": 0.06540227424112666, + "grad_norm": 1.6779369115829468, + "learning_rate": 4.9474239040763916e-05, + "loss": 5.1932, + "step": 10997 + }, + { + "epoch": 0.06540822152440764, + "grad_norm": 1.6607457399368286, + "learning_rate": 4.947414374541701e-05, + "loss": 5.2488, + "step": 10998 + }, + { + "epoch": 0.06541416880768865, + "grad_norm": 1.5271342992782593, + "learning_rate": 4.947404844152644e-05, + "loss": 5.2225, + "step": 10999 + }, + { + "epoch": 0.06542011609096965, + "grad_norm": 1.3633404970169067, + "learning_rate": 4.947395312909223e-05, + "loss": 5.2228, + "step": 11000 + }, + { + "epoch": 0.06542606337425064, + "grad_norm": 1.4911702871322632, + "learning_rate": 4.9473857808114416e-05, + "loss": 5.3533, + "step": 11001 + }, + { + "epoch": 0.06543201065753164, + "grad_norm": 1.350714087486267, + "learning_rate": 4.947376247859303e-05, + "loss": 5.2553, + "step": 11002 + }, + { + "epoch": 0.06543795794081264, + "grad_norm": 1.531064510345459, + "learning_rate": 4.9473667140528116e-05, + "loss": 5.0982, + "step": 11003 + }, + { + "epoch": 0.06544390522409363, + "grad_norm": 1.4037193059921265, + "learning_rate": 4.947357179391968e-05, + "loss": 5.2129, + "step": 11004 + }, + { + "epoch": 0.06544985250737463, + "grad_norm": 1.5746560096740723, + "learning_rate": 4.9473476438767784e-05, + "loss": 5.2561, + "step": 11005 + }, + { + "epoch": 0.06545579979065563, + "grad_norm": 1.4906586408615112, + "learning_rate": 4.947338107507245e-05, + "loss": 5.2584, + "step": 11006 + }, + { + "epoch": 0.06546174707393662, + "grad_norm": 1.687965989112854, + "learning_rate": 4.947328570283371e-05, + "loss": 5.0578, + "step": 11007 + }, + { + "epoch": 0.06546769435721762, + "grad_norm": 1.6732810735702515, + "learning_rate": 4.94731903220516e-05, + "loss": 5.1301, + "step": 11008 + }, + { + "epoch": 0.06547364164049863, + "grad_norm": 1.465431809425354, + "learning_rate": 4.947309493272615e-05, + "loss": 5.2479, + "step": 11009 + }, + { + "epoch": 0.06547958892377961, + "grad_norm": 1.4699040651321411, + "learning_rate": 4.94729995348574e-05, + "loss": 5.263, + "step": 11010 + }, + { + "epoch": 0.06548553620706062, + "grad_norm": 1.5757801532745361, + "learning_rate": 4.947290412844537e-05, + "loss": 5.2938, + "step": 11011 + }, + { + "epoch": 0.06549148349034162, + "grad_norm": 1.5458070039749146, + "learning_rate": 4.947280871349011e-05, + "loss": 5.2755, + "step": 11012 + }, + { + "epoch": 0.0654974307736226, + "grad_norm": 1.4919404983520508, + "learning_rate": 4.9472713289991644e-05, + "loss": 5.1432, + "step": 11013 + }, + { + "epoch": 0.06550337805690361, + "grad_norm": 1.513539433479309, + "learning_rate": 4.947261785795001e-05, + "loss": 5.3262, + "step": 11014 + }, + { + "epoch": 0.06550932534018461, + "grad_norm": 1.610257863998413, + "learning_rate": 4.947252241736523e-05, + "loss": 5.1444, + "step": 11015 + }, + { + "epoch": 0.0655152726234656, + "grad_norm": 1.5597975254058838, + "learning_rate": 4.947242696823735e-05, + "loss": 5.1581, + "step": 11016 + }, + { + "epoch": 0.0655212199067466, + "grad_norm": 1.686418056488037, + "learning_rate": 4.94723315105664e-05, + "loss": 5.1608, + "step": 11017 + }, + { + "epoch": 0.0655271671900276, + "grad_norm": 1.5329445600509644, + "learning_rate": 4.94722360443524e-05, + "loss": 5.1716, + "step": 11018 + }, + { + "epoch": 0.06553311447330859, + "grad_norm": 1.4718917608261108, + "learning_rate": 4.94721405695954e-05, + "loss": 5.0924, + "step": 11019 + }, + { + "epoch": 0.06553906175658959, + "grad_norm": 1.4442907571792603, + "learning_rate": 4.947204508629544e-05, + "loss": 5.3967, + "step": 11020 + }, + { + "epoch": 0.06554500903987058, + "grad_norm": 1.523834466934204, + "learning_rate": 4.947194959445253e-05, + "loss": 5.2068, + "step": 11021 + }, + { + "epoch": 0.06555095632315158, + "grad_norm": 1.4898262023925781, + "learning_rate": 4.947185409406672e-05, + "loss": 5.1664, + "step": 11022 + }, + { + "epoch": 0.06555690360643258, + "grad_norm": 1.504695177078247, + "learning_rate": 4.947175858513804e-05, + "loss": 5.2349, + "step": 11023 + }, + { + "epoch": 0.06556285088971357, + "grad_norm": 1.3538787364959717, + "learning_rate": 4.9471663067666516e-05, + "loss": 5.1034, + "step": 11024 + }, + { + "epoch": 0.06556879817299457, + "grad_norm": 1.3748440742492676, + "learning_rate": 4.94715675416522e-05, + "loss": 4.9759, + "step": 11025 + }, + { + "epoch": 0.06557474545627558, + "grad_norm": 1.5980280637741089, + "learning_rate": 4.94714720070951e-05, + "loss": 5.3042, + "step": 11026 + }, + { + "epoch": 0.06558069273955656, + "grad_norm": 1.641076683998108, + "learning_rate": 4.9471376463995266e-05, + "loss": 5.3373, + "step": 11027 + }, + { + "epoch": 0.06558664002283757, + "grad_norm": 1.5320390462875366, + "learning_rate": 4.947128091235273e-05, + "loss": 5.2308, + "step": 11028 + }, + { + "epoch": 0.06559258730611857, + "grad_norm": 1.5777555704116821, + "learning_rate": 4.9471185352167514e-05, + "loss": 5.2242, + "step": 11029 + }, + { + "epoch": 0.06559853458939956, + "grad_norm": 1.5055029392242432, + "learning_rate": 4.947108978343967e-05, + "loss": 5.1974, + "step": 11030 + }, + { + "epoch": 0.06560448187268056, + "grad_norm": 1.3923927545547485, + "learning_rate": 4.947099420616922e-05, + "loss": 5.3244, + "step": 11031 + }, + { + "epoch": 0.06561042915596156, + "grad_norm": 1.40999174118042, + "learning_rate": 4.9470898620356186e-05, + "loss": 5.3315, + "step": 11032 + }, + { + "epoch": 0.06561637643924255, + "grad_norm": 1.418296456336975, + "learning_rate": 4.947080302600063e-05, + "loss": 5.3942, + "step": 11033 + }, + { + "epoch": 0.06562232372252355, + "grad_norm": 1.7927478551864624, + "learning_rate": 4.9470707423102566e-05, + "loss": 5.3084, + "step": 11034 + }, + { + "epoch": 0.06562827100580455, + "grad_norm": 1.385011911392212, + "learning_rate": 4.947061181166203e-05, + "loss": 5.2043, + "step": 11035 + }, + { + "epoch": 0.06563421828908554, + "grad_norm": 1.5702954530715942, + "learning_rate": 4.9470516191679054e-05, + "loss": 5.9851, + "step": 11036 + }, + { + "epoch": 0.06564016557236654, + "grad_norm": 1.4196525812149048, + "learning_rate": 4.947042056315367e-05, + "loss": 5.2592, + "step": 11037 + }, + { + "epoch": 0.06564611285564755, + "grad_norm": 1.8318798542022705, + "learning_rate": 4.947032492608592e-05, + "loss": 5.3181, + "step": 11038 + }, + { + "epoch": 0.06565206013892853, + "grad_norm": 1.615460991859436, + "learning_rate": 4.947022928047583e-05, + "loss": 5.4053, + "step": 11039 + }, + { + "epoch": 0.06565800742220954, + "grad_norm": 1.384602427482605, + "learning_rate": 4.947013362632344e-05, + "loss": 5.3955, + "step": 11040 + }, + { + "epoch": 0.06566395470549054, + "grad_norm": 1.5959913730621338, + "learning_rate": 4.947003796362878e-05, + "loss": 5.4737, + "step": 11041 + }, + { + "epoch": 0.06566990198877153, + "grad_norm": 1.483659029006958, + "learning_rate": 4.946994229239188e-05, + "loss": 5.3804, + "step": 11042 + }, + { + "epoch": 0.06567584927205253, + "grad_norm": 1.2752004861831665, + "learning_rate": 4.946984661261277e-05, + "loss": 5.3806, + "step": 11043 + }, + { + "epoch": 0.06568179655533353, + "grad_norm": 2.0671582221984863, + "learning_rate": 4.946975092429149e-05, + "loss": 5.3047, + "step": 11044 + }, + { + "epoch": 0.06568774383861452, + "grad_norm": 1.6126081943511963, + "learning_rate": 4.946965522742808e-05, + "loss": 5.1905, + "step": 11045 + }, + { + "epoch": 0.06569369112189552, + "grad_norm": 1.6867598295211792, + "learning_rate": 4.946955952202257e-05, + "loss": 5.1543, + "step": 11046 + }, + { + "epoch": 0.06569963840517652, + "grad_norm": 1.3493974208831787, + "learning_rate": 4.946946380807498e-05, + "loss": 5.1527, + "step": 11047 + }, + { + "epoch": 0.06570558568845751, + "grad_norm": 1.4694898128509521, + "learning_rate": 4.946936808558536e-05, + "loss": 5.238, + "step": 11048 + }, + { + "epoch": 0.06571153297173851, + "grad_norm": 1.7940189838409424, + "learning_rate": 4.946927235455373e-05, + "loss": 5.0666, + "step": 11049 + }, + { + "epoch": 0.0657174802550195, + "grad_norm": 1.7015198469161987, + "learning_rate": 4.946917661498013e-05, + "loss": 5.5182, + "step": 11050 + }, + { + "epoch": 0.0657234275383005, + "grad_norm": 2.214686632156372, + "learning_rate": 4.946908086686459e-05, + "loss": 5.9424, + "step": 11051 + }, + { + "epoch": 0.0657293748215815, + "grad_norm": 1.7855008840560913, + "learning_rate": 4.9468985110207154e-05, + "loss": 5.8496, + "step": 11052 + }, + { + "epoch": 0.06573532210486249, + "grad_norm": 1.8354082107543945, + "learning_rate": 4.946888934500785e-05, + "loss": 5.8044, + "step": 11053 + }, + { + "epoch": 0.0657412693881435, + "grad_norm": 2.0321154594421387, + "learning_rate": 4.9468793571266705e-05, + "loss": 5.9488, + "step": 11054 + }, + { + "epoch": 0.0657472166714245, + "grad_norm": 2.2285213470458984, + "learning_rate": 4.946869778898376e-05, + "loss": 5.1819, + "step": 11055 + }, + { + "epoch": 0.06575316395470548, + "grad_norm": 1.9831287860870361, + "learning_rate": 4.946860199815904e-05, + "loss": 5.2068, + "step": 11056 + }, + { + "epoch": 0.06575911123798649, + "grad_norm": 2.1150667667388916, + "learning_rate": 4.946850619879259e-05, + "loss": 5.1523, + "step": 11057 + }, + { + "epoch": 0.06576505852126749, + "grad_norm": 1.9136968851089478, + "learning_rate": 4.946841039088444e-05, + "loss": 5.0084, + "step": 11058 + }, + { + "epoch": 0.06577100580454848, + "grad_norm": 1.9802511930465698, + "learning_rate": 4.9468314574434604e-05, + "loss": 4.9223, + "step": 11059 + }, + { + "epoch": 0.06577695308782948, + "grad_norm": 1.940656065940857, + "learning_rate": 4.946821874944315e-05, + "loss": 4.9662, + "step": 11060 + }, + { + "epoch": 0.06578290037111048, + "grad_norm": 1.8476706743240356, + "learning_rate": 4.9468122915910084e-05, + "loss": 4.8863, + "step": 11061 + }, + { + "epoch": 0.06578884765439147, + "grad_norm": 2.0490243434906006, + "learning_rate": 4.946802707383546e-05, + "loss": 4.8459, + "step": 11062 + }, + { + "epoch": 0.06579479493767247, + "grad_norm": 1.8996137380599976, + "learning_rate": 4.946793122321928e-05, + "loss": 4.7574, + "step": 11063 + }, + { + "epoch": 0.06580074222095347, + "grad_norm": 1.8910033702850342, + "learning_rate": 4.946783536406161e-05, + "loss": 4.8808, + "step": 11064 + }, + { + "epoch": 0.06580668950423446, + "grad_norm": 2.123816967010498, + "learning_rate": 4.946773949636247e-05, + "loss": 4.8486, + "step": 11065 + }, + { + "epoch": 0.06581263678751546, + "grad_norm": 1.7508260011672974, + "learning_rate": 4.9467643620121906e-05, + "loss": 4.9856, + "step": 11066 + }, + { + "epoch": 0.06581858407079647, + "grad_norm": 1.728398084640503, + "learning_rate": 4.9467547735339926e-05, + "loss": 4.9634, + "step": 11067 + }, + { + "epoch": 0.06582453135407745, + "grad_norm": 2.1020689010620117, + "learning_rate": 4.946745184201659e-05, + "loss": 4.6133, + "step": 11068 + }, + { + "epoch": 0.06583047863735846, + "grad_norm": 2.106549024581909, + "learning_rate": 4.9467355940151904e-05, + "loss": 4.7124, + "step": 11069 + }, + { + "epoch": 0.06583642592063946, + "grad_norm": 2.078505039215088, + "learning_rate": 4.9467260029745924e-05, + "loss": 4.5828, + "step": 11070 + }, + { + "epoch": 0.06584237320392045, + "grad_norm": 1.987950325012207, + "learning_rate": 4.946716411079868e-05, + "loss": 4.5823, + "step": 11071 + }, + { + "epoch": 0.06584832048720145, + "grad_norm": 1.9027208089828491, + "learning_rate": 4.94670681833102e-05, + "loss": 4.8063, + "step": 11072 + }, + { + "epoch": 0.06585426777048245, + "grad_norm": 2.001823902130127, + "learning_rate": 4.946697224728052e-05, + "loss": 4.5405, + "step": 11073 + }, + { + "epoch": 0.06586021505376344, + "grad_norm": 2.1472394466400146, + "learning_rate": 4.946687630270967e-05, + "loss": 4.6565, + "step": 11074 + }, + { + "epoch": 0.06586616233704444, + "grad_norm": 2.0731146335601807, + "learning_rate": 4.946678034959769e-05, + "loss": 4.5022, + "step": 11075 + }, + { + "epoch": 0.06587210962032544, + "grad_norm": 2.0769810676574707, + "learning_rate": 4.946668438794461e-05, + "loss": 4.5248, + "step": 11076 + }, + { + "epoch": 0.06587805690360643, + "grad_norm": 2.183871269226074, + "learning_rate": 4.946658841775046e-05, + "loss": 4.5723, + "step": 11077 + }, + { + "epoch": 0.06588400418688743, + "grad_norm": 2.0304160118103027, + "learning_rate": 4.9466492439015275e-05, + "loss": 4.5928, + "step": 11078 + }, + { + "epoch": 0.06588995147016842, + "grad_norm": 1.9167170524597168, + "learning_rate": 4.94663964517391e-05, + "loss": 4.4162, + "step": 11079 + }, + { + "epoch": 0.06589589875344942, + "grad_norm": 2.1295299530029297, + "learning_rate": 4.9466300455921946e-05, + "loss": 4.6662, + "step": 11080 + }, + { + "epoch": 0.06590184603673042, + "grad_norm": 2.180253744125366, + "learning_rate": 4.946620445156386e-05, + "loss": 4.5101, + "step": 11081 + }, + { + "epoch": 0.06590779332001141, + "grad_norm": 1.887289047241211, + "learning_rate": 4.9466108438664885e-05, + "loss": 4.3611, + "step": 11082 + }, + { + "epoch": 0.06591374060329241, + "grad_norm": 1.8323948383331299, + "learning_rate": 4.946601241722504e-05, + "loss": 4.8711, + "step": 11083 + }, + { + "epoch": 0.06591968788657342, + "grad_norm": 1.944860577583313, + "learning_rate": 4.946591638724436e-05, + "loss": 4.5288, + "step": 11084 + }, + { + "epoch": 0.0659256351698544, + "grad_norm": 1.9748528003692627, + "learning_rate": 4.946582034872288e-05, + "loss": 4.3819, + "step": 11085 + }, + { + "epoch": 0.0659315824531354, + "grad_norm": 2.017582416534424, + "learning_rate": 4.9465724301660635e-05, + "loss": 4.4508, + "step": 11086 + }, + { + "epoch": 0.06593752973641641, + "grad_norm": 1.8043986558914185, + "learning_rate": 4.946562824605766e-05, + "loss": 4.5948, + "step": 11087 + }, + { + "epoch": 0.0659434770196974, + "grad_norm": 1.8695666790008545, + "learning_rate": 4.946553218191399e-05, + "loss": 4.2691, + "step": 11088 + }, + { + "epoch": 0.0659494243029784, + "grad_norm": 2.027717351913452, + "learning_rate": 4.9465436109229656e-05, + "loss": 4.4152, + "step": 11089 + }, + { + "epoch": 0.0659553715862594, + "grad_norm": 1.989127278327942, + "learning_rate": 4.946534002800469e-05, + "loss": 4.5155, + "step": 11090 + }, + { + "epoch": 0.06596131886954039, + "grad_norm": 1.9889907836914062, + "learning_rate": 4.9465243938239124e-05, + "loss": 4.4047, + "step": 11091 + }, + { + "epoch": 0.06596726615282139, + "grad_norm": 2.077021837234497, + "learning_rate": 4.946514783993299e-05, + "loss": 4.5199, + "step": 11092 + }, + { + "epoch": 0.0659732134361024, + "grad_norm": 1.9180271625518799, + "learning_rate": 4.946505173308633e-05, + "loss": 4.4511, + "step": 11093 + }, + { + "epoch": 0.06597916071938338, + "grad_norm": 2.120338201522827, + "learning_rate": 4.946495561769918e-05, + "loss": 4.3034, + "step": 11094 + }, + { + "epoch": 0.06598510800266438, + "grad_norm": 1.9632322788238525, + "learning_rate": 4.946485949377156e-05, + "loss": 5.2411, + "step": 11095 + }, + { + "epoch": 0.06599105528594539, + "grad_norm": 2.0921249389648438, + "learning_rate": 4.946476336130351e-05, + "loss": 4.5768, + "step": 11096 + }, + { + "epoch": 0.06599700256922637, + "grad_norm": 2.1472532749176025, + "learning_rate": 4.9464667220295066e-05, + "loss": 4.6279, + "step": 11097 + }, + { + "epoch": 0.06600294985250738, + "grad_norm": 2.472062349319458, + "learning_rate": 4.946457107074626e-05, + "loss": 5.703, + "step": 11098 + }, + { + "epoch": 0.06600889713578838, + "grad_norm": 1.8995217084884644, + "learning_rate": 4.946447491265712e-05, + "loss": 4.5265, + "step": 11099 + }, + { + "epoch": 0.06601484441906937, + "grad_norm": 2.173339605331421, + "learning_rate": 4.946437874602769e-05, + "loss": 4.5356, + "step": 11100 + }, + { + "epoch": 0.06602079170235037, + "grad_norm": 1.8179867267608643, + "learning_rate": 4.9464282570858e-05, + "loss": 4.3765, + "step": 11101 + }, + { + "epoch": 0.06602673898563137, + "grad_norm": 2.367713212966919, + "learning_rate": 4.946418638714808e-05, + "loss": 5.6831, + "step": 11102 + }, + { + "epoch": 0.06603268626891236, + "grad_norm": 2.3576571941375732, + "learning_rate": 4.9464090194897964e-05, + "loss": 5.563, + "step": 11103 + }, + { + "epoch": 0.06603863355219336, + "grad_norm": 2.0476090908050537, + "learning_rate": 4.946399399410768e-05, + "loss": 5.7503, + "step": 11104 + }, + { + "epoch": 0.06604458083547436, + "grad_norm": 2.104295253753662, + "learning_rate": 4.946389778477728e-05, + "loss": 5.669, + "step": 11105 + }, + { + "epoch": 0.06605052811875535, + "grad_norm": 2.1458580493927, + "learning_rate": 4.946380156690677e-05, + "loss": 5.5317, + "step": 11106 + }, + { + "epoch": 0.06605647540203635, + "grad_norm": 2.0373425483703613, + "learning_rate": 4.946370534049621e-05, + "loss": 5.5952, + "step": 11107 + }, + { + "epoch": 0.06606242268531734, + "grad_norm": 2.232574701309204, + "learning_rate": 4.946360910554563e-05, + "loss": 5.6076, + "step": 11108 + }, + { + "epoch": 0.06606836996859834, + "grad_norm": 2.1477861404418945, + "learning_rate": 4.946351286205505e-05, + "loss": 5.5862, + "step": 11109 + }, + { + "epoch": 0.06607431725187934, + "grad_norm": 2.105203866958618, + "learning_rate": 4.946341661002451e-05, + "loss": 5.5089, + "step": 11110 + }, + { + "epoch": 0.06608026453516033, + "grad_norm": 2.1524410247802734, + "learning_rate": 4.9463320349454047e-05, + "loss": 5.419, + "step": 11111 + }, + { + "epoch": 0.06608621181844133, + "grad_norm": 2.132504463195801, + "learning_rate": 4.946322408034369e-05, + "loss": 5.3421, + "step": 11112 + }, + { + "epoch": 0.06609215910172234, + "grad_norm": 1.7870386838912964, + "learning_rate": 4.9463127802693474e-05, + "loss": 5.1829, + "step": 11113 + }, + { + "epoch": 0.06609810638500332, + "grad_norm": 1.9586358070373535, + "learning_rate": 4.946303151650343e-05, + "loss": 5.228, + "step": 11114 + }, + { + "epoch": 0.06610405366828433, + "grad_norm": 2.092473030090332, + "learning_rate": 4.9462935221773594e-05, + "loss": 5.4616, + "step": 11115 + }, + { + "epoch": 0.06611000095156533, + "grad_norm": 2.204131603240967, + "learning_rate": 4.946283891850401e-05, + "loss": 5.4552, + "step": 11116 + }, + { + "epoch": 0.06611594823484632, + "grad_norm": 1.998795747756958, + "learning_rate": 4.946274260669469e-05, + "loss": 5.5193, + "step": 11117 + }, + { + "epoch": 0.06612189551812732, + "grad_norm": 1.9446638822555542, + "learning_rate": 4.9462646286345684e-05, + "loss": 5.3923, + "step": 11118 + }, + { + "epoch": 0.06612784280140832, + "grad_norm": 1.828114628791809, + "learning_rate": 4.946254995745702e-05, + "loss": 5.4306, + "step": 11119 + }, + { + "epoch": 0.06613379008468931, + "grad_norm": 2.1322944164276123, + "learning_rate": 4.946245362002873e-05, + "loss": 5.3831, + "step": 11120 + }, + { + "epoch": 0.06613973736797031, + "grad_norm": 2.1194324493408203, + "learning_rate": 4.9462357274060856e-05, + "loss": 5.2805, + "step": 11121 + }, + { + "epoch": 0.06614568465125131, + "grad_norm": 2.011417865753174, + "learning_rate": 4.946226091955342e-05, + "loss": 5.3052, + "step": 11122 + }, + { + "epoch": 0.0661516319345323, + "grad_norm": 2.202887773513794, + "learning_rate": 4.9462164556506464e-05, + "loss": 5.5263, + "step": 11123 + }, + { + "epoch": 0.0661575792178133, + "grad_norm": 2.075645685195923, + "learning_rate": 4.946206818492002e-05, + "loss": 5.1033, + "step": 11124 + }, + { + "epoch": 0.0661635265010943, + "grad_norm": 2.0723443031311035, + "learning_rate": 4.946197180479412e-05, + "loss": 4.8365, + "step": 11125 + }, + { + "epoch": 0.0661694737843753, + "grad_norm": 2.245961904525757, + "learning_rate": 4.94618754161288e-05, + "loss": 5.0123, + "step": 11126 + }, + { + "epoch": 0.0661754210676563, + "grad_norm": 2.0513699054718018, + "learning_rate": 4.9461779018924096e-05, + "loss": 4.9909, + "step": 11127 + }, + { + "epoch": 0.0661813683509373, + "grad_norm": 2.1552181243896484, + "learning_rate": 4.9461682613180024e-05, + "loss": 5.165, + "step": 11128 + }, + { + "epoch": 0.06618731563421829, + "grad_norm": 2.1207263469696045, + "learning_rate": 4.946158619889664e-05, + "loss": 5.3254, + "step": 11129 + }, + { + "epoch": 0.06619326291749929, + "grad_norm": 1.8278319835662842, + "learning_rate": 4.946148977607397e-05, + "loss": 5.2462, + "step": 11130 + }, + { + "epoch": 0.06619921020078029, + "grad_norm": 2.434661865234375, + "learning_rate": 4.9461393344712046e-05, + "loss": 5.28, + "step": 11131 + }, + { + "epoch": 0.06620515748406128, + "grad_norm": 2.3434953689575195, + "learning_rate": 4.9461296904810904e-05, + "loss": 5.112, + "step": 11132 + }, + { + "epoch": 0.06621110476734228, + "grad_norm": 2.010430335998535, + "learning_rate": 4.946120045637057e-05, + "loss": 5.1236, + "step": 11133 + }, + { + "epoch": 0.06621705205062328, + "grad_norm": 2.19608736038208, + "learning_rate": 4.946110399939109e-05, + "loss": 5.122, + "step": 11134 + }, + { + "epoch": 0.06622299933390427, + "grad_norm": 1.9471449851989746, + "learning_rate": 4.946100753387249e-05, + "loss": 5.2849, + "step": 11135 + }, + { + "epoch": 0.06622894661718527, + "grad_norm": 2.0541727542877197, + "learning_rate": 4.94609110598148e-05, + "loss": 5.4196, + "step": 11136 + }, + { + "epoch": 0.06623489390046626, + "grad_norm": 2.268826723098755, + "learning_rate": 4.946081457721806e-05, + "loss": 5.449, + "step": 11137 + }, + { + "epoch": 0.06624084118374726, + "grad_norm": 2.075227975845337, + "learning_rate": 4.9460718086082307e-05, + "loss": 5.5463, + "step": 11138 + }, + { + "epoch": 0.06624678846702826, + "grad_norm": 2.0949649810791016, + "learning_rate": 4.9460621586407567e-05, + "loss": 5.3737, + "step": 11139 + }, + { + "epoch": 0.06625273575030925, + "grad_norm": 2.1247878074645996, + "learning_rate": 4.9460525078193874e-05, + "loss": 5.2766, + "step": 11140 + }, + { + "epoch": 0.06625868303359025, + "grad_norm": 1.8304489850997925, + "learning_rate": 4.9460428561441276e-05, + "loss": 5.181, + "step": 11141 + }, + { + "epoch": 0.06626463031687126, + "grad_norm": 2.160853862762451, + "learning_rate": 4.946033203614978e-05, + "loss": 5.5222, + "step": 11142 + }, + { + "epoch": 0.06627057760015224, + "grad_norm": 1.9857962131500244, + "learning_rate": 4.9460235502319446e-05, + "loss": 5.574, + "step": 11143 + }, + { + "epoch": 0.06627652488343325, + "grad_norm": 2.016709804534912, + "learning_rate": 4.9460138959950294e-05, + "loss": 5.5255, + "step": 11144 + }, + { + "epoch": 0.06628247216671425, + "grad_norm": 1.8675861358642578, + "learning_rate": 4.946004240904235e-05, + "loss": 5.3604, + "step": 11145 + }, + { + "epoch": 0.06628841944999524, + "grad_norm": 1.9159897565841675, + "learning_rate": 4.945994584959567e-05, + "loss": 5.5348, + "step": 11146 + }, + { + "epoch": 0.06629436673327624, + "grad_norm": 2.0460150241851807, + "learning_rate": 4.945984928161027e-05, + "loss": 5.3267, + "step": 11147 + }, + { + "epoch": 0.06630031401655724, + "grad_norm": 1.8361427783966064, + "learning_rate": 4.9459752705086196e-05, + "loss": 5.3309, + "step": 11148 + }, + { + "epoch": 0.06630626129983823, + "grad_norm": 1.5448495149612427, + "learning_rate": 4.945965612002347e-05, + "loss": 5.0789, + "step": 11149 + }, + { + "epoch": 0.06631220858311923, + "grad_norm": 1.4580925703048706, + "learning_rate": 4.9459559526422125e-05, + "loss": 5.2011, + "step": 11150 + }, + { + "epoch": 0.06631815586640023, + "grad_norm": 1.606593370437622, + "learning_rate": 4.945946292428221e-05, + "loss": 5.2061, + "step": 11151 + }, + { + "epoch": 0.06632410314968122, + "grad_norm": 1.4270994663238525, + "learning_rate": 4.945936631360375e-05, + "loss": 5.089, + "step": 11152 + }, + { + "epoch": 0.06633005043296222, + "grad_norm": 1.6082873344421387, + "learning_rate": 4.9459269694386766e-05, + "loss": 5.2502, + "step": 11153 + }, + { + "epoch": 0.06633599771624323, + "grad_norm": 1.5378412008285522, + "learning_rate": 4.945917306663131e-05, + "loss": 5.4431, + "step": 11154 + }, + { + "epoch": 0.06634194499952421, + "grad_norm": 1.2726879119873047, + "learning_rate": 4.9459076430337416e-05, + "loss": 5.4568, + "step": 11155 + }, + { + "epoch": 0.06634789228280522, + "grad_norm": 1.6131432056427002, + "learning_rate": 4.94589797855051e-05, + "loss": 5.2507, + "step": 11156 + }, + { + "epoch": 0.06635383956608622, + "grad_norm": 1.5835362672805786, + "learning_rate": 4.945888313213442e-05, + "loss": 5.1122, + "step": 11157 + }, + { + "epoch": 0.0663597868493672, + "grad_norm": 1.5903444290161133, + "learning_rate": 4.945878647022539e-05, + "loss": 5.3236, + "step": 11158 + }, + { + "epoch": 0.06636573413264821, + "grad_norm": 1.7948551177978516, + "learning_rate": 4.945868979977805e-05, + "loss": 5.5939, + "step": 11159 + }, + { + "epoch": 0.06637168141592921, + "grad_norm": 2.1183457374572754, + "learning_rate": 4.945859312079243e-05, + "loss": 5.3639, + "step": 11160 + }, + { + "epoch": 0.0663776286992102, + "grad_norm": 1.5584137439727783, + "learning_rate": 4.945849643326857e-05, + "loss": 5.4302, + "step": 11161 + }, + { + "epoch": 0.0663835759824912, + "grad_norm": 1.5150829553604126, + "learning_rate": 4.9458399737206504e-05, + "loss": 5.2485, + "step": 11162 + }, + { + "epoch": 0.0663895232657722, + "grad_norm": 1.421235203742981, + "learning_rate": 4.9458303032606264e-05, + "loss": 5.2149, + "step": 11163 + }, + { + "epoch": 0.06639547054905319, + "grad_norm": 1.640207052230835, + "learning_rate": 4.945820631946788e-05, + "loss": 5.2807, + "step": 11164 + }, + { + "epoch": 0.06640141783233419, + "grad_norm": 1.5021215677261353, + "learning_rate": 4.945810959779139e-05, + "loss": 5.3684, + "step": 11165 + }, + { + "epoch": 0.06640736511561518, + "grad_norm": 1.802828073501587, + "learning_rate": 4.945801286757682e-05, + "loss": 5.2153, + "step": 11166 + }, + { + "epoch": 0.06641331239889618, + "grad_norm": 1.556386947631836, + "learning_rate": 4.945791612882422e-05, + "loss": 5.1908, + "step": 11167 + }, + { + "epoch": 0.06641925968217718, + "grad_norm": 1.5906118154525757, + "learning_rate": 4.9457819381533616e-05, + "loss": 5.2183, + "step": 11168 + }, + { + "epoch": 0.06642520696545817, + "grad_norm": 1.5778700113296509, + "learning_rate": 4.945772262570503e-05, + "loss": 5.2465, + "step": 11169 + }, + { + "epoch": 0.06643115424873917, + "grad_norm": 1.4705984592437744, + "learning_rate": 4.945762586133852e-05, + "loss": 5.1496, + "step": 11170 + }, + { + "epoch": 0.06643710153202018, + "grad_norm": 1.5118781328201294, + "learning_rate": 4.9457529088434093e-05, + "loss": 5.1764, + "step": 11171 + }, + { + "epoch": 0.06644304881530116, + "grad_norm": 1.5784192085266113, + "learning_rate": 4.94574323069918e-05, + "loss": 5.165, + "step": 11172 + }, + { + "epoch": 0.06644899609858217, + "grad_norm": 1.517220139503479, + "learning_rate": 4.9457335517011666e-05, + "loss": 5.1718, + "step": 11173 + }, + { + "epoch": 0.06645494338186317, + "grad_norm": 1.3823192119598389, + "learning_rate": 4.9457238718493734e-05, + "loss": 5.1945, + "step": 11174 + }, + { + "epoch": 0.06646089066514416, + "grad_norm": 1.4499212503433228, + "learning_rate": 4.945714191143803e-05, + "loss": 5.1044, + "step": 11175 + }, + { + "epoch": 0.06646683794842516, + "grad_norm": 1.4904807806015015, + "learning_rate": 4.945704509584459e-05, + "loss": 5.1781, + "step": 11176 + }, + { + "epoch": 0.06647278523170616, + "grad_norm": 1.6798325777053833, + "learning_rate": 4.945694827171345e-05, + "loss": 4.8879, + "step": 11177 + }, + { + "epoch": 0.06647873251498715, + "grad_norm": 1.3890799283981323, + "learning_rate": 4.945685143904464e-05, + "loss": 4.9941, + "step": 11178 + }, + { + "epoch": 0.06648467979826815, + "grad_norm": 1.4167201519012451, + "learning_rate": 4.94567545978382e-05, + "loss": 5.016, + "step": 11179 + }, + { + "epoch": 0.06649062708154915, + "grad_norm": 1.5122467279434204, + "learning_rate": 4.9456657748094145e-05, + "loss": 4.9937, + "step": 11180 + }, + { + "epoch": 0.06649657436483014, + "grad_norm": 1.4347165822982788, + "learning_rate": 4.9456560889812543e-05, + "loss": 5.0486, + "step": 11181 + }, + { + "epoch": 0.06650252164811114, + "grad_norm": 1.6328964233398438, + "learning_rate": 4.94564640229934e-05, + "loss": 5.1891, + "step": 11182 + }, + { + "epoch": 0.06650846893139215, + "grad_norm": 1.5832617282867432, + "learning_rate": 4.9456367147636765e-05, + "loss": 5.2947, + "step": 11183 + }, + { + "epoch": 0.06651441621467313, + "grad_norm": 1.6932839155197144, + "learning_rate": 4.9456270263742655e-05, + "loss": 5.0755, + "step": 11184 + }, + { + "epoch": 0.06652036349795414, + "grad_norm": 1.6238216161727905, + "learning_rate": 4.945617337131111e-05, + "loss": 5.1903, + "step": 11185 + }, + { + "epoch": 0.06652631078123514, + "grad_norm": 2.362353801727295, + "learning_rate": 4.945607647034218e-05, + "loss": 5.3641, + "step": 11186 + }, + { + "epoch": 0.06653225806451613, + "grad_norm": 1.6447978019714355, + "learning_rate": 4.9455979560835874e-05, + "loss": 5.0174, + "step": 11187 + }, + { + "epoch": 0.06653820534779713, + "grad_norm": 1.6059958934783936, + "learning_rate": 4.945588264279225e-05, + "loss": 4.884, + "step": 11188 + }, + { + "epoch": 0.06654415263107813, + "grad_norm": 1.6291608810424805, + "learning_rate": 4.9455785716211325e-05, + "loss": 4.9735, + "step": 11189 + }, + { + "epoch": 0.06655009991435912, + "grad_norm": 1.6926389932632446, + "learning_rate": 4.9455688781093135e-05, + "loss": 4.9294, + "step": 11190 + }, + { + "epoch": 0.06655604719764012, + "grad_norm": 1.5816938877105713, + "learning_rate": 4.945559183743772e-05, + "loss": 4.9161, + "step": 11191 + }, + { + "epoch": 0.06656199448092112, + "grad_norm": 1.5514836311340332, + "learning_rate": 4.9455494885245115e-05, + "loss": 4.9102, + "step": 11192 + }, + { + "epoch": 0.06656794176420211, + "grad_norm": 1.6787114143371582, + "learning_rate": 4.9455397924515346e-05, + "loss": 4.9628, + "step": 11193 + }, + { + "epoch": 0.06657388904748311, + "grad_norm": 1.5264941453933716, + "learning_rate": 4.945530095524844e-05, + "loss": 5.1685, + "step": 11194 + }, + { + "epoch": 0.06657983633076411, + "grad_norm": 1.80072820186615, + "learning_rate": 4.945520397744445e-05, + "loss": 4.8308, + "step": 11195 + }, + { + "epoch": 0.0665857836140451, + "grad_norm": 1.7497553825378418, + "learning_rate": 4.945510699110341e-05, + "loss": 4.8846, + "step": 11196 + }, + { + "epoch": 0.0665917308973261, + "grad_norm": 1.8938134908676147, + "learning_rate": 4.945500999622533e-05, + "loss": 4.8303, + "step": 11197 + }, + { + "epoch": 0.06659767818060709, + "grad_norm": 1.7286055088043213, + "learning_rate": 4.9454912992810264e-05, + "loss": 4.7686, + "step": 11198 + }, + { + "epoch": 0.0666036254638881, + "grad_norm": 1.7573840618133545, + "learning_rate": 4.945481598085824e-05, + "loss": 4.7527, + "step": 11199 + }, + { + "epoch": 0.0666095727471691, + "grad_norm": 1.9013001918792725, + "learning_rate": 4.94547189603693e-05, + "loss": 5.0987, + "step": 11200 + }, + { + "epoch": 0.06661552003045008, + "grad_norm": 1.5453308820724487, + "learning_rate": 4.945462193134346e-05, + "loss": 5.3799, + "step": 11201 + }, + { + "epoch": 0.06662146731373109, + "grad_norm": 1.763839602470398, + "learning_rate": 4.945452489378076e-05, + "loss": 5.2904, + "step": 11202 + }, + { + "epoch": 0.06662741459701209, + "grad_norm": 1.650407075881958, + "learning_rate": 4.945442784768125e-05, + "loss": 5.3007, + "step": 11203 + }, + { + "epoch": 0.06663336188029308, + "grad_norm": 1.6620690822601318, + "learning_rate": 4.945433079304495e-05, + "loss": 5.394, + "step": 11204 + }, + { + "epoch": 0.06663930916357408, + "grad_norm": 1.5000416040420532, + "learning_rate": 4.945423372987189e-05, + "loss": 5.0648, + "step": 11205 + }, + { + "epoch": 0.06664525644685508, + "grad_norm": 2.1791460514068604, + "learning_rate": 4.945413665816211e-05, + "loss": 5.5261, + "step": 11206 + }, + { + "epoch": 0.06665120373013607, + "grad_norm": 2.084258556365967, + "learning_rate": 4.945403957791565e-05, + "loss": 5.5796, + "step": 11207 + }, + { + "epoch": 0.06665715101341707, + "grad_norm": 1.9391356706619263, + "learning_rate": 4.945394248913253e-05, + "loss": 5.4855, + "step": 11208 + }, + { + "epoch": 0.06666309829669807, + "grad_norm": 1.8323030471801758, + "learning_rate": 4.9453845391812803e-05, + "loss": 5.5711, + "step": 11209 + }, + { + "epoch": 0.06666904557997906, + "grad_norm": 1.9193792343139648, + "learning_rate": 4.945374828595648e-05, + "loss": 5.2585, + "step": 11210 + }, + { + "epoch": 0.06667499286326006, + "grad_norm": 1.7111014127731323, + "learning_rate": 4.9453651171563606e-05, + "loss": 5.1965, + "step": 11211 + }, + { + "epoch": 0.06668094014654107, + "grad_norm": 1.8574761152267456, + "learning_rate": 4.9453554048634224e-05, + "loss": 5.2538, + "step": 11212 + }, + { + "epoch": 0.06668688742982205, + "grad_norm": 2.18009352684021, + "learning_rate": 4.945345691716835e-05, + "loss": 5.2486, + "step": 11213 + }, + { + "epoch": 0.06669283471310306, + "grad_norm": 2.167819023132324, + "learning_rate": 4.945335977716603e-05, + "loss": 5.1877, + "step": 11214 + }, + { + "epoch": 0.06669878199638406, + "grad_norm": 2.086603879928589, + "learning_rate": 4.9453262628627297e-05, + "loss": 5.32, + "step": 11215 + }, + { + "epoch": 0.06670472927966505, + "grad_norm": 2.239917039871216, + "learning_rate": 4.945316547155218e-05, + "loss": 5.5289, + "step": 11216 + }, + { + "epoch": 0.06671067656294605, + "grad_norm": 1.9402177333831787, + "learning_rate": 4.945306830594072e-05, + "loss": 5.5159, + "step": 11217 + }, + { + "epoch": 0.06671662384622705, + "grad_norm": 2.2730953693389893, + "learning_rate": 4.945297113179294e-05, + "loss": 5.5132, + "step": 11218 + }, + { + "epoch": 0.06672257112950804, + "grad_norm": 2.4021079540252686, + "learning_rate": 4.945287394910888e-05, + "loss": 5.7505, + "step": 11219 + }, + { + "epoch": 0.06672851841278904, + "grad_norm": 1.8272559642791748, + "learning_rate": 4.945277675788859e-05, + "loss": 5.7324, + "step": 11220 + }, + { + "epoch": 0.06673446569607004, + "grad_norm": 1.641192078590393, + "learning_rate": 4.945267955813206e-05, + "loss": 5.7665, + "step": 11221 + }, + { + "epoch": 0.06674041297935103, + "grad_norm": 2.1081202030181885, + "learning_rate": 4.945258234983938e-05, + "loss": 5.3633, + "step": 11222 + }, + { + "epoch": 0.06674636026263203, + "grad_norm": 1.7172397375106812, + "learning_rate": 4.945248513301054e-05, + "loss": 5.775, + "step": 11223 + }, + { + "epoch": 0.06675230754591303, + "grad_norm": 1.9968703985214233, + "learning_rate": 4.9452387907645594e-05, + "loss": 5.4817, + "step": 11224 + }, + { + "epoch": 0.06675825482919402, + "grad_norm": 1.9165494441986084, + "learning_rate": 4.9452290673744575e-05, + "loss": 5.6977, + "step": 11225 + }, + { + "epoch": 0.06676420211247502, + "grad_norm": 1.832783579826355, + "learning_rate": 4.945219343130751e-05, + "loss": 5.2065, + "step": 11226 + }, + { + "epoch": 0.06677014939575601, + "grad_norm": 2.073590040206909, + "learning_rate": 4.945209618033444e-05, + "loss": 5.0158, + "step": 11227 + }, + { + "epoch": 0.06677609667903701, + "grad_norm": 2.0305895805358887, + "learning_rate": 4.9451998920825395e-05, + "loss": 4.8452, + "step": 11228 + }, + { + "epoch": 0.06678204396231802, + "grad_norm": 1.8843696117401123, + "learning_rate": 4.945190165278041e-05, + "loss": 5.5082, + "step": 11229 + }, + { + "epoch": 0.066787991245599, + "grad_norm": 1.66866934299469, + "learning_rate": 4.945180437619951e-05, + "loss": 5.4151, + "step": 11230 + }, + { + "epoch": 0.06679393852888, + "grad_norm": 1.8018205165863037, + "learning_rate": 4.9451707091082746e-05, + "loss": 5.124, + "step": 11231 + }, + { + "epoch": 0.06679988581216101, + "grad_norm": 1.760339379310608, + "learning_rate": 4.9451609797430146e-05, + "loss": 4.9834, + "step": 11232 + }, + { + "epoch": 0.066805833095442, + "grad_norm": 1.609376072883606, + "learning_rate": 4.945151249524174e-05, + "loss": 5.0217, + "step": 11233 + }, + { + "epoch": 0.066811780378723, + "grad_norm": 1.5468369722366333, + "learning_rate": 4.9451415184517556e-05, + "loss": 5.1881, + "step": 11234 + }, + { + "epoch": 0.066817727662004, + "grad_norm": 1.2027482986450195, + "learning_rate": 4.945131786525764e-05, + "loss": 5.1014, + "step": 11235 + }, + { + "epoch": 0.06682367494528499, + "grad_norm": 1.6050941944122314, + "learning_rate": 4.945122053746203e-05, + "loss": 5.0314, + "step": 11236 + }, + { + "epoch": 0.06682962222856599, + "grad_norm": 1.4980865716934204, + "learning_rate": 4.9451123201130746e-05, + "loss": 4.9371, + "step": 11237 + }, + { + "epoch": 0.06683556951184699, + "grad_norm": 1.6754953861236572, + "learning_rate": 4.9451025856263824e-05, + "loss": 4.9733, + "step": 11238 + }, + { + "epoch": 0.06684151679512798, + "grad_norm": 1.5051567554473877, + "learning_rate": 4.9450928502861303e-05, + "loss": 4.8994, + "step": 11239 + }, + { + "epoch": 0.06684746407840898, + "grad_norm": 1.5211920738220215, + "learning_rate": 4.945083114092321e-05, + "loss": 4.8459, + "step": 11240 + }, + { + "epoch": 0.06685341136168998, + "grad_norm": 1.6717231273651123, + "learning_rate": 4.9450733770449596e-05, + "loss": 5.1029, + "step": 11241 + }, + { + "epoch": 0.06685935864497097, + "grad_norm": 1.4853429794311523, + "learning_rate": 4.945063639144048e-05, + "loss": 5.2199, + "step": 11242 + }, + { + "epoch": 0.06686530592825198, + "grad_norm": 1.6102755069732666, + "learning_rate": 4.9450539003895894e-05, + "loss": 5.1191, + "step": 11243 + }, + { + "epoch": 0.06687125321153298, + "grad_norm": 1.6091139316558838, + "learning_rate": 4.9450441607815876e-05, + "loss": 5.2492, + "step": 11244 + }, + { + "epoch": 0.06687720049481397, + "grad_norm": 1.5190162658691406, + "learning_rate": 4.945034420320047e-05, + "loss": 5.1763, + "step": 11245 + }, + { + "epoch": 0.06688314777809497, + "grad_norm": 1.636243462562561, + "learning_rate": 4.94502467900497e-05, + "loss": 5.4906, + "step": 11246 + }, + { + "epoch": 0.06688909506137597, + "grad_norm": 1.5214428901672363, + "learning_rate": 4.9450149368363594e-05, + "loss": 5.3554, + "step": 11247 + }, + { + "epoch": 0.06689504234465696, + "grad_norm": 1.696183681488037, + "learning_rate": 4.9450051938142205e-05, + "loss": 5.3185, + "step": 11248 + }, + { + "epoch": 0.06690098962793796, + "grad_norm": 1.5344911813735962, + "learning_rate": 4.944995449938555e-05, + "loss": 5.345, + "step": 11249 + }, + { + "epoch": 0.06690693691121896, + "grad_norm": 1.598035454750061, + "learning_rate": 4.944985705209366e-05, + "loss": 5.2271, + "step": 11250 + }, + { + "epoch": 0.06691288419449995, + "grad_norm": 1.501841425895691, + "learning_rate": 4.944975959626659e-05, + "loss": 5.1807, + "step": 11251 + }, + { + "epoch": 0.06691883147778095, + "grad_norm": 1.3818657398223877, + "learning_rate": 4.944966213190436e-05, + "loss": 5.2953, + "step": 11252 + }, + { + "epoch": 0.06692477876106195, + "grad_norm": 1.5480642318725586, + "learning_rate": 4.9449564659007e-05, + "loss": 5.3048, + "step": 11253 + }, + { + "epoch": 0.06693072604434294, + "grad_norm": 1.5553090572357178, + "learning_rate": 4.9449467177574546e-05, + "loss": 5.1365, + "step": 11254 + }, + { + "epoch": 0.06693667332762394, + "grad_norm": 1.581534743309021, + "learning_rate": 4.944936968760705e-05, + "loss": 5.1498, + "step": 11255 + }, + { + "epoch": 0.06694262061090493, + "grad_norm": 1.8294548988342285, + "learning_rate": 4.944927218910452e-05, + "loss": 5.1331, + "step": 11256 + }, + { + "epoch": 0.06694856789418593, + "grad_norm": 1.3404508829116821, + "learning_rate": 4.944917468206701e-05, + "loss": 5.5092, + "step": 11257 + }, + { + "epoch": 0.06695451517746694, + "grad_norm": 1.5146483182907104, + "learning_rate": 4.944907716649454e-05, + "loss": 5.2797, + "step": 11258 + }, + { + "epoch": 0.06696046246074792, + "grad_norm": 1.571393609046936, + "learning_rate": 4.944897964238715e-05, + "loss": 5.4528, + "step": 11259 + }, + { + "epoch": 0.06696640974402893, + "grad_norm": 1.640459656715393, + "learning_rate": 4.944888210974487e-05, + "loss": 5.1032, + "step": 11260 + }, + { + "epoch": 0.06697235702730993, + "grad_norm": 1.5397419929504395, + "learning_rate": 4.944878456856774e-05, + "loss": 5.2333, + "step": 11261 + }, + { + "epoch": 0.06697830431059092, + "grad_norm": 1.4423824548721313, + "learning_rate": 4.94486870188558e-05, + "loss": 5.1765, + "step": 11262 + }, + { + "epoch": 0.06698425159387192, + "grad_norm": 1.366347074508667, + "learning_rate": 4.9448589460609066e-05, + "loss": 5.2257, + "step": 11263 + }, + { + "epoch": 0.06699019887715292, + "grad_norm": 1.370089054107666, + "learning_rate": 4.944849189382759e-05, + "loss": 5.4681, + "step": 11264 + }, + { + "epoch": 0.06699614616043391, + "grad_norm": 1.3014042377471924, + "learning_rate": 4.9448394318511394e-05, + "loss": 5.3434, + "step": 11265 + }, + { + "epoch": 0.06700209344371491, + "grad_norm": 1.4719784259796143, + "learning_rate": 4.9448296734660516e-05, + "loss": 5.3064, + "step": 11266 + }, + { + "epoch": 0.06700804072699591, + "grad_norm": 1.6640921831130981, + "learning_rate": 4.944819914227499e-05, + "loss": 5.2896, + "step": 11267 + }, + { + "epoch": 0.0670139880102769, + "grad_norm": 1.4969593286514282, + "learning_rate": 4.9448101541354845e-05, + "loss": 5.1413, + "step": 11268 + }, + { + "epoch": 0.0670199352935579, + "grad_norm": 1.4021313190460205, + "learning_rate": 4.9448003931900126e-05, + "loss": 5.2609, + "step": 11269 + }, + { + "epoch": 0.0670258825768389, + "grad_norm": 1.6506398916244507, + "learning_rate": 4.9447906313910865e-05, + "loss": 5.3365, + "step": 11270 + }, + { + "epoch": 0.0670318298601199, + "grad_norm": 1.6469614505767822, + "learning_rate": 4.9447808687387084e-05, + "loss": 5.0384, + "step": 11271 + }, + { + "epoch": 0.0670377771434009, + "grad_norm": 1.5047974586486816, + "learning_rate": 4.944771105232883e-05, + "loss": 5.3565, + "step": 11272 + }, + { + "epoch": 0.0670437244266819, + "grad_norm": 1.4467194080352783, + "learning_rate": 4.9447613408736135e-05, + "loss": 5.5576, + "step": 11273 + }, + { + "epoch": 0.06704967170996289, + "grad_norm": 1.4636478424072266, + "learning_rate": 4.9447515756609034e-05, + "loss": 5.6407, + "step": 11274 + }, + { + "epoch": 0.06705561899324389, + "grad_norm": 1.373046875, + "learning_rate": 4.944741809594755e-05, + "loss": 5.4286, + "step": 11275 + }, + { + "epoch": 0.06706156627652489, + "grad_norm": 1.5114089250564575, + "learning_rate": 4.944732042675172e-05, + "loss": 5.6425, + "step": 11276 + }, + { + "epoch": 0.06706751355980588, + "grad_norm": 1.8263514041900635, + "learning_rate": 4.9447222749021596e-05, + "loss": 5.2469, + "step": 11277 + }, + { + "epoch": 0.06707346084308688, + "grad_norm": 1.780553936958313, + "learning_rate": 4.944712506275719e-05, + "loss": 5.3306, + "step": 11278 + }, + { + "epoch": 0.06707940812636788, + "grad_norm": 1.6208360195159912, + "learning_rate": 4.9447027367958556e-05, + "loss": 5.5365, + "step": 11279 + }, + { + "epoch": 0.06708535540964887, + "grad_norm": 1.336965560913086, + "learning_rate": 4.9446929664625705e-05, + "loss": 5.2694, + "step": 11280 + }, + { + "epoch": 0.06709130269292987, + "grad_norm": 1.6100155115127563, + "learning_rate": 4.9446831952758685e-05, + "loss": 5.5489, + "step": 11281 + }, + { + "epoch": 0.06709724997621087, + "grad_norm": 1.8020440340042114, + "learning_rate": 4.944673423235753e-05, + "loss": 5.3396, + "step": 11282 + }, + { + "epoch": 0.06710319725949186, + "grad_norm": 1.5315353870391846, + "learning_rate": 4.9446636503422276e-05, + "loss": 5.3687, + "step": 11283 + }, + { + "epoch": 0.06710914454277286, + "grad_norm": 2.2560019493103027, + "learning_rate": 4.9446538765952953e-05, + "loss": 5.4584, + "step": 11284 + }, + { + "epoch": 0.06711509182605385, + "grad_norm": 1.4653301239013672, + "learning_rate": 4.94464410199496e-05, + "loss": 5.3438, + "step": 11285 + }, + { + "epoch": 0.06712103910933485, + "grad_norm": 1.5931557416915894, + "learning_rate": 4.9446343265412243e-05, + "loss": 5.5802, + "step": 11286 + }, + { + "epoch": 0.06712698639261586, + "grad_norm": 1.5282461643218994, + "learning_rate": 4.944624550234092e-05, + "loss": 5.5634, + "step": 11287 + }, + { + "epoch": 0.06713293367589684, + "grad_norm": 1.7275618314743042, + "learning_rate": 4.944614773073566e-05, + "loss": 5.3797, + "step": 11288 + }, + { + "epoch": 0.06713888095917785, + "grad_norm": 1.6453620195388794, + "learning_rate": 4.944604995059651e-05, + "loss": 5.4693, + "step": 11289 + }, + { + "epoch": 0.06714482824245885, + "grad_norm": 1.870483636856079, + "learning_rate": 4.944595216192349e-05, + "loss": 5.4693, + "step": 11290 + }, + { + "epoch": 0.06715077552573984, + "grad_norm": 1.5478577613830566, + "learning_rate": 4.944585436471665e-05, + "loss": 5.694, + "step": 11291 + }, + { + "epoch": 0.06715672280902084, + "grad_norm": 1.9456945657730103, + "learning_rate": 4.944575655897601e-05, + "loss": 5.6687, + "step": 11292 + }, + { + "epoch": 0.06716267009230184, + "grad_norm": 1.808176875114441, + "learning_rate": 4.944565874470161e-05, + "loss": 5.7444, + "step": 11293 + }, + { + "epoch": 0.06716861737558283, + "grad_norm": 1.8066149950027466, + "learning_rate": 4.944556092189347e-05, + "loss": 5.5264, + "step": 11294 + }, + { + "epoch": 0.06717456465886383, + "grad_norm": 2.2896971702575684, + "learning_rate": 4.9445463090551656e-05, + "loss": 4.7624, + "step": 11295 + }, + { + "epoch": 0.06718051194214483, + "grad_norm": 1.7178759574890137, + "learning_rate": 4.9445365250676165e-05, + "loss": 5.79, + "step": 11296 + }, + { + "epoch": 0.06718645922542582, + "grad_norm": 1.8841933012008667, + "learning_rate": 4.944526740226707e-05, + "loss": 5.9792, + "step": 11297 + }, + { + "epoch": 0.06719240650870682, + "grad_norm": 1.8618090152740479, + "learning_rate": 4.944516954532437e-05, + "loss": 5.957, + "step": 11298 + }, + { + "epoch": 0.06719835379198782, + "grad_norm": 1.7545913457870483, + "learning_rate": 4.944507167984812e-05, + "loss": 5.4484, + "step": 11299 + }, + { + "epoch": 0.06720430107526881, + "grad_norm": 2.023158073425293, + "learning_rate": 4.9444973805838345e-05, + "loss": 5.0873, + "step": 11300 + }, + { + "epoch": 0.06721024835854982, + "grad_norm": 1.893340826034546, + "learning_rate": 4.944487592329509e-05, + "loss": 5.042, + "step": 11301 + }, + { + "epoch": 0.06721619564183082, + "grad_norm": 1.981518268585205, + "learning_rate": 4.944477803221837e-05, + "loss": 5.1463, + "step": 11302 + }, + { + "epoch": 0.0672221429251118, + "grad_norm": 2.47416090965271, + "learning_rate": 4.9444680132608236e-05, + "loss": 5.2885, + "step": 11303 + }, + { + "epoch": 0.06722809020839281, + "grad_norm": 2.3973519802093506, + "learning_rate": 4.944458222446472e-05, + "loss": 5.3321, + "step": 11304 + }, + { + "epoch": 0.06723403749167381, + "grad_norm": 1.9117941856384277, + "learning_rate": 4.9444484307787846e-05, + "loss": 5.2159, + "step": 11305 + }, + { + "epoch": 0.0672399847749548, + "grad_norm": 1.8732513189315796, + "learning_rate": 4.9444386382577656e-05, + "loss": 5.222, + "step": 11306 + }, + { + "epoch": 0.0672459320582358, + "grad_norm": 1.9202747344970703, + "learning_rate": 4.9444288448834184e-05, + "loss": 5.5766, + "step": 11307 + }, + { + "epoch": 0.0672518793415168, + "grad_norm": 1.8956191539764404, + "learning_rate": 4.944419050655747e-05, + "loss": 5.7129, + "step": 11308 + }, + { + "epoch": 0.06725782662479779, + "grad_norm": 2.7075235843658447, + "learning_rate": 4.9444092555747534e-05, + "loss": 5.2199, + "step": 11309 + }, + { + "epoch": 0.06726377390807879, + "grad_norm": 2.396125078201294, + "learning_rate": 4.944399459640442e-05, + "loss": 5.3548, + "step": 11310 + }, + { + "epoch": 0.0672697211913598, + "grad_norm": 2.6050171852111816, + "learning_rate": 4.9443896628528166e-05, + "loss": 5.616, + "step": 11311 + }, + { + "epoch": 0.06727566847464078, + "grad_norm": 2.512720823287964, + "learning_rate": 4.94437986521188e-05, + "loss": 5.3699, + "step": 11312 + }, + { + "epoch": 0.06728161575792178, + "grad_norm": 2.509716510772705, + "learning_rate": 4.9443700667176345e-05, + "loss": 5.431, + "step": 11313 + }, + { + "epoch": 0.06728756304120277, + "grad_norm": 2.2237601280212402, + "learning_rate": 4.944360267370085e-05, + "loss": 5.3985, + "step": 11314 + }, + { + "epoch": 0.06729351032448377, + "grad_norm": 1.982344627380371, + "learning_rate": 4.9443504671692356e-05, + "loss": 5.4849, + "step": 11315 + }, + { + "epoch": 0.06729945760776478, + "grad_norm": 2.1006124019622803, + "learning_rate": 4.9443406661150874e-05, + "loss": 5.227, + "step": 11316 + }, + { + "epoch": 0.06730540489104576, + "grad_norm": 2.0929529666900635, + "learning_rate": 4.9443308642076456e-05, + "loss": 5.524, + "step": 11317 + }, + { + "epoch": 0.06731135217432677, + "grad_norm": 1.9268262386322021, + "learning_rate": 4.944321061446914e-05, + "loss": 6.0622, + "step": 11318 + }, + { + "epoch": 0.06731729945760777, + "grad_norm": 2.257065773010254, + "learning_rate": 4.944311257832894e-05, + "loss": 4.9455, + "step": 11319 + }, + { + "epoch": 0.06732324674088876, + "grad_norm": 2.056244373321533, + "learning_rate": 4.944301453365591e-05, + "loss": 5.4157, + "step": 11320 + }, + { + "epoch": 0.06732919402416976, + "grad_norm": 2.1667540073394775, + "learning_rate": 4.944291648045007e-05, + "loss": 5.5767, + "step": 11321 + }, + { + "epoch": 0.06733514130745076, + "grad_norm": 1.9596853256225586, + "learning_rate": 4.944281841871146e-05, + "loss": 5.6532, + "step": 11322 + }, + { + "epoch": 0.06734108859073175, + "grad_norm": 1.7050867080688477, + "learning_rate": 4.9442720348440116e-05, + "loss": 5.8881, + "step": 11323 + }, + { + "epoch": 0.06734703587401275, + "grad_norm": 1.8681753873825073, + "learning_rate": 4.944262226963607e-05, + "loss": 5.9369, + "step": 11324 + }, + { + "epoch": 0.06735298315729375, + "grad_norm": 1.9432111978530884, + "learning_rate": 4.9442524182299365e-05, + "loss": 5.9163, + "step": 11325 + }, + { + "epoch": 0.06735893044057474, + "grad_norm": 1.8099175691604614, + "learning_rate": 4.9442426086430026e-05, + "loss": 5.809, + "step": 11326 + }, + { + "epoch": 0.06736487772385574, + "grad_norm": 1.6179800033569336, + "learning_rate": 4.944232798202808e-05, + "loss": 5.5609, + "step": 11327 + }, + { + "epoch": 0.06737082500713674, + "grad_norm": 2.303189992904663, + "learning_rate": 4.944222986909357e-05, + "loss": 5.9291, + "step": 11328 + }, + { + "epoch": 0.06737677229041773, + "grad_norm": 1.913813829421997, + "learning_rate": 4.944213174762654e-05, + "loss": 5.8672, + "step": 11329 + }, + { + "epoch": 0.06738271957369873, + "grad_norm": 2.1856813430786133, + "learning_rate": 4.944203361762701e-05, + "loss": 5.2632, + "step": 11330 + }, + { + "epoch": 0.06738866685697974, + "grad_norm": 2.019679069519043, + "learning_rate": 4.9441935479095016e-05, + "loss": 5.3707, + "step": 11331 + }, + { + "epoch": 0.06739461414026073, + "grad_norm": 1.8531097173690796, + "learning_rate": 4.944183733203059e-05, + "loss": 5.6689, + "step": 11332 + }, + { + "epoch": 0.06740056142354173, + "grad_norm": 2.068208694458008, + "learning_rate": 4.944173917643378e-05, + "loss": 5.6111, + "step": 11333 + }, + { + "epoch": 0.06740650870682273, + "grad_norm": 1.8021270036697388, + "learning_rate": 4.944164101230461e-05, + "loss": 6.0865, + "step": 11334 + }, + { + "epoch": 0.06741245599010372, + "grad_norm": 1.9051427841186523, + "learning_rate": 4.944154283964312e-05, + "loss": 5.5862, + "step": 11335 + }, + { + "epoch": 0.06741840327338472, + "grad_norm": 1.718483805656433, + "learning_rate": 4.944144465844933e-05, + "loss": 5.2505, + "step": 11336 + }, + { + "epoch": 0.06742435055666572, + "grad_norm": 2.205167531967163, + "learning_rate": 4.944134646872329e-05, + "loss": 5.3181, + "step": 11337 + }, + { + "epoch": 0.06743029783994671, + "grad_norm": 1.550945520401001, + "learning_rate": 4.944124827046502e-05, + "loss": 5.4129, + "step": 11338 + }, + { + "epoch": 0.06743624512322771, + "grad_norm": 2.08793044090271, + "learning_rate": 4.944115006367458e-05, + "loss": 5.9705, + "step": 11339 + }, + { + "epoch": 0.06744219240650871, + "grad_norm": 1.8955761194229126, + "learning_rate": 4.944105184835197e-05, + "loss": 4.9629, + "step": 11340 + }, + { + "epoch": 0.0674481396897897, + "grad_norm": 1.7287909984588623, + "learning_rate": 4.944095362449724e-05, + "loss": 5.1097, + "step": 11341 + }, + { + "epoch": 0.0674540869730707, + "grad_norm": 1.8718771934509277, + "learning_rate": 4.944085539211044e-05, + "loss": 5.6443, + "step": 11342 + }, + { + "epoch": 0.06746003425635169, + "grad_norm": 2.220863103866577, + "learning_rate": 4.9440757151191585e-05, + "loss": 5.5042, + "step": 11343 + }, + { + "epoch": 0.0674659815396327, + "grad_norm": 1.9501415491104126, + "learning_rate": 4.944065890174071e-05, + "loss": 5.6788, + "step": 11344 + }, + { + "epoch": 0.0674719288229137, + "grad_norm": 1.8566590547561646, + "learning_rate": 4.944056064375786e-05, + "loss": 5.6531, + "step": 11345 + }, + { + "epoch": 0.06747787610619468, + "grad_norm": 1.895409345626831, + "learning_rate": 4.9440462377243055e-05, + "loss": 5.6441, + "step": 11346 + }, + { + "epoch": 0.06748382338947569, + "grad_norm": 2.1746973991394043, + "learning_rate": 4.9440364102196345e-05, + "loss": 5.8624, + "step": 11347 + }, + { + "epoch": 0.06748977067275669, + "grad_norm": 1.9661751985549927, + "learning_rate": 4.944026581861775e-05, + "loss": 5.6075, + "step": 11348 + }, + { + "epoch": 0.06749571795603768, + "grad_norm": 1.8591458797454834, + "learning_rate": 4.944016752650731e-05, + "loss": 5.9115, + "step": 11349 + }, + { + "epoch": 0.06750166523931868, + "grad_norm": 1.6491025686264038, + "learning_rate": 4.9440069225865065e-05, + "loss": 6.0548, + "step": 11350 + }, + { + "epoch": 0.06750761252259968, + "grad_norm": 1.857928991317749, + "learning_rate": 4.9439970916691045e-05, + "loss": 5.4326, + "step": 11351 + }, + { + "epoch": 0.06751355980588067, + "grad_norm": 1.8189151287078857, + "learning_rate": 4.943987259898528e-05, + "loss": 5.7744, + "step": 11352 + }, + { + "epoch": 0.06751950708916167, + "grad_norm": 1.7486300468444824, + "learning_rate": 4.943977427274781e-05, + "loss": 5.7128, + "step": 11353 + }, + { + "epoch": 0.06752545437244267, + "grad_norm": 1.7272138595581055, + "learning_rate": 4.943967593797866e-05, + "loss": 5.9922, + "step": 11354 + }, + { + "epoch": 0.06753140165572366, + "grad_norm": 1.740860939025879, + "learning_rate": 4.9439577594677875e-05, + "loss": 5.8486, + "step": 11355 + }, + { + "epoch": 0.06753734893900466, + "grad_norm": 1.9054155349731445, + "learning_rate": 4.9439479242845494e-05, + "loss": 5.4694, + "step": 11356 + }, + { + "epoch": 0.06754329622228566, + "grad_norm": 1.9783501625061035, + "learning_rate": 4.943938088248154e-05, + "loss": 5.5185, + "step": 11357 + }, + { + "epoch": 0.06754924350556665, + "grad_norm": 1.8267238140106201, + "learning_rate": 4.943928251358605e-05, + "loss": 5.7589, + "step": 11358 + }, + { + "epoch": 0.06755519078884765, + "grad_norm": 1.6957738399505615, + "learning_rate": 4.943918413615906e-05, + "loss": 5.5716, + "step": 11359 + }, + { + "epoch": 0.06756113807212866, + "grad_norm": 2.0818982124328613, + "learning_rate": 4.94390857502006e-05, + "loss": 5.8969, + "step": 11360 + }, + { + "epoch": 0.06756708535540965, + "grad_norm": 1.8012073040008545, + "learning_rate": 4.9438987355710703e-05, + "loss": 6.1053, + "step": 11361 + }, + { + "epoch": 0.06757303263869065, + "grad_norm": 2.2209696769714355, + "learning_rate": 4.943888895268942e-05, + "loss": 5.9714, + "step": 11362 + }, + { + "epoch": 0.06757897992197165, + "grad_norm": 1.8006336688995361, + "learning_rate": 4.943879054113676e-05, + "loss": 5.6427, + "step": 11363 + }, + { + "epoch": 0.06758492720525264, + "grad_norm": 1.7628017663955688, + "learning_rate": 4.9438692121052775e-05, + "loss": 5.8639, + "step": 11364 + }, + { + "epoch": 0.06759087448853364, + "grad_norm": 1.8574492931365967, + "learning_rate": 4.94385936924375e-05, + "loss": 5.892, + "step": 11365 + }, + { + "epoch": 0.06759682177181464, + "grad_norm": 1.7926831245422363, + "learning_rate": 4.9438495255290964e-05, + "loss": 5.9024, + "step": 11366 + }, + { + "epoch": 0.06760276905509563, + "grad_norm": 2.503370761871338, + "learning_rate": 4.94383968096132e-05, + "loss": 5.994, + "step": 11367 + }, + { + "epoch": 0.06760871633837663, + "grad_norm": 1.7123390436172485, + "learning_rate": 4.943829835540424e-05, + "loss": 5.8052, + "step": 11368 + }, + { + "epoch": 0.06761466362165763, + "grad_norm": 2.0890092849731445, + "learning_rate": 4.943819989266413e-05, + "loss": 5.067, + "step": 11369 + }, + { + "epoch": 0.06762061090493862, + "grad_norm": 1.8000640869140625, + "learning_rate": 4.9438101421392894e-05, + "loss": 5.3562, + "step": 11370 + }, + { + "epoch": 0.06762655818821962, + "grad_norm": 2.254873514175415, + "learning_rate": 4.9438002941590564e-05, + "loss": 5.0557, + "step": 11371 + }, + { + "epoch": 0.06763250547150061, + "grad_norm": 1.8080449104309082, + "learning_rate": 4.943790445325719e-05, + "loss": 5.6702, + "step": 11372 + }, + { + "epoch": 0.06763845275478161, + "grad_norm": 2.0175933837890625, + "learning_rate": 4.943780595639279e-05, + "loss": 5.6227, + "step": 11373 + }, + { + "epoch": 0.06764440003806262, + "grad_norm": 1.9859650135040283, + "learning_rate": 4.943770745099741e-05, + "loss": 5.4437, + "step": 11374 + }, + { + "epoch": 0.0676503473213436, + "grad_norm": 1.975573182106018, + "learning_rate": 4.943760893707107e-05, + "loss": 5.3101, + "step": 11375 + }, + { + "epoch": 0.0676562946046246, + "grad_norm": 2.2590208053588867, + "learning_rate": 4.943751041461382e-05, + "loss": 5.2544, + "step": 11376 + }, + { + "epoch": 0.06766224188790561, + "grad_norm": 1.8615392446517944, + "learning_rate": 4.943741188362568e-05, + "loss": 5.5266, + "step": 11377 + }, + { + "epoch": 0.0676681891711866, + "grad_norm": 2.056810140609741, + "learning_rate": 4.943731334410669e-05, + "loss": 5.1994, + "step": 11378 + }, + { + "epoch": 0.0676741364544676, + "grad_norm": 2.0275685787200928, + "learning_rate": 4.94372147960569e-05, + "loss": 5.7385, + "step": 11379 + }, + { + "epoch": 0.0676800837377486, + "grad_norm": 2.082963466644287, + "learning_rate": 4.9437116239476325e-05, + "loss": 5.1531, + "step": 11380 + }, + { + "epoch": 0.06768603102102959, + "grad_norm": 2.176421642303467, + "learning_rate": 4.9437017674365004e-05, + "loss": 5.521, + "step": 11381 + }, + { + "epoch": 0.06769197830431059, + "grad_norm": 2.1424365043640137, + "learning_rate": 4.9436919100722964e-05, + "loss": 5.4543, + "step": 11382 + }, + { + "epoch": 0.06769792558759159, + "grad_norm": 2.07836651802063, + "learning_rate": 4.9436820518550266e-05, + "loss": 5.5166, + "step": 11383 + }, + { + "epoch": 0.06770387287087258, + "grad_norm": 1.9776746034622192, + "learning_rate": 4.9436721927846915e-05, + "loss": 5.4621, + "step": 11384 + }, + { + "epoch": 0.06770982015415358, + "grad_norm": 1.9985042810440063, + "learning_rate": 4.943662332861296e-05, + "loss": 5.3835, + "step": 11385 + }, + { + "epoch": 0.06771576743743458, + "grad_norm": 1.6877795457839966, + "learning_rate": 4.943652472084843e-05, + "loss": 5.185, + "step": 11386 + }, + { + "epoch": 0.06772171472071557, + "grad_norm": 1.8307565450668335, + "learning_rate": 4.943642610455336e-05, + "loss": 5.117, + "step": 11387 + }, + { + "epoch": 0.06772766200399657, + "grad_norm": 2.0381922721862793, + "learning_rate": 4.943632747972779e-05, + "loss": 5.6004, + "step": 11388 + }, + { + "epoch": 0.06773360928727758, + "grad_norm": 1.9554756879806519, + "learning_rate": 4.943622884637175e-05, + "loss": 5.9638, + "step": 11389 + }, + { + "epoch": 0.06773955657055857, + "grad_norm": 1.878861665725708, + "learning_rate": 4.9436130204485274e-05, + "loss": 5.7961, + "step": 11390 + }, + { + "epoch": 0.06774550385383957, + "grad_norm": 2.040012836456299, + "learning_rate": 4.94360315540684e-05, + "loss": 5.7175, + "step": 11391 + }, + { + "epoch": 0.06775145113712057, + "grad_norm": 2.262408494949341, + "learning_rate": 4.943593289512115e-05, + "loss": 4.8581, + "step": 11392 + }, + { + "epoch": 0.06775739842040156, + "grad_norm": 2.201751232147217, + "learning_rate": 4.943583422764358e-05, + "loss": 5.0647, + "step": 11393 + }, + { + "epoch": 0.06776334570368256, + "grad_norm": 1.9768764972686768, + "learning_rate": 4.943573555163571e-05, + "loss": 5.8836, + "step": 11394 + }, + { + "epoch": 0.06776929298696356, + "grad_norm": 2.1048574447631836, + "learning_rate": 4.9435636867097575e-05, + "loss": 5.9746, + "step": 11395 + }, + { + "epoch": 0.06777524027024455, + "grad_norm": 1.5297552347183228, + "learning_rate": 4.943553817402921e-05, + "loss": 4.912, + "step": 11396 + }, + { + "epoch": 0.06778118755352555, + "grad_norm": 1.5313429832458496, + "learning_rate": 4.943543947243066e-05, + "loss": 4.975, + "step": 11397 + }, + { + "epoch": 0.06778713483680655, + "grad_norm": 1.8882219791412354, + "learning_rate": 4.943534076230194e-05, + "loss": 5.2183, + "step": 11398 + }, + { + "epoch": 0.06779308212008754, + "grad_norm": 1.698997139930725, + "learning_rate": 4.9435242043643094e-05, + "loss": 5.8019, + "step": 11399 + }, + { + "epoch": 0.06779902940336854, + "grad_norm": 1.775140404701233, + "learning_rate": 4.943514331645417e-05, + "loss": 5.7451, + "step": 11400 + }, + { + "epoch": 0.06780497668664953, + "grad_norm": 2.273650884628296, + "learning_rate": 4.943504458073518e-05, + "loss": 4.7727, + "step": 11401 + }, + { + "epoch": 0.06781092396993053, + "grad_norm": 2.166961908340454, + "learning_rate": 4.943494583648617e-05, + "loss": 5.4537, + "step": 11402 + }, + { + "epoch": 0.06781687125321154, + "grad_norm": 2.147876024246216, + "learning_rate": 4.943484708370717e-05, + "loss": 5.2635, + "step": 11403 + }, + { + "epoch": 0.06782281853649252, + "grad_norm": 1.968397855758667, + "learning_rate": 4.943474832239822e-05, + "loss": 5.6591, + "step": 11404 + }, + { + "epoch": 0.06782876581977353, + "grad_norm": 1.8838316202163696, + "learning_rate": 4.943464955255935e-05, + "loss": 5.5462, + "step": 11405 + }, + { + "epoch": 0.06783471310305453, + "grad_norm": 2.4205315113067627, + "learning_rate": 4.94345507741906e-05, + "loss": 4.859, + "step": 11406 + }, + { + "epoch": 0.06784066038633552, + "grad_norm": 2.1272950172424316, + "learning_rate": 4.9434451987292e-05, + "loss": 5.1791, + "step": 11407 + }, + { + "epoch": 0.06784660766961652, + "grad_norm": 2.345055341720581, + "learning_rate": 4.9434353191863595e-05, + "loss": 5.1616, + "step": 11408 + }, + { + "epoch": 0.06785255495289752, + "grad_norm": 2.3967537879943848, + "learning_rate": 4.9434254387905395e-05, + "loss": 5.1805, + "step": 11409 + }, + { + "epoch": 0.06785850223617851, + "grad_norm": 2.2108283042907715, + "learning_rate": 4.943415557541745e-05, + "loss": 5.381, + "step": 11410 + }, + { + "epoch": 0.06786444951945951, + "grad_norm": 2.178776979446411, + "learning_rate": 4.94340567543998e-05, + "loss": 5.4016, + "step": 11411 + }, + { + "epoch": 0.06787039680274051, + "grad_norm": 2.003169059753418, + "learning_rate": 4.943395792485247e-05, + "loss": 5.5632, + "step": 11412 + }, + { + "epoch": 0.0678763440860215, + "grad_norm": 2.0337789058685303, + "learning_rate": 4.9433859086775506e-05, + "loss": 5.4476, + "step": 11413 + }, + { + "epoch": 0.0678822913693025, + "grad_norm": 1.784868836402893, + "learning_rate": 4.943376024016892e-05, + "loss": 5.3578, + "step": 11414 + }, + { + "epoch": 0.0678882386525835, + "grad_norm": 1.7282286882400513, + "learning_rate": 4.943366138503277e-05, + "loss": 5.6202, + "step": 11415 + }, + { + "epoch": 0.06789418593586449, + "grad_norm": 1.9716618061065674, + "learning_rate": 4.943356252136707e-05, + "loss": 4.9861, + "step": 11416 + }, + { + "epoch": 0.0679001332191455, + "grad_norm": 2.399317502975464, + "learning_rate": 4.943346364917188e-05, + "loss": 4.4494, + "step": 11417 + }, + { + "epoch": 0.0679060805024265, + "grad_norm": 2.142995834350586, + "learning_rate": 4.943336476844722e-05, + "loss": 4.5989, + "step": 11418 + }, + { + "epoch": 0.06791202778570748, + "grad_norm": 1.9394404888153076, + "learning_rate": 4.943326587919311e-05, + "loss": 4.4944, + "step": 11419 + }, + { + "epoch": 0.06791797506898849, + "grad_norm": 2.41937518119812, + "learning_rate": 4.9433166981409615e-05, + "loss": 5.1687, + "step": 11420 + }, + { + "epoch": 0.06792392235226949, + "grad_norm": 2.1686136722564697, + "learning_rate": 4.943306807509675e-05, + "loss": 6.2976, + "step": 11421 + }, + { + "epoch": 0.06792986963555048, + "grad_norm": 1.9649391174316406, + "learning_rate": 4.943296916025455e-05, + "loss": 6.0242, + "step": 11422 + }, + { + "epoch": 0.06793581691883148, + "grad_norm": 1.9251484870910645, + "learning_rate": 4.943287023688305e-05, + "loss": 5.9777, + "step": 11423 + }, + { + "epoch": 0.06794176420211248, + "grad_norm": 1.838348388671875, + "learning_rate": 4.9432771304982296e-05, + "loss": 5.8669, + "step": 11424 + }, + { + "epoch": 0.06794771148539347, + "grad_norm": 2.5417487621307373, + "learning_rate": 4.94326723645523e-05, + "loss": 5.5131, + "step": 11425 + }, + { + "epoch": 0.06795365876867447, + "grad_norm": 2.2175936698913574, + "learning_rate": 4.943257341559312e-05, + "loss": 5.4657, + "step": 11426 + }, + { + "epoch": 0.06795960605195547, + "grad_norm": 2.4474873542785645, + "learning_rate": 4.943247445810478e-05, + "loss": 5.2401, + "step": 11427 + }, + { + "epoch": 0.06796555333523646, + "grad_norm": 2.176483392715454, + "learning_rate": 4.9432375492087324e-05, + "loss": 5.7295, + "step": 11428 + }, + { + "epoch": 0.06797150061851746, + "grad_norm": 1.9311527013778687, + "learning_rate": 4.943227651754077e-05, + "loss": 5.8135, + "step": 11429 + }, + { + "epoch": 0.06797744790179845, + "grad_norm": 2.2462544441223145, + "learning_rate": 4.943217753446516e-05, + "loss": 6.0761, + "step": 11430 + }, + { + "epoch": 0.06798339518507945, + "grad_norm": 2.3158276081085205, + "learning_rate": 4.943207854286053e-05, + "loss": 6.0223, + "step": 11431 + }, + { + "epoch": 0.06798934246836046, + "grad_norm": 1.6222623586654663, + "learning_rate": 4.9431979542726914e-05, + "loss": 5.9417, + "step": 11432 + }, + { + "epoch": 0.06799528975164144, + "grad_norm": 1.9809083938598633, + "learning_rate": 4.9431880534064345e-05, + "loss": 5.7476, + "step": 11433 + }, + { + "epoch": 0.06800123703492245, + "grad_norm": 1.9575468301773071, + "learning_rate": 4.9431781516872865e-05, + "loss": 5.6169, + "step": 11434 + }, + { + "epoch": 0.06800718431820345, + "grad_norm": 2.1103882789611816, + "learning_rate": 4.9431682491152495e-05, + "loss": 5.5119, + "step": 11435 + }, + { + "epoch": 0.06801313160148444, + "grad_norm": 2.280287265777588, + "learning_rate": 4.943158345690328e-05, + "loss": 5.2622, + "step": 11436 + }, + { + "epoch": 0.06801907888476544, + "grad_norm": 2.582737684249878, + "learning_rate": 4.943148441412525e-05, + "loss": 5.2644, + "step": 11437 + }, + { + "epoch": 0.06802502616804644, + "grad_norm": 2.1919124126434326, + "learning_rate": 4.9431385362818446e-05, + "loss": 5.0717, + "step": 11438 + }, + { + "epoch": 0.06803097345132743, + "grad_norm": 2.3036141395568848, + "learning_rate": 4.9431286302982896e-05, + "loss": 5.0049, + "step": 11439 + }, + { + "epoch": 0.06803692073460843, + "grad_norm": 2.3675789833068848, + "learning_rate": 4.943118723461864e-05, + "loss": 5.4686, + "step": 11440 + }, + { + "epoch": 0.06804286801788943, + "grad_norm": 2.8305327892303467, + "learning_rate": 4.94310881577257e-05, + "loss": 5.3409, + "step": 11441 + }, + { + "epoch": 0.06804881530117042, + "grad_norm": 1.562173843383789, + "learning_rate": 4.9430989072304126e-05, + "loss": 5.6801, + "step": 11442 + }, + { + "epoch": 0.06805476258445142, + "grad_norm": 1.9728971719741821, + "learning_rate": 4.9430889978353945e-05, + "loss": 5.4252, + "step": 11443 + }, + { + "epoch": 0.06806070986773242, + "grad_norm": 2.054025173187256, + "learning_rate": 4.9430790875875185e-05, + "loss": 5.1155, + "step": 11444 + }, + { + "epoch": 0.06806665715101341, + "grad_norm": 1.8511056900024414, + "learning_rate": 4.9430691764867895e-05, + "loss": 5.102, + "step": 11445 + }, + { + "epoch": 0.06807260443429441, + "grad_norm": 1.9024226665496826, + "learning_rate": 4.943059264533211e-05, + "loss": 5.0761, + "step": 11446 + }, + { + "epoch": 0.06807855171757542, + "grad_norm": 2.4767966270446777, + "learning_rate": 4.9430493517267843e-05, + "loss": 4.9809, + "step": 11447 + }, + { + "epoch": 0.0680844990008564, + "grad_norm": 2.393517255783081, + "learning_rate": 4.943039438067515e-05, + "loss": 5.1191, + "step": 11448 + }, + { + "epoch": 0.06809044628413741, + "grad_norm": 1.9510548114776611, + "learning_rate": 4.9430295235554055e-05, + "loss": 5.7117, + "step": 11449 + }, + { + "epoch": 0.06809639356741841, + "grad_norm": 2.1002418994903564, + "learning_rate": 4.9430196081904605e-05, + "loss": 5.7003, + "step": 11450 + }, + { + "epoch": 0.0681023408506994, + "grad_norm": 2.5328590869903564, + "learning_rate": 4.943009691972682e-05, + "loss": 6.1835, + "step": 11451 + }, + { + "epoch": 0.0681082881339804, + "grad_norm": 1.9173791408538818, + "learning_rate": 4.9429997749020743e-05, + "loss": 5.9596, + "step": 11452 + }, + { + "epoch": 0.0681142354172614, + "grad_norm": 2.0781052112579346, + "learning_rate": 4.9429898569786406e-05, + "loss": 5.7335, + "step": 11453 + }, + { + "epoch": 0.06812018270054239, + "grad_norm": 2.4210550785064697, + "learning_rate": 4.942979938202384e-05, + "loss": 4.9888, + "step": 11454 + }, + { + "epoch": 0.06812612998382339, + "grad_norm": 1.8438634872436523, + "learning_rate": 4.942970018573309e-05, + "loss": 5.8027, + "step": 11455 + }, + { + "epoch": 0.0681320772671044, + "grad_norm": 2.122882843017578, + "learning_rate": 4.942960098091418e-05, + "loss": 5.8569, + "step": 11456 + }, + { + "epoch": 0.06813802455038538, + "grad_norm": 1.6002168655395508, + "learning_rate": 4.942950176756715e-05, + "loss": 5.7362, + "step": 11457 + }, + { + "epoch": 0.06814397183366638, + "grad_norm": 1.8086539506912231, + "learning_rate": 4.942940254569203e-05, + "loss": 5.7537, + "step": 11458 + }, + { + "epoch": 0.06814991911694737, + "grad_norm": 2.0441513061523438, + "learning_rate": 4.942930331528886e-05, + "loss": 5.8255, + "step": 11459 + }, + { + "epoch": 0.06815586640022837, + "grad_norm": 1.8272675275802612, + "learning_rate": 4.942920407635767e-05, + "loss": 5.6915, + "step": 11460 + }, + { + "epoch": 0.06816181368350938, + "grad_norm": 3.3902077674865723, + "learning_rate": 4.94291048288985e-05, + "loss": 4.719, + "step": 11461 + }, + { + "epoch": 0.06816776096679036, + "grad_norm": 3.1770875453948975, + "learning_rate": 4.9429005572911385e-05, + "loss": 4.401, + "step": 11462 + }, + { + "epoch": 0.06817370825007137, + "grad_norm": 1.9011846780776978, + "learning_rate": 4.9428906308396355e-05, + "loss": 5.4768, + "step": 11463 + }, + { + "epoch": 0.06817965553335237, + "grad_norm": 1.7608321905136108, + "learning_rate": 4.9428807035353443e-05, + "loss": 5.5755, + "step": 11464 + }, + { + "epoch": 0.06818560281663336, + "grad_norm": 1.8250397443771362, + "learning_rate": 4.9428707753782686e-05, + "loss": 5.7804, + "step": 11465 + }, + { + "epoch": 0.06819155009991436, + "grad_norm": 2.566436290740967, + "learning_rate": 4.942860846368412e-05, + "loss": 5.0442, + "step": 11466 + }, + { + "epoch": 0.06819749738319536, + "grad_norm": 3.336547613143921, + "learning_rate": 4.942850916505779e-05, + "loss": 4.5331, + "step": 11467 + }, + { + "epoch": 0.06820344466647635, + "grad_norm": 2.6383185386657715, + "learning_rate": 4.9428409857903714e-05, + "loss": 4.5301, + "step": 11468 + }, + { + "epoch": 0.06820939194975735, + "grad_norm": 2.3853955268859863, + "learning_rate": 4.9428310542221924e-05, + "loss": 4.3398, + "step": 11469 + }, + { + "epoch": 0.06821533923303835, + "grad_norm": 2.3954038619995117, + "learning_rate": 4.942821121801246e-05, + "loss": 5.0841, + "step": 11470 + }, + { + "epoch": 0.06822128651631934, + "grad_norm": 2.922161340713501, + "learning_rate": 4.942811188527537e-05, + "loss": 4.5573, + "step": 11471 + }, + { + "epoch": 0.06822723379960034, + "grad_norm": 2.7202560901641846, + "learning_rate": 4.942801254401068e-05, + "loss": 4.5047, + "step": 11472 + }, + { + "epoch": 0.06823318108288134, + "grad_norm": 2.2289440631866455, + "learning_rate": 4.9427913194218424e-05, + "loss": 5.4686, + "step": 11473 + }, + { + "epoch": 0.06823912836616233, + "grad_norm": 2.2033851146698, + "learning_rate": 4.9427813835898635e-05, + "loss": 5.3554, + "step": 11474 + }, + { + "epoch": 0.06824507564944333, + "grad_norm": 2.171147346496582, + "learning_rate": 4.9427714469051345e-05, + "loss": 5.504, + "step": 11475 + }, + { + "epoch": 0.06825102293272434, + "grad_norm": 2.0110602378845215, + "learning_rate": 4.9427615093676594e-05, + "loss": 5.6126, + "step": 11476 + }, + { + "epoch": 0.06825697021600532, + "grad_norm": 2.08642840385437, + "learning_rate": 4.942751570977441e-05, + "loss": 6.0948, + "step": 11477 + }, + { + "epoch": 0.06826291749928633, + "grad_norm": 2.12245774269104, + "learning_rate": 4.9427416317344835e-05, + "loss": 5.2845, + "step": 11478 + }, + { + "epoch": 0.06826886478256733, + "grad_norm": 1.9155166149139404, + "learning_rate": 4.942731691638791e-05, + "loss": 5.4674, + "step": 11479 + }, + { + "epoch": 0.06827481206584832, + "grad_norm": 2.3452367782592773, + "learning_rate": 4.942721750690365e-05, + "loss": 5.2368, + "step": 11480 + }, + { + "epoch": 0.06828075934912932, + "grad_norm": 2.1282498836517334, + "learning_rate": 4.9427118088892105e-05, + "loss": 5.348, + "step": 11481 + }, + { + "epoch": 0.06828670663241032, + "grad_norm": 1.9251933097839355, + "learning_rate": 4.9427018662353306e-05, + "loss": 5.2588, + "step": 11482 + }, + { + "epoch": 0.06829265391569131, + "grad_norm": 1.9481078386306763, + "learning_rate": 4.942691922728728e-05, + "loss": 5.2775, + "step": 11483 + }, + { + "epoch": 0.06829860119897231, + "grad_norm": 1.9506112337112427, + "learning_rate": 4.942681978369408e-05, + "loss": 5.6865, + "step": 11484 + }, + { + "epoch": 0.06830454848225331, + "grad_norm": 2.0636112689971924, + "learning_rate": 4.942672033157373e-05, + "loss": 6.218, + "step": 11485 + }, + { + "epoch": 0.0683104957655343, + "grad_norm": 1.8479397296905518, + "learning_rate": 4.9426620870926256e-05, + "loss": 6.1283, + "step": 11486 + }, + { + "epoch": 0.0683164430488153, + "grad_norm": 1.9079830646514893, + "learning_rate": 4.94265214017517e-05, + "loss": 6.127, + "step": 11487 + }, + { + "epoch": 0.06832239033209629, + "grad_norm": 2.1076481342315674, + "learning_rate": 4.9426421924050105e-05, + "loss": 5.9978, + "step": 11488 + }, + { + "epoch": 0.0683283376153773, + "grad_norm": 1.885231375694275, + "learning_rate": 4.942632243782149e-05, + "loss": 5.8269, + "step": 11489 + }, + { + "epoch": 0.0683342848986583, + "grad_norm": 1.968980073928833, + "learning_rate": 4.942622294306591e-05, + "loss": 5.899, + "step": 11490 + }, + { + "epoch": 0.06834023218193928, + "grad_norm": 1.9857345819473267, + "learning_rate": 4.9426123439783376e-05, + "loss": 5.9416, + "step": 11491 + }, + { + "epoch": 0.06834617946522029, + "grad_norm": 1.8433799743652344, + "learning_rate": 4.942602392797394e-05, + "loss": 6.0714, + "step": 11492 + }, + { + "epoch": 0.06835212674850129, + "grad_norm": 1.9299565553665161, + "learning_rate": 4.942592440763764e-05, + "loss": 6.14, + "step": 11493 + }, + { + "epoch": 0.06835807403178228, + "grad_norm": 1.5700571537017822, + "learning_rate": 4.9425824878774486e-05, + "loss": 6.0496, + "step": 11494 + }, + { + "epoch": 0.06836402131506328, + "grad_norm": 1.6914032697677612, + "learning_rate": 4.942572534138454e-05, + "loss": 5.8301, + "step": 11495 + }, + { + "epoch": 0.06836996859834428, + "grad_norm": 1.6765984296798706, + "learning_rate": 4.942562579546782e-05, + "loss": 6.0701, + "step": 11496 + }, + { + "epoch": 0.06837591588162527, + "grad_norm": 1.715425729751587, + "learning_rate": 4.9425526241024364e-05, + "loss": 5.9499, + "step": 11497 + }, + { + "epoch": 0.06838186316490627, + "grad_norm": 1.8849130868911743, + "learning_rate": 4.942542667805422e-05, + "loss": 5.7088, + "step": 11498 + }, + { + "epoch": 0.06838781044818727, + "grad_norm": 2.1290276050567627, + "learning_rate": 4.9425327106557405e-05, + "loss": 5.9329, + "step": 11499 + }, + { + "epoch": 0.06839375773146826, + "grad_norm": 1.9105192422866821, + "learning_rate": 4.942522752653396e-05, + "loss": 5.9068, + "step": 11500 + }, + { + "epoch": 0.06839970501474926, + "grad_norm": 1.9120036363601685, + "learning_rate": 4.9425127937983926e-05, + "loss": 5.8411, + "step": 11501 + }, + { + "epoch": 0.06840565229803026, + "grad_norm": 2.1045427322387695, + "learning_rate": 4.942502834090732e-05, + "loss": 6.1575, + "step": 11502 + }, + { + "epoch": 0.06841159958131125, + "grad_norm": 1.8271901607513428, + "learning_rate": 4.94249287353042e-05, + "loss": 6.0732, + "step": 11503 + }, + { + "epoch": 0.06841754686459225, + "grad_norm": 1.4770866632461548, + "learning_rate": 4.942482912117459e-05, + "loss": 6.0823, + "step": 11504 + }, + { + "epoch": 0.06842349414787326, + "grad_norm": 1.7055792808532715, + "learning_rate": 4.942472949851852e-05, + "loss": 6.0738, + "step": 11505 + }, + { + "epoch": 0.06842944143115424, + "grad_norm": 1.588705062866211, + "learning_rate": 4.942462986733602e-05, + "loss": 5.9731, + "step": 11506 + }, + { + "epoch": 0.06843538871443525, + "grad_norm": 2.662527561187744, + "learning_rate": 4.942453022762715e-05, + "loss": 5.7745, + "step": 11507 + }, + { + "epoch": 0.06844133599771625, + "grad_norm": 2.0649495124816895, + "learning_rate": 4.9424430579391925e-05, + "loss": 5.7173, + "step": 11508 + }, + { + "epoch": 0.06844728328099724, + "grad_norm": 1.647801160812378, + "learning_rate": 4.942433092263038e-05, + "loss": 6.1516, + "step": 11509 + }, + { + "epoch": 0.06845323056427824, + "grad_norm": 1.743788480758667, + "learning_rate": 4.942423125734256e-05, + "loss": 6.0211, + "step": 11510 + }, + { + "epoch": 0.06845917784755924, + "grad_norm": 1.898647665977478, + "learning_rate": 4.942413158352849e-05, + "loss": 6.0106, + "step": 11511 + }, + { + "epoch": 0.06846512513084023, + "grad_norm": 1.5159860849380493, + "learning_rate": 4.94240319011882e-05, + "loss": 5.8759, + "step": 11512 + }, + { + "epoch": 0.06847107241412123, + "grad_norm": 3.265730142593384, + "learning_rate": 4.9423932210321744e-05, + "loss": 4.7228, + "step": 11513 + }, + { + "epoch": 0.06847701969740223, + "grad_norm": 2.9290871620178223, + "learning_rate": 4.9423832510929136e-05, + "loss": 4.5315, + "step": 11514 + }, + { + "epoch": 0.06848296698068322, + "grad_norm": 2.4189975261688232, + "learning_rate": 4.942373280301042e-05, + "loss": 4.5803, + "step": 11515 + }, + { + "epoch": 0.06848891426396422, + "grad_norm": 2.4018993377685547, + "learning_rate": 4.9423633086565645e-05, + "loss": 5.1411, + "step": 11516 + }, + { + "epoch": 0.06849486154724521, + "grad_norm": 2.4697556495666504, + "learning_rate": 4.9423533361594824e-05, + "loss": 5.1523, + "step": 11517 + }, + { + "epoch": 0.06850080883052621, + "grad_norm": 2.1573715209960938, + "learning_rate": 4.942343362809799e-05, + "loss": 5.3488, + "step": 11518 + }, + { + "epoch": 0.06850675611380722, + "grad_norm": 1.9723131656646729, + "learning_rate": 4.9423333886075205e-05, + "loss": 5.2315, + "step": 11519 + }, + { + "epoch": 0.0685127033970882, + "grad_norm": 1.6925430297851562, + "learning_rate": 4.9423234135526475e-05, + "loss": 5.3055, + "step": 11520 + }, + { + "epoch": 0.0685186506803692, + "grad_norm": 2.8665122985839844, + "learning_rate": 4.942313437645185e-05, + "loss": 4.4905, + "step": 11521 + }, + { + "epoch": 0.06852459796365021, + "grad_norm": 2.7538015842437744, + "learning_rate": 4.942303460885136e-05, + "loss": 4.3863, + "step": 11522 + }, + { + "epoch": 0.0685305452469312, + "grad_norm": 2.335664987564087, + "learning_rate": 4.942293483272504e-05, + "loss": 4.4571, + "step": 11523 + }, + { + "epoch": 0.0685364925302122, + "grad_norm": 1.7987995147705078, + "learning_rate": 4.942283504807293e-05, + "loss": 5.1802, + "step": 11524 + }, + { + "epoch": 0.0685424398134932, + "grad_norm": 2.3286690711975098, + "learning_rate": 4.9422735254895056e-05, + "loss": 5.2883, + "step": 11525 + }, + { + "epoch": 0.06854838709677419, + "grad_norm": 2.093317747116089, + "learning_rate": 4.9422635453191466e-05, + "loss": 5.2589, + "step": 11526 + }, + { + "epoch": 0.06855433438005519, + "grad_norm": 1.914236307144165, + "learning_rate": 4.942253564296218e-05, + "loss": 5.4347, + "step": 11527 + }, + { + "epoch": 0.06856028166333619, + "grad_norm": 1.602265477180481, + "learning_rate": 4.942243582420724e-05, + "loss": 5.8021, + "step": 11528 + }, + { + "epoch": 0.06856622894661718, + "grad_norm": 1.4433797597885132, + "learning_rate": 4.9422335996926674e-05, + "loss": 5.7432, + "step": 11529 + }, + { + "epoch": 0.06857217622989818, + "grad_norm": 1.3481166362762451, + "learning_rate": 4.942223616112053e-05, + "loss": 5.2946, + "step": 11530 + }, + { + "epoch": 0.06857812351317918, + "grad_norm": 1.879550576210022, + "learning_rate": 4.942213631678883e-05, + "loss": 5.2669, + "step": 11531 + }, + { + "epoch": 0.06858407079646017, + "grad_norm": 2.7241995334625244, + "learning_rate": 4.942203646393162e-05, + "loss": 5.2248, + "step": 11532 + }, + { + "epoch": 0.06859001807974117, + "grad_norm": 1.9870814085006714, + "learning_rate": 4.942193660254892e-05, + "loss": 5.4025, + "step": 11533 + }, + { + "epoch": 0.06859596536302218, + "grad_norm": 1.89231276512146, + "learning_rate": 4.942183673264079e-05, + "loss": 5.6046, + "step": 11534 + }, + { + "epoch": 0.06860191264630316, + "grad_norm": 2.024684429168701, + "learning_rate": 4.9421736854207235e-05, + "loss": 5.4031, + "step": 11535 + }, + { + "epoch": 0.06860785992958417, + "grad_norm": 1.6764521598815918, + "learning_rate": 4.942163696724831e-05, + "loss": 5.702, + "step": 11536 + }, + { + "epoch": 0.06861380721286517, + "grad_norm": 1.7738621234893799, + "learning_rate": 4.942153707176405e-05, + "loss": 5.1491, + "step": 11537 + }, + { + "epoch": 0.06861975449614616, + "grad_norm": 1.416986346244812, + "learning_rate": 4.942143716775447e-05, + "loss": 5.3883, + "step": 11538 + }, + { + "epoch": 0.06862570177942716, + "grad_norm": 1.837067723274231, + "learning_rate": 4.942133725521963e-05, + "loss": 5.2945, + "step": 11539 + }, + { + "epoch": 0.06863164906270816, + "grad_norm": 1.995610237121582, + "learning_rate": 4.942123733415955e-05, + "loss": 5.2589, + "step": 11540 + }, + { + "epoch": 0.06863759634598915, + "grad_norm": 1.9689414501190186, + "learning_rate": 4.9421137404574264e-05, + "loss": 5.3715, + "step": 11541 + }, + { + "epoch": 0.06864354362927015, + "grad_norm": 1.6984235048294067, + "learning_rate": 4.942103746646382e-05, + "loss": 5.3987, + "step": 11542 + }, + { + "epoch": 0.06864949091255115, + "grad_norm": 1.2645832300186157, + "learning_rate": 4.9420937519828234e-05, + "loss": 5.2142, + "step": 11543 + }, + { + "epoch": 0.06865543819583214, + "grad_norm": 1.6830233335494995, + "learning_rate": 4.9420837564667556e-05, + "loss": 5.1172, + "step": 11544 + }, + { + "epoch": 0.06866138547911314, + "grad_norm": 1.5734926462173462, + "learning_rate": 4.9420737600981816e-05, + "loss": 5.3789, + "step": 11545 + }, + { + "epoch": 0.06866733276239413, + "grad_norm": 1.7375764846801758, + "learning_rate": 4.942063762877105e-05, + "loss": 5.5311, + "step": 11546 + }, + { + "epoch": 0.06867328004567513, + "grad_norm": 1.5421762466430664, + "learning_rate": 4.942053764803529e-05, + "loss": 5.1722, + "step": 11547 + }, + { + "epoch": 0.06867922732895614, + "grad_norm": 1.6282575130462646, + "learning_rate": 4.942043765877457e-05, + "loss": 5.4754, + "step": 11548 + }, + { + "epoch": 0.06868517461223712, + "grad_norm": 1.5595266819000244, + "learning_rate": 4.9420337660988936e-05, + "loss": 5.3516, + "step": 11549 + }, + { + "epoch": 0.06869112189551813, + "grad_norm": 1.5642317533493042, + "learning_rate": 4.9420237654678405e-05, + "loss": 5.2364, + "step": 11550 + }, + { + "epoch": 0.06869706917879913, + "grad_norm": 1.5491602420806885, + "learning_rate": 4.942013763984302e-05, + "loss": 5.1566, + "step": 11551 + }, + { + "epoch": 0.06870301646208012, + "grad_norm": 1.4256258010864258, + "learning_rate": 4.942003761648283e-05, + "loss": 5.1592, + "step": 11552 + }, + { + "epoch": 0.06870896374536112, + "grad_norm": 1.756016492843628, + "learning_rate": 4.9419937584597846e-05, + "loss": 5.012, + "step": 11553 + }, + { + "epoch": 0.06871491102864212, + "grad_norm": 2.5290040969848633, + "learning_rate": 4.941983754418812e-05, + "loss": 4.571, + "step": 11554 + }, + { + "epoch": 0.06872085831192311, + "grad_norm": 2.6146528720855713, + "learning_rate": 4.9419737495253685e-05, + "loss": 4.3515, + "step": 11555 + }, + { + "epoch": 0.06872680559520411, + "grad_norm": 2.3333144187927246, + "learning_rate": 4.941963743779456e-05, + "loss": 4.3032, + "step": 11556 + }, + { + "epoch": 0.06873275287848511, + "grad_norm": 2.342433452606201, + "learning_rate": 4.9419537371810795e-05, + "loss": 4.2942, + "step": 11557 + }, + { + "epoch": 0.0687387001617661, + "grad_norm": 2.423696517944336, + "learning_rate": 4.941943729730243e-05, + "loss": 4.4, + "step": 11558 + }, + { + "epoch": 0.0687446474450471, + "grad_norm": 2.3420050144195557, + "learning_rate": 4.941933721426948e-05, + "loss": 5.0466, + "step": 11559 + }, + { + "epoch": 0.0687505947283281, + "grad_norm": 2.7115821838378906, + "learning_rate": 4.9419237122712e-05, + "loss": 5.1197, + "step": 11560 + }, + { + "epoch": 0.06875654201160909, + "grad_norm": 2.7316489219665527, + "learning_rate": 4.9419137022630014e-05, + "loss": 5.2435, + "step": 11561 + }, + { + "epoch": 0.0687624892948901, + "grad_norm": 2.291551113128662, + "learning_rate": 4.941903691402356e-05, + "loss": 5.0345, + "step": 11562 + }, + { + "epoch": 0.0687684365781711, + "grad_norm": 2.4499049186706543, + "learning_rate": 4.941893679689267e-05, + "loss": 4.503, + "step": 11563 + }, + { + "epoch": 0.06877438386145208, + "grad_norm": 2.7120168209075928, + "learning_rate": 4.9418836671237385e-05, + "loss": 4.2954, + "step": 11564 + }, + { + "epoch": 0.06878033114473309, + "grad_norm": 2.8483526706695557, + "learning_rate": 4.941873653705774e-05, + "loss": 6.269, + "step": 11565 + }, + { + "epoch": 0.06878627842801409, + "grad_norm": 2.3191473484039307, + "learning_rate": 4.941863639435376e-05, + "loss": 6.1628, + "step": 11566 + }, + { + "epoch": 0.06879222571129508, + "grad_norm": 3.4622583389282227, + "learning_rate": 4.9418536243125486e-05, + "loss": 5.6115, + "step": 11567 + }, + { + "epoch": 0.06879817299457608, + "grad_norm": 1.7118897438049316, + "learning_rate": 4.941843608337295e-05, + "loss": 5.4801, + "step": 11568 + }, + { + "epoch": 0.06880412027785708, + "grad_norm": 2.876338243484497, + "learning_rate": 4.9418335915096195e-05, + "loss": 5.0806, + "step": 11569 + }, + { + "epoch": 0.06881006756113807, + "grad_norm": 2.2875587940216064, + "learning_rate": 4.941823573829525e-05, + "loss": 5.2833, + "step": 11570 + }, + { + "epoch": 0.06881601484441907, + "grad_norm": 1.797743320465088, + "learning_rate": 4.9418135552970155e-05, + "loss": 6.1407, + "step": 11571 + }, + { + "epoch": 0.06882196212770007, + "grad_norm": 1.957331895828247, + "learning_rate": 4.941803535912094e-05, + "loss": 5.8743, + "step": 11572 + }, + { + "epoch": 0.06882790941098106, + "grad_norm": 1.9552925825119019, + "learning_rate": 4.9417935156747644e-05, + "loss": 5.584, + "step": 11573 + }, + { + "epoch": 0.06883385669426206, + "grad_norm": 2.057610034942627, + "learning_rate": 4.94178349458503e-05, + "loss": 5.8445, + "step": 11574 + }, + { + "epoch": 0.06883980397754305, + "grad_norm": 1.7856727838516235, + "learning_rate": 4.941773472642893e-05, + "loss": 6.0133, + "step": 11575 + }, + { + "epoch": 0.06884575126082405, + "grad_norm": 1.4494417905807495, + "learning_rate": 4.941763449848359e-05, + "loss": 5.888, + "step": 11576 + }, + { + "epoch": 0.06885169854410506, + "grad_norm": 2.1377499103546143, + "learning_rate": 4.9417534262014306e-05, + "loss": 6.0604, + "step": 11577 + }, + { + "epoch": 0.06885764582738604, + "grad_norm": 1.769888162612915, + "learning_rate": 4.9417434017021105e-05, + "loss": 5.8815, + "step": 11578 + }, + { + "epoch": 0.06886359311066705, + "grad_norm": 1.933935523033142, + "learning_rate": 4.9417333763504036e-05, + "loss": 5.6601, + "step": 11579 + }, + { + "epoch": 0.06886954039394805, + "grad_norm": 1.8672062158584595, + "learning_rate": 4.941723350146313e-05, + "loss": 5.8143, + "step": 11580 + }, + { + "epoch": 0.06887548767722904, + "grad_norm": 1.9899057149887085, + "learning_rate": 4.941713323089842e-05, + "loss": 5.8465, + "step": 11581 + }, + { + "epoch": 0.06888143496051004, + "grad_norm": 2.1053643226623535, + "learning_rate": 4.941703295180994e-05, + "loss": 5.4582, + "step": 11582 + }, + { + "epoch": 0.06888738224379104, + "grad_norm": 1.9435245990753174, + "learning_rate": 4.9416932664197726e-05, + "loss": 5.8503, + "step": 11583 + }, + { + "epoch": 0.06889332952707203, + "grad_norm": 1.9407175779342651, + "learning_rate": 4.941683236806181e-05, + "loss": 5.706, + "step": 11584 + }, + { + "epoch": 0.06889927681035303, + "grad_norm": 2.0505893230438232, + "learning_rate": 4.941673206340224e-05, + "loss": 6.01, + "step": 11585 + }, + { + "epoch": 0.06890522409363403, + "grad_norm": 1.6713486909866333, + "learning_rate": 4.941663175021903e-05, + "loss": 5.8347, + "step": 11586 + }, + { + "epoch": 0.06891117137691502, + "grad_norm": 1.5333812236785889, + "learning_rate": 4.941653142851223e-05, + "loss": 5.8493, + "step": 11587 + }, + { + "epoch": 0.06891711866019602, + "grad_norm": 2.10982346534729, + "learning_rate": 4.9416431098281865e-05, + "loss": 5.4037, + "step": 11588 + }, + { + "epoch": 0.06892306594347702, + "grad_norm": 1.766663908958435, + "learning_rate": 4.9416330759527985e-05, + "loss": 5.0335, + "step": 11589 + }, + { + "epoch": 0.06892901322675801, + "grad_norm": 2.0600688457489014, + "learning_rate": 4.9416230412250615e-05, + "loss": 5.4017, + "step": 11590 + }, + { + "epoch": 0.06893496051003901, + "grad_norm": 1.6271671056747437, + "learning_rate": 4.941613005644979e-05, + "loss": 5.903, + "step": 11591 + }, + { + "epoch": 0.06894090779332002, + "grad_norm": 1.9222697019577026, + "learning_rate": 4.9416029692125544e-05, + "loss": 5.1666, + "step": 11592 + }, + { + "epoch": 0.068946855076601, + "grad_norm": 1.7405030727386475, + "learning_rate": 4.941592931927792e-05, + "loss": 5.0799, + "step": 11593 + }, + { + "epoch": 0.068952802359882, + "grad_norm": 1.7639994621276855, + "learning_rate": 4.941582893790694e-05, + "loss": 5.7596, + "step": 11594 + }, + { + "epoch": 0.06895874964316301, + "grad_norm": 1.9628292322158813, + "learning_rate": 4.941572854801265e-05, + "loss": 4.4573, + "step": 11595 + }, + { + "epoch": 0.068964696926444, + "grad_norm": 1.7616615295410156, + "learning_rate": 4.941562814959508e-05, + "loss": 4.6399, + "step": 11596 + }, + { + "epoch": 0.068970644209725, + "grad_norm": 1.8174281120300293, + "learning_rate": 4.9415527742654265e-05, + "loss": 5.6279, + "step": 11597 + }, + { + "epoch": 0.068976591493006, + "grad_norm": 1.563138723373413, + "learning_rate": 4.941542732719025e-05, + "loss": 5.8696, + "step": 11598 + }, + { + "epoch": 0.06898253877628699, + "grad_norm": 1.4704676866531372, + "learning_rate": 4.9415326903203055e-05, + "loss": 5.7129, + "step": 11599 + }, + { + "epoch": 0.06898848605956799, + "grad_norm": 2.484572410583496, + "learning_rate": 4.9415226470692724e-05, + "loss": 5.336, + "step": 11600 + }, + { + "epoch": 0.068994433342849, + "grad_norm": 1.882876992225647, + "learning_rate": 4.9415126029659284e-05, + "loss": 5.4273, + "step": 11601 + }, + { + "epoch": 0.06900038062612998, + "grad_norm": 1.7827874422073364, + "learning_rate": 4.941502558010278e-05, + "loss": 5.6699, + "step": 11602 + }, + { + "epoch": 0.06900632790941098, + "grad_norm": 1.5609276294708252, + "learning_rate": 4.941492512202325e-05, + "loss": 5.648, + "step": 11603 + }, + { + "epoch": 0.06901227519269197, + "grad_norm": 1.6941063404083252, + "learning_rate": 4.941482465542071e-05, + "loss": 5.633, + "step": 11604 + }, + { + "epoch": 0.06901822247597297, + "grad_norm": 1.768922209739685, + "learning_rate": 4.941472418029521e-05, + "loss": 5.6072, + "step": 11605 + }, + { + "epoch": 0.06902416975925398, + "grad_norm": 2.225846767425537, + "learning_rate": 4.941462369664679e-05, + "loss": 4.9314, + "step": 11606 + }, + { + "epoch": 0.06903011704253496, + "grad_norm": 2.4479281902313232, + "learning_rate": 4.941452320447546e-05, + "loss": 5.0563, + "step": 11607 + }, + { + "epoch": 0.06903606432581597, + "grad_norm": 2.358238935470581, + "learning_rate": 4.941442270378129e-05, + "loss": 4.9379, + "step": 11608 + }, + { + "epoch": 0.06904201160909697, + "grad_norm": 2.2679247856140137, + "learning_rate": 4.941432219456429e-05, + "loss": 5.0655, + "step": 11609 + }, + { + "epoch": 0.06904795889237796, + "grad_norm": 2.524176597595215, + "learning_rate": 4.94142216768245e-05, + "loss": 4.8694, + "step": 11610 + }, + { + "epoch": 0.06905390617565896, + "grad_norm": 2.1919515132904053, + "learning_rate": 4.9414121150561966e-05, + "loss": 5.0889, + "step": 11611 + }, + { + "epoch": 0.06905985345893996, + "grad_norm": 2.2838563919067383, + "learning_rate": 4.94140206157767e-05, + "loss": 4.9942, + "step": 11612 + }, + { + "epoch": 0.06906580074222095, + "grad_norm": 2.2270026206970215, + "learning_rate": 4.9413920072468764e-05, + "loss": 4.9885, + "step": 11613 + }, + { + "epoch": 0.06907174802550195, + "grad_norm": 2.175245761871338, + "learning_rate": 4.9413819520638176e-05, + "loss": 4.9829, + "step": 11614 + }, + { + "epoch": 0.06907769530878295, + "grad_norm": 2.128441572189331, + "learning_rate": 4.941371896028498e-05, + "loss": 4.9802, + "step": 11615 + }, + { + "epoch": 0.06908364259206394, + "grad_norm": 2.7656328678131104, + "learning_rate": 4.94136183914092e-05, + "loss": 5.1302, + "step": 11616 + }, + { + "epoch": 0.06908958987534494, + "grad_norm": 2.23917818069458, + "learning_rate": 4.941351781401088e-05, + "loss": 4.8766, + "step": 11617 + }, + { + "epoch": 0.06909553715862594, + "grad_norm": 1.861399531364441, + "learning_rate": 4.941341722809005e-05, + "loss": 5.8151, + "step": 11618 + }, + { + "epoch": 0.06910148444190693, + "grad_norm": 2.13590145111084, + "learning_rate": 4.9413316633646754e-05, + "loss": 5.6892, + "step": 11619 + }, + { + "epoch": 0.06910743172518793, + "grad_norm": 1.8261966705322266, + "learning_rate": 4.9413216030681024e-05, + "loss": 6.1387, + "step": 11620 + }, + { + "epoch": 0.06911337900846894, + "grad_norm": 2.5121877193450928, + "learning_rate": 4.941311541919289e-05, + "loss": 5.3217, + "step": 11621 + }, + { + "epoch": 0.06911932629174992, + "grad_norm": 2.1011979579925537, + "learning_rate": 4.941301479918239e-05, + "loss": 5.048, + "step": 11622 + }, + { + "epoch": 0.06912527357503093, + "grad_norm": 2.214597225189209, + "learning_rate": 4.941291417064956e-05, + "loss": 5.4312, + "step": 11623 + }, + { + "epoch": 0.06913122085831193, + "grad_norm": 2.6525864601135254, + "learning_rate": 4.941281353359443e-05, + "loss": 4.4151, + "step": 11624 + }, + { + "epoch": 0.06913716814159292, + "grad_norm": 1.9638911485671997, + "learning_rate": 4.941271288801704e-05, + "loss": 5.0091, + "step": 11625 + }, + { + "epoch": 0.06914311542487392, + "grad_norm": 2.062688112258911, + "learning_rate": 4.941261223391742e-05, + "loss": 5.503, + "step": 11626 + }, + { + "epoch": 0.06914906270815492, + "grad_norm": 2.219430685043335, + "learning_rate": 4.941251157129561e-05, + "loss": 4.984, + "step": 11627 + }, + { + "epoch": 0.06915500999143591, + "grad_norm": 2.0745718479156494, + "learning_rate": 4.941241090015165e-05, + "loss": 5.3094, + "step": 11628 + }, + { + "epoch": 0.06916095727471691, + "grad_norm": 1.8852496147155762, + "learning_rate": 4.941231022048557e-05, + "loss": 5.2424, + "step": 11629 + }, + { + "epoch": 0.06916690455799791, + "grad_norm": 2.335723400115967, + "learning_rate": 4.9412209532297404e-05, + "loss": 5.6031, + "step": 11630 + }, + { + "epoch": 0.0691728518412789, + "grad_norm": 2.167698621749878, + "learning_rate": 4.941210883558719e-05, + "loss": 5.3132, + "step": 11631 + }, + { + "epoch": 0.0691787991245599, + "grad_norm": 2.213068962097168, + "learning_rate": 4.941200813035495e-05, + "loss": 5.2049, + "step": 11632 + }, + { + "epoch": 0.06918474640784089, + "grad_norm": 1.9697870016098022, + "learning_rate": 4.941190741660075e-05, + "loss": 5.3118, + "step": 11633 + }, + { + "epoch": 0.0691906936911219, + "grad_norm": 1.7360777854919434, + "learning_rate": 4.941180669432458e-05, + "loss": 5.444, + "step": 11634 + }, + { + "epoch": 0.0691966409744029, + "grad_norm": 1.8400771617889404, + "learning_rate": 4.9411705963526514e-05, + "loss": 5.6975, + "step": 11635 + }, + { + "epoch": 0.06920258825768388, + "grad_norm": 1.492242693901062, + "learning_rate": 4.941160522420657e-05, + "loss": 5.5617, + "step": 11636 + }, + { + "epoch": 0.06920853554096489, + "grad_norm": 1.6014543771743774, + "learning_rate": 4.9411504476364794e-05, + "loss": 5.7317, + "step": 11637 + }, + { + "epoch": 0.06921448282424589, + "grad_norm": 1.7973628044128418, + "learning_rate": 4.9411403720001215e-05, + "loss": 5.3105, + "step": 11638 + }, + { + "epoch": 0.06922043010752688, + "grad_norm": 1.8314461708068848, + "learning_rate": 4.9411302955115853e-05, + "loss": 5.624, + "step": 11639 + }, + { + "epoch": 0.06922637739080788, + "grad_norm": 1.621315836906433, + "learning_rate": 4.941120218170877e-05, + "loss": 5.8243, + "step": 11640 + }, + { + "epoch": 0.06923232467408888, + "grad_norm": 2.0378596782684326, + "learning_rate": 4.941110139977998e-05, + "loss": 4.9275, + "step": 11641 + }, + { + "epoch": 0.06923827195736987, + "grad_norm": 1.8713582754135132, + "learning_rate": 4.941100060932954e-05, + "loss": 5.1218, + "step": 11642 + }, + { + "epoch": 0.06924421924065087, + "grad_norm": 1.878404140472412, + "learning_rate": 4.941089981035746e-05, + "loss": 5.4997, + "step": 11643 + }, + { + "epoch": 0.06925016652393187, + "grad_norm": 1.7230712175369263, + "learning_rate": 4.941079900286379e-05, + "loss": 5.5514, + "step": 11644 + }, + { + "epoch": 0.06925611380721286, + "grad_norm": 1.6272276639938354, + "learning_rate": 4.941069818684856e-05, + "loss": 5.7186, + "step": 11645 + }, + { + "epoch": 0.06926206109049386, + "grad_norm": 1.5610454082489014, + "learning_rate": 4.9410597362311814e-05, + "loss": 5.8929, + "step": 11646 + }, + { + "epoch": 0.06926800837377486, + "grad_norm": 1.7373837232589722, + "learning_rate": 4.941049652925358e-05, + "loss": 5.6428, + "step": 11647 + }, + { + "epoch": 0.06927395565705585, + "grad_norm": 1.9722628593444824, + "learning_rate": 4.9410395687673886e-05, + "loss": 5.9562, + "step": 11648 + }, + { + "epoch": 0.06927990294033685, + "grad_norm": 1.5603039264678955, + "learning_rate": 4.941029483757278e-05, + "loss": 6.031, + "step": 11649 + }, + { + "epoch": 0.06928585022361786, + "grad_norm": 1.6971800327301025, + "learning_rate": 4.941019397895029e-05, + "loss": 5.7527, + "step": 11650 + }, + { + "epoch": 0.06929179750689884, + "grad_norm": 1.9559118747711182, + "learning_rate": 4.9410093111806456e-05, + "loss": 5.0904, + "step": 11651 + }, + { + "epoch": 0.06929774479017985, + "grad_norm": 1.561122179031372, + "learning_rate": 4.9409992236141315e-05, + "loss": 5.7438, + "step": 11652 + }, + { + "epoch": 0.06930369207346085, + "grad_norm": 1.6071819067001343, + "learning_rate": 4.940989135195489e-05, + "loss": 5.8852, + "step": 11653 + }, + { + "epoch": 0.06930963935674184, + "grad_norm": 1.6804322004318237, + "learning_rate": 4.940979045924723e-05, + "loss": 5.7174, + "step": 11654 + }, + { + "epoch": 0.06931558664002284, + "grad_norm": 1.5802178382873535, + "learning_rate": 4.940968955801836e-05, + "loss": 5.8755, + "step": 11655 + }, + { + "epoch": 0.06932153392330384, + "grad_norm": 2.1002743244171143, + "learning_rate": 4.940958864826832e-05, + "loss": 5.6323, + "step": 11656 + }, + { + "epoch": 0.06932748120658483, + "grad_norm": 1.8874709606170654, + "learning_rate": 4.9409487729997144e-05, + "loss": 5.6798, + "step": 11657 + }, + { + "epoch": 0.06933342848986583, + "grad_norm": 1.6967203617095947, + "learning_rate": 4.940938680320487e-05, + "loss": 5.8461, + "step": 11658 + }, + { + "epoch": 0.06933937577314683, + "grad_norm": 1.9648679494857788, + "learning_rate": 4.9409285867891534e-05, + "loss": 5.842, + "step": 11659 + }, + { + "epoch": 0.06934532305642782, + "grad_norm": 1.8681408166885376, + "learning_rate": 4.940918492405716e-05, + "loss": 5.8859, + "step": 11660 + }, + { + "epoch": 0.06935127033970882, + "grad_norm": 2.0480551719665527, + "learning_rate": 4.9409083971701805e-05, + "loss": 5.6415, + "step": 11661 + }, + { + "epoch": 0.06935721762298983, + "grad_norm": 2.102832555770874, + "learning_rate": 4.940898301082548e-05, + "loss": 5.6163, + "step": 11662 + }, + { + "epoch": 0.06936316490627081, + "grad_norm": 1.7471407651901245, + "learning_rate": 4.940888204142824e-05, + "loss": 5.7973, + "step": 11663 + }, + { + "epoch": 0.06936911218955182, + "grad_norm": 1.9675641059875488, + "learning_rate": 4.94087810635101e-05, + "loss": 5.1125, + "step": 11664 + }, + { + "epoch": 0.0693750594728328, + "grad_norm": 1.6316107511520386, + "learning_rate": 4.940868007707111e-05, + "loss": 5.5067, + "step": 11665 + }, + { + "epoch": 0.0693810067561138, + "grad_norm": 1.8663619756698608, + "learning_rate": 4.940857908211131e-05, + "loss": 5.5552, + "step": 11666 + }, + { + "epoch": 0.06938695403939481, + "grad_norm": 2.155702590942383, + "learning_rate": 4.940847807863072e-05, + "loss": 6.0919, + "step": 11667 + }, + { + "epoch": 0.0693929013226758, + "grad_norm": 1.968467354774475, + "learning_rate": 4.9408377066629384e-05, + "loss": 5.8105, + "step": 11668 + }, + { + "epoch": 0.0693988486059568, + "grad_norm": 1.5245625972747803, + "learning_rate": 4.940827604610734e-05, + "loss": 5.8901, + "step": 11669 + }, + { + "epoch": 0.0694047958892378, + "grad_norm": 1.7377501726150513, + "learning_rate": 4.940817501706461e-05, + "loss": 5.5917, + "step": 11670 + }, + { + "epoch": 0.06941074317251879, + "grad_norm": 1.9668710231781006, + "learning_rate": 4.940807397950125e-05, + "loss": 5.6857, + "step": 11671 + }, + { + "epoch": 0.06941669045579979, + "grad_norm": 1.8168022632598877, + "learning_rate": 4.9407972933417266e-05, + "loss": 5.7032, + "step": 11672 + }, + { + "epoch": 0.06942263773908079, + "grad_norm": 2.4009077548980713, + "learning_rate": 4.940787187881273e-05, + "loss": 5.6767, + "step": 11673 + }, + { + "epoch": 0.06942858502236178, + "grad_norm": 1.8541746139526367, + "learning_rate": 4.940777081568765e-05, + "loss": 5.6327, + "step": 11674 + }, + { + "epoch": 0.06943453230564278, + "grad_norm": 2.028602361679077, + "learning_rate": 4.940766974404206e-05, + "loss": 5.0819, + "step": 11675 + }, + { + "epoch": 0.06944047958892378, + "grad_norm": 2.0870065689086914, + "learning_rate": 4.940756866387602e-05, + "loss": 5.1645, + "step": 11676 + }, + { + "epoch": 0.06944642687220477, + "grad_norm": 1.8009755611419678, + "learning_rate": 4.940746757518954e-05, + "loss": 4.9832, + "step": 11677 + }, + { + "epoch": 0.06945237415548577, + "grad_norm": 2.20975399017334, + "learning_rate": 4.9407366477982675e-05, + "loss": 4.9683, + "step": 11678 + }, + { + "epoch": 0.06945832143876678, + "grad_norm": 1.89133882522583, + "learning_rate": 4.940726537225544e-05, + "loss": 4.7736, + "step": 11679 + }, + { + "epoch": 0.06946426872204776, + "grad_norm": 1.7583657503128052, + "learning_rate": 4.940716425800789e-05, + "loss": 5.4275, + "step": 11680 + }, + { + "epoch": 0.06947021600532877, + "grad_norm": 2.1929352283477783, + "learning_rate": 4.940706313524004e-05, + "loss": 4.8441, + "step": 11681 + }, + { + "epoch": 0.06947616328860977, + "grad_norm": 2.1098999977111816, + "learning_rate": 4.940696200395194e-05, + "loss": 5.065, + "step": 11682 + }, + { + "epoch": 0.06948211057189076, + "grad_norm": 1.7651045322418213, + "learning_rate": 4.940686086414363e-05, + "loss": 5.7086, + "step": 11683 + }, + { + "epoch": 0.06948805785517176, + "grad_norm": 1.6675828695297241, + "learning_rate": 4.9406759715815134e-05, + "loss": 5.89, + "step": 11684 + }, + { + "epoch": 0.06949400513845276, + "grad_norm": 1.9754993915557861, + "learning_rate": 4.940665855896648e-05, + "loss": 5.7752, + "step": 11685 + }, + { + "epoch": 0.06949995242173375, + "grad_norm": 1.7652478218078613, + "learning_rate": 4.940655739359773e-05, + "loss": 5.6518, + "step": 11686 + }, + { + "epoch": 0.06950589970501475, + "grad_norm": 1.898997187614441, + "learning_rate": 4.940645621970889e-05, + "loss": 5.4579, + "step": 11687 + }, + { + "epoch": 0.06951184698829575, + "grad_norm": 2.1233060359954834, + "learning_rate": 4.940635503730001e-05, + "loss": 4.3979, + "step": 11688 + }, + { + "epoch": 0.06951779427157674, + "grad_norm": 2.0859549045562744, + "learning_rate": 4.940625384637113e-05, + "loss": 4.4309, + "step": 11689 + }, + { + "epoch": 0.06952374155485774, + "grad_norm": 2.051492929458618, + "learning_rate": 4.940615264692228e-05, + "loss": 4.4332, + "step": 11690 + }, + { + "epoch": 0.06952968883813875, + "grad_norm": 2.0359628200531006, + "learning_rate": 4.940605143895348e-05, + "loss": 4.29, + "step": 11691 + }, + { + "epoch": 0.06953563612141973, + "grad_norm": 2.0122604370117188, + "learning_rate": 4.940595022246479e-05, + "loss": 4.4391, + "step": 11692 + }, + { + "epoch": 0.06954158340470074, + "grad_norm": 2.059694290161133, + "learning_rate": 4.940584899745624e-05, + "loss": 4.3993, + "step": 11693 + }, + { + "epoch": 0.06954753068798172, + "grad_norm": 2.0355825424194336, + "learning_rate": 4.940574776392786e-05, + "loss": 4.2829, + "step": 11694 + }, + { + "epoch": 0.06955347797126273, + "grad_norm": 1.933385968208313, + "learning_rate": 4.940564652187967e-05, + "loss": 4.372, + "step": 11695 + }, + { + "epoch": 0.06955942525454373, + "grad_norm": 2.0848586559295654, + "learning_rate": 4.940554527131174e-05, + "loss": 4.3064, + "step": 11696 + }, + { + "epoch": 0.06956537253782472, + "grad_norm": 1.889845848083496, + "learning_rate": 4.940544401222407e-05, + "loss": 4.3811, + "step": 11697 + }, + { + "epoch": 0.06957131982110572, + "grad_norm": 2.0076160430908203, + "learning_rate": 4.9405342744616724e-05, + "loss": 4.3382, + "step": 11698 + }, + { + "epoch": 0.06957726710438672, + "grad_norm": 1.9708037376403809, + "learning_rate": 4.940524146848971e-05, + "loss": 4.4659, + "step": 11699 + }, + { + "epoch": 0.06958321438766771, + "grad_norm": 2.086454153060913, + "learning_rate": 4.940514018384309e-05, + "loss": 4.196, + "step": 11700 + }, + { + "epoch": 0.06958916167094871, + "grad_norm": 2.095062255859375, + "learning_rate": 4.940503889067689e-05, + "loss": 4.2062, + "step": 11701 + }, + { + "epoch": 0.06959510895422971, + "grad_norm": 2.0661754608154297, + "learning_rate": 4.940493758899114e-05, + "loss": 4.3468, + "step": 11702 + }, + { + "epoch": 0.0696010562375107, + "grad_norm": 2.073573350906372, + "learning_rate": 4.9404836278785875e-05, + "loss": 4.248, + "step": 11703 + }, + { + "epoch": 0.0696070035207917, + "grad_norm": 2.104018449783325, + "learning_rate": 4.940473496006114e-05, + "loss": 4.1523, + "step": 11704 + }, + { + "epoch": 0.0696129508040727, + "grad_norm": 2.067532777786255, + "learning_rate": 4.9404633632816954e-05, + "loss": 4.2721, + "step": 11705 + }, + { + "epoch": 0.06961889808735369, + "grad_norm": 2.036736249923706, + "learning_rate": 4.9404532297053376e-05, + "loss": 4.4057, + "step": 11706 + }, + { + "epoch": 0.0696248453706347, + "grad_norm": 1.9911088943481445, + "learning_rate": 4.940443095277042e-05, + "loss": 4.1875, + "step": 11707 + }, + { + "epoch": 0.0696307926539157, + "grad_norm": 2.017457962036133, + "learning_rate": 4.9404329599968124e-05, + "loss": 4.1506, + "step": 11708 + }, + { + "epoch": 0.06963673993719668, + "grad_norm": 1.8043596744537354, + "learning_rate": 4.940422823864654e-05, + "loss": 4.3937, + "step": 11709 + }, + { + "epoch": 0.06964268722047769, + "grad_norm": 2.0362250804901123, + "learning_rate": 4.9404126868805687e-05, + "loss": 3.8076, + "step": 11710 + }, + { + "epoch": 0.06964863450375869, + "grad_norm": 2.10723876953125, + "learning_rate": 4.940402549044561e-05, + "loss": 4.2487, + "step": 11711 + }, + { + "epoch": 0.06965458178703968, + "grad_norm": 2.1901967525482178, + "learning_rate": 4.940392410356632e-05, + "loss": 4.1183, + "step": 11712 + }, + { + "epoch": 0.06966052907032068, + "grad_norm": 2.196518659591675, + "learning_rate": 4.9403822708167896e-05, + "loss": 4.2959, + "step": 11713 + }, + { + "epoch": 0.06966647635360168, + "grad_norm": 2.1917595863342285, + "learning_rate": 4.940372130425034e-05, + "loss": 4.1011, + "step": 11714 + }, + { + "epoch": 0.06967242363688267, + "grad_norm": 2.14424991607666, + "learning_rate": 4.9403619891813696e-05, + "loss": 3.9033, + "step": 11715 + }, + { + "epoch": 0.06967837092016367, + "grad_norm": 1.9970608949661255, + "learning_rate": 4.9403518470858004e-05, + "loss": 3.9243, + "step": 11716 + }, + { + "epoch": 0.06968431820344467, + "grad_norm": 2.215721607208252, + "learning_rate": 4.9403417041383294e-05, + "loss": 4.0036, + "step": 11717 + }, + { + "epoch": 0.06969026548672566, + "grad_norm": 1.9153071641921997, + "learning_rate": 4.94033156033896e-05, + "loss": 5.6849, + "step": 11718 + }, + { + "epoch": 0.06969621277000666, + "grad_norm": 2.287951707839966, + "learning_rate": 4.9403214156876966e-05, + "loss": 4.3569, + "step": 11719 + }, + { + "epoch": 0.06970216005328767, + "grad_norm": 2.1257216930389404, + "learning_rate": 4.940311270184542e-05, + "loss": 4.1051, + "step": 11720 + }, + { + "epoch": 0.06970810733656865, + "grad_norm": 2.164879560470581, + "learning_rate": 4.9403011238295e-05, + "loss": 4.0754, + "step": 11721 + }, + { + "epoch": 0.06971405461984966, + "grad_norm": 2.2430567741394043, + "learning_rate": 4.940290976622574e-05, + "loss": 4.1251, + "step": 11722 + }, + { + "epoch": 0.06972000190313064, + "grad_norm": 2.2621891498565674, + "learning_rate": 4.940280828563768e-05, + "loss": 4.2302, + "step": 11723 + }, + { + "epoch": 0.06972594918641165, + "grad_norm": 2.0096445083618164, + "learning_rate": 4.940270679653085e-05, + "loss": 4.2853, + "step": 11724 + }, + { + "epoch": 0.06973189646969265, + "grad_norm": 2.211843729019165, + "learning_rate": 4.940260529890528e-05, + "loss": 3.6609, + "step": 11725 + }, + { + "epoch": 0.06973784375297364, + "grad_norm": 1.8500425815582275, + "learning_rate": 4.940250379276102e-05, + "loss": 3.8701, + "step": 11726 + }, + { + "epoch": 0.06974379103625464, + "grad_norm": 2.09136962890625, + "learning_rate": 4.94024022780981e-05, + "loss": 4.5569, + "step": 11727 + }, + { + "epoch": 0.06974973831953564, + "grad_norm": 1.9922528266906738, + "learning_rate": 4.940230075491655e-05, + "loss": 4.4055, + "step": 11728 + }, + { + "epoch": 0.06975568560281663, + "grad_norm": 2.253831624984741, + "learning_rate": 4.940219922321641e-05, + "loss": 4.114, + "step": 11729 + }, + { + "epoch": 0.06976163288609763, + "grad_norm": 2.0647006034851074, + "learning_rate": 4.94020976829977e-05, + "loss": 4.9004, + "step": 11730 + }, + { + "epoch": 0.06976758016937863, + "grad_norm": 2.5659384727478027, + "learning_rate": 4.940199613426049e-05, + "loss": 5.0852, + "step": 11731 + }, + { + "epoch": 0.06977352745265962, + "grad_norm": 2.227599859237671, + "learning_rate": 4.9401894577004796e-05, + "loss": 5.1603, + "step": 11732 + }, + { + "epoch": 0.06977947473594062, + "grad_norm": 1.8170785903930664, + "learning_rate": 4.940179301123063e-05, + "loss": 5.8334, + "step": 11733 + }, + { + "epoch": 0.06978542201922162, + "grad_norm": 2.1795544624328613, + "learning_rate": 4.940169143693807e-05, + "loss": 5.668, + "step": 11734 + }, + { + "epoch": 0.06979136930250261, + "grad_norm": 2.1248555183410645, + "learning_rate": 4.940158985412713e-05, + "loss": 5.7604, + "step": 11735 + }, + { + "epoch": 0.06979731658578361, + "grad_norm": 1.9677635431289673, + "learning_rate": 4.9401488262797845e-05, + "loss": 5.6568, + "step": 11736 + }, + { + "epoch": 0.06980326386906462, + "grad_norm": 1.9796242713928223, + "learning_rate": 4.940138666295025e-05, + "loss": 5.4303, + "step": 11737 + }, + { + "epoch": 0.0698092111523456, + "grad_norm": 1.7489395141601562, + "learning_rate": 4.9401285054584385e-05, + "loss": 6.1782, + "step": 11738 + }, + { + "epoch": 0.0698151584356266, + "grad_norm": 1.8067989349365234, + "learning_rate": 4.940118343770028e-05, + "loss": 6.0974, + "step": 11739 + }, + { + "epoch": 0.06982110571890761, + "grad_norm": 1.7377318143844604, + "learning_rate": 4.940108181229798e-05, + "loss": 5.8477, + "step": 11740 + }, + { + "epoch": 0.0698270530021886, + "grad_norm": 2.297499656677246, + "learning_rate": 4.940098017837751e-05, + "loss": 4.8027, + "step": 11741 + }, + { + "epoch": 0.0698330002854696, + "grad_norm": 1.7340888977050781, + "learning_rate": 4.940087853593891e-05, + "loss": 5.5897, + "step": 11742 + }, + { + "epoch": 0.0698389475687506, + "grad_norm": 2.019639730453491, + "learning_rate": 4.9400776884982216e-05, + "loss": 5.4493, + "step": 11743 + }, + { + "epoch": 0.06984489485203159, + "grad_norm": 1.7959356307983398, + "learning_rate": 4.9400675225507466e-05, + "loss": 5.5995, + "step": 11744 + }, + { + "epoch": 0.06985084213531259, + "grad_norm": 2.234757661819458, + "learning_rate": 4.940057355751468e-05, + "loss": 5.9542, + "step": 11745 + }, + { + "epoch": 0.06985678941859359, + "grad_norm": 2.047755241394043, + "learning_rate": 4.9400471881003925e-05, + "loss": 5.9125, + "step": 11746 + }, + { + "epoch": 0.06986273670187458, + "grad_norm": 1.9563192129135132, + "learning_rate": 4.940037019597521e-05, + "loss": 5.7298, + "step": 11747 + }, + { + "epoch": 0.06986868398515558, + "grad_norm": 2.7170934677124023, + "learning_rate": 4.940026850242857e-05, + "loss": 5.5172, + "step": 11748 + }, + { + "epoch": 0.06987463126843659, + "grad_norm": 2.326277494430542, + "learning_rate": 4.9400166800364056e-05, + "loss": 5.685, + "step": 11749 + }, + { + "epoch": 0.06988057855171757, + "grad_norm": 1.708383321762085, + "learning_rate": 4.94000650897817e-05, + "loss": 5.3879, + "step": 11750 + }, + { + "epoch": 0.06988652583499858, + "grad_norm": 1.897631049156189, + "learning_rate": 4.9399963370681527e-05, + "loss": 5.6856, + "step": 11751 + }, + { + "epoch": 0.06989247311827956, + "grad_norm": 2.227720260620117, + "learning_rate": 4.939986164306357e-05, + "loss": 5.4487, + "step": 11752 + }, + { + "epoch": 0.06989842040156057, + "grad_norm": 2.7821953296661377, + "learning_rate": 4.939975990692789e-05, + "loss": 5.7276, + "step": 11753 + }, + { + "epoch": 0.06990436768484157, + "grad_norm": 1.8389033079147339, + "learning_rate": 4.939965816227449e-05, + "loss": 5.6933, + "step": 11754 + }, + { + "epoch": 0.06991031496812256, + "grad_norm": 1.7653162479400635, + "learning_rate": 4.939955640910343e-05, + "loss": 5.6079, + "step": 11755 + }, + { + "epoch": 0.06991626225140356, + "grad_norm": 1.7504348754882812, + "learning_rate": 4.939945464741475e-05, + "loss": 6.0413, + "step": 11756 + }, + { + "epoch": 0.06992220953468456, + "grad_norm": 2.118326187133789, + "learning_rate": 4.939935287720845e-05, + "loss": 5.8937, + "step": 11757 + }, + { + "epoch": 0.06992815681796555, + "grad_norm": 1.9626812934875488, + "learning_rate": 4.93992510984846e-05, + "loss": 5.9564, + "step": 11758 + }, + { + "epoch": 0.06993410410124655, + "grad_norm": 1.9915722608566284, + "learning_rate": 4.939914931124322e-05, + "loss": 5.6851, + "step": 11759 + }, + { + "epoch": 0.06994005138452755, + "grad_norm": 1.7959195375442505, + "learning_rate": 4.939904751548435e-05, + "loss": 4.785, + "step": 11760 + }, + { + "epoch": 0.06994599866780854, + "grad_norm": 1.8472923040390015, + "learning_rate": 4.9398945711208025e-05, + "loss": 5.2683, + "step": 11761 + }, + { + "epoch": 0.06995194595108954, + "grad_norm": 1.4207996129989624, + "learning_rate": 4.9398843898414274e-05, + "loss": 5.5402, + "step": 11762 + }, + { + "epoch": 0.06995789323437054, + "grad_norm": 2.122070550918579, + "learning_rate": 4.9398742077103146e-05, + "loss": 5.5397, + "step": 11763 + }, + { + "epoch": 0.06996384051765153, + "grad_norm": 2.285970687866211, + "learning_rate": 4.939864024727467e-05, + "loss": 5.1401, + "step": 11764 + }, + { + "epoch": 0.06996978780093253, + "grad_norm": 2.1245667934417725, + "learning_rate": 4.9398538408928874e-05, + "loss": 5.2009, + "step": 11765 + }, + { + "epoch": 0.06997573508421354, + "grad_norm": 1.8151131868362427, + "learning_rate": 4.939843656206581e-05, + "loss": 4.8635, + "step": 11766 + }, + { + "epoch": 0.06998168236749452, + "grad_norm": 1.9139370918273926, + "learning_rate": 4.9398334706685494e-05, + "loss": 5.5998, + "step": 11767 + }, + { + "epoch": 0.06998762965077553, + "grad_norm": 1.6889853477478027, + "learning_rate": 4.9398232842787976e-05, + "loss": 5.6183, + "step": 11768 + }, + { + "epoch": 0.06999357693405653, + "grad_norm": 1.773409366607666, + "learning_rate": 4.939813097037329e-05, + "loss": 5.5083, + "step": 11769 + }, + { + "epoch": 0.06999952421733752, + "grad_norm": 2.195955991744995, + "learning_rate": 4.9398029089441465e-05, + "loss": 6.4436, + "step": 11770 + }, + { + "epoch": 0.07000547150061852, + "grad_norm": 2.058687448501587, + "learning_rate": 4.939792719999254e-05, + "loss": 6.2875, + "step": 11771 + }, + { + "epoch": 0.07001141878389952, + "grad_norm": 1.9074562788009644, + "learning_rate": 4.939782530202655e-05, + "loss": 5.8764, + "step": 11772 + }, + { + "epoch": 0.07001736606718051, + "grad_norm": 2.163663864135742, + "learning_rate": 4.9397723395543535e-05, + "loss": 5.4666, + "step": 11773 + }, + { + "epoch": 0.07002331335046151, + "grad_norm": 2.2188286781311035, + "learning_rate": 4.939762148054352e-05, + "loss": 6.0679, + "step": 11774 + }, + { + "epoch": 0.07002926063374251, + "grad_norm": 1.8202224969863892, + "learning_rate": 4.9397519557026553e-05, + "loss": 6.0465, + "step": 11775 + }, + { + "epoch": 0.0700352079170235, + "grad_norm": 1.9515994787216187, + "learning_rate": 4.939741762499266e-05, + "loss": 5.9634, + "step": 11776 + }, + { + "epoch": 0.0700411552003045, + "grad_norm": 1.772741675376892, + "learning_rate": 4.9397315684441886e-05, + "loss": 5.3117, + "step": 11777 + }, + { + "epoch": 0.0700471024835855, + "grad_norm": 1.7377926111221313, + "learning_rate": 4.9397213735374256e-05, + "loss": 5.7082, + "step": 11778 + }, + { + "epoch": 0.0700530497668665, + "grad_norm": 1.881205439567566, + "learning_rate": 4.939711177778982e-05, + "loss": 5.8463, + "step": 11779 + }, + { + "epoch": 0.0700589970501475, + "grad_norm": 1.893402099609375, + "learning_rate": 4.939700981168859e-05, + "loss": 5.8321, + "step": 11780 + }, + { + "epoch": 0.07006494433342848, + "grad_norm": 1.6830201148986816, + "learning_rate": 4.939690783707063e-05, + "loss": 5.8655, + "step": 11781 + }, + { + "epoch": 0.07007089161670949, + "grad_norm": 1.9164643287658691, + "learning_rate": 4.939680585393595e-05, + "loss": 5.7089, + "step": 11782 + }, + { + "epoch": 0.07007683889999049, + "grad_norm": 1.5564945936203003, + "learning_rate": 4.93967038622846e-05, + "loss": 5.8671, + "step": 11783 + }, + { + "epoch": 0.07008278618327148, + "grad_norm": 1.6557695865631104, + "learning_rate": 4.939660186211662e-05, + "loss": 5.7461, + "step": 11784 + }, + { + "epoch": 0.07008873346655248, + "grad_norm": 1.7161173820495605, + "learning_rate": 4.9396499853432035e-05, + "loss": 5.0569, + "step": 11785 + }, + { + "epoch": 0.07009468074983348, + "grad_norm": 1.6760550737380981, + "learning_rate": 4.939639783623088e-05, + "loss": 5.4683, + "step": 11786 + }, + { + "epoch": 0.07010062803311447, + "grad_norm": 1.818652629852295, + "learning_rate": 4.9396295810513196e-05, + "loss": 4.9676, + "step": 11787 + }, + { + "epoch": 0.07010657531639547, + "grad_norm": 2.016510009765625, + "learning_rate": 4.939619377627901e-05, + "loss": 5.255, + "step": 11788 + }, + { + "epoch": 0.07011252259967647, + "grad_norm": 2.1893560886383057, + "learning_rate": 4.939609173352838e-05, + "loss": 5.0798, + "step": 11789 + }, + { + "epoch": 0.07011846988295746, + "grad_norm": 1.8063241243362427, + "learning_rate": 4.939598968226132e-05, + "loss": 5.049, + "step": 11790 + }, + { + "epoch": 0.07012441716623846, + "grad_norm": 1.7766486406326294, + "learning_rate": 4.939588762247786e-05, + "loss": 4.8375, + "step": 11791 + }, + { + "epoch": 0.07013036444951946, + "grad_norm": 1.6848721504211426, + "learning_rate": 4.9395785554178066e-05, + "loss": 4.7944, + "step": 11792 + }, + { + "epoch": 0.07013631173280045, + "grad_norm": 1.5173190832138062, + "learning_rate": 4.939568347736195e-05, + "loss": 4.8558, + "step": 11793 + }, + { + "epoch": 0.07014225901608145, + "grad_norm": 1.9625753164291382, + "learning_rate": 4.939558139202955e-05, + "loss": 5.0129, + "step": 11794 + }, + { + "epoch": 0.07014820629936246, + "grad_norm": 2.1610453128814697, + "learning_rate": 4.93954792981809e-05, + "loss": 5.7208, + "step": 11795 + }, + { + "epoch": 0.07015415358264344, + "grad_norm": 2.272775411605835, + "learning_rate": 4.939537719581605e-05, + "loss": 5.3673, + "step": 11796 + }, + { + "epoch": 0.07016010086592445, + "grad_norm": 1.8652429580688477, + "learning_rate": 4.9395275084935025e-05, + "loss": 5.7692, + "step": 11797 + }, + { + "epoch": 0.07016604814920545, + "grad_norm": 1.6594206094741821, + "learning_rate": 4.939517296553786e-05, + "loss": 5.7201, + "step": 11798 + }, + { + "epoch": 0.07017199543248644, + "grad_norm": 1.7499476671218872, + "learning_rate": 4.939507083762459e-05, + "loss": 5.6471, + "step": 11799 + }, + { + "epoch": 0.07017794271576744, + "grad_norm": 2.050825834274292, + "learning_rate": 4.939496870119525e-05, + "loss": 5.4805, + "step": 11800 + }, + { + "epoch": 0.07018388999904844, + "grad_norm": 2.033815383911133, + "learning_rate": 4.939486655624988e-05, + "loss": 5.7465, + "step": 11801 + }, + { + "epoch": 0.07018983728232943, + "grad_norm": 1.7499231100082397, + "learning_rate": 4.939476440278852e-05, + "loss": 5.0271, + "step": 11802 + }, + { + "epoch": 0.07019578456561043, + "grad_norm": 2.331024646759033, + "learning_rate": 4.939466224081119e-05, + "loss": 5.0491, + "step": 11803 + }, + { + "epoch": 0.07020173184889143, + "grad_norm": 2.089859962463379, + "learning_rate": 4.939456007031794e-05, + "loss": 5.6678, + "step": 11804 + }, + { + "epoch": 0.07020767913217242, + "grad_norm": 2.0704381465911865, + "learning_rate": 4.93944578913088e-05, + "loss": 5.5128, + "step": 11805 + }, + { + "epoch": 0.07021362641545342, + "grad_norm": 2.3215534687042236, + "learning_rate": 4.939435570378381e-05, + "loss": 4.8886, + "step": 11806 + }, + { + "epoch": 0.07021957369873442, + "grad_norm": 2.2506353855133057, + "learning_rate": 4.9394253507743004e-05, + "loss": 4.8606, + "step": 11807 + }, + { + "epoch": 0.07022552098201541, + "grad_norm": 1.9065401554107666, + "learning_rate": 4.939415130318641e-05, + "loss": 5.4306, + "step": 11808 + }, + { + "epoch": 0.07023146826529642, + "grad_norm": 1.9229549169540405, + "learning_rate": 4.9394049090114076e-05, + "loss": 5.5586, + "step": 11809 + }, + { + "epoch": 0.0702374155485774, + "grad_norm": 1.857392430305481, + "learning_rate": 4.939394686852603e-05, + "loss": 5.382, + "step": 11810 + }, + { + "epoch": 0.0702433628318584, + "grad_norm": 2.0430874824523926, + "learning_rate": 4.939384463842231e-05, + "loss": 5.4362, + "step": 11811 + }, + { + "epoch": 0.07024931011513941, + "grad_norm": 1.839227318763733, + "learning_rate": 4.939374239980294e-05, + "loss": 5.0285, + "step": 11812 + }, + { + "epoch": 0.0702552573984204, + "grad_norm": 1.9690957069396973, + "learning_rate": 4.939364015266798e-05, + "loss": 5.5512, + "step": 11813 + }, + { + "epoch": 0.0702612046817014, + "grad_norm": 1.819841980934143, + "learning_rate": 4.939353789701745e-05, + "loss": 5.4886, + "step": 11814 + }, + { + "epoch": 0.0702671519649824, + "grad_norm": 1.7670280933380127, + "learning_rate": 4.939343563285138e-05, + "loss": 5.0925, + "step": 11815 + }, + { + "epoch": 0.07027309924826339, + "grad_norm": 1.478452444076538, + "learning_rate": 4.9393333360169824e-05, + "loss": 5.6562, + "step": 11816 + }, + { + "epoch": 0.07027904653154439, + "grad_norm": 1.7796739339828491, + "learning_rate": 4.93932310789728e-05, + "loss": 5.7462, + "step": 11817 + }, + { + "epoch": 0.07028499381482539, + "grad_norm": 1.425431728363037, + "learning_rate": 4.939312878926036e-05, + "loss": 5.6002, + "step": 11818 + }, + { + "epoch": 0.07029094109810638, + "grad_norm": 1.7066885232925415, + "learning_rate": 4.939302649103252e-05, + "loss": 5.3827, + "step": 11819 + }, + { + "epoch": 0.07029688838138738, + "grad_norm": 1.5144743919372559, + "learning_rate": 4.939292418428933e-05, + "loss": 5.094, + "step": 11820 + }, + { + "epoch": 0.07030283566466838, + "grad_norm": 1.5426355600357056, + "learning_rate": 4.939282186903082e-05, + "loss": 5.4808, + "step": 11821 + }, + { + "epoch": 0.07030878294794937, + "grad_norm": 1.5655393600463867, + "learning_rate": 4.9392719545257034e-05, + "loss": 5.5422, + "step": 11822 + }, + { + "epoch": 0.07031473023123037, + "grad_norm": 1.2810043096542358, + "learning_rate": 4.9392617212967995e-05, + "loss": 5.5069, + "step": 11823 + }, + { + "epoch": 0.07032067751451138, + "grad_norm": 1.534588098526001, + "learning_rate": 4.9392514872163754e-05, + "loss": 5.4887, + "step": 11824 + }, + { + "epoch": 0.07032662479779236, + "grad_norm": 1.6692357063293457, + "learning_rate": 4.9392412522844325e-05, + "loss": 5.4235, + "step": 11825 + }, + { + "epoch": 0.07033257208107337, + "grad_norm": 2.1246654987335205, + "learning_rate": 4.939231016500977e-05, + "loss": 5.4533, + "step": 11826 + }, + { + "epoch": 0.07033851936435437, + "grad_norm": 2.0235774517059326, + "learning_rate": 4.9392207798660106e-05, + "loss": 5.0393, + "step": 11827 + }, + { + "epoch": 0.07034446664763536, + "grad_norm": 1.7843154668807983, + "learning_rate": 4.939210542379537e-05, + "loss": 5.2501, + "step": 11828 + }, + { + "epoch": 0.07035041393091636, + "grad_norm": 2.1056478023529053, + "learning_rate": 4.939200304041561e-05, + "loss": 5.7809, + "step": 11829 + }, + { + "epoch": 0.07035636121419736, + "grad_norm": 2.0902159214019775, + "learning_rate": 4.939190064852085e-05, + "loss": 5.591, + "step": 11830 + }, + { + "epoch": 0.07036230849747835, + "grad_norm": 2.3349802494049072, + "learning_rate": 4.9391798248111134e-05, + "loss": 4.7641, + "step": 11831 + }, + { + "epoch": 0.07036825578075935, + "grad_norm": 1.6848636865615845, + "learning_rate": 4.939169583918648e-05, + "loss": 5.5082, + "step": 11832 + }, + { + "epoch": 0.07037420306404035, + "grad_norm": 1.958947777748108, + "learning_rate": 4.939159342174695e-05, + "loss": 5.433, + "step": 11833 + }, + { + "epoch": 0.07038015034732134, + "grad_norm": 1.7382566928863525, + "learning_rate": 4.939149099579256e-05, + "loss": 5.5014, + "step": 11834 + }, + { + "epoch": 0.07038609763060234, + "grad_norm": 2.469529867172241, + "learning_rate": 4.939138856132336e-05, + "loss": 4.6383, + "step": 11835 + }, + { + "epoch": 0.07039204491388334, + "grad_norm": 2.127711057662964, + "learning_rate": 4.939128611833937e-05, + "loss": 5.6088, + "step": 11836 + }, + { + "epoch": 0.07039799219716433, + "grad_norm": 2.252210855484009, + "learning_rate": 4.9391183666840636e-05, + "loss": 5.027, + "step": 11837 + }, + { + "epoch": 0.07040393948044534, + "grad_norm": 1.990277647972107, + "learning_rate": 4.9391081206827194e-05, + "loss": 5.6389, + "step": 11838 + }, + { + "epoch": 0.07040988676372632, + "grad_norm": 2.170099973678589, + "learning_rate": 4.939097873829908e-05, + "loss": 5.5588, + "step": 11839 + }, + { + "epoch": 0.07041583404700733, + "grad_norm": 2.4616951942443848, + "learning_rate": 4.939087626125632e-05, + "loss": 5.6505, + "step": 11840 + }, + { + "epoch": 0.07042178133028833, + "grad_norm": 1.9600075483322144, + "learning_rate": 4.9390773775698964e-05, + "loss": 5.1086, + "step": 11841 + }, + { + "epoch": 0.07042772861356932, + "grad_norm": 2.173632860183716, + "learning_rate": 4.939067128162703e-05, + "loss": 5.8069, + "step": 11842 + }, + { + "epoch": 0.07043367589685032, + "grad_norm": 1.9921432733535767, + "learning_rate": 4.939056877904058e-05, + "loss": 5.3222, + "step": 11843 + }, + { + "epoch": 0.07043962318013132, + "grad_norm": 2.1605379581451416, + "learning_rate": 4.939046626793962e-05, + "loss": 5.1565, + "step": 11844 + }, + { + "epoch": 0.07044557046341231, + "grad_norm": 2.2240231037139893, + "learning_rate": 4.9390363748324206e-05, + "loss": 5.3633, + "step": 11845 + }, + { + "epoch": 0.07045151774669331, + "grad_norm": 2.1935648918151855, + "learning_rate": 4.9390261220194374e-05, + "loss": 5.3715, + "step": 11846 + }, + { + "epoch": 0.07045746502997431, + "grad_norm": 2.3079628944396973, + "learning_rate": 4.9390158683550146e-05, + "loss": 5.4728, + "step": 11847 + }, + { + "epoch": 0.0704634123132553, + "grad_norm": 2.1652259826660156, + "learning_rate": 4.939005613839157e-05, + "loss": 5.276, + "step": 11848 + }, + { + "epoch": 0.0704693595965363, + "grad_norm": 1.75044846534729, + "learning_rate": 4.938995358471867e-05, + "loss": 5.3, + "step": 11849 + }, + { + "epoch": 0.0704753068798173, + "grad_norm": 2.11893892288208, + "learning_rate": 4.93898510225315e-05, + "loss": 5.3949, + "step": 11850 + }, + { + "epoch": 0.07048125416309829, + "grad_norm": 1.8546398878097534, + "learning_rate": 4.938974845183008e-05, + "loss": 5.3606, + "step": 11851 + }, + { + "epoch": 0.0704872014463793, + "grad_norm": 2.2334201335906982, + "learning_rate": 4.9389645872614456e-05, + "loss": 5.1987, + "step": 11852 + }, + { + "epoch": 0.0704931487296603, + "grad_norm": 2.0545856952667236, + "learning_rate": 4.938954328488465e-05, + "loss": 5.2742, + "step": 11853 + }, + { + "epoch": 0.07049909601294128, + "grad_norm": 2.011322498321533, + "learning_rate": 4.938944068864071e-05, + "loss": 5.3738, + "step": 11854 + }, + { + "epoch": 0.07050504329622229, + "grad_norm": 1.6539164781570435, + "learning_rate": 4.9389338083882664e-05, + "loss": 5.1915, + "step": 11855 + }, + { + "epoch": 0.07051099057950329, + "grad_norm": 1.9423818588256836, + "learning_rate": 4.9389235470610564e-05, + "loss": 5.4432, + "step": 11856 + }, + { + "epoch": 0.07051693786278428, + "grad_norm": 1.9459011554718018, + "learning_rate": 4.938913284882442e-05, + "loss": 5.2929, + "step": 11857 + }, + { + "epoch": 0.07052288514606528, + "grad_norm": 2.0341713428497314, + "learning_rate": 4.938903021852429e-05, + "loss": 5.1413, + "step": 11858 + }, + { + "epoch": 0.07052883242934628, + "grad_norm": 2.1413371562957764, + "learning_rate": 4.93889275797102e-05, + "loss": 5.0283, + "step": 11859 + }, + { + "epoch": 0.07053477971262727, + "grad_norm": 1.9965273141860962, + "learning_rate": 4.9388824932382185e-05, + "loss": 5.0919, + "step": 11860 + }, + { + "epoch": 0.07054072699590827, + "grad_norm": 1.9912536144256592, + "learning_rate": 4.938872227654028e-05, + "loss": 4.72, + "step": 11861 + }, + { + "epoch": 0.07054667427918927, + "grad_norm": 2.267775058746338, + "learning_rate": 4.9388619612184533e-05, + "loss": 5.3942, + "step": 11862 + }, + { + "epoch": 0.07055262156247026, + "grad_norm": 2.0529544353485107, + "learning_rate": 4.9388516939314965e-05, + "loss": 5.504, + "step": 11863 + }, + { + "epoch": 0.07055856884575126, + "grad_norm": 2.124903678894043, + "learning_rate": 4.938841425793162e-05, + "loss": 5.3684, + "step": 11864 + }, + { + "epoch": 0.07056451612903226, + "grad_norm": 2.2070152759552, + "learning_rate": 4.938831156803453e-05, + "loss": 5.1349, + "step": 11865 + }, + { + "epoch": 0.07057046341231325, + "grad_norm": 1.717877745628357, + "learning_rate": 4.9388208869623734e-05, + "loss": 5.2605, + "step": 11866 + }, + { + "epoch": 0.07057641069559425, + "grad_norm": 2.258847951889038, + "learning_rate": 4.9388106162699266e-05, + "loss": 4.9048, + "step": 11867 + }, + { + "epoch": 0.07058235797887524, + "grad_norm": 2.065905809402466, + "learning_rate": 4.938800344726117e-05, + "loss": 5.0523, + "step": 11868 + }, + { + "epoch": 0.07058830526215625, + "grad_norm": 2.13053035736084, + "learning_rate": 4.9387900723309455e-05, + "loss": 5.1551, + "step": 11869 + }, + { + "epoch": 0.07059425254543725, + "grad_norm": 2.0323257446289062, + "learning_rate": 4.938779799084419e-05, + "loss": 5.0807, + "step": 11870 + }, + { + "epoch": 0.07060019982871824, + "grad_norm": 2.0503158569335938, + "learning_rate": 4.9387695249865396e-05, + "loss": 5.1946, + "step": 11871 + }, + { + "epoch": 0.07060614711199924, + "grad_norm": 2.069227933883667, + "learning_rate": 4.9387592500373105e-05, + "loss": 5.0027, + "step": 11872 + }, + { + "epoch": 0.07061209439528024, + "grad_norm": 2.0208382606506348, + "learning_rate": 4.9387489742367354e-05, + "loss": 5.0877, + "step": 11873 + }, + { + "epoch": 0.07061804167856123, + "grad_norm": 2.0159859657287598, + "learning_rate": 4.9387386975848196e-05, + "loss": 4.864, + "step": 11874 + }, + { + "epoch": 0.07062398896184223, + "grad_norm": 1.9365311861038208, + "learning_rate": 4.9387284200815645e-05, + "loss": 4.7373, + "step": 11875 + }, + { + "epoch": 0.07062993624512323, + "grad_norm": 2.1024274826049805, + "learning_rate": 4.9387181417269736e-05, + "loss": 5.0155, + "step": 11876 + }, + { + "epoch": 0.07063588352840422, + "grad_norm": 2.5438032150268555, + "learning_rate": 4.938707862521052e-05, + "loss": 5.3267, + "step": 11877 + }, + { + "epoch": 0.07064183081168522, + "grad_norm": 2.129715919494629, + "learning_rate": 4.938697582463804e-05, + "loss": 5.104, + "step": 11878 + }, + { + "epoch": 0.07064777809496622, + "grad_norm": 2.237442970275879, + "learning_rate": 4.9386873015552303e-05, + "loss": 5.134, + "step": 11879 + }, + { + "epoch": 0.07065372537824721, + "grad_norm": 2.2773404121398926, + "learning_rate": 4.9386770197953366e-05, + "loss": 5.269, + "step": 11880 + }, + { + "epoch": 0.07065967266152821, + "grad_norm": 2.0882620811462402, + "learning_rate": 4.938666737184125e-05, + "loss": 4.8091, + "step": 11881 + }, + { + "epoch": 0.07066561994480922, + "grad_norm": 2.0649476051330566, + "learning_rate": 4.938656453721602e-05, + "loss": 4.9143, + "step": 11882 + }, + { + "epoch": 0.0706715672280902, + "grad_norm": 2.19030499458313, + "learning_rate": 4.938646169407768e-05, + "loss": 4.7439, + "step": 11883 + }, + { + "epoch": 0.0706775145113712, + "grad_norm": 2.8669347763061523, + "learning_rate": 4.938635884242628e-05, + "loss": 4.3684, + "step": 11884 + }, + { + "epoch": 0.07068346179465221, + "grad_norm": 2.3018336296081543, + "learning_rate": 4.9386255982261854e-05, + "loss": 4.8602, + "step": 11885 + }, + { + "epoch": 0.0706894090779332, + "grad_norm": 2.7775471210479736, + "learning_rate": 4.938615311358443e-05, + "loss": 5.2401, + "step": 11886 + }, + { + "epoch": 0.0706953563612142, + "grad_norm": 2.1075756549835205, + "learning_rate": 4.938605023639406e-05, + "loss": 5.1085, + "step": 11887 + }, + { + "epoch": 0.0707013036444952, + "grad_norm": 2.456530809402466, + "learning_rate": 4.9385947350690776e-05, + "loss": 5.0506, + "step": 11888 + }, + { + "epoch": 0.07070725092777619, + "grad_norm": 1.76799738407135, + "learning_rate": 4.9385844456474605e-05, + "loss": 4.8233, + "step": 11889 + }, + { + "epoch": 0.07071319821105719, + "grad_norm": 2.0819127559661865, + "learning_rate": 4.938574155374559e-05, + "loss": 4.4198, + "step": 11890 + }, + { + "epoch": 0.07071914549433819, + "grad_norm": 2.221586227416992, + "learning_rate": 4.9385638642503765e-05, + "loss": 4.2423, + "step": 11891 + }, + { + "epoch": 0.07072509277761918, + "grad_norm": 2.108182668685913, + "learning_rate": 4.938553572274916e-05, + "loss": 4.2564, + "step": 11892 + }, + { + "epoch": 0.07073104006090018, + "grad_norm": 1.9631624221801758, + "learning_rate": 4.938543279448182e-05, + "loss": 4.1641, + "step": 11893 + }, + { + "epoch": 0.07073698734418118, + "grad_norm": 1.9730273485183716, + "learning_rate": 4.938532985770178e-05, + "loss": 4.0728, + "step": 11894 + }, + { + "epoch": 0.07074293462746217, + "grad_norm": 1.9632551670074463, + "learning_rate": 4.9385226912409065e-05, + "loss": 4.2014, + "step": 11895 + }, + { + "epoch": 0.07074888191074317, + "grad_norm": 1.9986671209335327, + "learning_rate": 4.9385123958603726e-05, + "loss": 4.0299, + "step": 11896 + }, + { + "epoch": 0.07075482919402416, + "grad_norm": 2.2256031036376953, + "learning_rate": 4.9385020996285794e-05, + "loss": 4.1397, + "step": 11897 + }, + { + "epoch": 0.07076077647730517, + "grad_norm": 2.231462001800537, + "learning_rate": 4.9384918025455296e-05, + "loss": 4.0977, + "step": 11898 + }, + { + "epoch": 0.07076672376058617, + "grad_norm": 2.0946438312530518, + "learning_rate": 4.938481504611227e-05, + "loss": 3.9446, + "step": 11899 + }, + { + "epoch": 0.07077267104386716, + "grad_norm": 1.6953986883163452, + "learning_rate": 4.938471205825677e-05, + "loss": 4.6809, + "step": 11900 + }, + { + "epoch": 0.07077861832714816, + "grad_norm": 2.1963350772857666, + "learning_rate": 4.938460906188882e-05, + "loss": 4.3626, + "step": 11901 + }, + { + "epoch": 0.07078456561042916, + "grad_norm": 2.2069251537323, + "learning_rate": 4.938450605700845e-05, + "loss": 4.1057, + "step": 11902 + }, + { + "epoch": 0.07079051289371015, + "grad_norm": 2.1809592247009277, + "learning_rate": 4.9384403043615694e-05, + "loss": 3.5619, + "step": 11903 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 2.305171012878418, + "learning_rate": 4.938430002171061e-05, + "loss": 5.8033, + "step": 11904 + }, + { + "epoch": 0.07080240746027215, + "grad_norm": 2.1984407901763916, + "learning_rate": 4.9384196991293205e-05, + "loss": 3.5869, + "step": 11905 + }, + { + "epoch": 0.07080835474355314, + "grad_norm": 1.8870881795883179, + "learning_rate": 4.938409395236353e-05, + "loss": 4.8027, + "step": 11906 + }, + { + "epoch": 0.07081430202683414, + "grad_norm": 2.11314058303833, + "learning_rate": 4.938399090492163e-05, + "loss": 4.1942, + "step": 11907 + }, + { + "epoch": 0.07082024931011514, + "grad_norm": 2.143794298171997, + "learning_rate": 4.938388784896752e-05, + "loss": 3.8526, + "step": 11908 + }, + { + "epoch": 0.07082619659339613, + "grad_norm": 2.4311232566833496, + "learning_rate": 4.938378478450125e-05, + "loss": 3.8572, + "step": 11909 + }, + { + "epoch": 0.07083214387667713, + "grad_norm": 2.0959818363189697, + "learning_rate": 4.9383681711522855e-05, + "loss": 4.3465, + "step": 11910 + }, + { + "epoch": 0.07083809115995814, + "grad_norm": 1.9161559343338013, + "learning_rate": 4.938357863003237e-05, + "loss": 5.5608, + "step": 11911 + }, + { + "epoch": 0.07084403844323912, + "grad_norm": 1.8549482822418213, + "learning_rate": 4.9383475540029824e-05, + "loss": 5.9874, + "step": 11912 + }, + { + "epoch": 0.07084998572652013, + "grad_norm": 1.8600444793701172, + "learning_rate": 4.9383372441515255e-05, + "loss": 6.0579, + "step": 11913 + }, + { + "epoch": 0.07085593300980113, + "grad_norm": 1.6985594034194946, + "learning_rate": 4.938326933448871e-05, + "loss": 5.7963, + "step": 11914 + }, + { + "epoch": 0.07086188029308212, + "grad_norm": 2.06860613822937, + "learning_rate": 4.9383166218950216e-05, + "loss": 5.4789, + "step": 11915 + }, + { + "epoch": 0.07086782757636312, + "grad_norm": 2.8111190795898438, + "learning_rate": 4.938306309489982e-05, + "loss": 5.2546, + "step": 11916 + }, + { + "epoch": 0.07087377485964412, + "grad_norm": 2.700589895248413, + "learning_rate": 4.9382959962337536e-05, + "loss": 5.2021, + "step": 11917 + }, + { + "epoch": 0.07087972214292511, + "grad_norm": 2.364793539047241, + "learning_rate": 4.938285682126341e-05, + "loss": 4.9508, + "step": 11918 + }, + { + "epoch": 0.07088566942620611, + "grad_norm": 2.4212446212768555, + "learning_rate": 4.938275367167749e-05, + "loss": 5.1269, + "step": 11919 + }, + { + "epoch": 0.07089161670948711, + "grad_norm": 1.785733699798584, + "learning_rate": 4.93826505135798e-05, + "loss": 5.7357, + "step": 11920 + }, + { + "epoch": 0.0708975639927681, + "grad_norm": 1.6912823915481567, + "learning_rate": 4.9382547346970376e-05, + "loss": 5.4003, + "step": 11921 + }, + { + "epoch": 0.0709035112760491, + "grad_norm": 1.8408714532852173, + "learning_rate": 4.938244417184926e-05, + "loss": 5.3169, + "step": 11922 + }, + { + "epoch": 0.0709094585593301, + "grad_norm": 2.3245468139648438, + "learning_rate": 4.938234098821648e-05, + "loss": 4.9588, + "step": 11923 + }, + { + "epoch": 0.07091540584261109, + "grad_norm": 1.922179102897644, + "learning_rate": 4.938223779607208e-05, + "loss": 5.431, + "step": 11924 + }, + { + "epoch": 0.0709213531258921, + "grad_norm": 1.8331208229064941, + "learning_rate": 4.9382134595416094e-05, + "loss": 5.9121, + "step": 11925 + }, + { + "epoch": 0.07092730040917308, + "grad_norm": 2.15932297706604, + "learning_rate": 4.9382031386248556e-05, + "loss": 5.058, + "step": 11926 + }, + { + "epoch": 0.07093324769245409, + "grad_norm": 2.2255606651306152, + "learning_rate": 4.93819281685695e-05, + "loss": 4.9215, + "step": 11927 + }, + { + "epoch": 0.07093919497573509, + "grad_norm": 2.3665359020233154, + "learning_rate": 4.938182494237897e-05, + "loss": 4.8405, + "step": 11928 + }, + { + "epoch": 0.07094514225901608, + "grad_norm": 2.1564438343048096, + "learning_rate": 4.938172170767699e-05, + "loss": 4.9598, + "step": 11929 + }, + { + "epoch": 0.07095108954229708, + "grad_norm": 2.2083945274353027, + "learning_rate": 4.938161846446361e-05, + "loss": 4.8603, + "step": 11930 + }, + { + "epoch": 0.07095703682557808, + "grad_norm": 2.3422255516052246, + "learning_rate": 4.938151521273885e-05, + "loss": 4.8926, + "step": 11931 + }, + { + "epoch": 0.07096298410885907, + "grad_norm": 2.5269415378570557, + "learning_rate": 4.9381411952502764e-05, + "loss": 4.876, + "step": 11932 + }, + { + "epoch": 0.07096893139214007, + "grad_norm": 2.1761882305145264, + "learning_rate": 4.9381308683755376e-05, + "loss": 4.7533, + "step": 11933 + }, + { + "epoch": 0.07097487867542107, + "grad_norm": 2.078146457672119, + "learning_rate": 4.938120540649672e-05, + "loss": 4.9606, + "step": 11934 + }, + { + "epoch": 0.07098082595870206, + "grad_norm": 2.3086254596710205, + "learning_rate": 4.9381102120726846e-05, + "loss": 4.7763, + "step": 11935 + }, + { + "epoch": 0.07098677324198306, + "grad_norm": 1.8531124591827393, + "learning_rate": 4.938099882644578e-05, + "loss": 5.0218, + "step": 11936 + }, + { + "epoch": 0.07099272052526406, + "grad_norm": 2.2169790267944336, + "learning_rate": 4.938089552365355e-05, + "loss": 6.0072, + "step": 11937 + }, + { + "epoch": 0.07099866780854505, + "grad_norm": 1.8759880065917969, + "learning_rate": 4.938079221235021e-05, + "loss": 5.8259, + "step": 11938 + }, + { + "epoch": 0.07100461509182605, + "grad_norm": 2.026217222213745, + "learning_rate": 4.938068889253579e-05, + "loss": 5.4426, + "step": 11939 + }, + { + "epoch": 0.07101056237510706, + "grad_norm": 2.5047786235809326, + "learning_rate": 4.938058556421031e-05, + "loss": 4.7276, + "step": 11940 + }, + { + "epoch": 0.07101650965838804, + "grad_norm": 2.243281602859497, + "learning_rate": 4.938048222737383e-05, + "loss": 4.9284, + "step": 11941 + }, + { + "epoch": 0.07102245694166905, + "grad_norm": 1.989563226699829, + "learning_rate": 4.938037888202637e-05, + "loss": 5.7744, + "step": 11942 + }, + { + "epoch": 0.07102840422495005, + "grad_norm": 1.829290509223938, + "learning_rate": 4.9380275528167974e-05, + "loss": 5.6942, + "step": 11943 + }, + { + "epoch": 0.07103435150823104, + "grad_norm": 1.8001593351364136, + "learning_rate": 4.938017216579868e-05, + "loss": 5.6928, + "step": 11944 + }, + { + "epoch": 0.07104029879151204, + "grad_norm": 1.7705434560775757, + "learning_rate": 4.938006879491851e-05, + "loss": 5.6954, + "step": 11945 + }, + { + "epoch": 0.07104624607479304, + "grad_norm": 1.8746812343597412, + "learning_rate": 4.937996541552752e-05, + "loss": 5.7184, + "step": 11946 + }, + { + "epoch": 0.07105219335807403, + "grad_norm": 1.6931661367416382, + "learning_rate": 4.937986202762573e-05, + "loss": 5.398, + "step": 11947 + }, + { + "epoch": 0.07105814064135503, + "grad_norm": 2.0784003734588623, + "learning_rate": 4.937975863121318e-05, + "loss": 5.7164, + "step": 11948 + }, + { + "epoch": 0.07106408792463603, + "grad_norm": 1.8495618104934692, + "learning_rate": 4.937965522628991e-05, + "loss": 5.7093, + "step": 11949 + }, + { + "epoch": 0.07107003520791702, + "grad_norm": 1.7720533609390259, + "learning_rate": 4.9379551812855964e-05, + "loss": 5.7548, + "step": 11950 + }, + { + "epoch": 0.07107598249119802, + "grad_norm": 1.721205472946167, + "learning_rate": 4.937944839091135e-05, + "loss": 5.7496, + "step": 11951 + }, + { + "epoch": 0.07108192977447902, + "grad_norm": 1.896657109260559, + "learning_rate": 4.9379344960456145e-05, + "loss": 5.5989, + "step": 11952 + }, + { + "epoch": 0.07108787705776001, + "grad_norm": 1.4022153615951538, + "learning_rate": 4.9379241521490344e-05, + "loss": 5.5029, + "step": 11953 + }, + { + "epoch": 0.07109382434104101, + "grad_norm": 1.9068467617034912, + "learning_rate": 4.937913807401401e-05, + "loss": 5.6915, + "step": 11954 + }, + { + "epoch": 0.071099771624322, + "grad_norm": 1.6542187929153442, + "learning_rate": 4.9379034618027164e-05, + "loss": 5.6409, + "step": 11955 + }, + { + "epoch": 0.071105718907603, + "grad_norm": 1.5280201435089111, + "learning_rate": 4.937893115352986e-05, + "loss": 5.6264, + "step": 11956 + }, + { + "epoch": 0.07111166619088401, + "grad_norm": 1.767232060432434, + "learning_rate": 4.937882768052211e-05, + "loss": 5.4562, + "step": 11957 + }, + { + "epoch": 0.071117613474165, + "grad_norm": 1.571892261505127, + "learning_rate": 4.9378724199003975e-05, + "loss": 5.7949, + "step": 11958 + }, + { + "epoch": 0.071123560757446, + "grad_norm": 1.9400190114974976, + "learning_rate": 4.937862070897548e-05, + "loss": 5.5872, + "step": 11959 + }, + { + "epoch": 0.071129508040727, + "grad_norm": 1.7246766090393066, + "learning_rate": 4.937851721043665e-05, + "loss": 5.8455, + "step": 11960 + }, + { + "epoch": 0.07113545532400799, + "grad_norm": 1.937168002128601, + "learning_rate": 4.9378413703387534e-05, + "loss": 5.0864, + "step": 11961 + }, + { + "epoch": 0.07114140260728899, + "grad_norm": 2.3808209896087646, + "learning_rate": 4.937831018782817e-05, + "loss": 4.5918, + "step": 11962 + }, + { + "epoch": 0.07114734989056999, + "grad_norm": 2.567026138305664, + "learning_rate": 4.937820666375859e-05, + "loss": 4.7375, + "step": 11963 + }, + { + "epoch": 0.07115329717385098, + "grad_norm": 1.8941316604614258, + "learning_rate": 4.937810313117882e-05, + "loss": 5.811, + "step": 11964 + }, + { + "epoch": 0.07115924445713198, + "grad_norm": 1.9301189184188843, + "learning_rate": 4.9377999590088916e-05, + "loss": 5.7947, + "step": 11965 + }, + { + "epoch": 0.07116519174041298, + "grad_norm": 2.281784772872925, + "learning_rate": 4.93778960404889e-05, + "loss": 5.5993, + "step": 11966 + }, + { + "epoch": 0.07117113902369397, + "grad_norm": 1.7826297283172607, + "learning_rate": 4.937779248237882e-05, + "loss": 6.1836, + "step": 11967 + }, + { + "epoch": 0.07117708630697497, + "grad_norm": 2.8714182376861572, + "learning_rate": 4.9377688915758694e-05, + "loss": 5.3955, + "step": 11968 + }, + { + "epoch": 0.07118303359025598, + "grad_norm": 2.3284013271331787, + "learning_rate": 4.937758534062857e-05, + "loss": 5.3027, + "step": 11969 + }, + { + "epoch": 0.07118898087353696, + "grad_norm": 1.8880923986434937, + "learning_rate": 4.937748175698849e-05, + "loss": 5.8408, + "step": 11970 + }, + { + "epoch": 0.07119492815681797, + "grad_norm": 2.8952460289001465, + "learning_rate": 4.937737816483847e-05, + "loss": 4.7325, + "step": 11971 + }, + { + "epoch": 0.07120087544009897, + "grad_norm": 2.5028738975524902, + "learning_rate": 4.9377274564178574e-05, + "loss": 4.5854, + "step": 11972 + }, + { + "epoch": 0.07120682272337996, + "grad_norm": 1.8834285736083984, + "learning_rate": 4.9377170955008815e-05, + "loss": 5.5415, + "step": 11973 + }, + { + "epoch": 0.07121277000666096, + "grad_norm": 2.162062644958496, + "learning_rate": 4.937706733732924e-05, + "loss": 5.2187, + "step": 11974 + }, + { + "epoch": 0.07121871728994196, + "grad_norm": 2.1506881713867188, + "learning_rate": 4.937696371113988e-05, + "loss": 5.1746, + "step": 11975 + }, + { + "epoch": 0.07122466457322295, + "grad_norm": 2.0309176445007324, + "learning_rate": 4.937686007644078e-05, + "loss": 5.1708, + "step": 11976 + }, + { + "epoch": 0.07123061185650395, + "grad_norm": 2.251579523086548, + "learning_rate": 4.9376756433231966e-05, + "loss": 6.0623, + "step": 11977 + }, + { + "epoch": 0.07123655913978495, + "grad_norm": 2.161918878555298, + "learning_rate": 4.937665278151348e-05, + "loss": 6.2297, + "step": 11978 + }, + { + "epoch": 0.07124250642306594, + "grad_norm": 1.703783631324768, + "learning_rate": 4.937654912128535e-05, + "loss": 5.9388, + "step": 11979 + }, + { + "epoch": 0.07124845370634694, + "grad_norm": 1.7420361042022705, + "learning_rate": 4.937644545254763e-05, + "loss": 5.5426, + "step": 11980 + }, + { + "epoch": 0.07125440098962794, + "grad_norm": 1.8634297847747803, + "learning_rate": 4.937634177530033e-05, + "loss": 5.8412, + "step": 11981 + }, + { + "epoch": 0.07126034827290893, + "grad_norm": 1.8084121942520142, + "learning_rate": 4.937623808954351e-05, + "loss": 6.266, + "step": 11982 + }, + { + "epoch": 0.07126629555618993, + "grad_norm": 1.5925266742706299, + "learning_rate": 4.93761343952772e-05, + "loss": 5.7173, + "step": 11983 + }, + { + "epoch": 0.07127224283947092, + "grad_norm": 1.7778257131576538, + "learning_rate": 4.937603069250143e-05, + "loss": 5.8119, + "step": 11984 + }, + { + "epoch": 0.07127819012275192, + "grad_norm": 1.6839842796325684, + "learning_rate": 4.9375926981216235e-05, + "loss": 5.9446, + "step": 11985 + }, + { + "epoch": 0.07128413740603293, + "grad_norm": 1.7892810106277466, + "learning_rate": 4.937582326142166e-05, + "loss": 5.9564, + "step": 11986 + }, + { + "epoch": 0.07129008468931392, + "grad_norm": 1.7179774045944214, + "learning_rate": 4.9375719533117734e-05, + "loss": 6.1969, + "step": 11987 + }, + { + "epoch": 0.07129603197259492, + "grad_norm": 1.3788355588912964, + "learning_rate": 4.93756157963045e-05, + "loss": 6.0409, + "step": 11988 + }, + { + "epoch": 0.07130197925587592, + "grad_norm": 1.6451042890548706, + "learning_rate": 4.9375512050981986e-05, + "loss": 5.8116, + "step": 11989 + }, + { + "epoch": 0.07130792653915691, + "grad_norm": 1.8904451131820679, + "learning_rate": 4.937540829715024e-05, + "loss": 5.7952, + "step": 11990 + }, + { + "epoch": 0.07131387382243791, + "grad_norm": 1.4976747035980225, + "learning_rate": 4.9375304534809284e-05, + "loss": 5.7092, + "step": 11991 + }, + { + "epoch": 0.07131982110571891, + "grad_norm": 1.5585631132125854, + "learning_rate": 4.937520076395916e-05, + "loss": 6.0693, + "step": 11992 + }, + { + "epoch": 0.0713257683889999, + "grad_norm": 1.8329144716262817, + "learning_rate": 4.937509698459991e-05, + "loss": 5.5883, + "step": 11993 + }, + { + "epoch": 0.0713317156722809, + "grad_norm": 2.6030189990997314, + "learning_rate": 4.937499319673157e-05, + "loss": 5.1776, + "step": 11994 + }, + { + "epoch": 0.0713376629555619, + "grad_norm": 1.744042992591858, + "learning_rate": 4.9374889400354165e-05, + "loss": 5.4105, + "step": 11995 + }, + { + "epoch": 0.07134361023884289, + "grad_norm": 1.819018006324768, + "learning_rate": 4.937478559546774e-05, + "loss": 5.5695, + "step": 11996 + }, + { + "epoch": 0.0713495575221239, + "grad_norm": 1.754894733428955, + "learning_rate": 4.9374681782072325e-05, + "loss": 5.7519, + "step": 11997 + }, + { + "epoch": 0.0713555048054049, + "grad_norm": 2.132507085800171, + "learning_rate": 4.9374577960167964e-05, + "loss": 4.9783, + "step": 11998 + }, + { + "epoch": 0.07136145208868588, + "grad_norm": 2.0926709175109863, + "learning_rate": 4.937447412975469e-05, + "loss": 4.905, + "step": 11999 + }, + { + "epoch": 0.07136739937196689, + "grad_norm": 2.1235594749450684, + "learning_rate": 4.937437029083254e-05, + "loss": 4.7978, + "step": 12000 + }, + { + "epoch": 0.07137334665524789, + "grad_norm": 2.217911720275879, + "learning_rate": 4.937426644340154e-05, + "loss": 4.9506, + "step": 12001 + }, + { + "epoch": 0.07137929393852888, + "grad_norm": 2.0362601280212402, + "learning_rate": 4.937416258746175e-05, + "loss": 5.0299, + "step": 12002 + }, + { + "epoch": 0.07138524122180988, + "grad_norm": 2.2846896648406982, + "learning_rate": 4.937405872301318e-05, + "loss": 5.0606, + "step": 12003 + }, + { + "epoch": 0.07139118850509088, + "grad_norm": 2.2545530796051025, + "learning_rate": 4.937395485005588e-05, + "loss": 4.8651, + "step": 12004 + }, + { + "epoch": 0.07139713578837187, + "grad_norm": 2.32738995552063, + "learning_rate": 4.937385096858989e-05, + "loss": 4.7908, + "step": 12005 + }, + { + "epoch": 0.07140308307165287, + "grad_norm": 2.239215850830078, + "learning_rate": 4.9373747078615235e-05, + "loss": 4.7545, + "step": 12006 + }, + { + "epoch": 0.07140903035493387, + "grad_norm": 2.4766969680786133, + "learning_rate": 4.937364318013196e-05, + "loss": 5.0795, + "step": 12007 + }, + { + "epoch": 0.07141497763821486, + "grad_norm": 2.602111577987671, + "learning_rate": 4.937353927314009e-05, + "loss": 4.6898, + "step": 12008 + }, + { + "epoch": 0.07142092492149586, + "grad_norm": 2.8508496284484863, + "learning_rate": 4.937343535763968e-05, + "loss": 4.3136, + "step": 12009 + }, + { + "epoch": 0.07142687220477686, + "grad_norm": 2.4613311290740967, + "learning_rate": 4.9373331433630754e-05, + "loss": 4.4826, + "step": 12010 + }, + { + "epoch": 0.07143281948805785, + "grad_norm": 2.561643362045288, + "learning_rate": 4.937322750111334e-05, + "loss": 4.251, + "step": 12011 + }, + { + "epoch": 0.07143876677133885, + "grad_norm": 2.397507667541504, + "learning_rate": 4.93731235600875e-05, + "loss": 4.3018, + "step": 12012 + }, + { + "epoch": 0.07144471405461984, + "grad_norm": 2.250120162963867, + "learning_rate": 4.937301961055324e-05, + "loss": 4.1796, + "step": 12013 + }, + { + "epoch": 0.07145066133790084, + "grad_norm": 2.337451934814453, + "learning_rate": 4.9372915652510615e-05, + "loss": 4.2362, + "step": 12014 + }, + { + "epoch": 0.07145660862118185, + "grad_norm": 2.357034921646118, + "learning_rate": 4.937281168595966e-05, + "loss": 4.0961, + "step": 12015 + }, + { + "epoch": 0.07146255590446284, + "grad_norm": 2.0843617916107178, + "learning_rate": 4.93727077109004e-05, + "loss": 4.4584, + "step": 12016 + }, + { + "epoch": 0.07146850318774384, + "grad_norm": 2.149707317352295, + "learning_rate": 4.937260372733289e-05, + "loss": 4.2248, + "step": 12017 + }, + { + "epoch": 0.07147445047102484, + "grad_norm": 2.149765729904175, + "learning_rate": 4.937249973525715e-05, + "loss": 4.154, + "step": 12018 + }, + { + "epoch": 0.07148039775430583, + "grad_norm": 2.1572682857513428, + "learning_rate": 4.937239573467323e-05, + "loss": 4.2345, + "step": 12019 + }, + { + "epoch": 0.07148634503758683, + "grad_norm": 2.246751070022583, + "learning_rate": 4.9372291725581145e-05, + "loss": 3.9739, + "step": 12020 + }, + { + "epoch": 0.07149229232086783, + "grad_norm": 2.2735042572021484, + "learning_rate": 4.9372187707980955e-05, + "loss": 4.0442, + "step": 12021 + }, + { + "epoch": 0.07149823960414882, + "grad_norm": 2.2270023822784424, + "learning_rate": 4.9372083681872684e-05, + "loss": 4.0374, + "step": 12022 + }, + { + "epoch": 0.07150418688742982, + "grad_norm": 2.2228193283081055, + "learning_rate": 4.937197964725637e-05, + "loss": 4.0503, + "step": 12023 + }, + { + "epoch": 0.07151013417071082, + "grad_norm": 2.2630691528320312, + "learning_rate": 4.9371875604132046e-05, + "loss": 4.0431, + "step": 12024 + }, + { + "epoch": 0.07151608145399181, + "grad_norm": 2.2461886405944824, + "learning_rate": 4.937177155249976e-05, + "loss": 4.1164, + "step": 12025 + }, + { + "epoch": 0.07152202873727281, + "grad_norm": 1.9476062059402466, + "learning_rate": 4.937166749235953e-05, + "loss": 4.317, + "step": 12026 + }, + { + "epoch": 0.07152797602055382, + "grad_norm": 2.33138370513916, + "learning_rate": 4.937156342371141e-05, + "loss": 4.1309, + "step": 12027 + }, + { + "epoch": 0.0715339233038348, + "grad_norm": 3.3887436389923096, + "learning_rate": 4.937145934655543e-05, + "loss": 5.1713, + "step": 12028 + }, + { + "epoch": 0.0715398705871158, + "grad_norm": 2.499302625656128, + "learning_rate": 4.937135526089162e-05, + "loss": 4.0553, + "step": 12029 + }, + { + "epoch": 0.07154581787039681, + "grad_norm": 2.4269003868103027, + "learning_rate": 4.937125116672002e-05, + "loss": 4.0425, + "step": 12030 + }, + { + "epoch": 0.0715517651536778, + "grad_norm": 2.1819067001342773, + "learning_rate": 4.937114706404067e-05, + "loss": 4.0591, + "step": 12031 + }, + { + "epoch": 0.0715577124369588, + "grad_norm": 1.8021305799484253, + "learning_rate": 4.937104295285361e-05, + "loss": 4.9171, + "step": 12032 + }, + { + "epoch": 0.0715636597202398, + "grad_norm": 2.1833691596984863, + "learning_rate": 4.937093883315887e-05, + "loss": 4.053, + "step": 12033 + }, + { + "epoch": 0.07156960700352079, + "grad_norm": 2.1684465408325195, + "learning_rate": 4.9370834704956484e-05, + "loss": 4.0692, + "step": 12034 + }, + { + "epoch": 0.07157555428680179, + "grad_norm": 2.1576929092407227, + "learning_rate": 4.937073056824649e-05, + "loss": 3.9958, + "step": 12035 + }, + { + "epoch": 0.07158150157008279, + "grad_norm": 1.5627915859222412, + "learning_rate": 4.9370626423028924e-05, + "loss": 5.3373, + "step": 12036 + }, + { + "epoch": 0.07158744885336378, + "grad_norm": 1.6166819334030151, + "learning_rate": 4.937052226930383e-05, + "loss": 5.801, + "step": 12037 + }, + { + "epoch": 0.07159339613664478, + "grad_norm": 1.4187299013137817, + "learning_rate": 4.937041810707124e-05, + "loss": 5.5937, + "step": 12038 + }, + { + "epoch": 0.07159934341992578, + "grad_norm": 1.5857088565826416, + "learning_rate": 4.937031393633118e-05, + "loss": 5.6268, + "step": 12039 + }, + { + "epoch": 0.07160529070320677, + "grad_norm": 1.5691097974777222, + "learning_rate": 4.93702097570837e-05, + "loss": 5.7414, + "step": 12040 + }, + { + "epoch": 0.07161123798648777, + "grad_norm": 1.4723674058914185, + "learning_rate": 4.9370105569328835e-05, + "loss": 5.4711, + "step": 12041 + }, + { + "epoch": 0.07161718526976876, + "grad_norm": 1.686745047569275, + "learning_rate": 4.937000137306661e-05, + "loss": 5.4302, + "step": 12042 + }, + { + "epoch": 0.07162313255304976, + "grad_norm": 1.7394465208053589, + "learning_rate": 4.936989716829707e-05, + "loss": 5.1609, + "step": 12043 + }, + { + "epoch": 0.07162907983633077, + "grad_norm": 1.4348796606063843, + "learning_rate": 4.9369792955020264e-05, + "loss": 5.2468, + "step": 12044 + }, + { + "epoch": 0.07163502711961175, + "grad_norm": 1.674187421798706, + "learning_rate": 4.93696887332362e-05, + "loss": 5.2451, + "step": 12045 + }, + { + "epoch": 0.07164097440289276, + "grad_norm": 1.6606419086456299, + "learning_rate": 4.9369584502944934e-05, + "loss": 5.2744, + "step": 12046 + }, + { + "epoch": 0.07164692168617376, + "grad_norm": 1.4020198583602905, + "learning_rate": 4.93694802641465e-05, + "loss": 5.2914, + "step": 12047 + }, + { + "epoch": 0.07165286896945475, + "grad_norm": 1.4234102964401245, + "learning_rate": 4.936937601684093e-05, + "loss": 5.2405, + "step": 12048 + }, + { + "epoch": 0.07165881625273575, + "grad_norm": 1.261983036994934, + "learning_rate": 4.936927176102827e-05, + "loss": 5.1532, + "step": 12049 + }, + { + "epoch": 0.07166476353601675, + "grad_norm": 1.3787094354629517, + "learning_rate": 4.9369167496708534e-05, + "loss": 5.2033, + "step": 12050 + }, + { + "epoch": 0.07167071081929774, + "grad_norm": 1.405142068862915, + "learning_rate": 4.9369063223881786e-05, + "loss": 5.0391, + "step": 12051 + }, + { + "epoch": 0.07167665810257874, + "grad_norm": 1.513554573059082, + "learning_rate": 4.936895894254804e-05, + "loss": 5.0236, + "step": 12052 + }, + { + "epoch": 0.07168260538585974, + "grad_norm": 1.4279611110687256, + "learning_rate": 4.9368854652707355e-05, + "loss": 5.1429, + "step": 12053 + }, + { + "epoch": 0.07168855266914073, + "grad_norm": 1.4320182800292969, + "learning_rate": 4.936875035435974e-05, + "loss": 5.0519, + "step": 12054 + }, + { + "epoch": 0.07169449995242173, + "grad_norm": 1.415925145149231, + "learning_rate": 4.936864604750526e-05, + "loss": 4.9904, + "step": 12055 + }, + { + "epoch": 0.07170044723570274, + "grad_norm": 1.403998851776123, + "learning_rate": 4.936854173214393e-05, + "loss": 4.8988, + "step": 12056 + }, + { + "epoch": 0.07170639451898372, + "grad_norm": 1.744532585144043, + "learning_rate": 4.936843740827579e-05, + "loss": 4.9661, + "step": 12057 + }, + { + "epoch": 0.07171234180226473, + "grad_norm": 1.4900517463684082, + "learning_rate": 4.9368333075900884e-05, + "loss": 5.1887, + "step": 12058 + }, + { + "epoch": 0.07171828908554573, + "grad_norm": 1.454063057899475, + "learning_rate": 4.936822873501925e-05, + "loss": 5.2801, + "step": 12059 + }, + { + "epoch": 0.07172423636882672, + "grad_norm": 1.5426071882247925, + "learning_rate": 4.936812438563092e-05, + "loss": 5.1987, + "step": 12060 + }, + { + "epoch": 0.07173018365210772, + "grad_norm": 1.7365894317626953, + "learning_rate": 4.936802002773592e-05, + "loss": 5.1933, + "step": 12061 + }, + { + "epoch": 0.07173613093538872, + "grad_norm": 1.5046216249465942, + "learning_rate": 4.9367915661334295e-05, + "loss": 5.1688, + "step": 12062 + }, + { + "epoch": 0.07174207821866971, + "grad_norm": 1.6715713739395142, + "learning_rate": 4.936781128642609e-05, + "loss": 5.3649, + "step": 12063 + }, + { + "epoch": 0.07174802550195071, + "grad_norm": 1.6386772394180298, + "learning_rate": 4.936770690301134e-05, + "loss": 5.4107, + "step": 12064 + }, + { + "epoch": 0.07175397278523171, + "grad_norm": 1.604153037071228, + "learning_rate": 4.936760251109006e-05, + "loss": 5.2952, + "step": 12065 + }, + { + "epoch": 0.0717599200685127, + "grad_norm": 1.7100228071212769, + "learning_rate": 4.9367498110662306e-05, + "loss": 5.202, + "step": 12066 + }, + { + "epoch": 0.0717658673517937, + "grad_norm": 1.4062007665634155, + "learning_rate": 4.9367393701728116e-05, + "loss": 5.2246, + "step": 12067 + }, + { + "epoch": 0.0717718146350747, + "grad_norm": 1.4552310705184937, + "learning_rate": 4.9367289284287514e-05, + "loss": 5.5919, + "step": 12068 + }, + { + "epoch": 0.07177776191835569, + "grad_norm": 1.5134438276290894, + "learning_rate": 4.9367184858340546e-05, + "loss": 5.3921, + "step": 12069 + }, + { + "epoch": 0.0717837092016367, + "grad_norm": 1.724139928817749, + "learning_rate": 4.9367080423887246e-05, + "loss": 5.6409, + "step": 12070 + }, + { + "epoch": 0.07178965648491768, + "grad_norm": 1.7401317358016968, + "learning_rate": 4.9366975980927655e-05, + "loss": 4.8093, + "step": 12071 + }, + { + "epoch": 0.07179560376819868, + "grad_norm": 2.3226993083953857, + "learning_rate": 4.93668715294618e-05, + "loss": 4.2685, + "step": 12072 + }, + { + "epoch": 0.07180155105147969, + "grad_norm": 2.200608730316162, + "learning_rate": 4.9366767069489715e-05, + "loss": 4.1155, + "step": 12073 + }, + { + "epoch": 0.07180749833476067, + "grad_norm": 2.381131649017334, + "learning_rate": 4.936666260101145e-05, + "loss": 3.9837, + "step": 12074 + }, + { + "epoch": 0.07181344561804168, + "grad_norm": 2.2567548751831055, + "learning_rate": 4.936655812402704e-05, + "loss": 4.0642, + "step": 12075 + }, + { + "epoch": 0.07181939290132268, + "grad_norm": 2.253011703491211, + "learning_rate": 4.9366453638536506e-05, + "loss": 4.0683, + "step": 12076 + }, + { + "epoch": 0.07182534018460367, + "grad_norm": 2.3459978103637695, + "learning_rate": 4.93663491445399e-05, + "loss": 4.0525, + "step": 12077 + }, + { + "epoch": 0.07183128746788467, + "grad_norm": 2.3964619636535645, + "learning_rate": 4.9366244642037254e-05, + "loss": 4.0198, + "step": 12078 + }, + { + "epoch": 0.07183723475116567, + "grad_norm": 2.392293930053711, + "learning_rate": 4.93661401310286e-05, + "loss": 3.7765, + "step": 12079 + }, + { + "epoch": 0.07184318203444666, + "grad_norm": 2.3027987480163574, + "learning_rate": 4.936603561151398e-05, + "loss": 4.0315, + "step": 12080 + }, + { + "epoch": 0.07184912931772766, + "grad_norm": 2.3942925930023193, + "learning_rate": 4.936593108349343e-05, + "loss": 4.1308, + "step": 12081 + }, + { + "epoch": 0.07185507660100866, + "grad_norm": 2.183898687362671, + "learning_rate": 4.9365826546966984e-05, + "loss": 4.0779, + "step": 12082 + }, + { + "epoch": 0.07186102388428965, + "grad_norm": 2.3463728427886963, + "learning_rate": 4.936572200193468e-05, + "loss": 4.0035, + "step": 12083 + }, + { + "epoch": 0.07186697116757065, + "grad_norm": 2.3459651470184326, + "learning_rate": 4.9365617448396556e-05, + "loss": 4.0577, + "step": 12084 + }, + { + "epoch": 0.07187291845085166, + "grad_norm": 2.169189691543579, + "learning_rate": 4.936551288635264e-05, + "loss": 4.2678, + "step": 12085 + }, + { + "epoch": 0.07187886573413264, + "grad_norm": 2.3313188552856445, + "learning_rate": 4.936540831580299e-05, + "loss": 4.9956, + "step": 12086 + }, + { + "epoch": 0.07188481301741365, + "grad_norm": 2.431053400039673, + "learning_rate": 4.936530373674761e-05, + "loss": 5.2317, + "step": 12087 + }, + { + "epoch": 0.07189076030069465, + "grad_norm": 1.8984981775283813, + "learning_rate": 4.936519914918656e-05, + "loss": 5.4541, + "step": 12088 + }, + { + "epoch": 0.07189670758397564, + "grad_norm": 1.8862982988357544, + "learning_rate": 4.9365094553119877e-05, + "loss": 5.6448, + "step": 12089 + }, + { + "epoch": 0.07190265486725664, + "grad_norm": 1.7802925109863281, + "learning_rate": 4.936498994854759e-05, + "loss": 5.3182, + "step": 12090 + }, + { + "epoch": 0.07190860215053764, + "grad_norm": 1.7578701972961426, + "learning_rate": 4.9364885335469734e-05, + "loss": 6.0188, + "step": 12091 + }, + { + "epoch": 0.07191454943381863, + "grad_norm": 1.6750003099441528, + "learning_rate": 4.9364780713886345e-05, + "loss": 6.0822, + "step": 12092 + }, + { + "epoch": 0.07192049671709963, + "grad_norm": 1.4945881366729736, + "learning_rate": 4.936467608379747e-05, + "loss": 6.0554, + "step": 12093 + }, + { + "epoch": 0.07192644400038063, + "grad_norm": 1.5508134365081787, + "learning_rate": 4.936457144520313e-05, + "loss": 5.9712, + "step": 12094 + }, + { + "epoch": 0.07193239128366162, + "grad_norm": 1.4133291244506836, + "learning_rate": 4.936446679810337e-05, + "loss": 5.9137, + "step": 12095 + }, + { + "epoch": 0.07193833856694262, + "grad_norm": 1.415930986404419, + "learning_rate": 4.936436214249823e-05, + "loss": 5.9957, + "step": 12096 + }, + { + "epoch": 0.07194428585022362, + "grad_norm": 1.682356595993042, + "learning_rate": 4.936425747838774e-05, + "loss": 6.2381, + "step": 12097 + }, + { + "epoch": 0.07195023313350461, + "grad_norm": 1.693535566329956, + "learning_rate": 4.9364152805771946e-05, + "loss": 6.0523, + "step": 12098 + }, + { + "epoch": 0.07195618041678561, + "grad_norm": 1.7577873468399048, + "learning_rate": 4.9364048124650875e-05, + "loss": 5.8243, + "step": 12099 + }, + { + "epoch": 0.0719621277000666, + "grad_norm": 1.6486074924468994, + "learning_rate": 4.936394343502457e-05, + "loss": 5.8072, + "step": 12100 + }, + { + "epoch": 0.0719680749833476, + "grad_norm": 1.5245120525360107, + "learning_rate": 4.936383873689306e-05, + "loss": 5.9013, + "step": 12101 + }, + { + "epoch": 0.0719740222666286, + "grad_norm": 1.4771286249160767, + "learning_rate": 4.936373403025638e-05, + "loss": 6.1314, + "step": 12102 + }, + { + "epoch": 0.0719799695499096, + "grad_norm": 1.7547197341918945, + "learning_rate": 4.936362931511458e-05, + "loss": 5.9725, + "step": 12103 + }, + { + "epoch": 0.0719859168331906, + "grad_norm": 1.9942286014556885, + "learning_rate": 4.936352459146769e-05, + "loss": 5.82, + "step": 12104 + }, + { + "epoch": 0.0719918641164716, + "grad_norm": 1.8367860317230225, + "learning_rate": 4.936341985931574e-05, + "loss": 5.8653, + "step": 12105 + }, + { + "epoch": 0.07199781139975259, + "grad_norm": 1.8277100324630737, + "learning_rate": 4.936331511865877e-05, + "loss": 5.6998, + "step": 12106 + }, + { + "epoch": 0.07200375868303359, + "grad_norm": 1.5308998823165894, + "learning_rate": 4.936321036949683e-05, + "loss": 5.822, + "step": 12107 + }, + { + "epoch": 0.07200970596631459, + "grad_norm": 1.7100377082824707, + "learning_rate": 4.936310561182993e-05, + "loss": 5.991, + "step": 12108 + }, + { + "epoch": 0.07201565324959558, + "grad_norm": 1.8563333749771118, + "learning_rate": 4.936300084565813e-05, + "loss": 5.8438, + "step": 12109 + }, + { + "epoch": 0.07202160053287658, + "grad_norm": 1.9967303276062012, + "learning_rate": 4.936289607098146e-05, + "loss": 5.6786, + "step": 12110 + }, + { + "epoch": 0.07202754781615758, + "grad_norm": 2.1997451782226562, + "learning_rate": 4.9362791287799945e-05, + "loss": 5.2983, + "step": 12111 + }, + { + "epoch": 0.07203349509943857, + "grad_norm": 2.144521713256836, + "learning_rate": 4.9362686496113644e-05, + "loss": 5.2942, + "step": 12112 + }, + { + "epoch": 0.07203944238271957, + "grad_norm": 2.0747883319854736, + "learning_rate": 4.936258169592257e-05, + "loss": 5.473, + "step": 12113 + }, + { + "epoch": 0.07204538966600058, + "grad_norm": 2.0386881828308105, + "learning_rate": 4.9362476887226776e-05, + "loss": 5.2557, + "step": 12114 + }, + { + "epoch": 0.07205133694928156, + "grad_norm": 2.190687894821167, + "learning_rate": 4.93623720700263e-05, + "loss": 5.3251, + "step": 12115 + }, + { + "epoch": 0.07205728423256257, + "grad_norm": 1.9349397420883179, + "learning_rate": 4.936226724432116e-05, + "loss": 5.242, + "step": 12116 + }, + { + "epoch": 0.07206323151584357, + "grad_norm": 2.175943613052368, + "learning_rate": 4.93621624101114e-05, + "loss": 5.185, + "step": 12117 + }, + { + "epoch": 0.07206917879912456, + "grad_norm": 2.053994655609131, + "learning_rate": 4.936205756739708e-05, + "loss": 5.0755, + "step": 12118 + }, + { + "epoch": 0.07207512608240556, + "grad_norm": 2.0012362003326416, + "learning_rate": 4.93619527161782e-05, + "loss": 5.1797, + "step": 12119 + }, + { + "epoch": 0.07208107336568656, + "grad_norm": 1.9441219568252563, + "learning_rate": 4.936184785645482e-05, + "loss": 5.5583, + "step": 12120 + }, + { + "epoch": 0.07208702064896755, + "grad_norm": 2.990767002105713, + "learning_rate": 4.936174298822696e-05, + "loss": 4.8348, + "step": 12121 + }, + { + "epoch": 0.07209296793224855, + "grad_norm": 2.8385918140411377, + "learning_rate": 4.936163811149469e-05, + "loss": 4.7299, + "step": 12122 + }, + { + "epoch": 0.07209891521552955, + "grad_norm": 2.5228044986724854, + "learning_rate": 4.9361533226258006e-05, + "loss": 4.622, + "step": 12123 + }, + { + "epoch": 0.07210486249881054, + "grad_norm": 2.317598581314087, + "learning_rate": 4.936142833251697e-05, + "loss": 4.588, + "step": 12124 + }, + { + "epoch": 0.07211080978209154, + "grad_norm": 2.369335889816284, + "learning_rate": 4.936132343027161e-05, + "loss": 4.3843, + "step": 12125 + }, + { + "epoch": 0.07211675706537254, + "grad_norm": 2.4761011600494385, + "learning_rate": 4.936121851952196e-05, + "loss": 4.4101, + "step": 12126 + }, + { + "epoch": 0.07212270434865353, + "grad_norm": 2.3830130100250244, + "learning_rate": 4.9361113600268065e-05, + "loss": 4.5065, + "step": 12127 + }, + { + "epoch": 0.07212865163193453, + "grad_norm": 2.4977028369903564, + "learning_rate": 4.936100867250996e-05, + "loss": 4.4469, + "step": 12128 + }, + { + "epoch": 0.07213459891521554, + "grad_norm": 2.3377795219421387, + "learning_rate": 4.9360903736247663e-05, + "loss": 4.4045, + "step": 12129 + }, + { + "epoch": 0.07214054619849652, + "grad_norm": 2.268906831741333, + "learning_rate": 4.9360798791481245e-05, + "loss": 4.4224, + "step": 12130 + }, + { + "epoch": 0.07214649348177753, + "grad_norm": 2.316899538040161, + "learning_rate": 4.936069383821072e-05, + "loss": 4.3704, + "step": 12131 + }, + { + "epoch": 0.07215244076505851, + "grad_norm": 2.419618606567383, + "learning_rate": 4.936058887643612e-05, + "loss": 5.493, + "step": 12132 + }, + { + "epoch": 0.07215838804833952, + "grad_norm": 2.081756353378296, + "learning_rate": 4.93604839061575e-05, + "loss": 6.2328, + "step": 12133 + }, + { + "epoch": 0.07216433533162052, + "grad_norm": 2.1638660430908203, + "learning_rate": 4.936037892737487e-05, + "loss": 6.3089, + "step": 12134 + }, + { + "epoch": 0.07217028261490151, + "grad_norm": 1.7972848415374756, + "learning_rate": 4.93602739400883e-05, + "loss": 6.4013, + "step": 12135 + }, + { + "epoch": 0.07217622989818251, + "grad_norm": 1.7160871028900146, + "learning_rate": 4.93601689442978e-05, + "loss": 6.1717, + "step": 12136 + }, + { + "epoch": 0.07218217718146351, + "grad_norm": 2.0931475162506104, + "learning_rate": 4.936006394000342e-05, + "loss": 5.3515, + "step": 12137 + }, + { + "epoch": 0.0721881244647445, + "grad_norm": 2.2872977256774902, + "learning_rate": 4.93599589272052e-05, + "loss": 5.8342, + "step": 12138 + }, + { + "epoch": 0.0721940717480255, + "grad_norm": 2.4082720279693604, + "learning_rate": 4.9359853905903166e-05, + "loss": 6.1651, + "step": 12139 + }, + { + "epoch": 0.0722000190313065, + "grad_norm": 2.120962381362915, + "learning_rate": 4.935974887609735e-05, + "loss": 6.1182, + "step": 12140 + }, + { + "epoch": 0.07220596631458749, + "grad_norm": 2.0507090091705322, + "learning_rate": 4.9359643837787805e-05, + "loss": 5.7158, + "step": 12141 + }, + { + "epoch": 0.0722119135978685, + "grad_norm": 2.099963426589966, + "learning_rate": 4.9359538790974556e-05, + "loss": 5.6952, + "step": 12142 + }, + { + "epoch": 0.0722178608811495, + "grad_norm": 1.7631537914276123, + "learning_rate": 4.935943373565765e-05, + "loss": 5.6649, + "step": 12143 + }, + { + "epoch": 0.07222380816443048, + "grad_norm": 1.739601492881775, + "learning_rate": 4.9359328671837115e-05, + "loss": 5.7258, + "step": 12144 + }, + { + "epoch": 0.07222975544771149, + "grad_norm": 1.630116581916809, + "learning_rate": 4.9359223599512996e-05, + "loss": 5.7305, + "step": 12145 + }, + { + "epoch": 0.07223570273099249, + "grad_norm": 1.6106374263763428, + "learning_rate": 4.935911851868531e-05, + "loss": 5.6779, + "step": 12146 + }, + { + "epoch": 0.07224165001427348, + "grad_norm": 1.945662021636963, + "learning_rate": 4.935901342935412e-05, + "loss": 5.716, + "step": 12147 + }, + { + "epoch": 0.07224759729755448, + "grad_norm": 1.8601467609405518, + "learning_rate": 4.935890833151944e-05, + "loss": 5.7539, + "step": 12148 + }, + { + "epoch": 0.07225354458083548, + "grad_norm": 1.8324257135391235, + "learning_rate": 4.9358803225181324e-05, + "loss": 5.7309, + "step": 12149 + }, + { + "epoch": 0.07225949186411647, + "grad_norm": 2.0564095973968506, + "learning_rate": 4.93586981103398e-05, + "loss": 5.7201, + "step": 12150 + }, + { + "epoch": 0.07226543914739747, + "grad_norm": 1.925706386566162, + "learning_rate": 4.93585929869949e-05, + "loss": 5.5736, + "step": 12151 + }, + { + "epoch": 0.07227138643067847, + "grad_norm": 1.5965845584869385, + "learning_rate": 4.935848785514667e-05, + "loss": 5.4351, + "step": 12152 + }, + { + "epoch": 0.07227733371395946, + "grad_norm": 2.2522077560424805, + "learning_rate": 4.935838271479515e-05, + "loss": 5.8261, + "step": 12153 + }, + { + "epoch": 0.07228328099724046, + "grad_norm": 2.242398738861084, + "learning_rate": 4.935827756594036e-05, + "loss": 5.9923, + "step": 12154 + }, + { + "epoch": 0.07228922828052146, + "grad_norm": 2.043266534805298, + "learning_rate": 4.935817240858236e-05, + "loss": 5.6127, + "step": 12155 + }, + { + "epoch": 0.07229517556380245, + "grad_norm": 2.4922964572906494, + "learning_rate": 4.935806724272116e-05, + "loss": 5.3549, + "step": 12156 + }, + { + "epoch": 0.07230112284708345, + "grad_norm": 2.5241329669952393, + "learning_rate": 4.935796206835682e-05, + "loss": 5.2194, + "step": 12157 + }, + { + "epoch": 0.07230707013036446, + "grad_norm": 2.4680237770080566, + "learning_rate": 4.9357856885489365e-05, + "loss": 5.1154, + "step": 12158 + }, + { + "epoch": 0.07231301741364544, + "grad_norm": 2.1012492179870605, + "learning_rate": 4.9357751694118824e-05, + "loss": 4.8526, + "step": 12159 + }, + { + "epoch": 0.07231896469692645, + "grad_norm": 1.9997994899749756, + "learning_rate": 4.935764649424526e-05, + "loss": 4.9778, + "step": 12160 + }, + { + "epoch": 0.07232491198020743, + "grad_norm": 1.770112156867981, + "learning_rate": 4.935754128586868e-05, + "loss": 5.0855, + "step": 12161 + }, + { + "epoch": 0.07233085926348844, + "grad_norm": 2.0865485668182373, + "learning_rate": 4.935743606898914e-05, + "loss": 5.1566, + "step": 12162 + }, + { + "epoch": 0.07233680654676944, + "grad_norm": 2.0801351070404053, + "learning_rate": 4.9357330843606677e-05, + "loss": 5.0611, + "step": 12163 + }, + { + "epoch": 0.07234275383005043, + "grad_norm": 1.8675305843353271, + "learning_rate": 4.935722560972131e-05, + "loss": 4.9216, + "step": 12164 + }, + { + "epoch": 0.07234870111333143, + "grad_norm": 1.9125452041625977, + "learning_rate": 4.935712036733309e-05, + "loss": 4.8363, + "step": 12165 + }, + { + "epoch": 0.07235464839661243, + "grad_norm": 2.4954965114593506, + "learning_rate": 4.935701511644205e-05, + "loss": 4.9816, + "step": 12166 + }, + { + "epoch": 0.07236059567989342, + "grad_norm": 2.412381410598755, + "learning_rate": 4.935690985704823e-05, + "loss": 4.9616, + "step": 12167 + }, + { + "epoch": 0.07236654296317442, + "grad_norm": 2.356994152069092, + "learning_rate": 4.9356804589151665e-05, + "loss": 4.8326, + "step": 12168 + }, + { + "epoch": 0.07237249024645542, + "grad_norm": 2.2399415969848633, + "learning_rate": 4.93566993127524e-05, + "loss": 4.8955, + "step": 12169 + }, + { + "epoch": 0.07237843752973641, + "grad_norm": 2.691772222518921, + "learning_rate": 4.935659402785044e-05, + "loss": 5.6475, + "step": 12170 + }, + { + "epoch": 0.07238438481301741, + "grad_norm": 2.954955816268921, + "learning_rate": 4.9356488734445865e-05, + "loss": 6.2151, + "step": 12171 + }, + { + "epoch": 0.07239033209629842, + "grad_norm": 2.010998010635376, + "learning_rate": 4.935638343253869e-05, + "loss": 5.9124, + "step": 12172 + }, + { + "epoch": 0.0723962793795794, + "grad_norm": 2.2737836837768555, + "learning_rate": 4.935627812212894e-05, + "loss": 5.4068, + "step": 12173 + }, + { + "epoch": 0.0724022266628604, + "grad_norm": 2.2700793743133545, + "learning_rate": 4.9356172803216675e-05, + "loss": 4.8156, + "step": 12174 + }, + { + "epoch": 0.07240817394614141, + "grad_norm": 2.2795162200927734, + "learning_rate": 4.935606747580192e-05, + "loss": 4.7882, + "step": 12175 + }, + { + "epoch": 0.0724141212294224, + "grad_norm": 2.1849277019500732, + "learning_rate": 4.9355962139884715e-05, + "loss": 4.9914, + "step": 12176 + }, + { + "epoch": 0.0724200685127034, + "grad_norm": 2.5336532592773438, + "learning_rate": 4.935585679546509e-05, + "loss": 4.8487, + "step": 12177 + }, + { + "epoch": 0.0724260157959844, + "grad_norm": 2.624995708465576, + "learning_rate": 4.935575144254309e-05, + "loss": 4.9523, + "step": 12178 + }, + { + "epoch": 0.07243196307926539, + "grad_norm": 2.5450191497802734, + "learning_rate": 4.935564608111875e-05, + "loss": 4.9958, + "step": 12179 + }, + { + "epoch": 0.07243791036254639, + "grad_norm": 2.2714452743530273, + "learning_rate": 4.9355540711192107e-05, + "loss": 5.301, + "step": 12180 + }, + { + "epoch": 0.07244385764582739, + "grad_norm": 2.0173168182373047, + "learning_rate": 4.935543533276319e-05, + "loss": 5.7992, + "step": 12181 + }, + { + "epoch": 0.07244980492910838, + "grad_norm": 2.9326014518737793, + "learning_rate": 4.9355329945832054e-05, + "loss": 5.6065, + "step": 12182 + }, + { + "epoch": 0.07245575221238938, + "grad_norm": 2.142066478729248, + "learning_rate": 4.935522455039871e-05, + "loss": 5.5339, + "step": 12183 + }, + { + "epoch": 0.07246169949567038, + "grad_norm": 1.8901113271713257, + "learning_rate": 4.9355119146463214e-05, + "loss": 5.8829, + "step": 12184 + }, + { + "epoch": 0.07246764677895137, + "grad_norm": 1.996052622795105, + "learning_rate": 4.93550137340256e-05, + "loss": 6.2189, + "step": 12185 + }, + { + "epoch": 0.07247359406223237, + "grad_norm": 1.7420963048934937, + "learning_rate": 4.93549083130859e-05, + "loss": 5.9254, + "step": 12186 + }, + { + "epoch": 0.07247954134551338, + "grad_norm": 2.8487229347229004, + "learning_rate": 4.935480288364416e-05, + "loss": 5.8643, + "step": 12187 + }, + { + "epoch": 0.07248548862879436, + "grad_norm": 3.0168306827545166, + "learning_rate": 4.93546974457004e-05, + "loss": 5.811, + "step": 12188 + }, + { + "epoch": 0.07249143591207537, + "grad_norm": 2.841353416442871, + "learning_rate": 4.935459199925467e-05, + "loss": 5.6832, + "step": 12189 + }, + { + "epoch": 0.07249738319535635, + "grad_norm": 2.3517918586730957, + "learning_rate": 4.9354486544307e-05, + "loss": 4.3651, + "step": 12190 + }, + { + "epoch": 0.07250333047863736, + "grad_norm": 2.3511440753936768, + "learning_rate": 4.935438108085744e-05, + "loss": 4.2884, + "step": 12191 + }, + { + "epoch": 0.07250927776191836, + "grad_norm": 2.0812551975250244, + "learning_rate": 4.935427560890601e-05, + "loss": 4.168, + "step": 12192 + }, + { + "epoch": 0.07251522504519935, + "grad_norm": 2.0546631813049316, + "learning_rate": 4.935417012845275e-05, + "loss": 3.862, + "step": 12193 + }, + { + "epoch": 0.07252117232848035, + "grad_norm": 2.130612850189209, + "learning_rate": 4.935406463949771e-05, + "loss": 3.6729, + "step": 12194 + }, + { + "epoch": 0.07252711961176135, + "grad_norm": 2.35225510597229, + "learning_rate": 4.9353959142040917e-05, + "loss": 3.7075, + "step": 12195 + }, + { + "epoch": 0.07253306689504234, + "grad_norm": 2.418698310852051, + "learning_rate": 4.93538536360824e-05, + "loss": 3.679, + "step": 12196 + }, + { + "epoch": 0.07253901417832334, + "grad_norm": 2.4452991485595703, + "learning_rate": 4.9353748121622214e-05, + "loss": 3.7827, + "step": 12197 + }, + { + "epoch": 0.07254496146160434, + "grad_norm": 2.3787992000579834, + "learning_rate": 4.935364259866038e-05, + "loss": 3.7484, + "step": 12198 + }, + { + "epoch": 0.07255090874488533, + "grad_norm": 2.299149751663208, + "learning_rate": 4.935353706719694e-05, + "loss": 3.6186, + "step": 12199 + }, + { + "epoch": 0.07255685602816633, + "grad_norm": 2.666121244430542, + "learning_rate": 4.9353431527231944e-05, + "loss": 3.5323, + "step": 12200 + }, + { + "epoch": 0.07256280331144734, + "grad_norm": 2.4448325634002686, + "learning_rate": 4.9353325978765404e-05, + "loss": 3.8176, + "step": 12201 + }, + { + "epoch": 0.07256875059472832, + "grad_norm": 2.5082852840423584, + "learning_rate": 4.935322042179737e-05, + "loss": 3.7838, + "step": 12202 + }, + { + "epoch": 0.07257469787800933, + "grad_norm": 2.3247005939483643, + "learning_rate": 4.935311485632788e-05, + "loss": 3.8036, + "step": 12203 + }, + { + "epoch": 0.07258064516129033, + "grad_norm": 2.4917871952056885, + "learning_rate": 4.9353009282356974e-05, + "loss": 3.6734, + "step": 12204 + }, + { + "epoch": 0.07258659244457132, + "grad_norm": 2.2535903453826904, + "learning_rate": 4.935290369988468e-05, + "loss": 3.7451, + "step": 12205 + }, + { + "epoch": 0.07259253972785232, + "grad_norm": 2.355896472930908, + "learning_rate": 4.9352798108911036e-05, + "loss": 3.5963, + "step": 12206 + }, + { + "epoch": 0.07259848701113332, + "grad_norm": 2.21923828125, + "learning_rate": 4.935269250943609e-05, + "loss": 3.5492, + "step": 12207 + }, + { + "epoch": 0.07260443429441431, + "grad_norm": 2.3795714378356934, + "learning_rate": 4.935258690145986e-05, + "loss": 3.7146, + "step": 12208 + }, + { + "epoch": 0.07261038157769531, + "grad_norm": 2.3866682052612305, + "learning_rate": 4.93524812849824e-05, + "loss": 3.7359, + "step": 12209 + }, + { + "epoch": 0.07261632886097631, + "grad_norm": 2.411289691925049, + "learning_rate": 4.935237566000374e-05, + "loss": 3.6958, + "step": 12210 + }, + { + "epoch": 0.0726222761442573, + "grad_norm": 2.3831989765167236, + "learning_rate": 4.935227002652392e-05, + "loss": 3.6696, + "step": 12211 + }, + { + "epoch": 0.0726282234275383, + "grad_norm": 2.1831908226013184, + "learning_rate": 4.935216438454297e-05, + "loss": 3.905, + "step": 12212 + }, + { + "epoch": 0.0726341707108193, + "grad_norm": 2.1136345863342285, + "learning_rate": 4.9352058734060934e-05, + "loss": 5.0188, + "step": 12213 + }, + { + "epoch": 0.07264011799410029, + "grad_norm": 2.2617692947387695, + "learning_rate": 4.935195307507784e-05, + "loss": 5.1883, + "step": 12214 + }, + { + "epoch": 0.0726460652773813, + "grad_norm": 2.4442226886749268, + "learning_rate": 4.935184740759374e-05, + "loss": 5.1883, + "step": 12215 + }, + { + "epoch": 0.0726520125606623, + "grad_norm": 2.300234794616699, + "learning_rate": 4.935174173160865e-05, + "loss": 4.9925, + "step": 12216 + }, + { + "epoch": 0.07265795984394328, + "grad_norm": 2.1512858867645264, + "learning_rate": 4.935163604712263e-05, + "loss": 4.883, + "step": 12217 + }, + { + "epoch": 0.07266390712722429, + "grad_norm": 2.210825204849243, + "learning_rate": 4.93515303541357e-05, + "loss": 5.165, + "step": 12218 + }, + { + "epoch": 0.07266985441050527, + "grad_norm": 2.1589086055755615, + "learning_rate": 4.935142465264791e-05, + "loss": 4.931, + "step": 12219 + }, + { + "epoch": 0.07267580169378628, + "grad_norm": 2.0527892112731934, + "learning_rate": 4.935131894265927e-05, + "loss": 5.0566, + "step": 12220 + }, + { + "epoch": 0.07268174897706728, + "grad_norm": 2.202828884124756, + "learning_rate": 4.935121322416985e-05, + "loss": 4.9519, + "step": 12221 + }, + { + "epoch": 0.07268769626034827, + "grad_norm": 2.262834310531616, + "learning_rate": 4.935110749717967e-05, + "loss": 4.9596, + "step": 12222 + }, + { + "epoch": 0.07269364354362927, + "grad_norm": 2.169311761856079, + "learning_rate": 4.935100176168877e-05, + "loss": 4.8968, + "step": 12223 + }, + { + "epoch": 0.07269959082691027, + "grad_norm": 2.137746572494507, + "learning_rate": 4.935089601769719e-05, + "loss": 4.8535, + "step": 12224 + }, + { + "epoch": 0.07270553811019126, + "grad_norm": 2.060861587524414, + "learning_rate": 4.935079026520496e-05, + "loss": 5.0784, + "step": 12225 + }, + { + "epoch": 0.07271148539347226, + "grad_norm": 2.235352039337158, + "learning_rate": 4.935068450421213e-05, + "loss": 4.7351, + "step": 12226 + }, + { + "epoch": 0.07271743267675326, + "grad_norm": 2.3832550048828125, + "learning_rate": 4.935057873471872e-05, + "loss": 4.618, + "step": 12227 + }, + { + "epoch": 0.07272337996003425, + "grad_norm": 2.3591537475585938, + "learning_rate": 4.935047295672477e-05, + "loss": 4.7029, + "step": 12228 + }, + { + "epoch": 0.07272932724331525, + "grad_norm": 2.2797207832336426, + "learning_rate": 4.935036717023033e-05, + "loss": 4.9199, + "step": 12229 + }, + { + "epoch": 0.07273527452659626, + "grad_norm": 2.4931957721710205, + "learning_rate": 4.935026137523542e-05, + "loss": 4.5923, + "step": 12230 + }, + { + "epoch": 0.07274122180987724, + "grad_norm": 2.152064323425293, + "learning_rate": 4.9350155571740095e-05, + "loss": 5.1495, + "step": 12231 + }, + { + "epoch": 0.07274716909315825, + "grad_norm": 2.470526695251465, + "learning_rate": 4.935004975974438e-05, + "loss": 4.8257, + "step": 12232 + }, + { + "epoch": 0.07275311637643925, + "grad_norm": 2.262578248977661, + "learning_rate": 4.9349943939248304e-05, + "loss": 5.7004, + "step": 12233 + }, + { + "epoch": 0.07275906365972024, + "grad_norm": 2.0813188552856445, + "learning_rate": 4.934983811025192e-05, + "loss": 5.6048, + "step": 12234 + }, + { + "epoch": 0.07276501094300124, + "grad_norm": 2.4882686138153076, + "learning_rate": 4.934973227275527e-05, + "loss": 5.8121, + "step": 12235 + }, + { + "epoch": 0.07277095822628224, + "grad_norm": 2.5181429386138916, + "learning_rate": 4.9349626426758364e-05, + "loss": 4.5581, + "step": 12236 + }, + { + "epoch": 0.07277690550956323, + "grad_norm": 2.6369354724884033, + "learning_rate": 4.934952057226127e-05, + "loss": 4.7938, + "step": 12237 + }, + { + "epoch": 0.07278285279284423, + "grad_norm": 1.8615930080413818, + "learning_rate": 4.9349414709264e-05, + "loss": 5.2097, + "step": 12238 + }, + { + "epoch": 0.07278880007612523, + "grad_norm": 1.4905575513839722, + "learning_rate": 4.93493088377666e-05, + "loss": 5.5717, + "step": 12239 + }, + { + "epoch": 0.07279474735940622, + "grad_norm": 1.8339897394180298, + "learning_rate": 4.9349202957769106e-05, + "loss": 5.6908, + "step": 12240 + }, + { + "epoch": 0.07280069464268722, + "grad_norm": 1.5875110626220703, + "learning_rate": 4.934909706927156e-05, + "loss": 5.6246, + "step": 12241 + }, + { + "epoch": 0.07280664192596822, + "grad_norm": 1.8365919589996338, + "learning_rate": 4.934899117227399e-05, + "loss": 5.394, + "step": 12242 + }, + { + "epoch": 0.07281258920924921, + "grad_norm": 1.9548145532608032, + "learning_rate": 4.934888526677645e-05, + "loss": 5.2427, + "step": 12243 + }, + { + "epoch": 0.07281853649253021, + "grad_norm": 1.8174974918365479, + "learning_rate": 4.934877935277896e-05, + "loss": 5.5844, + "step": 12244 + }, + { + "epoch": 0.07282448377581122, + "grad_norm": 1.800117015838623, + "learning_rate": 4.934867343028157e-05, + "loss": 4.9386, + "step": 12245 + }, + { + "epoch": 0.0728304310590922, + "grad_norm": 2.0356900691986084, + "learning_rate": 4.93485674992843e-05, + "loss": 4.6911, + "step": 12246 + }, + { + "epoch": 0.0728363783423732, + "grad_norm": 2.009455442428589, + "learning_rate": 4.93484615597872e-05, + "loss": 4.6121, + "step": 12247 + }, + { + "epoch": 0.0728423256256542, + "grad_norm": 1.9252879619598389, + "learning_rate": 4.934835561179031e-05, + "loss": 4.737, + "step": 12248 + }, + { + "epoch": 0.0728482729089352, + "grad_norm": 2.3497977256774902, + "learning_rate": 4.934824965529365e-05, + "loss": 5.6921, + "step": 12249 + }, + { + "epoch": 0.0728542201922162, + "grad_norm": 2.0821962356567383, + "learning_rate": 4.934814369029727e-05, + "loss": 5.3845, + "step": 12250 + }, + { + "epoch": 0.07286016747549719, + "grad_norm": 1.9725046157836914, + "learning_rate": 4.934803771680121e-05, + "loss": 5.5557, + "step": 12251 + }, + { + "epoch": 0.07286611475877819, + "grad_norm": 2.290238618850708, + "learning_rate": 4.93479317348055e-05, + "loss": 5.4258, + "step": 12252 + }, + { + "epoch": 0.07287206204205919, + "grad_norm": 1.9502376317977905, + "learning_rate": 4.934782574431017e-05, + "loss": 5.0531, + "step": 12253 + }, + { + "epoch": 0.07287800932534018, + "grad_norm": 2.128431797027588, + "learning_rate": 4.9347719745315275e-05, + "loss": 5.0241, + "step": 12254 + }, + { + "epoch": 0.07288395660862118, + "grad_norm": 1.9173803329467773, + "learning_rate": 4.934761373782084e-05, + "loss": 5.7107, + "step": 12255 + }, + { + "epoch": 0.07288990389190218, + "grad_norm": 1.5167652368545532, + "learning_rate": 4.93475077218269e-05, + "loss": 5.2304, + "step": 12256 + }, + { + "epoch": 0.07289585117518317, + "grad_norm": 1.4125497341156006, + "learning_rate": 4.9347401697333505e-05, + "loss": 5.1099, + "step": 12257 + }, + { + "epoch": 0.07290179845846417, + "grad_norm": 2.384801149368286, + "learning_rate": 4.934729566434068e-05, + "loss": 5.0051, + "step": 12258 + }, + { + "epoch": 0.07290774574174518, + "grad_norm": 1.9343961477279663, + "learning_rate": 4.934718962284846e-05, + "loss": 5.3367, + "step": 12259 + }, + { + "epoch": 0.07291369302502616, + "grad_norm": 2.048220157623291, + "learning_rate": 4.93470835728569e-05, + "loss": 5.8502, + "step": 12260 + }, + { + "epoch": 0.07291964030830717, + "grad_norm": 2.037167549133301, + "learning_rate": 4.934697751436601e-05, + "loss": 5.1993, + "step": 12261 + }, + { + "epoch": 0.07292558759158817, + "grad_norm": 1.8141452074050903, + "learning_rate": 4.9346871447375854e-05, + "loss": 5.8308, + "step": 12262 + }, + { + "epoch": 0.07293153487486916, + "grad_norm": 1.7525955438613892, + "learning_rate": 4.934676537188645e-05, + "loss": 5.5946, + "step": 12263 + }, + { + "epoch": 0.07293748215815016, + "grad_norm": 1.9784163236618042, + "learning_rate": 4.9346659287897846e-05, + "loss": 5.7214, + "step": 12264 + }, + { + "epoch": 0.07294342944143116, + "grad_norm": 1.8948242664337158, + "learning_rate": 4.934655319541007e-05, + "loss": 5.7434, + "step": 12265 + }, + { + "epoch": 0.07294937672471215, + "grad_norm": 1.698625087738037, + "learning_rate": 4.934644709442317e-05, + "loss": 5.7828, + "step": 12266 + }, + { + "epoch": 0.07295532400799315, + "grad_norm": 1.6057854890823364, + "learning_rate": 4.934634098493717e-05, + "loss": 5.8815, + "step": 12267 + }, + { + "epoch": 0.07296127129127415, + "grad_norm": 1.4753777980804443, + "learning_rate": 4.9346234866952125e-05, + "loss": 5.8368, + "step": 12268 + }, + { + "epoch": 0.07296721857455514, + "grad_norm": 1.8265280723571777, + "learning_rate": 4.9346128740468046e-05, + "loss": 5.7511, + "step": 12269 + }, + { + "epoch": 0.07297316585783614, + "grad_norm": 1.7212530374526978, + "learning_rate": 4.9346022605485e-05, + "loss": 5.6741, + "step": 12270 + }, + { + "epoch": 0.07297911314111714, + "grad_norm": 1.8423148393630981, + "learning_rate": 4.9345916462002996e-05, + "loss": 5.5199, + "step": 12271 + }, + { + "epoch": 0.07298506042439813, + "grad_norm": 1.7754487991333008, + "learning_rate": 4.934581031002209e-05, + "loss": 5.9655, + "step": 12272 + }, + { + "epoch": 0.07299100770767913, + "grad_norm": 1.794704794883728, + "learning_rate": 4.9345704149542313e-05, + "loss": 5.886, + "step": 12273 + }, + { + "epoch": 0.07299695499096014, + "grad_norm": 1.807165503501892, + "learning_rate": 4.93455979805637e-05, + "loss": 5.5222, + "step": 12274 + }, + { + "epoch": 0.07300290227424112, + "grad_norm": 1.6476585865020752, + "learning_rate": 4.934549180308629e-05, + "loss": 5.6588, + "step": 12275 + }, + { + "epoch": 0.07300884955752213, + "grad_norm": 1.8332840204238892, + "learning_rate": 4.9345385617110125e-05, + "loss": 5.0781, + "step": 12276 + }, + { + "epoch": 0.07301479684080311, + "grad_norm": 1.837471842765808, + "learning_rate": 4.934527942263523e-05, + "loss": 5.8881, + "step": 12277 + }, + { + "epoch": 0.07302074412408412, + "grad_norm": 1.538299798965454, + "learning_rate": 4.934517321966165e-05, + "loss": 6.0547, + "step": 12278 + }, + { + "epoch": 0.07302669140736512, + "grad_norm": 1.9346814155578613, + "learning_rate": 4.934506700818943e-05, + "loss": 5.7853, + "step": 12279 + }, + { + "epoch": 0.0730326386906461, + "grad_norm": 1.9108514785766602, + "learning_rate": 4.93449607882186e-05, + "loss": 5.8034, + "step": 12280 + }, + { + "epoch": 0.07303858597392711, + "grad_norm": 2.0216846466064453, + "learning_rate": 4.934485455974919e-05, + "loss": 5.5127, + "step": 12281 + }, + { + "epoch": 0.07304453325720811, + "grad_norm": 2.2365148067474365, + "learning_rate": 4.9344748322781244e-05, + "loss": 5.5519, + "step": 12282 + }, + { + "epoch": 0.0730504805404891, + "grad_norm": 1.872934103012085, + "learning_rate": 4.934464207731479e-05, + "loss": 5.783, + "step": 12283 + }, + { + "epoch": 0.0730564278237701, + "grad_norm": 1.944606900215149, + "learning_rate": 4.934453582334988e-05, + "loss": 5.9803, + "step": 12284 + }, + { + "epoch": 0.0730623751070511, + "grad_norm": 1.765257477760315, + "learning_rate": 4.934442956088654e-05, + "loss": 5.8434, + "step": 12285 + }, + { + "epoch": 0.07306832239033209, + "grad_norm": 1.9726130962371826, + "learning_rate": 4.934432328992482e-05, + "loss": 5.6173, + "step": 12286 + }, + { + "epoch": 0.0730742696736131, + "grad_norm": 2.0510616302490234, + "learning_rate": 4.934421701046474e-05, + "loss": 5.4661, + "step": 12287 + }, + { + "epoch": 0.0730802169568941, + "grad_norm": 1.6038832664489746, + "learning_rate": 4.934411072250635e-05, + "loss": 5.2786, + "step": 12288 + }, + { + "epoch": 0.07308616424017508, + "grad_norm": 2.0088446140289307, + "learning_rate": 4.934400442604968e-05, + "loss": 4.9999, + "step": 12289 + }, + { + "epoch": 0.07309211152345609, + "grad_norm": 1.4760913848876953, + "learning_rate": 4.934389812109477e-05, + "loss": 4.785, + "step": 12290 + }, + { + "epoch": 0.07309805880673709, + "grad_norm": 2.2036757469177246, + "learning_rate": 4.934379180764166e-05, + "loss": 5.8303, + "step": 12291 + }, + { + "epoch": 0.07310400609001808, + "grad_norm": 2.0261359214782715, + "learning_rate": 4.9343685485690385e-05, + "loss": 5.6823, + "step": 12292 + }, + { + "epoch": 0.07310995337329908, + "grad_norm": 1.7493160963058472, + "learning_rate": 4.934357915524097e-05, + "loss": 5.6144, + "step": 12293 + }, + { + "epoch": 0.07311590065658008, + "grad_norm": 1.887373685836792, + "learning_rate": 4.934347281629347e-05, + "loss": 5.9405, + "step": 12294 + }, + { + "epoch": 0.07312184793986107, + "grad_norm": 1.6655008792877197, + "learning_rate": 4.9343366468847915e-05, + "loss": 5.8376, + "step": 12295 + }, + { + "epoch": 0.07312779522314207, + "grad_norm": 1.9241079092025757, + "learning_rate": 4.9343260112904345e-05, + "loss": 5.6072, + "step": 12296 + }, + { + "epoch": 0.07313374250642307, + "grad_norm": 1.7873997688293457, + "learning_rate": 4.934315374846279e-05, + "loss": 5.539, + "step": 12297 + }, + { + "epoch": 0.07313968978970406, + "grad_norm": 1.9266597032546997, + "learning_rate": 4.9343047375523296e-05, + "loss": 5.3921, + "step": 12298 + }, + { + "epoch": 0.07314563707298506, + "grad_norm": 1.9283325672149658, + "learning_rate": 4.934294099408589e-05, + "loss": 5.2326, + "step": 12299 + }, + { + "epoch": 0.07315158435626606, + "grad_norm": 1.739047884941101, + "learning_rate": 4.934283460415062e-05, + "loss": 5.4831, + "step": 12300 + }, + { + "epoch": 0.07315753163954705, + "grad_norm": 1.6729072332382202, + "learning_rate": 4.934272820571752e-05, + "loss": 5.633, + "step": 12301 + }, + { + "epoch": 0.07316347892282805, + "grad_norm": 1.6901992559432983, + "learning_rate": 4.9342621798786616e-05, + "loss": 5.6121, + "step": 12302 + }, + { + "epoch": 0.07316942620610906, + "grad_norm": 1.8640037775039673, + "learning_rate": 4.9342515383357956e-05, + "loss": 5.6498, + "step": 12303 + }, + { + "epoch": 0.07317537348939004, + "grad_norm": 1.9629018306732178, + "learning_rate": 4.9342408959431576e-05, + "loss": 5.9364, + "step": 12304 + }, + { + "epoch": 0.07318132077267105, + "grad_norm": 1.9370427131652832, + "learning_rate": 4.934230252700752e-05, + "loss": 5.8945, + "step": 12305 + }, + { + "epoch": 0.07318726805595203, + "grad_norm": 1.6541575193405151, + "learning_rate": 4.9342196086085814e-05, + "loss": 5.5826, + "step": 12306 + }, + { + "epoch": 0.07319321533923304, + "grad_norm": 1.6640154123306274, + "learning_rate": 4.934208963666649e-05, + "loss": 5.7065, + "step": 12307 + }, + { + "epoch": 0.07319916262251404, + "grad_norm": 1.596665620803833, + "learning_rate": 4.934198317874961e-05, + "loss": 5.6764, + "step": 12308 + }, + { + "epoch": 0.07320510990579503, + "grad_norm": 1.841260552406311, + "learning_rate": 4.9341876712335176e-05, + "loss": 5.624, + "step": 12309 + }, + { + "epoch": 0.07321105718907603, + "grad_norm": 1.921162724494934, + "learning_rate": 4.9341770237423254e-05, + "loss": 5.3177, + "step": 12310 + }, + { + "epoch": 0.07321700447235703, + "grad_norm": 1.844192624092102, + "learning_rate": 4.934166375401388e-05, + "loss": 5.6236, + "step": 12311 + }, + { + "epoch": 0.07322295175563802, + "grad_norm": 1.9088208675384521, + "learning_rate": 4.934155726210707e-05, + "loss": 5.7487, + "step": 12312 + }, + { + "epoch": 0.07322889903891902, + "grad_norm": 2.1057817935943604, + "learning_rate": 4.934145076170288e-05, + "loss": 5.3372, + "step": 12313 + }, + { + "epoch": 0.07323484632220002, + "grad_norm": 1.9507678747177124, + "learning_rate": 4.9341344252801335e-05, + "loss": 5.9318, + "step": 12314 + }, + { + "epoch": 0.07324079360548101, + "grad_norm": 1.9885265827178955, + "learning_rate": 4.934123773540249e-05, + "loss": 5.7724, + "step": 12315 + }, + { + "epoch": 0.07324674088876201, + "grad_norm": 1.81960129737854, + "learning_rate": 4.934113120950636e-05, + "loss": 5.7624, + "step": 12316 + }, + { + "epoch": 0.07325268817204302, + "grad_norm": 1.7848392724990845, + "learning_rate": 4.9341024675112994e-05, + "loss": 5.8135, + "step": 12317 + }, + { + "epoch": 0.073258635455324, + "grad_norm": 1.8326808214187622, + "learning_rate": 4.9340918132222436e-05, + "loss": 5.9725, + "step": 12318 + }, + { + "epoch": 0.073264582738605, + "grad_norm": 1.731719970703125, + "learning_rate": 4.93408115808347e-05, + "loss": 5.8932, + "step": 12319 + }, + { + "epoch": 0.07327053002188601, + "grad_norm": 1.7635269165039062, + "learning_rate": 4.934070502094985e-05, + "loss": 5.4953, + "step": 12320 + }, + { + "epoch": 0.073276477305167, + "grad_norm": 1.61715829372406, + "learning_rate": 4.934059845256791e-05, + "loss": 5.4043, + "step": 12321 + }, + { + "epoch": 0.073282424588448, + "grad_norm": 1.9188543558120728, + "learning_rate": 4.9340491875688914e-05, + "loss": 5.2762, + "step": 12322 + }, + { + "epoch": 0.073288371871729, + "grad_norm": 2.098680019378662, + "learning_rate": 4.9340385290312904e-05, + "loss": 5.4673, + "step": 12323 + }, + { + "epoch": 0.07329431915500999, + "grad_norm": 2.15560245513916, + "learning_rate": 4.934027869643992e-05, + "loss": 5.9124, + "step": 12324 + }, + { + "epoch": 0.07330026643829099, + "grad_norm": 1.9819902181625366, + "learning_rate": 4.934017209407e-05, + "loss": 5.5686, + "step": 12325 + }, + { + "epoch": 0.07330621372157199, + "grad_norm": 2.517003059387207, + "learning_rate": 4.934006548320317e-05, + "loss": 3.9751, + "step": 12326 + }, + { + "epoch": 0.07331216100485298, + "grad_norm": 2.458714723587036, + "learning_rate": 4.9339958863839474e-05, + "loss": 3.7976, + "step": 12327 + }, + { + "epoch": 0.07331810828813398, + "grad_norm": 2.2642102241516113, + "learning_rate": 4.9339852235978955e-05, + "loss": 3.8853, + "step": 12328 + }, + { + "epoch": 0.07332405557141498, + "grad_norm": 2.3097565174102783, + "learning_rate": 4.9339745599621645e-05, + "loss": 3.5699, + "step": 12329 + }, + { + "epoch": 0.07333000285469597, + "grad_norm": 2.312995195388794, + "learning_rate": 4.933963895476758e-05, + "loss": 3.8338, + "step": 12330 + }, + { + "epoch": 0.07333595013797697, + "grad_norm": 2.69657826423645, + "learning_rate": 4.93395323014168e-05, + "loss": 5.3459, + "step": 12331 + }, + { + "epoch": 0.07334189742125798, + "grad_norm": 2.263038396835327, + "learning_rate": 4.9339425639569336e-05, + "loss": 5.712, + "step": 12332 + }, + { + "epoch": 0.07334784470453896, + "grad_norm": 1.9429599046707153, + "learning_rate": 4.9339318969225235e-05, + "loss": 5.7465, + "step": 12333 + }, + { + "epoch": 0.07335379198781997, + "grad_norm": 2.07045841217041, + "learning_rate": 4.933921229038453e-05, + "loss": 5.6726, + "step": 12334 + }, + { + "epoch": 0.07335973927110095, + "grad_norm": 2.0304102897644043, + "learning_rate": 4.933910560304725e-05, + "loss": 5.8084, + "step": 12335 + }, + { + "epoch": 0.07336568655438196, + "grad_norm": 1.8316701650619507, + "learning_rate": 4.933899890721344e-05, + "loss": 5.3852, + "step": 12336 + }, + { + "epoch": 0.07337163383766296, + "grad_norm": 2.1406614780426025, + "learning_rate": 4.933889220288315e-05, + "loss": 5.1097, + "step": 12337 + }, + { + "epoch": 0.07337758112094395, + "grad_norm": 1.7518030405044556, + "learning_rate": 4.9338785490056395e-05, + "loss": 5.2038, + "step": 12338 + }, + { + "epoch": 0.07338352840422495, + "grad_norm": 1.8387973308563232, + "learning_rate": 4.933867876873322e-05, + "loss": 5.0847, + "step": 12339 + }, + { + "epoch": 0.07338947568750595, + "grad_norm": 1.692947506904602, + "learning_rate": 4.933857203891367e-05, + "loss": 5.6124, + "step": 12340 + }, + { + "epoch": 0.07339542297078694, + "grad_norm": 1.6367069482803345, + "learning_rate": 4.933846530059776e-05, + "loss": 5.7119, + "step": 12341 + }, + { + "epoch": 0.07340137025406794, + "grad_norm": 2.0395610332489014, + "learning_rate": 4.933835855378556e-05, + "loss": 5.4164, + "step": 12342 + }, + { + "epoch": 0.07340731753734894, + "grad_norm": 2.074073314666748, + "learning_rate": 4.933825179847709e-05, + "loss": 5.3952, + "step": 12343 + }, + { + "epoch": 0.07341326482062993, + "grad_norm": 2.2825684547424316, + "learning_rate": 4.9338145034672376e-05, + "loss": 5.4019, + "step": 12344 + }, + { + "epoch": 0.07341921210391093, + "grad_norm": 2.006591796875, + "learning_rate": 4.9338038262371476e-05, + "loss": 5.4422, + "step": 12345 + }, + { + "epoch": 0.07342515938719194, + "grad_norm": 2.10418701171875, + "learning_rate": 4.9337931481574415e-05, + "loss": 5.3801, + "step": 12346 + }, + { + "epoch": 0.07343110667047292, + "grad_norm": 1.9998257160186768, + "learning_rate": 4.9337824692281233e-05, + "loss": 5.1673, + "step": 12347 + }, + { + "epoch": 0.07343705395375393, + "grad_norm": 2.175896644592285, + "learning_rate": 4.933771789449197e-05, + "loss": 5.118, + "step": 12348 + }, + { + "epoch": 0.07344300123703493, + "grad_norm": 2.075164318084717, + "learning_rate": 4.933761108820666e-05, + "loss": 5.1662, + "step": 12349 + }, + { + "epoch": 0.07344894852031592, + "grad_norm": 2.0672569274902344, + "learning_rate": 4.933750427342534e-05, + "loss": 5.0957, + "step": 12350 + }, + { + "epoch": 0.07345489580359692, + "grad_norm": 2.0570287704467773, + "learning_rate": 4.9337397450148055e-05, + "loss": 5.2772, + "step": 12351 + }, + { + "epoch": 0.07346084308687792, + "grad_norm": 2.0653116703033447, + "learning_rate": 4.933729061837483e-05, + "loss": 5.4755, + "step": 12352 + }, + { + "epoch": 0.07346679037015891, + "grad_norm": 2.832578420639038, + "learning_rate": 4.933718377810571e-05, + "loss": 4.8128, + "step": 12353 + }, + { + "epoch": 0.07347273765343991, + "grad_norm": 2.378556251525879, + "learning_rate": 4.933707692934073e-05, + "loss": 5.109, + "step": 12354 + }, + { + "epoch": 0.07347868493672091, + "grad_norm": 2.1819205284118652, + "learning_rate": 4.933697007207993e-05, + "loss": 4.8603, + "step": 12355 + }, + { + "epoch": 0.0734846322200019, + "grad_norm": 2.104738473892212, + "learning_rate": 4.9336863206323345e-05, + "loss": 4.7806, + "step": 12356 + }, + { + "epoch": 0.0734905795032829, + "grad_norm": 1.8287266492843628, + "learning_rate": 4.933675633207101e-05, + "loss": 4.7082, + "step": 12357 + }, + { + "epoch": 0.0734965267865639, + "grad_norm": 2.0478014945983887, + "learning_rate": 4.933664944932297e-05, + "loss": 4.6145, + "step": 12358 + }, + { + "epoch": 0.07350247406984489, + "grad_norm": 2.208263397216797, + "learning_rate": 4.9336542558079244e-05, + "loss": 4.7523, + "step": 12359 + }, + { + "epoch": 0.0735084213531259, + "grad_norm": 2.1506083011627197, + "learning_rate": 4.93364356583399e-05, + "loss": 4.7444, + "step": 12360 + }, + { + "epoch": 0.0735143686364069, + "grad_norm": 2.04584002494812, + "learning_rate": 4.933632875010494e-05, + "loss": 4.6706, + "step": 12361 + }, + { + "epoch": 0.07352031591968788, + "grad_norm": 1.8598030805587769, + "learning_rate": 4.933622183337443e-05, + "loss": 4.6404, + "step": 12362 + }, + { + "epoch": 0.07352626320296889, + "grad_norm": 2.5650441646575928, + "learning_rate": 4.93361149081484e-05, + "loss": 5.382, + "step": 12363 + }, + { + "epoch": 0.07353221048624987, + "grad_norm": 2.1182446479797363, + "learning_rate": 4.933600797442688e-05, + "loss": 5.9041, + "step": 12364 + }, + { + "epoch": 0.07353815776953088, + "grad_norm": 1.8753353357315063, + "learning_rate": 4.933590103220991e-05, + "loss": 5.6615, + "step": 12365 + }, + { + "epoch": 0.07354410505281188, + "grad_norm": 1.9428893327713013, + "learning_rate": 4.933579408149752e-05, + "loss": 5.3549, + "step": 12366 + }, + { + "epoch": 0.07355005233609287, + "grad_norm": 1.809191346168518, + "learning_rate": 4.9335687122289766e-05, + "loss": 5.5603, + "step": 12367 + }, + { + "epoch": 0.07355599961937387, + "grad_norm": 1.7782649993896484, + "learning_rate": 4.933558015458667e-05, + "loss": 5.2848, + "step": 12368 + }, + { + "epoch": 0.07356194690265487, + "grad_norm": 1.71909499168396, + "learning_rate": 4.933547317838828e-05, + "loss": 5.3774, + "step": 12369 + }, + { + "epoch": 0.07356789418593586, + "grad_norm": 1.6399723291397095, + "learning_rate": 4.9335366193694625e-05, + "loss": 5.629, + "step": 12370 + }, + { + "epoch": 0.07357384146921686, + "grad_norm": 1.8646855354309082, + "learning_rate": 4.9335259200505746e-05, + "loss": 5.6297, + "step": 12371 + }, + { + "epoch": 0.07357978875249786, + "grad_norm": 1.5271104574203491, + "learning_rate": 4.9335152198821676e-05, + "loss": 5.6112, + "step": 12372 + }, + { + "epoch": 0.07358573603577885, + "grad_norm": 1.6217905282974243, + "learning_rate": 4.933504518864246e-05, + "loss": 5.2959, + "step": 12373 + }, + { + "epoch": 0.07359168331905985, + "grad_norm": 1.5774266719818115, + "learning_rate": 4.933493816996812e-05, + "loss": 5.4181, + "step": 12374 + }, + { + "epoch": 0.07359763060234085, + "grad_norm": 1.3641432523727417, + "learning_rate": 4.933483114279872e-05, + "loss": 5.3903, + "step": 12375 + }, + { + "epoch": 0.07360357788562184, + "grad_norm": 1.67635178565979, + "learning_rate": 4.933472410713428e-05, + "loss": 5.6771, + "step": 12376 + }, + { + "epoch": 0.07360952516890285, + "grad_norm": 1.6944624185562134, + "learning_rate": 4.933461706297483e-05, + "loss": 5.6008, + "step": 12377 + }, + { + "epoch": 0.07361547245218385, + "grad_norm": 1.3603699207305908, + "learning_rate": 4.933451001032042e-05, + "loss": 5.5396, + "step": 12378 + }, + { + "epoch": 0.07362141973546484, + "grad_norm": 1.6585369110107422, + "learning_rate": 4.9334402949171086e-05, + "loss": 5.5697, + "step": 12379 + }, + { + "epoch": 0.07362736701874584, + "grad_norm": 1.503786563873291, + "learning_rate": 4.9334295879526865e-05, + "loss": 5.4539, + "step": 12380 + }, + { + "epoch": 0.07363331430202684, + "grad_norm": 1.4761176109313965, + "learning_rate": 4.933418880138779e-05, + "loss": 5.4573, + "step": 12381 + }, + { + "epoch": 0.07363926158530783, + "grad_norm": 1.671972393989563, + "learning_rate": 4.93340817147539e-05, + "loss": 5.4143, + "step": 12382 + }, + { + "epoch": 0.07364520886858883, + "grad_norm": 1.5486379861831665, + "learning_rate": 4.9333974619625236e-05, + "loss": 5.4134, + "step": 12383 + }, + { + "epoch": 0.07365115615186983, + "grad_norm": 1.340108036994934, + "learning_rate": 4.933386751600183e-05, + "loss": 5.4587, + "step": 12384 + }, + { + "epoch": 0.07365710343515082, + "grad_norm": 1.3910952806472778, + "learning_rate": 4.933376040388372e-05, + "loss": 5.4129, + "step": 12385 + }, + { + "epoch": 0.07366305071843182, + "grad_norm": 1.5878056287765503, + "learning_rate": 4.9333653283270955e-05, + "loss": 5.3633, + "step": 12386 + }, + { + "epoch": 0.07366899800171282, + "grad_norm": 1.6040968894958496, + "learning_rate": 4.933354615416356e-05, + "loss": 5.2486, + "step": 12387 + }, + { + "epoch": 0.07367494528499381, + "grad_norm": 1.4824137687683105, + "learning_rate": 4.933343901656157e-05, + "loss": 5.2947, + "step": 12388 + }, + { + "epoch": 0.07368089256827481, + "grad_norm": 1.6114120483398438, + "learning_rate": 4.933333187046503e-05, + "loss": 5.2948, + "step": 12389 + }, + { + "epoch": 0.07368683985155582, + "grad_norm": 1.4269661903381348, + "learning_rate": 4.933322471587398e-05, + "loss": 5.1633, + "step": 12390 + }, + { + "epoch": 0.0736927871348368, + "grad_norm": 1.430588960647583, + "learning_rate": 4.933311755278844e-05, + "loss": 5.2846, + "step": 12391 + }, + { + "epoch": 0.0736987344181178, + "grad_norm": 1.3490641117095947, + "learning_rate": 4.9333010381208476e-05, + "loss": 5.2067, + "step": 12392 + }, + { + "epoch": 0.0737046817013988, + "grad_norm": 1.9292722940444946, + "learning_rate": 4.9332903201134104e-05, + "loss": 5.6196, + "step": 12393 + }, + { + "epoch": 0.0737106289846798, + "grad_norm": 1.8885586261749268, + "learning_rate": 4.933279601256536e-05, + "loss": 5.5225, + "step": 12394 + }, + { + "epoch": 0.0737165762679608, + "grad_norm": 1.5985313653945923, + "learning_rate": 4.93326888155023e-05, + "loss": 5.7447, + "step": 12395 + }, + { + "epoch": 0.07372252355124179, + "grad_norm": 2.819392681121826, + "learning_rate": 4.933258160994494e-05, + "loss": 6.002, + "step": 12396 + }, + { + "epoch": 0.07372847083452279, + "grad_norm": 2.006615161895752, + "learning_rate": 4.933247439589333e-05, + "loss": 5.7733, + "step": 12397 + }, + { + "epoch": 0.07373441811780379, + "grad_norm": 1.628408432006836, + "learning_rate": 4.933236717334751e-05, + "loss": 5.3899, + "step": 12398 + }, + { + "epoch": 0.07374036540108478, + "grad_norm": 1.5265247821807861, + "learning_rate": 4.93322599423075e-05, + "loss": 5.3891, + "step": 12399 + }, + { + "epoch": 0.07374631268436578, + "grad_norm": 1.6663800477981567, + "learning_rate": 4.933215270277336e-05, + "loss": 5.6172, + "step": 12400 + }, + { + "epoch": 0.07375225996764678, + "grad_norm": 1.7699551582336426, + "learning_rate": 4.933204545474511e-05, + "loss": 5.7088, + "step": 12401 + }, + { + "epoch": 0.07375820725092777, + "grad_norm": 1.5542314052581787, + "learning_rate": 4.93319381982228e-05, + "loss": 5.5925, + "step": 12402 + }, + { + "epoch": 0.07376415453420877, + "grad_norm": 1.5389710664749146, + "learning_rate": 4.933183093320646e-05, + "loss": 5.572, + "step": 12403 + }, + { + "epoch": 0.07377010181748977, + "grad_norm": 1.381242275238037, + "learning_rate": 4.9331723659696124e-05, + "loss": 5.4964, + "step": 12404 + }, + { + "epoch": 0.07377604910077076, + "grad_norm": 1.5536670684814453, + "learning_rate": 4.933161637769184e-05, + "loss": 5.3748, + "step": 12405 + }, + { + "epoch": 0.07378199638405177, + "grad_norm": 1.6656473875045776, + "learning_rate": 4.933150908719364e-05, + "loss": 5.3267, + "step": 12406 + }, + { + "epoch": 0.07378794366733277, + "grad_norm": 1.9200701713562012, + "learning_rate": 4.933140178820156e-05, + "loss": 5.2928, + "step": 12407 + }, + { + "epoch": 0.07379389095061376, + "grad_norm": 1.6290313005447388, + "learning_rate": 4.933129448071564e-05, + "loss": 5.4969, + "step": 12408 + }, + { + "epoch": 0.07379983823389476, + "grad_norm": 1.7247267961502075, + "learning_rate": 4.933118716473592e-05, + "loss": 5.564, + "step": 12409 + }, + { + "epoch": 0.07380578551717576, + "grad_norm": 1.4726417064666748, + "learning_rate": 4.933107984026243e-05, + "loss": 5.1759, + "step": 12410 + }, + { + "epoch": 0.07381173280045675, + "grad_norm": 1.4726674556732178, + "learning_rate": 4.933097250729522e-05, + "loss": 5.1731, + "step": 12411 + }, + { + "epoch": 0.07381768008373775, + "grad_norm": 1.4694938659667969, + "learning_rate": 4.93308651658343e-05, + "loss": 5.4539, + "step": 12412 + }, + { + "epoch": 0.07382362736701875, + "grad_norm": 1.5212653875350952, + "learning_rate": 4.9330757815879734e-05, + "loss": 5.5035, + "step": 12413 + }, + { + "epoch": 0.07382957465029974, + "grad_norm": 1.3731454610824585, + "learning_rate": 4.933065045743156e-05, + "loss": 5.415, + "step": 12414 + }, + { + "epoch": 0.07383552193358074, + "grad_norm": 1.5576610565185547, + "learning_rate": 4.93305430904898e-05, + "loss": 5.2776, + "step": 12415 + }, + { + "epoch": 0.07384146921686174, + "grad_norm": 1.72965407371521, + "learning_rate": 4.93304357150545e-05, + "loss": 5.3598, + "step": 12416 + }, + { + "epoch": 0.07384741650014273, + "grad_norm": 1.5218521356582642, + "learning_rate": 4.93303283311257e-05, + "loss": 5.295, + "step": 12417 + }, + { + "epoch": 0.07385336378342373, + "grad_norm": 1.5174230337142944, + "learning_rate": 4.933022093870343e-05, + "loss": 5.3506, + "step": 12418 + }, + { + "epoch": 0.07385931106670474, + "grad_norm": 1.3844187259674072, + "learning_rate": 4.933011353778773e-05, + "loss": 5.4345, + "step": 12419 + }, + { + "epoch": 0.07386525834998572, + "grad_norm": 1.5130188465118408, + "learning_rate": 4.9330006128378645e-05, + "loss": 5.4359, + "step": 12420 + }, + { + "epoch": 0.07387120563326673, + "grad_norm": 1.599004864692688, + "learning_rate": 4.93298987104762e-05, + "loss": 5.1631, + "step": 12421 + }, + { + "epoch": 0.07387715291654771, + "grad_norm": 1.6220343112945557, + "learning_rate": 4.932979128408044e-05, + "loss": 5.1244, + "step": 12422 + }, + { + "epoch": 0.07388310019982872, + "grad_norm": 1.5366616249084473, + "learning_rate": 4.93296838491914e-05, + "loss": 5.0368, + "step": 12423 + }, + { + "epoch": 0.07388904748310972, + "grad_norm": 1.5800726413726807, + "learning_rate": 4.932957640580912e-05, + "loss": 4.9906, + "step": 12424 + }, + { + "epoch": 0.0738949947663907, + "grad_norm": 1.6035537719726562, + "learning_rate": 4.9329468953933637e-05, + "loss": 5.0616, + "step": 12425 + }, + { + "epoch": 0.07390094204967171, + "grad_norm": 1.580127239227295, + "learning_rate": 4.932936149356499e-05, + "loss": 5.145, + "step": 12426 + }, + { + "epoch": 0.07390688933295271, + "grad_norm": 1.724788784980774, + "learning_rate": 4.932925402470321e-05, + "loss": 4.9589, + "step": 12427 + }, + { + "epoch": 0.0739128366162337, + "grad_norm": 1.5442367792129517, + "learning_rate": 4.932914654734834e-05, + "loss": 5.077, + "step": 12428 + }, + { + "epoch": 0.0739187838995147, + "grad_norm": 1.3692456483840942, + "learning_rate": 4.932903906150042e-05, + "loss": 5.1778, + "step": 12429 + }, + { + "epoch": 0.0739247311827957, + "grad_norm": 1.8229175806045532, + "learning_rate": 4.932893156715948e-05, + "loss": 5.4053, + "step": 12430 + }, + { + "epoch": 0.07393067846607669, + "grad_norm": 1.7769286632537842, + "learning_rate": 4.9328824064325566e-05, + "loss": 5.2541, + "step": 12431 + }, + { + "epoch": 0.07393662574935769, + "grad_norm": 1.7022631168365479, + "learning_rate": 4.93287165529987e-05, + "loss": 4.8555, + "step": 12432 + }, + { + "epoch": 0.0739425730326387, + "grad_norm": 1.5031015872955322, + "learning_rate": 4.932860903317894e-05, + "loss": 5.019, + "step": 12433 + }, + { + "epoch": 0.07394852031591968, + "grad_norm": 1.352550983428955, + "learning_rate": 4.932850150486631e-05, + "loss": 5.239, + "step": 12434 + }, + { + "epoch": 0.07395446759920069, + "grad_norm": 1.5571177005767822, + "learning_rate": 4.932839396806085e-05, + "loss": 5.2511, + "step": 12435 + }, + { + "epoch": 0.07396041488248169, + "grad_norm": 1.7673511505126953, + "learning_rate": 4.93282864227626e-05, + "loss": 5.1811, + "step": 12436 + }, + { + "epoch": 0.07396636216576268, + "grad_norm": 1.6385267972946167, + "learning_rate": 4.932817886897161e-05, + "loss": 5.1644, + "step": 12437 + }, + { + "epoch": 0.07397230944904368, + "grad_norm": 1.6142395734786987, + "learning_rate": 4.932807130668788e-05, + "loss": 5.173, + "step": 12438 + }, + { + "epoch": 0.07397825673232468, + "grad_norm": 1.6966745853424072, + "learning_rate": 4.932796373591149e-05, + "loss": 5.1495, + "step": 12439 + }, + { + "epoch": 0.07398420401560567, + "grad_norm": 1.6631567478179932, + "learning_rate": 4.932785615664245e-05, + "loss": 5.1787, + "step": 12440 + }, + { + "epoch": 0.07399015129888667, + "grad_norm": 1.7747845649719238, + "learning_rate": 4.9327748568880816e-05, + "loss": 5.1303, + "step": 12441 + }, + { + "epoch": 0.07399609858216767, + "grad_norm": 1.457535982131958, + "learning_rate": 4.932764097262661e-05, + "loss": 5.1573, + "step": 12442 + }, + { + "epoch": 0.07400204586544866, + "grad_norm": 1.602452039718628, + "learning_rate": 4.9327533367879875e-05, + "loss": 5.1039, + "step": 12443 + }, + { + "epoch": 0.07400799314872966, + "grad_norm": 1.644687294960022, + "learning_rate": 4.932742575464065e-05, + "loss": 5.3112, + "step": 12444 + }, + { + "epoch": 0.07401394043201066, + "grad_norm": 1.5873420238494873, + "learning_rate": 4.932731813290897e-05, + "loss": 5.1128, + "step": 12445 + }, + { + "epoch": 0.07401988771529165, + "grad_norm": 1.8046668767929077, + "learning_rate": 4.932721050268489e-05, + "loss": 4.9776, + "step": 12446 + }, + { + "epoch": 0.07402583499857265, + "grad_norm": 1.6964846849441528, + "learning_rate": 4.932710286396841e-05, + "loss": 5.0039, + "step": 12447 + }, + { + "epoch": 0.07403178228185366, + "grad_norm": 1.5332229137420654, + "learning_rate": 4.93269952167596e-05, + "loss": 4.9873, + "step": 12448 + }, + { + "epoch": 0.07403772956513464, + "grad_norm": 1.6128625869750977, + "learning_rate": 4.9326887561058485e-05, + "loss": 5.1139, + "step": 12449 + }, + { + "epoch": 0.07404367684841565, + "grad_norm": 1.5800291299819946, + "learning_rate": 4.932677989686511e-05, + "loss": 4.9687, + "step": 12450 + }, + { + "epoch": 0.07404962413169663, + "grad_norm": 1.6543092727661133, + "learning_rate": 4.932667222417951e-05, + "loss": 4.8345, + "step": 12451 + }, + { + "epoch": 0.07405557141497764, + "grad_norm": 1.4438380002975464, + "learning_rate": 4.932656454300171e-05, + "loss": 4.9677, + "step": 12452 + }, + { + "epoch": 0.07406151869825864, + "grad_norm": 1.6437597274780273, + "learning_rate": 4.932645685333176e-05, + "loss": 4.9016, + "step": 12453 + }, + { + "epoch": 0.07406746598153963, + "grad_norm": 1.5359379053115845, + "learning_rate": 4.932634915516969e-05, + "loss": 4.8357, + "step": 12454 + }, + { + "epoch": 0.07407341326482063, + "grad_norm": 1.6683440208435059, + "learning_rate": 4.9326241448515554e-05, + "loss": 4.8715, + "step": 12455 + }, + { + "epoch": 0.07407936054810163, + "grad_norm": 1.5654494762420654, + "learning_rate": 4.932613373336937e-05, + "loss": 4.8993, + "step": 12456 + }, + { + "epoch": 0.07408530783138262, + "grad_norm": 1.5333384275436401, + "learning_rate": 4.932602600973119e-05, + "loss": 4.9181, + "step": 12457 + }, + { + "epoch": 0.07409125511466362, + "grad_norm": 1.5674177408218384, + "learning_rate": 4.9325918277601046e-05, + "loss": 4.905, + "step": 12458 + }, + { + "epoch": 0.07409720239794462, + "grad_norm": 1.410294771194458, + "learning_rate": 4.9325810536978965e-05, + "loss": 4.8645, + "step": 12459 + }, + { + "epoch": 0.07410314968122561, + "grad_norm": 1.4950916767120361, + "learning_rate": 4.9325702787865006e-05, + "loss": 4.8289, + "step": 12460 + }, + { + "epoch": 0.07410909696450661, + "grad_norm": 1.7529935836791992, + "learning_rate": 4.9325595030259195e-05, + "loss": 4.8917, + "step": 12461 + }, + { + "epoch": 0.07411504424778761, + "grad_norm": 3.5575430393218994, + "learning_rate": 4.932548726416157e-05, + "loss": 5.5795, + "step": 12462 + }, + { + "epoch": 0.0741209915310686, + "grad_norm": 1.5091896057128906, + "learning_rate": 4.9325379489572165e-05, + "loss": 4.9864, + "step": 12463 + }, + { + "epoch": 0.0741269388143496, + "grad_norm": 1.6818382740020752, + "learning_rate": 4.932527170649102e-05, + "loss": 5.3386, + "step": 12464 + }, + { + "epoch": 0.07413288609763061, + "grad_norm": 1.7938569784164429, + "learning_rate": 4.932516391491818e-05, + "loss": 5.2668, + "step": 12465 + }, + { + "epoch": 0.0741388333809116, + "grad_norm": 1.89009428024292, + "learning_rate": 4.932505611485367e-05, + "loss": 5.1755, + "step": 12466 + }, + { + "epoch": 0.0741447806641926, + "grad_norm": 1.5277502536773682, + "learning_rate": 4.932494830629753e-05, + "loss": 5.3271, + "step": 12467 + }, + { + "epoch": 0.0741507279474736, + "grad_norm": 1.7720823287963867, + "learning_rate": 4.932484048924981e-05, + "loss": 5.7089, + "step": 12468 + }, + { + "epoch": 0.07415667523075459, + "grad_norm": 1.6797159910202026, + "learning_rate": 4.932473266371054e-05, + "loss": 5.5563, + "step": 12469 + }, + { + "epoch": 0.07416262251403559, + "grad_norm": 1.6536195278167725, + "learning_rate": 4.932462482967976e-05, + "loss": 5.4271, + "step": 12470 + }, + { + "epoch": 0.07416856979731659, + "grad_norm": 1.5667130947113037, + "learning_rate": 4.93245169871575e-05, + "loss": 5.3703, + "step": 12471 + }, + { + "epoch": 0.07417451708059758, + "grad_norm": 1.3659738302230835, + "learning_rate": 4.93244091361438e-05, + "loss": 5.4114, + "step": 12472 + }, + { + "epoch": 0.07418046436387858, + "grad_norm": 1.5106414556503296, + "learning_rate": 4.9324301276638705e-05, + "loss": 5.386, + "step": 12473 + }, + { + "epoch": 0.07418641164715958, + "grad_norm": 1.5054755210876465, + "learning_rate": 4.932419340864225e-05, + "loss": 5.3067, + "step": 12474 + }, + { + "epoch": 0.07419235893044057, + "grad_norm": 1.4413330554962158, + "learning_rate": 4.932408553215446e-05, + "loss": 5.358, + "step": 12475 + }, + { + "epoch": 0.07419830621372157, + "grad_norm": 1.3034652471542358, + "learning_rate": 4.932397764717539e-05, + "loss": 5.2942, + "step": 12476 + }, + { + "epoch": 0.07420425349700258, + "grad_norm": 1.494664192199707, + "learning_rate": 4.9323869753705074e-05, + "loss": 5.4243, + "step": 12477 + }, + { + "epoch": 0.07421020078028356, + "grad_norm": 1.2644178867340088, + "learning_rate": 4.932376185174354e-05, + "loss": 5.2212, + "step": 12478 + }, + { + "epoch": 0.07421614806356457, + "grad_norm": 1.5576590299606323, + "learning_rate": 4.9323653941290836e-05, + "loss": 5.2077, + "step": 12479 + }, + { + "epoch": 0.07422209534684555, + "grad_norm": 1.5699479579925537, + "learning_rate": 4.932354602234699e-05, + "loss": 5.3849, + "step": 12480 + }, + { + "epoch": 0.07422804263012656, + "grad_norm": 1.6582329273223877, + "learning_rate": 4.932343809491205e-05, + "loss": 5.3961, + "step": 12481 + }, + { + "epoch": 0.07423398991340756, + "grad_norm": 1.6159483194351196, + "learning_rate": 4.932333015898605e-05, + "loss": 5.3711, + "step": 12482 + }, + { + "epoch": 0.07423993719668855, + "grad_norm": 1.453933596611023, + "learning_rate": 4.932322221456902e-05, + "loss": 5.2899, + "step": 12483 + }, + { + "epoch": 0.07424588447996955, + "grad_norm": 1.3830047845840454, + "learning_rate": 4.9323114261661014e-05, + "loss": 5.3839, + "step": 12484 + }, + { + "epoch": 0.07425183176325055, + "grad_norm": 1.5541338920593262, + "learning_rate": 4.932300630026205e-05, + "loss": 5.257, + "step": 12485 + }, + { + "epoch": 0.07425777904653154, + "grad_norm": 1.5887267589569092, + "learning_rate": 4.932289833037219e-05, + "loss": 5.2079, + "step": 12486 + }, + { + "epoch": 0.07426372632981254, + "grad_norm": 1.6341818571090698, + "learning_rate": 4.932279035199144e-05, + "loss": 5.2529, + "step": 12487 + }, + { + "epoch": 0.07426967361309354, + "grad_norm": 1.5520392656326294, + "learning_rate": 4.9322682365119866e-05, + "loss": 5.2416, + "step": 12488 + }, + { + "epoch": 0.07427562089637453, + "grad_norm": 1.610711693763733, + "learning_rate": 4.93225743697575e-05, + "loss": 5.3172, + "step": 12489 + }, + { + "epoch": 0.07428156817965553, + "grad_norm": 1.5997258424758911, + "learning_rate": 4.932246636590436e-05, + "loss": 5.2343, + "step": 12490 + }, + { + "epoch": 0.07428751546293653, + "grad_norm": 1.5319284200668335, + "learning_rate": 4.932235835356051e-05, + "loss": 5.2021, + "step": 12491 + }, + { + "epoch": 0.07429346274621752, + "grad_norm": 1.6516488790512085, + "learning_rate": 4.932225033272597e-05, + "loss": 5.2678, + "step": 12492 + }, + { + "epoch": 0.07429941002949852, + "grad_norm": 1.9008166790008545, + "learning_rate": 4.9322142303400786e-05, + "loss": 5.1424, + "step": 12493 + }, + { + "epoch": 0.07430535731277953, + "grad_norm": 1.8372108936309814, + "learning_rate": 4.932203426558499e-05, + "loss": 5.321, + "step": 12494 + }, + { + "epoch": 0.07431130459606052, + "grad_norm": 1.4764071702957153, + "learning_rate": 4.932192621927863e-05, + "loss": 5.3627, + "step": 12495 + }, + { + "epoch": 0.07431725187934152, + "grad_norm": 1.6356589794158936, + "learning_rate": 4.932181816448173e-05, + "loss": 5.2061, + "step": 12496 + }, + { + "epoch": 0.07432319916262252, + "grad_norm": 1.6335545778274536, + "learning_rate": 4.932171010119434e-05, + "loss": 5.2283, + "step": 12497 + }, + { + "epoch": 0.07432914644590351, + "grad_norm": 1.499968409538269, + "learning_rate": 4.932160202941649e-05, + "loss": 5.4862, + "step": 12498 + }, + { + "epoch": 0.07433509372918451, + "grad_norm": 1.7292691469192505, + "learning_rate": 4.932149394914822e-05, + "loss": 5.4055, + "step": 12499 + }, + { + "epoch": 0.07434104101246551, + "grad_norm": 1.6818633079528809, + "learning_rate": 4.932138586038957e-05, + "loss": 5.5262, + "step": 12500 + }, + { + "epoch": 0.0743469882957465, + "grad_norm": 1.4048001766204834, + "learning_rate": 4.932127776314057e-05, + "loss": 5.1876, + "step": 12501 + }, + { + "epoch": 0.0743529355790275, + "grad_norm": 1.6041479110717773, + "learning_rate": 4.9321169657401264e-05, + "loss": 5.0791, + "step": 12502 + }, + { + "epoch": 0.0743588828623085, + "grad_norm": 1.3542897701263428, + "learning_rate": 4.932106154317169e-05, + "loss": 5.189, + "step": 12503 + }, + { + "epoch": 0.07436483014558949, + "grad_norm": 1.7782005071640015, + "learning_rate": 4.932095342045189e-05, + "loss": 5.2823, + "step": 12504 + }, + { + "epoch": 0.0743707774288705, + "grad_norm": 1.5981978178024292, + "learning_rate": 4.932084528924189e-05, + "loss": 5.3978, + "step": 12505 + }, + { + "epoch": 0.0743767247121515, + "grad_norm": 1.5224134922027588, + "learning_rate": 4.9320737149541734e-05, + "loss": 5.336, + "step": 12506 + }, + { + "epoch": 0.07438267199543248, + "grad_norm": 1.4827311038970947, + "learning_rate": 4.932062900135147e-05, + "loss": 5.2284, + "step": 12507 + }, + { + "epoch": 0.07438861927871349, + "grad_norm": 1.4394789934158325, + "learning_rate": 4.932052084467111e-05, + "loss": 5.1672, + "step": 12508 + }, + { + "epoch": 0.07439456656199447, + "grad_norm": 1.5112950801849365, + "learning_rate": 4.9320412679500715e-05, + "loss": 5.4069, + "step": 12509 + }, + { + "epoch": 0.07440051384527548, + "grad_norm": 1.4547615051269531, + "learning_rate": 4.932030450584032e-05, + "loss": 5.3317, + "step": 12510 + }, + { + "epoch": 0.07440646112855648, + "grad_norm": 1.5839279890060425, + "learning_rate": 4.9320196323689946e-05, + "loss": 5.2042, + "step": 12511 + }, + { + "epoch": 0.07441240841183747, + "grad_norm": 1.6392362117767334, + "learning_rate": 4.9320088133049655e-05, + "loss": 5.2595, + "step": 12512 + }, + { + "epoch": 0.07441835569511847, + "grad_norm": 1.530236840248108, + "learning_rate": 4.931997993391947e-05, + "loss": 5.4417, + "step": 12513 + }, + { + "epoch": 0.07442430297839947, + "grad_norm": 1.7665959596633911, + "learning_rate": 4.931987172629943e-05, + "loss": 5.5164, + "step": 12514 + }, + { + "epoch": 0.07443025026168046, + "grad_norm": 1.5256375074386597, + "learning_rate": 4.931976351018957e-05, + "loss": 5.3645, + "step": 12515 + }, + { + "epoch": 0.07443619754496146, + "grad_norm": 1.5948551893234253, + "learning_rate": 4.9319655285589937e-05, + "loss": 5.1964, + "step": 12516 + }, + { + "epoch": 0.07444214482824246, + "grad_norm": 1.451249361038208, + "learning_rate": 4.931954705250056e-05, + "loss": 5.3043, + "step": 12517 + }, + { + "epoch": 0.07444809211152345, + "grad_norm": 1.5874381065368652, + "learning_rate": 4.931943881092148e-05, + "loss": 5.3769, + "step": 12518 + }, + { + "epoch": 0.07445403939480445, + "grad_norm": 1.597102165222168, + "learning_rate": 4.931933056085274e-05, + "loss": 5.2909, + "step": 12519 + }, + { + "epoch": 0.07445998667808545, + "grad_norm": 1.3787156343460083, + "learning_rate": 4.9319222302294364e-05, + "loss": 5.5499, + "step": 12520 + }, + { + "epoch": 0.07446593396136644, + "grad_norm": 1.5816805362701416, + "learning_rate": 4.931911403524641e-05, + "loss": 5.255, + "step": 12521 + }, + { + "epoch": 0.07447188124464744, + "grad_norm": 1.636619210243225, + "learning_rate": 4.93190057597089e-05, + "loss": 5.3816, + "step": 12522 + }, + { + "epoch": 0.07447782852792845, + "grad_norm": 1.518872857093811, + "learning_rate": 4.931889747568187e-05, + "loss": 5.3376, + "step": 12523 + }, + { + "epoch": 0.07448377581120944, + "grad_norm": 1.9586291313171387, + "learning_rate": 4.931878918316537e-05, + "loss": 5.6678, + "step": 12524 + }, + { + "epoch": 0.07448972309449044, + "grad_norm": 1.5893887281417847, + "learning_rate": 4.9318680882159435e-05, + "loss": 5.266, + "step": 12525 + }, + { + "epoch": 0.07449567037777144, + "grad_norm": 1.5339915752410889, + "learning_rate": 4.93185725726641e-05, + "loss": 5.1891, + "step": 12526 + }, + { + "epoch": 0.07450161766105243, + "grad_norm": 1.730128288269043, + "learning_rate": 4.9318464254679396e-05, + "loss": 5.1534, + "step": 12527 + }, + { + "epoch": 0.07450756494433343, + "grad_norm": 1.691015362739563, + "learning_rate": 4.931835592820537e-05, + "loss": 5.2599, + "step": 12528 + }, + { + "epoch": 0.07451351222761443, + "grad_norm": 1.2936137914657593, + "learning_rate": 4.9318247593242056e-05, + "loss": 5.2432, + "step": 12529 + }, + { + "epoch": 0.07451945951089542, + "grad_norm": 1.4507200717926025, + "learning_rate": 4.93181392497895e-05, + "loss": 5.1539, + "step": 12530 + }, + { + "epoch": 0.07452540679417642, + "grad_norm": 1.6212667226791382, + "learning_rate": 4.931803089784772e-05, + "loss": 5.1212, + "step": 12531 + }, + { + "epoch": 0.07453135407745742, + "grad_norm": 1.48690927028656, + "learning_rate": 4.9317922537416775e-05, + "loss": 5.168, + "step": 12532 + }, + { + "epoch": 0.07453730136073841, + "grad_norm": 1.5102870464324951, + "learning_rate": 4.931781416849669e-05, + "loss": 5.2024, + "step": 12533 + }, + { + "epoch": 0.07454324864401941, + "grad_norm": 1.4186264276504517, + "learning_rate": 4.9317705791087516e-05, + "loss": 5.1154, + "step": 12534 + }, + { + "epoch": 0.07454919592730042, + "grad_norm": 1.623822569847107, + "learning_rate": 4.931759740518928e-05, + "loss": 5.0244, + "step": 12535 + }, + { + "epoch": 0.0745551432105814, + "grad_norm": 1.4694246053695679, + "learning_rate": 4.9317489010802015e-05, + "loss": 5.1737, + "step": 12536 + }, + { + "epoch": 0.0745610904938624, + "grad_norm": 1.553551435470581, + "learning_rate": 4.931738060792577e-05, + "loss": 5.1339, + "step": 12537 + }, + { + "epoch": 0.0745670377771434, + "grad_norm": 1.744367003440857, + "learning_rate": 4.9317272196560575e-05, + "loss": 5.1564, + "step": 12538 + }, + { + "epoch": 0.0745729850604244, + "grad_norm": 1.6584309339523315, + "learning_rate": 4.931716377670648e-05, + "loss": 5.1871, + "step": 12539 + }, + { + "epoch": 0.0745789323437054, + "grad_norm": 1.6894947290420532, + "learning_rate": 4.931705534836351e-05, + "loss": 5.1432, + "step": 12540 + }, + { + "epoch": 0.07458487962698639, + "grad_norm": 1.467315912246704, + "learning_rate": 4.93169469115317e-05, + "loss": 5.2072, + "step": 12541 + }, + { + "epoch": 0.07459082691026739, + "grad_norm": 1.478841781616211, + "learning_rate": 4.93168384662111e-05, + "loss": 5.3644, + "step": 12542 + }, + { + "epoch": 0.07459677419354839, + "grad_norm": 1.6001938581466675, + "learning_rate": 4.9316730012401745e-05, + "loss": 5.2031, + "step": 12543 + }, + { + "epoch": 0.07460272147682938, + "grad_norm": 1.480236530303955, + "learning_rate": 4.931662155010367e-05, + "loss": 5.0113, + "step": 12544 + }, + { + "epoch": 0.07460866876011038, + "grad_norm": 1.490511178970337, + "learning_rate": 4.9316513079316914e-05, + "loss": 5.0416, + "step": 12545 + }, + { + "epoch": 0.07461461604339138, + "grad_norm": 1.7327873706817627, + "learning_rate": 4.931640460004152e-05, + "loss": 5.0578, + "step": 12546 + }, + { + "epoch": 0.07462056332667237, + "grad_norm": 1.6410421133041382, + "learning_rate": 4.9316296112277514e-05, + "loss": 5.0239, + "step": 12547 + }, + { + "epoch": 0.07462651060995337, + "grad_norm": 1.5255141258239746, + "learning_rate": 4.9316187616024936e-05, + "loss": 5.1592, + "step": 12548 + }, + { + "epoch": 0.07463245789323437, + "grad_norm": 1.5555649995803833, + "learning_rate": 4.9316079111283835e-05, + "loss": 5.3981, + "step": 12549 + }, + { + "epoch": 0.07463840517651536, + "grad_norm": 1.4196929931640625, + "learning_rate": 4.931597059805424e-05, + "loss": 5.0682, + "step": 12550 + }, + { + "epoch": 0.07464435245979636, + "grad_norm": 1.562338948249817, + "learning_rate": 4.93158620763362e-05, + "loss": 5.3551, + "step": 12551 + }, + { + "epoch": 0.07465029974307737, + "grad_norm": 1.5955942869186401, + "learning_rate": 4.931575354612973e-05, + "loss": 5.3108, + "step": 12552 + }, + { + "epoch": 0.07465624702635835, + "grad_norm": 1.4173908233642578, + "learning_rate": 4.9315645007434885e-05, + "loss": 5.3793, + "step": 12553 + }, + { + "epoch": 0.07466219430963936, + "grad_norm": 1.4075239896774292, + "learning_rate": 4.93155364602517e-05, + "loss": 5.4409, + "step": 12554 + }, + { + "epoch": 0.07466814159292036, + "grad_norm": 1.3041841983795166, + "learning_rate": 4.9315427904580216e-05, + "loss": 5.5285, + "step": 12555 + }, + { + "epoch": 0.07467408887620135, + "grad_norm": 1.4277441501617432, + "learning_rate": 4.9315319340420465e-05, + "loss": 5.5017, + "step": 12556 + }, + { + "epoch": 0.07468003615948235, + "grad_norm": 1.407895803451538, + "learning_rate": 4.931521076777248e-05, + "loss": 5.3675, + "step": 12557 + }, + { + "epoch": 0.07468598344276335, + "grad_norm": 1.429131031036377, + "learning_rate": 4.931510218663632e-05, + "loss": 5.3712, + "step": 12558 + }, + { + "epoch": 0.07469193072604434, + "grad_norm": 1.7229793071746826, + "learning_rate": 4.9314993597011995e-05, + "loss": 5.4513, + "step": 12559 + }, + { + "epoch": 0.07469787800932534, + "grad_norm": 1.5961774587631226, + "learning_rate": 4.9314884998899565e-05, + "loss": 5.5478, + "step": 12560 + }, + { + "epoch": 0.07470382529260634, + "grad_norm": 1.4570807218551636, + "learning_rate": 4.931477639229906e-05, + "loss": 5.3973, + "step": 12561 + }, + { + "epoch": 0.07470977257588733, + "grad_norm": 1.6308903694152832, + "learning_rate": 4.931466777721052e-05, + "loss": 5.1951, + "step": 12562 + }, + { + "epoch": 0.07471571985916833, + "grad_norm": 1.438491940498352, + "learning_rate": 4.9314559153633974e-05, + "loss": 5.4237, + "step": 12563 + }, + { + "epoch": 0.07472166714244934, + "grad_norm": 1.7219120264053345, + "learning_rate": 4.931445052156947e-05, + "loss": 5.2303, + "step": 12564 + }, + { + "epoch": 0.07472761442573032, + "grad_norm": 1.557895302772522, + "learning_rate": 4.931434188101704e-05, + "loss": 5.2383, + "step": 12565 + }, + { + "epoch": 0.07473356170901133, + "grad_norm": 1.3585479259490967, + "learning_rate": 4.931423323197672e-05, + "loss": 5.2698, + "step": 12566 + }, + { + "epoch": 0.07473950899229233, + "grad_norm": 1.643608808517456, + "learning_rate": 4.931412457444857e-05, + "loss": 5.285, + "step": 12567 + }, + { + "epoch": 0.07474545627557332, + "grad_norm": 1.7847453355789185, + "learning_rate": 4.93140159084326e-05, + "loss": 5.413, + "step": 12568 + }, + { + "epoch": 0.07475140355885432, + "grad_norm": 1.5010985136032104, + "learning_rate": 4.931390723392886e-05, + "loss": 5.3665, + "step": 12569 + }, + { + "epoch": 0.0747573508421353, + "grad_norm": 1.3640403747558594, + "learning_rate": 4.931379855093738e-05, + "loss": 5.2253, + "step": 12570 + }, + { + "epoch": 0.07476329812541631, + "grad_norm": 1.4886012077331543, + "learning_rate": 4.9313689859458214e-05, + "loss": 5.5954, + "step": 12571 + }, + { + "epoch": 0.07476924540869731, + "grad_norm": 1.6626142263412476, + "learning_rate": 4.931358115949138e-05, + "loss": 5.3558, + "step": 12572 + }, + { + "epoch": 0.0747751926919783, + "grad_norm": 1.6350460052490234, + "learning_rate": 4.931347245103693e-05, + "loss": 5.3222, + "step": 12573 + }, + { + "epoch": 0.0747811399752593, + "grad_norm": 1.586182951927185, + "learning_rate": 4.93133637340949e-05, + "loss": 5.2056, + "step": 12574 + }, + { + "epoch": 0.0747870872585403, + "grad_norm": 1.6866692304611206, + "learning_rate": 4.931325500866532e-05, + "loss": 5.2698, + "step": 12575 + }, + { + "epoch": 0.07479303454182129, + "grad_norm": 1.4165509939193726, + "learning_rate": 4.9313146274748235e-05, + "loss": 5.2572, + "step": 12576 + }, + { + "epoch": 0.07479898182510229, + "grad_norm": 1.6259573698043823, + "learning_rate": 4.931303753234369e-05, + "loss": 5.2585, + "step": 12577 + }, + { + "epoch": 0.0748049291083833, + "grad_norm": 1.4159972667694092, + "learning_rate": 4.931292878145171e-05, + "loss": 5.1748, + "step": 12578 + }, + { + "epoch": 0.07481087639166428, + "grad_norm": 1.3880494832992554, + "learning_rate": 4.931282002207234e-05, + "loss": 5.2181, + "step": 12579 + }, + { + "epoch": 0.07481682367494528, + "grad_norm": 1.4466285705566406, + "learning_rate": 4.931271125420561e-05, + "loss": 5.2041, + "step": 12580 + }, + { + "epoch": 0.07482277095822629, + "grad_norm": 1.5111972093582153, + "learning_rate": 4.931260247785157e-05, + "loss": 5.2388, + "step": 12581 + }, + { + "epoch": 0.07482871824150727, + "grad_norm": 1.368296504020691, + "learning_rate": 4.9312493693010245e-05, + "loss": 5.0964, + "step": 12582 + }, + { + "epoch": 0.07483466552478828, + "grad_norm": 1.5604379177093506, + "learning_rate": 4.931238489968168e-05, + "loss": 5.2031, + "step": 12583 + }, + { + "epoch": 0.07484061280806928, + "grad_norm": 1.6104371547698975, + "learning_rate": 4.9312276097865916e-05, + "loss": 5.1122, + "step": 12584 + }, + { + "epoch": 0.07484656009135027, + "grad_norm": 1.5082486867904663, + "learning_rate": 4.931216728756299e-05, + "loss": 5.2092, + "step": 12585 + }, + { + "epoch": 0.07485250737463127, + "grad_norm": 2.1802000999450684, + "learning_rate": 4.931205846877293e-05, + "loss": 5.859, + "step": 12586 + }, + { + "epoch": 0.07485845465791227, + "grad_norm": 1.7069321870803833, + "learning_rate": 4.931194964149579e-05, + "loss": 4.9751, + "step": 12587 + }, + { + "epoch": 0.07486440194119326, + "grad_norm": 1.3614740371704102, + "learning_rate": 4.931184080573159e-05, + "loss": 5.2341, + "step": 12588 + }, + { + "epoch": 0.07487034922447426, + "grad_norm": 1.3952617645263672, + "learning_rate": 4.931173196148039e-05, + "loss": 5.0472, + "step": 12589 + }, + { + "epoch": 0.07487629650775526, + "grad_norm": 1.435829758644104, + "learning_rate": 4.9311623108742205e-05, + "loss": 5.0165, + "step": 12590 + }, + { + "epoch": 0.07488224379103625, + "grad_norm": 1.3875840902328491, + "learning_rate": 4.931151424751709e-05, + "loss": 5.5455, + "step": 12591 + }, + { + "epoch": 0.07488819107431725, + "grad_norm": 1.4364032745361328, + "learning_rate": 4.931140537780508e-05, + "loss": 5.5106, + "step": 12592 + }, + { + "epoch": 0.07489413835759826, + "grad_norm": 1.5878878831863403, + "learning_rate": 4.9311296499606194e-05, + "loss": 5.2372, + "step": 12593 + }, + { + "epoch": 0.07490008564087924, + "grad_norm": 1.5724025964736938, + "learning_rate": 4.9311187612920495e-05, + "loss": 5.3771, + "step": 12594 + }, + { + "epoch": 0.07490603292416025, + "grad_norm": 1.4630738496780396, + "learning_rate": 4.9311078717748014e-05, + "loss": 5.3378, + "step": 12595 + }, + { + "epoch": 0.07491198020744125, + "grad_norm": 1.4438437223434448, + "learning_rate": 4.931096981408878e-05, + "loss": 5.3019, + "step": 12596 + }, + { + "epoch": 0.07491792749072224, + "grad_norm": 1.674564242362976, + "learning_rate": 4.931086090194285e-05, + "loss": 5.2957, + "step": 12597 + }, + { + "epoch": 0.07492387477400324, + "grad_norm": 1.237748384475708, + "learning_rate": 4.9310751981310236e-05, + "loss": 5.1994, + "step": 12598 + }, + { + "epoch": 0.07492982205728423, + "grad_norm": 1.5828932523727417, + "learning_rate": 4.9310643052191e-05, + "loss": 5.2326, + "step": 12599 + }, + { + "epoch": 0.07493576934056523, + "grad_norm": 1.2774053812026978, + "learning_rate": 4.931053411458516e-05, + "loss": 5.2496, + "step": 12600 + }, + { + "epoch": 0.07494171662384623, + "grad_norm": 1.2986499071121216, + "learning_rate": 4.9310425168492766e-05, + "loss": 5.3061, + "step": 12601 + }, + { + "epoch": 0.07494766390712722, + "grad_norm": 1.3973673582077026, + "learning_rate": 4.931031621391386e-05, + "loss": 5.1437, + "step": 12602 + }, + { + "epoch": 0.07495361119040822, + "grad_norm": 1.4217787981033325, + "learning_rate": 4.9310207250848475e-05, + "loss": 5.1636, + "step": 12603 + }, + { + "epoch": 0.07495955847368922, + "grad_norm": 1.5062726736068726, + "learning_rate": 4.9310098279296634e-05, + "loss": 5.2944, + "step": 12604 + }, + { + "epoch": 0.07496550575697021, + "grad_norm": 1.4844671487808228, + "learning_rate": 4.9309989299258404e-05, + "loss": 5.1899, + "step": 12605 + }, + { + "epoch": 0.07497145304025121, + "grad_norm": 1.3542430400848389, + "learning_rate": 4.9309880310733805e-05, + "loss": 5.1636, + "step": 12606 + }, + { + "epoch": 0.07497740032353221, + "grad_norm": 1.58526611328125, + "learning_rate": 4.930977131372287e-05, + "loss": 5.5748, + "step": 12607 + }, + { + "epoch": 0.0749833476068132, + "grad_norm": 1.6003972291946411, + "learning_rate": 4.930966230822564e-05, + "loss": 5.3992, + "step": 12608 + }, + { + "epoch": 0.0749892948900942, + "grad_norm": 1.6475237607955933, + "learning_rate": 4.930955329424218e-05, + "loss": 5.4515, + "step": 12609 + }, + { + "epoch": 0.0749952421733752, + "grad_norm": 1.5395694971084595, + "learning_rate": 4.9309444271772486e-05, + "loss": 5.5117, + "step": 12610 + }, + { + "epoch": 0.0750011894566562, + "grad_norm": 1.3863389492034912, + "learning_rate": 4.930933524081663e-05, + "loss": 5.5771, + "step": 12611 + }, + { + "epoch": 0.0750071367399372, + "grad_norm": 1.431830644607544, + "learning_rate": 4.9309226201374626e-05, + "loss": 5.412, + "step": 12612 + }, + { + "epoch": 0.0750130840232182, + "grad_norm": 1.4647631645202637, + "learning_rate": 4.930911715344653e-05, + "loss": 5.1849, + "step": 12613 + }, + { + "epoch": 0.07501903130649919, + "grad_norm": 2.126068592071533, + "learning_rate": 4.930900809703237e-05, + "loss": 5.1712, + "step": 12614 + }, + { + "epoch": 0.07502497858978019, + "grad_norm": 1.3078912496566772, + "learning_rate": 4.9308899032132183e-05, + "loss": 5.3937, + "step": 12615 + }, + { + "epoch": 0.07503092587306119, + "grad_norm": 1.2535938024520874, + "learning_rate": 4.9308789958746016e-05, + "loss": 5.5708, + "step": 12616 + }, + { + "epoch": 0.07503687315634218, + "grad_norm": 1.3942710161209106, + "learning_rate": 4.9308680876873894e-05, + "loss": 5.5907, + "step": 12617 + }, + { + "epoch": 0.07504282043962318, + "grad_norm": 1.3061814308166504, + "learning_rate": 4.930857178651587e-05, + "loss": 5.2515, + "step": 12618 + }, + { + "epoch": 0.07504876772290418, + "grad_norm": 1.8493753671646118, + "learning_rate": 4.930846268767197e-05, + "loss": 4.9958, + "step": 12619 + }, + { + "epoch": 0.07505471500618517, + "grad_norm": 1.5966380834579468, + "learning_rate": 4.9308353580342234e-05, + "loss": 4.8784, + "step": 12620 + }, + { + "epoch": 0.07506066228946617, + "grad_norm": 1.6849051713943481, + "learning_rate": 4.930824446452671e-05, + "loss": 5.1549, + "step": 12621 + }, + { + "epoch": 0.07506660957274718, + "grad_norm": 1.5844405889511108, + "learning_rate": 4.9308135340225426e-05, + "loss": 4.9807, + "step": 12622 + }, + { + "epoch": 0.07507255685602816, + "grad_norm": 1.520621418952942, + "learning_rate": 4.9308026207438424e-05, + "loss": 5.2237, + "step": 12623 + }, + { + "epoch": 0.07507850413930917, + "grad_norm": 1.5273483991622925, + "learning_rate": 4.9307917066165744e-05, + "loss": 5.4053, + "step": 12624 + }, + { + "epoch": 0.07508445142259017, + "grad_norm": 1.7137775421142578, + "learning_rate": 4.9307807916407414e-05, + "loss": 5.0427, + "step": 12625 + }, + { + "epoch": 0.07509039870587116, + "grad_norm": 1.7140679359436035, + "learning_rate": 4.930769875816348e-05, + "loss": 5.0354, + "step": 12626 + }, + { + "epoch": 0.07509634598915216, + "grad_norm": 1.5592498779296875, + "learning_rate": 4.930758959143399e-05, + "loss": 4.9663, + "step": 12627 + }, + { + "epoch": 0.07510229327243315, + "grad_norm": 1.4611366987228394, + "learning_rate": 4.930748041621896e-05, + "loss": 4.9469, + "step": 12628 + }, + { + "epoch": 0.07510824055571415, + "grad_norm": 1.4682248830795288, + "learning_rate": 4.930737123251844e-05, + "loss": 5.0217, + "step": 12629 + }, + { + "epoch": 0.07511418783899515, + "grad_norm": 1.5643991231918335, + "learning_rate": 4.9307262040332474e-05, + "loss": 5.0488, + "step": 12630 + }, + { + "epoch": 0.07512013512227614, + "grad_norm": 1.680577278137207, + "learning_rate": 4.9307152839661094e-05, + "loss": 5.0813, + "step": 12631 + }, + { + "epoch": 0.07512608240555714, + "grad_norm": 1.9138245582580566, + "learning_rate": 4.9307043630504334e-05, + "loss": 5.0965, + "step": 12632 + }, + { + "epoch": 0.07513202968883814, + "grad_norm": 1.7382584810256958, + "learning_rate": 4.9306934412862236e-05, + "loss": 5.3726, + "step": 12633 + }, + { + "epoch": 0.07513797697211913, + "grad_norm": 1.684213638305664, + "learning_rate": 4.930682518673484e-05, + "loss": 5.2511, + "step": 12634 + }, + { + "epoch": 0.07514392425540013, + "grad_norm": 1.6976017951965332, + "learning_rate": 4.9306715952122185e-05, + "loss": 4.9669, + "step": 12635 + }, + { + "epoch": 0.07514987153868113, + "grad_norm": 1.526212453842163, + "learning_rate": 4.930660670902431e-05, + "loss": 4.9405, + "step": 12636 + }, + { + "epoch": 0.07515581882196212, + "grad_norm": 1.6616593599319458, + "learning_rate": 4.930649745744124e-05, + "loss": 5.0266, + "step": 12637 + }, + { + "epoch": 0.07516176610524312, + "grad_norm": 1.7911401987075806, + "learning_rate": 4.930638819737303e-05, + "loss": 4.8774, + "step": 12638 + }, + { + "epoch": 0.07516771338852413, + "grad_norm": 1.3613603115081787, + "learning_rate": 4.93062789288197e-05, + "loss": 5.4048, + "step": 12639 + }, + { + "epoch": 0.07517366067180511, + "grad_norm": 1.5945172309875488, + "learning_rate": 4.930616965178131e-05, + "loss": 5.1918, + "step": 12640 + }, + { + "epoch": 0.07517960795508612, + "grad_norm": 1.816091775894165, + "learning_rate": 4.930606036625789e-05, + "loss": 5.3138, + "step": 12641 + }, + { + "epoch": 0.07518555523836712, + "grad_norm": 1.642877459526062, + "learning_rate": 4.930595107224947e-05, + "loss": 5.2438, + "step": 12642 + }, + { + "epoch": 0.07519150252164811, + "grad_norm": 1.8904980421066284, + "learning_rate": 4.930584176975609e-05, + "loss": 5.1565, + "step": 12643 + }, + { + "epoch": 0.07519744980492911, + "grad_norm": 1.6247447729110718, + "learning_rate": 4.93057324587778e-05, + "loss": 5.1795, + "step": 12644 + }, + { + "epoch": 0.07520339708821011, + "grad_norm": 1.4699510335922241, + "learning_rate": 4.930562313931461e-05, + "loss": 5.3628, + "step": 12645 + }, + { + "epoch": 0.0752093443714911, + "grad_norm": 1.537920355796814, + "learning_rate": 4.93055138113666e-05, + "loss": 5.492, + "step": 12646 + }, + { + "epoch": 0.0752152916547721, + "grad_norm": 1.3268204927444458, + "learning_rate": 4.930540447493378e-05, + "loss": 5.2169, + "step": 12647 + }, + { + "epoch": 0.0752212389380531, + "grad_norm": 1.627005934715271, + "learning_rate": 4.930529513001619e-05, + "loss": 5.9358, + "step": 12648 + }, + { + "epoch": 0.07522718622133409, + "grad_norm": 1.445926547050476, + "learning_rate": 4.930518577661388e-05, + "loss": 5.0762, + "step": 12649 + }, + { + "epoch": 0.0752331335046151, + "grad_norm": 1.5958713293075562, + "learning_rate": 4.930507641472688e-05, + "loss": 5.2345, + "step": 12650 + }, + { + "epoch": 0.0752390807878961, + "grad_norm": 1.470540165901184, + "learning_rate": 4.9304967044355225e-05, + "loss": 5.1259, + "step": 12651 + }, + { + "epoch": 0.07524502807117708, + "grad_norm": 1.4679489135742188, + "learning_rate": 4.930485766549896e-05, + "loss": 5.1456, + "step": 12652 + }, + { + "epoch": 0.07525097535445809, + "grad_norm": 1.3032207489013672, + "learning_rate": 4.930474827815812e-05, + "loss": 5.1479, + "step": 12653 + }, + { + "epoch": 0.07525692263773909, + "grad_norm": 1.4676958322525024, + "learning_rate": 4.930463888233274e-05, + "loss": 5.173, + "step": 12654 + }, + { + "epoch": 0.07526286992102008, + "grad_norm": 1.5788590908050537, + "learning_rate": 4.930452947802286e-05, + "loss": 5.0608, + "step": 12655 + }, + { + "epoch": 0.07526881720430108, + "grad_norm": 1.4392722845077515, + "learning_rate": 4.9304420065228526e-05, + "loss": 5.1209, + "step": 12656 + }, + { + "epoch": 0.07527476448758207, + "grad_norm": 1.4725446701049805, + "learning_rate": 4.930431064394977e-05, + "loss": 5.1249, + "step": 12657 + }, + { + "epoch": 0.07528071177086307, + "grad_norm": 1.4239790439605713, + "learning_rate": 4.930420121418663e-05, + "loss": 5.0262, + "step": 12658 + }, + { + "epoch": 0.07528665905414407, + "grad_norm": 1.3037468194961548, + "learning_rate": 4.930409177593914e-05, + "loss": 5.1158, + "step": 12659 + }, + { + "epoch": 0.07529260633742506, + "grad_norm": 1.430015206336975, + "learning_rate": 4.930398232920734e-05, + "loss": 5.1362, + "step": 12660 + }, + { + "epoch": 0.07529855362070606, + "grad_norm": 1.2381033897399902, + "learning_rate": 4.930387287399127e-05, + "loss": 5.2351, + "step": 12661 + }, + { + "epoch": 0.07530450090398706, + "grad_norm": 1.4459912776947021, + "learning_rate": 4.930376341029098e-05, + "loss": 5.1413, + "step": 12662 + }, + { + "epoch": 0.07531044818726805, + "grad_norm": 1.4875576496124268, + "learning_rate": 4.93036539381065e-05, + "loss": 5.0556, + "step": 12663 + }, + { + "epoch": 0.07531639547054905, + "grad_norm": 1.1632124185562134, + "learning_rate": 4.930354445743785e-05, + "loss": 5.2317, + "step": 12664 + }, + { + "epoch": 0.07532234275383005, + "grad_norm": 1.324722170829773, + "learning_rate": 4.9303434968285096e-05, + "loss": 5.0562, + "step": 12665 + }, + { + "epoch": 0.07532829003711104, + "grad_norm": 1.4292213916778564, + "learning_rate": 4.9303325470648254e-05, + "loss": 5.0991, + "step": 12666 + }, + { + "epoch": 0.07533423732039204, + "grad_norm": 1.4528483152389526, + "learning_rate": 4.930321596452738e-05, + "loss": 5.0675, + "step": 12667 + }, + { + "epoch": 0.07534018460367305, + "grad_norm": 1.5489269495010376, + "learning_rate": 4.9303106449922504e-05, + "loss": 4.9073, + "step": 12668 + }, + { + "epoch": 0.07534613188695403, + "grad_norm": 1.440854787826538, + "learning_rate": 4.9302996926833664e-05, + "loss": 5.0401, + "step": 12669 + }, + { + "epoch": 0.07535207917023504, + "grad_norm": 1.4586740732192993, + "learning_rate": 4.9302887395260894e-05, + "loss": 5.0483, + "step": 12670 + }, + { + "epoch": 0.07535802645351604, + "grad_norm": 1.390376091003418, + "learning_rate": 4.930277785520424e-05, + "loss": 5.1417, + "step": 12671 + }, + { + "epoch": 0.07536397373679703, + "grad_norm": 1.296410083770752, + "learning_rate": 4.9302668306663736e-05, + "loss": 5.461, + "step": 12672 + }, + { + "epoch": 0.07536992102007803, + "grad_norm": 1.5190175771713257, + "learning_rate": 4.930255874963943e-05, + "loss": 5.4972, + "step": 12673 + }, + { + "epoch": 0.07537586830335903, + "grad_norm": 1.4567232131958008, + "learning_rate": 4.930244918413134e-05, + "loss": 5.1921, + "step": 12674 + }, + { + "epoch": 0.07538181558664002, + "grad_norm": 1.7850147485733032, + "learning_rate": 4.930233961013953e-05, + "loss": 5.0658, + "step": 12675 + }, + { + "epoch": 0.07538776286992102, + "grad_norm": 1.5736637115478516, + "learning_rate": 4.930223002766401e-05, + "loss": 5.6874, + "step": 12676 + }, + { + "epoch": 0.07539371015320202, + "grad_norm": 1.5202080011367798, + "learning_rate": 4.9302120436704836e-05, + "loss": 5.7279, + "step": 12677 + }, + { + "epoch": 0.07539965743648301, + "grad_norm": 1.4259493350982666, + "learning_rate": 4.930201083726205e-05, + "loss": 5.5445, + "step": 12678 + }, + { + "epoch": 0.07540560471976401, + "grad_norm": 1.5141973495483398, + "learning_rate": 4.9301901229335674e-05, + "loss": 5.5086, + "step": 12679 + }, + { + "epoch": 0.07541155200304502, + "grad_norm": 1.5044218301773071, + "learning_rate": 4.930179161292576e-05, + "loss": 5.4279, + "step": 12680 + }, + { + "epoch": 0.075417499286326, + "grad_norm": 1.5342620611190796, + "learning_rate": 4.930168198803234e-05, + "loss": 5.0885, + "step": 12681 + }, + { + "epoch": 0.075423446569607, + "grad_norm": 1.8139567375183105, + "learning_rate": 4.930157235465546e-05, + "loss": 5.5586, + "step": 12682 + }, + { + "epoch": 0.07542939385288801, + "grad_norm": 1.606778621673584, + "learning_rate": 4.9301462712795144e-05, + "loss": 5.4007, + "step": 12683 + }, + { + "epoch": 0.075435341136169, + "grad_norm": 1.6451623439788818, + "learning_rate": 4.930135306245144e-05, + "loss": 5.2882, + "step": 12684 + }, + { + "epoch": 0.07544128841945, + "grad_norm": 1.915991187095642, + "learning_rate": 4.9301243403624385e-05, + "loss": 5.0727, + "step": 12685 + }, + { + "epoch": 0.07544723570273099, + "grad_norm": 1.536456823348999, + "learning_rate": 4.930113373631402e-05, + "loss": 5.2154, + "step": 12686 + }, + { + "epoch": 0.07545318298601199, + "grad_norm": 1.5820670127868652, + "learning_rate": 4.9301024060520375e-05, + "loss": 5.0613, + "step": 12687 + }, + { + "epoch": 0.07545913026929299, + "grad_norm": 1.5905929803848267, + "learning_rate": 4.93009143762435e-05, + "loss": 5.08, + "step": 12688 + }, + { + "epoch": 0.07546507755257398, + "grad_norm": 1.5759062767028809, + "learning_rate": 4.9300804683483426e-05, + "loss": 5.0874, + "step": 12689 + }, + { + "epoch": 0.07547102483585498, + "grad_norm": 1.4619840383529663, + "learning_rate": 4.9300694982240186e-05, + "loss": 5.1803, + "step": 12690 + }, + { + "epoch": 0.07547697211913598, + "grad_norm": 1.2742846012115479, + "learning_rate": 4.930058527251383e-05, + "loss": 5.2721, + "step": 12691 + }, + { + "epoch": 0.07548291940241697, + "grad_norm": 1.4095741510391235, + "learning_rate": 4.930047555430439e-05, + "loss": 5.055, + "step": 12692 + }, + { + "epoch": 0.07548886668569797, + "grad_norm": 1.3399991989135742, + "learning_rate": 4.93003658276119e-05, + "loss": 5.0315, + "step": 12693 + }, + { + "epoch": 0.07549481396897897, + "grad_norm": 1.4075208902359009, + "learning_rate": 4.9300256092436407e-05, + "loss": 5.2634, + "step": 12694 + }, + { + "epoch": 0.07550076125225996, + "grad_norm": 1.681321144104004, + "learning_rate": 4.930014634877795e-05, + "loss": 4.9749, + "step": 12695 + }, + { + "epoch": 0.07550670853554096, + "grad_norm": 1.842136263847351, + "learning_rate": 4.9300036596636555e-05, + "loss": 4.797, + "step": 12696 + }, + { + "epoch": 0.07551265581882197, + "grad_norm": 1.8733257055282593, + "learning_rate": 4.929992683601228e-05, + "loss": 5.4726, + "step": 12697 + }, + { + "epoch": 0.07551860310210295, + "grad_norm": 1.747514009475708, + "learning_rate": 4.929981706690514e-05, + "loss": 5.1081, + "step": 12698 + }, + { + "epoch": 0.07552455038538396, + "grad_norm": 1.8107210397720337, + "learning_rate": 4.9299707289315187e-05, + "loss": 4.983, + "step": 12699 + }, + { + "epoch": 0.07553049766866496, + "grad_norm": 1.6319682598114014, + "learning_rate": 4.929959750324246e-05, + "loss": 4.9968, + "step": 12700 + }, + { + "epoch": 0.07553644495194595, + "grad_norm": 1.4653065204620361, + "learning_rate": 4.9299487708687e-05, + "loss": 5.3013, + "step": 12701 + }, + { + "epoch": 0.07554239223522695, + "grad_norm": 1.4665262699127197, + "learning_rate": 4.929937790564883e-05, + "loss": 5.4431, + "step": 12702 + }, + { + "epoch": 0.07554833951850795, + "grad_norm": 1.4962518215179443, + "learning_rate": 4.9299268094127996e-05, + "loss": 5.3692, + "step": 12703 + }, + { + "epoch": 0.07555428680178894, + "grad_norm": 1.7913219928741455, + "learning_rate": 4.929915827412454e-05, + "loss": 5.0082, + "step": 12704 + }, + { + "epoch": 0.07556023408506994, + "grad_norm": 1.5508856773376465, + "learning_rate": 4.929904844563851e-05, + "loss": 5.1501, + "step": 12705 + }, + { + "epoch": 0.07556618136835094, + "grad_norm": 1.5882935523986816, + "learning_rate": 4.929893860866993e-05, + "loss": 4.9579, + "step": 12706 + }, + { + "epoch": 0.07557212865163193, + "grad_norm": 1.4550399780273438, + "learning_rate": 4.9298828763218833e-05, + "loss": 5.0165, + "step": 12707 + }, + { + "epoch": 0.07557807593491293, + "grad_norm": 1.5075403451919556, + "learning_rate": 4.929871890928527e-05, + "loss": 4.933, + "step": 12708 + }, + { + "epoch": 0.07558402321819394, + "grad_norm": 1.7094134092330933, + "learning_rate": 4.929860904686928e-05, + "loss": 4.8842, + "step": 12709 + }, + { + "epoch": 0.07558997050147492, + "grad_norm": 1.5615170001983643, + "learning_rate": 4.929849917597089e-05, + "loss": 5.5301, + "step": 12710 + }, + { + "epoch": 0.07559591778475593, + "grad_norm": 1.6687208414077759, + "learning_rate": 4.929838929659015e-05, + "loss": 4.9325, + "step": 12711 + }, + { + "epoch": 0.07560186506803693, + "grad_norm": 1.3476423025131226, + "learning_rate": 4.9298279408727086e-05, + "loss": 5.1274, + "step": 12712 + }, + { + "epoch": 0.07560781235131792, + "grad_norm": 1.359786868095398, + "learning_rate": 4.929816951238175e-05, + "loss": 4.7549, + "step": 12713 + }, + { + "epoch": 0.07561375963459892, + "grad_norm": 1.305482029914856, + "learning_rate": 4.9298059607554184e-05, + "loss": 4.7371, + "step": 12714 + }, + { + "epoch": 0.0756197069178799, + "grad_norm": 1.408693790435791, + "learning_rate": 4.92979496942444e-05, + "loss": 5.0733, + "step": 12715 + }, + { + "epoch": 0.07562565420116091, + "grad_norm": 1.3604625463485718, + "learning_rate": 4.9297839772452456e-05, + "loss": 4.7947, + "step": 12716 + }, + { + "epoch": 0.07563160148444191, + "grad_norm": 1.4101814031600952, + "learning_rate": 4.929772984217839e-05, + "loss": 5.2003, + "step": 12717 + }, + { + "epoch": 0.0756375487677229, + "grad_norm": 1.4409375190734863, + "learning_rate": 4.929761990342224e-05, + "loss": 5.167, + "step": 12718 + }, + { + "epoch": 0.0756434960510039, + "grad_norm": 1.4309754371643066, + "learning_rate": 4.9297509956184044e-05, + "loss": 5.1499, + "step": 12719 + }, + { + "epoch": 0.0756494433342849, + "grad_norm": 1.6380341053009033, + "learning_rate": 4.929740000046382e-05, + "loss": 4.8282, + "step": 12720 + }, + { + "epoch": 0.07565539061756589, + "grad_norm": 1.6795456409454346, + "learning_rate": 4.929729003626164e-05, + "loss": 4.708, + "step": 12721 + }, + { + "epoch": 0.07566133790084689, + "grad_norm": 1.7367075681686401, + "learning_rate": 4.929718006357753e-05, + "loss": 5.3364, + "step": 12722 + }, + { + "epoch": 0.0756672851841279, + "grad_norm": 1.5842353105545044, + "learning_rate": 4.929707008241152e-05, + "loss": 5.2025, + "step": 12723 + }, + { + "epoch": 0.07567323246740888, + "grad_norm": 1.5129985809326172, + "learning_rate": 4.9296960092763657e-05, + "loss": 5.1788, + "step": 12724 + }, + { + "epoch": 0.07567917975068988, + "grad_norm": 1.4276295900344849, + "learning_rate": 4.929685009463397e-05, + "loss": 5.2597, + "step": 12725 + }, + { + "epoch": 0.07568512703397089, + "grad_norm": 1.499213457107544, + "learning_rate": 4.9296740088022506e-05, + "loss": 5.1778, + "step": 12726 + }, + { + "epoch": 0.07569107431725187, + "grad_norm": 1.4656083583831787, + "learning_rate": 4.92966300729293e-05, + "loss": 5.2689, + "step": 12727 + }, + { + "epoch": 0.07569702160053288, + "grad_norm": 1.6160268783569336, + "learning_rate": 4.9296520049354393e-05, + "loss": 5.1829, + "step": 12728 + }, + { + "epoch": 0.07570296888381388, + "grad_norm": 1.514891266822815, + "learning_rate": 4.929641001729782e-05, + "loss": 5.2586, + "step": 12729 + }, + { + "epoch": 0.07570891616709487, + "grad_norm": 1.4635345935821533, + "learning_rate": 4.929629997675963e-05, + "loss": 5.2159, + "step": 12730 + }, + { + "epoch": 0.07571486345037587, + "grad_norm": 1.704380750656128, + "learning_rate": 4.9296189927739846e-05, + "loss": 5.1068, + "step": 12731 + }, + { + "epoch": 0.07572081073365687, + "grad_norm": 1.5786374807357788, + "learning_rate": 4.929607987023851e-05, + "loss": 5.2306, + "step": 12732 + }, + { + "epoch": 0.07572675801693786, + "grad_norm": 1.5011721849441528, + "learning_rate": 4.929596980425567e-05, + "loss": 5.1594, + "step": 12733 + }, + { + "epoch": 0.07573270530021886, + "grad_norm": 1.4532456398010254, + "learning_rate": 4.9295859729791354e-05, + "loss": 5.0955, + "step": 12734 + }, + { + "epoch": 0.07573865258349986, + "grad_norm": 1.5734699964523315, + "learning_rate": 4.9295749646845604e-05, + "loss": 5.1523, + "step": 12735 + }, + { + "epoch": 0.07574459986678085, + "grad_norm": 1.578141450881958, + "learning_rate": 4.929563955541846e-05, + "loss": 5.0784, + "step": 12736 + }, + { + "epoch": 0.07575054715006185, + "grad_norm": 1.408524513244629, + "learning_rate": 4.929552945550996e-05, + "loss": 5.1411, + "step": 12737 + }, + { + "epoch": 0.07575649443334286, + "grad_norm": 1.4755773544311523, + "learning_rate": 4.929541934712014e-05, + "loss": 5.0666, + "step": 12738 + }, + { + "epoch": 0.07576244171662384, + "grad_norm": 1.5521161556243896, + "learning_rate": 4.929530923024904e-05, + "loss": 5.0938, + "step": 12739 + }, + { + "epoch": 0.07576838899990485, + "grad_norm": 1.4772706031799316, + "learning_rate": 4.929519910489671e-05, + "loss": 5.1178, + "step": 12740 + }, + { + "epoch": 0.07577433628318585, + "grad_norm": 1.2669662237167358, + "learning_rate": 4.9295088971063164e-05, + "loss": 5.2565, + "step": 12741 + }, + { + "epoch": 0.07578028356646684, + "grad_norm": 1.5846413373947144, + "learning_rate": 4.929497882874845e-05, + "loss": 5.2109, + "step": 12742 + }, + { + "epoch": 0.07578623084974784, + "grad_norm": 1.779228687286377, + "learning_rate": 4.929486867795262e-05, + "loss": 5.0196, + "step": 12743 + }, + { + "epoch": 0.07579217813302883, + "grad_norm": 1.6306418180465698, + "learning_rate": 4.92947585186757e-05, + "loss": 5.1982, + "step": 12744 + }, + { + "epoch": 0.07579812541630983, + "grad_norm": 1.5107831954956055, + "learning_rate": 4.9294648350917726e-05, + "loss": 5.0652, + "step": 12745 + }, + { + "epoch": 0.07580407269959083, + "grad_norm": 1.3846759796142578, + "learning_rate": 4.9294538174678744e-05, + "loss": 5.0322, + "step": 12746 + }, + { + "epoch": 0.07581001998287182, + "grad_norm": 1.4558676481246948, + "learning_rate": 4.9294427989958794e-05, + "loss": 4.9626, + "step": 12747 + }, + { + "epoch": 0.07581596726615282, + "grad_norm": 1.3155016899108887, + "learning_rate": 4.92943177967579e-05, + "loss": 4.9965, + "step": 12748 + }, + { + "epoch": 0.07582191454943382, + "grad_norm": 1.3237980604171753, + "learning_rate": 4.9294207595076125e-05, + "loss": 4.9697, + "step": 12749 + }, + { + "epoch": 0.07582786183271481, + "grad_norm": 1.4439423084259033, + "learning_rate": 4.929409738491349e-05, + "loss": 5.0636, + "step": 12750 + }, + { + "epoch": 0.07583380911599581, + "grad_norm": 1.4793460369110107, + "learning_rate": 4.9293987166270024e-05, + "loss": 5.1122, + "step": 12751 + }, + { + "epoch": 0.07583975639927681, + "grad_norm": 1.5353471040725708, + "learning_rate": 4.929387693914578e-05, + "loss": 5.174, + "step": 12752 + }, + { + "epoch": 0.0758457036825578, + "grad_norm": 1.690537452697754, + "learning_rate": 4.929376670354081e-05, + "loss": 5.1515, + "step": 12753 + }, + { + "epoch": 0.0758516509658388, + "grad_norm": 1.4602952003479004, + "learning_rate": 4.9293656459455124e-05, + "loss": 5.1244, + "step": 12754 + }, + { + "epoch": 0.0758575982491198, + "grad_norm": 1.5871785879135132, + "learning_rate": 4.929354620688878e-05, + "loss": 5.2856, + "step": 12755 + }, + { + "epoch": 0.0758635455324008, + "grad_norm": 1.588065505027771, + "learning_rate": 4.92934359458418e-05, + "loss": 5.3694, + "step": 12756 + }, + { + "epoch": 0.0758694928156818, + "grad_norm": 1.5489270687103271, + "learning_rate": 4.929332567631424e-05, + "loss": 5.3546, + "step": 12757 + }, + { + "epoch": 0.0758754400989628, + "grad_norm": 1.493815541267395, + "learning_rate": 4.9293215398306136e-05, + "loss": 5.0878, + "step": 12758 + }, + { + "epoch": 0.07588138738224379, + "grad_norm": 1.3329546451568604, + "learning_rate": 4.929310511181751e-05, + "loss": 5.2171, + "step": 12759 + }, + { + "epoch": 0.07588733466552479, + "grad_norm": 1.5299288034439087, + "learning_rate": 4.929299481684842e-05, + "loss": 5.1695, + "step": 12760 + }, + { + "epoch": 0.07589328194880579, + "grad_norm": 1.5130664110183716, + "learning_rate": 4.9292884513398894e-05, + "loss": 5.3169, + "step": 12761 + }, + { + "epoch": 0.07589922923208678, + "grad_norm": 1.420339584350586, + "learning_rate": 4.9292774201468974e-05, + "loss": 5.1995, + "step": 12762 + }, + { + "epoch": 0.07590517651536778, + "grad_norm": 1.4740930795669556, + "learning_rate": 4.9292663881058696e-05, + "loss": 5.3321, + "step": 12763 + }, + { + "epoch": 0.07591112379864878, + "grad_norm": 1.448968768119812, + "learning_rate": 4.92925535521681e-05, + "loss": 5.1292, + "step": 12764 + }, + { + "epoch": 0.07591707108192977, + "grad_norm": 1.3219209909439087, + "learning_rate": 4.929244321479722e-05, + "loss": 5.1873, + "step": 12765 + }, + { + "epoch": 0.07592301836521077, + "grad_norm": 1.3336325883865356, + "learning_rate": 4.929233286894611e-05, + "loss": 5.248, + "step": 12766 + }, + { + "epoch": 0.07592896564849178, + "grad_norm": 1.4230278730392456, + "learning_rate": 4.9292222514614795e-05, + "loss": 5.2072, + "step": 12767 + }, + { + "epoch": 0.07593491293177276, + "grad_norm": 1.4522627592086792, + "learning_rate": 4.929211215180331e-05, + "loss": 5.4323, + "step": 12768 + }, + { + "epoch": 0.07594086021505377, + "grad_norm": 1.4863537549972534, + "learning_rate": 4.929200178051171e-05, + "loss": 5.241, + "step": 12769 + }, + { + "epoch": 0.07594680749833477, + "grad_norm": 1.7619402408599854, + "learning_rate": 4.929189140074001e-05, + "loss": 5.4853, + "step": 12770 + }, + { + "epoch": 0.07595275478161576, + "grad_norm": 1.6116011142730713, + "learning_rate": 4.929178101248827e-05, + "loss": 5.4793, + "step": 12771 + }, + { + "epoch": 0.07595870206489676, + "grad_norm": 1.8669662475585938, + "learning_rate": 4.9291670615756516e-05, + "loss": 5.4062, + "step": 12772 + }, + { + "epoch": 0.07596464934817775, + "grad_norm": 1.6439383029937744, + "learning_rate": 4.9291560210544796e-05, + "loss": 5.148, + "step": 12773 + }, + { + "epoch": 0.07597059663145875, + "grad_norm": 1.4800657033920288, + "learning_rate": 4.929144979685314e-05, + "loss": 5.3895, + "step": 12774 + }, + { + "epoch": 0.07597654391473975, + "grad_norm": 1.4091606140136719, + "learning_rate": 4.929133937468159e-05, + "loss": 5.3307, + "step": 12775 + }, + { + "epoch": 0.07598249119802074, + "grad_norm": 1.3786438703536987, + "learning_rate": 4.9291228944030176e-05, + "loss": 5.0786, + "step": 12776 + }, + { + "epoch": 0.07598843848130174, + "grad_norm": 1.6039817333221436, + "learning_rate": 4.929111850489896e-05, + "loss": 5.0606, + "step": 12777 + }, + { + "epoch": 0.07599438576458274, + "grad_norm": 1.5277283191680908, + "learning_rate": 4.929100805728796e-05, + "loss": 5.1949, + "step": 12778 + }, + { + "epoch": 0.07600033304786373, + "grad_norm": 1.6756436824798584, + "learning_rate": 4.929089760119722e-05, + "loss": 5.125, + "step": 12779 + }, + { + "epoch": 0.07600628033114473, + "grad_norm": 1.7082979679107666, + "learning_rate": 4.929078713662677e-05, + "loss": 5.1984, + "step": 12780 + }, + { + "epoch": 0.07601222761442573, + "grad_norm": 1.607293963432312, + "learning_rate": 4.929067666357666e-05, + "loss": 5.1809, + "step": 12781 + }, + { + "epoch": 0.07601817489770672, + "grad_norm": 1.5133613348007202, + "learning_rate": 4.9290566182046936e-05, + "loss": 5.2602, + "step": 12782 + }, + { + "epoch": 0.07602412218098772, + "grad_norm": 1.6572481393814087, + "learning_rate": 4.9290455692037616e-05, + "loss": 5.0959, + "step": 12783 + }, + { + "epoch": 0.07603006946426873, + "grad_norm": 1.6593372821807861, + "learning_rate": 4.929034519354876e-05, + "loss": 5.1672, + "step": 12784 + }, + { + "epoch": 0.07603601674754971, + "grad_norm": 1.4214340448379517, + "learning_rate": 4.929023468658038e-05, + "loss": 5.1064, + "step": 12785 + }, + { + "epoch": 0.07604196403083072, + "grad_norm": 1.4875116348266602, + "learning_rate": 4.929012417113255e-05, + "loss": 5.0657, + "step": 12786 + }, + { + "epoch": 0.07604791131411172, + "grad_norm": 1.7354154586791992, + "learning_rate": 4.929001364720527e-05, + "loss": 5.0415, + "step": 12787 + }, + { + "epoch": 0.0760538585973927, + "grad_norm": 1.5597622394561768, + "learning_rate": 4.928990311479861e-05, + "loss": 5.1404, + "step": 12788 + }, + { + "epoch": 0.07605980588067371, + "grad_norm": 1.6819382905960083, + "learning_rate": 4.928979257391258e-05, + "loss": 4.9487, + "step": 12789 + }, + { + "epoch": 0.07606575316395471, + "grad_norm": 1.4722174406051636, + "learning_rate": 4.928968202454725e-05, + "loss": 5.1677, + "step": 12790 + }, + { + "epoch": 0.0760717004472357, + "grad_norm": 1.5145434141159058, + "learning_rate": 4.9289571466702635e-05, + "loss": 5.2197, + "step": 12791 + }, + { + "epoch": 0.0760776477305167, + "grad_norm": 1.6052699089050293, + "learning_rate": 4.9289460900378784e-05, + "loss": 5.2508, + "step": 12792 + }, + { + "epoch": 0.0760835950137977, + "grad_norm": 1.3738253116607666, + "learning_rate": 4.9289350325575734e-05, + "loss": 5.1253, + "step": 12793 + }, + { + "epoch": 0.07608954229707869, + "grad_norm": 1.2580832242965698, + "learning_rate": 4.9289239742293524e-05, + "loss": 5.2497, + "step": 12794 + }, + { + "epoch": 0.0760954895803597, + "grad_norm": 1.6756019592285156, + "learning_rate": 4.928912915053219e-05, + "loss": 5.2471, + "step": 12795 + }, + { + "epoch": 0.0761014368636407, + "grad_norm": 1.6785964965820312, + "learning_rate": 4.928901855029177e-05, + "loss": 4.9893, + "step": 12796 + }, + { + "epoch": 0.07610738414692168, + "grad_norm": 1.6926941871643066, + "learning_rate": 4.92889079415723e-05, + "loss": 5.1558, + "step": 12797 + }, + { + "epoch": 0.07611333143020269, + "grad_norm": 1.4381680488586426, + "learning_rate": 4.9288797324373835e-05, + "loss": 4.9754, + "step": 12798 + }, + { + "epoch": 0.07611927871348369, + "grad_norm": 1.4430698156356812, + "learning_rate": 4.9288686698696393e-05, + "loss": 5.0197, + "step": 12799 + }, + { + "epoch": 0.07612522599676468, + "grad_norm": 1.4745796918869019, + "learning_rate": 4.928857606454002e-05, + "loss": 4.8857, + "step": 12800 + }, + { + "epoch": 0.07613117328004568, + "grad_norm": 1.5430330038070679, + "learning_rate": 4.928846542190477e-05, + "loss": 5.0407, + "step": 12801 + }, + { + "epoch": 0.07613712056332667, + "grad_norm": 1.6061021089553833, + "learning_rate": 4.928835477079066e-05, + "loss": 5.068, + "step": 12802 + }, + { + "epoch": 0.07614306784660767, + "grad_norm": 1.699568510055542, + "learning_rate": 4.9288244111197734e-05, + "loss": 4.9067, + "step": 12803 + }, + { + "epoch": 0.07614901512988867, + "grad_norm": 1.4770212173461914, + "learning_rate": 4.928813344312603e-05, + "loss": 5.0807, + "step": 12804 + }, + { + "epoch": 0.07615496241316966, + "grad_norm": 1.4657871723175049, + "learning_rate": 4.928802276657559e-05, + "loss": 5.1982, + "step": 12805 + }, + { + "epoch": 0.07616090969645066, + "grad_norm": 1.7897653579711914, + "learning_rate": 4.928791208154646e-05, + "loss": 5.1154, + "step": 12806 + }, + { + "epoch": 0.07616685697973166, + "grad_norm": 1.6905261278152466, + "learning_rate": 4.928780138803866e-05, + "loss": 5.3129, + "step": 12807 + }, + { + "epoch": 0.07617280426301265, + "grad_norm": 1.4763284921646118, + "learning_rate": 4.928769068605225e-05, + "loss": 5.2104, + "step": 12808 + }, + { + "epoch": 0.07617875154629365, + "grad_norm": 1.38632333278656, + "learning_rate": 4.928757997558725e-05, + "loss": 5.0857, + "step": 12809 + }, + { + "epoch": 0.07618469882957465, + "grad_norm": 1.5099103450775146, + "learning_rate": 4.928746925664371e-05, + "loss": 5.1264, + "step": 12810 + }, + { + "epoch": 0.07619064611285564, + "grad_norm": 1.285243272781372, + "learning_rate": 4.928735852922167e-05, + "loss": 5.1177, + "step": 12811 + }, + { + "epoch": 0.07619659339613664, + "grad_norm": 1.2749274969100952, + "learning_rate": 4.928724779332116e-05, + "loss": 5.0831, + "step": 12812 + }, + { + "epoch": 0.07620254067941765, + "grad_norm": 2.413712978363037, + "learning_rate": 4.928713704894222e-05, + "loss": 5.2416, + "step": 12813 + }, + { + "epoch": 0.07620848796269863, + "grad_norm": 1.602721929550171, + "learning_rate": 4.9287026296084895e-05, + "loss": 4.9799, + "step": 12814 + }, + { + "epoch": 0.07621443524597964, + "grad_norm": 1.515821099281311, + "learning_rate": 4.928691553474921e-05, + "loss": 5.034, + "step": 12815 + }, + { + "epoch": 0.07622038252926064, + "grad_norm": 1.3245290517807007, + "learning_rate": 4.928680476493523e-05, + "loss": 4.9559, + "step": 12816 + }, + { + "epoch": 0.07622632981254163, + "grad_norm": 1.5383784770965576, + "learning_rate": 4.928669398664297e-05, + "loss": 4.9085, + "step": 12817 + }, + { + "epoch": 0.07623227709582263, + "grad_norm": 1.4406317472457886, + "learning_rate": 4.928658319987247e-05, + "loss": 5.0073, + "step": 12818 + }, + { + "epoch": 0.07623822437910363, + "grad_norm": 1.6843304634094238, + "learning_rate": 4.928647240462378e-05, + "loss": 5.0262, + "step": 12819 + }, + { + "epoch": 0.07624417166238462, + "grad_norm": 1.655497431755066, + "learning_rate": 4.928636160089693e-05, + "loss": 5.0633, + "step": 12820 + }, + { + "epoch": 0.07625011894566562, + "grad_norm": 1.4143035411834717, + "learning_rate": 4.9286250788691973e-05, + "loss": 5.1131, + "step": 12821 + }, + { + "epoch": 0.07625606622894662, + "grad_norm": 1.5316637754440308, + "learning_rate": 4.9286139968008926e-05, + "loss": 5.2727, + "step": 12822 + }, + { + "epoch": 0.07626201351222761, + "grad_norm": 1.6708348989486694, + "learning_rate": 4.9286029138847844e-05, + "loss": 5.1469, + "step": 12823 + }, + { + "epoch": 0.07626796079550861, + "grad_norm": 1.48544180393219, + "learning_rate": 4.928591830120876e-05, + "loss": 5.0916, + "step": 12824 + }, + { + "epoch": 0.07627390807878962, + "grad_norm": 1.3884835243225098, + "learning_rate": 4.9285807455091715e-05, + "loss": 5.1451, + "step": 12825 + }, + { + "epoch": 0.0762798553620706, + "grad_norm": 1.7265839576721191, + "learning_rate": 4.928569660049674e-05, + "loss": 5.0478, + "step": 12826 + }, + { + "epoch": 0.0762858026453516, + "grad_norm": 1.678852915763855, + "learning_rate": 4.9285585737423875e-05, + "loss": 5.2127, + "step": 12827 + }, + { + "epoch": 0.07629174992863261, + "grad_norm": 1.4907126426696777, + "learning_rate": 4.928547486587317e-05, + "loss": 4.9706, + "step": 12828 + }, + { + "epoch": 0.0762976972119136, + "grad_norm": 1.610822319984436, + "learning_rate": 4.928536398584466e-05, + "loss": 5.2416, + "step": 12829 + }, + { + "epoch": 0.0763036444951946, + "grad_norm": 1.5226528644561768, + "learning_rate": 4.9285253097338375e-05, + "loss": 5.2665, + "step": 12830 + }, + { + "epoch": 0.07630959177847559, + "grad_norm": 1.6021392345428467, + "learning_rate": 4.928514220035436e-05, + "loss": 5.2129, + "step": 12831 + }, + { + "epoch": 0.07631553906175659, + "grad_norm": 1.4113723039627075, + "learning_rate": 4.928503129489265e-05, + "loss": 5.3568, + "step": 12832 + }, + { + "epoch": 0.07632148634503759, + "grad_norm": 1.7851402759552002, + "learning_rate": 4.928492038095329e-05, + "loss": 5.2028, + "step": 12833 + }, + { + "epoch": 0.07632743362831858, + "grad_norm": 2.0881283283233643, + "learning_rate": 4.928480945853631e-05, + "loss": 5.2721, + "step": 12834 + }, + { + "epoch": 0.07633338091159958, + "grad_norm": 1.376695156097412, + "learning_rate": 4.928469852764176e-05, + "loss": 5.0203, + "step": 12835 + }, + { + "epoch": 0.07633932819488058, + "grad_norm": 1.585046648979187, + "learning_rate": 4.928458758826967e-05, + "loss": 5.4281, + "step": 12836 + }, + { + "epoch": 0.07634527547816157, + "grad_norm": 1.7124192714691162, + "learning_rate": 4.928447664042008e-05, + "loss": 5.4921, + "step": 12837 + }, + { + "epoch": 0.07635122276144257, + "grad_norm": 1.5693449974060059, + "learning_rate": 4.928436568409304e-05, + "loss": 5.5729, + "step": 12838 + }, + { + "epoch": 0.07635717004472357, + "grad_norm": 2.072880506515503, + "learning_rate": 4.928425471928857e-05, + "loss": 5.1023, + "step": 12839 + }, + { + "epoch": 0.07636311732800456, + "grad_norm": 1.674325704574585, + "learning_rate": 4.928414374600672e-05, + "loss": 5.5319, + "step": 12840 + }, + { + "epoch": 0.07636906461128556, + "grad_norm": 1.3941127061843872, + "learning_rate": 4.9284032764247523e-05, + "loss": 5.4425, + "step": 12841 + }, + { + "epoch": 0.07637501189456657, + "grad_norm": 1.670743703842163, + "learning_rate": 4.9283921774011025e-05, + "loss": 5.2595, + "step": 12842 + }, + { + "epoch": 0.07638095917784755, + "grad_norm": 2.852534294128418, + "learning_rate": 4.928381077529726e-05, + "loss": 5.321, + "step": 12843 + }, + { + "epoch": 0.07638690646112856, + "grad_norm": 1.930977463722229, + "learning_rate": 4.928369976810626e-05, + "loss": 5.2649, + "step": 12844 + }, + { + "epoch": 0.07639285374440956, + "grad_norm": 1.8886314630508423, + "learning_rate": 4.928358875243808e-05, + "loss": 5.1882, + "step": 12845 + }, + { + "epoch": 0.07639880102769055, + "grad_norm": 1.793514609336853, + "learning_rate": 4.9283477728292745e-05, + "loss": 5.0946, + "step": 12846 + }, + { + "epoch": 0.07640474831097155, + "grad_norm": 1.8616431951522827, + "learning_rate": 4.9283366695670304e-05, + "loss": 5.1097, + "step": 12847 + }, + { + "epoch": 0.07641069559425255, + "grad_norm": 1.9281915426254272, + "learning_rate": 4.9283255654570785e-05, + "loss": 5.0054, + "step": 12848 + }, + { + "epoch": 0.07641664287753354, + "grad_norm": 2.036522150039673, + "learning_rate": 4.9283144604994234e-05, + "loss": 4.9115, + "step": 12849 + }, + { + "epoch": 0.07642259016081454, + "grad_norm": 1.7962864637374878, + "learning_rate": 4.928303354694069e-05, + "loss": 4.8951, + "step": 12850 + }, + { + "epoch": 0.07642853744409554, + "grad_norm": 2.1671249866485596, + "learning_rate": 4.9282922480410195e-05, + "loss": 5.1393, + "step": 12851 + }, + { + "epoch": 0.07643448472737653, + "grad_norm": 1.9870150089263916, + "learning_rate": 4.9282811405402774e-05, + "loss": 5.5572, + "step": 12852 + }, + { + "epoch": 0.07644043201065753, + "grad_norm": 2.1498360633850098, + "learning_rate": 4.928270032191847e-05, + "loss": 5.7031, + "step": 12853 + }, + { + "epoch": 0.07644637929393854, + "grad_norm": 2.06821870803833, + "learning_rate": 4.928258922995734e-05, + "loss": 5.723, + "step": 12854 + }, + { + "epoch": 0.07645232657721952, + "grad_norm": 2.283720016479492, + "learning_rate": 4.92824781295194e-05, + "loss": 5.2129, + "step": 12855 + }, + { + "epoch": 0.07645827386050053, + "grad_norm": 2.1862099170684814, + "learning_rate": 4.9282367020604704e-05, + "loss": 4.7535, + "step": 12856 + }, + { + "epoch": 0.07646422114378153, + "grad_norm": 1.7297099828720093, + "learning_rate": 4.928225590321328e-05, + "loss": 5.1965, + "step": 12857 + }, + { + "epoch": 0.07647016842706252, + "grad_norm": 2.0406720638275146, + "learning_rate": 4.9282144777345176e-05, + "loss": 5.289, + "step": 12858 + }, + { + "epoch": 0.07647611571034352, + "grad_norm": 1.8368127346038818, + "learning_rate": 4.928203364300042e-05, + "loss": 5.5448, + "step": 12859 + }, + { + "epoch": 0.0764820629936245, + "grad_norm": 1.837804913520813, + "learning_rate": 4.9281922500179054e-05, + "loss": 5.5284, + "step": 12860 + }, + { + "epoch": 0.07648801027690551, + "grad_norm": 1.7191063165664673, + "learning_rate": 4.928181134888113e-05, + "loss": 5.8212, + "step": 12861 + }, + { + "epoch": 0.07649395756018651, + "grad_norm": 1.757323980331421, + "learning_rate": 4.928170018910667e-05, + "loss": 5.8421, + "step": 12862 + }, + { + "epoch": 0.0764999048434675, + "grad_norm": 1.9213273525238037, + "learning_rate": 4.928158902085572e-05, + "loss": 5.1923, + "step": 12863 + }, + { + "epoch": 0.0765058521267485, + "grad_norm": 1.888006567955017, + "learning_rate": 4.928147784412832e-05, + "loss": 5.4282, + "step": 12864 + }, + { + "epoch": 0.0765117994100295, + "grad_norm": 1.555870771408081, + "learning_rate": 4.9281366658924506e-05, + "loss": 5.8256, + "step": 12865 + }, + { + "epoch": 0.07651774669331049, + "grad_norm": 1.8194485902786255, + "learning_rate": 4.9281255465244314e-05, + "loss": 5.5886, + "step": 12866 + }, + { + "epoch": 0.07652369397659149, + "grad_norm": 1.7867372035980225, + "learning_rate": 4.9281144263087795e-05, + "loss": 5.4818, + "step": 12867 + }, + { + "epoch": 0.0765296412598725, + "grad_norm": 1.8511155843734741, + "learning_rate": 4.928103305245497e-05, + "loss": 5.519, + "step": 12868 + }, + { + "epoch": 0.07653558854315348, + "grad_norm": 2.728428602218628, + "learning_rate": 4.928092183334589e-05, + "loss": 5.0085, + "step": 12869 + }, + { + "epoch": 0.07654153582643448, + "grad_norm": 2.5393402576446533, + "learning_rate": 4.92808106057606e-05, + "loss": 5.0862, + "step": 12870 + }, + { + "epoch": 0.07654748310971549, + "grad_norm": 2.494248151779175, + "learning_rate": 4.928069936969912e-05, + "loss": 5.5557, + "step": 12871 + }, + { + "epoch": 0.07655343039299647, + "grad_norm": 2.4287991523742676, + "learning_rate": 4.9280588125161496e-05, + "loss": 5.6646, + "step": 12872 + }, + { + "epoch": 0.07655937767627748, + "grad_norm": 2.188556432723999, + "learning_rate": 4.928047687214778e-05, + "loss": 5.6618, + "step": 12873 + }, + { + "epoch": 0.07656532495955848, + "grad_norm": 2.7367382049560547, + "learning_rate": 4.9280365610657996e-05, + "loss": 4.6788, + "step": 12874 + }, + { + "epoch": 0.07657127224283947, + "grad_norm": 2.492922067642212, + "learning_rate": 4.9280254340692187e-05, + "loss": 4.4132, + "step": 12875 + }, + { + "epoch": 0.07657721952612047, + "grad_norm": 2.361133575439453, + "learning_rate": 4.928014306225039e-05, + "loss": 4.3957, + "step": 12876 + }, + { + "epoch": 0.07658316680940147, + "grad_norm": 2.652127742767334, + "learning_rate": 4.9280031775332646e-05, + "loss": 4.4568, + "step": 12877 + }, + { + "epoch": 0.07658911409268246, + "grad_norm": 2.40895938873291, + "learning_rate": 4.9279920479938995e-05, + "loss": 4.6276, + "step": 12878 + }, + { + "epoch": 0.07659506137596346, + "grad_norm": 1.9418548345565796, + "learning_rate": 4.927980917606948e-05, + "loss": 5.6008, + "step": 12879 + }, + { + "epoch": 0.07660100865924446, + "grad_norm": 1.7706143856048584, + "learning_rate": 4.9279697863724125e-05, + "loss": 5.4946, + "step": 12880 + }, + { + "epoch": 0.07660695594252545, + "grad_norm": 2.856342077255249, + "learning_rate": 4.9279586542902986e-05, + "loss": 4.9182, + "step": 12881 + }, + { + "epoch": 0.07661290322580645, + "grad_norm": 2.713515043258667, + "learning_rate": 4.927947521360608e-05, + "loss": 5.2341, + "step": 12882 + }, + { + "epoch": 0.07661885050908745, + "grad_norm": 2.186169147491455, + "learning_rate": 4.927936387583348e-05, + "loss": 5.1348, + "step": 12883 + }, + { + "epoch": 0.07662479779236844, + "grad_norm": 2.3114492893218994, + "learning_rate": 4.9279252529585195e-05, + "loss": 5.0016, + "step": 12884 + }, + { + "epoch": 0.07663074507564945, + "grad_norm": 2.256502866744995, + "learning_rate": 4.927914117486128e-05, + "loss": 5.1759, + "step": 12885 + }, + { + "epoch": 0.07663669235893045, + "grad_norm": 2.281243324279785, + "learning_rate": 4.927902981166176e-05, + "loss": 5.1437, + "step": 12886 + }, + { + "epoch": 0.07664263964221144, + "grad_norm": 2.3553836345672607, + "learning_rate": 4.927891843998668e-05, + "loss": 5.1622, + "step": 12887 + }, + { + "epoch": 0.07664858692549244, + "grad_norm": 2.420192003250122, + "learning_rate": 4.927880705983609e-05, + "loss": 4.994, + "step": 12888 + }, + { + "epoch": 0.07665453420877343, + "grad_norm": 2.3391306400299072, + "learning_rate": 4.927869567121001e-05, + "loss": 4.9445, + "step": 12889 + }, + { + "epoch": 0.07666048149205443, + "grad_norm": 2.2093355655670166, + "learning_rate": 4.9278584274108484e-05, + "loss": 5.05, + "step": 12890 + }, + { + "epoch": 0.07666642877533543, + "grad_norm": 2.3378305435180664, + "learning_rate": 4.927847286853157e-05, + "loss": 4.8694, + "step": 12891 + }, + { + "epoch": 0.07667237605861642, + "grad_norm": 2.2110583782196045, + "learning_rate": 4.927836145447928e-05, + "loss": 4.8622, + "step": 12892 + }, + { + "epoch": 0.07667832334189742, + "grad_norm": 2.2865991592407227, + "learning_rate": 4.927825003195167e-05, + "loss": 4.9485, + "step": 12893 + }, + { + "epoch": 0.07668427062517842, + "grad_norm": 2.343135356903076, + "learning_rate": 4.927813860094878e-05, + "loss": 4.8874, + "step": 12894 + }, + { + "epoch": 0.07669021790845941, + "grad_norm": 2.1939613819122314, + "learning_rate": 4.927802716147063e-05, + "loss": 4.8349, + "step": 12895 + }, + { + "epoch": 0.07669616519174041, + "grad_norm": 2.866560697555542, + "learning_rate": 4.927791571351728e-05, + "loss": 5.1409, + "step": 12896 + }, + { + "epoch": 0.07670211247502141, + "grad_norm": 2.1052801609039307, + "learning_rate": 4.927780425708876e-05, + "loss": 5.3716, + "step": 12897 + }, + { + "epoch": 0.0767080597583024, + "grad_norm": 2.141184091567993, + "learning_rate": 4.9277692792185106e-05, + "loss": 5.2985, + "step": 12898 + }, + { + "epoch": 0.0767140070415834, + "grad_norm": 1.93148934841156, + "learning_rate": 4.927758131880636e-05, + "loss": 5.6222, + "step": 12899 + }, + { + "epoch": 0.0767199543248644, + "grad_norm": 1.8454651832580566, + "learning_rate": 4.927746983695256e-05, + "loss": 5.6966, + "step": 12900 + }, + { + "epoch": 0.0767259016081454, + "grad_norm": 1.764281153678894, + "learning_rate": 4.9277358346623746e-05, + "loss": 5.4979, + "step": 12901 + }, + { + "epoch": 0.0767318488914264, + "grad_norm": 1.6969131231307983, + "learning_rate": 4.9277246847819965e-05, + "loss": 5.5221, + "step": 12902 + }, + { + "epoch": 0.0767377961747074, + "grad_norm": 1.7118967771530151, + "learning_rate": 4.927713534054124e-05, + "loss": 5.6067, + "step": 12903 + }, + { + "epoch": 0.07674374345798839, + "grad_norm": 2.1508536338806152, + "learning_rate": 4.9277023824787625e-05, + "loss": 5.8241, + "step": 12904 + }, + { + "epoch": 0.07674969074126939, + "grad_norm": 1.8613126277923584, + "learning_rate": 4.927691230055914e-05, + "loss": 5.7141, + "step": 12905 + }, + { + "epoch": 0.07675563802455039, + "grad_norm": 1.8942763805389404, + "learning_rate": 4.927680076785585e-05, + "loss": 5.6909, + "step": 12906 + }, + { + "epoch": 0.07676158530783138, + "grad_norm": 1.8824634552001953, + "learning_rate": 4.927668922667777e-05, + "loss": 5.5055, + "step": 12907 + }, + { + "epoch": 0.07676753259111238, + "grad_norm": 1.8920915126800537, + "learning_rate": 4.927657767702495e-05, + "loss": 5.1783, + "step": 12908 + }, + { + "epoch": 0.07677347987439338, + "grad_norm": 1.8226712942123413, + "learning_rate": 4.927646611889743e-05, + "loss": 5.7529, + "step": 12909 + }, + { + "epoch": 0.07677942715767437, + "grad_norm": 1.88478684425354, + "learning_rate": 4.9276354552295245e-05, + "loss": 5.7034, + "step": 12910 + }, + { + "epoch": 0.07678537444095537, + "grad_norm": 1.6312634944915771, + "learning_rate": 4.927624297721844e-05, + "loss": 5.6476, + "step": 12911 + }, + { + "epoch": 0.07679132172423637, + "grad_norm": 1.5183994770050049, + "learning_rate": 4.927613139366704e-05, + "loss": 5.8517, + "step": 12912 + }, + { + "epoch": 0.07679726900751736, + "grad_norm": 1.6718844175338745, + "learning_rate": 4.92760198016411e-05, + "loss": 5.9619, + "step": 12913 + }, + { + "epoch": 0.07680321629079837, + "grad_norm": 2.575932741165161, + "learning_rate": 4.9275908201140654e-05, + "loss": 5.6903, + "step": 12914 + }, + { + "epoch": 0.07680916357407937, + "grad_norm": 2.2863197326660156, + "learning_rate": 4.927579659216574e-05, + "loss": 5.7517, + "step": 12915 + }, + { + "epoch": 0.07681511085736036, + "grad_norm": 2.231417417526245, + "learning_rate": 4.9275684974716384e-05, + "loss": 5.2323, + "step": 12916 + }, + { + "epoch": 0.07682105814064136, + "grad_norm": 1.9159691333770752, + "learning_rate": 4.927557334879265e-05, + "loss": 5.2548, + "step": 12917 + }, + { + "epoch": 0.07682700542392235, + "grad_norm": 1.6682984828948975, + "learning_rate": 4.927546171439455e-05, + "loss": 5.4639, + "step": 12918 + }, + { + "epoch": 0.07683295270720335, + "grad_norm": 2.1923654079437256, + "learning_rate": 4.927535007152215e-05, + "loss": 5.6016, + "step": 12919 + }, + { + "epoch": 0.07683889999048435, + "grad_norm": 2.2393245697021484, + "learning_rate": 4.9275238420175474e-05, + "loss": 5.9433, + "step": 12920 + }, + { + "epoch": 0.07684484727376534, + "grad_norm": 1.8611164093017578, + "learning_rate": 4.9275126760354565e-05, + "loss": 5.3477, + "step": 12921 + }, + { + "epoch": 0.07685079455704634, + "grad_norm": 1.902567982673645, + "learning_rate": 4.927501509205945e-05, + "loss": 5.4417, + "step": 12922 + }, + { + "epoch": 0.07685674184032734, + "grad_norm": 1.7735011577606201, + "learning_rate": 4.9274903415290184e-05, + "loss": 5.652, + "step": 12923 + }, + { + "epoch": 0.07686268912360833, + "grad_norm": 1.886060357093811, + "learning_rate": 4.927479173004681e-05, + "loss": 5.5927, + "step": 12924 + }, + { + "epoch": 0.07686863640688933, + "grad_norm": 1.8315941095352173, + "learning_rate": 4.927468003632935e-05, + "loss": 5.6559, + "step": 12925 + }, + { + "epoch": 0.07687458369017033, + "grad_norm": 1.7790045738220215, + "learning_rate": 4.927456833413784e-05, + "loss": 5.463, + "step": 12926 + }, + { + "epoch": 0.07688053097345132, + "grad_norm": 1.9559917449951172, + "learning_rate": 4.927445662347234e-05, + "loss": 5.6154, + "step": 12927 + }, + { + "epoch": 0.07688647825673232, + "grad_norm": 1.7274752855300903, + "learning_rate": 4.927434490433287e-05, + "loss": 5.5621, + "step": 12928 + }, + { + "epoch": 0.07689242554001333, + "grad_norm": 1.594190001487732, + "learning_rate": 4.9274233176719486e-05, + "loss": 5.4674, + "step": 12929 + }, + { + "epoch": 0.07689837282329431, + "grad_norm": 1.79281485080719, + "learning_rate": 4.927412144063222e-05, + "loss": 5.5166, + "step": 12930 + }, + { + "epoch": 0.07690432010657532, + "grad_norm": 1.6584967374801636, + "learning_rate": 4.92740096960711e-05, + "loss": 5.4249, + "step": 12931 + }, + { + "epoch": 0.07691026738985632, + "grad_norm": 1.8458021879196167, + "learning_rate": 4.927389794303617e-05, + "loss": 5.6073, + "step": 12932 + }, + { + "epoch": 0.0769162146731373, + "grad_norm": 1.5526570081710815, + "learning_rate": 4.927378618152748e-05, + "loss": 5.3992, + "step": 12933 + }, + { + "epoch": 0.07692216195641831, + "grad_norm": 1.6043710708618164, + "learning_rate": 4.927367441154507e-05, + "loss": 5.3786, + "step": 12934 + }, + { + "epoch": 0.07692810923969931, + "grad_norm": 1.6580268144607544, + "learning_rate": 4.927356263308896e-05, + "loss": 5.5177, + "step": 12935 + }, + { + "epoch": 0.0769340565229803, + "grad_norm": 1.7199897766113281, + "learning_rate": 4.9273450846159194e-05, + "loss": 5.4281, + "step": 12936 + }, + { + "epoch": 0.0769400038062613, + "grad_norm": 1.6920559406280518, + "learning_rate": 4.9273339050755835e-05, + "loss": 5.562, + "step": 12937 + }, + { + "epoch": 0.0769459510895423, + "grad_norm": 1.8027700185775757, + "learning_rate": 4.9273227246878894e-05, + "loss": 5.5473, + "step": 12938 + }, + { + "epoch": 0.07695189837282329, + "grad_norm": 1.6055867671966553, + "learning_rate": 4.927311543452842e-05, + "loss": 5.4903, + "step": 12939 + }, + { + "epoch": 0.07695784565610429, + "grad_norm": 1.5789201259613037, + "learning_rate": 4.9273003613704456e-05, + "loss": 5.4514, + "step": 12940 + }, + { + "epoch": 0.0769637929393853, + "grad_norm": 1.6153863668441772, + "learning_rate": 4.9272891784407034e-05, + "loss": 5.4343, + "step": 12941 + }, + { + "epoch": 0.07696974022266628, + "grad_norm": 1.8802043199539185, + "learning_rate": 4.927277994663619e-05, + "loss": 5.4691, + "step": 12942 + }, + { + "epoch": 0.07697568750594729, + "grad_norm": 1.869836688041687, + "learning_rate": 4.9272668100391984e-05, + "loss": 5.5037, + "step": 12943 + }, + { + "epoch": 0.07698163478922829, + "grad_norm": 1.9082410335540771, + "learning_rate": 4.927255624567443e-05, + "loss": 5.4814, + "step": 12944 + }, + { + "epoch": 0.07698758207250928, + "grad_norm": 1.5890675783157349, + "learning_rate": 4.927244438248358e-05, + "loss": 5.4627, + "step": 12945 + }, + { + "epoch": 0.07699352935579028, + "grad_norm": 1.7432551383972168, + "learning_rate": 4.9272332510819475e-05, + "loss": 5.4301, + "step": 12946 + }, + { + "epoch": 0.07699947663907127, + "grad_norm": 1.7112667560577393, + "learning_rate": 4.927222063068214e-05, + "loss": 5.4028, + "step": 12947 + }, + { + "epoch": 0.07700542392235227, + "grad_norm": 1.7046465873718262, + "learning_rate": 4.9272108742071634e-05, + "loss": 5.4688, + "step": 12948 + }, + { + "epoch": 0.07701137120563327, + "grad_norm": 1.6928964853286743, + "learning_rate": 4.927199684498798e-05, + "loss": 5.4553, + "step": 12949 + }, + { + "epoch": 0.07701731848891426, + "grad_norm": 1.8731732368469238, + "learning_rate": 4.927188493943122e-05, + "loss": 5.3542, + "step": 12950 + }, + { + "epoch": 0.07702326577219526, + "grad_norm": 1.6586295366287231, + "learning_rate": 4.92717730254014e-05, + "loss": 5.2852, + "step": 12951 + }, + { + "epoch": 0.07702921305547626, + "grad_norm": 1.724252462387085, + "learning_rate": 4.927166110289855e-05, + "loss": 5.3982, + "step": 12952 + }, + { + "epoch": 0.07703516033875725, + "grad_norm": 1.7133373022079468, + "learning_rate": 4.9271549171922716e-05, + "loss": 5.3642, + "step": 12953 + }, + { + "epoch": 0.07704110762203825, + "grad_norm": 1.779291033744812, + "learning_rate": 4.927143723247394e-05, + "loss": 5.3949, + "step": 12954 + }, + { + "epoch": 0.07704705490531925, + "grad_norm": 1.8439239263534546, + "learning_rate": 4.927132528455225e-05, + "loss": 5.3829, + "step": 12955 + }, + { + "epoch": 0.07705300218860024, + "grad_norm": 1.7440255880355835, + "learning_rate": 4.927121332815769e-05, + "loss": 5.3881, + "step": 12956 + }, + { + "epoch": 0.07705894947188124, + "grad_norm": 1.8459028005599976, + "learning_rate": 4.927110136329031e-05, + "loss": 5.3575, + "step": 12957 + }, + { + "epoch": 0.07706489675516225, + "grad_norm": 2.8051815032958984, + "learning_rate": 4.927098938995013e-05, + "loss": 5.2814, + "step": 12958 + }, + { + "epoch": 0.07707084403844323, + "grad_norm": 1.8814127445220947, + "learning_rate": 4.9270877408137194e-05, + "loss": 5.3614, + "step": 12959 + }, + { + "epoch": 0.07707679132172424, + "grad_norm": 1.570408821105957, + "learning_rate": 4.927076541785156e-05, + "loss": 5.3453, + "step": 12960 + }, + { + "epoch": 0.07708273860500524, + "grad_norm": 1.607393741607666, + "learning_rate": 4.927065341909324e-05, + "loss": 5.4766, + "step": 12961 + }, + { + "epoch": 0.07708868588828623, + "grad_norm": 1.475420594215393, + "learning_rate": 4.927054141186229e-05, + "loss": 5.4511, + "step": 12962 + }, + { + "epoch": 0.07709463317156723, + "grad_norm": 1.7785848379135132, + "learning_rate": 4.927042939615875e-05, + "loss": 5.3839, + "step": 12963 + }, + { + "epoch": 0.07710058045484823, + "grad_norm": 1.7313402891159058, + "learning_rate": 4.9270317371982645e-05, + "loss": 5.3398, + "step": 12964 + }, + { + "epoch": 0.07710652773812922, + "grad_norm": 1.666938066482544, + "learning_rate": 4.927020533933403e-05, + "loss": 5.4462, + "step": 12965 + }, + { + "epoch": 0.07711247502141022, + "grad_norm": 1.5219112634658813, + "learning_rate": 4.9270093298212933e-05, + "loss": 5.7593, + "step": 12966 + }, + { + "epoch": 0.07711842230469122, + "grad_norm": 2.0760631561279297, + "learning_rate": 4.92699812486194e-05, + "loss": 5.5765, + "step": 12967 + }, + { + "epoch": 0.07712436958797221, + "grad_norm": 1.7648851871490479, + "learning_rate": 4.926986919055346e-05, + "loss": 5.8786, + "step": 12968 + }, + { + "epoch": 0.07713031687125321, + "grad_norm": 1.832141399383545, + "learning_rate": 4.926975712401517e-05, + "loss": 5.6695, + "step": 12969 + }, + { + "epoch": 0.07713626415453421, + "grad_norm": 1.9032765626907349, + "learning_rate": 4.926964504900455e-05, + "loss": 5.701, + "step": 12970 + }, + { + "epoch": 0.0771422114378152, + "grad_norm": 1.7294973134994507, + "learning_rate": 4.9269532965521656e-05, + "loss": 5.6569, + "step": 12971 + }, + { + "epoch": 0.0771481587210962, + "grad_norm": 1.927510142326355, + "learning_rate": 4.926942087356651e-05, + "loss": 5.1289, + "step": 12972 + }, + { + "epoch": 0.07715410600437721, + "grad_norm": 1.6945842504501343, + "learning_rate": 4.926930877313917e-05, + "loss": 5.5703, + "step": 12973 + }, + { + "epoch": 0.0771600532876582, + "grad_norm": 1.7665363550186157, + "learning_rate": 4.926919666423966e-05, + "loss": 5.822, + "step": 12974 + }, + { + "epoch": 0.0771660005709392, + "grad_norm": 1.5802277326583862, + "learning_rate": 4.926908454686801e-05, + "loss": 5.5438, + "step": 12975 + }, + { + "epoch": 0.07717194785422019, + "grad_norm": 1.9065684080123901, + "learning_rate": 4.9268972421024295e-05, + "loss": 5.5556, + "step": 12976 + }, + { + "epoch": 0.07717789513750119, + "grad_norm": 1.7630208730697632, + "learning_rate": 4.9268860286708526e-05, + "loss": 5.6079, + "step": 12977 + }, + { + "epoch": 0.07718384242078219, + "grad_norm": 1.6295850276947021, + "learning_rate": 4.9268748143920746e-05, + "loss": 5.6163, + "step": 12978 + }, + { + "epoch": 0.07718978970406318, + "grad_norm": 1.753202199935913, + "learning_rate": 4.926863599266099e-05, + "loss": 5.549, + "step": 12979 + }, + { + "epoch": 0.07719573698734418, + "grad_norm": 1.7823643684387207, + "learning_rate": 4.9268523832929314e-05, + "loss": 5.6917, + "step": 12980 + }, + { + "epoch": 0.07720168427062518, + "grad_norm": 1.7990792989730835, + "learning_rate": 4.926841166472574e-05, + "loss": 5.5897, + "step": 12981 + }, + { + "epoch": 0.07720763155390617, + "grad_norm": 1.7813109159469604, + "learning_rate": 4.926829948805033e-05, + "loss": 5.5953, + "step": 12982 + }, + { + "epoch": 0.07721357883718717, + "grad_norm": 1.7127541303634644, + "learning_rate": 4.926818730290309e-05, + "loss": 5.5476, + "step": 12983 + }, + { + "epoch": 0.07721952612046817, + "grad_norm": 2.0513558387756348, + "learning_rate": 4.9268075109284084e-05, + "loss": 5.5721, + "step": 12984 + }, + { + "epoch": 0.07722547340374916, + "grad_norm": 1.8053756952285767, + "learning_rate": 4.9267962907193346e-05, + "loss": 5.5344, + "step": 12985 + }, + { + "epoch": 0.07723142068703016, + "grad_norm": 1.7184503078460693, + "learning_rate": 4.9267850696630904e-05, + "loss": 5.602, + "step": 12986 + }, + { + "epoch": 0.07723736797031117, + "grad_norm": 1.8753174543380737, + "learning_rate": 4.926773847759682e-05, + "loss": 5.701, + "step": 12987 + }, + { + "epoch": 0.07724331525359215, + "grad_norm": 1.7761272192001343, + "learning_rate": 4.9267626250091106e-05, + "loss": 5.5026, + "step": 12988 + }, + { + "epoch": 0.07724926253687316, + "grad_norm": 1.6833654642105103, + "learning_rate": 4.926751401411381e-05, + "loss": 5.5615, + "step": 12989 + }, + { + "epoch": 0.07725520982015416, + "grad_norm": 1.8640247583389282, + "learning_rate": 4.926740176966499e-05, + "loss": 5.8367, + "step": 12990 + }, + { + "epoch": 0.07726115710343515, + "grad_norm": 2.036540985107422, + "learning_rate": 4.9267289516744665e-05, + "loss": 5.6258, + "step": 12991 + }, + { + "epoch": 0.07726710438671615, + "grad_norm": 2.0168917179107666, + "learning_rate": 4.926717725535288e-05, + "loss": 5.1961, + "step": 12992 + }, + { + "epoch": 0.07727305166999715, + "grad_norm": 2.149548292160034, + "learning_rate": 4.9267064985489674e-05, + "loss": 5.1735, + "step": 12993 + }, + { + "epoch": 0.07727899895327814, + "grad_norm": 1.7929832935333252, + "learning_rate": 4.926695270715508e-05, + "loss": 5.6889, + "step": 12994 + }, + { + "epoch": 0.07728494623655914, + "grad_norm": 1.7964575290679932, + "learning_rate": 4.926684042034916e-05, + "loss": 5.0576, + "step": 12995 + }, + { + "epoch": 0.07729089351984014, + "grad_norm": 1.8207305669784546, + "learning_rate": 4.926672812507192e-05, + "loss": 5.2703, + "step": 12996 + }, + { + "epoch": 0.07729684080312113, + "grad_norm": 1.6263490915298462, + "learning_rate": 4.9266615821323425e-05, + "loss": 5.5999, + "step": 12997 + }, + { + "epoch": 0.07730278808640213, + "grad_norm": 2.0018131732940674, + "learning_rate": 4.92665035091037e-05, + "loss": 4.9439, + "step": 12998 + }, + { + "epoch": 0.07730873536968313, + "grad_norm": 2.32818341255188, + "learning_rate": 4.926639118841279e-05, + "loss": 4.6071, + "step": 12999 + }, + { + "epoch": 0.07731468265296412, + "grad_norm": 2.3354949951171875, + "learning_rate": 4.926627885925074e-05, + "loss": 4.6642, + "step": 13000 + }, + { + "epoch": 0.07732062993624512, + "grad_norm": 1.71230149269104, + "learning_rate": 4.926616652161757e-05, + "loss": 5.161, + "step": 13001 + }, + { + "epoch": 0.07732657721952613, + "grad_norm": 1.4890326261520386, + "learning_rate": 4.9266054175513345e-05, + "loss": 5.1714, + "step": 13002 + }, + { + "epoch": 0.07733252450280712, + "grad_norm": 1.5844224691390991, + "learning_rate": 4.926594182093809e-05, + "loss": 4.869, + "step": 13003 + }, + { + "epoch": 0.07733847178608812, + "grad_norm": 2.328636884689331, + "learning_rate": 4.926582945789185e-05, + "loss": 5.1571, + "step": 13004 + }, + { + "epoch": 0.0773444190693691, + "grad_norm": 2.067760467529297, + "learning_rate": 4.926571708637464e-05, + "loss": 5.4416, + "step": 13005 + }, + { + "epoch": 0.07735036635265011, + "grad_norm": 1.7148468494415283, + "learning_rate": 4.926560470638653e-05, + "loss": 5.464, + "step": 13006 + }, + { + "epoch": 0.07735631363593111, + "grad_norm": 1.6869080066680908, + "learning_rate": 4.926549231792755e-05, + "loss": 5.5537, + "step": 13007 + }, + { + "epoch": 0.0773622609192121, + "grad_norm": 2.239408254623413, + "learning_rate": 4.9265379920997735e-05, + "loss": 5.1551, + "step": 13008 + }, + { + "epoch": 0.0773682082024931, + "grad_norm": 2.4059038162231445, + "learning_rate": 4.926526751559713e-05, + "loss": 5.2639, + "step": 13009 + }, + { + "epoch": 0.0773741554857741, + "grad_norm": 2.0787813663482666, + "learning_rate": 4.926515510172577e-05, + "loss": 5.3485, + "step": 13010 + }, + { + "epoch": 0.07738010276905509, + "grad_norm": 1.912137508392334, + "learning_rate": 4.9265042679383685e-05, + "loss": 5.551, + "step": 13011 + }, + { + "epoch": 0.07738605005233609, + "grad_norm": 2.0865983963012695, + "learning_rate": 4.926493024857094e-05, + "loss": 5.0343, + "step": 13012 + }, + { + "epoch": 0.0773919973356171, + "grad_norm": 1.9341247081756592, + "learning_rate": 4.926481780928754e-05, + "loss": 5.5904, + "step": 13013 + }, + { + "epoch": 0.07739794461889808, + "grad_norm": 1.7777684926986694, + "learning_rate": 4.926470536153356e-05, + "loss": 5.5396, + "step": 13014 + }, + { + "epoch": 0.07740389190217908, + "grad_norm": 1.7952098846435547, + "learning_rate": 4.926459290530902e-05, + "loss": 5.3212, + "step": 13015 + }, + { + "epoch": 0.07740983918546009, + "grad_norm": 1.7674907445907593, + "learning_rate": 4.926448044061396e-05, + "loss": 5.3316, + "step": 13016 + }, + { + "epoch": 0.07741578646874107, + "grad_norm": 1.8327823877334595, + "learning_rate": 4.926436796744841e-05, + "loss": 5.3129, + "step": 13017 + }, + { + "epoch": 0.07742173375202208, + "grad_norm": 1.613867998123169, + "learning_rate": 4.9264255485812425e-05, + "loss": 5.4935, + "step": 13018 + }, + { + "epoch": 0.07742768103530308, + "grad_norm": 1.7167906761169434, + "learning_rate": 4.9264142995706044e-05, + "loss": 5.3054, + "step": 13019 + }, + { + "epoch": 0.07743362831858407, + "grad_norm": 2.272038698196411, + "learning_rate": 4.92640304971293e-05, + "loss": 5.1327, + "step": 13020 + }, + { + "epoch": 0.07743957560186507, + "grad_norm": 1.6358660459518433, + "learning_rate": 4.926391799008223e-05, + "loss": 5.3285, + "step": 13021 + }, + { + "epoch": 0.07744552288514607, + "grad_norm": 2.166813373565674, + "learning_rate": 4.926380547456488e-05, + "loss": 5.2846, + "step": 13022 + }, + { + "epoch": 0.07745147016842706, + "grad_norm": 2.3251235485076904, + "learning_rate": 4.926369295057729e-05, + "loss": 5.2482, + "step": 13023 + }, + { + "epoch": 0.07745741745170806, + "grad_norm": 1.9402974843978882, + "learning_rate": 4.926358041811949e-05, + "loss": 5.3514, + "step": 13024 + }, + { + "epoch": 0.07746336473498906, + "grad_norm": 2.1346986293792725, + "learning_rate": 4.9263467877191525e-05, + "loss": 5.1912, + "step": 13025 + }, + { + "epoch": 0.07746931201827005, + "grad_norm": 2.0809762477874756, + "learning_rate": 4.926335532779344e-05, + "loss": 5.0547, + "step": 13026 + }, + { + "epoch": 0.07747525930155105, + "grad_norm": 2.110558032989502, + "learning_rate": 4.9263242769925256e-05, + "loss": 5.2177, + "step": 13027 + }, + { + "epoch": 0.07748120658483205, + "grad_norm": 2.3498575687408447, + "learning_rate": 4.926313020358704e-05, + "loss": 4.9997, + "step": 13028 + }, + { + "epoch": 0.07748715386811304, + "grad_norm": 2.4052765369415283, + "learning_rate": 4.92630176287788e-05, + "loss": 4.9736, + "step": 13029 + }, + { + "epoch": 0.07749310115139404, + "grad_norm": 2.3132238388061523, + "learning_rate": 4.9262905045500603e-05, + "loss": 4.9149, + "step": 13030 + }, + { + "epoch": 0.07749904843467505, + "grad_norm": 2.315483331680298, + "learning_rate": 4.926279245375247e-05, + "loss": 4.9096, + "step": 13031 + }, + { + "epoch": 0.07750499571795604, + "grad_norm": 2.0887367725372314, + "learning_rate": 4.926267985353445e-05, + "loss": 5.3274, + "step": 13032 + }, + { + "epoch": 0.07751094300123704, + "grad_norm": 2.3138368129730225, + "learning_rate": 4.926256724484658e-05, + "loss": 4.8627, + "step": 13033 + }, + { + "epoch": 0.07751689028451804, + "grad_norm": 2.348411798477173, + "learning_rate": 4.926245462768889e-05, + "loss": 4.9815, + "step": 13034 + }, + { + "epoch": 0.07752283756779903, + "grad_norm": 1.7357233762741089, + "learning_rate": 4.926234200206144e-05, + "loss": 5.2836, + "step": 13035 + }, + { + "epoch": 0.07752878485108003, + "grad_norm": 1.8633183240890503, + "learning_rate": 4.9262229367964255e-05, + "loss": 5.1838, + "step": 13036 + }, + { + "epoch": 0.07753473213436102, + "grad_norm": 1.736359715461731, + "learning_rate": 4.926211672539737e-05, + "loss": 5.6746, + "step": 13037 + }, + { + "epoch": 0.07754067941764202, + "grad_norm": 2.368511915206909, + "learning_rate": 4.9262004074360834e-05, + "loss": 4.5786, + "step": 13038 + }, + { + "epoch": 0.07754662670092302, + "grad_norm": 1.859297752380371, + "learning_rate": 4.926189141485468e-05, + "loss": 5.8459, + "step": 13039 + }, + { + "epoch": 0.07755257398420401, + "grad_norm": 2.2050845623016357, + "learning_rate": 4.9261778746878955e-05, + "loss": 5.8982, + "step": 13040 + }, + { + "epoch": 0.07755852126748501, + "grad_norm": 1.7485835552215576, + "learning_rate": 4.926166607043369e-05, + "loss": 5.789, + "step": 13041 + }, + { + "epoch": 0.07756446855076601, + "grad_norm": 1.7780888080596924, + "learning_rate": 4.9261553385518936e-05, + "loss": 5.48, + "step": 13042 + }, + { + "epoch": 0.077570415834047, + "grad_norm": 1.8764269351959229, + "learning_rate": 4.9261440692134716e-05, + "loss": 5.093, + "step": 13043 + }, + { + "epoch": 0.077576363117328, + "grad_norm": 1.784196376800537, + "learning_rate": 4.926132799028108e-05, + "loss": 5.4335, + "step": 13044 + }, + { + "epoch": 0.077582310400609, + "grad_norm": 2.173844337463379, + "learning_rate": 4.926121527995806e-05, + "loss": 4.5078, + "step": 13045 + }, + { + "epoch": 0.07758825768389, + "grad_norm": 2.410778045654297, + "learning_rate": 4.9261102561165705e-05, + "loss": 5.2113, + "step": 13046 + }, + { + "epoch": 0.077594204967171, + "grad_norm": 2.0470073223114014, + "learning_rate": 4.9260989833904057e-05, + "loss": 5.4695, + "step": 13047 + }, + { + "epoch": 0.077600152250452, + "grad_norm": 1.619314193725586, + "learning_rate": 4.926087709817314e-05, + "loss": 5.8778, + "step": 13048 + }, + { + "epoch": 0.07760609953373299, + "grad_norm": 2.2353031635284424, + "learning_rate": 4.9260764353973e-05, + "loss": 5.2482, + "step": 13049 + }, + { + "epoch": 0.07761204681701399, + "grad_norm": 2.0858941078186035, + "learning_rate": 4.926065160130369e-05, + "loss": 5.2752, + "step": 13050 + }, + { + "epoch": 0.07761799410029499, + "grad_norm": 2.275660514831543, + "learning_rate": 4.926053884016522e-05, + "loss": 5.004, + "step": 13051 + }, + { + "epoch": 0.07762394138357598, + "grad_norm": 1.9338358640670776, + "learning_rate": 4.926042607055765e-05, + "loss": 5.4688, + "step": 13052 + }, + { + "epoch": 0.07762988866685698, + "grad_norm": 1.7377573251724243, + "learning_rate": 4.926031329248103e-05, + "loss": 5.6429, + "step": 13053 + }, + { + "epoch": 0.07763583595013798, + "grad_norm": 1.8915661573410034, + "learning_rate": 4.9260200505935374e-05, + "loss": 5.543, + "step": 13054 + }, + { + "epoch": 0.07764178323341897, + "grad_norm": 1.7961910963058472, + "learning_rate": 4.926008771092073e-05, + "loss": 5.4245, + "step": 13055 + }, + { + "epoch": 0.07764773051669997, + "grad_norm": 1.9412139654159546, + "learning_rate": 4.9259974907437145e-05, + "loss": 5.5858, + "step": 13056 + }, + { + "epoch": 0.07765367779998097, + "grad_norm": 2.458508253097534, + "learning_rate": 4.925986209548466e-05, + "loss": 5.3307, + "step": 13057 + }, + { + "epoch": 0.07765962508326196, + "grad_norm": 2.23331880569458, + "learning_rate": 4.92597492750633e-05, + "loss": 5.6979, + "step": 13058 + }, + { + "epoch": 0.07766557236654296, + "grad_norm": 2.38264536857605, + "learning_rate": 4.9259636446173104e-05, + "loss": 5.5771, + "step": 13059 + }, + { + "epoch": 0.07767151964982397, + "grad_norm": 2.0892632007598877, + "learning_rate": 4.925952360881413e-05, + "loss": 5.8596, + "step": 13060 + }, + { + "epoch": 0.07767746693310495, + "grad_norm": 1.82732355594635, + "learning_rate": 4.92594107629864e-05, + "loss": 5.3724, + "step": 13061 + }, + { + "epoch": 0.07768341421638596, + "grad_norm": 1.821089506149292, + "learning_rate": 4.925929790868997e-05, + "loss": 5.6499, + "step": 13062 + }, + { + "epoch": 0.07768936149966696, + "grad_norm": 1.9662789106369019, + "learning_rate": 4.925918504592487e-05, + "loss": 5.5132, + "step": 13063 + }, + { + "epoch": 0.07769530878294795, + "grad_norm": 1.830101490020752, + "learning_rate": 4.925907217469113e-05, + "loss": 5.4492, + "step": 13064 + }, + { + "epoch": 0.07770125606622895, + "grad_norm": 1.8362375497817993, + "learning_rate": 4.9258959294988804e-05, + "loss": 5.8314, + "step": 13065 + }, + { + "epoch": 0.07770720334950994, + "grad_norm": 2.23861026763916, + "learning_rate": 4.9258846406817926e-05, + "loss": 6.2564, + "step": 13066 + }, + { + "epoch": 0.07771315063279094, + "grad_norm": 2.2672650814056396, + "learning_rate": 4.9258733510178536e-05, + "loss": 6.3396, + "step": 13067 + }, + { + "epoch": 0.07771909791607194, + "grad_norm": 1.8667620420455933, + "learning_rate": 4.9258620605070665e-05, + "loss": 5.8509, + "step": 13068 + }, + { + "epoch": 0.07772504519935293, + "grad_norm": 1.7386364936828613, + "learning_rate": 4.925850769149436e-05, + "loss": 5.567, + "step": 13069 + }, + { + "epoch": 0.07773099248263393, + "grad_norm": 1.3638315200805664, + "learning_rate": 4.9258394769449675e-05, + "loss": 5.6892, + "step": 13070 + }, + { + "epoch": 0.07773693976591493, + "grad_norm": 1.7117588520050049, + "learning_rate": 4.9258281838936624e-05, + "loss": 5.461, + "step": 13071 + }, + { + "epoch": 0.07774288704919592, + "grad_norm": 1.7597805261611938, + "learning_rate": 4.925816889995526e-05, + "loss": 5.6783, + "step": 13072 + }, + { + "epoch": 0.07774883433247692, + "grad_norm": 1.8734283447265625, + "learning_rate": 4.9258055952505624e-05, + "loss": 5.633, + "step": 13073 + }, + { + "epoch": 0.07775478161575793, + "grad_norm": 1.5552877187728882, + "learning_rate": 4.9257942996587744e-05, + "loss": 5.8804, + "step": 13074 + }, + { + "epoch": 0.07776072889903891, + "grad_norm": 1.2786669731140137, + "learning_rate": 4.925783003220167e-05, + "loss": 5.3208, + "step": 13075 + }, + { + "epoch": 0.07776667618231992, + "grad_norm": 1.558182954788208, + "learning_rate": 4.925771705934744e-05, + "loss": 5.4023, + "step": 13076 + }, + { + "epoch": 0.07777262346560092, + "grad_norm": 1.3482223749160767, + "learning_rate": 4.925760407802509e-05, + "loss": 5.3879, + "step": 13077 + }, + { + "epoch": 0.0777785707488819, + "grad_norm": 1.5111918449401855, + "learning_rate": 4.925749108823466e-05, + "loss": 5.329, + "step": 13078 + }, + { + "epoch": 0.07778451803216291, + "grad_norm": 1.7119463682174683, + "learning_rate": 4.925737808997619e-05, + "loss": 5.7282, + "step": 13079 + }, + { + "epoch": 0.07779046531544391, + "grad_norm": 1.7753342390060425, + "learning_rate": 4.925726508324972e-05, + "loss": 5.2677, + "step": 13080 + }, + { + "epoch": 0.0777964125987249, + "grad_norm": 1.8957557678222656, + "learning_rate": 4.925715206805529e-05, + "loss": 4.7193, + "step": 13081 + }, + { + "epoch": 0.0778023598820059, + "grad_norm": 2.503037214279175, + "learning_rate": 4.9257039044392935e-05, + "loss": 5.034, + "step": 13082 + }, + { + "epoch": 0.0778083071652869, + "grad_norm": 2.031312942504883, + "learning_rate": 4.92569260122627e-05, + "loss": 5.1982, + "step": 13083 + }, + { + "epoch": 0.07781425444856789, + "grad_norm": 1.8345115184783936, + "learning_rate": 4.9256812971664635e-05, + "loss": 5.6059, + "step": 13084 + }, + { + "epoch": 0.07782020173184889, + "grad_norm": 2.134131669998169, + "learning_rate": 4.925669992259875e-05, + "loss": 5.8174, + "step": 13085 + }, + { + "epoch": 0.0778261490151299, + "grad_norm": 1.9598990678787231, + "learning_rate": 4.9256586865065114e-05, + "loss": 5.76, + "step": 13086 + }, + { + "epoch": 0.07783209629841088, + "grad_norm": 1.8105463981628418, + "learning_rate": 4.925647379906375e-05, + "loss": 5.5112, + "step": 13087 + }, + { + "epoch": 0.07783804358169188, + "grad_norm": 1.5290614366531372, + "learning_rate": 4.9256360724594696e-05, + "loss": 5.7122, + "step": 13088 + }, + { + "epoch": 0.07784399086497289, + "grad_norm": 1.6188294887542725, + "learning_rate": 4.9256247641658005e-05, + "loss": 5.58, + "step": 13089 + }, + { + "epoch": 0.07784993814825387, + "grad_norm": 1.8662221431732178, + "learning_rate": 4.925613455025371e-05, + "loss": 5.4975, + "step": 13090 + }, + { + "epoch": 0.07785588543153488, + "grad_norm": 1.808813452720642, + "learning_rate": 4.925602145038184e-05, + "loss": 5.6704, + "step": 13091 + }, + { + "epoch": 0.07786183271481588, + "grad_norm": 1.776418924331665, + "learning_rate": 4.925590834204245e-05, + "loss": 5.7558, + "step": 13092 + }, + { + "epoch": 0.07786777999809687, + "grad_norm": 1.704537034034729, + "learning_rate": 4.925579522523557e-05, + "loss": 5.6667, + "step": 13093 + }, + { + "epoch": 0.07787372728137787, + "grad_norm": 2.115651845932007, + "learning_rate": 4.9255682099961246e-05, + "loss": 5.5823, + "step": 13094 + }, + { + "epoch": 0.07787967456465886, + "grad_norm": 1.851914882659912, + "learning_rate": 4.9255568966219504e-05, + "loss": 5.6749, + "step": 13095 + }, + { + "epoch": 0.07788562184793986, + "grad_norm": 1.8792526721954346, + "learning_rate": 4.92554558240104e-05, + "loss": 5.8539, + "step": 13096 + }, + { + "epoch": 0.07789156913122086, + "grad_norm": 1.805280327796936, + "learning_rate": 4.925534267333397e-05, + "loss": 5.8522, + "step": 13097 + }, + { + "epoch": 0.07789751641450185, + "grad_norm": 1.7457916736602783, + "learning_rate": 4.925522951419025e-05, + "loss": 5.9419, + "step": 13098 + }, + { + "epoch": 0.07790346369778285, + "grad_norm": 1.6427416801452637, + "learning_rate": 4.925511634657928e-05, + "loss": 5.8924, + "step": 13099 + }, + { + "epoch": 0.07790941098106385, + "grad_norm": 1.7034873962402344, + "learning_rate": 4.9255003170501095e-05, + "loss": 5.8701, + "step": 13100 + }, + { + "epoch": 0.07791535826434484, + "grad_norm": 1.6852953433990479, + "learning_rate": 4.925488998595574e-05, + "loss": 5.771, + "step": 13101 + }, + { + "epoch": 0.07792130554762584, + "grad_norm": 1.6478735208511353, + "learning_rate": 4.9254776792943255e-05, + "loss": 5.4274, + "step": 13102 + }, + { + "epoch": 0.07792725283090685, + "grad_norm": 1.5896925926208496, + "learning_rate": 4.925466359146368e-05, + "loss": 5.8217, + "step": 13103 + }, + { + "epoch": 0.07793320011418783, + "grad_norm": 1.649539828300476, + "learning_rate": 4.9254550381517054e-05, + "loss": 5.7899, + "step": 13104 + }, + { + "epoch": 0.07793914739746884, + "grad_norm": 1.5224459171295166, + "learning_rate": 4.925443716310341e-05, + "loss": 5.7931, + "step": 13105 + }, + { + "epoch": 0.07794509468074984, + "grad_norm": 2.009038209915161, + "learning_rate": 4.9254323936222796e-05, + "loss": 5.854, + "step": 13106 + }, + { + "epoch": 0.07795104196403083, + "grad_norm": 1.5545878410339355, + "learning_rate": 4.9254210700875245e-05, + "loss": 5.7212, + "step": 13107 + }, + { + "epoch": 0.07795698924731183, + "grad_norm": 2.0804193019866943, + "learning_rate": 4.92540974570608e-05, + "loss": 5.7195, + "step": 13108 + }, + { + "epoch": 0.07796293653059283, + "grad_norm": 1.940432071685791, + "learning_rate": 4.92539842047795e-05, + "loss": 5.4998, + "step": 13109 + }, + { + "epoch": 0.07796888381387382, + "grad_norm": 2.3788061141967773, + "learning_rate": 4.925387094403139e-05, + "loss": 5.5975, + "step": 13110 + }, + { + "epoch": 0.07797483109715482, + "grad_norm": 1.6193798780441284, + "learning_rate": 4.92537576748165e-05, + "loss": 5.4489, + "step": 13111 + }, + { + "epoch": 0.07798077838043582, + "grad_norm": 1.7056760787963867, + "learning_rate": 4.9253644397134866e-05, + "loss": 5.5584, + "step": 13112 + }, + { + "epoch": 0.07798672566371681, + "grad_norm": 1.2604116201400757, + "learning_rate": 4.925353111098655e-05, + "loss": 5.5681, + "step": 13113 + }, + { + "epoch": 0.07799267294699781, + "grad_norm": 1.305413842201233, + "learning_rate": 4.925341781637157e-05, + "loss": 5.6966, + "step": 13114 + }, + { + "epoch": 0.07799862023027881, + "grad_norm": 2.6248581409454346, + "learning_rate": 4.9253304513289975e-05, + "loss": 5.3666, + "step": 13115 + }, + { + "epoch": 0.0780045675135598, + "grad_norm": 1.687741994857788, + "learning_rate": 4.92531912017418e-05, + "loss": 5.5511, + "step": 13116 + }, + { + "epoch": 0.0780105147968408, + "grad_norm": 1.5827749967575073, + "learning_rate": 4.9253077881727086e-05, + "loss": 5.3363, + "step": 13117 + }, + { + "epoch": 0.0780164620801218, + "grad_norm": 1.5989108085632324, + "learning_rate": 4.925296455324587e-05, + "loss": 5.472, + "step": 13118 + }, + { + "epoch": 0.0780224093634028, + "grad_norm": 1.5687717199325562, + "learning_rate": 4.9252851216298194e-05, + "loss": 5.6894, + "step": 13119 + }, + { + "epoch": 0.0780283566466838, + "grad_norm": 1.312949538230896, + "learning_rate": 4.9252737870884106e-05, + "loss": 5.6735, + "step": 13120 + }, + { + "epoch": 0.0780343039299648, + "grad_norm": 1.5779353380203247, + "learning_rate": 4.925262451700363e-05, + "loss": 5.3281, + "step": 13121 + }, + { + "epoch": 0.07804025121324579, + "grad_norm": 1.6127909421920776, + "learning_rate": 4.9252511154656825e-05, + "loss": 5.27, + "step": 13122 + }, + { + "epoch": 0.07804619849652679, + "grad_norm": 1.6496199369430542, + "learning_rate": 4.925239778384371e-05, + "loss": 5.4913, + "step": 13123 + }, + { + "epoch": 0.07805214577980778, + "grad_norm": 2.394230842590332, + "learning_rate": 4.925228440456433e-05, + "loss": 5.1788, + "step": 13124 + }, + { + "epoch": 0.07805809306308878, + "grad_norm": 2.169250249862671, + "learning_rate": 4.925217101681873e-05, + "loss": 5.4087, + "step": 13125 + }, + { + "epoch": 0.07806404034636978, + "grad_norm": 2.150338649749756, + "learning_rate": 4.925205762060695e-05, + "loss": 5.5004, + "step": 13126 + }, + { + "epoch": 0.07806998762965077, + "grad_norm": 2.0131516456604004, + "learning_rate": 4.925194421592903e-05, + "loss": 5.5791, + "step": 13127 + }, + { + "epoch": 0.07807593491293177, + "grad_norm": 1.8154455423355103, + "learning_rate": 4.925183080278501e-05, + "loss": 5.5479, + "step": 13128 + }, + { + "epoch": 0.07808188219621277, + "grad_norm": 1.7489157915115356, + "learning_rate": 4.925171738117492e-05, + "loss": 5.7169, + "step": 13129 + }, + { + "epoch": 0.07808782947949376, + "grad_norm": 1.6712158918380737, + "learning_rate": 4.92516039510988e-05, + "loss": 6.0751, + "step": 13130 + }, + { + "epoch": 0.07809377676277476, + "grad_norm": 1.7542296648025513, + "learning_rate": 4.9251490512556706e-05, + "loss": 5.8998, + "step": 13131 + }, + { + "epoch": 0.07809972404605577, + "grad_norm": 1.5962193012237549, + "learning_rate": 4.9251377065548666e-05, + "loss": 5.7781, + "step": 13132 + }, + { + "epoch": 0.07810567132933675, + "grad_norm": 1.783756136894226, + "learning_rate": 4.9251263610074714e-05, + "loss": 5.8384, + "step": 13133 + }, + { + "epoch": 0.07811161861261776, + "grad_norm": 1.6608144044876099, + "learning_rate": 4.92511501461349e-05, + "loss": 5.7603, + "step": 13134 + }, + { + "epoch": 0.07811756589589876, + "grad_norm": 1.8659160137176514, + "learning_rate": 4.925103667372926e-05, + "loss": 5.5039, + "step": 13135 + }, + { + "epoch": 0.07812351317917975, + "grad_norm": 1.591565489768982, + "learning_rate": 4.925092319285783e-05, + "loss": 5.7034, + "step": 13136 + }, + { + "epoch": 0.07812946046246075, + "grad_norm": 1.5772358179092407, + "learning_rate": 4.925080970352066e-05, + "loss": 5.6347, + "step": 13137 + }, + { + "epoch": 0.07813540774574175, + "grad_norm": 1.7196561098098755, + "learning_rate": 4.925069620571778e-05, + "loss": 5.7086, + "step": 13138 + }, + { + "epoch": 0.07814135502902274, + "grad_norm": 1.9582041501998901, + "learning_rate": 4.9250582699449237e-05, + "loss": 5.9774, + "step": 13139 + }, + { + "epoch": 0.07814730231230374, + "grad_norm": 2.0566928386688232, + "learning_rate": 4.9250469184715064e-05, + "loss": 5.8527, + "step": 13140 + }, + { + "epoch": 0.07815324959558474, + "grad_norm": 1.9961296319961548, + "learning_rate": 4.92503556615153e-05, + "loss": 5.65, + "step": 13141 + }, + { + "epoch": 0.07815919687886573, + "grad_norm": 1.672601342201233, + "learning_rate": 4.925024212984999e-05, + "loss": 5.7242, + "step": 13142 + }, + { + "epoch": 0.07816514416214673, + "grad_norm": 1.6791996955871582, + "learning_rate": 4.9250128589719166e-05, + "loss": 5.7365, + "step": 13143 + }, + { + "epoch": 0.07817109144542773, + "grad_norm": 2.4464364051818848, + "learning_rate": 4.925001504112288e-05, + "loss": 4.9673, + "step": 13144 + }, + { + "epoch": 0.07817703872870872, + "grad_norm": 2.0053181648254395, + "learning_rate": 4.9249901484061156e-05, + "loss": 5.7916, + "step": 13145 + }, + { + "epoch": 0.07818298601198972, + "grad_norm": 2.512120246887207, + "learning_rate": 4.924978791853405e-05, + "loss": 5.914, + "step": 13146 + }, + { + "epoch": 0.07818893329527073, + "grad_norm": 2.2429497241973877, + "learning_rate": 4.924967434454159e-05, + "loss": 5.8806, + "step": 13147 + }, + { + "epoch": 0.07819488057855171, + "grad_norm": 1.9966307878494263, + "learning_rate": 4.924956076208381e-05, + "loss": 5.8883, + "step": 13148 + }, + { + "epoch": 0.07820082786183272, + "grad_norm": 2.492926836013794, + "learning_rate": 4.924944717116077e-05, + "loss": 5.361, + "step": 13149 + }, + { + "epoch": 0.07820677514511372, + "grad_norm": 2.050769090652466, + "learning_rate": 4.92493335717725e-05, + "loss": 5.5682, + "step": 13150 + }, + { + "epoch": 0.07821272242839471, + "grad_norm": 2.2797789573669434, + "learning_rate": 4.9249219963919037e-05, + "loss": 5.8695, + "step": 13151 + }, + { + "epoch": 0.07821866971167571, + "grad_norm": 2.1034891605377197, + "learning_rate": 4.924910634760041e-05, + "loss": 4.987, + "step": 13152 + }, + { + "epoch": 0.0782246169949567, + "grad_norm": 1.7718714475631714, + "learning_rate": 4.924899272281669e-05, + "loss": 5.112, + "step": 13153 + }, + { + "epoch": 0.0782305642782377, + "grad_norm": 1.730656385421753, + "learning_rate": 4.9248879089567884e-05, + "loss": 5.6589, + "step": 13154 + }, + { + "epoch": 0.0782365115615187, + "grad_norm": 1.7784979343414307, + "learning_rate": 4.9248765447854054e-05, + "loss": 5.6812, + "step": 13155 + }, + { + "epoch": 0.07824245884479969, + "grad_norm": 1.5646599531173706, + "learning_rate": 4.9248651797675213e-05, + "loss": 5.7598, + "step": 13156 + }, + { + "epoch": 0.07824840612808069, + "grad_norm": 2.6416964530944824, + "learning_rate": 4.924853813903144e-05, + "loss": 5.9888, + "step": 13157 + }, + { + "epoch": 0.0782543534113617, + "grad_norm": 1.978983998298645, + "learning_rate": 4.924842447192274e-05, + "loss": 5.8919, + "step": 13158 + }, + { + "epoch": 0.07826030069464268, + "grad_norm": 2.3622004985809326, + "learning_rate": 4.924831079634916e-05, + "loss": 5.706, + "step": 13159 + }, + { + "epoch": 0.07826624797792368, + "grad_norm": 2.4118547439575195, + "learning_rate": 4.9248197112310754e-05, + "loss": 5.529, + "step": 13160 + }, + { + "epoch": 0.07827219526120469, + "grad_norm": 1.9290462732315063, + "learning_rate": 4.9248083419807554e-05, + "loss": 5.6403, + "step": 13161 + }, + { + "epoch": 0.07827814254448567, + "grad_norm": 1.9591599702835083, + "learning_rate": 4.92479697188396e-05, + "loss": 5.3365, + "step": 13162 + }, + { + "epoch": 0.07828408982776668, + "grad_norm": 1.7800555229187012, + "learning_rate": 4.9247856009406924e-05, + "loss": 6.4051, + "step": 13163 + }, + { + "epoch": 0.07829003711104768, + "grad_norm": 1.8390953540802002, + "learning_rate": 4.924774229150958e-05, + "loss": 5.775, + "step": 13164 + }, + { + "epoch": 0.07829598439432867, + "grad_norm": 1.8265724182128906, + "learning_rate": 4.924762856514759e-05, + "loss": 6.1238, + "step": 13165 + }, + { + "epoch": 0.07830193167760967, + "grad_norm": 1.5573666095733643, + "learning_rate": 4.9247514830321005e-05, + "loss": 5.9823, + "step": 13166 + }, + { + "epoch": 0.07830787896089067, + "grad_norm": 2.2647573947906494, + "learning_rate": 4.924740108702987e-05, + "loss": 5.0975, + "step": 13167 + }, + { + "epoch": 0.07831382624417166, + "grad_norm": 2.509573459625244, + "learning_rate": 4.924728733527422e-05, + "loss": 5.1327, + "step": 13168 + }, + { + "epoch": 0.07831977352745266, + "grad_norm": 2.2974681854248047, + "learning_rate": 4.924717357505408e-05, + "loss": 5.1493, + "step": 13169 + }, + { + "epoch": 0.07832572081073366, + "grad_norm": 1.958938717842102, + "learning_rate": 4.924705980636951e-05, + "loss": 6.0291, + "step": 13170 + }, + { + "epoch": 0.07833166809401465, + "grad_norm": 1.7714133262634277, + "learning_rate": 4.924694602922054e-05, + "loss": 5.9623, + "step": 13171 + }, + { + "epoch": 0.07833761537729565, + "grad_norm": 1.7545043230056763, + "learning_rate": 4.924683224360721e-05, + "loss": 5.9123, + "step": 13172 + }, + { + "epoch": 0.07834356266057665, + "grad_norm": 1.4791491031646729, + "learning_rate": 4.924671844952957e-05, + "loss": 5.8959, + "step": 13173 + }, + { + "epoch": 0.07834950994385764, + "grad_norm": 1.783353567123413, + "learning_rate": 4.924660464698764e-05, + "loss": 5.732, + "step": 13174 + }, + { + "epoch": 0.07835545722713864, + "grad_norm": 1.9444235563278198, + "learning_rate": 4.9246490835981474e-05, + "loss": 5.5167, + "step": 13175 + }, + { + "epoch": 0.07836140451041965, + "grad_norm": 1.9656537771224976, + "learning_rate": 4.924637701651111e-05, + "loss": 5.4557, + "step": 13176 + }, + { + "epoch": 0.07836735179370063, + "grad_norm": 1.8164803981781006, + "learning_rate": 4.9246263188576594e-05, + "loss": 5.44, + "step": 13177 + }, + { + "epoch": 0.07837329907698164, + "grad_norm": 1.8245429992675781, + "learning_rate": 4.9246149352177946e-05, + "loss": 5.2164, + "step": 13178 + }, + { + "epoch": 0.07837924636026264, + "grad_norm": 1.76225745677948, + "learning_rate": 4.924603550731522e-05, + "loss": 5.2325, + "step": 13179 + }, + { + "epoch": 0.07838519364354363, + "grad_norm": 2.052314519882202, + "learning_rate": 4.924592165398846e-05, + "loss": 5.7905, + "step": 13180 + }, + { + "epoch": 0.07839114092682463, + "grad_norm": 1.63084077835083, + "learning_rate": 4.924580779219769e-05, + "loss": 5.2703, + "step": 13181 + }, + { + "epoch": 0.07839708821010562, + "grad_norm": 1.9269503355026245, + "learning_rate": 4.9245693921942965e-05, + "loss": 5.5974, + "step": 13182 + }, + { + "epoch": 0.07840303549338662, + "grad_norm": 2.201376438140869, + "learning_rate": 4.9245580043224315e-05, + "loss": 5.1298, + "step": 13183 + }, + { + "epoch": 0.07840898277666762, + "grad_norm": 2.3778293132781982, + "learning_rate": 4.924546615604179e-05, + "loss": 5.2289, + "step": 13184 + }, + { + "epoch": 0.07841493005994861, + "grad_norm": 2.5284171104431152, + "learning_rate": 4.9245352260395414e-05, + "loss": 5.0038, + "step": 13185 + }, + { + "epoch": 0.07842087734322961, + "grad_norm": 2.230825424194336, + "learning_rate": 4.9245238356285244e-05, + "loss": 5.0699, + "step": 13186 + }, + { + "epoch": 0.07842682462651061, + "grad_norm": 2.1288161277770996, + "learning_rate": 4.924512444371131e-05, + "loss": 5.1093, + "step": 13187 + }, + { + "epoch": 0.0784327719097916, + "grad_norm": 1.912685751914978, + "learning_rate": 4.924501052267365e-05, + "loss": 5.5926, + "step": 13188 + }, + { + "epoch": 0.0784387191930726, + "grad_norm": 2.394078254699707, + "learning_rate": 4.924489659317231e-05, + "loss": 5.129, + "step": 13189 + }, + { + "epoch": 0.0784446664763536, + "grad_norm": 2.7360801696777344, + "learning_rate": 4.924478265520733e-05, + "loss": 4.9682, + "step": 13190 + }, + { + "epoch": 0.0784506137596346, + "grad_norm": 2.4817416667938232, + "learning_rate": 4.924466870877874e-05, + "loss": 5.0193, + "step": 13191 + }, + { + "epoch": 0.0784565610429156, + "grad_norm": 2.5156679153442383, + "learning_rate": 4.92445547538866e-05, + "loss": 5.0044, + "step": 13192 + }, + { + "epoch": 0.0784625083261966, + "grad_norm": 2.519080638885498, + "learning_rate": 4.924444079053092e-05, + "loss": 5.0109, + "step": 13193 + }, + { + "epoch": 0.07846845560947759, + "grad_norm": 2.3944201469421387, + "learning_rate": 4.924432681871176e-05, + "loss": 5.0032, + "step": 13194 + }, + { + "epoch": 0.07847440289275859, + "grad_norm": 2.4199647903442383, + "learning_rate": 4.924421283842916e-05, + "loss": 4.8158, + "step": 13195 + }, + { + "epoch": 0.07848035017603959, + "grad_norm": 2.4517173767089844, + "learning_rate": 4.924409884968316e-05, + "loss": 4.8194, + "step": 13196 + }, + { + "epoch": 0.07848629745932058, + "grad_norm": 2.231703042984009, + "learning_rate": 4.924398485247379e-05, + "loss": 4.882, + "step": 13197 + }, + { + "epoch": 0.07849224474260158, + "grad_norm": 2.218252182006836, + "learning_rate": 4.924387084680109e-05, + "loss": 4.872, + "step": 13198 + }, + { + "epoch": 0.07849819202588258, + "grad_norm": 2.2126224040985107, + "learning_rate": 4.924375683266511e-05, + "loss": 5.019, + "step": 13199 + }, + { + "epoch": 0.07850413930916357, + "grad_norm": 2.197240114212036, + "learning_rate": 4.924364281006589e-05, + "loss": 4.9801, + "step": 13200 + }, + { + "epoch": 0.07851008659244457, + "grad_norm": 2.11427640914917, + "learning_rate": 4.9243528779003456e-05, + "loss": 4.992, + "step": 13201 + }, + { + "epoch": 0.07851603387572557, + "grad_norm": 1.9424201250076294, + "learning_rate": 4.9243414739477864e-05, + "loss": 4.9275, + "step": 13202 + }, + { + "epoch": 0.07852198115900656, + "grad_norm": 1.897208571434021, + "learning_rate": 4.9243300691489146e-05, + "loss": 5.0482, + "step": 13203 + }, + { + "epoch": 0.07852792844228756, + "grad_norm": 1.7149171829223633, + "learning_rate": 4.924318663503734e-05, + "loss": 5.4713, + "step": 13204 + }, + { + "epoch": 0.07853387572556857, + "grad_norm": 1.770279049873352, + "learning_rate": 4.924307257012248e-05, + "loss": 5.5565, + "step": 13205 + }, + { + "epoch": 0.07853982300884955, + "grad_norm": 2.043506145477295, + "learning_rate": 4.924295849674463e-05, + "loss": 4.9129, + "step": 13206 + }, + { + "epoch": 0.07854577029213056, + "grad_norm": 1.91255521774292, + "learning_rate": 4.92428444149038e-05, + "loss": 5.5405, + "step": 13207 + }, + { + "epoch": 0.07855171757541156, + "grad_norm": 2.371006965637207, + "learning_rate": 4.924273032460005e-05, + "loss": 5.8047, + "step": 13208 + }, + { + "epoch": 0.07855766485869255, + "grad_norm": 2.1126253604888916, + "learning_rate": 4.9242616225833416e-05, + "loss": 5.6397, + "step": 13209 + }, + { + "epoch": 0.07856361214197355, + "grad_norm": 1.9398634433746338, + "learning_rate": 4.9242502118603925e-05, + "loss": 5.7703, + "step": 13210 + }, + { + "epoch": 0.07856955942525454, + "grad_norm": 1.7660777568817139, + "learning_rate": 4.924238800291164e-05, + "loss": 5.6485, + "step": 13211 + }, + { + "epoch": 0.07857550670853554, + "grad_norm": 1.835633397102356, + "learning_rate": 4.924227387875658e-05, + "loss": 5.701, + "step": 13212 + }, + { + "epoch": 0.07858145399181654, + "grad_norm": 1.8192920684814453, + "learning_rate": 4.9242159746138796e-05, + "loss": 5.5682, + "step": 13213 + }, + { + "epoch": 0.07858740127509753, + "grad_norm": 1.8342156410217285, + "learning_rate": 4.924204560505832e-05, + "loss": 5.2546, + "step": 13214 + }, + { + "epoch": 0.07859334855837853, + "grad_norm": 1.855446696281433, + "learning_rate": 4.92419314555152e-05, + "loss": 5.7471, + "step": 13215 + }, + { + "epoch": 0.07859929584165953, + "grad_norm": 1.7786341905593872, + "learning_rate": 4.924181729750946e-05, + "loss": 5.8774, + "step": 13216 + }, + { + "epoch": 0.07860524312494052, + "grad_norm": 1.7919361591339111, + "learning_rate": 4.9241703131041175e-05, + "loss": 5.7796, + "step": 13217 + }, + { + "epoch": 0.07861119040822152, + "grad_norm": 2.1065824031829834, + "learning_rate": 4.924158895611034e-05, + "loss": 5.2471, + "step": 13218 + }, + { + "epoch": 0.07861713769150253, + "grad_norm": 2.18803334236145, + "learning_rate": 4.9241474772717036e-05, + "loss": 4.8654, + "step": 13219 + }, + { + "epoch": 0.07862308497478351, + "grad_norm": 2.156651020050049, + "learning_rate": 4.924136058086127e-05, + "loss": 4.7614, + "step": 13220 + }, + { + "epoch": 0.07862903225806452, + "grad_norm": 2.098242998123169, + "learning_rate": 4.9241246380543095e-05, + "loss": 4.8152, + "step": 13221 + }, + { + "epoch": 0.07863497954134552, + "grad_norm": 1.9857498407363892, + "learning_rate": 4.924113217176256e-05, + "loss": 4.7955, + "step": 13222 + }, + { + "epoch": 0.0786409268246265, + "grad_norm": 2.046926259994507, + "learning_rate": 4.9241017954519685e-05, + "loss": 4.9851, + "step": 13223 + }, + { + "epoch": 0.07864687410790751, + "grad_norm": 1.804005742073059, + "learning_rate": 4.924090372881454e-05, + "loss": 5.5084, + "step": 13224 + }, + { + "epoch": 0.07865282139118851, + "grad_norm": 1.8413509130477905, + "learning_rate": 4.924078949464713e-05, + "loss": 5.462, + "step": 13225 + }, + { + "epoch": 0.0786587686744695, + "grad_norm": 1.7599927186965942, + "learning_rate": 4.924067525201751e-05, + "loss": 5.4255, + "step": 13226 + }, + { + "epoch": 0.0786647159577505, + "grad_norm": 1.7645682096481323, + "learning_rate": 4.924056100092573e-05, + "loss": 5.4837, + "step": 13227 + }, + { + "epoch": 0.0786706632410315, + "grad_norm": 1.7478766441345215, + "learning_rate": 4.924044674137182e-05, + "loss": 5.2957, + "step": 13228 + }, + { + "epoch": 0.07867661052431249, + "grad_norm": 1.7865453958511353, + "learning_rate": 4.924033247335581e-05, + "loss": 5.1909, + "step": 13229 + }, + { + "epoch": 0.07868255780759349, + "grad_norm": 1.8167400360107422, + "learning_rate": 4.924021819687776e-05, + "loss": 5.2732, + "step": 13230 + }, + { + "epoch": 0.0786885050908745, + "grad_norm": 1.8745819330215454, + "learning_rate": 4.92401039119377e-05, + "loss": 5.3222, + "step": 13231 + }, + { + "epoch": 0.07869445237415548, + "grad_norm": 1.7355458736419678, + "learning_rate": 4.9239989618535665e-05, + "loss": 5.4142, + "step": 13232 + }, + { + "epoch": 0.07870039965743648, + "grad_norm": 1.7634247541427612, + "learning_rate": 4.9239875316671705e-05, + "loss": 5.3114, + "step": 13233 + }, + { + "epoch": 0.07870634694071749, + "grad_norm": 1.8516123294830322, + "learning_rate": 4.9239761006345845e-05, + "loss": 5.3014, + "step": 13234 + }, + { + "epoch": 0.07871229422399847, + "grad_norm": 1.8192317485809326, + "learning_rate": 4.9239646687558146e-05, + "loss": 5.407, + "step": 13235 + }, + { + "epoch": 0.07871824150727948, + "grad_norm": 1.6944139003753662, + "learning_rate": 4.923953236030863e-05, + "loss": 5.4235, + "step": 13236 + }, + { + "epoch": 0.07872418879056048, + "grad_norm": 1.681746006011963, + "learning_rate": 4.923941802459735e-05, + "loss": 5.3367, + "step": 13237 + }, + { + "epoch": 0.07873013607384147, + "grad_norm": 1.6417745351791382, + "learning_rate": 4.9239303680424334e-05, + "loss": 5.253, + "step": 13238 + }, + { + "epoch": 0.07873608335712247, + "grad_norm": 1.6522557735443115, + "learning_rate": 4.9239189327789626e-05, + "loss": 5.0855, + "step": 13239 + }, + { + "epoch": 0.07874203064040346, + "grad_norm": 1.7547293901443481, + "learning_rate": 4.9239074966693275e-05, + "loss": 5.9017, + "step": 13240 + }, + { + "epoch": 0.07874797792368446, + "grad_norm": 1.998478889465332, + "learning_rate": 4.923896059713531e-05, + "loss": 5.4774, + "step": 13241 + }, + { + "epoch": 0.07875392520696546, + "grad_norm": 1.869710922241211, + "learning_rate": 4.9238846219115774e-05, + "loss": 5.4591, + "step": 13242 + }, + { + "epoch": 0.07875987249024645, + "grad_norm": 1.8957170248031616, + "learning_rate": 4.923873183263471e-05, + "loss": 5.2823, + "step": 13243 + }, + { + "epoch": 0.07876581977352745, + "grad_norm": 1.9052289724349976, + "learning_rate": 4.9238617437692146e-05, + "loss": 5.4753, + "step": 13244 + }, + { + "epoch": 0.07877176705680845, + "grad_norm": 1.8786853551864624, + "learning_rate": 4.923850303428814e-05, + "loss": 5.2234, + "step": 13245 + }, + { + "epoch": 0.07877771434008944, + "grad_norm": 2.298356533050537, + "learning_rate": 4.923838862242271e-05, + "loss": 4.7138, + "step": 13246 + }, + { + "epoch": 0.07878366162337044, + "grad_norm": 2.1191911697387695, + "learning_rate": 4.923827420209592e-05, + "loss": 4.6354, + "step": 13247 + }, + { + "epoch": 0.07878960890665145, + "grad_norm": 2.1735050678253174, + "learning_rate": 4.923815977330781e-05, + "loss": 4.454, + "step": 13248 + }, + { + "epoch": 0.07879555618993243, + "grad_norm": 2.0126335620880127, + "learning_rate": 4.923804533605839e-05, + "loss": 4.3387, + "step": 13249 + }, + { + "epoch": 0.07880150347321344, + "grad_norm": 2.00081729888916, + "learning_rate": 4.9237930890347726e-05, + "loss": 4.4009, + "step": 13250 + }, + { + "epoch": 0.07880745075649444, + "grad_norm": 2.198625326156616, + "learning_rate": 4.923781643617586e-05, + "loss": 4.4334, + "step": 13251 + }, + { + "epoch": 0.07881339803977543, + "grad_norm": 2.0630993843078613, + "learning_rate": 4.923770197354281e-05, + "loss": 4.6349, + "step": 13252 + }, + { + "epoch": 0.07881934532305643, + "grad_norm": 1.7470935583114624, + "learning_rate": 4.923758750244863e-05, + "loss": 5.1363, + "step": 13253 + }, + { + "epoch": 0.07882529260633743, + "grad_norm": 1.5461190938949585, + "learning_rate": 4.923747302289335e-05, + "loss": 5.7365, + "step": 13254 + }, + { + "epoch": 0.07883123988961842, + "grad_norm": 1.800528645515442, + "learning_rate": 4.9237358534877036e-05, + "loss": 5.949, + "step": 13255 + }, + { + "epoch": 0.07883718717289942, + "grad_norm": 2.096055746078491, + "learning_rate": 4.923724403839971e-05, + "loss": 5.4203, + "step": 13256 + }, + { + "epoch": 0.07884313445618042, + "grad_norm": 2.0838513374328613, + "learning_rate": 4.92371295334614e-05, + "loss": 5.0542, + "step": 13257 + }, + { + "epoch": 0.07884908173946141, + "grad_norm": 1.711534023284912, + "learning_rate": 4.923701502006217e-05, + "loss": 5.7168, + "step": 13258 + }, + { + "epoch": 0.07885502902274241, + "grad_norm": 1.6610822677612305, + "learning_rate": 4.9236900498202035e-05, + "loss": 5.5605, + "step": 13259 + }, + { + "epoch": 0.07886097630602341, + "grad_norm": 1.549854040145874, + "learning_rate": 4.9236785967881064e-05, + "loss": 5.7792, + "step": 13260 + }, + { + "epoch": 0.0788669235893044, + "grad_norm": 1.9194339513778687, + "learning_rate": 4.923667142909927e-05, + "loss": 5.5481, + "step": 13261 + }, + { + "epoch": 0.0788728708725854, + "grad_norm": 1.6644178628921509, + "learning_rate": 4.923655688185671e-05, + "loss": 5.7271, + "step": 13262 + }, + { + "epoch": 0.0788788181558664, + "grad_norm": 1.820898175239563, + "learning_rate": 4.9236442326153414e-05, + "loss": 6.2458, + "step": 13263 + }, + { + "epoch": 0.0788847654391474, + "grad_norm": 1.732539176940918, + "learning_rate": 4.923632776198943e-05, + "loss": 5.5854, + "step": 13264 + }, + { + "epoch": 0.0788907127224284, + "grad_norm": 1.769140601158142, + "learning_rate": 4.923621318936479e-05, + "loss": 5.5511, + "step": 13265 + }, + { + "epoch": 0.0788966600057094, + "grad_norm": 1.728833556175232, + "learning_rate": 4.923609860827955e-05, + "loss": 5.6215, + "step": 13266 + }, + { + "epoch": 0.07890260728899039, + "grad_norm": 1.5940407514572144, + "learning_rate": 4.923598401873373e-05, + "loss": 5.6572, + "step": 13267 + }, + { + "epoch": 0.07890855457227139, + "grad_norm": 2.153200149536133, + "learning_rate": 4.923586942072737e-05, + "loss": 5.0235, + "step": 13268 + }, + { + "epoch": 0.07891450185555238, + "grad_norm": 1.6448415517807007, + "learning_rate": 4.9235754814260526e-05, + "loss": 5.5353, + "step": 13269 + }, + { + "epoch": 0.07892044913883338, + "grad_norm": 1.706984281539917, + "learning_rate": 4.9235640199333235e-05, + "loss": 5.5278, + "step": 13270 + }, + { + "epoch": 0.07892639642211438, + "grad_norm": 1.6129798889160156, + "learning_rate": 4.923552557594553e-05, + "loss": 5.4643, + "step": 13271 + }, + { + "epoch": 0.07893234370539537, + "grad_norm": 1.612748384475708, + "learning_rate": 4.923541094409745e-05, + "loss": 5.4994, + "step": 13272 + }, + { + "epoch": 0.07893829098867637, + "grad_norm": 1.6947647333145142, + "learning_rate": 4.923529630378904e-05, + "loss": 5.5117, + "step": 13273 + }, + { + "epoch": 0.07894423827195737, + "grad_norm": 1.629684567451477, + "learning_rate": 4.9235181655020336e-05, + "loss": 5.4266, + "step": 13274 + }, + { + "epoch": 0.07895018555523836, + "grad_norm": 1.6417474746704102, + "learning_rate": 4.923506699779139e-05, + "loss": 5.4803, + "step": 13275 + }, + { + "epoch": 0.07895613283851936, + "grad_norm": 1.5188243389129639, + "learning_rate": 4.9234952332102226e-05, + "loss": 5.4066, + "step": 13276 + }, + { + "epoch": 0.07896208012180037, + "grad_norm": 1.4906466007232666, + "learning_rate": 4.9234837657952885e-05, + "loss": 5.4622, + "step": 13277 + }, + { + "epoch": 0.07896802740508135, + "grad_norm": 1.745351791381836, + "learning_rate": 4.9234722975343414e-05, + "loss": 5.458, + "step": 13278 + }, + { + "epoch": 0.07897397468836236, + "grad_norm": 1.734399676322937, + "learning_rate": 4.9234608284273866e-05, + "loss": 5.3542, + "step": 13279 + }, + { + "epoch": 0.07897992197164336, + "grad_norm": 2.396031379699707, + "learning_rate": 4.9234493584744254e-05, + "loss": 5.0978, + "step": 13280 + }, + { + "epoch": 0.07898586925492435, + "grad_norm": 2.0151939392089844, + "learning_rate": 4.9234378876754626e-05, + "loss": 5.5051, + "step": 13281 + }, + { + "epoch": 0.07899181653820535, + "grad_norm": 2.1796762943267822, + "learning_rate": 4.9234264160305036e-05, + "loss": 5.2788, + "step": 13282 + }, + { + "epoch": 0.07899776382148635, + "grad_norm": 2.069291830062866, + "learning_rate": 4.923414943539552e-05, + "loss": 5.4454, + "step": 13283 + }, + { + "epoch": 0.07900371110476734, + "grad_norm": 2.034498929977417, + "learning_rate": 4.92340347020261e-05, + "loss": 5.3849, + "step": 13284 + }, + { + "epoch": 0.07900965838804834, + "grad_norm": 1.8353052139282227, + "learning_rate": 4.9233919960196835e-05, + "loss": 5.3975, + "step": 13285 + }, + { + "epoch": 0.07901560567132934, + "grad_norm": 1.9896777868270874, + "learning_rate": 4.923380520990776e-05, + "loss": 5.1199, + "step": 13286 + }, + { + "epoch": 0.07902155295461033, + "grad_norm": 1.9539830684661865, + "learning_rate": 4.923369045115891e-05, + "loss": 5.3908, + "step": 13287 + }, + { + "epoch": 0.07902750023789133, + "grad_norm": 1.682651162147522, + "learning_rate": 4.923357568395033e-05, + "loss": 5.4719, + "step": 13288 + }, + { + "epoch": 0.07903344752117233, + "grad_norm": 2.0095672607421875, + "learning_rate": 4.923346090828206e-05, + "loss": 5.9258, + "step": 13289 + }, + { + "epoch": 0.07903939480445332, + "grad_norm": 1.7949076890945435, + "learning_rate": 4.923334612415413e-05, + "loss": 5.646, + "step": 13290 + }, + { + "epoch": 0.07904534208773432, + "grad_norm": 2.1651079654693604, + "learning_rate": 4.92332313315666e-05, + "loss": 5.2527, + "step": 13291 + }, + { + "epoch": 0.07905128937101533, + "grad_norm": 2.0362184047698975, + "learning_rate": 4.92331165305195e-05, + "loss": 5.2671, + "step": 13292 + }, + { + "epoch": 0.07905723665429631, + "grad_norm": 1.5425541400909424, + "learning_rate": 4.923300172101287e-05, + "loss": 5.5149, + "step": 13293 + }, + { + "epoch": 0.07906318393757732, + "grad_norm": 2.13031005859375, + "learning_rate": 4.923288690304675e-05, + "loss": 5.9304, + "step": 13294 + }, + { + "epoch": 0.07906913122085832, + "grad_norm": 2.165199041366577, + "learning_rate": 4.923277207662117e-05, + "loss": 5.9153, + "step": 13295 + }, + { + "epoch": 0.0790750785041393, + "grad_norm": 2.1479499340057373, + "learning_rate": 4.923265724173619e-05, + "loss": 5.7215, + "step": 13296 + }, + { + "epoch": 0.07908102578742031, + "grad_norm": 1.8908145427703857, + "learning_rate": 4.923254239839183e-05, + "loss": 5.5801, + "step": 13297 + }, + { + "epoch": 0.0790869730707013, + "grad_norm": 1.7739901542663574, + "learning_rate": 4.9232427546588145e-05, + "loss": 5.283, + "step": 13298 + }, + { + "epoch": 0.0790929203539823, + "grad_norm": 1.8153715133666992, + "learning_rate": 4.9232312686325175e-05, + "loss": 5.4626, + "step": 13299 + }, + { + "epoch": 0.0790988676372633, + "grad_norm": 1.7070518732070923, + "learning_rate": 4.923219781760295e-05, + "loss": 5.5246, + "step": 13300 + }, + { + "epoch": 0.07910481492054429, + "grad_norm": 2.161536455154419, + "learning_rate": 4.923208294042152e-05, + "loss": 5.6865, + "step": 13301 + }, + { + "epoch": 0.07911076220382529, + "grad_norm": 2.5373623371124268, + "learning_rate": 4.9231968054780905e-05, + "loss": 5.8634, + "step": 13302 + }, + { + "epoch": 0.0791167094871063, + "grad_norm": 2.4957666397094727, + "learning_rate": 4.923185316068117e-05, + "loss": 4.9065, + "step": 13303 + }, + { + "epoch": 0.07912265677038728, + "grad_norm": 2.260540246963501, + "learning_rate": 4.923173825812235e-05, + "loss": 5.0815, + "step": 13304 + }, + { + "epoch": 0.07912860405366828, + "grad_norm": 2.406765937805176, + "learning_rate": 4.923162334710448e-05, + "loss": 4.8599, + "step": 13305 + }, + { + "epoch": 0.07913455133694929, + "grad_norm": 2.282153606414795, + "learning_rate": 4.923150842762759e-05, + "loss": 5.1024, + "step": 13306 + }, + { + "epoch": 0.07914049862023027, + "grad_norm": 1.8351432085037231, + "learning_rate": 4.9231393499691744e-05, + "loss": 5.3715, + "step": 13307 + }, + { + "epoch": 0.07914644590351128, + "grad_norm": 1.8290963172912598, + "learning_rate": 4.9231278563296965e-05, + "loss": 5.4456, + "step": 13308 + }, + { + "epoch": 0.07915239318679228, + "grad_norm": 1.7157766819000244, + "learning_rate": 4.923116361844329e-05, + "loss": 5.4952, + "step": 13309 + }, + { + "epoch": 0.07915834047007327, + "grad_norm": 2.051391124725342, + "learning_rate": 4.923104866513077e-05, + "loss": 5.7754, + "step": 13310 + }, + { + "epoch": 0.07916428775335427, + "grad_norm": 1.8714796304702759, + "learning_rate": 4.923093370335944e-05, + "loss": 5.4118, + "step": 13311 + }, + { + "epoch": 0.07917023503663527, + "grad_norm": 2.4251246452331543, + "learning_rate": 4.923081873312935e-05, + "loss": 4.9677, + "step": 13312 + }, + { + "epoch": 0.07917618231991626, + "grad_norm": 3.490328550338745, + "learning_rate": 4.923070375444052e-05, + "loss": 4.5336, + "step": 13313 + }, + { + "epoch": 0.07918212960319726, + "grad_norm": 2.820434331893921, + "learning_rate": 4.9230588767293004e-05, + "loss": 4.2865, + "step": 13314 + }, + { + "epoch": 0.07918807688647826, + "grad_norm": 2.3713653087615967, + "learning_rate": 4.923047377168685e-05, + "loss": 4.2558, + "step": 13315 + }, + { + "epoch": 0.07919402416975925, + "grad_norm": 2.484199285507202, + "learning_rate": 4.923035876762208e-05, + "loss": 3.9565, + "step": 13316 + }, + { + "epoch": 0.07919997145304025, + "grad_norm": 2.771982431411743, + "learning_rate": 4.9230243755098735e-05, + "loss": 3.9478, + "step": 13317 + }, + { + "epoch": 0.07920591873632125, + "grad_norm": 2.613006591796875, + "learning_rate": 4.9230128734116874e-05, + "loss": 4.0285, + "step": 13318 + }, + { + "epoch": 0.07921186601960224, + "grad_norm": 2.378276824951172, + "learning_rate": 4.923001370467653e-05, + "loss": 4.129, + "step": 13319 + }, + { + "epoch": 0.07921781330288324, + "grad_norm": 2.6948869228363037, + "learning_rate": 4.922989866677772e-05, + "loss": 5.7581, + "step": 13320 + }, + { + "epoch": 0.07922376058616425, + "grad_norm": 2.058387517929077, + "learning_rate": 4.922978362042051e-05, + "loss": 5.7589, + "step": 13321 + }, + { + "epoch": 0.07922970786944523, + "grad_norm": 2.2277138233184814, + "learning_rate": 4.9229668565604936e-05, + "loss": 5.691, + "step": 13322 + }, + { + "epoch": 0.07923565515272624, + "grad_norm": 1.827525019645691, + "learning_rate": 4.922955350233104e-05, + "loss": 5.6555, + "step": 13323 + }, + { + "epoch": 0.07924160243600724, + "grad_norm": 1.5456974506378174, + "learning_rate": 4.922943843059885e-05, + "loss": 5.445, + "step": 13324 + }, + { + "epoch": 0.07924754971928823, + "grad_norm": 1.859805703163147, + "learning_rate": 4.922932335040842e-05, + "loss": 5.5864, + "step": 13325 + }, + { + "epoch": 0.07925349700256923, + "grad_norm": 2.0083398818969727, + "learning_rate": 4.922920826175977e-05, + "loss": 5.7598, + "step": 13326 + }, + { + "epoch": 0.07925944428585022, + "grad_norm": 1.9759368896484375, + "learning_rate": 4.922909316465296e-05, + "loss": 5.7778, + "step": 13327 + }, + { + "epoch": 0.07926539156913122, + "grad_norm": 1.9937580823898315, + "learning_rate": 4.9228978059088035e-05, + "loss": 5.7291, + "step": 13328 + }, + { + "epoch": 0.07927133885241222, + "grad_norm": 2.6860668659210205, + "learning_rate": 4.922886294506501e-05, + "loss": 5.0277, + "step": 13329 + }, + { + "epoch": 0.07927728613569321, + "grad_norm": 2.03318190574646, + "learning_rate": 4.9228747822583945e-05, + "loss": 5.2387, + "step": 13330 + }, + { + "epoch": 0.07928323341897421, + "grad_norm": 2.250929117202759, + "learning_rate": 4.9228632691644874e-05, + "loss": 5.2348, + "step": 13331 + }, + { + "epoch": 0.07928918070225521, + "grad_norm": 2.0255093574523926, + "learning_rate": 4.922851755224784e-05, + "loss": 5.6585, + "step": 13332 + }, + { + "epoch": 0.0792951279855362, + "grad_norm": 1.9353551864624023, + "learning_rate": 4.922840240439288e-05, + "loss": 5.3989, + "step": 13333 + }, + { + "epoch": 0.0793010752688172, + "grad_norm": 1.9392589330673218, + "learning_rate": 4.922828724808003e-05, + "loss": 5.9127, + "step": 13334 + }, + { + "epoch": 0.0793070225520982, + "grad_norm": 2.312340021133423, + "learning_rate": 4.922817208330934e-05, + "loss": 5.656, + "step": 13335 + }, + { + "epoch": 0.0793129698353792, + "grad_norm": 2.1480720043182373, + "learning_rate": 4.9228056910080845e-05, + "loss": 5.4582, + "step": 13336 + }, + { + "epoch": 0.0793189171186602, + "grad_norm": 2.0460312366485596, + "learning_rate": 4.922794172839458e-05, + "loss": 5.5177, + "step": 13337 + }, + { + "epoch": 0.0793248644019412, + "grad_norm": 1.8319480419158936, + "learning_rate": 4.92278265382506e-05, + "loss": 5.5872, + "step": 13338 + }, + { + "epoch": 0.07933081168522219, + "grad_norm": 1.610379934310913, + "learning_rate": 4.922771133964893e-05, + "loss": 5.5398, + "step": 13339 + }, + { + "epoch": 0.07933675896850319, + "grad_norm": 1.767022728919983, + "learning_rate": 4.9227596132589616e-05, + "loss": 6.0004, + "step": 13340 + }, + { + "epoch": 0.07934270625178419, + "grad_norm": 2.108621835708618, + "learning_rate": 4.92274809170727e-05, + "loss": 5.1513, + "step": 13341 + }, + { + "epoch": 0.07934865353506518, + "grad_norm": 2.2562835216522217, + "learning_rate": 4.922736569309822e-05, + "loss": 4.7642, + "step": 13342 + }, + { + "epoch": 0.07935460081834618, + "grad_norm": 1.7953063249588013, + "learning_rate": 4.922725046066622e-05, + "loss": 5.2453, + "step": 13343 + }, + { + "epoch": 0.07936054810162718, + "grad_norm": 1.8957513570785522, + "learning_rate": 4.922713521977673e-05, + "loss": 5.0673, + "step": 13344 + }, + { + "epoch": 0.07936649538490817, + "grad_norm": 1.8375275135040283, + "learning_rate": 4.922701997042981e-05, + "loss": 5.0301, + "step": 13345 + }, + { + "epoch": 0.07937244266818917, + "grad_norm": 2.306138515472412, + "learning_rate": 4.9226904712625473e-05, + "loss": 4.7415, + "step": 13346 + }, + { + "epoch": 0.07937838995147017, + "grad_norm": 2.058403730392456, + "learning_rate": 4.922678944636379e-05, + "loss": 5.4454, + "step": 13347 + }, + { + "epoch": 0.07938433723475116, + "grad_norm": 1.9230997562408447, + "learning_rate": 4.922667417164477e-05, + "loss": 5.3755, + "step": 13348 + }, + { + "epoch": 0.07939028451803216, + "grad_norm": 1.9053308963775635, + "learning_rate": 4.922655888846848e-05, + "loss": 5.7708, + "step": 13349 + }, + { + "epoch": 0.07939623180131317, + "grad_norm": 1.8009783029556274, + "learning_rate": 4.922644359683494e-05, + "loss": 4.9939, + "step": 13350 + }, + { + "epoch": 0.07940217908459415, + "grad_norm": 1.6748642921447754, + "learning_rate": 4.92263282967442e-05, + "loss": 5.4869, + "step": 13351 + }, + { + "epoch": 0.07940812636787516, + "grad_norm": 1.532475471496582, + "learning_rate": 4.92262129881963e-05, + "loss": 5.755, + "step": 13352 + }, + { + "epoch": 0.07941407365115616, + "grad_norm": 1.513795018196106, + "learning_rate": 4.9226097671191284e-05, + "loss": 5.4083, + "step": 13353 + }, + { + "epoch": 0.07942002093443715, + "grad_norm": 1.66012442111969, + "learning_rate": 4.922598234572918e-05, + "loss": 5.5185, + "step": 13354 + }, + { + "epoch": 0.07942596821771815, + "grad_norm": 1.6519379615783691, + "learning_rate": 4.922586701181005e-05, + "loss": 5.3482, + "step": 13355 + }, + { + "epoch": 0.07943191550099914, + "grad_norm": 1.4444184303283691, + "learning_rate": 4.922575166943391e-05, + "loss": 5.4466, + "step": 13356 + }, + { + "epoch": 0.07943786278428014, + "grad_norm": 1.4603393077850342, + "learning_rate": 4.92256363186008e-05, + "loss": 5.4343, + "step": 13357 + }, + { + "epoch": 0.07944381006756114, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.922552095931078e-05, + "loss": 5.4224, + "step": 13358 + }, + { + "epoch": 0.07944975735084213, + "grad_norm": 1.3054184913635254, + "learning_rate": 4.922540559156389e-05, + "loss": 5.4801, + "step": 13359 + }, + { + "epoch": 0.07945570463412313, + "grad_norm": 1.6295130252838135, + "learning_rate": 4.922529021536015e-05, + "loss": 5.4593, + "step": 13360 + }, + { + "epoch": 0.07946165191740413, + "grad_norm": 1.6684668064117432, + "learning_rate": 4.922517483069962e-05, + "loss": 5.2817, + "step": 13361 + }, + { + "epoch": 0.07946759920068512, + "grad_norm": 1.580409049987793, + "learning_rate": 4.922505943758232e-05, + "loss": 5.4399, + "step": 13362 + }, + { + "epoch": 0.07947354648396612, + "grad_norm": 1.613756775856018, + "learning_rate": 4.922494403600831e-05, + "loss": 5.2646, + "step": 13363 + }, + { + "epoch": 0.07947949376724713, + "grad_norm": 1.4371063709259033, + "learning_rate": 4.9224828625977616e-05, + "loss": 5.2866, + "step": 13364 + }, + { + "epoch": 0.07948544105052811, + "grad_norm": 1.5926525592803955, + "learning_rate": 4.9224713207490294e-05, + "loss": 5.5958, + "step": 13365 + }, + { + "epoch": 0.07949138833380912, + "grad_norm": 1.5216618776321411, + "learning_rate": 4.9224597780546365e-05, + "loss": 5.6094, + "step": 13366 + }, + { + "epoch": 0.07949733561709012, + "grad_norm": 1.7261598110198975, + "learning_rate": 4.922448234514588e-05, + "loss": 5.2781, + "step": 13367 + }, + { + "epoch": 0.0795032829003711, + "grad_norm": 1.6909232139587402, + "learning_rate": 4.922436690128889e-05, + "loss": 5.3299, + "step": 13368 + }, + { + "epoch": 0.07950923018365211, + "grad_norm": 1.6486754417419434, + "learning_rate": 4.922425144897541e-05, + "loss": 5.2478, + "step": 13369 + }, + { + "epoch": 0.07951517746693311, + "grad_norm": 1.4019837379455566, + "learning_rate": 4.922413598820551e-05, + "loss": 5.2383, + "step": 13370 + }, + { + "epoch": 0.0795211247502141, + "grad_norm": 1.7588412761688232, + "learning_rate": 4.92240205189792e-05, + "loss": 5.3224, + "step": 13371 + }, + { + "epoch": 0.0795270720334951, + "grad_norm": 1.5354480743408203, + "learning_rate": 4.922390504129654e-05, + "loss": 5.1617, + "step": 13372 + }, + { + "epoch": 0.0795330193167761, + "grad_norm": 1.5183011293411255, + "learning_rate": 4.922378955515756e-05, + "loss": 5.3082, + "step": 13373 + }, + { + "epoch": 0.07953896660005709, + "grad_norm": 1.436281681060791, + "learning_rate": 4.922367406056232e-05, + "loss": 5.4446, + "step": 13374 + }, + { + "epoch": 0.07954491388333809, + "grad_norm": 1.526934266090393, + "learning_rate": 4.922355855751083e-05, + "loss": 5.3067, + "step": 13375 + }, + { + "epoch": 0.0795508611666191, + "grad_norm": 1.516784906387329, + "learning_rate": 4.922344304600315e-05, + "loss": 5.4982, + "step": 13376 + }, + { + "epoch": 0.07955680844990008, + "grad_norm": 1.5154777765274048, + "learning_rate": 4.922332752603932e-05, + "loss": 5.3459, + "step": 13377 + }, + { + "epoch": 0.07956275573318108, + "grad_norm": 1.542508840560913, + "learning_rate": 4.9223211997619376e-05, + "loss": 5.3677, + "step": 13378 + }, + { + "epoch": 0.07956870301646209, + "grad_norm": 1.3413010835647583, + "learning_rate": 4.922309646074336e-05, + "loss": 5.2684, + "step": 13379 + }, + { + "epoch": 0.07957465029974307, + "grad_norm": 1.6295002698898315, + "learning_rate": 4.9222980915411306e-05, + "loss": 5.2737, + "step": 13380 + }, + { + "epoch": 0.07958059758302408, + "grad_norm": 1.5810730457305908, + "learning_rate": 4.922286536162326e-05, + "loss": 5.2471, + "step": 13381 + }, + { + "epoch": 0.07958654486630508, + "grad_norm": 1.3186451196670532, + "learning_rate": 4.9222749799379266e-05, + "loss": 5.3081, + "step": 13382 + }, + { + "epoch": 0.07959249214958607, + "grad_norm": 1.3897243738174438, + "learning_rate": 4.922263422867936e-05, + "loss": 5.2658, + "step": 13383 + }, + { + "epoch": 0.07959843943286707, + "grad_norm": 1.3873858451843262, + "learning_rate": 4.922251864952358e-05, + "loss": 5.334, + "step": 13384 + }, + { + "epoch": 0.07960438671614806, + "grad_norm": 1.4205409288406372, + "learning_rate": 4.922240306191197e-05, + "loss": 5.3007, + "step": 13385 + }, + { + "epoch": 0.07961033399942906, + "grad_norm": 1.3726485967636108, + "learning_rate": 4.922228746584457e-05, + "loss": 5.1949, + "step": 13386 + }, + { + "epoch": 0.07961628128271006, + "grad_norm": 1.708837628364563, + "learning_rate": 4.922217186132142e-05, + "loss": 5.2061, + "step": 13387 + }, + { + "epoch": 0.07962222856599105, + "grad_norm": 1.7818368673324585, + "learning_rate": 4.9222056248342556e-05, + "loss": 5.1182, + "step": 13388 + }, + { + "epoch": 0.07962817584927205, + "grad_norm": 1.4941715002059937, + "learning_rate": 4.9221940626908024e-05, + "loss": 5.0899, + "step": 13389 + }, + { + "epoch": 0.07963412313255305, + "grad_norm": 1.3581326007843018, + "learning_rate": 4.922182499701787e-05, + "loss": 5.0551, + "step": 13390 + }, + { + "epoch": 0.07964007041583404, + "grad_norm": 1.5772393941879272, + "learning_rate": 4.922170935867212e-05, + "loss": 5.245, + "step": 13391 + }, + { + "epoch": 0.07964601769911504, + "grad_norm": 1.9635555744171143, + "learning_rate": 4.922159371187082e-05, + "loss": 5.2898, + "step": 13392 + }, + { + "epoch": 0.07965196498239605, + "grad_norm": 1.535050392150879, + "learning_rate": 4.922147805661402e-05, + "loss": 5.2043, + "step": 13393 + }, + { + "epoch": 0.07965791226567703, + "grad_norm": 1.4985787868499756, + "learning_rate": 4.922136239290175e-05, + "loss": 5.1682, + "step": 13394 + }, + { + "epoch": 0.07966385954895804, + "grad_norm": 1.5314218997955322, + "learning_rate": 4.922124672073405e-05, + "loss": 5.321, + "step": 13395 + }, + { + "epoch": 0.07966980683223904, + "grad_norm": 1.440621018409729, + "learning_rate": 4.9221131040110954e-05, + "loss": 5.3013, + "step": 13396 + }, + { + "epoch": 0.07967575411552003, + "grad_norm": 1.5103110074996948, + "learning_rate": 4.9221015351032527e-05, + "loss": 5.2825, + "step": 13397 + }, + { + "epoch": 0.07968170139880103, + "grad_norm": 1.3581254482269287, + "learning_rate": 4.9220899653498786e-05, + "loss": 5.2433, + "step": 13398 + }, + { + "epoch": 0.07968764868208203, + "grad_norm": 1.5673763751983643, + "learning_rate": 4.922078394750978e-05, + "loss": 5.2279, + "step": 13399 + }, + { + "epoch": 0.07969359596536302, + "grad_norm": 1.5550049543380737, + "learning_rate": 4.922066823306555e-05, + "loss": 5.0406, + "step": 13400 + }, + { + "epoch": 0.07969954324864402, + "grad_norm": 1.6366932392120361, + "learning_rate": 4.922055251016613e-05, + "loss": 5.1299, + "step": 13401 + }, + { + "epoch": 0.07970549053192502, + "grad_norm": 1.45979642868042, + "learning_rate": 4.922043677881157e-05, + "loss": 4.9527, + "step": 13402 + }, + { + "epoch": 0.07971143781520601, + "grad_norm": 1.594494104385376, + "learning_rate": 4.922032103900191e-05, + "loss": 5.6511, + "step": 13403 + }, + { + "epoch": 0.07971738509848701, + "grad_norm": 1.419045329093933, + "learning_rate": 4.9220205290737175e-05, + "loss": 5.0936, + "step": 13404 + }, + { + "epoch": 0.07972333238176801, + "grad_norm": 1.5998183488845825, + "learning_rate": 4.922008953401742e-05, + "loss": 5.2774, + "step": 13405 + }, + { + "epoch": 0.079729279665049, + "grad_norm": 1.3942409753799438, + "learning_rate": 4.9219973768842685e-05, + "loss": 5.5466, + "step": 13406 + }, + { + "epoch": 0.07973522694833, + "grad_norm": 1.4478344917297363, + "learning_rate": 4.9219857995213015e-05, + "loss": 5.5757, + "step": 13407 + }, + { + "epoch": 0.079741174231611, + "grad_norm": 1.4197556972503662, + "learning_rate": 4.921974221312843e-05, + "loss": 5.3194, + "step": 13408 + }, + { + "epoch": 0.079747121514892, + "grad_norm": 1.7690924406051636, + "learning_rate": 4.9219626422588996e-05, + "loss": 5.3551, + "step": 13409 + }, + { + "epoch": 0.079753068798173, + "grad_norm": 1.8233799934387207, + "learning_rate": 4.921951062359473e-05, + "loss": 5.3143, + "step": 13410 + }, + { + "epoch": 0.079759016081454, + "grad_norm": 1.738848090171814, + "learning_rate": 4.921939481614568e-05, + "loss": 5.0194, + "step": 13411 + }, + { + "epoch": 0.07976496336473499, + "grad_norm": 1.6401729583740234, + "learning_rate": 4.92192790002419e-05, + "loss": 5.3347, + "step": 13412 + }, + { + "epoch": 0.07977091064801599, + "grad_norm": 1.425485372543335, + "learning_rate": 4.921916317588341e-05, + "loss": 5.0384, + "step": 13413 + }, + { + "epoch": 0.07977685793129698, + "grad_norm": 1.6337133646011353, + "learning_rate": 4.921904734307027e-05, + "loss": 5.3213, + "step": 13414 + }, + { + "epoch": 0.07978280521457798, + "grad_norm": 1.561292052268982, + "learning_rate": 4.92189315018025e-05, + "loss": 5.1502, + "step": 13415 + }, + { + "epoch": 0.07978875249785898, + "grad_norm": 1.6225664615631104, + "learning_rate": 4.921881565208016e-05, + "loss": 5.2638, + "step": 13416 + }, + { + "epoch": 0.07979469978113997, + "grad_norm": 1.5074353218078613, + "learning_rate": 4.921869979390328e-05, + "loss": 5.0872, + "step": 13417 + }, + { + "epoch": 0.07980064706442097, + "grad_norm": 1.4769634008407593, + "learning_rate": 4.92185839272719e-05, + "loss": 5.1341, + "step": 13418 + }, + { + "epoch": 0.07980659434770197, + "grad_norm": 1.5929937362670898, + "learning_rate": 4.921846805218607e-05, + "loss": 5.2799, + "step": 13419 + }, + { + "epoch": 0.07981254163098296, + "grad_norm": 1.4583854675292969, + "learning_rate": 4.921835216864581e-05, + "loss": 5.0822, + "step": 13420 + }, + { + "epoch": 0.07981848891426396, + "grad_norm": 1.4904375076293945, + "learning_rate": 4.921823627665119e-05, + "loss": 5.055, + "step": 13421 + }, + { + "epoch": 0.07982443619754497, + "grad_norm": 1.6971831321716309, + "learning_rate": 4.921812037620221e-05, + "loss": 5.1968, + "step": 13422 + }, + { + "epoch": 0.07983038348082595, + "grad_norm": 1.5604689121246338, + "learning_rate": 4.9218004467298956e-05, + "loss": 4.9681, + "step": 13423 + }, + { + "epoch": 0.07983633076410696, + "grad_norm": 1.678427815437317, + "learning_rate": 4.9217888549941436e-05, + "loss": 5.2044, + "step": 13424 + }, + { + "epoch": 0.07984227804738796, + "grad_norm": 1.521996259689331, + "learning_rate": 4.921777262412971e-05, + "loss": 4.9741, + "step": 13425 + }, + { + "epoch": 0.07984822533066895, + "grad_norm": 1.5315868854522705, + "learning_rate": 4.92176566898638e-05, + "loss": 5.0064, + "step": 13426 + }, + { + "epoch": 0.07985417261394995, + "grad_norm": 1.465867280960083, + "learning_rate": 4.9217540747143765e-05, + "loss": 4.942, + "step": 13427 + }, + { + "epoch": 0.07986011989723095, + "grad_norm": 1.4323827028274536, + "learning_rate": 4.9217424795969634e-05, + "loss": 4.8934, + "step": 13428 + }, + { + "epoch": 0.07986606718051194, + "grad_norm": 1.4645717144012451, + "learning_rate": 4.921730883634145e-05, + "loss": 5.0473, + "step": 13429 + }, + { + "epoch": 0.07987201446379294, + "grad_norm": 1.5992658138275146, + "learning_rate": 4.9217192868259246e-05, + "loss": 4.8968, + "step": 13430 + }, + { + "epoch": 0.07987796174707394, + "grad_norm": 1.4294894933700562, + "learning_rate": 4.921707689172308e-05, + "loss": 5.0719, + "step": 13431 + }, + { + "epoch": 0.07988390903035493, + "grad_norm": 1.5885019302368164, + "learning_rate": 4.921696090673298e-05, + "loss": 5.1505, + "step": 13432 + }, + { + "epoch": 0.07988985631363593, + "grad_norm": 1.4929580688476562, + "learning_rate": 4.921684491328898e-05, + "loss": 5.016, + "step": 13433 + }, + { + "epoch": 0.07989580359691693, + "grad_norm": 1.4980381727218628, + "learning_rate": 4.921672891139114e-05, + "loss": 5.0601, + "step": 13434 + }, + { + "epoch": 0.07990175088019792, + "grad_norm": 1.5698089599609375, + "learning_rate": 4.9216612901039495e-05, + "loss": 5.0251, + "step": 13435 + }, + { + "epoch": 0.07990769816347892, + "grad_norm": 1.459037184715271, + "learning_rate": 4.921649688223407e-05, + "loss": 4.8417, + "step": 13436 + }, + { + "epoch": 0.07991364544675993, + "grad_norm": 1.5418161153793335, + "learning_rate": 4.921638085497492e-05, + "loss": 5.1989, + "step": 13437 + }, + { + "epoch": 0.07991959273004091, + "grad_norm": 1.546325922012329, + "learning_rate": 4.9216264819262084e-05, + "loss": 5.3004, + "step": 13438 + }, + { + "epoch": 0.07992554001332192, + "grad_norm": 1.5820508003234863, + "learning_rate": 4.9216148775095594e-05, + "loss": 5.3327, + "step": 13439 + }, + { + "epoch": 0.07993148729660292, + "grad_norm": 1.5077866315841675, + "learning_rate": 4.9216032722475504e-05, + "loss": 5.2423, + "step": 13440 + }, + { + "epoch": 0.0799374345798839, + "grad_norm": 1.3654597997665405, + "learning_rate": 4.921591666140184e-05, + "loss": 5.1563, + "step": 13441 + }, + { + "epoch": 0.07994338186316491, + "grad_norm": 1.6721473932266235, + "learning_rate": 4.921580059187466e-05, + "loss": 5.1848, + "step": 13442 + }, + { + "epoch": 0.0799493291464459, + "grad_norm": 1.5349076986312866, + "learning_rate": 4.921568451389398e-05, + "loss": 5.1836, + "step": 13443 + }, + { + "epoch": 0.0799552764297269, + "grad_norm": 1.6246919631958008, + "learning_rate": 4.921556842745987e-05, + "loss": 4.8715, + "step": 13444 + }, + { + "epoch": 0.0799612237130079, + "grad_norm": 1.5361920595169067, + "learning_rate": 4.921545233257234e-05, + "loss": 4.8203, + "step": 13445 + }, + { + "epoch": 0.07996717099628889, + "grad_norm": 1.6185765266418457, + "learning_rate": 4.921533622923146e-05, + "loss": 4.8039, + "step": 13446 + }, + { + "epoch": 0.07997311827956989, + "grad_norm": 1.402462363243103, + "learning_rate": 4.9215220117437246e-05, + "loss": 4.8524, + "step": 13447 + }, + { + "epoch": 0.07997906556285089, + "grad_norm": 1.5282337665557861, + "learning_rate": 4.921510399718975e-05, + "loss": 4.8081, + "step": 13448 + }, + { + "epoch": 0.07998501284613188, + "grad_norm": 1.336254596710205, + "learning_rate": 4.921498786848902e-05, + "loss": 4.8468, + "step": 13449 + }, + { + "epoch": 0.07999096012941288, + "grad_norm": 1.4701998233795166, + "learning_rate": 4.921487173133508e-05, + "loss": 4.6873, + "step": 13450 + }, + { + "epoch": 0.07999690741269389, + "grad_norm": 1.6340824365615845, + "learning_rate": 4.921475558572798e-05, + "loss": 4.6779, + "step": 13451 + }, + { + "epoch": 0.08000285469597487, + "grad_norm": 1.557027816772461, + "learning_rate": 4.921463943166775e-05, + "loss": 4.6467, + "step": 13452 + }, + { + "epoch": 0.08000880197925588, + "grad_norm": 1.6390316486358643, + "learning_rate": 4.9214523269154454e-05, + "loss": 4.7376, + "step": 13453 + }, + { + "epoch": 0.08001474926253688, + "grad_norm": 2.3929800987243652, + "learning_rate": 4.921440709818811e-05, + "loss": 5.2623, + "step": 13454 + }, + { + "epoch": 0.08002069654581787, + "grad_norm": 1.5896660089492798, + "learning_rate": 4.921429091876877e-05, + "loss": 4.6952, + "step": 13455 + }, + { + "epoch": 0.08002664382909887, + "grad_norm": 1.6705348491668701, + "learning_rate": 4.921417473089647e-05, + "loss": 4.7963, + "step": 13456 + }, + { + "epoch": 0.08003259111237987, + "grad_norm": 1.5925310850143433, + "learning_rate": 4.9214058534571253e-05, + "loss": 4.7398, + "step": 13457 + }, + { + "epoch": 0.08003853839566086, + "grad_norm": 1.5314396619796753, + "learning_rate": 4.921394232979316e-05, + "loss": 4.7578, + "step": 13458 + }, + { + "epoch": 0.08004448567894186, + "grad_norm": 1.6665661334991455, + "learning_rate": 4.921382611656222e-05, + "loss": 4.7767, + "step": 13459 + }, + { + "epoch": 0.08005043296222286, + "grad_norm": 1.5145021677017212, + "learning_rate": 4.9213709894878495e-05, + "loss": 4.7892, + "step": 13460 + }, + { + "epoch": 0.08005638024550385, + "grad_norm": 1.8332866430282593, + "learning_rate": 4.921359366474201e-05, + "loss": 4.6434, + "step": 13461 + }, + { + "epoch": 0.08006232752878485, + "grad_norm": 1.467970371246338, + "learning_rate": 4.921347742615281e-05, + "loss": 4.6611, + "step": 13462 + }, + { + "epoch": 0.08006827481206585, + "grad_norm": 1.5667515993118286, + "learning_rate": 4.9213361179110936e-05, + "loss": 4.5792, + "step": 13463 + }, + { + "epoch": 0.08007422209534684, + "grad_norm": 1.5370365381240845, + "learning_rate": 4.9213244923616434e-05, + "loss": 4.6724, + "step": 13464 + }, + { + "epoch": 0.08008016937862784, + "grad_norm": 1.7298029661178589, + "learning_rate": 4.921312865966933e-05, + "loss": 4.7808, + "step": 13465 + }, + { + "epoch": 0.08008611666190885, + "grad_norm": 1.5497710704803467, + "learning_rate": 4.921301238726966e-05, + "loss": 4.8228, + "step": 13466 + }, + { + "epoch": 0.08009206394518983, + "grad_norm": 1.4589923620224, + "learning_rate": 4.92128961064175e-05, + "loss": 4.757, + "step": 13467 + }, + { + "epoch": 0.08009801122847084, + "grad_norm": 1.6503071784973145, + "learning_rate": 4.921277981711286e-05, + "loss": 4.6074, + "step": 13468 + }, + { + "epoch": 0.08010395851175184, + "grad_norm": 1.621209979057312, + "learning_rate": 4.921266351935578e-05, + "loss": 4.6338, + "step": 13469 + }, + { + "epoch": 0.08010990579503283, + "grad_norm": 1.6513469219207764, + "learning_rate": 4.921254721314632e-05, + "loss": 4.7399, + "step": 13470 + }, + { + "epoch": 0.08011585307831383, + "grad_norm": 1.5691003799438477, + "learning_rate": 4.9212430898484505e-05, + "loss": 4.8002, + "step": 13471 + }, + { + "epoch": 0.08012180036159482, + "grad_norm": 1.6764090061187744, + "learning_rate": 4.921231457537039e-05, + "loss": 4.7913, + "step": 13472 + }, + { + "epoch": 0.08012774764487582, + "grad_norm": 1.5193006992340088, + "learning_rate": 4.9212198243804e-05, + "loss": 4.8346, + "step": 13473 + }, + { + "epoch": 0.08013369492815682, + "grad_norm": 1.722706913948059, + "learning_rate": 4.921208190378538e-05, + "loss": 4.6969, + "step": 13474 + }, + { + "epoch": 0.08013964221143781, + "grad_norm": 1.6551017761230469, + "learning_rate": 4.921196555531457e-05, + "loss": 4.6504, + "step": 13475 + }, + { + "epoch": 0.08014558949471881, + "grad_norm": 1.462902307510376, + "learning_rate": 4.921184919839162e-05, + "loss": 4.7678, + "step": 13476 + }, + { + "epoch": 0.08015153677799981, + "grad_norm": 1.4332460165023804, + "learning_rate": 4.9211732833016554e-05, + "loss": 4.7563, + "step": 13477 + }, + { + "epoch": 0.0801574840612808, + "grad_norm": 1.466042160987854, + "learning_rate": 4.9211616459189434e-05, + "loss": 4.7071, + "step": 13478 + }, + { + "epoch": 0.0801634313445618, + "grad_norm": 1.5814018249511719, + "learning_rate": 4.9211500076910275e-05, + "loss": 4.7497, + "step": 13479 + }, + { + "epoch": 0.0801693786278428, + "grad_norm": 1.5666007995605469, + "learning_rate": 4.921138368617915e-05, + "loss": 4.7757, + "step": 13480 + }, + { + "epoch": 0.0801753259111238, + "grad_norm": 1.6804678440093994, + "learning_rate": 4.9211267286996064e-05, + "loss": 4.6921, + "step": 13481 + }, + { + "epoch": 0.0801812731944048, + "grad_norm": 1.6126580238342285, + "learning_rate": 4.921115087936108e-05, + "loss": 4.746, + "step": 13482 + }, + { + "epoch": 0.0801872204776858, + "grad_norm": 1.5597195625305176, + "learning_rate": 4.9211034463274235e-05, + "loss": 4.8135, + "step": 13483 + }, + { + "epoch": 0.08019316776096679, + "grad_norm": 1.4779510498046875, + "learning_rate": 4.9210918038735565e-05, + "loss": 4.9011, + "step": 13484 + }, + { + "epoch": 0.08019911504424779, + "grad_norm": 1.449723243713379, + "learning_rate": 4.921080160574512e-05, + "loss": 4.648, + "step": 13485 + }, + { + "epoch": 0.08020506232752879, + "grad_norm": 1.609134554862976, + "learning_rate": 4.921068516430293e-05, + "loss": 4.6809, + "step": 13486 + }, + { + "epoch": 0.08021100961080978, + "grad_norm": 1.5483453273773193, + "learning_rate": 4.921056871440905e-05, + "loss": 4.7247, + "step": 13487 + }, + { + "epoch": 0.08021695689409078, + "grad_norm": 1.5850282907485962, + "learning_rate": 4.921045225606349e-05, + "loss": 4.6378, + "step": 13488 + }, + { + "epoch": 0.08022290417737178, + "grad_norm": 1.746030569076538, + "learning_rate": 4.9210335789266325e-05, + "loss": 4.6986, + "step": 13489 + }, + { + "epoch": 0.08022885146065277, + "grad_norm": 1.5930465459823608, + "learning_rate": 4.921021931401758e-05, + "loss": 4.6339, + "step": 13490 + }, + { + "epoch": 0.08023479874393377, + "grad_norm": 1.5435012578964233, + "learning_rate": 4.92101028303173e-05, + "loss": 4.5761, + "step": 13491 + }, + { + "epoch": 0.08024074602721477, + "grad_norm": 1.8166500329971313, + "learning_rate": 4.920998633816552e-05, + "loss": 4.5668, + "step": 13492 + }, + { + "epoch": 0.08024669331049576, + "grad_norm": 1.659976601600647, + "learning_rate": 4.920986983756228e-05, + "loss": 4.7431, + "step": 13493 + }, + { + "epoch": 0.08025264059377676, + "grad_norm": 1.6075677871704102, + "learning_rate": 4.920975332850762e-05, + "loss": 4.7744, + "step": 13494 + }, + { + "epoch": 0.08025858787705777, + "grad_norm": 1.6895835399627686, + "learning_rate": 4.9209636811001605e-05, + "loss": 4.638, + "step": 13495 + }, + { + "epoch": 0.08026453516033875, + "grad_norm": 1.4848902225494385, + "learning_rate": 4.9209520285044244e-05, + "loss": 4.7314, + "step": 13496 + }, + { + "epoch": 0.08027048244361976, + "grad_norm": 1.6041605472564697, + "learning_rate": 4.920940375063559e-05, + "loss": 4.7329, + "step": 13497 + }, + { + "epoch": 0.08027642972690076, + "grad_norm": 1.5055692195892334, + "learning_rate": 4.920928720777568e-05, + "loss": 4.721, + "step": 13498 + }, + { + "epoch": 0.08028237701018175, + "grad_norm": 1.3238314390182495, + "learning_rate": 4.920917065646456e-05, + "loss": 5.3071, + "step": 13499 + }, + { + "epoch": 0.08028832429346275, + "grad_norm": 1.463626742362976, + "learning_rate": 4.9209054096702266e-05, + "loss": 5.1885, + "step": 13500 + }, + { + "epoch": 0.08029427157674375, + "grad_norm": 1.4844539165496826, + "learning_rate": 4.9208937528488844e-05, + "loss": 5.2873, + "step": 13501 + }, + { + "epoch": 0.08030021886002474, + "grad_norm": 1.5207467079162598, + "learning_rate": 4.920882095182434e-05, + "loss": 5.1049, + "step": 13502 + }, + { + "epoch": 0.08030616614330574, + "grad_norm": 1.3113683462142944, + "learning_rate": 4.920870436670878e-05, + "loss": 5.1821, + "step": 13503 + }, + { + "epoch": 0.08031211342658673, + "grad_norm": 1.3822054862976074, + "learning_rate": 4.920858777314221e-05, + "loss": 5.1467, + "step": 13504 + }, + { + "epoch": 0.08031806070986773, + "grad_norm": 1.7611572742462158, + "learning_rate": 4.920847117112467e-05, + "loss": 5.0616, + "step": 13505 + }, + { + "epoch": 0.08032400799314873, + "grad_norm": 1.632802963256836, + "learning_rate": 4.920835456065621e-05, + "loss": 5.1535, + "step": 13506 + }, + { + "epoch": 0.08032995527642972, + "grad_norm": 1.6254185438156128, + "learning_rate": 4.920823794173686e-05, + "loss": 5.211, + "step": 13507 + }, + { + "epoch": 0.08033590255971072, + "grad_norm": 1.4769513607025146, + "learning_rate": 4.920812131436666e-05, + "loss": 5.0879, + "step": 13508 + }, + { + "epoch": 0.08034184984299172, + "grad_norm": 1.531504511833191, + "learning_rate": 4.920800467854566e-05, + "loss": 4.9068, + "step": 13509 + }, + { + "epoch": 0.08034779712627271, + "grad_norm": 1.6325825452804565, + "learning_rate": 4.9207888034273895e-05, + "loss": 5.0463, + "step": 13510 + }, + { + "epoch": 0.08035374440955372, + "grad_norm": 1.3797351121902466, + "learning_rate": 4.9207771381551406e-05, + "loss": 5.0644, + "step": 13511 + }, + { + "epoch": 0.08035969169283472, + "grad_norm": 1.7325141429901123, + "learning_rate": 4.920765472037823e-05, + "loss": 4.9095, + "step": 13512 + }, + { + "epoch": 0.0803656389761157, + "grad_norm": 1.3197063207626343, + "learning_rate": 4.920753805075442e-05, + "loss": 5.1837, + "step": 13513 + }, + { + "epoch": 0.08037158625939671, + "grad_norm": 1.532212734222412, + "learning_rate": 4.9207421372680006e-05, + "loss": 5.1011, + "step": 13514 + }, + { + "epoch": 0.08037753354267771, + "grad_norm": 1.2958672046661377, + "learning_rate": 4.9207304686155034e-05, + "loss": 5.1349, + "step": 13515 + }, + { + "epoch": 0.0803834808259587, + "grad_norm": 2.914010524749756, + "learning_rate": 4.9207187991179533e-05, + "loss": 5.4637, + "step": 13516 + }, + { + "epoch": 0.0803894281092397, + "grad_norm": 1.490577220916748, + "learning_rate": 4.920707128775356e-05, + "loss": 5.2322, + "step": 13517 + }, + { + "epoch": 0.0803953753925207, + "grad_norm": 1.5756994485855103, + "learning_rate": 4.920695457587714e-05, + "loss": 5.1501, + "step": 13518 + }, + { + "epoch": 0.08040132267580169, + "grad_norm": 1.7483723163604736, + "learning_rate": 4.920683785555033e-05, + "loss": 5.131, + "step": 13519 + }, + { + "epoch": 0.08040726995908269, + "grad_norm": 1.426866054534912, + "learning_rate": 4.920672112677316e-05, + "loss": 5.5304, + "step": 13520 + }, + { + "epoch": 0.0804132172423637, + "grad_norm": 1.3744142055511475, + "learning_rate": 4.920660438954568e-05, + "loss": 5.1042, + "step": 13521 + }, + { + "epoch": 0.08041916452564468, + "grad_norm": 1.5924170017242432, + "learning_rate": 4.9206487643867916e-05, + "loss": 5.261, + "step": 13522 + }, + { + "epoch": 0.08042511180892568, + "grad_norm": 1.566296935081482, + "learning_rate": 4.920637088973992e-05, + "loss": 5.0451, + "step": 13523 + }, + { + "epoch": 0.08043105909220669, + "grad_norm": 1.4542006254196167, + "learning_rate": 4.9206254127161734e-05, + "loss": 5.0351, + "step": 13524 + }, + { + "epoch": 0.08043700637548767, + "grad_norm": 1.4084336757659912, + "learning_rate": 4.920613735613339e-05, + "loss": 5.1177, + "step": 13525 + }, + { + "epoch": 0.08044295365876868, + "grad_norm": 1.5498062372207642, + "learning_rate": 4.920602057665493e-05, + "loss": 4.9068, + "step": 13526 + }, + { + "epoch": 0.08044890094204968, + "grad_norm": 1.4482768774032593, + "learning_rate": 4.920590378872641e-05, + "loss": 4.9393, + "step": 13527 + }, + { + "epoch": 0.08045484822533067, + "grad_norm": 1.4438153505325317, + "learning_rate": 4.920578699234785e-05, + "loss": 5.0109, + "step": 13528 + }, + { + "epoch": 0.08046079550861167, + "grad_norm": 1.5769532918930054, + "learning_rate": 4.9205670187519305e-05, + "loss": 4.916, + "step": 13529 + }, + { + "epoch": 0.08046674279189267, + "grad_norm": 1.6127451658248901, + "learning_rate": 4.9205553374240806e-05, + "loss": 5.0038, + "step": 13530 + }, + { + "epoch": 0.08047269007517366, + "grad_norm": 1.5733160972595215, + "learning_rate": 4.92054365525124e-05, + "loss": 5.2705, + "step": 13531 + }, + { + "epoch": 0.08047863735845466, + "grad_norm": 1.956769585609436, + "learning_rate": 4.920531972233413e-05, + "loss": 5.0572, + "step": 13532 + }, + { + "epoch": 0.08048458464173565, + "grad_norm": 1.614670753479004, + "learning_rate": 4.9205202883706025e-05, + "loss": 5.0323, + "step": 13533 + }, + { + "epoch": 0.08049053192501665, + "grad_norm": 1.3706777095794678, + "learning_rate": 4.920508603662814e-05, + "loss": 5.1335, + "step": 13534 + }, + { + "epoch": 0.08049647920829765, + "grad_norm": 1.5787118673324585, + "learning_rate": 4.9204969181100505e-05, + "loss": 4.9626, + "step": 13535 + }, + { + "epoch": 0.08050242649157864, + "grad_norm": 1.6258914470672607, + "learning_rate": 4.9204852317123175e-05, + "loss": 5.1592, + "step": 13536 + }, + { + "epoch": 0.08050837377485964, + "grad_norm": 1.662347435951233, + "learning_rate": 4.920473544469617e-05, + "loss": 5.053, + "step": 13537 + }, + { + "epoch": 0.08051432105814064, + "grad_norm": 1.8060719966888428, + "learning_rate": 4.920461856381955e-05, + "loss": 5.0823, + "step": 13538 + }, + { + "epoch": 0.08052026834142163, + "grad_norm": 1.7381904125213623, + "learning_rate": 4.920450167449334e-05, + "loss": 4.7485, + "step": 13539 + }, + { + "epoch": 0.08052621562470264, + "grad_norm": 1.838526964187622, + "learning_rate": 4.9204384776717594e-05, + "loss": 5.1404, + "step": 13540 + }, + { + "epoch": 0.08053216290798364, + "grad_norm": 1.8131240606307983, + "learning_rate": 4.920426787049234e-05, + "loss": 5.2337, + "step": 13541 + }, + { + "epoch": 0.08053811019126463, + "grad_norm": 1.7523903846740723, + "learning_rate": 4.9204150955817635e-05, + "loss": 5.2375, + "step": 13542 + }, + { + "epoch": 0.08054405747454563, + "grad_norm": 1.5962380170822144, + "learning_rate": 4.9204034032693505e-05, + "loss": 5.1667, + "step": 13543 + }, + { + "epoch": 0.08055000475782663, + "grad_norm": 1.566009283065796, + "learning_rate": 4.920391710112e-05, + "loss": 5.1105, + "step": 13544 + }, + { + "epoch": 0.08055595204110762, + "grad_norm": 1.6253767013549805, + "learning_rate": 4.920380016109716e-05, + "loss": 5.2942, + "step": 13545 + }, + { + "epoch": 0.08056189932438862, + "grad_norm": 1.538004994392395, + "learning_rate": 4.920368321262502e-05, + "loss": 5.1847, + "step": 13546 + }, + { + "epoch": 0.08056784660766962, + "grad_norm": 1.6407667398452759, + "learning_rate": 4.9203566255703625e-05, + "loss": 5.1368, + "step": 13547 + }, + { + "epoch": 0.08057379389095061, + "grad_norm": 1.5777368545532227, + "learning_rate": 4.9203449290333016e-05, + "loss": 5.1507, + "step": 13548 + }, + { + "epoch": 0.08057974117423161, + "grad_norm": 1.5601979494094849, + "learning_rate": 4.920333231651323e-05, + "loss": 5.0926, + "step": 13549 + }, + { + "epoch": 0.08058568845751261, + "grad_norm": 1.4342397451400757, + "learning_rate": 4.9203215334244315e-05, + "loss": 4.9536, + "step": 13550 + }, + { + "epoch": 0.0805916357407936, + "grad_norm": 1.6202988624572754, + "learning_rate": 4.9203098343526305e-05, + "loss": 4.9009, + "step": 13551 + }, + { + "epoch": 0.0805975830240746, + "grad_norm": 1.4504165649414062, + "learning_rate": 4.9202981344359243e-05, + "loss": 5.3843, + "step": 13552 + }, + { + "epoch": 0.0806035303073556, + "grad_norm": 1.6187599897384644, + "learning_rate": 4.920286433674317e-05, + "loss": 5.3396, + "step": 13553 + }, + { + "epoch": 0.0806094775906366, + "grad_norm": 1.6162225008010864, + "learning_rate": 4.920274732067813e-05, + "loss": 5.3163, + "step": 13554 + }, + { + "epoch": 0.0806154248739176, + "grad_norm": 1.6445814371109009, + "learning_rate": 4.920263029616416e-05, + "loss": 5.207, + "step": 13555 + }, + { + "epoch": 0.0806213721571986, + "grad_norm": 1.5133748054504395, + "learning_rate": 4.9202513263201296e-05, + "loss": 5.4284, + "step": 13556 + }, + { + "epoch": 0.08062731944047959, + "grad_norm": 1.5004390478134155, + "learning_rate": 4.920239622178959e-05, + "loss": 5.0013, + "step": 13557 + }, + { + "epoch": 0.08063326672376059, + "grad_norm": 1.6617141962051392, + "learning_rate": 4.920227917192908e-05, + "loss": 5.346, + "step": 13558 + }, + { + "epoch": 0.08063921400704159, + "grad_norm": 1.5505567789077759, + "learning_rate": 4.92021621136198e-05, + "loss": 5.2799, + "step": 13559 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 1.5264419317245483, + "learning_rate": 4.92020450468618e-05, + "loss": 5.1277, + "step": 13560 + }, + { + "epoch": 0.08065110857360358, + "grad_norm": 1.6758075952529907, + "learning_rate": 4.920192797165511e-05, + "loss": 5.2519, + "step": 13561 + }, + { + "epoch": 0.08065705585688457, + "grad_norm": 1.5858482122421265, + "learning_rate": 4.920181088799978e-05, + "loss": 5.3231, + "step": 13562 + }, + { + "epoch": 0.08066300314016557, + "grad_norm": 1.5122928619384766, + "learning_rate": 4.920169379589585e-05, + "loss": 5.1791, + "step": 13563 + }, + { + "epoch": 0.08066895042344657, + "grad_norm": 1.4593915939331055, + "learning_rate": 4.9201576695343354e-05, + "loss": 5.0555, + "step": 13564 + }, + { + "epoch": 0.08067489770672756, + "grad_norm": 1.6524077653884888, + "learning_rate": 4.9201459586342336e-05, + "loss": 5.1981, + "step": 13565 + }, + { + "epoch": 0.08068084499000856, + "grad_norm": 1.5063152313232422, + "learning_rate": 4.920134246889285e-05, + "loss": 5.0406, + "step": 13566 + }, + { + "epoch": 0.08068679227328956, + "grad_norm": 1.3544602394104004, + "learning_rate": 4.9201225342994914e-05, + "loss": 5.0385, + "step": 13567 + }, + { + "epoch": 0.08069273955657055, + "grad_norm": 1.5672118663787842, + "learning_rate": 4.920110820864858e-05, + "loss": 5.2393, + "step": 13568 + }, + { + "epoch": 0.08069868683985155, + "grad_norm": 1.5031840801239014, + "learning_rate": 4.92009910658539e-05, + "loss": 5.1584, + "step": 13569 + }, + { + "epoch": 0.08070463412313256, + "grad_norm": 1.682307243347168, + "learning_rate": 4.920087391461089e-05, + "loss": 4.8473, + "step": 13570 + }, + { + "epoch": 0.08071058140641355, + "grad_norm": 1.5047411918640137, + "learning_rate": 4.9200756754919616e-05, + "loss": 4.8286, + "step": 13571 + }, + { + "epoch": 0.08071652868969455, + "grad_norm": 1.4234607219696045, + "learning_rate": 4.920063958678011e-05, + "loss": 4.8309, + "step": 13572 + }, + { + "epoch": 0.08072247597297555, + "grad_norm": 1.5061196088790894, + "learning_rate": 4.920052241019239e-05, + "loss": 5.0132, + "step": 13573 + }, + { + "epoch": 0.08072842325625654, + "grad_norm": 1.5565897226333618, + "learning_rate": 4.920040522515654e-05, + "loss": 4.9357, + "step": 13574 + }, + { + "epoch": 0.08073437053953754, + "grad_norm": 1.442288875579834, + "learning_rate": 4.920028803167257e-05, + "loss": 4.7943, + "step": 13575 + }, + { + "epoch": 0.08074031782281854, + "grad_norm": 1.6255996227264404, + "learning_rate": 4.9200170829740534e-05, + "loss": 4.824, + "step": 13576 + }, + { + "epoch": 0.08074626510609953, + "grad_norm": 1.7027612924575806, + "learning_rate": 4.920005361936047e-05, + "loss": 5.1223, + "step": 13577 + }, + { + "epoch": 0.08075221238938053, + "grad_norm": 2.5931310653686523, + "learning_rate": 4.919993640053241e-05, + "loss": 5.3487, + "step": 13578 + }, + { + "epoch": 0.08075815967266153, + "grad_norm": 1.5481868982315063, + "learning_rate": 4.91998191732564e-05, + "loss": 5.0844, + "step": 13579 + }, + { + "epoch": 0.08076410695594252, + "grad_norm": 1.3663432598114014, + "learning_rate": 4.919970193753248e-05, + "loss": 5.2151, + "step": 13580 + }, + { + "epoch": 0.08077005423922352, + "grad_norm": 1.4602998495101929, + "learning_rate": 4.919958469336071e-05, + "loss": 5.3133, + "step": 13581 + }, + { + "epoch": 0.08077600152250453, + "grad_norm": 1.6350071430206299, + "learning_rate": 4.919946744074111e-05, + "loss": 5.5026, + "step": 13582 + }, + { + "epoch": 0.08078194880578551, + "grad_norm": 1.4492799043655396, + "learning_rate": 4.919935017967372e-05, + "loss": 5.4211, + "step": 13583 + }, + { + "epoch": 0.08078789608906652, + "grad_norm": 1.398373007774353, + "learning_rate": 4.919923291015859e-05, + "loss": 5.2947, + "step": 13584 + }, + { + "epoch": 0.08079384337234752, + "grad_norm": 1.543583869934082, + "learning_rate": 4.9199115632195755e-05, + "loss": 5.0361, + "step": 13585 + }, + { + "epoch": 0.0807997906556285, + "grad_norm": 1.7753655910491943, + "learning_rate": 4.9198998345785265e-05, + "loss": 5.1897, + "step": 13586 + }, + { + "epoch": 0.08080573793890951, + "grad_norm": 1.668168544769287, + "learning_rate": 4.919888105092715e-05, + "loss": 5.3786, + "step": 13587 + }, + { + "epoch": 0.08081168522219051, + "grad_norm": 1.3956975936889648, + "learning_rate": 4.919876374762145e-05, + "loss": 5.4662, + "step": 13588 + }, + { + "epoch": 0.0808176325054715, + "grad_norm": 1.3362425565719604, + "learning_rate": 4.9198646435868226e-05, + "loss": 5.4723, + "step": 13589 + }, + { + "epoch": 0.0808235797887525, + "grad_norm": 1.3419675827026367, + "learning_rate": 4.919852911566749e-05, + "loss": 5.3888, + "step": 13590 + }, + { + "epoch": 0.08082952707203349, + "grad_norm": 1.5144484043121338, + "learning_rate": 4.9198411787019304e-05, + "loss": 5.292, + "step": 13591 + }, + { + "epoch": 0.08083547435531449, + "grad_norm": 1.4561097621917725, + "learning_rate": 4.91982944499237e-05, + "loss": 5.3688, + "step": 13592 + }, + { + "epoch": 0.08084142163859549, + "grad_norm": 1.4536436796188354, + "learning_rate": 4.919817710438073e-05, + "loss": 5.3606, + "step": 13593 + }, + { + "epoch": 0.08084736892187648, + "grad_norm": 1.3266935348510742, + "learning_rate": 4.919805975039041e-05, + "loss": 5.3999, + "step": 13594 + }, + { + "epoch": 0.08085331620515748, + "grad_norm": 1.4032717943191528, + "learning_rate": 4.919794238795281e-05, + "loss": 5.3494, + "step": 13595 + }, + { + "epoch": 0.08085926348843848, + "grad_norm": 1.6235400438308716, + "learning_rate": 4.919782501706796e-05, + "loss": 5.1499, + "step": 13596 + }, + { + "epoch": 0.08086521077171947, + "grad_norm": 1.349752426147461, + "learning_rate": 4.919770763773589e-05, + "loss": 5.3599, + "step": 13597 + }, + { + "epoch": 0.08087115805500047, + "grad_norm": 1.9415758848190308, + "learning_rate": 4.919759024995666e-05, + "loss": 5.3427, + "step": 13598 + }, + { + "epoch": 0.08087710533828148, + "grad_norm": 1.688825249671936, + "learning_rate": 4.9197472853730296e-05, + "loss": 5.2918, + "step": 13599 + }, + { + "epoch": 0.08088305262156247, + "grad_norm": 1.55258309841156, + "learning_rate": 4.919735544905685e-05, + "loss": 5.3016, + "step": 13600 + }, + { + "epoch": 0.08088899990484347, + "grad_norm": 1.3860005140304565, + "learning_rate": 4.919723803593634e-05, + "loss": 5.3049, + "step": 13601 + }, + { + "epoch": 0.08089494718812447, + "grad_norm": 1.289819359779358, + "learning_rate": 4.919712061436884e-05, + "loss": 5.1657, + "step": 13602 + }, + { + "epoch": 0.08090089447140546, + "grad_norm": 1.5799275636672974, + "learning_rate": 4.9197003184354375e-05, + "loss": 5.2638, + "step": 13603 + }, + { + "epoch": 0.08090684175468646, + "grad_norm": 1.5292985439300537, + "learning_rate": 4.919688574589299e-05, + "loss": 5.2643, + "step": 13604 + }, + { + "epoch": 0.08091278903796746, + "grad_norm": 1.6338304281234741, + "learning_rate": 4.919676829898471e-05, + "loss": 5.2377, + "step": 13605 + }, + { + "epoch": 0.08091873632124845, + "grad_norm": 1.7117339372634888, + "learning_rate": 4.919665084362959e-05, + "loss": 5.262, + "step": 13606 + }, + { + "epoch": 0.08092468360452945, + "grad_norm": 1.606644868850708, + "learning_rate": 4.919653337982767e-05, + "loss": 5.2308, + "step": 13607 + }, + { + "epoch": 0.08093063088781045, + "grad_norm": 1.5751184225082397, + "learning_rate": 4.9196415907578994e-05, + "loss": 5.1455, + "step": 13608 + }, + { + "epoch": 0.08093657817109144, + "grad_norm": 1.7105200290679932, + "learning_rate": 4.9196298426883595e-05, + "loss": 5.2608, + "step": 13609 + }, + { + "epoch": 0.08094252545437244, + "grad_norm": 1.4504178762435913, + "learning_rate": 4.919618093774152e-05, + "loss": 5.3592, + "step": 13610 + }, + { + "epoch": 0.08094847273765345, + "grad_norm": 1.2036757469177246, + "learning_rate": 4.9196063440152804e-05, + "loss": 5.3256, + "step": 13611 + }, + { + "epoch": 0.08095442002093443, + "grad_norm": 1.4795072078704834, + "learning_rate": 4.9195945934117507e-05, + "loss": 5.2968, + "step": 13612 + }, + { + "epoch": 0.08096036730421544, + "grad_norm": 1.2796508073806763, + "learning_rate": 4.9195828419635644e-05, + "loss": 5.1288, + "step": 13613 + }, + { + "epoch": 0.08096631458749644, + "grad_norm": 1.4119127988815308, + "learning_rate": 4.9195710896707264e-05, + "loss": 5.3238, + "step": 13614 + }, + { + "epoch": 0.08097226187077743, + "grad_norm": 1.618862509727478, + "learning_rate": 4.919559336533241e-05, + "loss": 5.301, + "step": 13615 + }, + { + "epoch": 0.08097820915405843, + "grad_norm": 1.5049046277999878, + "learning_rate": 4.919547582551114e-05, + "loss": 5.3395, + "step": 13616 + }, + { + "epoch": 0.08098415643733943, + "grad_norm": 1.3821018934249878, + "learning_rate": 4.9195358277243464e-05, + "loss": 5.4033, + "step": 13617 + }, + { + "epoch": 0.08099010372062042, + "grad_norm": 1.4585113525390625, + "learning_rate": 4.9195240720529446e-05, + "loss": 5.3098, + "step": 13618 + }, + { + "epoch": 0.08099605100390142, + "grad_norm": 1.5766072273254395, + "learning_rate": 4.9195123155369114e-05, + "loss": 5.2672, + "step": 13619 + }, + { + "epoch": 0.08100199828718241, + "grad_norm": 1.5132715702056885, + "learning_rate": 4.919500558176252e-05, + "loss": 5.1707, + "step": 13620 + }, + { + "epoch": 0.08100794557046341, + "grad_norm": 1.594093918800354, + "learning_rate": 4.91948879997097e-05, + "loss": 5.2988, + "step": 13621 + }, + { + "epoch": 0.08101389285374441, + "grad_norm": 1.529877781867981, + "learning_rate": 4.919477040921069e-05, + "loss": 5.4418, + "step": 13622 + }, + { + "epoch": 0.0810198401370254, + "grad_norm": 1.4329211711883545, + "learning_rate": 4.919465281026554e-05, + "loss": 5.308, + "step": 13623 + }, + { + "epoch": 0.0810257874203064, + "grad_norm": 1.4308300018310547, + "learning_rate": 4.919453520287428e-05, + "loss": 5.259, + "step": 13624 + }, + { + "epoch": 0.0810317347035874, + "grad_norm": 1.248282790184021, + "learning_rate": 4.919441758703697e-05, + "loss": 5.2129, + "step": 13625 + }, + { + "epoch": 0.08103768198686839, + "grad_norm": 1.4535733461380005, + "learning_rate": 4.919429996275363e-05, + "loss": 5.1989, + "step": 13626 + }, + { + "epoch": 0.0810436292701494, + "grad_norm": 1.6055153608322144, + "learning_rate": 4.9194182330024306e-05, + "loss": 5.1669, + "step": 13627 + }, + { + "epoch": 0.0810495765534304, + "grad_norm": 1.6016899347305298, + "learning_rate": 4.919406468884905e-05, + "loss": 5.1958, + "step": 13628 + }, + { + "epoch": 0.08105552383671139, + "grad_norm": 1.4217112064361572, + "learning_rate": 4.91939470392279e-05, + "loss": 4.9775, + "step": 13629 + }, + { + "epoch": 0.08106147111999239, + "grad_norm": 1.4405405521392822, + "learning_rate": 4.919382938116088e-05, + "loss": 5.1865, + "step": 13630 + }, + { + "epoch": 0.08106741840327339, + "grad_norm": 1.3826597929000854, + "learning_rate": 4.919371171464805e-05, + "loss": 5.1909, + "step": 13631 + }, + { + "epoch": 0.08107336568655438, + "grad_norm": 1.942305088043213, + "learning_rate": 4.919359403968944e-05, + "loss": 5.227, + "step": 13632 + }, + { + "epoch": 0.08107931296983538, + "grad_norm": 1.8932685852050781, + "learning_rate": 4.919347635628511e-05, + "loss": 5.3257, + "step": 13633 + }, + { + "epoch": 0.08108526025311638, + "grad_norm": 1.8511128425598145, + "learning_rate": 4.9193358664435074e-05, + "loss": 5.4229, + "step": 13634 + }, + { + "epoch": 0.08109120753639737, + "grad_norm": 1.6317822933197021, + "learning_rate": 4.919324096413939e-05, + "loss": 5.3067, + "step": 13635 + }, + { + "epoch": 0.08109715481967837, + "grad_norm": 1.835503101348877, + "learning_rate": 4.91931232553981e-05, + "loss": 5.3246, + "step": 13636 + }, + { + "epoch": 0.08110310210295937, + "grad_norm": 1.8521870374679565, + "learning_rate": 4.919300553821124e-05, + "loss": 5.3367, + "step": 13637 + }, + { + "epoch": 0.08110904938624036, + "grad_norm": 1.7814146280288696, + "learning_rate": 4.9192887812578844e-05, + "loss": 5.2949, + "step": 13638 + }, + { + "epoch": 0.08111499666952136, + "grad_norm": 1.6024845838546753, + "learning_rate": 4.919277007850097e-05, + "loss": 5.3159, + "step": 13639 + }, + { + "epoch": 0.08112094395280237, + "grad_norm": 2.955554246902466, + "learning_rate": 4.919265233597765e-05, + "loss": 4.8802, + "step": 13640 + }, + { + "epoch": 0.08112689123608335, + "grad_norm": 1.7217108011245728, + "learning_rate": 4.919253458500892e-05, + "loss": 5.08, + "step": 13641 + }, + { + "epoch": 0.08113283851936436, + "grad_norm": 1.686672329902649, + "learning_rate": 4.9192416825594825e-05, + "loss": 5.1349, + "step": 13642 + }, + { + "epoch": 0.08113878580264536, + "grad_norm": 1.5377975702285767, + "learning_rate": 4.9192299057735416e-05, + "loss": 5.1327, + "step": 13643 + }, + { + "epoch": 0.08114473308592635, + "grad_norm": 1.7383031845092773, + "learning_rate": 4.9192181281430716e-05, + "loss": 5.0938, + "step": 13644 + }, + { + "epoch": 0.08115068036920735, + "grad_norm": 1.6174112558364868, + "learning_rate": 4.919206349668077e-05, + "loss": 5.0123, + "step": 13645 + }, + { + "epoch": 0.08115662765248835, + "grad_norm": 1.5967239141464233, + "learning_rate": 4.9191945703485646e-05, + "loss": 5.0334, + "step": 13646 + }, + { + "epoch": 0.08116257493576934, + "grad_norm": 1.5330301523208618, + "learning_rate": 4.919182790184534e-05, + "loss": 5.1615, + "step": 13647 + }, + { + "epoch": 0.08116852221905034, + "grad_norm": 1.5532622337341309, + "learning_rate": 4.919171009175993e-05, + "loss": 5.1565, + "step": 13648 + }, + { + "epoch": 0.08117446950233133, + "grad_norm": 1.4814139604568481, + "learning_rate": 4.919159227322945e-05, + "loss": 5.0991, + "step": 13649 + }, + { + "epoch": 0.08118041678561233, + "grad_norm": 1.2586545944213867, + "learning_rate": 4.919147444625392e-05, + "loss": 5.2482, + "step": 13650 + }, + { + "epoch": 0.08118636406889333, + "grad_norm": 1.5292212963104248, + "learning_rate": 4.91913566108334e-05, + "loss": 5.1787, + "step": 13651 + }, + { + "epoch": 0.08119231135217432, + "grad_norm": 1.5354405641555786, + "learning_rate": 4.919123876696793e-05, + "loss": 5.0046, + "step": 13652 + }, + { + "epoch": 0.08119825863545532, + "grad_norm": 1.3921040296554565, + "learning_rate": 4.919112091465755e-05, + "loss": 5.2199, + "step": 13653 + }, + { + "epoch": 0.08120420591873632, + "grad_norm": 1.471068263053894, + "learning_rate": 4.91910030539023e-05, + "loss": 5.0445, + "step": 13654 + }, + { + "epoch": 0.08121015320201731, + "grad_norm": 1.3318332433700562, + "learning_rate": 4.919088518470222e-05, + "loss": 5.1973, + "step": 13655 + }, + { + "epoch": 0.08121610048529831, + "grad_norm": 1.5445464849472046, + "learning_rate": 4.919076730705735e-05, + "loss": 5.4165, + "step": 13656 + }, + { + "epoch": 0.08122204776857932, + "grad_norm": 1.3854666948318481, + "learning_rate": 4.9190649420967735e-05, + "loss": 5.336, + "step": 13657 + }, + { + "epoch": 0.0812279950518603, + "grad_norm": 1.4703121185302734, + "learning_rate": 4.919053152643342e-05, + "loss": 5.4837, + "step": 13658 + }, + { + "epoch": 0.08123394233514131, + "grad_norm": 1.3189783096313477, + "learning_rate": 4.9190413623454425e-05, + "loss": 5.4163, + "step": 13659 + }, + { + "epoch": 0.08123988961842231, + "grad_norm": 1.469601035118103, + "learning_rate": 4.919029571203081e-05, + "loss": 5.2772, + "step": 13660 + }, + { + "epoch": 0.0812458369017033, + "grad_norm": 1.4215590953826904, + "learning_rate": 4.919017779216262e-05, + "loss": 5.5008, + "step": 13661 + }, + { + "epoch": 0.0812517841849843, + "grad_norm": 1.577255129814148, + "learning_rate": 4.919005986384989e-05, + "loss": 5.2565, + "step": 13662 + }, + { + "epoch": 0.0812577314682653, + "grad_norm": 1.5910719633102417, + "learning_rate": 4.918994192709265e-05, + "loss": 5.1143, + "step": 13663 + }, + { + "epoch": 0.08126367875154629, + "grad_norm": 1.5665141344070435, + "learning_rate": 4.9189823981890964e-05, + "loss": 5.1911, + "step": 13664 + }, + { + "epoch": 0.08126962603482729, + "grad_norm": 1.6348809003829956, + "learning_rate": 4.918970602824485e-05, + "loss": 5.2257, + "step": 13665 + }, + { + "epoch": 0.0812755733181083, + "grad_norm": 1.4213917255401611, + "learning_rate": 4.9189588066154365e-05, + "loss": 5.0528, + "step": 13666 + }, + { + "epoch": 0.08128152060138928, + "grad_norm": 1.497758388519287, + "learning_rate": 4.918947009561955e-05, + "loss": 5.2421, + "step": 13667 + }, + { + "epoch": 0.08128746788467028, + "grad_norm": 1.4052904844284058, + "learning_rate": 4.918935211664043e-05, + "loss": 5.5054, + "step": 13668 + }, + { + "epoch": 0.08129341516795129, + "grad_norm": 1.5615813732147217, + "learning_rate": 4.9189234129217064e-05, + "loss": 5.2711, + "step": 13669 + }, + { + "epoch": 0.08129936245123227, + "grad_norm": 1.2366914749145508, + "learning_rate": 4.9189116133349485e-05, + "loss": 5.4035, + "step": 13670 + }, + { + "epoch": 0.08130530973451328, + "grad_norm": 1.5328080654144287, + "learning_rate": 4.918899812903773e-05, + "loss": 5.3269, + "step": 13671 + }, + { + "epoch": 0.08131125701779428, + "grad_norm": 1.6515448093414307, + "learning_rate": 4.918888011628185e-05, + "loss": 5.1734, + "step": 13672 + }, + { + "epoch": 0.08131720430107527, + "grad_norm": 1.385549783706665, + "learning_rate": 4.918876209508188e-05, + "loss": 5.3769, + "step": 13673 + }, + { + "epoch": 0.08132315158435627, + "grad_norm": 1.4133338928222656, + "learning_rate": 4.9188644065437875e-05, + "loss": 5.2607, + "step": 13674 + }, + { + "epoch": 0.08132909886763727, + "grad_norm": 1.6652443408966064, + "learning_rate": 4.918852602734984e-05, + "loss": 5.3939, + "step": 13675 + }, + { + "epoch": 0.08133504615091826, + "grad_norm": 1.455493450164795, + "learning_rate": 4.918840798081786e-05, + "loss": 5.3051, + "step": 13676 + }, + { + "epoch": 0.08134099343419926, + "grad_norm": 1.5490756034851074, + "learning_rate": 4.918828992584196e-05, + "loss": 5.4309, + "step": 13677 + }, + { + "epoch": 0.08134694071748025, + "grad_norm": 1.5857222080230713, + "learning_rate": 4.918817186242216e-05, + "loss": 5.1158, + "step": 13678 + }, + { + "epoch": 0.08135288800076125, + "grad_norm": 1.6051661968231201, + "learning_rate": 4.918805379055853e-05, + "loss": 5.2668, + "step": 13679 + }, + { + "epoch": 0.08135883528404225, + "grad_norm": 1.6476162672042847, + "learning_rate": 4.91879357102511e-05, + "loss": 5.2367, + "step": 13680 + }, + { + "epoch": 0.08136478256732324, + "grad_norm": 1.4255136251449585, + "learning_rate": 4.918781762149991e-05, + "loss": 5.0348, + "step": 13681 + }, + { + "epoch": 0.08137072985060424, + "grad_norm": 1.4585214853286743, + "learning_rate": 4.9187699524305e-05, + "loss": 5.2323, + "step": 13682 + }, + { + "epoch": 0.08137667713388524, + "grad_norm": 1.3733863830566406, + "learning_rate": 4.9187581418666415e-05, + "loss": 5.0898, + "step": 13683 + }, + { + "epoch": 0.08138262441716623, + "grad_norm": 1.5789494514465332, + "learning_rate": 4.91874633045842e-05, + "loss": 5.0886, + "step": 13684 + }, + { + "epoch": 0.08138857170044723, + "grad_norm": 1.4390051364898682, + "learning_rate": 4.918734518205839e-05, + "loss": 5.4305, + "step": 13685 + }, + { + "epoch": 0.08139451898372824, + "grad_norm": 1.8984171152114868, + "learning_rate": 4.9187227051089025e-05, + "loss": 5.0593, + "step": 13686 + }, + { + "epoch": 0.08140046626700922, + "grad_norm": 1.940045714378357, + "learning_rate": 4.918710891167615e-05, + "loss": 5.3115, + "step": 13687 + }, + { + "epoch": 0.08140641355029023, + "grad_norm": 1.6479912996292114, + "learning_rate": 4.918699076381981e-05, + "loss": 5.1585, + "step": 13688 + }, + { + "epoch": 0.08141236083357123, + "grad_norm": 1.554114818572998, + "learning_rate": 4.918687260752003e-05, + "loss": 5.1581, + "step": 13689 + }, + { + "epoch": 0.08141830811685222, + "grad_norm": 1.6920353174209595, + "learning_rate": 4.9186754442776874e-05, + "loss": 5.2263, + "step": 13690 + }, + { + "epoch": 0.08142425540013322, + "grad_norm": 1.572787880897522, + "learning_rate": 4.9186636269590366e-05, + "loss": 5.1019, + "step": 13691 + }, + { + "epoch": 0.08143020268341422, + "grad_norm": 1.646004319190979, + "learning_rate": 4.918651808796055e-05, + "loss": 5.1426, + "step": 13692 + }, + { + "epoch": 0.08143614996669521, + "grad_norm": 1.578749179840088, + "learning_rate": 4.9186399897887475e-05, + "loss": 4.9682, + "step": 13693 + }, + { + "epoch": 0.08144209724997621, + "grad_norm": 1.7725828886032104, + "learning_rate": 4.918628169937118e-05, + "loss": 5.0772, + "step": 13694 + }, + { + "epoch": 0.08144804453325721, + "grad_norm": 1.808596134185791, + "learning_rate": 4.91861634924117e-05, + "loss": 5.077, + "step": 13695 + }, + { + "epoch": 0.0814539918165382, + "grad_norm": 1.8685991764068604, + "learning_rate": 4.9186045277009084e-05, + "loss": 5.1322, + "step": 13696 + }, + { + "epoch": 0.0814599390998192, + "grad_norm": 1.6144567728042603, + "learning_rate": 4.9185927053163366e-05, + "loss": 5.3354, + "step": 13697 + }, + { + "epoch": 0.0814658863831002, + "grad_norm": 1.767673373222351, + "learning_rate": 4.918580882087459e-05, + "loss": 5.0358, + "step": 13698 + }, + { + "epoch": 0.0814718336663812, + "grad_norm": 1.7151973247528076, + "learning_rate": 4.9185690580142805e-05, + "loss": 5.0371, + "step": 13699 + }, + { + "epoch": 0.0814777809496622, + "grad_norm": 1.710990071296692, + "learning_rate": 4.918557233096803e-05, + "loss": 4.9236, + "step": 13700 + }, + { + "epoch": 0.0814837282329432, + "grad_norm": 1.8118677139282227, + "learning_rate": 4.9185454073350335e-05, + "loss": 4.9112, + "step": 13701 + }, + { + "epoch": 0.08148967551622419, + "grad_norm": 2.0120832920074463, + "learning_rate": 4.918533580728974e-05, + "loss": 4.8201, + "step": 13702 + }, + { + "epoch": 0.08149562279950519, + "grad_norm": 1.742125153541565, + "learning_rate": 4.91852175327863e-05, + "loss": 5.0618, + "step": 13703 + }, + { + "epoch": 0.08150157008278619, + "grad_norm": 1.6496554613113403, + "learning_rate": 4.9185099249840054e-05, + "loss": 5.217, + "step": 13704 + }, + { + "epoch": 0.08150751736606718, + "grad_norm": 1.6782381534576416, + "learning_rate": 4.9184980958451034e-05, + "loss": 5.0362, + "step": 13705 + }, + { + "epoch": 0.08151346464934818, + "grad_norm": 1.8002519607543945, + "learning_rate": 4.918486265861929e-05, + "loss": 4.8812, + "step": 13706 + }, + { + "epoch": 0.08151941193262917, + "grad_norm": 1.5939546823501587, + "learning_rate": 4.918474435034486e-05, + "loss": 5.0571, + "step": 13707 + }, + { + "epoch": 0.08152535921591017, + "grad_norm": 1.6342964172363281, + "learning_rate": 4.918462603362778e-05, + "loss": 5.087, + "step": 13708 + }, + { + "epoch": 0.08153130649919117, + "grad_norm": 1.549822449684143, + "learning_rate": 4.91845077084681e-05, + "loss": 5.1654, + "step": 13709 + }, + { + "epoch": 0.08153725378247216, + "grad_norm": 1.5732479095458984, + "learning_rate": 4.9184389374865855e-05, + "loss": 4.9085, + "step": 13710 + }, + { + "epoch": 0.08154320106575316, + "grad_norm": 1.4182745218276978, + "learning_rate": 4.9184271032821094e-05, + "loss": 4.8846, + "step": 13711 + }, + { + "epoch": 0.08154914834903416, + "grad_norm": 1.3679918050765991, + "learning_rate": 4.918415268233385e-05, + "loss": 5.0263, + "step": 13712 + }, + { + "epoch": 0.08155509563231515, + "grad_norm": 1.4714219570159912, + "learning_rate": 4.918403432340418e-05, + "loss": 5.5169, + "step": 13713 + }, + { + "epoch": 0.08156104291559615, + "grad_norm": 1.8351292610168457, + "learning_rate": 4.91839159560321e-05, + "loss": 5.215, + "step": 13714 + }, + { + "epoch": 0.08156699019887716, + "grad_norm": 1.530781865119934, + "learning_rate": 4.918379758021767e-05, + "loss": 5.0882, + "step": 13715 + }, + { + "epoch": 0.08157293748215814, + "grad_norm": 1.799901008605957, + "learning_rate": 4.918367919596093e-05, + "loss": 5.2248, + "step": 13716 + }, + { + "epoch": 0.08157888476543915, + "grad_norm": 1.7563488483428955, + "learning_rate": 4.9183560803261915e-05, + "loss": 5.3192, + "step": 13717 + }, + { + "epoch": 0.08158483204872015, + "grad_norm": 1.7521497011184692, + "learning_rate": 4.918344240212066e-05, + "loss": 5.4841, + "step": 13718 + }, + { + "epoch": 0.08159077933200114, + "grad_norm": 1.7345610857009888, + "learning_rate": 4.918332399253722e-05, + "loss": 5.0716, + "step": 13719 + }, + { + "epoch": 0.08159672661528214, + "grad_norm": 1.4790915250778198, + "learning_rate": 4.918320557451164e-05, + "loss": 5.1833, + "step": 13720 + }, + { + "epoch": 0.08160267389856314, + "grad_norm": 1.4721198081970215, + "learning_rate": 4.918308714804395e-05, + "loss": 5.1355, + "step": 13721 + }, + { + "epoch": 0.08160862118184413, + "grad_norm": 1.4949108362197876, + "learning_rate": 4.918296871313419e-05, + "loss": 4.9666, + "step": 13722 + }, + { + "epoch": 0.08161456846512513, + "grad_norm": 1.3814501762390137, + "learning_rate": 4.91828502697824e-05, + "loss": 5.0575, + "step": 13723 + }, + { + "epoch": 0.08162051574840613, + "grad_norm": 1.4503964185714722, + "learning_rate": 4.918273181798864e-05, + "loss": 5.4112, + "step": 13724 + }, + { + "epoch": 0.08162646303168712, + "grad_norm": 1.5512415170669556, + "learning_rate": 4.9182613357752925e-05, + "loss": 5.1501, + "step": 13725 + }, + { + "epoch": 0.08163241031496812, + "grad_norm": 1.7429851293563843, + "learning_rate": 4.9182494889075315e-05, + "loss": 5.2736, + "step": 13726 + }, + { + "epoch": 0.08163835759824913, + "grad_norm": 1.325498104095459, + "learning_rate": 4.918237641195584e-05, + "loss": 5.3702, + "step": 13727 + }, + { + "epoch": 0.08164430488153011, + "grad_norm": 1.2677874565124512, + "learning_rate": 4.918225792639456e-05, + "loss": 5.2681, + "step": 13728 + }, + { + "epoch": 0.08165025216481112, + "grad_norm": 1.4957364797592163, + "learning_rate": 4.918213943239149e-05, + "loss": 5.4956, + "step": 13729 + }, + { + "epoch": 0.08165619944809212, + "grad_norm": 1.3380833864212036, + "learning_rate": 4.91820209299467e-05, + "loss": 5.3286, + "step": 13730 + }, + { + "epoch": 0.0816621467313731, + "grad_norm": 1.6803557872772217, + "learning_rate": 4.918190241906021e-05, + "loss": 5.3119, + "step": 13731 + }, + { + "epoch": 0.08166809401465411, + "grad_norm": 1.7933920621871948, + "learning_rate": 4.918178389973206e-05, + "loss": 5.139, + "step": 13732 + }, + { + "epoch": 0.08167404129793511, + "grad_norm": 1.5846813917160034, + "learning_rate": 4.91816653719623e-05, + "loss": 5.4431, + "step": 13733 + }, + { + "epoch": 0.0816799885812161, + "grad_norm": 1.9218448400497437, + "learning_rate": 4.918154683575098e-05, + "loss": 5.3245, + "step": 13734 + }, + { + "epoch": 0.0816859358644971, + "grad_norm": 1.4883100986480713, + "learning_rate": 4.918142829109813e-05, + "loss": 5.3007, + "step": 13735 + }, + { + "epoch": 0.08169188314777809, + "grad_norm": 1.4396723508834839, + "learning_rate": 4.918130973800379e-05, + "loss": 5.1956, + "step": 13736 + }, + { + "epoch": 0.08169783043105909, + "grad_norm": 1.4395633935928345, + "learning_rate": 4.918119117646801e-05, + "loss": 5.1637, + "step": 13737 + }, + { + "epoch": 0.08170377771434009, + "grad_norm": 1.540003776550293, + "learning_rate": 4.9181072606490816e-05, + "loss": 5.2278, + "step": 13738 + }, + { + "epoch": 0.08170972499762108, + "grad_norm": 1.446815848350525, + "learning_rate": 4.918095402807227e-05, + "loss": 5.1627, + "step": 13739 + }, + { + "epoch": 0.08171567228090208, + "grad_norm": 1.4501028060913086, + "learning_rate": 4.918083544121239e-05, + "loss": 5.0747, + "step": 13740 + }, + { + "epoch": 0.08172161956418308, + "grad_norm": 1.217608094215393, + "learning_rate": 4.9180716845911244e-05, + "loss": 5.0668, + "step": 13741 + }, + { + "epoch": 0.08172756684746407, + "grad_norm": 1.6321865320205688, + "learning_rate": 4.918059824216885e-05, + "loss": 5.2785, + "step": 13742 + }, + { + "epoch": 0.08173351413074507, + "grad_norm": 1.5838396549224854, + "learning_rate": 4.9180479629985265e-05, + "loss": 5.1675, + "step": 13743 + }, + { + "epoch": 0.08173946141402608, + "grad_norm": 1.7023003101348877, + "learning_rate": 4.918036100936052e-05, + "loss": 5.1664, + "step": 13744 + }, + { + "epoch": 0.08174540869730706, + "grad_norm": 1.767067790031433, + "learning_rate": 4.918024238029466e-05, + "loss": 5.0157, + "step": 13745 + }, + { + "epoch": 0.08175135598058807, + "grad_norm": 1.6058627367019653, + "learning_rate": 4.918012374278773e-05, + "loss": 5.1772, + "step": 13746 + }, + { + "epoch": 0.08175730326386907, + "grad_norm": 1.7853416204452515, + "learning_rate": 4.9180005096839766e-05, + "loss": 5.2678, + "step": 13747 + }, + { + "epoch": 0.08176325054715006, + "grad_norm": 1.4799201488494873, + "learning_rate": 4.917988644245082e-05, + "loss": 5.3153, + "step": 13748 + }, + { + "epoch": 0.08176919783043106, + "grad_norm": 1.4581291675567627, + "learning_rate": 4.917976777962092e-05, + "loss": 5.2755, + "step": 13749 + }, + { + "epoch": 0.08177514511371206, + "grad_norm": 1.7151737213134766, + "learning_rate": 4.917964910835011e-05, + "loss": 5.1761, + "step": 13750 + }, + { + "epoch": 0.08178109239699305, + "grad_norm": 1.5101522207260132, + "learning_rate": 4.917953042863843e-05, + "loss": 5.0003, + "step": 13751 + }, + { + "epoch": 0.08178703968027405, + "grad_norm": 1.4508110284805298, + "learning_rate": 4.9179411740485935e-05, + "loss": 5.1158, + "step": 13752 + }, + { + "epoch": 0.08179298696355505, + "grad_norm": 1.5012980699539185, + "learning_rate": 4.917929304389266e-05, + "loss": 5.2762, + "step": 13753 + }, + { + "epoch": 0.08179893424683604, + "grad_norm": 1.5914186239242554, + "learning_rate": 4.9179174338858635e-05, + "loss": 5.1422, + "step": 13754 + }, + { + "epoch": 0.08180488153011704, + "grad_norm": 1.5001139640808105, + "learning_rate": 4.9179055625383915e-05, + "loss": 5.2158, + "step": 13755 + }, + { + "epoch": 0.08181082881339805, + "grad_norm": 1.382815957069397, + "learning_rate": 4.917893690346853e-05, + "loss": 5.2562, + "step": 13756 + }, + { + "epoch": 0.08181677609667903, + "grad_norm": 1.3576865196228027, + "learning_rate": 4.9178818173112535e-05, + "loss": 5.221, + "step": 13757 + }, + { + "epoch": 0.08182272337996004, + "grad_norm": 1.5542206764221191, + "learning_rate": 4.917869943431596e-05, + "loss": 5.071, + "step": 13758 + }, + { + "epoch": 0.08182867066324104, + "grad_norm": 1.6010403633117676, + "learning_rate": 4.9178580687078855e-05, + "loss": 5.2052, + "step": 13759 + }, + { + "epoch": 0.08183461794652203, + "grad_norm": 1.3808842897415161, + "learning_rate": 4.9178461931401254e-05, + "loss": 5.3007, + "step": 13760 + }, + { + "epoch": 0.08184056522980303, + "grad_norm": 1.3584518432617188, + "learning_rate": 4.91783431672832e-05, + "loss": 5.3137, + "step": 13761 + }, + { + "epoch": 0.08184651251308403, + "grad_norm": 1.4467449188232422, + "learning_rate": 4.917822439472474e-05, + "loss": 5.2208, + "step": 13762 + }, + { + "epoch": 0.08185245979636502, + "grad_norm": 1.298618197441101, + "learning_rate": 4.917810561372591e-05, + "loss": 5.2161, + "step": 13763 + }, + { + "epoch": 0.08185840707964602, + "grad_norm": 2.5304789543151855, + "learning_rate": 4.9177986824286756e-05, + "loss": 4.6644, + "step": 13764 + }, + { + "epoch": 0.08186435436292701, + "grad_norm": 1.607969880104065, + "learning_rate": 4.917786802640732e-05, + "loss": 5.2116, + "step": 13765 + }, + { + "epoch": 0.08187030164620801, + "grad_norm": 1.401207685470581, + "learning_rate": 4.917774922008763e-05, + "loss": 5.2847, + "step": 13766 + }, + { + "epoch": 0.08187624892948901, + "grad_norm": 1.1652514934539795, + "learning_rate": 4.9177630405327746e-05, + "loss": 5.2939, + "step": 13767 + }, + { + "epoch": 0.08188219621277, + "grad_norm": 1.2998749017715454, + "learning_rate": 4.9177511582127694e-05, + "loss": 5.251, + "step": 13768 + }, + { + "epoch": 0.081888143496051, + "grad_norm": 1.33558988571167, + "learning_rate": 4.917739275048753e-05, + "loss": 5.2749, + "step": 13769 + }, + { + "epoch": 0.081894090779332, + "grad_norm": 1.1457966566085815, + "learning_rate": 4.917727391040728e-05, + "loss": 5.3153, + "step": 13770 + }, + { + "epoch": 0.08190003806261299, + "grad_norm": 1.493249773979187, + "learning_rate": 4.917715506188699e-05, + "loss": 5.3702, + "step": 13771 + }, + { + "epoch": 0.081905985345894, + "grad_norm": 1.2591760158538818, + "learning_rate": 4.917703620492672e-05, + "loss": 5.2019, + "step": 13772 + }, + { + "epoch": 0.081911932629175, + "grad_norm": 1.2480885982513428, + "learning_rate": 4.917691733952648e-05, + "loss": 5.1904, + "step": 13773 + }, + { + "epoch": 0.08191787991245598, + "grad_norm": 1.3278160095214844, + "learning_rate": 4.917679846568634e-05, + "loss": 5.0424, + "step": 13774 + }, + { + "epoch": 0.08192382719573699, + "grad_norm": 1.2930511236190796, + "learning_rate": 4.9176679583406325e-05, + "loss": 5.2437, + "step": 13775 + }, + { + "epoch": 0.08192977447901799, + "grad_norm": 1.39852774143219, + "learning_rate": 4.9176560692686485e-05, + "loss": 5.3683, + "step": 13776 + }, + { + "epoch": 0.08193572176229898, + "grad_norm": 1.3392889499664307, + "learning_rate": 4.917644179352685e-05, + "loss": 5.1894, + "step": 13777 + }, + { + "epoch": 0.08194166904557998, + "grad_norm": 1.318595051765442, + "learning_rate": 4.917632288592747e-05, + "loss": 5.382, + "step": 13778 + }, + { + "epoch": 0.08194761632886098, + "grad_norm": 1.0992580652236938, + "learning_rate": 4.9176203969888395e-05, + "loss": 5.1979, + "step": 13779 + }, + { + "epoch": 0.08195356361214197, + "grad_norm": 1.2092480659484863, + "learning_rate": 4.917608504540965e-05, + "loss": 5.2253, + "step": 13780 + }, + { + "epoch": 0.08195951089542297, + "grad_norm": 1.2495516538619995, + "learning_rate": 4.9175966112491286e-05, + "loss": 5.1951, + "step": 13781 + }, + { + "epoch": 0.08196545817870397, + "grad_norm": 1.642177700996399, + "learning_rate": 4.917584717113334e-05, + "loss": 4.9648, + "step": 13782 + }, + { + "epoch": 0.08197140546198496, + "grad_norm": 1.4849772453308105, + "learning_rate": 4.9175728221335856e-05, + "loss": 4.8231, + "step": 13783 + }, + { + "epoch": 0.08197735274526596, + "grad_norm": 1.1743687391281128, + "learning_rate": 4.917560926309888e-05, + "loss": 4.7685, + "step": 13784 + }, + { + "epoch": 0.08198330002854697, + "grad_norm": 1.2688218355178833, + "learning_rate": 4.9175490296422436e-05, + "loss": 5.3023, + "step": 13785 + }, + { + "epoch": 0.08198924731182795, + "grad_norm": 1.2325210571289062, + "learning_rate": 4.9175371321306584e-05, + "loss": 4.8373, + "step": 13786 + }, + { + "epoch": 0.08199519459510896, + "grad_norm": 1.5414066314697266, + "learning_rate": 4.9175252337751364e-05, + "loss": 5.005, + "step": 13787 + }, + { + "epoch": 0.08200114187838996, + "grad_norm": 2.1581833362579346, + "learning_rate": 4.917513334575681e-05, + "loss": 5.5065, + "step": 13788 + }, + { + "epoch": 0.08200708916167095, + "grad_norm": 2.0199508666992188, + "learning_rate": 4.917501434532297e-05, + "loss": 5.8826, + "step": 13789 + }, + { + "epoch": 0.08201303644495195, + "grad_norm": 1.727602481842041, + "learning_rate": 4.917489533644987e-05, + "loss": 5.6967, + "step": 13790 + }, + { + "epoch": 0.08201898372823295, + "grad_norm": 1.5649336576461792, + "learning_rate": 4.917477631913757e-05, + "loss": 5.783, + "step": 13791 + }, + { + "epoch": 0.08202493101151394, + "grad_norm": 1.7326582670211792, + "learning_rate": 4.9174657293386115e-05, + "loss": 5.6705, + "step": 13792 + }, + { + "epoch": 0.08203087829479494, + "grad_norm": 1.8611500263214111, + "learning_rate": 4.917453825919553e-05, + "loss": 5.4881, + "step": 13793 + }, + { + "epoch": 0.08203682557807593, + "grad_norm": 1.9762206077575684, + "learning_rate": 4.917441921656586e-05, + "loss": 5.4826, + "step": 13794 + }, + { + "epoch": 0.08204277286135693, + "grad_norm": 1.6816489696502686, + "learning_rate": 4.9174300165497154e-05, + "loss": 5.466, + "step": 13795 + }, + { + "epoch": 0.08204872014463793, + "grad_norm": 1.8922536373138428, + "learning_rate": 4.9174181105989445e-05, + "loss": 5.3603, + "step": 13796 + }, + { + "epoch": 0.08205466742791892, + "grad_norm": 2.094996213912964, + "learning_rate": 4.917406203804279e-05, + "loss": 5.8687, + "step": 13797 + }, + { + "epoch": 0.08206061471119992, + "grad_norm": 1.8656450510025024, + "learning_rate": 4.9173942961657215e-05, + "loss": 6.2551, + "step": 13798 + }, + { + "epoch": 0.08206656199448092, + "grad_norm": 1.871787428855896, + "learning_rate": 4.917382387683276e-05, + "loss": 5.6612, + "step": 13799 + }, + { + "epoch": 0.08207250927776191, + "grad_norm": 1.8721636533737183, + "learning_rate": 4.9173704783569475e-05, + "loss": 5.8918, + "step": 13800 + }, + { + "epoch": 0.08207845656104291, + "grad_norm": 2.0554919242858887, + "learning_rate": 4.917358568186741e-05, + "loss": 5.6398, + "step": 13801 + }, + { + "epoch": 0.08208440384432392, + "grad_norm": 1.9311691522598267, + "learning_rate": 4.917346657172658e-05, + "loss": 5.6507, + "step": 13802 + }, + { + "epoch": 0.0820903511276049, + "grad_norm": 1.7426981925964355, + "learning_rate": 4.917334745314705e-05, + "loss": 5.3193, + "step": 13803 + }, + { + "epoch": 0.0820962984108859, + "grad_norm": 1.783890724182129, + "learning_rate": 4.9173228326128856e-05, + "loss": 5.1274, + "step": 13804 + }, + { + "epoch": 0.08210224569416691, + "grad_norm": 1.8739385604858398, + "learning_rate": 4.917310919067203e-05, + "loss": 5.378, + "step": 13805 + }, + { + "epoch": 0.0821081929774479, + "grad_norm": 1.6748543977737427, + "learning_rate": 4.917299004677663e-05, + "loss": 5.4772, + "step": 13806 + }, + { + "epoch": 0.0821141402607289, + "grad_norm": 1.498864769935608, + "learning_rate": 4.917287089444269e-05, + "loss": 5.4485, + "step": 13807 + }, + { + "epoch": 0.0821200875440099, + "grad_norm": 1.6129908561706543, + "learning_rate": 4.917275173367024e-05, + "loss": 5.5245, + "step": 13808 + }, + { + "epoch": 0.08212603482729089, + "grad_norm": 1.4655383825302124, + "learning_rate": 4.917263256445934e-05, + "loss": 5.5513, + "step": 13809 + }, + { + "epoch": 0.08213198211057189, + "grad_norm": 1.765244483947754, + "learning_rate": 4.917251338681003e-05, + "loss": 5.5322, + "step": 13810 + }, + { + "epoch": 0.0821379293938529, + "grad_norm": 2.002889633178711, + "learning_rate": 4.917239420072233e-05, + "loss": 5.1273, + "step": 13811 + }, + { + "epoch": 0.08214387667713388, + "grad_norm": 2.4380993843078613, + "learning_rate": 4.917227500619631e-05, + "loss": 4.8983, + "step": 13812 + }, + { + "epoch": 0.08214982396041488, + "grad_norm": 2.0864169597625732, + "learning_rate": 4.917215580323199e-05, + "loss": 5.077, + "step": 13813 + }, + { + "epoch": 0.08215577124369589, + "grad_norm": 2.2942094802856445, + "learning_rate": 4.917203659182942e-05, + "loss": 5.4359, + "step": 13814 + }, + { + "epoch": 0.08216171852697687, + "grad_norm": 2.067659616470337, + "learning_rate": 4.917191737198865e-05, + "loss": 5.7409, + "step": 13815 + }, + { + "epoch": 0.08216766581025788, + "grad_norm": 2.010085344314575, + "learning_rate": 4.917179814370971e-05, + "loss": 5.2279, + "step": 13816 + }, + { + "epoch": 0.08217361309353888, + "grad_norm": 1.8540743589401245, + "learning_rate": 4.917167890699264e-05, + "loss": 5.6146, + "step": 13817 + }, + { + "epoch": 0.08217956037681987, + "grad_norm": 1.9126391410827637, + "learning_rate": 4.917155966183749e-05, + "loss": 5.7007, + "step": 13818 + }, + { + "epoch": 0.08218550766010087, + "grad_norm": 1.6382626295089722, + "learning_rate": 4.91714404082443e-05, + "loss": 5.3641, + "step": 13819 + }, + { + "epoch": 0.08219145494338187, + "grad_norm": 1.8019288778305054, + "learning_rate": 4.9171321146213105e-05, + "loss": 5.1853, + "step": 13820 + }, + { + "epoch": 0.08219740222666286, + "grad_norm": 1.681685447692871, + "learning_rate": 4.917120187574395e-05, + "loss": 5.4141, + "step": 13821 + }, + { + "epoch": 0.08220334950994386, + "grad_norm": 1.9356689453125, + "learning_rate": 4.9171082596836896e-05, + "loss": 5.5379, + "step": 13822 + }, + { + "epoch": 0.08220929679322485, + "grad_norm": 1.9538071155548096, + "learning_rate": 4.917096330949195e-05, + "loss": 5.5723, + "step": 13823 + }, + { + "epoch": 0.08221524407650585, + "grad_norm": 1.7350852489471436, + "learning_rate": 4.9170844013709175e-05, + "loss": 5.5622, + "step": 13824 + }, + { + "epoch": 0.08222119135978685, + "grad_norm": 1.790276050567627, + "learning_rate": 4.9170724709488606e-05, + "loss": 5.5194, + "step": 13825 + }, + { + "epoch": 0.08222713864306784, + "grad_norm": 2.2997219562530518, + "learning_rate": 4.917060539683028e-05, + "loss": 5.0646, + "step": 13826 + }, + { + "epoch": 0.08223308592634884, + "grad_norm": 1.729131817817688, + "learning_rate": 4.9170486075734254e-05, + "loss": 5.5588, + "step": 13827 + }, + { + "epoch": 0.08223903320962984, + "grad_norm": 1.8754487037658691, + "learning_rate": 4.9170366746200566e-05, + "loss": 5.5435, + "step": 13828 + }, + { + "epoch": 0.08224498049291083, + "grad_norm": 1.8330692052841187, + "learning_rate": 4.9170247408229244e-05, + "loss": 5.598, + "step": 13829 + }, + { + "epoch": 0.08225092777619183, + "grad_norm": 1.8318592309951782, + "learning_rate": 4.917012806182034e-05, + "loss": 5.5165, + "step": 13830 + }, + { + "epoch": 0.08225687505947284, + "grad_norm": 1.6818424463272095, + "learning_rate": 4.9170008706973895e-05, + "loss": 5.3377, + "step": 13831 + }, + { + "epoch": 0.08226282234275382, + "grad_norm": 1.7040458917617798, + "learning_rate": 4.916988934368995e-05, + "loss": 5.4644, + "step": 13832 + }, + { + "epoch": 0.08226876962603483, + "grad_norm": 1.8902777433395386, + "learning_rate": 4.916976997196855e-05, + "loss": 5.4526, + "step": 13833 + }, + { + "epoch": 0.08227471690931583, + "grad_norm": 1.7484904527664185, + "learning_rate": 4.9169650591809724e-05, + "loss": 5.3, + "step": 13834 + }, + { + "epoch": 0.08228066419259682, + "grad_norm": 1.726083517074585, + "learning_rate": 4.916953120321353e-05, + "loss": 5.4451, + "step": 13835 + }, + { + "epoch": 0.08228661147587782, + "grad_norm": 1.791942834854126, + "learning_rate": 4.916941180618e-05, + "loss": 5.444, + "step": 13836 + }, + { + "epoch": 0.08229255875915882, + "grad_norm": 1.9032018184661865, + "learning_rate": 4.916929240070918e-05, + "loss": 5.4411, + "step": 13837 + }, + { + "epoch": 0.08229850604243981, + "grad_norm": 1.6170588731765747, + "learning_rate": 4.91691729868011e-05, + "loss": 5.4293, + "step": 13838 + }, + { + "epoch": 0.08230445332572081, + "grad_norm": 1.3972853422164917, + "learning_rate": 4.9169053564455825e-05, + "loss": 5.2889, + "step": 13839 + }, + { + "epoch": 0.08231040060900181, + "grad_norm": 1.782913088798523, + "learning_rate": 4.916893413367338e-05, + "loss": 5.4092, + "step": 13840 + }, + { + "epoch": 0.0823163478922828, + "grad_norm": 1.83617103099823, + "learning_rate": 4.9168814694453807e-05, + "loss": 5.3997, + "step": 13841 + }, + { + "epoch": 0.0823222951755638, + "grad_norm": 1.92609703540802, + "learning_rate": 4.9168695246797146e-05, + "loss": 5.3469, + "step": 13842 + }, + { + "epoch": 0.0823282424588448, + "grad_norm": 2.20027756690979, + "learning_rate": 4.9168575790703454e-05, + "loss": 5.5999, + "step": 13843 + }, + { + "epoch": 0.0823341897421258, + "grad_norm": 3.096323251724243, + "learning_rate": 4.916845632617275e-05, + "loss": 5.3997, + "step": 13844 + }, + { + "epoch": 0.0823401370254068, + "grad_norm": 2.433900833129883, + "learning_rate": 4.91683368532051e-05, + "loss": 5.4937, + "step": 13845 + }, + { + "epoch": 0.0823460843086878, + "grad_norm": 2.371389389038086, + "learning_rate": 4.9168217371800526e-05, + "loss": 5.966, + "step": 13846 + }, + { + "epoch": 0.08235203159196879, + "grad_norm": 1.5628182888031006, + "learning_rate": 4.9168097881959076e-05, + "loss": 5.5971, + "step": 13847 + }, + { + "epoch": 0.08235797887524979, + "grad_norm": 2.733569622039795, + "learning_rate": 4.91679783836808e-05, + "loss": 5.2696, + "step": 13848 + }, + { + "epoch": 0.08236392615853079, + "grad_norm": 2.117197275161743, + "learning_rate": 4.916785887696572e-05, + "loss": 5.3729, + "step": 13849 + }, + { + "epoch": 0.08236987344181178, + "grad_norm": 2.040476083755493, + "learning_rate": 4.9167739361813905e-05, + "loss": 5.6568, + "step": 13850 + }, + { + "epoch": 0.08237582072509278, + "grad_norm": 2.127465009689331, + "learning_rate": 4.916761983822536e-05, + "loss": 5.9168, + "step": 13851 + }, + { + "epoch": 0.08238176800837377, + "grad_norm": 2.00907301902771, + "learning_rate": 4.916750030620017e-05, + "loss": 5.9104, + "step": 13852 + }, + { + "epoch": 0.08238771529165477, + "grad_norm": 1.721428394317627, + "learning_rate": 4.916738076573835e-05, + "loss": 5.8126, + "step": 13853 + }, + { + "epoch": 0.08239366257493577, + "grad_norm": 1.5760809183120728, + "learning_rate": 4.9167261216839946e-05, + "loss": 6.0134, + "step": 13854 + }, + { + "epoch": 0.08239960985821676, + "grad_norm": 1.648639440536499, + "learning_rate": 4.9167141659505e-05, + "loss": 5.3878, + "step": 13855 + }, + { + "epoch": 0.08240555714149776, + "grad_norm": 1.4113967418670654, + "learning_rate": 4.916702209373355e-05, + "loss": 5.8159, + "step": 13856 + }, + { + "epoch": 0.08241150442477876, + "grad_norm": 1.725477933883667, + "learning_rate": 4.916690251952565e-05, + "loss": 5.7185, + "step": 13857 + }, + { + "epoch": 0.08241745170805975, + "grad_norm": 1.8538665771484375, + "learning_rate": 4.9166782936881326e-05, + "loss": 5.1804, + "step": 13858 + }, + { + "epoch": 0.08242339899134075, + "grad_norm": 1.5203232765197754, + "learning_rate": 4.9166663345800635e-05, + "loss": 5.1486, + "step": 13859 + }, + { + "epoch": 0.08242934627462176, + "grad_norm": 1.8738161325454712, + "learning_rate": 4.916654374628361e-05, + "loss": 5.0062, + "step": 13860 + }, + { + "epoch": 0.08243529355790274, + "grad_norm": 1.689563512802124, + "learning_rate": 4.916642413833029e-05, + "loss": 4.9508, + "step": 13861 + }, + { + "epoch": 0.08244124084118375, + "grad_norm": 1.8749178647994995, + "learning_rate": 4.916630452194073e-05, + "loss": 5.4645, + "step": 13862 + }, + { + "epoch": 0.08244718812446475, + "grad_norm": 2.779536247253418, + "learning_rate": 4.9166184897114956e-05, + "loss": 5.9364, + "step": 13863 + }, + { + "epoch": 0.08245313540774574, + "grad_norm": 2.41239333152771, + "learning_rate": 4.9166065263853014e-05, + "loss": 5.9045, + "step": 13864 + }, + { + "epoch": 0.08245908269102674, + "grad_norm": 1.624475359916687, + "learning_rate": 4.916594562215495e-05, + "loss": 5.4222, + "step": 13865 + }, + { + "epoch": 0.08246502997430774, + "grad_norm": 1.6841174364089966, + "learning_rate": 4.916582597202081e-05, + "loss": 5.3455, + "step": 13866 + }, + { + "epoch": 0.08247097725758873, + "grad_norm": 1.6790028810501099, + "learning_rate": 4.916570631345062e-05, + "loss": 5.5397, + "step": 13867 + }, + { + "epoch": 0.08247692454086973, + "grad_norm": 1.87303626537323, + "learning_rate": 4.9165586646444436e-05, + "loss": 5.6022, + "step": 13868 + }, + { + "epoch": 0.08248287182415073, + "grad_norm": 1.7747167348861694, + "learning_rate": 4.91654669710023e-05, + "loss": 5.4631, + "step": 13869 + }, + { + "epoch": 0.08248881910743172, + "grad_norm": 1.694941759109497, + "learning_rate": 4.9165347287124244e-05, + "loss": 5.5634, + "step": 13870 + }, + { + "epoch": 0.08249476639071272, + "grad_norm": 1.8258243799209595, + "learning_rate": 4.9165227594810316e-05, + "loss": 5.526, + "step": 13871 + }, + { + "epoch": 0.08250071367399373, + "grad_norm": 1.708798885345459, + "learning_rate": 4.9165107894060556e-05, + "loss": 5.5127, + "step": 13872 + }, + { + "epoch": 0.08250666095727471, + "grad_norm": 1.7820818424224854, + "learning_rate": 4.916498818487501e-05, + "loss": 5.4169, + "step": 13873 + }, + { + "epoch": 0.08251260824055572, + "grad_norm": 2.38067626953125, + "learning_rate": 4.916486846725372e-05, + "loss": 5.8063, + "step": 13874 + }, + { + "epoch": 0.08251855552383672, + "grad_norm": 1.8507468700408936, + "learning_rate": 4.916474874119671e-05, + "loss": 5.4871, + "step": 13875 + }, + { + "epoch": 0.0825245028071177, + "grad_norm": 1.8866678476333618, + "learning_rate": 4.916462900670404e-05, + "loss": 5.5452, + "step": 13876 + }, + { + "epoch": 0.08253045009039871, + "grad_norm": 1.853668212890625, + "learning_rate": 4.916450926377576e-05, + "loss": 5.8262, + "step": 13877 + }, + { + "epoch": 0.08253639737367971, + "grad_norm": 1.7404545545578003, + "learning_rate": 4.916438951241189e-05, + "loss": 5.5978, + "step": 13878 + }, + { + "epoch": 0.0825423446569607, + "grad_norm": 1.844139814376831, + "learning_rate": 4.916426975261248e-05, + "loss": 5.765, + "step": 13879 + }, + { + "epoch": 0.0825482919402417, + "grad_norm": 1.9454487562179565, + "learning_rate": 4.916414998437758e-05, + "loss": 5.5458, + "step": 13880 + }, + { + "epoch": 0.08255423922352269, + "grad_norm": 1.317144751548767, + "learning_rate": 4.916403020770722e-05, + "loss": 5.7694, + "step": 13881 + }, + { + "epoch": 0.08256018650680369, + "grad_norm": 1.718024730682373, + "learning_rate": 4.916391042260145e-05, + "loss": 5.7369, + "step": 13882 + }, + { + "epoch": 0.08256613379008469, + "grad_norm": 1.4623572826385498, + "learning_rate": 4.9163790629060305e-05, + "loss": 5.72, + "step": 13883 + }, + { + "epoch": 0.08257208107336568, + "grad_norm": 1.908839225769043, + "learning_rate": 4.916367082708383e-05, + "loss": 5.7175, + "step": 13884 + }, + { + "epoch": 0.08257802835664668, + "grad_norm": 1.7910356521606445, + "learning_rate": 4.916355101667206e-05, + "loss": 5.4446, + "step": 13885 + }, + { + "epoch": 0.08258397563992768, + "grad_norm": 2.132512092590332, + "learning_rate": 4.9163431197825055e-05, + "loss": 5.2315, + "step": 13886 + }, + { + "epoch": 0.08258992292320867, + "grad_norm": 2.223329782485962, + "learning_rate": 4.9163311370542844e-05, + "loss": 5.2953, + "step": 13887 + }, + { + "epoch": 0.08259587020648967, + "grad_norm": 2.6441519260406494, + "learning_rate": 4.916319153482547e-05, + "loss": 5.2637, + "step": 13888 + }, + { + "epoch": 0.08260181748977068, + "grad_norm": 2.1528780460357666, + "learning_rate": 4.9163071690672973e-05, + "loss": 5.1602, + "step": 13889 + }, + { + "epoch": 0.08260776477305166, + "grad_norm": 2.6483633518218994, + "learning_rate": 4.91629518380854e-05, + "loss": 5.2487, + "step": 13890 + }, + { + "epoch": 0.08261371205633267, + "grad_norm": 2.276808738708496, + "learning_rate": 4.916283197706279e-05, + "loss": 5.064, + "step": 13891 + }, + { + "epoch": 0.08261965933961367, + "grad_norm": 1.8921101093292236, + "learning_rate": 4.9162712107605184e-05, + "loss": 5.3979, + "step": 13892 + }, + { + "epoch": 0.08262560662289466, + "grad_norm": 2.2009568214416504, + "learning_rate": 4.9162592229712625e-05, + "loss": 5.2434, + "step": 13893 + }, + { + "epoch": 0.08263155390617566, + "grad_norm": 2.199380874633789, + "learning_rate": 4.916247234338516e-05, + "loss": 4.7187, + "step": 13894 + }, + { + "epoch": 0.08263750118945666, + "grad_norm": 2.3620400428771973, + "learning_rate": 4.916235244862282e-05, + "loss": 4.7371, + "step": 13895 + }, + { + "epoch": 0.08264344847273765, + "grad_norm": 2.100086212158203, + "learning_rate": 4.9162232545425646e-05, + "loss": 4.5239, + "step": 13896 + }, + { + "epoch": 0.08264939575601865, + "grad_norm": 2.100106954574585, + "learning_rate": 4.91621126337937e-05, + "loss": 4.5555, + "step": 13897 + }, + { + "epoch": 0.08265534303929965, + "grad_norm": 2.005345344543457, + "learning_rate": 4.9161992713727e-05, + "loss": 4.397, + "step": 13898 + }, + { + "epoch": 0.08266129032258064, + "grad_norm": 1.9393454790115356, + "learning_rate": 4.91618727852256e-05, + "loss": 4.7327, + "step": 13899 + }, + { + "epoch": 0.08266723760586164, + "grad_norm": 2.0109846591949463, + "learning_rate": 4.916175284828955e-05, + "loss": 4.4987, + "step": 13900 + }, + { + "epoch": 0.08267318488914265, + "grad_norm": 2.0040533542633057, + "learning_rate": 4.916163290291886e-05, + "loss": 4.4703, + "step": 13901 + }, + { + "epoch": 0.08267913217242363, + "grad_norm": 2.014885902404785, + "learning_rate": 4.916151294911361e-05, + "loss": 4.374, + "step": 13902 + }, + { + "epoch": 0.08268507945570464, + "grad_norm": 1.9490050077438354, + "learning_rate": 4.916139298687382e-05, + "loss": 4.6281, + "step": 13903 + }, + { + "epoch": 0.08269102673898564, + "grad_norm": 2.0691943168640137, + "learning_rate": 4.916127301619954e-05, + "loss": 4.5008, + "step": 13904 + }, + { + "epoch": 0.08269697402226663, + "grad_norm": 2.1290805339813232, + "learning_rate": 4.916115303709081e-05, + "loss": 5.4876, + "step": 13905 + }, + { + "epoch": 0.08270292130554763, + "grad_norm": 1.981466293334961, + "learning_rate": 4.916103304954767e-05, + "loss": 5.7699, + "step": 13906 + }, + { + "epoch": 0.08270886858882863, + "grad_norm": 1.8898048400878906, + "learning_rate": 4.916091305357016e-05, + "loss": 5.7874, + "step": 13907 + }, + { + "epoch": 0.08271481587210962, + "grad_norm": 1.7809741497039795, + "learning_rate": 4.916079304915833e-05, + "loss": 5.6264, + "step": 13908 + }, + { + "epoch": 0.08272076315539062, + "grad_norm": 1.7516652345657349, + "learning_rate": 4.916067303631221e-05, + "loss": 5.5751, + "step": 13909 + }, + { + "epoch": 0.08272671043867161, + "grad_norm": 1.9051094055175781, + "learning_rate": 4.916055301503185e-05, + "loss": 5.7984, + "step": 13910 + }, + { + "epoch": 0.08273265772195261, + "grad_norm": 1.7115057706832886, + "learning_rate": 4.9160432985317295e-05, + "loss": 5.6187, + "step": 13911 + }, + { + "epoch": 0.08273860500523361, + "grad_norm": 1.790529727935791, + "learning_rate": 4.916031294716858e-05, + "loss": 5.6276, + "step": 13912 + }, + { + "epoch": 0.0827445522885146, + "grad_norm": 1.742039442062378, + "learning_rate": 4.9160192900585754e-05, + "loss": 5.3783, + "step": 13913 + }, + { + "epoch": 0.0827504995717956, + "grad_norm": 1.7544314861297607, + "learning_rate": 4.916007284556885e-05, + "loss": 5.5276, + "step": 13914 + }, + { + "epoch": 0.0827564468550766, + "grad_norm": 2.0135440826416016, + "learning_rate": 4.915995278211791e-05, + "loss": 5.5177, + "step": 13915 + }, + { + "epoch": 0.08276239413835759, + "grad_norm": 1.5759433507919312, + "learning_rate": 4.915983271023299e-05, + "loss": 5.4652, + "step": 13916 + }, + { + "epoch": 0.0827683414216386, + "grad_norm": 1.7974358797073364, + "learning_rate": 4.915971262991411e-05, + "loss": 5.4463, + "step": 13917 + }, + { + "epoch": 0.0827742887049196, + "grad_norm": 1.847692608833313, + "learning_rate": 4.9159592541161335e-05, + "loss": 5.4247, + "step": 13918 + }, + { + "epoch": 0.08278023598820058, + "grad_norm": 1.6701977252960205, + "learning_rate": 4.915947244397469e-05, + "loss": 5.3451, + "step": 13919 + }, + { + "epoch": 0.08278618327148159, + "grad_norm": 1.9226999282836914, + "learning_rate": 4.915935233835423e-05, + "loss": 5.1159, + "step": 13920 + }, + { + "epoch": 0.08279213055476259, + "grad_norm": 2.430760383605957, + "learning_rate": 4.915923222429998e-05, + "loss": 4.9746, + "step": 13921 + }, + { + "epoch": 0.08279807783804358, + "grad_norm": 1.7708054780960083, + "learning_rate": 4.915911210181199e-05, + "loss": 5.4986, + "step": 13922 + }, + { + "epoch": 0.08280402512132458, + "grad_norm": 1.7802354097366333, + "learning_rate": 4.915899197089031e-05, + "loss": 5.4283, + "step": 13923 + }, + { + "epoch": 0.08280997240460558, + "grad_norm": 2.347226142883301, + "learning_rate": 4.9158871831534984e-05, + "loss": 5.2917, + "step": 13924 + }, + { + "epoch": 0.08281591968788657, + "grad_norm": 2.5685782432556152, + "learning_rate": 4.915875168374603e-05, + "loss": 5.243, + "step": 13925 + }, + { + "epoch": 0.08282186697116757, + "grad_norm": 2.460383176803589, + "learning_rate": 4.915863152752351e-05, + "loss": 4.9241, + "step": 13926 + }, + { + "epoch": 0.08282781425444857, + "grad_norm": 2.2505056858062744, + "learning_rate": 4.915851136286747e-05, + "loss": 5.0951, + "step": 13927 + }, + { + "epoch": 0.08283376153772956, + "grad_norm": 2.517544984817505, + "learning_rate": 4.915839118977793e-05, + "loss": 5.151, + "step": 13928 + }, + { + "epoch": 0.08283970882101056, + "grad_norm": 2.445645809173584, + "learning_rate": 4.915827100825495e-05, + "loss": 5.1831, + "step": 13929 + }, + { + "epoch": 0.08284565610429157, + "grad_norm": 2.347383737564087, + "learning_rate": 4.9158150818298564e-05, + "loss": 5.0299, + "step": 13930 + }, + { + "epoch": 0.08285160338757255, + "grad_norm": 2.1791892051696777, + "learning_rate": 4.915803061990882e-05, + "loss": 5.4083, + "step": 13931 + }, + { + "epoch": 0.08285755067085356, + "grad_norm": 1.9959020614624023, + "learning_rate": 4.9157910413085764e-05, + "loss": 5.9036, + "step": 13932 + }, + { + "epoch": 0.08286349795413456, + "grad_norm": 2.3419620990753174, + "learning_rate": 4.915779019782942e-05, + "loss": 4.9082, + "step": 13933 + }, + { + "epoch": 0.08286944523741555, + "grad_norm": 2.452756643295288, + "learning_rate": 4.915766997413985e-05, + "loss": 4.8272, + "step": 13934 + }, + { + "epoch": 0.08287539252069655, + "grad_norm": 2.344353675842285, + "learning_rate": 4.915754974201708e-05, + "loss": 5.0269, + "step": 13935 + }, + { + "epoch": 0.08288133980397755, + "grad_norm": 2.366218090057373, + "learning_rate": 4.9157429501461175e-05, + "loss": 4.8898, + "step": 13936 + }, + { + "epoch": 0.08288728708725854, + "grad_norm": 1.7986581325531006, + "learning_rate": 4.915730925247214e-05, + "loss": 4.9316, + "step": 13937 + }, + { + "epoch": 0.08289323437053954, + "grad_norm": 2.059094190597534, + "learning_rate": 4.915718899505005e-05, + "loss": 5.1297, + "step": 13938 + }, + { + "epoch": 0.08289918165382054, + "grad_norm": 1.9630707502365112, + "learning_rate": 4.915706872919493e-05, + "loss": 5.4844, + "step": 13939 + }, + { + "epoch": 0.08290512893710153, + "grad_norm": 2.0281238555908203, + "learning_rate": 4.9156948454906825e-05, + "loss": 5.9276, + "step": 13940 + }, + { + "epoch": 0.08291107622038253, + "grad_norm": 1.8783270120620728, + "learning_rate": 4.9156828172185786e-05, + "loss": 5.7085, + "step": 13941 + }, + { + "epoch": 0.08291702350366352, + "grad_norm": 2.190317153930664, + "learning_rate": 4.915670788103184e-05, + "loss": 4.9619, + "step": 13942 + }, + { + "epoch": 0.08292297078694452, + "grad_norm": 2.2746498584747314, + "learning_rate": 4.915658758144505e-05, + "loss": 4.8965, + "step": 13943 + }, + { + "epoch": 0.08292891807022552, + "grad_norm": 1.940510630607605, + "learning_rate": 4.915646727342543e-05, + "loss": 5.0367, + "step": 13944 + }, + { + "epoch": 0.08293486535350651, + "grad_norm": 1.9016308784484863, + "learning_rate": 4.915634695697304e-05, + "loss": 5.5002, + "step": 13945 + }, + { + "epoch": 0.08294081263678751, + "grad_norm": 2.0041022300720215, + "learning_rate": 4.915622663208792e-05, + "loss": 5.4193, + "step": 13946 + }, + { + "epoch": 0.08294675992006852, + "grad_norm": 2.0117805004119873, + "learning_rate": 4.9156106298770115e-05, + "loss": 5.2697, + "step": 13947 + }, + { + "epoch": 0.0829527072033495, + "grad_norm": 1.864820957183838, + "learning_rate": 4.9155985957019654e-05, + "loss": 5.1594, + "step": 13948 + }, + { + "epoch": 0.0829586544866305, + "grad_norm": 1.7407771348953247, + "learning_rate": 4.91558656068366e-05, + "loss": 5.1189, + "step": 13949 + }, + { + "epoch": 0.08296460176991151, + "grad_norm": 2.027552366256714, + "learning_rate": 4.9155745248220976e-05, + "loss": 5.6257, + "step": 13950 + }, + { + "epoch": 0.0829705490531925, + "grad_norm": 1.6893701553344727, + "learning_rate": 4.9155624881172834e-05, + "loss": 5.1268, + "step": 13951 + }, + { + "epoch": 0.0829764963364735, + "grad_norm": 1.7216230630874634, + "learning_rate": 4.915550450569221e-05, + "loss": 5.2768, + "step": 13952 + }, + { + "epoch": 0.0829824436197545, + "grad_norm": 1.6723179817199707, + "learning_rate": 4.915538412177915e-05, + "loss": 5.7059, + "step": 13953 + }, + { + "epoch": 0.08298839090303549, + "grad_norm": 1.7645996809005737, + "learning_rate": 4.915526372943369e-05, + "loss": 5.6065, + "step": 13954 + }, + { + "epoch": 0.08299433818631649, + "grad_norm": 1.9206926822662354, + "learning_rate": 4.915514332865588e-05, + "loss": 4.9229, + "step": 13955 + }, + { + "epoch": 0.08300028546959749, + "grad_norm": 1.9269802570343018, + "learning_rate": 4.9155022919445766e-05, + "loss": 5.5678, + "step": 13956 + }, + { + "epoch": 0.08300623275287848, + "grad_norm": 2.378319501876831, + "learning_rate": 4.915490250180338e-05, + "loss": 4.7271, + "step": 13957 + }, + { + "epoch": 0.08301218003615948, + "grad_norm": 1.73631751537323, + "learning_rate": 4.915478207572876e-05, + "loss": 5.1302, + "step": 13958 + }, + { + "epoch": 0.08301812731944049, + "grad_norm": 1.6520816087722778, + "learning_rate": 4.915466164122196e-05, + "loss": 6.0497, + "step": 13959 + }, + { + "epoch": 0.08302407460272147, + "grad_norm": 1.7382736206054688, + "learning_rate": 4.915454119828302e-05, + "loss": 6.0155, + "step": 13960 + }, + { + "epoch": 0.08303002188600248, + "grad_norm": 1.6733272075653076, + "learning_rate": 4.915442074691197e-05, + "loss": 5.2624, + "step": 13961 + }, + { + "epoch": 0.08303596916928348, + "grad_norm": 2.0024397373199463, + "learning_rate": 4.915430028710887e-05, + "loss": 5.4794, + "step": 13962 + }, + { + "epoch": 0.08304191645256447, + "grad_norm": 1.9784339666366577, + "learning_rate": 4.915417981887375e-05, + "loss": 5.1546, + "step": 13963 + }, + { + "epoch": 0.08304786373584547, + "grad_norm": 1.7146525382995605, + "learning_rate": 4.915405934220666e-05, + "loss": 5.6269, + "step": 13964 + }, + { + "epoch": 0.08305381101912647, + "grad_norm": 1.7252057790756226, + "learning_rate": 4.9153938857107626e-05, + "loss": 5.7015, + "step": 13965 + }, + { + "epoch": 0.08305975830240746, + "grad_norm": 1.6623241901397705, + "learning_rate": 4.9153818363576715e-05, + "loss": 5.5249, + "step": 13966 + }, + { + "epoch": 0.08306570558568846, + "grad_norm": 2.0701472759246826, + "learning_rate": 4.9153697861613944e-05, + "loss": 5.3528, + "step": 13967 + }, + { + "epoch": 0.08307165286896946, + "grad_norm": 1.6600522994995117, + "learning_rate": 4.915357735121938e-05, + "loss": 5.3454, + "step": 13968 + }, + { + "epoch": 0.08307760015225045, + "grad_norm": 2.093092918395996, + "learning_rate": 4.915345683239304e-05, + "loss": 5.2417, + "step": 13969 + }, + { + "epoch": 0.08308354743553145, + "grad_norm": 1.9673899412155151, + "learning_rate": 4.915333630513498e-05, + "loss": 5.1908, + "step": 13970 + }, + { + "epoch": 0.08308949471881244, + "grad_norm": 1.8442246913909912, + "learning_rate": 4.915321576944524e-05, + "loss": 5.6287, + "step": 13971 + }, + { + "epoch": 0.08309544200209344, + "grad_norm": 1.5737566947937012, + "learning_rate": 4.9153095225323864e-05, + "loss": 5.7533, + "step": 13972 + }, + { + "epoch": 0.08310138928537444, + "grad_norm": 1.7948611974716187, + "learning_rate": 4.915297467277089e-05, + "loss": 5.5739, + "step": 13973 + }, + { + "epoch": 0.08310733656865543, + "grad_norm": 2.0080626010894775, + "learning_rate": 4.915285411178637e-05, + "loss": 5.5505, + "step": 13974 + }, + { + "epoch": 0.08311328385193643, + "grad_norm": 1.7838460206985474, + "learning_rate": 4.915273354237033e-05, + "loss": 6.0133, + "step": 13975 + }, + { + "epoch": 0.08311923113521744, + "grad_norm": 1.7599917650222778, + "learning_rate": 4.915261296452282e-05, + "loss": 5.6552, + "step": 13976 + }, + { + "epoch": 0.08312517841849842, + "grad_norm": 1.6211295127868652, + "learning_rate": 4.915249237824388e-05, + "loss": 5.6797, + "step": 13977 + }, + { + "epoch": 0.08313112570177943, + "grad_norm": 1.7404415607452393, + "learning_rate": 4.9152371783533565e-05, + "loss": 5.5134, + "step": 13978 + }, + { + "epoch": 0.08313707298506043, + "grad_norm": 1.8577871322631836, + "learning_rate": 4.9152251180391895e-05, + "loss": 5.5823, + "step": 13979 + }, + { + "epoch": 0.08314302026834142, + "grad_norm": 1.6060470342636108, + "learning_rate": 4.915213056881893e-05, + "loss": 5.5875, + "step": 13980 + }, + { + "epoch": 0.08314896755162242, + "grad_norm": 1.915451169013977, + "learning_rate": 4.91520099488147e-05, + "loss": 5.279, + "step": 13981 + }, + { + "epoch": 0.08315491483490342, + "grad_norm": 2.281404972076416, + "learning_rate": 4.9151889320379265e-05, + "loss": 5.0863, + "step": 13982 + }, + { + "epoch": 0.08316086211818441, + "grad_norm": 1.9069279432296753, + "learning_rate": 4.9151768683512646e-05, + "loss": 5.3055, + "step": 13983 + }, + { + "epoch": 0.08316680940146541, + "grad_norm": 1.810571312904358, + "learning_rate": 4.915164803821489e-05, + "loss": 5.4988, + "step": 13984 + }, + { + "epoch": 0.08317275668474641, + "grad_norm": 1.788197636604309, + "learning_rate": 4.915152738448605e-05, + "loss": 5.6627, + "step": 13985 + }, + { + "epoch": 0.0831787039680274, + "grad_norm": 2.294187545776367, + "learning_rate": 4.9151406722326165e-05, + "loss": 5.1977, + "step": 13986 + }, + { + "epoch": 0.0831846512513084, + "grad_norm": 2.584395170211792, + "learning_rate": 4.915128605173527e-05, + "loss": 5.1909, + "step": 13987 + }, + { + "epoch": 0.0831905985345894, + "grad_norm": 2.249406576156616, + "learning_rate": 4.9151165372713405e-05, + "loss": 5.1109, + "step": 13988 + }, + { + "epoch": 0.0831965458178704, + "grad_norm": 1.8678929805755615, + "learning_rate": 4.915104468526062e-05, + "loss": 5.1035, + "step": 13989 + }, + { + "epoch": 0.0832024931011514, + "grad_norm": 2.139711856842041, + "learning_rate": 4.915092398937696e-05, + "loss": 5.0151, + "step": 13990 + }, + { + "epoch": 0.0832084403844324, + "grad_norm": 2.1683461666107178, + "learning_rate": 4.915080328506246e-05, + "loss": 5.1097, + "step": 13991 + }, + { + "epoch": 0.08321438766771339, + "grad_norm": 2.1205332279205322, + "learning_rate": 4.9150682572317165e-05, + "loss": 4.9998, + "step": 13992 + }, + { + "epoch": 0.08322033495099439, + "grad_norm": 1.8642542362213135, + "learning_rate": 4.915056185114111e-05, + "loss": 5.8554, + "step": 13993 + }, + { + "epoch": 0.08322628223427539, + "grad_norm": 2.1150970458984375, + "learning_rate": 4.915044112153435e-05, + "loss": 5.5297, + "step": 13994 + }, + { + "epoch": 0.08323222951755638, + "grad_norm": 2.584157943725586, + "learning_rate": 4.9150320383496915e-05, + "loss": 5.0058, + "step": 13995 + }, + { + "epoch": 0.08323817680083738, + "grad_norm": 2.305853843688965, + "learning_rate": 4.9150199637028854e-05, + "loss": 5.0785, + "step": 13996 + }, + { + "epoch": 0.08324412408411838, + "grad_norm": 2.0386359691619873, + "learning_rate": 4.9150078882130214e-05, + "loss": 5.1104, + "step": 13997 + }, + { + "epoch": 0.08325007136739937, + "grad_norm": 1.6055399179458618, + "learning_rate": 4.914995811880102e-05, + "loss": 5.778, + "step": 13998 + }, + { + "epoch": 0.08325601865068037, + "grad_norm": 1.635704517364502, + "learning_rate": 4.9149837347041334e-05, + "loss": 6.1107, + "step": 13999 + }, + { + "epoch": 0.08326196593396136, + "grad_norm": 1.8098101615905762, + "learning_rate": 4.9149716566851184e-05, + "loss": 6.1197, + "step": 14000 + }, + { + "epoch": 0.08326791321724236, + "grad_norm": 1.5740363597869873, + "learning_rate": 4.914959577823062e-05, + "loss": 5.7821, + "step": 14001 + }, + { + "epoch": 0.08327386050052336, + "grad_norm": 1.4634822607040405, + "learning_rate": 4.914947498117968e-05, + "loss": 5.7062, + "step": 14002 + }, + { + "epoch": 0.08327980778380435, + "grad_norm": 1.7310374975204468, + "learning_rate": 4.914935417569841e-05, + "loss": 5.6689, + "step": 14003 + }, + { + "epoch": 0.08328575506708535, + "grad_norm": 1.5742056369781494, + "learning_rate": 4.914923336178685e-05, + "loss": 5.6529, + "step": 14004 + }, + { + "epoch": 0.08329170235036636, + "grad_norm": 1.6353307962417603, + "learning_rate": 4.914911253944504e-05, + "loss": 5.4564, + "step": 14005 + }, + { + "epoch": 0.08329764963364734, + "grad_norm": 1.8744231462478638, + "learning_rate": 4.9148991708673024e-05, + "loss": 5.305, + "step": 14006 + }, + { + "epoch": 0.08330359691692835, + "grad_norm": 1.9766863584518433, + "learning_rate": 4.914887086947085e-05, + "loss": 5.711, + "step": 14007 + }, + { + "epoch": 0.08330954420020935, + "grad_norm": 2.1832756996154785, + "learning_rate": 4.914875002183855e-05, + "loss": 4.9322, + "step": 14008 + }, + { + "epoch": 0.08331549148349034, + "grad_norm": 2.2370998859405518, + "learning_rate": 4.914862916577617e-05, + "loss": 4.512, + "step": 14009 + }, + { + "epoch": 0.08332143876677134, + "grad_norm": 2.2743804454803467, + "learning_rate": 4.914850830128376e-05, + "loss": 4.5716, + "step": 14010 + }, + { + "epoch": 0.08332738605005234, + "grad_norm": 2.3644347190856934, + "learning_rate": 4.914838742836134e-05, + "loss": 4.1288, + "step": 14011 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 3.1034274101257324, + "learning_rate": 4.9148266547008984e-05, + "loss": 5.2864, + "step": 14012 + }, + { + "epoch": 0.08333928061661433, + "grad_norm": 2.240302801132202, + "learning_rate": 4.914814565722671e-05, + "loss": 5.3452, + "step": 14013 + }, + { + "epoch": 0.08334522789989533, + "grad_norm": 2.0743885040283203, + "learning_rate": 4.9148024759014566e-05, + "loss": 5.4338, + "step": 14014 + }, + { + "epoch": 0.08335117518317632, + "grad_norm": 2.0169663429260254, + "learning_rate": 4.91479038523726e-05, + "loss": 5.5108, + "step": 14015 + }, + { + "epoch": 0.08335712246645732, + "grad_norm": 1.9730015993118286, + "learning_rate": 4.914778293730085e-05, + "loss": 5.6413, + "step": 14016 + }, + { + "epoch": 0.08336306974973832, + "grad_norm": 2.3047432899475098, + "learning_rate": 4.914766201379936e-05, + "loss": 5.4111, + "step": 14017 + }, + { + "epoch": 0.08336901703301931, + "grad_norm": 3.079416275024414, + "learning_rate": 4.914754108186816e-05, + "loss": 5.5591, + "step": 14018 + }, + { + "epoch": 0.08337496431630032, + "grad_norm": 1.9374867677688599, + "learning_rate": 4.9147420141507314e-05, + "loss": 5.9295, + "step": 14019 + }, + { + "epoch": 0.08338091159958132, + "grad_norm": 1.874292016029358, + "learning_rate": 4.9147299192716855e-05, + "loss": 5.6846, + "step": 14020 + }, + { + "epoch": 0.0833868588828623, + "grad_norm": 1.8852506875991821, + "learning_rate": 4.914717823549682e-05, + "loss": 5.621, + "step": 14021 + }, + { + "epoch": 0.08339280616614331, + "grad_norm": 1.9332367181777954, + "learning_rate": 4.914705726984725e-05, + "loss": 5.8584, + "step": 14022 + }, + { + "epoch": 0.08339875344942431, + "grad_norm": 1.6252962350845337, + "learning_rate": 4.91469362957682e-05, + "loss": 5.8173, + "step": 14023 + }, + { + "epoch": 0.0834047007327053, + "grad_norm": 1.6760259866714478, + "learning_rate": 4.9146815313259695e-05, + "loss": 5.5441, + "step": 14024 + }, + { + "epoch": 0.0834106480159863, + "grad_norm": 1.4979921579360962, + "learning_rate": 4.9146694322321785e-05, + "loss": 6.1467, + "step": 14025 + }, + { + "epoch": 0.0834165952992673, + "grad_norm": 1.4720534086227417, + "learning_rate": 4.914657332295453e-05, + "loss": 5.8626, + "step": 14026 + }, + { + "epoch": 0.08342254258254829, + "grad_norm": 1.6709620952606201, + "learning_rate": 4.914645231515794e-05, + "loss": 5.8468, + "step": 14027 + }, + { + "epoch": 0.08342848986582929, + "grad_norm": 1.6389116048812866, + "learning_rate": 4.9146331298932075e-05, + "loss": 5.9222, + "step": 14028 + }, + { + "epoch": 0.08343443714911028, + "grad_norm": 1.4344384670257568, + "learning_rate": 4.9146210274276974e-05, + "loss": 5.5457, + "step": 14029 + }, + { + "epoch": 0.08344038443239128, + "grad_norm": 1.472469449043274, + "learning_rate": 4.914608924119268e-05, + "loss": 5.608, + "step": 14030 + }, + { + "epoch": 0.08344633171567228, + "grad_norm": 1.6688710451126099, + "learning_rate": 4.914596819967925e-05, + "loss": 5.7982, + "step": 14031 + }, + { + "epoch": 0.08345227899895327, + "grad_norm": 1.6417087316513062, + "learning_rate": 4.9145847149736704e-05, + "loss": 5.6498, + "step": 14032 + }, + { + "epoch": 0.08345822628223427, + "grad_norm": 1.5726937055587769, + "learning_rate": 4.9145726091365084e-05, + "loss": 5.8723, + "step": 14033 + }, + { + "epoch": 0.08346417356551528, + "grad_norm": 1.7523616552352905, + "learning_rate": 4.914560502456444e-05, + "loss": 6.1967, + "step": 14034 + }, + { + "epoch": 0.08347012084879626, + "grad_norm": 1.8270281553268433, + "learning_rate": 4.914548394933483e-05, + "loss": 6.0493, + "step": 14035 + }, + { + "epoch": 0.08347606813207727, + "grad_norm": 1.8113981485366821, + "learning_rate": 4.914536286567627e-05, + "loss": 5.2815, + "step": 14036 + }, + { + "epoch": 0.08348201541535827, + "grad_norm": 1.7894388437271118, + "learning_rate": 4.914524177358881e-05, + "loss": 5.2606, + "step": 14037 + }, + { + "epoch": 0.08348796269863926, + "grad_norm": 1.7994349002838135, + "learning_rate": 4.9145120673072505e-05, + "loss": 5.025, + "step": 14038 + }, + { + "epoch": 0.08349390998192026, + "grad_norm": 1.6934137344360352, + "learning_rate": 4.914499956412738e-05, + "loss": 5.0455, + "step": 14039 + }, + { + "epoch": 0.08349985726520126, + "grad_norm": 1.549500823020935, + "learning_rate": 4.914487844675349e-05, + "loss": 5.3836, + "step": 14040 + }, + { + "epoch": 0.08350580454848225, + "grad_norm": 1.7452481985092163, + "learning_rate": 4.9144757320950873e-05, + "loss": 5.0175, + "step": 14041 + }, + { + "epoch": 0.08351175183176325, + "grad_norm": 1.9420257806777954, + "learning_rate": 4.914463618671957e-05, + "loss": 5.0146, + "step": 14042 + }, + { + "epoch": 0.08351769911504425, + "grad_norm": 1.798431158065796, + "learning_rate": 4.914451504405962e-05, + "loss": 4.7656, + "step": 14043 + }, + { + "epoch": 0.08352364639832524, + "grad_norm": 1.7167326211929321, + "learning_rate": 4.914439389297107e-05, + "loss": 4.7518, + "step": 14044 + }, + { + "epoch": 0.08352959368160624, + "grad_norm": 1.7150487899780273, + "learning_rate": 4.914427273345397e-05, + "loss": 4.8298, + "step": 14045 + }, + { + "epoch": 0.08353554096488724, + "grad_norm": 1.7048633098602295, + "learning_rate": 4.914415156550834e-05, + "loss": 5.0039, + "step": 14046 + }, + { + "epoch": 0.08354148824816823, + "grad_norm": 1.364012598991394, + "learning_rate": 4.914403038913425e-05, + "loss": 5.3718, + "step": 14047 + }, + { + "epoch": 0.08354743553144924, + "grad_norm": 2.29878830909729, + "learning_rate": 4.9143909204331716e-05, + "loss": 4.8874, + "step": 14048 + }, + { + "epoch": 0.08355338281473024, + "grad_norm": 2.1153953075408936, + "learning_rate": 4.91437880111008e-05, + "loss": 4.6646, + "step": 14049 + }, + { + "epoch": 0.08355933009801123, + "grad_norm": 2.289346218109131, + "learning_rate": 4.914366680944153e-05, + "loss": 4.7966, + "step": 14050 + }, + { + "epoch": 0.08356527738129223, + "grad_norm": 1.8394019603729248, + "learning_rate": 4.9143545599353965e-05, + "loss": 5.1788, + "step": 14051 + }, + { + "epoch": 0.08357122466457323, + "grad_norm": 2.192802667617798, + "learning_rate": 4.9143424380838136e-05, + "loss": 5.4549, + "step": 14052 + }, + { + "epoch": 0.08357717194785422, + "grad_norm": 2.128356695175171, + "learning_rate": 4.9143303153894085e-05, + "loss": 5.6652, + "step": 14053 + }, + { + "epoch": 0.08358311923113522, + "grad_norm": 2.0716452598571777, + "learning_rate": 4.914318191852186e-05, + "loss": 5.7013, + "step": 14054 + }, + { + "epoch": 0.08358906651441622, + "grad_norm": 2.298940658569336, + "learning_rate": 4.91430606747215e-05, + "loss": 5.565, + "step": 14055 + }, + { + "epoch": 0.08359501379769721, + "grad_norm": 2.250102996826172, + "learning_rate": 4.914293942249304e-05, + "loss": 5.6935, + "step": 14056 + }, + { + "epoch": 0.08360096108097821, + "grad_norm": 2.123037576675415, + "learning_rate": 4.914281816183653e-05, + "loss": 5.624, + "step": 14057 + }, + { + "epoch": 0.0836069083642592, + "grad_norm": 1.833024501800537, + "learning_rate": 4.9142696892752013e-05, + "loss": 5.4329, + "step": 14058 + }, + { + "epoch": 0.0836128556475402, + "grad_norm": 1.8438977003097534, + "learning_rate": 4.9142575615239526e-05, + "loss": 5.294, + "step": 14059 + }, + { + "epoch": 0.0836188029308212, + "grad_norm": 1.805525541305542, + "learning_rate": 4.914245432929913e-05, + "loss": 5.3778, + "step": 14060 + }, + { + "epoch": 0.08362475021410219, + "grad_norm": 1.5750529766082764, + "learning_rate": 4.9142333034930835e-05, + "loss": 5.357, + "step": 14061 + }, + { + "epoch": 0.0836306974973832, + "grad_norm": 1.3928825855255127, + "learning_rate": 4.914221173213471e-05, + "loss": 5.5141, + "step": 14062 + }, + { + "epoch": 0.0836366447806642, + "grad_norm": 1.6307804584503174, + "learning_rate": 4.914209042091079e-05, + "loss": 5.3687, + "step": 14063 + }, + { + "epoch": 0.08364259206394518, + "grad_norm": 1.533963680267334, + "learning_rate": 4.914196910125911e-05, + "loss": 5.7295, + "step": 14064 + }, + { + "epoch": 0.08364853934722619, + "grad_norm": 1.4950587749481201, + "learning_rate": 4.914184777317972e-05, + "loss": 5.816, + "step": 14065 + }, + { + "epoch": 0.08365448663050719, + "grad_norm": 1.3246190547943115, + "learning_rate": 4.914172643667266e-05, + "loss": 5.6925, + "step": 14066 + }, + { + "epoch": 0.08366043391378818, + "grad_norm": 1.4816724061965942, + "learning_rate": 4.9141605091737975e-05, + "loss": 5.6528, + "step": 14067 + }, + { + "epoch": 0.08366638119706918, + "grad_norm": 1.6656372547149658, + "learning_rate": 4.914148373837571e-05, + "loss": 5.4619, + "step": 14068 + }, + { + "epoch": 0.08367232848035018, + "grad_norm": 1.2973356246948242, + "learning_rate": 4.914136237658589e-05, + "loss": 5.5467, + "step": 14069 + }, + { + "epoch": 0.08367827576363117, + "grad_norm": 1.7669901847839355, + "learning_rate": 4.914124100636857e-05, + "loss": 5.2213, + "step": 14070 + }, + { + "epoch": 0.08368422304691217, + "grad_norm": 1.7352882623672485, + "learning_rate": 4.91411196277238e-05, + "loss": 5.2938, + "step": 14071 + }, + { + "epoch": 0.08369017033019317, + "grad_norm": 1.5912410020828247, + "learning_rate": 4.914099824065161e-05, + "loss": 5.4139, + "step": 14072 + }, + { + "epoch": 0.08369611761347416, + "grad_norm": 1.46699059009552, + "learning_rate": 4.914087684515205e-05, + "loss": 5.2317, + "step": 14073 + }, + { + "epoch": 0.08370206489675516, + "grad_norm": 3.0727121829986572, + "learning_rate": 4.914075544122516e-05, + "loss": 5.2324, + "step": 14074 + }, + { + "epoch": 0.08370801218003616, + "grad_norm": 1.4887278079986572, + "learning_rate": 4.914063402887098e-05, + "loss": 5.0331, + "step": 14075 + }, + { + "epoch": 0.08371395946331715, + "grad_norm": 1.4677956104278564, + "learning_rate": 4.9140512608089555e-05, + "loss": 5.0892, + "step": 14076 + }, + { + "epoch": 0.08371990674659816, + "grad_norm": 1.3760831356048584, + "learning_rate": 4.914039117888093e-05, + "loss": 5.3738, + "step": 14077 + }, + { + "epoch": 0.08372585402987916, + "grad_norm": 1.6125822067260742, + "learning_rate": 4.9140269741245135e-05, + "loss": 5.4629, + "step": 14078 + }, + { + "epoch": 0.08373180131316015, + "grad_norm": 1.6336333751678467, + "learning_rate": 4.9140148295182226e-05, + "loss": 5.2533, + "step": 14079 + }, + { + "epoch": 0.08373774859644115, + "grad_norm": 1.6296573877334595, + "learning_rate": 4.9140026840692247e-05, + "loss": 4.8288, + "step": 14080 + }, + { + "epoch": 0.08374369587972215, + "grad_norm": 1.6058591604232788, + "learning_rate": 4.913990537777522e-05, + "loss": 5.0549, + "step": 14081 + }, + { + "epoch": 0.08374964316300314, + "grad_norm": 1.6199642419815063, + "learning_rate": 4.9139783906431214e-05, + "loss": 5.2387, + "step": 14082 + }, + { + "epoch": 0.08375559044628414, + "grad_norm": 1.7537976503372192, + "learning_rate": 4.913966242666025e-05, + "loss": 5.2766, + "step": 14083 + }, + { + "epoch": 0.08376153772956514, + "grad_norm": 1.579128384590149, + "learning_rate": 4.9139540938462384e-05, + "loss": 5.2251, + "step": 14084 + }, + { + "epoch": 0.08376748501284613, + "grad_norm": 1.7070518732070923, + "learning_rate": 4.913941944183765e-05, + "loss": 5.0699, + "step": 14085 + }, + { + "epoch": 0.08377343229612713, + "grad_norm": 1.4739151000976562, + "learning_rate": 4.91392979367861e-05, + "loss": 5.229, + "step": 14086 + }, + { + "epoch": 0.08377937957940812, + "grad_norm": 1.6380045413970947, + "learning_rate": 4.9139176423307764e-05, + "loss": 5.0977, + "step": 14087 + }, + { + "epoch": 0.08378532686268912, + "grad_norm": 1.640865445137024, + "learning_rate": 4.91390549014027e-05, + "loss": 5.1106, + "step": 14088 + }, + { + "epoch": 0.08379127414597012, + "grad_norm": 1.7274518013000488, + "learning_rate": 4.913893337107093e-05, + "loss": 5.2093, + "step": 14089 + }, + { + "epoch": 0.08379722142925111, + "grad_norm": 1.7702603340148926, + "learning_rate": 4.913881183231251e-05, + "loss": 5.1314, + "step": 14090 + }, + { + "epoch": 0.08380316871253211, + "grad_norm": 1.766479253768921, + "learning_rate": 4.913869028512749e-05, + "loss": 5.1266, + "step": 14091 + }, + { + "epoch": 0.08380911599581312, + "grad_norm": 1.5863205194473267, + "learning_rate": 4.91385687295159e-05, + "loss": 5.1487, + "step": 14092 + }, + { + "epoch": 0.0838150632790941, + "grad_norm": 1.6770803928375244, + "learning_rate": 4.913844716547777e-05, + "loss": 5.2479, + "step": 14093 + }, + { + "epoch": 0.0838210105623751, + "grad_norm": 1.8650991916656494, + "learning_rate": 4.913832559301317e-05, + "loss": 5.2748, + "step": 14094 + }, + { + "epoch": 0.08382695784565611, + "grad_norm": 1.7304933071136475, + "learning_rate": 4.913820401212213e-05, + "loss": 5.2572, + "step": 14095 + }, + { + "epoch": 0.0838329051289371, + "grad_norm": 1.7103501558303833, + "learning_rate": 4.9138082422804695e-05, + "loss": 5.1145, + "step": 14096 + }, + { + "epoch": 0.0838388524122181, + "grad_norm": 1.8390073776245117, + "learning_rate": 4.91379608250609e-05, + "loss": 5.1171, + "step": 14097 + }, + { + "epoch": 0.0838447996954991, + "grad_norm": 1.815047264099121, + "learning_rate": 4.913783921889079e-05, + "loss": 5.2329, + "step": 14098 + }, + { + "epoch": 0.08385074697878009, + "grad_norm": 1.4381682872772217, + "learning_rate": 4.9137717604294415e-05, + "loss": 5.1098, + "step": 14099 + }, + { + "epoch": 0.08385669426206109, + "grad_norm": 1.6523853540420532, + "learning_rate": 4.9137595981271815e-05, + "loss": 5.1352, + "step": 14100 + }, + { + "epoch": 0.08386264154534209, + "grad_norm": 1.377199649810791, + "learning_rate": 4.913747434982302e-05, + "loss": 5.1191, + "step": 14101 + }, + { + "epoch": 0.08386858882862308, + "grad_norm": 1.5858699083328247, + "learning_rate": 4.913735270994809e-05, + "loss": 5.0569, + "step": 14102 + }, + { + "epoch": 0.08387453611190408, + "grad_norm": 1.608522891998291, + "learning_rate": 4.913723106164705e-05, + "loss": 4.8834, + "step": 14103 + }, + { + "epoch": 0.08388048339518508, + "grad_norm": 1.7063453197479248, + "learning_rate": 4.913710940491996e-05, + "loss": 4.9019, + "step": 14104 + }, + { + "epoch": 0.08388643067846607, + "grad_norm": 1.5008784532546997, + "learning_rate": 4.913698773976685e-05, + "loss": 4.8423, + "step": 14105 + }, + { + "epoch": 0.08389237796174707, + "grad_norm": 1.8743178844451904, + "learning_rate": 4.913686606618777e-05, + "loss": 4.9256, + "step": 14106 + }, + { + "epoch": 0.08389832524502808, + "grad_norm": 1.813094973564148, + "learning_rate": 4.9136744384182764e-05, + "loss": 4.9245, + "step": 14107 + }, + { + "epoch": 0.08390427252830907, + "grad_norm": 1.9561067819595337, + "learning_rate": 4.913662269375186e-05, + "loss": 4.8459, + "step": 14108 + }, + { + "epoch": 0.08391021981159007, + "grad_norm": 1.6159533262252808, + "learning_rate": 4.913650099489512e-05, + "loss": 4.8092, + "step": 14109 + }, + { + "epoch": 0.08391616709487107, + "grad_norm": 1.5819872617721558, + "learning_rate": 4.913637928761257e-05, + "loss": 4.9047, + "step": 14110 + }, + { + "epoch": 0.08392211437815206, + "grad_norm": 1.6294678449630737, + "learning_rate": 4.913625757190426e-05, + "loss": 4.6908, + "step": 14111 + }, + { + "epoch": 0.08392806166143306, + "grad_norm": 1.5048410892486572, + "learning_rate": 4.913613584777024e-05, + "loss": 5.2021, + "step": 14112 + }, + { + "epoch": 0.08393400894471406, + "grad_norm": 1.626280665397644, + "learning_rate": 4.9136014115210525e-05, + "loss": 5.4592, + "step": 14113 + }, + { + "epoch": 0.08393995622799505, + "grad_norm": 1.662269115447998, + "learning_rate": 4.91358923742252e-05, + "loss": 5.0027, + "step": 14114 + }, + { + "epoch": 0.08394590351127605, + "grad_norm": 1.5630388259887695, + "learning_rate": 4.913577062481427e-05, + "loss": 5.3327, + "step": 14115 + }, + { + "epoch": 0.08395185079455704, + "grad_norm": 1.4223047494888306, + "learning_rate": 4.913564886697779e-05, + "loss": 5.5081, + "step": 14116 + }, + { + "epoch": 0.08395779807783804, + "grad_norm": 1.3298295736312866, + "learning_rate": 4.9135527100715814e-05, + "loss": 5.3783, + "step": 14117 + }, + { + "epoch": 0.08396374536111904, + "grad_norm": 1.335779070854187, + "learning_rate": 4.913540532602837e-05, + "loss": 5.3901, + "step": 14118 + }, + { + "epoch": 0.08396969264440003, + "grad_norm": 1.5331017971038818, + "learning_rate": 4.913528354291551e-05, + "loss": 5.5643, + "step": 14119 + }, + { + "epoch": 0.08397563992768103, + "grad_norm": 1.703400731086731, + "learning_rate": 4.913516175137727e-05, + "loss": 5.4256, + "step": 14120 + }, + { + "epoch": 0.08398158721096204, + "grad_norm": 1.5330191850662231, + "learning_rate": 4.913503995141369e-05, + "loss": 5.2509, + "step": 14121 + }, + { + "epoch": 0.08398753449424302, + "grad_norm": 1.7405961751937866, + "learning_rate": 4.913491814302482e-05, + "loss": 5.4171, + "step": 14122 + }, + { + "epoch": 0.08399348177752403, + "grad_norm": 1.2550197839736938, + "learning_rate": 4.9134796326210696e-05, + "loss": 5.3908, + "step": 14123 + }, + { + "epoch": 0.08399942906080503, + "grad_norm": 1.2029253244400024, + "learning_rate": 4.9134674500971366e-05, + "loss": 5.5355, + "step": 14124 + }, + { + "epoch": 0.08400537634408602, + "grad_norm": 1.2968589067459106, + "learning_rate": 4.913455266730687e-05, + "loss": 5.4007, + "step": 14125 + }, + { + "epoch": 0.08401132362736702, + "grad_norm": 1.2636605501174927, + "learning_rate": 4.913443082521725e-05, + "loss": 5.2402, + "step": 14126 + }, + { + "epoch": 0.08401727091064802, + "grad_norm": 1.2112632989883423, + "learning_rate": 4.9134308974702554e-05, + "loss": 5.2595, + "step": 14127 + }, + { + "epoch": 0.08402321819392901, + "grad_norm": 1.447730302810669, + "learning_rate": 4.913418711576282e-05, + "loss": 5.2688, + "step": 14128 + }, + { + "epoch": 0.08402916547721001, + "grad_norm": 1.4328616857528687, + "learning_rate": 4.913406524839809e-05, + "loss": 5.2368, + "step": 14129 + }, + { + "epoch": 0.08403511276049101, + "grad_norm": 1.4782198667526245, + "learning_rate": 4.91339433726084e-05, + "loss": 5.2019, + "step": 14130 + }, + { + "epoch": 0.084041060043772, + "grad_norm": 1.499373197555542, + "learning_rate": 4.913382148839381e-05, + "loss": 5.3352, + "step": 14131 + }, + { + "epoch": 0.084047007327053, + "grad_norm": 1.37551748752594, + "learning_rate": 4.9133699595754346e-05, + "loss": 5.1566, + "step": 14132 + }, + { + "epoch": 0.084052954610334, + "grad_norm": 1.6400420665740967, + "learning_rate": 4.913357769469006e-05, + "loss": 5.5225, + "step": 14133 + }, + { + "epoch": 0.08405890189361499, + "grad_norm": 1.3855832815170288, + "learning_rate": 4.913345578520099e-05, + "loss": 5.4466, + "step": 14134 + }, + { + "epoch": 0.084064849176896, + "grad_norm": 1.783508062362671, + "learning_rate": 4.913333386728718e-05, + "loss": 5.1713, + "step": 14135 + }, + { + "epoch": 0.084070796460177, + "grad_norm": 2.435201406478882, + "learning_rate": 4.913321194094866e-05, + "loss": 4.9899, + "step": 14136 + }, + { + "epoch": 0.08407674374345799, + "grad_norm": 1.708850622177124, + "learning_rate": 4.91330900061855e-05, + "loss": 5.0808, + "step": 14137 + }, + { + "epoch": 0.08408269102673899, + "grad_norm": 1.583473801612854, + "learning_rate": 4.913296806299773e-05, + "loss": 5.0164, + "step": 14138 + }, + { + "epoch": 0.08408863831001999, + "grad_norm": 1.6990292072296143, + "learning_rate": 4.9132846111385386e-05, + "loss": 4.9476, + "step": 14139 + }, + { + "epoch": 0.08409458559330098, + "grad_norm": 1.6386258602142334, + "learning_rate": 4.913272415134851e-05, + "loss": 4.9357, + "step": 14140 + }, + { + "epoch": 0.08410053287658198, + "grad_norm": 1.258575439453125, + "learning_rate": 4.9132602182887156e-05, + "loss": 4.7666, + "step": 14141 + }, + { + "epoch": 0.08410648015986298, + "grad_norm": 1.3333406448364258, + "learning_rate": 4.913248020600135e-05, + "loss": 4.698, + "step": 14142 + }, + { + "epoch": 0.08411242744314397, + "grad_norm": 1.3663051128387451, + "learning_rate": 4.913235822069116e-05, + "loss": 4.9414, + "step": 14143 + }, + { + "epoch": 0.08411837472642497, + "grad_norm": 1.6906498670578003, + "learning_rate": 4.91322362269566e-05, + "loss": 5.281, + "step": 14144 + }, + { + "epoch": 0.08412432200970596, + "grad_norm": 1.2671558856964111, + "learning_rate": 4.9132114224797735e-05, + "loss": 5.2566, + "step": 14145 + }, + { + "epoch": 0.08413026929298696, + "grad_norm": 1.4022216796875, + "learning_rate": 4.9131992214214586e-05, + "loss": 5.128, + "step": 14146 + }, + { + "epoch": 0.08413621657626796, + "grad_norm": 1.4810549020767212, + "learning_rate": 4.913187019520722e-05, + "loss": 5.0172, + "step": 14147 + }, + { + "epoch": 0.08414216385954895, + "grad_norm": 1.2757905721664429, + "learning_rate": 4.913174816777566e-05, + "loss": 5.3796, + "step": 14148 + }, + { + "epoch": 0.08414811114282995, + "grad_norm": 1.4088176488876343, + "learning_rate": 4.913162613191996e-05, + "loss": 5.4586, + "step": 14149 + }, + { + "epoch": 0.08415405842611096, + "grad_norm": 1.5218896865844727, + "learning_rate": 4.9131504087640154e-05, + "loss": 5.1652, + "step": 14150 + }, + { + "epoch": 0.08416000570939194, + "grad_norm": 1.4234968423843384, + "learning_rate": 4.913138203493629e-05, + "loss": 5.1917, + "step": 14151 + }, + { + "epoch": 0.08416595299267295, + "grad_norm": 1.4841183423995972, + "learning_rate": 4.913125997380842e-05, + "loss": 5.2818, + "step": 14152 + }, + { + "epoch": 0.08417190027595395, + "grad_norm": 1.8631536960601807, + "learning_rate": 4.9131137904256564e-05, + "loss": 5.4848, + "step": 14153 + }, + { + "epoch": 0.08417784755923494, + "grad_norm": 1.5508880615234375, + "learning_rate": 4.913101582628078e-05, + "loss": 5.3698, + "step": 14154 + }, + { + "epoch": 0.08418379484251594, + "grad_norm": 1.2428319454193115, + "learning_rate": 4.913089373988111e-05, + "loss": 5.2071, + "step": 14155 + }, + { + "epoch": 0.08418974212579694, + "grad_norm": 1.405325174331665, + "learning_rate": 4.91307716450576e-05, + "loss": 5.1774, + "step": 14156 + }, + { + "epoch": 0.08419568940907793, + "grad_norm": 1.6800439357757568, + "learning_rate": 4.913064954181028e-05, + "loss": 5.3735, + "step": 14157 + }, + { + "epoch": 0.08420163669235893, + "grad_norm": 1.475174069404602, + "learning_rate": 4.9130527430139194e-05, + "loss": 5.3303, + "step": 14158 + }, + { + "epoch": 0.08420758397563993, + "grad_norm": 1.5441967248916626, + "learning_rate": 4.91304053100444e-05, + "loss": 5.3007, + "step": 14159 + }, + { + "epoch": 0.08421353125892092, + "grad_norm": 1.3798770904541016, + "learning_rate": 4.913028318152593e-05, + "loss": 5.287, + "step": 14160 + }, + { + "epoch": 0.08421947854220192, + "grad_norm": 1.4294620752334595, + "learning_rate": 4.913016104458382e-05, + "loss": 5.3159, + "step": 14161 + }, + { + "epoch": 0.08422542582548292, + "grad_norm": 1.4971884489059448, + "learning_rate": 4.913003889921812e-05, + "loss": 5.4701, + "step": 14162 + }, + { + "epoch": 0.08423137310876391, + "grad_norm": 1.447045922279358, + "learning_rate": 4.912991674542888e-05, + "loss": 5.306, + "step": 14163 + }, + { + "epoch": 0.08423732039204491, + "grad_norm": 1.7867134809494019, + "learning_rate": 4.9129794583216135e-05, + "loss": 4.8653, + "step": 14164 + }, + { + "epoch": 0.08424326767532592, + "grad_norm": 1.6931066513061523, + "learning_rate": 4.912967241257993e-05, + "loss": 4.7628, + "step": 14165 + }, + { + "epoch": 0.0842492149586069, + "grad_norm": 1.6567879915237427, + "learning_rate": 4.91295502335203e-05, + "loss": 4.7857, + "step": 14166 + }, + { + "epoch": 0.08425516224188791, + "grad_norm": 1.6891521215438843, + "learning_rate": 4.91294280460373e-05, + "loss": 4.7873, + "step": 14167 + }, + { + "epoch": 0.08426110952516891, + "grad_norm": 1.6237304210662842, + "learning_rate": 4.912930585013095e-05, + "loss": 4.8596, + "step": 14168 + }, + { + "epoch": 0.0842670568084499, + "grad_norm": 1.585802674293518, + "learning_rate": 4.912918364580132e-05, + "loss": 4.8226, + "step": 14169 + }, + { + "epoch": 0.0842730040917309, + "grad_norm": 1.6892811059951782, + "learning_rate": 4.912906143304844e-05, + "loss": 4.8307, + "step": 14170 + }, + { + "epoch": 0.0842789513750119, + "grad_norm": 1.8254313468933105, + "learning_rate": 4.912893921187236e-05, + "loss": 4.8508, + "step": 14171 + }, + { + "epoch": 0.08428489865829289, + "grad_norm": 1.5577294826507568, + "learning_rate": 4.912881698227311e-05, + "loss": 4.7303, + "step": 14172 + }, + { + "epoch": 0.08429084594157389, + "grad_norm": 1.5635697841644287, + "learning_rate": 4.912869474425074e-05, + "loss": 4.9597, + "step": 14173 + }, + { + "epoch": 0.08429679322485488, + "grad_norm": 1.6620457172393799, + "learning_rate": 4.9128572497805294e-05, + "loss": 5.1012, + "step": 14174 + }, + { + "epoch": 0.08430274050813588, + "grad_norm": 1.4082841873168945, + "learning_rate": 4.912845024293681e-05, + "loss": 5.1785, + "step": 14175 + }, + { + "epoch": 0.08430868779141688, + "grad_norm": 1.5914233922958374, + "learning_rate": 4.9128327979645336e-05, + "loss": 5.2035, + "step": 14176 + }, + { + "epoch": 0.08431463507469787, + "grad_norm": 1.3170946836471558, + "learning_rate": 4.912820570793091e-05, + "loss": 5.35, + "step": 14177 + }, + { + "epoch": 0.08432058235797887, + "grad_norm": 1.3059190511703491, + "learning_rate": 4.912808342779357e-05, + "loss": 5.1428, + "step": 14178 + }, + { + "epoch": 0.08432652964125988, + "grad_norm": 1.438844919204712, + "learning_rate": 4.912796113923337e-05, + "loss": 5.2154, + "step": 14179 + }, + { + "epoch": 0.08433247692454086, + "grad_norm": 1.401469349861145, + "learning_rate": 4.912783884225035e-05, + "loss": 5.0941, + "step": 14180 + }, + { + "epoch": 0.08433842420782187, + "grad_norm": 1.6718204021453857, + "learning_rate": 4.912771653684456e-05, + "loss": 5.3221, + "step": 14181 + }, + { + "epoch": 0.08434437149110287, + "grad_norm": 1.51036536693573, + "learning_rate": 4.912759422301602e-05, + "loss": 5.2619, + "step": 14182 + }, + { + "epoch": 0.08435031877438386, + "grad_norm": 1.6579569578170776, + "learning_rate": 4.9127471900764795e-05, + "loss": 5.1176, + "step": 14183 + }, + { + "epoch": 0.08435626605766486, + "grad_norm": 1.5300757884979248, + "learning_rate": 4.912734957009091e-05, + "loss": 5.1625, + "step": 14184 + }, + { + "epoch": 0.08436221334094586, + "grad_norm": 1.2839969396591187, + "learning_rate": 4.912722723099442e-05, + "loss": 5.0852, + "step": 14185 + }, + { + "epoch": 0.08436816062422685, + "grad_norm": 1.7074840068817139, + "learning_rate": 4.9127104883475364e-05, + "loss": 5.1611, + "step": 14186 + }, + { + "epoch": 0.08437410790750785, + "grad_norm": 1.790992021560669, + "learning_rate": 4.9126982527533797e-05, + "loss": 5.0386, + "step": 14187 + }, + { + "epoch": 0.08438005519078885, + "grad_norm": 1.5269246101379395, + "learning_rate": 4.912686016316973e-05, + "loss": 5.0272, + "step": 14188 + }, + { + "epoch": 0.08438600247406984, + "grad_norm": 1.510847806930542, + "learning_rate": 4.9126737790383234e-05, + "loss": 5.2073, + "step": 14189 + }, + { + "epoch": 0.08439194975735084, + "grad_norm": 1.6551074981689453, + "learning_rate": 4.912661540917435e-05, + "loss": 5.0436, + "step": 14190 + }, + { + "epoch": 0.08439789704063184, + "grad_norm": 1.3152271509170532, + "learning_rate": 4.91264930195431e-05, + "loss": 5.0981, + "step": 14191 + }, + { + "epoch": 0.08440384432391283, + "grad_norm": 1.478190302848816, + "learning_rate": 4.912637062148955e-05, + "loss": 5.1172, + "step": 14192 + }, + { + "epoch": 0.08440979160719383, + "grad_norm": 1.4574978351593018, + "learning_rate": 4.912624821501373e-05, + "loss": 4.9757, + "step": 14193 + }, + { + "epoch": 0.08441573889047484, + "grad_norm": 1.600182056427002, + "learning_rate": 4.912612580011568e-05, + "loss": 5.1763, + "step": 14194 + }, + { + "epoch": 0.08442168617375582, + "grad_norm": 1.5805768966674805, + "learning_rate": 4.912600337679546e-05, + "loss": 5.1949, + "step": 14195 + }, + { + "epoch": 0.08442763345703683, + "grad_norm": 1.465785264968872, + "learning_rate": 4.9125880945053106e-05, + "loss": 5.0695, + "step": 14196 + }, + { + "epoch": 0.08443358074031783, + "grad_norm": 1.6188615560531616, + "learning_rate": 4.912575850488864e-05, + "loss": 5.1263, + "step": 14197 + }, + { + "epoch": 0.08443952802359882, + "grad_norm": 2.4953408241271973, + "learning_rate": 4.9125636056302125e-05, + "loss": 5.6462, + "step": 14198 + }, + { + "epoch": 0.08444547530687982, + "grad_norm": 1.6779934167861938, + "learning_rate": 4.91255135992936e-05, + "loss": 5.1673, + "step": 14199 + }, + { + "epoch": 0.08445142259016082, + "grad_norm": 1.648706316947937, + "learning_rate": 4.912539113386312e-05, + "loss": 5.3792, + "step": 14200 + }, + { + "epoch": 0.08445736987344181, + "grad_norm": 1.4866549968719482, + "learning_rate": 4.91252686600107e-05, + "loss": 5.2828, + "step": 14201 + }, + { + "epoch": 0.08446331715672281, + "grad_norm": 1.6002475023269653, + "learning_rate": 4.912514617773641e-05, + "loss": 5.3255, + "step": 14202 + }, + { + "epoch": 0.0844692644400038, + "grad_norm": 1.4162862300872803, + "learning_rate": 4.912502368704027e-05, + "loss": 5.3363, + "step": 14203 + }, + { + "epoch": 0.0844752117232848, + "grad_norm": 1.4465757608413696, + "learning_rate": 4.912490118792234e-05, + "loss": 5.586, + "step": 14204 + }, + { + "epoch": 0.0844811590065658, + "grad_norm": 1.8178991079330444, + "learning_rate": 4.912477868038266e-05, + "loss": 5.3029, + "step": 14205 + }, + { + "epoch": 0.08448710628984679, + "grad_norm": 1.4270378351211548, + "learning_rate": 4.912465616442126e-05, + "loss": 5.3864, + "step": 14206 + }, + { + "epoch": 0.0844930535731278, + "grad_norm": 1.5574913024902344, + "learning_rate": 4.91245336400382e-05, + "loss": 5.7667, + "step": 14207 + }, + { + "epoch": 0.0844990008564088, + "grad_norm": 1.3866809606552124, + "learning_rate": 4.91244111072335e-05, + "loss": 5.683, + "step": 14208 + }, + { + "epoch": 0.08450494813968978, + "grad_norm": 1.3390960693359375, + "learning_rate": 4.912428856600722e-05, + "loss": 5.7286, + "step": 14209 + }, + { + "epoch": 0.08451089542297079, + "grad_norm": 1.4317498207092285, + "learning_rate": 4.912416601635942e-05, + "loss": 5.6913, + "step": 14210 + }, + { + "epoch": 0.08451684270625179, + "grad_norm": 1.3110778331756592, + "learning_rate": 4.91240434582901e-05, + "loss": 5.6325, + "step": 14211 + }, + { + "epoch": 0.08452278998953278, + "grad_norm": 1.3288872241973877, + "learning_rate": 4.9123920891799344e-05, + "loss": 5.6343, + "step": 14212 + }, + { + "epoch": 0.08452873727281378, + "grad_norm": 1.2967199087142944, + "learning_rate": 4.912379831688716e-05, + "loss": 5.6514, + "step": 14213 + }, + { + "epoch": 0.08453468455609478, + "grad_norm": 1.6022506952285767, + "learning_rate": 4.912367573355362e-05, + "loss": 5.4006, + "step": 14214 + }, + { + "epoch": 0.08454063183937577, + "grad_norm": 1.6698434352874756, + "learning_rate": 4.912355314179875e-05, + "loss": 5.1543, + "step": 14215 + }, + { + "epoch": 0.08454657912265677, + "grad_norm": 1.6759408712387085, + "learning_rate": 4.9123430541622594e-05, + "loss": 4.9744, + "step": 14216 + }, + { + "epoch": 0.08455252640593777, + "grad_norm": 2.470752239227295, + "learning_rate": 4.91233079330252e-05, + "loss": 5.7614, + "step": 14217 + }, + { + "epoch": 0.08455847368921876, + "grad_norm": 2.1985907554626465, + "learning_rate": 4.91231853160066e-05, + "loss": 6.037, + "step": 14218 + }, + { + "epoch": 0.08456442097249976, + "grad_norm": 2.079569101333618, + "learning_rate": 4.912306269056686e-05, + "loss": 5.4943, + "step": 14219 + }, + { + "epoch": 0.08457036825578076, + "grad_norm": 2.2941744327545166, + "learning_rate": 4.9122940056706e-05, + "loss": 5.3733, + "step": 14220 + }, + { + "epoch": 0.08457631553906175, + "grad_norm": 1.9538209438323975, + "learning_rate": 4.912281741442407e-05, + "loss": 5.6362, + "step": 14221 + }, + { + "epoch": 0.08458226282234275, + "grad_norm": 1.7498515844345093, + "learning_rate": 4.9122694763721124e-05, + "loss": 5.7129, + "step": 14222 + }, + { + "epoch": 0.08458821010562376, + "grad_norm": 2.1728787422180176, + "learning_rate": 4.912257210459718e-05, + "loss": 5.4633, + "step": 14223 + }, + { + "epoch": 0.08459415738890474, + "grad_norm": 2.2436587810516357, + "learning_rate": 4.91224494370523e-05, + "loss": 5.3996, + "step": 14224 + }, + { + "epoch": 0.08460010467218575, + "grad_norm": 2.400299549102783, + "learning_rate": 4.912232676108653e-05, + "loss": 5.3994, + "step": 14225 + }, + { + "epoch": 0.08460605195546675, + "grad_norm": 1.9408513307571411, + "learning_rate": 4.91222040766999e-05, + "loss": 5.4537, + "step": 14226 + }, + { + "epoch": 0.08461199923874774, + "grad_norm": 2.4801602363586426, + "learning_rate": 4.912208138389245e-05, + "loss": 4.6625, + "step": 14227 + }, + { + "epoch": 0.08461794652202874, + "grad_norm": 2.021916627883911, + "learning_rate": 4.912195868266424e-05, + "loss": 4.5642, + "step": 14228 + }, + { + "epoch": 0.08462389380530974, + "grad_norm": 1.9586929082870483, + "learning_rate": 4.91218359730153e-05, + "loss": 4.6361, + "step": 14229 + }, + { + "epoch": 0.08462984108859073, + "grad_norm": 1.8478419780731201, + "learning_rate": 4.912171325494568e-05, + "loss": 4.5632, + "step": 14230 + }, + { + "epoch": 0.08463578837187173, + "grad_norm": 1.7078584432601929, + "learning_rate": 4.9121590528455406e-05, + "loss": 4.7259, + "step": 14231 + }, + { + "epoch": 0.08464173565515272, + "grad_norm": 1.7676106691360474, + "learning_rate": 4.912146779354455e-05, + "loss": 5.2565, + "step": 14232 + }, + { + "epoch": 0.08464768293843372, + "grad_norm": 1.8230634927749634, + "learning_rate": 4.912134505021313e-05, + "loss": 5.7668, + "step": 14233 + }, + { + "epoch": 0.08465363022171472, + "grad_norm": 1.8570215702056885, + "learning_rate": 4.91212222984612e-05, + "loss": 6.1849, + "step": 14234 + }, + { + "epoch": 0.08465957750499571, + "grad_norm": 1.7698529958724976, + "learning_rate": 4.9121099538288805e-05, + "loss": 6.0298, + "step": 14235 + }, + { + "epoch": 0.08466552478827671, + "grad_norm": 1.9919711351394653, + "learning_rate": 4.912097676969597e-05, + "loss": 5.7423, + "step": 14236 + }, + { + "epoch": 0.08467147207155772, + "grad_norm": 1.9937268495559692, + "learning_rate": 4.912085399268277e-05, + "loss": 5.8415, + "step": 14237 + }, + { + "epoch": 0.0846774193548387, + "grad_norm": 1.9489192962646484, + "learning_rate": 4.912073120724921e-05, + "loss": 5.812, + "step": 14238 + }, + { + "epoch": 0.0846833666381197, + "grad_norm": 1.6114327907562256, + "learning_rate": 4.9120608413395366e-05, + "loss": 5.9458, + "step": 14239 + }, + { + "epoch": 0.08468931392140071, + "grad_norm": 1.5803523063659668, + "learning_rate": 4.9120485611121265e-05, + "loss": 5.8837, + "step": 14240 + }, + { + "epoch": 0.0846952612046817, + "grad_norm": 1.8166266679763794, + "learning_rate": 4.9120362800426946e-05, + "loss": 5.5997, + "step": 14241 + }, + { + "epoch": 0.0847012084879627, + "grad_norm": 2.2683627605438232, + "learning_rate": 4.912023998131246e-05, + "loss": 5.4089, + "step": 14242 + }, + { + "epoch": 0.0847071557712437, + "grad_norm": 1.959498405456543, + "learning_rate": 4.9120117153777846e-05, + "loss": 5.5651, + "step": 14243 + }, + { + "epoch": 0.08471310305452469, + "grad_norm": 2.2388527393341064, + "learning_rate": 4.9119994317823155e-05, + "loss": 6.1511, + "step": 14244 + }, + { + "epoch": 0.08471905033780569, + "grad_norm": 1.9563941955566406, + "learning_rate": 4.911987147344842e-05, + "loss": 6.0499, + "step": 14245 + }, + { + "epoch": 0.08472499762108669, + "grad_norm": 1.7460871934890747, + "learning_rate": 4.911974862065368e-05, + "loss": 5.8368, + "step": 14246 + }, + { + "epoch": 0.08473094490436768, + "grad_norm": 1.820356845855713, + "learning_rate": 4.911962575943899e-05, + "loss": 5.3679, + "step": 14247 + }, + { + "epoch": 0.08473689218764868, + "grad_norm": 2.2215917110443115, + "learning_rate": 4.911950288980439e-05, + "loss": 5.0686, + "step": 14248 + }, + { + "epoch": 0.08474283947092968, + "grad_norm": 1.7801320552825928, + "learning_rate": 4.9119380011749914e-05, + "loss": 5.7665, + "step": 14249 + }, + { + "epoch": 0.08474878675421067, + "grad_norm": 1.8713878393173218, + "learning_rate": 4.911925712527562e-05, + "loss": 5.7, + "step": 14250 + }, + { + "epoch": 0.08475473403749167, + "grad_norm": 1.9371087551116943, + "learning_rate": 4.911913423038154e-05, + "loss": 5.6707, + "step": 14251 + }, + { + "epoch": 0.08476068132077268, + "grad_norm": 2.2298929691314697, + "learning_rate": 4.9119011327067724e-05, + "loss": 5.7042, + "step": 14252 + }, + { + "epoch": 0.08476662860405366, + "grad_norm": 1.7787251472473145, + "learning_rate": 4.91188884153342e-05, + "loss": 5.9205, + "step": 14253 + }, + { + "epoch": 0.08477257588733467, + "grad_norm": 2.0264973640441895, + "learning_rate": 4.911876549518102e-05, + "loss": 5.2057, + "step": 14254 + }, + { + "epoch": 0.08477852317061567, + "grad_norm": 2.7479963302612305, + "learning_rate": 4.911864256660824e-05, + "loss": 4.3828, + "step": 14255 + }, + { + "epoch": 0.08478447045389666, + "grad_norm": 2.3911163806915283, + "learning_rate": 4.9118519629615886e-05, + "loss": 4.1959, + "step": 14256 + }, + { + "epoch": 0.08479041773717766, + "grad_norm": 2.5100319385528564, + "learning_rate": 4.9118396684204005e-05, + "loss": 4.3845, + "step": 14257 + }, + { + "epoch": 0.08479636502045866, + "grad_norm": 2.575680732727051, + "learning_rate": 4.911827373037264e-05, + "loss": 4.1927, + "step": 14258 + }, + { + "epoch": 0.08480231230373965, + "grad_norm": 2.64941143989563, + "learning_rate": 4.9118150768121837e-05, + "loss": 4.2398, + "step": 14259 + }, + { + "epoch": 0.08480825958702065, + "grad_norm": 3.4619154930114746, + "learning_rate": 4.911802779745163e-05, + "loss": 5.9141, + "step": 14260 + }, + { + "epoch": 0.08481420687030164, + "grad_norm": 2.5471723079681396, + "learning_rate": 4.911790481836208e-05, + "loss": 4.1887, + "step": 14261 + }, + { + "epoch": 0.08482015415358264, + "grad_norm": 2.9113502502441406, + "learning_rate": 4.911778183085321e-05, + "loss": 4.3556, + "step": 14262 + }, + { + "epoch": 0.08482610143686364, + "grad_norm": 2.5952084064483643, + "learning_rate": 4.9117658834925076e-05, + "loss": 5.0408, + "step": 14263 + }, + { + "epoch": 0.08483204872014463, + "grad_norm": 2.60726261138916, + "learning_rate": 4.911753583057771e-05, + "loss": 5.5094, + "step": 14264 + }, + { + "epoch": 0.08483799600342563, + "grad_norm": 1.9005889892578125, + "learning_rate": 4.911741281781117e-05, + "loss": 5.2637, + "step": 14265 + }, + { + "epoch": 0.08484394328670664, + "grad_norm": 1.6408629417419434, + "learning_rate": 4.911728979662549e-05, + "loss": 5.4722, + "step": 14266 + }, + { + "epoch": 0.08484989056998762, + "grad_norm": 1.840955376625061, + "learning_rate": 4.911716676702071e-05, + "loss": 5.5073, + "step": 14267 + }, + { + "epoch": 0.08485583785326863, + "grad_norm": 1.8430123329162598, + "learning_rate": 4.911704372899687e-05, + "loss": 6.0372, + "step": 14268 + }, + { + "epoch": 0.08486178513654963, + "grad_norm": 3.2100231647491455, + "learning_rate": 4.911692068255402e-05, + "loss": 5.0497, + "step": 14269 + }, + { + "epoch": 0.08486773241983062, + "grad_norm": 3.191558837890625, + "learning_rate": 4.911679762769221e-05, + "loss": 5.0467, + "step": 14270 + }, + { + "epoch": 0.08487367970311162, + "grad_norm": 3.04190731048584, + "learning_rate": 4.911667456441148e-05, + "loss": 4.8008, + "step": 14271 + }, + { + "epoch": 0.08487962698639262, + "grad_norm": 2.6688694953918457, + "learning_rate": 4.911655149271186e-05, + "loss": 4.722, + "step": 14272 + }, + { + "epoch": 0.08488557426967361, + "grad_norm": 2.1458704471588135, + "learning_rate": 4.9116428412593394e-05, + "loss": 4.788, + "step": 14273 + }, + { + "epoch": 0.08489152155295461, + "grad_norm": 2.345972776412964, + "learning_rate": 4.911630532405615e-05, + "loss": 4.7955, + "step": 14274 + }, + { + "epoch": 0.08489746883623561, + "grad_norm": 2.2022581100463867, + "learning_rate": 4.911618222710014e-05, + "loss": 4.815, + "step": 14275 + }, + { + "epoch": 0.0849034161195166, + "grad_norm": 2.311004877090454, + "learning_rate": 4.911605912172542e-05, + "loss": 4.8632, + "step": 14276 + }, + { + "epoch": 0.0849093634027976, + "grad_norm": 2.5007429122924805, + "learning_rate": 4.911593600793204e-05, + "loss": 4.7273, + "step": 14277 + }, + { + "epoch": 0.0849153106860786, + "grad_norm": 2.257115364074707, + "learning_rate": 4.9115812885720026e-05, + "loss": 4.9697, + "step": 14278 + }, + { + "epoch": 0.08492125796935959, + "grad_norm": 2.7667057514190674, + "learning_rate": 4.9115689755089436e-05, + "loss": 5.1607, + "step": 14279 + }, + { + "epoch": 0.0849272052526406, + "grad_norm": 2.4240612983703613, + "learning_rate": 4.911556661604031e-05, + "loss": 4.9873, + "step": 14280 + }, + { + "epoch": 0.0849331525359216, + "grad_norm": 1.9951629638671875, + "learning_rate": 4.911544346857269e-05, + "loss": 4.9961, + "step": 14281 + }, + { + "epoch": 0.08493909981920258, + "grad_norm": 1.8532124757766724, + "learning_rate": 4.9115320312686605e-05, + "loss": 4.9467, + "step": 14282 + }, + { + "epoch": 0.08494504710248359, + "grad_norm": 2.41200590133667, + "learning_rate": 4.9115197148382126e-05, + "loss": 4.9865, + "step": 14283 + }, + { + "epoch": 0.08495099438576459, + "grad_norm": 2.2735655307769775, + "learning_rate": 4.911507397565928e-05, + "loss": 4.9223, + "step": 14284 + }, + { + "epoch": 0.08495694166904558, + "grad_norm": 2.29052734375, + "learning_rate": 4.91149507945181e-05, + "loss": 4.9479, + "step": 14285 + }, + { + "epoch": 0.08496288895232658, + "grad_norm": 2.71832275390625, + "learning_rate": 4.911482760495865e-05, + "loss": 4.9537, + "step": 14286 + }, + { + "epoch": 0.08496883623560758, + "grad_norm": 2.1351630687713623, + "learning_rate": 4.911470440698096e-05, + "loss": 5.3776, + "step": 14287 + }, + { + "epoch": 0.08497478351888857, + "grad_norm": 2.514810085296631, + "learning_rate": 4.9114581200585066e-05, + "loss": 5.6067, + "step": 14288 + }, + { + "epoch": 0.08498073080216957, + "grad_norm": 1.787312626838684, + "learning_rate": 4.9114457985771036e-05, + "loss": 5.4929, + "step": 14289 + }, + { + "epoch": 0.08498667808545056, + "grad_norm": 1.7784658670425415, + "learning_rate": 4.911433476253889e-05, + "loss": 5.5471, + "step": 14290 + }, + { + "epoch": 0.08499262536873156, + "grad_norm": 1.6120775938034058, + "learning_rate": 4.9114211530888676e-05, + "loss": 5.5455, + "step": 14291 + }, + { + "epoch": 0.08499857265201256, + "grad_norm": 1.6809823513031006, + "learning_rate": 4.9114088290820446e-05, + "loss": 5.7674, + "step": 14292 + }, + { + "epoch": 0.08500451993529355, + "grad_norm": 1.784569501876831, + "learning_rate": 4.9113965042334234e-05, + "loss": 5.554, + "step": 14293 + }, + { + "epoch": 0.08501046721857455, + "grad_norm": 1.8622018098831177, + "learning_rate": 4.9113841785430094e-05, + "loss": 5.5718, + "step": 14294 + }, + { + "epoch": 0.08501641450185556, + "grad_norm": 1.8970091342926025, + "learning_rate": 4.911371852010805e-05, + "loss": 5.6398, + "step": 14295 + }, + { + "epoch": 0.08502236178513654, + "grad_norm": 1.9560039043426514, + "learning_rate": 4.911359524636816e-05, + "loss": 5.3627, + "step": 14296 + }, + { + "epoch": 0.08502830906841755, + "grad_norm": 1.7574408054351807, + "learning_rate": 4.911347196421046e-05, + "loss": 5.6245, + "step": 14297 + }, + { + "epoch": 0.08503425635169855, + "grad_norm": 2.0868546962738037, + "learning_rate": 4.9113348673635004e-05, + "loss": 5.6092, + "step": 14298 + }, + { + "epoch": 0.08504020363497954, + "grad_norm": 2.1157326698303223, + "learning_rate": 4.9113225374641816e-05, + "loss": 5.0796, + "step": 14299 + }, + { + "epoch": 0.08504615091826054, + "grad_norm": 1.7721058130264282, + "learning_rate": 4.911310206723096e-05, + "loss": 5.148, + "step": 14300 + }, + { + "epoch": 0.08505209820154154, + "grad_norm": 1.586799144744873, + "learning_rate": 4.911297875140246e-05, + "loss": 5.5425, + "step": 14301 + }, + { + "epoch": 0.08505804548482253, + "grad_norm": 1.9669803380966187, + "learning_rate": 4.9112855427156376e-05, + "loss": 5.1675, + "step": 14302 + }, + { + "epoch": 0.08506399276810353, + "grad_norm": 2.279446601867676, + "learning_rate": 4.911273209449274e-05, + "loss": 5.8068, + "step": 14303 + }, + { + "epoch": 0.08506994005138453, + "grad_norm": 2.036482572555542, + "learning_rate": 4.9112608753411605e-05, + "loss": 5.3995, + "step": 14304 + }, + { + "epoch": 0.08507588733466552, + "grad_norm": 1.833946704864502, + "learning_rate": 4.9112485403913e-05, + "loss": 6.069, + "step": 14305 + }, + { + "epoch": 0.08508183461794652, + "grad_norm": 1.6984084844589233, + "learning_rate": 4.9112362045996976e-05, + "loss": 5.7842, + "step": 14306 + }, + { + "epoch": 0.08508778190122752, + "grad_norm": 1.6729326248168945, + "learning_rate": 4.911223867966358e-05, + "loss": 5.5225, + "step": 14307 + }, + { + "epoch": 0.08509372918450851, + "grad_norm": 2.046747922897339, + "learning_rate": 4.911211530491284e-05, + "loss": 4.967, + "step": 14308 + }, + { + "epoch": 0.08509967646778951, + "grad_norm": 1.967058539390564, + "learning_rate": 4.911199192174482e-05, + "loss": 5.8046, + "step": 14309 + }, + { + "epoch": 0.08510562375107052, + "grad_norm": 1.8341583013534546, + "learning_rate": 4.911186853015955e-05, + "loss": 4.8317, + "step": 14310 + }, + { + "epoch": 0.0851115710343515, + "grad_norm": 1.9655890464782715, + "learning_rate": 4.911174513015707e-05, + "loss": 4.6122, + "step": 14311 + }, + { + "epoch": 0.0851175183176325, + "grad_norm": 1.7953969240188599, + "learning_rate": 4.9111621721737445e-05, + "loss": 5.3151, + "step": 14312 + }, + { + "epoch": 0.08512346560091351, + "grad_norm": 1.7074720859527588, + "learning_rate": 4.9111498304900684e-05, + "loss": 5.337, + "step": 14313 + }, + { + "epoch": 0.0851294128841945, + "grad_norm": 1.8258756399154663, + "learning_rate": 4.9111374879646854e-05, + "loss": 5.3245, + "step": 14314 + }, + { + "epoch": 0.0851353601674755, + "grad_norm": 1.731689691543579, + "learning_rate": 4.9111251445976e-05, + "loss": 5.149, + "step": 14315 + }, + { + "epoch": 0.0851413074507565, + "grad_norm": 1.9083631038665771, + "learning_rate": 4.9111128003888154e-05, + "loss": 5.2409, + "step": 14316 + }, + { + "epoch": 0.08514725473403749, + "grad_norm": 1.739311933517456, + "learning_rate": 4.911100455338336e-05, + "loss": 5.0946, + "step": 14317 + }, + { + "epoch": 0.08515320201731849, + "grad_norm": 1.6812219619750977, + "learning_rate": 4.9110881094461655e-05, + "loss": 5.3062, + "step": 14318 + }, + { + "epoch": 0.08515914930059948, + "grad_norm": 1.8215876817703247, + "learning_rate": 4.9110757627123096e-05, + "loss": 5.5774, + "step": 14319 + }, + { + "epoch": 0.08516509658388048, + "grad_norm": 1.9548031091690063, + "learning_rate": 4.9110634151367725e-05, + "loss": 5.7895, + "step": 14320 + }, + { + "epoch": 0.08517104386716148, + "grad_norm": 2.266925096511841, + "learning_rate": 4.911051066719558e-05, + "loss": 4.6526, + "step": 14321 + }, + { + "epoch": 0.08517699115044247, + "grad_norm": 2.304807424545288, + "learning_rate": 4.9110387174606695e-05, + "loss": 5.2573, + "step": 14322 + }, + { + "epoch": 0.08518293843372347, + "grad_norm": 2.019482135772705, + "learning_rate": 4.911026367360114e-05, + "loss": 5.2739, + "step": 14323 + }, + { + "epoch": 0.08518888571700448, + "grad_norm": 2.0559775829315186, + "learning_rate": 4.911014016417893e-05, + "loss": 5.7166, + "step": 14324 + }, + { + "epoch": 0.08519483300028546, + "grad_norm": 2.0565741062164307, + "learning_rate": 4.911001664634012e-05, + "loss": 5.6359, + "step": 14325 + }, + { + "epoch": 0.08520078028356647, + "grad_norm": 1.8766587972640991, + "learning_rate": 4.910989312008475e-05, + "loss": 5.2667, + "step": 14326 + }, + { + "epoch": 0.08520672756684747, + "grad_norm": 1.669317364692688, + "learning_rate": 4.910976958541287e-05, + "loss": 5.7565, + "step": 14327 + }, + { + "epoch": 0.08521267485012846, + "grad_norm": 1.9138641357421875, + "learning_rate": 4.910964604232452e-05, + "loss": 5.9362, + "step": 14328 + }, + { + "epoch": 0.08521862213340946, + "grad_norm": 1.740892767906189, + "learning_rate": 4.9109522490819734e-05, + "loss": 5.6964, + "step": 14329 + }, + { + "epoch": 0.08522456941669046, + "grad_norm": 1.788825511932373, + "learning_rate": 4.9109398930898576e-05, + "loss": 5.4266, + "step": 14330 + }, + { + "epoch": 0.08523051669997145, + "grad_norm": 2.035877227783203, + "learning_rate": 4.910927536256106e-05, + "loss": 5.5609, + "step": 14331 + }, + { + "epoch": 0.08523646398325245, + "grad_norm": 2.078150987625122, + "learning_rate": 4.9109151785807265e-05, + "loss": 5.0074, + "step": 14332 + }, + { + "epoch": 0.08524241126653345, + "grad_norm": 2.601290225982666, + "learning_rate": 4.91090282006372e-05, + "loss": 5.2021, + "step": 14333 + }, + { + "epoch": 0.08524835854981444, + "grad_norm": 1.7069159746170044, + "learning_rate": 4.910890460705092e-05, + "loss": 5.0313, + "step": 14334 + }, + { + "epoch": 0.08525430583309544, + "grad_norm": 1.8937885761260986, + "learning_rate": 4.9108781005048473e-05, + "loss": 4.6001, + "step": 14335 + }, + { + "epoch": 0.08526025311637644, + "grad_norm": 2.3120486736297607, + "learning_rate": 4.91086573946299e-05, + "loss": 4.4027, + "step": 14336 + }, + { + "epoch": 0.08526620039965743, + "grad_norm": 2.064420223236084, + "learning_rate": 4.910853377579524e-05, + "loss": 4.8853, + "step": 14337 + }, + { + "epoch": 0.08527214768293843, + "grad_norm": 1.80779230594635, + "learning_rate": 4.910841014854455e-05, + "loss": 5.5493, + "step": 14338 + }, + { + "epoch": 0.08527809496621944, + "grad_norm": 1.6364500522613525, + "learning_rate": 4.910828651287786e-05, + "loss": 5.6569, + "step": 14339 + }, + { + "epoch": 0.08528404224950042, + "grad_norm": 1.7472214698791504, + "learning_rate": 4.910816286879522e-05, + "loss": 5.4057, + "step": 14340 + }, + { + "epoch": 0.08528998953278143, + "grad_norm": 1.6311333179473877, + "learning_rate": 4.910803921629666e-05, + "loss": 5.8406, + "step": 14341 + }, + { + "epoch": 0.08529593681606243, + "grad_norm": 2.2367610931396484, + "learning_rate": 4.9107915555382236e-05, + "loss": 4.9339, + "step": 14342 + }, + { + "epoch": 0.08530188409934342, + "grad_norm": 2.033160924911499, + "learning_rate": 4.910779188605199e-05, + "loss": 4.8923, + "step": 14343 + }, + { + "epoch": 0.08530783138262442, + "grad_norm": 1.852645993232727, + "learning_rate": 4.910766820830596e-05, + "loss": 5.2208, + "step": 14344 + }, + { + "epoch": 0.08531377866590542, + "grad_norm": 1.9810596704483032, + "learning_rate": 4.910754452214419e-05, + "loss": 5.0119, + "step": 14345 + }, + { + "epoch": 0.08531972594918641, + "grad_norm": 1.92807137966156, + "learning_rate": 4.910742082756673e-05, + "loss": 5.6388, + "step": 14346 + }, + { + "epoch": 0.08532567323246741, + "grad_norm": 1.783923864364624, + "learning_rate": 4.910729712457361e-05, + "loss": 5.2831, + "step": 14347 + }, + { + "epoch": 0.0853316205157484, + "grad_norm": 2.008113145828247, + "learning_rate": 4.91071734131649e-05, + "loss": 5.085, + "step": 14348 + }, + { + "epoch": 0.0853375677990294, + "grad_norm": 2.2313408851623535, + "learning_rate": 4.910704969334061e-05, + "loss": 5.243, + "step": 14349 + }, + { + "epoch": 0.0853435150823104, + "grad_norm": 2.155491590499878, + "learning_rate": 4.9106925965100806e-05, + "loss": 6.0776, + "step": 14350 + }, + { + "epoch": 0.08534946236559139, + "grad_norm": 1.995848536491394, + "learning_rate": 4.910680222844551e-05, + "loss": 5.6763, + "step": 14351 + }, + { + "epoch": 0.0853554096488724, + "grad_norm": 2.033620595932007, + "learning_rate": 4.910667848337479e-05, + "loss": 4.4634, + "step": 14352 + }, + { + "epoch": 0.0853613569321534, + "grad_norm": 2.036668062210083, + "learning_rate": 4.910655472988868e-05, + "loss": 4.6367, + "step": 14353 + }, + { + "epoch": 0.08536730421543438, + "grad_norm": 1.9862895011901855, + "learning_rate": 4.910643096798721e-05, + "loss": 4.4623, + "step": 14354 + }, + { + "epoch": 0.08537325149871539, + "grad_norm": 1.9778163433074951, + "learning_rate": 4.910630719767044e-05, + "loss": 4.3706, + "step": 14355 + }, + { + "epoch": 0.08537919878199639, + "grad_norm": 1.984913945198059, + "learning_rate": 4.9106183418938404e-05, + "loss": 4.4573, + "step": 14356 + }, + { + "epoch": 0.08538514606527738, + "grad_norm": 2.0571017265319824, + "learning_rate": 4.910605963179116e-05, + "loss": 4.2782, + "step": 14357 + }, + { + "epoch": 0.08539109334855838, + "grad_norm": 2.028339147567749, + "learning_rate": 4.910593583622872e-05, + "loss": 4.3874, + "step": 14358 + }, + { + "epoch": 0.08539704063183938, + "grad_norm": 2.03485369682312, + "learning_rate": 4.9105812032251165e-05, + "loss": 4.5877, + "step": 14359 + }, + { + "epoch": 0.08540298791512037, + "grad_norm": 1.950490951538086, + "learning_rate": 4.910568821985851e-05, + "loss": 4.6547, + "step": 14360 + }, + { + "epoch": 0.08540893519840137, + "grad_norm": 2.1270785331726074, + "learning_rate": 4.910556439905081e-05, + "loss": 5.3685, + "step": 14361 + }, + { + "epoch": 0.08541488248168237, + "grad_norm": 2.094545364379883, + "learning_rate": 4.910544056982811e-05, + "loss": 6.1109, + "step": 14362 + }, + { + "epoch": 0.08542082976496336, + "grad_norm": 2.2988197803497314, + "learning_rate": 4.910531673219044e-05, + "loss": 5.4789, + "step": 14363 + }, + { + "epoch": 0.08542677704824436, + "grad_norm": 2.2927358150482178, + "learning_rate": 4.910519288613786e-05, + "loss": 5.3853, + "step": 14364 + }, + { + "epoch": 0.08543272433152536, + "grad_norm": 2.223668098449707, + "learning_rate": 4.910506903167041e-05, + "loss": 5.3572, + "step": 14365 + }, + { + "epoch": 0.08543867161480635, + "grad_norm": 2.0522570610046387, + "learning_rate": 4.910494516878813e-05, + "loss": 5.3581, + "step": 14366 + }, + { + "epoch": 0.08544461889808735, + "grad_norm": 2.4349021911621094, + "learning_rate": 4.910482129749106e-05, + "loss": 5.4082, + "step": 14367 + }, + { + "epoch": 0.08545056618136836, + "grad_norm": 1.976344347000122, + "learning_rate": 4.910469741777924e-05, + "loss": 5.6107, + "step": 14368 + }, + { + "epoch": 0.08545651346464934, + "grad_norm": 1.8476877212524414, + "learning_rate": 4.910457352965272e-05, + "loss": 5.5059, + "step": 14369 + }, + { + "epoch": 0.08546246074793035, + "grad_norm": 1.6204098463058472, + "learning_rate": 4.910444963311155e-05, + "loss": 5.6578, + "step": 14370 + }, + { + "epoch": 0.08546840803121135, + "grad_norm": 1.808021903038025, + "learning_rate": 4.910432572815576e-05, + "loss": 5.8263, + "step": 14371 + }, + { + "epoch": 0.08547435531449234, + "grad_norm": 1.4975682497024536, + "learning_rate": 4.91042018147854e-05, + "loss": 5.582, + "step": 14372 + }, + { + "epoch": 0.08548030259777334, + "grad_norm": 1.644845724105835, + "learning_rate": 4.910407789300051e-05, + "loss": 5.7127, + "step": 14373 + }, + { + "epoch": 0.08548624988105434, + "grad_norm": 1.5433874130249023, + "learning_rate": 4.910395396280114e-05, + "loss": 5.6941, + "step": 14374 + }, + { + "epoch": 0.08549219716433533, + "grad_norm": 1.7267838716506958, + "learning_rate": 4.910383002418732e-05, + "loss": 5.632, + "step": 14375 + }, + { + "epoch": 0.08549814444761633, + "grad_norm": 1.4142215251922607, + "learning_rate": 4.9103706077159116e-05, + "loss": 5.6108, + "step": 14376 + }, + { + "epoch": 0.08550409173089732, + "grad_norm": 1.8514180183410645, + "learning_rate": 4.9103582121716554e-05, + "loss": 5.828, + "step": 14377 + }, + { + "epoch": 0.08551003901417832, + "grad_norm": 1.633837103843689, + "learning_rate": 4.9103458157859674e-05, + "loss": 5.8585, + "step": 14378 + }, + { + "epoch": 0.08551598629745932, + "grad_norm": 1.9934178590774536, + "learning_rate": 4.910333418558853e-05, + "loss": 5.5907, + "step": 14379 + }, + { + "epoch": 0.08552193358074031, + "grad_norm": 1.8934741020202637, + "learning_rate": 4.910321020490316e-05, + "loss": 5.579, + "step": 14380 + }, + { + "epoch": 0.08552788086402131, + "grad_norm": 1.9341318607330322, + "learning_rate": 4.910308621580361e-05, + "loss": 5.8737, + "step": 14381 + }, + { + "epoch": 0.08553382814730232, + "grad_norm": 2.1566226482391357, + "learning_rate": 4.9102962218289915e-05, + "loss": 5.6105, + "step": 14382 + }, + { + "epoch": 0.0855397754305833, + "grad_norm": 1.707112431526184, + "learning_rate": 4.910283821236213e-05, + "loss": 5.6875, + "step": 14383 + }, + { + "epoch": 0.0855457227138643, + "grad_norm": 2.8415439128875732, + "learning_rate": 4.9102714198020296e-05, + "loss": 4.9292, + "step": 14384 + }, + { + "epoch": 0.08555166999714531, + "grad_norm": 2.2043650150299072, + "learning_rate": 4.9102590175264445e-05, + "loss": 5.7264, + "step": 14385 + }, + { + "epoch": 0.0855576172804263, + "grad_norm": 2.2063820362091064, + "learning_rate": 4.9102466144094636e-05, + "loss": 5.1616, + "step": 14386 + }, + { + "epoch": 0.0855635645637073, + "grad_norm": 1.9087328910827637, + "learning_rate": 4.9102342104510903e-05, + "loss": 5.1897, + "step": 14387 + }, + { + "epoch": 0.0855695118469883, + "grad_norm": 1.6418956518173218, + "learning_rate": 4.910221805651329e-05, + "loss": 5.0923, + "step": 14388 + }, + { + "epoch": 0.08557545913026929, + "grad_norm": 1.5215847492218018, + "learning_rate": 4.9102094000101836e-05, + "loss": 4.9602, + "step": 14389 + }, + { + "epoch": 0.08558140641355029, + "grad_norm": 2.249983072280884, + "learning_rate": 4.91019699352766e-05, + "loss": 5.1167, + "step": 14390 + }, + { + "epoch": 0.08558735369683129, + "grad_norm": 1.89960777759552, + "learning_rate": 4.9101845862037615e-05, + "loss": 6.1589, + "step": 14391 + }, + { + "epoch": 0.08559330098011228, + "grad_norm": 1.8243924379348755, + "learning_rate": 4.910172178038492e-05, + "loss": 5.8661, + "step": 14392 + }, + { + "epoch": 0.08559924826339328, + "grad_norm": 1.8313872814178467, + "learning_rate": 4.9101597690318567e-05, + "loss": 5.6129, + "step": 14393 + }, + { + "epoch": 0.08560519554667428, + "grad_norm": 1.8717663288116455, + "learning_rate": 4.9101473591838593e-05, + "loss": 5.6346, + "step": 14394 + }, + { + "epoch": 0.08561114282995527, + "grad_norm": 1.6444953680038452, + "learning_rate": 4.910134948494504e-05, + "loss": 5.7237, + "step": 14395 + }, + { + "epoch": 0.08561709011323627, + "grad_norm": 1.8138811588287354, + "learning_rate": 4.910122536963796e-05, + "loss": 5.7682, + "step": 14396 + }, + { + "epoch": 0.08562303739651728, + "grad_norm": 2.629892110824585, + "learning_rate": 4.9101101245917394e-05, + "loss": 5.89, + "step": 14397 + }, + { + "epoch": 0.08562898467979826, + "grad_norm": 1.8197498321533203, + "learning_rate": 4.910097711378337e-05, + "loss": 5.6768, + "step": 14398 + }, + { + "epoch": 0.08563493196307927, + "grad_norm": 2.1121623516082764, + "learning_rate": 4.9100852973235955e-05, + "loss": 5.672, + "step": 14399 + }, + { + "epoch": 0.08564087924636027, + "grad_norm": 1.8823927640914917, + "learning_rate": 4.910072882427518e-05, + "loss": 5.6717, + "step": 14400 + }, + { + "epoch": 0.08564682652964126, + "grad_norm": 2.602023124694824, + "learning_rate": 4.9100604666901084e-05, + "loss": 5.4193, + "step": 14401 + }, + { + "epoch": 0.08565277381292226, + "grad_norm": 2.420342445373535, + "learning_rate": 4.910048050111372e-05, + "loss": 5.2811, + "step": 14402 + }, + { + "epoch": 0.08565872109620326, + "grad_norm": 2.593797206878662, + "learning_rate": 4.910035632691313e-05, + "loss": 5.2942, + "step": 14403 + }, + { + "epoch": 0.08566466837948425, + "grad_norm": 1.9292038679122925, + "learning_rate": 4.910023214429935e-05, + "loss": 5.0231, + "step": 14404 + }, + { + "epoch": 0.08567061566276525, + "grad_norm": 2.159935712814331, + "learning_rate": 4.9100107953272434e-05, + "loss": 4.8778, + "step": 14405 + }, + { + "epoch": 0.08567656294604625, + "grad_norm": 2.2363314628601074, + "learning_rate": 4.9099983753832416e-05, + "loss": 4.8828, + "step": 14406 + }, + { + "epoch": 0.08568251022932724, + "grad_norm": 2.149986505508423, + "learning_rate": 4.909985954597934e-05, + "loss": 5.4351, + "step": 14407 + }, + { + "epoch": 0.08568845751260824, + "grad_norm": 2.05991268157959, + "learning_rate": 4.909973532971325e-05, + "loss": 5.3759, + "step": 14408 + }, + { + "epoch": 0.08569440479588923, + "grad_norm": 2.0030369758605957, + "learning_rate": 4.9099611105034196e-05, + "loss": 5.5126, + "step": 14409 + }, + { + "epoch": 0.08570035207917023, + "grad_norm": 1.7764592170715332, + "learning_rate": 4.9099486871942216e-05, + "loss": 5.1808, + "step": 14410 + }, + { + "epoch": 0.08570629936245124, + "grad_norm": 1.8827999830245972, + "learning_rate": 4.909936263043735e-05, + "loss": 5.5076, + "step": 14411 + }, + { + "epoch": 0.08571224664573222, + "grad_norm": 2.0153589248657227, + "learning_rate": 4.9099238380519655e-05, + "loss": 5.2955, + "step": 14412 + }, + { + "epoch": 0.08571819392901323, + "grad_norm": 2.0739622116088867, + "learning_rate": 4.909911412218916e-05, + "loss": 5.2463, + "step": 14413 + }, + { + "epoch": 0.08572414121229423, + "grad_norm": 2.4668188095092773, + "learning_rate": 4.909898985544591e-05, + "loss": 5.1859, + "step": 14414 + }, + { + "epoch": 0.08573008849557522, + "grad_norm": 2.245546340942383, + "learning_rate": 4.9098865580289956e-05, + "loss": 5.5472, + "step": 14415 + }, + { + "epoch": 0.08573603577885622, + "grad_norm": 2.244086980819702, + "learning_rate": 4.909874129672133e-05, + "loss": 5.5531, + "step": 14416 + }, + { + "epoch": 0.08574198306213722, + "grad_norm": 2.2983627319335938, + "learning_rate": 4.909861700474009e-05, + "loss": 5.6178, + "step": 14417 + }, + { + "epoch": 0.08574793034541821, + "grad_norm": 1.9792771339416504, + "learning_rate": 4.9098492704346265e-05, + "loss": 5.364, + "step": 14418 + }, + { + "epoch": 0.08575387762869921, + "grad_norm": 1.8312867879867554, + "learning_rate": 4.9098368395539914e-05, + "loss": 5.3105, + "step": 14419 + }, + { + "epoch": 0.08575982491198021, + "grad_norm": 1.8415101766586304, + "learning_rate": 4.909824407832107e-05, + "loss": 5.3182, + "step": 14420 + }, + { + "epoch": 0.0857657721952612, + "grad_norm": 1.965531349182129, + "learning_rate": 4.909811975268977e-05, + "loss": 5.496, + "step": 14421 + }, + { + "epoch": 0.0857717194785422, + "grad_norm": 1.9116218090057373, + "learning_rate": 4.909799541864607e-05, + "loss": 5.2531, + "step": 14422 + }, + { + "epoch": 0.0857776667618232, + "grad_norm": 1.863571286201477, + "learning_rate": 4.909787107619001e-05, + "loss": 5.535, + "step": 14423 + }, + { + "epoch": 0.08578361404510419, + "grad_norm": 1.966637372970581, + "learning_rate": 4.909774672532163e-05, + "loss": 5.5072, + "step": 14424 + }, + { + "epoch": 0.0857895613283852, + "grad_norm": 1.9251974821090698, + "learning_rate": 4.9097622366040974e-05, + "loss": 5.1989, + "step": 14425 + }, + { + "epoch": 0.0857955086116662, + "grad_norm": 1.6277741193771362, + "learning_rate": 4.90974979983481e-05, + "loss": 5.357, + "step": 14426 + }, + { + "epoch": 0.08580145589494718, + "grad_norm": 1.6832202672958374, + "learning_rate": 4.909737362224302e-05, + "loss": 5.3485, + "step": 14427 + }, + { + "epoch": 0.08580740317822819, + "grad_norm": 1.7656053304672241, + "learning_rate": 4.909724923772581e-05, + "loss": 5.3965, + "step": 14428 + }, + { + "epoch": 0.08581335046150919, + "grad_norm": 1.748529076576233, + "learning_rate": 4.909712484479649e-05, + "loss": 5.3895, + "step": 14429 + }, + { + "epoch": 0.08581929774479018, + "grad_norm": 2.1317241191864014, + "learning_rate": 4.909700044345511e-05, + "loss": 5.1703, + "step": 14430 + }, + { + "epoch": 0.08582524502807118, + "grad_norm": 2.6896255016326904, + "learning_rate": 4.909687603370172e-05, + "loss": 5.3942, + "step": 14431 + }, + { + "epoch": 0.08583119231135218, + "grad_norm": 2.1061718463897705, + "learning_rate": 4.909675161553637e-05, + "loss": 5.3545, + "step": 14432 + }, + { + "epoch": 0.08583713959463317, + "grad_norm": 2.7201108932495117, + "learning_rate": 4.9096627188959085e-05, + "loss": 4.9659, + "step": 14433 + }, + { + "epoch": 0.08584308687791417, + "grad_norm": 2.0352578163146973, + "learning_rate": 4.909650275396991e-05, + "loss": 5.2667, + "step": 14434 + }, + { + "epoch": 0.08584903416119517, + "grad_norm": 1.6980863809585571, + "learning_rate": 4.9096378310568905e-05, + "loss": 5.4036, + "step": 14435 + }, + { + "epoch": 0.08585498144447616, + "grad_norm": 1.677700161933899, + "learning_rate": 4.90962538587561e-05, + "loss": 5.3104, + "step": 14436 + }, + { + "epoch": 0.08586092872775716, + "grad_norm": 1.995198369026184, + "learning_rate": 4.9096129398531534e-05, + "loss": 5.4235, + "step": 14437 + }, + { + "epoch": 0.08586687601103815, + "grad_norm": 2.136059284210205, + "learning_rate": 4.909600492989527e-05, + "loss": 5.1867, + "step": 14438 + }, + { + "epoch": 0.08587282329431915, + "grad_norm": 1.9917269945144653, + "learning_rate": 4.909588045284733e-05, + "loss": 5.5507, + "step": 14439 + }, + { + "epoch": 0.08587877057760016, + "grad_norm": 1.7341989278793335, + "learning_rate": 4.909575596738777e-05, + "loss": 5.4782, + "step": 14440 + }, + { + "epoch": 0.08588471786088114, + "grad_norm": 2.058920383453369, + "learning_rate": 4.9095631473516635e-05, + "loss": 5.51, + "step": 14441 + }, + { + "epoch": 0.08589066514416215, + "grad_norm": 1.7856314182281494, + "learning_rate": 4.9095506971233965e-05, + "loss": 5.4189, + "step": 14442 + }, + { + "epoch": 0.08589661242744315, + "grad_norm": 1.5290231704711914, + "learning_rate": 4.90953824605398e-05, + "loss": 5.4398, + "step": 14443 + }, + { + "epoch": 0.08590255971072414, + "grad_norm": 1.6302571296691895, + "learning_rate": 4.909525794143418e-05, + "loss": 5.4468, + "step": 14444 + }, + { + "epoch": 0.08590850699400514, + "grad_norm": 1.9898178577423096, + "learning_rate": 4.909513341391716e-05, + "loss": 5.5514, + "step": 14445 + }, + { + "epoch": 0.08591445427728614, + "grad_norm": 2.539473533630371, + "learning_rate": 4.909500887798878e-05, + "loss": 5.0985, + "step": 14446 + }, + { + "epoch": 0.08592040156056713, + "grad_norm": 2.109477996826172, + "learning_rate": 4.909488433364907e-05, + "loss": 5.1304, + "step": 14447 + }, + { + "epoch": 0.08592634884384813, + "grad_norm": 1.627647042274475, + "learning_rate": 4.9094759780898096e-05, + "loss": 5.7772, + "step": 14448 + }, + { + "epoch": 0.08593229612712913, + "grad_norm": 1.7776944637298584, + "learning_rate": 4.909463521973588e-05, + "loss": 6.3219, + "step": 14449 + }, + { + "epoch": 0.08593824341041012, + "grad_norm": 1.8342489004135132, + "learning_rate": 4.909451065016249e-05, + "loss": 5.7136, + "step": 14450 + }, + { + "epoch": 0.08594419069369112, + "grad_norm": 2.109060764312744, + "learning_rate": 4.9094386072177945e-05, + "loss": 5.449, + "step": 14451 + }, + { + "epoch": 0.08595013797697212, + "grad_norm": 2.5615251064300537, + "learning_rate": 4.909426148578231e-05, + "loss": 4.7441, + "step": 14452 + }, + { + "epoch": 0.08595608526025311, + "grad_norm": 1.7670586109161377, + "learning_rate": 4.909413689097561e-05, + "loss": 5.4488, + "step": 14453 + }, + { + "epoch": 0.08596203254353411, + "grad_norm": 1.9190126657485962, + "learning_rate": 4.909401228775789e-05, + "loss": 5.3128, + "step": 14454 + }, + { + "epoch": 0.08596797982681512, + "grad_norm": 1.679866909980774, + "learning_rate": 4.90938876761292e-05, + "loss": 5.4575, + "step": 14455 + }, + { + "epoch": 0.0859739271100961, + "grad_norm": 1.6199991703033447, + "learning_rate": 4.909376305608959e-05, + "loss": 5.541, + "step": 14456 + }, + { + "epoch": 0.0859798743933771, + "grad_norm": 1.876761794090271, + "learning_rate": 4.9093638427639096e-05, + "loss": 5.7256, + "step": 14457 + }, + { + "epoch": 0.08598582167665811, + "grad_norm": 1.7833212614059448, + "learning_rate": 4.909351379077776e-05, + "loss": 5.6512, + "step": 14458 + }, + { + "epoch": 0.0859917689599391, + "grad_norm": 2.249696731567383, + "learning_rate": 4.909338914550562e-05, + "loss": 5.6517, + "step": 14459 + }, + { + "epoch": 0.0859977162432201, + "grad_norm": 1.8037621974945068, + "learning_rate": 4.909326449182273e-05, + "loss": 5.7564, + "step": 14460 + }, + { + "epoch": 0.0860036635265011, + "grad_norm": 1.4057918787002563, + "learning_rate": 4.909313982972914e-05, + "loss": 5.6259, + "step": 14461 + }, + { + "epoch": 0.08600961080978209, + "grad_norm": 1.5501145124435425, + "learning_rate": 4.9093015159224874e-05, + "loss": 5.6626, + "step": 14462 + }, + { + "epoch": 0.08601555809306309, + "grad_norm": 1.8189458847045898, + "learning_rate": 4.909289048030999e-05, + "loss": 5.4682, + "step": 14463 + }, + { + "epoch": 0.08602150537634409, + "grad_norm": 1.6819778680801392, + "learning_rate": 4.909276579298452e-05, + "loss": 5.3511, + "step": 14464 + }, + { + "epoch": 0.08602745265962508, + "grad_norm": 1.8401011228561401, + "learning_rate": 4.909264109724853e-05, + "loss": 5.531, + "step": 14465 + }, + { + "epoch": 0.08603339994290608, + "grad_norm": 1.6418116092681885, + "learning_rate": 4.909251639310203e-05, + "loss": 5.2885, + "step": 14466 + }, + { + "epoch": 0.08603934722618707, + "grad_norm": 1.4331059455871582, + "learning_rate": 4.909239168054509e-05, + "loss": 5.2792, + "step": 14467 + }, + { + "epoch": 0.08604529450946807, + "grad_norm": 1.4047703742980957, + "learning_rate": 4.9092266959577745e-05, + "loss": 5.2179, + "step": 14468 + }, + { + "epoch": 0.08605124179274908, + "grad_norm": 1.641930103302002, + "learning_rate": 4.909214223020003e-05, + "loss": 5.475, + "step": 14469 + }, + { + "epoch": 0.08605718907603006, + "grad_norm": 1.9879019260406494, + "learning_rate": 4.909201749241201e-05, + "loss": 5.3893, + "step": 14470 + }, + { + "epoch": 0.08606313635931107, + "grad_norm": 1.4790434837341309, + "learning_rate": 4.909189274621371e-05, + "loss": 5.3011, + "step": 14471 + }, + { + "epoch": 0.08606908364259207, + "grad_norm": 1.4283875226974487, + "learning_rate": 4.909176799160518e-05, + "loss": 5.4181, + "step": 14472 + }, + { + "epoch": 0.08607503092587306, + "grad_norm": 1.6676496267318726, + "learning_rate": 4.909164322858646e-05, + "loss": 5.4682, + "step": 14473 + }, + { + "epoch": 0.08608097820915406, + "grad_norm": 1.4858648777008057, + "learning_rate": 4.9091518457157605e-05, + "loss": 5.3073, + "step": 14474 + }, + { + "epoch": 0.08608692549243506, + "grad_norm": 1.5135246515274048, + "learning_rate": 4.909139367731864e-05, + "loss": 5.4039, + "step": 14475 + }, + { + "epoch": 0.08609287277571605, + "grad_norm": 1.353051781654358, + "learning_rate": 4.909126888906962e-05, + "loss": 5.5455, + "step": 14476 + }, + { + "epoch": 0.08609882005899705, + "grad_norm": 1.2824941873550415, + "learning_rate": 4.909114409241059e-05, + "loss": 5.6465, + "step": 14477 + }, + { + "epoch": 0.08610476734227805, + "grad_norm": 1.3398411273956299, + "learning_rate": 4.909101928734159e-05, + "loss": 5.5299, + "step": 14478 + }, + { + "epoch": 0.08611071462555904, + "grad_norm": 1.167169213294983, + "learning_rate": 4.909089447386266e-05, + "loss": 5.4376, + "step": 14479 + }, + { + "epoch": 0.08611666190884004, + "grad_norm": 1.2469842433929443, + "learning_rate": 4.9090769651973846e-05, + "loss": 5.4945, + "step": 14480 + }, + { + "epoch": 0.08612260919212104, + "grad_norm": 1.3025931119918823, + "learning_rate": 4.90906448216752e-05, + "loss": 5.3283, + "step": 14481 + }, + { + "epoch": 0.08612855647540203, + "grad_norm": 1.597223162651062, + "learning_rate": 4.909051998296675e-05, + "loss": 5.0729, + "step": 14482 + }, + { + "epoch": 0.08613450375868303, + "grad_norm": 1.53999662399292, + "learning_rate": 4.909039513584856e-05, + "loss": 5.2956, + "step": 14483 + }, + { + "epoch": 0.08614045104196404, + "grad_norm": 1.462623953819275, + "learning_rate": 4.909027028032066e-05, + "loss": 5.2748, + "step": 14484 + }, + { + "epoch": 0.08614639832524502, + "grad_norm": 1.380196452140808, + "learning_rate": 4.909014541638309e-05, + "loss": 5.4184, + "step": 14485 + }, + { + "epoch": 0.08615234560852603, + "grad_norm": 1.4531115293502808, + "learning_rate": 4.90900205440359e-05, + "loss": 5.2064, + "step": 14486 + }, + { + "epoch": 0.08615829289180703, + "grad_norm": 1.406848430633545, + "learning_rate": 4.9089895663279136e-05, + "loss": 5.2019, + "step": 14487 + }, + { + "epoch": 0.08616424017508802, + "grad_norm": 1.3956660032272339, + "learning_rate": 4.908977077411283e-05, + "loss": 5.128, + "step": 14488 + }, + { + "epoch": 0.08617018745836902, + "grad_norm": 1.4705348014831543, + "learning_rate": 4.9089645876537044e-05, + "loss": 5.3451, + "step": 14489 + }, + { + "epoch": 0.08617613474165002, + "grad_norm": 1.4385737180709839, + "learning_rate": 4.9089520970551804e-05, + "loss": 5.0668, + "step": 14490 + }, + { + "epoch": 0.08618208202493101, + "grad_norm": 1.584478735923767, + "learning_rate": 4.908939605615717e-05, + "loss": 4.9412, + "step": 14491 + }, + { + "epoch": 0.08618802930821201, + "grad_norm": 1.2740134000778198, + "learning_rate": 4.908927113335317e-05, + "loss": 4.8684, + "step": 14492 + }, + { + "epoch": 0.08619397659149301, + "grad_norm": 1.5669810771942139, + "learning_rate": 4.9089146202139856e-05, + "loss": 5.1903, + "step": 14493 + }, + { + "epoch": 0.086199923874774, + "grad_norm": 1.6113348007202148, + "learning_rate": 4.908902126251727e-05, + "loss": 5.1217, + "step": 14494 + }, + { + "epoch": 0.086205871158055, + "grad_norm": 1.6401634216308594, + "learning_rate": 4.908889631448546e-05, + "loss": 5.2241, + "step": 14495 + }, + { + "epoch": 0.08621181844133599, + "grad_norm": 1.522625207901001, + "learning_rate": 4.9088771358044456e-05, + "loss": 5.1858, + "step": 14496 + }, + { + "epoch": 0.086217765724617, + "grad_norm": 1.3802037239074707, + "learning_rate": 4.9088646393194316e-05, + "loss": 5.2349, + "step": 14497 + }, + { + "epoch": 0.086223713007898, + "grad_norm": 1.5226190090179443, + "learning_rate": 4.9088521419935076e-05, + "loss": 5.2612, + "step": 14498 + }, + { + "epoch": 0.08622966029117898, + "grad_norm": 1.3293451070785522, + "learning_rate": 4.9088396438266785e-05, + "loss": 5.169, + "step": 14499 + }, + { + "epoch": 0.08623560757445999, + "grad_norm": 1.334403157234192, + "learning_rate": 4.908827144818948e-05, + "loss": 5.1139, + "step": 14500 + }, + { + "epoch": 0.08624155485774099, + "grad_norm": 1.5195876359939575, + "learning_rate": 4.908814644970321e-05, + "loss": 5.1473, + "step": 14501 + }, + { + "epoch": 0.08624750214102198, + "grad_norm": 1.3367561101913452, + "learning_rate": 4.908802144280802e-05, + "loss": 5.1148, + "step": 14502 + }, + { + "epoch": 0.08625344942430298, + "grad_norm": 1.485002875328064, + "learning_rate": 4.908789642750395e-05, + "loss": 5.0796, + "step": 14503 + }, + { + "epoch": 0.08625939670758398, + "grad_norm": 1.3907506465911865, + "learning_rate": 4.9087771403791037e-05, + "loss": 5.1382, + "step": 14504 + }, + { + "epoch": 0.08626534399086497, + "grad_norm": 1.5129644870758057, + "learning_rate": 4.9087646371669336e-05, + "loss": 5.037, + "step": 14505 + }, + { + "epoch": 0.08627129127414597, + "grad_norm": 1.4666407108306885, + "learning_rate": 4.9087521331138896e-05, + "loss": 5.1877, + "step": 14506 + }, + { + "epoch": 0.08627723855742697, + "grad_norm": 1.5812102556228638, + "learning_rate": 4.9087396282199736e-05, + "loss": 5.2588, + "step": 14507 + }, + { + "epoch": 0.08628318584070796, + "grad_norm": 2.976067066192627, + "learning_rate": 4.908727122485193e-05, + "loss": 4.7477, + "step": 14508 + }, + { + "epoch": 0.08628913312398896, + "grad_norm": 1.5401511192321777, + "learning_rate": 4.90871461590955e-05, + "loss": 5.2242, + "step": 14509 + }, + { + "epoch": 0.08629508040726996, + "grad_norm": 1.3266774415969849, + "learning_rate": 4.9087021084930486e-05, + "loss": 5.2792, + "step": 14510 + }, + { + "epoch": 0.08630102769055095, + "grad_norm": 1.3292385339736938, + "learning_rate": 4.9086896002356956e-05, + "loss": 5.2434, + "step": 14511 + }, + { + "epoch": 0.08630697497383195, + "grad_norm": 1.237931489944458, + "learning_rate": 4.908677091137493e-05, + "loss": 5.2173, + "step": 14512 + }, + { + "epoch": 0.08631292225711296, + "grad_norm": 1.2488665580749512, + "learning_rate": 4.908664581198447e-05, + "loss": 5.1262, + "step": 14513 + }, + { + "epoch": 0.08631886954039394, + "grad_norm": 1.5126835107803345, + "learning_rate": 4.9086520704185604e-05, + "loss": 5.2258, + "step": 14514 + }, + { + "epoch": 0.08632481682367495, + "grad_norm": 1.3975410461425781, + "learning_rate": 4.908639558797839e-05, + "loss": 4.9266, + "step": 14515 + }, + { + "epoch": 0.08633076410695595, + "grad_norm": 1.2499217987060547, + "learning_rate": 4.908627046336285e-05, + "loss": 5.1564, + "step": 14516 + }, + { + "epoch": 0.08633671139023694, + "grad_norm": 1.6880254745483398, + "learning_rate": 4.908614533033905e-05, + "loss": 5.0906, + "step": 14517 + }, + { + "epoch": 0.08634265867351794, + "grad_norm": 1.498849630355835, + "learning_rate": 4.908602018890702e-05, + "loss": 5.0771, + "step": 14518 + }, + { + "epoch": 0.08634860595679894, + "grad_norm": 1.9192509651184082, + "learning_rate": 4.908589503906682e-05, + "loss": 5.2173, + "step": 14519 + }, + { + "epoch": 0.08635455324007993, + "grad_norm": 1.8038657903671265, + "learning_rate": 4.9085769880818475e-05, + "loss": 5.3003, + "step": 14520 + }, + { + "epoch": 0.08636050052336093, + "grad_norm": 1.3908354043960571, + "learning_rate": 4.9085644714162037e-05, + "loss": 5.1943, + "step": 14521 + }, + { + "epoch": 0.08636644780664193, + "grad_norm": 1.336630940437317, + "learning_rate": 4.9085519539097556e-05, + "loss": 5.2693, + "step": 14522 + }, + { + "epoch": 0.08637239508992292, + "grad_norm": 1.6008005142211914, + "learning_rate": 4.908539435562506e-05, + "loss": 5.2779, + "step": 14523 + }, + { + "epoch": 0.08637834237320392, + "grad_norm": 1.4620133638381958, + "learning_rate": 4.9085269163744605e-05, + "loss": 5.0467, + "step": 14524 + }, + { + "epoch": 0.08638428965648491, + "grad_norm": 1.5825145244598389, + "learning_rate": 4.9085143963456236e-05, + "loss": 4.9838, + "step": 14525 + }, + { + "epoch": 0.08639023693976591, + "grad_norm": 1.751550555229187, + "learning_rate": 4.9085018754759995e-05, + "loss": 5.0467, + "step": 14526 + }, + { + "epoch": 0.08639618422304692, + "grad_norm": 1.5967564582824707, + "learning_rate": 4.908489353765591e-05, + "loss": 5.0685, + "step": 14527 + }, + { + "epoch": 0.0864021315063279, + "grad_norm": 1.646323800086975, + "learning_rate": 4.908476831214405e-05, + "loss": 4.9341, + "step": 14528 + }, + { + "epoch": 0.0864080787896089, + "grad_norm": 1.482224464416504, + "learning_rate": 4.908464307822443e-05, + "loss": 4.9893, + "step": 14529 + }, + { + "epoch": 0.08641402607288991, + "grad_norm": 1.5190521478652954, + "learning_rate": 4.908451783589713e-05, + "loss": 5.0747, + "step": 14530 + }, + { + "epoch": 0.0864199733561709, + "grad_norm": 1.41251802444458, + "learning_rate": 4.908439258516215e-05, + "loss": 5.0098, + "step": 14531 + }, + { + "epoch": 0.0864259206394519, + "grad_norm": 1.678646445274353, + "learning_rate": 4.9084267326019576e-05, + "loss": 5.0224, + "step": 14532 + }, + { + "epoch": 0.0864318679227329, + "grad_norm": 1.5203865766525269, + "learning_rate": 4.908414205846943e-05, + "loss": 5.109, + "step": 14533 + }, + { + "epoch": 0.08643781520601389, + "grad_norm": 1.5437216758728027, + "learning_rate": 4.9084016782511754e-05, + "loss": 5.1168, + "step": 14534 + }, + { + "epoch": 0.08644376248929489, + "grad_norm": 1.3460302352905273, + "learning_rate": 4.90838914981466e-05, + "loss": 5.1038, + "step": 14535 + }, + { + "epoch": 0.08644970977257589, + "grad_norm": 1.4768339395523071, + "learning_rate": 4.908376620537401e-05, + "loss": 5.129, + "step": 14536 + }, + { + "epoch": 0.08645565705585688, + "grad_norm": 1.2669035196304321, + "learning_rate": 4.9083640904194025e-05, + "loss": 5.0856, + "step": 14537 + }, + { + "epoch": 0.08646160433913788, + "grad_norm": 1.5692600011825562, + "learning_rate": 4.9083515594606686e-05, + "loss": 5.0897, + "step": 14538 + }, + { + "epoch": 0.08646755162241888, + "grad_norm": 1.4857045412063599, + "learning_rate": 4.9083390276612044e-05, + "loss": 4.9654, + "step": 14539 + }, + { + "epoch": 0.08647349890569987, + "grad_norm": 1.5537325143814087, + "learning_rate": 4.908326495021014e-05, + "loss": 5.0431, + "step": 14540 + }, + { + "epoch": 0.08647944618898087, + "grad_norm": 1.483089566230774, + "learning_rate": 4.908313961540101e-05, + "loss": 5.0737, + "step": 14541 + }, + { + "epoch": 0.08648539347226188, + "grad_norm": 1.5829899311065674, + "learning_rate": 4.9083014272184716e-05, + "loss": 4.9844, + "step": 14542 + }, + { + "epoch": 0.08649134075554286, + "grad_norm": 1.3660348653793335, + "learning_rate": 4.908288892056128e-05, + "loss": 5.0384, + "step": 14543 + }, + { + "epoch": 0.08649728803882387, + "grad_norm": 1.3721328973770142, + "learning_rate": 4.9082763560530764e-05, + "loss": 5.0993, + "step": 14544 + }, + { + "epoch": 0.08650323532210487, + "grad_norm": 1.412381887435913, + "learning_rate": 4.90826381920932e-05, + "loss": 4.9359, + "step": 14545 + }, + { + "epoch": 0.08650918260538586, + "grad_norm": 1.5164285898208618, + "learning_rate": 4.9082512815248635e-05, + "loss": 5.0156, + "step": 14546 + }, + { + "epoch": 0.08651512988866686, + "grad_norm": 1.5244861841201782, + "learning_rate": 4.9082387429997117e-05, + "loss": 5.0719, + "step": 14547 + }, + { + "epoch": 0.08652107717194786, + "grad_norm": 1.304221510887146, + "learning_rate": 4.908226203633869e-05, + "loss": 4.9553, + "step": 14548 + }, + { + "epoch": 0.08652702445522885, + "grad_norm": 1.328220009803772, + "learning_rate": 4.908213663427338e-05, + "loss": 4.9761, + "step": 14549 + }, + { + "epoch": 0.08653297173850985, + "grad_norm": 1.4459906816482544, + "learning_rate": 4.908201122380126e-05, + "loss": 5.0422, + "step": 14550 + }, + { + "epoch": 0.08653891902179085, + "grad_norm": 1.5402530431747437, + "learning_rate": 4.908188580492235e-05, + "loss": 4.8856, + "step": 14551 + }, + { + "epoch": 0.08654486630507184, + "grad_norm": 1.6573606729507446, + "learning_rate": 4.90817603776367e-05, + "loss": 5.0958, + "step": 14552 + }, + { + "epoch": 0.08655081358835284, + "grad_norm": 1.5214189291000366, + "learning_rate": 4.9081634941944365e-05, + "loss": 4.9494, + "step": 14553 + }, + { + "epoch": 0.08655676087163383, + "grad_norm": 1.4977836608886719, + "learning_rate": 4.908150949784538e-05, + "loss": 4.9166, + "step": 14554 + }, + { + "epoch": 0.08656270815491483, + "grad_norm": 1.4952701330184937, + "learning_rate": 4.908138404533979e-05, + "loss": 4.9371, + "step": 14555 + }, + { + "epoch": 0.08656865543819584, + "grad_norm": 1.2652736902236938, + "learning_rate": 4.9081258584427626e-05, + "loss": 4.9424, + "step": 14556 + }, + { + "epoch": 0.08657460272147682, + "grad_norm": 1.4386261701583862, + "learning_rate": 4.908113311510895e-05, + "loss": 4.8909, + "step": 14557 + }, + { + "epoch": 0.08658055000475783, + "grad_norm": 1.4800533056259155, + "learning_rate": 4.90810076373838e-05, + "loss": 4.9226, + "step": 14558 + }, + { + "epoch": 0.08658649728803883, + "grad_norm": 1.4734489917755127, + "learning_rate": 4.908088215125222e-05, + "loss": 4.9774, + "step": 14559 + }, + { + "epoch": 0.08659244457131982, + "grad_norm": 1.47382390499115, + "learning_rate": 4.9080756656714245e-05, + "loss": 4.9001, + "step": 14560 + }, + { + "epoch": 0.08659839185460082, + "grad_norm": 1.4358749389648438, + "learning_rate": 4.908063115376994e-05, + "loss": 4.8537, + "step": 14561 + }, + { + "epoch": 0.08660433913788182, + "grad_norm": 1.3895947933197021, + "learning_rate": 4.908050564241933e-05, + "loss": 4.9445, + "step": 14562 + }, + { + "epoch": 0.08661028642116281, + "grad_norm": 1.6166354417800903, + "learning_rate": 4.908038012266246e-05, + "loss": 4.9447, + "step": 14563 + }, + { + "epoch": 0.08661623370444381, + "grad_norm": 1.4621998071670532, + "learning_rate": 4.908025459449938e-05, + "loss": 5.0405, + "step": 14564 + }, + { + "epoch": 0.08662218098772481, + "grad_norm": 1.4160699844360352, + "learning_rate": 4.908012905793013e-05, + "loss": 5.1246, + "step": 14565 + }, + { + "epoch": 0.0866281282710058, + "grad_norm": 1.3748950958251953, + "learning_rate": 4.9080003512954756e-05, + "loss": 5.0856, + "step": 14566 + }, + { + "epoch": 0.0866340755542868, + "grad_norm": 1.5496206283569336, + "learning_rate": 4.9079877959573303e-05, + "loss": 5.1539, + "step": 14567 + }, + { + "epoch": 0.0866400228375678, + "grad_norm": 1.2577475309371948, + "learning_rate": 4.9079752397785814e-05, + "loss": 5.033, + "step": 14568 + }, + { + "epoch": 0.08664597012084879, + "grad_norm": 1.3565775156021118, + "learning_rate": 4.9079626827592336e-05, + "loss": 4.977, + "step": 14569 + }, + { + "epoch": 0.0866519174041298, + "grad_norm": 1.869673252105713, + "learning_rate": 4.90795012489929e-05, + "loss": 5.0452, + "step": 14570 + }, + { + "epoch": 0.0866578646874108, + "grad_norm": 1.3931822776794434, + "learning_rate": 4.907937566198757e-05, + "loss": 5.0182, + "step": 14571 + }, + { + "epoch": 0.08666381197069178, + "grad_norm": 1.5796258449554443, + "learning_rate": 4.907925006657637e-05, + "loss": 5.0167, + "step": 14572 + }, + { + "epoch": 0.08666975925397279, + "grad_norm": 1.439174771308899, + "learning_rate": 4.9079124462759356e-05, + "loss": 5.0223, + "step": 14573 + }, + { + "epoch": 0.08667570653725379, + "grad_norm": 1.5269712209701538, + "learning_rate": 4.907899885053657e-05, + "loss": 5.0726, + "step": 14574 + }, + { + "epoch": 0.08668165382053478, + "grad_norm": 1.6334160566329956, + "learning_rate": 4.9078873229908054e-05, + "loss": 4.902, + "step": 14575 + }, + { + "epoch": 0.08668760110381578, + "grad_norm": 1.2883020639419556, + "learning_rate": 4.9078747600873846e-05, + "loss": 5.0168, + "step": 14576 + }, + { + "epoch": 0.08669354838709678, + "grad_norm": 1.3399035930633545, + "learning_rate": 4.9078621963434e-05, + "loss": 5.1285, + "step": 14577 + }, + { + "epoch": 0.08669949567037777, + "grad_norm": 1.6066272258758545, + "learning_rate": 4.9078496317588556e-05, + "loss": 5.1761, + "step": 14578 + }, + { + "epoch": 0.08670544295365877, + "grad_norm": 1.5316112041473389, + "learning_rate": 4.907837066333756e-05, + "loss": 4.9691, + "step": 14579 + }, + { + "epoch": 0.08671139023693977, + "grad_norm": 1.2680541276931763, + "learning_rate": 4.907824500068105e-05, + "loss": 4.984, + "step": 14580 + }, + { + "epoch": 0.08671733752022076, + "grad_norm": 1.3451861143112183, + "learning_rate": 4.9078119329619076e-05, + "loss": 5.1079, + "step": 14581 + }, + { + "epoch": 0.08672328480350176, + "grad_norm": 1.4813716411590576, + "learning_rate": 4.907799365015168e-05, + "loss": 5.0822, + "step": 14582 + }, + { + "epoch": 0.08672923208678275, + "grad_norm": 1.2526417970657349, + "learning_rate": 4.90778679622789e-05, + "loss": 5.0981, + "step": 14583 + }, + { + "epoch": 0.08673517937006375, + "grad_norm": 1.320970058441162, + "learning_rate": 4.907774226600079e-05, + "loss": 5.2046, + "step": 14584 + }, + { + "epoch": 0.08674112665334476, + "grad_norm": 1.4376531839370728, + "learning_rate": 4.907761656131739e-05, + "loss": 5.0422, + "step": 14585 + }, + { + "epoch": 0.08674707393662574, + "grad_norm": 1.3290382623672485, + "learning_rate": 4.907749084822873e-05, + "loss": 4.9587, + "step": 14586 + }, + { + "epoch": 0.08675302121990675, + "grad_norm": 1.4613630771636963, + "learning_rate": 4.907736512673489e-05, + "loss": 5.0141, + "step": 14587 + }, + { + "epoch": 0.08675896850318775, + "grad_norm": 1.2996604442596436, + "learning_rate": 4.907723939683587e-05, + "loss": 5.0881, + "step": 14588 + }, + { + "epoch": 0.08676491578646874, + "grad_norm": 1.5718237161636353, + "learning_rate": 4.907711365853174e-05, + "loss": 5.0104, + "step": 14589 + }, + { + "epoch": 0.08677086306974974, + "grad_norm": 1.5009227991104126, + "learning_rate": 4.907698791182255e-05, + "loss": 4.9257, + "step": 14590 + }, + { + "epoch": 0.08677681035303074, + "grad_norm": 1.4179331064224243, + "learning_rate": 4.907686215670831e-05, + "loss": 5.0209, + "step": 14591 + }, + { + "epoch": 0.08678275763631173, + "grad_norm": 1.3447542190551758, + "learning_rate": 4.9076736393189105e-05, + "loss": 5.0633, + "step": 14592 + }, + { + "epoch": 0.08678870491959273, + "grad_norm": 1.4221898317337036, + "learning_rate": 4.907661062126495e-05, + "loss": 4.907, + "step": 14593 + }, + { + "epoch": 0.08679465220287373, + "grad_norm": 1.5112396478652954, + "learning_rate": 4.907648484093591e-05, + "loss": 5.0703, + "step": 14594 + }, + { + "epoch": 0.08680059948615472, + "grad_norm": 1.3118572235107422, + "learning_rate": 4.907635905220201e-05, + "loss": 5.0089, + "step": 14595 + }, + { + "epoch": 0.08680654676943572, + "grad_norm": 1.6776518821716309, + "learning_rate": 4.90762332550633e-05, + "loss": 4.9705, + "step": 14596 + }, + { + "epoch": 0.08681249405271672, + "grad_norm": 1.467530608177185, + "learning_rate": 4.9076107449519824e-05, + "loss": 5.0596, + "step": 14597 + }, + { + "epoch": 0.08681844133599771, + "grad_norm": 1.5924569368362427, + "learning_rate": 4.907598163557163e-05, + "loss": 4.9904, + "step": 14598 + }, + { + "epoch": 0.08682438861927871, + "grad_norm": 1.1862461566925049, + "learning_rate": 4.907585581321877e-05, + "loss": 5.2065, + "step": 14599 + }, + { + "epoch": 0.08683033590255972, + "grad_norm": 1.5537490844726562, + "learning_rate": 4.9075729982461265e-05, + "loss": 4.9604, + "step": 14600 + }, + { + "epoch": 0.0868362831858407, + "grad_norm": 1.5608946084976196, + "learning_rate": 4.9075604143299176e-05, + "loss": 4.9951, + "step": 14601 + }, + { + "epoch": 0.0868422304691217, + "grad_norm": 1.3890982866287231, + "learning_rate": 4.907547829573254e-05, + "loss": 5.1994, + "step": 14602 + }, + { + "epoch": 0.08684817775240271, + "grad_norm": 1.5367194414138794, + "learning_rate": 4.907535243976141e-05, + "loss": 5.008, + "step": 14603 + }, + { + "epoch": 0.0868541250356837, + "grad_norm": 1.5362403392791748, + "learning_rate": 4.9075226575385814e-05, + "loss": 5.0239, + "step": 14604 + }, + { + "epoch": 0.0868600723189647, + "grad_norm": 1.3252228498458862, + "learning_rate": 4.9075100702605814e-05, + "loss": 4.9663, + "step": 14605 + }, + { + "epoch": 0.0868660196022457, + "grad_norm": 1.4381712675094604, + "learning_rate": 4.907497482142144e-05, + "loss": 5.1457, + "step": 14606 + }, + { + "epoch": 0.08687196688552669, + "grad_norm": 1.5137197971343994, + "learning_rate": 4.907484893183274e-05, + "loss": 4.9831, + "step": 14607 + }, + { + "epoch": 0.08687791416880769, + "grad_norm": 1.5544081926345825, + "learning_rate": 4.907472303383976e-05, + "loss": 5.0485, + "step": 14608 + }, + { + "epoch": 0.08688386145208869, + "grad_norm": 1.4613279104232788, + "learning_rate": 4.907459712744254e-05, + "loss": 5.3929, + "step": 14609 + }, + { + "epoch": 0.08688980873536968, + "grad_norm": 1.2830102443695068, + "learning_rate": 4.907447121264113e-05, + "loss": 5.4241, + "step": 14610 + }, + { + "epoch": 0.08689575601865068, + "grad_norm": 1.2168337106704712, + "learning_rate": 4.907434528943558e-05, + "loss": 5.4678, + "step": 14611 + }, + { + "epoch": 0.08690170330193167, + "grad_norm": 1.3995872735977173, + "learning_rate": 4.907421935782591e-05, + "loss": 5.2, + "step": 14612 + }, + { + "epoch": 0.08690765058521267, + "grad_norm": 1.4081990718841553, + "learning_rate": 4.907409341781219e-05, + "loss": 5.4356, + "step": 14613 + }, + { + "epoch": 0.08691359786849367, + "grad_norm": 1.4506621360778809, + "learning_rate": 4.9073967469394436e-05, + "loss": 5.3816, + "step": 14614 + }, + { + "epoch": 0.08691954515177466, + "grad_norm": 1.3564461469650269, + "learning_rate": 4.907384151257272e-05, + "loss": 5.2808, + "step": 14615 + }, + { + "epoch": 0.08692549243505567, + "grad_norm": 1.3663856983184814, + "learning_rate": 4.907371554734708e-05, + "loss": 5.4286, + "step": 14616 + }, + { + "epoch": 0.08693143971833667, + "grad_norm": 1.5905755758285522, + "learning_rate": 4.907358957371755e-05, + "loss": 5.3404, + "step": 14617 + }, + { + "epoch": 0.08693738700161766, + "grad_norm": 1.6172430515289307, + "learning_rate": 4.9073463591684175e-05, + "loss": 5.2511, + "step": 14618 + }, + { + "epoch": 0.08694333428489866, + "grad_norm": 1.362925410270691, + "learning_rate": 4.9073337601247e-05, + "loss": 5.3786, + "step": 14619 + }, + { + "epoch": 0.08694928156817966, + "grad_norm": 1.4276455640792847, + "learning_rate": 4.907321160240608e-05, + "loss": 5.1243, + "step": 14620 + }, + { + "epoch": 0.08695522885146065, + "grad_norm": 1.5211840867996216, + "learning_rate": 4.907308559516145e-05, + "loss": 5.1465, + "step": 14621 + }, + { + "epoch": 0.08696117613474165, + "grad_norm": 1.4728838205337524, + "learning_rate": 4.9072959579513146e-05, + "loss": 4.9585, + "step": 14622 + }, + { + "epoch": 0.08696712341802265, + "grad_norm": 1.5337111949920654, + "learning_rate": 4.907283355546123e-05, + "loss": 5.0553, + "step": 14623 + }, + { + "epoch": 0.08697307070130364, + "grad_norm": 1.3105639219284058, + "learning_rate": 4.907270752300573e-05, + "loss": 5.2724, + "step": 14624 + }, + { + "epoch": 0.08697901798458464, + "grad_norm": 1.4726678133010864, + "learning_rate": 4.90725814821467e-05, + "loss": 5.2771, + "step": 14625 + }, + { + "epoch": 0.08698496526786564, + "grad_norm": 1.5226463079452515, + "learning_rate": 4.907245543288418e-05, + "loss": 5.2294, + "step": 14626 + }, + { + "epoch": 0.08699091255114663, + "grad_norm": 1.4187650680541992, + "learning_rate": 4.9072329375218215e-05, + "loss": 5.0003, + "step": 14627 + }, + { + "epoch": 0.08699685983442763, + "grad_norm": 1.3565301895141602, + "learning_rate": 4.907220330914885e-05, + "loss": 5.0616, + "step": 14628 + }, + { + "epoch": 0.08700280711770864, + "grad_norm": 1.3763781785964966, + "learning_rate": 4.907207723467612e-05, + "loss": 5.1036, + "step": 14629 + }, + { + "epoch": 0.08700875440098962, + "grad_norm": 1.350926160812378, + "learning_rate": 4.907195115180009e-05, + "loss": 5.3433, + "step": 14630 + }, + { + "epoch": 0.08701470168427063, + "grad_norm": 1.4927095174789429, + "learning_rate": 4.907182506052078e-05, + "loss": 5.3726, + "step": 14631 + }, + { + "epoch": 0.08702064896755163, + "grad_norm": 1.9378905296325684, + "learning_rate": 4.907169896083824e-05, + "loss": 4.9942, + "step": 14632 + }, + { + "epoch": 0.08702659625083262, + "grad_norm": 1.2046253681182861, + "learning_rate": 4.907157285275253e-05, + "loss": 5.2877, + "step": 14633 + }, + { + "epoch": 0.08703254353411362, + "grad_norm": 1.352828025817871, + "learning_rate": 4.907144673626368e-05, + "loss": 5.264, + "step": 14634 + }, + { + "epoch": 0.08703849081739462, + "grad_norm": 1.4438698291778564, + "learning_rate": 4.907132061137173e-05, + "loss": 5.1767, + "step": 14635 + }, + { + "epoch": 0.08704443810067561, + "grad_norm": 1.4066534042358398, + "learning_rate": 4.9071194478076734e-05, + "loss": 5.0919, + "step": 14636 + }, + { + "epoch": 0.08705038538395661, + "grad_norm": 1.4313786029815674, + "learning_rate": 4.9071068336378736e-05, + "loss": 5.0307, + "step": 14637 + }, + { + "epoch": 0.08705633266723761, + "grad_norm": 1.3995366096496582, + "learning_rate": 4.907094218627778e-05, + "loss": 4.9508, + "step": 14638 + }, + { + "epoch": 0.0870622799505186, + "grad_norm": 1.395270824432373, + "learning_rate": 4.90708160277739e-05, + "loss": 5.1403, + "step": 14639 + }, + { + "epoch": 0.0870682272337996, + "grad_norm": 1.4280959367752075, + "learning_rate": 4.9070689860867144e-05, + "loss": 5.1675, + "step": 14640 + }, + { + "epoch": 0.08707417451708059, + "grad_norm": 1.5028926134109497, + "learning_rate": 4.907056368555757e-05, + "loss": 5.1178, + "step": 14641 + }, + { + "epoch": 0.08708012180036159, + "grad_norm": 1.480936884880066, + "learning_rate": 4.90704375018452e-05, + "loss": 5.1681, + "step": 14642 + }, + { + "epoch": 0.0870860690836426, + "grad_norm": 1.474708914756775, + "learning_rate": 4.907031130973009e-05, + "loss": 4.998, + "step": 14643 + }, + { + "epoch": 0.08709201636692358, + "grad_norm": 1.719551920890808, + "learning_rate": 4.907018510921229e-05, + "loss": 5.0486, + "step": 14644 + }, + { + "epoch": 0.08709796365020459, + "grad_norm": 1.6314032077789307, + "learning_rate": 4.907005890029184e-05, + "loss": 4.9233, + "step": 14645 + }, + { + "epoch": 0.08710391093348559, + "grad_norm": 1.635712742805481, + "learning_rate": 4.906993268296877e-05, + "loss": 4.7026, + "step": 14646 + }, + { + "epoch": 0.08710985821676658, + "grad_norm": 1.5682891607284546, + "learning_rate": 4.906980645724314e-05, + "loss": 4.7681, + "step": 14647 + }, + { + "epoch": 0.08711580550004758, + "grad_norm": 1.5149590969085693, + "learning_rate": 4.906968022311499e-05, + "loss": 4.6026, + "step": 14648 + }, + { + "epoch": 0.08712175278332858, + "grad_norm": 1.666756510734558, + "learning_rate": 4.906955398058436e-05, + "loss": 4.6652, + "step": 14649 + }, + { + "epoch": 0.08712770006660957, + "grad_norm": 1.563281536102295, + "learning_rate": 4.906942772965129e-05, + "loss": 4.8195, + "step": 14650 + }, + { + "epoch": 0.08713364734989057, + "grad_norm": 1.3730766773223877, + "learning_rate": 4.906930147031585e-05, + "loss": 5.3917, + "step": 14651 + }, + { + "epoch": 0.08713959463317157, + "grad_norm": 1.344741940498352, + "learning_rate": 4.906917520257805e-05, + "loss": 5.4866, + "step": 14652 + }, + { + "epoch": 0.08714554191645256, + "grad_norm": 1.4403667449951172, + "learning_rate": 4.906904892643796e-05, + "loss": 5.3869, + "step": 14653 + }, + { + "epoch": 0.08715148919973356, + "grad_norm": 1.4251221418380737, + "learning_rate": 4.906892264189561e-05, + "loss": 5.5564, + "step": 14654 + }, + { + "epoch": 0.08715743648301456, + "grad_norm": 1.0403032302856445, + "learning_rate": 4.9068796348951055e-05, + "loss": 5.3422, + "step": 14655 + }, + { + "epoch": 0.08716338376629555, + "grad_norm": 1.4933732748031616, + "learning_rate": 4.9068670047604313e-05, + "loss": 4.9035, + "step": 14656 + }, + { + "epoch": 0.08716933104957655, + "grad_norm": 1.820141315460205, + "learning_rate": 4.9068543737855466e-05, + "loss": 4.8447, + "step": 14657 + }, + { + "epoch": 0.08717527833285756, + "grad_norm": 1.5337603092193604, + "learning_rate": 4.9068417419704526e-05, + "loss": 4.7122, + "step": 14658 + }, + { + "epoch": 0.08718122561613854, + "grad_norm": 1.6933845281600952, + "learning_rate": 4.9068291093151555e-05, + "loss": 4.6246, + "step": 14659 + }, + { + "epoch": 0.08718717289941955, + "grad_norm": 1.607749342918396, + "learning_rate": 4.906816475819659e-05, + "loss": 4.5246, + "step": 14660 + }, + { + "epoch": 0.08719312018270055, + "grad_norm": 1.6468732357025146, + "learning_rate": 4.906803841483969e-05, + "loss": 4.5529, + "step": 14661 + }, + { + "epoch": 0.08719906746598154, + "grad_norm": 1.7252613306045532, + "learning_rate": 4.906791206308087e-05, + "loss": 4.5866, + "step": 14662 + }, + { + "epoch": 0.08720501474926254, + "grad_norm": 1.8178141117095947, + "learning_rate": 4.90677857029202e-05, + "loss": 4.6312, + "step": 14663 + }, + { + "epoch": 0.08721096203254354, + "grad_norm": 1.6173008680343628, + "learning_rate": 4.906765933435771e-05, + "loss": 4.5964, + "step": 14664 + }, + { + "epoch": 0.08721690931582453, + "grad_norm": 1.4914458990097046, + "learning_rate": 4.9067532957393444e-05, + "loss": 4.7123, + "step": 14665 + }, + { + "epoch": 0.08722285659910553, + "grad_norm": 1.5310544967651367, + "learning_rate": 4.9067406572027465e-05, + "loss": 4.6907, + "step": 14666 + }, + { + "epoch": 0.08722880388238653, + "grad_norm": 1.4311203956604004, + "learning_rate": 4.9067280178259794e-05, + "loss": 4.7749, + "step": 14667 + }, + { + "epoch": 0.08723475116566752, + "grad_norm": 1.6848034858703613, + "learning_rate": 4.9067153776090484e-05, + "loss": 5.1676, + "step": 14668 + }, + { + "epoch": 0.08724069844894852, + "grad_norm": 1.510909914970398, + "learning_rate": 4.906702736551958e-05, + "loss": 5.1237, + "step": 14669 + }, + { + "epoch": 0.08724664573222951, + "grad_norm": 1.4135887622833252, + "learning_rate": 4.906690094654713e-05, + "loss": 5.131, + "step": 14670 + }, + { + "epoch": 0.08725259301551051, + "grad_norm": 1.5739595890045166, + "learning_rate": 4.906677451917317e-05, + "loss": 5.2374, + "step": 14671 + }, + { + "epoch": 0.08725854029879151, + "grad_norm": 1.592644214630127, + "learning_rate": 4.9066648083397746e-05, + "loss": 5.0424, + "step": 14672 + }, + { + "epoch": 0.0872644875820725, + "grad_norm": 1.3842464685440063, + "learning_rate": 4.906652163922091e-05, + "loss": 5.106, + "step": 14673 + }, + { + "epoch": 0.0872704348653535, + "grad_norm": 1.4318630695343018, + "learning_rate": 4.906639518664269e-05, + "loss": 5.1223, + "step": 14674 + }, + { + "epoch": 0.08727638214863451, + "grad_norm": 1.5598502159118652, + "learning_rate": 4.906626872566314e-05, + "loss": 5.0363, + "step": 14675 + }, + { + "epoch": 0.0872823294319155, + "grad_norm": 1.9367897510528564, + "learning_rate": 4.9066142256282316e-05, + "loss": 4.8822, + "step": 14676 + }, + { + "epoch": 0.0872882767151965, + "grad_norm": 1.8134979009628296, + "learning_rate": 4.906601577850024e-05, + "loss": 4.7218, + "step": 14677 + }, + { + "epoch": 0.0872942239984775, + "grad_norm": 1.5139638185501099, + "learning_rate": 4.9065889292316976e-05, + "loss": 5.0311, + "step": 14678 + }, + { + "epoch": 0.08730017128175849, + "grad_norm": 1.5324028730392456, + "learning_rate": 4.906576279773255e-05, + "loss": 5.2366, + "step": 14679 + }, + { + "epoch": 0.08730611856503949, + "grad_norm": 1.4219286441802979, + "learning_rate": 4.906563629474702e-05, + "loss": 5.1362, + "step": 14680 + }, + { + "epoch": 0.08731206584832049, + "grad_norm": 1.4673584699630737, + "learning_rate": 4.906550978336042e-05, + "loss": 5.1336, + "step": 14681 + }, + { + "epoch": 0.08731801313160148, + "grad_norm": 1.2611639499664307, + "learning_rate": 4.906538326357281e-05, + "loss": 5.1791, + "step": 14682 + }, + { + "epoch": 0.08732396041488248, + "grad_norm": 1.283827543258667, + "learning_rate": 4.9065256735384205e-05, + "loss": 5.0889, + "step": 14683 + }, + { + "epoch": 0.08732990769816348, + "grad_norm": 1.4508111476898193, + "learning_rate": 4.906513019879468e-05, + "loss": 4.9832, + "step": 14684 + }, + { + "epoch": 0.08733585498144447, + "grad_norm": 1.3923978805541992, + "learning_rate": 4.906500365380427e-05, + "loss": 4.8147, + "step": 14685 + }, + { + "epoch": 0.08734180226472547, + "grad_norm": 1.3737010955810547, + "learning_rate": 4.906487710041301e-05, + "loss": 4.8448, + "step": 14686 + }, + { + "epoch": 0.08734774954800648, + "grad_norm": 1.4765465259552002, + "learning_rate": 4.906475053862095e-05, + "loss": 4.8601, + "step": 14687 + }, + { + "epoch": 0.08735369683128746, + "grad_norm": 1.527372121810913, + "learning_rate": 4.906462396842813e-05, + "loss": 4.8898, + "step": 14688 + }, + { + "epoch": 0.08735964411456847, + "grad_norm": 1.2455743551254272, + "learning_rate": 4.9064497389834604e-05, + "loss": 4.9954, + "step": 14689 + }, + { + "epoch": 0.08736559139784947, + "grad_norm": 1.3169753551483154, + "learning_rate": 4.906437080284041e-05, + "loss": 5.1384, + "step": 14690 + }, + { + "epoch": 0.08737153868113046, + "grad_norm": 1.3158196210861206, + "learning_rate": 4.906424420744559e-05, + "loss": 5.032, + "step": 14691 + }, + { + "epoch": 0.08737748596441146, + "grad_norm": 1.5421653985977173, + "learning_rate": 4.9064117603650197e-05, + "loss": 4.6448, + "step": 14692 + }, + { + "epoch": 0.08738343324769246, + "grad_norm": 1.4324442148208618, + "learning_rate": 4.906399099145427e-05, + "loss": 4.819, + "step": 14693 + }, + { + "epoch": 0.08738938053097345, + "grad_norm": 1.299877643585205, + "learning_rate": 4.9063864370857836e-05, + "loss": 5.4793, + "step": 14694 + }, + { + "epoch": 0.08739532781425445, + "grad_norm": 1.8289762735366821, + "learning_rate": 4.906373774186097e-05, + "loss": 5.0972, + "step": 14695 + }, + { + "epoch": 0.08740127509753545, + "grad_norm": 1.5460636615753174, + "learning_rate": 4.9063611104463705e-05, + "loss": 5.0992, + "step": 14696 + }, + { + "epoch": 0.08740722238081644, + "grad_norm": 1.4720163345336914, + "learning_rate": 4.9063484458666076e-05, + "loss": 5.0918, + "step": 14697 + }, + { + "epoch": 0.08741316966409744, + "grad_norm": 1.4653000831604004, + "learning_rate": 4.906335780446813e-05, + "loss": 5.1523, + "step": 14698 + }, + { + "epoch": 0.08741911694737843, + "grad_norm": 1.461012840270996, + "learning_rate": 4.9063231141869914e-05, + "loss": 5.1848, + "step": 14699 + }, + { + "epoch": 0.08742506423065943, + "grad_norm": 1.6757450103759766, + "learning_rate": 4.906310447087148e-05, + "loss": 4.9809, + "step": 14700 + }, + { + "epoch": 0.08743101151394043, + "grad_norm": 1.498402714729309, + "learning_rate": 4.906297779147286e-05, + "loss": 5.1451, + "step": 14701 + }, + { + "epoch": 0.08743695879722142, + "grad_norm": 1.341667652130127, + "learning_rate": 4.906285110367411e-05, + "loss": 5.1973, + "step": 14702 + }, + { + "epoch": 0.08744290608050242, + "grad_norm": 1.5008035898208618, + "learning_rate": 4.9062724407475255e-05, + "loss": 5.0961, + "step": 14703 + }, + { + "epoch": 0.08744885336378343, + "grad_norm": 1.6110866069793701, + "learning_rate": 4.9062597702876354e-05, + "loss": 4.7201, + "step": 14704 + }, + { + "epoch": 0.08745480064706442, + "grad_norm": 1.5154603719711304, + "learning_rate": 4.906247098987746e-05, + "loss": 4.6537, + "step": 14705 + }, + { + "epoch": 0.08746074793034542, + "grad_norm": 1.6169204711914062, + "learning_rate": 4.90623442684786e-05, + "loss": 4.512, + "step": 14706 + }, + { + "epoch": 0.08746669521362642, + "grad_norm": 1.4967073202133179, + "learning_rate": 4.9062217538679824e-05, + "loss": 4.7159, + "step": 14707 + }, + { + "epoch": 0.08747264249690741, + "grad_norm": 1.4621938467025757, + "learning_rate": 4.9062090800481174e-05, + "loss": 4.7553, + "step": 14708 + }, + { + "epoch": 0.08747858978018841, + "grad_norm": 1.694868564605713, + "learning_rate": 4.9061964053882694e-05, + "loss": 4.6801, + "step": 14709 + }, + { + "epoch": 0.08748453706346941, + "grad_norm": 1.6228396892547607, + "learning_rate": 4.906183729888444e-05, + "loss": 4.5402, + "step": 14710 + }, + { + "epoch": 0.0874904843467504, + "grad_norm": 1.388859748840332, + "learning_rate": 4.9061710535486435e-05, + "loss": 4.5645, + "step": 14711 + }, + { + "epoch": 0.0874964316300314, + "grad_norm": 1.546074390411377, + "learning_rate": 4.9061583763688746e-05, + "loss": 4.4146, + "step": 14712 + }, + { + "epoch": 0.0875023789133124, + "grad_norm": 1.5526363849639893, + "learning_rate": 4.90614569834914e-05, + "loss": 4.6027, + "step": 14713 + }, + { + "epoch": 0.08750832619659339, + "grad_norm": 1.6809604167938232, + "learning_rate": 4.9061330194894454e-05, + "loss": 4.4927, + "step": 14714 + }, + { + "epoch": 0.0875142734798744, + "grad_norm": 1.8013920783996582, + "learning_rate": 4.906120339789795e-05, + "loss": 4.6949, + "step": 14715 + }, + { + "epoch": 0.0875202207631554, + "grad_norm": 1.587863564491272, + "learning_rate": 4.906107659250192e-05, + "loss": 4.7255, + "step": 14716 + }, + { + "epoch": 0.08752616804643638, + "grad_norm": 1.4871174097061157, + "learning_rate": 4.9060949778706415e-05, + "loss": 4.6753, + "step": 14717 + }, + { + "epoch": 0.08753211532971739, + "grad_norm": 1.5521314144134521, + "learning_rate": 4.9060822956511485e-05, + "loss": 4.6963, + "step": 14718 + }, + { + "epoch": 0.08753806261299839, + "grad_norm": 1.5176832675933838, + "learning_rate": 4.906069612591717e-05, + "loss": 4.7475, + "step": 14719 + }, + { + "epoch": 0.08754400989627938, + "grad_norm": 1.7381534576416016, + "learning_rate": 4.906056928692352e-05, + "loss": 4.6952, + "step": 14720 + }, + { + "epoch": 0.08754995717956038, + "grad_norm": 1.604637622833252, + "learning_rate": 4.9060442439530564e-05, + "loss": 4.5792, + "step": 14721 + }, + { + "epoch": 0.08755590446284138, + "grad_norm": 1.6367937326431274, + "learning_rate": 4.9060315583738356e-05, + "loss": 4.6422, + "step": 14722 + }, + { + "epoch": 0.08756185174612237, + "grad_norm": 1.5177057981491089, + "learning_rate": 4.906018871954695e-05, + "loss": 4.5682, + "step": 14723 + }, + { + "epoch": 0.08756779902940337, + "grad_norm": 1.5539237260818481, + "learning_rate": 4.906006184695637e-05, + "loss": 4.5194, + "step": 14724 + }, + { + "epoch": 0.08757374631268437, + "grad_norm": 1.7041072845458984, + "learning_rate": 4.905993496596668e-05, + "loss": 4.6526, + "step": 14725 + }, + { + "epoch": 0.08757969359596536, + "grad_norm": 1.7187644243240356, + "learning_rate": 4.9059808076577914e-05, + "loss": 4.6251, + "step": 14726 + }, + { + "epoch": 0.08758564087924636, + "grad_norm": 1.6393675804138184, + "learning_rate": 4.905968117879012e-05, + "loss": 4.7242, + "step": 14727 + }, + { + "epoch": 0.08759158816252735, + "grad_norm": 1.6426397562026978, + "learning_rate": 4.905955427260333e-05, + "loss": 4.6272, + "step": 14728 + }, + { + "epoch": 0.08759753544580835, + "grad_norm": 1.3231829404830933, + "learning_rate": 4.9059427358017605e-05, + "loss": 4.621, + "step": 14729 + }, + { + "epoch": 0.08760348272908935, + "grad_norm": 1.3970234394073486, + "learning_rate": 4.905930043503298e-05, + "loss": 4.6356, + "step": 14730 + }, + { + "epoch": 0.08760943001237034, + "grad_norm": 1.511977195739746, + "learning_rate": 4.90591735036495e-05, + "loss": 4.7408, + "step": 14731 + }, + { + "epoch": 0.08761537729565134, + "grad_norm": 1.284788727760315, + "learning_rate": 4.9059046563867216e-05, + "loss": 5.2573, + "step": 14732 + }, + { + "epoch": 0.08762132457893235, + "grad_norm": 1.5148005485534668, + "learning_rate": 4.905891961568617e-05, + "loss": 5.0465, + "step": 14733 + }, + { + "epoch": 0.08762727186221334, + "grad_norm": 1.3727401494979858, + "learning_rate": 4.905879265910639e-05, + "loss": 5.0424, + "step": 14734 + }, + { + "epoch": 0.08763321914549434, + "grad_norm": 1.4994157552719116, + "learning_rate": 4.9058665694127945e-05, + "loss": 5.1662, + "step": 14735 + }, + { + "epoch": 0.08763916642877534, + "grad_norm": 1.5002670288085938, + "learning_rate": 4.905853872075087e-05, + "loss": 5.0872, + "step": 14736 + }, + { + "epoch": 0.08764511371205633, + "grad_norm": 1.580439567565918, + "learning_rate": 4.90584117389752e-05, + "loss": 5.1315, + "step": 14737 + }, + { + "epoch": 0.08765106099533733, + "grad_norm": 1.416154384613037, + "learning_rate": 4.9058284748801e-05, + "loss": 5.1066, + "step": 14738 + }, + { + "epoch": 0.08765700827861833, + "grad_norm": 1.5391058921813965, + "learning_rate": 4.905815775022828e-05, + "loss": 5.1724, + "step": 14739 + }, + { + "epoch": 0.08766295556189932, + "grad_norm": 1.20875883102417, + "learning_rate": 4.905803074325712e-05, + "loss": 5.152, + "step": 14740 + }, + { + "epoch": 0.08766890284518032, + "grad_norm": 1.27827787399292, + "learning_rate": 4.9057903727887556e-05, + "loss": 5.0271, + "step": 14741 + }, + { + "epoch": 0.08767485012846132, + "grad_norm": 1.1356613636016846, + "learning_rate": 4.9057776704119615e-05, + "loss": 5.0078, + "step": 14742 + }, + { + "epoch": 0.08768079741174231, + "grad_norm": 1.3931230306625366, + "learning_rate": 4.9057649671953355e-05, + "loss": 5.1253, + "step": 14743 + }, + { + "epoch": 0.08768674469502331, + "grad_norm": 1.553105115890503, + "learning_rate": 4.905752263138882e-05, + "loss": 5.1259, + "step": 14744 + }, + { + "epoch": 0.08769269197830432, + "grad_norm": 1.4004448652267456, + "learning_rate": 4.905739558242605e-05, + "loss": 5.1104, + "step": 14745 + }, + { + "epoch": 0.0876986392615853, + "grad_norm": 1.6295247077941895, + "learning_rate": 4.905726852506509e-05, + "loss": 5.0718, + "step": 14746 + }, + { + "epoch": 0.0877045865448663, + "grad_norm": 1.5966804027557373, + "learning_rate": 4.9057141459306e-05, + "loss": 5.1922, + "step": 14747 + }, + { + "epoch": 0.08771053382814731, + "grad_norm": 1.5448883771896362, + "learning_rate": 4.9057014385148795e-05, + "loss": 4.9715, + "step": 14748 + }, + { + "epoch": 0.0877164811114283, + "grad_norm": 1.5252676010131836, + "learning_rate": 4.905688730259354e-05, + "loss": 5.2128, + "step": 14749 + }, + { + "epoch": 0.0877224283947093, + "grad_norm": 1.387237310409546, + "learning_rate": 4.9056760211640274e-05, + "loss": 5.0933, + "step": 14750 + }, + { + "epoch": 0.0877283756779903, + "grad_norm": 1.3318862915039062, + "learning_rate": 4.905663311228904e-05, + "loss": 5.1849, + "step": 14751 + }, + { + "epoch": 0.08773432296127129, + "grad_norm": 1.4328356981277466, + "learning_rate": 4.905650600453989e-05, + "loss": 5.2287, + "step": 14752 + }, + { + "epoch": 0.08774027024455229, + "grad_norm": 1.4316518306732178, + "learning_rate": 4.905637888839285e-05, + "loss": 4.9774, + "step": 14753 + }, + { + "epoch": 0.08774621752783329, + "grad_norm": 1.1666837930679321, + "learning_rate": 4.9056251763847996e-05, + "loss": 5.2098, + "step": 14754 + }, + { + "epoch": 0.08775216481111428, + "grad_norm": 1.4383636713027954, + "learning_rate": 4.9056124630905333e-05, + "loss": 5.2438, + "step": 14755 + }, + { + "epoch": 0.08775811209439528, + "grad_norm": 2.6009883880615234, + "learning_rate": 4.9055997489564936e-05, + "loss": 5.7232, + "step": 14756 + }, + { + "epoch": 0.08776405937767627, + "grad_norm": 1.3072876930236816, + "learning_rate": 4.905587033982684e-05, + "loss": 5.1811, + "step": 14757 + }, + { + "epoch": 0.08777000666095727, + "grad_norm": 1.2538501024246216, + "learning_rate": 4.9055743181691084e-05, + "loss": 5.1557, + "step": 14758 + }, + { + "epoch": 0.08777595394423827, + "grad_norm": 1.2565419673919678, + "learning_rate": 4.905561601515771e-05, + "loss": 5.129, + "step": 14759 + }, + { + "epoch": 0.08778190122751926, + "grad_norm": 1.3041788339614868, + "learning_rate": 4.905548884022678e-05, + "loss": 5.2048, + "step": 14760 + }, + { + "epoch": 0.08778784851080026, + "grad_norm": 1.4548598527908325, + "learning_rate": 4.905536165689832e-05, + "loss": 5.2405, + "step": 14761 + }, + { + "epoch": 0.08779379579408127, + "grad_norm": 1.1748031377792358, + "learning_rate": 4.905523446517239e-05, + "loss": 5.1804, + "step": 14762 + }, + { + "epoch": 0.08779974307736226, + "grad_norm": 1.210534930229187, + "learning_rate": 4.905510726504902e-05, + "loss": 5.1383, + "step": 14763 + }, + { + "epoch": 0.08780569036064326, + "grad_norm": 1.2154903411865234, + "learning_rate": 4.9054980056528264e-05, + "loss": 5.2757, + "step": 14764 + }, + { + "epoch": 0.08781163764392426, + "grad_norm": 1.4123867750167847, + "learning_rate": 4.9054852839610166e-05, + "loss": 5.1268, + "step": 14765 + }, + { + "epoch": 0.08781758492720525, + "grad_norm": 1.3136295080184937, + "learning_rate": 4.905472561429476e-05, + "loss": 5.2186, + "step": 14766 + }, + { + "epoch": 0.08782353221048625, + "grad_norm": 1.2741068601608276, + "learning_rate": 4.905459838058209e-05, + "loss": 4.9737, + "step": 14767 + }, + { + "epoch": 0.08782947949376725, + "grad_norm": 1.2963054180145264, + "learning_rate": 4.9054471138472225e-05, + "loss": 5.1712, + "step": 14768 + }, + { + "epoch": 0.08783542677704824, + "grad_norm": 1.5352611541748047, + "learning_rate": 4.905434388796519e-05, + "loss": 4.9473, + "step": 14769 + }, + { + "epoch": 0.08784137406032924, + "grad_norm": 1.3399711847305298, + "learning_rate": 4.905421662906103e-05, + "loss": 5.2402, + "step": 14770 + }, + { + "epoch": 0.08784732134361024, + "grad_norm": 1.4278292655944824, + "learning_rate": 4.9054089361759794e-05, + "loss": 4.9331, + "step": 14771 + }, + { + "epoch": 0.08785326862689123, + "grad_norm": 1.5057200193405151, + "learning_rate": 4.905396208606151e-05, + "loss": 5.1553, + "step": 14772 + }, + { + "epoch": 0.08785921591017223, + "grad_norm": 1.4660797119140625, + "learning_rate": 4.905383480196625e-05, + "loss": 5.0792, + "step": 14773 + }, + { + "epoch": 0.08786516319345324, + "grad_norm": 1.4386217594146729, + "learning_rate": 4.905370750947405e-05, + "loss": 4.8363, + "step": 14774 + }, + { + "epoch": 0.08787111047673422, + "grad_norm": 1.4555455446243286, + "learning_rate": 4.905358020858493e-05, + "loss": 4.8934, + "step": 14775 + }, + { + "epoch": 0.08787705776001523, + "grad_norm": 1.5161443948745728, + "learning_rate": 4.905345289929897e-05, + "loss": 4.8227, + "step": 14776 + }, + { + "epoch": 0.08788300504329623, + "grad_norm": 1.2704185247421265, + "learning_rate": 4.9053325581616185e-05, + "loss": 4.9612, + "step": 14777 + }, + { + "epoch": 0.08788895232657722, + "grad_norm": 1.6396795511245728, + "learning_rate": 4.905319825553664e-05, + "loss": 4.8947, + "step": 14778 + }, + { + "epoch": 0.08789489960985822, + "grad_norm": 1.49285888671875, + "learning_rate": 4.905307092106037e-05, + "loss": 5.0814, + "step": 14779 + }, + { + "epoch": 0.08790084689313922, + "grad_norm": 1.3829785585403442, + "learning_rate": 4.9052943578187424e-05, + "loss": 5.3864, + "step": 14780 + }, + { + "epoch": 0.08790679417642021, + "grad_norm": 1.517054557800293, + "learning_rate": 4.905281622691784e-05, + "loss": 5.3053, + "step": 14781 + }, + { + "epoch": 0.08791274145970121, + "grad_norm": 1.491402506828308, + "learning_rate": 4.905268886725167e-05, + "loss": 5.3685, + "step": 14782 + }, + { + "epoch": 0.08791868874298221, + "grad_norm": 1.5034211874008179, + "learning_rate": 4.905256149918895e-05, + "loss": 5.2139, + "step": 14783 + }, + { + "epoch": 0.0879246360262632, + "grad_norm": 1.4021977186203003, + "learning_rate": 4.905243412272974e-05, + "loss": 5.301, + "step": 14784 + }, + { + "epoch": 0.0879305833095442, + "grad_norm": 1.44327974319458, + "learning_rate": 4.9052306737874064e-05, + "loss": 5.296, + "step": 14785 + }, + { + "epoch": 0.08793653059282519, + "grad_norm": 1.4733220338821411, + "learning_rate": 4.905217934462198e-05, + "loss": 5.3302, + "step": 14786 + }, + { + "epoch": 0.08794247787610619, + "grad_norm": 1.3308794498443604, + "learning_rate": 4.9052051942973533e-05, + "loss": 5.1835, + "step": 14787 + }, + { + "epoch": 0.0879484251593872, + "grad_norm": 1.2667236328125, + "learning_rate": 4.905192453292876e-05, + "loss": 5.1801, + "step": 14788 + }, + { + "epoch": 0.08795437244266818, + "grad_norm": 1.3284921646118164, + "learning_rate": 4.90517971144877e-05, + "loss": 5.106, + "step": 14789 + }, + { + "epoch": 0.08796031972594918, + "grad_norm": 1.4089261293411255, + "learning_rate": 4.9051669687650415e-05, + "loss": 5.133, + "step": 14790 + }, + { + "epoch": 0.08796626700923019, + "grad_norm": 1.1701233386993408, + "learning_rate": 4.905154225241694e-05, + "loss": 5.1602, + "step": 14791 + }, + { + "epoch": 0.08797221429251117, + "grad_norm": 1.169570803642273, + "learning_rate": 4.9051414808787324e-05, + "loss": 5.1231, + "step": 14792 + }, + { + "epoch": 0.08797816157579218, + "grad_norm": 1.5104409456253052, + "learning_rate": 4.90512873567616e-05, + "loss": 5.0774, + "step": 14793 + }, + { + "epoch": 0.08798410885907318, + "grad_norm": 1.3065992593765259, + "learning_rate": 4.9051159896339816e-05, + "loss": 4.9547, + "step": 14794 + }, + { + "epoch": 0.08799005614235417, + "grad_norm": 1.6417936086654663, + "learning_rate": 4.905103242752203e-05, + "loss": 5.2734, + "step": 14795 + }, + { + "epoch": 0.08799600342563517, + "grad_norm": 2.1529974937438965, + "learning_rate": 4.905090495030827e-05, + "loss": 5.1999, + "step": 14796 + }, + { + "epoch": 0.08800195070891617, + "grad_norm": 1.6746312379837036, + "learning_rate": 4.90507774646986e-05, + "loss": 4.959, + "step": 14797 + }, + { + "epoch": 0.08800789799219716, + "grad_norm": 1.4422825574874878, + "learning_rate": 4.905064997069304e-05, + "loss": 5.0581, + "step": 14798 + }, + { + "epoch": 0.08801384527547816, + "grad_norm": 1.658833622932434, + "learning_rate": 4.9050522468291646e-05, + "loss": 4.9591, + "step": 14799 + }, + { + "epoch": 0.08801979255875916, + "grad_norm": 1.4971596002578735, + "learning_rate": 4.9050394957494464e-05, + "loss": 5.2515, + "step": 14800 + }, + { + "epoch": 0.08802573984204015, + "grad_norm": 1.5866429805755615, + "learning_rate": 4.9050267438301546e-05, + "loss": 5.1084, + "step": 14801 + }, + { + "epoch": 0.08803168712532115, + "grad_norm": 1.5049015283584595, + "learning_rate": 4.9050139910712925e-05, + "loss": 5.1102, + "step": 14802 + }, + { + "epoch": 0.08803763440860216, + "grad_norm": 1.6711664199829102, + "learning_rate": 4.905001237472864e-05, + "loss": 5.0215, + "step": 14803 + }, + { + "epoch": 0.08804358169188314, + "grad_norm": 1.6390610933303833, + "learning_rate": 4.904988483034875e-05, + "loss": 4.978, + "step": 14804 + }, + { + "epoch": 0.08804952897516415, + "grad_norm": 1.5968292951583862, + "learning_rate": 4.9049757277573295e-05, + "loss": 5.0183, + "step": 14805 + }, + { + "epoch": 0.08805547625844515, + "grad_norm": 1.4864193201065063, + "learning_rate": 4.9049629716402325e-05, + "loss": 5.5199, + "step": 14806 + }, + { + "epoch": 0.08806142354172614, + "grad_norm": 1.5658420324325562, + "learning_rate": 4.904950214683587e-05, + "loss": 5.4906, + "step": 14807 + }, + { + "epoch": 0.08806737082500714, + "grad_norm": 1.5811707973480225, + "learning_rate": 4.9049374568873975e-05, + "loss": 5.5795, + "step": 14808 + }, + { + "epoch": 0.08807331810828814, + "grad_norm": 1.418641448020935, + "learning_rate": 4.90492469825167e-05, + "loss": 5.3616, + "step": 14809 + }, + { + "epoch": 0.08807926539156913, + "grad_norm": 1.323500633239746, + "learning_rate": 4.904911938776408e-05, + "loss": 5.2641, + "step": 14810 + }, + { + "epoch": 0.08808521267485013, + "grad_norm": 1.590867280960083, + "learning_rate": 4.904899178461616e-05, + "loss": 5.3782, + "step": 14811 + }, + { + "epoch": 0.08809115995813113, + "grad_norm": 1.243213176727295, + "learning_rate": 4.904886417307299e-05, + "loss": 5.4743, + "step": 14812 + }, + { + "epoch": 0.08809710724141212, + "grad_norm": 1.5051169395446777, + "learning_rate": 4.9048736553134614e-05, + "loss": 5.3046, + "step": 14813 + }, + { + "epoch": 0.08810305452469312, + "grad_norm": 1.334234356880188, + "learning_rate": 4.904860892480106e-05, + "loss": 5.2673, + "step": 14814 + }, + { + "epoch": 0.08810900180797411, + "grad_norm": 1.4352458715438843, + "learning_rate": 4.904848128807239e-05, + "loss": 5.3465, + "step": 14815 + }, + { + "epoch": 0.08811494909125511, + "grad_norm": 1.6878329515457153, + "learning_rate": 4.904835364294864e-05, + "loss": 5.3467, + "step": 14816 + }, + { + "epoch": 0.08812089637453611, + "grad_norm": 1.542100191116333, + "learning_rate": 4.904822598942986e-05, + "loss": 5.4147, + "step": 14817 + }, + { + "epoch": 0.0881268436578171, + "grad_norm": 1.5099046230316162, + "learning_rate": 4.90480983275161e-05, + "loss": 5.7198, + "step": 14818 + }, + { + "epoch": 0.0881327909410981, + "grad_norm": 1.6120097637176514, + "learning_rate": 4.9047970657207395e-05, + "loss": 5.4417, + "step": 14819 + }, + { + "epoch": 0.0881387382243791, + "grad_norm": 1.455407977104187, + "learning_rate": 4.904784297850379e-05, + "loss": 5.3028, + "step": 14820 + }, + { + "epoch": 0.0881446855076601, + "grad_norm": 1.589712381362915, + "learning_rate": 4.904771529140533e-05, + "loss": 5.2493, + "step": 14821 + }, + { + "epoch": 0.0881506327909411, + "grad_norm": 1.5051584243774414, + "learning_rate": 4.904758759591206e-05, + "loss": 5.2225, + "step": 14822 + }, + { + "epoch": 0.0881565800742221, + "grad_norm": 1.3623727560043335, + "learning_rate": 4.9047459892024026e-05, + "loss": 5.1738, + "step": 14823 + }, + { + "epoch": 0.08816252735750309, + "grad_norm": 1.4643206596374512, + "learning_rate": 4.9047332179741274e-05, + "loss": 5.123, + "step": 14824 + }, + { + "epoch": 0.08816847464078409, + "grad_norm": 1.4233453273773193, + "learning_rate": 4.904720445906384e-05, + "loss": 4.9263, + "step": 14825 + }, + { + "epoch": 0.08817442192406509, + "grad_norm": 1.6479318141937256, + "learning_rate": 4.9047076729991786e-05, + "loss": 4.9663, + "step": 14826 + }, + { + "epoch": 0.08818036920734608, + "grad_norm": 1.4759633541107178, + "learning_rate": 4.9046948992525145e-05, + "loss": 5.0326, + "step": 14827 + }, + { + "epoch": 0.08818631649062708, + "grad_norm": 1.435533046722412, + "learning_rate": 4.904682124666395e-05, + "loss": 5.0819, + "step": 14828 + }, + { + "epoch": 0.08819226377390808, + "grad_norm": 1.4540610313415527, + "learning_rate": 4.904669349240827e-05, + "loss": 5.391, + "step": 14829 + }, + { + "epoch": 0.08819821105718907, + "grad_norm": 1.6308038234710693, + "learning_rate": 4.904656572975814e-05, + "loss": 4.9723, + "step": 14830 + }, + { + "epoch": 0.08820415834047007, + "grad_norm": 1.453600287437439, + "learning_rate": 4.90464379587136e-05, + "loss": 5.1689, + "step": 14831 + }, + { + "epoch": 0.08821010562375108, + "grad_norm": 1.4876199960708618, + "learning_rate": 4.904631017927469e-05, + "loss": 5.1163, + "step": 14832 + }, + { + "epoch": 0.08821605290703206, + "grad_norm": 1.4240463972091675, + "learning_rate": 4.9046182391441466e-05, + "loss": 5.1154, + "step": 14833 + }, + { + "epoch": 0.08822200019031307, + "grad_norm": 1.4176205396652222, + "learning_rate": 4.904605459521397e-05, + "loss": 5.1587, + "step": 14834 + }, + { + "epoch": 0.08822794747359407, + "grad_norm": 1.302998423576355, + "learning_rate": 4.9045926790592244e-05, + "loss": 5.1302, + "step": 14835 + }, + { + "epoch": 0.08823389475687506, + "grad_norm": 1.4490020275115967, + "learning_rate": 4.904579897757633e-05, + "loss": 5.0817, + "step": 14836 + }, + { + "epoch": 0.08823984204015606, + "grad_norm": 1.4430203437805176, + "learning_rate": 4.9045671156166276e-05, + "loss": 5.1334, + "step": 14837 + }, + { + "epoch": 0.08824578932343706, + "grad_norm": 1.326277494430542, + "learning_rate": 4.9045543326362134e-05, + "loss": 5.3292, + "step": 14838 + }, + { + "epoch": 0.08825173660671805, + "grad_norm": 1.373415470123291, + "learning_rate": 4.9045415488163936e-05, + "loss": 5.454, + "step": 14839 + }, + { + "epoch": 0.08825768388999905, + "grad_norm": 1.4334250688552856, + "learning_rate": 4.904528764157173e-05, + "loss": 5.2735, + "step": 14840 + }, + { + "epoch": 0.08826363117328005, + "grad_norm": 1.4029041528701782, + "learning_rate": 4.904515978658556e-05, + "loss": 5.0549, + "step": 14841 + }, + { + "epoch": 0.08826957845656104, + "grad_norm": 1.355177879333496, + "learning_rate": 4.904503192320548e-05, + "loss": 5.2569, + "step": 14842 + }, + { + "epoch": 0.08827552573984204, + "grad_norm": 1.2063989639282227, + "learning_rate": 4.904490405143153e-05, + "loss": 5.2469, + "step": 14843 + }, + { + "epoch": 0.08828147302312303, + "grad_norm": 1.2290265560150146, + "learning_rate": 4.904477617126374e-05, + "loss": 5.255, + "step": 14844 + }, + { + "epoch": 0.08828742030640403, + "grad_norm": 1.0648494958877563, + "learning_rate": 4.904464828270218e-05, + "loss": 5.2423, + "step": 14845 + }, + { + "epoch": 0.08829336758968503, + "grad_norm": 1.362572431564331, + "learning_rate": 4.904452038574687e-05, + "loss": 5.3856, + "step": 14846 + }, + { + "epoch": 0.08829931487296602, + "grad_norm": 1.3004114627838135, + "learning_rate": 4.9044392480397886e-05, + "loss": 5.0672, + "step": 14847 + }, + { + "epoch": 0.08830526215624702, + "grad_norm": 1.4852789640426636, + "learning_rate": 4.904426456665523e-05, + "loss": 5.2145, + "step": 14848 + }, + { + "epoch": 0.08831120943952803, + "grad_norm": 1.4221493005752563, + "learning_rate": 4.9044136644518976e-05, + "loss": 5.4544, + "step": 14849 + }, + { + "epoch": 0.08831715672280901, + "grad_norm": 1.4444363117218018, + "learning_rate": 4.904400871398917e-05, + "loss": 5.3342, + "step": 14850 + }, + { + "epoch": 0.08832310400609002, + "grad_norm": 1.1723617315292358, + "learning_rate": 4.904388077506585e-05, + "loss": 5.3846, + "step": 14851 + }, + { + "epoch": 0.08832905128937102, + "grad_norm": 1.3458356857299805, + "learning_rate": 4.904375282774905e-05, + "loss": 5.3903, + "step": 14852 + }, + { + "epoch": 0.08833499857265201, + "grad_norm": 1.4839876890182495, + "learning_rate": 4.904362487203883e-05, + "loss": 5.0889, + "step": 14853 + }, + { + "epoch": 0.08834094585593301, + "grad_norm": 1.6487696170806885, + "learning_rate": 4.904349690793523e-05, + "loss": 5.0904, + "step": 14854 + }, + { + "epoch": 0.08834689313921401, + "grad_norm": 1.5201997756958008, + "learning_rate": 4.904336893543829e-05, + "loss": 4.9017, + "step": 14855 + }, + { + "epoch": 0.088352840422495, + "grad_norm": 1.5502886772155762, + "learning_rate": 4.904324095454806e-05, + "loss": 4.931, + "step": 14856 + }, + { + "epoch": 0.088358787705776, + "grad_norm": 1.4996228218078613, + "learning_rate": 4.904311296526458e-05, + "loss": 5.0773, + "step": 14857 + }, + { + "epoch": 0.088364734989057, + "grad_norm": 1.7004456520080566, + "learning_rate": 4.90429849675879e-05, + "loss": 4.9913, + "step": 14858 + }, + { + "epoch": 0.08837068227233799, + "grad_norm": 1.426007866859436, + "learning_rate": 4.904285696151806e-05, + "loss": 5.1312, + "step": 14859 + }, + { + "epoch": 0.088376629555619, + "grad_norm": 1.4049350023269653, + "learning_rate": 4.904272894705512e-05, + "loss": 5.0539, + "step": 14860 + }, + { + "epoch": 0.0883825768389, + "grad_norm": 1.558273434638977, + "learning_rate": 4.9042600924199096e-05, + "loss": 5.0822, + "step": 14861 + }, + { + "epoch": 0.08838852412218098, + "grad_norm": 1.6177934408187866, + "learning_rate": 4.9042472892950055e-05, + "loss": 5.1646, + "step": 14862 + }, + { + "epoch": 0.08839447140546199, + "grad_norm": 1.5152839422225952, + "learning_rate": 4.904234485330803e-05, + "loss": 5.0144, + "step": 14863 + }, + { + "epoch": 0.08840041868874299, + "grad_norm": 1.474231243133545, + "learning_rate": 4.904221680527308e-05, + "loss": 5.1063, + "step": 14864 + }, + { + "epoch": 0.08840636597202398, + "grad_norm": 1.5897177457809448, + "learning_rate": 4.904208874884523e-05, + "loss": 4.9724, + "step": 14865 + }, + { + "epoch": 0.08841231325530498, + "grad_norm": 1.604368805885315, + "learning_rate": 4.904196068402454e-05, + "loss": 4.8905, + "step": 14866 + }, + { + "epoch": 0.08841826053858598, + "grad_norm": 1.338458776473999, + "learning_rate": 4.904183261081105e-05, + "loss": 4.7829, + "step": 14867 + }, + { + "epoch": 0.08842420782186697, + "grad_norm": 1.62189781665802, + "learning_rate": 4.9041704529204806e-05, + "loss": 4.8025, + "step": 14868 + }, + { + "epoch": 0.08843015510514797, + "grad_norm": 1.555298089981079, + "learning_rate": 4.904157643920585e-05, + "loss": 4.9098, + "step": 14869 + }, + { + "epoch": 0.08843610238842897, + "grad_norm": 1.5110834836959839, + "learning_rate": 4.904144834081423e-05, + "loss": 4.8648, + "step": 14870 + }, + { + "epoch": 0.08844204967170996, + "grad_norm": 1.59073805809021, + "learning_rate": 4.904132023402999e-05, + "loss": 4.8997, + "step": 14871 + }, + { + "epoch": 0.08844799695499096, + "grad_norm": 1.5218732357025146, + "learning_rate": 4.904119211885316e-05, + "loss": 5.352, + "step": 14872 + }, + { + "epoch": 0.08845394423827196, + "grad_norm": 1.5263079404830933, + "learning_rate": 4.904106399528382e-05, + "loss": 4.8921, + "step": 14873 + }, + { + "epoch": 0.08845989152155295, + "grad_norm": 1.6151986122131348, + "learning_rate": 4.904093586332198e-05, + "loss": 5.0086, + "step": 14874 + }, + { + "epoch": 0.08846583880483395, + "grad_norm": 1.4971787929534912, + "learning_rate": 4.90408077229677e-05, + "loss": 5.0119, + "step": 14875 + }, + { + "epoch": 0.08847178608811494, + "grad_norm": 1.4897308349609375, + "learning_rate": 4.904067957422102e-05, + "loss": 5.0175, + "step": 14876 + }, + { + "epoch": 0.08847773337139594, + "grad_norm": 1.4023786783218384, + "learning_rate": 4.904055141708199e-05, + "loss": 5.0361, + "step": 14877 + }, + { + "epoch": 0.08848368065467695, + "grad_norm": 1.4664498567581177, + "learning_rate": 4.904042325155065e-05, + "loss": 4.9784, + "step": 14878 + }, + { + "epoch": 0.08848962793795793, + "grad_norm": 1.390824556350708, + "learning_rate": 4.904029507762704e-05, + "loss": 4.9922, + "step": 14879 + }, + { + "epoch": 0.08849557522123894, + "grad_norm": 1.9508315324783325, + "learning_rate": 4.904016689531122e-05, + "loss": 5.6352, + "step": 14880 + }, + { + "epoch": 0.08850152250451994, + "grad_norm": 1.4192322492599487, + "learning_rate": 4.904003870460323e-05, + "loss": 5.0654, + "step": 14881 + }, + { + "epoch": 0.08850746978780093, + "grad_norm": 1.5868372917175293, + "learning_rate": 4.903991050550311e-05, + "loss": 4.9631, + "step": 14882 + }, + { + "epoch": 0.08851341707108193, + "grad_norm": 1.405555009841919, + "learning_rate": 4.903978229801089e-05, + "loss": 5.1311, + "step": 14883 + }, + { + "epoch": 0.08851936435436293, + "grad_norm": 1.453817367553711, + "learning_rate": 4.9039654082126646e-05, + "loss": 5.0866, + "step": 14884 + }, + { + "epoch": 0.08852531163764392, + "grad_norm": 1.5051809549331665, + "learning_rate": 4.9039525857850404e-05, + "loss": 5.1606, + "step": 14885 + }, + { + "epoch": 0.08853125892092492, + "grad_norm": 1.5323255062103271, + "learning_rate": 4.9039397625182206e-05, + "loss": 5.1564, + "step": 14886 + }, + { + "epoch": 0.08853720620420592, + "grad_norm": 1.5018506050109863, + "learning_rate": 4.903926938412211e-05, + "loss": 4.9672, + "step": 14887 + }, + { + "epoch": 0.08854315348748691, + "grad_norm": 1.488289713859558, + "learning_rate": 4.903914113467015e-05, + "loss": 4.882, + "step": 14888 + }, + { + "epoch": 0.08854910077076791, + "grad_norm": 1.434045672416687, + "learning_rate": 4.903901287682637e-05, + "loss": 5.0748, + "step": 14889 + }, + { + "epoch": 0.08855504805404892, + "grad_norm": 1.5172244310379028, + "learning_rate": 4.903888461059083e-05, + "loss": 5.065, + "step": 14890 + }, + { + "epoch": 0.0885609953373299, + "grad_norm": 1.545283555984497, + "learning_rate": 4.903875633596356e-05, + "loss": 5.2187, + "step": 14891 + }, + { + "epoch": 0.0885669426206109, + "grad_norm": 1.3149688243865967, + "learning_rate": 4.90386280529446e-05, + "loss": 4.9977, + "step": 14892 + }, + { + "epoch": 0.08857288990389191, + "grad_norm": 1.4925106763839722, + "learning_rate": 4.903849976153401e-05, + "loss": 5.0622, + "step": 14893 + }, + { + "epoch": 0.0885788371871729, + "grad_norm": 1.6073296070098877, + "learning_rate": 4.903837146173183e-05, + "loss": 5.0823, + "step": 14894 + }, + { + "epoch": 0.0885847844704539, + "grad_norm": 1.2879148721694946, + "learning_rate": 4.9038243153538096e-05, + "loss": 5.1574, + "step": 14895 + }, + { + "epoch": 0.0885907317537349, + "grad_norm": 1.6396079063415527, + "learning_rate": 4.903811483695287e-05, + "loss": 5.1748, + "step": 14896 + }, + { + "epoch": 0.08859667903701589, + "grad_norm": 1.426180124282837, + "learning_rate": 4.903798651197618e-05, + "loss": 5.0374, + "step": 14897 + }, + { + "epoch": 0.08860262632029689, + "grad_norm": 1.3685684204101562, + "learning_rate": 4.9037858178608076e-05, + "loss": 4.9373, + "step": 14898 + }, + { + "epoch": 0.08860857360357789, + "grad_norm": 1.5495455265045166, + "learning_rate": 4.903772983684861e-05, + "loss": 5.0696, + "step": 14899 + }, + { + "epoch": 0.08861452088685888, + "grad_norm": 1.4423854351043701, + "learning_rate": 4.9037601486697815e-05, + "loss": 5.1359, + "step": 14900 + }, + { + "epoch": 0.08862046817013988, + "grad_norm": 1.4704400300979614, + "learning_rate": 4.9037473128155745e-05, + "loss": 5.0438, + "step": 14901 + }, + { + "epoch": 0.08862641545342088, + "grad_norm": 1.49704909324646, + "learning_rate": 4.903734476122244e-05, + "loss": 5.0305, + "step": 14902 + }, + { + "epoch": 0.08863236273670187, + "grad_norm": 1.3732075691223145, + "learning_rate": 4.903721638589795e-05, + "loss": 4.9659, + "step": 14903 + }, + { + "epoch": 0.08863831001998287, + "grad_norm": 1.5920335054397583, + "learning_rate": 4.903708800218231e-05, + "loss": 4.9936, + "step": 14904 + }, + { + "epoch": 0.08864425730326386, + "grad_norm": 1.6084437370300293, + "learning_rate": 4.9036959610075575e-05, + "loss": 5.0048, + "step": 14905 + }, + { + "epoch": 0.08865020458654486, + "grad_norm": 1.2329050302505493, + "learning_rate": 4.903683120957778e-05, + "loss": 4.9729, + "step": 14906 + }, + { + "epoch": 0.08865615186982587, + "grad_norm": 1.4001328945159912, + "learning_rate": 4.903670280068898e-05, + "loss": 4.9577, + "step": 14907 + }, + { + "epoch": 0.08866209915310685, + "grad_norm": 1.3499484062194824, + "learning_rate": 4.903657438340921e-05, + "loss": 4.8696, + "step": 14908 + }, + { + "epoch": 0.08866804643638786, + "grad_norm": 1.3606812953948975, + "learning_rate": 4.903644595773853e-05, + "loss": 4.9142, + "step": 14909 + }, + { + "epoch": 0.08867399371966886, + "grad_norm": 1.3275173902511597, + "learning_rate": 4.9036317523676964e-05, + "loss": 5.032, + "step": 14910 + }, + { + "epoch": 0.08867994100294985, + "grad_norm": 1.5485349893569946, + "learning_rate": 4.903618908122458e-05, + "loss": 4.9252, + "step": 14911 + }, + { + "epoch": 0.08868588828623085, + "grad_norm": 1.4325098991394043, + "learning_rate": 4.9036060630381395e-05, + "loss": 4.9971, + "step": 14912 + }, + { + "epoch": 0.08869183556951185, + "grad_norm": 1.4953216314315796, + "learning_rate": 4.903593217114748e-05, + "loss": 4.8228, + "step": 14913 + }, + { + "epoch": 0.08869778285279284, + "grad_norm": 1.4761654138565063, + "learning_rate": 4.9035803703522876e-05, + "loss": 4.8365, + "step": 14914 + }, + { + "epoch": 0.08870373013607384, + "grad_norm": 1.3572559356689453, + "learning_rate": 4.9035675227507615e-05, + "loss": 4.8409, + "step": 14915 + }, + { + "epoch": 0.08870967741935484, + "grad_norm": 1.3793766498565674, + "learning_rate": 4.903554674310175e-05, + "loss": 4.8748, + "step": 14916 + }, + { + "epoch": 0.08871562470263583, + "grad_norm": 1.2097266912460327, + "learning_rate": 4.9035418250305314e-05, + "loss": 4.9695, + "step": 14917 + }, + { + "epoch": 0.08872157198591683, + "grad_norm": 1.5097788572311401, + "learning_rate": 4.903528974911837e-05, + "loss": 4.9205, + "step": 14918 + }, + { + "epoch": 0.08872751926919784, + "grad_norm": 1.474219560623169, + "learning_rate": 4.903516123954095e-05, + "loss": 4.9382, + "step": 14919 + }, + { + "epoch": 0.08873346655247882, + "grad_norm": 1.4695779085159302, + "learning_rate": 4.903503272157311e-05, + "loss": 5.1486, + "step": 14920 + }, + { + "epoch": 0.08873941383575983, + "grad_norm": 1.6874669790267944, + "learning_rate": 4.903490419521488e-05, + "loss": 5.6441, + "step": 14921 + }, + { + "epoch": 0.08874536111904083, + "grad_norm": 1.5862348079681396, + "learning_rate": 4.903477566046632e-05, + "loss": 5.1457, + "step": 14922 + }, + { + "epoch": 0.08875130840232182, + "grad_norm": 1.5781593322753906, + "learning_rate": 4.903464711732747e-05, + "loss": 4.915, + "step": 14923 + }, + { + "epoch": 0.08875725568560282, + "grad_norm": 1.5252950191497803, + "learning_rate": 4.903451856579837e-05, + "loss": 5.0672, + "step": 14924 + }, + { + "epoch": 0.08876320296888382, + "grad_norm": 1.575958013534546, + "learning_rate": 4.9034390005879065e-05, + "loss": 4.9914, + "step": 14925 + }, + { + "epoch": 0.08876915025216481, + "grad_norm": 1.3837618827819824, + "learning_rate": 4.90342614375696e-05, + "loss": 5.1778, + "step": 14926 + }, + { + "epoch": 0.08877509753544581, + "grad_norm": 1.4716275930404663, + "learning_rate": 4.9034132860870036e-05, + "loss": 5.2625, + "step": 14927 + }, + { + "epoch": 0.08878104481872681, + "grad_norm": 1.2883623838424683, + "learning_rate": 4.90340042757804e-05, + "loss": 5.2357, + "step": 14928 + }, + { + "epoch": 0.0887869921020078, + "grad_norm": 1.521010398864746, + "learning_rate": 4.9033875682300736e-05, + "loss": 5.4941, + "step": 14929 + }, + { + "epoch": 0.0887929393852888, + "grad_norm": 1.5457875728607178, + "learning_rate": 4.903374708043109e-05, + "loss": 5.3108, + "step": 14930 + }, + { + "epoch": 0.0887988866685698, + "grad_norm": 1.4583250284194946, + "learning_rate": 4.903361847017152e-05, + "loss": 5.425, + "step": 14931 + }, + { + "epoch": 0.08880483395185079, + "grad_norm": 1.561854362487793, + "learning_rate": 4.903348985152206e-05, + "loss": 5.4267, + "step": 14932 + }, + { + "epoch": 0.0888107812351318, + "grad_norm": 1.6274350881576538, + "learning_rate": 4.9033361224482756e-05, + "loss": 5.3266, + "step": 14933 + }, + { + "epoch": 0.08881672851841278, + "grad_norm": 1.3476616144180298, + "learning_rate": 4.903323258905366e-05, + "loss": 5.248, + "step": 14934 + }, + { + "epoch": 0.08882267580169378, + "grad_norm": 1.3584541082382202, + "learning_rate": 4.90331039452348e-05, + "loss": 5.3101, + "step": 14935 + }, + { + "epoch": 0.08882862308497479, + "grad_norm": 1.5269302129745483, + "learning_rate": 4.903297529302624e-05, + "loss": 5.3451, + "step": 14936 + }, + { + "epoch": 0.08883457036825577, + "grad_norm": 1.5320923328399658, + "learning_rate": 4.903284663242801e-05, + "loss": 5.4289, + "step": 14937 + }, + { + "epoch": 0.08884051765153678, + "grad_norm": 1.5647650957107544, + "learning_rate": 4.9032717963440166e-05, + "loss": 5.2925, + "step": 14938 + }, + { + "epoch": 0.08884646493481778, + "grad_norm": 1.3379693031311035, + "learning_rate": 4.9032589286062744e-05, + "loss": 5.3314, + "step": 14939 + }, + { + "epoch": 0.08885241221809877, + "grad_norm": 1.5872068405151367, + "learning_rate": 4.90324606002958e-05, + "loss": 5.3521, + "step": 14940 + }, + { + "epoch": 0.08885835950137977, + "grad_norm": 1.473799228668213, + "learning_rate": 4.9032331906139373e-05, + "loss": 5.3697, + "step": 14941 + }, + { + "epoch": 0.08886430678466077, + "grad_norm": 2.2111928462982178, + "learning_rate": 4.90322032035935e-05, + "loss": 5.0139, + "step": 14942 + }, + { + "epoch": 0.08887025406794176, + "grad_norm": 1.386910319328308, + "learning_rate": 4.903207449265824e-05, + "loss": 5.3982, + "step": 14943 + }, + { + "epoch": 0.08887620135122276, + "grad_norm": 1.4972623586654663, + "learning_rate": 4.9031945773333624e-05, + "loss": 5.4207, + "step": 14944 + }, + { + "epoch": 0.08888214863450376, + "grad_norm": 1.6061536073684692, + "learning_rate": 4.903181704561971e-05, + "loss": 5.4265, + "step": 14945 + }, + { + "epoch": 0.08888809591778475, + "grad_norm": 1.5003243684768677, + "learning_rate": 4.903168830951653e-05, + "loss": 5.2323, + "step": 14946 + }, + { + "epoch": 0.08889404320106575, + "grad_norm": 1.4466320276260376, + "learning_rate": 4.9031559565024144e-05, + "loss": 5.3054, + "step": 14947 + }, + { + "epoch": 0.08889999048434676, + "grad_norm": 1.4495269060134888, + "learning_rate": 4.9031430812142584e-05, + "loss": 5.2725, + "step": 14948 + }, + { + "epoch": 0.08890593776762774, + "grad_norm": 1.2909798622131348, + "learning_rate": 4.9031302050871896e-05, + "loss": 5.13, + "step": 14949 + }, + { + "epoch": 0.08891188505090875, + "grad_norm": 1.368377685546875, + "learning_rate": 4.903117328121214e-05, + "loss": 5.0471, + "step": 14950 + }, + { + "epoch": 0.08891783233418975, + "grad_norm": 1.3496042490005493, + "learning_rate": 4.903104450316334e-05, + "loss": 5.1209, + "step": 14951 + }, + { + "epoch": 0.08892377961747074, + "grad_norm": 1.593047022819519, + "learning_rate": 4.9030915716725554e-05, + "loss": 5.2551, + "step": 14952 + }, + { + "epoch": 0.08892972690075174, + "grad_norm": 1.3550326824188232, + "learning_rate": 4.903078692189882e-05, + "loss": 5.2543, + "step": 14953 + }, + { + "epoch": 0.08893567418403274, + "grad_norm": 1.4302785396575928, + "learning_rate": 4.903065811868319e-05, + "loss": 5.2828, + "step": 14954 + }, + { + "epoch": 0.08894162146731373, + "grad_norm": 1.578244686126709, + "learning_rate": 4.903052930707871e-05, + "loss": 5.0593, + "step": 14955 + }, + { + "epoch": 0.08894756875059473, + "grad_norm": 1.248634696006775, + "learning_rate": 4.903040048708541e-05, + "loss": 5.0644, + "step": 14956 + }, + { + "epoch": 0.08895351603387573, + "grad_norm": 1.4040237665176392, + "learning_rate": 4.903027165870336e-05, + "loss": 5.0951, + "step": 14957 + }, + { + "epoch": 0.08895946331715672, + "grad_norm": 1.1941477060317993, + "learning_rate": 4.903014282193258e-05, + "loss": 5.0298, + "step": 14958 + }, + { + "epoch": 0.08896541060043772, + "grad_norm": 1.4292995929718018, + "learning_rate": 4.9030013976773125e-05, + "loss": 5.1567, + "step": 14959 + }, + { + "epoch": 0.08897135788371872, + "grad_norm": 1.4789859056472778, + "learning_rate": 4.902988512322505e-05, + "loss": 5.2172, + "step": 14960 + }, + { + "epoch": 0.08897730516699971, + "grad_norm": 2.160266876220703, + "learning_rate": 4.9029756261288376e-05, + "loss": 5.3458, + "step": 14961 + }, + { + "epoch": 0.08898325245028071, + "grad_norm": 1.8164606094360352, + "learning_rate": 4.902962739096317e-05, + "loss": 5.2795, + "step": 14962 + }, + { + "epoch": 0.0889891997335617, + "grad_norm": 2.0879664421081543, + "learning_rate": 4.902949851224947e-05, + "loss": 5.595, + "step": 14963 + }, + { + "epoch": 0.0889951470168427, + "grad_norm": 2.59543514251709, + "learning_rate": 4.9029369625147324e-05, + "loss": 5.3626, + "step": 14964 + }, + { + "epoch": 0.0890010943001237, + "grad_norm": 2.0679430961608887, + "learning_rate": 4.9029240729656764e-05, + "loss": 5.4222, + "step": 14965 + }, + { + "epoch": 0.0890070415834047, + "grad_norm": 1.90644109249115, + "learning_rate": 4.902911182577785e-05, + "loss": 6.1042, + "step": 14966 + }, + { + "epoch": 0.0890129888666857, + "grad_norm": 1.8565638065338135, + "learning_rate": 4.9028982913510626e-05, + "loss": 6.0312, + "step": 14967 + }, + { + "epoch": 0.0890189361499667, + "grad_norm": 1.717623233795166, + "learning_rate": 4.902885399285512e-05, + "loss": 5.794, + "step": 14968 + }, + { + "epoch": 0.08902488343324769, + "grad_norm": 2.2094457149505615, + "learning_rate": 4.90287250638114e-05, + "loss": 5.2517, + "step": 14969 + }, + { + "epoch": 0.08903083071652869, + "grad_norm": 2.2559561729431152, + "learning_rate": 4.9028596126379493e-05, + "loss": 5.2155, + "step": 14970 + }, + { + "epoch": 0.08903677799980969, + "grad_norm": 2.5394740104675293, + "learning_rate": 4.9028467180559455e-05, + "loss": 5.0829, + "step": 14971 + }, + { + "epoch": 0.08904272528309068, + "grad_norm": 1.9542546272277832, + "learning_rate": 4.902833822635133e-05, + "loss": 4.856, + "step": 14972 + }, + { + "epoch": 0.08904867256637168, + "grad_norm": 1.9541314840316772, + "learning_rate": 4.9028209263755154e-05, + "loss": 4.9858, + "step": 14973 + }, + { + "epoch": 0.08905461984965268, + "grad_norm": 1.8625229597091675, + "learning_rate": 4.9028080292770986e-05, + "loss": 4.976, + "step": 14974 + }, + { + "epoch": 0.08906056713293367, + "grad_norm": 2.254417657852173, + "learning_rate": 4.9027951313398855e-05, + "loss": 4.9765, + "step": 14975 + }, + { + "epoch": 0.08906651441621467, + "grad_norm": 2.3143160343170166, + "learning_rate": 4.902782232563882e-05, + "loss": 4.9562, + "step": 14976 + }, + { + "epoch": 0.08907246169949568, + "grad_norm": 2.320388078689575, + "learning_rate": 4.902769332949092e-05, + "loss": 4.9988, + "step": 14977 + }, + { + "epoch": 0.08907840898277666, + "grad_norm": 2.378101348876953, + "learning_rate": 4.90275643249552e-05, + "loss": 5.0869, + "step": 14978 + }, + { + "epoch": 0.08908435626605767, + "grad_norm": 2.5663437843322754, + "learning_rate": 4.90274353120317e-05, + "loss": 5.1124, + "step": 14979 + }, + { + "epoch": 0.08909030354933867, + "grad_norm": 2.2866733074188232, + "learning_rate": 4.902730629072048e-05, + "loss": 5.0564, + "step": 14980 + }, + { + "epoch": 0.08909625083261966, + "grad_norm": 2.060153007507324, + "learning_rate": 4.902717726102157e-05, + "loss": 4.9419, + "step": 14981 + }, + { + "epoch": 0.08910219811590066, + "grad_norm": 2.1555984020233154, + "learning_rate": 4.902704822293502e-05, + "loss": 4.6593, + "step": 14982 + }, + { + "epoch": 0.08910814539918166, + "grad_norm": 2.2045845985412598, + "learning_rate": 4.902691917646088e-05, + "loss": 4.6824, + "step": 14983 + }, + { + "epoch": 0.08911409268246265, + "grad_norm": 2.2891733646392822, + "learning_rate": 4.9026790121599185e-05, + "loss": 4.6378, + "step": 14984 + }, + { + "epoch": 0.08912003996574365, + "grad_norm": 2.0503318309783936, + "learning_rate": 4.902666105834999e-05, + "loss": 4.8051, + "step": 14985 + }, + { + "epoch": 0.08912598724902465, + "grad_norm": 2.2125399112701416, + "learning_rate": 4.9026531986713336e-05, + "loss": 5.0773, + "step": 14986 + }, + { + "epoch": 0.08913193453230564, + "grad_norm": 2.1177804470062256, + "learning_rate": 4.902640290668927e-05, + "loss": 5.0995, + "step": 14987 + }, + { + "epoch": 0.08913788181558664, + "grad_norm": 2.1028857231140137, + "learning_rate": 4.902627381827783e-05, + "loss": 4.3883, + "step": 14988 + }, + { + "epoch": 0.08914382909886764, + "grad_norm": 1.9426429271697998, + "learning_rate": 4.9026144721479065e-05, + "loss": 4.6539, + "step": 14989 + }, + { + "epoch": 0.08914977638214863, + "grad_norm": 2.2325892448425293, + "learning_rate": 4.902601561629302e-05, + "loss": 4.731, + "step": 14990 + }, + { + "epoch": 0.08915572366542963, + "grad_norm": 2.3903300762176514, + "learning_rate": 4.9025886502719756e-05, + "loss": 4.5786, + "step": 14991 + }, + { + "epoch": 0.08916167094871062, + "grad_norm": 2.368431806564331, + "learning_rate": 4.9025757380759284e-05, + "loss": 4.8904, + "step": 14992 + }, + { + "epoch": 0.08916761823199162, + "grad_norm": 2.1727442741394043, + "learning_rate": 4.902562825041168e-05, + "loss": 4.6276, + "step": 14993 + }, + { + "epoch": 0.08917356551527263, + "grad_norm": 2.2038626670837402, + "learning_rate": 4.9025499111676975e-05, + "loss": 4.7451, + "step": 14994 + }, + { + "epoch": 0.08917951279855361, + "grad_norm": 2.3933217525482178, + "learning_rate": 4.902536996455521e-05, + "loss": 4.8129, + "step": 14995 + }, + { + "epoch": 0.08918546008183462, + "grad_norm": 2.473212242126465, + "learning_rate": 4.902524080904645e-05, + "loss": 4.6171, + "step": 14996 + }, + { + "epoch": 0.08919140736511562, + "grad_norm": 2.2226645946502686, + "learning_rate": 4.902511164515071e-05, + "loss": 4.3847, + "step": 14997 + }, + { + "epoch": 0.0891973546483966, + "grad_norm": 2.0874104499816895, + "learning_rate": 4.9024982472868065e-05, + "loss": 4.801, + "step": 14998 + }, + { + "epoch": 0.08920330193167761, + "grad_norm": 1.9831374883651733, + "learning_rate": 4.902485329219854e-05, + "loss": 4.8995, + "step": 14999 + }, + { + "epoch": 0.08920924921495861, + "grad_norm": 2.1662073135375977, + "learning_rate": 4.9024724103142196e-05, + "loss": 4.7221, + "step": 15000 + }, + { + "epoch": 0.0892151964982396, + "grad_norm": 2.335336685180664, + "learning_rate": 4.902459490569906e-05, + "loss": 4.5051, + "step": 15001 + }, + { + "epoch": 0.0892211437815206, + "grad_norm": 2.2647337913513184, + "learning_rate": 4.902446569986919e-05, + "loss": 4.5274, + "step": 15002 + }, + { + "epoch": 0.0892270910648016, + "grad_norm": 2.1781129837036133, + "learning_rate": 4.9024336485652625e-05, + "loss": 4.5661, + "step": 15003 + }, + { + "epoch": 0.08923303834808259, + "grad_norm": 2.6452128887176514, + "learning_rate": 4.902420726304941e-05, + "loss": 5.0087, + "step": 15004 + }, + { + "epoch": 0.0892389856313636, + "grad_norm": 2.10276460647583, + "learning_rate": 4.90240780320596e-05, + "loss": 4.5003, + "step": 15005 + }, + { + "epoch": 0.0892449329146446, + "grad_norm": 2.1297876834869385, + "learning_rate": 4.902394879268323e-05, + "loss": 4.7603, + "step": 15006 + }, + { + "epoch": 0.08925088019792558, + "grad_norm": 2.288257122039795, + "learning_rate": 4.902381954492033e-05, + "loss": 4.7433, + "step": 15007 + }, + { + "epoch": 0.08925682748120659, + "grad_norm": 2.422492742538452, + "learning_rate": 4.902369028877098e-05, + "loss": 4.7823, + "step": 15008 + }, + { + "epoch": 0.08926277476448759, + "grad_norm": 2.4264109134674072, + "learning_rate": 4.9023561024235215e-05, + "loss": 4.9725, + "step": 15009 + }, + { + "epoch": 0.08926872204776858, + "grad_norm": 2.191776752471924, + "learning_rate": 4.902343175131307e-05, + "loss": 4.7893, + "step": 15010 + }, + { + "epoch": 0.08927466933104958, + "grad_norm": 2.0434861183166504, + "learning_rate": 4.9023302470004584e-05, + "loss": 5.3321, + "step": 15011 + }, + { + "epoch": 0.08928061661433058, + "grad_norm": 2.3108692169189453, + "learning_rate": 4.902317318030981e-05, + "loss": 4.848, + "step": 15012 + }, + { + "epoch": 0.08928656389761157, + "grad_norm": 1.8814477920532227, + "learning_rate": 4.9023043882228805e-05, + "loss": 4.9666, + "step": 15013 + }, + { + "epoch": 0.08929251118089257, + "grad_norm": 1.7109707593917847, + "learning_rate": 4.902291457576159e-05, + "loss": 5.0996, + "step": 15014 + }, + { + "epoch": 0.08929845846417357, + "grad_norm": 1.4246928691864014, + "learning_rate": 4.902278526090823e-05, + "loss": 5.1413, + "step": 15015 + }, + { + "epoch": 0.08930440574745456, + "grad_norm": 1.5714298486709595, + "learning_rate": 4.902265593766877e-05, + "loss": 5.4028, + "step": 15016 + }, + { + "epoch": 0.08931035303073556, + "grad_norm": 1.4553309679031372, + "learning_rate": 4.902252660604324e-05, + "loss": 5.1903, + "step": 15017 + }, + { + "epoch": 0.08931630031401656, + "grad_norm": 1.3266233205795288, + "learning_rate": 4.902239726603171e-05, + "loss": 5.1093, + "step": 15018 + }, + { + "epoch": 0.08932224759729755, + "grad_norm": 1.3145966529846191, + "learning_rate": 4.902226791763419e-05, + "loss": 5.0704, + "step": 15019 + }, + { + "epoch": 0.08932819488057855, + "grad_norm": 1.4367384910583496, + "learning_rate": 4.9022138560850754e-05, + "loss": 4.9669, + "step": 15020 + }, + { + "epoch": 0.08933414216385954, + "grad_norm": 1.4239497184753418, + "learning_rate": 4.902200919568144e-05, + "loss": 5.1035, + "step": 15021 + }, + { + "epoch": 0.08934008944714054, + "grad_norm": 1.323853611946106, + "learning_rate": 4.9021879822126284e-05, + "loss": 4.989, + "step": 15022 + }, + { + "epoch": 0.08934603673042155, + "grad_norm": 1.596498727798462, + "learning_rate": 4.9021750440185345e-05, + "loss": 5.0445, + "step": 15023 + }, + { + "epoch": 0.08935198401370253, + "grad_norm": 1.3866841793060303, + "learning_rate": 4.902162104985865e-05, + "loss": 4.9832, + "step": 15024 + }, + { + "epoch": 0.08935793129698354, + "grad_norm": 1.2495089769363403, + "learning_rate": 4.9021491651146265e-05, + "loss": 5.1337, + "step": 15025 + }, + { + "epoch": 0.08936387858026454, + "grad_norm": 1.2082443237304688, + "learning_rate": 4.902136224404822e-05, + "loss": 5.1038, + "step": 15026 + }, + { + "epoch": 0.08936982586354553, + "grad_norm": 1.5153082609176636, + "learning_rate": 4.9021232828564564e-05, + "loss": 5.122, + "step": 15027 + }, + { + "epoch": 0.08937577314682653, + "grad_norm": 1.5340677499771118, + "learning_rate": 4.902110340469536e-05, + "loss": 5.2675, + "step": 15028 + }, + { + "epoch": 0.08938172043010753, + "grad_norm": 1.9367091655731201, + "learning_rate": 4.9020973972440624e-05, + "loss": 5.4528, + "step": 15029 + }, + { + "epoch": 0.08938766771338852, + "grad_norm": 1.7637518644332886, + "learning_rate": 4.902084453180041e-05, + "loss": 5.4686, + "step": 15030 + }, + { + "epoch": 0.08939361499666952, + "grad_norm": 1.668220043182373, + "learning_rate": 4.902071508277477e-05, + "loss": 5.5889, + "step": 15031 + }, + { + "epoch": 0.08939956227995052, + "grad_norm": 2.0754151344299316, + "learning_rate": 4.902058562536375e-05, + "loss": 5.7398, + "step": 15032 + }, + { + "epoch": 0.08940550956323151, + "grad_norm": 1.9756910800933838, + "learning_rate": 4.902045615956739e-05, + "loss": 5.528, + "step": 15033 + }, + { + "epoch": 0.08941145684651251, + "grad_norm": 1.6614958047866821, + "learning_rate": 4.9020326685385735e-05, + "loss": 5.5761, + "step": 15034 + }, + { + "epoch": 0.08941740412979352, + "grad_norm": 2.0193135738372803, + "learning_rate": 4.902019720281884e-05, + "loss": 5.1836, + "step": 15035 + }, + { + "epoch": 0.0894233514130745, + "grad_norm": 2.164290428161621, + "learning_rate": 4.9020067711866735e-05, + "loss": 5.0216, + "step": 15036 + }, + { + "epoch": 0.0894292986963555, + "grad_norm": 2.3957648277282715, + "learning_rate": 4.901993821252947e-05, + "loss": 4.9631, + "step": 15037 + }, + { + "epoch": 0.08943524597963651, + "grad_norm": 2.204258680343628, + "learning_rate": 4.90198087048071e-05, + "loss": 4.774, + "step": 15038 + }, + { + "epoch": 0.0894411932629175, + "grad_norm": 1.7879102230072021, + "learning_rate": 4.9019679188699666e-05, + "loss": 5.716, + "step": 15039 + }, + { + "epoch": 0.0894471405461985, + "grad_norm": 1.6019984483718872, + "learning_rate": 4.9019549664207196e-05, + "loss": 5.3657, + "step": 15040 + }, + { + "epoch": 0.0894530878294795, + "grad_norm": 2.079514741897583, + "learning_rate": 4.901942013132976e-05, + "loss": 5.0526, + "step": 15041 + }, + { + "epoch": 0.08945903511276049, + "grad_norm": 1.9381201267242432, + "learning_rate": 4.901929059006739e-05, + "loss": 4.9585, + "step": 15042 + }, + { + "epoch": 0.08946498239604149, + "grad_norm": 1.6514472961425781, + "learning_rate": 4.9019161040420134e-05, + "loss": 5.4721, + "step": 15043 + }, + { + "epoch": 0.08947092967932249, + "grad_norm": 1.7294371128082275, + "learning_rate": 4.901903148238804e-05, + "loss": 5.4401, + "step": 15044 + }, + { + "epoch": 0.08947687696260348, + "grad_norm": 1.7769347429275513, + "learning_rate": 4.901890191597115e-05, + "loss": 5.4324, + "step": 15045 + }, + { + "epoch": 0.08948282424588448, + "grad_norm": 1.6517225503921509, + "learning_rate": 4.9018772341169505e-05, + "loss": 5.2967, + "step": 15046 + }, + { + "epoch": 0.08948877152916548, + "grad_norm": 1.5310052633285522, + "learning_rate": 4.901864275798316e-05, + "loss": 5.4017, + "step": 15047 + }, + { + "epoch": 0.08949471881244647, + "grad_norm": 1.9703199863433838, + "learning_rate": 4.9018513166412146e-05, + "loss": 4.9813, + "step": 15048 + }, + { + "epoch": 0.08950066609572747, + "grad_norm": 1.991087555885315, + "learning_rate": 4.901838356645652e-05, + "loss": 5.2911, + "step": 15049 + }, + { + "epoch": 0.08950661337900846, + "grad_norm": 1.7992926836013794, + "learning_rate": 4.9018253958116334e-05, + "loss": 5.2996, + "step": 15050 + }, + { + "epoch": 0.08951256066228946, + "grad_norm": 1.5164752006530762, + "learning_rate": 4.901812434139161e-05, + "loss": 5.8002, + "step": 15051 + }, + { + "epoch": 0.08951850794557047, + "grad_norm": 1.8143075704574585, + "learning_rate": 4.9017994716282415e-05, + "loss": 5.241, + "step": 15052 + }, + { + "epoch": 0.08952445522885145, + "grad_norm": 1.9806342124938965, + "learning_rate": 4.9017865082788785e-05, + "loss": 5.3656, + "step": 15053 + }, + { + "epoch": 0.08953040251213246, + "grad_norm": 2.403789520263672, + "learning_rate": 4.901773544091077e-05, + "loss": 5.1024, + "step": 15054 + }, + { + "epoch": 0.08953634979541346, + "grad_norm": 1.5903408527374268, + "learning_rate": 4.90176057906484e-05, + "loss": 5.3849, + "step": 15055 + }, + { + "epoch": 0.08954229707869445, + "grad_norm": 1.764125943183899, + "learning_rate": 4.901747613200175e-05, + "loss": 5.0757, + "step": 15056 + }, + { + "epoch": 0.08954824436197545, + "grad_norm": 2.1031241416931152, + "learning_rate": 4.901734646497084e-05, + "loss": 5.2114, + "step": 15057 + }, + { + "epoch": 0.08955419164525645, + "grad_norm": 1.9965282678604126, + "learning_rate": 4.901721678955571e-05, + "loss": 5.1136, + "step": 15058 + }, + { + "epoch": 0.08956013892853744, + "grad_norm": 1.9062676429748535, + "learning_rate": 4.9017087105756434e-05, + "loss": 4.9166, + "step": 15059 + }, + { + "epoch": 0.08956608621181844, + "grad_norm": 2.0963199138641357, + "learning_rate": 4.901695741357303e-05, + "loss": 4.7587, + "step": 15060 + }, + { + "epoch": 0.08957203349509944, + "grad_norm": 1.7062407732009888, + "learning_rate": 4.901682771300556e-05, + "loss": 5.3046, + "step": 15061 + }, + { + "epoch": 0.08957798077838043, + "grad_norm": 1.574013352394104, + "learning_rate": 4.9016698004054065e-05, + "loss": 5.3007, + "step": 15062 + }, + { + "epoch": 0.08958392806166143, + "grad_norm": 1.7540260553359985, + "learning_rate": 4.9016568286718586e-05, + "loss": 5.5824, + "step": 15063 + }, + { + "epoch": 0.08958987534494244, + "grad_norm": 1.4875624179840088, + "learning_rate": 4.901643856099917e-05, + "loss": 5.4569, + "step": 15064 + }, + { + "epoch": 0.08959582262822342, + "grad_norm": 1.6023603677749634, + "learning_rate": 4.901630882689586e-05, + "loss": 5.5397, + "step": 15065 + }, + { + "epoch": 0.08960176991150443, + "grad_norm": 2.1851913928985596, + "learning_rate": 4.9016179084408706e-05, + "loss": 4.9882, + "step": 15066 + }, + { + "epoch": 0.08960771719478543, + "grad_norm": 1.4636015892028809, + "learning_rate": 4.901604933353776e-05, + "loss": 5.4568, + "step": 15067 + }, + { + "epoch": 0.08961366447806642, + "grad_norm": 2.6841142177581787, + "learning_rate": 4.901591957428305e-05, + "loss": 5.8365, + "step": 15068 + }, + { + "epoch": 0.08961961176134742, + "grad_norm": 2.2015743255615234, + "learning_rate": 4.9015789806644643e-05, + "loss": 5.4798, + "step": 15069 + }, + { + "epoch": 0.08962555904462842, + "grad_norm": 2.3934903144836426, + "learning_rate": 4.901566003062256e-05, + "loss": 5.3355, + "step": 15070 + }, + { + "epoch": 0.08963150632790941, + "grad_norm": 2.418919801712036, + "learning_rate": 4.9015530246216866e-05, + "loss": 5.2546, + "step": 15071 + }, + { + "epoch": 0.08963745361119041, + "grad_norm": 2.2773303985595703, + "learning_rate": 4.90154004534276e-05, + "loss": 5.3306, + "step": 15072 + }, + { + "epoch": 0.08964340089447141, + "grad_norm": 2.09413743019104, + "learning_rate": 4.9015270652254796e-05, + "loss": 5.4715, + "step": 15073 + }, + { + "epoch": 0.0896493481777524, + "grad_norm": 1.8905339241027832, + "learning_rate": 4.901514084269852e-05, + "loss": 5.2248, + "step": 15074 + }, + { + "epoch": 0.0896552954610334, + "grad_norm": 1.7001872062683105, + "learning_rate": 4.9015011024758794e-05, + "loss": 5.2869, + "step": 15075 + }, + { + "epoch": 0.0896612427443144, + "grad_norm": 1.7953561544418335, + "learning_rate": 4.901488119843568e-05, + "loss": 5.2027, + "step": 15076 + }, + { + "epoch": 0.08966719002759539, + "grad_norm": 1.8996349573135376, + "learning_rate": 4.9014751363729225e-05, + "loss": 5.8168, + "step": 15077 + }, + { + "epoch": 0.0896731373108764, + "grad_norm": 1.6294323205947876, + "learning_rate": 4.901462152063946e-05, + "loss": 5.0331, + "step": 15078 + }, + { + "epoch": 0.08967908459415738, + "grad_norm": 1.4392082691192627, + "learning_rate": 4.901449166916645e-05, + "loss": 4.9094, + "step": 15079 + }, + { + "epoch": 0.08968503187743838, + "grad_norm": 1.6613532304763794, + "learning_rate": 4.9014361809310216e-05, + "loss": 5.1426, + "step": 15080 + }, + { + "epoch": 0.08969097916071939, + "grad_norm": 1.7502686977386475, + "learning_rate": 4.9014231941070823e-05, + "loss": 5.4298, + "step": 15081 + }, + { + "epoch": 0.08969692644400037, + "grad_norm": 1.9276418685913086, + "learning_rate": 4.9014102064448305e-05, + "loss": 5.8383, + "step": 15082 + }, + { + "epoch": 0.08970287372728138, + "grad_norm": 2.471407651901245, + "learning_rate": 4.901397217944272e-05, + "loss": 6.1879, + "step": 15083 + }, + { + "epoch": 0.08970882101056238, + "grad_norm": 2.0759341716766357, + "learning_rate": 4.90138422860541e-05, + "loss": 6.0929, + "step": 15084 + }, + { + "epoch": 0.08971476829384337, + "grad_norm": 1.6504180431365967, + "learning_rate": 4.9013712384282505e-05, + "loss": 6.0733, + "step": 15085 + }, + { + "epoch": 0.08972071557712437, + "grad_norm": 1.7268849611282349, + "learning_rate": 4.9013582474127965e-05, + "loss": 5.9707, + "step": 15086 + }, + { + "epoch": 0.08972666286040537, + "grad_norm": 1.8029861450195312, + "learning_rate": 4.901345255559053e-05, + "loss": 5.3645, + "step": 15087 + }, + { + "epoch": 0.08973261014368636, + "grad_norm": 1.8240137100219727, + "learning_rate": 4.9013322628670246e-05, + "loss": 5.4201, + "step": 15088 + }, + { + "epoch": 0.08973855742696736, + "grad_norm": 1.799771785736084, + "learning_rate": 4.901319269336716e-05, + "loss": 5.2043, + "step": 15089 + }, + { + "epoch": 0.08974450471024836, + "grad_norm": 1.6271024942398071, + "learning_rate": 4.901306274968131e-05, + "loss": 5.4118, + "step": 15090 + }, + { + "epoch": 0.08975045199352935, + "grad_norm": 1.4443042278289795, + "learning_rate": 4.9012932797612756e-05, + "loss": 5.5921, + "step": 15091 + }, + { + "epoch": 0.08975639927681035, + "grad_norm": 1.7174689769744873, + "learning_rate": 4.9012802837161535e-05, + "loss": 5.5233, + "step": 15092 + }, + { + "epoch": 0.08976234656009136, + "grad_norm": 1.7158472537994385, + "learning_rate": 4.901267286832769e-05, + "loss": 5.9171, + "step": 15093 + }, + { + "epoch": 0.08976829384337234, + "grad_norm": 1.691797137260437, + "learning_rate": 4.9012542891111275e-05, + "loss": 5.6207, + "step": 15094 + }, + { + "epoch": 0.08977424112665335, + "grad_norm": 1.7525362968444824, + "learning_rate": 4.901241290551233e-05, + "loss": 5.3468, + "step": 15095 + }, + { + "epoch": 0.08978018840993435, + "grad_norm": 1.6895235776901245, + "learning_rate": 4.901228291153089e-05, + "loss": 5.3567, + "step": 15096 + }, + { + "epoch": 0.08978613569321534, + "grad_norm": 1.6617051362991333, + "learning_rate": 4.9012152909167015e-05, + "loss": 5.6781, + "step": 15097 + }, + { + "epoch": 0.08979208297649634, + "grad_norm": 1.5234577655792236, + "learning_rate": 4.901202289842075e-05, + "loss": 5.6262, + "step": 15098 + }, + { + "epoch": 0.08979803025977734, + "grad_norm": 2.1545703411102295, + "learning_rate": 4.9011892879292125e-05, + "loss": 5.3112, + "step": 15099 + }, + { + "epoch": 0.08980397754305833, + "grad_norm": 2.246051073074341, + "learning_rate": 4.9011762851781204e-05, + "loss": 5.3783, + "step": 15100 + }, + { + "epoch": 0.08980992482633933, + "grad_norm": 2.000429630279541, + "learning_rate": 4.901163281588802e-05, + "loss": 5.2561, + "step": 15101 + }, + { + "epoch": 0.08981587210962033, + "grad_norm": 2.0881898403167725, + "learning_rate": 4.901150277161263e-05, + "loss": 5.3308, + "step": 15102 + }, + { + "epoch": 0.08982181939290132, + "grad_norm": 2.4498097896575928, + "learning_rate": 4.901137271895506e-05, + "loss": 5.8405, + "step": 15103 + }, + { + "epoch": 0.08982776667618232, + "grad_norm": 2.210160732269287, + "learning_rate": 4.901124265791538e-05, + "loss": 5.5462, + "step": 15104 + }, + { + "epoch": 0.08983371395946332, + "grad_norm": 2.366419553756714, + "learning_rate": 4.9011112588493625e-05, + "loss": 5.4069, + "step": 15105 + }, + { + "epoch": 0.08983966124274431, + "grad_norm": 1.812118649482727, + "learning_rate": 4.901098251068983e-05, + "loss": 5.9549, + "step": 15106 + }, + { + "epoch": 0.08984560852602531, + "grad_norm": 1.6506917476654053, + "learning_rate": 4.901085242450405e-05, + "loss": 5.762, + "step": 15107 + }, + { + "epoch": 0.0898515558093063, + "grad_norm": 1.8076404333114624, + "learning_rate": 4.901072232993633e-05, + "loss": 5.7841, + "step": 15108 + }, + { + "epoch": 0.0898575030925873, + "grad_norm": 2.51157546043396, + "learning_rate": 4.9010592226986716e-05, + "loss": 5.1544, + "step": 15109 + }, + { + "epoch": 0.0898634503758683, + "grad_norm": 1.9424755573272705, + "learning_rate": 4.901046211565526e-05, + "loss": 5.4587, + "step": 15110 + }, + { + "epoch": 0.0898693976591493, + "grad_norm": 1.998506784439087, + "learning_rate": 4.9010331995941995e-05, + "loss": 5.8242, + "step": 15111 + }, + { + "epoch": 0.0898753449424303, + "grad_norm": 1.8947205543518066, + "learning_rate": 4.901020186784697e-05, + "loss": 5.4488, + "step": 15112 + }, + { + "epoch": 0.0898812922257113, + "grad_norm": 1.905993938446045, + "learning_rate": 4.901007173137022e-05, + "loss": 5.3882, + "step": 15113 + }, + { + "epoch": 0.08988723950899229, + "grad_norm": 1.723973274230957, + "learning_rate": 4.900994158651182e-05, + "loss": 5.9411, + "step": 15114 + }, + { + "epoch": 0.08989318679227329, + "grad_norm": 1.747159719467163, + "learning_rate": 4.900981143327179e-05, + "loss": 5.8436, + "step": 15115 + }, + { + "epoch": 0.08989913407555429, + "grad_norm": 1.7400517463684082, + "learning_rate": 4.900968127165018e-05, + "loss": 5.7067, + "step": 15116 + }, + { + "epoch": 0.08990508135883528, + "grad_norm": 1.763750433921814, + "learning_rate": 4.900955110164704e-05, + "loss": 5.6198, + "step": 15117 + }, + { + "epoch": 0.08991102864211628, + "grad_norm": 1.9004894495010376, + "learning_rate": 4.9009420923262416e-05, + "loss": 5.0977, + "step": 15118 + }, + { + "epoch": 0.08991697592539728, + "grad_norm": 1.6853641271591187, + "learning_rate": 4.900929073649635e-05, + "loss": 5.5213, + "step": 15119 + }, + { + "epoch": 0.08992292320867827, + "grad_norm": 1.7032074928283691, + "learning_rate": 4.900916054134889e-05, + "loss": 5.3764, + "step": 15120 + }, + { + "epoch": 0.08992887049195927, + "grad_norm": 1.623089075088501, + "learning_rate": 4.9009030337820084e-05, + "loss": 5.525, + "step": 15121 + }, + { + "epoch": 0.08993481777524027, + "grad_norm": 1.6154295206069946, + "learning_rate": 4.900890012590996e-05, + "loss": 5.7378, + "step": 15122 + }, + { + "epoch": 0.08994076505852126, + "grad_norm": 1.8368462324142456, + "learning_rate": 4.900876990561859e-05, + "loss": 5.4768, + "step": 15123 + }, + { + "epoch": 0.08994671234180227, + "grad_norm": 1.7773829698562622, + "learning_rate": 4.9008639676946e-05, + "loss": 5.419, + "step": 15124 + }, + { + "epoch": 0.08995265962508327, + "grad_norm": 1.625287413597107, + "learning_rate": 4.9008509439892244e-05, + "loss": 5.4727, + "step": 15125 + }, + { + "epoch": 0.08995860690836426, + "grad_norm": 1.6234408617019653, + "learning_rate": 4.9008379194457364e-05, + "loss": 5.413, + "step": 15126 + }, + { + "epoch": 0.08996455419164526, + "grad_norm": 1.7441129684448242, + "learning_rate": 4.900824894064141e-05, + "loss": 5.2681, + "step": 15127 + }, + { + "epoch": 0.08997050147492626, + "grad_norm": 1.8756482601165771, + "learning_rate": 4.900811867844443e-05, + "loss": 5.5319, + "step": 15128 + }, + { + "epoch": 0.08997644875820725, + "grad_norm": 1.9200249910354614, + "learning_rate": 4.900798840786645e-05, + "loss": 4.7499, + "step": 15129 + }, + { + "epoch": 0.08998239604148825, + "grad_norm": 2.4838919639587402, + "learning_rate": 4.900785812890753e-05, + "loss": 5.0713, + "step": 15130 + }, + { + "epoch": 0.08998834332476925, + "grad_norm": 2.1441292762756348, + "learning_rate": 4.900772784156773e-05, + "loss": 4.9425, + "step": 15131 + }, + { + "epoch": 0.08999429060805024, + "grad_norm": 2.0838072299957275, + "learning_rate": 4.9007597545847066e-05, + "loss": 5.0632, + "step": 15132 + }, + { + "epoch": 0.09000023789133124, + "grad_norm": 1.630042314529419, + "learning_rate": 4.90074672417456e-05, + "loss": 5.2275, + "step": 15133 + }, + { + "epoch": 0.09000618517461224, + "grad_norm": 2.336031675338745, + "learning_rate": 4.900733692926338e-05, + "loss": 4.9596, + "step": 15134 + }, + { + "epoch": 0.09001213245789323, + "grad_norm": 2.414837598800659, + "learning_rate": 4.9007206608400446e-05, + "loss": 4.7405, + "step": 15135 + }, + { + "epoch": 0.09001807974117423, + "grad_norm": 2.2872564792633057, + "learning_rate": 4.900707627915684e-05, + "loss": 4.8294, + "step": 15136 + }, + { + "epoch": 0.09002402702445522, + "grad_norm": 2.474933624267578, + "learning_rate": 4.9006945941532615e-05, + "loss": 4.882, + "step": 15137 + }, + { + "epoch": 0.09002997430773622, + "grad_norm": 2.170109987258911, + "learning_rate": 4.900681559552781e-05, + "loss": 4.6778, + "step": 15138 + }, + { + "epoch": 0.09003592159101723, + "grad_norm": 2.1962943077087402, + "learning_rate": 4.900668524114248e-05, + "loss": 4.8201, + "step": 15139 + }, + { + "epoch": 0.09004186887429821, + "grad_norm": 2.46073317527771, + "learning_rate": 4.9006554878376656e-05, + "loss": 4.6929, + "step": 15140 + }, + { + "epoch": 0.09004781615757922, + "grad_norm": 2.4591431617736816, + "learning_rate": 4.90064245072304e-05, + "loss": 4.711, + "step": 15141 + }, + { + "epoch": 0.09005376344086022, + "grad_norm": 2.2225937843322754, + "learning_rate": 4.9006294127703745e-05, + "loss": 5.2556, + "step": 15142 + }, + { + "epoch": 0.0900597107241412, + "grad_norm": 2.3457517623901367, + "learning_rate": 4.900616373979674e-05, + "loss": 5.7773, + "step": 15143 + }, + { + "epoch": 0.09006565800742221, + "grad_norm": 2.226430892944336, + "learning_rate": 4.9006033343509436e-05, + "loss": 5.6364, + "step": 15144 + }, + { + "epoch": 0.09007160529070321, + "grad_norm": 2.1407759189605713, + "learning_rate": 4.900590293884186e-05, + "loss": 5.4202, + "step": 15145 + }, + { + "epoch": 0.0900775525739842, + "grad_norm": 1.7371548414230347, + "learning_rate": 4.9005772525794084e-05, + "loss": 5.5686, + "step": 15146 + }, + { + "epoch": 0.0900834998572652, + "grad_norm": 1.8759154081344604, + "learning_rate": 4.900564210436615e-05, + "loss": 5.4824, + "step": 15147 + }, + { + "epoch": 0.0900894471405462, + "grad_norm": 1.8595685958862305, + "learning_rate": 4.900551167455807e-05, + "loss": 5.6123, + "step": 15148 + }, + { + "epoch": 0.09009539442382719, + "grad_norm": 2.0119471549987793, + "learning_rate": 4.900538123636993e-05, + "loss": 5.5925, + "step": 15149 + }, + { + "epoch": 0.09010134170710819, + "grad_norm": 1.9375147819519043, + "learning_rate": 4.900525078980176e-05, + "loss": 5.5707, + "step": 15150 + }, + { + "epoch": 0.0901072889903892, + "grad_norm": 1.7323594093322754, + "learning_rate": 4.9005120334853595e-05, + "loss": 5.4133, + "step": 15151 + }, + { + "epoch": 0.09011323627367018, + "grad_norm": 1.7680727243423462, + "learning_rate": 4.90049898715255e-05, + "loss": 5.5954, + "step": 15152 + }, + { + "epoch": 0.09011918355695119, + "grad_norm": 1.8436721563339233, + "learning_rate": 4.9004859399817505e-05, + "loss": 5.5689, + "step": 15153 + }, + { + "epoch": 0.09012513084023219, + "grad_norm": 1.8080954551696777, + "learning_rate": 4.9004728919729664e-05, + "loss": 5.5266, + "step": 15154 + }, + { + "epoch": 0.09013107812351318, + "grad_norm": 2.2874748706817627, + "learning_rate": 4.900459843126202e-05, + "loss": 5.1985, + "step": 15155 + }, + { + "epoch": 0.09013702540679418, + "grad_norm": 1.8425899744033813, + "learning_rate": 4.900446793441462e-05, + "loss": 5.2856, + "step": 15156 + }, + { + "epoch": 0.09014297269007518, + "grad_norm": 1.6970654726028442, + "learning_rate": 4.900433742918751e-05, + "loss": 5.8597, + "step": 15157 + }, + { + "epoch": 0.09014891997335617, + "grad_norm": 2.3444008827209473, + "learning_rate": 4.9004206915580726e-05, + "loss": 4.4653, + "step": 15158 + }, + { + "epoch": 0.09015486725663717, + "grad_norm": 2.0390350818634033, + "learning_rate": 4.9004076393594325e-05, + "loss": 4.6565, + "step": 15159 + }, + { + "epoch": 0.09016081453991817, + "grad_norm": 2.0733320713043213, + "learning_rate": 4.900394586322835e-05, + "loss": 4.6052, + "step": 15160 + }, + { + "epoch": 0.09016676182319916, + "grad_norm": 1.9700855016708374, + "learning_rate": 4.9003815324482846e-05, + "loss": 4.7535, + "step": 15161 + }, + { + "epoch": 0.09017270910648016, + "grad_norm": 2.0294783115386963, + "learning_rate": 4.900368477735786e-05, + "loss": 5.4154, + "step": 15162 + }, + { + "epoch": 0.09017865638976116, + "grad_norm": 1.8937848806381226, + "learning_rate": 4.900355422185343e-05, + "loss": 5.3244, + "step": 15163 + }, + { + "epoch": 0.09018460367304215, + "grad_norm": 1.7404329776763916, + "learning_rate": 4.900342365796961e-05, + "loss": 5.887, + "step": 15164 + }, + { + "epoch": 0.09019055095632315, + "grad_norm": 1.5309412479400635, + "learning_rate": 4.9003293085706446e-05, + "loss": 5.4574, + "step": 15165 + }, + { + "epoch": 0.09019649823960414, + "grad_norm": 2.10003662109375, + "learning_rate": 4.9003162505063976e-05, + "loss": 5.2962, + "step": 15166 + }, + { + "epoch": 0.09020244552288514, + "grad_norm": 2.7704551219940186, + "learning_rate": 4.900303191604225e-05, + "loss": 4.6386, + "step": 15167 + }, + { + "epoch": 0.09020839280616615, + "grad_norm": 3.3551974296569824, + "learning_rate": 4.9002901318641314e-05, + "loss": 5.3348, + "step": 15168 + }, + { + "epoch": 0.09021434008944713, + "grad_norm": 2.8300132751464844, + "learning_rate": 4.9002770712861216e-05, + "loss": 5.2031, + "step": 15169 + }, + { + "epoch": 0.09022028737272814, + "grad_norm": 1.77587890625, + "learning_rate": 4.9002640098702005e-05, + "loss": 5.1371, + "step": 15170 + }, + { + "epoch": 0.09022623465600914, + "grad_norm": 1.694191575050354, + "learning_rate": 4.900250947616371e-05, + "loss": 5.7283, + "step": 15171 + }, + { + "epoch": 0.09023218193929013, + "grad_norm": 1.6392415761947632, + "learning_rate": 4.900237884524638e-05, + "loss": 5.3856, + "step": 15172 + }, + { + "epoch": 0.09023812922257113, + "grad_norm": 2.302626371383667, + "learning_rate": 4.900224820595008e-05, + "loss": 5.1007, + "step": 15173 + }, + { + "epoch": 0.09024407650585213, + "grad_norm": 2.296760082244873, + "learning_rate": 4.900211755827484e-05, + "loss": 5.0303, + "step": 15174 + }, + { + "epoch": 0.09025002378913312, + "grad_norm": 2.2914488315582275, + "learning_rate": 4.9001986902220706e-05, + "loss": 5.3176, + "step": 15175 + }, + { + "epoch": 0.09025597107241412, + "grad_norm": 2.084686756134033, + "learning_rate": 4.900185623778774e-05, + "loss": 5.2028, + "step": 15176 + }, + { + "epoch": 0.09026191835569512, + "grad_norm": 1.9465001821517944, + "learning_rate": 4.9001725564975953e-05, + "loss": 4.661, + "step": 15177 + }, + { + "epoch": 0.09026786563897611, + "grad_norm": 2.926347494125366, + "learning_rate": 4.900159488378542e-05, + "loss": 4.4579, + "step": 15178 + }, + { + "epoch": 0.09027381292225711, + "grad_norm": 2.6047539710998535, + "learning_rate": 4.900146419421619e-05, + "loss": 4.5486, + "step": 15179 + }, + { + "epoch": 0.09027976020553811, + "grad_norm": 2.4737868309020996, + "learning_rate": 4.9001333496268274e-05, + "loss": 4.3661, + "step": 15180 + }, + { + "epoch": 0.0902857074888191, + "grad_norm": 2.075547456741333, + "learning_rate": 4.900120278994176e-05, + "loss": 4.3157, + "step": 15181 + }, + { + "epoch": 0.0902916547721001, + "grad_norm": 2.509284019470215, + "learning_rate": 4.900107207523666e-05, + "loss": 4.2558, + "step": 15182 + }, + { + "epoch": 0.09029760205538111, + "grad_norm": 2.4345662593841553, + "learning_rate": 4.9000941352153046e-05, + "loss": 4.2932, + "step": 15183 + }, + { + "epoch": 0.0903035493386621, + "grad_norm": 2.214146137237549, + "learning_rate": 4.9000810620690945e-05, + "loss": 4.6953, + "step": 15184 + }, + { + "epoch": 0.0903094966219431, + "grad_norm": 2.197709083557129, + "learning_rate": 4.900067988085041e-05, + "loss": 4.7138, + "step": 15185 + }, + { + "epoch": 0.0903154439052241, + "grad_norm": 2.0381791591644287, + "learning_rate": 4.900054913263148e-05, + "loss": 6.1924, + "step": 15186 + }, + { + "epoch": 0.09032139118850509, + "grad_norm": 1.7017699480056763, + "learning_rate": 4.900041837603422e-05, + "loss": 6.1646, + "step": 15187 + }, + { + "epoch": 0.09032733847178609, + "grad_norm": 1.5804365873336792, + "learning_rate": 4.9000287611058645e-05, + "loss": 6.1757, + "step": 15188 + }, + { + "epoch": 0.09033328575506709, + "grad_norm": 1.6158896684646606, + "learning_rate": 4.9000156837704836e-05, + "loss": 6.1136, + "step": 15189 + }, + { + "epoch": 0.09033923303834808, + "grad_norm": 1.9524257183074951, + "learning_rate": 4.90000260559728e-05, + "loss": 5.43, + "step": 15190 + }, + { + "epoch": 0.09034518032162908, + "grad_norm": 1.835134744644165, + "learning_rate": 4.899989526586261e-05, + "loss": 6.0223, + "step": 15191 + }, + { + "epoch": 0.09035112760491008, + "grad_norm": 1.7213332653045654, + "learning_rate": 4.899976446737432e-05, + "loss": 5.7823, + "step": 15192 + }, + { + "epoch": 0.09035707488819107, + "grad_norm": 1.8744465112686157, + "learning_rate": 4.899963366050795e-05, + "loss": 5.0549, + "step": 15193 + }, + { + "epoch": 0.09036302217147207, + "grad_norm": 1.800979495048523, + "learning_rate": 4.899950284526355e-05, + "loss": 5.0726, + "step": 15194 + }, + { + "epoch": 0.09036896945475306, + "grad_norm": 1.7476063966751099, + "learning_rate": 4.899937202164118e-05, + "loss": 4.9177, + "step": 15195 + }, + { + "epoch": 0.09037491673803406, + "grad_norm": 1.5107455253601074, + "learning_rate": 4.899924118964087e-05, + "loss": 5.1873, + "step": 15196 + }, + { + "epoch": 0.09038086402131507, + "grad_norm": 1.4630497694015503, + "learning_rate": 4.899911034926267e-05, + "loss": 4.9166, + "step": 15197 + }, + { + "epoch": 0.09038681130459605, + "grad_norm": 1.519824743270874, + "learning_rate": 4.899897950050664e-05, + "loss": 4.9084, + "step": 15198 + }, + { + "epoch": 0.09039275858787706, + "grad_norm": 1.480298399925232, + "learning_rate": 4.899884864337281e-05, + "loss": 4.8724, + "step": 15199 + }, + { + "epoch": 0.09039870587115806, + "grad_norm": 1.549485445022583, + "learning_rate": 4.8998717777861224e-05, + "loss": 4.8378, + "step": 15200 + }, + { + "epoch": 0.09040465315443905, + "grad_norm": 1.6650373935699463, + "learning_rate": 4.8998586903971936e-05, + "loss": 4.9478, + "step": 15201 + }, + { + "epoch": 0.09041060043772005, + "grad_norm": 1.5880005359649658, + "learning_rate": 4.899845602170499e-05, + "loss": 4.7952, + "step": 15202 + }, + { + "epoch": 0.09041654772100105, + "grad_norm": 1.5553892850875854, + "learning_rate": 4.899832513106043e-05, + "loss": 4.9303, + "step": 15203 + }, + { + "epoch": 0.09042249500428204, + "grad_norm": 1.5907729864120483, + "learning_rate": 4.899819423203831e-05, + "loss": 4.7951, + "step": 15204 + }, + { + "epoch": 0.09042844228756304, + "grad_norm": 1.5885943174362183, + "learning_rate": 4.899806332463866e-05, + "loss": 4.8896, + "step": 15205 + }, + { + "epoch": 0.09043438957084404, + "grad_norm": 1.7483280897140503, + "learning_rate": 4.899793240886154e-05, + "loss": 5.6137, + "step": 15206 + }, + { + "epoch": 0.09044033685412503, + "grad_norm": 1.7883373498916626, + "learning_rate": 4.8997801484706984e-05, + "loss": 5.7183, + "step": 15207 + }, + { + "epoch": 0.09044628413740603, + "grad_norm": 1.7988712787628174, + "learning_rate": 4.8997670552175044e-05, + "loss": 5.7979, + "step": 15208 + }, + { + "epoch": 0.09045223142068703, + "grad_norm": 2.1793367862701416, + "learning_rate": 4.899753961126577e-05, + "loss": 5.3549, + "step": 15209 + }, + { + "epoch": 0.09045817870396802, + "grad_norm": 2.117983341217041, + "learning_rate": 4.8997408661979194e-05, + "loss": 5.1934, + "step": 15210 + }, + { + "epoch": 0.09046412598724902, + "grad_norm": 2.1799557209014893, + "learning_rate": 4.899727770431538e-05, + "loss": 5.2521, + "step": 15211 + }, + { + "epoch": 0.09047007327053003, + "grad_norm": 2.117403745651245, + "learning_rate": 4.8997146738274355e-05, + "loss": 5.3379, + "step": 15212 + }, + { + "epoch": 0.09047602055381102, + "grad_norm": 1.59669828414917, + "learning_rate": 4.899701576385619e-05, + "loss": 5.375, + "step": 15213 + }, + { + "epoch": 0.09048196783709202, + "grad_norm": 1.6929266452789307, + "learning_rate": 4.8996884781060907e-05, + "loss": 5.9243, + "step": 15214 + }, + { + "epoch": 0.09048791512037302, + "grad_norm": 1.8353838920593262, + "learning_rate": 4.899675378988855e-05, + "loss": 5.9216, + "step": 15215 + }, + { + "epoch": 0.09049386240365401, + "grad_norm": 1.6468323469161987, + "learning_rate": 4.899662279033918e-05, + "loss": 6.0171, + "step": 15216 + }, + { + "epoch": 0.09049980968693501, + "grad_norm": 1.4748890399932861, + "learning_rate": 4.899649178241284e-05, + "loss": 5.6775, + "step": 15217 + }, + { + "epoch": 0.09050575697021601, + "grad_norm": 1.8783589601516724, + "learning_rate": 4.8996360766109576e-05, + "loss": 5.7625, + "step": 15218 + }, + { + "epoch": 0.090511704253497, + "grad_norm": 1.7860721349716187, + "learning_rate": 4.8996229741429416e-05, + "loss": 5.7512, + "step": 15219 + }, + { + "epoch": 0.090517651536778, + "grad_norm": 1.7337830066680908, + "learning_rate": 4.899609870837243e-05, + "loss": 5.8233, + "step": 15220 + }, + { + "epoch": 0.090523598820059, + "grad_norm": 1.9256298542022705, + "learning_rate": 4.899596766693865e-05, + "loss": 5.8586, + "step": 15221 + }, + { + "epoch": 0.09052954610333999, + "grad_norm": 1.814205288887024, + "learning_rate": 4.8995836617128135e-05, + "loss": 5.4852, + "step": 15222 + }, + { + "epoch": 0.090535493386621, + "grad_norm": 1.8664608001708984, + "learning_rate": 4.899570555894091e-05, + "loss": 5.6847, + "step": 15223 + }, + { + "epoch": 0.09054144066990198, + "grad_norm": 1.8377459049224854, + "learning_rate": 4.899557449237704e-05, + "loss": 5.8869, + "step": 15224 + }, + { + "epoch": 0.09054738795318298, + "grad_norm": 1.788875937461853, + "learning_rate": 4.899544341743656e-05, + "loss": 5.4372, + "step": 15225 + }, + { + "epoch": 0.09055333523646399, + "grad_norm": 1.8490506410598755, + "learning_rate": 4.899531233411951e-05, + "loss": 6.1163, + "step": 15226 + }, + { + "epoch": 0.09055928251974497, + "grad_norm": 2.14841628074646, + "learning_rate": 4.8995181242425955e-05, + "loss": 6.1154, + "step": 15227 + }, + { + "epoch": 0.09056522980302598, + "grad_norm": 2.051154851913452, + "learning_rate": 4.899505014235593e-05, + "loss": 4.9326, + "step": 15228 + }, + { + "epoch": 0.09057117708630698, + "grad_norm": 2.071126937866211, + "learning_rate": 4.899491903390948e-05, + "loss": 4.8831, + "step": 15229 + }, + { + "epoch": 0.09057712436958797, + "grad_norm": 2.0155231952667236, + "learning_rate": 4.899478791708665e-05, + "loss": 4.87, + "step": 15230 + }, + { + "epoch": 0.09058307165286897, + "grad_norm": 1.946815013885498, + "learning_rate": 4.89946567918875e-05, + "loss": 4.8139, + "step": 15231 + }, + { + "epoch": 0.09058901893614997, + "grad_norm": 1.9526349306106567, + "learning_rate": 4.899452565831204e-05, + "loss": 4.7618, + "step": 15232 + }, + { + "epoch": 0.09059496621943096, + "grad_norm": 2.0434954166412354, + "learning_rate": 4.8994394516360355e-05, + "loss": 4.7617, + "step": 15233 + }, + { + "epoch": 0.09060091350271196, + "grad_norm": 2.0964083671569824, + "learning_rate": 4.8994263366032466e-05, + "loss": 4.6298, + "step": 15234 + }, + { + "epoch": 0.09060686078599296, + "grad_norm": 2.0333590507507324, + "learning_rate": 4.899413220732843e-05, + "loss": 4.6419, + "step": 15235 + }, + { + "epoch": 0.09061280806927395, + "grad_norm": 2.076993703842163, + "learning_rate": 4.89940010402483e-05, + "loss": 4.6163, + "step": 15236 + }, + { + "epoch": 0.09061875535255495, + "grad_norm": 1.767774224281311, + "learning_rate": 4.89938698647921e-05, + "loss": 5.2418, + "step": 15237 + }, + { + "epoch": 0.09062470263583595, + "grad_norm": 1.8380626440048218, + "learning_rate": 4.899373868095989e-05, + "loss": 5.3304, + "step": 15238 + }, + { + "epoch": 0.09063064991911694, + "grad_norm": 1.7332574129104614, + "learning_rate": 4.8993607488751716e-05, + "loss": 5.3528, + "step": 15239 + }, + { + "epoch": 0.09063659720239794, + "grad_norm": 1.8473124504089355, + "learning_rate": 4.8993476288167614e-05, + "loss": 5.5801, + "step": 15240 + }, + { + "epoch": 0.09064254448567895, + "grad_norm": 2.299206256866455, + "learning_rate": 4.899334507920765e-05, + "loss": 5.308, + "step": 15241 + }, + { + "epoch": 0.09064849176895994, + "grad_norm": 1.945417046546936, + "learning_rate": 4.899321386187185e-05, + "loss": 4.8894, + "step": 15242 + }, + { + "epoch": 0.09065443905224094, + "grad_norm": 2.328246831893921, + "learning_rate": 4.899308263616027e-05, + "loss": 5.0332, + "step": 15243 + }, + { + "epoch": 0.09066038633552194, + "grad_norm": 2.194546699523926, + "learning_rate": 4.899295140207295e-05, + "loss": 4.8891, + "step": 15244 + }, + { + "epoch": 0.09066633361880293, + "grad_norm": 2.078903913497925, + "learning_rate": 4.899282015960994e-05, + "loss": 5.0327, + "step": 15245 + }, + { + "epoch": 0.09067228090208393, + "grad_norm": 2.2129557132720947, + "learning_rate": 4.8992688908771285e-05, + "loss": 4.8806, + "step": 15246 + }, + { + "epoch": 0.09067822818536493, + "grad_norm": 2.3200979232788086, + "learning_rate": 4.8992557649557026e-05, + "loss": 4.9961, + "step": 15247 + }, + { + "epoch": 0.09068417546864592, + "grad_norm": 1.5829685926437378, + "learning_rate": 4.899242638196722e-05, + "loss": 5.4238, + "step": 15248 + }, + { + "epoch": 0.09069012275192692, + "grad_norm": 1.9085135459899902, + "learning_rate": 4.89922951060019e-05, + "loss": 5.0338, + "step": 15249 + }, + { + "epoch": 0.09069607003520792, + "grad_norm": 2.3000802993774414, + "learning_rate": 4.899216382166112e-05, + "loss": 4.9529, + "step": 15250 + }, + { + "epoch": 0.09070201731848891, + "grad_norm": 2.1610753536224365, + "learning_rate": 4.899203252894492e-05, + "loss": 4.9373, + "step": 15251 + }, + { + "epoch": 0.09070796460176991, + "grad_norm": 2.2821414470672607, + "learning_rate": 4.899190122785336e-05, + "loss": 5.2032, + "step": 15252 + }, + { + "epoch": 0.0907139118850509, + "grad_norm": 2.226741075515747, + "learning_rate": 4.899176991838646e-05, + "loss": 4.9354, + "step": 15253 + }, + { + "epoch": 0.0907198591683319, + "grad_norm": 2.0117716789245605, + "learning_rate": 4.899163860054429e-05, + "loss": 5.1179, + "step": 15254 + }, + { + "epoch": 0.0907258064516129, + "grad_norm": 1.6551730632781982, + "learning_rate": 4.8991507274326886e-05, + "loss": 5.6428, + "step": 15255 + }, + { + "epoch": 0.0907317537348939, + "grad_norm": 1.5236784219741821, + "learning_rate": 4.89913759397343e-05, + "loss": 5.4088, + "step": 15256 + }, + { + "epoch": 0.0907377010181749, + "grad_norm": 1.542356252670288, + "learning_rate": 4.899124459676656e-05, + "loss": 5.3383, + "step": 15257 + }, + { + "epoch": 0.0907436483014559, + "grad_norm": 1.5694434642791748, + "learning_rate": 4.899111324542374e-05, + "loss": 5.5202, + "step": 15258 + }, + { + "epoch": 0.09074959558473689, + "grad_norm": 1.459039568901062, + "learning_rate": 4.8990981885705856e-05, + "loss": 5.3481, + "step": 15259 + }, + { + "epoch": 0.09075554286801789, + "grad_norm": 1.4624565839767456, + "learning_rate": 4.899085051761297e-05, + "loss": 5.343, + "step": 15260 + }, + { + "epoch": 0.09076149015129889, + "grad_norm": 1.2748361825942993, + "learning_rate": 4.899071914114513e-05, + "loss": 5.1925, + "step": 15261 + }, + { + "epoch": 0.09076743743457988, + "grad_norm": 1.3813046216964722, + "learning_rate": 4.899058775630237e-05, + "loss": 4.9712, + "step": 15262 + }, + { + "epoch": 0.09077338471786088, + "grad_norm": 1.349108099937439, + "learning_rate": 4.8990456363084756e-05, + "loss": 4.9562, + "step": 15263 + }, + { + "epoch": 0.09077933200114188, + "grad_norm": 1.4744555950164795, + "learning_rate": 4.8990324961492316e-05, + "loss": 5.0014, + "step": 15264 + }, + { + "epoch": 0.09078527928442287, + "grad_norm": 1.4227643013000488, + "learning_rate": 4.8990193551525105e-05, + "loss": 5.076, + "step": 15265 + }, + { + "epoch": 0.09079122656770387, + "grad_norm": 1.4344059228897095, + "learning_rate": 4.8990062133183164e-05, + "loss": 5.2212, + "step": 15266 + }, + { + "epoch": 0.09079717385098487, + "grad_norm": 1.5858408212661743, + "learning_rate": 4.8989930706466534e-05, + "loss": 5.1893, + "step": 15267 + }, + { + "epoch": 0.09080312113426586, + "grad_norm": 1.6398282051086426, + "learning_rate": 4.898979927137527e-05, + "loss": 5.034, + "step": 15268 + }, + { + "epoch": 0.09080906841754686, + "grad_norm": 1.4295551776885986, + "learning_rate": 4.8989667827909416e-05, + "loss": 5.2761, + "step": 15269 + }, + { + "epoch": 0.09081501570082787, + "grad_norm": 1.4313840866088867, + "learning_rate": 4.898953637606902e-05, + "loss": 5.183, + "step": 15270 + }, + { + "epoch": 0.09082096298410886, + "grad_norm": 1.2977478504180908, + "learning_rate": 4.898940491585412e-05, + "loss": 5.1148, + "step": 15271 + }, + { + "epoch": 0.09082691026738986, + "grad_norm": 1.6052992343902588, + "learning_rate": 4.898927344726477e-05, + "loss": 5.3767, + "step": 15272 + }, + { + "epoch": 0.09083285755067086, + "grad_norm": 1.3184257745742798, + "learning_rate": 4.898914197030101e-05, + "loss": 5.3465, + "step": 15273 + }, + { + "epoch": 0.09083880483395185, + "grad_norm": 1.292985439300537, + "learning_rate": 4.898901048496289e-05, + "loss": 5.2478, + "step": 15274 + }, + { + "epoch": 0.09084475211723285, + "grad_norm": 1.1660702228546143, + "learning_rate": 4.898887899125045e-05, + "loss": 5.2655, + "step": 15275 + }, + { + "epoch": 0.09085069940051385, + "grad_norm": 1.2271296977996826, + "learning_rate": 4.8988747489163746e-05, + "loss": 5.2001, + "step": 15276 + }, + { + "epoch": 0.09085664668379484, + "grad_norm": 1.2237215042114258, + "learning_rate": 4.898861597870281e-05, + "loss": 5.213, + "step": 15277 + }, + { + "epoch": 0.09086259396707584, + "grad_norm": 1.3682539463043213, + "learning_rate": 4.898848445986771e-05, + "loss": 5.2174, + "step": 15278 + }, + { + "epoch": 0.09086854125035684, + "grad_norm": 1.2321406602859497, + "learning_rate": 4.8988352932658466e-05, + "loss": 5.1424, + "step": 15279 + }, + { + "epoch": 0.09087448853363783, + "grad_norm": 1.285792350769043, + "learning_rate": 4.898822139707514e-05, + "loss": 5.1438, + "step": 15280 + }, + { + "epoch": 0.09088043581691883, + "grad_norm": 1.137921690940857, + "learning_rate": 4.898808985311778e-05, + "loss": 5.159, + "step": 15281 + }, + { + "epoch": 0.09088638310019982, + "grad_norm": 1.2261563539505005, + "learning_rate": 4.898795830078641e-05, + "loss": 5.1176, + "step": 15282 + }, + { + "epoch": 0.09089233038348082, + "grad_norm": 1.1642104387283325, + "learning_rate": 4.89878267400811e-05, + "loss": 5.0887, + "step": 15283 + }, + { + "epoch": 0.09089827766676183, + "grad_norm": 1.3699917793273926, + "learning_rate": 4.898769517100189e-05, + "loss": 5.0048, + "step": 15284 + }, + { + "epoch": 0.09090422495004281, + "grad_norm": 1.6375452280044556, + "learning_rate": 4.898756359354882e-05, + "loss": 4.6914, + "step": 15285 + }, + { + "epoch": 0.09091017223332382, + "grad_norm": 1.5404956340789795, + "learning_rate": 4.8987432007721944e-05, + "loss": 4.8266, + "step": 15286 + }, + { + "epoch": 0.09091611951660482, + "grad_norm": 1.6747840642929077, + "learning_rate": 4.89873004135213e-05, + "loss": 4.697, + "step": 15287 + }, + { + "epoch": 0.0909220667998858, + "grad_norm": 1.3908432722091675, + "learning_rate": 4.8987168810946935e-05, + "loss": 4.9327, + "step": 15288 + }, + { + "epoch": 0.09092801408316681, + "grad_norm": 1.4933167695999146, + "learning_rate": 4.89870371999989e-05, + "loss": 4.6153, + "step": 15289 + }, + { + "epoch": 0.09093396136644781, + "grad_norm": 1.6259129047393799, + "learning_rate": 4.8986905580677234e-05, + "loss": 4.533, + "step": 15290 + }, + { + "epoch": 0.0909399086497288, + "grad_norm": 1.3692474365234375, + "learning_rate": 4.898677395298199e-05, + "loss": 4.6246, + "step": 15291 + }, + { + "epoch": 0.0909458559330098, + "grad_norm": 1.4951711893081665, + "learning_rate": 4.8986642316913214e-05, + "loss": 4.6677, + "step": 15292 + }, + { + "epoch": 0.0909518032162908, + "grad_norm": 1.5491467714309692, + "learning_rate": 4.8986510672470946e-05, + "loss": 4.9271, + "step": 15293 + }, + { + "epoch": 0.09095775049957179, + "grad_norm": 1.6902397871017456, + "learning_rate": 4.8986379019655235e-05, + "loss": 4.6467, + "step": 15294 + }, + { + "epoch": 0.09096369778285279, + "grad_norm": 1.5122796297073364, + "learning_rate": 4.898624735846613e-05, + "loss": 4.7103, + "step": 15295 + }, + { + "epoch": 0.0909696450661338, + "grad_norm": 1.5287622213363647, + "learning_rate": 4.898611568890367e-05, + "loss": 4.7461, + "step": 15296 + }, + { + "epoch": 0.09097559234941478, + "grad_norm": 1.4649391174316406, + "learning_rate": 4.898598401096791e-05, + "loss": 5.2472, + "step": 15297 + }, + { + "epoch": 0.09098153963269578, + "grad_norm": 1.7621572017669678, + "learning_rate": 4.898585232465889e-05, + "loss": 4.6864, + "step": 15298 + }, + { + "epoch": 0.09098748691597679, + "grad_norm": 1.6371783018112183, + "learning_rate": 4.898572062997665e-05, + "loss": 4.6091, + "step": 15299 + }, + { + "epoch": 0.09099343419925777, + "grad_norm": 1.28440523147583, + "learning_rate": 4.898558892692125e-05, + "loss": 5.0019, + "step": 15300 + }, + { + "epoch": 0.09099938148253878, + "grad_norm": 1.4753130674362183, + "learning_rate": 4.898545721549272e-05, + "loss": 5.3848, + "step": 15301 + }, + { + "epoch": 0.09100532876581978, + "grad_norm": 1.4267481565475464, + "learning_rate": 4.898532549569112e-05, + "loss": 5.1787, + "step": 15302 + }, + { + "epoch": 0.09101127604910077, + "grad_norm": 1.4724546670913696, + "learning_rate": 4.898519376751649e-05, + "loss": 5.2581, + "step": 15303 + }, + { + "epoch": 0.09101722333238177, + "grad_norm": 1.4417310953140259, + "learning_rate": 4.8985062030968875e-05, + "loss": 5.4829, + "step": 15304 + }, + { + "epoch": 0.09102317061566277, + "grad_norm": 1.1160683631896973, + "learning_rate": 4.898493028604833e-05, + "loss": 5.5287, + "step": 15305 + }, + { + "epoch": 0.09102911789894376, + "grad_norm": 1.2454899549484253, + "learning_rate": 4.8984798532754884e-05, + "loss": 5.2984, + "step": 15306 + }, + { + "epoch": 0.09103506518222476, + "grad_norm": 1.5732132196426392, + "learning_rate": 4.8984666771088596e-05, + "loss": 5.4998, + "step": 15307 + }, + { + "epoch": 0.09104101246550576, + "grad_norm": 1.6430423259735107, + "learning_rate": 4.8984535001049515e-05, + "loss": 5.4636, + "step": 15308 + }, + { + "epoch": 0.09104695974878675, + "grad_norm": 1.245288372039795, + "learning_rate": 4.898440322263768e-05, + "loss": 5.2874, + "step": 15309 + }, + { + "epoch": 0.09105290703206775, + "grad_norm": 1.4186644554138184, + "learning_rate": 4.898427143585312e-05, + "loss": 5.2275, + "step": 15310 + }, + { + "epoch": 0.09105885431534876, + "grad_norm": 1.3040757179260254, + "learning_rate": 4.8984139640695915e-05, + "loss": 5.2864, + "step": 15311 + }, + { + "epoch": 0.09106480159862974, + "grad_norm": 1.4106818437576294, + "learning_rate": 4.898400783716609e-05, + "loss": 5.5897, + "step": 15312 + }, + { + "epoch": 0.09107074888191075, + "grad_norm": 1.5596522092819214, + "learning_rate": 4.89838760252637e-05, + "loss": 5.4827, + "step": 15313 + }, + { + "epoch": 0.09107669616519173, + "grad_norm": 2.2576634883880615, + "learning_rate": 4.898374420498878e-05, + "loss": 5.1471, + "step": 15314 + }, + { + "epoch": 0.09108264344847274, + "grad_norm": 1.2749537229537964, + "learning_rate": 4.898361237634139e-05, + "loss": 5.2688, + "step": 15315 + }, + { + "epoch": 0.09108859073175374, + "grad_norm": 1.4171591997146606, + "learning_rate": 4.8983480539321566e-05, + "loss": 5.0796, + "step": 15316 + }, + { + "epoch": 0.09109453801503473, + "grad_norm": 1.2233314514160156, + "learning_rate": 4.898334869392936e-05, + "loss": 5.0992, + "step": 15317 + }, + { + "epoch": 0.09110048529831573, + "grad_norm": 1.4817143678665161, + "learning_rate": 4.8983216840164804e-05, + "loss": 5.2354, + "step": 15318 + }, + { + "epoch": 0.09110643258159673, + "grad_norm": 1.442088007926941, + "learning_rate": 4.898308497802796e-05, + "loss": 5.2177, + "step": 15319 + }, + { + "epoch": 0.09111237986487772, + "grad_norm": 1.3996042013168335, + "learning_rate": 4.898295310751887e-05, + "loss": 4.9938, + "step": 15320 + }, + { + "epoch": 0.09111832714815872, + "grad_norm": 1.3091521263122559, + "learning_rate": 4.8982821228637576e-05, + "loss": 4.9916, + "step": 15321 + }, + { + "epoch": 0.09112427443143972, + "grad_norm": 1.4807448387145996, + "learning_rate": 4.898268934138414e-05, + "loss": 4.9833, + "step": 15322 + }, + { + "epoch": 0.09113022171472071, + "grad_norm": 1.5992671251296997, + "learning_rate": 4.898255744575858e-05, + "loss": 5.1007, + "step": 15323 + }, + { + "epoch": 0.09113616899800171, + "grad_norm": 1.4472523927688599, + "learning_rate": 4.8982425541760954e-05, + "loss": 5.3123, + "step": 15324 + }, + { + "epoch": 0.09114211628128271, + "grad_norm": 1.2865816354751587, + "learning_rate": 4.898229362939132e-05, + "loss": 5.0817, + "step": 15325 + }, + { + "epoch": 0.0911480635645637, + "grad_norm": 1.477144479751587, + "learning_rate": 4.898216170864972e-05, + "loss": 5.1819, + "step": 15326 + }, + { + "epoch": 0.0911540108478447, + "grad_norm": 1.5831303596496582, + "learning_rate": 4.8982029779536184e-05, + "loss": 5.28, + "step": 15327 + }, + { + "epoch": 0.0911599581311257, + "grad_norm": 1.3366963863372803, + "learning_rate": 4.898189784205078e-05, + "loss": 5.3715, + "step": 15328 + }, + { + "epoch": 0.0911659054144067, + "grad_norm": 1.5603365898132324, + "learning_rate": 4.898176589619353e-05, + "loss": 5.2642, + "step": 15329 + }, + { + "epoch": 0.0911718526976877, + "grad_norm": 1.5105326175689697, + "learning_rate": 4.8981633941964506e-05, + "loss": 4.949, + "step": 15330 + }, + { + "epoch": 0.0911777999809687, + "grad_norm": 1.2074800729751587, + "learning_rate": 4.8981501979363734e-05, + "loss": 5.2847, + "step": 15331 + }, + { + "epoch": 0.09118374726424969, + "grad_norm": 1.4356200695037842, + "learning_rate": 4.898137000839127e-05, + "loss": 5.6169, + "step": 15332 + }, + { + "epoch": 0.09118969454753069, + "grad_norm": 1.5015919208526611, + "learning_rate": 4.8981238029047154e-05, + "loss": 5.1135, + "step": 15333 + }, + { + "epoch": 0.09119564183081169, + "grad_norm": 1.4902187585830688, + "learning_rate": 4.8981106041331434e-05, + "loss": 5.4406, + "step": 15334 + }, + { + "epoch": 0.09120158911409268, + "grad_norm": 1.2884581089019775, + "learning_rate": 4.898097404524416e-05, + "loss": 5.3493, + "step": 15335 + }, + { + "epoch": 0.09120753639737368, + "grad_norm": 1.4323054552078247, + "learning_rate": 4.898084204078539e-05, + "loss": 5.0939, + "step": 15336 + }, + { + "epoch": 0.09121348368065468, + "grad_norm": 1.6282861232757568, + "learning_rate": 4.898071002795514e-05, + "loss": 5.1857, + "step": 15337 + }, + { + "epoch": 0.09121943096393567, + "grad_norm": 1.3413678407669067, + "learning_rate": 4.898057800675347e-05, + "loss": 4.9581, + "step": 15338 + }, + { + "epoch": 0.09122537824721667, + "grad_norm": 1.5613822937011719, + "learning_rate": 4.898044597718044e-05, + "loss": 4.6401, + "step": 15339 + }, + { + "epoch": 0.09123132553049768, + "grad_norm": 1.4945799112319946, + "learning_rate": 4.898031393923608e-05, + "loss": 4.6649, + "step": 15340 + }, + { + "epoch": 0.09123727281377866, + "grad_norm": 1.6086750030517578, + "learning_rate": 4.898018189292043e-05, + "loss": 4.5996, + "step": 15341 + }, + { + "epoch": 0.09124322009705967, + "grad_norm": 1.3530272245407104, + "learning_rate": 4.898004983823355e-05, + "loss": 4.6511, + "step": 15342 + }, + { + "epoch": 0.09124916738034065, + "grad_norm": 1.5523587465286255, + "learning_rate": 4.897991777517549e-05, + "loss": 4.8099, + "step": 15343 + }, + { + "epoch": 0.09125511466362166, + "grad_norm": 1.6695882081985474, + "learning_rate": 4.8979785703746286e-05, + "loss": 5.2371, + "step": 15344 + }, + { + "epoch": 0.09126106194690266, + "grad_norm": 1.777717113494873, + "learning_rate": 4.897965362394599e-05, + "loss": 5.373, + "step": 15345 + }, + { + "epoch": 0.09126700923018365, + "grad_norm": 1.2890517711639404, + "learning_rate": 4.8979521535774636e-05, + "loss": 5.3851, + "step": 15346 + }, + { + "epoch": 0.09127295651346465, + "grad_norm": 1.3539687395095825, + "learning_rate": 4.897938943923228e-05, + "loss": 5.1218, + "step": 15347 + }, + { + "epoch": 0.09127890379674565, + "grad_norm": 1.4157010316848755, + "learning_rate": 4.8979257334318974e-05, + "loss": 4.9411, + "step": 15348 + }, + { + "epoch": 0.09128485108002664, + "grad_norm": 1.4856256246566772, + "learning_rate": 4.897912522103475e-05, + "loss": 5.1622, + "step": 15349 + }, + { + "epoch": 0.09129079836330764, + "grad_norm": 1.4729665517807007, + "learning_rate": 4.8978993099379666e-05, + "loss": 5.0901, + "step": 15350 + }, + { + "epoch": 0.09129674564658864, + "grad_norm": 1.376625895500183, + "learning_rate": 4.897886096935376e-05, + "loss": 4.8843, + "step": 15351 + }, + { + "epoch": 0.09130269292986963, + "grad_norm": 1.3019710779190063, + "learning_rate": 4.897872883095708e-05, + "loss": 4.9956, + "step": 15352 + }, + { + "epoch": 0.09130864021315063, + "grad_norm": 1.4751423597335815, + "learning_rate": 4.897859668418968e-05, + "loss": 5.4369, + "step": 15353 + }, + { + "epoch": 0.09131458749643163, + "grad_norm": 1.3563402891159058, + "learning_rate": 4.8978464529051595e-05, + "loss": 5.2071, + "step": 15354 + }, + { + "epoch": 0.09132053477971262, + "grad_norm": 1.7365561723709106, + "learning_rate": 4.8978332365542875e-05, + "loss": 4.8797, + "step": 15355 + }, + { + "epoch": 0.09132648206299362, + "grad_norm": 1.4001792669296265, + "learning_rate": 4.8978200193663565e-05, + "loss": 5.2549, + "step": 15356 + }, + { + "epoch": 0.09133242934627463, + "grad_norm": 1.5568649768829346, + "learning_rate": 4.897806801341371e-05, + "loss": 5.3805, + "step": 15357 + }, + { + "epoch": 0.09133837662955561, + "grad_norm": 1.4169847965240479, + "learning_rate": 4.897793582479337e-05, + "loss": 5.2655, + "step": 15358 + }, + { + "epoch": 0.09134432391283662, + "grad_norm": 1.3992067575454712, + "learning_rate": 4.897780362780258e-05, + "loss": 5.4284, + "step": 15359 + }, + { + "epoch": 0.09135027119611762, + "grad_norm": 1.2274264097213745, + "learning_rate": 4.8977671422441376e-05, + "loss": 5.2443, + "step": 15360 + }, + { + "epoch": 0.09135621847939861, + "grad_norm": 1.4754104614257812, + "learning_rate": 4.897753920870982e-05, + "loss": 5.3438, + "step": 15361 + }, + { + "epoch": 0.09136216576267961, + "grad_norm": 1.3993452787399292, + "learning_rate": 4.897740698660796e-05, + "loss": 5.2396, + "step": 15362 + }, + { + "epoch": 0.09136811304596061, + "grad_norm": 1.2840338945388794, + "learning_rate": 4.897727475613583e-05, + "loss": 5.2912, + "step": 15363 + }, + { + "epoch": 0.0913740603292416, + "grad_norm": 1.5234180688858032, + "learning_rate": 4.8977142517293474e-05, + "loss": 5.4197, + "step": 15364 + }, + { + "epoch": 0.0913800076125226, + "grad_norm": 1.6243525743484497, + "learning_rate": 4.897701027008095e-05, + "loss": 5.4358, + "step": 15365 + }, + { + "epoch": 0.0913859548958036, + "grad_norm": 1.277801513671875, + "learning_rate": 4.8976878014498306e-05, + "loss": 5.2801, + "step": 15366 + }, + { + "epoch": 0.09139190217908459, + "grad_norm": 1.5294082164764404, + "learning_rate": 4.897674575054557e-05, + "loss": 4.8257, + "step": 15367 + }, + { + "epoch": 0.0913978494623656, + "grad_norm": 1.7289122343063354, + "learning_rate": 4.897661347822281e-05, + "loss": 4.8155, + "step": 15368 + }, + { + "epoch": 0.0914037967456466, + "grad_norm": 1.5567346811294556, + "learning_rate": 4.897648119753006e-05, + "loss": 4.8245, + "step": 15369 + }, + { + "epoch": 0.09140974402892758, + "grad_norm": 1.4855397939682007, + "learning_rate": 4.8976348908467365e-05, + "loss": 4.7247, + "step": 15370 + }, + { + "epoch": 0.09141569131220859, + "grad_norm": 1.4355418682098389, + "learning_rate": 4.897621661103477e-05, + "loss": 5.0925, + "step": 15371 + }, + { + "epoch": 0.09142163859548957, + "grad_norm": 1.3165326118469238, + "learning_rate": 4.897608430523233e-05, + "loss": 5.3419, + "step": 15372 + }, + { + "epoch": 0.09142758587877058, + "grad_norm": 1.4930912256240845, + "learning_rate": 4.8975951991060084e-05, + "loss": 5.3267, + "step": 15373 + }, + { + "epoch": 0.09143353316205158, + "grad_norm": 1.2326771020889282, + "learning_rate": 4.897581966851809e-05, + "loss": 5.2902, + "step": 15374 + }, + { + "epoch": 0.09143948044533257, + "grad_norm": 1.1512086391448975, + "learning_rate": 4.897568733760638e-05, + "loss": 5.2362, + "step": 15375 + }, + { + "epoch": 0.09144542772861357, + "grad_norm": 2.2404119968414307, + "learning_rate": 4.8975554998325e-05, + "loss": 5.055, + "step": 15376 + }, + { + "epoch": 0.09145137501189457, + "grad_norm": 1.3026318550109863, + "learning_rate": 4.8975422650674005e-05, + "loss": 5.0192, + "step": 15377 + }, + { + "epoch": 0.09145732229517556, + "grad_norm": 1.5808472633361816, + "learning_rate": 4.897529029465344e-05, + "loss": 5.2429, + "step": 15378 + }, + { + "epoch": 0.09146326957845656, + "grad_norm": 1.5761525630950928, + "learning_rate": 4.897515793026335e-05, + "loss": 4.9123, + "step": 15379 + }, + { + "epoch": 0.09146921686173756, + "grad_norm": 1.488484501838684, + "learning_rate": 4.897502555750377e-05, + "loss": 4.8463, + "step": 15380 + }, + { + "epoch": 0.09147516414501855, + "grad_norm": 1.4662736654281616, + "learning_rate": 4.897489317637477e-05, + "loss": 5.3047, + "step": 15381 + }, + { + "epoch": 0.09148111142829955, + "grad_norm": 1.6454370021820068, + "learning_rate": 4.897476078687637e-05, + "loss": 5.2335, + "step": 15382 + }, + { + "epoch": 0.09148705871158055, + "grad_norm": 1.425868034362793, + "learning_rate": 4.8974628389008636e-05, + "loss": 5.2016, + "step": 15383 + }, + { + "epoch": 0.09149300599486154, + "grad_norm": 1.599349021911621, + "learning_rate": 4.8974495982771606e-05, + "loss": 5.4205, + "step": 15384 + }, + { + "epoch": 0.09149895327814254, + "grad_norm": 1.6200257539749146, + "learning_rate": 4.897436356816533e-05, + "loss": 5.5001, + "step": 15385 + }, + { + "epoch": 0.09150490056142355, + "grad_norm": 1.5314574241638184, + "learning_rate": 4.8974231145189844e-05, + "loss": 5.4711, + "step": 15386 + }, + { + "epoch": 0.09151084784470453, + "grad_norm": 1.507489562034607, + "learning_rate": 4.8974098713845206e-05, + "loss": 5.4001, + "step": 15387 + }, + { + "epoch": 0.09151679512798554, + "grad_norm": 1.4561303853988647, + "learning_rate": 4.897396627413146e-05, + "loss": 5.4566, + "step": 15388 + }, + { + "epoch": 0.09152274241126654, + "grad_norm": 1.3273184299468994, + "learning_rate": 4.897383382604865e-05, + "loss": 5.4665, + "step": 15389 + }, + { + "epoch": 0.09152868969454753, + "grad_norm": 1.370138168334961, + "learning_rate": 4.8973701369596814e-05, + "loss": 5.4319, + "step": 15390 + }, + { + "epoch": 0.09153463697782853, + "grad_norm": 1.4831699132919312, + "learning_rate": 4.897356890477601e-05, + "loss": 5.2734, + "step": 15391 + }, + { + "epoch": 0.09154058426110953, + "grad_norm": 1.3152328729629517, + "learning_rate": 4.897343643158629e-05, + "loss": 5.3573, + "step": 15392 + }, + { + "epoch": 0.09154653154439052, + "grad_norm": 1.635460376739502, + "learning_rate": 4.8973303950027684e-05, + "loss": 5.2433, + "step": 15393 + }, + { + "epoch": 0.09155247882767152, + "grad_norm": 1.5252761840820312, + "learning_rate": 4.897317146010024e-05, + "loss": 5.2164, + "step": 15394 + }, + { + "epoch": 0.09155842611095252, + "grad_norm": 1.600043773651123, + "learning_rate": 4.897303896180402e-05, + "loss": 5.4138, + "step": 15395 + }, + { + "epoch": 0.09156437339423351, + "grad_norm": 1.6243258714675903, + "learning_rate": 4.8972906455139056e-05, + "loss": 5.6129, + "step": 15396 + }, + { + "epoch": 0.09157032067751451, + "grad_norm": 1.2726150751113892, + "learning_rate": 4.89727739401054e-05, + "loss": 5.4639, + "step": 15397 + }, + { + "epoch": 0.09157626796079552, + "grad_norm": 2.1045331954956055, + "learning_rate": 4.897264141670309e-05, + "loss": 5.1875, + "step": 15398 + }, + { + "epoch": 0.0915822152440765, + "grad_norm": 2.1204488277435303, + "learning_rate": 4.897250888493218e-05, + "loss": 5.0401, + "step": 15399 + }, + { + "epoch": 0.0915881625273575, + "grad_norm": 1.794190526008606, + "learning_rate": 4.8972376344792716e-05, + "loss": 6.0581, + "step": 15400 + }, + { + "epoch": 0.0915941098106385, + "grad_norm": 2.050788402557373, + "learning_rate": 4.8972243796284746e-05, + "loss": 5.0138, + "step": 15401 + }, + { + "epoch": 0.0916000570939195, + "grad_norm": 2.1165850162506104, + "learning_rate": 4.897211123940831e-05, + "loss": 4.7077, + "step": 15402 + }, + { + "epoch": 0.0916060043772005, + "grad_norm": 1.9797117710113525, + "learning_rate": 4.8971978674163455e-05, + "loss": 4.8248, + "step": 15403 + }, + { + "epoch": 0.09161195166048149, + "grad_norm": 1.922232747077942, + "learning_rate": 4.8971846100550234e-05, + "loss": 4.7655, + "step": 15404 + }, + { + "epoch": 0.09161789894376249, + "grad_norm": 1.7310322523117065, + "learning_rate": 4.897171351856869e-05, + "loss": 5.425, + "step": 15405 + }, + { + "epoch": 0.09162384622704349, + "grad_norm": 1.9186078310012817, + "learning_rate": 4.897158092821887e-05, + "loss": 6.2449, + "step": 15406 + }, + { + "epoch": 0.09162979351032448, + "grad_norm": 1.7470628023147583, + "learning_rate": 4.897144832950081e-05, + "loss": 6.1586, + "step": 15407 + }, + { + "epoch": 0.09163574079360548, + "grad_norm": 1.7828420400619507, + "learning_rate": 4.897131572241457e-05, + "loss": 6.1068, + "step": 15408 + }, + { + "epoch": 0.09164168807688648, + "grad_norm": 1.8831984996795654, + "learning_rate": 4.897118310696019e-05, + "loss": 5.6989, + "step": 15409 + }, + { + "epoch": 0.09164763536016747, + "grad_norm": 1.6138192415237427, + "learning_rate": 4.8971050483137726e-05, + "loss": 5.8222, + "step": 15410 + }, + { + "epoch": 0.09165358264344847, + "grad_norm": 1.6921756267547607, + "learning_rate": 4.897091785094721e-05, + "loss": 5.8559, + "step": 15411 + }, + { + "epoch": 0.09165952992672947, + "grad_norm": 2.007937431335449, + "learning_rate": 4.8970785210388694e-05, + "loss": 5.4523, + "step": 15412 + }, + { + "epoch": 0.09166547721001046, + "grad_norm": 1.8820117712020874, + "learning_rate": 4.8970652561462224e-05, + "loss": 5.6293, + "step": 15413 + }, + { + "epoch": 0.09167142449329146, + "grad_norm": 2.0193300247192383, + "learning_rate": 4.897051990416785e-05, + "loss": 5.8481, + "step": 15414 + }, + { + "epoch": 0.09167737177657247, + "grad_norm": 2.3685405254364014, + "learning_rate": 4.897038723850561e-05, + "loss": 6.2884, + "step": 15415 + }, + { + "epoch": 0.09168331905985345, + "grad_norm": 2.001131534576416, + "learning_rate": 4.897025456447556e-05, + "loss": 5.6747, + "step": 15416 + }, + { + "epoch": 0.09168926634313446, + "grad_norm": 1.9729053974151611, + "learning_rate": 4.897012188207774e-05, + "loss": 5.9019, + "step": 15417 + }, + { + "epoch": 0.09169521362641546, + "grad_norm": 1.7620398998260498, + "learning_rate": 4.896998919131219e-05, + "loss": 5.9498, + "step": 15418 + }, + { + "epoch": 0.09170116090969645, + "grad_norm": 1.6993772983551025, + "learning_rate": 4.896985649217898e-05, + "loss": 5.973, + "step": 15419 + }, + { + "epoch": 0.09170710819297745, + "grad_norm": 1.6905665397644043, + "learning_rate": 4.896972378467813e-05, + "loss": 5.9729, + "step": 15420 + }, + { + "epoch": 0.09171305547625845, + "grad_norm": 1.710838794708252, + "learning_rate": 4.8969591068809706e-05, + "loss": 5.6661, + "step": 15421 + }, + { + "epoch": 0.09171900275953944, + "grad_norm": 1.9235612154006958, + "learning_rate": 4.896945834457374e-05, + "loss": 5.38, + "step": 15422 + }, + { + "epoch": 0.09172495004282044, + "grad_norm": 2.360656976699829, + "learning_rate": 4.896932561197028e-05, + "loss": 5.2199, + "step": 15423 + }, + { + "epoch": 0.09173089732610144, + "grad_norm": 2.403338670730591, + "learning_rate": 4.896919287099938e-05, + "loss": 5.1776, + "step": 15424 + }, + { + "epoch": 0.09173684460938243, + "grad_norm": 1.9474782943725586, + "learning_rate": 4.896906012166108e-05, + "loss": 5.0781, + "step": 15425 + }, + { + "epoch": 0.09174279189266343, + "grad_norm": 1.8974144458770752, + "learning_rate": 4.896892736395543e-05, + "loss": 5.1609, + "step": 15426 + }, + { + "epoch": 0.09174873917594444, + "grad_norm": 2.3854262828826904, + "learning_rate": 4.896879459788247e-05, + "loss": 5.2019, + "step": 15427 + }, + { + "epoch": 0.09175468645922542, + "grad_norm": 2.4181137084960938, + "learning_rate": 4.8968661823442264e-05, + "loss": 5.1216, + "step": 15428 + }, + { + "epoch": 0.09176063374250643, + "grad_norm": 2.266355514526367, + "learning_rate": 4.896852904063484e-05, + "loss": 5.0401, + "step": 15429 + }, + { + "epoch": 0.09176658102578741, + "grad_norm": 2.086296558380127, + "learning_rate": 4.896839624946025e-05, + "loss": 4.8601, + "step": 15430 + }, + { + "epoch": 0.09177252830906842, + "grad_norm": 1.943326473236084, + "learning_rate": 4.896826344991854e-05, + "loss": 4.9978, + "step": 15431 + }, + { + "epoch": 0.09177847559234942, + "grad_norm": 2.0165631771087646, + "learning_rate": 4.896813064200975e-05, + "loss": 5.0379, + "step": 15432 + }, + { + "epoch": 0.0917844228756304, + "grad_norm": 1.7142544984817505, + "learning_rate": 4.896799782573394e-05, + "loss": 5.7101, + "step": 15433 + }, + { + "epoch": 0.09179037015891141, + "grad_norm": 1.9000083208084106, + "learning_rate": 4.896786500109115e-05, + "loss": 5.9536, + "step": 15434 + }, + { + "epoch": 0.09179631744219241, + "grad_norm": 1.6976677179336548, + "learning_rate": 4.8967732168081426e-05, + "loss": 5.4408, + "step": 15435 + }, + { + "epoch": 0.0918022647254734, + "grad_norm": 1.7433068752288818, + "learning_rate": 4.8967599326704815e-05, + "loss": 5.831, + "step": 15436 + }, + { + "epoch": 0.0918082120087544, + "grad_norm": 1.484256625175476, + "learning_rate": 4.896746647696136e-05, + "loss": 5.943, + "step": 15437 + }, + { + "epoch": 0.0918141592920354, + "grad_norm": 2.2480883598327637, + "learning_rate": 4.8967333618851106e-05, + "loss": 5.6634, + "step": 15438 + }, + { + "epoch": 0.09182010657531639, + "grad_norm": 1.3530383110046387, + "learning_rate": 4.896720075237411e-05, + "loss": 5.8981, + "step": 15439 + }, + { + "epoch": 0.09182605385859739, + "grad_norm": 1.451636552810669, + "learning_rate": 4.896706787753041e-05, + "loss": 5.9803, + "step": 15440 + }, + { + "epoch": 0.0918320011418784, + "grad_norm": 1.5904042720794678, + "learning_rate": 4.896693499432006e-05, + "loss": 5.9692, + "step": 15441 + }, + { + "epoch": 0.09183794842515938, + "grad_norm": 1.3971885442733765, + "learning_rate": 4.896680210274309e-05, + "loss": 5.8612, + "step": 15442 + }, + { + "epoch": 0.09184389570844038, + "grad_norm": 1.325842022895813, + "learning_rate": 4.8966669202799564e-05, + "loss": 5.9081, + "step": 15443 + }, + { + "epoch": 0.09184984299172139, + "grad_norm": 1.4639033079147339, + "learning_rate": 4.8966536294489515e-05, + "loss": 5.8395, + "step": 15444 + }, + { + "epoch": 0.09185579027500237, + "grad_norm": 1.248425006866455, + "learning_rate": 4.896640337781301e-05, + "loss": 5.9016, + "step": 15445 + }, + { + "epoch": 0.09186173755828338, + "grad_norm": 1.4250134229660034, + "learning_rate": 4.896627045277007e-05, + "loss": 5.815, + "step": 15446 + }, + { + "epoch": 0.09186768484156438, + "grad_norm": 1.9178589582443237, + "learning_rate": 4.896613751936075e-05, + "loss": 5.9092, + "step": 15447 + }, + { + "epoch": 0.09187363212484537, + "grad_norm": 1.9218472242355347, + "learning_rate": 4.896600457758511e-05, + "loss": 5.7151, + "step": 15448 + }, + { + "epoch": 0.09187957940812637, + "grad_norm": 1.7698949575424194, + "learning_rate": 4.896587162744317e-05, + "loss": 5.709, + "step": 15449 + }, + { + "epoch": 0.09188552669140737, + "grad_norm": 2.5047290325164795, + "learning_rate": 4.8965738668935e-05, + "loss": 5.5417, + "step": 15450 + }, + { + "epoch": 0.09189147397468836, + "grad_norm": 1.9855560064315796, + "learning_rate": 4.896560570206065e-05, + "loss": 5.9572, + "step": 15451 + }, + { + "epoch": 0.09189742125796936, + "grad_norm": 1.8577516078948975, + "learning_rate": 4.896547272682014e-05, + "loss": 4.8775, + "step": 15452 + }, + { + "epoch": 0.09190336854125036, + "grad_norm": 1.8830385208129883, + "learning_rate": 4.896533974321353e-05, + "loss": 4.8617, + "step": 15453 + }, + { + "epoch": 0.09190931582453135, + "grad_norm": 1.5114052295684814, + "learning_rate": 4.896520675124087e-05, + "loss": 4.9485, + "step": 15454 + }, + { + "epoch": 0.09191526310781235, + "grad_norm": 1.6233285665512085, + "learning_rate": 4.8965073750902205e-05, + "loss": 5.1098, + "step": 15455 + }, + { + "epoch": 0.09192121039109336, + "grad_norm": 1.6900150775909424, + "learning_rate": 4.896494074219758e-05, + "loss": 6.025, + "step": 15456 + }, + { + "epoch": 0.09192715767437434, + "grad_norm": 1.3984570503234863, + "learning_rate": 4.8964807725127046e-05, + "loss": 5.888, + "step": 15457 + }, + { + "epoch": 0.09193310495765535, + "grad_norm": 1.7069528102874756, + "learning_rate": 4.896467469969064e-05, + "loss": 5.6435, + "step": 15458 + }, + { + "epoch": 0.09193905224093633, + "grad_norm": 1.641513705253601, + "learning_rate": 4.896454166588842e-05, + "loss": 5.5641, + "step": 15459 + }, + { + "epoch": 0.09194499952421734, + "grad_norm": 1.8448737859725952, + "learning_rate": 4.896440862372042e-05, + "loss": 5.5673, + "step": 15460 + }, + { + "epoch": 0.09195094680749834, + "grad_norm": 1.7696945667266846, + "learning_rate": 4.8964275573186694e-05, + "loss": 5.4383, + "step": 15461 + }, + { + "epoch": 0.09195689409077933, + "grad_norm": 2.7951743602752686, + "learning_rate": 4.8964142514287285e-05, + "loss": 4.2996, + "step": 15462 + }, + { + "epoch": 0.09196284137406033, + "grad_norm": 2.5503883361816406, + "learning_rate": 4.8964009447022246e-05, + "loss": 4.2864, + "step": 15463 + }, + { + "epoch": 0.09196878865734133, + "grad_norm": 2.2069225311279297, + "learning_rate": 4.896387637139161e-05, + "loss": 4.3818, + "step": 15464 + }, + { + "epoch": 0.09197473594062232, + "grad_norm": 2.34734845161438, + "learning_rate": 4.8963743287395444e-05, + "loss": 4.2951, + "step": 15465 + }, + { + "epoch": 0.09198068322390332, + "grad_norm": 2.2955567836761475, + "learning_rate": 4.896361019503378e-05, + "loss": 4.3349, + "step": 15466 + }, + { + "epoch": 0.09198663050718432, + "grad_norm": 2.3519480228424072, + "learning_rate": 4.8963477094306666e-05, + "loss": 4.2685, + "step": 15467 + }, + { + "epoch": 0.09199257779046531, + "grad_norm": 2.3862032890319824, + "learning_rate": 4.896334398521415e-05, + "loss": 4.1333, + "step": 15468 + }, + { + "epoch": 0.09199852507374631, + "grad_norm": 2.1290738582611084, + "learning_rate": 4.896321086775627e-05, + "loss": 4.7918, + "step": 15469 + }, + { + "epoch": 0.09200447235702731, + "grad_norm": 2.2130253314971924, + "learning_rate": 4.8963077741933095e-05, + "loss": 5.208, + "step": 15470 + }, + { + "epoch": 0.0920104196403083, + "grad_norm": 2.063810110092163, + "learning_rate": 4.896294460774464e-05, + "loss": 5.1891, + "step": 15471 + }, + { + "epoch": 0.0920163669235893, + "grad_norm": 2.068791627883911, + "learning_rate": 4.8962811465190984e-05, + "loss": 5.2855, + "step": 15472 + }, + { + "epoch": 0.0920223142068703, + "grad_norm": 1.8504056930541992, + "learning_rate": 4.896267831427215e-05, + "loss": 5.0159, + "step": 15473 + }, + { + "epoch": 0.0920282614901513, + "grad_norm": 2.150820255279541, + "learning_rate": 4.89625451549882e-05, + "loss": 5.7728, + "step": 15474 + }, + { + "epoch": 0.0920342087734323, + "grad_norm": 2.3655643463134766, + "learning_rate": 4.8962411987339165e-05, + "loss": 5.4863, + "step": 15475 + }, + { + "epoch": 0.0920401560567133, + "grad_norm": 1.509820818901062, + "learning_rate": 4.8962278811325105e-05, + "loss": 5.5682, + "step": 15476 + }, + { + "epoch": 0.09204610333999429, + "grad_norm": 1.8581949472427368, + "learning_rate": 4.896214562694605e-05, + "loss": 5.6875, + "step": 15477 + }, + { + "epoch": 0.09205205062327529, + "grad_norm": 2.028116464614868, + "learning_rate": 4.8962012434202075e-05, + "loss": 5.3495, + "step": 15478 + }, + { + "epoch": 0.09205799790655629, + "grad_norm": 1.9395058155059814, + "learning_rate": 4.89618792330932e-05, + "loss": 5.5616, + "step": 15479 + }, + { + "epoch": 0.09206394518983728, + "grad_norm": 1.9281854629516602, + "learning_rate": 4.896174602361948e-05, + "loss": 5.6449, + "step": 15480 + }, + { + "epoch": 0.09206989247311828, + "grad_norm": 1.7750074863433838, + "learning_rate": 4.896161280578097e-05, + "loss": 5.1178, + "step": 15481 + }, + { + "epoch": 0.09207583975639928, + "grad_norm": 2.0160205364227295, + "learning_rate": 4.89614795795777e-05, + "loss": 5.4698, + "step": 15482 + }, + { + "epoch": 0.09208178703968027, + "grad_norm": 2.0041770935058594, + "learning_rate": 4.896134634500972e-05, + "loss": 4.6989, + "step": 15483 + }, + { + "epoch": 0.09208773432296127, + "grad_norm": 1.9916999340057373, + "learning_rate": 4.896121310207708e-05, + "loss": 4.6296, + "step": 15484 + }, + { + "epoch": 0.09209368160624228, + "grad_norm": 1.62458336353302, + "learning_rate": 4.8961079850779845e-05, + "loss": 5.1147, + "step": 15485 + }, + { + "epoch": 0.09209962888952326, + "grad_norm": 1.8349764347076416, + "learning_rate": 4.8960946591118036e-05, + "loss": 5.3646, + "step": 15486 + }, + { + "epoch": 0.09210557617280427, + "grad_norm": 2.0250589847564697, + "learning_rate": 4.89608133230917e-05, + "loss": 5.7467, + "step": 15487 + }, + { + "epoch": 0.09211152345608525, + "grad_norm": 1.8945664167404175, + "learning_rate": 4.89606800467009e-05, + "loss": 5.5526, + "step": 15488 + }, + { + "epoch": 0.09211747073936626, + "grad_norm": 2.1056711673736572, + "learning_rate": 4.896054676194568e-05, + "loss": 4.8553, + "step": 15489 + }, + { + "epoch": 0.09212341802264726, + "grad_norm": 2.0394606590270996, + "learning_rate": 4.896041346882607e-05, + "loss": 5.4427, + "step": 15490 + }, + { + "epoch": 0.09212936530592825, + "grad_norm": 2.3078689575195312, + "learning_rate": 4.896028016734213e-05, + "loss": 5.3668, + "step": 15491 + }, + { + "epoch": 0.09213531258920925, + "grad_norm": 2.1227409839630127, + "learning_rate": 4.8960146857493904e-05, + "loss": 5.6314, + "step": 15492 + }, + { + "epoch": 0.09214125987249025, + "grad_norm": 2.156165838241577, + "learning_rate": 4.896001353928144e-05, + "loss": 5.5088, + "step": 15493 + }, + { + "epoch": 0.09214720715577124, + "grad_norm": 1.8915730714797974, + "learning_rate": 4.895988021270478e-05, + "loss": 5.5636, + "step": 15494 + }, + { + "epoch": 0.09215315443905224, + "grad_norm": 1.8041549921035767, + "learning_rate": 4.895974687776398e-05, + "loss": 5.5213, + "step": 15495 + }, + { + "epoch": 0.09215910172233324, + "grad_norm": 1.8982187509536743, + "learning_rate": 4.8959613534459074e-05, + "loss": 5.7038, + "step": 15496 + }, + { + "epoch": 0.09216504900561423, + "grad_norm": 1.9235600233078003, + "learning_rate": 4.895948018279012e-05, + "loss": 5.514, + "step": 15497 + }, + { + "epoch": 0.09217099628889523, + "grad_norm": 2.284212112426758, + "learning_rate": 4.895934682275715e-05, + "loss": 5.4624, + "step": 15498 + }, + { + "epoch": 0.09217694357217623, + "grad_norm": 2.770934820175171, + "learning_rate": 4.895921345436022e-05, + "loss": 4.7516, + "step": 15499 + }, + { + "epoch": 0.09218289085545722, + "grad_norm": 2.054158926010132, + "learning_rate": 4.895908007759939e-05, + "loss": 5.6444, + "step": 15500 + }, + { + "epoch": 0.09218883813873822, + "grad_norm": 2.352905511856079, + "learning_rate": 4.895894669247468e-05, + "loss": 4.7985, + "step": 15501 + }, + { + "epoch": 0.09219478542201923, + "grad_norm": 2.612039804458618, + "learning_rate": 4.895881329898615e-05, + "loss": 4.769, + "step": 15502 + }, + { + "epoch": 0.09220073270530021, + "grad_norm": 2.1274194717407227, + "learning_rate": 4.8958679897133854e-05, + "loss": 4.6185, + "step": 15503 + }, + { + "epoch": 0.09220667998858122, + "grad_norm": 2.2458853721618652, + "learning_rate": 4.895854648691782e-05, + "loss": 4.8576, + "step": 15504 + }, + { + "epoch": 0.09221262727186222, + "grad_norm": 2.415526866912842, + "learning_rate": 4.895841306833811e-05, + "loss": 4.999, + "step": 15505 + }, + { + "epoch": 0.0922185745551432, + "grad_norm": 1.8172876834869385, + "learning_rate": 4.8958279641394765e-05, + "loss": 5.1992, + "step": 15506 + }, + { + "epoch": 0.09222452183842421, + "grad_norm": 2.0568878650665283, + "learning_rate": 4.8958146206087826e-05, + "loss": 5.1348, + "step": 15507 + }, + { + "epoch": 0.09223046912170521, + "grad_norm": 2.152869701385498, + "learning_rate": 4.895801276241736e-05, + "loss": 4.9832, + "step": 15508 + }, + { + "epoch": 0.0922364164049862, + "grad_norm": 1.8191282749176025, + "learning_rate": 4.895787931038339e-05, + "loss": 5.3098, + "step": 15509 + }, + { + "epoch": 0.0922423636882672, + "grad_norm": 1.9511895179748535, + "learning_rate": 4.895774584998597e-05, + "loss": 5.5763, + "step": 15510 + }, + { + "epoch": 0.0922483109715482, + "grad_norm": 1.8735122680664062, + "learning_rate": 4.895761238122515e-05, + "loss": 5.3644, + "step": 15511 + }, + { + "epoch": 0.09225425825482919, + "grad_norm": 1.672721028327942, + "learning_rate": 4.895747890410098e-05, + "loss": 5.2794, + "step": 15512 + }, + { + "epoch": 0.0922602055381102, + "grad_norm": 1.5318527221679688, + "learning_rate": 4.89573454186135e-05, + "loss": 5.3575, + "step": 15513 + }, + { + "epoch": 0.0922661528213912, + "grad_norm": 1.8192704916000366, + "learning_rate": 4.895721192476275e-05, + "loss": 5.498, + "step": 15514 + }, + { + "epoch": 0.09227210010467218, + "grad_norm": 1.948249340057373, + "learning_rate": 4.895707842254879e-05, + "loss": 5.6955, + "step": 15515 + }, + { + "epoch": 0.09227804738795319, + "grad_norm": 2.1378414630889893, + "learning_rate": 4.895694491197166e-05, + "loss": 5.4999, + "step": 15516 + }, + { + "epoch": 0.09228399467123417, + "grad_norm": 2.057358980178833, + "learning_rate": 4.8956811393031414e-05, + "loss": 4.7234, + "step": 15517 + }, + { + "epoch": 0.09228994195451518, + "grad_norm": 1.9550749063491821, + "learning_rate": 4.895667786572809e-05, + "loss": 5.7611, + "step": 15518 + }, + { + "epoch": 0.09229588923779618, + "grad_norm": 2.120396852493286, + "learning_rate": 4.8956544330061734e-05, + "loss": 5.8707, + "step": 15519 + }, + { + "epoch": 0.09230183652107717, + "grad_norm": 1.8432284593582153, + "learning_rate": 4.8956410786032404e-05, + "loss": 5.7512, + "step": 15520 + }, + { + "epoch": 0.09230778380435817, + "grad_norm": 1.738993525505066, + "learning_rate": 4.895627723364013e-05, + "loss": 5.2099, + "step": 15521 + }, + { + "epoch": 0.09231373108763917, + "grad_norm": 1.4885916709899902, + "learning_rate": 4.895614367288497e-05, + "loss": 5.6817, + "step": 15522 + }, + { + "epoch": 0.09231967837092016, + "grad_norm": 1.9712351560592651, + "learning_rate": 4.895601010376697e-05, + "loss": 5.4247, + "step": 15523 + }, + { + "epoch": 0.09232562565420116, + "grad_norm": 1.6669690608978271, + "learning_rate": 4.895587652628617e-05, + "loss": 5.2189, + "step": 15524 + }, + { + "epoch": 0.09233157293748216, + "grad_norm": 2.1034297943115234, + "learning_rate": 4.895574294044262e-05, + "loss": 5.4772, + "step": 15525 + }, + { + "epoch": 0.09233752022076315, + "grad_norm": 2.3692588806152344, + "learning_rate": 4.895560934623637e-05, + "loss": 5.002, + "step": 15526 + }, + { + "epoch": 0.09234346750404415, + "grad_norm": 2.708406686782837, + "learning_rate": 4.8955475743667464e-05, + "loss": 4.9923, + "step": 15527 + }, + { + "epoch": 0.09234941478732515, + "grad_norm": 2.4986281394958496, + "learning_rate": 4.895534213273595e-05, + "loss": 4.7859, + "step": 15528 + }, + { + "epoch": 0.09235536207060614, + "grad_norm": 2.4715240001678467, + "learning_rate": 4.895520851344187e-05, + "loss": 5.2135, + "step": 15529 + }, + { + "epoch": 0.09236130935388714, + "grad_norm": 1.77085280418396, + "learning_rate": 4.895507488578528e-05, + "loss": 5.4675, + "step": 15530 + }, + { + "epoch": 0.09236725663716815, + "grad_norm": 1.4845975637435913, + "learning_rate": 4.8954941249766225e-05, + "loss": 5.8627, + "step": 15531 + }, + { + "epoch": 0.09237320392044913, + "grad_norm": 2.0753140449523926, + "learning_rate": 4.8954807605384734e-05, + "loss": 5.8246, + "step": 15532 + }, + { + "epoch": 0.09237915120373014, + "grad_norm": 1.5671929121017456, + "learning_rate": 4.895467395264088e-05, + "loss": 5.8189, + "step": 15533 + }, + { + "epoch": 0.09238509848701114, + "grad_norm": 1.749223232269287, + "learning_rate": 4.895454029153469e-05, + "loss": 5.9183, + "step": 15534 + }, + { + "epoch": 0.09239104577029213, + "grad_norm": 1.7186611890792847, + "learning_rate": 4.895440662206622e-05, + "loss": 5.84, + "step": 15535 + }, + { + "epoch": 0.09239699305357313, + "grad_norm": 1.654483437538147, + "learning_rate": 4.895427294423551e-05, + "loss": 5.4055, + "step": 15536 + }, + { + "epoch": 0.09240294033685413, + "grad_norm": 1.7109687328338623, + "learning_rate": 4.895413925804261e-05, + "loss": 5.3028, + "step": 15537 + }, + { + "epoch": 0.09240888762013512, + "grad_norm": 1.9221105575561523, + "learning_rate": 4.895400556348757e-05, + "loss": 5.2911, + "step": 15538 + }, + { + "epoch": 0.09241483490341612, + "grad_norm": 1.9464010000228882, + "learning_rate": 4.895387186057044e-05, + "loss": 5.5883, + "step": 15539 + }, + { + "epoch": 0.09242078218669712, + "grad_norm": 1.9429137706756592, + "learning_rate": 4.8953738149291254e-05, + "loss": 5.7164, + "step": 15540 + }, + { + "epoch": 0.09242672946997811, + "grad_norm": 1.7792669534683228, + "learning_rate": 4.8953604429650065e-05, + "loss": 5.7924, + "step": 15541 + }, + { + "epoch": 0.09243267675325911, + "grad_norm": 2.2124290466308594, + "learning_rate": 4.895347070164692e-05, + "loss": 5.4432, + "step": 15542 + }, + { + "epoch": 0.09243862403654012, + "grad_norm": 1.6349585056304932, + "learning_rate": 4.8953336965281873e-05, + "loss": 5.6975, + "step": 15543 + }, + { + "epoch": 0.0924445713198211, + "grad_norm": 2.01434063911438, + "learning_rate": 4.895320322055496e-05, + "loss": 5.3564, + "step": 15544 + }, + { + "epoch": 0.0924505186031021, + "grad_norm": 1.8110109567642212, + "learning_rate": 4.895306946746623e-05, + "loss": 5.3061, + "step": 15545 + }, + { + "epoch": 0.0924564658863831, + "grad_norm": 1.6687593460083008, + "learning_rate": 4.895293570601573e-05, + "loss": 5.4061, + "step": 15546 + }, + { + "epoch": 0.0924624131696641, + "grad_norm": 1.7488101720809937, + "learning_rate": 4.895280193620351e-05, + "loss": 5.4726, + "step": 15547 + }, + { + "epoch": 0.0924683604529451, + "grad_norm": 1.9059126377105713, + "learning_rate": 4.895266815802961e-05, + "loss": 5.9665, + "step": 15548 + }, + { + "epoch": 0.09247430773622609, + "grad_norm": 1.9732307195663452, + "learning_rate": 4.8952534371494084e-05, + "loss": 6.007, + "step": 15549 + }, + { + "epoch": 0.09248025501950709, + "grad_norm": 1.792325496673584, + "learning_rate": 4.895240057659697e-05, + "loss": 5.9466, + "step": 15550 + }, + { + "epoch": 0.09248620230278809, + "grad_norm": 1.7282743453979492, + "learning_rate": 4.895226677333833e-05, + "loss": 5.456, + "step": 15551 + }, + { + "epoch": 0.09249214958606908, + "grad_norm": 1.5014616250991821, + "learning_rate": 4.89521329617182e-05, + "loss": 5.0257, + "step": 15552 + }, + { + "epoch": 0.09249809686935008, + "grad_norm": 1.5420494079589844, + "learning_rate": 4.8951999141736624e-05, + "loss": 5.0657, + "step": 15553 + }, + { + "epoch": 0.09250404415263108, + "grad_norm": 1.4273606538772583, + "learning_rate": 4.895186531339365e-05, + "loss": 5.3431, + "step": 15554 + }, + { + "epoch": 0.09250999143591207, + "grad_norm": 1.9525657892227173, + "learning_rate": 4.895173147668933e-05, + "loss": 5.514, + "step": 15555 + }, + { + "epoch": 0.09251593871919307, + "grad_norm": 2.7004175186157227, + "learning_rate": 4.895159763162371e-05, + "loss": 5.3548, + "step": 15556 + }, + { + "epoch": 0.09252188600247407, + "grad_norm": 2.5703442096710205, + "learning_rate": 4.8951463778196835e-05, + "loss": 5.4275, + "step": 15557 + }, + { + "epoch": 0.09252783328575506, + "grad_norm": 2.4033594131469727, + "learning_rate": 4.895132991640875e-05, + "loss": 5.285, + "step": 15558 + }, + { + "epoch": 0.09253378056903606, + "grad_norm": 2.0295355319976807, + "learning_rate": 4.89511960462595e-05, + "loss": 5.1196, + "step": 15559 + }, + { + "epoch": 0.09253972785231707, + "grad_norm": 2.0739188194274902, + "learning_rate": 4.895106216774914e-05, + "loss": 4.7362, + "step": 15560 + }, + { + "epoch": 0.09254567513559805, + "grad_norm": 2.2429590225219727, + "learning_rate": 4.895092828087771e-05, + "loss": 5.0749, + "step": 15561 + }, + { + "epoch": 0.09255162241887906, + "grad_norm": 1.9738318920135498, + "learning_rate": 4.895079438564526e-05, + "loss": 5.6755, + "step": 15562 + }, + { + "epoch": 0.09255756970216006, + "grad_norm": 2.692275047302246, + "learning_rate": 4.895066048205183e-05, + "loss": 5.3146, + "step": 15563 + }, + { + "epoch": 0.09256351698544105, + "grad_norm": 2.774864912033081, + "learning_rate": 4.895052657009748e-05, + "loss": 5.1116, + "step": 15564 + }, + { + "epoch": 0.09256946426872205, + "grad_norm": 2.5513851642608643, + "learning_rate": 4.895039264978224e-05, + "loss": 5.0464, + "step": 15565 + }, + { + "epoch": 0.09257541155200305, + "grad_norm": 2.2035319805145264, + "learning_rate": 4.895025872110617e-05, + "loss": 5.1499, + "step": 15566 + }, + { + "epoch": 0.09258135883528404, + "grad_norm": 1.669402837753296, + "learning_rate": 4.8950124784069305e-05, + "loss": 5.5006, + "step": 15567 + }, + { + "epoch": 0.09258730611856504, + "grad_norm": 1.9433900117874146, + "learning_rate": 4.894999083867171e-05, + "loss": 5.1423, + "step": 15568 + }, + { + "epoch": 0.09259325340184604, + "grad_norm": 2.2401936054229736, + "learning_rate": 4.8949856884913416e-05, + "loss": 4.8937, + "step": 15569 + }, + { + "epoch": 0.09259920068512703, + "grad_norm": 2.094503164291382, + "learning_rate": 4.894972292279447e-05, + "loss": 4.8554, + "step": 15570 + }, + { + "epoch": 0.09260514796840803, + "grad_norm": 2.1677212715148926, + "learning_rate": 4.894958895231493e-05, + "loss": 4.7446, + "step": 15571 + }, + { + "epoch": 0.09261109525168904, + "grad_norm": 2.0262231826782227, + "learning_rate": 4.894945497347483e-05, + "loss": 4.8282, + "step": 15572 + }, + { + "epoch": 0.09261704253497002, + "grad_norm": 1.9491705894470215, + "learning_rate": 4.894932098627423e-05, + "loss": 4.9579, + "step": 15573 + }, + { + "epoch": 0.09262298981825103, + "grad_norm": 2.0898170471191406, + "learning_rate": 4.8949186990713165e-05, + "loss": 4.8197, + "step": 15574 + }, + { + "epoch": 0.09262893710153201, + "grad_norm": 1.8452088832855225, + "learning_rate": 4.894905298679169e-05, + "loss": 4.8359, + "step": 15575 + }, + { + "epoch": 0.09263488438481302, + "grad_norm": 2.1573541164398193, + "learning_rate": 4.894891897450984e-05, + "loss": 4.5882, + "step": 15576 + }, + { + "epoch": 0.09264083166809402, + "grad_norm": 2.1609156131744385, + "learning_rate": 4.894878495386768e-05, + "loss": 4.7556, + "step": 15577 + }, + { + "epoch": 0.092646778951375, + "grad_norm": 1.9062503576278687, + "learning_rate": 4.894865092486524e-05, + "loss": 4.6933, + "step": 15578 + }, + { + "epoch": 0.09265272623465601, + "grad_norm": 1.8876394033432007, + "learning_rate": 4.894851688750257e-05, + "loss": 4.7317, + "step": 15579 + }, + { + "epoch": 0.09265867351793701, + "grad_norm": 1.9106816053390503, + "learning_rate": 4.894838284177972e-05, + "loss": 4.7597, + "step": 15580 + }, + { + "epoch": 0.092664620801218, + "grad_norm": 1.8116264343261719, + "learning_rate": 4.894824878769674e-05, + "loss": 4.8865, + "step": 15581 + }, + { + "epoch": 0.092670568084499, + "grad_norm": 1.8492180109024048, + "learning_rate": 4.894811472525368e-05, + "loss": 4.7282, + "step": 15582 + }, + { + "epoch": 0.09267651536778, + "grad_norm": 1.9450536966323853, + "learning_rate": 4.894798065445058e-05, + "loss": 5.0777, + "step": 15583 + }, + { + "epoch": 0.09268246265106099, + "grad_norm": 2.2099180221557617, + "learning_rate": 4.894784657528748e-05, + "loss": 5.421, + "step": 15584 + }, + { + "epoch": 0.09268840993434199, + "grad_norm": 2.2239253520965576, + "learning_rate": 4.8947712487764436e-05, + "loss": 5.8346, + "step": 15585 + }, + { + "epoch": 0.092694357217623, + "grad_norm": 1.7867511510849, + "learning_rate": 4.894757839188149e-05, + "loss": 5.9306, + "step": 15586 + }, + { + "epoch": 0.09270030450090398, + "grad_norm": 1.6986007690429688, + "learning_rate": 4.89474442876387e-05, + "loss": 5.0704, + "step": 15587 + }, + { + "epoch": 0.09270625178418498, + "grad_norm": 1.7906185388565063, + "learning_rate": 4.89473101750361e-05, + "loss": 5.1951, + "step": 15588 + }, + { + "epoch": 0.09271219906746599, + "grad_norm": 1.7287026643753052, + "learning_rate": 4.894717605407374e-05, + "loss": 5.1736, + "step": 15589 + }, + { + "epoch": 0.09271814635074697, + "grad_norm": 1.6170624494552612, + "learning_rate": 4.8947041924751665e-05, + "loss": 5.5399, + "step": 15590 + }, + { + "epoch": 0.09272409363402798, + "grad_norm": 1.7556488513946533, + "learning_rate": 4.894690778706994e-05, + "loss": 5.574, + "step": 15591 + }, + { + "epoch": 0.09273004091730898, + "grad_norm": 2.346484899520874, + "learning_rate": 4.894677364102859e-05, + "loss": 5.0062, + "step": 15592 + }, + { + "epoch": 0.09273598820058997, + "grad_norm": 2.1376540660858154, + "learning_rate": 4.894663948662766e-05, + "loss": 5.1377, + "step": 15593 + }, + { + "epoch": 0.09274193548387097, + "grad_norm": 2.2489631175994873, + "learning_rate": 4.894650532386721e-05, + "loss": 5.1058, + "step": 15594 + }, + { + "epoch": 0.09274788276715197, + "grad_norm": 1.984281063079834, + "learning_rate": 4.8946371152747285e-05, + "loss": 5.1223, + "step": 15595 + }, + { + "epoch": 0.09275383005043296, + "grad_norm": 1.9387162923812866, + "learning_rate": 4.8946236973267935e-05, + "loss": 5.5121, + "step": 15596 + }, + { + "epoch": 0.09275977733371396, + "grad_norm": 1.8052873611450195, + "learning_rate": 4.894610278542919e-05, + "loss": 5.2101, + "step": 15597 + }, + { + "epoch": 0.09276572461699496, + "grad_norm": 2.558525562286377, + "learning_rate": 4.894596858923111e-05, + "loss": 4.6659, + "step": 15598 + }, + { + "epoch": 0.09277167190027595, + "grad_norm": 1.700897455215454, + "learning_rate": 4.8945834384673746e-05, + "loss": 5.4634, + "step": 15599 + }, + { + "epoch": 0.09277761918355695, + "grad_norm": 1.4691836833953857, + "learning_rate": 4.8945700171757134e-05, + "loss": 5.3873, + "step": 15600 + }, + { + "epoch": 0.09278356646683796, + "grad_norm": 1.4673740863800049, + "learning_rate": 4.894556595048132e-05, + "loss": 5.3917, + "step": 15601 + }, + { + "epoch": 0.09278951375011894, + "grad_norm": 1.6252011060714722, + "learning_rate": 4.894543172084637e-05, + "loss": 5.2003, + "step": 15602 + }, + { + "epoch": 0.09279546103339995, + "grad_norm": 1.6320288181304932, + "learning_rate": 4.89452974828523e-05, + "loss": 5.4821, + "step": 15603 + }, + { + "epoch": 0.09280140831668093, + "grad_norm": 2.1444239616394043, + "learning_rate": 4.8945163236499194e-05, + "loss": 5.9926, + "step": 15604 + }, + { + "epoch": 0.09280735559996194, + "grad_norm": 2.3000271320343018, + "learning_rate": 4.894502898178707e-05, + "loss": 4.7545, + "step": 15605 + }, + { + "epoch": 0.09281330288324294, + "grad_norm": 2.259962797164917, + "learning_rate": 4.894489471871597e-05, + "loss": 5.1292, + "step": 15606 + }, + { + "epoch": 0.09281925016652393, + "grad_norm": 2.5522921085357666, + "learning_rate": 4.8944760447285977e-05, + "loss": 5.1226, + "step": 15607 + }, + { + "epoch": 0.09282519744980493, + "grad_norm": 1.7621963024139404, + "learning_rate": 4.8944626167497096e-05, + "loss": 5.5405, + "step": 15608 + }, + { + "epoch": 0.09283114473308593, + "grad_norm": 1.6631364822387695, + "learning_rate": 4.894449187934941e-05, + "loss": 5.4332, + "step": 15609 + }, + { + "epoch": 0.09283709201636692, + "grad_norm": 1.695904016494751, + "learning_rate": 4.894435758284294e-05, + "loss": 5.4989, + "step": 15610 + }, + { + "epoch": 0.09284303929964792, + "grad_norm": 2.0772507190704346, + "learning_rate": 4.894422327797774e-05, + "loss": 5.0412, + "step": 15611 + }, + { + "epoch": 0.09284898658292892, + "grad_norm": 1.959685206413269, + "learning_rate": 4.894408896475386e-05, + "loss": 5.2749, + "step": 15612 + }, + { + "epoch": 0.09285493386620991, + "grad_norm": 2.0305607318878174, + "learning_rate": 4.894395464317135e-05, + "loss": 5.6227, + "step": 15613 + }, + { + "epoch": 0.09286088114949091, + "grad_norm": 1.7631112337112427, + "learning_rate": 4.894382031323026e-05, + "loss": 5.4396, + "step": 15614 + }, + { + "epoch": 0.09286682843277191, + "grad_norm": 1.8171305656433105, + "learning_rate": 4.894368597493062e-05, + "loss": 5.2498, + "step": 15615 + }, + { + "epoch": 0.0928727757160529, + "grad_norm": 2.123805522918701, + "learning_rate": 4.894355162827249e-05, + "loss": 5.8113, + "step": 15616 + }, + { + "epoch": 0.0928787229993339, + "grad_norm": 1.840071201324463, + "learning_rate": 4.894341727325591e-05, + "loss": 5.6394, + "step": 15617 + }, + { + "epoch": 0.0928846702826149, + "grad_norm": 1.7636733055114746, + "learning_rate": 4.8943282909880935e-05, + "loss": 5.5515, + "step": 15618 + }, + { + "epoch": 0.0928906175658959, + "grad_norm": 1.956026315689087, + "learning_rate": 4.89431485381476e-05, + "loss": 5.1716, + "step": 15619 + }, + { + "epoch": 0.0928965648491769, + "grad_norm": 2.2381720542907715, + "learning_rate": 4.894301415805597e-05, + "loss": 4.9692, + "step": 15620 + }, + { + "epoch": 0.0929025121324579, + "grad_norm": 2.178999423980713, + "learning_rate": 4.894287976960607e-05, + "loss": 4.9732, + "step": 15621 + }, + { + "epoch": 0.09290845941573889, + "grad_norm": 2.1932144165039062, + "learning_rate": 4.894274537279796e-05, + "loss": 4.9497, + "step": 15622 + }, + { + "epoch": 0.09291440669901989, + "grad_norm": 2.093252182006836, + "learning_rate": 4.894261096763169e-05, + "loss": 4.7642, + "step": 15623 + }, + { + "epoch": 0.09292035398230089, + "grad_norm": 1.785686731338501, + "learning_rate": 4.89424765541073e-05, + "loss": 5.1449, + "step": 15624 + }, + { + "epoch": 0.09292630126558188, + "grad_norm": 2.250986099243164, + "learning_rate": 4.894234213222484e-05, + "loss": 4.8503, + "step": 15625 + }, + { + "epoch": 0.09293224854886288, + "grad_norm": 1.8585362434387207, + "learning_rate": 4.8942207701984355e-05, + "loss": 4.582, + "step": 15626 + }, + { + "epoch": 0.09293819583214388, + "grad_norm": 2.080742597579956, + "learning_rate": 4.894207326338589e-05, + "loss": 4.4912, + "step": 15627 + }, + { + "epoch": 0.09294414311542487, + "grad_norm": 2.422774076461792, + "learning_rate": 4.8941938816429495e-05, + "loss": 4.4227, + "step": 15628 + }, + { + "epoch": 0.09295009039870587, + "grad_norm": 2.3304965496063232, + "learning_rate": 4.8941804361115215e-05, + "loss": 4.2265, + "step": 15629 + }, + { + "epoch": 0.09295603768198687, + "grad_norm": 2.619837522506714, + "learning_rate": 4.8941669897443105e-05, + "loss": 4.6812, + "step": 15630 + }, + { + "epoch": 0.09296198496526786, + "grad_norm": 2.4924118518829346, + "learning_rate": 4.89415354254132e-05, + "loss": 4.5081, + "step": 15631 + }, + { + "epoch": 0.09296793224854887, + "grad_norm": 2.5034751892089844, + "learning_rate": 4.894140094502556e-05, + "loss": 4.3356, + "step": 15632 + }, + { + "epoch": 0.09297387953182985, + "grad_norm": 2.599963665008545, + "learning_rate": 4.894126645628021e-05, + "loss": 4.6952, + "step": 15633 + }, + { + "epoch": 0.09297982681511086, + "grad_norm": 2.189516544342041, + "learning_rate": 4.894113195917722e-05, + "loss": 5.75, + "step": 15634 + }, + { + "epoch": 0.09298577409839186, + "grad_norm": 2.5768351554870605, + "learning_rate": 4.894099745371663e-05, + "loss": 5.9257, + "step": 15635 + }, + { + "epoch": 0.09299172138167285, + "grad_norm": 2.2909457683563232, + "learning_rate": 4.894086293989848e-05, + "loss": 5.484, + "step": 15636 + }, + { + "epoch": 0.09299766866495385, + "grad_norm": 2.0447487831115723, + "learning_rate": 4.894072841772282e-05, + "loss": 5.2952, + "step": 15637 + }, + { + "epoch": 0.09300361594823485, + "grad_norm": 1.8934963941574097, + "learning_rate": 4.894059388718971e-05, + "loss": 5.3498, + "step": 15638 + }, + { + "epoch": 0.09300956323151584, + "grad_norm": 1.9989632368087769, + "learning_rate": 4.894045934829919e-05, + "loss": 5.55, + "step": 15639 + }, + { + "epoch": 0.09301551051479684, + "grad_norm": 1.4955580234527588, + "learning_rate": 4.8940324801051285e-05, + "loss": 5.1978, + "step": 15640 + }, + { + "epoch": 0.09302145779807784, + "grad_norm": 1.8308879137039185, + "learning_rate": 4.8940190245446074e-05, + "loss": 5.5448, + "step": 15641 + }, + { + "epoch": 0.09302740508135883, + "grad_norm": 1.4997726678848267, + "learning_rate": 4.8940055681483576e-05, + "loss": 5.353, + "step": 15642 + }, + { + "epoch": 0.09303335236463983, + "grad_norm": 1.5643866062164307, + "learning_rate": 4.8939921109163864e-05, + "loss": 5.1456, + "step": 15643 + }, + { + "epoch": 0.09303929964792083, + "grad_norm": 1.8125799894332886, + "learning_rate": 4.8939786528486967e-05, + "loss": 5.3456, + "step": 15644 + }, + { + "epoch": 0.09304524693120182, + "grad_norm": 1.6802864074707031, + "learning_rate": 4.893965193945294e-05, + "loss": 5.279, + "step": 15645 + }, + { + "epoch": 0.09305119421448282, + "grad_norm": 1.4397536516189575, + "learning_rate": 4.893951734206182e-05, + "loss": 5.9849, + "step": 15646 + }, + { + "epoch": 0.09305714149776383, + "grad_norm": 1.618416428565979, + "learning_rate": 4.893938273631368e-05, + "loss": 5.231, + "step": 15647 + }, + { + "epoch": 0.09306308878104481, + "grad_norm": 1.4833893775939941, + "learning_rate": 4.8939248122208537e-05, + "loss": 5.2883, + "step": 15648 + }, + { + "epoch": 0.09306903606432582, + "grad_norm": 1.2709630727767944, + "learning_rate": 4.8939113499746446e-05, + "loss": 5.1042, + "step": 15649 + }, + { + "epoch": 0.09307498334760682, + "grad_norm": 1.2770884037017822, + "learning_rate": 4.893897886892747e-05, + "loss": 5.0682, + "step": 15650 + }, + { + "epoch": 0.0930809306308878, + "grad_norm": 1.4511629343032837, + "learning_rate": 4.893884422975163e-05, + "loss": 5.0904, + "step": 15651 + }, + { + "epoch": 0.09308687791416881, + "grad_norm": 1.7428641319274902, + "learning_rate": 4.8938709582219e-05, + "loss": 5.2569, + "step": 15652 + }, + { + "epoch": 0.09309282519744981, + "grad_norm": 1.5430729389190674, + "learning_rate": 4.89385749263296e-05, + "loss": 5.1698, + "step": 15653 + }, + { + "epoch": 0.0930987724807308, + "grad_norm": 1.6689143180847168, + "learning_rate": 4.8938440262083495e-05, + "loss": 5.1866, + "step": 15654 + }, + { + "epoch": 0.0931047197640118, + "grad_norm": 1.505698323249817, + "learning_rate": 4.8938305589480734e-05, + "loss": 5.1574, + "step": 15655 + }, + { + "epoch": 0.0931106670472928, + "grad_norm": 1.496547818183899, + "learning_rate": 4.8938170908521356e-05, + "loss": 5.1175, + "step": 15656 + }, + { + "epoch": 0.09311661433057379, + "grad_norm": 1.5257115364074707, + "learning_rate": 4.893803621920541e-05, + "loss": 5.1796, + "step": 15657 + }, + { + "epoch": 0.09312256161385479, + "grad_norm": 1.5880948305130005, + "learning_rate": 4.893790152153294e-05, + "loss": 5.1864, + "step": 15658 + }, + { + "epoch": 0.0931285088971358, + "grad_norm": 1.632869839668274, + "learning_rate": 4.8937766815503994e-05, + "loss": 5.1126, + "step": 15659 + }, + { + "epoch": 0.09313445618041678, + "grad_norm": 1.5902632474899292, + "learning_rate": 4.893763210111862e-05, + "loss": 5.0661, + "step": 15660 + }, + { + "epoch": 0.09314040346369779, + "grad_norm": 1.2780532836914062, + "learning_rate": 4.893749737837687e-05, + "loss": 5.2189, + "step": 15661 + }, + { + "epoch": 0.09314635074697877, + "grad_norm": 1.604551076889038, + "learning_rate": 4.8937362647278786e-05, + "loss": 5.4624, + "step": 15662 + }, + { + "epoch": 0.09315229803025978, + "grad_norm": 1.3654263019561768, + "learning_rate": 4.8937227907824424e-05, + "loss": 5.3875, + "step": 15663 + }, + { + "epoch": 0.09315824531354078, + "grad_norm": 1.3098255395889282, + "learning_rate": 4.893709316001381e-05, + "loss": 5.2158, + "step": 15664 + }, + { + "epoch": 0.09316419259682177, + "grad_norm": 1.4036632776260376, + "learning_rate": 4.893695840384701e-05, + "loss": 5.3808, + "step": 15665 + }, + { + "epoch": 0.09317013988010277, + "grad_norm": 1.772504210472107, + "learning_rate": 4.893682363932407e-05, + "loss": 5.4599, + "step": 15666 + }, + { + "epoch": 0.09317608716338377, + "grad_norm": 1.8509577512741089, + "learning_rate": 4.893668886644503e-05, + "loss": 5.223, + "step": 15667 + }, + { + "epoch": 0.09318203444666476, + "grad_norm": 1.7572264671325684, + "learning_rate": 4.893655408520993e-05, + "loss": 5.3276, + "step": 15668 + }, + { + "epoch": 0.09318798172994576, + "grad_norm": 1.7149637937545776, + "learning_rate": 4.8936419295618835e-05, + "loss": 5.3093, + "step": 15669 + }, + { + "epoch": 0.09319392901322676, + "grad_norm": 1.441741943359375, + "learning_rate": 4.893628449767178e-05, + "loss": 5.2237, + "step": 15670 + }, + { + "epoch": 0.09319987629650775, + "grad_norm": 1.4929050207138062, + "learning_rate": 4.893614969136882e-05, + "loss": 5.22, + "step": 15671 + }, + { + "epoch": 0.09320582357978875, + "grad_norm": 1.251057505607605, + "learning_rate": 4.893601487670999e-05, + "loss": 5.2417, + "step": 15672 + }, + { + "epoch": 0.09321177086306975, + "grad_norm": 1.313826560974121, + "learning_rate": 4.893588005369535e-05, + "loss": 5.1841, + "step": 15673 + }, + { + "epoch": 0.09321771814635074, + "grad_norm": 1.1993061304092407, + "learning_rate": 4.8935745222324935e-05, + "loss": 5.1649, + "step": 15674 + }, + { + "epoch": 0.09322366542963174, + "grad_norm": 1.4086672067642212, + "learning_rate": 4.8935610382598806e-05, + "loss": 5.1463, + "step": 15675 + }, + { + "epoch": 0.09322961271291275, + "grad_norm": 1.3089197874069214, + "learning_rate": 4.893547553451701e-05, + "loss": 5.1505, + "step": 15676 + }, + { + "epoch": 0.09323555999619373, + "grad_norm": 1.3332446813583374, + "learning_rate": 4.893534067807957e-05, + "loss": 5.1267, + "step": 15677 + }, + { + "epoch": 0.09324150727947474, + "grad_norm": 1.433020830154419, + "learning_rate": 4.893520581328656e-05, + "loss": 5.1689, + "step": 15678 + }, + { + "epoch": 0.09324745456275574, + "grad_norm": 1.4111361503601074, + "learning_rate": 4.893507094013801e-05, + "loss": 5.1288, + "step": 15679 + }, + { + "epoch": 0.09325340184603673, + "grad_norm": 1.551698923110962, + "learning_rate": 4.893493605863398e-05, + "loss": 5.0919, + "step": 15680 + }, + { + "epoch": 0.09325934912931773, + "grad_norm": 1.5479143857955933, + "learning_rate": 4.893480116877451e-05, + "loss": 4.9749, + "step": 15681 + }, + { + "epoch": 0.09326529641259873, + "grad_norm": 1.3716951608657837, + "learning_rate": 4.893466627055964e-05, + "loss": 5.2221, + "step": 15682 + }, + { + "epoch": 0.09327124369587972, + "grad_norm": 1.409462571144104, + "learning_rate": 4.893453136398943e-05, + "loss": 5.2131, + "step": 15683 + }, + { + "epoch": 0.09327719097916072, + "grad_norm": 1.3185720443725586, + "learning_rate": 4.8934396449063935e-05, + "loss": 5.094, + "step": 15684 + }, + { + "epoch": 0.09328313826244172, + "grad_norm": 1.5027118921279907, + "learning_rate": 4.8934261525783176e-05, + "loss": 5.0889, + "step": 15685 + }, + { + "epoch": 0.09328908554572271, + "grad_norm": 2.147268772125244, + "learning_rate": 4.8934126594147216e-05, + "loss": 4.9404, + "step": 15686 + }, + { + "epoch": 0.09329503282900371, + "grad_norm": 1.3361799716949463, + "learning_rate": 4.8933991654156096e-05, + "loss": 5.0744, + "step": 15687 + }, + { + "epoch": 0.09330098011228471, + "grad_norm": 1.6436421871185303, + "learning_rate": 4.893385670580988e-05, + "loss": 5.0633, + "step": 15688 + }, + { + "epoch": 0.0933069273955657, + "grad_norm": 1.5499234199523926, + "learning_rate": 4.8933721749108586e-05, + "loss": 4.8445, + "step": 15689 + }, + { + "epoch": 0.0933128746788467, + "grad_norm": 1.363355278968811, + "learning_rate": 4.893358678405229e-05, + "loss": 5.1135, + "step": 15690 + }, + { + "epoch": 0.0933188219621277, + "grad_norm": 1.4172797203063965, + "learning_rate": 4.893345181064102e-05, + "loss": 5.056, + "step": 15691 + }, + { + "epoch": 0.0933247692454087, + "grad_norm": 1.546329140663147, + "learning_rate": 4.893331682887483e-05, + "loss": 4.9756, + "step": 15692 + }, + { + "epoch": 0.0933307165286897, + "grad_norm": 1.5151170492172241, + "learning_rate": 4.893318183875376e-05, + "loss": 4.991, + "step": 15693 + }, + { + "epoch": 0.09333666381197069, + "grad_norm": 1.1936514377593994, + "learning_rate": 4.893304684027787e-05, + "loss": 5.0454, + "step": 15694 + }, + { + "epoch": 0.09334261109525169, + "grad_norm": 1.4055380821228027, + "learning_rate": 4.893291183344721e-05, + "loss": 5.0673, + "step": 15695 + }, + { + "epoch": 0.09334855837853269, + "grad_norm": 1.4087036848068237, + "learning_rate": 4.89327768182618e-05, + "loss": 4.9748, + "step": 15696 + }, + { + "epoch": 0.09335450566181368, + "grad_norm": 1.251237392425537, + "learning_rate": 4.893264179472171e-05, + "loss": 5.158, + "step": 15697 + }, + { + "epoch": 0.09336045294509468, + "grad_norm": 1.3806357383728027, + "learning_rate": 4.893250676282699e-05, + "loss": 5.2027, + "step": 15698 + }, + { + "epoch": 0.09336640022837568, + "grad_norm": 1.3959203958511353, + "learning_rate": 4.893237172257767e-05, + "loss": 5.1854, + "step": 15699 + }, + { + "epoch": 0.09337234751165667, + "grad_norm": 1.4886810779571533, + "learning_rate": 4.893223667397381e-05, + "loss": 5.2363, + "step": 15700 + }, + { + "epoch": 0.09337829479493767, + "grad_norm": 1.2987968921661377, + "learning_rate": 4.893210161701546e-05, + "loss": 5.2931, + "step": 15701 + }, + { + "epoch": 0.09338424207821867, + "grad_norm": 1.2594645023345947, + "learning_rate": 4.8931966551702644e-05, + "loss": 5.1346, + "step": 15702 + }, + { + "epoch": 0.09339018936149966, + "grad_norm": 1.5101357698440552, + "learning_rate": 4.893183147803544e-05, + "loss": 5.0369, + "step": 15703 + }, + { + "epoch": 0.09339613664478066, + "grad_norm": 1.4388933181762695, + "learning_rate": 4.8931696396013876e-05, + "loss": 5.0427, + "step": 15704 + }, + { + "epoch": 0.09340208392806167, + "grad_norm": 1.2890875339508057, + "learning_rate": 4.8931561305638006e-05, + "loss": 5.1602, + "step": 15705 + }, + { + "epoch": 0.09340803121134265, + "grad_norm": 1.3310670852661133, + "learning_rate": 4.893142620690787e-05, + "loss": 5.4886, + "step": 15706 + }, + { + "epoch": 0.09341397849462366, + "grad_norm": 1.0935169458389282, + "learning_rate": 4.893129109982353e-05, + "loss": 5.4634, + "step": 15707 + }, + { + "epoch": 0.09341992577790466, + "grad_norm": 1.4718440771102905, + "learning_rate": 4.893115598438501e-05, + "loss": 5.4917, + "step": 15708 + }, + { + "epoch": 0.09342587306118565, + "grad_norm": 1.4053934812545776, + "learning_rate": 4.8931020860592384e-05, + "loss": 5.1588, + "step": 15709 + }, + { + "epoch": 0.09343182034446665, + "grad_norm": 1.3130263090133667, + "learning_rate": 4.893088572844568e-05, + "loss": 5.0464, + "step": 15710 + }, + { + "epoch": 0.09343776762774765, + "grad_norm": 1.3342580795288086, + "learning_rate": 4.8930750587944955e-05, + "loss": 5.1464, + "step": 15711 + }, + { + "epoch": 0.09344371491102864, + "grad_norm": 1.3214285373687744, + "learning_rate": 4.893061543909024e-05, + "loss": 5.0867, + "step": 15712 + }, + { + "epoch": 0.09344966219430964, + "grad_norm": 1.2091466188430786, + "learning_rate": 4.893048028188161e-05, + "loss": 5.1403, + "step": 15713 + }, + { + "epoch": 0.09345560947759064, + "grad_norm": 1.421499490737915, + "learning_rate": 4.893034511631909e-05, + "loss": 5.1853, + "step": 15714 + }, + { + "epoch": 0.09346155676087163, + "grad_norm": 1.2093148231506348, + "learning_rate": 4.893020994240273e-05, + "loss": 5.0892, + "step": 15715 + }, + { + "epoch": 0.09346750404415263, + "grad_norm": 1.361080288887024, + "learning_rate": 4.893007476013258e-05, + "loss": 5.0855, + "step": 15716 + }, + { + "epoch": 0.09347345132743363, + "grad_norm": 1.31247079372406, + "learning_rate": 4.89299395695087e-05, + "loss": 5.1667, + "step": 15717 + }, + { + "epoch": 0.09347939861071462, + "grad_norm": 1.4052191972732544, + "learning_rate": 4.892980437053112e-05, + "loss": 4.9256, + "step": 15718 + }, + { + "epoch": 0.09348534589399562, + "grad_norm": 1.409225344657898, + "learning_rate": 4.8929669163199886e-05, + "loss": 4.7722, + "step": 15719 + }, + { + "epoch": 0.09349129317727661, + "grad_norm": 1.54015052318573, + "learning_rate": 4.892953394751505e-05, + "loss": 4.9331, + "step": 15720 + }, + { + "epoch": 0.09349724046055762, + "grad_norm": 1.313596487045288, + "learning_rate": 4.892939872347667e-05, + "loss": 5.0221, + "step": 15721 + }, + { + "epoch": 0.09350318774383862, + "grad_norm": 1.5266852378845215, + "learning_rate": 4.8929263491084785e-05, + "loss": 5.0261, + "step": 15722 + }, + { + "epoch": 0.0935091350271196, + "grad_norm": 1.409408450126648, + "learning_rate": 4.892912825033944e-05, + "loss": 5.1319, + "step": 15723 + }, + { + "epoch": 0.09351508231040061, + "grad_norm": 1.444326639175415, + "learning_rate": 4.892899300124067e-05, + "loss": 5.0043, + "step": 15724 + }, + { + "epoch": 0.09352102959368161, + "grad_norm": 1.6662111282348633, + "learning_rate": 4.8928857743788556e-05, + "loss": 5.22, + "step": 15725 + }, + { + "epoch": 0.0935269768769626, + "grad_norm": 1.5927739143371582, + "learning_rate": 4.8928722477983116e-05, + "loss": 5.1532, + "step": 15726 + }, + { + "epoch": 0.0935329241602436, + "grad_norm": 1.5560848712921143, + "learning_rate": 4.892858720382441e-05, + "loss": 4.8893, + "step": 15727 + }, + { + "epoch": 0.0935388714435246, + "grad_norm": 1.450135588645935, + "learning_rate": 4.892845192131247e-05, + "loss": 4.8116, + "step": 15728 + }, + { + "epoch": 0.09354481872680559, + "grad_norm": 1.3629002571105957, + "learning_rate": 4.892831663044736e-05, + "loss": 4.9439, + "step": 15729 + }, + { + "epoch": 0.09355076601008659, + "grad_norm": 1.5293892621994019, + "learning_rate": 4.892818133122913e-05, + "loss": 5.1726, + "step": 15730 + }, + { + "epoch": 0.0935567132933676, + "grad_norm": 1.193088412284851, + "learning_rate": 4.892804602365781e-05, + "loss": 5.3199, + "step": 15731 + }, + { + "epoch": 0.09356266057664858, + "grad_norm": 1.5575615167617798, + "learning_rate": 4.8927910707733456e-05, + "loss": 5.3426, + "step": 15732 + }, + { + "epoch": 0.09356860785992958, + "grad_norm": 1.4177138805389404, + "learning_rate": 4.892777538345612e-05, + "loss": 5.4028, + "step": 15733 + }, + { + "epoch": 0.09357455514321059, + "grad_norm": 1.4139392375946045, + "learning_rate": 4.892764005082584e-05, + "loss": 5.3854, + "step": 15734 + }, + { + "epoch": 0.09358050242649157, + "grad_norm": 1.5129605531692505, + "learning_rate": 4.892750470984267e-05, + "loss": 5.3614, + "step": 15735 + }, + { + "epoch": 0.09358644970977258, + "grad_norm": 1.23565673828125, + "learning_rate": 4.8927369360506665e-05, + "loss": 5.2379, + "step": 15736 + }, + { + "epoch": 0.09359239699305358, + "grad_norm": 1.4861465692520142, + "learning_rate": 4.892723400281785e-05, + "loss": 5.0968, + "step": 15737 + }, + { + "epoch": 0.09359834427633457, + "grad_norm": 1.4061464071273804, + "learning_rate": 4.892709863677629e-05, + "loss": 5.2947, + "step": 15738 + }, + { + "epoch": 0.09360429155961557, + "grad_norm": 1.2175462245941162, + "learning_rate": 4.892696326238203e-05, + "loss": 5.2828, + "step": 15739 + }, + { + "epoch": 0.09361023884289657, + "grad_norm": 1.398414969444275, + "learning_rate": 4.8926827879635104e-05, + "loss": 5.3281, + "step": 15740 + }, + { + "epoch": 0.09361618612617756, + "grad_norm": 1.438428282737732, + "learning_rate": 4.892669248853558e-05, + "loss": 5.2483, + "step": 15741 + }, + { + "epoch": 0.09362213340945856, + "grad_norm": 1.6579184532165527, + "learning_rate": 4.8926557089083494e-05, + "loss": 5.1275, + "step": 15742 + }, + { + "epoch": 0.09362808069273956, + "grad_norm": 1.2637989521026611, + "learning_rate": 4.892642168127889e-05, + "loss": 5.2276, + "step": 15743 + }, + { + "epoch": 0.09363402797602055, + "grad_norm": 1.383898377418518, + "learning_rate": 4.892628626512182e-05, + "loss": 5.3406, + "step": 15744 + }, + { + "epoch": 0.09363997525930155, + "grad_norm": 1.3794132471084595, + "learning_rate": 4.8926150840612325e-05, + "loss": 5.2309, + "step": 15745 + }, + { + "epoch": 0.09364592254258255, + "grad_norm": 1.3234885931015015, + "learning_rate": 4.8926015407750466e-05, + "loss": 5.3171, + "step": 15746 + }, + { + "epoch": 0.09365186982586354, + "grad_norm": 1.4807502031326294, + "learning_rate": 4.892587996653629e-05, + "loss": 5.3362, + "step": 15747 + }, + { + "epoch": 0.09365781710914454, + "grad_norm": 2.380307912826538, + "learning_rate": 4.892574451696982e-05, + "loss": 5.3103, + "step": 15748 + }, + { + "epoch": 0.09366376439242553, + "grad_norm": 1.5202600955963135, + "learning_rate": 4.892560905905113e-05, + "loss": 5.2225, + "step": 15749 + }, + { + "epoch": 0.09366971167570654, + "grad_norm": 1.34883451461792, + "learning_rate": 4.892547359278025e-05, + "loss": 5.1794, + "step": 15750 + }, + { + "epoch": 0.09367565895898754, + "grad_norm": 1.7073168754577637, + "learning_rate": 4.8925338118157235e-05, + "loss": 5.101, + "step": 15751 + }, + { + "epoch": 0.09368160624226853, + "grad_norm": 1.2718127965927124, + "learning_rate": 4.892520263518214e-05, + "loss": 5.3492, + "step": 15752 + }, + { + "epoch": 0.09368755352554953, + "grad_norm": 1.2247645854949951, + "learning_rate": 4.8925067143854993e-05, + "loss": 5.0841, + "step": 15753 + }, + { + "epoch": 0.09369350080883053, + "grad_norm": 1.4443535804748535, + "learning_rate": 4.892493164417586e-05, + "loss": 5.2866, + "step": 15754 + }, + { + "epoch": 0.09369944809211152, + "grad_norm": 1.2206883430480957, + "learning_rate": 4.8924796136144776e-05, + "loss": 5.116, + "step": 15755 + }, + { + "epoch": 0.09370539537539252, + "grad_norm": 1.4597479104995728, + "learning_rate": 4.89246606197618e-05, + "loss": 5.1501, + "step": 15756 + }, + { + "epoch": 0.09371134265867352, + "grad_norm": 1.4129786491394043, + "learning_rate": 4.892452509502697e-05, + "loss": 5.2618, + "step": 15757 + }, + { + "epoch": 0.09371728994195451, + "grad_norm": 1.382739543914795, + "learning_rate": 4.892438956194033e-05, + "loss": 5.2191, + "step": 15758 + }, + { + "epoch": 0.09372323722523551, + "grad_norm": 1.3665072917938232, + "learning_rate": 4.8924254020501934e-05, + "loss": 4.9739, + "step": 15759 + }, + { + "epoch": 0.09372918450851651, + "grad_norm": 1.3109017610549927, + "learning_rate": 4.892411847071183e-05, + "loss": 5.0648, + "step": 15760 + }, + { + "epoch": 0.0937351317917975, + "grad_norm": 1.5278202295303345, + "learning_rate": 4.892398291257007e-05, + "loss": 5.0215, + "step": 15761 + }, + { + "epoch": 0.0937410790750785, + "grad_norm": 1.4676958322525024, + "learning_rate": 4.8923847346076686e-05, + "loss": 5.442, + "step": 15762 + }, + { + "epoch": 0.0937470263583595, + "grad_norm": 1.4718897342681885, + "learning_rate": 4.892371177123174e-05, + "loss": 5.1484, + "step": 15763 + }, + { + "epoch": 0.0937529736416405, + "grad_norm": 1.2358952760696411, + "learning_rate": 4.8923576188035264e-05, + "loss": 5.3594, + "step": 15764 + }, + { + "epoch": 0.0937589209249215, + "grad_norm": 1.59844172000885, + "learning_rate": 4.8923440596487326e-05, + "loss": 5.221, + "step": 15765 + }, + { + "epoch": 0.0937648682082025, + "grad_norm": 1.4293478727340698, + "learning_rate": 4.892330499658795e-05, + "loss": 5.2211, + "step": 15766 + }, + { + "epoch": 0.09377081549148349, + "grad_norm": 1.167673110961914, + "learning_rate": 4.8923169388337204e-05, + "loss": 5.1274, + "step": 15767 + }, + { + "epoch": 0.09377676277476449, + "grad_norm": 1.4637590646743774, + "learning_rate": 4.892303377173512e-05, + "loss": 5.0781, + "step": 15768 + }, + { + "epoch": 0.09378271005804549, + "grad_norm": 1.383498191833496, + "learning_rate": 4.892289814678176e-05, + "loss": 5.003, + "step": 15769 + }, + { + "epoch": 0.09378865734132648, + "grad_norm": 1.5803290605545044, + "learning_rate": 4.892276251347716e-05, + "loss": 4.9609, + "step": 15770 + }, + { + "epoch": 0.09379460462460748, + "grad_norm": 1.5272483825683594, + "learning_rate": 4.892262687182137e-05, + "loss": 5.074, + "step": 15771 + }, + { + "epoch": 0.09380055190788848, + "grad_norm": 1.377105951309204, + "learning_rate": 4.8922491221814436e-05, + "loss": 5.011, + "step": 15772 + }, + { + "epoch": 0.09380649919116947, + "grad_norm": 1.2150218486785889, + "learning_rate": 4.8922355563456414e-05, + "loss": 5.172, + "step": 15773 + }, + { + "epoch": 0.09381244647445047, + "grad_norm": 1.379515290260315, + "learning_rate": 4.892221989674734e-05, + "loss": 5.229, + "step": 15774 + }, + { + "epoch": 0.09381839375773147, + "grad_norm": 1.5256911516189575, + "learning_rate": 4.892208422168727e-05, + "loss": 5.0163, + "step": 15775 + }, + { + "epoch": 0.09382434104101246, + "grad_norm": 1.645808458328247, + "learning_rate": 4.892194853827624e-05, + "loss": 5.1382, + "step": 15776 + }, + { + "epoch": 0.09383028832429346, + "grad_norm": 1.7437238693237305, + "learning_rate": 4.8921812846514315e-05, + "loss": 4.8078, + "step": 15777 + }, + { + "epoch": 0.09383623560757447, + "grad_norm": 1.384291410446167, + "learning_rate": 4.892167714640152e-05, + "loss": 5.1645, + "step": 15778 + }, + { + "epoch": 0.09384218289085546, + "grad_norm": 1.6412228345870972, + "learning_rate": 4.892154143793792e-05, + "loss": 5.0472, + "step": 15779 + }, + { + "epoch": 0.09384813017413646, + "grad_norm": 1.5364267826080322, + "learning_rate": 4.8921405721123555e-05, + "loss": 5.1357, + "step": 15780 + }, + { + "epoch": 0.09385407745741745, + "grad_norm": 1.4579834938049316, + "learning_rate": 4.892126999595849e-05, + "loss": 5.2047, + "step": 15781 + }, + { + "epoch": 0.09386002474069845, + "grad_norm": 1.4087393283843994, + "learning_rate": 4.8921134262442745e-05, + "loss": 5.3224, + "step": 15782 + }, + { + "epoch": 0.09386597202397945, + "grad_norm": 1.4741411209106445, + "learning_rate": 4.8920998520576376e-05, + "loss": 4.9882, + "step": 15783 + }, + { + "epoch": 0.09387191930726044, + "grad_norm": 1.488578200340271, + "learning_rate": 4.8920862770359434e-05, + "loss": 4.8698, + "step": 15784 + }, + { + "epoch": 0.09387786659054144, + "grad_norm": 1.4695780277252197, + "learning_rate": 4.892072701179197e-05, + "loss": 4.6841, + "step": 15785 + }, + { + "epoch": 0.09388381387382244, + "grad_norm": 1.2468496561050415, + "learning_rate": 4.892059124487402e-05, + "loss": 5.0962, + "step": 15786 + }, + { + "epoch": 0.09388976115710343, + "grad_norm": 1.1099787950515747, + "learning_rate": 4.8920455469605654e-05, + "loss": 5.0883, + "step": 15787 + }, + { + "epoch": 0.09389570844038443, + "grad_norm": 1.3954483270645142, + "learning_rate": 4.892031968598689e-05, + "loss": 4.9554, + "step": 15788 + }, + { + "epoch": 0.09390165572366543, + "grad_norm": 1.3176839351654053, + "learning_rate": 4.892018389401779e-05, + "loss": 5.1638, + "step": 15789 + }, + { + "epoch": 0.09390760300694642, + "grad_norm": 1.2406723499298096, + "learning_rate": 4.892004809369841e-05, + "loss": 5.0569, + "step": 15790 + }, + { + "epoch": 0.09391355029022742, + "grad_norm": 1.395556926727295, + "learning_rate": 4.891991228502878e-05, + "loss": 4.9179, + "step": 15791 + }, + { + "epoch": 0.09391949757350843, + "grad_norm": 1.3977546691894531, + "learning_rate": 4.891977646800896e-05, + "loss": 5.0045, + "step": 15792 + }, + { + "epoch": 0.09392544485678941, + "grad_norm": 1.5089846849441528, + "learning_rate": 4.891964064263899e-05, + "loss": 5.176, + "step": 15793 + }, + { + "epoch": 0.09393139214007042, + "grad_norm": 1.260077953338623, + "learning_rate": 4.891950480891893e-05, + "loss": 5.3789, + "step": 15794 + }, + { + "epoch": 0.09393733942335142, + "grad_norm": 1.3587939739227295, + "learning_rate": 4.891936896684881e-05, + "loss": 5.308, + "step": 15795 + }, + { + "epoch": 0.0939432867066324, + "grad_norm": 1.4004688262939453, + "learning_rate": 4.8919233116428684e-05, + "loss": 5.5232, + "step": 15796 + }, + { + "epoch": 0.09394923398991341, + "grad_norm": 1.3308182954788208, + "learning_rate": 4.89190972576586e-05, + "loss": 5.3944, + "step": 15797 + }, + { + "epoch": 0.09395518127319441, + "grad_norm": 1.3078187704086304, + "learning_rate": 4.891896139053861e-05, + "loss": 5.3146, + "step": 15798 + }, + { + "epoch": 0.0939611285564754, + "grad_norm": 1.3268121480941772, + "learning_rate": 4.891882551506875e-05, + "loss": 5.2966, + "step": 15799 + }, + { + "epoch": 0.0939670758397564, + "grad_norm": 1.424813985824585, + "learning_rate": 4.8918689631249095e-05, + "loss": 5.132, + "step": 15800 + }, + { + "epoch": 0.0939730231230374, + "grad_norm": 1.2917978763580322, + "learning_rate": 4.8918553739079656e-05, + "loss": 5.1889, + "step": 15801 + }, + { + "epoch": 0.09397897040631839, + "grad_norm": 1.377146601676941, + "learning_rate": 4.8918417838560506e-05, + "loss": 5.2749, + "step": 15802 + }, + { + "epoch": 0.09398491768959939, + "grad_norm": 1.2476272583007812, + "learning_rate": 4.891828192969167e-05, + "loss": 5.1367, + "step": 15803 + }, + { + "epoch": 0.0939908649728804, + "grad_norm": 1.423923373222351, + "learning_rate": 4.891814601247322e-05, + "loss": 5.1657, + "step": 15804 + }, + { + "epoch": 0.09399681225616138, + "grad_norm": 1.2762609720230103, + "learning_rate": 4.891801008690518e-05, + "loss": 5.2245, + "step": 15805 + }, + { + "epoch": 0.09400275953944238, + "grad_norm": 1.3098403215408325, + "learning_rate": 4.891787415298763e-05, + "loss": 5.1452, + "step": 15806 + }, + { + "epoch": 0.09400870682272339, + "grad_norm": 1.2892425060272217, + "learning_rate": 4.8917738210720586e-05, + "loss": 5.268, + "step": 15807 + }, + { + "epoch": 0.09401465410600438, + "grad_norm": 1.4667305946350098, + "learning_rate": 4.8917602260104105e-05, + "loss": 5.1666, + "step": 15808 + }, + { + "epoch": 0.09402060138928538, + "grad_norm": 1.289933204650879, + "learning_rate": 4.891746630113824e-05, + "loss": 5.1772, + "step": 15809 + }, + { + "epoch": 0.09402654867256637, + "grad_norm": 2.3923516273498535, + "learning_rate": 4.891733033382303e-05, + "loss": 5.0732, + "step": 15810 + }, + { + "epoch": 0.09403249595584737, + "grad_norm": 1.223607063293457, + "learning_rate": 4.8917194358158534e-05, + "loss": 5.1025, + "step": 15811 + }, + { + "epoch": 0.09403844323912837, + "grad_norm": 1.5959491729736328, + "learning_rate": 4.8917058374144785e-05, + "loss": 5.3244, + "step": 15812 + }, + { + "epoch": 0.09404439052240936, + "grad_norm": 1.2359555959701538, + "learning_rate": 4.8916922381781845e-05, + "loss": 4.8643, + "step": 15813 + }, + { + "epoch": 0.09405033780569036, + "grad_norm": 1.3971196413040161, + "learning_rate": 4.891678638106974e-05, + "loss": 5.0362, + "step": 15814 + }, + { + "epoch": 0.09405628508897136, + "grad_norm": 1.3501266241073608, + "learning_rate": 4.891665037200855e-05, + "loss": 4.8705, + "step": 15815 + }, + { + "epoch": 0.09406223237225235, + "grad_norm": 1.3506006002426147, + "learning_rate": 4.89165143545983e-05, + "loss": 4.9122, + "step": 15816 + }, + { + "epoch": 0.09406817965553335, + "grad_norm": 1.4444037675857544, + "learning_rate": 4.891637832883904e-05, + "loss": 4.8428, + "step": 15817 + }, + { + "epoch": 0.09407412693881435, + "grad_norm": 1.4757333993911743, + "learning_rate": 4.891624229473082e-05, + "loss": 5.1774, + "step": 15818 + }, + { + "epoch": 0.09408007422209534, + "grad_norm": 1.3660651445388794, + "learning_rate": 4.891610625227369e-05, + "loss": 5.2998, + "step": 15819 + }, + { + "epoch": 0.09408602150537634, + "grad_norm": 1.625279426574707, + "learning_rate": 4.891597020146769e-05, + "loss": 5.1365, + "step": 15820 + }, + { + "epoch": 0.09409196878865735, + "grad_norm": 1.5202007293701172, + "learning_rate": 4.891583414231287e-05, + "loss": 5.287, + "step": 15821 + }, + { + "epoch": 0.09409791607193833, + "grad_norm": 1.5217576026916504, + "learning_rate": 4.891569807480928e-05, + "loss": 5.3599, + "step": 15822 + }, + { + "epoch": 0.09410386335521934, + "grad_norm": 1.5446710586547852, + "learning_rate": 4.891556199895696e-05, + "loss": 5.1332, + "step": 15823 + }, + { + "epoch": 0.09410981063850034, + "grad_norm": 1.2877990007400513, + "learning_rate": 4.8915425914755973e-05, + "loss": 5.0756, + "step": 15824 + }, + { + "epoch": 0.09411575792178133, + "grad_norm": 1.3024258613586426, + "learning_rate": 4.891528982220636e-05, + "loss": 5.3293, + "step": 15825 + }, + { + "epoch": 0.09412170520506233, + "grad_norm": 1.3039882183074951, + "learning_rate": 4.8915153721308166e-05, + "loss": 5.1406, + "step": 15826 + }, + { + "epoch": 0.09412765248834333, + "grad_norm": 1.2524348497390747, + "learning_rate": 4.8915017612061435e-05, + "loss": 5.3044, + "step": 15827 + }, + { + "epoch": 0.09413359977162432, + "grad_norm": 1.2522565126419067, + "learning_rate": 4.8914881494466226e-05, + "loss": 5.1776, + "step": 15828 + }, + { + "epoch": 0.09413954705490532, + "grad_norm": 1.3882638216018677, + "learning_rate": 4.8914745368522566e-05, + "loss": 5.2296, + "step": 15829 + }, + { + "epoch": 0.09414549433818632, + "grad_norm": 1.5169535875320435, + "learning_rate": 4.891460923423052e-05, + "loss": 5.2058, + "step": 15830 + }, + { + "epoch": 0.09415144162146731, + "grad_norm": 1.2045719623565674, + "learning_rate": 4.891447309159014e-05, + "loss": 5.256, + "step": 15831 + }, + { + "epoch": 0.09415738890474831, + "grad_norm": 1.4639356136322021, + "learning_rate": 4.891433694060146e-05, + "loss": 5.1781, + "step": 15832 + }, + { + "epoch": 0.09416333618802931, + "grad_norm": 1.498923420906067, + "learning_rate": 4.891420078126453e-05, + "loss": 5.1777, + "step": 15833 + }, + { + "epoch": 0.0941692834713103, + "grad_norm": 1.163977861404419, + "learning_rate": 4.89140646135794e-05, + "loss": 4.9302, + "step": 15834 + }, + { + "epoch": 0.0941752307545913, + "grad_norm": 1.502808690071106, + "learning_rate": 4.8913928437546113e-05, + "loss": 5.1053, + "step": 15835 + }, + { + "epoch": 0.0941811780378723, + "grad_norm": 1.401517391204834, + "learning_rate": 4.891379225316473e-05, + "loss": 5.3156, + "step": 15836 + }, + { + "epoch": 0.0941871253211533, + "grad_norm": 1.328116774559021, + "learning_rate": 4.891365606043528e-05, + "loss": 5.2333, + "step": 15837 + }, + { + "epoch": 0.0941930726044343, + "grad_norm": 1.160243272781372, + "learning_rate": 4.891351985935782e-05, + "loss": 5.2575, + "step": 15838 + }, + { + "epoch": 0.09419901988771529, + "grad_norm": 1.1748963594436646, + "learning_rate": 4.8913383649932404e-05, + "loss": 5.0673, + "step": 15839 + }, + { + "epoch": 0.09420496717099629, + "grad_norm": 1.2916535139083862, + "learning_rate": 4.891324743215907e-05, + "loss": 5.135, + "step": 15840 + }, + { + "epoch": 0.09421091445427729, + "grad_norm": 1.302393913269043, + "learning_rate": 4.8913111206037865e-05, + "loss": 4.9814, + "step": 15841 + }, + { + "epoch": 0.09421686173755828, + "grad_norm": 1.273445963859558, + "learning_rate": 4.891297497156885e-05, + "loss": 4.9163, + "step": 15842 + }, + { + "epoch": 0.09422280902083928, + "grad_norm": 1.444884181022644, + "learning_rate": 4.8912838728752055e-05, + "loss": 4.9316, + "step": 15843 + }, + { + "epoch": 0.09422875630412028, + "grad_norm": 1.411985993385315, + "learning_rate": 4.891270247758753e-05, + "loss": 4.9222, + "step": 15844 + }, + { + "epoch": 0.09423470358740127, + "grad_norm": 1.3697528839111328, + "learning_rate": 4.891256621807533e-05, + "loss": 4.8398, + "step": 15845 + }, + { + "epoch": 0.09424065087068227, + "grad_norm": 1.385298728942871, + "learning_rate": 4.891242995021551e-05, + "loss": 4.8869, + "step": 15846 + }, + { + "epoch": 0.09424659815396327, + "grad_norm": 1.821768879890442, + "learning_rate": 4.8912293674008094e-05, + "loss": 5.178, + "step": 15847 + }, + { + "epoch": 0.09425254543724426, + "grad_norm": 1.8198026418685913, + "learning_rate": 4.891215738945315e-05, + "loss": 5.2892, + "step": 15848 + }, + { + "epoch": 0.09425849272052526, + "grad_norm": 1.4373536109924316, + "learning_rate": 4.891202109655072e-05, + "loss": 5.1203, + "step": 15849 + }, + { + "epoch": 0.09426444000380627, + "grad_norm": 1.2086896896362305, + "learning_rate": 4.8911884795300855e-05, + "loss": 4.8603, + "step": 15850 + }, + { + "epoch": 0.09427038728708725, + "grad_norm": 1.3166700601577759, + "learning_rate": 4.891174848570359e-05, + "loss": 4.917, + "step": 15851 + }, + { + "epoch": 0.09427633457036826, + "grad_norm": 1.5753637552261353, + "learning_rate": 4.891161216775898e-05, + "loss": 5.0197, + "step": 15852 + }, + { + "epoch": 0.09428228185364926, + "grad_norm": 1.5428698062896729, + "learning_rate": 4.891147584146708e-05, + "loss": 5.2048, + "step": 15853 + }, + { + "epoch": 0.09428822913693025, + "grad_norm": 1.3760755062103271, + "learning_rate": 4.8911339506827924e-05, + "loss": 5.2568, + "step": 15854 + }, + { + "epoch": 0.09429417642021125, + "grad_norm": 1.6683621406555176, + "learning_rate": 4.891120316384157e-05, + "loss": 4.8976, + "step": 15855 + }, + { + "epoch": 0.09430012370349225, + "grad_norm": 1.4224987030029297, + "learning_rate": 4.891106681250807e-05, + "loss": 4.9538, + "step": 15856 + }, + { + "epoch": 0.09430607098677324, + "grad_norm": 1.2851178646087646, + "learning_rate": 4.8910930452827454e-05, + "loss": 4.8972, + "step": 15857 + }, + { + "epoch": 0.09431201827005424, + "grad_norm": 1.6412112712860107, + "learning_rate": 4.891079408479978e-05, + "loss": 5.124, + "step": 15858 + }, + { + "epoch": 0.09431796555333524, + "grad_norm": 1.380089282989502, + "learning_rate": 4.891065770842509e-05, + "loss": 5.1155, + "step": 15859 + }, + { + "epoch": 0.09432391283661623, + "grad_norm": 1.3117294311523438, + "learning_rate": 4.891052132370344e-05, + "loss": 5.1968, + "step": 15860 + }, + { + "epoch": 0.09432986011989723, + "grad_norm": 1.5171841382980347, + "learning_rate": 4.891038493063488e-05, + "loss": 5.1029, + "step": 15861 + }, + { + "epoch": 0.09433580740317823, + "grad_norm": 1.4801427125930786, + "learning_rate": 4.8910248529219446e-05, + "loss": 5.1533, + "step": 15862 + }, + { + "epoch": 0.09434175468645922, + "grad_norm": 1.672522783279419, + "learning_rate": 4.8910112119457196e-05, + "loss": 5.3259, + "step": 15863 + }, + { + "epoch": 0.09434770196974022, + "grad_norm": 1.5151952505111694, + "learning_rate": 4.890997570134816e-05, + "loss": 5.2654, + "step": 15864 + }, + { + "epoch": 0.09435364925302123, + "grad_norm": 1.4178684949874878, + "learning_rate": 4.890983927489242e-05, + "loss": 5.2369, + "step": 15865 + }, + { + "epoch": 0.09435959653630221, + "grad_norm": 1.3673019409179688, + "learning_rate": 4.890970284008999e-05, + "loss": 5.2176, + "step": 15866 + }, + { + "epoch": 0.09436554381958322, + "grad_norm": 1.4063305854797363, + "learning_rate": 4.8909566396940934e-05, + "loss": 5.1189, + "step": 15867 + }, + { + "epoch": 0.0943714911028642, + "grad_norm": 1.277815818786621, + "learning_rate": 4.890942994544528e-05, + "loss": 5.2204, + "step": 15868 + }, + { + "epoch": 0.09437743838614521, + "grad_norm": 1.5394912958145142, + "learning_rate": 4.890929348560311e-05, + "loss": 5.1147, + "step": 15869 + }, + { + "epoch": 0.09438338566942621, + "grad_norm": 1.4091798067092896, + "learning_rate": 4.890915701741444e-05, + "loss": 5.1367, + "step": 15870 + }, + { + "epoch": 0.0943893329527072, + "grad_norm": 1.367828369140625, + "learning_rate": 4.8909020540879336e-05, + "loss": 5.1871, + "step": 15871 + }, + { + "epoch": 0.0943952802359882, + "grad_norm": 2.2413175106048584, + "learning_rate": 4.890888405599784e-05, + "loss": 5.0571, + "step": 15872 + }, + { + "epoch": 0.0944012275192692, + "grad_norm": 1.392906904220581, + "learning_rate": 4.8908747562769995e-05, + "loss": 4.9885, + "step": 15873 + }, + { + "epoch": 0.09440717480255019, + "grad_norm": 1.4517099857330322, + "learning_rate": 4.8908611061195865e-05, + "loss": 5.1596, + "step": 15874 + }, + { + "epoch": 0.09441312208583119, + "grad_norm": 1.663919448852539, + "learning_rate": 4.890847455127547e-05, + "loss": 5.0029, + "step": 15875 + }, + { + "epoch": 0.0944190693691122, + "grad_norm": 1.5252666473388672, + "learning_rate": 4.8908338033008885e-05, + "loss": 4.9596, + "step": 15876 + }, + { + "epoch": 0.09442501665239318, + "grad_norm": 1.613261103630066, + "learning_rate": 4.8908201506396143e-05, + "loss": 4.91, + "step": 15877 + }, + { + "epoch": 0.09443096393567418, + "grad_norm": 1.5182253122329712, + "learning_rate": 4.8908064971437295e-05, + "loss": 5.0564, + "step": 15878 + }, + { + "epoch": 0.09443691121895519, + "grad_norm": 1.4765241146087646, + "learning_rate": 4.8907928428132386e-05, + "loss": 5.0863, + "step": 15879 + }, + { + "epoch": 0.09444285850223617, + "grad_norm": 1.6401035785675049, + "learning_rate": 4.890779187648147e-05, + "loss": 4.9876, + "step": 15880 + }, + { + "epoch": 0.09444880578551718, + "grad_norm": 1.4818077087402344, + "learning_rate": 4.8907655316484594e-05, + "loss": 4.9361, + "step": 15881 + }, + { + "epoch": 0.09445475306879818, + "grad_norm": 1.4490398168563843, + "learning_rate": 4.89075187481418e-05, + "loss": 4.8991, + "step": 15882 + }, + { + "epoch": 0.09446070035207917, + "grad_norm": 1.2799785137176514, + "learning_rate": 4.890738217145313e-05, + "loss": 5.0147, + "step": 15883 + }, + { + "epoch": 0.09446664763536017, + "grad_norm": 1.416590929031372, + "learning_rate": 4.890724558641865e-05, + "loss": 5.0255, + "step": 15884 + }, + { + "epoch": 0.09447259491864117, + "grad_norm": 1.4365648031234741, + "learning_rate": 4.8907108993038395e-05, + "loss": 5.0262, + "step": 15885 + }, + { + "epoch": 0.09447854220192216, + "grad_norm": 1.367490530014038, + "learning_rate": 4.890697239131241e-05, + "loss": 4.9478, + "step": 15886 + }, + { + "epoch": 0.09448448948520316, + "grad_norm": 1.3645575046539307, + "learning_rate": 4.8906835781240754e-05, + "loss": 5.0751, + "step": 15887 + }, + { + "epoch": 0.09449043676848416, + "grad_norm": 1.4014960527420044, + "learning_rate": 4.8906699162823464e-05, + "loss": 4.9789, + "step": 15888 + }, + { + "epoch": 0.09449638405176515, + "grad_norm": 1.2261216640472412, + "learning_rate": 4.8906562536060596e-05, + "loss": 4.9619, + "step": 15889 + }, + { + "epoch": 0.09450233133504615, + "grad_norm": 1.3241546154022217, + "learning_rate": 4.890642590095219e-05, + "loss": 4.9947, + "step": 15890 + }, + { + "epoch": 0.09450827861832715, + "grad_norm": 1.337372899055481, + "learning_rate": 4.89062892574983e-05, + "loss": 4.9817, + "step": 15891 + }, + { + "epoch": 0.09451422590160814, + "grad_norm": 1.47610604763031, + "learning_rate": 4.8906152605698974e-05, + "loss": 4.9467, + "step": 15892 + }, + { + "epoch": 0.09452017318488914, + "grad_norm": 1.3533576726913452, + "learning_rate": 4.890601594555425e-05, + "loss": 4.9819, + "step": 15893 + }, + { + "epoch": 0.09452612046817015, + "grad_norm": 1.4445271492004395, + "learning_rate": 4.890587927706419e-05, + "loss": 4.9566, + "step": 15894 + }, + { + "epoch": 0.09453206775145113, + "grad_norm": 1.4600121974945068, + "learning_rate": 4.8905742600228834e-05, + "loss": 4.9341, + "step": 15895 + }, + { + "epoch": 0.09453801503473214, + "grad_norm": 1.2824327945709229, + "learning_rate": 4.8905605915048224e-05, + "loss": 5.0945, + "step": 15896 + }, + { + "epoch": 0.09454396231801313, + "grad_norm": 1.4806164503097534, + "learning_rate": 4.890546922152242e-05, + "loss": 5.1312, + "step": 15897 + }, + { + "epoch": 0.09454990960129413, + "grad_norm": 1.3514155149459839, + "learning_rate": 4.890533251965146e-05, + "loss": 4.9596, + "step": 15898 + }, + { + "epoch": 0.09455585688457513, + "grad_norm": 1.332749843597412, + "learning_rate": 4.89051958094354e-05, + "loss": 5.0649, + "step": 15899 + }, + { + "epoch": 0.09456180416785612, + "grad_norm": 1.310562014579773, + "learning_rate": 4.8905059090874284e-05, + "loss": 5.0977, + "step": 15900 + }, + { + "epoch": 0.09456775145113712, + "grad_norm": 1.342310905456543, + "learning_rate": 4.8904922363968153e-05, + "loss": 5.115, + "step": 15901 + }, + { + "epoch": 0.09457369873441812, + "grad_norm": 1.4810988903045654, + "learning_rate": 4.890478562871706e-05, + "loss": 5.1305, + "step": 15902 + }, + { + "epoch": 0.09457964601769911, + "grad_norm": 1.3064900636672974, + "learning_rate": 4.890464888512106e-05, + "loss": 5.1387, + "step": 15903 + }, + { + "epoch": 0.09458559330098011, + "grad_norm": 1.4571950435638428, + "learning_rate": 4.890451213318019e-05, + "loss": 5.1235, + "step": 15904 + }, + { + "epoch": 0.09459154058426111, + "grad_norm": 1.3964077234268188, + "learning_rate": 4.89043753728945e-05, + "loss": 5.0854, + "step": 15905 + }, + { + "epoch": 0.0945974878675421, + "grad_norm": 1.4404022693634033, + "learning_rate": 4.8904238604264044e-05, + "loss": 5.0991, + "step": 15906 + }, + { + "epoch": 0.0946034351508231, + "grad_norm": 1.3269283771514893, + "learning_rate": 4.890410182728886e-05, + "loss": 4.9299, + "step": 15907 + }, + { + "epoch": 0.0946093824341041, + "grad_norm": 1.4588782787322998, + "learning_rate": 4.8903965041969e-05, + "loss": 5.0992, + "step": 15908 + }, + { + "epoch": 0.0946153297173851, + "grad_norm": 1.2911858558654785, + "learning_rate": 4.8903828248304525e-05, + "loss": 5.0639, + "step": 15909 + }, + { + "epoch": 0.0946212770006661, + "grad_norm": 1.336695909500122, + "learning_rate": 4.8903691446295466e-05, + "loss": 5.1479, + "step": 15910 + }, + { + "epoch": 0.0946272242839471, + "grad_norm": 1.3052904605865479, + "learning_rate": 4.890355463594186e-05, + "loss": 5.049, + "step": 15911 + }, + { + "epoch": 0.09463317156722809, + "grad_norm": 1.3744491338729858, + "learning_rate": 4.890341781724379e-05, + "loss": 5.0709, + "step": 15912 + }, + { + "epoch": 0.09463911885050909, + "grad_norm": 1.5727102756500244, + "learning_rate": 4.890328099020127e-05, + "loss": 4.9857, + "step": 15913 + }, + { + "epoch": 0.09464506613379009, + "grad_norm": 1.5804322957992554, + "learning_rate": 4.890314415481437e-05, + "loss": 5.133, + "step": 15914 + }, + { + "epoch": 0.09465101341707108, + "grad_norm": 1.228421926498413, + "learning_rate": 4.8903007311083124e-05, + "loss": 4.9561, + "step": 15915 + }, + { + "epoch": 0.09465696070035208, + "grad_norm": 1.4680207967758179, + "learning_rate": 4.890287045900759e-05, + "loss": 5.0502, + "step": 15916 + }, + { + "epoch": 0.09466290798363308, + "grad_norm": 1.3447710275650024, + "learning_rate": 4.89027335985878e-05, + "loss": 5.1255, + "step": 15917 + }, + { + "epoch": 0.09466885526691407, + "grad_norm": 1.3510375022888184, + "learning_rate": 4.8902596729823825e-05, + "loss": 5.0936, + "step": 15918 + }, + { + "epoch": 0.09467480255019507, + "grad_norm": 1.3805617094039917, + "learning_rate": 4.89024598527157e-05, + "loss": 5.1146, + "step": 15919 + }, + { + "epoch": 0.09468074983347607, + "grad_norm": 1.568036437034607, + "learning_rate": 4.890232296726347e-05, + "loss": 5.0032, + "step": 15920 + }, + { + "epoch": 0.09468669711675706, + "grad_norm": 1.6060000658035278, + "learning_rate": 4.890218607346718e-05, + "loss": 5.017, + "step": 15921 + }, + { + "epoch": 0.09469264440003806, + "grad_norm": 1.498241901397705, + "learning_rate": 4.890204917132689e-05, + "loss": 5.1265, + "step": 15922 + }, + { + "epoch": 0.09469859168331907, + "grad_norm": 1.418135643005371, + "learning_rate": 4.8901912260842644e-05, + "loss": 5.1458, + "step": 15923 + }, + { + "epoch": 0.09470453896660005, + "grad_norm": 1.3306639194488525, + "learning_rate": 4.890177534201448e-05, + "loss": 5.1672, + "step": 15924 + }, + { + "epoch": 0.09471048624988106, + "grad_norm": 1.542938470840454, + "learning_rate": 4.890163841484246e-05, + "loss": 5.1511, + "step": 15925 + }, + { + "epoch": 0.09471643353316204, + "grad_norm": 1.3050166368484497, + "learning_rate": 4.890150147932662e-05, + "loss": 5.2615, + "step": 15926 + }, + { + "epoch": 0.09472238081644305, + "grad_norm": 1.3447345495224, + "learning_rate": 4.890136453546702e-05, + "loss": 5.2957, + "step": 15927 + }, + { + "epoch": 0.09472832809972405, + "grad_norm": 1.3270481824874878, + "learning_rate": 4.8901227583263695e-05, + "loss": 5.2751, + "step": 15928 + }, + { + "epoch": 0.09473427538300504, + "grad_norm": 1.3909003734588623, + "learning_rate": 4.890109062271669e-05, + "loss": 5.1162, + "step": 15929 + }, + { + "epoch": 0.09474022266628604, + "grad_norm": 1.4668915271759033, + "learning_rate": 4.890095365382608e-05, + "loss": 5.0313, + "step": 15930 + }, + { + "epoch": 0.09474616994956704, + "grad_norm": 1.2651780843734741, + "learning_rate": 4.890081667659188e-05, + "loss": 5.0576, + "step": 15931 + }, + { + "epoch": 0.09475211723284803, + "grad_norm": 1.5086911916732788, + "learning_rate": 4.8900679691014154e-05, + "loss": 4.9508, + "step": 15932 + }, + { + "epoch": 0.09475806451612903, + "grad_norm": 1.2698594331741333, + "learning_rate": 4.8900542697092956e-05, + "loss": 5.0183, + "step": 15933 + }, + { + "epoch": 0.09476401179941003, + "grad_norm": 2.691392183303833, + "learning_rate": 4.8900405694828313e-05, + "loss": 5.0997, + "step": 15934 + }, + { + "epoch": 0.09476995908269102, + "grad_norm": 1.3395452499389648, + "learning_rate": 4.8900268684220295e-05, + "loss": 5.2219, + "step": 15935 + }, + { + "epoch": 0.09477590636597202, + "grad_norm": 1.3485181331634521, + "learning_rate": 4.8900131665268934e-05, + "loss": 4.9594, + "step": 15936 + }, + { + "epoch": 0.09478185364925303, + "grad_norm": 1.2990431785583496, + "learning_rate": 4.889999463797429e-05, + "loss": 4.9492, + "step": 15937 + }, + { + "epoch": 0.09478780093253401, + "grad_norm": 1.2848893404006958, + "learning_rate": 4.8899857602336396e-05, + "loss": 4.9819, + "step": 15938 + }, + { + "epoch": 0.09479374821581502, + "grad_norm": 1.4666554927825928, + "learning_rate": 4.889972055835531e-05, + "loss": 4.9672, + "step": 15939 + }, + { + "epoch": 0.09479969549909602, + "grad_norm": 1.3356142044067383, + "learning_rate": 4.8899583506031085e-05, + "loss": 5.029, + "step": 15940 + }, + { + "epoch": 0.094805642782377, + "grad_norm": 1.561786413192749, + "learning_rate": 4.8899446445363765e-05, + "loss": 4.9071, + "step": 15941 + }, + { + "epoch": 0.09481159006565801, + "grad_norm": 1.4906450510025024, + "learning_rate": 4.889930937635339e-05, + "loss": 5.0832, + "step": 15942 + }, + { + "epoch": 0.09481753734893901, + "grad_norm": 1.5042341947555542, + "learning_rate": 4.889917229900001e-05, + "loss": 5.1069, + "step": 15943 + }, + { + "epoch": 0.09482348463222, + "grad_norm": 1.6562377214431763, + "learning_rate": 4.889903521330368e-05, + "loss": 5.0532, + "step": 15944 + }, + { + "epoch": 0.094829431915501, + "grad_norm": 1.1881135702133179, + "learning_rate": 4.889889811926445e-05, + "loss": 5.1159, + "step": 15945 + }, + { + "epoch": 0.094835379198782, + "grad_norm": 1.3550158739089966, + "learning_rate": 4.889876101688234e-05, + "loss": 5.0754, + "step": 15946 + }, + { + "epoch": 0.09484132648206299, + "grad_norm": 1.403874158859253, + "learning_rate": 4.8898623906157435e-05, + "loss": 5.405, + "step": 15947 + }, + { + "epoch": 0.09484727376534399, + "grad_norm": 1.4460557699203491, + "learning_rate": 4.889848678708977e-05, + "loss": 5.041, + "step": 15948 + }, + { + "epoch": 0.094853221048625, + "grad_norm": 1.4151064157485962, + "learning_rate": 4.889834965967939e-05, + "loss": 5.368, + "step": 15949 + }, + { + "epoch": 0.09485916833190598, + "grad_norm": 1.3388437032699585, + "learning_rate": 4.889821252392633e-05, + "loss": 5.2905, + "step": 15950 + }, + { + "epoch": 0.09486511561518698, + "grad_norm": 1.1941900253295898, + "learning_rate": 4.8898075379830665e-05, + "loss": 5.1499, + "step": 15951 + }, + { + "epoch": 0.09487106289846799, + "grad_norm": 1.4840821027755737, + "learning_rate": 4.889793822739243e-05, + "loss": 5.0461, + "step": 15952 + }, + { + "epoch": 0.09487701018174897, + "grad_norm": 1.4021552801132202, + "learning_rate": 4.889780106661166e-05, + "loss": 4.89, + "step": 15953 + }, + { + "epoch": 0.09488295746502998, + "grad_norm": 1.4893288612365723, + "learning_rate": 4.889766389748842e-05, + "loss": 4.9719, + "step": 15954 + }, + { + "epoch": 0.09488890474831096, + "grad_norm": 1.4530198574066162, + "learning_rate": 4.889752672002275e-05, + "loss": 5.3931, + "step": 15955 + }, + { + "epoch": 0.09489485203159197, + "grad_norm": 1.468037724494934, + "learning_rate": 4.88973895342147e-05, + "loss": 5.271, + "step": 15956 + }, + { + "epoch": 0.09490079931487297, + "grad_norm": 1.3074537515640259, + "learning_rate": 4.889725234006433e-05, + "loss": 5.202, + "step": 15957 + }, + { + "epoch": 0.09490674659815396, + "grad_norm": 1.3678735494613647, + "learning_rate": 4.889711513757166e-05, + "loss": 5.0821, + "step": 15958 + }, + { + "epoch": 0.09491269388143496, + "grad_norm": 1.3922240734100342, + "learning_rate": 4.889697792673676e-05, + "loss": 4.8938, + "step": 15959 + }, + { + "epoch": 0.09491864116471596, + "grad_norm": 1.3895872831344604, + "learning_rate": 4.8896840707559674e-05, + "loss": 4.8293, + "step": 15960 + }, + { + "epoch": 0.09492458844799695, + "grad_norm": 1.223599910736084, + "learning_rate": 4.889670348004045e-05, + "loss": 4.8528, + "step": 15961 + }, + { + "epoch": 0.09493053573127795, + "grad_norm": 1.4488904476165771, + "learning_rate": 4.889656624417913e-05, + "loss": 5.0107, + "step": 15962 + }, + { + "epoch": 0.09493648301455895, + "grad_norm": 1.5250918865203857, + "learning_rate": 4.889642899997576e-05, + "loss": 4.9114, + "step": 15963 + }, + { + "epoch": 0.09494243029783994, + "grad_norm": 1.4656517505645752, + "learning_rate": 4.88962917474304e-05, + "loss": 5.2163, + "step": 15964 + }, + { + "epoch": 0.09494837758112094, + "grad_norm": 1.316635251045227, + "learning_rate": 4.889615448654309e-05, + "loss": 5.1904, + "step": 15965 + }, + { + "epoch": 0.09495432486440195, + "grad_norm": 1.5920292139053345, + "learning_rate": 4.8896017217313886e-05, + "loss": 5.0858, + "step": 15966 + }, + { + "epoch": 0.09496027214768293, + "grad_norm": 1.5263009071350098, + "learning_rate": 4.889587993974282e-05, + "loss": 5.0594, + "step": 15967 + }, + { + "epoch": 0.09496621943096394, + "grad_norm": 1.4230486154556274, + "learning_rate": 4.889574265382996e-05, + "loss": 5.0712, + "step": 15968 + }, + { + "epoch": 0.09497216671424494, + "grad_norm": 1.9315528869628906, + "learning_rate": 4.889560535957533e-05, + "loss": 4.8489, + "step": 15969 + }, + { + "epoch": 0.09497811399752593, + "grad_norm": 1.3432739973068237, + "learning_rate": 4.8895468056979e-05, + "loss": 4.9722, + "step": 15970 + }, + { + "epoch": 0.09498406128080693, + "grad_norm": 1.191886067390442, + "learning_rate": 4.8895330746041e-05, + "loss": 4.9384, + "step": 15971 + }, + { + "epoch": 0.09499000856408793, + "grad_norm": 1.4204323291778564, + "learning_rate": 4.8895193426761396e-05, + "loss": 5.1063, + "step": 15972 + }, + { + "epoch": 0.09499595584736892, + "grad_norm": 1.319189429283142, + "learning_rate": 4.8895056099140224e-05, + "loss": 5.0643, + "step": 15973 + }, + { + "epoch": 0.09500190313064992, + "grad_norm": 1.2905625104904175, + "learning_rate": 4.8894918763177533e-05, + "loss": 5.0806, + "step": 15974 + }, + { + "epoch": 0.09500785041393092, + "grad_norm": 1.6914581060409546, + "learning_rate": 4.889478141887338e-05, + "loss": 4.9209, + "step": 15975 + }, + { + "epoch": 0.09501379769721191, + "grad_norm": 1.390061378479004, + "learning_rate": 4.8894644066227797e-05, + "loss": 5.1376, + "step": 15976 + }, + { + "epoch": 0.09501974498049291, + "grad_norm": 1.2711600065231323, + "learning_rate": 4.889450670524084e-05, + "loss": 5.2344, + "step": 15977 + }, + { + "epoch": 0.09502569226377391, + "grad_norm": 1.472398042678833, + "learning_rate": 4.889436933591256e-05, + "loss": 5.0605, + "step": 15978 + }, + { + "epoch": 0.0950316395470549, + "grad_norm": 1.483567714691162, + "learning_rate": 4.889423195824301e-05, + "loss": 4.9827, + "step": 15979 + }, + { + "epoch": 0.0950375868303359, + "grad_norm": 1.706921935081482, + "learning_rate": 4.889409457223222e-05, + "loss": 5.0692, + "step": 15980 + }, + { + "epoch": 0.0950435341136169, + "grad_norm": 1.7719398736953735, + "learning_rate": 4.889395717788026e-05, + "loss": 5.0985, + "step": 15981 + }, + { + "epoch": 0.0950494813968979, + "grad_norm": 1.6768114566802979, + "learning_rate": 4.889381977518715e-05, + "loss": 4.8838, + "step": 15982 + }, + { + "epoch": 0.0950554286801789, + "grad_norm": 1.5722233057022095, + "learning_rate": 4.889368236415296e-05, + "loss": 4.824, + "step": 15983 + }, + { + "epoch": 0.09506137596345988, + "grad_norm": 1.5722928047180176, + "learning_rate": 4.889354494477773e-05, + "loss": 5.3027, + "step": 15984 + }, + { + "epoch": 0.09506732324674089, + "grad_norm": 2.0003905296325684, + "learning_rate": 4.8893407517061526e-05, + "loss": 5.2216, + "step": 15985 + }, + { + "epoch": 0.09507327053002189, + "grad_norm": 1.390168309211731, + "learning_rate": 4.889327008100437e-05, + "loss": 5.358, + "step": 15986 + }, + { + "epoch": 0.09507921781330288, + "grad_norm": 1.545292854309082, + "learning_rate": 4.889313263660632e-05, + "loss": 5.5124, + "step": 15987 + }, + { + "epoch": 0.09508516509658388, + "grad_norm": 1.4416158199310303, + "learning_rate": 4.889299518386742e-05, + "loss": 5.0929, + "step": 15988 + }, + { + "epoch": 0.09509111237986488, + "grad_norm": 1.8936892747879028, + "learning_rate": 4.889285772278773e-05, + "loss": 4.9407, + "step": 15989 + }, + { + "epoch": 0.09509705966314587, + "grad_norm": 1.4762251377105713, + "learning_rate": 4.889272025336729e-05, + "loss": 5.05, + "step": 15990 + }, + { + "epoch": 0.09510300694642687, + "grad_norm": 1.4513001441955566, + "learning_rate": 4.8892582775606146e-05, + "loss": 5.2386, + "step": 15991 + }, + { + "epoch": 0.09510895422970787, + "grad_norm": 1.8999260663986206, + "learning_rate": 4.8892445289504345e-05, + "loss": 5.1524, + "step": 15992 + }, + { + "epoch": 0.09511490151298886, + "grad_norm": 1.5721614360809326, + "learning_rate": 4.8892307795061945e-05, + "loss": 5.2276, + "step": 15993 + }, + { + "epoch": 0.09512084879626986, + "grad_norm": 1.754425287246704, + "learning_rate": 4.889217029227898e-05, + "loss": 5.118, + "step": 15994 + }, + { + "epoch": 0.09512679607955087, + "grad_norm": 1.6336870193481445, + "learning_rate": 4.889203278115551e-05, + "loss": 5.2065, + "step": 15995 + }, + { + "epoch": 0.09513274336283185, + "grad_norm": 2.721186876296997, + "learning_rate": 4.889189526169157e-05, + "loss": 5.3698, + "step": 15996 + }, + { + "epoch": 0.09513869064611286, + "grad_norm": 1.3870679140090942, + "learning_rate": 4.889175773388722e-05, + "loss": 5.294, + "step": 15997 + }, + { + "epoch": 0.09514463792939386, + "grad_norm": 1.4010889530181885, + "learning_rate": 4.889162019774252e-05, + "loss": 5.2313, + "step": 15998 + }, + { + "epoch": 0.09515058521267485, + "grad_norm": 1.6322177648544312, + "learning_rate": 4.889148265325748e-05, + "loss": 5.2871, + "step": 15999 + }, + { + "epoch": 0.09515653249595585, + "grad_norm": 1.5373196601867676, + "learning_rate": 4.889134510043218e-05, + "loss": 5.4748, + "step": 16000 + }, + { + "epoch": 0.09516247977923685, + "grad_norm": 1.572461724281311, + "learning_rate": 4.889120753926666e-05, + "loss": 5.3634, + "step": 16001 + }, + { + "epoch": 0.09516842706251784, + "grad_norm": 1.3587132692337036, + "learning_rate": 4.889106996976096e-05, + "loss": 5.1399, + "step": 16002 + }, + { + "epoch": 0.09517437434579884, + "grad_norm": 1.1270248889923096, + "learning_rate": 4.889093239191514e-05, + "loss": 5.1845, + "step": 16003 + }, + { + "epoch": 0.09518032162907984, + "grad_norm": 1.5456722974777222, + "learning_rate": 4.889079480572924e-05, + "loss": 5.4895, + "step": 16004 + }, + { + "epoch": 0.09518626891236083, + "grad_norm": 1.2772669792175293, + "learning_rate": 4.8890657211203307e-05, + "loss": 5.5415, + "step": 16005 + }, + { + "epoch": 0.09519221619564183, + "grad_norm": 1.5249123573303223, + "learning_rate": 4.88905196083374e-05, + "loss": 5.2731, + "step": 16006 + }, + { + "epoch": 0.09519816347892283, + "grad_norm": 1.137450098991394, + "learning_rate": 4.889038199713155e-05, + "loss": 5.2232, + "step": 16007 + }, + { + "epoch": 0.09520411076220382, + "grad_norm": 1.4076485633850098, + "learning_rate": 4.889024437758582e-05, + "loss": 5.3428, + "step": 16008 + }, + { + "epoch": 0.09521005804548482, + "grad_norm": 1.3883590698242188, + "learning_rate": 4.889010674970026e-05, + "loss": 5.328, + "step": 16009 + }, + { + "epoch": 0.09521600532876583, + "grad_norm": 1.4320605993270874, + "learning_rate": 4.88899691134749e-05, + "loss": 5.1469, + "step": 16010 + }, + { + "epoch": 0.09522195261204681, + "grad_norm": 1.5601880550384521, + "learning_rate": 4.8889831468909795e-05, + "loss": 5.1063, + "step": 16011 + }, + { + "epoch": 0.09522789989532782, + "grad_norm": 1.4243980646133423, + "learning_rate": 4.8889693816005014e-05, + "loss": 5.067, + "step": 16012 + }, + { + "epoch": 0.0952338471786088, + "grad_norm": 1.3901020288467407, + "learning_rate": 4.8889556154760577e-05, + "loss": 4.9954, + "step": 16013 + }, + { + "epoch": 0.0952397944618898, + "grad_norm": 1.2067557573318481, + "learning_rate": 4.8889418485176544e-05, + "loss": 5.5485, + "step": 16014 + }, + { + "epoch": 0.09524574174517081, + "grad_norm": 1.6004818677902222, + "learning_rate": 4.888928080725296e-05, + "loss": 5.0334, + "step": 16015 + }, + { + "epoch": 0.0952516890284518, + "grad_norm": 1.42451810836792, + "learning_rate": 4.8889143120989864e-05, + "loss": 4.9913, + "step": 16016 + }, + { + "epoch": 0.0952576363117328, + "grad_norm": 1.528438925743103, + "learning_rate": 4.888900542638734e-05, + "loss": 4.9749, + "step": 16017 + }, + { + "epoch": 0.0952635835950138, + "grad_norm": 1.2179231643676758, + "learning_rate": 4.888886772344539e-05, + "loss": 5.0631, + "step": 16018 + }, + { + "epoch": 0.09526953087829479, + "grad_norm": 1.5069763660430908, + "learning_rate": 4.8888730012164085e-05, + "loss": 5.0739, + "step": 16019 + }, + { + "epoch": 0.09527547816157579, + "grad_norm": 1.3587465286254883, + "learning_rate": 4.888859229254348e-05, + "loss": 5.0924, + "step": 16020 + }, + { + "epoch": 0.0952814254448568, + "grad_norm": 1.412811517715454, + "learning_rate": 4.888845456458361e-05, + "loss": 5.0228, + "step": 16021 + }, + { + "epoch": 0.09528737272813778, + "grad_norm": 1.5316507816314697, + "learning_rate": 4.888831682828453e-05, + "loss": 4.9514, + "step": 16022 + }, + { + "epoch": 0.09529332001141878, + "grad_norm": 1.4402068853378296, + "learning_rate": 4.888817908364628e-05, + "loss": 4.9404, + "step": 16023 + }, + { + "epoch": 0.09529926729469979, + "grad_norm": 1.353027582168579, + "learning_rate": 4.888804133066892e-05, + "loss": 5.0359, + "step": 16024 + }, + { + "epoch": 0.09530521457798077, + "grad_norm": 1.4211509227752686, + "learning_rate": 4.8887903569352486e-05, + "loss": 5.2472, + "step": 16025 + }, + { + "epoch": 0.09531116186126178, + "grad_norm": 1.3640077114105225, + "learning_rate": 4.888776579969704e-05, + "loss": 5.4126, + "step": 16026 + }, + { + "epoch": 0.09531710914454278, + "grad_norm": 1.5627541542053223, + "learning_rate": 4.8887628021702616e-05, + "loss": 5.1019, + "step": 16027 + }, + { + "epoch": 0.09532305642782377, + "grad_norm": 1.788611650466919, + "learning_rate": 4.888749023536927e-05, + "loss": 4.9395, + "step": 16028 + }, + { + "epoch": 0.09532900371110477, + "grad_norm": 1.3194786310195923, + "learning_rate": 4.8887352440697044e-05, + "loss": 4.9888, + "step": 16029 + }, + { + "epoch": 0.09533495099438577, + "grad_norm": 1.3091423511505127, + "learning_rate": 4.888721463768598e-05, + "loss": 5.1328, + "step": 16030 + }, + { + "epoch": 0.09534089827766676, + "grad_norm": 1.2864805459976196, + "learning_rate": 4.8887076826336154e-05, + "loss": 5.2569, + "step": 16031 + }, + { + "epoch": 0.09534684556094776, + "grad_norm": 1.3800050020217896, + "learning_rate": 4.888693900664759e-05, + "loss": 5.0698, + "step": 16032 + }, + { + "epoch": 0.09535279284422876, + "grad_norm": 1.2338416576385498, + "learning_rate": 4.8886801178620347e-05, + "loss": 5.227, + "step": 16033 + }, + { + "epoch": 0.09535874012750975, + "grad_norm": 1.4023356437683105, + "learning_rate": 4.888666334225446e-05, + "loss": 5.2976, + "step": 16034 + }, + { + "epoch": 0.09536468741079075, + "grad_norm": 1.4695215225219727, + "learning_rate": 4.8886525497549994e-05, + "loss": 5.1062, + "step": 16035 + }, + { + "epoch": 0.09537063469407175, + "grad_norm": 1.3647410869598389, + "learning_rate": 4.888638764450698e-05, + "loss": 5.2613, + "step": 16036 + }, + { + "epoch": 0.09537658197735274, + "grad_norm": 1.3059413433074951, + "learning_rate": 4.8886249783125484e-05, + "loss": 5.1593, + "step": 16037 + }, + { + "epoch": 0.09538252926063374, + "grad_norm": 1.3861093521118164, + "learning_rate": 4.8886111913405544e-05, + "loss": 4.9149, + "step": 16038 + }, + { + "epoch": 0.09538847654391475, + "grad_norm": 1.4214578866958618, + "learning_rate": 4.88859740353472e-05, + "loss": 5.0443, + "step": 16039 + }, + { + "epoch": 0.09539442382719573, + "grad_norm": 1.3835242986679077, + "learning_rate": 4.888583614895052e-05, + "loss": 4.9516, + "step": 16040 + }, + { + "epoch": 0.09540037111047674, + "grad_norm": 1.47120201587677, + "learning_rate": 4.8885698254215526e-05, + "loss": 4.9673, + "step": 16041 + }, + { + "epoch": 0.09540631839375772, + "grad_norm": 1.4861125946044922, + "learning_rate": 4.8885560351142295e-05, + "loss": 4.8283, + "step": 16042 + }, + { + "epoch": 0.09541226567703873, + "grad_norm": 1.2469282150268555, + "learning_rate": 4.888542243973086e-05, + "loss": 5.164, + "step": 16043 + }, + { + "epoch": 0.09541821296031973, + "grad_norm": 1.2372372150421143, + "learning_rate": 4.888528451998127e-05, + "loss": 5.2986, + "step": 16044 + }, + { + "epoch": 0.09542416024360072, + "grad_norm": 1.370978593826294, + "learning_rate": 4.888514659189357e-05, + "loss": 5.2353, + "step": 16045 + }, + { + "epoch": 0.09543010752688172, + "grad_norm": 1.4328222274780273, + "learning_rate": 4.888500865546781e-05, + "loss": 5.3482, + "step": 16046 + }, + { + "epoch": 0.09543605481016272, + "grad_norm": 1.2651796340942383, + "learning_rate": 4.888487071070405e-05, + "loss": 5.3276, + "step": 16047 + }, + { + "epoch": 0.09544200209344371, + "grad_norm": 1.34639310836792, + "learning_rate": 4.8884732757602325e-05, + "loss": 5.108, + "step": 16048 + }, + { + "epoch": 0.09544794937672471, + "grad_norm": 1.2254658937454224, + "learning_rate": 4.888459479616269e-05, + "loss": 5.1569, + "step": 16049 + }, + { + "epoch": 0.09545389666000571, + "grad_norm": 1.2902439832687378, + "learning_rate": 4.888445682638518e-05, + "loss": 5.2215, + "step": 16050 + }, + { + "epoch": 0.0954598439432867, + "grad_norm": 1.572160243988037, + "learning_rate": 4.888431884826986e-05, + "loss": 5.1288, + "step": 16051 + }, + { + "epoch": 0.0954657912265677, + "grad_norm": 1.266427993774414, + "learning_rate": 4.888418086181676e-05, + "loss": 5.231, + "step": 16052 + }, + { + "epoch": 0.0954717385098487, + "grad_norm": 1.2186620235443115, + "learning_rate": 4.888404286702595e-05, + "loss": 5.113, + "step": 16053 + }, + { + "epoch": 0.0954776857931297, + "grad_norm": 1.386727213859558, + "learning_rate": 4.888390486389747e-05, + "loss": 5.0559, + "step": 16054 + }, + { + "epoch": 0.0954836330764107, + "grad_norm": 1.3253827095031738, + "learning_rate": 4.8883766852431354e-05, + "loss": 5.2569, + "step": 16055 + }, + { + "epoch": 0.0954895803596917, + "grad_norm": 1.219800591468811, + "learning_rate": 4.888362883262767e-05, + "loss": 5.0805, + "step": 16056 + }, + { + "epoch": 0.09549552764297269, + "grad_norm": 1.2425061464309692, + "learning_rate": 4.888349080448646e-05, + "loss": 5.1447, + "step": 16057 + }, + { + "epoch": 0.09550147492625369, + "grad_norm": 2.619645833969116, + "learning_rate": 4.888335276800777e-05, + "loss": 5.2419, + "step": 16058 + }, + { + "epoch": 0.09550742220953469, + "grad_norm": 1.3087180852890015, + "learning_rate": 4.888321472319164e-05, + "loss": 5.1895, + "step": 16059 + }, + { + "epoch": 0.09551336949281568, + "grad_norm": 1.1865695714950562, + "learning_rate": 4.888307667003813e-05, + "loss": 5.1791, + "step": 16060 + }, + { + "epoch": 0.09551931677609668, + "grad_norm": 1.2647303342819214, + "learning_rate": 4.8882938608547294e-05, + "loss": 5.1928, + "step": 16061 + }, + { + "epoch": 0.09552526405937768, + "grad_norm": 1.2161632776260376, + "learning_rate": 4.888280053871916e-05, + "loss": 5.1431, + "step": 16062 + }, + { + "epoch": 0.09553121134265867, + "grad_norm": 1.3904309272766113, + "learning_rate": 4.8882662460553784e-05, + "loss": 5.0658, + "step": 16063 + }, + { + "epoch": 0.09553715862593967, + "grad_norm": 1.4302258491516113, + "learning_rate": 4.888252437405123e-05, + "loss": 5.1838, + "step": 16064 + }, + { + "epoch": 0.09554310590922067, + "grad_norm": 1.4313236474990845, + "learning_rate": 4.888238627921152e-05, + "loss": 5.2108, + "step": 16065 + }, + { + "epoch": 0.09554905319250166, + "grad_norm": 1.485170602798462, + "learning_rate": 4.8882248176034726e-05, + "loss": 5.179, + "step": 16066 + }, + { + "epoch": 0.09555500047578266, + "grad_norm": 1.3742952346801758, + "learning_rate": 4.888211006452088e-05, + "loss": 5.0416, + "step": 16067 + }, + { + "epoch": 0.09556094775906367, + "grad_norm": 1.2600523233413696, + "learning_rate": 4.888197194467005e-05, + "loss": 5.0891, + "step": 16068 + }, + { + "epoch": 0.09556689504234465, + "grad_norm": 1.2905696630477905, + "learning_rate": 4.888183381648225e-05, + "loss": 5.1004, + "step": 16069 + }, + { + "epoch": 0.09557284232562566, + "grad_norm": 1.2373219728469849, + "learning_rate": 4.8881695679957565e-05, + "loss": 5.1549, + "step": 16070 + }, + { + "epoch": 0.09557878960890664, + "grad_norm": 1.43118155002594, + "learning_rate": 4.8881557535096014e-05, + "loss": 5.067, + "step": 16071 + }, + { + "epoch": 0.09558473689218765, + "grad_norm": 1.201025366783142, + "learning_rate": 4.888141938189767e-05, + "loss": 5.1304, + "step": 16072 + }, + { + "epoch": 0.09559068417546865, + "grad_norm": 1.3497222661972046, + "learning_rate": 4.888128122036256e-05, + "loss": 5.0802, + "step": 16073 + }, + { + "epoch": 0.09559663145874964, + "grad_norm": 1.3429580926895142, + "learning_rate": 4.888114305049074e-05, + "loss": 5.1033, + "step": 16074 + }, + { + "epoch": 0.09560257874203064, + "grad_norm": 1.212725281715393, + "learning_rate": 4.888100487228227e-05, + "loss": 5.0627, + "step": 16075 + }, + { + "epoch": 0.09560852602531164, + "grad_norm": 1.258507490158081, + "learning_rate": 4.8880866685737174e-05, + "loss": 5.1215, + "step": 16076 + }, + { + "epoch": 0.09561447330859263, + "grad_norm": 1.4401910305023193, + "learning_rate": 4.888072849085552e-05, + "loss": 4.9619, + "step": 16077 + }, + { + "epoch": 0.09562042059187363, + "grad_norm": 1.240682601928711, + "learning_rate": 4.888059028763735e-05, + "loss": 4.8384, + "step": 16078 + }, + { + "epoch": 0.09562636787515463, + "grad_norm": 1.5701509714126587, + "learning_rate": 4.888045207608272e-05, + "loss": 5.0756, + "step": 16079 + }, + { + "epoch": 0.09563231515843562, + "grad_norm": 2.0408403873443604, + "learning_rate": 4.888031385619166e-05, + "loss": 5.1615, + "step": 16080 + }, + { + "epoch": 0.09563826244171662, + "grad_norm": 1.8134169578552246, + "learning_rate": 4.8880175627964245e-05, + "loss": 5.2383, + "step": 16081 + }, + { + "epoch": 0.09564420972499763, + "grad_norm": 1.4934067726135254, + "learning_rate": 4.888003739140049e-05, + "loss": 5.1512, + "step": 16082 + }, + { + "epoch": 0.09565015700827861, + "grad_norm": 1.6359374523162842, + "learning_rate": 4.887989914650047e-05, + "loss": 5.1245, + "step": 16083 + }, + { + "epoch": 0.09565610429155962, + "grad_norm": 1.5446397066116333, + "learning_rate": 4.887976089326422e-05, + "loss": 4.9806, + "step": 16084 + }, + { + "epoch": 0.09566205157484062, + "grad_norm": 1.845180869102478, + "learning_rate": 4.8879622631691794e-05, + "loss": 5.0474, + "step": 16085 + }, + { + "epoch": 0.0956679988581216, + "grad_norm": 1.8755276203155518, + "learning_rate": 4.887948436178324e-05, + "loss": 5.0674, + "step": 16086 + }, + { + "epoch": 0.09567394614140261, + "grad_norm": 1.5596239566802979, + "learning_rate": 4.88793460835386e-05, + "loss": 5.0699, + "step": 16087 + }, + { + "epoch": 0.09567989342468361, + "grad_norm": 1.6092095375061035, + "learning_rate": 4.8879207796957935e-05, + "loss": 5.1184, + "step": 16088 + }, + { + "epoch": 0.0956858407079646, + "grad_norm": 1.6217916011810303, + "learning_rate": 4.887906950204127e-05, + "loss": 4.9607, + "step": 16089 + }, + { + "epoch": 0.0956917879912456, + "grad_norm": 1.5006567239761353, + "learning_rate": 4.8878931198788694e-05, + "loss": 4.7948, + "step": 16090 + }, + { + "epoch": 0.0956977352745266, + "grad_norm": 1.397647738456726, + "learning_rate": 4.887879288720021e-05, + "loss": 5.1067, + "step": 16091 + }, + { + "epoch": 0.09570368255780759, + "grad_norm": 1.5627835988998413, + "learning_rate": 4.8878654567275886e-05, + "loss": 4.9138, + "step": 16092 + }, + { + "epoch": 0.09570962984108859, + "grad_norm": 1.4590591192245483, + "learning_rate": 4.8878516239015784e-05, + "loss": 4.9132, + "step": 16093 + }, + { + "epoch": 0.0957155771243696, + "grad_norm": 1.347569465637207, + "learning_rate": 4.887837790241992e-05, + "loss": 4.9732, + "step": 16094 + }, + { + "epoch": 0.09572152440765058, + "grad_norm": 1.547169804573059, + "learning_rate": 4.887823955748838e-05, + "loss": 5.1336, + "step": 16095 + }, + { + "epoch": 0.09572747169093158, + "grad_norm": 1.3920515775680542, + "learning_rate": 4.887810120422118e-05, + "loss": 5.0738, + "step": 16096 + }, + { + "epoch": 0.09573341897421259, + "grad_norm": 1.4531773328781128, + "learning_rate": 4.8877962842618386e-05, + "loss": 5.0517, + "step": 16097 + }, + { + "epoch": 0.09573936625749357, + "grad_norm": 1.458679437637329, + "learning_rate": 4.887782447268004e-05, + "loss": 4.9291, + "step": 16098 + }, + { + "epoch": 0.09574531354077458, + "grad_norm": 1.6293518543243408, + "learning_rate": 4.8877686094406196e-05, + "loss": 4.7676, + "step": 16099 + }, + { + "epoch": 0.09575126082405556, + "grad_norm": 1.6756728887557983, + "learning_rate": 4.8877547707796895e-05, + "loss": 4.7426, + "step": 16100 + }, + { + "epoch": 0.09575720810733657, + "grad_norm": 1.7573354244232178, + "learning_rate": 4.8877409312852194e-05, + "loss": 4.6344, + "step": 16101 + }, + { + "epoch": 0.09576315539061757, + "grad_norm": 1.701581597328186, + "learning_rate": 4.8877270909572126e-05, + "loss": 4.8023, + "step": 16102 + }, + { + "epoch": 0.09576910267389856, + "grad_norm": 1.4811267852783203, + "learning_rate": 4.887713249795676e-05, + "loss": 4.9964, + "step": 16103 + }, + { + "epoch": 0.09577504995717956, + "grad_norm": 1.4324437379837036, + "learning_rate": 4.887699407800612e-05, + "loss": 4.9657, + "step": 16104 + }, + { + "epoch": 0.09578099724046056, + "grad_norm": 1.6630572080612183, + "learning_rate": 4.8876855649720285e-05, + "loss": 4.8689, + "step": 16105 + }, + { + "epoch": 0.09578694452374155, + "grad_norm": 1.8548660278320312, + "learning_rate": 4.887671721309928e-05, + "loss": 4.8775, + "step": 16106 + }, + { + "epoch": 0.09579289180702255, + "grad_norm": 1.5234023332595825, + "learning_rate": 4.887657876814316e-05, + "loss": 5.1495, + "step": 16107 + }, + { + "epoch": 0.09579883909030355, + "grad_norm": 1.5281673669815063, + "learning_rate": 4.8876440314851967e-05, + "loss": 4.8887, + "step": 16108 + }, + { + "epoch": 0.09580478637358454, + "grad_norm": 1.6189017295837402, + "learning_rate": 4.887630185322576e-05, + "loss": 4.7103, + "step": 16109 + }, + { + "epoch": 0.09581073365686554, + "grad_norm": 1.8149834871292114, + "learning_rate": 4.8876163383264584e-05, + "loss": 4.5674, + "step": 16110 + }, + { + "epoch": 0.09581668094014655, + "grad_norm": 1.6370511054992676, + "learning_rate": 4.887602490496848e-05, + "loss": 4.6307, + "step": 16111 + }, + { + "epoch": 0.09582262822342753, + "grad_norm": 1.603553056716919, + "learning_rate": 4.887588641833751e-05, + "loss": 4.597, + "step": 16112 + }, + { + "epoch": 0.09582857550670854, + "grad_norm": 1.6511812210083008, + "learning_rate": 4.887574792337171e-05, + "loss": 4.604, + "step": 16113 + }, + { + "epoch": 0.09583452278998954, + "grad_norm": 1.6924868822097778, + "learning_rate": 4.887560942007113e-05, + "loss": 4.6674, + "step": 16114 + }, + { + "epoch": 0.09584047007327053, + "grad_norm": 1.6445999145507812, + "learning_rate": 4.887547090843583e-05, + "loss": 4.492, + "step": 16115 + }, + { + "epoch": 0.09584641735655153, + "grad_norm": 2.282087564468384, + "learning_rate": 4.887533238846585e-05, + "loss": 5.7458, + "step": 16116 + }, + { + "epoch": 0.09585236463983253, + "grad_norm": 1.8790422677993774, + "learning_rate": 4.887519386016123e-05, + "loss": 5.6642, + "step": 16117 + }, + { + "epoch": 0.09585831192311352, + "grad_norm": 1.887954592704773, + "learning_rate": 4.887505532352203e-05, + "loss": 5.8485, + "step": 16118 + }, + { + "epoch": 0.09586425920639452, + "grad_norm": 1.8805441856384277, + "learning_rate": 4.88749167785483e-05, + "loss": 5.5941, + "step": 16119 + }, + { + "epoch": 0.09587020648967552, + "grad_norm": 2.141098976135254, + "learning_rate": 4.8874778225240076e-05, + "loss": 5.1748, + "step": 16120 + }, + { + "epoch": 0.09587615377295651, + "grad_norm": 1.560094952583313, + "learning_rate": 4.887463966359741e-05, + "loss": 5.625, + "step": 16121 + }, + { + "epoch": 0.09588210105623751, + "grad_norm": 1.6463109254837036, + "learning_rate": 4.887450109362036e-05, + "loss": 5.6568, + "step": 16122 + }, + { + "epoch": 0.09588804833951851, + "grad_norm": 1.5389329195022583, + "learning_rate": 4.887436251530898e-05, + "loss": 5.6461, + "step": 16123 + }, + { + "epoch": 0.0958939956227995, + "grad_norm": 1.4973753690719604, + "learning_rate": 4.8874223928663284e-05, + "loss": 5.3542, + "step": 16124 + }, + { + "epoch": 0.0958999429060805, + "grad_norm": 1.4039745330810547, + "learning_rate": 4.8874085333683364e-05, + "loss": 5.506, + "step": 16125 + }, + { + "epoch": 0.0959058901893615, + "grad_norm": 1.819114089012146, + "learning_rate": 4.8873946730369235e-05, + "loss": 5.2586, + "step": 16126 + }, + { + "epoch": 0.0959118374726425, + "grad_norm": 1.9034372568130493, + "learning_rate": 4.887380811872095e-05, + "loss": 5.1818, + "step": 16127 + }, + { + "epoch": 0.0959177847559235, + "grad_norm": 1.8390016555786133, + "learning_rate": 4.8873669498738584e-05, + "loss": 5.8263, + "step": 16128 + }, + { + "epoch": 0.09592373203920448, + "grad_norm": 1.780961275100708, + "learning_rate": 4.887353087042216e-05, + "loss": 5.801, + "step": 16129 + }, + { + "epoch": 0.09592967932248549, + "grad_norm": 1.8105396032333374, + "learning_rate": 4.887339223377173e-05, + "loss": 5.3426, + "step": 16130 + }, + { + "epoch": 0.09593562660576649, + "grad_norm": 1.9126670360565186, + "learning_rate": 4.887325358878735e-05, + "loss": 5.404, + "step": 16131 + }, + { + "epoch": 0.09594157388904748, + "grad_norm": 1.4767181873321533, + "learning_rate": 4.887311493546906e-05, + "loss": 5.5631, + "step": 16132 + }, + { + "epoch": 0.09594752117232848, + "grad_norm": 1.4779311418533325, + "learning_rate": 4.8872976273816904e-05, + "loss": 5.6407, + "step": 16133 + }, + { + "epoch": 0.09595346845560948, + "grad_norm": 1.9026421308517456, + "learning_rate": 4.8872837603830955e-05, + "loss": 5.4299, + "step": 16134 + }, + { + "epoch": 0.09595941573889047, + "grad_norm": 1.845184326171875, + "learning_rate": 4.887269892551123e-05, + "loss": 5.4873, + "step": 16135 + }, + { + "epoch": 0.09596536302217147, + "grad_norm": 2.49023175239563, + "learning_rate": 4.88725602388578e-05, + "loss": 4.1458, + "step": 16136 + }, + { + "epoch": 0.09597131030545247, + "grad_norm": 2.0831515789031982, + "learning_rate": 4.887242154387071e-05, + "loss": 5.0316, + "step": 16137 + }, + { + "epoch": 0.09597725758873346, + "grad_norm": 1.6316094398498535, + "learning_rate": 4.887228284055e-05, + "loss": 5.1289, + "step": 16138 + }, + { + "epoch": 0.09598320487201446, + "grad_norm": 2.025193214416504, + "learning_rate": 4.8872144128895724e-05, + "loss": 5.3065, + "step": 16139 + }, + { + "epoch": 0.09598915215529547, + "grad_norm": 2.077871322631836, + "learning_rate": 4.887200540890793e-05, + "loss": 5.1163, + "step": 16140 + }, + { + "epoch": 0.09599509943857645, + "grad_norm": 1.8450415134429932, + "learning_rate": 4.8871866680586666e-05, + "loss": 5.2638, + "step": 16141 + }, + { + "epoch": 0.09600104672185746, + "grad_norm": 1.676255464553833, + "learning_rate": 4.8871727943931974e-05, + "loss": 4.8191, + "step": 16142 + }, + { + "epoch": 0.09600699400513846, + "grad_norm": 1.6484187841415405, + "learning_rate": 4.8871589198943914e-05, + "loss": 5.3993, + "step": 16143 + }, + { + "epoch": 0.09601294128841945, + "grad_norm": 1.7061866521835327, + "learning_rate": 4.887145044562253e-05, + "loss": 5.2941, + "step": 16144 + }, + { + "epoch": 0.09601888857170045, + "grad_norm": 1.7628071308135986, + "learning_rate": 4.887131168396786e-05, + "loss": 5.2736, + "step": 16145 + }, + { + "epoch": 0.09602483585498145, + "grad_norm": 2.0107390880584717, + "learning_rate": 4.887117291397997e-05, + "loss": 5.1561, + "step": 16146 + }, + { + "epoch": 0.09603078313826244, + "grad_norm": 1.7889841794967651, + "learning_rate": 4.887103413565889e-05, + "loss": 6.0519, + "step": 16147 + }, + { + "epoch": 0.09603673042154344, + "grad_norm": 1.7982914447784424, + "learning_rate": 4.8870895349004686e-05, + "loss": 5.4913, + "step": 16148 + }, + { + "epoch": 0.09604267770482444, + "grad_norm": 1.8263020515441895, + "learning_rate": 4.88707565540174e-05, + "loss": 5.8516, + "step": 16149 + }, + { + "epoch": 0.09604862498810543, + "grad_norm": 1.642863392829895, + "learning_rate": 4.887061775069708e-05, + "loss": 5.5714, + "step": 16150 + }, + { + "epoch": 0.09605457227138643, + "grad_norm": 1.5696642398834229, + "learning_rate": 4.887047893904377e-05, + "loss": 5.4624, + "step": 16151 + }, + { + "epoch": 0.09606051955466743, + "grad_norm": 1.8895677328109741, + "learning_rate": 4.8870340119057536e-05, + "loss": 5.621, + "step": 16152 + }, + { + "epoch": 0.09606646683794842, + "grad_norm": 1.772875428199768, + "learning_rate": 4.8870201290738395e-05, + "loss": 5.5371, + "step": 16153 + }, + { + "epoch": 0.09607241412122942, + "grad_norm": 1.6763731241226196, + "learning_rate": 4.8870062454086415e-05, + "loss": 5.966, + "step": 16154 + }, + { + "epoch": 0.09607836140451043, + "grad_norm": 1.5911294221878052, + "learning_rate": 4.886992360910165e-05, + "loss": 5.3707, + "step": 16155 + }, + { + "epoch": 0.09608430868779141, + "grad_norm": 1.7060188055038452, + "learning_rate": 4.886978475578414e-05, + "loss": 5.5278, + "step": 16156 + }, + { + "epoch": 0.09609025597107242, + "grad_norm": 1.6456331014633179, + "learning_rate": 4.886964589413394e-05, + "loss": 5.5132, + "step": 16157 + }, + { + "epoch": 0.0960962032543534, + "grad_norm": 1.6736609935760498, + "learning_rate": 4.886950702415109e-05, + "loss": 5.245, + "step": 16158 + }, + { + "epoch": 0.0961021505376344, + "grad_norm": 1.5359262228012085, + "learning_rate": 4.886936814583564e-05, + "loss": 5.3893, + "step": 16159 + }, + { + "epoch": 0.09610809782091541, + "grad_norm": 1.5430463552474976, + "learning_rate": 4.886922925918763e-05, + "loss": 5.4257, + "step": 16160 + }, + { + "epoch": 0.0961140451041964, + "grad_norm": 1.940909743309021, + "learning_rate": 4.886909036420714e-05, + "loss": 5.0744, + "step": 16161 + }, + { + "epoch": 0.0961199923874774, + "grad_norm": 1.869372844696045, + "learning_rate": 4.886895146089418e-05, + "loss": 5.4901, + "step": 16162 + }, + { + "epoch": 0.0961259396707584, + "grad_norm": 1.794975996017456, + "learning_rate": 4.886881254924882e-05, + "loss": 5.5174, + "step": 16163 + }, + { + "epoch": 0.09613188695403939, + "grad_norm": 1.6314165592193604, + "learning_rate": 4.8868673629271105e-05, + "loss": 5.5883, + "step": 16164 + }, + { + "epoch": 0.09613783423732039, + "grad_norm": 1.7309901714324951, + "learning_rate": 4.886853470096108e-05, + "loss": 5.3881, + "step": 16165 + }, + { + "epoch": 0.09614378152060139, + "grad_norm": 1.7356623411178589, + "learning_rate": 4.88683957643188e-05, + "loss": 5.3578, + "step": 16166 + }, + { + "epoch": 0.09614972880388238, + "grad_norm": 2.302006244659424, + "learning_rate": 4.886825681934431e-05, + "loss": 5.7811, + "step": 16167 + }, + { + "epoch": 0.09615567608716338, + "grad_norm": 2.282381534576416, + "learning_rate": 4.8868117866037656e-05, + "loss": 5.8847, + "step": 16168 + }, + { + "epoch": 0.09616162337044439, + "grad_norm": 1.9158310890197754, + "learning_rate": 4.886797890439889e-05, + "loss": 5.7663, + "step": 16169 + }, + { + "epoch": 0.09616757065372537, + "grad_norm": 1.6491609811782837, + "learning_rate": 4.886783993442806e-05, + "loss": 5.9077, + "step": 16170 + }, + { + "epoch": 0.09617351793700638, + "grad_norm": 1.739547848701477, + "learning_rate": 4.886770095612521e-05, + "loss": 5.5126, + "step": 16171 + }, + { + "epoch": 0.09617946522028738, + "grad_norm": 1.534516453742981, + "learning_rate": 4.88675619694904e-05, + "loss": 5.372, + "step": 16172 + }, + { + "epoch": 0.09618541250356837, + "grad_norm": 1.8228504657745361, + "learning_rate": 4.8867422974523657e-05, + "loss": 5.4673, + "step": 16173 + }, + { + "epoch": 0.09619135978684937, + "grad_norm": 1.8887168169021606, + "learning_rate": 4.886728397122505e-05, + "loss": 5.5699, + "step": 16174 + }, + { + "epoch": 0.09619730707013037, + "grad_norm": 1.6889835596084595, + "learning_rate": 4.8867144959594626e-05, + "loss": 5.6244, + "step": 16175 + }, + { + "epoch": 0.09620325435341136, + "grad_norm": 1.7387192249298096, + "learning_rate": 4.8867005939632424e-05, + "loss": 5.7735, + "step": 16176 + }, + { + "epoch": 0.09620920163669236, + "grad_norm": 1.9036939144134521, + "learning_rate": 4.8866866911338494e-05, + "loss": 5.8873, + "step": 16177 + }, + { + "epoch": 0.09621514891997336, + "grad_norm": 1.6884106397628784, + "learning_rate": 4.886672787471289e-05, + "loss": 5.1366, + "step": 16178 + }, + { + "epoch": 0.09622109620325435, + "grad_norm": 1.5132830142974854, + "learning_rate": 4.886658882975566e-05, + "loss": 5.2964, + "step": 16179 + }, + { + "epoch": 0.09622704348653535, + "grad_norm": 1.7039000988006592, + "learning_rate": 4.886644977646685e-05, + "loss": 5.2287, + "step": 16180 + }, + { + "epoch": 0.09623299076981635, + "grad_norm": 1.6894882917404175, + "learning_rate": 4.886631071484651e-05, + "loss": 5.3205, + "step": 16181 + }, + { + "epoch": 0.09623893805309734, + "grad_norm": 2.303013324737549, + "learning_rate": 4.8866171644894684e-05, + "loss": 5.2701, + "step": 16182 + }, + { + "epoch": 0.09624488533637834, + "grad_norm": 1.6158491373062134, + "learning_rate": 4.886603256661142e-05, + "loss": 5.522, + "step": 16183 + }, + { + "epoch": 0.09625083261965935, + "grad_norm": 1.5886715650558472, + "learning_rate": 4.8865893479996776e-05, + "loss": 5.7498, + "step": 16184 + }, + { + "epoch": 0.09625677990294033, + "grad_norm": 2.007570505142212, + "learning_rate": 4.88657543850508e-05, + "loss": 5.3746, + "step": 16185 + }, + { + "epoch": 0.09626272718622134, + "grad_norm": 2.8191232681274414, + "learning_rate": 4.886561528177352e-05, + "loss": 4.9794, + "step": 16186 + }, + { + "epoch": 0.09626867446950232, + "grad_norm": 2.5193052291870117, + "learning_rate": 4.886547617016501e-05, + "loss": 4.982, + "step": 16187 + }, + { + "epoch": 0.09627462175278333, + "grad_norm": 1.8875666856765747, + "learning_rate": 4.8865337050225316e-05, + "loss": 5.1801, + "step": 16188 + }, + { + "epoch": 0.09628056903606433, + "grad_norm": 1.441834568977356, + "learning_rate": 4.8865197921954475e-05, + "loss": 5.2723, + "step": 16189 + }, + { + "epoch": 0.09628651631934532, + "grad_norm": 2.0356223583221436, + "learning_rate": 4.8865058785352536e-05, + "loss": 5.4185, + "step": 16190 + }, + { + "epoch": 0.09629246360262632, + "grad_norm": 2.03885817527771, + "learning_rate": 4.8864919640419554e-05, + "loss": 5.1636, + "step": 16191 + }, + { + "epoch": 0.09629841088590732, + "grad_norm": 2.118439197540283, + "learning_rate": 4.8864780487155576e-05, + "loss": 5.4012, + "step": 16192 + }, + { + "epoch": 0.09630435816918831, + "grad_norm": 1.8266710042953491, + "learning_rate": 4.886464132556064e-05, + "loss": 4.9442, + "step": 16193 + }, + { + "epoch": 0.09631030545246931, + "grad_norm": 1.646341323852539, + "learning_rate": 4.886450215563482e-05, + "loss": 5.1368, + "step": 16194 + }, + { + "epoch": 0.09631625273575031, + "grad_norm": 1.8833272457122803, + "learning_rate": 4.886436297737814e-05, + "loss": 5.279, + "step": 16195 + }, + { + "epoch": 0.0963222000190313, + "grad_norm": 1.9521067142486572, + "learning_rate": 4.8864223790790666e-05, + "loss": 5.6571, + "step": 16196 + }, + { + "epoch": 0.0963281473023123, + "grad_norm": 1.8902586698532104, + "learning_rate": 4.8864084595872427e-05, + "loss": 5.632, + "step": 16197 + }, + { + "epoch": 0.0963340945855933, + "grad_norm": 1.7994412183761597, + "learning_rate": 4.886394539262349e-05, + "loss": 5.574, + "step": 16198 + }, + { + "epoch": 0.0963400418688743, + "grad_norm": 1.751780390739441, + "learning_rate": 4.8863806181043895e-05, + "loss": 5.691, + "step": 16199 + }, + { + "epoch": 0.0963459891521553, + "grad_norm": 2.30880069732666, + "learning_rate": 4.8863666961133684e-05, + "loss": 5.7477, + "step": 16200 + }, + { + "epoch": 0.0963519364354363, + "grad_norm": 2.351921319961548, + "learning_rate": 4.8863527732892924e-05, + "loss": 5.8162, + "step": 16201 + }, + { + "epoch": 0.09635788371871729, + "grad_norm": 1.6124454736709595, + "learning_rate": 4.8863388496321636e-05, + "loss": 5.8105, + "step": 16202 + }, + { + "epoch": 0.09636383100199829, + "grad_norm": 1.4927148818969727, + "learning_rate": 4.886324925141991e-05, + "loss": 5.8246, + "step": 16203 + }, + { + "epoch": 0.09636977828527929, + "grad_norm": 1.71438729763031, + "learning_rate": 4.886310999818775e-05, + "loss": 5.798, + "step": 16204 + }, + { + "epoch": 0.09637572556856028, + "grad_norm": 1.9519150257110596, + "learning_rate": 4.886297073662523e-05, + "loss": 5.2815, + "step": 16205 + }, + { + "epoch": 0.09638167285184128, + "grad_norm": 1.7694860696792603, + "learning_rate": 4.88628314667324e-05, + "loss": 5.7564, + "step": 16206 + }, + { + "epoch": 0.09638762013512228, + "grad_norm": 1.658252477645874, + "learning_rate": 4.88626921885093e-05, + "loss": 5.6586, + "step": 16207 + }, + { + "epoch": 0.09639356741840327, + "grad_norm": 2.310295581817627, + "learning_rate": 4.886255290195598e-05, + "loss": 4.9317, + "step": 16208 + }, + { + "epoch": 0.09639951470168427, + "grad_norm": 2.239964246749878, + "learning_rate": 4.886241360707249e-05, + "loss": 5.3794, + "step": 16209 + }, + { + "epoch": 0.09640546198496527, + "grad_norm": 2.470205307006836, + "learning_rate": 4.886227430385887e-05, + "loss": 5.1755, + "step": 16210 + }, + { + "epoch": 0.09641140926824626, + "grad_norm": 2.208298683166504, + "learning_rate": 4.8862134992315185e-05, + "loss": 5.1296, + "step": 16211 + }, + { + "epoch": 0.09641735655152726, + "grad_norm": 2.112288475036621, + "learning_rate": 4.886199567244147e-05, + "loss": 5.0888, + "step": 16212 + }, + { + "epoch": 0.09642330383480827, + "grad_norm": 2.3725969791412354, + "learning_rate": 4.886185634423778e-05, + "loss": 5.0256, + "step": 16213 + }, + { + "epoch": 0.09642925111808925, + "grad_norm": 2.3314402103424072, + "learning_rate": 4.8861717007704164e-05, + "loss": 5.012, + "step": 16214 + }, + { + "epoch": 0.09643519840137026, + "grad_norm": 2.1015000343322754, + "learning_rate": 4.8861577662840676e-05, + "loss": 4.7244, + "step": 16215 + }, + { + "epoch": 0.09644114568465124, + "grad_norm": 2.335218906402588, + "learning_rate": 4.8861438309647344e-05, + "loss": 4.8442, + "step": 16216 + }, + { + "epoch": 0.09644709296793225, + "grad_norm": 2.249216079711914, + "learning_rate": 4.886129894812424e-05, + "loss": 5.2573, + "step": 16217 + }, + { + "epoch": 0.09645304025121325, + "grad_norm": 2.228283166885376, + "learning_rate": 4.8861159578271406e-05, + "loss": 4.7297, + "step": 16218 + }, + { + "epoch": 0.09645898753449424, + "grad_norm": 1.7820645570755005, + "learning_rate": 4.886102020008888e-05, + "loss": 4.8427, + "step": 16219 + }, + { + "epoch": 0.09646493481777524, + "grad_norm": 2.1911120414733887, + "learning_rate": 4.886088081357672e-05, + "loss": 4.9677, + "step": 16220 + }, + { + "epoch": 0.09647088210105624, + "grad_norm": 2.453758716583252, + "learning_rate": 4.8860741418734976e-05, + "loss": 4.9039, + "step": 16221 + }, + { + "epoch": 0.09647682938433723, + "grad_norm": 2.488105058670044, + "learning_rate": 4.886060201556369e-05, + "loss": 5.0211, + "step": 16222 + }, + { + "epoch": 0.09648277666761823, + "grad_norm": 2.2040843963623047, + "learning_rate": 4.8860462604062915e-05, + "loss": 5.1067, + "step": 16223 + }, + { + "epoch": 0.09648872395089923, + "grad_norm": 2.0934717655181885, + "learning_rate": 4.8860323184232695e-05, + "loss": 4.9648, + "step": 16224 + }, + { + "epoch": 0.09649467123418022, + "grad_norm": 2.3775415420532227, + "learning_rate": 4.886018375607309e-05, + "loss": 4.9459, + "step": 16225 + }, + { + "epoch": 0.09650061851746122, + "grad_norm": 2.4042131900787354, + "learning_rate": 4.886004431958414e-05, + "loss": 4.7845, + "step": 16226 + }, + { + "epoch": 0.09650656580074223, + "grad_norm": 2.34424090385437, + "learning_rate": 4.885990487476589e-05, + "loss": 5.012, + "step": 16227 + }, + { + "epoch": 0.09651251308402321, + "grad_norm": 2.2711172103881836, + "learning_rate": 4.8859765421618395e-05, + "loss": 4.906, + "step": 16228 + }, + { + "epoch": 0.09651846036730422, + "grad_norm": 2.4021360874176025, + "learning_rate": 4.8859625960141706e-05, + "loss": 4.916, + "step": 16229 + }, + { + "epoch": 0.09652440765058522, + "grad_norm": 1.9205279350280762, + "learning_rate": 4.885948649033587e-05, + "loss": 5.0469, + "step": 16230 + }, + { + "epoch": 0.0965303549338662, + "grad_norm": 2.226362466812134, + "learning_rate": 4.885934701220093e-05, + "loss": 4.9439, + "step": 16231 + }, + { + "epoch": 0.09653630221714721, + "grad_norm": 2.288909673690796, + "learning_rate": 4.885920752573694e-05, + "loss": 4.8271, + "step": 16232 + }, + { + "epoch": 0.09654224950042821, + "grad_norm": 2.132235050201416, + "learning_rate": 4.8859068030943943e-05, + "loss": 5.1891, + "step": 16233 + }, + { + "epoch": 0.0965481967837092, + "grad_norm": 2.080244541168213, + "learning_rate": 4.8858928527822e-05, + "loss": 4.9055, + "step": 16234 + }, + { + "epoch": 0.0965541440669902, + "grad_norm": 2.324211359024048, + "learning_rate": 4.8858789016371145e-05, + "loss": 5.2614, + "step": 16235 + }, + { + "epoch": 0.0965600913502712, + "grad_norm": 1.827802062034607, + "learning_rate": 4.8858649496591437e-05, + "loss": 4.8874, + "step": 16236 + }, + { + "epoch": 0.09656603863355219, + "grad_norm": 1.8670811653137207, + "learning_rate": 4.885850996848292e-05, + "loss": 5.2402, + "step": 16237 + }, + { + "epoch": 0.09657198591683319, + "grad_norm": 2.046444892883301, + "learning_rate": 4.885837043204564e-05, + "loss": 4.7029, + "step": 16238 + }, + { + "epoch": 0.0965779332001142, + "grad_norm": 2.007894992828369, + "learning_rate": 4.885823088727965e-05, + "loss": 5.6706, + "step": 16239 + }, + { + "epoch": 0.09658388048339518, + "grad_norm": 2.24422025680542, + "learning_rate": 4.8858091334185005e-05, + "loss": 5.9666, + "step": 16240 + }, + { + "epoch": 0.09658982776667618, + "grad_norm": 1.7045838832855225, + "learning_rate": 4.885795177276174e-05, + "loss": 5.3021, + "step": 16241 + }, + { + "epoch": 0.09659577504995719, + "grad_norm": 1.7880860567092896, + "learning_rate": 4.885781220300991e-05, + "loss": 4.9151, + "step": 16242 + }, + { + "epoch": 0.09660172233323817, + "grad_norm": 2.3720862865448, + "learning_rate": 4.885767262492957e-05, + "loss": 5.0868, + "step": 16243 + }, + { + "epoch": 0.09660766961651918, + "grad_norm": 1.8655211925506592, + "learning_rate": 4.8857533038520756e-05, + "loss": 5.5072, + "step": 16244 + }, + { + "epoch": 0.09661361689980018, + "grad_norm": 1.8259748220443726, + "learning_rate": 4.885739344378353e-05, + "loss": 5.5992, + "step": 16245 + }, + { + "epoch": 0.09661956418308117, + "grad_norm": 1.667145013809204, + "learning_rate": 4.885725384071793e-05, + "loss": 5.2069, + "step": 16246 + }, + { + "epoch": 0.09662551146636217, + "grad_norm": 1.8004356622695923, + "learning_rate": 4.8857114229324015e-05, + "loss": 5.232, + "step": 16247 + }, + { + "epoch": 0.09663145874964316, + "grad_norm": 1.8246740102767944, + "learning_rate": 4.8856974609601825e-05, + "loss": 5.185, + "step": 16248 + }, + { + "epoch": 0.09663740603292416, + "grad_norm": 1.7453134059906006, + "learning_rate": 4.885683498155141e-05, + "loss": 4.9118, + "step": 16249 + }, + { + "epoch": 0.09664335331620516, + "grad_norm": 1.76914381980896, + "learning_rate": 4.885669534517282e-05, + "loss": 4.6679, + "step": 16250 + }, + { + "epoch": 0.09664930059948615, + "grad_norm": 2.0119516849517822, + "learning_rate": 4.88565557004661e-05, + "loss": 4.6495, + "step": 16251 + }, + { + "epoch": 0.09665524788276715, + "grad_norm": 1.7628357410430908, + "learning_rate": 4.885641604743131e-05, + "loss": 4.7581, + "step": 16252 + }, + { + "epoch": 0.09666119516604815, + "grad_norm": 1.6456751823425293, + "learning_rate": 4.8856276386068486e-05, + "loss": 4.9539, + "step": 16253 + }, + { + "epoch": 0.09666714244932914, + "grad_norm": 1.8474618196487427, + "learning_rate": 4.885613671637769e-05, + "loss": 5.9248, + "step": 16254 + }, + { + "epoch": 0.09667308973261014, + "grad_norm": 2.1205222606658936, + "learning_rate": 4.885599703835896e-05, + "loss": 5.2783, + "step": 16255 + }, + { + "epoch": 0.09667903701589114, + "grad_norm": 1.7559815645217896, + "learning_rate": 4.885585735201235e-05, + "loss": 5.6276, + "step": 16256 + }, + { + "epoch": 0.09668498429917213, + "grad_norm": 1.5784190893173218, + "learning_rate": 4.885571765733789e-05, + "loss": 5.5933, + "step": 16257 + }, + { + "epoch": 0.09669093158245314, + "grad_norm": 1.7377841472625732, + "learning_rate": 4.885557795433567e-05, + "loss": 5.1234, + "step": 16258 + }, + { + "epoch": 0.09669687886573414, + "grad_norm": 1.6517775058746338, + "learning_rate": 4.88554382430057e-05, + "loss": 5.6291, + "step": 16259 + }, + { + "epoch": 0.09670282614901513, + "grad_norm": 1.8474104404449463, + "learning_rate": 4.885529852334805e-05, + "loss": 6.0357, + "step": 16260 + }, + { + "epoch": 0.09670877343229613, + "grad_norm": 1.6555463075637817, + "learning_rate": 4.8855158795362756e-05, + "loss": 5.9828, + "step": 16261 + }, + { + "epoch": 0.09671472071557713, + "grad_norm": 1.6003193855285645, + "learning_rate": 4.8855019059049876e-05, + "loss": 5.9705, + "step": 16262 + }, + { + "epoch": 0.09672066799885812, + "grad_norm": 1.4992772340774536, + "learning_rate": 4.885487931440945e-05, + "loss": 5.8604, + "step": 16263 + }, + { + "epoch": 0.09672661528213912, + "grad_norm": 1.8667478561401367, + "learning_rate": 4.885473956144154e-05, + "loss": 6.1141, + "step": 16264 + }, + { + "epoch": 0.09673256256542012, + "grad_norm": 1.7311911582946777, + "learning_rate": 4.8854599800146186e-05, + "loss": 5.4142, + "step": 16265 + }, + { + "epoch": 0.09673850984870111, + "grad_norm": 2.0519683361053467, + "learning_rate": 4.885446003052343e-05, + "loss": 5.4321, + "step": 16266 + }, + { + "epoch": 0.09674445713198211, + "grad_norm": 2.02132248878479, + "learning_rate": 4.8854320252573325e-05, + "loss": 5.4957, + "step": 16267 + }, + { + "epoch": 0.09675040441526311, + "grad_norm": 1.7282330989837646, + "learning_rate": 4.885418046629594e-05, + "loss": 5.4486, + "step": 16268 + }, + { + "epoch": 0.0967563516985441, + "grad_norm": 1.909114122390747, + "learning_rate": 4.885404067169129e-05, + "loss": 5.4782, + "step": 16269 + }, + { + "epoch": 0.0967622989818251, + "grad_norm": 1.897161602973938, + "learning_rate": 4.885390086875945e-05, + "loss": 5.8678, + "step": 16270 + }, + { + "epoch": 0.0967682462651061, + "grad_norm": 2.0866503715515137, + "learning_rate": 4.885376105750046e-05, + "loss": 5.0869, + "step": 16271 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 1.6914600133895874, + "learning_rate": 4.885362123791437e-05, + "loss": 5.3385, + "step": 16272 + }, + { + "epoch": 0.0967801408316681, + "grad_norm": 1.4390329122543335, + "learning_rate": 4.885348141000122e-05, + "loss": 5.8069, + "step": 16273 + }, + { + "epoch": 0.0967860881149491, + "grad_norm": 1.5077629089355469, + "learning_rate": 4.885334157376107e-05, + "loss": 5.6679, + "step": 16274 + }, + { + "epoch": 0.09679203539823009, + "grad_norm": 1.4550343751907349, + "learning_rate": 4.885320172919397e-05, + "loss": 5.7548, + "step": 16275 + }, + { + "epoch": 0.09679798268151109, + "grad_norm": 2.068070650100708, + "learning_rate": 4.8853061876299956e-05, + "loss": 4.9706, + "step": 16276 + }, + { + "epoch": 0.09680392996479208, + "grad_norm": 1.3487659692764282, + "learning_rate": 4.885292201507909e-05, + "loss": 5.6918, + "step": 16277 + }, + { + "epoch": 0.09680987724807308, + "grad_norm": 1.4306180477142334, + "learning_rate": 4.885278214553141e-05, + "loss": 5.6196, + "step": 16278 + }, + { + "epoch": 0.09681582453135408, + "grad_norm": 1.6410231590270996, + "learning_rate": 4.885264226765698e-05, + "loss": 5.0523, + "step": 16279 + }, + { + "epoch": 0.09682177181463507, + "grad_norm": 2.4701485633850098, + "learning_rate": 4.8852502381455825e-05, + "loss": 4.6255, + "step": 16280 + }, + { + "epoch": 0.09682771909791607, + "grad_norm": 2.5248069763183594, + "learning_rate": 4.885236248692802e-05, + "loss": 4.5055, + "step": 16281 + }, + { + "epoch": 0.09683366638119707, + "grad_norm": 2.1913154125213623, + "learning_rate": 4.8852222584073595e-05, + "loss": 4.748, + "step": 16282 + }, + { + "epoch": 0.09683961366447806, + "grad_norm": 1.951987385749817, + "learning_rate": 4.8852082672892606e-05, + "loss": 5.3871, + "step": 16283 + }, + { + "epoch": 0.09684556094775906, + "grad_norm": 2.007020950317383, + "learning_rate": 4.885194275338511e-05, + "loss": 6.1075, + "step": 16284 + }, + { + "epoch": 0.09685150823104006, + "grad_norm": 1.9821717739105225, + "learning_rate": 4.885180282555113e-05, + "loss": 5.1719, + "step": 16285 + }, + { + "epoch": 0.09685745551432105, + "grad_norm": 2.339564800262451, + "learning_rate": 4.885166288939074e-05, + "loss": 4.9518, + "step": 16286 + }, + { + "epoch": 0.09686340279760206, + "grad_norm": 2.1785504817962646, + "learning_rate": 4.8851522944903984e-05, + "loss": 4.9656, + "step": 16287 + }, + { + "epoch": 0.09686935008088306, + "grad_norm": 1.7723946571350098, + "learning_rate": 4.885138299209091e-05, + "loss": 6.1572, + "step": 16288 + }, + { + "epoch": 0.09687529736416405, + "grad_norm": 1.702458381652832, + "learning_rate": 4.885124303095156e-05, + "loss": 5.9616, + "step": 16289 + }, + { + "epoch": 0.09688124464744505, + "grad_norm": 2.279836893081665, + "learning_rate": 4.885110306148599e-05, + "loss": 5.4305, + "step": 16290 + }, + { + "epoch": 0.09688719193072605, + "grad_norm": 1.8569501638412476, + "learning_rate": 4.8850963083694244e-05, + "loss": 5.8019, + "step": 16291 + }, + { + "epoch": 0.09689313921400704, + "grad_norm": 1.8126327991485596, + "learning_rate": 4.885082309757637e-05, + "loss": 5.7076, + "step": 16292 + }, + { + "epoch": 0.09689908649728804, + "grad_norm": 1.7170337438583374, + "learning_rate": 4.8850683103132424e-05, + "loss": 5.9862, + "step": 16293 + }, + { + "epoch": 0.09690503378056904, + "grad_norm": 1.7631909847259521, + "learning_rate": 4.8850543100362454e-05, + "loss": 5.917, + "step": 16294 + }, + { + "epoch": 0.09691098106385003, + "grad_norm": 1.9938957691192627, + "learning_rate": 4.88504030892665e-05, + "loss": 5.5773, + "step": 16295 + }, + { + "epoch": 0.09691692834713103, + "grad_norm": 1.9459222555160522, + "learning_rate": 4.8850263069844623e-05, + "loss": 5.2847, + "step": 16296 + }, + { + "epoch": 0.09692287563041203, + "grad_norm": 1.8420277833938599, + "learning_rate": 4.8850123042096865e-05, + "loss": 5.5691, + "step": 16297 + }, + { + "epoch": 0.09692882291369302, + "grad_norm": 2.2592809200286865, + "learning_rate": 4.8849983006023267e-05, + "loss": 5.4666, + "step": 16298 + }, + { + "epoch": 0.09693477019697402, + "grad_norm": 2.080939292907715, + "learning_rate": 4.884984296162389e-05, + "loss": 5.243, + "step": 16299 + }, + { + "epoch": 0.09694071748025503, + "grad_norm": 1.648836374282837, + "learning_rate": 4.884970290889879e-05, + "loss": 5.8331, + "step": 16300 + }, + { + "epoch": 0.09694666476353601, + "grad_norm": 1.668505311012268, + "learning_rate": 4.884956284784799e-05, + "loss": 5.7523, + "step": 16301 + }, + { + "epoch": 0.09695261204681702, + "grad_norm": 1.5473688840866089, + "learning_rate": 4.8849422778471567e-05, + "loss": 5.5379, + "step": 16302 + }, + { + "epoch": 0.09695855933009802, + "grad_norm": 1.9258644580841064, + "learning_rate": 4.8849282700769545e-05, + "loss": 5.6405, + "step": 16303 + }, + { + "epoch": 0.096964506613379, + "grad_norm": 1.5651416778564453, + "learning_rate": 4.884914261474199e-05, + "loss": 6.1487, + "step": 16304 + }, + { + "epoch": 0.09697045389666001, + "grad_norm": 1.5289270877838135, + "learning_rate": 4.884900252038894e-05, + "loss": 5.6653, + "step": 16305 + }, + { + "epoch": 0.096976401179941, + "grad_norm": 1.8394510746002197, + "learning_rate": 4.8848862417710464e-05, + "loss": 4.9243, + "step": 16306 + }, + { + "epoch": 0.096982348463222, + "grad_norm": 1.7624824047088623, + "learning_rate": 4.8848722306706584e-05, + "loss": 5.7712, + "step": 16307 + }, + { + "epoch": 0.096988295746503, + "grad_norm": 1.7294182777404785, + "learning_rate": 4.8848582187377365e-05, + "loss": 5.5197, + "step": 16308 + }, + { + "epoch": 0.09699424302978399, + "grad_norm": 1.69902765750885, + "learning_rate": 4.8848442059722856e-05, + "loss": 5.6485, + "step": 16309 + }, + { + "epoch": 0.09700019031306499, + "grad_norm": 1.7867447137832642, + "learning_rate": 4.88483019237431e-05, + "loss": 5.4422, + "step": 16310 + }, + { + "epoch": 0.09700613759634599, + "grad_norm": 1.6588819026947021, + "learning_rate": 4.884816177943814e-05, + "loss": 5.4282, + "step": 16311 + }, + { + "epoch": 0.09701208487962698, + "grad_norm": 1.504918098449707, + "learning_rate": 4.884802162680804e-05, + "loss": 5.508, + "step": 16312 + }, + { + "epoch": 0.09701803216290798, + "grad_norm": 1.5852895975112915, + "learning_rate": 4.8847881465852846e-05, + "loss": 5.5567, + "step": 16313 + }, + { + "epoch": 0.09702397944618898, + "grad_norm": 1.5719797611236572, + "learning_rate": 4.88477412965726e-05, + "loss": 5.6284, + "step": 16314 + }, + { + "epoch": 0.09702992672946997, + "grad_norm": 1.4208050966262817, + "learning_rate": 4.884760111896735e-05, + "loss": 5.5653, + "step": 16315 + }, + { + "epoch": 0.09703587401275098, + "grad_norm": 1.567555546760559, + "learning_rate": 4.8847460933037156e-05, + "loss": 5.5144, + "step": 16316 + }, + { + "epoch": 0.09704182129603198, + "grad_norm": 1.9179699420928955, + "learning_rate": 4.884732073878205e-05, + "loss": 4.7947, + "step": 16317 + }, + { + "epoch": 0.09704776857931297, + "grad_norm": 2.5346062183380127, + "learning_rate": 4.88471805362021e-05, + "loss": 3.8315, + "step": 16318 + }, + { + "epoch": 0.09705371586259397, + "grad_norm": 2.585686683654785, + "learning_rate": 4.884704032529734e-05, + "loss": 3.7288, + "step": 16319 + }, + { + "epoch": 0.09705966314587497, + "grad_norm": 2.133723020553589, + "learning_rate": 4.8846900106067825e-05, + "loss": 3.6369, + "step": 16320 + }, + { + "epoch": 0.09706561042915596, + "grad_norm": 2.4039080142974854, + "learning_rate": 4.884675987851361e-05, + "loss": 3.9068, + "step": 16321 + }, + { + "epoch": 0.09707155771243696, + "grad_norm": 2.643489360809326, + "learning_rate": 4.884661964263473e-05, + "loss": 3.7793, + "step": 16322 + }, + { + "epoch": 0.09707750499571796, + "grad_norm": 2.485727071762085, + "learning_rate": 4.8846479398431244e-05, + "loss": 4.9789, + "step": 16323 + }, + { + "epoch": 0.09708345227899895, + "grad_norm": 2.8592441082000732, + "learning_rate": 4.8846339145903194e-05, + "loss": 4.0196, + "step": 16324 + }, + { + "epoch": 0.09708939956227995, + "grad_norm": 2.470813035964966, + "learning_rate": 4.884619888505064e-05, + "loss": 5.2308, + "step": 16325 + }, + { + "epoch": 0.09709534684556095, + "grad_norm": 2.3255081176757812, + "learning_rate": 4.884605861587362e-05, + "loss": 5.3535, + "step": 16326 + }, + { + "epoch": 0.09710129412884194, + "grad_norm": 2.1462676525115967, + "learning_rate": 4.8845918338372195e-05, + "loss": 5.2611, + "step": 16327 + }, + { + "epoch": 0.09710724141212294, + "grad_norm": 1.8838989734649658, + "learning_rate": 4.88457780525464e-05, + "loss": 5.8104, + "step": 16328 + }, + { + "epoch": 0.09711318869540395, + "grad_norm": 2.137746572494507, + "learning_rate": 4.884563775839629e-05, + "loss": 5.4702, + "step": 16329 + }, + { + "epoch": 0.09711913597868493, + "grad_norm": 1.8934431076049805, + "learning_rate": 4.884549745592192e-05, + "loss": 4.9703, + "step": 16330 + }, + { + "epoch": 0.09712508326196594, + "grad_norm": 2.409020185470581, + "learning_rate": 4.884535714512333e-05, + "loss": 5.6793, + "step": 16331 + }, + { + "epoch": 0.09713103054524694, + "grad_norm": 2.039520263671875, + "learning_rate": 4.884521682600056e-05, + "loss": 5.7809, + "step": 16332 + }, + { + "epoch": 0.09713697782852793, + "grad_norm": 3.1211516857147217, + "learning_rate": 4.884507649855369e-05, + "loss": 5.6195, + "step": 16333 + }, + { + "epoch": 0.09714292511180893, + "grad_norm": 1.9474505186080933, + "learning_rate": 4.884493616278274e-05, + "loss": 5.3064, + "step": 16334 + }, + { + "epoch": 0.09714887239508992, + "grad_norm": 1.7586307525634766, + "learning_rate": 4.884479581868777e-05, + "loss": 4.9531, + "step": 16335 + }, + { + "epoch": 0.09715481967837092, + "grad_norm": 1.6352753639221191, + "learning_rate": 4.884465546626883e-05, + "loss": 5.304, + "step": 16336 + }, + { + "epoch": 0.09716076696165192, + "grad_norm": 1.681362271308899, + "learning_rate": 4.884451510552597e-05, + "loss": 5.9167, + "step": 16337 + }, + { + "epoch": 0.09716671424493291, + "grad_norm": 1.7970985174179077, + "learning_rate": 4.8844374736459225e-05, + "loss": 6.122, + "step": 16338 + }, + { + "epoch": 0.09717266152821391, + "grad_norm": 1.5312799215316772, + "learning_rate": 4.8844234359068666e-05, + "loss": 4.903, + "step": 16339 + }, + { + "epoch": 0.09717860881149491, + "grad_norm": 1.7024787664413452, + "learning_rate": 4.884409397335432e-05, + "loss": 5.3306, + "step": 16340 + }, + { + "epoch": 0.0971845560947759, + "grad_norm": 3.000169515609741, + "learning_rate": 4.884395357931626e-05, + "loss": 4.9682, + "step": 16341 + }, + { + "epoch": 0.0971905033780569, + "grad_norm": 2.910048484802246, + "learning_rate": 4.884381317695452e-05, + "loss": 5.2385, + "step": 16342 + }, + { + "epoch": 0.0971964506613379, + "grad_norm": 2.1094155311584473, + "learning_rate": 4.8843672766269147e-05, + "loss": 5.1025, + "step": 16343 + }, + { + "epoch": 0.09720239794461889, + "grad_norm": 1.7918319702148438, + "learning_rate": 4.884353234726019e-05, + "loss": 5.2822, + "step": 16344 + }, + { + "epoch": 0.0972083452278999, + "grad_norm": 1.574461579322815, + "learning_rate": 4.884339191992771e-05, + "loss": 5.6254, + "step": 16345 + }, + { + "epoch": 0.0972142925111809, + "grad_norm": 2.0780746936798096, + "learning_rate": 4.884325148427175e-05, + "loss": 5.0641, + "step": 16346 + }, + { + "epoch": 0.09722023979446189, + "grad_norm": 2.30399227142334, + "learning_rate": 4.884311104029235e-05, + "loss": 4.9591, + "step": 16347 + }, + { + "epoch": 0.09722618707774289, + "grad_norm": 2.087993621826172, + "learning_rate": 4.884297058798957e-05, + "loss": 5.0514, + "step": 16348 + }, + { + "epoch": 0.09723213436102389, + "grad_norm": 2.0179786682128906, + "learning_rate": 4.884283012736345e-05, + "loss": 4.9632, + "step": 16349 + }, + { + "epoch": 0.09723808164430488, + "grad_norm": 2.4394171237945557, + "learning_rate": 4.8842689658414054e-05, + "loss": 4.6517, + "step": 16350 + }, + { + "epoch": 0.09724402892758588, + "grad_norm": 2.6895275115966797, + "learning_rate": 4.884254918114142e-05, + "loss": 4.726, + "step": 16351 + }, + { + "epoch": 0.09724997621086688, + "grad_norm": 1.5181125402450562, + "learning_rate": 4.884240869554559e-05, + "loss": 5.679, + "step": 16352 + }, + { + "epoch": 0.09725592349414787, + "grad_norm": 1.758475422859192, + "learning_rate": 4.884226820162662e-05, + "loss": 5.2323, + "step": 16353 + }, + { + "epoch": 0.09726187077742887, + "grad_norm": 2.0166938304901123, + "learning_rate": 4.884212769938457e-05, + "loss": 4.6912, + "step": 16354 + }, + { + "epoch": 0.09726781806070987, + "grad_norm": 2.1366612911224365, + "learning_rate": 4.8841987188819475e-05, + "loss": 4.4761, + "step": 16355 + }, + { + "epoch": 0.09727376534399086, + "grad_norm": 1.9595547914505005, + "learning_rate": 4.884184666993139e-05, + "loss": 4.5343, + "step": 16356 + }, + { + "epoch": 0.09727971262727186, + "grad_norm": 1.896043300628662, + "learning_rate": 4.884170614272037e-05, + "loss": 4.465, + "step": 16357 + }, + { + "epoch": 0.09728565991055287, + "grad_norm": 2.062506675720215, + "learning_rate": 4.884156560718645e-05, + "loss": 4.301, + "step": 16358 + }, + { + "epoch": 0.09729160719383385, + "grad_norm": 2.0816612243652344, + "learning_rate": 4.884142506332968e-05, + "loss": 4.5414, + "step": 16359 + }, + { + "epoch": 0.09729755447711486, + "grad_norm": 2.0095489025115967, + "learning_rate": 4.884128451115012e-05, + "loss": 4.3779, + "step": 16360 + }, + { + "epoch": 0.09730350176039586, + "grad_norm": 2.0766615867614746, + "learning_rate": 4.884114395064781e-05, + "loss": 4.3999, + "step": 16361 + }, + { + "epoch": 0.09730944904367685, + "grad_norm": 2.0266785621643066, + "learning_rate": 4.8841003381822805e-05, + "loss": 4.5122, + "step": 16362 + }, + { + "epoch": 0.09731539632695785, + "grad_norm": 1.9631284475326538, + "learning_rate": 4.884086280467516e-05, + "loss": 4.3061, + "step": 16363 + }, + { + "epoch": 0.09732134361023884, + "grad_norm": 2.2965009212493896, + "learning_rate": 4.8840722219204905e-05, + "loss": 4.3387, + "step": 16364 + }, + { + "epoch": 0.09732729089351984, + "grad_norm": 2.036365509033203, + "learning_rate": 4.8840581625412105e-05, + "loss": 4.3242, + "step": 16365 + }, + { + "epoch": 0.09733323817680084, + "grad_norm": 2.186131477355957, + "learning_rate": 4.88404410232968e-05, + "loss": 4.2517, + "step": 16366 + }, + { + "epoch": 0.09733918546008183, + "grad_norm": 2.2000489234924316, + "learning_rate": 4.884030041285905e-05, + "loss": 4.274, + "step": 16367 + }, + { + "epoch": 0.09734513274336283, + "grad_norm": 3.2708849906921387, + "learning_rate": 4.884015979409889e-05, + "loss": 4.9575, + "step": 16368 + }, + { + "epoch": 0.09735108002664383, + "grad_norm": 1.7634176015853882, + "learning_rate": 4.884001916701639e-05, + "loss": 4.63, + "step": 16369 + }, + { + "epoch": 0.09735702730992482, + "grad_norm": 2.297611713409424, + "learning_rate": 4.883987853161157e-05, + "loss": 4.3009, + "step": 16370 + }, + { + "epoch": 0.09736297459320582, + "grad_norm": 2.1840944290161133, + "learning_rate": 4.8839737887884507e-05, + "loss": 4.2232, + "step": 16371 + }, + { + "epoch": 0.09736892187648682, + "grad_norm": 2.1925270557403564, + "learning_rate": 4.8839597235835234e-05, + "loss": 4.1824, + "step": 16372 + }, + { + "epoch": 0.09737486915976781, + "grad_norm": 2.175720453262329, + "learning_rate": 4.88394565754638e-05, + "loss": 4.2619, + "step": 16373 + }, + { + "epoch": 0.09738081644304881, + "grad_norm": 2.282804489135742, + "learning_rate": 4.883931590677026e-05, + "loss": 4.2207, + "step": 16374 + }, + { + "epoch": 0.09738676372632982, + "grad_norm": 1.674668788909912, + "learning_rate": 4.883917522975466e-05, + "loss": 5.3627, + "step": 16375 + }, + { + "epoch": 0.0973927110096108, + "grad_norm": 1.6538902521133423, + "learning_rate": 4.883903454441705e-05, + "loss": 5.302, + "step": 16376 + }, + { + "epoch": 0.09739865829289181, + "grad_norm": 1.4267115592956543, + "learning_rate": 4.8838893850757485e-05, + "loss": 5.2545, + "step": 16377 + }, + { + "epoch": 0.09740460557617281, + "grad_norm": 1.3086082935333252, + "learning_rate": 4.8838753148776e-05, + "loss": 5.1538, + "step": 16378 + }, + { + "epoch": 0.0974105528594538, + "grad_norm": 1.4384034872055054, + "learning_rate": 4.883861243847266e-05, + "loss": 5.3925, + "step": 16379 + }, + { + "epoch": 0.0974165001427348, + "grad_norm": 1.4971977472305298, + "learning_rate": 4.88384717198475e-05, + "loss": 5.3966, + "step": 16380 + }, + { + "epoch": 0.0974224474260158, + "grad_norm": 1.517468810081482, + "learning_rate": 4.8838330992900584e-05, + "loss": 5.1097, + "step": 16381 + }, + { + "epoch": 0.09742839470929679, + "grad_norm": 1.388852596282959, + "learning_rate": 4.8838190257631944e-05, + "loss": 5.1066, + "step": 16382 + }, + { + "epoch": 0.09743434199257779, + "grad_norm": 1.2972341775894165, + "learning_rate": 4.8838049514041646e-05, + "loss": 5.0383, + "step": 16383 + }, + { + "epoch": 0.0974402892758588, + "grad_norm": 1.338291049003601, + "learning_rate": 4.883790876212972e-05, + "loss": 5.1339, + "step": 16384 + }, + { + "epoch": 0.09744623655913978, + "grad_norm": 1.4399670362472534, + "learning_rate": 4.883776800189624e-05, + "loss": 5.0542, + "step": 16385 + }, + { + "epoch": 0.09745218384242078, + "grad_norm": 1.5091251134872437, + "learning_rate": 4.8837627233341235e-05, + "loss": 4.9303, + "step": 16386 + }, + { + "epoch": 0.09745813112570179, + "grad_norm": 1.4728022813796997, + "learning_rate": 4.8837486456464764e-05, + "loss": 5.0902, + "step": 16387 + }, + { + "epoch": 0.09746407840898277, + "grad_norm": 1.454509973526001, + "learning_rate": 4.8837345671266865e-05, + "loss": 4.9227, + "step": 16388 + }, + { + "epoch": 0.09747002569226378, + "grad_norm": 1.431118130683899, + "learning_rate": 4.88372048777476e-05, + "loss": 5.0128, + "step": 16389 + }, + { + "epoch": 0.09747597297554478, + "grad_norm": 1.434967041015625, + "learning_rate": 4.8837064075907015e-05, + "loss": 5.1793, + "step": 16390 + }, + { + "epoch": 0.09748192025882577, + "grad_norm": 1.5077275037765503, + "learning_rate": 4.883692326574515e-05, + "loss": 5.1573, + "step": 16391 + }, + { + "epoch": 0.09748786754210677, + "grad_norm": 1.44413161277771, + "learning_rate": 4.883678244726208e-05, + "loss": 5.2297, + "step": 16392 + }, + { + "epoch": 0.09749381482538776, + "grad_norm": 1.606898546218872, + "learning_rate": 4.883664162045781e-05, + "loss": 4.9409, + "step": 16393 + }, + { + "epoch": 0.09749976210866876, + "grad_norm": 1.649034857749939, + "learning_rate": 4.883650078533243e-05, + "loss": 5.1519, + "step": 16394 + }, + { + "epoch": 0.09750570939194976, + "grad_norm": 1.5309730768203735, + "learning_rate": 4.883635994188597e-05, + "loss": 4.9568, + "step": 16395 + }, + { + "epoch": 0.09751165667523075, + "grad_norm": 1.8033829927444458, + "learning_rate": 4.883621909011848e-05, + "loss": 4.7442, + "step": 16396 + }, + { + "epoch": 0.09751760395851175, + "grad_norm": 1.653501272201538, + "learning_rate": 4.8836078230030016e-05, + "loss": 4.5672, + "step": 16397 + }, + { + "epoch": 0.09752355124179275, + "grad_norm": 1.686077356338501, + "learning_rate": 4.8835937361620624e-05, + "loss": 4.5819, + "step": 16398 + }, + { + "epoch": 0.09752949852507374, + "grad_norm": 1.5233088731765747, + "learning_rate": 4.883579648489035e-05, + "loss": 4.5191, + "step": 16399 + }, + { + "epoch": 0.09753544580835474, + "grad_norm": 1.6472907066345215, + "learning_rate": 4.883565559983925e-05, + "loss": 4.6418, + "step": 16400 + }, + { + "epoch": 0.09754139309163574, + "grad_norm": 1.817649483680725, + "learning_rate": 4.8835514706467364e-05, + "loss": 4.806, + "step": 16401 + }, + { + "epoch": 0.09754734037491673, + "grad_norm": 1.8404059410095215, + "learning_rate": 4.8835373804774754e-05, + "loss": 4.8169, + "step": 16402 + }, + { + "epoch": 0.09755328765819773, + "grad_norm": 1.5510175228118896, + "learning_rate": 4.883523289476145e-05, + "loss": 4.7987, + "step": 16403 + }, + { + "epoch": 0.09755923494147874, + "grad_norm": 1.4557734727859497, + "learning_rate": 4.8835091976427514e-05, + "loss": 4.7322, + "step": 16404 + }, + { + "epoch": 0.09756518222475973, + "grad_norm": 1.528123140335083, + "learning_rate": 4.8834951049773006e-05, + "loss": 4.7376, + "step": 16405 + }, + { + "epoch": 0.09757112950804073, + "grad_norm": 1.6215547323226929, + "learning_rate": 4.8834810114797944e-05, + "loss": 4.7679, + "step": 16406 + }, + { + "epoch": 0.09757707679132173, + "grad_norm": 1.4554566144943237, + "learning_rate": 4.883466917150241e-05, + "loss": 4.6452, + "step": 16407 + }, + { + "epoch": 0.09758302407460272, + "grad_norm": 1.5100599527359009, + "learning_rate": 4.883452821988644e-05, + "loss": 4.6957, + "step": 16408 + }, + { + "epoch": 0.09758897135788372, + "grad_norm": 1.7057833671569824, + "learning_rate": 4.8834387259950074e-05, + "loss": 4.7888, + "step": 16409 + }, + { + "epoch": 0.09759491864116472, + "grad_norm": 1.4016892910003662, + "learning_rate": 4.883424629169337e-05, + "loss": 4.769, + "step": 16410 + }, + { + "epoch": 0.09760086592444571, + "grad_norm": 1.5257891416549683, + "learning_rate": 4.883410531511638e-05, + "loss": 4.7443, + "step": 16411 + }, + { + "epoch": 0.09760681320772671, + "grad_norm": 1.3904502391815186, + "learning_rate": 4.883396433021916e-05, + "loss": 4.786, + "step": 16412 + }, + { + "epoch": 0.09761276049100771, + "grad_norm": 1.6081106662750244, + "learning_rate": 4.883382333700174e-05, + "loss": 4.5321, + "step": 16413 + }, + { + "epoch": 0.0976187077742887, + "grad_norm": 1.4291402101516724, + "learning_rate": 4.883368233546417e-05, + "loss": 4.5898, + "step": 16414 + }, + { + "epoch": 0.0976246550575697, + "grad_norm": 1.5700920820236206, + "learning_rate": 4.8833541325606524e-05, + "loss": 5.2177, + "step": 16415 + }, + { + "epoch": 0.0976306023408507, + "grad_norm": 1.5503007173538208, + "learning_rate": 4.8833400307428825e-05, + "loss": 5.3911, + "step": 16416 + }, + { + "epoch": 0.0976365496241317, + "grad_norm": 1.5890953540802002, + "learning_rate": 4.8833259280931135e-05, + "loss": 4.9426, + "step": 16417 + }, + { + "epoch": 0.0976424969074127, + "grad_norm": 1.5032304525375366, + "learning_rate": 4.8833118246113494e-05, + "loss": 4.6124, + "step": 16418 + }, + { + "epoch": 0.0976484441906937, + "grad_norm": 1.5300242900848389, + "learning_rate": 4.8832977202975964e-05, + "loss": 4.9323, + "step": 16419 + }, + { + "epoch": 0.09765439147397469, + "grad_norm": 1.7094424962997437, + "learning_rate": 4.883283615151859e-05, + "loss": 5.3205, + "step": 16420 + }, + { + "epoch": 0.09766033875725569, + "grad_norm": 1.8231004476547241, + "learning_rate": 4.883269509174142e-05, + "loss": 5.0414, + "step": 16421 + }, + { + "epoch": 0.09766628604053668, + "grad_norm": 1.7779520750045776, + "learning_rate": 4.8832554023644496e-05, + "loss": 4.9106, + "step": 16422 + }, + { + "epoch": 0.09767223332381768, + "grad_norm": 1.5394103527069092, + "learning_rate": 4.8832412947227875e-05, + "loss": 4.998, + "step": 16423 + }, + { + "epoch": 0.09767818060709868, + "grad_norm": 1.3814078569412231, + "learning_rate": 4.883227186249161e-05, + "loss": 4.9109, + "step": 16424 + }, + { + "epoch": 0.09768412789037967, + "grad_norm": 1.291040301322937, + "learning_rate": 4.8832130769435735e-05, + "loss": 5.3617, + "step": 16425 + }, + { + "epoch": 0.09769007517366067, + "grad_norm": 1.561249017715454, + "learning_rate": 4.883198966806032e-05, + "loss": 5.3041, + "step": 16426 + }, + { + "epoch": 0.09769602245694167, + "grad_norm": 1.7411010265350342, + "learning_rate": 4.883184855836539e-05, + "loss": 5.0816, + "step": 16427 + }, + { + "epoch": 0.09770196974022266, + "grad_norm": 1.6507155895233154, + "learning_rate": 4.8831707440351024e-05, + "loss": 5.1089, + "step": 16428 + }, + { + "epoch": 0.09770791702350366, + "grad_norm": 1.5242364406585693, + "learning_rate": 4.8831566314017254e-05, + "loss": 4.9718, + "step": 16429 + }, + { + "epoch": 0.09771386430678466, + "grad_norm": 2.3768868446350098, + "learning_rate": 4.883142517936412e-05, + "loss": 4.9333, + "step": 16430 + }, + { + "epoch": 0.09771981159006565, + "grad_norm": 1.2830429077148438, + "learning_rate": 4.8831284036391684e-05, + "loss": 4.9238, + "step": 16431 + }, + { + "epoch": 0.09772575887334665, + "grad_norm": 1.5065499544143677, + "learning_rate": 4.883114288509999e-05, + "loss": 5.0151, + "step": 16432 + }, + { + "epoch": 0.09773170615662766, + "grad_norm": 1.5989798307418823, + "learning_rate": 4.88310017254891e-05, + "loss": 5.0081, + "step": 16433 + }, + { + "epoch": 0.09773765343990864, + "grad_norm": 1.391644835472107, + "learning_rate": 4.883086055755905e-05, + "loss": 4.8942, + "step": 16434 + }, + { + "epoch": 0.09774360072318965, + "grad_norm": 1.4952952861785889, + "learning_rate": 4.883071938130989e-05, + "loss": 5.0018, + "step": 16435 + }, + { + "epoch": 0.09774954800647065, + "grad_norm": 1.522814393043518, + "learning_rate": 4.883057819674168e-05, + "loss": 5.2591, + "step": 16436 + }, + { + "epoch": 0.09775549528975164, + "grad_norm": 1.3879649639129639, + "learning_rate": 4.8830437003854454e-05, + "loss": 4.9136, + "step": 16437 + }, + { + "epoch": 0.09776144257303264, + "grad_norm": 1.3485056161880493, + "learning_rate": 4.883029580264827e-05, + "loss": 5.5159, + "step": 16438 + }, + { + "epoch": 0.09776738985631364, + "grad_norm": 1.475131869316101, + "learning_rate": 4.883015459312317e-05, + "loss": 5.4397, + "step": 16439 + }, + { + "epoch": 0.09777333713959463, + "grad_norm": 1.2736895084381104, + "learning_rate": 4.8830013375279215e-05, + "loss": 5.2867, + "step": 16440 + }, + { + "epoch": 0.09777928442287563, + "grad_norm": 1.456312656402588, + "learning_rate": 4.882987214911645e-05, + "loss": 5.3351, + "step": 16441 + }, + { + "epoch": 0.09778523170615663, + "grad_norm": 1.5312397480010986, + "learning_rate": 4.882973091463492e-05, + "loss": 5.3233, + "step": 16442 + }, + { + "epoch": 0.09779117898943762, + "grad_norm": 1.5735961198806763, + "learning_rate": 4.882958967183468e-05, + "loss": 4.9878, + "step": 16443 + }, + { + "epoch": 0.09779712627271862, + "grad_norm": 1.337172508239746, + "learning_rate": 4.882944842071577e-05, + "loss": 5.121, + "step": 16444 + }, + { + "epoch": 0.09780307355599963, + "grad_norm": 1.47593355178833, + "learning_rate": 4.882930716127826e-05, + "loss": 5.4733, + "step": 16445 + }, + { + "epoch": 0.09780902083928061, + "grad_norm": 1.4311164617538452, + "learning_rate": 4.882916589352217e-05, + "loss": 5.2215, + "step": 16446 + }, + { + "epoch": 0.09781496812256162, + "grad_norm": 1.3628556728363037, + "learning_rate": 4.882902461744757e-05, + "loss": 5.3611, + "step": 16447 + }, + { + "epoch": 0.09782091540584262, + "grad_norm": 1.5621687173843384, + "learning_rate": 4.882888333305451e-05, + "loss": 5.4407, + "step": 16448 + }, + { + "epoch": 0.0978268626891236, + "grad_norm": 1.570478081703186, + "learning_rate": 4.8828742040343024e-05, + "loss": 5.533, + "step": 16449 + }, + { + "epoch": 0.09783280997240461, + "grad_norm": 1.3725816011428833, + "learning_rate": 4.8828600739313174e-05, + "loss": 5.1467, + "step": 16450 + }, + { + "epoch": 0.0978387572556856, + "grad_norm": 1.4899497032165527, + "learning_rate": 4.8828459429965e-05, + "loss": 5.233, + "step": 16451 + }, + { + "epoch": 0.0978447045389666, + "grad_norm": 1.380609154701233, + "learning_rate": 4.882831811229857e-05, + "loss": 5.1484, + "step": 16452 + }, + { + "epoch": 0.0978506518222476, + "grad_norm": 1.2167932987213135, + "learning_rate": 4.882817678631391e-05, + "loss": 5.1687, + "step": 16453 + }, + { + "epoch": 0.09785659910552859, + "grad_norm": 1.5250643491744995, + "learning_rate": 4.882803545201108e-05, + "loss": 5.2395, + "step": 16454 + }, + { + "epoch": 0.09786254638880959, + "grad_norm": 1.4288511276245117, + "learning_rate": 4.882789410939013e-05, + "loss": 5.0532, + "step": 16455 + }, + { + "epoch": 0.09786849367209059, + "grad_norm": 1.6325379610061646, + "learning_rate": 4.8827752758451105e-05, + "loss": 5.2077, + "step": 16456 + }, + { + "epoch": 0.09787444095537158, + "grad_norm": 1.4227756261825562, + "learning_rate": 4.882761139919406e-05, + "loss": 5.0431, + "step": 16457 + }, + { + "epoch": 0.09788038823865258, + "grad_norm": 1.355039358139038, + "learning_rate": 4.8827470031619046e-05, + "loss": 4.9062, + "step": 16458 + }, + { + "epoch": 0.09788633552193358, + "grad_norm": 1.5071823596954346, + "learning_rate": 4.8827328655726113e-05, + "loss": 5.2632, + "step": 16459 + }, + { + "epoch": 0.09789228280521457, + "grad_norm": 1.411828637123108, + "learning_rate": 4.88271872715153e-05, + "loss": 5.343, + "step": 16460 + }, + { + "epoch": 0.09789823008849557, + "grad_norm": 1.419164776802063, + "learning_rate": 4.882704587898666e-05, + "loss": 5.1643, + "step": 16461 + }, + { + "epoch": 0.09790417737177658, + "grad_norm": 1.4997645616531372, + "learning_rate": 4.882690447814024e-05, + "loss": 5.1701, + "step": 16462 + }, + { + "epoch": 0.09791012465505756, + "grad_norm": 1.4251139163970947, + "learning_rate": 4.88267630689761e-05, + "loss": 5.0228, + "step": 16463 + }, + { + "epoch": 0.09791607193833857, + "grad_norm": 1.289102554321289, + "learning_rate": 4.882662165149429e-05, + "loss": 5.1934, + "step": 16464 + }, + { + "epoch": 0.09792201922161957, + "grad_norm": 1.1589713096618652, + "learning_rate": 4.882648022569484e-05, + "loss": 5.3388, + "step": 16465 + }, + { + "epoch": 0.09792796650490056, + "grad_norm": 1.1682082414627075, + "learning_rate": 4.8826338791577816e-05, + "loss": 5.2062, + "step": 16466 + }, + { + "epoch": 0.09793391378818156, + "grad_norm": 1.2263107299804688, + "learning_rate": 4.882619734914326e-05, + "loss": 5.414, + "step": 16467 + }, + { + "epoch": 0.09793986107146256, + "grad_norm": 1.2873631715774536, + "learning_rate": 4.882605589839123e-05, + "loss": 5.4286, + "step": 16468 + }, + { + "epoch": 0.09794580835474355, + "grad_norm": 1.2950979471206665, + "learning_rate": 4.882591443932177e-05, + "loss": 5.1603, + "step": 16469 + }, + { + "epoch": 0.09795175563802455, + "grad_norm": 1.5623066425323486, + "learning_rate": 4.882577297193493e-05, + "loss": 5.0778, + "step": 16470 + }, + { + "epoch": 0.09795770292130555, + "grad_norm": 1.5446339845657349, + "learning_rate": 4.882563149623076e-05, + "loss": 5.1451, + "step": 16471 + }, + { + "epoch": 0.09796365020458654, + "grad_norm": 1.599387526512146, + "learning_rate": 4.882549001220931e-05, + "loss": 5.4596, + "step": 16472 + }, + { + "epoch": 0.09796959748786754, + "grad_norm": 1.325596809387207, + "learning_rate": 4.882534851987062e-05, + "loss": 5.4639, + "step": 16473 + }, + { + "epoch": 0.09797554477114855, + "grad_norm": 1.3077852725982666, + "learning_rate": 4.8825207019214746e-05, + "loss": 5.3654, + "step": 16474 + }, + { + "epoch": 0.09798149205442953, + "grad_norm": 1.5500328540802002, + "learning_rate": 4.882506551024174e-05, + "loss": 4.946, + "step": 16475 + }, + { + "epoch": 0.09798743933771054, + "grad_norm": 1.6101415157318115, + "learning_rate": 4.8824923992951656e-05, + "loss": 4.9618, + "step": 16476 + }, + { + "epoch": 0.09799338662099154, + "grad_norm": 1.542837381362915, + "learning_rate": 4.882478246734453e-05, + "loss": 4.9959, + "step": 16477 + }, + { + "epoch": 0.09799933390427253, + "grad_norm": 1.5618165731430054, + "learning_rate": 4.8824640933420424e-05, + "loss": 5.1221, + "step": 16478 + }, + { + "epoch": 0.09800528118755353, + "grad_norm": 1.4425160884857178, + "learning_rate": 4.882449939117938e-05, + "loss": 5.1689, + "step": 16479 + }, + { + "epoch": 0.09801122847083452, + "grad_norm": 1.3621004819869995, + "learning_rate": 4.8824357840621445e-05, + "loss": 4.9975, + "step": 16480 + }, + { + "epoch": 0.09801717575411552, + "grad_norm": 1.5944523811340332, + "learning_rate": 4.882421628174668e-05, + "loss": 5.0296, + "step": 16481 + }, + { + "epoch": 0.09802312303739652, + "grad_norm": 1.391321063041687, + "learning_rate": 4.8824074714555125e-05, + "loss": 5.0139, + "step": 16482 + }, + { + "epoch": 0.09802907032067751, + "grad_norm": 1.2085964679718018, + "learning_rate": 4.882393313904683e-05, + "loss": 5.1125, + "step": 16483 + }, + { + "epoch": 0.09803501760395851, + "grad_norm": 1.391383409500122, + "learning_rate": 4.882379155522185e-05, + "loss": 5.2999, + "step": 16484 + }, + { + "epoch": 0.09804096488723951, + "grad_norm": 1.3748564720153809, + "learning_rate": 4.882364996308023e-05, + "loss": 5.3096, + "step": 16485 + }, + { + "epoch": 0.0980469121705205, + "grad_norm": 1.825728416442871, + "learning_rate": 4.8823508362622014e-05, + "loss": 5.3318, + "step": 16486 + }, + { + "epoch": 0.0980528594538015, + "grad_norm": 1.6402180194854736, + "learning_rate": 4.882336675384726e-05, + "loss": 5.155, + "step": 16487 + }, + { + "epoch": 0.0980588067370825, + "grad_norm": 1.343284249305725, + "learning_rate": 4.882322513675601e-05, + "loss": 4.9341, + "step": 16488 + }, + { + "epoch": 0.09806475402036349, + "grad_norm": 1.3958711624145508, + "learning_rate": 4.882308351134833e-05, + "loss": 4.9595, + "step": 16489 + }, + { + "epoch": 0.0980707013036445, + "grad_norm": 1.572996735572815, + "learning_rate": 4.882294187762425e-05, + "loss": 4.9666, + "step": 16490 + }, + { + "epoch": 0.0980766485869255, + "grad_norm": 1.6167391538619995, + "learning_rate": 4.882280023558383e-05, + "loss": 4.7387, + "step": 16491 + }, + { + "epoch": 0.09808259587020648, + "grad_norm": 2.474092483520508, + "learning_rate": 4.882265858522711e-05, + "loss": 5.1476, + "step": 16492 + }, + { + "epoch": 0.09808854315348749, + "grad_norm": 1.5375875234603882, + "learning_rate": 4.8822516926554155e-05, + "loss": 4.5832, + "step": 16493 + }, + { + "epoch": 0.09809449043676849, + "grad_norm": 1.6802133321762085, + "learning_rate": 4.8822375259565e-05, + "loss": 4.615, + "step": 16494 + }, + { + "epoch": 0.09810043772004948, + "grad_norm": 1.6709486246109009, + "learning_rate": 4.8822233584259703e-05, + "loss": 4.6586, + "step": 16495 + }, + { + "epoch": 0.09810638500333048, + "grad_norm": 1.5207875967025757, + "learning_rate": 4.882209190063831e-05, + "loss": 4.6748, + "step": 16496 + }, + { + "epoch": 0.09811233228661148, + "grad_norm": 1.4980802536010742, + "learning_rate": 4.882195020870087e-05, + "loss": 4.5326, + "step": 16497 + }, + { + "epoch": 0.09811827956989247, + "grad_norm": 1.473092794418335, + "learning_rate": 4.882180850844743e-05, + "loss": 4.6126, + "step": 16498 + }, + { + "epoch": 0.09812422685317347, + "grad_norm": 1.521147608757019, + "learning_rate": 4.8821666799878055e-05, + "loss": 4.6269, + "step": 16499 + }, + { + "epoch": 0.09813017413645447, + "grad_norm": 1.7371230125427246, + "learning_rate": 4.882152508299277e-05, + "loss": 4.6847, + "step": 16500 + }, + { + "epoch": 0.09813612141973546, + "grad_norm": 1.7222683429718018, + "learning_rate": 4.8821383357791636e-05, + "loss": 5.3943, + "step": 16501 + }, + { + "epoch": 0.09814206870301646, + "grad_norm": 1.523373007774353, + "learning_rate": 4.8821241624274705e-05, + "loss": 5.2822, + "step": 16502 + }, + { + "epoch": 0.09814801598629747, + "grad_norm": 1.365224838256836, + "learning_rate": 4.882109988244203e-05, + "loss": 5.1923, + "step": 16503 + }, + { + "epoch": 0.09815396326957845, + "grad_norm": 1.503907322883606, + "learning_rate": 4.882095813229365e-05, + "loss": 5.128, + "step": 16504 + }, + { + "epoch": 0.09815991055285946, + "grad_norm": 1.5996166467666626, + "learning_rate": 4.8820816373829625e-05, + "loss": 4.9296, + "step": 16505 + }, + { + "epoch": 0.09816585783614046, + "grad_norm": 1.373089075088501, + "learning_rate": 4.8820674607049994e-05, + "loss": 5.0614, + "step": 16506 + }, + { + "epoch": 0.09817180511942145, + "grad_norm": 1.3730735778808594, + "learning_rate": 4.882053283195481e-05, + "loss": 5.0374, + "step": 16507 + }, + { + "epoch": 0.09817775240270245, + "grad_norm": 1.2357912063598633, + "learning_rate": 4.882039104854413e-05, + "loss": 5.1513, + "step": 16508 + }, + { + "epoch": 0.09818369968598344, + "grad_norm": 1.402327299118042, + "learning_rate": 4.8820249256817995e-05, + "loss": 5.7344, + "step": 16509 + }, + { + "epoch": 0.09818964696926444, + "grad_norm": 1.3152369260787964, + "learning_rate": 4.882010745677645e-05, + "loss": 5.6755, + "step": 16510 + }, + { + "epoch": 0.09819559425254544, + "grad_norm": 1.409428358078003, + "learning_rate": 4.8819965648419565e-05, + "loss": 5.3562, + "step": 16511 + }, + { + "epoch": 0.09820154153582643, + "grad_norm": 1.3278082609176636, + "learning_rate": 4.881982383174737e-05, + "loss": 5.2401, + "step": 16512 + }, + { + "epoch": 0.09820748881910743, + "grad_norm": 1.287716269493103, + "learning_rate": 4.881968200675991e-05, + "loss": 4.9961, + "step": 16513 + }, + { + "epoch": 0.09821343610238843, + "grad_norm": 1.3444676399230957, + "learning_rate": 4.881954017345727e-05, + "loss": 5.5592, + "step": 16514 + }, + { + "epoch": 0.09821938338566942, + "grad_norm": 1.4815365076065063, + "learning_rate": 4.881939833183945e-05, + "loss": 5.5342, + "step": 16515 + }, + { + "epoch": 0.09822533066895042, + "grad_norm": 1.210050344467163, + "learning_rate": 4.8819256481906536e-05, + "loss": 5.5375, + "step": 16516 + }, + { + "epoch": 0.09823127795223142, + "grad_norm": 2.041801691055298, + "learning_rate": 4.881911462365857e-05, + "loss": 4.601, + "step": 16517 + }, + { + "epoch": 0.09823722523551241, + "grad_norm": 2.196315050125122, + "learning_rate": 4.881897275709558e-05, + "loss": 4.2376, + "step": 16518 + }, + { + "epoch": 0.09824317251879341, + "grad_norm": 2.1649539470672607, + "learning_rate": 4.881883088221765e-05, + "loss": 4.4159, + "step": 16519 + }, + { + "epoch": 0.09824911980207442, + "grad_norm": 2.02476167678833, + "learning_rate": 4.881868899902481e-05, + "loss": 4.4091, + "step": 16520 + }, + { + "epoch": 0.0982550670853554, + "grad_norm": 1.9262346029281616, + "learning_rate": 4.88185471075171e-05, + "loss": 4.4326, + "step": 16521 + }, + { + "epoch": 0.0982610143686364, + "grad_norm": 1.8461369276046753, + "learning_rate": 4.881840520769459e-05, + "loss": 4.1563, + "step": 16522 + }, + { + "epoch": 0.09826696165191741, + "grad_norm": 1.8261640071868896, + "learning_rate": 4.881826329955732e-05, + "loss": 4.3518, + "step": 16523 + }, + { + "epoch": 0.0982729089351984, + "grad_norm": 2.1533737182617188, + "learning_rate": 4.881812138310534e-05, + "loss": 4.292, + "step": 16524 + }, + { + "epoch": 0.0982788562184794, + "grad_norm": 2.11578369140625, + "learning_rate": 4.8817979458338705e-05, + "loss": 4.5411, + "step": 16525 + }, + { + "epoch": 0.0982848035017604, + "grad_norm": 1.8681827783584595, + "learning_rate": 4.881783752525745e-05, + "loss": 5.7264, + "step": 16526 + }, + { + "epoch": 0.09829075078504139, + "grad_norm": 1.98794424533844, + "learning_rate": 4.881769558386163e-05, + "loss": 5.4694, + "step": 16527 + }, + { + "epoch": 0.09829669806832239, + "grad_norm": 2.6389517784118652, + "learning_rate": 4.881755363415131e-05, + "loss": 5.0086, + "step": 16528 + }, + { + "epoch": 0.0983026453516034, + "grad_norm": 2.2565221786499023, + "learning_rate": 4.881741167612653e-05, + "loss": 4.9219, + "step": 16529 + }, + { + "epoch": 0.09830859263488438, + "grad_norm": 1.8296940326690674, + "learning_rate": 4.881726970978733e-05, + "loss": 4.9185, + "step": 16530 + }, + { + "epoch": 0.09831453991816538, + "grad_norm": 2.031334638595581, + "learning_rate": 4.8817127735133774e-05, + "loss": 4.8589, + "step": 16531 + }, + { + "epoch": 0.09832048720144639, + "grad_norm": 1.5883747339248657, + "learning_rate": 4.8816985752165904e-05, + "loss": 5.2695, + "step": 16532 + }, + { + "epoch": 0.09832643448472737, + "grad_norm": 1.4946906566619873, + "learning_rate": 4.8816843760883755e-05, + "loss": 5.6835, + "step": 16533 + }, + { + "epoch": 0.09833238176800838, + "grad_norm": 1.7901808023452759, + "learning_rate": 4.881670176128741e-05, + "loss": 6.1753, + "step": 16534 + }, + { + "epoch": 0.09833832905128938, + "grad_norm": 1.7249737977981567, + "learning_rate": 4.881655975337689e-05, + "loss": 5.86, + "step": 16535 + }, + { + "epoch": 0.09834427633457037, + "grad_norm": 1.8257695436477661, + "learning_rate": 4.8816417737152264e-05, + "loss": 5.1969, + "step": 16536 + }, + { + "epoch": 0.09835022361785137, + "grad_norm": 1.3712751865386963, + "learning_rate": 4.881627571261357e-05, + "loss": 5.7666, + "step": 16537 + }, + { + "epoch": 0.09835617090113236, + "grad_norm": 1.8865090608596802, + "learning_rate": 4.881613367976086e-05, + "loss": 4.8832, + "step": 16538 + }, + { + "epoch": 0.09836211818441336, + "grad_norm": 1.7155808210372925, + "learning_rate": 4.8815991638594175e-05, + "loss": 4.7248, + "step": 16539 + }, + { + "epoch": 0.09836806546769436, + "grad_norm": 1.6654868125915527, + "learning_rate": 4.8815849589113585e-05, + "loss": 4.7095, + "step": 16540 + }, + { + "epoch": 0.09837401275097535, + "grad_norm": 1.6152902841567993, + "learning_rate": 4.881570753131912e-05, + "loss": 5.2894, + "step": 16541 + }, + { + "epoch": 0.09837996003425635, + "grad_norm": 2.1657047271728516, + "learning_rate": 4.8815565465210835e-05, + "loss": 5.9782, + "step": 16542 + }, + { + "epoch": 0.09838590731753735, + "grad_norm": 1.801346778869629, + "learning_rate": 4.88154233907888e-05, + "loss": 5.6683, + "step": 16543 + }, + { + "epoch": 0.09839185460081834, + "grad_norm": 1.7916477918624878, + "learning_rate": 4.881528130805303e-05, + "loss": 5.7056, + "step": 16544 + }, + { + "epoch": 0.09839780188409934, + "grad_norm": 2.1006147861480713, + "learning_rate": 4.881513921700359e-05, + "loss": 5.6315, + "step": 16545 + }, + { + "epoch": 0.09840374916738034, + "grad_norm": 2.3291585445404053, + "learning_rate": 4.8814997117640535e-05, + "loss": 4.8996, + "step": 16546 + }, + { + "epoch": 0.09840969645066133, + "grad_norm": 1.9543695449829102, + "learning_rate": 4.8814855009963916e-05, + "loss": 5.1839, + "step": 16547 + }, + { + "epoch": 0.09841564373394233, + "grad_norm": 2.7100865840911865, + "learning_rate": 4.881471289397378e-05, + "loss": 5.1445, + "step": 16548 + }, + { + "epoch": 0.09842159101722334, + "grad_norm": 2.5749876499176025, + "learning_rate": 4.8814570769670165e-05, + "loss": 5.2023, + "step": 16549 + }, + { + "epoch": 0.09842753830050432, + "grad_norm": 2.079770088195801, + "learning_rate": 4.881442863705313e-05, + "loss": 5.1197, + "step": 16550 + }, + { + "epoch": 0.09843348558378533, + "grad_norm": 1.9495431184768677, + "learning_rate": 4.881428649612272e-05, + "loss": 4.8669, + "step": 16551 + }, + { + "epoch": 0.09843943286706633, + "grad_norm": 2.0918610095977783, + "learning_rate": 4.8814144346879e-05, + "loss": 5.0413, + "step": 16552 + }, + { + "epoch": 0.09844538015034732, + "grad_norm": 2.326662302017212, + "learning_rate": 4.8814002189322e-05, + "loss": 5.0085, + "step": 16553 + }, + { + "epoch": 0.09845132743362832, + "grad_norm": 2.3819150924682617, + "learning_rate": 4.881386002345178e-05, + "loss": 4.8364, + "step": 16554 + }, + { + "epoch": 0.09845727471690932, + "grad_norm": 2.6585230827331543, + "learning_rate": 4.881371784926839e-05, + "loss": 5.1722, + "step": 16555 + }, + { + "epoch": 0.09846322200019031, + "grad_norm": 2.209075689315796, + "learning_rate": 4.881357566677187e-05, + "loss": 5.0474, + "step": 16556 + }, + { + "epoch": 0.09846916928347131, + "grad_norm": 1.9725440740585327, + "learning_rate": 4.881343347596229e-05, + "loss": 5.0361, + "step": 16557 + }, + { + "epoch": 0.09847511656675231, + "grad_norm": 2.0074071884155273, + "learning_rate": 4.881329127683968e-05, + "loss": 5.5143, + "step": 16558 + }, + { + "epoch": 0.0984810638500333, + "grad_norm": 1.8329545259475708, + "learning_rate": 4.8813149069404093e-05, + "loss": 5.8843, + "step": 16559 + }, + { + "epoch": 0.0984870111333143, + "grad_norm": 2.2991678714752197, + "learning_rate": 4.881300685365558e-05, + "loss": 4.6178, + "step": 16560 + }, + { + "epoch": 0.0984929584165953, + "grad_norm": 2.7643637657165527, + "learning_rate": 4.881286462959419e-05, + "loss": 4.1381, + "step": 16561 + }, + { + "epoch": 0.0984989056998763, + "grad_norm": 2.5811941623687744, + "learning_rate": 4.8812722397219985e-05, + "loss": 3.8026, + "step": 16562 + }, + { + "epoch": 0.0985048529831573, + "grad_norm": 2.1111907958984375, + "learning_rate": 4.8812580156533e-05, + "loss": 4.0149, + "step": 16563 + }, + { + "epoch": 0.0985108002664383, + "grad_norm": 2.229973793029785, + "learning_rate": 4.8812437907533294e-05, + "loss": 4.24, + "step": 16564 + }, + { + "epoch": 0.09851674754971929, + "grad_norm": 1.6310914754867554, + "learning_rate": 4.8812295650220905e-05, + "loss": 5.9476, + "step": 16565 + }, + { + "epoch": 0.09852269483300029, + "grad_norm": 1.7397875785827637, + "learning_rate": 4.881215338459589e-05, + "loss": 5.8527, + "step": 16566 + }, + { + "epoch": 0.09852864211628128, + "grad_norm": 1.8279019594192505, + "learning_rate": 4.88120111106583e-05, + "loss": 5.5869, + "step": 16567 + }, + { + "epoch": 0.09853458939956228, + "grad_norm": 1.6956331729888916, + "learning_rate": 4.881186882840818e-05, + "loss": 5.6508, + "step": 16568 + }, + { + "epoch": 0.09854053668284328, + "grad_norm": 1.619205355644226, + "learning_rate": 4.881172653784559e-05, + "loss": 5.6502, + "step": 16569 + }, + { + "epoch": 0.09854648396612427, + "grad_norm": 1.4612733125686646, + "learning_rate": 4.881158423897057e-05, + "loss": 5.5937, + "step": 16570 + }, + { + "epoch": 0.09855243124940527, + "grad_norm": 1.4997358322143555, + "learning_rate": 4.8811441931783165e-05, + "loss": 5.5865, + "step": 16571 + }, + { + "epoch": 0.09855837853268627, + "grad_norm": 1.6516716480255127, + "learning_rate": 4.8811299616283434e-05, + "loss": 5.4031, + "step": 16572 + }, + { + "epoch": 0.09856432581596726, + "grad_norm": 1.5714633464813232, + "learning_rate": 4.881115729247143e-05, + "loss": 5.4543, + "step": 16573 + }, + { + "epoch": 0.09857027309924826, + "grad_norm": 1.4891443252563477, + "learning_rate": 4.881101496034719e-05, + "loss": 5.5687, + "step": 16574 + }, + { + "epoch": 0.09857622038252926, + "grad_norm": 1.3504915237426758, + "learning_rate": 4.8810872619910773e-05, + "loss": 5.5777, + "step": 16575 + }, + { + "epoch": 0.09858216766581025, + "grad_norm": 1.5825836658477783, + "learning_rate": 4.881073027116223e-05, + "loss": 5.547, + "step": 16576 + }, + { + "epoch": 0.09858811494909125, + "grad_norm": 1.4398233890533447, + "learning_rate": 4.8810587914101607e-05, + "loss": 5.4707, + "step": 16577 + }, + { + "epoch": 0.09859406223237226, + "grad_norm": 1.6776020526885986, + "learning_rate": 4.881044554872895e-05, + "loss": 5.4879, + "step": 16578 + }, + { + "epoch": 0.09860000951565324, + "grad_norm": 1.417771339416504, + "learning_rate": 4.8810303175044316e-05, + "loss": 5.5362, + "step": 16579 + }, + { + "epoch": 0.09860595679893425, + "grad_norm": 1.4919921159744263, + "learning_rate": 4.881016079304775e-05, + "loss": 5.5289, + "step": 16580 + }, + { + "epoch": 0.09861190408221525, + "grad_norm": 1.6195905208587646, + "learning_rate": 4.88100184027393e-05, + "loss": 5.467, + "step": 16581 + }, + { + "epoch": 0.09861785136549624, + "grad_norm": 1.5255846977233887, + "learning_rate": 4.880987600411902e-05, + "loss": 6.268, + "step": 16582 + }, + { + "epoch": 0.09862379864877724, + "grad_norm": 1.5051823854446411, + "learning_rate": 4.880973359718696e-05, + "loss": 6.024, + "step": 16583 + }, + { + "epoch": 0.09862974593205824, + "grad_norm": 2.455932378768921, + "learning_rate": 4.880959118194317e-05, + "loss": 5.0881, + "step": 16584 + }, + { + "epoch": 0.09863569321533923, + "grad_norm": 2.3916566371917725, + "learning_rate": 4.880944875838769e-05, + "loss": 5.0897, + "step": 16585 + }, + { + "epoch": 0.09864164049862023, + "grad_norm": 2.0487334728240967, + "learning_rate": 4.880930632652058e-05, + "loss": 5.603, + "step": 16586 + }, + { + "epoch": 0.09864758778190123, + "grad_norm": 1.9195282459259033, + "learning_rate": 4.880916388634189e-05, + "loss": 5.6492, + "step": 16587 + }, + { + "epoch": 0.09865353506518222, + "grad_norm": 1.743602991104126, + "learning_rate": 4.880902143785166e-05, + "loss": 5.7378, + "step": 16588 + }, + { + "epoch": 0.09865948234846322, + "grad_norm": 1.913156509399414, + "learning_rate": 4.880887898104996e-05, + "loss": 5.6267, + "step": 16589 + }, + { + "epoch": 0.09866542963174423, + "grad_norm": 1.8759669065475464, + "learning_rate": 4.880873651593681e-05, + "loss": 5.5593, + "step": 16590 + }, + { + "epoch": 0.09867137691502521, + "grad_norm": 1.8475536108016968, + "learning_rate": 4.880859404251229e-05, + "loss": 5.5021, + "step": 16591 + }, + { + "epoch": 0.09867732419830622, + "grad_norm": 1.5235642194747925, + "learning_rate": 4.880845156077643e-05, + "loss": 5.4692, + "step": 16592 + }, + { + "epoch": 0.09868327148158722, + "grad_norm": 1.8132069110870361, + "learning_rate": 4.8808309070729294e-05, + "loss": 5.6067, + "step": 16593 + }, + { + "epoch": 0.0986892187648682, + "grad_norm": 1.8001697063446045, + "learning_rate": 4.880816657237091e-05, + "loss": 5.749, + "step": 16594 + }, + { + "epoch": 0.09869516604814921, + "grad_norm": 1.8349007368087769, + "learning_rate": 4.8808024065701354e-05, + "loss": 5.6596, + "step": 16595 + }, + { + "epoch": 0.0987011133314302, + "grad_norm": 1.5677918195724487, + "learning_rate": 4.880788155072065e-05, + "loss": 5.725, + "step": 16596 + }, + { + "epoch": 0.0987070606147112, + "grad_norm": 1.8379719257354736, + "learning_rate": 4.880773902742887e-05, + "loss": 5.4325, + "step": 16597 + }, + { + "epoch": 0.0987130078979922, + "grad_norm": 1.8847566843032837, + "learning_rate": 4.880759649582605e-05, + "loss": 5.5737, + "step": 16598 + }, + { + "epoch": 0.09871895518127319, + "grad_norm": 2.398552417755127, + "learning_rate": 4.8807453955912244e-05, + "loss": 5.4192, + "step": 16599 + }, + { + "epoch": 0.09872490246455419, + "grad_norm": 1.990404486656189, + "learning_rate": 4.8807311407687494e-05, + "loss": 5.4624, + "step": 16600 + }, + { + "epoch": 0.09873084974783519, + "grad_norm": 1.533575177192688, + "learning_rate": 4.880716885115187e-05, + "loss": 5.8242, + "step": 16601 + }, + { + "epoch": 0.09873679703111618, + "grad_norm": 1.7357563972473145, + "learning_rate": 4.88070262863054e-05, + "loss": 5.9343, + "step": 16602 + }, + { + "epoch": 0.09874274431439718, + "grad_norm": 1.8504372835159302, + "learning_rate": 4.880688371314816e-05, + "loss": 5.6685, + "step": 16603 + }, + { + "epoch": 0.09874869159767818, + "grad_norm": 2.5040910243988037, + "learning_rate": 4.880674113168016e-05, + "loss": 5.1591, + "step": 16604 + }, + { + "epoch": 0.09875463888095917, + "grad_norm": 2.7820568084716797, + "learning_rate": 4.880659854190148e-05, + "loss": 5.0528, + "step": 16605 + }, + { + "epoch": 0.09876058616424017, + "grad_norm": 2.004427909851074, + "learning_rate": 4.8806455943812165e-05, + "loss": 5.6251, + "step": 16606 + }, + { + "epoch": 0.09876653344752118, + "grad_norm": 1.8053330183029175, + "learning_rate": 4.880631333741227e-05, + "loss": 5.5293, + "step": 16607 + }, + { + "epoch": 0.09877248073080216, + "grad_norm": 1.6708273887634277, + "learning_rate": 4.8806170722701824e-05, + "loss": 6.1215, + "step": 16608 + }, + { + "epoch": 0.09877842801408317, + "grad_norm": 1.6344959735870361, + "learning_rate": 4.88060280996809e-05, + "loss": 6.191, + "step": 16609 + }, + { + "epoch": 0.09878437529736417, + "grad_norm": 1.68915593624115, + "learning_rate": 4.880588546834953e-05, + "loss": 5.9302, + "step": 16610 + }, + { + "epoch": 0.09879032258064516, + "grad_norm": 2.108917236328125, + "learning_rate": 4.8805742828707777e-05, + "loss": 5.5227, + "step": 16611 + }, + { + "epoch": 0.09879626986392616, + "grad_norm": 1.7772480249404907, + "learning_rate": 4.8805600180755685e-05, + "loss": 5.5694, + "step": 16612 + }, + { + "epoch": 0.09880221714720716, + "grad_norm": 1.629629135131836, + "learning_rate": 4.8805457524493305e-05, + "loss": 5.7881, + "step": 16613 + }, + { + "epoch": 0.09880816443048815, + "grad_norm": 1.8985555171966553, + "learning_rate": 4.880531485992068e-05, + "loss": 5.5357, + "step": 16614 + }, + { + "epoch": 0.09881411171376915, + "grad_norm": 2.5329599380493164, + "learning_rate": 4.880517218703786e-05, + "loss": 4.8959, + "step": 16615 + }, + { + "epoch": 0.09882005899705015, + "grad_norm": 2.408377170562744, + "learning_rate": 4.8805029505844915e-05, + "loss": 4.9581, + "step": 16616 + }, + { + "epoch": 0.09882600628033114, + "grad_norm": 2.125190258026123, + "learning_rate": 4.880488681634187e-05, + "loss": 4.4116, + "step": 16617 + }, + { + "epoch": 0.09883195356361214, + "grad_norm": 2.153186082839966, + "learning_rate": 4.880474411852879e-05, + "loss": 4.2887, + "step": 16618 + }, + { + "epoch": 0.09883790084689315, + "grad_norm": 2.3961498737335205, + "learning_rate": 4.880460141240571e-05, + "loss": 4.6521, + "step": 16619 + }, + { + "epoch": 0.09884384813017413, + "grad_norm": 2.4282264709472656, + "learning_rate": 4.880445869797271e-05, + "loss": 4.6307, + "step": 16620 + }, + { + "epoch": 0.09884979541345514, + "grad_norm": 2.461005687713623, + "learning_rate": 4.88043159752298e-05, + "loss": 4.4234, + "step": 16621 + }, + { + "epoch": 0.09885574269673614, + "grad_norm": 2.5483081340789795, + "learning_rate": 4.8804173244177056e-05, + "loss": 4.2688, + "step": 16622 + }, + { + "epoch": 0.09886168998001713, + "grad_norm": 2.370413303375244, + "learning_rate": 4.8804030504814524e-05, + "loss": 4.4887, + "step": 16623 + }, + { + "epoch": 0.09886763726329813, + "grad_norm": 2.681118965148926, + "learning_rate": 4.880388775714225e-05, + "loss": 4.2941, + "step": 16624 + }, + { + "epoch": 0.09887358454657912, + "grad_norm": 2.1210896968841553, + "learning_rate": 4.8803745001160284e-05, + "loss": 5.1994, + "step": 16625 + }, + { + "epoch": 0.09887953182986012, + "grad_norm": 1.703626275062561, + "learning_rate": 4.880360223686867e-05, + "loss": 5.5578, + "step": 16626 + }, + { + "epoch": 0.09888547911314112, + "grad_norm": 1.5515342950820923, + "learning_rate": 4.8803459464267475e-05, + "loss": 5.6636, + "step": 16627 + }, + { + "epoch": 0.09889142639642211, + "grad_norm": 1.2145434617996216, + "learning_rate": 4.880331668335673e-05, + "loss": 5.3634, + "step": 16628 + }, + { + "epoch": 0.09889737367970311, + "grad_norm": 1.2893304824829102, + "learning_rate": 4.88031738941365e-05, + "loss": 5.5383, + "step": 16629 + }, + { + "epoch": 0.09890332096298411, + "grad_norm": 3.1206297874450684, + "learning_rate": 4.880303109660682e-05, + "loss": 4.9313, + "step": 16630 + }, + { + "epoch": 0.0989092682462651, + "grad_norm": 3.382498264312744, + "learning_rate": 4.8802888290767756e-05, + "loss": 4.4475, + "step": 16631 + }, + { + "epoch": 0.0989152155295461, + "grad_norm": 1.8280858993530273, + "learning_rate": 4.880274547661934e-05, + "loss": 5.6722, + "step": 16632 + }, + { + "epoch": 0.0989211628128271, + "grad_norm": 2.0412793159484863, + "learning_rate": 4.880260265416164e-05, + "loss": 5.3952, + "step": 16633 + }, + { + "epoch": 0.09892711009610809, + "grad_norm": 2.0702524185180664, + "learning_rate": 4.880245982339469e-05, + "loss": 5.2754, + "step": 16634 + }, + { + "epoch": 0.0989330573793891, + "grad_norm": 1.7081348896026611, + "learning_rate": 4.880231698431855e-05, + "loss": 5.8414, + "step": 16635 + }, + { + "epoch": 0.0989390046626701, + "grad_norm": 1.7762012481689453, + "learning_rate": 4.880217413693328e-05, + "loss": 6.0106, + "step": 16636 + }, + { + "epoch": 0.09894495194595108, + "grad_norm": 1.815253496170044, + "learning_rate": 4.8802031281238895e-05, + "loss": 5.9715, + "step": 16637 + }, + { + "epoch": 0.09895089922923209, + "grad_norm": 1.8652589321136475, + "learning_rate": 4.880188841723548e-05, + "loss": 5.9437, + "step": 16638 + }, + { + "epoch": 0.09895684651251309, + "grad_norm": 1.687664270401001, + "learning_rate": 4.8801745544923075e-05, + "loss": 6.0776, + "step": 16639 + }, + { + "epoch": 0.09896279379579408, + "grad_norm": 1.579231858253479, + "learning_rate": 4.880160266430171e-05, + "loss": 6.0486, + "step": 16640 + }, + { + "epoch": 0.09896874107907508, + "grad_norm": 1.711932897567749, + "learning_rate": 4.8801459775371464e-05, + "loss": 5.7954, + "step": 16641 + }, + { + "epoch": 0.09897468836235608, + "grad_norm": 2.022918939590454, + "learning_rate": 4.880131687813237e-05, + "loss": 5.4453, + "step": 16642 + }, + { + "epoch": 0.09898063564563707, + "grad_norm": 2.4682674407958984, + "learning_rate": 4.880117397258449e-05, + "loss": 5.2084, + "step": 16643 + }, + { + "epoch": 0.09898658292891807, + "grad_norm": 2.7558486461639404, + "learning_rate": 4.880103105872786e-05, + "loss": 4.8931, + "step": 16644 + }, + { + "epoch": 0.09899253021219907, + "grad_norm": 1.8757295608520508, + "learning_rate": 4.880088813656253e-05, + "loss": 5.4484, + "step": 16645 + }, + { + "epoch": 0.09899847749548006, + "grad_norm": 2.0811331272125244, + "learning_rate": 4.880074520608857e-05, + "loss": 5.8003, + "step": 16646 + }, + { + "epoch": 0.09900442477876106, + "grad_norm": 1.9147615432739258, + "learning_rate": 4.880060226730601e-05, + "loss": 5.869, + "step": 16647 + }, + { + "epoch": 0.09901037206204207, + "grad_norm": 1.974865436553955, + "learning_rate": 4.88004593202149e-05, + "loss": 5.5896, + "step": 16648 + }, + { + "epoch": 0.09901631934532305, + "grad_norm": 1.8365596532821655, + "learning_rate": 4.88003163648153e-05, + "loss": 5.5321, + "step": 16649 + }, + { + "epoch": 0.09902226662860406, + "grad_norm": 1.5927996635437012, + "learning_rate": 4.8800173401107255e-05, + "loss": 5.49, + "step": 16650 + }, + { + "epoch": 0.09902821391188506, + "grad_norm": 1.7566391229629517, + "learning_rate": 4.880003042909081e-05, + "loss": 5.49, + "step": 16651 + }, + { + "epoch": 0.09903416119516605, + "grad_norm": 1.718018651008606, + "learning_rate": 4.879988744876602e-05, + "loss": 5.4515, + "step": 16652 + }, + { + "epoch": 0.09904010847844705, + "grad_norm": 1.8946046829223633, + "learning_rate": 4.879974446013295e-05, + "loss": 4.9902, + "step": 16653 + }, + { + "epoch": 0.09904605576172804, + "grad_norm": 1.939060926437378, + "learning_rate": 4.879960146319162e-05, + "loss": 5.2067, + "step": 16654 + }, + { + "epoch": 0.09905200304500904, + "grad_norm": 1.6621825695037842, + "learning_rate": 4.8799458457942106e-05, + "loss": 5.0041, + "step": 16655 + }, + { + "epoch": 0.09905795032829004, + "grad_norm": 1.8790650367736816, + "learning_rate": 4.879931544438444e-05, + "loss": 4.6893, + "step": 16656 + }, + { + "epoch": 0.09906389761157103, + "grad_norm": 2.20035982131958, + "learning_rate": 4.879917242251868e-05, + "loss": 4.4463, + "step": 16657 + }, + { + "epoch": 0.09906984489485203, + "grad_norm": 1.4379361867904663, + "learning_rate": 4.879902939234487e-05, + "loss": 4.993, + "step": 16658 + }, + { + "epoch": 0.09907579217813303, + "grad_norm": 2.2738726139068604, + "learning_rate": 4.879888635386307e-05, + "loss": 5.108, + "step": 16659 + }, + { + "epoch": 0.09908173946141402, + "grad_norm": 2.0921952724456787, + "learning_rate": 4.8798743307073325e-05, + "loss": 5.3023, + "step": 16660 + }, + { + "epoch": 0.09908768674469502, + "grad_norm": 1.894437313079834, + "learning_rate": 4.8798600251975684e-05, + "loss": 5.2797, + "step": 16661 + }, + { + "epoch": 0.09909363402797602, + "grad_norm": 1.6831610202789307, + "learning_rate": 4.87984571885702e-05, + "loss": 5.3342, + "step": 16662 + }, + { + "epoch": 0.09909958131125701, + "grad_norm": 1.9177473783493042, + "learning_rate": 4.879831411685691e-05, + "loss": 5.2245, + "step": 16663 + }, + { + "epoch": 0.09910552859453801, + "grad_norm": 1.8289183378219604, + "learning_rate": 4.879817103683589e-05, + "loss": 5.2411, + "step": 16664 + }, + { + "epoch": 0.09911147587781902, + "grad_norm": 1.7047971487045288, + "learning_rate": 4.8798027948507166e-05, + "loss": 5.1896, + "step": 16665 + }, + { + "epoch": 0.0991174231611, + "grad_norm": 1.5395535230636597, + "learning_rate": 4.87978848518708e-05, + "loss": 5.0688, + "step": 16666 + }, + { + "epoch": 0.099123370444381, + "grad_norm": 1.652870535850525, + "learning_rate": 4.879774174692683e-05, + "loss": 5.1786, + "step": 16667 + }, + { + "epoch": 0.09912931772766201, + "grad_norm": 1.7581889629364014, + "learning_rate": 4.8797598633675326e-05, + "loss": 5.0549, + "step": 16668 + }, + { + "epoch": 0.099135265010943, + "grad_norm": 1.6056864261627197, + "learning_rate": 4.8797455512116315e-05, + "loss": 5.0516, + "step": 16669 + }, + { + "epoch": 0.099141212294224, + "grad_norm": 1.8067295551300049, + "learning_rate": 4.879731238224986e-05, + "loss": 5.0642, + "step": 16670 + }, + { + "epoch": 0.099147159577505, + "grad_norm": 1.7332173585891724, + "learning_rate": 4.8797169244076016e-05, + "loss": 5.0361, + "step": 16671 + }, + { + "epoch": 0.09915310686078599, + "grad_norm": 1.64972984790802, + "learning_rate": 4.879702609759482e-05, + "loss": 5.0521, + "step": 16672 + }, + { + "epoch": 0.09915905414406699, + "grad_norm": 1.8066579103469849, + "learning_rate": 4.879688294280633e-05, + "loss": 5.1431, + "step": 16673 + }, + { + "epoch": 0.09916500142734799, + "grad_norm": 2.093921661376953, + "learning_rate": 4.879673977971059e-05, + "loss": 5.4831, + "step": 16674 + }, + { + "epoch": 0.09917094871062898, + "grad_norm": 2.1563215255737305, + "learning_rate": 4.879659660830766e-05, + "loss": 5.4992, + "step": 16675 + }, + { + "epoch": 0.09917689599390998, + "grad_norm": 1.9041906595230103, + "learning_rate": 4.8796453428597585e-05, + "loss": 6.0952, + "step": 16676 + }, + { + "epoch": 0.09918284327719099, + "grad_norm": 1.7259836196899414, + "learning_rate": 4.879631024058041e-05, + "loss": 5.9602, + "step": 16677 + }, + { + "epoch": 0.09918879056047197, + "grad_norm": 2.075324058532715, + "learning_rate": 4.879616704425619e-05, + "loss": 5.1186, + "step": 16678 + }, + { + "epoch": 0.09919473784375298, + "grad_norm": 2.243378162384033, + "learning_rate": 4.8796023839624975e-05, + "loss": 4.8764, + "step": 16679 + }, + { + "epoch": 0.09920068512703398, + "grad_norm": 1.8717987537384033, + "learning_rate": 4.879588062668681e-05, + "loss": 5.6084, + "step": 16680 + }, + { + "epoch": 0.09920663241031497, + "grad_norm": 1.8316127061843872, + "learning_rate": 4.879573740544175e-05, + "loss": 5.5613, + "step": 16681 + }, + { + "epoch": 0.09921257969359597, + "grad_norm": 1.7016340494155884, + "learning_rate": 4.879559417588985e-05, + "loss": 5.5577, + "step": 16682 + }, + { + "epoch": 0.09921852697687697, + "grad_norm": 2.2173359394073486, + "learning_rate": 4.879545093803115e-05, + "loss": 4.9591, + "step": 16683 + }, + { + "epoch": 0.09922447426015796, + "grad_norm": 1.9507017135620117, + "learning_rate": 4.87953076918657e-05, + "loss": 5.6648, + "step": 16684 + }, + { + "epoch": 0.09923042154343896, + "grad_norm": 1.6124898195266724, + "learning_rate": 4.879516443739356e-05, + "loss": 6.0163, + "step": 16685 + }, + { + "epoch": 0.09923636882671995, + "grad_norm": 1.5823163986206055, + "learning_rate": 4.879502117461477e-05, + "loss": 5.868, + "step": 16686 + }, + { + "epoch": 0.09924231611000095, + "grad_norm": 1.608522653579712, + "learning_rate": 4.879487790352938e-05, + "loss": 5.7482, + "step": 16687 + }, + { + "epoch": 0.09924826339328195, + "grad_norm": 1.783008337020874, + "learning_rate": 4.879473462413745e-05, + "loss": 5.2352, + "step": 16688 + }, + { + "epoch": 0.09925421067656294, + "grad_norm": 1.8089349269866943, + "learning_rate": 4.8794591336439024e-05, + "loss": 5.1793, + "step": 16689 + }, + { + "epoch": 0.09926015795984394, + "grad_norm": 1.5393356084823608, + "learning_rate": 4.879444804043415e-05, + "loss": 5.4802, + "step": 16690 + }, + { + "epoch": 0.09926610524312494, + "grad_norm": 1.7046642303466797, + "learning_rate": 4.8794304736122886e-05, + "loss": 5.8368, + "step": 16691 + }, + { + "epoch": 0.09927205252640593, + "grad_norm": 1.7474054098129272, + "learning_rate": 4.879416142350527e-05, + "loss": 5.7578, + "step": 16692 + }, + { + "epoch": 0.09927799980968693, + "grad_norm": 1.9804757833480835, + "learning_rate": 4.879401810258136e-05, + "loss": 5.691, + "step": 16693 + }, + { + "epoch": 0.09928394709296794, + "grad_norm": 1.7752422094345093, + "learning_rate": 4.87938747733512e-05, + "loss": 5.2478, + "step": 16694 + }, + { + "epoch": 0.09928989437624892, + "grad_norm": 1.8842644691467285, + "learning_rate": 4.879373143581485e-05, + "loss": 5.2061, + "step": 16695 + }, + { + "epoch": 0.09929584165952993, + "grad_norm": 1.6537442207336426, + "learning_rate": 4.8793588089972355e-05, + "loss": 5.215, + "step": 16696 + }, + { + "epoch": 0.09930178894281093, + "grad_norm": 1.5108014345169067, + "learning_rate": 4.8793444735823755e-05, + "loss": 5.2327, + "step": 16697 + }, + { + "epoch": 0.09930773622609192, + "grad_norm": 1.4653078317642212, + "learning_rate": 4.8793301373369116e-05, + "loss": 5.219, + "step": 16698 + }, + { + "epoch": 0.09931368350937292, + "grad_norm": 1.3908593654632568, + "learning_rate": 4.879315800260848e-05, + "loss": 5.1597, + "step": 16699 + }, + { + "epoch": 0.09931963079265392, + "grad_norm": 1.3809629678726196, + "learning_rate": 4.87930146235419e-05, + "loss": 5.2364, + "step": 16700 + }, + { + "epoch": 0.09932557807593491, + "grad_norm": 1.741685152053833, + "learning_rate": 4.879287123616943e-05, + "loss": 5.7777, + "step": 16701 + }, + { + "epoch": 0.09933152535921591, + "grad_norm": 1.7733122110366821, + "learning_rate": 4.879272784049111e-05, + "loss": 5.4035, + "step": 16702 + }, + { + "epoch": 0.09933747264249691, + "grad_norm": 1.4871195554733276, + "learning_rate": 4.8792584436506985e-05, + "loss": 4.961, + "step": 16703 + }, + { + "epoch": 0.0993434199257779, + "grad_norm": 1.6865509748458862, + "learning_rate": 4.8792441024217115e-05, + "loss": 4.9876, + "step": 16704 + }, + { + "epoch": 0.0993493672090589, + "grad_norm": 1.6606428623199463, + "learning_rate": 4.879229760362156e-05, + "loss": 5.1431, + "step": 16705 + }, + { + "epoch": 0.0993553144923399, + "grad_norm": 1.6394522190093994, + "learning_rate": 4.879215417472036e-05, + "loss": 5.223, + "step": 16706 + }, + { + "epoch": 0.0993612617756209, + "grad_norm": 1.6220464706420898, + "learning_rate": 4.879201073751356e-05, + "loss": 5.322, + "step": 16707 + }, + { + "epoch": 0.0993672090589019, + "grad_norm": 1.4539369344711304, + "learning_rate": 4.879186729200121e-05, + "loss": 5.1935, + "step": 16708 + }, + { + "epoch": 0.0993731563421829, + "grad_norm": 1.7421495914459229, + "learning_rate": 4.8791723838183376e-05, + "loss": 5.0639, + "step": 16709 + }, + { + "epoch": 0.09937910362546389, + "grad_norm": 1.5782475471496582, + "learning_rate": 4.8791580376060085e-05, + "loss": 5.8221, + "step": 16710 + }, + { + "epoch": 0.09938505090874489, + "grad_norm": 1.6991766691207886, + "learning_rate": 4.879143690563141e-05, + "loss": 5.9037, + "step": 16711 + }, + { + "epoch": 0.09939099819202589, + "grad_norm": 1.7815147638320923, + "learning_rate": 4.879129342689739e-05, + "loss": 5.668, + "step": 16712 + }, + { + "epoch": 0.09939694547530688, + "grad_norm": 1.6047189235687256, + "learning_rate": 4.879114993985806e-05, + "loss": 5.3005, + "step": 16713 + }, + { + "epoch": 0.09940289275858788, + "grad_norm": 1.8050780296325684, + "learning_rate": 4.87910064445135e-05, + "loss": 5.4931, + "step": 16714 + }, + { + "epoch": 0.09940884004186887, + "grad_norm": 2.010920286178589, + "learning_rate": 4.8790862940863744e-05, + "loss": 5.6301, + "step": 16715 + }, + { + "epoch": 0.09941478732514987, + "grad_norm": 1.443099856376648, + "learning_rate": 4.879071942890884e-05, + "loss": 5.9498, + "step": 16716 + }, + { + "epoch": 0.09942073460843087, + "grad_norm": 1.777207612991333, + "learning_rate": 4.879057590864885e-05, + "loss": 5.2754, + "step": 16717 + }, + { + "epoch": 0.09942668189171186, + "grad_norm": 2.314602851867676, + "learning_rate": 4.87904323800838e-05, + "loss": 5.1447, + "step": 16718 + }, + { + "epoch": 0.09943262917499286, + "grad_norm": 1.4886807203292847, + "learning_rate": 4.879028884321377e-05, + "loss": 5.5389, + "step": 16719 + }, + { + "epoch": 0.09943857645827386, + "grad_norm": 1.4403626918792725, + "learning_rate": 4.879014529803879e-05, + "loss": 5.5377, + "step": 16720 + }, + { + "epoch": 0.09944452374155485, + "grad_norm": 1.570827841758728, + "learning_rate": 4.8790001744558916e-05, + "loss": 5.2541, + "step": 16721 + }, + { + "epoch": 0.09945047102483585, + "grad_norm": 1.6352084875106812, + "learning_rate": 4.87898581827742e-05, + "loss": 4.9031, + "step": 16722 + }, + { + "epoch": 0.09945641830811686, + "grad_norm": 1.864465594291687, + "learning_rate": 4.878971461268469e-05, + "loss": 4.8689, + "step": 16723 + }, + { + "epoch": 0.09946236559139784, + "grad_norm": 1.5618411302566528, + "learning_rate": 4.878957103429044e-05, + "loss": 5.4576, + "step": 16724 + }, + { + "epoch": 0.09946831287467885, + "grad_norm": 1.6910091638565063, + "learning_rate": 4.8789427447591486e-05, + "loss": 5.557, + "step": 16725 + }, + { + "epoch": 0.09947426015795985, + "grad_norm": 1.708056926727295, + "learning_rate": 4.8789283852587895e-05, + "loss": 5.5343, + "step": 16726 + }, + { + "epoch": 0.09948020744124084, + "grad_norm": 1.5828802585601807, + "learning_rate": 4.878914024927971e-05, + "loss": 5.3913, + "step": 16727 + }, + { + "epoch": 0.09948615472452184, + "grad_norm": 1.6802269220352173, + "learning_rate": 4.878899663766698e-05, + "loss": 5.4407, + "step": 16728 + }, + { + "epoch": 0.09949210200780284, + "grad_norm": 2.0542306900024414, + "learning_rate": 4.8788853017749766e-05, + "loss": 4.9265, + "step": 16729 + }, + { + "epoch": 0.09949804929108383, + "grad_norm": 2.035903215408325, + "learning_rate": 4.87887093895281e-05, + "loss": 5.1802, + "step": 16730 + }, + { + "epoch": 0.09950399657436483, + "grad_norm": 1.7885538339614868, + "learning_rate": 4.8788565753002044e-05, + "loss": 5.5238, + "step": 16731 + }, + { + "epoch": 0.09950994385764583, + "grad_norm": 1.606881022453308, + "learning_rate": 4.878842210817165e-05, + "loss": 5.805, + "step": 16732 + }, + { + "epoch": 0.09951589114092682, + "grad_norm": 1.6354256868362427, + "learning_rate": 4.8788278455036956e-05, + "loss": 5.7968, + "step": 16733 + }, + { + "epoch": 0.09952183842420782, + "grad_norm": 1.7537651062011719, + "learning_rate": 4.8788134793598024e-05, + "loss": 5.5945, + "step": 16734 + }, + { + "epoch": 0.09952778570748883, + "grad_norm": 2.149411678314209, + "learning_rate": 4.8787991123854895e-05, + "loss": 4.7458, + "step": 16735 + }, + { + "epoch": 0.09953373299076981, + "grad_norm": 1.9956060647964478, + "learning_rate": 4.878784744580763e-05, + "loss": 4.9471, + "step": 16736 + }, + { + "epoch": 0.09953968027405082, + "grad_norm": 2.0445396900177, + "learning_rate": 4.878770375945627e-05, + "loss": 4.9063, + "step": 16737 + }, + { + "epoch": 0.09954562755733182, + "grad_norm": 1.8563852310180664, + "learning_rate": 4.878756006480088e-05, + "loss": 5.8788, + "step": 16738 + }, + { + "epoch": 0.0995515748406128, + "grad_norm": 1.8931719064712524, + "learning_rate": 4.8787416361841474e-05, + "loss": 6.0917, + "step": 16739 + }, + { + "epoch": 0.09955752212389381, + "grad_norm": 2.062368869781494, + "learning_rate": 4.878727265057814e-05, + "loss": 5.0113, + "step": 16740 + }, + { + "epoch": 0.09956346940717481, + "grad_norm": 1.7274762392044067, + "learning_rate": 4.878712893101092e-05, + "loss": 5.7383, + "step": 16741 + }, + { + "epoch": 0.0995694166904558, + "grad_norm": 1.7377746105194092, + "learning_rate": 4.878698520313986e-05, + "loss": 5.5545, + "step": 16742 + }, + { + "epoch": 0.0995753639737368, + "grad_norm": 1.8383115530014038, + "learning_rate": 4.8786841466965e-05, + "loss": 5.2297, + "step": 16743 + }, + { + "epoch": 0.09958131125701779, + "grad_norm": 1.7715762853622437, + "learning_rate": 4.8786697722486405e-05, + "loss": 5.4735, + "step": 16744 + }, + { + "epoch": 0.09958725854029879, + "grad_norm": 1.8447803258895874, + "learning_rate": 4.878655396970412e-05, + "loss": 5.25, + "step": 16745 + }, + { + "epoch": 0.09959320582357979, + "grad_norm": 2.215622663497925, + "learning_rate": 4.878641020861819e-05, + "loss": 4.8387, + "step": 16746 + }, + { + "epoch": 0.09959915310686078, + "grad_norm": 1.71353018283844, + "learning_rate": 4.878626643922867e-05, + "loss": 5.6831, + "step": 16747 + }, + { + "epoch": 0.09960510039014178, + "grad_norm": 1.8424171209335327, + "learning_rate": 4.8786122661535616e-05, + "loss": 5.5785, + "step": 16748 + }, + { + "epoch": 0.09961104767342278, + "grad_norm": 1.8796172142028809, + "learning_rate": 4.8785978875539065e-05, + "loss": 5.5921, + "step": 16749 + }, + { + "epoch": 0.09961699495670377, + "grad_norm": 1.820435881614685, + "learning_rate": 4.878583508123908e-05, + "loss": 5.7645, + "step": 16750 + }, + { + "epoch": 0.09962294223998477, + "grad_norm": 1.9210152626037598, + "learning_rate": 4.87856912786357e-05, + "loss": 5.0471, + "step": 16751 + }, + { + "epoch": 0.09962888952326578, + "grad_norm": 1.4372605085372925, + "learning_rate": 4.878554746772899e-05, + "loss": 5.3131, + "step": 16752 + }, + { + "epoch": 0.09963483680654676, + "grad_norm": 1.8078817129135132, + "learning_rate": 4.878540364851898e-05, + "loss": 5.266, + "step": 16753 + }, + { + "epoch": 0.09964078408982777, + "grad_norm": 2.068875551223755, + "learning_rate": 4.878525982100575e-05, + "loss": 4.714, + "step": 16754 + }, + { + "epoch": 0.09964673137310877, + "grad_norm": 2.0813167095184326, + "learning_rate": 4.878511598518931e-05, + "loss": 4.5889, + "step": 16755 + }, + { + "epoch": 0.09965267865638976, + "grad_norm": 2.3035426139831543, + "learning_rate": 4.878497214106974e-05, + "loss": 4.8549, + "step": 16756 + }, + { + "epoch": 0.09965862593967076, + "grad_norm": 1.7791129350662231, + "learning_rate": 4.878482828864709e-05, + "loss": 5.2515, + "step": 16757 + }, + { + "epoch": 0.09966457322295176, + "grad_norm": 1.7512277364730835, + "learning_rate": 4.878468442792139e-05, + "loss": 5.8079, + "step": 16758 + }, + { + "epoch": 0.09967052050623275, + "grad_norm": 1.789523720741272, + "learning_rate": 4.878454055889271e-05, + "loss": 5.4302, + "step": 16759 + }, + { + "epoch": 0.09967646778951375, + "grad_norm": 1.72003173828125, + "learning_rate": 4.8784396681561086e-05, + "loss": 5.6425, + "step": 16760 + }, + { + "epoch": 0.09968241507279475, + "grad_norm": 2.0497727394104004, + "learning_rate": 4.878425279592658e-05, + "loss": 5.6608, + "step": 16761 + }, + { + "epoch": 0.09968836235607574, + "grad_norm": 1.7305432558059692, + "learning_rate": 4.878410890198923e-05, + "loss": 5.5431, + "step": 16762 + }, + { + "epoch": 0.09969430963935674, + "grad_norm": 1.708824634552002, + "learning_rate": 4.878396499974911e-05, + "loss": 5.1754, + "step": 16763 + }, + { + "epoch": 0.09970025692263774, + "grad_norm": 1.9238412380218506, + "learning_rate": 4.878382108920624e-05, + "loss": 5.0595, + "step": 16764 + }, + { + "epoch": 0.09970620420591873, + "grad_norm": 1.7634879350662231, + "learning_rate": 4.878367717036069e-05, + "loss": 5.5733, + "step": 16765 + }, + { + "epoch": 0.09971215148919974, + "grad_norm": 1.7330491542816162, + "learning_rate": 4.8783533243212495e-05, + "loss": 5.4314, + "step": 16766 + }, + { + "epoch": 0.09971809877248074, + "grad_norm": 1.4424408674240112, + "learning_rate": 4.878338930776172e-05, + "loss": 5.3059, + "step": 16767 + }, + { + "epoch": 0.09972404605576173, + "grad_norm": 1.4692374467849731, + "learning_rate": 4.878324536400841e-05, + "loss": 5.2838, + "step": 16768 + }, + { + "epoch": 0.09972999333904273, + "grad_norm": 1.3602346181869507, + "learning_rate": 4.878310141195262e-05, + "loss": 5.5587, + "step": 16769 + }, + { + "epoch": 0.09973594062232373, + "grad_norm": 1.3222168684005737, + "learning_rate": 4.878295745159438e-05, + "loss": 5.61, + "step": 16770 + }, + { + "epoch": 0.09974188790560472, + "grad_norm": 1.398383378982544, + "learning_rate": 4.878281348293377e-05, + "loss": 5.5348, + "step": 16771 + }, + { + "epoch": 0.09974783518888572, + "grad_norm": 1.4184808731079102, + "learning_rate": 4.878266950597081e-05, + "loss": 5.4425, + "step": 16772 + }, + { + "epoch": 0.09975378247216671, + "grad_norm": 1.2451627254486084, + "learning_rate": 4.878252552070558e-05, + "loss": 5.5105, + "step": 16773 + }, + { + "epoch": 0.09975972975544771, + "grad_norm": 1.4243760108947754, + "learning_rate": 4.878238152713811e-05, + "loss": 5.5839, + "step": 16774 + }, + { + "epoch": 0.09976567703872871, + "grad_norm": 1.1774061918258667, + "learning_rate": 4.878223752526846e-05, + "loss": 5.4785, + "step": 16775 + }, + { + "epoch": 0.0997716243220097, + "grad_norm": 1.2542285919189453, + "learning_rate": 4.8782093515096676e-05, + "loss": 5.4994, + "step": 16776 + }, + { + "epoch": 0.0997775716052907, + "grad_norm": 1.486611008644104, + "learning_rate": 4.878194949662281e-05, + "loss": 5.347, + "step": 16777 + }, + { + "epoch": 0.0997835188885717, + "grad_norm": 1.391717791557312, + "learning_rate": 4.878180546984691e-05, + "loss": 5.3397, + "step": 16778 + }, + { + "epoch": 0.09978946617185269, + "grad_norm": 1.819778323173523, + "learning_rate": 4.878166143476902e-05, + "loss": 5.4217, + "step": 16779 + }, + { + "epoch": 0.0997954134551337, + "grad_norm": 1.549660563468933, + "learning_rate": 4.8781517391389205e-05, + "loss": 5.5044, + "step": 16780 + }, + { + "epoch": 0.0998013607384147, + "grad_norm": 1.4923075437545776, + "learning_rate": 4.878137333970751e-05, + "loss": 5.4779, + "step": 16781 + }, + { + "epoch": 0.09980730802169568, + "grad_norm": 1.3846399784088135, + "learning_rate": 4.878122927972398e-05, + "loss": 5.8974, + "step": 16782 + }, + { + "epoch": 0.09981325530497669, + "grad_norm": 1.325563669204712, + "learning_rate": 4.878108521143867e-05, + "loss": 5.516, + "step": 16783 + }, + { + "epoch": 0.09981920258825769, + "grad_norm": 1.3482844829559326, + "learning_rate": 4.878094113485162e-05, + "loss": 5.4661, + "step": 16784 + }, + { + "epoch": 0.09982514987153868, + "grad_norm": 1.4238206148147583, + "learning_rate": 4.87807970499629e-05, + "loss": 5.5551, + "step": 16785 + }, + { + "epoch": 0.09983109715481968, + "grad_norm": 1.1277439594268799, + "learning_rate": 4.8780652956772544e-05, + "loss": 5.3611, + "step": 16786 + }, + { + "epoch": 0.09983704443810068, + "grad_norm": 1.2312495708465576, + "learning_rate": 4.878050885528061e-05, + "loss": 5.4233, + "step": 16787 + }, + { + "epoch": 0.09984299172138167, + "grad_norm": 1.3811876773834229, + "learning_rate": 4.878036474548715e-05, + "loss": 5.4336, + "step": 16788 + }, + { + "epoch": 0.09984893900466267, + "grad_norm": 1.211362361907959, + "learning_rate": 4.87802206273922e-05, + "loss": 4.9956, + "step": 16789 + }, + { + "epoch": 0.09985488628794367, + "grad_norm": 1.0385311841964722, + "learning_rate": 4.878007650099583e-05, + "loss": 5.4416, + "step": 16790 + }, + { + "epoch": 0.09986083357122466, + "grad_norm": 1.2311192750930786, + "learning_rate": 4.8779932366298074e-05, + "loss": 5.4814, + "step": 16791 + }, + { + "epoch": 0.09986678085450566, + "grad_norm": 1.6310219764709473, + "learning_rate": 4.8779788223299e-05, + "loss": 5.1746, + "step": 16792 + }, + { + "epoch": 0.09987272813778666, + "grad_norm": 1.4695444107055664, + "learning_rate": 4.877964407199864e-05, + "loss": 5.3724, + "step": 16793 + }, + { + "epoch": 0.09987867542106765, + "grad_norm": 1.8295196294784546, + "learning_rate": 4.877949991239705e-05, + "loss": 5.1085, + "step": 16794 + }, + { + "epoch": 0.09988462270434866, + "grad_norm": 1.5845080614089966, + "learning_rate": 4.877935574449428e-05, + "loss": 5.027, + "step": 16795 + }, + { + "epoch": 0.09989056998762966, + "grad_norm": 1.3743692636489868, + "learning_rate": 4.8779211568290395e-05, + "loss": 5.0717, + "step": 16796 + }, + { + "epoch": 0.09989651727091065, + "grad_norm": 1.3857053518295288, + "learning_rate": 4.877906738378542e-05, + "loss": 4.9698, + "step": 16797 + }, + { + "epoch": 0.09990246455419165, + "grad_norm": 1.3818373680114746, + "learning_rate": 4.8778923190979425e-05, + "loss": 4.8686, + "step": 16798 + }, + { + "epoch": 0.09990841183747265, + "grad_norm": 1.563095211982727, + "learning_rate": 4.877877898987245e-05, + "loss": 4.6804, + "step": 16799 + }, + { + "epoch": 0.09991435912075364, + "grad_norm": 1.3965919017791748, + "learning_rate": 4.877863478046455e-05, + "loss": 5.141, + "step": 16800 + }, + { + "epoch": 0.09992030640403464, + "grad_norm": 1.5473159551620483, + "learning_rate": 4.8778490562755775e-05, + "loss": 5.0796, + "step": 16801 + }, + { + "epoch": 0.09992625368731563, + "grad_norm": 2.548140525817871, + "learning_rate": 4.877834633674618e-05, + "loss": 4.9149, + "step": 16802 + }, + { + "epoch": 0.09993220097059663, + "grad_norm": 1.59461510181427, + "learning_rate": 4.87782021024358e-05, + "loss": 4.9048, + "step": 16803 + }, + { + "epoch": 0.09993814825387763, + "grad_norm": 1.49467134475708, + "learning_rate": 4.87780578598247e-05, + "loss": 5.2484, + "step": 16804 + }, + { + "epoch": 0.09994409553715862, + "grad_norm": 1.5844218730926514, + "learning_rate": 4.8777913608912926e-05, + "loss": 5.2107, + "step": 16805 + }, + { + "epoch": 0.09995004282043962, + "grad_norm": 1.465334415435791, + "learning_rate": 4.877776934970053e-05, + "loss": 5.4002, + "step": 16806 + }, + { + "epoch": 0.09995599010372062, + "grad_norm": 1.5409786701202393, + "learning_rate": 4.877762508218756e-05, + "loss": 5.6233, + "step": 16807 + }, + { + "epoch": 0.09996193738700161, + "grad_norm": 1.3813812732696533, + "learning_rate": 4.877748080637406e-05, + "loss": 5.3072, + "step": 16808 + }, + { + "epoch": 0.09996788467028261, + "grad_norm": 1.3815702199935913, + "learning_rate": 4.8777336522260095e-05, + "loss": 5.0923, + "step": 16809 + }, + { + "epoch": 0.09997383195356362, + "grad_norm": 1.6513910293579102, + "learning_rate": 4.87771922298457e-05, + "loss": 5.0482, + "step": 16810 + }, + { + "epoch": 0.0999797792368446, + "grad_norm": 1.6680731773376465, + "learning_rate": 4.8777047929130944e-05, + "loss": 4.984, + "step": 16811 + }, + { + "epoch": 0.0999857265201256, + "grad_norm": 1.4342384338378906, + "learning_rate": 4.8776903620115855e-05, + "loss": 5.2745, + "step": 16812 + }, + { + "epoch": 0.09999167380340661, + "grad_norm": 1.564255714416504, + "learning_rate": 4.87767593028005e-05, + "loss": 5.398, + "step": 16813 + }, + { + "epoch": 0.0999976210866876, + "grad_norm": 1.2767013311386108, + "learning_rate": 4.877661497718493e-05, + "loss": 5.0663, + "step": 16814 + }, + { + "epoch": 0.1000035683699686, + "grad_norm": 1.35418701171875, + "learning_rate": 4.877647064326918e-05, + "loss": 5.064, + "step": 16815 + }, + { + "epoch": 0.1000095156532496, + "grad_norm": 1.5754468441009521, + "learning_rate": 4.877632630105331e-05, + "loss": 5.1525, + "step": 16816 + }, + { + "epoch": 0.10001546293653059, + "grad_norm": 1.8457043170928955, + "learning_rate": 4.877618195053737e-05, + "loss": 5.3074, + "step": 16817 + }, + { + "epoch": 0.10002141021981159, + "grad_norm": 1.7238751649856567, + "learning_rate": 4.877603759172141e-05, + "loss": 5.3408, + "step": 16818 + }, + { + "epoch": 0.10002735750309259, + "grad_norm": 1.5342493057250977, + "learning_rate": 4.8775893224605486e-05, + "loss": 5.3495, + "step": 16819 + }, + { + "epoch": 0.10003330478637358, + "grad_norm": 1.4931390285491943, + "learning_rate": 4.877574884918964e-05, + "loss": 5.2617, + "step": 16820 + }, + { + "epoch": 0.10003925206965458, + "grad_norm": 1.5503534078598022, + "learning_rate": 4.877560446547393e-05, + "loss": 5.0805, + "step": 16821 + }, + { + "epoch": 0.10004519935293558, + "grad_norm": 1.480191707611084, + "learning_rate": 4.87754600734584e-05, + "loss": 5.1405, + "step": 16822 + }, + { + "epoch": 0.10005114663621657, + "grad_norm": 1.371559977531433, + "learning_rate": 4.87753156731431e-05, + "loss": 5.2313, + "step": 16823 + }, + { + "epoch": 0.10005709391949758, + "grad_norm": 1.2534080743789673, + "learning_rate": 4.8775171264528085e-05, + "loss": 5.3029, + "step": 16824 + }, + { + "epoch": 0.10006304120277858, + "grad_norm": 1.4513366222381592, + "learning_rate": 4.8775026847613406e-05, + "loss": 5.2663, + "step": 16825 + }, + { + "epoch": 0.10006898848605957, + "grad_norm": 1.4045735597610474, + "learning_rate": 4.8774882422399105e-05, + "loss": 5.2358, + "step": 16826 + }, + { + "epoch": 0.10007493576934057, + "grad_norm": 1.469664216041565, + "learning_rate": 4.877473798888524e-05, + "loss": 5.0215, + "step": 16827 + }, + { + "epoch": 0.10008088305262157, + "grad_norm": 1.4306927919387817, + "learning_rate": 4.8774593547071855e-05, + "loss": 4.8262, + "step": 16828 + }, + { + "epoch": 0.10008683033590256, + "grad_norm": 1.5118143558502197, + "learning_rate": 4.877444909695902e-05, + "loss": 4.8248, + "step": 16829 + }, + { + "epoch": 0.10009277761918356, + "grad_norm": 1.3022321462631226, + "learning_rate": 4.8774304638546754e-05, + "loss": 4.7268, + "step": 16830 + }, + { + "epoch": 0.10009872490246455, + "grad_norm": 1.468758463859558, + "learning_rate": 4.877416017183513e-05, + "loss": 4.8686, + "step": 16831 + }, + { + "epoch": 0.10010467218574555, + "grad_norm": 1.4958772659301758, + "learning_rate": 4.8774015696824196e-05, + "loss": 5.084, + "step": 16832 + }, + { + "epoch": 0.10011061946902655, + "grad_norm": 1.5816160440444946, + "learning_rate": 4.877387121351399e-05, + "loss": 5.1009, + "step": 16833 + }, + { + "epoch": 0.10011656675230754, + "grad_norm": 1.4751555919647217, + "learning_rate": 4.877372672190458e-05, + "loss": 5.1875, + "step": 16834 + }, + { + "epoch": 0.10012251403558854, + "grad_norm": 1.380433201789856, + "learning_rate": 4.8773582221996006e-05, + "loss": 5.3213, + "step": 16835 + }, + { + "epoch": 0.10012846131886954, + "grad_norm": 1.566112756729126, + "learning_rate": 4.877343771378832e-05, + "loss": 4.9251, + "step": 16836 + }, + { + "epoch": 0.10013440860215053, + "grad_norm": 1.4834301471710205, + "learning_rate": 4.8773293197281566e-05, + "loss": 4.7936, + "step": 16837 + }, + { + "epoch": 0.10014035588543153, + "grad_norm": 1.6053043603897095, + "learning_rate": 4.877314867247581e-05, + "loss": 4.8611, + "step": 16838 + }, + { + "epoch": 0.10014630316871254, + "grad_norm": 1.420598030090332, + "learning_rate": 4.877300413937109e-05, + "loss": 5.0481, + "step": 16839 + }, + { + "epoch": 0.10015225045199352, + "grad_norm": 1.474554181098938, + "learning_rate": 4.877285959796746e-05, + "loss": 5.0342, + "step": 16840 + }, + { + "epoch": 0.10015819773527453, + "grad_norm": 1.6535485982894897, + "learning_rate": 4.877271504826496e-05, + "loss": 5.4624, + "step": 16841 + }, + { + "epoch": 0.10016414501855553, + "grad_norm": 1.3873733282089233, + "learning_rate": 4.877257049026367e-05, + "loss": 5.1673, + "step": 16842 + }, + { + "epoch": 0.10017009230183652, + "grad_norm": 1.3890115022659302, + "learning_rate": 4.8772425923963606e-05, + "loss": 4.938, + "step": 16843 + }, + { + "epoch": 0.10017603958511752, + "grad_norm": 1.443969964981079, + "learning_rate": 4.8772281349364846e-05, + "loss": 4.8525, + "step": 16844 + }, + { + "epoch": 0.10018198686839852, + "grad_norm": 1.545344591140747, + "learning_rate": 4.877213676646742e-05, + "loss": 4.8682, + "step": 16845 + }, + { + "epoch": 0.10018793415167951, + "grad_norm": 1.6065396070480347, + "learning_rate": 4.877199217527138e-05, + "loss": 4.7394, + "step": 16846 + }, + { + "epoch": 0.10019388143496051, + "grad_norm": 1.444199800491333, + "learning_rate": 4.877184757577679e-05, + "loss": 4.7775, + "step": 16847 + }, + { + "epoch": 0.10019982871824151, + "grad_norm": 1.5434626340866089, + "learning_rate": 4.87717029679837e-05, + "loss": 4.6714, + "step": 16848 + }, + { + "epoch": 0.1002057760015225, + "grad_norm": 1.502533197402954, + "learning_rate": 4.877155835189215e-05, + "loss": 4.7591, + "step": 16849 + }, + { + "epoch": 0.1002117232848035, + "grad_norm": 1.6330854892730713, + "learning_rate": 4.877141372750219e-05, + "loss": 4.7426, + "step": 16850 + }, + { + "epoch": 0.1002176705680845, + "grad_norm": 1.658887267112732, + "learning_rate": 4.877126909481388e-05, + "loss": 4.7558, + "step": 16851 + }, + { + "epoch": 0.10022361785136549, + "grad_norm": 1.4569580554962158, + "learning_rate": 4.877112445382727e-05, + "loss": 4.7797, + "step": 16852 + }, + { + "epoch": 0.1002295651346465, + "grad_norm": 1.4903759956359863, + "learning_rate": 4.8770979804542394e-05, + "loss": 4.7895, + "step": 16853 + }, + { + "epoch": 0.1002355124179275, + "grad_norm": 1.638406753540039, + "learning_rate": 4.877083514695933e-05, + "loss": 4.7197, + "step": 16854 + }, + { + "epoch": 0.10024145970120849, + "grad_norm": 1.4558868408203125, + "learning_rate": 4.87706904810781e-05, + "loss": 4.7159, + "step": 16855 + }, + { + "epoch": 0.10024740698448949, + "grad_norm": 1.5545023679733276, + "learning_rate": 4.877054580689877e-05, + "loss": 4.7387, + "step": 16856 + }, + { + "epoch": 0.10025335426777049, + "grad_norm": 1.3767842054367065, + "learning_rate": 4.877040112442139e-05, + "loss": 4.7149, + "step": 16857 + }, + { + "epoch": 0.10025930155105148, + "grad_norm": 1.4483342170715332, + "learning_rate": 4.877025643364601e-05, + "loss": 4.7756, + "step": 16858 + }, + { + "epoch": 0.10026524883433248, + "grad_norm": 1.1949654817581177, + "learning_rate": 4.8770111734572673e-05, + "loss": 4.7883, + "step": 16859 + }, + { + "epoch": 0.10027119611761347, + "grad_norm": 1.430977463722229, + "learning_rate": 4.876996702720144e-05, + "loss": 5.0236, + "step": 16860 + }, + { + "epoch": 0.10027714340089447, + "grad_norm": 1.4976351261138916, + "learning_rate": 4.876982231153236e-05, + "loss": 5.1242, + "step": 16861 + }, + { + "epoch": 0.10028309068417547, + "grad_norm": 1.6913431882858276, + "learning_rate": 4.876967758756547e-05, + "loss": 5.3454, + "step": 16862 + }, + { + "epoch": 0.10028903796745646, + "grad_norm": 1.5901557207107544, + "learning_rate": 4.876953285530084e-05, + "loss": 5.2313, + "step": 16863 + }, + { + "epoch": 0.10029498525073746, + "grad_norm": 2.483757257461548, + "learning_rate": 4.8769388114738515e-05, + "loss": 4.9951, + "step": 16864 + }, + { + "epoch": 0.10030093253401846, + "grad_norm": 1.5647902488708496, + "learning_rate": 4.8769243365878536e-05, + "loss": 5.1029, + "step": 16865 + }, + { + "epoch": 0.10030687981729945, + "grad_norm": 1.5830740928649902, + "learning_rate": 4.8769098608720954e-05, + "loss": 5.1918, + "step": 16866 + }, + { + "epoch": 0.10031282710058045, + "grad_norm": 1.5231165885925293, + "learning_rate": 4.876895384326584e-05, + "loss": 5.0817, + "step": 16867 + }, + { + "epoch": 0.10031877438386146, + "grad_norm": 1.5266731977462769, + "learning_rate": 4.876880906951321e-05, + "loss": 4.9117, + "step": 16868 + }, + { + "epoch": 0.10032472166714244, + "grad_norm": 1.9662569761276245, + "learning_rate": 4.876866428746315e-05, + "loss": 4.8381, + "step": 16869 + }, + { + "epoch": 0.10033066895042345, + "grad_norm": 1.34932279586792, + "learning_rate": 4.876851949711569e-05, + "loss": 5.0781, + "step": 16870 + }, + { + "epoch": 0.10033661623370445, + "grad_norm": 1.3333275318145752, + "learning_rate": 4.876837469847089e-05, + "loss": 5.0527, + "step": 16871 + }, + { + "epoch": 0.10034256351698544, + "grad_norm": 1.3569806814193726, + "learning_rate": 4.876822989152879e-05, + "loss": 5.0854, + "step": 16872 + }, + { + "epoch": 0.10034851080026644, + "grad_norm": 1.4417848587036133, + "learning_rate": 4.876808507628945e-05, + "loss": 4.885, + "step": 16873 + }, + { + "epoch": 0.10035445808354744, + "grad_norm": 1.453704833984375, + "learning_rate": 4.876794025275292e-05, + "loss": 4.8919, + "step": 16874 + }, + { + "epoch": 0.10036040536682843, + "grad_norm": 1.392701268196106, + "learning_rate": 4.876779542091924e-05, + "loss": 5.0682, + "step": 16875 + }, + { + "epoch": 0.10036635265010943, + "grad_norm": 1.5623222589492798, + "learning_rate": 4.876765058078847e-05, + "loss": 5.0369, + "step": 16876 + }, + { + "epoch": 0.10037229993339043, + "grad_norm": 1.4053794145584106, + "learning_rate": 4.876750573236066e-05, + "loss": 4.9932, + "step": 16877 + }, + { + "epoch": 0.10037824721667142, + "grad_norm": 1.3282443284988403, + "learning_rate": 4.876736087563586e-05, + "loss": 5.0678, + "step": 16878 + }, + { + "epoch": 0.10038419449995242, + "grad_norm": 1.3737441301345825, + "learning_rate": 4.876721601061412e-05, + "loss": 5.1292, + "step": 16879 + }, + { + "epoch": 0.10039014178323342, + "grad_norm": 1.3209916353225708, + "learning_rate": 4.876707113729549e-05, + "loss": 5.0717, + "step": 16880 + }, + { + "epoch": 0.10039608906651441, + "grad_norm": 1.2051011323928833, + "learning_rate": 4.8766926255680026e-05, + "loss": 5.0075, + "step": 16881 + }, + { + "epoch": 0.10040203634979541, + "grad_norm": 1.260746955871582, + "learning_rate": 4.876678136576777e-05, + "loss": 4.8419, + "step": 16882 + }, + { + "epoch": 0.10040798363307642, + "grad_norm": 1.3981266021728516, + "learning_rate": 4.876663646755877e-05, + "loss": 4.8558, + "step": 16883 + }, + { + "epoch": 0.1004139309163574, + "grad_norm": 1.3491755723953247, + "learning_rate": 4.876649156105309e-05, + "loss": 4.7809, + "step": 16884 + }, + { + "epoch": 0.10041987819963841, + "grad_norm": 1.3315166234970093, + "learning_rate": 4.8766346646250774e-05, + "loss": 4.9221, + "step": 16885 + }, + { + "epoch": 0.10042582548291941, + "grad_norm": 1.250731348991394, + "learning_rate": 4.876620172315186e-05, + "loss": 4.8344, + "step": 16886 + }, + { + "epoch": 0.1004317727662004, + "grad_norm": 1.249316692352295, + "learning_rate": 4.876605679175642e-05, + "loss": 4.8441, + "step": 16887 + }, + { + "epoch": 0.1004377200494814, + "grad_norm": 1.3112961053848267, + "learning_rate": 4.87659118520645e-05, + "loss": 4.834, + "step": 16888 + }, + { + "epoch": 0.10044366733276239, + "grad_norm": 1.4331620931625366, + "learning_rate": 4.876576690407614e-05, + "loss": 4.9801, + "step": 16889 + }, + { + "epoch": 0.10044961461604339, + "grad_norm": 1.5304386615753174, + "learning_rate": 4.8765621947791396e-05, + "loss": 5.1799, + "step": 16890 + }, + { + "epoch": 0.10045556189932439, + "grad_norm": 1.3581719398498535, + "learning_rate": 4.8765476983210326e-05, + "loss": 5.1517, + "step": 16891 + }, + { + "epoch": 0.10046150918260538, + "grad_norm": 1.2568892240524292, + "learning_rate": 4.876533201033296e-05, + "loss": 5.0663, + "step": 16892 + }, + { + "epoch": 0.10046745646588638, + "grad_norm": 1.3863126039505005, + "learning_rate": 4.876518702915936e-05, + "loss": 4.9666, + "step": 16893 + }, + { + "epoch": 0.10047340374916738, + "grad_norm": 1.328078031539917, + "learning_rate": 4.87650420396896e-05, + "loss": 5.0049, + "step": 16894 + }, + { + "epoch": 0.10047935103244837, + "grad_norm": 1.252009630203247, + "learning_rate": 4.8764897041923696e-05, + "loss": 5.0709, + "step": 16895 + }, + { + "epoch": 0.10048529831572937, + "grad_norm": 1.4895809888839722, + "learning_rate": 4.876475203586171e-05, + "loss": 5.0922, + "step": 16896 + }, + { + "epoch": 0.10049124559901038, + "grad_norm": 1.363641619682312, + "learning_rate": 4.8764607021503696e-05, + "loss": 5.0233, + "step": 16897 + }, + { + "epoch": 0.10049719288229136, + "grad_norm": 1.5323866605758667, + "learning_rate": 4.876446199884971e-05, + "loss": 4.8705, + "step": 16898 + }, + { + "epoch": 0.10050314016557237, + "grad_norm": 1.4069478511810303, + "learning_rate": 4.8764316967899786e-05, + "loss": 5.0136, + "step": 16899 + }, + { + "epoch": 0.10050908744885337, + "grad_norm": 1.4166046380996704, + "learning_rate": 4.876417192865399e-05, + "loss": 5.0047, + "step": 16900 + }, + { + "epoch": 0.10051503473213436, + "grad_norm": 1.5298703908920288, + "learning_rate": 4.876402688111237e-05, + "loss": 5.0046, + "step": 16901 + }, + { + "epoch": 0.10052098201541536, + "grad_norm": 1.340071678161621, + "learning_rate": 4.876388182527497e-05, + "loss": 5.107, + "step": 16902 + }, + { + "epoch": 0.10052692929869636, + "grad_norm": 1.367415189743042, + "learning_rate": 4.876373676114184e-05, + "loss": 4.9292, + "step": 16903 + }, + { + "epoch": 0.10053287658197735, + "grad_norm": 1.3535525798797607, + "learning_rate": 4.876359168871304e-05, + "loss": 4.9801, + "step": 16904 + }, + { + "epoch": 0.10053882386525835, + "grad_norm": 1.2370539903640747, + "learning_rate": 4.8763446607988615e-05, + "loss": 4.9598, + "step": 16905 + }, + { + "epoch": 0.10054477114853935, + "grad_norm": 1.251837968826294, + "learning_rate": 4.876330151896862e-05, + "loss": 5.0506, + "step": 16906 + }, + { + "epoch": 0.10055071843182034, + "grad_norm": 1.3221372365951538, + "learning_rate": 4.8763156421653097e-05, + "loss": 5.4094, + "step": 16907 + }, + { + "epoch": 0.10055666571510134, + "grad_norm": 1.34721040725708, + "learning_rate": 4.87630113160421e-05, + "loss": 5.4361, + "step": 16908 + }, + { + "epoch": 0.10056261299838234, + "grad_norm": 1.2884198427200317, + "learning_rate": 4.876286620213568e-05, + "loss": 5.3518, + "step": 16909 + }, + { + "epoch": 0.10056856028166333, + "grad_norm": 1.259414553642273, + "learning_rate": 4.87627210799339e-05, + "loss": 5.2298, + "step": 16910 + }, + { + "epoch": 0.10057450756494433, + "grad_norm": 1.482032299041748, + "learning_rate": 4.8762575949436796e-05, + "loss": 5.3625, + "step": 16911 + }, + { + "epoch": 0.10058045484822534, + "grad_norm": 1.2673801183700562, + "learning_rate": 4.876243081064441e-05, + "loss": 5.2678, + "step": 16912 + }, + { + "epoch": 0.10058640213150633, + "grad_norm": 1.3014607429504395, + "learning_rate": 4.876228566355682e-05, + "loss": 5.2762, + "step": 16913 + }, + { + "epoch": 0.10059234941478733, + "grad_norm": 1.2084840536117554, + "learning_rate": 4.876214050817405e-05, + "loss": 5.1128, + "step": 16914 + }, + { + "epoch": 0.10059829669806833, + "grad_norm": 1.3497353792190552, + "learning_rate": 4.876199534449617e-05, + "loss": 5.1666, + "step": 16915 + }, + { + "epoch": 0.10060424398134932, + "grad_norm": 1.4095430374145508, + "learning_rate": 4.876185017252322e-05, + "loss": 5.0055, + "step": 16916 + }, + { + "epoch": 0.10061019126463032, + "grad_norm": 1.319938063621521, + "learning_rate": 4.876170499225525e-05, + "loss": 5.0628, + "step": 16917 + }, + { + "epoch": 0.10061613854791131, + "grad_norm": 1.2126001119613647, + "learning_rate": 4.876155980369232e-05, + "loss": 5.4244, + "step": 16918 + }, + { + "epoch": 0.10062208583119231, + "grad_norm": 1.0456511974334717, + "learning_rate": 4.876141460683448e-05, + "loss": 5.2556, + "step": 16919 + }, + { + "epoch": 0.10062803311447331, + "grad_norm": 1.2545825242996216, + "learning_rate": 4.8761269401681765e-05, + "loss": 5.1549, + "step": 16920 + }, + { + "epoch": 0.1006339803977543, + "grad_norm": 1.3613678216934204, + "learning_rate": 4.876112418823424e-05, + "loss": 5.0592, + "step": 16921 + }, + { + "epoch": 0.1006399276810353, + "grad_norm": 1.4963204860687256, + "learning_rate": 4.876097896649196e-05, + "loss": 5.1025, + "step": 16922 + }, + { + "epoch": 0.1006458749643163, + "grad_norm": 1.3221436738967896, + "learning_rate": 4.876083373645495e-05, + "loss": 5.2534, + "step": 16923 + }, + { + "epoch": 0.10065182224759729, + "grad_norm": 1.6041839122772217, + "learning_rate": 4.8760688498123294e-05, + "loss": 5.3351, + "step": 16924 + }, + { + "epoch": 0.1006577695308783, + "grad_norm": 1.4891480207443237, + "learning_rate": 4.876054325149702e-05, + "loss": 5.4782, + "step": 16925 + }, + { + "epoch": 0.1006637168141593, + "grad_norm": 2.101271867752075, + "learning_rate": 4.876039799657619e-05, + "loss": 5.3844, + "step": 16926 + }, + { + "epoch": 0.10066966409744028, + "grad_norm": 1.5637247562408447, + "learning_rate": 4.8760252733360845e-05, + "loss": 5.4488, + "step": 16927 + }, + { + "epoch": 0.10067561138072129, + "grad_norm": 1.5939668416976929, + "learning_rate": 4.8760107461851044e-05, + "loss": 5.3429, + "step": 16928 + }, + { + "epoch": 0.10068155866400229, + "grad_norm": 1.509945273399353, + "learning_rate": 4.875996218204684e-05, + "loss": 5.4501, + "step": 16929 + }, + { + "epoch": 0.10068750594728328, + "grad_norm": 1.553009271621704, + "learning_rate": 4.875981689394827e-05, + "loss": 5.4183, + "step": 16930 + }, + { + "epoch": 0.10069345323056428, + "grad_norm": 1.5002714395523071, + "learning_rate": 4.875967159755539e-05, + "loss": 5.2343, + "step": 16931 + }, + { + "epoch": 0.10069940051384528, + "grad_norm": 1.5027118921279907, + "learning_rate": 4.8759526292868266e-05, + "loss": 5.4414, + "step": 16932 + }, + { + "epoch": 0.10070534779712627, + "grad_norm": 1.38532555103302, + "learning_rate": 4.875938097988694e-05, + "loss": 5.4026, + "step": 16933 + }, + { + "epoch": 0.10071129508040727, + "grad_norm": 1.4190242290496826, + "learning_rate": 4.8759235658611445e-05, + "loss": 5.346, + "step": 16934 + }, + { + "epoch": 0.10071724236368827, + "grad_norm": 1.291375756263733, + "learning_rate": 4.875909032904186e-05, + "loss": 5.3715, + "step": 16935 + }, + { + "epoch": 0.10072318964696926, + "grad_norm": 1.5563501119613647, + "learning_rate": 4.8758944991178214e-05, + "loss": 5.2474, + "step": 16936 + }, + { + "epoch": 0.10072913693025026, + "grad_norm": 1.2936631441116333, + "learning_rate": 4.875879964502056e-05, + "loss": 5.2627, + "step": 16937 + }, + { + "epoch": 0.10073508421353126, + "grad_norm": 1.5020617246627808, + "learning_rate": 4.875865429056896e-05, + "loss": 5.2166, + "step": 16938 + }, + { + "epoch": 0.10074103149681225, + "grad_norm": 1.4830302000045776, + "learning_rate": 4.8758508927823464e-05, + "loss": 5.2558, + "step": 16939 + }, + { + "epoch": 0.10074697878009325, + "grad_norm": 1.4259967803955078, + "learning_rate": 4.8758363556784114e-05, + "loss": 5.3117, + "step": 16940 + }, + { + "epoch": 0.10075292606337426, + "grad_norm": 1.5735303163528442, + "learning_rate": 4.875821817745096e-05, + "loss": 5.2993, + "step": 16941 + }, + { + "epoch": 0.10075887334665524, + "grad_norm": 1.6409742832183838, + "learning_rate": 4.875807278982407e-05, + "loss": 5.4337, + "step": 16942 + }, + { + "epoch": 0.10076482062993625, + "grad_norm": 1.5159885883331299, + "learning_rate": 4.875792739390347e-05, + "loss": 5.4222, + "step": 16943 + }, + { + "epoch": 0.10077076791321725, + "grad_norm": 1.704200029373169, + "learning_rate": 4.875778198968923e-05, + "loss": 5.5248, + "step": 16944 + }, + { + "epoch": 0.10077671519649824, + "grad_norm": 1.8533267974853516, + "learning_rate": 4.875763657718139e-05, + "loss": 5.2155, + "step": 16945 + }, + { + "epoch": 0.10078266247977924, + "grad_norm": 1.3260399103164673, + "learning_rate": 4.8757491156380006e-05, + "loss": 5.3239, + "step": 16946 + }, + { + "epoch": 0.10078860976306023, + "grad_norm": 1.317050814628601, + "learning_rate": 4.875734572728513e-05, + "loss": 5.2346, + "step": 16947 + }, + { + "epoch": 0.10079455704634123, + "grad_norm": 1.5583351850509644, + "learning_rate": 4.875720028989681e-05, + "loss": 5.194, + "step": 16948 + }, + { + "epoch": 0.10080050432962223, + "grad_norm": 1.3424546718597412, + "learning_rate": 4.8757054844215094e-05, + "loss": 5.3616, + "step": 16949 + }, + { + "epoch": 0.10080645161290322, + "grad_norm": 1.3151681423187256, + "learning_rate": 4.875690939024004e-05, + "loss": 5.2183, + "step": 16950 + }, + { + "epoch": 0.10081239889618422, + "grad_norm": 1.441724419593811, + "learning_rate": 4.875676392797168e-05, + "loss": 5.3292, + "step": 16951 + }, + { + "epoch": 0.10081834617946522, + "grad_norm": 1.3751790523529053, + "learning_rate": 4.87566184574101e-05, + "loss": 5.1747, + "step": 16952 + }, + { + "epoch": 0.10082429346274621, + "grad_norm": 1.5188177824020386, + "learning_rate": 4.8756472978555314e-05, + "loss": 5.2291, + "step": 16953 + }, + { + "epoch": 0.10083024074602721, + "grad_norm": 1.2834105491638184, + "learning_rate": 4.87563274914074e-05, + "loss": 5.1655, + "step": 16954 + }, + { + "epoch": 0.10083618802930822, + "grad_norm": 1.3950659036636353, + "learning_rate": 4.8756181995966385e-05, + "loss": 5.2318, + "step": 16955 + }, + { + "epoch": 0.1008421353125892, + "grad_norm": 1.3544670343399048, + "learning_rate": 4.875603649223234e-05, + "loss": 5.026, + "step": 16956 + }, + { + "epoch": 0.1008480825958702, + "grad_norm": 1.4849059581756592, + "learning_rate": 4.875589098020531e-05, + "loss": 5.2139, + "step": 16957 + }, + { + "epoch": 0.10085402987915121, + "grad_norm": 1.2032678127288818, + "learning_rate": 4.875574545988534e-05, + "loss": 5.3103, + "step": 16958 + }, + { + "epoch": 0.1008599771624322, + "grad_norm": 1.4803698062896729, + "learning_rate": 4.875559993127249e-05, + "loss": 5.2546, + "step": 16959 + }, + { + "epoch": 0.1008659244457132, + "grad_norm": 1.374115228652954, + "learning_rate": 4.8755454394366795e-05, + "loss": 5.1654, + "step": 16960 + }, + { + "epoch": 0.1008718717289942, + "grad_norm": 1.420754075050354, + "learning_rate": 4.875530884916832e-05, + "loss": 5.3368, + "step": 16961 + }, + { + "epoch": 0.10087781901227519, + "grad_norm": 1.3919636011123657, + "learning_rate": 4.875516329567712e-05, + "loss": 5.3053, + "step": 16962 + }, + { + "epoch": 0.10088376629555619, + "grad_norm": 1.2697970867156982, + "learning_rate": 4.8755017733893235e-05, + "loss": 5.1771, + "step": 16963 + }, + { + "epoch": 0.10088971357883719, + "grad_norm": 1.3521144390106201, + "learning_rate": 4.8754872163816714e-05, + "loss": 5.3226, + "step": 16964 + }, + { + "epoch": 0.10089566086211818, + "grad_norm": 1.4171572923660278, + "learning_rate": 4.875472658544761e-05, + "loss": 5.17, + "step": 16965 + }, + { + "epoch": 0.10090160814539918, + "grad_norm": 1.1771302223205566, + "learning_rate": 4.875458099878598e-05, + "loss": 5.2938, + "step": 16966 + }, + { + "epoch": 0.10090755542868018, + "grad_norm": 1.3881202936172485, + "learning_rate": 4.875443540383188e-05, + "loss": 5.2567, + "step": 16967 + }, + { + "epoch": 0.10091350271196117, + "grad_norm": 1.3272387981414795, + "learning_rate": 4.875428980058534e-05, + "loss": 5.2459, + "step": 16968 + }, + { + "epoch": 0.10091944999524217, + "grad_norm": 1.227569341659546, + "learning_rate": 4.875414418904643e-05, + "loss": 5.4037, + "step": 16969 + }, + { + "epoch": 0.10092539727852318, + "grad_norm": 1.6725070476531982, + "learning_rate": 4.875399856921519e-05, + "loss": 4.957, + "step": 16970 + }, + { + "epoch": 0.10093134456180416, + "grad_norm": 1.2896990776062012, + "learning_rate": 4.8753852941091676e-05, + "loss": 5.0245, + "step": 16971 + }, + { + "epoch": 0.10093729184508517, + "grad_norm": 1.4771101474761963, + "learning_rate": 4.8753707304675935e-05, + "loss": 5.007, + "step": 16972 + }, + { + "epoch": 0.10094323912836617, + "grad_norm": 1.5898420810699463, + "learning_rate": 4.8753561659968025e-05, + "loss": 5.2144, + "step": 16973 + }, + { + "epoch": 0.10094918641164716, + "grad_norm": 1.3972615003585815, + "learning_rate": 4.875341600696799e-05, + "loss": 5.0019, + "step": 16974 + }, + { + "epoch": 0.10095513369492816, + "grad_norm": 1.3663748502731323, + "learning_rate": 4.875327034567588e-05, + "loss": 5.3281, + "step": 16975 + }, + { + "epoch": 0.10096108097820915, + "grad_norm": 1.4441343545913696, + "learning_rate": 4.875312467609175e-05, + "loss": 5.3224, + "step": 16976 + }, + { + "epoch": 0.10096702826149015, + "grad_norm": 1.409233570098877, + "learning_rate": 4.875297899821565e-05, + "loss": 5.1244, + "step": 16977 + }, + { + "epoch": 0.10097297554477115, + "grad_norm": 1.286838412284851, + "learning_rate": 4.875283331204763e-05, + "loss": 5.187, + "step": 16978 + }, + { + "epoch": 0.10097892282805214, + "grad_norm": 1.3722141981124878, + "learning_rate": 4.8752687617587744e-05, + "loss": 5.1052, + "step": 16979 + }, + { + "epoch": 0.10098487011133314, + "grad_norm": 1.464938998222351, + "learning_rate": 4.8752541914836034e-05, + "loss": 5.2428, + "step": 16980 + }, + { + "epoch": 0.10099081739461414, + "grad_norm": 1.5051358938217163, + "learning_rate": 4.875239620379256e-05, + "loss": 5.204, + "step": 16981 + }, + { + "epoch": 0.10099676467789513, + "grad_norm": 1.374108076095581, + "learning_rate": 4.875225048445737e-05, + "loss": 5.4567, + "step": 16982 + }, + { + "epoch": 0.10100271196117613, + "grad_norm": 1.482023000717163, + "learning_rate": 4.875210475683052e-05, + "loss": 5.3605, + "step": 16983 + }, + { + "epoch": 0.10100865924445714, + "grad_norm": 1.429819107055664, + "learning_rate": 4.8751959020912056e-05, + "loss": 5.3351, + "step": 16984 + }, + { + "epoch": 0.10101460652773812, + "grad_norm": 1.3165935277938843, + "learning_rate": 4.875181327670202e-05, + "loss": 5.2705, + "step": 16985 + }, + { + "epoch": 0.10102055381101913, + "grad_norm": 1.4560794830322266, + "learning_rate": 4.8751667524200474e-05, + "loss": 5.313, + "step": 16986 + }, + { + "epoch": 0.10102650109430013, + "grad_norm": 1.5268526077270508, + "learning_rate": 4.875152176340747e-05, + "loss": 5.2432, + "step": 16987 + }, + { + "epoch": 0.10103244837758112, + "grad_norm": 1.8486063480377197, + "learning_rate": 4.875137599432305e-05, + "loss": 5.4951, + "step": 16988 + }, + { + "epoch": 0.10103839566086212, + "grad_norm": 1.5344970226287842, + "learning_rate": 4.875123021694727e-05, + "loss": 4.7321, + "step": 16989 + }, + { + "epoch": 0.10104434294414312, + "grad_norm": 1.5000940561294556, + "learning_rate": 4.8751084431280186e-05, + "loss": 5.1539, + "step": 16990 + }, + { + "epoch": 0.10105029022742411, + "grad_norm": 1.3047879934310913, + "learning_rate": 4.875093863732184e-05, + "loss": 5.1549, + "step": 16991 + }, + { + "epoch": 0.10105623751070511, + "grad_norm": 1.3496383428573608, + "learning_rate": 4.875079283507229e-05, + "loss": 5.0896, + "step": 16992 + }, + { + "epoch": 0.10106218479398611, + "grad_norm": 1.3492714166641235, + "learning_rate": 4.875064702453158e-05, + "loss": 5.0242, + "step": 16993 + }, + { + "epoch": 0.1010681320772671, + "grad_norm": 1.3479794263839722, + "learning_rate": 4.8750501205699766e-05, + "loss": 4.9653, + "step": 16994 + }, + { + "epoch": 0.1010740793605481, + "grad_norm": 1.4737683534622192, + "learning_rate": 4.87503553785769e-05, + "loss": 5.0082, + "step": 16995 + }, + { + "epoch": 0.1010800266438291, + "grad_norm": 1.335184931755066, + "learning_rate": 4.8750209543163026e-05, + "loss": 5.0068, + "step": 16996 + }, + { + "epoch": 0.10108597392711009, + "grad_norm": 1.3982423543930054, + "learning_rate": 4.87500636994582e-05, + "loss": 4.9958, + "step": 16997 + }, + { + "epoch": 0.1010919212103911, + "grad_norm": 1.4706374406814575, + "learning_rate": 4.874991784746248e-05, + "loss": 4.9776, + "step": 16998 + }, + { + "epoch": 0.1010978684936721, + "grad_norm": 1.4456995725631714, + "learning_rate": 4.8749771987175896e-05, + "loss": 5.1226, + "step": 16999 + }, + { + "epoch": 0.10110381577695308, + "grad_norm": 1.3827359676361084, + "learning_rate": 4.874962611859853e-05, + "loss": 5.0648, + "step": 17000 + }, + { + "epoch": 0.10110976306023409, + "grad_norm": 1.4089758396148682, + "learning_rate": 4.874948024173039e-05, + "loss": 5.0511, + "step": 17001 + }, + { + "epoch": 0.10111571034351509, + "grad_norm": 1.5135823488235474, + "learning_rate": 4.874933435657157e-05, + "loss": 5.1586, + "step": 17002 + }, + { + "epoch": 0.10112165762679608, + "grad_norm": 1.3575700521469116, + "learning_rate": 4.87491884631221e-05, + "loss": 5.4172, + "step": 17003 + }, + { + "epoch": 0.10112760491007708, + "grad_norm": 1.6240919828414917, + "learning_rate": 4.874904256138203e-05, + "loss": 4.8663, + "step": 17004 + }, + { + "epoch": 0.10113355219335807, + "grad_norm": 1.517287254333496, + "learning_rate": 4.8748896651351415e-05, + "loss": 5.2746, + "step": 17005 + }, + { + "epoch": 0.10113949947663907, + "grad_norm": 1.359541893005371, + "learning_rate": 4.87487507330303e-05, + "loss": 5.2497, + "step": 17006 + }, + { + "epoch": 0.10114544675992007, + "grad_norm": 1.608406901359558, + "learning_rate": 4.8748604806418755e-05, + "loss": 5.2789, + "step": 17007 + }, + { + "epoch": 0.10115139404320106, + "grad_norm": 1.5752578973770142, + "learning_rate": 4.874845887151681e-05, + "loss": 5.1583, + "step": 17008 + }, + { + "epoch": 0.10115734132648206, + "grad_norm": 1.5864077806472778, + "learning_rate": 4.8748312928324524e-05, + "loss": 5.2091, + "step": 17009 + }, + { + "epoch": 0.10116328860976306, + "grad_norm": 1.4714727401733398, + "learning_rate": 4.874816697684195e-05, + "loss": 5.2404, + "step": 17010 + }, + { + "epoch": 0.10116923589304405, + "grad_norm": 1.4676539897918701, + "learning_rate": 4.874802101706913e-05, + "loss": 5.3318, + "step": 17011 + }, + { + "epoch": 0.10117518317632505, + "grad_norm": 1.3290908336639404, + "learning_rate": 4.874787504900612e-05, + "loss": 5.0484, + "step": 17012 + }, + { + "epoch": 0.10118113045960606, + "grad_norm": 1.2661367654800415, + "learning_rate": 4.8747729072652984e-05, + "loss": 5.1857, + "step": 17013 + }, + { + "epoch": 0.10118707774288704, + "grad_norm": 1.2540318965911865, + "learning_rate": 4.874758308800975e-05, + "loss": 5.3025, + "step": 17014 + }, + { + "epoch": 0.10119302502616805, + "grad_norm": 1.2353893518447876, + "learning_rate": 4.874743709507649e-05, + "loss": 5.3613, + "step": 17015 + }, + { + "epoch": 0.10119897230944905, + "grad_norm": 1.2193371057510376, + "learning_rate": 4.874729109385323e-05, + "loss": 5.3029, + "step": 17016 + }, + { + "epoch": 0.10120491959273004, + "grad_norm": 1.2443112134933472, + "learning_rate": 4.874714508434005e-05, + "loss": 5.3667, + "step": 17017 + }, + { + "epoch": 0.10121086687601104, + "grad_norm": 1.4194598197937012, + "learning_rate": 4.874699906653698e-05, + "loss": 5.5583, + "step": 17018 + }, + { + "epoch": 0.10121681415929204, + "grad_norm": 1.4791369438171387, + "learning_rate": 4.874685304044408e-05, + "loss": 5.2797, + "step": 17019 + }, + { + "epoch": 0.10122276144257303, + "grad_norm": 1.4528671503067017, + "learning_rate": 4.87467070060614e-05, + "loss": 5.1261, + "step": 17020 + }, + { + "epoch": 0.10122870872585403, + "grad_norm": 1.2694898843765259, + "learning_rate": 4.8746560963388985e-05, + "loss": 5.3817, + "step": 17021 + }, + { + "epoch": 0.10123465600913503, + "grad_norm": 1.6012862920761108, + "learning_rate": 4.8746414912426896e-05, + "loss": 4.962, + "step": 17022 + }, + { + "epoch": 0.10124060329241602, + "grad_norm": 1.6179730892181396, + "learning_rate": 4.874626885317518e-05, + "loss": 4.6365, + "step": 17023 + }, + { + "epoch": 0.10124655057569702, + "grad_norm": 1.4522144794464111, + "learning_rate": 4.8746122785633885e-05, + "loss": 4.8943, + "step": 17024 + }, + { + "epoch": 0.10125249785897802, + "grad_norm": 1.6087841987609863, + "learning_rate": 4.8745976709803064e-05, + "loss": 4.81, + "step": 17025 + }, + { + "epoch": 0.10125844514225901, + "grad_norm": 1.424810767173767, + "learning_rate": 4.8745830625682766e-05, + "loss": 4.8699, + "step": 17026 + }, + { + "epoch": 0.10126439242554001, + "grad_norm": 1.3316916227340698, + "learning_rate": 4.874568453327304e-05, + "loss": 5.0084, + "step": 17027 + }, + { + "epoch": 0.10127033970882102, + "grad_norm": 1.549833059310913, + "learning_rate": 4.8745538432573946e-05, + "loss": 4.748, + "step": 17028 + }, + { + "epoch": 0.101276286992102, + "grad_norm": 1.294263482093811, + "learning_rate": 4.874539232358553e-05, + "loss": 4.8004, + "step": 17029 + }, + { + "epoch": 0.101282234275383, + "grad_norm": 1.5209519863128662, + "learning_rate": 4.8745246206307845e-05, + "loss": 4.8187, + "step": 17030 + }, + { + "epoch": 0.10128818155866401, + "grad_norm": 1.5805583000183105, + "learning_rate": 4.874510008074094e-05, + "loss": 4.7126, + "step": 17031 + }, + { + "epoch": 0.101294128841945, + "grad_norm": 1.473693609237671, + "learning_rate": 4.8744953946884864e-05, + "loss": 4.86, + "step": 17032 + }, + { + "epoch": 0.101300076125226, + "grad_norm": 1.6662403345108032, + "learning_rate": 4.8744807804739664e-05, + "loss": 4.8903, + "step": 17033 + }, + { + "epoch": 0.10130602340850699, + "grad_norm": 1.5269529819488525, + "learning_rate": 4.87446616543054e-05, + "loss": 5.1061, + "step": 17034 + }, + { + "epoch": 0.10131197069178799, + "grad_norm": 1.3940715789794922, + "learning_rate": 4.8744515495582127e-05, + "loss": 5.3221, + "step": 17035 + }, + { + "epoch": 0.10131791797506899, + "grad_norm": 1.4603626728057861, + "learning_rate": 4.874436932856988e-05, + "loss": 5.2562, + "step": 17036 + }, + { + "epoch": 0.10132386525834998, + "grad_norm": 1.4601393938064575, + "learning_rate": 4.874422315326873e-05, + "loss": 5.1297, + "step": 17037 + }, + { + "epoch": 0.10132981254163098, + "grad_norm": 1.3284024000167847, + "learning_rate": 4.874407696967871e-05, + "loss": 5.2209, + "step": 17038 + }, + { + "epoch": 0.10133575982491198, + "grad_norm": 1.1924611330032349, + "learning_rate": 4.874393077779987e-05, + "loss": 5.265, + "step": 17039 + }, + { + "epoch": 0.10134170710819297, + "grad_norm": 1.1306421756744385, + "learning_rate": 4.874378457763228e-05, + "loss": 5.1637, + "step": 17040 + }, + { + "epoch": 0.10134765439147397, + "grad_norm": 1.414591908454895, + "learning_rate": 4.874363836917598e-05, + "loss": 5.1238, + "step": 17041 + }, + { + "epoch": 0.10135360167475498, + "grad_norm": 1.245263934135437, + "learning_rate": 4.8743492152431016e-05, + "loss": 5.1779, + "step": 17042 + }, + { + "epoch": 0.10135954895803596, + "grad_norm": 1.363484501838684, + "learning_rate": 4.874334592739745e-05, + "loss": 5.1328, + "step": 17043 + }, + { + "epoch": 0.10136549624131697, + "grad_norm": 1.3666833639144897, + "learning_rate": 4.8743199694075326e-05, + "loss": 5.2547, + "step": 17044 + }, + { + "epoch": 0.10137144352459797, + "grad_norm": 1.3848010301589966, + "learning_rate": 4.8743053452464694e-05, + "loss": 5.2745, + "step": 17045 + }, + { + "epoch": 0.10137739080787896, + "grad_norm": 1.4478403329849243, + "learning_rate": 4.87429072025656e-05, + "loss": 5.2069, + "step": 17046 + }, + { + "epoch": 0.10138333809115996, + "grad_norm": 1.5361924171447754, + "learning_rate": 4.8742760944378115e-05, + "loss": 5.1721, + "step": 17047 + }, + { + "epoch": 0.10138928537444096, + "grad_norm": 1.549049973487854, + "learning_rate": 4.874261467790227e-05, + "loss": 5.2525, + "step": 17048 + }, + { + "epoch": 0.10139523265772195, + "grad_norm": 1.484999656677246, + "learning_rate": 4.874246840313813e-05, + "loss": 5.2433, + "step": 17049 + }, + { + "epoch": 0.10140117994100295, + "grad_norm": 1.58607017993927, + "learning_rate": 4.8742322120085734e-05, + "loss": 4.9631, + "step": 17050 + }, + { + "epoch": 0.10140712722428395, + "grad_norm": 1.1922807693481445, + "learning_rate": 4.874217582874514e-05, + "loss": 5.1917, + "step": 17051 + }, + { + "epoch": 0.10141307450756494, + "grad_norm": 1.1538786888122559, + "learning_rate": 4.87420295291164e-05, + "loss": 5.0231, + "step": 17052 + }, + { + "epoch": 0.10141902179084594, + "grad_norm": 1.302758812904358, + "learning_rate": 4.874188322119956e-05, + "loss": 5.0292, + "step": 17053 + }, + { + "epoch": 0.10142496907412694, + "grad_norm": 1.2432395219802856, + "learning_rate": 4.874173690499467e-05, + "loss": 5.1671, + "step": 17054 + }, + { + "epoch": 0.10143091635740793, + "grad_norm": 1.3793164491653442, + "learning_rate": 4.8741590580501786e-05, + "loss": 5.2231, + "step": 17055 + }, + { + "epoch": 0.10143686364068893, + "grad_norm": 1.3487818241119385, + "learning_rate": 4.8741444247720966e-05, + "loss": 5.0464, + "step": 17056 + }, + { + "epoch": 0.10144281092396994, + "grad_norm": 1.512860894203186, + "learning_rate": 4.874129790665225e-05, + "loss": 4.8973, + "step": 17057 + }, + { + "epoch": 0.10144875820725092, + "grad_norm": 1.6202374696731567, + "learning_rate": 4.874115155729569e-05, + "loss": 5.0055, + "step": 17058 + }, + { + "epoch": 0.10145470549053193, + "grad_norm": 1.3453385829925537, + "learning_rate": 4.874100519965134e-05, + "loss": 4.7808, + "step": 17059 + }, + { + "epoch": 0.10146065277381293, + "grad_norm": 1.4613635540008545, + "learning_rate": 4.874085883371925e-05, + "loss": 4.8073, + "step": 17060 + }, + { + "epoch": 0.10146660005709392, + "grad_norm": 1.3086074590682983, + "learning_rate": 4.874071245949947e-05, + "loss": 4.9751, + "step": 17061 + }, + { + "epoch": 0.10147254734037492, + "grad_norm": 1.454784631729126, + "learning_rate": 4.8740566076992055e-05, + "loss": 5.2422, + "step": 17062 + }, + { + "epoch": 0.10147849462365591, + "grad_norm": 1.3406941890716553, + "learning_rate": 4.8740419686197054e-05, + "loss": 5.2342, + "step": 17063 + }, + { + "epoch": 0.10148444190693691, + "grad_norm": 1.3241393566131592, + "learning_rate": 4.8740273287114514e-05, + "loss": 5.2168, + "step": 17064 + }, + { + "epoch": 0.10149038919021791, + "grad_norm": 1.2292134761810303, + "learning_rate": 4.8740126879744495e-05, + "loss": 5.171, + "step": 17065 + }, + { + "epoch": 0.1014963364734989, + "grad_norm": 1.395484209060669, + "learning_rate": 4.8739980464087044e-05, + "loss": 5.1782, + "step": 17066 + }, + { + "epoch": 0.1015022837567799, + "grad_norm": 1.8667857646942139, + "learning_rate": 4.87398340401422e-05, + "loss": 5.7113, + "step": 17067 + }, + { + "epoch": 0.1015082310400609, + "grad_norm": 1.4775335788726807, + "learning_rate": 4.873968760791003e-05, + "loss": 5.2518, + "step": 17068 + }, + { + "epoch": 0.10151417832334189, + "grad_norm": 1.5058828592300415, + "learning_rate": 4.873954116739059e-05, + "loss": 5.3249, + "step": 17069 + }, + { + "epoch": 0.1015201256066229, + "grad_norm": 1.4806468486785889, + "learning_rate": 4.873939471858391e-05, + "loss": 5.1119, + "step": 17070 + }, + { + "epoch": 0.1015260728899039, + "grad_norm": 1.3866868019104004, + "learning_rate": 4.873924826149006e-05, + "loss": 5.1709, + "step": 17071 + }, + { + "epoch": 0.10153202017318488, + "grad_norm": 1.2337566614151, + "learning_rate": 4.8739101796109074e-05, + "loss": 5.2346, + "step": 17072 + }, + { + "epoch": 0.10153796745646589, + "grad_norm": 1.5977396965026855, + "learning_rate": 4.873895532244103e-05, + "loss": 5.4213, + "step": 17073 + }, + { + "epoch": 0.10154391473974689, + "grad_norm": 1.343363642692566, + "learning_rate": 4.873880884048595e-05, + "loss": 5.2865, + "step": 17074 + }, + { + "epoch": 0.10154986202302788, + "grad_norm": 1.4759324789047241, + "learning_rate": 4.87386623502439e-05, + "loss": 5.1743, + "step": 17075 + }, + { + "epoch": 0.10155580930630888, + "grad_norm": 1.2113150358200073, + "learning_rate": 4.873851585171493e-05, + "loss": 5.2218, + "step": 17076 + }, + { + "epoch": 0.10156175658958988, + "grad_norm": 1.3962153196334839, + "learning_rate": 4.873836934489908e-05, + "loss": 5.1031, + "step": 17077 + }, + { + "epoch": 0.10156770387287087, + "grad_norm": 1.410144329071045, + "learning_rate": 4.8738222829796424e-05, + "loss": 5.0662, + "step": 17078 + }, + { + "epoch": 0.10157365115615187, + "grad_norm": 1.224947452545166, + "learning_rate": 4.873807630640699e-05, + "loss": 5.1583, + "step": 17079 + }, + { + "epoch": 0.10157959843943287, + "grad_norm": 1.401877522468567, + "learning_rate": 4.873792977473084e-05, + "loss": 5.2688, + "step": 17080 + }, + { + "epoch": 0.10158554572271386, + "grad_norm": 1.3576874732971191, + "learning_rate": 4.873778323476802e-05, + "loss": 5.037, + "step": 17081 + }, + { + "epoch": 0.10159149300599486, + "grad_norm": 1.226619839668274, + "learning_rate": 4.8737636686518595e-05, + "loss": 5.0502, + "step": 17082 + }, + { + "epoch": 0.10159744028927586, + "grad_norm": 1.2307099103927612, + "learning_rate": 4.87374901299826e-05, + "loss": 5.0855, + "step": 17083 + }, + { + "epoch": 0.10160338757255685, + "grad_norm": 1.1481422185897827, + "learning_rate": 4.873734356516009e-05, + "loss": 5.2114, + "step": 17084 + }, + { + "epoch": 0.10160933485583785, + "grad_norm": 1.4645094871520996, + "learning_rate": 4.873719699205113e-05, + "loss": 5.1432, + "step": 17085 + }, + { + "epoch": 0.10161528213911886, + "grad_norm": 1.3309158086776733, + "learning_rate": 4.873705041065575e-05, + "loss": 5.1557, + "step": 17086 + }, + { + "epoch": 0.10162122942239984, + "grad_norm": 1.2546007633209229, + "learning_rate": 4.873690382097401e-05, + "loss": 5.324, + "step": 17087 + }, + { + "epoch": 0.10162717670568085, + "grad_norm": 1.33823561668396, + "learning_rate": 4.873675722300597e-05, + "loss": 5.1773, + "step": 17088 + }, + { + "epoch": 0.10163312398896185, + "grad_norm": 1.3027381896972656, + "learning_rate": 4.873661061675166e-05, + "loss": 5.4172, + "step": 17089 + }, + { + "epoch": 0.10163907127224284, + "grad_norm": 1.3852121829986572, + "learning_rate": 4.873646400221116e-05, + "loss": 5.1655, + "step": 17090 + }, + { + "epoch": 0.10164501855552384, + "grad_norm": 1.4345825910568237, + "learning_rate": 4.87363173793845e-05, + "loss": 4.9941, + "step": 17091 + }, + { + "epoch": 0.10165096583880483, + "grad_norm": 1.4016261100769043, + "learning_rate": 4.873617074827173e-05, + "loss": 4.9657, + "step": 17092 + }, + { + "epoch": 0.10165691312208583, + "grad_norm": 1.339082956314087, + "learning_rate": 4.8736024108872914e-05, + "loss": 5.0075, + "step": 17093 + }, + { + "epoch": 0.10166286040536683, + "grad_norm": 1.3223985433578491, + "learning_rate": 4.8735877461188094e-05, + "loss": 4.9656, + "step": 17094 + }, + { + "epoch": 0.10166880768864782, + "grad_norm": 1.4618138074874878, + "learning_rate": 4.8735730805217326e-05, + "loss": 5.0158, + "step": 17095 + }, + { + "epoch": 0.10167475497192882, + "grad_norm": 1.4075788259506226, + "learning_rate": 4.8735584140960666e-05, + "loss": 5.3668, + "step": 17096 + }, + { + "epoch": 0.10168070225520982, + "grad_norm": 1.2219016551971436, + "learning_rate": 4.873543746841815e-05, + "loss": 5.3549, + "step": 17097 + }, + { + "epoch": 0.10168664953849081, + "grad_norm": 1.4344584941864014, + "learning_rate": 4.873529078758985e-05, + "loss": 5.2044, + "step": 17098 + }, + { + "epoch": 0.10169259682177181, + "grad_norm": 1.3579001426696777, + "learning_rate": 4.8735144098475794e-05, + "loss": 5.1071, + "step": 17099 + }, + { + "epoch": 0.10169854410505282, + "grad_norm": 1.4645969867706299, + "learning_rate": 4.873499740107604e-05, + "loss": 5.0359, + "step": 17100 + }, + { + "epoch": 0.1017044913883338, + "grad_norm": 1.6800013780593872, + "learning_rate": 4.8734850695390654e-05, + "loss": 5.2085, + "step": 17101 + }, + { + "epoch": 0.1017104386716148, + "grad_norm": 1.678339958190918, + "learning_rate": 4.873470398141968e-05, + "loss": 5.1671, + "step": 17102 + }, + { + "epoch": 0.10171638595489581, + "grad_norm": 1.6498647928237915, + "learning_rate": 4.873455725916316e-05, + "loss": 5.2105, + "step": 17103 + }, + { + "epoch": 0.1017223332381768, + "grad_norm": 1.522147297859192, + "learning_rate": 4.873441052862115e-05, + "loss": 5.1215, + "step": 17104 + }, + { + "epoch": 0.1017282805214578, + "grad_norm": 1.3335652351379395, + "learning_rate": 4.87342637897937e-05, + "loss": 5.2504, + "step": 17105 + }, + { + "epoch": 0.1017342278047388, + "grad_norm": 1.1647717952728271, + "learning_rate": 4.873411704268087e-05, + "loss": 5.3183, + "step": 17106 + }, + { + "epoch": 0.10174017508801979, + "grad_norm": 1.3210188150405884, + "learning_rate": 4.8733970287282706e-05, + "loss": 5.399, + "step": 17107 + }, + { + "epoch": 0.10174612237130079, + "grad_norm": 1.2331137657165527, + "learning_rate": 4.873382352359925e-05, + "loss": 5.2521, + "step": 17108 + }, + { + "epoch": 0.10175206965458179, + "grad_norm": 1.245252251625061, + "learning_rate": 4.873367675163056e-05, + "loss": 5.2092, + "step": 17109 + }, + { + "epoch": 0.10175801693786278, + "grad_norm": 1.3423751592636108, + "learning_rate": 4.87335299713767e-05, + "loss": 4.918, + "step": 17110 + }, + { + "epoch": 0.10176396422114378, + "grad_norm": 1.8670060634613037, + "learning_rate": 4.87333831828377e-05, + "loss": 4.6559, + "step": 17111 + }, + { + "epoch": 0.10176991150442478, + "grad_norm": 1.54763925075531, + "learning_rate": 4.873323638601363e-05, + "loss": 5.2565, + "step": 17112 + }, + { + "epoch": 0.10177585878770577, + "grad_norm": 1.134102702140808, + "learning_rate": 4.8733089580904525e-05, + "loss": 5.2119, + "step": 17113 + }, + { + "epoch": 0.10178180607098677, + "grad_norm": 1.395027756690979, + "learning_rate": 4.873294276751045e-05, + "loss": 5.0732, + "step": 17114 + }, + { + "epoch": 0.10178775335426778, + "grad_norm": 1.104973554611206, + "learning_rate": 4.873279594583144e-05, + "loss": 5.0807, + "step": 17115 + }, + { + "epoch": 0.10179370063754876, + "grad_norm": 1.0554969310760498, + "learning_rate": 4.873264911586757e-05, + "loss": 5.0831, + "step": 17116 + }, + { + "epoch": 0.10179964792082977, + "grad_norm": 1.0598722696304321, + "learning_rate": 4.873250227761887e-05, + "loss": 5.1264, + "step": 17117 + }, + { + "epoch": 0.10180559520411077, + "grad_norm": 1.1047697067260742, + "learning_rate": 4.8732355431085395e-05, + "loss": 5.0687, + "step": 17118 + }, + { + "epoch": 0.10181154248739176, + "grad_norm": 1.5564457178115845, + "learning_rate": 4.87322085762672e-05, + "loss": 5.0063, + "step": 17119 + }, + { + "epoch": 0.10181748977067276, + "grad_norm": 1.5218400955200195, + "learning_rate": 4.8732061713164344e-05, + "loss": 5.3785, + "step": 17120 + }, + { + "epoch": 0.10182343705395375, + "grad_norm": 1.3067396879196167, + "learning_rate": 4.873191484177686e-05, + "loss": 5.4108, + "step": 17121 + }, + { + "epoch": 0.10182938433723475, + "grad_norm": 1.4401333332061768, + "learning_rate": 4.873176796210482e-05, + "loss": 5.5251, + "step": 17122 + }, + { + "epoch": 0.10183533162051575, + "grad_norm": 1.0483810901641846, + "learning_rate": 4.873162107414826e-05, + "loss": 5.4983, + "step": 17123 + }, + { + "epoch": 0.10184127890379674, + "grad_norm": 1.2637344598770142, + "learning_rate": 4.8731474177907244e-05, + "loss": 5.4487, + "step": 17124 + }, + { + "epoch": 0.10184722618707774, + "grad_norm": 1.314834475517273, + "learning_rate": 4.873132727338181e-05, + "loss": 5.228, + "step": 17125 + }, + { + "epoch": 0.10185317347035874, + "grad_norm": 1.354665756225586, + "learning_rate": 4.8731180360572e-05, + "loss": 5.3908, + "step": 17126 + }, + { + "epoch": 0.10185912075363973, + "grad_norm": 1.3690662384033203, + "learning_rate": 4.87310334394779e-05, + "loss": 5.0955, + "step": 17127 + }, + { + "epoch": 0.10186506803692073, + "grad_norm": 1.5240978002548218, + "learning_rate": 4.873088651009954e-05, + "loss": 5.2838, + "step": 17128 + }, + { + "epoch": 0.10187101532020174, + "grad_norm": 1.147658109664917, + "learning_rate": 4.8730739572436966e-05, + "loss": 5.3074, + "step": 17129 + }, + { + "epoch": 0.10187696260348272, + "grad_norm": 1.3384162187576294, + "learning_rate": 4.8730592626490235e-05, + "loss": 5.3677, + "step": 17130 + }, + { + "epoch": 0.10188290988676373, + "grad_norm": 1.3388500213623047, + "learning_rate": 4.87304456722594e-05, + "loss": 5.3151, + "step": 17131 + }, + { + "epoch": 0.10188885717004473, + "grad_norm": 1.215617060661316, + "learning_rate": 4.873029870974452e-05, + "loss": 4.9182, + "step": 17132 + }, + { + "epoch": 0.10189480445332572, + "grad_norm": 1.2983050346374512, + "learning_rate": 4.873015173894563e-05, + "loss": 5.142, + "step": 17133 + }, + { + "epoch": 0.10190075173660672, + "grad_norm": 1.3918750286102295, + "learning_rate": 4.873000475986279e-05, + "loss": 5.0548, + "step": 17134 + }, + { + "epoch": 0.10190669901988772, + "grad_norm": 1.3934828042984009, + "learning_rate": 4.8729857772496045e-05, + "loss": 5.1319, + "step": 17135 + }, + { + "epoch": 0.10191264630316871, + "grad_norm": 1.32583487033844, + "learning_rate": 4.872971077684546e-05, + "loss": 5.2762, + "step": 17136 + }, + { + "epoch": 0.10191859358644971, + "grad_norm": 1.295102834701538, + "learning_rate": 4.872956377291108e-05, + "loss": 5.2338, + "step": 17137 + }, + { + "epoch": 0.10192454086973071, + "grad_norm": 1.2840588092803955, + "learning_rate": 4.8729416760692946e-05, + "loss": 5.3957, + "step": 17138 + }, + { + "epoch": 0.1019304881530117, + "grad_norm": 1.371270775794983, + "learning_rate": 4.872926974019112e-05, + "loss": 5.5933, + "step": 17139 + }, + { + "epoch": 0.1019364354362927, + "grad_norm": 1.380387783050537, + "learning_rate": 4.872912271140565e-05, + "loss": 5.6628, + "step": 17140 + }, + { + "epoch": 0.1019423827195737, + "grad_norm": 1.3120551109313965, + "learning_rate": 4.8728975674336596e-05, + "loss": 5.6424, + "step": 17141 + }, + { + "epoch": 0.10194833000285469, + "grad_norm": 1.3965035676956177, + "learning_rate": 4.8728828628984003e-05, + "loss": 5.5413, + "step": 17142 + }, + { + "epoch": 0.1019542772861357, + "grad_norm": 1.5870885848999023, + "learning_rate": 4.872868157534791e-05, + "loss": 5.1952, + "step": 17143 + }, + { + "epoch": 0.1019602245694167, + "grad_norm": 1.584633231163025, + "learning_rate": 4.872853451342839e-05, + "loss": 5.1045, + "step": 17144 + }, + { + "epoch": 0.10196617185269768, + "grad_norm": 1.5781641006469727, + "learning_rate": 4.872838744322548e-05, + "loss": 4.9581, + "step": 17145 + }, + { + "epoch": 0.10197211913597869, + "grad_norm": 1.3683301210403442, + "learning_rate": 4.872824036473923e-05, + "loss": 4.9931, + "step": 17146 + }, + { + "epoch": 0.10197806641925969, + "grad_norm": 1.4182472229003906, + "learning_rate": 4.87280932779697e-05, + "loss": 4.7815, + "step": 17147 + }, + { + "epoch": 0.10198401370254068, + "grad_norm": 1.464609146118164, + "learning_rate": 4.872794618291694e-05, + "loss": 4.9158, + "step": 17148 + }, + { + "epoch": 0.10198996098582168, + "grad_norm": 1.4733667373657227, + "learning_rate": 4.872779907958099e-05, + "loss": 5.069, + "step": 17149 + }, + { + "epoch": 0.10199590826910268, + "grad_norm": 1.4454584121704102, + "learning_rate": 4.872765196796192e-05, + "loss": 5.1131, + "step": 17150 + }, + { + "epoch": 0.10200185555238367, + "grad_norm": 1.6175665855407715, + "learning_rate": 4.872750484805977e-05, + "loss": 4.9432, + "step": 17151 + }, + { + "epoch": 0.10200780283566467, + "grad_norm": 1.378569483757019, + "learning_rate": 4.872735771987459e-05, + "loss": 4.9243, + "step": 17152 + }, + { + "epoch": 0.10201375011894566, + "grad_norm": 1.452481985092163, + "learning_rate": 4.872721058340644e-05, + "loss": 4.8421, + "step": 17153 + }, + { + "epoch": 0.10201969740222666, + "grad_norm": 1.8265782594680786, + "learning_rate": 4.872706343865536e-05, + "loss": 5.2555, + "step": 17154 + }, + { + "epoch": 0.10202564468550766, + "grad_norm": 1.6913262605667114, + "learning_rate": 4.8726916285621414e-05, + "loss": 5.3829, + "step": 17155 + }, + { + "epoch": 0.10203159196878865, + "grad_norm": 1.6480923891067505, + "learning_rate": 4.8726769124304644e-05, + "loss": 5.4168, + "step": 17156 + }, + { + "epoch": 0.10203753925206965, + "grad_norm": 1.702602744102478, + "learning_rate": 4.8726621954705105e-05, + "loss": 5.4045, + "step": 17157 + }, + { + "epoch": 0.10204348653535066, + "grad_norm": 1.749205470085144, + "learning_rate": 4.8726474776822844e-05, + "loss": 5.5886, + "step": 17158 + }, + { + "epoch": 0.10204943381863164, + "grad_norm": 1.927309274673462, + "learning_rate": 4.8726327590657916e-05, + "loss": 5.5547, + "step": 17159 + }, + { + "epoch": 0.10205538110191265, + "grad_norm": 1.6493511199951172, + "learning_rate": 4.8726180396210374e-05, + "loss": 5.6764, + "step": 17160 + }, + { + "epoch": 0.10206132838519365, + "grad_norm": 1.7083081007003784, + "learning_rate": 4.8726033193480266e-05, + "loss": 5.5823, + "step": 17161 + }, + { + "epoch": 0.10206727566847464, + "grad_norm": 1.7882472276687622, + "learning_rate": 4.872588598246765e-05, + "loss": 5.4388, + "step": 17162 + }, + { + "epoch": 0.10207322295175564, + "grad_norm": 1.6043784618377686, + "learning_rate": 4.872573876317257e-05, + "loss": 5.6816, + "step": 17163 + }, + { + "epoch": 0.10207917023503664, + "grad_norm": 1.3449418544769287, + "learning_rate": 4.872559153559507e-05, + "loss": 5.5661, + "step": 17164 + }, + { + "epoch": 0.10208511751831763, + "grad_norm": 1.7593882083892822, + "learning_rate": 4.8725444299735226e-05, + "loss": 4.95, + "step": 17165 + }, + { + "epoch": 0.10209106480159863, + "grad_norm": 1.8593993186950684, + "learning_rate": 4.872529705559307e-05, + "loss": 5.3296, + "step": 17166 + }, + { + "epoch": 0.10209701208487963, + "grad_norm": 1.7530159950256348, + "learning_rate": 4.872514980316865e-05, + "loss": 5.4378, + "step": 17167 + }, + { + "epoch": 0.10210295936816062, + "grad_norm": 1.7487550973892212, + "learning_rate": 4.872500254246203e-05, + "loss": 5.3435, + "step": 17168 + }, + { + "epoch": 0.10210890665144162, + "grad_norm": 1.7868090867996216, + "learning_rate": 4.8724855273473256e-05, + "loss": 5.2266, + "step": 17169 + }, + { + "epoch": 0.10211485393472262, + "grad_norm": 1.6116459369659424, + "learning_rate": 4.872470799620238e-05, + "loss": 5.2394, + "step": 17170 + }, + { + "epoch": 0.10212080121800361, + "grad_norm": 1.6221721172332764, + "learning_rate": 4.872456071064946e-05, + "loss": 5.823, + "step": 17171 + }, + { + "epoch": 0.10212674850128461, + "grad_norm": 1.462540626525879, + "learning_rate": 4.872441341681454e-05, + "loss": 5.8816, + "step": 17172 + }, + { + "epoch": 0.10213269578456562, + "grad_norm": 1.3804352283477783, + "learning_rate": 4.872426611469766e-05, + "loss": 5.7982, + "step": 17173 + }, + { + "epoch": 0.1021386430678466, + "grad_norm": 1.7873106002807617, + "learning_rate": 4.872411880429889e-05, + "loss": 5.0282, + "step": 17174 + }, + { + "epoch": 0.1021445903511276, + "grad_norm": 1.9154506921768188, + "learning_rate": 4.8723971485618284e-05, + "loss": 4.8535, + "step": 17175 + }, + { + "epoch": 0.10215053763440861, + "grad_norm": 1.865502953529358, + "learning_rate": 4.872382415865587e-05, + "loss": 5.5282, + "step": 17176 + }, + { + "epoch": 0.1021564849176896, + "grad_norm": 1.8683371543884277, + "learning_rate": 4.872367682341173e-05, + "loss": 5.2973, + "step": 17177 + }, + { + "epoch": 0.1021624322009706, + "grad_norm": 1.8488374948501587, + "learning_rate": 4.872352947988589e-05, + "loss": 5.4094, + "step": 17178 + }, + { + "epoch": 0.1021683794842516, + "grad_norm": 1.6702567338943481, + "learning_rate": 4.872338212807841e-05, + "loss": 5.5705, + "step": 17179 + }, + { + "epoch": 0.10217432676753259, + "grad_norm": 1.6559606790542603, + "learning_rate": 4.8723234767989345e-05, + "loss": 5.6637, + "step": 17180 + }, + { + "epoch": 0.10218027405081359, + "grad_norm": 1.523253321647644, + "learning_rate": 4.872308739961875e-05, + "loss": 5.4033, + "step": 17181 + }, + { + "epoch": 0.10218622133409458, + "grad_norm": 1.4300789833068848, + "learning_rate": 4.8722940022966665e-05, + "loss": 5.7568, + "step": 17182 + }, + { + "epoch": 0.10219216861737558, + "grad_norm": 1.5076279640197754, + "learning_rate": 4.872279263803314e-05, + "loss": 4.9469, + "step": 17183 + }, + { + "epoch": 0.10219811590065658, + "grad_norm": 1.721596598625183, + "learning_rate": 4.872264524481824e-05, + "loss": 5.1595, + "step": 17184 + }, + { + "epoch": 0.10220406318393757, + "grad_norm": 1.5876305103302002, + "learning_rate": 4.872249784332201e-05, + "loss": 4.9964, + "step": 17185 + }, + { + "epoch": 0.10221001046721857, + "grad_norm": 1.6709486246109009, + "learning_rate": 4.87223504335445e-05, + "loss": 5.0299, + "step": 17186 + }, + { + "epoch": 0.10221595775049958, + "grad_norm": 1.586411952972412, + "learning_rate": 4.872220301548576e-05, + "loss": 4.9945, + "step": 17187 + }, + { + "epoch": 0.10222190503378056, + "grad_norm": 1.541045069694519, + "learning_rate": 4.872205558914585e-05, + "loss": 4.8789, + "step": 17188 + }, + { + "epoch": 0.10222785231706157, + "grad_norm": 1.8977370262145996, + "learning_rate": 4.872190815452481e-05, + "loss": 4.849, + "step": 17189 + }, + { + "epoch": 0.10223379960034257, + "grad_norm": 1.7448357343673706, + "learning_rate": 4.87217607116227e-05, + "loss": 4.7961, + "step": 17190 + }, + { + "epoch": 0.10223974688362356, + "grad_norm": 1.7249553203582764, + "learning_rate": 4.872161326043957e-05, + "loss": 4.7988, + "step": 17191 + }, + { + "epoch": 0.10224569416690456, + "grad_norm": 1.6894437074661255, + "learning_rate": 4.8721465800975465e-05, + "loss": 4.6713, + "step": 17192 + }, + { + "epoch": 0.10225164145018556, + "grad_norm": 1.5226197242736816, + "learning_rate": 4.8721318333230446e-05, + "loss": 4.8233, + "step": 17193 + }, + { + "epoch": 0.10225758873346655, + "grad_norm": 1.6511256694793701, + "learning_rate": 4.8721170857204554e-05, + "loss": 5.177, + "step": 17194 + }, + { + "epoch": 0.10226353601674755, + "grad_norm": 1.8213993310928345, + "learning_rate": 4.872102337289785e-05, + "loss": 5.2472, + "step": 17195 + }, + { + "epoch": 0.10226948330002855, + "grad_norm": 1.6683803796768188, + "learning_rate": 4.872087588031038e-05, + "loss": 4.7902, + "step": 17196 + }, + { + "epoch": 0.10227543058330954, + "grad_norm": 1.5809015035629272, + "learning_rate": 4.8720728379442204e-05, + "loss": 4.6288, + "step": 17197 + }, + { + "epoch": 0.10228137786659054, + "grad_norm": 1.7978498935699463, + "learning_rate": 4.872058087029336e-05, + "loss": 4.6638, + "step": 17198 + }, + { + "epoch": 0.10228732514987154, + "grad_norm": 1.74656081199646, + "learning_rate": 4.87204333528639e-05, + "loss": 5.652, + "step": 17199 + }, + { + "epoch": 0.10229327243315253, + "grad_norm": 1.6222811937332153, + "learning_rate": 4.87202858271539e-05, + "loss": 5.3951, + "step": 17200 + }, + { + "epoch": 0.10229921971643353, + "grad_norm": 1.8816531896591187, + "learning_rate": 4.8720138293163374e-05, + "loss": 5.728, + "step": 17201 + }, + { + "epoch": 0.10230516699971454, + "grad_norm": 1.5618531703948975, + "learning_rate": 4.871999075089241e-05, + "loss": 5.7162, + "step": 17202 + }, + { + "epoch": 0.10231111428299552, + "grad_norm": 1.4562182426452637, + "learning_rate": 4.871984320034103e-05, + "loss": 5.7563, + "step": 17203 + }, + { + "epoch": 0.10231706156627653, + "grad_norm": 1.8649898767471313, + "learning_rate": 4.87196956415093e-05, + "loss": 5.6333, + "step": 17204 + }, + { + "epoch": 0.10232300884955753, + "grad_norm": 1.7934935092926025, + "learning_rate": 4.871954807439727e-05, + "loss": 5.5804, + "step": 17205 + }, + { + "epoch": 0.10232895613283852, + "grad_norm": 1.5005213022232056, + "learning_rate": 4.8719400499005e-05, + "loss": 5.2471, + "step": 17206 + }, + { + "epoch": 0.10233490341611952, + "grad_norm": 1.5418996810913086, + "learning_rate": 4.871925291533252e-05, + "loss": 6.0574, + "step": 17207 + }, + { + "epoch": 0.10234085069940052, + "grad_norm": 1.3919132947921753, + "learning_rate": 4.87191053233799e-05, + "loss": 6.0048, + "step": 17208 + }, + { + "epoch": 0.10234679798268151, + "grad_norm": 1.9565762281417847, + "learning_rate": 4.8718957723147184e-05, + "loss": 4.9914, + "step": 17209 + }, + { + "epoch": 0.10235274526596251, + "grad_norm": 2.3950796127319336, + "learning_rate": 4.871881011463442e-05, + "loss": 5.7963, + "step": 17210 + }, + { + "epoch": 0.1023586925492435, + "grad_norm": 2.0693960189819336, + "learning_rate": 4.871866249784167e-05, + "loss": 5.4641, + "step": 17211 + }, + { + "epoch": 0.1023646398325245, + "grad_norm": 2.105893850326538, + "learning_rate": 4.871851487276898e-05, + "loss": 5.3983, + "step": 17212 + }, + { + "epoch": 0.1023705871158055, + "grad_norm": 2.171363115310669, + "learning_rate": 4.8718367239416404e-05, + "loss": 5.6619, + "step": 17213 + }, + { + "epoch": 0.10237653439908649, + "grad_norm": 2.141611099243164, + "learning_rate": 4.8718219597783984e-05, + "loss": 5.5488, + "step": 17214 + }, + { + "epoch": 0.1023824816823675, + "grad_norm": 1.8755214214324951, + "learning_rate": 4.871807194787178e-05, + "loss": 5.4888, + "step": 17215 + }, + { + "epoch": 0.1023884289656485, + "grad_norm": 2.0865023136138916, + "learning_rate": 4.871792428967984e-05, + "loss": 5.4645, + "step": 17216 + }, + { + "epoch": 0.10239437624892948, + "grad_norm": 1.9486721754074097, + "learning_rate": 4.871777662320823e-05, + "loss": 5.4057, + "step": 17217 + }, + { + "epoch": 0.10240032353221049, + "grad_norm": 2.109412670135498, + "learning_rate": 4.8717628948456976e-05, + "loss": 5.3768, + "step": 17218 + }, + { + "epoch": 0.10240627081549149, + "grad_norm": 2.202826499938965, + "learning_rate": 4.871748126542615e-05, + "loss": 5.4996, + "step": 17219 + }, + { + "epoch": 0.10241221809877248, + "grad_norm": 1.8646687269210815, + "learning_rate": 4.87173335741158e-05, + "loss": 5.5151, + "step": 17220 + }, + { + "epoch": 0.10241816538205348, + "grad_norm": 1.7966501712799072, + "learning_rate": 4.8717185874525964e-05, + "loss": 5.5548, + "step": 17221 + }, + { + "epoch": 0.10242411266533448, + "grad_norm": 1.9538966417312622, + "learning_rate": 4.8717038166656706e-05, + "loss": 5.6221, + "step": 17222 + }, + { + "epoch": 0.10243005994861547, + "grad_norm": 1.6085959672927856, + "learning_rate": 4.871689045050808e-05, + "loss": 5.2468, + "step": 17223 + }, + { + "epoch": 0.10243600723189647, + "grad_norm": 1.7573461532592773, + "learning_rate": 4.871674272608012e-05, + "loss": 5.5835, + "step": 17224 + }, + { + "epoch": 0.10244195451517747, + "grad_norm": 1.8237701654434204, + "learning_rate": 4.87165949933729e-05, + "loss": 5.3537, + "step": 17225 + }, + { + "epoch": 0.10244790179845846, + "grad_norm": 1.963970422744751, + "learning_rate": 4.8716447252386465e-05, + "loss": 5.5714, + "step": 17226 + }, + { + "epoch": 0.10245384908173946, + "grad_norm": 2.0216476917266846, + "learning_rate": 4.871629950312086e-05, + "loss": 5.4889, + "step": 17227 + }, + { + "epoch": 0.10245979636502046, + "grad_norm": 2.0271217823028564, + "learning_rate": 4.871615174557614e-05, + "loss": 5.5903, + "step": 17228 + }, + { + "epoch": 0.10246574364830145, + "grad_norm": 1.7717560529708862, + "learning_rate": 4.871600397975236e-05, + "loss": 5.3989, + "step": 17229 + }, + { + "epoch": 0.10247169093158245, + "grad_norm": 1.722076416015625, + "learning_rate": 4.8715856205649556e-05, + "loss": 5.526, + "step": 17230 + }, + { + "epoch": 0.10247763821486346, + "grad_norm": 2.124905586242676, + "learning_rate": 4.8715708423267805e-05, + "loss": 5.3835, + "step": 17231 + }, + { + "epoch": 0.10248358549814444, + "grad_norm": 2.2088522911071777, + "learning_rate": 4.8715560632607135e-05, + "loss": 5.5228, + "step": 17232 + }, + { + "epoch": 0.10248953278142545, + "grad_norm": 2.0236847400665283, + "learning_rate": 4.871541283366761e-05, + "loss": 5.3851, + "step": 17233 + }, + { + "epoch": 0.10249548006470645, + "grad_norm": 1.7546913623809814, + "learning_rate": 4.871526502644928e-05, + "loss": 5.2, + "step": 17234 + }, + { + "epoch": 0.10250142734798744, + "grad_norm": 1.9796072244644165, + "learning_rate": 4.87151172109522e-05, + "loss": 5.3873, + "step": 17235 + }, + { + "epoch": 0.10250737463126844, + "grad_norm": 1.5305960178375244, + "learning_rate": 4.8714969387176414e-05, + "loss": 5.1888, + "step": 17236 + }, + { + "epoch": 0.10251332191454944, + "grad_norm": 2.007124185562134, + "learning_rate": 4.871482155512198e-05, + "loss": 5.4024, + "step": 17237 + }, + { + "epoch": 0.10251926919783043, + "grad_norm": 1.8268414735794067, + "learning_rate": 4.871467371478894e-05, + "loss": 5.4289, + "step": 17238 + }, + { + "epoch": 0.10252521648111143, + "grad_norm": 1.9826276302337646, + "learning_rate": 4.871452586617736e-05, + "loss": 5.3222, + "step": 17239 + }, + { + "epoch": 0.10253116376439242, + "grad_norm": 1.7642468214035034, + "learning_rate": 4.8714378009287285e-05, + "loss": 5.3858, + "step": 17240 + }, + { + "epoch": 0.10253711104767342, + "grad_norm": 1.9604185819625854, + "learning_rate": 4.8714230144118764e-05, + "loss": 5.4142, + "step": 17241 + }, + { + "epoch": 0.10254305833095442, + "grad_norm": 2.333829402923584, + "learning_rate": 4.8714082270671844e-05, + "loss": 5.2124, + "step": 17242 + }, + { + "epoch": 0.10254900561423541, + "grad_norm": 1.996928095817566, + "learning_rate": 4.8713934388946593e-05, + "loss": 5.5055, + "step": 17243 + }, + { + "epoch": 0.10255495289751641, + "grad_norm": 2.2702581882476807, + "learning_rate": 4.871378649894304e-05, + "loss": 5.3477, + "step": 17244 + }, + { + "epoch": 0.10256090018079742, + "grad_norm": 1.9696896076202393, + "learning_rate": 4.871363860066126e-05, + "loss": 5.39, + "step": 17245 + }, + { + "epoch": 0.1025668474640784, + "grad_norm": 1.7752536535263062, + "learning_rate": 4.871349069410129e-05, + "loss": 5.326, + "step": 17246 + }, + { + "epoch": 0.1025727947473594, + "grad_norm": 1.798829197883606, + "learning_rate": 4.8713342779263184e-05, + "loss": 5.4066, + "step": 17247 + }, + { + "epoch": 0.10257874203064041, + "grad_norm": 1.975467562675476, + "learning_rate": 4.871319485614699e-05, + "loss": 5.4183, + "step": 17248 + }, + { + "epoch": 0.1025846893139214, + "grad_norm": 2.4021782875061035, + "learning_rate": 4.871304692475277e-05, + "loss": 5.3949, + "step": 17249 + }, + { + "epoch": 0.1025906365972024, + "grad_norm": 1.8973580598831177, + "learning_rate": 4.871289898508058e-05, + "loss": 5.437, + "step": 17250 + }, + { + "epoch": 0.1025965838804834, + "grad_norm": 2.3427937030792236, + "learning_rate": 4.8712751037130446e-05, + "loss": 5.4347, + "step": 17251 + }, + { + "epoch": 0.10260253116376439, + "grad_norm": 1.8699359893798828, + "learning_rate": 4.871260308090245e-05, + "loss": 5.3404, + "step": 17252 + }, + { + "epoch": 0.10260847844704539, + "grad_norm": 2.146106719970703, + "learning_rate": 4.871245511639661e-05, + "loss": 5.3664, + "step": 17253 + }, + { + "epoch": 0.10261442573032639, + "grad_norm": 2.0223419666290283, + "learning_rate": 4.871230714361302e-05, + "loss": 5.4117, + "step": 17254 + }, + { + "epoch": 0.10262037301360738, + "grad_norm": 2.036025047302246, + "learning_rate": 4.871215916255169e-05, + "loss": 5.4349, + "step": 17255 + }, + { + "epoch": 0.10262632029688838, + "grad_norm": 2.0085432529449463, + "learning_rate": 4.87120111732127e-05, + "loss": 5.4896, + "step": 17256 + }, + { + "epoch": 0.10263226758016938, + "grad_norm": 2.088165521621704, + "learning_rate": 4.871186317559609e-05, + "loss": 5.2516, + "step": 17257 + }, + { + "epoch": 0.10263821486345037, + "grad_norm": 1.7493584156036377, + "learning_rate": 4.871171516970191e-05, + "loss": 5.0744, + "step": 17258 + }, + { + "epoch": 0.10264416214673137, + "grad_norm": 1.9395314455032349, + "learning_rate": 4.8711567155530224e-05, + "loss": 5.2783, + "step": 17259 + }, + { + "epoch": 0.10265010943001238, + "grad_norm": 2.057565689086914, + "learning_rate": 4.871141913308107e-05, + "loss": 5.2501, + "step": 17260 + }, + { + "epoch": 0.10265605671329336, + "grad_norm": 2.159641742706299, + "learning_rate": 4.87112711023545e-05, + "loss": 5.2844, + "step": 17261 + }, + { + "epoch": 0.10266200399657437, + "grad_norm": 1.8931914567947388, + "learning_rate": 4.8711123063350575e-05, + "loss": 5.4454, + "step": 17262 + }, + { + "epoch": 0.10266795127985537, + "grad_norm": 1.9728927612304688, + "learning_rate": 4.871097501606934e-05, + "loss": 5.3719, + "step": 17263 + }, + { + "epoch": 0.10267389856313636, + "grad_norm": 1.8770530223846436, + "learning_rate": 4.8710826960510845e-05, + "loss": 5.4244, + "step": 17264 + }, + { + "epoch": 0.10267984584641736, + "grad_norm": 2.072201728820801, + "learning_rate": 4.871067889667516e-05, + "loss": 5.3282, + "step": 17265 + }, + { + "epoch": 0.10268579312969836, + "grad_norm": 2.16689133644104, + "learning_rate": 4.8710530824562304e-05, + "loss": 5.4205, + "step": 17266 + }, + { + "epoch": 0.10269174041297935, + "grad_norm": 2.017695903778076, + "learning_rate": 4.8710382744172354e-05, + "loss": 5.1803, + "step": 17267 + }, + { + "epoch": 0.10269768769626035, + "grad_norm": 1.8181023597717285, + "learning_rate": 4.871023465550535e-05, + "loss": 5.3418, + "step": 17268 + }, + { + "epoch": 0.10270363497954134, + "grad_norm": 1.9661909341812134, + "learning_rate": 4.871008655856136e-05, + "loss": 5.115, + "step": 17269 + }, + { + "epoch": 0.10270958226282234, + "grad_norm": 1.9482250213623047, + "learning_rate": 4.870993845334041e-05, + "loss": 5.0172, + "step": 17270 + }, + { + "epoch": 0.10271552954610334, + "grad_norm": 2.0916497707366943, + "learning_rate": 4.870979033984257e-05, + "loss": 5.4317, + "step": 17271 + }, + { + "epoch": 0.10272147682938433, + "grad_norm": 1.919918417930603, + "learning_rate": 4.8709642218067894e-05, + "loss": 5.3986, + "step": 17272 + }, + { + "epoch": 0.10272742411266533, + "grad_norm": 1.8286259174346924, + "learning_rate": 4.870949408801642e-05, + "loss": 5.1301, + "step": 17273 + }, + { + "epoch": 0.10273337139594634, + "grad_norm": 2.2312278747558594, + "learning_rate": 4.870934594968821e-05, + "loss": 5.0839, + "step": 17274 + }, + { + "epoch": 0.10273931867922732, + "grad_norm": 2.2795724868774414, + "learning_rate": 4.870919780308331e-05, + "loss": 5.3578, + "step": 17275 + }, + { + "epoch": 0.10274526596250833, + "grad_norm": 2.253885269165039, + "learning_rate": 4.870904964820178e-05, + "loss": 5.2482, + "step": 17276 + }, + { + "epoch": 0.10275121324578933, + "grad_norm": 1.9351953268051147, + "learning_rate": 4.870890148504366e-05, + "loss": 5.3657, + "step": 17277 + }, + { + "epoch": 0.10275716052907032, + "grad_norm": 2.072274923324585, + "learning_rate": 4.8708753313609004e-05, + "loss": 5.2433, + "step": 17278 + }, + { + "epoch": 0.10276310781235132, + "grad_norm": 2.0419273376464844, + "learning_rate": 4.8708605133897874e-05, + "loss": 5.27, + "step": 17279 + }, + { + "epoch": 0.10276905509563232, + "grad_norm": 2.156855821609497, + "learning_rate": 4.870845694591031e-05, + "loss": 5.1727, + "step": 17280 + }, + { + "epoch": 0.10277500237891331, + "grad_norm": 1.6552194356918335, + "learning_rate": 4.870830874964637e-05, + "loss": 5.0872, + "step": 17281 + }, + { + "epoch": 0.10278094966219431, + "grad_norm": 1.8167924880981445, + "learning_rate": 4.870816054510611e-05, + "loss": 5.2827, + "step": 17282 + }, + { + "epoch": 0.10278689694547531, + "grad_norm": 2.1617610454559326, + "learning_rate": 4.870801233228956e-05, + "loss": 5.1375, + "step": 17283 + }, + { + "epoch": 0.1027928442287563, + "grad_norm": 1.918817162513733, + "learning_rate": 4.87078641111968e-05, + "loss": 5.2945, + "step": 17284 + }, + { + "epoch": 0.1027987915120373, + "grad_norm": 1.5282881259918213, + "learning_rate": 4.870771588182788e-05, + "loss": 5.6653, + "step": 17285 + }, + { + "epoch": 0.1028047387953183, + "grad_norm": 1.7902590036392212, + "learning_rate": 4.8707567644182825e-05, + "loss": 5.6262, + "step": 17286 + }, + { + "epoch": 0.10281068607859929, + "grad_norm": 1.9451625347137451, + "learning_rate": 4.87074193982617e-05, + "loss": 5.1153, + "step": 17287 + }, + { + "epoch": 0.1028166333618803, + "grad_norm": 1.832401156425476, + "learning_rate": 4.870727114406457e-05, + "loss": 5.2928, + "step": 17288 + }, + { + "epoch": 0.1028225806451613, + "grad_norm": 1.645761251449585, + "learning_rate": 4.870712288159147e-05, + "loss": 5.649, + "step": 17289 + }, + { + "epoch": 0.10282852792844228, + "grad_norm": 1.6721855401992798, + "learning_rate": 4.8706974610842474e-05, + "loss": 5.7568, + "step": 17290 + }, + { + "epoch": 0.10283447521172329, + "grad_norm": 1.7489598989486694, + "learning_rate": 4.87068263318176e-05, + "loss": 5.6752, + "step": 17291 + }, + { + "epoch": 0.10284042249500429, + "grad_norm": 1.505332112312317, + "learning_rate": 4.870667804451693e-05, + "loss": 5.2993, + "step": 17292 + }, + { + "epoch": 0.10284636977828528, + "grad_norm": 1.3620814085006714, + "learning_rate": 4.870652974894049e-05, + "loss": 4.7225, + "step": 17293 + }, + { + "epoch": 0.10285231706156628, + "grad_norm": 2.1685922145843506, + "learning_rate": 4.8706381445088356e-05, + "loss": 4.8737, + "step": 17294 + }, + { + "epoch": 0.10285826434484728, + "grad_norm": 2.219942331314087, + "learning_rate": 4.8706233132960566e-05, + "loss": 5.7529, + "step": 17295 + }, + { + "epoch": 0.10286421162812827, + "grad_norm": 1.928809404373169, + "learning_rate": 4.8706084812557176e-05, + "loss": 5.803, + "step": 17296 + }, + { + "epoch": 0.10287015891140927, + "grad_norm": 1.8534711599349976, + "learning_rate": 4.870593648387823e-05, + "loss": 5.9403, + "step": 17297 + }, + { + "epoch": 0.10287610619469026, + "grad_norm": 2.2624459266662598, + "learning_rate": 4.87057881469238e-05, + "loss": 5.1227, + "step": 17298 + }, + { + "epoch": 0.10288205347797126, + "grad_norm": 2.4320240020751953, + "learning_rate": 4.870563980169391e-05, + "loss": 4.9701, + "step": 17299 + }, + { + "epoch": 0.10288800076125226, + "grad_norm": 2.664921760559082, + "learning_rate": 4.870549144818864e-05, + "loss": 4.8771, + "step": 17300 + }, + { + "epoch": 0.10289394804453325, + "grad_norm": 2.2558987140655518, + "learning_rate": 4.870534308640802e-05, + "loss": 5.0682, + "step": 17301 + }, + { + "epoch": 0.10289989532781425, + "grad_norm": 2.291553258895874, + "learning_rate": 4.870519471635211e-05, + "loss": 4.8481, + "step": 17302 + }, + { + "epoch": 0.10290584261109526, + "grad_norm": 1.9109137058258057, + "learning_rate": 4.870504633802096e-05, + "loss": 5.377, + "step": 17303 + }, + { + "epoch": 0.10291178989437624, + "grad_norm": 1.6809476613998413, + "learning_rate": 4.870489795141463e-05, + "loss": 5.5337, + "step": 17304 + }, + { + "epoch": 0.10291773717765725, + "grad_norm": 1.6410505771636963, + "learning_rate": 4.870474955653316e-05, + "loss": 5.5353, + "step": 17305 + }, + { + "epoch": 0.10292368446093825, + "grad_norm": 1.6310313940048218, + "learning_rate": 4.87046011533766e-05, + "loss": 5.4727, + "step": 17306 + }, + { + "epoch": 0.10292963174421924, + "grad_norm": 1.6450475454330444, + "learning_rate": 4.8704452741945015e-05, + "loss": 5.3677, + "step": 17307 + }, + { + "epoch": 0.10293557902750024, + "grad_norm": 1.7327302694320679, + "learning_rate": 4.870430432223846e-05, + "loss": 5.2964, + "step": 17308 + }, + { + "epoch": 0.10294152631078124, + "grad_norm": 2.837498426437378, + "learning_rate": 4.870415589425696e-05, + "loss": 4.7407, + "step": 17309 + }, + { + "epoch": 0.10294747359406223, + "grad_norm": 2.326399803161621, + "learning_rate": 4.8704007458000593e-05, + "loss": 4.8998, + "step": 17310 + }, + { + "epoch": 0.10295342087734323, + "grad_norm": 1.9505521059036255, + "learning_rate": 4.87038590134694e-05, + "loss": 5.438, + "step": 17311 + }, + { + "epoch": 0.10295936816062423, + "grad_norm": 1.690581202507019, + "learning_rate": 4.870371056066344e-05, + "loss": 5.4291, + "step": 17312 + }, + { + "epoch": 0.10296531544390522, + "grad_norm": 1.9977236986160278, + "learning_rate": 4.870356209958276e-05, + "loss": 5.81, + "step": 17313 + }, + { + "epoch": 0.10297126272718622, + "grad_norm": 1.7996702194213867, + "learning_rate": 4.8703413630227405e-05, + "loss": 5.7569, + "step": 17314 + }, + { + "epoch": 0.10297721001046722, + "grad_norm": 1.7594531774520874, + "learning_rate": 4.870326515259743e-05, + "loss": 5.9367, + "step": 17315 + }, + { + "epoch": 0.10298315729374821, + "grad_norm": 1.8434146642684937, + "learning_rate": 4.870311666669289e-05, + "loss": 5.1578, + "step": 17316 + }, + { + "epoch": 0.10298910457702921, + "grad_norm": 2.531515598297119, + "learning_rate": 4.870296817251385e-05, + "loss": 5.0574, + "step": 17317 + }, + { + "epoch": 0.10299505186031022, + "grad_norm": 2.2126452922821045, + "learning_rate": 4.870281967006034e-05, + "loss": 4.9034, + "step": 17318 + }, + { + "epoch": 0.1030009991435912, + "grad_norm": 2.391558885574341, + "learning_rate": 4.870267115933242e-05, + "loss": 4.9584, + "step": 17319 + }, + { + "epoch": 0.1030069464268722, + "grad_norm": 1.9653453826904297, + "learning_rate": 4.8702522640330145e-05, + "loss": 4.9569, + "step": 17320 + }, + { + "epoch": 0.10301289371015321, + "grad_norm": 2.0124504566192627, + "learning_rate": 4.870237411305356e-05, + "loss": 4.9237, + "step": 17321 + }, + { + "epoch": 0.1030188409934342, + "grad_norm": 1.9120689630508423, + "learning_rate": 4.8702225577502724e-05, + "loss": 4.9637, + "step": 17322 + }, + { + "epoch": 0.1030247882767152, + "grad_norm": 2.108009099960327, + "learning_rate": 4.8702077033677684e-05, + "loss": 4.9479, + "step": 17323 + }, + { + "epoch": 0.1030307355599962, + "grad_norm": 2.211385488510132, + "learning_rate": 4.8701928481578494e-05, + "loss": 4.9553, + "step": 17324 + }, + { + "epoch": 0.10303668284327719, + "grad_norm": 2.1452252864837646, + "learning_rate": 4.8701779921205215e-05, + "loss": 4.7809, + "step": 17325 + }, + { + "epoch": 0.10304263012655819, + "grad_norm": 2.126650810241699, + "learning_rate": 4.8701631352557874e-05, + "loss": 4.7027, + "step": 17326 + }, + { + "epoch": 0.10304857740983918, + "grad_norm": 1.9753129482269287, + "learning_rate": 4.870148277563655e-05, + "loss": 4.8073, + "step": 17327 + }, + { + "epoch": 0.10305452469312018, + "grad_norm": 2.013455867767334, + "learning_rate": 4.8701334190441284e-05, + "loss": 4.7989, + "step": 17328 + }, + { + "epoch": 0.10306047197640118, + "grad_norm": 2.2819676399230957, + "learning_rate": 4.8701185596972124e-05, + "loss": 4.7784, + "step": 17329 + }, + { + "epoch": 0.10306641925968217, + "grad_norm": 2.050511360168457, + "learning_rate": 4.870103699522912e-05, + "loss": 4.9621, + "step": 17330 + }, + { + "epoch": 0.10307236654296317, + "grad_norm": 2.422591209411621, + "learning_rate": 4.870088838521233e-05, + "loss": 4.7558, + "step": 17331 + }, + { + "epoch": 0.10307831382624418, + "grad_norm": 2.2109572887420654, + "learning_rate": 4.870073976692181e-05, + "loss": 4.7162, + "step": 17332 + }, + { + "epoch": 0.10308426110952516, + "grad_norm": 2.070526123046875, + "learning_rate": 4.8700591140357596e-05, + "loss": 4.9765, + "step": 17333 + }, + { + "epoch": 0.10309020839280617, + "grad_norm": 1.610152244567871, + "learning_rate": 4.870044250551976e-05, + "loss": 5.9361, + "step": 17334 + }, + { + "epoch": 0.10309615567608717, + "grad_norm": 1.8921641111373901, + "learning_rate": 4.870029386240834e-05, + "loss": 4.9423, + "step": 17335 + }, + { + "epoch": 0.10310210295936816, + "grad_norm": 2.07476806640625, + "learning_rate": 4.870014521102339e-05, + "loss": 4.7742, + "step": 17336 + }, + { + "epoch": 0.10310805024264916, + "grad_norm": 2.021850824356079, + "learning_rate": 4.869999655136498e-05, + "loss": 4.8182, + "step": 17337 + }, + { + "epoch": 0.10311399752593016, + "grad_norm": 1.5896223783493042, + "learning_rate": 4.869984788343314e-05, + "loss": 5.5694, + "step": 17338 + }, + { + "epoch": 0.10311994480921115, + "grad_norm": 1.1907202005386353, + "learning_rate": 4.869969920722792e-05, + "loss": 5.4427, + "step": 17339 + }, + { + "epoch": 0.10312589209249215, + "grad_norm": 1.56050443649292, + "learning_rate": 4.869955052274938e-05, + "loss": 5.2405, + "step": 17340 + }, + { + "epoch": 0.10313183937577315, + "grad_norm": 1.6611580848693848, + "learning_rate": 4.869940182999757e-05, + "loss": 5.1457, + "step": 17341 + }, + { + "epoch": 0.10313778665905414, + "grad_norm": 1.4664785861968994, + "learning_rate": 4.869925312897256e-05, + "loss": 5.2846, + "step": 17342 + }, + { + "epoch": 0.10314373394233514, + "grad_norm": 1.9751476049423218, + "learning_rate": 4.8699104419674366e-05, + "loss": 5.0283, + "step": 17343 + }, + { + "epoch": 0.10314968122561614, + "grad_norm": 1.715144157409668, + "learning_rate": 4.869895570210307e-05, + "loss": 4.8856, + "step": 17344 + }, + { + "epoch": 0.10315562850889713, + "grad_norm": 1.7803713083267212, + "learning_rate": 4.8698806976258704e-05, + "loss": 5.5573, + "step": 17345 + }, + { + "epoch": 0.10316157579217813, + "grad_norm": 1.4687060117721558, + "learning_rate": 4.8698658242141336e-05, + "loss": 5.2287, + "step": 17346 + }, + { + "epoch": 0.10316752307545914, + "grad_norm": 1.6236404180526733, + "learning_rate": 4.869850949975101e-05, + "loss": 5.1, + "step": 17347 + }, + { + "epoch": 0.10317347035874012, + "grad_norm": 1.6414464712142944, + "learning_rate": 4.869836074908778e-05, + "loss": 5.0884, + "step": 17348 + }, + { + "epoch": 0.10317941764202113, + "grad_norm": 1.5938411951065063, + "learning_rate": 4.86982119901517e-05, + "loss": 5.9405, + "step": 17349 + }, + { + "epoch": 0.10318536492530213, + "grad_norm": 1.7434169054031372, + "learning_rate": 4.869806322294282e-05, + "loss": 6.3698, + "step": 17350 + }, + { + "epoch": 0.10319131220858312, + "grad_norm": 1.4999836683273315, + "learning_rate": 4.8697914447461185e-05, + "loss": 5.4169, + "step": 17351 + }, + { + "epoch": 0.10319725949186412, + "grad_norm": 1.768048644065857, + "learning_rate": 4.869776566370686e-05, + "loss": 5.6703, + "step": 17352 + }, + { + "epoch": 0.10320320677514512, + "grad_norm": 1.734729528427124, + "learning_rate": 4.869761687167988e-05, + "loss": 5.6454, + "step": 17353 + }, + { + "epoch": 0.10320915405842611, + "grad_norm": 1.848308801651001, + "learning_rate": 4.869746807138031e-05, + "loss": 5.742, + "step": 17354 + }, + { + "epoch": 0.10321510134170711, + "grad_norm": 1.628144383430481, + "learning_rate": 4.8697319262808205e-05, + "loss": 5.6099, + "step": 17355 + }, + { + "epoch": 0.1032210486249881, + "grad_norm": 1.5005884170532227, + "learning_rate": 4.86971704459636e-05, + "loss": 5.5419, + "step": 17356 + }, + { + "epoch": 0.1032269959082691, + "grad_norm": 1.5255531072616577, + "learning_rate": 4.869702162084657e-05, + "loss": 5.4757, + "step": 17357 + }, + { + "epoch": 0.1032329431915501, + "grad_norm": 1.549132227897644, + "learning_rate": 4.869687278745715e-05, + "loss": 5.4757, + "step": 17358 + }, + { + "epoch": 0.10323889047483109, + "grad_norm": 1.6518296003341675, + "learning_rate": 4.869672394579539e-05, + "loss": 5.5803, + "step": 17359 + }, + { + "epoch": 0.10324483775811209, + "grad_norm": 2.3987839221954346, + "learning_rate": 4.869657509586136e-05, + "loss": 5.0978, + "step": 17360 + }, + { + "epoch": 0.1032507850413931, + "grad_norm": 1.7290594577789307, + "learning_rate": 4.869642623765509e-05, + "loss": 5.4998, + "step": 17361 + }, + { + "epoch": 0.10325673232467408, + "grad_norm": 1.6334084272384644, + "learning_rate": 4.869627737117665e-05, + "loss": 5.4695, + "step": 17362 + }, + { + "epoch": 0.10326267960795509, + "grad_norm": 1.609734296798706, + "learning_rate": 4.8696128496426074e-05, + "loss": 5.4406, + "step": 17363 + }, + { + "epoch": 0.10326862689123609, + "grad_norm": 1.7579066753387451, + "learning_rate": 4.869597961340343e-05, + "loss": 5.6412, + "step": 17364 + }, + { + "epoch": 0.10327457417451708, + "grad_norm": 1.8831701278686523, + "learning_rate": 4.869583072210877e-05, + "loss": 5.444, + "step": 17365 + }, + { + "epoch": 0.10328052145779808, + "grad_norm": 1.9597128629684448, + "learning_rate": 4.869568182254214e-05, + "loss": 5.2228, + "step": 17366 + }, + { + "epoch": 0.10328646874107908, + "grad_norm": 1.8867931365966797, + "learning_rate": 4.8695532914703584e-05, + "loss": 4.9979, + "step": 17367 + }, + { + "epoch": 0.10329241602436007, + "grad_norm": 1.5480263233184814, + "learning_rate": 4.869538399859317e-05, + "loss": 5.6457, + "step": 17368 + }, + { + "epoch": 0.10329836330764107, + "grad_norm": 1.6710255146026611, + "learning_rate": 4.869523507421093e-05, + "loss": 5.774, + "step": 17369 + }, + { + "epoch": 0.10330431059092207, + "grad_norm": 1.6559721231460571, + "learning_rate": 4.869508614155695e-05, + "loss": 5.5643, + "step": 17370 + }, + { + "epoch": 0.10331025787420306, + "grad_norm": 1.4451355934143066, + "learning_rate": 4.869493720063124e-05, + "loss": 5.4598, + "step": 17371 + }, + { + "epoch": 0.10331620515748406, + "grad_norm": 1.8376599550247192, + "learning_rate": 4.869478825143388e-05, + "loss": 4.7552, + "step": 17372 + }, + { + "epoch": 0.10332215244076506, + "grad_norm": 2.0193891525268555, + "learning_rate": 4.869463929396491e-05, + "loss": 4.5671, + "step": 17373 + }, + { + "epoch": 0.10332809972404605, + "grad_norm": 2.07692551612854, + "learning_rate": 4.869449032822439e-05, + "loss": 4.4776, + "step": 17374 + }, + { + "epoch": 0.10333404700732705, + "grad_norm": 1.820893406867981, + "learning_rate": 4.869434135421237e-05, + "loss": 5.4705, + "step": 17375 + }, + { + "epoch": 0.10333999429060806, + "grad_norm": 1.7207856178283691, + "learning_rate": 4.86941923719289e-05, + "loss": 4.8619, + "step": 17376 + }, + { + "epoch": 0.10334594157388904, + "grad_norm": 1.9348174333572388, + "learning_rate": 4.8694043381374026e-05, + "loss": 4.3723, + "step": 17377 + }, + { + "epoch": 0.10335188885717005, + "grad_norm": 1.8993666172027588, + "learning_rate": 4.869389438254781e-05, + "loss": 4.5442, + "step": 17378 + }, + { + "epoch": 0.10335783614045105, + "grad_norm": 1.9089124202728271, + "learning_rate": 4.869374537545031e-05, + "loss": 4.3347, + "step": 17379 + }, + { + "epoch": 0.10336378342373204, + "grad_norm": 1.8560502529144287, + "learning_rate": 4.869359636008155e-05, + "loss": 4.312, + "step": 17380 + }, + { + "epoch": 0.10336973070701304, + "grad_norm": 1.909680962562561, + "learning_rate": 4.8693447336441614e-05, + "loss": 4.3109, + "step": 17381 + }, + { + "epoch": 0.10337567799029404, + "grad_norm": 1.7769371271133423, + "learning_rate": 4.8693298304530535e-05, + "loss": 4.4442, + "step": 17382 + }, + { + "epoch": 0.10338162527357503, + "grad_norm": 2.080097198486328, + "learning_rate": 4.869314926434837e-05, + "loss": 4.339, + "step": 17383 + }, + { + "epoch": 0.10338757255685603, + "grad_norm": 1.8703278303146362, + "learning_rate": 4.8693000215895176e-05, + "loss": 4.4124, + "step": 17384 + }, + { + "epoch": 0.10339351984013702, + "grad_norm": 1.9553934335708618, + "learning_rate": 4.869285115917099e-05, + "loss": 4.3571, + "step": 17385 + }, + { + "epoch": 0.10339946712341802, + "grad_norm": 1.8989006280899048, + "learning_rate": 4.869270209417588e-05, + "loss": 4.4108, + "step": 17386 + }, + { + "epoch": 0.10340541440669902, + "grad_norm": 1.8347021341323853, + "learning_rate": 4.8692553020909896e-05, + "loss": 4.1529, + "step": 17387 + }, + { + "epoch": 0.10341136168998001, + "grad_norm": 1.9458621740341187, + "learning_rate": 4.869240393937309e-05, + "loss": 4.2392, + "step": 17388 + }, + { + "epoch": 0.10341730897326101, + "grad_norm": 1.8578664064407349, + "learning_rate": 4.86922548495655e-05, + "loss": 4.3238, + "step": 17389 + }, + { + "epoch": 0.10342325625654201, + "grad_norm": 1.9359874725341797, + "learning_rate": 4.869210575148719e-05, + "loss": 4.56, + "step": 17390 + }, + { + "epoch": 0.103429203539823, + "grad_norm": 2.0030486583709717, + "learning_rate": 4.869195664513822e-05, + "loss": 4.1571, + "step": 17391 + }, + { + "epoch": 0.103435150823104, + "grad_norm": 1.9431639909744263, + "learning_rate": 4.869180753051863e-05, + "loss": 4.2181, + "step": 17392 + }, + { + "epoch": 0.10344109810638501, + "grad_norm": 1.9171335697174072, + "learning_rate": 4.869165840762847e-05, + "loss": 4.3139, + "step": 17393 + }, + { + "epoch": 0.103447045389666, + "grad_norm": 1.9467666149139404, + "learning_rate": 4.86915092764678e-05, + "loss": 4.3906, + "step": 17394 + }, + { + "epoch": 0.103452992672947, + "grad_norm": 2.1354262828826904, + "learning_rate": 4.8691360137036666e-05, + "loss": 4.3407, + "step": 17395 + }, + { + "epoch": 0.103458939956228, + "grad_norm": 1.7994540929794312, + "learning_rate": 4.8691210989335126e-05, + "loss": 4.5767, + "step": 17396 + }, + { + "epoch": 0.10346488723950899, + "grad_norm": 1.8322330713272095, + "learning_rate": 4.869106183336323e-05, + "loss": 4.62, + "step": 17397 + }, + { + "epoch": 0.10347083452278999, + "grad_norm": 1.9874459505081177, + "learning_rate": 4.869091266912102e-05, + "loss": 4.2579, + "step": 17398 + }, + { + "epoch": 0.10347678180607099, + "grad_norm": 1.8300455808639526, + "learning_rate": 4.869076349660856e-05, + "loss": 4.3049, + "step": 17399 + }, + { + "epoch": 0.10348272908935198, + "grad_norm": 1.8731672763824463, + "learning_rate": 4.8690614315825914e-05, + "loss": 4.3241, + "step": 17400 + }, + { + "epoch": 0.10348867637263298, + "grad_norm": 1.8587061166763306, + "learning_rate": 4.86904651267731e-05, + "loss": 4.2513, + "step": 17401 + }, + { + "epoch": 0.10349462365591398, + "grad_norm": 1.8614505529403687, + "learning_rate": 4.86903159294502e-05, + "loss": 4.2877, + "step": 17402 + }, + { + "epoch": 0.10350057093919497, + "grad_norm": 1.7118782997131348, + "learning_rate": 4.869016672385725e-05, + "loss": 5.951, + "step": 17403 + }, + { + "epoch": 0.10350651822247597, + "grad_norm": 1.6701730489730835, + "learning_rate": 4.869001750999431e-05, + "loss": 5.8099, + "step": 17404 + }, + { + "epoch": 0.10351246550575698, + "grad_norm": 1.4960297346115112, + "learning_rate": 4.868986828786143e-05, + "loss": 5.7589, + "step": 17405 + }, + { + "epoch": 0.10351841278903796, + "grad_norm": 1.3732372522354126, + "learning_rate": 4.868971905745866e-05, + "loss": 5.8552, + "step": 17406 + }, + { + "epoch": 0.10352436007231897, + "grad_norm": 1.5108624696731567, + "learning_rate": 4.868956981878606e-05, + "loss": 5.82, + "step": 17407 + }, + { + "epoch": 0.10353030735559997, + "grad_norm": 1.8640809059143066, + "learning_rate": 4.868942057184367e-05, + "loss": 5.4388, + "step": 17408 + }, + { + "epoch": 0.10353625463888096, + "grad_norm": 2.082534074783325, + "learning_rate": 4.868927131663154e-05, + "loss": 4.3796, + "step": 17409 + }, + { + "epoch": 0.10354220192216196, + "grad_norm": 1.8963665962219238, + "learning_rate": 4.868912205314975e-05, + "loss": 5.6469, + "step": 17410 + }, + { + "epoch": 0.10354814920544296, + "grad_norm": 1.7797149419784546, + "learning_rate": 4.868897278139832e-05, + "loss": 5.6187, + "step": 17411 + }, + { + "epoch": 0.10355409648872395, + "grad_norm": 1.8464981317520142, + "learning_rate": 4.868882350137732e-05, + "loss": 4.8464, + "step": 17412 + }, + { + "epoch": 0.10356004377200495, + "grad_norm": 1.5401747226715088, + "learning_rate": 4.8688674213086794e-05, + "loss": 5.3547, + "step": 17413 + }, + { + "epoch": 0.10356599105528594, + "grad_norm": 1.4159618616104126, + "learning_rate": 4.868852491652679e-05, + "loss": 5.4428, + "step": 17414 + }, + { + "epoch": 0.10357193833856694, + "grad_norm": 1.6561527252197266, + "learning_rate": 4.868837561169738e-05, + "loss": 5.6467, + "step": 17415 + }, + { + "epoch": 0.10357788562184794, + "grad_norm": 1.659527063369751, + "learning_rate": 4.8688226298598586e-05, + "loss": 5.8631, + "step": 17416 + }, + { + "epoch": 0.10358383290512893, + "grad_norm": 1.8206923007965088, + "learning_rate": 4.868807697723049e-05, + "loss": 5.6475, + "step": 17417 + }, + { + "epoch": 0.10358978018840993, + "grad_norm": 1.9741102457046509, + "learning_rate": 4.868792764759312e-05, + "loss": 4.633, + "step": 17418 + }, + { + "epoch": 0.10359572747169093, + "grad_norm": 1.9505152702331543, + "learning_rate": 4.8687778309686546e-05, + "loss": 4.4024, + "step": 17419 + }, + { + "epoch": 0.10360167475497192, + "grad_norm": 1.7461168766021729, + "learning_rate": 4.868762896351082e-05, + "loss": 5.6505, + "step": 17420 + }, + { + "epoch": 0.10360762203825293, + "grad_norm": 1.6750074625015259, + "learning_rate": 4.868747960906598e-05, + "loss": 5.7747, + "step": 17421 + }, + { + "epoch": 0.10361356932153393, + "grad_norm": 1.5986868143081665, + "learning_rate": 4.8687330246352085e-05, + "loss": 5.2086, + "step": 17422 + }, + { + "epoch": 0.10361951660481492, + "grad_norm": 1.5743950605392456, + "learning_rate": 4.868718087536919e-05, + "loss": 5.6462, + "step": 17423 + }, + { + "epoch": 0.10362546388809592, + "grad_norm": 1.5192588567733765, + "learning_rate": 4.868703149611734e-05, + "loss": 5.5579, + "step": 17424 + }, + { + "epoch": 0.10363141117137692, + "grad_norm": 1.7356244325637817, + "learning_rate": 4.86868821085966e-05, + "loss": 5.5978, + "step": 17425 + }, + { + "epoch": 0.10363735845465791, + "grad_norm": 1.7366925477981567, + "learning_rate": 4.868673271280701e-05, + "loss": 5.3812, + "step": 17426 + }, + { + "epoch": 0.10364330573793891, + "grad_norm": 2.016662836074829, + "learning_rate": 4.868658330874862e-05, + "loss": 5.4003, + "step": 17427 + }, + { + "epoch": 0.10364925302121991, + "grad_norm": 2.022550582885742, + "learning_rate": 4.86864338964215e-05, + "loss": 5.191, + "step": 17428 + }, + { + "epoch": 0.1036552003045009, + "grad_norm": 1.8406000137329102, + "learning_rate": 4.868628447582568e-05, + "loss": 5.9494, + "step": 17429 + }, + { + "epoch": 0.1036611475877819, + "grad_norm": 1.7836806774139404, + "learning_rate": 4.868613504696123e-05, + "loss": 5.4606, + "step": 17430 + }, + { + "epoch": 0.1036670948710629, + "grad_norm": 1.6688835620880127, + "learning_rate": 4.86859856098282e-05, + "loss": 5.2287, + "step": 17431 + }, + { + "epoch": 0.10367304215434389, + "grad_norm": 1.7083512544631958, + "learning_rate": 4.868583616442663e-05, + "loss": 4.7133, + "step": 17432 + }, + { + "epoch": 0.1036789894376249, + "grad_norm": 1.8784829378128052, + "learning_rate": 4.8685686710756576e-05, + "loss": 4.8341, + "step": 17433 + }, + { + "epoch": 0.1036849367209059, + "grad_norm": 2.380962610244751, + "learning_rate": 4.8685537248818105e-05, + "loss": 4.6553, + "step": 17434 + }, + { + "epoch": 0.10369088400418688, + "grad_norm": 1.936126470565796, + "learning_rate": 4.868538777861125e-05, + "loss": 5.0645, + "step": 17435 + }, + { + "epoch": 0.10369683128746789, + "grad_norm": 1.9400380849838257, + "learning_rate": 4.8685238300136065e-05, + "loss": 4.9022, + "step": 17436 + }, + { + "epoch": 0.10370277857074889, + "grad_norm": 2.0275371074676514, + "learning_rate": 4.868508881339261e-05, + "loss": 4.8918, + "step": 17437 + }, + { + "epoch": 0.10370872585402988, + "grad_norm": 1.8734835386276245, + "learning_rate": 4.868493931838094e-05, + "loss": 4.9889, + "step": 17438 + }, + { + "epoch": 0.10371467313731088, + "grad_norm": 2.346519947052002, + "learning_rate": 4.868478981510111e-05, + "loss": 4.4857, + "step": 17439 + }, + { + "epoch": 0.10372062042059188, + "grad_norm": 2.4242961406707764, + "learning_rate": 4.868464030355315e-05, + "loss": 4.034, + "step": 17440 + }, + { + "epoch": 0.10372656770387287, + "grad_norm": 2.3877294063568115, + "learning_rate": 4.8684490783737133e-05, + "loss": 4.2761, + "step": 17441 + }, + { + "epoch": 0.10373251498715387, + "grad_norm": 1.832585096359253, + "learning_rate": 4.8684341255653107e-05, + "loss": 5.1485, + "step": 17442 + }, + { + "epoch": 0.10373846227043486, + "grad_norm": 2.0385608673095703, + "learning_rate": 4.868419171930112e-05, + "loss": 5.7793, + "step": 17443 + }, + { + "epoch": 0.10374440955371586, + "grad_norm": 1.8885849714279175, + "learning_rate": 4.8684042174681225e-05, + "loss": 5.9304, + "step": 17444 + }, + { + "epoch": 0.10375035683699686, + "grad_norm": 1.8748784065246582, + "learning_rate": 4.868389262179348e-05, + "loss": 5.3722, + "step": 17445 + }, + { + "epoch": 0.10375630412027785, + "grad_norm": 1.9851447343826294, + "learning_rate": 4.8683743060637924e-05, + "loss": 5.4734, + "step": 17446 + }, + { + "epoch": 0.10376225140355885, + "grad_norm": 2.387681245803833, + "learning_rate": 4.868359349121463e-05, + "loss": 4.7244, + "step": 17447 + }, + { + "epoch": 0.10376819868683985, + "grad_norm": 1.8236793279647827, + "learning_rate": 4.868344391352363e-05, + "loss": 5.0094, + "step": 17448 + }, + { + "epoch": 0.10377414597012084, + "grad_norm": 1.3649673461914062, + "learning_rate": 4.868329432756498e-05, + "loss": 5.3295, + "step": 17449 + }, + { + "epoch": 0.10378009325340184, + "grad_norm": 1.8916471004486084, + "learning_rate": 4.8683144733338746e-05, + "loss": 5.9443, + "step": 17450 + }, + { + "epoch": 0.10378604053668285, + "grad_norm": 1.8541333675384521, + "learning_rate": 4.868299513084497e-05, + "loss": 5.425, + "step": 17451 + }, + { + "epoch": 0.10379198781996384, + "grad_norm": 1.9708364009857178, + "learning_rate": 4.8682845520083695e-05, + "loss": 5.3254, + "step": 17452 + }, + { + "epoch": 0.10379793510324484, + "grad_norm": 1.7171103954315186, + "learning_rate": 4.8682695901054995e-05, + "loss": 5.3498, + "step": 17453 + }, + { + "epoch": 0.10380388238652584, + "grad_norm": 1.6002514362335205, + "learning_rate": 4.868254627375891e-05, + "loss": 5.1611, + "step": 17454 + }, + { + "epoch": 0.10380982966980683, + "grad_norm": 1.9245331287384033, + "learning_rate": 4.8682396638195486e-05, + "loss": 5.3348, + "step": 17455 + }, + { + "epoch": 0.10381577695308783, + "grad_norm": 1.4742863178253174, + "learning_rate": 4.8682246994364786e-05, + "loss": 5.7573, + "step": 17456 + }, + { + "epoch": 0.10382172423636883, + "grad_norm": 1.929343581199646, + "learning_rate": 4.8682097342266855e-05, + "loss": 5.8469, + "step": 17457 + }, + { + "epoch": 0.10382767151964982, + "grad_norm": 1.6212769746780396, + "learning_rate": 4.8681947681901754e-05, + "loss": 5.9121, + "step": 17458 + }, + { + "epoch": 0.10383361880293082, + "grad_norm": 1.6550590991973877, + "learning_rate": 4.868179801326952e-05, + "loss": 5.7114, + "step": 17459 + }, + { + "epoch": 0.10383956608621182, + "grad_norm": 1.671628475189209, + "learning_rate": 4.868164833637023e-05, + "loss": 5.3988, + "step": 17460 + }, + { + "epoch": 0.10384551336949281, + "grad_norm": 1.5833921432495117, + "learning_rate": 4.868149865120391e-05, + "loss": 5.1952, + "step": 17461 + }, + { + "epoch": 0.10385146065277381, + "grad_norm": 1.8280199766159058, + "learning_rate": 4.868134895777063e-05, + "loss": 5.4812, + "step": 17462 + }, + { + "epoch": 0.10385740793605482, + "grad_norm": 1.7413616180419922, + "learning_rate": 4.868119925607043e-05, + "loss": 5.4119, + "step": 17463 + }, + { + "epoch": 0.1038633552193358, + "grad_norm": 1.6645252704620361, + "learning_rate": 4.868104954610337e-05, + "loss": 5.3546, + "step": 17464 + }, + { + "epoch": 0.1038693025026168, + "grad_norm": 1.634175181388855, + "learning_rate": 4.86808998278695e-05, + "loss": 5.3119, + "step": 17465 + }, + { + "epoch": 0.10387524978589781, + "grad_norm": 1.5220096111297607, + "learning_rate": 4.868075010136887e-05, + "loss": 5.1345, + "step": 17466 + }, + { + "epoch": 0.1038811970691788, + "grad_norm": 1.3279895782470703, + "learning_rate": 4.8680600366601534e-05, + "loss": 5.0071, + "step": 17467 + }, + { + "epoch": 0.1038871443524598, + "grad_norm": 1.4460431337356567, + "learning_rate": 4.8680450623567555e-05, + "loss": 4.8219, + "step": 17468 + }, + { + "epoch": 0.1038930916357408, + "grad_norm": 1.7028027772903442, + "learning_rate": 4.868030087226697e-05, + "loss": 5.2679, + "step": 17469 + }, + { + "epoch": 0.10389903891902179, + "grad_norm": 1.7697324752807617, + "learning_rate": 4.8680151112699835e-05, + "loss": 5.504, + "step": 17470 + }, + { + "epoch": 0.10390498620230279, + "grad_norm": 1.4549357891082764, + "learning_rate": 4.86800013448662e-05, + "loss": 5.4475, + "step": 17471 + }, + { + "epoch": 0.10391093348558378, + "grad_norm": 1.7069107294082642, + "learning_rate": 4.867985156876613e-05, + "loss": 5.5878, + "step": 17472 + }, + { + "epoch": 0.10391688076886478, + "grad_norm": 1.8917819261550903, + "learning_rate": 4.867970178439967e-05, + "loss": 5.4449, + "step": 17473 + }, + { + "epoch": 0.10392282805214578, + "grad_norm": 1.7132060527801514, + "learning_rate": 4.8679551991766856e-05, + "loss": 5.7547, + "step": 17474 + }, + { + "epoch": 0.10392877533542677, + "grad_norm": 1.6535362005233765, + "learning_rate": 4.867940219086777e-05, + "loss": 5.9603, + "step": 17475 + }, + { + "epoch": 0.10393472261870777, + "grad_norm": 1.6559079885482788, + "learning_rate": 4.8679252381702443e-05, + "loss": 5.9673, + "step": 17476 + }, + { + "epoch": 0.10394066990198877, + "grad_norm": 1.5295041799545288, + "learning_rate": 4.867910256427093e-05, + "loss": 5.4502, + "step": 17477 + }, + { + "epoch": 0.10394661718526976, + "grad_norm": 1.8571394681930542, + "learning_rate": 4.8678952738573294e-05, + "loss": 6.1838, + "step": 17478 + }, + { + "epoch": 0.10395256446855076, + "grad_norm": 1.7148513793945312, + "learning_rate": 4.8678802904609576e-05, + "loss": 5.9624, + "step": 17479 + }, + { + "epoch": 0.10395851175183177, + "grad_norm": 1.7191139459609985, + "learning_rate": 4.867865306237983e-05, + "loss": 5.8591, + "step": 17480 + }, + { + "epoch": 0.10396445903511276, + "grad_norm": 1.526285171508789, + "learning_rate": 4.867850321188412e-05, + "loss": 5.988, + "step": 17481 + }, + { + "epoch": 0.10397040631839376, + "grad_norm": 1.5284392833709717, + "learning_rate": 4.867835335312249e-05, + "loss": 5.7212, + "step": 17482 + }, + { + "epoch": 0.10397635360167476, + "grad_norm": 1.5675333738327026, + "learning_rate": 4.8678203486094975e-05, + "loss": 5.5921, + "step": 17483 + }, + { + "epoch": 0.10398230088495575, + "grad_norm": 1.7697393894195557, + "learning_rate": 4.8678053610801654e-05, + "loss": 5.1748, + "step": 17484 + }, + { + "epoch": 0.10398824816823675, + "grad_norm": 1.5940029621124268, + "learning_rate": 4.867790372724257e-05, + "loss": 5.7108, + "step": 17485 + }, + { + "epoch": 0.10399419545151775, + "grad_norm": 2.0347743034362793, + "learning_rate": 4.867775383541777e-05, + "loss": 5.4253, + "step": 17486 + }, + { + "epoch": 0.10400014273479874, + "grad_norm": 2.1038641929626465, + "learning_rate": 4.867760393532732e-05, + "loss": 5.2362, + "step": 17487 + }, + { + "epoch": 0.10400609001807974, + "grad_norm": 2.2253377437591553, + "learning_rate": 4.867745402697126e-05, + "loss": 5.0801, + "step": 17488 + }, + { + "epoch": 0.10401203730136074, + "grad_norm": 1.8215906620025635, + "learning_rate": 4.867730411034964e-05, + "loss": 5.1438, + "step": 17489 + }, + { + "epoch": 0.10401798458464173, + "grad_norm": 1.5428386926651, + "learning_rate": 4.867715418546252e-05, + "loss": 5.0664, + "step": 17490 + }, + { + "epoch": 0.10402393186792273, + "grad_norm": 1.3886137008666992, + "learning_rate": 4.867700425230995e-05, + "loss": 4.992, + "step": 17491 + }, + { + "epoch": 0.10402987915120374, + "grad_norm": 1.4177032709121704, + "learning_rate": 4.867685431089199e-05, + "loss": 4.9245, + "step": 17492 + }, + { + "epoch": 0.10403582643448472, + "grad_norm": 1.2621585130691528, + "learning_rate": 4.867670436120867e-05, + "loss": 4.8902, + "step": 17493 + }, + { + "epoch": 0.10404177371776573, + "grad_norm": 1.4095661640167236, + "learning_rate": 4.867655440326007e-05, + "loss": 4.871, + "step": 17494 + }, + { + "epoch": 0.10404772100104673, + "grad_norm": 1.3117374181747437, + "learning_rate": 4.867640443704622e-05, + "loss": 4.9351, + "step": 17495 + }, + { + "epoch": 0.10405366828432772, + "grad_norm": 1.6237322092056274, + "learning_rate": 4.867625446256719e-05, + "loss": 5.4253, + "step": 17496 + }, + { + "epoch": 0.10405961556760872, + "grad_norm": 2.095696210861206, + "learning_rate": 4.867610447982302e-05, + "loss": 5.1793, + "step": 17497 + }, + { + "epoch": 0.10406556285088972, + "grad_norm": 3.627516508102417, + "learning_rate": 4.867595448881377e-05, + "loss": 5.1206, + "step": 17498 + }, + { + "epoch": 0.10407151013417071, + "grad_norm": 2.0525522232055664, + "learning_rate": 4.8675804489539477e-05, + "loss": 5.5922, + "step": 17499 + }, + { + "epoch": 0.10407745741745171, + "grad_norm": 1.6003656387329102, + "learning_rate": 4.867565448200022e-05, + "loss": 6.0267, + "step": 17500 + }, + { + "epoch": 0.1040834047007327, + "grad_norm": 1.4709582328796387, + "learning_rate": 4.8675504466196034e-05, + "loss": 5.55, + "step": 17501 + }, + { + "epoch": 0.1040893519840137, + "grad_norm": 1.5550457239151, + "learning_rate": 4.8675354442126966e-05, + "loss": 5.6857, + "step": 17502 + }, + { + "epoch": 0.1040952992672947, + "grad_norm": 1.6180169582366943, + "learning_rate": 4.8675204409793085e-05, + "loss": 5.3079, + "step": 17503 + }, + { + "epoch": 0.10410124655057569, + "grad_norm": 1.5625691413879395, + "learning_rate": 4.8675054369194426e-05, + "loss": 5.5965, + "step": 17504 + }, + { + "epoch": 0.10410719383385669, + "grad_norm": 1.4117538928985596, + "learning_rate": 4.8674904320331064e-05, + "loss": 5.7337, + "step": 17505 + }, + { + "epoch": 0.1041131411171377, + "grad_norm": 1.5518572330474854, + "learning_rate": 4.867475426320302e-05, + "loss": 5.5802, + "step": 17506 + }, + { + "epoch": 0.10411908840041868, + "grad_norm": 1.3276773691177368, + "learning_rate": 4.867460419781037e-05, + "loss": 6.0462, + "step": 17507 + }, + { + "epoch": 0.10412503568369968, + "grad_norm": 1.3660519123077393, + "learning_rate": 4.867445412415317e-05, + "loss": 6.0382, + "step": 17508 + }, + { + "epoch": 0.10413098296698069, + "grad_norm": 1.2959636449813843, + "learning_rate": 4.867430404223146e-05, + "loss": 5.8823, + "step": 17509 + }, + { + "epoch": 0.10413693025026168, + "grad_norm": 2.009265899658203, + "learning_rate": 4.867415395204528e-05, + "loss": 4.9889, + "step": 17510 + }, + { + "epoch": 0.10414287753354268, + "grad_norm": 1.3692728281021118, + "learning_rate": 4.8674003853594705e-05, + "loss": 5.2382, + "step": 17511 + }, + { + "epoch": 0.10414882481682368, + "grad_norm": 1.4074095487594604, + "learning_rate": 4.8673853746879785e-05, + "loss": 5.8241, + "step": 17512 + }, + { + "epoch": 0.10415477210010467, + "grad_norm": 1.2155077457427979, + "learning_rate": 4.867370363190057e-05, + "loss": 5.762, + "step": 17513 + }, + { + "epoch": 0.10416071938338567, + "grad_norm": 1.1142069101333618, + "learning_rate": 4.86735535086571e-05, + "loss": 5.7591, + "step": 17514 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 1.1758382320404053, + "learning_rate": 4.867340337714944e-05, + "loss": 5.6534, + "step": 17515 + }, + { + "epoch": 0.10417261394994766, + "grad_norm": 1.2154567241668701, + "learning_rate": 4.867325323737765e-05, + "loss": 5.7465, + "step": 17516 + }, + { + "epoch": 0.10417856123322866, + "grad_norm": 1.3033171892166138, + "learning_rate": 4.867310308934176e-05, + "loss": 5.7701, + "step": 17517 + }, + { + "epoch": 0.10418450851650966, + "grad_norm": 1.3491926193237305, + "learning_rate": 4.867295293304184e-05, + "loss": 5.7883, + "step": 17518 + }, + { + "epoch": 0.10419045579979065, + "grad_norm": 1.223988652229309, + "learning_rate": 4.867280276847793e-05, + "loss": 5.7065, + "step": 17519 + }, + { + "epoch": 0.10419640308307165, + "grad_norm": 1.3885700702667236, + "learning_rate": 4.867265259565009e-05, + "loss": 5.6934, + "step": 17520 + }, + { + "epoch": 0.10420235036635266, + "grad_norm": 1.1616452932357788, + "learning_rate": 4.867250241455837e-05, + "loss": 5.6958, + "step": 17521 + }, + { + "epoch": 0.10420829764963364, + "grad_norm": 1.2696330547332764, + "learning_rate": 4.867235222520283e-05, + "loss": 5.5534, + "step": 17522 + }, + { + "epoch": 0.10421424493291465, + "grad_norm": 1.3539372682571411, + "learning_rate": 4.8672202027583516e-05, + "loss": 5.8028, + "step": 17523 + }, + { + "epoch": 0.10422019221619565, + "grad_norm": 2.547095775604248, + "learning_rate": 4.867205182170048e-05, + "loss": 5.0223, + "step": 17524 + }, + { + "epoch": 0.10422613949947664, + "grad_norm": 1.7378231287002563, + "learning_rate": 4.8671901607553775e-05, + "loss": 5.8356, + "step": 17525 + }, + { + "epoch": 0.10423208678275764, + "grad_norm": 1.9287587404251099, + "learning_rate": 4.867175138514346e-05, + "loss": 5.9694, + "step": 17526 + }, + { + "epoch": 0.10423803406603864, + "grad_norm": 1.685260534286499, + "learning_rate": 4.867160115446957e-05, + "loss": 5.6962, + "step": 17527 + }, + { + "epoch": 0.10424398134931963, + "grad_norm": 1.594699501991272, + "learning_rate": 4.8671450915532176e-05, + "loss": 5.6139, + "step": 17528 + }, + { + "epoch": 0.10424992863260063, + "grad_norm": 1.5966441631317139, + "learning_rate": 4.867130066833132e-05, + "loss": 5.6369, + "step": 17529 + }, + { + "epoch": 0.10425587591588162, + "grad_norm": 1.701524019241333, + "learning_rate": 4.867115041286706e-05, + "loss": 5.6487, + "step": 17530 + }, + { + "epoch": 0.10426182319916262, + "grad_norm": 1.575536847114563, + "learning_rate": 4.8671000149139444e-05, + "loss": 5.5935, + "step": 17531 + }, + { + "epoch": 0.10426777048244362, + "grad_norm": 1.6812626123428345, + "learning_rate": 4.867084987714853e-05, + "loss": 5.4343, + "step": 17532 + }, + { + "epoch": 0.10427371776572461, + "grad_norm": 1.6122568845748901, + "learning_rate": 4.867069959689435e-05, + "loss": 5.5194, + "step": 17533 + }, + { + "epoch": 0.10427966504900561, + "grad_norm": 1.5337659120559692, + "learning_rate": 4.8670549308376996e-05, + "loss": 5.5248, + "step": 17534 + }, + { + "epoch": 0.10428561233228661, + "grad_norm": 1.45541250705719, + "learning_rate": 4.867039901159649e-05, + "loss": 5.6301, + "step": 17535 + }, + { + "epoch": 0.1042915596155676, + "grad_norm": 1.6674455404281616, + "learning_rate": 4.867024870655289e-05, + "loss": 6.1182, + "step": 17536 + }, + { + "epoch": 0.1042975068988486, + "grad_norm": 1.4686870574951172, + "learning_rate": 4.867009839324624e-05, + "loss": 5.9761, + "step": 17537 + }, + { + "epoch": 0.1043034541821296, + "grad_norm": 1.6447898149490356, + "learning_rate": 4.866994807167662e-05, + "loss": 5.4559, + "step": 17538 + }, + { + "epoch": 0.1043094014654106, + "grad_norm": 1.4841620922088623, + "learning_rate": 4.866979774184406e-05, + "loss": 5.4441, + "step": 17539 + }, + { + "epoch": 0.1043153487486916, + "grad_norm": 1.8813121318817139, + "learning_rate": 4.8669647403748616e-05, + "loss": 5.348, + "step": 17540 + }, + { + "epoch": 0.1043212960319726, + "grad_norm": 4.018791198730469, + "learning_rate": 4.866949705739035e-05, + "loss": 5.457, + "step": 17541 + }, + { + "epoch": 0.10432724331525359, + "grad_norm": 2.9932172298431396, + "learning_rate": 4.86693467027693e-05, + "loss": 5.2345, + "step": 17542 + }, + { + "epoch": 0.10433319059853459, + "grad_norm": 1.4329689741134644, + "learning_rate": 4.866919633988553e-05, + "loss": 5.8491, + "step": 17543 + }, + { + "epoch": 0.10433913788181559, + "grad_norm": 1.7308731079101562, + "learning_rate": 4.866904596873909e-05, + "loss": 5.5858, + "step": 17544 + }, + { + "epoch": 0.10434508516509658, + "grad_norm": 2.2066311836242676, + "learning_rate": 4.866889558933002e-05, + "loss": 4.7702, + "step": 17545 + }, + { + "epoch": 0.10435103244837758, + "grad_norm": 1.528171181678772, + "learning_rate": 4.866874520165839e-05, + "loss": 5.1622, + "step": 17546 + }, + { + "epoch": 0.10435697973165858, + "grad_norm": 1.8969347476959229, + "learning_rate": 4.866859480572424e-05, + "loss": 5.0091, + "step": 17547 + }, + { + "epoch": 0.10436292701493957, + "grad_norm": 1.6737502813339233, + "learning_rate": 4.8668444401527644e-05, + "loss": 5.7552, + "step": 17548 + }, + { + "epoch": 0.10436887429822057, + "grad_norm": 1.793411374092102, + "learning_rate": 4.8668293989068626e-05, + "loss": 5.7963, + "step": 17549 + }, + { + "epoch": 0.10437482158150158, + "grad_norm": 1.8675566911697388, + "learning_rate": 4.866814356834725e-05, + "loss": 4.7389, + "step": 17550 + }, + { + "epoch": 0.10438076886478256, + "grad_norm": 1.9145622253417969, + "learning_rate": 4.8667993139363574e-05, + "loss": 5.0921, + "step": 17551 + }, + { + "epoch": 0.10438671614806357, + "grad_norm": 1.6751158237457275, + "learning_rate": 4.866784270211764e-05, + "loss": 5.5547, + "step": 17552 + }, + { + "epoch": 0.10439266343134457, + "grad_norm": 1.754550576210022, + "learning_rate": 4.866769225660951e-05, + "loss": 5.6077, + "step": 17553 + }, + { + "epoch": 0.10439861071462556, + "grad_norm": 2.0323402881622314, + "learning_rate": 4.866754180283924e-05, + "loss": 5.1191, + "step": 17554 + }, + { + "epoch": 0.10440455799790656, + "grad_norm": 1.8000339269638062, + "learning_rate": 4.866739134080687e-05, + "loss": 5.1533, + "step": 17555 + }, + { + "epoch": 0.10441050528118756, + "grad_norm": 2.053093671798706, + "learning_rate": 4.866724087051245e-05, + "loss": 4.9985, + "step": 17556 + }, + { + "epoch": 0.10441645256446855, + "grad_norm": 1.6764185428619385, + "learning_rate": 4.866709039195605e-05, + "loss": 4.9674, + "step": 17557 + }, + { + "epoch": 0.10442239984774955, + "grad_norm": 1.6942695379257202, + "learning_rate": 4.866693990513772e-05, + "loss": 4.9319, + "step": 17558 + }, + { + "epoch": 0.10442834713103054, + "grad_norm": 1.5124322175979614, + "learning_rate": 4.8666789410057496e-05, + "loss": 5.1371, + "step": 17559 + }, + { + "epoch": 0.10443429441431154, + "grad_norm": 1.925757646560669, + "learning_rate": 4.866663890671545e-05, + "loss": 4.6366, + "step": 17560 + }, + { + "epoch": 0.10444024169759254, + "grad_norm": 2.0077321529388428, + "learning_rate": 4.866648839511161e-05, + "loss": 4.9993, + "step": 17561 + }, + { + "epoch": 0.10444618898087353, + "grad_norm": 2.1986982822418213, + "learning_rate": 4.866633787524605e-05, + "loss": 4.814, + "step": 17562 + }, + { + "epoch": 0.10445213626415453, + "grad_norm": 1.9967917203903198, + "learning_rate": 4.866618734711882e-05, + "loss": 4.5182, + "step": 17563 + }, + { + "epoch": 0.10445808354743553, + "grad_norm": 1.7663863897323608, + "learning_rate": 4.8666036810729965e-05, + "loss": 4.5589, + "step": 17564 + }, + { + "epoch": 0.10446403083071652, + "grad_norm": 1.7784098386764526, + "learning_rate": 4.8665886266079537e-05, + "loss": 4.6739, + "step": 17565 + }, + { + "epoch": 0.10446997811399752, + "grad_norm": 1.7143903970718384, + "learning_rate": 4.8665735713167596e-05, + "loss": 4.8434, + "step": 17566 + }, + { + "epoch": 0.10447592539727853, + "grad_norm": 2.018825054168701, + "learning_rate": 4.866558515199419e-05, + "loss": 4.5235, + "step": 17567 + }, + { + "epoch": 0.10448187268055951, + "grad_norm": 2.1135973930358887, + "learning_rate": 4.8665434582559374e-05, + "loss": 4.5048, + "step": 17568 + }, + { + "epoch": 0.10448781996384052, + "grad_norm": 2.097177028656006, + "learning_rate": 4.86652840048632e-05, + "loss": 4.7811, + "step": 17569 + }, + { + "epoch": 0.10449376724712152, + "grad_norm": 2.054049015045166, + "learning_rate": 4.866513341890572e-05, + "loss": 4.5964, + "step": 17570 + }, + { + "epoch": 0.10449971453040251, + "grad_norm": 1.9631117582321167, + "learning_rate": 4.866498282468699e-05, + "loss": 4.4055, + "step": 17571 + }, + { + "epoch": 0.10450566181368351, + "grad_norm": 2.079071521759033, + "learning_rate": 4.8664832222207055e-05, + "loss": 4.3743, + "step": 17572 + }, + { + "epoch": 0.10451160909696451, + "grad_norm": 1.8425450325012207, + "learning_rate": 4.8664681611465966e-05, + "loss": 4.411, + "step": 17573 + }, + { + "epoch": 0.1045175563802455, + "grad_norm": 1.812538743019104, + "learning_rate": 4.866453099246379e-05, + "loss": 4.3496, + "step": 17574 + }, + { + "epoch": 0.1045235036635265, + "grad_norm": 1.8823848962783813, + "learning_rate": 4.8664380365200566e-05, + "loss": 4.3613, + "step": 17575 + }, + { + "epoch": 0.1045294509468075, + "grad_norm": 1.6085865497589111, + "learning_rate": 4.8664229729676356e-05, + "loss": 4.5187, + "step": 17576 + }, + { + "epoch": 0.10453539823008849, + "grad_norm": 1.8719606399536133, + "learning_rate": 4.8664079085891204e-05, + "loss": 4.7276, + "step": 17577 + }, + { + "epoch": 0.1045413455133695, + "grad_norm": 1.7630116939544678, + "learning_rate": 4.866392843384517e-05, + "loss": 4.3749, + "step": 17578 + }, + { + "epoch": 0.1045472927966505, + "grad_norm": 1.8641449213027954, + "learning_rate": 4.86637777735383e-05, + "loss": 4.5781, + "step": 17579 + }, + { + "epoch": 0.10455324007993148, + "grad_norm": 1.8178362846374512, + "learning_rate": 4.8663627104970645e-05, + "loss": 4.3217, + "step": 17580 + }, + { + "epoch": 0.10455918736321249, + "grad_norm": 1.7655141353607178, + "learning_rate": 4.866347642814228e-05, + "loss": 4.4972, + "step": 17581 + }, + { + "epoch": 0.10456513464649349, + "grad_norm": 1.843266248703003, + "learning_rate": 4.8663325743053216e-05, + "loss": 4.5214, + "step": 17582 + }, + { + "epoch": 0.10457108192977448, + "grad_norm": 1.8023161888122559, + "learning_rate": 4.866317504970354e-05, + "loss": 4.3205, + "step": 17583 + }, + { + "epoch": 0.10457702921305548, + "grad_norm": 1.7845708131790161, + "learning_rate": 4.8663024348093296e-05, + "loss": 4.1439, + "step": 17584 + }, + { + "epoch": 0.10458297649633648, + "grad_norm": 2.0029754638671875, + "learning_rate": 4.866287363822253e-05, + "loss": 4.4627, + "step": 17585 + }, + { + "epoch": 0.10458892377961747, + "grad_norm": 1.6008789539337158, + "learning_rate": 4.8662722920091305e-05, + "loss": 4.5539, + "step": 17586 + }, + { + "epoch": 0.10459487106289847, + "grad_norm": 1.884207844734192, + "learning_rate": 4.8662572193699664e-05, + "loss": 4.1132, + "step": 17587 + }, + { + "epoch": 0.10460081834617946, + "grad_norm": 1.7014282941818237, + "learning_rate": 4.866242145904767e-05, + "loss": 4.9612, + "step": 17588 + }, + { + "epoch": 0.10460676562946046, + "grad_norm": 1.7388410568237305, + "learning_rate": 4.8662270716135364e-05, + "loss": 5.3079, + "step": 17589 + }, + { + "epoch": 0.10461271291274146, + "grad_norm": 1.6414510011672974, + "learning_rate": 4.8662119964962805e-05, + "loss": 5.5816, + "step": 17590 + }, + { + "epoch": 0.10461866019602245, + "grad_norm": 1.4039387702941895, + "learning_rate": 4.866196920553004e-05, + "loss": 5.0036, + "step": 17591 + }, + { + "epoch": 0.10462460747930345, + "grad_norm": 1.7621723413467407, + "learning_rate": 4.866181843783712e-05, + "loss": 5.3461, + "step": 17592 + }, + { + "epoch": 0.10463055476258445, + "grad_norm": 1.4525210857391357, + "learning_rate": 4.866166766188412e-05, + "loss": 5.2897, + "step": 17593 + }, + { + "epoch": 0.10463650204586544, + "grad_norm": 1.4203788042068481, + "learning_rate": 4.866151687767107e-05, + "loss": 5.2506, + "step": 17594 + }, + { + "epoch": 0.10464244932914644, + "grad_norm": 1.419097900390625, + "learning_rate": 4.866136608519803e-05, + "loss": 5.246, + "step": 17595 + }, + { + "epoch": 0.10464839661242745, + "grad_norm": 1.8866242170333862, + "learning_rate": 4.8661215284465047e-05, + "loss": 5.5259, + "step": 17596 + }, + { + "epoch": 0.10465434389570843, + "grad_norm": 1.5161887407302856, + "learning_rate": 4.866106447547218e-05, + "loss": 5.2219, + "step": 17597 + }, + { + "epoch": 0.10466029117898944, + "grad_norm": 1.3552051782608032, + "learning_rate": 4.866091365821948e-05, + "loss": 4.9473, + "step": 17598 + }, + { + "epoch": 0.10466623846227044, + "grad_norm": 1.3443762063980103, + "learning_rate": 4.8660762832707e-05, + "loss": 5.0027, + "step": 17599 + }, + { + "epoch": 0.10467218574555143, + "grad_norm": 1.5657448768615723, + "learning_rate": 4.866061199893479e-05, + "loss": 5.3873, + "step": 17600 + }, + { + "epoch": 0.10467813302883243, + "grad_norm": 1.177984595298767, + "learning_rate": 4.866046115690291e-05, + "loss": 4.8628, + "step": 17601 + }, + { + "epoch": 0.10468408031211343, + "grad_norm": 1.1911925077438354, + "learning_rate": 4.8660310306611405e-05, + "loss": 4.7862, + "step": 17602 + }, + { + "epoch": 0.10469002759539442, + "grad_norm": 1.238619327545166, + "learning_rate": 4.866015944806033e-05, + "loss": 4.6844, + "step": 17603 + }, + { + "epoch": 0.10469597487867542, + "grad_norm": 1.4151804447174072, + "learning_rate": 4.8660008581249736e-05, + "loss": 4.7824, + "step": 17604 + }, + { + "epoch": 0.10470192216195642, + "grad_norm": 1.1852803230285645, + "learning_rate": 4.8659857706179676e-05, + "loss": 4.8358, + "step": 17605 + }, + { + "epoch": 0.10470786944523741, + "grad_norm": 1.2641617059707642, + "learning_rate": 4.865970682285022e-05, + "loss": 4.688, + "step": 17606 + }, + { + "epoch": 0.10471381672851841, + "grad_norm": 1.3711220026016235, + "learning_rate": 4.865955593126138e-05, + "loss": 4.6552, + "step": 17607 + }, + { + "epoch": 0.10471976401179942, + "grad_norm": 1.5641502141952515, + "learning_rate": 4.865940503141325e-05, + "loss": 5.0781, + "step": 17608 + }, + { + "epoch": 0.1047257112950804, + "grad_norm": 1.5290453433990479, + "learning_rate": 4.865925412330586e-05, + "loss": 5.1347, + "step": 17609 + }, + { + "epoch": 0.1047316585783614, + "grad_norm": 1.6220836639404297, + "learning_rate": 4.8659103206939275e-05, + "loss": 5.2943, + "step": 17610 + }, + { + "epoch": 0.10473760586164241, + "grad_norm": 1.4212614297866821, + "learning_rate": 4.865895228231353e-05, + "loss": 5.2939, + "step": 17611 + }, + { + "epoch": 0.1047435531449234, + "grad_norm": 1.4920703172683716, + "learning_rate": 4.8658801349428696e-05, + "loss": 5.3314, + "step": 17612 + }, + { + "epoch": 0.1047495004282044, + "grad_norm": 1.4596521854400635, + "learning_rate": 4.865865040828482e-05, + "loss": 5.3082, + "step": 17613 + }, + { + "epoch": 0.1047554477114854, + "grad_norm": 1.2887258529663086, + "learning_rate": 4.865849945888195e-05, + "loss": 5.1002, + "step": 17614 + }, + { + "epoch": 0.10476139499476639, + "grad_norm": 1.3587419986724854, + "learning_rate": 4.8658348501220145e-05, + "loss": 4.9773, + "step": 17615 + }, + { + "epoch": 0.10476734227804739, + "grad_norm": 1.5476746559143066, + "learning_rate": 4.865819753529945e-05, + "loss": 5.0726, + "step": 17616 + }, + { + "epoch": 0.10477328956132839, + "grad_norm": 1.2820343971252441, + "learning_rate": 4.865804656111993e-05, + "loss": 5.0708, + "step": 17617 + }, + { + "epoch": 0.10477923684460938, + "grad_norm": 1.5396101474761963, + "learning_rate": 4.8657895578681634e-05, + "loss": 5.087, + "step": 17618 + }, + { + "epoch": 0.10478518412789038, + "grad_norm": 1.9199161529541016, + "learning_rate": 4.86577445879846e-05, + "loss": 4.9402, + "step": 17619 + }, + { + "epoch": 0.10479113141117137, + "grad_norm": 1.6283903121948242, + "learning_rate": 4.8657593589028894e-05, + "loss": 5.2045, + "step": 17620 + }, + { + "epoch": 0.10479707869445237, + "grad_norm": 1.350632905960083, + "learning_rate": 4.865744258181457e-05, + "loss": 5.2314, + "step": 17621 + }, + { + "epoch": 0.10480302597773337, + "grad_norm": 1.5528992414474487, + "learning_rate": 4.865729156634168e-05, + "loss": 4.9361, + "step": 17622 + }, + { + "epoch": 0.10480897326101436, + "grad_norm": 1.4698718786239624, + "learning_rate": 4.865714054261027e-05, + "loss": 5.6547, + "step": 17623 + }, + { + "epoch": 0.10481492054429536, + "grad_norm": 1.2905457019805908, + "learning_rate": 4.86569895106204e-05, + "loss": 5.5628, + "step": 17624 + }, + { + "epoch": 0.10482086782757637, + "grad_norm": 1.2559312582015991, + "learning_rate": 4.8656838470372116e-05, + "loss": 5.3106, + "step": 17625 + }, + { + "epoch": 0.10482681511085735, + "grad_norm": 1.2229273319244385, + "learning_rate": 4.8656687421865466e-05, + "loss": 5.1566, + "step": 17626 + }, + { + "epoch": 0.10483276239413836, + "grad_norm": 1.4148969650268555, + "learning_rate": 4.8656536365100524e-05, + "loss": 5.1785, + "step": 17627 + }, + { + "epoch": 0.10483870967741936, + "grad_norm": 1.4109671115875244, + "learning_rate": 4.865638530007732e-05, + "loss": 4.922, + "step": 17628 + }, + { + "epoch": 0.10484465696070035, + "grad_norm": 1.526160478591919, + "learning_rate": 4.865623422679593e-05, + "loss": 5.0734, + "step": 17629 + }, + { + "epoch": 0.10485060424398135, + "grad_norm": 1.5093508958816528, + "learning_rate": 4.865608314525638e-05, + "loss": 5.1926, + "step": 17630 + }, + { + "epoch": 0.10485655152726235, + "grad_norm": 1.4625009298324585, + "learning_rate": 4.8655932055458734e-05, + "loss": 5.1372, + "step": 17631 + }, + { + "epoch": 0.10486249881054334, + "grad_norm": 1.348502516746521, + "learning_rate": 4.865578095740305e-05, + "loss": 5.0275, + "step": 17632 + }, + { + "epoch": 0.10486844609382434, + "grad_norm": 1.4530283212661743, + "learning_rate": 4.865562985108938e-05, + "loss": 5.093, + "step": 17633 + }, + { + "epoch": 0.10487439337710534, + "grad_norm": 1.4871639013290405, + "learning_rate": 4.865547873651778e-05, + "loss": 5.0789, + "step": 17634 + }, + { + "epoch": 0.10488034066038633, + "grad_norm": 1.2314977645874023, + "learning_rate": 4.865532761368828e-05, + "loss": 5.0966, + "step": 17635 + }, + { + "epoch": 0.10488628794366733, + "grad_norm": 1.3988053798675537, + "learning_rate": 4.865517648260097e-05, + "loss": 5.4284, + "step": 17636 + }, + { + "epoch": 0.10489223522694834, + "grad_norm": 1.3434901237487793, + "learning_rate": 4.865502534325587e-05, + "loss": 5.3563, + "step": 17637 + }, + { + "epoch": 0.10489818251022932, + "grad_norm": 1.3380807638168335, + "learning_rate": 4.865487419565305e-05, + "loss": 5.3628, + "step": 17638 + }, + { + "epoch": 0.10490412979351033, + "grad_norm": 1.5222781896591187, + "learning_rate": 4.865472303979255e-05, + "loss": 5.2164, + "step": 17639 + }, + { + "epoch": 0.10491007707679133, + "grad_norm": 1.2916938066482544, + "learning_rate": 4.865457187567444e-05, + "loss": 5.1248, + "step": 17640 + }, + { + "epoch": 0.10491602436007232, + "grad_norm": 1.4988411664962769, + "learning_rate": 4.8654420703298755e-05, + "loss": 5.0932, + "step": 17641 + }, + { + "epoch": 0.10492197164335332, + "grad_norm": 1.2529023885726929, + "learning_rate": 4.8654269522665564e-05, + "loss": 5.1465, + "step": 17642 + }, + { + "epoch": 0.10492791892663432, + "grad_norm": 1.3913809061050415, + "learning_rate": 4.86541183337749e-05, + "loss": 5.0039, + "step": 17643 + }, + { + "epoch": 0.10493386620991531, + "grad_norm": 1.5128841400146484, + "learning_rate": 4.8653967136626836e-05, + "loss": 4.9937, + "step": 17644 + }, + { + "epoch": 0.10493981349319631, + "grad_norm": 1.3300340175628662, + "learning_rate": 4.865381593122142e-05, + "loss": 5.0521, + "step": 17645 + }, + { + "epoch": 0.10494576077647731, + "grad_norm": 1.6548517942428589, + "learning_rate": 4.86536647175587e-05, + "loss": 5.1361, + "step": 17646 + }, + { + "epoch": 0.1049517080597583, + "grad_norm": 1.2479137182235718, + "learning_rate": 4.865351349563873e-05, + "loss": 5.3129, + "step": 17647 + }, + { + "epoch": 0.1049576553430393, + "grad_norm": 1.3804575204849243, + "learning_rate": 4.8653362265461556e-05, + "loss": 4.9891, + "step": 17648 + }, + { + "epoch": 0.10496360262632029, + "grad_norm": 1.2821561098098755, + "learning_rate": 4.865321102702724e-05, + "loss": 5.0255, + "step": 17649 + }, + { + "epoch": 0.10496954990960129, + "grad_norm": 1.5715882778167725, + "learning_rate": 4.865305978033583e-05, + "loss": 4.9897, + "step": 17650 + }, + { + "epoch": 0.1049754971928823, + "grad_norm": 1.5910687446594238, + "learning_rate": 4.865290852538738e-05, + "loss": 5.1387, + "step": 17651 + }, + { + "epoch": 0.10498144447616328, + "grad_norm": 1.4188683032989502, + "learning_rate": 4.865275726218196e-05, + "loss": 5.3502, + "step": 17652 + }, + { + "epoch": 0.10498739175944428, + "grad_norm": 1.6032958030700684, + "learning_rate": 4.8652605990719594e-05, + "loss": 5.2716, + "step": 17653 + }, + { + "epoch": 0.10499333904272529, + "grad_norm": 1.4894942045211792, + "learning_rate": 4.8652454711000353e-05, + "loss": 5.237, + "step": 17654 + }, + { + "epoch": 0.10499928632600627, + "grad_norm": 1.5370794534683228, + "learning_rate": 4.8652303423024276e-05, + "loss": 5.0227, + "step": 17655 + }, + { + "epoch": 0.10500523360928728, + "grad_norm": 1.4100168943405151, + "learning_rate": 4.865215212679143e-05, + "loss": 5.0713, + "step": 17656 + }, + { + "epoch": 0.10501118089256828, + "grad_norm": 1.6180533170700073, + "learning_rate": 4.8652000822301856e-05, + "loss": 5.2041, + "step": 17657 + }, + { + "epoch": 0.10501712817584927, + "grad_norm": 1.2447609901428223, + "learning_rate": 4.865184950955562e-05, + "loss": 5.1073, + "step": 17658 + }, + { + "epoch": 0.10502307545913027, + "grad_norm": 1.4866548776626587, + "learning_rate": 4.865169818855277e-05, + "loss": 5.1287, + "step": 17659 + }, + { + "epoch": 0.10502902274241127, + "grad_norm": 1.33426034450531, + "learning_rate": 4.865154685929335e-05, + "loss": 5.1343, + "step": 17660 + }, + { + "epoch": 0.10503497002569226, + "grad_norm": 1.122551679611206, + "learning_rate": 4.865139552177742e-05, + "loss": 5.1267, + "step": 17661 + }, + { + "epoch": 0.10504091730897326, + "grad_norm": 1.787278175354004, + "learning_rate": 4.865124417600504e-05, + "loss": 5.4828, + "step": 17662 + }, + { + "epoch": 0.10504686459225426, + "grad_norm": 1.4937405586242676, + "learning_rate": 4.8651092821976246e-05, + "loss": 5.3467, + "step": 17663 + }, + { + "epoch": 0.10505281187553525, + "grad_norm": 1.395286202430725, + "learning_rate": 4.86509414596911e-05, + "loss": 5.1552, + "step": 17664 + }, + { + "epoch": 0.10505875915881625, + "grad_norm": 1.5284260511398315, + "learning_rate": 4.865079008914965e-05, + "loss": 5.2718, + "step": 17665 + }, + { + "epoch": 0.10506470644209726, + "grad_norm": 2.0051753520965576, + "learning_rate": 4.865063871035197e-05, + "loss": 5.1121, + "step": 17666 + }, + { + "epoch": 0.10507065372537824, + "grad_norm": 1.690699577331543, + "learning_rate": 4.8650487323298085e-05, + "loss": 5.1091, + "step": 17667 + }, + { + "epoch": 0.10507660100865925, + "grad_norm": 1.5275843143463135, + "learning_rate": 4.865033592798807e-05, + "loss": 5.3064, + "step": 17668 + }, + { + "epoch": 0.10508254829194025, + "grad_norm": 1.584038496017456, + "learning_rate": 4.865018452442195e-05, + "loss": 5.2598, + "step": 17669 + }, + { + "epoch": 0.10508849557522124, + "grad_norm": 1.8086310625076294, + "learning_rate": 4.865003311259981e-05, + "loss": 5.2229, + "step": 17670 + }, + { + "epoch": 0.10509444285850224, + "grad_norm": 1.805972695350647, + "learning_rate": 4.864988169252168e-05, + "loss": 5.1051, + "step": 17671 + }, + { + "epoch": 0.10510039014178324, + "grad_norm": 1.6209838390350342, + "learning_rate": 4.864973026418762e-05, + "loss": 5.1808, + "step": 17672 + }, + { + "epoch": 0.10510633742506423, + "grad_norm": 1.3997793197631836, + "learning_rate": 4.8649578827597684e-05, + "loss": 4.9167, + "step": 17673 + }, + { + "epoch": 0.10511228470834523, + "grad_norm": 1.368037462234497, + "learning_rate": 4.8649427382751925e-05, + "loss": 4.98, + "step": 17674 + }, + { + "epoch": 0.10511823199162623, + "grad_norm": 1.3904718160629272, + "learning_rate": 4.864927592965039e-05, + "loss": 4.8101, + "step": 17675 + }, + { + "epoch": 0.10512417927490722, + "grad_norm": 1.3237133026123047, + "learning_rate": 4.864912446829315e-05, + "loss": 5.1427, + "step": 17676 + }, + { + "epoch": 0.10513012655818822, + "grad_norm": 1.2642048597335815, + "learning_rate": 4.864897299868024e-05, + "loss": 5.2961, + "step": 17677 + }, + { + "epoch": 0.10513607384146921, + "grad_norm": 1.4357531070709229, + "learning_rate": 4.864882152081172e-05, + "loss": 5.4811, + "step": 17678 + }, + { + "epoch": 0.10514202112475021, + "grad_norm": 1.652321696281433, + "learning_rate": 4.864867003468763e-05, + "loss": 5.2172, + "step": 17679 + }, + { + "epoch": 0.10514796840803121, + "grad_norm": 1.6143925189971924, + "learning_rate": 4.864851854030804e-05, + "loss": 4.9856, + "step": 17680 + }, + { + "epoch": 0.1051539156913122, + "grad_norm": 1.637320637702942, + "learning_rate": 4.8648367037673e-05, + "loss": 4.9458, + "step": 17681 + }, + { + "epoch": 0.1051598629745932, + "grad_norm": 1.650970458984375, + "learning_rate": 4.864821552678256e-05, + "loss": 4.714, + "step": 17682 + }, + { + "epoch": 0.1051658102578742, + "grad_norm": 1.616098403930664, + "learning_rate": 4.864806400763676e-05, + "loss": 4.7064, + "step": 17683 + }, + { + "epoch": 0.1051717575411552, + "grad_norm": 1.6400461196899414, + "learning_rate": 4.864791248023568e-05, + "loss": 4.5955, + "step": 17684 + }, + { + "epoch": 0.1051777048244362, + "grad_norm": 1.3815523386001587, + "learning_rate": 4.8647760944579344e-05, + "loss": 4.7491, + "step": 17685 + }, + { + "epoch": 0.1051836521077172, + "grad_norm": 1.5695693492889404, + "learning_rate": 4.864760940066783e-05, + "loss": 4.6242, + "step": 17686 + }, + { + "epoch": 0.10518959939099819, + "grad_norm": 1.5861409902572632, + "learning_rate": 4.8647457848501174e-05, + "loss": 4.5859, + "step": 17687 + }, + { + "epoch": 0.10519554667427919, + "grad_norm": 1.637741208076477, + "learning_rate": 4.864730628807944e-05, + "loss": 4.6572, + "step": 17688 + }, + { + "epoch": 0.10520149395756019, + "grad_norm": 1.5806957483291626, + "learning_rate": 4.864715471940268e-05, + "loss": 4.8879, + "step": 17689 + }, + { + "epoch": 0.10520744124084118, + "grad_norm": 2.0158286094665527, + "learning_rate": 4.864700314247093e-05, + "loss": 5.5019, + "step": 17690 + }, + { + "epoch": 0.10521338852412218, + "grad_norm": 1.5022921562194824, + "learning_rate": 4.8646851557284256e-05, + "loss": 5.2029, + "step": 17691 + }, + { + "epoch": 0.10521933580740318, + "grad_norm": 1.8164446353912354, + "learning_rate": 4.864669996384272e-05, + "loss": 4.9258, + "step": 17692 + }, + { + "epoch": 0.10522528309068417, + "grad_norm": 1.6789724826812744, + "learning_rate": 4.864654836214636e-05, + "loss": 5.0876, + "step": 17693 + }, + { + "epoch": 0.10523123037396517, + "grad_norm": 1.778971552848816, + "learning_rate": 4.864639675219523e-05, + "loss": 5.1052, + "step": 17694 + }, + { + "epoch": 0.10523717765724618, + "grad_norm": 1.2401436567306519, + "learning_rate": 4.8646245133989396e-05, + "loss": 5.2536, + "step": 17695 + }, + { + "epoch": 0.10524312494052716, + "grad_norm": 1.6509275436401367, + "learning_rate": 4.8646093507528904e-05, + "loss": 4.9215, + "step": 17696 + }, + { + "epoch": 0.10524907222380817, + "grad_norm": 1.3725727796554565, + "learning_rate": 4.864594187281379e-05, + "loss": 5.5578, + "step": 17697 + }, + { + "epoch": 0.10525501950708917, + "grad_norm": 1.481040358543396, + "learning_rate": 4.864579022984413e-05, + "loss": 5.4683, + "step": 17698 + }, + { + "epoch": 0.10526096679037016, + "grad_norm": 1.4682444334030151, + "learning_rate": 4.864563857861998e-05, + "loss": 5.5076, + "step": 17699 + }, + { + "epoch": 0.10526691407365116, + "grad_norm": 1.2660551071166992, + "learning_rate": 4.864548691914137e-05, + "loss": 5.6092, + "step": 17700 + }, + { + "epoch": 0.10527286135693216, + "grad_norm": 1.266858458518982, + "learning_rate": 4.8645335251408366e-05, + "loss": 5.4373, + "step": 17701 + }, + { + "epoch": 0.10527880864021315, + "grad_norm": 1.5075262784957886, + "learning_rate": 4.8645183575421024e-05, + "loss": 5.3651, + "step": 17702 + }, + { + "epoch": 0.10528475592349415, + "grad_norm": 1.6108607053756714, + "learning_rate": 4.864503189117939e-05, + "loss": 5.3372, + "step": 17703 + }, + { + "epoch": 0.10529070320677515, + "grad_norm": 1.677874207496643, + "learning_rate": 4.8644880198683515e-05, + "loss": 4.9378, + "step": 17704 + }, + { + "epoch": 0.10529665049005614, + "grad_norm": 1.5847524404525757, + "learning_rate": 4.864472849793346e-05, + "loss": 5.2918, + "step": 17705 + }, + { + "epoch": 0.10530259777333714, + "grad_norm": 1.598244309425354, + "learning_rate": 4.864457678892927e-05, + "loss": 5.2408, + "step": 17706 + }, + { + "epoch": 0.10530854505661813, + "grad_norm": 1.4147340059280396, + "learning_rate": 4.8644425071671015e-05, + "loss": 5.2856, + "step": 17707 + }, + { + "epoch": 0.10531449233989913, + "grad_norm": 1.6057299375534058, + "learning_rate": 4.8644273346158734e-05, + "loss": 5.343, + "step": 17708 + }, + { + "epoch": 0.10532043962318013, + "grad_norm": 1.3503344058990479, + "learning_rate": 4.864412161239247e-05, + "loss": 5.4081, + "step": 17709 + }, + { + "epoch": 0.10532638690646112, + "grad_norm": 1.8316742181777954, + "learning_rate": 4.8643969870372295e-05, + "loss": 4.7925, + "step": 17710 + }, + { + "epoch": 0.10533233418974212, + "grad_norm": 2.1429593563079834, + "learning_rate": 4.864381812009825e-05, + "loss": 4.3519, + "step": 17711 + }, + { + "epoch": 0.10533828147302313, + "grad_norm": 1.9665764570236206, + "learning_rate": 4.8643666361570396e-05, + "loss": 4.388, + "step": 17712 + }, + { + "epoch": 0.10534422875630411, + "grad_norm": 1.7851755619049072, + "learning_rate": 4.864351459478878e-05, + "loss": 4.5242, + "step": 17713 + }, + { + "epoch": 0.10535017603958512, + "grad_norm": 1.8347305059432983, + "learning_rate": 4.864336281975346e-05, + "loss": 4.166, + "step": 17714 + }, + { + "epoch": 0.10535612332286612, + "grad_norm": 1.9413511753082275, + "learning_rate": 4.864321103646449e-05, + "loss": 4.0937, + "step": 17715 + }, + { + "epoch": 0.1053620706061471, + "grad_norm": 1.8122237920761108, + "learning_rate": 4.8643059244921904e-05, + "loss": 4.3812, + "step": 17716 + }, + { + "epoch": 0.10536801788942811, + "grad_norm": 2.0114996433258057, + "learning_rate": 4.864290744512578e-05, + "loss": 4.0728, + "step": 17717 + }, + { + "epoch": 0.10537396517270911, + "grad_norm": 1.8565599918365479, + "learning_rate": 4.8642755637076165e-05, + "loss": 4.2625, + "step": 17718 + }, + { + "epoch": 0.1053799124559901, + "grad_norm": 1.9136046171188354, + "learning_rate": 4.8642603820773105e-05, + "loss": 4.4933, + "step": 17719 + }, + { + "epoch": 0.1053858597392711, + "grad_norm": 1.8930033445358276, + "learning_rate": 4.864245199621666e-05, + "loss": 4.3249, + "step": 17720 + }, + { + "epoch": 0.1053918070225521, + "grad_norm": 1.7729578018188477, + "learning_rate": 4.864230016340687e-05, + "loss": 4.4736, + "step": 17721 + }, + { + "epoch": 0.10539775430583309, + "grad_norm": 2.1663360595703125, + "learning_rate": 4.864214832234381e-05, + "loss": 4.7505, + "step": 17722 + }, + { + "epoch": 0.1054037015891141, + "grad_norm": 1.9864879846572876, + "learning_rate": 4.864199647302751e-05, + "loss": 4.7233, + "step": 17723 + }, + { + "epoch": 0.1054096488723951, + "grad_norm": 2.031329870223999, + "learning_rate": 4.8641844615458035e-05, + "loss": 4.8218, + "step": 17724 + }, + { + "epoch": 0.10541559615567608, + "grad_norm": 2.0325984954833984, + "learning_rate": 4.864169274963544e-05, + "loss": 4.9383, + "step": 17725 + }, + { + "epoch": 0.10542154343895709, + "grad_norm": 1.9482324123382568, + "learning_rate": 4.864154087555977e-05, + "loss": 5.0849, + "step": 17726 + }, + { + "epoch": 0.10542749072223809, + "grad_norm": 1.6887640953063965, + "learning_rate": 4.864138899323108e-05, + "loss": 5.0216, + "step": 17727 + }, + { + "epoch": 0.10543343800551908, + "grad_norm": 2.0226924419403076, + "learning_rate": 4.864123710264944e-05, + "loss": 4.9241, + "step": 17728 + }, + { + "epoch": 0.10543938528880008, + "grad_norm": 1.647629976272583, + "learning_rate": 4.8641085203814873e-05, + "loss": 5.0318, + "step": 17729 + }, + { + "epoch": 0.10544533257208108, + "grad_norm": 1.766290545463562, + "learning_rate": 4.864093329672745e-05, + "loss": 4.9034, + "step": 17730 + }, + { + "epoch": 0.10545127985536207, + "grad_norm": 1.7573658227920532, + "learning_rate": 4.864078138138723e-05, + "loss": 4.7783, + "step": 17731 + }, + { + "epoch": 0.10545722713864307, + "grad_norm": 1.5503767728805542, + "learning_rate": 4.864062945779425e-05, + "loss": 5.1085, + "step": 17732 + }, + { + "epoch": 0.10546317442192407, + "grad_norm": 1.7276320457458496, + "learning_rate": 4.864047752594857e-05, + "loss": 4.8028, + "step": 17733 + }, + { + "epoch": 0.10546912170520506, + "grad_norm": 1.9654134511947632, + "learning_rate": 4.864032558585024e-05, + "loss": 5.1221, + "step": 17734 + }, + { + "epoch": 0.10547506898848606, + "grad_norm": 1.9654512405395508, + "learning_rate": 4.864017363749933e-05, + "loss": 5.0463, + "step": 17735 + }, + { + "epoch": 0.10548101627176705, + "grad_norm": 1.9071869850158691, + "learning_rate": 4.864002168089587e-05, + "loss": 5.0822, + "step": 17736 + }, + { + "epoch": 0.10548696355504805, + "grad_norm": 2.4190056324005127, + "learning_rate": 4.863986971603993e-05, + "loss": 5.7404, + "step": 17737 + }, + { + "epoch": 0.10549291083832905, + "grad_norm": 2.2098371982574463, + "learning_rate": 4.863971774293155e-05, + "loss": 5.9282, + "step": 17738 + }, + { + "epoch": 0.10549885812161004, + "grad_norm": 2.569831132888794, + "learning_rate": 4.8639565761570784e-05, + "loss": 4.3309, + "step": 17739 + }, + { + "epoch": 0.10550480540489104, + "grad_norm": 2.252847909927368, + "learning_rate": 4.8639413771957696e-05, + "loss": 4.185, + "step": 17740 + }, + { + "epoch": 0.10551075268817205, + "grad_norm": 2.3022215366363525, + "learning_rate": 4.8639261774092325e-05, + "loss": 4.3537, + "step": 17741 + }, + { + "epoch": 0.10551669997145303, + "grad_norm": 2.2695138454437256, + "learning_rate": 4.8639109767974745e-05, + "loss": 3.9806, + "step": 17742 + }, + { + "epoch": 0.10552264725473404, + "grad_norm": 2.1722588539123535, + "learning_rate": 4.8638957753604985e-05, + "loss": 3.9803, + "step": 17743 + }, + { + "epoch": 0.10552859453801504, + "grad_norm": 2.4385933876037598, + "learning_rate": 4.863880573098312e-05, + "loss": 4.0148, + "step": 17744 + }, + { + "epoch": 0.10553454182129603, + "grad_norm": 2.3186235427856445, + "learning_rate": 4.8638653700109184e-05, + "loss": 3.979, + "step": 17745 + }, + { + "epoch": 0.10554048910457703, + "grad_norm": 2.4591264724731445, + "learning_rate": 4.863850166098324e-05, + "loss": 3.9258, + "step": 17746 + }, + { + "epoch": 0.10554643638785803, + "grad_norm": 2.2619590759277344, + "learning_rate": 4.8638349613605336e-05, + "loss": 4.0571, + "step": 17747 + }, + { + "epoch": 0.10555238367113902, + "grad_norm": 2.393226146697998, + "learning_rate": 4.863819755797553e-05, + "loss": 4.0036, + "step": 17748 + }, + { + "epoch": 0.10555833095442002, + "grad_norm": 2.281846046447754, + "learning_rate": 4.8638045494093875e-05, + "loss": 3.9382, + "step": 17749 + }, + { + "epoch": 0.10556427823770102, + "grad_norm": 2.165407657623291, + "learning_rate": 4.8637893421960425e-05, + "loss": 4.0204, + "step": 17750 + }, + { + "epoch": 0.10557022552098201, + "grad_norm": 2.131829261779785, + "learning_rate": 4.863774134157523e-05, + "loss": 4.8661, + "step": 17751 + }, + { + "epoch": 0.10557617280426301, + "grad_norm": 2.0619029998779297, + "learning_rate": 4.863758925293834e-05, + "loss": 5.5522, + "step": 17752 + }, + { + "epoch": 0.10558212008754402, + "grad_norm": 1.6535427570343018, + "learning_rate": 4.863743715604981e-05, + "loss": 5.3463, + "step": 17753 + }, + { + "epoch": 0.105588067370825, + "grad_norm": 1.903904676437378, + "learning_rate": 4.86372850509097e-05, + "loss": 5.7202, + "step": 17754 + }, + { + "epoch": 0.105594014654106, + "grad_norm": 1.649357557296753, + "learning_rate": 4.863713293751806e-05, + "loss": 5.577, + "step": 17755 + }, + { + "epoch": 0.10559996193738701, + "grad_norm": 2.0812721252441406, + "learning_rate": 4.8636980815874936e-05, + "loss": 5.3164, + "step": 17756 + }, + { + "epoch": 0.105605909220668, + "grad_norm": 2.312357187271118, + "learning_rate": 4.8636828685980384e-05, + "loss": 5.3018, + "step": 17757 + }, + { + "epoch": 0.105611856503949, + "grad_norm": 2.1815388202667236, + "learning_rate": 4.863667654783447e-05, + "loss": 5.1509, + "step": 17758 + }, + { + "epoch": 0.10561780378723, + "grad_norm": 1.7500512599945068, + "learning_rate": 4.8636524401437225e-05, + "loss": 5.492, + "step": 17759 + }, + { + "epoch": 0.10562375107051099, + "grad_norm": 1.6850415468215942, + "learning_rate": 4.863637224678872e-05, + "loss": 5.5086, + "step": 17760 + }, + { + "epoch": 0.10562969835379199, + "grad_norm": 1.7222185134887695, + "learning_rate": 4.8636220083889e-05, + "loss": 5.4139, + "step": 17761 + }, + { + "epoch": 0.10563564563707299, + "grad_norm": 1.627914309501648, + "learning_rate": 4.8636067912738116e-05, + "loss": 5.5763, + "step": 17762 + }, + { + "epoch": 0.10564159292035398, + "grad_norm": 1.5884100198745728, + "learning_rate": 4.863591573333613e-05, + "loss": 5.544, + "step": 17763 + }, + { + "epoch": 0.10564754020363498, + "grad_norm": 1.4660178422927856, + "learning_rate": 4.8635763545683085e-05, + "loss": 5.4913, + "step": 17764 + }, + { + "epoch": 0.10565348748691597, + "grad_norm": 1.5240764617919922, + "learning_rate": 4.863561134977904e-05, + "loss": 5.4757, + "step": 17765 + }, + { + "epoch": 0.10565943477019697, + "grad_norm": 1.3686332702636719, + "learning_rate": 4.863545914562406e-05, + "loss": 5.4934, + "step": 17766 + }, + { + "epoch": 0.10566538205347797, + "grad_norm": 1.5429164171218872, + "learning_rate": 4.863530693321817e-05, + "loss": 5.3654, + "step": 17767 + }, + { + "epoch": 0.10567132933675896, + "grad_norm": 1.4237322807312012, + "learning_rate": 4.863515471256145e-05, + "loss": 5.4128, + "step": 17768 + }, + { + "epoch": 0.10567727662003996, + "grad_norm": 1.6438677310943604, + "learning_rate": 4.863500248365393e-05, + "loss": 5.3129, + "step": 17769 + }, + { + "epoch": 0.10568322390332097, + "grad_norm": 1.9208921194076538, + "learning_rate": 4.8634850246495675e-05, + "loss": 5.4889, + "step": 17770 + }, + { + "epoch": 0.10568917118660195, + "grad_norm": 1.6967288255691528, + "learning_rate": 4.863469800108675e-05, + "loss": 5.5301, + "step": 17771 + }, + { + "epoch": 0.10569511846988296, + "grad_norm": 1.5820802450180054, + "learning_rate": 4.8634545747427185e-05, + "loss": 5.4126, + "step": 17772 + }, + { + "epoch": 0.10570106575316396, + "grad_norm": 1.8280025720596313, + "learning_rate": 4.8634393485517046e-05, + "loss": 6.1201, + "step": 17773 + }, + { + "epoch": 0.10570701303644495, + "grad_norm": 1.809193730354309, + "learning_rate": 4.8634241215356394e-05, + "loss": 5.4123, + "step": 17774 + }, + { + "epoch": 0.10571296031972595, + "grad_norm": 1.596528172492981, + "learning_rate": 4.863408893694527e-05, + "loss": 5.6865, + "step": 17775 + }, + { + "epoch": 0.10571890760300695, + "grad_norm": 1.7726397514343262, + "learning_rate": 4.8633936650283715e-05, + "loss": 5.7298, + "step": 17776 + }, + { + "epoch": 0.10572485488628794, + "grad_norm": 1.5804529190063477, + "learning_rate": 4.863378435537182e-05, + "loss": 5.6051, + "step": 17777 + }, + { + "epoch": 0.10573080216956894, + "grad_norm": 1.5244919061660767, + "learning_rate": 4.8633632052209595e-05, + "loss": 5.7402, + "step": 17778 + }, + { + "epoch": 0.10573674945284994, + "grad_norm": 1.5003318786621094, + "learning_rate": 4.8633479740797117e-05, + "loss": 5.6978, + "step": 17779 + }, + { + "epoch": 0.10574269673613093, + "grad_norm": 1.7325289249420166, + "learning_rate": 4.863332742113444e-05, + "loss": 5.8616, + "step": 17780 + }, + { + "epoch": 0.10574864401941193, + "grad_norm": 1.8214267492294312, + "learning_rate": 4.863317509322161e-05, + "loss": 5.9213, + "step": 17781 + }, + { + "epoch": 0.10575459130269294, + "grad_norm": 1.7067787647247314, + "learning_rate": 4.863302275705869e-05, + "loss": 5.5518, + "step": 17782 + }, + { + "epoch": 0.10576053858597392, + "grad_norm": 1.8018234968185425, + "learning_rate": 4.863287041264571e-05, + "loss": 5.5241, + "step": 17783 + }, + { + "epoch": 0.10576648586925493, + "grad_norm": 1.7645032405853271, + "learning_rate": 4.863271805998275e-05, + "loss": 5.6471, + "step": 17784 + }, + { + "epoch": 0.10577243315253593, + "grad_norm": 1.6891655921936035, + "learning_rate": 4.8632565699069854e-05, + "loss": 5.9138, + "step": 17785 + }, + { + "epoch": 0.10577838043581692, + "grad_norm": 1.6546204090118408, + "learning_rate": 4.8632413329907076e-05, + "loss": 5.8511, + "step": 17786 + }, + { + "epoch": 0.10578432771909792, + "grad_norm": 1.864680528640747, + "learning_rate": 4.863226095249446e-05, + "loss": 5.7665, + "step": 17787 + }, + { + "epoch": 0.10579027500237892, + "grad_norm": 1.9052486419677734, + "learning_rate": 4.863210856683207e-05, + "loss": 5.6528, + "step": 17788 + }, + { + "epoch": 0.10579622228565991, + "grad_norm": 2.212982416152954, + "learning_rate": 4.8631956172919944e-05, + "loss": 5.2294, + "step": 17789 + }, + { + "epoch": 0.10580216956894091, + "grad_norm": 2.0703213214874268, + "learning_rate": 4.863180377075816e-05, + "loss": 4.9963, + "step": 17790 + }, + { + "epoch": 0.10580811685222191, + "grad_norm": 2.1718661785125732, + "learning_rate": 4.863165136034675e-05, + "loss": 5.1047, + "step": 17791 + }, + { + "epoch": 0.1058140641355029, + "grad_norm": 2.2078070640563965, + "learning_rate": 4.8631498941685774e-05, + "loss": 5.2682, + "step": 17792 + }, + { + "epoch": 0.1058200114187839, + "grad_norm": 2.187614917755127, + "learning_rate": 4.863134651477529e-05, + "loss": 4.9008, + "step": 17793 + }, + { + "epoch": 0.10582595870206489, + "grad_norm": 1.7202839851379395, + "learning_rate": 4.863119407961535e-05, + "loss": 5.1006, + "step": 17794 + }, + { + "epoch": 0.10583190598534589, + "grad_norm": 2.3109450340270996, + "learning_rate": 4.8631041636206e-05, + "loss": 4.8489, + "step": 17795 + }, + { + "epoch": 0.1058378532686269, + "grad_norm": 2.2688632011413574, + "learning_rate": 4.8630889184547295e-05, + "loss": 4.953, + "step": 17796 + }, + { + "epoch": 0.10584380055190788, + "grad_norm": 2.0636980533599854, + "learning_rate": 4.863073672463929e-05, + "loss": 4.9537, + "step": 17797 + }, + { + "epoch": 0.10584974783518888, + "grad_norm": 1.9752720594406128, + "learning_rate": 4.863058425648205e-05, + "loss": 4.8646, + "step": 17798 + }, + { + "epoch": 0.10585569511846989, + "grad_norm": 1.9784966707229614, + "learning_rate": 4.86304317800756e-05, + "loss": 5.1245, + "step": 17799 + }, + { + "epoch": 0.10586164240175087, + "grad_norm": 1.812218427658081, + "learning_rate": 4.863027929542002e-05, + "loss": 5.4367, + "step": 17800 + }, + { + "epoch": 0.10586758968503188, + "grad_norm": 1.8048956394195557, + "learning_rate": 4.863012680251536e-05, + "loss": 5.6052, + "step": 17801 + }, + { + "epoch": 0.10587353696831288, + "grad_norm": 1.9246432781219482, + "learning_rate": 4.862997430136166e-05, + "loss": 5.9335, + "step": 17802 + }, + { + "epoch": 0.10587948425159387, + "grad_norm": 1.5138533115386963, + "learning_rate": 4.862982179195897e-05, + "loss": 5.8785, + "step": 17803 + }, + { + "epoch": 0.10588543153487487, + "grad_norm": 1.4948742389678955, + "learning_rate": 4.862966927430737e-05, + "loss": 5.7478, + "step": 17804 + }, + { + "epoch": 0.10589137881815587, + "grad_norm": 1.4670746326446533, + "learning_rate": 4.862951674840689e-05, + "loss": 5.7397, + "step": 17805 + }, + { + "epoch": 0.10589732610143686, + "grad_norm": 1.4234925508499146, + "learning_rate": 4.862936421425759e-05, + "loss": 5.9919, + "step": 17806 + }, + { + "epoch": 0.10590327338471786, + "grad_norm": 1.8313277959823608, + "learning_rate": 4.862921167185953e-05, + "loss": 5.7289, + "step": 17807 + }, + { + "epoch": 0.10590922066799886, + "grad_norm": 1.7373311519622803, + "learning_rate": 4.8629059121212745e-05, + "loss": 5.7652, + "step": 17808 + }, + { + "epoch": 0.10591516795127985, + "grad_norm": 1.7706129550933838, + "learning_rate": 4.86289065623173e-05, + "loss": 5.4623, + "step": 17809 + }, + { + "epoch": 0.10592111523456085, + "grad_norm": 1.7332470417022705, + "learning_rate": 4.862875399517325e-05, + "loss": 5.5546, + "step": 17810 + }, + { + "epoch": 0.10592706251784186, + "grad_norm": 1.7493473291397095, + "learning_rate": 4.862860141978065e-05, + "loss": 5.2762, + "step": 17811 + }, + { + "epoch": 0.10593300980112284, + "grad_norm": 1.8064602613449097, + "learning_rate": 4.862844883613955e-05, + "loss": 5.2969, + "step": 17812 + }, + { + "epoch": 0.10593895708440385, + "grad_norm": 1.6318674087524414, + "learning_rate": 4.862829624425e-05, + "loss": 5.3229, + "step": 17813 + }, + { + "epoch": 0.10594490436768485, + "grad_norm": 1.7438777685165405, + "learning_rate": 4.8628143644112056e-05, + "loss": 5.3167, + "step": 17814 + }, + { + "epoch": 0.10595085165096584, + "grad_norm": 1.8095386028289795, + "learning_rate": 4.8627991035725774e-05, + "loss": 5.2744, + "step": 17815 + }, + { + "epoch": 0.10595679893424684, + "grad_norm": 1.8095691204071045, + "learning_rate": 4.86278384190912e-05, + "loss": 5.5105, + "step": 17816 + }, + { + "epoch": 0.10596274621752784, + "grad_norm": 1.858776569366455, + "learning_rate": 4.862768579420839e-05, + "loss": 5.4338, + "step": 17817 + }, + { + "epoch": 0.10596869350080883, + "grad_norm": 1.8224806785583496, + "learning_rate": 4.86275331610774e-05, + "loss": 5.6273, + "step": 17818 + }, + { + "epoch": 0.10597464078408983, + "grad_norm": 1.6850696802139282, + "learning_rate": 4.8627380519698284e-05, + "loss": 5.9963, + "step": 17819 + }, + { + "epoch": 0.10598058806737083, + "grad_norm": 1.4804600477218628, + "learning_rate": 4.86272278700711e-05, + "loss": 5.726, + "step": 17820 + }, + { + "epoch": 0.10598653535065182, + "grad_norm": 1.721027135848999, + "learning_rate": 4.862707521219589e-05, + "loss": 5.191, + "step": 17821 + }, + { + "epoch": 0.10599248263393282, + "grad_norm": 1.8109691143035889, + "learning_rate": 4.862692254607271e-05, + "loss": 4.926, + "step": 17822 + }, + { + "epoch": 0.10599842991721381, + "grad_norm": 1.7531434297561646, + "learning_rate": 4.862676987170162e-05, + "loss": 5.0376, + "step": 17823 + }, + { + "epoch": 0.10600437720049481, + "grad_norm": 1.6847648620605469, + "learning_rate": 4.8626617189082656e-05, + "loss": 5.0376, + "step": 17824 + }, + { + "epoch": 0.10601032448377581, + "grad_norm": 1.6512411832809448, + "learning_rate": 4.86264644982159e-05, + "loss": 5.087, + "step": 17825 + }, + { + "epoch": 0.1060162717670568, + "grad_norm": 1.6410924196243286, + "learning_rate": 4.8626311799101375e-05, + "loss": 5.6917, + "step": 17826 + }, + { + "epoch": 0.1060222190503378, + "grad_norm": 2.1565957069396973, + "learning_rate": 4.862615909173916e-05, + "loss": 4.619, + "step": 17827 + }, + { + "epoch": 0.1060281663336188, + "grad_norm": 1.8235310316085815, + "learning_rate": 4.86260063761293e-05, + "loss": 5.1155, + "step": 17828 + }, + { + "epoch": 0.1060341136168998, + "grad_norm": 1.7710633277893066, + "learning_rate": 4.862585365227184e-05, + "loss": 4.7845, + "step": 17829 + }, + { + "epoch": 0.1060400609001808, + "grad_norm": 2.174832820892334, + "learning_rate": 4.862570092016683e-05, + "loss": 4.6384, + "step": 17830 + }, + { + "epoch": 0.1060460081834618, + "grad_norm": 2.359682321548462, + "learning_rate": 4.862554817981434e-05, + "loss": 4.2191, + "step": 17831 + }, + { + "epoch": 0.10605195546674279, + "grad_norm": 2.4251585006713867, + "learning_rate": 4.8625395431214414e-05, + "loss": 4.0982, + "step": 17832 + }, + { + "epoch": 0.10605790275002379, + "grad_norm": 2.543009042739868, + "learning_rate": 4.86252426743671e-05, + "loss": 4.0773, + "step": 17833 + }, + { + "epoch": 0.10606385003330479, + "grad_norm": 2.6991419792175293, + "learning_rate": 4.862508990927247e-05, + "loss": 4.0209, + "step": 17834 + }, + { + "epoch": 0.10606979731658578, + "grad_norm": 2.354445695877075, + "learning_rate": 4.862493713593056e-05, + "loss": 3.9223, + "step": 17835 + }, + { + "epoch": 0.10607574459986678, + "grad_norm": 2.5119223594665527, + "learning_rate": 4.8624784354341426e-05, + "loss": 3.9006, + "step": 17836 + }, + { + "epoch": 0.10608169188314778, + "grad_norm": 2.717792272567749, + "learning_rate": 4.862463156450513e-05, + "loss": 4.3295, + "step": 17837 + }, + { + "epoch": 0.10608763916642877, + "grad_norm": 3.1779162883758545, + "learning_rate": 4.862447876642171e-05, + "loss": 4.3483, + "step": 17838 + }, + { + "epoch": 0.10609358644970977, + "grad_norm": 2.272994041442871, + "learning_rate": 4.8624325960091235e-05, + "loss": 4.2826, + "step": 17839 + }, + { + "epoch": 0.10609953373299078, + "grad_norm": 2.4689860343933105, + "learning_rate": 4.862417314551375e-05, + "loss": 4.9144, + "step": 17840 + }, + { + "epoch": 0.10610548101627176, + "grad_norm": 1.8101458549499512, + "learning_rate": 4.862402032268931e-05, + "loss": 5.9325, + "step": 17841 + }, + { + "epoch": 0.10611142829955277, + "grad_norm": 1.9994734525680542, + "learning_rate": 4.862386749161797e-05, + "loss": 5.5438, + "step": 17842 + }, + { + "epoch": 0.10611737558283377, + "grad_norm": 2.5475401878356934, + "learning_rate": 4.8623714652299786e-05, + "loss": 5.2262, + "step": 17843 + }, + { + "epoch": 0.10612332286611476, + "grad_norm": 2.286040782928467, + "learning_rate": 4.86235618047348e-05, + "loss": 5.065, + "step": 17844 + }, + { + "epoch": 0.10612927014939576, + "grad_norm": 1.788761854171753, + "learning_rate": 4.862340894892308e-05, + "loss": 5.5053, + "step": 17845 + }, + { + "epoch": 0.10613521743267676, + "grad_norm": 2.2951841354370117, + "learning_rate": 4.8623256084864663e-05, + "loss": 5.1262, + "step": 17846 + }, + { + "epoch": 0.10614116471595775, + "grad_norm": 1.962814211845398, + "learning_rate": 4.862310321255962e-05, + "loss": 5.8084, + "step": 17847 + }, + { + "epoch": 0.10614711199923875, + "grad_norm": 1.7888414859771729, + "learning_rate": 4.862295033200799e-05, + "loss": 5.2409, + "step": 17848 + }, + { + "epoch": 0.10615305928251975, + "grad_norm": 1.7108670473098755, + "learning_rate": 4.862279744320983e-05, + "loss": 5.6138, + "step": 17849 + }, + { + "epoch": 0.10615900656580074, + "grad_norm": 1.7636443376541138, + "learning_rate": 4.8622644546165196e-05, + "loss": 5.5664, + "step": 17850 + }, + { + "epoch": 0.10616495384908174, + "grad_norm": 1.7193186283111572, + "learning_rate": 4.8622491640874147e-05, + "loss": 5.7852, + "step": 17851 + }, + { + "epoch": 0.10617090113236273, + "grad_norm": 1.817215919494629, + "learning_rate": 4.8622338727336723e-05, + "loss": 5.5478, + "step": 17852 + }, + { + "epoch": 0.10617684841564373, + "grad_norm": 1.547817349433899, + "learning_rate": 4.8622185805552994e-05, + "loss": 5.5249, + "step": 17853 + }, + { + "epoch": 0.10618279569892473, + "grad_norm": 1.577528953552246, + "learning_rate": 4.862203287552299e-05, + "loss": 5.7268, + "step": 17854 + }, + { + "epoch": 0.10618874298220572, + "grad_norm": 1.4524853229522705, + "learning_rate": 4.862187993724679e-05, + "loss": 5.8539, + "step": 17855 + }, + { + "epoch": 0.10619469026548672, + "grad_norm": 1.6361198425292969, + "learning_rate": 4.8621726990724437e-05, + "loss": 5.0815, + "step": 17856 + }, + { + "epoch": 0.10620063754876773, + "grad_norm": 1.65043044090271, + "learning_rate": 4.862157403595598e-05, + "loss": 5.1938, + "step": 17857 + }, + { + "epoch": 0.10620658483204871, + "grad_norm": 1.6236746311187744, + "learning_rate": 4.8621421072941476e-05, + "loss": 5.5602, + "step": 17858 + }, + { + "epoch": 0.10621253211532972, + "grad_norm": 1.4648228883743286, + "learning_rate": 4.862126810168097e-05, + "loss": 5.3728, + "step": 17859 + }, + { + "epoch": 0.10621847939861072, + "grad_norm": 1.4803123474121094, + "learning_rate": 4.862111512217453e-05, + "loss": 5.58, + "step": 17860 + }, + { + "epoch": 0.1062244266818917, + "grad_norm": 1.320387840270996, + "learning_rate": 4.862096213442221e-05, + "loss": 5.0337, + "step": 17861 + }, + { + "epoch": 0.10623037396517271, + "grad_norm": 1.8309158086776733, + "learning_rate": 4.862080913842405e-05, + "loss": 4.3603, + "step": 17862 + }, + { + "epoch": 0.10623632124845371, + "grad_norm": 1.79231595993042, + "learning_rate": 4.86206561341801e-05, + "loss": 4.401, + "step": 17863 + }, + { + "epoch": 0.1062422685317347, + "grad_norm": 1.7894480228424072, + "learning_rate": 4.862050312169043e-05, + "loss": 4.4592, + "step": 17864 + }, + { + "epoch": 0.1062482158150157, + "grad_norm": 1.8271396160125732, + "learning_rate": 4.8620350100955095e-05, + "loss": 4.2442, + "step": 17865 + }, + { + "epoch": 0.1062541630982967, + "grad_norm": 2.03336238861084, + "learning_rate": 4.862019707197413e-05, + "loss": 4.6245, + "step": 17866 + }, + { + "epoch": 0.10626011038157769, + "grad_norm": 1.8034088611602783, + "learning_rate": 4.86200440347476e-05, + "loss": 4.5798, + "step": 17867 + }, + { + "epoch": 0.10626605766485869, + "grad_norm": 1.366013765335083, + "learning_rate": 4.861989098927556e-05, + "loss": 5.2409, + "step": 17868 + }, + { + "epoch": 0.1062720049481397, + "grad_norm": 1.603281855583191, + "learning_rate": 4.8619737935558054e-05, + "loss": 5.6699, + "step": 17869 + }, + { + "epoch": 0.10627795223142068, + "grad_norm": 1.6720329523086548, + "learning_rate": 4.861958487359515e-05, + "loss": 5.2162, + "step": 17870 + }, + { + "epoch": 0.10628389951470169, + "grad_norm": 2.5577762126922607, + "learning_rate": 4.861943180338689e-05, + "loss": 3.9116, + "step": 17871 + }, + { + "epoch": 0.10628984679798269, + "grad_norm": 2.6489310264587402, + "learning_rate": 4.861927872493332e-05, + "loss": 4.232, + "step": 17872 + }, + { + "epoch": 0.10629579408126368, + "grad_norm": 2.481381893157959, + "learning_rate": 4.861912563823451e-05, + "loss": 4.374, + "step": 17873 + }, + { + "epoch": 0.10630174136454468, + "grad_norm": 2.444721221923828, + "learning_rate": 4.861897254329052e-05, + "loss": 4.504, + "step": 17874 + }, + { + "epoch": 0.10630768864782568, + "grad_norm": 2.529085636138916, + "learning_rate": 4.8618819440101373e-05, + "loss": 4.1305, + "step": 17875 + }, + { + "epoch": 0.10631363593110667, + "grad_norm": 3.966379404067993, + "learning_rate": 4.861866632866715e-05, + "loss": 3.9104, + "step": 17876 + }, + { + "epoch": 0.10631958321438767, + "grad_norm": 2.408405065536499, + "learning_rate": 4.8618513208987895e-05, + "loss": 3.8762, + "step": 17877 + }, + { + "epoch": 0.10632553049766867, + "grad_norm": 2.41780686378479, + "learning_rate": 4.8618360081063654e-05, + "loss": 3.7665, + "step": 17878 + }, + { + "epoch": 0.10633147778094966, + "grad_norm": 2.60262393951416, + "learning_rate": 4.861820694489448e-05, + "loss": 4.067, + "step": 17879 + }, + { + "epoch": 0.10633742506423066, + "grad_norm": 2.624938726425171, + "learning_rate": 4.8618053800480456e-05, + "loss": 4.5653, + "step": 17880 + }, + { + "epoch": 0.10634337234751165, + "grad_norm": 2.783202886581421, + "learning_rate": 4.86179006478216e-05, + "loss": 4.4091, + "step": 17881 + }, + { + "epoch": 0.10634931963079265, + "grad_norm": 2.8269615173339844, + "learning_rate": 4.861774748691798e-05, + "loss": 3.949, + "step": 17882 + }, + { + "epoch": 0.10635526691407365, + "grad_norm": 2.82108998298645, + "learning_rate": 4.861759431776965e-05, + "loss": 3.8479, + "step": 17883 + }, + { + "epoch": 0.10636121419735464, + "grad_norm": 2.8543620109558105, + "learning_rate": 4.861744114037666e-05, + "loss": 3.4358, + "step": 17884 + }, + { + "epoch": 0.10636716148063564, + "grad_norm": 2.6492035388946533, + "learning_rate": 4.861728795473907e-05, + "loss": 3.6298, + "step": 17885 + }, + { + "epoch": 0.10637310876391665, + "grad_norm": 2.834181785583496, + "learning_rate": 4.861713476085693e-05, + "loss": 3.4125, + "step": 17886 + }, + { + "epoch": 0.10637905604719763, + "grad_norm": 3.447075605392456, + "learning_rate": 4.861698155873028e-05, + "loss": 3.5416, + "step": 17887 + }, + { + "epoch": 0.10638500333047864, + "grad_norm": 3.6009531021118164, + "learning_rate": 4.86168283483592e-05, + "loss": 4.1912, + "step": 17888 + }, + { + "epoch": 0.10639095061375964, + "grad_norm": 4.086645126342773, + "learning_rate": 4.861667512974372e-05, + "loss": 4.3999, + "step": 17889 + }, + { + "epoch": 0.10639689789704063, + "grad_norm": 3.673405408859253, + "learning_rate": 4.86165219028839e-05, + "loss": 4.3731, + "step": 17890 + }, + { + "epoch": 0.10640284518032163, + "grad_norm": 2.2896664142608643, + "learning_rate": 4.861636866777981e-05, + "loss": 5.5963, + "step": 17891 + }, + { + "epoch": 0.10640879246360263, + "grad_norm": 2.0481069087982178, + "learning_rate": 4.861621542443148e-05, + "loss": 5.7909, + "step": 17892 + }, + { + "epoch": 0.10641473974688362, + "grad_norm": 1.9108741283416748, + "learning_rate": 4.861606217283897e-05, + "loss": 5.3044, + "step": 17893 + }, + { + "epoch": 0.10642068703016462, + "grad_norm": 1.7842040061950684, + "learning_rate": 4.861590891300235e-05, + "loss": 5.3071, + "step": 17894 + }, + { + "epoch": 0.10642663431344562, + "grad_norm": 1.854777455329895, + "learning_rate": 4.861575564492164e-05, + "loss": 5.386, + "step": 17895 + }, + { + "epoch": 0.10643258159672661, + "grad_norm": 1.7286109924316406, + "learning_rate": 4.861560236859693e-05, + "loss": 5.5609, + "step": 17896 + }, + { + "epoch": 0.10643852888000761, + "grad_norm": 1.709408164024353, + "learning_rate": 4.861544908402825e-05, + "loss": 5.6772, + "step": 17897 + }, + { + "epoch": 0.10644447616328861, + "grad_norm": 1.9251428842544556, + "learning_rate": 4.861529579121567e-05, + "loss": 5.6114, + "step": 17898 + }, + { + "epoch": 0.1064504234465696, + "grad_norm": 1.6568808555603027, + "learning_rate": 4.8615142490159226e-05, + "loss": 5.4648, + "step": 17899 + }, + { + "epoch": 0.1064563707298506, + "grad_norm": 1.7793960571289062, + "learning_rate": 4.861498918085898e-05, + "loss": 5.4987, + "step": 17900 + }, + { + "epoch": 0.10646231801313161, + "grad_norm": 1.9044899940490723, + "learning_rate": 4.861483586331499e-05, + "loss": 5.7757, + "step": 17901 + }, + { + "epoch": 0.1064682652964126, + "grad_norm": 2.215278387069702, + "learning_rate": 4.86146825375273e-05, + "loss": 6.2767, + "step": 17902 + }, + { + "epoch": 0.1064742125796936, + "grad_norm": 1.8699604272842407, + "learning_rate": 4.861452920349597e-05, + "loss": 6.2987, + "step": 17903 + }, + { + "epoch": 0.1064801598629746, + "grad_norm": 1.634887456893921, + "learning_rate": 4.861437586122105e-05, + "loss": 6.2596, + "step": 17904 + }, + { + "epoch": 0.10648610714625559, + "grad_norm": 1.54149329662323, + "learning_rate": 4.86142225107026e-05, + "loss": 6.1988, + "step": 17905 + }, + { + "epoch": 0.10649205442953659, + "grad_norm": 1.5954409837722778, + "learning_rate": 4.861406915194067e-05, + "loss": 6.1052, + "step": 17906 + }, + { + "epoch": 0.10649800171281759, + "grad_norm": 1.8810808658599854, + "learning_rate": 4.86139157849353e-05, + "loss": 6.0318, + "step": 17907 + }, + { + "epoch": 0.10650394899609858, + "grad_norm": 1.4983458518981934, + "learning_rate": 4.861376240968656e-05, + "loss": 5.8614, + "step": 17908 + }, + { + "epoch": 0.10650989627937958, + "grad_norm": 1.5446088314056396, + "learning_rate": 4.8613609026194504e-05, + "loss": 5.623, + "step": 17909 + }, + { + "epoch": 0.10651584356266057, + "grad_norm": 1.7121042013168335, + "learning_rate": 4.861345563445918e-05, + "loss": 4.9258, + "step": 17910 + }, + { + "epoch": 0.10652179084594157, + "grad_norm": 2.002478837966919, + "learning_rate": 4.861330223448065e-05, + "loss": 5.285, + "step": 17911 + }, + { + "epoch": 0.10652773812922257, + "grad_norm": 1.7703490257263184, + "learning_rate": 4.8613148826258944e-05, + "loss": 5.2279, + "step": 17912 + }, + { + "epoch": 0.10653368541250356, + "grad_norm": 1.7763222455978394, + "learning_rate": 4.861299540979415e-05, + "loss": 4.8737, + "step": 17913 + }, + { + "epoch": 0.10653963269578456, + "grad_norm": 1.5921473503112793, + "learning_rate": 4.8612841985086296e-05, + "loss": 5.3756, + "step": 17914 + }, + { + "epoch": 0.10654557997906557, + "grad_norm": 1.810085654258728, + "learning_rate": 4.8612688552135435e-05, + "loss": 5.3784, + "step": 17915 + }, + { + "epoch": 0.10655152726234655, + "grad_norm": 2.2289364337921143, + "learning_rate": 4.8612535110941636e-05, + "loss": 5.0258, + "step": 17916 + }, + { + "epoch": 0.10655747454562756, + "grad_norm": 1.9337642192840576, + "learning_rate": 4.8612381661504946e-05, + "loss": 4.9943, + "step": 17917 + }, + { + "epoch": 0.10656342182890856, + "grad_norm": 1.5772477388381958, + "learning_rate": 4.861222820382542e-05, + "loss": 5.1188, + "step": 17918 + }, + { + "epoch": 0.10656936911218955, + "grad_norm": 1.6176950931549072, + "learning_rate": 4.8612074737903097e-05, + "loss": 5.0973, + "step": 17919 + }, + { + "epoch": 0.10657531639547055, + "grad_norm": 1.7878233194351196, + "learning_rate": 4.8611921263738045e-05, + "loss": 5.0342, + "step": 17920 + }, + { + "epoch": 0.10658126367875155, + "grad_norm": 1.7473089694976807, + "learning_rate": 4.861176778133033e-05, + "loss": 5.2844, + "step": 17921 + }, + { + "epoch": 0.10658721096203254, + "grad_norm": 2.472464084625244, + "learning_rate": 4.8611614290679975e-05, + "loss": 4.9654, + "step": 17922 + }, + { + "epoch": 0.10659315824531354, + "grad_norm": 2.5256218910217285, + "learning_rate": 4.861146079178706e-05, + "loss": 4.7885, + "step": 17923 + }, + { + "epoch": 0.10659910552859454, + "grad_norm": 2.2665674686431885, + "learning_rate": 4.861130728465162e-05, + "loss": 5.0838, + "step": 17924 + }, + { + "epoch": 0.10660505281187553, + "grad_norm": 1.6795161962509155, + "learning_rate": 4.861115376927372e-05, + "loss": 5.3174, + "step": 17925 + }, + { + "epoch": 0.10661100009515653, + "grad_norm": 1.5786751508712769, + "learning_rate": 4.8611000245653405e-05, + "loss": 5.1831, + "step": 17926 + }, + { + "epoch": 0.10661694737843753, + "grad_norm": 2.0238442420959473, + "learning_rate": 4.861084671379074e-05, + "loss": 5.7967, + "step": 17927 + }, + { + "epoch": 0.10662289466171852, + "grad_norm": 1.5760328769683838, + "learning_rate": 4.861069317368577e-05, + "loss": 5.5692, + "step": 17928 + }, + { + "epoch": 0.10662884194499953, + "grad_norm": 1.7190479040145874, + "learning_rate": 4.861053962533855e-05, + "loss": 5.4248, + "step": 17929 + }, + { + "epoch": 0.10663478922828053, + "grad_norm": 1.987444519996643, + "learning_rate": 4.861038606874914e-05, + "loss": 5.3845, + "step": 17930 + }, + { + "epoch": 0.10664073651156152, + "grad_norm": 2.3603975772857666, + "learning_rate": 4.8610232503917585e-05, + "loss": 4.9948, + "step": 17931 + }, + { + "epoch": 0.10664668379484252, + "grad_norm": 2.560696601867676, + "learning_rate": 4.861007893084394e-05, + "loss": 4.797, + "step": 17932 + }, + { + "epoch": 0.10665263107812352, + "grad_norm": 2.3494272232055664, + "learning_rate": 4.860992534952826e-05, + "loss": 4.81, + "step": 17933 + }, + { + "epoch": 0.10665857836140451, + "grad_norm": 2.1878998279571533, + "learning_rate": 4.86097717599706e-05, + "loss": 4.7863, + "step": 17934 + }, + { + "epoch": 0.10666452564468551, + "grad_norm": 2.123789072036743, + "learning_rate": 4.8609618162171016e-05, + "loss": 4.7846, + "step": 17935 + }, + { + "epoch": 0.10667047292796651, + "grad_norm": 2.307370662689209, + "learning_rate": 4.8609464556129555e-05, + "loss": 4.3901, + "step": 17936 + }, + { + "epoch": 0.1066764202112475, + "grad_norm": 1.8189514875411987, + "learning_rate": 4.8609310941846274e-05, + "loss": 5.2722, + "step": 17937 + }, + { + "epoch": 0.1066823674945285, + "grad_norm": 1.4699981212615967, + "learning_rate": 4.860915731932123e-05, + "loss": 5.7501, + "step": 17938 + }, + { + "epoch": 0.10668831477780949, + "grad_norm": 1.5624393224716187, + "learning_rate": 4.860900368855447e-05, + "loss": 5.6963, + "step": 17939 + }, + { + "epoch": 0.10669426206109049, + "grad_norm": 1.8463138341903687, + "learning_rate": 4.860885004954605e-05, + "loss": 5.3627, + "step": 17940 + }, + { + "epoch": 0.1067002093443715, + "grad_norm": 1.7627042531967163, + "learning_rate": 4.8608696402296025e-05, + "loss": 5.6548, + "step": 17941 + }, + { + "epoch": 0.10670615662765248, + "grad_norm": 1.631505012512207, + "learning_rate": 4.860854274680444e-05, + "loss": 5.7926, + "step": 17942 + }, + { + "epoch": 0.10671210391093348, + "grad_norm": 1.4491498470306396, + "learning_rate": 4.860838908307137e-05, + "loss": 5.5395, + "step": 17943 + }, + { + "epoch": 0.10671805119421449, + "grad_norm": 1.6210049390792847, + "learning_rate": 4.8608235411096845e-05, + "loss": 5.2768, + "step": 17944 + }, + { + "epoch": 0.10672399847749547, + "grad_norm": 1.4522534608840942, + "learning_rate": 4.860808173088094e-05, + "loss": 5.7723, + "step": 17945 + }, + { + "epoch": 0.10672994576077648, + "grad_norm": 2.0779013633728027, + "learning_rate": 4.860792804242369e-05, + "loss": 5.4679, + "step": 17946 + }, + { + "epoch": 0.10673589304405748, + "grad_norm": 2.248556137084961, + "learning_rate": 4.860777434572515e-05, + "loss": 5.5089, + "step": 17947 + }, + { + "epoch": 0.10674184032733847, + "grad_norm": 2.2192306518554688, + "learning_rate": 4.86076206407854e-05, + "loss": 5.4098, + "step": 17948 + }, + { + "epoch": 0.10674778761061947, + "grad_norm": 1.7523053884506226, + "learning_rate": 4.8607466927604455e-05, + "loss": 5.3223, + "step": 17949 + }, + { + "epoch": 0.10675373489390047, + "grad_norm": 1.8636107444763184, + "learning_rate": 4.8607313206182395e-05, + "loss": 5.339, + "step": 17950 + }, + { + "epoch": 0.10675968217718146, + "grad_norm": 1.9067093133926392, + "learning_rate": 4.860715947651926e-05, + "loss": 5.3779, + "step": 17951 + }, + { + "epoch": 0.10676562946046246, + "grad_norm": 1.850948452949524, + "learning_rate": 4.860700573861512e-05, + "loss": 5.3474, + "step": 17952 + }, + { + "epoch": 0.10677157674374346, + "grad_norm": 2.144895076751709, + "learning_rate": 4.8606851992470005e-05, + "loss": 5.3089, + "step": 17953 + }, + { + "epoch": 0.10677752402702445, + "grad_norm": 2.054420232772827, + "learning_rate": 4.860669823808399e-05, + "loss": 5.3653, + "step": 17954 + }, + { + "epoch": 0.10678347131030545, + "grad_norm": 1.94870126247406, + "learning_rate": 4.860654447545711e-05, + "loss": 5.2514, + "step": 17955 + }, + { + "epoch": 0.10678941859358645, + "grad_norm": 1.8006596565246582, + "learning_rate": 4.860639070458945e-05, + "loss": 5.2357, + "step": 17956 + }, + { + "epoch": 0.10679536587686744, + "grad_norm": 2.309035301208496, + "learning_rate": 4.860623692548103e-05, + "loss": 5.2681, + "step": 17957 + }, + { + "epoch": 0.10680131316014845, + "grad_norm": 2.402949571609497, + "learning_rate": 4.860608313813192e-05, + "loss": 5.549, + "step": 17958 + }, + { + "epoch": 0.10680726044342945, + "grad_norm": 1.724307894706726, + "learning_rate": 4.8605929342542164e-05, + "loss": 5.5283, + "step": 17959 + }, + { + "epoch": 0.10681320772671044, + "grad_norm": 1.8566054105758667, + "learning_rate": 4.860577553871183e-05, + "loss": 5.834, + "step": 17960 + }, + { + "epoch": 0.10681915500999144, + "grad_norm": 1.8882628679275513, + "learning_rate": 4.860562172664096e-05, + "loss": 5.7954, + "step": 17961 + }, + { + "epoch": 0.10682510229327244, + "grad_norm": 1.694075345993042, + "learning_rate": 4.860546790632961e-05, + "loss": 5.7573, + "step": 17962 + }, + { + "epoch": 0.10683104957655343, + "grad_norm": 1.8312102556228638, + "learning_rate": 4.860531407777783e-05, + "loss": 5.4479, + "step": 17963 + }, + { + "epoch": 0.10683699685983443, + "grad_norm": 1.6124730110168457, + "learning_rate": 4.860516024098569e-05, + "loss": 5.5356, + "step": 17964 + }, + { + "epoch": 0.10684294414311543, + "grad_norm": 2.3505187034606934, + "learning_rate": 4.8605006395953225e-05, + "loss": 5.6543, + "step": 17965 + }, + { + "epoch": 0.10684889142639642, + "grad_norm": 2.69331431388855, + "learning_rate": 4.86048525426805e-05, + "loss": 5.5359, + "step": 17966 + }, + { + "epoch": 0.10685483870967742, + "grad_norm": 2.095374822616577, + "learning_rate": 4.860469868116756e-05, + "loss": 5.5514, + "step": 17967 + }, + { + "epoch": 0.10686078599295841, + "grad_norm": 1.8596038818359375, + "learning_rate": 4.8604544811414465e-05, + "loss": 5.5171, + "step": 17968 + }, + { + "epoch": 0.10686673327623941, + "grad_norm": 2.215549945831299, + "learning_rate": 4.860439093342127e-05, + "loss": 5.3824, + "step": 17969 + }, + { + "epoch": 0.10687268055952041, + "grad_norm": 1.9737238883972168, + "learning_rate": 4.860423704718803e-05, + "loss": 5.4159, + "step": 17970 + }, + { + "epoch": 0.1068786278428014, + "grad_norm": 1.8673701286315918, + "learning_rate": 4.860408315271479e-05, + "loss": 5.421, + "step": 17971 + }, + { + "epoch": 0.1068845751260824, + "grad_norm": 1.905371069908142, + "learning_rate": 4.86039292500016e-05, + "loss": 5.4003, + "step": 17972 + }, + { + "epoch": 0.1068905224093634, + "grad_norm": 1.7888939380645752, + "learning_rate": 4.8603775339048534e-05, + "loss": 5.1581, + "step": 17973 + }, + { + "epoch": 0.1068964696926444, + "grad_norm": 1.7499796152114868, + "learning_rate": 4.8603621419855625e-05, + "loss": 5.1334, + "step": 17974 + }, + { + "epoch": 0.1069024169759254, + "grad_norm": 1.6159700155258179, + "learning_rate": 4.860346749242295e-05, + "loss": 5.1999, + "step": 17975 + }, + { + "epoch": 0.1069083642592064, + "grad_norm": 1.7355921268463135, + "learning_rate": 4.860331355675053e-05, + "loss": 5.3899, + "step": 17976 + }, + { + "epoch": 0.10691431154248739, + "grad_norm": 1.760110855102539, + "learning_rate": 4.860315961283846e-05, + "loss": 5.5386, + "step": 17977 + }, + { + "epoch": 0.10692025882576839, + "grad_norm": 1.605482816696167, + "learning_rate": 4.860300566068675e-05, + "loss": 5.5486, + "step": 17978 + }, + { + "epoch": 0.10692620610904939, + "grad_norm": 2.1792690753936768, + "learning_rate": 4.860285170029548e-05, + "loss": 4.8871, + "step": 17979 + }, + { + "epoch": 0.10693215339233038, + "grad_norm": 1.4513617753982544, + "learning_rate": 4.86026977316647e-05, + "loss": 5.1944, + "step": 17980 + }, + { + "epoch": 0.10693810067561138, + "grad_norm": 2.560112476348877, + "learning_rate": 4.860254375479446e-05, + "loss": 4.2504, + "step": 17981 + }, + { + "epoch": 0.10694404795889238, + "grad_norm": 2.035403251647949, + "learning_rate": 4.8602389769684816e-05, + "loss": 5.4479, + "step": 17982 + }, + { + "epoch": 0.10694999524217337, + "grad_norm": 1.8496562242507935, + "learning_rate": 4.8602235776335826e-05, + "loss": 5.4981, + "step": 17983 + }, + { + "epoch": 0.10695594252545437, + "grad_norm": 1.9541285037994385, + "learning_rate": 4.8602081774747536e-05, + "loss": 5.5772, + "step": 17984 + }, + { + "epoch": 0.10696188980873537, + "grad_norm": 1.674981951713562, + "learning_rate": 4.860192776492001e-05, + "loss": 5.3656, + "step": 17985 + }, + { + "epoch": 0.10696783709201636, + "grad_norm": 1.675601601600647, + "learning_rate": 4.860177374685328e-05, + "loss": 5.3382, + "step": 17986 + }, + { + "epoch": 0.10697378437529736, + "grad_norm": 1.8874675035476685, + "learning_rate": 4.860161972054743e-05, + "loss": 5.1908, + "step": 17987 + }, + { + "epoch": 0.10697973165857837, + "grad_norm": 2.267000675201416, + "learning_rate": 4.860146568600249e-05, + "loss": 5.4437, + "step": 17988 + }, + { + "epoch": 0.10698567894185936, + "grad_norm": 1.8062045574188232, + "learning_rate": 4.8601311643218526e-05, + "loss": 5.2315, + "step": 17989 + }, + { + "epoch": 0.10699162622514036, + "grad_norm": 1.9503196477890015, + "learning_rate": 4.8601157592195584e-05, + "loss": 5.3999, + "step": 17990 + }, + { + "epoch": 0.10699757350842136, + "grad_norm": 1.8589918613433838, + "learning_rate": 4.860100353293372e-05, + "loss": 5.694, + "step": 17991 + }, + { + "epoch": 0.10700352079170235, + "grad_norm": 1.69667649269104, + "learning_rate": 4.8600849465432995e-05, + "loss": 5.6146, + "step": 17992 + }, + { + "epoch": 0.10700946807498335, + "grad_norm": 1.6006754636764526, + "learning_rate": 4.8600695389693455e-05, + "loss": 5.2849, + "step": 17993 + }, + { + "epoch": 0.10701541535826435, + "grad_norm": 1.7502506971359253, + "learning_rate": 4.860054130571516e-05, + "loss": 4.9652, + "step": 17994 + }, + { + "epoch": 0.10702136264154534, + "grad_norm": 1.6936286687850952, + "learning_rate": 4.860038721349816e-05, + "loss": 5.2192, + "step": 17995 + }, + { + "epoch": 0.10702730992482634, + "grad_norm": 1.4757579565048218, + "learning_rate": 4.8600233113042496e-05, + "loss": 5.3917, + "step": 17996 + }, + { + "epoch": 0.10703325720810733, + "grad_norm": 1.4602460861206055, + "learning_rate": 4.8600079004348245e-05, + "loss": 5.5418, + "step": 17997 + }, + { + "epoch": 0.10703920449138833, + "grad_norm": 1.4150431156158447, + "learning_rate": 4.859992488741545e-05, + "loss": 5.6592, + "step": 17998 + }, + { + "epoch": 0.10704515177466933, + "grad_norm": 1.385908842086792, + "learning_rate": 4.859977076224416e-05, + "loss": 5.2818, + "step": 17999 + }, + { + "epoch": 0.10705109905795032, + "grad_norm": 1.3683747053146362, + "learning_rate": 4.8599616628834446e-05, + "loss": 5.2743, + "step": 18000 + }, + { + "epoch": 0.10705704634123132, + "grad_norm": 1.2521027326583862, + "learning_rate": 4.859946248718634e-05, + "loss": 5.1564, + "step": 18001 + }, + { + "epoch": 0.10706299362451233, + "grad_norm": 1.445575475692749, + "learning_rate": 4.8599308337299906e-05, + "loss": 5.0108, + "step": 18002 + }, + { + "epoch": 0.10706894090779331, + "grad_norm": 1.3680258989334106, + "learning_rate": 4.859915417917519e-05, + "loss": 5.2649, + "step": 18003 + }, + { + "epoch": 0.10707488819107432, + "grad_norm": 1.2142491340637207, + "learning_rate": 4.859900001281227e-05, + "loss": 5.1143, + "step": 18004 + }, + { + "epoch": 0.10708083547435532, + "grad_norm": 1.244157314300537, + "learning_rate": 4.859884583821117e-05, + "loss": 5.2321, + "step": 18005 + }, + { + "epoch": 0.1070867827576363, + "grad_norm": 1.4057670831680298, + "learning_rate": 4.859869165537196e-05, + "loss": 5.3419, + "step": 18006 + }, + { + "epoch": 0.10709273004091731, + "grad_norm": 1.3243392705917358, + "learning_rate": 4.859853746429469e-05, + "loss": 5.0217, + "step": 18007 + }, + { + "epoch": 0.10709867732419831, + "grad_norm": 1.3227713108062744, + "learning_rate": 4.8598383264979416e-05, + "loss": 5.055, + "step": 18008 + }, + { + "epoch": 0.1071046246074793, + "grad_norm": 1.3313336372375488, + "learning_rate": 4.8598229057426195e-05, + "loss": 5.1319, + "step": 18009 + }, + { + "epoch": 0.1071105718907603, + "grad_norm": 1.385715126991272, + "learning_rate": 4.8598074841635064e-05, + "loss": 4.9349, + "step": 18010 + }, + { + "epoch": 0.1071165191740413, + "grad_norm": 1.3244850635528564, + "learning_rate": 4.85979206176061e-05, + "loss": 4.9055, + "step": 18011 + }, + { + "epoch": 0.10712246645732229, + "grad_norm": 1.2922260761260986, + "learning_rate": 4.859776638533934e-05, + "loss": 5.0518, + "step": 18012 + }, + { + "epoch": 0.10712841374060329, + "grad_norm": 1.3371012210845947, + "learning_rate": 4.8597612144834845e-05, + "loss": 5.234, + "step": 18013 + }, + { + "epoch": 0.1071343610238843, + "grad_norm": 1.3367552757263184, + "learning_rate": 4.859745789609267e-05, + "loss": 4.9765, + "step": 18014 + }, + { + "epoch": 0.10714030830716528, + "grad_norm": 1.5067929029464722, + "learning_rate": 4.859730363911286e-05, + "loss": 5.235, + "step": 18015 + }, + { + "epoch": 0.10714625559044628, + "grad_norm": 1.3660157918930054, + "learning_rate": 4.859714937389548e-05, + "loss": 5.4104, + "step": 18016 + }, + { + "epoch": 0.10715220287372729, + "grad_norm": 1.3999029397964478, + "learning_rate": 4.859699510044057e-05, + "loss": 5.1603, + "step": 18017 + }, + { + "epoch": 0.10715815015700828, + "grad_norm": 1.6147737503051758, + "learning_rate": 4.8596840818748204e-05, + "loss": 5.0506, + "step": 18018 + }, + { + "epoch": 0.10716409744028928, + "grad_norm": 1.5618371963500977, + "learning_rate": 4.859668652881843e-05, + "loss": 5.1564, + "step": 18019 + }, + { + "epoch": 0.10717004472357028, + "grad_norm": 1.3786426782608032, + "learning_rate": 4.859653223065128e-05, + "loss": 5.1884, + "step": 18020 + }, + { + "epoch": 0.10717599200685127, + "grad_norm": 1.429489016532898, + "learning_rate": 4.859637792424683e-05, + "loss": 5.1556, + "step": 18021 + }, + { + "epoch": 0.10718193929013227, + "grad_norm": 1.3347980976104736, + "learning_rate": 4.859622360960513e-05, + "loss": 5.008, + "step": 18022 + }, + { + "epoch": 0.10718788657341327, + "grad_norm": 1.3850064277648926, + "learning_rate": 4.859606928672623e-05, + "loss": 5.0719, + "step": 18023 + }, + { + "epoch": 0.10719383385669426, + "grad_norm": 1.3279672861099243, + "learning_rate": 4.859591495561019e-05, + "loss": 5.0793, + "step": 18024 + }, + { + "epoch": 0.10719978113997526, + "grad_norm": 1.5108927488327026, + "learning_rate": 4.8595760616257056e-05, + "loss": 5.1067, + "step": 18025 + }, + { + "epoch": 0.10720572842325625, + "grad_norm": 1.2342565059661865, + "learning_rate": 4.859560626866689e-05, + "loss": 5.0298, + "step": 18026 + }, + { + "epoch": 0.10721167570653725, + "grad_norm": 1.2821179628372192, + "learning_rate": 4.859545191283974e-05, + "loss": 5.2185, + "step": 18027 + }, + { + "epoch": 0.10721762298981825, + "grad_norm": 1.11893630027771, + "learning_rate": 4.859529754877566e-05, + "loss": 5.1911, + "step": 18028 + }, + { + "epoch": 0.10722357027309924, + "grad_norm": 1.2202814817428589, + "learning_rate": 4.859514317647471e-05, + "loss": 5.028, + "step": 18029 + }, + { + "epoch": 0.10722951755638024, + "grad_norm": 1.3898543119430542, + "learning_rate": 4.859498879593694e-05, + "loss": 5.4019, + "step": 18030 + }, + { + "epoch": 0.10723546483966125, + "grad_norm": 1.2810478210449219, + "learning_rate": 4.859483440716239e-05, + "loss": 5.0634, + "step": 18031 + }, + { + "epoch": 0.10724141212294223, + "grad_norm": 1.4424680471420288, + "learning_rate": 4.859468001015114e-05, + "loss": 5.0058, + "step": 18032 + }, + { + "epoch": 0.10724735940622324, + "grad_norm": 1.4053739309310913, + "learning_rate": 4.859452560490323e-05, + "loss": 5.0174, + "step": 18033 + }, + { + "epoch": 0.10725330668950424, + "grad_norm": 1.2552763223648071, + "learning_rate": 4.859437119141871e-05, + "loss": 5.0222, + "step": 18034 + }, + { + "epoch": 0.10725925397278523, + "grad_norm": 1.3694052696228027, + "learning_rate": 4.859421676969764e-05, + "loss": 4.9663, + "step": 18035 + }, + { + "epoch": 0.10726520125606623, + "grad_norm": 1.3814043998718262, + "learning_rate": 4.859406233974007e-05, + "loss": 5.01, + "step": 18036 + }, + { + "epoch": 0.10727114853934723, + "grad_norm": 1.5185308456420898, + "learning_rate": 4.859390790154606e-05, + "loss": 4.9698, + "step": 18037 + }, + { + "epoch": 0.10727709582262822, + "grad_norm": 1.2509820461273193, + "learning_rate": 4.859375345511566e-05, + "loss": 5.1034, + "step": 18038 + }, + { + "epoch": 0.10728304310590922, + "grad_norm": 1.3478872776031494, + "learning_rate": 4.8593599000448926e-05, + "loss": 5.2459, + "step": 18039 + }, + { + "epoch": 0.10728899038919022, + "grad_norm": 1.3720686435699463, + "learning_rate": 4.859344453754591e-05, + "loss": 5.1671, + "step": 18040 + }, + { + "epoch": 0.10729493767247121, + "grad_norm": 1.3953602313995361, + "learning_rate": 4.859329006640666e-05, + "loss": 5.3221, + "step": 18041 + }, + { + "epoch": 0.10730088495575221, + "grad_norm": 1.4901010990142822, + "learning_rate": 4.859313558703125e-05, + "loss": 5.1694, + "step": 18042 + }, + { + "epoch": 0.10730683223903321, + "grad_norm": 1.4153228998184204, + "learning_rate": 4.859298109941971e-05, + "loss": 5.2721, + "step": 18043 + }, + { + "epoch": 0.1073127795223142, + "grad_norm": 1.34188711643219, + "learning_rate": 4.859282660357211e-05, + "loss": 5.3048, + "step": 18044 + }, + { + "epoch": 0.1073187268055952, + "grad_norm": 1.355832576751709, + "learning_rate": 4.859267209948849e-05, + "loss": 5.2908, + "step": 18045 + }, + { + "epoch": 0.1073246740888762, + "grad_norm": 1.1551882028579712, + "learning_rate": 4.859251758716891e-05, + "loss": 5.1681, + "step": 18046 + }, + { + "epoch": 0.1073306213721572, + "grad_norm": 1.1728358268737793, + "learning_rate": 4.8592363066613434e-05, + "loss": 5.1535, + "step": 18047 + }, + { + "epoch": 0.1073365686554382, + "grad_norm": 1.4180268049240112, + "learning_rate": 4.859220853782211e-05, + "loss": 4.6467, + "step": 18048 + }, + { + "epoch": 0.1073425159387192, + "grad_norm": 1.4042308330535889, + "learning_rate": 4.8592054000794984e-05, + "loss": 4.7348, + "step": 18049 + }, + { + "epoch": 0.10734846322200019, + "grad_norm": 1.2508533000946045, + "learning_rate": 4.859189945553211e-05, + "loss": 4.7797, + "step": 18050 + }, + { + "epoch": 0.10735441050528119, + "grad_norm": 1.2266274690628052, + "learning_rate": 4.859174490203355e-05, + "loss": 4.7223, + "step": 18051 + }, + { + "epoch": 0.10736035778856219, + "grad_norm": 1.3217378854751587, + "learning_rate": 4.8591590340299366e-05, + "loss": 4.82, + "step": 18052 + }, + { + "epoch": 0.10736630507184318, + "grad_norm": 1.3789056539535522, + "learning_rate": 4.8591435770329594e-05, + "loss": 5.3133, + "step": 18053 + }, + { + "epoch": 0.10737225235512418, + "grad_norm": 1.6090314388275146, + "learning_rate": 4.85912811921243e-05, + "loss": 5.2263, + "step": 18054 + }, + { + "epoch": 0.10737819963840518, + "grad_norm": 1.3780972957611084, + "learning_rate": 4.859112660568353e-05, + "loss": 5.3081, + "step": 18055 + }, + { + "epoch": 0.10738414692168617, + "grad_norm": 1.3518953323364258, + "learning_rate": 4.859097201100734e-05, + "loss": 5.3423, + "step": 18056 + }, + { + "epoch": 0.10739009420496717, + "grad_norm": 1.4160034656524658, + "learning_rate": 4.859081740809579e-05, + "loss": 5.3082, + "step": 18057 + }, + { + "epoch": 0.10739604148824816, + "grad_norm": 1.1970654726028442, + "learning_rate": 4.8590662796948924e-05, + "loss": 5.254, + "step": 18058 + }, + { + "epoch": 0.10740198877152916, + "grad_norm": 1.3175582885742188, + "learning_rate": 4.859050817756681e-05, + "loss": 5.2823, + "step": 18059 + }, + { + "epoch": 0.10740793605481017, + "grad_norm": 1.5136942863464355, + "learning_rate": 4.859035354994948e-05, + "loss": 5.2238, + "step": 18060 + }, + { + "epoch": 0.10741388333809115, + "grad_norm": 1.2552412748336792, + "learning_rate": 4.859019891409701e-05, + "loss": 5.0492, + "step": 18061 + }, + { + "epoch": 0.10741983062137216, + "grad_norm": 1.2873655557632446, + "learning_rate": 4.859004427000945e-05, + "loss": 4.9162, + "step": 18062 + }, + { + "epoch": 0.10742577790465316, + "grad_norm": 1.2441788911819458, + "learning_rate": 4.8589889617686834e-05, + "loss": 4.9769, + "step": 18063 + }, + { + "epoch": 0.10743172518793415, + "grad_norm": 1.4254180192947388, + "learning_rate": 4.8589734957129246e-05, + "loss": 4.9917, + "step": 18064 + }, + { + "epoch": 0.10743767247121515, + "grad_norm": 1.3922675848007202, + "learning_rate": 4.858958028833672e-05, + "loss": 4.9705, + "step": 18065 + }, + { + "epoch": 0.10744361975449615, + "grad_norm": 1.430801510810852, + "learning_rate": 4.858942561130932e-05, + "loss": 5.0772, + "step": 18066 + }, + { + "epoch": 0.10744956703777714, + "grad_norm": 1.3651894330978394, + "learning_rate": 4.8589270926047085e-05, + "loss": 4.8844, + "step": 18067 + }, + { + "epoch": 0.10745551432105814, + "grad_norm": 1.4133042097091675, + "learning_rate": 4.858911623255008e-05, + "loss": 4.9397, + "step": 18068 + }, + { + "epoch": 0.10746146160433914, + "grad_norm": 1.4437615871429443, + "learning_rate": 4.858896153081837e-05, + "loss": 4.9977, + "step": 18069 + }, + { + "epoch": 0.10746740888762013, + "grad_norm": 1.3420813083648682, + "learning_rate": 4.858880682085199e-05, + "loss": 4.9295, + "step": 18070 + }, + { + "epoch": 0.10747335617090113, + "grad_norm": 1.2613091468811035, + "learning_rate": 4.8588652102651e-05, + "loss": 5.3186, + "step": 18071 + }, + { + "epoch": 0.10747930345418213, + "grad_norm": 1.2117836475372314, + "learning_rate": 4.858849737621545e-05, + "loss": 5.207, + "step": 18072 + }, + { + "epoch": 0.10748525073746312, + "grad_norm": 1.3153164386749268, + "learning_rate": 4.85883426415454e-05, + "loss": 4.9786, + "step": 18073 + }, + { + "epoch": 0.10749119802074412, + "grad_norm": 1.2437881231307983, + "learning_rate": 4.858818789864091e-05, + "loss": 4.8748, + "step": 18074 + }, + { + "epoch": 0.10749714530402513, + "grad_norm": 1.2477847337722778, + "learning_rate": 4.858803314750203e-05, + "loss": 4.8874, + "step": 18075 + }, + { + "epoch": 0.10750309258730611, + "grad_norm": 1.342822790145874, + "learning_rate": 4.858787838812881e-05, + "loss": 4.8244, + "step": 18076 + }, + { + "epoch": 0.10750903987058712, + "grad_norm": 1.4947394132614136, + "learning_rate": 4.8587723620521306e-05, + "loss": 4.9091, + "step": 18077 + }, + { + "epoch": 0.10751498715386812, + "grad_norm": 1.388978362083435, + "learning_rate": 4.8587568844679566e-05, + "loss": 4.9075, + "step": 18078 + }, + { + "epoch": 0.10752093443714911, + "grad_norm": 1.5932878255844116, + "learning_rate": 4.8587414060603656e-05, + "loss": 4.8712, + "step": 18079 + }, + { + "epoch": 0.10752688172043011, + "grad_norm": 1.3746308088302612, + "learning_rate": 4.8587259268293616e-05, + "loss": 4.9187, + "step": 18080 + }, + { + "epoch": 0.10753282900371111, + "grad_norm": 1.2811295986175537, + "learning_rate": 4.858710446774951e-05, + "loss": 4.8643, + "step": 18081 + }, + { + "epoch": 0.1075387762869921, + "grad_norm": 1.4154548645019531, + "learning_rate": 4.858694965897139e-05, + "loss": 4.8802, + "step": 18082 + }, + { + "epoch": 0.1075447235702731, + "grad_norm": 1.3216148614883423, + "learning_rate": 4.8586794841959305e-05, + "loss": 5.0356, + "step": 18083 + }, + { + "epoch": 0.1075506708535541, + "grad_norm": 1.0971577167510986, + "learning_rate": 4.858664001671332e-05, + "loss": 5.2085, + "step": 18084 + }, + { + "epoch": 0.10755661813683509, + "grad_norm": 1.3257287740707397, + "learning_rate": 4.858648518323348e-05, + "loss": 5.1728, + "step": 18085 + }, + { + "epoch": 0.1075625654201161, + "grad_norm": 1.2429475784301758, + "learning_rate": 4.858633034151985e-05, + "loss": 5.1053, + "step": 18086 + }, + { + "epoch": 0.10756851270339708, + "grad_norm": 1.1196707487106323, + "learning_rate": 4.858617549157246e-05, + "loss": 5.074, + "step": 18087 + }, + { + "epoch": 0.10757445998667808, + "grad_norm": 1.1981266736984253, + "learning_rate": 4.858602063339139e-05, + "loss": 5.0093, + "step": 18088 + }, + { + "epoch": 0.10758040726995909, + "grad_norm": 1.3818682432174683, + "learning_rate": 4.858586576697668e-05, + "loss": 5.0184, + "step": 18089 + }, + { + "epoch": 0.10758635455324007, + "grad_norm": 1.303539752960205, + "learning_rate": 4.85857108923284e-05, + "loss": 5.1778, + "step": 18090 + }, + { + "epoch": 0.10759230183652108, + "grad_norm": 1.3990812301635742, + "learning_rate": 4.8585556009446576e-05, + "loss": 4.9785, + "step": 18091 + }, + { + "epoch": 0.10759824911980208, + "grad_norm": 1.2507104873657227, + "learning_rate": 4.858540111833129e-05, + "loss": 4.9024, + "step": 18092 + }, + { + "epoch": 0.10760419640308307, + "grad_norm": 1.2867792844772339, + "learning_rate": 4.858524621898257e-05, + "loss": 4.8847, + "step": 18093 + }, + { + "epoch": 0.10761014368636407, + "grad_norm": 1.1816591024398804, + "learning_rate": 4.8585091311400495e-05, + "loss": 4.9431, + "step": 18094 + }, + { + "epoch": 0.10761609096964507, + "grad_norm": 1.292284607887268, + "learning_rate": 4.85849363955851e-05, + "loss": 5.2273, + "step": 18095 + }, + { + "epoch": 0.10762203825292606, + "grad_norm": 1.3242478370666504, + "learning_rate": 4.8584781471536456e-05, + "loss": 5.093, + "step": 18096 + }, + { + "epoch": 0.10762798553620706, + "grad_norm": 1.211534857749939, + "learning_rate": 4.858462653925461e-05, + "loss": 5.0928, + "step": 18097 + }, + { + "epoch": 0.10763393281948806, + "grad_norm": 1.0469262599945068, + "learning_rate": 4.858447159873961e-05, + "loss": 5.0435, + "step": 18098 + }, + { + "epoch": 0.10763988010276905, + "grad_norm": 1.2352322340011597, + "learning_rate": 4.8584316649991514e-05, + "loss": 5.1899, + "step": 18099 + }, + { + "epoch": 0.10764582738605005, + "grad_norm": 1.2135246992111206, + "learning_rate": 4.8584161693010375e-05, + "loss": 5.1028, + "step": 18100 + }, + { + "epoch": 0.10765177466933105, + "grad_norm": 1.3525876998901367, + "learning_rate": 4.858400672779625e-05, + "loss": 5.0422, + "step": 18101 + }, + { + "epoch": 0.10765772195261204, + "grad_norm": 1.3221076726913452, + "learning_rate": 4.85838517543492e-05, + "loss": 5.1329, + "step": 18102 + }, + { + "epoch": 0.10766366923589304, + "grad_norm": 1.4856393337249756, + "learning_rate": 4.858369677266926e-05, + "loss": 4.6795, + "step": 18103 + }, + { + "epoch": 0.10766961651917405, + "grad_norm": 1.4690982103347778, + "learning_rate": 4.8583541782756495e-05, + "loss": 5.1234, + "step": 18104 + }, + { + "epoch": 0.10767556380245503, + "grad_norm": 1.2535064220428467, + "learning_rate": 4.8583386784610964e-05, + "loss": 5.1344, + "step": 18105 + }, + { + "epoch": 0.10768151108573604, + "grad_norm": 1.3537837266921997, + "learning_rate": 4.858323177823272e-05, + "loss": 5.228, + "step": 18106 + }, + { + "epoch": 0.10768745836901704, + "grad_norm": 1.2927895784378052, + "learning_rate": 4.8583076763621805e-05, + "loss": 5.2371, + "step": 18107 + }, + { + "epoch": 0.10769340565229803, + "grad_norm": 1.2356709241867065, + "learning_rate": 4.8582921740778284e-05, + "loss": 4.9056, + "step": 18108 + }, + { + "epoch": 0.10769935293557903, + "grad_norm": 1.266918420791626, + "learning_rate": 4.858276670970221e-05, + "loss": 5.2142, + "step": 18109 + }, + { + "epoch": 0.10770530021886003, + "grad_norm": 1.1703591346740723, + "learning_rate": 4.858261167039364e-05, + "loss": 5.1237, + "step": 18110 + }, + { + "epoch": 0.10771124750214102, + "grad_norm": 1.2324700355529785, + "learning_rate": 4.858245662285262e-05, + "loss": 5.1391, + "step": 18111 + }, + { + "epoch": 0.10771719478542202, + "grad_norm": 1.2764140367507935, + "learning_rate": 4.85823015670792e-05, + "loss": 5.1368, + "step": 18112 + }, + { + "epoch": 0.10772314206870302, + "grad_norm": 1.254909634590149, + "learning_rate": 4.8582146503073456e-05, + "loss": 5.002, + "step": 18113 + }, + { + "epoch": 0.10772908935198401, + "grad_norm": 1.3368279933929443, + "learning_rate": 4.858199143083542e-05, + "loss": 5.1365, + "step": 18114 + }, + { + "epoch": 0.10773503663526501, + "grad_norm": 1.3550091981887817, + "learning_rate": 4.8581836350365165e-05, + "loss": 5.1722, + "step": 18115 + }, + { + "epoch": 0.107740983918546, + "grad_norm": 1.6306661367416382, + "learning_rate": 4.858168126166272e-05, + "loss": 5.0883, + "step": 18116 + }, + { + "epoch": 0.107746931201827, + "grad_norm": 1.5143946409225464, + "learning_rate": 4.858152616472816e-05, + "loss": 5.1258, + "step": 18117 + }, + { + "epoch": 0.107752878485108, + "grad_norm": 1.6553763151168823, + "learning_rate": 4.858137105956153e-05, + "loss": 4.9596, + "step": 18118 + }, + { + "epoch": 0.107758825768389, + "grad_norm": 1.920473337173462, + "learning_rate": 4.8581215946162896e-05, + "loss": 5.2206, + "step": 18119 + }, + { + "epoch": 0.10776477305167, + "grad_norm": 1.8482425212860107, + "learning_rate": 4.85810608245323e-05, + "loss": 5.1515, + "step": 18120 + }, + { + "epoch": 0.107770720334951, + "grad_norm": 1.6005665063858032, + "learning_rate": 4.8580905694669794e-05, + "loss": 5.1383, + "step": 18121 + }, + { + "epoch": 0.10777666761823199, + "grad_norm": 1.2169783115386963, + "learning_rate": 4.858075055657544e-05, + "loss": 5.3538, + "step": 18122 + }, + { + "epoch": 0.10778261490151299, + "grad_norm": 1.3251442909240723, + "learning_rate": 4.858059541024929e-05, + "loss": 5.3116, + "step": 18123 + }, + { + "epoch": 0.10778856218479399, + "grad_norm": 1.2065789699554443, + "learning_rate": 4.858044025569139e-05, + "loss": 5.2334, + "step": 18124 + }, + { + "epoch": 0.10779450946807498, + "grad_norm": 1.5847411155700684, + "learning_rate": 4.858028509290181e-05, + "loss": 4.9114, + "step": 18125 + }, + { + "epoch": 0.10780045675135598, + "grad_norm": 1.373826503753662, + "learning_rate": 4.85801299218806e-05, + "loss": 5.0748, + "step": 18126 + }, + { + "epoch": 0.10780640403463698, + "grad_norm": 1.7349494695663452, + "learning_rate": 4.85799747426278e-05, + "loss": 5.0888, + "step": 18127 + }, + { + "epoch": 0.10781235131791797, + "grad_norm": 1.3385915756225586, + "learning_rate": 4.857981955514349e-05, + "loss": 5.1472, + "step": 18128 + }, + { + "epoch": 0.10781829860119897, + "grad_norm": 1.3666753768920898, + "learning_rate": 4.857966435942769e-05, + "loss": 5.0881, + "step": 18129 + }, + { + "epoch": 0.10782424588447997, + "grad_norm": 1.39078688621521, + "learning_rate": 4.857950915548048e-05, + "loss": 5.3867, + "step": 18130 + }, + { + "epoch": 0.10783019316776096, + "grad_norm": 1.4484905004501343, + "learning_rate": 4.857935394330192e-05, + "loss": 5.0516, + "step": 18131 + }, + { + "epoch": 0.10783614045104196, + "grad_norm": 1.526084542274475, + "learning_rate": 4.8579198722892034e-05, + "loss": 5.0424, + "step": 18132 + }, + { + "epoch": 0.10784208773432297, + "grad_norm": 1.4617003202438354, + "learning_rate": 4.8579043494250895e-05, + "loss": 5.0245, + "step": 18133 + }, + { + "epoch": 0.10784803501760395, + "grad_norm": 1.3335559368133545, + "learning_rate": 4.857888825737856e-05, + "loss": 4.9398, + "step": 18134 + }, + { + "epoch": 0.10785398230088496, + "grad_norm": 1.1473711729049683, + "learning_rate": 4.857873301227508e-05, + "loss": 5.1818, + "step": 18135 + }, + { + "epoch": 0.10785992958416596, + "grad_norm": 1.5986409187316895, + "learning_rate": 4.8578577758940504e-05, + "loss": 5.3518, + "step": 18136 + }, + { + "epoch": 0.10786587686744695, + "grad_norm": 1.6430408954620361, + "learning_rate": 4.857842249737489e-05, + "loss": 5.3052, + "step": 18137 + }, + { + "epoch": 0.10787182415072795, + "grad_norm": 1.5069605112075806, + "learning_rate": 4.8578267227578303e-05, + "loss": 5.3491, + "step": 18138 + }, + { + "epoch": 0.10787777143400895, + "grad_norm": 1.3385566473007202, + "learning_rate": 4.857811194955077e-05, + "loss": 5.3864, + "step": 18139 + }, + { + "epoch": 0.10788371871728994, + "grad_norm": 1.1956936120986938, + "learning_rate": 4.857795666329237e-05, + "loss": 5.1304, + "step": 18140 + }, + { + "epoch": 0.10788966600057094, + "grad_norm": 1.3437196016311646, + "learning_rate": 4.857780136880315e-05, + "loss": 5.1872, + "step": 18141 + }, + { + "epoch": 0.10789561328385194, + "grad_norm": 1.4649217128753662, + "learning_rate": 4.857764606608316e-05, + "loss": 5.4178, + "step": 18142 + }, + { + "epoch": 0.10790156056713293, + "grad_norm": 1.2196028232574463, + "learning_rate": 4.857749075513246e-05, + "loss": 5.1782, + "step": 18143 + }, + { + "epoch": 0.10790750785041393, + "grad_norm": 1.2016780376434326, + "learning_rate": 4.8577335435951096e-05, + "loss": 5.2293, + "step": 18144 + }, + { + "epoch": 0.10791345513369492, + "grad_norm": 1.3034183979034424, + "learning_rate": 4.857718010853914e-05, + "loss": 5.2886, + "step": 18145 + }, + { + "epoch": 0.10791940241697592, + "grad_norm": 1.1815390586853027, + "learning_rate": 4.857702477289663e-05, + "loss": 5.2637, + "step": 18146 + }, + { + "epoch": 0.10792534970025693, + "grad_norm": 1.328203558921814, + "learning_rate": 4.857686942902362e-05, + "loss": 5.3154, + "step": 18147 + }, + { + "epoch": 0.10793129698353791, + "grad_norm": 1.2995961904525757, + "learning_rate": 4.857671407692016e-05, + "loss": 5.3313, + "step": 18148 + }, + { + "epoch": 0.10793724426681892, + "grad_norm": 1.181191325187683, + "learning_rate": 4.8576558716586326e-05, + "loss": 5.2589, + "step": 18149 + }, + { + "epoch": 0.10794319155009992, + "grad_norm": 1.266570806503296, + "learning_rate": 4.8576403348022154e-05, + "loss": 5.1694, + "step": 18150 + }, + { + "epoch": 0.1079491388333809, + "grad_norm": 1.4107643365859985, + "learning_rate": 4.857624797122771e-05, + "loss": 5.1784, + "step": 18151 + }, + { + "epoch": 0.10795508611666191, + "grad_norm": 1.1809200048446655, + "learning_rate": 4.8576092586203024e-05, + "loss": 5.3081, + "step": 18152 + }, + { + "epoch": 0.10796103339994291, + "grad_norm": 1.179453730583191, + "learning_rate": 4.857593719294818e-05, + "loss": 5.2534, + "step": 18153 + }, + { + "epoch": 0.1079669806832239, + "grad_norm": 1.3677690029144287, + "learning_rate": 4.857578179146323e-05, + "loss": 5.4021, + "step": 18154 + }, + { + "epoch": 0.1079729279665049, + "grad_norm": 1.3077856302261353, + "learning_rate": 4.8575626381748196e-05, + "loss": 5.1766, + "step": 18155 + }, + { + "epoch": 0.1079788752497859, + "grad_norm": 1.075791835784912, + "learning_rate": 4.857547096380317e-05, + "loss": 5.163, + "step": 18156 + }, + { + "epoch": 0.10798482253306689, + "grad_norm": 1.2855931520462036, + "learning_rate": 4.8575315537628186e-05, + "loss": 5.157, + "step": 18157 + }, + { + "epoch": 0.10799076981634789, + "grad_norm": 1.1961009502410889, + "learning_rate": 4.8575160103223303e-05, + "loss": 5.1632, + "step": 18158 + }, + { + "epoch": 0.1079967170996289, + "grad_norm": 1.6419997215270996, + "learning_rate": 4.8575004660588574e-05, + "loss": 5.1575, + "step": 18159 + }, + { + "epoch": 0.10800266438290988, + "grad_norm": 1.5928575992584229, + "learning_rate": 4.857484920972405e-05, + "loss": 5.0818, + "step": 18160 + }, + { + "epoch": 0.10800861166619088, + "grad_norm": 1.3492580652236938, + "learning_rate": 4.85746937506298e-05, + "loss": 5.1529, + "step": 18161 + }, + { + "epoch": 0.10801455894947189, + "grad_norm": 1.543717861175537, + "learning_rate": 4.857453828330587e-05, + "loss": 5.6192, + "step": 18162 + }, + { + "epoch": 0.10802050623275287, + "grad_norm": 1.5657880306243896, + "learning_rate": 4.85743828077523e-05, + "loss": 5.6619, + "step": 18163 + }, + { + "epoch": 0.10802645351603388, + "grad_norm": 1.3861533403396606, + "learning_rate": 4.8574227323969164e-05, + "loss": 5.2147, + "step": 18164 + }, + { + "epoch": 0.10803240079931488, + "grad_norm": 1.3780323266983032, + "learning_rate": 4.85740718319565e-05, + "loss": 5.1112, + "step": 18165 + }, + { + "epoch": 0.10803834808259587, + "grad_norm": 1.5768086910247803, + "learning_rate": 4.857391633171438e-05, + "loss": 5.011, + "step": 18166 + }, + { + "epoch": 0.10804429536587687, + "grad_norm": 1.4504894018173218, + "learning_rate": 4.857376082324285e-05, + "loss": 4.9349, + "step": 18167 + }, + { + "epoch": 0.10805024264915787, + "grad_norm": 1.5084949731826782, + "learning_rate": 4.857360530654196e-05, + "loss": 4.9861, + "step": 18168 + }, + { + "epoch": 0.10805618993243886, + "grad_norm": 1.4052237272262573, + "learning_rate": 4.857344978161177e-05, + "loss": 5.0447, + "step": 18169 + }, + { + "epoch": 0.10806213721571986, + "grad_norm": 1.5666663646697998, + "learning_rate": 4.857329424845233e-05, + "loss": 5.3537, + "step": 18170 + }, + { + "epoch": 0.10806808449900086, + "grad_norm": 1.251293420791626, + "learning_rate": 4.8573138707063695e-05, + "loss": 5.0139, + "step": 18171 + }, + { + "epoch": 0.10807403178228185, + "grad_norm": 1.2570216655731201, + "learning_rate": 4.8572983157445926e-05, + "loss": 4.9959, + "step": 18172 + }, + { + "epoch": 0.10807997906556285, + "grad_norm": 1.5116729736328125, + "learning_rate": 4.857282759959907e-05, + "loss": 5.1592, + "step": 18173 + }, + { + "epoch": 0.10808592634884384, + "grad_norm": 1.518898367881775, + "learning_rate": 4.857267203352318e-05, + "loss": 5.3541, + "step": 18174 + }, + { + "epoch": 0.10809187363212484, + "grad_norm": 1.314247965812683, + "learning_rate": 4.857251645921832e-05, + "loss": 5.2249, + "step": 18175 + }, + { + "epoch": 0.10809782091540585, + "grad_norm": 1.378150224685669, + "learning_rate": 4.857236087668453e-05, + "loss": 5.0004, + "step": 18176 + }, + { + "epoch": 0.10810376819868683, + "grad_norm": 1.4453868865966797, + "learning_rate": 4.8572205285921876e-05, + "loss": 5.2717, + "step": 18177 + }, + { + "epoch": 0.10810971548196784, + "grad_norm": 1.3493587970733643, + "learning_rate": 4.857204968693041e-05, + "loss": 5.4044, + "step": 18178 + }, + { + "epoch": 0.10811566276524884, + "grad_norm": 1.3819094896316528, + "learning_rate": 4.857189407971019e-05, + "loss": 5.0641, + "step": 18179 + }, + { + "epoch": 0.10812161004852983, + "grad_norm": 1.337969422340393, + "learning_rate": 4.857173846426126e-05, + "loss": 4.9078, + "step": 18180 + }, + { + "epoch": 0.10812755733181083, + "grad_norm": 1.655778408050537, + "learning_rate": 4.857158284058367e-05, + "loss": 4.9192, + "step": 18181 + }, + { + "epoch": 0.10813350461509183, + "grad_norm": 1.3867977857589722, + "learning_rate": 4.85714272086775e-05, + "loss": 4.86, + "step": 18182 + }, + { + "epoch": 0.10813945189837282, + "grad_norm": 1.5444231033325195, + "learning_rate": 4.8571271568542786e-05, + "loss": 4.9745, + "step": 18183 + }, + { + "epoch": 0.10814539918165382, + "grad_norm": 1.470123052597046, + "learning_rate": 4.8571115920179576e-05, + "loss": 5.1311, + "step": 18184 + }, + { + "epoch": 0.10815134646493482, + "grad_norm": 1.3052124977111816, + "learning_rate": 4.8570960263587936e-05, + "loss": 5.0657, + "step": 18185 + }, + { + "epoch": 0.10815729374821581, + "grad_norm": 1.4197286367416382, + "learning_rate": 4.857080459876792e-05, + "loss": 5.0798, + "step": 18186 + }, + { + "epoch": 0.10816324103149681, + "grad_norm": 1.5119234323501587, + "learning_rate": 4.857064892571958e-05, + "loss": 5.2842, + "step": 18187 + }, + { + "epoch": 0.10816918831477781, + "grad_norm": 1.6037629842758179, + "learning_rate": 4.8570493244442974e-05, + "loss": 4.8785, + "step": 18188 + }, + { + "epoch": 0.1081751355980588, + "grad_norm": 1.6456643342971802, + "learning_rate": 4.857033755493814e-05, + "loss": 5.2566, + "step": 18189 + }, + { + "epoch": 0.1081810828813398, + "grad_norm": 1.5777020454406738, + "learning_rate": 4.8570181857205155e-05, + "loss": 4.9856, + "step": 18190 + }, + { + "epoch": 0.1081870301646208, + "grad_norm": 1.6042171716690063, + "learning_rate": 4.857002615124405e-05, + "loss": 4.9179, + "step": 18191 + }, + { + "epoch": 0.1081929774479018, + "grad_norm": 1.2339718341827393, + "learning_rate": 4.856987043705491e-05, + "loss": 4.9144, + "step": 18192 + }, + { + "epoch": 0.1081989247311828, + "grad_norm": 1.4531115293502808, + "learning_rate": 4.856971471463776e-05, + "loss": 5.0296, + "step": 18193 + }, + { + "epoch": 0.1082048720144638, + "grad_norm": 1.4179781675338745, + "learning_rate": 4.856955898399267e-05, + "loss": 5.268, + "step": 18194 + }, + { + "epoch": 0.10821081929774479, + "grad_norm": 1.5291078090667725, + "learning_rate": 4.856940324511969e-05, + "loss": 5.2433, + "step": 18195 + }, + { + "epoch": 0.10821676658102579, + "grad_norm": 1.5799169540405273, + "learning_rate": 4.856924749801888e-05, + "loss": 5.1906, + "step": 18196 + }, + { + "epoch": 0.10822271386430679, + "grad_norm": 1.4068591594696045, + "learning_rate": 4.8569091742690276e-05, + "loss": 5.2152, + "step": 18197 + }, + { + "epoch": 0.10822866114758778, + "grad_norm": 1.3728901147842407, + "learning_rate": 4.8568935979133953e-05, + "loss": 5.1717, + "step": 18198 + }, + { + "epoch": 0.10823460843086878, + "grad_norm": 1.524344563484192, + "learning_rate": 4.856878020734996e-05, + "loss": 5.0635, + "step": 18199 + }, + { + "epoch": 0.10824055571414978, + "grad_norm": 1.4725397825241089, + "learning_rate": 4.856862442733835e-05, + "loss": 5.2382, + "step": 18200 + }, + { + "epoch": 0.10824650299743077, + "grad_norm": 1.3467813730239868, + "learning_rate": 4.856846863909917e-05, + "loss": 5.0823, + "step": 18201 + }, + { + "epoch": 0.10825245028071177, + "grad_norm": 1.264833927154541, + "learning_rate": 4.856831284263249e-05, + "loss": 5.1763, + "step": 18202 + }, + { + "epoch": 0.10825839756399276, + "grad_norm": 1.2883045673370361, + "learning_rate": 4.856815703793836e-05, + "loss": 5.1207, + "step": 18203 + }, + { + "epoch": 0.10826434484727376, + "grad_norm": 1.309486746788025, + "learning_rate": 4.856800122501681e-05, + "loss": 5.0648, + "step": 18204 + }, + { + "epoch": 0.10827029213055477, + "grad_norm": 1.4473057985305786, + "learning_rate": 4.856784540386793e-05, + "loss": 4.9615, + "step": 18205 + }, + { + "epoch": 0.10827623941383575, + "grad_norm": 1.5151125192642212, + "learning_rate": 4.856768957449175e-05, + "loss": 5.2847, + "step": 18206 + }, + { + "epoch": 0.10828218669711676, + "grad_norm": 1.4859318733215332, + "learning_rate": 4.8567533736888336e-05, + "loss": 4.931, + "step": 18207 + }, + { + "epoch": 0.10828813398039776, + "grad_norm": 1.6516517400741577, + "learning_rate": 4.8567377891057745e-05, + "loss": 5.05, + "step": 18208 + }, + { + "epoch": 0.10829408126367875, + "grad_norm": 1.679347276687622, + "learning_rate": 4.8567222037000024e-05, + "loss": 5.2281, + "step": 18209 + }, + { + "epoch": 0.10830002854695975, + "grad_norm": 1.5119515657424927, + "learning_rate": 4.856706617471523e-05, + "loss": 4.9572, + "step": 18210 + }, + { + "epoch": 0.10830597583024075, + "grad_norm": 1.6819381713867188, + "learning_rate": 4.8566910304203404e-05, + "loss": 4.6228, + "step": 18211 + }, + { + "epoch": 0.10831192311352174, + "grad_norm": 1.7754294872283936, + "learning_rate": 4.856675442546462e-05, + "loss": 4.6851, + "step": 18212 + }, + { + "epoch": 0.10831787039680274, + "grad_norm": 1.455660343170166, + "learning_rate": 4.856659853849893e-05, + "loss": 5.059, + "step": 18213 + }, + { + "epoch": 0.10832381768008374, + "grad_norm": 1.358823299407959, + "learning_rate": 4.856644264330639e-05, + "loss": 5.0354, + "step": 18214 + }, + { + "epoch": 0.10832976496336473, + "grad_norm": 1.465482473373413, + "learning_rate": 4.856628673988703e-05, + "loss": 5.0441, + "step": 18215 + }, + { + "epoch": 0.10833571224664573, + "grad_norm": 1.3863260746002197, + "learning_rate": 4.8566130828240936e-05, + "loss": 5.0445, + "step": 18216 + }, + { + "epoch": 0.10834165952992673, + "grad_norm": 1.556997299194336, + "learning_rate": 4.856597490836815e-05, + "loss": 5.0629, + "step": 18217 + }, + { + "epoch": 0.10834760681320772, + "grad_norm": 1.3784066438674927, + "learning_rate": 4.856581898026872e-05, + "loss": 5.1894, + "step": 18218 + }, + { + "epoch": 0.10835355409648872, + "grad_norm": 1.4675719738006592, + "learning_rate": 4.856566304394271e-05, + "loss": 5.008, + "step": 18219 + }, + { + "epoch": 0.10835950137976973, + "grad_norm": 1.634920597076416, + "learning_rate": 4.856550709939016e-05, + "loss": 4.7707, + "step": 18220 + }, + { + "epoch": 0.10836544866305071, + "grad_norm": 1.83092200756073, + "learning_rate": 4.856535114661115e-05, + "loss": 4.8947, + "step": 18221 + }, + { + "epoch": 0.10837139594633172, + "grad_norm": 1.497359037399292, + "learning_rate": 4.856519518560571e-05, + "loss": 4.9656, + "step": 18222 + }, + { + "epoch": 0.10837734322961272, + "grad_norm": 1.3194255828857422, + "learning_rate": 4.856503921637391e-05, + "loss": 5.2374, + "step": 18223 + }, + { + "epoch": 0.1083832905128937, + "grad_norm": 1.3584619760513306, + "learning_rate": 4.8564883238915794e-05, + "loss": 5.1154, + "step": 18224 + }, + { + "epoch": 0.10838923779617471, + "grad_norm": 1.4173928499221802, + "learning_rate": 4.8564727253231416e-05, + "loss": 5.173, + "step": 18225 + }, + { + "epoch": 0.10839518507945571, + "grad_norm": 1.4110074043273926, + "learning_rate": 4.8564571259320844e-05, + "loss": 5.2409, + "step": 18226 + }, + { + "epoch": 0.1084011323627367, + "grad_norm": 1.4481827020645142, + "learning_rate": 4.856441525718412e-05, + "loss": 4.8533, + "step": 18227 + }, + { + "epoch": 0.1084070796460177, + "grad_norm": 1.4017881155014038, + "learning_rate": 4.85642592468213e-05, + "loss": 5.0483, + "step": 18228 + }, + { + "epoch": 0.1084130269292987, + "grad_norm": 1.3940458297729492, + "learning_rate": 4.8564103228232445e-05, + "loss": 5.0983, + "step": 18229 + }, + { + "epoch": 0.10841897421257969, + "grad_norm": 1.4414485692977905, + "learning_rate": 4.8563947201417604e-05, + "loss": 5.1561, + "step": 18230 + }, + { + "epoch": 0.1084249214958607, + "grad_norm": 1.3622056245803833, + "learning_rate": 4.856379116637683e-05, + "loss": 5.1773, + "step": 18231 + }, + { + "epoch": 0.10843086877914168, + "grad_norm": 1.3298035860061646, + "learning_rate": 4.856363512311019e-05, + "loss": 5.0742, + "step": 18232 + }, + { + "epoch": 0.10843681606242268, + "grad_norm": 1.3110575675964355, + "learning_rate": 4.856347907161771e-05, + "loss": 5.044, + "step": 18233 + }, + { + "epoch": 0.10844276334570369, + "grad_norm": 1.309591293334961, + "learning_rate": 4.856332301189948e-05, + "loss": 5.1313, + "step": 18234 + }, + { + "epoch": 0.10844871062898467, + "grad_norm": 1.2283830642700195, + "learning_rate": 4.856316694395552e-05, + "loss": 5.0777, + "step": 18235 + }, + { + "epoch": 0.10845465791226568, + "grad_norm": 1.1523172855377197, + "learning_rate": 4.856301086778592e-05, + "loss": 5.1245, + "step": 18236 + }, + { + "epoch": 0.10846060519554668, + "grad_norm": 1.3058217763900757, + "learning_rate": 4.85628547833907e-05, + "loss": 4.9649, + "step": 18237 + }, + { + "epoch": 0.10846655247882767, + "grad_norm": 1.239734172821045, + "learning_rate": 4.856269869076994e-05, + "loss": 5.0736, + "step": 18238 + }, + { + "epoch": 0.10847249976210867, + "grad_norm": 1.2624062299728394, + "learning_rate": 4.856254258992369e-05, + "loss": 5.0538, + "step": 18239 + }, + { + "epoch": 0.10847844704538967, + "grad_norm": 1.2172342538833618, + "learning_rate": 4.856238648085199e-05, + "loss": 5.0781, + "step": 18240 + }, + { + "epoch": 0.10848439432867066, + "grad_norm": 1.2534043788909912, + "learning_rate": 4.8562230363554906e-05, + "loss": 5.2148, + "step": 18241 + }, + { + "epoch": 0.10849034161195166, + "grad_norm": 1.3765602111816406, + "learning_rate": 4.85620742380325e-05, + "loss": 5.1274, + "step": 18242 + }, + { + "epoch": 0.10849628889523266, + "grad_norm": 1.4610897302627563, + "learning_rate": 4.856191810428481e-05, + "loss": 5.0356, + "step": 18243 + }, + { + "epoch": 0.10850223617851365, + "grad_norm": 1.4103399515151978, + "learning_rate": 4.8561761962311895e-05, + "loss": 5.0198, + "step": 18244 + }, + { + "epoch": 0.10850818346179465, + "grad_norm": 1.5159040689468384, + "learning_rate": 4.856160581211382e-05, + "loss": 5.0139, + "step": 18245 + }, + { + "epoch": 0.10851413074507565, + "grad_norm": 1.5071041584014893, + "learning_rate": 4.856144965369063e-05, + "loss": 4.9644, + "step": 18246 + }, + { + "epoch": 0.10852007802835664, + "grad_norm": 1.4504464864730835, + "learning_rate": 4.856129348704237e-05, + "loss": 5.041, + "step": 18247 + }, + { + "epoch": 0.10852602531163764, + "grad_norm": 1.2327022552490234, + "learning_rate": 4.856113731216911e-05, + "loss": 4.9775, + "step": 18248 + }, + { + "epoch": 0.10853197259491865, + "grad_norm": 2.013401508331299, + "learning_rate": 4.8560981129070914e-05, + "loss": 4.5814, + "step": 18249 + }, + { + "epoch": 0.10853791987819963, + "grad_norm": 1.7224215269088745, + "learning_rate": 4.8560824937747814e-05, + "loss": 5.3439, + "step": 18250 + }, + { + "epoch": 0.10854386716148064, + "grad_norm": 1.6198631525039673, + "learning_rate": 4.856066873819987e-05, + "loss": 5.0878, + "step": 18251 + }, + { + "epoch": 0.10854981444476164, + "grad_norm": 1.3257763385772705, + "learning_rate": 4.8560512530427146e-05, + "loss": 5.4697, + "step": 18252 + }, + { + "epoch": 0.10855576172804263, + "grad_norm": 1.6341005563735962, + "learning_rate": 4.856035631442969e-05, + "loss": 5.1383, + "step": 18253 + }, + { + "epoch": 0.10856170901132363, + "grad_norm": 1.4148058891296387, + "learning_rate": 4.8560200090207555e-05, + "loss": 5.3053, + "step": 18254 + }, + { + "epoch": 0.10856765629460463, + "grad_norm": 1.4810155630111694, + "learning_rate": 4.8560043857760796e-05, + "loss": 5.1222, + "step": 18255 + }, + { + "epoch": 0.10857360357788562, + "grad_norm": 1.4345650672912598, + "learning_rate": 4.8559887617089476e-05, + "loss": 5.2331, + "step": 18256 + }, + { + "epoch": 0.10857955086116662, + "grad_norm": 1.7319680452346802, + "learning_rate": 4.855973136819363e-05, + "loss": 4.6762, + "step": 18257 + }, + { + "epoch": 0.10858549814444762, + "grad_norm": 1.3632503747940063, + "learning_rate": 4.855957511107333e-05, + "loss": 4.8047, + "step": 18258 + }, + { + "epoch": 0.10859144542772861, + "grad_norm": 1.2798017263412476, + "learning_rate": 4.8559418845728636e-05, + "loss": 4.9368, + "step": 18259 + }, + { + "epoch": 0.10859739271100961, + "grad_norm": 1.539689540863037, + "learning_rate": 4.855926257215958e-05, + "loss": 4.8178, + "step": 18260 + }, + { + "epoch": 0.1086033399942906, + "grad_norm": 1.2351077795028687, + "learning_rate": 4.855910629036623e-05, + "loss": 5.0983, + "step": 18261 + }, + { + "epoch": 0.1086092872775716, + "grad_norm": 1.582154393196106, + "learning_rate": 4.855895000034865e-05, + "loss": 5.0563, + "step": 18262 + }, + { + "epoch": 0.1086152345608526, + "grad_norm": 1.3505899906158447, + "learning_rate": 4.855879370210688e-05, + "loss": 5.4024, + "step": 18263 + }, + { + "epoch": 0.1086211818441336, + "grad_norm": 1.236626148223877, + "learning_rate": 4.855863739564097e-05, + "loss": 5.4412, + "step": 18264 + }, + { + "epoch": 0.1086271291274146, + "grad_norm": 1.1207302808761597, + "learning_rate": 4.855848108095099e-05, + "loss": 5.3498, + "step": 18265 + }, + { + "epoch": 0.1086330764106956, + "grad_norm": 1.3238142728805542, + "learning_rate": 4.855832475803698e-05, + "loss": 4.9028, + "step": 18266 + }, + { + "epoch": 0.10863902369397659, + "grad_norm": 1.4837650060653687, + "learning_rate": 4.8558168426899006e-05, + "loss": 5.354, + "step": 18267 + }, + { + "epoch": 0.10864497097725759, + "grad_norm": 1.55657160282135, + "learning_rate": 4.8558012087537126e-05, + "loss": 5.4629, + "step": 18268 + }, + { + "epoch": 0.10865091826053859, + "grad_norm": 1.4918092489242554, + "learning_rate": 4.855785573995138e-05, + "loss": 5.046, + "step": 18269 + }, + { + "epoch": 0.10865686554381958, + "grad_norm": 1.5374544858932495, + "learning_rate": 4.855769938414183e-05, + "loss": 4.9571, + "step": 18270 + }, + { + "epoch": 0.10866281282710058, + "grad_norm": 1.360386610031128, + "learning_rate": 4.8557543020108537e-05, + "loss": 4.9482, + "step": 18271 + }, + { + "epoch": 0.10866876011038158, + "grad_norm": 1.2835793495178223, + "learning_rate": 4.855738664785154e-05, + "loss": 4.8301, + "step": 18272 + }, + { + "epoch": 0.10867470739366257, + "grad_norm": 1.453478217124939, + "learning_rate": 4.8557230267370915e-05, + "loss": 4.7873, + "step": 18273 + }, + { + "epoch": 0.10868065467694357, + "grad_norm": 1.4986752271652222, + "learning_rate": 4.855707387866669e-05, + "loss": 5.4533, + "step": 18274 + }, + { + "epoch": 0.10868660196022457, + "grad_norm": 1.574263572692871, + "learning_rate": 4.855691748173894e-05, + "loss": 5.0576, + "step": 18275 + }, + { + "epoch": 0.10869254924350556, + "grad_norm": 1.6014435291290283, + "learning_rate": 4.855676107658772e-05, + "loss": 4.8039, + "step": 18276 + }, + { + "epoch": 0.10869849652678656, + "grad_norm": 1.3822481632232666, + "learning_rate": 4.855660466321307e-05, + "loss": 4.9241, + "step": 18277 + }, + { + "epoch": 0.10870444381006757, + "grad_norm": 1.3199692964553833, + "learning_rate": 4.855644824161506e-05, + "loss": 4.842, + "step": 18278 + }, + { + "epoch": 0.10871039109334855, + "grad_norm": 1.340505599975586, + "learning_rate": 4.855629181179373e-05, + "loss": 4.8217, + "step": 18279 + }, + { + "epoch": 0.10871633837662956, + "grad_norm": 1.32645845413208, + "learning_rate": 4.8556135373749144e-05, + "loss": 4.9701, + "step": 18280 + }, + { + "epoch": 0.10872228565991056, + "grad_norm": 1.3629400730133057, + "learning_rate": 4.855597892748135e-05, + "loss": 5.2129, + "step": 18281 + }, + { + "epoch": 0.10872823294319155, + "grad_norm": 1.504604458808899, + "learning_rate": 4.8555822472990415e-05, + "loss": 4.988, + "step": 18282 + }, + { + "epoch": 0.10873418022647255, + "grad_norm": 1.514352560043335, + "learning_rate": 4.855566601027638e-05, + "loss": 4.8909, + "step": 18283 + }, + { + "epoch": 0.10874012750975355, + "grad_norm": 1.35514235496521, + "learning_rate": 4.85555095393393e-05, + "loss": 4.9441, + "step": 18284 + }, + { + "epoch": 0.10874607479303454, + "grad_norm": 1.1690728664398193, + "learning_rate": 4.8555353060179256e-05, + "loss": 5.3733, + "step": 18285 + }, + { + "epoch": 0.10875202207631554, + "grad_norm": 1.3280658721923828, + "learning_rate": 4.855519657279626e-05, + "loss": 5.4406, + "step": 18286 + }, + { + "epoch": 0.10875796935959654, + "grad_norm": 1.5852582454681396, + "learning_rate": 4.85550400771904e-05, + "loss": 5.176, + "step": 18287 + }, + { + "epoch": 0.10876391664287753, + "grad_norm": 1.233869194984436, + "learning_rate": 4.855488357336172e-05, + "loss": 5.2879, + "step": 18288 + }, + { + "epoch": 0.10876986392615853, + "grad_norm": 1.365251064300537, + "learning_rate": 4.855472706131027e-05, + "loss": 5.1592, + "step": 18289 + }, + { + "epoch": 0.10877581120943952, + "grad_norm": 1.6119641065597534, + "learning_rate": 4.8554570541036104e-05, + "loss": 5.0079, + "step": 18290 + }, + { + "epoch": 0.10878175849272052, + "grad_norm": 1.3233095407485962, + "learning_rate": 4.855441401253928e-05, + "loss": 5.3579, + "step": 18291 + }, + { + "epoch": 0.10878770577600153, + "grad_norm": 1.3345812559127808, + "learning_rate": 4.855425747581986e-05, + "loss": 5.1435, + "step": 18292 + }, + { + "epoch": 0.10879365305928251, + "grad_norm": 1.6694916486740112, + "learning_rate": 4.855410093087789e-05, + "loss": 5.0007, + "step": 18293 + }, + { + "epoch": 0.10879960034256352, + "grad_norm": 1.5835634469985962, + "learning_rate": 4.855394437771342e-05, + "loss": 4.9706, + "step": 18294 + }, + { + "epoch": 0.10880554762584452, + "grad_norm": 1.5465360879898071, + "learning_rate": 4.8553787816326526e-05, + "loss": 4.8983, + "step": 18295 + }, + { + "epoch": 0.1088114949091255, + "grad_norm": 1.4393326044082642, + "learning_rate": 4.855363124671723e-05, + "loss": 4.9365, + "step": 18296 + }, + { + "epoch": 0.10881744219240651, + "grad_norm": 1.5096935033798218, + "learning_rate": 4.8553474668885626e-05, + "loss": 4.8343, + "step": 18297 + }, + { + "epoch": 0.10882338947568751, + "grad_norm": 1.422397255897522, + "learning_rate": 4.8553318082831735e-05, + "loss": 4.9229, + "step": 18298 + }, + { + "epoch": 0.1088293367589685, + "grad_norm": 1.6444910764694214, + "learning_rate": 4.855316148855562e-05, + "loss": 5.0403, + "step": 18299 + }, + { + "epoch": 0.1088352840422495, + "grad_norm": 1.3621931076049805, + "learning_rate": 4.855300488605734e-05, + "loss": 4.9027, + "step": 18300 + }, + { + "epoch": 0.1088412313255305, + "grad_norm": 1.5086915493011475, + "learning_rate": 4.855284827533696e-05, + "loss": 4.95, + "step": 18301 + }, + { + "epoch": 0.10884717860881149, + "grad_norm": 1.7021756172180176, + "learning_rate": 4.855269165639451e-05, + "loss": 4.8245, + "step": 18302 + }, + { + "epoch": 0.10885312589209249, + "grad_norm": 1.6745699644088745, + "learning_rate": 4.855253502923007e-05, + "loss": 4.7832, + "step": 18303 + }, + { + "epoch": 0.1088590731753735, + "grad_norm": 1.2379045486450195, + "learning_rate": 4.8552378393843676e-05, + "loss": 5.0438, + "step": 18304 + }, + { + "epoch": 0.10886502045865448, + "grad_norm": 1.3999474048614502, + "learning_rate": 4.85522217502354e-05, + "loss": 5.0123, + "step": 18305 + }, + { + "epoch": 0.10887096774193548, + "grad_norm": 1.3539077043533325, + "learning_rate": 4.8552065098405276e-05, + "loss": 5.0722, + "step": 18306 + }, + { + "epoch": 0.10887691502521649, + "grad_norm": 1.3992128372192383, + "learning_rate": 4.8551908438353374e-05, + "loss": 4.9449, + "step": 18307 + }, + { + "epoch": 0.10888286230849747, + "grad_norm": 1.617443323135376, + "learning_rate": 4.8551751770079744e-05, + "loss": 5.1081, + "step": 18308 + }, + { + "epoch": 0.10888880959177848, + "grad_norm": 1.6027116775512695, + "learning_rate": 4.8551595093584446e-05, + "loss": 5.06, + "step": 18309 + }, + { + "epoch": 0.10889475687505948, + "grad_norm": 1.1488780975341797, + "learning_rate": 4.855143840886752e-05, + "loss": 5.1771, + "step": 18310 + }, + { + "epoch": 0.10890070415834047, + "grad_norm": 1.5683537721633911, + "learning_rate": 4.855128171592903e-05, + "loss": 5.1402, + "step": 18311 + }, + { + "epoch": 0.10890665144162147, + "grad_norm": 1.2840538024902344, + "learning_rate": 4.855112501476904e-05, + "loss": 5.2887, + "step": 18312 + }, + { + "epoch": 0.10891259872490247, + "grad_norm": 1.2311303615570068, + "learning_rate": 4.855096830538759e-05, + "loss": 5.2057, + "step": 18313 + }, + { + "epoch": 0.10891854600818346, + "grad_norm": 1.3655261993408203, + "learning_rate": 4.855081158778474e-05, + "loss": 5.3298, + "step": 18314 + }, + { + "epoch": 0.10892449329146446, + "grad_norm": 1.3405102491378784, + "learning_rate": 4.855065486196055e-05, + "loss": 5.3249, + "step": 18315 + }, + { + "epoch": 0.10893044057474546, + "grad_norm": 1.3816508054733276, + "learning_rate": 4.855049812791506e-05, + "loss": 5.2829, + "step": 18316 + }, + { + "epoch": 0.10893638785802645, + "grad_norm": 1.1929587125778198, + "learning_rate": 4.855034138564835e-05, + "loss": 5.5317, + "step": 18317 + }, + { + "epoch": 0.10894233514130745, + "grad_norm": 1.2426830530166626, + "learning_rate": 4.855018463516045e-05, + "loss": 5.263, + "step": 18318 + }, + { + "epoch": 0.10894828242458844, + "grad_norm": 1.3385604619979858, + "learning_rate": 4.855002787645141e-05, + "loss": 5.2531, + "step": 18319 + }, + { + "epoch": 0.10895422970786944, + "grad_norm": 1.2306677103042603, + "learning_rate": 4.8549871109521314e-05, + "loss": 5.245, + "step": 18320 + }, + { + "epoch": 0.10896017699115045, + "grad_norm": 1.3108047246932983, + "learning_rate": 4.85497143343702e-05, + "loss": 5.3063, + "step": 18321 + }, + { + "epoch": 0.10896612427443143, + "grad_norm": 1.3951044082641602, + "learning_rate": 4.8549557550998126e-05, + "loss": 5.4842, + "step": 18322 + }, + { + "epoch": 0.10897207155771244, + "grad_norm": 1.4618322849273682, + "learning_rate": 4.854940075940514e-05, + "loss": 5.5703, + "step": 18323 + }, + { + "epoch": 0.10897801884099344, + "grad_norm": 1.3512097597122192, + "learning_rate": 4.8549243959591304e-05, + "loss": 5.2615, + "step": 18324 + }, + { + "epoch": 0.10898396612427443, + "grad_norm": 1.261428713798523, + "learning_rate": 4.8549087151556675e-05, + "loss": 5.2617, + "step": 18325 + }, + { + "epoch": 0.10898991340755543, + "grad_norm": 1.5647974014282227, + "learning_rate": 4.854893033530129e-05, + "loss": 5.0529, + "step": 18326 + }, + { + "epoch": 0.10899586069083643, + "grad_norm": 1.3635188341140747, + "learning_rate": 4.8548773510825226e-05, + "loss": 5.1029, + "step": 18327 + }, + { + "epoch": 0.10900180797411742, + "grad_norm": 1.2746639251708984, + "learning_rate": 4.854861667812852e-05, + "loss": 5.1788, + "step": 18328 + }, + { + "epoch": 0.10900775525739842, + "grad_norm": 1.3292982578277588, + "learning_rate": 4.854845983721125e-05, + "loss": 5.2442, + "step": 18329 + }, + { + "epoch": 0.10901370254067942, + "grad_norm": 1.3015047311782837, + "learning_rate": 4.854830298807345e-05, + "loss": 5.2234, + "step": 18330 + }, + { + "epoch": 0.10901964982396041, + "grad_norm": 1.2642244100570679, + "learning_rate": 4.854814613071518e-05, + "loss": 5.1501, + "step": 18331 + }, + { + "epoch": 0.10902559710724141, + "grad_norm": 1.191630482673645, + "learning_rate": 4.8547989265136484e-05, + "loss": 5.1618, + "step": 18332 + }, + { + "epoch": 0.10903154439052241, + "grad_norm": 1.4171391725540161, + "learning_rate": 4.8547832391337445e-05, + "loss": 5.1431, + "step": 18333 + }, + { + "epoch": 0.1090374916738034, + "grad_norm": 1.3901907205581665, + "learning_rate": 4.854767550931809e-05, + "loss": 5.1464, + "step": 18334 + }, + { + "epoch": 0.1090434389570844, + "grad_norm": 1.5166548490524292, + "learning_rate": 4.854751861907849e-05, + "loss": 5.0841, + "step": 18335 + }, + { + "epoch": 0.1090493862403654, + "grad_norm": 1.3555935621261597, + "learning_rate": 4.854736172061869e-05, + "loss": 5.2947, + "step": 18336 + }, + { + "epoch": 0.1090553335236464, + "grad_norm": 1.1348215341567993, + "learning_rate": 4.854720481393875e-05, + "loss": 5.2813, + "step": 18337 + }, + { + "epoch": 0.1090612808069274, + "grad_norm": 1.3353219032287598, + "learning_rate": 4.8547047899038734e-05, + "loss": 5.2473, + "step": 18338 + }, + { + "epoch": 0.1090672280902084, + "grad_norm": 1.550512671470642, + "learning_rate": 4.854689097591868e-05, + "loss": 5.1364, + "step": 18339 + }, + { + "epoch": 0.10907317537348939, + "grad_norm": 1.5353589057922363, + "learning_rate": 4.8546734044578646e-05, + "loss": 5.0105, + "step": 18340 + }, + { + "epoch": 0.10907912265677039, + "grad_norm": 1.4025498628616333, + "learning_rate": 4.85465771050187e-05, + "loss": 5.0779, + "step": 18341 + }, + { + "epoch": 0.10908506994005139, + "grad_norm": 1.220438838005066, + "learning_rate": 4.8546420157238874e-05, + "loss": 5.0732, + "step": 18342 + }, + { + "epoch": 0.10909101722333238, + "grad_norm": 1.4058369398117065, + "learning_rate": 4.8546263201239245e-05, + "loss": 5.0838, + "step": 18343 + }, + { + "epoch": 0.10909696450661338, + "grad_norm": 1.4438905715942383, + "learning_rate": 4.854610623701986e-05, + "loss": 5.0449, + "step": 18344 + }, + { + "epoch": 0.10910291178989438, + "grad_norm": 1.536890983581543, + "learning_rate": 4.854594926458076e-05, + "loss": 4.9601, + "step": 18345 + }, + { + "epoch": 0.10910885907317537, + "grad_norm": 1.3566638231277466, + "learning_rate": 4.8545792283922025e-05, + "loss": 4.9283, + "step": 18346 + }, + { + "epoch": 0.10911480635645637, + "grad_norm": 1.3086943626403809, + "learning_rate": 4.8545635295043694e-05, + "loss": 5.0638, + "step": 18347 + }, + { + "epoch": 0.10912075363973736, + "grad_norm": 1.330124020576477, + "learning_rate": 4.854547829794582e-05, + "loss": 5.0944, + "step": 18348 + }, + { + "epoch": 0.10912670092301836, + "grad_norm": 1.4076783657073975, + "learning_rate": 4.854532129262848e-05, + "loss": 4.9725, + "step": 18349 + }, + { + "epoch": 0.10913264820629937, + "grad_norm": 1.380814552307129, + "learning_rate": 4.854516427909169e-05, + "loss": 5.0551, + "step": 18350 + }, + { + "epoch": 0.10913859548958035, + "grad_norm": 1.4243587255477905, + "learning_rate": 4.854500725733554e-05, + "loss": 5.103, + "step": 18351 + }, + { + "epoch": 0.10914454277286136, + "grad_norm": 1.438328742980957, + "learning_rate": 4.854485022736006e-05, + "loss": 5.1153, + "step": 18352 + }, + { + "epoch": 0.10915049005614236, + "grad_norm": 1.4602978229522705, + "learning_rate": 4.8544693189165324e-05, + "loss": 4.8916, + "step": 18353 + }, + { + "epoch": 0.10915643733942335, + "grad_norm": 1.548378586769104, + "learning_rate": 4.8544536142751385e-05, + "loss": 5.0205, + "step": 18354 + }, + { + "epoch": 0.10916238462270435, + "grad_norm": 1.33285653591156, + "learning_rate": 4.854437908811828e-05, + "loss": 4.9558, + "step": 18355 + }, + { + "epoch": 0.10916833190598535, + "grad_norm": 1.442918300628662, + "learning_rate": 4.854422202526609e-05, + "loss": 4.9119, + "step": 18356 + }, + { + "epoch": 0.10917427918926634, + "grad_norm": 1.498830795288086, + "learning_rate": 4.8544064954194836e-05, + "loss": 4.9787, + "step": 18357 + }, + { + "epoch": 0.10918022647254734, + "grad_norm": 1.422012209892273, + "learning_rate": 4.85439078749046e-05, + "loss": 5.0013, + "step": 18358 + }, + { + "epoch": 0.10918617375582834, + "grad_norm": 1.4635952711105347, + "learning_rate": 4.854375078739543e-05, + "loss": 4.8389, + "step": 18359 + }, + { + "epoch": 0.10919212103910933, + "grad_norm": 1.3973792791366577, + "learning_rate": 4.854359369166738e-05, + "loss": 4.9503, + "step": 18360 + }, + { + "epoch": 0.10919806832239033, + "grad_norm": 1.4016454219818115, + "learning_rate": 4.8543436587720504e-05, + "loss": 4.8533, + "step": 18361 + }, + { + "epoch": 0.10920401560567133, + "grad_norm": 1.215690016746521, + "learning_rate": 4.854327947555486e-05, + "loss": 5.0961, + "step": 18362 + }, + { + "epoch": 0.10920996288895232, + "grad_norm": 1.1589696407318115, + "learning_rate": 4.85431223551705e-05, + "loss": 4.8991, + "step": 18363 + }, + { + "epoch": 0.10921591017223332, + "grad_norm": 1.2894245386123657, + "learning_rate": 4.854296522656748e-05, + "loss": 5.0622, + "step": 18364 + }, + { + "epoch": 0.10922185745551433, + "grad_norm": 1.3525546789169312, + "learning_rate": 4.854280808974585e-05, + "loss": 5.1679, + "step": 18365 + }, + { + "epoch": 0.10922780473879531, + "grad_norm": 1.2055712938308716, + "learning_rate": 4.854265094470567e-05, + "loss": 5.2706, + "step": 18366 + }, + { + "epoch": 0.10923375202207632, + "grad_norm": 1.3646256923675537, + "learning_rate": 4.8542493791447e-05, + "loss": 5.2381, + "step": 18367 + }, + { + "epoch": 0.10923969930535732, + "grad_norm": 1.535840630531311, + "learning_rate": 4.8542336629969875e-05, + "loss": 5.0133, + "step": 18368 + }, + { + "epoch": 0.1092456465886383, + "grad_norm": 1.3226375579833984, + "learning_rate": 4.854217946027437e-05, + "loss": 4.9518, + "step": 18369 + }, + { + "epoch": 0.10925159387191931, + "grad_norm": 1.4403883218765259, + "learning_rate": 4.854202228236054e-05, + "loss": 5.1958, + "step": 18370 + }, + { + "epoch": 0.10925754115520031, + "grad_norm": 1.3661396503448486, + "learning_rate": 4.8541865096228426e-05, + "loss": 5.297, + "step": 18371 + }, + { + "epoch": 0.1092634884384813, + "grad_norm": 1.1291767358779907, + "learning_rate": 4.8541707901878096e-05, + "loss": 5.0954, + "step": 18372 + }, + { + "epoch": 0.1092694357217623, + "grad_norm": 1.414288878440857, + "learning_rate": 4.854155069930959e-05, + "loss": 5.0499, + "step": 18373 + }, + { + "epoch": 0.1092753830050433, + "grad_norm": 1.405760407447815, + "learning_rate": 4.8541393488522976e-05, + "loss": 5.004, + "step": 18374 + }, + { + "epoch": 0.10928133028832429, + "grad_norm": 1.2152272462844849, + "learning_rate": 4.854123626951831e-05, + "loss": 4.9798, + "step": 18375 + }, + { + "epoch": 0.10928727757160529, + "grad_norm": 1.3401811122894287, + "learning_rate": 4.854107904229564e-05, + "loss": 5.1179, + "step": 18376 + }, + { + "epoch": 0.10929322485488628, + "grad_norm": 1.036811113357544, + "learning_rate": 4.854092180685502e-05, + "loss": 5.129, + "step": 18377 + }, + { + "epoch": 0.10929917213816728, + "grad_norm": 1.380259394645691, + "learning_rate": 4.8540764563196506e-05, + "loss": 5.163, + "step": 18378 + }, + { + "epoch": 0.10930511942144829, + "grad_norm": 1.3078418970108032, + "learning_rate": 4.8540607311320156e-05, + "loss": 4.9882, + "step": 18379 + }, + { + "epoch": 0.10931106670472927, + "grad_norm": 1.2273530960083008, + "learning_rate": 4.854045005122603e-05, + "loss": 5.0736, + "step": 18380 + }, + { + "epoch": 0.10931701398801028, + "grad_norm": 1.1997276544570923, + "learning_rate": 4.8540292782914164e-05, + "loss": 4.9193, + "step": 18381 + }, + { + "epoch": 0.10932296127129128, + "grad_norm": 1.2119728326797485, + "learning_rate": 4.854013550638463e-05, + "loss": 4.9752, + "step": 18382 + }, + { + "epoch": 0.10932890855457227, + "grad_norm": 1.1508461236953735, + "learning_rate": 4.853997822163748e-05, + "loss": 4.8432, + "step": 18383 + }, + { + "epoch": 0.10933485583785327, + "grad_norm": 1.2142893075942993, + "learning_rate": 4.853982092867276e-05, + "loss": 5.0771, + "step": 18384 + }, + { + "epoch": 0.10934080312113427, + "grad_norm": 1.1016231775283813, + "learning_rate": 4.8539663627490536e-05, + "loss": 5.0918, + "step": 18385 + }, + { + "epoch": 0.10934675040441526, + "grad_norm": 1.2202482223510742, + "learning_rate": 4.8539506318090865e-05, + "loss": 5.1181, + "step": 18386 + }, + { + "epoch": 0.10935269768769626, + "grad_norm": 1.3560340404510498, + "learning_rate": 4.853934900047379e-05, + "loss": 5.1007, + "step": 18387 + }, + { + "epoch": 0.10935864497097726, + "grad_norm": 1.350473165512085, + "learning_rate": 4.8539191674639374e-05, + "loss": 5.1084, + "step": 18388 + }, + { + "epoch": 0.10936459225425825, + "grad_norm": 1.5102394819259644, + "learning_rate": 4.853903434058766e-05, + "loss": 5.0825, + "step": 18389 + }, + { + "epoch": 0.10937053953753925, + "grad_norm": 1.3704886436462402, + "learning_rate": 4.853887699831872e-05, + "loss": 5.1083, + "step": 18390 + }, + { + "epoch": 0.10937648682082025, + "grad_norm": 1.315167784690857, + "learning_rate": 4.8538719647832606e-05, + "loss": 4.9786, + "step": 18391 + }, + { + "epoch": 0.10938243410410124, + "grad_norm": 1.5208832025527954, + "learning_rate": 4.8538562289129356e-05, + "loss": 4.9011, + "step": 18392 + }, + { + "epoch": 0.10938838138738224, + "grad_norm": 1.3259782791137695, + "learning_rate": 4.8538404922209046e-05, + "loss": 4.9368, + "step": 18393 + }, + { + "epoch": 0.10939432867066325, + "grad_norm": 1.3342556953430176, + "learning_rate": 4.853824754707172e-05, + "loss": 4.9858, + "step": 18394 + }, + { + "epoch": 0.10940027595394423, + "grad_norm": 1.2291737794876099, + "learning_rate": 4.853809016371743e-05, + "loss": 5.0289, + "step": 18395 + }, + { + "epoch": 0.10940622323722524, + "grad_norm": 1.1539384126663208, + "learning_rate": 4.8537932772146245e-05, + "loss": 4.9444, + "step": 18396 + }, + { + "epoch": 0.10941217052050624, + "grad_norm": 1.2171412706375122, + "learning_rate": 4.8537775372358204e-05, + "loss": 4.9818, + "step": 18397 + }, + { + "epoch": 0.10941811780378723, + "grad_norm": 1.2133311033248901, + "learning_rate": 4.8537617964353374e-05, + "loss": 5.2647, + "step": 18398 + }, + { + "epoch": 0.10942406508706823, + "grad_norm": 1.2499877214431763, + "learning_rate": 4.8537460548131796e-05, + "loss": 5.4893, + "step": 18399 + }, + { + "epoch": 0.10943001237034923, + "grad_norm": 1.2127736806869507, + "learning_rate": 4.8537303123693545e-05, + "loss": 5.3607, + "step": 18400 + }, + { + "epoch": 0.10943595965363022, + "grad_norm": 1.3051133155822754, + "learning_rate": 4.853714569103865e-05, + "loss": 5.4531, + "step": 18401 + }, + { + "epoch": 0.10944190693691122, + "grad_norm": 1.3183389902114868, + "learning_rate": 4.85369882501672e-05, + "loss": 5.1784, + "step": 18402 + }, + { + "epoch": 0.10944785422019222, + "grad_norm": 1.5276503562927246, + "learning_rate": 4.853683080107922e-05, + "loss": 4.9092, + "step": 18403 + }, + { + "epoch": 0.10945380150347321, + "grad_norm": 1.519415259361267, + "learning_rate": 4.853667334377478e-05, + "loss": 4.7973, + "step": 18404 + }, + { + "epoch": 0.10945974878675421, + "grad_norm": 1.4063026905059814, + "learning_rate": 4.853651587825392e-05, + "loss": 4.7771, + "step": 18405 + }, + { + "epoch": 0.1094656960700352, + "grad_norm": 1.2753932476043701, + "learning_rate": 4.8536358404516715e-05, + "loss": 4.7902, + "step": 18406 + }, + { + "epoch": 0.1094716433533162, + "grad_norm": 1.5203404426574707, + "learning_rate": 4.8536200922563205e-05, + "loss": 4.961, + "step": 18407 + }, + { + "epoch": 0.1094775906365972, + "grad_norm": 1.4700336456298828, + "learning_rate": 4.8536043432393455e-05, + "loss": 5.0276, + "step": 18408 + }, + { + "epoch": 0.1094835379198782, + "grad_norm": 1.3945552110671997, + "learning_rate": 4.8535885934007506e-05, + "loss": 4.9641, + "step": 18409 + }, + { + "epoch": 0.1094894852031592, + "grad_norm": 1.1885923147201538, + "learning_rate": 4.853572842740544e-05, + "loss": 4.9162, + "step": 18410 + }, + { + "epoch": 0.1094954324864402, + "grad_norm": 1.414090871810913, + "learning_rate": 4.853557091258728e-05, + "loss": 4.9317, + "step": 18411 + }, + { + "epoch": 0.10950137976972119, + "grad_norm": 1.4395371675491333, + "learning_rate": 4.85354133895531e-05, + "loss": 4.7658, + "step": 18412 + }, + { + "epoch": 0.10950732705300219, + "grad_norm": 1.351665735244751, + "learning_rate": 4.8535255858302944e-05, + "loss": 4.9385, + "step": 18413 + }, + { + "epoch": 0.10951327433628319, + "grad_norm": 1.5085922479629517, + "learning_rate": 4.853509831883688e-05, + "loss": 5.0192, + "step": 18414 + }, + { + "epoch": 0.10951922161956418, + "grad_norm": 1.3413939476013184, + "learning_rate": 4.8534940771154954e-05, + "loss": 4.9193, + "step": 18415 + }, + { + "epoch": 0.10952516890284518, + "grad_norm": 1.532934546470642, + "learning_rate": 4.853478321525723e-05, + "loss": 4.9137, + "step": 18416 + }, + { + "epoch": 0.10953111618612618, + "grad_norm": 1.388016700744629, + "learning_rate": 4.8534625651143754e-05, + "loss": 4.9381, + "step": 18417 + }, + { + "epoch": 0.10953706346940717, + "grad_norm": 1.551255702972412, + "learning_rate": 4.853446807881458e-05, + "loss": 5.0973, + "step": 18418 + }, + { + "epoch": 0.10954301075268817, + "grad_norm": 1.4487138986587524, + "learning_rate": 4.853431049826976e-05, + "loss": 5.1313, + "step": 18419 + }, + { + "epoch": 0.10954895803596917, + "grad_norm": 1.467703104019165, + "learning_rate": 4.853415290950936e-05, + "loss": 5.0381, + "step": 18420 + }, + { + "epoch": 0.10955490531925016, + "grad_norm": 1.4529845714569092, + "learning_rate": 4.853399531253343e-05, + "loss": 4.9945, + "step": 18421 + }, + { + "epoch": 0.10956085260253116, + "grad_norm": 1.230872631072998, + "learning_rate": 4.8533837707342036e-05, + "loss": 5.0579, + "step": 18422 + }, + { + "epoch": 0.10956679988581217, + "grad_norm": 1.3668066263198853, + "learning_rate": 4.8533680093935206e-05, + "loss": 5.2567, + "step": 18423 + }, + { + "epoch": 0.10957274716909315, + "grad_norm": 1.3560447692871094, + "learning_rate": 4.853352247231302e-05, + "loss": 5.0152, + "step": 18424 + }, + { + "epoch": 0.10957869445237416, + "grad_norm": 1.4296886920928955, + "learning_rate": 4.8533364842475524e-05, + "loss": 5.1132, + "step": 18425 + }, + { + "epoch": 0.10958464173565516, + "grad_norm": 1.4232845306396484, + "learning_rate": 4.853320720442277e-05, + "loss": 5.0427, + "step": 18426 + }, + { + "epoch": 0.10959058901893615, + "grad_norm": 1.4019423723220825, + "learning_rate": 4.8533049558154826e-05, + "loss": 5.2369, + "step": 18427 + }, + { + "epoch": 0.10959653630221715, + "grad_norm": 1.5423427820205688, + "learning_rate": 4.853289190367173e-05, + "loss": 5.1053, + "step": 18428 + }, + { + "epoch": 0.10960248358549815, + "grad_norm": 1.5049951076507568, + "learning_rate": 4.8532734240973545e-05, + "loss": 5.3784, + "step": 18429 + }, + { + "epoch": 0.10960843086877914, + "grad_norm": 1.678328037261963, + "learning_rate": 4.853257657006033e-05, + "loss": 5.3021, + "step": 18430 + }, + { + "epoch": 0.10961437815206014, + "grad_norm": 1.5986173152923584, + "learning_rate": 4.853241889093213e-05, + "loss": 5.1686, + "step": 18431 + }, + { + "epoch": 0.10962032543534114, + "grad_norm": 1.5304551124572754, + "learning_rate": 4.853226120358901e-05, + "loss": 5.2319, + "step": 18432 + }, + { + "epoch": 0.10962627271862213, + "grad_norm": 1.609595775604248, + "learning_rate": 4.853210350803102e-05, + "loss": 5.0256, + "step": 18433 + }, + { + "epoch": 0.10963222000190313, + "grad_norm": 1.3506170511245728, + "learning_rate": 4.853194580425821e-05, + "loss": 5.0792, + "step": 18434 + }, + { + "epoch": 0.10963816728518412, + "grad_norm": 1.2946768999099731, + "learning_rate": 4.853178809227065e-05, + "loss": 5.0155, + "step": 18435 + }, + { + "epoch": 0.10964411456846512, + "grad_norm": 1.5691487789154053, + "learning_rate": 4.853163037206838e-05, + "loss": 5.1302, + "step": 18436 + }, + { + "epoch": 0.10965006185174613, + "grad_norm": 1.6740599870681763, + "learning_rate": 4.853147264365146e-05, + "loss": 5.2371, + "step": 18437 + }, + { + "epoch": 0.10965600913502711, + "grad_norm": 1.4822674989700317, + "learning_rate": 4.853131490701995e-05, + "loss": 5.0194, + "step": 18438 + }, + { + "epoch": 0.10966195641830812, + "grad_norm": 1.385177493095398, + "learning_rate": 4.853115716217389e-05, + "loss": 4.9444, + "step": 18439 + }, + { + "epoch": 0.10966790370158912, + "grad_norm": 1.3696002960205078, + "learning_rate": 4.853099940911337e-05, + "loss": 5.0557, + "step": 18440 + }, + { + "epoch": 0.1096738509848701, + "grad_norm": 1.6609543561935425, + "learning_rate": 4.8530841647838396e-05, + "loss": 4.9032, + "step": 18441 + }, + { + "epoch": 0.10967979826815111, + "grad_norm": 1.5938438177108765, + "learning_rate": 4.8530683878349056e-05, + "loss": 4.8639, + "step": 18442 + }, + { + "epoch": 0.10968574555143211, + "grad_norm": 1.4565002918243408, + "learning_rate": 4.85305261006454e-05, + "loss": 5.0483, + "step": 18443 + }, + { + "epoch": 0.1096916928347131, + "grad_norm": 1.5930250883102417, + "learning_rate": 4.853036831472749e-05, + "loss": 5.0751, + "step": 18444 + }, + { + "epoch": 0.1096976401179941, + "grad_norm": 1.5648735761642456, + "learning_rate": 4.853021052059536e-05, + "loss": 5.0991, + "step": 18445 + }, + { + "epoch": 0.1097035874012751, + "grad_norm": 1.4230155944824219, + "learning_rate": 4.8530052718249076e-05, + "loss": 5.098, + "step": 18446 + }, + { + "epoch": 0.10970953468455609, + "grad_norm": 1.4366841316223145, + "learning_rate": 4.85298949076887e-05, + "loss": 5.0975, + "step": 18447 + }, + { + "epoch": 0.10971548196783709, + "grad_norm": 1.437514066696167, + "learning_rate": 4.852973708891427e-05, + "loss": 5.0325, + "step": 18448 + }, + { + "epoch": 0.1097214292511181, + "grad_norm": 2.0367636680603027, + "learning_rate": 4.852957926192586e-05, + "loss": 5.2064, + "step": 18449 + }, + { + "epoch": 0.10972737653439908, + "grad_norm": 2.16357684135437, + "learning_rate": 4.852942142672352e-05, + "loss": 5.1532, + "step": 18450 + }, + { + "epoch": 0.10973332381768008, + "grad_norm": 1.6931402683258057, + "learning_rate": 4.8529263583307296e-05, + "loss": 5.2128, + "step": 18451 + }, + { + "epoch": 0.10973927110096109, + "grad_norm": 2.4651196002960205, + "learning_rate": 4.852910573167725e-05, + "loss": 4.798, + "step": 18452 + }, + { + "epoch": 0.10974521838424207, + "grad_norm": 1.7160784006118774, + "learning_rate": 4.852894787183344e-05, + "loss": 5.5087, + "step": 18453 + }, + { + "epoch": 0.10975116566752308, + "grad_norm": 1.478097915649414, + "learning_rate": 4.852879000377591e-05, + "loss": 5.6876, + "step": 18454 + }, + { + "epoch": 0.10975711295080408, + "grad_norm": 1.8612531423568726, + "learning_rate": 4.852863212750474e-05, + "loss": 5.2259, + "step": 18455 + }, + { + "epoch": 0.10976306023408507, + "grad_norm": 1.6869621276855469, + "learning_rate": 4.852847424301995e-05, + "loss": 5.5294, + "step": 18456 + }, + { + "epoch": 0.10976900751736607, + "grad_norm": 1.7378077507019043, + "learning_rate": 4.852831635032161e-05, + "loss": 5.4568, + "step": 18457 + }, + { + "epoch": 0.10977495480064707, + "grad_norm": 1.7788033485412598, + "learning_rate": 4.852815844940979e-05, + "loss": 5.2331, + "step": 18458 + }, + { + "epoch": 0.10978090208392806, + "grad_norm": 1.8730370998382568, + "learning_rate": 4.852800054028453e-05, + "loss": 4.9792, + "step": 18459 + }, + { + "epoch": 0.10978684936720906, + "grad_norm": 1.5126397609710693, + "learning_rate": 4.852784262294588e-05, + "loss": 5.3134, + "step": 18460 + }, + { + "epoch": 0.10979279665049006, + "grad_norm": 1.6687992811203003, + "learning_rate": 4.8527684697393914e-05, + "loss": 5.3296, + "step": 18461 + }, + { + "epoch": 0.10979874393377105, + "grad_norm": 1.6268471479415894, + "learning_rate": 4.852752676362867e-05, + "loss": 4.9804, + "step": 18462 + }, + { + "epoch": 0.10980469121705205, + "grad_norm": 1.7055017948150635, + "learning_rate": 4.8527368821650214e-05, + "loss": 5.0289, + "step": 18463 + }, + { + "epoch": 0.10981063850033304, + "grad_norm": 1.489247441291809, + "learning_rate": 4.852721087145859e-05, + "loss": 5.0428, + "step": 18464 + }, + { + "epoch": 0.10981658578361404, + "grad_norm": 1.7411161661148071, + "learning_rate": 4.8527052913053874e-05, + "loss": 5.1142, + "step": 18465 + }, + { + "epoch": 0.10982253306689505, + "grad_norm": 1.5776443481445312, + "learning_rate": 4.8526894946436094e-05, + "loss": 5.2881, + "step": 18466 + }, + { + "epoch": 0.10982848035017603, + "grad_norm": 1.342997431755066, + "learning_rate": 4.852673697160532e-05, + "loss": 5.0295, + "step": 18467 + }, + { + "epoch": 0.10983442763345704, + "grad_norm": 1.1686962842941284, + "learning_rate": 4.8526578988561606e-05, + "loss": 5.0607, + "step": 18468 + }, + { + "epoch": 0.10984037491673804, + "grad_norm": 1.578697681427002, + "learning_rate": 4.8526420997305006e-05, + "loss": 5.3291, + "step": 18469 + }, + { + "epoch": 0.10984632220001903, + "grad_norm": 1.5248758792877197, + "learning_rate": 4.8526262997835575e-05, + "loss": 5.1206, + "step": 18470 + }, + { + "epoch": 0.10985226948330003, + "grad_norm": 1.1425076723098755, + "learning_rate": 4.852610499015337e-05, + "loss": 5.1892, + "step": 18471 + }, + { + "epoch": 0.10985821676658103, + "grad_norm": 1.356423020362854, + "learning_rate": 4.852594697425844e-05, + "loss": 4.9477, + "step": 18472 + }, + { + "epoch": 0.10986416404986202, + "grad_norm": 1.3905398845672607, + "learning_rate": 4.852578895015085e-05, + "loss": 4.9084, + "step": 18473 + }, + { + "epoch": 0.10987011133314302, + "grad_norm": 1.3447619676589966, + "learning_rate": 4.8525630917830655e-05, + "loss": 4.9042, + "step": 18474 + }, + { + "epoch": 0.10987605861642402, + "grad_norm": 1.2110105752944946, + "learning_rate": 4.8525472877297893e-05, + "loss": 4.9669, + "step": 18475 + }, + { + "epoch": 0.10988200589970501, + "grad_norm": 1.480750560760498, + "learning_rate": 4.8525314828552646e-05, + "loss": 5.1071, + "step": 18476 + }, + { + "epoch": 0.10988795318298601, + "grad_norm": 1.2497118711471558, + "learning_rate": 4.852515677159495e-05, + "loss": 4.8868, + "step": 18477 + }, + { + "epoch": 0.10989390046626701, + "grad_norm": 1.4057846069335938, + "learning_rate": 4.8524998706424856e-05, + "loss": 5.1173, + "step": 18478 + }, + { + "epoch": 0.109899847749548, + "grad_norm": 1.3325163125991821, + "learning_rate": 4.8524840633042436e-05, + "loss": 5.1066, + "step": 18479 + }, + { + "epoch": 0.109905795032829, + "grad_norm": 1.333720326423645, + "learning_rate": 4.852468255144773e-05, + "loss": 5.1404, + "step": 18480 + }, + { + "epoch": 0.10991174231611, + "grad_norm": 1.3484537601470947, + "learning_rate": 4.852452446164081e-05, + "loss": 5.1284, + "step": 18481 + }, + { + "epoch": 0.109917689599391, + "grad_norm": 1.3348337411880493, + "learning_rate": 4.8524366363621716e-05, + "loss": 5.2056, + "step": 18482 + }, + { + "epoch": 0.109923636882672, + "grad_norm": 1.1838293075561523, + "learning_rate": 4.8524208257390504e-05, + "loss": 5.0488, + "step": 18483 + }, + { + "epoch": 0.109929584165953, + "grad_norm": 1.2820385694503784, + "learning_rate": 4.852405014294724e-05, + "loss": 5.1329, + "step": 18484 + }, + { + "epoch": 0.10993553144923399, + "grad_norm": 1.3892844915390015, + "learning_rate": 4.852389202029198e-05, + "loss": 5.0263, + "step": 18485 + }, + { + "epoch": 0.10994147873251499, + "grad_norm": 1.4780217409133911, + "learning_rate": 4.852373388942476e-05, + "loss": 5.0866, + "step": 18486 + }, + { + "epoch": 0.10994742601579599, + "grad_norm": 1.4181870222091675, + "learning_rate": 4.852357575034565e-05, + "loss": 5.1436, + "step": 18487 + }, + { + "epoch": 0.10995337329907698, + "grad_norm": 1.4174554347991943, + "learning_rate": 4.852341760305471e-05, + "loss": 5.132, + "step": 18488 + }, + { + "epoch": 0.10995932058235798, + "grad_norm": 1.2727283239364624, + "learning_rate": 4.852325944755198e-05, + "loss": 5.0171, + "step": 18489 + }, + { + "epoch": 0.10996526786563898, + "grad_norm": 1.2102142572402954, + "learning_rate": 4.852310128383753e-05, + "loss": 5.0183, + "step": 18490 + }, + { + "epoch": 0.10997121514891997, + "grad_norm": 1.254946231842041, + "learning_rate": 4.85229431119114e-05, + "loss": 5.105, + "step": 18491 + }, + { + "epoch": 0.10997716243220097, + "grad_norm": 1.4097338914871216, + "learning_rate": 4.8522784931773666e-05, + "loss": 4.953, + "step": 18492 + }, + { + "epoch": 0.10998310971548196, + "grad_norm": 1.368314504623413, + "learning_rate": 4.852262674342436e-05, + "loss": 4.9527, + "step": 18493 + }, + { + "epoch": 0.10998905699876296, + "grad_norm": 1.3907700777053833, + "learning_rate": 4.8522468546863554e-05, + "loss": 4.9416, + "step": 18494 + }, + { + "epoch": 0.10999500428204396, + "grad_norm": 1.2113755941390991, + "learning_rate": 4.852231034209129e-05, + "loss": 4.8552, + "step": 18495 + }, + { + "epoch": 0.11000095156532495, + "grad_norm": 1.3752022981643677, + "learning_rate": 4.852215212910763e-05, + "loss": 4.9314, + "step": 18496 + }, + { + "epoch": 0.11000689884860596, + "grad_norm": 1.243531584739685, + "learning_rate": 4.852199390791264e-05, + "loss": 4.925, + "step": 18497 + }, + { + "epoch": 0.11001284613188696, + "grad_norm": 1.3528475761413574, + "learning_rate": 4.852183567850636e-05, + "loss": 4.8643, + "step": 18498 + }, + { + "epoch": 0.11001879341516795, + "grad_norm": 1.4653394222259521, + "learning_rate": 4.8521677440888845e-05, + "loss": 4.8894, + "step": 18499 + }, + { + "epoch": 0.11002474069844895, + "grad_norm": 1.3524682521820068, + "learning_rate": 4.852151919506016e-05, + "loss": 4.7458, + "step": 18500 + }, + { + "epoch": 0.11003068798172995, + "grad_norm": 1.3654247522354126, + "learning_rate": 4.852136094102036e-05, + "loss": 4.7971, + "step": 18501 + }, + { + "epoch": 0.11003663526501094, + "grad_norm": 1.395735740661621, + "learning_rate": 4.85212026787695e-05, + "loss": 4.7677, + "step": 18502 + }, + { + "epoch": 0.11004258254829194, + "grad_norm": 1.4467344284057617, + "learning_rate": 4.8521044408307616e-05, + "loss": 4.726, + "step": 18503 + }, + { + "epoch": 0.11004852983157294, + "grad_norm": 1.276580572128296, + "learning_rate": 4.852088612963478e-05, + "loss": 4.8145, + "step": 18504 + }, + { + "epoch": 0.11005447711485393, + "grad_norm": 1.4406812191009521, + "learning_rate": 4.852072784275106e-05, + "loss": 4.7942, + "step": 18505 + }, + { + "epoch": 0.11006042439813493, + "grad_norm": 1.4281691312789917, + "learning_rate": 4.8520569547656483e-05, + "loss": 4.9745, + "step": 18506 + }, + { + "epoch": 0.11006637168141593, + "grad_norm": 1.3521541357040405, + "learning_rate": 4.852041124435112e-05, + "loss": 4.8335, + "step": 18507 + }, + { + "epoch": 0.11007231896469692, + "grad_norm": 1.2510555982589722, + "learning_rate": 4.852025293283503e-05, + "loss": 4.8868, + "step": 18508 + }, + { + "epoch": 0.11007826624797792, + "grad_norm": 1.3792724609375, + "learning_rate": 4.852009461310826e-05, + "loss": 4.9388, + "step": 18509 + }, + { + "epoch": 0.11008421353125893, + "grad_norm": 1.3494830131530762, + "learning_rate": 4.851993628517086e-05, + "loss": 4.8536, + "step": 18510 + }, + { + "epoch": 0.11009016081453991, + "grad_norm": 1.2981318235397339, + "learning_rate": 4.851977794902291e-05, + "loss": 4.8479, + "step": 18511 + }, + { + "epoch": 0.11009610809782092, + "grad_norm": 1.3305935859680176, + "learning_rate": 4.851961960466444e-05, + "loss": 4.9893, + "step": 18512 + }, + { + "epoch": 0.11010205538110192, + "grad_norm": 1.3141270875930786, + "learning_rate": 4.851946125209551e-05, + "loss": 4.8349, + "step": 18513 + }, + { + "epoch": 0.1101080026643829, + "grad_norm": 1.2411303520202637, + "learning_rate": 4.851930289131619e-05, + "loss": 4.8698, + "step": 18514 + }, + { + "epoch": 0.11011394994766391, + "grad_norm": 1.520176887512207, + "learning_rate": 4.851914452232651e-05, + "loss": 4.7576, + "step": 18515 + }, + { + "epoch": 0.11011989723094491, + "grad_norm": 1.3073054552078247, + "learning_rate": 4.851898614512655e-05, + "loss": 4.8974, + "step": 18516 + }, + { + "epoch": 0.1101258445142259, + "grad_norm": 1.4703196287155151, + "learning_rate": 4.8518827759716354e-05, + "loss": 5.0947, + "step": 18517 + }, + { + "epoch": 0.1101317917975069, + "grad_norm": 1.3140865564346313, + "learning_rate": 4.851866936609597e-05, + "loss": 5.4125, + "step": 18518 + }, + { + "epoch": 0.1101377390807879, + "grad_norm": 1.2075819969177246, + "learning_rate": 4.8518510964265465e-05, + "loss": 5.2993, + "step": 18519 + }, + { + "epoch": 0.11014368636406889, + "grad_norm": 1.6519954204559326, + "learning_rate": 4.85183525542249e-05, + "loss": 5.6638, + "step": 18520 + }, + { + "epoch": 0.11014963364734989, + "grad_norm": 2.118663787841797, + "learning_rate": 4.851819413597432e-05, + "loss": 5.5422, + "step": 18521 + }, + { + "epoch": 0.1101555809306309, + "grad_norm": 1.902429461479187, + "learning_rate": 4.851803570951377e-05, + "loss": 5.3244, + "step": 18522 + }, + { + "epoch": 0.11016152821391188, + "grad_norm": 2.593628406524658, + "learning_rate": 4.8517877274843315e-05, + "loss": 5.0554, + "step": 18523 + }, + { + "epoch": 0.11016747549719288, + "grad_norm": 2.6404380798339844, + "learning_rate": 4.851771883196302e-05, + "loss": 4.9789, + "step": 18524 + }, + { + "epoch": 0.11017342278047387, + "grad_norm": 2.08564829826355, + "learning_rate": 4.8517560380872934e-05, + "loss": 4.9616, + "step": 18525 + }, + { + "epoch": 0.11017937006375488, + "grad_norm": 2.306739091873169, + "learning_rate": 4.8517401921573114e-05, + "loss": 4.9368, + "step": 18526 + }, + { + "epoch": 0.11018531734703588, + "grad_norm": 3.0212862491607666, + "learning_rate": 4.85172434540636e-05, + "loss": 4.6379, + "step": 18527 + }, + { + "epoch": 0.11019126463031687, + "grad_norm": 2.554163694381714, + "learning_rate": 4.851708497834446e-05, + "loss": 4.6958, + "step": 18528 + }, + { + "epoch": 0.11019721191359787, + "grad_norm": 2.354631185531616, + "learning_rate": 4.851692649441576e-05, + "loss": 4.7904, + "step": 18529 + }, + { + "epoch": 0.11020315919687887, + "grad_norm": 1.5072609186172485, + "learning_rate": 4.851676800227754e-05, + "loss": 5.5862, + "step": 18530 + }, + { + "epoch": 0.11020910648015986, + "grad_norm": 1.5677906274795532, + "learning_rate": 4.851660950192986e-05, + "loss": 5.8712, + "step": 18531 + }, + { + "epoch": 0.11021505376344086, + "grad_norm": 1.7329411506652832, + "learning_rate": 4.851645099337276e-05, + "loss": 5.4559, + "step": 18532 + }, + { + "epoch": 0.11022100104672186, + "grad_norm": 2.187192916870117, + "learning_rate": 4.851629247660633e-05, + "loss": 5.2172, + "step": 18533 + }, + { + "epoch": 0.11022694833000285, + "grad_norm": 2.5248184204101562, + "learning_rate": 4.851613395163059e-05, + "loss": 4.7283, + "step": 18534 + }, + { + "epoch": 0.11023289561328385, + "grad_norm": 1.897926926612854, + "learning_rate": 4.8515975418445625e-05, + "loss": 5.0609, + "step": 18535 + }, + { + "epoch": 0.11023884289656485, + "grad_norm": 1.6827658414840698, + "learning_rate": 4.851581687705147e-05, + "loss": 5.2637, + "step": 18536 + }, + { + "epoch": 0.11024479017984584, + "grad_norm": 1.6638895273208618, + "learning_rate": 4.8515658327448184e-05, + "loss": 5.3758, + "step": 18537 + }, + { + "epoch": 0.11025073746312684, + "grad_norm": 1.3794528245925903, + "learning_rate": 4.8515499769635824e-05, + "loss": 5.1398, + "step": 18538 + }, + { + "epoch": 0.11025668474640785, + "grad_norm": 1.7829253673553467, + "learning_rate": 4.8515341203614454e-05, + "loss": 5.8449, + "step": 18539 + }, + { + "epoch": 0.11026263202968883, + "grad_norm": 1.9193391799926758, + "learning_rate": 4.85151826293841e-05, + "loss": 5.6113, + "step": 18540 + }, + { + "epoch": 0.11026857931296984, + "grad_norm": 1.9315286874771118, + "learning_rate": 4.851502404694486e-05, + "loss": 5.4341, + "step": 18541 + }, + { + "epoch": 0.11027452659625084, + "grad_norm": 1.8884371519088745, + "learning_rate": 4.851486545629677e-05, + "loss": 5.0711, + "step": 18542 + }, + { + "epoch": 0.11028047387953183, + "grad_norm": 2.104315996170044, + "learning_rate": 4.8514706857439866e-05, + "loss": 4.7431, + "step": 18543 + }, + { + "epoch": 0.11028642116281283, + "grad_norm": 1.9781455993652344, + "learning_rate": 4.8514548250374234e-05, + "loss": 4.9088, + "step": 18544 + }, + { + "epoch": 0.11029236844609383, + "grad_norm": 2.0802392959594727, + "learning_rate": 4.851438963509991e-05, + "loss": 4.8418, + "step": 18545 + }, + { + "epoch": 0.11029831572937482, + "grad_norm": 2.1856627464294434, + "learning_rate": 4.851423101161696e-05, + "loss": 5.5758, + "step": 18546 + }, + { + "epoch": 0.11030426301265582, + "grad_norm": 1.578050971031189, + "learning_rate": 4.851407237992543e-05, + "loss": 5.2795, + "step": 18547 + }, + { + "epoch": 0.11031021029593682, + "grad_norm": 2.241647720336914, + "learning_rate": 4.8513913740025376e-05, + "loss": 4.7807, + "step": 18548 + }, + { + "epoch": 0.11031615757921781, + "grad_norm": 2.102911949157715, + "learning_rate": 4.851375509191687e-05, + "loss": 5.1933, + "step": 18549 + }, + { + "epoch": 0.11032210486249881, + "grad_norm": 1.7198251485824585, + "learning_rate": 4.851359643559995e-05, + "loss": 5.273, + "step": 18550 + }, + { + "epoch": 0.11032805214577981, + "grad_norm": 1.6389858722686768, + "learning_rate": 4.8513437771074675e-05, + "loss": 5.7741, + "step": 18551 + }, + { + "epoch": 0.1103339994290608, + "grad_norm": 1.3120185136795044, + "learning_rate": 4.8513279098341106e-05, + "loss": 5.6433, + "step": 18552 + }, + { + "epoch": 0.1103399467123418, + "grad_norm": 2.6182525157928467, + "learning_rate": 4.8513120417399286e-05, + "loss": 5.2905, + "step": 18553 + }, + { + "epoch": 0.11034589399562279, + "grad_norm": 2.8740553855895996, + "learning_rate": 4.851296172824928e-05, + "loss": 5.0364, + "step": 18554 + }, + { + "epoch": 0.1103518412789038, + "grad_norm": 2.126779794692993, + "learning_rate": 4.851280303089115e-05, + "loss": 4.8801, + "step": 18555 + }, + { + "epoch": 0.1103577885621848, + "grad_norm": 2.2658486366271973, + "learning_rate": 4.851264432532493e-05, + "loss": 5.0411, + "step": 18556 + }, + { + "epoch": 0.11036373584546579, + "grad_norm": 2.2387850284576416, + "learning_rate": 4.8512485611550706e-05, + "loss": 5.048, + "step": 18557 + }, + { + "epoch": 0.11036968312874679, + "grad_norm": 2.5402557849884033, + "learning_rate": 4.851232688956851e-05, + "loss": 5.2581, + "step": 18558 + }, + { + "epoch": 0.11037563041202779, + "grad_norm": 1.9275699853897095, + "learning_rate": 4.8512168159378396e-05, + "loss": 5.765, + "step": 18559 + }, + { + "epoch": 0.11038157769530878, + "grad_norm": 1.6632050275802612, + "learning_rate": 4.8512009420980434e-05, + "loss": 5.9928, + "step": 18560 + }, + { + "epoch": 0.11038752497858978, + "grad_norm": 1.9383779764175415, + "learning_rate": 4.851185067437467e-05, + "loss": 5.306, + "step": 18561 + }, + { + "epoch": 0.11039347226187078, + "grad_norm": 1.6358258724212646, + "learning_rate": 4.851169191956117e-05, + "loss": 5.4039, + "step": 18562 + }, + { + "epoch": 0.11039941954515177, + "grad_norm": 1.625636339187622, + "learning_rate": 4.851153315653997e-05, + "loss": 5.5028, + "step": 18563 + }, + { + "epoch": 0.11040536682843277, + "grad_norm": 1.8142133951187134, + "learning_rate": 4.8511374385311134e-05, + "loss": 5.3636, + "step": 18564 + }, + { + "epoch": 0.11041131411171377, + "grad_norm": 1.778742790222168, + "learning_rate": 4.8511215605874724e-05, + "loss": 5.9869, + "step": 18565 + }, + { + "epoch": 0.11041726139499476, + "grad_norm": 1.7027266025543213, + "learning_rate": 4.8511056818230795e-05, + "loss": 5.9855, + "step": 18566 + }, + { + "epoch": 0.11042320867827576, + "grad_norm": 1.8098080158233643, + "learning_rate": 4.85108980223794e-05, + "loss": 5.3241, + "step": 18567 + }, + { + "epoch": 0.11042915596155677, + "grad_norm": 2.058525562286377, + "learning_rate": 4.851073921832059e-05, + "loss": 5.3369, + "step": 18568 + }, + { + "epoch": 0.11043510324483775, + "grad_norm": 1.6393969058990479, + "learning_rate": 4.851058040605443e-05, + "loss": 5.234, + "step": 18569 + }, + { + "epoch": 0.11044105052811876, + "grad_norm": 1.7245092391967773, + "learning_rate": 4.8510421585580954e-05, + "loss": 5.3252, + "step": 18570 + }, + { + "epoch": 0.11044699781139976, + "grad_norm": 1.7108781337738037, + "learning_rate": 4.851026275690025e-05, + "loss": 5.342, + "step": 18571 + }, + { + "epoch": 0.11045294509468075, + "grad_norm": 1.6860250234603882, + "learning_rate": 4.8510103920012354e-05, + "loss": 5.1265, + "step": 18572 + }, + { + "epoch": 0.11045889237796175, + "grad_norm": 1.4939595460891724, + "learning_rate": 4.850994507491731e-05, + "loss": 4.995, + "step": 18573 + }, + { + "epoch": 0.11046483966124275, + "grad_norm": 1.6137492656707764, + "learning_rate": 4.85097862216152e-05, + "loss": 5.0099, + "step": 18574 + }, + { + "epoch": 0.11047078694452374, + "grad_norm": 1.8155491352081299, + "learning_rate": 4.850962736010606e-05, + "loss": 4.965, + "step": 18575 + }, + { + "epoch": 0.11047673422780474, + "grad_norm": 1.6313834190368652, + "learning_rate": 4.8509468490389955e-05, + "loss": 5.1881, + "step": 18576 + }, + { + "epoch": 0.11048268151108574, + "grad_norm": 1.9885855913162231, + "learning_rate": 4.850930961246694e-05, + "loss": 4.9172, + "step": 18577 + }, + { + "epoch": 0.11048862879436673, + "grad_norm": 1.7815529108047485, + "learning_rate": 4.850915072633706e-05, + "loss": 5.2431, + "step": 18578 + }, + { + "epoch": 0.11049457607764773, + "grad_norm": 1.496060848236084, + "learning_rate": 4.8508991832000384e-05, + "loss": 5.0222, + "step": 18579 + }, + { + "epoch": 0.11050052336092873, + "grad_norm": 1.76019287109375, + "learning_rate": 4.850883292945696e-05, + "loss": 5.1522, + "step": 18580 + }, + { + "epoch": 0.11050647064420972, + "grad_norm": 1.6975457668304443, + "learning_rate": 4.8508674018706845e-05, + "loss": 5.0687, + "step": 18581 + }, + { + "epoch": 0.11051241792749072, + "grad_norm": 2.056002378463745, + "learning_rate": 4.85085150997501e-05, + "loss": 5.0267, + "step": 18582 + }, + { + "epoch": 0.11051836521077171, + "grad_norm": 1.8109005689620972, + "learning_rate": 4.850835617258677e-05, + "loss": 5.7661, + "step": 18583 + }, + { + "epoch": 0.11052431249405271, + "grad_norm": 1.762326717376709, + "learning_rate": 4.850819723721692e-05, + "loss": 5.8038, + "step": 18584 + }, + { + "epoch": 0.11053025977733372, + "grad_norm": 1.5169013738632202, + "learning_rate": 4.85080382936406e-05, + "loss": 5.7988, + "step": 18585 + }, + { + "epoch": 0.1105362070606147, + "grad_norm": 1.7740446329116821, + "learning_rate": 4.850787934185786e-05, + "loss": 5.5388, + "step": 18586 + }, + { + "epoch": 0.11054215434389571, + "grad_norm": 1.560950756072998, + "learning_rate": 4.850772038186877e-05, + "loss": 5.406, + "step": 18587 + }, + { + "epoch": 0.11054810162717671, + "grad_norm": 1.6391148567199707, + "learning_rate": 4.850756141367338e-05, + "loss": 5.4669, + "step": 18588 + }, + { + "epoch": 0.1105540489104577, + "grad_norm": 1.5571023225784302, + "learning_rate": 4.8507402437271734e-05, + "loss": 5.6556, + "step": 18589 + }, + { + "epoch": 0.1105599961937387, + "grad_norm": 1.5374432802200317, + "learning_rate": 4.85072434526639e-05, + "loss": 5.7617, + "step": 18590 + }, + { + "epoch": 0.1105659434770197, + "grad_norm": 1.4683212041854858, + "learning_rate": 4.850708445984993e-05, + "loss": 5.5074, + "step": 18591 + }, + { + "epoch": 0.11057189076030069, + "grad_norm": 1.6689101457595825, + "learning_rate": 4.850692545882988e-05, + "loss": 5.3259, + "step": 18592 + }, + { + "epoch": 0.11057783804358169, + "grad_norm": 1.394108533859253, + "learning_rate": 4.85067664496038e-05, + "loss": 5.1686, + "step": 18593 + }, + { + "epoch": 0.1105837853268627, + "grad_norm": 1.7093585729599, + "learning_rate": 4.850660743217176e-05, + "loss": 5.6622, + "step": 18594 + }, + { + "epoch": 0.11058973261014368, + "grad_norm": 1.6189805269241333, + "learning_rate": 4.85064484065338e-05, + "loss": 5.6855, + "step": 18595 + }, + { + "epoch": 0.11059567989342468, + "grad_norm": 1.5303481817245483, + "learning_rate": 4.850628937268999e-05, + "loss": 5.8242, + "step": 18596 + }, + { + "epoch": 0.11060162717670569, + "grad_norm": 1.6557955741882324, + "learning_rate": 4.850613033064037e-05, + "loss": 5.4924, + "step": 18597 + }, + { + "epoch": 0.11060757445998667, + "grad_norm": 1.5280576944351196, + "learning_rate": 4.8505971280385e-05, + "loss": 5.6122, + "step": 18598 + }, + { + "epoch": 0.11061352174326768, + "grad_norm": 1.3656830787658691, + "learning_rate": 4.8505812221923945e-05, + "loss": 5.5282, + "step": 18599 + }, + { + "epoch": 0.11061946902654868, + "grad_norm": 1.3605096340179443, + "learning_rate": 4.850565315525725e-05, + "loss": 5.0747, + "step": 18600 + }, + { + "epoch": 0.11062541630982967, + "grad_norm": 2.120056390762329, + "learning_rate": 4.850549408038498e-05, + "loss": 5.1559, + "step": 18601 + }, + { + "epoch": 0.11063136359311067, + "grad_norm": 2.14626145362854, + "learning_rate": 4.850533499730718e-05, + "loss": 4.9778, + "step": 18602 + }, + { + "epoch": 0.11063731087639167, + "grad_norm": 2.1857240200042725, + "learning_rate": 4.8505175906023916e-05, + "loss": 4.8555, + "step": 18603 + }, + { + "epoch": 0.11064325815967266, + "grad_norm": 1.6636399030685425, + "learning_rate": 4.850501680653523e-05, + "loss": 5.3488, + "step": 18604 + }, + { + "epoch": 0.11064920544295366, + "grad_norm": 1.669511079788208, + "learning_rate": 4.8504857698841185e-05, + "loss": 5.2697, + "step": 18605 + }, + { + "epoch": 0.11065515272623466, + "grad_norm": 2.1935081481933594, + "learning_rate": 4.850469858294184e-05, + "loss": 4.4319, + "step": 18606 + }, + { + "epoch": 0.11066110000951565, + "grad_norm": 2.2359724044799805, + "learning_rate": 4.850453945883725e-05, + "loss": 4.2343, + "step": 18607 + }, + { + "epoch": 0.11066704729279665, + "grad_norm": 2.278247594833374, + "learning_rate": 4.850438032652747e-05, + "loss": 4.4955, + "step": 18608 + }, + { + "epoch": 0.11067299457607765, + "grad_norm": 2.3036160469055176, + "learning_rate": 4.850422118601254e-05, + "loss": 4.9122, + "step": 18609 + }, + { + "epoch": 0.11067894185935864, + "grad_norm": 2.3913469314575195, + "learning_rate": 4.850406203729254e-05, + "loss": 4.4703, + "step": 18610 + }, + { + "epoch": 0.11068488914263964, + "grad_norm": 1.9795238971710205, + "learning_rate": 4.8503902880367516e-05, + "loss": 4.7099, + "step": 18611 + }, + { + "epoch": 0.11069083642592063, + "grad_norm": 2.3990728855133057, + "learning_rate": 4.850374371523752e-05, + "loss": 4.3833, + "step": 18612 + }, + { + "epoch": 0.11069678370920163, + "grad_norm": 2.429461717605591, + "learning_rate": 4.850358454190261e-05, + "loss": 4.4279, + "step": 18613 + }, + { + "epoch": 0.11070273099248264, + "grad_norm": 2.598304271697998, + "learning_rate": 4.8503425360362845e-05, + "loss": 4.4376, + "step": 18614 + }, + { + "epoch": 0.11070867827576363, + "grad_norm": 2.3201403617858887, + "learning_rate": 4.850326617061827e-05, + "loss": 4.6822, + "step": 18615 + }, + { + "epoch": 0.11071462555904463, + "grad_norm": 1.8401033878326416, + "learning_rate": 4.8503106972668956e-05, + "loss": 5.1109, + "step": 18616 + }, + { + "epoch": 0.11072057284232563, + "grad_norm": 1.772309422492981, + "learning_rate": 4.850294776651494e-05, + "loss": 5.7237, + "step": 18617 + }, + { + "epoch": 0.11072652012560662, + "grad_norm": 1.7160669565200806, + "learning_rate": 4.8502788552156295e-05, + "loss": 5.7218, + "step": 18618 + }, + { + "epoch": 0.11073246740888762, + "grad_norm": 1.5467272996902466, + "learning_rate": 4.850262932959306e-05, + "loss": 5.4169, + "step": 18619 + }, + { + "epoch": 0.11073841469216862, + "grad_norm": 1.3382668495178223, + "learning_rate": 4.8502470098825316e-05, + "loss": 5.1243, + "step": 18620 + }, + { + "epoch": 0.11074436197544961, + "grad_norm": 1.3461776971817017, + "learning_rate": 4.850231085985309e-05, + "loss": 4.9412, + "step": 18621 + }, + { + "epoch": 0.11075030925873061, + "grad_norm": 1.4207700490951538, + "learning_rate": 4.850215161267646e-05, + "loss": 5.4449, + "step": 18622 + }, + { + "epoch": 0.11075625654201161, + "grad_norm": 1.7271502017974854, + "learning_rate": 4.8501992357295454e-05, + "loss": 5.4579, + "step": 18623 + }, + { + "epoch": 0.1107622038252926, + "grad_norm": 1.753090500831604, + "learning_rate": 4.8501833093710156e-05, + "loss": 5.7577, + "step": 18624 + }, + { + "epoch": 0.1107681511085736, + "grad_norm": 1.3730309009552002, + "learning_rate": 4.850167382192062e-05, + "loss": 5.3646, + "step": 18625 + }, + { + "epoch": 0.1107740983918546, + "grad_norm": 1.4723306894302368, + "learning_rate": 4.8501514541926883e-05, + "loss": 4.8234, + "step": 18626 + }, + { + "epoch": 0.1107800456751356, + "grad_norm": 1.3944339752197266, + "learning_rate": 4.850135525372901e-05, + "loss": 4.805, + "step": 18627 + }, + { + "epoch": 0.1107859929584166, + "grad_norm": 1.1402732133865356, + "learning_rate": 4.850119595732706e-05, + "loss": 4.9865, + "step": 18628 + }, + { + "epoch": 0.1107919402416976, + "grad_norm": 1.0595287084579468, + "learning_rate": 4.850103665272108e-05, + "loss": 4.9961, + "step": 18629 + }, + { + "epoch": 0.11079788752497859, + "grad_norm": 1.445143699645996, + "learning_rate": 4.8500877339911136e-05, + "loss": 5.2089, + "step": 18630 + }, + { + "epoch": 0.11080383480825959, + "grad_norm": 2.2014050483703613, + "learning_rate": 4.8500718018897275e-05, + "loss": 4.7445, + "step": 18631 + }, + { + "epoch": 0.11080978209154059, + "grad_norm": 2.117194890975952, + "learning_rate": 4.850055868967956e-05, + "loss": 4.8755, + "step": 18632 + }, + { + "epoch": 0.11081572937482158, + "grad_norm": 1.82968008518219, + "learning_rate": 4.850039935225804e-05, + "loss": 4.8852, + "step": 18633 + }, + { + "epoch": 0.11082167665810258, + "grad_norm": 1.613770842552185, + "learning_rate": 4.8500240006632766e-05, + "loss": 5.1053, + "step": 18634 + }, + { + "epoch": 0.11082762394138358, + "grad_norm": 1.8672553300857544, + "learning_rate": 4.850008065280381e-05, + "loss": 4.7134, + "step": 18635 + }, + { + "epoch": 0.11083357122466457, + "grad_norm": 1.9933403730392456, + "learning_rate": 4.849992129077122e-05, + "loss": 4.7544, + "step": 18636 + }, + { + "epoch": 0.11083951850794557, + "grad_norm": 1.8642876148223877, + "learning_rate": 4.849976192053505e-05, + "loss": 4.6598, + "step": 18637 + }, + { + "epoch": 0.11084546579122657, + "grad_norm": 1.8983674049377441, + "learning_rate": 4.849960254209536e-05, + "loss": 4.7403, + "step": 18638 + }, + { + "epoch": 0.11085141307450756, + "grad_norm": 1.9882328510284424, + "learning_rate": 4.849944315545219e-05, + "loss": 5.0105, + "step": 18639 + }, + { + "epoch": 0.11085736035778856, + "grad_norm": 1.7971723079681396, + "learning_rate": 4.8499283760605614e-05, + "loss": 5.6138, + "step": 18640 + }, + { + "epoch": 0.11086330764106955, + "grad_norm": 1.5002641677856445, + "learning_rate": 4.849912435755568e-05, + "loss": 5.7336, + "step": 18641 + }, + { + "epoch": 0.11086925492435055, + "grad_norm": 1.412880778312683, + "learning_rate": 4.8498964946302436e-05, + "loss": 5.532, + "step": 18642 + }, + { + "epoch": 0.11087520220763156, + "grad_norm": 1.6482197046279907, + "learning_rate": 4.849880552684596e-05, + "loss": 5.5432, + "step": 18643 + }, + { + "epoch": 0.11088114949091255, + "grad_norm": 1.5852200984954834, + "learning_rate": 4.849864609918629e-05, + "loss": 5.3577, + "step": 18644 + }, + { + "epoch": 0.11088709677419355, + "grad_norm": 1.540536642074585, + "learning_rate": 4.849848666332348e-05, + "loss": 5.4983, + "step": 18645 + }, + { + "epoch": 0.11089304405747455, + "grad_norm": 1.7822679281234741, + "learning_rate": 4.849832721925759e-05, + "loss": 5.1427, + "step": 18646 + }, + { + "epoch": 0.11089899134075554, + "grad_norm": 1.722977638244629, + "learning_rate": 4.8498167766988685e-05, + "loss": 5.2759, + "step": 18647 + }, + { + "epoch": 0.11090493862403654, + "grad_norm": 1.7543476819992065, + "learning_rate": 4.8498008306516806e-05, + "loss": 5.2616, + "step": 18648 + }, + { + "epoch": 0.11091088590731754, + "grad_norm": 1.4882584810256958, + "learning_rate": 4.8497848837842016e-05, + "loss": 5.3781, + "step": 18649 + }, + { + "epoch": 0.11091683319059853, + "grad_norm": 1.7358192205429077, + "learning_rate": 4.849768936096437e-05, + "loss": 5.5262, + "step": 18650 + }, + { + "epoch": 0.11092278047387953, + "grad_norm": 1.6070705652236938, + "learning_rate": 4.849752987588393e-05, + "loss": 5.0576, + "step": 18651 + }, + { + "epoch": 0.11092872775716053, + "grad_norm": 1.7641521692276, + "learning_rate": 4.8497370382600736e-05, + "loss": 5.21, + "step": 18652 + }, + { + "epoch": 0.11093467504044152, + "grad_norm": 1.8225789070129395, + "learning_rate": 4.849721088111485e-05, + "loss": 6.2734, + "step": 18653 + }, + { + "epoch": 0.11094062232372252, + "grad_norm": 1.8502428531646729, + "learning_rate": 4.849705137142634e-05, + "loss": 5.8298, + "step": 18654 + }, + { + "epoch": 0.11094656960700353, + "grad_norm": 1.4959850311279297, + "learning_rate": 4.8496891853535255e-05, + "loss": 5.4667, + "step": 18655 + }, + { + "epoch": 0.11095251689028451, + "grad_norm": 1.7957161664962769, + "learning_rate": 4.849673232744164e-05, + "loss": 5.3483, + "step": 18656 + }, + { + "epoch": 0.11095846417356552, + "grad_norm": 1.448737382888794, + "learning_rate": 4.8496572793145554e-05, + "loss": 5.4568, + "step": 18657 + }, + { + "epoch": 0.11096441145684652, + "grad_norm": 1.5068676471710205, + "learning_rate": 4.8496413250647065e-05, + "loss": 5.7089, + "step": 18658 + }, + { + "epoch": 0.1109703587401275, + "grad_norm": 1.5162447690963745, + "learning_rate": 4.849625369994622e-05, + "loss": 5.6042, + "step": 18659 + }, + { + "epoch": 0.11097630602340851, + "grad_norm": 1.81594979763031, + "learning_rate": 4.8496094141043076e-05, + "loss": 5.5301, + "step": 18660 + }, + { + "epoch": 0.11098225330668951, + "grad_norm": 1.9147114753723145, + "learning_rate": 4.8495934573937684e-05, + "loss": 4.6335, + "step": 18661 + }, + { + "epoch": 0.1109882005899705, + "grad_norm": 1.4161462783813477, + "learning_rate": 4.8495774998630106e-05, + "loss": 4.9868, + "step": 18662 + }, + { + "epoch": 0.1109941478732515, + "grad_norm": 1.5652790069580078, + "learning_rate": 4.8495615415120396e-05, + "loss": 5.6954, + "step": 18663 + }, + { + "epoch": 0.1110000951565325, + "grad_norm": 1.5217374563217163, + "learning_rate": 4.8495455823408616e-05, + "loss": 5.4338, + "step": 18664 + }, + { + "epoch": 0.11100604243981349, + "grad_norm": 1.3335540294647217, + "learning_rate": 4.8495296223494805e-05, + "loss": 5.4751, + "step": 18665 + }, + { + "epoch": 0.11101198972309449, + "grad_norm": 1.8903460502624512, + "learning_rate": 4.849513661537903e-05, + "loss": 4.9481, + "step": 18666 + }, + { + "epoch": 0.1110179370063755, + "grad_norm": 1.814666748046875, + "learning_rate": 4.849497699906135e-05, + "loss": 5.1422, + "step": 18667 + }, + { + "epoch": 0.11102388428965648, + "grad_norm": 1.7838057279586792, + "learning_rate": 4.8494817374541816e-05, + "loss": 5.3991, + "step": 18668 + }, + { + "epoch": 0.11102983157293748, + "grad_norm": 1.665671944618225, + "learning_rate": 4.849465774182048e-05, + "loss": 5.5362, + "step": 18669 + }, + { + "epoch": 0.11103577885621847, + "grad_norm": 2.255326509475708, + "learning_rate": 4.8494498100897415e-05, + "loss": 5.3161, + "step": 18670 + }, + { + "epoch": 0.11104172613949947, + "grad_norm": 1.7641721963882446, + "learning_rate": 4.849433845177265e-05, + "loss": 5.0422, + "step": 18671 + }, + { + "epoch": 0.11104767342278048, + "grad_norm": 1.4214074611663818, + "learning_rate": 4.8494178794446256e-05, + "loss": 5.2417, + "step": 18672 + }, + { + "epoch": 0.11105362070606146, + "grad_norm": 1.6417256593704224, + "learning_rate": 4.849401912891829e-05, + "loss": 5.262, + "step": 18673 + }, + { + "epoch": 0.11105956798934247, + "grad_norm": 1.4238179922103882, + "learning_rate": 4.84938594551888e-05, + "loss": 5.9754, + "step": 18674 + }, + { + "epoch": 0.11106551527262347, + "grad_norm": 1.9513673782348633, + "learning_rate": 4.849369977325785e-05, + "loss": 5.8917, + "step": 18675 + }, + { + "epoch": 0.11107146255590446, + "grad_norm": 1.625225305557251, + "learning_rate": 4.849354008312549e-05, + "loss": 5.7142, + "step": 18676 + }, + { + "epoch": 0.11107740983918546, + "grad_norm": 1.5306450128555298, + "learning_rate": 4.849338038479178e-05, + "loss": 5.3206, + "step": 18677 + }, + { + "epoch": 0.11108335712246646, + "grad_norm": 2.7895541191101074, + "learning_rate": 4.849322067825677e-05, + "loss": 4.3585, + "step": 18678 + }, + { + "epoch": 0.11108930440574745, + "grad_norm": 2.2688374519348145, + "learning_rate": 4.849306096352052e-05, + "loss": 4.4967, + "step": 18679 + }, + { + "epoch": 0.11109525168902845, + "grad_norm": 2.1710267066955566, + "learning_rate": 4.849290124058309e-05, + "loss": 4.0673, + "step": 18680 + }, + { + "epoch": 0.11110119897230945, + "grad_norm": 2.235142707824707, + "learning_rate": 4.849274150944453e-05, + "loss": 3.8198, + "step": 18681 + }, + { + "epoch": 0.11110714625559044, + "grad_norm": 2.328324317932129, + "learning_rate": 4.849258177010489e-05, + "loss": 4.008, + "step": 18682 + }, + { + "epoch": 0.11111309353887144, + "grad_norm": 2.2681312561035156, + "learning_rate": 4.849242202256424e-05, + "loss": 4.1541, + "step": 18683 + }, + { + "epoch": 0.11111904082215245, + "grad_norm": 2.5430855751037598, + "learning_rate": 4.849226226682262e-05, + "loss": 4.3177, + "step": 18684 + }, + { + "epoch": 0.11112498810543343, + "grad_norm": 2.1995978355407715, + "learning_rate": 4.84921025028801e-05, + "loss": 4.5792, + "step": 18685 + }, + { + "epoch": 0.11113093538871444, + "grad_norm": 1.9515454769134521, + "learning_rate": 4.849194273073673e-05, + "loss": 4.8759, + "step": 18686 + }, + { + "epoch": 0.11113688267199544, + "grad_norm": 2.484431028366089, + "learning_rate": 4.849178295039257e-05, + "loss": 4.1916, + "step": 18687 + }, + { + "epoch": 0.11114282995527643, + "grad_norm": 2.356790065765381, + "learning_rate": 4.8491623161847665e-05, + "loss": 4.38, + "step": 18688 + }, + { + "epoch": 0.11114877723855743, + "grad_norm": 2.414517879486084, + "learning_rate": 4.849146336510207e-05, + "loss": 4.3739, + "step": 18689 + }, + { + "epoch": 0.11115472452183843, + "grad_norm": 2.4129765033721924, + "learning_rate": 4.849130356015587e-05, + "loss": 4.0384, + "step": 18690 + }, + { + "epoch": 0.11116067180511942, + "grad_norm": 2.146932363510132, + "learning_rate": 4.8491143747009074e-05, + "loss": 4.4045, + "step": 18691 + }, + { + "epoch": 0.11116661908840042, + "grad_norm": 2.1945905685424805, + "learning_rate": 4.8490983925661776e-05, + "loss": 5.1674, + "step": 18692 + }, + { + "epoch": 0.11117256637168142, + "grad_norm": 2.2188448905944824, + "learning_rate": 4.849082409611402e-05, + "loss": 4.628, + "step": 18693 + }, + { + "epoch": 0.11117851365496241, + "grad_norm": 1.7684906721115112, + "learning_rate": 4.8490664258365847e-05, + "loss": 5.236, + "step": 18694 + }, + { + "epoch": 0.11118446093824341, + "grad_norm": 2.0367350578308105, + "learning_rate": 4.849050441241734e-05, + "loss": 5.6408, + "step": 18695 + }, + { + "epoch": 0.11119040822152441, + "grad_norm": 2.0829811096191406, + "learning_rate": 4.849034455826853e-05, + "loss": 5.5519, + "step": 18696 + }, + { + "epoch": 0.1111963555048054, + "grad_norm": 1.7884539365768433, + "learning_rate": 4.8490184695919486e-05, + "loss": 5.2345, + "step": 18697 + }, + { + "epoch": 0.1112023027880864, + "grad_norm": 1.8792423009872437, + "learning_rate": 4.849002482537026e-05, + "loss": 4.7622, + "step": 18698 + }, + { + "epoch": 0.11120825007136739, + "grad_norm": 1.7493008375167847, + "learning_rate": 4.8489864946620914e-05, + "loss": 5.295, + "step": 18699 + }, + { + "epoch": 0.1112141973546484, + "grad_norm": 1.60455322265625, + "learning_rate": 4.84897050596715e-05, + "loss": 5.5708, + "step": 18700 + }, + { + "epoch": 0.1112201446379294, + "grad_norm": 1.4326173067092896, + "learning_rate": 4.848954516452206e-05, + "loss": 5.9185, + "step": 18701 + }, + { + "epoch": 0.11122609192121038, + "grad_norm": 1.6318118572235107, + "learning_rate": 4.8489385261172685e-05, + "loss": 5.6545, + "step": 18702 + }, + { + "epoch": 0.11123203920449139, + "grad_norm": 1.4083906412124634, + "learning_rate": 4.848922534962339e-05, + "loss": 5.4776, + "step": 18703 + }, + { + "epoch": 0.11123798648777239, + "grad_norm": 1.222609519958496, + "learning_rate": 4.8489065429874256e-05, + "loss": 5.5094, + "step": 18704 + }, + { + "epoch": 0.11124393377105338, + "grad_norm": 1.6955020427703857, + "learning_rate": 4.848890550192533e-05, + "loss": 5.0516, + "step": 18705 + }, + { + "epoch": 0.11124988105433438, + "grad_norm": 1.3875632286071777, + "learning_rate": 4.848874556577667e-05, + "loss": 5.5321, + "step": 18706 + }, + { + "epoch": 0.11125582833761538, + "grad_norm": 1.2538158893585205, + "learning_rate": 4.848858562142833e-05, + "loss": 5.464, + "step": 18707 + }, + { + "epoch": 0.11126177562089637, + "grad_norm": 1.7350475788116455, + "learning_rate": 4.8488425668880366e-05, + "loss": 5.2815, + "step": 18708 + }, + { + "epoch": 0.11126772290417737, + "grad_norm": 1.543989658355713, + "learning_rate": 4.848826570813284e-05, + "loss": 5.4817, + "step": 18709 + }, + { + "epoch": 0.11127367018745837, + "grad_norm": 1.3931440114974976, + "learning_rate": 4.8488105739185807e-05, + "loss": 5.7652, + "step": 18710 + }, + { + "epoch": 0.11127961747073936, + "grad_norm": 1.4630471467971802, + "learning_rate": 4.8487945762039314e-05, + "loss": 5.4886, + "step": 18711 + }, + { + "epoch": 0.11128556475402036, + "grad_norm": 1.338161826133728, + "learning_rate": 4.848778577669342e-05, + "loss": 5.2021, + "step": 18712 + }, + { + "epoch": 0.11129151203730137, + "grad_norm": 1.4282599687576294, + "learning_rate": 4.8487625783148186e-05, + "loss": 5.2767, + "step": 18713 + }, + { + "epoch": 0.11129745932058235, + "grad_norm": 1.4386523962020874, + "learning_rate": 4.848746578140366e-05, + "loss": 5.7286, + "step": 18714 + }, + { + "epoch": 0.11130340660386336, + "grad_norm": 1.2272754907608032, + "learning_rate": 4.84873057714599e-05, + "loss": 5.3609, + "step": 18715 + }, + { + "epoch": 0.11130935388714436, + "grad_norm": 1.8362592458724976, + "learning_rate": 4.848714575331697e-05, + "loss": 5.0494, + "step": 18716 + }, + { + "epoch": 0.11131530117042535, + "grad_norm": 2.098970651626587, + "learning_rate": 4.848698572697492e-05, + "loss": 4.8282, + "step": 18717 + }, + { + "epoch": 0.11132124845370635, + "grad_norm": 2.2145583629608154, + "learning_rate": 4.84868256924338e-05, + "loss": 4.4621, + "step": 18718 + }, + { + "epoch": 0.11132719573698735, + "grad_norm": 1.8036415576934814, + "learning_rate": 4.848666564969368e-05, + "loss": 5.374, + "step": 18719 + }, + { + "epoch": 0.11133314302026834, + "grad_norm": 1.5794750452041626, + "learning_rate": 4.8486505598754605e-05, + "loss": 5.6246, + "step": 18720 + }, + { + "epoch": 0.11133909030354934, + "grad_norm": 1.637068510055542, + "learning_rate": 4.848634553961664e-05, + "loss": 5.4506, + "step": 18721 + }, + { + "epoch": 0.11134503758683034, + "grad_norm": 1.6928807497024536, + "learning_rate": 4.8486185472279824e-05, + "loss": 5.2405, + "step": 18722 + }, + { + "epoch": 0.11135098487011133, + "grad_norm": 2.0931332111358643, + "learning_rate": 4.848602539674422e-05, + "loss": 4.9366, + "step": 18723 + }, + { + "epoch": 0.11135693215339233, + "grad_norm": 1.4645583629608154, + "learning_rate": 4.848586531300989e-05, + "loss": 5.0677, + "step": 18724 + }, + { + "epoch": 0.11136287943667333, + "grad_norm": 1.7817938327789307, + "learning_rate": 4.8485705221076896e-05, + "loss": 5.5975, + "step": 18725 + }, + { + "epoch": 0.11136882671995432, + "grad_norm": 1.7167946100234985, + "learning_rate": 4.848554512094528e-05, + "loss": 5.829, + "step": 18726 + }, + { + "epoch": 0.11137477400323532, + "grad_norm": 1.723574161529541, + "learning_rate": 4.8485385012615106e-05, + "loss": 5.2702, + "step": 18727 + }, + { + "epoch": 0.11138072128651631, + "grad_norm": 1.4848002195358276, + "learning_rate": 4.848522489608642e-05, + "loss": 5.6739, + "step": 18728 + }, + { + "epoch": 0.11138666856979731, + "grad_norm": 1.798085331916809, + "learning_rate": 4.848506477135929e-05, + "loss": 5.7314, + "step": 18729 + }, + { + "epoch": 0.11139261585307832, + "grad_norm": 1.7033846378326416, + "learning_rate": 4.848490463843376e-05, + "loss": 5.531, + "step": 18730 + }, + { + "epoch": 0.1113985631363593, + "grad_norm": 1.64686119556427, + "learning_rate": 4.8484744497309896e-05, + "loss": 5.8325, + "step": 18731 + }, + { + "epoch": 0.1114045104196403, + "grad_norm": 1.9923123121261597, + "learning_rate": 4.8484584347987755e-05, + "loss": 5.9614, + "step": 18732 + }, + { + "epoch": 0.11141045770292131, + "grad_norm": 1.768896460533142, + "learning_rate": 4.8484424190467385e-05, + "loss": 5.9892, + "step": 18733 + }, + { + "epoch": 0.1114164049862023, + "grad_norm": 1.5981477499008179, + "learning_rate": 4.848426402474885e-05, + "loss": 5.6239, + "step": 18734 + }, + { + "epoch": 0.1114223522694833, + "grad_norm": 1.8919446468353271, + "learning_rate": 4.848410385083219e-05, + "loss": 5.7437, + "step": 18735 + }, + { + "epoch": 0.1114282995527643, + "grad_norm": 2.2705752849578857, + "learning_rate": 4.848394366871748e-05, + "loss": 4.5999, + "step": 18736 + }, + { + "epoch": 0.11143424683604529, + "grad_norm": 1.8626762628555298, + "learning_rate": 4.848378347840476e-05, + "loss": 5.5706, + "step": 18737 + }, + { + "epoch": 0.11144019411932629, + "grad_norm": 1.5893161296844482, + "learning_rate": 4.84836232798941e-05, + "loss": 5.4011, + "step": 18738 + }, + { + "epoch": 0.1114461414026073, + "grad_norm": 1.3441518545150757, + "learning_rate": 4.8483463073185554e-05, + "loss": 5.2412, + "step": 18739 + }, + { + "epoch": 0.11145208868588828, + "grad_norm": 1.6281975507736206, + "learning_rate": 4.848330285827917e-05, + "loss": 5.4281, + "step": 18740 + }, + { + "epoch": 0.11145803596916928, + "grad_norm": 2.1942298412323, + "learning_rate": 4.8483142635175e-05, + "loss": 5.6202, + "step": 18741 + }, + { + "epoch": 0.11146398325245029, + "grad_norm": 2.086764097213745, + "learning_rate": 4.848298240387311e-05, + "loss": 5.665, + "step": 18742 + }, + { + "epoch": 0.11146993053573127, + "grad_norm": 2.0656285285949707, + "learning_rate": 4.848282216437356e-05, + "loss": 5.5196, + "step": 18743 + }, + { + "epoch": 0.11147587781901228, + "grad_norm": 1.5579513311386108, + "learning_rate": 4.84826619166764e-05, + "loss": 5.7366, + "step": 18744 + }, + { + "epoch": 0.11148182510229328, + "grad_norm": 1.7952065467834473, + "learning_rate": 4.848250166078168e-05, + "loss": 5.8041, + "step": 18745 + }, + { + "epoch": 0.11148777238557427, + "grad_norm": 1.3523657321929932, + "learning_rate": 4.848234139668947e-05, + "loss": 5.6628, + "step": 18746 + }, + { + "epoch": 0.11149371966885527, + "grad_norm": 1.6833933591842651, + "learning_rate": 4.848218112439981e-05, + "loss": 5.5285, + "step": 18747 + }, + { + "epoch": 0.11149966695213627, + "grad_norm": 1.308733344078064, + "learning_rate": 4.848202084391276e-05, + "loss": 5.9953, + "step": 18748 + }, + { + "epoch": 0.11150561423541726, + "grad_norm": 1.3434252738952637, + "learning_rate": 4.848186055522838e-05, + "loss": 5.8267, + "step": 18749 + }, + { + "epoch": 0.11151156151869826, + "grad_norm": 1.6250263452529907, + "learning_rate": 4.848170025834673e-05, + "loss": 4.964, + "step": 18750 + }, + { + "epoch": 0.11151750880197926, + "grad_norm": 1.4924334287643433, + "learning_rate": 4.848153995326786e-05, + "loss": 4.9072, + "step": 18751 + }, + { + "epoch": 0.11152345608526025, + "grad_norm": 1.5650702714920044, + "learning_rate": 4.8481379639991826e-05, + "loss": 5.8793, + "step": 18752 + }, + { + "epoch": 0.11152940336854125, + "grad_norm": 1.488553762435913, + "learning_rate": 4.848121931851868e-05, + "loss": 5.823, + "step": 18753 + }, + { + "epoch": 0.11153535065182225, + "grad_norm": 1.5356508493423462, + "learning_rate": 4.848105898884849e-05, + "loss": 5.7632, + "step": 18754 + }, + { + "epoch": 0.11154129793510324, + "grad_norm": 1.5389797687530518, + "learning_rate": 4.8480898650981296e-05, + "loss": 5.8662, + "step": 18755 + }, + { + "epoch": 0.11154724521838424, + "grad_norm": 1.3963713645935059, + "learning_rate": 4.848073830491717e-05, + "loss": 5.5647, + "step": 18756 + }, + { + "epoch": 0.11155319250166523, + "grad_norm": 1.3739324808120728, + "learning_rate": 4.848057795065617e-05, + "loss": 5.6686, + "step": 18757 + }, + { + "epoch": 0.11155913978494623, + "grad_norm": 1.2932708263397217, + "learning_rate": 4.848041758819833e-05, + "loss": 5.6567, + "step": 18758 + }, + { + "epoch": 0.11156508706822724, + "grad_norm": 1.3388581275939941, + "learning_rate": 4.848025721754372e-05, + "loss": 5.6111, + "step": 18759 + }, + { + "epoch": 0.11157103435150822, + "grad_norm": 1.28604257106781, + "learning_rate": 4.84800968386924e-05, + "loss": 5.633, + "step": 18760 + }, + { + "epoch": 0.11157698163478923, + "grad_norm": 2.0710771083831787, + "learning_rate": 4.847993645164441e-05, + "loss": 5.1686, + "step": 18761 + }, + { + "epoch": 0.11158292891807023, + "grad_norm": 1.8022092580795288, + "learning_rate": 4.847977605639983e-05, + "loss": 5.6373, + "step": 18762 + }, + { + "epoch": 0.11158887620135122, + "grad_norm": 1.7080397605895996, + "learning_rate": 4.84796156529587e-05, + "loss": 5.5389, + "step": 18763 + }, + { + "epoch": 0.11159482348463222, + "grad_norm": 1.3582305908203125, + "learning_rate": 4.847945524132107e-05, + "loss": 5.5574, + "step": 18764 + }, + { + "epoch": 0.11160077076791322, + "grad_norm": 1.9037936925888062, + "learning_rate": 4.8479294821487015e-05, + "loss": 5.2108, + "step": 18765 + }, + { + "epoch": 0.11160671805119421, + "grad_norm": 1.6884709596633911, + "learning_rate": 4.8479134393456576e-05, + "loss": 5.2462, + "step": 18766 + }, + { + "epoch": 0.11161266533447521, + "grad_norm": 1.720261812210083, + "learning_rate": 4.8478973957229813e-05, + "loss": 5.5132, + "step": 18767 + }, + { + "epoch": 0.11161861261775621, + "grad_norm": 2.1769275665283203, + "learning_rate": 4.847881351280679e-05, + "loss": 5.1169, + "step": 18768 + }, + { + "epoch": 0.1116245599010372, + "grad_norm": 1.8593683242797852, + "learning_rate": 4.847865306018754e-05, + "loss": 4.8812, + "step": 18769 + }, + { + "epoch": 0.1116305071843182, + "grad_norm": 1.9496150016784668, + "learning_rate": 4.8478492599372147e-05, + "loss": 4.8244, + "step": 18770 + }, + { + "epoch": 0.1116364544675992, + "grad_norm": 1.584330677986145, + "learning_rate": 4.8478332130360655e-05, + "loss": 4.769, + "step": 18771 + }, + { + "epoch": 0.1116424017508802, + "grad_norm": 1.5987087488174438, + "learning_rate": 4.8478171653153116e-05, + "loss": 4.8385, + "step": 18772 + }, + { + "epoch": 0.1116483490341612, + "grad_norm": 1.919463038444519, + "learning_rate": 4.847801116774959e-05, + "loss": 4.7365, + "step": 18773 + }, + { + "epoch": 0.1116542963174422, + "grad_norm": 1.8708561658859253, + "learning_rate": 4.847785067415014e-05, + "loss": 4.9067, + "step": 18774 + }, + { + "epoch": 0.11166024360072319, + "grad_norm": 1.778316617012024, + "learning_rate": 4.8477690172354804e-05, + "loss": 4.8213, + "step": 18775 + }, + { + "epoch": 0.11166619088400419, + "grad_norm": 1.7170525789260864, + "learning_rate": 4.8477529662363655e-05, + "loss": 4.7115, + "step": 18776 + }, + { + "epoch": 0.11167213816728519, + "grad_norm": 1.6704293489456177, + "learning_rate": 4.847736914417674e-05, + "loss": 4.5814, + "step": 18777 + }, + { + "epoch": 0.11167808545056618, + "grad_norm": 1.7422312498092651, + "learning_rate": 4.847720861779412e-05, + "loss": 4.6206, + "step": 18778 + }, + { + "epoch": 0.11168403273384718, + "grad_norm": 1.7162894010543823, + "learning_rate": 4.8477048083215845e-05, + "loss": 4.6421, + "step": 18779 + }, + { + "epoch": 0.11168998001712818, + "grad_norm": 1.7825870513916016, + "learning_rate": 4.847688754044199e-05, + "loss": 4.6899, + "step": 18780 + }, + { + "epoch": 0.11169592730040917, + "grad_norm": 1.8103221654891968, + "learning_rate": 4.8476726989472577e-05, + "loss": 4.5619, + "step": 18781 + }, + { + "epoch": 0.11170187458369017, + "grad_norm": 1.8276532888412476, + "learning_rate": 4.847656643030769e-05, + "loss": 4.3429, + "step": 18782 + }, + { + "epoch": 0.11170782186697117, + "grad_norm": 1.7625696659088135, + "learning_rate": 4.847640586294737e-05, + "loss": 4.4154, + "step": 18783 + }, + { + "epoch": 0.11171376915025216, + "grad_norm": 1.842450499534607, + "learning_rate": 4.8476245287391684e-05, + "loss": 4.6279, + "step": 18784 + }, + { + "epoch": 0.11171971643353316, + "grad_norm": 1.879961371421814, + "learning_rate": 4.847608470364069e-05, + "loss": 4.4906, + "step": 18785 + }, + { + "epoch": 0.11172566371681415, + "grad_norm": 1.5556871891021729, + "learning_rate": 4.847592411169443e-05, + "loss": 5.0258, + "step": 18786 + }, + { + "epoch": 0.11173161100009515, + "grad_norm": 1.8000839948654175, + "learning_rate": 4.8475763511552965e-05, + "loss": 4.4746, + "step": 18787 + }, + { + "epoch": 0.11173755828337616, + "grad_norm": 1.4234516620635986, + "learning_rate": 4.847560290321636e-05, + "loss": 5.4744, + "step": 18788 + }, + { + "epoch": 0.11174350556665714, + "grad_norm": 1.5717182159423828, + "learning_rate": 4.847544228668466e-05, + "loss": 5.4368, + "step": 18789 + }, + { + "epoch": 0.11174945284993815, + "grad_norm": 1.3514728546142578, + "learning_rate": 4.847528166195793e-05, + "loss": 5.3036, + "step": 18790 + }, + { + "epoch": 0.11175540013321915, + "grad_norm": 1.4620373249053955, + "learning_rate": 4.847512102903621e-05, + "loss": 5.2206, + "step": 18791 + }, + { + "epoch": 0.11176134741650014, + "grad_norm": 1.3034706115722656, + "learning_rate": 4.847496038791958e-05, + "loss": 5.3359, + "step": 18792 + }, + { + "epoch": 0.11176729469978114, + "grad_norm": 1.599876046180725, + "learning_rate": 4.847479973860808e-05, + "loss": 5.1282, + "step": 18793 + }, + { + "epoch": 0.11177324198306214, + "grad_norm": 1.4783935546875, + "learning_rate": 4.847463908110177e-05, + "loss": 5.1958, + "step": 18794 + }, + { + "epoch": 0.11177918926634313, + "grad_norm": 1.5132538080215454, + "learning_rate": 4.84744784154007e-05, + "loss": 5.0166, + "step": 18795 + }, + { + "epoch": 0.11178513654962413, + "grad_norm": 1.9335131645202637, + "learning_rate": 4.847431774150495e-05, + "loss": 4.8899, + "step": 18796 + }, + { + "epoch": 0.11179108383290513, + "grad_norm": 1.5765737295150757, + "learning_rate": 4.847415705941454e-05, + "loss": 5.2848, + "step": 18797 + }, + { + "epoch": 0.11179703111618612, + "grad_norm": 1.7239350080490112, + "learning_rate": 4.847399636912955e-05, + "loss": 5.0606, + "step": 18798 + }, + { + "epoch": 0.11180297839946712, + "grad_norm": 1.5246455669403076, + "learning_rate": 4.847383567065004e-05, + "loss": 5.0829, + "step": 18799 + }, + { + "epoch": 0.11180892568274813, + "grad_norm": 1.3902997970581055, + "learning_rate": 4.847367496397604e-05, + "loss": 5.2729, + "step": 18800 + }, + { + "epoch": 0.11181487296602911, + "grad_norm": 1.426282286643982, + "learning_rate": 4.8473514249107634e-05, + "loss": 5.2259, + "step": 18801 + }, + { + "epoch": 0.11182082024931012, + "grad_norm": 1.4425853490829468, + "learning_rate": 4.847335352604486e-05, + "loss": 4.923, + "step": 18802 + }, + { + "epoch": 0.11182676753259112, + "grad_norm": 1.26097571849823, + "learning_rate": 4.8473192794787786e-05, + "loss": 4.9122, + "step": 18803 + }, + { + "epoch": 0.1118327148158721, + "grad_norm": 1.4102699756622314, + "learning_rate": 4.847303205533646e-05, + "loss": 4.9641, + "step": 18804 + }, + { + "epoch": 0.11183866209915311, + "grad_norm": 1.3965771198272705, + "learning_rate": 4.847287130769094e-05, + "loss": 4.9832, + "step": 18805 + }, + { + "epoch": 0.11184460938243411, + "grad_norm": 1.3588200807571411, + "learning_rate": 4.8472710551851284e-05, + "loss": 5.0502, + "step": 18806 + }, + { + "epoch": 0.1118505566657151, + "grad_norm": 1.394020676612854, + "learning_rate": 4.847254978781755e-05, + "loss": 4.9699, + "step": 18807 + }, + { + "epoch": 0.1118565039489961, + "grad_norm": 1.4548087120056152, + "learning_rate": 4.8472389015589794e-05, + "loss": 4.9112, + "step": 18808 + }, + { + "epoch": 0.1118624512322771, + "grad_norm": 1.4359081983566284, + "learning_rate": 4.847222823516806e-05, + "loss": 4.9284, + "step": 18809 + }, + { + "epoch": 0.11186839851555809, + "grad_norm": 1.3159685134887695, + "learning_rate": 4.847206744655242e-05, + "loss": 4.9661, + "step": 18810 + }, + { + "epoch": 0.11187434579883909, + "grad_norm": 1.5037652254104614, + "learning_rate": 4.847190664974292e-05, + "loss": 5.0318, + "step": 18811 + }, + { + "epoch": 0.1118802930821201, + "grad_norm": 1.7603816986083984, + "learning_rate": 4.8471745844739624e-05, + "loss": 5.0486, + "step": 18812 + }, + { + "epoch": 0.11188624036540108, + "grad_norm": 1.6205053329467773, + "learning_rate": 4.847158503154259e-05, + "loss": 5.0587, + "step": 18813 + }, + { + "epoch": 0.11189218764868208, + "grad_norm": 1.559334635734558, + "learning_rate": 4.847142421015185e-05, + "loss": 5.1514, + "step": 18814 + }, + { + "epoch": 0.11189813493196307, + "grad_norm": 1.4896910190582275, + "learning_rate": 4.8471263380567495e-05, + "loss": 5.2103, + "step": 18815 + }, + { + "epoch": 0.11190408221524407, + "grad_norm": 1.43007493019104, + "learning_rate": 4.847110254278956e-05, + "loss": 5.0152, + "step": 18816 + }, + { + "epoch": 0.11191002949852508, + "grad_norm": 1.3567081689834595, + "learning_rate": 4.84709416968181e-05, + "loss": 4.7193, + "step": 18817 + }, + { + "epoch": 0.11191597678180606, + "grad_norm": 1.3283864259719849, + "learning_rate": 4.8470780842653186e-05, + "loss": 4.8559, + "step": 18818 + }, + { + "epoch": 0.11192192406508707, + "grad_norm": 1.5427826642990112, + "learning_rate": 4.8470619980294854e-05, + "loss": 5.1406, + "step": 18819 + }, + { + "epoch": 0.11192787134836807, + "grad_norm": 1.4549115896224976, + "learning_rate": 4.847045910974318e-05, + "loss": 5.0377, + "step": 18820 + }, + { + "epoch": 0.11193381863164906, + "grad_norm": 1.3822715282440186, + "learning_rate": 4.84702982309982e-05, + "loss": 4.9279, + "step": 18821 + }, + { + "epoch": 0.11193976591493006, + "grad_norm": 1.290756106376648, + "learning_rate": 4.8470137344059996e-05, + "loss": 4.9631, + "step": 18822 + }, + { + "epoch": 0.11194571319821106, + "grad_norm": 1.8070625066757202, + "learning_rate": 4.84699764489286e-05, + "loss": 5.0103, + "step": 18823 + }, + { + "epoch": 0.11195166048149205, + "grad_norm": 1.6692131757736206, + "learning_rate": 4.846981554560408e-05, + "loss": 5.1265, + "step": 18824 + }, + { + "epoch": 0.11195760776477305, + "grad_norm": 1.7644426822662354, + "learning_rate": 4.8469654634086495e-05, + "loss": 5.0712, + "step": 18825 + }, + { + "epoch": 0.11196355504805405, + "grad_norm": 1.5689074993133545, + "learning_rate": 4.8469493714375893e-05, + "loss": 5.0551, + "step": 18826 + }, + { + "epoch": 0.11196950233133504, + "grad_norm": 1.610300064086914, + "learning_rate": 4.846933278647233e-05, + "loss": 5.0746, + "step": 18827 + }, + { + "epoch": 0.11197544961461604, + "grad_norm": 1.2828009128570557, + "learning_rate": 4.846917185037586e-05, + "loss": 5.0645, + "step": 18828 + }, + { + "epoch": 0.11198139689789705, + "grad_norm": 1.386265516281128, + "learning_rate": 4.846901090608655e-05, + "loss": 5.1885, + "step": 18829 + }, + { + "epoch": 0.11198734418117803, + "grad_norm": 1.446359634399414, + "learning_rate": 4.846884995360446e-05, + "loss": 5.3245, + "step": 18830 + }, + { + "epoch": 0.11199329146445904, + "grad_norm": 1.4347827434539795, + "learning_rate": 4.846868899292962e-05, + "loss": 5.379, + "step": 18831 + }, + { + "epoch": 0.11199923874774004, + "grad_norm": 1.7589528560638428, + "learning_rate": 4.846852802406212e-05, + "loss": 5.2726, + "step": 18832 + }, + { + "epoch": 0.11200518603102103, + "grad_norm": 1.4316980838775635, + "learning_rate": 4.846836704700199e-05, + "loss": 5.5424, + "step": 18833 + }, + { + "epoch": 0.11201113331430203, + "grad_norm": 1.202364444732666, + "learning_rate": 4.84682060617493e-05, + "loss": 5.4271, + "step": 18834 + }, + { + "epoch": 0.11201708059758303, + "grad_norm": 1.282231330871582, + "learning_rate": 4.8468045068304094e-05, + "loss": 5.4895, + "step": 18835 + }, + { + "epoch": 0.11202302788086402, + "grad_norm": 1.8428497314453125, + "learning_rate": 4.846788406666644e-05, + "loss": 4.9924, + "step": 18836 + }, + { + "epoch": 0.11202897516414502, + "grad_norm": 1.8442119359970093, + "learning_rate": 4.846772305683639e-05, + "loss": 4.6735, + "step": 18837 + }, + { + "epoch": 0.11203492244742602, + "grad_norm": 1.7083659172058105, + "learning_rate": 4.846756203881401e-05, + "loss": 4.8064, + "step": 18838 + }, + { + "epoch": 0.11204086973070701, + "grad_norm": 1.5663195848464966, + "learning_rate": 4.8467401012599336e-05, + "loss": 5.095, + "step": 18839 + }, + { + "epoch": 0.11204681701398801, + "grad_norm": 1.7466095685958862, + "learning_rate": 4.846723997819244e-05, + "loss": 4.7633, + "step": 18840 + }, + { + "epoch": 0.11205276429726901, + "grad_norm": 1.73336660861969, + "learning_rate": 4.846707893559336e-05, + "loss": 4.8776, + "step": 18841 + }, + { + "epoch": 0.11205871158055, + "grad_norm": 1.726456880569458, + "learning_rate": 4.8466917884802175e-05, + "loss": 4.845, + "step": 18842 + }, + { + "epoch": 0.112064658863831, + "grad_norm": 1.733583927154541, + "learning_rate": 4.8466756825818934e-05, + "loss": 4.8272, + "step": 18843 + }, + { + "epoch": 0.11207060614711199, + "grad_norm": 1.8252346515655518, + "learning_rate": 4.8466595758643684e-05, + "loss": 4.7088, + "step": 18844 + }, + { + "epoch": 0.112076553430393, + "grad_norm": 1.6071163415908813, + "learning_rate": 4.8466434683276495e-05, + "loss": 4.7085, + "step": 18845 + }, + { + "epoch": 0.112082500713674, + "grad_norm": 1.8407503366470337, + "learning_rate": 4.846627359971741e-05, + "loss": 4.6885, + "step": 18846 + }, + { + "epoch": 0.11208844799695498, + "grad_norm": 1.5426356792449951, + "learning_rate": 4.84661125079665e-05, + "loss": 4.7252, + "step": 18847 + }, + { + "epoch": 0.11209439528023599, + "grad_norm": 1.8290139436721802, + "learning_rate": 4.84659514080238e-05, + "loss": 4.9314, + "step": 18848 + }, + { + "epoch": 0.11210034256351699, + "grad_norm": 1.73724365234375, + "learning_rate": 4.846579029988939e-05, + "loss": 4.7618, + "step": 18849 + }, + { + "epoch": 0.11210628984679798, + "grad_norm": 2.0577304363250732, + "learning_rate": 4.8465629183563314e-05, + "loss": 4.8118, + "step": 18850 + }, + { + "epoch": 0.11211223713007898, + "grad_norm": 1.8696433305740356, + "learning_rate": 4.846546805904562e-05, + "loss": 4.6813, + "step": 18851 + }, + { + "epoch": 0.11211818441335998, + "grad_norm": 1.6597977876663208, + "learning_rate": 4.846530692633638e-05, + "loss": 4.5187, + "step": 18852 + }, + { + "epoch": 0.11212413169664097, + "grad_norm": 1.6595630645751953, + "learning_rate": 4.846514578543564e-05, + "loss": 5.012, + "step": 18853 + }, + { + "epoch": 0.11213007897992197, + "grad_norm": 2.2116329669952393, + "learning_rate": 4.846498463634347e-05, + "loss": 5.1757, + "step": 18854 + }, + { + "epoch": 0.11213602626320297, + "grad_norm": 1.8592875003814697, + "learning_rate": 4.846482347905991e-05, + "loss": 6.0403, + "step": 18855 + }, + { + "epoch": 0.11214197354648396, + "grad_norm": 1.7812080383300781, + "learning_rate": 4.846466231358502e-05, + "loss": 5.974, + "step": 18856 + }, + { + "epoch": 0.11214792082976496, + "grad_norm": 1.8986600637435913, + "learning_rate": 4.846450113991886e-05, + "loss": 5.3866, + "step": 18857 + }, + { + "epoch": 0.11215386811304597, + "grad_norm": 2.4542179107666016, + "learning_rate": 4.846433995806148e-05, + "loss": 4.863, + "step": 18858 + }, + { + "epoch": 0.11215981539632695, + "grad_norm": 2.1604816913604736, + "learning_rate": 4.846417876801295e-05, + "loss": 5.219, + "step": 18859 + }, + { + "epoch": 0.11216576267960796, + "grad_norm": 2.325782060623169, + "learning_rate": 4.846401756977331e-05, + "loss": 5.1454, + "step": 18860 + }, + { + "epoch": 0.11217170996288896, + "grad_norm": 2.3508334159851074, + "learning_rate": 4.846385636334263e-05, + "loss": 5.1318, + "step": 18861 + }, + { + "epoch": 0.11217765724616995, + "grad_norm": 2.2381060123443604, + "learning_rate": 4.846369514872096e-05, + "loss": 5.0676, + "step": 18862 + }, + { + "epoch": 0.11218360452945095, + "grad_norm": 2.3624770641326904, + "learning_rate": 4.8463533925908355e-05, + "loss": 5.0251, + "step": 18863 + }, + { + "epoch": 0.11218955181273195, + "grad_norm": 1.9950919151306152, + "learning_rate": 4.846337269490487e-05, + "loss": 5.0396, + "step": 18864 + }, + { + "epoch": 0.11219549909601294, + "grad_norm": 1.829410433769226, + "learning_rate": 4.8463211455710574e-05, + "loss": 4.9327, + "step": 18865 + }, + { + "epoch": 0.11220144637929394, + "grad_norm": 1.8879605531692505, + "learning_rate": 4.846305020832551e-05, + "loss": 4.8902, + "step": 18866 + }, + { + "epoch": 0.11220739366257494, + "grad_norm": 1.89055335521698, + "learning_rate": 4.846288895274973e-05, + "loss": 4.9219, + "step": 18867 + }, + { + "epoch": 0.11221334094585593, + "grad_norm": 2.224971055984497, + "learning_rate": 4.84627276889833e-05, + "loss": 5.0164, + "step": 18868 + }, + { + "epoch": 0.11221928822913693, + "grad_norm": 2.1675336360931396, + "learning_rate": 4.8462566417026276e-05, + "loss": 5.0082, + "step": 18869 + }, + { + "epoch": 0.11222523551241793, + "grad_norm": 1.885236144065857, + "learning_rate": 4.8462405136878714e-05, + "loss": 5.1484, + "step": 18870 + }, + { + "epoch": 0.11223118279569892, + "grad_norm": 1.3037774562835693, + "learning_rate": 4.846224384854067e-05, + "loss": 5.64, + "step": 18871 + }, + { + "epoch": 0.11223713007897992, + "grad_norm": 1.6506762504577637, + "learning_rate": 4.846208255201219e-05, + "loss": 5.6067, + "step": 18872 + }, + { + "epoch": 0.11224307736226091, + "grad_norm": 1.4294368028640747, + "learning_rate": 4.8461921247293344e-05, + "loss": 5.67, + "step": 18873 + }, + { + "epoch": 0.11224902464554191, + "grad_norm": 1.6201854944229126, + "learning_rate": 4.846175993438419e-05, + "loss": 5.6093, + "step": 18874 + }, + { + "epoch": 0.11225497192882292, + "grad_norm": 1.5683603286743164, + "learning_rate": 4.846159861328478e-05, + "loss": 5.6129, + "step": 18875 + }, + { + "epoch": 0.1122609192121039, + "grad_norm": 1.5446193218231201, + "learning_rate": 4.8461437283995156e-05, + "loss": 5.6063, + "step": 18876 + }, + { + "epoch": 0.1122668664953849, + "grad_norm": 1.477872371673584, + "learning_rate": 4.846127594651539e-05, + "loss": 5.6291, + "step": 18877 + }, + { + "epoch": 0.11227281377866591, + "grad_norm": 1.477872371673584, + "learning_rate": 4.846111460084554e-05, + "loss": 5.6282, + "step": 18878 + }, + { + "epoch": 0.1122787610619469, + "grad_norm": 1.4379156827926636, + "learning_rate": 4.846095324698565e-05, + "loss": 5.5451, + "step": 18879 + }, + { + "epoch": 0.1122847083452279, + "grad_norm": 1.4940646886825562, + "learning_rate": 4.8460791884935785e-05, + "loss": 5.4705, + "step": 18880 + }, + { + "epoch": 0.1122906556285089, + "grad_norm": 1.4625567197799683, + "learning_rate": 4.8460630514696e-05, + "loss": 5.5428, + "step": 18881 + }, + { + "epoch": 0.11229660291178989, + "grad_norm": 1.7899153232574463, + "learning_rate": 4.846046913626636e-05, + "loss": 5.7665, + "step": 18882 + }, + { + "epoch": 0.11230255019507089, + "grad_norm": 2.1002516746520996, + "learning_rate": 4.8460307749646906e-05, + "loss": 6.1132, + "step": 18883 + }, + { + "epoch": 0.11230849747835189, + "grad_norm": 1.8406580686569214, + "learning_rate": 4.84601463548377e-05, + "loss": 5.5207, + "step": 18884 + }, + { + "epoch": 0.11231444476163288, + "grad_norm": 1.6287425756454468, + "learning_rate": 4.84599849518388e-05, + "loss": 5.931, + "step": 18885 + }, + { + "epoch": 0.11232039204491388, + "grad_norm": 1.4447002410888672, + "learning_rate": 4.845982354065027e-05, + "loss": 5.6181, + "step": 18886 + }, + { + "epoch": 0.11232633932819489, + "grad_norm": 1.6555171012878418, + "learning_rate": 4.845966212127215e-05, + "loss": 5.1448, + "step": 18887 + }, + { + "epoch": 0.11233228661147587, + "grad_norm": 2.0948448181152344, + "learning_rate": 4.84595006937045e-05, + "loss": 5.3695, + "step": 18888 + }, + { + "epoch": 0.11233823389475688, + "grad_norm": 1.6369346380233765, + "learning_rate": 4.845933925794739e-05, + "loss": 5.5859, + "step": 18889 + }, + { + "epoch": 0.11234418117803788, + "grad_norm": 1.4660474061965942, + "learning_rate": 4.845917781400086e-05, + "loss": 5.6121, + "step": 18890 + }, + { + "epoch": 0.11235012846131887, + "grad_norm": 1.6739449501037598, + "learning_rate": 4.845901636186497e-05, + "loss": 5.6874, + "step": 18891 + }, + { + "epoch": 0.11235607574459987, + "grad_norm": 1.4542694091796875, + "learning_rate": 4.8458854901539794e-05, + "loss": 5.5956, + "step": 18892 + }, + { + "epoch": 0.11236202302788087, + "grad_norm": 1.3305023908615112, + "learning_rate": 4.8458693433025365e-05, + "loss": 5.658, + "step": 18893 + }, + { + "epoch": 0.11236797031116186, + "grad_norm": 1.8081300258636475, + "learning_rate": 4.845853195632175e-05, + "loss": 4.8563, + "step": 18894 + }, + { + "epoch": 0.11237391759444286, + "grad_norm": 1.8959764242172241, + "learning_rate": 4.8458370471429e-05, + "loss": 5.3051, + "step": 18895 + }, + { + "epoch": 0.11237986487772386, + "grad_norm": 1.9471427202224731, + "learning_rate": 4.845820897834718e-05, + "loss": 5.8181, + "step": 18896 + }, + { + "epoch": 0.11238581216100485, + "grad_norm": 1.6311548948287964, + "learning_rate": 4.845804747707634e-05, + "loss": 5.7714, + "step": 18897 + }, + { + "epoch": 0.11239175944428585, + "grad_norm": 1.830788493156433, + "learning_rate": 4.845788596761653e-05, + "loss": 5.9535, + "step": 18898 + }, + { + "epoch": 0.11239770672756685, + "grad_norm": 1.7896127700805664, + "learning_rate": 4.8457724449967836e-05, + "loss": 5.5385, + "step": 18899 + }, + { + "epoch": 0.11240365401084784, + "grad_norm": 1.5098718404769897, + "learning_rate": 4.845756292413027e-05, + "loss": 5.4067, + "step": 18900 + }, + { + "epoch": 0.11240960129412884, + "grad_norm": 1.9224756956100464, + "learning_rate": 4.845740139010392e-05, + "loss": 5.4863, + "step": 18901 + }, + { + "epoch": 0.11241554857740983, + "grad_norm": 2.1158740520477295, + "learning_rate": 4.845723984788884e-05, + "loss": 5.0745, + "step": 18902 + }, + { + "epoch": 0.11242149586069083, + "grad_norm": 2.292292594909668, + "learning_rate": 4.845707829748507e-05, + "loss": 4.9248, + "step": 18903 + }, + { + "epoch": 0.11242744314397184, + "grad_norm": 2.312593698501587, + "learning_rate": 4.8456916738892675e-05, + "loss": 4.9712, + "step": 18904 + }, + { + "epoch": 0.11243339042725282, + "grad_norm": 1.7302945852279663, + "learning_rate": 4.8456755172111725e-05, + "loss": 5.0814, + "step": 18905 + }, + { + "epoch": 0.11243933771053383, + "grad_norm": 1.3441206216812134, + "learning_rate": 4.845659359714225e-05, + "loss": 5.6563, + "step": 18906 + }, + { + "epoch": 0.11244528499381483, + "grad_norm": 1.5126272439956665, + "learning_rate": 4.845643201398433e-05, + "loss": 5.607, + "step": 18907 + }, + { + "epoch": 0.11245123227709582, + "grad_norm": 1.438795804977417, + "learning_rate": 4.845627042263801e-05, + "loss": 5.5287, + "step": 18908 + }, + { + "epoch": 0.11245717956037682, + "grad_norm": 1.6724447011947632, + "learning_rate": 4.845610882310335e-05, + "loss": 5.361, + "step": 18909 + }, + { + "epoch": 0.11246312684365782, + "grad_norm": 1.7267217636108398, + "learning_rate": 4.845594721538041e-05, + "loss": 5.6361, + "step": 18910 + }, + { + "epoch": 0.11246907412693881, + "grad_norm": 1.7616380453109741, + "learning_rate": 4.845578559946923e-05, + "loss": 5.2538, + "step": 18911 + }, + { + "epoch": 0.11247502141021981, + "grad_norm": 1.8318467140197754, + "learning_rate": 4.845562397536988e-05, + "loss": 4.8236, + "step": 18912 + }, + { + "epoch": 0.11248096869350081, + "grad_norm": 2.4882378578186035, + "learning_rate": 4.8455462343082415e-05, + "loss": 4.5624, + "step": 18913 + }, + { + "epoch": 0.1124869159767818, + "grad_norm": 2.5109870433807373, + "learning_rate": 4.845530070260689e-05, + "loss": 4.7906, + "step": 18914 + }, + { + "epoch": 0.1124928632600628, + "grad_norm": 2.2084672451019287, + "learning_rate": 4.845513905394336e-05, + "loss": 4.5304, + "step": 18915 + }, + { + "epoch": 0.1124988105433438, + "grad_norm": 2.4276058673858643, + "learning_rate": 4.8454977397091885e-05, + "loss": 4.3753, + "step": 18916 + }, + { + "epoch": 0.1125047578266248, + "grad_norm": 2.5022165775299072, + "learning_rate": 4.845481573205252e-05, + "loss": 4.1849, + "step": 18917 + }, + { + "epoch": 0.1125107051099058, + "grad_norm": 2.511643171310425, + "learning_rate": 4.845465405882532e-05, + "loss": 4.4007, + "step": 18918 + }, + { + "epoch": 0.1125166523931868, + "grad_norm": 2.598860263824463, + "learning_rate": 4.845449237741034e-05, + "loss": 4.6015, + "step": 18919 + }, + { + "epoch": 0.11252259967646779, + "grad_norm": 2.339555263519287, + "learning_rate": 4.845433068780765e-05, + "loss": 4.4123, + "step": 18920 + }, + { + "epoch": 0.11252854695974879, + "grad_norm": 2.286858320236206, + "learning_rate": 4.845416899001729e-05, + "loss": 4.3709, + "step": 18921 + }, + { + "epoch": 0.11253449424302979, + "grad_norm": 2.431622266769409, + "learning_rate": 4.845400728403932e-05, + "loss": 4.2162, + "step": 18922 + }, + { + "epoch": 0.11254044152631078, + "grad_norm": 2.7147364616394043, + "learning_rate": 4.8453845569873796e-05, + "loss": 4.3949, + "step": 18923 + }, + { + "epoch": 0.11254638880959178, + "grad_norm": 2.4738264083862305, + "learning_rate": 4.8453683847520784e-05, + "loss": 4.2671, + "step": 18924 + }, + { + "epoch": 0.11255233609287278, + "grad_norm": 2.007298707962036, + "learning_rate": 4.8453522116980325e-05, + "loss": 4.9317, + "step": 18925 + }, + { + "epoch": 0.11255828337615377, + "grad_norm": 1.8057860136032104, + "learning_rate": 4.8453360378252486e-05, + "loss": 5.4763, + "step": 18926 + }, + { + "epoch": 0.11256423065943477, + "grad_norm": 1.913892149925232, + "learning_rate": 4.845319863133733e-05, + "loss": 5.3112, + "step": 18927 + }, + { + "epoch": 0.11257017794271577, + "grad_norm": 1.6226540803909302, + "learning_rate": 4.845303687623489e-05, + "loss": 5.7164, + "step": 18928 + }, + { + "epoch": 0.11257612522599676, + "grad_norm": 1.7885600328445435, + "learning_rate": 4.8452875112945253e-05, + "loss": 5.7746, + "step": 18929 + }, + { + "epoch": 0.11258207250927776, + "grad_norm": 1.5598177909851074, + "learning_rate": 4.8452713341468444e-05, + "loss": 5.7843, + "step": 18930 + }, + { + "epoch": 0.11258801979255875, + "grad_norm": 1.517059564590454, + "learning_rate": 4.845255156180455e-05, + "loss": 5.7777, + "step": 18931 + }, + { + "epoch": 0.11259396707583975, + "grad_norm": 1.2515442371368408, + "learning_rate": 4.84523897739536e-05, + "loss": 5.7443, + "step": 18932 + }, + { + "epoch": 0.11259991435912076, + "grad_norm": 1.4970554113388062, + "learning_rate": 4.845222797791566e-05, + "loss": 5.6157, + "step": 18933 + }, + { + "epoch": 0.11260586164240174, + "grad_norm": 1.632620930671692, + "learning_rate": 4.8452066173690804e-05, + "loss": 5.0715, + "step": 18934 + }, + { + "epoch": 0.11261180892568275, + "grad_norm": 1.9634324312210083, + "learning_rate": 4.845190436127907e-05, + "loss": 5.3624, + "step": 18935 + }, + { + "epoch": 0.11261775620896375, + "grad_norm": 1.663560152053833, + "learning_rate": 4.8451742540680514e-05, + "loss": 5.4324, + "step": 18936 + }, + { + "epoch": 0.11262370349224474, + "grad_norm": 1.560684323310852, + "learning_rate": 4.84515807118952e-05, + "loss": 4.8426, + "step": 18937 + }, + { + "epoch": 0.11262965077552574, + "grad_norm": 1.5759334564208984, + "learning_rate": 4.8451418874923185e-05, + "loss": 5.6239, + "step": 18938 + }, + { + "epoch": 0.11263559805880674, + "grad_norm": 1.8501811027526855, + "learning_rate": 4.8451257029764504e-05, + "loss": 5.1734, + "step": 18939 + }, + { + "epoch": 0.11264154534208773, + "grad_norm": 1.811924934387207, + "learning_rate": 4.845109517641925e-05, + "loss": 5.2778, + "step": 18940 + }, + { + "epoch": 0.11264749262536873, + "grad_norm": 1.9684933423995972, + "learning_rate": 4.845093331488746e-05, + "loss": 5.3673, + "step": 18941 + }, + { + "epoch": 0.11265343990864973, + "grad_norm": 2.1155457496643066, + "learning_rate": 4.8450771445169185e-05, + "loss": 4.6955, + "step": 18942 + }, + { + "epoch": 0.11265938719193072, + "grad_norm": 2.117941379547119, + "learning_rate": 4.8450609567264495e-05, + "loss": 4.4051, + "step": 18943 + }, + { + "epoch": 0.11266533447521172, + "grad_norm": 1.9649946689605713, + "learning_rate": 4.845044768117343e-05, + "loss": 5.0204, + "step": 18944 + }, + { + "epoch": 0.11267128175849273, + "grad_norm": 1.898119568824768, + "learning_rate": 4.845028578689606e-05, + "loss": 4.9994, + "step": 18945 + }, + { + "epoch": 0.11267722904177371, + "grad_norm": 2.4376771450042725, + "learning_rate": 4.845012388443244e-05, + "loss": 4.6852, + "step": 18946 + }, + { + "epoch": 0.11268317632505472, + "grad_norm": 2.593094825744629, + "learning_rate": 4.844996197378262e-05, + "loss": 4.3845, + "step": 18947 + }, + { + "epoch": 0.11268912360833572, + "grad_norm": 2.6004302501678467, + "learning_rate": 4.844980005494666e-05, + "loss": 4.2989, + "step": 18948 + }, + { + "epoch": 0.1126950708916167, + "grad_norm": 2.4045653343200684, + "learning_rate": 4.844963812792462e-05, + "loss": 4.411, + "step": 18949 + }, + { + "epoch": 0.11270101817489771, + "grad_norm": 2.2256572246551514, + "learning_rate": 4.8449476192716555e-05, + "loss": 4.423, + "step": 18950 + }, + { + "epoch": 0.11270696545817871, + "grad_norm": 2.110077142715454, + "learning_rate": 4.844931424932252e-05, + "loss": 4.2971, + "step": 18951 + }, + { + "epoch": 0.1127129127414597, + "grad_norm": 1.8960111141204834, + "learning_rate": 4.844915229774257e-05, + "loss": 5.0758, + "step": 18952 + }, + { + "epoch": 0.1127188600247407, + "grad_norm": 1.998542308807373, + "learning_rate": 4.844899033797676e-05, + "loss": 4.8565, + "step": 18953 + }, + { + "epoch": 0.1127248073080217, + "grad_norm": 1.7070491313934326, + "learning_rate": 4.8448828370025156e-05, + "loss": 5.4684, + "step": 18954 + }, + { + "epoch": 0.11273075459130269, + "grad_norm": 2.062570095062256, + "learning_rate": 4.8448666393887806e-05, + "loss": 5.5384, + "step": 18955 + }, + { + "epoch": 0.11273670187458369, + "grad_norm": 1.8782148361206055, + "learning_rate": 4.844850440956476e-05, + "loss": 5.0373, + "step": 18956 + }, + { + "epoch": 0.1127426491578647, + "grad_norm": 2.3674817085266113, + "learning_rate": 4.8448342417056096e-05, + "loss": 5.1999, + "step": 18957 + }, + { + "epoch": 0.11274859644114568, + "grad_norm": 2.2243809700012207, + "learning_rate": 4.844818041636186e-05, + "loss": 5.3275, + "step": 18958 + }, + { + "epoch": 0.11275454372442668, + "grad_norm": 2.2929039001464844, + "learning_rate": 4.8448018407482096e-05, + "loss": 5.3958, + "step": 18959 + }, + { + "epoch": 0.11276049100770767, + "grad_norm": 2.0325045585632324, + "learning_rate": 4.844785639041688e-05, + "loss": 4.6686, + "step": 18960 + }, + { + "epoch": 0.11276643829098867, + "grad_norm": 1.8510624170303345, + "learning_rate": 4.8447694365166255e-05, + "loss": 4.9134, + "step": 18961 + }, + { + "epoch": 0.11277238557426968, + "grad_norm": 1.7537583112716675, + "learning_rate": 4.844753233173027e-05, + "loss": 5.0618, + "step": 18962 + }, + { + "epoch": 0.11277833285755066, + "grad_norm": 1.9293370246887207, + "learning_rate": 4.844737029010901e-05, + "loss": 4.8716, + "step": 18963 + }, + { + "epoch": 0.11278428014083167, + "grad_norm": 1.6931575536727905, + "learning_rate": 4.844720824030251e-05, + "loss": 5.4606, + "step": 18964 + }, + { + "epoch": 0.11279022742411267, + "grad_norm": 1.970825433731079, + "learning_rate": 4.8447046182310836e-05, + "loss": 5.2482, + "step": 18965 + }, + { + "epoch": 0.11279617470739366, + "grad_norm": 1.4842323064804077, + "learning_rate": 4.844688411613404e-05, + "loss": 5.972, + "step": 18966 + }, + { + "epoch": 0.11280212199067466, + "grad_norm": 1.84175705909729, + "learning_rate": 4.8446722041772174e-05, + "loss": 4.7696, + "step": 18967 + }, + { + "epoch": 0.11280806927395566, + "grad_norm": 1.8980286121368408, + "learning_rate": 4.84465599592253e-05, + "loss": 4.5125, + "step": 18968 + }, + { + "epoch": 0.11281401655723665, + "grad_norm": 1.7349838018417358, + "learning_rate": 4.844639786849348e-05, + "loss": 4.581, + "step": 18969 + }, + { + "epoch": 0.11281996384051765, + "grad_norm": 1.5894320011138916, + "learning_rate": 4.844623576957675e-05, + "loss": 4.9205, + "step": 18970 + }, + { + "epoch": 0.11282591112379865, + "grad_norm": 1.8740227222442627, + "learning_rate": 4.84460736624752e-05, + "loss": 4.938, + "step": 18971 + }, + { + "epoch": 0.11283185840707964, + "grad_norm": 1.744537591934204, + "learning_rate": 4.8445911547188854e-05, + "loss": 5.5215, + "step": 18972 + }, + { + "epoch": 0.11283780569036064, + "grad_norm": 1.5465041399002075, + "learning_rate": 4.844574942371779e-05, + "loss": 5.3607, + "step": 18973 + }, + { + "epoch": 0.11284375297364165, + "grad_norm": 1.8417413234710693, + "learning_rate": 4.8445587292062056e-05, + "loss": 5.632, + "step": 18974 + }, + { + "epoch": 0.11284970025692263, + "grad_norm": 1.7401045560836792, + "learning_rate": 4.8445425152221704e-05, + "loss": 5.5514, + "step": 18975 + }, + { + "epoch": 0.11285564754020364, + "grad_norm": 1.6192666292190552, + "learning_rate": 4.8445263004196805e-05, + "loss": 5.2694, + "step": 18976 + }, + { + "epoch": 0.11286159482348464, + "grad_norm": 1.842510461807251, + "learning_rate": 4.84451008479874e-05, + "loss": 5.3429, + "step": 18977 + }, + { + "epoch": 0.11286754210676563, + "grad_norm": 1.4824966192245483, + "learning_rate": 4.8444938683593554e-05, + "loss": 5.5212, + "step": 18978 + }, + { + "epoch": 0.11287348939004663, + "grad_norm": 1.7926548719406128, + "learning_rate": 4.8444776511015324e-05, + "loss": 4.8687, + "step": 18979 + }, + { + "epoch": 0.11287943667332763, + "grad_norm": 1.7114008665084839, + "learning_rate": 4.844461433025277e-05, + "loss": 4.7459, + "step": 18980 + }, + { + "epoch": 0.11288538395660862, + "grad_norm": 1.8884011507034302, + "learning_rate": 4.844445214130594e-05, + "loss": 5.1957, + "step": 18981 + }, + { + "epoch": 0.11289133123988962, + "grad_norm": 1.6901582479476929, + "learning_rate": 4.844428994417489e-05, + "loss": 5.3349, + "step": 18982 + }, + { + "epoch": 0.11289727852317062, + "grad_norm": 1.7148336172103882, + "learning_rate": 4.844412773885968e-05, + "loss": 5.4903, + "step": 18983 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 1.478767991065979, + "learning_rate": 4.844396552536037e-05, + "loss": 5.6339, + "step": 18984 + }, + { + "epoch": 0.11290917308973261, + "grad_norm": 1.5679733753204346, + "learning_rate": 4.844380330367701e-05, + "loss": 5.4722, + "step": 18985 + }, + { + "epoch": 0.11291512037301361, + "grad_norm": 1.718564510345459, + "learning_rate": 4.844364107380966e-05, + "loss": 5.2826, + "step": 18986 + }, + { + "epoch": 0.1129210676562946, + "grad_norm": 1.6757621765136719, + "learning_rate": 4.844347883575839e-05, + "loss": 5.7454, + "step": 18987 + }, + { + "epoch": 0.1129270149395756, + "grad_norm": 1.9370322227478027, + "learning_rate": 4.844331658952324e-05, + "loss": 4.6631, + "step": 18988 + }, + { + "epoch": 0.1129329622228566, + "grad_norm": 1.9932162761688232, + "learning_rate": 4.844315433510426e-05, + "loss": 4.7486, + "step": 18989 + }, + { + "epoch": 0.1129389095061376, + "grad_norm": 2.0191309452056885, + "learning_rate": 4.844299207250152e-05, + "loss": 4.6999, + "step": 18990 + }, + { + "epoch": 0.1129448567894186, + "grad_norm": 1.971913456916809, + "learning_rate": 4.8442829801715074e-05, + "loss": 4.7345, + "step": 18991 + }, + { + "epoch": 0.11295080407269958, + "grad_norm": 1.8503371477127075, + "learning_rate": 4.844266752274498e-05, + "loss": 4.5352, + "step": 18992 + }, + { + "epoch": 0.11295675135598059, + "grad_norm": 2.0024712085723877, + "learning_rate": 4.8442505235591294e-05, + "loss": 4.6513, + "step": 18993 + }, + { + "epoch": 0.11296269863926159, + "grad_norm": 1.645996332168579, + "learning_rate": 4.844234294025407e-05, + "loss": 4.816, + "step": 18994 + }, + { + "epoch": 0.11296864592254258, + "grad_norm": 1.6649290323257446, + "learning_rate": 4.844218063673337e-05, + "loss": 5.1471, + "step": 18995 + }, + { + "epoch": 0.11297459320582358, + "grad_norm": 1.4211794137954712, + "learning_rate": 4.844201832502924e-05, + "loss": 5.0807, + "step": 18996 + }, + { + "epoch": 0.11298054048910458, + "grad_norm": 1.6982463598251343, + "learning_rate": 4.844185600514175e-05, + "loss": 4.9912, + "step": 18997 + }, + { + "epoch": 0.11298648777238557, + "grad_norm": 1.5852501392364502, + "learning_rate": 4.844169367707095e-05, + "loss": 5.4541, + "step": 18998 + }, + { + "epoch": 0.11299243505566657, + "grad_norm": 1.787331223487854, + "learning_rate": 4.844153134081689e-05, + "loss": 5.4295, + "step": 18999 + }, + { + "epoch": 0.11299838233894757, + "grad_norm": 1.5758492946624756, + "learning_rate": 4.844136899637964e-05, + "loss": 5.2601, + "step": 19000 + }, + { + "epoch": 0.11300432962222856, + "grad_norm": 1.5441172122955322, + "learning_rate": 4.844120664375925e-05, + "loss": 4.882, + "step": 19001 + }, + { + "epoch": 0.11301027690550956, + "grad_norm": 1.6587432622909546, + "learning_rate": 4.8441044282955774e-05, + "loss": 4.8311, + "step": 19002 + }, + { + "epoch": 0.11301622418879056, + "grad_norm": 1.6563838720321655, + "learning_rate": 4.844088191396927e-05, + "loss": 4.87, + "step": 19003 + }, + { + "epoch": 0.11302217147207155, + "grad_norm": 1.7367866039276123, + "learning_rate": 4.84407195367998e-05, + "loss": 5.2984, + "step": 19004 + }, + { + "epoch": 0.11302811875535256, + "grad_norm": 2.3307883739471436, + "learning_rate": 4.844055715144742e-05, + "loss": 4.8798, + "step": 19005 + }, + { + "epoch": 0.11303406603863356, + "grad_norm": 2.601762294769287, + "learning_rate": 4.844039475791218e-05, + "loss": 4.8156, + "step": 19006 + }, + { + "epoch": 0.11304001332191455, + "grad_norm": 2.372610330581665, + "learning_rate": 4.844023235619414e-05, + "loss": 4.0715, + "step": 19007 + }, + { + "epoch": 0.11304596060519555, + "grad_norm": 2.16119384765625, + "learning_rate": 4.8440069946293356e-05, + "loss": 4.2701, + "step": 19008 + }, + { + "epoch": 0.11305190788847655, + "grad_norm": 2.1576502323150635, + "learning_rate": 4.843990752820989e-05, + "loss": 4.1302, + "step": 19009 + }, + { + "epoch": 0.11305785517175754, + "grad_norm": 2.122025489807129, + "learning_rate": 4.843974510194379e-05, + "loss": 4.0969, + "step": 19010 + }, + { + "epoch": 0.11306380245503854, + "grad_norm": 2.1929194927215576, + "learning_rate": 4.843958266749512e-05, + "loss": 4.2054, + "step": 19011 + }, + { + "epoch": 0.11306974973831954, + "grad_norm": 2.6305301189422607, + "learning_rate": 4.843942022486393e-05, + "loss": 4.3942, + "step": 19012 + }, + { + "epoch": 0.11307569702160053, + "grad_norm": 2.5355119705200195, + "learning_rate": 4.843925777405028e-05, + "loss": 4.4392, + "step": 19013 + }, + { + "epoch": 0.11308164430488153, + "grad_norm": 2.5040411949157715, + "learning_rate": 4.843909531505424e-05, + "loss": 4.221, + "step": 19014 + }, + { + "epoch": 0.11308759158816253, + "grad_norm": 2.15824556350708, + "learning_rate": 4.843893284787584e-05, + "loss": 4.8255, + "step": 19015 + }, + { + "epoch": 0.11309353887144352, + "grad_norm": 1.6300889253616333, + "learning_rate": 4.8438770372515155e-05, + "loss": 5.3668, + "step": 19016 + }, + { + "epoch": 0.11309948615472452, + "grad_norm": 1.745676875114441, + "learning_rate": 4.8438607888972245e-05, + "loss": 5.2858, + "step": 19017 + }, + { + "epoch": 0.11310543343800553, + "grad_norm": 1.6511434316635132, + "learning_rate": 4.8438445397247146e-05, + "loss": 5.2856, + "step": 19018 + }, + { + "epoch": 0.11311138072128651, + "grad_norm": 1.6282720565795898, + "learning_rate": 4.843828289733994e-05, + "loss": 5.7748, + "step": 19019 + }, + { + "epoch": 0.11311732800456752, + "grad_norm": 1.6303821802139282, + "learning_rate": 4.843812038925066e-05, + "loss": 5.3627, + "step": 19020 + }, + { + "epoch": 0.1131232752878485, + "grad_norm": 1.5684829950332642, + "learning_rate": 4.843795787297938e-05, + "loss": 5.6563, + "step": 19021 + }, + { + "epoch": 0.1131292225711295, + "grad_norm": 1.9084935188293457, + "learning_rate": 4.843779534852615e-05, + "loss": 5.7084, + "step": 19022 + }, + { + "epoch": 0.11313516985441051, + "grad_norm": 1.5176855325698853, + "learning_rate": 4.843763281589103e-05, + "loss": 5.7602, + "step": 19023 + }, + { + "epoch": 0.1131411171376915, + "grad_norm": 1.3877767324447632, + "learning_rate": 4.843747027507407e-05, + "loss": 5.4914, + "step": 19024 + }, + { + "epoch": 0.1131470644209725, + "grad_norm": 2.0801119804382324, + "learning_rate": 4.843730772607533e-05, + "loss": 4.8814, + "step": 19025 + }, + { + "epoch": 0.1131530117042535, + "grad_norm": 1.9673620462417603, + "learning_rate": 4.8437145168894874e-05, + "loss": 4.9423, + "step": 19026 + }, + { + "epoch": 0.11315895898753449, + "grad_norm": 1.5284085273742676, + "learning_rate": 4.8436982603532755e-05, + "loss": 5.0471, + "step": 19027 + }, + { + "epoch": 0.11316490627081549, + "grad_norm": 1.870762825012207, + "learning_rate": 4.8436820029989023e-05, + "loss": 4.9376, + "step": 19028 + }, + { + "epoch": 0.11317085355409649, + "grad_norm": 1.9094692468643188, + "learning_rate": 4.843665744826374e-05, + "loss": 4.8677, + "step": 19029 + }, + { + "epoch": 0.11317680083737748, + "grad_norm": 1.6463623046875, + "learning_rate": 4.8436494858356964e-05, + "loss": 5.3397, + "step": 19030 + }, + { + "epoch": 0.11318274812065848, + "grad_norm": 1.8127562999725342, + "learning_rate": 4.8436332260268745e-05, + "loss": 5.1626, + "step": 19031 + }, + { + "epoch": 0.11318869540393948, + "grad_norm": 1.5196025371551514, + "learning_rate": 4.8436169653999144e-05, + "loss": 5.1213, + "step": 19032 + }, + { + "epoch": 0.11319464268722047, + "grad_norm": 1.8930630683898926, + "learning_rate": 4.843600703954823e-05, + "loss": 4.8268, + "step": 19033 + }, + { + "epoch": 0.11320058997050148, + "grad_norm": 2.1579136848449707, + "learning_rate": 4.843584441691603e-05, + "loss": 5.6111, + "step": 19034 + }, + { + "epoch": 0.11320653725378248, + "grad_norm": 1.7644915580749512, + "learning_rate": 4.8435681786102624e-05, + "loss": 5.5762, + "step": 19035 + }, + { + "epoch": 0.11321248453706347, + "grad_norm": 1.5442852973937988, + "learning_rate": 4.843551914710808e-05, + "loss": 5.6486, + "step": 19036 + }, + { + "epoch": 0.11321843182034447, + "grad_norm": 1.823852777481079, + "learning_rate": 4.843535649993242e-05, + "loss": 5.6581, + "step": 19037 + }, + { + "epoch": 0.11322437910362547, + "grad_norm": 1.5850268602371216, + "learning_rate": 4.8435193844575726e-05, + "loss": 5.6351, + "step": 19038 + }, + { + "epoch": 0.11323032638690646, + "grad_norm": 1.6234556436538696, + "learning_rate": 4.843503118103805e-05, + "loss": 5.5462, + "step": 19039 + }, + { + "epoch": 0.11323627367018746, + "grad_norm": 1.602618932723999, + "learning_rate": 4.843486850931944e-05, + "loss": 5.2935, + "step": 19040 + }, + { + "epoch": 0.11324222095346846, + "grad_norm": 1.6808282136917114, + "learning_rate": 4.843470582941997e-05, + "loss": 5.2254, + "step": 19041 + }, + { + "epoch": 0.11324816823674945, + "grad_norm": 1.6311568021774292, + "learning_rate": 4.8434543141339674e-05, + "loss": 5.1894, + "step": 19042 + }, + { + "epoch": 0.11325411552003045, + "grad_norm": 1.5836867094039917, + "learning_rate": 4.843438044507863e-05, + "loss": 5.6344, + "step": 19043 + }, + { + "epoch": 0.11326006280331145, + "grad_norm": 1.5654397010803223, + "learning_rate": 4.843421774063688e-05, + "loss": 5.2902, + "step": 19044 + }, + { + "epoch": 0.11326601008659244, + "grad_norm": 2.3957626819610596, + "learning_rate": 4.843405502801449e-05, + "loss": 4.812, + "step": 19045 + }, + { + "epoch": 0.11327195736987344, + "grad_norm": 2.123473644256592, + "learning_rate": 4.843389230721151e-05, + "loss": 4.6399, + "step": 19046 + }, + { + "epoch": 0.11327790465315445, + "grad_norm": 1.6691471338272095, + "learning_rate": 4.8433729578228007e-05, + "loss": 4.9337, + "step": 19047 + }, + { + "epoch": 0.11328385193643543, + "grad_norm": 1.6179373264312744, + "learning_rate": 4.8433566841064025e-05, + "loss": 5.1002, + "step": 19048 + }, + { + "epoch": 0.11328979921971644, + "grad_norm": 1.658995270729065, + "learning_rate": 4.843340409571963e-05, + "loss": 5.0397, + "step": 19049 + }, + { + "epoch": 0.11329574650299742, + "grad_norm": 2.0216362476348877, + "learning_rate": 4.843324134219488e-05, + "loss": 5.3112, + "step": 19050 + }, + { + "epoch": 0.11330169378627843, + "grad_norm": 2.0376546382904053, + "learning_rate": 4.843307858048982e-05, + "loss": 5.087, + "step": 19051 + }, + { + "epoch": 0.11330764106955943, + "grad_norm": 2.2038021087646484, + "learning_rate": 4.8432915810604516e-05, + "loss": 4.951, + "step": 19052 + }, + { + "epoch": 0.11331358835284042, + "grad_norm": 1.8985834121704102, + "learning_rate": 4.843275303253903e-05, + "loss": 5.522, + "step": 19053 + }, + { + "epoch": 0.11331953563612142, + "grad_norm": 1.9047077894210815, + "learning_rate": 4.8432590246293404e-05, + "loss": 5.8387, + "step": 19054 + }, + { + "epoch": 0.11332548291940242, + "grad_norm": 1.508352279663086, + "learning_rate": 4.8432427451867704e-05, + "loss": 5.7969, + "step": 19055 + }, + { + "epoch": 0.11333143020268341, + "grad_norm": 1.631695032119751, + "learning_rate": 4.8432264649261984e-05, + "loss": 5.3562, + "step": 19056 + }, + { + "epoch": 0.11333737748596441, + "grad_norm": 1.673411250114441, + "learning_rate": 4.8432101838476305e-05, + "loss": 5.3286, + "step": 19057 + }, + { + "epoch": 0.11334332476924541, + "grad_norm": 2.697946071624756, + "learning_rate": 4.843193901951072e-05, + "loss": 5.0634, + "step": 19058 + }, + { + "epoch": 0.1133492720525264, + "grad_norm": 2.5914673805236816, + "learning_rate": 4.843177619236529e-05, + "loss": 4.8294, + "step": 19059 + }, + { + "epoch": 0.1133552193358074, + "grad_norm": 1.8503727912902832, + "learning_rate": 4.843161335704007e-05, + "loss": 5.1436, + "step": 19060 + }, + { + "epoch": 0.1133611666190884, + "grad_norm": 1.7629435062408447, + "learning_rate": 4.843145051353511e-05, + "loss": 5.1822, + "step": 19061 + }, + { + "epoch": 0.11336711390236939, + "grad_norm": 1.826360821723938, + "learning_rate": 4.843128766185048e-05, + "loss": 5.5151, + "step": 19062 + }, + { + "epoch": 0.1133730611856504, + "grad_norm": 2.0347046852111816, + "learning_rate": 4.843112480198623e-05, + "loss": 4.7732, + "step": 19063 + }, + { + "epoch": 0.1133790084689314, + "grad_norm": 2.037482738494873, + "learning_rate": 4.843096193394241e-05, + "loss": 4.6475, + "step": 19064 + }, + { + "epoch": 0.11338495575221239, + "grad_norm": 2.1152050495147705, + "learning_rate": 4.8430799057719076e-05, + "loss": 4.531, + "step": 19065 + }, + { + "epoch": 0.11339090303549339, + "grad_norm": 2.303982734680176, + "learning_rate": 4.8430636173316306e-05, + "loss": 4.6317, + "step": 19066 + }, + { + "epoch": 0.11339685031877439, + "grad_norm": 2.3326570987701416, + "learning_rate": 4.843047328073414e-05, + "loss": 4.736, + "step": 19067 + }, + { + "epoch": 0.11340279760205538, + "grad_norm": 2.371316909790039, + "learning_rate": 4.8430310379972634e-05, + "loss": 4.806, + "step": 19068 + }, + { + "epoch": 0.11340874488533638, + "grad_norm": 2.5370912551879883, + "learning_rate": 4.8430147471031855e-05, + "loss": 4.7867, + "step": 19069 + }, + { + "epoch": 0.11341469216861738, + "grad_norm": 2.456982135772705, + "learning_rate": 4.842998455391185e-05, + "loss": 4.6942, + "step": 19070 + }, + { + "epoch": 0.11342063945189837, + "grad_norm": 2.526287078857422, + "learning_rate": 4.842982162861268e-05, + "loss": 4.7333, + "step": 19071 + }, + { + "epoch": 0.11342658673517937, + "grad_norm": 2.2763514518737793, + "learning_rate": 4.84296586951344e-05, + "loss": 4.712, + "step": 19072 + }, + { + "epoch": 0.11343253401846037, + "grad_norm": 2.330958366394043, + "learning_rate": 4.842949575347707e-05, + "loss": 4.5875, + "step": 19073 + }, + { + "epoch": 0.11343848130174136, + "grad_norm": 2.390018939971924, + "learning_rate": 4.8429332803640745e-05, + "loss": 4.6941, + "step": 19074 + }, + { + "epoch": 0.11344442858502236, + "grad_norm": 2.279719829559326, + "learning_rate": 4.842916984562548e-05, + "loss": 4.6216, + "step": 19075 + }, + { + "epoch": 0.11345037586830337, + "grad_norm": 2.2815043926239014, + "learning_rate": 4.842900687943133e-05, + "loss": 4.5667, + "step": 19076 + }, + { + "epoch": 0.11345632315158435, + "grad_norm": 2.301231861114502, + "learning_rate": 4.842884390505836e-05, + "loss": 4.5451, + "step": 19077 + }, + { + "epoch": 0.11346227043486536, + "grad_norm": 2.1763200759887695, + "learning_rate": 4.842868092250662e-05, + "loss": 4.5937, + "step": 19078 + }, + { + "epoch": 0.11346821771814634, + "grad_norm": 2.2151448726654053, + "learning_rate": 4.842851793177618e-05, + "loss": 4.8341, + "step": 19079 + }, + { + "epoch": 0.11347416500142735, + "grad_norm": 2.3094639778137207, + "learning_rate": 4.8428354932867085e-05, + "loss": 4.7308, + "step": 19080 + }, + { + "epoch": 0.11348011228470835, + "grad_norm": 1.5218987464904785, + "learning_rate": 4.8428191925779385e-05, + "loss": 5.2701, + "step": 19081 + }, + { + "epoch": 0.11348605956798934, + "grad_norm": 1.3781639337539673, + "learning_rate": 4.842802891051315e-05, + "loss": 5.6873, + "step": 19082 + }, + { + "epoch": 0.11349200685127034, + "grad_norm": 1.814702033996582, + "learning_rate": 4.842786588706842e-05, + "loss": 5.7713, + "step": 19083 + }, + { + "epoch": 0.11349795413455134, + "grad_norm": 1.5691754817962646, + "learning_rate": 4.842770285544528e-05, + "loss": 5.7115, + "step": 19084 + }, + { + "epoch": 0.11350390141783233, + "grad_norm": 1.962762713432312, + "learning_rate": 4.8427539815643766e-05, + "loss": 5.4159, + "step": 19085 + }, + { + "epoch": 0.11350984870111333, + "grad_norm": 1.6766527891159058, + "learning_rate": 4.842737676766393e-05, + "loss": 5.6007, + "step": 19086 + }, + { + "epoch": 0.11351579598439433, + "grad_norm": 1.782934308052063, + "learning_rate": 4.8427213711505844e-05, + "loss": 5.982, + "step": 19087 + }, + { + "epoch": 0.11352174326767532, + "grad_norm": 1.5706422328948975, + "learning_rate": 4.842705064716957e-05, + "loss": 5.5125, + "step": 19088 + }, + { + "epoch": 0.11352769055095632, + "grad_norm": 2.4957141876220703, + "learning_rate": 4.842688757465515e-05, + "loss": 4.5386, + "step": 19089 + }, + { + "epoch": 0.11353363783423732, + "grad_norm": 2.1444833278656006, + "learning_rate": 4.842672449396264e-05, + "loss": 4.6108, + "step": 19090 + }, + { + "epoch": 0.11353958511751831, + "grad_norm": 2.4586305618286133, + "learning_rate": 4.8426561405092106e-05, + "loss": 4.7453, + "step": 19091 + }, + { + "epoch": 0.11354553240079931, + "grad_norm": 2.228759765625, + "learning_rate": 4.8426398308043605e-05, + "loss": 4.662, + "step": 19092 + }, + { + "epoch": 0.11355147968408032, + "grad_norm": 2.029172420501709, + "learning_rate": 4.8426235202817184e-05, + "loss": 4.6389, + "step": 19093 + }, + { + "epoch": 0.1135574269673613, + "grad_norm": 2.1887340545654297, + "learning_rate": 4.842607208941291e-05, + "loss": 4.6852, + "step": 19094 + }, + { + "epoch": 0.11356337425064231, + "grad_norm": 1.7664849758148193, + "learning_rate": 4.842590896783084e-05, + "loss": 5.2435, + "step": 19095 + }, + { + "epoch": 0.11356932153392331, + "grad_norm": 1.5581247806549072, + "learning_rate": 4.8425745838071016e-05, + "loss": 5.6828, + "step": 19096 + }, + { + "epoch": 0.1135752688172043, + "grad_norm": 1.570602297782898, + "learning_rate": 4.842558270013352e-05, + "loss": 5.7011, + "step": 19097 + }, + { + "epoch": 0.1135812161004853, + "grad_norm": 1.4669830799102783, + "learning_rate": 4.842541955401838e-05, + "loss": 5.4361, + "step": 19098 + }, + { + "epoch": 0.1135871633837663, + "grad_norm": 1.199173927307129, + "learning_rate": 4.842525639972568e-05, + "loss": 5.5198, + "step": 19099 + }, + { + "epoch": 0.11359311066704729, + "grad_norm": 1.1747777462005615, + "learning_rate": 4.842509323725546e-05, + "loss": 5.6252, + "step": 19100 + }, + { + "epoch": 0.11359905795032829, + "grad_norm": 1.4497981071472168, + "learning_rate": 4.8424930066607784e-05, + "loss": 5.4295, + "step": 19101 + }, + { + "epoch": 0.1136050052336093, + "grad_norm": 1.485688328742981, + "learning_rate": 4.8424766887782704e-05, + "loss": 5.1248, + "step": 19102 + }, + { + "epoch": 0.11361095251689028, + "grad_norm": 1.419149398803711, + "learning_rate": 4.842460370078028e-05, + "loss": 5.0604, + "step": 19103 + }, + { + "epoch": 0.11361689980017128, + "grad_norm": 1.622096300125122, + "learning_rate": 4.842444050560058e-05, + "loss": 5.4429, + "step": 19104 + }, + { + "epoch": 0.11362284708345229, + "grad_norm": 1.2471072673797607, + "learning_rate": 4.8424277302243636e-05, + "loss": 5.3636, + "step": 19105 + }, + { + "epoch": 0.11362879436673327, + "grad_norm": 1.3416316509246826, + "learning_rate": 4.842411409070952e-05, + "loss": 5.1415, + "step": 19106 + }, + { + "epoch": 0.11363474165001428, + "grad_norm": 1.3691420555114746, + "learning_rate": 4.8423950870998293e-05, + "loss": 5.3286, + "step": 19107 + }, + { + "epoch": 0.11364068893329526, + "grad_norm": 1.2382487058639526, + "learning_rate": 4.842378764311e-05, + "loss": 5.4391, + "step": 19108 + }, + { + "epoch": 0.11364663621657627, + "grad_norm": 1.1729276180267334, + "learning_rate": 4.842362440704471e-05, + "loss": 5.4158, + "step": 19109 + }, + { + "epoch": 0.11365258349985727, + "grad_norm": 1.2451897859573364, + "learning_rate": 4.842346116280247e-05, + "loss": 5.2487, + "step": 19110 + }, + { + "epoch": 0.11365853078313826, + "grad_norm": 1.255652666091919, + "learning_rate": 4.8423297910383354e-05, + "loss": 5.2759, + "step": 19111 + }, + { + "epoch": 0.11366447806641926, + "grad_norm": 1.170296549797058, + "learning_rate": 4.8423134649787394e-05, + "loss": 5.1508, + "step": 19112 + }, + { + "epoch": 0.11367042534970026, + "grad_norm": 1.3954061269760132, + "learning_rate": 4.842297138101467e-05, + "loss": 5.3102, + "step": 19113 + }, + { + "epoch": 0.11367637263298125, + "grad_norm": 1.2746593952178955, + "learning_rate": 4.842280810406522e-05, + "loss": 5.2587, + "step": 19114 + }, + { + "epoch": 0.11368231991626225, + "grad_norm": 1.3224173784255981, + "learning_rate": 4.8422644818939114e-05, + "loss": 5.1927, + "step": 19115 + }, + { + "epoch": 0.11368826719954325, + "grad_norm": 1.0930812358856201, + "learning_rate": 4.84224815256364e-05, + "loss": 5.1676, + "step": 19116 + }, + { + "epoch": 0.11369421448282424, + "grad_norm": 1.3805547952651978, + "learning_rate": 4.842231822415715e-05, + "loss": 5.066, + "step": 19117 + }, + { + "epoch": 0.11370016176610524, + "grad_norm": 1.3455450534820557, + "learning_rate": 4.84221549145014e-05, + "loss": 4.9656, + "step": 19118 + }, + { + "epoch": 0.11370610904938624, + "grad_norm": 1.442218542098999, + "learning_rate": 4.842199159666922e-05, + "loss": 4.9094, + "step": 19119 + }, + { + "epoch": 0.11371205633266723, + "grad_norm": 1.435941457748413, + "learning_rate": 4.8421828270660665e-05, + "loss": 5.1035, + "step": 19120 + }, + { + "epoch": 0.11371800361594823, + "grad_norm": 1.2507586479187012, + "learning_rate": 4.84216649364758e-05, + "loss": 5.2395, + "step": 19121 + }, + { + "epoch": 0.11372395089922924, + "grad_norm": 1.3616739511489868, + "learning_rate": 4.842150159411466e-05, + "loss": 5.2082, + "step": 19122 + }, + { + "epoch": 0.11372989818251023, + "grad_norm": 1.2988322973251343, + "learning_rate": 4.842133824357732e-05, + "loss": 5.1271, + "step": 19123 + }, + { + "epoch": 0.11373584546579123, + "grad_norm": 1.2761636972427368, + "learning_rate": 4.842117488486384e-05, + "loss": 5.1724, + "step": 19124 + }, + { + "epoch": 0.11374179274907223, + "grad_norm": 1.2834585905075073, + "learning_rate": 4.842101151797426e-05, + "loss": 5.2256, + "step": 19125 + }, + { + "epoch": 0.11374774003235322, + "grad_norm": 1.2074506282806396, + "learning_rate": 4.8420848142908655e-05, + "loss": 5.2704, + "step": 19126 + }, + { + "epoch": 0.11375368731563422, + "grad_norm": 1.355292797088623, + "learning_rate": 4.842068475966707e-05, + "loss": 5.1109, + "step": 19127 + }, + { + "epoch": 0.11375963459891522, + "grad_norm": 1.1144691705703735, + "learning_rate": 4.8420521368249565e-05, + "loss": 5.0903, + "step": 19128 + }, + { + "epoch": 0.11376558188219621, + "grad_norm": 1.3889878988265991, + "learning_rate": 4.84203579686562e-05, + "loss": 5.1289, + "step": 19129 + }, + { + "epoch": 0.11377152916547721, + "grad_norm": 1.1302597522735596, + "learning_rate": 4.8420194560887035e-05, + "loss": 4.9211, + "step": 19130 + }, + { + "epoch": 0.11377747644875821, + "grad_norm": 1.1715654134750366, + "learning_rate": 4.8420031144942115e-05, + "loss": 5.2239, + "step": 19131 + }, + { + "epoch": 0.1137834237320392, + "grad_norm": 1.327021837234497, + "learning_rate": 4.84198677208215e-05, + "loss": 5.2941, + "step": 19132 + }, + { + "epoch": 0.1137893710153202, + "grad_norm": 1.3442116975784302, + "learning_rate": 4.841970428852526e-05, + "loss": 5.1752, + "step": 19133 + }, + { + "epoch": 0.1137953182986012, + "grad_norm": 1.207207202911377, + "learning_rate": 4.841954084805344e-05, + "loss": 4.9607, + "step": 19134 + }, + { + "epoch": 0.1138012655818822, + "grad_norm": 1.1609065532684326, + "learning_rate": 4.8419377399406104e-05, + "loss": 5.0458, + "step": 19135 + }, + { + "epoch": 0.1138072128651632, + "grad_norm": 1.365605115890503, + "learning_rate": 4.84192139425833e-05, + "loss": 5.0884, + "step": 19136 + }, + { + "epoch": 0.11381316014844418, + "grad_norm": 1.5192269086837769, + "learning_rate": 4.8419050477585096e-05, + "loss": 5.4803, + "step": 19137 + }, + { + "epoch": 0.11381910743172519, + "grad_norm": 1.187456488609314, + "learning_rate": 4.841888700441153e-05, + "loss": 5.4595, + "step": 19138 + }, + { + "epoch": 0.11382505471500619, + "grad_norm": 1.1836395263671875, + "learning_rate": 4.841872352306268e-05, + "loss": 5.27, + "step": 19139 + }, + { + "epoch": 0.11383100199828718, + "grad_norm": 1.353762149810791, + "learning_rate": 4.841856003353861e-05, + "loss": 5.4646, + "step": 19140 + }, + { + "epoch": 0.11383694928156818, + "grad_norm": 1.4854416847229004, + "learning_rate": 4.8418396535839344e-05, + "loss": 5.2894, + "step": 19141 + }, + { + "epoch": 0.11384289656484918, + "grad_norm": 1.3731143474578857, + "learning_rate": 4.841823302996496e-05, + "loss": 4.7512, + "step": 19142 + }, + { + "epoch": 0.11384884384813017, + "grad_norm": 1.3945658206939697, + "learning_rate": 4.841806951591552e-05, + "loss": 4.9625, + "step": 19143 + }, + { + "epoch": 0.11385479113141117, + "grad_norm": 1.2692869901657104, + "learning_rate": 4.841790599369107e-05, + "loss": 5.2245, + "step": 19144 + }, + { + "epoch": 0.11386073841469217, + "grad_norm": 1.3667423725128174, + "learning_rate": 4.8417742463291674e-05, + "loss": 5.202, + "step": 19145 + }, + { + "epoch": 0.11386668569797316, + "grad_norm": 1.2639939785003662, + "learning_rate": 4.8417578924717377e-05, + "loss": 5.4378, + "step": 19146 + }, + { + "epoch": 0.11387263298125416, + "grad_norm": 1.327867865562439, + "learning_rate": 4.8417415377968255e-05, + "loss": 5.1632, + "step": 19147 + }, + { + "epoch": 0.11387858026453516, + "grad_norm": 1.2095093727111816, + "learning_rate": 4.841725182304435e-05, + "loss": 4.9969, + "step": 19148 + }, + { + "epoch": 0.11388452754781615, + "grad_norm": 1.3395425081253052, + "learning_rate": 4.841708825994573e-05, + "loss": 5.1797, + "step": 19149 + }, + { + "epoch": 0.11389047483109715, + "grad_norm": 1.4817496538162231, + "learning_rate": 4.841692468867244e-05, + "loss": 5.1126, + "step": 19150 + }, + { + "epoch": 0.11389642211437816, + "grad_norm": 1.3066308498382568, + "learning_rate": 4.8416761109224547e-05, + "loss": 5.2692, + "step": 19151 + }, + { + "epoch": 0.11390236939765915, + "grad_norm": 1.444701075553894, + "learning_rate": 4.84165975216021e-05, + "loss": 5.0525, + "step": 19152 + }, + { + "epoch": 0.11390831668094015, + "grad_norm": 1.2720032930374146, + "learning_rate": 4.8416433925805165e-05, + "loss": 5.138, + "step": 19153 + }, + { + "epoch": 0.11391426396422115, + "grad_norm": 1.2228437662124634, + "learning_rate": 4.84162703218338e-05, + "loss": 5.028, + "step": 19154 + }, + { + "epoch": 0.11392021124750214, + "grad_norm": 1.1950013637542725, + "learning_rate": 4.841610670968805e-05, + "loss": 5.0873, + "step": 19155 + }, + { + "epoch": 0.11392615853078314, + "grad_norm": 1.3538236618041992, + "learning_rate": 4.8415943089367976e-05, + "loss": 5.0039, + "step": 19156 + }, + { + "epoch": 0.11393210581406414, + "grad_norm": 1.3344488143920898, + "learning_rate": 4.841577946087364e-05, + "loss": 5.0215, + "step": 19157 + }, + { + "epoch": 0.11393805309734513, + "grad_norm": 1.7098866701126099, + "learning_rate": 4.841561582420511e-05, + "loss": 5.5719, + "step": 19158 + }, + { + "epoch": 0.11394400038062613, + "grad_norm": 1.3574185371398926, + "learning_rate": 4.841545217936241e-05, + "loss": 4.8491, + "step": 19159 + }, + { + "epoch": 0.11394994766390713, + "grad_norm": 1.447292447090149, + "learning_rate": 4.8415288526345634e-05, + "loss": 4.8632, + "step": 19160 + }, + { + "epoch": 0.11395589494718812, + "grad_norm": 1.6439673900604248, + "learning_rate": 4.841512486515481e-05, + "loss": 5.282, + "step": 19161 + }, + { + "epoch": 0.11396184223046912, + "grad_norm": 1.3063132762908936, + "learning_rate": 4.841496119579002e-05, + "loss": 5.0399, + "step": 19162 + }, + { + "epoch": 0.11396778951375013, + "grad_norm": 1.4244173765182495, + "learning_rate": 4.8414797518251296e-05, + "loss": 4.7731, + "step": 19163 + }, + { + "epoch": 0.11397373679703111, + "grad_norm": 1.225203514099121, + "learning_rate": 4.841463383253872e-05, + "loss": 4.8294, + "step": 19164 + }, + { + "epoch": 0.11397968408031212, + "grad_norm": 1.2978007793426514, + "learning_rate": 4.8414470138652334e-05, + "loss": 4.6336, + "step": 19165 + }, + { + "epoch": 0.1139856313635931, + "grad_norm": 1.306591272354126, + "learning_rate": 4.8414306436592194e-05, + "loss": 4.8267, + "step": 19166 + }, + { + "epoch": 0.1139915786468741, + "grad_norm": 1.1227960586547852, + "learning_rate": 4.841414272635837e-05, + "loss": 4.7438, + "step": 19167 + }, + { + "epoch": 0.11399752593015511, + "grad_norm": 1.3674911260604858, + "learning_rate": 4.8413979007950905e-05, + "loss": 4.8127, + "step": 19168 + }, + { + "epoch": 0.1140034732134361, + "grad_norm": 1.3923397064208984, + "learning_rate": 4.841381528136986e-05, + "loss": 5.1568, + "step": 19169 + }, + { + "epoch": 0.1140094204967171, + "grad_norm": 1.2014738321304321, + "learning_rate": 4.84136515466153e-05, + "loss": 5.0116, + "step": 19170 + }, + { + "epoch": 0.1140153677799981, + "grad_norm": 1.3564008474349976, + "learning_rate": 4.841348780368726e-05, + "loss": 5.1181, + "step": 19171 + }, + { + "epoch": 0.11402131506327909, + "grad_norm": 1.1918834447860718, + "learning_rate": 4.841332405258583e-05, + "loss": 5.0854, + "step": 19172 + }, + { + "epoch": 0.11402726234656009, + "grad_norm": 1.2056841850280762, + "learning_rate": 4.8413160293311047e-05, + "loss": 4.825, + "step": 19173 + }, + { + "epoch": 0.11403320962984109, + "grad_norm": 1.3841508626937866, + "learning_rate": 4.841299652586298e-05, + "loss": 4.7543, + "step": 19174 + }, + { + "epoch": 0.11403915691312208, + "grad_norm": 1.511307716369629, + "learning_rate": 4.841283275024166e-05, + "loss": 4.9821, + "step": 19175 + }, + { + "epoch": 0.11404510419640308, + "grad_norm": 1.2577831745147705, + "learning_rate": 4.8412668966447175e-05, + "loss": 5.0138, + "step": 19176 + }, + { + "epoch": 0.11405105147968408, + "grad_norm": 1.442159652709961, + "learning_rate": 4.841250517447956e-05, + "loss": 5.0066, + "step": 19177 + }, + { + "epoch": 0.11405699876296507, + "grad_norm": 1.3029484748840332, + "learning_rate": 4.841234137433889e-05, + "loss": 4.9229, + "step": 19178 + }, + { + "epoch": 0.11406294604624607, + "grad_norm": 1.3138917684555054, + "learning_rate": 4.841217756602521e-05, + "loss": 4.6262, + "step": 19179 + }, + { + "epoch": 0.11406889332952708, + "grad_norm": 1.2164885997772217, + "learning_rate": 4.841201374953857e-05, + "loss": 4.7952, + "step": 19180 + }, + { + "epoch": 0.11407484061280806, + "grad_norm": 1.4247347116470337, + "learning_rate": 4.8411849924879046e-05, + "loss": 5.0066, + "step": 19181 + }, + { + "epoch": 0.11408078789608907, + "grad_norm": 1.236006736755371, + "learning_rate": 4.8411686092046695e-05, + "loss": 4.6585, + "step": 19182 + }, + { + "epoch": 0.11408673517937007, + "grad_norm": 1.2381118535995483, + "learning_rate": 4.841152225104156e-05, + "loss": 5.0935, + "step": 19183 + }, + { + "epoch": 0.11409268246265106, + "grad_norm": 1.3557883501052856, + "learning_rate": 4.84113584018637e-05, + "loss": 5.1536, + "step": 19184 + }, + { + "epoch": 0.11409862974593206, + "grad_norm": 1.3191505670547485, + "learning_rate": 4.8411194544513184e-05, + "loss": 5.2857, + "step": 19185 + }, + { + "epoch": 0.11410457702921306, + "grad_norm": 1.2058855295181274, + "learning_rate": 4.841103067899006e-05, + "loss": 5.142, + "step": 19186 + }, + { + "epoch": 0.11411052431249405, + "grad_norm": 1.163136601448059, + "learning_rate": 4.8410866805294384e-05, + "loss": 5.1891, + "step": 19187 + }, + { + "epoch": 0.11411647159577505, + "grad_norm": 1.3245770931243896, + "learning_rate": 4.841070292342622e-05, + "loss": 5.0629, + "step": 19188 + }, + { + "epoch": 0.11412241887905605, + "grad_norm": 1.13837730884552, + "learning_rate": 4.841053903338562e-05, + "loss": 5.1045, + "step": 19189 + }, + { + "epoch": 0.11412836616233704, + "grad_norm": 1.4724907875061035, + "learning_rate": 4.8410375135172646e-05, + "loss": 5.01, + "step": 19190 + }, + { + "epoch": 0.11413431344561804, + "grad_norm": 1.3786016702651978, + "learning_rate": 4.841021122878735e-05, + "loss": 5.0188, + "step": 19191 + }, + { + "epoch": 0.11414026072889905, + "grad_norm": 1.2996101379394531, + "learning_rate": 4.841004731422979e-05, + "loss": 4.954, + "step": 19192 + }, + { + "epoch": 0.11414620801218003, + "grad_norm": 1.297892451286316, + "learning_rate": 4.840988339150002e-05, + "loss": 4.9841, + "step": 19193 + }, + { + "epoch": 0.11415215529546104, + "grad_norm": 1.3011624813079834, + "learning_rate": 4.84097194605981e-05, + "loss": 4.8547, + "step": 19194 + }, + { + "epoch": 0.11415810257874202, + "grad_norm": 1.2169194221496582, + "learning_rate": 4.8409555521524096e-05, + "loss": 4.8801, + "step": 19195 + }, + { + "epoch": 0.11416404986202303, + "grad_norm": 1.4189658164978027, + "learning_rate": 4.8409391574278065e-05, + "loss": 4.9521, + "step": 19196 + }, + { + "epoch": 0.11416999714530403, + "grad_norm": 1.4178590774536133, + "learning_rate": 4.840922761886004e-05, + "loss": 4.7847, + "step": 19197 + }, + { + "epoch": 0.11417594442858502, + "grad_norm": 1.395585536956787, + "learning_rate": 4.8409063655270105e-05, + "loss": 5.0404, + "step": 19198 + }, + { + "epoch": 0.11418189171186602, + "grad_norm": 1.4803121089935303, + "learning_rate": 4.840889968350831e-05, + "loss": 4.8851, + "step": 19199 + }, + { + "epoch": 0.11418783899514702, + "grad_norm": 1.4736177921295166, + "learning_rate": 4.84087357035747e-05, + "loss": 4.9127, + "step": 19200 + }, + { + "epoch": 0.11419378627842801, + "grad_norm": 1.2947148084640503, + "learning_rate": 4.8408571715469354e-05, + "loss": 4.9169, + "step": 19201 + }, + { + "epoch": 0.11419973356170901, + "grad_norm": 1.2428392171859741, + "learning_rate": 4.840840771919232e-05, + "loss": 5.2759, + "step": 19202 + }, + { + "epoch": 0.11420568084499001, + "grad_norm": 1.2743968963623047, + "learning_rate": 4.840824371474364e-05, + "loss": 5.2273, + "step": 19203 + }, + { + "epoch": 0.114211628128271, + "grad_norm": 1.3068950176239014, + "learning_rate": 4.840807970212339e-05, + "loss": 5.3455, + "step": 19204 + }, + { + "epoch": 0.114217575411552, + "grad_norm": 1.2238211631774902, + "learning_rate": 4.8407915681331614e-05, + "loss": 5.024, + "step": 19205 + }, + { + "epoch": 0.114223522694833, + "grad_norm": 1.1461126804351807, + "learning_rate": 4.8407751652368384e-05, + "loss": 5.2113, + "step": 19206 + }, + { + "epoch": 0.11422946997811399, + "grad_norm": 1.2286972999572754, + "learning_rate": 4.840758761523375e-05, + "loss": 5.006, + "step": 19207 + }, + { + "epoch": 0.114235417261395, + "grad_norm": 1.3054790496826172, + "learning_rate": 4.840742356992777e-05, + "loss": 5.0592, + "step": 19208 + }, + { + "epoch": 0.114241364544676, + "grad_norm": 1.2426046133041382, + "learning_rate": 4.84072595164505e-05, + "loss": 5.1058, + "step": 19209 + }, + { + "epoch": 0.11424731182795698, + "grad_norm": 1.325263261795044, + "learning_rate": 4.840709545480199e-05, + "loss": 5.0528, + "step": 19210 + }, + { + "epoch": 0.11425325911123799, + "grad_norm": 1.1753286123275757, + "learning_rate": 4.840693138498231e-05, + "loss": 5.2193, + "step": 19211 + }, + { + "epoch": 0.11425920639451899, + "grad_norm": 1.486204743385315, + "learning_rate": 4.8406767306991515e-05, + "loss": 5.0389, + "step": 19212 + }, + { + "epoch": 0.11426515367779998, + "grad_norm": 1.344887614250183, + "learning_rate": 4.8406603220829655e-05, + "loss": 5.0072, + "step": 19213 + }, + { + "epoch": 0.11427110096108098, + "grad_norm": 1.270340919494629, + "learning_rate": 4.840643912649679e-05, + "loss": 5.0154, + "step": 19214 + }, + { + "epoch": 0.11427704824436198, + "grad_norm": 1.390960454940796, + "learning_rate": 4.8406275023992983e-05, + "loss": 5.0803, + "step": 19215 + }, + { + "epoch": 0.11428299552764297, + "grad_norm": 1.2927583456039429, + "learning_rate": 4.8406110913318294e-05, + "loss": 5.04, + "step": 19216 + }, + { + "epoch": 0.11428894281092397, + "grad_norm": 1.3101180791854858, + "learning_rate": 4.840594679447275e-05, + "loss": 4.9988, + "step": 19217 + }, + { + "epoch": 0.11429489009420497, + "grad_norm": 1.2187588214874268, + "learning_rate": 4.8405782667456454e-05, + "loss": 5.1006, + "step": 19218 + }, + { + "epoch": 0.11430083737748596, + "grad_norm": 1.3578346967697144, + "learning_rate": 4.840561853226944e-05, + "loss": 5.0528, + "step": 19219 + }, + { + "epoch": 0.11430678466076696, + "grad_norm": 1.8960474729537964, + "learning_rate": 4.840545438891176e-05, + "loss": 5.323, + "step": 19220 + }, + { + "epoch": 0.11431273194404797, + "grad_norm": 1.3410239219665527, + "learning_rate": 4.840529023738348e-05, + "loss": 5.1488, + "step": 19221 + }, + { + "epoch": 0.11431867922732895, + "grad_norm": 1.381373405456543, + "learning_rate": 4.840512607768465e-05, + "loss": 5.1477, + "step": 19222 + }, + { + "epoch": 0.11432462651060996, + "grad_norm": 1.4095546007156372, + "learning_rate": 4.8404961909815336e-05, + "loss": 5.1515, + "step": 19223 + }, + { + "epoch": 0.11433057379389094, + "grad_norm": 1.254451870918274, + "learning_rate": 4.840479773377559e-05, + "loss": 5.1276, + "step": 19224 + }, + { + "epoch": 0.11433652107717195, + "grad_norm": 1.3001519441604614, + "learning_rate": 4.840463354956548e-05, + "loss": 5.1561, + "step": 19225 + }, + { + "epoch": 0.11434246836045295, + "grad_norm": 1.231469750404358, + "learning_rate": 4.840446935718505e-05, + "loss": 4.963, + "step": 19226 + }, + { + "epoch": 0.11434841564373394, + "grad_norm": 1.323225736618042, + "learning_rate": 4.840430515663435e-05, + "loss": 5.0998, + "step": 19227 + }, + { + "epoch": 0.11435436292701494, + "grad_norm": 1.2244281768798828, + "learning_rate": 4.8404140947913456e-05, + "loss": 5.0727, + "step": 19228 + }, + { + "epoch": 0.11436031021029594, + "grad_norm": 1.2634974718093872, + "learning_rate": 4.840397673102242e-05, + "loss": 5.2049, + "step": 19229 + }, + { + "epoch": 0.11436625749357693, + "grad_norm": 1.5431766510009766, + "learning_rate": 4.84038125059613e-05, + "loss": 5.1387, + "step": 19230 + }, + { + "epoch": 0.11437220477685793, + "grad_norm": 1.485696792602539, + "learning_rate": 4.8403648272730145e-05, + "loss": 4.7971, + "step": 19231 + }, + { + "epoch": 0.11437815206013893, + "grad_norm": 1.4774583578109741, + "learning_rate": 4.840348403132902e-05, + "loss": 4.8967, + "step": 19232 + }, + { + "epoch": 0.11438409934341992, + "grad_norm": 1.1903584003448486, + "learning_rate": 4.840331978175798e-05, + "loss": 4.8827, + "step": 19233 + }, + { + "epoch": 0.11439004662670092, + "grad_norm": 1.3851109743118286, + "learning_rate": 4.840315552401708e-05, + "loss": 4.8348, + "step": 19234 + }, + { + "epoch": 0.11439599390998192, + "grad_norm": 1.3834025859832764, + "learning_rate": 4.840299125810639e-05, + "loss": 4.9392, + "step": 19235 + }, + { + "epoch": 0.11440194119326291, + "grad_norm": 1.2576985359191895, + "learning_rate": 4.840282698402595e-05, + "loss": 4.9092, + "step": 19236 + }, + { + "epoch": 0.11440788847654391, + "grad_norm": 1.2408863306045532, + "learning_rate": 4.840266270177583e-05, + "loss": 4.9041, + "step": 19237 + }, + { + "epoch": 0.11441383575982492, + "grad_norm": 1.4397286176681519, + "learning_rate": 4.840249841135608e-05, + "loss": 4.9588, + "step": 19238 + }, + { + "epoch": 0.1144197830431059, + "grad_norm": 1.3446424007415771, + "learning_rate": 4.840233411276676e-05, + "loss": 4.9757, + "step": 19239 + }, + { + "epoch": 0.1144257303263869, + "grad_norm": 1.2520800828933716, + "learning_rate": 4.840216980600793e-05, + "loss": 4.9746, + "step": 19240 + }, + { + "epoch": 0.11443167760966791, + "grad_norm": 1.2509692907333374, + "learning_rate": 4.840200549107963e-05, + "loss": 5.063, + "step": 19241 + }, + { + "epoch": 0.1144376248929489, + "grad_norm": 1.3295235633850098, + "learning_rate": 4.840184116798194e-05, + "loss": 5.02, + "step": 19242 + }, + { + "epoch": 0.1144435721762299, + "grad_norm": 1.3346072435379028, + "learning_rate": 4.8401676836714916e-05, + "loss": 5.0393, + "step": 19243 + }, + { + "epoch": 0.1144495194595109, + "grad_norm": 1.6711392402648926, + "learning_rate": 4.84015124972786e-05, + "loss": 5.0856, + "step": 19244 + }, + { + "epoch": 0.11445546674279189, + "grad_norm": 1.2785863876342773, + "learning_rate": 4.8401348149673065e-05, + "loss": 5.1181, + "step": 19245 + }, + { + "epoch": 0.11446141402607289, + "grad_norm": 1.4998282194137573, + "learning_rate": 4.8401183793898354e-05, + "loss": 5.0101, + "step": 19246 + }, + { + "epoch": 0.1144673613093539, + "grad_norm": 1.4768141508102417, + "learning_rate": 4.840101942995454e-05, + "loss": 4.8256, + "step": 19247 + }, + { + "epoch": 0.11447330859263488, + "grad_norm": 1.3829854726791382, + "learning_rate": 4.840085505784167e-05, + "loss": 4.8298, + "step": 19248 + }, + { + "epoch": 0.11447925587591588, + "grad_norm": 1.2079180479049683, + "learning_rate": 4.840069067755979e-05, + "loss": 4.9054, + "step": 19249 + }, + { + "epoch": 0.11448520315919689, + "grad_norm": 1.464245080947876, + "learning_rate": 4.8400526289108984e-05, + "loss": 4.8943, + "step": 19250 + }, + { + "epoch": 0.11449115044247787, + "grad_norm": 1.400992512702942, + "learning_rate": 4.840036189248929e-05, + "loss": 4.754, + "step": 19251 + }, + { + "epoch": 0.11449709772575888, + "grad_norm": 1.41909921169281, + "learning_rate": 4.840019748770077e-05, + "loss": 4.9179, + "step": 19252 + }, + { + "epoch": 0.11450304500903986, + "grad_norm": 1.3990073204040527, + "learning_rate": 4.840003307474349e-05, + "loss": 4.7989, + "step": 19253 + }, + { + "epoch": 0.11450899229232087, + "grad_norm": 1.2858465909957886, + "learning_rate": 4.8399868653617497e-05, + "loss": 4.7556, + "step": 19254 + }, + { + "epoch": 0.11451493957560187, + "grad_norm": 1.2721470594406128, + "learning_rate": 4.8399704224322854e-05, + "loss": 4.8441, + "step": 19255 + }, + { + "epoch": 0.11452088685888286, + "grad_norm": 1.2352218627929688, + "learning_rate": 4.839953978685961e-05, + "loss": 4.753, + "step": 19256 + }, + { + "epoch": 0.11452683414216386, + "grad_norm": 1.3000402450561523, + "learning_rate": 4.8399375341227834e-05, + "loss": 4.7634, + "step": 19257 + }, + { + "epoch": 0.11453278142544486, + "grad_norm": 1.2934285402297974, + "learning_rate": 4.839921088742757e-05, + "loss": 4.8047, + "step": 19258 + }, + { + "epoch": 0.11453872870872585, + "grad_norm": 1.5773643255233765, + "learning_rate": 4.839904642545889e-05, + "loss": 4.8588, + "step": 19259 + }, + { + "epoch": 0.11454467599200685, + "grad_norm": 1.3872511386871338, + "learning_rate": 4.8398881955321844e-05, + "loss": 5.0781, + "step": 19260 + }, + { + "epoch": 0.11455062327528785, + "grad_norm": 1.403011679649353, + "learning_rate": 4.839871747701649e-05, + "loss": 5.1375, + "step": 19261 + }, + { + "epoch": 0.11455657055856884, + "grad_norm": 1.2086342573165894, + "learning_rate": 4.839855299054289e-05, + "loss": 5.1052, + "step": 19262 + }, + { + "epoch": 0.11456251784184984, + "grad_norm": 1.3916890621185303, + "learning_rate": 4.8398388495901085e-05, + "loss": 5.0687, + "step": 19263 + }, + { + "epoch": 0.11456846512513084, + "grad_norm": 1.4591625928878784, + "learning_rate": 4.839822399309115e-05, + "loss": 5.0098, + "step": 19264 + }, + { + "epoch": 0.11457441240841183, + "grad_norm": 1.3421653509140015, + "learning_rate": 4.839805948211314e-05, + "loss": 4.9511, + "step": 19265 + }, + { + "epoch": 0.11458035969169283, + "grad_norm": 1.3959892988204956, + "learning_rate": 4.83978949629671e-05, + "loss": 5.0206, + "step": 19266 + }, + { + "epoch": 0.11458630697497384, + "grad_norm": 1.3058884143829346, + "learning_rate": 4.839773043565311e-05, + "loss": 5.0885, + "step": 19267 + }, + { + "epoch": 0.11459225425825482, + "grad_norm": 1.452760100364685, + "learning_rate": 4.839756590017121e-05, + "loss": 4.9945, + "step": 19268 + }, + { + "epoch": 0.11459820154153583, + "grad_norm": 1.4445050954818726, + "learning_rate": 4.8397401356521454e-05, + "loss": 4.8128, + "step": 19269 + }, + { + "epoch": 0.11460414882481683, + "grad_norm": 1.2491203546524048, + "learning_rate": 4.8397236804703916e-05, + "loss": 4.7355, + "step": 19270 + }, + { + "epoch": 0.11461009610809782, + "grad_norm": 1.3198809623718262, + "learning_rate": 4.839707224471864e-05, + "loss": 4.7621, + "step": 19271 + }, + { + "epoch": 0.11461604339137882, + "grad_norm": 1.4831585884094238, + "learning_rate": 4.8396907676565686e-05, + "loss": 4.7393, + "step": 19272 + }, + { + "epoch": 0.11462199067465982, + "grad_norm": 1.2767844200134277, + "learning_rate": 4.839674310024512e-05, + "loss": 4.8063, + "step": 19273 + }, + { + "epoch": 0.11462793795794081, + "grad_norm": 1.4342589378356934, + "learning_rate": 4.839657851575698e-05, + "loss": 4.7615, + "step": 19274 + }, + { + "epoch": 0.11463388524122181, + "grad_norm": 1.30052649974823, + "learning_rate": 4.839641392310135e-05, + "loss": 4.7389, + "step": 19275 + }, + { + "epoch": 0.11463983252450281, + "grad_norm": 1.3592944145202637, + "learning_rate": 4.8396249322278266e-05, + "loss": 4.704, + "step": 19276 + }, + { + "epoch": 0.1146457798077838, + "grad_norm": 1.1905149221420288, + "learning_rate": 4.83960847132878e-05, + "loss": 4.7189, + "step": 19277 + }, + { + "epoch": 0.1146517270910648, + "grad_norm": 1.4920209646224976, + "learning_rate": 4.8395920096129996e-05, + "loss": 4.8844, + "step": 19278 + }, + { + "epoch": 0.1146576743743458, + "grad_norm": 1.486556887626648, + "learning_rate": 4.839575547080491e-05, + "loss": 4.9462, + "step": 19279 + }, + { + "epoch": 0.1146636216576268, + "grad_norm": 1.500434160232544, + "learning_rate": 4.839559083731262e-05, + "loss": 4.9118, + "step": 19280 + }, + { + "epoch": 0.1146695689409078, + "grad_norm": 1.5061683654785156, + "learning_rate": 4.839542619565317e-05, + "loss": 4.7921, + "step": 19281 + }, + { + "epoch": 0.11467551622418878, + "grad_norm": 1.587161660194397, + "learning_rate": 4.839526154582662e-05, + "loss": 5.1129, + "step": 19282 + }, + { + "epoch": 0.11468146350746979, + "grad_norm": 1.3225055932998657, + "learning_rate": 4.839509688783302e-05, + "loss": 4.8538, + "step": 19283 + }, + { + "epoch": 0.11468741079075079, + "grad_norm": 1.3121862411499023, + "learning_rate": 4.839493222167244e-05, + "loss": 4.8695, + "step": 19284 + }, + { + "epoch": 0.11469335807403178, + "grad_norm": 1.4202474355697632, + "learning_rate": 4.839476754734492e-05, + "loss": 4.8628, + "step": 19285 + }, + { + "epoch": 0.11469930535731278, + "grad_norm": 1.283316969871521, + "learning_rate": 4.8394602864850534e-05, + "loss": 4.8431, + "step": 19286 + }, + { + "epoch": 0.11470525264059378, + "grad_norm": 1.3255420923233032, + "learning_rate": 4.839443817418934e-05, + "loss": 4.9993, + "step": 19287 + }, + { + "epoch": 0.11471119992387477, + "grad_norm": 1.3569047451019287, + "learning_rate": 4.8394273475361386e-05, + "loss": 4.9478, + "step": 19288 + }, + { + "epoch": 0.11471714720715577, + "grad_norm": 1.2374382019042969, + "learning_rate": 4.839410876836673e-05, + "loss": 5.1119, + "step": 19289 + }, + { + "epoch": 0.11472309449043677, + "grad_norm": 1.3518184423446655, + "learning_rate": 4.839394405320543e-05, + "loss": 5.2506, + "step": 19290 + }, + { + "epoch": 0.11472904177371776, + "grad_norm": 1.2599278688430786, + "learning_rate": 4.839377932987755e-05, + "loss": 5.208, + "step": 19291 + }, + { + "epoch": 0.11473498905699876, + "grad_norm": 1.3122080564498901, + "learning_rate": 4.839361459838314e-05, + "loss": 5.2356, + "step": 19292 + }, + { + "epoch": 0.11474093634027976, + "grad_norm": 1.1587629318237305, + "learning_rate": 4.839344985872226e-05, + "loss": 5.2469, + "step": 19293 + }, + { + "epoch": 0.11474688362356075, + "grad_norm": 1.2733700275421143, + "learning_rate": 4.839328511089498e-05, + "loss": 5.2365, + "step": 19294 + }, + { + "epoch": 0.11475283090684175, + "grad_norm": 1.3206977844238281, + "learning_rate": 4.8393120354901334e-05, + "loss": 5.2242, + "step": 19295 + }, + { + "epoch": 0.11475877819012276, + "grad_norm": 1.1924374103546143, + "learning_rate": 4.83929555907414e-05, + "loss": 5.2916, + "step": 19296 + }, + { + "epoch": 0.11476472547340374, + "grad_norm": 1.2989557981491089, + "learning_rate": 4.8392790818415215e-05, + "loss": 5.173, + "step": 19297 + }, + { + "epoch": 0.11477067275668475, + "grad_norm": 1.3470929861068726, + "learning_rate": 4.839262603792286e-05, + "loss": 5.2309, + "step": 19298 + }, + { + "epoch": 0.11477662003996575, + "grad_norm": 1.1529438495635986, + "learning_rate": 4.8392461249264376e-05, + "loss": 5.2373, + "step": 19299 + }, + { + "epoch": 0.11478256732324674, + "grad_norm": 1.1988370418548584, + "learning_rate": 4.839229645243982e-05, + "loss": 5.2067, + "step": 19300 + }, + { + "epoch": 0.11478851460652774, + "grad_norm": 1.3069959878921509, + "learning_rate": 4.839213164744926e-05, + "loss": 5.1413, + "step": 19301 + }, + { + "epoch": 0.11479446188980874, + "grad_norm": 1.230211615562439, + "learning_rate": 4.839196683429275e-05, + "loss": 5.2076, + "step": 19302 + }, + { + "epoch": 0.11480040917308973, + "grad_norm": 1.3232944011688232, + "learning_rate": 4.839180201297034e-05, + "loss": 5.2077, + "step": 19303 + }, + { + "epoch": 0.11480635645637073, + "grad_norm": 1.2436466217041016, + "learning_rate": 4.839163718348211e-05, + "loss": 5.1646, + "step": 19304 + }, + { + "epoch": 0.11481230373965173, + "grad_norm": 1.160416841506958, + "learning_rate": 4.8391472345828085e-05, + "loss": 5.0582, + "step": 19305 + }, + { + "epoch": 0.11481825102293272, + "grad_norm": 1.3895483016967773, + "learning_rate": 4.8391307500008344e-05, + "loss": 5.2516, + "step": 19306 + }, + { + "epoch": 0.11482419830621372, + "grad_norm": 1.5018577575683594, + "learning_rate": 4.8391142646022935e-05, + "loss": 5.4308, + "step": 19307 + }, + { + "epoch": 0.11483014558949473, + "grad_norm": 1.5278204679489136, + "learning_rate": 4.8390977783871925e-05, + "loss": 5.2238, + "step": 19308 + }, + { + "epoch": 0.11483609287277571, + "grad_norm": 1.5735019445419312, + "learning_rate": 4.839081291355536e-05, + "loss": 5.4874, + "step": 19309 + }, + { + "epoch": 0.11484204015605672, + "grad_norm": 1.4098745584487915, + "learning_rate": 4.839064803507332e-05, + "loss": 5.082, + "step": 19310 + }, + { + "epoch": 0.1148479874393377, + "grad_norm": 1.47605299949646, + "learning_rate": 4.8390483148425824e-05, + "loss": 5.0869, + "step": 19311 + }, + { + "epoch": 0.1148539347226187, + "grad_norm": 1.442550778388977, + "learning_rate": 4.8390318253612966e-05, + "loss": 5.1232, + "step": 19312 + }, + { + "epoch": 0.11485988200589971, + "grad_norm": 1.1225110292434692, + "learning_rate": 4.8390153350634785e-05, + "loss": 5.0782, + "step": 19313 + }, + { + "epoch": 0.1148658292891807, + "grad_norm": 1.329656720161438, + "learning_rate": 4.838998843949135e-05, + "loss": 4.9912, + "step": 19314 + }, + { + "epoch": 0.1148717765724617, + "grad_norm": 1.6484954357147217, + "learning_rate": 4.8389823520182704e-05, + "loss": 4.785, + "step": 19315 + }, + { + "epoch": 0.1148777238557427, + "grad_norm": 1.46773099899292, + "learning_rate": 4.838965859270891e-05, + "loss": 4.7835, + "step": 19316 + }, + { + "epoch": 0.11488367113902369, + "grad_norm": 1.717592477798462, + "learning_rate": 4.838949365707004e-05, + "loss": 5.1603, + "step": 19317 + }, + { + "epoch": 0.11488961842230469, + "grad_norm": 1.7265046834945679, + "learning_rate": 4.838932871326613e-05, + "loss": 4.9057, + "step": 19318 + }, + { + "epoch": 0.11489556570558569, + "grad_norm": 1.6203346252441406, + "learning_rate": 4.838916376129725e-05, + "loss": 4.8206, + "step": 19319 + }, + { + "epoch": 0.11490151298886668, + "grad_norm": 1.2972123622894287, + "learning_rate": 4.838899880116345e-05, + "loss": 4.7026, + "step": 19320 + }, + { + "epoch": 0.11490746027214768, + "grad_norm": 1.4215303659439087, + "learning_rate": 4.838883383286479e-05, + "loss": 4.7032, + "step": 19321 + }, + { + "epoch": 0.11491340755542868, + "grad_norm": 1.442439317703247, + "learning_rate": 4.838866885640134e-05, + "loss": 4.6853, + "step": 19322 + }, + { + "epoch": 0.11491935483870967, + "grad_norm": 1.3752079010009766, + "learning_rate": 4.838850387177315e-05, + "loss": 4.6842, + "step": 19323 + }, + { + "epoch": 0.11492530212199067, + "grad_norm": 1.4834825992584229, + "learning_rate": 4.838833887898026e-05, + "loss": 4.6455, + "step": 19324 + }, + { + "epoch": 0.11493124940527168, + "grad_norm": 1.3493545055389404, + "learning_rate": 4.8388173878022743e-05, + "loss": 4.5489, + "step": 19325 + }, + { + "epoch": 0.11493719668855266, + "grad_norm": 1.5903066396713257, + "learning_rate": 4.838800886890067e-05, + "loss": 4.5574, + "step": 19326 + }, + { + "epoch": 0.11494314397183367, + "grad_norm": 1.3842332363128662, + "learning_rate": 4.8387843851614076e-05, + "loss": 4.7516, + "step": 19327 + }, + { + "epoch": 0.11494909125511467, + "grad_norm": 1.5355647802352905, + "learning_rate": 4.838767882616303e-05, + "loss": 4.5984, + "step": 19328 + }, + { + "epoch": 0.11495503853839566, + "grad_norm": 1.6534103155136108, + "learning_rate": 4.838751379254759e-05, + "loss": 4.7761, + "step": 19329 + }, + { + "epoch": 0.11496098582167666, + "grad_norm": 1.7028656005859375, + "learning_rate": 4.83873487507678e-05, + "loss": 5.0164, + "step": 19330 + }, + { + "epoch": 0.11496693310495766, + "grad_norm": 1.7165244817733765, + "learning_rate": 4.838718370082374e-05, + "loss": 5.1044, + "step": 19331 + }, + { + "epoch": 0.11497288038823865, + "grad_norm": 1.3272297382354736, + "learning_rate": 4.838701864271545e-05, + "loss": 5.0072, + "step": 19332 + }, + { + "epoch": 0.11497882767151965, + "grad_norm": 1.553613543510437, + "learning_rate": 4.8386853576442994e-05, + "loss": 4.945, + "step": 19333 + }, + { + "epoch": 0.11498477495480065, + "grad_norm": 1.4403818845748901, + "learning_rate": 4.8386688502006425e-05, + "loss": 5.0661, + "step": 19334 + }, + { + "epoch": 0.11499072223808164, + "grad_norm": 1.5347598791122437, + "learning_rate": 4.8386523419405814e-05, + "loss": 5.0603, + "step": 19335 + }, + { + "epoch": 0.11499666952136264, + "grad_norm": 1.3777856826782227, + "learning_rate": 4.83863583286412e-05, + "loss": 5.112, + "step": 19336 + }, + { + "epoch": 0.11500261680464365, + "grad_norm": 1.794287919998169, + "learning_rate": 4.8386193229712654e-05, + "loss": 5.1972, + "step": 19337 + }, + { + "epoch": 0.11500856408792463, + "grad_norm": 1.3142359256744385, + "learning_rate": 4.8386028122620234e-05, + "loss": 5.3577, + "step": 19338 + }, + { + "epoch": 0.11501451137120564, + "grad_norm": 1.0925400257110596, + "learning_rate": 4.838586300736399e-05, + "loss": 5.2094, + "step": 19339 + }, + { + "epoch": 0.11502045865448662, + "grad_norm": 1.6456180810928345, + "learning_rate": 4.838569788394398e-05, + "loss": 4.8287, + "step": 19340 + }, + { + "epoch": 0.11502640593776763, + "grad_norm": 1.2811404466629028, + "learning_rate": 4.8385532752360265e-05, + "loss": 5.0659, + "step": 19341 + }, + { + "epoch": 0.11503235322104863, + "grad_norm": 1.392863154411316, + "learning_rate": 4.83853676126129e-05, + "loss": 5.2655, + "step": 19342 + }, + { + "epoch": 0.11503830050432962, + "grad_norm": 1.2255772352218628, + "learning_rate": 4.838520246470195e-05, + "loss": 5.0422, + "step": 19343 + }, + { + "epoch": 0.11504424778761062, + "grad_norm": 1.735661506652832, + "learning_rate": 4.8385037308627465e-05, + "loss": 6.0562, + "step": 19344 + }, + { + "epoch": 0.11505019507089162, + "grad_norm": 1.2034478187561035, + "learning_rate": 4.838487214438951e-05, + "loss": 4.9773, + "step": 19345 + }, + { + "epoch": 0.11505614235417261, + "grad_norm": 1.2786695957183838, + "learning_rate": 4.838470697198813e-05, + "loss": 4.8771, + "step": 19346 + }, + { + "epoch": 0.11506208963745361, + "grad_norm": 1.2345244884490967, + "learning_rate": 4.8384541791423394e-05, + "loss": 5.0098, + "step": 19347 + }, + { + "epoch": 0.11506803692073461, + "grad_norm": 1.3156319856643677, + "learning_rate": 4.838437660269536e-05, + "loss": 5.1089, + "step": 19348 + }, + { + "epoch": 0.1150739842040156, + "grad_norm": 1.3406500816345215, + "learning_rate": 4.838421140580407e-05, + "loss": 4.8374, + "step": 19349 + }, + { + "epoch": 0.1150799314872966, + "grad_norm": 1.412318468093872, + "learning_rate": 4.83840462007496e-05, + "loss": 4.9074, + "step": 19350 + }, + { + "epoch": 0.1150858787705776, + "grad_norm": 1.3075577020645142, + "learning_rate": 4.8383880987532004e-05, + "loss": 4.9694, + "step": 19351 + }, + { + "epoch": 0.11509182605385859, + "grad_norm": 1.178300380706787, + "learning_rate": 4.838371576615134e-05, + "loss": 4.9863, + "step": 19352 + }, + { + "epoch": 0.1150977733371396, + "grad_norm": 1.5120453834533691, + "learning_rate": 4.838355053660765e-05, + "loss": 4.8766, + "step": 19353 + }, + { + "epoch": 0.1151037206204206, + "grad_norm": 1.4834094047546387, + "learning_rate": 4.8383385298901014e-05, + "loss": 4.9724, + "step": 19354 + }, + { + "epoch": 0.11510966790370158, + "grad_norm": 1.561998724937439, + "learning_rate": 4.8383220053031475e-05, + "loss": 4.9239, + "step": 19355 + }, + { + "epoch": 0.11511561518698259, + "grad_norm": 1.4366774559020996, + "learning_rate": 4.83830547989991e-05, + "loss": 4.8052, + "step": 19356 + }, + { + "epoch": 0.11512156247026359, + "grad_norm": 1.2530354261398315, + "learning_rate": 4.8382889536803936e-05, + "loss": 5.0115, + "step": 19357 + }, + { + "epoch": 0.11512750975354458, + "grad_norm": 1.4827991724014282, + "learning_rate": 4.838272426644606e-05, + "loss": 5.1592, + "step": 19358 + }, + { + "epoch": 0.11513345703682558, + "grad_norm": 1.5874660015106201, + "learning_rate": 4.83825589879255e-05, + "loss": 5.0255, + "step": 19359 + }, + { + "epoch": 0.11513940432010658, + "grad_norm": 1.4771748781204224, + "learning_rate": 4.8382393701242335e-05, + "loss": 5.1537, + "step": 19360 + }, + { + "epoch": 0.11514535160338757, + "grad_norm": 1.4980419874191284, + "learning_rate": 4.8382228406396625e-05, + "loss": 5.0109, + "step": 19361 + }, + { + "epoch": 0.11515129888666857, + "grad_norm": 1.5008245706558228, + "learning_rate": 4.8382063103388405e-05, + "loss": 5.1644, + "step": 19362 + }, + { + "epoch": 0.11515724616994957, + "grad_norm": 1.425648808479309, + "learning_rate": 4.838189779221777e-05, + "loss": 4.8298, + "step": 19363 + }, + { + "epoch": 0.11516319345323056, + "grad_norm": 1.4478559494018555, + "learning_rate": 4.8381732472884744e-05, + "loss": 5.2984, + "step": 19364 + }, + { + "epoch": 0.11516914073651156, + "grad_norm": 1.5071446895599365, + "learning_rate": 4.83815671453894e-05, + "loss": 4.9557, + "step": 19365 + }, + { + "epoch": 0.11517508801979257, + "grad_norm": 1.6358442306518555, + "learning_rate": 4.8381401809731785e-05, + "loss": 4.7956, + "step": 19366 + }, + { + "epoch": 0.11518103530307355, + "grad_norm": 1.5035837888717651, + "learning_rate": 4.838123646591197e-05, + "loss": 4.816, + "step": 19367 + }, + { + "epoch": 0.11518698258635456, + "grad_norm": 1.4265867471694946, + "learning_rate": 4.838107111393e-05, + "loss": 4.7911, + "step": 19368 + }, + { + "epoch": 0.11519292986963554, + "grad_norm": 1.489668369293213, + "learning_rate": 4.838090575378595e-05, + "loss": 4.8403, + "step": 19369 + }, + { + "epoch": 0.11519887715291655, + "grad_norm": 1.4454714059829712, + "learning_rate": 4.838074038547986e-05, + "loss": 4.8848, + "step": 19370 + }, + { + "epoch": 0.11520482443619755, + "grad_norm": 1.42531418800354, + "learning_rate": 4.83805750090118e-05, + "loss": 5.0249, + "step": 19371 + }, + { + "epoch": 0.11521077171947854, + "grad_norm": 1.4370076656341553, + "learning_rate": 4.8380409624381826e-05, + "loss": 4.9219, + "step": 19372 + }, + { + "epoch": 0.11521671900275954, + "grad_norm": 1.543291449546814, + "learning_rate": 4.838024423158999e-05, + "loss": 4.9835, + "step": 19373 + }, + { + "epoch": 0.11522266628604054, + "grad_norm": 1.2460718154907227, + "learning_rate": 4.838007883063634e-05, + "loss": 5.0426, + "step": 19374 + }, + { + "epoch": 0.11522861356932153, + "grad_norm": 1.5159900188446045, + "learning_rate": 4.837991342152096e-05, + "loss": 5.0214, + "step": 19375 + }, + { + "epoch": 0.11523456085260253, + "grad_norm": 1.3800876140594482, + "learning_rate": 4.837974800424389e-05, + "loss": 4.7606, + "step": 19376 + }, + { + "epoch": 0.11524050813588353, + "grad_norm": 1.509788155555725, + "learning_rate": 4.8379582578805197e-05, + "loss": 4.9886, + "step": 19377 + }, + { + "epoch": 0.11524645541916452, + "grad_norm": 1.292523741722107, + "learning_rate": 4.837941714520492e-05, + "loss": 5.1574, + "step": 19378 + }, + { + "epoch": 0.11525240270244552, + "grad_norm": 1.351827621459961, + "learning_rate": 4.837925170344314e-05, + "loss": 5.3133, + "step": 19379 + }, + { + "epoch": 0.11525834998572652, + "grad_norm": 1.4871753454208374, + "learning_rate": 4.83790862535199e-05, + "loss": 4.843, + "step": 19380 + }, + { + "epoch": 0.11526429726900751, + "grad_norm": 1.6031657457351685, + "learning_rate": 4.8378920795435264e-05, + "loss": 4.8244, + "step": 19381 + }, + { + "epoch": 0.11527024455228851, + "grad_norm": 1.3754857778549194, + "learning_rate": 4.8378755329189294e-05, + "loss": 4.8421, + "step": 19382 + }, + { + "epoch": 0.11527619183556952, + "grad_norm": 1.5428962707519531, + "learning_rate": 4.837858985478203e-05, + "loss": 4.9472, + "step": 19383 + }, + { + "epoch": 0.1152821391188505, + "grad_norm": 1.45586097240448, + "learning_rate": 4.837842437221356e-05, + "loss": 4.874, + "step": 19384 + }, + { + "epoch": 0.1152880864021315, + "grad_norm": 1.5139529705047607, + "learning_rate": 4.837825888148391e-05, + "loss": 4.8867, + "step": 19385 + }, + { + "epoch": 0.11529403368541251, + "grad_norm": 1.6341979503631592, + "learning_rate": 4.837809338259315e-05, + "loss": 4.8476, + "step": 19386 + }, + { + "epoch": 0.1152999809686935, + "grad_norm": 1.45046865940094, + "learning_rate": 4.837792787554134e-05, + "loss": 5.0273, + "step": 19387 + }, + { + "epoch": 0.1153059282519745, + "grad_norm": 1.2840397357940674, + "learning_rate": 4.8377762360328547e-05, + "loss": 5.1717, + "step": 19388 + }, + { + "epoch": 0.1153118755352555, + "grad_norm": 1.4211467504501343, + "learning_rate": 4.8377596836954805e-05, + "loss": 5.021, + "step": 19389 + }, + { + "epoch": 0.11531782281853649, + "grad_norm": 1.3885877132415771, + "learning_rate": 4.837743130542019e-05, + "loss": 5.2158, + "step": 19390 + }, + { + "epoch": 0.11532377010181749, + "grad_norm": 1.2344088554382324, + "learning_rate": 4.837726576572476e-05, + "loss": 5.212, + "step": 19391 + }, + { + "epoch": 0.11532971738509849, + "grad_norm": 1.1903822422027588, + "learning_rate": 4.837710021786857e-05, + "loss": 5.3071, + "step": 19392 + }, + { + "epoch": 0.11533566466837948, + "grad_norm": 1.4263699054718018, + "learning_rate": 4.837693466185167e-05, + "loss": 5.1472, + "step": 19393 + }, + { + "epoch": 0.11534161195166048, + "grad_norm": 1.201027512550354, + "learning_rate": 4.837676909767412e-05, + "loss": 5.1779, + "step": 19394 + }, + { + "epoch": 0.11534755923494149, + "grad_norm": 1.2903262376785278, + "learning_rate": 4.8376603525335995e-05, + "loss": 5.038, + "step": 19395 + }, + { + "epoch": 0.11535350651822247, + "grad_norm": 1.3125475645065308, + "learning_rate": 4.837643794483733e-05, + "loss": 4.8948, + "step": 19396 + }, + { + "epoch": 0.11535945380150348, + "grad_norm": 1.1773933172225952, + "learning_rate": 4.837627235617819e-05, + "loss": 5.0854, + "step": 19397 + }, + { + "epoch": 0.11536540108478446, + "grad_norm": 1.2542996406555176, + "learning_rate": 4.837610675935864e-05, + "loss": 5.1329, + "step": 19398 + }, + { + "epoch": 0.11537134836806547, + "grad_norm": 1.1876561641693115, + "learning_rate": 4.837594115437873e-05, + "loss": 4.9757, + "step": 19399 + }, + { + "epoch": 0.11537729565134647, + "grad_norm": 1.2957814931869507, + "learning_rate": 4.837577554123852e-05, + "loss": 5.1203, + "step": 19400 + }, + { + "epoch": 0.11538324293462746, + "grad_norm": 1.2537682056427002, + "learning_rate": 4.837560991993807e-05, + "loss": 4.975, + "step": 19401 + }, + { + "epoch": 0.11538919021790846, + "grad_norm": 1.1898986101150513, + "learning_rate": 4.837544429047743e-05, + "loss": 4.9028, + "step": 19402 + }, + { + "epoch": 0.11539513750118946, + "grad_norm": 1.4129477739334106, + "learning_rate": 4.837527865285667e-05, + "loss": 4.7576, + "step": 19403 + }, + { + "epoch": 0.11540108478447045, + "grad_norm": 1.5386319160461426, + "learning_rate": 4.837511300707585e-05, + "loss": 4.9332, + "step": 19404 + }, + { + "epoch": 0.11540703206775145, + "grad_norm": 1.3597557544708252, + "learning_rate": 4.8374947353135e-05, + "loss": 4.8007, + "step": 19405 + }, + { + "epoch": 0.11541297935103245, + "grad_norm": 1.8251479864120483, + "learning_rate": 4.837478169103421e-05, + "loss": 5.048, + "step": 19406 + }, + { + "epoch": 0.11541892663431344, + "grad_norm": 1.488844871520996, + "learning_rate": 4.8374616020773523e-05, + "loss": 4.855, + "step": 19407 + }, + { + "epoch": 0.11542487391759444, + "grad_norm": 1.1640641689300537, + "learning_rate": 4.8374450342352996e-05, + "loss": 4.7714, + "step": 19408 + }, + { + "epoch": 0.11543082120087544, + "grad_norm": 1.1133109331130981, + "learning_rate": 4.8374284655772696e-05, + "loss": 4.849, + "step": 19409 + }, + { + "epoch": 0.11543676848415643, + "grad_norm": 1.2767143249511719, + "learning_rate": 4.837411896103266e-05, + "loss": 4.8078, + "step": 19410 + }, + { + "epoch": 0.11544271576743743, + "grad_norm": 1.2564034461975098, + "learning_rate": 4.837395325813298e-05, + "loss": 4.8602, + "step": 19411 + }, + { + "epoch": 0.11544866305071844, + "grad_norm": 1.2702561616897583, + "learning_rate": 4.837378754707369e-05, + "loss": 4.9148, + "step": 19412 + }, + { + "epoch": 0.11545461033399942, + "grad_norm": 1.1960140466690063, + "learning_rate": 4.8373621827854845e-05, + "loss": 4.9242, + "step": 19413 + }, + { + "epoch": 0.11546055761728043, + "grad_norm": 1.3663053512573242, + "learning_rate": 4.837345610047651e-05, + "loss": 4.9837, + "step": 19414 + }, + { + "epoch": 0.11546650490056143, + "grad_norm": 1.340897560119629, + "learning_rate": 4.837329036493875e-05, + "loss": 4.8059, + "step": 19415 + }, + { + "epoch": 0.11547245218384242, + "grad_norm": 1.326195478439331, + "learning_rate": 4.8373124621241616e-05, + "loss": 4.7115, + "step": 19416 + }, + { + "epoch": 0.11547839946712342, + "grad_norm": 1.2291951179504395, + "learning_rate": 4.837295886938516e-05, + "loss": 5.0075, + "step": 19417 + }, + { + "epoch": 0.11548434675040442, + "grad_norm": 1.3071776628494263, + "learning_rate": 4.837279310936945e-05, + "loss": 4.7839, + "step": 19418 + }, + { + "epoch": 0.11549029403368541, + "grad_norm": 1.4331681728363037, + "learning_rate": 4.837262734119453e-05, + "loss": 4.7494, + "step": 19419 + }, + { + "epoch": 0.11549624131696641, + "grad_norm": 1.4209895133972168, + "learning_rate": 4.837246156486048e-05, + "loss": 4.8538, + "step": 19420 + }, + { + "epoch": 0.11550218860024741, + "grad_norm": 1.2397242784500122, + "learning_rate": 4.837229578036734e-05, + "loss": 4.7616, + "step": 19421 + }, + { + "epoch": 0.1155081358835284, + "grad_norm": 1.2271560430526733, + "learning_rate": 4.837212998771517e-05, + "loss": 4.7361, + "step": 19422 + }, + { + "epoch": 0.1155140831668094, + "grad_norm": 1.3334344625473022, + "learning_rate": 4.837196418690403e-05, + "loss": 4.8971, + "step": 19423 + }, + { + "epoch": 0.1155200304500904, + "grad_norm": 1.3195756673812866, + "learning_rate": 4.837179837793398e-05, + "loss": 4.8944, + "step": 19424 + }, + { + "epoch": 0.1155259777333714, + "grad_norm": 1.4583542346954346, + "learning_rate": 4.837163256080508e-05, + "loss": 4.7857, + "step": 19425 + }, + { + "epoch": 0.1155319250166524, + "grad_norm": 1.5155558586120605, + "learning_rate": 4.837146673551739e-05, + "loss": 4.7728, + "step": 19426 + }, + { + "epoch": 0.1155378722999334, + "grad_norm": 1.3582627773284912, + "learning_rate": 4.837130090207095e-05, + "loss": 4.7065, + "step": 19427 + }, + { + "epoch": 0.11554381958321439, + "grad_norm": 1.2635151147842407, + "learning_rate": 4.837113506046584e-05, + "loss": 4.882, + "step": 19428 + }, + { + "epoch": 0.11554976686649539, + "grad_norm": 1.417083501815796, + "learning_rate": 4.83709692107021e-05, + "loss": 4.8928, + "step": 19429 + }, + { + "epoch": 0.11555571414977638, + "grad_norm": 1.4780973196029663, + "learning_rate": 4.8370803352779806e-05, + "loss": 4.9458, + "step": 19430 + }, + { + "epoch": 0.11556166143305738, + "grad_norm": 1.2949103116989136, + "learning_rate": 4.8370637486699e-05, + "loss": 4.8753, + "step": 19431 + }, + { + "epoch": 0.11556760871633838, + "grad_norm": 1.4755308628082275, + "learning_rate": 4.8370471612459744e-05, + "loss": 4.7886, + "step": 19432 + }, + { + "epoch": 0.11557355599961937, + "grad_norm": 1.4527158737182617, + "learning_rate": 4.8370305730062095e-05, + "loss": 4.8442, + "step": 19433 + }, + { + "epoch": 0.11557950328290037, + "grad_norm": 1.3422110080718994, + "learning_rate": 4.8370139839506124e-05, + "loss": 4.9745, + "step": 19434 + }, + { + "epoch": 0.11558545056618137, + "grad_norm": 1.5843584537506104, + "learning_rate": 4.836997394079187e-05, + "loss": 4.8432, + "step": 19435 + }, + { + "epoch": 0.11559139784946236, + "grad_norm": 1.3267780542373657, + "learning_rate": 4.836980803391941e-05, + "loss": 4.7816, + "step": 19436 + }, + { + "epoch": 0.11559734513274336, + "grad_norm": 1.3092966079711914, + "learning_rate": 4.836964211888878e-05, + "loss": 5.0283, + "step": 19437 + }, + { + "epoch": 0.11560329241602436, + "grad_norm": 1.4653512239456177, + "learning_rate": 4.836947619570005e-05, + "loss": 4.9265, + "step": 19438 + }, + { + "epoch": 0.11560923969930535, + "grad_norm": 1.344672441482544, + "learning_rate": 4.836931026435328e-05, + "loss": 5.0426, + "step": 19439 + }, + { + "epoch": 0.11561518698258635, + "grad_norm": 1.3949403762817383, + "learning_rate": 4.836914432484853e-05, + "loss": 5.1539, + "step": 19440 + }, + { + "epoch": 0.11562113426586736, + "grad_norm": 1.3876662254333496, + "learning_rate": 4.836897837718585e-05, + "loss": 4.9346, + "step": 19441 + }, + { + "epoch": 0.11562708154914834, + "grad_norm": 1.3399412631988525, + "learning_rate": 4.83688124213653e-05, + "loss": 4.8688, + "step": 19442 + }, + { + "epoch": 0.11563302883242935, + "grad_norm": 1.3819881677627563, + "learning_rate": 4.836864645738694e-05, + "loss": 4.9527, + "step": 19443 + }, + { + "epoch": 0.11563897611571035, + "grad_norm": 1.509074091911316, + "learning_rate": 4.8368480485250825e-05, + "loss": 4.9273, + "step": 19444 + }, + { + "epoch": 0.11564492339899134, + "grad_norm": 1.2591453790664673, + "learning_rate": 4.836831450495701e-05, + "loss": 4.9065, + "step": 19445 + }, + { + "epoch": 0.11565087068227234, + "grad_norm": 1.4065910577774048, + "learning_rate": 4.836814851650557e-05, + "loss": 4.9699, + "step": 19446 + }, + { + "epoch": 0.11565681796555334, + "grad_norm": 1.3355581760406494, + "learning_rate": 4.836798251989655e-05, + "loss": 5.1639, + "step": 19447 + }, + { + "epoch": 0.11566276524883433, + "grad_norm": 1.3715496063232422, + "learning_rate": 4.836781651513e-05, + "loss": 4.855, + "step": 19448 + }, + { + "epoch": 0.11566871253211533, + "grad_norm": 1.569305658340454, + "learning_rate": 4.836765050220599e-05, + "loss": 4.6329, + "step": 19449 + }, + { + "epoch": 0.11567465981539633, + "grad_norm": 1.3613293170928955, + "learning_rate": 4.836748448112458e-05, + "loss": 4.9897, + "step": 19450 + }, + { + "epoch": 0.11568060709867732, + "grad_norm": 1.2653577327728271, + "learning_rate": 4.836731845188581e-05, + "loss": 4.9819, + "step": 19451 + }, + { + "epoch": 0.11568655438195832, + "grad_norm": 1.5030022859573364, + "learning_rate": 4.836715241448976e-05, + "loss": 4.8387, + "step": 19452 + }, + { + "epoch": 0.11569250166523933, + "grad_norm": 1.2560715675354004, + "learning_rate": 4.836698636893647e-05, + "loss": 5.0862, + "step": 19453 + }, + { + "epoch": 0.11569844894852031, + "grad_norm": 1.1981379985809326, + "learning_rate": 4.836682031522602e-05, + "loss": 4.7682, + "step": 19454 + }, + { + "epoch": 0.11570439623180132, + "grad_norm": 1.3572615385055542, + "learning_rate": 4.8366654253358444e-05, + "loss": 4.9008, + "step": 19455 + }, + { + "epoch": 0.11571034351508232, + "grad_norm": 1.2542002201080322, + "learning_rate": 4.8366488183333816e-05, + "loss": 4.911, + "step": 19456 + }, + { + "epoch": 0.1157162907983633, + "grad_norm": 1.4759174585342407, + "learning_rate": 4.8366322105152186e-05, + "loss": 4.789, + "step": 19457 + }, + { + "epoch": 0.11572223808164431, + "grad_norm": 1.2307411432266235, + "learning_rate": 4.8366156018813616e-05, + "loss": 4.9556, + "step": 19458 + }, + { + "epoch": 0.1157281853649253, + "grad_norm": 1.240334153175354, + "learning_rate": 4.836598992431816e-05, + "loss": 4.9996, + "step": 19459 + }, + { + "epoch": 0.1157341326482063, + "grad_norm": 1.3100368976593018, + "learning_rate": 4.8365823821665876e-05, + "loss": 5.0693, + "step": 19460 + }, + { + "epoch": 0.1157400799314873, + "grad_norm": 1.0904709100723267, + "learning_rate": 4.8365657710856835e-05, + "loss": 5.0327, + "step": 19461 + }, + { + "epoch": 0.11574602721476829, + "grad_norm": 1.3847914934158325, + "learning_rate": 4.836549159189108e-05, + "loss": 5.0512, + "step": 19462 + }, + { + "epoch": 0.11575197449804929, + "grad_norm": 1.2307064533233643, + "learning_rate": 4.836532546476866e-05, + "loss": 5.0687, + "step": 19463 + }, + { + "epoch": 0.11575792178133029, + "grad_norm": 1.3900285959243774, + "learning_rate": 4.836515932948966e-05, + "loss": 5.1044, + "step": 19464 + }, + { + "epoch": 0.11576386906461128, + "grad_norm": 1.2194246053695679, + "learning_rate": 4.836499318605412e-05, + "loss": 5.0412, + "step": 19465 + }, + { + "epoch": 0.11576981634789228, + "grad_norm": 1.3460240364074707, + "learning_rate": 4.83648270344621e-05, + "loss": 5.14, + "step": 19466 + }, + { + "epoch": 0.11577576363117328, + "grad_norm": 1.2739115953445435, + "learning_rate": 4.8364660874713664e-05, + "loss": 5.0782, + "step": 19467 + }, + { + "epoch": 0.11578171091445427, + "grad_norm": 1.987092137336731, + "learning_rate": 4.836449470680887e-05, + "loss": 4.8106, + "step": 19468 + }, + { + "epoch": 0.11578765819773527, + "grad_norm": 1.3820792436599731, + "learning_rate": 4.8364328530747765e-05, + "loss": 5.3549, + "step": 19469 + }, + { + "epoch": 0.11579360548101628, + "grad_norm": 1.5276916027069092, + "learning_rate": 4.836416234653042e-05, + "loss": 5.3479, + "step": 19470 + }, + { + "epoch": 0.11579955276429726, + "grad_norm": 1.5292818546295166, + "learning_rate": 4.836399615415688e-05, + "loss": 5.2627, + "step": 19471 + }, + { + "epoch": 0.11580550004757827, + "grad_norm": 1.5759434700012207, + "learning_rate": 4.836382995362722e-05, + "loss": 5.2925, + "step": 19472 + }, + { + "epoch": 0.11581144733085927, + "grad_norm": 1.3807876110076904, + "learning_rate": 4.836366374494148e-05, + "loss": 5.0794, + "step": 19473 + }, + { + "epoch": 0.11581739461414026, + "grad_norm": 1.3631199598312378, + "learning_rate": 4.836349752809973e-05, + "loss": 5.0606, + "step": 19474 + }, + { + "epoch": 0.11582334189742126, + "grad_norm": 1.5250667333602905, + "learning_rate": 4.836333130310202e-05, + "loss": 5.1799, + "step": 19475 + }, + { + "epoch": 0.11582928918070226, + "grad_norm": 1.4191410541534424, + "learning_rate": 4.836316506994842e-05, + "loss": 5.2812, + "step": 19476 + }, + { + "epoch": 0.11583523646398325, + "grad_norm": 1.5502076148986816, + "learning_rate": 4.8362998828638975e-05, + "loss": 5.3503, + "step": 19477 + }, + { + "epoch": 0.11584118374726425, + "grad_norm": 1.441786766052246, + "learning_rate": 4.836283257917375e-05, + "loss": 5.1526, + "step": 19478 + }, + { + "epoch": 0.11584713103054525, + "grad_norm": 1.3994730710983276, + "learning_rate": 4.83626663215528e-05, + "loss": 5.1969, + "step": 19479 + }, + { + "epoch": 0.11585307831382624, + "grad_norm": 1.5141762495040894, + "learning_rate": 4.836250005577619e-05, + "loss": 5.099, + "step": 19480 + }, + { + "epoch": 0.11585902559710724, + "grad_norm": 1.4504029750823975, + "learning_rate": 4.836233378184397e-05, + "loss": 5.5225, + "step": 19481 + }, + { + "epoch": 0.11586497288038825, + "grad_norm": 1.3617264032363892, + "learning_rate": 4.8362167499756194e-05, + "loss": 5.3426, + "step": 19482 + }, + { + "epoch": 0.11587092016366923, + "grad_norm": 1.3681023120880127, + "learning_rate": 4.8362001209512934e-05, + "loss": 5.3476, + "step": 19483 + }, + { + "epoch": 0.11587686744695024, + "grad_norm": 1.050550937652588, + "learning_rate": 4.836183491111424e-05, + "loss": 5.1338, + "step": 19484 + }, + { + "epoch": 0.11588281473023124, + "grad_norm": 1.386715054512024, + "learning_rate": 4.836166860456017e-05, + "loss": 5.2761, + "step": 19485 + }, + { + "epoch": 0.11588876201351223, + "grad_norm": 1.2128262519836426, + "learning_rate": 4.836150228985078e-05, + "loss": 5.165, + "step": 19486 + }, + { + "epoch": 0.11589470929679323, + "grad_norm": 1.224721074104309, + "learning_rate": 4.836133596698614e-05, + "loss": 5.1631, + "step": 19487 + }, + { + "epoch": 0.11590065658007422, + "grad_norm": 1.2348668575286865, + "learning_rate": 4.8361169635966285e-05, + "loss": 5.3206, + "step": 19488 + }, + { + "epoch": 0.11590660386335522, + "grad_norm": 1.1665185689926147, + "learning_rate": 4.836100329679129e-05, + "loss": 5.3162, + "step": 19489 + }, + { + "epoch": 0.11591255114663622, + "grad_norm": 1.2063257694244385, + "learning_rate": 4.836083694946122e-05, + "loss": 5.0348, + "step": 19490 + }, + { + "epoch": 0.11591849842991721, + "grad_norm": 1.5199745893478394, + "learning_rate": 4.836067059397612e-05, + "loss": 5.0793, + "step": 19491 + }, + { + "epoch": 0.11592444571319821, + "grad_norm": 1.2285770177841187, + "learning_rate": 4.8360504230336044e-05, + "loss": 5.1478, + "step": 19492 + }, + { + "epoch": 0.11593039299647921, + "grad_norm": 1.3429020643234253, + "learning_rate": 4.836033785854107e-05, + "loss": 5.3225, + "step": 19493 + }, + { + "epoch": 0.1159363402797602, + "grad_norm": 1.3870415687561035, + "learning_rate": 4.836017147859123e-05, + "loss": 5.2711, + "step": 19494 + }, + { + "epoch": 0.1159422875630412, + "grad_norm": 1.3311539888381958, + "learning_rate": 4.8360005090486603e-05, + "loss": 5.1778, + "step": 19495 + }, + { + "epoch": 0.1159482348463222, + "grad_norm": 1.1331884860992432, + "learning_rate": 4.8359838694227236e-05, + "loss": 5.1435, + "step": 19496 + }, + { + "epoch": 0.11595418212960319, + "grad_norm": 1.427506685256958, + "learning_rate": 4.83596722898132e-05, + "loss": 5.2153, + "step": 19497 + }, + { + "epoch": 0.1159601294128842, + "grad_norm": 1.4716016054153442, + "learning_rate": 4.835950587724453e-05, + "loss": 4.9599, + "step": 19498 + }, + { + "epoch": 0.1159660766961652, + "grad_norm": 1.073724389076233, + "learning_rate": 4.8359339456521305e-05, + "loss": 5.3481, + "step": 19499 + }, + { + "epoch": 0.11597202397944618, + "grad_norm": 1.1965457201004028, + "learning_rate": 4.835917302764358e-05, + "loss": 5.128, + "step": 19500 + }, + { + "epoch": 0.11597797126272719, + "grad_norm": 1.2589031457901, + "learning_rate": 4.83590065906114e-05, + "loss": 5.1952, + "step": 19501 + }, + { + "epoch": 0.11598391854600819, + "grad_norm": 1.5062520503997803, + "learning_rate": 4.8358840145424835e-05, + "loss": 5.3431, + "step": 19502 + }, + { + "epoch": 0.11598986582928918, + "grad_norm": 1.3464981317520142, + "learning_rate": 4.8358673692083944e-05, + "loss": 5.187, + "step": 19503 + }, + { + "epoch": 0.11599581311257018, + "grad_norm": 1.195157766342163, + "learning_rate": 4.8358507230588776e-05, + "loss": 5.4018, + "step": 19504 + }, + { + "epoch": 0.11600176039585118, + "grad_norm": 1.185371994972229, + "learning_rate": 4.83583407609394e-05, + "loss": 5.3204, + "step": 19505 + }, + { + "epoch": 0.11600770767913217, + "grad_norm": 1.1011184453964233, + "learning_rate": 4.835817428313586e-05, + "loss": 5.2426, + "step": 19506 + }, + { + "epoch": 0.11601365496241317, + "grad_norm": 1.2706186771392822, + "learning_rate": 4.835800779717823e-05, + "loss": 5.3277, + "step": 19507 + }, + { + "epoch": 0.11601960224569417, + "grad_norm": 1.23444664478302, + "learning_rate": 4.8357841303066564e-05, + "loss": 5.304, + "step": 19508 + }, + { + "epoch": 0.11602554952897516, + "grad_norm": 1.3166215419769287, + "learning_rate": 4.8357674800800915e-05, + "loss": 5.1755, + "step": 19509 + }, + { + "epoch": 0.11603149681225616, + "grad_norm": 1.0634559392929077, + "learning_rate": 4.835750829038134e-05, + "loss": 5.2188, + "step": 19510 + }, + { + "epoch": 0.11603744409553716, + "grad_norm": 1.0847052335739136, + "learning_rate": 4.8357341771807894e-05, + "loss": 5.1993, + "step": 19511 + }, + { + "epoch": 0.11604339137881815, + "grad_norm": 1.2893394231796265, + "learning_rate": 4.8357175245080645e-05, + "loss": 5.278, + "step": 19512 + }, + { + "epoch": 0.11604933866209916, + "grad_norm": 1.1346744298934937, + "learning_rate": 4.8357008710199653e-05, + "loss": 5.0915, + "step": 19513 + }, + { + "epoch": 0.11605528594538016, + "grad_norm": 1.2405723333358765, + "learning_rate": 4.835684216716497e-05, + "loss": 5.3274, + "step": 19514 + }, + { + "epoch": 0.11606123322866115, + "grad_norm": 1.2367215156555176, + "learning_rate": 4.8356675615976646e-05, + "loss": 5.3145, + "step": 19515 + }, + { + "epoch": 0.11606718051194215, + "grad_norm": 1.23695969581604, + "learning_rate": 4.835650905663476e-05, + "loss": 5.1454, + "step": 19516 + }, + { + "epoch": 0.11607312779522314, + "grad_norm": 1.649644136428833, + "learning_rate": 4.835634248913935e-05, + "loss": 4.9684, + "step": 19517 + }, + { + "epoch": 0.11607907507850414, + "grad_norm": 1.3828257322311401, + "learning_rate": 4.835617591349049e-05, + "loss": 4.8913, + "step": 19518 + }, + { + "epoch": 0.11608502236178514, + "grad_norm": 1.4446587562561035, + "learning_rate": 4.8356009329688215e-05, + "loss": 4.9248, + "step": 19519 + }, + { + "epoch": 0.11609096964506613, + "grad_norm": 1.4149401187896729, + "learning_rate": 4.835584273773261e-05, + "loss": 5.0446, + "step": 19520 + }, + { + "epoch": 0.11609691692834713, + "grad_norm": 1.4073368310928345, + "learning_rate": 4.835567613762372e-05, + "loss": 5.1451, + "step": 19521 + }, + { + "epoch": 0.11610286421162813, + "grad_norm": 1.438539743423462, + "learning_rate": 4.835550952936161e-05, + "loss": 5.3629, + "step": 19522 + }, + { + "epoch": 0.11610881149490912, + "grad_norm": 1.4686654806137085, + "learning_rate": 4.835534291294632e-05, + "loss": 5.4386, + "step": 19523 + }, + { + "epoch": 0.11611475877819012, + "grad_norm": 1.3416131734848022, + "learning_rate": 4.835517628837793e-05, + "loss": 5.4625, + "step": 19524 + }, + { + "epoch": 0.11612070606147112, + "grad_norm": 1.38942551612854, + "learning_rate": 4.835500965565649e-05, + "loss": 5.2164, + "step": 19525 + }, + { + "epoch": 0.11612665334475211, + "grad_norm": 1.157583475112915, + "learning_rate": 4.835484301478205e-05, + "loss": 4.931, + "step": 19526 + }, + { + "epoch": 0.11613260062803311, + "grad_norm": 1.1182529926300049, + "learning_rate": 4.835467636575468e-05, + "loss": 5.0804, + "step": 19527 + }, + { + "epoch": 0.11613854791131412, + "grad_norm": 1.1087690591812134, + "learning_rate": 4.835450970857444e-05, + "loss": 4.9112, + "step": 19528 + }, + { + "epoch": 0.1161444951945951, + "grad_norm": 1.1217858791351318, + "learning_rate": 4.8354343043241374e-05, + "loss": 4.8775, + "step": 19529 + }, + { + "epoch": 0.1161504424778761, + "grad_norm": 1.703722596168518, + "learning_rate": 4.8354176369755556e-05, + "loss": 5.0991, + "step": 19530 + }, + { + "epoch": 0.11615638976115711, + "grad_norm": 1.5027599334716797, + "learning_rate": 4.8354009688117026e-05, + "loss": 5.3486, + "step": 19531 + }, + { + "epoch": 0.1161623370444381, + "grad_norm": 1.3976017236709595, + "learning_rate": 4.835384299832586e-05, + "loss": 5.3045, + "step": 19532 + }, + { + "epoch": 0.1161682843277191, + "grad_norm": 1.4341175556182861, + "learning_rate": 4.83536763003821e-05, + "loss": 5.2463, + "step": 19533 + }, + { + "epoch": 0.1161742316110001, + "grad_norm": 1.248632550239563, + "learning_rate": 4.835350959428582e-05, + "loss": 5.1573, + "step": 19534 + }, + { + "epoch": 0.11618017889428109, + "grad_norm": 1.2873725891113281, + "learning_rate": 4.835334288003707e-05, + "loss": 5.3115, + "step": 19535 + }, + { + "epoch": 0.11618612617756209, + "grad_norm": 1.4359512329101562, + "learning_rate": 4.835317615763591e-05, + "loss": 5.1134, + "step": 19536 + }, + { + "epoch": 0.11619207346084309, + "grad_norm": 1.3092215061187744, + "learning_rate": 4.8353009427082395e-05, + "loss": 5.2955, + "step": 19537 + }, + { + "epoch": 0.11619802074412408, + "grad_norm": 1.292256474494934, + "learning_rate": 4.8352842688376585e-05, + "loss": 5.2163, + "step": 19538 + }, + { + "epoch": 0.11620396802740508, + "grad_norm": 1.2327983379364014, + "learning_rate": 4.8352675941518545e-05, + "loss": 5.2785, + "step": 19539 + }, + { + "epoch": 0.11620991531068608, + "grad_norm": 1.3402459621429443, + "learning_rate": 4.835250918650832e-05, + "loss": 5.2474, + "step": 19540 + }, + { + "epoch": 0.11621586259396707, + "grad_norm": 1.4312702417373657, + "learning_rate": 4.835234242334598e-05, + "loss": 5.1451, + "step": 19541 + }, + { + "epoch": 0.11622180987724808, + "grad_norm": 1.4165308475494385, + "learning_rate": 4.8352175652031576e-05, + "loss": 5.2241, + "step": 19542 + }, + { + "epoch": 0.11622775716052908, + "grad_norm": 1.1984010934829712, + "learning_rate": 4.835200887256517e-05, + "loss": 5.2084, + "step": 19543 + }, + { + "epoch": 0.11623370444381007, + "grad_norm": 1.277029275894165, + "learning_rate": 4.835184208494682e-05, + "loss": 5.1136, + "step": 19544 + }, + { + "epoch": 0.11623965172709107, + "grad_norm": 1.4002219438552856, + "learning_rate": 4.8351675289176586e-05, + "loss": 5.1313, + "step": 19545 + }, + { + "epoch": 0.11624559901037206, + "grad_norm": 1.397129774093628, + "learning_rate": 4.835150848525452e-05, + "loss": 5.2001, + "step": 19546 + }, + { + "epoch": 0.11625154629365306, + "grad_norm": 1.3968653678894043, + "learning_rate": 4.8351341673180686e-05, + "loss": 5.1292, + "step": 19547 + }, + { + "epoch": 0.11625749357693406, + "grad_norm": 1.298600435256958, + "learning_rate": 4.8351174852955125e-05, + "loss": 5.1185, + "step": 19548 + }, + { + "epoch": 0.11626344086021505, + "grad_norm": 1.119382619857788, + "learning_rate": 4.835100802457793e-05, + "loss": 5.2052, + "step": 19549 + }, + { + "epoch": 0.11626938814349605, + "grad_norm": 1.2555358409881592, + "learning_rate": 4.835084118804913e-05, + "loss": 5.2604, + "step": 19550 + }, + { + "epoch": 0.11627533542677705, + "grad_norm": 1.293525218963623, + "learning_rate": 4.835067434336879e-05, + "loss": 5.1402, + "step": 19551 + }, + { + "epoch": 0.11628128271005804, + "grad_norm": 1.3321988582611084, + "learning_rate": 4.8350507490536976e-05, + "loss": 5.0959, + "step": 19552 + }, + { + "epoch": 0.11628722999333904, + "grad_norm": 1.3231252431869507, + "learning_rate": 4.835034062955374e-05, + "loss": 5.0461, + "step": 19553 + }, + { + "epoch": 0.11629317727662004, + "grad_norm": 1.2743831872940063, + "learning_rate": 4.835017376041914e-05, + "loss": 5.1215, + "step": 19554 + }, + { + "epoch": 0.11629912455990103, + "grad_norm": 1.3750208616256714, + "learning_rate": 4.835000688313323e-05, + "loss": 5.0459, + "step": 19555 + }, + { + "epoch": 0.11630507184318203, + "grad_norm": 1.394209861755371, + "learning_rate": 4.834983999769609e-05, + "loss": 5.1577, + "step": 19556 + }, + { + "epoch": 0.11631101912646304, + "grad_norm": 1.2393178939819336, + "learning_rate": 4.834967310410775e-05, + "loss": 5.1217, + "step": 19557 + }, + { + "epoch": 0.11631696640974402, + "grad_norm": 1.2668427228927612, + "learning_rate": 4.834950620236829e-05, + "loss": 5.0266, + "step": 19558 + }, + { + "epoch": 0.11632291369302503, + "grad_norm": 1.4088828563690186, + "learning_rate": 4.834933929247775e-05, + "loss": 4.8089, + "step": 19559 + }, + { + "epoch": 0.11632886097630603, + "grad_norm": 1.2668780088424683, + "learning_rate": 4.83491723744362e-05, + "loss": 5.2791, + "step": 19560 + }, + { + "epoch": 0.11633480825958702, + "grad_norm": 1.3243741989135742, + "learning_rate": 4.834900544824369e-05, + "loss": 5.1743, + "step": 19561 + }, + { + "epoch": 0.11634075554286802, + "grad_norm": 1.497856616973877, + "learning_rate": 4.834883851390029e-05, + "loss": 4.8667, + "step": 19562 + }, + { + "epoch": 0.11634670282614902, + "grad_norm": 1.426867961883545, + "learning_rate": 4.834867157140605e-05, + "loss": 4.9758, + "step": 19563 + }, + { + "epoch": 0.11635265010943001, + "grad_norm": 1.4427236318588257, + "learning_rate": 4.834850462076103e-05, + "loss": 5.45, + "step": 19564 + }, + { + "epoch": 0.11635859739271101, + "grad_norm": 1.4465901851654053, + "learning_rate": 4.834833766196528e-05, + "loss": 5.0877, + "step": 19565 + }, + { + "epoch": 0.11636454467599201, + "grad_norm": 1.76282799243927, + "learning_rate": 4.834817069501888e-05, + "loss": 5.0607, + "step": 19566 + }, + { + "epoch": 0.116370491959273, + "grad_norm": 1.4688469171524048, + "learning_rate": 4.8348003719921864e-05, + "loss": 4.9929, + "step": 19567 + }, + { + "epoch": 0.116376439242554, + "grad_norm": 1.576390266418457, + "learning_rate": 4.834783673667431e-05, + "loss": 5.7283, + "step": 19568 + }, + { + "epoch": 0.116382386525835, + "grad_norm": 1.517745852470398, + "learning_rate": 4.834766974527626e-05, + "loss": 5.3711, + "step": 19569 + }, + { + "epoch": 0.11638833380911599, + "grad_norm": 1.5122108459472656, + "learning_rate": 4.834750274572778e-05, + "loss": 5.6297, + "step": 19570 + }, + { + "epoch": 0.116394281092397, + "grad_norm": 1.9188055992126465, + "learning_rate": 4.8347335738028934e-05, + "loss": 5.0911, + "step": 19571 + }, + { + "epoch": 0.116400228375678, + "grad_norm": 1.7408324480056763, + "learning_rate": 4.834716872217977e-05, + "loss": 5.1396, + "step": 19572 + }, + { + "epoch": 0.11640617565895899, + "grad_norm": 1.7669044733047485, + "learning_rate": 4.834700169818035e-05, + "loss": 5.1463, + "step": 19573 + }, + { + "epoch": 0.11641212294223999, + "grad_norm": 1.7838845252990723, + "learning_rate": 4.834683466603074e-05, + "loss": 5.3486, + "step": 19574 + }, + { + "epoch": 0.11641807022552098, + "grad_norm": 1.8427141904830933, + "learning_rate": 4.834666762573098e-05, + "loss": 5.1454, + "step": 19575 + }, + { + "epoch": 0.11642401750880198, + "grad_norm": 1.8620864152908325, + "learning_rate": 4.8346500577281145e-05, + "loss": 4.9462, + "step": 19576 + }, + { + "epoch": 0.11642996479208298, + "grad_norm": 1.7334544658660889, + "learning_rate": 4.834633352068129e-05, + "loss": 4.9012, + "step": 19577 + }, + { + "epoch": 0.11643591207536397, + "grad_norm": 1.7202188968658447, + "learning_rate": 4.834616645593147e-05, + "loss": 5.2577, + "step": 19578 + }, + { + "epoch": 0.11644185935864497, + "grad_norm": 1.5666993856430054, + "learning_rate": 4.834599938303174e-05, + "loss": 4.9502, + "step": 19579 + }, + { + "epoch": 0.11644780664192597, + "grad_norm": 1.5880829095840454, + "learning_rate": 4.834583230198217e-05, + "loss": 5.1193, + "step": 19580 + }, + { + "epoch": 0.11645375392520696, + "grad_norm": 1.7851444482803345, + "learning_rate": 4.834566521278281e-05, + "loss": 5.1411, + "step": 19581 + }, + { + "epoch": 0.11645970120848796, + "grad_norm": 1.8817992210388184, + "learning_rate": 4.834549811543371e-05, + "loss": 5.1773, + "step": 19582 + }, + { + "epoch": 0.11646564849176896, + "grad_norm": 1.8055325746536255, + "learning_rate": 4.834533100993495e-05, + "loss": 4.8526, + "step": 19583 + }, + { + "epoch": 0.11647159577504995, + "grad_norm": 1.501705527305603, + "learning_rate": 4.834516389628657e-05, + "loss": 4.9943, + "step": 19584 + }, + { + "epoch": 0.11647754305833095, + "grad_norm": 1.8224765062332153, + "learning_rate": 4.8344996774488635e-05, + "loss": 5.3321, + "step": 19585 + }, + { + "epoch": 0.11648349034161196, + "grad_norm": 1.7806826829910278, + "learning_rate": 4.83448296445412e-05, + "loss": 5.1565, + "step": 19586 + }, + { + "epoch": 0.11648943762489294, + "grad_norm": 1.64619779586792, + "learning_rate": 4.8344662506444334e-05, + "loss": 4.9259, + "step": 19587 + }, + { + "epoch": 0.11649538490817395, + "grad_norm": 1.7176555395126343, + "learning_rate": 4.834449536019808e-05, + "loss": 4.9173, + "step": 19588 + }, + { + "epoch": 0.11650133219145495, + "grad_norm": 1.7485530376434326, + "learning_rate": 4.834432820580251e-05, + "loss": 4.9548, + "step": 19589 + }, + { + "epoch": 0.11650727947473594, + "grad_norm": 1.8407695293426514, + "learning_rate": 4.834416104325767e-05, + "loss": 5.5323, + "step": 19590 + }, + { + "epoch": 0.11651322675801694, + "grad_norm": 1.37450110912323, + "learning_rate": 4.834399387256363e-05, + "loss": 5.0058, + "step": 19591 + }, + { + "epoch": 0.11651917404129794, + "grad_norm": 1.6784085035324097, + "learning_rate": 4.834382669372044e-05, + "loss": 5.0886, + "step": 19592 + }, + { + "epoch": 0.11652512132457893, + "grad_norm": 1.9228695631027222, + "learning_rate": 4.834365950672816e-05, + "loss": 5.5382, + "step": 19593 + }, + { + "epoch": 0.11653106860785993, + "grad_norm": 1.7998968362808228, + "learning_rate": 4.834349231158685e-05, + "loss": 5.3286, + "step": 19594 + }, + { + "epoch": 0.11653701589114093, + "grad_norm": 1.9077783823013306, + "learning_rate": 4.8343325108296574e-05, + "loss": 4.9033, + "step": 19595 + }, + { + "epoch": 0.11654296317442192, + "grad_norm": 1.3677197694778442, + "learning_rate": 4.834315789685738e-05, + "loss": 5.4146, + "step": 19596 + }, + { + "epoch": 0.11654891045770292, + "grad_norm": 1.5490330457687378, + "learning_rate": 4.834299067726933e-05, + "loss": 5.8435, + "step": 19597 + }, + { + "epoch": 0.11655485774098392, + "grad_norm": 1.7260395288467407, + "learning_rate": 4.8342823449532484e-05, + "loss": 4.9687, + "step": 19598 + }, + { + "epoch": 0.11656080502426491, + "grad_norm": 1.5140855312347412, + "learning_rate": 4.83426562136469e-05, + "loss": 4.8185, + "step": 19599 + }, + { + "epoch": 0.11656675230754591, + "grad_norm": 1.7183781862258911, + "learning_rate": 4.834248896961263e-05, + "loss": 4.954, + "step": 19600 + }, + { + "epoch": 0.11657269959082692, + "grad_norm": 1.3909941911697388, + "learning_rate": 4.834232171742975e-05, + "loss": 5.3393, + "step": 19601 + }, + { + "epoch": 0.1165786468741079, + "grad_norm": 1.437046766281128, + "learning_rate": 4.83421544570983e-05, + "loss": 5.5486, + "step": 19602 + }, + { + "epoch": 0.11658459415738891, + "grad_norm": 1.4513304233551025, + "learning_rate": 4.8341987188618344e-05, + "loss": 5.6754, + "step": 19603 + }, + { + "epoch": 0.1165905414406699, + "grad_norm": 1.7366830110549927, + "learning_rate": 4.8341819911989936e-05, + "loss": 5.5651, + "step": 19604 + }, + { + "epoch": 0.1165964887239509, + "grad_norm": 1.7084081172943115, + "learning_rate": 4.834165262721315e-05, + "loss": 5.5237, + "step": 19605 + }, + { + "epoch": 0.1166024360072319, + "grad_norm": 1.588749647140503, + "learning_rate": 4.834148533428803e-05, + "loss": 5.5371, + "step": 19606 + }, + { + "epoch": 0.11660838329051289, + "grad_norm": 1.6907262802124023, + "learning_rate": 4.834131803321464e-05, + "loss": 5.3998, + "step": 19607 + }, + { + "epoch": 0.11661433057379389, + "grad_norm": 1.676530122756958, + "learning_rate": 4.834115072399304e-05, + "loss": 5.1636, + "step": 19608 + }, + { + "epoch": 0.11662027785707489, + "grad_norm": 1.6379070281982422, + "learning_rate": 4.834098340662327e-05, + "loss": 5.4196, + "step": 19609 + }, + { + "epoch": 0.11662622514035588, + "grad_norm": 1.6794102191925049, + "learning_rate": 4.8340816081105424e-05, + "loss": 5.3671, + "step": 19610 + }, + { + "epoch": 0.11663217242363688, + "grad_norm": 1.7833147048950195, + "learning_rate": 4.834064874743953e-05, + "loss": 5.3417, + "step": 19611 + }, + { + "epoch": 0.11663811970691788, + "grad_norm": 1.649409532546997, + "learning_rate": 4.834048140562566e-05, + "loss": 5.2781, + "step": 19612 + }, + { + "epoch": 0.11664406699019887, + "grad_norm": 1.6082829236984253, + "learning_rate": 4.834031405566387e-05, + "loss": 5.1188, + "step": 19613 + }, + { + "epoch": 0.11665001427347987, + "grad_norm": 1.6651804447174072, + "learning_rate": 4.834014669755421e-05, + "loss": 5.1683, + "step": 19614 + }, + { + "epoch": 0.11665596155676088, + "grad_norm": 1.715795636177063, + "learning_rate": 4.8339979331296755e-05, + "loss": 5.2491, + "step": 19615 + }, + { + "epoch": 0.11666190884004186, + "grad_norm": 1.6809749603271484, + "learning_rate": 4.8339811956891546e-05, + "loss": 5.0614, + "step": 19616 + }, + { + "epoch": 0.11666785612332287, + "grad_norm": 1.563790202140808, + "learning_rate": 4.833964457433865e-05, + "loss": 5.231, + "step": 19617 + }, + { + "epoch": 0.11667380340660387, + "grad_norm": 1.464647650718689, + "learning_rate": 4.8339477183638136e-05, + "loss": 5.0405, + "step": 19618 + }, + { + "epoch": 0.11667975068988486, + "grad_norm": 1.989701509475708, + "learning_rate": 4.8339309784790043e-05, + "loss": 5.4454, + "step": 19619 + }, + { + "epoch": 0.11668569797316586, + "grad_norm": 2.438558340072632, + "learning_rate": 4.833914237779444e-05, + "loss": 5.7298, + "step": 19620 + }, + { + "epoch": 0.11669164525644686, + "grad_norm": 1.7590994834899902, + "learning_rate": 4.833897496265139e-05, + "loss": 5.4473, + "step": 19621 + }, + { + "epoch": 0.11669759253972785, + "grad_norm": 2.1040074825286865, + "learning_rate": 4.833880753936093e-05, + "loss": 5.2399, + "step": 19622 + }, + { + "epoch": 0.11670353982300885, + "grad_norm": 1.7136433124542236, + "learning_rate": 4.8338640107923146e-05, + "loss": 5.21, + "step": 19623 + }, + { + "epoch": 0.11670948710628985, + "grad_norm": 1.5797784328460693, + "learning_rate": 4.8338472668338074e-05, + "loss": 5.3555, + "step": 19624 + }, + { + "epoch": 0.11671543438957084, + "grad_norm": 1.512645959854126, + "learning_rate": 4.833830522060579e-05, + "loss": 5.4964, + "step": 19625 + }, + { + "epoch": 0.11672138167285184, + "grad_norm": 1.9328651428222656, + "learning_rate": 4.833813776472634e-05, + "loss": 5.9072, + "step": 19626 + }, + { + "epoch": 0.11672732895613284, + "grad_norm": 1.882068395614624, + "learning_rate": 4.8337970300699795e-05, + "loss": 5.4304, + "step": 19627 + }, + { + "epoch": 0.11673327623941383, + "grad_norm": 2.1347815990448, + "learning_rate": 4.83378028285262e-05, + "loss": 5.1286, + "step": 19628 + }, + { + "epoch": 0.11673922352269483, + "grad_norm": 2.0237247943878174, + "learning_rate": 4.833763534820562e-05, + "loss": 5.113, + "step": 19629 + }, + { + "epoch": 0.11674517080597584, + "grad_norm": 1.5656205415725708, + "learning_rate": 4.833746785973811e-05, + "loss": 4.8452, + "step": 19630 + }, + { + "epoch": 0.11675111808925683, + "grad_norm": 2.268324613571167, + "learning_rate": 4.833730036312374e-05, + "loss": 5.7184, + "step": 19631 + }, + { + "epoch": 0.11675706537253783, + "grad_norm": 2.1705756187438965, + "learning_rate": 4.833713285836255e-05, + "loss": 5.6489, + "step": 19632 + }, + { + "epoch": 0.11676301265581882, + "grad_norm": 1.7976182699203491, + "learning_rate": 4.833696534545461e-05, + "loss": 5.7016, + "step": 19633 + }, + { + "epoch": 0.11676895993909982, + "grad_norm": 1.2853381633758545, + "learning_rate": 4.8336797824399976e-05, + "loss": 5.654, + "step": 19634 + }, + { + "epoch": 0.11677490722238082, + "grad_norm": 1.8741413354873657, + "learning_rate": 4.833663029519871e-05, + "loss": 5.6735, + "step": 19635 + }, + { + "epoch": 0.11678085450566181, + "grad_norm": 1.4911704063415527, + "learning_rate": 4.8336462757850864e-05, + "loss": 5.3877, + "step": 19636 + }, + { + "epoch": 0.11678680178894281, + "grad_norm": 1.7979151010513306, + "learning_rate": 4.8336295212356506e-05, + "loss": 5.5677, + "step": 19637 + }, + { + "epoch": 0.11679274907222381, + "grad_norm": 2.036970376968384, + "learning_rate": 4.8336127658715677e-05, + "loss": 5.4768, + "step": 19638 + }, + { + "epoch": 0.1167986963555048, + "grad_norm": 1.9423377513885498, + "learning_rate": 4.833596009692846e-05, + "loss": 5.4021, + "step": 19639 + }, + { + "epoch": 0.1168046436387858, + "grad_norm": 1.5860786437988281, + "learning_rate": 4.8335792526994894e-05, + "loss": 5.3363, + "step": 19640 + }, + { + "epoch": 0.1168105909220668, + "grad_norm": 1.5712209939956665, + "learning_rate": 4.833562494891504e-05, + "loss": 5.432, + "step": 19641 + }, + { + "epoch": 0.11681653820534779, + "grad_norm": 1.3889914751052856, + "learning_rate": 4.833545736268897e-05, + "loss": 5.3272, + "step": 19642 + }, + { + "epoch": 0.1168224854886288, + "grad_norm": 1.607134461402893, + "learning_rate": 4.8335289768316726e-05, + "loss": 5.9617, + "step": 19643 + }, + { + "epoch": 0.1168284327719098, + "grad_norm": 1.6738252639770508, + "learning_rate": 4.8335122165798376e-05, + "loss": 5.6361, + "step": 19644 + }, + { + "epoch": 0.11683438005519078, + "grad_norm": 1.6006174087524414, + "learning_rate": 4.8334954555133974e-05, + "loss": 5.7384, + "step": 19645 + }, + { + "epoch": 0.11684032733847179, + "grad_norm": 1.7018747329711914, + "learning_rate": 4.833478693632358e-05, + "loss": 5.0784, + "step": 19646 + }, + { + "epoch": 0.11684627462175279, + "grad_norm": 1.7542921304702759, + "learning_rate": 4.833461930936726e-05, + "loss": 5.2674, + "step": 19647 + }, + { + "epoch": 0.11685222190503378, + "grad_norm": 1.6434245109558105, + "learning_rate": 4.8334451674265055e-05, + "loss": 4.7117, + "step": 19648 + }, + { + "epoch": 0.11685816918831478, + "grad_norm": 1.7878485918045044, + "learning_rate": 4.8334284031017044e-05, + "loss": 4.8068, + "step": 19649 + }, + { + "epoch": 0.11686411647159578, + "grad_norm": 1.7029922008514404, + "learning_rate": 4.833411637962327e-05, + "loss": 4.9168, + "step": 19650 + }, + { + "epoch": 0.11687006375487677, + "grad_norm": 1.8004266023635864, + "learning_rate": 4.83339487200838e-05, + "loss": 4.9931, + "step": 19651 + }, + { + "epoch": 0.11687601103815777, + "grad_norm": 1.7843881845474243, + "learning_rate": 4.833378105239869e-05, + "loss": 5.0786, + "step": 19652 + }, + { + "epoch": 0.11688195832143877, + "grad_norm": 1.697993278503418, + "learning_rate": 4.833361337656799e-05, + "loss": 5.188, + "step": 19653 + }, + { + "epoch": 0.11688790560471976, + "grad_norm": 1.8484392166137695, + "learning_rate": 4.833344569259177e-05, + "loss": 5.4858, + "step": 19654 + }, + { + "epoch": 0.11689385288800076, + "grad_norm": 1.6850509643554688, + "learning_rate": 4.833327800047009e-05, + "loss": 5.7946, + "step": 19655 + }, + { + "epoch": 0.11689980017128176, + "grad_norm": 1.709845781326294, + "learning_rate": 4.8333110300203e-05, + "loss": 6.0674, + "step": 19656 + }, + { + "epoch": 0.11690574745456275, + "grad_norm": 1.6634660959243774, + "learning_rate": 4.833294259179057e-05, + "loss": 5.8038, + "step": 19657 + }, + { + "epoch": 0.11691169473784375, + "grad_norm": 1.6274930238723755, + "learning_rate": 4.833277487523283e-05, + "loss": 5.6752, + "step": 19658 + }, + { + "epoch": 0.11691764202112476, + "grad_norm": 1.5415219068527222, + "learning_rate": 4.833260715052988e-05, + "loss": 5.4002, + "step": 19659 + }, + { + "epoch": 0.11692358930440575, + "grad_norm": 1.6023998260498047, + "learning_rate": 4.833243941768175e-05, + "loss": 5.2429, + "step": 19660 + }, + { + "epoch": 0.11692953658768675, + "grad_norm": 1.4608384370803833, + "learning_rate": 4.8332271676688515e-05, + "loss": 5.5144, + "step": 19661 + }, + { + "epoch": 0.11693548387096774, + "grad_norm": 1.700076937675476, + "learning_rate": 4.833210392755021e-05, + "loss": 5.6356, + "step": 19662 + }, + { + "epoch": 0.11694143115424874, + "grad_norm": 1.415705919265747, + "learning_rate": 4.833193617026692e-05, + "loss": 5.6977, + "step": 19663 + }, + { + "epoch": 0.11694737843752974, + "grad_norm": 1.620815634727478, + "learning_rate": 4.833176840483868e-05, + "loss": 5.8967, + "step": 19664 + }, + { + "epoch": 0.11695332572081073, + "grad_norm": 1.4221736192703247, + "learning_rate": 4.833160063126558e-05, + "loss": 5.5351, + "step": 19665 + }, + { + "epoch": 0.11695927300409173, + "grad_norm": 1.460254192352295, + "learning_rate": 4.833143284954764e-05, + "loss": 5.327, + "step": 19666 + }, + { + "epoch": 0.11696522028737273, + "grad_norm": 1.8340283632278442, + "learning_rate": 4.833126505968495e-05, + "loss": 5.199, + "step": 19667 + }, + { + "epoch": 0.11697116757065372, + "grad_norm": 1.4036595821380615, + "learning_rate": 4.8331097261677555e-05, + "loss": 5.185, + "step": 19668 + }, + { + "epoch": 0.11697711485393472, + "grad_norm": 1.5454041957855225, + "learning_rate": 4.833092945552551e-05, + "loss": 5.3545, + "step": 19669 + }, + { + "epoch": 0.11698306213721572, + "grad_norm": 1.4965288639068604, + "learning_rate": 4.8330761641228886e-05, + "loss": 5.2993, + "step": 19670 + }, + { + "epoch": 0.11698900942049671, + "grad_norm": 2.4290192127227783, + "learning_rate": 4.833059381878773e-05, + "loss": 5.2738, + "step": 19671 + }, + { + "epoch": 0.11699495670377771, + "grad_norm": 2.502086877822876, + "learning_rate": 4.8330425988202097e-05, + "loss": 5.3218, + "step": 19672 + }, + { + "epoch": 0.11700090398705872, + "grad_norm": 2.1629221439361572, + "learning_rate": 4.833025814947206e-05, + "loss": 5.304, + "step": 19673 + }, + { + "epoch": 0.1170068512703397, + "grad_norm": 2.096604824066162, + "learning_rate": 4.8330090302597675e-05, + "loss": 5.3423, + "step": 19674 + }, + { + "epoch": 0.1170127985536207, + "grad_norm": 2.2843055725097656, + "learning_rate": 4.832992244757899e-05, + "loss": 5.2463, + "step": 19675 + }, + { + "epoch": 0.11701874583690171, + "grad_norm": 2.1538522243499756, + "learning_rate": 4.8329754584416074e-05, + "loss": 5.0529, + "step": 19676 + }, + { + "epoch": 0.1170246931201827, + "grad_norm": 1.763832688331604, + "learning_rate": 4.832958671310898e-05, + "loss": 5.105, + "step": 19677 + }, + { + "epoch": 0.1170306404034637, + "grad_norm": 2.048945426940918, + "learning_rate": 4.832941883365777e-05, + "loss": 5.1724, + "step": 19678 + }, + { + "epoch": 0.1170365876867447, + "grad_norm": 2.324202537536621, + "learning_rate": 4.83292509460625e-05, + "loss": 5.1574, + "step": 19679 + }, + { + "epoch": 0.11704253497002569, + "grad_norm": 2.447587728500366, + "learning_rate": 4.8329083050323235e-05, + "loss": 5.2401, + "step": 19680 + }, + { + "epoch": 0.11704848225330669, + "grad_norm": 2.212921380996704, + "learning_rate": 4.832891514644002e-05, + "loss": 5.1122, + "step": 19681 + }, + { + "epoch": 0.11705442953658769, + "grad_norm": 2.1183717250823975, + "learning_rate": 4.832874723441292e-05, + "loss": 4.985, + "step": 19682 + }, + { + "epoch": 0.11706037681986868, + "grad_norm": 2.1509101390838623, + "learning_rate": 4.8328579314242006e-05, + "loss": 5.1369, + "step": 19683 + }, + { + "epoch": 0.11706632410314968, + "grad_norm": 1.9071851968765259, + "learning_rate": 4.832841138592732e-05, + "loss": 5.0454, + "step": 19684 + }, + { + "epoch": 0.11707227138643068, + "grad_norm": 2.262612819671631, + "learning_rate": 4.8328243449468926e-05, + "loss": 5.0763, + "step": 19685 + }, + { + "epoch": 0.11707821866971167, + "grad_norm": 2.073665142059326, + "learning_rate": 4.8328075504866874e-05, + "loss": 5.0779, + "step": 19686 + }, + { + "epoch": 0.11708416595299267, + "grad_norm": 1.9270633459091187, + "learning_rate": 4.832790755212124e-05, + "loss": 4.8148, + "step": 19687 + }, + { + "epoch": 0.11709011323627368, + "grad_norm": 1.9167968034744263, + "learning_rate": 4.832773959123208e-05, + "loss": 4.8027, + "step": 19688 + }, + { + "epoch": 0.11709606051955466, + "grad_norm": 2.0495805740356445, + "learning_rate": 4.8327571622199444e-05, + "loss": 4.9483, + "step": 19689 + }, + { + "epoch": 0.11710200780283567, + "grad_norm": 2.203997850418091, + "learning_rate": 4.83274036450234e-05, + "loss": 5.1086, + "step": 19690 + }, + { + "epoch": 0.11710795508611666, + "grad_norm": 2.0023131370544434, + "learning_rate": 4.8327235659703984e-05, + "loss": 5.0601, + "step": 19691 + }, + { + "epoch": 0.11711390236939766, + "grad_norm": 2.3212523460388184, + "learning_rate": 4.832706766624128e-05, + "loss": 4.9391, + "step": 19692 + }, + { + "epoch": 0.11711984965267866, + "grad_norm": 2.2633869647979736, + "learning_rate": 4.8326899664635336e-05, + "loss": 5.0262, + "step": 19693 + }, + { + "epoch": 0.11712579693595965, + "grad_norm": 2.2608723640441895, + "learning_rate": 4.832673165488622e-05, + "loss": 4.9814, + "step": 19694 + }, + { + "epoch": 0.11713174421924065, + "grad_norm": 2.0270745754241943, + "learning_rate": 4.8326563636993975e-05, + "loss": 4.9321, + "step": 19695 + }, + { + "epoch": 0.11713769150252165, + "grad_norm": 2.1299290657043457, + "learning_rate": 4.832639561095867e-05, + "loss": 4.8248, + "step": 19696 + }, + { + "epoch": 0.11714363878580264, + "grad_norm": 2.1891887187957764, + "learning_rate": 4.8326227576780355e-05, + "loss": 4.963, + "step": 19697 + }, + { + "epoch": 0.11714958606908364, + "grad_norm": 2.35532546043396, + "learning_rate": 4.8326059534459114e-05, + "loss": 4.8617, + "step": 19698 + }, + { + "epoch": 0.11715553335236464, + "grad_norm": 2.215864658355713, + "learning_rate": 4.8325891483994964e-05, + "loss": 5.1467, + "step": 19699 + }, + { + "epoch": 0.11716148063564563, + "grad_norm": 1.7004871368408203, + "learning_rate": 4.8325723425387996e-05, + "loss": 4.8682, + "step": 19700 + }, + { + "epoch": 0.11716742791892663, + "grad_norm": 2.537426471710205, + "learning_rate": 4.832555535863826e-05, + "loss": 5.0373, + "step": 19701 + }, + { + "epoch": 0.11717337520220764, + "grad_norm": 2.3324837684631348, + "learning_rate": 4.832538728374581e-05, + "loss": 4.9261, + "step": 19702 + }, + { + "epoch": 0.11717932248548862, + "grad_norm": 2.107374906539917, + "learning_rate": 4.832521920071071e-05, + "loss": 5.0036, + "step": 19703 + }, + { + "epoch": 0.11718526976876963, + "grad_norm": 2.0933899879455566, + "learning_rate": 4.8325051109533024e-05, + "loss": 5.086, + "step": 19704 + }, + { + "epoch": 0.11719121705205063, + "grad_norm": 1.9250128269195557, + "learning_rate": 4.8324883010212794e-05, + "loss": 4.9056, + "step": 19705 + }, + { + "epoch": 0.11719716433533162, + "grad_norm": 2.0679538249969482, + "learning_rate": 4.832471490275009e-05, + "loss": 5.0291, + "step": 19706 + }, + { + "epoch": 0.11720311161861262, + "grad_norm": 2.1115055084228516, + "learning_rate": 4.8324546787144974e-05, + "loss": 4.8649, + "step": 19707 + }, + { + "epoch": 0.11720905890189362, + "grad_norm": 2.123899459838867, + "learning_rate": 4.832437866339749e-05, + "loss": 4.9011, + "step": 19708 + }, + { + "epoch": 0.11721500618517461, + "grad_norm": 2.2809536457061768, + "learning_rate": 4.832421053150772e-05, + "loss": 5.1844, + "step": 19709 + }, + { + "epoch": 0.11722095346845561, + "grad_norm": 2.04567551612854, + "learning_rate": 4.83240423914757e-05, + "loss": 4.8685, + "step": 19710 + }, + { + "epoch": 0.11722690075173661, + "grad_norm": 1.5762519836425781, + "learning_rate": 4.8323874243301495e-05, + "loss": 5.4069, + "step": 19711 + }, + { + "epoch": 0.1172328480350176, + "grad_norm": 1.719250202178955, + "learning_rate": 4.832370608698518e-05, + "loss": 5.6127, + "step": 19712 + }, + { + "epoch": 0.1172387953182986, + "grad_norm": 1.6808120012283325, + "learning_rate": 4.8323537922526785e-05, + "loss": 5.5401, + "step": 19713 + }, + { + "epoch": 0.1172447426015796, + "grad_norm": 1.6794480085372925, + "learning_rate": 4.832336974992639e-05, + "loss": 5.6679, + "step": 19714 + }, + { + "epoch": 0.11725068988486059, + "grad_norm": 1.7805535793304443, + "learning_rate": 4.832320156918405e-05, + "loss": 5.5025, + "step": 19715 + }, + { + "epoch": 0.1172566371681416, + "grad_norm": 2.1433472633361816, + "learning_rate": 4.832303338029982e-05, + "loss": 5.2425, + "step": 19716 + }, + { + "epoch": 0.1172625844514226, + "grad_norm": 1.5449565649032593, + "learning_rate": 4.832286518327376e-05, + "loss": 5.3278, + "step": 19717 + }, + { + "epoch": 0.11726853173470358, + "grad_norm": 1.7341786623001099, + "learning_rate": 4.832269697810592e-05, + "loss": 5.3393, + "step": 19718 + }, + { + "epoch": 0.11727447901798459, + "grad_norm": 1.4936028718948364, + "learning_rate": 4.832252876479638e-05, + "loss": 5.0499, + "step": 19719 + }, + { + "epoch": 0.11728042630126558, + "grad_norm": 1.7648371458053589, + "learning_rate": 4.832236054334518e-05, + "loss": 5.3585, + "step": 19720 + }, + { + "epoch": 0.11728637358454658, + "grad_norm": 1.8131940364837646, + "learning_rate": 4.832219231375238e-05, + "loss": 5.2496, + "step": 19721 + }, + { + "epoch": 0.11729232086782758, + "grad_norm": 1.5939579010009766, + "learning_rate": 4.832202407601806e-05, + "loss": 5.2294, + "step": 19722 + }, + { + "epoch": 0.11729826815110857, + "grad_norm": 1.6752222776412964, + "learning_rate": 4.832185583014225e-05, + "loss": 5.2679, + "step": 19723 + }, + { + "epoch": 0.11730421543438957, + "grad_norm": 1.4784640073776245, + "learning_rate": 4.832168757612502e-05, + "loss": 5.1567, + "step": 19724 + }, + { + "epoch": 0.11731016271767057, + "grad_norm": 1.5112851858139038, + "learning_rate": 4.8321519313966436e-05, + "loss": 5.0304, + "step": 19725 + }, + { + "epoch": 0.11731611000095156, + "grad_norm": 1.5895473957061768, + "learning_rate": 4.832135104366654e-05, + "loss": 5.0681, + "step": 19726 + }, + { + "epoch": 0.11732205728423256, + "grad_norm": 1.510641098022461, + "learning_rate": 4.832118276522541e-05, + "loss": 5.0667, + "step": 19727 + }, + { + "epoch": 0.11732800456751356, + "grad_norm": 1.7403017282485962, + "learning_rate": 4.83210144786431e-05, + "loss": 4.9199, + "step": 19728 + }, + { + "epoch": 0.11733395185079455, + "grad_norm": 2.239452600479126, + "learning_rate": 4.832084618391966e-05, + "loss": 5.2846, + "step": 19729 + }, + { + "epoch": 0.11733989913407555, + "grad_norm": 1.977001428604126, + "learning_rate": 4.8320677881055154e-05, + "loss": 4.9573, + "step": 19730 + }, + { + "epoch": 0.11734584641735656, + "grad_norm": 2.2819485664367676, + "learning_rate": 4.8320509570049633e-05, + "loss": 4.6549, + "step": 19731 + }, + { + "epoch": 0.11735179370063754, + "grad_norm": 2.3943941593170166, + "learning_rate": 4.832034125090317e-05, + "loss": 4.8411, + "step": 19732 + }, + { + "epoch": 0.11735774098391855, + "grad_norm": 2.5439767837524414, + "learning_rate": 4.832017292361582e-05, + "loss": 4.7305, + "step": 19733 + }, + { + "epoch": 0.11736368826719955, + "grad_norm": 2.21797251701355, + "learning_rate": 4.8320004588187636e-05, + "loss": 4.8963, + "step": 19734 + }, + { + "epoch": 0.11736963555048054, + "grad_norm": 1.9822254180908203, + "learning_rate": 4.831983624461868e-05, + "loss": 4.8062, + "step": 19735 + }, + { + "epoch": 0.11737558283376154, + "grad_norm": 2.56172513961792, + "learning_rate": 4.8319667892909004e-05, + "loss": 4.6495, + "step": 19736 + }, + { + "epoch": 0.11738153011704254, + "grad_norm": 2.3328988552093506, + "learning_rate": 4.831949953305868e-05, + "loss": 4.3587, + "step": 19737 + }, + { + "epoch": 0.11738747740032353, + "grad_norm": 2.4720728397369385, + "learning_rate": 4.831933116506775e-05, + "loss": 4.5648, + "step": 19738 + }, + { + "epoch": 0.11739342468360453, + "grad_norm": 2.3738696575164795, + "learning_rate": 4.831916278893629e-05, + "loss": 4.391, + "step": 19739 + }, + { + "epoch": 0.11739937196688553, + "grad_norm": 2.400050640106201, + "learning_rate": 4.831899440466435e-05, + "loss": 4.5792, + "step": 19740 + }, + { + "epoch": 0.11740531925016652, + "grad_norm": 1.7596909999847412, + "learning_rate": 4.831882601225199e-05, + "loss": 4.8026, + "step": 19741 + }, + { + "epoch": 0.11741126653344752, + "grad_norm": 2.2190558910369873, + "learning_rate": 4.831865761169927e-05, + "loss": 4.578, + "step": 19742 + }, + { + "epoch": 0.11741721381672852, + "grad_norm": 2.468982458114624, + "learning_rate": 4.831848920300624e-05, + "loss": 4.3132, + "step": 19743 + }, + { + "epoch": 0.11742316110000951, + "grad_norm": 2.1495306491851807, + "learning_rate": 4.831832078617298e-05, + "loss": 4.5307, + "step": 19744 + }, + { + "epoch": 0.11742910838329051, + "grad_norm": 2.2298312187194824, + "learning_rate": 4.831815236119953e-05, + "loss": 4.3435, + "step": 19745 + }, + { + "epoch": 0.11743505566657152, + "grad_norm": 2.0968551635742188, + "learning_rate": 4.831798392808595e-05, + "loss": 4.4348, + "step": 19746 + }, + { + "epoch": 0.1174410029498525, + "grad_norm": 2.2520592212677, + "learning_rate": 4.831781548683231e-05, + "loss": 4.4347, + "step": 19747 + }, + { + "epoch": 0.1174469502331335, + "grad_norm": 2.5319058895111084, + "learning_rate": 4.8317647037438655e-05, + "loss": 4.3817, + "step": 19748 + }, + { + "epoch": 0.1174528975164145, + "grad_norm": 2.186539649963379, + "learning_rate": 4.8317478579905054e-05, + "loss": 4.6415, + "step": 19749 + }, + { + "epoch": 0.1174588447996955, + "grad_norm": 2.472963571548462, + "learning_rate": 4.8317310114231554e-05, + "loss": 4.4495, + "step": 19750 + }, + { + "epoch": 0.1174647920829765, + "grad_norm": 2.3692901134490967, + "learning_rate": 4.831714164041823e-05, + "loss": 4.3571, + "step": 19751 + }, + { + "epoch": 0.11747073936625749, + "grad_norm": 1.8001717329025269, + "learning_rate": 4.831697315846513e-05, + "loss": 5.3843, + "step": 19752 + }, + { + "epoch": 0.11747668664953849, + "grad_norm": 1.6087725162506104, + "learning_rate": 4.8316804668372315e-05, + "loss": 5.7155, + "step": 19753 + }, + { + "epoch": 0.11748263393281949, + "grad_norm": 1.5348961353302002, + "learning_rate": 4.8316636170139845e-05, + "loss": 4.8697, + "step": 19754 + }, + { + "epoch": 0.11748858121610048, + "grad_norm": 1.790076494216919, + "learning_rate": 4.831646766376778e-05, + "loss": 5.708, + "step": 19755 + }, + { + "epoch": 0.11749452849938148, + "grad_norm": 1.8615236282348633, + "learning_rate": 4.831629914925617e-05, + "loss": 5.3669, + "step": 19756 + }, + { + "epoch": 0.11750047578266248, + "grad_norm": 1.5969476699829102, + "learning_rate": 4.8316130626605096e-05, + "loss": 5.4041, + "step": 19757 + }, + { + "epoch": 0.11750642306594347, + "grad_norm": 1.5471712350845337, + "learning_rate": 4.8315962095814584e-05, + "loss": 5.5293, + "step": 19758 + }, + { + "epoch": 0.11751237034922447, + "grad_norm": 1.6281818151474, + "learning_rate": 4.831579355688472e-05, + "loss": 5.51, + "step": 19759 + }, + { + "epoch": 0.11751831763250548, + "grad_norm": 1.5264689922332764, + "learning_rate": 4.831562500981555e-05, + "loss": 4.9906, + "step": 19760 + }, + { + "epoch": 0.11752426491578646, + "grad_norm": 1.8446382284164429, + "learning_rate": 4.8315456454607145e-05, + "loss": 4.8351, + "step": 19761 + }, + { + "epoch": 0.11753021219906747, + "grad_norm": 2.0462918281555176, + "learning_rate": 4.8315287891259545e-05, + "loss": 4.7906, + "step": 19762 + }, + { + "epoch": 0.11753615948234847, + "grad_norm": 1.664975643157959, + "learning_rate": 4.831511931977282e-05, + "loss": 5.4149, + "step": 19763 + }, + { + "epoch": 0.11754210676562946, + "grad_norm": 1.8824998140335083, + "learning_rate": 4.831495074014703e-05, + "loss": 5.2587, + "step": 19764 + }, + { + "epoch": 0.11754805404891046, + "grad_norm": 1.6167455911636353, + "learning_rate": 4.8314782152382235e-05, + "loss": 5.3213, + "step": 19765 + }, + { + "epoch": 0.11755400133219146, + "grad_norm": 1.686562180519104, + "learning_rate": 4.831461355647848e-05, + "loss": 5.3497, + "step": 19766 + }, + { + "epoch": 0.11755994861547245, + "grad_norm": 1.7332249879837036, + "learning_rate": 4.831444495243584e-05, + "loss": 5.3139, + "step": 19767 + }, + { + "epoch": 0.11756589589875345, + "grad_norm": 1.6482213735580444, + "learning_rate": 4.8314276340254375e-05, + "loss": 5.5488, + "step": 19768 + }, + { + "epoch": 0.11757184318203445, + "grad_norm": 1.6714067459106445, + "learning_rate": 4.8314107719934134e-05, + "loss": 4.7354, + "step": 19769 + }, + { + "epoch": 0.11757779046531544, + "grad_norm": 1.5826655626296997, + "learning_rate": 4.8313939091475166e-05, + "loss": 5.5232, + "step": 19770 + }, + { + "epoch": 0.11758373774859644, + "grad_norm": 1.4177565574645996, + "learning_rate": 4.831377045487756e-05, + "loss": 5.4262, + "step": 19771 + }, + { + "epoch": 0.11758968503187744, + "grad_norm": 1.4056715965270996, + "learning_rate": 4.831360181014135e-05, + "loss": 5.6306, + "step": 19772 + }, + { + "epoch": 0.11759563231515843, + "grad_norm": 1.7903814315795898, + "learning_rate": 4.83134331572666e-05, + "loss": 4.5016, + "step": 19773 + }, + { + "epoch": 0.11760157959843943, + "grad_norm": 1.8719782829284668, + "learning_rate": 4.831326449625337e-05, + "loss": 4.3561, + "step": 19774 + }, + { + "epoch": 0.11760752688172044, + "grad_norm": 2.0182130336761475, + "learning_rate": 4.831309582710173e-05, + "loss": 4.3988, + "step": 19775 + }, + { + "epoch": 0.11761347416500142, + "grad_norm": 1.828475832939148, + "learning_rate": 4.8312927149811726e-05, + "loss": 4.4127, + "step": 19776 + }, + { + "epoch": 0.11761942144828243, + "grad_norm": 1.8332375288009644, + "learning_rate": 4.831275846438341e-05, + "loss": 4.3285, + "step": 19777 + }, + { + "epoch": 0.11762536873156341, + "grad_norm": 1.7542626857757568, + "learning_rate": 4.831258977081686e-05, + "loss": 5.4412, + "step": 19778 + }, + { + "epoch": 0.11763131601484442, + "grad_norm": 1.9277591705322266, + "learning_rate": 4.831242106911212e-05, + "loss": 4.1537, + "step": 19779 + }, + { + "epoch": 0.11763726329812542, + "grad_norm": 1.943296194076538, + "learning_rate": 4.8312252359269265e-05, + "loss": 4.448, + "step": 19780 + }, + { + "epoch": 0.11764321058140641, + "grad_norm": 1.8032363653182983, + "learning_rate": 4.831208364128834e-05, + "loss": 4.9847, + "step": 19781 + }, + { + "epoch": 0.11764915786468741, + "grad_norm": 1.9383130073547363, + "learning_rate": 4.83119149151694e-05, + "loss": 4.7231, + "step": 19782 + }, + { + "epoch": 0.11765510514796841, + "grad_norm": 1.8854987621307373, + "learning_rate": 4.831174618091252e-05, + "loss": 4.1493, + "step": 19783 + }, + { + "epoch": 0.1176610524312494, + "grad_norm": 1.932180404663086, + "learning_rate": 4.831157743851775e-05, + "loss": 4.0519, + "step": 19784 + }, + { + "epoch": 0.1176669997145304, + "grad_norm": 1.885292887687683, + "learning_rate": 4.831140868798514e-05, + "loss": 4.1593, + "step": 19785 + }, + { + "epoch": 0.1176729469978114, + "grad_norm": 1.8257746696472168, + "learning_rate": 4.8311239929314764e-05, + "loss": 4.3896, + "step": 19786 + }, + { + "epoch": 0.11767889428109239, + "grad_norm": 1.9383732080459595, + "learning_rate": 4.831107116250667e-05, + "loss": 4.1973, + "step": 19787 + }, + { + "epoch": 0.1176848415643734, + "grad_norm": 1.9942466020584106, + "learning_rate": 4.831090238756093e-05, + "loss": 4.3542, + "step": 19788 + }, + { + "epoch": 0.1176907888476544, + "grad_norm": 1.5551074743270874, + "learning_rate": 4.831073360447759e-05, + "loss": 4.9338, + "step": 19789 + }, + { + "epoch": 0.11769673613093538, + "grad_norm": 1.5898525714874268, + "learning_rate": 4.831056481325672e-05, + "loss": 4.8582, + "step": 19790 + }, + { + "epoch": 0.11770268341421639, + "grad_norm": 1.7175228595733643, + "learning_rate": 4.831039601389836e-05, + "loss": 4.6618, + "step": 19791 + }, + { + "epoch": 0.11770863069749739, + "grad_norm": 2.3165528774261475, + "learning_rate": 4.8310227206402594e-05, + "loss": 4.8579, + "step": 19792 + }, + { + "epoch": 0.11771457798077838, + "grad_norm": 1.4406440258026123, + "learning_rate": 4.8310058390769464e-05, + "loss": 5.6443, + "step": 19793 + }, + { + "epoch": 0.11772052526405938, + "grad_norm": 1.6670812368392944, + "learning_rate": 4.8309889566999037e-05, + "loss": 5.2096, + "step": 19794 + }, + { + "epoch": 0.11772647254734038, + "grad_norm": 1.6150201559066772, + "learning_rate": 4.8309720735091354e-05, + "loss": 5.2055, + "step": 19795 + }, + { + "epoch": 0.11773241983062137, + "grad_norm": 1.7714163064956665, + "learning_rate": 4.83095518950465e-05, + "loss": 5.9145, + "step": 19796 + }, + { + "epoch": 0.11773836711390237, + "grad_norm": 1.3608043193817139, + "learning_rate": 4.8309383046864526e-05, + "loss": 5.1546, + "step": 19797 + }, + { + "epoch": 0.11774431439718337, + "grad_norm": 1.2962807416915894, + "learning_rate": 4.830921419054548e-05, + "loss": 5.3574, + "step": 19798 + }, + { + "epoch": 0.11775026168046436, + "grad_norm": 2.0007364749908447, + "learning_rate": 4.8309045326089434e-05, + "loss": 5.0939, + "step": 19799 + }, + { + "epoch": 0.11775620896374536, + "grad_norm": 1.6526695489883423, + "learning_rate": 4.830887645349644e-05, + "loss": 5.7498, + "step": 19800 + }, + { + "epoch": 0.11776215624702636, + "grad_norm": 1.4990460872650146, + "learning_rate": 4.830870757276655e-05, + "loss": 5.2728, + "step": 19801 + }, + { + "epoch": 0.11776810353030735, + "grad_norm": 2.182511806488037, + "learning_rate": 4.830853868389984e-05, + "loss": 5.1598, + "step": 19802 + }, + { + "epoch": 0.11777405081358835, + "grad_norm": 2.515284538269043, + "learning_rate": 4.8308369786896354e-05, + "loss": 5.1378, + "step": 19803 + }, + { + "epoch": 0.11777999809686936, + "grad_norm": 1.9783490896224976, + "learning_rate": 4.830820088175616e-05, + "loss": 4.9242, + "step": 19804 + }, + { + "epoch": 0.11778594538015034, + "grad_norm": 1.790901780128479, + "learning_rate": 4.8308031968479315e-05, + "loss": 5.1156, + "step": 19805 + }, + { + "epoch": 0.11779189266343135, + "grad_norm": 1.751846432685852, + "learning_rate": 4.830786304706587e-05, + "loss": 5.2306, + "step": 19806 + }, + { + "epoch": 0.11779783994671233, + "grad_norm": 1.588497519493103, + "learning_rate": 4.83076941175159e-05, + "loss": 5.3987, + "step": 19807 + }, + { + "epoch": 0.11780378722999334, + "grad_norm": 1.9150582551956177, + "learning_rate": 4.830752517982945e-05, + "loss": 4.977, + "step": 19808 + }, + { + "epoch": 0.11780973451327434, + "grad_norm": 1.706708312034607, + "learning_rate": 4.8307356234006584e-05, + "loss": 5.0455, + "step": 19809 + }, + { + "epoch": 0.11781568179655533, + "grad_norm": 1.9373780488967896, + "learning_rate": 4.830718728004736e-05, + "loss": 5.0547, + "step": 19810 + }, + { + "epoch": 0.11782162907983633, + "grad_norm": 1.6948046684265137, + "learning_rate": 4.830701831795184e-05, + "loss": 5.0943, + "step": 19811 + }, + { + "epoch": 0.11782757636311733, + "grad_norm": 1.630083680152893, + "learning_rate": 4.8306849347720087e-05, + "loss": 5.6369, + "step": 19812 + }, + { + "epoch": 0.11783352364639832, + "grad_norm": 1.4906461238861084, + "learning_rate": 4.830668036935214e-05, + "loss": 5.2921, + "step": 19813 + }, + { + "epoch": 0.11783947092967932, + "grad_norm": 1.6434717178344727, + "learning_rate": 4.8306511382848076e-05, + "loss": 5.3473, + "step": 19814 + }, + { + "epoch": 0.11784541821296032, + "grad_norm": 1.5606834888458252, + "learning_rate": 4.8306342388207956e-05, + "loss": 5.3031, + "step": 19815 + }, + { + "epoch": 0.11785136549624131, + "grad_norm": 2.157352924346924, + "learning_rate": 4.830617338543183e-05, + "loss": 4.4939, + "step": 19816 + }, + { + "epoch": 0.11785731277952231, + "grad_norm": 2.49686598777771, + "learning_rate": 4.830600437451975e-05, + "loss": 4.506, + "step": 19817 + }, + { + "epoch": 0.11786326006280332, + "grad_norm": 1.943969964981079, + "learning_rate": 4.830583535547179e-05, + "loss": 4.411, + "step": 19818 + }, + { + "epoch": 0.1178692073460843, + "grad_norm": 1.9092329740524292, + "learning_rate": 4.830566632828801e-05, + "loss": 4.4121, + "step": 19819 + }, + { + "epoch": 0.1178751546293653, + "grad_norm": 1.7568551301956177, + "learning_rate": 4.830549729296846e-05, + "loss": 4.317, + "step": 19820 + }, + { + "epoch": 0.11788110191264631, + "grad_norm": 1.788150429725647, + "learning_rate": 4.83053282495132e-05, + "loss": 4.2928, + "step": 19821 + }, + { + "epoch": 0.1178870491959273, + "grad_norm": 1.9792863130569458, + "learning_rate": 4.830515919792229e-05, + "loss": 4.3219, + "step": 19822 + }, + { + "epoch": 0.1178929964792083, + "grad_norm": 2.2407681941986084, + "learning_rate": 4.8304990138195795e-05, + "loss": 4.296, + "step": 19823 + }, + { + "epoch": 0.1178989437624893, + "grad_norm": 1.993288516998291, + "learning_rate": 4.830482107033377e-05, + "loss": 4.2922, + "step": 19824 + }, + { + "epoch": 0.11790489104577029, + "grad_norm": 2.1966097354888916, + "learning_rate": 4.8304651994336264e-05, + "loss": 4.1215, + "step": 19825 + }, + { + "epoch": 0.11791083832905129, + "grad_norm": 1.569989562034607, + "learning_rate": 4.8304482910203345e-05, + "loss": 5.5432, + "step": 19826 + }, + { + "epoch": 0.11791678561233229, + "grad_norm": 1.522828459739685, + "learning_rate": 4.8304313817935075e-05, + "loss": 5.465, + "step": 19827 + }, + { + "epoch": 0.11792273289561328, + "grad_norm": 1.9455969333648682, + "learning_rate": 4.830414471753151e-05, + "loss": 5.1462, + "step": 19828 + }, + { + "epoch": 0.11792868017889428, + "grad_norm": 1.8587162494659424, + "learning_rate": 4.830397560899271e-05, + "loss": 5.1987, + "step": 19829 + }, + { + "epoch": 0.11793462746217528, + "grad_norm": 2.1671674251556396, + "learning_rate": 4.830380649231873e-05, + "loss": 5.3333, + "step": 19830 + }, + { + "epoch": 0.11794057474545627, + "grad_norm": 1.8267066478729248, + "learning_rate": 4.8303637367509636e-05, + "loss": 5.5306, + "step": 19831 + }, + { + "epoch": 0.11794652202873727, + "grad_norm": 1.80419921875, + "learning_rate": 4.830346823456548e-05, + "loss": 5.3077, + "step": 19832 + }, + { + "epoch": 0.11795246931201828, + "grad_norm": 1.9116721153259277, + "learning_rate": 4.830329909348632e-05, + "loss": 4.8531, + "step": 19833 + }, + { + "epoch": 0.11795841659529926, + "grad_norm": 1.9208347797393799, + "learning_rate": 4.830312994427223e-05, + "loss": 4.9645, + "step": 19834 + }, + { + "epoch": 0.11796436387858027, + "grad_norm": 1.8385374546051025, + "learning_rate": 4.8302960786923246e-05, + "loss": 4.7095, + "step": 19835 + }, + { + "epoch": 0.11797031116186125, + "grad_norm": 1.9271587133407593, + "learning_rate": 4.830279162143945e-05, + "loss": 4.5788, + "step": 19836 + }, + { + "epoch": 0.11797625844514226, + "grad_norm": 2.0168333053588867, + "learning_rate": 4.8302622447820885e-05, + "loss": 4.7595, + "step": 19837 + }, + { + "epoch": 0.11798220572842326, + "grad_norm": 1.9674837589263916, + "learning_rate": 4.8302453266067616e-05, + "loss": 4.674, + "step": 19838 + }, + { + "epoch": 0.11798815301170425, + "grad_norm": 1.944601058959961, + "learning_rate": 4.830228407617969e-05, + "loss": 4.6683, + "step": 19839 + }, + { + "epoch": 0.11799410029498525, + "grad_norm": 1.8970340490341187, + "learning_rate": 4.83021148781572e-05, + "loss": 5.2577, + "step": 19840 + }, + { + "epoch": 0.11800004757826625, + "grad_norm": 2.035505533218384, + "learning_rate": 4.8301945672000164e-05, + "loss": 4.7872, + "step": 19841 + }, + { + "epoch": 0.11800599486154724, + "grad_norm": 2.4211058616638184, + "learning_rate": 4.830177645770867e-05, + "loss": 4.9424, + "step": 19842 + }, + { + "epoch": 0.11801194214482824, + "grad_norm": 2.080132484436035, + "learning_rate": 4.830160723528276e-05, + "loss": 4.7908, + "step": 19843 + }, + { + "epoch": 0.11801788942810924, + "grad_norm": 3.5975728034973145, + "learning_rate": 4.83014380047225e-05, + "loss": 5.3434, + "step": 19844 + }, + { + "epoch": 0.11802383671139023, + "grad_norm": 1.6917449235916138, + "learning_rate": 4.830126876602795e-05, + "loss": 5.2593, + "step": 19845 + }, + { + "epoch": 0.11802978399467123, + "grad_norm": 1.8179433345794678, + "learning_rate": 4.8301099519199173e-05, + "loss": 5.9407, + "step": 19846 + }, + { + "epoch": 0.11803573127795224, + "grad_norm": 1.652653694152832, + "learning_rate": 4.8300930264236216e-05, + "loss": 5.505, + "step": 19847 + }, + { + "epoch": 0.11804167856123322, + "grad_norm": 1.6400798559188843, + "learning_rate": 4.830076100113915e-05, + "loss": 5.7281, + "step": 19848 + }, + { + "epoch": 0.11804762584451423, + "grad_norm": 1.865049123764038, + "learning_rate": 4.830059172990802e-05, + "loss": 5.4562, + "step": 19849 + }, + { + "epoch": 0.11805357312779523, + "grad_norm": 1.68345308303833, + "learning_rate": 4.8300422450542906e-05, + "loss": 5.3027, + "step": 19850 + }, + { + "epoch": 0.11805952041107622, + "grad_norm": 2.1790804862976074, + "learning_rate": 4.8300253163043855e-05, + "loss": 4.5531, + "step": 19851 + }, + { + "epoch": 0.11806546769435722, + "grad_norm": 2.63421368598938, + "learning_rate": 4.8300083867410915e-05, + "loss": 4.0978, + "step": 19852 + }, + { + "epoch": 0.11807141497763822, + "grad_norm": 1.8692448139190674, + "learning_rate": 4.829991456364417e-05, + "loss": 5.5482, + "step": 19853 + }, + { + "epoch": 0.11807736226091921, + "grad_norm": 1.684128761291504, + "learning_rate": 4.829974525174365e-05, + "loss": 5.5612, + "step": 19854 + }, + { + "epoch": 0.11808330954420021, + "grad_norm": 1.5720278024673462, + "learning_rate": 4.829957593170944e-05, + "loss": 5.6787, + "step": 19855 + }, + { + "epoch": 0.11808925682748121, + "grad_norm": 1.834423303604126, + "learning_rate": 4.829940660354159e-05, + "loss": 4.5591, + "step": 19856 + }, + { + "epoch": 0.1180952041107622, + "grad_norm": 1.7370680570602417, + "learning_rate": 4.829923726724015e-05, + "loss": 5.1643, + "step": 19857 + }, + { + "epoch": 0.1181011513940432, + "grad_norm": 2.1546318531036377, + "learning_rate": 4.829906792280519e-05, + "loss": 4.5788, + "step": 19858 + }, + { + "epoch": 0.1181070986773242, + "grad_norm": 2.5604169368743896, + "learning_rate": 4.829889857023677e-05, + "loss": 3.1948, + "step": 19859 + }, + { + "epoch": 0.11811304596060519, + "grad_norm": 2.072169780731201, + "learning_rate": 4.829872920953494e-05, + "loss": 3.9707, + "step": 19860 + }, + { + "epoch": 0.1181189932438862, + "grad_norm": 1.7981303930282593, + "learning_rate": 4.829855984069976e-05, + "loss": 5.8413, + "step": 19861 + }, + { + "epoch": 0.1181249405271672, + "grad_norm": 1.621327519416809, + "learning_rate": 4.8298390463731305e-05, + "loss": 5.4867, + "step": 19862 + }, + { + "epoch": 0.11813088781044818, + "grad_norm": 1.5245294570922852, + "learning_rate": 4.829822107862962e-05, + "loss": 5.7148, + "step": 19863 + }, + { + "epoch": 0.11813683509372919, + "grad_norm": 2.2656896114349365, + "learning_rate": 4.8298051685394765e-05, + "loss": 5.6678, + "step": 19864 + }, + { + "epoch": 0.11814278237701017, + "grad_norm": 1.8529094457626343, + "learning_rate": 4.8297882284026805e-05, + "loss": 5.4445, + "step": 19865 + }, + { + "epoch": 0.11814872966029118, + "grad_norm": 1.5151565074920654, + "learning_rate": 4.829771287452579e-05, + "loss": 5.2794, + "step": 19866 + }, + { + "epoch": 0.11815467694357218, + "grad_norm": 1.8492248058319092, + "learning_rate": 4.829754345689178e-05, + "loss": 5.0797, + "step": 19867 + }, + { + "epoch": 0.11816062422685317, + "grad_norm": 2.7612802982330322, + "learning_rate": 4.829737403112484e-05, + "loss": 5.1486, + "step": 19868 + }, + { + "epoch": 0.11816657151013417, + "grad_norm": 1.9457459449768066, + "learning_rate": 4.8297204597225035e-05, + "loss": 5.6507, + "step": 19869 + }, + { + "epoch": 0.11817251879341517, + "grad_norm": 1.6429107189178467, + "learning_rate": 4.829703515519242e-05, + "loss": 5.8414, + "step": 19870 + }, + { + "epoch": 0.11817846607669616, + "grad_norm": 1.556187391281128, + "learning_rate": 4.829686570502704e-05, + "loss": 5.9028, + "step": 19871 + }, + { + "epoch": 0.11818441335997716, + "grad_norm": 1.451532006263733, + "learning_rate": 4.8296696246728965e-05, + "loss": 5.8497, + "step": 19872 + }, + { + "epoch": 0.11819036064325816, + "grad_norm": 1.7325583696365356, + "learning_rate": 4.8296526780298256e-05, + "loss": 5.3531, + "step": 19873 + }, + { + "epoch": 0.11819630792653915, + "grad_norm": 1.784332275390625, + "learning_rate": 4.829635730573497e-05, + "loss": 5.6025, + "step": 19874 + }, + { + "epoch": 0.11820225520982015, + "grad_norm": 1.6109933853149414, + "learning_rate": 4.829618782303917e-05, + "loss": 5.5626, + "step": 19875 + }, + { + "epoch": 0.11820820249310116, + "grad_norm": 1.6639639139175415, + "learning_rate": 4.8296018332210905e-05, + "loss": 5.5679, + "step": 19876 + }, + { + "epoch": 0.11821414977638214, + "grad_norm": 1.8205533027648926, + "learning_rate": 4.829584883325025e-05, + "loss": 5.448, + "step": 19877 + }, + { + "epoch": 0.11822009705966315, + "grad_norm": 1.6450576782226562, + "learning_rate": 4.829567932615725e-05, + "loss": 5.5966, + "step": 19878 + }, + { + "epoch": 0.11822604434294415, + "grad_norm": 1.456151008605957, + "learning_rate": 4.829550981093196e-05, + "loss": 5.5194, + "step": 19879 + }, + { + "epoch": 0.11823199162622514, + "grad_norm": 1.6064491271972656, + "learning_rate": 4.829534028757446e-05, + "loss": 5.6929, + "step": 19880 + }, + { + "epoch": 0.11823793890950614, + "grad_norm": 1.438132405281067, + "learning_rate": 4.829517075608479e-05, + "loss": 5.6738, + "step": 19881 + }, + { + "epoch": 0.11824388619278714, + "grad_norm": 2.503048896789551, + "learning_rate": 4.8295001216463024e-05, + "loss": 4.9929, + "step": 19882 + }, + { + "epoch": 0.11824983347606813, + "grad_norm": 2.3379812240600586, + "learning_rate": 4.829483166870921e-05, + "loss": 4.7947, + "step": 19883 + }, + { + "epoch": 0.11825578075934913, + "grad_norm": 2.055328130722046, + "learning_rate": 4.829466211282341e-05, + "loss": 5.3265, + "step": 19884 + }, + { + "epoch": 0.11826172804263013, + "grad_norm": 1.7393126487731934, + "learning_rate": 4.829449254880569e-05, + "loss": 5.0483, + "step": 19885 + }, + { + "epoch": 0.11826767532591112, + "grad_norm": 2.3054347038269043, + "learning_rate": 4.829432297665609e-05, + "loss": 4.9002, + "step": 19886 + }, + { + "epoch": 0.11827362260919212, + "grad_norm": 2.434323310852051, + "learning_rate": 4.82941533963747e-05, + "loss": 4.8013, + "step": 19887 + }, + { + "epoch": 0.11827956989247312, + "grad_norm": 2.0834875106811523, + "learning_rate": 4.829398380796155e-05, + "loss": 4.786, + "step": 19888 + }, + { + "epoch": 0.11828551717575411, + "grad_norm": 1.6682358980178833, + "learning_rate": 4.829381421141671e-05, + "loss": 5.6843, + "step": 19889 + }, + { + "epoch": 0.11829146445903511, + "grad_norm": 1.8787375688552856, + "learning_rate": 4.829364460674025e-05, + "loss": 5.5191, + "step": 19890 + }, + { + "epoch": 0.11829741174231612, + "grad_norm": 1.7496438026428223, + "learning_rate": 4.829347499393221e-05, + "loss": 5.6968, + "step": 19891 + }, + { + "epoch": 0.1183033590255971, + "grad_norm": 1.5585973262786865, + "learning_rate": 4.829330537299266e-05, + "loss": 5.5588, + "step": 19892 + }, + { + "epoch": 0.1183093063088781, + "grad_norm": 1.8294848203659058, + "learning_rate": 4.8293135743921664e-05, + "loss": 5.2407, + "step": 19893 + }, + { + "epoch": 0.11831525359215911, + "grad_norm": 1.4877654314041138, + "learning_rate": 4.829296610671927e-05, + "loss": 5.5383, + "step": 19894 + }, + { + "epoch": 0.1183212008754401, + "grad_norm": 1.5250638723373413, + "learning_rate": 4.829279646138554e-05, + "loss": 5.6443, + "step": 19895 + }, + { + "epoch": 0.1183271481587211, + "grad_norm": 1.5662062168121338, + "learning_rate": 4.829262680792054e-05, + "loss": 5.5409, + "step": 19896 + }, + { + "epoch": 0.11833309544200209, + "grad_norm": 1.1783791780471802, + "learning_rate": 4.829245714632432e-05, + "loss": 5.6169, + "step": 19897 + }, + { + "epoch": 0.11833904272528309, + "grad_norm": 1.4960299730300903, + "learning_rate": 4.829228747659695e-05, + "loss": 5.7195, + "step": 19898 + }, + { + "epoch": 0.11834499000856409, + "grad_norm": 1.437047004699707, + "learning_rate": 4.829211779873848e-05, + "loss": 5.7229, + "step": 19899 + }, + { + "epoch": 0.11835093729184508, + "grad_norm": 1.4095619916915894, + "learning_rate": 4.829194811274897e-05, + "loss": 5.7227, + "step": 19900 + }, + { + "epoch": 0.11835688457512608, + "grad_norm": 1.5694538354873657, + "learning_rate": 4.829177841862849e-05, + "loss": 5.356, + "step": 19901 + }, + { + "epoch": 0.11836283185840708, + "grad_norm": 1.7124476432800293, + "learning_rate": 4.829160871637708e-05, + "loss": 4.9185, + "step": 19902 + }, + { + "epoch": 0.11836877914168807, + "grad_norm": 2.2423064708709717, + "learning_rate": 4.829143900599481e-05, + "loss": 5.4345, + "step": 19903 + }, + { + "epoch": 0.11837472642496907, + "grad_norm": 1.8333791494369507, + "learning_rate": 4.829126928748175e-05, + "loss": 5.3666, + "step": 19904 + }, + { + "epoch": 0.11838067370825008, + "grad_norm": 1.5184969902038574, + "learning_rate": 4.8291099560837936e-05, + "loss": 5.4372, + "step": 19905 + }, + { + "epoch": 0.11838662099153106, + "grad_norm": 1.628544807434082, + "learning_rate": 4.829092982606345e-05, + "loss": 5.2682, + "step": 19906 + }, + { + "epoch": 0.11839256827481207, + "grad_norm": 1.5791584253311157, + "learning_rate": 4.829076008315834e-05, + "loss": 5.2149, + "step": 19907 + }, + { + "epoch": 0.11839851555809307, + "grad_norm": 1.299560546875, + "learning_rate": 4.8290590332122656e-05, + "loss": 5.1735, + "step": 19908 + }, + { + "epoch": 0.11840446284137406, + "grad_norm": 1.343913197517395, + "learning_rate": 4.829042057295647e-05, + "loss": 5.2344, + "step": 19909 + }, + { + "epoch": 0.11841041012465506, + "grad_norm": 1.2621396780014038, + "learning_rate": 4.829025080565985e-05, + "loss": 5.2982, + "step": 19910 + }, + { + "epoch": 0.11841635740793606, + "grad_norm": 1.2189174890518188, + "learning_rate": 4.829008103023284e-05, + "loss": 5.3347, + "step": 19911 + }, + { + "epoch": 0.11842230469121705, + "grad_norm": 1.2917883396148682, + "learning_rate": 4.82899112466755e-05, + "loss": 5.0745, + "step": 19912 + }, + { + "epoch": 0.11842825197449805, + "grad_norm": 1.2382320165634155, + "learning_rate": 4.828974145498789e-05, + "loss": 5.1999, + "step": 19913 + }, + { + "epoch": 0.11843419925777905, + "grad_norm": 1.398218035697937, + "learning_rate": 4.828957165517007e-05, + "loss": 5.4944, + "step": 19914 + }, + { + "epoch": 0.11844014654106004, + "grad_norm": 1.448901653289795, + "learning_rate": 4.8289401847222115e-05, + "loss": 5.4645, + "step": 19915 + }, + { + "epoch": 0.11844609382434104, + "grad_norm": 1.4628182649612427, + "learning_rate": 4.828923203114406e-05, + "loss": 5.003, + "step": 19916 + }, + { + "epoch": 0.11845204110762204, + "grad_norm": 1.3390740156173706, + "learning_rate": 4.828906220693598e-05, + "loss": 5.3482, + "step": 19917 + }, + { + "epoch": 0.11845798839090303, + "grad_norm": 1.539097547531128, + "learning_rate": 4.8288892374597925e-05, + "loss": 5.304, + "step": 19918 + }, + { + "epoch": 0.11846393567418403, + "grad_norm": 1.4011404514312744, + "learning_rate": 4.828872253412996e-05, + "loss": 5.2073, + "step": 19919 + }, + { + "epoch": 0.11846988295746504, + "grad_norm": 1.4064414501190186, + "learning_rate": 4.828855268553214e-05, + "loss": 5.2316, + "step": 19920 + }, + { + "epoch": 0.11847583024074602, + "grad_norm": 1.5808193683624268, + "learning_rate": 4.828838282880452e-05, + "loss": 5.211, + "step": 19921 + }, + { + "epoch": 0.11848177752402703, + "grad_norm": 1.5043809413909912, + "learning_rate": 4.828821296394718e-05, + "loss": 5.0564, + "step": 19922 + }, + { + "epoch": 0.11848772480730803, + "grad_norm": 1.2494529485702515, + "learning_rate": 4.828804309096016e-05, + "loss": 5.1523, + "step": 19923 + }, + { + "epoch": 0.11849367209058902, + "grad_norm": 1.4186055660247803, + "learning_rate": 4.8287873209843524e-05, + "loss": 4.9103, + "step": 19924 + }, + { + "epoch": 0.11849961937387002, + "grad_norm": 1.6093229055404663, + "learning_rate": 4.828770332059733e-05, + "loss": 4.9215, + "step": 19925 + }, + { + "epoch": 0.118505566657151, + "grad_norm": 1.5125865936279297, + "learning_rate": 4.8287533423221643e-05, + "loss": 5.0515, + "step": 19926 + }, + { + "epoch": 0.11851151394043201, + "grad_norm": 1.5410135984420776, + "learning_rate": 4.828736351771652e-05, + "loss": 4.9576, + "step": 19927 + }, + { + "epoch": 0.11851746122371301, + "grad_norm": 1.5431303977966309, + "learning_rate": 4.828719360408201e-05, + "loss": 5.1606, + "step": 19928 + }, + { + "epoch": 0.118523408506994, + "grad_norm": 1.4709242582321167, + "learning_rate": 4.828702368231819e-05, + "loss": 4.7685, + "step": 19929 + }, + { + "epoch": 0.118529355790275, + "grad_norm": 1.173568606376648, + "learning_rate": 4.828685375242511e-05, + "loss": 4.7591, + "step": 19930 + }, + { + "epoch": 0.118535303073556, + "grad_norm": 1.3113515377044678, + "learning_rate": 4.828668381440283e-05, + "loss": 4.786, + "step": 19931 + }, + { + "epoch": 0.11854125035683699, + "grad_norm": 1.4658124446868896, + "learning_rate": 4.828651386825141e-05, + "loss": 4.7776, + "step": 19932 + }, + { + "epoch": 0.118547197640118, + "grad_norm": 1.3406554460525513, + "learning_rate": 4.828634391397091e-05, + "loss": 5.0733, + "step": 19933 + }, + { + "epoch": 0.118553144923399, + "grad_norm": 1.2102482318878174, + "learning_rate": 4.828617395156138e-05, + "loss": 5.0069, + "step": 19934 + }, + { + "epoch": 0.11855909220667998, + "grad_norm": 0.989989697933197, + "learning_rate": 4.828600398102289e-05, + "loss": 4.759, + "step": 19935 + }, + { + "epoch": 0.11856503948996099, + "grad_norm": 1.2296501398086548, + "learning_rate": 4.82858340023555e-05, + "loss": 4.6269, + "step": 19936 + }, + { + "epoch": 0.11857098677324199, + "grad_norm": 1.5649582147598267, + "learning_rate": 4.828566401555926e-05, + "loss": 5.0196, + "step": 19937 + }, + { + "epoch": 0.11857693405652298, + "grad_norm": 1.2393609285354614, + "learning_rate": 4.8285494020634245e-05, + "loss": 5.059, + "step": 19938 + }, + { + "epoch": 0.11858288133980398, + "grad_norm": 1.450697422027588, + "learning_rate": 4.82853240175805e-05, + "loss": 5.1143, + "step": 19939 + }, + { + "epoch": 0.11858882862308498, + "grad_norm": 1.4795258045196533, + "learning_rate": 4.8285154006398084e-05, + "loss": 5.075, + "step": 19940 + }, + { + "epoch": 0.11859477590636597, + "grad_norm": 1.5858484506607056, + "learning_rate": 4.828498398708707e-05, + "loss": 5.0665, + "step": 19941 + }, + { + "epoch": 0.11860072318964697, + "grad_norm": 1.3411937952041626, + "learning_rate": 4.82848139596475e-05, + "loss": 4.9864, + "step": 19942 + }, + { + "epoch": 0.11860667047292797, + "grad_norm": 1.4348468780517578, + "learning_rate": 4.828464392407945e-05, + "loss": 4.904, + "step": 19943 + }, + { + "epoch": 0.11861261775620896, + "grad_norm": 1.4753068685531616, + "learning_rate": 4.8284473880382967e-05, + "loss": 5.0784, + "step": 19944 + }, + { + "epoch": 0.11861856503948996, + "grad_norm": 1.379059076309204, + "learning_rate": 4.828430382855811e-05, + "loss": 4.9782, + "step": 19945 + }, + { + "epoch": 0.11862451232277096, + "grad_norm": 1.444729208946228, + "learning_rate": 4.828413376860495e-05, + "loss": 5.5804, + "step": 19946 + }, + { + "epoch": 0.11863045960605195, + "grad_norm": 1.3467416763305664, + "learning_rate": 4.8283963700523535e-05, + "loss": 5.3278, + "step": 19947 + }, + { + "epoch": 0.11863640688933295, + "grad_norm": 1.5206544399261475, + "learning_rate": 4.8283793624313936e-05, + "loss": 5.01, + "step": 19948 + }, + { + "epoch": 0.11864235417261396, + "grad_norm": 1.394729733467102, + "learning_rate": 4.8283623539976195e-05, + "loss": 5.2139, + "step": 19949 + }, + { + "epoch": 0.11864830145589494, + "grad_norm": 1.3675029277801514, + "learning_rate": 4.8283453447510394e-05, + "loss": 5.4559, + "step": 19950 + }, + { + "epoch": 0.11865424873917595, + "grad_norm": 1.1950232982635498, + "learning_rate": 4.828328334691657e-05, + "loss": 5.2233, + "step": 19951 + }, + { + "epoch": 0.11866019602245695, + "grad_norm": 1.3517179489135742, + "learning_rate": 4.82831132381948e-05, + "loss": 5.0519, + "step": 19952 + }, + { + "epoch": 0.11866614330573794, + "grad_norm": 1.4184643030166626, + "learning_rate": 4.828294312134512e-05, + "loss": 4.8722, + "step": 19953 + }, + { + "epoch": 0.11867209058901894, + "grad_norm": 1.4558582305908203, + "learning_rate": 4.828277299636762e-05, + "loss": 5.3876, + "step": 19954 + }, + { + "epoch": 0.11867803787229993, + "grad_norm": 1.4617977142333984, + "learning_rate": 4.8282602863262345e-05, + "loss": 5.4784, + "step": 19955 + }, + { + "epoch": 0.11868398515558093, + "grad_norm": 1.4997669458389282, + "learning_rate": 4.828243272202935e-05, + "loss": 5.2556, + "step": 19956 + }, + { + "epoch": 0.11868993243886193, + "grad_norm": 1.2730913162231445, + "learning_rate": 4.8282262572668696e-05, + "loss": 5.3194, + "step": 19957 + }, + { + "epoch": 0.11869587972214292, + "grad_norm": 1.4149047136306763, + "learning_rate": 4.8282092415180444e-05, + "loss": 5.5139, + "step": 19958 + }, + { + "epoch": 0.11870182700542392, + "grad_norm": 1.2510145902633667, + "learning_rate": 4.828192224956466e-05, + "loss": 5.2486, + "step": 19959 + }, + { + "epoch": 0.11870777428870492, + "grad_norm": 1.2229409217834473, + "learning_rate": 4.828175207582139e-05, + "loss": 5.2391, + "step": 19960 + }, + { + "epoch": 0.11871372157198591, + "grad_norm": 1.3316899538040161, + "learning_rate": 4.828158189395071e-05, + "loss": 5.2928, + "step": 19961 + }, + { + "epoch": 0.11871966885526691, + "grad_norm": 1.4331640005111694, + "learning_rate": 4.828141170395266e-05, + "loss": 5.3311, + "step": 19962 + }, + { + "epoch": 0.11872561613854792, + "grad_norm": 1.3313428163528442, + "learning_rate": 4.828124150582732e-05, + "loss": 5.2203, + "step": 19963 + }, + { + "epoch": 0.1187315634218289, + "grad_norm": 1.6505075693130493, + "learning_rate": 4.828107129957473e-05, + "loss": 4.8604, + "step": 19964 + }, + { + "epoch": 0.1187375107051099, + "grad_norm": 1.3544394969940186, + "learning_rate": 4.828090108519496e-05, + "loss": 5.17, + "step": 19965 + }, + { + "epoch": 0.11874345798839091, + "grad_norm": 1.3194384574890137, + "learning_rate": 4.828073086268808e-05, + "loss": 5.2197, + "step": 19966 + }, + { + "epoch": 0.1187494052716719, + "grad_norm": 1.4014582633972168, + "learning_rate": 4.8280560632054126e-05, + "loss": 5.2865, + "step": 19967 + }, + { + "epoch": 0.1187553525549529, + "grad_norm": 1.5148218870162964, + "learning_rate": 4.828039039329317e-05, + "loss": 5.3765, + "step": 19968 + }, + { + "epoch": 0.1187612998382339, + "grad_norm": 1.3657969236373901, + "learning_rate": 4.828022014640527e-05, + "loss": 4.9787, + "step": 19969 + }, + { + "epoch": 0.11876724712151489, + "grad_norm": 1.547717571258545, + "learning_rate": 4.828004989139049e-05, + "loss": 5.0538, + "step": 19970 + }, + { + "epoch": 0.11877319440479589, + "grad_norm": 1.5132863521575928, + "learning_rate": 4.827987962824888e-05, + "loss": 5.0301, + "step": 19971 + }, + { + "epoch": 0.11877914168807689, + "grad_norm": 1.4020887613296509, + "learning_rate": 4.827970935698051e-05, + "loss": 4.9646, + "step": 19972 + }, + { + "epoch": 0.11878508897135788, + "grad_norm": 1.4983519315719604, + "learning_rate": 4.8279539077585424e-05, + "loss": 5.2266, + "step": 19973 + }, + { + "epoch": 0.11879103625463888, + "grad_norm": 1.3545745611190796, + "learning_rate": 4.82793687900637e-05, + "loss": 5.108, + "step": 19974 + }, + { + "epoch": 0.11879698353791988, + "grad_norm": 1.4865717887878418, + "learning_rate": 4.827919849441539e-05, + "loss": 5.257, + "step": 19975 + }, + { + "epoch": 0.11880293082120087, + "grad_norm": 1.4389182329177856, + "learning_rate": 4.8279028190640546e-05, + "loss": 4.976, + "step": 19976 + }, + { + "epoch": 0.11880887810448187, + "grad_norm": 1.2823866605758667, + "learning_rate": 4.827885787873924e-05, + "loss": 4.7617, + "step": 19977 + }, + { + "epoch": 0.11881482538776288, + "grad_norm": 1.369992971420288, + "learning_rate": 4.8278687558711525e-05, + "loss": 4.7165, + "step": 19978 + }, + { + "epoch": 0.11882077267104386, + "grad_norm": 1.2873594760894775, + "learning_rate": 4.827851723055745e-05, + "loss": 4.6705, + "step": 19979 + }, + { + "epoch": 0.11882671995432487, + "grad_norm": 1.3779295682907104, + "learning_rate": 4.827834689427709e-05, + "loss": 4.9752, + "step": 19980 + }, + { + "epoch": 0.11883266723760587, + "grad_norm": 1.5264688730239868, + "learning_rate": 4.82781765498705e-05, + "loss": 5.0295, + "step": 19981 + }, + { + "epoch": 0.11883861452088686, + "grad_norm": 1.6745606660842896, + "learning_rate": 4.827800619733774e-05, + "loss": 5.4265, + "step": 19982 + }, + { + "epoch": 0.11884456180416786, + "grad_norm": 1.5993295907974243, + "learning_rate": 4.8277835836678874e-05, + "loss": 5.0611, + "step": 19983 + }, + { + "epoch": 0.11885050908744885, + "grad_norm": 1.6451520919799805, + "learning_rate": 4.827766546789395e-05, + "loss": 4.9504, + "step": 19984 + }, + { + "epoch": 0.11885645637072985, + "grad_norm": 1.4769519567489624, + "learning_rate": 4.827749509098304e-05, + "loss": 5.1324, + "step": 19985 + }, + { + "epoch": 0.11886240365401085, + "grad_norm": 1.6930506229400635, + "learning_rate": 4.827732470594619e-05, + "loss": 5.134, + "step": 19986 + }, + { + "epoch": 0.11886835093729184, + "grad_norm": 1.1951912641525269, + "learning_rate": 4.827715431278347e-05, + "loss": 5.2521, + "step": 19987 + }, + { + "epoch": 0.11887429822057284, + "grad_norm": 1.3520997762680054, + "learning_rate": 4.827698391149493e-05, + "loss": 5.1791, + "step": 19988 + }, + { + "epoch": 0.11888024550385384, + "grad_norm": 1.3710130453109741, + "learning_rate": 4.8276813502080644e-05, + "loss": 5.1179, + "step": 19989 + }, + { + "epoch": 0.11888619278713483, + "grad_norm": 1.4977210760116577, + "learning_rate": 4.827664308454066e-05, + "loss": 5.1492, + "step": 19990 + }, + { + "epoch": 0.11889214007041583, + "grad_norm": 1.2681607007980347, + "learning_rate": 4.8276472658875035e-05, + "loss": 5.1178, + "step": 19991 + }, + { + "epoch": 0.11889808735369684, + "grad_norm": 1.2606865167617798, + "learning_rate": 4.827630222508385e-05, + "loss": 5.2796, + "step": 19992 + }, + { + "epoch": 0.11890403463697782, + "grad_norm": 1.477273941040039, + "learning_rate": 4.827613178316713e-05, + "loss": 5.251, + "step": 19993 + }, + { + "epoch": 0.11890998192025883, + "grad_norm": 1.4194386005401611, + "learning_rate": 4.8275961333124956e-05, + "loss": 5.157, + "step": 19994 + }, + { + "epoch": 0.11891592920353983, + "grad_norm": 1.2693103551864624, + "learning_rate": 4.8275790874957396e-05, + "loss": 5.2037, + "step": 19995 + }, + { + "epoch": 0.11892187648682082, + "grad_norm": 1.2035702466964722, + "learning_rate": 4.8275620408664487e-05, + "loss": 5.1613, + "step": 19996 + }, + { + "epoch": 0.11892782377010182, + "grad_norm": 1.1674199104309082, + "learning_rate": 4.8275449934246295e-05, + "loss": 5.2415, + "step": 19997 + }, + { + "epoch": 0.11893377105338282, + "grad_norm": 1.5064369440078735, + "learning_rate": 4.8275279451702895e-05, + "loss": 5.2025, + "step": 19998 + }, + { + "epoch": 0.11893971833666381, + "grad_norm": 1.3770934343338013, + "learning_rate": 4.827510896103433e-05, + "loss": 5.0804, + "step": 19999 + }, + { + "epoch": 0.11894566561994481, + "grad_norm": 1.4852590560913086, + "learning_rate": 4.827493846224067e-05, + "loss": 5.0169, + "step": 20000 + } + ], + "logging_steps": 1, + "max_steps": 168144, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 8.443217717834547e+16, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-30000/config.json b/checkpoint-30000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-30000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-30000/generation_config.json b/checkpoint-30000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-30000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-30000/model.safetensors.index.json b/checkpoint-30000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-30000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-30000/rng_state_0.pth b/checkpoint-30000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-30000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-30000/rng_state_1.pth b/checkpoint-30000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-30000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-30000/rng_state_2.pth b/checkpoint-30000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-30000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-30000/rng_state_4.pth b/checkpoint-30000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-30000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-30000/rng_state_5.pth b/checkpoint-30000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-30000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-30000/rng_state_6.pth b/checkpoint-30000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-30000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-30000/trainer_state.json b/checkpoint-30000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..88dbd7ee9a46c090fcf8b9a23ad816ed16c0ce4b --- /dev/null +++ b/checkpoint-30000/trainer_state.json @@ -0,0 +1,210034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.17841849842991722, + "eval_steps": 500, + "global_step": 30000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.947283280997241e-06, + "grad_norm": 179.1047821044922, + "learning_rate": 5e-05, + "loss": 14.5158, + "step": 1 + }, + { + "epoch": 1.1894566561994482e-05, + "grad_norm": 40.39401626586914, + "learning_rate": 4.999999999563638e-05, + "loss": 14.152, + "step": 2 + }, + { + "epoch": 1.7841849842991722e-05, + "grad_norm": 137.05079650878906, + "learning_rate": 4.999999998254552e-05, + "loss": 14.6334, + "step": 3 + }, + { + "epoch": 2.3789133123988963e-05, + "grad_norm": 23.315088272094727, + "learning_rate": 4.9999999960727415e-05, + "loss": 12.6458, + "step": 4 + }, + { + "epoch": 2.97364164049862e-05, + "grad_norm": 7.943603992462158, + "learning_rate": 4.9999999930182065e-05, + "loss": 11.8435, + "step": 5 + }, + { + "epoch": 3.5683699685983445e-05, + "grad_norm": 6.374181270599365, + "learning_rate": 4.999999989090948e-05, + "loss": 11.4544, + "step": 6 + }, + { + "epoch": 4.1630982966980686e-05, + "grad_norm": 8.948652267456055, + "learning_rate": 4.999999984290965e-05, + "loss": 11.3516, + "step": 7 + }, + { + "epoch": 4.7578266247977927e-05, + "grad_norm": 3.2318713665008545, + "learning_rate": 4.999999978618258e-05, + "loss": 11.1021, + "step": 8 + }, + { + "epoch": 5.352554952897517e-05, + "grad_norm": 5.6542534828186035, + "learning_rate": 4.9999999720728266e-05, + "loss": 11.0132, + "step": 9 + }, + { + "epoch": 5.94728328099724e-05, + "grad_norm": 3.623577356338501, + "learning_rate": 4.999999964654671e-05, + "loss": 10.8896, + "step": 10 + }, + { + "epoch": 6.542011609096965e-05, + "grad_norm": 3.3209445476531982, + "learning_rate": 4.9999999563637915e-05, + "loss": 10.7339, + "step": 11 + }, + { + "epoch": 7.136739937196689e-05, + "grad_norm": 3.4527082443237305, + "learning_rate": 4.999999947200188e-05, + "loss": 10.5472, + "step": 12 + }, + { + "epoch": 7.731468265296413e-05, + "grad_norm": 3.784444570541382, + "learning_rate": 4.99999993716386e-05, + "loss": 10.4353, + "step": 13 + }, + { + "epoch": 8.326196593396137e-05, + "grad_norm": 4.304569244384766, + "learning_rate": 4.999999926254808e-05, + "loss": 10.4652, + "step": 14 + }, + { + "epoch": 8.920924921495861e-05, + "grad_norm": 3.5867838859558105, + "learning_rate": 4.999999914473032e-05, + "loss": 10.5746, + "step": 15 + }, + { + "epoch": 9.515653249595585e-05, + "grad_norm": 6.1308207511901855, + "learning_rate": 4.9999999018185316e-05, + "loss": 10.4129, + "step": 16 + }, + { + "epoch": 0.0001011038157769531, + "grad_norm": 3.4687230587005615, + "learning_rate": 4.999999888291307e-05, + "loss": 10.2246, + "step": 17 + }, + { + "epoch": 0.00010705109905795033, + "grad_norm": 4.041895866394043, + "learning_rate": 4.9999998738913586e-05, + "loss": 10.0852, + "step": 18 + }, + { + "epoch": 0.00011299838233894758, + "grad_norm": 4.437602519989014, + "learning_rate": 4.999999858618686e-05, + "loss": 9.8841, + "step": 19 + }, + { + "epoch": 0.0001189456656199448, + "grad_norm": 3.9608142375946045, + "learning_rate": 4.9999998424732884e-05, + "loss": 10.0537, + "step": 20 + }, + { + "epoch": 0.00012489294890094204, + "grad_norm": 3.799363613128662, + "learning_rate": 4.999999825455168e-05, + "loss": 9.8487, + "step": 21 + }, + { + "epoch": 0.0001308402321819393, + "grad_norm": 3.626058340072632, + "learning_rate": 4.999999807564323e-05, + "loss": 9.8048, + "step": 22 + }, + { + "epoch": 0.00013678751546293653, + "grad_norm": 4.21406364440918, + "learning_rate": 4.999999788800754e-05, + "loss": 9.6091, + "step": 23 + }, + { + "epoch": 0.00014273479874393378, + "grad_norm": 5.26548957824707, + "learning_rate": 4.9999997691644605e-05, + "loss": 9.3935, + "step": 24 + }, + { + "epoch": 0.000148682082024931, + "grad_norm": 6.5113396644592285, + "learning_rate": 4.999999748655443e-05, + "loss": 9.2602, + "step": 25 + }, + { + "epoch": 0.00015462936530592826, + "grad_norm": 4.6141133308410645, + "learning_rate": 4.9999997272737014e-05, + "loss": 9.1492, + "step": 26 + }, + { + "epoch": 0.0001605766485869255, + "grad_norm": 4.645262241363525, + "learning_rate": 4.999999705019236e-05, + "loss": 9.2238, + "step": 27 + }, + { + "epoch": 0.00016652393186792274, + "grad_norm": 4.599213123321533, + "learning_rate": 4.9999996818920464e-05, + "loss": 9.1673, + "step": 28 + }, + { + "epoch": 0.00017247121514891997, + "grad_norm": 4.820634365081787, + "learning_rate": 4.999999657892133e-05, + "loss": 9.0044, + "step": 29 + }, + { + "epoch": 0.00017841849842991722, + "grad_norm": 4.57854700088501, + "learning_rate": 4.9999996330194956e-05, + "loss": 8.8746, + "step": 30 + }, + { + "epoch": 0.00018436578171091445, + "grad_norm": 4.567880153656006, + "learning_rate": 4.999999607274133e-05, + "loss": 8.7224, + "step": 31 + }, + { + "epoch": 0.0001903130649919117, + "grad_norm": 4.545701503753662, + "learning_rate": 4.9999995806560475e-05, + "loss": 8.6979, + "step": 32 + }, + { + "epoch": 0.00019626034827290893, + "grad_norm": 4.098274230957031, + "learning_rate": 4.9999995531652374e-05, + "loss": 8.5787, + "step": 33 + }, + { + "epoch": 0.0002022076315539062, + "grad_norm": 4.341195106506348, + "learning_rate": 4.999999524801704e-05, + "loss": 8.4452, + "step": 34 + }, + { + "epoch": 0.00020815491483490341, + "grad_norm": 4.651747703552246, + "learning_rate": 4.999999495565446e-05, + "loss": 8.4383, + "step": 35 + }, + { + "epoch": 0.00021410219811590067, + "grad_norm": 4.187220573425293, + "learning_rate": 4.999999465456464e-05, + "loss": 8.2441, + "step": 36 + }, + { + "epoch": 0.0002200494813968979, + "grad_norm": 4.094058990478516, + "learning_rate": 4.999999434474758e-05, + "loss": 8.2784, + "step": 37 + }, + { + "epoch": 0.00022599676467789515, + "grad_norm": 4.6094794273376465, + "learning_rate": 4.999999402620329e-05, + "loss": 8.3893, + "step": 38 + }, + { + "epoch": 0.00023194404795889238, + "grad_norm": 5.391327381134033, + "learning_rate": 4.999999369893175e-05, + "loss": 8.6491, + "step": 39 + }, + { + "epoch": 0.0002378913312398896, + "grad_norm": 5.03748893737793, + "learning_rate": 4.9999993362932974e-05, + "loss": 8.5279, + "step": 40 + }, + { + "epoch": 0.00024383861452088686, + "grad_norm": 5.306002616882324, + "learning_rate": 4.9999993018206956e-05, + "loss": 9.9965, + "step": 41 + }, + { + "epoch": 0.0002497858978018841, + "grad_norm": 5.5374274253845215, + "learning_rate": 4.99999926647537e-05, + "loss": 10.5594, + "step": 42 + }, + { + "epoch": 0.00025573318108288134, + "grad_norm": 3.8107693195343018, + "learning_rate": 4.999999230257321e-05, + "loss": 10.5451, + "step": 43 + }, + { + "epoch": 0.0002616804643638786, + "grad_norm": 3.922286033630371, + "learning_rate": 4.999999193166547e-05, + "loss": 10.4123, + "step": 44 + }, + { + "epoch": 0.0002676277476448758, + "grad_norm": 3.2090535163879395, + "learning_rate": 4.99999915520305e-05, + "loss": 10.0646, + "step": 45 + }, + { + "epoch": 0.00027357503092587305, + "grad_norm": 3.153404474258423, + "learning_rate": 4.9999991163668285e-05, + "loss": 10.237, + "step": 46 + }, + { + "epoch": 0.0002795223142068703, + "grad_norm": 4.83523416519165, + "learning_rate": 4.999999076657884e-05, + "loss": 8.9392, + "step": 47 + }, + { + "epoch": 0.00028546959748786756, + "grad_norm": 3.954632043838501, + "learning_rate": 4.999999036076215e-05, + "loss": 8.8562, + "step": 48 + }, + { + "epoch": 0.00029141688076886476, + "grad_norm": 4.452631950378418, + "learning_rate": 4.999998994621822e-05, + "loss": 9.8819, + "step": 49 + }, + { + "epoch": 0.000297364164049862, + "grad_norm": 4.71603536605835, + "learning_rate": 4.9999989522947055e-05, + "loss": 9.8503, + "step": 50 + }, + { + "epoch": 0.00030331144733085927, + "grad_norm": 3.8008105754852295, + "learning_rate": 4.999998909094865e-05, + "loss": 9.8072, + "step": 51 + }, + { + "epoch": 0.0003092587306118565, + "grad_norm": 3.9906716346740723, + "learning_rate": 4.999998865022301e-05, + "loss": 9.168, + "step": 52 + }, + { + "epoch": 0.0003152060138928537, + "grad_norm": 3.9425785541534424, + "learning_rate": 4.999998820077013e-05, + "loss": 9.8441, + "step": 53 + }, + { + "epoch": 0.000321153297173851, + "grad_norm": 3.6698031425476074, + "learning_rate": 4.999998774259002e-05, + "loss": 10.036, + "step": 54 + }, + { + "epoch": 0.00032710058045484823, + "grad_norm": 3.3027005195617676, + "learning_rate": 4.999998727568266e-05, + "loss": 9.8701, + "step": 55 + }, + { + "epoch": 0.0003330478637358455, + "grad_norm": 3.312570333480835, + "learning_rate": 4.999998680004807e-05, + "loss": 9.3354, + "step": 56 + }, + { + "epoch": 0.0003389951470168427, + "grad_norm": 3.323969602584839, + "learning_rate": 4.999998631568624e-05, + "loss": 9.2899, + "step": 57 + }, + { + "epoch": 0.00034494243029783994, + "grad_norm": 3.1319313049316406, + "learning_rate": 4.999998582259717e-05, + "loss": 9.1033, + "step": 58 + }, + { + "epoch": 0.0003508897135788372, + "grad_norm": 3.655060291290283, + "learning_rate": 4.999998532078087e-05, + "loss": 9.1574, + "step": 59 + }, + { + "epoch": 0.00035683699685983445, + "grad_norm": 3.2051918506622314, + "learning_rate": 4.999998481023733e-05, + "loss": 9.564, + "step": 60 + }, + { + "epoch": 0.00036278428014083165, + "grad_norm": 3.223015308380127, + "learning_rate": 4.999998429096656e-05, + "loss": 9.46, + "step": 61 + }, + { + "epoch": 0.0003687315634218289, + "grad_norm": 4.121186256408691, + "learning_rate": 4.999998376296855e-05, + "loss": 8.4136, + "step": 62 + }, + { + "epoch": 0.00037467884670282616, + "grad_norm": 3.5580086708068848, + "learning_rate": 4.9999983226243296e-05, + "loss": 9.3504, + "step": 63 + }, + { + "epoch": 0.0003806261299838234, + "grad_norm": 3.664219379425049, + "learning_rate": 4.999998268079081e-05, + "loss": 9.2889, + "step": 64 + }, + { + "epoch": 0.0003865734132648206, + "grad_norm": 2.955582618713379, + "learning_rate": 4.99999821266111e-05, + "loss": 8.9193, + "step": 65 + }, + { + "epoch": 0.00039252069654581787, + "grad_norm": 3.0592539310455322, + "learning_rate": 4.9999981563704144e-05, + "loss": 9.6739, + "step": 66 + }, + { + "epoch": 0.0003984679798268151, + "grad_norm": 3.32024884223938, + "learning_rate": 4.999998099206995e-05, + "loss": 9.3648, + "step": 67 + }, + { + "epoch": 0.0004044152631078124, + "grad_norm": 3.2716033458709717, + "learning_rate": 4.9999980411708524e-05, + "loss": 9.3652, + "step": 68 + }, + { + "epoch": 0.0004103625463888096, + "grad_norm": 3.1926631927490234, + "learning_rate": 4.999997982261987e-05, + "loss": 9.2924, + "step": 69 + }, + { + "epoch": 0.00041630982966980683, + "grad_norm": 3.589841604232788, + "learning_rate": 4.999997922480397e-05, + "loss": 9.2185, + "step": 70 + }, + { + "epoch": 0.0004222571129508041, + "grad_norm": 2.902132034301758, + "learning_rate": 4.999997861826084e-05, + "loss": 9.1047, + "step": 71 + }, + { + "epoch": 0.00042820439623180134, + "grad_norm": 3.2352359294891357, + "learning_rate": 4.999997800299048e-05, + "loss": 9.0309, + "step": 72 + }, + { + "epoch": 0.00043415167951279854, + "grad_norm": 2.683664560317993, + "learning_rate": 4.9999977378992884e-05, + "loss": 8.9977, + "step": 73 + }, + { + "epoch": 0.0004400989627937958, + "grad_norm": 3.0073423385620117, + "learning_rate": 4.9999976746268055e-05, + "loss": 9.0967, + "step": 74 + }, + { + "epoch": 0.00044604624607479305, + "grad_norm": 3.364819288253784, + "learning_rate": 4.9999976104815994e-05, + "loss": 8.9401, + "step": 75 + }, + { + "epoch": 0.0004519935293557903, + "grad_norm": 3.478936195373535, + "learning_rate": 4.9999975454636695e-05, + "loss": 8.8173, + "step": 76 + }, + { + "epoch": 0.0004579408126367875, + "grad_norm": 3.059669017791748, + "learning_rate": 4.9999974795730165e-05, + "loss": 9.2588, + "step": 77 + }, + { + "epoch": 0.00046388809591778476, + "grad_norm": 3.1980936527252197, + "learning_rate": 4.999997412809639e-05, + "loss": 9.3374, + "step": 78 + }, + { + "epoch": 0.000469835379198782, + "grad_norm": 2.859935998916626, + "learning_rate": 4.9999973451735405e-05, + "loss": 8.8996, + "step": 79 + }, + { + "epoch": 0.0004757826624797792, + "grad_norm": 3.6268489360809326, + "learning_rate": 4.9999972766647175e-05, + "loss": 8.7878, + "step": 80 + }, + { + "epoch": 0.00048172994576077647, + "grad_norm": 3.0187010765075684, + "learning_rate": 4.9999972072831714e-05, + "loss": 8.9177, + "step": 81 + }, + { + "epoch": 0.0004876772290417737, + "grad_norm": 3.304633378982544, + "learning_rate": 4.9999971370289014e-05, + "loss": 8.8098, + "step": 82 + }, + { + "epoch": 0.0004936245123227709, + "grad_norm": 3.678696870803833, + "learning_rate": 4.999997065901909e-05, + "loss": 8.9408, + "step": 83 + }, + { + "epoch": 0.0004995717956037682, + "grad_norm": 3.485488176345825, + "learning_rate": 4.9999969939021936e-05, + "loss": 8.7374, + "step": 84 + }, + { + "epoch": 0.0005055190788847654, + "grad_norm": 3.276916265487671, + "learning_rate": 4.999996921029755e-05, + "loss": 8.7177, + "step": 85 + }, + { + "epoch": 0.0005114663621657627, + "grad_norm": 3.060227632522583, + "learning_rate": 4.9999968472845926e-05, + "loss": 8.9673, + "step": 86 + }, + { + "epoch": 0.0005174136454467599, + "grad_norm": 3.359055995941162, + "learning_rate": 4.999996772666708e-05, + "loss": 8.8029, + "step": 87 + }, + { + "epoch": 0.0005233609287277572, + "grad_norm": 3.8916943073272705, + "learning_rate": 4.9999966971761004e-05, + "loss": 8.8363, + "step": 88 + }, + { + "epoch": 0.0005293082120087544, + "grad_norm": 3.825075387954712, + "learning_rate": 4.9999966208127694e-05, + "loss": 8.5683, + "step": 89 + }, + { + "epoch": 0.0005352554952897516, + "grad_norm": 3.475759267807007, + "learning_rate": 4.999996543576715e-05, + "loss": 8.5723, + "step": 90 + }, + { + "epoch": 0.0005412027785707488, + "grad_norm": 3.609776020050049, + "learning_rate": 4.9999964654679385e-05, + "loss": 8.6123, + "step": 91 + }, + { + "epoch": 0.0005471500618517461, + "grad_norm": 3.3749685287475586, + "learning_rate": 4.999996386486439e-05, + "loss": 8.4887, + "step": 92 + }, + { + "epoch": 0.0005530973451327434, + "grad_norm": 3.3853306770324707, + "learning_rate": 4.999996306632215e-05, + "loss": 8.56, + "step": 93 + }, + { + "epoch": 0.0005590446284137406, + "grad_norm": 3.9347422122955322, + "learning_rate": 4.99999622590527e-05, + "loss": 8.5053, + "step": 94 + }, + { + "epoch": 0.0005649919116947379, + "grad_norm": 3.6037611961364746, + "learning_rate": 4.999996144305601e-05, + "loss": 8.3367, + "step": 95 + }, + { + "epoch": 0.0005709391949757351, + "grad_norm": 3.4608941078186035, + "learning_rate": 4.99999606183321e-05, + "loss": 8.0674, + "step": 96 + }, + { + "epoch": 0.0005768864782567324, + "grad_norm": 3.4882898330688477, + "learning_rate": 4.999995978488096e-05, + "loss": 8.1728, + "step": 97 + }, + { + "epoch": 0.0005828337615377295, + "grad_norm": 3.6789562702178955, + "learning_rate": 4.999995894270258e-05, + "loss": 7.9535, + "step": 98 + }, + { + "epoch": 0.0005887810448187268, + "grad_norm": 3.57328200340271, + "learning_rate": 4.9999958091796986e-05, + "loss": 8.2048, + "step": 99 + }, + { + "epoch": 0.000594728328099724, + "grad_norm": 3.803468942642212, + "learning_rate": 4.999995723216416e-05, + "loss": 7.8073, + "step": 100 + }, + { + "epoch": 0.0006006756113807213, + "grad_norm": 3.8187785148620605, + "learning_rate": 4.9999956363804116e-05, + "loss": 7.6325, + "step": 101 + }, + { + "epoch": 0.0006066228946617185, + "grad_norm": 3.8681981563568115, + "learning_rate": 4.999995548671684e-05, + "loss": 7.7104, + "step": 102 + }, + { + "epoch": 0.0006125701779427158, + "grad_norm": 3.869074583053589, + "learning_rate": 4.9999954600902334e-05, + "loss": 7.8445, + "step": 103 + }, + { + "epoch": 0.000618517461223713, + "grad_norm": 3.852057695388794, + "learning_rate": 4.99999537063606e-05, + "loss": 7.872, + "step": 104 + }, + { + "epoch": 0.0006244647445047103, + "grad_norm": 4.784586429595947, + "learning_rate": 4.9999952803091654e-05, + "loss": 9.2218, + "step": 105 + }, + { + "epoch": 0.0006304120277857074, + "grad_norm": 4.296675682067871, + "learning_rate": 4.9999951891095474e-05, + "loss": 9.0957, + "step": 106 + }, + { + "epoch": 0.0006363593110667047, + "grad_norm": 3.9155995845794678, + "learning_rate": 4.999995097037207e-05, + "loss": 8.9829, + "step": 107 + }, + { + "epoch": 0.000642306594347702, + "grad_norm": 3.8967478275299072, + "learning_rate": 4.999995004092144e-05, + "loss": 8.2017, + "step": 108 + }, + { + "epoch": 0.0006482538776286992, + "grad_norm": 5.238500595092773, + "learning_rate": 4.999994910274358e-05, + "loss": 7.7976, + "step": 109 + }, + { + "epoch": 0.0006542011609096965, + "grad_norm": 3.7043144702911377, + "learning_rate": 4.9999948155838504e-05, + "loss": 8.3116, + "step": 110 + }, + { + "epoch": 0.0006601484441906937, + "grad_norm": 2.9745211601257324, + "learning_rate": 4.99999472002062e-05, + "loss": 8.69, + "step": 111 + }, + { + "epoch": 0.000666095727471691, + "grad_norm": 3.172652006149292, + "learning_rate": 4.999994623584668e-05, + "loss": 8.6244, + "step": 112 + }, + { + "epoch": 0.0006720430107526882, + "grad_norm": 3.224888801574707, + "learning_rate": 4.999994526275993e-05, + "loss": 8.6823, + "step": 113 + }, + { + "epoch": 0.0006779902940336854, + "grad_norm": 3.53104305267334, + "learning_rate": 4.9999944280945964e-05, + "loss": 8.495, + "step": 114 + }, + { + "epoch": 0.0006839375773146826, + "grad_norm": 3.013505697250366, + "learning_rate": 4.999994329040477e-05, + "loss": 8.4807, + "step": 115 + }, + { + "epoch": 0.0006898848605956799, + "grad_norm": 4.4741339683532715, + "learning_rate": 4.999994229113636e-05, + "loss": 8.94, + "step": 116 + }, + { + "epoch": 0.0006958321438766771, + "grad_norm": 4.78712272644043, + "learning_rate": 4.999994128314072e-05, + "loss": 8.9367, + "step": 117 + }, + { + "epoch": 0.0007017794271576744, + "grad_norm": 3.6983933448791504, + "learning_rate": 4.999994026641787e-05, + "loss": 8.7524, + "step": 118 + }, + { + "epoch": 0.0007077267104386716, + "grad_norm": 3.74997615814209, + "learning_rate": 4.9999939240967784e-05, + "loss": 8.3417, + "step": 119 + }, + { + "epoch": 0.0007136739937196689, + "grad_norm": 3.614593982696533, + "learning_rate": 4.999993820679049e-05, + "loss": 8.4848, + "step": 120 + }, + { + "epoch": 0.000719621277000666, + "grad_norm": 2.903045654296875, + "learning_rate": 4.999993716388597e-05, + "loss": 8.5519, + "step": 121 + }, + { + "epoch": 0.0007255685602816633, + "grad_norm": 3.402444839477539, + "learning_rate": 4.999993611225423e-05, + "loss": 8.2905, + "step": 122 + }, + { + "epoch": 0.0007315158435626606, + "grad_norm": 3.663893938064575, + "learning_rate": 4.9999935051895274e-05, + "loss": 8.4842, + "step": 123 + }, + { + "epoch": 0.0007374631268436578, + "grad_norm": 3.7535622119903564, + "learning_rate": 4.99999339828091e-05, + "loss": 8.4766, + "step": 124 + }, + { + "epoch": 0.0007434104101246551, + "grad_norm": 3.1285574436187744, + "learning_rate": 4.99999329049957e-05, + "loss": 8.3716, + "step": 125 + }, + { + "epoch": 0.0007493576934056523, + "grad_norm": 3.648869752883911, + "learning_rate": 4.9999931818455086e-05, + "loss": 8.3413, + "step": 126 + }, + { + "epoch": 0.0007553049766866496, + "grad_norm": 3.253399133682251, + "learning_rate": 4.9999930723187255e-05, + "loss": 8.0412, + "step": 127 + }, + { + "epoch": 0.0007612522599676468, + "grad_norm": 3.5694124698638916, + "learning_rate": 4.999992961919221e-05, + "loss": 8.0895, + "step": 128 + }, + { + "epoch": 0.000767199543248644, + "grad_norm": 4.106658458709717, + "learning_rate": 4.999992850646994e-05, + "loss": 8.3654, + "step": 129 + }, + { + "epoch": 0.0007731468265296412, + "grad_norm": 4.082829475402832, + "learning_rate": 4.9999927385020455e-05, + "loss": 8.2663, + "step": 130 + }, + { + "epoch": 0.0007790941098106385, + "grad_norm": 4.349386215209961, + "learning_rate": 4.9999926254843753e-05, + "loss": 8.2435, + "step": 131 + }, + { + "epoch": 0.0007850413930916357, + "grad_norm": 3.375697135925293, + "learning_rate": 4.999992511593984e-05, + "loss": 8.0827, + "step": 132 + }, + { + "epoch": 0.000790988676372633, + "grad_norm": 3.2566957473754883, + "learning_rate": 4.999992396830871e-05, + "loss": 8.4891, + "step": 133 + }, + { + "epoch": 0.0007969359596536302, + "grad_norm": 3.791579008102417, + "learning_rate": 4.999992281195036e-05, + "loss": 8.1567, + "step": 134 + }, + { + "epoch": 0.0008028832429346275, + "grad_norm": 3.8741838932037354, + "learning_rate": 4.99999216468648e-05, + "loss": 8.4033, + "step": 135 + }, + { + "epoch": 0.0008088305262156248, + "grad_norm": 4.229452133178711, + "learning_rate": 4.999992047305203e-05, + "loss": 8.3897, + "step": 136 + }, + { + "epoch": 0.0008147778094966219, + "grad_norm": 3.2732088565826416, + "learning_rate": 4.9999919290512034e-05, + "loss": 8.1758, + "step": 137 + }, + { + "epoch": 0.0008207250927776192, + "grad_norm": 3.2048966884613037, + "learning_rate": 4.9999918099244836e-05, + "loss": 8.1459, + "step": 138 + }, + { + "epoch": 0.0008266723760586164, + "grad_norm": 3.8639938831329346, + "learning_rate": 4.999991689925042e-05, + "loss": 7.9437, + "step": 139 + }, + { + "epoch": 0.0008326196593396137, + "grad_norm": 3.297252655029297, + "learning_rate": 4.9999915690528794e-05, + "loss": 8.1751, + "step": 140 + }, + { + "epoch": 0.0008385669426206109, + "grad_norm": 3.878218173980713, + "learning_rate": 4.999991447307995e-05, + "loss": 8.0572, + "step": 141 + }, + { + "epoch": 0.0008445142259016082, + "grad_norm": 3.6870739459991455, + "learning_rate": 4.9999913246903895e-05, + "loss": 8.0958, + "step": 142 + }, + { + "epoch": 0.0008504615091826054, + "grad_norm": 3.1817922592163086, + "learning_rate": 4.9999912012000636e-05, + "loss": 8.2683, + "step": 143 + }, + { + "epoch": 0.0008564087924636027, + "grad_norm": 3.4008772373199463, + "learning_rate": 4.999991076837016e-05, + "loss": 8.4171, + "step": 144 + }, + { + "epoch": 0.0008623560757445998, + "grad_norm": 3.002333641052246, + "learning_rate": 4.999990951601247e-05, + "loss": 8.1149, + "step": 145 + }, + { + "epoch": 0.0008683033590255971, + "grad_norm": 3.51910662651062, + "learning_rate": 4.999990825492757e-05, + "loss": 8.5284, + "step": 146 + }, + { + "epoch": 0.0008742506423065943, + "grad_norm": 2.978875160217285, + "learning_rate": 4.999990698511548e-05, + "loss": 8.4855, + "step": 147 + }, + { + "epoch": 0.0008801979255875916, + "grad_norm": 3.4708774089813232, + "learning_rate": 4.999990570657616e-05, + "loss": 8.333, + "step": 148 + }, + { + "epoch": 0.0008861452088685888, + "grad_norm": 2.994084596633911, + "learning_rate": 4.999990441930963e-05, + "loss": 8.3456, + "step": 149 + }, + { + "epoch": 0.0008920924921495861, + "grad_norm": 3.1295697689056396, + "learning_rate": 4.99999031233159e-05, + "loss": 8.2204, + "step": 150 + }, + { + "epoch": 0.0008980397754305833, + "grad_norm": 3.349720001220703, + "learning_rate": 4.9999901818594966e-05, + "loss": 8.2739, + "step": 151 + }, + { + "epoch": 0.0009039870587115806, + "grad_norm": 3.852964401245117, + "learning_rate": 4.999990050514681e-05, + "loss": 8.4225, + "step": 152 + }, + { + "epoch": 0.0009099343419925777, + "grad_norm": 3.92203950881958, + "learning_rate": 4.9999899182971456e-05, + "loss": 8.2882, + "step": 153 + }, + { + "epoch": 0.000915881625273575, + "grad_norm": 3.9960269927978516, + "learning_rate": 4.99998978520689e-05, + "loss": 8.2091, + "step": 154 + }, + { + "epoch": 0.0009218289085545723, + "grad_norm": 3.952327251434326, + "learning_rate": 4.999989651243913e-05, + "loss": 8.1726, + "step": 155 + }, + { + "epoch": 0.0009277761918355695, + "grad_norm": 3.9594647884368896, + "learning_rate": 4.9999895164082156e-05, + "loss": 8.0241, + "step": 156 + }, + { + "epoch": 0.0009337234751165668, + "grad_norm": 3.1129961013793945, + "learning_rate": 4.999989380699798e-05, + "loss": 8.14, + "step": 157 + }, + { + "epoch": 0.000939670758397564, + "grad_norm": 4.7737860679626465, + "learning_rate": 4.9999892441186604e-05, + "loss": 7.869, + "step": 158 + }, + { + "epoch": 0.0009456180416785613, + "grad_norm": 3.351327657699585, + "learning_rate": 4.9999891066648006e-05, + "loss": 8.1831, + "step": 159 + }, + { + "epoch": 0.0009515653249595584, + "grad_norm": 3.0245375633239746, + "learning_rate": 4.999988968338222e-05, + "loss": 8.3871, + "step": 160 + }, + { + "epoch": 0.0009575126082405557, + "grad_norm": 4.766855716705322, + "learning_rate": 4.999988829138923e-05, + "loss": 8.0078, + "step": 161 + }, + { + "epoch": 0.0009634598915215529, + "grad_norm": 3.975804090499878, + "learning_rate": 4.999988689066903e-05, + "loss": 7.6923, + "step": 162 + }, + { + "epoch": 0.0009694071748025502, + "grad_norm": 4.024605751037598, + "learning_rate": 4.999988548122163e-05, + "loss": 8.2986, + "step": 163 + }, + { + "epoch": 0.0009753544580835474, + "grad_norm": 4.230019569396973, + "learning_rate": 4.999988406304703e-05, + "loss": 8.2903, + "step": 164 + }, + { + "epoch": 0.0009813017413645446, + "grad_norm": 3.972825050354004, + "learning_rate": 4.9999882636145236e-05, + "loss": 8.3589, + "step": 165 + }, + { + "epoch": 0.0009872490246455418, + "grad_norm": 3.6381688117980957, + "learning_rate": 4.999988120051623e-05, + "loss": 8.2648, + "step": 166 + }, + { + "epoch": 0.000993196307926539, + "grad_norm": 4.203462600708008, + "learning_rate": 4.9999879756160025e-05, + "loss": 8.363, + "step": 167 + }, + { + "epoch": 0.0009991435912075363, + "grad_norm": 2.944103479385376, + "learning_rate": 4.9999878303076624e-05, + "loss": 7.9752, + "step": 168 + }, + { + "epoch": 0.0010050908744885336, + "grad_norm": 3.4115283489227295, + "learning_rate": 4.9999876841266025e-05, + "loss": 8.1044, + "step": 169 + }, + { + "epoch": 0.0010110381577695309, + "grad_norm": 4.185582160949707, + "learning_rate": 4.999987537072822e-05, + "loss": 8.0347, + "step": 170 + }, + { + "epoch": 0.0010169854410505281, + "grad_norm": 3.333649158477783, + "learning_rate": 4.999987389146323e-05, + "loss": 8.0545, + "step": 171 + }, + { + "epoch": 0.0010229327243315254, + "grad_norm": 3.7702765464782715, + "learning_rate": 4.999987240347103e-05, + "loss": 7.8936, + "step": 172 + }, + { + "epoch": 0.0010288800076125226, + "grad_norm": 4.113167762756348, + "learning_rate": 4.9999870906751636e-05, + "loss": 7.9447, + "step": 173 + }, + { + "epoch": 0.0010348272908935199, + "grad_norm": 3.370821714401245, + "learning_rate": 4.999986940130505e-05, + "loss": 7.9745, + "step": 174 + }, + { + "epoch": 0.0010407745741745171, + "grad_norm": 3.552391767501831, + "learning_rate": 4.999986788713126e-05, + "loss": 7.8882, + "step": 175 + }, + { + "epoch": 0.0010467218574555144, + "grad_norm": 3.3497536182403564, + "learning_rate": 4.999986636423028e-05, + "loss": 7.8601, + "step": 176 + }, + { + "epoch": 0.0010526691407365116, + "grad_norm": 3.256685733795166, + "learning_rate": 4.9999864832602105e-05, + "loss": 7.8341, + "step": 177 + }, + { + "epoch": 0.001058616424017509, + "grad_norm": 3.028108835220337, + "learning_rate": 4.999986329224674e-05, + "loss": 7.884, + "step": 178 + }, + { + "epoch": 0.0010645637072985061, + "grad_norm": 2.9583778381347656, + "learning_rate": 4.9999861743164165e-05, + "loss": 7.7875, + "step": 179 + }, + { + "epoch": 0.0010705109905795032, + "grad_norm": 3.109215497970581, + "learning_rate": 4.999986018535441e-05, + "loss": 8.4081, + "step": 180 + }, + { + "epoch": 0.0010764582738605004, + "grad_norm": 3.8907759189605713, + "learning_rate": 4.999985861881746e-05, + "loss": 8.0971, + "step": 181 + }, + { + "epoch": 0.0010824055571414977, + "grad_norm": 4.20400857925415, + "learning_rate": 4.9999857043553314e-05, + "loss": 7.9077, + "step": 182 + }, + { + "epoch": 0.001088352840422495, + "grad_norm": 3.580486297607422, + "learning_rate": 4.999985545956198e-05, + "loss": 7.8935, + "step": 183 + }, + { + "epoch": 0.0010943001237034922, + "grad_norm": 3.3833847045898438, + "learning_rate": 4.999985386684345e-05, + "loss": 7.9956, + "step": 184 + }, + { + "epoch": 0.0011002474069844895, + "grad_norm": 2.8848624229431152, + "learning_rate": 4.9999852265397734e-05, + "loss": 8.0718, + "step": 185 + }, + { + "epoch": 0.0011061946902654867, + "grad_norm": 3.8933818340301514, + "learning_rate": 4.999985065522483e-05, + "loss": 8.0517, + "step": 186 + }, + { + "epoch": 0.001112141973546484, + "grad_norm": 3.6559605598449707, + "learning_rate": 4.999984903632473e-05, + "loss": 8.3664, + "step": 187 + }, + { + "epoch": 0.0011180892568274812, + "grad_norm": 3.4633536338806152, + "learning_rate": 4.999984740869744e-05, + "loss": 8.3481, + "step": 188 + }, + { + "epoch": 0.0011240365401084785, + "grad_norm": 3.483020305633545, + "learning_rate": 4.999984577234297e-05, + "loss": 8.3407, + "step": 189 + }, + { + "epoch": 0.0011299838233894757, + "grad_norm": 2.772434711456299, + "learning_rate": 4.999984412726131e-05, + "loss": 8.4524, + "step": 190 + }, + { + "epoch": 0.001135931106670473, + "grad_norm": 3.3341007232666016, + "learning_rate": 4.999984247345246e-05, + "loss": 8.1063, + "step": 191 + }, + { + "epoch": 0.0011418783899514702, + "grad_norm": 3.0063467025756836, + "learning_rate": 4.999984081091642e-05, + "loss": 8.0077, + "step": 192 + }, + { + "epoch": 0.0011478256732324675, + "grad_norm": 2.9670779705047607, + "learning_rate": 4.99998391396532e-05, + "loss": 8.2338, + "step": 193 + }, + { + "epoch": 0.0011537729565134647, + "grad_norm": 3.024505138397217, + "learning_rate": 4.999983745966279e-05, + "loss": 8.1794, + "step": 194 + }, + { + "epoch": 0.0011597202397944618, + "grad_norm": 2.834131956100464, + "learning_rate": 4.9999835770945195e-05, + "loss": 8.2078, + "step": 195 + }, + { + "epoch": 0.001165667523075459, + "grad_norm": 3.555525064468384, + "learning_rate": 4.999983407350042e-05, + "loss": 8.0838, + "step": 196 + }, + { + "epoch": 0.0011716148063564563, + "grad_norm": 3.5013587474823, + "learning_rate": 4.999983236732846e-05, + "loss": 8.092, + "step": 197 + }, + { + "epoch": 0.0011775620896374535, + "grad_norm": 3.3721518516540527, + "learning_rate": 4.9999830652429314e-05, + "loss": 8.1137, + "step": 198 + }, + { + "epoch": 0.0011835093729184508, + "grad_norm": 3.364952564239502, + "learning_rate": 4.9999828928802986e-05, + "loss": 8.1197, + "step": 199 + }, + { + "epoch": 0.001189456656199448, + "grad_norm": 3.691249132156372, + "learning_rate": 4.999982719644948e-05, + "loss": 8.0922, + "step": 200 + }, + { + "epoch": 0.0011954039394804453, + "grad_norm": 6.919185161590576, + "learning_rate": 4.9999825455368785e-05, + "loss": 7.9215, + "step": 201 + }, + { + "epoch": 0.0012013512227614426, + "grad_norm": 3.3332598209381104, + "learning_rate": 4.999982370556091e-05, + "loss": 7.7605, + "step": 202 + }, + { + "epoch": 0.0012072985060424398, + "grad_norm": 2.842517375946045, + "learning_rate": 4.999982194702586e-05, + "loss": 8.0527, + "step": 203 + }, + { + "epoch": 0.001213245789323437, + "grad_norm": 3.086371660232544, + "learning_rate": 4.999982017976364e-05, + "loss": 8.2637, + "step": 204 + }, + { + "epoch": 0.0012191930726044343, + "grad_norm": 3.0870208740234375, + "learning_rate": 4.999981840377422e-05, + "loss": 8.3538, + "step": 205 + }, + { + "epoch": 0.0012251403558854316, + "grad_norm": 3.1244094371795654, + "learning_rate": 4.9999816619057633e-05, + "loss": 8.4604, + "step": 206 + }, + { + "epoch": 0.0012310876391664288, + "grad_norm": 2.7808034420013428, + "learning_rate": 4.999981482561387e-05, + "loss": 8.3227, + "step": 207 + }, + { + "epoch": 0.001237034922447426, + "grad_norm": 2.791182518005371, + "learning_rate": 4.999981302344292e-05, + "loss": 8.1481, + "step": 208 + }, + { + "epoch": 0.0012429822057284233, + "grad_norm": 3.045971632003784, + "learning_rate": 4.99998112125448e-05, + "loss": 7.7842, + "step": 209 + }, + { + "epoch": 0.0012489294890094206, + "grad_norm": 3.2548067569732666, + "learning_rate": 4.99998093929195e-05, + "loss": 7.9935, + "step": 210 + }, + { + "epoch": 0.0012548767722904176, + "grad_norm": 3.5448713302612305, + "learning_rate": 4.999980756456704e-05, + "loss": 8.0323, + "step": 211 + }, + { + "epoch": 0.0012608240555714149, + "grad_norm": 3.717900514602661, + "learning_rate": 4.9999805727487395e-05, + "loss": 8.0532, + "step": 212 + }, + { + "epoch": 0.0012667713388524121, + "grad_norm": 3.2943921089172363, + "learning_rate": 4.9999803881680576e-05, + "loss": 8.0326, + "step": 213 + }, + { + "epoch": 0.0012727186221334094, + "grad_norm": 3.4586269855499268, + "learning_rate": 4.999980202714658e-05, + "loss": 7.8765, + "step": 214 + }, + { + "epoch": 0.0012786659054144067, + "grad_norm": 3.1898810863494873, + "learning_rate": 4.9999800163885414e-05, + "loss": 7.8859, + "step": 215 + }, + { + "epoch": 0.001284613188695404, + "grad_norm": 2.977229595184326, + "learning_rate": 4.9999798291897084e-05, + "loss": 7.8841, + "step": 216 + }, + { + "epoch": 0.0012905604719764012, + "grad_norm": 3.368680000305176, + "learning_rate": 4.999979641118157e-05, + "loss": 7.8055, + "step": 217 + }, + { + "epoch": 0.0012965077552573984, + "grad_norm": 4.295344352722168, + "learning_rate": 4.9999794521738894e-05, + "loss": 7.6456, + "step": 218 + }, + { + "epoch": 0.0013024550385383957, + "grad_norm": 3.985480546951294, + "learning_rate": 4.999979262356904e-05, + "loss": 7.6987, + "step": 219 + }, + { + "epoch": 0.001308402321819393, + "grad_norm": 3.8719842433929443, + "learning_rate": 4.999979071667202e-05, + "loss": 7.6994, + "step": 220 + }, + { + "epoch": 0.0013143496051003902, + "grad_norm": 4.699835300445557, + "learning_rate": 4.999978880104784e-05, + "loss": 8.1815, + "step": 221 + }, + { + "epoch": 0.0013202968883813874, + "grad_norm": 3.9221127033233643, + "learning_rate": 4.9999786876696485e-05, + "loss": 7.8765, + "step": 222 + }, + { + "epoch": 0.0013262441716623847, + "grad_norm": 4.4223504066467285, + "learning_rate": 4.9999784943617964e-05, + "loss": 7.7244, + "step": 223 + }, + { + "epoch": 0.001332191454943382, + "grad_norm": 3.4598348140716553, + "learning_rate": 4.999978300181227e-05, + "loss": 7.7072, + "step": 224 + }, + { + "epoch": 0.0013381387382243792, + "grad_norm": 3.536752223968506, + "learning_rate": 4.999978105127941e-05, + "loss": 7.6337, + "step": 225 + }, + { + "epoch": 0.0013440860215053765, + "grad_norm": 3.6432204246520996, + "learning_rate": 4.99997790920194e-05, + "loss": 7.8078, + "step": 226 + }, + { + "epoch": 0.0013500333047863735, + "grad_norm": 4.8305768966674805, + "learning_rate": 4.999977712403221e-05, + "loss": 7.9003, + "step": 227 + }, + { + "epoch": 0.0013559805880673707, + "grad_norm": 3.773876428604126, + "learning_rate": 4.999977514731786e-05, + "loss": 8.0513, + "step": 228 + }, + { + "epoch": 0.001361927871348368, + "grad_norm": 4.465645790100098, + "learning_rate": 4.999977316187635e-05, + "loss": 7.9847, + "step": 229 + }, + { + "epoch": 0.0013678751546293653, + "grad_norm": 3.9466493129730225, + "learning_rate": 4.9999771167707674e-05, + "loss": 7.9902, + "step": 230 + }, + { + "epoch": 0.0013738224379103625, + "grad_norm": 4.432138919830322, + "learning_rate": 4.9999769164811846e-05, + "loss": 7.8929, + "step": 231 + }, + { + "epoch": 0.0013797697211913598, + "grad_norm": 3.5211949348449707, + "learning_rate": 4.999976715318885e-05, + "loss": 8.1838, + "step": 232 + }, + { + "epoch": 0.001385717004472357, + "grad_norm": 3.0819287300109863, + "learning_rate": 4.9999765132838686e-05, + "loss": 8.2823, + "step": 233 + }, + { + "epoch": 0.0013916642877533543, + "grad_norm": 3.436112880706787, + "learning_rate": 4.9999763103761374e-05, + "loss": 7.7796, + "step": 234 + }, + { + "epoch": 0.0013976115710343515, + "grad_norm": 3.6699061393737793, + "learning_rate": 4.99997610659569e-05, + "loss": 7.5792, + "step": 235 + }, + { + "epoch": 0.0014035588543153488, + "grad_norm": 3.814182758331299, + "learning_rate": 4.999975901942526e-05, + "loss": 7.5631, + "step": 236 + }, + { + "epoch": 0.001409506137596346, + "grad_norm": 3.84110164642334, + "learning_rate": 4.9999756964166465e-05, + "loss": 7.4244, + "step": 237 + }, + { + "epoch": 0.0014154534208773433, + "grad_norm": 3.278045415878296, + "learning_rate": 4.999975490018052e-05, + "loss": 7.9049, + "step": 238 + }, + { + "epoch": 0.0014214007041583405, + "grad_norm": 3.5502712726593018, + "learning_rate": 4.999975282746742e-05, + "loss": 8.0021, + "step": 239 + }, + { + "epoch": 0.0014273479874393378, + "grad_norm": 2.7919108867645264, + "learning_rate": 4.9999750746027153e-05, + "loss": 8.2854, + "step": 240 + }, + { + "epoch": 0.001433295270720335, + "grad_norm": 3.1689581871032715, + "learning_rate": 4.999974865585973e-05, + "loss": 8.3177, + "step": 241 + }, + { + "epoch": 0.001439242554001332, + "grad_norm": 2.728679656982422, + "learning_rate": 4.999974655696517e-05, + "loss": 8.3181, + "step": 242 + }, + { + "epoch": 0.0014451898372823293, + "grad_norm": 3.5175108909606934, + "learning_rate": 4.9999744449343445e-05, + "loss": 8.03, + "step": 243 + }, + { + "epoch": 0.0014511371205633266, + "grad_norm": 3.714219808578491, + "learning_rate": 4.999974233299457e-05, + "loss": 8.0824, + "step": 244 + }, + { + "epoch": 0.0014570844038443239, + "grad_norm": 3.42090106010437, + "learning_rate": 4.9999740207918546e-05, + "loss": 8.0455, + "step": 245 + }, + { + "epoch": 0.001463031687125321, + "grad_norm": 3.035047769546509, + "learning_rate": 4.999973807411537e-05, + "loss": 8.0117, + "step": 246 + }, + { + "epoch": 0.0014689789704063184, + "grad_norm": 3.4878122806549072, + "learning_rate": 4.9999735931585034e-05, + "loss": 8.1368, + "step": 247 + }, + { + "epoch": 0.0014749262536873156, + "grad_norm": 3.648115873336792, + "learning_rate": 4.999973378032756e-05, + "loss": 7.9987, + "step": 248 + }, + { + "epoch": 0.0014808735369683129, + "grad_norm": 3.171255588531494, + "learning_rate": 4.9999731620342936e-05, + "loss": 7.9733, + "step": 249 + }, + { + "epoch": 0.0014868208202493101, + "grad_norm": 3.157804250717163, + "learning_rate": 4.999972945163116e-05, + "loss": 7.8511, + "step": 250 + }, + { + "epoch": 0.0014927681035303074, + "grad_norm": 3.4346978664398193, + "learning_rate": 4.999972727419224e-05, + "loss": 7.9075, + "step": 251 + }, + { + "epoch": 0.0014987153868113046, + "grad_norm": 3.281135082244873, + "learning_rate": 4.9999725088026175e-05, + "loss": 7.876, + "step": 252 + }, + { + "epoch": 0.0015046626700923019, + "grad_norm": 3.1481714248657227, + "learning_rate": 4.9999722893132954e-05, + "loss": 8.1458, + "step": 253 + }, + { + "epoch": 0.0015106099533732991, + "grad_norm": 2.821460247039795, + "learning_rate": 4.99997206895126e-05, + "loss": 7.9141, + "step": 254 + }, + { + "epoch": 0.0015165572366542964, + "grad_norm": 2.887997627258301, + "learning_rate": 4.999971847716509e-05, + "loss": 8.2246, + "step": 255 + }, + { + "epoch": 0.0015225045199352936, + "grad_norm": 2.8097078800201416, + "learning_rate": 4.999971625609044e-05, + "loss": 7.8576, + "step": 256 + }, + { + "epoch": 0.001528451803216291, + "grad_norm": 2.9272890090942383, + "learning_rate": 4.999971402628866e-05, + "loss": 7.6856, + "step": 257 + }, + { + "epoch": 0.001534399086497288, + "grad_norm": 3.487027168273926, + "learning_rate": 4.999971178775973e-05, + "loss": 7.8179, + "step": 258 + }, + { + "epoch": 0.0015403463697782852, + "grad_norm": 3.575681209564209, + "learning_rate": 4.9999709540503656e-05, + "loss": 7.8115, + "step": 259 + }, + { + "epoch": 0.0015462936530592824, + "grad_norm": 3.457756757736206, + "learning_rate": 4.9999707284520435e-05, + "loss": 7.7985, + "step": 260 + }, + { + "epoch": 0.0015522409363402797, + "grad_norm": 3.732728958129883, + "learning_rate": 4.999970501981009e-05, + "loss": 7.8369, + "step": 261 + }, + { + "epoch": 0.001558188219621277, + "grad_norm": 4.1466898918151855, + "learning_rate": 4.99997027463726e-05, + "loss": 8.2435, + "step": 262 + }, + { + "epoch": 0.0015641355029022742, + "grad_norm": 4.028534889221191, + "learning_rate": 4.9999700464207965e-05, + "loss": 8.2338, + "step": 263 + }, + { + "epoch": 0.0015700827861832715, + "grad_norm": 3.7445273399353027, + "learning_rate": 4.99996981733162e-05, + "loss": 8.1182, + "step": 264 + }, + { + "epoch": 0.0015760300694642687, + "grad_norm": 3.455228567123413, + "learning_rate": 4.99996958736973e-05, + "loss": 8.1932, + "step": 265 + }, + { + "epoch": 0.001581977352745266, + "grad_norm": 3.1530332565307617, + "learning_rate": 4.9999693565351256e-05, + "loss": 7.8304, + "step": 266 + }, + { + "epoch": 0.0015879246360262632, + "grad_norm": 3.113161325454712, + "learning_rate": 4.999969124827809e-05, + "loss": 7.6625, + "step": 267 + }, + { + "epoch": 0.0015938719193072605, + "grad_norm": 3.621076822280884, + "learning_rate": 4.999968892247778e-05, + "loss": 8.0983, + "step": 268 + }, + { + "epoch": 0.0015998192025882577, + "grad_norm": 3.533395767211914, + "learning_rate": 4.9999686587950346e-05, + "loss": 7.9564, + "step": 269 + }, + { + "epoch": 0.001605766485869255, + "grad_norm": 3.6486849784851074, + "learning_rate": 4.999968424469577e-05, + "loss": 7.9864, + "step": 270 + }, + { + "epoch": 0.0016117137691502522, + "grad_norm": 3.223167657852173, + "learning_rate": 4.999968189271407e-05, + "loss": 7.8516, + "step": 271 + }, + { + "epoch": 0.0016176610524312495, + "grad_norm": 3.282062530517578, + "learning_rate": 4.999967953200523e-05, + "loss": 7.9247, + "step": 272 + }, + { + "epoch": 0.0016236083357122465, + "grad_norm": 2.8589930534362793, + "learning_rate": 4.999967716256927e-05, + "loss": 7.8871, + "step": 273 + }, + { + "epoch": 0.0016295556189932438, + "grad_norm": 3.136882781982422, + "learning_rate": 4.9999674784406174e-05, + "loss": 7.8793, + "step": 274 + }, + { + "epoch": 0.001635502902274241, + "grad_norm": 3.9103915691375732, + "learning_rate": 4.999967239751595e-05, + "loss": 7.9005, + "step": 275 + }, + { + "epoch": 0.0016414501855552383, + "grad_norm": 4.40267276763916, + "learning_rate": 4.99996700018986e-05, + "loss": 7.9247, + "step": 276 + }, + { + "epoch": 0.0016473974688362356, + "grad_norm": 3.6620242595672607, + "learning_rate": 4.9999667597554136e-05, + "loss": 8.0719, + "step": 277 + }, + { + "epoch": 0.0016533447521172328, + "grad_norm": 3.1278858184814453, + "learning_rate": 4.999966518448253e-05, + "loss": 8.0822, + "step": 278 + }, + { + "epoch": 0.00165929203539823, + "grad_norm": 3.321831464767456, + "learning_rate": 4.9999662762683805e-05, + "loss": 8.1266, + "step": 279 + }, + { + "epoch": 0.0016652393186792273, + "grad_norm": 3.4116811752319336, + "learning_rate": 4.999966033215795e-05, + "loss": 8.2159, + "step": 280 + }, + { + "epoch": 0.0016711866019602246, + "grad_norm": 3.58381724357605, + "learning_rate": 4.999965789290498e-05, + "loss": 8.0275, + "step": 281 + }, + { + "epoch": 0.0016771338852412218, + "grad_norm": 3.0357518196105957, + "learning_rate": 4.9999655444924884e-05, + "loss": 8.1171, + "step": 282 + }, + { + "epoch": 0.001683081168522219, + "grad_norm": 3.237764596939087, + "learning_rate": 4.999965298821767e-05, + "loss": 7.822, + "step": 283 + }, + { + "epoch": 0.0016890284518032163, + "grad_norm": 3.0861873626708984, + "learning_rate": 4.999965052278334e-05, + "loss": 7.7991, + "step": 284 + }, + { + "epoch": 0.0016949757350842136, + "grad_norm": 2.8045542240142822, + "learning_rate": 4.999964804862187e-05, + "loss": 7.9659, + "step": 285 + }, + { + "epoch": 0.0017009230183652108, + "grad_norm": 3.1282641887664795, + "learning_rate": 4.9999645565733297e-05, + "loss": 7.8354, + "step": 286 + }, + { + "epoch": 0.001706870301646208, + "grad_norm": 2.980001211166382, + "learning_rate": 4.999964307411761e-05, + "loss": 7.806, + "step": 287 + }, + { + "epoch": 0.0017128175849272054, + "grad_norm": 3.114238977432251, + "learning_rate": 4.99996405737748e-05, + "loss": 7.6173, + "step": 288 + }, + { + "epoch": 0.0017187648682082024, + "grad_norm": 2.6732640266418457, + "learning_rate": 4.9999638064704866e-05, + "loss": 7.5944, + "step": 289 + }, + { + "epoch": 0.0017247121514891996, + "grad_norm": 3.2139906883239746, + "learning_rate": 4.999963554690783e-05, + "loss": 7.5738, + "step": 290 + }, + { + "epoch": 0.001730659434770197, + "grad_norm": 3.0964555740356445, + "learning_rate": 4.999963302038368e-05, + "loss": 7.4431, + "step": 291 + }, + { + "epoch": 0.0017366067180511942, + "grad_norm": 3.0611374378204346, + "learning_rate": 4.99996304851324e-05, + "loss": 7.3748, + "step": 292 + }, + { + "epoch": 0.0017425540013321914, + "grad_norm": 2.88114333152771, + "learning_rate": 4.999962794115402e-05, + "loss": 7.3554, + "step": 293 + }, + { + "epoch": 0.0017485012846131887, + "grad_norm": 2.895141363143921, + "learning_rate": 4.999962538844852e-05, + "loss": 7.2801, + "step": 294 + }, + { + "epoch": 0.001754448567894186, + "grad_norm": 3.0645008087158203, + "learning_rate": 4.9999622827015914e-05, + "loss": 7.1753, + "step": 295 + }, + { + "epoch": 0.0017603958511751832, + "grad_norm": 3.0750465393066406, + "learning_rate": 4.99996202568562e-05, + "loss": 7.1905, + "step": 296 + }, + { + "epoch": 0.0017663431344561804, + "grad_norm": 3.1322436332702637, + "learning_rate": 4.9999617677969374e-05, + "loss": 7.0851, + "step": 297 + }, + { + "epoch": 0.0017722904177371777, + "grad_norm": 3.8287153244018555, + "learning_rate": 4.999961509035544e-05, + "loss": 7.0842, + "step": 298 + }, + { + "epoch": 0.001778237701018175, + "grad_norm": 2.874312162399292, + "learning_rate": 4.9999612494014403e-05, + "loss": 6.9588, + "step": 299 + }, + { + "epoch": 0.0017841849842991722, + "grad_norm": 2.916250705718994, + "learning_rate": 4.999960988894625e-05, + "loss": 7.1342, + "step": 300 + }, + { + "epoch": 0.0017901322675801694, + "grad_norm": 2.71624755859375, + "learning_rate": 4.9999607275151e-05, + "loss": 7.0418, + "step": 301 + }, + { + "epoch": 0.0017960795508611667, + "grad_norm": 2.655630350112915, + "learning_rate": 4.999960465262864e-05, + "loss": 6.937, + "step": 302 + }, + { + "epoch": 0.001802026834142164, + "grad_norm": 2.8819122314453125, + "learning_rate": 4.999960202137918e-05, + "loss": 7.0116, + "step": 303 + }, + { + "epoch": 0.0018079741174231612, + "grad_norm": 2.909701108932495, + "learning_rate": 4.999959938140262e-05, + "loss": 6.9588, + "step": 304 + }, + { + "epoch": 0.0018139214007041582, + "grad_norm": 3.276395797729492, + "learning_rate": 4.999959673269895e-05, + "loss": 6.9066, + "step": 305 + }, + { + "epoch": 0.0018198686839851555, + "grad_norm": 2.8774867057800293, + "learning_rate": 4.9999594075268186e-05, + "loss": 7.0112, + "step": 306 + }, + { + "epoch": 0.0018258159672661528, + "grad_norm": 2.9667818546295166, + "learning_rate": 4.999959140911032e-05, + "loss": 7.1467, + "step": 307 + }, + { + "epoch": 0.00183176325054715, + "grad_norm": 6.6612958908081055, + "learning_rate": 4.999958873422536e-05, + "loss": 8.4457, + "step": 308 + }, + { + "epoch": 0.0018377105338281473, + "grad_norm": 4.234557628631592, + "learning_rate": 4.999958605061329e-05, + "loss": 8.904, + "step": 309 + }, + { + "epoch": 0.0018436578171091445, + "grad_norm": 4.049502372741699, + "learning_rate": 4.999958335827413e-05, + "loss": 7.5174, + "step": 310 + }, + { + "epoch": 0.0018496051003901418, + "grad_norm": 3.574474334716797, + "learning_rate": 4.999958065720787e-05, + "loss": 8.6537, + "step": 311 + }, + { + "epoch": 0.001855552383671139, + "grad_norm": 3.6154026985168457, + "learning_rate": 4.9999577947414515e-05, + "loss": 8.5833, + "step": 312 + }, + { + "epoch": 0.0018614996669521363, + "grad_norm": 2.9204158782958984, + "learning_rate": 4.999957522889407e-05, + "loss": 8.5486, + "step": 313 + }, + { + "epoch": 0.0018674469502331335, + "grad_norm": 3.095310688018799, + "learning_rate": 4.999957250164653e-05, + "loss": 8.3855, + "step": 314 + }, + { + "epoch": 0.0018733942335141308, + "grad_norm": 3.872267723083496, + "learning_rate": 4.999956976567189e-05, + "loss": 8.2715, + "step": 315 + }, + { + "epoch": 0.001879341516795128, + "grad_norm": 3.5560686588287354, + "learning_rate": 4.9999567020970175e-05, + "loss": 8.1571, + "step": 316 + }, + { + "epoch": 0.0018852888000761253, + "grad_norm": 2.6759164333343506, + "learning_rate": 4.9999564267541356e-05, + "loss": 8.4072, + "step": 317 + }, + { + "epoch": 0.0018912360833571226, + "grad_norm": 4.034712791442871, + "learning_rate": 4.999956150538545e-05, + "loss": 7.7622, + "step": 318 + }, + { + "epoch": 0.0018971833666381198, + "grad_norm": 3.8927831649780273, + "learning_rate": 4.999955873450246e-05, + "loss": 7.5012, + "step": 319 + }, + { + "epoch": 0.0019031306499191168, + "grad_norm": 3.4422812461853027, + "learning_rate": 4.999955595489237e-05, + "loss": 7.6894, + "step": 320 + }, + { + "epoch": 0.001909077933200114, + "grad_norm": 3.0367283821105957, + "learning_rate": 4.999955316655521e-05, + "loss": 7.8151, + "step": 321 + }, + { + "epoch": 0.0019150252164811114, + "grad_norm": 3.7553489208221436, + "learning_rate": 4.9999550369490955e-05, + "loss": 8.0462, + "step": 322 + }, + { + "epoch": 0.0019209724997621086, + "grad_norm": 3.432591438293457, + "learning_rate": 4.999954756369962e-05, + "loss": 7.8782, + "step": 323 + }, + { + "epoch": 0.0019269197830431059, + "grad_norm": 2.7325966358184814, + "learning_rate": 4.9999544749181196e-05, + "loss": 7.9045, + "step": 324 + }, + { + "epoch": 0.0019328670663241031, + "grad_norm": 4.31963586807251, + "learning_rate": 4.9999541925935686e-05, + "loss": 7.7791, + "step": 325 + }, + { + "epoch": 0.0019388143496051004, + "grad_norm": 2.840189218521118, + "learning_rate": 4.999953909396311e-05, + "loss": 7.8334, + "step": 326 + }, + { + "epoch": 0.0019447616328860976, + "grad_norm": 3.2388041019439697, + "learning_rate": 4.9999536253263434e-05, + "loss": 7.6756, + "step": 327 + }, + { + "epoch": 0.0019507089161670949, + "grad_norm": 3.6291563510894775, + "learning_rate": 4.999953340383669e-05, + "loss": 7.6511, + "step": 328 + }, + { + "epoch": 0.001956656199448092, + "grad_norm": 3.35703706741333, + "learning_rate": 4.999953054568287e-05, + "loss": 7.6382, + "step": 329 + }, + { + "epoch": 0.001962603482729089, + "grad_norm": 3.117281198501587, + "learning_rate": 4.999952767880196e-05, + "loss": 7.6233, + "step": 330 + }, + { + "epoch": 0.0019685507660100864, + "grad_norm": 2.8385257720947266, + "learning_rate": 4.999952480319398e-05, + "loss": 7.6594, + "step": 331 + }, + { + "epoch": 0.0019744980492910837, + "grad_norm": 2.5914418697357178, + "learning_rate": 4.999952191885893e-05, + "loss": 8.2647, + "step": 332 + }, + { + "epoch": 0.001980445332572081, + "grad_norm": 2.5847742557525635, + "learning_rate": 4.9999519025796795e-05, + "loss": 8.339, + "step": 333 + }, + { + "epoch": 0.001986392615853078, + "grad_norm": 2.7022132873535156, + "learning_rate": 4.999951612400759e-05, + "loss": 7.9114, + "step": 334 + }, + { + "epoch": 0.0019923398991340754, + "grad_norm": 3.0290884971618652, + "learning_rate": 4.999951321349131e-05, + "loss": 7.4531, + "step": 335 + }, + { + "epoch": 0.0019982871824150727, + "grad_norm": 2.8910324573516846, + "learning_rate": 4.999951029424796e-05, + "loss": 7.398, + "step": 336 + }, + { + "epoch": 0.00200423446569607, + "grad_norm": 2.8917605876922607, + "learning_rate": 4.9999507366277545e-05, + "loss": 7.48, + "step": 337 + }, + { + "epoch": 0.002010181748977067, + "grad_norm": 2.8957982063293457, + "learning_rate": 4.999950442958005e-05, + "loss": 7.8662, + "step": 338 + }, + { + "epoch": 0.0020161290322580645, + "grad_norm": 3.562232255935669, + "learning_rate": 4.9999501484155485e-05, + "loss": 7.8388, + "step": 339 + }, + { + "epoch": 0.0020220763155390617, + "grad_norm": 2.51676607131958, + "learning_rate": 4.9999498530003866e-05, + "loss": 8.2834, + "step": 340 + }, + { + "epoch": 0.002028023598820059, + "grad_norm": 2.326110363006592, + "learning_rate": 4.999949556712517e-05, + "loss": 8.2528, + "step": 341 + }, + { + "epoch": 0.0020339708821010562, + "grad_norm": 2.7621335983276367, + "learning_rate": 4.999949259551941e-05, + "loss": 7.9791, + "step": 342 + }, + { + "epoch": 0.0020399181653820535, + "grad_norm": 3.045431137084961, + "learning_rate": 4.999948961518659e-05, + "loss": 7.8575, + "step": 343 + }, + { + "epoch": 0.0020458654486630507, + "grad_norm": 3.1940131187438965, + "learning_rate": 4.9999486626126703e-05, + "loss": 7.8581, + "step": 344 + }, + { + "epoch": 0.002051812731944048, + "grad_norm": 2.964136838912964, + "learning_rate": 4.999948362833975e-05, + "loss": 7.9656, + "step": 345 + }, + { + "epoch": 0.0020577600152250452, + "grad_norm": 3.167573928833008, + "learning_rate": 4.999948062182574e-05, + "loss": 7.7448, + "step": 346 + }, + { + "epoch": 0.0020637072985060425, + "grad_norm": 3.062666177749634, + "learning_rate": 4.9999477606584666e-05, + "loss": 7.7655, + "step": 347 + }, + { + "epoch": 0.0020696545817870397, + "grad_norm": 3.1097402572631836, + "learning_rate": 4.999947458261653e-05, + "loss": 7.643, + "step": 348 + }, + { + "epoch": 0.002075601865068037, + "grad_norm": 3.1663928031921387, + "learning_rate": 4.999947154992135e-05, + "loss": 7.8348, + "step": 349 + }, + { + "epoch": 0.0020815491483490343, + "grad_norm": 2.8295886516571045, + "learning_rate": 4.99994685084991e-05, + "loss": 7.7752, + "step": 350 + }, + { + "epoch": 0.0020874964316300315, + "grad_norm": 2.7384233474731445, + "learning_rate": 4.99994654583498e-05, + "loss": 7.7644, + "step": 351 + }, + { + "epoch": 0.0020934437149110288, + "grad_norm": 2.6654486656188965, + "learning_rate": 4.999946239947344e-05, + "loss": 7.7489, + "step": 352 + }, + { + "epoch": 0.002099390998192026, + "grad_norm": 2.8949942588806152, + "learning_rate": 4.999945933187003e-05, + "loss": 7.7105, + "step": 353 + }, + { + "epoch": 0.0021053382814730233, + "grad_norm": 2.590036630630493, + "learning_rate": 4.999945625553957e-05, + "loss": 7.6821, + "step": 354 + }, + { + "epoch": 0.0021112855647540205, + "grad_norm": 3.4601457118988037, + "learning_rate": 4.999945317048205e-05, + "loss": 7.3552, + "step": 355 + }, + { + "epoch": 0.002117232848035018, + "grad_norm": 4.022705078125, + "learning_rate": 4.999945007669748e-05, + "loss": 7.0281, + "step": 356 + }, + { + "epoch": 0.002123180131316015, + "grad_norm": 3.249699592590332, + "learning_rate": 4.999944697418587e-05, + "loss": 7.9279, + "step": 357 + }, + { + "epoch": 0.0021291274145970123, + "grad_norm": 2.8424601554870605, + "learning_rate": 4.99994438629472e-05, + "loss": 8.1485, + "step": 358 + }, + { + "epoch": 0.002135074697878009, + "grad_norm": 3.0473172664642334, + "learning_rate": 4.9999440742981486e-05, + "loss": 8.0877, + "step": 359 + }, + { + "epoch": 0.0021410219811590064, + "grad_norm": 3.0614171028137207, + "learning_rate": 4.9999437614288726e-05, + "loss": 7.7817, + "step": 360 + }, + { + "epoch": 0.0021469692644400036, + "grad_norm": 3.309464931488037, + "learning_rate": 4.9999434476868925e-05, + "loss": 7.857, + "step": 361 + }, + { + "epoch": 0.002152916547721001, + "grad_norm": 3.031921148300171, + "learning_rate": 4.999943133072207e-05, + "loss": 7.6393, + "step": 362 + }, + { + "epoch": 0.002158863831001998, + "grad_norm": 3.3756978511810303, + "learning_rate": 4.999942817584818e-05, + "loss": 7.7422, + "step": 363 + }, + { + "epoch": 0.0021648111142829954, + "grad_norm": 3.53362774848938, + "learning_rate": 4.999942501224724e-05, + "loss": 7.9388, + "step": 364 + }, + { + "epoch": 0.0021707583975639926, + "grad_norm": 3.4082882404327393, + "learning_rate": 4.999942183991927e-05, + "loss": 7.3578, + "step": 365 + }, + { + "epoch": 0.00217670568084499, + "grad_norm": 4.035211086273193, + "learning_rate": 4.999941865886425e-05, + "loss": 7.7833, + "step": 366 + }, + { + "epoch": 0.002182652964125987, + "grad_norm": 3.0394630432128906, + "learning_rate": 4.99994154690822e-05, + "loss": 7.9392, + "step": 367 + }, + { + "epoch": 0.0021886002474069844, + "grad_norm": 3.088926076889038, + "learning_rate": 4.99994122705731e-05, + "loss": 7.8149, + "step": 368 + }, + { + "epoch": 0.0021945475306879817, + "grad_norm": 2.3173277378082275, + "learning_rate": 4.9999409063336976e-05, + "loss": 8.2211, + "step": 369 + }, + { + "epoch": 0.002200494813968979, + "grad_norm": 2.9960854053497314, + "learning_rate": 4.9999405847373815e-05, + "loss": 7.6764, + "step": 370 + }, + { + "epoch": 0.002206442097249976, + "grad_norm": 2.841848134994507, + "learning_rate": 4.999940262268361e-05, + "loss": 7.9418, + "step": 371 + }, + { + "epoch": 0.0022123893805309734, + "grad_norm": 3.748779058456421, + "learning_rate": 4.999939938926638e-05, + "loss": 7.7843, + "step": 372 + }, + { + "epoch": 0.0022183366638119707, + "grad_norm": 2.8345019817352295, + "learning_rate": 4.999939614712212e-05, + "loss": 7.592, + "step": 373 + }, + { + "epoch": 0.002224283947092968, + "grad_norm": 3.12503719329834, + "learning_rate": 4.9999392896250826e-05, + "loss": 7.9543, + "step": 374 + }, + { + "epoch": 0.002230231230373965, + "grad_norm": 2.7812912464141846, + "learning_rate": 4.99993896366525e-05, + "loss": 7.8738, + "step": 375 + }, + { + "epoch": 0.0022361785136549624, + "grad_norm": 2.9477410316467285, + "learning_rate": 4.9999386368327144e-05, + "loss": 7.7738, + "step": 376 + }, + { + "epoch": 0.0022421257969359597, + "grad_norm": 2.305204391479492, + "learning_rate": 4.999938309127477e-05, + "loss": 7.9123, + "step": 377 + }, + { + "epoch": 0.002248073080216957, + "grad_norm": 3.3839781284332275, + "learning_rate": 4.999937980549536e-05, + "loss": 7.8542, + "step": 378 + }, + { + "epoch": 0.002254020363497954, + "grad_norm": 3.6973462104797363, + "learning_rate": 4.9999376510988924e-05, + "loss": 7.6953, + "step": 379 + }, + { + "epoch": 0.0022599676467789515, + "grad_norm": 3.8176333904266357, + "learning_rate": 4.999937320775547e-05, + "loss": 7.6548, + "step": 380 + }, + { + "epoch": 0.0022659149300599487, + "grad_norm": 3.0237386226654053, + "learning_rate": 4.999936989579499e-05, + "loss": 7.7843, + "step": 381 + }, + { + "epoch": 0.002271862213340946, + "grad_norm": 2.699695348739624, + "learning_rate": 4.999936657510749e-05, + "loss": 7.8841, + "step": 382 + }, + { + "epoch": 0.0022778094966219432, + "grad_norm": 3.7468206882476807, + "learning_rate": 4.9999363245692965e-05, + "loss": 7.8069, + "step": 383 + }, + { + "epoch": 0.0022837567799029405, + "grad_norm": 3.1074821949005127, + "learning_rate": 4.999935990755142e-05, + "loss": 7.8392, + "step": 384 + }, + { + "epoch": 0.0022897040631839377, + "grad_norm": 2.420884609222412, + "learning_rate": 4.999935656068287e-05, + "loss": 7.9238, + "step": 385 + }, + { + "epoch": 0.002295651346464935, + "grad_norm": 3.1354825496673584, + "learning_rate": 4.9999353205087296e-05, + "loss": 7.9766, + "step": 386 + }, + { + "epoch": 0.0023015986297459322, + "grad_norm": 2.7911901473999023, + "learning_rate": 4.9999349840764695e-05, + "loss": 7.9118, + "step": 387 + }, + { + "epoch": 0.0023075459130269295, + "grad_norm": 2.59529447555542, + "learning_rate": 4.999934646771509e-05, + "loss": 7.8839, + "step": 388 + }, + { + "epoch": 0.0023134931963079267, + "grad_norm": 4.121276378631592, + "learning_rate": 4.999934308593848e-05, + "loss": 7.8406, + "step": 389 + }, + { + "epoch": 0.0023194404795889236, + "grad_norm": 2.9091265201568604, + "learning_rate": 4.999933969543485e-05, + "loss": 7.86, + "step": 390 + }, + { + "epoch": 0.002325387762869921, + "grad_norm": 3.0700483322143555, + "learning_rate": 4.9999336296204195e-05, + "loss": 7.8214, + "step": 391 + }, + { + "epoch": 0.002331335046150918, + "grad_norm": 3.3008790016174316, + "learning_rate": 4.999933288824654e-05, + "loss": 7.5863, + "step": 392 + }, + { + "epoch": 0.0023372823294319153, + "grad_norm": 3.1414108276367188, + "learning_rate": 4.999932947156188e-05, + "loss": 7.5815, + "step": 393 + }, + { + "epoch": 0.0023432296127129126, + "grad_norm": 2.6881701946258545, + "learning_rate": 4.999932604615021e-05, + "loss": 7.959, + "step": 394 + }, + { + "epoch": 0.00234917689599391, + "grad_norm": 2.45609712600708, + "learning_rate": 4.9999322612011534e-05, + "loss": 7.9668, + "step": 395 + }, + { + "epoch": 0.002355124179274907, + "grad_norm": 3.1126747131347656, + "learning_rate": 4.999931916914585e-05, + "loss": 7.774, + "step": 396 + }, + { + "epoch": 0.0023610714625559043, + "grad_norm": 2.806708574295044, + "learning_rate": 4.999931571755316e-05, + "loss": 7.6297, + "step": 397 + }, + { + "epoch": 0.0023670187458369016, + "grad_norm": 3.220013380050659, + "learning_rate": 4.999931225723348e-05, + "loss": 7.3856, + "step": 398 + }, + { + "epoch": 0.002372966029117899, + "grad_norm": 3.0159943103790283, + "learning_rate": 4.9999308788186786e-05, + "loss": 7.3822, + "step": 399 + }, + { + "epoch": 0.002378913312398896, + "grad_norm": 3.1066205501556396, + "learning_rate": 4.9999305310413094e-05, + "loss": 7.3905, + "step": 400 + }, + { + "epoch": 0.0023848605956798934, + "grad_norm": 2.8004367351531982, + "learning_rate": 4.99993018239124e-05, + "loss": 7.8548, + "step": 401 + }, + { + "epoch": 0.0023908078789608906, + "grad_norm": 3.004378318786621, + "learning_rate": 4.999929832868471e-05, + "loss": 7.7846, + "step": 402 + }, + { + "epoch": 0.002396755162241888, + "grad_norm": 3.42901349067688, + "learning_rate": 4.9999294824730025e-05, + "loss": 7.9188, + "step": 403 + }, + { + "epoch": 0.002402702445522885, + "grad_norm": 3.7258527278900146, + "learning_rate": 4.9999291312048343e-05, + "loss": 7.7302, + "step": 404 + }, + { + "epoch": 0.0024086497288038824, + "grad_norm": 4.215145111083984, + "learning_rate": 4.999928779063967e-05, + "loss": 7.6597, + "step": 405 + }, + { + "epoch": 0.0024145970120848796, + "grad_norm": 3.157273769378662, + "learning_rate": 4.9999284260504004e-05, + "loss": 7.7262, + "step": 406 + }, + { + "epoch": 0.002420544295365877, + "grad_norm": 2.9977381229400635, + "learning_rate": 4.999928072164135e-05, + "loss": 7.72, + "step": 407 + }, + { + "epoch": 0.002426491578646874, + "grad_norm": 2.791682720184326, + "learning_rate": 4.9999277174051696e-05, + "loss": 7.8022, + "step": 408 + }, + { + "epoch": 0.0024324388619278714, + "grad_norm": 3.4143035411834717, + "learning_rate": 4.999927361773506e-05, + "loss": 7.5116, + "step": 409 + }, + { + "epoch": 0.0024383861452088687, + "grad_norm": 3.3458821773529053, + "learning_rate": 4.9999270052691425e-05, + "loss": 7.4337, + "step": 410 + }, + { + "epoch": 0.002444333428489866, + "grad_norm": 3.3339595794677734, + "learning_rate": 4.999926647892081e-05, + "loss": 7.7345, + "step": 411 + }, + { + "epoch": 0.002450280711770863, + "grad_norm": 4.285780429840088, + "learning_rate": 4.999926289642321e-05, + "loss": 7.9388, + "step": 412 + }, + { + "epoch": 0.0024562279950518604, + "grad_norm": 3.9473414421081543, + "learning_rate": 4.9999259305198624e-05, + "loss": 7.6038, + "step": 413 + }, + { + "epoch": 0.0024621752783328577, + "grad_norm": 3.504227638244629, + "learning_rate": 4.999925570524706e-05, + "loss": 7.4818, + "step": 414 + }, + { + "epoch": 0.002468122561613855, + "grad_norm": 3.2182157039642334, + "learning_rate": 4.999925209656851e-05, + "loss": 7.3493, + "step": 415 + }, + { + "epoch": 0.002474069844894852, + "grad_norm": 3.1944262981414795, + "learning_rate": 4.999924847916297e-05, + "loss": 7.3646, + "step": 416 + }, + { + "epoch": 0.0024800171281758494, + "grad_norm": 2.957244634628296, + "learning_rate": 4.999924485303047e-05, + "loss": 7.4403, + "step": 417 + }, + { + "epoch": 0.0024859644114568467, + "grad_norm": 2.971285343170166, + "learning_rate": 4.999924121817098e-05, + "loss": 7.7266, + "step": 418 + }, + { + "epoch": 0.002491911694737844, + "grad_norm": 4.029009819030762, + "learning_rate": 4.999923757458451e-05, + "loss": 7.3919, + "step": 419 + }, + { + "epoch": 0.002497858978018841, + "grad_norm": 3.9034767150878906, + "learning_rate": 4.999923392227107e-05, + "loss": 7.2349, + "step": 420 + }, + { + "epoch": 0.002503806261299838, + "grad_norm": 3.23218035697937, + "learning_rate": 4.9999230261230656e-05, + "loss": 7.5146, + "step": 421 + }, + { + "epoch": 0.0025097535445808353, + "grad_norm": 3.193225622177124, + "learning_rate": 4.9999226591463265e-05, + "loss": 7.1699, + "step": 422 + }, + { + "epoch": 0.0025157008278618325, + "grad_norm": 2.9796435832977295, + "learning_rate": 4.999922291296891e-05, + "loss": 7.5719, + "step": 423 + }, + { + "epoch": 0.0025216481111428298, + "grad_norm": 2.6746885776519775, + "learning_rate": 4.999921922574758e-05, + "loss": 7.8086, + "step": 424 + }, + { + "epoch": 0.002527595394423827, + "grad_norm": 3.0622920989990234, + "learning_rate": 4.999921552979928e-05, + "loss": 7.3233, + "step": 425 + }, + { + "epoch": 0.0025335426777048243, + "grad_norm": 3.0908501148223877, + "learning_rate": 4.999921182512402e-05, + "loss": 7.2582, + "step": 426 + }, + { + "epoch": 0.0025394899609858215, + "grad_norm": 2.6913537979125977, + "learning_rate": 4.999920811172178e-05, + "loss": 7.6643, + "step": 427 + }, + { + "epoch": 0.002545437244266819, + "grad_norm": 2.7793848514556885, + "learning_rate": 4.999920438959258e-05, + "loss": 7.9445, + "step": 428 + }, + { + "epoch": 0.002551384527547816, + "grad_norm": 2.741617202758789, + "learning_rate": 4.999920065873642e-05, + "loss": 8.0755, + "step": 429 + }, + { + "epoch": 0.0025573318108288133, + "grad_norm": 2.7102227210998535, + "learning_rate": 4.999919691915329e-05, + "loss": 7.8908, + "step": 430 + }, + { + "epoch": 0.0025632790941098106, + "grad_norm": 2.687788248062134, + "learning_rate": 4.9999193170843206e-05, + "loss": 7.9025, + "step": 431 + }, + { + "epoch": 0.002569226377390808, + "grad_norm": 2.923664093017578, + "learning_rate": 4.999918941380616e-05, + "loss": 7.9331, + "step": 432 + }, + { + "epoch": 0.002575173660671805, + "grad_norm": 2.934735059738159, + "learning_rate": 4.999918564804215e-05, + "loss": 7.722, + "step": 433 + }, + { + "epoch": 0.0025811209439528023, + "grad_norm": 3.8156228065490723, + "learning_rate": 4.999918187355119e-05, + "loss": 7.9392, + "step": 434 + }, + { + "epoch": 0.0025870682272337996, + "grad_norm": 2.333798408508301, + "learning_rate": 4.999917809033327e-05, + "loss": 7.9093, + "step": 435 + }, + { + "epoch": 0.002593015510514797, + "grad_norm": 2.078932046890259, + "learning_rate": 4.99991742983884e-05, + "loss": 7.8484, + "step": 436 + }, + { + "epoch": 0.002598962793795794, + "grad_norm": 2.433375835418701, + "learning_rate": 4.999917049771657e-05, + "loss": 7.9124, + "step": 437 + }, + { + "epoch": 0.0026049100770767913, + "grad_norm": 3.1881024837493896, + "learning_rate": 4.999916668831779e-05, + "loss": 7.3966, + "step": 438 + }, + { + "epoch": 0.0026108573603577886, + "grad_norm": 2.4724855422973633, + "learning_rate": 4.9999162870192065e-05, + "loss": 7.535, + "step": 439 + }, + { + "epoch": 0.002616804643638786, + "grad_norm": 2.8757777214050293, + "learning_rate": 4.999915904333938e-05, + "loss": 7.6728, + "step": 440 + }, + { + "epoch": 0.002622751926919783, + "grad_norm": 3.5439565181732178, + "learning_rate": 4.999915520775975e-05, + "loss": 7.5308, + "step": 441 + }, + { + "epoch": 0.0026286992102007804, + "grad_norm": 2.8345577716827393, + "learning_rate": 4.999915136345318e-05, + "loss": 7.7083, + "step": 442 + }, + { + "epoch": 0.0026346464934817776, + "grad_norm": 3.0842509269714355, + "learning_rate": 4.999914751041965e-05, + "loss": 7.9281, + "step": 443 + }, + { + "epoch": 0.002640593776762775, + "grad_norm": 3.0017757415771484, + "learning_rate": 4.999914364865919e-05, + "loss": 7.4727, + "step": 444 + }, + { + "epoch": 0.002646541060043772, + "grad_norm": 2.637838125228882, + "learning_rate": 4.9999139778171785e-05, + "loss": 7.5284, + "step": 445 + }, + { + "epoch": 0.0026524883433247694, + "grad_norm": 2.7749550342559814, + "learning_rate": 4.999913589895743e-05, + "loss": 7.7006, + "step": 446 + }, + { + "epoch": 0.0026584356266057666, + "grad_norm": 3.1636059284210205, + "learning_rate": 4.9999132011016146e-05, + "loss": 7.6441, + "step": 447 + }, + { + "epoch": 0.002664382909886764, + "grad_norm": 2.623776435852051, + "learning_rate": 4.9999128114347913e-05, + "loss": 7.8027, + "step": 448 + }, + { + "epoch": 0.002670330193167761, + "grad_norm": 2.803612232208252, + "learning_rate": 4.9999124208952755e-05, + "loss": 7.553, + "step": 449 + }, + { + "epoch": 0.0026762774764487584, + "grad_norm": 3.3169047832489014, + "learning_rate": 4.9999120294830656e-05, + "loss": 8.0965, + "step": 450 + }, + { + "epoch": 0.0026822247597297556, + "grad_norm": 3.9928581714630127, + "learning_rate": 4.999911637198161e-05, + "loss": 7.8152, + "step": 451 + }, + { + "epoch": 0.002688172043010753, + "grad_norm": 2.8126320838928223, + "learning_rate": 4.9999112440405646e-05, + "loss": 7.4843, + "step": 452 + }, + { + "epoch": 0.0026941193262917497, + "grad_norm": 2.773427963256836, + "learning_rate": 4.999910850010275e-05, + "loss": 7.7074, + "step": 453 + }, + { + "epoch": 0.002700066609572747, + "grad_norm": 2.8877642154693604, + "learning_rate": 4.999910455107292e-05, + "loss": 7.7764, + "step": 454 + }, + { + "epoch": 0.0027060138928537442, + "grad_norm": 2.6323535442352295, + "learning_rate": 4.9999100593316155e-05, + "loss": 7.7336, + "step": 455 + }, + { + "epoch": 0.0027119611761347415, + "grad_norm": 2.939509153366089, + "learning_rate": 4.9999096626832465e-05, + "loss": 7.8184, + "step": 456 + }, + { + "epoch": 0.0027179084594157387, + "grad_norm": 2.6926229000091553, + "learning_rate": 4.9999092651621855e-05, + "loss": 7.5027, + "step": 457 + }, + { + "epoch": 0.002723855742696736, + "grad_norm": 2.889389991760254, + "learning_rate": 4.999908866768431e-05, + "loss": 7.1138, + "step": 458 + }, + { + "epoch": 0.0027298030259777332, + "grad_norm": 2.951796531677246, + "learning_rate": 4.999908467501985e-05, + "loss": 7.7549, + "step": 459 + }, + { + "epoch": 0.0027357503092587305, + "grad_norm": 2.9076783657073975, + "learning_rate": 4.999908067362847e-05, + "loss": 7.6577, + "step": 460 + }, + { + "epoch": 0.0027416975925397278, + "grad_norm": 3.010636806488037, + "learning_rate": 4.9999076663510155e-05, + "loss": 7.6467, + "step": 461 + }, + { + "epoch": 0.002747644875820725, + "grad_norm": 2.7591371536254883, + "learning_rate": 4.9999072644664935e-05, + "loss": 7.5825, + "step": 462 + }, + { + "epoch": 0.0027535921591017223, + "grad_norm": 2.503632068634033, + "learning_rate": 4.9999068617092795e-05, + "loss": 7.711, + "step": 463 + }, + { + "epoch": 0.0027595394423827195, + "grad_norm": 2.6518661975860596, + "learning_rate": 4.999906458079373e-05, + "loss": 7.557, + "step": 464 + }, + { + "epoch": 0.0027654867256637168, + "grad_norm": 2.6865615844726562, + "learning_rate": 4.9999060535767764e-05, + "loss": 7.5788, + "step": 465 + }, + { + "epoch": 0.002771434008944714, + "grad_norm": 2.715190887451172, + "learning_rate": 4.999905648201487e-05, + "loss": 7.517, + "step": 466 + }, + { + "epoch": 0.0027773812922257113, + "grad_norm": 3.1603381633758545, + "learning_rate": 4.999905241953506e-05, + "loss": 7.6176, + "step": 467 + }, + { + "epoch": 0.0027833285755067085, + "grad_norm": 3.1451528072357178, + "learning_rate": 4.999904834832836e-05, + "loss": 7.6051, + "step": 468 + }, + { + "epoch": 0.002789275858787706, + "grad_norm": 2.5310862064361572, + "learning_rate": 4.9999044268394736e-05, + "loss": 7.6075, + "step": 469 + }, + { + "epoch": 0.002795223142068703, + "grad_norm": 2.9285359382629395, + "learning_rate": 4.99990401797342e-05, + "loss": 7.5399, + "step": 470 + }, + { + "epoch": 0.0028011704253497003, + "grad_norm": 3.2180614471435547, + "learning_rate": 4.9999036082346766e-05, + "loss": 7.6952, + "step": 471 + }, + { + "epoch": 0.0028071177086306976, + "grad_norm": 4.041499614715576, + "learning_rate": 4.9999031976232426e-05, + "loss": 7.841, + "step": 472 + }, + { + "epoch": 0.002813064991911695, + "grad_norm": 3.233492612838745, + "learning_rate": 4.999902786139118e-05, + "loss": 7.5267, + "step": 473 + }, + { + "epoch": 0.002819012275192692, + "grad_norm": 2.7749760150909424, + "learning_rate": 4.9999023737823034e-05, + "loss": 7.3703, + "step": 474 + }, + { + "epoch": 0.0028249595584736893, + "grad_norm": 2.9886162281036377, + "learning_rate": 4.999901960552798e-05, + "loss": 7.4684, + "step": 475 + }, + { + "epoch": 0.0028309068417546866, + "grad_norm": 2.934190511703491, + "learning_rate": 4.999901546450604e-05, + "loss": 7.4432, + "step": 476 + }, + { + "epoch": 0.002836854125035684, + "grad_norm": 3.696247100830078, + "learning_rate": 4.9999011314757196e-05, + "loss": 7.4944, + "step": 477 + }, + { + "epoch": 0.002842801408316681, + "grad_norm": 3.6706700325012207, + "learning_rate": 4.9999007156281454e-05, + "loss": 7.3726, + "step": 478 + }, + { + "epoch": 0.0028487486915976783, + "grad_norm": 3.8638553619384766, + "learning_rate": 4.999900298907881e-05, + "loss": 7.072, + "step": 479 + }, + { + "epoch": 0.0028546959748786756, + "grad_norm": 4.307566165924072, + "learning_rate": 4.999899881314928e-05, + "loss": 6.9371, + "step": 480 + }, + { + "epoch": 0.002860643258159673, + "grad_norm": 3.337372064590454, + "learning_rate": 4.9998994628492854e-05, + "loss": 7.7299, + "step": 481 + }, + { + "epoch": 0.00286659054144067, + "grad_norm": 3.1284921169281006, + "learning_rate": 4.9998990435109535e-05, + "loss": 7.5629, + "step": 482 + }, + { + "epoch": 0.0028725378247216674, + "grad_norm": 3.06904935836792, + "learning_rate": 4.999898623299933e-05, + "loss": 7.5332, + "step": 483 + }, + { + "epoch": 0.002878485108002664, + "grad_norm": 2.985121011734009, + "learning_rate": 4.999898202216224e-05, + "loss": 7.5972, + "step": 484 + }, + { + "epoch": 0.0028844323912836614, + "grad_norm": 2.9188039302825928, + "learning_rate": 4.999897780259827e-05, + "loss": 7.6242, + "step": 485 + }, + { + "epoch": 0.0028903796745646587, + "grad_norm": 3.2263259887695312, + "learning_rate": 4.9998973574307406e-05, + "loss": 7.5746, + "step": 486 + }, + { + "epoch": 0.002896326957845656, + "grad_norm": 2.645188331604004, + "learning_rate": 4.999896933728966e-05, + "loss": 7.6122, + "step": 487 + }, + { + "epoch": 0.002902274241126653, + "grad_norm": 2.89583158493042, + "learning_rate": 4.9998965091545035e-05, + "loss": 7.6157, + "step": 488 + }, + { + "epoch": 0.0029082215244076504, + "grad_norm": 3.6182286739349365, + "learning_rate": 4.9998960837073524e-05, + "loss": 7.4056, + "step": 489 + }, + { + "epoch": 0.0029141688076886477, + "grad_norm": 3.377560615539551, + "learning_rate": 4.9998956573875135e-05, + "loss": 7.4408, + "step": 490 + }, + { + "epoch": 0.002920116090969645, + "grad_norm": 3.0581517219543457, + "learning_rate": 4.9998952301949874e-05, + "loss": 7.5776, + "step": 491 + }, + { + "epoch": 0.002926063374250642, + "grad_norm": 3.5199148654937744, + "learning_rate": 4.999894802129773e-05, + "loss": 7.4747, + "step": 492 + }, + { + "epoch": 0.0029320106575316395, + "grad_norm": 3.866055727005005, + "learning_rate": 4.9998943731918714e-05, + "loss": 7.5985, + "step": 493 + }, + { + "epoch": 0.0029379579408126367, + "grad_norm": 2.856255054473877, + "learning_rate": 4.999893943381283e-05, + "loss": 7.9698, + "step": 494 + }, + { + "epoch": 0.002943905224093634, + "grad_norm": 3.0758626461029053, + "learning_rate": 4.999893512698007e-05, + "loss": 7.6311, + "step": 495 + }, + { + "epoch": 0.0029498525073746312, + "grad_norm": 3.739844560623169, + "learning_rate": 4.999893081142044e-05, + "loss": 7.6829, + "step": 496 + }, + { + "epoch": 0.0029557997906556285, + "grad_norm": 4.025709629058838, + "learning_rate": 4.999892648713394e-05, + "loss": 7.2717, + "step": 497 + }, + { + "epoch": 0.0029617470739366257, + "grad_norm": 3.6604738235473633, + "learning_rate": 4.999892215412057e-05, + "loss": 7.2985, + "step": 498 + }, + { + "epoch": 0.002967694357217623, + "grad_norm": 3.230109930038452, + "learning_rate": 4.999891781238034e-05, + "loss": 8.1041, + "step": 499 + }, + { + "epoch": 0.0029736416404986202, + "grad_norm": 2.5046725273132324, + "learning_rate": 4.999891346191325e-05, + "loss": 8.0888, + "step": 500 + }, + { + "epoch": 0.0029795889237796175, + "grad_norm": 2.916459798812866, + "learning_rate": 4.999890910271929e-05, + "loss": 7.8675, + "step": 501 + }, + { + "epoch": 0.0029855362070606148, + "grad_norm": 2.7806055545806885, + "learning_rate": 4.999890473479848e-05, + "loss": 7.8903, + "step": 502 + }, + { + "epoch": 0.002991483490341612, + "grad_norm": 2.9877662658691406, + "learning_rate": 4.99989003581508e-05, + "loss": 7.473, + "step": 503 + }, + { + "epoch": 0.0029974307736226093, + "grad_norm": 3.1581692695617676, + "learning_rate": 4.999889597277626e-05, + "loss": 7.5654, + "step": 504 + }, + { + "epoch": 0.0030033780569036065, + "grad_norm": 3.102539539337158, + "learning_rate": 4.9998891578674866e-05, + "loss": 7.8865, + "step": 505 + }, + { + "epoch": 0.0030093253401846038, + "grad_norm": 3.0357863903045654, + "learning_rate": 4.999888717584662e-05, + "loss": 7.291, + "step": 506 + }, + { + "epoch": 0.003015272623465601, + "grad_norm": 2.604048252105713, + "learning_rate": 4.999888276429152e-05, + "loss": 7.4892, + "step": 507 + }, + { + "epoch": 0.0030212199067465983, + "grad_norm": 2.734354257583618, + "learning_rate": 4.999887834400957e-05, + "loss": 7.1182, + "step": 508 + }, + { + "epoch": 0.0030271671900275955, + "grad_norm": 2.5255348682403564, + "learning_rate": 4.9998873915000775e-05, + "loss": 7.449, + "step": 509 + }, + { + "epoch": 0.003033114473308593, + "grad_norm": 2.864072322845459, + "learning_rate": 4.999886947726512e-05, + "loss": 7.3213, + "step": 510 + }, + { + "epoch": 0.00303906175658959, + "grad_norm": 2.764187812805176, + "learning_rate": 4.999886503080262e-05, + "loss": 7.337, + "step": 511 + }, + { + "epoch": 0.0030450090398705873, + "grad_norm": 3.5725066661834717, + "learning_rate": 4.9998860575613285e-05, + "loss": 7.8398, + "step": 512 + }, + { + "epoch": 0.0030509563231515846, + "grad_norm": 3.8559648990631104, + "learning_rate": 4.9998856111697096e-05, + "loss": 7.395, + "step": 513 + }, + { + "epoch": 0.003056903606432582, + "grad_norm": 2.9047908782958984, + "learning_rate": 4.999885163905407e-05, + "loss": 7.7016, + "step": 514 + }, + { + "epoch": 0.0030628508897135786, + "grad_norm": 3.1485037803649902, + "learning_rate": 4.99988471576842e-05, + "loss": 6.9411, + "step": 515 + }, + { + "epoch": 0.003068798172994576, + "grad_norm": 3.2763617038726807, + "learning_rate": 4.999884266758749e-05, + "loss": 6.4778, + "step": 516 + }, + { + "epoch": 0.003074745456275573, + "grad_norm": 2.7609500885009766, + "learning_rate": 4.999883816876394e-05, + "loss": 7.0576, + "step": 517 + }, + { + "epoch": 0.0030806927395565704, + "grad_norm": 3.7407751083374023, + "learning_rate": 4.999883366121356e-05, + "loss": 7.7389, + "step": 518 + }, + { + "epoch": 0.0030866400228375676, + "grad_norm": 3.3356568813323975, + "learning_rate": 4.999882914493634e-05, + "loss": 7.7, + "step": 519 + }, + { + "epoch": 0.003092587306118565, + "grad_norm": 2.635594129562378, + "learning_rate": 4.999882461993229e-05, + "loss": 7.6103, + "step": 520 + }, + { + "epoch": 0.003098534589399562, + "grad_norm": 3.7604281902313232, + "learning_rate": 4.9998820086201406e-05, + "loss": 7.6814, + "step": 521 + }, + { + "epoch": 0.0031044818726805594, + "grad_norm": 3.6567211151123047, + "learning_rate": 4.99988155437437e-05, + "loss": 7.6729, + "step": 522 + }, + { + "epoch": 0.0031104291559615567, + "grad_norm": 3.605442523956299, + "learning_rate": 4.999881099255916e-05, + "loss": 7.7464, + "step": 523 + }, + { + "epoch": 0.003116376439242554, + "grad_norm": 3.015500783920288, + "learning_rate": 4.99988064326478e-05, + "loss": 7.5168, + "step": 524 + }, + { + "epoch": 0.003122323722523551, + "grad_norm": 2.9037563800811768, + "learning_rate": 4.9998801864009604e-05, + "loss": 7.7059, + "step": 525 + }, + { + "epoch": 0.0031282710058045484, + "grad_norm": 2.812509059906006, + "learning_rate": 4.999879728664458e-05, + "loss": 7.4178, + "step": 526 + }, + { + "epoch": 0.0031342182890855457, + "grad_norm": 3.340226888656616, + "learning_rate": 4.9998792700552746e-05, + "loss": 7.7872, + "step": 527 + }, + { + "epoch": 0.003140165572366543, + "grad_norm": 3.0951550006866455, + "learning_rate": 4.999878810573409e-05, + "loss": 8.0153, + "step": 528 + }, + { + "epoch": 0.00314611285564754, + "grad_norm": 3.1077651977539062, + "learning_rate": 4.9998783502188616e-05, + "loss": 7.7053, + "step": 529 + }, + { + "epoch": 0.0031520601389285374, + "grad_norm": 3.442451000213623, + "learning_rate": 4.999877888991632e-05, + "loss": 7.5149, + "step": 530 + }, + { + "epoch": 0.0031580074222095347, + "grad_norm": 3.7479207515716553, + "learning_rate": 4.9998774268917215e-05, + "loss": 7.3448, + "step": 531 + }, + { + "epoch": 0.003163954705490532, + "grad_norm": 2.660789966583252, + "learning_rate": 4.999876963919129e-05, + "loss": 7.8348, + "step": 532 + }, + { + "epoch": 0.003169901988771529, + "grad_norm": 2.6255943775177, + "learning_rate": 4.9998765000738556e-05, + "loss": 7.542, + "step": 533 + }, + { + "epoch": 0.0031758492720525265, + "grad_norm": 3.121521472930908, + "learning_rate": 4.9998760353559017e-05, + "loss": 7.46, + "step": 534 + }, + { + "epoch": 0.0031817965553335237, + "grad_norm": 2.958880662918091, + "learning_rate": 4.999875569765266e-05, + "loss": 7.5385, + "step": 535 + }, + { + "epoch": 0.003187743838614521, + "grad_norm": 3.4153661727905273, + "learning_rate": 4.99987510330195e-05, + "loss": 7.4989, + "step": 536 + }, + { + "epoch": 0.0031936911218955182, + "grad_norm": 3.0877597332000732, + "learning_rate": 4.999874635965953e-05, + "loss": 7.5512, + "step": 537 + }, + { + "epoch": 0.0031996384051765155, + "grad_norm": 3.109522581100464, + "learning_rate": 4.9998741677572756e-05, + "loss": 7.4679, + "step": 538 + }, + { + "epoch": 0.0032055856884575127, + "grad_norm": 3.4434239864349365, + "learning_rate": 4.999873698675919e-05, + "loss": 7.0599, + "step": 539 + }, + { + "epoch": 0.00321153297173851, + "grad_norm": 3.83335018157959, + "learning_rate": 4.999873228721882e-05, + "loss": 7.5355, + "step": 540 + }, + { + "epoch": 0.0032174802550195072, + "grad_norm": 3.0679752826690674, + "learning_rate": 4.999872757895164e-05, + "loss": 7.7231, + "step": 541 + }, + { + "epoch": 0.0032234275383005045, + "grad_norm": 3.272196054458618, + "learning_rate": 4.999872286195767e-05, + "loss": 7.6674, + "step": 542 + }, + { + "epoch": 0.0032293748215815017, + "grad_norm": 2.8453965187072754, + "learning_rate": 4.9998718136236897e-05, + "loss": 7.4451, + "step": 543 + }, + { + "epoch": 0.003235322104862499, + "grad_norm": 3.074399709701538, + "learning_rate": 4.999871340178934e-05, + "loss": 7.6011, + "step": 544 + }, + { + "epoch": 0.0032412693881434963, + "grad_norm": 3.173004150390625, + "learning_rate": 4.999870865861499e-05, + "loss": 7.5268, + "step": 545 + }, + { + "epoch": 0.003247216671424493, + "grad_norm": 2.820848226547241, + "learning_rate": 4.999870390671384e-05, + "loss": 7.9872, + "step": 546 + }, + { + "epoch": 0.0032531639547054903, + "grad_norm": 2.692702293395996, + "learning_rate": 4.9998699146085906e-05, + "loss": 7.4676, + "step": 547 + }, + { + "epoch": 0.0032591112379864876, + "grad_norm": 2.2766902446746826, + "learning_rate": 4.999869437673119e-05, + "loss": 7.3826, + "step": 548 + }, + { + "epoch": 0.003265058521267485, + "grad_norm": 2.1190011501312256, + "learning_rate": 4.9998689598649686e-05, + "loss": 7.4767, + "step": 549 + }, + { + "epoch": 0.003271005804548482, + "grad_norm": 2.687633514404297, + "learning_rate": 4.999868481184139e-05, + "loss": 7.9922, + "step": 550 + }, + { + "epoch": 0.0032769530878294794, + "grad_norm": 3.403298854827881, + "learning_rate": 4.999868001630632e-05, + "loss": 7.8035, + "step": 551 + }, + { + "epoch": 0.0032829003711104766, + "grad_norm": 3.074881076812744, + "learning_rate": 4.999867521204446e-05, + "loss": 7.7106, + "step": 552 + }, + { + "epoch": 0.003288847654391474, + "grad_norm": 3.28725004196167, + "learning_rate": 4.9998670399055827e-05, + "loss": 7.4661, + "step": 553 + }, + { + "epoch": 0.003294794937672471, + "grad_norm": 3.8624775409698486, + "learning_rate": 4.999866557734041e-05, + "loss": 7.7156, + "step": 554 + }, + { + "epoch": 0.0033007422209534684, + "grad_norm": 2.53586745262146, + "learning_rate": 4.999866074689823e-05, + "loss": 7.945, + "step": 555 + }, + { + "epoch": 0.0033066895042344656, + "grad_norm": 3.8261072635650635, + "learning_rate": 4.9998655907729265e-05, + "loss": 8.0446, + "step": 556 + }, + { + "epoch": 0.003312636787515463, + "grad_norm": 2.7173407077789307, + "learning_rate": 4.999865105983353e-05, + "loss": 7.8363, + "step": 557 + }, + { + "epoch": 0.00331858407079646, + "grad_norm": 4.68424654006958, + "learning_rate": 4.999864620321102e-05, + "loss": 7.667, + "step": 558 + }, + { + "epoch": 0.0033245313540774574, + "grad_norm": 2.8763632774353027, + "learning_rate": 4.999864133786175e-05, + "loss": 7.6133, + "step": 559 + }, + { + "epoch": 0.0033304786373584546, + "grad_norm": 3.0986382961273193, + "learning_rate": 4.9998636463785705e-05, + "loss": 7.6257, + "step": 560 + }, + { + "epoch": 0.003336425920639452, + "grad_norm": 2.6826348304748535, + "learning_rate": 4.9998631580982905e-05, + "loss": 7.5187, + "step": 561 + }, + { + "epoch": 0.003342373203920449, + "grad_norm": 2.2172515392303467, + "learning_rate": 4.9998626689453334e-05, + "loss": 7.961, + "step": 562 + }, + { + "epoch": 0.0033483204872014464, + "grad_norm": 2.6083858013153076, + "learning_rate": 4.9998621789197e-05, + "loss": 7.7887, + "step": 563 + }, + { + "epoch": 0.0033542677704824437, + "grad_norm": 3.6838009357452393, + "learning_rate": 4.99986168802139e-05, + "loss": 7.4945, + "step": 564 + }, + { + "epoch": 0.003360215053763441, + "grad_norm": 3.2091991901397705, + "learning_rate": 4.999861196250405e-05, + "loss": 7.4243, + "step": 565 + }, + { + "epoch": 0.003366162337044438, + "grad_norm": 3.142982244491577, + "learning_rate": 4.9998607036067434e-05, + "loss": 7.4684, + "step": 566 + }, + { + "epoch": 0.0033721096203254354, + "grad_norm": 3.7751007080078125, + "learning_rate": 4.9998602100904065e-05, + "loss": 7.3722, + "step": 567 + }, + { + "epoch": 0.0033780569036064327, + "grad_norm": 3.276843547821045, + "learning_rate": 4.9998597157013946e-05, + "loss": 7.4012, + "step": 568 + }, + { + "epoch": 0.00338400418688743, + "grad_norm": 2.840106725692749, + "learning_rate": 4.999859220439708e-05, + "loss": 7.4013, + "step": 569 + }, + { + "epoch": 0.003389951470168427, + "grad_norm": 2.7816810607910156, + "learning_rate": 4.999858724305346e-05, + "loss": 7.3136, + "step": 570 + }, + { + "epoch": 0.0033958987534494244, + "grad_norm": 4.523340225219727, + "learning_rate": 4.999858227298308e-05, + "loss": 7.0553, + "step": 571 + }, + { + "epoch": 0.0034018460367304217, + "grad_norm": 3.9653191566467285, + "learning_rate": 4.9998577294185964e-05, + "loss": 7.1907, + "step": 572 + }, + { + "epoch": 0.003407793320011419, + "grad_norm": 3.243089199066162, + "learning_rate": 4.999857230666211e-05, + "loss": 7.0749, + "step": 573 + }, + { + "epoch": 0.003413740603292416, + "grad_norm": 3.3622777462005615, + "learning_rate": 4.99985673104115e-05, + "loss": 7.0005, + "step": 574 + }, + { + "epoch": 0.0034196878865734135, + "grad_norm": 2.561732292175293, + "learning_rate": 4.9998562305434154e-05, + "loss": 7.271, + "step": 575 + }, + { + "epoch": 0.0034256351698544107, + "grad_norm": 3.1846745014190674, + "learning_rate": 4.999855729173006e-05, + "loss": 7.7333, + "step": 576 + }, + { + "epoch": 0.0034315824531354075, + "grad_norm": 3.0318918228149414, + "learning_rate": 4.999855226929924e-05, + "loss": 7.5535, + "step": 577 + }, + { + "epoch": 0.003437529736416405, + "grad_norm": 2.993086099624634, + "learning_rate": 4.999854723814168e-05, + "loss": 7.6272, + "step": 578 + }, + { + "epoch": 0.003443477019697402, + "grad_norm": 2.8511712551116943, + "learning_rate": 4.999854219825738e-05, + "loss": 7.6619, + "step": 579 + }, + { + "epoch": 0.0034494243029783993, + "grad_norm": 2.6181185245513916, + "learning_rate": 4.9998537149646355e-05, + "loss": 7.7452, + "step": 580 + }, + { + "epoch": 0.0034553715862593965, + "grad_norm": 2.9932363033294678, + "learning_rate": 4.9998532092308593e-05, + "loss": 7.7475, + "step": 581 + }, + { + "epoch": 0.003461318869540394, + "grad_norm": 3.541944742202759, + "learning_rate": 4.99985270262441e-05, + "loss": 7.5808, + "step": 582 + }, + { + "epoch": 0.003467266152821391, + "grad_norm": 2.780372381210327, + "learning_rate": 4.9998521951452895e-05, + "loss": 7.8167, + "step": 583 + }, + { + "epoch": 0.0034732134361023883, + "grad_norm": 2.9156363010406494, + "learning_rate": 4.9998516867934945e-05, + "loss": 7.74, + "step": 584 + }, + { + "epoch": 0.0034791607193833856, + "grad_norm": 3.9492485523223877, + "learning_rate": 4.9998511775690285e-05, + "loss": 7.1128, + "step": 585 + }, + { + "epoch": 0.003485108002664383, + "grad_norm": 2.8288252353668213, + "learning_rate": 4.9998506674718896e-05, + "loss": 7.4884, + "step": 586 + }, + { + "epoch": 0.00349105528594538, + "grad_norm": 2.8906798362731934, + "learning_rate": 4.999850156502078e-05, + "loss": 7.6378, + "step": 587 + }, + { + "epoch": 0.0034970025692263773, + "grad_norm": 2.8806405067443848, + "learning_rate": 4.9998496446595955e-05, + "loss": 7.4641, + "step": 588 + }, + { + "epoch": 0.0035029498525073746, + "grad_norm": 3.1794772148132324, + "learning_rate": 4.999849131944441e-05, + "loss": 7.1633, + "step": 589 + }, + { + "epoch": 0.003508897135788372, + "grad_norm": 2.886009454727173, + "learning_rate": 4.999848618356615e-05, + "loss": 7.1793, + "step": 590 + }, + { + "epoch": 0.003514844419069369, + "grad_norm": 2.76184344291687, + "learning_rate": 4.999848103896118e-05, + "loss": 7.1377, + "step": 591 + }, + { + "epoch": 0.0035207917023503663, + "grad_norm": 3.127793788909912, + "learning_rate": 4.999847588562949e-05, + "loss": 7.2793, + "step": 592 + }, + { + "epoch": 0.0035267389856313636, + "grad_norm": 3.7768073081970215, + "learning_rate": 4.99984707235711e-05, + "loss": 7.8203, + "step": 593 + }, + { + "epoch": 0.003532686268912361, + "grad_norm": 3.1750540733337402, + "learning_rate": 4.9998465552786e-05, + "loss": 7.7078, + "step": 594 + }, + { + "epoch": 0.003538633552193358, + "grad_norm": 2.8884522914886475, + "learning_rate": 4.999846037327419e-05, + "loss": 7.6864, + "step": 595 + }, + { + "epoch": 0.0035445808354743554, + "grad_norm": 2.783928394317627, + "learning_rate": 4.999845518503568e-05, + "loss": 7.7329, + "step": 596 + }, + { + "epoch": 0.0035505281187553526, + "grad_norm": 2.8093652725219727, + "learning_rate": 4.9998449988070465e-05, + "loss": 7.7157, + "step": 597 + }, + { + "epoch": 0.00355647540203635, + "grad_norm": 2.54380464553833, + "learning_rate": 4.999844478237855e-05, + "loss": 7.6353, + "step": 598 + }, + { + "epoch": 0.003562422685317347, + "grad_norm": 3.478878974914551, + "learning_rate": 4.999843956795993e-05, + "loss": 7.4221, + "step": 599 + }, + { + "epoch": 0.0035683699685983444, + "grad_norm": 3.882807493209839, + "learning_rate": 4.999843434481463e-05, + "loss": 7.4857, + "step": 600 + }, + { + "epoch": 0.0035743172518793416, + "grad_norm": 3.0975584983825684, + "learning_rate": 4.999842911294261e-05, + "loss": 7.5121, + "step": 601 + }, + { + "epoch": 0.003580264535160339, + "grad_norm": 3.1857712268829346, + "learning_rate": 4.999842387234391e-05, + "loss": 7.4469, + "step": 602 + }, + { + "epoch": 0.003586211818441336, + "grad_norm": 2.892927885055542, + "learning_rate": 4.999841862301853e-05, + "loss": 7.4047, + "step": 603 + }, + { + "epoch": 0.0035921591017223334, + "grad_norm": 4.186185359954834, + "learning_rate": 4.999841336496645e-05, + "loss": 7.5146, + "step": 604 + }, + { + "epoch": 0.0035981063850033307, + "grad_norm": 3.27422833442688, + "learning_rate": 4.9998408098187674e-05, + "loss": 7.3347, + "step": 605 + }, + { + "epoch": 0.003604053668284328, + "grad_norm": 4.817208290100098, + "learning_rate": 4.9998402822682225e-05, + "loss": 7.9883, + "step": 606 + }, + { + "epoch": 0.003610000951565325, + "grad_norm": 5.903015613555908, + "learning_rate": 4.999839753845008e-05, + "loss": 7.9043, + "step": 607 + }, + { + "epoch": 0.0036159482348463224, + "grad_norm": 4.720086574554443, + "learning_rate": 4.999839224549127e-05, + "loss": 7.8456, + "step": 608 + }, + { + "epoch": 0.0036218955181273192, + "grad_norm": 4.518443584442139, + "learning_rate": 4.9998386943805764e-05, + "loss": 7.3659, + "step": 609 + }, + { + "epoch": 0.0036278428014083165, + "grad_norm": 2.621833086013794, + "learning_rate": 4.999838163339358e-05, + "loss": 8.0512, + "step": 610 + }, + { + "epoch": 0.0036337900846893137, + "grad_norm": 4.015076160430908, + "learning_rate": 4.9998376314254726e-05, + "loss": 7.8581, + "step": 611 + }, + { + "epoch": 0.003639737367970311, + "grad_norm": 3.8145275115966797, + "learning_rate": 4.999837098638919e-05, + "loss": 7.4288, + "step": 612 + }, + { + "epoch": 0.0036456846512513083, + "grad_norm": 3.396488904953003, + "learning_rate": 4.9998365649796985e-05, + "loss": 7.7812, + "step": 613 + }, + { + "epoch": 0.0036516319345323055, + "grad_norm": 2.931187391281128, + "learning_rate": 4.999836030447811e-05, + "loss": 7.5898, + "step": 614 + }, + { + "epoch": 0.0036575792178133028, + "grad_norm": 2.6349267959594727, + "learning_rate": 4.999835495043257e-05, + "loss": 7.5345, + "step": 615 + }, + { + "epoch": 0.0036635265010943, + "grad_norm": 3.014085531234741, + "learning_rate": 4.999834958766035e-05, + "loss": 7.5985, + "step": 616 + }, + { + "epoch": 0.0036694737843752973, + "grad_norm": 2.971475124359131, + "learning_rate": 4.999834421616147e-05, + "loss": 7.589, + "step": 617 + }, + { + "epoch": 0.0036754210676562945, + "grad_norm": 3.867366075515747, + "learning_rate": 4.999833883593593e-05, + "loss": 7.4026, + "step": 618 + }, + { + "epoch": 0.0036813683509372918, + "grad_norm": 2.3917908668518066, + "learning_rate": 4.9998333446983734e-05, + "loss": 7.4361, + "step": 619 + }, + { + "epoch": 0.003687315634218289, + "grad_norm": 4.583080768585205, + "learning_rate": 4.999832804930487e-05, + "loss": 7.5525, + "step": 620 + }, + { + "epoch": 0.0036932629174992863, + "grad_norm": 2.6039721965789795, + "learning_rate": 4.999832264289934e-05, + "loss": 7.636, + "step": 621 + }, + { + "epoch": 0.0036992102007802835, + "grad_norm": 4.123409748077393, + "learning_rate": 4.9998317227767165e-05, + "loss": 7.7803, + "step": 622 + }, + { + "epoch": 0.003705157484061281, + "grad_norm": 4.220766544342041, + "learning_rate": 4.999831180390834e-05, + "loss": 7.8086, + "step": 623 + }, + { + "epoch": 0.003711104767342278, + "grad_norm": 3.0759594440460205, + "learning_rate": 4.999830637132285e-05, + "loss": 7.4815, + "step": 624 + }, + { + "epoch": 0.0037170520506232753, + "grad_norm": 2.7870442867279053, + "learning_rate": 4.999830093001071e-05, + "loss": 7.3925, + "step": 625 + }, + { + "epoch": 0.0037229993339042726, + "grad_norm": 2.5292582511901855, + "learning_rate": 4.999829547997193e-05, + "loss": 7.2049, + "step": 626 + }, + { + "epoch": 0.00372894661718527, + "grad_norm": 2.5836963653564453, + "learning_rate": 4.99982900212065e-05, + "loss": 7.2858, + "step": 627 + }, + { + "epoch": 0.003734893900466267, + "grad_norm": 2.6433279514312744, + "learning_rate": 4.9998284553714425e-05, + "loss": 7.5894, + "step": 628 + }, + { + "epoch": 0.0037408411837472643, + "grad_norm": 3.1093215942382812, + "learning_rate": 4.999827907749571e-05, + "loss": 7.2859, + "step": 629 + }, + { + "epoch": 0.0037467884670282616, + "grad_norm": 2.313305616378784, + "learning_rate": 4.9998273592550346e-05, + "loss": 7.6275, + "step": 630 + }, + { + "epoch": 0.003752735750309259, + "grad_norm": 3.7002785205841064, + "learning_rate": 4.9998268098878355e-05, + "loss": 7.7068, + "step": 631 + }, + { + "epoch": 0.003758683033590256, + "grad_norm": 3.090707778930664, + "learning_rate": 4.9998262596479715e-05, + "loss": 7.7304, + "step": 632 + }, + { + "epoch": 0.0037646303168712533, + "grad_norm": 2.425614833831787, + "learning_rate": 4.999825708535445e-05, + "loss": 7.927, + "step": 633 + }, + { + "epoch": 0.0037705776001522506, + "grad_norm": 2.1477420330047607, + "learning_rate": 4.999825156550254e-05, + "loss": 8.1082, + "step": 634 + }, + { + "epoch": 0.003776524883433248, + "grad_norm": 2.434638738632202, + "learning_rate": 4.999824603692401e-05, + "loss": 7.8808, + "step": 635 + }, + { + "epoch": 0.003782472166714245, + "grad_norm": 2.563283681869507, + "learning_rate": 4.999824049961884e-05, + "loss": 7.8515, + "step": 636 + }, + { + "epoch": 0.0037884194499952424, + "grad_norm": 2.6878623962402344, + "learning_rate": 4.9998234953587054e-05, + "loss": 7.6393, + "step": 637 + }, + { + "epoch": 0.0037943667332762396, + "grad_norm": 2.6270666122436523, + "learning_rate": 4.999822939882863e-05, + "loss": 7.8246, + "step": 638 + }, + { + "epoch": 0.003800314016557237, + "grad_norm": 3.300494909286499, + "learning_rate": 4.9998223835343596e-05, + "loss": 7.4991, + "step": 639 + }, + { + "epoch": 0.0038062612998382337, + "grad_norm": 2.726902723312378, + "learning_rate": 4.9998218263131925e-05, + "loss": 7.6663, + "step": 640 + }, + { + "epoch": 0.003812208583119231, + "grad_norm": 2.8147871494293213, + "learning_rate": 4.9998212682193645e-05, + "loss": 7.5272, + "step": 641 + }, + { + "epoch": 0.003818155866400228, + "grad_norm": 2.324422597885132, + "learning_rate": 4.9998207092528745e-05, + "loss": 7.6577, + "step": 642 + }, + { + "epoch": 0.0038241031496812255, + "grad_norm": 2.4525058269500732, + "learning_rate": 4.999820149413723e-05, + "loss": 7.6793, + "step": 643 + }, + { + "epoch": 0.0038300504329622227, + "grad_norm": 2.4011337757110596, + "learning_rate": 4.9998195887019094e-05, + "loss": 7.4869, + "step": 644 + }, + { + "epoch": 0.00383599771624322, + "grad_norm": 2.3403005599975586, + "learning_rate": 4.9998190271174364e-05, + "loss": 7.9552, + "step": 645 + }, + { + "epoch": 0.003841944999524217, + "grad_norm": 2.1421074867248535, + "learning_rate": 4.9998184646603005e-05, + "loss": 7.4021, + "step": 646 + }, + { + "epoch": 0.0038478922828052145, + "grad_norm": 2.4157450199127197, + "learning_rate": 4.9998179013305046e-05, + "loss": 7.6666, + "step": 647 + }, + { + "epoch": 0.0038538395660862117, + "grad_norm": 2.737692356109619, + "learning_rate": 4.999817337128048e-05, + "loss": 7.7441, + "step": 648 + }, + { + "epoch": 0.003859786849367209, + "grad_norm": 3.2240428924560547, + "learning_rate": 4.999816772052931e-05, + "loss": 7.5691, + "step": 649 + }, + { + "epoch": 0.0038657341326482062, + "grad_norm": 2.8538997173309326, + "learning_rate": 4.9998162061051534e-05, + "loss": 7.4994, + "step": 650 + }, + { + "epoch": 0.0038716814159292035, + "grad_norm": 2.6562373638153076, + "learning_rate": 4.9998156392847164e-05, + "loss": 7.5156, + "step": 651 + }, + { + "epoch": 0.0038776286992102007, + "grad_norm": 2.5513811111450195, + "learning_rate": 4.999815071591619e-05, + "loss": 7.6503, + "step": 652 + }, + { + "epoch": 0.003883575982491198, + "grad_norm": 2.4196572303771973, + "learning_rate": 4.999814503025863e-05, + "loss": 7.9868, + "step": 653 + }, + { + "epoch": 0.0038895232657721952, + "grad_norm": 3.0201921463012695, + "learning_rate": 4.999813933587447e-05, + "loss": 7.5405, + "step": 654 + }, + { + "epoch": 0.0038954705490531925, + "grad_norm": 2.352625846862793, + "learning_rate": 4.9998133632763714e-05, + "loss": 7.5461, + "step": 655 + }, + { + "epoch": 0.0039014178323341898, + "grad_norm": 2.5318710803985596, + "learning_rate": 4.999812792092637e-05, + "loss": 7.5596, + "step": 656 + }, + { + "epoch": 0.003907365115615187, + "grad_norm": 2.710785388946533, + "learning_rate": 4.9998122200362444e-05, + "loss": 7.4828, + "step": 657 + }, + { + "epoch": 0.003913312398896184, + "grad_norm": 2.7441353797912598, + "learning_rate": 4.999811647107192e-05, + "loss": 7.2496, + "step": 658 + }, + { + "epoch": 0.0039192596821771815, + "grad_norm": 2.4602885246276855, + "learning_rate": 4.9998110733054824e-05, + "loss": 7.6134, + "step": 659 + }, + { + "epoch": 0.003925206965458178, + "grad_norm": 2.6842973232269287, + "learning_rate": 4.999810498631114e-05, + "loss": 7.3544, + "step": 660 + }, + { + "epoch": 0.003931154248739176, + "grad_norm": 2.8062961101531982, + "learning_rate": 4.9998099230840875e-05, + "loss": 7.5162, + "step": 661 + }, + { + "epoch": 0.003937101532020173, + "grad_norm": 4.0753679275512695, + "learning_rate": 4.9998093466644036e-05, + "loss": 7.5241, + "step": 662 + }, + { + "epoch": 0.0039430488153011705, + "grad_norm": 3.0165748596191406, + "learning_rate": 4.999808769372061e-05, + "loss": 7.5313, + "step": 663 + }, + { + "epoch": 0.003948996098582167, + "grad_norm": 2.73825740814209, + "learning_rate": 4.9998081912070623e-05, + "loss": 7.4433, + "step": 664 + }, + { + "epoch": 0.003954943381863165, + "grad_norm": 2.6649749279022217, + "learning_rate": 4.9998076121694056e-05, + "loss": 7.4852, + "step": 665 + }, + { + "epoch": 0.003960890665144162, + "grad_norm": 2.609389066696167, + "learning_rate": 4.999807032259092e-05, + "loss": 7.4127, + "step": 666 + }, + { + "epoch": 0.0039668379484251596, + "grad_norm": 2.50502610206604, + "learning_rate": 4.999806451476122e-05, + "loss": 7.3113, + "step": 667 + }, + { + "epoch": 0.003972785231706156, + "grad_norm": 2.565142869949341, + "learning_rate": 4.999805869820495e-05, + "loss": 7.1875, + "step": 668 + }, + { + "epoch": 0.003978732514987154, + "grad_norm": 2.582742214202881, + "learning_rate": 4.9998052872922117e-05, + "loss": 7.3251, + "step": 669 + }, + { + "epoch": 0.003984679798268151, + "grad_norm": 2.718780279159546, + "learning_rate": 4.999804703891272e-05, + "loss": 7.3599, + "step": 670 + }, + { + "epoch": 0.003990627081549149, + "grad_norm": 2.5971410274505615, + "learning_rate": 4.999804119617677e-05, + "loss": 7.2304, + "step": 671 + }, + { + "epoch": 0.003996574364830145, + "grad_norm": 2.5905725955963135, + "learning_rate": 4.9998035344714255e-05, + "loss": 7.3664, + "step": 672 + }, + { + "epoch": 0.004002521648111143, + "grad_norm": 2.659102439880371, + "learning_rate": 4.999802948452519e-05, + "loss": 7.4296, + "step": 673 + }, + { + "epoch": 0.00400846893139214, + "grad_norm": 2.5933544635772705, + "learning_rate": 4.999802361560957e-05, + "loss": 7.4605, + "step": 674 + }, + { + "epoch": 0.004014416214673138, + "grad_norm": 3.3860044479370117, + "learning_rate": 4.999801773796739e-05, + "loss": 7.5159, + "step": 675 + }, + { + "epoch": 0.004020363497954134, + "grad_norm": 3.742635726928711, + "learning_rate": 4.9998011851598666e-05, + "loss": 7.4988, + "step": 676 + }, + { + "epoch": 0.004026310781235132, + "grad_norm": 3.5960240364074707, + "learning_rate": 4.999800595650339e-05, + "loss": 7.4607, + "step": 677 + }, + { + "epoch": 0.004032258064516129, + "grad_norm": 2.654444694519043, + "learning_rate": 4.9998000052681585e-05, + "loss": 7.2166, + "step": 678 + }, + { + "epoch": 0.004038205347797127, + "grad_norm": 2.4538326263427734, + "learning_rate": 4.999799414013322e-05, + "loss": 7.2334, + "step": 679 + }, + { + "epoch": 0.004044152631078123, + "grad_norm": 2.5899672508239746, + "learning_rate": 4.9997988218858316e-05, + "loss": 7.2754, + "step": 680 + }, + { + "epoch": 0.004050099914359121, + "grad_norm": 2.721224069595337, + "learning_rate": 4.999798228885687e-05, + "loss": 7.188, + "step": 681 + }, + { + "epoch": 0.004056047197640118, + "grad_norm": 6.5863189697265625, + "learning_rate": 4.9997976350128894e-05, + "loss": 7.369, + "step": 682 + }, + { + "epoch": 0.004061994480921116, + "grad_norm": 2.6562674045562744, + "learning_rate": 4.999797040267438e-05, + "loss": 7.176, + "step": 683 + }, + { + "epoch": 0.0040679417642021124, + "grad_norm": 2.503666877746582, + "learning_rate": 4.9997964446493326e-05, + "loss": 7.2765, + "step": 684 + }, + { + "epoch": 0.00407388904748311, + "grad_norm": 9.070426940917969, + "learning_rate": 4.9997958481585756e-05, + "loss": 7.5187, + "step": 685 + }, + { + "epoch": 0.004079836330764107, + "grad_norm": 2.7480480670928955, + "learning_rate": 4.9997952507951645e-05, + "loss": 7.5244, + "step": 686 + }, + { + "epoch": 0.004085783614045104, + "grad_norm": 3.8338348865509033, + "learning_rate": 4.999794652559101e-05, + "loss": 7.6672, + "step": 687 + }, + { + "epoch": 0.0040917308973261015, + "grad_norm": 3.1132454872131348, + "learning_rate": 4.999794053450385e-05, + "loss": 7.9594, + "step": 688 + }, + { + "epoch": 0.004097678180607098, + "grad_norm": 2.6279757022857666, + "learning_rate": 4.999793453469017e-05, + "loss": 7.4737, + "step": 689 + }, + { + "epoch": 0.004103625463888096, + "grad_norm": 3.440145492553711, + "learning_rate": 4.9997928526149966e-05, + "loss": 7.2968, + "step": 690 + }, + { + "epoch": 0.004109572747169093, + "grad_norm": 2.3300867080688477, + "learning_rate": 4.9997922508883244e-05, + "loss": 7.3693, + "step": 691 + }, + { + "epoch": 0.0041155200304500905, + "grad_norm": 2.9034078121185303, + "learning_rate": 4.999791648289001e-05, + "loss": 7.7227, + "step": 692 + }, + { + "epoch": 0.004121467313731087, + "grad_norm": 2.5685503482818604, + "learning_rate": 4.9997910448170254e-05, + "loss": 7.9706, + "step": 693 + }, + { + "epoch": 0.004127414597012085, + "grad_norm": 3.260779619216919, + "learning_rate": 4.9997904404723986e-05, + "loss": 7.7231, + "step": 694 + }, + { + "epoch": 0.004133361880293082, + "grad_norm": 2.668193817138672, + "learning_rate": 4.999789835255121e-05, + "loss": 7.7677, + "step": 695 + }, + { + "epoch": 0.0041393091635740795, + "grad_norm": 2.545276641845703, + "learning_rate": 4.999789229165193e-05, + "loss": 7.9297, + "step": 696 + }, + { + "epoch": 0.004145256446855076, + "grad_norm": 3.2137503623962402, + "learning_rate": 4.9997886222026146e-05, + "loss": 7.697, + "step": 697 + }, + { + "epoch": 0.004151203730136074, + "grad_norm": 2.7501730918884277, + "learning_rate": 4.999788014367385e-05, + "loss": 7.3686, + "step": 698 + }, + { + "epoch": 0.004157151013417071, + "grad_norm": 2.2456486225128174, + "learning_rate": 4.9997874056595055e-05, + "loss": 7.7238, + "step": 699 + }, + { + "epoch": 0.0041630982966980685, + "grad_norm": 2.3958070278167725, + "learning_rate": 4.9997867960789764e-05, + "loss": 7.8349, + "step": 700 + }, + { + "epoch": 0.004169045579979065, + "grad_norm": 2.509744644165039, + "learning_rate": 4.9997861856257974e-05, + "loss": 7.5884, + "step": 701 + }, + { + "epoch": 0.004174992863260063, + "grad_norm": 3.6095783710479736, + "learning_rate": 4.9997855742999684e-05, + "loss": 7.4726, + "step": 702 + }, + { + "epoch": 0.00418094014654106, + "grad_norm": 3.3515326976776123, + "learning_rate": 4.99978496210149e-05, + "loss": 7.5214, + "step": 703 + }, + { + "epoch": 0.0041868874298220575, + "grad_norm": 4.7553791999816895, + "learning_rate": 4.999784349030363e-05, + "loss": 7.4577, + "step": 704 + }, + { + "epoch": 0.004192834713103054, + "grad_norm": 5.959117412567139, + "learning_rate": 4.9997837350865874e-05, + "loss": 7.2559, + "step": 705 + }, + { + "epoch": 0.004198781996384052, + "grad_norm": 2.9650065898895264, + "learning_rate": 4.999783120270163e-05, + "loss": 7.3712, + "step": 706 + }, + { + "epoch": 0.004204729279665049, + "grad_norm": 3.4171416759490967, + "learning_rate": 4.9997825045810895e-05, + "loss": 7.5014, + "step": 707 + }, + { + "epoch": 0.0042106765629460466, + "grad_norm": 3.297393798828125, + "learning_rate": 4.9997818880193684e-05, + "loss": 7.4553, + "step": 708 + }, + { + "epoch": 0.004216623846227043, + "grad_norm": 3.193859338760376, + "learning_rate": 4.999781270584999e-05, + "loss": 7.3414, + "step": 709 + }, + { + "epoch": 0.004222571129508041, + "grad_norm": 2.5028324127197266, + "learning_rate": 4.999780652277982e-05, + "loss": 7.4615, + "step": 710 + }, + { + "epoch": 0.004228518412789038, + "grad_norm": 3.43390154838562, + "learning_rate": 4.999780033098317e-05, + "loss": 7.3801, + "step": 711 + }, + { + "epoch": 0.004234465696070036, + "grad_norm": 3.3093984127044678, + "learning_rate": 4.999779413046004e-05, + "loss": 7.2938, + "step": 712 + }, + { + "epoch": 0.004240412979351032, + "grad_norm": 2.6643831729888916, + "learning_rate": 4.999778792121046e-05, + "loss": 7.3916, + "step": 713 + }, + { + "epoch": 0.00424636026263203, + "grad_norm": 2.779407501220703, + "learning_rate": 4.999778170323439e-05, + "loss": 7.5783, + "step": 714 + }, + { + "epoch": 0.004252307545913027, + "grad_norm": 2.959345817565918, + "learning_rate": 4.999777547653186e-05, + "loss": 7.9854, + "step": 715 + }, + { + "epoch": 0.004258254829194025, + "grad_norm": 2.9909780025482178, + "learning_rate": 4.9997769241102866e-05, + "loss": 7.997, + "step": 716 + }, + { + "epoch": 0.004264202112475021, + "grad_norm": 3.081831932067871, + "learning_rate": 4.9997762996947405e-05, + "loss": 7.9393, + "step": 717 + }, + { + "epoch": 0.004270149395756018, + "grad_norm": 2.8901429176330566, + "learning_rate": 4.9997756744065485e-05, + "loss": 7.8152, + "step": 718 + }, + { + "epoch": 0.004276096679037016, + "grad_norm": 3.3065547943115234, + "learning_rate": 4.9997750482457106e-05, + "loss": 7.1176, + "step": 719 + }, + { + "epoch": 0.004282043962318013, + "grad_norm": 3.1083710193634033, + "learning_rate": 4.9997744212122276e-05, + "loss": 7.6215, + "step": 720 + }, + { + "epoch": 0.00428799124559901, + "grad_norm": 4.010551452636719, + "learning_rate": 4.9997737933060987e-05, + "loss": 7.7665, + "step": 721 + }, + { + "epoch": 0.004293938528880007, + "grad_norm": 3.9287984371185303, + "learning_rate": 4.9997731645273245e-05, + "loss": 7.7185, + "step": 722 + }, + { + "epoch": 0.004299885812161005, + "grad_norm": 2.7739338874816895, + "learning_rate": 4.999772534875905e-05, + "loss": 7.7226, + "step": 723 + }, + { + "epoch": 0.004305833095442002, + "grad_norm": 2.675567865371704, + "learning_rate": 4.9997719043518414e-05, + "loss": 7.686, + "step": 724 + }, + { + "epoch": 0.0043117803787229994, + "grad_norm": 3.8513898849487305, + "learning_rate": 4.999771272955133e-05, + "loss": 7.6584, + "step": 725 + }, + { + "epoch": 0.004317727662003996, + "grad_norm": 10.309504508972168, + "learning_rate": 4.99977064068578e-05, + "loss": 7.4006, + "step": 726 + }, + { + "epoch": 0.004323674945284994, + "grad_norm": 2.712939977645874, + "learning_rate": 4.9997700075437836e-05, + "loss": 7.6275, + "step": 727 + }, + { + "epoch": 0.004329622228565991, + "grad_norm": 2.7880115509033203, + "learning_rate": 4.999769373529143e-05, + "loss": 7.4154, + "step": 728 + }, + { + "epoch": 0.0043355695118469885, + "grad_norm": 3.2352819442749023, + "learning_rate": 4.999768738641859e-05, + "loss": 7.4827, + "step": 729 + }, + { + "epoch": 0.004341516795127985, + "grad_norm": 3.5176644325256348, + "learning_rate": 4.999768102881931e-05, + "loss": 7.4748, + "step": 730 + }, + { + "epoch": 0.004347464078408983, + "grad_norm": 2.996829032897949, + "learning_rate": 4.99976746624936e-05, + "loss": 7.445, + "step": 731 + }, + { + "epoch": 0.00435341136168998, + "grad_norm": 4.5892534255981445, + "learning_rate": 4.9997668287441454e-05, + "loss": 7.6464, + "step": 732 + }, + { + "epoch": 0.0043593586449709775, + "grad_norm": 3.689419984817505, + "learning_rate": 4.999766190366289e-05, + "loss": 7.4215, + "step": 733 + }, + { + "epoch": 0.004365305928251974, + "grad_norm": 2.9146885871887207, + "learning_rate": 4.9997655511157896e-05, + "loss": 7.4852, + "step": 734 + }, + { + "epoch": 0.004371253211532972, + "grad_norm": 3.8503024578094482, + "learning_rate": 4.9997649109926484e-05, + "loss": 7.4779, + "step": 735 + }, + { + "epoch": 0.004377200494813969, + "grad_norm": 3.929422616958618, + "learning_rate": 4.9997642699968646e-05, + "loss": 7.3526, + "step": 736 + }, + { + "epoch": 0.0043831477780949665, + "grad_norm": 3.3365838527679443, + "learning_rate": 4.999763628128439e-05, + "loss": 7.3895, + "step": 737 + }, + { + "epoch": 0.004389095061375963, + "grad_norm": 3.147660970687866, + "learning_rate": 4.999762985387372e-05, + "loss": 7.1885, + "step": 738 + }, + { + "epoch": 0.004395042344656961, + "grad_norm": 3.3230104446411133, + "learning_rate": 4.9997623417736626e-05, + "loss": 7.5839, + "step": 739 + }, + { + "epoch": 0.004400989627937958, + "grad_norm": 3.285144090652466, + "learning_rate": 4.999761697287313e-05, + "loss": 7.4859, + "step": 740 + }, + { + "epoch": 0.0044069369112189555, + "grad_norm": 3.3811442852020264, + "learning_rate": 4.9997610519283216e-05, + "loss": 7.4871, + "step": 741 + }, + { + "epoch": 0.004412884194499952, + "grad_norm": 2.9662907123565674, + "learning_rate": 4.9997604056966904e-05, + "loss": 7.2546, + "step": 742 + }, + { + "epoch": 0.00441883147778095, + "grad_norm": 3.1432855129241943, + "learning_rate": 4.999759758592418e-05, + "loss": 7.5273, + "step": 743 + }, + { + "epoch": 0.004424778761061947, + "grad_norm": 3.0559749603271484, + "learning_rate": 4.9997591106155054e-05, + "loss": 7.0754, + "step": 744 + }, + { + "epoch": 0.0044307260443429445, + "grad_norm": 2.6778409481048584, + "learning_rate": 4.999758461765953e-05, + "loss": 7.1723, + "step": 745 + }, + { + "epoch": 0.004436673327623941, + "grad_norm": 2.592228412628174, + "learning_rate": 4.9997578120437606e-05, + "loss": 7.2671, + "step": 746 + }, + { + "epoch": 0.004442620610904939, + "grad_norm": 2.5546112060546875, + "learning_rate": 4.999757161448928e-05, + "loss": 7.2571, + "step": 747 + }, + { + "epoch": 0.004448567894185936, + "grad_norm": 2.745755672454834, + "learning_rate": 4.999756509981457e-05, + "loss": 7.3895, + "step": 748 + }, + { + "epoch": 0.004454515177466933, + "grad_norm": 2.9785144329071045, + "learning_rate": 4.999755857641346e-05, + "loss": 7.2431, + "step": 749 + }, + { + "epoch": 0.00446046246074793, + "grad_norm": 2.918891191482544, + "learning_rate": 4.9997552044285965e-05, + "loss": 7.3805, + "step": 750 + }, + { + "epoch": 0.004466409744028927, + "grad_norm": 2.7858519554138184, + "learning_rate": 4.999754550343209e-05, + "loss": 7.5942, + "step": 751 + }, + { + "epoch": 0.004472357027309925, + "grad_norm": 2.7758638858795166, + "learning_rate": 4.999753895385181e-05, + "loss": 7.5896, + "step": 752 + }, + { + "epoch": 0.004478304310590922, + "grad_norm": 2.7125916481018066, + "learning_rate": 4.999753239554517e-05, + "loss": 7.4341, + "step": 753 + }, + { + "epoch": 0.004484251593871919, + "grad_norm": 4.241726875305176, + "learning_rate": 4.999752582851214e-05, + "loss": 7.0517, + "step": 754 + }, + { + "epoch": 0.004490198877152916, + "grad_norm": 2.9547781944274902, + "learning_rate": 4.999751925275272e-05, + "loss": 7.2616, + "step": 755 + }, + { + "epoch": 0.004496146160433914, + "grad_norm": 4.2594122886657715, + "learning_rate": 4.9997512668266945e-05, + "loss": 7.3069, + "step": 756 + }, + { + "epoch": 0.004502093443714911, + "grad_norm": 4.1758246421813965, + "learning_rate": 4.9997506075054776e-05, + "loss": 7.3417, + "step": 757 + }, + { + "epoch": 0.004508040726995908, + "grad_norm": 2.8398962020874023, + "learning_rate": 4.999749947311625e-05, + "loss": 7.107, + "step": 758 + }, + { + "epoch": 0.004513988010276905, + "grad_norm": 3.487478017807007, + "learning_rate": 4.9997492862451354e-05, + "loss": 7.0014, + "step": 759 + }, + { + "epoch": 0.004519935293557903, + "grad_norm": 2.883409261703491, + "learning_rate": 4.999748624306009e-05, + "loss": 7.4691, + "step": 760 + }, + { + "epoch": 0.0045258825768389, + "grad_norm": 3.0092155933380127, + "learning_rate": 4.999747961494246e-05, + "loss": 7.3771, + "step": 761 + }, + { + "epoch": 0.004531829860119897, + "grad_norm": 2.9571943283081055, + "learning_rate": 4.999747297809847e-05, + "loss": 7.4664, + "step": 762 + }, + { + "epoch": 0.004537777143400894, + "grad_norm": 2.7476816177368164, + "learning_rate": 4.999746633252812e-05, + "loss": 7.2943, + "step": 763 + }, + { + "epoch": 0.004543724426681892, + "grad_norm": 4.903059959411621, + "learning_rate": 4.9997459678231415e-05, + "loss": 7.3467, + "step": 764 + }, + { + "epoch": 0.004549671709962889, + "grad_norm": 3.8205373287200928, + "learning_rate": 4.999745301520835e-05, + "loss": 7.2807, + "step": 765 + }, + { + "epoch": 0.0045556189932438864, + "grad_norm": 2.6003127098083496, + "learning_rate": 4.9997446343458934e-05, + "loss": 7.2736, + "step": 766 + }, + { + "epoch": 0.004561566276524883, + "grad_norm": 3.288313627243042, + "learning_rate": 4.999743966298317e-05, + "loss": 7.3832, + "step": 767 + }, + { + "epoch": 0.004567513559805881, + "grad_norm": 3.4839234352111816, + "learning_rate": 4.999743297378106e-05, + "loss": 7.2932, + "step": 768 + }, + { + "epoch": 0.004573460843086878, + "grad_norm": 3.2667462825775146, + "learning_rate": 4.99974262758526e-05, + "loss": 7.4855, + "step": 769 + }, + { + "epoch": 0.0045794081263678755, + "grad_norm": 3.3637850284576416, + "learning_rate": 4.99974195691978e-05, + "loss": 7.4864, + "step": 770 + }, + { + "epoch": 0.004585355409648872, + "grad_norm": 4.691596508026123, + "learning_rate": 4.999741285381666e-05, + "loss": 7.4751, + "step": 771 + }, + { + "epoch": 0.00459130269292987, + "grad_norm": 3.8831942081451416, + "learning_rate": 4.999740612970918e-05, + "loss": 7.4554, + "step": 772 + }, + { + "epoch": 0.004597249976210867, + "grad_norm": 2.9129562377929688, + "learning_rate": 4.999739939687536e-05, + "loss": 7.7096, + "step": 773 + }, + { + "epoch": 0.0046031972594918645, + "grad_norm": 3.928882598876953, + "learning_rate": 4.9997392655315207e-05, + "loss": 7.6453, + "step": 774 + }, + { + "epoch": 0.004609144542772861, + "grad_norm": 4.19191312789917, + "learning_rate": 4.9997385905028726e-05, + "loss": 7.6038, + "step": 775 + }, + { + "epoch": 0.004615091826053859, + "grad_norm": 2.4585883617401123, + "learning_rate": 4.999737914601591e-05, + "loss": 7.5734, + "step": 776 + }, + { + "epoch": 0.004621039109334856, + "grad_norm": 3.500932455062866, + "learning_rate": 4.9997372378276776e-05, + "loss": 7.6535, + "step": 777 + }, + { + "epoch": 0.0046269863926158535, + "grad_norm": 3.1256210803985596, + "learning_rate": 4.9997365601811306e-05, + "loss": 7.4844, + "step": 778 + }, + { + "epoch": 0.00463293367589685, + "grad_norm": 2.083902597427368, + "learning_rate": 4.999735881661952e-05, + "loss": 7.646, + "step": 779 + }, + { + "epoch": 0.004638880959177847, + "grad_norm": 2.2990450859069824, + "learning_rate": 4.999735202270142e-05, + "loss": 7.5756, + "step": 780 + }, + { + "epoch": 0.004644828242458845, + "grad_norm": 2.782463550567627, + "learning_rate": 4.9997345220057004e-05, + "loss": 7.6191, + "step": 781 + }, + { + "epoch": 0.004650775525739842, + "grad_norm": 4.157378673553467, + "learning_rate": 4.9997338408686255e-05, + "loss": 7.5265, + "step": 782 + }, + { + "epoch": 0.004656722809020839, + "grad_norm": 2.850106716156006, + "learning_rate": 4.999733158858921e-05, + "loss": 7.4562, + "step": 783 + }, + { + "epoch": 0.004662670092301836, + "grad_norm": 2.8073840141296387, + "learning_rate": 4.999732475976585e-05, + "loss": 7.3913, + "step": 784 + }, + { + "epoch": 0.004668617375582834, + "grad_norm": 2.85048770904541, + "learning_rate": 4.999731792221618e-05, + "loss": 7.3945, + "step": 785 + }, + { + "epoch": 0.004674564658863831, + "grad_norm": 2.760990619659424, + "learning_rate": 4.999731107594021e-05, + "loss": 7.6088, + "step": 786 + }, + { + "epoch": 0.004680511942144828, + "grad_norm": 2.4395666122436523, + "learning_rate": 4.9997304220937933e-05, + "loss": 7.6996, + "step": 787 + }, + { + "epoch": 0.004686459225425825, + "grad_norm": 2.5826008319854736, + "learning_rate": 4.9997297357209354e-05, + "loss": 7.5888, + "step": 788 + }, + { + "epoch": 0.004692406508706823, + "grad_norm": 3.434957981109619, + "learning_rate": 4.999729048475448e-05, + "loss": 7.4659, + "step": 789 + }, + { + "epoch": 0.00469835379198782, + "grad_norm": 4.103111743927002, + "learning_rate": 4.9997283603573306e-05, + "loss": 7.6704, + "step": 790 + }, + { + "epoch": 0.004704301075268817, + "grad_norm": 3.7879343032836914, + "learning_rate": 4.999727671366584e-05, + "loss": 7.5387, + "step": 791 + }, + { + "epoch": 0.004710248358549814, + "grad_norm": 3.706599235534668, + "learning_rate": 4.999726981503209e-05, + "loss": 7.3413, + "step": 792 + }, + { + "epoch": 0.004716195641830812, + "grad_norm": 2.1999869346618652, + "learning_rate": 4.999726290767204e-05, + "loss": 7.1809, + "step": 793 + }, + { + "epoch": 0.004722142925111809, + "grad_norm": 2.8561251163482666, + "learning_rate": 4.999725599158571e-05, + "loss": 7.3496, + "step": 794 + }, + { + "epoch": 0.004728090208392806, + "grad_norm": 3.0696613788604736, + "learning_rate": 4.99972490667731e-05, + "loss": 7.542, + "step": 795 + }, + { + "epoch": 0.004734037491673803, + "grad_norm": 2.706404685974121, + "learning_rate": 4.99972421332342e-05, + "loss": 7.4233, + "step": 796 + }, + { + "epoch": 0.004739984774954801, + "grad_norm": 2.388360023498535, + "learning_rate": 4.9997235190969025e-05, + "loss": 7.5754, + "step": 797 + }, + { + "epoch": 0.004745932058235798, + "grad_norm": 2.3414177894592285, + "learning_rate": 4.999722823997758e-05, + "loss": 7.438, + "step": 798 + }, + { + "epoch": 0.004751879341516795, + "grad_norm": 2.46012544631958, + "learning_rate": 4.999722128025985e-05, + "loss": 6.9522, + "step": 799 + }, + { + "epoch": 0.004757826624797792, + "grad_norm": 2.5721335411071777, + "learning_rate": 4.9997214311815855e-05, + "loss": 6.9632, + "step": 800 + }, + { + "epoch": 0.00476377390807879, + "grad_norm": 2.4028279781341553, + "learning_rate": 4.999720733464559e-05, + "loss": 7.3834, + "step": 801 + }, + { + "epoch": 0.004769721191359787, + "grad_norm": 2.378971576690674, + "learning_rate": 4.9997200348749055e-05, + "loss": 7.7919, + "step": 802 + }, + { + "epoch": 0.004775668474640784, + "grad_norm": 2.1871516704559326, + "learning_rate": 4.999719335412626e-05, + "loss": 7.6832, + "step": 803 + }, + { + "epoch": 0.004781615757921781, + "grad_norm": 2.4183239936828613, + "learning_rate": 4.9997186350777206e-05, + "loss": 7.5013, + "step": 804 + }, + { + "epoch": 0.004787563041202779, + "grad_norm": 2.2322120666503906, + "learning_rate": 4.9997179338701884e-05, + "loss": 7.4224, + "step": 805 + }, + { + "epoch": 0.004793510324483776, + "grad_norm": 3.2633447647094727, + "learning_rate": 4.99971723179003e-05, + "loss": 7.1966, + "step": 806 + }, + { + "epoch": 0.004799457607764773, + "grad_norm": 3.1195995807647705, + "learning_rate": 4.999716528837247e-05, + "loss": 7.4057, + "step": 807 + }, + { + "epoch": 0.00480540489104577, + "grad_norm": 2.6904098987579346, + "learning_rate": 4.9997158250118395e-05, + "loss": 7.4585, + "step": 808 + }, + { + "epoch": 0.004811352174326768, + "grad_norm": 2.6955599784851074, + "learning_rate": 4.999715120313806e-05, + "loss": 7.6053, + "step": 809 + }, + { + "epoch": 0.004817299457607765, + "grad_norm": 3.569037675857544, + "learning_rate": 4.999714414743148e-05, + "loss": 7.5085, + "step": 810 + }, + { + "epoch": 0.004823246740888762, + "grad_norm": 3.5231528282165527, + "learning_rate": 4.9997137082998655e-05, + "loss": 7.4554, + "step": 811 + }, + { + "epoch": 0.004829194024169759, + "grad_norm": 2.7118120193481445, + "learning_rate": 4.999713000983959e-05, + "loss": 7.4323, + "step": 812 + }, + { + "epoch": 0.004835141307450756, + "grad_norm": 3.229548931121826, + "learning_rate": 4.9997122927954284e-05, + "loss": 7.3098, + "step": 813 + }, + { + "epoch": 0.004841088590731754, + "grad_norm": 2.4224696159362793, + "learning_rate": 4.999711583734273e-05, + "loss": 7.3488, + "step": 814 + }, + { + "epoch": 0.004847035874012751, + "grad_norm": 2.627565383911133, + "learning_rate": 4.999710873800496e-05, + "loss": 7.457, + "step": 815 + }, + { + "epoch": 0.004852983157293748, + "grad_norm": 2.5339515209198, + "learning_rate": 4.999710162994094e-05, + "loss": 7.6602, + "step": 816 + }, + { + "epoch": 0.004858930440574745, + "grad_norm": 2.663694143295288, + "learning_rate": 4.9997094513150706e-05, + "loss": 7.1064, + "step": 817 + }, + { + "epoch": 0.004864877723855743, + "grad_norm": 2.372504472732544, + "learning_rate": 4.9997087387634234e-05, + "loss": 7.341, + "step": 818 + }, + { + "epoch": 0.00487082500713674, + "grad_norm": 2.145191192626953, + "learning_rate": 4.999708025339154e-05, + "loss": 7.3216, + "step": 819 + }, + { + "epoch": 0.004876772290417737, + "grad_norm": 2.39685320854187, + "learning_rate": 4.9997073110422626e-05, + "loss": 7.3463, + "step": 820 + }, + { + "epoch": 0.004882719573698734, + "grad_norm": 2.2227275371551514, + "learning_rate": 4.999706595872749e-05, + "loss": 7.2517, + "step": 821 + }, + { + "epoch": 0.004888666856979732, + "grad_norm": 2.7770352363586426, + "learning_rate": 4.999705879830614e-05, + "loss": 7.3117, + "step": 822 + }, + { + "epoch": 0.004894614140260729, + "grad_norm": 2.448026180267334, + "learning_rate": 4.999705162915857e-05, + "loss": 6.9883, + "step": 823 + }, + { + "epoch": 0.004900561423541726, + "grad_norm": 2.2304437160491943, + "learning_rate": 4.999704445128479e-05, + "loss": 7.2644, + "step": 824 + }, + { + "epoch": 0.004906508706822723, + "grad_norm": 2.351707696914673, + "learning_rate": 4.9997037264684796e-05, + "loss": 7.1984, + "step": 825 + }, + { + "epoch": 0.004912455990103721, + "grad_norm": 2.7631921768188477, + "learning_rate": 4.99970300693586e-05, + "loss": 7.3774, + "step": 826 + }, + { + "epoch": 0.004918403273384718, + "grad_norm": 2.4636785984039307, + "learning_rate": 4.9997022865306195e-05, + "loss": 7.3778, + "step": 827 + }, + { + "epoch": 0.004924350556665715, + "grad_norm": 3.5510878562927246, + "learning_rate": 4.999701565252759e-05, + "loss": 7.166, + "step": 828 + }, + { + "epoch": 0.004930297839946712, + "grad_norm": 3.2581429481506348, + "learning_rate": 4.999700843102278e-05, + "loss": 7.286, + "step": 829 + }, + { + "epoch": 0.00493624512322771, + "grad_norm": 2.4304182529449463, + "learning_rate": 4.999700120079178e-05, + "loss": 7.5076, + "step": 830 + }, + { + "epoch": 0.004942192406508707, + "grad_norm": 2.428854465484619, + "learning_rate": 4.999699396183458e-05, + "loss": 7.405, + "step": 831 + }, + { + "epoch": 0.004948139689789704, + "grad_norm": 2.7680416107177734, + "learning_rate": 4.9996986714151195e-05, + "loss": 7.4944, + "step": 832 + }, + { + "epoch": 0.004954086973070701, + "grad_norm": 2.6787109375, + "learning_rate": 4.999697945774161e-05, + "loss": 7.5946, + "step": 833 + }, + { + "epoch": 0.004960034256351699, + "grad_norm": 2.6396615505218506, + "learning_rate": 4.9996972192605845e-05, + "loss": 7.5405, + "step": 834 + }, + { + "epoch": 0.004965981539632696, + "grad_norm": 2.89387583732605, + "learning_rate": 4.999696491874389e-05, + "loss": 7.3809, + "step": 835 + }, + { + "epoch": 0.004971928822913693, + "grad_norm": 2.332838535308838, + "learning_rate": 4.999695763615576e-05, + "loss": 7.3638, + "step": 836 + }, + { + "epoch": 0.00497787610619469, + "grad_norm": 2.2880585193634033, + "learning_rate": 4.9996950344841444e-05, + "loss": 7.3557, + "step": 837 + }, + { + "epoch": 0.004983823389475688, + "grad_norm": 2.7478256225585938, + "learning_rate": 4.999694304480096e-05, + "loss": 7.4, + "step": 838 + }, + { + "epoch": 0.004989770672756685, + "grad_norm": 3.4789531230926514, + "learning_rate": 4.999693573603429e-05, + "loss": 7.4438, + "step": 839 + }, + { + "epoch": 0.004995717956037682, + "grad_norm": 2.7377078533172607, + "learning_rate": 4.9996928418541455e-05, + "loss": 7.4074, + "step": 840 + }, + { + "epoch": 0.005001665239318679, + "grad_norm": 3.04420804977417, + "learning_rate": 4.9996921092322444e-05, + "loss": 7.3834, + "step": 841 + }, + { + "epoch": 0.005007612522599676, + "grad_norm": 2.759244203567505, + "learning_rate": 4.999691375737727e-05, + "loss": 7.4492, + "step": 842 + }, + { + "epoch": 0.005013559805880674, + "grad_norm": 2.5327556133270264, + "learning_rate": 4.9996906413705933e-05, + "loss": 7.4403, + "step": 843 + }, + { + "epoch": 0.0050195070891616705, + "grad_norm": 2.8170409202575684, + "learning_rate": 4.9996899061308434e-05, + "loss": 7.623, + "step": 844 + }, + { + "epoch": 0.005025454372442668, + "grad_norm": 3.8642547130584717, + "learning_rate": 4.9996891700184774e-05, + "loss": 7.6099, + "step": 845 + }, + { + "epoch": 0.005031401655723665, + "grad_norm": 4.704552173614502, + "learning_rate": 4.999688433033496e-05, + "loss": 7.6755, + "step": 846 + }, + { + "epoch": 0.005037348939004663, + "grad_norm": 4.128530979156494, + "learning_rate": 4.9996876951758986e-05, + "loss": 7.5246, + "step": 847 + }, + { + "epoch": 0.0050432962222856596, + "grad_norm": 2.233447551727295, + "learning_rate": 4.9996869564456865e-05, + "loss": 7.1139, + "step": 848 + }, + { + "epoch": 0.005049243505566657, + "grad_norm": 5.96085262298584, + "learning_rate": 4.999686216842859e-05, + "loss": 7.4114, + "step": 849 + }, + { + "epoch": 0.005055190788847654, + "grad_norm": 4.828244686126709, + "learning_rate": 4.9996854763674175e-05, + "loss": 7.6743, + "step": 850 + }, + { + "epoch": 0.005061138072128652, + "grad_norm": 3.0259342193603516, + "learning_rate": 4.999684735019362e-05, + "loss": 7.7537, + "step": 851 + }, + { + "epoch": 0.005067085355409649, + "grad_norm": 2.807244062423706, + "learning_rate": 4.999683992798692e-05, + "loss": 7.7744, + "step": 852 + }, + { + "epoch": 0.005073032638690646, + "grad_norm": 2.81384015083313, + "learning_rate": 4.999683249705408e-05, + "loss": 7.2922, + "step": 853 + }, + { + "epoch": 0.005078979921971643, + "grad_norm": 2.582836627960205, + "learning_rate": 4.9996825057395105e-05, + "loss": 7.3421, + "step": 854 + }, + { + "epoch": 0.005084927205252641, + "grad_norm": 2.190634250640869, + "learning_rate": 4.9996817609009996e-05, + "loss": 7.6249, + "step": 855 + }, + { + "epoch": 0.005090874488533638, + "grad_norm": 2.3322219848632812, + "learning_rate": 4.999681015189875e-05, + "loss": 7.4695, + "step": 856 + }, + { + "epoch": 0.005096821771814635, + "grad_norm": 2.5582947731018066, + "learning_rate": 4.9996802686061384e-05, + "loss": 7.2747, + "step": 857 + }, + { + "epoch": 0.005102769055095632, + "grad_norm": 3.192093849182129, + "learning_rate": 4.999679521149789e-05, + "loss": 7.504, + "step": 858 + }, + { + "epoch": 0.00510871633837663, + "grad_norm": 4.1585588455200195, + "learning_rate": 4.999678772820827e-05, + "loss": 7.5966, + "step": 859 + }, + { + "epoch": 0.005114663621657627, + "grad_norm": 5.052750587463379, + "learning_rate": 4.999678023619253e-05, + "loss": 7.3243, + "step": 860 + }, + { + "epoch": 0.005120610904938624, + "grad_norm": 2.395909070968628, + "learning_rate": 4.999677273545068e-05, + "loss": 7.4477, + "step": 861 + }, + { + "epoch": 0.005126558188219621, + "grad_norm": 2.487334966659546, + "learning_rate": 4.999676522598271e-05, + "loss": 7.591, + "step": 862 + }, + { + "epoch": 0.005132505471500619, + "grad_norm": 3.7094171047210693, + "learning_rate": 4.999675770778863e-05, + "loss": 7.5387, + "step": 863 + }, + { + "epoch": 0.005138452754781616, + "grad_norm": 4.468298435211182, + "learning_rate": 4.9996750180868435e-05, + "loss": 7.5754, + "step": 864 + }, + { + "epoch": 0.005144400038062613, + "grad_norm": 3.2769386768341064, + "learning_rate": 4.999674264522213e-05, + "loss": 7.459, + "step": 865 + }, + { + "epoch": 0.00515034732134361, + "grad_norm": 2.7162864208221436, + "learning_rate": 4.9996735100849726e-05, + "loss": 7.3473, + "step": 866 + }, + { + "epoch": 0.005156294604624608, + "grad_norm": 3.646401882171631, + "learning_rate": 4.999672754775122e-05, + "loss": 7.4446, + "step": 867 + }, + { + "epoch": 0.005162241887905605, + "grad_norm": 8.917684555053711, + "learning_rate": 4.999671998592662e-05, + "loss": 7.2016, + "step": 868 + }, + { + "epoch": 0.005168189171186602, + "grad_norm": 2.949993133544922, + "learning_rate": 4.999671241537591e-05, + "loss": 7.3081, + "step": 869 + }, + { + "epoch": 0.005174136454467599, + "grad_norm": 2.4531025886535645, + "learning_rate": 4.999670483609912e-05, + "loss": 7.402, + "step": 870 + }, + { + "epoch": 0.005180083737748597, + "grad_norm": 3.1903798580169678, + "learning_rate": 4.999669724809623e-05, + "loss": 7.2514, + "step": 871 + }, + { + "epoch": 0.005186031021029594, + "grad_norm": 3.461353302001953, + "learning_rate": 4.999668965136726e-05, + "loss": 7.1637, + "step": 872 + }, + { + "epoch": 0.005191978304310591, + "grad_norm": 2.623075246810913, + "learning_rate": 4.9996682045912194e-05, + "loss": 7.5482, + "step": 873 + }, + { + "epoch": 0.005197925587591588, + "grad_norm": 2.9072840213775635, + "learning_rate": 4.9996674431731044e-05, + "loss": 7.484, + "step": 874 + }, + { + "epoch": 0.005203872870872585, + "grad_norm": 3.0219666957855225, + "learning_rate": 4.999666680882382e-05, + "loss": 7.5223, + "step": 875 + }, + { + "epoch": 0.005209820154153583, + "grad_norm": 2.9892475605010986, + "learning_rate": 4.9996659177190514e-05, + "loss": 7.3843, + "step": 876 + }, + { + "epoch": 0.0052157674374345795, + "grad_norm": 2.6199591159820557, + "learning_rate": 4.9996651536831126e-05, + "loss": 7.2728, + "step": 877 + }, + { + "epoch": 0.005221714720715577, + "grad_norm": 2.6897647380828857, + "learning_rate": 4.999664388774567e-05, + "loss": 7.5323, + "step": 878 + }, + { + "epoch": 0.005227662003996574, + "grad_norm": 3.5945560932159424, + "learning_rate": 4.9996636229934155e-05, + "loss": 7.5001, + "step": 879 + }, + { + "epoch": 0.005233609287277572, + "grad_norm": 2.9064812660217285, + "learning_rate": 4.9996628563396563e-05, + "loss": 7.5463, + "step": 880 + }, + { + "epoch": 0.0052395565705585685, + "grad_norm": 3.6150660514831543, + "learning_rate": 4.999662088813291e-05, + "loss": 7.6596, + "step": 881 + }, + { + "epoch": 0.005245503853839566, + "grad_norm": 2.729684591293335, + "learning_rate": 4.99966132041432e-05, + "loss": 7.5342, + "step": 882 + }, + { + "epoch": 0.005251451137120563, + "grad_norm": 2.6782853603363037, + "learning_rate": 4.9996605511427416e-05, + "loss": 7.5837, + "step": 883 + }, + { + "epoch": 0.005257398420401561, + "grad_norm": 4.171568393707275, + "learning_rate": 4.9996597809985576e-05, + "loss": 7.3626, + "step": 884 + }, + { + "epoch": 0.0052633457036825575, + "grad_norm": 2.189725637435913, + "learning_rate": 4.999659009981769e-05, + "loss": 7.5431, + "step": 885 + }, + { + "epoch": 0.005269292986963555, + "grad_norm": 2.2473320960998535, + "learning_rate": 4.999658238092375e-05, + "loss": 7.4731, + "step": 886 + }, + { + "epoch": 0.005275240270244552, + "grad_norm": 3.4393012523651123, + "learning_rate": 4.999657465330376e-05, + "loss": 7.6839, + "step": 887 + }, + { + "epoch": 0.00528118755352555, + "grad_norm": 2.717742919921875, + "learning_rate": 4.9996566916957735e-05, + "loss": 7.6812, + "step": 888 + }, + { + "epoch": 0.0052871348368065466, + "grad_norm": 3.829698085784912, + "learning_rate": 4.9996559171885655e-05, + "loss": 7.4525, + "step": 889 + }, + { + "epoch": 0.005293082120087544, + "grad_norm": 2.764598846435547, + "learning_rate": 4.9996551418087536e-05, + "loss": 7.5379, + "step": 890 + }, + { + "epoch": 0.005299029403368541, + "grad_norm": 2.4230268001556396, + "learning_rate": 4.999654365556338e-05, + "loss": 7.454, + "step": 891 + }, + { + "epoch": 0.005304976686649539, + "grad_norm": 2.31870436668396, + "learning_rate": 4.999653588431319e-05, + "loss": 7.5306, + "step": 892 + }, + { + "epoch": 0.005310923969930536, + "grad_norm": 2.332259178161621, + "learning_rate": 4.999652810433697e-05, + "loss": 7.4008, + "step": 893 + }, + { + "epoch": 0.005316871253211533, + "grad_norm": 2.630568504333496, + "learning_rate": 4.999652031563471e-05, + "loss": 7.4046, + "step": 894 + }, + { + "epoch": 0.00532281853649253, + "grad_norm": 3.327211856842041, + "learning_rate": 4.999651251820643e-05, + "loss": 7.2901, + "step": 895 + }, + { + "epoch": 0.005328765819773528, + "grad_norm": 2.2383713722229004, + "learning_rate": 4.999650471205213e-05, + "loss": 7.5116, + "step": 896 + }, + { + "epoch": 0.005334713103054525, + "grad_norm": 2.972820997238159, + "learning_rate": 4.99964968971718e-05, + "loss": 7.4013, + "step": 897 + }, + { + "epoch": 0.005340660386335522, + "grad_norm": 2.7254672050476074, + "learning_rate": 4.999648907356545e-05, + "loss": 7.3174, + "step": 898 + }, + { + "epoch": 0.005346607669616519, + "grad_norm": 2.6943607330322266, + "learning_rate": 4.9996481241233096e-05, + "loss": 7.386, + "step": 899 + }, + { + "epoch": 0.005352554952897517, + "grad_norm": 2.9217519760131836, + "learning_rate": 4.999647340017473e-05, + "loss": 7.5398, + "step": 900 + }, + { + "epoch": 0.005358502236178514, + "grad_norm": 2.7950780391693115, + "learning_rate": 4.999646555039034e-05, + "loss": 7.6336, + "step": 901 + }, + { + "epoch": 0.005364449519459511, + "grad_norm": 2.763364553451538, + "learning_rate": 4.999645769187995e-05, + "loss": 7.5161, + "step": 902 + }, + { + "epoch": 0.005370396802740508, + "grad_norm": 2.3095102310180664, + "learning_rate": 4.999644982464355e-05, + "loss": 7.5859, + "step": 903 + }, + { + "epoch": 0.005376344086021506, + "grad_norm": 2.7287917137145996, + "learning_rate": 4.999644194868115e-05, + "loss": 7.3983, + "step": 904 + }, + { + "epoch": 0.005382291369302503, + "grad_norm": 2.6175942420959473, + "learning_rate": 4.999643406399275e-05, + "loss": 7.4278, + "step": 905 + }, + { + "epoch": 0.0053882386525834994, + "grad_norm": 2.3898375034332275, + "learning_rate": 4.999642617057835e-05, + "loss": 7.4537, + "step": 906 + }, + { + "epoch": 0.005394185935864497, + "grad_norm": 2.964381694793701, + "learning_rate": 4.999641826843796e-05, + "loss": 7.3258, + "step": 907 + }, + { + "epoch": 0.005400133219145494, + "grad_norm": 3.1146717071533203, + "learning_rate": 4.999641035757158e-05, + "loss": 7.5412, + "step": 908 + }, + { + "epoch": 0.005406080502426492, + "grad_norm": 3.4733238220214844, + "learning_rate": 4.999640243797921e-05, + "loss": 7.423, + "step": 909 + }, + { + "epoch": 0.0054120277857074885, + "grad_norm": 3.621044158935547, + "learning_rate": 4.999639450966085e-05, + "loss": 7.5885, + "step": 910 + }, + { + "epoch": 0.005417975068988486, + "grad_norm": 2.4800662994384766, + "learning_rate": 4.999638657261651e-05, + "loss": 7.5231, + "step": 911 + }, + { + "epoch": 0.005423922352269483, + "grad_norm": 3.3247363567352295, + "learning_rate": 4.999637862684619e-05, + "loss": 7.2367, + "step": 912 + }, + { + "epoch": 0.005429869635550481, + "grad_norm": 4.293686866760254, + "learning_rate": 4.999637067234989e-05, + "loss": 6.8423, + "step": 913 + }, + { + "epoch": 0.0054358169188314775, + "grad_norm": 2.6713979244232178, + "learning_rate": 4.999636270912762e-05, + "loss": 6.7962, + "step": 914 + }, + { + "epoch": 0.005441764202112475, + "grad_norm": 2.9386653900146484, + "learning_rate": 4.9996354737179376e-05, + "loss": 6.7582, + "step": 915 + }, + { + "epoch": 0.005447711485393472, + "grad_norm": 2.8030481338500977, + "learning_rate": 4.999634675650516e-05, + "loss": 6.6516, + "step": 916 + }, + { + "epoch": 0.00545365876867447, + "grad_norm": 2.7315666675567627, + "learning_rate": 4.9996338767104985e-05, + "loss": 6.6159, + "step": 917 + }, + { + "epoch": 0.0054596060519554665, + "grad_norm": 3.116098403930664, + "learning_rate": 4.999633076897884e-05, + "loss": 7.2121, + "step": 918 + }, + { + "epoch": 0.005465553335236464, + "grad_norm": 2.867687940597534, + "learning_rate": 4.999632276212673e-05, + "loss": 7.5124, + "step": 919 + }, + { + "epoch": 0.005471500618517461, + "grad_norm": 2.9864203929901123, + "learning_rate": 4.9996314746548676e-05, + "loss": 7.5168, + "step": 920 + }, + { + "epoch": 0.005477447901798459, + "grad_norm": 2.9083375930786133, + "learning_rate": 4.9996306722244656e-05, + "loss": 7.5027, + "step": 921 + }, + { + "epoch": 0.0054833951850794555, + "grad_norm": 2.5569801330566406, + "learning_rate": 4.9996298689214686e-05, + "loss": 7.2988, + "step": 922 + }, + { + "epoch": 0.005489342468360453, + "grad_norm": 3.7101242542266846, + "learning_rate": 4.9996290647458765e-05, + "loss": 7.33, + "step": 923 + }, + { + "epoch": 0.00549528975164145, + "grad_norm": 2.848881244659424, + "learning_rate": 4.99962825969769e-05, + "loss": 7.4534, + "step": 924 + }, + { + "epoch": 0.005501237034922448, + "grad_norm": 3.072282075881958, + "learning_rate": 4.999627453776909e-05, + "loss": 7.4398, + "step": 925 + }, + { + "epoch": 0.0055071843182034445, + "grad_norm": 2.8132996559143066, + "learning_rate": 4.999626646983534e-05, + "loss": 7.5617, + "step": 926 + }, + { + "epoch": 0.005513131601484442, + "grad_norm": 2.2710142135620117, + "learning_rate": 4.999625839317565e-05, + "loss": 7.5975, + "step": 927 + }, + { + "epoch": 0.005519078884765439, + "grad_norm": 2.745007276535034, + "learning_rate": 4.9996250307790026e-05, + "loss": 7.4599, + "step": 928 + }, + { + "epoch": 0.005525026168046437, + "grad_norm": 3.2031302452087402, + "learning_rate": 4.999624221367847e-05, + "loss": 7.3528, + "step": 929 + }, + { + "epoch": 0.0055309734513274336, + "grad_norm": 6.417830467224121, + "learning_rate": 4.999623411084098e-05, + "loss": 7.5118, + "step": 930 + }, + { + "epoch": 0.005536920734608431, + "grad_norm": 2.7960314750671387, + "learning_rate": 4.999622599927756e-05, + "loss": 6.5016, + "step": 931 + }, + { + "epoch": 0.005542868017889428, + "grad_norm": 2.959507703781128, + "learning_rate": 4.999621787898822e-05, + "loss": 7.6521, + "step": 932 + }, + { + "epoch": 0.005548815301170426, + "grad_norm": 3.328834056854248, + "learning_rate": 4.999620974997296e-05, + "loss": 7.6267, + "step": 933 + }, + { + "epoch": 0.005554762584451423, + "grad_norm": 2.5232200622558594, + "learning_rate": 4.9996201612231786e-05, + "loss": 7.471, + "step": 934 + }, + { + "epoch": 0.00556070986773242, + "grad_norm": 2.2766942977905273, + "learning_rate": 4.999619346576468e-05, + "loss": 7.4204, + "step": 935 + }, + { + "epoch": 0.005566657151013417, + "grad_norm": 2.584068536758423, + "learning_rate": 4.999618531057168e-05, + "loss": 7.4384, + "step": 936 + }, + { + "epoch": 0.005572604434294414, + "grad_norm": 3.004523277282715, + "learning_rate": 4.999617714665276e-05, + "loss": 7.5681, + "step": 937 + }, + { + "epoch": 0.005578551717575412, + "grad_norm": 4.102936267852783, + "learning_rate": 4.999616897400794e-05, + "loss": 7.4571, + "step": 938 + }, + { + "epoch": 0.005584499000856408, + "grad_norm": 2.745293378829956, + "learning_rate": 4.99961607926372e-05, + "loss": 7.588, + "step": 939 + }, + { + "epoch": 0.005590446284137406, + "grad_norm": 2.9720282554626465, + "learning_rate": 4.9996152602540576e-05, + "loss": 7.4761, + "step": 940 + }, + { + "epoch": 0.005596393567418403, + "grad_norm": 3.150047540664673, + "learning_rate": 4.999614440371805e-05, + "loss": 7.4525, + "step": 941 + }, + { + "epoch": 0.005602340850699401, + "grad_norm": 2.6735856533050537, + "learning_rate": 4.999613619616962e-05, + "loss": 7.2754, + "step": 942 + }, + { + "epoch": 0.005608288133980397, + "grad_norm": 2.6451661586761475, + "learning_rate": 4.9996127979895304e-05, + "loss": 7.5742, + "step": 943 + }, + { + "epoch": 0.005614235417261395, + "grad_norm": 2.7551536560058594, + "learning_rate": 4.9996119754895095e-05, + "loss": 7.4981, + "step": 944 + }, + { + "epoch": 0.005620182700542392, + "grad_norm": 2.7445640563964844, + "learning_rate": 4.9996111521168995e-05, + "loss": 7.4761, + "step": 945 + }, + { + "epoch": 0.00562612998382339, + "grad_norm": 2.537924289703369, + "learning_rate": 4.9996103278717013e-05, + "loss": 7.5483, + "step": 946 + }, + { + "epoch": 0.0056320772671043864, + "grad_norm": 3.503661632537842, + "learning_rate": 4.9996095027539156e-05, + "loss": 7.3074, + "step": 947 + }, + { + "epoch": 0.005638024550385384, + "grad_norm": 2.8088479042053223, + "learning_rate": 4.999608676763542e-05, + "loss": 7.5675, + "step": 948 + }, + { + "epoch": 0.005643971833666381, + "grad_norm": 2.6219863891601562, + "learning_rate": 4.99960784990058e-05, + "loss": 7.6037, + "step": 949 + }, + { + "epoch": 0.005649919116947379, + "grad_norm": 2.88737416267395, + "learning_rate": 4.999607022165031e-05, + "loss": 7.4815, + "step": 950 + }, + { + "epoch": 0.0056558664002283755, + "grad_norm": 2.455707550048828, + "learning_rate": 4.999606193556895e-05, + "loss": 7.553, + "step": 951 + }, + { + "epoch": 0.005661813683509373, + "grad_norm": 2.2502405643463135, + "learning_rate": 4.999605364076173e-05, + "loss": 7.387, + "step": 952 + }, + { + "epoch": 0.00566776096679037, + "grad_norm": 2.754972457885742, + "learning_rate": 4.9996045337228635e-05, + "loss": 7.3088, + "step": 953 + }, + { + "epoch": 0.005673708250071368, + "grad_norm": 3.111553192138672, + "learning_rate": 4.9996037024969686e-05, + "loss": 7.5063, + "step": 954 + }, + { + "epoch": 0.0056796555333523645, + "grad_norm": 2.4000720977783203, + "learning_rate": 4.9996028703984875e-05, + "loss": 7.5705, + "step": 955 + }, + { + "epoch": 0.005685602816633362, + "grad_norm": 2.495659351348877, + "learning_rate": 4.9996020374274215e-05, + "loss": 7.5421, + "step": 956 + }, + { + "epoch": 0.005691550099914359, + "grad_norm": 3.025509834289551, + "learning_rate": 4.99960120358377e-05, + "loss": 7.5406, + "step": 957 + }, + { + "epoch": 0.005697497383195357, + "grad_norm": 2.224342107772827, + "learning_rate": 4.999600368867533e-05, + "loss": 7.4323, + "step": 958 + }, + { + "epoch": 0.0057034446664763535, + "grad_norm": 2.661423683166504, + "learning_rate": 4.999599533278712e-05, + "loss": 7.565, + "step": 959 + }, + { + "epoch": 0.005709391949757351, + "grad_norm": 2.503293037414551, + "learning_rate": 4.999598696817307e-05, + "loss": 7.3552, + "step": 960 + }, + { + "epoch": 0.005715339233038348, + "grad_norm": 2.2878923416137695, + "learning_rate": 4.999597859483316e-05, + "loss": 7.4542, + "step": 961 + }, + { + "epoch": 0.005721286516319346, + "grad_norm": 2.759594678878784, + "learning_rate": 4.999597021276743e-05, + "loss": 7.2349, + "step": 962 + }, + { + "epoch": 0.0057272337996003425, + "grad_norm": 4.5453314781188965, + "learning_rate": 4.999596182197586e-05, + "loss": 7.4728, + "step": 963 + }, + { + "epoch": 0.00573318108288134, + "grad_norm": 2.4369568824768066, + "learning_rate": 4.999595342245846e-05, + "loss": 7.4396, + "step": 964 + }, + { + "epoch": 0.005739128366162337, + "grad_norm": 2.4081692695617676, + "learning_rate": 4.999594501421523e-05, + "loss": 7.536, + "step": 965 + }, + { + "epoch": 0.005745075649443335, + "grad_norm": 3.0494678020477295, + "learning_rate": 4.9995936597246176e-05, + "loss": 7.4061, + "step": 966 + }, + { + "epoch": 0.0057510229327243315, + "grad_norm": 3.3492188453674316, + "learning_rate": 4.999592817155129e-05, + "loss": 7.5419, + "step": 967 + }, + { + "epoch": 0.005756970216005328, + "grad_norm": 2.254714012145996, + "learning_rate": 4.999591973713059e-05, + "loss": 7.4568, + "step": 968 + }, + { + "epoch": 0.005762917499286326, + "grad_norm": 2.3336634635925293, + "learning_rate": 4.999591129398407e-05, + "loss": 7.4386, + "step": 969 + }, + { + "epoch": 0.005768864782567323, + "grad_norm": 2.545154094696045, + "learning_rate": 4.999590284211174e-05, + "loss": 7.226, + "step": 970 + }, + { + "epoch": 0.0057748120658483205, + "grad_norm": 2.891068458557129, + "learning_rate": 4.99958943815136e-05, + "loss": 7.4235, + "step": 971 + }, + { + "epoch": 0.005780759349129317, + "grad_norm": 3.0321712493896484, + "learning_rate": 4.999588591218964e-05, + "loss": 7.2918, + "step": 972 + }, + { + "epoch": 0.005786706632410315, + "grad_norm": 2.935490846633911, + "learning_rate": 4.9995877434139884e-05, + "loss": 7.4172, + "step": 973 + }, + { + "epoch": 0.005792653915691312, + "grad_norm": 3.0021424293518066, + "learning_rate": 4.9995868947364324e-05, + "loss": 7.521, + "step": 974 + }, + { + "epoch": 0.0057986011989723096, + "grad_norm": 2.2784783840179443, + "learning_rate": 4.9995860451862964e-05, + "loss": 7.5716, + "step": 975 + }, + { + "epoch": 0.005804548482253306, + "grad_norm": 2.9321484565734863, + "learning_rate": 4.999585194763581e-05, + "loss": 7.0965, + "step": 976 + }, + { + "epoch": 0.005810495765534304, + "grad_norm": 2.284874439239502, + "learning_rate": 4.999584343468285e-05, + "loss": 7.4376, + "step": 977 + }, + { + "epoch": 0.005816443048815301, + "grad_norm": 2.2066683769226074, + "learning_rate": 4.9995834913004115e-05, + "loss": 7.4478, + "step": 978 + }, + { + "epoch": 0.005822390332096299, + "grad_norm": 2.286323070526123, + "learning_rate": 4.999582638259959e-05, + "loss": 7.4139, + "step": 979 + }, + { + "epoch": 0.005828337615377295, + "grad_norm": 2.5052928924560547, + "learning_rate": 4.999581784346927e-05, + "loss": 7.4278, + "step": 980 + }, + { + "epoch": 0.005834284898658293, + "grad_norm": 2.273698091506958, + "learning_rate": 4.9995809295613175e-05, + "loss": 7.4019, + "step": 981 + }, + { + "epoch": 0.00584023218193929, + "grad_norm": 2.729466676712036, + "learning_rate": 4.999580073903129e-05, + "loss": 7.4716, + "step": 982 + }, + { + "epoch": 0.005846179465220288, + "grad_norm": 2.5776185989379883, + "learning_rate": 4.999579217372365e-05, + "loss": 7.4708, + "step": 983 + }, + { + "epoch": 0.005852126748501284, + "grad_norm": 2.4125893115997314, + "learning_rate": 4.9995783599690226e-05, + "loss": 7.4505, + "step": 984 + }, + { + "epoch": 0.005858074031782282, + "grad_norm": 2.975911855697632, + "learning_rate": 4.9995775016931035e-05, + "loss": 7.4095, + "step": 985 + }, + { + "epoch": 0.005864021315063279, + "grad_norm": 2.4155962467193604, + "learning_rate": 4.9995766425446076e-05, + "loss": 7.3084, + "step": 986 + }, + { + "epoch": 0.005869968598344277, + "grad_norm": 2.436950922012329, + "learning_rate": 4.999575782523535e-05, + "loss": 7.2782, + "step": 987 + }, + { + "epoch": 0.0058759158816252734, + "grad_norm": 2.2371575832366943, + "learning_rate": 4.999574921629887e-05, + "loss": 7.3879, + "step": 988 + }, + { + "epoch": 0.005881863164906271, + "grad_norm": 2.3079733848571777, + "learning_rate": 4.999574059863663e-05, + "loss": 7.5117, + "step": 989 + }, + { + "epoch": 0.005887810448187268, + "grad_norm": 2.4018514156341553, + "learning_rate": 4.9995731972248626e-05, + "loss": 7.4486, + "step": 990 + }, + { + "epoch": 0.005893757731468266, + "grad_norm": 2.3437294960021973, + "learning_rate": 4.9995723337134884e-05, + "loss": 7.461, + "step": 991 + }, + { + "epoch": 0.0058997050147492625, + "grad_norm": 3.15254545211792, + "learning_rate": 4.999571469329538e-05, + "loss": 7.014, + "step": 992 + }, + { + "epoch": 0.00590565229803026, + "grad_norm": 2.4809768199920654, + "learning_rate": 4.999570604073014e-05, + "loss": 7.4339, + "step": 993 + }, + { + "epoch": 0.005911599581311257, + "grad_norm": 3.4286630153656006, + "learning_rate": 4.9995697379439154e-05, + "loss": 7.3086, + "step": 994 + }, + { + "epoch": 0.005917546864592255, + "grad_norm": 3.9362127780914307, + "learning_rate": 4.999568870942243e-05, + "loss": 7.2635, + "step": 995 + }, + { + "epoch": 0.0059234941478732515, + "grad_norm": 2.6632091999053955, + "learning_rate": 4.9995680030679965e-05, + "loss": 7.2779, + "step": 996 + }, + { + "epoch": 0.005929441431154249, + "grad_norm": 5.218096733093262, + "learning_rate": 4.999567134321177e-05, + "loss": 7.4285, + "step": 997 + }, + { + "epoch": 0.005935388714435246, + "grad_norm": 3.441894769668579, + "learning_rate": 4.9995662647017835e-05, + "loss": 7.5576, + "step": 998 + }, + { + "epoch": 0.005941335997716243, + "grad_norm": 2.560178279876709, + "learning_rate": 4.9995653942098184e-05, + "loss": 7.5692, + "step": 999 + }, + { + "epoch": 0.0059472832809972405, + "grad_norm": 2.458313226699829, + "learning_rate": 4.999564522845281e-05, + "loss": 7.0495, + "step": 1000 + }, + { + "epoch": 0.005953230564278237, + "grad_norm": 2.539314031600952, + "learning_rate": 4.999563650608171e-05, + "loss": 7.1919, + "step": 1001 + }, + { + "epoch": 0.005959177847559235, + "grad_norm": 3.6134390830993652, + "learning_rate": 4.999562777498489e-05, + "loss": 7.0725, + "step": 1002 + }, + { + "epoch": 0.005965125130840232, + "grad_norm": 2.6582295894622803, + "learning_rate": 4.9995619035162355e-05, + "loss": 7.3008, + "step": 1003 + }, + { + "epoch": 0.0059710724141212295, + "grad_norm": 2.4968035221099854, + "learning_rate": 4.999561028661411e-05, + "loss": 7.2862, + "step": 1004 + }, + { + "epoch": 0.005977019697402226, + "grad_norm": 3.002840042114258, + "learning_rate": 4.999560152934015e-05, + "loss": 7.1721, + "step": 1005 + }, + { + "epoch": 0.005982966980683224, + "grad_norm": 3.4327914714813232, + "learning_rate": 4.999559276334049e-05, + "loss": 7.242, + "step": 1006 + }, + { + "epoch": 0.005988914263964221, + "grad_norm": 2.4082493782043457, + "learning_rate": 4.999558398861513e-05, + "loss": 7.1588, + "step": 1007 + }, + { + "epoch": 0.0059948615472452185, + "grad_norm": 2.39475417137146, + "learning_rate": 4.9995575205164056e-05, + "loss": 7.1713, + "step": 1008 + }, + { + "epoch": 0.006000808830526215, + "grad_norm": 2.946331024169922, + "learning_rate": 4.99955664129873e-05, + "loss": 7.1553, + "step": 1009 + }, + { + "epoch": 0.006006756113807213, + "grad_norm": 2.4334871768951416, + "learning_rate": 4.999555761208484e-05, + "loss": 7.1898, + "step": 1010 + }, + { + "epoch": 0.00601270339708821, + "grad_norm": 2.3159971237182617, + "learning_rate": 4.999554880245669e-05, + "loss": 7.0642, + "step": 1011 + }, + { + "epoch": 0.0060186506803692075, + "grad_norm": 2.9773905277252197, + "learning_rate": 4.9995539984102854e-05, + "loss": 7.3285, + "step": 1012 + }, + { + "epoch": 0.006024597963650204, + "grad_norm": 3.444267749786377, + "learning_rate": 4.999553115702334e-05, + "loss": 7.1263, + "step": 1013 + }, + { + "epoch": 0.006030545246931202, + "grad_norm": 2.6518173217773438, + "learning_rate": 4.9995522321218136e-05, + "loss": 7.3915, + "step": 1014 + }, + { + "epoch": 0.006036492530212199, + "grad_norm": 2.46230149269104, + "learning_rate": 4.9995513476687254e-05, + "loss": 7.1808, + "step": 1015 + }, + { + "epoch": 0.0060424398134931966, + "grad_norm": 2.2243192195892334, + "learning_rate": 4.99955046234307e-05, + "loss": 7.4262, + "step": 1016 + }, + { + "epoch": 0.006048387096774193, + "grad_norm": 3.0834670066833496, + "learning_rate": 4.999549576144847e-05, + "loss": 7.4028, + "step": 1017 + }, + { + "epoch": 0.006054334380055191, + "grad_norm": 3.2453930377960205, + "learning_rate": 4.9995486890740573e-05, + "loss": 7.5537, + "step": 1018 + }, + { + "epoch": 0.006060281663336188, + "grad_norm": 2.7142229080200195, + "learning_rate": 4.9995478011307015e-05, + "loss": 7.4131, + "step": 1019 + }, + { + "epoch": 0.006066228946617186, + "grad_norm": 2.9567463397979736, + "learning_rate": 4.9995469123147784e-05, + "loss": 7.5969, + "step": 1020 + }, + { + "epoch": 0.006072176229898182, + "grad_norm": 2.5698695182800293, + "learning_rate": 4.99954602262629e-05, + "loss": 7.2721, + "step": 1021 + }, + { + "epoch": 0.00607812351317918, + "grad_norm": 2.3958864212036133, + "learning_rate": 4.999545132065235e-05, + "loss": 7.3414, + "step": 1022 + }, + { + "epoch": 0.006084070796460177, + "grad_norm": 2.528024911880493, + "learning_rate": 4.9995442406316156e-05, + "loss": 7.2821, + "step": 1023 + }, + { + "epoch": 0.006090018079741175, + "grad_norm": 2.6904075145721436, + "learning_rate": 4.999543348325431e-05, + "loss": 7.3726, + "step": 1024 + }, + { + "epoch": 0.006095965363022171, + "grad_norm": 2.8618202209472656, + "learning_rate": 4.999542455146681e-05, + "loss": 7.4232, + "step": 1025 + }, + { + "epoch": 0.006101912646303169, + "grad_norm": 1.978455662727356, + "learning_rate": 4.999541561095367e-05, + "loss": 7.5949, + "step": 1026 + }, + { + "epoch": 0.006107859929584166, + "grad_norm": 2.882568836212158, + "learning_rate": 4.999540666171489e-05, + "loss": 7.4868, + "step": 1027 + }, + { + "epoch": 0.006113807212865164, + "grad_norm": 2.9586474895477295, + "learning_rate": 4.999539770375047e-05, + "loss": 7.1556, + "step": 1028 + }, + { + "epoch": 0.00611975449614616, + "grad_norm": 2.5675363540649414, + "learning_rate": 4.999538873706041e-05, + "loss": 7.3306, + "step": 1029 + }, + { + "epoch": 0.006125701779427157, + "grad_norm": 3.440857410430908, + "learning_rate": 4.999537976164472e-05, + "loss": 7.3654, + "step": 1030 + }, + { + "epoch": 0.006131649062708155, + "grad_norm": 3.7741217613220215, + "learning_rate": 4.999537077750341e-05, + "loss": 6.8088, + "step": 1031 + }, + { + "epoch": 0.006137596345989152, + "grad_norm": 3.801609754562378, + "learning_rate": 4.999536178463647e-05, + "loss": 6.989, + "step": 1032 + }, + { + "epoch": 0.0061435436292701495, + "grad_norm": 2.627225875854492, + "learning_rate": 4.9995352783043905e-05, + "loss": 7.4066, + "step": 1033 + }, + { + "epoch": 0.006149490912551146, + "grad_norm": 3.3529040813446045, + "learning_rate": 4.9995343772725725e-05, + "loss": 7.0403, + "step": 1034 + }, + { + "epoch": 0.006155438195832144, + "grad_norm": 3.248558521270752, + "learning_rate": 4.999533475368192e-05, + "loss": 7.2664, + "step": 1035 + }, + { + "epoch": 0.006161385479113141, + "grad_norm": 3.1260814666748047, + "learning_rate": 4.9995325725912515e-05, + "loss": 7.3257, + "step": 1036 + }, + { + "epoch": 0.0061673327623941385, + "grad_norm": 2.379659414291382, + "learning_rate": 4.999531668941748e-05, + "loss": 7.4448, + "step": 1037 + }, + { + "epoch": 0.006173280045675135, + "grad_norm": 2.8478498458862305, + "learning_rate": 4.999530764419685e-05, + "loss": 7.3892, + "step": 1038 + }, + { + "epoch": 0.006179227328956133, + "grad_norm": 4.104954719543457, + "learning_rate": 4.999529859025062e-05, + "loss": 7.5172, + "step": 1039 + }, + { + "epoch": 0.00618517461223713, + "grad_norm": 2.50160813331604, + "learning_rate": 4.999528952757879e-05, + "loss": 7.1894, + "step": 1040 + }, + { + "epoch": 0.0061911218955181275, + "grad_norm": 2.5545871257781982, + "learning_rate": 4.999528045618136e-05, + "loss": 7.3892, + "step": 1041 + }, + { + "epoch": 0.006197069178799124, + "grad_norm": 2.9980626106262207, + "learning_rate": 4.999527137605833e-05, + "loss": 7.3517, + "step": 1042 + }, + { + "epoch": 0.006203016462080122, + "grad_norm": 2.5920562744140625, + "learning_rate": 4.999526228720971e-05, + "loss": 7.1716, + "step": 1043 + }, + { + "epoch": 0.006208963745361119, + "grad_norm": 2.5224244594573975, + "learning_rate": 4.999525318963551e-05, + "loss": 7.1892, + "step": 1044 + }, + { + "epoch": 0.0062149110286421165, + "grad_norm": 2.7092106342315674, + "learning_rate": 4.999524408333572e-05, + "loss": 7.178, + "step": 1045 + }, + { + "epoch": 0.006220858311923113, + "grad_norm": 2.523320198059082, + "learning_rate": 4.999523496831035e-05, + "loss": 7.1486, + "step": 1046 + }, + { + "epoch": 0.006226805595204111, + "grad_norm": 2.4491217136383057, + "learning_rate": 4.99952258445594e-05, + "loss": 7.121, + "step": 1047 + }, + { + "epoch": 0.006232752878485108, + "grad_norm": 2.29109263420105, + "learning_rate": 4.9995216712082875e-05, + "loss": 7.4323, + "step": 1048 + }, + { + "epoch": 0.0062387001617661055, + "grad_norm": 2.5234057903289795, + "learning_rate": 4.9995207570880783e-05, + "loss": 7.1552, + "step": 1049 + }, + { + "epoch": 0.006244647445047102, + "grad_norm": 2.301316499710083, + "learning_rate": 4.9995198420953115e-05, + "loss": 7.3625, + "step": 1050 + }, + { + "epoch": 0.0062505947283281, + "grad_norm": 2.4358527660369873, + "learning_rate": 4.999518926229989e-05, + "loss": 7.2462, + "step": 1051 + }, + { + "epoch": 0.006256542011609097, + "grad_norm": 2.3915181159973145, + "learning_rate": 4.999518009492109e-05, + "loss": 7.173, + "step": 1052 + }, + { + "epoch": 0.0062624892948900945, + "grad_norm": 2.5529091358184814, + "learning_rate": 4.999517091881674e-05, + "loss": 7.2463, + "step": 1053 + }, + { + "epoch": 0.006268436578171091, + "grad_norm": 3.235435724258423, + "learning_rate": 4.999516173398683e-05, + "loss": 7.1149, + "step": 1054 + }, + { + "epoch": 0.006274383861452089, + "grad_norm": 2.692140817642212, + "learning_rate": 4.9995152540431375e-05, + "loss": 7.3554, + "step": 1055 + }, + { + "epoch": 0.006280331144733086, + "grad_norm": 2.910116195678711, + "learning_rate": 4.999514333815036e-05, + "loss": 7.4424, + "step": 1056 + }, + { + "epoch": 0.0062862784280140836, + "grad_norm": 2.897463798522949, + "learning_rate": 4.9995134127143804e-05, + "loss": 7.2345, + "step": 1057 + }, + { + "epoch": 0.00629222571129508, + "grad_norm": 2.5925514698028564, + "learning_rate": 4.999512490741171e-05, + "loss": 7.1539, + "step": 1058 + }, + { + "epoch": 0.006298172994576078, + "grad_norm": 2.693816900253296, + "learning_rate": 4.999511567895407e-05, + "loss": 7.0905, + "step": 1059 + }, + { + "epoch": 0.006304120277857075, + "grad_norm": 3.3717474937438965, + "learning_rate": 4.9995106441770896e-05, + "loss": 7.1407, + "step": 1060 + }, + { + "epoch": 0.006310067561138072, + "grad_norm": 2.6128973960876465, + "learning_rate": 4.999509719586218e-05, + "loss": 7.2748, + "step": 1061 + }, + { + "epoch": 0.006316014844419069, + "grad_norm": 2.24324369430542, + "learning_rate": 4.999508794122795e-05, + "loss": 7.2553, + "step": 1062 + }, + { + "epoch": 0.006321962127700066, + "grad_norm": 2.7593698501586914, + "learning_rate": 4.999507867786818e-05, + "loss": 7.1039, + "step": 1063 + }, + { + "epoch": 0.006327909410981064, + "grad_norm": 2.6210618019104004, + "learning_rate": 4.999506940578289e-05, + "loss": 7.0247, + "step": 1064 + }, + { + "epoch": 0.006333856694262061, + "grad_norm": 2.410187244415283, + "learning_rate": 4.9995060124972084e-05, + "loss": 7.3931, + "step": 1065 + }, + { + "epoch": 0.006339803977543058, + "grad_norm": 2.795302391052246, + "learning_rate": 4.999505083543575e-05, + "loss": 7.3168, + "step": 1066 + }, + { + "epoch": 0.006345751260824055, + "grad_norm": 2.3720662593841553, + "learning_rate": 4.999504153717391e-05, + "loss": 7.3719, + "step": 1067 + }, + { + "epoch": 0.006351698544105053, + "grad_norm": 2.721585988998413, + "learning_rate": 4.9995032230186556e-05, + "loss": 7.3847, + "step": 1068 + }, + { + "epoch": 0.00635764582738605, + "grad_norm": 2.967153549194336, + "learning_rate": 4.99950229144737e-05, + "loss": 7.3224, + "step": 1069 + }, + { + "epoch": 0.006363593110667047, + "grad_norm": 3.8144783973693848, + "learning_rate": 4.999501359003533e-05, + "loss": 7.0767, + "step": 1070 + }, + { + "epoch": 0.006369540393948044, + "grad_norm": 3.7694199085235596, + "learning_rate": 4.999500425687147e-05, + "loss": 7.4486, + "step": 1071 + }, + { + "epoch": 0.006375487677229042, + "grad_norm": 2.9668312072753906, + "learning_rate": 4.999499491498211e-05, + "loss": 7.3415, + "step": 1072 + }, + { + "epoch": 0.006381434960510039, + "grad_norm": 4.196050643920898, + "learning_rate": 4.999498556436725e-05, + "loss": 7.3784, + "step": 1073 + }, + { + "epoch": 0.0063873822437910364, + "grad_norm": 4.676602363586426, + "learning_rate": 4.99949762050269e-05, + "loss": 7.3773, + "step": 1074 + }, + { + "epoch": 0.006393329527072033, + "grad_norm": 2.8828656673431396, + "learning_rate": 4.999496683696107e-05, + "loss": 7.2359, + "step": 1075 + }, + { + "epoch": 0.006399276810353031, + "grad_norm": 2.7532308101654053, + "learning_rate": 4.9994957460169745e-05, + "loss": 7.356, + "step": 1076 + }, + { + "epoch": 0.006405224093634028, + "grad_norm": 5.535451412200928, + "learning_rate": 4.999494807465293e-05, + "loss": 7.261, + "step": 1077 + }, + { + "epoch": 0.0064111713769150255, + "grad_norm": 3.6439530849456787, + "learning_rate": 4.999493868041066e-05, + "loss": 7.4664, + "step": 1078 + }, + { + "epoch": 0.006417118660196022, + "grad_norm": 3.563948154449463, + "learning_rate": 4.99949292774429e-05, + "loss": 7.0427, + "step": 1079 + }, + { + "epoch": 0.00642306594347702, + "grad_norm": 3.6243784427642822, + "learning_rate": 4.9994919865749675e-05, + "loss": 7.3292, + "step": 1080 + }, + { + "epoch": 0.006429013226758017, + "grad_norm": 5.1197590827941895, + "learning_rate": 4.999491044533098e-05, + "loss": 7.3717, + "step": 1081 + }, + { + "epoch": 0.0064349605100390145, + "grad_norm": 4.3969902992248535, + "learning_rate": 4.999490101618682e-05, + "loss": 7.2875, + "step": 1082 + }, + { + "epoch": 0.006440907793320011, + "grad_norm": 2.6302945613861084, + "learning_rate": 4.999489157831719e-05, + "loss": 7.1958, + "step": 1083 + }, + { + "epoch": 0.006446855076601009, + "grad_norm": 3.782078504562378, + "learning_rate": 4.9994882131722116e-05, + "loss": 7.2951, + "step": 1084 + }, + { + "epoch": 0.006452802359882006, + "grad_norm": 3.432082414627075, + "learning_rate": 4.999487267640158e-05, + "loss": 7.0974, + "step": 1085 + }, + { + "epoch": 0.0064587496431630035, + "grad_norm": 3.364793300628662, + "learning_rate": 4.999486321235559e-05, + "loss": 7.0847, + "step": 1086 + }, + { + "epoch": 0.006464696926444, + "grad_norm": 2.7063019275665283, + "learning_rate": 4.999485373958416e-05, + "loss": 7.1421, + "step": 1087 + }, + { + "epoch": 0.006470644209724998, + "grad_norm": 3.0648648738861084, + "learning_rate": 4.999484425808727e-05, + "loss": 7.2723, + "step": 1088 + }, + { + "epoch": 0.006476591493005995, + "grad_norm": 3.3968300819396973, + "learning_rate": 4.999483476786495e-05, + "loss": 7.1438, + "step": 1089 + }, + { + "epoch": 0.0064825387762869925, + "grad_norm": 2.864647150039673, + "learning_rate": 4.999482526891719e-05, + "loss": 7.1512, + "step": 1090 + }, + { + "epoch": 0.006488486059567989, + "grad_norm": 2.577043056488037, + "learning_rate": 4.999481576124399e-05, + "loss": 6.8914, + "step": 1091 + }, + { + "epoch": 0.006494433342848986, + "grad_norm": 2.83754563331604, + "learning_rate": 4.999480624484536e-05, + "loss": 6.9999, + "step": 1092 + }, + { + "epoch": 0.006500380626129984, + "grad_norm": 3.5623857975006104, + "learning_rate": 4.999479671972131e-05, + "loss": 7.0567, + "step": 1093 + }, + { + "epoch": 0.006506327909410981, + "grad_norm": 2.35555362701416, + "learning_rate": 4.9994787185871814e-05, + "loss": 7.3075, + "step": 1094 + }, + { + "epoch": 0.006512275192691978, + "grad_norm": 3.8677117824554443, + "learning_rate": 4.9994777643296914e-05, + "loss": 7.3608, + "step": 1095 + }, + { + "epoch": 0.006518222475972975, + "grad_norm": 3.8163843154907227, + "learning_rate": 4.999476809199659e-05, + "loss": 7.4368, + "step": 1096 + }, + { + "epoch": 0.006524169759253973, + "grad_norm": 2.5424652099609375, + "learning_rate": 4.999475853197085e-05, + "loss": 7.4968, + "step": 1097 + }, + { + "epoch": 0.00653011704253497, + "grad_norm": 2.876898765563965, + "learning_rate": 4.99947489632197e-05, + "loss": 6.9948, + "step": 1098 + }, + { + "epoch": 0.006536064325815967, + "grad_norm": 3.3934860229492188, + "learning_rate": 4.999473938574314e-05, + "loss": 6.9588, + "step": 1099 + }, + { + "epoch": 0.006542011609096964, + "grad_norm": 2.1184024810791016, + "learning_rate": 4.9994729799541176e-05, + "loss": 7.1933, + "step": 1100 + }, + { + "epoch": 0.006547958892377962, + "grad_norm": 2.2882895469665527, + "learning_rate": 4.999472020461381e-05, + "loss": 7.0796, + "step": 1101 + }, + { + "epoch": 0.006553906175658959, + "grad_norm": 3.239429235458374, + "learning_rate": 4.9994710600961045e-05, + "loss": 6.9535, + "step": 1102 + }, + { + "epoch": 0.006559853458939956, + "grad_norm": 2.4653263092041016, + "learning_rate": 4.9994700988582884e-05, + "loss": 6.9316, + "step": 1103 + }, + { + "epoch": 0.006565800742220953, + "grad_norm": 2.511516571044922, + "learning_rate": 4.999469136747933e-05, + "loss": 6.9844, + "step": 1104 + }, + { + "epoch": 0.006571748025501951, + "grad_norm": 2.9725844860076904, + "learning_rate": 4.9994681737650384e-05, + "loss": 7.1955, + "step": 1105 + }, + { + "epoch": 0.006577695308782948, + "grad_norm": 3.04697585105896, + "learning_rate": 4.9994672099096066e-05, + "loss": 7.1044, + "step": 1106 + }, + { + "epoch": 0.006583642592063945, + "grad_norm": 3.395076274871826, + "learning_rate": 4.999466245181635e-05, + "loss": 7.1968, + "step": 1107 + }, + { + "epoch": 0.006589589875344942, + "grad_norm": 2.362884044647217, + "learning_rate": 4.999465279581127e-05, + "loss": 7.3114, + "step": 1108 + }, + { + "epoch": 0.00659553715862594, + "grad_norm": 2.730980396270752, + "learning_rate": 4.99946431310808e-05, + "loss": 7.1978, + "step": 1109 + }, + { + "epoch": 0.006601484441906937, + "grad_norm": 3.288687229156494, + "learning_rate": 4.9994633457624974e-05, + "loss": 7.4397, + "step": 1110 + }, + { + "epoch": 0.006607431725187934, + "grad_norm": 3.3060662746429443, + "learning_rate": 4.999462377544377e-05, + "loss": 7.1638, + "step": 1111 + }, + { + "epoch": 0.006613379008468931, + "grad_norm": 2.2697036266326904, + "learning_rate": 4.9994614084537204e-05, + "loss": 7.2654, + "step": 1112 + }, + { + "epoch": 0.006619326291749929, + "grad_norm": 2.330495595932007, + "learning_rate": 4.999460438490528e-05, + "loss": 7.2132, + "step": 1113 + }, + { + "epoch": 0.006625273575030926, + "grad_norm": 2.8239340782165527, + "learning_rate": 4.999459467654799e-05, + "loss": 7.3477, + "step": 1114 + }, + { + "epoch": 0.0066312208583119234, + "grad_norm": 2.591614246368408, + "learning_rate": 4.999458495946535e-05, + "loss": 7.0377, + "step": 1115 + }, + { + "epoch": 0.00663716814159292, + "grad_norm": 4.554818630218506, + "learning_rate": 4.999457523365736e-05, + "loss": 7.1266, + "step": 1116 + }, + { + "epoch": 0.006643115424873918, + "grad_norm": 2.21018123626709, + "learning_rate": 4.999456549912401e-05, + "loss": 7.1433, + "step": 1117 + }, + { + "epoch": 0.006649062708154915, + "grad_norm": 2.0298593044281006, + "learning_rate": 4.999455575586533e-05, + "loss": 7.257, + "step": 1118 + }, + { + "epoch": 0.0066550099914359125, + "grad_norm": 2.4532642364501953, + "learning_rate": 4.9994546003881305e-05, + "loss": 7.0618, + "step": 1119 + }, + { + "epoch": 0.006660957274716909, + "grad_norm": 2.428380012512207, + "learning_rate": 4.999453624317194e-05, + "loss": 7.2039, + "step": 1120 + }, + { + "epoch": 0.006666904557997907, + "grad_norm": 2.5572609901428223, + "learning_rate": 4.999452647373724e-05, + "loss": 7.0991, + "step": 1121 + }, + { + "epoch": 0.006672851841278904, + "grad_norm": 2.379640817642212, + "learning_rate": 4.999451669557721e-05, + "loss": 7.1424, + "step": 1122 + }, + { + "epoch": 0.006678799124559901, + "grad_norm": 2.5764007568359375, + "learning_rate": 4.999450690869185e-05, + "loss": 7.1218, + "step": 1123 + }, + { + "epoch": 0.006684746407840898, + "grad_norm": 2.6560606956481934, + "learning_rate": 4.999449711308117e-05, + "loss": 7.2994, + "step": 1124 + }, + { + "epoch": 0.006690693691121895, + "grad_norm": 2.4687581062316895, + "learning_rate": 4.999448730874518e-05, + "loss": 7.4169, + "step": 1125 + }, + { + "epoch": 0.006696640974402893, + "grad_norm": 2.8232173919677734, + "learning_rate": 4.999447749568386e-05, + "loss": 7.291, + "step": 1126 + }, + { + "epoch": 0.00670258825768389, + "grad_norm": 2.6960325241088867, + "learning_rate": 4.9994467673897224e-05, + "loss": 7.3162, + "step": 1127 + }, + { + "epoch": 0.006708535540964887, + "grad_norm": 2.222391366958618, + "learning_rate": 4.999445784338528e-05, + "loss": 7.221, + "step": 1128 + }, + { + "epoch": 0.006714482824245884, + "grad_norm": 2.334995985031128, + "learning_rate": 4.9994448004148024e-05, + "loss": 7.4813, + "step": 1129 + }, + { + "epoch": 0.006720430107526882, + "grad_norm": 2.653491497039795, + "learning_rate": 4.999443815618548e-05, + "loss": 7.3515, + "step": 1130 + }, + { + "epoch": 0.006726377390807879, + "grad_norm": 2.6943631172180176, + "learning_rate": 4.999442829949762e-05, + "loss": 7.2674, + "step": 1131 + }, + { + "epoch": 0.006732324674088876, + "grad_norm": 2.395573377609253, + "learning_rate": 4.999441843408447e-05, + "loss": 7.483, + "step": 1132 + }, + { + "epoch": 0.006738271957369873, + "grad_norm": 2.3801541328430176, + "learning_rate": 4.999440855994603e-05, + "loss": 7.3355, + "step": 1133 + }, + { + "epoch": 0.006744219240650871, + "grad_norm": 2.8566555976867676, + "learning_rate": 4.999439867708229e-05, + "loss": 6.8323, + "step": 1134 + }, + { + "epoch": 0.006750166523931868, + "grad_norm": 2.5987985134124756, + "learning_rate": 4.999438878549327e-05, + "loss": 6.957, + "step": 1135 + }, + { + "epoch": 0.006756113807212865, + "grad_norm": 2.4411563873291016, + "learning_rate": 4.9994378885178964e-05, + "loss": 6.9935, + "step": 1136 + }, + { + "epoch": 0.006762061090493862, + "grad_norm": 2.4227802753448486, + "learning_rate": 4.9994368976139386e-05, + "loss": 7.2856, + "step": 1137 + }, + { + "epoch": 0.00676800837377486, + "grad_norm": 2.55317759513855, + "learning_rate": 4.999435905837453e-05, + "loss": 7.1741, + "step": 1138 + }, + { + "epoch": 0.006773955657055857, + "grad_norm": 2.3329968452453613, + "learning_rate": 4.9994349131884396e-05, + "loss": 7.2007, + "step": 1139 + }, + { + "epoch": 0.006779902940336854, + "grad_norm": 2.538499593734741, + "learning_rate": 4.999433919666899e-05, + "loss": 7.1755, + "step": 1140 + }, + { + "epoch": 0.006785850223617851, + "grad_norm": 2.3580374717712402, + "learning_rate": 4.999432925272833e-05, + "loss": 7.2249, + "step": 1141 + }, + { + "epoch": 0.006791797506898849, + "grad_norm": 2.2783255577087402, + "learning_rate": 4.99943193000624e-05, + "loss": 7.3627, + "step": 1142 + }, + { + "epoch": 0.006797744790179846, + "grad_norm": 3.0798208713531494, + "learning_rate": 4.999430933867122e-05, + "loss": 7.2718, + "step": 1143 + }, + { + "epoch": 0.006803692073460843, + "grad_norm": 2.703232526779175, + "learning_rate": 4.9994299368554776e-05, + "loss": 7.116, + "step": 1144 + }, + { + "epoch": 0.00680963935674184, + "grad_norm": 2.480327606201172, + "learning_rate": 4.9994289389713076e-05, + "loss": 6.9743, + "step": 1145 + }, + { + "epoch": 0.006815586640022838, + "grad_norm": 2.2707130908966064, + "learning_rate": 4.9994279402146137e-05, + "loss": 6.9919, + "step": 1146 + }, + { + "epoch": 0.006821533923303835, + "grad_norm": 2.0424580574035645, + "learning_rate": 4.999426940585396e-05, + "loss": 7.0366, + "step": 1147 + }, + { + "epoch": 0.006827481206584832, + "grad_norm": 1.9720054864883423, + "learning_rate": 4.999425940083653e-05, + "loss": 6.8622, + "step": 1148 + }, + { + "epoch": 0.006833428489865829, + "grad_norm": 2.7109742164611816, + "learning_rate": 4.9994249387093864e-05, + "loss": 7.5375, + "step": 1149 + }, + { + "epoch": 0.006839375773146827, + "grad_norm": 2.267328977584839, + "learning_rate": 4.999423936462596e-05, + "loss": 7.5606, + "step": 1150 + }, + { + "epoch": 0.006845323056427824, + "grad_norm": 2.958360433578491, + "learning_rate": 4.999422933343283e-05, + "loss": 7.3503, + "step": 1151 + }, + { + "epoch": 0.006851270339708821, + "grad_norm": 2.2681283950805664, + "learning_rate": 4.9994219293514475e-05, + "loss": 6.9278, + "step": 1152 + }, + { + "epoch": 0.006857217622989818, + "grad_norm": 2.4755337238311768, + "learning_rate": 4.999420924487089e-05, + "loss": 7.1385, + "step": 1153 + }, + { + "epoch": 0.006863164906270815, + "grad_norm": 2.283277988433838, + "learning_rate": 4.999419918750209e-05, + "loss": 6.9287, + "step": 1154 + }, + { + "epoch": 0.006869112189551813, + "grad_norm": 2.3692893981933594, + "learning_rate": 4.999418912140808e-05, + "loss": 7.0648, + "step": 1155 + }, + { + "epoch": 0.00687505947283281, + "grad_norm": 2.2676453590393066, + "learning_rate": 4.999417904658884e-05, + "loss": 6.9754, + "step": 1156 + }, + { + "epoch": 0.006881006756113807, + "grad_norm": 2.4106669425964355, + "learning_rate": 4.9994168963044405e-05, + "loss": 7.033, + "step": 1157 + }, + { + "epoch": 0.006886954039394804, + "grad_norm": 2.947758913040161, + "learning_rate": 4.9994158870774754e-05, + "loss": 7.0821, + "step": 1158 + }, + { + "epoch": 0.006892901322675802, + "grad_norm": 2.5338058471679688, + "learning_rate": 4.9994148769779905e-05, + "loss": 6.9426, + "step": 1159 + }, + { + "epoch": 0.006898848605956799, + "grad_norm": 2.4848148822784424, + "learning_rate": 4.999413866005985e-05, + "loss": 7.2488, + "step": 1160 + }, + { + "epoch": 0.006904795889237796, + "grad_norm": 2.444077730178833, + "learning_rate": 4.999412854161461e-05, + "loss": 6.871, + "step": 1161 + }, + { + "epoch": 0.006910743172518793, + "grad_norm": 2.376962661743164, + "learning_rate": 4.9994118414444174e-05, + "loss": 7.0258, + "step": 1162 + }, + { + "epoch": 0.006916690455799791, + "grad_norm": 3.502023458480835, + "learning_rate": 4.9994108278548545e-05, + "loss": 7.4869, + "step": 1163 + }, + { + "epoch": 0.006922637739080788, + "grad_norm": 3.117741584777832, + "learning_rate": 4.999409813392774e-05, + "loss": 7.4437, + "step": 1164 + }, + { + "epoch": 0.006928585022361785, + "grad_norm": 3.805560827255249, + "learning_rate": 4.999408798058175e-05, + "loss": 7.3796, + "step": 1165 + }, + { + "epoch": 0.006934532305642782, + "grad_norm": 3.67065167427063, + "learning_rate": 4.9994077818510576e-05, + "loss": 7.2304, + "step": 1166 + }, + { + "epoch": 0.00694047958892378, + "grad_norm": 2.5749545097351074, + "learning_rate": 4.9994067647714236e-05, + "loss": 7.0943, + "step": 1167 + }, + { + "epoch": 0.006946426872204777, + "grad_norm": 2.561405897140503, + "learning_rate": 4.9994057468192724e-05, + "loss": 6.9496, + "step": 1168 + }, + { + "epoch": 0.006952374155485774, + "grad_norm": 2.477344512939453, + "learning_rate": 4.999404727994604e-05, + "loss": 7.3494, + "step": 1169 + }, + { + "epoch": 0.006958321438766771, + "grad_norm": 2.897580146789551, + "learning_rate": 4.999403708297419e-05, + "loss": 7.6081, + "step": 1170 + }, + { + "epoch": 0.006964268722047769, + "grad_norm": 3.899249792098999, + "learning_rate": 4.999402687727719e-05, + "loss": 7.4448, + "step": 1171 + }, + { + "epoch": 0.006970216005328766, + "grad_norm": 3.0791561603546143, + "learning_rate": 4.9994016662855025e-05, + "loss": 7.1616, + "step": 1172 + }, + { + "epoch": 0.006976163288609763, + "grad_norm": 2.8212931156158447, + "learning_rate": 4.999400643970771e-05, + "loss": 7.1824, + "step": 1173 + }, + { + "epoch": 0.00698211057189076, + "grad_norm": 4.33271598815918, + "learning_rate": 4.9993996207835246e-05, + "loss": 7.2432, + "step": 1174 + }, + { + "epoch": 0.006988057855171758, + "grad_norm": 2.985125780105591, + "learning_rate": 4.999398596723764e-05, + "loss": 7.6521, + "step": 1175 + }, + { + "epoch": 0.006994005138452755, + "grad_norm": 3.1069905757904053, + "learning_rate": 4.9993975717914885e-05, + "loss": 7.0071, + "step": 1176 + }, + { + "epoch": 0.006999952421733752, + "grad_norm": 2.915214776992798, + "learning_rate": 4.9993965459866995e-05, + "loss": 7.6192, + "step": 1177 + }, + { + "epoch": 0.007005899705014749, + "grad_norm": 5.314033031463623, + "learning_rate": 4.999395519309397e-05, + "loss": 6.9447, + "step": 1178 + }, + { + "epoch": 0.007011846988295747, + "grad_norm": 2.2723114490509033, + "learning_rate": 4.999394491759581e-05, + "loss": 7.1228, + "step": 1179 + }, + { + "epoch": 0.007017794271576744, + "grad_norm": 2.936365842819214, + "learning_rate": 4.999393463337253e-05, + "loss": 7.136, + "step": 1180 + }, + { + "epoch": 0.007023741554857741, + "grad_norm": 2.864250898361206, + "learning_rate": 4.9993924340424115e-05, + "loss": 7.026, + "step": 1181 + }, + { + "epoch": 0.007029688838138738, + "grad_norm": 3.299370050430298, + "learning_rate": 4.9993914038750586e-05, + "loss": 7.1114, + "step": 1182 + }, + { + "epoch": 0.007035636121419736, + "grad_norm": 3.0609943866729736, + "learning_rate": 4.999390372835193e-05, + "loss": 7.3052, + "step": 1183 + }, + { + "epoch": 0.007041583404700733, + "grad_norm": 3.54488468170166, + "learning_rate": 4.9993893409228176e-05, + "loss": 7.4845, + "step": 1184 + }, + { + "epoch": 0.0070475306879817295, + "grad_norm": 2.5196385383605957, + "learning_rate": 4.99938830813793e-05, + "loss": 7.312, + "step": 1185 + }, + { + "epoch": 0.007053477971262727, + "grad_norm": 3.570802927017212, + "learning_rate": 4.9993872744805326e-05, + "loss": 7.0038, + "step": 1186 + }, + { + "epoch": 0.007059425254543724, + "grad_norm": 2.631058931350708, + "learning_rate": 4.999386239950624e-05, + "loss": 7.5574, + "step": 1187 + }, + { + "epoch": 0.007065372537824722, + "grad_norm": 3.027251958847046, + "learning_rate": 4.999385204548206e-05, + "loss": 6.9837, + "step": 1188 + }, + { + "epoch": 0.0070713198211057185, + "grad_norm": 3.00128173828125, + "learning_rate": 4.999384168273279e-05, + "loss": 7.4479, + "step": 1189 + }, + { + "epoch": 0.007077267104386716, + "grad_norm": 2.127028226852417, + "learning_rate": 4.999383131125842e-05, + "loss": 7.3609, + "step": 1190 + }, + { + "epoch": 0.007083214387667713, + "grad_norm": 2.375511646270752, + "learning_rate": 4.9993820931058965e-05, + "loss": 7.3695, + "step": 1191 + }, + { + "epoch": 0.007089161670948711, + "grad_norm": 2.527743101119995, + "learning_rate": 4.999381054213442e-05, + "loss": 7.1478, + "step": 1192 + }, + { + "epoch": 0.0070951089542297075, + "grad_norm": 2.1600632667541504, + "learning_rate": 4.99938001444848e-05, + "loss": 7.7111, + "step": 1193 + }, + { + "epoch": 0.007101056237510705, + "grad_norm": 2.3242850303649902, + "learning_rate": 4.99937897381101e-05, + "loss": 7.6751, + "step": 1194 + }, + { + "epoch": 0.007107003520791702, + "grad_norm": 3.4553158283233643, + "learning_rate": 4.9993779323010334e-05, + "loss": 7.775, + "step": 1195 + }, + { + "epoch": 0.0071129508040727, + "grad_norm": 2.4339516162872314, + "learning_rate": 4.999376889918549e-05, + "loss": 7.099, + "step": 1196 + }, + { + "epoch": 0.0071188980873536966, + "grad_norm": 2.531851291656494, + "learning_rate": 4.9993758466635574e-05, + "loss": 7.5222, + "step": 1197 + }, + { + "epoch": 0.007124845370634694, + "grad_norm": 2.6549220085144043, + "learning_rate": 4.999374802536061e-05, + "loss": 7.4917, + "step": 1198 + }, + { + "epoch": 0.007130792653915691, + "grad_norm": 2.9149320125579834, + "learning_rate": 4.999373757536058e-05, + "loss": 7.0438, + "step": 1199 + }, + { + "epoch": 0.007136739937196689, + "grad_norm": 3.0234971046447754, + "learning_rate": 4.999372711663549e-05, + "loss": 7.6838, + "step": 1200 + }, + { + "epoch": 0.007142687220477686, + "grad_norm": 2.4006800651550293, + "learning_rate": 4.999371664918535e-05, + "loss": 7.6607, + "step": 1201 + }, + { + "epoch": 0.007148634503758683, + "grad_norm": 2.6191699504852295, + "learning_rate": 4.9993706173010164e-05, + "loss": 7.4727, + "step": 1202 + }, + { + "epoch": 0.00715458178703968, + "grad_norm": 3.040844440460205, + "learning_rate": 4.999369568810993e-05, + "loss": 7.1459, + "step": 1203 + }, + { + "epoch": 0.007160529070320678, + "grad_norm": 2.8474466800689697, + "learning_rate": 4.9993685194484654e-05, + "loss": 7.4615, + "step": 1204 + }, + { + "epoch": 0.007166476353601675, + "grad_norm": 1.928662657737732, + "learning_rate": 4.999367469213435e-05, + "loss": 7.4259, + "step": 1205 + }, + { + "epoch": 0.007172423636882672, + "grad_norm": 2.369540214538574, + "learning_rate": 4.999366418105901e-05, + "loss": 6.9342, + "step": 1206 + }, + { + "epoch": 0.007178370920163669, + "grad_norm": 4.003239154815674, + "learning_rate": 4.999365366125863e-05, + "loss": 7.3289, + "step": 1207 + }, + { + "epoch": 0.007184318203444667, + "grad_norm": 4.491976261138916, + "learning_rate": 4.9993643132733234e-05, + "loss": 7.3479, + "step": 1208 + }, + { + "epoch": 0.007190265486725664, + "grad_norm": 2.3678557872772217, + "learning_rate": 4.9993632595482806e-05, + "loss": 7.3091, + "step": 1209 + }, + { + "epoch": 0.007196212770006661, + "grad_norm": 2.9310050010681152, + "learning_rate": 4.999362204950737e-05, + "loss": 7.1996, + "step": 1210 + }, + { + "epoch": 0.007202160053287658, + "grad_norm": 3.6861345767974854, + "learning_rate": 4.999361149480691e-05, + "loss": 7.43, + "step": 1211 + }, + { + "epoch": 0.007208107336568656, + "grad_norm": 2.657515287399292, + "learning_rate": 4.9993600931381446e-05, + "loss": 6.9888, + "step": 1212 + }, + { + "epoch": 0.007214054619849653, + "grad_norm": 2.8346996307373047, + "learning_rate": 4.999359035923097e-05, + "loss": 7.0366, + "step": 1213 + }, + { + "epoch": 0.00722000190313065, + "grad_norm": 3.494162082672119, + "learning_rate": 4.9993579778355487e-05, + "loss": 7.499, + "step": 1214 + }, + { + "epoch": 0.007225949186411647, + "grad_norm": 2.9848556518554688, + "learning_rate": 4.999356918875501e-05, + "loss": 7.2064, + "step": 1215 + }, + { + "epoch": 0.007231896469692645, + "grad_norm": 2.391390562057495, + "learning_rate": 4.999355859042953e-05, + "loss": 7.2752, + "step": 1216 + }, + { + "epoch": 0.007237843752973642, + "grad_norm": 2.872891902923584, + "learning_rate": 4.9993547983379065e-05, + "loss": 6.9865, + "step": 1217 + }, + { + "epoch": 0.0072437910362546385, + "grad_norm": 2.760213613510132, + "learning_rate": 4.99935373676036e-05, + "loss": 7.0211, + "step": 1218 + }, + { + "epoch": 0.007249738319535636, + "grad_norm": 2.8857531547546387, + "learning_rate": 4.9993526743103156e-05, + "loss": 6.9162, + "step": 1219 + }, + { + "epoch": 0.007255685602816633, + "grad_norm": 3.150836229324341, + "learning_rate": 4.999351610987772e-05, + "loss": 7.2929, + "step": 1220 + }, + { + "epoch": 0.007261632886097631, + "grad_norm": 2.2004289627075195, + "learning_rate": 4.999350546792732e-05, + "loss": 7.4729, + "step": 1221 + }, + { + "epoch": 0.0072675801693786275, + "grad_norm": 2.5004026889801025, + "learning_rate": 4.999349481725194e-05, + "loss": 7.5235, + "step": 1222 + }, + { + "epoch": 0.007273527452659625, + "grad_norm": 2.8355395793914795, + "learning_rate": 4.999348415785159e-05, + "loss": 7.3535, + "step": 1223 + }, + { + "epoch": 0.007279474735940622, + "grad_norm": 2.559330701828003, + "learning_rate": 4.9993473489726276e-05, + "loss": 6.9634, + "step": 1224 + }, + { + "epoch": 0.00728542201922162, + "grad_norm": 2.3559181690216064, + "learning_rate": 4.999346281287599e-05, + "loss": 6.9246, + "step": 1225 + }, + { + "epoch": 0.0072913693025026165, + "grad_norm": 2.3852717876434326, + "learning_rate": 4.999345212730075e-05, + "loss": 6.6417, + "step": 1226 + }, + { + "epoch": 0.007297316585783614, + "grad_norm": 2.2604117393493652, + "learning_rate": 4.999344143300055e-05, + "loss": 7.4182, + "step": 1227 + }, + { + "epoch": 0.007303263869064611, + "grad_norm": 2.57983660697937, + "learning_rate": 4.9993430729975396e-05, + "loss": 7.4841, + "step": 1228 + }, + { + "epoch": 0.007309211152345609, + "grad_norm": 2.653935670852661, + "learning_rate": 4.99934200182253e-05, + "loss": 7.5477, + "step": 1229 + }, + { + "epoch": 0.0073151584356266055, + "grad_norm": 2.0740158557891846, + "learning_rate": 4.999340929775026e-05, + "loss": 7.4359, + "step": 1230 + }, + { + "epoch": 0.007321105718907603, + "grad_norm": 2.62064528465271, + "learning_rate": 4.9993398568550275e-05, + "loss": 7.1817, + "step": 1231 + }, + { + "epoch": 0.0073270530021886, + "grad_norm": 2.318244457244873, + "learning_rate": 4.999338783062536e-05, + "loss": 7.1663, + "step": 1232 + }, + { + "epoch": 0.007333000285469598, + "grad_norm": 3.0533225536346436, + "learning_rate": 4.99933770839755e-05, + "loss": 7.3051, + "step": 1233 + }, + { + "epoch": 0.0073389475687505945, + "grad_norm": 4.821422100067139, + "learning_rate": 4.999336632860072e-05, + "loss": 7.3435, + "step": 1234 + }, + { + "epoch": 0.007344894852031592, + "grad_norm": 2.680873155593872, + "learning_rate": 4.999335556450101e-05, + "loss": 7.3447, + "step": 1235 + }, + { + "epoch": 0.007350842135312589, + "grad_norm": 3.287454605102539, + "learning_rate": 4.999334479167638e-05, + "loss": 7.1957, + "step": 1236 + }, + { + "epoch": 0.007356789418593587, + "grad_norm": 3.7452759742736816, + "learning_rate": 4.999333401012682e-05, + "loss": 7.2093, + "step": 1237 + }, + { + "epoch": 0.0073627367018745836, + "grad_norm": 3.363443374633789, + "learning_rate": 4.999332321985236e-05, + "loss": 7.297, + "step": 1238 + }, + { + "epoch": 0.007368683985155581, + "grad_norm": 3.070962905883789, + "learning_rate": 4.999331242085299e-05, + "loss": 7.0831, + "step": 1239 + }, + { + "epoch": 0.007374631268436578, + "grad_norm": 3.635183095932007, + "learning_rate": 4.9993301613128706e-05, + "loss": 7.3116, + "step": 1240 + }, + { + "epoch": 0.007380578551717576, + "grad_norm": 2.532179594039917, + "learning_rate": 4.9993290796679516e-05, + "loss": 7.5238, + "step": 1241 + }, + { + "epoch": 0.007386525834998573, + "grad_norm": 2.1147687435150146, + "learning_rate": 4.999327997150543e-05, + "loss": 7.2279, + "step": 1242 + }, + { + "epoch": 0.00739247311827957, + "grad_norm": 2.1221182346343994, + "learning_rate": 4.999326913760645e-05, + "loss": 7.6575, + "step": 1243 + }, + { + "epoch": 0.007398420401560567, + "grad_norm": 2.2920000553131104, + "learning_rate": 4.999325829498257e-05, + "loss": 7.5652, + "step": 1244 + }, + { + "epoch": 0.007404367684841565, + "grad_norm": 2.3444230556488037, + "learning_rate": 4.9993247443633814e-05, + "loss": 7.3992, + "step": 1245 + }, + { + "epoch": 0.007410314968122562, + "grad_norm": 2.2778663635253906, + "learning_rate": 4.9993236583560164e-05, + "loss": 7.1212, + "step": 1246 + }, + { + "epoch": 0.007416262251403559, + "grad_norm": 2.38369083404541, + "learning_rate": 4.999322571476164e-05, + "loss": 7.4605, + "step": 1247 + }, + { + "epoch": 0.007422209534684556, + "grad_norm": 3.578537702560425, + "learning_rate": 4.999321483723823e-05, + "loss": 7.1446, + "step": 1248 + }, + { + "epoch": 0.007428156817965553, + "grad_norm": 5.227176666259766, + "learning_rate": 4.9993203950989954e-05, + "loss": 7.2308, + "step": 1249 + }, + { + "epoch": 0.007434104101246551, + "grad_norm": 2.665844440460205, + "learning_rate": 4.9993193056016805e-05, + "loss": 7.102, + "step": 1250 + }, + { + "epoch": 0.007440051384527547, + "grad_norm": 4.462922096252441, + "learning_rate": 4.9993182152318796e-05, + "loss": 7.003, + "step": 1251 + }, + { + "epoch": 0.007445998667808545, + "grad_norm": 4.9459099769592285, + "learning_rate": 4.999317123989592e-05, + "loss": 7.1338, + "step": 1252 + }, + { + "epoch": 0.007451945951089542, + "grad_norm": 3.127427339553833, + "learning_rate": 4.9993160318748186e-05, + "loss": 7.045, + "step": 1253 + }, + { + "epoch": 0.00745789323437054, + "grad_norm": 3.03910231590271, + "learning_rate": 4.9993149388875606e-05, + "loss": 6.8523, + "step": 1254 + }, + { + "epoch": 0.0074638405176515365, + "grad_norm": 2.931033134460449, + "learning_rate": 4.9993138450278166e-05, + "loss": 7.3065, + "step": 1255 + }, + { + "epoch": 0.007469787800932534, + "grad_norm": 4.60735559463501, + "learning_rate": 4.999312750295588e-05, + "loss": 7.5384, + "step": 1256 + }, + { + "epoch": 0.007475735084213531, + "grad_norm": 3.0745065212249756, + "learning_rate": 4.9993116546908755e-05, + "loss": 7.6279, + "step": 1257 + }, + { + "epoch": 0.007481682367494529, + "grad_norm": 2.7158751487731934, + "learning_rate": 4.9993105582136804e-05, + "loss": 7.1885, + "step": 1258 + }, + { + "epoch": 0.0074876296507755255, + "grad_norm": 3.5049819946289062, + "learning_rate": 4.999309460864e-05, + "loss": 6.6833, + "step": 1259 + }, + { + "epoch": 0.007493576934056523, + "grad_norm": 3.229778289794922, + "learning_rate": 4.999308362641837e-05, + "loss": 6.784, + "step": 1260 + }, + { + "epoch": 0.00749952421733752, + "grad_norm": 2.7032854557037354, + "learning_rate": 4.999307263547191e-05, + "loss": 6.8003, + "step": 1261 + }, + { + "epoch": 0.007505471500618518, + "grad_norm": 5.892059326171875, + "learning_rate": 4.999306163580063e-05, + "loss": 7.2365, + "step": 1262 + }, + { + "epoch": 0.0075114187838995145, + "grad_norm": 5.8021135330200195, + "learning_rate": 4.999305062740453e-05, + "loss": 7.3822, + "step": 1263 + }, + { + "epoch": 0.007517366067180512, + "grad_norm": 5.1242899894714355, + "learning_rate": 4.9993039610283614e-05, + "loss": 7.2192, + "step": 1264 + }, + { + "epoch": 0.007523313350461509, + "grad_norm": 3.102980375289917, + "learning_rate": 4.9993028584437884e-05, + "loss": 7.4895, + "step": 1265 + }, + { + "epoch": 0.007529260633742507, + "grad_norm": 4.993838310241699, + "learning_rate": 4.999301754986735e-05, + "loss": 7.4771, + "step": 1266 + }, + { + "epoch": 0.0075352079170235035, + "grad_norm": 4.003589630126953, + "learning_rate": 4.999300650657201e-05, + "loss": 7.3591, + "step": 1267 + }, + { + "epoch": 0.007541155200304501, + "grad_norm": 3.6125710010528564, + "learning_rate": 4.999299545455187e-05, + "loss": 7.262, + "step": 1268 + }, + { + "epoch": 0.007547102483585498, + "grad_norm": 3.182196617126465, + "learning_rate": 4.999298439380693e-05, + "loss": 7.2689, + "step": 1269 + }, + { + "epoch": 0.007553049766866496, + "grad_norm": 2.428313732147217, + "learning_rate": 4.99929733243372e-05, + "loss": 7.2364, + "step": 1270 + }, + { + "epoch": 0.0075589970501474925, + "grad_norm": 2.673356771469116, + "learning_rate": 4.999296224614268e-05, + "loss": 7.2356, + "step": 1271 + }, + { + "epoch": 0.00756494433342849, + "grad_norm": 2.508026361465454, + "learning_rate": 4.9992951159223376e-05, + "loss": 7.1052, + "step": 1272 + }, + { + "epoch": 0.007570891616709487, + "grad_norm": 2.7501845359802246, + "learning_rate": 4.99929400635793e-05, + "loss": 7.5041, + "step": 1273 + }, + { + "epoch": 0.007576838899990485, + "grad_norm": 2.4604434967041016, + "learning_rate": 4.999292895921044e-05, + "loss": 7.5042, + "step": 1274 + }, + { + "epoch": 0.0075827861832714815, + "grad_norm": 2.4926865100860596, + "learning_rate": 4.99929178461168e-05, + "loss": 7.2104, + "step": 1275 + }, + { + "epoch": 0.007588733466552479, + "grad_norm": 2.631985664367676, + "learning_rate": 4.999290672429839e-05, + "loss": 6.8608, + "step": 1276 + }, + { + "epoch": 0.007594680749833476, + "grad_norm": 2.5684268474578857, + "learning_rate": 4.999289559375523e-05, + "loss": 7.1199, + "step": 1277 + }, + { + "epoch": 0.007600628033114474, + "grad_norm": 2.4312644004821777, + "learning_rate": 4.99928844544873e-05, + "loss": 7.1814, + "step": 1278 + }, + { + "epoch": 0.0076065753163954706, + "grad_norm": 2.794407367706299, + "learning_rate": 4.99928733064946e-05, + "loss": 7.2909, + "step": 1279 + }, + { + "epoch": 0.007612522599676467, + "grad_norm": 2.5903992652893066, + "learning_rate": 4.9992862149777166e-05, + "loss": 7.354, + "step": 1280 + }, + { + "epoch": 0.007618469882957465, + "grad_norm": 2.266364336013794, + "learning_rate": 4.999285098433497e-05, + "loss": 7.5697, + "step": 1281 + }, + { + "epoch": 0.007624417166238462, + "grad_norm": 3.1871070861816406, + "learning_rate": 4.999283981016803e-05, + "loss": 7.4393, + "step": 1282 + }, + { + "epoch": 0.00763036444951946, + "grad_norm": 2.137981653213501, + "learning_rate": 4.999282862727635e-05, + "loss": 7.3591, + "step": 1283 + }, + { + "epoch": 0.007636311732800456, + "grad_norm": 2.3166019916534424, + "learning_rate": 4.999281743565993e-05, + "loss": 7.4307, + "step": 1284 + }, + { + "epoch": 0.007642259016081454, + "grad_norm": 2.331110954284668, + "learning_rate": 4.999280623531878e-05, + "loss": 7.3214, + "step": 1285 + }, + { + "epoch": 0.007648206299362451, + "grad_norm": 2.7417728900909424, + "learning_rate": 4.999279502625289e-05, + "loss": 7.3593, + "step": 1286 + }, + { + "epoch": 0.007654153582643449, + "grad_norm": 3.089448928833008, + "learning_rate": 4.999278380846228e-05, + "loss": 7.3347, + "step": 1287 + }, + { + "epoch": 0.007660100865924445, + "grad_norm": 2.9446022510528564, + "learning_rate": 4.999277258194694e-05, + "loss": 7.3109, + "step": 1288 + }, + { + "epoch": 0.007666048149205443, + "grad_norm": 2.713355302810669, + "learning_rate": 4.9992761346706896e-05, + "loss": 7.2962, + "step": 1289 + }, + { + "epoch": 0.00767199543248644, + "grad_norm": 2.9480702877044678, + "learning_rate": 4.9992750102742125e-05, + "loss": 7.2081, + "step": 1290 + }, + { + "epoch": 0.007677942715767438, + "grad_norm": 2.737271785736084, + "learning_rate": 4.999273885005265e-05, + "loss": 7.2251, + "step": 1291 + }, + { + "epoch": 0.007683889999048434, + "grad_norm": 2.6954190731048584, + "learning_rate": 4.9992727588638466e-05, + "loss": 7.3437, + "step": 1292 + }, + { + "epoch": 0.007689837282329432, + "grad_norm": 3.0270752906799316, + "learning_rate": 4.999271631849958e-05, + "loss": 7.2516, + "step": 1293 + }, + { + "epoch": 0.007695784565610429, + "grad_norm": 2.824052333831787, + "learning_rate": 4.999270503963599e-05, + "loss": 7.2706, + "step": 1294 + }, + { + "epoch": 0.007701731848891427, + "grad_norm": 2.800713300704956, + "learning_rate": 4.999269375204771e-05, + "loss": 7.2497, + "step": 1295 + }, + { + "epoch": 0.0077076791321724234, + "grad_norm": 3.2510271072387695, + "learning_rate": 4.999268245573474e-05, + "loss": 7.025, + "step": 1296 + }, + { + "epoch": 0.007713626415453421, + "grad_norm": 3.095862627029419, + "learning_rate": 4.999267115069708e-05, + "loss": 7.1815, + "step": 1297 + }, + { + "epoch": 0.007719573698734418, + "grad_norm": 3.2238826751708984, + "learning_rate": 4.999265983693473e-05, + "loss": 7.2268, + "step": 1298 + }, + { + "epoch": 0.007725520982015416, + "grad_norm": 3.18687105178833, + "learning_rate": 4.999264851444771e-05, + "loss": 7.2076, + "step": 1299 + }, + { + "epoch": 0.0077314682652964125, + "grad_norm": 3.1385931968688965, + "learning_rate": 4.9992637183236016e-05, + "loss": 7.2323, + "step": 1300 + }, + { + "epoch": 0.00773741554857741, + "grad_norm": 2.3172361850738525, + "learning_rate": 4.999262584329964e-05, + "loss": 7.1225, + "step": 1301 + }, + { + "epoch": 0.007743362831858407, + "grad_norm": 3.3223013877868652, + "learning_rate": 4.99926144946386e-05, + "loss": 7.2108, + "step": 1302 + }, + { + "epoch": 0.007749310115139405, + "grad_norm": 3.197218894958496, + "learning_rate": 4.99926031372529e-05, + "loss": 7.5123, + "step": 1303 + }, + { + "epoch": 0.0077552573984204015, + "grad_norm": 2.8411800861358643, + "learning_rate": 4.999259177114254e-05, + "loss": 7.3047, + "step": 1304 + }, + { + "epoch": 0.007761204681701399, + "grad_norm": 2.7549736499786377, + "learning_rate": 4.9992580396307524e-05, + "loss": 7.3478, + "step": 1305 + }, + { + "epoch": 0.007767151964982396, + "grad_norm": 2.8829352855682373, + "learning_rate": 4.999256901274786e-05, + "loss": 7.1871, + "step": 1306 + }, + { + "epoch": 0.007773099248263394, + "grad_norm": 2.710076332092285, + "learning_rate": 4.999255762046354e-05, + "loss": 7.0891, + "step": 1307 + }, + { + "epoch": 0.0077790465315443905, + "grad_norm": 2.6598877906799316, + "learning_rate": 4.999254621945458e-05, + "loss": 7.6178, + "step": 1308 + }, + { + "epoch": 0.007784993814825388, + "grad_norm": 2.4012649059295654, + "learning_rate": 4.999253480972099e-05, + "loss": 7.5925, + "step": 1309 + }, + { + "epoch": 0.007790941098106385, + "grad_norm": 2.1501622200012207, + "learning_rate": 4.999252339126275e-05, + "loss": 7.6471, + "step": 1310 + }, + { + "epoch": 0.007796888381387382, + "grad_norm": 3.2150895595550537, + "learning_rate": 4.9992511964079886e-05, + "loss": 7.3995, + "step": 1311 + }, + { + "epoch": 0.0078028356646683795, + "grad_norm": 2.450465440750122, + "learning_rate": 4.9992500528172395e-05, + "loss": 7.219, + "step": 1312 + }, + { + "epoch": 0.007808782947949376, + "grad_norm": 2.714510679244995, + "learning_rate": 4.9992489083540274e-05, + "loss": 7.2023, + "step": 1313 + }, + { + "epoch": 0.007814730231230374, + "grad_norm": 2.660019636154175, + "learning_rate": 4.999247763018354e-05, + "loss": 6.8686, + "step": 1314 + }, + { + "epoch": 0.00782067751451137, + "grad_norm": 2.1031477451324463, + "learning_rate": 4.999246616810218e-05, + "loss": 7.305, + "step": 1315 + }, + { + "epoch": 0.007826624797792368, + "grad_norm": 3.0037856101989746, + "learning_rate": 4.999245469729622e-05, + "loss": 6.9788, + "step": 1316 + }, + { + "epoch": 0.007832572081073366, + "grad_norm": 3.1931207180023193, + "learning_rate": 4.999244321776565e-05, + "loss": 6.9312, + "step": 1317 + }, + { + "epoch": 0.007838519364354363, + "grad_norm": 2.7419891357421875, + "learning_rate": 4.999243172951047e-05, + "loss": 6.7732, + "step": 1318 + }, + { + "epoch": 0.00784446664763536, + "grad_norm": 2.772061824798584, + "learning_rate": 4.99924202325307e-05, + "loss": 6.9576, + "step": 1319 + }, + { + "epoch": 0.007850413930916357, + "grad_norm": 2.9300522804260254, + "learning_rate": 4.999240872682632e-05, + "loss": 6.8366, + "step": 1320 + }, + { + "epoch": 0.007856361214197355, + "grad_norm": 3.4697458744049072, + "learning_rate": 4.9992397212397365e-05, + "loss": 6.9234, + "step": 1321 + }, + { + "epoch": 0.007862308497478352, + "grad_norm": 3.044647693634033, + "learning_rate": 4.999238568924381e-05, + "loss": 6.8406, + "step": 1322 + }, + { + "epoch": 0.007868255780759349, + "grad_norm": 2.4429051876068115, + "learning_rate": 4.999237415736567e-05, + "loss": 6.9815, + "step": 1323 + }, + { + "epoch": 0.007874203064040346, + "grad_norm": 2.6193530559539795, + "learning_rate": 4.999236261676296e-05, + "loss": 7.3867, + "step": 1324 + }, + { + "epoch": 0.007880150347321344, + "grad_norm": 3.9543204307556152, + "learning_rate": 4.999235106743567e-05, + "loss": 7.2391, + "step": 1325 + }, + { + "epoch": 0.007886097630602341, + "grad_norm": 3.12777042388916, + "learning_rate": 4.9992339509383814e-05, + "loss": 7.0976, + "step": 1326 + }, + { + "epoch": 0.007892044913883338, + "grad_norm": 2.4543895721435547, + "learning_rate": 4.999232794260739e-05, + "loss": 7.1865, + "step": 1327 + }, + { + "epoch": 0.007897992197164335, + "grad_norm": 4.254832744598389, + "learning_rate": 4.999231636710639e-05, + "loss": 6.777, + "step": 1328 + }, + { + "epoch": 0.007903939480445333, + "grad_norm": 2.7835497856140137, + "learning_rate": 4.999230478288084e-05, + "loss": 6.8508, + "step": 1329 + }, + { + "epoch": 0.00790988676372633, + "grad_norm": 3.2724666595458984, + "learning_rate": 4.999229318993073e-05, + "loss": 6.7636, + "step": 1330 + }, + { + "epoch": 0.007915834047007327, + "grad_norm": 4.657248020172119, + "learning_rate": 4.9992281588256075e-05, + "loss": 7.3677, + "step": 1331 + }, + { + "epoch": 0.007921781330288324, + "grad_norm": 6.201416492462158, + "learning_rate": 4.999226997785686e-05, + "loss": 7.5804, + "step": 1332 + }, + { + "epoch": 0.007927728613569322, + "grad_norm": 4.955161094665527, + "learning_rate": 4.999225835873312e-05, + "loss": 7.1867, + "step": 1333 + }, + { + "epoch": 0.007933675896850319, + "grad_norm": 3.4105887413024902, + "learning_rate": 4.9992246730884826e-05, + "loss": 7.0948, + "step": 1334 + }, + { + "epoch": 0.007939623180131316, + "grad_norm": 2.514570951461792, + "learning_rate": 4.999223509431201e-05, + "loss": 6.9367, + "step": 1335 + }, + { + "epoch": 0.007945570463412313, + "grad_norm": 3.7689249515533447, + "learning_rate": 4.9992223449014654e-05, + "loss": 7.2209, + "step": 1336 + }, + { + "epoch": 0.007951517746693311, + "grad_norm": 4.997833728790283, + "learning_rate": 4.999221179499277e-05, + "loss": 7.3336, + "step": 1337 + }, + { + "epoch": 0.007957465029974308, + "grad_norm": 5.1314287185668945, + "learning_rate": 4.999220013224637e-05, + "loss": 6.933, + "step": 1338 + }, + { + "epoch": 0.007963412313255305, + "grad_norm": 3.708528518676758, + "learning_rate": 4.9992188460775447e-05, + "loss": 6.9598, + "step": 1339 + }, + { + "epoch": 0.007969359596536302, + "grad_norm": 3.029602289199829, + "learning_rate": 4.999217678058001e-05, + "loss": 7.3674, + "step": 1340 + }, + { + "epoch": 0.007975306879817299, + "grad_norm": 3.000312089920044, + "learning_rate": 4.999216509166006e-05, + "loss": 7.2705, + "step": 1341 + }, + { + "epoch": 0.007981254163098297, + "grad_norm": 4.852355480194092, + "learning_rate": 4.999215339401561e-05, + "loss": 7.1842, + "step": 1342 + }, + { + "epoch": 0.007987201446379294, + "grad_norm": 3.0430521965026855, + "learning_rate": 4.999214168764664e-05, + "loss": 7.5616, + "step": 1343 + }, + { + "epoch": 0.00799314872966029, + "grad_norm": 2.793760061264038, + "learning_rate": 4.999212997255319e-05, + "loss": 7.4867, + "step": 1344 + }, + { + "epoch": 0.007999096012941288, + "grad_norm": 3.516545295715332, + "learning_rate": 4.9992118248735245e-05, + "loss": 7.5857, + "step": 1345 + }, + { + "epoch": 0.008005043296222286, + "grad_norm": 4.272013187408447, + "learning_rate": 4.9992106516192796e-05, + "loss": 7.5686, + "step": 1346 + }, + { + "epoch": 0.008010990579503283, + "grad_norm": 3.176974058151245, + "learning_rate": 4.999209477492587e-05, + "loss": 7.1826, + "step": 1347 + }, + { + "epoch": 0.00801693786278428, + "grad_norm": 3.2615413665771484, + "learning_rate": 4.999208302493447e-05, + "loss": 7.3933, + "step": 1348 + }, + { + "epoch": 0.008022885146065277, + "grad_norm": 2.9548113346099854, + "learning_rate": 4.999207126621858e-05, + "loss": 7.339, + "step": 1349 + }, + { + "epoch": 0.008028832429346275, + "grad_norm": 3.445829153060913, + "learning_rate": 4.999205949877822e-05, + "loss": 7.4223, + "step": 1350 + }, + { + "epoch": 0.008034779712627272, + "grad_norm": 3.471991777420044, + "learning_rate": 4.999204772261338e-05, + "loss": 7.4192, + "step": 1351 + }, + { + "epoch": 0.008040726995908269, + "grad_norm": 3.1682589054107666, + "learning_rate": 4.999203593772409e-05, + "loss": 7.3433, + "step": 1352 + }, + { + "epoch": 0.008046674279189266, + "grad_norm": 4.693798065185547, + "learning_rate": 4.999202414411033e-05, + "loss": 7.1479, + "step": 1353 + }, + { + "epoch": 0.008052621562470264, + "grad_norm": 3.0599937438964844, + "learning_rate": 4.9992012341772114e-05, + "loss": 7.3137, + "step": 1354 + }, + { + "epoch": 0.008058568845751261, + "grad_norm": 2.9557557106018066, + "learning_rate": 4.999200053070945e-05, + "loss": 7.4466, + "step": 1355 + }, + { + "epoch": 0.008064516129032258, + "grad_norm": 2.5595791339874268, + "learning_rate": 4.999198871092233e-05, + "loss": 7.4716, + "step": 1356 + }, + { + "epoch": 0.008070463412313255, + "grad_norm": 2.919729709625244, + "learning_rate": 4.999197688241076e-05, + "loss": 7.0754, + "step": 1357 + }, + { + "epoch": 0.008076410695594253, + "grad_norm": 2.5880625247955322, + "learning_rate": 4.9991965045174763e-05, + "loss": 7.2794, + "step": 1358 + }, + { + "epoch": 0.00808235797887525, + "grad_norm": 2.9933066368103027, + "learning_rate": 4.999195319921432e-05, + "loss": 7.3547, + "step": 1359 + }, + { + "epoch": 0.008088305262156247, + "grad_norm": 5.097862243652344, + "learning_rate": 4.999194134452945e-05, + "loss": 7.1922, + "step": 1360 + }, + { + "epoch": 0.008094252545437244, + "grad_norm": 4.1795830726623535, + "learning_rate": 4.9991929481120146e-05, + "loss": 7.0437, + "step": 1361 + }, + { + "epoch": 0.008100199828718242, + "grad_norm": 3.292961835861206, + "learning_rate": 4.999191760898642e-05, + "loss": 6.8637, + "step": 1362 + }, + { + "epoch": 0.008106147111999239, + "grad_norm": 3.052610397338867, + "learning_rate": 4.999190572812828e-05, + "loss": 7.1675, + "step": 1363 + }, + { + "epoch": 0.008112094395280236, + "grad_norm": 2.975646734237671, + "learning_rate": 4.999189383854571e-05, + "loss": 7.1309, + "step": 1364 + }, + { + "epoch": 0.008118041678561233, + "grad_norm": 2.71195912361145, + "learning_rate": 4.999188194023874e-05, + "loss": 7.2247, + "step": 1365 + }, + { + "epoch": 0.008123988961842231, + "grad_norm": 2.751002311706543, + "learning_rate": 4.9991870033207354e-05, + "loss": 6.8553, + "step": 1366 + }, + { + "epoch": 0.008129936245123228, + "grad_norm": 3.4521234035491943, + "learning_rate": 4.999185811745157e-05, + "loss": 6.8373, + "step": 1367 + }, + { + "epoch": 0.008135883528404225, + "grad_norm": 3.054330348968506, + "learning_rate": 4.999184619297138e-05, + "loss": 6.6982, + "step": 1368 + }, + { + "epoch": 0.008141830811685222, + "grad_norm": 3.513794183731079, + "learning_rate": 4.99918342597668e-05, + "loss": 6.5567, + "step": 1369 + }, + { + "epoch": 0.00814777809496622, + "grad_norm": 3.681838274002075, + "learning_rate": 4.9991822317837836e-05, + "loss": 6.6335, + "step": 1370 + }, + { + "epoch": 0.008153725378247217, + "grad_norm": 4.144393444061279, + "learning_rate": 4.999181036718447e-05, + "loss": 6.5361, + "step": 1371 + }, + { + "epoch": 0.008159672661528214, + "grad_norm": 2.9771196842193604, + "learning_rate": 4.9991798407806736e-05, + "loss": 7.0085, + "step": 1372 + }, + { + "epoch": 0.00816561994480921, + "grad_norm": 3.114884376525879, + "learning_rate": 4.9991786439704615e-05, + "loss": 7.1498, + "step": 1373 + }, + { + "epoch": 0.008171567228090208, + "grad_norm": 2.76042103767395, + "learning_rate": 4.9991774462878115e-05, + "loss": 6.8462, + "step": 1374 + }, + { + "epoch": 0.008177514511371206, + "grad_norm": 3.257528066635132, + "learning_rate": 4.999176247732725e-05, + "loss": 6.4595, + "step": 1375 + }, + { + "epoch": 0.008183461794652203, + "grad_norm": 3.377774238586426, + "learning_rate": 4.999175048305202e-05, + "loss": 6.3131, + "step": 1376 + }, + { + "epoch": 0.0081894090779332, + "grad_norm": 3.029477834701538, + "learning_rate": 4.999173848005243e-05, + "loss": 6.7182, + "step": 1377 + }, + { + "epoch": 0.008195356361214197, + "grad_norm": 3.0353076457977295, + "learning_rate": 4.9991726468328476e-05, + "loss": 7.009, + "step": 1378 + }, + { + "epoch": 0.008201303644495195, + "grad_norm": 2.465014934539795, + "learning_rate": 4.999171444788017e-05, + "loss": 7.6277, + "step": 1379 + }, + { + "epoch": 0.008207250927776192, + "grad_norm": 3.025954484939575, + "learning_rate": 4.999170241870752e-05, + "loss": 7.2815, + "step": 1380 + }, + { + "epoch": 0.008213198211057189, + "grad_norm": 3.8414018154144287, + "learning_rate": 4.999169038081052e-05, + "loss": 7.2238, + "step": 1381 + }, + { + "epoch": 0.008219145494338186, + "grad_norm": 3.2927470207214355, + "learning_rate": 4.999167833418918e-05, + "loss": 7.1505, + "step": 1382 + }, + { + "epoch": 0.008225092777619184, + "grad_norm": 2.6132330894470215, + "learning_rate": 4.999166627884351e-05, + "loss": 7.2499, + "step": 1383 + }, + { + "epoch": 0.008231040060900181, + "grad_norm": 2.523366689682007, + "learning_rate": 4.9991654214773497e-05, + "loss": 6.9812, + "step": 1384 + }, + { + "epoch": 0.008236987344181178, + "grad_norm": 3.977471351623535, + "learning_rate": 4.9991642141979154e-05, + "loss": 7.3196, + "step": 1385 + }, + { + "epoch": 0.008242934627462175, + "grad_norm": 2.731952428817749, + "learning_rate": 4.99916300604605e-05, + "loss": 7.1014, + "step": 1386 + }, + { + "epoch": 0.008248881910743173, + "grad_norm": 2.6128756999969482, + "learning_rate": 4.999161797021752e-05, + "loss": 7.0235, + "step": 1387 + }, + { + "epoch": 0.00825482919402417, + "grad_norm": 2.263430595397949, + "learning_rate": 4.999160587125023e-05, + "loss": 7.0183, + "step": 1388 + }, + { + "epoch": 0.008260776477305167, + "grad_norm": 2.799994707107544, + "learning_rate": 4.9991593763558614e-05, + "loss": 6.9553, + "step": 1389 + }, + { + "epoch": 0.008266723760586164, + "grad_norm": 2.5443058013916016, + "learning_rate": 4.99915816471427e-05, + "loss": 7.2302, + "step": 1390 + }, + { + "epoch": 0.008272671043867162, + "grad_norm": 2.304185152053833, + "learning_rate": 4.999156952200248e-05, + "loss": 7.2589, + "step": 1391 + }, + { + "epoch": 0.008278618327148159, + "grad_norm": 2.1639649868011475, + "learning_rate": 4.999155738813797e-05, + "loss": 7.0067, + "step": 1392 + }, + { + "epoch": 0.008284565610429156, + "grad_norm": 2.276514768600464, + "learning_rate": 4.999154524554915e-05, + "loss": 7.2721, + "step": 1393 + }, + { + "epoch": 0.008290512893710153, + "grad_norm": 2.212200880050659, + "learning_rate": 4.9991533094236055e-05, + "loss": 7.1183, + "step": 1394 + }, + { + "epoch": 0.008296460176991151, + "grad_norm": 2.5289459228515625, + "learning_rate": 4.999152093419867e-05, + "loss": 7.0289, + "step": 1395 + }, + { + "epoch": 0.008302407460272148, + "grad_norm": 2.5915603637695312, + "learning_rate": 4.999150876543699e-05, + "loss": 6.7497, + "step": 1396 + }, + { + "epoch": 0.008308354743553145, + "grad_norm": 2.680513858795166, + "learning_rate": 4.999149658795105e-05, + "loss": 6.7139, + "step": 1397 + }, + { + "epoch": 0.008314302026834142, + "grad_norm": 2.65744948387146, + "learning_rate": 4.999148440174083e-05, + "loss": 6.6151, + "step": 1398 + }, + { + "epoch": 0.00832024931011514, + "grad_norm": 3.8028745651245117, + "learning_rate": 4.9991472206806334e-05, + "loss": 7.1992, + "step": 1399 + }, + { + "epoch": 0.008326196593396137, + "grad_norm": 2.8436119556427, + "learning_rate": 4.999146000314758e-05, + "loss": 7.165, + "step": 1400 + }, + { + "epoch": 0.008332143876677134, + "grad_norm": 2.6658496856689453, + "learning_rate": 4.999144779076457e-05, + "loss": 7.5945, + "step": 1401 + }, + { + "epoch": 0.00833809115995813, + "grad_norm": 2.909703016281128, + "learning_rate": 4.99914355696573e-05, + "loss": 7.6378, + "step": 1402 + }, + { + "epoch": 0.00834403844323913, + "grad_norm": 2.5827598571777344, + "learning_rate": 4.9991423339825776e-05, + "loss": 7.5441, + "step": 1403 + }, + { + "epoch": 0.008349985726520126, + "grad_norm": 3.0283706188201904, + "learning_rate": 4.999141110127e-05, + "loss": 7.1162, + "step": 1404 + }, + { + "epoch": 0.008355933009801123, + "grad_norm": 3.11690354347229, + "learning_rate": 4.999139885398999e-05, + "loss": 6.5123, + "step": 1405 + }, + { + "epoch": 0.00836188029308212, + "grad_norm": 2.6188690662384033, + "learning_rate": 4.999138659798574e-05, + "loss": 7.6384, + "step": 1406 + }, + { + "epoch": 0.008367827576363117, + "grad_norm": 3.4412481784820557, + "learning_rate": 4.999137433325725e-05, + "loss": 7.4067, + "step": 1407 + }, + { + "epoch": 0.008373774859644115, + "grad_norm": 3.1690893173217773, + "learning_rate": 4.999136205980454e-05, + "loss": 7.3937, + "step": 1408 + }, + { + "epoch": 0.008379722142925112, + "grad_norm": 2.1589877605438232, + "learning_rate": 4.999134977762759e-05, + "loss": 7.454, + "step": 1409 + }, + { + "epoch": 0.008385669426206109, + "grad_norm": 2.485901117324829, + "learning_rate": 4.999133748672642e-05, + "loss": 7.3421, + "step": 1410 + }, + { + "epoch": 0.008391616709487106, + "grad_norm": 2.543128252029419, + "learning_rate": 4.999132518710104e-05, + "loss": 7.3162, + "step": 1411 + }, + { + "epoch": 0.008397563992768104, + "grad_norm": 2.8048489093780518, + "learning_rate": 4.999131287875144e-05, + "loss": 7.297, + "step": 1412 + }, + { + "epoch": 0.008403511276049101, + "grad_norm": 3.0391035079956055, + "learning_rate": 4.9991300561677634e-05, + "loss": 7.2409, + "step": 1413 + }, + { + "epoch": 0.008409458559330098, + "grad_norm": 2.3196053504943848, + "learning_rate": 4.999128823587962e-05, + "loss": 7.1358, + "step": 1414 + }, + { + "epoch": 0.008415405842611095, + "grad_norm": 3.1876983642578125, + "learning_rate": 4.999127590135741e-05, + "loss": 7.1501, + "step": 1415 + }, + { + "epoch": 0.008421353125892093, + "grad_norm": 3.6832327842712402, + "learning_rate": 4.9991263558111e-05, + "loss": 7.181, + "step": 1416 + }, + { + "epoch": 0.00842730040917309, + "grad_norm": 3.7491936683654785, + "learning_rate": 4.99912512061404e-05, + "loss": 6.9669, + "step": 1417 + }, + { + "epoch": 0.008433247692454087, + "grad_norm": 3.1583478450775146, + "learning_rate": 4.9991238845445615e-05, + "loss": 7.2155, + "step": 1418 + }, + { + "epoch": 0.008439194975735084, + "grad_norm": 3.11611008644104, + "learning_rate": 4.999122647602664e-05, + "loss": 7.164, + "step": 1419 + }, + { + "epoch": 0.008445142259016082, + "grad_norm": 6.127118110656738, + "learning_rate": 4.9991214097883495e-05, + "loss": 7.232, + "step": 1420 + }, + { + "epoch": 0.008451089542297079, + "grad_norm": 4.736495494842529, + "learning_rate": 4.9991201711016166e-05, + "loss": 7.3685, + "step": 1421 + }, + { + "epoch": 0.008457036825578076, + "grad_norm": 2.9656684398651123, + "learning_rate": 4.999118931542467e-05, + "loss": 7.2658, + "step": 1422 + }, + { + "epoch": 0.008462984108859073, + "grad_norm": 2.5959243774414062, + "learning_rate": 4.999117691110901e-05, + "loss": 7.0908, + "step": 1423 + }, + { + "epoch": 0.008468931392140071, + "grad_norm": 4.546379089355469, + "learning_rate": 4.999116449806919e-05, + "loss": 7.1343, + "step": 1424 + }, + { + "epoch": 0.008474878675421068, + "grad_norm": 3.6856796741485596, + "learning_rate": 4.9991152076305206e-05, + "loss": 6.9205, + "step": 1425 + }, + { + "epoch": 0.008480825958702065, + "grad_norm": 3.293973922729492, + "learning_rate": 4.9991139645817075e-05, + "loss": 6.9954, + "step": 1426 + }, + { + "epoch": 0.008486773241983062, + "grad_norm": 3.2511162757873535, + "learning_rate": 4.999112720660479e-05, + "loss": 6.7661, + "step": 1427 + }, + { + "epoch": 0.00849272052526406, + "grad_norm": 3.990840196609497, + "learning_rate": 4.9991114758668364e-05, + "loss": 6.7402, + "step": 1428 + }, + { + "epoch": 0.008498667808545057, + "grad_norm": 3.306809186935425, + "learning_rate": 4.9991102302007804e-05, + "loss": 6.6801, + "step": 1429 + }, + { + "epoch": 0.008504615091826054, + "grad_norm": 5.208675384521484, + "learning_rate": 4.99910898366231e-05, + "loss": 7.0128, + "step": 1430 + }, + { + "epoch": 0.00851056237510705, + "grad_norm": 4.131346225738525, + "learning_rate": 4.9991077362514266e-05, + "loss": 7.0992, + "step": 1431 + }, + { + "epoch": 0.00851650965838805, + "grad_norm": 2.60927152633667, + "learning_rate": 4.99910648796813e-05, + "loss": 7.2731, + "step": 1432 + }, + { + "epoch": 0.008522456941669046, + "grad_norm": 5.654631614685059, + "learning_rate": 4.9991052388124224e-05, + "loss": 6.6105, + "step": 1433 + }, + { + "epoch": 0.008528404224950043, + "grad_norm": 6.108455657958984, + "learning_rate": 4.9991039887843025e-05, + "loss": 6.3548, + "step": 1434 + }, + { + "epoch": 0.00853435150823104, + "grad_norm": 3.758371591567993, + "learning_rate": 4.9991027378837705e-05, + "loss": 6.6171, + "step": 1435 + }, + { + "epoch": 0.008540298791512036, + "grad_norm": 2.1995320320129395, + "learning_rate": 4.9991014861108285e-05, + "loss": 6.5987, + "step": 1436 + }, + { + "epoch": 0.008546246074793035, + "grad_norm": 2.3778254985809326, + "learning_rate": 4.999100233465476e-05, + "loss": 6.8067, + "step": 1437 + }, + { + "epoch": 0.008552193358074032, + "grad_norm": 2.521928310394287, + "learning_rate": 4.999098979947713e-05, + "loss": 6.7756, + "step": 1438 + }, + { + "epoch": 0.008558140641355029, + "grad_norm": 2.109605073928833, + "learning_rate": 4.99909772555754e-05, + "loss": 6.7091, + "step": 1439 + }, + { + "epoch": 0.008564087924636025, + "grad_norm": 2.55838680267334, + "learning_rate": 4.9990964702949585e-05, + "loss": 6.8989, + "step": 1440 + }, + { + "epoch": 0.008570035207917024, + "grad_norm": 2.4499685764312744, + "learning_rate": 4.9990952141599675e-05, + "loss": 6.6241, + "step": 1441 + }, + { + "epoch": 0.00857598249119802, + "grad_norm": 2.265371322631836, + "learning_rate": 4.9990939571525685e-05, + "loss": 7.6681, + "step": 1442 + }, + { + "epoch": 0.008581929774479018, + "grad_norm": 2.4496965408325195, + "learning_rate": 4.999092699272762e-05, + "loss": 6.8177, + "step": 1443 + }, + { + "epoch": 0.008587877057760014, + "grad_norm": 2.5555005073547363, + "learning_rate": 4.999091440520548e-05, + "loss": 6.6402, + "step": 1444 + }, + { + "epoch": 0.008593824341041013, + "grad_norm": 2.042592763900757, + "learning_rate": 4.999090180895927e-05, + "loss": 6.6114, + "step": 1445 + }, + { + "epoch": 0.00859977162432201, + "grad_norm": 2.3100671768188477, + "learning_rate": 4.9990889203988986e-05, + "loss": 6.712, + "step": 1446 + }, + { + "epoch": 0.008605718907603007, + "grad_norm": 2.7600841522216797, + "learning_rate": 4.999087659029465e-05, + "loss": 6.6531, + "step": 1447 + }, + { + "epoch": 0.008611666190884004, + "grad_norm": 3.292684316635132, + "learning_rate": 4.999086396787625e-05, + "loss": 6.9896, + "step": 1448 + }, + { + "epoch": 0.008617613474165002, + "grad_norm": 2.7579386234283447, + "learning_rate": 4.999085133673381e-05, + "loss": 7.1559, + "step": 1449 + }, + { + "epoch": 0.008623560757445999, + "grad_norm": 2.7898707389831543, + "learning_rate": 4.999083869686731e-05, + "loss": 6.9861, + "step": 1450 + }, + { + "epoch": 0.008629508040726996, + "grad_norm": 3.439809799194336, + "learning_rate": 4.999082604827677e-05, + "loss": 6.759, + "step": 1451 + }, + { + "epoch": 0.008635455324007993, + "grad_norm": 2.924859046936035, + "learning_rate": 4.999081339096219e-05, + "loss": 6.5438, + "step": 1452 + }, + { + "epoch": 0.008641402607288991, + "grad_norm": 3.363886594772339, + "learning_rate": 4.999080072492358e-05, + "loss": 7.0477, + "step": 1453 + }, + { + "epoch": 0.008647349890569988, + "grad_norm": 2.924988031387329, + "learning_rate": 4.999078805016093e-05, + "loss": 6.9228, + "step": 1454 + }, + { + "epoch": 0.008653297173850985, + "grad_norm": 3.2283847332000732, + "learning_rate": 4.999077536667426e-05, + "loss": 6.8763, + "step": 1455 + }, + { + "epoch": 0.008659244457131982, + "grad_norm": 2.635744094848633, + "learning_rate": 4.999076267446357e-05, + "loss": 6.6438, + "step": 1456 + }, + { + "epoch": 0.00866519174041298, + "grad_norm": 2.829801559448242, + "learning_rate": 4.9990749973528864e-05, + "loss": 6.9466, + "step": 1457 + }, + { + "epoch": 0.008671139023693977, + "grad_norm": 3.3631057739257812, + "learning_rate": 4.999073726387014e-05, + "loss": 7.2652, + "step": 1458 + }, + { + "epoch": 0.008677086306974974, + "grad_norm": 3.9970719814300537, + "learning_rate": 4.999072454548741e-05, + "loss": 7.053, + "step": 1459 + }, + { + "epoch": 0.00868303359025597, + "grad_norm": 3.322787046432495, + "learning_rate": 4.9990711818380674e-05, + "loss": 7.0272, + "step": 1460 + }, + { + "epoch": 0.008688980873536969, + "grad_norm": 2.7370798587799072, + "learning_rate": 4.999069908254995e-05, + "loss": 6.8545, + "step": 1461 + }, + { + "epoch": 0.008694928156817966, + "grad_norm": 2.845191240310669, + "learning_rate": 4.999068633799522e-05, + "loss": 6.9393, + "step": 1462 + }, + { + "epoch": 0.008700875440098963, + "grad_norm": 3.064960241317749, + "learning_rate": 4.99906735847165e-05, + "loss": 6.7734, + "step": 1463 + }, + { + "epoch": 0.00870682272337996, + "grad_norm": 7.113090515136719, + "learning_rate": 4.99906608227138e-05, + "loss": 7.0532, + "step": 1464 + }, + { + "epoch": 0.008712770006660958, + "grad_norm": 5.90821647644043, + "learning_rate": 4.999064805198711e-05, + "loss": 7.1494, + "step": 1465 + }, + { + "epoch": 0.008718717289941955, + "grad_norm": 3.9366238117218018, + "learning_rate": 4.9990635272536454e-05, + "loss": 7.623, + "step": 1466 + }, + { + "epoch": 0.008724664573222952, + "grad_norm": 3.1239330768585205, + "learning_rate": 4.9990622484361814e-05, + "loss": 7.4938, + "step": 1467 + }, + { + "epoch": 0.008730611856503949, + "grad_norm": 2.6688928604125977, + "learning_rate": 4.9990609687463216e-05, + "loss": 7.3445, + "step": 1468 + }, + { + "epoch": 0.008736559139784945, + "grad_norm": 3.047154664993286, + "learning_rate": 4.9990596881840646e-05, + "loss": 7.158, + "step": 1469 + }, + { + "epoch": 0.008742506423065944, + "grad_norm": 2.5230467319488525, + "learning_rate": 4.999058406749412e-05, + "loss": 7.1368, + "step": 1470 + }, + { + "epoch": 0.00874845370634694, + "grad_norm": 2.729705333709717, + "learning_rate": 4.999057124442364e-05, + "loss": 7.0144, + "step": 1471 + }, + { + "epoch": 0.008754400989627938, + "grad_norm": 2.5796756744384766, + "learning_rate": 4.999055841262921e-05, + "loss": 7.2157, + "step": 1472 + }, + { + "epoch": 0.008760348272908934, + "grad_norm": 3.458691358566284, + "learning_rate": 4.999054557211084e-05, + "loss": 6.7631, + "step": 1473 + }, + { + "epoch": 0.008766295556189933, + "grad_norm": 2.7262747287750244, + "learning_rate": 4.999053272286851e-05, + "loss": 6.9784, + "step": 1474 + }, + { + "epoch": 0.00877224283947093, + "grad_norm": 2.6003808975219727, + "learning_rate": 4.9990519864902267e-05, + "loss": 7.1369, + "step": 1475 + }, + { + "epoch": 0.008778190122751927, + "grad_norm": 3.4032137393951416, + "learning_rate": 4.999050699821207e-05, + "loss": 6.9569, + "step": 1476 + }, + { + "epoch": 0.008784137406032923, + "grad_norm": 4.099828243255615, + "learning_rate": 4.9990494122797957e-05, + "loss": 6.9977, + "step": 1477 + }, + { + "epoch": 0.008790084689313922, + "grad_norm": 3.1837944984436035, + "learning_rate": 4.999048123865992e-05, + "loss": 7.1331, + "step": 1478 + }, + { + "epoch": 0.008796031972594919, + "grad_norm": 2.618847131729126, + "learning_rate": 4.999046834579796e-05, + "loss": 7.0043, + "step": 1479 + }, + { + "epoch": 0.008801979255875916, + "grad_norm": 3.0132501125335693, + "learning_rate": 4.999045544421209e-05, + "loss": 6.7836, + "step": 1480 + }, + { + "epoch": 0.008807926539156912, + "grad_norm": 2.4608371257781982, + "learning_rate": 4.999044253390231e-05, + "loss": 7.0721, + "step": 1481 + }, + { + "epoch": 0.008813873822437911, + "grad_norm": 3.280649423599243, + "learning_rate": 4.999042961486863e-05, + "loss": 7.959, + "step": 1482 + }, + { + "epoch": 0.008819821105718908, + "grad_norm": 2.7038395404815674, + "learning_rate": 4.999041668711104e-05, + "loss": 7.1256, + "step": 1483 + }, + { + "epoch": 0.008825768388999905, + "grad_norm": 2.1451892852783203, + "learning_rate": 4.9990403750629556e-05, + "loss": 7.2219, + "step": 1484 + }, + { + "epoch": 0.008831715672280901, + "grad_norm": 2.3731601238250732, + "learning_rate": 4.999039080542418e-05, + "loss": 7.2023, + "step": 1485 + }, + { + "epoch": 0.0088376629555619, + "grad_norm": 2.444089651107788, + "learning_rate": 4.999037785149492e-05, + "loss": 7.0988, + "step": 1486 + }, + { + "epoch": 0.008843610238842897, + "grad_norm": 2.644712448120117, + "learning_rate": 4.999036488884177e-05, + "loss": 7.1916, + "step": 1487 + }, + { + "epoch": 0.008849557522123894, + "grad_norm": 5.477145671844482, + "learning_rate": 4.999035191746475e-05, + "loss": 6.7256, + "step": 1488 + }, + { + "epoch": 0.00885550480540489, + "grad_norm": 2.2691709995269775, + "learning_rate": 4.999033893736386e-05, + "loss": 7.2505, + "step": 1489 + }, + { + "epoch": 0.008861452088685889, + "grad_norm": 2.5880343914031982, + "learning_rate": 4.999032594853909e-05, + "loss": 6.9549, + "step": 1490 + }, + { + "epoch": 0.008867399371966886, + "grad_norm": 2.2748520374298096, + "learning_rate": 4.999031295099046e-05, + "loss": 6.8269, + "step": 1491 + }, + { + "epoch": 0.008873346655247883, + "grad_norm": 2.262706995010376, + "learning_rate": 4.999029994471797e-05, + "loss": 6.8876, + "step": 1492 + }, + { + "epoch": 0.00887929393852888, + "grad_norm": 2.264256238937378, + "learning_rate": 4.999028692972162e-05, + "loss": 7.1545, + "step": 1493 + }, + { + "epoch": 0.008885241221809878, + "grad_norm": 2.489259719848633, + "learning_rate": 4.9990273906001424e-05, + "loss": 7.194, + "step": 1494 + }, + { + "epoch": 0.008891188505090875, + "grad_norm": 2.7545981407165527, + "learning_rate": 4.999026087355738e-05, + "loss": 7.0148, + "step": 1495 + }, + { + "epoch": 0.008897135788371872, + "grad_norm": 2.6869328022003174, + "learning_rate": 4.999024783238949e-05, + "loss": 7.2535, + "step": 1496 + }, + { + "epoch": 0.008903083071652869, + "grad_norm": 2.5216503143310547, + "learning_rate": 4.999023478249777e-05, + "loss": 6.4351, + "step": 1497 + }, + { + "epoch": 0.008909030354933865, + "grad_norm": 2.5090575218200684, + "learning_rate": 4.9990221723882216e-05, + "loss": 7.3068, + "step": 1498 + }, + { + "epoch": 0.008914977638214864, + "grad_norm": 2.5026490688323975, + "learning_rate": 4.999020865654283e-05, + "loss": 7.1274, + "step": 1499 + }, + { + "epoch": 0.00892092492149586, + "grad_norm": 2.8030898571014404, + "learning_rate": 4.999019558047963e-05, + "loss": 7.0016, + "step": 1500 + }, + { + "epoch": 0.008926872204776858, + "grad_norm": 2.533383846282959, + "learning_rate": 4.99901824956926e-05, + "loss": 6.8991, + "step": 1501 + }, + { + "epoch": 0.008932819488057854, + "grad_norm": 2.5584118366241455, + "learning_rate": 4.999016940218175e-05, + "loss": 6.9237, + "step": 1502 + }, + { + "epoch": 0.008938766771338853, + "grad_norm": 2.778592586517334, + "learning_rate": 4.99901562999471e-05, + "loss": 7.0941, + "step": 1503 + }, + { + "epoch": 0.00894471405461985, + "grad_norm": 4.023860931396484, + "learning_rate": 4.999014318898865e-05, + "loss": 6.5188, + "step": 1504 + }, + { + "epoch": 0.008950661337900847, + "grad_norm": 3.018118143081665, + "learning_rate": 4.999013006930639e-05, + "loss": 7.0557, + "step": 1505 + }, + { + "epoch": 0.008956608621181843, + "grad_norm": 2.802061080932617, + "learning_rate": 4.999011694090033e-05, + "loss": 7.2645, + "step": 1506 + }, + { + "epoch": 0.008962555904462842, + "grad_norm": 2.3782076835632324, + "learning_rate": 4.999010380377049e-05, + "loss": 7.3707, + "step": 1507 + }, + { + "epoch": 0.008968503187743839, + "grad_norm": 2.451878309249878, + "learning_rate": 4.999009065791686e-05, + "loss": 7.2783, + "step": 1508 + }, + { + "epoch": 0.008974450471024836, + "grad_norm": 3.85514235496521, + "learning_rate": 4.999007750333945e-05, + "loss": 6.3543, + "step": 1509 + }, + { + "epoch": 0.008980397754305832, + "grad_norm": 2.617177963256836, + "learning_rate": 4.999006434003825e-05, + "loss": 7.0175, + "step": 1510 + }, + { + "epoch": 0.008986345037586831, + "grad_norm": 2.6909587383270264, + "learning_rate": 4.999005116801329e-05, + "loss": 7.3282, + "step": 1511 + }, + { + "epoch": 0.008992292320867828, + "grad_norm": 2.332165241241455, + "learning_rate": 4.9990037987264546e-05, + "loss": 7.0993, + "step": 1512 + }, + { + "epoch": 0.008998239604148825, + "grad_norm": 2.5398497581481934, + "learning_rate": 4.9990024797792055e-05, + "loss": 7.2867, + "step": 1513 + }, + { + "epoch": 0.009004186887429821, + "grad_norm": 2.432264566421509, + "learning_rate": 4.9990011599595796e-05, + "loss": 7.1619, + "step": 1514 + }, + { + "epoch": 0.00901013417071082, + "grad_norm": 2.2937278747558594, + "learning_rate": 4.998999839267578e-05, + "loss": 7.1138, + "step": 1515 + }, + { + "epoch": 0.009016081453991817, + "grad_norm": 2.3305680751800537, + "learning_rate": 4.998998517703202e-05, + "loss": 7.0569, + "step": 1516 + }, + { + "epoch": 0.009022028737272814, + "grad_norm": 3.0785884857177734, + "learning_rate": 4.998997195266451e-05, + "loss": 7.0922, + "step": 1517 + }, + { + "epoch": 0.00902797602055381, + "grad_norm": 2.354283571243286, + "learning_rate": 4.998995871957326e-05, + "loss": 7.0024, + "step": 1518 + }, + { + "epoch": 0.009033923303834809, + "grad_norm": 2.488194465637207, + "learning_rate": 4.998994547775827e-05, + "loss": 7.0045, + "step": 1519 + }, + { + "epoch": 0.009039870587115806, + "grad_norm": 2.6196579933166504, + "learning_rate": 4.998993222721956e-05, + "loss": 6.9416, + "step": 1520 + }, + { + "epoch": 0.009045817870396803, + "grad_norm": 2.6524155139923096, + "learning_rate": 4.998991896795711e-05, + "loss": 6.9562, + "step": 1521 + }, + { + "epoch": 0.0090517651536778, + "grad_norm": 3.308661460876465, + "learning_rate": 4.998990569997094e-05, + "loss": 6.8602, + "step": 1522 + }, + { + "epoch": 0.009057712436958798, + "grad_norm": 2.7995994091033936, + "learning_rate": 4.9989892423261055e-05, + "loss": 7.7049, + "step": 1523 + }, + { + "epoch": 0.009063659720239795, + "grad_norm": 2.547189235687256, + "learning_rate": 4.9989879137827456e-05, + "loss": 7.0254, + "step": 1524 + }, + { + "epoch": 0.009069607003520792, + "grad_norm": 2.796393871307373, + "learning_rate": 4.998986584367015e-05, + "loss": 7.0124, + "step": 1525 + }, + { + "epoch": 0.009075554286801788, + "grad_norm": 2.9441823959350586, + "learning_rate": 4.9989852540789136e-05, + "loss": 7.0174, + "step": 1526 + }, + { + "epoch": 0.009081501570082787, + "grad_norm": 2.509150743484497, + "learning_rate": 4.998983922918443e-05, + "loss": 6.9405, + "step": 1527 + }, + { + "epoch": 0.009087448853363784, + "grad_norm": 2.3686184883117676, + "learning_rate": 4.998982590885603e-05, + "loss": 6.794, + "step": 1528 + }, + { + "epoch": 0.00909339613664478, + "grad_norm": 2.937530755996704, + "learning_rate": 4.998981257980393e-05, + "loss": 6.9716, + "step": 1529 + }, + { + "epoch": 0.009099343419925777, + "grad_norm": 2.493178606033325, + "learning_rate": 4.998979924202814e-05, + "loss": 6.5986, + "step": 1530 + }, + { + "epoch": 0.009105290703206774, + "grad_norm": 2.071356773376465, + "learning_rate": 4.9989785895528686e-05, + "loss": 6.536, + "step": 1531 + }, + { + "epoch": 0.009111237986487773, + "grad_norm": 1.9372920989990234, + "learning_rate": 4.998977254030554e-05, + "loss": 6.4036, + "step": 1532 + }, + { + "epoch": 0.00911718526976877, + "grad_norm": 2.3329098224639893, + "learning_rate": 4.998975917635873e-05, + "loss": 6.4861, + "step": 1533 + }, + { + "epoch": 0.009123132553049767, + "grad_norm": 2.9681191444396973, + "learning_rate": 4.998974580368826e-05, + "loss": 6.939, + "step": 1534 + }, + { + "epoch": 0.009129079836330763, + "grad_norm": 2.5993690490722656, + "learning_rate": 4.9989732422294125e-05, + "loss": 7.0809, + "step": 1535 + }, + { + "epoch": 0.009135027119611762, + "grad_norm": 2.827244997024536, + "learning_rate": 4.998971903217633e-05, + "loss": 7.597, + "step": 1536 + }, + { + "epoch": 0.009140974402892759, + "grad_norm": 2.712247848510742, + "learning_rate": 4.9989705633334884e-05, + "loss": 7.3695, + "step": 1537 + }, + { + "epoch": 0.009146921686173756, + "grad_norm": 1.7997468709945679, + "learning_rate": 4.998969222576978e-05, + "loss": 7.6497, + "step": 1538 + }, + { + "epoch": 0.009152868969454752, + "grad_norm": 2.234931230545044, + "learning_rate": 4.998967880948104e-05, + "loss": 7.1636, + "step": 1539 + }, + { + "epoch": 0.009158816252735751, + "grad_norm": 2.150766611099243, + "learning_rate": 4.9989665384468666e-05, + "loss": 6.8621, + "step": 1540 + }, + { + "epoch": 0.009164763536016748, + "grad_norm": 2.9628021717071533, + "learning_rate": 4.998965195073265e-05, + "loss": 6.5059, + "step": 1541 + }, + { + "epoch": 0.009170710819297745, + "grad_norm": 2.720155715942383, + "learning_rate": 4.998963850827301e-05, + "loss": 7.0129, + "step": 1542 + }, + { + "epoch": 0.009176658102578741, + "grad_norm": 2.994684934616089, + "learning_rate": 4.9989625057089744e-05, + "loss": 7.3621, + "step": 1543 + }, + { + "epoch": 0.00918260538585974, + "grad_norm": 2.5991618633270264, + "learning_rate": 4.998961159718286e-05, + "loss": 6.7278, + "step": 1544 + }, + { + "epoch": 0.009188552669140737, + "grad_norm": 2.406353712081909, + "learning_rate": 4.9989598128552355e-05, + "loss": 7.5987, + "step": 1545 + }, + { + "epoch": 0.009194499952421734, + "grad_norm": 3.1308467388153076, + "learning_rate": 4.998958465119824e-05, + "loss": 7.1947, + "step": 1546 + }, + { + "epoch": 0.00920044723570273, + "grad_norm": 2.5381908416748047, + "learning_rate": 4.998957116512053e-05, + "loss": 6.8415, + "step": 1547 + }, + { + "epoch": 0.009206394518983729, + "grad_norm": 2.666410446166992, + "learning_rate": 4.998955767031921e-05, + "loss": 6.9052, + "step": 1548 + }, + { + "epoch": 0.009212341802264726, + "grad_norm": 2.156036138534546, + "learning_rate": 4.9989544166794286e-05, + "loss": 7.6604, + "step": 1549 + }, + { + "epoch": 0.009218289085545723, + "grad_norm": 2.620114803314209, + "learning_rate": 4.998953065454578e-05, + "loss": 6.5475, + "step": 1550 + }, + { + "epoch": 0.00922423636882672, + "grad_norm": 3.2780802249908447, + "learning_rate": 4.9989517133573694e-05, + "loss": 7.0572, + "step": 1551 + }, + { + "epoch": 0.009230183652107718, + "grad_norm": 3.6108100414276123, + "learning_rate": 4.998950360387802e-05, + "loss": 7.0149, + "step": 1552 + }, + { + "epoch": 0.009236130935388715, + "grad_norm": 3.4336259365081787, + "learning_rate": 4.998949006545876e-05, + "loss": 7.2436, + "step": 1553 + }, + { + "epoch": 0.009242078218669712, + "grad_norm": 3.271630048751831, + "learning_rate": 4.9989476518315934e-05, + "loss": 7.3807, + "step": 1554 + }, + { + "epoch": 0.009248025501950708, + "grad_norm": 3.0718438625335693, + "learning_rate": 4.998946296244954e-05, + "loss": 7.2313, + "step": 1555 + }, + { + "epoch": 0.009253972785231707, + "grad_norm": 2.2010579109191895, + "learning_rate": 4.9989449397859575e-05, + "loss": 7.4269, + "step": 1556 + }, + { + "epoch": 0.009259920068512704, + "grad_norm": 2.9805495738983154, + "learning_rate": 4.998943582454607e-05, + "loss": 7.2107, + "step": 1557 + }, + { + "epoch": 0.0092658673517937, + "grad_norm": 2.8313159942626953, + "learning_rate": 4.9989422242508995e-05, + "loss": 7.0453, + "step": 1558 + }, + { + "epoch": 0.009271814635074697, + "grad_norm": 2.7660701274871826, + "learning_rate": 4.998940865174837e-05, + "loss": 7.2205, + "step": 1559 + }, + { + "epoch": 0.009277761918355694, + "grad_norm": 3.808122396469116, + "learning_rate": 4.998939505226421e-05, + "loss": 6.9966, + "step": 1560 + }, + { + "epoch": 0.009283709201636693, + "grad_norm": 3.188976526260376, + "learning_rate": 4.99893814440565e-05, + "loss": 7.0049, + "step": 1561 + }, + { + "epoch": 0.00928965648491769, + "grad_norm": 2.5491533279418945, + "learning_rate": 4.998936782712526e-05, + "loss": 7.0451, + "step": 1562 + }, + { + "epoch": 0.009295603768198686, + "grad_norm": 3.4607698917388916, + "learning_rate": 4.99893542014705e-05, + "loss": 7.0304, + "step": 1563 + }, + { + "epoch": 0.009301551051479683, + "grad_norm": 3.4761910438537598, + "learning_rate": 4.99893405670922e-05, + "loss": 6.9787, + "step": 1564 + }, + { + "epoch": 0.009307498334760682, + "grad_norm": 3.15938138961792, + "learning_rate": 4.998932692399039e-05, + "loss": 7.0203, + "step": 1565 + }, + { + "epoch": 0.009313445618041679, + "grad_norm": 2.600304126739502, + "learning_rate": 4.9989313272165064e-05, + "loss": 7.0782, + "step": 1566 + }, + { + "epoch": 0.009319392901322675, + "grad_norm": 2.54158616065979, + "learning_rate": 4.9989299611616216e-05, + "loss": 6.8354, + "step": 1567 + }, + { + "epoch": 0.009325340184603672, + "grad_norm": 3.4649429321289062, + "learning_rate": 4.9989285942343864e-05, + "loss": 6.8238, + "step": 1568 + }, + { + "epoch": 0.00933128746788467, + "grad_norm": 2.522388458251953, + "learning_rate": 4.998927226434802e-05, + "loss": 6.9544, + "step": 1569 + }, + { + "epoch": 0.009337234751165668, + "grad_norm": 4.074129581451416, + "learning_rate": 4.9989258577628675e-05, + "loss": 6.7229, + "step": 1570 + }, + { + "epoch": 0.009343182034446664, + "grad_norm": 3.395894765853882, + "learning_rate": 4.998924488218584e-05, + "loss": 7.1372, + "step": 1571 + }, + { + "epoch": 0.009349129317727661, + "grad_norm": 2.9850378036499023, + "learning_rate": 4.9989231178019516e-05, + "loss": 6.8966, + "step": 1572 + }, + { + "epoch": 0.00935507660100866, + "grad_norm": 3.1391544342041016, + "learning_rate": 4.9989217465129704e-05, + "loss": 6.6744, + "step": 1573 + }, + { + "epoch": 0.009361023884289657, + "grad_norm": 3.8727803230285645, + "learning_rate": 4.9989203743516414e-05, + "loss": 6.9359, + "step": 1574 + }, + { + "epoch": 0.009366971167570654, + "grad_norm": 3.466169595718384, + "learning_rate": 4.998919001317966e-05, + "loss": 6.979, + "step": 1575 + }, + { + "epoch": 0.00937291845085165, + "grad_norm": 3.3481826782226562, + "learning_rate": 4.998917627411943e-05, + "loss": 6.7749, + "step": 1576 + }, + { + "epoch": 0.009378865734132649, + "grad_norm": 2.425971031188965, + "learning_rate": 4.9989162526335745e-05, + "loss": 7.0127, + "step": 1577 + }, + { + "epoch": 0.009384813017413646, + "grad_norm": 2.8379313945770264, + "learning_rate": 4.9989148769828595e-05, + "loss": 6.5782, + "step": 1578 + }, + { + "epoch": 0.009390760300694643, + "grad_norm": 3.0456466674804688, + "learning_rate": 4.9989135004597994e-05, + "loss": 6.9832, + "step": 1579 + }, + { + "epoch": 0.00939670758397564, + "grad_norm": 2.690138101577759, + "learning_rate": 4.9989121230643944e-05, + "loss": 7.0079, + "step": 1580 + }, + { + "epoch": 0.009402654867256638, + "grad_norm": 3.683105945587158, + "learning_rate": 4.9989107447966444e-05, + "loss": 7.2734, + "step": 1581 + }, + { + "epoch": 0.009408602150537635, + "grad_norm": 2.3310985565185547, + "learning_rate": 4.9989093656565513e-05, + "loss": 7.2388, + "step": 1582 + }, + { + "epoch": 0.009414549433818632, + "grad_norm": 2.353322982788086, + "learning_rate": 4.998907985644115e-05, + "loss": 7.0612, + "step": 1583 + }, + { + "epoch": 0.009420496717099628, + "grad_norm": 2.8458571434020996, + "learning_rate": 4.9989066047593344e-05, + "loss": 7.3093, + "step": 1584 + }, + { + "epoch": 0.009426444000380627, + "grad_norm": 2.3322811126708984, + "learning_rate": 4.9989052230022125e-05, + "loss": 6.983, + "step": 1585 + }, + { + "epoch": 0.009432391283661624, + "grad_norm": 2.7431764602661133, + "learning_rate": 4.998903840372748e-05, + "loss": 6.9694, + "step": 1586 + }, + { + "epoch": 0.00943833856694262, + "grad_norm": 2.7704508304595947, + "learning_rate": 4.998902456870942e-05, + "loss": 6.7727, + "step": 1587 + }, + { + "epoch": 0.009444285850223617, + "grad_norm": 2.4920814037323, + "learning_rate": 4.998901072496796e-05, + "loss": 7.0612, + "step": 1588 + }, + { + "epoch": 0.009450233133504616, + "grad_norm": 2.5911498069763184, + "learning_rate": 4.998899687250308e-05, + "loss": 6.8774, + "step": 1589 + }, + { + "epoch": 0.009456180416785613, + "grad_norm": 2.7269680500030518, + "learning_rate": 4.998898301131481e-05, + "loss": 7.0782, + "step": 1590 + }, + { + "epoch": 0.00946212770006661, + "grad_norm": 2.9707436561584473, + "learning_rate": 4.998896914140314e-05, + "loss": 7.307, + "step": 1591 + }, + { + "epoch": 0.009468074983347606, + "grad_norm": 3.064683675765991, + "learning_rate": 4.998895526276808e-05, + "loss": 7.3708, + "step": 1592 + }, + { + "epoch": 0.009474022266628603, + "grad_norm": 2.4465317726135254, + "learning_rate": 4.998894137540963e-05, + "loss": 7.0085, + "step": 1593 + }, + { + "epoch": 0.009479969549909602, + "grad_norm": 3.3061211109161377, + "learning_rate": 4.99889274793278e-05, + "loss": 6.8353, + "step": 1594 + }, + { + "epoch": 0.009485916833190599, + "grad_norm": 3.283397912979126, + "learning_rate": 4.9988913574522594e-05, + "loss": 6.6848, + "step": 1595 + }, + { + "epoch": 0.009491864116471595, + "grad_norm": 2.770745277404785, + "learning_rate": 4.9988899660994014e-05, + "loss": 7.1742, + "step": 1596 + }, + { + "epoch": 0.009497811399752592, + "grad_norm": 2.7975432872772217, + "learning_rate": 4.998888573874207e-05, + "loss": 6.7329, + "step": 1597 + }, + { + "epoch": 0.00950375868303359, + "grad_norm": 2.545919418334961, + "learning_rate": 4.998887180776677e-05, + "loss": 6.7203, + "step": 1598 + }, + { + "epoch": 0.009509705966314588, + "grad_norm": 2.7961528301239014, + "learning_rate": 4.99888578680681e-05, + "loss": 7.384, + "step": 1599 + }, + { + "epoch": 0.009515653249595584, + "grad_norm": 2.570570230484009, + "learning_rate": 4.9988843919646096e-05, + "loss": 7.0246, + "step": 1600 + }, + { + "epoch": 0.009521600532876581, + "grad_norm": 2.5365843772888184, + "learning_rate": 4.9988829962500734e-05, + "loss": 6.8801, + "step": 1601 + }, + { + "epoch": 0.00952754781615758, + "grad_norm": 2.4713737964630127, + "learning_rate": 4.998881599663203e-05, + "loss": 7.1974, + "step": 1602 + }, + { + "epoch": 0.009533495099438577, + "grad_norm": 2.5286331176757812, + "learning_rate": 4.998880202203999e-05, + "loss": 7.26, + "step": 1603 + }, + { + "epoch": 0.009539442382719573, + "grad_norm": 2.2333719730377197, + "learning_rate": 4.998878803872461e-05, + "loss": 7.3254, + "step": 1604 + }, + { + "epoch": 0.00954538966600057, + "grad_norm": 2.544095277786255, + "learning_rate": 4.9988774046685915e-05, + "loss": 7.407, + "step": 1605 + }, + { + "epoch": 0.009551336949281569, + "grad_norm": 3.057140588760376, + "learning_rate": 4.9988760045923886e-05, + "loss": 6.5303, + "step": 1606 + }, + { + "epoch": 0.009557284232562566, + "grad_norm": 3.0190670490264893, + "learning_rate": 4.998874603643854e-05, + "loss": 6.3276, + "step": 1607 + }, + { + "epoch": 0.009563231515843562, + "grad_norm": 2.208249568939209, + "learning_rate": 4.998873201822989e-05, + "loss": 6.856, + "step": 1608 + }, + { + "epoch": 0.00956917879912456, + "grad_norm": 2.3519229888916016, + "learning_rate": 4.998871799129793e-05, + "loss": 6.9854, + "step": 1609 + }, + { + "epoch": 0.009575126082405558, + "grad_norm": 2.604816198348999, + "learning_rate": 4.9988703955642655e-05, + "loss": 7.3127, + "step": 1610 + }, + { + "epoch": 0.009581073365686555, + "grad_norm": 2.320030927658081, + "learning_rate": 4.9988689911264094e-05, + "loss": 7.216, + "step": 1611 + }, + { + "epoch": 0.009587020648967551, + "grad_norm": 2.8475282192230225, + "learning_rate": 4.998867585816224e-05, + "loss": 6.6743, + "step": 1612 + }, + { + "epoch": 0.009592967932248548, + "grad_norm": 2.518707036972046, + "learning_rate": 4.998866179633709e-05, + "loss": 7.0257, + "step": 1613 + }, + { + "epoch": 0.009598915215529547, + "grad_norm": 2.7348618507385254, + "learning_rate": 4.998864772578866e-05, + "loss": 7.1933, + "step": 1614 + }, + { + "epoch": 0.009604862498810544, + "grad_norm": 2.5701184272766113, + "learning_rate": 4.9988633646516946e-05, + "loss": 7.1071, + "step": 1615 + }, + { + "epoch": 0.00961080978209154, + "grad_norm": 2.916544198989868, + "learning_rate": 4.998861955852197e-05, + "loss": 7.1331, + "step": 1616 + }, + { + "epoch": 0.009616757065372537, + "grad_norm": 2.390934944152832, + "learning_rate": 4.998860546180371e-05, + "loss": 7.3252, + "step": 1617 + }, + { + "epoch": 0.009622704348653536, + "grad_norm": 2.6720097064971924, + "learning_rate": 4.998859135636219e-05, + "loss": 7.0105, + "step": 1618 + }, + { + "epoch": 0.009628651631934533, + "grad_norm": 2.3859329223632812, + "learning_rate": 4.998857724219742e-05, + "loss": 7.023, + "step": 1619 + }, + { + "epoch": 0.00963459891521553, + "grad_norm": 2.9713187217712402, + "learning_rate": 4.998856311930939e-05, + "loss": 7.0338, + "step": 1620 + }, + { + "epoch": 0.009640546198496526, + "grad_norm": 2.33858060836792, + "learning_rate": 4.998854898769811e-05, + "loss": 7.0103, + "step": 1621 + }, + { + "epoch": 0.009646493481777523, + "grad_norm": 2.8897042274475098, + "learning_rate": 4.9988534847363585e-05, + "loss": 7.1225, + "step": 1622 + }, + { + "epoch": 0.009652440765058522, + "grad_norm": 2.354513645172119, + "learning_rate": 4.9988520698305826e-05, + "loss": 6.9272, + "step": 1623 + }, + { + "epoch": 0.009658388048339519, + "grad_norm": 2.5571863651275635, + "learning_rate": 4.9988506540524826e-05, + "loss": 6.3418, + "step": 1624 + }, + { + "epoch": 0.009664335331620515, + "grad_norm": 2.342381238937378, + "learning_rate": 4.99884923740206e-05, + "loss": 6.4265, + "step": 1625 + }, + { + "epoch": 0.009670282614901512, + "grad_norm": 2.5594370365142822, + "learning_rate": 4.998847819879315e-05, + "loss": 6.9801, + "step": 1626 + }, + { + "epoch": 0.00967622989818251, + "grad_norm": 3.6932148933410645, + "learning_rate": 4.9988464014842476e-05, + "loss": 7.0231, + "step": 1627 + }, + { + "epoch": 0.009682177181463508, + "grad_norm": 2.713508367538452, + "learning_rate": 4.998844982216859e-05, + "loss": 6.9041, + "step": 1628 + }, + { + "epoch": 0.009688124464744504, + "grad_norm": 2.703103542327881, + "learning_rate": 4.99884356207715e-05, + "loss": 6.9272, + "step": 1629 + }, + { + "epoch": 0.009694071748025501, + "grad_norm": 3.228708267211914, + "learning_rate": 4.9988421410651197e-05, + "loss": 6.9242, + "step": 1630 + }, + { + "epoch": 0.0097000190313065, + "grad_norm": 3.3407063484191895, + "learning_rate": 4.9988407191807694e-05, + "loss": 6.8871, + "step": 1631 + }, + { + "epoch": 0.009705966314587497, + "grad_norm": 2.3833165168762207, + "learning_rate": 4.9988392964241005e-05, + "loss": 6.9667, + "step": 1632 + }, + { + "epoch": 0.009711913597868493, + "grad_norm": 3.607023239135742, + "learning_rate": 4.9988378727951123e-05, + "loss": 6.93, + "step": 1633 + }, + { + "epoch": 0.00971786088114949, + "grad_norm": 3.797107219696045, + "learning_rate": 4.9988364482938056e-05, + "loss": 6.8115, + "step": 1634 + }, + { + "epoch": 0.009723808164430489, + "grad_norm": 2.5586941242218018, + "learning_rate": 4.998835022920181e-05, + "loss": 6.7322, + "step": 1635 + }, + { + "epoch": 0.009729755447711486, + "grad_norm": 2.377680540084839, + "learning_rate": 4.9988335966742385e-05, + "loss": 6.7127, + "step": 1636 + }, + { + "epoch": 0.009735702730992482, + "grad_norm": 2.510584592819214, + "learning_rate": 4.998832169555979e-05, + "loss": 6.836, + "step": 1637 + }, + { + "epoch": 0.00974165001427348, + "grad_norm": 2.8817014694213867, + "learning_rate": 4.9988307415654025e-05, + "loss": 6.7812, + "step": 1638 + }, + { + "epoch": 0.009747597297554478, + "grad_norm": 2.878535509109497, + "learning_rate": 4.998829312702511e-05, + "loss": 6.7852, + "step": 1639 + }, + { + "epoch": 0.009753544580835475, + "grad_norm": 2.5870323181152344, + "learning_rate": 4.998827882967304e-05, + "loss": 6.8569, + "step": 1640 + }, + { + "epoch": 0.009759491864116471, + "grad_norm": 2.7275760173797607, + "learning_rate": 4.998826452359782e-05, + "loss": 6.8304, + "step": 1641 + }, + { + "epoch": 0.009765439147397468, + "grad_norm": 2.24550461769104, + "learning_rate": 4.998825020879945e-05, + "loss": 6.7609, + "step": 1642 + }, + { + "epoch": 0.009771386430678467, + "grad_norm": 2.2101621627807617, + "learning_rate": 4.9988235885277934e-05, + "loss": 6.7548, + "step": 1643 + }, + { + "epoch": 0.009777333713959464, + "grad_norm": 2.289870023727417, + "learning_rate": 4.9988221553033294e-05, + "loss": 6.8899, + "step": 1644 + }, + { + "epoch": 0.00978328099724046, + "grad_norm": 2.6337740421295166, + "learning_rate": 4.9988207212065516e-05, + "loss": 6.7605, + "step": 1645 + }, + { + "epoch": 0.009789228280521457, + "grad_norm": 2.442605972290039, + "learning_rate": 4.998819286237462e-05, + "loss": 6.6299, + "step": 1646 + }, + { + "epoch": 0.009795175563802456, + "grad_norm": 2.6570451259613037, + "learning_rate": 4.9988178503960606e-05, + "loss": 6.6933, + "step": 1647 + }, + { + "epoch": 0.009801122847083453, + "grad_norm": 2.597043752670288, + "learning_rate": 4.9988164136823467e-05, + "loss": 6.7667, + "step": 1648 + }, + { + "epoch": 0.00980707013036445, + "grad_norm": 3.2576608657836914, + "learning_rate": 4.998814976096323e-05, + "loss": 7.1774, + "step": 1649 + }, + { + "epoch": 0.009813017413645446, + "grad_norm": 3.110119342803955, + "learning_rate": 4.998813537637988e-05, + "loss": 7.2139, + "step": 1650 + }, + { + "epoch": 0.009818964696926445, + "grad_norm": 3.038086414337158, + "learning_rate": 4.998812098307343e-05, + "loss": 7.2752, + "step": 1651 + }, + { + "epoch": 0.009824911980207442, + "grad_norm": 2.965916872024536, + "learning_rate": 4.998810658104389e-05, + "loss": 7.1151, + "step": 1652 + }, + { + "epoch": 0.009830859263488438, + "grad_norm": 3.011476755142212, + "learning_rate": 4.998809217029126e-05, + "loss": 7.1335, + "step": 1653 + }, + { + "epoch": 0.009836806546769435, + "grad_norm": 3.8196349143981934, + "learning_rate": 4.9988077750815534e-05, + "loss": 7.0865, + "step": 1654 + }, + { + "epoch": 0.009842753830050432, + "grad_norm": 3.2577872276306152, + "learning_rate": 4.998806332261674e-05, + "loss": 7.4285, + "step": 1655 + }, + { + "epoch": 0.00984870111333143, + "grad_norm": 2.847039222717285, + "learning_rate": 4.998804888569487e-05, + "loss": 7.3251, + "step": 1656 + }, + { + "epoch": 0.009854648396612428, + "grad_norm": 3.4066355228424072, + "learning_rate": 4.998803444004992e-05, + "loss": 7.3137, + "step": 1657 + }, + { + "epoch": 0.009860595679893424, + "grad_norm": 3.6774044036865234, + "learning_rate": 4.998801998568192e-05, + "loss": 7.0772, + "step": 1658 + }, + { + "epoch": 0.009866542963174421, + "grad_norm": 3.1404600143432617, + "learning_rate": 4.998800552259085e-05, + "loss": 7.1143, + "step": 1659 + }, + { + "epoch": 0.00987249024645542, + "grad_norm": 3.6337625980377197, + "learning_rate": 4.998799105077674e-05, + "loss": 7.1296, + "step": 1660 + }, + { + "epoch": 0.009878437529736417, + "grad_norm": 4.551114082336426, + "learning_rate": 4.9987976570239566e-05, + "loss": 7.1343, + "step": 1661 + }, + { + "epoch": 0.009884384813017413, + "grad_norm": 3.2305374145507812, + "learning_rate": 4.998796208097935e-05, + "loss": 7.0852, + "step": 1662 + }, + { + "epoch": 0.00989033209629841, + "grad_norm": 2.5174615383148193, + "learning_rate": 4.99879475829961e-05, + "loss": 7.2315, + "step": 1663 + }, + { + "epoch": 0.009896279379579409, + "grad_norm": 3.623525381088257, + "learning_rate": 4.9987933076289804e-05, + "loss": 7.4222, + "step": 1664 + }, + { + "epoch": 0.009902226662860406, + "grad_norm": 4.217465877532959, + "learning_rate": 4.998791856086049e-05, + "loss": 7.4003, + "step": 1665 + }, + { + "epoch": 0.009908173946141402, + "grad_norm": 2.42301344871521, + "learning_rate": 4.998790403670815e-05, + "loss": 7.3295, + "step": 1666 + }, + { + "epoch": 0.0099141212294224, + "grad_norm": 2.3003029823303223, + "learning_rate": 4.998788950383279e-05, + "loss": 7.2072, + "step": 1667 + }, + { + "epoch": 0.009920068512703398, + "grad_norm": 3.3792307376861572, + "learning_rate": 4.9987874962234414e-05, + "loss": 7.2882, + "step": 1668 + }, + { + "epoch": 0.009926015795984395, + "grad_norm": 3.42130184173584, + "learning_rate": 4.998786041191303e-05, + "loss": 7.1231, + "step": 1669 + }, + { + "epoch": 0.009931963079265391, + "grad_norm": 3.496676445007324, + "learning_rate": 4.9987845852868644e-05, + "loss": 7.2535, + "step": 1670 + }, + { + "epoch": 0.009937910362546388, + "grad_norm": 2.695780038833618, + "learning_rate": 4.9987831285101255e-05, + "loss": 7.3784, + "step": 1671 + }, + { + "epoch": 0.009943857645827387, + "grad_norm": 2.2745561599731445, + "learning_rate": 4.998781670861088e-05, + "loss": 7.1184, + "step": 1672 + }, + { + "epoch": 0.009949804929108384, + "grad_norm": 3.8487844467163086, + "learning_rate": 4.99878021233975e-05, + "loss": 7.277, + "step": 1673 + }, + { + "epoch": 0.00995575221238938, + "grad_norm": 2.6628305912017822, + "learning_rate": 4.998778752946115e-05, + "loss": 6.8204, + "step": 1674 + }, + { + "epoch": 0.009961699495670377, + "grad_norm": 3.6330301761627197, + "learning_rate": 4.998777292680182e-05, + "loss": 7.3003, + "step": 1675 + }, + { + "epoch": 0.009967646778951376, + "grad_norm": 2.644237995147705, + "learning_rate": 4.998775831541952e-05, + "loss": 7.1492, + "step": 1676 + }, + { + "epoch": 0.009973594062232373, + "grad_norm": 2.895193099975586, + "learning_rate": 4.998774369531424e-05, + "loss": 7.3986, + "step": 1677 + }, + { + "epoch": 0.00997954134551337, + "grad_norm": 3.2180328369140625, + "learning_rate": 4.998772906648601e-05, + "loss": 7.1085, + "step": 1678 + }, + { + "epoch": 0.009985488628794366, + "grad_norm": 3.5874838829040527, + "learning_rate": 4.9987714428934815e-05, + "loss": 6.9554, + "step": 1679 + }, + { + "epoch": 0.009991435912075365, + "grad_norm": 2.419516086578369, + "learning_rate": 4.9987699782660666e-05, + "loss": 6.6222, + "step": 1680 + }, + { + "epoch": 0.009997383195356362, + "grad_norm": 2.715808153152466, + "learning_rate": 4.9987685127663574e-05, + "loss": 6.8417, + "step": 1681 + }, + { + "epoch": 0.010003330478637358, + "grad_norm": 2.2847111225128174, + "learning_rate": 4.9987670463943534e-05, + "loss": 7.1649, + "step": 1682 + }, + { + "epoch": 0.010009277761918355, + "grad_norm": 2.402684450149536, + "learning_rate": 4.998765579150056e-05, + "loss": 7.6113, + "step": 1683 + }, + { + "epoch": 0.010015225045199352, + "grad_norm": 2.54388689994812, + "learning_rate": 4.998764111033465e-05, + "loss": 7.1261, + "step": 1684 + }, + { + "epoch": 0.01002117232848035, + "grad_norm": 2.8077542781829834, + "learning_rate": 4.9987626420445823e-05, + "loss": 7.1349, + "step": 1685 + }, + { + "epoch": 0.010027119611761347, + "grad_norm": 2.228707790374756, + "learning_rate": 4.9987611721834063e-05, + "loss": 7.1123, + "step": 1686 + }, + { + "epoch": 0.010033066895042344, + "grad_norm": 2.648607015609741, + "learning_rate": 4.998759701449939e-05, + "loss": 7.0263, + "step": 1687 + }, + { + "epoch": 0.010039014178323341, + "grad_norm": 3.0278162956237793, + "learning_rate": 4.99875822984418e-05, + "loss": 6.6463, + "step": 1688 + }, + { + "epoch": 0.01004496146160434, + "grad_norm": 3.1550052165985107, + "learning_rate": 4.998756757366131e-05, + "loss": 6.8773, + "step": 1689 + }, + { + "epoch": 0.010050908744885336, + "grad_norm": 3.3911843299865723, + "learning_rate": 4.998755284015792e-05, + "loss": 7.5045, + "step": 1690 + }, + { + "epoch": 0.010056856028166333, + "grad_norm": 2.668861150741577, + "learning_rate": 4.998753809793162e-05, + "loss": 7.5545, + "step": 1691 + }, + { + "epoch": 0.01006280331144733, + "grad_norm": 2.182792901992798, + "learning_rate": 4.998752334698244e-05, + "loss": 7.2315, + "step": 1692 + }, + { + "epoch": 0.010068750594728329, + "grad_norm": 2.981476068496704, + "learning_rate": 4.998750858731037e-05, + "loss": 7.3455, + "step": 1693 + }, + { + "epoch": 0.010074697878009325, + "grad_norm": 3.1855525970458984, + "learning_rate": 4.998749381891542e-05, + "loss": 7.3408, + "step": 1694 + }, + { + "epoch": 0.010080645161290322, + "grad_norm": 2.5677361488342285, + "learning_rate": 4.998747904179759e-05, + "loss": 6.7591, + "step": 1695 + }, + { + "epoch": 0.010086592444571319, + "grad_norm": 2.7397539615631104, + "learning_rate": 4.9987464255956894e-05, + "loss": 7.3976, + "step": 1696 + }, + { + "epoch": 0.010092539727852318, + "grad_norm": 2.1141586303710938, + "learning_rate": 4.998744946139333e-05, + "loss": 7.4287, + "step": 1697 + }, + { + "epoch": 0.010098487011133314, + "grad_norm": 2.1999096870422363, + "learning_rate": 4.998743465810691e-05, + "loss": 7.4804, + "step": 1698 + }, + { + "epoch": 0.010104434294414311, + "grad_norm": 2.4150960445404053, + "learning_rate": 4.9987419846097634e-05, + "loss": 7.1743, + "step": 1699 + }, + { + "epoch": 0.010110381577695308, + "grad_norm": 2.564270496368408, + "learning_rate": 4.998740502536551e-05, + "loss": 7.262, + "step": 1700 + }, + { + "epoch": 0.010116328860976307, + "grad_norm": 3.045964241027832, + "learning_rate": 4.9987390195910536e-05, + "loss": 7.0778, + "step": 1701 + }, + { + "epoch": 0.010122276144257304, + "grad_norm": 3.2720210552215576, + "learning_rate": 4.998737535773272e-05, + "loss": 7.2188, + "step": 1702 + }, + { + "epoch": 0.0101282234275383, + "grad_norm": 2.54496693611145, + "learning_rate": 4.998736051083207e-05, + "loss": 6.9985, + "step": 1703 + }, + { + "epoch": 0.010134170710819297, + "grad_norm": 3.6252541542053223, + "learning_rate": 4.998734565520859e-05, + "loss": 7.3502, + "step": 1704 + }, + { + "epoch": 0.010140117994100296, + "grad_norm": 3.468963146209717, + "learning_rate": 4.99873307908623e-05, + "loss": 6.9642, + "step": 1705 + }, + { + "epoch": 0.010146065277381293, + "grad_norm": 2.8778045177459717, + "learning_rate": 4.9987315917793174e-05, + "loss": 6.8675, + "step": 1706 + }, + { + "epoch": 0.01015201256066229, + "grad_norm": 2.4492053985595703, + "learning_rate": 4.9987301036001236e-05, + "loss": 7.3484, + "step": 1707 + }, + { + "epoch": 0.010157959843943286, + "grad_norm": 2.5170838832855225, + "learning_rate": 4.99872861454865e-05, + "loss": 7.6004, + "step": 1708 + }, + { + "epoch": 0.010163907127224285, + "grad_norm": 2.3539648056030273, + "learning_rate": 4.998727124624895e-05, + "loss": 7.3304, + "step": 1709 + }, + { + "epoch": 0.010169854410505282, + "grad_norm": 2.6097705364227295, + "learning_rate": 4.998725633828861e-05, + "loss": 7.3227, + "step": 1710 + }, + { + "epoch": 0.010175801693786278, + "grad_norm": 2.5909392833709717, + "learning_rate": 4.9987241421605466e-05, + "loss": 7.3797, + "step": 1711 + }, + { + "epoch": 0.010181748977067275, + "grad_norm": 3.143157958984375, + "learning_rate": 4.998722649619954e-05, + "loss": 7.1236, + "step": 1712 + }, + { + "epoch": 0.010187696260348274, + "grad_norm": 2.0621843338012695, + "learning_rate": 4.9987211562070835e-05, + "loss": 7.5322, + "step": 1713 + }, + { + "epoch": 0.01019364354362927, + "grad_norm": 1.7781084775924683, + "learning_rate": 4.9987196619219354e-05, + "loss": 7.428, + "step": 1714 + }, + { + "epoch": 0.010199590826910267, + "grad_norm": 2.3108980655670166, + "learning_rate": 4.9987181667645094e-05, + "loss": 7.3814, + "step": 1715 + }, + { + "epoch": 0.010205538110191264, + "grad_norm": 2.5184621810913086, + "learning_rate": 4.998716670734807e-05, + "loss": 7.374, + "step": 1716 + }, + { + "epoch": 0.010211485393472261, + "grad_norm": 1.9185826778411865, + "learning_rate": 4.9987151738328284e-05, + "loss": 7.3352, + "step": 1717 + }, + { + "epoch": 0.01021743267675326, + "grad_norm": 2.794224262237549, + "learning_rate": 4.998713676058574e-05, + "loss": 7.0293, + "step": 1718 + }, + { + "epoch": 0.010223379960034256, + "grad_norm": 3.601804733276367, + "learning_rate": 4.998712177412045e-05, + "loss": 7.0277, + "step": 1719 + }, + { + "epoch": 0.010229327243315253, + "grad_norm": 3.3258707523345947, + "learning_rate": 4.998710677893241e-05, + "loss": 6.9478, + "step": 1720 + }, + { + "epoch": 0.01023527452659625, + "grad_norm": 3.147439956665039, + "learning_rate": 4.9987091775021625e-05, + "loss": 6.7295, + "step": 1721 + }, + { + "epoch": 0.010241221809877249, + "grad_norm": 2.7821006774902344, + "learning_rate": 4.998707676238811e-05, + "loss": 6.7587, + "step": 1722 + }, + { + "epoch": 0.010247169093158245, + "grad_norm": 2.580597400665283, + "learning_rate": 4.998706174103186e-05, + "loss": 6.9091, + "step": 1723 + }, + { + "epoch": 0.010253116376439242, + "grad_norm": 2.5501208305358887, + "learning_rate": 4.998704671095289e-05, + "loss": 7.3262, + "step": 1724 + }, + { + "epoch": 0.010259063659720239, + "grad_norm": 2.5460124015808105, + "learning_rate": 4.99870316721512e-05, + "loss": 7.278, + "step": 1725 + }, + { + "epoch": 0.010265010943001238, + "grad_norm": 2.0253796577453613, + "learning_rate": 4.998701662462679e-05, + "loss": 7.1757, + "step": 1726 + }, + { + "epoch": 0.010270958226282234, + "grad_norm": 2.3127388954162598, + "learning_rate": 4.998700156837968e-05, + "loss": 7.1057, + "step": 1727 + }, + { + "epoch": 0.010276905509563231, + "grad_norm": 2.931878089904785, + "learning_rate": 4.998698650340986e-05, + "loss": 6.9993, + "step": 1728 + }, + { + "epoch": 0.010282852792844228, + "grad_norm": 3.239272356033325, + "learning_rate": 4.998697142971734e-05, + "loss": 6.7754, + "step": 1729 + }, + { + "epoch": 0.010288800076125227, + "grad_norm": 2.388212203979492, + "learning_rate": 4.998695634730213e-05, + "loss": 7.2794, + "step": 1730 + }, + { + "epoch": 0.010294747359406223, + "grad_norm": 2.7766799926757812, + "learning_rate": 4.998694125616423e-05, + "loss": 7.4636, + "step": 1731 + }, + { + "epoch": 0.01030069464268722, + "grad_norm": 2.543757915496826, + "learning_rate": 4.9986926156303646e-05, + "loss": 6.8801, + "step": 1732 + }, + { + "epoch": 0.010306641925968217, + "grad_norm": 1.8907097578048706, + "learning_rate": 4.9986911047720384e-05, + "loss": 7.0353, + "step": 1733 + }, + { + "epoch": 0.010312589209249216, + "grad_norm": 1.9585598707199097, + "learning_rate": 4.9986895930414444e-05, + "loss": 7.0469, + "step": 1734 + }, + { + "epoch": 0.010318536492530212, + "grad_norm": 2.5191497802734375, + "learning_rate": 4.998688080438585e-05, + "loss": 7.1469, + "step": 1735 + }, + { + "epoch": 0.01032448377581121, + "grad_norm": 3.5709545612335205, + "learning_rate": 4.998686566963459e-05, + "loss": 7.0499, + "step": 1736 + }, + { + "epoch": 0.010330431059092206, + "grad_norm": 2.3778624534606934, + "learning_rate": 4.998685052616067e-05, + "loss": 7.5897, + "step": 1737 + }, + { + "epoch": 0.010336378342373205, + "grad_norm": 2.0795674324035645, + "learning_rate": 4.9986835373964094e-05, + "loss": 6.8778, + "step": 1738 + }, + { + "epoch": 0.010342325625654201, + "grad_norm": 2.7674901485443115, + "learning_rate": 4.9986820213044875e-05, + "loss": 6.4428, + "step": 1739 + }, + { + "epoch": 0.010348272908935198, + "grad_norm": 2.7203595638275146, + "learning_rate": 4.998680504340302e-05, + "loss": 7.4668, + "step": 1740 + }, + { + "epoch": 0.010354220192216195, + "grad_norm": 2.840240955352783, + "learning_rate": 4.998678986503853e-05, + "loss": 7.2219, + "step": 1741 + }, + { + "epoch": 0.010360167475497194, + "grad_norm": 2.7803452014923096, + "learning_rate": 4.9986774677951404e-05, + "loss": 6.5674, + "step": 1742 + }, + { + "epoch": 0.01036611475877819, + "grad_norm": 2.467574119567871, + "learning_rate": 4.998675948214165e-05, + "loss": 6.9621, + "step": 1743 + }, + { + "epoch": 0.010372062042059187, + "grad_norm": 2.1437904834747314, + "learning_rate": 4.998674427760929e-05, + "loss": 7.1564, + "step": 1744 + }, + { + "epoch": 0.010378009325340184, + "grad_norm": 2.504685163497925, + "learning_rate": 4.9986729064354304e-05, + "loss": 6.8836, + "step": 1745 + }, + { + "epoch": 0.010383956608621183, + "grad_norm": 2.401296615600586, + "learning_rate": 4.998671384237671e-05, + "loss": 7.2906, + "step": 1746 + }, + { + "epoch": 0.01038990389190218, + "grad_norm": 2.233701705932617, + "learning_rate": 4.9986698611676516e-05, + "loss": 6.6854, + "step": 1747 + }, + { + "epoch": 0.010395851175183176, + "grad_norm": 2.9597983360290527, + "learning_rate": 4.998668337225373e-05, + "loss": 6.8859, + "step": 1748 + }, + { + "epoch": 0.010401798458464173, + "grad_norm": 3.2164804935455322, + "learning_rate": 4.998666812410834e-05, + "loss": 6.8255, + "step": 1749 + }, + { + "epoch": 0.01040774574174517, + "grad_norm": 3.010002374649048, + "learning_rate": 4.9986652867240364e-05, + "loss": 6.7092, + "step": 1750 + }, + { + "epoch": 0.010413693025026169, + "grad_norm": 2.8442068099975586, + "learning_rate": 4.998663760164981e-05, + "loss": 6.7231, + "step": 1751 + }, + { + "epoch": 0.010419640308307165, + "grad_norm": 3.127922773361206, + "learning_rate": 4.9986622327336676e-05, + "loss": 6.6072, + "step": 1752 + }, + { + "epoch": 0.010425587591588162, + "grad_norm": 2.7306833267211914, + "learning_rate": 4.998660704430097e-05, + "loss": 6.696, + "step": 1753 + }, + { + "epoch": 0.010431534874869159, + "grad_norm": 2.9005799293518066, + "learning_rate": 4.99865917525427e-05, + "loss": 6.6598, + "step": 1754 + }, + { + "epoch": 0.010437482158150158, + "grad_norm": 3.17934513092041, + "learning_rate": 4.9986576452061865e-05, + "loss": 6.5887, + "step": 1755 + }, + { + "epoch": 0.010443429441431154, + "grad_norm": 2.9390244483947754, + "learning_rate": 4.9986561142858476e-05, + "loss": 6.5375, + "step": 1756 + }, + { + "epoch": 0.010449376724712151, + "grad_norm": 2.5547196865081787, + "learning_rate": 4.998654582493254e-05, + "loss": 6.7484, + "step": 1757 + }, + { + "epoch": 0.010455324007993148, + "grad_norm": 2.9969568252563477, + "learning_rate": 4.9986530498284054e-05, + "loss": 6.6496, + "step": 1758 + }, + { + "epoch": 0.010461271291274147, + "grad_norm": 2.843932867050171, + "learning_rate": 4.998651516291303e-05, + "loss": 6.5713, + "step": 1759 + }, + { + "epoch": 0.010467218574555143, + "grad_norm": 2.9114811420440674, + "learning_rate": 4.9986499818819476e-05, + "loss": 7.5248, + "step": 1760 + }, + { + "epoch": 0.01047316585783614, + "grad_norm": 3.0292229652404785, + "learning_rate": 4.998648446600339e-05, + "loss": 7.2346, + "step": 1761 + }, + { + "epoch": 0.010479113141117137, + "grad_norm": 2.553088426589966, + "learning_rate": 4.998646910446478e-05, + "loss": 7.1531, + "step": 1762 + }, + { + "epoch": 0.010485060424398136, + "grad_norm": 2.9838356971740723, + "learning_rate": 4.998645373420365e-05, + "loss": 6.6561, + "step": 1763 + }, + { + "epoch": 0.010491007707679132, + "grad_norm": 2.8948864936828613, + "learning_rate": 4.9986438355220014e-05, + "loss": 6.463, + "step": 1764 + }, + { + "epoch": 0.01049695499096013, + "grad_norm": 2.805084228515625, + "learning_rate": 4.9986422967513856e-05, + "loss": 6.701, + "step": 1765 + }, + { + "epoch": 0.010502902274241126, + "grad_norm": 2.748077869415283, + "learning_rate": 4.998640757108522e-05, + "loss": 7.3223, + "step": 1766 + }, + { + "epoch": 0.010508849557522125, + "grad_norm": 3.0048258304595947, + "learning_rate": 4.998639216593406e-05, + "loss": 7.2582, + "step": 1767 + }, + { + "epoch": 0.010514796840803121, + "grad_norm": 2.538522958755493, + "learning_rate": 4.998637675206043e-05, + "loss": 7.1208, + "step": 1768 + }, + { + "epoch": 0.010520744124084118, + "grad_norm": 2.2091188430786133, + "learning_rate": 4.99863613294643e-05, + "loss": 7.0577, + "step": 1769 + }, + { + "epoch": 0.010526691407365115, + "grad_norm": 2.8454909324645996, + "learning_rate": 4.998634589814569e-05, + "loss": 7.1296, + "step": 1770 + }, + { + "epoch": 0.010532638690646114, + "grad_norm": 3.4139351844787598, + "learning_rate": 4.998633045810461e-05, + "loss": 6.9565, + "step": 1771 + }, + { + "epoch": 0.01053858597392711, + "grad_norm": 2.3192107677459717, + "learning_rate": 4.9986315009341066e-05, + "loss": 6.6027, + "step": 1772 + }, + { + "epoch": 0.010544533257208107, + "grad_norm": 2.309290647506714, + "learning_rate": 4.998629955185505e-05, + "loss": 7.0417, + "step": 1773 + }, + { + "epoch": 0.010550480540489104, + "grad_norm": 3.2046520709991455, + "learning_rate": 4.998628408564657e-05, + "loss": 7.0368, + "step": 1774 + }, + { + "epoch": 0.010556427823770103, + "grad_norm": 2.459064483642578, + "learning_rate": 4.9986268610715646e-05, + "loss": 7.2726, + "step": 1775 + }, + { + "epoch": 0.0105623751070511, + "grad_norm": 2.602522134780884, + "learning_rate": 4.998625312706227e-05, + "loss": 7.3377, + "step": 1776 + }, + { + "epoch": 0.010568322390332096, + "grad_norm": 3.9599175453186035, + "learning_rate": 4.998623763468645e-05, + "loss": 6.9146, + "step": 1777 + }, + { + "epoch": 0.010574269673613093, + "grad_norm": 3.312527894973755, + "learning_rate": 4.99862221335882e-05, + "loss": 6.7457, + "step": 1778 + }, + { + "epoch": 0.01058021695689409, + "grad_norm": 2.5287606716156006, + "learning_rate": 4.9986206623767506e-05, + "loss": 7.2651, + "step": 1779 + }, + { + "epoch": 0.010586164240175088, + "grad_norm": 2.4065616130828857, + "learning_rate": 4.99861911052244e-05, + "loss": 7.1135, + "step": 1780 + }, + { + "epoch": 0.010592111523456085, + "grad_norm": 2.321385383605957, + "learning_rate": 4.998617557795886e-05, + "loss": 7.1985, + "step": 1781 + }, + { + "epoch": 0.010598058806737082, + "grad_norm": 2.118995189666748, + "learning_rate": 4.9986160041970906e-05, + "loss": 7.2832, + "step": 1782 + }, + { + "epoch": 0.010604006090018079, + "grad_norm": 2.2536606788635254, + "learning_rate": 4.9986144497260544e-05, + "loss": 7.191, + "step": 1783 + }, + { + "epoch": 0.010609953373299078, + "grad_norm": 2.2956738471984863, + "learning_rate": 4.998612894382778e-05, + "loss": 7.0496, + "step": 1784 + }, + { + "epoch": 0.010615900656580074, + "grad_norm": 2.4258289337158203, + "learning_rate": 4.9986113381672614e-05, + "loss": 7.2767, + "step": 1785 + }, + { + "epoch": 0.010621847939861071, + "grad_norm": 2.4731507301330566, + "learning_rate": 4.998609781079505e-05, + "loss": 6.8805, + "step": 1786 + }, + { + "epoch": 0.010627795223142068, + "grad_norm": 2.3245391845703125, + "learning_rate": 4.9986082231195105e-05, + "loss": 6.8921, + "step": 1787 + }, + { + "epoch": 0.010633742506423067, + "grad_norm": 2.6239898204803467, + "learning_rate": 4.998606664287278e-05, + "loss": 6.9353, + "step": 1788 + }, + { + "epoch": 0.010639689789704063, + "grad_norm": 2.186162233352661, + "learning_rate": 4.9986051045828065e-05, + "loss": 6.8466, + "step": 1789 + }, + { + "epoch": 0.01064563707298506, + "grad_norm": 2.2362232208251953, + "learning_rate": 4.998603544006098e-05, + "loss": 6.82, + "step": 1790 + }, + { + "epoch": 0.010651584356266057, + "grad_norm": 2.2302427291870117, + "learning_rate": 4.998601982557153e-05, + "loss": 6.7034, + "step": 1791 + }, + { + "epoch": 0.010657531639547056, + "grad_norm": 2.0393195152282715, + "learning_rate": 4.998600420235972e-05, + "loss": 6.6646, + "step": 1792 + }, + { + "epoch": 0.010663478922828052, + "grad_norm": 1.976536512374878, + "learning_rate": 4.9985988570425556e-05, + "loss": 6.4994, + "step": 1793 + }, + { + "epoch": 0.01066942620610905, + "grad_norm": 2.4167046546936035, + "learning_rate": 4.998597292976904e-05, + "loss": 6.7849, + "step": 1794 + }, + { + "epoch": 0.010675373489390046, + "grad_norm": 2.3077776432037354, + "learning_rate": 4.998595728039018e-05, + "loss": 6.8356, + "step": 1795 + }, + { + "epoch": 0.010681320772671045, + "grad_norm": 2.5263309478759766, + "learning_rate": 4.998594162228898e-05, + "loss": 6.6351, + "step": 1796 + }, + { + "epoch": 0.010687268055952041, + "grad_norm": 2.153365135192871, + "learning_rate": 4.9985925955465443e-05, + "loss": 6.7911, + "step": 1797 + }, + { + "epoch": 0.010693215339233038, + "grad_norm": 3.3034393787384033, + "learning_rate": 4.998591027991958e-05, + "loss": 6.7589, + "step": 1798 + }, + { + "epoch": 0.010699162622514035, + "grad_norm": 2.2177388668060303, + "learning_rate": 4.998589459565139e-05, + "loss": 6.571, + "step": 1799 + }, + { + "epoch": 0.010705109905795034, + "grad_norm": 2.3165230751037598, + "learning_rate": 4.9985878902660886e-05, + "loss": 6.9124, + "step": 1800 + }, + { + "epoch": 0.01071105718907603, + "grad_norm": 2.270045757293701, + "learning_rate": 4.998586320094807e-05, + "loss": 6.4442, + "step": 1801 + }, + { + "epoch": 0.010717004472357027, + "grad_norm": 2.1198744773864746, + "learning_rate": 4.9985847490512945e-05, + "loss": 6.555, + "step": 1802 + }, + { + "epoch": 0.010722951755638024, + "grad_norm": 2.5428359508514404, + "learning_rate": 4.998583177135552e-05, + "loss": 6.8991, + "step": 1803 + }, + { + "epoch": 0.010728899038919023, + "grad_norm": 1.983817219734192, + "learning_rate": 4.99858160434758e-05, + "loss": 6.6428, + "step": 1804 + }, + { + "epoch": 0.01073484632220002, + "grad_norm": 2.2749712467193604, + "learning_rate": 4.998580030687379e-05, + "loss": 6.7294, + "step": 1805 + }, + { + "epoch": 0.010740793605481016, + "grad_norm": 1.914762258529663, + "learning_rate": 4.998578456154949e-05, + "loss": 7.0395, + "step": 1806 + }, + { + "epoch": 0.010746740888762013, + "grad_norm": 1.6850765943527222, + "learning_rate": 4.998576880750292e-05, + "loss": 6.862, + "step": 1807 + }, + { + "epoch": 0.010752688172043012, + "grad_norm": 2.2930233478546143, + "learning_rate": 4.9985753044734076e-05, + "loss": 6.8213, + "step": 1808 + }, + { + "epoch": 0.010758635455324008, + "grad_norm": 2.193464756011963, + "learning_rate": 4.998573727324295e-05, + "loss": 6.9303, + "step": 1809 + }, + { + "epoch": 0.010764582738605005, + "grad_norm": 2.2451658248901367, + "learning_rate": 4.9985721493029576e-05, + "loss": 6.8061, + "step": 1810 + }, + { + "epoch": 0.010770530021886002, + "grad_norm": 2.164214849472046, + "learning_rate": 4.998570570409394e-05, + "loss": 6.6485, + "step": 1811 + }, + { + "epoch": 0.010776477305166999, + "grad_norm": 2.3530375957489014, + "learning_rate": 4.9985689906436054e-05, + "loss": 6.6826, + "step": 1812 + }, + { + "epoch": 0.010782424588447997, + "grad_norm": 3.007641553878784, + "learning_rate": 4.998567410005591e-05, + "loss": 6.0781, + "step": 1813 + }, + { + "epoch": 0.010788371871728994, + "grad_norm": 2.500411033630371, + "learning_rate": 4.998565828495354e-05, + "loss": 7.0544, + "step": 1814 + }, + { + "epoch": 0.010794319155009991, + "grad_norm": 2.329221725463867, + "learning_rate": 4.998564246112893e-05, + "loss": 7.2505, + "step": 1815 + }, + { + "epoch": 0.010800266438290988, + "grad_norm": 2.05120849609375, + "learning_rate": 4.998562662858209e-05, + "loss": 7.3094, + "step": 1816 + }, + { + "epoch": 0.010806213721571986, + "grad_norm": 1.83049738407135, + "learning_rate": 4.9985610787313023e-05, + "loss": 6.7752, + "step": 1817 + }, + { + "epoch": 0.010812161004852983, + "grad_norm": 2.2754576206207275, + "learning_rate": 4.998559493732174e-05, + "loss": 6.9396, + "step": 1818 + }, + { + "epoch": 0.01081810828813398, + "grad_norm": 2.104849338531494, + "learning_rate": 4.998557907860825e-05, + "loss": 7.2624, + "step": 1819 + }, + { + "epoch": 0.010824055571414977, + "grad_norm": 3.152069568634033, + "learning_rate": 4.998556321117254e-05, + "loss": 6.6763, + "step": 1820 + }, + { + "epoch": 0.010830002854695975, + "grad_norm": 3.4046475887298584, + "learning_rate": 4.9985547335014636e-05, + "loss": 6.7145, + "step": 1821 + }, + { + "epoch": 0.010835950137976972, + "grad_norm": 1.9208084344863892, + "learning_rate": 4.9985531450134534e-05, + "loss": 6.8985, + "step": 1822 + }, + { + "epoch": 0.010841897421257969, + "grad_norm": 2.4949824810028076, + "learning_rate": 4.998551555653224e-05, + "loss": 6.8196, + "step": 1823 + }, + { + "epoch": 0.010847844704538966, + "grad_norm": 2.613175392150879, + "learning_rate": 4.998549965420776e-05, + "loss": 6.7918, + "step": 1824 + }, + { + "epoch": 0.010853791987819965, + "grad_norm": 2.3322529792785645, + "learning_rate": 4.9985483743161105e-05, + "loss": 6.6133, + "step": 1825 + }, + { + "epoch": 0.010859739271100961, + "grad_norm": 3.116680860519409, + "learning_rate": 4.998546782339227e-05, + "loss": 7.4026, + "step": 1826 + }, + { + "epoch": 0.010865686554381958, + "grad_norm": 2.673938274383545, + "learning_rate": 4.998545189490127e-05, + "loss": 6.9181, + "step": 1827 + }, + { + "epoch": 0.010871633837662955, + "grad_norm": 2.135727643966675, + "learning_rate": 4.998543595768811e-05, + "loss": 6.9514, + "step": 1828 + }, + { + "epoch": 0.010877581120943954, + "grad_norm": 2.241696357727051, + "learning_rate": 4.9985420011752784e-05, + "loss": 7.126, + "step": 1829 + }, + { + "epoch": 0.01088352840422495, + "grad_norm": 2.316342830657959, + "learning_rate": 4.9985404057095315e-05, + "loss": 6.9752, + "step": 1830 + }, + { + "epoch": 0.010889475687505947, + "grad_norm": 2.591611623764038, + "learning_rate": 4.998538809371569e-05, + "loss": 6.8721, + "step": 1831 + }, + { + "epoch": 0.010895422970786944, + "grad_norm": 2.2846317291259766, + "learning_rate": 4.9985372121613935e-05, + "loss": 6.9468, + "step": 1832 + }, + { + "epoch": 0.010901370254067943, + "grad_norm": 2.0799343585968018, + "learning_rate": 4.998535614079004e-05, + "loss": 7.0839, + "step": 1833 + }, + { + "epoch": 0.01090731753734894, + "grad_norm": 2.1908833980560303, + "learning_rate": 4.998534015124401e-05, + "loss": 6.7228, + "step": 1834 + }, + { + "epoch": 0.010913264820629936, + "grad_norm": 2.329401969909668, + "learning_rate": 4.998532415297587e-05, + "loss": 6.715, + "step": 1835 + }, + { + "epoch": 0.010919212103910933, + "grad_norm": 1.9492794275283813, + "learning_rate": 4.998530814598559e-05, + "loss": 6.6762, + "step": 1836 + }, + { + "epoch": 0.010925159387191932, + "grad_norm": 1.9564979076385498, + "learning_rate": 4.998529213027321e-05, + "loss": 6.8545, + "step": 1837 + }, + { + "epoch": 0.010931106670472928, + "grad_norm": 1.8424931764602661, + "learning_rate": 4.998527610583872e-05, + "loss": 6.8505, + "step": 1838 + }, + { + "epoch": 0.010937053953753925, + "grad_norm": 1.9743967056274414, + "learning_rate": 4.998526007268213e-05, + "loss": 6.8413, + "step": 1839 + }, + { + "epoch": 0.010943001237034922, + "grad_norm": 2.31296968460083, + "learning_rate": 4.998524403080345e-05, + "loss": 6.7327, + "step": 1840 + }, + { + "epoch": 0.010948948520315919, + "grad_norm": 2.049689292907715, + "learning_rate": 4.9985227980202665e-05, + "loss": 7.0029, + "step": 1841 + }, + { + "epoch": 0.010954895803596917, + "grad_norm": 2.1640658378601074, + "learning_rate": 4.99852119208798e-05, + "loss": 7.0749, + "step": 1842 + }, + { + "epoch": 0.010960843086877914, + "grad_norm": 1.8896230459213257, + "learning_rate": 4.998519585283486e-05, + "loss": 6.7249, + "step": 1843 + }, + { + "epoch": 0.010966790370158911, + "grad_norm": 2.4835314750671387, + "learning_rate": 4.998517977606785e-05, + "loss": 6.5605, + "step": 1844 + }, + { + "epoch": 0.010972737653439908, + "grad_norm": 2.2472622394561768, + "learning_rate": 4.998516369057876e-05, + "loss": 6.8291, + "step": 1845 + }, + { + "epoch": 0.010978684936720906, + "grad_norm": 2.499096155166626, + "learning_rate": 4.998514759636762e-05, + "loss": 6.6921, + "step": 1846 + }, + { + "epoch": 0.010984632220001903, + "grad_norm": 2.296786308288574, + "learning_rate": 4.998513149343442e-05, + "loss": 7.0475, + "step": 1847 + }, + { + "epoch": 0.0109905795032829, + "grad_norm": 2.2896368503570557, + "learning_rate": 4.998511538177916e-05, + "loss": 6.775, + "step": 1848 + }, + { + "epoch": 0.010996526786563897, + "grad_norm": 2.025575637817383, + "learning_rate": 4.998509926140186e-05, + "loss": 6.9538, + "step": 1849 + }, + { + "epoch": 0.011002474069844895, + "grad_norm": 2.23502779006958, + "learning_rate": 4.9985083132302525e-05, + "loss": 7.0595, + "step": 1850 + }, + { + "epoch": 0.011008421353125892, + "grad_norm": 2.7158777713775635, + "learning_rate": 4.998506699448115e-05, + "loss": 7.0086, + "step": 1851 + }, + { + "epoch": 0.011014368636406889, + "grad_norm": 2.2707183361053467, + "learning_rate": 4.998505084793775e-05, + "loss": 6.6396, + "step": 1852 + }, + { + "epoch": 0.011020315919687886, + "grad_norm": 3.196085214614868, + "learning_rate": 4.998503469267232e-05, + "loss": 6.6026, + "step": 1853 + }, + { + "epoch": 0.011026263202968884, + "grad_norm": 2.4472603797912598, + "learning_rate": 4.9985018528684876e-05, + "loss": 7.1332, + "step": 1854 + }, + { + "epoch": 0.011032210486249881, + "grad_norm": 2.7070915699005127, + "learning_rate": 4.998500235597542e-05, + "loss": 6.9669, + "step": 1855 + }, + { + "epoch": 0.011038157769530878, + "grad_norm": 2.127729654312134, + "learning_rate": 4.998498617454396e-05, + "loss": 6.9589, + "step": 1856 + }, + { + "epoch": 0.011044105052811875, + "grad_norm": 2.2897160053253174, + "learning_rate": 4.99849699843905e-05, + "loss": 7.0402, + "step": 1857 + }, + { + "epoch": 0.011050052336092873, + "grad_norm": 1.888961672782898, + "learning_rate": 4.998495378551504e-05, + "loss": 6.9406, + "step": 1858 + }, + { + "epoch": 0.01105599961937387, + "grad_norm": 1.9889254570007324, + "learning_rate": 4.9984937577917594e-05, + "loss": 6.8392, + "step": 1859 + }, + { + "epoch": 0.011061946902654867, + "grad_norm": 3.042891025543213, + "learning_rate": 4.998492136159817e-05, + "loss": 6.7743, + "step": 1860 + }, + { + "epoch": 0.011067894185935864, + "grad_norm": 2.423988103866577, + "learning_rate": 4.998490513655676e-05, + "loss": 6.9802, + "step": 1861 + }, + { + "epoch": 0.011073841469216862, + "grad_norm": 2.6415674686431885, + "learning_rate": 4.998488890279338e-05, + "loss": 6.7104, + "step": 1862 + }, + { + "epoch": 0.01107978875249786, + "grad_norm": 2.686969518661499, + "learning_rate": 4.998487266030804e-05, + "loss": 7.0539, + "step": 1863 + }, + { + "epoch": 0.011085736035778856, + "grad_norm": 2.6695480346679688, + "learning_rate": 4.998485640910072e-05, + "loss": 6.9812, + "step": 1864 + }, + { + "epoch": 0.011091683319059853, + "grad_norm": 2.6251392364501953, + "learning_rate": 4.9984840149171466e-05, + "loss": 6.9954, + "step": 1865 + }, + { + "epoch": 0.011097630602340851, + "grad_norm": 2.487593650817871, + "learning_rate": 4.998482388052025e-05, + "loss": 7.0847, + "step": 1866 + }, + { + "epoch": 0.011103577885621848, + "grad_norm": 2.3249282836914062, + "learning_rate": 4.998480760314709e-05, + "loss": 6.9936, + "step": 1867 + }, + { + "epoch": 0.011109525168902845, + "grad_norm": 2.170452833175659, + "learning_rate": 4.9984791317052e-05, + "loss": 6.9155, + "step": 1868 + }, + { + "epoch": 0.011115472452183842, + "grad_norm": 3.331779718399048, + "learning_rate": 4.9984775022234975e-05, + "loss": 6.9128, + "step": 1869 + }, + { + "epoch": 0.01112141973546484, + "grad_norm": 2.7665064334869385, + "learning_rate": 4.9984758718696026e-05, + "loss": 6.9002, + "step": 1870 + }, + { + "epoch": 0.011127367018745837, + "grad_norm": 2.2872116565704346, + "learning_rate": 4.998474240643515e-05, + "loss": 6.9058, + "step": 1871 + }, + { + "epoch": 0.011133314302026834, + "grad_norm": 2.2125210762023926, + "learning_rate": 4.998472608545236e-05, + "loss": 6.932, + "step": 1872 + }, + { + "epoch": 0.011139261585307831, + "grad_norm": 2.1135666370391846, + "learning_rate": 4.998470975574766e-05, + "loss": 7.0018, + "step": 1873 + }, + { + "epoch": 0.011145208868588828, + "grad_norm": 2.0649492740631104, + "learning_rate": 4.998469341732105e-05, + "loss": 7.0132, + "step": 1874 + }, + { + "epoch": 0.011151156151869826, + "grad_norm": 4.0558576583862305, + "learning_rate": 4.9984677070172546e-05, + "loss": 6.8826, + "step": 1875 + }, + { + "epoch": 0.011157103435150823, + "grad_norm": 2.5675904750823975, + "learning_rate": 4.998466071430216e-05, + "loss": 7.0314, + "step": 1876 + }, + { + "epoch": 0.01116305071843182, + "grad_norm": 2.9773342609405518, + "learning_rate": 4.998464434970987e-05, + "loss": 6.8608, + "step": 1877 + }, + { + "epoch": 0.011168998001712817, + "grad_norm": 2.804995059967041, + "learning_rate": 4.9984627976395705e-05, + "loss": 6.6857, + "step": 1878 + }, + { + "epoch": 0.011174945284993815, + "grad_norm": 3.758509874343872, + "learning_rate": 4.9984611594359664e-05, + "loss": 6.9995, + "step": 1879 + }, + { + "epoch": 0.011180892568274812, + "grad_norm": 2.583061933517456, + "learning_rate": 4.998459520360176e-05, + "loss": 6.5844, + "step": 1880 + }, + { + "epoch": 0.011186839851555809, + "grad_norm": 2.357642889022827, + "learning_rate": 4.998457880412198e-05, + "loss": 6.6435, + "step": 1881 + }, + { + "epoch": 0.011192787134836806, + "grad_norm": 2.181558609008789, + "learning_rate": 4.9984562395920356e-05, + "loss": 7.045, + "step": 1882 + }, + { + "epoch": 0.011198734418117804, + "grad_norm": 2.4768264293670654, + "learning_rate": 4.998454597899688e-05, + "loss": 7.2053, + "step": 1883 + }, + { + "epoch": 0.011204681701398801, + "grad_norm": 2.4422380924224854, + "learning_rate": 4.998452955335154e-05, + "loss": 6.8038, + "step": 1884 + }, + { + "epoch": 0.011210628984679798, + "grad_norm": 3.3173701763153076, + "learning_rate": 4.998451311898437e-05, + "loss": 6.8619, + "step": 1885 + }, + { + "epoch": 0.011216576267960795, + "grad_norm": 2.4492833614349365, + "learning_rate": 4.9984496675895366e-05, + "loss": 6.6681, + "step": 1886 + }, + { + "epoch": 0.011222523551241793, + "grad_norm": 3.065016031265259, + "learning_rate": 4.998448022408453e-05, + "loss": 6.7439, + "step": 1887 + }, + { + "epoch": 0.01122847083452279, + "grad_norm": 3.327730655670166, + "learning_rate": 4.998446376355187e-05, + "loss": 6.735, + "step": 1888 + }, + { + "epoch": 0.011234418117803787, + "grad_norm": 3.428292751312256, + "learning_rate": 4.998444729429739e-05, + "loss": 6.5277, + "step": 1889 + }, + { + "epoch": 0.011240365401084784, + "grad_norm": 2.4982972145080566, + "learning_rate": 4.9984430816321095e-05, + "loss": 6.8228, + "step": 1890 + }, + { + "epoch": 0.011246312684365782, + "grad_norm": 2.568232297897339, + "learning_rate": 4.9984414329623e-05, + "loss": 7.0772, + "step": 1891 + }, + { + "epoch": 0.01125225996764678, + "grad_norm": 2.534109115600586, + "learning_rate": 4.99843978342031e-05, + "loss": 7.0259, + "step": 1892 + }, + { + "epoch": 0.011258207250927776, + "grad_norm": 2.6394994258880615, + "learning_rate": 4.998438133006141e-05, + "loss": 6.8692, + "step": 1893 + }, + { + "epoch": 0.011264154534208773, + "grad_norm": 2.4049339294433594, + "learning_rate": 4.998436481719792e-05, + "loss": 6.8653, + "step": 1894 + }, + { + "epoch": 0.011270101817489771, + "grad_norm": 2.661191701889038, + "learning_rate": 4.998434829561266e-05, + "loss": 6.628, + "step": 1895 + }, + { + "epoch": 0.011276049100770768, + "grad_norm": 2.395829916000366, + "learning_rate": 4.998433176530561e-05, + "loss": 6.9876, + "step": 1896 + }, + { + "epoch": 0.011281996384051765, + "grad_norm": 2.547858715057373, + "learning_rate": 4.99843152262768e-05, + "loss": 7.3832, + "step": 1897 + }, + { + "epoch": 0.011287943667332762, + "grad_norm": 2.364246368408203, + "learning_rate": 4.998429867852621e-05, + "loss": 7.3771, + "step": 1898 + }, + { + "epoch": 0.01129389095061376, + "grad_norm": 2.3385260105133057, + "learning_rate": 4.998428212205387e-05, + "loss": 6.971, + "step": 1899 + }, + { + "epoch": 0.011299838233894757, + "grad_norm": 2.253760576248169, + "learning_rate": 4.998426555685977e-05, + "loss": 7.0588, + "step": 1900 + }, + { + "epoch": 0.011305785517175754, + "grad_norm": 2.4103500843048096, + "learning_rate": 4.998424898294392e-05, + "loss": 6.8731, + "step": 1901 + }, + { + "epoch": 0.011311732800456751, + "grad_norm": 2.4819014072418213, + "learning_rate": 4.998423240030633e-05, + "loss": 6.9502, + "step": 1902 + }, + { + "epoch": 0.011317680083737748, + "grad_norm": 2.503901243209839, + "learning_rate": 4.998421580894701e-05, + "loss": 7.017, + "step": 1903 + }, + { + "epoch": 0.011323627367018746, + "grad_norm": 2.2224137783050537, + "learning_rate": 4.9984199208865943e-05, + "loss": 7.1938, + "step": 1904 + }, + { + "epoch": 0.011329574650299743, + "grad_norm": 2.1291286945343018, + "learning_rate": 4.998418260006316e-05, + "loss": 7.1152, + "step": 1905 + }, + { + "epoch": 0.01133552193358074, + "grad_norm": 2.4611241817474365, + "learning_rate": 4.9984165982538655e-05, + "loss": 7.0316, + "step": 1906 + }, + { + "epoch": 0.011341469216861737, + "grad_norm": 2.329432487487793, + "learning_rate": 4.998414935629243e-05, + "loss": 7.0032, + "step": 1907 + }, + { + "epoch": 0.011347416500142735, + "grad_norm": 2.0618371963500977, + "learning_rate": 4.9984132721324505e-05, + "loss": 7.2566, + "step": 1908 + }, + { + "epoch": 0.011353363783423732, + "grad_norm": 2.063511371612549, + "learning_rate": 4.998411607763487e-05, + "loss": 7.0144, + "step": 1909 + }, + { + "epoch": 0.011359311066704729, + "grad_norm": 2.188871145248413, + "learning_rate": 4.998409942522355e-05, + "loss": 6.9652, + "step": 1910 + }, + { + "epoch": 0.011365258349985726, + "grad_norm": 2.499746322631836, + "learning_rate": 4.998408276409053e-05, + "loss": 6.9173, + "step": 1911 + }, + { + "epoch": 0.011371205633266724, + "grad_norm": 2.2809276580810547, + "learning_rate": 4.9984066094235826e-05, + "loss": 6.9202, + "step": 1912 + }, + { + "epoch": 0.011377152916547721, + "grad_norm": 1.7967042922973633, + "learning_rate": 4.998404941565944e-05, + "loss": 7.0652, + "step": 1913 + }, + { + "epoch": 0.011383100199828718, + "grad_norm": 2.339747667312622, + "learning_rate": 4.9984032728361384e-05, + "loss": 6.943, + "step": 1914 + }, + { + "epoch": 0.011389047483109715, + "grad_norm": 2.65795636177063, + "learning_rate": 4.998401603234166e-05, + "loss": 6.7197, + "step": 1915 + }, + { + "epoch": 0.011394994766390713, + "grad_norm": 2.181105852127075, + "learning_rate": 4.998399932760027e-05, + "loss": 6.7358, + "step": 1916 + }, + { + "epoch": 0.01140094204967171, + "grad_norm": 2.4130990505218506, + "learning_rate": 4.998398261413723e-05, + "loss": 6.8653, + "step": 1917 + }, + { + "epoch": 0.011406889332952707, + "grad_norm": 2.23822021484375, + "learning_rate": 4.998396589195254e-05, + "loss": 7.2125, + "step": 1918 + }, + { + "epoch": 0.011412836616233704, + "grad_norm": 2.176309823989868, + "learning_rate": 4.9983949161046207e-05, + "loss": 7.1077, + "step": 1919 + }, + { + "epoch": 0.011418783899514702, + "grad_norm": 2.2468202114105225, + "learning_rate": 4.9983932421418226e-05, + "loss": 7.1411, + "step": 1920 + }, + { + "epoch": 0.0114247311827957, + "grad_norm": 2.0748138427734375, + "learning_rate": 4.998391567306862e-05, + "loss": 7.0605, + "step": 1921 + }, + { + "epoch": 0.011430678466076696, + "grad_norm": 2.93007230758667, + "learning_rate": 4.998389891599738e-05, + "loss": 6.5832, + "step": 1922 + }, + { + "epoch": 0.011436625749357693, + "grad_norm": 2.125582218170166, + "learning_rate": 4.9983882150204534e-05, + "loss": 7.0761, + "step": 1923 + }, + { + "epoch": 0.011442573032638691, + "grad_norm": 2.3291571140289307, + "learning_rate": 4.998386537569005e-05, + "loss": 6.8781, + "step": 1924 + }, + { + "epoch": 0.011448520315919688, + "grad_norm": 2.8930649757385254, + "learning_rate": 4.9983848592453975e-05, + "loss": 7.1694, + "step": 1925 + }, + { + "epoch": 0.011454467599200685, + "grad_norm": 2.8450441360473633, + "learning_rate": 4.998383180049629e-05, + "loss": 7.1474, + "step": 1926 + }, + { + "epoch": 0.011460414882481682, + "grad_norm": 2.5900778770446777, + "learning_rate": 4.9983814999817016e-05, + "loss": 7.0423, + "step": 1927 + }, + { + "epoch": 0.01146636216576268, + "grad_norm": 2.289428949356079, + "learning_rate": 4.998379819041614e-05, + "loss": 6.9777, + "step": 1928 + }, + { + "epoch": 0.011472309449043677, + "grad_norm": 2.609384059906006, + "learning_rate": 4.998378137229368e-05, + "loss": 7.0488, + "step": 1929 + }, + { + "epoch": 0.011478256732324674, + "grad_norm": 2.1039459705352783, + "learning_rate": 4.998376454544964e-05, + "loss": 6.9308, + "step": 1930 + }, + { + "epoch": 0.01148420401560567, + "grad_norm": 2.1776134967803955, + "learning_rate": 4.9983747709884024e-05, + "loss": 6.9951, + "step": 1931 + }, + { + "epoch": 0.01149015129888667, + "grad_norm": 2.3150827884674072, + "learning_rate": 4.998373086559684e-05, + "loss": 6.9165, + "step": 1932 + }, + { + "epoch": 0.011496098582167666, + "grad_norm": 2.308370590209961, + "learning_rate": 4.99837140125881e-05, + "loss": 7.0155, + "step": 1933 + }, + { + "epoch": 0.011502045865448663, + "grad_norm": 2.234208106994629, + "learning_rate": 4.99836971508578e-05, + "loss": 6.9901, + "step": 1934 + }, + { + "epoch": 0.01150799314872966, + "grad_norm": 2.2340307235717773, + "learning_rate": 4.9983680280405953e-05, + "loss": 7.004, + "step": 1935 + }, + { + "epoch": 0.011513940432010657, + "grad_norm": 2.9458208084106445, + "learning_rate": 4.998366340123256e-05, + "loss": 7.3797, + "step": 1936 + }, + { + "epoch": 0.011519887715291655, + "grad_norm": 2.8516271114349365, + "learning_rate": 4.998364651333762e-05, + "loss": 7.3503, + "step": 1937 + }, + { + "epoch": 0.011525834998572652, + "grad_norm": 1.974025845527649, + "learning_rate": 4.998362961672116e-05, + "loss": 7.21, + "step": 1938 + }, + { + "epoch": 0.011531782281853649, + "grad_norm": 2.110117197036743, + "learning_rate": 4.998361271138317e-05, + "loss": 6.9494, + "step": 1939 + }, + { + "epoch": 0.011537729565134646, + "grad_norm": 2.2003207206726074, + "learning_rate": 4.9983595797323646e-05, + "loss": 6.8858, + "step": 1940 + }, + { + "epoch": 0.011543676848415644, + "grad_norm": 2.200982093811035, + "learning_rate": 4.998357887454262e-05, + "loss": 6.9512, + "step": 1941 + }, + { + "epoch": 0.011549624131696641, + "grad_norm": 2.303903102874756, + "learning_rate": 4.998356194304008e-05, + "loss": 7.2823, + "step": 1942 + }, + { + "epoch": 0.011555571414977638, + "grad_norm": 2.1376724243164062, + "learning_rate": 4.9983545002816035e-05, + "loss": 7.0321, + "step": 1943 + }, + { + "epoch": 0.011561518698258635, + "grad_norm": 2.3128151893615723, + "learning_rate": 4.99835280538705e-05, + "loss": 6.9714, + "step": 1944 + }, + { + "epoch": 0.011567465981539633, + "grad_norm": 2.359212636947632, + "learning_rate": 4.9983511096203465e-05, + "loss": 7.0496, + "step": 1945 + }, + { + "epoch": 0.01157341326482063, + "grad_norm": 2.346946954727173, + "learning_rate": 4.9983494129814945e-05, + "loss": 6.9865, + "step": 1946 + }, + { + "epoch": 0.011579360548101627, + "grad_norm": 2.447598934173584, + "learning_rate": 4.998347715470495e-05, + "loss": 6.9609, + "step": 1947 + }, + { + "epoch": 0.011585307831382624, + "grad_norm": 2.355300188064575, + "learning_rate": 4.998346017087348e-05, + "loss": 7.03, + "step": 1948 + }, + { + "epoch": 0.011591255114663622, + "grad_norm": 2.3207437992095947, + "learning_rate": 4.9983443178320545e-05, + "loss": 6.8181, + "step": 1949 + }, + { + "epoch": 0.011597202397944619, + "grad_norm": 2.359839677810669, + "learning_rate": 4.998342617704615e-05, + "loss": 6.8828, + "step": 1950 + }, + { + "epoch": 0.011603149681225616, + "grad_norm": 2.264890432357788, + "learning_rate": 4.9983409167050284e-05, + "loss": 7.3467, + "step": 1951 + }, + { + "epoch": 0.011609096964506613, + "grad_norm": 2.2720789909362793, + "learning_rate": 4.998339214833298e-05, + "loss": 7.3912, + "step": 1952 + }, + { + "epoch": 0.011615044247787611, + "grad_norm": 2.414433240890503, + "learning_rate": 4.9983375120894226e-05, + "loss": 7.1505, + "step": 1953 + }, + { + "epoch": 0.011620991531068608, + "grad_norm": 2.095290422439575, + "learning_rate": 4.998335808473404e-05, + "loss": 7.1642, + "step": 1954 + }, + { + "epoch": 0.011626938814349605, + "grad_norm": 2.118901252746582, + "learning_rate": 4.998334103985242e-05, + "loss": 7.0528, + "step": 1955 + }, + { + "epoch": 0.011632886097630602, + "grad_norm": 2.4361472129821777, + "learning_rate": 4.998332398624937e-05, + "loss": 7.3064, + "step": 1956 + }, + { + "epoch": 0.0116388333809116, + "grad_norm": 2.0978667736053467, + "learning_rate": 4.99833069239249e-05, + "loss": 7.0041, + "step": 1957 + }, + { + "epoch": 0.011644780664192597, + "grad_norm": 3.156329393386841, + "learning_rate": 4.998328985287902e-05, + "loss": 6.9169, + "step": 1958 + }, + { + "epoch": 0.011650727947473594, + "grad_norm": 2.311004400253296, + "learning_rate": 4.9983272773111735e-05, + "loss": 7.1128, + "step": 1959 + }, + { + "epoch": 0.01165667523075459, + "grad_norm": 2.406993865966797, + "learning_rate": 4.9983255684623036e-05, + "loss": 7.1403, + "step": 1960 + }, + { + "epoch": 0.01166262251403559, + "grad_norm": 2.0262861251831055, + "learning_rate": 4.998323858741295e-05, + "loss": 7.1014, + "step": 1961 + }, + { + "epoch": 0.011668569797316586, + "grad_norm": 2.369420051574707, + "learning_rate": 4.998322148148147e-05, + "loss": 7.1422, + "step": 1962 + }, + { + "epoch": 0.011674517080597583, + "grad_norm": 2.156019687652588, + "learning_rate": 4.998320436682861e-05, + "loss": 6.8405, + "step": 1963 + }, + { + "epoch": 0.01168046436387858, + "grad_norm": 2.35737681388855, + "learning_rate": 4.998318724345436e-05, + "loss": 6.8004, + "step": 1964 + }, + { + "epoch": 0.011686411647159577, + "grad_norm": 2.443676233291626, + "learning_rate": 4.998317011135875e-05, + "loss": 7.1959, + "step": 1965 + }, + { + "epoch": 0.011692358930440575, + "grad_norm": 2.1023004055023193, + "learning_rate": 4.998315297054177e-05, + "loss": 7.0684, + "step": 1966 + }, + { + "epoch": 0.011698306213721572, + "grad_norm": 2.5166187286376953, + "learning_rate": 4.998313582100342e-05, + "loss": 6.5876, + "step": 1967 + }, + { + "epoch": 0.011704253497002569, + "grad_norm": 2.1868557929992676, + "learning_rate": 4.9983118662743726e-05, + "loss": 6.6097, + "step": 1968 + }, + { + "epoch": 0.011710200780283566, + "grad_norm": 2.196786880493164, + "learning_rate": 4.998310149576269e-05, + "loss": 6.9798, + "step": 1969 + }, + { + "epoch": 0.011716148063564564, + "grad_norm": 2.361915111541748, + "learning_rate": 4.998308432006029e-05, + "loss": 6.8441, + "step": 1970 + }, + { + "epoch": 0.011722095346845561, + "grad_norm": 2.3234047889709473, + "learning_rate": 4.998306713563657e-05, + "loss": 6.9481, + "step": 1971 + }, + { + "epoch": 0.011728042630126558, + "grad_norm": 2.4995763301849365, + "learning_rate": 4.9983049942491514e-05, + "loss": 6.9903, + "step": 1972 + }, + { + "epoch": 0.011733989913407555, + "grad_norm": 2.21274995803833, + "learning_rate": 4.998303274062514e-05, + "loss": 7.1484, + "step": 1973 + }, + { + "epoch": 0.011739937196688553, + "grad_norm": 2.4777519702911377, + "learning_rate": 4.998301553003743e-05, + "loss": 7.144, + "step": 1974 + }, + { + "epoch": 0.01174588447996955, + "grad_norm": 2.089796304702759, + "learning_rate": 4.9982998310728426e-05, + "loss": 6.6765, + "step": 1975 + }, + { + "epoch": 0.011751831763250547, + "grad_norm": 3.012753963470459, + "learning_rate": 4.998298108269811e-05, + "loss": 6.8501, + "step": 1976 + }, + { + "epoch": 0.011757779046531544, + "grad_norm": 2.5427911281585693, + "learning_rate": 4.9982963845946486e-05, + "loss": 7.0171, + "step": 1977 + }, + { + "epoch": 0.011763726329812542, + "grad_norm": 2.8591670989990234, + "learning_rate": 4.998294660047358e-05, + "loss": 6.9881, + "step": 1978 + }, + { + "epoch": 0.011769673613093539, + "grad_norm": 2.952085256576538, + "learning_rate": 4.998292934627937e-05, + "loss": 6.9459, + "step": 1979 + }, + { + "epoch": 0.011775620896374536, + "grad_norm": 2.451958656311035, + "learning_rate": 4.998291208336388e-05, + "loss": 6.9515, + "step": 1980 + }, + { + "epoch": 0.011781568179655533, + "grad_norm": 2.448319435119629, + "learning_rate": 4.998289481172713e-05, + "loss": 6.8618, + "step": 1981 + }, + { + "epoch": 0.011787515462936531, + "grad_norm": 3.1797080039978027, + "learning_rate": 4.99828775313691e-05, + "loss": 6.7528, + "step": 1982 + }, + { + "epoch": 0.011793462746217528, + "grad_norm": 2.841120719909668, + "learning_rate": 4.99828602422898e-05, + "loss": 6.8, + "step": 1983 + }, + { + "epoch": 0.011799410029498525, + "grad_norm": 3.128098726272583, + "learning_rate": 4.998284294448925e-05, + "loss": 6.7574, + "step": 1984 + }, + { + "epoch": 0.011805357312779522, + "grad_norm": 2.7724568843841553, + "learning_rate": 4.998282563796744e-05, + "loss": 6.6119, + "step": 1985 + }, + { + "epoch": 0.01181130459606052, + "grad_norm": 2.8025269508361816, + "learning_rate": 4.998280832272439e-05, + "loss": 6.4676, + "step": 1986 + }, + { + "epoch": 0.011817251879341517, + "grad_norm": 2.5756618976593018, + "learning_rate": 4.99827909987601e-05, + "loss": 6.5421, + "step": 1987 + }, + { + "epoch": 0.011823199162622514, + "grad_norm": 2.9116249084472656, + "learning_rate": 4.998277366607457e-05, + "loss": 6.5446, + "step": 1988 + }, + { + "epoch": 0.01182914644590351, + "grad_norm": 2.571019411087036, + "learning_rate": 4.9982756324667815e-05, + "loss": 6.7898, + "step": 1989 + }, + { + "epoch": 0.01183509372918451, + "grad_norm": 2.818885326385498, + "learning_rate": 4.998273897453984e-05, + "loss": 6.6604, + "step": 1990 + }, + { + "epoch": 0.011841041012465506, + "grad_norm": 2.8561007976531982, + "learning_rate": 4.998272161569064e-05, + "loss": 6.5473, + "step": 1991 + }, + { + "epoch": 0.011846988295746503, + "grad_norm": 2.5539605617523193, + "learning_rate": 4.998270424812024e-05, + "loss": 6.5492, + "step": 1992 + }, + { + "epoch": 0.0118529355790275, + "grad_norm": 2.3242900371551514, + "learning_rate": 4.998268687182863e-05, + "loss": 6.4577, + "step": 1993 + }, + { + "epoch": 0.011858882862308498, + "grad_norm": 2.874807596206665, + "learning_rate": 4.998266948681582e-05, + "loss": 6.6071, + "step": 1994 + }, + { + "epoch": 0.011864830145589495, + "grad_norm": 2.9014296531677246, + "learning_rate": 4.9982652093081827e-05, + "loss": 7.2221, + "step": 1995 + }, + { + "epoch": 0.011870777428870492, + "grad_norm": 2.5874252319335938, + "learning_rate": 4.998263469062665e-05, + "loss": 6.593, + "step": 1996 + }, + { + "epoch": 0.011876724712151489, + "grad_norm": 2.4252052307128906, + "learning_rate": 4.998261727945028e-05, + "loss": 7.0138, + "step": 1997 + }, + { + "epoch": 0.011882671995432486, + "grad_norm": 2.3569211959838867, + "learning_rate": 4.998259985955275e-05, + "loss": 6.8743, + "step": 1998 + }, + { + "epoch": 0.011888619278713484, + "grad_norm": 2.560659408569336, + "learning_rate": 4.9982582430934045e-05, + "loss": 6.8926, + "step": 1999 + }, + { + "epoch": 0.011894566561994481, + "grad_norm": 2.0855636596679688, + "learning_rate": 4.9982564993594184e-05, + "loss": 7.1691, + "step": 2000 + }, + { + "epoch": 0.011900513845275478, + "grad_norm": 2.024829387664795, + "learning_rate": 4.998254754753316e-05, + "loss": 7.1797, + "step": 2001 + }, + { + "epoch": 0.011906461128556475, + "grad_norm": 2.093733549118042, + "learning_rate": 4.998253009275099e-05, + "loss": 6.9706, + "step": 2002 + }, + { + "epoch": 0.011912408411837473, + "grad_norm": 1.9211688041687012, + "learning_rate": 4.998251262924768e-05, + "loss": 7.018, + "step": 2003 + }, + { + "epoch": 0.01191835569511847, + "grad_norm": 2.3146321773529053, + "learning_rate": 4.998249515702323e-05, + "loss": 6.9384, + "step": 2004 + }, + { + "epoch": 0.011924302978399467, + "grad_norm": 2.346309185028076, + "learning_rate": 4.998247767607765e-05, + "loss": 6.5674, + "step": 2005 + }, + { + "epoch": 0.011930250261680464, + "grad_norm": 2.39471697807312, + "learning_rate": 4.998246018641094e-05, + "loss": 6.769, + "step": 2006 + }, + { + "epoch": 0.011936197544961462, + "grad_norm": 2.1689298152923584, + "learning_rate": 4.998244268802312e-05, + "loss": 7.0945, + "step": 2007 + }, + { + "epoch": 0.011942144828242459, + "grad_norm": 2.4209859371185303, + "learning_rate": 4.998242518091418e-05, + "loss": 6.98, + "step": 2008 + }, + { + "epoch": 0.011948092111523456, + "grad_norm": 2.6378684043884277, + "learning_rate": 4.998240766508414e-05, + "loss": 6.6833, + "step": 2009 + }, + { + "epoch": 0.011954039394804453, + "grad_norm": 2.2804839611053467, + "learning_rate": 4.9982390140532995e-05, + "loss": 6.7129, + "step": 2010 + }, + { + "epoch": 0.011959986678085451, + "grad_norm": 2.1788251399993896, + "learning_rate": 4.998237260726075e-05, + "loss": 7.0175, + "step": 2011 + }, + { + "epoch": 0.011965933961366448, + "grad_norm": 1.8988546133041382, + "learning_rate": 4.998235506526743e-05, + "loss": 7.0857, + "step": 2012 + }, + { + "epoch": 0.011971881244647445, + "grad_norm": 2.560107469558716, + "learning_rate": 4.9982337514553026e-05, + "loss": 7.0771, + "step": 2013 + }, + { + "epoch": 0.011977828527928442, + "grad_norm": 2.1771798133850098, + "learning_rate": 4.998231995511754e-05, + "loss": 7.071, + "step": 2014 + }, + { + "epoch": 0.01198377581120944, + "grad_norm": 1.9619860649108887, + "learning_rate": 4.998230238696098e-05, + "loss": 6.9109, + "step": 2015 + }, + { + "epoch": 0.011989723094490437, + "grad_norm": 2.16719126701355, + "learning_rate": 4.998228481008337e-05, + "loss": 6.903, + "step": 2016 + }, + { + "epoch": 0.011995670377771434, + "grad_norm": 2.4643077850341797, + "learning_rate": 4.998226722448469e-05, + "loss": 6.5301, + "step": 2017 + }, + { + "epoch": 0.01200161766105243, + "grad_norm": 2.5153393745422363, + "learning_rate": 4.9982249630164965e-05, + "loss": 7.107, + "step": 2018 + }, + { + "epoch": 0.01200756494433343, + "grad_norm": 2.6180920600891113, + "learning_rate": 4.998223202712419e-05, + "loss": 6.9905, + "step": 2019 + }, + { + "epoch": 0.012013512227614426, + "grad_norm": 2.333186149597168, + "learning_rate": 4.998221441536238e-05, + "loss": 7.074, + "step": 2020 + }, + { + "epoch": 0.012019459510895423, + "grad_norm": 2.138176918029785, + "learning_rate": 4.998219679487953e-05, + "loss": 7.0211, + "step": 2021 + }, + { + "epoch": 0.01202540679417642, + "grad_norm": 2.9845499992370605, + "learning_rate": 4.998217916567567e-05, + "loss": 6.7341, + "step": 2022 + }, + { + "epoch": 0.012031354077457418, + "grad_norm": 3.1216208934783936, + "learning_rate": 4.998216152775077e-05, + "loss": 7.1569, + "step": 2023 + }, + { + "epoch": 0.012037301360738415, + "grad_norm": 2.4693727493286133, + "learning_rate": 4.998214388110487e-05, + "loss": 6.6427, + "step": 2024 + }, + { + "epoch": 0.012043248644019412, + "grad_norm": 2.784562349319458, + "learning_rate": 4.9982126225737955e-05, + "loss": 6.6898, + "step": 2025 + }, + { + "epoch": 0.012049195927300409, + "grad_norm": 3.0549166202545166, + "learning_rate": 4.9982108561650036e-05, + "loss": 6.6004, + "step": 2026 + }, + { + "epoch": 0.012055143210581406, + "grad_norm": 2.565505266189575, + "learning_rate": 4.998209088884113e-05, + "loss": 6.5981, + "step": 2027 + }, + { + "epoch": 0.012061090493862404, + "grad_norm": 2.862548828125, + "learning_rate": 4.998207320731122e-05, + "loss": 6.4329, + "step": 2028 + }, + { + "epoch": 0.012067037777143401, + "grad_norm": 2.835280179977417, + "learning_rate": 4.998205551706033e-05, + "loss": 6.6854, + "step": 2029 + }, + { + "epoch": 0.012072985060424398, + "grad_norm": 2.4550364017486572, + "learning_rate": 4.9982037818088474e-05, + "loss": 6.7115, + "step": 2030 + }, + { + "epoch": 0.012078932343705395, + "grad_norm": 2.9977426528930664, + "learning_rate": 4.998202011039564e-05, + "loss": 6.341, + "step": 2031 + }, + { + "epoch": 0.012084879626986393, + "grad_norm": 2.258370876312256, + "learning_rate": 4.998200239398184e-05, + "loss": 6.7094, + "step": 2032 + }, + { + "epoch": 0.01209082691026739, + "grad_norm": 2.4484050273895264, + "learning_rate": 4.9981984668847085e-05, + "loss": 7.1115, + "step": 2033 + }, + { + "epoch": 0.012096774193548387, + "grad_norm": 2.4668514728546143, + "learning_rate": 4.9981966934991366e-05, + "loss": 6.9411, + "step": 2034 + }, + { + "epoch": 0.012102721476829384, + "grad_norm": 2.218479871749878, + "learning_rate": 4.998194919241471e-05, + "loss": 6.7175, + "step": 2035 + }, + { + "epoch": 0.012108668760110382, + "grad_norm": 2.201815366744995, + "learning_rate": 4.9981931441117115e-05, + "loss": 6.8684, + "step": 2036 + }, + { + "epoch": 0.012114616043391379, + "grad_norm": 2.4610331058502197, + "learning_rate": 4.998191368109858e-05, + "loss": 6.7214, + "step": 2037 + }, + { + "epoch": 0.012120563326672376, + "grad_norm": 2.7274906635284424, + "learning_rate": 4.998189591235912e-05, + "loss": 6.7611, + "step": 2038 + }, + { + "epoch": 0.012126510609953373, + "grad_norm": 2.7716658115386963, + "learning_rate": 4.9981878134898735e-05, + "loss": 6.7679, + "step": 2039 + }, + { + "epoch": 0.012132457893234371, + "grad_norm": 3.3206236362457275, + "learning_rate": 4.9981860348717434e-05, + "loss": 6.6283, + "step": 2040 + }, + { + "epoch": 0.012138405176515368, + "grad_norm": 2.511906862258911, + "learning_rate": 4.9981842553815225e-05, + "loss": 6.9537, + "step": 2041 + }, + { + "epoch": 0.012144352459796365, + "grad_norm": 2.7797024250030518, + "learning_rate": 4.998182475019212e-05, + "loss": 7.0488, + "step": 2042 + }, + { + "epoch": 0.012150299743077362, + "grad_norm": 3.523092031478882, + "learning_rate": 4.998180693784811e-05, + "loss": 6.9249, + "step": 2043 + }, + { + "epoch": 0.01215624702635836, + "grad_norm": 3.1001851558685303, + "learning_rate": 4.998178911678322e-05, + "loss": 7.0998, + "step": 2044 + }, + { + "epoch": 0.012162194309639357, + "grad_norm": 2.5291028022766113, + "learning_rate": 4.998177128699743e-05, + "loss": 6.8381, + "step": 2045 + }, + { + "epoch": 0.012168141592920354, + "grad_norm": 3.308398723602295, + "learning_rate": 4.998175344849077e-05, + "loss": 6.6849, + "step": 2046 + }, + { + "epoch": 0.01217408887620135, + "grad_norm": 3.4255475997924805, + "learning_rate": 4.998173560126323e-05, + "loss": 6.7816, + "step": 2047 + }, + { + "epoch": 0.01218003615948235, + "grad_norm": 3.4510817527770996, + "learning_rate": 4.998171774531484e-05, + "loss": 6.7961, + "step": 2048 + }, + { + "epoch": 0.012185983442763346, + "grad_norm": 3.15468168258667, + "learning_rate": 4.998169988064558e-05, + "loss": 6.9409, + "step": 2049 + }, + { + "epoch": 0.012191930726044343, + "grad_norm": 2.5568132400512695, + "learning_rate": 4.998168200725547e-05, + "loss": 6.8573, + "step": 2050 + }, + { + "epoch": 0.01219787800932534, + "grad_norm": 1.9745045900344849, + "learning_rate": 4.9981664125144515e-05, + "loss": 6.7948, + "step": 2051 + }, + { + "epoch": 0.012203825292606338, + "grad_norm": 2.2304463386535645, + "learning_rate": 4.9981646234312714e-05, + "loss": 6.6896, + "step": 2052 + }, + { + "epoch": 0.012209772575887335, + "grad_norm": 2.4391567707061768, + "learning_rate": 4.998162833476008e-05, + "loss": 6.7129, + "step": 2053 + }, + { + "epoch": 0.012215719859168332, + "grad_norm": 3.243905544281006, + "learning_rate": 4.9981610426486615e-05, + "loss": 7.0744, + "step": 2054 + }, + { + "epoch": 0.012221667142449329, + "grad_norm": 3.2596933841705322, + "learning_rate": 4.998159250949233e-05, + "loss": 6.9361, + "step": 2055 + }, + { + "epoch": 0.012227614425730327, + "grad_norm": 2.554436445236206, + "learning_rate": 4.998157458377723e-05, + "loss": 6.9354, + "step": 2056 + }, + { + "epoch": 0.012233561709011324, + "grad_norm": 2.3636975288391113, + "learning_rate": 4.998155664934132e-05, + "loss": 6.849, + "step": 2057 + }, + { + "epoch": 0.01223950899229232, + "grad_norm": 2.224684953689575, + "learning_rate": 4.99815387061846e-05, + "loss": 6.7011, + "step": 2058 + }, + { + "epoch": 0.012245456275573318, + "grad_norm": 2.6892964839935303, + "learning_rate": 4.9981520754307096e-05, + "loss": 6.753, + "step": 2059 + }, + { + "epoch": 0.012251403558854315, + "grad_norm": 2.7645084857940674, + "learning_rate": 4.9981502793708796e-05, + "loss": 6.5437, + "step": 2060 + }, + { + "epoch": 0.012257350842135313, + "grad_norm": 2.1315746307373047, + "learning_rate": 4.9981484824389716e-05, + "loss": 6.8843, + "step": 2061 + }, + { + "epoch": 0.01226329812541631, + "grad_norm": 2.6275408267974854, + "learning_rate": 4.998146684634984e-05, + "loss": 6.7275, + "step": 2062 + }, + { + "epoch": 0.012269245408697307, + "grad_norm": 2.530688762664795, + "learning_rate": 4.998144885958921e-05, + "loss": 6.6089, + "step": 2063 + }, + { + "epoch": 0.012275192691978304, + "grad_norm": 2.0959835052490234, + "learning_rate": 4.998143086410781e-05, + "loss": 6.7425, + "step": 2064 + }, + { + "epoch": 0.012281139975259302, + "grad_norm": 2.887242078781128, + "learning_rate": 4.998141285990565e-05, + "loss": 6.6867, + "step": 2065 + }, + { + "epoch": 0.012287087258540299, + "grad_norm": 2.430122137069702, + "learning_rate": 4.9981394846982734e-05, + "loss": 6.6636, + "step": 2066 + }, + { + "epoch": 0.012293034541821296, + "grad_norm": 2.269162654876709, + "learning_rate": 4.998137682533907e-05, + "loss": 7.1165, + "step": 2067 + }, + { + "epoch": 0.012298981825102293, + "grad_norm": 2.6741089820861816, + "learning_rate": 4.998135879497467e-05, + "loss": 6.6678, + "step": 2068 + }, + { + "epoch": 0.012304929108383291, + "grad_norm": 2.3362507820129395, + "learning_rate": 4.998134075588953e-05, + "loss": 7.0103, + "step": 2069 + }, + { + "epoch": 0.012310876391664288, + "grad_norm": 2.310638189315796, + "learning_rate": 4.9981322708083666e-05, + "loss": 6.9235, + "step": 2070 + }, + { + "epoch": 0.012316823674945285, + "grad_norm": 2.161853790283203, + "learning_rate": 4.998130465155708e-05, + "loss": 6.9392, + "step": 2071 + }, + { + "epoch": 0.012322770958226282, + "grad_norm": 2.2609059810638428, + "learning_rate": 4.9981286586309786e-05, + "loss": 6.888, + "step": 2072 + }, + { + "epoch": 0.01232871824150728, + "grad_norm": 2.6072967052459717, + "learning_rate": 4.998126851234177e-05, + "loss": 6.7739, + "step": 2073 + }, + { + "epoch": 0.012334665524788277, + "grad_norm": 3.092834711074829, + "learning_rate": 4.9981250429653056e-05, + "loss": 6.5529, + "step": 2074 + }, + { + "epoch": 0.012340612808069274, + "grad_norm": 2.303149461746216, + "learning_rate": 4.998123233824366e-05, + "loss": 6.618, + "step": 2075 + }, + { + "epoch": 0.01234656009135027, + "grad_norm": 2.888063907623291, + "learning_rate": 4.998121423811355e-05, + "loss": 6.9224, + "step": 2076 + }, + { + "epoch": 0.012352507374631269, + "grad_norm": 2.990727424621582, + "learning_rate": 4.998119612926277e-05, + "loss": 6.94, + "step": 2077 + }, + { + "epoch": 0.012358454657912266, + "grad_norm": 3.016002893447876, + "learning_rate": 4.998117801169131e-05, + "loss": 6.6231, + "step": 2078 + }, + { + "epoch": 0.012364401941193263, + "grad_norm": 2.057124614715576, + "learning_rate": 4.998115988539918e-05, + "loss": 6.803, + "step": 2079 + }, + { + "epoch": 0.01237034922447426, + "grad_norm": 2.371136426925659, + "learning_rate": 4.998114175038639e-05, + "loss": 6.8244, + "step": 2080 + }, + { + "epoch": 0.012376296507755258, + "grad_norm": 2.804365873336792, + "learning_rate": 4.998112360665292e-05, + "loss": 6.8787, + "step": 2081 + }, + { + "epoch": 0.012382243791036255, + "grad_norm": 3.4987633228302, + "learning_rate": 4.998110545419882e-05, + "loss": 6.6946, + "step": 2082 + }, + { + "epoch": 0.012388191074317252, + "grad_norm": 2.950608968734741, + "learning_rate": 4.998108729302407e-05, + "loss": 6.7915, + "step": 2083 + }, + { + "epoch": 0.012394138357598249, + "grad_norm": 2.4327776432037354, + "learning_rate": 4.998106912312868e-05, + "loss": 6.727, + "step": 2084 + }, + { + "epoch": 0.012400085640879247, + "grad_norm": 2.46014142036438, + "learning_rate": 4.998105094451265e-05, + "loss": 6.6797, + "step": 2085 + }, + { + "epoch": 0.012406032924160244, + "grad_norm": 2.947566270828247, + "learning_rate": 4.9981032757175995e-05, + "loss": 6.6401, + "step": 2086 + }, + { + "epoch": 0.01241198020744124, + "grad_norm": 2.5999064445495605, + "learning_rate": 4.9981014561118724e-05, + "loss": 6.58, + "step": 2087 + }, + { + "epoch": 0.012417927490722238, + "grad_norm": 2.9761807918548584, + "learning_rate": 4.9980996356340836e-05, + "loss": 6.8538, + "step": 2088 + }, + { + "epoch": 0.012423874774003236, + "grad_norm": 2.690925121307373, + "learning_rate": 4.9980978142842336e-05, + "loss": 6.9087, + "step": 2089 + }, + { + "epoch": 0.012429822057284233, + "grad_norm": 2.218524217605591, + "learning_rate": 4.998095992062325e-05, + "loss": 6.7221, + "step": 2090 + }, + { + "epoch": 0.01243576934056523, + "grad_norm": 2.630094051361084, + "learning_rate": 4.998094168968355e-05, + "loss": 6.7346, + "step": 2091 + }, + { + "epoch": 0.012441716623846227, + "grad_norm": 2.7839179039001465, + "learning_rate": 4.9980923450023276e-05, + "loss": 6.8668, + "step": 2092 + }, + { + "epoch": 0.012447663907127223, + "grad_norm": 2.422914743423462, + "learning_rate": 4.9980905201642415e-05, + "loss": 6.7953, + "step": 2093 + }, + { + "epoch": 0.012453611190408222, + "grad_norm": 2.525883674621582, + "learning_rate": 4.998088694454097e-05, + "loss": 6.6322, + "step": 2094 + }, + { + "epoch": 0.012459558473689219, + "grad_norm": 2.515536308288574, + "learning_rate": 4.998086867871896e-05, + "loss": 7.4297, + "step": 2095 + }, + { + "epoch": 0.012465505756970216, + "grad_norm": 2.689542055130005, + "learning_rate": 4.998085040417639e-05, + "loss": 7.4316, + "step": 2096 + }, + { + "epoch": 0.012471453040251212, + "grad_norm": 2.4374492168426514, + "learning_rate": 4.998083212091327e-05, + "loss": 6.8035, + "step": 2097 + }, + { + "epoch": 0.012477400323532211, + "grad_norm": 2.284153699874878, + "learning_rate": 4.998081382892959e-05, + "loss": 6.6644, + "step": 2098 + }, + { + "epoch": 0.012483347606813208, + "grad_norm": 2.113539218902588, + "learning_rate": 4.9980795528225366e-05, + "loss": 6.5201, + "step": 2099 + }, + { + "epoch": 0.012489294890094205, + "grad_norm": 2.2590157985687256, + "learning_rate": 4.998077721880061e-05, + "loss": 6.8074, + "step": 2100 + }, + { + "epoch": 0.012495242173375202, + "grad_norm": 2.077986717224121, + "learning_rate": 4.9980758900655316e-05, + "loss": 6.6986, + "step": 2101 + }, + { + "epoch": 0.0125011894566562, + "grad_norm": 2.495882987976074, + "learning_rate": 4.99807405737895e-05, + "loss": 6.6949, + "step": 2102 + }, + { + "epoch": 0.012507136739937197, + "grad_norm": 2.224621295928955, + "learning_rate": 4.998072223820317e-05, + "loss": 6.5723, + "step": 2103 + }, + { + "epoch": 0.012513084023218194, + "grad_norm": 2.515867233276367, + "learning_rate": 4.998070389389632e-05, + "loss": 6.4327, + "step": 2104 + }, + { + "epoch": 0.01251903130649919, + "grad_norm": 2.3134326934814453, + "learning_rate": 4.998068554086897e-05, + "loss": 6.2818, + "step": 2105 + }, + { + "epoch": 0.012524978589780189, + "grad_norm": 2.7688093185424805, + "learning_rate": 4.998066717912112e-05, + "loss": 6.4585, + "step": 2106 + }, + { + "epoch": 0.012530925873061186, + "grad_norm": 3.211790084838867, + "learning_rate": 4.998064880865277e-05, + "loss": 6.5227, + "step": 2107 + }, + { + "epoch": 0.012536873156342183, + "grad_norm": 2.9701578617095947, + "learning_rate": 4.998063042946395e-05, + "loss": 6.5674, + "step": 2108 + }, + { + "epoch": 0.01254282043962318, + "grad_norm": 2.1295664310455322, + "learning_rate": 4.998061204155463e-05, + "loss": 6.5697, + "step": 2109 + }, + { + "epoch": 0.012548767722904178, + "grad_norm": 2.841683864593506, + "learning_rate": 4.998059364492485e-05, + "loss": 6.453, + "step": 2110 + }, + { + "epoch": 0.012554715006185175, + "grad_norm": 2.481001615524292, + "learning_rate": 4.99805752395746e-05, + "loss": 6.555, + "step": 2111 + }, + { + "epoch": 0.012560662289466172, + "grad_norm": 2.357745885848999, + "learning_rate": 4.998055682550389e-05, + "loss": 6.7916, + "step": 2112 + }, + { + "epoch": 0.012566609572747169, + "grad_norm": 2.349417209625244, + "learning_rate": 4.9980538402712725e-05, + "loss": 6.7257, + "step": 2113 + }, + { + "epoch": 0.012572556856028167, + "grad_norm": 2.846930742263794, + "learning_rate": 4.998051997120111e-05, + "loss": 6.7095, + "step": 2114 + }, + { + "epoch": 0.012578504139309164, + "grad_norm": 2.362506628036499, + "learning_rate": 4.998050153096906e-05, + "loss": 6.675, + "step": 2115 + }, + { + "epoch": 0.01258445142259016, + "grad_norm": 2.3275344371795654, + "learning_rate": 4.998048308201656e-05, + "loss": 6.9031, + "step": 2116 + }, + { + "epoch": 0.012590398705871158, + "grad_norm": 2.194359540939331, + "learning_rate": 4.9980464624343644e-05, + "loss": 6.8258, + "step": 2117 + }, + { + "epoch": 0.012596345989152156, + "grad_norm": 2.3926312923431396, + "learning_rate": 4.99804461579503e-05, + "loss": 6.7136, + "step": 2118 + }, + { + "epoch": 0.012602293272433153, + "grad_norm": 2.7430222034454346, + "learning_rate": 4.9980427682836546e-05, + "loss": 6.5475, + "step": 2119 + }, + { + "epoch": 0.01260824055571415, + "grad_norm": 2.1563844680786133, + "learning_rate": 4.998040919900237e-05, + "loss": 6.7105, + "step": 2120 + }, + { + "epoch": 0.012614187838995147, + "grad_norm": 2.1061437129974365, + "learning_rate": 4.998039070644781e-05, + "loss": 6.6411, + "step": 2121 + }, + { + "epoch": 0.012620135122276143, + "grad_norm": 2.6192378997802734, + "learning_rate": 4.9980372205172844e-05, + "loss": 6.6831, + "step": 2122 + }, + { + "epoch": 0.012626082405557142, + "grad_norm": 2.794616222381592, + "learning_rate": 4.9980353695177495e-05, + "loss": 6.8128, + "step": 2123 + }, + { + "epoch": 0.012632029688838139, + "grad_norm": 2.3656489849090576, + "learning_rate": 4.998033517646176e-05, + "loss": 6.8109, + "step": 2124 + }, + { + "epoch": 0.012637976972119136, + "grad_norm": 2.658433437347412, + "learning_rate": 4.998031664902564e-05, + "loss": 6.7979, + "step": 2125 + }, + { + "epoch": 0.012643924255400132, + "grad_norm": 2.889954090118408, + "learning_rate": 4.9980298112869154e-05, + "loss": 6.6745, + "step": 2126 + }, + { + "epoch": 0.012649871538681131, + "grad_norm": 2.469790458679199, + "learning_rate": 4.9980279567992304e-05, + "loss": 6.7056, + "step": 2127 + }, + { + "epoch": 0.012655818821962128, + "grad_norm": 2.4310262203216553, + "learning_rate": 4.9980261014395094e-05, + "loss": 6.8809, + "step": 2128 + }, + { + "epoch": 0.012661766105243125, + "grad_norm": 2.772359609603882, + "learning_rate": 4.998024245207754e-05, + "loss": 7.0383, + "step": 2129 + }, + { + "epoch": 0.012667713388524121, + "grad_norm": 2.292144775390625, + "learning_rate": 4.9980223881039635e-05, + "loss": 6.9062, + "step": 2130 + }, + { + "epoch": 0.01267366067180512, + "grad_norm": 2.590363025665283, + "learning_rate": 4.998020530128139e-05, + "loss": 6.5803, + "step": 2131 + }, + { + "epoch": 0.012679607955086117, + "grad_norm": 2.78432035446167, + "learning_rate": 4.9980186712802824e-05, + "loss": 6.788, + "step": 2132 + }, + { + "epoch": 0.012685555238367114, + "grad_norm": 2.6188290119171143, + "learning_rate": 4.998016811560392e-05, + "loss": 6.5827, + "step": 2133 + }, + { + "epoch": 0.01269150252164811, + "grad_norm": 2.868215560913086, + "learning_rate": 4.99801495096847e-05, + "loss": 6.5845, + "step": 2134 + }, + { + "epoch": 0.012697449804929109, + "grad_norm": 2.4738945960998535, + "learning_rate": 4.998013089504518e-05, + "loss": 6.5019, + "step": 2135 + }, + { + "epoch": 0.012703397088210106, + "grad_norm": 2.5315287113189697, + "learning_rate": 4.998011227168534e-05, + "loss": 6.6765, + "step": 2136 + }, + { + "epoch": 0.012709344371491103, + "grad_norm": 2.7871086597442627, + "learning_rate": 4.998009363960521e-05, + "loss": 6.64, + "step": 2137 + }, + { + "epoch": 0.0127152916547721, + "grad_norm": 2.267502784729004, + "learning_rate": 4.998007499880479e-05, + "loss": 6.8665, + "step": 2138 + }, + { + "epoch": 0.012721238938053098, + "grad_norm": 2.5014212131500244, + "learning_rate": 4.998005634928408e-05, + "loss": 6.6757, + "step": 2139 + }, + { + "epoch": 0.012727186221334095, + "grad_norm": 2.3600070476531982, + "learning_rate": 4.998003769104308e-05, + "loss": 6.5425, + "step": 2140 + }, + { + "epoch": 0.012733133504615092, + "grad_norm": 2.32123064994812, + "learning_rate": 4.998001902408182e-05, + "loss": 6.5192, + "step": 2141 + }, + { + "epoch": 0.012739080787896088, + "grad_norm": 2.5059258937835693, + "learning_rate": 4.998000034840029e-05, + "loss": 6.6315, + "step": 2142 + }, + { + "epoch": 0.012745028071177087, + "grad_norm": 2.2143092155456543, + "learning_rate": 4.99799816639985e-05, + "loss": 6.6058, + "step": 2143 + }, + { + "epoch": 0.012750975354458084, + "grad_norm": 2.3660342693328857, + "learning_rate": 4.997996297087645e-05, + "loss": 6.554, + "step": 2144 + }, + { + "epoch": 0.01275692263773908, + "grad_norm": 2.4286036491394043, + "learning_rate": 4.9979944269034164e-05, + "loss": 6.4857, + "step": 2145 + }, + { + "epoch": 0.012762869921020078, + "grad_norm": 2.4002180099487305, + "learning_rate": 4.997992555847163e-05, + "loss": 6.5083, + "step": 2146 + }, + { + "epoch": 0.012768817204301076, + "grad_norm": 2.418942451477051, + "learning_rate": 4.997990683918886e-05, + "loss": 6.5471, + "step": 2147 + }, + { + "epoch": 0.012774764487582073, + "grad_norm": 2.535654067993164, + "learning_rate": 4.997988811118587e-05, + "loss": 6.5999, + "step": 2148 + }, + { + "epoch": 0.01278071177086307, + "grad_norm": 2.581505298614502, + "learning_rate": 4.9979869374462655e-05, + "loss": 6.2525, + "step": 2149 + }, + { + "epoch": 0.012786659054144067, + "grad_norm": 2.681297779083252, + "learning_rate": 4.997985062901923e-05, + "loss": 6.1463, + "step": 2150 + }, + { + "epoch": 0.012792606337425065, + "grad_norm": 2.3542990684509277, + "learning_rate": 4.997983187485559e-05, + "loss": 6.433, + "step": 2151 + }, + { + "epoch": 0.012798553620706062, + "grad_norm": 2.2994048595428467, + "learning_rate": 4.997981311197175e-05, + "loss": 6.5952, + "step": 2152 + }, + { + "epoch": 0.012804500903987059, + "grad_norm": 2.4703454971313477, + "learning_rate": 4.9979794340367724e-05, + "loss": 6.5581, + "step": 2153 + }, + { + "epoch": 0.012810448187268056, + "grad_norm": 2.511383533477783, + "learning_rate": 4.9979775560043504e-05, + "loss": 6.577, + "step": 2154 + }, + { + "epoch": 0.012816395470549052, + "grad_norm": 2.3300156593322754, + "learning_rate": 4.99797567709991e-05, + "loss": 6.4349, + "step": 2155 + }, + { + "epoch": 0.012822342753830051, + "grad_norm": 2.523878574371338, + "learning_rate": 4.997973797323452e-05, + "loss": 6.5044, + "step": 2156 + }, + { + "epoch": 0.012828290037111048, + "grad_norm": 2.4185073375701904, + "learning_rate": 4.9979719166749776e-05, + "loss": 6.537, + "step": 2157 + }, + { + "epoch": 0.012834237320392045, + "grad_norm": 2.324090003967285, + "learning_rate": 4.997970035154487e-05, + "loss": 6.803, + "step": 2158 + }, + { + "epoch": 0.012840184603673041, + "grad_norm": 2.468872547149658, + "learning_rate": 4.9979681527619804e-05, + "loss": 7.0837, + "step": 2159 + }, + { + "epoch": 0.01284613188695404, + "grad_norm": 2.1467936038970947, + "learning_rate": 4.99796626949746e-05, + "loss": 6.7373, + "step": 2160 + }, + { + "epoch": 0.012852079170235037, + "grad_norm": 2.3208062648773193, + "learning_rate": 4.9979643853609246e-05, + "loss": 6.5483, + "step": 2161 + }, + { + "epoch": 0.012858026453516034, + "grad_norm": 2.2797584533691406, + "learning_rate": 4.997962500352376e-05, + "loss": 6.5857, + "step": 2162 + }, + { + "epoch": 0.01286397373679703, + "grad_norm": 2.3447721004486084, + "learning_rate": 4.9979606144718135e-05, + "loss": 6.8511, + "step": 2163 + }, + { + "epoch": 0.012869921020078029, + "grad_norm": 2.6456334590911865, + "learning_rate": 4.9979587277192395e-05, + "loss": 6.9457, + "step": 2164 + }, + { + "epoch": 0.012875868303359026, + "grad_norm": 3.2567737102508545, + "learning_rate": 4.997956840094654e-05, + "loss": 6.6405, + "step": 2165 + }, + { + "epoch": 0.012881815586640023, + "grad_norm": 2.847371816635132, + "learning_rate": 4.9979549515980574e-05, + "loss": 6.751, + "step": 2166 + }, + { + "epoch": 0.01288776286992102, + "grad_norm": 2.999779462814331, + "learning_rate": 4.99795306222945e-05, + "loss": 6.7437, + "step": 2167 + }, + { + "epoch": 0.012893710153202018, + "grad_norm": 2.3793458938598633, + "learning_rate": 4.9979511719888336e-05, + "loss": 6.6864, + "step": 2168 + }, + { + "epoch": 0.012899657436483015, + "grad_norm": 2.284724473953247, + "learning_rate": 4.9979492808762084e-05, + "loss": 6.4237, + "step": 2169 + }, + { + "epoch": 0.012905604719764012, + "grad_norm": 2.560758352279663, + "learning_rate": 4.997947388891575e-05, + "loss": 6.5964, + "step": 2170 + }, + { + "epoch": 0.012911552003045008, + "grad_norm": 2.7461421489715576, + "learning_rate": 4.997945496034934e-05, + "loss": 6.5354, + "step": 2171 + }, + { + "epoch": 0.012917499286326007, + "grad_norm": 3.0868208408355713, + "learning_rate": 4.9979436023062854e-05, + "loss": 6.6445, + "step": 2172 + }, + { + "epoch": 0.012923446569607004, + "grad_norm": 2.565009593963623, + "learning_rate": 4.997941707705631e-05, + "loss": 6.6015, + "step": 2173 + }, + { + "epoch": 0.012929393852888, + "grad_norm": 2.9424686431884766, + "learning_rate": 4.997939812232971e-05, + "loss": 6.4887, + "step": 2174 + }, + { + "epoch": 0.012935341136168997, + "grad_norm": 3.0674476623535156, + "learning_rate": 4.997937915888305e-05, + "loss": 6.4728, + "step": 2175 + }, + { + "epoch": 0.012941288419449996, + "grad_norm": 3.040189266204834, + "learning_rate": 4.997936018671636e-05, + "loss": 6.3788, + "step": 2176 + }, + { + "epoch": 0.012947235702730993, + "grad_norm": 2.756211042404175, + "learning_rate": 4.9979341205829626e-05, + "loss": 6.4167, + "step": 2177 + }, + { + "epoch": 0.01295318298601199, + "grad_norm": 2.6333322525024414, + "learning_rate": 4.997932221622287e-05, + "loss": 6.6392, + "step": 2178 + }, + { + "epoch": 0.012959130269292986, + "grad_norm": 2.6951076984405518, + "learning_rate": 4.997930321789608e-05, + "loss": 6.3299, + "step": 2179 + }, + { + "epoch": 0.012965077552573985, + "grad_norm": 2.5388028621673584, + "learning_rate": 4.997928421084928e-05, + "loss": 6.2646, + "step": 2180 + }, + { + "epoch": 0.012971024835854982, + "grad_norm": 3.312171459197998, + "learning_rate": 4.997926519508247e-05, + "loss": 6.6331, + "step": 2181 + }, + { + "epoch": 0.012976972119135979, + "grad_norm": 3.437025547027588, + "learning_rate": 4.997924617059565e-05, + "loss": 5.5981, + "step": 2182 + }, + { + "epoch": 0.012982919402416975, + "grad_norm": 2.74035906791687, + "learning_rate": 4.997922713738884e-05, + "loss": 5.1641, + "step": 2183 + }, + { + "epoch": 0.012988866685697972, + "grad_norm": 2.618525505065918, + "learning_rate": 4.9979208095462036e-05, + "loss": 5.9978, + "step": 2184 + }, + { + "epoch": 0.012994813968978971, + "grad_norm": 2.633692502975464, + "learning_rate": 4.9979189044815254e-05, + "loss": 6.2812, + "step": 2185 + }, + { + "epoch": 0.013000761252259968, + "grad_norm": 2.087557792663574, + "learning_rate": 4.997916998544849e-05, + "loss": 6.2864, + "step": 2186 + }, + { + "epoch": 0.013006708535540965, + "grad_norm": 3.365112066268921, + "learning_rate": 4.997915091736176e-05, + "loss": 5.3517, + "step": 2187 + }, + { + "epoch": 0.013012655818821961, + "grad_norm": 2.7561593055725098, + "learning_rate": 4.997913184055506e-05, + "loss": 6.3667, + "step": 2188 + }, + { + "epoch": 0.01301860310210296, + "grad_norm": 2.630976676940918, + "learning_rate": 4.9979112755028415e-05, + "loss": 6.5858, + "step": 2189 + }, + { + "epoch": 0.013024550385383957, + "grad_norm": 2.56007981300354, + "learning_rate": 4.9979093660781805e-05, + "loss": 6.6862, + "step": 2190 + }, + { + "epoch": 0.013030497668664954, + "grad_norm": 2.509631633758545, + "learning_rate": 4.997907455781526e-05, + "loss": 6.4699, + "step": 2191 + }, + { + "epoch": 0.01303644495194595, + "grad_norm": 2.442028522491455, + "learning_rate": 4.997905544612878e-05, + "loss": 6.5755, + "step": 2192 + }, + { + "epoch": 0.013042392235226949, + "grad_norm": 2.561016321182251, + "learning_rate": 4.997903632572236e-05, + "loss": 6.4529, + "step": 2193 + }, + { + "epoch": 0.013048339518507946, + "grad_norm": 2.585753917694092, + "learning_rate": 4.9979017196596025e-05, + "loss": 6.188, + "step": 2194 + }, + { + "epoch": 0.013054286801788943, + "grad_norm": 2.3657655715942383, + "learning_rate": 4.997899805874977e-05, + "loss": 6.1414, + "step": 2195 + }, + { + "epoch": 0.01306023408506994, + "grad_norm": 2.818251609802246, + "learning_rate": 4.997897891218361e-05, + "loss": 6.5276, + "step": 2196 + }, + { + "epoch": 0.013066181368350938, + "grad_norm": 2.9687695503234863, + "learning_rate": 4.997895975689754e-05, + "loss": 6.131, + "step": 2197 + }, + { + "epoch": 0.013072128651631935, + "grad_norm": 2.8505353927612305, + "learning_rate": 4.997894059289157e-05, + "loss": 6.5269, + "step": 2198 + }, + { + "epoch": 0.013078075934912932, + "grad_norm": 2.331573486328125, + "learning_rate": 4.997892142016573e-05, + "loss": 6.1101, + "step": 2199 + }, + { + "epoch": 0.013084023218193928, + "grad_norm": 2.3241569995880127, + "learning_rate": 4.997890223871998e-05, + "loss": 6.5081, + "step": 2200 + }, + { + "epoch": 0.013089970501474927, + "grad_norm": 2.658834218978882, + "learning_rate": 4.997888304855437e-05, + "loss": 6.554, + "step": 2201 + }, + { + "epoch": 0.013095917784755924, + "grad_norm": 2.703911304473877, + "learning_rate": 4.997886384966889e-05, + "loss": 6.337, + "step": 2202 + }, + { + "epoch": 0.01310186506803692, + "grad_norm": 3.020775318145752, + "learning_rate": 4.997884464206354e-05, + "loss": 6.4375, + "step": 2203 + }, + { + "epoch": 0.013107812351317917, + "grad_norm": 3.324218273162842, + "learning_rate": 4.9978825425738334e-05, + "loss": 6.4871, + "step": 2204 + }, + { + "epoch": 0.013113759634598916, + "grad_norm": 3.822019577026367, + "learning_rate": 4.9978806200693276e-05, + "loss": 6.6372, + "step": 2205 + }, + { + "epoch": 0.013119706917879913, + "grad_norm": 3.3639512062072754, + "learning_rate": 4.997878696692838e-05, + "loss": 6.1826, + "step": 2206 + }, + { + "epoch": 0.01312565420116091, + "grad_norm": 3.580603837966919, + "learning_rate": 4.997876772444365e-05, + "loss": 6.793, + "step": 2207 + }, + { + "epoch": 0.013131601484441906, + "grad_norm": 2.472733497619629, + "learning_rate": 4.9978748473239084e-05, + "loss": 6.9054, + "step": 2208 + }, + { + "epoch": 0.013137548767722905, + "grad_norm": 3.327461004257202, + "learning_rate": 4.99787292133147e-05, + "loss": 6.6735, + "step": 2209 + }, + { + "epoch": 0.013143496051003902, + "grad_norm": 3.493234157562256, + "learning_rate": 4.99787099446705e-05, + "loss": 6.9702, + "step": 2210 + }, + { + "epoch": 0.013149443334284899, + "grad_norm": 2.2516424655914307, + "learning_rate": 4.9978690667306483e-05, + "loss": 7.196, + "step": 2211 + }, + { + "epoch": 0.013155390617565895, + "grad_norm": 1.8846355676651, + "learning_rate": 4.9978671381222665e-05, + "loss": 7.0373, + "step": 2212 + }, + { + "epoch": 0.013161337900846894, + "grad_norm": 2.9334232807159424, + "learning_rate": 4.997865208641906e-05, + "loss": 6.2065, + "step": 2213 + }, + { + "epoch": 0.01316728518412789, + "grad_norm": 2.713006019592285, + "learning_rate": 4.997863278289565e-05, + "loss": 6.788, + "step": 2214 + }, + { + "epoch": 0.013173232467408888, + "grad_norm": 2.6246018409729004, + "learning_rate": 4.9978613470652466e-05, + "loss": 6.7979, + "step": 2215 + }, + { + "epoch": 0.013179179750689884, + "grad_norm": 2.2770373821258545, + "learning_rate": 4.997859414968951e-05, + "loss": 6.8307, + "step": 2216 + }, + { + "epoch": 0.013185127033970881, + "grad_norm": 2.6244993209838867, + "learning_rate": 4.997857482000679e-05, + "loss": 6.3176, + "step": 2217 + }, + { + "epoch": 0.01319107431725188, + "grad_norm": 3.4668054580688477, + "learning_rate": 4.997855548160429e-05, + "loss": 6.8962, + "step": 2218 + }, + { + "epoch": 0.013197021600532877, + "grad_norm": 2.711785078048706, + "learning_rate": 4.9978536134482047e-05, + "loss": 6.7111, + "step": 2219 + }, + { + "epoch": 0.013202968883813873, + "grad_norm": 2.6757078170776367, + "learning_rate": 4.997851677864005e-05, + "loss": 6.5501, + "step": 2220 + }, + { + "epoch": 0.01320891616709487, + "grad_norm": 2.150338888168335, + "learning_rate": 4.997849741407831e-05, + "loss": 6.43, + "step": 2221 + }, + { + "epoch": 0.013214863450375869, + "grad_norm": 3.115309953689575, + "learning_rate": 4.9978478040796836e-05, + "loss": 6.4074, + "step": 2222 + }, + { + "epoch": 0.013220810733656866, + "grad_norm": 2.8754189014434814, + "learning_rate": 4.997845865879564e-05, + "loss": 6.2663, + "step": 2223 + }, + { + "epoch": 0.013226758016937862, + "grad_norm": 2.6169707775115967, + "learning_rate": 4.9978439268074716e-05, + "loss": 6.5987, + "step": 2224 + }, + { + "epoch": 0.01323270530021886, + "grad_norm": 2.3814637660980225, + "learning_rate": 4.997841986863408e-05, + "loss": 6.8124, + "step": 2225 + }, + { + "epoch": 0.013238652583499858, + "grad_norm": 2.0276811122894287, + "learning_rate": 4.997840046047373e-05, + "loss": 6.6632, + "step": 2226 + }, + { + "epoch": 0.013244599866780855, + "grad_norm": 2.7943263053894043, + "learning_rate": 4.997838104359368e-05, + "loss": 6.5452, + "step": 2227 + }, + { + "epoch": 0.013250547150061852, + "grad_norm": 2.4058234691619873, + "learning_rate": 4.997836161799393e-05, + "loss": 6.4697, + "step": 2228 + }, + { + "epoch": 0.013256494433342848, + "grad_norm": 2.2487008571624756, + "learning_rate": 4.9978342183674504e-05, + "loss": 6.3361, + "step": 2229 + }, + { + "epoch": 0.013262441716623847, + "grad_norm": 2.3470170497894287, + "learning_rate": 4.997832274063539e-05, + "loss": 6.4024, + "step": 2230 + }, + { + "epoch": 0.013268388999904844, + "grad_norm": 2.589695692062378, + "learning_rate": 4.9978303288876606e-05, + "loss": 6.4184, + "step": 2231 + }, + { + "epoch": 0.01327433628318584, + "grad_norm": 2.691371440887451, + "learning_rate": 4.997828382839815e-05, + "loss": 6.4225, + "step": 2232 + }, + { + "epoch": 0.013280283566466837, + "grad_norm": 3.110410213470459, + "learning_rate": 4.997826435920003e-05, + "loss": 6.5307, + "step": 2233 + }, + { + "epoch": 0.013286230849747836, + "grad_norm": 2.688519239425659, + "learning_rate": 4.9978244881282266e-05, + "loss": 6.568, + "step": 2234 + }, + { + "epoch": 0.013292178133028833, + "grad_norm": 2.3346059322357178, + "learning_rate": 4.997822539464485e-05, + "loss": 6.8837, + "step": 2235 + }, + { + "epoch": 0.01329812541630983, + "grad_norm": 2.679826021194458, + "learning_rate": 4.997820589928779e-05, + "loss": 6.3961, + "step": 2236 + }, + { + "epoch": 0.013304072699590826, + "grad_norm": 2.388120412826538, + "learning_rate": 4.99781863952111e-05, + "loss": 6.4363, + "step": 2237 + }, + { + "epoch": 0.013310019982871825, + "grad_norm": 2.834341049194336, + "learning_rate": 4.997816688241478e-05, + "loss": 6.4855, + "step": 2238 + }, + { + "epoch": 0.013315967266152822, + "grad_norm": 2.8623831272125244, + "learning_rate": 4.997814736089885e-05, + "loss": 6.8607, + "step": 2239 + }, + { + "epoch": 0.013321914549433819, + "grad_norm": 3.001241683959961, + "learning_rate": 4.99781278306633e-05, + "loss": 6.9777, + "step": 2240 + }, + { + "epoch": 0.013327861832714815, + "grad_norm": 2.9721016883850098, + "learning_rate": 4.9978108291708135e-05, + "loss": 6.9821, + "step": 2241 + }, + { + "epoch": 0.013333809115995814, + "grad_norm": 2.798360824584961, + "learning_rate": 4.997808874403338e-05, + "loss": 7.0096, + "step": 2242 + }, + { + "epoch": 0.01333975639927681, + "grad_norm": 3.2242093086242676, + "learning_rate": 4.997806918763903e-05, + "loss": 6.9091, + "step": 2243 + }, + { + "epoch": 0.013345703682557808, + "grad_norm": 2.681920289993286, + "learning_rate": 4.99780496225251e-05, + "loss": 6.7769, + "step": 2244 + }, + { + "epoch": 0.013351650965838804, + "grad_norm": 3.199514865875244, + "learning_rate": 4.9978030048691584e-05, + "loss": 6.6202, + "step": 2245 + }, + { + "epoch": 0.013357598249119801, + "grad_norm": 2.89886474609375, + "learning_rate": 4.9978010466138496e-05, + "loss": 6.7075, + "step": 2246 + }, + { + "epoch": 0.0133635455324008, + "grad_norm": 2.7091262340545654, + "learning_rate": 4.997799087486584e-05, + "loss": 6.9129, + "step": 2247 + }, + { + "epoch": 0.013369492815681797, + "grad_norm": 2.2538888454437256, + "learning_rate": 4.997797127487364e-05, + "loss": 6.6412, + "step": 2248 + }, + { + "epoch": 0.013375440098962793, + "grad_norm": 2.668286085128784, + "learning_rate": 4.997795166616187e-05, + "loss": 6.8506, + "step": 2249 + }, + { + "epoch": 0.01338138738224379, + "grad_norm": 3.915975570678711, + "learning_rate": 4.997793204873057e-05, + "loss": 6.567, + "step": 2250 + }, + { + "epoch": 0.013387334665524789, + "grad_norm": 2.5549614429473877, + "learning_rate": 4.997791242257972e-05, + "loss": 6.7971, + "step": 2251 + }, + { + "epoch": 0.013393281948805786, + "grad_norm": 2.511810064315796, + "learning_rate": 4.997789278770935e-05, + "loss": 7.1949, + "step": 2252 + }, + { + "epoch": 0.013399229232086782, + "grad_norm": 2.026937484741211, + "learning_rate": 4.9977873144119445e-05, + "loss": 7.2067, + "step": 2253 + }, + { + "epoch": 0.01340517651536778, + "grad_norm": 3.6016058921813965, + "learning_rate": 4.997785349181002e-05, + "loss": 6.549, + "step": 2254 + }, + { + "epoch": 0.013411123798648778, + "grad_norm": 2.867418050765991, + "learning_rate": 4.9977833830781094e-05, + "loss": 6.5562, + "step": 2255 + }, + { + "epoch": 0.013417071081929775, + "grad_norm": 2.2168800830841064, + "learning_rate": 4.9977814161032665e-05, + "loss": 7.1798, + "step": 2256 + }, + { + "epoch": 0.013423018365210771, + "grad_norm": 2.728299856185913, + "learning_rate": 4.997779448256473e-05, + "loss": 6.9314, + "step": 2257 + }, + { + "epoch": 0.013428965648491768, + "grad_norm": 2.7336437702178955, + "learning_rate": 4.997777479537732e-05, + "loss": 7.0643, + "step": 2258 + }, + { + "epoch": 0.013434912931772767, + "grad_norm": 3.1546053886413574, + "learning_rate": 4.997775509947041e-05, + "loss": 6.8853, + "step": 2259 + }, + { + "epoch": 0.013440860215053764, + "grad_norm": 3.037036180496216, + "learning_rate": 4.997773539484404e-05, + "loss": 6.6892, + "step": 2260 + }, + { + "epoch": 0.01344680749833476, + "grad_norm": 2.8779382705688477, + "learning_rate": 4.997771568149818e-05, + "loss": 6.4991, + "step": 2261 + }, + { + "epoch": 0.013452754781615757, + "grad_norm": 3.1105282306671143, + "learning_rate": 4.997769595943288e-05, + "loss": 6.4253, + "step": 2262 + }, + { + "epoch": 0.013458702064896756, + "grad_norm": 4.604808330535889, + "learning_rate": 4.997767622864811e-05, + "loss": 6.504, + "step": 2263 + }, + { + "epoch": 0.013464649348177753, + "grad_norm": 4.345273017883301, + "learning_rate": 4.9977656489143896e-05, + "loss": 6.2, + "step": 2264 + }, + { + "epoch": 0.01347059663145875, + "grad_norm": 2.9744133949279785, + "learning_rate": 4.9977636740920243e-05, + "loss": 6.5458, + "step": 2265 + }, + { + "epoch": 0.013476543914739746, + "grad_norm": 3.3981447219848633, + "learning_rate": 4.9977616983977146e-05, + "loss": 6.9791, + "step": 2266 + }, + { + "epoch": 0.013482491198020745, + "grad_norm": 2.5855109691619873, + "learning_rate": 4.997759721831463e-05, + "loss": 6.7425, + "step": 2267 + }, + { + "epoch": 0.013488438481301742, + "grad_norm": 3.961195707321167, + "learning_rate": 4.997757744393269e-05, + "loss": 6.4042, + "step": 2268 + }, + { + "epoch": 0.013494385764582739, + "grad_norm": 3.8216230869293213, + "learning_rate": 4.997755766083133e-05, + "loss": 6.4962, + "step": 2269 + }, + { + "epoch": 0.013500333047863735, + "grad_norm": 3.077279567718506, + "learning_rate": 4.9977537869010574e-05, + "loss": 6.4298, + "step": 2270 + }, + { + "epoch": 0.013506280331144734, + "grad_norm": 2.56152081489563, + "learning_rate": 4.9977518068470406e-05, + "loss": 6.35, + "step": 2271 + }, + { + "epoch": 0.01351222761442573, + "grad_norm": 2.4069855213165283, + "learning_rate": 4.9977498259210854e-05, + "loss": 6.2923, + "step": 2272 + }, + { + "epoch": 0.013518174897706728, + "grad_norm": 2.9591124057769775, + "learning_rate": 4.9977478441231904e-05, + "loss": 6.2477, + "step": 2273 + }, + { + "epoch": 0.013524122180987724, + "grad_norm": 2.627110481262207, + "learning_rate": 4.997745861453359e-05, + "loss": 6.1012, + "step": 2274 + }, + { + "epoch": 0.013530069464268723, + "grad_norm": 2.3042867183685303, + "learning_rate": 4.997743877911589e-05, + "loss": 6.1155, + "step": 2275 + }, + { + "epoch": 0.01353601674754972, + "grad_norm": 2.709324359893799, + "learning_rate": 4.997741893497882e-05, + "loss": 6.0103, + "step": 2276 + }, + { + "epoch": 0.013541964030830717, + "grad_norm": 2.7087934017181396, + "learning_rate": 4.997739908212241e-05, + "loss": 6.0709, + "step": 2277 + }, + { + "epoch": 0.013547911314111713, + "grad_norm": 3.560149669647217, + "learning_rate": 4.997737922054664e-05, + "loss": 6.1775, + "step": 2278 + }, + { + "epoch": 0.01355385859739271, + "grad_norm": 4.623898506164551, + "learning_rate": 4.997735935025152e-05, + "loss": 6.1993, + "step": 2279 + }, + { + "epoch": 0.013559805880673709, + "grad_norm": 2.9960882663726807, + "learning_rate": 4.997733947123707e-05, + "loss": 6.4211, + "step": 2280 + }, + { + "epoch": 0.013565753163954706, + "grad_norm": 3.8918421268463135, + "learning_rate": 4.9977319583503276e-05, + "loss": 6.0194, + "step": 2281 + }, + { + "epoch": 0.013571700447235702, + "grad_norm": 3.4164741039276123, + "learning_rate": 4.997729968705017e-05, + "loss": 5.9824, + "step": 2282 + }, + { + "epoch": 0.0135776477305167, + "grad_norm": 2.4005794525146484, + "learning_rate": 4.997727978187774e-05, + "loss": 5.9727, + "step": 2283 + }, + { + "epoch": 0.013583595013797698, + "grad_norm": 2.4654550552368164, + "learning_rate": 4.9977259867986e-05, + "loss": 6.2681, + "step": 2284 + }, + { + "epoch": 0.013589542297078695, + "grad_norm": 3.193905830383301, + "learning_rate": 4.997723994537496e-05, + "loss": 6.4996, + "step": 2285 + }, + { + "epoch": 0.013595489580359691, + "grad_norm": 2.4845757484436035, + "learning_rate": 4.997722001404462e-05, + "loss": 7.0464, + "step": 2286 + }, + { + "epoch": 0.013601436863640688, + "grad_norm": 3.170182466506958, + "learning_rate": 4.9977200073995e-05, + "loss": 6.1071, + "step": 2287 + }, + { + "epoch": 0.013607384146921687, + "grad_norm": 2.2331149578094482, + "learning_rate": 4.997718012522609e-05, + "loss": 6.6823, + "step": 2288 + }, + { + "epoch": 0.013613331430202684, + "grad_norm": 2.4146671295166016, + "learning_rate": 4.9977160167737904e-05, + "loss": 6.4398, + "step": 2289 + }, + { + "epoch": 0.01361927871348368, + "grad_norm": 3.23956561088562, + "learning_rate": 4.9977140201530445e-05, + "loss": 6.9295, + "step": 2290 + }, + { + "epoch": 0.013625225996764677, + "grad_norm": 3.402979850769043, + "learning_rate": 4.997712022660374e-05, + "loss": 6.7116, + "step": 2291 + }, + { + "epoch": 0.013631173280045676, + "grad_norm": 3.241320848464966, + "learning_rate": 4.997710024295777e-05, + "loss": 6.8871, + "step": 2292 + }, + { + "epoch": 0.013637120563326673, + "grad_norm": 2.5378634929656982, + "learning_rate": 4.997708025059255e-05, + "loss": 6.9548, + "step": 2293 + }, + { + "epoch": 0.01364306784660767, + "grad_norm": 3.1968839168548584, + "learning_rate": 4.9977060249508087e-05, + "loss": 6.6388, + "step": 2294 + }, + { + "epoch": 0.013649015129888666, + "grad_norm": 2.6951656341552734, + "learning_rate": 4.99770402397044e-05, + "loss": 6.9654, + "step": 2295 + }, + { + "epoch": 0.013654962413169665, + "grad_norm": 2.4168484210968018, + "learning_rate": 4.997702022118147e-05, + "loss": 6.6666, + "step": 2296 + }, + { + "epoch": 0.013660909696450662, + "grad_norm": 3.1395177841186523, + "learning_rate": 4.997700019393934e-05, + "loss": 6.4957, + "step": 2297 + }, + { + "epoch": 0.013666856979731658, + "grad_norm": 3.1591687202453613, + "learning_rate": 4.9976980157977985e-05, + "loss": 6.4392, + "step": 2298 + }, + { + "epoch": 0.013672804263012655, + "grad_norm": 2.2415151596069336, + "learning_rate": 4.9976960113297436e-05, + "loss": 6.4543, + "step": 2299 + }, + { + "epoch": 0.013678751546293654, + "grad_norm": 3.9113616943359375, + "learning_rate": 4.997694005989767e-05, + "loss": 6.7088, + "step": 2300 + }, + { + "epoch": 0.01368469882957465, + "grad_norm": 4.218390941619873, + "learning_rate": 4.997691999777873e-05, + "loss": 6.7199, + "step": 2301 + }, + { + "epoch": 0.013690646112855647, + "grad_norm": 4.200760841369629, + "learning_rate": 4.997689992694059e-05, + "loss": 6.6343, + "step": 2302 + }, + { + "epoch": 0.013696593396136644, + "grad_norm": 3.7164547443389893, + "learning_rate": 4.997687984738328e-05, + "loss": 6.772, + "step": 2303 + }, + { + "epoch": 0.013702540679417643, + "grad_norm": 2.1898231506347656, + "learning_rate": 4.99768597591068e-05, + "loss": 6.6165, + "step": 2304 + }, + { + "epoch": 0.01370848796269864, + "grad_norm": 2.72632098197937, + "learning_rate": 4.9976839662111166e-05, + "loss": 6.6474, + "step": 2305 + }, + { + "epoch": 0.013714435245979636, + "grad_norm": 3.64900279045105, + "learning_rate": 4.997681955639636e-05, + "loss": 6.4322, + "step": 2306 + }, + { + "epoch": 0.013720382529260633, + "grad_norm": 3.978445053100586, + "learning_rate": 4.997679944196241e-05, + "loss": 6.5434, + "step": 2307 + }, + { + "epoch": 0.01372632981254163, + "grad_norm": 5.709702491760254, + "learning_rate": 4.997677931880931e-05, + "loss": 6.5234, + "step": 2308 + }, + { + "epoch": 0.013732277095822629, + "grad_norm": 3.0389838218688965, + "learning_rate": 4.997675918693708e-05, + "loss": 6.4163, + "step": 2309 + }, + { + "epoch": 0.013738224379103625, + "grad_norm": 2.695113182067871, + "learning_rate": 4.9976739046345725e-05, + "loss": 6.6956, + "step": 2310 + }, + { + "epoch": 0.013744171662384622, + "grad_norm": 2.9768142700195312, + "learning_rate": 4.997671889703525e-05, + "loss": 6.5315, + "step": 2311 + }, + { + "epoch": 0.01375011894566562, + "grad_norm": 3.750454902648926, + "learning_rate": 4.997669873900566e-05, + "loss": 6.5568, + "step": 2312 + }, + { + "epoch": 0.013756066228946618, + "grad_norm": 3.390232801437378, + "learning_rate": 4.9976678572256955e-05, + "loss": 6.4916, + "step": 2313 + }, + { + "epoch": 0.013762013512227615, + "grad_norm": 3.1487748622894287, + "learning_rate": 4.997665839678915e-05, + "loss": 6.6378, + "step": 2314 + }, + { + "epoch": 0.013767960795508611, + "grad_norm": 2.5654940605163574, + "learning_rate": 4.997663821260226e-05, + "loss": 6.5817, + "step": 2315 + }, + { + "epoch": 0.013773908078789608, + "grad_norm": 2.7092552185058594, + "learning_rate": 4.9976618019696275e-05, + "loss": 6.982, + "step": 2316 + }, + { + "epoch": 0.013779855362070607, + "grad_norm": 3.642826557159424, + "learning_rate": 4.9976597818071214e-05, + "loss": 6.7951, + "step": 2317 + }, + { + "epoch": 0.013785802645351604, + "grad_norm": 3.4288947582244873, + "learning_rate": 4.997657760772708e-05, + "loss": 6.4366, + "step": 2318 + }, + { + "epoch": 0.0137917499286326, + "grad_norm": 2.7620253562927246, + "learning_rate": 4.997655738866389e-05, + "loss": 6.6588, + "step": 2319 + }, + { + "epoch": 0.013797697211913597, + "grad_norm": 2.4266698360443115, + "learning_rate": 4.997653716088163e-05, + "loss": 6.697, + "step": 2320 + }, + { + "epoch": 0.013803644495194596, + "grad_norm": 2.289365768432617, + "learning_rate": 4.9976516924380325e-05, + "loss": 6.7583, + "step": 2321 + }, + { + "epoch": 0.013809591778475593, + "grad_norm": 2.4238948822021484, + "learning_rate": 4.9976496679159976e-05, + "loss": 6.7949, + "step": 2322 + }, + { + "epoch": 0.01381553906175659, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.997647642522059e-05, + "loss": 6.5914, + "step": 2323 + }, + { + "epoch": 0.013821486345037586, + "grad_norm": 2.961089849472046, + "learning_rate": 4.997645616256217e-05, + "loss": 6.3513, + "step": 2324 + }, + { + "epoch": 0.013827433628318585, + "grad_norm": 2.437685251235962, + "learning_rate": 4.997643589118472e-05, + "loss": 6.4626, + "step": 2325 + }, + { + "epoch": 0.013833380911599582, + "grad_norm": 2.769731044769287, + "learning_rate": 4.9976415611088267e-05, + "loss": 6.2801, + "step": 2326 + }, + { + "epoch": 0.013839328194880578, + "grad_norm": 2.700697183609009, + "learning_rate": 4.9976395322272805e-05, + "loss": 6.1969, + "step": 2327 + }, + { + "epoch": 0.013845275478161575, + "grad_norm": 3.8049886226654053, + "learning_rate": 4.997637502473834e-05, + "loss": 6.769, + "step": 2328 + }, + { + "epoch": 0.013851222761442574, + "grad_norm": 3.748903512954712, + "learning_rate": 4.9976354718484875e-05, + "loss": 6.6486, + "step": 2329 + }, + { + "epoch": 0.01385717004472357, + "grad_norm": 3.7807834148406982, + "learning_rate": 4.9976334403512426e-05, + "loss": 6.6251, + "step": 2330 + }, + { + "epoch": 0.013863117328004567, + "grad_norm": 2.5358874797821045, + "learning_rate": 4.997631407982099e-05, + "loss": 6.4425, + "step": 2331 + }, + { + "epoch": 0.013869064611285564, + "grad_norm": 2.4619522094726562, + "learning_rate": 4.9976293747410596e-05, + "loss": 7.2166, + "step": 2332 + }, + { + "epoch": 0.013875011894566563, + "grad_norm": 2.740412473678589, + "learning_rate": 4.997627340628123e-05, + "loss": 6.8907, + "step": 2333 + }, + { + "epoch": 0.01388095917784756, + "grad_norm": 2.872852087020874, + "learning_rate": 4.9976253056432895e-05, + "loss": 6.6142, + "step": 2334 + }, + { + "epoch": 0.013886906461128556, + "grad_norm": 2.01629900932312, + "learning_rate": 4.997623269786562e-05, + "loss": 6.398, + "step": 2335 + }, + { + "epoch": 0.013892853744409553, + "grad_norm": 2.4405698776245117, + "learning_rate": 4.99762123305794e-05, + "loss": 6.9282, + "step": 2336 + }, + { + "epoch": 0.013898801027690552, + "grad_norm": 2.2520413398742676, + "learning_rate": 4.9976191954574235e-05, + "loss": 6.5565, + "step": 2337 + }, + { + "epoch": 0.013904748310971549, + "grad_norm": 2.314852476119995, + "learning_rate": 4.997617156985014e-05, + "loss": 6.3055, + "step": 2338 + }, + { + "epoch": 0.013910695594252545, + "grad_norm": 2.9049081802368164, + "learning_rate": 4.9976151176407124e-05, + "loss": 7.1806, + "step": 2339 + }, + { + "epoch": 0.013916642877533542, + "grad_norm": 2.7533769607543945, + "learning_rate": 4.9976130774245197e-05, + "loss": 7.0047, + "step": 2340 + }, + { + "epoch": 0.013922590160814539, + "grad_norm": 2.124826431274414, + "learning_rate": 4.997611036336435e-05, + "loss": 7.1897, + "step": 2341 + }, + { + "epoch": 0.013928537444095538, + "grad_norm": 2.5205366611480713, + "learning_rate": 4.997608994376461e-05, + "loss": 6.8592, + "step": 2342 + }, + { + "epoch": 0.013934484727376534, + "grad_norm": 2.8026719093322754, + "learning_rate": 4.9976069515445975e-05, + "loss": 6.6622, + "step": 2343 + }, + { + "epoch": 0.013940432010657531, + "grad_norm": 3.045438051223755, + "learning_rate": 4.997604907840845e-05, + "loss": 6.6176, + "step": 2344 + }, + { + "epoch": 0.013946379293938528, + "grad_norm": 2.820199489593506, + "learning_rate": 4.997602863265204e-05, + "loss": 6.4489, + "step": 2345 + }, + { + "epoch": 0.013952326577219527, + "grad_norm": 2.997990369796753, + "learning_rate": 4.997600817817676e-05, + "loss": 7.0989, + "step": 2346 + }, + { + "epoch": 0.013958273860500523, + "grad_norm": 3.316575050354004, + "learning_rate": 4.9975987714982606e-05, + "loss": 6.9042, + "step": 2347 + }, + { + "epoch": 0.01396422114378152, + "grad_norm": 2.3339803218841553, + "learning_rate": 4.99759672430696e-05, + "loss": 6.8831, + "step": 2348 + }, + { + "epoch": 0.013970168427062517, + "grad_norm": 2.510274648666382, + "learning_rate": 4.997594676243775e-05, + "loss": 7.1093, + "step": 2349 + }, + { + "epoch": 0.013976115710343516, + "grad_norm": 2.893909215927124, + "learning_rate": 4.997592627308705e-05, + "loss": 6.5477, + "step": 2350 + }, + { + "epoch": 0.013982062993624512, + "grad_norm": 3.6036674976348877, + "learning_rate": 4.9975905775017505e-05, + "loss": 6.3278, + "step": 2351 + }, + { + "epoch": 0.01398801027690551, + "grad_norm": 2.1260125637054443, + "learning_rate": 4.9975885268229127e-05, + "loss": 6.7883, + "step": 2352 + }, + { + "epoch": 0.013993957560186506, + "grad_norm": 2.328247308731079, + "learning_rate": 4.997586475272193e-05, + "loss": 6.4832, + "step": 2353 + }, + { + "epoch": 0.013999904843467505, + "grad_norm": 2.8075780868530273, + "learning_rate": 4.997584422849593e-05, + "loss": 6.9333, + "step": 2354 + }, + { + "epoch": 0.014005852126748502, + "grad_norm": 1.9339990615844727, + "learning_rate": 4.9975823695551106e-05, + "loss": 6.6856, + "step": 2355 + }, + { + "epoch": 0.014011799410029498, + "grad_norm": 2.842968225479126, + "learning_rate": 4.997580315388748e-05, + "loss": 6.48, + "step": 2356 + }, + { + "epoch": 0.014017746693310495, + "grad_norm": 1.8715558052062988, + "learning_rate": 4.997578260350506e-05, + "loss": 6.8702, + "step": 2357 + }, + { + "epoch": 0.014023693976591494, + "grad_norm": 2.4310202598571777, + "learning_rate": 4.9975762044403865e-05, + "loss": 7.0112, + "step": 2358 + }, + { + "epoch": 0.01402964125987249, + "grad_norm": 2.292121648788452, + "learning_rate": 4.997574147658387e-05, + "loss": 6.6505, + "step": 2359 + }, + { + "epoch": 0.014035588543153487, + "grad_norm": 2.374007225036621, + "learning_rate": 4.997572090004511e-05, + "loss": 6.7332, + "step": 2360 + }, + { + "epoch": 0.014041535826434484, + "grad_norm": 2.198131561279297, + "learning_rate": 4.997570031478759e-05, + "loss": 6.6358, + "step": 2361 + }, + { + "epoch": 0.014047483109715483, + "grad_norm": 2.3109302520751953, + "learning_rate": 4.997567972081131e-05, + "loss": 6.6194, + "step": 2362 + }, + { + "epoch": 0.01405343039299648, + "grad_norm": 2.49338698387146, + "learning_rate": 4.997565911811627e-05, + "loss": 6.5036, + "step": 2363 + }, + { + "epoch": 0.014059377676277476, + "grad_norm": 2.6462419033050537, + "learning_rate": 4.997563850670249e-05, + "loss": 6.4294, + "step": 2364 + }, + { + "epoch": 0.014065324959558473, + "grad_norm": 3.0072524547576904, + "learning_rate": 4.997561788656997e-05, + "loss": 6.8814, + "step": 2365 + }, + { + "epoch": 0.014071272242839472, + "grad_norm": 2.435209035873413, + "learning_rate": 4.997559725771872e-05, + "loss": 6.4684, + "step": 2366 + }, + { + "epoch": 0.014077219526120469, + "grad_norm": 2.8023672103881836, + "learning_rate": 4.997557662014875e-05, + "loss": 6.7922, + "step": 2367 + }, + { + "epoch": 0.014083166809401465, + "grad_norm": 2.6129658222198486, + "learning_rate": 4.9975555973860065e-05, + "loss": 6.4539, + "step": 2368 + }, + { + "epoch": 0.014089114092682462, + "grad_norm": 2.559117317199707, + "learning_rate": 4.997553531885267e-05, + "loss": 6.4713, + "step": 2369 + }, + { + "epoch": 0.014095061375963459, + "grad_norm": 2.4535956382751465, + "learning_rate": 4.9975514655126575e-05, + "loss": 6.963, + "step": 2370 + }, + { + "epoch": 0.014101008659244458, + "grad_norm": 2.3025150299072266, + "learning_rate": 4.997549398268178e-05, + "loss": 6.9299, + "step": 2371 + }, + { + "epoch": 0.014106955942525454, + "grad_norm": 2.834411382675171, + "learning_rate": 4.997547330151831e-05, + "loss": 6.299, + "step": 2372 + }, + { + "epoch": 0.014112903225806451, + "grad_norm": 2.8046083450317383, + "learning_rate": 4.997545261163615e-05, + "loss": 5.7691, + "step": 2373 + }, + { + "epoch": 0.014118850509087448, + "grad_norm": 2.663776159286499, + "learning_rate": 4.997543191303532e-05, + "loss": 5.969, + "step": 2374 + }, + { + "epoch": 0.014124797792368447, + "grad_norm": 2.725154161453247, + "learning_rate": 4.997541120571582e-05, + "loss": 5.7473, + "step": 2375 + }, + { + "epoch": 0.014130745075649443, + "grad_norm": 2.9021074771881104, + "learning_rate": 4.9975390489677663e-05, + "loss": 6.3177, + "step": 2376 + }, + { + "epoch": 0.01413669235893044, + "grad_norm": 2.4043307304382324, + "learning_rate": 4.9975369764920866e-05, + "loss": 6.358, + "step": 2377 + }, + { + "epoch": 0.014142639642211437, + "grad_norm": 2.4163010120391846, + "learning_rate": 4.997534903144542e-05, + "loss": 6.6807, + "step": 2378 + }, + { + "epoch": 0.014148586925492436, + "grad_norm": 3.0710666179656982, + "learning_rate": 4.9975328289251335e-05, + "loss": 6.2416, + "step": 2379 + }, + { + "epoch": 0.014154534208773432, + "grad_norm": 2.159627676010132, + "learning_rate": 4.997530753833862e-05, + "loss": 7.1434, + "step": 2380 + }, + { + "epoch": 0.01416048149205443, + "grad_norm": 2.308382034301758, + "learning_rate": 4.997528677870729e-05, + "loss": 7.1243, + "step": 2381 + }, + { + "epoch": 0.014166428775335426, + "grad_norm": 2.7461323738098145, + "learning_rate": 4.997526601035734e-05, + "loss": 6.3066, + "step": 2382 + }, + { + "epoch": 0.014172376058616425, + "grad_norm": 2.8835322856903076, + "learning_rate": 4.997524523328878e-05, + "loss": 6.28, + "step": 2383 + }, + { + "epoch": 0.014178323341897421, + "grad_norm": 2.5195534229278564, + "learning_rate": 4.997522444750162e-05, + "loss": 6.9561, + "step": 2384 + }, + { + "epoch": 0.014184270625178418, + "grad_norm": 3.1697885990142822, + "learning_rate": 4.997520365299587e-05, + "loss": 6.7432, + "step": 2385 + }, + { + "epoch": 0.014190217908459415, + "grad_norm": 3.6300339698791504, + "learning_rate": 4.997518284977154e-05, + "loss": 6.3676, + "step": 2386 + }, + { + "epoch": 0.014196165191740414, + "grad_norm": 3.261981964111328, + "learning_rate": 4.9975162037828625e-05, + "loss": 6.0991, + "step": 2387 + }, + { + "epoch": 0.01420211247502141, + "grad_norm": 3.6291120052337646, + "learning_rate": 4.9975141217167146e-05, + "loss": 6.1239, + "step": 2388 + }, + { + "epoch": 0.014208059758302407, + "grad_norm": 3.192958116531372, + "learning_rate": 4.997512038778709e-05, + "loss": 6.4455, + "step": 2389 + }, + { + "epoch": 0.014214007041583404, + "grad_norm": 2.8887948989868164, + "learning_rate": 4.997509954968849e-05, + "loss": 6.9441, + "step": 2390 + }, + { + "epoch": 0.014219954324864403, + "grad_norm": 2.3568248748779297, + "learning_rate": 4.9975078702871336e-05, + "loss": 7.0207, + "step": 2391 + }, + { + "epoch": 0.0142259016081454, + "grad_norm": 2.2629294395446777, + "learning_rate": 4.997505784733564e-05, + "loss": 6.9575, + "step": 2392 + }, + { + "epoch": 0.014231848891426396, + "grad_norm": 2.5458898544311523, + "learning_rate": 4.99750369830814e-05, + "loss": 6.8533, + "step": 2393 + }, + { + "epoch": 0.014237796174707393, + "grad_norm": 2.5125060081481934, + "learning_rate": 4.997501611010865e-05, + "loss": 6.8615, + "step": 2394 + }, + { + "epoch": 0.014243743457988392, + "grad_norm": 2.9903738498687744, + "learning_rate": 4.997499522841737e-05, + "loss": 6.6927, + "step": 2395 + }, + { + "epoch": 0.014249690741269389, + "grad_norm": 2.7536470890045166, + "learning_rate": 4.997497433800758e-05, + "loss": 6.6454, + "step": 2396 + }, + { + "epoch": 0.014255638024550385, + "grad_norm": 3.5041043758392334, + "learning_rate": 4.997495343887928e-05, + "loss": 6.485, + "step": 2397 + }, + { + "epoch": 0.014261585307831382, + "grad_norm": 3.8025100231170654, + "learning_rate": 4.997493253103249e-05, + "loss": 6.3731, + "step": 2398 + }, + { + "epoch": 0.01426753259111238, + "grad_norm": 3.2657718658447266, + "learning_rate": 4.99749116144672e-05, + "loss": 6.23, + "step": 2399 + }, + { + "epoch": 0.014273479874393378, + "grad_norm": 2.721632719039917, + "learning_rate": 4.997489068918343e-05, + "loss": 6.7292, + "step": 2400 + }, + { + "epoch": 0.014279427157674374, + "grad_norm": 2.3483569622039795, + "learning_rate": 4.9974869755181186e-05, + "loss": 6.4842, + "step": 2401 + }, + { + "epoch": 0.014285374440955371, + "grad_norm": 2.4931676387786865, + "learning_rate": 4.997484881246047e-05, + "loss": 7.0529, + "step": 2402 + }, + { + "epoch": 0.014291321724236368, + "grad_norm": 2.4944825172424316, + "learning_rate": 4.99748278610213e-05, + "loss": 7.0185, + "step": 2403 + }, + { + "epoch": 0.014297269007517367, + "grad_norm": 2.9124202728271484, + "learning_rate": 4.997480690086367e-05, + "loss": 6.9847, + "step": 2404 + }, + { + "epoch": 0.014303216290798363, + "grad_norm": 2.5802674293518066, + "learning_rate": 4.997478593198759e-05, + "loss": 7.0389, + "step": 2405 + }, + { + "epoch": 0.01430916357407936, + "grad_norm": 2.636709451675415, + "learning_rate": 4.9974764954393075e-05, + "loss": 6.7281, + "step": 2406 + }, + { + "epoch": 0.014315110857360357, + "grad_norm": 3.801760196685791, + "learning_rate": 4.997474396808012e-05, + "loss": 5.9962, + "step": 2407 + }, + { + "epoch": 0.014321058140641356, + "grad_norm": 3.7983996868133545, + "learning_rate": 4.997472297304875e-05, + "loss": 6.3821, + "step": 2408 + }, + { + "epoch": 0.014327005423922352, + "grad_norm": 2.863408088684082, + "learning_rate": 4.997470196929895e-05, + "loss": 6.2206, + "step": 2409 + }, + { + "epoch": 0.01433295270720335, + "grad_norm": 2.6187095642089844, + "learning_rate": 4.997468095683076e-05, + "loss": 6.2205, + "step": 2410 + }, + { + "epoch": 0.014338899990484346, + "grad_norm": 3.202986240386963, + "learning_rate": 4.997465993564414e-05, + "loss": 6.259, + "step": 2411 + }, + { + "epoch": 0.014344847273765345, + "grad_norm": 2.9131264686584473, + "learning_rate": 4.9974638905739146e-05, + "loss": 6.4159, + "step": 2412 + }, + { + "epoch": 0.014350794557046341, + "grad_norm": 2.384477376937866, + "learning_rate": 4.9974617867115754e-05, + "loss": 6.6669, + "step": 2413 + }, + { + "epoch": 0.014356741840327338, + "grad_norm": 2.448495626449585, + "learning_rate": 4.997459681977398e-05, + "loss": 6.5679, + "step": 2414 + }, + { + "epoch": 0.014362689123608335, + "grad_norm": 2.1945343017578125, + "learning_rate": 4.997457576371384e-05, + "loss": 6.3856, + "step": 2415 + }, + { + "epoch": 0.014368636406889334, + "grad_norm": 1.867848515510559, + "learning_rate": 4.997455469893533e-05, + "loss": 6.3127, + "step": 2416 + }, + { + "epoch": 0.01437458369017033, + "grad_norm": 2.560976266860962, + "learning_rate": 4.997453362543846e-05, + "loss": 6.4619, + "step": 2417 + }, + { + "epoch": 0.014380530973451327, + "grad_norm": 3.2440431118011475, + "learning_rate": 4.997451254322323e-05, + "loss": 6.399, + "step": 2418 + }, + { + "epoch": 0.014386478256732324, + "grad_norm": 3.0021307468414307, + "learning_rate": 4.9974491452289664e-05, + "loss": 6.174, + "step": 2419 + }, + { + "epoch": 0.014392425540013323, + "grad_norm": 2.6046524047851562, + "learning_rate": 4.997447035263776e-05, + "loss": 6.8284, + "step": 2420 + }, + { + "epoch": 0.01439837282329432, + "grad_norm": 3.1395344734191895, + "learning_rate": 4.997444924426753e-05, + "loss": 6.3395, + "step": 2421 + }, + { + "epoch": 0.014404320106575316, + "grad_norm": 3.056152582168579, + "learning_rate": 4.997442812717897e-05, + "loss": 6.3468, + "step": 2422 + }, + { + "epoch": 0.014410267389856313, + "grad_norm": 2.2532267570495605, + "learning_rate": 4.9974407001372105e-05, + "loss": 6.5187, + "step": 2423 + }, + { + "epoch": 0.014416214673137312, + "grad_norm": 2.0228383541107178, + "learning_rate": 4.997438586684693e-05, + "loss": 6.4452, + "step": 2424 + }, + { + "epoch": 0.014422161956418308, + "grad_norm": 3.2889909744262695, + "learning_rate": 4.997436472360345e-05, + "loss": 6.6466, + "step": 2425 + }, + { + "epoch": 0.014428109239699305, + "grad_norm": 2.957916498184204, + "learning_rate": 4.9974343571641677e-05, + "loss": 6.9617, + "step": 2426 + }, + { + "epoch": 0.014434056522980302, + "grad_norm": 2.7629241943359375, + "learning_rate": 4.997432241096162e-05, + "loss": 6.1687, + "step": 2427 + }, + { + "epoch": 0.0144400038062613, + "grad_norm": 2.849297285079956, + "learning_rate": 4.997430124156329e-05, + "loss": 6.4647, + "step": 2428 + }, + { + "epoch": 0.014445951089542297, + "grad_norm": 2.2432122230529785, + "learning_rate": 4.997428006344669e-05, + "loss": 7.1739, + "step": 2429 + }, + { + "epoch": 0.014451898372823294, + "grad_norm": 2.814807891845703, + "learning_rate": 4.997425887661181e-05, + "loss": 5.945, + "step": 2430 + }, + { + "epoch": 0.014457845656104291, + "grad_norm": 3.140153646469116, + "learning_rate": 4.997423768105869e-05, + "loss": 6.5948, + "step": 2431 + }, + { + "epoch": 0.01446379293938529, + "grad_norm": 2.5276620388031006, + "learning_rate": 4.997421647678732e-05, + "loss": 6.9813, + "step": 2432 + }, + { + "epoch": 0.014469740222666286, + "grad_norm": 2.462204694747925, + "learning_rate": 4.9974195263797705e-05, + "loss": 6.8987, + "step": 2433 + }, + { + "epoch": 0.014475687505947283, + "grad_norm": 3.117255210876465, + "learning_rate": 4.997417404208986e-05, + "loss": 5.883, + "step": 2434 + }, + { + "epoch": 0.01448163478922828, + "grad_norm": 2.6207518577575684, + "learning_rate": 4.997415281166379e-05, + "loss": 6.8065, + "step": 2435 + }, + { + "epoch": 0.014487582072509277, + "grad_norm": 2.996624231338501, + "learning_rate": 4.99741315725195e-05, + "loss": 6.5162, + "step": 2436 + }, + { + "epoch": 0.014493529355790276, + "grad_norm": 2.1946496963500977, + "learning_rate": 4.9974110324656996e-05, + "loss": 6.9521, + "step": 2437 + }, + { + "epoch": 0.014499476639071272, + "grad_norm": 2.273017406463623, + "learning_rate": 4.997408906807629e-05, + "loss": 7.0144, + "step": 2438 + }, + { + "epoch": 0.01450542392235227, + "grad_norm": 2.516509771347046, + "learning_rate": 4.997406780277739e-05, + "loss": 7.013, + "step": 2439 + }, + { + "epoch": 0.014511371205633266, + "grad_norm": 3.0296435356140137, + "learning_rate": 4.9974046528760296e-05, + "loss": 6.934, + "step": 2440 + }, + { + "epoch": 0.014517318488914265, + "grad_norm": 2.6135010719299316, + "learning_rate": 4.9974025246025024e-05, + "loss": 6.7151, + "step": 2441 + }, + { + "epoch": 0.014523265772195261, + "grad_norm": 2.6850788593292236, + "learning_rate": 4.997400395457158e-05, + "loss": 6.5223, + "step": 2442 + }, + { + "epoch": 0.014529213055476258, + "grad_norm": 3.0401692390441895, + "learning_rate": 4.9973982654399966e-05, + "loss": 7.2006, + "step": 2443 + }, + { + "epoch": 0.014535160338757255, + "grad_norm": 3.016805410385132, + "learning_rate": 4.997396134551019e-05, + "loss": 7.0633, + "step": 2444 + }, + { + "epoch": 0.014541107622038254, + "grad_norm": 3.107154130935669, + "learning_rate": 4.9973940027902264e-05, + "loss": 6.9096, + "step": 2445 + }, + { + "epoch": 0.01454705490531925, + "grad_norm": 2.720054864883423, + "learning_rate": 4.9973918701576196e-05, + "loss": 6.7061, + "step": 2446 + }, + { + "epoch": 0.014553002188600247, + "grad_norm": 2.386401414871216, + "learning_rate": 4.9973897366531984e-05, + "loss": 6.5877, + "step": 2447 + }, + { + "epoch": 0.014558949471881244, + "grad_norm": 2.488243579864502, + "learning_rate": 4.997387602276965e-05, + "loss": 6.7792, + "step": 2448 + }, + { + "epoch": 0.014564896755162243, + "grad_norm": 2.7504360675811768, + "learning_rate": 4.9973854670289196e-05, + "loss": 6.6164, + "step": 2449 + }, + { + "epoch": 0.01457084403844324, + "grad_norm": 3.001441240310669, + "learning_rate": 4.9973833309090626e-05, + "loss": 6.5933, + "step": 2450 + }, + { + "epoch": 0.014576791321724236, + "grad_norm": 2.6449999809265137, + "learning_rate": 4.997381193917394e-05, + "loss": 6.5323, + "step": 2451 + }, + { + "epoch": 0.014582738605005233, + "grad_norm": 2.81846022605896, + "learning_rate": 4.9973790560539156e-05, + "loss": 6.5146, + "step": 2452 + }, + { + "epoch": 0.014588685888286232, + "grad_norm": 2.662916421890259, + "learning_rate": 4.997376917318629e-05, + "loss": 6.161, + "step": 2453 + }, + { + "epoch": 0.014594633171567228, + "grad_norm": 2.689601421356201, + "learning_rate": 4.997374777711533e-05, + "loss": 6.2008, + "step": 2454 + }, + { + "epoch": 0.014600580454848225, + "grad_norm": 2.6690561771392822, + "learning_rate": 4.99737263723263e-05, + "loss": 6.4418, + "step": 2455 + }, + { + "epoch": 0.014606527738129222, + "grad_norm": 2.897270917892456, + "learning_rate": 4.997370495881919e-05, + "loss": 6.3968, + "step": 2456 + }, + { + "epoch": 0.01461247502141022, + "grad_norm": 2.9327831268310547, + "learning_rate": 4.997368353659402e-05, + "loss": 6.4665, + "step": 2457 + }, + { + "epoch": 0.014618422304691217, + "grad_norm": 2.658013343811035, + "learning_rate": 4.99736621056508e-05, + "loss": 6.399, + "step": 2458 + }, + { + "epoch": 0.014624369587972214, + "grad_norm": 2.6055238246917725, + "learning_rate": 4.997364066598953e-05, + "loss": 6.4679, + "step": 2459 + }, + { + "epoch": 0.014630316871253211, + "grad_norm": 3.0595951080322266, + "learning_rate": 4.997361921761022e-05, + "loss": 5.8797, + "step": 2460 + }, + { + "epoch": 0.01463626415453421, + "grad_norm": 2.994694471359253, + "learning_rate": 4.997359776051288e-05, + "loss": 5.704, + "step": 2461 + }, + { + "epoch": 0.014642211437815206, + "grad_norm": 2.78153657913208, + "learning_rate": 4.9973576294697514e-05, + "loss": 5.7289, + "step": 2462 + }, + { + "epoch": 0.014648158721096203, + "grad_norm": 2.5119385719299316, + "learning_rate": 4.997355482016414e-05, + "loss": 5.5494, + "step": 2463 + }, + { + "epoch": 0.0146541060043772, + "grad_norm": 2.7880990505218506, + "learning_rate": 4.997353333691274e-05, + "loss": 5.5905, + "step": 2464 + }, + { + "epoch": 0.014660053287658197, + "grad_norm": 2.827352523803711, + "learning_rate": 4.9973511844943346e-05, + "loss": 6.4429, + "step": 2465 + }, + { + "epoch": 0.014666000570939195, + "grad_norm": 2.4297358989715576, + "learning_rate": 4.997349034425595e-05, + "loss": 6.8647, + "step": 2466 + }, + { + "epoch": 0.014671947854220192, + "grad_norm": 2.649064064025879, + "learning_rate": 4.997346883485057e-05, + "loss": 6.5568, + "step": 2467 + }, + { + "epoch": 0.014677895137501189, + "grad_norm": 3.2215452194213867, + "learning_rate": 4.9973447316727215e-05, + "loss": 5.5684, + "step": 2468 + }, + { + "epoch": 0.014683842420782186, + "grad_norm": 2.8760056495666504, + "learning_rate": 4.9973425789885884e-05, + "loss": 5.6395, + "step": 2469 + }, + { + "epoch": 0.014689789704063184, + "grad_norm": 2.4002890586853027, + "learning_rate": 4.9973404254326585e-05, + "loss": 5.9525, + "step": 2470 + }, + { + "epoch": 0.014695736987344181, + "grad_norm": 2.32314395904541, + "learning_rate": 4.997338271004933e-05, + "loss": 6.9675, + "step": 2471 + }, + { + "epoch": 0.014701684270625178, + "grad_norm": 2.262680768966675, + "learning_rate": 4.997336115705413e-05, + "loss": 7.1361, + "step": 2472 + }, + { + "epoch": 0.014707631553906175, + "grad_norm": 2.2855215072631836, + "learning_rate": 4.997333959534098e-05, + "loss": 7.1141, + "step": 2473 + }, + { + "epoch": 0.014713578837187173, + "grad_norm": 2.5461738109588623, + "learning_rate": 4.99733180249099e-05, + "loss": 7.0492, + "step": 2474 + }, + { + "epoch": 0.01471952612046817, + "grad_norm": 2.455561399459839, + "learning_rate": 4.99732964457609e-05, + "loss": 6.9303, + "step": 2475 + }, + { + "epoch": 0.014725473403749167, + "grad_norm": 3.3767740726470947, + "learning_rate": 4.997327485789397e-05, + "loss": 6.8531, + "step": 2476 + }, + { + "epoch": 0.014731420687030164, + "grad_norm": 2.9320104122161865, + "learning_rate": 4.9973253261309125e-05, + "loss": 6.9258, + "step": 2477 + }, + { + "epoch": 0.014737367970311162, + "grad_norm": 2.380960464477539, + "learning_rate": 4.997323165600638e-05, + "loss": 6.8581, + "step": 2478 + }, + { + "epoch": 0.01474331525359216, + "grad_norm": 2.727154016494751, + "learning_rate": 4.997321004198574e-05, + "loss": 7.3814, + "step": 2479 + }, + { + "epoch": 0.014749262536873156, + "grad_norm": 2.8693020343780518, + "learning_rate": 4.997318841924721e-05, + "loss": 6.3793, + "step": 2480 + }, + { + "epoch": 0.014755209820154153, + "grad_norm": 2.941622734069824, + "learning_rate": 4.997316678779079e-05, + "loss": 7.3567, + "step": 2481 + }, + { + "epoch": 0.014761157103435152, + "grad_norm": 3.0310213565826416, + "learning_rate": 4.9973145147616505e-05, + "loss": 6.8832, + "step": 2482 + }, + { + "epoch": 0.014767104386716148, + "grad_norm": 1.9184696674346924, + "learning_rate": 4.9973123498724353e-05, + "loss": 6.7369, + "step": 2483 + }, + { + "epoch": 0.014773051669997145, + "grad_norm": 2.3090195655822754, + "learning_rate": 4.9973101841114335e-05, + "loss": 6.8927, + "step": 2484 + }, + { + "epoch": 0.014778998953278142, + "grad_norm": 2.2947685718536377, + "learning_rate": 4.997308017478647e-05, + "loss": 6.9441, + "step": 2485 + }, + { + "epoch": 0.01478494623655914, + "grad_norm": 2.363690137863159, + "learning_rate": 4.997305849974076e-05, + "loss": 6.9397, + "step": 2486 + }, + { + "epoch": 0.014790893519840137, + "grad_norm": 1.7546948194503784, + "learning_rate": 4.997303681597721e-05, + "loss": 6.7888, + "step": 2487 + }, + { + "epoch": 0.014796840803121134, + "grad_norm": 1.8824211359024048, + "learning_rate": 4.997301512349584e-05, + "loss": 6.6486, + "step": 2488 + }, + { + "epoch": 0.014802788086402131, + "grad_norm": 3.68865704536438, + "learning_rate": 4.9972993422296636e-05, + "loss": 7.0318, + "step": 2489 + }, + { + "epoch": 0.01480873536968313, + "grad_norm": 3.0788486003875732, + "learning_rate": 4.997297171237962e-05, + "loss": 6.814, + "step": 2490 + }, + { + "epoch": 0.014814682652964126, + "grad_norm": 2.6903607845306396, + "learning_rate": 4.997294999374481e-05, + "loss": 6.9752, + "step": 2491 + }, + { + "epoch": 0.014820629936245123, + "grad_norm": 2.6673712730407715, + "learning_rate": 4.9972928266392194e-05, + "loss": 6.9083, + "step": 2492 + }, + { + "epoch": 0.01482657721952612, + "grad_norm": 2.335632801055908, + "learning_rate": 4.9972906530321786e-05, + "loss": 7.027, + "step": 2493 + }, + { + "epoch": 0.014832524502807119, + "grad_norm": 3.2885966300964355, + "learning_rate": 4.997288478553359e-05, + "loss": 6.6551, + "step": 2494 + }, + { + "epoch": 0.014838471786088115, + "grad_norm": 2.7297918796539307, + "learning_rate": 4.997286303202762e-05, + "loss": 6.7345, + "step": 2495 + }, + { + "epoch": 0.014844419069369112, + "grad_norm": 2.640814781188965, + "learning_rate": 4.997284126980388e-05, + "loss": 6.743, + "step": 2496 + }, + { + "epoch": 0.014850366352650109, + "grad_norm": 2.699632167816162, + "learning_rate": 4.997281949886239e-05, + "loss": 6.4633, + "step": 2497 + }, + { + "epoch": 0.014856313635931106, + "grad_norm": 2.5185790061950684, + "learning_rate": 4.9972797719203135e-05, + "loss": 6.5496, + "step": 2498 + }, + { + "epoch": 0.014862260919212104, + "grad_norm": 2.659393548965454, + "learning_rate": 4.9972775930826144e-05, + "loss": 6.5066, + "step": 2499 + }, + { + "epoch": 0.014868208202493101, + "grad_norm": 2.160808563232422, + "learning_rate": 4.99727541337314e-05, + "loss": 6.9851, + "step": 2500 + }, + { + "epoch": 0.014874155485774098, + "grad_norm": 2.656506299972534, + "learning_rate": 4.997273232791894e-05, + "loss": 7.5696, + "step": 2501 + }, + { + "epoch": 0.014880102769055095, + "grad_norm": 2.490612506866455, + "learning_rate": 4.9972710513388754e-05, + "loss": 7.2623, + "step": 2502 + }, + { + "epoch": 0.014886050052336093, + "grad_norm": 2.1744866371154785, + "learning_rate": 4.997268869014085e-05, + "loss": 6.5208, + "step": 2503 + }, + { + "epoch": 0.01489199733561709, + "grad_norm": 2.8058252334594727, + "learning_rate": 4.9972666858175236e-05, + "loss": 6.1527, + "step": 2504 + }, + { + "epoch": 0.014897944618898087, + "grad_norm": 2.418827533721924, + "learning_rate": 4.997264501749193e-05, + "loss": 6.2244, + "step": 2505 + }, + { + "epoch": 0.014903891902179084, + "grad_norm": 2.499648332595825, + "learning_rate": 4.997262316809092e-05, + "loss": 6.8904, + "step": 2506 + }, + { + "epoch": 0.014909839185460082, + "grad_norm": 2.3598594665527344, + "learning_rate": 4.9972601309972235e-05, + "loss": 7.0794, + "step": 2507 + }, + { + "epoch": 0.01491578646874108, + "grad_norm": 2.2443082332611084, + "learning_rate": 4.997257944313587e-05, + "loss": 7.3078, + "step": 2508 + }, + { + "epoch": 0.014921733752022076, + "grad_norm": 2.407501459121704, + "learning_rate": 4.9972557567581835e-05, + "loss": 7.0677, + "step": 2509 + }, + { + "epoch": 0.014927681035303073, + "grad_norm": 2.060865640640259, + "learning_rate": 4.997253568331014e-05, + "loss": 6.7128, + "step": 2510 + }, + { + "epoch": 0.014933628318584071, + "grad_norm": 2.3876516819000244, + "learning_rate": 4.997251379032078e-05, + "loss": 6.7562, + "step": 2511 + }, + { + "epoch": 0.014939575601865068, + "grad_norm": 2.387176990509033, + "learning_rate": 4.997249188861379e-05, + "loss": 6.8237, + "step": 2512 + }, + { + "epoch": 0.014945522885146065, + "grad_norm": 2.7324886322021484, + "learning_rate": 4.997246997818915e-05, + "loss": 6.8963, + "step": 2513 + }, + { + "epoch": 0.014951470168427062, + "grad_norm": 2.3832128047943115, + "learning_rate": 4.997244805904689e-05, + "loss": 6.9467, + "step": 2514 + }, + { + "epoch": 0.01495741745170806, + "grad_norm": 1.8594162464141846, + "learning_rate": 4.9972426131187e-05, + "loss": 7.0712, + "step": 2515 + }, + { + "epoch": 0.014963364734989057, + "grad_norm": 2.322068691253662, + "learning_rate": 4.997240419460949e-05, + "loss": 6.8898, + "step": 2516 + }, + { + "epoch": 0.014969312018270054, + "grad_norm": 2.4850032329559326, + "learning_rate": 4.997238224931438e-05, + "loss": 6.5439, + "step": 2517 + }, + { + "epoch": 0.014975259301551051, + "grad_norm": 2.919579029083252, + "learning_rate": 4.997236029530166e-05, + "loss": 6.3987, + "step": 2518 + }, + { + "epoch": 0.01498120658483205, + "grad_norm": 2.651900053024292, + "learning_rate": 4.997233833257135e-05, + "loss": 6.2735, + "step": 2519 + }, + { + "epoch": 0.014987153868113046, + "grad_norm": 2.7912142276763916, + "learning_rate": 4.997231636112346e-05, + "loss": 6.9835, + "step": 2520 + }, + { + "epoch": 0.014993101151394043, + "grad_norm": 2.5735433101654053, + "learning_rate": 4.997229438095799e-05, + "loss": 7.1218, + "step": 2521 + }, + { + "epoch": 0.01499904843467504, + "grad_norm": 2.483186721801758, + "learning_rate": 4.997227239207494e-05, + "loss": 7.0343, + "step": 2522 + }, + { + "epoch": 0.015004995717956039, + "grad_norm": 2.9296681880950928, + "learning_rate": 4.997225039447434e-05, + "loss": 6.5455, + "step": 2523 + }, + { + "epoch": 0.015010943001237035, + "grad_norm": 2.5536422729492188, + "learning_rate": 4.997222838815618e-05, + "loss": 6.7173, + "step": 2524 + }, + { + "epoch": 0.015016890284518032, + "grad_norm": 6.365324020385742, + "learning_rate": 4.997220637312047e-05, + "loss": 6.0909, + "step": 2525 + }, + { + "epoch": 0.015022837567799029, + "grad_norm": 3.7258150577545166, + "learning_rate": 4.997218434936723e-05, + "loss": 5.9019, + "step": 2526 + }, + { + "epoch": 0.015028784851080026, + "grad_norm": 2.9021997451782227, + "learning_rate": 4.997216231689645e-05, + "loss": 5.8601, + "step": 2527 + }, + { + "epoch": 0.015034732134361024, + "grad_norm": 2.570988416671753, + "learning_rate": 4.997214027570815e-05, + "loss": 6.1513, + "step": 2528 + }, + { + "epoch": 0.015040679417642021, + "grad_norm": 3.013540029525757, + "learning_rate": 4.997211822580233e-05, + "loss": 6.6471, + "step": 2529 + }, + { + "epoch": 0.015046626700923018, + "grad_norm": 2.612210750579834, + "learning_rate": 4.997209616717901e-05, + "loss": 6.5523, + "step": 2530 + }, + { + "epoch": 0.015052573984204015, + "grad_norm": 2.93513822555542, + "learning_rate": 4.9972074099838186e-05, + "loss": 6.1845, + "step": 2531 + }, + { + "epoch": 0.015058521267485013, + "grad_norm": 3.569002389907837, + "learning_rate": 4.9972052023779865e-05, + "loss": 6.7383, + "step": 2532 + }, + { + "epoch": 0.01506446855076601, + "grad_norm": 2.560023784637451, + "learning_rate": 4.9972029939004064e-05, + "loss": 6.4978, + "step": 2533 + }, + { + "epoch": 0.015070415834047007, + "grad_norm": 2.304612398147583, + "learning_rate": 4.997200784551078e-05, + "loss": 6.3316, + "step": 2534 + }, + { + "epoch": 0.015076363117328004, + "grad_norm": 2.4442996978759766, + "learning_rate": 4.997198574330003e-05, + "loss": 6.4245, + "step": 2535 + }, + { + "epoch": 0.015082310400609002, + "grad_norm": 2.764831304550171, + "learning_rate": 4.997196363237181e-05, + "loss": 6.2251, + "step": 2536 + }, + { + "epoch": 0.01508825768389, + "grad_norm": 2.6534347534179688, + "learning_rate": 4.997194151272615e-05, + "loss": 6.6674, + "step": 2537 + }, + { + "epoch": 0.015094204967170996, + "grad_norm": 2.5901331901550293, + "learning_rate": 4.997191938436303e-05, + "loss": 6.5724, + "step": 2538 + }, + { + "epoch": 0.015100152250451993, + "grad_norm": 2.6827733516693115, + "learning_rate": 4.9971897247282474e-05, + "loss": 6.4774, + "step": 2539 + }, + { + "epoch": 0.015106099533732991, + "grad_norm": 2.087397813796997, + "learning_rate": 4.997187510148449e-05, + "loss": 6.5011, + "step": 2540 + }, + { + "epoch": 0.015112046817013988, + "grad_norm": 2.157935619354248, + "learning_rate": 4.9971852946969076e-05, + "loss": 6.3258, + "step": 2541 + }, + { + "epoch": 0.015117994100294985, + "grad_norm": 2.680481195449829, + "learning_rate": 4.997183078373625e-05, + "loss": 6.5631, + "step": 2542 + }, + { + "epoch": 0.015123941383575982, + "grad_norm": 2.897608995437622, + "learning_rate": 4.997180861178602e-05, + "loss": 6.7913, + "step": 2543 + }, + { + "epoch": 0.01512988866685698, + "grad_norm": 2.5714452266693115, + "learning_rate": 4.997178643111838e-05, + "loss": 6.767, + "step": 2544 + }, + { + "epoch": 0.015135835950137977, + "grad_norm": 2.096376419067383, + "learning_rate": 4.997176424173336e-05, + "loss": 6.7365, + "step": 2545 + }, + { + "epoch": 0.015141783233418974, + "grad_norm": 2.083101987838745, + "learning_rate": 4.9971742043630955e-05, + "loss": 6.4693, + "step": 2546 + }, + { + "epoch": 0.015147730516699971, + "grad_norm": 3.509512186050415, + "learning_rate": 4.997171983681116e-05, + "loss": 6.4068, + "step": 2547 + }, + { + "epoch": 0.01515367779998097, + "grad_norm": 3.055772304534912, + "learning_rate": 4.997169762127401e-05, + "loss": 6.3411, + "step": 2548 + }, + { + "epoch": 0.015159625083261966, + "grad_norm": 2.627429485321045, + "learning_rate": 4.997167539701949e-05, + "loss": 6.3788, + "step": 2549 + }, + { + "epoch": 0.015165572366542963, + "grad_norm": 2.408599853515625, + "learning_rate": 4.997165316404761e-05, + "loss": 6.2822, + "step": 2550 + }, + { + "epoch": 0.01517151964982396, + "grad_norm": 2.906006336212158, + "learning_rate": 4.997163092235839e-05, + "loss": 6.2615, + "step": 2551 + }, + { + "epoch": 0.015177466933104958, + "grad_norm": 2.4585347175598145, + "learning_rate": 4.997160867195183e-05, + "loss": 6.4076, + "step": 2552 + }, + { + "epoch": 0.015183414216385955, + "grad_norm": 2.495539665222168, + "learning_rate": 4.9971586412827944e-05, + "loss": 6.4893, + "step": 2553 + }, + { + "epoch": 0.015189361499666952, + "grad_norm": 2.719583034515381, + "learning_rate": 4.9971564144986734e-05, + "loss": 6.276, + "step": 2554 + }, + { + "epoch": 0.015195308782947949, + "grad_norm": 2.464207887649536, + "learning_rate": 4.9971541868428206e-05, + "loss": 6.2713, + "step": 2555 + }, + { + "epoch": 0.015201256066228947, + "grad_norm": 2.3604822158813477, + "learning_rate": 4.997151958315237e-05, + "loss": 6.2648, + "step": 2556 + }, + { + "epoch": 0.015207203349509944, + "grad_norm": 2.729820966720581, + "learning_rate": 4.997149728915924e-05, + "loss": 6.2985, + "step": 2557 + }, + { + "epoch": 0.015213150632790941, + "grad_norm": 2.565760612487793, + "learning_rate": 4.997147498644882e-05, + "loss": 6.401, + "step": 2558 + }, + { + "epoch": 0.015219097916071938, + "grad_norm": 3.091628074645996, + "learning_rate": 4.9971452675021104e-05, + "loss": 6.1774, + "step": 2559 + }, + { + "epoch": 0.015225045199352935, + "grad_norm": 2.452453851699829, + "learning_rate": 4.9971430354876125e-05, + "loss": 6.4669, + "step": 2560 + }, + { + "epoch": 0.015230992482633933, + "grad_norm": 2.4285218715667725, + "learning_rate": 4.997140802601387e-05, + "loss": 6.4086, + "step": 2561 + }, + { + "epoch": 0.01523693976591493, + "grad_norm": 2.094043254852295, + "learning_rate": 4.9971385688434356e-05, + "loss": 6.2502, + "step": 2562 + }, + { + "epoch": 0.015242887049195927, + "grad_norm": 2.5989573001861572, + "learning_rate": 4.9971363342137586e-05, + "loss": 6.2948, + "step": 2563 + }, + { + "epoch": 0.015248834332476924, + "grad_norm": 2.5372314453125, + "learning_rate": 4.9971340987123574e-05, + "loss": 6.5643, + "step": 2564 + }, + { + "epoch": 0.015254781615757922, + "grad_norm": 2.3666064739227295, + "learning_rate": 4.9971318623392325e-05, + "loss": 6.4807, + "step": 2565 + }, + { + "epoch": 0.01526072889903892, + "grad_norm": 2.3216497898101807, + "learning_rate": 4.997129625094385e-05, + "loss": 6.448, + "step": 2566 + }, + { + "epoch": 0.015266676182319916, + "grad_norm": 2.202665090560913, + "learning_rate": 4.9971273869778153e-05, + "loss": 6.3766, + "step": 2567 + }, + { + "epoch": 0.015272623465600913, + "grad_norm": 2.5678982734680176, + "learning_rate": 4.997125147989524e-05, + "loss": 6.0799, + "step": 2568 + }, + { + "epoch": 0.015278570748881911, + "grad_norm": 2.7904717922210693, + "learning_rate": 4.997122908129512e-05, + "loss": 6.3446, + "step": 2569 + }, + { + "epoch": 0.015284518032162908, + "grad_norm": 2.383120059967041, + "learning_rate": 4.99712066739778e-05, + "loss": 6.2398, + "step": 2570 + }, + { + "epoch": 0.015290465315443905, + "grad_norm": 2.4302077293395996, + "learning_rate": 4.9971184257943294e-05, + "loss": 6.2678, + "step": 2571 + }, + { + "epoch": 0.015296412598724902, + "grad_norm": 2.2923178672790527, + "learning_rate": 4.99711618331916e-05, + "loss": 6.4742, + "step": 2572 + }, + { + "epoch": 0.0153023598820059, + "grad_norm": 2.582810878753662, + "learning_rate": 4.9971139399722735e-05, + "loss": 6.4679, + "step": 2573 + }, + { + "epoch": 0.015308307165286897, + "grad_norm": 2.718228578567505, + "learning_rate": 4.997111695753671e-05, + "loss": 6.2475, + "step": 2574 + }, + { + "epoch": 0.015314254448567894, + "grad_norm": 2.4639811515808105, + "learning_rate": 4.997109450663352e-05, + "loss": 6.463, + "step": 2575 + }, + { + "epoch": 0.01532020173184889, + "grad_norm": 2.6998252868652344, + "learning_rate": 4.997107204701318e-05, + "loss": 6.2885, + "step": 2576 + }, + { + "epoch": 0.01532614901512989, + "grad_norm": 2.831291437149048, + "learning_rate": 4.997104957867569e-05, + "loss": 6.2056, + "step": 2577 + }, + { + "epoch": 0.015332096298410886, + "grad_norm": 2.9070980548858643, + "learning_rate": 4.997102710162107e-05, + "loss": 6.3247, + "step": 2578 + }, + { + "epoch": 0.015338043581691883, + "grad_norm": 2.2583134174346924, + "learning_rate": 4.997100461584933e-05, + "loss": 6.3241, + "step": 2579 + }, + { + "epoch": 0.01534399086497288, + "grad_norm": 2.1661887168884277, + "learning_rate": 4.997098212136045e-05, + "loss": 6.173, + "step": 2580 + }, + { + "epoch": 0.015349938148253878, + "grad_norm": 2.146256446838379, + "learning_rate": 4.997095961815448e-05, + "loss": 6.2267, + "step": 2581 + }, + { + "epoch": 0.015355885431534875, + "grad_norm": 2.5691211223602295, + "learning_rate": 4.997093710623139e-05, + "loss": 6.3302, + "step": 2582 + }, + { + "epoch": 0.015361832714815872, + "grad_norm": 2.5439505577087402, + "learning_rate": 4.997091458559121e-05, + "loss": 6.2111, + "step": 2583 + }, + { + "epoch": 0.015367779998096869, + "grad_norm": 2.451582670211792, + "learning_rate": 4.997089205623394e-05, + "loss": 6.2369, + "step": 2584 + }, + { + "epoch": 0.015373727281377867, + "grad_norm": 2.6275687217712402, + "learning_rate": 4.99708695181596e-05, + "loss": 6.1104, + "step": 2585 + }, + { + "epoch": 0.015379674564658864, + "grad_norm": 2.7068562507629395, + "learning_rate": 4.997084697136818e-05, + "loss": 6.1646, + "step": 2586 + }, + { + "epoch": 0.015385621847939861, + "grad_norm": 2.7819957733154297, + "learning_rate": 4.9970824415859694e-05, + "loss": 6.4203, + "step": 2587 + }, + { + "epoch": 0.015391569131220858, + "grad_norm": 2.7021708488464355, + "learning_rate": 4.9970801851634154e-05, + "loss": 6.1535, + "step": 2588 + }, + { + "epoch": 0.015397516414501855, + "grad_norm": 2.50740909576416, + "learning_rate": 4.997077927869156e-05, + "loss": 6.0139, + "step": 2589 + }, + { + "epoch": 0.015403463697782853, + "grad_norm": 2.5769078731536865, + "learning_rate": 4.997075669703193e-05, + "loss": 6.129, + "step": 2590 + }, + { + "epoch": 0.01540941098106385, + "grad_norm": 2.7379090785980225, + "learning_rate": 4.997073410665526e-05, + "loss": 6.4168, + "step": 2591 + }, + { + "epoch": 0.015415358264344847, + "grad_norm": 2.3530659675598145, + "learning_rate": 4.9970711507561565e-05, + "loss": 6.3114, + "step": 2592 + }, + { + "epoch": 0.015421305547625844, + "grad_norm": 2.6025893688201904, + "learning_rate": 4.997068889975086e-05, + "loss": 6.2506, + "step": 2593 + }, + { + "epoch": 0.015427252830906842, + "grad_norm": 2.311833143234253, + "learning_rate": 4.9970666283223145e-05, + "loss": 6.3372, + "step": 2594 + }, + { + "epoch": 0.015433200114187839, + "grad_norm": 2.339947462081909, + "learning_rate": 4.997064365797842e-05, + "loss": 6.2987, + "step": 2595 + }, + { + "epoch": 0.015439147397468836, + "grad_norm": 2.2132725715637207, + "learning_rate": 4.9970621024016714e-05, + "loss": 6.2473, + "step": 2596 + }, + { + "epoch": 0.015445094680749833, + "grad_norm": 2.7063987255096436, + "learning_rate": 4.9970598381338014e-05, + "loss": 6.1702, + "step": 2597 + }, + { + "epoch": 0.015451041964030831, + "grad_norm": 2.4952430725097656, + "learning_rate": 4.9970575729942335e-05, + "loss": 6.3301, + "step": 2598 + }, + { + "epoch": 0.015456989247311828, + "grad_norm": 2.7442502975463867, + "learning_rate": 4.997055306982969e-05, + "loss": 6.1922, + "step": 2599 + }, + { + "epoch": 0.015462936530592825, + "grad_norm": 2.860058069229126, + "learning_rate": 4.997053040100008e-05, + "loss": 6.0674, + "step": 2600 + }, + { + "epoch": 0.015468883813873822, + "grad_norm": 2.821620464324951, + "learning_rate": 4.997050772345352e-05, + "loss": 6.0445, + "step": 2601 + }, + { + "epoch": 0.01547483109715482, + "grad_norm": 2.369174003601074, + "learning_rate": 4.997048503719001e-05, + "loss": 5.8641, + "step": 2602 + }, + { + "epoch": 0.015480778380435817, + "grad_norm": 2.2836029529571533, + "learning_rate": 4.997046234220956e-05, + "loss": 5.7629, + "step": 2603 + }, + { + "epoch": 0.015486725663716814, + "grad_norm": 3.13094162940979, + "learning_rate": 4.997043963851218e-05, + "loss": 6.7871, + "step": 2604 + }, + { + "epoch": 0.01549267294699781, + "grad_norm": 2.884119749069214, + "learning_rate": 4.9970416926097885e-05, + "loss": 6.1079, + "step": 2605 + }, + { + "epoch": 0.01549862023027881, + "grad_norm": 3.0921716690063477, + "learning_rate": 4.997039420496666e-05, + "loss": 5.9221, + "step": 2606 + }, + { + "epoch": 0.015504567513559806, + "grad_norm": 2.6903741359710693, + "learning_rate": 4.997037147511855e-05, + "loss": 5.7377, + "step": 2607 + }, + { + "epoch": 0.015510514796840803, + "grad_norm": 2.177030086517334, + "learning_rate": 4.997034873655352e-05, + "loss": 5.7272, + "step": 2608 + }, + { + "epoch": 0.0155164620801218, + "grad_norm": 2.41406512260437, + "learning_rate": 4.997032598927162e-05, + "loss": 5.6456, + "step": 2609 + }, + { + "epoch": 0.015522409363402798, + "grad_norm": 2.6853182315826416, + "learning_rate": 4.997030323327282e-05, + "loss": 6.1634, + "step": 2610 + }, + { + "epoch": 0.015528356646683795, + "grad_norm": 2.734081983566284, + "learning_rate": 4.997028046855715e-05, + "loss": 6.1366, + "step": 2611 + }, + { + "epoch": 0.015534303929964792, + "grad_norm": 2.234046459197998, + "learning_rate": 4.997025769512461e-05, + "loss": 5.6773, + "step": 2612 + }, + { + "epoch": 0.015540251213245789, + "grad_norm": 2.467381715774536, + "learning_rate": 4.9970234912975226e-05, + "loss": 5.6409, + "step": 2613 + }, + { + "epoch": 0.015546198496526787, + "grad_norm": 2.4890551567077637, + "learning_rate": 4.997021212210897e-05, + "loss": 5.5961, + "step": 2614 + }, + { + "epoch": 0.015552145779807784, + "grad_norm": 2.254138708114624, + "learning_rate": 4.997018932252588e-05, + "loss": 5.6039, + "step": 2615 + }, + { + "epoch": 0.015558093063088781, + "grad_norm": 2.5773816108703613, + "learning_rate": 4.9970166514225955e-05, + "loss": 5.9935, + "step": 2616 + }, + { + "epoch": 0.015564040346369778, + "grad_norm": 2.308300733566284, + "learning_rate": 4.997014369720921e-05, + "loss": 5.8307, + "step": 2617 + }, + { + "epoch": 0.015569987629650776, + "grad_norm": 2.3276724815368652, + "learning_rate": 4.9970120871475634e-05, + "loss": 5.5819, + "step": 2618 + }, + { + "epoch": 0.015575934912931773, + "grad_norm": 2.7989203929901123, + "learning_rate": 4.997009803702526e-05, + "loss": 6.0816, + "step": 2619 + }, + { + "epoch": 0.01558188219621277, + "grad_norm": 2.5614469051361084, + "learning_rate": 4.997007519385807e-05, + "loss": 5.6677, + "step": 2620 + }, + { + "epoch": 0.015587829479493767, + "grad_norm": 2.4494402408599854, + "learning_rate": 4.9970052341974096e-05, + "loss": 5.7754, + "step": 2621 + }, + { + "epoch": 0.015593776762774764, + "grad_norm": 2.214578151702881, + "learning_rate": 4.997002948137333e-05, + "loss": 6.4244, + "step": 2622 + }, + { + "epoch": 0.015599724046055762, + "grad_norm": 2.8115196228027344, + "learning_rate": 4.9970006612055776e-05, + "loss": 5.9822, + "step": 2623 + }, + { + "epoch": 0.015605671329336759, + "grad_norm": 2.4020626544952393, + "learning_rate": 4.996998373402146e-05, + "loss": 6.0481, + "step": 2624 + }, + { + "epoch": 0.015611618612617756, + "grad_norm": 2.3936421871185303, + "learning_rate": 4.996996084727038e-05, + "loss": 6.0663, + "step": 2625 + }, + { + "epoch": 0.015617565895898753, + "grad_norm": 2.2710554599761963, + "learning_rate": 4.996993795180254e-05, + "loss": 6.0668, + "step": 2626 + }, + { + "epoch": 0.015623513179179751, + "grad_norm": 2.141789436340332, + "learning_rate": 4.9969915047617955e-05, + "loss": 6.2159, + "step": 2627 + }, + { + "epoch": 0.015629460462460748, + "grad_norm": 2.557889461517334, + "learning_rate": 4.9969892134716635e-05, + "loss": 6.262, + "step": 2628 + }, + { + "epoch": 0.015635407745741747, + "grad_norm": 2.3966641426086426, + "learning_rate": 4.9969869213098574e-05, + "loss": 6.0412, + "step": 2629 + }, + { + "epoch": 0.01564135502902274, + "grad_norm": 2.301426410675049, + "learning_rate": 4.99698462827638e-05, + "loss": 6.0798, + "step": 2630 + }, + { + "epoch": 0.01564730231230374, + "grad_norm": 2.4315614700317383, + "learning_rate": 4.996982334371231e-05, + "loss": 5.8736, + "step": 2631 + }, + { + "epoch": 0.015653249595584735, + "grad_norm": 2.5549440383911133, + "learning_rate": 4.9969800395944105e-05, + "loss": 5.7858, + "step": 2632 + }, + { + "epoch": 0.015659196878865734, + "grad_norm": 2.480375289916992, + "learning_rate": 4.99697774394592e-05, + "loss": 6.3261, + "step": 2633 + }, + { + "epoch": 0.015665144162146732, + "grad_norm": 2.42866849899292, + "learning_rate": 4.9969754474257614e-05, + "loss": 6.1729, + "step": 2634 + }, + { + "epoch": 0.015671091445427728, + "grad_norm": 2.32722544670105, + "learning_rate": 4.9969731500339335e-05, + "loss": 5.7746, + "step": 2635 + }, + { + "epoch": 0.015677038728708726, + "grad_norm": 2.6797266006469727, + "learning_rate": 4.996970851770438e-05, + "loss": 6.1657, + "step": 2636 + }, + { + "epoch": 0.015682986011989725, + "grad_norm": 2.87758731842041, + "learning_rate": 4.9969685526352775e-05, + "loss": 6.1475, + "step": 2637 + }, + { + "epoch": 0.01568893329527072, + "grad_norm": 2.898663282394409, + "learning_rate": 4.996966252628449e-05, + "loss": 6.2942, + "step": 2638 + }, + { + "epoch": 0.01569488057855172, + "grad_norm": 3.3087987899780273, + "learning_rate": 4.996963951749957e-05, + "loss": 5.9962, + "step": 2639 + }, + { + "epoch": 0.015700827861832713, + "grad_norm": 2.4418020248413086, + "learning_rate": 4.996961649999799e-05, + "loss": 6.1065, + "step": 2640 + }, + { + "epoch": 0.015706775145113712, + "grad_norm": 2.5839014053344727, + "learning_rate": 4.9969593473779786e-05, + "loss": 6.2303, + "step": 2641 + }, + { + "epoch": 0.01571272242839471, + "grad_norm": 2.683163642883301, + "learning_rate": 4.996957043884495e-05, + "loss": 5.7194, + "step": 2642 + }, + { + "epoch": 0.015718669711675706, + "grad_norm": 2.628574848175049, + "learning_rate": 4.99695473951935e-05, + "loss": 5.6239, + "step": 2643 + }, + { + "epoch": 0.015724616994956704, + "grad_norm": 3.0716800689697266, + "learning_rate": 4.9969524342825434e-05, + "loss": 6.1957, + "step": 2644 + }, + { + "epoch": 0.015730564278237703, + "grad_norm": 2.415626287460327, + "learning_rate": 4.996950128174077e-05, + "loss": 6.2953, + "step": 2645 + }, + { + "epoch": 0.015736511561518698, + "grad_norm": 2.6836612224578857, + "learning_rate": 4.996947821193951e-05, + "loss": 6.103, + "step": 2646 + }, + { + "epoch": 0.015742458844799696, + "grad_norm": 2.2673206329345703, + "learning_rate": 4.996945513342166e-05, + "loss": 6.2628, + "step": 2647 + }, + { + "epoch": 0.01574840612808069, + "grad_norm": 2.629955530166626, + "learning_rate": 4.996943204618724e-05, + "loss": 6.2444, + "step": 2648 + }, + { + "epoch": 0.01575435341136169, + "grad_norm": 2.6730127334594727, + "learning_rate": 4.996940895023623e-05, + "loss": 6.0595, + "step": 2649 + }, + { + "epoch": 0.01576030069464269, + "grad_norm": 2.607389450073242, + "learning_rate": 4.996938584556867e-05, + "loss": 6.0253, + "step": 2650 + }, + { + "epoch": 0.015766247977923684, + "grad_norm": 2.264345407485962, + "learning_rate": 4.996936273218456e-05, + "loss": 6.1011, + "step": 2651 + }, + { + "epoch": 0.015772195261204682, + "grad_norm": 2.218766450881958, + "learning_rate": 4.99693396100839e-05, + "loss": 6.0545, + "step": 2652 + }, + { + "epoch": 0.015778142544485677, + "grad_norm": 2.435213088989258, + "learning_rate": 4.99693164792667e-05, + "loss": 6.0679, + "step": 2653 + }, + { + "epoch": 0.015784089827766676, + "grad_norm": 2.2278120517730713, + "learning_rate": 4.996929333973297e-05, + "loss": 6.0864, + "step": 2654 + }, + { + "epoch": 0.015790037111047674, + "grad_norm": 1.983554482460022, + "learning_rate": 4.9969270191482715e-05, + "loss": 6.124, + "step": 2655 + }, + { + "epoch": 0.01579598439432867, + "grad_norm": 1.9382312297821045, + "learning_rate": 4.996924703451594e-05, + "loss": 6.392, + "step": 2656 + }, + { + "epoch": 0.015801931677609668, + "grad_norm": 2.8142831325531006, + "learning_rate": 4.9969223868832674e-05, + "loss": 6.017, + "step": 2657 + }, + { + "epoch": 0.015807878960890667, + "grad_norm": 2.3466787338256836, + "learning_rate": 4.9969200694432904e-05, + "loss": 5.9588, + "step": 2658 + }, + { + "epoch": 0.01581382624417166, + "grad_norm": 2.0172243118286133, + "learning_rate": 4.996917751131664e-05, + "loss": 5.9513, + "step": 2659 + }, + { + "epoch": 0.01581977352745266, + "grad_norm": 2.3778223991394043, + "learning_rate": 4.99691543194839e-05, + "loss": 6.2205, + "step": 2660 + }, + { + "epoch": 0.015825720810733655, + "grad_norm": 2.4351084232330322, + "learning_rate": 4.9969131118934675e-05, + "loss": 6.0916, + "step": 2661 + }, + { + "epoch": 0.015831668094014654, + "grad_norm": 2.22328519821167, + "learning_rate": 4.9969107909669e-05, + "loss": 6.5546, + "step": 2662 + }, + { + "epoch": 0.015837615377295652, + "grad_norm": 2.4626407623291016, + "learning_rate": 4.996908469168685e-05, + "loss": 6.522, + "step": 2663 + }, + { + "epoch": 0.015843562660576647, + "grad_norm": 2.1032283306121826, + "learning_rate": 4.9969061464988266e-05, + "loss": 6.3372, + "step": 2664 + }, + { + "epoch": 0.015849509943857646, + "grad_norm": 2.1436524391174316, + "learning_rate": 4.9969038229573236e-05, + "loss": 6.3792, + "step": 2665 + }, + { + "epoch": 0.015855457227138645, + "grad_norm": 2.42084002494812, + "learning_rate": 4.996901498544176e-05, + "loss": 6.701, + "step": 2666 + }, + { + "epoch": 0.01586140451041964, + "grad_norm": 2.854630947113037, + "learning_rate": 4.996899173259388e-05, + "loss": 6.3273, + "step": 2667 + }, + { + "epoch": 0.015867351793700638, + "grad_norm": 2.2480521202087402, + "learning_rate": 4.996896847102957e-05, + "loss": 6.4314, + "step": 2668 + }, + { + "epoch": 0.015873299076981633, + "grad_norm": 3.7074203491210938, + "learning_rate": 4.996894520074886e-05, + "loss": 5.9438, + "step": 2669 + }, + { + "epoch": 0.015879246360262632, + "grad_norm": 3.1037209033966064, + "learning_rate": 4.9968921921751735e-05, + "loss": 5.7915, + "step": 2670 + }, + { + "epoch": 0.01588519364354363, + "grad_norm": 2.8338170051574707, + "learning_rate": 4.996889863403823e-05, + "loss": 6.7765, + "step": 2671 + }, + { + "epoch": 0.015891140926824626, + "grad_norm": 2.6366934776306152, + "learning_rate": 4.996887533760833e-05, + "loss": 6.8019, + "step": 2672 + }, + { + "epoch": 0.015897088210105624, + "grad_norm": 2.3954126834869385, + "learning_rate": 4.996885203246207e-05, + "loss": 6.3946, + "step": 2673 + }, + { + "epoch": 0.015903035493386623, + "grad_norm": 2.5771238803863525, + "learning_rate": 4.996882871859943e-05, + "loss": 6.3767, + "step": 2674 + }, + { + "epoch": 0.015908982776667618, + "grad_norm": 3.8544304370880127, + "learning_rate": 4.9968805396020424e-05, + "loss": 7.0813, + "step": 2675 + }, + { + "epoch": 0.015914930059948616, + "grad_norm": 3.4221606254577637, + "learning_rate": 4.996878206472507e-05, + "loss": 6.4782, + "step": 2676 + }, + { + "epoch": 0.01592087734322961, + "grad_norm": 3.6425843238830566, + "learning_rate": 4.996875872471338e-05, + "loss": 5.8685, + "step": 2677 + }, + { + "epoch": 0.01592682462651061, + "grad_norm": 3.255345344543457, + "learning_rate": 4.996873537598535e-05, + "loss": 5.7099, + "step": 2678 + }, + { + "epoch": 0.01593277190979161, + "grad_norm": 2.5217175483703613, + "learning_rate": 4.9968712018540997e-05, + "loss": 5.8978, + "step": 2679 + }, + { + "epoch": 0.015938719193072604, + "grad_norm": 2.2415871620178223, + "learning_rate": 4.996868865238031e-05, + "loss": 6.8186, + "step": 2680 + }, + { + "epoch": 0.015944666476353602, + "grad_norm": 2.1412270069122314, + "learning_rate": 4.996866527750332e-05, + "loss": 6.8056, + "step": 2681 + }, + { + "epoch": 0.015950613759634597, + "grad_norm": 2.423093557357788, + "learning_rate": 4.996864189391004e-05, + "loss": 7.0769, + "step": 2682 + }, + { + "epoch": 0.015956561042915596, + "grad_norm": 2.2334039211273193, + "learning_rate": 4.9968618501600454e-05, + "loss": 6.9954, + "step": 2683 + }, + { + "epoch": 0.015962508326196594, + "grad_norm": 2.4311838150024414, + "learning_rate": 4.996859510057458e-05, + "loss": 6.8375, + "step": 2684 + }, + { + "epoch": 0.01596845560947759, + "grad_norm": 4.861137866973877, + "learning_rate": 4.996857169083242e-05, + "loss": 6.2628, + "step": 2685 + }, + { + "epoch": 0.015974402892758588, + "grad_norm": 3.064213991165161, + "learning_rate": 4.996854827237401e-05, + "loss": 6.4316, + "step": 2686 + }, + { + "epoch": 0.015980350176039586, + "grad_norm": 2.307011365890503, + "learning_rate": 4.996852484519932e-05, + "loss": 6.6212, + "step": 2687 + }, + { + "epoch": 0.01598629745932058, + "grad_norm": 2.5157034397125244, + "learning_rate": 4.9968501409308374e-05, + "loss": 7.153, + "step": 2688 + }, + { + "epoch": 0.01599224474260158, + "grad_norm": 2.4122424125671387, + "learning_rate": 4.996847796470119e-05, + "loss": 7.2244, + "step": 2689 + }, + { + "epoch": 0.015998192025882575, + "grad_norm": 2.305055618286133, + "learning_rate": 4.9968454511377773e-05, + "loss": 7.4751, + "step": 2690 + }, + { + "epoch": 0.016004139309163574, + "grad_norm": 3.068027973175049, + "learning_rate": 4.9968431049338116e-05, + "loss": 6.5709, + "step": 2691 + }, + { + "epoch": 0.016010086592444572, + "grad_norm": 2.09893798828125, + "learning_rate": 4.9968407578582246e-05, + "loss": 6.7212, + "step": 2692 + }, + { + "epoch": 0.016016033875725567, + "grad_norm": 2.3161933422088623, + "learning_rate": 4.9968384099110163e-05, + "loss": 6.6243, + "step": 2693 + }, + { + "epoch": 0.016021981159006566, + "grad_norm": 2.913304090499878, + "learning_rate": 4.9968360610921874e-05, + "loss": 6.1946, + "step": 2694 + }, + { + "epoch": 0.016027928442287565, + "grad_norm": 2.746368408203125, + "learning_rate": 4.9968337114017386e-05, + "loss": 6.3783, + "step": 2695 + }, + { + "epoch": 0.01603387572556856, + "grad_norm": 2.40331768989563, + "learning_rate": 4.9968313608396705e-05, + "loss": 6.9898, + "step": 2696 + }, + { + "epoch": 0.016039823008849558, + "grad_norm": 2.214869976043701, + "learning_rate": 4.9968290094059844e-05, + "loss": 6.4497, + "step": 2697 + }, + { + "epoch": 0.016045770292130553, + "grad_norm": 2.050436019897461, + "learning_rate": 4.996826657100682e-05, + "loss": 6.8897, + "step": 2698 + }, + { + "epoch": 0.016051717575411552, + "grad_norm": 2.294149398803711, + "learning_rate": 4.996824303923763e-05, + "loss": 6.5583, + "step": 2699 + }, + { + "epoch": 0.01605766485869255, + "grad_norm": 2.26918625831604, + "learning_rate": 4.996821949875228e-05, + "loss": 6.7411, + "step": 2700 + }, + { + "epoch": 0.016063612141973545, + "grad_norm": 2.1330158710479736, + "learning_rate": 4.9968195949550775e-05, + "loss": 6.8068, + "step": 2701 + }, + { + "epoch": 0.016069559425254544, + "grad_norm": 1.8605769872665405, + "learning_rate": 4.996817239163315e-05, + "loss": 6.4833, + "step": 2702 + }, + { + "epoch": 0.016075506708535543, + "grad_norm": 3.132803440093994, + "learning_rate": 4.996814882499938e-05, + "loss": 5.8281, + "step": 2703 + }, + { + "epoch": 0.016081453991816538, + "grad_norm": 3.1079390048980713, + "learning_rate": 4.996812524964949e-05, + "loss": 5.6894, + "step": 2704 + }, + { + "epoch": 0.016087401275097536, + "grad_norm": 2.2877023220062256, + "learning_rate": 4.996810166558349e-05, + "loss": 7.0128, + "step": 2705 + }, + { + "epoch": 0.01609334855837853, + "grad_norm": 2.415696859359741, + "learning_rate": 4.996807807280138e-05, + "loss": 6.8098, + "step": 2706 + }, + { + "epoch": 0.01609929584165953, + "grad_norm": 2.342111110687256, + "learning_rate": 4.996805447130317e-05, + "loss": 7.2452, + "step": 2707 + }, + { + "epoch": 0.01610524312494053, + "grad_norm": 2.6504852771759033, + "learning_rate": 4.996803086108887e-05, + "loss": 6.6731, + "step": 2708 + }, + { + "epoch": 0.016111190408221523, + "grad_norm": 2.6157166957855225, + "learning_rate": 4.996800724215849e-05, + "loss": 6.9377, + "step": 2709 + }, + { + "epoch": 0.016117137691502522, + "grad_norm": 2.6289443969726562, + "learning_rate": 4.9967983614512036e-05, + "loss": 6.639, + "step": 2710 + }, + { + "epoch": 0.01612308497478352, + "grad_norm": 2.966489791870117, + "learning_rate": 4.996795997814952e-05, + "loss": 6.3681, + "step": 2711 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 3.7333364486694336, + "learning_rate": 4.9967936333070944e-05, + "loss": 5.6015, + "step": 2712 + }, + { + "epoch": 0.016134979541345514, + "grad_norm": 2.942728281021118, + "learning_rate": 4.9967912679276316e-05, + "loss": 5.6548, + "step": 2713 + }, + { + "epoch": 0.01614092682462651, + "grad_norm": 2.394622802734375, + "learning_rate": 4.996788901676566e-05, + "loss": 6.5119, + "step": 2714 + }, + { + "epoch": 0.016146874107907508, + "grad_norm": 2.8388447761535645, + "learning_rate": 4.9967865345538963e-05, + "loss": 6.4424, + "step": 2715 + }, + { + "epoch": 0.016152821391188506, + "grad_norm": 2.7682905197143555, + "learning_rate": 4.9967841665596245e-05, + "loss": 6.4688, + "step": 2716 + }, + { + "epoch": 0.0161587686744695, + "grad_norm": 3.0281460285186768, + "learning_rate": 4.996781797693751e-05, + "loss": 6.52, + "step": 2717 + }, + { + "epoch": 0.0161647159577505, + "grad_norm": 2.9734318256378174, + "learning_rate": 4.996779427956276e-05, + "loss": 6.4307, + "step": 2718 + }, + { + "epoch": 0.016170663241031495, + "grad_norm": 2.7653586864471436, + "learning_rate": 4.996777057347202e-05, + "loss": 6.1783, + "step": 2719 + }, + { + "epoch": 0.016176610524312494, + "grad_norm": 2.9418516159057617, + "learning_rate": 4.996774685866529e-05, + "loss": 6.5466, + "step": 2720 + }, + { + "epoch": 0.016182557807593492, + "grad_norm": 2.789217233657837, + "learning_rate": 4.996772313514258e-05, + "loss": 6.9296, + "step": 2721 + }, + { + "epoch": 0.016188505090874487, + "grad_norm": 2.8092539310455322, + "learning_rate": 4.996769940290389e-05, + "loss": 6.6186, + "step": 2722 + }, + { + "epoch": 0.016194452374155486, + "grad_norm": 2.696572780609131, + "learning_rate": 4.996767566194923e-05, + "loss": 6.5361, + "step": 2723 + }, + { + "epoch": 0.016200399657436484, + "grad_norm": 2.5987300872802734, + "learning_rate": 4.996765191227862e-05, + "loss": 6.4029, + "step": 2724 + }, + { + "epoch": 0.01620634694071748, + "grad_norm": 2.083057165145874, + "learning_rate": 4.996762815389205e-05, + "loss": 6.4747, + "step": 2725 + }, + { + "epoch": 0.016212294223998478, + "grad_norm": 2.912338972091675, + "learning_rate": 4.9967604386789555e-05, + "loss": 6.8869, + "step": 2726 + }, + { + "epoch": 0.016218241507279473, + "grad_norm": 2.642224073410034, + "learning_rate": 4.9967580610971124e-05, + "loss": 6.6701, + "step": 2727 + }, + { + "epoch": 0.016224188790560472, + "grad_norm": 2.673652410507202, + "learning_rate": 4.996755682643676e-05, + "loss": 6.8624, + "step": 2728 + }, + { + "epoch": 0.01623013607384147, + "grad_norm": 2.5223872661590576, + "learning_rate": 4.996753303318648e-05, + "loss": 6.8247, + "step": 2729 + }, + { + "epoch": 0.016236083357122465, + "grad_norm": 2.252037525177002, + "learning_rate": 4.99675092312203e-05, + "loss": 6.7924, + "step": 2730 + }, + { + "epoch": 0.016242030640403464, + "grad_norm": 2.2854461669921875, + "learning_rate": 4.9967485420538216e-05, + "loss": 6.4761, + "step": 2731 + }, + { + "epoch": 0.016247977923684463, + "grad_norm": 2.426912546157837, + "learning_rate": 4.9967461601140244e-05, + "loss": 6.6028, + "step": 2732 + }, + { + "epoch": 0.016253925206965458, + "grad_norm": 2.7375681400299072, + "learning_rate": 4.9967437773026384e-05, + "loss": 6.5283, + "step": 2733 + }, + { + "epoch": 0.016259872490246456, + "grad_norm": 2.7669689655303955, + "learning_rate": 4.996741393619665e-05, + "loss": 6.4382, + "step": 2734 + }, + { + "epoch": 0.01626581977352745, + "grad_norm": 2.294597864151001, + "learning_rate": 4.996739009065105e-05, + "loss": 6.7479, + "step": 2735 + }, + { + "epoch": 0.01627176705680845, + "grad_norm": 2.4791014194488525, + "learning_rate": 4.996736623638959e-05, + "loss": 6.7043, + "step": 2736 + }, + { + "epoch": 0.01627771434008945, + "grad_norm": 2.4080021381378174, + "learning_rate": 4.9967342373412286e-05, + "loss": 6.6046, + "step": 2737 + }, + { + "epoch": 0.016283661623370443, + "grad_norm": 2.463109254837036, + "learning_rate": 4.996731850171914e-05, + "loss": 6.3895, + "step": 2738 + }, + { + "epoch": 0.016289608906651442, + "grad_norm": 2.665908098220825, + "learning_rate": 4.9967294621310155e-05, + "loss": 6.6482, + "step": 2739 + }, + { + "epoch": 0.01629555618993244, + "grad_norm": 2.399526357650757, + "learning_rate": 4.996727073218536e-05, + "loss": 6.7098, + "step": 2740 + }, + { + "epoch": 0.016301503473213436, + "grad_norm": 2.678091287612915, + "learning_rate": 4.996724683434473e-05, + "loss": 6.419, + "step": 2741 + }, + { + "epoch": 0.016307450756494434, + "grad_norm": 2.5573642253875732, + "learning_rate": 4.99672229277883e-05, + "loss": 6.4703, + "step": 2742 + }, + { + "epoch": 0.01631339803977543, + "grad_norm": 2.644097089767456, + "learning_rate": 4.996719901251607e-05, + "loss": 5.9854, + "step": 2743 + }, + { + "epoch": 0.016319345323056428, + "grad_norm": 2.6165592670440674, + "learning_rate": 4.996717508852805e-05, + "loss": 6.1776, + "step": 2744 + }, + { + "epoch": 0.016325292606337426, + "grad_norm": 2.175647020339966, + "learning_rate": 4.996715115582426e-05, + "loss": 6.5533, + "step": 2745 + }, + { + "epoch": 0.01633123988961842, + "grad_norm": 2.112217664718628, + "learning_rate": 4.996712721440467e-05, + "loss": 6.5572, + "step": 2746 + }, + { + "epoch": 0.01633718717289942, + "grad_norm": 2.165111541748047, + "learning_rate": 4.996710326426933e-05, + "loss": 6.2798, + "step": 2747 + }, + { + "epoch": 0.016343134456180415, + "grad_norm": 2.5812315940856934, + "learning_rate": 4.996707930541823e-05, + "loss": 6.0831, + "step": 2748 + }, + { + "epoch": 0.016349081739461414, + "grad_norm": 2.2306227684020996, + "learning_rate": 4.996705533785138e-05, + "loss": 6.5833, + "step": 2749 + }, + { + "epoch": 0.016355029022742412, + "grad_norm": 1.999974250793457, + "learning_rate": 4.996703136156878e-05, + "loss": 6.2461, + "step": 2750 + }, + { + "epoch": 0.016360976306023407, + "grad_norm": 2.0521416664123535, + "learning_rate": 4.996700737657046e-05, + "loss": 6.4606, + "step": 2751 + }, + { + "epoch": 0.016366923589304406, + "grad_norm": 1.8630053997039795, + "learning_rate": 4.996698338285642e-05, + "loss": 6.1375, + "step": 2752 + }, + { + "epoch": 0.016372870872585404, + "grad_norm": 1.7525913715362549, + "learning_rate": 4.9966959380426646e-05, + "loss": 6.1769, + "step": 2753 + }, + { + "epoch": 0.0163788181558664, + "grad_norm": 2.8151230812072754, + "learning_rate": 4.996693536928118e-05, + "loss": 5.9066, + "step": 2754 + }, + { + "epoch": 0.016384765439147398, + "grad_norm": 2.503230571746826, + "learning_rate": 4.9966911349420004e-05, + "loss": 6.3725, + "step": 2755 + }, + { + "epoch": 0.016390712722428393, + "grad_norm": 2.676284074783325, + "learning_rate": 4.996688732084314e-05, + "loss": 6.9086, + "step": 2756 + }, + { + "epoch": 0.01639666000570939, + "grad_norm": 2.3367252349853516, + "learning_rate": 4.99668632835506e-05, + "loss": 6.1323, + "step": 2757 + }, + { + "epoch": 0.01640260728899039, + "grad_norm": 3.3071084022521973, + "learning_rate": 4.996683923754237e-05, + "loss": 6.162, + "step": 2758 + }, + { + "epoch": 0.016408554572271385, + "grad_norm": 2.64388370513916, + "learning_rate": 4.9966815182818494e-05, + "loss": 6.171, + "step": 2759 + }, + { + "epoch": 0.016414501855552384, + "grad_norm": 2.2378199100494385, + "learning_rate": 4.996679111937895e-05, + "loss": 6.4466, + "step": 2760 + }, + { + "epoch": 0.016420449138833382, + "grad_norm": 2.5944395065307617, + "learning_rate": 4.996676704722376e-05, + "loss": 6.7034, + "step": 2761 + }, + { + "epoch": 0.016426396422114378, + "grad_norm": 2.768211841583252, + "learning_rate": 4.996674296635293e-05, + "loss": 6.7551, + "step": 2762 + }, + { + "epoch": 0.016432343705395376, + "grad_norm": 2.80188250541687, + "learning_rate": 4.9966718876766467e-05, + "loss": 6.8437, + "step": 2763 + }, + { + "epoch": 0.01643829098867637, + "grad_norm": 2.2422847747802734, + "learning_rate": 4.996669477846438e-05, + "loss": 6.5365, + "step": 2764 + }, + { + "epoch": 0.01644423827195737, + "grad_norm": 2.526724100112915, + "learning_rate": 4.996667067144668e-05, + "loss": 6.3735, + "step": 2765 + }, + { + "epoch": 0.01645018555523837, + "grad_norm": 3.2267372608184814, + "learning_rate": 4.996664655571337e-05, + "loss": 6.0508, + "step": 2766 + }, + { + "epoch": 0.016456132838519363, + "grad_norm": 3.393270969390869, + "learning_rate": 4.996662243126446e-05, + "loss": 6.5543, + "step": 2767 + }, + { + "epoch": 0.016462080121800362, + "grad_norm": 2.7712342739105225, + "learning_rate": 4.996659829809996e-05, + "loss": 6.5891, + "step": 2768 + }, + { + "epoch": 0.01646802740508136, + "grad_norm": 2.5687179565429688, + "learning_rate": 4.996657415621988e-05, + "loss": 6.464, + "step": 2769 + }, + { + "epoch": 0.016473974688362356, + "grad_norm": 3.059953451156616, + "learning_rate": 4.996655000562424e-05, + "loss": 6.4286, + "step": 2770 + }, + { + "epoch": 0.016479921971643354, + "grad_norm": 3.3729803562164307, + "learning_rate": 4.9966525846313015e-05, + "loss": 6.5937, + "step": 2771 + }, + { + "epoch": 0.01648586925492435, + "grad_norm": 2.907397985458374, + "learning_rate": 4.996650167828624e-05, + "loss": 6.2559, + "step": 2772 + }, + { + "epoch": 0.016491816538205348, + "grad_norm": 3.5011706352233887, + "learning_rate": 4.996647750154392e-05, + "loss": 5.7897, + "step": 2773 + }, + { + "epoch": 0.016497763821486346, + "grad_norm": 2.5495986938476562, + "learning_rate": 4.996645331608607e-05, + "loss": 6.688, + "step": 2774 + }, + { + "epoch": 0.01650371110476734, + "grad_norm": 2.486416816711426, + "learning_rate": 4.9966429121912675e-05, + "loss": 6.8169, + "step": 2775 + }, + { + "epoch": 0.01650965838804834, + "grad_norm": 2.272162437438965, + "learning_rate": 4.9966404919023755e-05, + "loss": 6.696, + "step": 2776 + }, + { + "epoch": 0.016515605671329335, + "grad_norm": 2.9408323764801025, + "learning_rate": 4.9966380707419334e-05, + "loss": 6.1711, + "step": 2777 + }, + { + "epoch": 0.016521552954610334, + "grad_norm": 3.361907958984375, + "learning_rate": 4.99663564870994e-05, + "loss": 5.6029, + "step": 2778 + }, + { + "epoch": 0.016527500237891332, + "grad_norm": 3.06835675239563, + "learning_rate": 4.996633225806397e-05, + "loss": 5.332, + "step": 2779 + }, + { + "epoch": 0.016533447521172327, + "grad_norm": 3.058638572692871, + "learning_rate": 4.9966308020313054e-05, + "loss": 6.3345, + "step": 2780 + }, + { + "epoch": 0.016539394804453326, + "grad_norm": 2.8265507221221924, + "learning_rate": 4.9966283773846654e-05, + "loss": 5.4231, + "step": 2781 + }, + { + "epoch": 0.016545342087734324, + "grad_norm": 3.128094434738159, + "learning_rate": 4.996625951866478e-05, + "loss": 5.4144, + "step": 2782 + }, + { + "epoch": 0.01655128937101532, + "grad_norm": 2.6830554008483887, + "learning_rate": 4.9966235254767445e-05, + "loss": 6.0084, + "step": 2783 + }, + { + "epoch": 0.016557236654296318, + "grad_norm": 2.7146122455596924, + "learning_rate": 4.996621098215466e-05, + "loss": 6.7104, + "step": 2784 + }, + { + "epoch": 0.016563183937577313, + "grad_norm": 3.518169403076172, + "learning_rate": 4.9966186700826425e-05, + "loss": 5.4509, + "step": 2785 + }, + { + "epoch": 0.01656913122085831, + "grad_norm": 2.7607035636901855, + "learning_rate": 4.9966162410782755e-05, + "loss": 6.2149, + "step": 2786 + }, + { + "epoch": 0.01657507850413931, + "grad_norm": 2.897862195968628, + "learning_rate": 4.996613811202365e-05, + "loss": 6.4713, + "step": 2787 + }, + { + "epoch": 0.016581025787420305, + "grad_norm": 2.6984574794769287, + "learning_rate": 4.9966113804549134e-05, + "loss": 6.2298, + "step": 2788 + }, + { + "epoch": 0.016586973070701304, + "grad_norm": 2.7281908988952637, + "learning_rate": 4.996608948835919e-05, + "loss": 6.0244, + "step": 2789 + }, + { + "epoch": 0.016592920353982302, + "grad_norm": 2.314769983291626, + "learning_rate": 4.996606516345386e-05, + "loss": 6.8523, + "step": 2790 + }, + { + "epoch": 0.016598867637263297, + "grad_norm": 2.887943744659424, + "learning_rate": 4.9966040829833115e-05, + "loss": 6.8407, + "step": 2791 + }, + { + "epoch": 0.016604814920544296, + "grad_norm": 3.4924309253692627, + "learning_rate": 4.9966016487497e-05, + "loss": 6.3646, + "step": 2792 + }, + { + "epoch": 0.01661076220382529, + "grad_norm": 2.3095340728759766, + "learning_rate": 4.9965992136445495e-05, + "loss": 6.407, + "step": 2793 + }, + { + "epoch": 0.01661670948710629, + "grad_norm": 3.771980047225952, + "learning_rate": 4.9965967776678627e-05, + "loss": 6.0596, + "step": 2794 + }, + { + "epoch": 0.016622656770387288, + "grad_norm": 3.452252149581909, + "learning_rate": 4.99659434081964e-05, + "loss": 6.1351, + "step": 2795 + }, + { + "epoch": 0.016628604053668283, + "grad_norm": 2.4391021728515625, + "learning_rate": 4.996591903099881e-05, + "loss": 6.3304, + "step": 2796 + }, + { + "epoch": 0.016634551336949282, + "grad_norm": 2.7057220935821533, + "learning_rate": 4.9965894645085885e-05, + "loss": 6.8328, + "step": 2797 + }, + { + "epoch": 0.01664049862023028, + "grad_norm": 2.392627716064453, + "learning_rate": 4.996587025045762e-05, + "loss": 6.8491, + "step": 2798 + }, + { + "epoch": 0.016646445903511276, + "grad_norm": 2.47928786277771, + "learning_rate": 4.9965845847114024e-05, + "loss": 6.6323, + "step": 2799 + }, + { + "epoch": 0.016652393186792274, + "grad_norm": 2.438870668411255, + "learning_rate": 4.9965821435055115e-05, + "loss": 6.3832, + "step": 2800 + }, + { + "epoch": 0.01665834047007327, + "grad_norm": 2.6875247955322266, + "learning_rate": 4.9965797014280895e-05, + "loss": 6.6994, + "step": 2801 + }, + { + "epoch": 0.016664287753354268, + "grad_norm": 2.71785044670105, + "learning_rate": 4.996577258479137e-05, + "loss": 6.2505, + "step": 2802 + }, + { + "epoch": 0.016670235036635266, + "grad_norm": 2.32853102684021, + "learning_rate": 4.996574814658655e-05, + "loss": 6.4409, + "step": 2803 + }, + { + "epoch": 0.01667618231991626, + "grad_norm": 2.271027088165283, + "learning_rate": 4.996572369966646e-05, + "loss": 6.4928, + "step": 2804 + }, + { + "epoch": 0.01668212960319726, + "grad_norm": 2.621448278427124, + "learning_rate": 4.996569924403108e-05, + "loss": 6.7248, + "step": 2805 + }, + { + "epoch": 0.01668807688647826, + "grad_norm": 3.621654748916626, + "learning_rate": 4.9965674779680435e-05, + "loss": 6.7268, + "step": 2806 + }, + { + "epoch": 0.016694024169759254, + "grad_norm": 2.2045094966888428, + "learning_rate": 4.9965650306614534e-05, + "loss": 6.6406, + "step": 2807 + }, + { + "epoch": 0.016699971453040252, + "grad_norm": 2.4885873794555664, + "learning_rate": 4.9965625824833376e-05, + "loss": 6.611, + "step": 2808 + }, + { + "epoch": 0.016705918736321247, + "grad_norm": 2.796971082687378, + "learning_rate": 4.996560133433697e-05, + "loss": 6.455, + "step": 2809 + }, + { + "epoch": 0.016711866019602246, + "grad_norm": 2.539395570755005, + "learning_rate": 4.996557683512535e-05, + "loss": 6.8169, + "step": 2810 + }, + { + "epoch": 0.016717813302883244, + "grad_norm": 2.322824239730835, + "learning_rate": 4.99655523271985e-05, + "loss": 6.3217, + "step": 2811 + }, + { + "epoch": 0.01672376058616424, + "grad_norm": 2.4404520988464355, + "learning_rate": 4.9965527810556424e-05, + "loss": 6.5026, + "step": 2812 + }, + { + "epoch": 0.016729707869445238, + "grad_norm": 2.287362575531006, + "learning_rate": 4.996550328519915e-05, + "loss": 6.9183, + "step": 2813 + }, + { + "epoch": 0.016735655152726233, + "grad_norm": 2.369877815246582, + "learning_rate": 4.996547875112667e-05, + "loss": 6.7488, + "step": 2814 + }, + { + "epoch": 0.01674160243600723, + "grad_norm": 2.323082685470581, + "learning_rate": 4.996545420833899e-05, + "loss": 6.6177, + "step": 2815 + }, + { + "epoch": 0.01674754971928823, + "grad_norm": 2.221214532852173, + "learning_rate": 4.9965429656836145e-05, + "loss": 6.6844, + "step": 2816 + }, + { + "epoch": 0.016753497002569225, + "grad_norm": 2.246819496154785, + "learning_rate": 4.9965405096618116e-05, + "loss": 6.5631, + "step": 2817 + }, + { + "epoch": 0.016759444285850224, + "grad_norm": 2.411806583404541, + "learning_rate": 4.996538052768493e-05, + "loss": 6.4037, + "step": 2818 + }, + { + "epoch": 0.016765391569131222, + "grad_norm": 1.941197395324707, + "learning_rate": 4.996535595003658e-05, + "loss": 6.5232, + "step": 2819 + }, + { + "epoch": 0.016771338852412217, + "grad_norm": 2.149991750717163, + "learning_rate": 4.996533136367309e-05, + "loss": 6.4166, + "step": 2820 + }, + { + "epoch": 0.016777286135693216, + "grad_norm": 2.5388433933258057, + "learning_rate": 4.9965306768594454e-05, + "loss": 6.5733, + "step": 2821 + }, + { + "epoch": 0.01678323341897421, + "grad_norm": 2.1857333183288574, + "learning_rate": 4.9965282164800694e-05, + "loss": 6.5558, + "step": 2822 + }, + { + "epoch": 0.01678918070225521, + "grad_norm": 2.1090164184570312, + "learning_rate": 4.9965257552291804e-05, + "loss": 6.6916, + "step": 2823 + }, + { + "epoch": 0.016795127985536208, + "grad_norm": 2.1102349758148193, + "learning_rate": 4.9965232931067806e-05, + "loss": 6.5852, + "step": 2824 + }, + { + "epoch": 0.016801075268817203, + "grad_norm": 2.384660005569458, + "learning_rate": 4.99652083011287e-05, + "loss": 6.5033, + "step": 2825 + }, + { + "epoch": 0.016807022552098202, + "grad_norm": 2.314896821975708, + "learning_rate": 4.9965183662474504e-05, + "loss": 6.4108, + "step": 2826 + }, + { + "epoch": 0.0168129698353792, + "grad_norm": 2.4358227252960205, + "learning_rate": 4.9965159015105215e-05, + "loss": 6.5309, + "step": 2827 + }, + { + "epoch": 0.016818917118660195, + "grad_norm": 2.179905652999878, + "learning_rate": 4.9965134359020844e-05, + "loss": 6.4593, + "step": 2828 + }, + { + "epoch": 0.016824864401941194, + "grad_norm": 2.2742464542388916, + "learning_rate": 4.99651096942214e-05, + "loss": 6.6654, + "step": 2829 + }, + { + "epoch": 0.01683081168522219, + "grad_norm": 2.211026668548584, + "learning_rate": 4.9965085020706906e-05, + "loss": 6.4527, + "step": 2830 + }, + { + "epoch": 0.016836758968503188, + "grad_norm": 2.552072763442993, + "learning_rate": 4.996506033847735e-05, + "loss": 6.5338, + "step": 2831 + }, + { + "epoch": 0.016842706251784186, + "grad_norm": 2.3208038806915283, + "learning_rate": 4.996503564753276e-05, + "loss": 6.473, + "step": 2832 + }, + { + "epoch": 0.01684865353506518, + "grad_norm": 2.3756048679351807, + "learning_rate": 4.996501094787312e-05, + "loss": 6.4223, + "step": 2833 + }, + { + "epoch": 0.01685460081834618, + "grad_norm": 2.386152982711792, + "learning_rate": 4.996498623949846e-05, + "loss": 6.317, + "step": 2834 + }, + { + "epoch": 0.01686054810162718, + "grad_norm": 2.144510507583618, + "learning_rate": 4.996496152240878e-05, + "loss": 6.4039, + "step": 2835 + }, + { + "epoch": 0.016866495384908173, + "grad_norm": 2.3362607955932617, + "learning_rate": 4.996493679660409e-05, + "loss": 6.5411, + "step": 2836 + }, + { + "epoch": 0.016872442668189172, + "grad_norm": 2.156428337097168, + "learning_rate": 4.9964912062084404e-05, + "loss": 6.3399, + "step": 2837 + }, + { + "epoch": 0.016878389951470167, + "grad_norm": 2.3429903984069824, + "learning_rate": 4.9964887318849715e-05, + "loss": 6.5159, + "step": 2838 + }, + { + "epoch": 0.016884337234751166, + "grad_norm": 2.1888442039489746, + "learning_rate": 4.9964862566900045e-05, + "loss": 6.3906, + "step": 2839 + }, + { + "epoch": 0.016890284518032164, + "grad_norm": 2.3973047733306885, + "learning_rate": 4.9964837806235396e-05, + "loss": 6.3452, + "step": 2840 + }, + { + "epoch": 0.01689623180131316, + "grad_norm": 2.232057809829712, + "learning_rate": 4.996481303685578e-05, + "loss": 6.5203, + "step": 2841 + }, + { + "epoch": 0.016902179084594158, + "grad_norm": 2.672342300415039, + "learning_rate": 4.996478825876122e-05, + "loss": 6.8615, + "step": 2842 + }, + { + "epoch": 0.016908126367875153, + "grad_norm": 2.603943347930908, + "learning_rate": 4.996476347195171e-05, + "loss": 7.1632, + "step": 2843 + }, + { + "epoch": 0.01691407365115615, + "grad_norm": 2.684616804122925, + "learning_rate": 4.9964738676427234e-05, + "loss": 6.5546, + "step": 2844 + }, + { + "epoch": 0.01692002093443715, + "grad_norm": 2.1103904247283936, + "learning_rate": 4.996471387218785e-05, + "loss": 6.4666, + "step": 2845 + }, + { + "epoch": 0.016925968217718145, + "grad_norm": 2.8278937339782715, + "learning_rate": 4.9964689059233525e-05, + "loss": 6.3685, + "step": 2846 + }, + { + "epoch": 0.016931915500999144, + "grad_norm": 3.2611489295959473, + "learning_rate": 4.9964664237564296e-05, + "loss": 6.5537, + "step": 2847 + }, + { + "epoch": 0.016937862784280142, + "grad_norm": 3.029353141784668, + "learning_rate": 4.9964639407180155e-05, + "loss": 6.6097, + "step": 2848 + }, + { + "epoch": 0.016943810067561137, + "grad_norm": 2.6735312938690186, + "learning_rate": 4.996461456808112e-05, + "loss": 6.5854, + "step": 2849 + }, + { + "epoch": 0.016949757350842136, + "grad_norm": 2.7619409561157227, + "learning_rate": 4.99645897202672e-05, + "loss": 6.5944, + "step": 2850 + }, + { + "epoch": 0.01695570463412313, + "grad_norm": 3.0398738384246826, + "learning_rate": 4.9964564863738396e-05, + "loss": 6.3804, + "step": 2851 + }, + { + "epoch": 0.01696165191740413, + "grad_norm": 3.5388784408569336, + "learning_rate": 4.996453999849472e-05, + "loss": 7.0993, + "step": 2852 + }, + { + "epoch": 0.016967599200685128, + "grad_norm": 2.3602113723754883, + "learning_rate": 4.9964515124536185e-05, + "loss": 6.4981, + "step": 2853 + }, + { + "epoch": 0.016973546483966123, + "grad_norm": 2.346632957458496, + "learning_rate": 4.996449024186278e-05, + "loss": 6.4892, + "step": 2854 + }, + { + "epoch": 0.016979493767247122, + "grad_norm": 2.9653544425964355, + "learning_rate": 4.996446535047454e-05, + "loss": 6.2772, + "step": 2855 + }, + { + "epoch": 0.01698544105052812, + "grad_norm": 3.1064538955688477, + "learning_rate": 4.996444045037147e-05, + "loss": 6.238, + "step": 2856 + }, + { + "epoch": 0.016991388333809115, + "grad_norm": 2.9617815017700195, + "learning_rate": 4.9964415541553564e-05, + "loss": 6.2991, + "step": 2857 + }, + { + "epoch": 0.016997335617090114, + "grad_norm": 2.5993905067443848, + "learning_rate": 4.996439062402084e-05, + "loss": 6.5482, + "step": 2858 + }, + { + "epoch": 0.01700328290037111, + "grad_norm": 2.5469226837158203, + "learning_rate": 4.996436569777331e-05, + "loss": 6.437, + "step": 2859 + }, + { + "epoch": 0.017009230183652108, + "grad_norm": 2.709184408187866, + "learning_rate": 4.9964340762810965e-05, + "loss": 6.1362, + "step": 2860 + }, + { + "epoch": 0.017015177466933106, + "grad_norm": 2.843942880630493, + "learning_rate": 4.9964315819133837e-05, + "loss": 6.2443, + "step": 2861 + }, + { + "epoch": 0.0170211247502141, + "grad_norm": 3.022735357284546, + "learning_rate": 4.9964290866741925e-05, + "loss": 6.3161, + "step": 2862 + }, + { + "epoch": 0.0170270720334951, + "grad_norm": 2.487271308898926, + "learning_rate": 4.996426590563523e-05, + "loss": 6.3352, + "step": 2863 + }, + { + "epoch": 0.0170330193167761, + "grad_norm": 2.624000072479248, + "learning_rate": 4.996424093581377e-05, + "loss": 6.3575, + "step": 2864 + }, + { + "epoch": 0.017038966600057093, + "grad_norm": 2.378368854522705, + "learning_rate": 4.996421595727756e-05, + "loss": 6.3284, + "step": 2865 + }, + { + "epoch": 0.017044913883338092, + "grad_norm": 2.6903984546661377, + "learning_rate": 4.996419097002659e-05, + "loss": 6.271, + "step": 2866 + }, + { + "epoch": 0.017050861166619087, + "grad_norm": 2.536391019821167, + "learning_rate": 4.9964165974060875e-05, + "loss": 6.1276, + "step": 2867 + }, + { + "epoch": 0.017056808449900086, + "grad_norm": 2.470395803451538, + "learning_rate": 4.9964140969380434e-05, + "loss": 6.1032, + "step": 2868 + }, + { + "epoch": 0.017062755733181084, + "grad_norm": 2.929818630218506, + "learning_rate": 4.996411595598528e-05, + "loss": 6.0994, + "step": 2869 + }, + { + "epoch": 0.01706870301646208, + "grad_norm": 2.548701763153076, + "learning_rate": 4.99640909338754e-05, + "loss": 6.2227, + "step": 2870 + }, + { + "epoch": 0.017074650299743078, + "grad_norm": 2.6044397354125977, + "learning_rate": 4.99640659030508e-05, + "loss": 6.0778, + "step": 2871 + }, + { + "epoch": 0.017080597583024073, + "grad_norm": 2.687392473220825, + "learning_rate": 4.996404086351153e-05, + "loss": 6.2975, + "step": 2872 + }, + { + "epoch": 0.01708654486630507, + "grad_norm": 2.740201711654663, + "learning_rate": 4.9964015815257556e-05, + "loss": 6.5955, + "step": 2873 + }, + { + "epoch": 0.01709249214958607, + "grad_norm": 2.605958938598633, + "learning_rate": 4.99639907582889e-05, + "loss": 6.2112, + "step": 2874 + }, + { + "epoch": 0.017098439432867065, + "grad_norm": 2.9691529273986816, + "learning_rate": 4.996396569260558e-05, + "loss": 6.1435, + "step": 2875 + }, + { + "epoch": 0.017104386716148064, + "grad_norm": 2.822201728820801, + "learning_rate": 4.9963940618207593e-05, + "loss": 6.1949, + "step": 2876 + }, + { + "epoch": 0.017110333999429062, + "grad_norm": 2.6231529712677, + "learning_rate": 4.996391553509495e-05, + "loss": 6.5082, + "step": 2877 + }, + { + "epoch": 0.017116281282710057, + "grad_norm": 2.6511785984039307, + "learning_rate": 4.9963890443267666e-05, + "loss": 6.4461, + "step": 2878 + }, + { + "epoch": 0.017122228565991056, + "grad_norm": 2.4790167808532715, + "learning_rate": 4.996386534272575e-05, + "loss": 6.4642, + "step": 2879 + }, + { + "epoch": 0.01712817584927205, + "grad_norm": 3.6982533931732178, + "learning_rate": 4.99638402334692e-05, + "loss": 6.2957, + "step": 2880 + }, + { + "epoch": 0.01713412313255305, + "grad_norm": 2.380385160446167, + "learning_rate": 4.996381511549804e-05, + "loss": 6.3174, + "step": 2881 + }, + { + "epoch": 0.017140070415834048, + "grad_norm": 2.425537347793579, + "learning_rate": 4.996378998881226e-05, + "loss": 6.2055, + "step": 2882 + }, + { + "epoch": 0.017146017699115043, + "grad_norm": 2.4667842388153076, + "learning_rate": 4.996376485341188e-05, + "loss": 6.245, + "step": 2883 + }, + { + "epoch": 0.01715196498239604, + "grad_norm": 2.6306424140930176, + "learning_rate": 4.996373970929691e-05, + "loss": 6.1162, + "step": 2884 + }, + { + "epoch": 0.01715791226567704, + "grad_norm": 4.439255714416504, + "learning_rate": 4.996371455646736e-05, + "loss": 5.9868, + "step": 2885 + }, + { + "epoch": 0.017163859548958035, + "grad_norm": 3.3248472213745117, + "learning_rate": 4.9963689394923224e-05, + "loss": 5.861, + "step": 2886 + }, + { + "epoch": 0.017169806832239034, + "grad_norm": 2.45271897315979, + "learning_rate": 4.996366422466453e-05, + "loss": 6.1588, + "step": 2887 + }, + { + "epoch": 0.01717575411552003, + "grad_norm": 3.1748130321502686, + "learning_rate": 4.996363904569128e-05, + "loss": 6.3607, + "step": 2888 + }, + { + "epoch": 0.017181701398801028, + "grad_norm": 3.300736427307129, + "learning_rate": 4.996361385800348e-05, + "loss": 6.0709, + "step": 2889 + }, + { + "epoch": 0.017187648682082026, + "grad_norm": 2.720550060272217, + "learning_rate": 4.9963588661601136e-05, + "loss": 6.0496, + "step": 2890 + }, + { + "epoch": 0.01719359596536302, + "grad_norm": 2.251845121383667, + "learning_rate": 4.9963563456484266e-05, + "loss": 6.0088, + "step": 2891 + }, + { + "epoch": 0.01719954324864402, + "grad_norm": 2.7863035202026367, + "learning_rate": 4.996353824265288e-05, + "loss": 5.9478, + "step": 2892 + }, + { + "epoch": 0.01720549053192502, + "grad_norm": 2.831744432449341, + "learning_rate": 4.996351302010697e-05, + "loss": 6.1629, + "step": 2893 + }, + { + "epoch": 0.017211437815206013, + "grad_norm": 4.583891868591309, + "learning_rate": 4.9963487788846556e-05, + "loss": 6.7936, + "step": 2894 + }, + { + "epoch": 0.017217385098487012, + "grad_norm": 2.4525468349456787, + "learning_rate": 4.996346254887165e-05, + "loss": 6.3188, + "step": 2895 + }, + { + "epoch": 0.017223332381768007, + "grad_norm": 3.0866281986236572, + "learning_rate": 4.9963437300182254e-05, + "loss": 6.0207, + "step": 2896 + }, + { + "epoch": 0.017229279665049006, + "grad_norm": 3.1188113689422607, + "learning_rate": 4.996341204277838e-05, + "loss": 5.9873, + "step": 2897 + }, + { + "epoch": 0.017235226948330004, + "grad_norm": 2.4119350910186768, + "learning_rate": 4.996338677666004e-05, + "loss": 5.8104, + "step": 2898 + }, + { + "epoch": 0.017241174231611, + "grad_norm": 1.9601647853851318, + "learning_rate": 4.996336150182724e-05, + "loss": 6.2166, + "step": 2899 + }, + { + "epoch": 0.017247121514891998, + "grad_norm": 3.428379535675049, + "learning_rate": 4.9963336218279986e-05, + "loss": 6.4284, + "step": 2900 + }, + { + "epoch": 0.017253068798172993, + "grad_norm": 2.629446506500244, + "learning_rate": 4.996331092601829e-05, + "loss": 6.4916, + "step": 2901 + }, + { + "epoch": 0.01725901608145399, + "grad_norm": 2.3860316276550293, + "learning_rate": 4.996328562504216e-05, + "loss": 6.5035, + "step": 2902 + }, + { + "epoch": 0.01726496336473499, + "grad_norm": 2.6754682064056396, + "learning_rate": 4.996326031535161e-05, + "loss": 6.6374, + "step": 2903 + }, + { + "epoch": 0.017270910648015985, + "grad_norm": 2.737901210784912, + "learning_rate": 4.9963234996946635e-05, + "loss": 6.5023, + "step": 2904 + }, + { + "epoch": 0.017276857931296984, + "grad_norm": 2.481691837310791, + "learning_rate": 4.996320966982726e-05, + "loss": 6.5211, + "step": 2905 + }, + { + "epoch": 0.017282805214577982, + "grad_norm": 3.3993568420410156, + "learning_rate": 4.996318433399348e-05, + "loss": 6.4239, + "step": 2906 + }, + { + "epoch": 0.017288752497858977, + "grad_norm": 3.9149057865142822, + "learning_rate": 4.9963158989445316e-05, + "loss": 6.3874, + "step": 2907 + }, + { + "epoch": 0.017294699781139976, + "grad_norm": 2.3808562755584717, + "learning_rate": 4.996313363618276e-05, + "loss": 6.2887, + "step": 2908 + }, + { + "epoch": 0.01730064706442097, + "grad_norm": 2.6186649799346924, + "learning_rate": 4.996310827420585e-05, + "loss": 6.2944, + "step": 2909 + }, + { + "epoch": 0.01730659434770197, + "grad_norm": 2.5251142978668213, + "learning_rate": 4.9963082903514554e-05, + "loss": 6.0944, + "step": 2910 + }, + { + "epoch": 0.017312541630982968, + "grad_norm": 2.8212270736694336, + "learning_rate": 4.9963057524108926e-05, + "loss": 6.6621, + "step": 2911 + }, + { + "epoch": 0.017318488914263963, + "grad_norm": 2.477485418319702, + "learning_rate": 4.996303213598894e-05, + "loss": 6.3941, + "step": 2912 + }, + { + "epoch": 0.01732443619754496, + "grad_norm": 3.6508305072784424, + "learning_rate": 4.996300673915462e-05, + "loss": 6.3234, + "step": 2913 + }, + { + "epoch": 0.01733038348082596, + "grad_norm": 2.1635468006134033, + "learning_rate": 4.996298133360598e-05, + "loss": 6.2877, + "step": 2914 + }, + { + "epoch": 0.017336330764106955, + "grad_norm": 3.431082010269165, + "learning_rate": 4.9962955919343004e-05, + "loss": 6.2627, + "step": 2915 + }, + { + "epoch": 0.017342278047387954, + "grad_norm": 3.272376775741577, + "learning_rate": 4.9962930496365736e-05, + "loss": 6.1458, + "step": 2916 + }, + { + "epoch": 0.01734822533066895, + "grad_norm": 3.5927000045776367, + "learning_rate": 4.996290506467415e-05, + "loss": 5.9828, + "step": 2917 + }, + { + "epoch": 0.017354172613949947, + "grad_norm": 3.569641351699829, + "learning_rate": 4.996287962426829e-05, + "loss": 6.5957, + "step": 2918 + }, + { + "epoch": 0.017360119897230946, + "grad_norm": 3.281855344772339, + "learning_rate": 4.9962854175148134e-05, + "loss": 6.3393, + "step": 2919 + }, + { + "epoch": 0.01736606718051194, + "grad_norm": 2.6009061336517334, + "learning_rate": 4.9962828717313706e-05, + "loss": 6.3537, + "step": 2920 + }, + { + "epoch": 0.01737201446379294, + "grad_norm": 3.964467763900757, + "learning_rate": 4.996280325076501e-05, + "loss": 6.0281, + "step": 2921 + }, + { + "epoch": 0.017377961747073938, + "grad_norm": 3.9164865016937256, + "learning_rate": 4.9962777775502064e-05, + "loss": 6.5255, + "step": 2922 + }, + { + "epoch": 0.017383909030354933, + "grad_norm": 2.349709987640381, + "learning_rate": 4.996275229152486e-05, + "loss": 6.2459, + "step": 2923 + }, + { + "epoch": 0.017389856313635932, + "grad_norm": 2.5735161304473877, + "learning_rate": 4.9962726798833425e-05, + "loss": 6.0463, + "step": 2924 + }, + { + "epoch": 0.017395803596916927, + "grad_norm": 2.228271961212158, + "learning_rate": 4.9962701297427764e-05, + "loss": 6.1147, + "step": 2925 + }, + { + "epoch": 0.017401750880197926, + "grad_norm": 2.4587175846099854, + "learning_rate": 4.9962675787307875e-05, + "loss": 7.0868, + "step": 2926 + }, + { + "epoch": 0.017407698163478924, + "grad_norm": 2.2712674140930176, + "learning_rate": 4.996265026847378e-05, + "loss": 6.175, + "step": 2927 + }, + { + "epoch": 0.01741364544675992, + "grad_norm": 3.0724384784698486, + "learning_rate": 4.996262474092547e-05, + "loss": 6.5354, + "step": 2928 + }, + { + "epoch": 0.017419592730040918, + "grad_norm": 4.872220039367676, + "learning_rate": 4.996259920466297e-05, + "loss": 6.1938, + "step": 2929 + }, + { + "epoch": 0.017425540013321916, + "grad_norm": 4.508706569671631, + "learning_rate": 4.996257365968629e-05, + "loss": 6.1813, + "step": 2930 + }, + { + "epoch": 0.01743148729660291, + "grad_norm": 3.0419485569000244, + "learning_rate": 4.996254810599543e-05, + "loss": 5.9529, + "step": 2931 + }, + { + "epoch": 0.01743743457988391, + "grad_norm": 2.8372066020965576, + "learning_rate": 4.996252254359041e-05, + "loss": 5.9422, + "step": 2932 + }, + { + "epoch": 0.017443381863164905, + "grad_norm": 4.554285526275635, + "learning_rate": 4.996249697247122e-05, + "loss": 6.9073, + "step": 2933 + }, + { + "epoch": 0.017449329146445904, + "grad_norm": 3.121094226837158, + "learning_rate": 4.996247139263788e-05, + "loss": 6.2827, + "step": 2934 + }, + { + "epoch": 0.017455276429726902, + "grad_norm": 3.936596632003784, + "learning_rate": 4.996244580409041e-05, + "loss": 6.7863, + "step": 2935 + }, + { + "epoch": 0.017461223713007897, + "grad_norm": 3.5771539211273193, + "learning_rate": 4.99624202068288e-05, + "loss": 7.0691, + "step": 2936 + }, + { + "epoch": 0.017467170996288896, + "grad_norm": 2.0674471855163574, + "learning_rate": 4.996239460085307e-05, + "loss": 6.9768, + "step": 2937 + }, + { + "epoch": 0.01747311827956989, + "grad_norm": 2.600167989730835, + "learning_rate": 4.996236898616322e-05, + "loss": 6.4235, + "step": 2938 + }, + { + "epoch": 0.01747906556285089, + "grad_norm": 2.9444847106933594, + "learning_rate": 4.9962343362759267e-05, + "loss": 6.7305, + "step": 2939 + }, + { + "epoch": 0.017485012846131888, + "grad_norm": 3.721101999282837, + "learning_rate": 4.996231773064122e-05, + "loss": 6.5147, + "step": 2940 + }, + { + "epoch": 0.017490960129412883, + "grad_norm": 5.715269565582275, + "learning_rate": 4.9962292089809086e-05, + "loss": 6.1433, + "step": 2941 + }, + { + "epoch": 0.01749690741269388, + "grad_norm": 4.245530128479004, + "learning_rate": 4.996226644026287e-05, + "loss": 6.2163, + "step": 2942 + }, + { + "epoch": 0.01750285469597488, + "grad_norm": 2.7717039585113525, + "learning_rate": 4.996224078200259e-05, + "loss": 5.877, + "step": 2943 + }, + { + "epoch": 0.017508801979255875, + "grad_norm": 3.4189441204071045, + "learning_rate": 4.9962215115028255e-05, + "loss": 5.9575, + "step": 2944 + }, + { + "epoch": 0.017514749262536874, + "grad_norm": 3.754513740539551, + "learning_rate": 4.996218943933986e-05, + "loss": 5.7512, + "step": 2945 + }, + { + "epoch": 0.01752069654581787, + "grad_norm": 3.4231228828430176, + "learning_rate": 4.9962163754937426e-05, + "loss": 6.4566, + "step": 2946 + }, + { + "epoch": 0.017526643829098867, + "grad_norm": 2.7481472492218018, + "learning_rate": 4.996213806182095e-05, + "loss": 6.1385, + "step": 2947 + }, + { + "epoch": 0.017532591112379866, + "grad_norm": 2.802342414855957, + "learning_rate": 4.996211235999046e-05, + "loss": 5.6656, + "step": 2948 + }, + { + "epoch": 0.01753853839566086, + "grad_norm": 2.60530686378479, + "learning_rate": 4.996208664944595e-05, + "loss": 5.7339, + "step": 2949 + }, + { + "epoch": 0.01754448567894186, + "grad_norm": 2.476100206375122, + "learning_rate": 4.996206093018744e-05, + "loss": 6.0447, + "step": 2950 + }, + { + "epoch": 0.017550432962222858, + "grad_norm": 2.3516924381256104, + "learning_rate": 4.9962035202214916e-05, + "loss": 6.2046, + "step": 2951 + }, + { + "epoch": 0.017556380245503853, + "grad_norm": 2.447519302368164, + "learning_rate": 4.996200946552842e-05, + "loss": 6.0279, + "step": 2952 + }, + { + "epoch": 0.017562327528784852, + "grad_norm": 2.679766893386841, + "learning_rate": 4.996198372012794e-05, + "loss": 5.9072, + "step": 2953 + }, + { + "epoch": 0.017568274812065847, + "grad_norm": 2.3413944244384766, + "learning_rate": 4.9961957966013486e-05, + "loss": 5.9214, + "step": 2954 + }, + { + "epoch": 0.017574222095346845, + "grad_norm": 2.273725986480713, + "learning_rate": 4.996193220318507e-05, + "loss": 6.2107, + "step": 2955 + }, + { + "epoch": 0.017580169378627844, + "grad_norm": 2.9424052238464355, + "learning_rate": 4.99619064316427e-05, + "loss": 5.8618, + "step": 2956 + }, + { + "epoch": 0.01758611666190884, + "grad_norm": 2.40987229347229, + "learning_rate": 4.9961880651386394e-05, + "loss": 6.1306, + "step": 2957 + }, + { + "epoch": 0.017592063945189838, + "grad_norm": 2.542084217071533, + "learning_rate": 4.9961854862416144e-05, + "loss": 6.2225, + "step": 2958 + }, + { + "epoch": 0.017598011228470836, + "grad_norm": 2.06935977935791, + "learning_rate": 4.996182906473198e-05, + "loss": 5.9899, + "step": 2959 + }, + { + "epoch": 0.01760395851175183, + "grad_norm": 2.1998584270477295, + "learning_rate": 4.99618032583339e-05, + "loss": 6.2268, + "step": 2960 + }, + { + "epoch": 0.01760990579503283, + "grad_norm": 2.5595617294311523, + "learning_rate": 4.99617774432219e-05, + "loss": 6.2856, + "step": 2961 + }, + { + "epoch": 0.017615853078313825, + "grad_norm": 2.9262382984161377, + "learning_rate": 4.9961751619396e-05, + "loss": 6.2747, + "step": 2962 + }, + { + "epoch": 0.017621800361594823, + "grad_norm": 2.3705809116363525, + "learning_rate": 4.996172578685622e-05, + "loss": 6.1376, + "step": 2963 + }, + { + "epoch": 0.017627747644875822, + "grad_norm": 2.20991849899292, + "learning_rate": 4.996169994560256e-05, + "loss": 6.0118, + "step": 2964 + }, + { + "epoch": 0.017633694928156817, + "grad_norm": 2.2801706790924072, + "learning_rate": 4.996167409563502e-05, + "loss": 6.0924, + "step": 2965 + }, + { + "epoch": 0.017639642211437816, + "grad_norm": 2.5618062019348145, + "learning_rate": 4.996164823695362e-05, + "loss": 6.0931, + "step": 2966 + }, + { + "epoch": 0.01764558949471881, + "grad_norm": 2.2933573722839355, + "learning_rate": 4.996162236955837e-05, + "loss": 6.1584, + "step": 2967 + }, + { + "epoch": 0.01765153677799981, + "grad_norm": 2.2387471199035645, + "learning_rate": 4.996159649344928e-05, + "loss": 6.1224, + "step": 2968 + }, + { + "epoch": 0.017657484061280808, + "grad_norm": 2.425929069519043, + "learning_rate": 4.9961570608626347e-05, + "loss": 6.2419, + "step": 2969 + }, + { + "epoch": 0.017663431344561803, + "grad_norm": 3.0279812812805176, + "learning_rate": 4.996154471508959e-05, + "loss": 6.0478, + "step": 2970 + }, + { + "epoch": 0.0176693786278428, + "grad_norm": 2.8950276374816895, + "learning_rate": 4.9961518812839015e-05, + "loss": 5.9663, + "step": 2971 + }, + { + "epoch": 0.0176753259111238, + "grad_norm": 2.9908859729766846, + "learning_rate": 4.996149290187463e-05, + "loss": 5.8101, + "step": 2972 + }, + { + "epoch": 0.017681273194404795, + "grad_norm": 2.900987148284912, + "learning_rate": 4.996146698219645e-05, + "loss": 6.133, + "step": 2973 + }, + { + "epoch": 0.017687220477685794, + "grad_norm": 3.3194754123687744, + "learning_rate": 4.996144105380447e-05, + "loss": 5.9763, + "step": 2974 + }, + { + "epoch": 0.01769316776096679, + "grad_norm": 2.4997923374176025, + "learning_rate": 4.996141511669872e-05, + "loss": 6.1062, + "step": 2975 + }, + { + "epoch": 0.017699115044247787, + "grad_norm": 2.3048369884490967, + "learning_rate": 4.996138917087919e-05, + "loss": 6.138, + "step": 2976 + }, + { + "epoch": 0.017705062327528786, + "grad_norm": 2.3391027450561523, + "learning_rate": 4.99613632163459e-05, + "loss": 6.0612, + "step": 2977 + }, + { + "epoch": 0.01771100961080978, + "grad_norm": 2.6164605617523193, + "learning_rate": 4.996133725309886e-05, + "loss": 6.0402, + "step": 2978 + }, + { + "epoch": 0.01771695689409078, + "grad_norm": 2.6534295082092285, + "learning_rate": 4.996131128113807e-05, + "loss": 5.9027, + "step": 2979 + }, + { + "epoch": 0.017722904177371778, + "grad_norm": 2.1807172298431396, + "learning_rate": 4.996128530046354e-05, + "loss": 5.7083, + "step": 2980 + }, + { + "epoch": 0.017728851460652773, + "grad_norm": 2.433762550354004, + "learning_rate": 4.9961259311075296e-05, + "loss": 6.1587, + "step": 2981 + }, + { + "epoch": 0.017734798743933772, + "grad_norm": 2.4656107425689697, + "learning_rate": 4.996123331297333e-05, + "loss": 5.9831, + "step": 2982 + }, + { + "epoch": 0.017740746027214767, + "grad_norm": 2.536060333251953, + "learning_rate": 4.996120730615765e-05, + "loss": 5.9083, + "step": 2983 + }, + { + "epoch": 0.017746693310495765, + "grad_norm": 2.2993409633636475, + "learning_rate": 4.996118129062828e-05, + "loss": 6.0156, + "step": 2984 + }, + { + "epoch": 0.017752640593776764, + "grad_norm": 2.0221481323242188, + "learning_rate": 4.996115526638521e-05, + "loss": 5.9836, + "step": 2985 + }, + { + "epoch": 0.01775858787705776, + "grad_norm": 2.401350498199463, + "learning_rate": 4.996112923342846e-05, + "loss": 5.8071, + "step": 2986 + }, + { + "epoch": 0.017764535160338758, + "grad_norm": 2.469214677810669, + "learning_rate": 4.996110319175804e-05, + "loss": 5.8784, + "step": 2987 + }, + { + "epoch": 0.017770482443619756, + "grad_norm": 2.454481601715088, + "learning_rate": 4.9961077141373955e-05, + "loss": 5.9168, + "step": 2988 + }, + { + "epoch": 0.01777642972690075, + "grad_norm": 2.3173487186431885, + "learning_rate": 4.996105108227621e-05, + "loss": 5.8797, + "step": 2989 + }, + { + "epoch": 0.01778237701018175, + "grad_norm": 2.1967554092407227, + "learning_rate": 4.996102501446483e-05, + "loss": 5.972, + "step": 2990 + }, + { + "epoch": 0.017788324293462745, + "grad_norm": 2.1263201236724854, + "learning_rate": 4.996099893793981e-05, + "loss": 5.9301, + "step": 2991 + }, + { + "epoch": 0.017794271576743743, + "grad_norm": 2.1959195137023926, + "learning_rate": 4.9960972852701165e-05, + "loss": 6.0422, + "step": 2992 + }, + { + "epoch": 0.017800218860024742, + "grad_norm": 2.3290374279022217, + "learning_rate": 4.99609467587489e-05, + "loss": 6.1926, + "step": 2993 + }, + { + "epoch": 0.017806166143305737, + "grad_norm": 2.3518059253692627, + "learning_rate": 4.996092065608303e-05, + "loss": 5.8583, + "step": 2994 + }, + { + "epoch": 0.017812113426586736, + "grad_norm": 2.4263339042663574, + "learning_rate": 4.996089454470355e-05, + "loss": 5.8149, + "step": 2995 + }, + { + "epoch": 0.01781806070986773, + "grad_norm": 2.0764389038085938, + "learning_rate": 4.99608684246105e-05, + "loss": 5.8782, + "step": 2996 + }, + { + "epoch": 0.01782400799314873, + "grad_norm": 2.086904764175415, + "learning_rate": 4.996084229580385e-05, + "loss": 5.7885, + "step": 2997 + }, + { + "epoch": 0.017829955276429728, + "grad_norm": 2.1907291412353516, + "learning_rate": 4.996081615828363e-05, + "loss": 5.9246, + "step": 2998 + }, + { + "epoch": 0.017835902559710723, + "grad_norm": 2.4596495628356934, + "learning_rate": 4.9960790012049854e-05, + "loss": 5.7786, + "step": 2999 + }, + { + "epoch": 0.01784184984299172, + "grad_norm": 2.0762453079223633, + "learning_rate": 4.996076385710252e-05, + "loss": 5.9901, + "step": 3000 + }, + { + "epoch": 0.01784779712627272, + "grad_norm": 2.068714141845703, + "learning_rate": 4.996073769344164e-05, + "loss": 5.9437, + "step": 3001 + }, + { + "epoch": 0.017853744409553715, + "grad_norm": 2.4760496616363525, + "learning_rate": 4.9960711521067226e-05, + "loss": 5.8633, + "step": 3002 + }, + { + "epoch": 0.017859691692834714, + "grad_norm": 2.395643949508667, + "learning_rate": 4.996068533997928e-05, + "loss": 5.8024, + "step": 3003 + }, + { + "epoch": 0.01786563897611571, + "grad_norm": 2.120586633682251, + "learning_rate": 4.996065915017783e-05, + "loss": 6.0712, + "step": 3004 + }, + { + "epoch": 0.017871586259396707, + "grad_norm": 2.384794235229492, + "learning_rate": 4.9960632951662866e-05, + "loss": 5.9089, + "step": 3005 + }, + { + "epoch": 0.017877533542677706, + "grad_norm": 2.24297833442688, + "learning_rate": 4.99606067444344e-05, + "loss": 6.0263, + "step": 3006 + }, + { + "epoch": 0.0178834808259587, + "grad_norm": 1.983299732208252, + "learning_rate": 4.996058052849245e-05, + "loss": 5.8706, + "step": 3007 + }, + { + "epoch": 0.0178894281092397, + "grad_norm": 2.2866950035095215, + "learning_rate": 4.996055430383701e-05, + "loss": 5.9031, + "step": 3008 + }, + { + "epoch": 0.017895375392520698, + "grad_norm": 2.3343560695648193, + "learning_rate": 4.996052807046811e-05, + "loss": 5.9155, + "step": 3009 + }, + { + "epoch": 0.017901322675801693, + "grad_norm": 2.079763650894165, + "learning_rate": 4.9960501828385734e-05, + "loss": 5.8102, + "step": 3010 + }, + { + "epoch": 0.01790726995908269, + "grad_norm": 2.0398895740509033, + "learning_rate": 4.996047557758991e-05, + "loss": 5.773, + "step": 3011 + }, + { + "epoch": 0.017913217242363687, + "grad_norm": 2.2478318214416504, + "learning_rate": 4.996044931808064e-05, + "loss": 5.8584, + "step": 3012 + }, + { + "epoch": 0.017919164525644685, + "grad_norm": 2.301398992538452, + "learning_rate": 4.996042304985794e-05, + "loss": 5.9053, + "step": 3013 + }, + { + "epoch": 0.017925111808925684, + "grad_norm": 2.0428216457366943, + "learning_rate": 4.996039677292181e-05, + "loss": 5.9571, + "step": 3014 + }, + { + "epoch": 0.01793105909220668, + "grad_norm": 2.049572467803955, + "learning_rate": 4.9960370487272266e-05, + "loss": 5.9464, + "step": 3015 + }, + { + "epoch": 0.017937006375487678, + "grad_norm": 2.1681618690490723, + "learning_rate": 4.996034419290931e-05, + "loss": 5.9969, + "step": 3016 + }, + { + "epoch": 0.017942953658768676, + "grad_norm": 2.3879425525665283, + "learning_rate": 4.996031788983296e-05, + "loss": 5.7962, + "step": 3017 + }, + { + "epoch": 0.01794890094204967, + "grad_norm": 2.232508420944214, + "learning_rate": 4.996029157804323e-05, + "loss": 5.8479, + "step": 3018 + }, + { + "epoch": 0.01795484822533067, + "grad_norm": 2.222257137298584, + "learning_rate": 4.9960265257540104e-05, + "loss": 5.952, + "step": 3019 + }, + { + "epoch": 0.017960795508611665, + "grad_norm": 2.213777542114258, + "learning_rate": 4.996023892832362e-05, + "loss": 5.9891, + "step": 3020 + }, + { + "epoch": 0.017966742791892663, + "grad_norm": 2.286097764968872, + "learning_rate": 4.996021259039377e-05, + "loss": 5.8995, + "step": 3021 + }, + { + "epoch": 0.017972690075173662, + "grad_norm": 2.1588432788848877, + "learning_rate": 4.996018624375056e-05, + "loss": 5.988, + "step": 3022 + }, + { + "epoch": 0.017978637358454657, + "grad_norm": 2.2468602657318115, + "learning_rate": 4.996015988839402e-05, + "loss": 5.9303, + "step": 3023 + }, + { + "epoch": 0.017984584641735656, + "grad_norm": 2.1732120513916016, + "learning_rate": 4.9960133524324135e-05, + "loss": 5.8696, + "step": 3024 + }, + { + "epoch": 0.01799053192501665, + "grad_norm": 2.2985105514526367, + "learning_rate": 4.996010715154093e-05, + "loss": 5.9251, + "step": 3025 + }, + { + "epoch": 0.01799647920829765, + "grad_norm": 2.1920788288116455, + "learning_rate": 4.996008077004441e-05, + "loss": 5.8023, + "step": 3026 + }, + { + "epoch": 0.018002426491578648, + "grad_norm": 1.9393725395202637, + "learning_rate": 4.996005437983458e-05, + "loss": 5.9576, + "step": 3027 + }, + { + "epoch": 0.018008373774859643, + "grad_norm": 2.115035057067871, + "learning_rate": 4.9960027980911455e-05, + "loss": 5.9105, + "step": 3028 + }, + { + "epoch": 0.01801432105814064, + "grad_norm": 2.143432855606079, + "learning_rate": 4.996000157327504e-05, + "loss": 5.9951, + "step": 3029 + }, + { + "epoch": 0.01802026834142164, + "grad_norm": 2.4353296756744385, + "learning_rate": 4.995997515692536e-05, + "loss": 5.9761, + "step": 3030 + }, + { + "epoch": 0.018026215624702635, + "grad_norm": 1.999054193496704, + "learning_rate": 4.995994873186239e-05, + "loss": 6.028, + "step": 3031 + }, + { + "epoch": 0.018032162907983634, + "grad_norm": 2.05645751953125, + "learning_rate": 4.995992229808617e-05, + "loss": 5.9778, + "step": 3032 + }, + { + "epoch": 0.01803811019126463, + "grad_norm": 1.948923110961914, + "learning_rate": 4.99598958555967e-05, + "loss": 5.8735, + "step": 3033 + }, + { + "epoch": 0.018044057474545627, + "grad_norm": 2.1208486557006836, + "learning_rate": 4.995986940439399e-05, + "loss": 5.7913, + "step": 3034 + }, + { + "epoch": 0.018050004757826626, + "grad_norm": 2.051079750061035, + "learning_rate": 4.995984294447804e-05, + "loss": 5.8097, + "step": 3035 + }, + { + "epoch": 0.01805595204110762, + "grad_norm": 2.021207571029663, + "learning_rate": 4.995981647584887e-05, + "loss": 5.8425, + "step": 3036 + }, + { + "epoch": 0.01806189932438862, + "grad_norm": 2.471315622329712, + "learning_rate": 4.995978999850649e-05, + "loss": 5.7735, + "step": 3037 + }, + { + "epoch": 0.018067846607669618, + "grad_norm": 2.604836940765381, + "learning_rate": 4.9959763512450896e-05, + "loss": 6.4525, + "step": 3038 + }, + { + "epoch": 0.018073793890950613, + "grad_norm": 2.375361919403076, + "learning_rate": 4.995973701768212e-05, + "loss": 5.8072, + "step": 3039 + }, + { + "epoch": 0.01807974117423161, + "grad_norm": 2.354280471801758, + "learning_rate": 4.995971051420014e-05, + "loss": 5.9434, + "step": 3040 + }, + { + "epoch": 0.018085688457512607, + "grad_norm": 2.7335755825042725, + "learning_rate": 4.9959684002005e-05, + "loss": 5.5899, + "step": 3041 + }, + { + "epoch": 0.018091635740793605, + "grad_norm": 2.244917869567871, + "learning_rate": 4.995965748109668e-05, + "loss": 5.799, + "step": 3042 + }, + { + "epoch": 0.018097583024074604, + "grad_norm": 2.2413697242736816, + "learning_rate": 4.995963095147521e-05, + "loss": 5.8635, + "step": 3043 + }, + { + "epoch": 0.0181035303073556, + "grad_norm": 2.122586488723755, + "learning_rate": 4.9959604413140584e-05, + "loss": 5.8098, + "step": 3044 + }, + { + "epoch": 0.018109477590636597, + "grad_norm": 2.407517910003662, + "learning_rate": 4.995957786609282e-05, + "loss": 6.0319, + "step": 3045 + }, + { + "epoch": 0.018115424873917596, + "grad_norm": 2.5628743171691895, + "learning_rate": 4.9959551310331934e-05, + "loss": 5.9561, + "step": 3046 + }, + { + "epoch": 0.01812137215719859, + "grad_norm": 2.335650682449341, + "learning_rate": 4.995952474585791e-05, + "loss": 6.1168, + "step": 3047 + }, + { + "epoch": 0.01812731944047959, + "grad_norm": 2.169771432876587, + "learning_rate": 4.995949817267078e-05, + "loss": 6.0555, + "step": 3048 + }, + { + "epoch": 0.018133266723760585, + "grad_norm": 2.2245211601257324, + "learning_rate": 4.995947159077056e-05, + "loss": 5.9084, + "step": 3049 + }, + { + "epoch": 0.018139214007041583, + "grad_norm": 2.2296931743621826, + "learning_rate": 4.995944500015723e-05, + "loss": 5.8878, + "step": 3050 + }, + { + "epoch": 0.018145161290322582, + "grad_norm": 2.2372493743896484, + "learning_rate": 4.995941840083082e-05, + "loss": 5.9521, + "step": 3051 + }, + { + "epoch": 0.018151108573603577, + "grad_norm": 2.1773006916046143, + "learning_rate": 4.995939179279134e-05, + "loss": 5.899, + "step": 3052 + }, + { + "epoch": 0.018157055856884576, + "grad_norm": 2.218245267868042, + "learning_rate": 4.995936517603879e-05, + "loss": 6.0311, + "step": 3053 + }, + { + "epoch": 0.018163003140165574, + "grad_norm": 2.2877273559570312, + "learning_rate": 4.995933855057318e-05, + "loss": 6.0052, + "step": 3054 + }, + { + "epoch": 0.01816895042344657, + "grad_norm": 2.225764751434326, + "learning_rate": 4.995931191639453e-05, + "loss": 6.0373, + "step": 3055 + }, + { + "epoch": 0.018174897706727568, + "grad_norm": 2.5069313049316406, + "learning_rate": 4.995928527350284e-05, + "loss": 5.8729, + "step": 3056 + }, + { + "epoch": 0.018180844990008563, + "grad_norm": 2.089759588241577, + "learning_rate": 4.995925862189812e-05, + "loss": 5.9462, + "step": 3057 + }, + { + "epoch": 0.01818679227328956, + "grad_norm": 2.0159049034118652, + "learning_rate": 4.9959231961580376e-05, + "loss": 5.9276, + "step": 3058 + }, + { + "epoch": 0.01819273955657056, + "grad_norm": 2.207636594772339, + "learning_rate": 4.995920529254963e-05, + "loss": 5.9921, + "step": 3059 + }, + { + "epoch": 0.018198686839851555, + "grad_norm": 2.380232810974121, + "learning_rate": 4.995917861480588e-05, + "loss": 5.9092, + "step": 3060 + }, + { + "epoch": 0.018204634123132554, + "grad_norm": 2.073237895965576, + "learning_rate": 4.9959151928349134e-05, + "loss": 5.8472, + "step": 3061 + }, + { + "epoch": 0.01821058140641355, + "grad_norm": 1.824062705039978, + "learning_rate": 4.995912523317942e-05, + "loss": 5.7958, + "step": 3062 + }, + { + "epoch": 0.018216528689694547, + "grad_norm": 2.3961215019226074, + "learning_rate": 4.995909852929672e-05, + "loss": 6.1388, + "step": 3063 + }, + { + "epoch": 0.018222475972975546, + "grad_norm": 2.8391239643096924, + "learning_rate": 4.9959071816701065e-05, + "loss": 5.7564, + "step": 3064 + }, + { + "epoch": 0.01822842325625654, + "grad_norm": 2.4684112071990967, + "learning_rate": 4.995904509539244e-05, + "loss": 5.8372, + "step": 3065 + }, + { + "epoch": 0.01823437053953754, + "grad_norm": 2.419983386993408, + "learning_rate": 4.995901836537089e-05, + "loss": 5.9332, + "step": 3066 + }, + { + "epoch": 0.018240317822818538, + "grad_norm": 2.500227928161621, + "learning_rate": 4.99589916266364e-05, + "loss": 6.0848, + "step": 3067 + }, + { + "epoch": 0.018246265106099533, + "grad_norm": 2.1683971881866455, + "learning_rate": 4.9958964879188976e-05, + "loss": 6.0911, + "step": 3068 + }, + { + "epoch": 0.01825221238938053, + "grad_norm": 2.2345223426818848, + "learning_rate": 4.995893812302864e-05, + "loss": 6.016, + "step": 3069 + }, + { + "epoch": 0.018258159672661527, + "grad_norm": 2.318321466445923, + "learning_rate": 4.995891135815539e-05, + "loss": 5.9622, + "step": 3070 + }, + { + "epoch": 0.018264106955942525, + "grad_norm": 2.294602155685425, + "learning_rate": 4.9958884584569255e-05, + "loss": 5.8908, + "step": 3071 + }, + { + "epoch": 0.018270054239223524, + "grad_norm": 2.5472419261932373, + "learning_rate": 4.995885780227022e-05, + "loss": 5.7906, + "step": 3072 + }, + { + "epoch": 0.01827600152250452, + "grad_norm": 2.319101095199585, + "learning_rate": 4.995883101125831e-05, + "loss": 6.3366, + "step": 3073 + }, + { + "epoch": 0.018281948805785517, + "grad_norm": 2.3564186096191406, + "learning_rate": 4.995880421153353e-05, + "loss": 5.9863, + "step": 3074 + }, + { + "epoch": 0.018287896089066516, + "grad_norm": 2.434756278991699, + "learning_rate": 4.995877740309589e-05, + "loss": 5.885, + "step": 3075 + }, + { + "epoch": 0.01829384337234751, + "grad_norm": 2.062861442565918, + "learning_rate": 4.99587505859454e-05, + "loss": 6.0813, + "step": 3076 + }, + { + "epoch": 0.01829979065562851, + "grad_norm": 2.127049684524536, + "learning_rate": 4.995872376008206e-05, + "loss": 6.1226, + "step": 3077 + }, + { + "epoch": 0.018305737938909505, + "grad_norm": 2.288405656814575, + "learning_rate": 4.995869692550589e-05, + "loss": 5.9625, + "step": 3078 + }, + { + "epoch": 0.018311685222190503, + "grad_norm": 2.2387006282806396, + "learning_rate": 4.9958670082216905e-05, + "loss": 5.9479, + "step": 3079 + }, + { + "epoch": 0.018317632505471502, + "grad_norm": 2.18864107131958, + "learning_rate": 4.9958643230215096e-05, + "loss": 5.9223, + "step": 3080 + }, + { + "epoch": 0.018323579788752497, + "grad_norm": 2.3457415103912354, + "learning_rate": 4.995861636950049e-05, + "loss": 5.7857, + "step": 3081 + }, + { + "epoch": 0.018329527072033495, + "grad_norm": 2.6946494579315186, + "learning_rate": 4.995858950007309e-05, + "loss": 5.5546, + "step": 3082 + }, + { + "epoch": 0.018335474355314494, + "grad_norm": 2.5135412216186523, + "learning_rate": 4.99585626219329e-05, + "loss": 5.5624, + "step": 3083 + }, + { + "epoch": 0.01834142163859549, + "grad_norm": 2.6617767810821533, + "learning_rate": 4.9958535735079934e-05, + "loss": 5.8789, + "step": 3084 + }, + { + "epoch": 0.018347368921876488, + "grad_norm": 2.099261522293091, + "learning_rate": 4.9958508839514196e-05, + "loss": 5.9365, + "step": 3085 + }, + { + "epoch": 0.018353316205157483, + "grad_norm": 2.5267064571380615, + "learning_rate": 4.9958481935235715e-05, + "loss": 6.0935, + "step": 3086 + }, + { + "epoch": 0.01835926348843848, + "grad_norm": 2.3353283405303955, + "learning_rate": 4.995845502224447e-05, + "loss": 5.909, + "step": 3087 + }, + { + "epoch": 0.01836521077171948, + "grad_norm": 2.396430492401123, + "learning_rate": 4.9958428100540496e-05, + "loss": 6.0272, + "step": 3088 + }, + { + "epoch": 0.018371158055000475, + "grad_norm": 2.095308303833008, + "learning_rate": 4.9958401170123784e-05, + "loss": 5.9791, + "step": 3089 + }, + { + "epoch": 0.018377105338281473, + "grad_norm": 2.7606077194213867, + "learning_rate": 4.9958374230994357e-05, + "loss": 5.9716, + "step": 3090 + }, + { + "epoch": 0.01838305262156247, + "grad_norm": 2.4490914344787598, + "learning_rate": 4.995834728315222e-05, + "loss": 5.8763, + "step": 3091 + }, + { + "epoch": 0.018388999904843467, + "grad_norm": 2.709092855453491, + "learning_rate": 4.9958320326597385e-05, + "loss": 5.74, + "step": 3092 + }, + { + "epoch": 0.018394947188124466, + "grad_norm": 2.8829305171966553, + "learning_rate": 4.9958293361329856e-05, + "loss": 5.8469, + "step": 3093 + }, + { + "epoch": 0.01840089447140546, + "grad_norm": 2.6500396728515625, + "learning_rate": 4.995826638734964e-05, + "loss": 5.8578, + "step": 3094 + }, + { + "epoch": 0.01840684175468646, + "grad_norm": 2.0665056705474854, + "learning_rate": 4.9958239404656755e-05, + "loss": 5.9662, + "step": 3095 + }, + { + "epoch": 0.018412789037967458, + "grad_norm": 2.3198931217193604, + "learning_rate": 4.9958212413251205e-05, + "loss": 6.0663, + "step": 3096 + }, + { + "epoch": 0.018418736321248453, + "grad_norm": 2.9056031703948975, + "learning_rate": 4.9958185413133e-05, + "loss": 5.8015, + "step": 3097 + }, + { + "epoch": 0.01842468360452945, + "grad_norm": 2.446164131164551, + "learning_rate": 4.995815840430216e-05, + "loss": 5.6878, + "step": 3098 + }, + { + "epoch": 0.018430630887810447, + "grad_norm": 2.797506093978882, + "learning_rate": 4.995813138675867e-05, + "loss": 5.7675, + "step": 3099 + }, + { + "epoch": 0.018436578171091445, + "grad_norm": 3.2914962768554688, + "learning_rate": 4.995810436050256e-05, + "loss": 6.3661, + "step": 3100 + }, + { + "epoch": 0.018442525454372444, + "grad_norm": 2.444363594055176, + "learning_rate": 4.995807732553384e-05, + "loss": 5.9251, + "step": 3101 + }, + { + "epoch": 0.01844847273765344, + "grad_norm": 2.526951551437378, + "learning_rate": 4.9958050281852505e-05, + "loss": 5.8202, + "step": 3102 + }, + { + "epoch": 0.018454420020934437, + "grad_norm": 2.2046117782592773, + "learning_rate": 4.995802322945857e-05, + "loss": 6.0572, + "step": 3103 + }, + { + "epoch": 0.018460367304215436, + "grad_norm": 2.5484018325805664, + "learning_rate": 4.9957996168352055e-05, + "loss": 6.1215, + "step": 3104 + }, + { + "epoch": 0.01846631458749643, + "grad_norm": 2.4785003662109375, + "learning_rate": 4.9957969098532965e-05, + "loss": 5.9524, + "step": 3105 + }, + { + "epoch": 0.01847226187077743, + "grad_norm": 2.9028711318969727, + "learning_rate": 4.9957942020001294e-05, + "loss": 6.1175, + "step": 3106 + }, + { + "epoch": 0.018478209154058425, + "grad_norm": 2.1766602993011475, + "learning_rate": 4.995791493275707e-05, + "loss": 5.9746, + "step": 3107 + }, + { + "epoch": 0.018484156437339423, + "grad_norm": 2.079423189163208, + "learning_rate": 4.995788783680029e-05, + "loss": 5.9463, + "step": 3108 + }, + { + "epoch": 0.018490103720620422, + "grad_norm": 2.285184144973755, + "learning_rate": 4.995786073213098e-05, + "loss": 5.5174, + "step": 3109 + }, + { + "epoch": 0.018496051003901417, + "grad_norm": 2.170018196105957, + "learning_rate": 4.9957833618749126e-05, + "loss": 5.7948, + "step": 3110 + }, + { + "epoch": 0.018501998287182415, + "grad_norm": 2.284517526626587, + "learning_rate": 4.9957806496654754e-05, + "loss": 5.9455, + "step": 3111 + }, + { + "epoch": 0.018507945570463414, + "grad_norm": 2.5539982318878174, + "learning_rate": 4.9957779365847876e-05, + "loss": 5.9791, + "step": 3112 + }, + { + "epoch": 0.01851389285374441, + "grad_norm": 2.1735522747039795, + "learning_rate": 4.995775222632849e-05, + "loss": 5.9549, + "step": 3113 + }, + { + "epoch": 0.018519840137025408, + "grad_norm": 2.2272653579711914, + "learning_rate": 4.995772507809662e-05, + "loss": 5.8618, + "step": 3114 + }, + { + "epoch": 0.018525787420306403, + "grad_norm": 1.9390417337417603, + "learning_rate": 4.995769792115225e-05, + "loss": 5.9617, + "step": 3115 + }, + { + "epoch": 0.0185317347035874, + "grad_norm": 2.6526312828063965, + "learning_rate": 4.9957670755495414e-05, + "loss": 5.9296, + "step": 3116 + }, + { + "epoch": 0.0185376819868684, + "grad_norm": 2.533996105194092, + "learning_rate": 4.995764358112611e-05, + "loss": 6.0045, + "step": 3117 + }, + { + "epoch": 0.018543629270149395, + "grad_norm": 2.183347225189209, + "learning_rate": 4.995761639804436e-05, + "loss": 5.9254, + "step": 3118 + }, + { + "epoch": 0.018549576553430393, + "grad_norm": 1.9411321878433228, + "learning_rate": 4.995758920625015e-05, + "loss": 5.9404, + "step": 3119 + }, + { + "epoch": 0.01855552383671139, + "grad_norm": 4.914453029632568, + "learning_rate": 4.9957562005743514e-05, + "loss": 5.8139, + "step": 3120 + }, + { + "epoch": 0.018561471119992387, + "grad_norm": 2.3052754402160645, + "learning_rate": 4.9957534796524444e-05, + "loss": 5.6525, + "step": 3121 + }, + { + "epoch": 0.018567418403273386, + "grad_norm": 2.424464464187622, + "learning_rate": 4.995750757859296e-05, + "loss": 5.9599, + "step": 3122 + }, + { + "epoch": 0.01857336568655438, + "grad_norm": 2.1392033100128174, + "learning_rate": 4.995748035194907e-05, + "loss": 5.9558, + "step": 3123 + }, + { + "epoch": 0.01857931296983538, + "grad_norm": 4.67656135559082, + "learning_rate": 4.995745311659278e-05, + "loss": 5.7606, + "step": 3124 + }, + { + "epoch": 0.018585260253116378, + "grad_norm": 2.0772082805633545, + "learning_rate": 4.99574258725241e-05, + "loss": 5.9328, + "step": 3125 + }, + { + "epoch": 0.018591207536397373, + "grad_norm": 2.0255486965179443, + "learning_rate": 4.995739861974303e-05, + "loss": 5.9395, + "step": 3126 + }, + { + "epoch": 0.01859715481967837, + "grad_norm": 2.3629064559936523, + "learning_rate": 4.995737135824961e-05, + "loss": 5.9663, + "step": 3127 + }, + { + "epoch": 0.018603102102959367, + "grad_norm": 1.9924237728118896, + "learning_rate": 4.9957344088043814e-05, + "loss": 5.8998, + "step": 3128 + }, + { + "epoch": 0.018609049386240365, + "grad_norm": 2.096774101257324, + "learning_rate": 4.9957316809125676e-05, + "loss": 5.7178, + "step": 3129 + }, + { + "epoch": 0.018614996669521364, + "grad_norm": 2.2288100719451904, + "learning_rate": 4.9957289521495194e-05, + "loss": 5.9096, + "step": 3130 + }, + { + "epoch": 0.01862094395280236, + "grad_norm": 2.456099033355713, + "learning_rate": 4.995726222515238e-05, + "loss": 5.7738, + "step": 3131 + }, + { + "epoch": 0.018626891236083357, + "grad_norm": 2.238218069076538, + "learning_rate": 4.995723492009724e-05, + "loss": 5.6929, + "step": 3132 + }, + { + "epoch": 0.018632838519364356, + "grad_norm": 1.8309845924377441, + "learning_rate": 4.9957207606329795e-05, + "loss": 5.9339, + "step": 3133 + }, + { + "epoch": 0.01863878580264535, + "grad_norm": 1.9269503355026245, + "learning_rate": 4.995718028385003e-05, + "loss": 5.9704, + "step": 3134 + }, + { + "epoch": 0.01864473308592635, + "grad_norm": 2.0929813385009766, + "learning_rate": 4.9957152952657995e-05, + "loss": 5.7598, + "step": 3135 + }, + { + "epoch": 0.018650680369207345, + "grad_norm": 2.2813265323638916, + "learning_rate": 4.995712561275366e-05, + "loss": 5.7986, + "step": 3136 + }, + { + "epoch": 0.018656627652488343, + "grad_norm": 2.1189653873443604, + "learning_rate": 4.995709826413705e-05, + "loss": 5.6603, + "step": 3137 + }, + { + "epoch": 0.01866257493576934, + "grad_norm": 2.1439480781555176, + "learning_rate": 4.9957070906808185e-05, + "loss": 5.6952, + "step": 3138 + }, + { + "epoch": 0.018668522219050337, + "grad_norm": 2.4345993995666504, + "learning_rate": 4.995704354076706e-05, + "loss": 5.7531, + "step": 3139 + }, + { + "epoch": 0.018674469502331335, + "grad_norm": 2.5551047325134277, + "learning_rate": 4.995701616601368e-05, + "loss": 5.544, + "step": 3140 + }, + { + "epoch": 0.018680416785612334, + "grad_norm": 2.333603620529175, + "learning_rate": 4.9956988782548075e-05, + "loss": 5.5732, + "step": 3141 + }, + { + "epoch": 0.01868636406889333, + "grad_norm": 2.2983827590942383, + "learning_rate": 4.995696139037024e-05, + "loss": 5.8779, + "step": 3142 + }, + { + "epoch": 0.018692311352174328, + "grad_norm": 2.7525672912597656, + "learning_rate": 4.995693398948018e-05, + "loss": 5.5998, + "step": 3143 + }, + { + "epoch": 0.018698258635455323, + "grad_norm": 2.3622052669525146, + "learning_rate": 4.995690657987793e-05, + "loss": 5.8851, + "step": 3144 + }, + { + "epoch": 0.01870420591873632, + "grad_norm": 2.4975669384002686, + "learning_rate": 4.995687916156346e-05, + "loss": 5.6388, + "step": 3145 + }, + { + "epoch": 0.01871015320201732, + "grad_norm": 2.5763049125671387, + "learning_rate": 4.9956851734536816e-05, + "loss": 5.4931, + "step": 3146 + }, + { + "epoch": 0.018716100485298315, + "grad_norm": 2.7156779766082764, + "learning_rate": 4.995682429879799e-05, + "loss": 5.8035, + "step": 3147 + }, + { + "epoch": 0.018722047768579313, + "grad_norm": 2.259134292602539, + "learning_rate": 4.995679685434699e-05, + "loss": 5.9519, + "step": 3148 + }, + { + "epoch": 0.018727995051860312, + "grad_norm": 2.544829845428467, + "learning_rate": 4.995676940118383e-05, + "loss": 5.7373, + "step": 3149 + }, + { + "epoch": 0.018733942335141307, + "grad_norm": 2.326660633087158, + "learning_rate": 4.995674193930853e-05, + "loss": 5.7719, + "step": 3150 + }, + { + "epoch": 0.018739889618422306, + "grad_norm": 2.25370192527771, + "learning_rate": 4.995671446872108e-05, + "loss": 5.813, + "step": 3151 + }, + { + "epoch": 0.0187458369017033, + "grad_norm": 2.1467692852020264, + "learning_rate": 4.99566869894215e-05, + "loss": 5.5836, + "step": 3152 + }, + { + "epoch": 0.0187517841849843, + "grad_norm": 2.30096697807312, + "learning_rate": 4.9956659501409796e-05, + "loss": 5.8249, + "step": 3153 + }, + { + "epoch": 0.018757731468265298, + "grad_norm": 2.3050386905670166, + "learning_rate": 4.9956632004685986e-05, + "loss": 5.6806, + "step": 3154 + }, + { + "epoch": 0.018763678751546293, + "grad_norm": 2.473008632659912, + "learning_rate": 4.995660449925007e-05, + "loss": 5.4512, + "step": 3155 + }, + { + "epoch": 0.01876962603482729, + "grad_norm": 2.0691702365875244, + "learning_rate": 4.995657698510206e-05, + "loss": 5.6582, + "step": 3156 + }, + { + "epoch": 0.018775573318108287, + "grad_norm": 2.332423686981201, + "learning_rate": 4.995654946224197e-05, + "loss": 5.6017, + "step": 3157 + }, + { + "epoch": 0.018781520601389285, + "grad_norm": 2.6423730850219727, + "learning_rate": 4.9956521930669806e-05, + "loss": 5.619, + "step": 3158 + }, + { + "epoch": 0.018787467884670284, + "grad_norm": 3.0884950160980225, + "learning_rate": 4.995649439038558e-05, + "loss": 5.7813, + "step": 3159 + }, + { + "epoch": 0.01879341516795128, + "grad_norm": 2.4923598766326904, + "learning_rate": 4.995646684138929e-05, + "loss": 5.8089, + "step": 3160 + }, + { + "epoch": 0.018799362451232277, + "grad_norm": 2.5505683422088623, + "learning_rate": 4.9956439283680965e-05, + "loss": 5.8171, + "step": 3161 + }, + { + "epoch": 0.018805309734513276, + "grad_norm": 2.7343056201934814, + "learning_rate": 4.99564117172606e-05, + "loss": 6.3472, + "step": 3162 + }, + { + "epoch": 0.01881125701779427, + "grad_norm": 2.9170796871185303, + "learning_rate": 4.995638414212821e-05, + "loss": 5.7478, + "step": 3163 + }, + { + "epoch": 0.01881720430107527, + "grad_norm": 2.392648696899414, + "learning_rate": 4.9956356558283815e-05, + "loss": 5.8105, + "step": 3164 + }, + { + "epoch": 0.018823151584356265, + "grad_norm": 2.532207727432251, + "learning_rate": 4.9956328965727394e-05, + "loss": 5.9285, + "step": 3165 + }, + { + "epoch": 0.018829098867637263, + "grad_norm": 2.6717050075531006, + "learning_rate": 4.995630136445899e-05, + "loss": 6.0344, + "step": 3166 + }, + { + "epoch": 0.01883504615091826, + "grad_norm": 2.1829564571380615, + "learning_rate": 4.99562737544786e-05, + "loss": 6.0078, + "step": 3167 + }, + { + "epoch": 0.018840993434199257, + "grad_norm": 2.2728323936462402, + "learning_rate": 4.995624613578622e-05, + "loss": 5.8211, + "step": 3168 + }, + { + "epoch": 0.018846940717480255, + "grad_norm": 2.046717882156372, + "learning_rate": 4.995621850838189e-05, + "loss": 5.9685, + "step": 3169 + }, + { + "epoch": 0.018852888000761254, + "grad_norm": 2.737494945526123, + "learning_rate": 4.995619087226559e-05, + "loss": 5.649, + "step": 3170 + }, + { + "epoch": 0.01885883528404225, + "grad_norm": 2.276503801345825, + "learning_rate": 4.9956163227437345e-05, + "loss": 5.8137, + "step": 3171 + }, + { + "epoch": 0.018864782567323247, + "grad_norm": 2.2799227237701416, + "learning_rate": 4.9956135573897155e-05, + "loss": 5.8277, + "step": 3172 + }, + { + "epoch": 0.018870729850604243, + "grad_norm": 2.131425619125366, + "learning_rate": 4.995610791164505e-05, + "loss": 5.8909, + "step": 3173 + }, + { + "epoch": 0.01887667713388524, + "grad_norm": 2.2295737266540527, + "learning_rate": 4.995608024068102e-05, + "loss": 5.8236, + "step": 3174 + }, + { + "epoch": 0.01888262441716624, + "grad_norm": 2.30082631111145, + "learning_rate": 4.9956052561005076e-05, + "loss": 5.7331, + "step": 3175 + }, + { + "epoch": 0.018888571700447235, + "grad_norm": 2.751847505569458, + "learning_rate": 4.9956024872617225e-05, + "loss": 5.8673, + "step": 3176 + }, + { + "epoch": 0.018894518983728233, + "grad_norm": 2.4597535133361816, + "learning_rate": 4.995599717551749e-05, + "loss": 5.7561, + "step": 3177 + }, + { + "epoch": 0.018900466267009232, + "grad_norm": 2.1418228149414062, + "learning_rate": 4.9955969469705874e-05, + "loss": 5.7112, + "step": 3178 + }, + { + "epoch": 0.018906413550290227, + "grad_norm": 2.0560619831085205, + "learning_rate": 4.9955941755182395e-05, + "loss": 5.7764, + "step": 3179 + }, + { + "epoch": 0.018912360833571226, + "grad_norm": 2.268781900405884, + "learning_rate": 4.9955914031947046e-05, + "loss": 5.7319, + "step": 3180 + }, + { + "epoch": 0.01891830811685222, + "grad_norm": 2.6272811889648438, + "learning_rate": 4.995588629999985e-05, + "loss": 6.0601, + "step": 3181 + }, + { + "epoch": 0.01892425540013322, + "grad_norm": 2.1991870403289795, + "learning_rate": 4.995585855934081e-05, + "loss": 5.602, + "step": 3182 + }, + { + "epoch": 0.018930202683414218, + "grad_norm": 2.0521514415740967, + "learning_rate": 4.995583080996994e-05, + "loss": 5.8075, + "step": 3183 + }, + { + "epoch": 0.018936149966695213, + "grad_norm": 2.153473138809204, + "learning_rate": 4.995580305188724e-05, + "loss": 5.8219, + "step": 3184 + }, + { + "epoch": 0.01894209724997621, + "grad_norm": 2.0663251876831055, + "learning_rate": 4.9955775285092735e-05, + "loss": 5.836, + "step": 3185 + }, + { + "epoch": 0.018948044533257206, + "grad_norm": 1.8808318376541138, + "learning_rate": 4.995574750958642e-05, + "loss": 5.7938, + "step": 3186 + }, + { + "epoch": 0.018953991816538205, + "grad_norm": 2.256012201309204, + "learning_rate": 4.995571972536831e-05, + "loss": 5.6404, + "step": 3187 + }, + { + "epoch": 0.018959939099819204, + "grad_norm": 2.29636287689209, + "learning_rate": 4.995569193243843e-05, + "loss": 5.7161, + "step": 3188 + }, + { + "epoch": 0.0189658863831002, + "grad_norm": 2.728804588317871, + "learning_rate": 4.995566413079676e-05, + "loss": 5.8165, + "step": 3189 + }, + { + "epoch": 0.018971833666381197, + "grad_norm": 2.3115599155426025, + "learning_rate": 4.995563632044333e-05, + "loss": 5.7004, + "step": 3190 + }, + { + "epoch": 0.018977780949662196, + "grad_norm": 2.1607725620269775, + "learning_rate": 4.995560850137815e-05, + "loss": 5.7788, + "step": 3191 + }, + { + "epoch": 0.01898372823294319, + "grad_norm": 2.322132110595703, + "learning_rate": 4.995558067360122e-05, + "loss": 5.5677, + "step": 3192 + }, + { + "epoch": 0.01898967551622419, + "grad_norm": 2.148022174835205, + "learning_rate": 4.995555283711256e-05, + "loss": 5.7708, + "step": 3193 + }, + { + "epoch": 0.018995622799505184, + "grad_norm": 2.339812994003296, + "learning_rate": 4.9955524991912165e-05, + "loss": 5.7945, + "step": 3194 + }, + { + "epoch": 0.019001570082786183, + "grad_norm": 1.9469980001449585, + "learning_rate": 4.995549713800006e-05, + "loss": 5.695, + "step": 3195 + }, + { + "epoch": 0.01900751736606718, + "grad_norm": 2.1744890213012695, + "learning_rate": 4.9955469275376254e-05, + "loss": 5.7544, + "step": 3196 + }, + { + "epoch": 0.019013464649348177, + "grad_norm": 2.175123691558838, + "learning_rate": 4.9955441404040745e-05, + "loss": 5.598, + "step": 3197 + }, + { + "epoch": 0.019019411932629175, + "grad_norm": 2.3011369705200195, + "learning_rate": 4.995541352399355e-05, + "loss": 5.7069, + "step": 3198 + }, + { + "epoch": 0.019025359215910174, + "grad_norm": 2.2227025032043457, + "learning_rate": 4.9955385635234675e-05, + "loss": 5.6854, + "step": 3199 + }, + { + "epoch": 0.01903130649919117, + "grad_norm": 2.5465073585510254, + "learning_rate": 4.995535773776414e-05, + "loss": 5.9085, + "step": 3200 + }, + { + "epoch": 0.019037253782472167, + "grad_norm": 2.936612844467163, + "learning_rate": 4.995532983158194e-05, + "loss": 6.0519, + "step": 3201 + }, + { + "epoch": 0.019043201065753163, + "grad_norm": 2.8298418521881104, + "learning_rate": 4.9955301916688094e-05, + "loss": 5.9473, + "step": 3202 + }, + { + "epoch": 0.01904914834903416, + "grad_norm": 2.2295944690704346, + "learning_rate": 4.9955273993082615e-05, + "loss": 5.9652, + "step": 3203 + }, + { + "epoch": 0.01905509563231516, + "grad_norm": 2.7771801948547363, + "learning_rate": 4.9955246060765505e-05, + "loss": 5.9291, + "step": 3204 + }, + { + "epoch": 0.019061042915596155, + "grad_norm": 3.0721678733825684, + "learning_rate": 4.9955218119736776e-05, + "loss": 6.2319, + "step": 3205 + }, + { + "epoch": 0.019066990198877153, + "grad_norm": 2.7866547107696533, + "learning_rate": 4.9955190169996434e-05, + "loss": 6.0412, + "step": 3206 + }, + { + "epoch": 0.019072937482158152, + "grad_norm": 2.287216901779175, + "learning_rate": 4.99551622115445e-05, + "loss": 5.6435, + "step": 3207 + }, + { + "epoch": 0.019078884765439147, + "grad_norm": 2.3618898391723633, + "learning_rate": 4.995513424438098e-05, + "loss": 5.7711, + "step": 3208 + }, + { + "epoch": 0.019084832048720145, + "grad_norm": 2.192997932434082, + "learning_rate": 4.995510626850587e-05, + "loss": 5.8351, + "step": 3209 + }, + { + "epoch": 0.01909077933200114, + "grad_norm": 2.252722978591919, + "learning_rate": 4.995507828391919e-05, + "loss": 5.5989, + "step": 3210 + }, + { + "epoch": 0.01909672661528214, + "grad_norm": 2.451167106628418, + "learning_rate": 4.995505029062095e-05, + "loss": 5.8533, + "step": 3211 + }, + { + "epoch": 0.019102673898563138, + "grad_norm": 2.1897904872894287, + "learning_rate": 4.995502228861116e-05, + "loss": 6.2807, + "step": 3212 + }, + { + "epoch": 0.019108621181844133, + "grad_norm": 2.196805715560913, + "learning_rate": 4.995499427788984e-05, + "loss": 5.9418, + "step": 3213 + }, + { + "epoch": 0.01911456846512513, + "grad_norm": 1.9791160821914673, + "learning_rate": 4.995496625845698e-05, + "loss": 5.9909, + "step": 3214 + }, + { + "epoch": 0.019120515748406126, + "grad_norm": 2.3592171669006348, + "learning_rate": 4.995493823031261e-05, + "loss": 5.807, + "step": 3215 + }, + { + "epoch": 0.019126463031687125, + "grad_norm": 2.8238747119903564, + "learning_rate": 4.9954910193456713e-05, + "loss": 5.7587, + "step": 3216 + }, + { + "epoch": 0.019132410314968123, + "grad_norm": 2.4695584774017334, + "learning_rate": 4.9954882147889326e-05, + "loss": 5.746, + "step": 3217 + }, + { + "epoch": 0.01913835759824912, + "grad_norm": 2.3983800411224365, + "learning_rate": 4.995485409361044e-05, + "loss": 5.9364, + "step": 3218 + }, + { + "epoch": 0.019144304881530117, + "grad_norm": 2.1279618740081787, + "learning_rate": 4.995482603062008e-05, + "loss": 5.9383, + "step": 3219 + }, + { + "epoch": 0.019150252164811116, + "grad_norm": 18.583581924438477, + "learning_rate": 4.9954797958918244e-05, + "loss": 5.8596, + "step": 3220 + }, + { + "epoch": 0.01915619944809211, + "grad_norm": 2.1420741081237793, + "learning_rate": 4.995476987850495e-05, + "loss": 5.9311, + "step": 3221 + }, + { + "epoch": 0.01916214673137311, + "grad_norm": 2.314380645751953, + "learning_rate": 4.99547417893802e-05, + "loss": 5.8229, + "step": 3222 + }, + { + "epoch": 0.019168094014654104, + "grad_norm": 2.3818936347961426, + "learning_rate": 4.9954713691544004e-05, + "loss": 6.1124, + "step": 3223 + }, + { + "epoch": 0.019174041297935103, + "grad_norm": 2.521789789199829, + "learning_rate": 4.9954685584996377e-05, + "loss": 5.8939, + "step": 3224 + }, + { + "epoch": 0.0191799885812161, + "grad_norm": 1.9583165645599365, + "learning_rate": 4.9954657469737334e-05, + "loss": 6.0005, + "step": 3225 + }, + { + "epoch": 0.019185935864497097, + "grad_norm": 2.349581241607666, + "learning_rate": 4.995462934576687e-05, + "loss": 5.8467, + "step": 3226 + }, + { + "epoch": 0.019191883147778095, + "grad_norm": 2.081836223602295, + "learning_rate": 4.9954601213085e-05, + "loss": 6.1001, + "step": 3227 + }, + { + "epoch": 0.019197830431059094, + "grad_norm": 2.3207972049713135, + "learning_rate": 4.995457307169175e-05, + "loss": 5.794, + "step": 3228 + }, + { + "epoch": 0.01920377771434009, + "grad_norm": 1.8516380786895752, + "learning_rate": 4.99545449215871e-05, + "loss": 5.785, + "step": 3229 + }, + { + "epoch": 0.019209724997621087, + "grad_norm": 2.3822309970855713, + "learning_rate": 4.995451676277109e-05, + "loss": 5.7861, + "step": 3230 + }, + { + "epoch": 0.019215672280902082, + "grad_norm": 2.857161283493042, + "learning_rate": 4.995448859524371e-05, + "loss": 5.8333, + "step": 3231 + }, + { + "epoch": 0.01922161956418308, + "grad_norm": 2.201551914215088, + "learning_rate": 4.9954460419004974e-05, + "loss": 5.8653, + "step": 3232 + }, + { + "epoch": 0.01922756684746408, + "grad_norm": 2.1707022190093994, + "learning_rate": 4.995443223405489e-05, + "loss": 5.772, + "step": 3233 + }, + { + "epoch": 0.019233514130745075, + "grad_norm": 2.1242458820343018, + "learning_rate": 4.995440404039348e-05, + "loss": 5.8806, + "step": 3234 + }, + { + "epoch": 0.019239461414026073, + "grad_norm": 2.106945514678955, + "learning_rate": 4.995437583802074e-05, + "loss": 5.6746, + "step": 3235 + }, + { + "epoch": 0.019245408697307072, + "grad_norm": 2.083181858062744, + "learning_rate": 4.995434762693669e-05, + "loss": 5.9332, + "step": 3236 + }, + { + "epoch": 0.019251355980588067, + "grad_norm": 2.1857783794403076, + "learning_rate": 4.995431940714134e-05, + "loss": 5.6663, + "step": 3237 + }, + { + "epoch": 0.019257303263869065, + "grad_norm": 2.031041145324707, + "learning_rate": 4.995429117863468e-05, + "loss": 5.6734, + "step": 3238 + }, + { + "epoch": 0.01926325054715006, + "grad_norm": 2.31980037689209, + "learning_rate": 4.995426294141674e-05, + "loss": 5.8851, + "step": 3239 + }, + { + "epoch": 0.01926919783043106, + "grad_norm": 2.102965831756592, + "learning_rate": 4.9954234695487535e-05, + "loss": 5.7092, + "step": 3240 + }, + { + "epoch": 0.019275145113712058, + "grad_norm": 2.031169891357422, + "learning_rate": 4.995420644084705e-05, + "loss": 5.9755, + "step": 3241 + }, + { + "epoch": 0.019281092396993053, + "grad_norm": 2.2460241317749023, + "learning_rate": 4.995417817749532e-05, + "loss": 5.8895, + "step": 3242 + }, + { + "epoch": 0.01928703968027405, + "grad_norm": 2.618539571762085, + "learning_rate": 4.9954149905432336e-05, + "loss": 5.6964, + "step": 3243 + }, + { + "epoch": 0.019292986963555046, + "grad_norm": 2.1615748405456543, + "learning_rate": 4.995412162465812e-05, + "loss": 5.7162, + "step": 3244 + }, + { + "epoch": 0.019298934246836045, + "grad_norm": 2.363663673400879, + "learning_rate": 4.995409333517268e-05, + "loss": 5.7957, + "step": 3245 + }, + { + "epoch": 0.019304881530117043, + "grad_norm": 2.131084680557251, + "learning_rate": 4.9954065036976025e-05, + "loss": 5.7925, + "step": 3246 + }, + { + "epoch": 0.01931082881339804, + "grad_norm": 2.4043118953704834, + "learning_rate": 4.9954036730068155e-05, + "loss": 5.7895, + "step": 3247 + }, + { + "epoch": 0.019316776096679037, + "grad_norm": 2.521756887435913, + "learning_rate": 4.995400841444909e-05, + "loss": 5.6279, + "step": 3248 + }, + { + "epoch": 0.019322723379960036, + "grad_norm": 2.1791021823883057, + "learning_rate": 4.9953980090118846e-05, + "loss": 5.717, + "step": 3249 + }, + { + "epoch": 0.01932867066324103, + "grad_norm": 2.6562376022338867, + "learning_rate": 4.995395175707742e-05, + "loss": 5.7407, + "step": 3250 + }, + { + "epoch": 0.01933461794652203, + "grad_norm": 2.4377942085266113, + "learning_rate": 4.995392341532483e-05, + "loss": 5.539, + "step": 3251 + }, + { + "epoch": 0.019340565229803024, + "grad_norm": 2.3716847896575928, + "learning_rate": 4.995389506486109e-05, + "loss": 5.7251, + "step": 3252 + }, + { + "epoch": 0.019346512513084023, + "grad_norm": 2.2509348392486572, + "learning_rate": 4.995386670568619e-05, + "loss": 5.8749, + "step": 3253 + }, + { + "epoch": 0.01935245979636502, + "grad_norm": 2.265608072280884, + "learning_rate": 4.995383833780016e-05, + "loss": 5.8236, + "step": 3254 + }, + { + "epoch": 0.019358407079646017, + "grad_norm": 1.972179651260376, + "learning_rate": 4.9953809961203e-05, + "loss": 5.9235, + "step": 3255 + }, + { + "epoch": 0.019364354362927015, + "grad_norm": 2.314030170440674, + "learning_rate": 4.9953781575894723e-05, + "loss": 5.7355, + "step": 3256 + }, + { + "epoch": 0.019370301646208014, + "grad_norm": 2.3061349391937256, + "learning_rate": 4.995375318187534e-05, + "loss": 5.7337, + "step": 3257 + }, + { + "epoch": 0.01937624892948901, + "grad_norm": 1.9106477499008179, + "learning_rate": 4.9953724779144864e-05, + "loss": 5.8342, + "step": 3258 + }, + { + "epoch": 0.019382196212770007, + "grad_norm": 2.313750982284546, + "learning_rate": 4.9953696367703296e-05, + "loss": 5.7981, + "step": 3259 + }, + { + "epoch": 0.019388143496051002, + "grad_norm": 2.4477834701538086, + "learning_rate": 4.9953667947550644e-05, + "loss": 5.8212, + "step": 3260 + }, + { + "epoch": 0.019394090779332, + "grad_norm": 2.072659730911255, + "learning_rate": 4.9953639518686936e-05, + "loss": 5.7335, + "step": 3261 + }, + { + "epoch": 0.019400038062613, + "grad_norm": 2.0848984718322754, + "learning_rate": 4.995361108111216e-05, + "loss": 5.7427, + "step": 3262 + }, + { + "epoch": 0.019405985345893995, + "grad_norm": 1.938265323638916, + "learning_rate": 4.9953582634826345e-05, + "loss": 5.7946, + "step": 3263 + }, + { + "epoch": 0.019411932629174993, + "grad_norm": 2.227194309234619, + "learning_rate": 4.995355417982949e-05, + "loss": 5.9095, + "step": 3264 + }, + { + "epoch": 0.01941787991245599, + "grad_norm": 2.3245849609375, + "learning_rate": 4.9953525716121604e-05, + "loss": 5.802, + "step": 3265 + }, + { + "epoch": 0.019423827195736987, + "grad_norm": 2.08950138092041, + "learning_rate": 4.9953497243702696e-05, + "loss": 5.9001, + "step": 3266 + }, + { + "epoch": 0.019429774479017985, + "grad_norm": 1.93153715133667, + "learning_rate": 4.9953468762572786e-05, + "loss": 5.9042, + "step": 3267 + }, + { + "epoch": 0.01943572176229898, + "grad_norm": 2.4099066257476807, + "learning_rate": 4.9953440272731874e-05, + "loss": 5.8181, + "step": 3268 + }, + { + "epoch": 0.01944166904557998, + "grad_norm": 2.078752279281616, + "learning_rate": 4.995341177417998e-05, + "loss": 5.8771, + "step": 3269 + }, + { + "epoch": 0.019447616328860978, + "grad_norm": 2.012592077255249, + "learning_rate": 4.9953383266917106e-05, + "loss": 5.8135, + "step": 3270 + }, + { + "epoch": 0.019453563612141973, + "grad_norm": 2.0364151000976562, + "learning_rate": 4.995335475094326e-05, + "loss": 5.8767, + "step": 3271 + }, + { + "epoch": 0.01945951089542297, + "grad_norm": 2.0447049140930176, + "learning_rate": 4.995332622625846e-05, + "loss": 5.8236, + "step": 3272 + }, + { + "epoch": 0.01946545817870397, + "grad_norm": 2.2354300022125244, + "learning_rate": 4.995329769286271e-05, + "loss": 5.7794, + "step": 3273 + }, + { + "epoch": 0.019471405461984965, + "grad_norm": 2.031331777572632, + "learning_rate": 4.995326915075602e-05, + "loss": 5.87, + "step": 3274 + }, + { + "epoch": 0.019477352745265963, + "grad_norm": 2.2116496562957764, + "learning_rate": 4.99532405999384e-05, + "loss": 5.885, + "step": 3275 + }, + { + "epoch": 0.01948330002854696, + "grad_norm": 1.9008034467697144, + "learning_rate": 4.995321204040987e-05, + "loss": 5.8646, + "step": 3276 + }, + { + "epoch": 0.019489247311827957, + "grad_norm": 2.1743087768554688, + "learning_rate": 4.995318347217042e-05, + "loss": 5.9742, + "step": 3277 + }, + { + "epoch": 0.019495194595108956, + "grad_norm": 2.09171724319458, + "learning_rate": 4.995315489522008e-05, + "loss": 5.882, + "step": 3278 + }, + { + "epoch": 0.01950114187838995, + "grad_norm": 1.816938042640686, + "learning_rate": 4.995312630955885e-05, + "loss": 5.9164, + "step": 3279 + }, + { + "epoch": 0.01950708916167095, + "grad_norm": 2.065207004547119, + "learning_rate": 4.995309771518674e-05, + "loss": 5.9273, + "step": 3280 + }, + { + "epoch": 0.019513036444951944, + "grad_norm": 2.1037240028381348, + "learning_rate": 4.9953069112103757e-05, + "loss": 5.863, + "step": 3281 + }, + { + "epoch": 0.019518983728232943, + "grad_norm": 2.011705160140991, + "learning_rate": 4.995304050030992e-05, + "loss": 5.712, + "step": 3282 + }, + { + "epoch": 0.01952493101151394, + "grad_norm": 2.2053868770599365, + "learning_rate": 4.995301187980523e-05, + "loss": 5.6988, + "step": 3283 + }, + { + "epoch": 0.019530878294794937, + "grad_norm": 2.0522396564483643, + "learning_rate": 4.995298325058971e-05, + "loss": 5.6831, + "step": 3284 + }, + { + "epoch": 0.019536825578075935, + "grad_norm": 1.9751875400543213, + "learning_rate": 4.995295461266336e-05, + "loss": 6.0187, + "step": 3285 + }, + { + "epoch": 0.019542772861356934, + "grad_norm": 2.79711651802063, + "learning_rate": 4.9952925966026185e-05, + "loss": 6.4995, + "step": 3286 + }, + { + "epoch": 0.01954872014463793, + "grad_norm": 2.1059019565582275, + "learning_rate": 4.9952897310678206e-05, + "loss": 5.9603, + "step": 3287 + }, + { + "epoch": 0.019554667427918927, + "grad_norm": 2.169428825378418, + "learning_rate": 4.995286864661942e-05, + "loss": 5.7973, + "step": 3288 + }, + { + "epoch": 0.019560614711199922, + "grad_norm": 2.165508985519409, + "learning_rate": 4.995283997384985e-05, + "loss": 5.9132, + "step": 3289 + }, + { + "epoch": 0.01956656199448092, + "grad_norm": 2.248450994491577, + "learning_rate": 4.9952811292369506e-05, + "loss": 5.8202, + "step": 3290 + }, + { + "epoch": 0.01957250927776192, + "grad_norm": 2.3068084716796875, + "learning_rate": 4.9952782602178394e-05, + "loss": 5.8223, + "step": 3291 + }, + { + "epoch": 0.019578456561042915, + "grad_norm": 2.0434954166412354, + "learning_rate": 4.9952753903276516e-05, + "loss": 5.6231, + "step": 3292 + }, + { + "epoch": 0.019584403844323913, + "grad_norm": 2.136564254760742, + "learning_rate": 4.9952725195663895e-05, + "loss": 5.9859, + "step": 3293 + }, + { + "epoch": 0.01959035112760491, + "grad_norm": 2.6265337467193604, + "learning_rate": 4.9952696479340535e-05, + "loss": 5.9126, + "step": 3294 + }, + { + "epoch": 0.019596298410885907, + "grad_norm": 2.442678928375244, + "learning_rate": 4.9952667754306445e-05, + "loss": 5.9361, + "step": 3295 + }, + { + "epoch": 0.019602245694166905, + "grad_norm": 2.0740134716033936, + "learning_rate": 4.9952639020561644e-05, + "loss": 5.913, + "step": 3296 + }, + { + "epoch": 0.0196081929774479, + "grad_norm": 2.4088518619537354, + "learning_rate": 4.995261027810612e-05, + "loss": 5.8297, + "step": 3297 + }, + { + "epoch": 0.0196141402607289, + "grad_norm": 2.1514804363250732, + "learning_rate": 4.995258152693991e-05, + "loss": 5.8256, + "step": 3298 + }, + { + "epoch": 0.019620087544009897, + "grad_norm": 2.921570062637329, + "learning_rate": 4.9952552767063e-05, + "loss": 6.0243, + "step": 3299 + }, + { + "epoch": 0.019626034827290893, + "grad_norm": 2.398749828338623, + "learning_rate": 4.995252399847542e-05, + "loss": 6.004, + "step": 3300 + }, + { + "epoch": 0.01963198211057189, + "grad_norm": 2.2024805545806885, + "learning_rate": 4.995249522117717e-05, + "loss": 5.9201, + "step": 3301 + }, + { + "epoch": 0.01963792939385289, + "grad_norm": 2.112269401550293, + "learning_rate": 4.9952466435168266e-05, + "loss": 5.8488, + "step": 3302 + }, + { + "epoch": 0.019643876677133885, + "grad_norm": 2.04632568359375, + "learning_rate": 4.99524376404487e-05, + "loss": 5.8054, + "step": 3303 + }, + { + "epoch": 0.019649823960414883, + "grad_norm": 2.6293606758117676, + "learning_rate": 4.995240883701851e-05, + "loss": 5.6799, + "step": 3304 + }, + { + "epoch": 0.01965577124369588, + "grad_norm": 2.5172793865203857, + "learning_rate": 4.995238002487769e-05, + "loss": 5.712, + "step": 3305 + }, + { + "epoch": 0.019661718526976877, + "grad_norm": 2.549194097518921, + "learning_rate": 4.995235120402625e-05, + "loss": 5.7208, + "step": 3306 + }, + { + "epoch": 0.019667665810257876, + "grad_norm": 2.2993295192718506, + "learning_rate": 4.99523223744642e-05, + "loss": 5.7952, + "step": 3307 + }, + { + "epoch": 0.01967361309353887, + "grad_norm": 2.1270902156829834, + "learning_rate": 4.9952293536191555e-05, + "loss": 5.6988, + "step": 3308 + }, + { + "epoch": 0.01967956037681987, + "grad_norm": 2.349858283996582, + "learning_rate": 4.9952264689208315e-05, + "loss": 5.623, + "step": 3309 + }, + { + "epoch": 0.019685507660100864, + "grad_norm": 2.1501529216766357, + "learning_rate": 4.9952235833514506e-05, + "loss": 5.6498, + "step": 3310 + }, + { + "epoch": 0.019691454943381863, + "grad_norm": 2.0577821731567383, + "learning_rate": 4.995220696911012e-05, + "loss": 5.6863, + "step": 3311 + }, + { + "epoch": 0.01969740222666286, + "grad_norm": 2.0787386894226074, + "learning_rate": 4.9952178095995185e-05, + "loss": 5.6314, + "step": 3312 + }, + { + "epoch": 0.019703349509943856, + "grad_norm": 2.4042680263519287, + "learning_rate": 4.99521492141697e-05, + "loss": 5.6152, + "step": 3313 + }, + { + "epoch": 0.019709296793224855, + "grad_norm": 2.444410800933838, + "learning_rate": 4.995212032363368e-05, + "loss": 5.5375, + "step": 3314 + }, + { + "epoch": 0.019715244076505854, + "grad_norm": 2.1678028106689453, + "learning_rate": 4.995209142438712e-05, + "loss": 5.6239, + "step": 3315 + }, + { + "epoch": 0.01972119135978685, + "grad_norm": 2.5436410903930664, + "learning_rate": 4.9952062516430054e-05, + "loss": 5.4234, + "step": 3316 + }, + { + "epoch": 0.019727138643067847, + "grad_norm": 2.454561471939087, + "learning_rate": 4.9952033599762484e-05, + "loss": 5.4198, + "step": 3317 + }, + { + "epoch": 0.019733085926348842, + "grad_norm": 2.388125419616699, + "learning_rate": 4.9952004674384413e-05, + "loss": 5.5073, + "step": 3318 + }, + { + "epoch": 0.01973903320962984, + "grad_norm": 2.1900579929351807, + "learning_rate": 4.995197574029585e-05, + "loss": 5.3463, + "step": 3319 + }, + { + "epoch": 0.01974498049291084, + "grad_norm": 2.5625739097595215, + "learning_rate": 4.995194679749681e-05, + "loss": 5.4291, + "step": 3320 + }, + { + "epoch": 0.019750927776191834, + "grad_norm": 2.52402400970459, + "learning_rate": 4.995191784598731e-05, + "loss": 5.3826, + "step": 3321 + }, + { + "epoch": 0.019756875059472833, + "grad_norm": 2.5888168811798096, + "learning_rate": 4.995188888576735e-05, + "loss": 5.381, + "step": 3322 + }, + { + "epoch": 0.01976282234275383, + "grad_norm": 2.637080669403076, + "learning_rate": 4.995185991683694e-05, + "loss": 5.3321, + "step": 3323 + }, + { + "epoch": 0.019768769626034827, + "grad_norm": 2.46553111076355, + "learning_rate": 4.9951830939196095e-05, + "loss": 5.3663, + "step": 3324 + }, + { + "epoch": 0.019774716909315825, + "grad_norm": 2.2397992610931396, + "learning_rate": 4.9951801952844826e-05, + "loss": 5.3237, + "step": 3325 + }, + { + "epoch": 0.01978066419259682, + "grad_norm": 2.3519208431243896, + "learning_rate": 4.9951772957783144e-05, + "loss": 5.4166, + "step": 3326 + }, + { + "epoch": 0.01978661147587782, + "grad_norm": 2.6235291957855225, + "learning_rate": 4.9951743954011056e-05, + "loss": 5.8094, + "step": 3327 + }, + { + "epoch": 0.019792558759158817, + "grad_norm": 2.162285327911377, + "learning_rate": 4.995171494152856e-05, + "loss": 5.6491, + "step": 3328 + }, + { + "epoch": 0.019798506042439813, + "grad_norm": 2.231853485107422, + "learning_rate": 4.995168592033569e-05, + "loss": 5.69, + "step": 3329 + }, + { + "epoch": 0.01980445332572081, + "grad_norm": 2.7305827140808105, + "learning_rate": 4.995165689043244e-05, + "loss": 5.5028, + "step": 3330 + }, + { + "epoch": 0.01981040060900181, + "grad_norm": 2.9917726516723633, + "learning_rate": 4.9951627851818824e-05, + "loss": 5.3227, + "step": 3331 + }, + { + "epoch": 0.019816347892282805, + "grad_norm": 3.0039985179901123, + "learning_rate": 4.995159880449486e-05, + "loss": 5.5965, + "step": 3332 + }, + { + "epoch": 0.019822295175563803, + "grad_norm": 3.081099510192871, + "learning_rate": 4.995156974846054e-05, + "loss": 5.6945, + "step": 3333 + }, + { + "epoch": 0.0198282424588448, + "grad_norm": 2.042445182800293, + "learning_rate": 4.995154068371589e-05, + "loss": 5.693, + "step": 3334 + }, + { + "epoch": 0.019834189742125797, + "grad_norm": 2.8875865936279297, + "learning_rate": 4.995151161026091e-05, + "loss": 5.5981, + "step": 3335 + }, + { + "epoch": 0.019840137025406795, + "grad_norm": 2.4203453063964844, + "learning_rate": 4.9951482528095615e-05, + "loss": 5.6269, + "step": 3336 + }, + { + "epoch": 0.01984608430868779, + "grad_norm": 2.332151174545288, + "learning_rate": 4.995145343722002e-05, + "loss": 5.6002, + "step": 3337 + }, + { + "epoch": 0.01985203159196879, + "grad_norm": 2.556549310684204, + "learning_rate": 4.995142433763413e-05, + "loss": 5.7715, + "step": 3338 + }, + { + "epoch": 0.019857978875249784, + "grad_norm": 2.453113079071045, + "learning_rate": 4.995139522933796e-05, + "loss": 5.8958, + "step": 3339 + }, + { + "epoch": 0.019863926158530783, + "grad_norm": 1.9842414855957031, + "learning_rate": 4.995136611233151e-05, + "loss": 5.9781, + "step": 3340 + }, + { + "epoch": 0.01986987344181178, + "grad_norm": 2.3725521564483643, + "learning_rate": 4.995133698661479e-05, + "loss": 5.9902, + "step": 3341 + }, + { + "epoch": 0.019875820725092776, + "grad_norm": 2.679001808166504, + "learning_rate": 4.9951307852187824e-05, + "loss": 5.9526, + "step": 3342 + }, + { + "epoch": 0.019881768008373775, + "grad_norm": 2.272595167160034, + "learning_rate": 4.995127870905061e-05, + "loss": 5.9685, + "step": 3343 + }, + { + "epoch": 0.019887715291654774, + "grad_norm": 2.0300357341766357, + "learning_rate": 4.995124955720317e-05, + "loss": 5.7702, + "step": 3344 + }, + { + "epoch": 0.01989366257493577, + "grad_norm": 2.5023481845855713, + "learning_rate": 4.9951220396645504e-05, + "loss": 5.6612, + "step": 3345 + }, + { + "epoch": 0.019899609858216767, + "grad_norm": 2.426457166671753, + "learning_rate": 4.995119122737762e-05, + "loss": 5.767, + "step": 3346 + }, + { + "epoch": 0.019905557141497762, + "grad_norm": 2.4919028282165527, + "learning_rate": 4.995116204939954e-05, + "loss": 6.0578, + "step": 3347 + }, + { + "epoch": 0.01991150442477876, + "grad_norm": 3.099792957305908, + "learning_rate": 4.995113286271126e-05, + "loss": 7.053, + "step": 3348 + }, + { + "epoch": 0.01991745170805976, + "grad_norm": 2.597169876098633, + "learning_rate": 4.9951103667312795e-05, + "loss": 5.8467, + "step": 3349 + }, + { + "epoch": 0.019923398991340754, + "grad_norm": 2.1132469177246094, + "learning_rate": 4.995107446320416e-05, + "loss": 5.7296, + "step": 3350 + }, + { + "epoch": 0.019929346274621753, + "grad_norm": 2.4141721725463867, + "learning_rate": 4.995104525038537e-05, + "loss": 5.8705, + "step": 3351 + }, + { + "epoch": 0.01993529355790275, + "grad_norm": 1.9012199640274048, + "learning_rate": 4.995101602885642e-05, + "loss": 5.8759, + "step": 3352 + }, + { + "epoch": 0.019941240841183747, + "grad_norm": 2.168673038482666, + "learning_rate": 4.9950986798617335e-05, + "loss": 5.8161, + "step": 3353 + }, + { + "epoch": 0.019947188124464745, + "grad_norm": 2.1579155921936035, + "learning_rate": 4.995095755966811e-05, + "loss": 5.8699, + "step": 3354 + }, + { + "epoch": 0.01995313540774574, + "grad_norm": 2.1460800170898438, + "learning_rate": 4.9950928312008774e-05, + "loss": 5.9144, + "step": 3355 + }, + { + "epoch": 0.01995908269102674, + "grad_norm": 2.402167558670044, + "learning_rate": 4.995089905563932e-05, + "loss": 5.8857, + "step": 3356 + }, + { + "epoch": 0.019965029974307737, + "grad_norm": 2.6381726264953613, + "learning_rate": 4.995086979055976e-05, + "loss": 6.0021, + "step": 3357 + }, + { + "epoch": 0.019970977257588732, + "grad_norm": 2.5577943325042725, + "learning_rate": 4.995084051677012e-05, + "loss": 5.9425, + "step": 3358 + }, + { + "epoch": 0.01997692454086973, + "grad_norm": 2.188215494155884, + "learning_rate": 4.995081123427039e-05, + "loss": 6.0656, + "step": 3359 + }, + { + "epoch": 0.01998287182415073, + "grad_norm": 1.8278366327285767, + "learning_rate": 4.9950781943060596e-05, + "loss": 5.8229, + "step": 3360 + }, + { + "epoch": 0.019988819107431725, + "grad_norm": 1.9054077863693237, + "learning_rate": 4.995075264314074e-05, + "loss": 5.8158, + "step": 3361 + }, + { + "epoch": 0.019994766390712723, + "grad_norm": 2.1255416870117188, + "learning_rate": 4.9950723334510826e-05, + "loss": 5.8816, + "step": 3362 + }, + { + "epoch": 0.02000071367399372, + "grad_norm": 2.026923656463623, + "learning_rate": 4.995069401717088e-05, + "loss": 5.7463, + "step": 3363 + }, + { + "epoch": 0.020006660957274717, + "grad_norm": 2.015178680419922, + "learning_rate": 4.9950664691120905e-05, + "loss": 5.6689, + "step": 3364 + }, + { + "epoch": 0.020012608240555715, + "grad_norm": 1.7729417085647583, + "learning_rate": 4.995063535636091e-05, + "loss": 5.701, + "step": 3365 + }, + { + "epoch": 0.02001855552383671, + "grad_norm": 1.9893600940704346, + "learning_rate": 4.9950606012890905e-05, + "loss": 5.7502, + "step": 3366 + }, + { + "epoch": 0.02002450280711771, + "grad_norm": 1.8950870037078857, + "learning_rate": 4.99505766607109e-05, + "loss": 5.6094, + "step": 3367 + }, + { + "epoch": 0.020030450090398704, + "grad_norm": 2.4140830039978027, + "learning_rate": 4.995054729982091e-05, + "loss": 5.8387, + "step": 3368 + }, + { + "epoch": 0.020036397373679703, + "grad_norm": 2.1887669563293457, + "learning_rate": 4.995051793022094e-05, + "loss": 5.7348, + "step": 3369 + }, + { + "epoch": 0.0200423446569607, + "grad_norm": 1.9632731676101685, + "learning_rate": 4.9950488551911e-05, + "loss": 5.5568, + "step": 3370 + }, + { + "epoch": 0.020048291940241696, + "grad_norm": 2.116834878921509, + "learning_rate": 4.995045916489111e-05, + "loss": 5.461, + "step": 3371 + }, + { + "epoch": 0.020054239223522695, + "grad_norm": 2.021256923675537, + "learning_rate": 4.9950429769161266e-05, + "loss": 5.6601, + "step": 3372 + }, + { + "epoch": 0.020060186506803693, + "grad_norm": 2.1648659706115723, + "learning_rate": 4.9950400364721486e-05, + "loss": 5.5364, + "step": 3373 + }, + { + "epoch": 0.02006613379008469, + "grad_norm": 2.043499231338501, + "learning_rate": 4.9950370951571775e-05, + "loss": 5.7273, + "step": 3374 + }, + { + "epoch": 0.020072081073365687, + "grad_norm": 2.296121597290039, + "learning_rate": 4.995034152971215e-05, + "loss": 5.8494, + "step": 3375 + }, + { + "epoch": 0.020078028356646682, + "grad_norm": 2.401031494140625, + "learning_rate": 4.995031209914261e-05, + "loss": 5.719, + "step": 3376 + }, + { + "epoch": 0.02008397563992768, + "grad_norm": 2.3130364418029785, + "learning_rate": 4.995028265986319e-05, + "loss": 5.7998, + "step": 3377 + }, + { + "epoch": 0.02008992292320868, + "grad_norm": 2.3820009231567383, + "learning_rate": 4.9950253211873874e-05, + "loss": 6.0632, + "step": 3378 + }, + { + "epoch": 0.020095870206489674, + "grad_norm": 2.1970956325531006, + "learning_rate": 4.995022375517469e-05, + "loss": 5.9776, + "step": 3379 + }, + { + "epoch": 0.020101817489770673, + "grad_norm": 1.912102460861206, + "learning_rate": 4.995019428976564e-05, + "loss": 5.7194, + "step": 3380 + }, + { + "epoch": 0.02010776477305167, + "grad_norm": 2.3187389373779297, + "learning_rate": 4.995016481564673e-05, + "loss": 6.0225, + "step": 3381 + }, + { + "epoch": 0.020113712056332667, + "grad_norm": 1.959000587463379, + "learning_rate": 4.995013533281797e-05, + "loss": 5.8453, + "step": 3382 + }, + { + "epoch": 0.020119659339613665, + "grad_norm": 2.0283286571502686, + "learning_rate": 4.995010584127938e-05, + "loss": 5.6837, + "step": 3383 + }, + { + "epoch": 0.02012560662289466, + "grad_norm": 2.410351037979126, + "learning_rate": 4.995007634103097e-05, + "loss": 5.8172, + "step": 3384 + }, + { + "epoch": 0.02013155390617566, + "grad_norm": 2.2864298820495605, + "learning_rate": 4.995004683207275e-05, + "loss": 5.8995, + "step": 3385 + }, + { + "epoch": 0.020137501189456657, + "grad_norm": 2.830883026123047, + "learning_rate": 4.995001731440472e-05, + "loss": 5.7273, + "step": 3386 + }, + { + "epoch": 0.020143448472737652, + "grad_norm": 2.486783981323242, + "learning_rate": 4.9949987788026896e-05, + "loss": 5.88, + "step": 3387 + }, + { + "epoch": 0.02014939575601865, + "grad_norm": 2.109975576400757, + "learning_rate": 4.994995825293929e-05, + "loss": 5.8618, + "step": 3388 + }, + { + "epoch": 0.02015534303929965, + "grad_norm": 2.249293327331543, + "learning_rate": 4.994992870914191e-05, + "loss": 5.8511, + "step": 3389 + }, + { + "epoch": 0.020161290322580645, + "grad_norm": 2.5433366298675537, + "learning_rate": 4.9949899156634774e-05, + "loss": 5.7375, + "step": 3390 + }, + { + "epoch": 0.020167237605861643, + "grad_norm": 2.7013652324676514, + "learning_rate": 4.9949869595417876e-05, + "loss": 5.8886, + "step": 3391 + }, + { + "epoch": 0.020173184889142638, + "grad_norm": 2.536972761154175, + "learning_rate": 4.994984002549124e-05, + "loss": 5.4203, + "step": 3392 + }, + { + "epoch": 0.020179132172423637, + "grad_norm": 2.596230983734131, + "learning_rate": 4.9949810446854876e-05, + "loss": 5.7882, + "step": 3393 + }, + { + "epoch": 0.020185079455704635, + "grad_norm": 2.6889936923980713, + "learning_rate": 4.9949780859508786e-05, + "loss": 5.6822, + "step": 3394 + }, + { + "epoch": 0.02019102673898563, + "grad_norm": 2.541027069091797, + "learning_rate": 4.994975126345299e-05, + "loss": 5.7394, + "step": 3395 + }, + { + "epoch": 0.02019697402226663, + "grad_norm": 2.2267251014709473, + "learning_rate": 4.9949721658687485e-05, + "loss": 5.7847, + "step": 3396 + }, + { + "epoch": 0.020202921305547628, + "grad_norm": 2.439689874649048, + "learning_rate": 4.994969204521231e-05, + "loss": 5.6222, + "step": 3397 + }, + { + "epoch": 0.020208868588828623, + "grad_norm": 2.9407742023468018, + "learning_rate": 4.9949662423027434e-05, + "loss": 5.6629, + "step": 3398 + }, + { + "epoch": 0.02021481587210962, + "grad_norm": 2.42802357673645, + "learning_rate": 4.9949632792132894e-05, + "loss": 5.3369, + "step": 3399 + }, + { + "epoch": 0.020220763155390616, + "grad_norm": 2.465508222579956, + "learning_rate": 4.99496031525287e-05, + "loss": 5.3365, + "step": 3400 + }, + { + "epoch": 0.020226710438671615, + "grad_norm": 2.408794403076172, + "learning_rate": 4.9949573504214854e-05, + "loss": 5.3156, + "step": 3401 + }, + { + "epoch": 0.020232657721952613, + "grad_norm": 2.229372978210449, + "learning_rate": 4.9949543847191374e-05, + "loss": 5.9194, + "step": 3402 + }, + { + "epoch": 0.02023860500523361, + "grad_norm": 4.567020416259766, + "learning_rate": 4.9949514181458254e-05, + "loss": 6.3379, + "step": 3403 + }, + { + "epoch": 0.020244552288514607, + "grad_norm": 3.9927520751953125, + "learning_rate": 4.9949484507015534e-05, + "loss": 6.3351, + "step": 3404 + }, + { + "epoch": 0.020250499571795602, + "grad_norm": 2.4830081462860107, + "learning_rate": 4.9949454823863195e-05, + "loss": 6.4046, + "step": 3405 + }, + { + "epoch": 0.0202564468550766, + "grad_norm": 2.282722234725952, + "learning_rate": 4.994942513200126e-05, + "loss": 6.5473, + "step": 3406 + }, + { + "epoch": 0.0202623941383576, + "grad_norm": 2.411367416381836, + "learning_rate": 4.994939543142973e-05, + "loss": 5.7898, + "step": 3407 + }, + { + "epoch": 0.020268341421638594, + "grad_norm": 3.2052342891693115, + "learning_rate": 4.994936572214864e-05, + "loss": 5.6695, + "step": 3408 + }, + { + "epoch": 0.020274288704919593, + "grad_norm": 4.142974853515625, + "learning_rate": 4.994933600415798e-05, + "loss": 6.2037, + "step": 3409 + }, + { + "epoch": 0.02028023598820059, + "grad_norm": 2.839066982269287, + "learning_rate": 4.994930627745776e-05, + "loss": 6.7308, + "step": 3410 + }, + { + "epoch": 0.020286183271481587, + "grad_norm": 3.3138885498046875, + "learning_rate": 4.9949276542048e-05, + "loss": 5.8873, + "step": 3411 + }, + { + "epoch": 0.020292130554762585, + "grad_norm": 2.6651928424835205, + "learning_rate": 4.9949246797928704e-05, + "loss": 6.6325, + "step": 3412 + }, + { + "epoch": 0.02029807783804358, + "grad_norm": 2.919436454772949, + "learning_rate": 4.994921704509988e-05, + "loss": 6.3239, + "step": 3413 + }, + { + "epoch": 0.02030402512132458, + "grad_norm": 2.6901097297668457, + "learning_rate": 4.994918728356155e-05, + "loss": 6.1712, + "step": 3414 + }, + { + "epoch": 0.020309972404605577, + "grad_norm": 2.573249340057373, + "learning_rate": 4.9949157513313704e-05, + "loss": 5.8194, + "step": 3415 + }, + { + "epoch": 0.020315919687886572, + "grad_norm": 3.0603950023651123, + "learning_rate": 4.994912773435637e-05, + "loss": 6.3881, + "step": 3416 + }, + { + "epoch": 0.02032186697116757, + "grad_norm": 3.1800057888031006, + "learning_rate": 4.994909794668956e-05, + "loss": 5.9486, + "step": 3417 + }, + { + "epoch": 0.02032781425444857, + "grad_norm": 2.537182092666626, + "learning_rate": 4.994906815031327e-05, + "loss": 6.5454, + "step": 3418 + }, + { + "epoch": 0.020333761537729565, + "grad_norm": 2.474705457687378, + "learning_rate": 4.9949038345227525e-05, + "loss": 6.5356, + "step": 3419 + }, + { + "epoch": 0.020339708821010563, + "grad_norm": 3.054689645767212, + "learning_rate": 4.994900853143232e-05, + "loss": 6.4526, + "step": 3420 + }, + { + "epoch": 0.020345656104291558, + "grad_norm": 2.587644100189209, + "learning_rate": 4.994897870892769e-05, + "loss": 6.2811, + "step": 3421 + }, + { + "epoch": 0.020351603387572557, + "grad_norm": 2.110041618347168, + "learning_rate": 4.994894887771361e-05, + "loss": 6.0428, + "step": 3422 + }, + { + "epoch": 0.020357550670853555, + "grad_norm": 2.4931492805480957, + "learning_rate": 4.9948919037790115e-05, + "loss": 6.3683, + "step": 3423 + }, + { + "epoch": 0.02036349795413455, + "grad_norm": 2.7169463634490967, + "learning_rate": 4.994888918915721e-05, + "loss": 6.5335, + "step": 3424 + }, + { + "epoch": 0.02036944523741555, + "grad_norm": 2.164363145828247, + "learning_rate": 4.994885933181491e-05, + "loss": 6.0409, + "step": 3425 + }, + { + "epoch": 0.020375392520696547, + "grad_norm": 2.480468273162842, + "learning_rate": 4.994882946576322e-05, + "loss": 5.8816, + "step": 3426 + }, + { + "epoch": 0.020381339803977543, + "grad_norm": 2.928361415863037, + "learning_rate": 4.994879959100215e-05, + "loss": 6.1706, + "step": 3427 + }, + { + "epoch": 0.02038728708725854, + "grad_norm": 2.1536660194396973, + "learning_rate": 4.994876970753171e-05, + "loss": 6.0559, + "step": 3428 + }, + { + "epoch": 0.020393234370539536, + "grad_norm": 2.6913530826568604, + "learning_rate": 4.994873981535192e-05, + "loss": 6.7411, + "step": 3429 + }, + { + "epoch": 0.020399181653820535, + "grad_norm": 2.647124767303467, + "learning_rate": 4.994870991446278e-05, + "loss": 6.5251, + "step": 3430 + }, + { + "epoch": 0.020405128937101533, + "grad_norm": 2.621612310409546, + "learning_rate": 4.994868000486429e-05, + "loss": 6.7029, + "step": 3431 + }, + { + "epoch": 0.02041107622038253, + "grad_norm": 2.1986844539642334, + "learning_rate": 4.994865008655649e-05, + "loss": 6.4561, + "step": 3432 + }, + { + "epoch": 0.020417023503663527, + "grad_norm": 2.706897735595703, + "learning_rate": 4.994862015953936e-05, + "loss": 6.3125, + "step": 3433 + }, + { + "epoch": 0.020422970786944522, + "grad_norm": 2.403346300125122, + "learning_rate": 4.994859022381294e-05, + "loss": 6.0808, + "step": 3434 + }, + { + "epoch": 0.02042891807022552, + "grad_norm": 2.367835521697998, + "learning_rate": 4.994856027937722e-05, + "loss": 6.2634, + "step": 3435 + }, + { + "epoch": 0.02043486535350652, + "grad_norm": 2.8564250469207764, + "learning_rate": 4.9948530326232205e-05, + "loss": 6.579, + "step": 3436 + }, + { + "epoch": 0.020440812636787514, + "grad_norm": 2.9472100734710693, + "learning_rate": 4.9948500364377925e-05, + "loss": 6.3873, + "step": 3437 + }, + { + "epoch": 0.020446759920068513, + "grad_norm": 2.3005917072296143, + "learning_rate": 4.994847039381438e-05, + "loss": 6.2316, + "step": 3438 + }, + { + "epoch": 0.02045270720334951, + "grad_norm": 2.0548787117004395, + "learning_rate": 4.9948440414541584e-05, + "loss": 6.5022, + "step": 3439 + }, + { + "epoch": 0.020458654486630506, + "grad_norm": 2.1332197189331055, + "learning_rate": 4.9948410426559536e-05, + "loss": 6.1486, + "step": 3440 + }, + { + "epoch": 0.020464601769911505, + "grad_norm": 2.112738847732544, + "learning_rate": 4.994838042986827e-05, + "loss": 5.9125, + "step": 3441 + }, + { + "epoch": 0.0204705490531925, + "grad_norm": 2.714627981185913, + "learning_rate": 4.9948350424467774e-05, + "loss": 6.1164, + "step": 3442 + }, + { + "epoch": 0.0204764963364735, + "grad_norm": 2.337571382522583, + "learning_rate": 4.994832041035806e-05, + "loss": 6.0567, + "step": 3443 + }, + { + "epoch": 0.020482443619754497, + "grad_norm": 2.354389190673828, + "learning_rate": 4.994829038753915e-05, + "loss": 5.5922, + "step": 3444 + }, + { + "epoch": 0.020488390903035492, + "grad_norm": 2.3885531425476074, + "learning_rate": 4.994826035601106e-05, + "loss": 6.4178, + "step": 3445 + }, + { + "epoch": 0.02049433818631649, + "grad_norm": 2.931328058242798, + "learning_rate": 4.994823031577378e-05, + "loss": 6.356, + "step": 3446 + }, + { + "epoch": 0.02050028546959749, + "grad_norm": 2.4858877658843994, + "learning_rate": 4.994820026682733e-05, + "loss": 6.0601, + "step": 3447 + }, + { + "epoch": 0.020506232752878484, + "grad_norm": 2.626811981201172, + "learning_rate": 4.9948170209171725e-05, + "loss": 6.4372, + "step": 3448 + }, + { + "epoch": 0.020512180036159483, + "grad_norm": 2.2917356491088867, + "learning_rate": 4.994814014280696e-05, + "loss": 5.9828, + "step": 3449 + }, + { + "epoch": 0.020518127319440478, + "grad_norm": 2.174531936645508, + "learning_rate": 4.9948110067733075e-05, + "loss": 6.3382, + "step": 3450 + }, + { + "epoch": 0.020524074602721477, + "grad_norm": 2.9880006313323975, + "learning_rate": 4.994807998395005e-05, + "loss": 6.7493, + "step": 3451 + }, + { + "epoch": 0.020530021886002475, + "grad_norm": 2.6577212810516357, + "learning_rate": 4.994804989145792e-05, + "loss": 6.853, + "step": 3452 + }, + { + "epoch": 0.02053596916928347, + "grad_norm": 2.8832437992095947, + "learning_rate": 4.994801979025667e-05, + "loss": 6.5829, + "step": 3453 + }, + { + "epoch": 0.02054191645256447, + "grad_norm": 2.473177194595337, + "learning_rate": 4.994798968034633e-05, + "loss": 6.2879, + "step": 3454 + }, + { + "epoch": 0.020547863735845467, + "grad_norm": 2.7484633922576904, + "learning_rate": 4.994795956172691e-05, + "loss": 6.2037, + "step": 3455 + }, + { + "epoch": 0.020553811019126463, + "grad_norm": 1.6647555828094482, + "learning_rate": 4.9947929434398403e-05, + "loss": 6.5639, + "step": 3456 + }, + { + "epoch": 0.02055975830240746, + "grad_norm": 3.71087908744812, + "learning_rate": 4.994789929836084e-05, + "loss": 6.8464, + "step": 3457 + }, + { + "epoch": 0.020565705585688456, + "grad_norm": 2.705892324447632, + "learning_rate": 4.994786915361422e-05, + "loss": 6.8316, + "step": 3458 + }, + { + "epoch": 0.020571652868969455, + "grad_norm": 2.3619437217712402, + "learning_rate": 4.994783900015856e-05, + "loss": 6.3441, + "step": 3459 + }, + { + "epoch": 0.020577600152250453, + "grad_norm": 2.490499258041382, + "learning_rate": 4.9947808837993864e-05, + "loss": 6.1467, + "step": 3460 + }, + { + "epoch": 0.02058354743553145, + "grad_norm": 2.546614170074463, + "learning_rate": 4.994777866712015e-05, + "loss": 5.6677, + "step": 3461 + }, + { + "epoch": 0.020589494718812447, + "grad_norm": 2.473695755004883, + "learning_rate": 4.994774848753741e-05, + "loss": 5.7815, + "step": 3462 + }, + { + "epoch": 0.020595442002093442, + "grad_norm": 2.0494625568389893, + "learning_rate": 4.994771829924569e-05, + "loss": 5.674, + "step": 3463 + }, + { + "epoch": 0.02060138928537444, + "grad_norm": 2.1504273414611816, + "learning_rate": 4.9947688102244964e-05, + "loss": 5.5299, + "step": 3464 + }, + { + "epoch": 0.02060733656865544, + "grad_norm": 2.908170700073242, + "learning_rate": 4.994765789653526e-05, + "loss": 5.8448, + "step": 3465 + }, + { + "epoch": 0.020613283851936434, + "grad_norm": 3.1434714794158936, + "learning_rate": 4.994762768211659e-05, + "loss": 5.8413, + "step": 3466 + }, + { + "epoch": 0.020619231135217433, + "grad_norm": 2.4688189029693604, + "learning_rate": 4.994759745898896e-05, + "loss": 5.6458, + "step": 3467 + }, + { + "epoch": 0.02062517841849843, + "grad_norm": 2.172083854675293, + "learning_rate": 4.994756722715238e-05, + "loss": 5.723, + "step": 3468 + }, + { + "epoch": 0.020631125701779426, + "grad_norm": 2.0702707767486572, + "learning_rate": 4.994753698660687e-05, + "loss": 5.6199, + "step": 3469 + }, + { + "epoch": 0.020637072985060425, + "grad_norm": 2.2142136096954346, + "learning_rate": 4.9947506737352425e-05, + "loss": 5.5476, + "step": 3470 + }, + { + "epoch": 0.02064302026834142, + "grad_norm": 2.156874179840088, + "learning_rate": 4.994747647938907e-05, + "loss": 5.4773, + "step": 3471 + }, + { + "epoch": 0.02064896755162242, + "grad_norm": 3.3683371543884277, + "learning_rate": 4.9947446212716795e-05, + "loss": 6.4804, + "step": 3472 + }, + { + "epoch": 0.020654914834903417, + "grad_norm": 2.2435977458953857, + "learning_rate": 4.9947415937335635e-05, + "loss": 6.0622, + "step": 3473 + }, + { + "epoch": 0.020660862118184412, + "grad_norm": 3.0824263095855713, + "learning_rate": 4.994738565324558e-05, + "loss": 6.8809, + "step": 3474 + }, + { + "epoch": 0.02066680940146541, + "grad_norm": 2.6978909969329834, + "learning_rate": 4.9947355360446664e-05, + "loss": 6.823, + "step": 3475 + }, + { + "epoch": 0.02067275668474641, + "grad_norm": 3.041680097579956, + "learning_rate": 4.9947325058938874e-05, + "loss": 6.4268, + "step": 3476 + }, + { + "epoch": 0.020678703968027404, + "grad_norm": 3.5326781272888184, + "learning_rate": 4.9947294748722237e-05, + "loss": 6.3516, + "step": 3477 + }, + { + "epoch": 0.020684651251308403, + "grad_norm": 2.7611732482910156, + "learning_rate": 4.994726442979675e-05, + "loss": 6.2206, + "step": 3478 + }, + { + "epoch": 0.020690598534589398, + "grad_norm": 3.8533458709716797, + "learning_rate": 4.994723410216244e-05, + "loss": 6.7907, + "step": 3479 + }, + { + "epoch": 0.020696545817870397, + "grad_norm": 2.8091351985931396, + "learning_rate": 4.99472037658193e-05, + "loss": 6.7468, + "step": 3480 + }, + { + "epoch": 0.020702493101151395, + "grad_norm": 2.4317073822021484, + "learning_rate": 4.994717342076736e-05, + "loss": 6.4682, + "step": 3481 + }, + { + "epoch": 0.02070844038443239, + "grad_norm": 2.5132029056549072, + "learning_rate": 4.994714306700661e-05, + "loss": 6.1966, + "step": 3482 + }, + { + "epoch": 0.02071438766771339, + "grad_norm": 2.8161535263061523, + "learning_rate": 4.994711270453707e-05, + "loss": 5.6045, + "step": 3483 + }, + { + "epoch": 0.020720334950994387, + "grad_norm": 2.654115915298462, + "learning_rate": 4.994708233335875e-05, + "loss": 5.8983, + "step": 3484 + }, + { + "epoch": 0.020726282234275382, + "grad_norm": 2.5971553325653076, + "learning_rate": 4.9947051953471664e-05, + "loss": 5.4422, + "step": 3485 + }, + { + "epoch": 0.02073222951755638, + "grad_norm": 2.5758557319641113, + "learning_rate": 4.9947021564875816e-05, + "loss": 5.5921, + "step": 3486 + }, + { + "epoch": 0.020738176800837376, + "grad_norm": 2.635345458984375, + "learning_rate": 4.994699116757122e-05, + "loss": 6.2316, + "step": 3487 + }, + { + "epoch": 0.020744124084118375, + "grad_norm": 2.573514938354492, + "learning_rate": 4.9946960761557896e-05, + "loss": 6.5069, + "step": 3488 + }, + { + "epoch": 0.020750071367399373, + "grad_norm": 2.587735176086426, + "learning_rate": 4.994693034683584e-05, + "loss": 5.9114, + "step": 3489 + }, + { + "epoch": 0.02075601865068037, + "grad_norm": 2.4980244636535645, + "learning_rate": 4.9946899923405075e-05, + "loss": 6.1805, + "step": 3490 + }, + { + "epoch": 0.020761965933961367, + "grad_norm": 2.614003896713257, + "learning_rate": 4.9946869491265594e-05, + "loss": 6.2294, + "step": 3491 + }, + { + "epoch": 0.020767913217242365, + "grad_norm": 3.3819997310638428, + "learning_rate": 4.994683905041743e-05, + "loss": 5.4716, + "step": 3492 + }, + { + "epoch": 0.02077386050052336, + "grad_norm": 3.168170213699341, + "learning_rate": 4.994680860086057e-05, + "loss": 5.4041, + "step": 3493 + }, + { + "epoch": 0.02077980778380436, + "grad_norm": 3.05253267288208, + "learning_rate": 4.994677814259504e-05, + "loss": 5.4958, + "step": 3494 + }, + { + "epoch": 0.020785755067085354, + "grad_norm": 2.8560431003570557, + "learning_rate": 4.994674767562085e-05, + "loss": 5.4153, + "step": 3495 + }, + { + "epoch": 0.020791702350366353, + "grad_norm": 2.790382146835327, + "learning_rate": 4.994671719993801e-05, + "loss": 6.3581, + "step": 3496 + }, + { + "epoch": 0.02079764963364735, + "grad_norm": 2.9860496520996094, + "learning_rate": 4.9946686715546535e-05, + "loss": 6.5779, + "step": 3497 + }, + { + "epoch": 0.020803596916928346, + "grad_norm": 2.744859457015991, + "learning_rate": 4.994665622244642e-05, + "loss": 6.5748, + "step": 3498 + }, + { + "epoch": 0.020809544200209345, + "grad_norm": 2.7951292991638184, + "learning_rate": 4.9946625720637683e-05, + "loss": 6.1954, + "step": 3499 + }, + { + "epoch": 0.02081549148349034, + "grad_norm": 3.2961854934692383, + "learning_rate": 4.994659521012034e-05, + "loss": 6.243, + "step": 3500 + }, + { + "epoch": 0.02082143876677134, + "grad_norm": 2.934246301651001, + "learning_rate": 4.99465646908944e-05, + "loss": 6.1307, + "step": 3501 + }, + { + "epoch": 0.020827386050052337, + "grad_norm": 3.9152729511260986, + "learning_rate": 4.994653416295987e-05, + "loss": 6.0167, + "step": 3502 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 4.510169506072998, + "learning_rate": 4.994650362631676e-05, + "loss": 6.533, + "step": 3503 + }, + { + "epoch": 0.02083928061661433, + "grad_norm": 3.415665864944458, + "learning_rate": 4.994647308096509e-05, + "loss": 6.4978, + "step": 3504 + }, + { + "epoch": 0.02084522789989533, + "grad_norm": 2.6515185832977295, + "learning_rate": 4.9946442526904856e-05, + "loss": 6.3859, + "step": 3505 + }, + { + "epoch": 0.020851175183176324, + "grad_norm": 2.8215248584747314, + "learning_rate": 4.994641196413609e-05, + "loss": 6.243, + "step": 3506 + }, + { + "epoch": 0.020857122466457323, + "grad_norm": 2.644529104232788, + "learning_rate": 4.9946381392658773e-05, + "loss": 6.2954, + "step": 3507 + }, + { + "epoch": 0.020863069749738318, + "grad_norm": 3.349699020385742, + "learning_rate": 4.994635081247294e-05, + "loss": 6.5617, + "step": 3508 + }, + { + "epoch": 0.020869017033019317, + "grad_norm": 3.3669090270996094, + "learning_rate": 4.9946320223578596e-05, + "loss": 6.6458, + "step": 3509 + }, + { + "epoch": 0.020874964316300315, + "grad_norm": 2.5562078952789307, + "learning_rate": 4.994628962597575e-05, + "loss": 5.5041, + "step": 3510 + }, + { + "epoch": 0.02088091159958131, + "grad_norm": 2.851809501647949, + "learning_rate": 4.994625901966441e-05, + "loss": 5.4607, + "step": 3511 + }, + { + "epoch": 0.02088685888286231, + "grad_norm": 3.2769458293914795, + "learning_rate": 4.994622840464458e-05, + "loss": 5.3115, + "step": 3512 + }, + { + "epoch": 0.020892806166143307, + "grad_norm": 2.5495102405548096, + "learning_rate": 4.994619778091629e-05, + "loss": 5.9997, + "step": 3513 + }, + { + "epoch": 0.020898753449424302, + "grad_norm": 2.609463930130005, + "learning_rate": 4.994616714847954e-05, + "loss": 6.562, + "step": 3514 + }, + { + "epoch": 0.0209047007327053, + "grad_norm": 2.5731685161590576, + "learning_rate": 4.994613650733433e-05, + "loss": 6.5341, + "step": 3515 + }, + { + "epoch": 0.020910648015986296, + "grad_norm": 2.481297254562378, + "learning_rate": 4.99461058574807e-05, + "loss": 6.5878, + "step": 3516 + }, + { + "epoch": 0.020916595299267295, + "grad_norm": 2.4096593856811523, + "learning_rate": 4.9946075198918624e-05, + "loss": 6.5054, + "step": 3517 + }, + { + "epoch": 0.020922542582548293, + "grad_norm": 2.4417459964752197, + "learning_rate": 4.994604453164814e-05, + "loss": 6.3292, + "step": 3518 + }, + { + "epoch": 0.020928489865829288, + "grad_norm": 2.7062435150146484, + "learning_rate": 4.994601385566925e-05, + "loss": 5.564, + "step": 3519 + }, + { + "epoch": 0.020934437149110287, + "grad_norm": 2.613614559173584, + "learning_rate": 4.9945983170981955e-05, + "loss": 5.3929, + "step": 3520 + }, + { + "epoch": 0.020940384432391285, + "grad_norm": 2.4933719635009766, + "learning_rate": 4.994595247758629e-05, + "loss": 6.1841, + "step": 3521 + }, + { + "epoch": 0.02094633171567228, + "grad_norm": 2.251507043838501, + "learning_rate": 4.994592177548224e-05, + "loss": 6.3109, + "step": 3522 + }, + { + "epoch": 0.02095227899895328, + "grad_norm": 2.3830223083496094, + "learning_rate": 4.994589106466983e-05, + "loss": 5.9421, + "step": 3523 + }, + { + "epoch": 0.020958226282234274, + "grad_norm": 2.2940196990966797, + "learning_rate": 4.994586034514906e-05, + "loss": 6.0858, + "step": 3524 + }, + { + "epoch": 0.020964173565515273, + "grad_norm": 2.916836977005005, + "learning_rate": 4.994582961691996e-05, + "loss": 5.166, + "step": 3525 + }, + { + "epoch": 0.02097012084879627, + "grad_norm": 2.7183029651641846, + "learning_rate": 4.994579887998252e-05, + "loss": 6.9732, + "step": 3526 + }, + { + "epoch": 0.020976068132077266, + "grad_norm": 2.70143985748291, + "learning_rate": 4.994576813433676e-05, + "loss": 5.917, + "step": 3527 + }, + { + "epoch": 0.020982015415358265, + "grad_norm": 2.7375986576080322, + "learning_rate": 4.994573737998269e-05, + "loss": 5.3025, + "step": 3528 + }, + { + "epoch": 0.02098796269863926, + "grad_norm": 2.656982183456421, + "learning_rate": 4.994570661692033e-05, + "loss": 5.2383, + "step": 3529 + }, + { + "epoch": 0.02099390998192026, + "grad_norm": 2.2119734287261963, + "learning_rate": 4.994567584514968e-05, + "loss": 6.0456, + "step": 3530 + }, + { + "epoch": 0.020999857265201257, + "grad_norm": 2.9191582202911377, + "learning_rate": 4.9945645064670737e-05, + "loss": 6.3808, + "step": 3531 + }, + { + "epoch": 0.021005804548482252, + "grad_norm": 3.124101400375366, + "learning_rate": 4.994561427548354e-05, + "loss": 5.3631, + "step": 3532 + }, + { + "epoch": 0.02101175183176325, + "grad_norm": 2.803938150405884, + "learning_rate": 4.994558347758808e-05, + "loss": 5.3172, + "step": 3533 + }, + { + "epoch": 0.02101769911504425, + "grad_norm": 2.6231577396392822, + "learning_rate": 4.994555267098438e-05, + "loss": 6.4466, + "step": 3534 + }, + { + "epoch": 0.021023646398325244, + "grad_norm": 2.735590696334839, + "learning_rate": 4.994552185567244e-05, + "loss": 5.3115, + "step": 3535 + }, + { + "epoch": 0.021029593681606243, + "grad_norm": 2.730459690093994, + "learning_rate": 4.994549103165228e-05, + "loss": 5.2311, + "step": 3536 + }, + { + "epoch": 0.021035540964887238, + "grad_norm": 2.1241424083709717, + "learning_rate": 4.994546019892391e-05, + "loss": 5.6599, + "step": 3537 + }, + { + "epoch": 0.021041488248168237, + "grad_norm": 2.607807159423828, + "learning_rate": 4.994542935748733e-05, + "loss": 6.1182, + "step": 3538 + }, + { + "epoch": 0.021047435531449235, + "grad_norm": 2.6896564960479736, + "learning_rate": 4.9945398507342567e-05, + "loss": 6.2827, + "step": 3539 + }, + { + "epoch": 0.02105338281473023, + "grad_norm": 2.9237961769104004, + "learning_rate": 4.994536764848962e-05, + "loss": 5.9629, + "step": 3540 + }, + { + "epoch": 0.02105933009801123, + "grad_norm": 2.7576143741607666, + "learning_rate": 4.99453367809285e-05, + "loss": 5.7612, + "step": 3541 + }, + { + "epoch": 0.021065277381292227, + "grad_norm": 3.1622097492218018, + "learning_rate": 4.9945305904659226e-05, + "loss": 6.0415, + "step": 3542 + }, + { + "epoch": 0.021071224664573222, + "grad_norm": 2.471127510070801, + "learning_rate": 4.994527501968179e-05, + "loss": 6.1264, + "step": 3543 + }, + { + "epoch": 0.02107717194785422, + "grad_norm": 2.797504425048828, + "learning_rate": 4.994524412599623e-05, + "loss": 6.3515, + "step": 3544 + }, + { + "epoch": 0.021083119231135216, + "grad_norm": 2.4932103157043457, + "learning_rate": 4.9945213223602535e-05, + "loss": 6.4327, + "step": 3545 + }, + { + "epoch": 0.021089066514416215, + "grad_norm": 2.5194599628448486, + "learning_rate": 4.9945182312500725e-05, + "loss": 6.4003, + "step": 3546 + }, + { + "epoch": 0.021095013797697213, + "grad_norm": 2.287858247756958, + "learning_rate": 4.9945151392690814e-05, + "loss": 6.3287, + "step": 3547 + }, + { + "epoch": 0.021100961080978208, + "grad_norm": 2.941619873046875, + "learning_rate": 4.994512046417281e-05, + "loss": 6.1364, + "step": 3548 + }, + { + "epoch": 0.021106908364259207, + "grad_norm": 3.1448967456817627, + "learning_rate": 4.994508952694672e-05, + "loss": 5.8638, + "step": 3549 + }, + { + "epoch": 0.021112855647540205, + "grad_norm": 2.869966983795166, + "learning_rate": 4.994505858101255e-05, + "loss": 6.0122, + "step": 3550 + }, + { + "epoch": 0.0211188029308212, + "grad_norm": 2.421264886856079, + "learning_rate": 4.9945027626370325e-05, + "loss": 6.1243, + "step": 3551 + }, + { + "epoch": 0.0211247502141022, + "grad_norm": 2.599456310272217, + "learning_rate": 4.9944996663020047e-05, + "loss": 5.9484, + "step": 3552 + }, + { + "epoch": 0.021130697497383194, + "grad_norm": 3.1029574871063232, + "learning_rate": 4.994496569096173e-05, + "loss": 5.9347, + "step": 3553 + }, + { + "epoch": 0.021136644780664193, + "grad_norm": 3.02494478225708, + "learning_rate": 4.994493471019538e-05, + "loss": 5.814, + "step": 3554 + }, + { + "epoch": 0.02114259206394519, + "grad_norm": 2.359682559967041, + "learning_rate": 4.994490372072101e-05, + "loss": 5.8533, + "step": 3555 + }, + { + "epoch": 0.021148539347226186, + "grad_norm": 2.7072582244873047, + "learning_rate": 4.994487272253864e-05, + "loss": 5.855, + "step": 3556 + }, + { + "epoch": 0.021154486630507185, + "grad_norm": 2.3102664947509766, + "learning_rate": 4.994484171564826e-05, + "loss": 5.6701, + "step": 3557 + }, + { + "epoch": 0.02116043391378818, + "grad_norm": 2.3804259300231934, + "learning_rate": 4.9944810700049906e-05, + "loss": 5.5096, + "step": 3558 + }, + { + "epoch": 0.02116638119706918, + "grad_norm": 2.463280439376831, + "learning_rate": 4.994477967574357e-05, + "loss": 5.5178, + "step": 3559 + }, + { + "epoch": 0.021172328480350177, + "grad_norm": 2.884152412414551, + "learning_rate": 4.9944748642729265e-05, + "loss": 6.1013, + "step": 3560 + }, + { + "epoch": 0.021178275763631172, + "grad_norm": 3.009460210800171, + "learning_rate": 4.9944717601007006e-05, + "loss": 6.2725, + "step": 3561 + }, + { + "epoch": 0.02118422304691217, + "grad_norm": 2.5930371284484863, + "learning_rate": 4.9944686550576814e-05, + "loss": 6.1138, + "step": 3562 + }, + { + "epoch": 0.02119017033019317, + "grad_norm": 2.8212878704071045, + "learning_rate": 4.9944655491438684e-05, + "loss": 5.6209, + "step": 3563 + }, + { + "epoch": 0.021196117613474164, + "grad_norm": 2.9814743995666504, + "learning_rate": 4.9944624423592634e-05, + "loss": 5.8912, + "step": 3564 + }, + { + "epoch": 0.021202064896755163, + "grad_norm": 3.1456093788146973, + "learning_rate": 4.994459334703867e-05, + "loss": 5.961, + "step": 3565 + }, + { + "epoch": 0.021208012180036158, + "grad_norm": 2.9300050735473633, + "learning_rate": 4.9944562261776805e-05, + "loss": 6.773, + "step": 3566 + }, + { + "epoch": 0.021213959463317156, + "grad_norm": 2.570685625076294, + "learning_rate": 4.994453116780705e-05, + "loss": 6.3575, + "step": 3567 + }, + { + "epoch": 0.021219906746598155, + "grad_norm": 2.7060914039611816, + "learning_rate": 4.994450006512943e-05, + "loss": 6.249, + "step": 3568 + }, + { + "epoch": 0.02122585402987915, + "grad_norm": 3.0027518272399902, + "learning_rate": 4.994446895374393e-05, + "loss": 5.8243, + "step": 3569 + }, + { + "epoch": 0.02123180131316015, + "grad_norm": 2.785888195037842, + "learning_rate": 4.994443783365058e-05, + "loss": 5.9836, + "step": 3570 + }, + { + "epoch": 0.021237748596441147, + "grad_norm": 2.5480010509490967, + "learning_rate": 4.994440670484938e-05, + "loss": 6.4237, + "step": 3571 + }, + { + "epoch": 0.021243695879722142, + "grad_norm": 2.687121629714966, + "learning_rate": 4.9944375567340345e-05, + "loss": 6.4497, + "step": 3572 + }, + { + "epoch": 0.02124964316300314, + "grad_norm": 2.6066362857818604, + "learning_rate": 4.994434442112349e-05, + "loss": 6.3853, + "step": 3573 + }, + { + "epoch": 0.021255590446284136, + "grad_norm": 2.880352020263672, + "learning_rate": 4.994431326619882e-05, + "loss": 6.382, + "step": 3574 + }, + { + "epoch": 0.021261537729565134, + "grad_norm": 3.0415213108062744, + "learning_rate": 4.9944282102566345e-05, + "loss": 6.4472, + "step": 3575 + }, + { + "epoch": 0.021267485012846133, + "grad_norm": 2.4917140007019043, + "learning_rate": 4.994425093022609e-05, + "loss": 6.2546, + "step": 3576 + }, + { + "epoch": 0.021273432296127128, + "grad_norm": 2.53648042678833, + "learning_rate": 4.9944219749178044e-05, + "loss": 6.37, + "step": 3577 + }, + { + "epoch": 0.021279379579408127, + "grad_norm": 2.796342134475708, + "learning_rate": 4.994418855942223e-05, + "loss": 6.1691, + "step": 3578 + }, + { + "epoch": 0.021285326862689125, + "grad_norm": 2.9148125648498535, + "learning_rate": 4.9944157360958656e-05, + "loss": 6.2552, + "step": 3579 + }, + { + "epoch": 0.02129127414597012, + "grad_norm": 3.0777838230133057, + "learning_rate": 4.994412615378734e-05, + "loss": 6.2359, + "step": 3580 + }, + { + "epoch": 0.02129722142925112, + "grad_norm": 2.5878093242645264, + "learning_rate": 4.994409493790828e-05, + "loss": 6.0746, + "step": 3581 + }, + { + "epoch": 0.021303168712532114, + "grad_norm": 3.2084906101226807, + "learning_rate": 4.99440637133215e-05, + "loss": 6.1357, + "step": 3582 + }, + { + "epoch": 0.021309115995813113, + "grad_norm": 3.7210965156555176, + "learning_rate": 4.9944032480027004e-05, + "loss": 6.5117, + "step": 3583 + }, + { + "epoch": 0.02131506327909411, + "grad_norm": 2.8332109451293945, + "learning_rate": 4.994400123802481e-05, + "loss": 6.0908, + "step": 3584 + }, + { + "epoch": 0.021321010562375106, + "grad_norm": 2.83854341506958, + "learning_rate": 4.994396998731491e-05, + "loss": 6.1522, + "step": 3585 + }, + { + "epoch": 0.021326957845656105, + "grad_norm": 2.5171611309051514, + "learning_rate": 4.9943938727897335e-05, + "loss": 6.2253, + "step": 3586 + }, + { + "epoch": 0.0213329051289371, + "grad_norm": 2.2111763954162598, + "learning_rate": 4.9943907459772086e-05, + "loss": 5.7673, + "step": 3587 + }, + { + "epoch": 0.0213388524122181, + "grad_norm": 2.5147926807403564, + "learning_rate": 4.994387618293918e-05, + "loss": 6.8327, + "step": 3588 + }, + { + "epoch": 0.021344799695499097, + "grad_norm": 2.969285488128662, + "learning_rate": 4.9943844897398626e-05, + "loss": 6.9995, + "step": 3589 + }, + { + "epoch": 0.021350746978780092, + "grad_norm": 4.00917911529541, + "learning_rate": 4.994381360315043e-05, + "loss": 6.6377, + "step": 3590 + }, + { + "epoch": 0.02135669426206109, + "grad_norm": 3.899319887161255, + "learning_rate": 4.994378230019461e-05, + "loss": 6.162, + "step": 3591 + }, + { + "epoch": 0.02136264154534209, + "grad_norm": 2.9522764682769775, + "learning_rate": 4.994375098853117e-05, + "loss": 6.4405, + "step": 3592 + }, + { + "epoch": 0.021368588828623084, + "grad_norm": 3.0569825172424316, + "learning_rate": 4.994371966816012e-05, + "loss": 6.2631, + "step": 3593 + }, + { + "epoch": 0.021374536111904083, + "grad_norm": 2.9470009803771973, + "learning_rate": 4.994368833908148e-05, + "loss": 6.4785, + "step": 3594 + }, + { + "epoch": 0.021380483395185078, + "grad_norm": 2.913940668106079, + "learning_rate": 4.994365700129525e-05, + "loss": 6.6566, + "step": 3595 + }, + { + "epoch": 0.021386430678466076, + "grad_norm": 2.6037404537200928, + "learning_rate": 4.9943625654801465e-05, + "loss": 6.2535, + "step": 3596 + }, + { + "epoch": 0.021392377961747075, + "grad_norm": 2.998276948928833, + "learning_rate": 4.99435942996001e-05, + "loss": 6.8851, + "step": 3597 + }, + { + "epoch": 0.02139832524502807, + "grad_norm": 2.2189996242523193, + "learning_rate": 4.994356293569119e-05, + "loss": 6.8707, + "step": 3598 + }, + { + "epoch": 0.02140427252830907, + "grad_norm": 2.4528486728668213, + "learning_rate": 4.994353156307474e-05, + "loss": 6.9166, + "step": 3599 + }, + { + "epoch": 0.021410219811590067, + "grad_norm": 3.0538241863250732, + "learning_rate": 4.994350018175076e-05, + "loss": 6.3258, + "step": 3600 + }, + { + "epoch": 0.021416167094871062, + "grad_norm": 3.789745569229126, + "learning_rate": 4.994346879171926e-05, + "loss": 6.1962, + "step": 3601 + }, + { + "epoch": 0.02142211437815206, + "grad_norm": 3.2789254188537598, + "learning_rate": 4.994343739298025e-05, + "loss": 6.2126, + "step": 3602 + }, + { + "epoch": 0.021428061661433056, + "grad_norm": 3.0887696743011475, + "learning_rate": 4.994340598553375e-05, + "loss": 6.2395, + "step": 3603 + }, + { + "epoch": 0.021434008944714054, + "grad_norm": 2.9189252853393555, + "learning_rate": 4.994337456937977e-05, + "loss": 6.193, + "step": 3604 + }, + { + "epoch": 0.021439956227995053, + "grad_norm": 2.8582170009613037, + "learning_rate": 4.9943343144518306e-05, + "loss": 6.1077, + "step": 3605 + }, + { + "epoch": 0.021445903511276048, + "grad_norm": 3.076979160308838, + "learning_rate": 4.994331171094938e-05, + "loss": 6.0474, + "step": 3606 + }, + { + "epoch": 0.021451850794557047, + "grad_norm": 3.482161045074463, + "learning_rate": 4.994328026867301e-05, + "loss": 6.0551, + "step": 3607 + }, + { + "epoch": 0.021457798077838045, + "grad_norm": 3.001046895980835, + "learning_rate": 4.994324881768919e-05, + "loss": 6.0393, + "step": 3608 + }, + { + "epoch": 0.02146374536111904, + "grad_norm": 2.8006365299224854, + "learning_rate": 4.994321735799794e-05, + "loss": 6.0042, + "step": 3609 + }, + { + "epoch": 0.02146969264440004, + "grad_norm": 3.10727858543396, + "learning_rate": 4.994318588959927e-05, + "loss": 5.8981, + "step": 3610 + }, + { + "epoch": 0.021475639927681034, + "grad_norm": 2.660557985305786, + "learning_rate": 4.9943154412493194e-05, + "loss": 6.0426, + "step": 3611 + }, + { + "epoch": 0.021481587210962032, + "grad_norm": 2.8504562377929688, + "learning_rate": 4.994312292667972e-05, + "loss": 6.9774, + "step": 3612 + }, + { + "epoch": 0.02148753449424303, + "grad_norm": 3.0076539516448975, + "learning_rate": 4.994309143215886e-05, + "loss": 6.3238, + "step": 3613 + }, + { + "epoch": 0.021493481777524026, + "grad_norm": 2.2966883182525635, + "learning_rate": 4.9943059928930626e-05, + "loss": 7.0015, + "step": 3614 + }, + { + "epoch": 0.021499429060805025, + "grad_norm": 2.5054080486297607, + "learning_rate": 4.994302841699502e-05, + "loss": 6.9226, + "step": 3615 + }, + { + "epoch": 0.021505376344086023, + "grad_norm": 2.856278657913208, + "learning_rate": 4.9942996896352066e-05, + "loss": 6.7836, + "step": 3616 + }, + { + "epoch": 0.02151132362736702, + "grad_norm": 2.4902377128601074, + "learning_rate": 4.994296536700177e-05, + "loss": 6.7077, + "step": 3617 + }, + { + "epoch": 0.021517270910648017, + "grad_norm": 2.477932929992676, + "learning_rate": 4.994293382894414e-05, + "loss": 6.8284, + "step": 3618 + }, + { + "epoch": 0.021523218193929012, + "grad_norm": 2.3034260272979736, + "learning_rate": 4.994290228217919e-05, + "loss": 6.8012, + "step": 3619 + }, + { + "epoch": 0.02152916547721001, + "grad_norm": 2.3850560188293457, + "learning_rate": 4.9942870726706934e-05, + "loss": 6.6208, + "step": 3620 + }, + { + "epoch": 0.02153511276049101, + "grad_norm": 2.4397644996643066, + "learning_rate": 4.994283916252738e-05, + "loss": 6.7522, + "step": 3621 + }, + { + "epoch": 0.021541060043772004, + "grad_norm": 2.400846242904663, + "learning_rate": 4.994280758964053e-05, + "loss": 6.7529, + "step": 3622 + }, + { + "epoch": 0.021547007327053003, + "grad_norm": 2.358290195465088, + "learning_rate": 4.994277600804641e-05, + "loss": 6.6812, + "step": 3623 + }, + { + "epoch": 0.021552954610333998, + "grad_norm": 2.7409300804138184, + "learning_rate": 4.994274441774503e-05, + "loss": 6.668, + "step": 3624 + }, + { + "epoch": 0.021558901893614996, + "grad_norm": 2.6890954971313477, + "learning_rate": 4.994271281873639e-05, + "loss": 6.5537, + "step": 3625 + }, + { + "epoch": 0.021564849176895995, + "grad_norm": 2.8959596157073975, + "learning_rate": 4.9942681211020505e-05, + "loss": 6.4492, + "step": 3626 + }, + { + "epoch": 0.02157079646017699, + "grad_norm": 2.4325244426727295, + "learning_rate": 4.994264959459738e-05, + "loss": 6.9819, + "step": 3627 + }, + { + "epoch": 0.02157674374345799, + "grad_norm": 2.92891263961792, + "learning_rate": 4.9942617969467045e-05, + "loss": 6.9266, + "step": 3628 + }, + { + "epoch": 0.021582691026738987, + "grad_norm": 2.4398467540740967, + "learning_rate": 4.994258633562951e-05, + "loss": 6.514, + "step": 3629 + }, + { + "epoch": 0.021588638310019982, + "grad_norm": 2.577467203140259, + "learning_rate": 4.9942554693084756e-05, + "loss": 6.7248, + "step": 3630 + }, + { + "epoch": 0.02159458559330098, + "grad_norm": 2.3682591915130615, + "learning_rate": 4.9942523041832824e-05, + "loss": 6.7798, + "step": 3631 + }, + { + "epoch": 0.021600532876581976, + "grad_norm": 2.1863434314727783, + "learning_rate": 4.9942491381873705e-05, + "loss": 6.6636, + "step": 3632 + }, + { + "epoch": 0.021606480159862974, + "grad_norm": 2.0172441005706787, + "learning_rate": 4.9942459713207426e-05, + "loss": 6.6772, + "step": 3633 + }, + { + "epoch": 0.021612427443143973, + "grad_norm": 1.8671952486038208, + "learning_rate": 4.9942428035834e-05, + "loss": 6.3648, + "step": 3634 + }, + { + "epoch": 0.021618374726424968, + "grad_norm": 3.226900815963745, + "learning_rate": 4.9942396349753416e-05, + "loss": 6.4127, + "step": 3635 + }, + { + "epoch": 0.021624322009705967, + "grad_norm": 2.7766973972320557, + "learning_rate": 4.994236465496571e-05, + "loss": 6.4476, + "step": 3636 + }, + { + "epoch": 0.021630269292986965, + "grad_norm": 2.157118082046509, + "learning_rate": 4.9942332951470875e-05, + "loss": 6.5876, + "step": 3637 + }, + { + "epoch": 0.02163621657626796, + "grad_norm": 2.3870396614074707, + "learning_rate": 4.994230123926893e-05, + "loss": 6.5861, + "step": 3638 + }, + { + "epoch": 0.02164216385954896, + "grad_norm": 2.8139939308166504, + "learning_rate": 4.994226951835989e-05, + "loss": 6.4845, + "step": 3639 + }, + { + "epoch": 0.021648111142829954, + "grad_norm": 2.856207847595215, + "learning_rate": 4.9942237788743764e-05, + "loss": 6.1514, + "step": 3640 + }, + { + "epoch": 0.021654058426110952, + "grad_norm": 3.523162603378296, + "learning_rate": 4.9942206050420545e-05, + "loss": 5.8114, + "step": 3641 + }, + { + "epoch": 0.02166000570939195, + "grad_norm": 2.746587038040161, + "learning_rate": 4.9942174303390274e-05, + "loss": 5.7397, + "step": 3642 + }, + { + "epoch": 0.021665952992672946, + "grad_norm": 2.902067184448242, + "learning_rate": 4.9942142547652946e-05, + "loss": 6.4353, + "step": 3643 + }, + { + "epoch": 0.021671900275953945, + "grad_norm": 2.981391191482544, + "learning_rate": 4.994211078320857e-05, + "loss": 6.2153, + "step": 3644 + }, + { + "epoch": 0.021677847559234943, + "grad_norm": 2.6004254817962646, + "learning_rate": 4.994207901005716e-05, + "loss": 6.2365, + "step": 3645 + }, + { + "epoch": 0.021683794842515938, + "grad_norm": 2.748678684234619, + "learning_rate": 4.994204722819873e-05, + "loss": 5.8126, + "step": 3646 + }, + { + "epoch": 0.021689742125796937, + "grad_norm": 2.675466299057007, + "learning_rate": 4.994201543763329e-05, + "loss": 6.3032, + "step": 3647 + }, + { + "epoch": 0.021695689409077932, + "grad_norm": 2.681823253631592, + "learning_rate": 4.9941983638360855e-05, + "loss": 6.2706, + "step": 3648 + }, + { + "epoch": 0.02170163669235893, + "grad_norm": 2.481586217880249, + "learning_rate": 4.994195183038142e-05, + "loss": 6.1792, + "step": 3649 + }, + { + "epoch": 0.02170758397563993, + "grad_norm": 2.3379831314086914, + "learning_rate": 4.9941920013695024e-05, + "loss": 6.2689, + "step": 3650 + }, + { + "epoch": 0.021713531258920924, + "grad_norm": 2.5885238647460938, + "learning_rate": 4.994188818830164e-05, + "loss": 6.3018, + "step": 3651 + }, + { + "epoch": 0.021719478542201923, + "grad_norm": 2.341939687728882, + "learning_rate": 4.994185635420131e-05, + "loss": 5.6178, + "step": 3652 + }, + { + "epoch": 0.021725425825482918, + "grad_norm": 2.4126031398773193, + "learning_rate": 4.9941824511394044e-05, + "loss": 5.4044, + "step": 3653 + }, + { + "epoch": 0.021731373108763916, + "grad_norm": 2.2289719581604004, + "learning_rate": 4.994179265987983e-05, + "loss": 5.4134, + "step": 3654 + }, + { + "epoch": 0.021737320392044915, + "grad_norm": 2.5151331424713135, + "learning_rate": 4.994176079965871e-05, + "loss": 5.3321, + "step": 3655 + }, + { + "epoch": 0.02174326767532591, + "grad_norm": 2.0761523246765137, + "learning_rate": 4.9941728930730665e-05, + "loss": 5.3363, + "step": 3656 + }, + { + "epoch": 0.02174921495860691, + "grad_norm": 2.272510051727295, + "learning_rate": 4.994169705309573e-05, + "loss": 6.0208, + "step": 3657 + }, + { + "epoch": 0.021755162241887907, + "grad_norm": 2.6145198345184326, + "learning_rate": 4.994166516675389e-05, + "loss": 6.299, + "step": 3658 + }, + { + "epoch": 0.021761109525168902, + "grad_norm": 2.978618621826172, + "learning_rate": 4.994163327170519e-05, + "loss": 5.1248, + "step": 3659 + }, + { + "epoch": 0.0217670568084499, + "grad_norm": 2.398813247680664, + "learning_rate": 4.994160136794962e-05, + "loss": 5.1217, + "step": 3660 + }, + { + "epoch": 0.021773004091730896, + "grad_norm": 2.1145291328430176, + "learning_rate": 4.994156945548719e-05, + "loss": 5.2676, + "step": 3661 + }, + { + "epoch": 0.021778951375011894, + "grad_norm": 2.045334577560425, + "learning_rate": 4.9941537534317915e-05, + "loss": 5.2088, + "step": 3662 + }, + { + "epoch": 0.021784898658292893, + "grad_norm": 2.0598506927490234, + "learning_rate": 4.9941505604441806e-05, + "loss": 5.363, + "step": 3663 + }, + { + "epoch": 0.021790845941573888, + "grad_norm": 2.189143657684326, + "learning_rate": 4.9941473665858884e-05, + "loss": 6.0592, + "step": 3664 + }, + { + "epoch": 0.021796793224854887, + "grad_norm": 6.8580780029296875, + "learning_rate": 4.994144171856915e-05, + "loss": 6.0323, + "step": 3665 + }, + { + "epoch": 0.021802740508135885, + "grad_norm": 2.0607001781463623, + "learning_rate": 4.994140976257261e-05, + "loss": 6.0883, + "step": 3666 + }, + { + "epoch": 0.02180868779141688, + "grad_norm": 2.1669631004333496, + "learning_rate": 4.9941377797869284e-05, + "loss": 6.0546, + "step": 3667 + }, + { + "epoch": 0.02181463507469788, + "grad_norm": 2.912822961807251, + "learning_rate": 4.994134582445917e-05, + "loss": 6.0285, + "step": 3668 + }, + { + "epoch": 0.021820582357978874, + "grad_norm": 2.3223111629486084, + "learning_rate": 4.994131384234231e-05, + "loss": 6.0948, + "step": 3669 + }, + { + "epoch": 0.021826529641259872, + "grad_norm": 2.067002296447754, + "learning_rate": 4.994128185151868e-05, + "loss": 6.2908, + "step": 3670 + }, + { + "epoch": 0.02183247692454087, + "grad_norm": 2.593642473220825, + "learning_rate": 4.9941249851988317e-05, + "loss": 6.2878, + "step": 3671 + }, + { + "epoch": 0.021838424207821866, + "grad_norm": 2.6345975399017334, + "learning_rate": 4.994121784375121e-05, + "loss": 6.0796, + "step": 3672 + }, + { + "epoch": 0.021844371491102865, + "grad_norm": 2.398861885070801, + "learning_rate": 4.994118582680739e-05, + "loss": 6.096, + "step": 3673 + }, + { + "epoch": 0.021850318774383863, + "grad_norm": 2.102933883666992, + "learning_rate": 4.994115380115686e-05, + "loss": 6.1347, + "step": 3674 + }, + { + "epoch": 0.021856266057664858, + "grad_norm": 2.43632435798645, + "learning_rate": 4.994112176679963e-05, + "loss": 6.074, + "step": 3675 + }, + { + "epoch": 0.021862213340945857, + "grad_norm": 2.304213523864746, + "learning_rate": 4.9941089723735706e-05, + "loss": 5.8897, + "step": 3676 + }, + { + "epoch": 0.021868160624226852, + "grad_norm": 2.6283092498779297, + "learning_rate": 4.9941057671965106e-05, + "loss": 5.9605, + "step": 3677 + }, + { + "epoch": 0.02187410790750785, + "grad_norm": 2.0781428813934326, + "learning_rate": 4.994102561148785e-05, + "loss": 6.0645, + "step": 3678 + }, + { + "epoch": 0.02188005519078885, + "grad_norm": 2.229210376739502, + "learning_rate": 4.994099354230393e-05, + "loss": 6.223, + "step": 3679 + }, + { + "epoch": 0.021886002474069844, + "grad_norm": 2.4410789012908936, + "learning_rate": 4.9940961464413374e-05, + "loss": 6.1115, + "step": 3680 + }, + { + "epoch": 0.021891949757350843, + "grad_norm": 2.99076771736145, + "learning_rate": 4.994092937781618e-05, + "loss": 5.9028, + "step": 3681 + }, + { + "epoch": 0.021897897040631838, + "grad_norm": 2.8403074741363525, + "learning_rate": 4.994089728251237e-05, + "loss": 5.7286, + "step": 3682 + }, + { + "epoch": 0.021903844323912836, + "grad_norm": 2.0928149223327637, + "learning_rate": 4.994086517850195e-05, + "loss": 5.849, + "step": 3683 + }, + { + "epoch": 0.021909791607193835, + "grad_norm": 2.320279836654663, + "learning_rate": 4.994083306578492e-05, + "loss": 5.6767, + "step": 3684 + }, + { + "epoch": 0.02191573889047483, + "grad_norm": 3.0701658725738525, + "learning_rate": 4.994080094436132e-05, + "loss": 5.9555, + "step": 3685 + }, + { + "epoch": 0.02192168617375583, + "grad_norm": 2.1042048931121826, + "learning_rate": 4.994076881423113e-05, + "loss": 5.7651, + "step": 3686 + }, + { + "epoch": 0.021927633457036827, + "grad_norm": 2.35819673538208, + "learning_rate": 4.9940736675394385e-05, + "loss": 6.0203, + "step": 3687 + }, + { + "epoch": 0.021933580740317822, + "grad_norm": 2.659224510192871, + "learning_rate": 4.994070452785108e-05, + "loss": 5.9935, + "step": 3688 + }, + { + "epoch": 0.02193952802359882, + "grad_norm": 2.4628207683563232, + "learning_rate": 4.994067237160124e-05, + "loss": 5.9135, + "step": 3689 + }, + { + "epoch": 0.021945475306879816, + "grad_norm": 3.7227911949157715, + "learning_rate": 4.9940640206644865e-05, + "loss": 5.8365, + "step": 3690 + }, + { + "epoch": 0.021951422590160814, + "grad_norm": 3.5226151943206787, + "learning_rate": 4.994060803298197e-05, + "loss": 5.7807, + "step": 3691 + }, + { + "epoch": 0.021957369873441813, + "grad_norm": 2.3665735721588135, + "learning_rate": 4.994057585061256e-05, + "loss": 5.9632, + "step": 3692 + }, + { + "epoch": 0.021963317156722808, + "grad_norm": 2.877263069152832, + "learning_rate": 4.9940543659536666e-05, + "loss": 5.6425, + "step": 3693 + }, + { + "epoch": 0.021969264440003806, + "grad_norm": 2.5431532859802246, + "learning_rate": 4.994051145975428e-05, + "loss": 5.6531, + "step": 3694 + }, + { + "epoch": 0.021975211723284805, + "grad_norm": 2.7033538818359375, + "learning_rate": 4.9940479251265415e-05, + "loss": 5.6907, + "step": 3695 + }, + { + "epoch": 0.0219811590065658, + "grad_norm": 3.6627206802368164, + "learning_rate": 4.9940447034070093e-05, + "loss": 5.9118, + "step": 3696 + }, + { + "epoch": 0.0219871062898468, + "grad_norm": 3.896959066390991, + "learning_rate": 4.994041480816831e-05, + "loss": 5.9926, + "step": 3697 + }, + { + "epoch": 0.021993053573127794, + "grad_norm": 3.37575626373291, + "learning_rate": 4.994038257356009e-05, + "loss": 5.9768, + "step": 3698 + }, + { + "epoch": 0.021999000856408792, + "grad_norm": 2.7694313526153564, + "learning_rate": 4.9940350330245444e-05, + "loss": 5.8486, + "step": 3699 + }, + { + "epoch": 0.02200494813968979, + "grad_norm": 2.3815293312072754, + "learning_rate": 4.9940318078224376e-05, + "loss": 6.0663, + "step": 3700 + }, + { + "epoch": 0.022010895422970786, + "grad_norm": 2.3171627521514893, + "learning_rate": 4.99402858174969e-05, + "loss": 5.8543, + "step": 3701 + }, + { + "epoch": 0.022016842706251784, + "grad_norm": 2.5090551376342773, + "learning_rate": 4.994025354806303e-05, + "loss": 5.7005, + "step": 3702 + }, + { + "epoch": 0.022022789989532783, + "grad_norm": 2.7024855613708496, + "learning_rate": 4.9940221269922774e-05, + "loss": 5.7375, + "step": 3703 + }, + { + "epoch": 0.022028737272813778, + "grad_norm": 2.7900679111480713, + "learning_rate": 4.994018898307614e-05, + "loss": 6.0094, + "step": 3704 + }, + { + "epoch": 0.022034684556094777, + "grad_norm": 2.3678438663482666, + "learning_rate": 4.994015668752315e-05, + "loss": 5.822, + "step": 3705 + }, + { + "epoch": 0.022040631839375772, + "grad_norm": 2.5406653881073, + "learning_rate": 4.9940124383263807e-05, + "loss": 5.8984, + "step": 3706 + }, + { + "epoch": 0.02204657912265677, + "grad_norm": 2.371800422668457, + "learning_rate": 4.994009207029813e-05, + "loss": 5.9821, + "step": 3707 + }, + { + "epoch": 0.02205252640593777, + "grad_norm": 2.004669666290283, + "learning_rate": 4.994005974862612e-05, + "loss": 5.8801, + "step": 3708 + }, + { + "epoch": 0.022058473689218764, + "grad_norm": 2.777472972869873, + "learning_rate": 4.9940027418247787e-05, + "loss": 5.8821, + "step": 3709 + }, + { + "epoch": 0.022064420972499763, + "grad_norm": 2.599883556365967, + "learning_rate": 4.9939995079163156e-05, + "loss": 5.8716, + "step": 3710 + }, + { + "epoch": 0.022070368255780758, + "grad_norm": 2.5891127586364746, + "learning_rate": 4.993996273137223e-05, + "loss": 5.7607, + "step": 3711 + }, + { + "epoch": 0.022076315539061756, + "grad_norm": 2.3737518787384033, + "learning_rate": 4.993993037487501e-05, + "loss": 5.7825, + "step": 3712 + }, + { + "epoch": 0.022082262822342755, + "grad_norm": 2.421785831451416, + "learning_rate": 4.9939898009671524e-05, + "loss": 5.7143, + "step": 3713 + }, + { + "epoch": 0.02208821010562375, + "grad_norm": 2.4267804622650146, + "learning_rate": 4.9939865635761785e-05, + "loss": 5.8031, + "step": 3714 + }, + { + "epoch": 0.02209415738890475, + "grad_norm": 2.390333414077759, + "learning_rate": 4.993983325314579e-05, + "loss": 5.7985, + "step": 3715 + }, + { + "epoch": 0.022100104672185747, + "grad_norm": 2.2265970706939697, + "learning_rate": 4.993980086182356e-05, + "loss": 5.6261, + "step": 3716 + }, + { + "epoch": 0.022106051955466742, + "grad_norm": 2.3872458934783936, + "learning_rate": 4.99397684617951e-05, + "loss": 5.8185, + "step": 3717 + }, + { + "epoch": 0.02211199923874774, + "grad_norm": 2.077075958251953, + "learning_rate": 4.9939736053060425e-05, + "loss": 5.6252, + "step": 3718 + }, + { + "epoch": 0.022117946522028736, + "grad_norm": 2.0642287731170654, + "learning_rate": 4.993970363561954e-05, + "loss": 5.8034, + "step": 3719 + }, + { + "epoch": 0.022123893805309734, + "grad_norm": 3.5353951454162598, + "learning_rate": 4.9939671209472474e-05, + "loss": 6.7808, + "step": 3720 + }, + { + "epoch": 0.022129841088590733, + "grad_norm": 2.910531520843506, + "learning_rate": 4.9939638774619216e-05, + "loss": 5.9323, + "step": 3721 + }, + { + "epoch": 0.022135788371871728, + "grad_norm": 2.7450106143951416, + "learning_rate": 4.9939606331059794e-05, + "loss": 5.9926, + "step": 3722 + }, + { + "epoch": 0.022141735655152726, + "grad_norm": 2.7628188133239746, + "learning_rate": 4.993957387879421e-05, + "loss": 5.9129, + "step": 3723 + }, + { + "epoch": 0.022147682938433725, + "grad_norm": 2.6644890308380127, + "learning_rate": 4.9939541417822485e-05, + "loss": 5.7038, + "step": 3724 + }, + { + "epoch": 0.02215363022171472, + "grad_norm": 2.143744707107544, + "learning_rate": 4.993950894814461e-05, + "loss": 5.5821, + "step": 3725 + }, + { + "epoch": 0.02215957750499572, + "grad_norm": 2.1691160202026367, + "learning_rate": 4.993947646976063e-05, + "loss": 5.5929, + "step": 3726 + }, + { + "epoch": 0.022165524788276714, + "grad_norm": 2.1479709148406982, + "learning_rate": 4.993944398267052e-05, + "loss": 5.6653, + "step": 3727 + }, + { + "epoch": 0.022171472071557712, + "grad_norm": 2.7749600410461426, + "learning_rate": 4.993941148687431e-05, + "loss": 5.5682, + "step": 3728 + }, + { + "epoch": 0.02217741935483871, + "grad_norm": 2.668672561645508, + "learning_rate": 4.993937898237201e-05, + "loss": 5.5968, + "step": 3729 + }, + { + "epoch": 0.022183366638119706, + "grad_norm": 2.3903374671936035, + "learning_rate": 4.993934646916364e-05, + "loss": 5.7541, + "step": 3730 + }, + { + "epoch": 0.022189313921400704, + "grad_norm": 1.8555344343185425, + "learning_rate": 4.993931394724919e-05, + "loss": 5.5449, + "step": 3731 + }, + { + "epoch": 0.022195261204681703, + "grad_norm": 2.1140637397766113, + "learning_rate": 4.993928141662869e-05, + "loss": 5.8201, + "step": 3732 + }, + { + "epoch": 0.022201208487962698, + "grad_norm": 2.221573829650879, + "learning_rate": 4.993924887730213e-05, + "loss": 5.7583, + "step": 3733 + }, + { + "epoch": 0.022207155771243697, + "grad_norm": 2.0801634788513184, + "learning_rate": 4.993921632926956e-05, + "loss": 5.7083, + "step": 3734 + }, + { + "epoch": 0.02221310305452469, + "grad_norm": 2.0167016983032227, + "learning_rate": 4.993918377253095e-05, + "loss": 5.7798, + "step": 3735 + }, + { + "epoch": 0.02221905033780569, + "grad_norm": 2.104529619216919, + "learning_rate": 4.993915120708634e-05, + "loss": 5.7346, + "step": 3736 + }, + { + "epoch": 0.02222499762108669, + "grad_norm": 2.0807201862335205, + "learning_rate": 4.993911863293572e-05, + "loss": 5.7663, + "step": 3737 + }, + { + "epoch": 0.022230944904367684, + "grad_norm": 1.9223891496658325, + "learning_rate": 4.9939086050079115e-05, + "loss": 5.648, + "step": 3738 + }, + { + "epoch": 0.022236892187648682, + "grad_norm": 2.3831584453582764, + "learning_rate": 4.9939053458516535e-05, + "loss": 5.7988, + "step": 3739 + }, + { + "epoch": 0.02224283947092968, + "grad_norm": 2.433318853378296, + "learning_rate": 4.993902085824799e-05, + "loss": 5.7794, + "step": 3740 + }, + { + "epoch": 0.022248786754210676, + "grad_norm": 2.2488365173339844, + "learning_rate": 4.993898824927348e-05, + "loss": 5.7332, + "step": 3741 + }, + { + "epoch": 0.022254734037491675, + "grad_norm": 2.2924392223358154, + "learning_rate": 4.993895563159303e-05, + "loss": 5.8977, + "step": 3742 + }, + { + "epoch": 0.02226068132077267, + "grad_norm": 2.1601176261901855, + "learning_rate": 4.9938923005206664e-05, + "loss": 5.8588, + "step": 3743 + }, + { + "epoch": 0.02226662860405367, + "grad_norm": 2.256439447402954, + "learning_rate": 4.993889037011436e-05, + "loss": 5.6111, + "step": 3744 + }, + { + "epoch": 0.022272575887334667, + "grad_norm": 2.184950828552246, + "learning_rate": 4.993885772631615e-05, + "loss": 5.7544, + "step": 3745 + }, + { + "epoch": 0.022278523170615662, + "grad_norm": 2.250422716140747, + "learning_rate": 4.993882507381205e-05, + "loss": 5.6534, + "step": 3746 + }, + { + "epoch": 0.02228447045389666, + "grad_norm": 2.473811626434326, + "learning_rate": 4.9938792412602056e-05, + "loss": 5.5699, + "step": 3747 + }, + { + "epoch": 0.022290417737177656, + "grad_norm": 2.2859978675842285, + "learning_rate": 4.993875974268619e-05, + "loss": 5.8712, + "step": 3748 + }, + { + "epoch": 0.022296365020458654, + "grad_norm": 2.4002318382263184, + "learning_rate": 4.993872706406446e-05, + "loss": 5.8121, + "step": 3749 + }, + { + "epoch": 0.022302312303739653, + "grad_norm": 2.2692153453826904, + "learning_rate": 4.9938694376736884e-05, + "loss": 5.5516, + "step": 3750 + }, + { + "epoch": 0.022308259587020648, + "grad_norm": 2.1874892711639404, + "learning_rate": 4.9938661680703456e-05, + "loss": 5.8264, + "step": 3751 + }, + { + "epoch": 0.022314206870301646, + "grad_norm": 2.3802871704101562, + "learning_rate": 4.993862897596421e-05, + "loss": 5.6523, + "step": 3752 + }, + { + "epoch": 0.022320154153582645, + "grad_norm": 2.514646530151367, + "learning_rate": 4.9938596262519145e-05, + "loss": 5.5193, + "step": 3753 + }, + { + "epoch": 0.02232610143686364, + "grad_norm": 2.3175413608551025, + "learning_rate": 4.993856354036827e-05, + "loss": 5.5372, + "step": 3754 + }, + { + "epoch": 0.02233204872014464, + "grad_norm": 2.2071855068206787, + "learning_rate": 4.9938530809511595e-05, + "loss": 5.5002, + "step": 3755 + }, + { + "epoch": 0.022337996003425634, + "grad_norm": 2.046440839767456, + "learning_rate": 4.9938498069949144e-05, + "loss": 5.585, + "step": 3756 + }, + { + "epoch": 0.022343943286706632, + "grad_norm": 2.3971145153045654, + "learning_rate": 4.9938465321680915e-05, + "loss": 5.7858, + "step": 3757 + }, + { + "epoch": 0.02234989056998763, + "grad_norm": 2.462597131729126, + "learning_rate": 4.9938432564706936e-05, + "loss": 5.5606, + "step": 3758 + }, + { + "epoch": 0.022355837853268626, + "grad_norm": 2.3134138584136963, + "learning_rate": 4.99383997990272e-05, + "loss": 5.4587, + "step": 3759 + }, + { + "epoch": 0.022361785136549624, + "grad_norm": 2.137929916381836, + "learning_rate": 4.993836702464173e-05, + "loss": 5.4768, + "step": 3760 + }, + { + "epoch": 0.022367732419830623, + "grad_norm": 2.647691011428833, + "learning_rate": 4.993833424155053e-05, + "loss": 5.7902, + "step": 3761 + }, + { + "epoch": 0.022373679703111618, + "grad_norm": 2.535640239715576, + "learning_rate": 4.993830144975361e-05, + "loss": 5.8263, + "step": 3762 + }, + { + "epoch": 0.022379626986392617, + "grad_norm": 2.422997236251831, + "learning_rate": 4.9938268649251e-05, + "loss": 5.7751, + "step": 3763 + }, + { + "epoch": 0.02238557426967361, + "grad_norm": 2.6906728744506836, + "learning_rate": 4.9938235840042694e-05, + "loss": 5.5974, + "step": 3764 + }, + { + "epoch": 0.02239152155295461, + "grad_norm": 2.0284483432769775, + "learning_rate": 4.99382030221287e-05, + "loss": 5.6816, + "step": 3765 + }, + { + "epoch": 0.02239746883623561, + "grad_norm": 2.6392064094543457, + "learning_rate": 4.9938170195509035e-05, + "loss": 5.9052, + "step": 3766 + }, + { + "epoch": 0.022403416119516604, + "grad_norm": 2.6770617961883545, + "learning_rate": 4.993813736018372e-05, + "loss": 5.9041, + "step": 3767 + }, + { + "epoch": 0.022409363402797602, + "grad_norm": 2.5972392559051514, + "learning_rate": 4.993810451615276e-05, + "loss": 5.7834, + "step": 3768 + }, + { + "epoch": 0.0224153106860786, + "grad_norm": 2.0095736980438232, + "learning_rate": 4.993807166341616e-05, + "loss": 5.6074, + "step": 3769 + }, + { + "epoch": 0.022421257969359596, + "grad_norm": 2.412578582763672, + "learning_rate": 4.9938038801973945e-05, + "loss": 5.742, + "step": 3770 + }, + { + "epoch": 0.022427205252640595, + "grad_norm": 2.1285388469696045, + "learning_rate": 4.993800593182612e-05, + "loss": 5.7665, + "step": 3771 + }, + { + "epoch": 0.02243315253592159, + "grad_norm": 2.091252326965332, + "learning_rate": 4.993797305297268e-05, + "loss": 5.7165, + "step": 3772 + }, + { + "epoch": 0.022439099819202588, + "grad_norm": 2.5366342067718506, + "learning_rate": 4.993794016541367e-05, + "loss": 6.259, + "step": 3773 + }, + { + "epoch": 0.022445047102483587, + "grad_norm": 2.2637953758239746, + "learning_rate": 4.9937907269149063e-05, + "loss": 6.2132, + "step": 3774 + }, + { + "epoch": 0.022450994385764582, + "grad_norm": 2.570979595184326, + "learning_rate": 4.99378743641789e-05, + "loss": 5.9656, + "step": 3775 + }, + { + "epoch": 0.02245694166904558, + "grad_norm": 2.0587873458862305, + "learning_rate": 4.993784145050319e-05, + "loss": 5.7096, + "step": 3776 + }, + { + "epoch": 0.022462888952326576, + "grad_norm": 2.396812677383423, + "learning_rate": 4.993780852812192e-05, + "loss": 5.7258, + "step": 3777 + }, + { + "epoch": 0.022468836235607574, + "grad_norm": 2.081541061401367, + "learning_rate": 4.993777559703513e-05, + "loss": 5.6777, + "step": 3778 + }, + { + "epoch": 0.022474783518888573, + "grad_norm": 2.5242559909820557, + "learning_rate": 4.993774265724281e-05, + "loss": 5.961, + "step": 3779 + }, + { + "epoch": 0.022480730802169568, + "grad_norm": 2.4249329566955566, + "learning_rate": 4.993770970874499e-05, + "loss": 6.0494, + "step": 3780 + }, + { + "epoch": 0.022486678085450566, + "grad_norm": 2.7482552528381348, + "learning_rate": 4.993767675154169e-05, + "loss": 5.7579, + "step": 3781 + }, + { + "epoch": 0.022492625368731565, + "grad_norm": 4.115204811096191, + "learning_rate": 4.993764378563288e-05, + "loss": 6.3891, + "step": 3782 + }, + { + "epoch": 0.02249857265201256, + "grad_norm": 2.51346755027771, + "learning_rate": 4.99376108110186e-05, + "loss": 5.7982, + "step": 3783 + }, + { + "epoch": 0.02250451993529356, + "grad_norm": 2.2737278938293457, + "learning_rate": 4.993757782769887e-05, + "loss": 5.7576, + "step": 3784 + }, + { + "epoch": 0.022510467218574554, + "grad_norm": 2.2068402767181396, + "learning_rate": 4.9937544835673674e-05, + "loss": 5.9801, + "step": 3785 + }, + { + "epoch": 0.022516414501855552, + "grad_norm": 1.8548356294631958, + "learning_rate": 4.993751183494305e-05, + "loss": 6.2054, + "step": 3786 + }, + { + "epoch": 0.02252236178513655, + "grad_norm": 2.3499045372009277, + "learning_rate": 4.993747882550699e-05, + "loss": 6.0694, + "step": 3787 + }, + { + "epoch": 0.022528309068417546, + "grad_norm": 2.2253386974334717, + "learning_rate": 4.993744580736552e-05, + "loss": 5.709, + "step": 3788 + }, + { + "epoch": 0.022534256351698544, + "grad_norm": 2.1136696338653564, + "learning_rate": 4.993741278051864e-05, + "loss": 5.9546, + "step": 3789 + }, + { + "epoch": 0.022540203634979543, + "grad_norm": 1.8777605295181274, + "learning_rate": 4.9937379744966375e-05, + "loss": 5.7587, + "step": 3790 + }, + { + "epoch": 0.022546150918260538, + "grad_norm": 2.527571201324463, + "learning_rate": 4.9937346700708723e-05, + "loss": 5.0992, + "step": 3791 + }, + { + "epoch": 0.022552098201541537, + "grad_norm": 2.515805244445801, + "learning_rate": 4.99373136477457e-05, + "loss": 4.9766, + "step": 3792 + }, + { + "epoch": 0.02255804548482253, + "grad_norm": 2.442979574203491, + "learning_rate": 4.9937280586077315e-05, + "loss": 5.0981, + "step": 3793 + }, + { + "epoch": 0.02256399276810353, + "grad_norm": 2.575383424758911, + "learning_rate": 4.993724751570359e-05, + "loss": 5.0809, + "step": 3794 + }, + { + "epoch": 0.02256994005138453, + "grad_norm": 2.0855023860931396, + "learning_rate": 4.9937214436624524e-05, + "loss": 5.5744, + "step": 3795 + }, + { + "epoch": 0.022575887334665524, + "grad_norm": 2.237565040588379, + "learning_rate": 4.993718134884013e-05, + "loss": 5.6796, + "step": 3796 + }, + { + "epoch": 0.022581834617946522, + "grad_norm": 2.5895159244537354, + "learning_rate": 4.993714825235044e-05, + "loss": 5.2068, + "step": 3797 + }, + { + "epoch": 0.02258778190122752, + "grad_norm": 2.1277096271514893, + "learning_rate": 4.993711514715544e-05, + "loss": 5.5588, + "step": 3798 + }, + { + "epoch": 0.022593729184508516, + "grad_norm": 2.7074246406555176, + "learning_rate": 4.993708203325515e-05, + "loss": 5.0104, + "step": 3799 + }, + { + "epoch": 0.022599676467789515, + "grad_norm": 2.114569664001465, + "learning_rate": 4.993704891064958e-05, + "loss": 5.0453, + "step": 3800 + }, + { + "epoch": 0.02260562375107051, + "grad_norm": 2.4222404956817627, + "learning_rate": 4.9937015779338746e-05, + "loss": 5.3799, + "step": 3801 + }, + { + "epoch": 0.022611571034351508, + "grad_norm": 2.238755941390991, + "learning_rate": 4.993698263932266e-05, + "loss": 5.0075, + "step": 3802 + }, + { + "epoch": 0.022617518317632507, + "grad_norm": 2.0748255252838135, + "learning_rate": 4.993694949060133e-05, + "loss": 5.0007, + "step": 3803 + }, + { + "epoch": 0.022623465600913502, + "grad_norm": 2.1528635025024414, + "learning_rate": 4.993691633317477e-05, + "loss": 5.1048, + "step": 3804 + }, + { + "epoch": 0.0226294128841945, + "grad_norm": 2.0237200260162354, + "learning_rate": 4.993688316704298e-05, + "loss": 5.1465, + "step": 3805 + }, + { + "epoch": 0.022635360167475495, + "grad_norm": 2.2698304653167725, + "learning_rate": 4.993684999220599e-05, + "loss": 4.9642, + "step": 3806 + }, + { + "epoch": 0.022641307450756494, + "grad_norm": 2.7863757610321045, + "learning_rate": 4.993681680866381e-05, + "loss": 5.6277, + "step": 3807 + }, + { + "epoch": 0.022647254734037493, + "grad_norm": 2.394087553024292, + "learning_rate": 4.9936783616416436e-05, + "loss": 6.0895, + "step": 3808 + }, + { + "epoch": 0.022653202017318488, + "grad_norm": 2.8036317825317383, + "learning_rate": 4.993675041546389e-05, + "loss": 6.2002, + "step": 3809 + }, + { + "epoch": 0.022659149300599486, + "grad_norm": 2.4970054626464844, + "learning_rate": 4.993671720580618e-05, + "loss": 5.5114, + "step": 3810 + }, + { + "epoch": 0.022665096583880485, + "grad_norm": 3.2434241771698, + "learning_rate": 4.993668398744332e-05, + "loss": 5.0366, + "step": 3811 + }, + { + "epoch": 0.02267104386716148, + "grad_norm": 2.707104206085205, + "learning_rate": 4.9936650760375326e-05, + "loss": 5.5132, + "step": 3812 + }, + { + "epoch": 0.02267699115044248, + "grad_norm": 2.540231466293335, + "learning_rate": 4.9936617524602204e-05, + "loss": 5.8026, + "step": 3813 + }, + { + "epoch": 0.022682938433723474, + "grad_norm": 2.8549184799194336, + "learning_rate": 4.993658428012397e-05, + "loss": 6.0854, + "step": 3814 + }, + { + "epoch": 0.022688885717004472, + "grad_norm": 2.5972952842712402, + "learning_rate": 4.993655102694062e-05, + "loss": 5.8055, + "step": 3815 + }, + { + "epoch": 0.02269483300028547, + "grad_norm": 3.1625113487243652, + "learning_rate": 4.9936517765052184e-05, + "loss": 5.9683, + "step": 3816 + }, + { + "epoch": 0.022700780283566466, + "grad_norm": 3.239820718765259, + "learning_rate": 4.993648449445867e-05, + "loss": 5.9725, + "step": 3817 + }, + { + "epoch": 0.022706727566847464, + "grad_norm": 2.9632809162139893, + "learning_rate": 4.993645121516008e-05, + "loss": 5.9767, + "step": 3818 + }, + { + "epoch": 0.022712674850128463, + "grad_norm": 2.7486021518707275, + "learning_rate": 4.9936417927156435e-05, + "loss": 6.3471, + "step": 3819 + }, + { + "epoch": 0.022718622133409458, + "grad_norm": 3.8044490814208984, + "learning_rate": 4.993638463044775e-05, + "loss": 6.1275, + "step": 3820 + }, + { + "epoch": 0.022724569416690456, + "grad_norm": 4.851193428039551, + "learning_rate": 4.9936351325034024e-05, + "loss": 5.6658, + "step": 3821 + }, + { + "epoch": 0.02273051669997145, + "grad_norm": 3.1302716732025146, + "learning_rate": 4.993631801091528e-05, + "loss": 5.5256, + "step": 3822 + }, + { + "epoch": 0.02273646398325245, + "grad_norm": 5.310885906219482, + "learning_rate": 4.9936284688091526e-05, + "loss": 5.4771, + "step": 3823 + }, + { + "epoch": 0.02274241126653345, + "grad_norm": 5.493198394775391, + "learning_rate": 4.9936251356562765e-05, + "loss": 6.0993, + "step": 3824 + }, + { + "epoch": 0.022748358549814444, + "grad_norm": 3.5346286296844482, + "learning_rate": 4.993621801632902e-05, + "loss": 6.6862, + "step": 3825 + }, + { + "epoch": 0.022754305833095442, + "grad_norm": 4.550736904144287, + "learning_rate": 4.9936184667390304e-05, + "loss": 6.5658, + "step": 3826 + }, + { + "epoch": 0.02276025311637644, + "grad_norm": 3.3957576751708984, + "learning_rate": 4.993615130974662e-05, + "loss": 6.0596, + "step": 3827 + }, + { + "epoch": 0.022766200399657436, + "grad_norm": 2.614089012145996, + "learning_rate": 4.993611794339798e-05, + "loss": 6.77, + "step": 3828 + }, + { + "epoch": 0.022772147682938434, + "grad_norm": 3.712106704711914, + "learning_rate": 4.99360845683444e-05, + "loss": 6.4084, + "step": 3829 + }, + { + "epoch": 0.02277809496621943, + "grad_norm": 3.7331995964050293, + "learning_rate": 4.99360511845859e-05, + "loss": 6.2627, + "step": 3830 + }, + { + "epoch": 0.022784042249500428, + "grad_norm": 3.8898067474365234, + "learning_rate": 4.993601779212247e-05, + "loss": 6.6476, + "step": 3831 + }, + { + "epoch": 0.022789989532781427, + "grad_norm": 2.829078435897827, + "learning_rate": 4.9935984390954136e-05, + "loss": 6.2307, + "step": 3832 + }, + { + "epoch": 0.022795936816062422, + "grad_norm": 3.467954635620117, + "learning_rate": 4.9935950981080906e-05, + "loss": 6.5283, + "step": 3833 + }, + { + "epoch": 0.02280188409934342, + "grad_norm": 2.317840099334717, + "learning_rate": 4.99359175625028e-05, + "loss": 6.4549, + "step": 3834 + }, + { + "epoch": 0.02280783138262442, + "grad_norm": 2.7261998653411865, + "learning_rate": 4.9935884135219825e-05, + "loss": 6.2049, + "step": 3835 + }, + { + "epoch": 0.022813778665905414, + "grad_norm": 2.623098373413086, + "learning_rate": 4.993585069923198e-05, + "loss": 6.3847, + "step": 3836 + }, + { + "epoch": 0.022819725949186413, + "grad_norm": 2.4825377464294434, + "learning_rate": 4.993581725453929e-05, + "loss": 6.3532, + "step": 3837 + }, + { + "epoch": 0.022825673232467408, + "grad_norm": 2.278151750564575, + "learning_rate": 4.993578380114176e-05, + "loss": 5.8885, + "step": 3838 + }, + { + "epoch": 0.022831620515748406, + "grad_norm": 2.045839548110962, + "learning_rate": 4.9935750339039425e-05, + "loss": 6.6852, + "step": 3839 + }, + { + "epoch": 0.022837567799029405, + "grad_norm": 2.4009597301483154, + "learning_rate": 4.993571686823226e-05, + "loss": 6.1676, + "step": 3840 + }, + { + "epoch": 0.0228435150823104, + "grad_norm": 2.759819507598877, + "learning_rate": 4.9935683388720296e-05, + "loss": 6.3913, + "step": 3841 + }, + { + "epoch": 0.0228494623655914, + "grad_norm": 2.798785924911499, + "learning_rate": 4.9935649900503546e-05, + "loss": 6.8169, + "step": 3842 + }, + { + "epoch": 0.022855409648872393, + "grad_norm": 2.389890432357788, + "learning_rate": 4.9935616403582015e-05, + "loss": 6.7506, + "step": 3843 + }, + { + "epoch": 0.022861356932153392, + "grad_norm": 2.882474184036255, + "learning_rate": 4.9935582897955715e-05, + "loss": 6.2458, + "step": 3844 + }, + { + "epoch": 0.02286730421543439, + "grad_norm": 2.2487478256225586, + "learning_rate": 4.993554938362467e-05, + "loss": 6.7296, + "step": 3845 + }, + { + "epoch": 0.022873251498715386, + "grad_norm": 1.9563521146774292, + "learning_rate": 4.993551586058888e-05, + "loss": 6.6878, + "step": 3846 + }, + { + "epoch": 0.022879198781996384, + "grad_norm": 7.555780410766602, + "learning_rate": 4.993548232884835e-05, + "loss": 6.3309, + "step": 3847 + }, + { + "epoch": 0.022885146065277383, + "grad_norm": 2.2573931217193604, + "learning_rate": 4.99354487884031e-05, + "loss": 6.3384, + "step": 3848 + }, + { + "epoch": 0.022891093348558378, + "grad_norm": 2.063267946243286, + "learning_rate": 4.993541523925316e-05, + "loss": 6.2342, + "step": 3849 + }, + { + "epoch": 0.022897040631839376, + "grad_norm": 2.1032445430755615, + "learning_rate": 4.9935381681398505e-05, + "loss": 6.5458, + "step": 3850 + }, + { + "epoch": 0.02290298791512037, + "grad_norm": 2.233400583267212, + "learning_rate": 4.9935348114839176e-05, + "loss": 6.46, + "step": 3851 + }, + { + "epoch": 0.02290893519840137, + "grad_norm": 2.069182872772217, + "learning_rate": 4.9935314539575174e-05, + "loss": 6.4829, + "step": 3852 + }, + { + "epoch": 0.02291488248168237, + "grad_norm": 1.9986059665679932, + "learning_rate": 4.993528095560651e-05, + "loss": 6.4651, + "step": 3853 + }, + { + "epoch": 0.022920829764963364, + "grad_norm": 2.0529284477233887, + "learning_rate": 4.99352473629332e-05, + "loss": 6.1151, + "step": 3854 + }, + { + "epoch": 0.022926777048244362, + "grad_norm": 1.9643630981445312, + "learning_rate": 4.993521376155525e-05, + "loss": 5.991, + "step": 3855 + }, + { + "epoch": 0.02293272433152536, + "grad_norm": 2.2183501720428467, + "learning_rate": 4.9935180151472674e-05, + "loss": 6.8568, + "step": 3856 + }, + { + "epoch": 0.022938671614806356, + "grad_norm": 2.2095682621002197, + "learning_rate": 4.993514653268548e-05, + "loss": 6.8145, + "step": 3857 + }, + { + "epoch": 0.022944618898087354, + "grad_norm": 2.194451332092285, + "learning_rate": 4.9935112905193694e-05, + "loss": 6.4781, + "step": 3858 + }, + { + "epoch": 0.02295056618136835, + "grad_norm": 2.2242066860198975, + "learning_rate": 4.9935079268997306e-05, + "loss": 6.0535, + "step": 3859 + }, + { + "epoch": 0.022956513464649348, + "grad_norm": 2.336190938949585, + "learning_rate": 4.9935045624096354e-05, + "loss": 6.2453, + "step": 3860 + }, + { + "epoch": 0.022962460747930347, + "grad_norm": 1.9997279644012451, + "learning_rate": 4.9935011970490824e-05, + "loss": 6.3852, + "step": 3861 + }, + { + "epoch": 0.02296840803121134, + "grad_norm": 2.9107778072357178, + "learning_rate": 4.993497830818074e-05, + "loss": 6.0891, + "step": 3862 + }, + { + "epoch": 0.02297435531449234, + "grad_norm": 2.1357171535491943, + "learning_rate": 4.993494463716612e-05, + "loss": 6.5111, + "step": 3863 + }, + { + "epoch": 0.02298030259777334, + "grad_norm": 2.0228497982025146, + "learning_rate": 4.9934910957446954e-05, + "loss": 6.6009, + "step": 3864 + }, + { + "epoch": 0.022986249881054334, + "grad_norm": 2.8057942390441895, + "learning_rate": 4.993487726902328e-05, + "loss": 6.414, + "step": 3865 + }, + { + "epoch": 0.022992197164335332, + "grad_norm": 3.0660998821258545, + "learning_rate": 4.99348435718951e-05, + "loss": 6.3673, + "step": 3866 + }, + { + "epoch": 0.022998144447616328, + "grad_norm": 2.2440497875213623, + "learning_rate": 4.9934809866062416e-05, + "loss": 6.1793, + "step": 3867 + }, + { + "epoch": 0.023004091730897326, + "grad_norm": 2.342358350753784, + "learning_rate": 4.993477615152525e-05, + "loss": 6.5279, + "step": 3868 + }, + { + "epoch": 0.023010039014178325, + "grad_norm": 1.9231956005096436, + "learning_rate": 4.993474242828361e-05, + "loss": 6.4975, + "step": 3869 + }, + { + "epoch": 0.02301598629745932, + "grad_norm": 2.503028631210327, + "learning_rate": 4.9934708696337516e-05, + "loss": 6.5261, + "step": 3870 + }, + { + "epoch": 0.02302193358074032, + "grad_norm": 2.2343928813934326, + "learning_rate": 4.993467495568697e-05, + "loss": 6.0525, + "step": 3871 + }, + { + "epoch": 0.023027880864021313, + "grad_norm": 2.851964235305786, + "learning_rate": 4.993464120633198e-05, + "loss": 6.1271, + "step": 3872 + }, + { + "epoch": 0.023033828147302312, + "grad_norm": 2.580017328262329, + "learning_rate": 4.993460744827257e-05, + "loss": 6.2018, + "step": 3873 + }, + { + "epoch": 0.02303977543058331, + "grad_norm": 2.227879047393799, + "learning_rate": 4.9934573681508744e-05, + "loss": 6.0177, + "step": 3874 + }, + { + "epoch": 0.023045722713864306, + "grad_norm": 2.696531295776367, + "learning_rate": 4.993453990604051e-05, + "loss": 6.627, + "step": 3875 + }, + { + "epoch": 0.023051669997145304, + "grad_norm": 2.3439393043518066, + "learning_rate": 4.99345061218679e-05, + "loss": 6.5388, + "step": 3876 + }, + { + "epoch": 0.023057617280426303, + "grad_norm": 2.5400748252868652, + "learning_rate": 4.99344723289909e-05, + "loss": 5.9162, + "step": 3877 + }, + { + "epoch": 0.023063564563707298, + "grad_norm": 2.658193588256836, + "learning_rate": 4.9934438527409535e-05, + "loss": 5.6645, + "step": 3878 + }, + { + "epoch": 0.023069511846988296, + "grad_norm": 2.3102848529815674, + "learning_rate": 4.9934404717123814e-05, + "loss": 5.9969, + "step": 3879 + }, + { + "epoch": 0.02307545913026929, + "grad_norm": 2.6107916831970215, + "learning_rate": 4.993437089813376e-05, + "loss": 6.1776, + "step": 3880 + }, + { + "epoch": 0.02308140641355029, + "grad_norm": 2.6275434494018555, + "learning_rate": 4.993433707043937e-05, + "loss": 6.2563, + "step": 3881 + }, + { + "epoch": 0.02308735369683129, + "grad_norm": 2.8595218658447266, + "learning_rate": 4.993430323404066e-05, + "loss": 5.9371, + "step": 3882 + }, + { + "epoch": 0.023093300980112284, + "grad_norm": 2.2947659492492676, + "learning_rate": 4.993426938893764e-05, + "loss": 5.7263, + "step": 3883 + }, + { + "epoch": 0.023099248263393282, + "grad_norm": 3.3769729137420654, + "learning_rate": 4.9934235535130326e-05, + "loss": 6.2706, + "step": 3884 + }, + { + "epoch": 0.02310519554667428, + "grad_norm": 2.792043447494507, + "learning_rate": 4.9934201672618716e-05, + "loss": 5.9264, + "step": 3885 + }, + { + "epoch": 0.023111142829955276, + "grad_norm": 2.592167615890503, + "learning_rate": 4.993416780140285e-05, + "loss": 6.4031, + "step": 3886 + }, + { + "epoch": 0.023117090113236274, + "grad_norm": 2.429898977279663, + "learning_rate": 4.9934133921482716e-05, + "loss": 6.4609, + "step": 3887 + }, + { + "epoch": 0.02312303739651727, + "grad_norm": 2.1771554946899414, + "learning_rate": 4.993410003285834e-05, + "loss": 6.2873, + "step": 3888 + }, + { + "epoch": 0.023128984679798268, + "grad_norm": 2.7799339294433594, + "learning_rate": 4.9934066135529724e-05, + "loss": 5.7405, + "step": 3889 + }, + { + "epoch": 0.023134931963079267, + "grad_norm": 2.626492977142334, + "learning_rate": 4.993403222949688e-05, + "loss": 5.783, + "step": 3890 + }, + { + "epoch": 0.02314087924636026, + "grad_norm": 2.837663412094116, + "learning_rate": 4.993399831475982e-05, + "loss": 5.8039, + "step": 3891 + }, + { + "epoch": 0.02314682652964126, + "grad_norm": 2.68230938911438, + "learning_rate": 4.9933964391318564e-05, + "loss": 5.6587, + "step": 3892 + }, + { + "epoch": 0.02315277381292226, + "grad_norm": 3.2064061164855957, + "learning_rate": 4.993393045917312e-05, + "loss": 5.9516, + "step": 3893 + }, + { + "epoch": 0.023158721096203254, + "grad_norm": 3.5179402828216553, + "learning_rate": 4.99338965183235e-05, + "loss": 5.7925, + "step": 3894 + }, + { + "epoch": 0.023164668379484252, + "grad_norm": 2.9261434078216553, + "learning_rate": 4.993386256876971e-05, + "loss": 5.8677, + "step": 3895 + }, + { + "epoch": 0.023170615662765248, + "grad_norm": 3.092033624649048, + "learning_rate": 4.9933828610511766e-05, + "loss": 5.6248, + "step": 3896 + }, + { + "epoch": 0.023176562946046246, + "grad_norm": 2.7650182247161865, + "learning_rate": 4.9933794643549683e-05, + "loss": 5.7371, + "step": 3897 + }, + { + "epoch": 0.023182510229327245, + "grad_norm": 2.402839422225952, + "learning_rate": 4.993376066788347e-05, + "loss": 5.4802, + "step": 3898 + }, + { + "epoch": 0.02318845751260824, + "grad_norm": 2.606062889099121, + "learning_rate": 4.993372668351314e-05, + "loss": 5.5766, + "step": 3899 + }, + { + "epoch": 0.023194404795889238, + "grad_norm": 2.2177329063415527, + "learning_rate": 4.99336926904387e-05, + "loss": 5.5744, + "step": 3900 + }, + { + "epoch": 0.023200352079170233, + "grad_norm": 2.6953063011169434, + "learning_rate": 4.9933658688660166e-05, + "loss": 5.6414, + "step": 3901 + }, + { + "epoch": 0.023206299362451232, + "grad_norm": 2.90512752532959, + "learning_rate": 4.993362467817755e-05, + "loss": 5.5445, + "step": 3902 + }, + { + "epoch": 0.02321224664573223, + "grad_norm": 3.724168062210083, + "learning_rate": 4.993359065899086e-05, + "loss": 5.7733, + "step": 3903 + }, + { + "epoch": 0.023218193929013226, + "grad_norm": 2.9355592727661133, + "learning_rate": 4.993355663110012e-05, + "loss": 5.579, + "step": 3904 + }, + { + "epoch": 0.023224141212294224, + "grad_norm": 2.7822163105010986, + "learning_rate": 4.993352259450532e-05, + "loss": 5.5105, + "step": 3905 + }, + { + "epoch": 0.023230088495575223, + "grad_norm": 3.672539710998535, + "learning_rate": 4.99334885492065e-05, + "loss": 6.3865, + "step": 3906 + }, + { + "epoch": 0.023236035778856218, + "grad_norm": 2.26755952835083, + "learning_rate": 4.993345449520364e-05, + "loss": 5.5472, + "step": 3907 + }, + { + "epoch": 0.023241983062137216, + "grad_norm": 2.8935770988464355, + "learning_rate": 4.993342043249678e-05, + "loss": 5.5948, + "step": 3908 + }, + { + "epoch": 0.02324793034541821, + "grad_norm": 3.077798366546631, + "learning_rate": 4.9933386361085924e-05, + "loss": 5.288, + "step": 3909 + }, + { + "epoch": 0.02325387762869921, + "grad_norm": 2.479198694229126, + "learning_rate": 4.993335228097107e-05, + "loss": 5.3743, + "step": 3910 + }, + { + "epoch": 0.02325982491198021, + "grad_norm": 2.429049015045166, + "learning_rate": 4.9933318192152244e-05, + "loss": 5.6709, + "step": 3911 + }, + { + "epoch": 0.023265772195261204, + "grad_norm": 2.4515016078948975, + "learning_rate": 4.993328409462945e-05, + "loss": 5.4946, + "step": 3912 + }, + { + "epoch": 0.023271719478542202, + "grad_norm": 2.3859386444091797, + "learning_rate": 4.993324998840271e-05, + "loss": 5.5947, + "step": 3913 + }, + { + "epoch": 0.0232776667618232, + "grad_norm": 2.746438503265381, + "learning_rate": 4.993321587347203e-05, + "loss": 5.6743, + "step": 3914 + }, + { + "epoch": 0.023283614045104196, + "grad_norm": 2.416118621826172, + "learning_rate": 4.993318174983742e-05, + "loss": 5.7073, + "step": 3915 + }, + { + "epoch": 0.023289561328385194, + "grad_norm": 2.3427727222442627, + "learning_rate": 4.99331476174989e-05, + "loss": 5.5933, + "step": 3916 + }, + { + "epoch": 0.02329550861166619, + "grad_norm": 2.2179009914398193, + "learning_rate": 4.993311347645647e-05, + "loss": 5.7726, + "step": 3917 + }, + { + "epoch": 0.023301455894947188, + "grad_norm": 2.732923984527588, + "learning_rate": 4.993307932671014e-05, + "loss": 5.5783, + "step": 3918 + }, + { + "epoch": 0.023307403178228187, + "grad_norm": 2.5090553760528564, + "learning_rate": 4.993304516825994e-05, + "loss": 5.6598, + "step": 3919 + }, + { + "epoch": 0.02331335046150918, + "grad_norm": 2.690276622772217, + "learning_rate": 4.993301100110587e-05, + "loss": 5.9688, + "step": 3920 + }, + { + "epoch": 0.02331929774479018, + "grad_norm": 2.559215784072876, + "learning_rate": 4.993297682524794e-05, + "loss": 6.3315, + "step": 3921 + }, + { + "epoch": 0.02332524502807118, + "grad_norm": 2.2800240516662598, + "learning_rate": 4.993294264068617e-05, + "loss": 6.2787, + "step": 3922 + }, + { + "epoch": 0.023331192311352174, + "grad_norm": 2.478898525238037, + "learning_rate": 4.993290844742057e-05, + "loss": 6.1145, + "step": 3923 + }, + { + "epoch": 0.023337139594633172, + "grad_norm": 2.4902184009552, + "learning_rate": 4.993287424545115e-05, + "loss": 6.0665, + "step": 3924 + }, + { + "epoch": 0.023343086877914167, + "grad_norm": 2.4157116413116455, + "learning_rate": 4.9932840034777906e-05, + "loss": 6.1697, + "step": 3925 + }, + { + "epoch": 0.023349034161195166, + "grad_norm": 2.340575933456421, + "learning_rate": 4.993280581540087e-05, + "loss": 6.1121, + "step": 3926 + }, + { + "epoch": 0.023354981444476165, + "grad_norm": 2.586881160736084, + "learning_rate": 4.993277158732006e-05, + "loss": 6.1792, + "step": 3927 + }, + { + "epoch": 0.02336092872775716, + "grad_norm": 2.448880910873413, + "learning_rate": 4.9932737350535476e-05, + "loss": 6.084, + "step": 3928 + }, + { + "epoch": 0.023366876011038158, + "grad_norm": 2.525082588195801, + "learning_rate": 4.993270310504712e-05, + "loss": 5.6726, + "step": 3929 + }, + { + "epoch": 0.023372823294319153, + "grad_norm": 2.310445547103882, + "learning_rate": 4.993266885085503e-05, + "loss": 5.9496, + "step": 3930 + }, + { + "epoch": 0.023378770577600152, + "grad_norm": 2.275416612625122, + "learning_rate": 4.993263458795918e-05, + "loss": 6.0042, + "step": 3931 + }, + { + "epoch": 0.02338471786088115, + "grad_norm": 2.481973648071289, + "learning_rate": 4.993260031635963e-05, + "loss": 5.6177, + "step": 3932 + }, + { + "epoch": 0.023390665144162145, + "grad_norm": 2.439544677734375, + "learning_rate": 4.993256603605635e-05, + "loss": 5.9745, + "step": 3933 + }, + { + "epoch": 0.023396612427443144, + "grad_norm": 2.1909360885620117, + "learning_rate": 4.993253174704937e-05, + "loss": 5.9966, + "step": 3934 + }, + { + "epoch": 0.023402559710724143, + "grad_norm": 2.1893911361694336, + "learning_rate": 4.993249744933871e-05, + "loss": 6.0643, + "step": 3935 + }, + { + "epoch": 0.023408506994005138, + "grad_norm": 3.2023842334747314, + "learning_rate": 4.993246314292437e-05, + "loss": 6.2284, + "step": 3936 + }, + { + "epoch": 0.023414454277286136, + "grad_norm": 2.980842113494873, + "learning_rate": 4.9932428827806356e-05, + "loss": 6.2359, + "step": 3937 + }, + { + "epoch": 0.02342040156056713, + "grad_norm": 2.6659433841705322, + "learning_rate": 4.99323945039847e-05, + "loss": 6.2901, + "step": 3938 + }, + { + "epoch": 0.02342634884384813, + "grad_norm": 2.2173492908477783, + "learning_rate": 4.993236017145939e-05, + "loss": 5.8157, + "step": 3939 + }, + { + "epoch": 0.02343229612712913, + "grad_norm": 2.592771530151367, + "learning_rate": 4.993232583023046e-05, + "loss": 5.7747, + "step": 3940 + }, + { + "epoch": 0.023438243410410124, + "grad_norm": 2.328951835632324, + "learning_rate": 4.9932291480297915e-05, + "loss": 5.7367, + "step": 3941 + }, + { + "epoch": 0.023444190693691122, + "grad_norm": 2.3135616779327393, + "learning_rate": 4.993225712166176e-05, + "loss": 6.0592, + "step": 3942 + }, + { + "epoch": 0.02345013797697212, + "grad_norm": 2.49661922454834, + "learning_rate": 4.993222275432201e-05, + "loss": 5.9737, + "step": 3943 + }, + { + "epoch": 0.023456085260253116, + "grad_norm": 2.6462106704711914, + "learning_rate": 4.9932188378278683e-05, + "loss": 5.7053, + "step": 3944 + }, + { + "epoch": 0.023462032543534114, + "grad_norm": 2.102663516998291, + "learning_rate": 4.993215399353178e-05, + "loss": 5.9006, + "step": 3945 + }, + { + "epoch": 0.02346797982681511, + "grad_norm": 2.474500894546509, + "learning_rate": 4.9932119600081326e-05, + "loss": 6.092, + "step": 3946 + }, + { + "epoch": 0.023473927110096108, + "grad_norm": 2.6023428440093994, + "learning_rate": 4.993208519792732e-05, + "loss": 5.9045, + "step": 3947 + }, + { + "epoch": 0.023479874393377106, + "grad_norm": 2.76432466506958, + "learning_rate": 4.99320507870698e-05, + "loss": 5.8178, + "step": 3948 + }, + { + "epoch": 0.0234858216766581, + "grad_norm": 2.250816822052002, + "learning_rate": 4.993201636750874e-05, + "loss": 5.9091, + "step": 3949 + }, + { + "epoch": 0.0234917689599391, + "grad_norm": 2.1984071731567383, + "learning_rate": 4.993198193924417e-05, + "loss": 5.8804, + "step": 3950 + }, + { + "epoch": 0.0234977162432201, + "grad_norm": 2.5217959880828857, + "learning_rate": 4.993194750227611e-05, + "loss": 5.9879, + "step": 3951 + }, + { + "epoch": 0.023503663526501094, + "grad_norm": 2.080110788345337, + "learning_rate": 4.993191305660456e-05, + "loss": 5.6352, + "step": 3952 + }, + { + "epoch": 0.023509610809782092, + "grad_norm": 2.637500286102295, + "learning_rate": 4.9931878602229545e-05, + "loss": 5.7924, + "step": 3953 + }, + { + "epoch": 0.023515558093063087, + "grad_norm": 2.660531759262085, + "learning_rate": 4.9931844139151056e-05, + "loss": 6.1936, + "step": 3954 + }, + { + "epoch": 0.023521505376344086, + "grad_norm": 2.423699378967285, + "learning_rate": 4.993180966736913e-05, + "loss": 5.8974, + "step": 3955 + }, + { + "epoch": 0.023527452659625085, + "grad_norm": 2.581876277923584, + "learning_rate": 4.993177518688375e-05, + "loss": 5.833, + "step": 3956 + }, + { + "epoch": 0.02353339994290608, + "grad_norm": 2.586538076400757, + "learning_rate": 4.9931740697694965e-05, + "loss": 5.9649, + "step": 3957 + }, + { + "epoch": 0.023539347226187078, + "grad_norm": 2.5123441219329834, + "learning_rate": 4.993170619980276e-05, + "loss": 6.1251, + "step": 3958 + }, + { + "epoch": 0.023545294509468077, + "grad_norm": 3.076904535293579, + "learning_rate": 4.993167169320715e-05, + "loss": 5.9559, + "step": 3959 + }, + { + "epoch": 0.023551241792749072, + "grad_norm": 2.572312593460083, + "learning_rate": 4.9931637177908153e-05, + "loss": 6.0291, + "step": 3960 + }, + { + "epoch": 0.02355718907603007, + "grad_norm": 1.9910492897033691, + "learning_rate": 4.9931602653905776e-05, + "loss": 5.8413, + "step": 3961 + }, + { + "epoch": 0.023563136359311065, + "grad_norm": 2.530710458755493, + "learning_rate": 4.993156812120004e-05, + "loss": 6.1217, + "step": 3962 + }, + { + "epoch": 0.023569083642592064, + "grad_norm": 2.3089046478271484, + "learning_rate": 4.993153357979095e-05, + "loss": 5.822, + "step": 3963 + }, + { + "epoch": 0.023575030925873063, + "grad_norm": 2.8980624675750732, + "learning_rate": 4.993149902967852e-05, + "loss": 6.3906, + "step": 3964 + }, + { + "epoch": 0.023580978209154058, + "grad_norm": 2.2176012992858887, + "learning_rate": 4.993146447086275e-05, + "loss": 5.9259, + "step": 3965 + }, + { + "epoch": 0.023586925492435056, + "grad_norm": 2.01096773147583, + "learning_rate": 4.993142990334367e-05, + "loss": 6.3141, + "step": 3966 + }, + { + "epoch": 0.02359287277571605, + "grad_norm": 3.4096288681030273, + "learning_rate": 4.993139532712129e-05, + "loss": 6.3165, + "step": 3967 + }, + { + "epoch": 0.02359882005899705, + "grad_norm": 2.20595645904541, + "learning_rate": 4.9931360742195623e-05, + "loss": 6.016, + "step": 3968 + }, + { + "epoch": 0.02360476734227805, + "grad_norm": 3.543301820755005, + "learning_rate": 4.993132614856666e-05, + "loss": 5.722, + "step": 3969 + }, + { + "epoch": 0.023610714625559043, + "grad_norm": 2.82092547416687, + "learning_rate": 4.993129154623444e-05, + "loss": 5.8217, + "step": 3970 + }, + { + "epoch": 0.023616661908840042, + "grad_norm": 2.4585440158843994, + "learning_rate": 4.9931256935198954e-05, + "loss": 6.3298, + "step": 3971 + }, + { + "epoch": 0.02362260919212104, + "grad_norm": 2.104340076446533, + "learning_rate": 4.993122231546024e-05, + "loss": 5.9174, + "step": 3972 + }, + { + "epoch": 0.023628556475402036, + "grad_norm": 2.5130183696746826, + "learning_rate": 4.993118768701828e-05, + "loss": 6.3075, + "step": 3973 + }, + { + "epoch": 0.023634503758683034, + "grad_norm": 2.4567196369171143, + "learning_rate": 4.99311530498731e-05, + "loss": 6.0088, + "step": 3974 + }, + { + "epoch": 0.02364045104196403, + "grad_norm": 2.5174858570098877, + "learning_rate": 4.993111840402471e-05, + "loss": 6.6739, + "step": 3975 + }, + { + "epoch": 0.023646398325245028, + "grad_norm": 2.0032241344451904, + "learning_rate": 4.9931083749473136e-05, + "loss": 5.7052, + "step": 3976 + }, + { + "epoch": 0.023652345608526026, + "grad_norm": 2.9536757469177246, + "learning_rate": 4.993104908621837e-05, + "loss": 5.415, + "step": 3977 + }, + { + "epoch": 0.02365829289180702, + "grad_norm": 2.6650888919830322, + "learning_rate": 4.9931014414260435e-05, + "loss": 5.4333, + "step": 3978 + }, + { + "epoch": 0.02366424017508802, + "grad_norm": 2.3574490547180176, + "learning_rate": 4.9930979733599334e-05, + "loss": 5.5802, + "step": 3979 + }, + { + "epoch": 0.02367018745836902, + "grad_norm": 2.855534791946411, + "learning_rate": 4.99309450442351e-05, + "loss": 5.5131, + "step": 3980 + }, + { + "epoch": 0.023676134741650014, + "grad_norm": 2.430943727493286, + "learning_rate": 4.993091034616772e-05, + "loss": 6.2497, + "step": 3981 + }, + { + "epoch": 0.023682082024931012, + "grad_norm": 2.1671106815338135, + "learning_rate": 4.993087563939722e-05, + "loss": 5.9994, + "step": 3982 + }, + { + "epoch": 0.023688029308212007, + "grad_norm": 2.3268723487854004, + "learning_rate": 4.9930840923923606e-05, + "loss": 5.4779, + "step": 3983 + }, + { + "epoch": 0.023693976591493006, + "grad_norm": 2.3953616619110107, + "learning_rate": 4.993080619974689e-05, + "loss": 5.4044, + "step": 3984 + }, + { + "epoch": 0.023699923874774004, + "grad_norm": 2.043724775314331, + "learning_rate": 4.993077146686709e-05, + "loss": 5.6252, + "step": 3985 + }, + { + "epoch": 0.023705871158055, + "grad_norm": 2.5629520416259766, + "learning_rate": 4.9930736725284224e-05, + "loss": 5.1765, + "step": 3986 + }, + { + "epoch": 0.023711818441335998, + "grad_norm": 2.2148349285125732, + "learning_rate": 4.993070197499828e-05, + "loss": 5.5452, + "step": 3987 + }, + { + "epoch": 0.023717765724616997, + "grad_norm": 2.3913650512695312, + "learning_rate": 4.9930667216009295e-05, + "loss": 6.0882, + "step": 3988 + }, + { + "epoch": 0.02372371300789799, + "grad_norm": 2.619607925415039, + "learning_rate": 4.993063244831727e-05, + "loss": 6.4482, + "step": 3989 + }, + { + "epoch": 0.02372966029117899, + "grad_norm": 2.0585055351257324, + "learning_rate": 4.993059767192222e-05, + "loss": 6.0467, + "step": 3990 + }, + { + "epoch": 0.023735607574459985, + "grad_norm": 2.3380227088928223, + "learning_rate": 4.993056288682416e-05, + "loss": 5.9382, + "step": 3991 + }, + { + "epoch": 0.023741554857740984, + "grad_norm": 2.7252683639526367, + "learning_rate": 4.9930528093023085e-05, + "loss": 6.0444, + "step": 3992 + }, + { + "epoch": 0.023747502141021982, + "grad_norm": 2.333296060562134, + "learning_rate": 4.993049329051903e-05, + "loss": 5.6614, + "step": 3993 + }, + { + "epoch": 0.023753449424302978, + "grad_norm": 2.3571507930755615, + "learning_rate": 4.9930458479312e-05, + "loss": 6.328, + "step": 3994 + }, + { + "epoch": 0.023759396707583976, + "grad_norm": 2.7106499671936035, + "learning_rate": 4.9930423659402005e-05, + "loss": 6.0347, + "step": 3995 + }, + { + "epoch": 0.02376534399086497, + "grad_norm": 3.000009298324585, + "learning_rate": 4.9930388830789043e-05, + "loss": 5.5511, + "step": 3996 + }, + { + "epoch": 0.02377129127414597, + "grad_norm": 2.787912130355835, + "learning_rate": 4.993035399347316e-05, + "loss": 5.2059, + "step": 3997 + }, + { + "epoch": 0.02377723855742697, + "grad_norm": 2.7351326942443848, + "learning_rate": 4.993031914745433e-05, + "loss": 5.2997, + "step": 3998 + }, + { + "epoch": 0.023783185840707963, + "grad_norm": 2.770566701889038, + "learning_rate": 4.993028429273259e-05, + "loss": 5.8871, + "step": 3999 + }, + { + "epoch": 0.023789133123988962, + "grad_norm": 2.9528706073760986, + "learning_rate": 4.993024942930794e-05, + "loss": 5.8177, + "step": 4000 + }, + { + "epoch": 0.02379508040726996, + "grad_norm": 2.543329954147339, + "learning_rate": 4.993021455718041e-05, + "loss": 5.6446, + "step": 4001 + }, + { + "epoch": 0.023801027690550956, + "grad_norm": 2.7284936904907227, + "learning_rate": 4.993017967634999e-05, + "loss": 5.8404, + "step": 4002 + }, + { + "epoch": 0.023806974973831954, + "grad_norm": 2.752187728881836, + "learning_rate": 4.99301447868167e-05, + "loss": 5.6959, + "step": 4003 + }, + { + "epoch": 0.02381292225711295, + "grad_norm": 2.86651611328125, + "learning_rate": 4.993010988858056e-05, + "loss": 5.6329, + "step": 4004 + }, + { + "epoch": 0.023818869540393948, + "grad_norm": 3.9363176822662354, + "learning_rate": 4.9930074981641574e-05, + "loss": 5.31, + "step": 4005 + }, + { + "epoch": 0.023824816823674946, + "grad_norm": 3.41188907623291, + "learning_rate": 4.9930040065999764e-05, + "loss": 5.9905, + "step": 4006 + }, + { + "epoch": 0.02383076410695594, + "grad_norm": 3.4761459827423096, + "learning_rate": 4.9930005141655125e-05, + "loss": 6.0575, + "step": 4007 + }, + { + "epoch": 0.02383671139023694, + "grad_norm": 3.1562440395355225, + "learning_rate": 4.992997020860768e-05, + "loss": 5.9915, + "step": 4008 + }, + { + "epoch": 0.02384265867351794, + "grad_norm": 2.884049415588379, + "learning_rate": 4.992993526685744e-05, + "loss": 5.8051, + "step": 4009 + }, + { + "epoch": 0.023848605956798934, + "grad_norm": 3.3188138008117676, + "learning_rate": 4.992990031640442e-05, + "loss": 5.9637, + "step": 4010 + }, + { + "epoch": 0.023854553240079932, + "grad_norm": 3.2048282623291016, + "learning_rate": 4.992986535724862e-05, + "loss": 6.631, + "step": 4011 + }, + { + "epoch": 0.023860500523360927, + "grad_norm": 2.80204701423645, + "learning_rate": 4.992983038939008e-05, + "loss": 6.0063, + "step": 4012 + }, + { + "epoch": 0.023866447806641926, + "grad_norm": 2.993398427963257, + "learning_rate": 4.992979541282877e-05, + "loss": 5.9778, + "step": 4013 + }, + { + "epoch": 0.023872395089922924, + "grad_norm": 2.7519168853759766, + "learning_rate": 4.9929760427564744e-05, + "loss": 6.4272, + "step": 4014 + }, + { + "epoch": 0.02387834237320392, + "grad_norm": 2.9606168270111084, + "learning_rate": 4.992972543359799e-05, + "loss": 5.5372, + "step": 4015 + }, + { + "epoch": 0.023884289656484918, + "grad_norm": 2.1724514961242676, + "learning_rate": 4.992969043092853e-05, + "loss": 6.3115, + "step": 4016 + }, + { + "epoch": 0.023890236939765917, + "grad_norm": 2.1742191314697266, + "learning_rate": 4.9929655419556365e-05, + "loss": 6.5097, + "step": 4017 + }, + { + "epoch": 0.02389618422304691, + "grad_norm": 1.9729878902435303, + "learning_rate": 4.9929620399481526e-05, + "loss": 6.7061, + "step": 4018 + }, + { + "epoch": 0.02390213150632791, + "grad_norm": 2.6273725032806396, + "learning_rate": 4.9929585370704e-05, + "loss": 6.2838, + "step": 4019 + }, + { + "epoch": 0.023908078789608905, + "grad_norm": 2.5495283603668213, + "learning_rate": 4.9929550333223826e-05, + "loss": 6.1175, + "step": 4020 + }, + { + "epoch": 0.023914026072889904, + "grad_norm": 2.50193452835083, + "learning_rate": 4.9929515287041e-05, + "loss": 5.7689, + "step": 4021 + }, + { + "epoch": 0.023919973356170902, + "grad_norm": 2.402991771697998, + "learning_rate": 4.992948023215553e-05, + "loss": 6.4222, + "step": 4022 + }, + { + "epoch": 0.023925920639451898, + "grad_norm": 2.1722981929779053, + "learning_rate": 4.9929445168567444e-05, + "loss": 6.2335, + "step": 4023 + }, + { + "epoch": 0.023931867922732896, + "grad_norm": 1.6895688772201538, + "learning_rate": 4.992941009627675e-05, + "loss": 6.163, + "step": 4024 + }, + { + "epoch": 0.02393781520601389, + "grad_norm": 1.9944639205932617, + "learning_rate": 4.992937501528345e-05, + "loss": 6.2622, + "step": 4025 + }, + { + "epoch": 0.02394376248929489, + "grad_norm": 2.6157150268554688, + "learning_rate": 4.9929339925587565e-05, + "loss": 6.4582, + "step": 4026 + }, + { + "epoch": 0.023949709772575888, + "grad_norm": 2.021772623062134, + "learning_rate": 4.992930482718911e-05, + "loss": 6.2921, + "step": 4027 + }, + { + "epoch": 0.023955657055856883, + "grad_norm": 2.465402603149414, + "learning_rate": 4.992926972008808e-05, + "loss": 6.6426, + "step": 4028 + }, + { + "epoch": 0.023961604339137882, + "grad_norm": 2.337763547897339, + "learning_rate": 4.99292346042845e-05, + "loss": 6.4988, + "step": 4029 + }, + { + "epoch": 0.02396755162241888, + "grad_norm": 2.400064706802368, + "learning_rate": 4.9929199479778394e-05, + "loss": 6.6666, + "step": 4030 + }, + { + "epoch": 0.023973498905699876, + "grad_norm": 2.4205784797668457, + "learning_rate": 4.9929164346569756e-05, + "loss": 5.8805, + "step": 4031 + }, + { + "epoch": 0.023979446188980874, + "grad_norm": 2.312434673309326, + "learning_rate": 4.9929129204658605e-05, + "loss": 6.5161, + "step": 4032 + }, + { + "epoch": 0.02398539347226187, + "grad_norm": 2.02748966217041, + "learning_rate": 4.9929094054044944e-05, + "loss": 6.1272, + "step": 4033 + }, + { + "epoch": 0.023991340755542868, + "grad_norm": 2.280242443084717, + "learning_rate": 4.992905889472881e-05, + "loss": 5.7217, + "step": 4034 + }, + { + "epoch": 0.023997288038823866, + "grad_norm": 2.3911778926849365, + "learning_rate": 4.992902372671019e-05, + "loss": 5.7441, + "step": 4035 + }, + { + "epoch": 0.02400323532210486, + "grad_norm": 2.1767921447753906, + "learning_rate": 4.99289885499891e-05, + "loss": 5.7212, + "step": 4036 + }, + { + "epoch": 0.02400918260538586, + "grad_norm": 2.3067142963409424, + "learning_rate": 4.992895336456557e-05, + "loss": 5.6689, + "step": 4037 + }, + { + "epoch": 0.02401512988866686, + "grad_norm": 2.1564273834228516, + "learning_rate": 4.992891817043959e-05, + "loss": 6.1445, + "step": 4038 + }, + { + "epoch": 0.024021077171947854, + "grad_norm": 2.4852945804595947, + "learning_rate": 4.9928882967611184e-05, + "loss": 6.1883, + "step": 4039 + }, + { + "epoch": 0.024027024455228852, + "grad_norm": 2.9280812740325928, + "learning_rate": 4.992884775608036e-05, + "loss": 6.097, + "step": 4040 + }, + { + "epoch": 0.024032971738509847, + "grad_norm": 2.3219356536865234, + "learning_rate": 4.992881253584714e-05, + "loss": 6.3163, + "step": 4041 + }, + { + "epoch": 0.024038919021790846, + "grad_norm": 2.672386884689331, + "learning_rate": 4.9928777306911525e-05, + "loss": 5.9615, + "step": 4042 + }, + { + "epoch": 0.024044866305071844, + "grad_norm": 2.5886473655700684, + "learning_rate": 4.992874206927353e-05, + "loss": 6.0114, + "step": 4043 + }, + { + "epoch": 0.02405081358835284, + "grad_norm": 2.991230010986328, + "learning_rate": 4.992870682293318e-05, + "loss": 5.6805, + "step": 4044 + }, + { + "epoch": 0.024056760871633838, + "grad_norm": 2.3270034790039062, + "learning_rate": 4.9928671567890464e-05, + "loss": 5.7503, + "step": 4045 + }, + { + "epoch": 0.024062708154914837, + "grad_norm": 2.591627359390259, + "learning_rate": 4.99286363041454e-05, + "loss": 5.5707, + "step": 4046 + }, + { + "epoch": 0.02406865543819583, + "grad_norm": 2.1936891078948975, + "learning_rate": 4.992860103169802e-05, + "loss": 5.6503, + "step": 4047 + }, + { + "epoch": 0.02407460272147683, + "grad_norm": 2.2928214073181152, + "learning_rate": 4.992856575054832e-05, + "loss": 5.6067, + "step": 4048 + }, + { + "epoch": 0.024080550004757825, + "grad_norm": 2.4503591060638428, + "learning_rate": 4.992853046069632e-05, + "loss": 6.0067, + "step": 4049 + }, + { + "epoch": 0.024086497288038824, + "grad_norm": 2.84260630607605, + "learning_rate": 4.992849516214202e-05, + "loss": 6.4533, + "step": 4050 + }, + { + "epoch": 0.024092444571319822, + "grad_norm": 2.7172651290893555, + "learning_rate": 4.992845985488543e-05, + "loss": 6.4901, + "step": 4051 + }, + { + "epoch": 0.024098391854600817, + "grad_norm": 2.2101316452026367, + "learning_rate": 4.992842453892659e-05, + "loss": 6.3481, + "step": 4052 + }, + { + "epoch": 0.024104339137881816, + "grad_norm": 2.488199234008789, + "learning_rate": 4.992838921426549e-05, + "loss": 6.4893, + "step": 4053 + }, + { + "epoch": 0.02411028642116281, + "grad_norm": 2.3767058849334717, + "learning_rate": 4.992835388090215e-05, + "loss": 5.9828, + "step": 4054 + }, + { + "epoch": 0.02411623370444381, + "grad_norm": 2.3979814052581787, + "learning_rate": 4.992831853883657e-05, + "loss": 5.7607, + "step": 4055 + }, + { + "epoch": 0.024122180987724808, + "grad_norm": 2.766644239425659, + "learning_rate": 4.992828318806877e-05, + "loss": 5.523, + "step": 4056 + }, + { + "epoch": 0.024128128271005803, + "grad_norm": 3.3954427242279053, + "learning_rate": 4.9928247828598775e-05, + "loss": 6.1247, + "step": 4057 + }, + { + "epoch": 0.024134075554286802, + "grad_norm": 3.5597097873687744, + "learning_rate": 4.9928212460426585e-05, + "loss": 6.0877, + "step": 4058 + }, + { + "epoch": 0.0241400228375678, + "grad_norm": 2.8089418411254883, + "learning_rate": 4.992817708355221e-05, + "loss": 5.324, + "step": 4059 + }, + { + "epoch": 0.024145970120848795, + "grad_norm": 2.6756842136383057, + "learning_rate": 4.992814169797566e-05, + "loss": 5.5516, + "step": 4060 + }, + { + "epoch": 0.024151917404129794, + "grad_norm": 2.1218929290771484, + "learning_rate": 4.992810630369696e-05, + "loss": 6.102, + "step": 4061 + }, + { + "epoch": 0.02415786468741079, + "grad_norm": 2.7189652919769287, + "learning_rate": 4.992807090071611e-05, + "loss": 6.4258, + "step": 4062 + }, + { + "epoch": 0.024163811970691788, + "grad_norm": 2.4340744018554688, + "learning_rate": 4.992803548903313e-05, + "loss": 5.8059, + "step": 4063 + }, + { + "epoch": 0.024169759253972786, + "grad_norm": 2.46604323387146, + "learning_rate": 4.992800006864804e-05, + "loss": 5.8963, + "step": 4064 + }, + { + "epoch": 0.02417570653725378, + "grad_norm": 2.1969218254089355, + "learning_rate": 4.9927964639560835e-05, + "loss": 5.7835, + "step": 4065 + }, + { + "epoch": 0.02418165382053478, + "grad_norm": 2.4529223442077637, + "learning_rate": 4.9927929201771535e-05, + "loss": 6.3405, + "step": 4066 + }, + { + "epoch": 0.02418760110381578, + "grad_norm": 2.145331859588623, + "learning_rate": 4.992789375528015e-05, + "loss": 6.14, + "step": 4067 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 2.212646961212158, + "learning_rate": 4.99278583000867e-05, + "loss": 5.8793, + "step": 4068 + }, + { + "epoch": 0.024199495670377772, + "grad_norm": 2.3249876499176025, + "learning_rate": 4.992782283619118e-05, + "loss": 5.8702, + "step": 4069 + }, + { + "epoch": 0.024205442953658767, + "grad_norm": 2.180964946746826, + "learning_rate": 4.9927787363593634e-05, + "loss": 6.216, + "step": 4070 + }, + { + "epoch": 0.024211390236939766, + "grad_norm": 2.5633153915405273, + "learning_rate": 4.992775188229405e-05, + "loss": 6.031, + "step": 4071 + }, + { + "epoch": 0.024217337520220764, + "grad_norm": 2.867342233657837, + "learning_rate": 4.992771639229244e-05, + "loss": 5.9853, + "step": 4072 + }, + { + "epoch": 0.02422328480350176, + "grad_norm": 2.111253023147583, + "learning_rate": 4.992768089358882e-05, + "loss": 5.8404, + "step": 4073 + }, + { + "epoch": 0.024229232086782758, + "grad_norm": 1.9325549602508545, + "learning_rate": 4.992764538618321e-05, + "loss": 6.0175, + "step": 4074 + }, + { + "epoch": 0.024235179370063756, + "grad_norm": 2.721740484237671, + "learning_rate": 4.992760987007561e-05, + "loss": 5.9274, + "step": 4075 + }, + { + "epoch": 0.02424112665334475, + "grad_norm": 3.5240588188171387, + "learning_rate": 4.992757434526604e-05, + "loss": 5.3593, + "step": 4076 + }, + { + "epoch": 0.02424707393662575, + "grad_norm": 2.744248867034912, + "learning_rate": 4.9927538811754516e-05, + "loss": 5.8938, + "step": 4077 + }, + { + "epoch": 0.024253021219906745, + "grad_norm": 2.545384645462036, + "learning_rate": 4.992750326954104e-05, + "loss": 6.2127, + "step": 4078 + }, + { + "epoch": 0.024258968503187744, + "grad_norm": 2.7550806999206543, + "learning_rate": 4.992746771862563e-05, + "loss": 6.0784, + "step": 4079 + }, + { + "epoch": 0.024264915786468742, + "grad_norm": 2.408040761947632, + "learning_rate": 4.9927432159008305e-05, + "loss": 5.5908, + "step": 4080 + }, + { + "epoch": 0.024270863069749737, + "grad_norm": 2.581378698348999, + "learning_rate": 4.9927396590689066e-05, + "loss": 5.4438, + "step": 4081 + }, + { + "epoch": 0.024276810353030736, + "grad_norm": 2.4320218563079834, + "learning_rate": 4.992736101366794e-05, + "loss": 5.6239, + "step": 4082 + }, + { + "epoch": 0.024282757636311735, + "grad_norm": 2.4725472927093506, + "learning_rate": 4.992732542794492e-05, + "loss": 6.237, + "step": 4083 + }, + { + "epoch": 0.02428870491959273, + "grad_norm": 2.3081839084625244, + "learning_rate": 4.992728983352003e-05, + "loss": 5.9917, + "step": 4084 + }, + { + "epoch": 0.024294652202873728, + "grad_norm": 1.9090701341629028, + "learning_rate": 4.9927254230393287e-05, + "loss": 5.9125, + "step": 4085 + }, + { + "epoch": 0.024300599486154723, + "grad_norm": 2.3943240642547607, + "learning_rate": 4.992721861856468e-05, + "loss": 5.3431, + "step": 4086 + }, + { + "epoch": 0.024306546769435722, + "grad_norm": 2.226968765258789, + "learning_rate": 4.992718299803425e-05, + "loss": 5.4328, + "step": 4087 + }, + { + "epoch": 0.02431249405271672, + "grad_norm": 2.238218307495117, + "learning_rate": 4.9927147368801994e-05, + "loss": 5.4877, + "step": 4088 + }, + { + "epoch": 0.024318441335997715, + "grad_norm": 2.216540575027466, + "learning_rate": 4.992711173086794e-05, + "loss": 5.4037, + "step": 4089 + }, + { + "epoch": 0.024324388619278714, + "grad_norm": 2.3136301040649414, + "learning_rate": 4.992707608423208e-05, + "loss": 5.4576, + "step": 4090 + }, + { + "epoch": 0.02433033590255971, + "grad_norm": 2.0434980392456055, + "learning_rate": 4.9927040428894436e-05, + "loss": 5.8044, + "step": 4091 + }, + { + "epoch": 0.024336283185840708, + "grad_norm": 2.7837064266204834, + "learning_rate": 4.992700476485502e-05, + "loss": 6.4183, + "step": 4092 + }, + { + "epoch": 0.024342230469121706, + "grad_norm": 2.580411195755005, + "learning_rate": 4.992696909211384e-05, + "loss": 5.4545, + "step": 4093 + }, + { + "epoch": 0.0243481777524027, + "grad_norm": 2.1215696334838867, + "learning_rate": 4.9926933410670916e-05, + "loss": 5.5629, + "step": 4094 + }, + { + "epoch": 0.0243541250356837, + "grad_norm": 1.9621074199676514, + "learning_rate": 4.992689772052626e-05, + "loss": 5.5248, + "step": 4095 + }, + { + "epoch": 0.0243600723189647, + "grad_norm": 2.1773006916046143, + "learning_rate": 4.992686202167988e-05, + "loss": 5.3285, + "step": 4096 + }, + { + "epoch": 0.024366019602245693, + "grad_norm": 1.9506359100341797, + "learning_rate": 4.992682631413179e-05, + "loss": 5.7989, + "step": 4097 + }, + { + "epoch": 0.024371966885526692, + "grad_norm": 1.9154741764068604, + "learning_rate": 4.9926790597882e-05, + "loss": 5.6029, + "step": 4098 + }, + { + "epoch": 0.024377914168807687, + "grad_norm": 2.2147481441497803, + "learning_rate": 4.9926754872930524e-05, + "loss": 5.5406, + "step": 4099 + }, + { + "epoch": 0.024383861452088686, + "grad_norm": 2.1268460750579834, + "learning_rate": 4.992671913927738e-05, + "loss": 5.6434, + "step": 4100 + }, + { + "epoch": 0.024389808735369684, + "grad_norm": 2.1212456226348877, + "learning_rate": 4.992668339692258e-05, + "loss": 5.6888, + "step": 4101 + }, + { + "epoch": 0.02439575601865068, + "grad_norm": 2.2292001247406006, + "learning_rate": 4.992664764586612e-05, + "loss": 5.3982, + "step": 4102 + }, + { + "epoch": 0.024401703301931678, + "grad_norm": 2.2713210582733154, + "learning_rate": 4.9926611886108035e-05, + "loss": 5.3521, + "step": 4103 + }, + { + "epoch": 0.024407650585212676, + "grad_norm": 2.273437738418579, + "learning_rate": 4.9926576117648314e-05, + "loss": 5.474, + "step": 4104 + }, + { + "epoch": 0.02441359786849367, + "grad_norm": 2.2879083156585693, + "learning_rate": 4.9926540340487e-05, + "loss": 5.4474, + "step": 4105 + }, + { + "epoch": 0.02441954515177467, + "grad_norm": 2.2517430782318115, + "learning_rate": 4.992650455462408e-05, + "loss": 5.5013, + "step": 4106 + }, + { + "epoch": 0.024425492435055665, + "grad_norm": 2.1391677856445312, + "learning_rate": 4.992646876005957e-05, + "loss": 5.3899, + "step": 4107 + }, + { + "epoch": 0.024431439718336664, + "grad_norm": 2.2989962100982666, + "learning_rate": 4.9926432956793494e-05, + "loss": 5.7995, + "step": 4108 + }, + { + "epoch": 0.024437387001617662, + "grad_norm": 2.550706386566162, + "learning_rate": 4.992639714482586e-05, + "loss": 5.6599, + "step": 4109 + }, + { + "epoch": 0.024443334284898657, + "grad_norm": 2.321398973464966, + "learning_rate": 4.992636132415667e-05, + "loss": 5.6852, + "step": 4110 + }, + { + "epoch": 0.024449281568179656, + "grad_norm": 2.300795555114746, + "learning_rate": 4.992632549478595e-05, + "loss": 5.7318, + "step": 4111 + }, + { + "epoch": 0.024455228851460654, + "grad_norm": 2.229156970977783, + "learning_rate": 4.992628965671371e-05, + "loss": 5.6617, + "step": 4112 + }, + { + "epoch": 0.02446117613474165, + "grad_norm": 2.253934144973755, + "learning_rate": 4.992625380993995e-05, + "loss": 5.5762, + "step": 4113 + }, + { + "epoch": 0.024467123418022648, + "grad_norm": 2.0932998657226562, + "learning_rate": 4.992621795446471e-05, + "loss": 5.568, + "step": 4114 + }, + { + "epoch": 0.024473070701303643, + "grad_norm": 2.5969886779785156, + "learning_rate": 4.9926182090287966e-05, + "loss": 5.6626, + "step": 4115 + }, + { + "epoch": 0.02447901798458464, + "grad_norm": 2.5260698795318604, + "learning_rate": 4.992614621740976e-05, + "loss": 5.6333, + "step": 4116 + }, + { + "epoch": 0.02448496526786564, + "grad_norm": 2.0017902851104736, + "learning_rate": 4.992611033583009e-05, + "loss": 5.793, + "step": 4117 + }, + { + "epoch": 0.024490912551146635, + "grad_norm": 2.1847705841064453, + "learning_rate": 4.992607444554898e-05, + "loss": 5.8348, + "step": 4118 + }, + { + "epoch": 0.024496859834427634, + "grad_norm": 2.141007900238037, + "learning_rate": 4.992603854656642e-05, + "loss": 5.7835, + "step": 4119 + }, + { + "epoch": 0.02450280711770863, + "grad_norm": 2.294605255126953, + "learning_rate": 4.992600263888245e-05, + "loss": 5.6615, + "step": 4120 + }, + { + "epoch": 0.024508754400989628, + "grad_norm": 2.433936357498169, + "learning_rate": 4.9925966722497064e-05, + "loss": 5.6479, + "step": 4121 + }, + { + "epoch": 0.024514701684270626, + "grad_norm": 2.1522979736328125, + "learning_rate": 4.992593079741028e-05, + "loss": 5.5761, + "step": 4122 + }, + { + "epoch": 0.02452064896755162, + "grad_norm": 2.141065835952759, + "learning_rate": 4.9925894863622114e-05, + "loss": 5.602, + "step": 4123 + }, + { + "epoch": 0.02452659625083262, + "grad_norm": 2.187838554382324, + "learning_rate": 4.9925858921132576e-05, + "loss": 5.6337, + "step": 4124 + }, + { + "epoch": 0.02453254353411362, + "grad_norm": 2.303027629852295, + "learning_rate": 4.992582296994167e-05, + "loss": 5.6126, + "step": 4125 + }, + { + "epoch": 0.024538490817394613, + "grad_norm": 1.9233589172363281, + "learning_rate": 4.992578701004943e-05, + "loss": 5.5852, + "step": 4126 + }, + { + "epoch": 0.024544438100675612, + "grad_norm": 2.0383386611938477, + "learning_rate": 4.992575104145585e-05, + "loss": 5.6477, + "step": 4127 + }, + { + "epoch": 0.024550385383956607, + "grad_norm": 2.2752933502197266, + "learning_rate": 4.9925715064160946e-05, + "loss": 5.6263, + "step": 4128 + }, + { + "epoch": 0.024556332667237606, + "grad_norm": 2.400083541870117, + "learning_rate": 4.9925679078164734e-05, + "loss": 5.5249, + "step": 4129 + }, + { + "epoch": 0.024562279950518604, + "grad_norm": 2.167365312576294, + "learning_rate": 4.992564308346722e-05, + "loss": 5.7299, + "step": 4130 + }, + { + "epoch": 0.0245682272337996, + "grad_norm": 1.9696096181869507, + "learning_rate": 4.9925607080068426e-05, + "loss": 5.7961, + "step": 4131 + }, + { + "epoch": 0.024574174517080598, + "grad_norm": 2.1817007064819336, + "learning_rate": 4.992557106796836e-05, + "loss": 5.7973, + "step": 4132 + }, + { + "epoch": 0.024580121800361596, + "grad_norm": 2.4329075813293457, + "learning_rate": 4.992553504716704e-05, + "loss": 6.2428, + "step": 4133 + }, + { + "epoch": 0.02458606908364259, + "grad_norm": 2.159193754196167, + "learning_rate": 4.9925499017664464e-05, + "loss": 5.5784, + "step": 4134 + }, + { + "epoch": 0.02459201636692359, + "grad_norm": 2.2614853382110596, + "learning_rate": 4.992546297946066e-05, + "loss": 5.7572, + "step": 4135 + }, + { + "epoch": 0.024597963650204585, + "grad_norm": 2.2874412536621094, + "learning_rate": 4.992542693255563e-05, + "loss": 5.5726, + "step": 4136 + }, + { + "epoch": 0.024603910933485584, + "grad_norm": 2.1634466648101807, + "learning_rate": 4.992539087694939e-05, + "loss": 5.5112, + "step": 4137 + }, + { + "epoch": 0.024609858216766582, + "grad_norm": 2.195528507232666, + "learning_rate": 4.9925354812641955e-05, + "loss": 5.6073, + "step": 4138 + }, + { + "epoch": 0.024615805500047577, + "grad_norm": 2.0328054428100586, + "learning_rate": 4.992531873963334e-05, + "loss": 5.5686, + "step": 4139 + }, + { + "epoch": 0.024621752783328576, + "grad_norm": 2.244218349456787, + "learning_rate": 4.992528265792355e-05, + "loss": 5.6871, + "step": 4140 + }, + { + "epoch": 0.024627700066609574, + "grad_norm": 2.081721544265747, + "learning_rate": 4.992524656751261e-05, + "loss": 5.5327, + "step": 4141 + }, + { + "epoch": 0.02463364734989057, + "grad_norm": 1.9305940866470337, + "learning_rate": 4.992521046840051e-05, + "loss": 5.5265, + "step": 4142 + }, + { + "epoch": 0.024639594633171568, + "grad_norm": 2.624286651611328, + "learning_rate": 4.992517436058728e-05, + "loss": 5.3881, + "step": 4143 + }, + { + "epoch": 0.024645541916452563, + "grad_norm": 2.204803705215454, + "learning_rate": 4.9925138244072935e-05, + "loss": 5.6686, + "step": 4144 + }, + { + "epoch": 0.02465148919973356, + "grad_norm": 2.4664852619171143, + "learning_rate": 4.992510211885748e-05, + "loss": 5.3152, + "step": 4145 + }, + { + "epoch": 0.02465743648301456, + "grad_norm": 2.3428542613983154, + "learning_rate": 4.992506598494093e-05, + "loss": 5.5875, + "step": 4146 + }, + { + "epoch": 0.024663383766295555, + "grad_norm": 2.1902847290039062, + "learning_rate": 4.992502984232329e-05, + "loss": 5.4826, + "step": 4147 + }, + { + "epoch": 0.024669331049576554, + "grad_norm": 2.0401039123535156, + "learning_rate": 4.992499369100459e-05, + "loss": 5.518, + "step": 4148 + }, + { + "epoch": 0.02467527833285755, + "grad_norm": 2.5250306129455566, + "learning_rate": 4.9924957530984825e-05, + "loss": 5.5744, + "step": 4149 + }, + { + "epoch": 0.024681225616138548, + "grad_norm": 1.9975959062576294, + "learning_rate": 4.9924921362264016e-05, + "loss": 5.6834, + "step": 4150 + }, + { + "epoch": 0.024687172899419546, + "grad_norm": 2.047011375427246, + "learning_rate": 4.992488518484217e-05, + "loss": 5.6703, + "step": 4151 + }, + { + "epoch": 0.02469312018270054, + "grad_norm": 2.142411470413208, + "learning_rate": 4.9924848998719314e-05, + "loss": 5.781, + "step": 4152 + }, + { + "epoch": 0.02469906746598154, + "grad_norm": 2.1012768745422363, + "learning_rate": 4.992481280389545e-05, + "loss": 5.618, + "step": 4153 + }, + { + "epoch": 0.024705014749262538, + "grad_norm": 2.4698173999786377, + "learning_rate": 4.9924776600370584e-05, + "loss": 6.4773, + "step": 4154 + }, + { + "epoch": 0.024710962032543533, + "grad_norm": 2.4975368976593018, + "learning_rate": 4.992474038814474e-05, + "loss": 5.2568, + "step": 4155 + }, + { + "epoch": 0.024716909315824532, + "grad_norm": 1.8329259157180786, + "learning_rate": 4.992470416721793e-05, + "loss": 5.775, + "step": 4156 + }, + { + "epoch": 0.024722856599105527, + "grad_norm": 1.9757754802703857, + "learning_rate": 4.992466793759015e-05, + "loss": 5.5408, + "step": 4157 + }, + { + "epoch": 0.024728803882386526, + "grad_norm": 1.8300005197525024, + "learning_rate": 4.9924631699261434e-05, + "loss": 5.5356, + "step": 4158 + }, + { + "epoch": 0.024734751165667524, + "grad_norm": 2.099102735519409, + "learning_rate": 4.992459545223179e-05, + "loss": 5.6811, + "step": 4159 + }, + { + "epoch": 0.02474069844894852, + "grad_norm": 2.000169277191162, + "learning_rate": 4.992455919650123e-05, + "loss": 5.511, + "step": 4160 + }, + { + "epoch": 0.024746645732229518, + "grad_norm": 2.0555150508880615, + "learning_rate": 4.992452293206976e-05, + "loss": 5.7553, + "step": 4161 + }, + { + "epoch": 0.024752593015510516, + "grad_norm": 2.0416486263275146, + "learning_rate": 4.99244866589374e-05, + "loss": 5.6965, + "step": 4162 + }, + { + "epoch": 0.02475854029879151, + "grad_norm": 2.0028059482574463, + "learning_rate": 4.9924450377104146e-05, + "loss": 5.7211, + "step": 4163 + }, + { + "epoch": 0.02476448758207251, + "grad_norm": 2.22377872467041, + "learning_rate": 4.992441408657004e-05, + "loss": 5.6384, + "step": 4164 + }, + { + "epoch": 0.024770434865353505, + "grad_norm": 2.038804531097412, + "learning_rate": 4.9924377787335064e-05, + "loss": 5.6351, + "step": 4165 + }, + { + "epoch": 0.024776382148634504, + "grad_norm": 2.357773542404175, + "learning_rate": 4.992434147939925e-05, + "loss": 5.2791, + "step": 4166 + }, + { + "epoch": 0.024782329431915502, + "grad_norm": 2.1949357986450195, + "learning_rate": 4.992430516276261e-05, + "loss": 5.7389, + "step": 4167 + }, + { + "epoch": 0.024788276715196497, + "grad_norm": 2.1015608310699463, + "learning_rate": 4.992426883742516e-05, + "loss": 5.632, + "step": 4168 + }, + { + "epoch": 0.024794223998477496, + "grad_norm": 2.166201591491699, + "learning_rate": 4.992423250338689e-05, + "loss": 5.5701, + "step": 4169 + }, + { + "epoch": 0.024800171281758494, + "grad_norm": 2.0805492401123047, + "learning_rate": 4.9924196160647836e-05, + "loss": 5.5955, + "step": 4170 + }, + { + "epoch": 0.02480611856503949, + "grad_norm": 1.803229570388794, + "learning_rate": 4.9924159809208e-05, + "loss": 5.6267, + "step": 4171 + }, + { + "epoch": 0.024812065848320488, + "grad_norm": 2.008639335632324, + "learning_rate": 4.9924123449067393e-05, + "loss": 5.6667, + "step": 4172 + }, + { + "epoch": 0.024818013131601483, + "grad_norm": 1.9843655824661255, + "learning_rate": 4.9924087080226044e-05, + "loss": 5.5981, + "step": 4173 + }, + { + "epoch": 0.02482396041488248, + "grad_norm": 2.10270357131958, + "learning_rate": 4.9924050702683946e-05, + "loss": 5.5293, + "step": 4174 + }, + { + "epoch": 0.02482990769816348, + "grad_norm": 2.315976142883301, + "learning_rate": 4.992401431644112e-05, + "loss": 5.6046, + "step": 4175 + }, + { + "epoch": 0.024835854981444475, + "grad_norm": 2.168473482131958, + "learning_rate": 4.992397792149758e-05, + "loss": 5.4271, + "step": 4176 + }, + { + "epoch": 0.024841802264725474, + "grad_norm": 2.1870200634002686, + "learning_rate": 4.9923941517853335e-05, + "loss": 5.6399, + "step": 4177 + }, + { + "epoch": 0.024847749548006472, + "grad_norm": 2.2944717407226562, + "learning_rate": 4.9923905105508394e-05, + "loss": 5.4483, + "step": 4178 + }, + { + "epoch": 0.024853696831287467, + "grad_norm": 2.1662731170654297, + "learning_rate": 4.9923868684462785e-05, + "loss": 5.6773, + "step": 4179 + }, + { + "epoch": 0.024859644114568466, + "grad_norm": 1.7448937892913818, + "learning_rate": 4.992383225471651e-05, + "loss": 5.6097, + "step": 4180 + }, + { + "epoch": 0.02486559139784946, + "grad_norm": 2.3577585220336914, + "learning_rate": 4.9923795816269576e-05, + "loss": 5.5003, + "step": 4181 + }, + { + "epoch": 0.02487153868113046, + "grad_norm": 2.4175360202789307, + "learning_rate": 4.9923759369122e-05, + "loss": 5.4925, + "step": 4182 + }, + { + "epoch": 0.024877485964411458, + "grad_norm": 2.199329137802124, + "learning_rate": 4.992372291327381e-05, + "loss": 5.6239, + "step": 4183 + }, + { + "epoch": 0.024883433247692453, + "grad_norm": 2.054450511932373, + "learning_rate": 4.9923686448724994e-05, + "loss": 5.59, + "step": 4184 + }, + { + "epoch": 0.024889380530973452, + "grad_norm": 2.0354533195495605, + "learning_rate": 4.9923649975475585e-05, + "loss": 5.6092, + "step": 4185 + }, + { + "epoch": 0.024895327814254447, + "grad_norm": 2.0409371852874756, + "learning_rate": 4.9923613493525576e-05, + "loss": 5.5009, + "step": 4186 + }, + { + "epoch": 0.024901275097535445, + "grad_norm": 2.3314719200134277, + "learning_rate": 4.992357700287501e-05, + "loss": 5.5077, + "step": 4187 + }, + { + "epoch": 0.024907222380816444, + "grad_norm": 2.050706386566162, + "learning_rate": 4.9923540503523865e-05, + "loss": 5.5857, + "step": 4188 + }, + { + "epoch": 0.02491316966409744, + "grad_norm": 2.3477721214294434, + "learning_rate": 4.992350399547218e-05, + "loss": 5.5119, + "step": 4189 + }, + { + "epoch": 0.024919116947378438, + "grad_norm": 2.365171194076538, + "learning_rate": 4.992346747871994e-05, + "loss": 5.583, + "step": 4190 + }, + { + "epoch": 0.024925064230659436, + "grad_norm": 1.9642738103866577, + "learning_rate": 4.992343095326719e-05, + "loss": 5.3527, + "step": 4191 + }, + { + "epoch": 0.02493101151394043, + "grad_norm": 2.25437593460083, + "learning_rate": 4.992339441911392e-05, + "loss": 5.4751, + "step": 4192 + }, + { + "epoch": 0.02493695879722143, + "grad_norm": 2.0476715564727783, + "learning_rate": 4.992335787626016e-05, + "loss": 5.5808, + "step": 4193 + }, + { + "epoch": 0.024942906080502425, + "grad_norm": 2.248382329940796, + "learning_rate": 4.992332132470591e-05, + "loss": 5.5771, + "step": 4194 + }, + { + "epoch": 0.024948853363783424, + "grad_norm": 2.279232978820801, + "learning_rate": 4.992328476445118e-05, + "loss": 5.3803, + "step": 4195 + }, + { + "epoch": 0.024954800647064422, + "grad_norm": 2.0171918869018555, + "learning_rate": 4.992324819549599e-05, + "loss": 5.662, + "step": 4196 + }, + { + "epoch": 0.024960747930345417, + "grad_norm": 2.14736008644104, + "learning_rate": 4.992321161784036e-05, + "loss": 5.6422, + "step": 4197 + }, + { + "epoch": 0.024966695213626416, + "grad_norm": 2.1694438457489014, + "learning_rate": 4.9923175031484284e-05, + "loss": 5.4377, + "step": 4198 + }, + { + "epoch": 0.024972642496907414, + "grad_norm": 1.9280356168746948, + "learning_rate": 4.9923138436427784e-05, + "loss": 5.5499, + "step": 4199 + }, + { + "epoch": 0.02497858978018841, + "grad_norm": 2.185974359512329, + "learning_rate": 4.992310183267088e-05, + "loss": 5.6404, + "step": 4200 + }, + { + "epoch": 0.024984537063469408, + "grad_norm": 2.102681875228882, + "learning_rate": 4.9923065220213585e-05, + "loss": 5.5888, + "step": 4201 + }, + { + "epoch": 0.024990484346750403, + "grad_norm": 2.07100772857666, + "learning_rate": 4.99230285990559e-05, + "loss": 5.6473, + "step": 4202 + }, + { + "epoch": 0.0249964316300314, + "grad_norm": 2.088634967803955, + "learning_rate": 4.992299196919784e-05, + "loss": 5.4993, + "step": 4203 + }, + { + "epoch": 0.0250023789133124, + "grad_norm": 2.2086873054504395, + "learning_rate": 4.992295533063942e-05, + "loss": 5.5797, + "step": 4204 + }, + { + "epoch": 0.025008326196593395, + "grad_norm": 2.250753164291382, + "learning_rate": 4.992291868338066e-05, + "loss": 5.5666, + "step": 4205 + }, + { + "epoch": 0.025014273479874394, + "grad_norm": 2.132636785507202, + "learning_rate": 4.992288202742156e-05, + "loss": 5.6715, + "step": 4206 + }, + { + "epoch": 0.025020220763155392, + "grad_norm": 2.8332200050354004, + "learning_rate": 4.992284536276214e-05, + "loss": 4.9687, + "step": 4207 + }, + { + "epoch": 0.025026168046436387, + "grad_norm": 2.345991849899292, + "learning_rate": 4.992280868940241e-05, + "loss": 5.2181, + "step": 4208 + }, + { + "epoch": 0.025032115329717386, + "grad_norm": 2.149568557739258, + "learning_rate": 4.992277200734239e-05, + "loss": 5.5336, + "step": 4209 + }, + { + "epoch": 0.02503806261299838, + "grad_norm": 2.031353235244751, + "learning_rate": 4.992273531658209e-05, + "loss": 5.5779, + "step": 4210 + }, + { + "epoch": 0.02504400989627938, + "grad_norm": 2.217374086380005, + "learning_rate": 4.9922698617121524e-05, + "loss": 5.782, + "step": 4211 + }, + { + "epoch": 0.025049957179560378, + "grad_norm": 2.3629000186920166, + "learning_rate": 4.992266190896069e-05, + "loss": 5.7916, + "step": 4212 + }, + { + "epoch": 0.025055904462841373, + "grad_norm": 2.2439091205596924, + "learning_rate": 4.9922625192099616e-05, + "loss": 5.8002, + "step": 4213 + }, + { + "epoch": 0.025061851746122372, + "grad_norm": 2.1707634925842285, + "learning_rate": 4.992258846653831e-05, + "loss": 6.5789, + "step": 4214 + }, + { + "epoch": 0.025067799029403367, + "grad_norm": 3.1655468940734863, + "learning_rate": 4.992255173227679e-05, + "loss": 6.3867, + "step": 4215 + }, + { + "epoch": 0.025073746312684365, + "grad_norm": 3.1309874057769775, + "learning_rate": 4.992251498931506e-05, + "loss": 6.2682, + "step": 4216 + }, + { + "epoch": 0.025079693595965364, + "grad_norm": 3.2077460289001465, + "learning_rate": 4.992247823765315e-05, + "loss": 5.8593, + "step": 4217 + }, + { + "epoch": 0.02508564087924636, + "grad_norm": 2.2944962978363037, + "learning_rate": 4.992244147729105e-05, + "loss": 5.7994, + "step": 4218 + }, + { + "epoch": 0.025091588162527358, + "grad_norm": 2.2380926609039307, + "learning_rate": 4.9922404708228776e-05, + "loss": 5.7606, + "step": 4219 + }, + { + "epoch": 0.025097535445808356, + "grad_norm": 2.601795196533203, + "learning_rate": 4.992236793046636e-05, + "loss": 5.7585, + "step": 4220 + }, + { + "epoch": 0.02510348272908935, + "grad_norm": 2.494765520095825, + "learning_rate": 4.99223311440038e-05, + "loss": 5.8102, + "step": 4221 + }, + { + "epoch": 0.02510943001237035, + "grad_norm": 2.4690544605255127, + "learning_rate": 4.992229434884111e-05, + "loss": 5.8682, + "step": 4222 + }, + { + "epoch": 0.025115377295651345, + "grad_norm": 2.1011085510253906, + "learning_rate": 4.99222575449783e-05, + "loss": 5.6982, + "step": 4223 + }, + { + "epoch": 0.025121324578932343, + "grad_norm": 2.2298128604888916, + "learning_rate": 4.992222073241539e-05, + "loss": 5.7606, + "step": 4224 + }, + { + "epoch": 0.025127271862213342, + "grad_norm": 1.93464994430542, + "learning_rate": 4.99221839111524e-05, + "loss": 5.7097, + "step": 4225 + }, + { + "epoch": 0.025133219145494337, + "grad_norm": 2.15191650390625, + "learning_rate": 4.9922147081189324e-05, + "loss": 5.5852, + "step": 4226 + }, + { + "epoch": 0.025139166428775336, + "grad_norm": 2.086954355239868, + "learning_rate": 4.992211024252619e-05, + "loss": 5.5871, + "step": 4227 + }, + { + "epoch": 0.025145113712056334, + "grad_norm": 2.212296724319458, + "learning_rate": 4.9922073395162995e-05, + "loss": 5.562, + "step": 4228 + }, + { + "epoch": 0.02515106099533733, + "grad_norm": 2.0786778926849365, + "learning_rate": 4.992203653909977e-05, + "loss": 5.6599, + "step": 4229 + }, + { + "epoch": 0.025157008278618328, + "grad_norm": 2.3243489265441895, + "learning_rate": 4.9921999674336514e-05, + "loss": 5.9791, + "step": 4230 + }, + { + "epoch": 0.025162955561899323, + "grad_norm": 2.1922898292541504, + "learning_rate": 4.9921962800873247e-05, + "loss": 5.7352, + "step": 4231 + }, + { + "epoch": 0.02516890284518032, + "grad_norm": 2.1154398918151855, + "learning_rate": 4.992192591870998e-05, + "loss": 5.6408, + "step": 4232 + }, + { + "epoch": 0.02517485012846132, + "grad_norm": 2.3520143032073975, + "learning_rate": 4.992188902784673e-05, + "loss": 5.6318, + "step": 4233 + }, + { + "epoch": 0.025180797411742315, + "grad_norm": 2.16597580909729, + "learning_rate": 4.99218521282835e-05, + "loss": 5.4978, + "step": 4234 + }, + { + "epoch": 0.025186744695023314, + "grad_norm": 2.2510032653808594, + "learning_rate": 4.992181522002032e-05, + "loss": 5.4863, + "step": 4235 + }, + { + "epoch": 0.025192691978304312, + "grad_norm": 1.9984945058822632, + "learning_rate": 4.9921778303057174e-05, + "loss": 5.7514, + "step": 4236 + }, + { + "epoch": 0.025198639261585307, + "grad_norm": 2.019435167312622, + "learning_rate": 4.9921741377394106e-05, + "loss": 5.6481, + "step": 4237 + }, + { + "epoch": 0.025204586544866306, + "grad_norm": 1.8546136617660522, + "learning_rate": 4.9921704443031114e-05, + "loss": 5.5907, + "step": 4238 + }, + { + "epoch": 0.0252105338281473, + "grad_norm": 2.012821912765503, + "learning_rate": 4.9921667499968214e-05, + "loss": 5.6942, + "step": 4239 + }, + { + "epoch": 0.0252164811114283, + "grad_norm": 2.215322971343994, + "learning_rate": 4.992163054820541e-05, + "loss": 5.6248, + "step": 4240 + }, + { + "epoch": 0.025222428394709298, + "grad_norm": 2.1009631156921387, + "learning_rate": 4.9921593587742726e-05, + "loss": 5.7769, + "step": 4241 + }, + { + "epoch": 0.025228375677990293, + "grad_norm": 2.280970335006714, + "learning_rate": 4.992155661858017e-05, + "loss": 5.4233, + "step": 4242 + }, + { + "epoch": 0.025234322961271292, + "grad_norm": 2.324589729309082, + "learning_rate": 4.992151964071776e-05, + "loss": 5.7138, + "step": 4243 + }, + { + "epoch": 0.025240270244552287, + "grad_norm": 2.01705002784729, + "learning_rate": 4.9921482654155506e-05, + "loss": 5.6946, + "step": 4244 + }, + { + "epoch": 0.025246217527833285, + "grad_norm": 2.0912036895751953, + "learning_rate": 4.9921445658893414e-05, + "loss": 5.8085, + "step": 4245 + }, + { + "epoch": 0.025252164811114284, + "grad_norm": 2.03450870513916, + "learning_rate": 4.99214086549315e-05, + "loss": 5.9129, + "step": 4246 + }, + { + "epoch": 0.02525811209439528, + "grad_norm": 2.1532092094421387, + "learning_rate": 4.9921371642269786e-05, + "loss": 5.708, + "step": 4247 + }, + { + "epoch": 0.025264059377676278, + "grad_norm": 2.2842540740966797, + "learning_rate": 4.992133462090828e-05, + "loss": 5.6693, + "step": 4248 + }, + { + "epoch": 0.025270006660957276, + "grad_norm": 2.0693325996398926, + "learning_rate": 4.9921297590846997e-05, + "loss": 5.7278, + "step": 4249 + }, + { + "epoch": 0.02527595394423827, + "grad_norm": 2.0139124393463135, + "learning_rate": 4.9921260552085934e-05, + "loss": 5.5897, + "step": 4250 + }, + { + "epoch": 0.02528190122751927, + "grad_norm": 2.4587321281433105, + "learning_rate": 4.9921223504625125e-05, + "loss": 5.6884, + "step": 4251 + }, + { + "epoch": 0.025287848510800265, + "grad_norm": 2.062640428543091, + "learning_rate": 4.992118644846457e-05, + "loss": 5.6189, + "step": 4252 + }, + { + "epoch": 0.025293795794081263, + "grad_norm": 1.9889299869537354, + "learning_rate": 4.992114938360429e-05, + "loss": 5.7326, + "step": 4253 + }, + { + "epoch": 0.025299743077362262, + "grad_norm": 2.001913547515869, + "learning_rate": 4.992111231004429e-05, + "loss": 5.6765, + "step": 4254 + }, + { + "epoch": 0.025305690360643257, + "grad_norm": 2.0345358848571777, + "learning_rate": 4.992107522778459e-05, + "loss": 5.5783, + "step": 4255 + }, + { + "epoch": 0.025311637643924256, + "grad_norm": 2.277817487716675, + "learning_rate": 4.9921038136825205e-05, + "loss": 5.6672, + "step": 4256 + }, + { + "epoch": 0.025317584927205254, + "grad_norm": 1.8992491960525513, + "learning_rate": 4.992100103716614e-05, + "loss": 5.532, + "step": 4257 + }, + { + "epoch": 0.02532353221048625, + "grad_norm": 2.202746629714966, + "learning_rate": 4.992096392880741e-05, + "loss": 5.697, + "step": 4258 + }, + { + "epoch": 0.025329479493767248, + "grad_norm": 2.020514488220215, + "learning_rate": 4.992092681174903e-05, + "loss": 5.9102, + "step": 4259 + }, + { + "epoch": 0.025335426777048243, + "grad_norm": 2.0697989463806152, + "learning_rate": 4.9920889685991e-05, + "loss": 5.5165, + "step": 4260 + }, + { + "epoch": 0.02534137406032924, + "grad_norm": 2.619258165359497, + "learning_rate": 4.992085255153336e-05, + "loss": 5.6577, + "step": 4261 + }, + { + "epoch": 0.02534732134361024, + "grad_norm": 2.1612637042999268, + "learning_rate": 4.99208154083761e-05, + "loss": 5.8193, + "step": 4262 + }, + { + "epoch": 0.025353268626891235, + "grad_norm": 1.9237465858459473, + "learning_rate": 4.9920778256519244e-05, + "loss": 5.6533, + "step": 4263 + }, + { + "epoch": 0.025359215910172234, + "grad_norm": 2.164339065551758, + "learning_rate": 4.99207410959628e-05, + "loss": 5.5566, + "step": 4264 + }, + { + "epoch": 0.025365163193453232, + "grad_norm": 2.0753626823425293, + "learning_rate": 4.992070392670678e-05, + "loss": 5.8444, + "step": 4265 + }, + { + "epoch": 0.025371110476734227, + "grad_norm": 1.977522850036621, + "learning_rate": 4.992066674875121e-05, + "loss": 5.6615, + "step": 4266 + }, + { + "epoch": 0.025377057760015226, + "grad_norm": 1.9911431074142456, + "learning_rate": 4.992062956209608e-05, + "loss": 5.6366, + "step": 4267 + }, + { + "epoch": 0.02538300504329622, + "grad_norm": 2.0334808826446533, + "learning_rate": 4.992059236674142e-05, + "loss": 5.8399, + "step": 4268 + }, + { + "epoch": 0.02538895232657722, + "grad_norm": 2.2869162559509277, + "learning_rate": 4.992055516268724e-05, + "loss": 5.7302, + "step": 4269 + }, + { + "epoch": 0.025394899609858218, + "grad_norm": 2.0845389366149902, + "learning_rate": 4.9920517949933556e-05, + "loss": 5.619, + "step": 4270 + }, + { + "epoch": 0.025400846893139213, + "grad_norm": 2.290881633758545, + "learning_rate": 4.9920480728480376e-05, + "loss": 5.5629, + "step": 4271 + }, + { + "epoch": 0.02540679417642021, + "grad_norm": 2.0897767543792725, + "learning_rate": 4.9920443498327706e-05, + "loss": 5.7009, + "step": 4272 + }, + { + "epoch": 0.025412741459701207, + "grad_norm": 1.8389668464660645, + "learning_rate": 4.9920406259475574e-05, + "loss": 5.6359, + "step": 4273 + }, + { + "epoch": 0.025418688742982205, + "grad_norm": 2.0262937545776367, + "learning_rate": 4.992036901192399e-05, + "loss": 5.6707, + "step": 4274 + }, + { + "epoch": 0.025424636026263204, + "grad_norm": 2.04280686378479, + "learning_rate": 4.992033175567295e-05, + "loss": 5.7917, + "step": 4275 + }, + { + "epoch": 0.0254305833095442, + "grad_norm": 2.0945205688476562, + "learning_rate": 4.992029449072249e-05, + "loss": 5.7208, + "step": 4276 + }, + { + "epoch": 0.025436530592825198, + "grad_norm": 1.9662036895751953, + "learning_rate": 4.992025721707261e-05, + "loss": 5.7141, + "step": 4277 + }, + { + "epoch": 0.025442477876106196, + "grad_norm": 2.582284450531006, + "learning_rate": 4.9920219934723316e-05, + "loss": 5.9514, + "step": 4278 + }, + { + "epoch": 0.02544842515938719, + "grad_norm": 1.9792051315307617, + "learning_rate": 4.992018264367464e-05, + "loss": 5.3867, + "step": 4279 + }, + { + "epoch": 0.02545437244266819, + "grad_norm": 2.0107717514038086, + "learning_rate": 4.992014534392658e-05, + "loss": 5.5985, + "step": 4280 + }, + { + "epoch": 0.025460319725949185, + "grad_norm": 2.2035727500915527, + "learning_rate": 4.9920108035479166e-05, + "loss": 5.6356, + "step": 4281 + }, + { + "epoch": 0.025466267009230183, + "grad_norm": 2.1973958015441895, + "learning_rate": 4.992007071833239e-05, + "loss": 5.3557, + "step": 4282 + }, + { + "epoch": 0.025472214292511182, + "grad_norm": 2.031371831893921, + "learning_rate": 4.9920033392486275e-05, + "loss": 5.484, + "step": 4283 + }, + { + "epoch": 0.025478161575792177, + "grad_norm": 1.9966185092926025, + "learning_rate": 4.991999605794084e-05, + "loss": 5.4137, + "step": 4284 + }, + { + "epoch": 0.025484108859073176, + "grad_norm": 1.699460506439209, + "learning_rate": 4.9919958714696085e-05, + "loss": 5.7099, + "step": 4285 + }, + { + "epoch": 0.025490056142354174, + "grad_norm": 2.270535945892334, + "learning_rate": 4.991992136275203e-05, + "loss": 5.6654, + "step": 4286 + }, + { + "epoch": 0.02549600342563517, + "grad_norm": 2.0636515617370605, + "learning_rate": 4.99198840021087e-05, + "loss": 5.6996, + "step": 4287 + }, + { + "epoch": 0.025501950708916168, + "grad_norm": 2.217365026473999, + "learning_rate": 4.991984663276608e-05, + "loss": 5.6148, + "step": 4288 + }, + { + "epoch": 0.025507897992197163, + "grad_norm": 2.182109832763672, + "learning_rate": 4.99198092547242e-05, + "loss": 5.6469, + "step": 4289 + }, + { + "epoch": 0.02551384527547816, + "grad_norm": 1.995924472808838, + "learning_rate": 4.9919771867983084e-05, + "loss": 5.7607, + "step": 4290 + }, + { + "epoch": 0.02551979255875916, + "grad_norm": 1.9308382272720337, + "learning_rate": 4.991973447254272e-05, + "loss": 5.7219, + "step": 4291 + }, + { + "epoch": 0.025525739842040155, + "grad_norm": 2.2675700187683105, + "learning_rate": 4.991969706840315e-05, + "loss": 5.7348, + "step": 4292 + }, + { + "epoch": 0.025531687125321154, + "grad_norm": 2.0441880226135254, + "learning_rate": 4.991965965556435e-05, + "loss": 5.5827, + "step": 4293 + }, + { + "epoch": 0.025537634408602152, + "grad_norm": 2.0111331939697266, + "learning_rate": 4.9919622234026376e-05, + "loss": 5.5355, + "step": 4294 + }, + { + "epoch": 0.025543581691883147, + "grad_norm": 2.214946985244751, + "learning_rate": 4.991958480378921e-05, + "loss": 5.5327, + "step": 4295 + }, + { + "epoch": 0.025549528975164146, + "grad_norm": 1.9673919677734375, + "learning_rate": 4.991954736485287e-05, + "loss": 5.5744, + "step": 4296 + }, + { + "epoch": 0.02555547625844514, + "grad_norm": 2.0662097930908203, + "learning_rate": 4.991950991721738e-05, + "loss": 5.5301, + "step": 4297 + }, + { + "epoch": 0.02556142354172614, + "grad_norm": 2.1912949085235596, + "learning_rate": 4.991947246088274e-05, + "loss": 5.6505, + "step": 4298 + }, + { + "epoch": 0.025567370825007138, + "grad_norm": 2.1073548793792725, + "learning_rate": 4.991943499584898e-05, + "loss": 5.7429, + "step": 4299 + }, + { + "epoch": 0.025573318108288133, + "grad_norm": 2.4015331268310547, + "learning_rate": 4.9919397522116096e-05, + "loss": 5.9959, + "step": 4300 + }, + { + "epoch": 0.02557926539156913, + "grad_norm": 2.5571470260620117, + "learning_rate": 4.99193600396841e-05, + "loss": 5.9058, + "step": 4301 + }, + { + "epoch": 0.02558521267485013, + "grad_norm": 2.148449182510376, + "learning_rate": 4.9919322548553026e-05, + "loss": 5.6298, + "step": 4302 + }, + { + "epoch": 0.025591159958131125, + "grad_norm": 2.3006222248077393, + "learning_rate": 4.991928504872287e-05, + "loss": 5.4854, + "step": 4303 + }, + { + "epoch": 0.025597107241412124, + "grad_norm": 2.2384679317474365, + "learning_rate": 4.9919247540193646e-05, + "loss": 5.7089, + "step": 4304 + }, + { + "epoch": 0.02560305452469312, + "grad_norm": 2.195736885070801, + "learning_rate": 4.9919210022965376e-05, + "loss": 5.986, + "step": 4305 + }, + { + "epoch": 0.025609001807974117, + "grad_norm": 2.3446342945098877, + "learning_rate": 4.991917249703806e-05, + "loss": 5.88, + "step": 4306 + }, + { + "epoch": 0.025614949091255116, + "grad_norm": 2.3800623416900635, + "learning_rate": 4.9919134962411724e-05, + "loss": 5.6897, + "step": 4307 + }, + { + "epoch": 0.02562089637453611, + "grad_norm": 1.8407396078109741, + "learning_rate": 4.991909741908637e-05, + "loss": 5.7359, + "step": 4308 + }, + { + "epoch": 0.02562684365781711, + "grad_norm": 2.3566956520080566, + "learning_rate": 4.9919059867062026e-05, + "loss": 5.5606, + "step": 4309 + }, + { + "epoch": 0.025632790941098105, + "grad_norm": 2.149317741394043, + "learning_rate": 4.991902230633869e-05, + "loss": 5.6966, + "step": 4310 + }, + { + "epoch": 0.025638738224379103, + "grad_norm": 2.3567728996276855, + "learning_rate": 4.991898473691638e-05, + "loss": 5.4694, + "step": 4311 + }, + { + "epoch": 0.025644685507660102, + "grad_norm": 1.9388068914413452, + "learning_rate": 4.9918947158795106e-05, + "loss": 5.5947, + "step": 4312 + }, + { + "epoch": 0.025650632790941097, + "grad_norm": 1.844419002532959, + "learning_rate": 4.9918909571974893e-05, + "loss": 5.6159, + "step": 4313 + }, + { + "epoch": 0.025656580074222095, + "grad_norm": 1.8664250373840332, + "learning_rate": 4.991887197645574e-05, + "loss": 5.7211, + "step": 4314 + }, + { + "epoch": 0.025662527357503094, + "grad_norm": 2.073004961013794, + "learning_rate": 4.991883437223767e-05, + "loss": 5.8873, + "step": 4315 + }, + { + "epoch": 0.02566847464078409, + "grad_norm": 2.316938877105713, + "learning_rate": 4.991879675932068e-05, + "loss": 5.4372, + "step": 4316 + }, + { + "epoch": 0.025674421924065088, + "grad_norm": 2.2646546363830566, + "learning_rate": 4.991875913770481e-05, + "loss": 5.5486, + "step": 4317 + }, + { + "epoch": 0.025680369207346083, + "grad_norm": 2.2417361736297607, + "learning_rate": 4.991872150739005e-05, + "loss": 5.2264, + "step": 4318 + }, + { + "epoch": 0.02568631649062708, + "grad_norm": 2.271566867828369, + "learning_rate": 4.9918683868376437e-05, + "loss": 5.1546, + "step": 4319 + }, + { + "epoch": 0.02569226377390808, + "grad_norm": 2.211650848388672, + "learning_rate": 4.9918646220663954e-05, + "loss": 5.382, + "step": 4320 + }, + { + "epoch": 0.025698211057189075, + "grad_norm": 2.3627288341522217, + "learning_rate": 4.991860856425263e-05, + "loss": 5.6099, + "step": 4321 + }, + { + "epoch": 0.025704158340470074, + "grad_norm": 2.3968141078948975, + "learning_rate": 4.991857089914249e-05, + "loss": 5.3689, + "step": 4322 + }, + { + "epoch": 0.025710105623751072, + "grad_norm": 2.3576786518096924, + "learning_rate": 4.991853322533352e-05, + "loss": 5.4441, + "step": 4323 + }, + { + "epoch": 0.025716052907032067, + "grad_norm": 2.0814530849456787, + "learning_rate": 4.991849554282575e-05, + "loss": 5.6137, + "step": 4324 + }, + { + "epoch": 0.025722000190313066, + "grad_norm": 2.103505849838257, + "learning_rate": 4.991845785161919e-05, + "loss": 5.5518, + "step": 4325 + }, + { + "epoch": 0.02572794747359406, + "grad_norm": 2.188350200653076, + "learning_rate": 4.991842015171386e-05, + "loss": 5.5958, + "step": 4326 + }, + { + "epoch": 0.02573389475687506, + "grad_norm": 2.124088764190674, + "learning_rate": 4.9918382443109766e-05, + "loss": 5.3851, + "step": 4327 + }, + { + "epoch": 0.025739842040156058, + "grad_norm": 2.181466579437256, + "learning_rate": 4.991834472580692e-05, + "loss": 5.4629, + "step": 4328 + }, + { + "epoch": 0.025745789323437053, + "grad_norm": 1.9634013175964355, + "learning_rate": 4.9918306999805344e-05, + "loss": 5.4768, + "step": 4329 + }, + { + "epoch": 0.02575173660671805, + "grad_norm": 2.2046115398406982, + "learning_rate": 4.991826926510503e-05, + "loss": 5.3977, + "step": 4330 + }, + { + "epoch": 0.02575768388999905, + "grad_norm": 1.8660465478897095, + "learning_rate": 4.9918231521706014e-05, + "loss": 5.4837, + "step": 4331 + }, + { + "epoch": 0.025763631173280045, + "grad_norm": 1.9825572967529297, + "learning_rate": 4.99181937696083e-05, + "loss": 5.5158, + "step": 4332 + }, + { + "epoch": 0.025769578456561044, + "grad_norm": 1.9114030599594116, + "learning_rate": 4.9918156008811906e-05, + "loss": 5.3291, + "step": 4333 + }, + { + "epoch": 0.02577552573984204, + "grad_norm": 2.008059024810791, + "learning_rate": 4.9918118239316835e-05, + "loss": 5.2993, + "step": 4334 + }, + { + "epoch": 0.025781473023123037, + "grad_norm": 2.0090153217315674, + "learning_rate": 4.991808046112311e-05, + "loss": 5.2951, + "step": 4335 + }, + { + "epoch": 0.025787420306404036, + "grad_norm": 2.013878345489502, + "learning_rate": 4.991804267423074e-05, + "loss": 5.3491, + "step": 4336 + }, + { + "epoch": 0.02579336758968503, + "grad_norm": 2.1889898777008057, + "learning_rate": 4.9918004878639734e-05, + "loss": 5.2744, + "step": 4337 + }, + { + "epoch": 0.02579931487296603, + "grad_norm": 1.9945006370544434, + "learning_rate": 4.991796707435012e-05, + "loss": 5.5176, + "step": 4338 + }, + { + "epoch": 0.025805262156247025, + "grad_norm": 2.1205811500549316, + "learning_rate": 4.9917929261361894e-05, + "loss": 5.6534, + "step": 4339 + }, + { + "epoch": 0.025811209439528023, + "grad_norm": 2.6607353687286377, + "learning_rate": 4.991789143967508e-05, + "loss": 6.343, + "step": 4340 + }, + { + "epoch": 0.025817156722809022, + "grad_norm": 2.241818904876709, + "learning_rate": 4.991785360928968e-05, + "loss": 5.6774, + "step": 4341 + }, + { + "epoch": 0.025823104006090017, + "grad_norm": 1.9817326068878174, + "learning_rate": 4.9917815770205723e-05, + "loss": 5.7686, + "step": 4342 + }, + { + "epoch": 0.025829051289371015, + "grad_norm": 2.323802947998047, + "learning_rate": 4.991777792242321e-05, + "loss": 5.9564, + "step": 4343 + }, + { + "epoch": 0.025834998572652014, + "grad_norm": 2.3318228721618652, + "learning_rate": 4.991774006594216e-05, + "loss": 5.9057, + "step": 4344 + }, + { + "epoch": 0.02584094585593301, + "grad_norm": 2.032776355743408, + "learning_rate": 4.991770220076258e-05, + "loss": 5.9753, + "step": 4345 + }, + { + "epoch": 0.025846893139214008, + "grad_norm": 2.116837739944458, + "learning_rate": 4.9917664326884495e-05, + "loss": 5.8458, + "step": 4346 + }, + { + "epoch": 0.025852840422495003, + "grad_norm": 2.312878370285034, + "learning_rate": 4.991762644430791e-05, + "loss": 5.5128, + "step": 4347 + }, + { + "epoch": 0.025858787705776, + "grad_norm": 2.3003859519958496, + "learning_rate": 4.991758855303283e-05, + "loss": 5.7192, + "step": 4348 + }, + { + "epoch": 0.025864734989057, + "grad_norm": 1.898258924484253, + "learning_rate": 4.9917550653059286e-05, + "loss": 5.6422, + "step": 4349 + }, + { + "epoch": 0.025870682272337995, + "grad_norm": 1.9477754831314087, + "learning_rate": 4.9917512744387276e-05, + "loss": 5.7885, + "step": 4350 + }, + { + "epoch": 0.025876629555618993, + "grad_norm": 2.479979991912842, + "learning_rate": 4.991747482701683e-05, + "loss": 5.4692, + "step": 4351 + }, + { + "epoch": 0.025882576838899992, + "grad_norm": 2.324336290359497, + "learning_rate": 4.991743690094794e-05, + "loss": 5.4186, + "step": 4352 + }, + { + "epoch": 0.025888524122180987, + "grad_norm": 2.076723337173462, + "learning_rate": 4.9917398966180625e-05, + "loss": 5.4363, + "step": 4353 + }, + { + "epoch": 0.025894471405461986, + "grad_norm": 1.9004534482955933, + "learning_rate": 4.991736102271492e-05, + "loss": 5.6451, + "step": 4354 + }, + { + "epoch": 0.02590041868874298, + "grad_norm": 1.8098558187484741, + "learning_rate": 4.991732307055082e-05, + "loss": 5.8666, + "step": 4355 + }, + { + "epoch": 0.02590636597202398, + "grad_norm": 2.1158571243286133, + "learning_rate": 4.991728510968833e-05, + "loss": 5.5421, + "step": 4356 + }, + { + "epoch": 0.025912313255304978, + "grad_norm": 2.1235690116882324, + "learning_rate": 4.991724714012748e-05, + "loss": 5.9947, + "step": 4357 + }, + { + "epoch": 0.025918260538585973, + "grad_norm": 2.1306662559509277, + "learning_rate": 4.9917209161868276e-05, + "loss": 5.4648, + "step": 4358 + }, + { + "epoch": 0.02592420782186697, + "grad_norm": 1.7927355766296387, + "learning_rate": 4.991717117491073e-05, + "loss": 5.4339, + "step": 4359 + }, + { + "epoch": 0.02593015510514797, + "grad_norm": 2.314069986343384, + "learning_rate": 4.991713317925485e-05, + "loss": 5.5534, + "step": 4360 + }, + { + "epoch": 0.025936102388428965, + "grad_norm": 2.2628493309020996, + "learning_rate": 4.9917095174900665e-05, + "loss": 5.5996, + "step": 4361 + }, + { + "epoch": 0.025942049671709964, + "grad_norm": 2.1669869422912598, + "learning_rate": 4.991705716184818e-05, + "loss": 5.704, + "step": 4362 + }, + { + "epoch": 0.02594799695499096, + "grad_norm": 2.2048137187957764, + "learning_rate": 4.99170191400974e-05, + "loss": 5.6576, + "step": 4363 + }, + { + "epoch": 0.025953944238271957, + "grad_norm": 2.172398328781128, + "learning_rate": 4.991698110964835e-05, + "loss": 5.7254, + "step": 4364 + }, + { + "epoch": 0.025959891521552956, + "grad_norm": 1.9689068794250488, + "learning_rate": 4.9916943070501047e-05, + "loss": 5.7303, + "step": 4365 + }, + { + "epoch": 0.02596583880483395, + "grad_norm": 1.7037044763565063, + "learning_rate": 4.991690502265549e-05, + "loss": 5.6542, + "step": 4366 + }, + { + "epoch": 0.02597178608811495, + "grad_norm": 1.7666655778884888, + "learning_rate": 4.9916866966111695e-05, + "loss": 5.7833, + "step": 4367 + }, + { + "epoch": 0.025977733371395945, + "grad_norm": 2.0178141593933105, + "learning_rate": 4.991682890086968e-05, + "loss": 5.7759, + "step": 4368 + }, + { + "epoch": 0.025983680654676943, + "grad_norm": 1.7989983558654785, + "learning_rate": 4.991679082692946e-05, + "loss": 5.8772, + "step": 4369 + }, + { + "epoch": 0.025989627937957942, + "grad_norm": 1.8004199266433716, + "learning_rate": 4.9916752744291054e-05, + "loss": 5.6145, + "step": 4370 + }, + { + "epoch": 0.025995575221238937, + "grad_norm": 1.837074637413025, + "learning_rate": 4.991671465295446e-05, + "loss": 5.4874, + "step": 4371 + }, + { + "epoch": 0.026001522504519935, + "grad_norm": 1.7436491250991821, + "learning_rate": 4.991667655291969e-05, + "loss": 5.7212, + "step": 4372 + }, + { + "epoch": 0.026007469787800934, + "grad_norm": 1.7802095413208008, + "learning_rate": 4.991663844418678e-05, + "loss": 5.7004, + "step": 4373 + }, + { + "epoch": 0.02601341707108193, + "grad_norm": 2.112487316131592, + "learning_rate": 4.991660032675572e-05, + "loss": 5.5579, + "step": 4374 + }, + { + "epoch": 0.026019364354362928, + "grad_norm": 2.0917413234710693, + "learning_rate": 4.9916562200626535e-05, + "loss": 5.7825, + "step": 4375 + }, + { + "epoch": 0.026025311637643923, + "grad_norm": 1.8323053121566772, + "learning_rate": 4.991652406579924e-05, + "loss": 5.7699, + "step": 4376 + }, + { + "epoch": 0.02603125892092492, + "grad_norm": 1.9480723142623901, + "learning_rate": 4.9916485922273835e-05, + "loss": 5.6591, + "step": 4377 + }, + { + "epoch": 0.02603720620420592, + "grad_norm": 2.000739812850952, + "learning_rate": 4.991644777005035e-05, + "loss": 5.8919, + "step": 4378 + }, + { + "epoch": 0.026043153487486915, + "grad_norm": 2.093573808670044, + "learning_rate": 4.991640960912879e-05, + "loss": 5.7357, + "step": 4379 + }, + { + "epoch": 0.026049100770767913, + "grad_norm": 1.932019591331482, + "learning_rate": 4.991637143950916e-05, + "loss": 5.7268, + "step": 4380 + }, + { + "epoch": 0.026055048054048912, + "grad_norm": 1.820102572441101, + "learning_rate": 4.991633326119149e-05, + "loss": 5.8733, + "step": 4381 + }, + { + "epoch": 0.026060995337329907, + "grad_norm": 1.9091769456863403, + "learning_rate": 4.991629507417578e-05, + "loss": 5.5532, + "step": 4382 + }, + { + "epoch": 0.026066942620610906, + "grad_norm": 2.0037779808044434, + "learning_rate": 4.991625687846205e-05, + "loss": 5.7841, + "step": 4383 + }, + { + "epoch": 0.0260728899038919, + "grad_norm": 1.7106568813323975, + "learning_rate": 4.991621867405032e-05, + "loss": 5.4486, + "step": 4384 + }, + { + "epoch": 0.0260788371871729, + "grad_norm": 1.7802643775939941, + "learning_rate": 4.9916180460940585e-05, + "loss": 5.7494, + "step": 4385 + }, + { + "epoch": 0.026084784470453898, + "grad_norm": 2.089503288269043, + "learning_rate": 4.991614223913288e-05, + "loss": 5.6044, + "step": 4386 + }, + { + "epoch": 0.026090731753734893, + "grad_norm": 2.3315577507019043, + "learning_rate": 4.99161040086272e-05, + "loss": 5.9552, + "step": 4387 + }, + { + "epoch": 0.02609667903701589, + "grad_norm": 2.1202025413513184, + "learning_rate": 4.9916065769423566e-05, + "loss": 5.778, + "step": 4388 + }, + { + "epoch": 0.02610262632029689, + "grad_norm": 2.3448777198791504, + "learning_rate": 4.991602752152199e-05, + "loss": 5.8014, + "step": 4389 + }, + { + "epoch": 0.026108573603577885, + "grad_norm": 2.1613330841064453, + "learning_rate": 4.9915989264922495e-05, + "loss": 5.731, + "step": 4390 + }, + { + "epoch": 0.026114520886858884, + "grad_norm": 2.0314743518829346, + "learning_rate": 4.991595099962507e-05, + "loss": 5.8181, + "step": 4391 + }, + { + "epoch": 0.02612046817013988, + "grad_norm": 2.053994655609131, + "learning_rate": 4.9915912725629755e-05, + "loss": 5.7264, + "step": 4392 + }, + { + "epoch": 0.026126415453420877, + "grad_norm": 1.8720483779907227, + "learning_rate": 4.991587444293655e-05, + "loss": 5.5229, + "step": 4393 + }, + { + "epoch": 0.026132362736701876, + "grad_norm": 1.8745067119598389, + "learning_rate": 4.991583615154547e-05, + "loss": 5.612, + "step": 4394 + }, + { + "epoch": 0.02613831001998287, + "grad_norm": 2.124157428741455, + "learning_rate": 4.9915797851456525e-05, + "loss": 5.7276, + "step": 4395 + }, + { + "epoch": 0.02614425730326387, + "grad_norm": 2.2587873935699463, + "learning_rate": 4.991575954266974e-05, + "loss": 5.7994, + "step": 4396 + }, + { + "epoch": 0.026150204586544865, + "grad_norm": 1.9030078649520874, + "learning_rate": 4.9915721225185116e-05, + "loss": 5.7491, + "step": 4397 + }, + { + "epoch": 0.026156151869825863, + "grad_norm": 2.2278738021850586, + "learning_rate": 4.991568289900267e-05, + "loss": 5.4701, + "step": 4398 + }, + { + "epoch": 0.02616209915310686, + "grad_norm": 2.190974473953247, + "learning_rate": 4.991564456412242e-05, + "loss": 5.6731, + "step": 4399 + }, + { + "epoch": 0.026168046436387857, + "grad_norm": 2.3491454124450684, + "learning_rate": 4.991560622054438e-05, + "loss": 5.4041, + "step": 4400 + }, + { + "epoch": 0.026173993719668855, + "grad_norm": 2.2767796516418457, + "learning_rate": 4.991556786826854e-05, + "loss": 5.9005, + "step": 4401 + }, + { + "epoch": 0.026179941002949854, + "grad_norm": 2.3645145893096924, + "learning_rate": 4.991552950729496e-05, + "loss": 6.3108, + "step": 4402 + }, + { + "epoch": 0.02618588828623085, + "grad_norm": 2.1715476512908936, + "learning_rate": 4.9915491137623605e-05, + "loss": 5.8186, + "step": 4403 + }, + { + "epoch": 0.026191835569511848, + "grad_norm": 2.195758581161499, + "learning_rate": 4.991545275925452e-05, + "loss": 5.692, + "step": 4404 + }, + { + "epoch": 0.026197782852792843, + "grad_norm": 2.1124489307403564, + "learning_rate": 4.9915414372187705e-05, + "loss": 5.6582, + "step": 4405 + }, + { + "epoch": 0.02620373013607384, + "grad_norm": 1.9873831272125244, + "learning_rate": 4.991537597642317e-05, + "loss": 5.6309, + "step": 4406 + }, + { + "epoch": 0.02620967741935484, + "grad_norm": 1.9675770998001099, + "learning_rate": 4.991533757196094e-05, + "loss": 5.7095, + "step": 4407 + }, + { + "epoch": 0.026215624702635835, + "grad_norm": 1.9072648286819458, + "learning_rate": 4.991529915880103e-05, + "loss": 5.6449, + "step": 4408 + }, + { + "epoch": 0.026221571985916833, + "grad_norm": 2.3060495853424072, + "learning_rate": 4.9915260736943435e-05, + "loss": 5.6712, + "step": 4409 + }, + { + "epoch": 0.026227519269197832, + "grad_norm": 2.4438107013702393, + "learning_rate": 4.991522230638819e-05, + "loss": 5.2384, + "step": 4410 + }, + { + "epoch": 0.026233466552478827, + "grad_norm": 1.8102613687515259, + "learning_rate": 4.991518386713529e-05, + "loss": 5.5508, + "step": 4411 + }, + { + "epoch": 0.026239413835759826, + "grad_norm": 2.0226693153381348, + "learning_rate": 4.991514541918476e-05, + "loss": 5.4049, + "step": 4412 + }, + { + "epoch": 0.02624536111904082, + "grad_norm": 2.261418104171753, + "learning_rate": 4.991510696253661e-05, + "loss": 5.3324, + "step": 4413 + }, + { + "epoch": 0.02625130840232182, + "grad_norm": 2.232844352722168, + "learning_rate": 4.9915068497190856e-05, + "loss": 5.2601, + "step": 4414 + }, + { + "epoch": 0.026257255685602818, + "grad_norm": 2.2306487560272217, + "learning_rate": 4.99150300231475e-05, + "loss": 5.3329, + "step": 4415 + }, + { + "epoch": 0.026263202968883813, + "grad_norm": 2.1368730068206787, + "learning_rate": 4.9914991540406574e-05, + "loss": 5.573, + "step": 4416 + }, + { + "epoch": 0.02626915025216481, + "grad_norm": 1.984078288078308, + "learning_rate": 4.991495304896808e-05, + "loss": 5.6518, + "step": 4417 + }, + { + "epoch": 0.02627509753544581, + "grad_norm": 2.0585875511169434, + "learning_rate": 4.9914914548832034e-05, + "loss": 5.7076, + "step": 4418 + }, + { + "epoch": 0.026281044818726805, + "grad_norm": 1.9880858659744263, + "learning_rate": 4.991487603999845e-05, + "loss": 5.6533, + "step": 4419 + }, + { + "epoch": 0.026286992102007804, + "grad_norm": 2.0475687980651855, + "learning_rate": 4.991483752246734e-05, + "loss": 5.6311, + "step": 4420 + }, + { + "epoch": 0.0262929393852888, + "grad_norm": 2.2796714305877686, + "learning_rate": 4.991479899623871e-05, + "loss": 5.364, + "step": 4421 + }, + { + "epoch": 0.026298886668569797, + "grad_norm": 1.8535730838775635, + "learning_rate": 4.991476046131259e-05, + "loss": 5.6153, + "step": 4422 + }, + { + "epoch": 0.026304833951850796, + "grad_norm": 1.97511887550354, + "learning_rate": 4.9914721917688976e-05, + "loss": 5.5682, + "step": 4423 + }, + { + "epoch": 0.02631078123513179, + "grad_norm": 1.9052705764770508, + "learning_rate": 4.99146833653679e-05, + "loss": 5.5609, + "step": 4424 + }, + { + "epoch": 0.02631672851841279, + "grad_norm": 1.9997434616088867, + "learning_rate": 4.9914644804349356e-05, + "loss": 5.6196, + "step": 4425 + }, + { + "epoch": 0.026322675801693788, + "grad_norm": 1.6116957664489746, + "learning_rate": 4.991460623463337e-05, + "loss": 5.5003, + "step": 4426 + }, + { + "epoch": 0.026328623084974783, + "grad_norm": 1.8156583309173584, + "learning_rate": 4.991456765621996e-05, + "loss": 5.5875, + "step": 4427 + }, + { + "epoch": 0.02633457036825578, + "grad_norm": 2.0364272594451904, + "learning_rate": 4.991452906910912e-05, + "loss": 5.6541, + "step": 4428 + }, + { + "epoch": 0.026340517651536777, + "grad_norm": 1.8430767059326172, + "learning_rate": 4.991449047330088e-05, + "loss": 5.5408, + "step": 4429 + }, + { + "epoch": 0.026346464934817775, + "grad_norm": 2.049476385116577, + "learning_rate": 4.991445186879525e-05, + "loss": 5.5644, + "step": 4430 + }, + { + "epoch": 0.026352412218098774, + "grad_norm": 1.9186240434646606, + "learning_rate": 4.991441325559224e-05, + "loss": 5.5977, + "step": 4431 + }, + { + "epoch": 0.02635835950137977, + "grad_norm": 1.80244779586792, + "learning_rate": 4.991437463369186e-05, + "loss": 5.5114, + "step": 4432 + }, + { + "epoch": 0.026364306784660767, + "grad_norm": 2.2580177783966064, + "learning_rate": 4.991433600309414e-05, + "loss": 5.4132, + "step": 4433 + }, + { + "epoch": 0.026370254067941763, + "grad_norm": 2.0970637798309326, + "learning_rate": 4.991429736379908e-05, + "loss": 5.6211, + "step": 4434 + }, + { + "epoch": 0.02637620135122276, + "grad_norm": 2.0690932273864746, + "learning_rate": 4.9914258715806696e-05, + "loss": 5.6511, + "step": 4435 + }, + { + "epoch": 0.02638214863450376, + "grad_norm": 2.063052177429199, + "learning_rate": 4.9914220059117e-05, + "loss": 5.5169, + "step": 4436 + }, + { + "epoch": 0.026388095917784755, + "grad_norm": 1.990708827972412, + "learning_rate": 4.991418139373001e-05, + "loss": 5.5018, + "step": 4437 + }, + { + "epoch": 0.026394043201065753, + "grad_norm": 2.1311633586883545, + "learning_rate": 4.9914142719645736e-05, + "loss": 5.4714, + "step": 4438 + }, + { + "epoch": 0.026399990484346752, + "grad_norm": 1.7688508033752441, + "learning_rate": 4.991410403686419e-05, + "loss": 5.5208, + "step": 4439 + }, + { + "epoch": 0.026405937767627747, + "grad_norm": 2.3486130237579346, + "learning_rate": 4.9914065345385383e-05, + "loss": 5.4524, + "step": 4440 + }, + { + "epoch": 0.026411885050908745, + "grad_norm": 2.0333707332611084, + "learning_rate": 4.9914026645209344e-05, + "loss": 5.6747, + "step": 4441 + }, + { + "epoch": 0.02641783233418974, + "grad_norm": 1.8731845617294312, + "learning_rate": 4.991398793633607e-05, + "loss": 5.6436, + "step": 4442 + }, + { + "epoch": 0.02642377961747074, + "grad_norm": 2.003361225128174, + "learning_rate": 4.991394921876558e-05, + "loss": 5.4628, + "step": 4443 + }, + { + "epoch": 0.026429726900751738, + "grad_norm": 2.1195411682128906, + "learning_rate": 4.991391049249789e-05, + "loss": 5.4096, + "step": 4444 + }, + { + "epoch": 0.026435674184032733, + "grad_norm": 1.857364535331726, + "learning_rate": 4.991387175753301e-05, + "loss": 5.3928, + "step": 4445 + }, + { + "epoch": 0.02644162146731373, + "grad_norm": 1.8932915925979614, + "learning_rate": 4.991383301387095e-05, + "loss": 5.4917, + "step": 4446 + }, + { + "epoch": 0.02644756875059473, + "grad_norm": 1.8743010759353638, + "learning_rate": 4.991379426151174e-05, + "loss": 5.6766, + "step": 4447 + }, + { + "epoch": 0.026453516033875725, + "grad_norm": 1.910796046257019, + "learning_rate": 4.991375550045537e-05, + "loss": 5.4347, + "step": 4448 + }, + { + "epoch": 0.026459463317156724, + "grad_norm": 1.7901744842529297, + "learning_rate": 4.991371673070187e-05, + "loss": 5.5339, + "step": 4449 + }, + { + "epoch": 0.02646541060043772, + "grad_norm": 1.86943519115448, + "learning_rate": 4.9913677952251244e-05, + "loss": 5.4867, + "step": 4450 + }, + { + "epoch": 0.026471357883718717, + "grad_norm": 1.8662208318710327, + "learning_rate": 4.991363916510352e-05, + "loss": 5.4992, + "step": 4451 + }, + { + "epoch": 0.026477305166999716, + "grad_norm": 1.7465355396270752, + "learning_rate": 4.99136003692587e-05, + "loss": 5.5243, + "step": 4452 + }, + { + "epoch": 0.02648325245028071, + "grad_norm": 1.9097687005996704, + "learning_rate": 4.9913561564716794e-05, + "loss": 5.5096, + "step": 4453 + }, + { + "epoch": 0.02648919973356171, + "grad_norm": 2.1472127437591553, + "learning_rate": 4.991352275147783e-05, + "loss": 5.4462, + "step": 4454 + }, + { + "epoch": 0.026495147016842708, + "grad_norm": 2.3966939449310303, + "learning_rate": 4.9913483929541806e-05, + "loss": 5.2938, + "step": 4455 + }, + { + "epoch": 0.026501094300123703, + "grad_norm": 2.1738977432250977, + "learning_rate": 4.991344509890874e-05, + "loss": 5.317, + "step": 4456 + }, + { + "epoch": 0.0265070415834047, + "grad_norm": 1.963944435119629, + "learning_rate": 4.9913406259578646e-05, + "loss": 5.3827, + "step": 4457 + }, + { + "epoch": 0.026512988866685697, + "grad_norm": 2.1755871772766113, + "learning_rate": 4.991336741155155e-05, + "loss": 5.2941, + "step": 4458 + }, + { + "epoch": 0.026518936149966695, + "grad_norm": 2.2461934089660645, + "learning_rate": 4.991332855482744e-05, + "loss": 5.3503, + "step": 4459 + }, + { + "epoch": 0.026524883433247694, + "grad_norm": 2.2270491123199463, + "learning_rate": 4.9913289689406355e-05, + "loss": 5.417, + "step": 4460 + }, + { + "epoch": 0.02653083071652869, + "grad_norm": 2.437074661254883, + "learning_rate": 4.991325081528829e-05, + "loss": 5.1938, + "step": 4461 + }, + { + "epoch": 0.026536777999809687, + "grad_norm": 2.159170150756836, + "learning_rate": 4.991321193247328e-05, + "loss": 5.2088, + "step": 4462 + }, + { + "epoch": 0.026542725283090682, + "grad_norm": 2.08797287940979, + "learning_rate": 4.9913173040961315e-05, + "loss": 5.1829, + "step": 4463 + }, + { + "epoch": 0.02654867256637168, + "grad_norm": 2.805191993713379, + "learning_rate": 4.991313414075242e-05, + "loss": 6.3049, + "step": 4464 + }, + { + "epoch": 0.02655461984965268, + "grad_norm": 2.3204843997955322, + "learning_rate": 4.991309523184661e-05, + "loss": 5.3831, + "step": 4465 + }, + { + "epoch": 0.026560567132933675, + "grad_norm": 2.217212200164795, + "learning_rate": 4.991305631424389e-05, + "loss": 5.4647, + "step": 4466 + }, + { + "epoch": 0.026566514416214673, + "grad_norm": 2.1094207763671875, + "learning_rate": 4.991301738794429e-05, + "loss": 5.5837, + "step": 4467 + }, + { + "epoch": 0.026572461699495672, + "grad_norm": 2.225660562515259, + "learning_rate": 4.99129784529478e-05, + "loss": 5.8316, + "step": 4468 + }, + { + "epoch": 0.026578408982776667, + "grad_norm": 2.361238956451416, + "learning_rate": 4.991293950925446e-05, + "loss": 5.8358, + "step": 4469 + }, + { + "epoch": 0.026584356266057665, + "grad_norm": 2.3268609046936035, + "learning_rate": 4.991290055686426e-05, + "loss": 5.732, + "step": 4470 + }, + { + "epoch": 0.02659030354933866, + "grad_norm": 2.1456172466278076, + "learning_rate": 4.9912861595777226e-05, + "loss": 5.9, + "step": 4471 + }, + { + "epoch": 0.02659625083261966, + "grad_norm": 2.114696979522705, + "learning_rate": 4.991282262599337e-05, + "loss": 5.4464, + "step": 4472 + }, + { + "epoch": 0.026602198115900658, + "grad_norm": 1.7981528043746948, + "learning_rate": 4.9912783647512705e-05, + "loss": 5.5053, + "step": 4473 + }, + { + "epoch": 0.026608145399181653, + "grad_norm": 1.9743404388427734, + "learning_rate": 4.9912744660335245e-05, + "loss": 5.5877, + "step": 4474 + }, + { + "epoch": 0.02661409268246265, + "grad_norm": 2.052358865737915, + "learning_rate": 4.991270566446101e-05, + "loss": 5.5891, + "step": 4475 + }, + { + "epoch": 0.02662003996574365, + "grad_norm": 2.1602041721343994, + "learning_rate": 4.991266665989e-05, + "loss": 5.581, + "step": 4476 + }, + { + "epoch": 0.026625987249024645, + "grad_norm": 2.241586685180664, + "learning_rate": 4.9912627646622236e-05, + "loss": 5.5375, + "step": 4477 + }, + { + "epoch": 0.026631934532305643, + "grad_norm": 1.7952601909637451, + "learning_rate": 4.991258862465773e-05, + "loss": 5.5273, + "step": 4478 + }, + { + "epoch": 0.02663788181558664, + "grad_norm": 1.9767752885818481, + "learning_rate": 4.991254959399649e-05, + "loss": 5.4476, + "step": 4479 + }, + { + "epoch": 0.026643829098867637, + "grad_norm": 1.7997682094573975, + "learning_rate": 4.991251055463855e-05, + "loss": 5.5666, + "step": 4480 + }, + { + "epoch": 0.026649776382148636, + "grad_norm": 2.3247575759887695, + "learning_rate": 4.9912471506583905e-05, + "loss": 5.5247, + "step": 4481 + }, + { + "epoch": 0.02665572366542963, + "grad_norm": 2.165900230407715, + "learning_rate": 4.991243244983257e-05, + "loss": 5.6807, + "step": 4482 + }, + { + "epoch": 0.02666167094871063, + "grad_norm": 2.598257303237915, + "learning_rate": 4.991239338438456e-05, + "loss": 5.6609, + "step": 4483 + }, + { + "epoch": 0.026667618231991628, + "grad_norm": 2.2752041816711426, + "learning_rate": 4.991235431023989e-05, + "loss": 5.5199, + "step": 4484 + }, + { + "epoch": 0.026673565515272623, + "grad_norm": 2.3482842445373535, + "learning_rate": 4.9912315227398586e-05, + "loss": 5.6438, + "step": 4485 + }, + { + "epoch": 0.02667951279855362, + "grad_norm": 2.034403085708618, + "learning_rate": 4.991227613586065e-05, + "loss": 5.6191, + "step": 4486 + }, + { + "epoch": 0.026685460081834617, + "grad_norm": 1.9002971649169922, + "learning_rate": 4.9912237035626085e-05, + "loss": 5.6627, + "step": 4487 + }, + { + "epoch": 0.026691407365115615, + "grad_norm": 2.0305564403533936, + "learning_rate": 4.9912197926694924e-05, + "loss": 5.7009, + "step": 4488 + }, + { + "epoch": 0.026697354648396614, + "grad_norm": 2.029777765274048, + "learning_rate": 4.991215880906717e-05, + "loss": 5.5201, + "step": 4489 + }, + { + "epoch": 0.02670330193167761, + "grad_norm": 1.8889492750167847, + "learning_rate": 4.991211968274283e-05, + "loss": 5.602, + "step": 4490 + }, + { + "epoch": 0.026709249214958607, + "grad_norm": 1.9616930484771729, + "learning_rate": 4.9912080547721934e-05, + "loss": 5.5352, + "step": 4491 + }, + { + "epoch": 0.026715196498239602, + "grad_norm": 2.449345827102661, + "learning_rate": 4.9912041404004485e-05, + "loss": 5.7103, + "step": 4492 + }, + { + "epoch": 0.0267211437815206, + "grad_norm": 2.5550389289855957, + "learning_rate": 4.991200225159051e-05, + "loss": 5.5593, + "step": 4493 + }, + { + "epoch": 0.0267270910648016, + "grad_norm": 2.2512362003326416, + "learning_rate": 4.9911963090479996e-05, + "loss": 5.6329, + "step": 4494 + }, + { + "epoch": 0.026733038348082595, + "grad_norm": 2.0346968173980713, + "learning_rate": 4.9911923920672984e-05, + "loss": 5.5966, + "step": 4495 + }, + { + "epoch": 0.026738985631363593, + "grad_norm": 2.013648271560669, + "learning_rate": 4.991188474216947e-05, + "loss": 5.6532, + "step": 4496 + }, + { + "epoch": 0.026744932914644592, + "grad_norm": 1.8361715078353882, + "learning_rate": 4.9911845554969484e-05, + "loss": 5.519, + "step": 4497 + }, + { + "epoch": 0.026750880197925587, + "grad_norm": 2.1487016677856445, + "learning_rate": 4.991180635907302e-05, + "loss": 5.436, + "step": 4498 + }, + { + "epoch": 0.026756827481206585, + "grad_norm": 2.277714967727661, + "learning_rate": 4.991176715448011e-05, + "loss": 5.3574, + "step": 4499 + }, + { + "epoch": 0.02676277476448758, + "grad_norm": 2.3313565254211426, + "learning_rate": 4.9911727941190755e-05, + "loss": 5.5408, + "step": 4500 + }, + { + "epoch": 0.02676872204776858, + "grad_norm": 2.105825662612915, + "learning_rate": 4.9911688719204975e-05, + "loss": 5.4801, + "step": 4501 + }, + { + "epoch": 0.026774669331049578, + "grad_norm": 2.122138261795044, + "learning_rate": 4.991164948852278e-05, + "loss": 5.4645, + "step": 4502 + }, + { + "epoch": 0.026780616614330573, + "grad_norm": 1.8742777109146118, + "learning_rate": 4.991161024914419e-05, + "loss": 5.5646, + "step": 4503 + }, + { + "epoch": 0.02678656389761157, + "grad_norm": 1.762276291847229, + "learning_rate": 4.991157100106921e-05, + "loss": 5.5672, + "step": 4504 + }, + { + "epoch": 0.02679251118089257, + "grad_norm": 1.9174740314483643, + "learning_rate": 4.9911531744297855e-05, + "loss": 5.4296, + "step": 4505 + }, + { + "epoch": 0.026798458464173565, + "grad_norm": 2.0585875511169434, + "learning_rate": 4.991149247883015e-05, + "loss": 5.5685, + "step": 4506 + }, + { + "epoch": 0.026804405747454563, + "grad_norm": 1.8675988912582397, + "learning_rate": 4.9911453204666094e-05, + "loss": 5.4757, + "step": 4507 + }, + { + "epoch": 0.02681035303073556, + "grad_norm": 2.3117783069610596, + "learning_rate": 4.99114139218057e-05, + "loss": 5.7057, + "step": 4508 + }, + { + "epoch": 0.026816300314016557, + "grad_norm": 2.5439465045928955, + "learning_rate": 4.9911374630249007e-05, + "loss": 5.7393, + "step": 4509 + }, + { + "epoch": 0.026822247597297556, + "grad_norm": 2.4611666202545166, + "learning_rate": 4.9911335329996e-05, + "loss": 5.7215, + "step": 4510 + }, + { + "epoch": 0.02682819488057855, + "grad_norm": 2.1540768146514893, + "learning_rate": 4.99112960210467e-05, + "loss": 5.7059, + "step": 4511 + }, + { + "epoch": 0.02683414216385955, + "grad_norm": 2.1183645725250244, + "learning_rate": 4.9911256703401134e-05, + "loss": 5.4454, + "step": 4512 + }, + { + "epoch": 0.026840089447140548, + "grad_norm": 2.1757540702819824, + "learning_rate": 4.9911217377059295e-05, + "loss": 5.6851, + "step": 4513 + }, + { + "epoch": 0.026846036730421543, + "grad_norm": 2.2770378589630127, + "learning_rate": 4.9911178042021214e-05, + "loss": 5.5957, + "step": 4514 + }, + { + "epoch": 0.02685198401370254, + "grad_norm": 2.320993185043335, + "learning_rate": 4.9911138698286895e-05, + "loss": 5.4674, + "step": 4515 + }, + { + "epoch": 0.026857931296983537, + "grad_norm": 2.2340428829193115, + "learning_rate": 4.991109934585636e-05, + "loss": 5.4514, + "step": 4516 + }, + { + "epoch": 0.026863878580264535, + "grad_norm": 2.1531431674957275, + "learning_rate": 4.991105998472962e-05, + "loss": 5.4386, + "step": 4517 + }, + { + "epoch": 0.026869825863545534, + "grad_norm": 2.1567044258117676, + "learning_rate": 4.991102061490667e-05, + "loss": 5.422, + "step": 4518 + }, + { + "epoch": 0.02687577314682653, + "grad_norm": 2.1181681156158447, + "learning_rate": 4.9910981236387554e-05, + "loss": 5.7214, + "step": 4519 + }, + { + "epoch": 0.026881720430107527, + "grad_norm": 2.3410873413085938, + "learning_rate": 4.9910941849172263e-05, + "loss": 5.8603, + "step": 4520 + }, + { + "epoch": 0.026887667713388526, + "grad_norm": 2.4943840503692627, + "learning_rate": 4.9910902453260824e-05, + "loss": 5.7084, + "step": 4521 + }, + { + "epoch": 0.02689361499666952, + "grad_norm": 2.1420044898986816, + "learning_rate": 4.991086304865325e-05, + "loss": 5.528, + "step": 4522 + }, + { + "epoch": 0.02689956227995052, + "grad_norm": 2.3257980346679688, + "learning_rate": 4.991082363534955e-05, + "loss": 5.6791, + "step": 4523 + }, + { + "epoch": 0.026905509563231515, + "grad_norm": 2.335049867630005, + "learning_rate": 4.991078421334974e-05, + "loss": 5.6184, + "step": 4524 + }, + { + "epoch": 0.026911456846512513, + "grad_norm": 3.7381551265716553, + "learning_rate": 4.9910744782653825e-05, + "loss": 5.954, + "step": 4525 + }, + { + "epoch": 0.02691740412979351, + "grad_norm": 3.1807587146759033, + "learning_rate": 4.991070534326183e-05, + "loss": 6.5662, + "step": 4526 + }, + { + "epoch": 0.026923351413074507, + "grad_norm": 2.378366708755493, + "learning_rate": 4.991066589517376e-05, + "loss": 6.2312, + "step": 4527 + }, + { + "epoch": 0.026929298696355505, + "grad_norm": 2.5797109603881836, + "learning_rate": 4.991062643838964e-05, + "loss": 5.9969, + "step": 4528 + }, + { + "epoch": 0.0269352459796365, + "grad_norm": 2.522815704345703, + "learning_rate": 4.991058697290948e-05, + "loss": 5.919, + "step": 4529 + }, + { + "epoch": 0.0269411932629175, + "grad_norm": 2.5215437412261963, + "learning_rate": 4.991054749873329e-05, + "loss": 5.8812, + "step": 4530 + }, + { + "epoch": 0.026947140546198498, + "grad_norm": 2.1608335971832275, + "learning_rate": 4.991050801586108e-05, + "loss": 5.8381, + "step": 4531 + }, + { + "epoch": 0.026953087829479493, + "grad_norm": 2.37752366065979, + "learning_rate": 4.991046852429288e-05, + "loss": 5.7612, + "step": 4532 + }, + { + "epoch": 0.02695903511276049, + "grad_norm": 2.117534875869751, + "learning_rate": 4.991042902402868e-05, + "loss": 5.6762, + "step": 4533 + }, + { + "epoch": 0.02696498239604149, + "grad_norm": 2.595797061920166, + "learning_rate": 4.991038951506851e-05, + "loss": 6.19, + "step": 4534 + }, + { + "epoch": 0.026970929679322485, + "grad_norm": 2.2216086387634277, + "learning_rate": 4.991034999741239e-05, + "loss": 6.1612, + "step": 4535 + }, + { + "epoch": 0.026976876962603483, + "grad_norm": 2.829735279083252, + "learning_rate": 4.991031047106032e-05, + "loss": 5.6955, + "step": 4536 + }, + { + "epoch": 0.02698282424588448, + "grad_norm": 2.5018115043640137, + "learning_rate": 4.991027093601231e-05, + "loss": 5.4966, + "step": 4537 + }, + { + "epoch": 0.026988771529165477, + "grad_norm": 2.334052085876465, + "learning_rate": 4.9910231392268385e-05, + "loss": 6.1603, + "step": 4538 + }, + { + "epoch": 0.026994718812446476, + "grad_norm": 2.497351884841919, + "learning_rate": 4.991019183982856e-05, + "loss": 6.0128, + "step": 4539 + }, + { + "epoch": 0.02700066609572747, + "grad_norm": 2.2976267337799072, + "learning_rate": 4.991015227869284e-05, + "loss": 5.6696, + "step": 4540 + }, + { + "epoch": 0.02700661337900847, + "grad_norm": 2.6851742267608643, + "learning_rate": 4.991011270886125e-05, + "loss": 5.7996, + "step": 4541 + }, + { + "epoch": 0.027012560662289468, + "grad_norm": 2.531029224395752, + "learning_rate": 4.991007313033379e-05, + "loss": 5.6671, + "step": 4542 + }, + { + "epoch": 0.027018507945570463, + "grad_norm": 2.195552110671997, + "learning_rate": 4.991003354311048e-05, + "loss": 6.3213, + "step": 4543 + }, + { + "epoch": 0.02702445522885146, + "grad_norm": 2.2973361015319824, + "learning_rate": 4.9909993947191336e-05, + "loss": 6.1523, + "step": 4544 + }, + { + "epoch": 0.027030402512132456, + "grad_norm": 2.4766385555267334, + "learning_rate": 4.990995434257637e-05, + "loss": 5.7894, + "step": 4545 + }, + { + "epoch": 0.027036349795413455, + "grad_norm": 2.486384630203247, + "learning_rate": 4.9909914729265606e-05, + "loss": 6.2814, + "step": 4546 + }, + { + "epoch": 0.027042297078694454, + "grad_norm": 2.5054233074188232, + "learning_rate": 4.9909875107259036e-05, + "loss": 6.2859, + "step": 4547 + }, + { + "epoch": 0.02704824436197545, + "grad_norm": 2.70576548576355, + "learning_rate": 4.990983547655669e-05, + "loss": 6.2424, + "step": 4548 + }, + { + "epoch": 0.027054191645256447, + "grad_norm": 3.0937716960906982, + "learning_rate": 4.990979583715858e-05, + "loss": 6.4392, + "step": 4549 + }, + { + "epoch": 0.027060138928537446, + "grad_norm": 2.6290581226348877, + "learning_rate": 4.9909756189064714e-05, + "loss": 6.3565, + "step": 4550 + }, + { + "epoch": 0.02706608621181844, + "grad_norm": 2.5180583000183105, + "learning_rate": 4.990971653227511e-05, + "loss": 6.1482, + "step": 4551 + }, + { + "epoch": 0.02707203349509944, + "grad_norm": 2.6096208095550537, + "learning_rate": 4.990967686678978e-05, + "loss": 5.7724, + "step": 4552 + }, + { + "epoch": 0.027077980778380435, + "grad_norm": 3.187276840209961, + "learning_rate": 4.990963719260874e-05, + "loss": 5.682, + "step": 4553 + }, + { + "epoch": 0.027083928061661433, + "grad_norm": 2.3522419929504395, + "learning_rate": 4.9909597509732006e-05, + "loss": 6.7045, + "step": 4554 + }, + { + "epoch": 0.02708987534494243, + "grad_norm": 2.6016366481781006, + "learning_rate": 4.990955781815959e-05, + "loss": 6.0653, + "step": 4555 + }, + { + "epoch": 0.027095822628223427, + "grad_norm": 2.5409183502197266, + "learning_rate": 4.99095181178915e-05, + "loss": 5.861, + "step": 4556 + }, + { + "epoch": 0.027101769911504425, + "grad_norm": 2.5297863483428955, + "learning_rate": 4.9909478408927754e-05, + "loss": 5.5301, + "step": 4557 + }, + { + "epoch": 0.02710771719478542, + "grad_norm": 2.4822275638580322, + "learning_rate": 4.990943869126837e-05, + "loss": 5.6919, + "step": 4558 + }, + { + "epoch": 0.02711366447806642, + "grad_norm": 2.3832650184631348, + "learning_rate": 4.9909398964913365e-05, + "loss": 5.9589, + "step": 4559 + }, + { + "epoch": 0.027119611761347417, + "grad_norm": 2.0038483142852783, + "learning_rate": 4.9909359229862734e-05, + "loss": 6.1847, + "step": 4560 + }, + { + "epoch": 0.027125559044628413, + "grad_norm": 2.3678700923919678, + "learning_rate": 4.990931948611651e-05, + "loss": 6.4794, + "step": 4561 + }, + { + "epoch": 0.02713150632790941, + "grad_norm": 2.7433204650878906, + "learning_rate": 4.990927973367469e-05, + "loss": 6.6997, + "step": 4562 + }, + { + "epoch": 0.02713745361119041, + "grad_norm": 3.5579798221588135, + "learning_rate": 4.990923997253731e-05, + "loss": 6.1809, + "step": 4563 + }, + { + "epoch": 0.027143400894471405, + "grad_norm": 3.254093647003174, + "learning_rate": 4.990920020270436e-05, + "loss": 6.1446, + "step": 4564 + }, + { + "epoch": 0.027149348177752403, + "grad_norm": 3.0661215782165527, + "learning_rate": 4.990916042417588e-05, + "loss": 6.6702, + "step": 4565 + }, + { + "epoch": 0.0271552954610334, + "grad_norm": 2.641291618347168, + "learning_rate": 4.9909120636951864e-05, + "loss": 6.4951, + "step": 4566 + }, + { + "epoch": 0.027161242744314397, + "grad_norm": 2.050675868988037, + "learning_rate": 4.990908084103233e-05, + "loss": 6.3365, + "step": 4567 + }, + { + "epoch": 0.027167190027595396, + "grad_norm": 2.081108331680298, + "learning_rate": 4.990904103641729e-05, + "loss": 6.1874, + "step": 4568 + }, + { + "epoch": 0.02717313731087639, + "grad_norm": 2.5833899974823, + "learning_rate": 4.9909001223106766e-05, + "loss": 6.0892, + "step": 4569 + }, + { + "epoch": 0.02717908459415739, + "grad_norm": 2.7387397289276123, + "learning_rate": 4.990896140110076e-05, + "loss": 6.1036, + "step": 4570 + }, + { + "epoch": 0.027185031877438388, + "grad_norm": 2.5665578842163086, + "learning_rate": 4.99089215703993e-05, + "loss": 5.9577, + "step": 4571 + }, + { + "epoch": 0.027190979160719383, + "grad_norm": 2.3825178146362305, + "learning_rate": 4.990888173100239e-05, + "loss": 5.9654, + "step": 4572 + }, + { + "epoch": 0.02719692644400038, + "grad_norm": 2.562509059906006, + "learning_rate": 4.990884188291005e-05, + "loss": 6.009, + "step": 4573 + }, + { + "epoch": 0.027202873727281376, + "grad_norm": 2.141941785812378, + "learning_rate": 4.9908802026122284e-05, + "loss": 5.8315, + "step": 4574 + }, + { + "epoch": 0.027208821010562375, + "grad_norm": 2.5348474979400635, + "learning_rate": 4.990876216063912e-05, + "loss": 6.3763, + "step": 4575 + }, + { + "epoch": 0.027214768293843374, + "grad_norm": 2.751520872116089, + "learning_rate": 4.990872228646056e-05, + "loss": 6.5684, + "step": 4576 + }, + { + "epoch": 0.02722071557712437, + "grad_norm": 4.626354694366455, + "learning_rate": 4.990868240358662e-05, + "loss": 6.115, + "step": 4577 + }, + { + "epoch": 0.027226662860405367, + "grad_norm": 2.648479700088501, + "learning_rate": 4.990864251201732e-05, + "loss": 6.0879, + "step": 4578 + }, + { + "epoch": 0.027232610143686366, + "grad_norm": 2.21056866645813, + "learning_rate": 4.990860261175268e-05, + "loss": 6.2923, + "step": 4579 + }, + { + "epoch": 0.02723855742696736, + "grad_norm": 2.3460421562194824, + "learning_rate": 4.9908562702792684e-05, + "loss": 6.4044, + "step": 4580 + }, + { + "epoch": 0.02724450471024836, + "grad_norm": 2.6087262630462646, + "learning_rate": 4.990852278513738e-05, + "loss": 6.5131, + "step": 4581 + }, + { + "epoch": 0.027250451993529354, + "grad_norm": 2.6969377994537354, + "learning_rate": 4.9908482858786765e-05, + "loss": 6.3483, + "step": 4582 + }, + { + "epoch": 0.027256399276810353, + "grad_norm": 2.64043927192688, + "learning_rate": 4.990844292374085e-05, + "loss": 5.8712, + "step": 4583 + }, + { + "epoch": 0.02726234656009135, + "grad_norm": 2.5738205909729004, + "learning_rate": 4.9908402979999654e-05, + "loss": 5.9165, + "step": 4584 + }, + { + "epoch": 0.027268293843372347, + "grad_norm": 2.2725625038146973, + "learning_rate": 4.99083630275632e-05, + "loss": 5.8454, + "step": 4585 + }, + { + "epoch": 0.027274241126653345, + "grad_norm": 2.5911824703216553, + "learning_rate": 4.9908323066431494e-05, + "loss": 5.6729, + "step": 4586 + }, + { + "epoch": 0.02728018840993434, + "grad_norm": 2.6691668033599854, + "learning_rate": 4.9908283096604546e-05, + "loss": 5.7726, + "step": 4587 + }, + { + "epoch": 0.02728613569321534, + "grad_norm": 2.6512796878814697, + "learning_rate": 4.990824311808238e-05, + "loss": 6.1295, + "step": 4588 + }, + { + "epoch": 0.027292082976496337, + "grad_norm": 2.816943645477295, + "learning_rate": 4.9908203130865e-05, + "loss": 5.5172, + "step": 4589 + }, + { + "epoch": 0.027298030259777332, + "grad_norm": 2.6252098083496094, + "learning_rate": 4.990816313495242e-05, + "loss": 5.5955, + "step": 4590 + }, + { + "epoch": 0.02730397754305833, + "grad_norm": 2.3711740970611572, + "learning_rate": 4.990812313034466e-05, + "loss": 5.3348, + "step": 4591 + }, + { + "epoch": 0.02730992482633933, + "grad_norm": 2.355436086654663, + "learning_rate": 4.990808311704173e-05, + "loss": 5.6171, + "step": 4592 + }, + { + "epoch": 0.027315872109620325, + "grad_norm": 2.3344695568084717, + "learning_rate": 4.990804309504365e-05, + "loss": 5.46, + "step": 4593 + }, + { + "epoch": 0.027321819392901323, + "grad_norm": 2.3890786170959473, + "learning_rate": 4.990800306435043e-05, + "loss": 5.5658, + "step": 4594 + }, + { + "epoch": 0.02732776667618232, + "grad_norm": 2.5606987476348877, + "learning_rate": 4.990796302496208e-05, + "loss": 5.4778, + "step": 4595 + }, + { + "epoch": 0.027333713959463317, + "grad_norm": 2.2443172931671143, + "learning_rate": 4.9907922976878616e-05, + "loss": 5.486, + "step": 4596 + }, + { + "epoch": 0.027339661242744315, + "grad_norm": 2.3428351879119873, + "learning_rate": 4.990788292010005e-05, + "loss": 5.3332, + "step": 4597 + }, + { + "epoch": 0.02734560852602531, + "grad_norm": 2.6336300373077393, + "learning_rate": 4.9907842854626406e-05, + "loss": 5.4606, + "step": 4598 + }, + { + "epoch": 0.02735155580930631, + "grad_norm": 2.3052382469177246, + "learning_rate": 4.990780278045769e-05, + "loss": 5.4028, + "step": 4599 + }, + { + "epoch": 0.027357503092587308, + "grad_norm": 2.4661340713500977, + "learning_rate": 4.990776269759392e-05, + "loss": 5.6011, + "step": 4600 + }, + { + "epoch": 0.027363450375868303, + "grad_norm": 2.400527238845825, + "learning_rate": 4.99077226060351e-05, + "loss": 5.5952, + "step": 4601 + }, + { + "epoch": 0.0273693976591493, + "grad_norm": 2.364900827407837, + "learning_rate": 4.9907682505781256e-05, + "loss": 5.2125, + "step": 4602 + }, + { + "epoch": 0.027375344942430296, + "grad_norm": 2.383680820465088, + "learning_rate": 4.99076423968324e-05, + "loss": 5.4253, + "step": 4603 + }, + { + "epoch": 0.027381292225711295, + "grad_norm": 2.681903839111328, + "learning_rate": 4.990760227918854e-05, + "loss": 5.3741, + "step": 4604 + }, + { + "epoch": 0.027387239508992293, + "grad_norm": 2.3454341888427734, + "learning_rate": 4.990756215284969e-05, + "loss": 5.3032, + "step": 4605 + }, + { + "epoch": 0.02739318679227329, + "grad_norm": 2.439807653427124, + "learning_rate": 4.990752201781587e-05, + "loss": 5.3368, + "step": 4606 + }, + { + "epoch": 0.027399134075554287, + "grad_norm": 2.938976764678955, + "learning_rate": 4.990748187408709e-05, + "loss": 6.1251, + "step": 4607 + }, + { + "epoch": 0.027405081358835286, + "grad_norm": 3.353973865509033, + "learning_rate": 4.990744172166337e-05, + "loss": 6.72, + "step": 4608 + }, + { + "epoch": 0.02741102864211628, + "grad_norm": 2.4661834239959717, + "learning_rate": 4.990740156054472e-05, + "loss": 5.7156, + "step": 4609 + }, + { + "epoch": 0.02741697592539728, + "grad_norm": 2.303976058959961, + "learning_rate": 4.990736139073116e-05, + "loss": 5.3493, + "step": 4610 + }, + { + "epoch": 0.027422923208678274, + "grad_norm": 2.4225149154663086, + "learning_rate": 4.990732121222268e-05, + "loss": 5.4831, + "step": 4611 + }, + { + "epoch": 0.027428870491959273, + "grad_norm": 2.5566627979278564, + "learning_rate": 4.990728102501932e-05, + "loss": 5.9159, + "step": 4612 + }, + { + "epoch": 0.02743481777524027, + "grad_norm": 2.64258074760437, + "learning_rate": 4.9907240829121085e-05, + "loss": 6.7137, + "step": 4613 + }, + { + "epoch": 0.027440765058521267, + "grad_norm": 2.967501640319824, + "learning_rate": 4.9907200624527986e-05, + "loss": 6.3333, + "step": 4614 + }, + { + "epoch": 0.027446712341802265, + "grad_norm": 2.6084952354431152, + "learning_rate": 4.990716041124005e-05, + "loss": 6.1201, + "step": 4615 + }, + { + "epoch": 0.02745265962508326, + "grad_norm": 3.0721616744995117, + "learning_rate": 4.990712018925727e-05, + "loss": 6.396, + "step": 4616 + }, + { + "epoch": 0.02745860690836426, + "grad_norm": 2.888263463973999, + "learning_rate": 4.990707995857968e-05, + "loss": 6.0773, + "step": 4617 + }, + { + "epoch": 0.027464554191645257, + "grad_norm": 2.7506093978881836, + "learning_rate": 4.990703971920728e-05, + "loss": 5.9909, + "step": 4618 + }, + { + "epoch": 0.027470501474926252, + "grad_norm": 2.8273298740386963, + "learning_rate": 4.99069994711401e-05, + "loss": 5.9591, + "step": 4619 + }, + { + "epoch": 0.02747644875820725, + "grad_norm": 2.451011896133423, + "learning_rate": 4.990695921437813e-05, + "loss": 6.1596, + "step": 4620 + }, + { + "epoch": 0.02748239604148825, + "grad_norm": 2.762265920639038, + "learning_rate": 4.990691894892141e-05, + "loss": 6.6233, + "step": 4621 + }, + { + "epoch": 0.027488343324769245, + "grad_norm": 2.4570846557617188, + "learning_rate": 4.990687867476994e-05, + "loss": 6.5025, + "step": 4622 + }, + { + "epoch": 0.027494290608050243, + "grad_norm": 3.108992576599121, + "learning_rate": 4.990683839192373e-05, + "loss": 5.921, + "step": 4623 + }, + { + "epoch": 0.02750023789133124, + "grad_norm": 2.887580156326294, + "learning_rate": 4.99067981003828e-05, + "loss": 5.9266, + "step": 4624 + }, + { + "epoch": 0.027506185174612237, + "grad_norm": 3.083556890487671, + "learning_rate": 4.990675780014718e-05, + "loss": 5.765, + "step": 4625 + }, + { + "epoch": 0.027512132457893235, + "grad_norm": 2.710231304168701, + "learning_rate": 4.990671749121685e-05, + "loss": 5.7674, + "step": 4626 + }, + { + "epoch": 0.02751807974117423, + "grad_norm": 2.738926410675049, + "learning_rate": 4.9906677173591845e-05, + "loss": 5.801, + "step": 4627 + }, + { + "epoch": 0.02752402702445523, + "grad_norm": 2.6737735271453857, + "learning_rate": 4.9906636847272176e-05, + "loss": 6.2581, + "step": 4628 + }, + { + "epoch": 0.027529974307736228, + "grad_norm": 2.623969554901123, + "learning_rate": 4.990659651225786e-05, + "loss": 5.5044, + "step": 4629 + }, + { + "epoch": 0.027535921591017223, + "grad_norm": 3.069460153579712, + "learning_rate": 4.990655616854891e-05, + "loss": 5.9639, + "step": 4630 + }, + { + "epoch": 0.02754186887429822, + "grad_norm": 2.6889147758483887, + "learning_rate": 4.990651581614534e-05, + "loss": 6.3032, + "step": 4631 + }, + { + "epoch": 0.027547816157579216, + "grad_norm": 3.5284838676452637, + "learning_rate": 4.990647545504716e-05, + "loss": 6.4104, + "step": 4632 + }, + { + "epoch": 0.027553763440860215, + "grad_norm": 2.326162338256836, + "learning_rate": 4.9906435085254384e-05, + "loss": 6.2593, + "step": 4633 + }, + { + "epoch": 0.027559710724141213, + "grad_norm": 1.946542739868164, + "learning_rate": 4.990639470676703e-05, + "loss": 6.1522, + "step": 4634 + }, + { + "epoch": 0.02756565800742221, + "grad_norm": 2.26143741607666, + "learning_rate": 4.990635431958511e-05, + "loss": 6.0189, + "step": 4635 + }, + { + "epoch": 0.027571605290703207, + "grad_norm": 2.8332626819610596, + "learning_rate": 4.990631392370865e-05, + "loss": 5.6226, + "step": 4636 + }, + { + "epoch": 0.027577552573984206, + "grad_norm": 3.919443130493164, + "learning_rate": 4.9906273519137636e-05, + "loss": 6.2147, + "step": 4637 + }, + { + "epoch": 0.0275834998572652, + "grad_norm": 2.4030275344848633, + "learning_rate": 4.9906233105872115e-05, + "loss": 5.6589, + "step": 4638 + }, + { + "epoch": 0.0275894471405462, + "grad_norm": 2.7806994915008545, + "learning_rate": 4.990619268391207e-05, + "loss": 5.4349, + "step": 4639 + }, + { + "epoch": 0.027595394423827194, + "grad_norm": 2.5759501457214355, + "learning_rate": 4.990615225325754e-05, + "loss": 6.1171, + "step": 4640 + }, + { + "epoch": 0.027601341707108193, + "grad_norm": 2.337517023086548, + "learning_rate": 4.990611181390853e-05, + "loss": 5.5514, + "step": 4641 + }, + { + "epoch": 0.02760728899038919, + "grad_norm": 2.6464250087738037, + "learning_rate": 4.990607136586505e-05, + "loss": 6.1852, + "step": 4642 + }, + { + "epoch": 0.027613236273670187, + "grad_norm": 2.030210256576538, + "learning_rate": 4.9906030909127125e-05, + "loss": 6.0919, + "step": 4643 + }, + { + "epoch": 0.027619183556951185, + "grad_norm": 2.4546520709991455, + "learning_rate": 4.990599044369475e-05, + "loss": 6.3018, + "step": 4644 + }, + { + "epoch": 0.027625130840232184, + "grad_norm": 2.508500337600708, + "learning_rate": 4.990594996956796e-05, + "loss": 5.7933, + "step": 4645 + }, + { + "epoch": 0.02763107812351318, + "grad_norm": 2.3363263607025146, + "learning_rate": 4.990590948674676e-05, + "loss": 6.4252, + "step": 4646 + }, + { + "epoch": 0.027637025406794177, + "grad_norm": 2.794673442840576, + "learning_rate": 4.990586899523116e-05, + "loss": 5.3554, + "step": 4647 + }, + { + "epoch": 0.027642972690075172, + "grad_norm": 2.5396835803985596, + "learning_rate": 4.990582849502118e-05, + "loss": 5.2352, + "step": 4648 + }, + { + "epoch": 0.02764891997335617, + "grad_norm": 2.6878976821899414, + "learning_rate": 4.990578798611684e-05, + "loss": 4.9262, + "step": 4649 + }, + { + "epoch": 0.02765486725663717, + "grad_norm": 2.2143187522888184, + "learning_rate": 4.9905747468518136e-05, + "loss": 6.0785, + "step": 4650 + }, + { + "epoch": 0.027660814539918165, + "grad_norm": 2.6812448501586914, + "learning_rate": 4.9905706942225094e-05, + "loss": 5.1692, + "step": 4651 + }, + { + "epoch": 0.027666761823199163, + "grad_norm": 2.5155227184295654, + "learning_rate": 4.9905666407237726e-05, + "loss": 5.0194, + "step": 4652 + }, + { + "epoch": 0.027672709106480158, + "grad_norm": 2.406834363937378, + "learning_rate": 4.9905625863556047e-05, + "loss": 5.1249, + "step": 4653 + }, + { + "epoch": 0.027678656389761157, + "grad_norm": 3.3666698932647705, + "learning_rate": 4.990558531118008e-05, + "loss": 5.9619, + "step": 4654 + }, + { + "epoch": 0.027684603673042155, + "grad_norm": 2.6557607650756836, + "learning_rate": 4.9905544750109826e-05, + "loss": 5.9118, + "step": 4655 + }, + { + "epoch": 0.02769055095632315, + "grad_norm": 2.60469651222229, + "learning_rate": 4.9905504180345304e-05, + "loss": 6.3746, + "step": 4656 + }, + { + "epoch": 0.02769649823960415, + "grad_norm": 2.5417349338531494, + "learning_rate": 4.9905463601886526e-05, + "loss": 5.6975, + "step": 4657 + }, + { + "epoch": 0.027702445522885148, + "grad_norm": 2.723829984664917, + "learning_rate": 4.990542301473351e-05, + "loss": 5.6189, + "step": 4658 + }, + { + "epoch": 0.027708392806166143, + "grad_norm": 3.0544204711914062, + "learning_rate": 4.990538241888627e-05, + "loss": 5.4999, + "step": 4659 + }, + { + "epoch": 0.02771434008944714, + "grad_norm": 3.0536513328552246, + "learning_rate": 4.990534181434481e-05, + "loss": 6.0636, + "step": 4660 + }, + { + "epoch": 0.027720287372728136, + "grad_norm": 3.0618786811828613, + "learning_rate": 4.990530120110916e-05, + "loss": 6.0856, + "step": 4661 + }, + { + "epoch": 0.027726234656009135, + "grad_norm": 2.6602306365966797, + "learning_rate": 4.9905260579179325e-05, + "loss": 5.8341, + "step": 4662 + }, + { + "epoch": 0.027732181939290133, + "grad_norm": 2.729137420654297, + "learning_rate": 4.990521994855532e-05, + "loss": 6.7052, + "step": 4663 + }, + { + "epoch": 0.02773812922257113, + "grad_norm": 3.0878489017486572, + "learning_rate": 4.990517930923716e-05, + "loss": 6.1308, + "step": 4664 + }, + { + "epoch": 0.027744076505852127, + "grad_norm": 2.524418354034424, + "learning_rate": 4.990513866122486e-05, + "loss": 6.2547, + "step": 4665 + }, + { + "epoch": 0.027750023789133126, + "grad_norm": 2.457075595855713, + "learning_rate": 4.990509800451844e-05, + "loss": 6.6615, + "step": 4666 + }, + { + "epoch": 0.02775597107241412, + "grad_norm": 2.474487543106079, + "learning_rate": 4.9905057339117894e-05, + "loss": 6.63, + "step": 4667 + }, + { + "epoch": 0.02776191835569512, + "grad_norm": 2.611098289489746, + "learning_rate": 4.9905016665023254e-05, + "loss": 5.8232, + "step": 4668 + }, + { + "epoch": 0.027767865638976114, + "grad_norm": 2.8012242317199707, + "learning_rate": 4.990497598223454e-05, + "loss": 5.8478, + "step": 4669 + }, + { + "epoch": 0.027773812922257113, + "grad_norm": 2.706725597381592, + "learning_rate": 4.990493529075174e-05, + "loss": 5.8585, + "step": 4670 + }, + { + "epoch": 0.02777976020553811, + "grad_norm": 2.490032196044922, + "learning_rate": 4.99048945905749e-05, + "loss": 6.2181, + "step": 4671 + }, + { + "epoch": 0.027785707488819106, + "grad_norm": 2.4735357761383057, + "learning_rate": 4.990485388170401e-05, + "loss": 6.2153, + "step": 4672 + }, + { + "epoch": 0.027791654772100105, + "grad_norm": 2.7573068141937256, + "learning_rate": 4.9904813164139094e-05, + "loss": 6.217, + "step": 4673 + }, + { + "epoch": 0.027797602055381104, + "grad_norm": 2.4663283824920654, + "learning_rate": 4.990477243788017e-05, + "loss": 6.4153, + "step": 4674 + }, + { + "epoch": 0.0278035493386621, + "grad_norm": 2.737656831741333, + "learning_rate": 4.9904731702927234e-05, + "loss": 6.5209, + "step": 4675 + }, + { + "epoch": 0.027809496621943097, + "grad_norm": 2.5112721920013428, + "learning_rate": 4.990469095928032e-05, + "loss": 5.979, + "step": 4676 + }, + { + "epoch": 0.027815443905224092, + "grad_norm": 2.6602795124053955, + "learning_rate": 4.990465020693944e-05, + "loss": 5.9206, + "step": 4677 + }, + { + "epoch": 0.02782139118850509, + "grad_norm": 2.460538625717163, + "learning_rate": 4.9904609445904606e-05, + "loss": 5.9855, + "step": 4678 + }, + { + "epoch": 0.02782733847178609, + "grad_norm": 2.750138998031616, + "learning_rate": 4.990456867617582e-05, + "loss": 5.8425, + "step": 4679 + }, + { + "epoch": 0.027833285755067085, + "grad_norm": 2.9843833446502686, + "learning_rate": 4.9904527897753114e-05, + "loss": 6.1385, + "step": 4680 + }, + { + "epoch": 0.027839233038348083, + "grad_norm": 2.586923360824585, + "learning_rate": 4.99044871106365e-05, + "loss": 5.6278, + "step": 4681 + }, + { + "epoch": 0.027845180321629078, + "grad_norm": 3.114211082458496, + "learning_rate": 4.990444631482597e-05, + "loss": 6.1259, + "step": 4682 + }, + { + "epoch": 0.027851127604910077, + "grad_norm": 2.3222453594207764, + "learning_rate": 4.990440551032157e-05, + "loss": 6.3048, + "step": 4683 + }, + { + "epoch": 0.027857074888191075, + "grad_norm": 2.15678334236145, + "learning_rate": 4.99043646971233e-05, + "loss": 5.9082, + "step": 4684 + }, + { + "epoch": 0.02786302217147207, + "grad_norm": 3.946350574493408, + "learning_rate": 4.990432387523116e-05, + "loss": 5.6907, + "step": 4685 + }, + { + "epoch": 0.02786896945475307, + "grad_norm": 2.9612419605255127, + "learning_rate": 4.9904283044645185e-05, + "loss": 5.3894, + "step": 4686 + }, + { + "epoch": 0.027874916738034067, + "grad_norm": 2.3602261543273926, + "learning_rate": 4.990424220536538e-05, + "loss": 6.0716, + "step": 4687 + }, + { + "epoch": 0.027880864021315063, + "grad_norm": 2.822300672531128, + "learning_rate": 4.990420135739177e-05, + "loss": 5.9788, + "step": 4688 + }, + { + "epoch": 0.02788681130459606, + "grad_norm": 2.766280174255371, + "learning_rate": 4.990416050072435e-05, + "loss": 5.9945, + "step": 4689 + }, + { + "epoch": 0.027892758587877056, + "grad_norm": 2.810359239578247, + "learning_rate": 4.990411963536315e-05, + "loss": 6.0598, + "step": 4690 + }, + { + "epoch": 0.027898705871158055, + "grad_norm": 2.510014295578003, + "learning_rate": 4.990407876130818e-05, + "loss": 6.1793, + "step": 4691 + }, + { + "epoch": 0.027904653154439053, + "grad_norm": 2.5394086837768555, + "learning_rate": 4.990403787855945e-05, + "loss": 6.1309, + "step": 4692 + }, + { + "epoch": 0.02791060043772005, + "grad_norm": 2.922084093093872, + "learning_rate": 4.990399698711698e-05, + "loss": 6.1956, + "step": 4693 + }, + { + "epoch": 0.027916547721001047, + "grad_norm": 3.6614181995391846, + "learning_rate": 4.9903956086980785e-05, + "loss": 6.535, + "step": 4694 + }, + { + "epoch": 0.027922495004282046, + "grad_norm": 3.3680684566497803, + "learning_rate": 4.990391517815087e-05, + "loss": 6.5729, + "step": 4695 + }, + { + "epoch": 0.02792844228756304, + "grad_norm": 2.522193431854248, + "learning_rate": 4.990387426062726e-05, + "loss": 5.9406, + "step": 4696 + }, + { + "epoch": 0.02793438957084404, + "grad_norm": 2.9665534496307373, + "learning_rate": 4.990383333440996e-05, + "loss": 6.0281, + "step": 4697 + }, + { + "epoch": 0.027940336854125034, + "grad_norm": 2.643218755722046, + "learning_rate": 4.9903792399498996e-05, + "loss": 5.8965, + "step": 4698 + }, + { + "epoch": 0.027946284137406033, + "grad_norm": 2.498765230178833, + "learning_rate": 4.990375145589436e-05, + "loss": 6.0975, + "step": 4699 + }, + { + "epoch": 0.02795223142068703, + "grad_norm": 4.380255699157715, + "learning_rate": 4.99037105035961e-05, + "loss": 6.6298, + "step": 4700 + }, + { + "epoch": 0.027958178703968026, + "grad_norm": 3.925454616546631, + "learning_rate": 4.990366954260421e-05, + "loss": 6.5742, + "step": 4701 + }, + { + "epoch": 0.027964125987249025, + "grad_norm": 2.5388591289520264, + "learning_rate": 4.99036285729187e-05, + "loss": 6.6102, + "step": 4702 + }, + { + "epoch": 0.027970073270530024, + "grad_norm": 2.6793510913848877, + "learning_rate": 4.9903587594539594e-05, + "loss": 6.4265, + "step": 4703 + }, + { + "epoch": 0.02797602055381102, + "grad_norm": 2.8652729988098145, + "learning_rate": 4.9903546607466903e-05, + "loss": 6.4567, + "step": 4704 + }, + { + "epoch": 0.027981967837092017, + "grad_norm": 2.936021089553833, + "learning_rate": 4.990350561170063e-05, + "loss": 6.404, + "step": 4705 + }, + { + "epoch": 0.027987915120373012, + "grad_norm": 3.256253719329834, + "learning_rate": 4.9903464607240816e-05, + "loss": 6.2291, + "step": 4706 + }, + { + "epoch": 0.02799386240365401, + "grad_norm": 2.8268187046051025, + "learning_rate": 4.990342359408745e-05, + "loss": 6.2582, + "step": 4707 + }, + { + "epoch": 0.02799980968693501, + "grad_norm": 2.5889041423797607, + "learning_rate": 4.9903382572240556e-05, + "loss": 6.3325, + "step": 4708 + }, + { + "epoch": 0.028005756970216004, + "grad_norm": 2.635388135910034, + "learning_rate": 4.9903341541700154e-05, + "loss": 6.1256, + "step": 4709 + }, + { + "epoch": 0.028011704253497003, + "grad_norm": 2.562976360321045, + "learning_rate": 4.990330050246625e-05, + "loss": 5.9333, + "step": 4710 + }, + { + "epoch": 0.028017651536777998, + "grad_norm": 3.488809585571289, + "learning_rate": 4.990325945453887e-05, + "loss": 6.3651, + "step": 4711 + }, + { + "epoch": 0.028023598820058997, + "grad_norm": 2.963324546813965, + "learning_rate": 4.9903218397918e-05, + "loss": 6.718, + "step": 4712 + }, + { + "epoch": 0.028029546103339995, + "grad_norm": 2.4070823192596436, + "learning_rate": 4.990317733260369e-05, + "loss": 6.2502, + "step": 4713 + }, + { + "epoch": 0.02803549338662099, + "grad_norm": 2.711190938949585, + "learning_rate": 4.9903136258595925e-05, + "loss": 6.0397, + "step": 4714 + }, + { + "epoch": 0.02804144066990199, + "grad_norm": 2.466150999069214, + "learning_rate": 4.9903095175894746e-05, + "loss": 5.9344, + "step": 4715 + }, + { + "epoch": 0.028047387953182987, + "grad_norm": 2.4558048248291016, + "learning_rate": 4.990305408450014e-05, + "loss": 6.1121, + "step": 4716 + }, + { + "epoch": 0.028053335236463982, + "grad_norm": 2.4023051261901855, + "learning_rate": 4.990301298441215e-05, + "loss": 6.0202, + "step": 4717 + }, + { + "epoch": 0.02805928251974498, + "grad_norm": 3.118098258972168, + "learning_rate": 4.9902971875630765e-05, + "loss": 6.5365, + "step": 4718 + }, + { + "epoch": 0.028065229803025976, + "grad_norm": 2.3716087341308594, + "learning_rate": 4.990293075815602e-05, + "loss": 6.1382, + "step": 4719 + }, + { + "epoch": 0.028071177086306975, + "grad_norm": 2.4663496017456055, + "learning_rate": 4.990288963198791e-05, + "loss": 5.9804, + "step": 4720 + }, + { + "epoch": 0.028077124369587973, + "grad_norm": 2.2623326778411865, + "learning_rate": 4.9902848497126466e-05, + "loss": 5.9666, + "step": 4721 + }, + { + "epoch": 0.02808307165286897, + "grad_norm": 2.4884161949157715, + "learning_rate": 4.990280735357168e-05, + "loss": 6.0203, + "step": 4722 + }, + { + "epoch": 0.028089018936149967, + "grad_norm": 2.6154520511627197, + "learning_rate": 4.990276620132359e-05, + "loss": 5.9191, + "step": 4723 + }, + { + "epoch": 0.028094966219430965, + "grad_norm": 2.692396879196167, + "learning_rate": 4.990272504038221e-05, + "loss": 6.5314, + "step": 4724 + }, + { + "epoch": 0.02810091350271196, + "grad_norm": 2.483306407928467, + "learning_rate": 4.990268387074754e-05, + "loss": 6.6522, + "step": 4725 + }, + { + "epoch": 0.02810686078599296, + "grad_norm": 3.2098593711853027, + "learning_rate": 4.99026426924196e-05, + "loss": 5.8712, + "step": 4726 + }, + { + "epoch": 0.028112808069273954, + "grad_norm": 2.7335867881774902, + "learning_rate": 4.99026015053984e-05, + "loss": 5.7678, + "step": 4727 + }, + { + "epoch": 0.028118755352554953, + "grad_norm": 2.7587473392486572, + "learning_rate": 4.990256030968396e-05, + "loss": 6.4233, + "step": 4728 + }, + { + "epoch": 0.02812470263583595, + "grad_norm": 2.7686030864715576, + "learning_rate": 4.99025191052763e-05, + "loss": 6.4572, + "step": 4729 + }, + { + "epoch": 0.028130649919116946, + "grad_norm": 2.755916118621826, + "learning_rate": 4.990247789217543e-05, + "loss": 5.9858, + "step": 4730 + }, + { + "epoch": 0.028136597202397945, + "grad_norm": 2.614316463470459, + "learning_rate": 4.990243667038135e-05, + "loss": 6.2315, + "step": 4731 + }, + { + "epoch": 0.028142544485678943, + "grad_norm": 2.0796027183532715, + "learning_rate": 4.990239543989409e-05, + "loss": 6.236, + "step": 4732 + }, + { + "epoch": 0.02814849176895994, + "grad_norm": 2.623412847518921, + "learning_rate": 4.9902354200713665e-05, + "loss": 6.3962, + "step": 4733 + }, + { + "epoch": 0.028154439052240937, + "grad_norm": 2.2746191024780273, + "learning_rate": 4.9902312952840086e-05, + "loss": 5.9101, + "step": 4734 + }, + { + "epoch": 0.028160386335521932, + "grad_norm": 2.102444887161255, + "learning_rate": 4.990227169627336e-05, + "loss": 6.4652, + "step": 4735 + }, + { + "epoch": 0.02816633361880293, + "grad_norm": 2.7720580101013184, + "learning_rate": 4.990223043101352e-05, + "loss": 5.8981, + "step": 4736 + }, + { + "epoch": 0.02817228090208393, + "grad_norm": 2.4479453563690186, + "learning_rate": 4.9902189157060564e-05, + "loss": 6.3554, + "step": 4737 + }, + { + "epoch": 0.028178228185364924, + "grad_norm": 2.7894740104675293, + "learning_rate": 4.990214787441451e-05, + "loss": 6.0017, + "step": 4738 + }, + { + "epoch": 0.028184175468645923, + "grad_norm": 2.869884490966797, + "learning_rate": 4.990210658307537e-05, + "loss": 5.9419, + "step": 4739 + }, + { + "epoch": 0.028190122751926918, + "grad_norm": 2.262723207473755, + "learning_rate": 4.990206528304316e-05, + "loss": 6.172, + "step": 4740 + }, + { + "epoch": 0.028196070035207917, + "grad_norm": 2.179358720779419, + "learning_rate": 4.99020239743179e-05, + "loss": 6.5204, + "step": 4741 + }, + { + "epoch": 0.028202017318488915, + "grad_norm": 2.085179328918457, + "learning_rate": 4.9901982656899606e-05, + "loss": 6.3972, + "step": 4742 + }, + { + "epoch": 0.02820796460176991, + "grad_norm": 1.657567024230957, + "learning_rate": 4.990194133078828e-05, + "loss": 6.4199, + "step": 4743 + }, + { + "epoch": 0.02821391188505091, + "grad_norm": 1.8054349422454834, + "learning_rate": 4.990189999598395e-05, + "loss": 6.3768, + "step": 4744 + }, + { + "epoch": 0.028219859168331907, + "grad_norm": 2.0365710258483887, + "learning_rate": 4.990185865248662e-05, + "loss": 6.3228, + "step": 4745 + }, + { + "epoch": 0.028225806451612902, + "grad_norm": 2.069211006164551, + "learning_rate": 4.9901817300296304e-05, + "loss": 5.9874, + "step": 4746 + }, + { + "epoch": 0.0282317537348939, + "grad_norm": 2.3339149951934814, + "learning_rate": 4.9901775939413026e-05, + "loss": 6.1526, + "step": 4747 + }, + { + "epoch": 0.028237701018174896, + "grad_norm": 2.0425326824188232, + "learning_rate": 4.99017345698368e-05, + "loss": 6.2157, + "step": 4748 + }, + { + "epoch": 0.028243648301455895, + "grad_norm": 2.1598799228668213, + "learning_rate": 4.9901693191567625e-05, + "loss": 6.2653, + "step": 4749 + }, + { + "epoch": 0.028249595584736893, + "grad_norm": 2.066566228866577, + "learning_rate": 4.990165180460553e-05, + "loss": 6.3788, + "step": 4750 + }, + { + "epoch": 0.02825554286801789, + "grad_norm": 2.2870383262634277, + "learning_rate": 4.9901610408950527e-05, + "loss": 6.2608, + "step": 4751 + }, + { + "epoch": 0.028261490151298887, + "grad_norm": 2.3180785179138184, + "learning_rate": 4.990156900460263e-05, + "loss": 6.3545, + "step": 4752 + }, + { + "epoch": 0.028267437434579885, + "grad_norm": 2.55261492729187, + "learning_rate": 4.990152759156185e-05, + "loss": 6.3888, + "step": 4753 + }, + { + "epoch": 0.02827338471786088, + "grad_norm": 2.087925910949707, + "learning_rate": 4.990148616982821e-05, + "loss": 6.3585, + "step": 4754 + }, + { + "epoch": 0.02827933200114188, + "grad_norm": 2.2446579933166504, + "learning_rate": 4.9901444739401714e-05, + "loss": 6.4655, + "step": 4755 + }, + { + "epoch": 0.028285279284422874, + "grad_norm": 2.2980077266693115, + "learning_rate": 4.990140330028238e-05, + "loss": 6.3776, + "step": 4756 + }, + { + "epoch": 0.028291226567703873, + "grad_norm": 2.0658226013183594, + "learning_rate": 4.9901361852470224e-05, + "loss": 6.0412, + "step": 4757 + }, + { + "epoch": 0.02829717385098487, + "grad_norm": 2.8402137756347656, + "learning_rate": 4.990132039596526e-05, + "loss": 6.0017, + "step": 4758 + }, + { + "epoch": 0.028303121134265866, + "grad_norm": 2.4620237350463867, + "learning_rate": 4.99012789307675e-05, + "loss": 5.9235, + "step": 4759 + }, + { + "epoch": 0.028309068417546865, + "grad_norm": 2.3318607807159424, + "learning_rate": 4.990123745687697e-05, + "loss": 6.2464, + "step": 4760 + }, + { + "epoch": 0.028315015700827863, + "grad_norm": 2.4998981952667236, + "learning_rate": 4.9901195974293666e-05, + "loss": 6.2731, + "step": 4761 + }, + { + "epoch": 0.02832096298410886, + "grad_norm": 2.4374287128448486, + "learning_rate": 4.9901154483017614e-05, + "loss": 6.362, + "step": 4762 + }, + { + "epoch": 0.028326910267389857, + "grad_norm": 2.6257424354553223, + "learning_rate": 4.990111298304882e-05, + "loss": 6.1456, + "step": 4763 + }, + { + "epoch": 0.028332857550670852, + "grad_norm": 2.74934458732605, + "learning_rate": 4.990107147438732e-05, + "loss": 6.0121, + "step": 4764 + }, + { + "epoch": 0.02833880483395185, + "grad_norm": 2.33137583732605, + "learning_rate": 4.9901029957033106e-05, + "loss": 6.0207, + "step": 4765 + }, + { + "epoch": 0.02834475211723285, + "grad_norm": 1.9006321430206299, + "learning_rate": 4.9900988430986196e-05, + "loss": 5.8946, + "step": 4766 + }, + { + "epoch": 0.028350699400513844, + "grad_norm": 1.9786534309387207, + "learning_rate": 4.990094689624661e-05, + "loss": 5.7782, + "step": 4767 + }, + { + "epoch": 0.028356646683794843, + "grad_norm": 2.1215951442718506, + "learning_rate": 4.9900905352814365e-05, + "loss": 5.8129, + "step": 4768 + }, + { + "epoch": 0.02836259396707584, + "grad_norm": 2.9569597244262695, + "learning_rate": 4.9900863800689465e-05, + "loss": 5.7882, + "step": 4769 + }, + { + "epoch": 0.028368541250356837, + "grad_norm": 2.720447540283203, + "learning_rate": 4.990082223987193e-05, + "loss": 5.9075, + "step": 4770 + }, + { + "epoch": 0.028374488533637835, + "grad_norm": 2.8727002143859863, + "learning_rate": 4.990078067036178e-05, + "loss": 6.1571, + "step": 4771 + }, + { + "epoch": 0.02838043581691883, + "grad_norm": 2.2992594242095947, + "learning_rate": 4.990073909215902e-05, + "loss": 6.0195, + "step": 4772 + }, + { + "epoch": 0.02838638310019983, + "grad_norm": 2.0323293209075928, + "learning_rate": 4.990069750526368e-05, + "loss": 5.8049, + "step": 4773 + }, + { + "epoch": 0.028392330383480827, + "grad_norm": 2.938795328140259, + "learning_rate": 4.9900655909675755e-05, + "loss": 6.9215, + "step": 4774 + }, + { + "epoch": 0.028398277666761822, + "grad_norm": 2.6333048343658447, + "learning_rate": 4.990061430539527e-05, + "loss": 5.868, + "step": 4775 + }, + { + "epoch": 0.02840422495004282, + "grad_norm": 2.8569674491882324, + "learning_rate": 4.990057269242223e-05, + "loss": 5.8782, + "step": 4776 + }, + { + "epoch": 0.028410172233323816, + "grad_norm": 2.62206768989563, + "learning_rate": 4.9900531070756666e-05, + "loss": 5.7751, + "step": 4777 + }, + { + "epoch": 0.028416119516604815, + "grad_norm": 2.2112414836883545, + "learning_rate": 4.990048944039858e-05, + "loss": 5.7985, + "step": 4778 + }, + { + "epoch": 0.028422066799885813, + "grad_norm": 2.1571342945098877, + "learning_rate": 4.990044780134799e-05, + "loss": 5.9089, + "step": 4779 + }, + { + "epoch": 0.028428014083166808, + "grad_norm": 2.4310410022735596, + "learning_rate": 4.9900406153604916e-05, + "loss": 5.6728, + "step": 4780 + }, + { + "epoch": 0.028433961366447807, + "grad_norm": 2.25822377204895, + "learning_rate": 4.990036449716937e-05, + "loss": 5.5808, + "step": 4781 + }, + { + "epoch": 0.028439908649728805, + "grad_norm": 2.3068299293518066, + "learning_rate": 4.990032283204136e-05, + "loss": 5.729, + "step": 4782 + }, + { + "epoch": 0.0284458559330098, + "grad_norm": 2.0582191944122314, + "learning_rate": 4.9900281158220905e-05, + "loss": 5.6877, + "step": 4783 + }, + { + "epoch": 0.0284518032162908, + "grad_norm": 2.572824239730835, + "learning_rate": 4.9900239475708015e-05, + "loss": 5.9522, + "step": 4784 + }, + { + "epoch": 0.028457750499571794, + "grad_norm": 2.299001693725586, + "learning_rate": 4.990019778450271e-05, + "loss": 5.7579, + "step": 4785 + }, + { + "epoch": 0.028463697782852793, + "grad_norm": 2.231381893157959, + "learning_rate": 4.990015608460501e-05, + "loss": 5.756, + "step": 4786 + }, + { + "epoch": 0.02846964506613379, + "grad_norm": 1.7982486486434937, + "learning_rate": 4.990011437601492e-05, + "loss": 5.8076, + "step": 4787 + }, + { + "epoch": 0.028475592349414786, + "grad_norm": 1.8788951635360718, + "learning_rate": 4.990007265873245e-05, + "loss": 5.8798, + "step": 4788 + }, + { + "epoch": 0.028481539632695785, + "grad_norm": 1.6190022230148315, + "learning_rate": 4.9900030932757623e-05, + "loss": 5.5695, + "step": 4789 + }, + { + "epoch": 0.028487486915976783, + "grad_norm": 1.9226019382476807, + "learning_rate": 4.9899989198090455e-05, + "loss": 5.671, + "step": 4790 + }, + { + "epoch": 0.02849343419925778, + "grad_norm": 1.7437139749526978, + "learning_rate": 4.989994745473097e-05, + "loss": 5.6728, + "step": 4791 + }, + { + "epoch": 0.028499381482538777, + "grad_norm": 1.624126672744751, + "learning_rate": 4.989990570267915e-05, + "loss": 5.6209, + "step": 4792 + }, + { + "epoch": 0.028505328765819772, + "grad_norm": 2.1894004344940186, + "learning_rate": 4.9899863941935046e-05, + "loss": 5.6669, + "step": 4793 + }, + { + "epoch": 0.02851127604910077, + "grad_norm": 2.2243428230285645, + "learning_rate": 4.9899822172498646e-05, + "loss": 5.4557, + "step": 4794 + }, + { + "epoch": 0.02851722333238177, + "grad_norm": 2.032611608505249, + "learning_rate": 4.989978039436998e-05, + "loss": 5.7883, + "step": 4795 + }, + { + "epoch": 0.028523170615662764, + "grad_norm": 1.8496538400650024, + "learning_rate": 4.989973860754906e-05, + "loss": 5.6329, + "step": 4796 + }, + { + "epoch": 0.028529117898943763, + "grad_norm": 1.7072707414627075, + "learning_rate": 4.989969681203589e-05, + "loss": 5.7242, + "step": 4797 + }, + { + "epoch": 0.02853506518222476, + "grad_norm": 1.7351912260055542, + "learning_rate": 4.9899655007830504e-05, + "loss": 5.648, + "step": 4798 + }, + { + "epoch": 0.028541012465505756, + "grad_norm": 2.514162302017212, + "learning_rate": 4.9899613194932904e-05, + "loss": 5.556, + "step": 4799 + }, + { + "epoch": 0.028546959748786755, + "grad_norm": 10.245063781738281, + "learning_rate": 4.98995713733431e-05, + "loss": 5.5922, + "step": 4800 + }, + { + "epoch": 0.02855290703206775, + "grad_norm": 2.012106418609619, + "learning_rate": 4.989952954306112e-05, + "loss": 5.5092, + "step": 4801 + }, + { + "epoch": 0.02855885431534875, + "grad_norm": 1.8654139041900635, + "learning_rate": 4.9899487704086966e-05, + "loss": 5.4164, + "step": 4802 + }, + { + "epoch": 0.028564801598629747, + "grad_norm": 1.778798222541809, + "learning_rate": 4.9899445856420656e-05, + "loss": 5.5537, + "step": 4803 + }, + { + "epoch": 0.028570748881910742, + "grad_norm": 2.205038547515869, + "learning_rate": 4.989940400006221e-05, + "loss": 5.9338, + "step": 4804 + }, + { + "epoch": 0.02857669616519174, + "grad_norm": 2.3908839225769043, + "learning_rate": 4.989936213501164e-05, + "loss": 5.8962, + "step": 4805 + }, + { + "epoch": 0.028582643448472736, + "grad_norm": 2.3438172340393066, + "learning_rate": 4.9899320261268966e-05, + "loss": 5.8133, + "step": 4806 + }, + { + "epoch": 0.028588590731753735, + "grad_norm": 2.4021737575531006, + "learning_rate": 4.989927837883419e-05, + "loss": 5.8366, + "step": 4807 + }, + { + "epoch": 0.028594538015034733, + "grad_norm": 1.9976004362106323, + "learning_rate": 4.989923648770734e-05, + "loss": 5.6976, + "step": 4808 + }, + { + "epoch": 0.028600485298315728, + "grad_norm": 2.2234697341918945, + "learning_rate": 4.989919458788841e-05, + "loss": 5.7871, + "step": 4809 + }, + { + "epoch": 0.028606432581596727, + "grad_norm": 2.203223705291748, + "learning_rate": 4.989915267937744e-05, + "loss": 5.5799, + "step": 4810 + }, + { + "epoch": 0.028612379864877725, + "grad_norm": 2.2155261039733887, + "learning_rate": 4.989911076217442e-05, + "loss": 5.6022, + "step": 4811 + }, + { + "epoch": 0.02861832714815872, + "grad_norm": 1.9379621744155884, + "learning_rate": 4.989906883627939e-05, + "loss": 5.8647, + "step": 4812 + }, + { + "epoch": 0.02862427443143972, + "grad_norm": 2.0589749813079834, + "learning_rate": 4.9899026901692345e-05, + "loss": 5.6048, + "step": 4813 + }, + { + "epoch": 0.028630221714720714, + "grad_norm": 2.3813774585723877, + "learning_rate": 4.9898984958413315e-05, + "loss": 5.6726, + "step": 4814 + }, + { + "epoch": 0.028636168998001713, + "grad_norm": 2.06425142288208, + "learning_rate": 4.98989430064423e-05, + "loss": 5.8505, + "step": 4815 + }, + { + "epoch": 0.02864211628128271, + "grad_norm": 2.199697494506836, + "learning_rate": 4.9898901045779326e-05, + "loss": 5.6114, + "step": 4816 + }, + { + "epoch": 0.028648063564563706, + "grad_norm": 2.136411428451538, + "learning_rate": 4.98988590764244e-05, + "loss": 5.3987, + "step": 4817 + }, + { + "epoch": 0.028654010847844705, + "grad_norm": 1.914929986000061, + "learning_rate": 4.9898817098377534e-05, + "loss": 5.702, + "step": 4818 + }, + { + "epoch": 0.028659958131125703, + "grad_norm": 2.316027879714966, + "learning_rate": 4.989877511163876e-05, + "loss": 5.5886, + "step": 4819 + }, + { + "epoch": 0.0286659054144067, + "grad_norm": 3.2775018215179443, + "learning_rate": 4.9898733116208076e-05, + "loss": 5.5337, + "step": 4820 + }, + { + "epoch": 0.028671852697687697, + "grad_norm": 2.16430926322937, + "learning_rate": 4.989869111208549e-05, + "loss": 5.7189, + "step": 4821 + }, + { + "epoch": 0.028677799980968692, + "grad_norm": 2.1936638355255127, + "learning_rate": 4.9898649099271046e-05, + "loss": 5.2942, + "step": 4822 + }, + { + "epoch": 0.02868374726424969, + "grad_norm": 2.262485980987549, + "learning_rate": 4.9898607077764736e-05, + "loss": 5.4284, + "step": 4823 + }, + { + "epoch": 0.02868969454753069, + "grad_norm": 1.7890170812606812, + "learning_rate": 4.989856504756657e-05, + "loss": 5.6021, + "step": 4824 + }, + { + "epoch": 0.028695641830811684, + "grad_norm": 1.747862696647644, + "learning_rate": 4.9898523008676585e-05, + "loss": 5.72, + "step": 4825 + }, + { + "epoch": 0.028701589114092683, + "grad_norm": 1.9750064611434937, + "learning_rate": 4.989848096109477e-05, + "loss": 5.8923, + "step": 4826 + }, + { + "epoch": 0.02870753639737368, + "grad_norm": 2.0249626636505127, + "learning_rate": 4.989843890482117e-05, + "loss": 5.4866, + "step": 4827 + }, + { + "epoch": 0.028713483680654676, + "grad_norm": 2.2737395763397217, + "learning_rate": 4.9898396839855765e-05, + "loss": 5.5498, + "step": 4828 + }, + { + "epoch": 0.028719430963935675, + "grad_norm": 2.2852187156677246, + "learning_rate": 4.98983547661986e-05, + "loss": 5.672, + "step": 4829 + }, + { + "epoch": 0.02872537824721667, + "grad_norm": 1.9441994428634644, + "learning_rate": 4.989831268384967e-05, + "loss": 5.4933, + "step": 4830 + }, + { + "epoch": 0.02873132553049767, + "grad_norm": 1.9561070203781128, + "learning_rate": 4.989827059280899e-05, + "loss": 5.7465, + "step": 4831 + }, + { + "epoch": 0.028737272813778667, + "grad_norm": 2.482849597930908, + "learning_rate": 4.9898228493076594e-05, + "loss": 5.4338, + "step": 4832 + }, + { + "epoch": 0.028743220097059662, + "grad_norm": 1.8582524061203003, + "learning_rate": 4.989818638465247e-05, + "loss": 5.5378, + "step": 4833 + }, + { + "epoch": 0.02874916738034066, + "grad_norm": 2.119783639907837, + "learning_rate": 4.9898144267536654e-05, + "loss": 5.6012, + "step": 4834 + }, + { + "epoch": 0.028755114663621656, + "grad_norm": 2.333965301513672, + "learning_rate": 4.989810214172915e-05, + "loss": 5.7376, + "step": 4835 + }, + { + "epoch": 0.028761061946902654, + "grad_norm": 2.600861072540283, + "learning_rate": 4.989806000722999e-05, + "loss": 6.2747, + "step": 4836 + }, + { + "epoch": 0.028767009230183653, + "grad_norm": 2.3250534534454346, + "learning_rate": 4.989801786403916e-05, + "loss": 5.5993, + "step": 4837 + }, + { + "epoch": 0.028772956513464648, + "grad_norm": 2.507377862930298, + "learning_rate": 4.9897975712156686e-05, + "loss": 5.3919, + "step": 4838 + }, + { + "epoch": 0.028778903796745647, + "grad_norm": 1.9882018566131592, + "learning_rate": 4.9897933551582596e-05, + "loss": 5.5939, + "step": 4839 + }, + { + "epoch": 0.028784851080026645, + "grad_norm": 2.235269784927368, + "learning_rate": 4.989789138231688e-05, + "loss": 5.4036, + "step": 4840 + }, + { + "epoch": 0.02879079836330764, + "grad_norm": 1.895071029663086, + "learning_rate": 4.989784920435959e-05, + "loss": 5.7259, + "step": 4841 + }, + { + "epoch": 0.02879674564658864, + "grad_norm": 2.0197908878326416, + "learning_rate": 4.989780701771071e-05, + "loss": 5.5114, + "step": 4842 + }, + { + "epoch": 0.028802692929869634, + "grad_norm": 1.9679557085037231, + "learning_rate": 4.989776482237025e-05, + "loss": 5.5798, + "step": 4843 + }, + { + "epoch": 0.028808640213150633, + "grad_norm": 1.980610728263855, + "learning_rate": 4.989772261833825e-05, + "loss": 5.5509, + "step": 4844 + }, + { + "epoch": 0.02881458749643163, + "grad_norm": 2.4565272331237793, + "learning_rate": 4.989768040561471e-05, + "loss": 5.4723, + "step": 4845 + }, + { + "epoch": 0.028820534779712626, + "grad_norm": 2.0567848682403564, + "learning_rate": 4.989763818419964e-05, + "loss": 5.546, + "step": 4846 + }, + { + "epoch": 0.028826482062993625, + "grad_norm": 2.0259108543395996, + "learning_rate": 4.989759595409307e-05, + "loss": 5.4138, + "step": 4847 + }, + { + "epoch": 0.028832429346274623, + "grad_norm": 1.9334442615509033, + "learning_rate": 4.9897553715295003e-05, + "loss": 5.7036, + "step": 4848 + }, + { + "epoch": 0.02883837662955562, + "grad_norm": 1.8335916996002197, + "learning_rate": 4.989751146780546e-05, + "loss": 5.6399, + "step": 4849 + }, + { + "epoch": 0.028844323912836617, + "grad_norm": 2.129821538925171, + "learning_rate": 4.989746921162445e-05, + "loss": 5.7108, + "step": 4850 + }, + { + "epoch": 0.028850271196117612, + "grad_norm": 2.4127001762390137, + "learning_rate": 4.9897426946751994e-05, + "loss": 5.3901, + "step": 4851 + }, + { + "epoch": 0.02885621847939861, + "grad_norm": 1.9506126642227173, + "learning_rate": 4.98973846731881e-05, + "loss": 5.7781, + "step": 4852 + }, + { + "epoch": 0.02886216576267961, + "grad_norm": 1.6746875047683716, + "learning_rate": 4.9897342390932786e-05, + "loss": 5.7408, + "step": 4853 + }, + { + "epoch": 0.028868113045960604, + "grad_norm": 1.95681893825531, + "learning_rate": 4.989730009998607e-05, + "loss": 5.7181, + "step": 4854 + }, + { + "epoch": 0.028874060329241603, + "grad_norm": 1.782030701637268, + "learning_rate": 4.9897257800347964e-05, + "loss": 5.5901, + "step": 4855 + }, + { + "epoch": 0.0288800076125226, + "grad_norm": 1.7590057849884033, + "learning_rate": 4.9897215492018476e-05, + "loss": 5.4566, + "step": 4856 + }, + { + "epoch": 0.028885954895803596, + "grad_norm": 2.4675025939941406, + "learning_rate": 4.989717317499764e-05, + "loss": 5.7738, + "step": 4857 + }, + { + "epoch": 0.028891902179084595, + "grad_norm": 2.221975326538086, + "learning_rate": 4.989713084928545e-05, + "loss": 5.591, + "step": 4858 + }, + { + "epoch": 0.02889784946236559, + "grad_norm": 2.21158504486084, + "learning_rate": 4.989708851488192e-05, + "loss": 5.7755, + "step": 4859 + }, + { + "epoch": 0.02890379674564659, + "grad_norm": 2.2253987789154053, + "learning_rate": 4.989704617178709e-05, + "loss": 5.8653, + "step": 4860 + }, + { + "epoch": 0.028909744028927587, + "grad_norm": 2.3298027515411377, + "learning_rate": 4.989700382000094e-05, + "loss": 5.3371, + "step": 4861 + }, + { + "epoch": 0.028915691312208582, + "grad_norm": 2.1918935775756836, + "learning_rate": 4.989696145952352e-05, + "loss": 5.4893, + "step": 4862 + }, + { + "epoch": 0.02892163859548958, + "grad_norm": 2.422117233276367, + "learning_rate": 4.989691909035482e-05, + "loss": 5.8775, + "step": 4863 + }, + { + "epoch": 0.02892758587877058, + "grad_norm": 2.4346981048583984, + "learning_rate": 4.989687671249487e-05, + "loss": 6.3671, + "step": 4864 + }, + { + "epoch": 0.028933533162051574, + "grad_norm": 2.094780921936035, + "learning_rate": 4.989683432594367e-05, + "loss": 5.7814, + "step": 4865 + }, + { + "epoch": 0.028939480445332573, + "grad_norm": 2.240318775177002, + "learning_rate": 4.9896791930701244e-05, + "loss": 5.6606, + "step": 4866 + }, + { + "epoch": 0.028945427728613568, + "grad_norm": 2.102381706237793, + "learning_rate": 4.989674952676761e-05, + "loss": 5.8477, + "step": 4867 + }, + { + "epoch": 0.028951375011894567, + "grad_norm": 2.2786238193511963, + "learning_rate": 4.989670711414277e-05, + "loss": 5.8786, + "step": 4868 + }, + { + "epoch": 0.028957322295175565, + "grad_norm": 2.079899549484253, + "learning_rate": 4.989666469282675e-05, + "loss": 6.2171, + "step": 4869 + }, + { + "epoch": 0.02896326957845656, + "grad_norm": 2.024061679840088, + "learning_rate": 4.989662226281956e-05, + "loss": 6.2889, + "step": 4870 + }, + { + "epoch": 0.02896921686173756, + "grad_norm": 2.1397578716278076, + "learning_rate": 4.989657982412122e-05, + "loss": 6.2477, + "step": 4871 + }, + { + "epoch": 0.028975164145018554, + "grad_norm": 2.1303393840789795, + "learning_rate": 4.989653737673174e-05, + "loss": 6.3005, + "step": 4872 + }, + { + "epoch": 0.028981111428299552, + "grad_norm": 2.4091451168060303, + "learning_rate": 4.989649492065114e-05, + "loss": 5.997, + "step": 4873 + }, + { + "epoch": 0.02898705871158055, + "grad_norm": 2.2236886024475098, + "learning_rate": 4.989645245587942e-05, + "loss": 5.7886, + "step": 4874 + }, + { + "epoch": 0.028993005994861546, + "grad_norm": 2.6160736083984375, + "learning_rate": 4.989640998241661e-05, + "loss": 6.1542, + "step": 4875 + }, + { + "epoch": 0.028998953278142545, + "grad_norm": 2.4163296222686768, + "learning_rate": 4.989636750026273e-05, + "loss": 6.392, + "step": 4876 + }, + { + "epoch": 0.029004900561423543, + "grad_norm": 2.079172372817993, + "learning_rate": 4.989632500941778e-05, + "loss": 6.2886, + "step": 4877 + }, + { + "epoch": 0.02901084784470454, + "grad_norm": 2.628694772720337, + "learning_rate": 4.989628250988178e-05, + "loss": 6.0359, + "step": 4878 + }, + { + "epoch": 0.029016795127985537, + "grad_norm": 2.2080392837524414, + "learning_rate": 4.989624000165474e-05, + "loss": 5.9916, + "step": 4879 + }, + { + "epoch": 0.029022742411266532, + "grad_norm": 2.4130380153656006, + "learning_rate": 4.9896197484736685e-05, + "loss": 6.3835, + "step": 4880 + }, + { + "epoch": 0.02902868969454753, + "grad_norm": 2.328511953353882, + "learning_rate": 4.989615495912762e-05, + "loss": 5.838, + "step": 4881 + }, + { + "epoch": 0.02903463697782853, + "grad_norm": 2.273345470428467, + "learning_rate": 4.989611242482757e-05, + "loss": 5.8764, + "step": 4882 + }, + { + "epoch": 0.029040584261109524, + "grad_norm": 2.1498537063598633, + "learning_rate": 4.9896069881836535e-05, + "loss": 6.1562, + "step": 4883 + }, + { + "epoch": 0.029046531544390523, + "grad_norm": 2.497267723083496, + "learning_rate": 4.989602733015455e-05, + "loss": 5.6708, + "step": 4884 + }, + { + "epoch": 0.02905247882767152, + "grad_norm": 2.232802152633667, + "learning_rate": 4.989598476978161e-05, + "loss": 5.6854, + "step": 4885 + }, + { + "epoch": 0.029058426110952516, + "grad_norm": 2.0582375526428223, + "learning_rate": 4.989594220071775e-05, + "loss": 6.5288, + "step": 4886 + }, + { + "epoch": 0.029064373394233515, + "grad_norm": 3.2556731700897217, + "learning_rate": 4.989589962296296e-05, + "loss": 5.9985, + "step": 4887 + }, + { + "epoch": 0.02907032067751451, + "grad_norm": 2.2807655334472656, + "learning_rate": 4.989585703651728e-05, + "loss": 6.1802, + "step": 4888 + }, + { + "epoch": 0.02907626796079551, + "grad_norm": 2.379136085510254, + "learning_rate": 4.989581444138071e-05, + "loss": 6.3531, + "step": 4889 + }, + { + "epoch": 0.029082215244076507, + "grad_norm": 2.9518685340881348, + "learning_rate": 4.989577183755327e-05, + "loss": 6.0689, + "step": 4890 + }, + { + "epoch": 0.029088162527357502, + "grad_norm": 2.823340654373169, + "learning_rate": 4.9895729225034973e-05, + "loss": 6.3405, + "step": 4891 + }, + { + "epoch": 0.0290941098106385, + "grad_norm": 2.4327731132507324, + "learning_rate": 4.989568660382583e-05, + "loss": 6.4928, + "step": 4892 + }, + { + "epoch": 0.0291000570939195, + "grad_norm": 2.0744240283966064, + "learning_rate": 4.9895643973925864e-05, + "loss": 6.2664, + "step": 4893 + }, + { + "epoch": 0.029106004377200494, + "grad_norm": 2.373710870742798, + "learning_rate": 4.9895601335335085e-05, + "loss": 5.9738, + "step": 4894 + }, + { + "epoch": 0.029111951660481493, + "grad_norm": 2.2934412956237793, + "learning_rate": 4.9895558688053505e-05, + "loss": 6.1353, + "step": 4895 + }, + { + "epoch": 0.029117898943762488, + "grad_norm": 2.4360926151275635, + "learning_rate": 4.989551603208114e-05, + "loss": 5.4768, + "step": 4896 + }, + { + "epoch": 0.029123846227043487, + "grad_norm": 2.8072469234466553, + "learning_rate": 4.989547336741802e-05, + "loss": 5.977, + "step": 4897 + }, + { + "epoch": 0.029129793510324485, + "grad_norm": 2.7759921550750732, + "learning_rate": 4.9895430694064135e-05, + "loss": 6.3918, + "step": 4898 + }, + { + "epoch": 0.02913574079360548, + "grad_norm": 2.4547574520111084, + "learning_rate": 4.989538801201953e-05, + "loss": 6.0461, + "step": 4899 + }, + { + "epoch": 0.02914168807688648, + "grad_norm": 2.6097168922424316, + "learning_rate": 4.9895345321284184e-05, + "loss": 5.88, + "step": 4900 + }, + { + "epoch": 0.029147635360167474, + "grad_norm": 2.8312575817108154, + "learning_rate": 4.989530262185814e-05, + "loss": 6.0314, + "step": 4901 + }, + { + "epoch": 0.029153582643448472, + "grad_norm": 2.928974151611328, + "learning_rate": 4.98952599137414e-05, + "loss": 6.3698, + "step": 4902 + }, + { + "epoch": 0.02915952992672947, + "grad_norm": 2.527578115463257, + "learning_rate": 4.989521719693398e-05, + "loss": 6.4301, + "step": 4903 + }, + { + "epoch": 0.029165477210010466, + "grad_norm": 2.392106771469116, + "learning_rate": 4.9895174471435904e-05, + "loss": 6.3515, + "step": 4904 + }, + { + "epoch": 0.029171424493291465, + "grad_norm": 1.9899437427520752, + "learning_rate": 4.989513173724717e-05, + "loss": 6.3265, + "step": 4905 + }, + { + "epoch": 0.029177371776572463, + "grad_norm": 2.057600736618042, + "learning_rate": 4.9895088994367806e-05, + "loss": 6.2402, + "step": 4906 + }, + { + "epoch": 0.029183319059853458, + "grad_norm": 2.8310391902923584, + "learning_rate": 4.989504624279783e-05, + "loss": 5.9056, + "step": 4907 + }, + { + "epoch": 0.029189266343134457, + "grad_norm": 2.904785394668579, + "learning_rate": 4.989500348253724e-05, + "loss": 5.8847, + "step": 4908 + }, + { + "epoch": 0.029195213626415452, + "grad_norm": 2.7728030681610107, + "learning_rate": 4.989496071358607e-05, + "loss": 5.8997, + "step": 4909 + }, + { + "epoch": 0.02920116090969645, + "grad_norm": 2.768862009048462, + "learning_rate": 4.989491793594432e-05, + "loss": 6.1267, + "step": 4910 + }, + { + "epoch": 0.02920710819297745, + "grad_norm": 2.4353668689727783, + "learning_rate": 4.989487514961201e-05, + "loss": 5.9087, + "step": 4911 + }, + { + "epoch": 0.029213055476258444, + "grad_norm": 2.5170469284057617, + "learning_rate": 4.9894832354589164e-05, + "loss": 6.0971, + "step": 4912 + }, + { + "epoch": 0.029219002759539443, + "grad_norm": 2.345998764038086, + "learning_rate": 4.9894789550875784e-05, + "loss": 6.2518, + "step": 4913 + }, + { + "epoch": 0.02922495004282044, + "grad_norm": 2.429123878479004, + "learning_rate": 4.98947467384719e-05, + "loss": 6.238, + "step": 4914 + }, + { + "epoch": 0.029230897326101436, + "grad_norm": 2.531514883041382, + "learning_rate": 4.9894703917377506e-05, + "loss": 6.0177, + "step": 4915 + }, + { + "epoch": 0.029236844609382435, + "grad_norm": 2.833874464035034, + "learning_rate": 4.9894661087592634e-05, + "loss": 6.2018, + "step": 4916 + }, + { + "epoch": 0.02924279189266343, + "grad_norm": 2.521381378173828, + "learning_rate": 4.9894618249117287e-05, + "loss": 6.1777, + "step": 4917 + }, + { + "epoch": 0.02924873917594443, + "grad_norm": 2.731703758239746, + "learning_rate": 4.989457540195149e-05, + "loss": 6.0237, + "step": 4918 + }, + { + "epoch": 0.029254686459225427, + "grad_norm": 2.918398141860962, + "learning_rate": 4.989453254609525e-05, + "loss": 6.5688, + "step": 4919 + }, + { + "epoch": 0.029260633742506422, + "grad_norm": 2.407552480697632, + "learning_rate": 4.989448968154859e-05, + "loss": 5.9751, + "step": 4920 + }, + { + "epoch": 0.02926658102578742, + "grad_norm": 2.575258731842041, + "learning_rate": 4.989444680831152e-05, + "loss": 5.7587, + "step": 4921 + }, + { + "epoch": 0.02927252830906842, + "grad_norm": 2.6550750732421875, + "learning_rate": 4.989440392638406e-05, + "loss": 6.6404, + "step": 4922 + }, + { + "epoch": 0.029278475592349414, + "grad_norm": 2.569438934326172, + "learning_rate": 4.989436103576621e-05, + "loss": 5.8615, + "step": 4923 + }, + { + "epoch": 0.029284422875630413, + "grad_norm": 2.4601991176605225, + "learning_rate": 4.989431813645801e-05, + "loss": 5.8969, + "step": 4924 + }, + { + "epoch": 0.029290370158911408, + "grad_norm": 3.579819917678833, + "learning_rate": 4.989427522845945e-05, + "loss": 5.8832, + "step": 4925 + }, + { + "epoch": 0.029296317442192406, + "grad_norm": 2.5762264728546143, + "learning_rate": 4.9894232311770556e-05, + "loss": 5.4841, + "step": 4926 + }, + { + "epoch": 0.029302264725473405, + "grad_norm": 3.352381706237793, + "learning_rate": 4.989418938639134e-05, + "loss": 5.8936, + "step": 4927 + }, + { + "epoch": 0.0293082120087544, + "grad_norm": 2.824322462081909, + "learning_rate": 4.9894146452321835e-05, + "loss": 5.8291, + "step": 4928 + }, + { + "epoch": 0.0293141592920354, + "grad_norm": 2.6431384086608887, + "learning_rate": 4.9894103509562026e-05, + "loss": 6.2519, + "step": 4929 + }, + { + "epoch": 0.029320106575316394, + "grad_norm": 3.0580949783325195, + "learning_rate": 4.989406055811195e-05, + "loss": 6.4141, + "step": 4930 + }, + { + "epoch": 0.029326053858597392, + "grad_norm": 2.757420778274536, + "learning_rate": 4.989401759797161e-05, + "loss": 6.1427, + "step": 4931 + }, + { + "epoch": 0.02933200114187839, + "grad_norm": 2.713111639022827, + "learning_rate": 4.989397462914103e-05, + "loss": 6.4107, + "step": 4932 + }, + { + "epoch": 0.029337948425159386, + "grad_norm": 2.7954351902008057, + "learning_rate": 4.9893931651620215e-05, + "loss": 5.7657, + "step": 4933 + }, + { + "epoch": 0.029343895708440385, + "grad_norm": 2.3637917041778564, + "learning_rate": 4.9893888665409196e-05, + "loss": 5.8209, + "step": 4934 + }, + { + "epoch": 0.029349842991721383, + "grad_norm": 2.938631296157837, + "learning_rate": 4.9893845670507964e-05, + "loss": 6.0502, + "step": 4935 + }, + { + "epoch": 0.029355790275002378, + "grad_norm": 2.8911824226379395, + "learning_rate": 4.989380266691655e-05, + "loss": 5.9736, + "step": 4936 + }, + { + "epoch": 0.029361737558283377, + "grad_norm": 2.9410245418548584, + "learning_rate": 4.989375965463498e-05, + "loss": 5.2824, + "step": 4937 + }, + { + "epoch": 0.029367684841564372, + "grad_norm": 2.4925217628479004, + "learning_rate": 4.9893716633663244e-05, + "loss": 5.5829, + "step": 4938 + }, + { + "epoch": 0.02937363212484537, + "grad_norm": 2.485349178314209, + "learning_rate": 4.9893673604001366e-05, + "loss": 5.8812, + "step": 4939 + }, + { + "epoch": 0.02937957940812637, + "grad_norm": 2.3950133323669434, + "learning_rate": 4.9893630565649376e-05, + "loss": 5.9314, + "step": 4940 + }, + { + "epoch": 0.029385526691407364, + "grad_norm": 2.28104829788208, + "learning_rate": 4.989358751860726e-05, + "loss": 6.1768, + "step": 4941 + }, + { + "epoch": 0.029391473974688363, + "grad_norm": 2.4479010105133057, + "learning_rate": 4.989354446287507e-05, + "loss": 6.1645, + "step": 4942 + }, + { + "epoch": 0.02939742125796936, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.989350139845279e-05, + "loss": 5.7145, + "step": 4943 + }, + { + "epoch": 0.029403368541250356, + "grad_norm": 2.4120032787323, + "learning_rate": 4.989345832534045e-05, + "loss": 5.695, + "step": 4944 + }, + { + "epoch": 0.029409315824531355, + "grad_norm": 2.6345109939575195, + "learning_rate": 4.989341524353805e-05, + "loss": 5.4805, + "step": 4945 + }, + { + "epoch": 0.02941526310781235, + "grad_norm": 2.8750240802764893, + "learning_rate": 4.989337215304563e-05, + "loss": 5.0352, + "step": 4946 + }, + { + "epoch": 0.02942121039109335, + "grad_norm": 2.7220489978790283, + "learning_rate": 4.989332905386318e-05, + "loss": 5.1646, + "step": 4947 + }, + { + "epoch": 0.029427157674374347, + "grad_norm": 2.464871883392334, + "learning_rate": 4.9893285945990734e-05, + "loss": 4.9989, + "step": 4948 + }, + { + "epoch": 0.029433104957655342, + "grad_norm": 2.261049270629883, + "learning_rate": 4.989324282942829e-05, + "loss": 6.2217, + "step": 4949 + }, + { + "epoch": 0.02943905224093634, + "grad_norm": 2.224818468093872, + "learning_rate": 4.9893199704175876e-05, + "loss": 6.3964, + "step": 4950 + }, + { + "epoch": 0.02944499952421734, + "grad_norm": 2.366520643234253, + "learning_rate": 4.989315657023351e-05, + "loss": 6.3572, + "step": 4951 + }, + { + "epoch": 0.029450946807498334, + "grad_norm": 2.4811010360717773, + "learning_rate": 4.989311342760119e-05, + "loss": 5.7867, + "step": 4952 + }, + { + "epoch": 0.029456894090779333, + "grad_norm": 2.246730089187622, + "learning_rate": 4.989307027627895e-05, + "loss": 6.0865, + "step": 4953 + }, + { + "epoch": 0.029462841374060328, + "grad_norm": 2.297379493713379, + "learning_rate": 4.989302711626679e-05, + "loss": 5.9257, + "step": 4954 + }, + { + "epoch": 0.029468788657341326, + "grad_norm": 2.5890488624572754, + "learning_rate": 4.989298394756473e-05, + "loss": 5.7631, + "step": 4955 + }, + { + "epoch": 0.029474735940622325, + "grad_norm": 3.3777449131011963, + "learning_rate": 4.989294077017279e-05, + "loss": 5.4014, + "step": 4956 + }, + { + "epoch": 0.02948068322390332, + "grad_norm": 2.0395402908325195, + "learning_rate": 4.9892897584090986e-05, + "loss": 6.2429, + "step": 4957 + }, + { + "epoch": 0.02948663050718432, + "grad_norm": 2.0414693355560303, + "learning_rate": 4.989285438931932e-05, + "loss": 6.4685, + "step": 4958 + }, + { + "epoch": 0.029492577790465314, + "grad_norm": 2.2383265495300293, + "learning_rate": 4.989281118585783e-05, + "loss": 6.1651, + "step": 4959 + }, + { + "epoch": 0.029498525073746312, + "grad_norm": 2.559720754623413, + "learning_rate": 4.98927679737065e-05, + "loss": 6.3822, + "step": 4960 + }, + { + "epoch": 0.02950447235702731, + "grad_norm": 2.810699939727783, + "learning_rate": 4.989272475286537e-05, + "loss": 6.2076, + "step": 4961 + }, + { + "epoch": 0.029510419640308306, + "grad_norm": 2.9151525497436523, + "learning_rate": 4.989268152333445e-05, + "loss": 5.9892, + "step": 4962 + }, + { + "epoch": 0.029516366923589304, + "grad_norm": 2.295197010040283, + "learning_rate": 4.9892638285113744e-05, + "loss": 6.1392, + "step": 4963 + }, + { + "epoch": 0.029522314206870303, + "grad_norm": 2.271088123321533, + "learning_rate": 4.989259503820328e-05, + "loss": 6.6991, + "step": 4964 + }, + { + "epoch": 0.029528261490151298, + "grad_norm": 2.338074207305908, + "learning_rate": 4.9892551782603064e-05, + "loss": 5.9615, + "step": 4965 + }, + { + "epoch": 0.029534208773432297, + "grad_norm": 2.3510494232177734, + "learning_rate": 4.989250851831312e-05, + "loss": 5.8894, + "step": 4966 + }, + { + "epoch": 0.029540156056713292, + "grad_norm": 2.1170454025268555, + "learning_rate": 4.989246524533345e-05, + "loss": 5.6921, + "step": 4967 + }, + { + "epoch": 0.02954610333999429, + "grad_norm": 3.289508104324341, + "learning_rate": 4.989242196366409e-05, + "loss": 6.1689, + "step": 4968 + }, + { + "epoch": 0.02955205062327529, + "grad_norm": 2.068229913711548, + "learning_rate": 4.989237867330504e-05, + "loss": 6.3342, + "step": 4969 + }, + { + "epoch": 0.029557997906556284, + "grad_norm": 2.198928117752075, + "learning_rate": 4.9892335374256316e-05, + "loss": 6.5125, + "step": 4970 + }, + { + "epoch": 0.029563945189837283, + "grad_norm": 2.3634228706359863, + "learning_rate": 4.989229206651793e-05, + "loss": 5.8328, + "step": 4971 + }, + { + "epoch": 0.02956989247311828, + "grad_norm": 2.1632115840911865, + "learning_rate": 4.989224875008991e-05, + "loss": 6.0702, + "step": 4972 + }, + { + "epoch": 0.029575839756399276, + "grad_norm": 2.461888313293457, + "learning_rate": 4.989220542497226e-05, + "loss": 6.01, + "step": 4973 + }, + { + "epoch": 0.029581787039680275, + "grad_norm": 2.668333053588867, + "learning_rate": 4.9892162091164997e-05, + "loss": 6.0369, + "step": 4974 + }, + { + "epoch": 0.02958773432296127, + "grad_norm": 3.0210723876953125, + "learning_rate": 4.9892118748668135e-05, + "loss": 6.0652, + "step": 4975 + }, + { + "epoch": 0.02959368160624227, + "grad_norm": 2.937350034713745, + "learning_rate": 4.98920753974817e-05, + "loss": 6.0205, + "step": 4976 + }, + { + "epoch": 0.029599628889523267, + "grad_norm": 2.904499053955078, + "learning_rate": 4.9892032037605685e-05, + "loss": 5.9561, + "step": 4977 + }, + { + "epoch": 0.029605576172804262, + "grad_norm": 2.218867778778076, + "learning_rate": 4.989198866904013e-05, + "loss": 5.4173, + "step": 4978 + }, + { + "epoch": 0.02961152345608526, + "grad_norm": 3.009920835494995, + "learning_rate": 4.9891945291785034e-05, + "loss": 5.5577, + "step": 4979 + }, + { + "epoch": 0.02961747073936626, + "grad_norm": 2.731687545776367, + "learning_rate": 4.9891901905840424e-05, + "loss": 5.6591, + "step": 4980 + }, + { + "epoch": 0.029623418022647254, + "grad_norm": 2.244101047515869, + "learning_rate": 4.98918585112063e-05, + "loss": 6.1434, + "step": 4981 + }, + { + "epoch": 0.029629365305928253, + "grad_norm": 2.3366870880126953, + "learning_rate": 4.989181510788269e-05, + "loss": 6.0132, + "step": 4982 + }, + { + "epoch": 0.029635312589209248, + "grad_norm": 3.2757890224456787, + "learning_rate": 4.98917716958696e-05, + "loss": 5.7486, + "step": 4983 + }, + { + "epoch": 0.029641259872490246, + "grad_norm": 2.361041784286499, + "learning_rate": 4.989172827516705e-05, + "loss": 5.8192, + "step": 4984 + }, + { + "epoch": 0.029647207155771245, + "grad_norm": 3.3433775901794434, + "learning_rate": 4.9891684845775054e-05, + "loss": 5.8688, + "step": 4985 + }, + { + "epoch": 0.02965315443905224, + "grad_norm": 2.6427462100982666, + "learning_rate": 4.9891641407693635e-05, + "loss": 5.9459, + "step": 4986 + }, + { + "epoch": 0.02965910172233324, + "grad_norm": 3.0931055545806885, + "learning_rate": 4.9891597960922795e-05, + "loss": 6.4822, + "step": 4987 + }, + { + "epoch": 0.029665049005614237, + "grad_norm": 2.598477840423584, + "learning_rate": 4.989155450546256e-05, + "loss": 6.0362, + "step": 4988 + }, + { + "epoch": 0.029670996288895232, + "grad_norm": 2.460313081741333, + "learning_rate": 4.989151104131294e-05, + "loss": 5.6209, + "step": 4989 + }, + { + "epoch": 0.02967694357217623, + "grad_norm": 2.4712390899658203, + "learning_rate": 4.989146756847395e-05, + "loss": 6.3849, + "step": 4990 + }, + { + "epoch": 0.029682890855457226, + "grad_norm": 2.365860939025879, + "learning_rate": 4.98914240869456e-05, + "loss": 6.2791, + "step": 4991 + }, + { + "epoch": 0.029688838138738224, + "grad_norm": 2.6213366985321045, + "learning_rate": 4.9891380596727915e-05, + "loss": 6.2888, + "step": 4992 + }, + { + "epoch": 0.029694785422019223, + "grad_norm": 2.742213487625122, + "learning_rate": 4.989133709782091e-05, + "loss": 6.3522, + "step": 4993 + }, + { + "epoch": 0.029700732705300218, + "grad_norm": 2.2428665161132812, + "learning_rate": 4.9891293590224594e-05, + "loss": 6.6735, + "step": 4994 + }, + { + "epoch": 0.029706679988581217, + "grad_norm": 2.4242279529571533, + "learning_rate": 4.989125007393898e-05, + "loss": 6.2283, + "step": 4995 + }, + { + "epoch": 0.02971262727186221, + "grad_norm": 2.422177314758301, + "learning_rate": 4.989120654896409e-05, + "loss": 6.0273, + "step": 4996 + }, + { + "epoch": 0.02971857455514321, + "grad_norm": 2.4325926303863525, + "learning_rate": 4.989116301529994e-05, + "loss": 5.9504, + "step": 4997 + }, + { + "epoch": 0.02972452183842421, + "grad_norm": 2.42901873588562, + "learning_rate": 4.9891119472946544e-05, + "loss": 5.8156, + "step": 4998 + }, + { + "epoch": 0.029730469121705204, + "grad_norm": 2.4361307621002197, + "learning_rate": 4.989107592190391e-05, + "loss": 5.9025, + "step": 4999 + }, + { + "epoch": 0.029736416404986202, + "grad_norm": 2.9486470222473145, + "learning_rate": 4.9891032362172065e-05, + "loss": 6.3204, + "step": 5000 + }, + { + "epoch": 0.0297423636882672, + "grad_norm": 2.456681966781616, + "learning_rate": 4.989098879375101e-05, + "loss": 5.8203, + "step": 5001 + }, + { + "epoch": 0.029748310971548196, + "grad_norm": 2.5065391063690186, + "learning_rate": 4.9890945216640775e-05, + "loss": 6.452, + "step": 5002 + }, + { + "epoch": 0.029754258254829195, + "grad_norm": 2.386488199234009, + "learning_rate": 4.989090163084136e-05, + "loss": 5.9195, + "step": 5003 + }, + { + "epoch": 0.02976020553811019, + "grad_norm": 2.1387040615081787, + "learning_rate": 4.9890858036352796e-05, + "loss": 6.2127, + "step": 5004 + }, + { + "epoch": 0.02976615282139119, + "grad_norm": 2.518099784851074, + "learning_rate": 4.989081443317508e-05, + "loss": 6.1099, + "step": 5005 + }, + { + "epoch": 0.029772100104672187, + "grad_norm": 3.2108826637268066, + "learning_rate": 4.989077082130825e-05, + "loss": 5.9808, + "step": 5006 + }, + { + "epoch": 0.029778047387953182, + "grad_norm": 2.176065444946289, + "learning_rate": 4.9890727200752304e-05, + "loss": 6.0825, + "step": 5007 + }, + { + "epoch": 0.02978399467123418, + "grad_norm": 2.2961249351501465, + "learning_rate": 4.9890683571507265e-05, + "loss": 5.968, + "step": 5008 + }, + { + "epoch": 0.02978994195451518, + "grad_norm": 2.1954386234283447, + "learning_rate": 4.9890639933573144e-05, + "loss": 6.0799, + "step": 5009 + }, + { + "epoch": 0.029795889237796174, + "grad_norm": 2.256039619445801, + "learning_rate": 4.989059628694995e-05, + "loss": 5.9503, + "step": 5010 + }, + { + "epoch": 0.029801836521077173, + "grad_norm": 2.4350922107696533, + "learning_rate": 4.9890552631637715e-05, + "loss": 5.6741, + "step": 5011 + }, + { + "epoch": 0.029807783804358168, + "grad_norm": 2.68904447555542, + "learning_rate": 4.989050896763645e-05, + "loss": 5.5872, + "step": 5012 + }, + { + "epoch": 0.029813731087639166, + "grad_norm": 2.2877871990203857, + "learning_rate": 4.989046529494615e-05, + "loss": 6.1273, + "step": 5013 + }, + { + "epoch": 0.029819678370920165, + "grad_norm": 2.350348711013794, + "learning_rate": 4.989042161356686e-05, + "loss": 6.1113, + "step": 5014 + }, + { + "epoch": 0.02982562565420116, + "grad_norm": 2.295382499694824, + "learning_rate": 4.989037792349858e-05, + "loss": 6.036, + "step": 5015 + }, + { + "epoch": 0.02983157293748216, + "grad_norm": 2.317863941192627, + "learning_rate": 4.989033422474131e-05, + "loss": 5.961, + "step": 5016 + }, + { + "epoch": 0.029837520220763157, + "grad_norm": 2.286289930343628, + "learning_rate": 4.9890290517295095e-05, + "loss": 5.8163, + "step": 5017 + }, + { + "epoch": 0.029843467504044152, + "grad_norm": 2.246863842010498, + "learning_rate": 4.989024680115993e-05, + "loss": 5.9689, + "step": 5018 + }, + { + "epoch": 0.02984941478732515, + "grad_norm": 1.8732661008834839, + "learning_rate": 4.989020307633585e-05, + "loss": 5.9046, + "step": 5019 + }, + { + "epoch": 0.029855362070606146, + "grad_norm": 2.0211753845214844, + "learning_rate": 4.989015934282285e-05, + "loss": 5.95, + "step": 5020 + }, + { + "epoch": 0.029861309353887144, + "grad_norm": 2.014890193939209, + "learning_rate": 4.9890115600620946e-05, + "loss": 5.7312, + "step": 5021 + }, + { + "epoch": 0.029867256637168143, + "grad_norm": 2.2749524116516113, + "learning_rate": 4.989007184973017e-05, + "loss": 6.2573, + "step": 5022 + }, + { + "epoch": 0.029873203920449138, + "grad_norm": 2.080747604370117, + "learning_rate": 4.989002809015052e-05, + "loss": 5.7607, + "step": 5023 + }, + { + "epoch": 0.029879151203730137, + "grad_norm": 2.3403279781341553, + "learning_rate": 4.988998432188202e-05, + "loss": 5.7876, + "step": 5024 + }, + { + "epoch": 0.02988509848701113, + "grad_norm": 2.573802947998047, + "learning_rate": 4.988994054492468e-05, + "loss": 5.9036, + "step": 5025 + }, + { + "epoch": 0.02989104577029213, + "grad_norm": 2.267409324645996, + "learning_rate": 4.988989675927853e-05, + "loss": 5.7433, + "step": 5026 + }, + { + "epoch": 0.02989699305357313, + "grad_norm": 2.8241517543792725, + "learning_rate": 4.9889852964943566e-05, + "loss": 6.2338, + "step": 5027 + }, + { + "epoch": 0.029902940336854124, + "grad_norm": 2.338927745819092, + "learning_rate": 4.988980916191982e-05, + "loss": 6.0226, + "step": 5028 + }, + { + "epoch": 0.029908887620135122, + "grad_norm": 2.0798492431640625, + "learning_rate": 4.9889765350207285e-05, + "loss": 5.6919, + "step": 5029 + }, + { + "epoch": 0.02991483490341612, + "grad_norm": 2.3199923038482666, + "learning_rate": 4.9889721529806e-05, + "loss": 5.7533, + "step": 5030 + }, + { + "epoch": 0.029920782186697116, + "grad_norm": 2.1074399948120117, + "learning_rate": 4.988967770071596e-05, + "loss": 5.7486, + "step": 5031 + }, + { + "epoch": 0.029926729469978115, + "grad_norm": 2.2539381980895996, + "learning_rate": 4.9889633862937205e-05, + "loss": 5.6816, + "step": 5032 + }, + { + "epoch": 0.02993267675325911, + "grad_norm": 2.1393015384674072, + "learning_rate": 4.9889590016469726e-05, + "loss": 5.6635, + "step": 5033 + }, + { + "epoch": 0.029938624036540108, + "grad_norm": 2.6661975383758545, + "learning_rate": 4.988954616131355e-05, + "loss": 6.0218, + "step": 5034 + }, + { + "epoch": 0.029944571319821107, + "grad_norm": 2.6529600620269775, + "learning_rate": 4.988950229746869e-05, + "loss": 5.8847, + "step": 5035 + }, + { + "epoch": 0.029950518603102102, + "grad_norm": 2.510859966278076, + "learning_rate": 4.988945842493517e-05, + "loss": 5.7154, + "step": 5036 + }, + { + "epoch": 0.0299564658863831, + "grad_norm": 2.875394105911255, + "learning_rate": 4.9889414543712985e-05, + "loss": 5.6304, + "step": 5037 + }, + { + "epoch": 0.0299624131696641, + "grad_norm": 2.718808650970459, + "learning_rate": 4.988937065380217e-05, + "loss": 5.6562, + "step": 5038 + }, + { + "epoch": 0.029968360452945094, + "grad_norm": 2.702265501022339, + "learning_rate": 4.988932675520273e-05, + "loss": 5.6484, + "step": 5039 + }, + { + "epoch": 0.029974307736226093, + "grad_norm": 2.765209436416626, + "learning_rate": 4.988928284791469e-05, + "loss": 5.793, + "step": 5040 + }, + { + "epoch": 0.029980255019507088, + "grad_norm": 3.386352062225342, + "learning_rate": 4.9889238931938047e-05, + "loss": 5.5392, + "step": 5041 + }, + { + "epoch": 0.029986202302788086, + "grad_norm": 2.1632583141326904, + "learning_rate": 4.988919500727284e-05, + "loss": 5.8032, + "step": 5042 + }, + { + "epoch": 0.029992149586069085, + "grad_norm": 2.4121060371398926, + "learning_rate": 4.9889151073919064e-05, + "loss": 5.9793, + "step": 5043 + }, + { + "epoch": 0.02999809686935008, + "grad_norm": 2.2160584926605225, + "learning_rate": 4.988910713187674e-05, + "loss": 5.8802, + "step": 5044 + }, + { + "epoch": 0.03000404415263108, + "grad_norm": 3.120509386062622, + "learning_rate": 4.988906318114589e-05, + "loss": 5.5691, + "step": 5045 + }, + { + "epoch": 0.030009991435912077, + "grad_norm": 3.0660078525543213, + "learning_rate": 4.988901922172652e-05, + "loss": 5.3687, + "step": 5046 + }, + { + "epoch": 0.030015938719193072, + "grad_norm": 1.939757227897644, + "learning_rate": 4.988897525361867e-05, + "loss": 5.526, + "step": 5047 + }, + { + "epoch": 0.03002188600247407, + "grad_norm": 2.2970168590545654, + "learning_rate": 4.9888931276822315e-05, + "loss": 5.6334, + "step": 5048 + }, + { + "epoch": 0.030027833285755066, + "grad_norm": 2.162632942199707, + "learning_rate": 4.988888729133749e-05, + "loss": 5.8887, + "step": 5049 + }, + { + "epoch": 0.030033780569036064, + "grad_norm": 2.027017831802368, + "learning_rate": 4.9888843297164223e-05, + "loss": 5.9237, + "step": 5050 + }, + { + "epoch": 0.030039727852317063, + "grad_norm": 1.9226456880569458, + "learning_rate": 4.988879929430251e-05, + "loss": 5.6833, + "step": 5051 + }, + { + "epoch": 0.030045675135598058, + "grad_norm": 1.6490615606307983, + "learning_rate": 4.9888755282752384e-05, + "loss": 5.5738, + "step": 5052 + }, + { + "epoch": 0.030051622418879056, + "grad_norm": 2.456385850906372, + "learning_rate": 4.9888711262513846e-05, + "loss": 5.3771, + "step": 5053 + }, + { + "epoch": 0.03005756970216005, + "grad_norm": 2.480044364929199, + "learning_rate": 4.988866723358692e-05, + "loss": 5.2456, + "step": 5054 + }, + { + "epoch": 0.03006351698544105, + "grad_norm": 2.4033162593841553, + "learning_rate": 4.988862319597161e-05, + "loss": 5.1629, + "step": 5055 + }, + { + "epoch": 0.03006946426872205, + "grad_norm": 2.7228541374206543, + "learning_rate": 4.9888579149667935e-05, + "loss": 5.0195, + "step": 5056 + }, + { + "epoch": 0.030075411552003044, + "grad_norm": 2.4641635417938232, + "learning_rate": 4.9888535094675926e-05, + "loss": 5.3259, + "step": 5057 + }, + { + "epoch": 0.030081358835284042, + "grad_norm": 2.443666458129883, + "learning_rate": 4.9888491030995575e-05, + "loss": 5.4212, + "step": 5058 + }, + { + "epoch": 0.03008730611856504, + "grad_norm": 2.3267531394958496, + "learning_rate": 4.988844695862692e-05, + "loss": 5.6517, + "step": 5059 + }, + { + "epoch": 0.030093253401846036, + "grad_norm": 1.9090640544891357, + "learning_rate": 4.988840287756996e-05, + "loss": 5.7946, + "step": 5060 + }, + { + "epoch": 0.030099200685127035, + "grad_norm": 1.6169202327728271, + "learning_rate": 4.988835878782472e-05, + "loss": 5.7332, + "step": 5061 + }, + { + "epoch": 0.03010514796840803, + "grad_norm": 1.9369432926177979, + "learning_rate": 4.9888314689391205e-05, + "loss": 5.5954, + "step": 5062 + }, + { + "epoch": 0.030111095251689028, + "grad_norm": 2.0444133281707764, + "learning_rate": 4.9888270582269434e-05, + "loss": 5.5332, + "step": 5063 + }, + { + "epoch": 0.030117042534970027, + "grad_norm": 1.949061632156372, + "learning_rate": 4.988822646645943e-05, + "loss": 5.6064, + "step": 5064 + }, + { + "epoch": 0.030122989818251022, + "grad_norm": 1.5208648443222046, + "learning_rate": 4.988818234196121e-05, + "loss": 5.6615, + "step": 5065 + }, + { + "epoch": 0.03012893710153202, + "grad_norm": 1.8466709852218628, + "learning_rate": 4.988813820877477e-05, + "loss": 5.79, + "step": 5066 + }, + { + "epoch": 0.03013488438481302, + "grad_norm": 1.7094037532806396, + "learning_rate": 4.988809406690015e-05, + "loss": 5.8194, + "step": 5067 + }, + { + "epoch": 0.030140831668094014, + "grad_norm": 1.5698916912078857, + "learning_rate": 4.988804991633734e-05, + "loss": 5.5981, + "step": 5068 + }, + { + "epoch": 0.030146778951375013, + "grad_norm": 2.032156467437744, + "learning_rate": 4.988800575708638e-05, + "loss": 5.6729, + "step": 5069 + }, + { + "epoch": 0.030152726234656008, + "grad_norm": 1.9716484546661377, + "learning_rate": 4.988796158914727e-05, + "loss": 5.5227, + "step": 5070 + }, + { + "epoch": 0.030158673517937006, + "grad_norm": 1.8809682130813599, + "learning_rate": 4.988791741252002e-05, + "loss": 5.6231, + "step": 5071 + }, + { + "epoch": 0.030164620801218005, + "grad_norm": 1.8293371200561523, + "learning_rate": 4.9887873227204675e-05, + "loss": 5.5067, + "step": 5072 + }, + { + "epoch": 0.030170568084499, + "grad_norm": 2.225281000137329, + "learning_rate": 4.988782903320122e-05, + "loss": 5.3056, + "step": 5073 + }, + { + "epoch": 0.03017651536778, + "grad_norm": 2.0776474475860596, + "learning_rate": 4.988778483050968e-05, + "loss": 5.206, + "step": 5074 + }, + { + "epoch": 0.030182462651060997, + "grad_norm": 2.068323850631714, + "learning_rate": 4.9887740619130076e-05, + "loss": 5.5975, + "step": 5075 + }, + { + "epoch": 0.030188409934341992, + "grad_norm": 2.077782392501831, + "learning_rate": 4.988769639906241e-05, + "loss": 5.6967, + "step": 5076 + }, + { + "epoch": 0.03019435721762299, + "grad_norm": 1.9837195873260498, + "learning_rate": 4.988765217030672e-05, + "loss": 5.7834, + "step": 5077 + }, + { + "epoch": 0.030200304500903986, + "grad_norm": 1.9612236022949219, + "learning_rate": 4.9887607932863e-05, + "loss": 5.5472, + "step": 5078 + }, + { + "epoch": 0.030206251784184984, + "grad_norm": 2.022251605987549, + "learning_rate": 4.988756368673127e-05, + "loss": 5.704, + "step": 5079 + }, + { + "epoch": 0.030212199067465983, + "grad_norm": 2.02227783203125, + "learning_rate": 4.988751943191156e-05, + "loss": 5.4125, + "step": 5080 + }, + { + "epoch": 0.030218146350746978, + "grad_norm": 2.0527732372283936, + "learning_rate": 4.9887475168403856e-05, + "loss": 5.464, + "step": 5081 + }, + { + "epoch": 0.030224093634027976, + "grad_norm": 2.1465423107147217, + "learning_rate": 4.9887430896208205e-05, + "loss": 5.3415, + "step": 5082 + }, + { + "epoch": 0.03023004091730897, + "grad_norm": 1.9170550107955933, + "learning_rate": 4.9887386615324606e-05, + "loss": 5.5762, + "step": 5083 + }, + { + "epoch": 0.03023598820058997, + "grad_norm": 3.367650032043457, + "learning_rate": 4.988734232575307e-05, + "loss": 6.26, + "step": 5084 + }, + { + "epoch": 0.03024193548387097, + "grad_norm": 2.0784621238708496, + "learning_rate": 4.988729802749363e-05, + "loss": 5.5316, + "step": 5085 + }, + { + "epoch": 0.030247882767151964, + "grad_norm": 1.9531089067459106, + "learning_rate": 4.988725372054629e-05, + "loss": 5.5901, + "step": 5086 + }, + { + "epoch": 0.030253830050432962, + "grad_norm": 1.9677239656448364, + "learning_rate": 4.988720940491106e-05, + "loss": 5.4963, + "step": 5087 + }, + { + "epoch": 0.03025977733371396, + "grad_norm": 1.9835426807403564, + "learning_rate": 4.988716508058797e-05, + "loss": 5.6355, + "step": 5088 + }, + { + "epoch": 0.030265724616994956, + "grad_norm": 1.908250331878662, + "learning_rate": 4.988712074757703e-05, + "loss": 5.165, + "step": 5089 + }, + { + "epoch": 0.030271671900275954, + "grad_norm": 1.9852073192596436, + "learning_rate": 4.9887076405878246e-05, + "loss": 5.6623, + "step": 5090 + }, + { + "epoch": 0.03027761918355695, + "grad_norm": 1.9073505401611328, + "learning_rate": 4.988703205549164e-05, + "loss": 5.6685, + "step": 5091 + }, + { + "epoch": 0.030283566466837948, + "grad_norm": 1.744931697845459, + "learning_rate": 4.988698769641724e-05, + "loss": 5.4004, + "step": 5092 + }, + { + "epoch": 0.030289513750118947, + "grad_norm": 2.0623345375061035, + "learning_rate": 4.9886943328655034e-05, + "loss": 5.3846, + "step": 5093 + }, + { + "epoch": 0.030295461033399942, + "grad_norm": 1.647375226020813, + "learning_rate": 4.9886898952205064e-05, + "loss": 5.5823, + "step": 5094 + }, + { + "epoch": 0.03030140831668094, + "grad_norm": 2.2364108562469482, + "learning_rate": 4.9886854567067334e-05, + "loss": 5.5959, + "step": 5095 + }, + { + "epoch": 0.03030735559996194, + "grad_norm": 2.059187650680542, + "learning_rate": 4.988681017324185e-05, + "loss": 5.6043, + "step": 5096 + }, + { + "epoch": 0.030313302883242934, + "grad_norm": 1.8996437788009644, + "learning_rate": 4.988676577072865e-05, + "loss": 5.4366, + "step": 5097 + }, + { + "epoch": 0.030319250166523933, + "grad_norm": 2.0983266830444336, + "learning_rate": 4.988672135952773e-05, + "loss": 5.5568, + "step": 5098 + }, + { + "epoch": 0.030325197449804928, + "grad_norm": 2.065119743347168, + "learning_rate": 4.988667693963911e-05, + "loss": 5.4239, + "step": 5099 + }, + { + "epoch": 0.030331144733085926, + "grad_norm": 1.9394044876098633, + "learning_rate": 4.988663251106282e-05, + "loss": 5.573, + "step": 5100 + }, + { + "epoch": 0.030337092016366925, + "grad_norm": 2.225097417831421, + "learning_rate": 4.9886588073798855e-05, + "loss": 5.5877, + "step": 5101 + }, + { + "epoch": 0.03034303929964792, + "grad_norm": 2.185018539428711, + "learning_rate": 4.9886543627847236e-05, + "loss": 5.6884, + "step": 5102 + }, + { + "epoch": 0.03034898658292892, + "grad_norm": 1.9751871824264526, + "learning_rate": 4.988649917320799e-05, + "loss": 5.4836, + "step": 5103 + }, + { + "epoch": 0.030354933866209917, + "grad_norm": 1.8753101825714111, + "learning_rate": 4.988645470988113e-05, + "loss": 5.4049, + "step": 5104 + }, + { + "epoch": 0.030360881149490912, + "grad_norm": 2.12246036529541, + "learning_rate": 4.988641023786665e-05, + "loss": 5.5365, + "step": 5105 + }, + { + "epoch": 0.03036682843277191, + "grad_norm": 2.1078991889953613, + "learning_rate": 4.988636575716459e-05, + "loss": 5.5269, + "step": 5106 + }, + { + "epoch": 0.030372775716052906, + "grad_norm": 1.9127923250198364, + "learning_rate": 4.9886321267774946e-05, + "loss": 5.48, + "step": 5107 + }, + { + "epoch": 0.030378722999333904, + "grad_norm": 1.8971906900405884, + "learning_rate": 4.988627676969776e-05, + "loss": 5.5202, + "step": 5108 + }, + { + "epoch": 0.030384670282614903, + "grad_norm": 2.162097454071045, + "learning_rate": 4.9886232262933024e-05, + "loss": 5.5229, + "step": 5109 + }, + { + "epoch": 0.030390617565895898, + "grad_norm": 2.21211838722229, + "learning_rate": 4.988618774748076e-05, + "loss": 5.3648, + "step": 5110 + }, + { + "epoch": 0.030396564849176896, + "grad_norm": 1.8907619714736938, + "learning_rate": 4.988614322334099e-05, + "loss": 5.4338, + "step": 5111 + }, + { + "epoch": 0.030402512132457895, + "grad_norm": 2.0131993293762207, + "learning_rate": 4.9886098690513725e-05, + "loss": 5.4005, + "step": 5112 + }, + { + "epoch": 0.03040845941573889, + "grad_norm": 1.9474748373031616, + "learning_rate": 4.9886054148998975e-05, + "loss": 5.5544, + "step": 5113 + }, + { + "epoch": 0.03041440669901989, + "grad_norm": 1.9809894561767578, + "learning_rate": 4.988600959879676e-05, + "loss": 5.6204, + "step": 5114 + }, + { + "epoch": 0.030420353982300884, + "grad_norm": 2.1792514324188232, + "learning_rate": 4.9885965039907104e-05, + "loss": 5.5368, + "step": 5115 + }, + { + "epoch": 0.030426301265581882, + "grad_norm": 2.050903081893921, + "learning_rate": 4.9885920472330004e-05, + "loss": 5.4717, + "step": 5116 + }, + { + "epoch": 0.03043224854886288, + "grad_norm": 1.9938042163848877, + "learning_rate": 4.988587589606549e-05, + "loss": 5.5373, + "step": 5117 + }, + { + "epoch": 0.030438195832143876, + "grad_norm": 1.7375110387802124, + "learning_rate": 4.988583131111358e-05, + "loss": 5.5621, + "step": 5118 + }, + { + "epoch": 0.030444143115424874, + "grad_norm": 2.077605962753296, + "learning_rate": 4.988578671747428e-05, + "loss": 5.5451, + "step": 5119 + }, + { + "epoch": 0.03045009039870587, + "grad_norm": 2.071706771850586, + "learning_rate": 4.988574211514761e-05, + "loss": 5.327, + "step": 5120 + }, + { + "epoch": 0.030456037681986868, + "grad_norm": 1.8317911624908447, + "learning_rate": 4.9885697504133574e-05, + "loss": 5.4123, + "step": 5121 + }, + { + "epoch": 0.030461984965267867, + "grad_norm": 2.1231188774108887, + "learning_rate": 4.988565288443221e-05, + "loss": 5.3789, + "step": 5122 + }, + { + "epoch": 0.03046793224854886, + "grad_norm": 2.1298999786376953, + "learning_rate": 4.988560825604352e-05, + "loss": 5.4382, + "step": 5123 + }, + { + "epoch": 0.03047387953182986, + "grad_norm": 1.791053056716919, + "learning_rate": 4.9885563618967525e-05, + "loss": 5.3918, + "step": 5124 + }, + { + "epoch": 0.03047982681511086, + "grad_norm": 1.9610999822616577, + "learning_rate": 4.988551897320423e-05, + "loss": 5.3232, + "step": 5125 + }, + { + "epoch": 0.030485774098391854, + "grad_norm": 1.9926520586013794, + "learning_rate": 4.9885474318753654e-05, + "loss": 5.4316, + "step": 5126 + }, + { + "epoch": 0.030491721381672852, + "grad_norm": 1.8942431211471558, + "learning_rate": 4.988542965561582e-05, + "loss": 5.4055, + "step": 5127 + }, + { + "epoch": 0.030497668664953848, + "grad_norm": 1.7872856855392456, + "learning_rate": 4.988538498379074e-05, + "loss": 5.5117, + "step": 5128 + }, + { + "epoch": 0.030503615948234846, + "grad_norm": 2.040205478668213, + "learning_rate": 4.988534030327843e-05, + "loss": 5.4068, + "step": 5129 + }, + { + "epoch": 0.030509563231515845, + "grad_norm": 2.0108931064605713, + "learning_rate": 4.988529561407891e-05, + "loss": 5.3636, + "step": 5130 + }, + { + "epoch": 0.03051551051479684, + "grad_norm": 2.0339555740356445, + "learning_rate": 4.988525091619218e-05, + "loss": 5.2811, + "step": 5131 + }, + { + "epoch": 0.03052145779807784, + "grad_norm": 1.7631195783615112, + "learning_rate": 4.988520620961828e-05, + "loss": 5.3407, + "step": 5132 + }, + { + "epoch": 0.030527405081358837, + "grad_norm": 1.6906533241271973, + "learning_rate": 4.988516149435719e-05, + "loss": 5.3121, + "step": 5133 + }, + { + "epoch": 0.030533352364639832, + "grad_norm": 2.0753448009490967, + "learning_rate": 4.988511677040897e-05, + "loss": 5.4532, + "step": 5134 + }, + { + "epoch": 0.03053929964792083, + "grad_norm": 1.9836634397506714, + "learning_rate": 4.9885072037773595e-05, + "loss": 5.4345, + "step": 5135 + }, + { + "epoch": 0.030545246931201826, + "grad_norm": 1.8526780605316162, + "learning_rate": 4.988502729645111e-05, + "loss": 5.446, + "step": 5136 + }, + { + "epoch": 0.030551194214482824, + "grad_norm": 2.126626968383789, + "learning_rate": 4.988498254644152e-05, + "loss": 5.703, + "step": 5137 + }, + { + "epoch": 0.030557141497763823, + "grad_norm": 1.9711220264434814, + "learning_rate": 4.988493778774483e-05, + "loss": 5.5872, + "step": 5138 + }, + { + "epoch": 0.030563088781044818, + "grad_norm": 2.070727586746216, + "learning_rate": 4.988489302036107e-05, + "loss": 5.4407, + "step": 5139 + }, + { + "epoch": 0.030569036064325816, + "grad_norm": 2.1414859294891357, + "learning_rate": 4.988484824429025e-05, + "loss": 5.5291, + "step": 5140 + }, + { + "epoch": 0.030574983347606815, + "grad_norm": 2.01366925239563, + "learning_rate": 4.9884803459532384e-05, + "loss": 5.3561, + "step": 5141 + }, + { + "epoch": 0.03058093063088781, + "grad_norm": 1.851836085319519, + "learning_rate": 4.988475866608749e-05, + "loss": 5.679, + "step": 5142 + }, + { + "epoch": 0.03058687791416881, + "grad_norm": 1.6984909772872925, + "learning_rate": 4.988471386395559e-05, + "loss": 5.6075, + "step": 5143 + }, + { + "epoch": 0.030592825197449804, + "grad_norm": 1.9371756315231323, + "learning_rate": 4.9884669053136696e-05, + "loss": 5.7062, + "step": 5144 + }, + { + "epoch": 0.030598772480730802, + "grad_norm": 1.9286617040634155, + "learning_rate": 4.9884624233630815e-05, + "loss": 5.573, + "step": 5145 + }, + { + "epoch": 0.0306047197640118, + "grad_norm": 2.7633650302886963, + "learning_rate": 4.988457940543797e-05, + "loss": 6.2082, + "step": 5146 + }, + { + "epoch": 0.030610667047292796, + "grad_norm": 2.6948676109313965, + "learning_rate": 4.9884534568558173e-05, + "loss": 5.7475, + "step": 5147 + }, + { + "epoch": 0.030616614330573794, + "grad_norm": 2.1618316173553467, + "learning_rate": 4.988448972299145e-05, + "loss": 5.4049, + "step": 5148 + }, + { + "epoch": 0.03062256161385479, + "grad_norm": 2.417043685913086, + "learning_rate": 4.98844448687378e-05, + "loss": 5.3663, + "step": 5149 + }, + { + "epoch": 0.030628508897135788, + "grad_norm": 1.9748867750167847, + "learning_rate": 4.988440000579725e-05, + "loss": 5.1876, + "step": 5150 + }, + { + "epoch": 0.030634456180416787, + "grad_norm": 2.0534770488739014, + "learning_rate": 4.988435513416981e-05, + "loss": 5.4519, + "step": 5151 + }, + { + "epoch": 0.03064040346369778, + "grad_norm": 1.9772714376449585, + "learning_rate": 4.98843102538555e-05, + "loss": 5.5241, + "step": 5152 + }, + { + "epoch": 0.03064635074697878, + "grad_norm": 2.4160993099212646, + "learning_rate": 4.988426536485434e-05, + "loss": 5.6535, + "step": 5153 + }, + { + "epoch": 0.03065229803025978, + "grad_norm": 1.9931175708770752, + "learning_rate": 4.9884220467166345e-05, + "loss": 5.6693, + "step": 5154 + }, + { + "epoch": 0.030658245313540774, + "grad_norm": 1.9071956872940063, + "learning_rate": 4.9884175560791516e-05, + "loss": 5.5533, + "step": 5155 + }, + { + "epoch": 0.030664192596821772, + "grad_norm": 1.8562983274459839, + "learning_rate": 4.9884130645729876e-05, + "loss": 5.5621, + "step": 5156 + }, + { + "epoch": 0.030670139880102767, + "grad_norm": 2.087606430053711, + "learning_rate": 4.9884085721981446e-05, + "loss": 5.5256, + "step": 5157 + }, + { + "epoch": 0.030676087163383766, + "grad_norm": 2.3242955207824707, + "learning_rate": 4.988404078954624e-05, + "loss": 5.3906, + "step": 5158 + }, + { + "epoch": 0.030682034446664765, + "grad_norm": 2.221330404281616, + "learning_rate": 4.988399584842427e-05, + "loss": 5.5719, + "step": 5159 + }, + { + "epoch": 0.03068798172994576, + "grad_norm": 1.7819960117340088, + "learning_rate": 4.988395089861556e-05, + "loss": 5.5823, + "step": 5160 + }, + { + "epoch": 0.030693929013226758, + "grad_norm": 1.781802773475647, + "learning_rate": 4.988390594012011e-05, + "loss": 5.6087, + "step": 5161 + }, + { + "epoch": 0.030699876296507757, + "grad_norm": 2.0003581047058105, + "learning_rate": 4.988386097293796e-05, + "loss": 5.5695, + "step": 5162 + }, + { + "epoch": 0.030705823579788752, + "grad_norm": 1.9411736726760864, + "learning_rate": 4.98838159970691e-05, + "loss": 5.441, + "step": 5163 + }, + { + "epoch": 0.03071177086306975, + "grad_norm": 2.159541368484497, + "learning_rate": 4.9883771012513556e-05, + "loss": 5.6191, + "step": 5164 + }, + { + "epoch": 0.030717718146350746, + "grad_norm": 2.1045689582824707, + "learning_rate": 4.988372601927135e-05, + "loss": 5.3261, + "step": 5165 + }, + { + "epoch": 0.030723665429631744, + "grad_norm": 2.004770040512085, + "learning_rate": 4.988368101734249e-05, + "loss": 5.3392, + "step": 5166 + }, + { + "epoch": 0.030729612712912743, + "grad_norm": 2.1851232051849365, + "learning_rate": 4.9883636006726996e-05, + "loss": 5.3048, + "step": 5167 + }, + { + "epoch": 0.030735559996193738, + "grad_norm": 2.1333882808685303, + "learning_rate": 4.988359098742488e-05, + "loss": 5.336, + "step": 5168 + }, + { + "epoch": 0.030741507279474736, + "grad_norm": 2.1911604404449463, + "learning_rate": 4.9883545959436165e-05, + "loss": 5.757, + "step": 5169 + }, + { + "epoch": 0.030747454562755735, + "grad_norm": 2.0385994911193848, + "learning_rate": 4.988350092276085e-05, + "loss": 5.7889, + "step": 5170 + }, + { + "epoch": 0.03075340184603673, + "grad_norm": 2.2300381660461426, + "learning_rate": 4.988345587739897e-05, + "loss": 5.3812, + "step": 5171 + }, + { + "epoch": 0.03075934912931773, + "grad_norm": 2.4643938541412354, + "learning_rate": 4.988341082335053e-05, + "loss": 5.2503, + "step": 5172 + }, + { + "epoch": 0.030765296412598724, + "grad_norm": 2.0791194438934326, + "learning_rate": 4.988336576061555e-05, + "loss": 5.2958, + "step": 5173 + }, + { + "epoch": 0.030771243695879722, + "grad_norm": 2.1123111248016357, + "learning_rate": 4.988332068919405e-05, + "loss": 5.3656, + "step": 5174 + }, + { + "epoch": 0.03077719097916072, + "grad_norm": 2.199747323989868, + "learning_rate": 4.9883275609086026e-05, + "loss": 5.7015, + "step": 5175 + }, + { + "epoch": 0.030783138262441716, + "grad_norm": 2.0083510875701904, + "learning_rate": 4.988323052029151e-05, + "loss": 5.7068, + "step": 5176 + }, + { + "epoch": 0.030789085545722714, + "grad_norm": 2.1027777194976807, + "learning_rate": 4.988318542281053e-05, + "loss": 5.6986, + "step": 5177 + }, + { + "epoch": 0.03079503282900371, + "grad_norm": 1.8593190908432007, + "learning_rate": 4.9883140316643074e-05, + "loss": 5.7194, + "step": 5178 + }, + { + "epoch": 0.030800980112284708, + "grad_norm": 1.9712544679641724, + "learning_rate": 4.988309520178918e-05, + "loss": 5.6472, + "step": 5179 + }, + { + "epoch": 0.030806927395565707, + "grad_norm": 2.1114501953125, + "learning_rate": 4.9883050078248836e-05, + "loss": 5.6767, + "step": 5180 + }, + { + "epoch": 0.0308128746788467, + "grad_norm": 3.0505895614624023, + "learning_rate": 4.988300494602209e-05, + "loss": 5.3705, + "step": 5181 + }, + { + "epoch": 0.0308188219621277, + "grad_norm": 2.648364782333374, + "learning_rate": 4.988295980510895e-05, + "loss": 5.3072, + "step": 5182 + }, + { + "epoch": 0.0308247692454087, + "grad_norm": 2.2162837982177734, + "learning_rate": 4.9882914655509414e-05, + "loss": 5.3359, + "step": 5183 + }, + { + "epoch": 0.030830716528689694, + "grad_norm": 2.16666316986084, + "learning_rate": 4.988286949722352e-05, + "loss": 5.3446, + "step": 5184 + }, + { + "epoch": 0.030836663811970692, + "grad_norm": 2.951157569885254, + "learning_rate": 4.988282433025126e-05, + "loss": 5.7776, + "step": 5185 + }, + { + "epoch": 0.030842611095251687, + "grad_norm": 2.9967124462127686, + "learning_rate": 4.988277915459267e-05, + "loss": 5.6004, + "step": 5186 + }, + { + "epoch": 0.030848558378532686, + "grad_norm": 2.3998372554779053, + "learning_rate": 4.988273397024777e-05, + "loss": 5.3562, + "step": 5187 + }, + { + "epoch": 0.030854505661813685, + "grad_norm": 2.290592670440674, + "learning_rate": 4.9882688777216544e-05, + "loss": 5.3211, + "step": 5188 + }, + { + "epoch": 0.03086045294509468, + "grad_norm": 2.0349433422088623, + "learning_rate": 4.988264357549904e-05, + "loss": 5.2917, + "step": 5189 + }, + { + "epoch": 0.030866400228375678, + "grad_norm": 1.922006607055664, + "learning_rate": 4.988259836509526e-05, + "loss": 5.2297, + "step": 5190 + }, + { + "epoch": 0.030872347511656677, + "grad_norm": 1.9518259763717651, + "learning_rate": 4.9882553146005225e-05, + "loss": 5.2232, + "step": 5191 + }, + { + "epoch": 0.030878294794937672, + "grad_norm": 2.1054210662841797, + "learning_rate": 4.988250791822894e-05, + "loss": 5.3705, + "step": 5192 + }, + { + "epoch": 0.03088424207821867, + "grad_norm": 2.0954079627990723, + "learning_rate": 4.988246268176644e-05, + "loss": 5.2522, + "step": 5193 + }, + { + "epoch": 0.030890189361499665, + "grad_norm": 1.8628660440444946, + "learning_rate": 4.9882417436617724e-05, + "loss": 5.3856, + "step": 5194 + }, + { + "epoch": 0.030896136644780664, + "grad_norm": 2.2788021564483643, + "learning_rate": 4.988237218278281e-05, + "loss": 5.4399, + "step": 5195 + }, + { + "epoch": 0.030902083928061663, + "grad_norm": 1.981086015701294, + "learning_rate": 4.9882326920261717e-05, + "loss": 5.2853, + "step": 5196 + }, + { + "epoch": 0.030908031211342658, + "grad_norm": 1.9278241395950317, + "learning_rate": 4.988228164905446e-05, + "loss": 5.3997, + "step": 5197 + }, + { + "epoch": 0.030913978494623656, + "grad_norm": 1.842748999595642, + "learning_rate": 4.988223636916106e-05, + "loss": 5.3215, + "step": 5198 + }, + { + "epoch": 0.030919925777904655, + "grad_norm": 1.9974339008331299, + "learning_rate": 4.988219108058153e-05, + "loss": 5.4851, + "step": 5199 + }, + { + "epoch": 0.03092587306118565, + "grad_norm": 2.015939474105835, + "learning_rate": 4.988214578331588e-05, + "loss": 5.322, + "step": 5200 + }, + { + "epoch": 0.03093182034446665, + "grad_norm": 2.035209894180298, + "learning_rate": 4.9882100477364135e-05, + "loss": 5.3896, + "step": 5201 + }, + { + "epoch": 0.030937767627747643, + "grad_norm": 1.9803009033203125, + "learning_rate": 4.9882055162726296e-05, + "loss": 5.2624, + "step": 5202 + }, + { + "epoch": 0.030943714911028642, + "grad_norm": 1.9504352807998657, + "learning_rate": 4.98820098394024e-05, + "loss": 5.2333, + "step": 5203 + }, + { + "epoch": 0.03094966219430964, + "grad_norm": 1.850542664527893, + "learning_rate": 4.9881964507392443e-05, + "loss": 5.5632, + "step": 5204 + }, + { + "epoch": 0.030955609477590636, + "grad_norm": 1.8594067096710205, + "learning_rate": 4.9881919166696456e-05, + "loss": 5.3775, + "step": 5205 + }, + { + "epoch": 0.030961556760871634, + "grad_norm": 2.019274950027466, + "learning_rate": 4.988187381731444e-05, + "loss": 5.4565, + "step": 5206 + }, + { + "epoch": 0.030967504044152633, + "grad_norm": 1.7151249647140503, + "learning_rate": 4.988182845924643e-05, + "loss": 5.5984, + "step": 5207 + }, + { + "epoch": 0.030973451327433628, + "grad_norm": 2.5127339363098145, + "learning_rate": 4.988178309249242e-05, + "loss": 6.2724, + "step": 5208 + }, + { + "epoch": 0.030979398610714626, + "grad_norm": 1.869344711303711, + "learning_rate": 4.9881737717052436e-05, + "loss": 5.5408, + "step": 5209 + }, + { + "epoch": 0.03098534589399562, + "grad_norm": 2.035419225692749, + "learning_rate": 4.98816923329265e-05, + "loss": 5.4154, + "step": 5210 + }, + { + "epoch": 0.03099129317727662, + "grad_norm": 1.7084250450134277, + "learning_rate": 4.9881646940114624e-05, + "loss": 5.6327, + "step": 5211 + }, + { + "epoch": 0.03099724046055762, + "grad_norm": 2.1035211086273193, + "learning_rate": 4.9881601538616816e-05, + "loss": 5.5041, + "step": 5212 + }, + { + "epoch": 0.031003187743838614, + "grad_norm": 1.920366883277893, + "learning_rate": 4.9881556128433105e-05, + "loss": 5.5919, + "step": 5213 + }, + { + "epoch": 0.031009135027119612, + "grad_norm": 2.000555992126465, + "learning_rate": 4.988151070956349e-05, + "loss": 5.5078, + "step": 5214 + }, + { + "epoch": 0.031015082310400607, + "grad_norm": 1.9930146932601929, + "learning_rate": 4.9881465282008e-05, + "loss": 5.5002, + "step": 5215 + }, + { + "epoch": 0.031021029593681606, + "grad_norm": 2.163329839706421, + "learning_rate": 4.988141984576665e-05, + "loss": 5.3504, + "step": 5216 + }, + { + "epoch": 0.031026976876962604, + "grad_norm": 1.766228437423706, + "learning_rate": 4.988137440083946e-05, + "loss": 5.5304, + "step": 5217 + }, + { + "epoch": 0.0310329241602436, + "grad_norm": 2.1399648189544678, + "learning_rate": 4.988132894722644e-05, + "loss": 5.4757, + "step": 5218 + }, + { + "epoch": 0.031038871443524598, + "grad_norm": 2.2287001609802246, + "learning_rate": 4.988128348492759e-05, + "loss": 5.4902, + "step": 5219 + }, + { + "epoch": 0.031044818726805597, + "grad_norm": 2.095080852508545, + "learning_rate": 4.988123801394295e-05, + "loss": 5.3462, + "step": 5220 + }, + { + "epoch": 0.031050766010086592, + "grad_norm": 2.0873003005981445, + "learning_rate": 4.988119253427253e-05, + "loss": 5.2825, + "step": 5221 + }, + { + "epoch": 0.03105671329336759, + "grad_norm": 2.0918655395507812, + "learning_rate": 4.988114704591633e-05, + "loss": 5.2859, + "step": 5222 + }, + { + "epoch": 0.031062660576648585, + "grad_norm": 1.9637762308120728, + "learning_rate": 4.9881101548874384e-05, + "loss": 5.4687, + "step": 5223 + }, + { + "epoch": 0.031068607859929584, + "grad_norm": 2.046672821044922, + "learning_rate": 4.988105604314671e-05, + "loss": 5.5095, + "step": 5224 + }, + { + "epoch": 0.031074555143210583, + "grad_norm": 2.0264053344726562, + "learning_rate": 4.988101052873332e-05, + "loss": 5.4221, + "step": 5225 + }, + { + "epoch": 0.031080502426491578, + "grad_norm": 1.9367676973342896, + "learning_rate": 4.9880965005634216e-05, + "loss": 5.1881, + "step": 5226 + }, + { + "epoch": 0.031086449709772576, + "grad_norm": 2.0398001670837402, + "learning_rate": 4.9880919473849425e-05, + "loss": 5.4938, + "step": 5227 + }, + { + "epoch": 0.031092396993053575, + "grad_norm": 2.037411689758301, + "learning_rate": 4.988087393337896e-05, + "loss": 5.0893, + "step": 5228 + }, + { + "epoch": 0.03109834427633457, + "grad_norm": 2.1337075233459473, + "learning_rate": 4.988082838422285e-05, + "loss": 4.9822, + "step": 5229 + }, + { + "epoch": 0.03110429155961557, + "grad_norm": 1.9911794662475586, + "learning_rate": 4.988078282638109e-05, + "loss": 5.2472, + "step": 5230 + }, + { + "epoch": 0.031110238842896563, + "grad_norm": 2.1050829887390137, + "learning_rate": 4.98807372598537e-05, + "loss": 5.3478, + "step": 5231 + }, + { + "epoch": 0.031116186126177562, + "grad_norm": 1.9364343881607056, + "learning_rate": 4.988069168464071e-05, + "loss": 5.2551, + "step": 5232 + }, + { + "epoch": 0.03112213340945856, + "grad_norm": 1.9834885597229004, + "learning_rate": 4.988064610074213e-05, + "loss": 5.2147, + "step": 5233 + }, + { + "epoch": 0.031128080692739556, + "grad_norm": 2.0815906524658203, + "learning_rate": 4.9880600508157974e-05, + "loss": 5.1607, + "step": 5234 + }, + { + "epoch": 0.031134027976020554, + "grad_norm": 1.9558357000350952, + "learning_rate": 4.988055490688825e-05, + "loss": 5.4, + "step": 5235 + }, + { + "epoch": 0.031139975259301553, + "grad_norm": 1.9036076068878174, + "learning_rate": 4.9880509296932986e-05, + "loss": 5.4953, + "step": 5236 + }, + { + "epoch": 0.031145922542582548, + "grad_norm": 2.4709548950195312, + "learning_rate": 4.98804636782922e-05, + "loss": 5.2628, + "step": 5237 + }, + { + "epoch": 0.031151869825863546, + "grad_norm": 2.2380030155181885, + "learning_rate": 4.988041805096589e-05, + "loss": 5.2423, + "step": 5238 + }, + { + "epoch": 0.03115781710914454, + "grad_norm": 2.348639726638794, + "learning_rate": 4.988037241495409e-05, + "loss": 5.1966, + "step": 5239 + }, + { + "epoch": 0.03116376439242554, + "grad_norm": 1.9384468793869019, + "learning_rate": 4.9880326770256805e-05, + "loss": 5.47, + "step": 5240 + }, + { + "epoch": 0.03116971167570654, + "grad_norm": 2.2664244174957275, + "learning_rate": 4.988028111687406e-05, + "loss": 5.5511, + "step": 5241 + }, + { + "epoch": 0.031175658958987534, + "grad_norm": 2.1356422901153564, + "learning_rate": 4.988023545480586e-05, + "loss": 5.6462, + "step": 5242 + }, + { + "epoch": 0.031181606242268532, + "grad_norm": 2.240190267562866, + "learning_rate": 4.9880189784052226e-05, + "loss": 5.3494, + "step": 5243 + }, + { + "epoch": 0.031187553525549527, + "grad_norm": 1.8032485246658325, + "learning_rate": 4.988014410461318e-05, + "loss": 5.2305, + "step": 5244 + }, + { + "epoch": 0.031193500808830526, + "grad_norm": 2.177501678466797, + "learning_rate": 4.988009841648873e-05, + "loss": 5.1891, + "step": 5245 + }, + { + "epoch": 0.031199448092111524, + "grad_norm": 2.157317876815796, + "learning_rate": 4.988005271967889e-05, + "loss": 5.1038, + "step": 5246 + }, + { + "epoch": 0.03120539537539252, + "grad_norm": 1.9995821714401245, + "learning_rate": 4.988000701418369e-05, + "loss": 5.1098, + "step": 5247 + }, + { + "epoch": 0.031211342658673518, + "grad_norm": 2.201558828353882, + "learning_rate": 4.987996130000313e-05, + "loss": 5.0702, + "step": 5248 + }, + { + "epoch": 0.031217289941954517, + "grad_norm": 2.065645933151245, + "learning_rate": 4.987991557713724e-05, + "loss": 5.2012, + "step": 5249 + }, + { + "epoch": 0.03122323722523551, + "grad_norm": 1.908347487449646, + "learning_rate": 4.9879869845586024e-05, + "loss": 5.0913, + "step": 5250 + }, + { + "epoch": 0.03122918450851651, + "grad_norm": 1.913979411125183, + "learning_rate": 4.98798241053495e-05, + "loss": 5.0036, + "step": 5251 + }, + { + "epoch": 0.031235131791797505, + "grad_norm": 2.217616558074951, + "learning_rate": 4.9879778356427686e-05, + "loss": 5.0621, + "step": 5252 + }, + { + "epoch": 0.031241079075078504, + "grad_norm": 2.419713258743286, + "learning_rate": 4.9879732598820605e-05, + "loss": 5.1264, + "step": 5253 + }, + { + "epoch": 0.031247026358359502, + "grad_norm": 2.298295497894287, + "learning_rate": 4.987968683252826e-05, + "loss": 5.0576, + "step": 5254 + }, + { + "epoch": 0.0312529736416405, + "grad_norm": 2.120589256286621, + "learning_rate": 4.987964105755067e-05, + "loss": 5.175, + "step": 5255 + }, + { + "epoch": 0.031258920924921496, + "grad_norm": 2.3129806518554688, + "learning_rate": 4.987959527388787e-05, + "loss": 5.1827, + "step": 5256 + }, + { + "epoch": 0.03126486820820249, + "grad_norm": 2.251680612564087, + "learning_rate": 4.9879549481539846e-05, + "loss": 5.0473, + "step": 5257 + }, + { + "epoch": 0.03127081549148349, + "grad_norm": 2.101229429244995, + "learning_rate": 4.987950368050663e-05, + "loss": 5.0453, + "step": 5258 + }, + { + "epoch": 0.03127676277476449, + "grad_norm": 2.189565420150757, + "learning_rate": 4.987945787078824e-05, + "loss": 5.087, + "step": 5259 + }, + { + "epoch": 0.03128271005804548, + "grad_norm": 2.05485463142395, + "learning_rate": 4.9879412052384687e-05, + "loss": 5.0192, + "step": 5260 + }, + { + "epoch": 0.031288657341326485, + "grad_norm": 1.8166489601135254, + "learning_rate": 4.9879366225295994e-05, + "loss": 5.0456, + "step": 5261 + }, + { + "epoch": 0.03129460462460748, + "grad_norm": 2.1403279304504395, + "learning_rate": 4.9879320389522165e-05, + "loss": 4.9455, + "step": 5262 + }, + { + "epoch": 0.031300551907888476, + "grad_norm": 1.8833802938461304, + "learning_rate": 4.9879274545063226e-05, + "loss": 5.0891, + "step": 5263 + }, + { + "epoch": 0.03130649919116947, + "grad_norm": 2.000692367553711, + "learning_rate": 4.987922869191918e-05, + "loss": 5.1125, + "step": 5264 + }, + { + "epoch": 0.03131244647445047, + "grad_norm": 1.947544813156128, + "learning_rate": 4.9879182830090065e-05, + "loss": 4.9139, + "step": 5265 + }, + { + "epoch": 0.03131839375773147, + "grad_norm": 1.8827823400497437, + "learning_rate": 4.987913695957588e-05, + "loss": 5.0154, + "step": 5266 + }, + { + "epoch": 0.03132434104101246, + "grad_norm": 2.268115997314453, + "learning_rate": 4.987909108037664e-05, + "loss": 5.0379, + "step": 5267 + }, + { + "epoch": 0.031330288324293465, + "grad_norm": 1.85139000415802, + "learning_rate": 4.987904519249237e-05, + "loss": 4.9428, + "step": 5268 + }, + { + "epoch": 0.03133623560757446, + "grad_norm": 2.208338737487793, + "learning_rate": 4.987899929592308e-05, + "loss": 4.9366, + "step": 5269 + }, + { + "epoch": 0.031342182890855455, + "grad_norm": 3.5571236610412598, + "learning_rate": 4.987895339066879e-05, + "loss": 6.8471, + "step": 5270 + }, + { + "epoch": 0.03134813017413646, + "grad_norm": 2.000157594680786, + "learning_rate": 4.9878907476729516e-05, + "loss": 5.025, + "step": 5271 + }, + { + "epoch": 0.03135407745741745, + "grad_norm": 2.0588366985321045, + "learning_rate": 4.987886155410527e-05, + "loss": 4.8955, + "step": 5272 + }, + { + "epoch": 0.03136002474069845, + "grad_norm": 2.217839241027832, + "learning_rate": 4.9878815622796074e-05, + "loss": 4.9889, + "step": 5273 + }, + { + "epoch": 0.03136597202397945, + "grad_norm": 2.2453126907348633, + "learning_rate": 4.987876968280194e-05, + "loss": 5.3774, + "step": 5274 + }, + { + "epoch": 0.031371919307260444, + "grad_norm": 1.9839471578598022, + "learning_rate": 4.9878723734122876e-05, + "loss": 4.993, + "step": 5275 + }, + { + "epoch": 0.03137786659054144, + "grad_norm": 1.9534602165222168, + "learning_rate": 4.987867777675892e-05, + "loss": 4.9079, + "step": 5276 + }, + { + "epoch": 0.031383813873822435, + "grad_norm": 1.96163809299469, + "learning_rate": 4.9878631810710066e-05, + "loss": 4.9829, + "step": 5277 + }, + { + "epoch": 0.03138976115710344, + "grad_norm": 2.0814366340637207, + "learning_rate": 4.987858583597634e-05, + "loss": 4.8731, + "step": 5278 + }, + { + "epoch": 0.03139570844038443, + "grad_norm": 1.9846211671829224, + "learning_rate": 4.987853985255776e-05, + "loss": 4.9495, + "step": 5279 + }, + { + "epoch": 0.03140165572366543, + "grad_norm": 2.1237289905548096, + "learning_rate": 4.9878493860454335e-05, + "loss": 5.3887, + "step": 5280 + }, + { + "epoch": 0.03140760300694643, + "grad_norm": 2.1526784896850586, + "learning_rate": 4.9878447859666086e-05, + "loss": 5.3603, + "step": 5281 + }, + { + "epoch": 0.031413550290227424, + "grad_norm": 2.0563082695007324, + "learning_rate": 4.987840185019303e-05, + "loss": 5.4104, + "step": 5282 + }, + { + "epoch": 0.03141949757350842, + "grad_norm": 2.0586647987365723, + "learning_rate": 4.9878355832035175e-05, + "loss": 5.517, + "step": 5283 + }, + { + "epoch": 0.03142544485678942, + "grad_norm": 1.8817695379257202, + "learning_rate": 4.9878309805192546e-05, + "loss": 5.3616, + "step": 5284 + }, + { + "epoch": 0.031431392140070416, + "grad_norm": 2.0987086296081543, + "learning_rate": 4.987826376966516e-05, + "loss": 5.3237, + "step": 5285 + }, + { + "epoch": 0.03143733942335141, + "grad_norm": 2.3505301475524902, + "learning_rate": 4.987821772545302e-05, + "loss": 5.5165, + "step": 5286 + }, + { + "epoch": 0.03144328670663241, + "grad_norm": 2.1199939250946045, + "learning_rate": 4.987817167255616e-05, + "loss": 5.3029, + "step": 5287 + }, + { + "epoch": 0.03144923398991341, + "grad_norm": 1.7463518381118774, + "learning_rate": 4.987812561097458e-05, + "loss": 5.3589, + "step": 5288 + }, + { + "epoch": 0.0314551812731944, + "grad_norm": 1.9957356452941895, + "learning_rate": 4.987807954070831e-05, + "loss": 5.2459, + "step": 5289 + }, + { + "epoch": 0.031461128556475405, + "grad_norm": 1.7865337133407593, + "learning_rate": 4.987803346175736e-05, + "loss": 5.3041, + "step": 5290 + }, + { + "epoch": 0.0314670758397564, + "grad_norm": 1.82949960231781, + "learning_rate": 4.9877987374121744e-05, + "loss": 5.5761, + "step": 5291 + }, + { + "epoch": 0.031473023123037396, + "grad_norm": 1.974692940711975, + "learning_rate": 4.9877941277801475e-05, + "loss": 5.5033, + "step": 5292 + }, + { + "epoch": 0.03147897040631839, + "grad_norm": 2.1808922290802, + "learning_rate": 4.9877895172796577e-05, + "loss": 5.6739, + "step": 5293 + }, + { + "epoch": 0.03148491768959939, + "grad_norm": 2.7555716037750244, + "learning_rate": 4.987784905910706e-05, + "loss": 5.2489, + "step": 5294 + }, + { + "epoch": 0.03149086497288039, + "grad_norm": 2.475541353225708, + "learning_rate": 4.9877802936732955e-05, + "loss": 5.2304, + "step": 5295 + }, + { + "epoch": 0.03149681225616138, + "grad_norm": 1.945482611656189, + "learning_rate": 4.987775680567425e-05, + "loss": 5.4085, + "step": 5296 + }, + { + "epoch": 0.031502759539442385, + "grad_norm": 1.9879848957061768, + "learning_rate": 4.987771066593099e-05, + "loss": 5.5372, + "step": 5297 + }, + { + "epoch": 0.03150870682272338, + "grad_norm": 2.0529556274414062, + "learning_rate": 4.987766451750317e-05, + "loss": 5.578, + "step": 5298 + }, + { + "epoch": 0.031514654106004375, + "grad_norm": 1.7769572734832764, + "learning_rate": 4.9877618360390816e-05, + "loss": 5.5348, + "step": 5299 + }, + { + "epoch": 0.03152060138928538, + "grad_norm": 1.9111005067825317, + "learning_rate": 4.987757219459395e-05, + "loss": 5.4267, + "step": 5300 + }, + { + "epoch": 0.03152654867256637, + "grad_norm": 1.9047571420669556, + "learning_rate": 4.987752602011256e-05, + "loss": 5.433, + "step": 5301 + }, + { + "epoch": 0.03153249595584737, + "grad_norm": 1.9031875133514404, + "learning_rate": 4.98774798369467e-05, + "loss": 5.4929, + "step": 5302 + }, + { + "epoch": 0.03153844323912837, + "grad_norm": 1.858656883239746, + "learning_rate": 4.987743364509637e-05, + "loss": 5.3583, + "step": 5303 + }, + { + "epoch": 0.031544390522409364, + "grad_norm": 1.9254835844039917, + "learning_rate": 4.987738744456158e-05, + "loss": 5.4885, + "step": 5304 + }, + { + "epoch": 0.03155033780569036, + "grad_norm": 1.96173095703125, + "learning_rate": 4.987734123534235e-05, + "loss": 5.4869, + "step": 5305 + }, + { + "epoch": 0.031556285088971354, + "grad_norm": 1.7857433557510376, + "learning_rate": 4.98772950174387e-05, + "loss": 5.3845, + "step": 5306 + }, + { + "epoch": 0.031562232372252357, + "grad_norm": 1.9360556602478027, + "learning_rate": 4.9877248790850636e-05, + "loss": 5.3809, + "step": 5307 + }, + { + "epoch": 0.03156817965553335, + "grad_norm": 2.2044126987457275, + "learning_rate": 4.9877202555578197e-05, + "loss": 5.2413, + "step": 5308 + }, + { + "epoch": 0.03157412693881435, + "grad_norm": 1.8200992345809937, + "learning_rate": 4.9877156311621365e-05, + "loss": 5.6241, + "step": 5309 + }, + { + "epoch": 0.03158007422209535, + "grad_norm": 2.0771358013153076, + "learning_rate": 4.987711005898019e-05, + "loss": 5.6854, + "step": 5310 + }, + { + "epoch": 0.031586021505376344, + "grad_norm": 1.8330012559890747, + "learning_rate": 4.987706379765466e-05, + "loss": 5.712, + "step": 5311 + }, + { + "epoch": 0.03159196878865734, + "grad_norm": 1.941501498222351, + "learning_rate": 4.987701752764481e-05, + "loss": 5.4131, + "step": 5312 + }, + { + "epoch": 0.03159791607193834, + "grad_norm": 1.8688616752624512, + "learning_rate": 4.987697124895065e-05, + "loss": 5.3719, + "step": 5313 + }, + { + "epoch": 0.031603863355219336, + "grad_norm": 1.8723224401474, + "learning_rate": 4.98769249615722e-05, + "loss": 5.665, + "step": 5314 + }, + { + "epoch": 0.03160981063850033, + "grad_norm": 1.9460058212280273, + "learning_rate": 4.9876878665509474e-05, + "loss": 5.7048, + "step": 5315 + }, + { + "epoch": 0.03161575792178133, + "grad_norm": 1.9752602577209473, + "learning_rate": 4.987683236076248e-05, + "loss": 5.7098, + "step": 5316 + }, + { + "epoch": 0.03162170520506233, + "grad_norm": 1.8122695684432983, + "learning_rate": 4.9876786047331244e-05, + "loss": 5.2717, + "step": 5317 + }, + { + "epoch": 0.03162765248834332, + "grad_norm": 1.961983323097229, + "learning_rate": 4.9876739725215775e-05, + "loss": 5.5593, + "step": 5318 + }, + { + "epoch": 0.031633599771624325, + "grad_norm": 1.7362732887268066, + "learning_rate": 4.98766933944161e-05, + "loss": 5.5002, + "step": 5319 + }, + { + "epoch": 0.03163954705490532, + "grad_norm": 2.084033489227295, + "learning_rate": 4.9876647054932226e-05, + "loss": 5.5398, + "step": 5320 + }, + { + "epoch": 0.031645494338186315, + "grad_norm": 1.869452953338623, + "learning_rate": 4.9876600706764165e-05, + "loss": 5.5985, + "step": 5321 + }, + { + "epoch": 0.03165144162146731, + "grad_norm": 3.597667694091797, + "learning_rate": 4.9876554349911943e-05, + "loss": 5.4143, + "step": 5322 + }, + { + "epoch": 0.03165738890474831, + "grad_norm": 2.2364773750305176, + "learning_rate": 4.9876507984375574e-05, + "loss": 5.3756, + "step": 5323 + }, + { + "epoch": 0.03166333618802931, + "grad_norm": 2.0204551219940186, + "learning_rate": 4.987646161015508e-05, + "loss": 5.4964, + "step": 5324 + }, + { + "epoch": 0.0316692834713103, + "grad_norm": 1.7375823259353638, + "learning_rate": 4.987641522725046e-05, + "loss": 5.5249, + "step": 5325 + }, + { + "epoch": 0.031675230754591305, + "grad_norm": 1.661597728729248, + "learning_rate": 4.987636883566175e-05, + "loss": 5.4828, + "step": 5326 + }, + { + "epoch": 0.0316811780378723, + "grad_norm": 1.8612693548202515, + "learning_rate": 4.9876322435388944e-05, + "loss": 5.4711, + "step": 5327 + }, + { + "epoch": 0.031687125321153295, + "grad_norm": 1.8282328844070435, + "learning_rate": 4.987627602643208e-05, + "loss": 5.5234, + "step": 5328 + }, + { + "epoch": 0.0316930726044343, + "grad_norm": 1.951170802116394, + "learning_rate": 4.987622960879116e-05, + "loss": 5.4117, + "step": 5329 + }, + { + "epoch": 0.03169901988771529, + "grad_norm": 1.819174885749817, + "learning_rate": 4.9876183182466207e-05, + "loss": 5.3446, + "step": 5330 + }, + { + "epoch": 0.03170496717099629, + "grad_norm": 1.8710874319076538, + "learning_rate": 4.9876136747457245e-05, + "loss": 5.3755, + "step": 5331 + }, + { + "epoch": 0.03171091445427729, + "grad_norm": 2.1957387924194336, + "learning_rate": 4.9876090303764264e-05, + "loss": 6.3036, + "step": 5332 + }, + { + "epoch": 0.031716861737558284, + "grad_norm": 1.774741530418396, + "learning_rate": 4.987604385138731e-05, + "loss": 5.3822, + "step": 5333 + }, + { + "epoch": 0.03172280902083928, + "grad_norm": 1.793230414390564, + "learning_rate": 4.987599739032638e-05, + "loss": 5.4224, + "step": 5334 + }, + { + "epoch": 0.031728756304120274, + "grad_norm": 1.7986340522766113, + "learning_rate": 4.98759509205815e-05, + "loss": 5.3939, + "step": 5335 + }, + { + "epoch": 0.031734703587401276, + "grad_norm": 1.7775462865829468, + "learning_rate": 4.9875904442152675e-05, + "loss": 5.4356, + "step": 5336 + }, + { + "epoch": 0.03174065087068227, + "grad_norm": 1.882104516029358, + "learning_rate": 4.987585795503994e-05, + "loss": 5.2852, + "step": 5337 + }, + { + "epoch": 0.03174659815396327, + "grad_norm": 1.9842430353164673, + "learning_rate": 4.987581145924329e-05, + "loss": 5.4089, + "step": 5338 + }, + { + "epoch": 0.03175254543724427, + "grad_norm": 1.7098103761672974, + "learning_rate": 4.9875764954762754e-05, + "loss": 5.2442, + "step": 5339 + }, + { + "epoch": 0.031758492720525264, + "grad_norm": 1.8304857015609741, + "learning_rate": 4.9875718441598354e-05, + "loss": 5.5403, + "step": 5340 + }, + { + "epoch": 0.03176444000380626, + "grad_norm": 2.0763137340545654, + "learning_rate": 4.987567191975009e-05, + "loss": 5.8295, + "step": 5341 + }, + { + "epoch": 0.03177038728708726, + "grad_norm": 1.907271385192871, + "learning_rate": 4.9875625389217984e-05, + "loss": 5.6979, + "step": 5342 + }, + { + "epoch": 0.031776334570368256, + "grad_norm": 2.1263620853424072, + "learning_rate": 4.9875578850002056e-05, + "loss": 5.7713, + "step": 5343 + }, + { + "epoch": 0.03178228185364925, + "grad_norm": 2.038358211517334, + "learning_rate": 4.987553230210232e-05, + "loss": 6.0019, + "step": 5344 + }, + { + "epoch": 0.03178822913693025, + "grad_norm": 1.5671371221542358, + "learning_rate": 4.987548574551879e-05, + "loss": 5.9237, + "step": 5345 + }, + { + "epoch": 0.03179417642021125, + "grad_norm": 1.9159321784973145, + "learning_rate": 4.987543918025149e-05, + "loss": 6.0363, + "step": 5346 + }, + { + "epoch": 0.03180012370349224, + "grad_norm": 1.8012747764587402, + "learning_rate": 4.987539260630043e-05, + "loss": 5.901, + "step": 5347 + }, + { + "epoch": 0.031806070986773245, + "grad_norm": 2.154933214187622, + "learning_rate": 4.9875346023665625e-05, + "loss": 5.6379, + "step": 5348 + }, + { + "epoch": 0.03181201827005424, + "grad_norm": 2.191539764404297, + "learning_rate": 4.98752994323471e-05, + "loss": 5.5322, + "step": 5349 + }, + { + "epoch": 0.031817965553335235, + "grad_norm": 2.0007123947143555, + "learning_rate": 4.9875252832344856e-05, + "loss": 5.7398, + "step": 5350 + }, + { + "epoch": 0.03182391283661623, + "grad_norm": 1.7119163274765015, + "learning_rate": 4.9875206223658924e-05, + "loss": 5.8507, + "step": 5351 + }, + { + "epoch": 0.03182986011989723, + "grad_norm": 1.8882098197937012, + "learning_rate": 4.987515960628931e-05, + "loss": 5.8668, + "step": 5352 + }, + { + "epoch": 0.03183580740317823, + "grad_norm": 2.005493402481079, + "learning_rate": 4.987511298023604e-05, + "loss": 5.9672, + "step": 5353 + }, + { + "epoch": 0.03184175468645922, + "grad_norm": 1.858807921409607, + "learning_rate": 4.987506634549912e-05, + "loss": 5.9344, + "step": 5354 + }, + { + "epoch": 0.031847701969740225, + "grad_norm": 2.2698724269866943, + "learning_rate": 4.987501970207858e-05, + "loss": 5.6553, + "step": 5355 + }, + { + "epoch": 0.03185364925302122, + "grad_norm": 1.7690725326538086, + "learning_rate": 4.987497304997442e-05, + "loss": 5.6255, + "step": 5356 + }, + { + "epoch": 0.031859596536302215, + "grad_norm": 2.008002758026123, + "learning_rate": 4.987492638918667e-05, + "loss": 5.5578, + "step": 5357 + }, + { + "epoch": 0.03186554381958322, + "grad_norm": 1.6483304500579834, + "learning_rate": 4.987487971971533e-05, + "loss": 5.4786, + "step": 5358 + }, + { + "epoch": 0.03187149110286421, + "grad_norm": 1.9136204719543457, + "learning_rate": 4.987483304156044e-05, + "loss": 5.6043, + "step": 5359 + }, + { + "epoch": 0.03187743838614521, + "grad_norm": 1.9811625480651855, + "learning_rate": 4.987478635472199e-05, + "loss": 5.6172, + "step": 5360 + }, + { + "epoch": 0.03188338566942621, + "grad_norm": 2.012134075164795, + "learning_rate": 4.987473965920002e-05, + "loss": 5.6715, + "step": 5361 + }, + { + "epoch": 0.031889332952707204, + "grad_norm": 1.930550217628479, + "learning_rate": 4.987469295499453e-05, + "loss": 5.516, + "step": 5362 + }, + { + "epoch": 0.0318952802359882, + "grad_norm": 2.1190578937530518, + "learning_rate": 4.987464624210554e-05, + "loss": 5.5176, + "step": 5363 + }, + { + "epoch": 0.031901227519269194, + "grad_norm": 2.428710699081421, + "learning_rate": 4.987459952053307e-05, + "loss": 5.4088, + "step": 5364 + }, + { + "epoch": 0.031907174802550196, + "grad_norm": 1.8820819854736328, + "learning_rate": 4.987455279027713e-05, + "loss": 5.3753, + "step": 5365 + }, + { + "epoch": 0.03191312208583119, + "grad_norm": 1.6506859064102173, + "learning_rate": 4.987450605133775e-05, + "loss": 5.6018, + "step": 5366 + }, + { + "epoch": 0.03191906936911219, + "grad_norm": 2.060772657394409, + "learning_rate": 4.9874459303714925e-05, + "loss": 5.3587, + "step": 5367 + }, + { + "epoch": 0.03192501665239319, + "grad_norm": 2.3591532707214355, + "learning_rate": 4.9874412547408694e-05, + "loss": 5.7685, + "step": 5368 + }, + { + "epoch": 0.031930963935674184, + "grad_norm": 2.140322685241699, + "learning_rate": 4.987436578241906e-05, + "loss": 5.9015, + "step": 5369 + }, + { + "epoch": 0.03193691121895518, + "grad_norm": 2.2479233741760254, + "learning_rate": 4.987431900874604e-05, + "loss": 5.6079, + "step": 5370 + }, + { + "epoch": 0.03194285850223618, + "grad_norm": 2.0334317684173584, + "learning_rate": 4.987427222638965e-05, + "loss": 5.6364, + "step": 5371 + }, + { + "epoch": 0.031948805785517176, + "grad_norm": 2.0599231719970703, + "learning_rate": 4.987422543534991e-05, + "loss": 5.6578, + "step": 5372 + }, + { + "epoch": 0.03195475306879817, + "grad_norm": 2.237504720687866, + "learning_rate": 4.9874178635626836e-05, + "loss": 5.5784, + "step": 5373 + }, + { + "epoch": 0.03196070035207917, + "grad_norm": 2.013193130493164, + "learning_rate": 4.987413182722044e-05, + "loss": 5.4874, + "step": 5374 + }, + { + "epoch": 0.03196664763536017, + "grad_norm": 1.9806950092315674, + "learning_rate": 4.987408501013075e-05, + "loss": 5.41, + "step": 5375 + }, + { + "epoch": 0.03197259491864116, + "grad_norm": 1.7534204721450806, + "learning_rate": 4.9874038184357766e-05, + "loss": 5.4596, + "step": 5376 + }, + { + "epoch": 0.031978542201922165, + "grad_norm": 1.5722386837005615, + "learning_rate": 4.987399134990152e-05, + "loss": 5.508, + "step": 5377 + }, + { + "epoch": 0.03198448948520316, + "grad_norm": 7.868972301483154, + "learning_rate": 4.987394450676201e-05, + "loss": 5.1734, + "step": 5378 + }, + { + "epoch": 0.031990436768484155, + "grad_norm": 2.2103798389434814, + "learning_rate": 4.9873897654939274e-05, + "loss": 5.6766, + "step": 5379 + }, + { + "epoch": 0.03199638405176515, + "grad_norm": 1.9590017795562744, + "learning_rate": 4.9873850794433306e-05, + "loss": 5.7764, + "step": 5380 + }, + { + "epoch": 0.03200233133504615, + "grad_norm": 1.96006441116333, + "learning_rate": 4.9873803925244146e-05, + "loss": 5.7933, + "step": 5381 + }, + { + "epoch": 0.03200827861832715, + "grad_norm": 1.7377163171768188, + "learning_rate": 4.987375704737178e-05, + "loss": 5.692, + "step": 5382 + }, + { + "epoch": 0.03201422590160814, + "grad_norm": 2.0734782218933105, + "learning_rate": 4.9873710160816256e-05, + "loss": 5.5466, + "step": 5383 + }, + { + "epoch": 0.032020173184889145, + "grad_norm": 2.4700942039489746, + "learning_rate": 4.9873663265577574e-05, + "loss": 5.5837, + "step": 5384 + }, + { + "epoch": 0.03202612046817014, + "grad_norm": 2.067009925842285, + "learning_rate": 4.987361636165576e-05, + "loss": 5.4777, + "step": 5385 + }, + { + "epoch": 0.032032067751451135, + "grad_norm": 1.9585732221603394, + "learning_rate": 4.9873569449050815e-05, + "loss": 5.62, + "step": 5386 + }, + { + "epoch": 0.03203801503473214, + "grad_norm": 2.0210976600646973, + "learning_rate": 4.9873522527762766e-05, + "loss": 5.3554, + "step": 5387 + }, + { + "epoch": 0.03204396231801313, + "grad_norm": 2.0345299243927, + "learning_rate": 4.987347559779163e-05, + "loss": 5.3912, + "step": 5388 + }, + { + "epoch": 0.03204990960129413, + "grad_norm": 2.0960853099823, + "learning_rate": 4.987342865913742e-05, + "loss": 5.3497, + "step": 5389 + }, + { + "epoch": 0.03205585688457513, + "grad_norm": 2.0156044960021973, + "learning_rate": 4.987338171180015e-05, + "loss": 5.2769, + "step": 5390 + }, + { + "epoch": 0.032061804167856124, + "grad_norm": 2.0021722316741943, + "learning_rate": 4.987333475577984e-05, + "loss": 5.2338, + "step": 5391 + }, + { + "epoch": 0.03206775145113712, + "grad_norm": 1.8502025604248047, + "learning_rate": 4.987328779107651e-05, + "loss": 5.4231, + "step": 5392 + }, + { + "epoch": 0.03207369873441812, + "grad_norm": 2.0788064002990723, + "learning_rate": 4.987324081769016e-05, + "loss": 5.3989, + "step": 5393 + }, + { + "epoch": 0.032079646017699116, + "grad_norm": 5.172029495239258, + "learning_rate": 4.987319383562083e-05, + "loss": 6.5943, + "step": 5394 + }, + { + "epoch": 0.03208559330098011, + "grad_norm": 1.8732082843780518, + "learning_rate": 4.987314684486852e-05, + "loss": 5.3085, + "step": 5395 + }, + { + "epoch": 0.032091540584261107, + "grad_norm": 2.0511786937713623, + "learning_rate": 4.987309984543326e-05, + "loss": 5.1598, + "step": 5396 + }, + { + "epoch": 0.03209748786754211, + "grad_norm": 2.1821703910827637, + "learning_rate": 4.987305283731505e-05, + "loss": 5.3575, + "step": 5397 + }, + { + "epoch": 0.032103435150823104, + "grad_norm": 2.1190478801727295, + "learning_rate": 4.9873005820513906e-05, + "loss": 5.2371, + "step": 5398 + }, + { + "epoch": 0.0321093824341041, + "grad_norm": 2.1476964950561523, + "learning_rate": 4.987295879502987e-05, + "loss": 5.1378, + "step": 5399 + }, + { + "epoch": 0.0321153297173851, + "grad_norm": 2.3466129302978516, + "learning_rate": 4.987291176086293e-05, + "loss": 5.0642, + "step": 5400 + }, + { + "epoch": 0.032121277000666096, + "grad_norm": 2.267949104309082, + "learning_rate": 4.9872864718013115e-05, + "loss": 5.6835, + "step": 5401 + }, + { + "epoch": 0.03212722428394709, + "grad_norm": 3.1235604286193848, + "learning_rate": 4.987281766648044e-05, + "loss": 6.2094, + "step": 5402 + }, + { + "epoch": 0.03213317156722809, + "grad_norm": 2.494929790496826, + "learning_rate": 4.987277060626493e-05, + "loss": 6.2387, + "step": 5403 + }, + { + "epoch": 0.03213911885050909, + "grad_norm": 2.554422616958618, + "learning_rate": 4.987272353736658e-05, + "loss": 5.9655, + "step": 5404 + }, + { + "epoch": 0.03214506613379008, + "grad_norm": 3.688295841217041, + "learning_rate": 4.987267645978543e-05, + "loss": 6.3994, + "step": 5405 + }, + { + "epoch": 0.032151013417071085, + "grad_norm": 2.773847818374634, + "learning_rate": 4.987262937352147e-05, + "loss": 5.515, + "step": 5406 + }, + { + "epoch": 0.03215696070035208, + "grad_norm": 3.067812204360962, + "learning_rate": 4.987258227857475e-05, + "loss": 5.7388, + "step": 5407 + }, + { + "epoch": 0.032162907983633075, + "grad_norm": 3.0557258129119873, + "learning_rate": 4.987253517494525e-05, + "loss": 6.0334, + "step": 5408 + }, + { + "epoch": 0.03216885526691407, + "grad_norm": 2.2864489555358887, + "learning_rate": 4.9872488062633026e-05, + "loss": 6.2805, + "step": 5409 + }, + { + "epoch": 0.03217480255019507, + "grad_norm": 3.2848916053771973, + "learning_rate": 4.987244094163807e-05, + "loss": 6.4782, + "step": 5410 + }, + { + "epoch": 0.03218074983347607, + "grad_norm": 3.7147631645202637, + "learning_rate": 4.987239381196039e-05, + "loss": 6.6618, + "step": 5411 + }, + { + "epoch": 0.03218669711675706, + "grad_norm": 2.740705966949463, + "learning_rate": 4.9872346673600017e-05, + "loss": 6.0261, + "step": 5412 + }, + { + "epoch": 0.032192644400038065, + "grad_norm": 2.6408498287200928, + "learning_rate": 4.9872299526556965e-05, + "loss": 5.8645, + "step": 5413 + }, + { + "epoch": 0.03219859168331906, + "grad_norm": 2.8298256397247314, + "learning_rate": 4.987225237083125e-05, + "loss": 5.9263, + "step": 5414 + }, + { + "epoch": 0.032204538966600055, + "grad_norm": 2.9417197704315186, + "learning_rate": 4.987220520642289e-05, + "loss": 5.8018, + "step": 5415 + }, + { + "epoch": 0.03221048624988106, + "grad_norm": 3.2862906455993652, + "learning_rate": 4.9872158033331904e-05, + "loss": 5.8429, + "step": 5416 + }, + { + "epoch": 0.03221643353316205, + "grad_norm": 2.7724359035491943, + "learning_rate": 4.9872110851558306e-05, + "loss": 5.9504, + "step": 5417 + }, + { + "epoch": 0.03222238081644305, + "grad_norm": 2.2753829956054688, + "learning_rate": 4.9872063661102106e-05, + "loss": 5.6443, + "step": 5418 + }, + { + "epoch": 0.03222832809972405, + "grad_norm": 2.597649097442627, + "learning_rate": 4.987201646196332e-05, + "loss": 6.4441, + "step": 5419 + }, + { + "epoch": 0.032234275383005044, + "grad_norm": 2.7298800945281982, + "learning_rate": 4.987196925414198e-05, + "loss": 6.2988, + "step": 5420 + }, + { + "epoch": 0.03224022266628604, + "grad_norm": 3.2329537868499756, + "learning_rate": 4.987192203763809e-05, + "loss": 5.8743, + "step": 5421 + }, + { + "epoch": 0.03224616994956704, + "grad_norm": 3.033226251602173, + "learning_rate": 4.987187481245167e-05, + "loss": 5.4863, + "step": 5422 + }, + { + "epoch": 0.032252117232848036, + "grad_norm": 2.7728521823883057, + "learning_rate": 4.987182757858273e-05, + "loss": 5.5722, + "step": 5423 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 2.6083309650421143, + "learning_rate": 4.98717803360313e-05, + "loss": 6.5257, + "step": 5424 + }, + { + "epoch": 0.032264011799410026, + "grad_norm": 2.5422329902648926, + "learning_rate": 4.987173308479738e-05, + "loss": 6.5582, + "step": 5425 + }, + { + "epoch": 0.03226995908269103, + "grad_norm": 2.7634811401367188, + "learning_rate": 4.9871685824881e-05, + "loss": 6.0987, + "step": 5426 + }, + { + "epoch": 0.032275906365972024, + "grad_norm": 3.631476640701294, + "learning_rate": 4.987163855628217e-05, + "loss": 5.8506, + "step": 5427 + }, + { + "epoch": 0.03228185364925302, + "grad_norm": 2.9783661365509033, + "learning_rate": 4.9871591279000904e-05, + "loss": 5.9387, + "step": 5428 + }, + { + "epoch": 0.03228780093253402, + "grad_norm": 2.369645357131958, + "learning_rate": 4.9871543993037225e-05, + "loss": 5.8097, + "step": 5429 + }, + { + "epoch": 0.032293748215815016, + "grad_norm": 2.782055616378784, + "learning_rate": 4.9871496698391155e-05, + "loss": 5.5301, + "step": 5430 + }, + { + "epoch": 0.03229969549909601, + "grad_norm": 2.408205270767212, + "learning_rate": 4.98714493950627e-05, + "loss": 5.6514, + "step": 5431 + }, + { + "epoch": 0.03230564278237701, + "grad_norm": 2.0641589164733887, + "learning_rate": 4.987140208305187e-05, + "loss": 5.6168, + "step": 5432 + }, + { + "epoch": 0.03231159006565801, + "grad_norm": 2.109773874282837, + "learning_rate": 4.987135476235869e-05, + "loss": 5.6678, + "step": 5433 + }, + { + "epoch": 0.032317537348939, + "grad_norm": 2.9809730052948, + "learning_rate": 4.987130743298318e-05, + "loss": 6.0531, + "step": 5434 + }, + { + "epoch": 0.032323484632220005, + "grad_norm": 2.5728509426116943, + "learning_rate": 4.9871260094925365e-05, + "loss": 6.05, + "step": 5435 + }, + { + "epoch": 0.032329431915501, + "grad_norm": 2.477074146270752, + "learning_rate": 4.9871212748185236e-05, + "loss": 6.351, + "step": 5436 + }, + { + "epoch": 0.032335379198781995, + "grad_norm": 2.3485517501831055, + "learning_rate": 4.987116539276283e-05, + "loss": 6.3033, + "step": 5437 + }, + { + "epoch": 0.03234132648206299, + "grad_norm": 2.4214296340942383, + "learning_rate": 4.987111802865816e-05, + "loss": 6.1152, + "step": 5438 + }, + { + "epoch": 0.03234727376534399, + "grad_norm": 3.5628256797790527, + "learning_rate": 4.9871070655871234e-05, + "loss": 5.6502, + "step": 5439 + }, + { + "epoch": 0.03235322104862499, + "grad_norm": 3.190075159072876, + "learning_rate": 4.987102327440208e-05, + "loss": 5.4164, + "step": 5440 + }, + { + "epoch": 0.03235916833190598, + "grad_norm": 2.402754306793213, + "learning_rate": 4.9870975884250696e-05, + "loss": 5.7116, + "step": 5441 + }, + { + "epoch": 0.032365115615186985, + "grad_norm": 2.846653938293457, + "learning_rate": 4.987092848541712e-05, + "loss": 6.1456, + "step": 5442 + }, + { + "epoch": 0.03237106289846798, + "grad_norm": 2.6700549125671387, + "learning_rate": 4.987088107790136e-05, + "loss": 5.9777, + "step": 5443 + }, + { + "epoch": 0.032377010181748975, + "grad_norm": 2.8929460048675537, + "learning_rate": 4.987083366170343e-05, + "loss": 6.1459, + "step": 5444 + }, + { + "epoch": 0.03238295746502998, + "grad_norm": 2.524376153945923, + "learning_rate": 4.987078623682335e-05, + "loss": 6.4341, + "step": 5445 + }, + { + "epoch": 0.03238890474831097, + "grad_norm": 2.0901076793670654, + "learning_rate": 4.987073880326114e-05, + "loss": 6.3968, + "step": 5446 + }, + { + "epoch": 0.03239485203159197, + "grad_norm": 3.0033867359161377, + "learning_rate": 4.9870691361016805e-05, + "loss": 5.8656, + "step": 5447 + }, + { + "epoch": 0.03240079931487297, + "grad_norm": 2.7715492248535156, + "learning_rate": 4.987064391009038e-05, + "loss": 6.1634, + "step": 5448 + }, + { + "epoch": 0.032406746598153964, + "grad_norm": 2.6102347373962402, + "learning_rate": 4.9870596450481855e-05, + "loss": 6.2521, + "step": 5449 + }, + { + "epoch": 0.03241269388143496, + "grad_norm": 2.326253890991211, + "learning_rate": 4.9870548982191265e-05, + "loss": 6.2517, + "step": 5450 + }, + { + "epoch": 0.03241864116471596, + "grad_norm": 2.3012197017669678, + "learning_rate": 4.987050150521863e-05, + "loss": 6.2261, + "step": 5451 + }, + { + "epoch": 0.032424588447996956, + "grad_norm": 2.100337505340576, + "learning_rate": 4.987045401956396e-05, + "loss": 5.6291, + "step": 5452 + }, + { + "epoch": 0.03243053573127795, + "grad_norm": 3.094754219055176, + "learning_rate": 4.987040652522727e-05, + "loss": 5.897, + "step": 5453 + }, + { + "epoch": 0.032436483014558946, + "grad_norm": 2.7406179904937744, + "learning_rate": 4.987035902220857e-05, + "loss": 6.0083, + "step": 5454 + }, + { + "epoch": 0.03244243029783995, + "grad_norm": 2.4106287956237793, + "learning_rate": 4.9870311510507895e-05, + "loss": 5.8538, + "step": 5455 + }, + { + "epoch": 0.032448377581120944, + "grad_norm": 2.7335946559906006, + "learning_rate": 4.987026399012525e-05, + "loss": 5.9181, + "step": 5456 + }, + { + "epoch": 0.03245432486440194, + "grad_norm": 2.796175003051758, + "learning_rate": 4.987021646106064e-05, + "loss": 5.6461, + "step": 5457 + }, + { + "epoch": 0.03246027214768294, + "grad_norm": 3.086470127105713, + "learning_rate": 4.987016892331411e-05, + "loss": 5.6692, + "step": 5458 + }, + { + "epoch": 0.032466219430963936, + "grad_norm": 2.394465923309326, + "learning_rate": 4.9870121376885656e-05, + "loss": 6.3046, + "step": 5459 + }, + { + "epoch": 0.03247216671424493, + "grad_norm": 2.0745291709899902, + "learning_rate": 4.98700738217753e-05, + "loss": 6.0491, + "step": 5460 + }, + { + "epoch": 0.03247811399752593, + "grad_norm": 2.66359281539917, + "learning_rate": 4.987002625798305e-05, + "loss": 5.6468, + "step": 5461 + }, + { + "epoch": 0.03248406128080693, + "grad_norm": 2.392833948135376, + "learning_rate": 4.9869978685508936e-05, + "loss": 5.8421, + "step": 5462 + }, + { + "epoch": 0.03249000856408792, + "grad_norm": 2.671710252761841, + "learning_rate": 4.9869931104352975e-05, + "loss": 5.6892, + "step": 5463 + }, + { + "epoch": 0.032495955847368925, + "grad_norm": 2.7013144493103027, + "learning_rate": 4.986988351451517e-05, + "loss": 5.7911, + "step": 5464 + }, + { + "epoch": 0.03250190313064992, + "grad_norm": 1.926703929901123, + "learning_rate": 4.9869835915995555e-05, + "loss": 5.5492, + "step": 5465 + }, + { + "epoch": 0.032507850413930915, + "grad_norm": 2.5668530464172363, + "learning_rate": 4.986978830879413e-05, + "loss": 5.8949, + "step": 5466 + }, + { + "epoch": 0.03251379769721191, + "grad_norm": 2.555305004119873, + "learning_rate": 4.986974069291092e-05, + "loss": 5.7408, + "step": 5467 + }, + { + "epoch": 0.03251974498049291, + "grad_norm": 2.551226854324341, + "learning_rate": 4.986969306834594e-05, + "loss": 5.7738, + "step": 5468 + }, + { + "epoch": 0.03252569226377391, + "grad_norm": 2.3194847106933594, + "learning_rate": 4.986964543509921e-05, + "loss": 6.2837, + "step": 5469 + }, + { + "epoch": 0.0325316395470549, + "grad_norm": 1.9618690013885498, + "learning_rate": 4.986959779317074e-05, + "loss": 5.9236, + "step": 5470 + }, + { + "epoch": 0.032537586830335904, + "grad_norm": 2.351971387863159, + "learning_rate": 4.986955014256055e-05, + "loss": 5.591, + "step": 5471 + }, + { + "epoch": 0.0325435341136169, + "grad_norm": 2.3772034645080566, + "learning_rate": 4.986950248326866e-05, + "loss": 5.6785, + "step": 5472 + }, + { + "epoch": 0.032549481396897895, + "grad_norm": 2.5764195919036865, + "learning_rate": 4.9869454815295085e-05, + "loss": 5.525, + "step": 5473 + }, + { + "epoch": 0.0325554286801789, + "grad_norm": 2.231048107147217, + "learning_rate": 4.986940713863984e-05, + "loss": 5.6789, + "step": 5474 + }, + { + "epoch": 0.03256137596345989, + "grad_norm": 2.8053946495056152, + "learning_rate": 4.986935945330294e-05, + "loss": 5.6319, + "step": 5475 + }, + { + "epoch": 0.03256732324674089, + "grad_norm": 3.4610519409179688, + "learning_rate": 4.98693117592844e-05, + "loss": 5.9855, + "step": 5476 + }, + { + "epoch": 0.03257327053002189, + "grad_norm": 2.5019664764404297, + "learning_rate": 4.986926405658425e-05, + "loss": 5.9997, + "step": 5477 + }, + { + "epoch": 0.032579217813302884, + "grad_norm": 2.6583313941955566, + "learning_rate": 4.986921634520249e-05, + "loss": 6.3755, + "step": 5478 + }, + { + "epoch": 0.03258516509658388, + "grad_norm": 2.990699291229248, + "learning_rate": 4.986916862513914e-05, + "loss": 5.8932, + "step": 5479 + }, + { + "epoch": 0.03259111237986488, + "grad_norm": 3.282546043395996, + "learning_rate": 4.986912089639423e-05, + "loss": 5.5508, + "step": 5480 + }, + { + "epoch": 0.032597059663145876, + "grad_norm": 3.1012487411499023, + "learning_rate": 4.9869073158967755e-05, + "loss": 5.5567, + "step": 5481 + }, + { + "epoch": 0.03260300694642687, + "grad_norm": 2.141892433166504, + "learning_rate": 4.986902541285975e-05, + "loss": 5.6195, + "step": 5482 + }, + { + "epoch": 0.032608954229707866, + "grad_norm": 2.173670530319214, + "learning_rate": 4.986897765807023e-05, + "loss": 5.6913, + "step": 5483 + }, + { + "epoch": 0.03261490151298887, + "grad_norm": 2.4076435565948486, + "learning_rate": 4.98689298945992e-05, + "loss": 5.8324, + "step": 5484 + }, + { + "epoch": 0.03262084879626986, + "grad_norm": 2.8968818187713623, + "learning_rate": 4.986888212244668e-05, + "loss": 6.0086, + "step": 5485 + }, + { + "epoch": 0.03262679607955086, + "grad_norm": 2.2434191703796387, + "learning_rate": 4.9868834341612696e-05, + "loss": 5.9645, + "step": 5486 + }, + { + "epoch": 0.03263274336283186, + "grad_norm": 1.9683157205581665, + "learning_rate": 4.9868786552097255e-05, + "loss": 5.9173, + "step": 5487 + }, + { + "epoch": 0.032638690646112856, + "grad_norm": 2.369816303253174, + "learning_rate": 4.9868738753900384e-05, + "loss": 6.2728, + "step": 5488 + }, + { + "epoch": 0.03264463792939385, + "grad_norm": 2.1152775287628174, + "learning_rate": 4.986869094702209e-05, + "loss": 6.0474, + "step": 5489 + }, + { + "epoch": 0.03265058521267485, + "grad_norm": 2.3219857215881348, + "learning_rate": 4.9868643131462397e-05, + "loss": 5.7451, + "step": 5490 + }, + { + "epoch": 0.03265653249595585, + "grad_norm": 2.236046075820923, + "learning_rate": 4.986859530722131e-05, + "loss": 5.7775, + "step": 5491 + }, + { + "epoch": 0.03266247977923684, + "grad_norm": 2.3334364891052246, + "learning_rate": 4.986854747429886e-05, + "loss": 5.7429, + "step": 5492 + }, + { + "epoch": 0.032668427062517845, + "grad_norm": 2.5464704036712646, + "learning_rate": 4.986849963269505e-05, + "loss": 5.5781, + "step": 5493 + }, + { + "epoch": 0.03267437434579884, + "grad_norm": 2.104419469833374, + "learning_rate": 4.986845178240991e-05, + "loss": 5.6378, + "step": 5494 + }, + { + "epoch": 0.032680321629079835, + "grad_norm": 2.3115224838256836, + "learning_rate": 4.9868403923443444e-05, + "loss": 5.7617, + "step": 5495 + }, + { + "epoch": 0.03268626891236083, + "grad_norm": 2.3370540142059326, + "learning_rate": 4.9868356055795685e-05, + "loss": 6.1278, + "step": 5496 + }, + { + "epoch": 0.03269221619564183, + "grad_norm": 2.8618736267089844, + "learning_rate": 4.986830817946663e-05, + "loss": 6.0879, + "step": 5497 + }, + { + "epoch": 0.03269816347892283, + "grad_norm": 2.3229949474334717, + "learning_rate": 4.986826029445631e-05, + "loss": 6.0915, + "step": 5498 + }, + { + "epoch": 0.03270411076220382, + "grad_norm": 2.549914598464966, + "learning_rate": 4.986821240076473e-05, + "loss": 6.2375, + "step": 5499 + }, + { + "epoch": 0.032710058045484824, + "grad_norm": 2.595916271209717, + "learning_rate": 4.986816449839192e-05, + "loss": 6.095, + "step": 5500 + }, + { + "epoch": 0.03271600532876582, + "grad_norm": 2.4409420490264893, + "learning_rate": 4.98681165873379e-05, + "loss": 5.353, + "step": 5501 + }, + { + "epoch": 0.032721952612046815, + "grad_norm": 2.550156593322754, + "learning_rate": 4.986806866760266e-05, + "loss": 5.558, + "step": 5502 + }, + { + "epoch": 0.03272789989532782, + "grad_norm": 2.7811737060546875, + "learning_rate": 4.986802073918625e-05, + "loss": 5.7174, + "step": 5503 + }, + { + "epoch": 0.03273384717860881, + "grad_norm": 2.8430123329162598, + "learning_rate": 4.986797280208866e-05, + "loss": 5.5644, + "step": 5504 + }, + { + "epoch": 0.03273979446188981, + "grad_norm": 3.021040201187134, + "learning_rate": 4.986792485630992e-05, + "loss": 5.9451, + "step": 5505 + }, + { + "epoch": 0.03274574174517081, + "grad_norm": 2.69866681098938, + "learning_rate": 4.986787690185005e-05, + "loss": 5.9934, + "step": 5506 + }, + { + "epoch": 0.032751689028451804, + "grad_norm": 2.7202444076538086, + "learning_rate": 4.986782893870906e-05, + "loss": 6.1298, + "step": 5507 + }, + { + "epoch": 0.0327576363117328, + "grad_norm": 2.223405122756958, + "learning_rate": 4.986778096688696e-05, + "loss": 5.8968, + "step": 5508 + }, + { + "epoch": 0.0327635835950138, + "grad_norm": 2.5733680725097656, + "learning_rate": 4.986773298638378e-05, + "loss": 6.0928, + "step": 5509 + }, + { + "epoch": 0.032769530878294796, + "grad_norm": 2.584397554397583, + "learning_rate": 4.986768499719953e-05, + "loss": 5.7879, + "step": 5510 + }, + { + "epoch": 0.03277547816157579, + "grad_norm": 3.160489797592163, + "learning_rate": 4.986763699933423e-05, + "loss": 5.6413, + "step": 5511 + }, + { + "epoch": 0.032781425444856786, + "grad_norm": 2.8224406242370605, + "learning_rate": 4.9867588992787894e-05, + "loss": 6.1476, + "step": 5512 + }, + { + "epoch": 0.03278737272813779, + "grad_norm": 2.2565996646881104, + "learning_rate": 4.986754097756054e-05, + "loss": 6.208, + "step": 5513 + }, + { + "epoch": 0.03279332001141878, + "grad_norm": 2.5425479412078857, + "learning_rate": 4.9867492953652184e-05, + "loss": 5.934, + "step": 5514 + }, + { + "epoch": 0.03279926729469978, + "grad_norm": 2.6598689556121826, + "learning_rate": 4.986744492106284e-05, + "loss": 5.7433, + "step": 5515 + }, + { + "epoch": 0.03280521457798078, + "grad_norm": 2.419388771057129, + "learning_rate": 4.986739687979253e-05, + "loss": 5.378, + "step": 5516 + }, + { + "epoch": 0.032811161861261776, + "grad_norm": 2.72784161567688, + "learning_rate": 4.986734882984127e-05, + "loss": 5.4089, + "step": 5517 + }, + { + "epoch": 0.03281710914454277, + "grad_norm": 3.0592923164367676, + "learning_rate": 4.9867300771209075e-05, + "loss": 5.9573, + "step": 5518 + }, + { + "epoch": 0.03282305642782377, + "grad_norm": 2.7681832313537598, + "learning_rate": 4.9867252703895965e-05, + "loss": 5.5325, + "step": 5519 + }, + { + "epoch": 0.03282900371110477, + "grad_norm": 2.6752777099609375, + "learning_rate": 4.9867204627901946e-05, + "loss": 5.7543, + "step": 5520 + }, + { + "epoch": 0.03283495099438576, + "grad_norm": 2.481203317642212, + "learning_rate": 4.9867156543227046e-05, + "loss": 5.575, + "step": 5521 + }, + { + "epoch": 0.032840898277666765, + "grad_norm": 2.6403908729553223, + "learning_rate": 4.986710844987128e-05, + "loss": 5.4381, + "step": 5522 + }, + { + "epoch": 0.03284684556094776, + "grad_norm": 2.6146085262298584, + "learning_rate": 4.986706034783466e-05, + "loss": 5.8672, + "step": 5523 + }, + { + "epoch": 0.032852792844228755, + "grad_norm": 3.453666925430298, + "learning_rate": 4.986701223711722e-05, + "loss": 5.8353, + "step": 5524 + }, + { + "epoch": 0.03285874012750975, + "grad_norm": 2.511216640472412, + "learning_rate": 4.986696411771895e-05, + "loss": 5.9567, + "step": 5525 + }, + { + "epoch": 0.03286468741079075, + "grad_norm": 2.57395601272583, + "learning_rate": 4.986691598963988e-05, + "loss": 5.6396, + "step": 5526 + }, + { + "epoch": 0.03287063469407175, + "grad_norm": 2.778801441192627, + "learning_rate": 4.986686785288003e-05, + "loss": 6.0237, + "step": 5527 + }, + { + "epoch": 0.03287658197735274, + "grad_norm": 2.5216047763824463, + "learning_rate": 4.986681970743941e-05, + "loss": 6.1305, + "step": 5528 + }, + { + "epoch": 0.032882529260633744, + "grad_norm": 2.5105085372924805, + "learning_rate": 4.986677155331804e-05, + "loss": 6.4951, + "step": 5529 + }, + { + "epoch": 0.03288847654391474, + "grad_norm": 2.4105372428894043, + "learning_rate": 4.9866723390515946e-05, + "loss": 6.291, + "step": 5530 + }, + { + "epoch": 0.032894423827195735, + "grad_norm": 2.740095853805542, + "learning_rate": 4.9866675219033125e-05, + "loss": 5.762, + "step": 5531 + }, + { + "epoch": 0.03290037111047674, + "grad_norm": 2.327892541885376, + "learning_rate": 4.9866627038869605e-05, + "loss": 6.1023, + "step": 5532 + }, + { + "epoch": 0.03290631839375773, + "grad_norm": 2.71732497215271, + "learning_rate": 4.9866578850025414e-05, + "loss": 6.0739, + "step": 5533 + }, + { + "epoch": 0.03291226567703873, + "grad_norm": 2.1895039081573486, + "learning_rate": 4.9866530652500545e-05, + "loss": 5.801, + "step": 5534 + }, + { + "epoch": 0.03291821296031973, + "grad_norm": 2.39670729637146, + "learning_rate": 4.986648244629503e-05, + "loss": 6.0105, + "step": 5535 + }, + { + "epoch": 0.032924160243600724, + "grad_norm": 2.14630126953125, + "learning_rate": 4.986643423140889e-05, + "loss": 5.8457, + "step": 5536 + }, + { + "epoch": 0.03293010752688172, + "grad_norm": 2.111196994781494, + "learning_rate": 4.9866386007842125e-05, + "loss": 6.0804, + "step": 5537 + }, + { + "epoch": 0.03293605481016272, + "grad_norm": 2.8245434761047363, + "learning_rate": 4.986633777559476e-05, + "loss": 6.3152, + "step": 5538 + }, + { + "epoch": 0.032942002093443716, + "grad_norm": 2.3561060428619385, + "learning_rate": 4.9866289534666824e-05, + "loss": 6.286, + "step": 5539 + }, + { + "epoch": 0.03294794937672471, + "grad_norm": 3.21701979637146, + "learning_rate": 4.986624128505832e-05, + "loss": 5.9775, + "step": 5540 + }, + { + "epoch": 0.032953896660005706, + "grad_norm": 3.9414072036743164, + "learning_rate": 4.9866193026769265e-05, + "loss": 5.9413, + "step": 5541 + }, + { + "epoch": 0.03295984394328671, + "grad_norm": 2.7801051139831543, + "learning_rate": 4.986614475979968e-05, + "loss": 5.8642, + "step": 5542 + }, + { + "epoch": 0.0329657912265677, + "grad_norm": 2.7095935344696045, + "learning_rate": 4.986609648414958e-05, + "loss": 5.6952, + "step": 5543 + }, + { + "epoch": 0.0329717385098487, + "grad_norm": 2.5800812244415283, + "learning_rate": 4.986604819981898e-05, + "loss": 6.0285, + "step": 5544 + }, + { + "epoch": 0.0329776857931297, + "grad_norm": 2.6105730533599854, + "learning_rate": 4.9865999906807904e-05, + "loss": 5.6683, + "step": 5545 + }, + { + "epoch": 0.032983633076410696, + "grad_norm": 2.635570764541626, + "learning_rate": 4.9865951605116366e-05, + "loss": 5.9092, + "step": 5546 + }, + { + "epoch": 0.03298958035969169, + "grad_norm": 2.3708200454711914, + "learning_rate": 4.9865903294744373e-05, + "loss": 6.0034, + "step": 5547 + }, + { + "epoch": 0.03299552764297269, + "grad_norm": 2.437201499938965, + "learning_rate": 4.986585497569196e-05, + "loss": 6.2587, + "step": 5548 + }, + { + "epoch": 0.03300147492625369, + "grad_norm": 2.076016426086426, + "learning_rate": 4.9865806647959126e-05, + "loss": 6.358, + "step": 5549 + }, + { + "epoch": 0.03300742220953468, + "grad_norm": 1.8261257410049438, + "learning_rate": 4.98657583115459e-05, + "loss": 6.0431, + "step": 5550 + }, + { + "epoch": 0.033013369492815685, + "grad_norm": 2.8339858055114746, + "learning_rate": 4.98657099664523e-05, + "loss": 5.7956, + "step": 5551 + }, + { + "epoch": 0.03301931677609668, + "grad_norm": 2.7288596630096436, + "learning_rate": 4.986566161267833e-05, + "loss": 5.7092, + "step": 5552 + }, + { + "epoch": 0.033025264059377675, + "grad_norm": 2.7197329998016357, + "learning_rate": 4.986561325022402e-05, + "loss": 5.649, + "step": 5553 + }, + { + "epoch": 0.03303121134265867, + "grad_norm": 2.6161739826202393, + "learning_rate": 4.986556487908937e-05, + "loss": 5.6935, + "step": 5554 + }, + { + "epoch": 0.03303715862593967, + "grad_norm": 2.695068597793579, + "learning_rate": 4.986551649927441e-05, + "loss": 5.6901, + "step": 5555 + }, + { + "epoch": 0.03304310590922067, + "grad_norm": 3.0315186977386475, + "learning_rate": 4.986546811077917e-05, + "loss": 5.6317, + "step": 5556 + }, + { + "epoch": 0.03304905319250166, + "grad_norm": 2.3597543239593506, + "learning_rate": 4.986541971360364e-05, + "loss": 5.8129, + "step": 5557 + }, + { + "epoch": 0.033055000475782664, + "grad_norm": 2.8090550899505615, + "learning_rate": 4.986537130774785e-05, + "loss": 6.4427, + "step": 5558 + }, + { + "epoch": 0.03306094775906366, + "grad_norm": 3.4232771396636963, + "learning_rate": 4.986532289321182e-05, + "loss": 6.5737, + "step": 5559 + }, + { + "epoch": 0.033066895042344654, + "grad_norm": 2.1425294876098633, + "learning_rate": 4.986527446999556e-05, + "loss": 6.2395, + "step": 5560 + }, + { + "epoch": 0.033072842325625657, + "grad_norm": 2.5348880290985107, + "learning_rate": 4.986522603809909e-05, + "loss": 6.0425, + "step": 5561 + }, + { + "epoch": 0.03307878960890665, + "grad_norm": 3.0824179649353027, + "learning_rate": 4.986517759752242e-05, + "loss": 5.8785, + "step": 5562 + }, + { + "epoch": 0.03308473689218765, + "grad_norm": 2.297706365585327, + "learning_rate": 4.986512914826558e-05, + "loss": 5.8989, + "step": 5563 + }, + { + "epoch": 0.03309068417546865, + "grad_norm": 2.866257667541504, + "learning_rate": 4.986508069032858e-05, + "loss": 5.8905, + "step": 5564 + }, + { + "epoch": 0.033096631458749644, + "grad_norm": 2.2450008392333984, + "learning_rate": 4.9865032223711436e-05, + "loss": 6.3302, + "step": 5565 + }, + { + "epoch": 0.03310257874203064, + "grad_norm": 2.235558271408081, + "learning_rate": 4.9864983748414166e-05, + "loss": 6.4235, + "step": 5566 + }, + { + "epoch": 0.03310852602531164, + "grad_norm": 2.5197713375091553, + "learning_rate": 4.986493526443679e-05, + "loss": 6.3999, + "step": 5567 + }, + { + "epoch": 0.033114473308592636, + "grad_norm": 2.5716195106506348, + "learning_rate": 4.986488677177932e-05, + "loss": 6.0258, + "step": 5568 + }, + { + "epoch": 0.03312042059187363, + "grad_norm": 2.468663454055786, + "learning_rate": 4.986483827044177e-05, + "loss": 6.7553, + "step": 5569 + }, + { + "epoch": 0.033126367875154626, + "grad_norm": 2.4334170818328857, + "learning_rate": 4.986478976042417e-05, + "loss": 6.4722, + "step": 5570 + }, + { + "epoch": 0.03313231515843563, + "grad_norm": 2.234487533569336, + "learning_rate": 4.986474124172652e-05, + "loss": 5.7158, + "step": 5571 + }, + { + "epoch": 0.03313826244171662, + "grad_norm": 2.8017537593841553, + "learning_rate": 4.9864692714348857e-05, + "loss": 5.9552, + "step": 5572 + }, + { + "epoch": 0.03314420972499762, + "grad_norm": 3.171354055404663, + "learning_rate": 4.986464417829118e-05, + "loss": 6.027, + "step": 5573 + }, + { + "epoch": 0.03315015700827862, + "grad_norm": 2.890169620513916, + "learning_rate": 4.9864595633553516e-05, + "loss": 6.2768, + "step": 5574 + }, + { + "epoch": 0.033156104291559615, + "grad_norm": 3.010934829711914, + "learning_rate": 4.986454708013587e-05, + "loss": 6.4054, + "step": 5575 + }, + { + "epoch": 0.03316205157484061, + "grad_norm": 2.143833875656128, + "learning_rate": 4.9864498518038274e-05, + "loss": 6.3771, + "step": 5576 + }, + { + "epoch": 0.03316799885812161, + "grad_norm": 2.2067418098449707, + "learning_rate": 4.986444994726074e-05, + "loss": 6.0158, + "step": 5577 + }, + { + "epoch": 0.03317394614140261, + "grad_norm": 2.3396403789520264, + "learning_rate": 4.986440136780328e-05, + "loss": 6.4286, + "step": 5578 + }, + { + "epoch": 0.0331798934246836, + "grad_norm": 2.8305866718292236, + "learning_rate": 4.9864352779665915e-05, + "loss": 5.7804, + "step": 5579 + }, + { + "epoch": 0.033185840707964605, + "grad_norm": 2.748194456100464, + "learning_rate": 4.9864304182848664e-05, + "loss": 6.1711, + "step": 5580 + }, + { + "epoch": 0.0331917879912456, + "grad_norm": 2.329761505126953, + "learning_rate": 4.9864255577351534e-05, + "loss": 6.2722, + "step": 5581 + }, + { + "epoch": 0.033197735274526595, + "grad_norm": 2.4633524417877197, + "learning_rate": 4.986420696317457e-05, + "loss": 6.1349, + "step": 5582 + }, + { + "epoch": 0.03320368255780759, + "grad_norm": 1.8909802436828613, + "learning_rate": 4.986415834031775e-05, + "loss": 6.2181, + "step": 5583 + }, + { + "epoch": 0.03320962984108859, + "grad_norm": 2.1794517040252686, + "learning_rate": 4.9864109708781104e-05, + "loss": 6.2808, + "step": 5584 + }, + { + "epoch": 0.03321557712436959, + "grad_norm": 2.1766669750213623, + "learning_rate": 4.986406106856466e-05, + "loss": 6.3004, + "step": 5585 + }, + { + "epoch": 0.03322152440765058, + "grad_norm": 2.27526593208313, + "learning_rate": 4.986401241966844e-05, + "loss": 5.9225, + "step": 5586 + }, + { + "epoch": 0.033227471690931584, + "grad_norm": 3.2843096256256104, + "learning_rate": 4.986396376209244e-05, + "loss": 5.8364, + "step": 5587 + }, + { + "epoch": 0.03323341897421258, + "grad_norm": 2.509831666946411, + "learning_rate": 4.9863915095836685e-05, + "loss": 5.6958, + "step": 5588 + }, + { + "epoch": 0.033239366257493574, + "grad_norm": 2.5235815048217773, + "learning_rate": 4.98638664209012e-05, + "loss": 5.4937, + "step": 5589 + }, + { + "epoch": 0.033245313540774576, + "grad_norm": 2.918334484100342, + "learning_rate": 4.986381773728599e-05, + "loss": 5.8284, + "step": 5590 + }, + { + "epoch": 0.03325126082405557, + "grad_norm": 2.8091490268707275, + "learning_rate": 4.986376904499108e-05, + "loss": 5.8126, + "step": 5591 + }, + { + "epoch": 0.03325720810733657, + "grad_norm": 2.555173635482788, + "learning_rate": 4.986372034401649e-05, + "loss": 5.6393, + "step": 5592 + }, + { + "epoch": 0.03326315539061757, + "grad_norm": 2.6366164684295654, + "learning_rate": 4.986367163436223e-05, + "loss": 6.6675, + "step": 5593 + }, + { + "epoch": 0.033269102673898564, + "grad_norm": 2.5691051483154297, + "learning_rate": 4.9863622916028316e-05, + "loss": 6.5808, + "step": 5594 + }, + { + "epoch": 0.03327504995717956, + "grad_norm": 2.239384889602661, + "learning_rate": 4.986357418901477e-05, + "loss": 6.0191, + "step": 5595 + }, + { + "epoch": 0.03328099724046056, + "grad_norm": 2.3877806663513184, + "learning_rate": 4.9863525453321614e-05, + "loss": 5.7429, + "step": 5596 + }, + { + "epoch": 0.033286944523741556, + "grad_norm": 2.559633731842041, + "learning_rate": 4.9863476708948846e-05, + "loss": 5.4866, + "step": 5597 + }, + { + "epoch": 0.03329289180702255, + "grad_norm": 3.7681171894073486, + "learning_rate": 4.98634279558965e-05, + "loss": 5.6139, + "step": 5598 + }, + { + "epoch": 0.033298839090303546, + "grad_norm": 3.999264717102051, + "learning_rate": 4.9863379194164594e-05, + "loss": 5.6031, + "step": 5599 + }, + { + "epoch": 0.03330478637358455, + "grad_norm": 3.1031601428985596, + "learning_rate": 4.986333042375313e-05, + "loss": 5.5397, + "step": 5600 + }, + { + "epoch": 0.03331073365686554, + "grad_norm": 3.104998826980591, + "learning_rate": 4.986328164466214e-05, + "loss": 5.4274, + "step": 5601 + }, + { + "epoch": 0.03331668094014654, + "grad_norm": 2.9426207542419434, + "learning_rate": 4.986323285689163e-05, + "loss": 5.5859, + "step": 5602 + }, + { + "epoch": 0.03332262822342754, + "grad_norm": 2.6912827491760254, + "learning_rate": 4.986318406044163e-05, + "loss": 5.7375, + "step": 5603 + }, + { + "epoch": 0.033328575506708535, + "grad_norm": 4.394237041473389, + "learning_rate": 4.9863135255312145e-05, + "loss": 5.8246, + "step": 5604 + }, + { + "epoch": 0.03333452278998953, + "grad_norm": 2.812197685241699, + "learning_rate": 4.986308644150319e-05, + "loss": 5.6263, + "step": 5605 + }, + { + "epoch": 0.03334047007327053, + "grad_norm": 3.1969878673553467, + "learning_rate": 4.98630376190148e-05, + "loss": 5.4174, + "step": 5606 + }, + { + "epoch": 0.03334641735655153, + "grad_norm": 2.6018595695495605, + "learning_rate": 4.9862988787846975e-05, + "loss": 5.3917, + "step": 5607 + }, + { + "epoch": 0.03335236463983252, + "grad_norm": 2.5274007320404053, + "learning_rate": 4.986293994799974e-05, + "loss": 5.4252, + "step": 5608 + }, + { + "epoch": 0.033358311923113525, + "grad_norm": 2.57043194770813, + "learning_rate": 4.9862891099473105e-05, + "loss": 5.5321, + "step": 5609 + }, + { + "epoch": 0.03336425920639452, + "grad_norm": 3.4353785514831543, + "learning_rate": 4.986284224226709e-05, + "loss": 5.6599, + "step": 5610 + }, + { + "epoch": 0.033370206489675515, + "grad_norm": 3.308945894241333, + "learning_rate": 4.986279337638172e-05, + "loss": 5.8668, + "step": 5611 + }, + { + "epoch": 0.03337615377295652, + "grad_norm": 2.789703607559204, + "learning_rate": 4.9862744501817006e-05, + "loss": 5.8352, + "step": 5612 + }, + { + "epoch": 0.03338210105623751, + "grad_norm": 1.9887118339538574, + "learning_rate": 4.986269561857296e-05, + "loss": 5.7527, + "step": 5613 + }, + { + "epoch": 0.03338804833951851, + "grad_norm": 2.5447990894317627, + "learning_rate": 4.986264672664961e-05, + "loss": 5.5539, + "step": 5614 + }, + { + "epoch": 0.0333939956227995, + "grad_norm": 2.2903668880462646, + "learning_rate": 4.9862597826046965e-05, + "loss": 5.4555, + "step": 5615 + }, + { + "epoch": 0.033399942906080504, + "grad_norm": 3.1669414043426514, + "learning_rate": 4.986254891676504e-05, + "loss": 5.6852, + "step": 5616 + }, + { + "epoch": 0.0334058901893615, + "grad_norm": 3.7491395473480225, + "learning_rate": 4.986249999880386e-05, + "loss": 5.682, + "step": 5617 + }, + { + "epoch": 0.033411837472642494, + "grad_norm": 3.0548582077026367, + "learning_rate": 4.986245107216343e-05, + "loss": 5.7844, + "step": 5618 + }, + { + "epoch": 0.033417784755923496, + "grad_norm": 2.628957509994507, + "learning_rate": 4.986240213684378e-05, + "loss": 5.5646, + "step": 5619 + }, + { + "epoch": 0.03342373203920449, + "grad_norm": 2.050936460494995, + "learning_rate": 4.986235319284492e-05, + "loss": 5.7187, + "step": 5620 + }, + { + "epoch": 0.03342967932248549, + "grad_norm": 2.2839999198913574, + "learning_rate": 4.986230424016688e-05, + "loss": 5.6613, + "step": 5621 + }, + { + "epoch": 0.03343562660576649, + "grad_norm": 2.177778959274292, + "learning_rate": 4.986225527880966e-05, + "loss": 5.7205, + "step": 5622 + }, + { + "epoch": 0.033441573889047484, + "grad_norm": 2.1690266132354736, + "learning_rate": 4.9862206308773286e-05, + "loss": 5.4344, + "step": 5623 + }, + { + "epoch": 0.03344752117232848, + "grad_norm": 2.0134127140045166, + "learning_rate": 4.9862157330057766e-05, + "loss": 5.7872, + "step": 5624 + }, + { + "epoch": 0.03345346845560948, + "grad_norm": 2.0246710777282715, + "learning_rate": 4.986210834266313e-05, + "loss": 5.3291, + "step": 5625 + }, + { + "epoch": 0.033459415738890476, + "grad_norm": 2.020939350128174, + "learning_rate": 4.986205934658939e-05, + "loss": 5.3966, + "step": 5626 + }, + { + "epoch": 0.03346536302217147, + "grad_norm": 2.3261308670043945, + "learning_rate": 4.986201034183655e-05, + "loss": 5.4667, + "step": 5627 + }, + { + "epoch": 0.033471310305452466, + "grad_norm": 2.135641574859619, + "learning_rate": 4.9861961328404646e-05, + "loss": 5.4925, + "step": 5628 + }, + { + "epoch": 0.03347725758873347, + "grad_norm": 2.3122894763946533, + "learning_rate": 4.986191230629369e-05, + "loss": 5.6665, + "step": 5629 + }, + { + "epoch": 0.03348320487201446, + "grad_norm": 2.4461214542388916, + "learning_rate": 4.98618632755037e-05, + "loss": 5.8442, + "step": 5630 + }, + { + "epoch": 0.03348915215529546, + "grad_norm": 2.189009189605713, + "learning_rate": 4.9861814236034685e-05, + "loss": 5.5793, + "step": 5631 + }, + { + "epoch": 0.03349509943857646, + "grad_norm": 2.1961586475372314, + "learning_rate": 4.986176518788667e-05, + "loss": 5.5364, + "step": 5632 + }, + { + "epoch": 0.033501046721857455, + "grad_norm": 2.120177745819092, + "learning_rate": 4.986171613105967e-05, + "loss": 5.4042, + "step": 5633 + }, + { + "epoch": 0.03350699400513845, + "grad_norm": 1.9021252393722534, + "learning_rate": 4.9861667065553696e-05, + "loss": 5.2665, + "step": 5634 + }, + { + "epoch": 0.03351294128841945, + "grad_norm": 1.8944766521453857, + "learning_rate": 4.986161799136878e-05, + "loss": 5.3853, + "step": 5635 + }, + { + "epoch": 0.03351888857170045, + "grad_norm": 2.059847354888916, + "learning_rate": 4.9861568908504916e-05, + "loss": 5.3046, + "step": 5636 + }, + { + "epoch": 0.03352483585498144, + "grad_norm": 2.1350111961364746, + "learning_rate": 4.9861519816962155e-05, + "loss": 5.3684, + "step": 5637 + }, + { + "epoch": 0.033530783138262445, + "grad_norm": 2.0733792781829834, + "learning_rate": 4.986147071674048e-05, + "loss": 5.4581, + "step": 5638 + }, + { + "epoch": 0.03353673042154344, + "grad_norm": 2.0736827850341797, + "learning_rate": 4.986142160783993e-05, + "loss": 5.7019, + "step": 5639 + }, + { + "epoch": 0.033542677704824435, + "grad_norm": 2.1903107166290283, + "learning_rate": 4.986137249026051e-05, + "loss": 5.4353, + "step": 5640 + }, + { + "epoch": 0.03354862498810544, + "grad_norm": 2.2678940296173096, + "learning_rate": 4.9861323364002244e-05, + "loss": 5.4951, + "step": 5641 + }, + { + "epoch": 0.03355457227138643, + "grad_norm": 3.590702772140503, + "learning_rate": 4.9861274229065145e-05, + "loss": 6.1522, + "step": 5642 + }, + { + "epoch": 0.03356051955466743, + "grad_norm": 2.0955893993377686, + "learning_rate": 4.9861225085449224e-05, + "loss": 5.3544, + "step": 5643 + }, + { + "epoch": 0.03356646683794842, + "grad_norm": 1.9370301961898804, + "learning_rate": 4.986117593315452e-05, + "loss": 5.4732, + "step": 5644 + }, + { + "epoch": 0.033572414121229424, + "grad_norm": 2.141752243041992, + "learning_rate": 4.986112677218103e-05, + "loss": 5.5768, + "step": 5645 + }, + { + "epoch": 0.03357836140451042, + "grad_norm": 1.9236360788345337, + "learning_rate": 4.986107760252878e-05, + "loss": 5.7641, + "step": 5646 + }, + { + "epoch": 0.033584308687791414, + "grad_norm": 1.8353725671768188, + "learning_rate": 4.9861028424197785e-05, + "loss": 5.8011, + "step": 5647 + }, + { + "epoch": 0.033590255971072416, + "grad_norm": 2.0918078422546387, + "learning_rate": 4.9860979237188055e-05, + "loss": 5.6862, + "step": 5648 + }, + { + "epoch": 0.03359620325435341, + "grad_norm": 2.2244462966918945, + "learning_rate": 4.986093004149962e-05, + "loss": 5.472, + "step": 5649 + }, + { + "epoch": 0.033602150537634407, + "grad_norm": 2.1517422199249268, + "learning_rate": 4.9860880837132495e-05, + "loss": 5.3655, + "step": 5650 + }, + { + "epoch": 0.03360809782091541, + "grad_norm": 2.241863489151001, + "learning_rate": 4.986083162408669e-05, + "loss": 5.5385, + "step": 5651 + }, + { + "epoch": 0.033614045104196404, + "grad_norm": 2.458171844482422, + "learning_rate": 4.986078240236222e-05, + "loss": 5.5531, + "step": 5652 + }, + { + "epoch": 0.0336199923874774, + "grad_norm": 2.2601864337921143, + "learning_rate": 4.986073317195911e-05, + "loss": 5.9313, + "step": 5653 + }, + { + "epoch": 0.0336259396707584, + "grad_norm": 2.243647575378418, + "learning_rate": 4.986068393287738e-05, + "loss": 5.4064, + "step": 5654 + }, + { + "epoch": 0.033631886954039396, + "grad_norm": 2.283515453338623, + "learning_rate": 4.986063468511704e-05, + "loss": 5.295, + "step": 5655 + }, + { + "epoch": 0.03363783423732039, + "grad_norm": 2.701770305633545, + "learning_rate": 4.986058542867811e-05, + "loss": 5.8548, + "step": 5656 + }, + { + "epoch": 0.033643781520601386, + "grad_norm": 2.8186864852905273, + "learning_rate": 4.98605361635606e-05, + "loss": 5.378, + "step": 5657 + }, + { + "epoch": 0.03364972880388239, + "grad_norm": 2.6508500576019287, + "learning_rate": 4.9860486889764536e-05, + "loss": 5.469, + "step": 5658 + }, + { + "epoch": 0.03365567608716338, + "grad_norm": 2.3984878063201904, + "learning_rate": 4.986043760728994e-05, + "loss": 5.3978, + "step": 5659 + }, + { + "epoch": 0.03366162337044438, + "grad_norm": 3.64663028717041, + "learning_rate": 4.9860388316136814e-05, + "loss": 5.502, + "step": 5660 + }, + { + "epoch": 0.03366757065372538, + "grad_norm": 3.1112046241760254, + "learning_rate": 4.986033901630519e-05, + "loss": 5.7347, + "step": 5661 + }, + { + "epoch": 0.033673517937006375, + "grad_norm": 2.619877338409424, + "learning_rate": 4.9860289707795074e-05, + "loss": 6.2099, + "step": 5662 + }, + { + "epoch": 0.03367946522028737, + "grad_norm": 2.0318470001220703, + "learning_rate": 4.986024039060648e-05, + "loss": 6.246, + "step": 5663 + }, + { + "epoch": 0.03368541250356837, + "grad_norm": 2.1484673023223877, + "learning_rate": 4.986019106473945e-05, + "loss": 6.1689, + "step": 5664 + }, + { + "epoch": 0.03369135978684937, + "grad_norm": 2.6159844398498535, + "learning_rate": 4.9860141730193974e-05, + "loss": 5.8217, + "step": 5665 + }, + { + "epoch": 0.03369730707013036, + "grad_norm": 2.5019965171813965, + "learning_rate": 4.9860092386970084e-05, + "loss": 6.1138, + "step": 5666 + }, + { + "epoch": 0.033703254353411365, + "grad_norm": 2.962315797805786, + "learning_rate": 4.9860043035067785e-05, + "loss": 5.7057, + "step": 5667 + }, + { + "epoch": 0.03370920163669236, + "grad_norm": 2.455721139907837, + "learning_rate": 4.9859993674487106e-05, + "loss": 5.6203, + "step": 5668 + }, + { + "epoch": 0.033715148919973355, + "grad_norm": 2.432368278503418, + "learning_rate": 4.9859944305228066e-05, + "loss": 6.2337, + "step": 5669 + }, + { + "epoch": 0.03372109620325436, + "grad_norm": 2.3222782611846924, + "learning_rate": 4.985989492729067e-05, + "loss": 6.2845, + "step": 5670 + }, + { + "epoch": 0.03372704348653535, + "grad_norm": 2.107440948486328, + "learning_rate": 4.985984554067494e-05, + "loss": 6.2404, + "step": 5671 + }, + { + "epoch": 0.03373299076981635, + "grad_norm": 1.9450268745422363, + "learning_rate": 4.98597961453809e-05, + "loss": 6.1679, + "step": 5672 + }, + { + "epoch": 0.03373893805309734, + "grad_norm": 1.7591795921325684, + "learning_rate": 4.9859746741408554e-05, + "loss": 6.3425, + "step": 5673 + }, + { + "epoch": 0.033744885336378344, + "grad_norm": 2.009420871734619, + "learning_rate": 4.985969732875794e-05, + "loss": 6.3607, + "step": 5674 + }, + { + "epoch": 0.03375083261965934, + "grad_norm": 2.097215175628662, + "learning_rate": 4.9859647907429054e-05, + "loss": 6.2009, + "step": 5675 + }, + { + "epoch": 0.033756779902940334, + "grad_norm": 1.7670379877090454, + "learning_rate": 4.985959847742192e-05, + "loss": 5.935, + "step": 5676 + }, + { + "epoch": 0.033762727186221336, + "grad_norm": 2.052022695541382, + "learning_rate": 4.985954903873656e-05, + "loss": 5.4054, + "step": 5677 + }, + { + "epoch": 0.03376867446950233, + "grad_norm": 1.9225167036056519, + "learning_rate": 4.985949959137298e-05, + "loss": 5.6905, + "step": 5678 + }, + { + "epoch": 0.033774621752783326, + "grad_norm": 2.4080653190612793, + "learning_rate": 4.985945013533122e-05, + "loss": 6.5566, + "step": 5679 + }, + { + "epoch": 0.03378056903606433, + "grad_norm": 2.8340251445770264, + "learning_rate": 4.985940067061128e-05, + "loss": 6.3556, + "step": 5680 + }, + { + "epoch": 0.033786516319345324, + "grad_norm": 2.2872672080993652, + "learning_rate": 4.985935119721317e-05, + "loss": 6.1806, + "step": 5681 + }, + { + "epoch": 0.03379246360262632, + "grad_norm": 3.309203863143921, + "learning_rate": 4.985930171513692e-05, + "loss": 6.1766, + "step": 5682 + }, + { + "epoch": 0.03379841088590732, + "grad_norm": 2.936709403991699, + "learning_rate": 4.985925222438255e-05, + "loss": 5.907, + "step": 5683 + }, + { + "epoch": 0.033804358169188316, + "grad_norm": 2.3226964473724365, + "learning_rate": 4.985920272495007e-05, + "loss": 5.5734, + "step": 5684 + }, + { + "epoch": 0.03381030545246931, + "grad_norm": 2.3053154945373535, + "learning_rate": 4.98591532168395e-05, + "loss": 6.5688, + "step": 5685 + }, + { + "epoch": 0.033816252735750306, + "grad_norm": 2.2494077682495117, + "learning_rate": 4.985910370005086e-05, + "loss": 6.3539, + "step": 5686 + }, + { + "epoch": 0.03382220001903131, + "grad_norm": 1.9559924602508545, + "learning_rate": 4.9859054174584155e-05, + "loss": 6.2015, + "step": 5687 + }, + { + "epoch": 0.0338281473023123, + "grad_norm": 2.7915425300598145, + "learning_rate": 4.985900464043942e-05, + "loss": 5.7426, + "step": 5688 + }, + { + "epoch": 0.0338340945855933, + "grad_norm": 2.448496103286743, + "learning_rate": 4.985895509761665e-05, + "loss": 6.2697, + "step": 5689 + }, + { + "epoch": 0.0338400418688743, + "grad_norm": 1.7736696004867554, + "learning_rate": 4.9858905546115885e-05, + "loss": 6.5513, + "step": 5690 + }, + { + "epoch": 0.033845989152155295, + "grad_norm": 1.668285608291626, + "learning_rate": 4.9858855985937136e-05, + "loss": 6.0179, + "step": 5691 + }, + { + "epoch": 0.03385193643543629, + "grad_norm": 2.157799243927002, + "learning_rate": 4.985880641708042e-05, + "loss": 6.1863, + "step": 5692 + }, + { + "epoch": 0.03385788371871729, + "grad_norm": 2.2437758445739746, + "learning_rate": 4.985875683954574e-05, + "loss": 6.128, + "step": 5693 + }, + { + "epoch": 0.03386383100199829, + "grad_norm": 2.8323628902435303, + "learning_rate": 4.9858707253333124e-05, + "loss": 6.2746, + "step": 5694 + }, + { + "epoch": 0.03386977828527928, + "grad_norm": 2.270587205886841, + "learning_rate": 4.98586576584426e-05, + "loss": 6.1002, + "step": 5695 + }, + { + "epoch": 0.033875725568560285, + "grad_norm": 1.9165533781051636, + "learning_rate": 4.985860805487417e-05, + "loss": 5.7016, + "step": 5696 + }, + { + "epoch": 0.03388167285184128, + "grad_norm": 2.230407953262329, + "learning_rate": 4.985855844262786e-05, + "loss": 5.9649, + "step": 5697 + }, + { + "epoch": 0.033887620135122275, + "grad_norm": 2.5094211101531982, + "learning_rate": 4.985850882170368e-05, + "loss": 6.0184, + "step": 5698 + }, + { + "epoch": 0.03389356741840328, + "grad_norm": 2.6195943355560303, + "learning_rate": 4.9858459192101656e-05, + "loss": 5.8501, + "step": 5699 + }, + { + "epoch": 0.03389951470168427, + "grad_norm": 2.747486114501953, + "learning_rate": 4.9858409553821794e-05, + "loss": 5.7066, + "step": 5700 + }, + { + "epoch": 0.03390546198496527, + "grad_norm": 2.154109001159668, + "learning_rate": 4.985835990686413e-05, + "loss": 6.1072, + "step": 5701 + }, + { + "epoch": 0.03391140926824626, + "grad_norm": 2.4329216480255127, + "learning_rate": 4.9858310251228655e-05, + "loss": 5.9552, + "step": 5702 + }, + { + "epoch": 0.033917356551527264, + "grad_norm": 2.4760935306549072, + "learning_rate": 4.9858260586915405e-05, + "loss": 5.9023, + "step": 5703 + }, + { + "epoch": 0.03392330383480826, + "grad_norm": 2.400474786758423, + "learning_rate": 4.9858210913924397e-05, + "loss": 6.1688, + "step": 5704 + }, + { + "epoch": 0.033929251118089254, + "grad_norm": 2.402930498123169, + "learning_rate": 4.9858161232255644e-05, + "loss": 6.0776, + "step": 5705 + }, + { + "epoch": 0.033935198401370256, + "grad_norm": 2.0408313274383545, + "learning_rate": 4.985811154190916e-05, + "loss": 6.1841, + "step": 5706 + }, + { + "epoch": 0.03394114568465125, + "grad_norm": 1.889190912246704, + "learning_rate": 4.9858061842884976e-05, + "loss": 5.9689, + "step": 5707 + }, + { + "epoch": 0.033947092967932246, + "grad_norm": 2.2231624126434326, + "learning_rate": 4.9858012135183086e-05, + "loss": 6.0009, + "step": 5708 + }, + { + "epoch": 0.03395304025121325, + "grad_norm": 2.0229554176330566, + "learning_rate": 4.985796241880353e-05, + "loss": 6.3237, + "step": 5709 + }, + { + "epoch": 0.033958987534494244, + "grad_norm": 2.0570971965789795, + "learning_rate": 4.985791269374631e-05, + "loss": 6.3104, + "step": 5710 + }, + { + "epoch": 0.03396493481777524, + "grad_norm": 2.584663152694702, + "learning_rate": 4.9857862960011454e-05, + "loss": 5.8493, + "step": 5711 + }, + { + "epoch": 0.03397088210105624, + "grad_norm": 1.7870328426361084, + "learning_rate": 4.985781321759897e-05, + "loss": 6.2321, + "step": 5712 + }, + { + "epoch": 0.033976829384337236, + "grad_norm": 2.201756000518799, + "learning_rate": 4.9857763466508886e-05, + "loss": 6.1936, + "step": 5713 + }, + { + "epoch": 0.03398277666761823, + "grad_norm": 2.4489476680755615, + "learning_rate": 4.9857713706741216e-05, + "loss": 6.11, + "step": 5714 + }, + { + "epoch": 0.033988723950899226, + "grad_norm": 2.007643461227417, + "learning_rate": 4.9857663938295964e-05, + "loss": 6.288, + "step": 5715 + }, + { + "epoch": 0.03399467123418023, + "grad_norm": 1.8299764394760132, + "learning_rate": 4.9857614161173165e-05, + "loss": 6.0719, + "step": 5716 + }, + { + "epoch": 0.03400061851746122, + "grad_norm": 1.7619884014129639, + "learning_rate": 4.985756437537283e-05, + "loss": 6.1418, + "step": 5717 + }, + { + "epoch": 0.03400656580074222, + "grad_norm": 1.9445360898971558, + "learning_rate": 4.985751458089498e-05, + "loss": 6.1223, + "step": 5718 + }, + { + "epoch": 0.03401251308402322, + "grad_norm": 2.2320010662078857, + "learning_rate": 4.985746477773962e-05, + "loss": 5.5239, + "step": 5719 + }, + { + "epoch": 0.034018460367304215, + "grad_norm": 2.631765365600586, + "learning_rate": 4.985741496590678e-05, + "loss": 5.6348, + "step": 5720 + }, + { + "epoch": 0.03402440765058521, + "grad_norm": 2.4715576171875, + "learning_rate": 4.985736514539647e-05, + "loss": 5.9608, + "step": 5721 + }, + { + "epoch": 0.03403035493386621, + "grad_norm": 2.633188009262085, + "learning_rate": 4.985731531620871e-05, + "loss": 5.602, + "step": 5722 + }, + { + "epoch": 0.03403630221714721, + "grad_norm": 2.4303035736083984, + "learning_rate": 4.9857265478343526e-05, + "loss": 5.495, + "step": 5723 + }, + { + "epoch": 0.0340422495004282, + "grad_norm": 2.463447332382202, + "learning_rate": 4.985721563180092e-05, + "loss": 5.4633, + "step": 5724 + }, + { + "epoch": 0.034048196783709204, + "grad_norm": 2.349965810775757, + "learning_rate": 4.985716577658092e-05, + "loss": 6.0067, + "step": 5725 + }, + { + "epoch": 0.0340541440669902, + "grad_norm": 1.8741793632507324, + "learning_rate": 4.985711591268354e-05, + "loss": 5.8658, + "step": 5726 + }, + { + "epoch": 0.034060091350271195, + "grad_norm": 1.957612156867981, + "learning_rate": 4.98570660401088e-05, + "loss": 6.2016, + "step": 5727 + }, + { + "epoch": 0.0340660386335522, + "grad_norm": 2.4883556365966797, + "learning_rate": 4.985701615885671e-05, + "loss": 6.3056, + "step": 5728 + }, + { + "epoch": 0.03407198591683319, + "grad_norm": 2.6959800720214844, + "learning_rate": 4.98569662689273e-05, + "loss": 5.7267, + "step": 5729 + }, + { + "epoch": 0.03407793320011419, + "grad_norm": 2.579802989959717, + "learning_rate": 4.985691637032057e-05, + "loss": 5.2467, + "step": 5730 + }, + { + "epoch": 0.03408388048339518, + "grad_norm": 2.136262893676758, + "learning_rate": 4.985686646303656e-05, + "loss": 5.7071, + "step": 5731 + }, + { + "epoch": 0.034089827766676184, + "grad_norm": 2.1442244052886963, + "learning_rate": 4.985681654707526e-05, + "loss": 6.3961, + "step": 5732 + }, + { + "epoch": 0.03409577504995718, + "grad_norm": 2.164340019226074, + "learning_rate": 4.9856766622436714e-05, + "loss": 6.2455, + "step": 5733 + }, + { + "epoch": 0.034101722333238174, + "grad_norm": 2.199791193008423, + "learning_rate": 4.985671668912092e-05, + "loss": 5.8804, + "step": 5734 + }, + { + "epoch": 0.034107669616519176, + "grad_norm": 2.0359933376312256, + "learning_rate": 4.9856666747127905e-05, + "loss": 6.359, + "step": 5735 + }, + { + "epoch": 0.03411361689980017, + "grad_norm": 2.17069935798645, + "learning_rate": 4.985661679645769e-05, + "loss": 6.6736, + "step": 5736 + }, + { + "epoch": 0.034119564183081166, + "grad_norm": 1.9114634990692139, + "learning_rate": 4.9856566837110275e-05, + "loss": 5.9629, + "step": 5737 + }, + { + "epoch": 0.03412551146636217, + "grad_norm": 2.2872474193573, + "learning_rate": 4.9856516869085704e-05, + "loss": 5.5856, + "step": 5738 + }, + { + "epoch": 0.03413145874964316, + "grad_norm": 2.0800466537475586, + "learning_rate": 4.9856466892383965e-05, + "loss": 5.7732, + "step": 5739 + }, + { + "epoch": 0.03413740603292416, + "grad_norm": 2.37117338180542, + "learning_rate": 4.98564169070051e-05, + "loss": 5.667, + "step": 5740 + }, + { + "epoch": 0.03414335331620516, + "grad_norm": 2.0559768676757812, + "learning_rate": 4.985636691294911e-05, + "loss": 5.4874, + "step": 5741 + }, + { + "epoch": 0.034149300599486156, + "grad_norm": 2.0097250938415527, + "learning_rate": 4.9856316910216024e-05, + "loss": 5.5469, + "step": 5742 + }, + { + "epoch": 0.03415524788276715, + "grad_norm": 2.430954933166504, + "learning_rate": 4.985626689880586e-05, + "loss": 5.7635, + "step": 5743 + }, + { + "epoch": 0.034161195166048146, + "grad_norm": 2.1000874042510986, + "learning_rate": 4.985621687871862e-05, + "loss": 5.7102, + "step": 5744 + }, + { + "epoch": 0.03416714244932915, + "grad_norm": 2.2048611640930176, + "learning_rate": 4.9856166849954336e-05, + "loss": 5.8156, + "step": 5745 + }, + { + "epoch": 0.03417308973261014, + "grad_norm": 2.145538330078125, + "learning_rate": 4.985611681251302e-05, + "loss": 5.9101, + "step": 5746 + }, + { + "epoch": 0.03417903701589114, + "grad_norm": 2.86169695854187, + "learning_rate": 4.9856066766394685e-05, + "loss": 5.7358, + "step": 5747 + }, + { + "epoch": 0.03418498429917214, + "grad_norm": 2.0648229122161865, + "learning_rate": 4.985601671159936e-05, + "loss": 6.0529, + "step": 5748 + }, + { + "epoch": 0.034190931582453135, + "grad_norm": 2.191251039505005, + "learning_rate": 4.985596664812706e-05, + "loss": 6.1999, + "step": 5749 + }, + { + "epoch": 0.03419687886573413, + "grad_norm": 2.556640148162842, + "learning_rate": 4.985591657597779e-05, + "loss": 6.0671, + "step": 5750 + }, + { + "epoch": 0.03420282614901513, + "grad_norm": 2.1796281337738037, + "learning_rate": 4.985586649515158e-05, + "loss": 6.1537, + "step": 5751 + }, + { + "epoch": 0.03420877343229613, + "grad_norm": 2.1884169578552246, + "learning_rate": 4.985581640564845e-05, + "loss": 5.7667, + "step": 5752 + }, + { + "epoch": 0.03421472071557712, + "grad_norm": 2.3836331367492676, + "learning_rate": 4.9855766307468404e-05, + "loss": 5.6608, + "step": 5753 + }, + { + "epoch": 0.034220667998858124, + "grad_norm": 2.0464322566986084, + "learning_rate": 4.985571620061147e-05, + "loss": 5.5317, + "step": 5754 + }, + { + "epoch": 0.03422661528213912, + "grad_norm": 2.3275644779205322, + "learning_rate": 4.9855666085077654e-05, + "loss": 5.8611, + "step": 5755 + }, + { + "epoch": 0.034232562565420115, + "grad_norm": 2.7268338203430176, + "learning_rate": 4.9855615960867e-05, + "loss": 5.6323, + "step": 5756 + }, + { + "epoch": 0.03423850984870112, + "grad_norm": 2.578986406326294, + "learning_rate": 4.985556582797949e-05, + "loss": 5.6108, + "step": 5757 + }, + { + "epoch": 0.03424445713198211, + "grad_norm": 2.4127955436706543, + "learning_rate": 4.985551568641516e-05, + "loss": 5.7054, + "step": 5758 + }, + { + "epoch": 0.03425040441526311, + "grad_norm": 2.1954357624053955, + "learning_rate": 4.985546553617404e-05, + "loss": 6.194, + "step": 5759 + }, + { + "epoch": 0.0342563516985441, + "grad_norm": 2.43851900100708, + "learning_rate": 4.985541537725612e-05, + "loss": 5.9067, + "step": 5760 + }, + { + "epoch": 0.034262298981825104, + "grad_norm": 2.0910801887512207, + "learning_rate": 4.9855365209661445e-05, + "loss": 6.1017, + "step": 5761 + }, + { + "epoch": 0.0342682462651061, + "grad_norm": 1.9936187267303467, + "learning_rate": 4.985531503339e-05, + "loss": 6.1239, + "step": 5762 + }, + { + "epoch": 0.034274193548387094, + "grad_norm": 2.0663299560546875, + "learning_rate": 4.985526484844183e-05, + "loss": 6.0514, + "step": 5763 + }, + { + "epoch": 0.034280140831668096, + "grad_norm": 2.4357266426086426, + "learning_rate": 4.985521465481695e-05, + "loss": 5.3695, + "step": 5764 + }, + { + "epoch": 0.03428608811494909, + "grad_norm": 2.12214994430542, + "learning_rate": 4.985516445251537e-05, + "loss": 5.5531, + "step": 5765 + }, + { + "epoch": 0.034292035398230086, + "grad_norm": 2.731661319732666, + "learning_rate": 4.9855114241537105e-05, + "loss": 6.2403, + "step": 5766 + }, + { + "epoch": 0.03429798268151109, + "grad_norm": 2.0668931007385254, + "learning_rate": 4.985506402188217e-05, + "loss": 6.0873, + "step": 5767 + }, + { + "epoch": 0.03430392996479208, + "grad_norm": 2.3165833950042725, + "learning_rate": 4.98550137935506e-05, + "loss": 5.9365, + "step": 5768 + }, + { + "epoch": 0.03430987724807308, + "grad_norm": 1.8637720346450806, + "learning_rate": 4.98549635565424e-05, + "loss": 6.0837, + "step": 5769 + }, + { + "epoch": 0.03431582453135408, + "grad_norm": 2.1689205169677734, + "learning_rate": 4.985491331085758e-05, + "loss": 5.703, + "step": 5770 + }, + { + "epoch": 0.034321771814635076, + "grad_norm": 2.245283365249634, + "learning_rate": 4.985486305649618e-05, + "loss": 6.0134, + "step": 5771 + }, + { + "epoch": 0.03432771909791607, + "grad_norm": 2.2685303688049316, + "learning_rate": 4.98548127934582e-05, + "loss": 5.279, + "step": 5772 + }, + { + "epoch": 0.034333666381197066, + "grad_norm": 2.376253128051758, + "learning_rate": 4.985476252174365e-05, + "loss": 5.5812, + "step": 5773 + }, + { + "epoch": 0.03433961366447807, + "grad_norm": 2.2636559009552, + "learning_rate": 4.985471224135257e-05, + "loss": 5.6906, + "step": 5774 + }, + { + "epoch": 0.03434556094775906, + "grad_norm": 2.22103214263916, + "learning_rate": 4.9854661952284965e-05, + "loss": 6.2066, + "step": 5775 + }, + { + "epoch": 0.03435150823104006, + "grad_norm": 2.308610439300537, + "learning_rate": 4.985461165454085e-05, + "loss": 6.1582, + "step": 5776 + }, + { + "epoch": 0.03435745551432106, + "grad_norm": 1.9191935062408447, + "learning_rate": 4.985456134812026e-05, + "loss": 5.4587, + "step": 5777 + }, + { + "epoch": 0.034363402797602055, + "grad_norm": 2.3127100467681885, + "learning_rate": 4.9854511033023184e-05, + "loss": 5.3375, + "step": 5778 + }, + { + "epoch": 0.03436935008088305, + "grad_norm": 2.4817371368408203, + "learning_rate": 4.985446070924966e-05, + "loss": 5.4961, + "step": 5779 + }, + { + "epoch": 0.03437529736416405, + "grad_norm": 2.0995922088623047, + "learning_rate": 4.9854410376799695e-05, + "loss": 5.7676, + "step": 5780 + }, + { + "epoch": 0.03438124464744505, + "grad_norm": 2.261229991912842, + "learning_rate": 4.985436003567332e-05, + "loss": 5.4446, + "step": 5781 + }, + { + "epoch": 0.03438719193072604, + "grad_norm": 2.275536060333252, + "learning_rate": 4.985430968587055e-05, + "loss": 5.4297, + "step": 5782 + }, + { + "epoch": 0.034393139214007044, + "grad_norm": 2.3733773231506348, + "learning_rate": 4.985425932739138e-05, + "loss": 5.7658, + "step": 5783 + }, + { + "epoch": 0.03439908649728804, + "grad_norm": 2.201716184616089, + "learning_rate": 4.985420896023586e-05, + "loss": 5.5502, + "step": 5784 + }, + { + "epoch": 0.034405033780569035, + "grad_norm": 2.1012730598449707, + "learning_rate": 4.9854158584403985e-05, + "loss": 5.7199, + "step": 5785 + }, + { + "epoch": 0.03441098106385004, + "grad_norm": 2.065568685531616, + "learning_rate": 4.985410819989579e-05, + "loss": 6.1547, + "step": 5786 + }, + { + "epoch": 0.03441692834713103, + "grad_norm": 1.9217867851257324, + "learning_rate": 4.9854057806711275e-05, + "loss": 6.2556, + "step": 5787 + }, + { + "epoch": 0.03442287563041203, + "grad_norm": 2.028602123260498, + "learning_rate": 4.985400740485047e-05, + "loss": 5.9347, + "step": 5788 + }, + { + "epoch": 0.03442882291369302, + "grad_norm": 2.002855062484741, + "learning_rate": 4.9853956994313376e-05, + "loss": 5.3966, + "step": 5789 + }, + { + "epoch": 0.034434770196974024, + "grad_norm": 2.3740642070770264, + "learning_rate": 4.985390657510003e-05, + "loss": 5.7801, + "step": 5790 + }, + { + "epoch": 0.03444071748025502, + "grad_norm": 2.1149635314941406, + "learning_rate": 4.9853856147210444e-05, + "loss": 5.6504, + "step": 5791 + }, + { + "epoch": 0.034446664763536014, + "grad_norm": 2.3519630432128906, + "learning_rate": 4.985380571064463e-05, + "loss": 5.9172, + "step": 5792 + }, + { + "epoch": 0.034452612046817016, + "grad_norm": 2.38930082321167, + "learning_rate": 4.985375526540261e-05, + "loss": 5.6196, + "step": 5793 + }, + { + "epoch": 0.03445855933009801, + "grad_norm": 2.245596408843994, + "learning_rate": 4.98537048114844e-05, + "loss": 5.5034, + "step": 5794 + }, + { + "epoch": 0.034464506613379006, + "grad_norm": 2.272158622741699, + "learning_rate": 4.985365434889002e-05, + "loss": 5.5867, + "step": 5795 + }, + { + "epoch": 0.03447045389666001, + "grad_norm": 2.2090094089508057, + "learning_rate": 4.9853603877619485e-05, + "loss": 5.68, + "step": 5796 + }, + { + "epoch": 0.034476401179941, + "grad_norm": 2.0545220375061035, + "learning_rate": 4.985355339767281e-05, + "loss": 5.8382, + "step": 5797 + }, + { + "epoch": 0.034482348463222, + "grad_norm": 2.143134593963623, + "learning_rate": 4.985350290905003e-05, + "loss": 5.5753, + "step": 5798 + }, + { + "epoch": 0.034488295746503, + "grad_norm": 2.3938257694244385, + "learning_rate": 4.985345241175114e-05, + "loss": 5.7545, + "step": 5799 + }, + { + "epoch": 0.034494243029783996, + "grad_norm": 2.132998466491699, + "learning_rate": 4.985340190577616e-05, + "loss": 5.5477, + "step": 5800 + }, + { + "epoch": 0.03450019031306499, + "grad_norm": 3.141417980194092, + "learning_rate": 4.9853351391125126e-05, + "loss": 5.3509, + "step": 5801 + }, + { + "epoch": 0.034506137596345986, + "grad_norm": 2.4776933193206787, + "learning_rate": 4.9853300867798034e-05, + "loss": 6.1052, + "step": 5802 + }, + { + "epoch": 0.03451208487962699, + "grad_norm": 2.1782073974609375, + "learning_rate": 4.985325033579492e-05, + "loss": 5.9599, + "step": 5803 + }, + { + "epoch": 0.03451803216290798, + "grad_norm": 2.2631704807281494, + "learning_rate": 4.9853199795115794e-05, + "loss": 5.534, + "step": 5804 + }, + { + "epoch": 0.03452397944618898, + "grad_norm": 2.140612840652466, + "learning_rate": 4.985314924576066e-05, + "loss": 5.7479, + "step": 5805 + }, + { + "epoch": 0.03452992672946998, + "grad_norm": 2.726651668548584, + "learning_rate": 4.9853098687729563e-05, + "loss": 5.4639, + "step": 5806 + }, + { + "epoch": 0.034535874012750975, + "grad_norm": 1.852423071861267, + "learning_rate": 4.985304812102249e-05, + "loss": 5.4209, + "step": 5807 + }, + { + "epoch": 0.03454182129603197, + "grad_norm": 2.5236833095550537, + "learning_rate": 4.9852997545639485e-05, + "loss": 5.9653, + "step": 5808 + }, + { + "epoch": 0.03454776857931297, + "grad_norm": 2.2740652561187744, + "learning_rate": 4.985294696158056e-05, + "loss": 5.9457, + "step": 5809 + }, + { + "epoch": 0.03455371586259397, + "grad_norm": 2.931777000427246, + "learning_rate": 4.9852896368845715e-05, + "loss": 5.6709, + "step": 5810 + }, + { + "epoch": 0.03455966314587496, + "grad_norm": 2.6981759071350098, + "learning_rate": 4.9852845767434986e-05, + "loss": 5.1747, + "step": 5811 + }, + { + "epoch": 0.034565610429155964, + "grad_norm": 2.2675211429595947, + "learning_rate": 4.985279515734839e-05, + "loss": 5.2393, + "step": 5812 + }, + { + "epoch": 0.03457155771243696, + "grad_norm": 2.535473346710205, + "learning_rate": 4.985274453858594e-05, + "loss": 6.2184, + "step": 5813 + }, + { + "epoch": 0.034577504995717954, + "grad_norm": 2.8692495822906494, + "learning_rate": 4.985269391114765e-05, + "loss": 5.2557, + "step": 5814 + }, + { + "epoch": 0.034583452278998957, + "grad_norm": 2.908472776412964, + "learning_rate": 4.985264327503354e-05, + "loss": 5.1559, + "step": 5815 + }, + { + "epoch": 0.03458939956227995, + "grad_norm": 2.3630192279815674, + "learning_rate": 4.985259263024363e-05, + "loss": 5.3159, + "step": 5816 + }, + { + "epoch": 0.03459534684556095, + "grad_norm": 2.1287102699279785, + "learning_rate": 4.9852541976777933e-05, + "loss": 5.2069, + "step": 5817 + }, + { + "epoch": 0.03460129412884194, + "grad_norm": 2.751567840576172, + "learning_rate": 4.985249131463647e-05, + "loss": 5.6561, + "step": 5818 + }, + { + "epoch": 0.034607241412122944, + "grad_norm": 2.505608081817627, + "learning_rate": 4.985244064381927e-05, + "loss": 5.9708, + "step": 5819 + }, + { + "epoch": 0.03461318869540394, + "grad_norm": 2.351593255996704, + "learning_rate": 4.9852389964326337e-05, + "loss": 5.9046, + "step": 5820 + }, + { + "epoch": 0.034619135978684934, + "grad_norm": 2.3037939071655273, + "learning_rate": 4.985233927615769e-05, + "loss": 6.0069, + "step": 5821 + }, + { + "epoch": 0.034625083261965936, + "grad_norm": 2.2482705116271973, + "learning_rate": 4.985228857931334e-05, + "loss": 5.9492, + "step": 5822 + }, + { + "epoch": 0.03463103054524693, + "grad_norm": 2.23640513420105, + "learning_rate": 4.985223787379332e-05, + "loss": 5.6631, + "step": 5823 + }, + { + "epoch": 0.034636977828527926, + "grad_norm": 2.710275411605835, + "learning_rate": 4.985218715959764e-05, + "loss": 5.5961, + "step": 5824 + }, + { + "epoch": 0.03464292511180893, + "grad_norm": 2.7220160961151123, + "learning_rate": 4.9852136436726313e-05, + "loss": 5.6922, + "step": 5825 + }, + { + "epoch": 0.03464887239508992, + "grad_norm": 2.4542758464813232, + "learning_rate": 4.985208570517937e-05, + "loss": 5.4742, + "step": 5826 + }, + { + "epoch": 0.03465481967837092, + "grad_norm": 2.7492685317993164, + "learning_rate": 4.9852034964956816e-05, + "loss": 5.4598, + "step": 5827 + }, + { + "epoch": 0.03466076696165192, + "grad_norm": 2.757937431335449, + "learning_rate": 4.9851984216058677e-05, + "loss": 6.1865, + "step": 5828 + }, + { + "epoch": 0.034666714244932915, + "grad_norm": 2.835890531539917, + "learning_rate": 4.985193345848497e-05, + "loss": 5.3368, + "step": 5829 + }, + { + "epoch": 0.03467266152821391, + "grad_norm": 2.694884777069092, + "learning_rate": 4.98518826922357e-05, + "loss": 5.3654, + "step": 5830 + }, + { + "epoch": 0.03467860881149491, + "grad_norm": 2.443784236907959, + "learning_rate": 4.98518319173109e-05, + "loss": 5.7879, + "step": 5831 + }, + { + "epoch": 0.03468455609477591, + "grad_norm": 2.0198488235473633, + "learning_rate": 4.985178113371058e-05, + "loss": 5.766, + "step": 5832 + }, + { + "epoch": 0.0346905033780569, + "grad_norm": 2.8718788623809814, + "learning_rate": 4.985173034143476e-05, + "loss": 5.5506, + "step": 5833 + }, + { + "epoch": 0.0346964506613379, + "grad_norm": 2.4353652000427246, + "learning_rate": 4.9851679540483455e-05, + "loss": 5.7139, + "step": 5834 + }, + { + "epoch": 0.0347023979446189, + "grad_norm": 1.9376598596572876, + "learning_rate": 4.985162873085669e-05, + "loss": 6.2326, + "step": 5835 + }, + { + "epoch": 0.034708345227899895, + "grad_norm": 2.2225289344787598, + "learning_rate": 4.985157791255448e-05, + "loss": 5.5997, + "step": 5836 + }, + { + "epoch": 0.03471429251118089, + "grad_norm": 2.011493682861328, + "learning_rate": 4.985152708557684e-05, + "loss": 5.6882, + "step": 5837 + }, + { + "epoch": 0.03472023979446189, + "grad_norm": 1.8679020404815674, + "learning_rate": 4.985147624992378e-05, + "loss": 5.5427, + "step": 5838 + }, + { + "epoch": 0.03472618707774289, + "grad_norm": 1.9470884799957275, + "learning_rate": 4.9851425405595334e-05, + "loss": 5.5957, + "step": 5839 + }, + { + "epoch": 0.03473213436102388, + "grad_norm": 2.0765669345855713, + "learning_rate": 4.985137455259151e-05, + "loss": 5.4416, + "step": 5840 + }, + { + "epoch": 0.034738081644304884, + "grad_norm": 2.0521979331970215, + "learning_rate": 4.985132369091233e-05, + "loss": 5.4641, + "step": 5841 + }, + { + "epoch": 0.03474402892758588, + "grad_norm": 1.7439172267913818, + "learning_rate": 4.985127282055781e-05, + "loss": 5.1998, + "step": 5842 + }, + { + "epoch": 0.034749976210866874, + "grad_norm": 1.7347313165664673, + "learning_rate": 4.985122194152797e-05, + "loss": 5.2392, + "step": 5843 + }, + { + "epoch": 0.034755923494147876, + "grad_norm": 1.7362169027328491, + "learning_rate": 4.985117105382282e-05, + "loss": 5.1769, + "step": 5844 + }, + { + "epoch": 0.03476187077742887, + "grad_norm": 1.7468090057373047, + "learning_rate": 4.985112015744239e-05, + "loss": 5.3915, + "step": 5845 + }, + { + "epoch": 0.03476781806070987, + "grad_norm": 1.8685250282287598, + "learning_rate": 4.985106925238668e-05, + "loss": 5.6119, + "step": 5846 + }, + { + "epoch": 0.03477376534399086, + "grad_norm": 1.9595715999603271, + "learning_rate": 4.985101833865572e-05, + "loss": 5.5536, + "step": 5847 + }, + { + "epoch": 0.034779712627271864, + "grad_norm": 1.8454965353012085, + "learning_rate": 4.985096741624953e-05, + "loss": 5.8127, + "step": 5848 + }, + { + "epoch": 0.03478565991055286, + "grad_norm": 1.9182006120681763, + "learning_rate": 4.985091648516813e-05, + "loss": 5.8807, + "step": 5849 + }, + { + "epoch": 0.034791607193833854, + "grad_norm": 2.042923927307129, + "learning_rate": 4.9850865545411526e-05, + "loss": 5.9013, + "step": 5850 + }, + { + "epoch": 0.034797554477114856, + "grad_norm": 2.341055393218994, + "learning_rate": 4.985081459697974e-05, + "loss": 6.214, + "step": 5851 + }, + { + "epoch": 0.03480350176039585, + "grad_norm": 2.026190996170044, + "learning_rate": 4.985076363987279e-05, + "loss": 5.3693, + "step": 5852 + }, + { + "epoch": 0.034809449043676846, + "grad_norm": 2.045264482498169, + "learning_rate": 4.98507126740907e-05, + "loss": 5.6325, + "step": 5853 + }, + { + "epoch": 0.03481539632695785, + "grad_norm": 2.2710580825805664, + "learning_rate": 4.985066169963348e-05, + "loss": 5.8355, + "step": 5854 + }, + { + "epoch": 0.03482134361023884, + "grad_norm": 1.8813494443893433, + "learning_rate": 4.985061071650115e-05, + "loss": 5.5849, + "step": 5855 + }, + { + "epoch": 0.03482729089351984, + "grad_norm": 2.2177746295928955, + "learning_rate": 4.985055972469373e-05, + "loss": 5.5518, + "step": 5856 + }, + { + "epoch": 0.03483323817680084, + "grad_norm": 1.897653341293335, + "learning_rate": 4.9850508724211234e-05, + "loss": 5.6035, + "step": 5857 + }, + { + "epoch": 0.034839185460081835, + "grad_norm": 2.349821090698242, + "learning_rate": 4.985045771505369e-05, + "loss": 5.8181, + "step": 5858 + }, + { + "epoch": 0.03484513274336283, + "grad_norm": 1.900538682937622, + "learning_rate": 4.98504066972211e-05, + "loss": 5.2751, + "step": 5859 + }, + { + "epoch": 0.03485108002664383, + "grad_norm": 2.1902174949645996, + "learning_rate": 4.985035567071349e-05, + "loss": 5.2709, + "step": 5860 + }, + { + "epoch": 0.03485702730992483, + "grad_norm": 1.7833307981491089, + "learning_rate": 4.9850304635530884e-05, + "loss": 5.2104, + "step": 5861 + }, + { + "epoch": 0.03486297459320582, + "grad_norm": 2.017603874206543, + "learning_rate": 4.985025359167329e-05, + "loss": 5.2257, + "step": 5862 + }, + { + "epoch": 0.03486892187648682, + "grad_norm": 1.9828181266784668, + "learning_rate": 4.9850202539140724e-05, + "loss": 5.2303, + "step": 5863 + }, + { + "epoch": 0.03487486915976782, + "grad_norm": 2.0273706912994385, + "learning_rate": 4.9850151477933216e-05, + "loss": 5.1743, + "step": 5864 + }, + { + "epoch": 0.034880816443048815, + "grad_norm": 1.9634721279144287, + "learning_rate": 4.985010040805077e-05, + "loss": 5.1541, + "step": 5865 + }, + { + "epoch": 0.03488676372632981, + "grad_norm": 2.2766621112823486, + "learning_rate": 4.985004932949342e-05, + "loss": 5.1372, + "step": 5866 + }, + { + "epoch": 0.03489271100961081, + "grad_norm": 2.0768795013427734, + "learning_rate": 4.984999824226117e-05, + "loss": 5.2567, + "step": 5867 + }, + { + "epoch": 0.03489865829289181, + "grad_norm": 1.8665590286254883, + "learning_rate": 4.984994714635404e-05, + "loss": 5.1356, + "step": 5868 + }, + { + "epoch": 0.0349046055761728, + "grad_norm": 2.056450843811035, + "learning_rate": 4.984989604177205e-05, + "loss": 5.1667, + "step": 5869 + }, + { + "epoch": 0.034910552859453804, + "grad_norm": 2.1191976070404053, + "learning_rate": 4.984984492851522e-05, + "loss": 5.1898, + "step": 5870 + }, + { + "epoch": 0.0349165001427348, + "grad_norm": 2.049450397491455, + "learning_rate": 4.9849793806583566e-05, + "loss": 5.1568, + "step": 5871 + }, + { + "epoch": 0.034922447426015794, + "grad_norm": 1.79837167263031, + "learning_rate": 4.984974267597711e-05, + "loss": 5.1288, + "step": 5872 + }, + { + "epoch": 0.034928394709296796, + "grad_norm": 1.959088683128357, + "learning_rate": 4.984969153669585e-05, + "loss": 5.1063, + "step": 5873 + }, + { + "epoch": 0.03493434199257779, + "grad_norm": 1.9193873405456543, + "learning_rate": 4.9849640388739836e-05, + "loss": 5.1608, + "step": 5874 + }, + { + "epoch": 0.03494028927585879, + "grad_norm": 1.6684316396713257, + "learning_rate": 4.9849589232109065e-05, + "loss": 5.0926, + "step": 5875 + }, + { + "epoch": 0.03494623655913978, + "grad_norm": 1.8383700847625732, + "learning_rate": 4.984953806680356e-05, + "loss": 5.0474, + "step": 5876 + }, + { + "epoch": 0.034952183842420784, + "grad_norm": 2.233779191970825, + "learning_rate": 4.984948689282333e-05, + "loss": 5.5046, + "step": 5877 + }, + { + "epoch": 0.03495813112570178, + "grad_norm": 2.2267282009124756, + "learning_rate": 4.9849435710168415e-05, + "loss": 5.6235, + "step": 5878 + }, + { + "epoch": 0.034964078408982774, + "grad_norm": 1.7933586835861206, + "learning_rate": 4.9849384518838804e-05, + "loss": 5.0968, + "step": 5879 + }, + { + "epoch": 0.034970025692263776, + "grad_norm": 2.0050230026245117, + "learning_rate": 4.984933331883453e-05, + "loss": 4.9789, + "step": 5880 + }, + { + "epoch": 0.03497597297554477, + "grad_norm": 1.7422970533370972, + "learning_rate": 4.9849282110155627e-05, + "loss": 5.1556, + "step": 5881 + }, + { + "epoch": 0.034981920258825766, + "grad_norm": 2.1242151260375977, + "learning_rate": 4.984923089280209e-05, + "loss": 5.7039, + "step": 5882 + }, + { + "epoch": 0.03498786754210677, + "grad_norm": 1.8656666278839111, + "learning_rate": 4.9849179666773934e-05, + "loss": 5.7185, + "step": 5883 + }, + { + "epoch": 0.03499381482538776, + "grad_norm": 1.6954991817474365, + "learning_rate": 4.984912843207119e-05, + "loss": 5.5686, + "step": 5884 + }, + { + "epoch": 0.03499976210866876, + "grad_norm": 1.7692710161209106, + "learning_rate": 4.984907718869387e-05, + "loss": 5.4058, + "step": 5885 + }, + { + "epoch": 0.03500570939194976, + "grad_norm": 1.8496350049972534, + "learning_rate": 4.9849025936642004e-05, + "loss": 5.5037, + "step": 5886 + }, + { + "epoch": 0.035011656675230755, + "grad_norm": 2.0124640464782715, + "learning_rate": 4.984897467591559e-05, + "loss": 5.6146, + "step": 5887 + }, + { + "epoch": 0.03501760395851175, + "grad_norm": 2.5522549152374268, + "learning_rate": 4.984892340651466e-05, + "loss": 5.6403, + "step": 5888 + }, + { + "epoch": 0.03502355124179275, + "grad_norm": 2.2127344608306885, + "learning_rate": 4.9848872128439224e-05, + "loss": 5.6277, + "step": 5889 + }, + { + "epoch": 0.03502949852507375, + "grad_norm": 2.578322172164917, + "learning_rate": 4.9848820841689305e-05, + "loss": 5.849, + "step": 5890 + }, + { + "epoch": 0.03503544580835474, + "grad_norm": 1.8083957433700562, + "learning_rate": 4.9848769546264915e-05, + "loss": 5.4407, + "step": 5891 + }, + { + "epoch": 0.03504139309163574, + "grad_norm": 1.885387897491455, + "learning_rate": 4.984871824216609e-05, + "loss": 5.4486, + "step": 5892 + }, + { + "epoch": 0.03504734037491674, + "grad_norm": 1.9450737237930298, + "learning_rate": 4.9848666929392817e-05, + "loss": 5.4196, + "step": 5893 + }, + { + "epoch": 0.035053287658197735, + "grad_norm": 1.9072003364562988, + "learning_rate": 4.984861560794514e-05, + "loss": 5.6293, + "step": 5894 + }, + { + "epoch": 0.03505923494147873, + "grad_norm": 2.064192056655884, + "learning_rate": 4.984856427782307e-05, + "loss": 5.7105, + "step": 5895 + }, + { + "epoch": 0.03506518222475973, + "grad_norm": 2.0101802349090576, + "learning_rate": 4.984851293902663e-05, + "loss": 5.5623, + "step": 5896 + }, + { + "epoch": 0.03507112950804073, + "grad_norm": 1.9813642501831055, + "learning_rate": 4.984846159155581e-05, + "loss": 5.653, + "step": 5897 + }, + { + "epoch": 0.03507707679132172, + "grad_norm": 1.9213227033615112, + "learning_rate": 4.9848410235410666e-05, + "loss": 5.5194, + "step": 5898 + }, + { + "epoch": 0.035083024074602724, + "grad_norm": 1.803076982498169, + "learning_rate": 4.984835887059119e-05, + "loss": 5.4101, + "step": 5899 + }, + { + "epoch": 0.03508897135788372, + "grad_norm": 1.8419232368469238, + "learning_rate": 4.9848307497097414e-05, + "loss": 5.7329, + "step": 5900 + }, + { + "epoch": 0.035094918641164714, + "grad_norm": 1.9258531332015991, + "learning_rate": 4.984825611492935e-05, + "loss": 5.559, + "step": 5901 + }, + { + "epoch": 0.035100865924445716, + "grad_norm": 1.869529366493225, + "learning_rate": 4.984820472408701e-05, + "loss": 5.5682, + "step": 5902 + }, + { + "epoch": 0.03510681320772671, + "grad_norm": 1.753365159034729, + "learning_rate": 4.984815332457042e-05, + "loss": 5.6241, + "step": 5903 + }, + { + "epoch": 0.035112760491007707, + "grad_norm": 1.6581326723098755, + "learning_rate": 4.98481019163796e-05, + "loss": 5.4752, + "step": 5904 + }, + { + "epoch": 0.0351187077742887, + "grad_norm": 1.9120882749557495, + "learning_rate": 4.9848050499514565e-05, + "loss": 5.5678, + "step": 5905 + }, + { + "epoch": 0.035124655057569704, + "grad_norm": 1.9840329885482788, + "learning_rate": 4.984799907397533e-05, + "loss": 5.5369, + "step": 5906 + }, + { + "epoch": 0.0351306023408507, + "grad_norm": 1.7970712184906006, + "learning_rate": 4.9847947639761914e-05, + "loss": 5.5857, + "step": 5907 + }, + { + "epoch": 0.035136549624131694, + "grad_norm": 1.7219270467758179, + "learning_rate": 4.984789619687435e-05, + "loss": 5.609, + "step": 5908 + }, + { + "epoch": 0.035142496907412696, + "grad_norm": 1.8945105075836182, + "learning_rate": 4.984784474531262e-05, + "loss": 5.5893, + "step": 5909 + }, + { + "epoch": 0.03514844419069369, + "grad_norm": 1.8570127487182617, + "learning_rate": 4.984779328507678e-05, + "loss": 5.4556, + "step": 5910 + }, + { + "epoch": 0.035154391473974686, + "grad_norm": 1.9291017055511475, + "learning_rate": 4.984774181616683e-05, + "loss": 5.476, + "step": 5911 + }, + { + "epoch": 0.03516033875725569, + "grad_norm": 1.9138598442077637, + "learning_rate": 4.984769033858278e-05, + "loss": 5.6329, + "step": 5912 + }, + { + "epoch": 0.03516628604053668, + "grad_norm": 1.9484977722167969, + "learning_rate": 4.9847638852324665e-05, + "loss": 5.5305, + "step": 5913 + }, + { + "epoch": 0.03517223332381768, + "grad_norm": 1.7338584661483765, + "learning_rate": 4.984758735739249e-05, + "loss": 5.4842, + "step": 5914 + }, + { + "epoch": 0.03517818060709868, + "grad_norm": 1.8625437021255493, + "learning_rate": 4.984753585378629e-05, + "loss": 5.3696, + "step": 5915 + }, + { + "epoch": 0.035184127890379675, + "grad_norm": 1.798782229423523, + "learning_rate": 4.984748434150607e-05, + "loss": 5.5803, + "step": 5916 + }, + { + "epoch": 0.03519007517366067, + "grad_norm": 2.0596888065338135, + "learning_rate": 4.9847432820551845e-05, + "loss": 5.3274, + "step": 5917 + }, + { + "epoch": 0.03519602245694167, + "grad_norm": 2.0848498344421387, + "learning_rate": 4.984738129092364e-05, + "loss": 5.3334, + "step": 5918 + }, + { + "epoch": 0.03520196974022267, + "grad_norm": 2.000460386276245, + "learning_rate": 4.984732975262147e-05, + "loss": 5.4411, + "step": 5919 + }, + { + "epoch": 0.03520791702350366, + "grad_norm": 1.676957607269287, + "learning_rate": 4.9847278205645355e-05, + "loss": 5.47, + "step": 5920 + }, + { + "epoch": 0.03521386430678466, + "grad_norm": 1.911482334136963, + "learning_rate": 4.984722664999531e-05, + "loss": 5.5736, + "step": 5921 + }, + { + "epoch": 0.03521981159006566, + "grad_norm": 1.9573029279708862, + "learning_rate": 4.9847175085671356e-05, + "loss": 5.5509, + "step": 5922 + }, + { + "epoch": 0.035225758873346655, + "grad_norm": 1.8878334760665894, + "learning_rate": 4.984712351267351e-05, + "loss": 5.6437, + "step": 5923 + }, + { + "epoch": 0.03523170615662765, + "grad_norm": 1.9107712507247925, + "learning_rate": 4.984707193100179e-05, + "loss": 5.4471, + "step": 5924 + }, + { + "epoch": 0.03523765343990865, + "grad_norm": 1.7408612966537476, + "learning_rate": 4.9847020340656215e-05, + "loss": 5.3706, + "step": 5925 + }, + { + "epoch": 0.03524360072318965, + "grad_norm": 1.9594995975494385, + "learning_rate": 4.98469687416368e-05, + "loss": 5.4113, + "step": 5926 + }, + { + "epoch": 0.03524954800647064, + "grad_norm": 1.8772166967391968, + "learning_rate": 4.984691713394356e-05, + "loss": 5.368, + "step": 5927 + }, + { + "epoch": 0.035255495289751644, + "grad_norm": 2.1143953800201416, + "learning_rate": 4.9846865517576524e-05, + "loss": 5.3829, + "step": 5928 + }, + { + "epoch": 0.03526144257303264, + "grad_norm": 2.0923383235931396, + "learning_rate": 4.984681389253571e-05, + "loss": 5.9834, + "step": 5929 + }, + { + "epoch": 0.035267389856313634, + "grad_norm": 2.016749620437622, + "learning_rate": 4.984676225882112e-05, + "loss": 5.68, + "step": 5930 + }, + { + "epoch": 0.035273337139594636, + "grad_norm": 1.6040265560150146, + "learning_rate": 4.984671061643279e-05, + "loss": 5.7406, + "step": 5931 + }, + { + "epoch": 0.03527928442287563, + "grad_norm": 2.100774049758911, + "learning_rate": 4.984665896537072e-05, + "loss": 5.5545, + "step": 5932 + }, + { + "epoch": 0.035285231706156626, + "grad_norm": 2.008575439453125, + "learning_rate": 4.984660730563494e-05, + "loss": 5.3769, + "step": 5933 + }, + { + "epoch": 0.03529117898943762, + "grad_norm": 1.9622136354446411, + "learning_rate": 4.984655563722547e-05, + "loss": 5.5792, + "step": 5934 + }, + { + "epoch": 0.035297126272718624, + "grad_norm": 1.764647364616394, + "learning_rate": 4.9846503960142325e-05, + "loss": 5.6543, + "step": 5935 + }, + { + "epoch": 0.03530307355599962, + "grad_norm": 1.6166809797286987, + "learning_rate": 4.984645227438552e-05, + "loss": 5.7948, + "step": 5936 + }, + { + "epoch": 0.035309020839280614, + "grad_norm": 1.7368977069854736, + "learning_rate": 4.9846400579955074e-05, + "loss": 5.6288, + "step": 5937 + }, + { + "epoch": 0.035314968122561616, + "grad_norm": 1.649059772491455, + "learning_rate": 4.984634887685101e-05, + "loss": 5.8538, + "step": 5938 + }, + { + "epoch": 0.03532091540584261, + "grad_norm": 1.6092652082443237, + "learning_rate": 4.984629716507334e-05, + "loss": 5.7077, + "step": 5939 + }, + { + "epoch": 0.035326862689123606, + "grad_norm": 1.76821768283844, + "learning_rate": 4.984624544462209e-05, + "loss": 5.4206, + "step": 5940 + }, + { + "epoch": 0.03533280997240461, + "grad_norm": 1.5885004997253418, + "learning_rate": 4.984619371549727e-05, + "loss": 5.3997, + "step": 5941 + }, + { + "epoch": 0.0353387572556856, + "grad_norm": 1.6730574369430542, + "learning_rate": 4.984614197769889e-05, + "loss": 5.4952, + "step": 5942 + }, + { + "epoch": 0.0353447045389666, + "grad_norm": 1.9951595067977905, + "learning_rate": 4.984609023122699e-05, + "loss": 5.5658, + "step": 5943 + }, + { + "epoch": 0.0353506518222476, + "grad_norm": 1.8277794122695923, + "learning_rate": 4.984603847608157e-05, + "loss": 5.5313, + "step": 5944 + }, + { + "epoch": 0.035356599105528595, + "grad_norm": 1.5988150835037231, + "learning_rate": 4.984598671226266e-05, + "loss": 5.4661, + "step": 5945 + }, + { + "epoch": 0.03536254638880959, + "grad_norm": 1.8313721418380737, + "learning_rate": 4.9845934939770264e-05, + "loss": 5.3005, + "step": 5946 + }, + { + "epoch": 0.03536849367209059, + "grad_norm": 1.8441407680511475, + "learning_rate": 4.984588315860442e-05, + "loss": 5.4564, + "step": 5947 + }, + { + "epoch": 0.03537444095537159, + "grad_norm": 2.8165388107299805, + "learning_rate": 4.9845831368765126e-05, + "loss": 5.4582, + "step": 5948 + }, + { + "epoch": 0.03538038823865258, + "grad_norm": 1.8860023021697998, + "learning_rate": 4.9845779570252415e-05, + "loss": 5.4952, + "step": 5949 + }, + { + "epoch": 0.03538633552193358, + "grad_norm": 1.7752633094787598, + "learning_rate": 4.98457277630663e-05, + "loss": 5.4301, + "step": 5950 + }, + { + "epoch": 0.03539228280521458, + "grad_norm": 1.9038548469543457, + "learning_rate": 4.984567594720679e-05, + "loss": 5.2591, + "step": 5951 + }, + { + "epoch": 0.035398230088495575, + "grad_norm": 2.6449787616729736, + "learning_rate": 4.984562412267392e-05, + "loss": 5.9317, + "step": 5952 + }, + { + "epoch": 0.03540417737177657, + "grad_norm": 1.95949125289917, + "learning_rate": 4.98455722894677e-05, + "loss": 5.4686, + "step": 5953 + }, + { + "epoch": 0.03541012465505757, + "grad_norm": 2.0208640098571777, + "learning_rate": 4.984552044758814e-05, + "loss": 5.6361, + "step": 5954 + }, + { + "epoch": 0.03541607193833857, + "grad_norm": 2.2328197956085205, + "learning_rate": 4.9845468597035274e-05, + "loss": 5.455, + "step": 5955 + }, + { + "epoch": 0.03542201922161956, + "grad_norm": 2.115952968597412, + "learning_rate": 4.9845416737809105e-05, + "loss": 5.3275, + "step": 5956 + }, + { + "epoch": 0.035427966504900564, + "grad_norm": 2.023791790008545, + "learning_rate": 4.984536486990966e-05, + "loss": 5.3135, + "step": 5957 + }, + { + "epoch": 0.03543391378818156, + "grad_norm": 1.9721077680587769, + "learning_rate": 4.9845312993336945e-05, + "loss": 5.3429, + "step": 5958 + }, + { + "epoch": 0.035439861071462554, + "grad_norm": 2.047588586807251, + "learning_rate": 4.9845261108091e-05, + "loss": 5.4027, + "step": 5959 + }, + { + "epoch": 0.035445808354743556, + "grad_norm": 1.9019498825073242, + "learning_rate": 4.9845209214171826e-05, + "loss": 5.3867, + "step": 5960 + }, + { + "epoch": 0.03545175563802455, + "grad_norm": 1.9442843198776245, + "learning_rate": 4.984515731157945e-05, + "loss": 5.3189, + "step": 5961 + }, + { + "epoch": 0.035457702921305546, + "grad_norm": 2.051422357559204, + "learning_rate": 4.9845105400313885e-05, + "loss": 5.5713, + "step": 5962 + }, + { + "epoch": 0.03546365020458654, + "grad_norm": 1.811908483505249, + "learning_rate": 4.9845053480375145e-05, + "loss": 5.6221, + "step": 5963 + }, + { + "epoch": 0.035469597487867544, + "grad_norm": 2.017991542816162, + "learning_rate": 4.984500155176326e-05, + "loss": 5.2774, + "step": 5964 + }, + { + "epoch": 0.03547554477114854, + "grad_norm": 1.972644329071045, + "learning_rate": 4.9844949614478244e-05, + "loss": 5.3208, + "step": 5965 + }, + { + "epoch": 0.035481492054429534, + "grad_norm": 1.9937026500701904, + "learning_rate": 4.984489766852011e-05, + "loss": 5.455, + "step": 5966 + }, + { + "epoch": 0.035487439337710536, + "grad_norm": 1.7297019958496094, + "learning_rate": 4.984484571388887e-05, + "loss": 5.3829, + "step": 5967 + }, + { + "epoch": 0.03549338662099153, + "grad_norm": 1.6428204774856567, + "learning_rate": 4.984479375058456e-05, + "loss": 5.3638, + "step": 5968 + }, + { + "epoch": 0.035499333904272526, + "grad_norm": 1.9522719383239746, + "learning_rate": 4.9844741778607186e-05, + "loss": 5.3379, + "step": 5969 + }, + { + "epoch": 0.03550528118755353, + "grad_norm": 2.0280921459198, + "learning_rate": 4.984468979795677e-05, + "loss": 5.4366, + "step": 5970 + }, + { + "epoch": 0.03551122847083452, + "grad_norm": 2.0396251678466797, + "learning_rate": 4.9844637808633334e-05, + "loss": 5.5681, + "step": 5971 + }, + { + "epoch": 0.03551717575411552, + "grad_norm": 1.5256271362304688, + "learning_rate": 4.984458581063689e-05, + "loss": 5.602, + "step": 5972 + }, + { + "epoch": 0.03552312303739652, + "grad_norm": 1.8829892873764038, + "learning_rate": 4.984453380396745e-05, + "loss": 5.3851, + "step": 5973 + }, + { + "epoch": 0.035529070320677515, + "grad_norm": 2.047106981277466, + "learning_rate": 4.984448178862505e-05, + "loss": 5.3724, + "step": 5974 + }, + { + "epoch": 0.03553501760395851, + "grad_norm": 2.066572904586792, + "learning_rate": 4.984442976460969e-05, + "loss": 5.3352, + "step": 5975 + }, + { + "epoch": 0.03554096488723951, + "grad_norm": 1.9785430431365967, + "learning_rate": 4.98443777319214e-05, + "loss": 5.2641, + "step": 5976 + }, + { + "epoch": 0.03554691217052051, + "grad_norm": 1.8999443054199219, + "learning_rate": 4.98443256905602e-05, + "loss": 5.3402, + "step": 5977 + }, + { + "epoch": 0.0355528594538015, + "grad_norm": 1.8599263429641724, + "learning_rate": 4.98442736405261e-05, + "loss": 5.2612, + "step": 5978 + }, + { + "epoch": 0.0355588067370825, + "grad_norm": 1.7216875553131104, + "learning_rate": 4.984422158181911e-05, + "loss": 5.4041, + "step": 5979 + }, + { + "epoch": 0.0355647540203635, + "grad_norm": 2.0259687900543213, + "learning_rate": 4.984416951443926e-05, + "loss": 5.4895, + "step": 5980 + }, + { + "epoch": 0.035570701303644495, + "grad_norm": 1.705736756324768, + "learning_rate": 4.9844117438386583e-05, + "loss": 5.5845, + "step": 5981 + }, + { + "epoch": 0.03557664858692549, + "grad_norm": 1.9546462297439575, + "learning_rate": 4.9844065353661074e-05, + "loss": 5.6803, + "step": 5982 + }, + { + "epoch": 0.03558259587020649, + "grad_norm": 1.829689383506775, + "learning_rate": 4.984401326026275e-05, + "loss": 5.5816, + "step": 5983 + }, + { + "epoch": 0.03558854315348749, + "grad_norm": 1.6464663743972778, + "learning_rate": 4.984396115819164e-05, + "loss": 5.5738, + "step": 5984 + }, + { + "epoch": 0.03559449043676848, + "grad_norm": 1.7786076068878174, + "learning_rate": 4.984390904744777e-05, + "loss": 5.3667, + "step": 5985 + }, + { + "epoch": 0.035600437720049484, + "grad_norm": 2.210754871368408, + "learning_rate": 4.984385692803114e-05, + "loss": 5.5259, + "step": 5986 + }, + { + "epoch": 0.03560638500333048, + "grad_norm": 1.7361842393875122, + "learning_rate": 4.984380479994179e-05, + "loss": 5.6108, + "step": 5987 + }, + { + "epoch": 0.035612332286611474, + "grad_norm": 1.926477313041687, + "learning_rate": 4.9843752663179703e-05, + "loss": 5.593, + "step": 5988 + }, + { + "epoch": 0.035618279569892476, + "grad_norm": 1.6683733463287354, + "learning_rate": 4.984370051774493e-05, + "loss": 5.6305, + "step": 5989 + }, + { + "epoch": 0.03562422685317347, + "grad_norm": 1.790499210357666, + "learning_rate": 4.9843648363637475e-05, + "loss": 5.596, + "step": 5990 + }, + { + "epoch": 0.035630174136454466, + "grad_norm": 1.8355207443237305, + "learning_rate": 4.984359620085736e-05, + "loss": 5.5818, + "step": 5991 + }, + { + "epoch": 0.03563612141973546, + "grad_norm": 1.9352680444717407, + "learning_rate": 4.98435440294046e-05, + "loss": 5.187, + "step": 5992 + }, + { + "epoch": 0.03564206870301646, + "grad_norm": 2.063159465789795, + "learning_rate": 4.9843491849279225e-05, + "loss": 5.3245, + "step": 5993 + }, + { + "epoch": 0.03564801598629746, + "grad_norm": 1.6848958730697632, + "learning_rate": 4.984343966048123e-05, + "loss": 5.4454, + "step": 5994 + }, + { + "epoch": 0.035653963269578454, + "grad_norm": 2.1244423389434814, + "learning_rate": 4.9843387463010654e-05, + "loss": 5.5018, + "step": 5995 + }, + { + "epoch": 0.035659910552859456, + "grad_norm": 1.9100427627563477, + "learning_rate": 4.9843335256867505e-05, + "loss": 5.5597, + "step": 5996 + }, + { + "epoch": 0.03566585783614045, + "grad_norm": 1.9130252599716187, + "learning_rate": 4.984328304205181e-05, + "loss": 5.4538, + "step": 5997 + }, + { + "epoch": 0.035671805119421446, + "grad_norm": 1.6285213232040405, + "learning_rate": 4.984323081856358e-05, + "loss": 5.7361, + "step": 5998 + }, + { + "epoch": 0.03567775240270245, + "grad_norm": 1.6690980195999146, + "learning_rate": 4.984317858640283e-05, + "loss": 5.7537, + "step": 5999 + }, + { + "epoch": 0.03568369968598344, + "grad_norm": 1.5258572101593018, + "learning_rate": 4.984312634556959e-05, + "loss": 5.7419, + "step": 6000 + }, + { + "epoch": 0.03568964696926444, + "grad_norm": 1.9586881399154663, + "learning_rate": 4.984307409606386e-05, + "loss": 5.4449, + "step": 6001 + }, + { + "epoch": 0.03569559425254544, + "grad_norm": 2.1795685291290283, + "learning_rate": 4.9843021837885684e-05, + "loss": 5.3833, + "step": 6002 + }, + { + "epoch": 0.035701541535826435, + "grad_norm": 2.1241326332092285, + "learning_rate": 4.984296957103506e-05, + "loss": 5.3064, + "step": 6003 + }, + { + "epoch": 0.03570748881910743, + "grad_norm": 1.9621204137802124, + "learning_rate": 4.9842917295512004e-05, + "loss": 5.3002, + "step": 6004 + }, + { + "epoch": 0.03571343610238843, + "grad_norm": 2.041503429412842, + "learning_rate": 4.984286501131655e-05, + "loss": 5.2885, + "step": 6005 + }, + { + "epoch": 0.03571938338566943, + "grad_norm": 2.1099791526794434, + "learning_rate": 4.984281271844871e-05, + "loss": 5.3038, + "step": 6006 + }, + { + "epoch": 0.03572533066895042, + "grad_norm": 2.0209009647369385, + "learning_rate": 4.98427604169085e-05, + "loss": 5.8373, + "step": 6007 + }, + { + "epoch": 0.03573127795223142, + "grad_norm": 1.7534282207489014, + "learning_rate": 4.9842708106695934e-05, + "loss": 5.6522, + "step": 6008 + }, + { + "epoch": 0.03573722523551242, + "grad_norm": 2.3014237880706787, + "learning_rate": 4.984265578781104e-05, + "loss": 5.462, + "step": 6009 + }, + { + "epoch": 0.035743172518793415, + "grad_norm": 2.123767614364624, + "learning_rate": 4.984260346025382e-05, + "loss": 5.3901, + "step": 6010 + }, + { + "epoch": 0.03574911980207441, + "grad_norm": 2.4190175533294678, + "learning_rate": 4.9842551124024315e-05, + "loss": 5.1526, + "step": 6011 + }, + { + "epoch": 0.03575506708535541, + "grad_norm": 1.9972834587097168, + "learning_rate": 4.984249877912254e-05, + "loss": 5.2987, + "step": 6012 + }, + { + "epoch": 0.03576101436863641, + "grad_norm": 2.002969980239868, + "learning_rate": 4.9842446425548494e-05, + "loss": 5.5244, + "step": 6013 + }, + { + "epoch": 0.0357669616519174, + "grad_norm": 2.8208391666412354, + "learning_rate": 4.984239406330221e-05, + "loss": 5.834, + "step": 6014 + }, + { + "epoch": 0.035772908935198404, + "grad_norm": 2.409303665161133, + "learning_rate": 4.98423416923837e-05, + "loss": 5.1709, + "step": 6015 + }, + { + "epoch": 0.0357788562184794, + "grad_norm": 2.215888500213623, + "learning_rate": 4.984228931279298e-05, + "loss": 5.38, + "step": 6016 + }, + { + "epoch": 0.035784803501760394, + "grad_norm": 1.9130421876907349, + "learning_rate": 4.9842236924530086e-05, + "loss": 5.4551, + "step": 6017 + }, + { + "epoch": 0.035790750785041396, + "grad_norm": 1.8963314294815063, + "learning_rate": 4.9842184527595015e-05, + "loss": 5.3512, + "step": 6018 + }, + { + "epoch": 0.03579669806832239, + "grad_norm": 2.0085666179656982, + "learning_rate": 4.98421321219878e-05, + "loss": 5.3013, + "step": 6019 + }, + { + "epoch": 0.035802645351603386, + "grad_norm": 2.1059834957122803, + "learning_rate": 4.9842079707708446e-05, + "loss": 5.4052, + "step": 6020 + }, + { + "epoch": 0.03580859263488438, + "grad_norm": 1.965694785118103, + "learning_rate": 4.984202728475699e-05, + "loss": 5.5392, + "step": 6021 + }, + { + "epoch": 0.03581453991816538, + "grad_norm": 1.9495680332183838, + "learning_rate": 4.9841974853133425e-05, + "loss": 5.309, + "step": 6022 + }, + { + "epoch": 0.03582048720144638, + "grad_norm": 1.9762555360794067, + "learning_rate": 4.9841922412837795e-05, + "loss": 5.3979, + "step": 6023 + }, + { + "epoch": 0.035826434484727374, + "grad_norm": 1.7825839519500732, + "learning_rate": 4.98418699638701e-05, + "loss": 5.3502, + "step": 6024 + }, + { + "epoch": 0.035832381768008376, + "grad_norm": 1.9636192321777344, + "learning_rate": 4.984181750623037e-05, + "loss": 5.6341, + "step": 6025 + }, + { + "epoch": 0.03583832905128937, + "grad_norm": 1.833883285522461, + "learning_rate": 4.984176503991861e-05, + "loss": 5.5861, + "step": 6026 + }, + { + "epoch": 0.035844276334570366, + "grad_norm": 1.91568124294281, + "learning_rate": 4.984171256493485e-05, + "loss": 5.591, + "step": 6027 + }, + { + "epoch": 0.03585022361785137, + "grad_norm": 2.153472423553467, + "learning_rate": 4.9841660081279105e-05, + "loss": 5.3463, + "step": 6028 + }, + { + "epoch": 0.03585617090113236, + "grad_norm": 1.8164830207824707, + "learning_rate": 4.984160758895139e-05, + "loss": 5.4886, + "step": 6029 + }, + { + "epoch": 0.03586211818441336, + "grad_norm": 2.0216922760009766, + "learning_rate": 4.984155508795174e-05, + "loss": 5.5777, + "step": 6030 + }, + { + "epoch": 0.03586806546769436, + "grad_norm": 1.966779351234436, + "learning_rate": 4.984150257828014e-05, + "loss": 5.1867, + "step": 6031 + }, + { + "epoch": 0.035874012750975355, + "grad_norm": 2.091109275817871, + "learning_rate": 4.9841450059936645e-05, + "loss": 5.5302, + "step": 6032 + }, + { + "epoch": 0.03587996003425635, + "grad_norm": 1.8772802352905273, + "learning_rate": 4.984139753292125e-05, + "loss": 5.2904, + "step": 6033 + }, + { + "epoch": 0.03588590731753735, + "grad_norm": 2.049431800842285, + "learning_rate": 4.984134499723397e-05, + "loss": 5.293, + "step": 6034 + }, + { + "epoch": 0.03589185460081835, + "grad_norm": 2.0902609825134277, + "learning_rate": 4.984129245287485e-05, + "loss": 5.2689, + "step": 6035 + }, + { + "epoch": 0.03589780188409934, + "grad_norm": 1.91702139377594, + "learning_rate": 4.9841239899843886e-05, + "loss": 5.255, + "step": 6036 + }, + { + "epoch": 0.03590374916738034, + "grad_norm": 1.7073708772659302, + "learning_rate": 4.984118733814109e-05, + "loss": 5.3272, + "step": 6037 + }, + { + "epoch": 0.03590969645066134, + "grad_norm": 1.625712275505066, + "learning_rate": 4.9841134767766506e-05, + "loss": 5.5366, + "step": 6038 + }, + { + "epoch": 0.035915643733942335, + "grad_norm": 1.8465087413787842, + "learning_rate": 4.984108218872014e-05, + "loss": 5.3373, + "step": 6039 + }, + { + "epoch": 0.03592159101722333, + "grad_norm": 2.2392280101776123, + "learning_rate": 4.9841029601002e-05, + "loss": 5.5898, + "step": 6040 + }, + { + "epoch": 0.03592753830050433, + "grad_norm": 2.6571459770202637, + "learning_rate": 4.984097700461212e-05, + "loss": 5.963, + "step": 6041 + }, + { + "epoch": 0.03593348558378533, + "grad_norm": 2.7220845222473145, + "learning_rate": 4.98409243995505e-05, + "loss": 5.6997, + "step": 6042 + }, + { + "epoch": 0.03593943286706632, + "grad_norm": 2.430968999862671, + "learning_rate": 4.9840871785817185e-05, + "loss": 5.2949, + "step": 6043 + }, + { + "epoch": 0.035945380150347324, + "grad_norm": 2.3006606101989746, + "learning_rate": 4.984081916341217e-05, + "loss": 5.2045, + "step": 6044 + }, + { + "epoch": 0.03595132743362832, + "grad_norm": 2.2382659912109375, + "learning_rate": 4.984076653233548e-05, + "loss": 5.417, + "step": 6045 + }, + { + "epoch": 0.035957274716909314, + "grad_norm": 2.1896233558654785, + "learning_rate": 4.9840713892587146e-05, + "loss": 5.7215, + "step": 6046 + }, + { + "epoch": 0.035963222000190316, + "grad_norm": 1.8175956010818481, + "learning_rate": 4.9840661244167166e-05, + "loss": 5.569, + "step": 6047 + }, + { + "epoch": 0.03596916928347131, + "grad_norm": 2.066828727722168, + "learning_rate": 4.984060858707557e-05, + "loss": 5.6285, + "step": 6048 + }, + { + "epoch": 0.035975116566752306, + "grad_norm": 2.246291160583496, + "learning_rate": 4.984055592131237e-05, + "loss": 5.5583, + "step": 6049 + }, + { + "epoch": 0.0359810638500333, + "grad_norm": 2.2394871711730957, + "learning_rate": 4.984050324687759e-05, + "loss": 5.3917, + "step": 6050 + }, + { + "epoch": 0.0359870111333143, + "grad_norm": 2.5051162242889404, + "learning_rate": 4.984045056377125e-05, + "loss": 5.6955, + "step": 6051 + }, + { + "epoch": 0.0359929584165953, + "grad_norm": 2.1360414028167725, + "learning_rate": 4.984039787199336e-05, + "loss": 5.5451, + "step": 6052 + }, + { + "epoch": 0.035998905699876294, + "grad_norm": 2.0267562866210938, + "learning_rate": 4.984034517154395e-05, + "loss": 5.4559, + "step": 6053 + }, + { + "epoch": 0.036004852983157296, + "grad_norm": 1.7683112621307373, + "learning_rate": 4.984029246242303e-05, + "loss": 5.4663, + "step": 6054 + }, + { + "epoch": 0.03601080026643829, + "grad_norm": 2.0600638389587402, + "learning_rate": 4.9840239744630626e-05, + "loss": 5.5081, + "step": 6055 + }, + { + "epoch": 0.036016747549719286, + "grad_norm": 2.093698740005493, + "learning_rate": 4.984018701816674e-05, + "loss": 5.5435, + "step": 6056 + }, + { + "epoch": 0.03602269483300029, + "grad_norm": 2.217721462249756, + "learning_rate": 4.984013428303141e-05, + "loss": 5.7482, + "step": 6057 + }, + { + "epoch": 0.03602864211628128, + "grad_norm": 1.9680962562561035, + "learning_rate": 4.9840081539224636e-05, + "loss": 5.9722, + "step": 6058 + }, + { + "epoch": 0.03603458939956228, + "grad_norm": 1.8606425523757935, + "learning_rate": 4.9840028786746455e-05, + "loss": 5.8379, + "step": 6059 + }, + { + "epoch": 0.03604053668284328, + "grad_norm": 2.0129475593566895, + "learning_rate": 4.983997602559688e-05, + "loss": 5.7199, + "step": 6060 + }, + { + "epoch": 0.036046483966124275, + "grad_norm": 1.9370187520980835, + "learning_rate": 4.9839923255775917e-05, + "loss": 5.3563, + "step": 6061 + }, + { + "epoch": 0.03605243124940527, + "grad_norm": 1.775894284248352, + "learning_rate": 4.983987047728359e-05, + "loss": 5.5201, + "step": 6062 + }, + { + "epoch": 0.03605837853268627, + "grad_norm": 1.9943023920059204, + "learning_rate": 4.9839817690119934e-05, + "loss": 5.4034, + "step": 6063 + }, + { + "epoch": 0.03606432581596727, + "grad_norm": 1.9605768918991089, + "learning_rate": 4.983976489428494e-05, + "loss": 5.5314, + "step": 6064 + }, + { + "epoch": 0.03607027309924826, + "grad_norm": 1.7820254564285278, + "learning_rate": 4.983971208977866e-05, + "loss": 5.6131, + "step": 6065 + }, + { + "epoch": 0.03607622038252926, + "grad_norm": 2.010796070098877, + "learning_rate": 4.983965927660108e-05, + "loss": 5.5114, + "step": 6066 + }, + { + "epoch": 0.03608216766581026, + "grad_norm": 1.8461687564849854, + "learning_rate": 4.983960645475223e-05, + "loss": 5.4752, + "step": 6067 + }, + { + "epoch": 0.036088114949091255, + "grad_norm": 2.048119068145752, + "learning_rate": 4.983955362423214e-05, + "loss": 5.3325, + "step": 6068 + }, + { + "epoch": 0.03609406223237225, + "grad_norm": 2.021646499633789, + "learning_rate": 4.9839500785040804e-05, + "loss": 5.2238, + "step": 6069 + }, + { + "epoch": 0.03610000951565325, + "grad_norm": 1.9979503154754639, + "learning_rate": 4.9839447937178264e-05, + "loss": 5.4054, + "step": 6070 + }, + { + "epoch": 0.03610595679893425, + "grad_norm": 1.980776071548462, + "learning_rate": 4.983939508064453e-05, + "loss": 5.4094, + "step": 6071 + }, + { + "epoch": 0.03611190408221524, + "grad_norm": 1.8364293575286865, + "learning_rate": 4.9839342215439615e-05, + "loss": 5.4372, + "step": 6072 + }, + { + "epoch": 0.036117851365496244, + "grad_norm": 1.8870443105697632, + "learning_rate": 4.983928934156354e-05, + "loss": 5.4075, + "step": 6073 + }, + { + "epoch": 0.03612379864877724, + "grad_norm": 2.176180124282837, + "learning_rate": 4.9839236459016337e-05, + "loss": 5.4302, + "step": 6074 + }, + { + "epoch": 0.036129745932058234, + "grad_norm": 2.054960012435913, + "learning_rate": 4.983918356779801e-05, + "loss": 5.3796, + "step": 6075 + }, + { + "epoch": 0.036135693215339236, + "grad_norm": 2.2146401405334473, + "learning_rate": 4.9839130667908576e-05, + "loss": 5.651, + "step": 6076 + }, + { + "epoch": 0.03614164049862023, + "grad_norm": 1.908640742301941, + "learning_rate": 4.983907775934806e-05, + "loss": 5.3002, + "step": 6077 + }, + { + "epoch": 0.036147587781901226, + "grad_norm": 1.9364973306655884, + "learning_rate": 4.983902484211648e-05, + "loss": 5.2299, + "step": 6078 + }, + { + "epoch": 0.03615353506518223, + "grad_norm": 1.7405542135238647, + "learning_rate": 4.983897191621385e-05, + "loss": 5.268, + "step": 6079 + }, + { + "epoch": 0.03615948234846322, + "grad_norm": 2.0347912311553955, + "learning_rate": 4.9838918981640195e-05, + "loss": 5.4887, + "step": 6080 + }, + { + "epoch": 0.03616542963174422, + "grad_norm": 2.0755162239074707, + "learning_rate": 4.9838866038395524e-05, + "loss": 5.2208, + "step": 6081 + }, + { + "epoch": 0.03617137691502521, + "grad_norm": 1.9119634628295898, + "learning_rate": 4.9838813086479865e-05, + "loss": 5.2659, + "step": 6082 + }, + { + "epoch": 0.036177324198306215, + "grad_norm": 1.9172658920288086, + "learning_rate": 4.983876012589324e-05, + "loss": 5.4098, + "step": 6083 + }, + { + "epoch": 0.03618327148158721, + "grad_norm": 2.09004545211792, + "learning_rate": 4.983870715663565e-05, + "loss": 5.5866, + "step": 6084 + }, + { + "epoch": 0.036189218764868206, + "grad_norm": 2.0952436923980713, + "learning_rate": 4.983865417870712e-05, + "loss": 5.5288, + "step": 6085 + }, + { + "epoch": 0.03619516604814921, + "grad_norm": 1.8599412441253662, + "learning_rate": 4.9838601192107686e-05, + "loss": 5.7538, + "step": 6086 + }, + { + "epoch": 0.0362011133314302, + "grad_norm": 1.8318936824798584, + "learning_rate": 4.983854819683735e-05, + "loss": 5.9613, + "step": 6087 + }, + { + "epoch": 0.0362070606147112, + "grad_norm": 1.8312503099441528, + "learning_rate": 4.983849519289613e-05, + "loss": 5.2749, + "step": 6088 + }, + { + "epoch": 0.0362130078979922, + "grad_norm": 2.157576560974121, + "learning_rate": 4.983844218028405e-05, + "loss": 5.2826, + "step": 6089 + }, + { + "epoch": 0.036218955181273195, + "grad_norm": 2.1377198696136475, + "learning_rate": 4.983838915900112e-05, + "loss": 5.2843, + "step": 6090 + }, + { + "epoch": 0.03622490246455419, + "grad_norm": 2.0167126655578613, + "learning_rate": 4.983833612904737e-05, + "loss": 5.4713, + "step": 6091 + }, + { + "epoch": 0.03623084974783519, + "grad_norm": 1.748759388923645, + "learning_rate": 4.9838283090422814e-05, + "loss": 5.3685, + "step": 6092 + }, + { + "epoch": 0.03623679703111619, + "grad_norm": 2.0344316959381104, + "learning_rate": 4.983823004312747e-05, + "loss": 5.1093, + "step": 6093 + }, + { + "epoch": 0.03624274431439718, + "grad_norm": 1.9061161279678345, + "learning_rate": 4.9838176987161356e-05, + "loss": 5.2035, + "step": 6094 + }, + { + "epoch": 0.03624869159767818, + "grad_norm": 1.9090344905853271, + "learning_rate": 4.983812392252449e-05, + "loss": 5.3863, + "step": 6095 + }, + { + "epoch": 0.03625463888095918, + "grad_norm": 1.9536118507385254, + "learning_rate": 4.9838070849216894e-05, + "loss": 5.5349, + "step": 6096 + }, + { + "epoch": 0.036260586164240174, + "grad_norm": 1.89446222782135, + "learning_rate": 4.983801776723858e-05, + "loss": 5.7098, + "step": 6097 + }, + { + "epoch": 0.03626653344752117, + "grad_norm": 1.6403870582580566, + "learning_rate": 4.983796467658958e-05, + "loss": 5.6726, + "step": 6098 + }, + { + "epoch": 0.03627248073080217, + "grad_norm": 1.7792481184005737, + "learning_rate": 4.983791157726989e-05, + "loss": 5.6761, + "step": 6099 + }, + { + "epoch": 0.03627842801408317, + "grad_norm": 1.5190175771713257, + "learning_rate": 4.9837858469279554e-05, + "loss": 5.6576, + "step": 6100 + }, + { + "epoch": 0.03628437529736416, + "grad_norm": 1.9885895252227783, + "learning_rate": 4.983780535261857e-05, + "loss": 5.5944, + "step": 6101 + }, + { + "epoch": 0.036290322580645164, + "grad_norm": 1.771620750427246, + "learning_rate": 4.983775222728697e-05, + "loss": 5.7949, + "step": 6102 + }, + { + "epoch": 0.03629626986392616, + "grad_norm": 1.684471845626831, + "learning_rate": 4.9837699093284765e-05, + "loss": 5.5435, + "step": 6103 + }, + { + "epoch": 0.036302217147207154, + "grad_norm": 1.8454065322875977, + "learning_rate": 4.9837645950611966e-05, + "loss": 5.4526, + "step": 6104 + }, + { + "epoch": 0.036308164430488156, + "grad_norm": 1.6522735357284546, + "learning_rate": 4.983759279926862e-05, + "loss": 5.7302, + "step": 6105 + }, + { + "epoch": 0.03631411171376915, + "grad_norm": 1.8691065311431885, + "learning_rate": 4.9837539639254713e-05, + "loss": 5.6494, + "step": 6106 + }, + { + "epoch": 0.036320058997050146, + "grad_norm": 1.9420015811920166, + "learning_rate": 4.9837486470570286e-05, + "loss": 5.77, + "step": 6107 + }, + { + "epoch": 0.03632600628033115, + "grad_norm": 1.8399784564971924, + "learning_rate": 4.9837433293215344e-05, + "loss": 5.6669, + "step": 6108 + }, + { + "epoch": 0.03633195356361214, + "grad_norm": 1.799460530281067, + "learning_rate": 4.983738010718991e-05, + "loss": 5.5557, + "step": 6109 + }, + { + "epoch": 0.03633790084689314, + "grad_norm": 1.8826879262924194, + "learning_rate": 4.9837326912494e-05, + "loss": 5.4865, + "step": 6110 + }, + { + "epoch": 0.03634384813017413, + "grad_norm": 1.9582240581512451, + "learning_rate": 4.983727370912764e-05, + "loss": 5.5882, + "step": 6111 + }, + { + "epoch": 0.036349795413455135, + "grad_norm": 2.011892795562744, + "learning_rate": 4.9837220497090846e-05, + "loss": 5.4932, + "step": 6112 + }, + { + "epoch": 0.03635574269673613, + "grad_norm": 1.7751367092132568, + "learning_rate": 4.983716727638363e-05, + "loss": 5.4981, + "step": 6113 + }, + { + "epoch": 0.036361689980017126, + "grad_norm": 1.984121322631836, + "learning_rate": 4.983711404700603e-05, + "loss": 5.4801, + "step": 6114 + }, + { + "epoch": 0.03636763726329813, + "grad_norm": 1.9601882696151733, + "learning_rate": 4.983706080895804e-05, + "loss": 5.218, + "step": 6115 + }, + { + "epoch": 0.03637358454657912, + "grad_norm": 1.800227165222168, + "learning_rate": 4.9837007562239684e-05, + "loss": 5.5178, + "step": 6116 + }, + { + "epoch": 0.03637953182986012, + "grad_norm": 1.9257889986038208, + "learning_rate": 4.983695430685099e-05, + "loss": 5.6695, + "step": 6117 + }, + { + "epoch": 0.03638547911314112, + "grad_norm": 1.8011913299560547, + "learning_rate": 4.9836901042791976e-05, + "loss": 5.7478, + "step": 6118 + }, + { + "epoch": 0.036391426396422115, + "grad_norm": 1.8668690919876099, + "learning_rate": 4.983684777006264e-05, + "loss": 5.7027, + "step": 6119 + }, + { + "epoch": 0.03639737367970311, + "grad_norm": 1.898126244544983, + "learning_rate": 4.983679448866304e-05, + "loss": 5.5206, + "step": 6120 + }, + { + "epoch": 0.03640332096298411, + "grad_norm": 1.8264409303665161, + "learning_rate": 4.983674119859316e-05, + "loss": 5.4686, + "step": 6121 + }, + { + "epoch": 0.03640926824626511, + "grad_norm": 1.8090230226516724, + "learning_rate": 4.983668789985303e-05, + "loss": 5.4761, + "step": 6122 + }, + { + "epoch": 0.0364152155295461, + "grad_norm": 1.8193403482437134, + "learning_rate": 4.983663459244266e-05, + "loss": 5.3443, + "step": 6123 + }, + { + "epoch": 0.0364211628128271, + "grad_norm": 1.8199255466461182, + "learning_rate": 4.9836581276362095e-05, + "loss": 5.427, + "step": 6124 + }, + { + "epoch": 0.0364271100961081, + "grad_norm": 1.72145414352417, + "learning_rate": 4.9836527951611325e-05, + "loss": 5.4372, + "step": 6125 + }, + { + "epoch": 0.036433057379389094, + "grad_norm": 1.8164423704147339, + "learning_rate": 4.9836474618190386e-05, + "loss": 5.4702, + "step": 6126 + }, + { + "epoch": 0.03643900466267009, + "grad_norm": 1.897775650024414, + "learning_rate": 4.9836421276099287e-05, + "loss": 5.4259, + "step": 6127 + }, + { + "epoch": 0.03644495194595109, + "grad_norm": 1.851101279258728, + "learning_rate": 4.9836367925338046e-05, + "loss": 5.3837, + "step": 6128 + }, + { + "epoch": 0.03645089922923209, + "grad_norm": 1.749374508857727, + "learning_rate": 4.98363145659067e-05, + "loss": 5.3232, + "step": 6129 + }, + { + "epoch": 0.03645684651251308, + "grad_norm": 1.95986008644104, + "learning_rate": 4.9836261197805235e-05, + "loss": 5.2692, + "step": 6130 + }, + { + "epoch": 0.036462793795794084, + "grad_norm": 1.7947750091552734, + "learning_rate": 4.98362078210337e-05, + "loss": 5.409, + "step": 6131 + }, + { + "epoch": 0.03646874107907508, + "grad_norm": 2.119044303894043, + "learning_rate": 4.983615443559209e-05, + "loss": 5.5924, + "step": 6132 + }, + { + "epoch": 0.036474688362356074, + "grad_norm": 1.7285267114639282, + "learning_rate": 4.983610104148044e-05, + "loss": 5.6955, + "step": 6133 + }, + { + "epoch": 0.036480635645637076, + "grad_norm": 2.1711652278900146, + "learning_rate": 4.983604763869877e-05, + "loss": 5.1941, + "step": 6134 + }, + { + "epoch": 0.03648658292891807, + "grad_norm": 2.060039758682251, + "learning_rate": 4.983599422724709e-05, + "loss": 5.5131, + "step": 6135 + }, + { + "epoch": 0.036492530212199066, + "grad_norm": 1.6212393045425415, + "learning_rate": 4.9835940807125415e-05, + "loss": 5.4856, + "step": 6136 + }, + { + "epoch": 0.03649847749548007, + "grad_norm": 1.7602918148040771, + "learning_rate": 4.983588737833378e-05, + "loss": 5.4177, + "step": 6137 + }, + { + "epoch": 0.03650442477876106, + "grad_norm": 2.660930633544922, + "learning_rate": 4.983583394087218e-05, + "loss": 5.5879, + "step": 6138 + }, + { + "epoch": 0.03651037206204206, + "grad_norm": 2.3608336448669434, + "learning_rate": 4.9835780494740655e-05, + "loss": 5.3894, + "step": 6139 + }, + { + "epoch": 0.03651631934532305, + "grad_norm": 2.071632146835327, + "learning_rate": 4.983572703993922e-05, + "loss": 5.6185, + "step": 6140 + }, + { + "epoch": 0.036522266628604055, + "grad_norm": 1.7023842334747314, + "learning_rate": 4.983567357646788e-05, + "loss": 5.5648, + "step": 6141 + }, + { + "epoch": 0.03652821391188505, + "grad_norm": 2.2168798446655273, + "learning_rate": 4.983562010432667e-05, + "loss": 5.4578, + "step": 6142 + }, + { + "epoch": 0.036534161195166046, + "grad_norm": 2.0916104316711426, + "learning_rate": 4.98355666235156e-05, + "loss": 5.4977, + "step": 6143 + }, + { + "epoch": 0.03654010847844705, + "grad_norm": 1.7101606130599976, + "learning_rate": 4.9835513134034686e-05, + "loss": 5.4081, + "step": 6144 + }, + { + "epoch": 0.03654605576172804, + "grad_norm": 1.9058302640914917, + "learning_rate": 4.983545963588395e-05, + "loss": 5.2145, + "step": 6145 + }, + { + "epoch": 0.03655200304500904, + "grad_norm": 2.319023847579956, + "learning_rate": 4.9835406129063424e-05, + "loss": 5.3023, + "step": 6146 + }, + { + "epoch": 0.03655795032829004, + "grad_norm": 2.1135916709899902, + "learning_rate": 4.98353526135731e-05, + "loss": 5.4796, + "step": 6147 + }, + { + "epoch": 0.036563897611571035, + "grad_norm": 2.409088373184204, + "learning_rate": 4.983529908941302e-05, + "loss": 5.3124, + "step": 6148 + }, + { + "epoch": 0.03656984489485203, + "grad_norm": 1.8679871559143066, + "learning_rate": 4.9835245556583185e-05, + "loss": 5.3741, + "step": 6149 + }, + { + "epoch": 0.03657579217813303, + "grad_norm": 1.9335602521896362, + "learning_rate": 4.983519201508363e-05, + "loss": 5.3231, + "step": 6150 + }, + { + "epoch": 0.03658173946141403, + "grad_norm": 2.0352535247802734, + "learning_rate": 4.9835138464914366e-05, + "loss": 5.4643, + "step": 6151 + }, + { + "epoch": 0.03658768674469502, + "grad_norm": 2.4156594276428223, + "learning_rate": 4.983508490607541e-05, + "loss": 5.4092, + "step": 6152 + }, + { + "epoch": 0.03659363402797602, + "grad_norm": 2.1936473846435547, + "learning_rate": 4.983503133856678e-05, + "loss": 5.5093, + "step": 6153 + }, + { + "epoch": 0.03659958131125702, + "grad_norm": 1.6346958875656128, + "learning_rate": 4.98349777623885e-05, + "loss": 5.512, + "step": 6154 + }, + { + "epoch": 0.036605528594538014, + "grad_norm": 1.9810141324996948, + "learning_rate": 4.9834924177540584e-05, + "loss": 5.4981, + "step": 6155 + }, + { + "epoch": 0.03661147587781901, + "grad_norm": 2.1253950595855713, + "learning_rate": 4.9834870584023055e-05, + "loss": 5.4022, + "step": 6156 + }, + { + "epoch": 0.03661742316110001, + "grad_norm": 2.011754274368286, + "learning_rate": 4.9834816981835926e-05, + "loss": 5.6107, + "step": 6157 + }, + { + "epoch": 0.036623370444381007, + "grad_norm": 2.210934638977051, + "learning_rate": 4.983476337097922e-05, + "loss": 5.4348, + "step": 6158 + }, + { + "epoch": 0.036629317727662, + "grad_norm": 2.1351871490478516, + "learning_rate": 4.983470975145296e-05, + "loss": 5.2022, + "step": 6159 + }, + { + "epoch": 0.036635265010943004, + "grad_norm": 2.1564714908599854, + "learning_rate": 4.983465612325715e-05, + "loss": 5.3583, + "step": 6160 + }, + { + "epoch": 0.036641212294224, + "grad_norm": 1.9411755800247192, + "learning_rate": 4.983460248639182e-05, + "loss": 5.4643, + "step": 6161 + }, + { + "epoch": 0.036647159577504994, + "grad_norm": 2.129741907119751, + "learning_rate": 4.983454884085699e-05, + "loss": 5.3834, + "step": 6162 + }, + { + "epoch": 0.036653106860785996, + "grad_norm": 2.12172269821167, + "learning_rate": 4.983449518665268e-05, + "loss": 5.4418, + "step": 6163 + }, + { + "epoch": 0.03665905414406699, + "grad_norm": 2.097452163696289, + "learning_rate": 4.9834441523778893e-05, + "loss": 5.3741, + "step": 6164 + }, + { + "epoch": 0.036665001427347986, + "grad_norm": 2.0458765029907227, + "learning_rate": 4.983438785223567e-05, + "loss": 5.373, + "step": 6165 + }, + { + "epoch": 0.03667094871062899, + "grad_norm": 1.9431376457214355, + "learning_rate": 4.983433417202301e-05, + "loss": 5.4003, + "step": 6166 + }, + { + "epoch": 0.03667689599390998, + "grad_norm": 2.136819362640381, + "learning_rate": 4.983428048314095e-05, + "loss": 5.503, + "step": 6167 + }, + { + "epoch": 0.03668284327719098, + "grad_norm": 1.863153338432312, + "learning_rate": 4.983422678558949e-05, + "loss": 5.4357, + "step": 6168 + }, + { + "epoch": 0.03668879056047197, + "grad_norm": 1.9198437929153442, + "learning_rate": 4.9834173079368665e-05, + "loss": 5.4304, + "step": 6169 + }, + { + "epoch": 0.036694737843752975, + "grad_norm": 1.9080480337142944, + "learning_rate": 4.9834119364478484e-05, + "loss": 5.4329, + "step": 6170 + }, + { + "epoch": 0.03670068512703397, + "grad_norm": 1.9116952419281006, + "learning_rate": 4.983406564091897e-05, + "loss": 5.3248, + "step": 6171 + }, + { + "epoch": 0.036706632410314965, + "grad_norm": 2.007685661315918, + "learning_rate": 4.983401190869014e-05, + "loss": 5.3554, + "step": 6172 + }, + { + "epoch": 0.03671257969359597, + "grad_norm": 1.8134535551071167, + "learning_rate": 4.983395816779201e-05, + "loss": 5.2907, + "step": 6173 + }, + { + "epoch": 0.03671852697687696, + "grad_norm": 2.093061685562134, + "learning_rate": 4.9833904418224606e-05, + "loss": 5.4055, + "step": 6174 + }, + { + "epoch": 0.03672447426015796, + "grad_norm": 2.1263599395751953, + "learning_rate": 4.9833850659987934e-05, + "loss": 5.2758, + "step": 6175 + }, + { + "epoch": 0.03673042154343896, + "grad_norm": 1.9442895650863647, + "learning_rate": 4.983379689308203e-05, + "loss": 5.4183, + "step": 6176 + }, + { + "epoch": 0.036736368826719955, + "grad_norm": 1.9587830305099487, + "learning_rate": 4.98337431175069e-05, + "loss": 5.3624, + "step": 6177 + }, + { + "epoch": 0.03674231611000095, + "grad_norm": 1.9845789670944214, + "learning_rate": 4.9833689333262565e-05, + "loss": 5.3933, + "step": 6178 + }, + { + "epoch": 0.03674826339328195, + "grad_norm": 1.9748643636703491, + "learning_rate": 4.9833635540349055e-05, + "loss": 5.5221, + "step": 6179 + }, + { + "epoch": 0.03675421067656295, + "grad_norm": 1.8139559030532837, + "learning_rate": 4.983358173876638e-05, + "loss": 5.5524, + "step": 6180 + }, + { + "epoch": 0.03676015795984394, + "grad_norm": 1.93784499168396, + "learning_rate": 4.9833527928514546e-05, + "loss": 5.7145, + "step": 6181 + }, + { + "epoch": 0.03676610524312494, + "grad_norm": 1.9064222574234009, + "learning_rate": 4.9833474109593594e-05, + "loss": 5.5283, + "step": 6182 + }, + { + "epoch": 0.03677205252640594, + "grad_norm": 1.7044670581817627, + "learning_rate": 4.9833420282003524e-05, + "loss": 5.2877, + "step": 6183 + }, + { + "epoch": 0.036777999809686934, + "grad_norm": 1.8328427076339722, + "learning_rate": 4.983336644574437e-05, + "loss": 5.5019, + "step": 6184 + }, + { + "epoch": 0.03678394709296793, + "grad_norm": 1.600780725479126, + "learning_rate": 4.983331260081614e-05, + "loss": 5.5347, + "step": 6185 + }, + { + "epoch": 0.03678989437624893, + "grad_norm": 1.8333978652954102, + "learning_rate": 4.983325874721886e-05, + "loss": 5.5127, + "step": 6186 + }, + { + "epoch": 0.036795841659529926, + "grad_norm": 1.8825682401657104, + "learning_rate": 4.9833204884952546e-05, + "loss": 5.5338, + "step": 6187 + }, + { + "epoch": 0.03680178894281092, + "grad_norm": 1.6875951290130615, + "learning_rate": 4.983315101401721e-05, + "loss": 5.2465, + "step": 6188 + }, + { + "epoch": 0.036807736226091924, + "grad_norm": 1.6224017143249512, + "learning_rate": 4.983309713441289e-05, + "loss": 5.4741, + "step": 6189 + }, + { + "epoch": 0.03681368350937292, + "grad_norm": 1.991721272468567, + "learning_rate": 4.983304324613958e-05, + "loss": 5.4547, + "step": 6190 + }, + { + "epoch": 0.036819630792653914, + "grad_norm": 1.843961238861084, + "learning_rate": 4.983298934919732e-05, + "loss": 5.3262, + "step": 6191 + }, + { + "epoch": 0.036825578075934916, + "grad_norm": 1.8342533111572266, + "learning_rate": 4.983293544358612e-05, + "loss": 5.6808, + "step": 6192 + }, + { + "epoch": 0.03683152535921591, + "grad_norm": 1.8796159029006958, + "learning_rate": 4.983288152930599e-05, + "loss": 5.5454, + "step": 6193 + }, + { + "epoch": 0.036837472642496906, + "grad_norm": 1.9033316373825073, + "learning_rate": 4.983282760635696e-05, + "loss": 5.3566, + "step": 6194 + }, + { + "epoch": 0.03684341992577791, + "grad_norm": 1.915873408317566, + "learning_rate": 4.9832773674739054e-05, + "loss": 5.4555, + "step": 6195 + }, + { + "epoch": 0.0368493672090589, + "grad_norm": 1.8510993719100952, + "learning_rate": 4.983271973445228e-05, + "loss": 5.5042, + "step": 6196 + }, + { + "epoch": 0.0368553144923399, + "grad_norm": 1.7180782556533813, + "learning_rate": 4.983266578549666e-05, + "loss": 5.4671, + "step": 6197 + }, + { + "epoch": 0.03686126177562089, + "grad_norm": 1.7828874588012695, + "learning_rate": 4.983261182787221e-05, + "loss": 5.4943, + "step": 6198 + }, + { + "epoch": 0.036867209058901895, + "grad_norm": 1.5032141208648682, + "learning_rate": 4.983255786157895e-05, + "loss": 5.3881, + "step": 6199 + }, + { + "epoch": 0.03687315634218289, + "grad_norm": 2.530954599380493, + "learning_rate": 4.983250388661691e-05, + "loss": 5.4449, + "step": 6200 + }, + { + "epoch": 0.036879103625463885, + "grad_norm": 2.011044979095459, + "learning_rate": 4.983244990298609e-05, + "loss": 5.2722, + "step": 6201 + }, + { + "epoch": 0.03688505090874489, + "grad_norm": 2.2209532260894775, + "learning_rate": 4.9832395910686525e-05, + "loss": 5.0932, + "step": 6202 + }, + { + "epoch": 0.03689099819202588, + "grad_norm": 1.8695623874664307, + "learning_rate": 4.983234190971823e-05, + "loss": 5.2891, + "step": 6203 + }, + { + "epoch": 0.03689694547530688, + "grad_norm": 2.172349691390991, + "learning_rate": 4.983228790008121e-05, + "loss": 5.578, + "step": 6204 + }, + { + "epoch": 0.03690289275858788, + "grad_norm": 2.1099209785461426, + "learning_rate": 4.9832233881775505e-05, + "loss": 5.3708, + "step": 6205 + }, + { + "epoch": 0.036908840041868875, + "grad_norm": 2.16737961769104, + "learning_rate": 4.9832179854801116e-05, + "loss": 5.303, + "step": 6206 + }, + { + "epoch": 0.03691478732514987, + "grad_norm": 2.248220682144165, + "learning_rate": 4.983212581915807e-05, + "loss": 5.362, + "step": 6207 + }, + { + "epoch": 0.03692073460843087, + "grad_norm": 2.0701045989990234, + "learning_rate": 4.983207177484639e-05, + "loss": 5.4528, + "step": 6208 + }, + { + "epoch": 0.03692668189171187, + "grad_norm": 1.9989019632339478, + "learning_rate": 4.983201772186609e-05, + "loss": 5.786, + "step": 6209 + }, + { + "epoch": 0.03693262917499286, + "grad_norm": 1.9126088619232178, + "learning_rate": 4.983196366021719e-05, + "loss": 5.2312, + "step": 6210 + }, + { + "epoch": 0.03693857645827386, + "grad_norm": 2.1317548751831055, + "learning_rate": 4.9831909589899695e-05, + "loss": 5.3028, + "step": 6211 + }, + { + "epoch": 0.03694452374155486, + "grad_norm": 2.164898157119751, + "learning_rate": 4.983185551091365e-05, + "loss": 5.3186, + "step": 6212 + }, + { + "epoch": 0.036950471024835854, + "grad_norm": 2.1085855960845947, + "learning_rate": 4.983180142325906e-05, + "loss": 5.3026, + "step": 6213 + }, + { + "epoch": 0.03695641830811685, + "grad_norm": 1.8321222066879272, + "learning_rate": 4.983174732693594e-05, + "loss": 5.6632, + "step": 6214 + }, + { + "epoch": 0.03696236559139785, + "grad_norm": 2.0537941455841064, + "learning_rate": 4.983169322194432e-05, + "loss": 5.2269, + "step": 6215 + }, + { + "epoch": 0.036968312874678846, + "grad_norm": 1.9598063230514526, + "learning_rate": 4.98316391082842e-05, + "loss": 5.4974, + "step": 6216 + }, + { + "epoch": 0.03697426015795984, + "grad_norm": 2.3764376640319824, + "learning_rate": 4.983158498595563e-05, + "loss": 5.7715, + "step": 6217 + }, + { + "epoch": 0.036980207441240844, + "grad_norm": 1.8938835859298706, + "learning_rate": 4.9831530854958595e-05, + "loss": 5.5577, + "step": 6218 + }, + { + "epoch": 0.03698615472452184, + "grad_norm": 2.2023189067840576, + "learning_rate": 4.9831476715293134e-05, + "loss": 5.2596, + "step": 6219 + }, + { + "epoch": 0.036992102007802834, + "grad_norm": 1.9010800123214722, + "learning_rate": 4.9831422566959266e-05, + "loss": 5.3313, + "step": 6220 + }, + { + "epoch": 0.036998049291083836, + "grad_norm": 1.9679474830627441, + "learning_rate": 4.9831368409957e-05, + "loss": 5.2701, + "step": 6221 + }, + { + "epoch": 0.03700399657436483, + "grad_norm": 1.903558373451233, + "learning_rate": 4.983131424428635e-05, + "loss": 5.2821, + "step": 6222 + }, + { + "epoch": 0.037009943857645826, + "grad_norm": 1.976114273071289, + "learning_rate": 4.983126006994736e-05, + "loss": 5.374, + "step": 6223 + }, + { + "epoch": 0.03701589114092683, + "grad_norm": 2.9803311824798584, + "learning_rate": 4.983120588694003e-05, + "loss": 5.3576, + "step": 6224 + }, + { + "epoch": 0.03702183842420782, + "grad_norm": 1.5921218395233154, + "learning_rate": 4.983115169526438e-05, + "loss": 5.1654, + "step": 6225 + }, + { + "epoch": 0.03702778570748882, + "grad_norm": 1.7458349466323853, + "learning_rate": 4.983109749492043e-05, + "loss": 5.1038, + "step": 6226 + }, + { + "epoch": 0.03703373299076981, + "grad_norm": 1.9425132274627686, + "learning_rate": 4.983104328590821e-05, + "loss": 5.3815, + "step": 6227 + }, + { + "epoch": 0.037039680274050815, + "grad_norm": 1.9506715536117554, + "learning_rate": 4.983098906822772e-05, + "loss": 5.2215, + "step": 6228 + }, + { + "epoch": 0.03704562755733181, + "grad_norm": 1.8596410751342773, + "learning_rate": 4.983093484187899e-05, + "loss": 5.2058, + "step": 6229 + }, + { + "epoch": 0.037051574840612805, + "grad_norm": 1.720473289489746, + "learning_rate": 4.9830880606862043e-05, + "loss": 5.2701, + "step": 6230 + }, + { + "epoch": 0.03705752212389381, + "grad_norm": 1.7786411046981812, + "learning_rate": 4.983082636317688e-05, + "loss": 5.3216, + "step": 6231 + }, + { + "epoch": 0.0370634694071748, + "grad_norm": 3.6291537284851074, + "learning_rate": 4.983077211082354e-05, + "loss": 5.2282, + "step": 6232 + }, + { + "epoch": 0.0370694166904558, + "grad_norm": 1.7453030347824097, + "learning_rate": 4.983071784980203e-05, + "loss": 5.2667, + "step": 6233 + }, + { + "epoch": 0.0370753639737368, + "grad_norm": 1.7036694288253784, + "learning_rate": 4.983066358011238e-05, + "loss": 5.3023, + "step": 6234 + }, + { + "epoch": 0.037081311257017795, + "grad_norm": 1.7196505069732666, + "learning_rate": 4.9830609301754595e-05, + "loss": 5.2211, + "step": 6235 + }, + { + "epoch": 0.03708725854029879, + "grad_norm": 3.4630305767059326, + "learning_rate": 4.983055501472871e-05, + "loss": 5.6159, + "step": 6236 + }, + { + "epoch": 0.03709320582357979, + "grad_norm": 2.9739367961883545, + "learning_rate": 4.9830500719034726e-05, + "loss": 5.4477, + "step": 6237 + }, + { + "epoch": 0.03709915310686079, + "grad_norm": 2.760664463043213, + "learning_rate": 4.983044641467267e-05, + "loss": 5.0879, + "step": 6238 + }, + { + "epoch": 0.03710510039014178, + "grad_norm": 2.166203022003174, + "learning_rate": 4.9830392101642566e-05, + "loss": 5.5635, + "step": 6239 + }, + { + "epoch": 0.03711104767342278, + "grad_norm": 2.3798410892486572, + "learning_rate": 4.9830337779944425e-05, + "loss": 5.0676, + "step": 6240 + }, + { + "epoch": 0.03711699495670378, + "grad_norm": 2.3990557193756104, + "learning_rate": 4.983028344957827e-05, + "loss": 5.2788, + "step": 6241 + }, + { + "epoch": 0.037122942239984774, + "grad_norm": 2.487978458404541, + "learning_rate": 4.9830229110544124e-05, + "loss": 5.852, + "step": 6242 + }, + { + "epoch": 0.03712888952326577, + "grad_norm": 2.304749011993408, + "learning_rate": 4.9830174762842e-05, + "loss": 6.0886, + "step": 6243 + }, + { + "epoch": 0.03713483680654677, + "grad_norm": 2.169614791870117, + "learning_rate": 4.983012040647191e-05, + "loss": 6.1178, + "step": 6244 + }, + { + "epoch": 0.037140784089827766, + "grad_norm": 2.119131326675415, + "learning_rate": 4.98300660414339e-05, + "loss": 6.25, + "step": 6245 + }, + { + "epoch": 0.03714673137310876, + "grad_norm": 2.3797547817230225, + "learning_rate": 4.9830011667727964e-05, + "loss": 5.879, + "step": 6246 + }, + { + "epoch": 0.03715267865638976, + "grad_norm": 2.303718328475952, + "learning_rate": 4.982995728535411e-05, + "loss": 6.0015, + "step": 6247 + }, + { + "epoch": 0.03715862593967076, + "grad_norm": 2.867103099822998, + "learning_rate": 4.9829902894312396e-05, + "loss": 5.8726, + "step": 6248 + }, + { + "epoch": 0.037164573222951754, + "grad_norm": 2.4248557090759277, + "learning_rate": 4.9829848494602806e-05, + "loss": 5.6579, + "step": 6249 + }, + { + "epoch": 0.037170520506232756, + "grad_norm": 2.2622148990631104, + "learning_rate": 4.982979408622538e-05, + "loss": 5.7677, + "step": 6250 + }, + { + "epoch": 0.03717646778951375, + "grad_norm": 2.320502996444702, + "learning_rate": 4.9829739669180126e-05, + "loss": 5.7362, + "step": 6251 + }, + { + "epoch": 0.037182415072794746, + "grad_norm": 2.2096636295318604, + "learning_rate": 4.9829685243467065e-05, + "loss": 5.9069, + "step": 6252 + }, + { + "epoch": 0.03718836235607575, + "grad_norm": 2.620361089706421, + "learning_rate": 4.982963080908623e-05, + "loss": 5.9419, + "step": 6253 + }, + { + "epoch": 0.03719430963935674, + "grad_norm": 2.478158950805664, + "learning_rate": 4.982957636603761e-05, + "loss": 6.4776, + "step": 6254 + }, + { + "epoch": 0.03720025692263774, + "grad_norm": 2.5912528038024902, + "learning_rate": 4.982952191432125e-05, + "loss": 5.7176, + "step": 6255 + }, + { + "epoch": 0.03720620420591873, + "grad_norm": 2.57177734375, + "learning_rate": 4.982946745393716e-05, + "loss": 5.4271, + "step": 6256 + }, + { + "epoch": 0.037212151489199735, + "grad_norm": 2.424567699432373, + "learning_rate": 4.982941298488535e-05, + "loss": 5.82, + "step": 6257 + }, + { + "epoch": 0.03721809877248073, + "grad_norm": 2.477827548980713, + "learning_rate": 4.9829358507165856e-05, + "loss": 5.7961, + "step": 6258 + }, + { + "epoch": 0.037224046055761725, + "grad_norm": 2.0598270893096924, + "learning_rate": 4.982930402077869e-05, + "loss": 5.9264, + "step": 6259 + }, + { + "epoch": 0.03722999333904273, + "grad_norm": 2.0599095821380615, + "learning_rate": 4.9829249525723875e-05, + "loss": 6.0518, + "step": 6260 + }, + { + "epoch": 0.03723594062232372, + "grad_norm": 2.110170841217041, + "learning_rate": 4.982919502200142e-05, + "loss": 5.8631, + "step": 6261 + }, + { + "epoch": 0.03724188790560472, + "grad_norm": 2.333972930908203, + "learning_rate": 4.982914050961135e-05, + "loss": 5.5361, + "step": 6262 + }, + { + "epoch": 0.03724783518888572, + "grad_norm": 2.2322769165039062, + "learning_rate": 4.982908598855369e-05, + "loss": 5.8002, + "step": 6263 + }, + { + "epoch": 0.037253782472166715, + "grad_norm": 1.9915717840194702, + "learning_rate": 4.982903145882845e-05, + "loss": 5.7096, + "step": 6264 + }, + { + "epoch": 0.03725972975544771, + "grad_norm": 2.2031619548797607, + "learning_rate": 4.9828976920435645e-05, + "loss": 5.5716, + "step": 6265 + }, + { + "epoch": 0.03726567703872871, + "grad_norm": 2.9422314167022705, + "learning_rate": 4.9828922373375295e-05, + "loss": 5.929, + "step": 6266 + }, + { + "epoch": 0.03727162432200971, + "grad_norm": 3.264784336090088, + "learning_rate": 4.982886781764744e-05, + "loss": 5.9801, + "step": 6267 + }, + { + "epoch": 0.0372775716052907, + "grad_norm": 2.8314197063446045, + "learning_rate": 4.982881325325208e-05, + "loss": 6.0173, + "step": 6268 + }, + { + "epoch": 0.0372835188885717, + "grad_norm": 2.9550328254699707, + "learning_rate": 4.9828758680189234e-05, + "loss": 5.9838, + "step": 6269 + }, + { + "epoch": 0.0372894661718527, + "grad_norm": 2.6827526092529297, + "learning_rate": 4.9828704098458924e-05, + "loss": 6.0235, + "step": 6270 + }, + { + "epoch": 0.037295413455133694, + "grad_norm": 2.7174222469329834, + "learning_rate": 4.982864950806118e-05, + "loss": 5.8315, + "step": 6271 + }, + { + "epoch": 0.03730136073841469, + "grad_norm": 2.6177315711975098, + "learning_rate": 4.9828594908996e-05, + "loss": 5.8577, + "step": 6272 + }, + { + "epoch": 0.03730730802169569, + "grad_norm": 2.449669361114502, + "learning_rate": 4.982854030126342e-05, + "loss": 5.9591, + "step": 6273 + }, + { + "epoch": 0.037313255304976686, + "grad_norm": 2.5328989028930664, + "learning_rate": 4.9828485684863446e-05, + "loss": 5.7764, + "step": 6274 + }, + { + "epoch": 0.03731920258825768, + "grad_norm": 2.2581989765167236, + "learning_rate": 4.982843105979611e-05, + "loss": 5.9524, + "step": 6275 + }, + { + "epoch": 0.03732514987153868, + "grad_norm": 2.261212110519409, + "learning_rate": 4.982837642606142e-05, + "loss": 5.5814, + "step": 6276 + }, + { + "epoch": 0.03733109715481968, + "grad_norm": 2.2957348823547363, + "learning_rate": 4.98283217836594e-05, + "loss": 5.6967, + "step": 6277 + }, + { + "epoch": 0.037337044438100674, + "grad_norm": 2.814037322998047, + "learning_rate": 4.982826713259008e-05, + "loss": 5.8787, + "step": 6278 + }, + { + "epoch": 0.037342991721381676, + "grad_norm": 2.678133249282837, + "learning_rate": 4.9828212472853464e-05, + "loss": 5.94, + "step": 6279 + }, + { + "epoch": 0.03734893900466267, + "grad_norm": 2.2949652671813965, + "learning_rate": 4.982815780444957e-05, + "loss": 5.7263, + "step": 6280 + }, + { + "epoch": 0.037354886287943666, + "grad_norm": 2.4542131423950195, + "learning_rate": 4.982810312737842e-05, + "loss": 5.8317, + "step": 6281 + }, + { + "epoch": 0.03736083357122467, + "grad_norm": 2.7850544452667236, + "learning_rate": 4.982804844164005e-05, + "loss": 5.5631, + "step": 6282 + }, + { + "epoch": 0.03736678085450566, + "grad_norm": 2.6285061836242676, + "learning_rate": 4.9827993747234454e-05, + "loss": 5.6212, + "step": 6283 + }, + { + "epoch": 0.03737272813778666, + "grad_norm": 2.602590799331665, + "learning_rate": 4.9827939044161666e-05, + "loss": 5.5529, + "step": 6284 + }, + { + "epoch": 0.03737867542106765, + "grad_norm": 2.6196670532226562, + "learning_rate": 4.98278843324217e-05, + "loss": 5.6915, + "step": 6285 + }, + { + "epoch": 0.037384622704348655, + "grad_norm": 2.7072317600250244, + "learning_rate": 4.982782961201457e-05, + "loss": 5.7535, + "step": 6286 + }, + { + "epoch": 0.03739056998762965, + "grad_norm": 2.626033067703247, + "learning_rate": 4.982777488294031e-05, + "loss": 5.6053, + "step": 6287 + }, + { + "epoch": 0.037396517270910645, + "grad_norm": 1.8426648378372192, + "learning_rate": 4.982772014519892e-05, + "loss": 5.6167, + "step": 6288 + }, + { + "epoch": 0.03740246455419165, + "grad_norm": 2.5587830543518066, + "learning_rate": 4.9827665398790445e-05, + "loss": 5.6442, + "step": 6289 + }, + { + "epoch": 0.03740841183747264, + "grad_norm": 2.6163039207458496, + "learning_rate": 4.9827610643714877e-05, + "loss": 5.699, + "step": 6290 + }, + { + "epoch": 0.03741435912075364, + "grad_norm": 2.5752358436584473, + "learning_rate": 4.982755587997225e-05, + "loss": 5.666, + "step": 6291 + }, + { + "epoch": 0.03742030640403464, + "grad_norm": 2.6609575748443604, + "learning_rate": 4.982750110756258e-05, + "loss": 5.5634, + "step": 6292 + }, + { + "epoch": 0.037426253687315635, + "grad_norm": 2.724731683731079, + "learning_rate": 4.9827446326485884e-05, + "loss": 5.6259, + "step": 6293 + }, + { + "epoch": 0.03743220097059663, + "grad_norm": 2.5849807262420654, + "learning_rate": 4.9827391536742185e-05, + "loss": 5.6182, + "step": 6294 + }, + { + "epoch": 0.03743814825387763, + "grad_norm": 2.6737449169158936, + "learning_rate": 4.9827336738331496e-05, + "loss": 5.5426, + "step": 6295 + }, + { + "epoch": 0.03744409553715863, + "grad_norm": 2.5739669799804688, + "learning_rate": 4.9827281931253844e-05, + "loss": 5.6283, + "step": 6296 + }, + { + "epoch": 0.03745004282043962, + "grad_norm": 2.652730703353882, + "learning_rate": 4.982722711550924e-05, + "loss": 5.5241, + "step": 6297 + }, + { + "epoch": 0.037455990103720624, + "grad_norm": 2.7140653133392334, + "learning_rate": 4.982717229109772e-05, + "loss": 5.7052, + "step": 6298 + }, + { + "epoch": 0.03746193738700162, + "grad_norm": 2.1617860794067383, + "learning_rate": 4.982711745801928e-05, + "loss": 5.6224, + "step": 6299 + }, + { + "epoch": 0.037467884670282614, + "grad_norm": 2.1400585174560547, + "learning_rate": 4.982706261627395e-05, + "loss": 5.5753, + "step": 6300 + }, + { + "epoch": 0.03747383195356361, + "grad_norm": 2.4439101219177246, + "learning_rate": 4.9827007765861754e-05, + "loss": 5.6219, + "step": 6301 + }, + { + "epoch": 0.03747977923684461, + "grad_norm": 2.507141351699829, + "learning_rate": 4.9826952906782697e-05, + "loss": 5.6666, + "step": 6302 + }, + { + "epoch": 0.037485726520125606, + "grad_norm": 2.2664029598236084, + "learning_rate": 4.982689803903682e-05, + "loss": 5.7792, + "step": 6303 + }, + { + "epoch": 0.0374916738034066, + "grad_norm": 2.49678635597229, + "learning_rate": 4.982684316262411e-05, + "loss": 5.5899, + "step": 6304 + }, + { + "epoch": 0.0374976210866876, + "grad_norm": 2.244603395462036, + "learning_rate": 4.9826788277544625e-05, + "loss": 5.4624, + "step": 6305 + }, + { + "epoch": 0.0375035683699686, + "grad_norm": 2.144343376159668, + "learning_rate": 4.9826733383798366e-05, + "loss": 5.3428, + "step": 6306 + }, + { + "epoch": 0.037509515653249594, + "grad_norm": 1.7709565162658691, + "learning_rate": 4.982667848138534e-05, + "loss": 5.3596, + "step": 6307 + }, + { + "epoch": 0.037515462936530596, + "grad_norm": 2.0245232582092285, + "learning_rate": 4.9826623570305574e-05, + "loss": 5.4005, + "step": 6308 + }, + { + "epoch": 0.03752141021981159, + "grad_norm": 2.5346829891204834, + "learning_rate": 4.9826568650559095e-05, + "loss": 5.5089, + "step": 6309 + }, + { + "epoch": 0.037527357503092586, + "grad_norm": 2.638684034347534, + "learning_rate": 4.982651372214592e-05, + "loss": 5.6847, + "step": 6310 + }, + { + "epoch": 0.03753330478637359, + "grad_norm": 2.024423122406006, + "learning_rate": 4.982645878506606e-05, + "loss": 5.3633, + "step": 6311 + }, + { + "epoch": 0.03753925206965458, + "grad_norm": 1.983167290687561, + "learning_rate": 4.982640383931955e-05, + "loss": 5.2086, + "step": 6312 + }, + { + "epoch": 0.03754519935293558, + "grad_norm": 1.8388524055480957, + "learning_rate": 4.982634888490639e-05, + "loss": 5.1904, + "step": 6313 + }, + { + "epoch": 0.03755114663621657, + "grad_norm": 1.8280584812164307, + "learning_rate": 4.982629392182661e-05, + "loss": 5.3072, + "step": 6314 + }, + { + "epoch": 0.037557093919497575, + "grad_norm": 1.6278408765792847, + "learning_rate": 4.982623895008023e-05, + "loss": 5.3003, + "step": 6315 + }, + { + "epoch": 0.03756304120277857, + "grad_norm": 2.0519096851348877, + "learning_rate": 4.982618396966726e-05, + "loss": 5.3494, + "step": 6316 + }, + { + "epoch": 0.037568988486059565, + "grad_norm": 1.935744285583496, + "learning_rate": 4.982612898058773e-05, + "loss": 5.6993, + "step": 6317 + }, + { + "epoch": 0.03757493576934057, + "grad_norm": 1.882163166999817, + "learning_rate": 4.9826073982841656e-05, + "loss": 5.758, + "step": 6318 + }, + { + "epoch": 0.03758088305262156, + "grad_norm": 1.7747882604599, + "learning_rate": 4.982601897642906e-05, + "loss": 5.1501, + "step": 6319 + }, + { + "epoch": 0.03758683033590256, + "grad_norm": 2.044093370437622, + "learning_rate": 4.982596396134995e-05, + "loss": 5.2801, + "step": 6320 + }, + { + "epoch": 0.03759277761918356, + "grad_norm": 1.739441990852356, + "learning_rate": 4.9825908937604346e-05, + "loss": 5.1619, + "step": 6321 + }, + { + "epoch": 0.037598724902464555, + "grad_norm": 2.0353312492370605, + "learning_rate": 4.982585390519229e-05, + "loss": 5.6796, + "step": 6322 + }, + { + "epoch": 0.03760467218574555, + "grad_norm": 2.076667308807373, + "learning_rate": 4.9825798864113774e-05, + "loss": 6.2522, + "step": 6323 + }, + { + "epoch": 0.03761061946902655, + "grad_norm": 2.773676633834839, + "learning_rate": 4.982574381436883e-05, + "loss": 5.879, + "step": 6324 + }, + { + "epoch": 0.03761656675230755, + "grad_norm": 2.2013933658599854, + "learning_rate": 4.982568875595748e-05, + "loss": 6.0341, + "step": 6325 + }, + { + "epoch": 0.03762251403558854, + "grad_norm": 2.288806915283203, + "learning_rate": 4.9825633688879736e-05, + "loss": 6.219, + "step": 6326 + }, + { + "epoch": 0.037628461318869544, + "grad_norm": 2.874372720718384, + "learning_rate": 4.982557861313561e-05, + "loss": 5.7616, + "step": 6327 + }, + { + "epoch": 0.03763440860215054, + "grad_norm": 2.7471537590026855, + "learning_rate": 4.982552352872515e-05, + "loss": 5.7214, + "step": 6328 + }, + { + "epoch": 0.037640355885431534, + "grad_norm": 2.475513458251953, + "learning_rate": 4.982546843564834e-05, + "loss": 6.0039, + "step": 6329 + }, + { + "epoch": 0.03764630316871253, + "grad_norm": 2.5376412868499756, + "learning_rate": 4.982541333390523e-05, + "loss": 6.3042, + "step": 6330 + }, + { + "epoch": 0.03765225045199353, + "grad_norm": 2.599989414215088, + "learning_rate": 4.9825358223495814e-05, + "loss": 6.488, + "step": 6331 + }, + { + "epoch": 0.037658197735274526, + "grad_norm": 2.2657089233398438, + "learning_rate": 4.9825303104420115e-05, + "loss": 6.2743, + "step": 6332 + }, + { + "epoch": 0.03766414501855552, + "grad_norm": 2.303926467895508, + "learning_rate": 4.982524797667818e-05, + "loss": 6.3888, + "step": 6333 + }, + { + "epoch": 0.03767009230183652, + "grad_norm": 2.771775007247925, + "learning_rate": 4.982519284026999e-05, + "loss": 6.0911, + "step": 6334 + }, + { + "epoch": 0.03767603958511752, + "grad_norm": 2.492748260498047, + "learning_rate": 4.982513769519559e-05, + "loss": 5.9905, + "step": 6335 + }, + { + "epoch": 0.03768198686839851, + "grad_norm": 2.294985771179199, + "learning_rate": 4.982508254145498e-05, + "loss": 6.4574, + "step": 6336 + }, + { + "epoch": 0.037687934151679515, + "grad_norm": 2.6514554023742676, + "learning_rate": 4.9825027379048205e-05, + "loss": 6.1541, + "step": 6337 + }, + { + "epoch": 0.03769388143496051, + "grad_norm": 2.0114963054656982, + "learning_rate": 4.982497220797526e-05, + "loss": 6.0602, + "step": 6338 + }, + { + "epoch": 0.037699828718241506, + "grad_norm": 2.6345295906066895, + "learning_rate": 4.982491702823618e-05, + "loss": 6.024, + "step": 6339 + }, + { + "epoch": 0.03770577600152251, + "grad_norm": 2.619980573654175, + "learning_rate": 4.982486183983097e-05, + "loss": 6.0642, + "step": 6340 + }, + { + "epoch": 0.0377117232848035, + "grad_norm": 2.491279125213623, + "learning_rate": 4.9824806642759664e-05, + "loss": 5.8517, + "step": 6341 + }, + { + "epoch": 0.0377176705680845, + "grad_norm": 2.5161385536193848, + "learning_rate": 4.982475143702227e-05, + "loss": 5.7467, + "step": 6342 + }, + { + "epoch": 0.03772361785136549, + "grad_norm": 2.3237602710723877, + "learning_rate": 4.982469622261882e-05, + "loss": 5.801, + "step": 6343 + }, + { + "epoch": 0.037729565134646495, + "grad_norm": 2.21382999420166, + "learning_rate": 4.9824640999549314e-05, + "loss": 5.968, + "step": 6344 + }, + { + "epoch": 0.03773551241792749, + "grad_norm": 2.1770498752593994, + "learning_rate": 4.9824585767813794e-05, + "loss": 6.2998, + "step": 6345 + }, + { + "epoch": 0.037741459701208485, + "grad_norm": 2.321563720703125, + "learning_rate": 4.982453052741225e-05, + "loss": 5.631, + "step": 6346 + }, + { + "epoch": 0.03774740698448949, + "grad_norm": 3.2769439220428467, + "learning_rate": 4.982447527834473e-05, + "loss": 5.4845, + "step": 6347 + }, + { + "epoch": 0.03775335426777048, + "grad_norm": 2.954331874847412, + "learning_rate": 4.9824420020611244e-05, + "loss": 5.2, + "step": 6348 + }, + { + "epoch": 0.03775930155105148, + "grad_norm": 2.735182523727417, + "learning_rate": 4.98243647542118e-05, + "loss": 5.1907, + "step": 6349 + }, + { + "epoch": 0.03776524883433248, + "grad_norm": 2.872142791748047, + "learning_rate": 4.982430947914644e-05, + "loss": 5.5159, + "step": 6350 + }, + { + "epoch": 0.037771196117613474, + "grad_norm": 3.14219331741333, + "learning_rate": 4.982425419541517e-05, + "loss": 5.0843, + "step": 6351 + }, + { + "epoch": 0.03777714340089447, + "grad_norm": 2.2689874172210693, + "learning_rate": 4.9824198903018e-05, + "loss": 6.0446, + "step": 6352 + }, + { + "epoch": 0.03778309068417547, + "grad_norm": 2.3468856811523438, + "learning_rate": 4.982414360195496e-05, + "loss": 5.952, + "step": 6353 + }, + { + "epoch": 0.03778903796745647, + "grad_norm": 2.944509983062744, + "learning_rate": 4.9824088292226065e-05, + "loss": 5.4918, + "step": 6354 + }, + { + "epoch": 0.03779498525073746, + "grad_norm": 2.8139286041259766, + "learning_rate": 4.982403297383135e-05, + "loss": 5.3296, + "step": 6355 + }, + { + "epoch": 0.037800932534018464, + "grad_norm": 2.540224552154541, + "learning_rate": 4.982397764677081e-05, + "loss": 5.3464, + "step": 6356 + }, + { + "epoch": 0.03780687981729946, + "grad_norm": 2.56709885597229, + "learning_rate": 4.982392231104448e-05, + "loss": 5.2313, + "step": 6357 + }, + { + "epoch": 0.037812827100580454, + "grad_norm": 2.2051165103912354, + "learning_rate": 4.982386696665238e-05, + "loss": 5.7783, + "step": 6358 + }, + { + "epoch": 0.03781877438386145, + "grad_norm": 2.5773870944976807, + "learning_rate": 4.9823811613594515e-05, + "loss": 5.6691, + "step": 6359 + }, + { + "epoch": 0.03782472166714245, + "grad_norm": 2.5163073539733887, + "learning_rate": 4.982375625187092e-05, + "loss": 5.7936, + "step": 6360 + }, + { + "epoch": 0.037830668950423446, + "grad_norm": 2.4268851280212402, + "learning_rate": 4.98237008814816e-05, + "loss": 5.8116, + "step": 6361 + }, + { + "epoch": 0.03783661623370444, + "grad_norm": 2.397402286529541, + "learning_rate": 4.9823645502426597e-05, + "loss": 5.9895, + "step": 6362 + }, + { + "epoch": 0.03784256351698544, + "grad_norm": 2.590672731399536, + "learning_rate": 4.98235901147059e-05, + "loss": 5.9022, + "step": 6363 + }, + { + "epoch": 0.03784851080026644, + "grad_norm": 2.268540859222412, + "learning_rate": 4.9823534718319557e-05, + "loss": 5.8958, + "step": 6364 + }, + { + "epoch": 0.03785445808354743, + "grad_norm": 2.1419460773468018, + "learning_rate": 4.982347931326757e-05, + "loss": 5.8446, + "step": 6365 + }, + { + "epoch": 0.037860405366828435, + "grad_norm": 2.3988053798675537, + "learning_rate": 4.9823423899549957e-05, + "loss": 6.2267, + "step": 6366 + }, + { + "epoch": 0.03786635265010943, + "grad_norm": 2.120121955871582, + "learning_rate": 4.9823368477166755e-05, + "loss": 6.1352, + "step": 6367 + }, + { + "epoch": 0.037872299933390426, + "grad_norm": 2.274610996246338, + "learning_rate": 4.982331304611796e-05, + "loss": 6.1342, + "step": 6368 + }, + { + "epoch": 0.03787824721667143, + "grad_norm": 1.6934765577316284, + "learning_rate": 4.98232576064036e-05, + "loss": 5.7969, + "step": 6369 + }, + { + "epoch": 0.03788419449995242, + "grad_norm": 2.62416672706604, + "learning_rate": 4.982320215802371e-05, + "loss": 5.9669, + "step": 6370 + }, + { + "epoch": 0.03789014178323342, + "grad_norm": 2.416639804840088, + "learning_rate": 4.98231467009783e-05, + "loss": 5.9628, + "step": 6371 + }, + { + "epoch": 0.03789608906651441, + "grad_norm": 2.049412965774536, + "learning_rate": 4.9823091235267375e-05, + "loss": 5.658, + "step": 6372 + }, + { + "epoch": 0.037902036349795415, + "grad_norm": 2.0502147674560547, + "learning_rate": 4.982303576089097e-05, + "loss": 5.9114, + "step": 6373 + }, + { + "epoch": 0.03790798363307641, + "grad_norm": 2.1566948890686035, + "learning_rate": 4.982298027784909e-05, + "loss": 5.6932, + "step": 6374 + }, + { + "epoch": 0.037913930916357405, + "grad_norm": 2.394083261489868, + "learning_rate": 4.9822924786141774e-05, + "loss": 6.3041, + "step": 6375 + }, + { + "epoch": 0.03791987819963841, + "grad_norm": 2.545910120010376, + "learning_rate": 4.9822869285769024e-05, + "loss": 6.2125, + "step": 6376 + }, + { + "epoch": 0.0379258254829194, + "grad_norm": 2.271461248397827, + "learning_rate": 4.9822813776730875e-05, + "loss": 6.2322, + "step": 6377 + }, + { + "epoch": 0.0379317727662004, + "grad_norm": 2.3840630054473877, + "learning_rate": 4.9822758259027336e-05, + "loss": 6.0167, + "step": 6378 + }, + { + "epoch": 0.0379377200494814, + "grad_norm": 2.600618600845337, + "learning_rate": 4.9822702732658426e-05, + "loss": 5.6722, + "step": 6379 + }, + { + "epoch": 0.037943667332762394, + "grad_norm": 2.0911965370178223, + "learning_rate": 4.982264719762417e-05, + "loss": 5.579, + "step": 6380 + }, + { + "epoch": 0.03794961461604339, + "grad_norm": 2.015505075454712, + "learning_rate": 4.9822591653924575e-05, + "loss": 5.9747, + "step": 6381 + }, + { + "epoch": 0.03795556189932439, + "grad_norm": 2.237262010574341, + "learning_rate": 4.982253610155968e-05, + "loss": 6.3792, + "step": 6382 + }, + { + "epoch": 0.03796150918260539, + "grad_norm": 2.1448137760162354, + "learning_rate": 4.982248054052949e-05, + "loss": 6.1049, + "step": 6383 + }, + { + "epoch": 0.03796745646588638, + "grad_norm": 2.2597758769989014, + "learning_rate": 4.9822424970834034e-05, + "loss": 5.8428, + "step": 6384 + }, + { + "epoch": 0.037973403749167384, + "grad_norm": 1.9935969114303589, + "learning_rate": 4.982236939247332e-05, + "loss": 6.0032, + "step": 6385 + }, + { + "epoch": 0.03797935103244838, + "grad_norm": 2.506916046142578, + "learning_rate": 4.982231380544737e-05, + "loss": 5.9221, + "step": 6386 + }, + { + "epoch": 0.037985298315729374, + "grad_norm": 2.083393096923828, + "learning_rate": 4.9822258209756214e-05, + "loss": 5.8862, + "step": 6387 + }, + { + "epoch": 0.03799124559901037, + "grad_norm": 2.631091594696045, + "learning_rate": 4.982220260539987e-05, + "loss": 5.6593, + "step": 6388 + }, + { + "epoch": 0.03799719288229137, + "grad_norm": 2.5732531547546387, + "learning_rate": 4.982214699237834e-05, + "loss": 5.5084, + "step": 6389 + }, + { + "epoch": 0.038003140165572366, + "grad_norm": 2.7797791957855225, + "learning_rate": 4.982209137069166e-05, + "loss": 5.6792, + "step": 6390 + }, + { + "epoch": 0.03800908744885336, + "grad_norm": 2.2800772190093994, + "learning_rate": 4.982203574033984e-05, + "loss": 5.6299, + "step": 6391 + }, + { + "epoch": 0.03801503473213436, + "grad_norm": 2.4182863235473633, + "learning_rate": 4.9821980101322905e-05, + "loss": 5.71, + "step": 6392 + }, + { + "epoch": 0.03802098201541536, + "grad_norm": 2.2968835830688477, + "learning_rate": 4.982192445364088e-05, + "loss": 5.6112, + "step": 6393 + }, + { + "epoch": 0.03802692929869635, + "grad_norm": 2.3713324069976807, + "learning_rate": 4.982186879729377e-05, + "loss": 5.423, + "step": 6394 + }, + { + "epoch": 0.038032876581977355, + "grad_norm": 2.745352268218994, + "learning_rate": 4.98218131322816e-05, + "loss": 5.5145, + "step": 6395 + }, + { + "epoch": 0.03803882386525835, + "grad_norm": 2.755211353302002, + "learning_rate": 4.98217574586044e-05, + "loss": 5.4399, + "step": 6396 + }, + { + "epoch": 0.038044771148539346, + "grad_norm": 2.5452096462249756, + "learning_rate": 4.982170177626217e-05, + "loss": 5.5691, + "step": 6397 + }, + { + "epoch": 0.03805071843182035, + "grad_norm": 2.6195876598358154, + "learning_rate": 4.9821646085254954e-05, + "loss": 5.4512, + "step": 6398 + }, + { + "epoch": 0.03805666571510134, + "grad_norm": 2.4931671619415283, + "learning_rate": 4.982159038558275e-05, + "loss": 6.0505, + "step": 6399 + }, + { + "epoch": 0.03806261299838234, + "grad_norm": 2.45062255859375, + "learning_rate": 4.982153467724558e-05, + "loss": 6.2367, + "step": 6400 + }, + { + "epoch": 0.03806856028166333, + "grad_norm": 2.688624620437622, + "learning_rate": 4.982147896024348e-05, + "loss": 6.0522, + "step": 6401 + }, + { + "epoch": 0.038074507564944335, + "grad_norm": 2.421660900115967, + "learning_rate": 4.982142323457645e-05, + "loss": 5.8166, + "step": 6402 + }, + { + "epoch": 0.03808045484822533, + "grad_norm": 2.594134569168091, + "learning_rate": 4.982136750024452e-05, + "loss": 5.5476, + "step": 6403 + }, + { + "epoch": 0.038086402131506325, + "grad_norm": 2.4492971897125244, + "learning_rate": 4.982131175724771e-05, + "loss": 5.2302, + "step": 6404 + }, + { + "epoch": 0.03809234941478733, + "grad_norm": 2.4200360774993896, + "learning_rate": 4.9821256005586036e-05, + "loss": 6.1404, + "step": 6405 + }, + { + "epoch": 0.03809829669806832, + "grad_norm": 2.1949775218963623, + "learning_rate": 4.982120024525951e-05, + "loss": 5.9589, + "step": 6406 + }, + { + "epoch": 0.03810424398134932, + "grad_norm": 2.3570375442504883, + "learning_rate": 4.9821144476268164e-05, + "loss": 5.9022, + "step": 6407 + }, + { + "epoch": 0.03811019126463032, + "grad_norm": 2.16460919380188, + "learning_rate": 4.9821088698612016e-05, + "loss": 5.8535, + "step": 6408 + }, + { + "epoch": 0.038116138547911314, + "grad_norm": 1.8189443349838257, + "learning_rate": 4.982103291229108e-05, + "loss": 5.9345, + "step": 6409 + }, + { + "epoch": 0.03812208583119231, + "grad_norm": 2.553919792175293, + "learning_rate": 4.9820977117305376e-05, + "loss": 5.31, + "step": 6410 + }, + { + "epoch": 0.03812803311447331, + "grad_norm": 2.8085403442382812, + "learning_rate": 4.982092131365493e-05, + "loss": 4.9902, + "step": 6411 + }, + { + "epoch": 0.03813398039775431, + "grad_norm": 2.3698999881744385, + "learning_rate": 4.982086550133976e-05, + "loss": 5.4982, + "step": 6412 + }, + { + "epoch": 0.0381399276810353, + "grad_norm": 1.996026873588562, + "learning_rate": 4.9820809680359876e-05, + "loss": 5.6556, + "step": 6413 + }, + { + "epoch": 0.038145874964316304, + "grad_norm": 2.0816900730133057, + "learning_rate": 4.9820753850715305e-05, + "loss": 5.8823, + "step": 6414 + }, + { + "epoch": 0.0381518222475973, + "grad_norm": 2.282745122909546, + "learning_rate": 4.982069801240606e-05, + "loss": 5.1641, + "step": 6415 + }, + { + "epoch": 0.038157769530878294, + "grad_norm": 2.043991804122925, + "learning_rate": 4.982064216543217e-05, + "loss": 5.7569, + "step": 6416 + }, + { + "epoch": 0.03816371681415929, + "grad_norm": 2.086071014404297, + "learning_rate": 4.982058630979365e-05, + "loss": 5.9586, + "step": 6417 + }, + { + "epoch": 0.03816966409744029, + "grad_norm": 2.295060873031616, + "learning_rate": 4.9820530445490525e-05, + "loss": 5.3733, + "step": 6418 + }, + { + "epoch": 0.038175611380721286, + "grad_norm": 2.512267827987671, + "learning_rate": 4.98204745725228e-05, + "loss": 5.0399, + "step": 6419 + }, + { + "epoch": 0.03818155866400228, + "grad_norm": 2.5434467792510986, + "learning_rate": 4.982041869089051e-05, + "loss": 4.7907, + "step": 6420 + }, + { + "epoch": 0.03818750594728328, + "grad_norm": 2.4192142486572266, + "learning_rate": 4.9820362800593666e-05, + "loss": 4.9116, + "step": 6421 + }, + { + "epoch": 0.03819345323056428, + "grad_norm": 2.867542028427124, + "learning_rate": 4.9820306901632296e-05, + "loss": 5.9905, + "step": 6422 + }, + { + "epoch": 0.03819940051384527, + "grad_norm": 2.3099327087402344, + "learning_rate": 4.982025099400641e-05, + "loss": 5.9319, + "step": 6423 + }, + { + "epoch": 0.038205347797126275, + "grad_norm": 2.28169584274292, + "learning_rate": 4.9820195077716026e-05, + "loss": 6.2533, + "step": 6424 + }, + { + "epoch": 0.03821129508040727, + "grad_norm": 2.1065595149993896, + "learning_rate": 4.9820139152761167e-05, + "loss": 5.7123, + "step": 6425 + }, + { + "epoch": 0.038217242363688265, + "grad_norm": 2.0210213661193848, + "learning_rate": 4.9820083219141865e-05, + "loss": 5.7758, + "step": 6426 + }, + { + "epoch": 0.03822318964696927, + "grad_norm": 1.6545369625091553, + "learning_rate": 4.9820027276858114e-05, + "loss": 5.6792, + "step": 6427 + }, + { + "epoch": 0.03822913693025026, + "grad_norm": 2.177621841430664, + "learning_rate": 4.981997132590996e-05, + "loss": 6.0167, + "step": 6428 + }, + { + "epoch": 0.03823508421353126, + "grad_norm": 2.3910553455352783, + "learning_rate": 4.981991536629741e-05, + "loss": 6.1161, + "step": 6429 + }, + { + "epoch": 0.03824103149681225, + "grad_norm": 2.4915859699249268, + "learning_rate": 4.981985939802047e-05, + "loss": 5.6449, + "step": 6430 + }, + { + "epoch": 0.038246978780093255, + "grad_norm": 2.0343215465545654, + "learning_rate": 4.981980342107919e-05, + "loss": 5.967, + "step": 6431 + }, + { + "epoch": 0.03825292606337425, + "grad_norm": 1.8326199054718018, + "learning_rate": 4.9819747435473565e-05, + "loss": 5.9183, + "step": 6432 + }, + { + "epoch": 0.038258873346655245, + "grad_norm": 2.1482350826263428, + "learning_rate": 4.981969144120362e-05, + "loss": 5.794, + "step": 6433 + }, + { + "epoch": 0.03826482062993625, + "grad_norm": 2.346355438232422, + "learning_rate": 4.9819635438269384e-05, + "loss": 5.6775, + "step": 6434 + }, + { + "epoch": 0.03827076791321724, + "grad_norm": 2.252150774002075, + "learning_rate": 4.981957942667087e-05, + "loss": 5.9383, + "step": 6435 + }, + { + "epoch": 0.03827671519649824, + "grad_norm": 2.1851654052734375, + "learning_rate": 4.981952340640809e-05, + "loss": 6.0555, + "step": 6436 + }, + { + "epoch": 0.03828266247977924, + "grad_norm": 2.0609381198883057, + "learning_rate": 4.9819467377481076e-05, + "loss": 6.3209, + "step": 6437 + }, + { + "epoch": 0.038288609763060234, + "grad_norm": 2.4882800579071045, + "learning_rate": 4.981941133988984e-05, + "loss": 6.2411, + "step": 6438 + }, + { + "epoch": 0.03829455704634123, + "grad_norm": 1.8794118165969849, + "learning_rate": 4.981935529363441e-05, + "loss": 5.5696, + "step": 6439 + }, + { + "epoch": 0.03830050432962223, + "grad_norm": 2.542656660079956, + "learning_rate": 4.981929923871479e-05, + "loss": 5.8106, + "step": 6440 + }, + { + "epoch": 0.038306451612903226, + "grad_norm": 2.3871288299560547, + "learning_rate": 4.981924317513101e-05, + "loss": 5.6354, + "step": 6441 + }, + { + "epoch": 0.03831239889618422, + "grad_norm": 2.4628939628601074, + "learning_rate": 4.981918710288309e-05, + "loss": 5.9695, + "step": 6442 + }, + { + "epoch": 0.038318346179465224, + "grad_norm": 2.908543586730957, + "learning_rate": 4.9819131021971056e-05, + "loss": 5.2742, + "step": 6443 + }, + { + "epoch": 0.03832429346274622, + "grad_norm": 3.353813886642456, + "learning_rate": 4.9819074932394916e-05, + "loss": 5.3823, + "step": 6444 + }, + { + "epoch": 0.038330240746027214, + "grad_norm": 2.5253870487213135, + "learning_rate": 4.981901883415469e-05, + "loss": 5.7, + "step": 6445 + }, + { + "epoch": 0.03833618802930821, + "grad_norm": 2.3375632762908936, + "learning_rate": 4.98189627272504e-05, + "loss": 5.2862, + "step": 6446 + }, + { + "epoch": 0.03834213531258921, + "grad_norm": 2.534599542617798, + "learning_rate": 4.981890661168207e-05, + "loss": 5.3961, + "step": 6447 + }, + { + "epoch": 0.038348082595870206, + "grad_norm": 2.383511781692505, + "learning_rate": 4.9818850487449716e-05, + "loss": 6.4658, + "step": 6448 + }, + { + "epoch": 0.0383540298791512, + "grad_norm": 2.2824161052703857, + "learning_rate": 4.981879435455336e-05, + "loss": 5.5221, + "step": 6449 + }, + { + "epoch": 0.0383599771624322, + "grad_norm": 2.355271100997925, + "learning_rate": 4.981873821299301e-05, + "loss": 5.5054, + "step": 6450 + }, + { + "epoch": 0.0383659244457132, + "grad_norm": 2.0071253776550293, + "learning_rate": 4.981868206276871e-05, + "loss": 5.5911, + "step": 6451 + }, + { + "epoch": 0.03837187172899419, + "grad_norm": 2.2770705223083496, + "learning_rate": 4.9818625903880445e-05, + "loss": 5.8978, + "step": 6452 + }, + { + "epoch": 0.038377819012275195, + "grad_norm": 2.2425332069396973, + "learning_rate": 4.981856973632827e-05, + "loss": 6.3189, + "step": 6453 + }, + { + "epoch": 0.03838376629555619, + "grad_norm": 2.300560235977173, + "learning_rate": 4.981851356011218e-05, + "loss": 5.745, + "step": 6454 + }, + { + "epoch": 0.038389713578837185, + "grad_norm": 2.4516983032226562, + "learning_rate": 4.981845737523221e-05, + "loss": 5.8978, + "step": 6455 + }, + { + "epoch": 0.03839566086211819, + "grad_norm": 2.3463354110717773, + "learning_rate": 4.981840118168837e-05, + "loss": 5.668, + "step": 6456 + }, + { + "epoch": 0.03840160814539918, + "grad_norm": 2.623608112335205, + "learning_rate": 4.981834497948068e-05, + "loss": 5.471, + "step": 6457 + }, + { + "epoch": 0.03840755542868018, + "grad_norm": 2.441089391708374, + "learning_rate": 4.9818288768609166e-05, + "loss": 5.0986, + "step": 6458 + }, + { + "epoch": 0.03841350271196117, + "grad_norm": 2.597635507583618, + "learning_rate": 4.981823254907384e-05, + "loss": 5.1046, + "step": 6459 + }, + { + "epoch": 0.038419449995242175, + "grad_norm": 2.344855785369873, + "learning_rate": 4.9818176320874727e-05, + "loss": 5.8878, + "step": 6460 + }, + { + "epoch": 0.03842539727852317, + "grad_norm": 2.2569222450256348, + "learning_rate": 4.981812008401184e-05, + "loss": 5.342, + "step": 6461 + }, + { + "epoch": 0.038431344561804165, + "grad_norm": 2.276780843734741, + "learning_rate": 4.981806383848522e-05, + "loss": 5.566, + "step": 6462 + }, + { + "epoch": 0.03843729184508517, + "grad_norm": 2.1354174613952637, + "learning_rate": 4.9818007584294856e-05, + "loss": 5.8678, + "step": 6463 + }, + { + "epoch": 0.03844323912836616, + "grad_norm": 2.164092779159546, + "learning_rate": 4.981795132144078e-05, + "loss": 5.7937, + "step": 6464 + }, + { + "epoch": 0.03844918641164716, + "grad_norm": 2.3034324645996094, + "learning_rate": 4.981789504992303e-05, + "loss": 5.843, + "step": 6465 + }, + { + "epoch": 0.03845513369492816, + "grad_norm": 1.9616999626159668, + "learning_rate": 4.9817838769741584e-05, + "loss": 6.0563, + "step": 6466 + }, + { + "epoch": 0.038461080978209154, + "grad_norm": 2.2784626483917236, + "learning_rate": 4.9817782480896505e-05, + "loss": 6.4152, + "step": 6467 + }, + { + "epoch": 0.03846702826149015, + "grad_norm": 1.8581526279449463, + "learning_rate": 4.981772618338779e-05, + "loss": 5.9833, + "step": 6468 + }, + { + "epoch": 0.03847297554477115, + "grad_norm": 2.2493395805358887, + "learning_rate": 4.9817669877215466e-05, + "loss": 6.2985, + "step": 6469 + }, + { + "epoch": 0.038478922828052146, + "grad_norm": 2.289125919342041, + "learning_rate": 4.981761356237955e-05, + "loss": 5.8555, + "step": 6470 + }, + { + "epoch": 0.03848487011133314, + "grad_norm": 2.11012601852417, + "learning_rate": 4.981755723888006e-05, + "loss": 6.6137, + "step": 6471 + }, + { + "epoch": 0.038490817394614144, + "grad_norm": 2.1793103218078613, + "learning_rate": 4.981750090671702e-05, + "loss": 6.0117, + "step": 6472 + }, + { + "epoch": 0.03849676467789514, + "grad_norm": 2.1857750415802, + "learning_rate": 4.9817444565890436e-05, + "loss": 5.9877, + "step": 6473 + }, + { + "epoch": 0.038502711961176134, + "grad_norm": 1.7430874109268188, + "learning_rate": 4.981738821640035e-05, + "loss": 5.829, + "step": 6474 + }, + { + "epoch": 0.03850865924445713, + "grad_norm": 1.8017771244049072, + "learning_rate": 4.981733185824676e-05, + "loss": 6.3853, + "step": 6475 + }, + { + "epoch": 0.03851460652773813, + "grad_norm": 2.1420724391937256, + "learning_rate": 4.9817275491429705e-05, + "loss": 5.982, + "step": 6476 + }, + { + "epoch": 0.038520553811019126, + "grad_norm": 2.441521167755127, + "learning_rate": 4.9817219115949195e-05, + "loss": 6.1159, + "step": 6477 + }, + { + "epoch": 0.03852650109430012, + "grad_norm": 2.158682346343994, + "learning_rate": 4.9817162731805246e-05, + "loss": 6.1306, + "step": 6478 + }, + { + "epoch": 0.03853244837758112, + "grad_norm": 2.154538869857788, + "learning_rate": 4.9817106338997884e-05, + "loss": 6.0745, + "step": 6479 + }, + { + "epoch": 0.03853839566086212, + "grad_norm": 2.077674388885498, + "learning_rate": 4.981704993752713e-05, + "loss": 6.2171, + "step": 6480 + }, + { + "epoch": 0.03854434294414311, + "grad_norm": 2.181500196456909, + "learning_rate": 4.981699352739299e-05, + "loss": 6.228, + "step": 6481 + }, + { + "epoch": 0.038550290227424115, + "grad_norm": 2.678189992904663, + "learning_rate": 4.98169371085955e-05, + "loss": 5.965, + "step": 6482 + }, + { + "epoch": 0.03855623751070511, + "grad_norm": 2.713480234146118, + "learning_rate": 4.981688068113467e-05, + "loss": 5.9078, + "step": 6483 + }, + { + "epoch": 0.038562184793986105, + "grad_norm": 2.4872853755950928, + "learning_rate": 4.981682424501053e-05, + "loss": 5.7525, + "step": 6484 + }, + { + "epoch": 0.03856813207726711, + "grad_norm": 2.274711847305298, + "learning_rate": 4.98167678002231e-05, + "loss": 5.9193, + "step": 6485 + }, + { + "epoch": 0.0385740793605481, + "grad_norm": 2.4730162620544434, + "learning_rate": 4.981671134677238e-05, + "loss": 6.2961, + "step": 6486 + }, + { + "epoch": 0.0385800266438291, + "grad_norm": 1.7856062650680542, + "learning_rate": 4.9816654884658396e-05, + "loss": 5.9005, + "step": 6487 + }, + { + "epoch": 0.03858597392711009, + "grad_norm": 1.8812140226364136, + "learning_rate": 4.981659841388119e-05, + "loss": 5.9428, + "step": 6488 + }, + { + "epoch": 0.038591921210391095, + "grad_norm": 1.9963254928588867, + "learning_rate": 4.9816541934440756e-05, + "loss": 6.0136, + "step": 6489 + }, + { + "epoch": 0.03859786849367209, + "grad_norm": 2.741892099380493, + "learning_rate": 4.981648544633713e-05, + "loss": 6.5065, + "step": 6490 + }, + { + "epoch": 0.038603815776953085, + "grad_norm": 2.226672410964966, + "learning_rate": 4.981642894957032e-05, + "loss": 5.9705, + "step": 6491 + }, + { + "epoch": 0.03860976306023409, + "grad_norm": 2.015429973602295, + "learning_rate": 4.981637244414036e-05, + "loss": 6.1418, + "step": 6492 + }, + { + "epoch": 0.03861571034351508, + "grad_norm": 2.032304286956787, + "learning_rate": 4.981631593004725e-05, + "loss": 6.2104, + "step": 6493 + }, + { + "epoch": 0.03862165762679608, + "grad_norm": 2.0174217224121094, + "learning_rate": 4.981625940729102e-05, + "loss": 5.9861, + "step": 6494 + }, + { + "epoch": 0.03862760491007708, + "grad_norm": 1.9466323852539062, + "learning_rate": 4.98162028758717e-05, + "loss": 6.0958, + "step": 6495 + }, + { + "epoch": 0.038633552193358074, + "grad_norm": 1.6796106100082397, + "learning_rate": 4.9816146335789296e-05, + "loss": 6.0708, + "step": 6496 + }, + { + "epoch": 0.03863949947663907, + "grad_norm": 2.0496580600738525, + "learning_rate": 4.9816089787043826e-05, + "loss": 6.0137, + "step": 6497 + }, + { + "epoch": 0.03864544675992007, + "grad_norm": 2.5402488708496094, + "learning_rate": 4.9816033229635324e-05, + "loss": 6.1389, + "step": 6498 + }, + { + "epoch": 0.038651394043201066, + "grad_norm": 2.2701938152313232, + "learning_rate": 4.9815976663563795e-05, + "loss": 6.1277, + "step": 6499 + }, + { + "epoch": 0.03865734132648206, + "grad_norm": 2.328554630279541, + "learning_rate": 4.9815920088829273e-05, + "loss": 6.0402, + "step": 6500 + }, + { + "epoch": 0.038663288609763063, + "grad_norm": 2.1817965507507324, + "learning_rate": 4.981586350543176e-05, + "loss": 6.2732, + "step": 6501 + }, + { + "epoch": 0.03866923589304406, + "grad_norm": 2.4273757934570312, + "learning_rate": 4.981580691337129e-05, + "loss": 6.1842, + "step": 6502 + }, + { + "epoch": 0.038675183176325054, + "grad_norm": 2.1365530490875244, + "learning_rate": 4.981575031264787e-05, + "loss": 6.1527, + "step": 6503 + }, + { + "epoch": 0.03868113045960605, + "grad_norm": 2.2198991775512695, + "learning_rate": 4.981569370326154e-05, + "loss": 6.0841, + "step": 6504 + }, + { + "epoch": 0.03868707774288705, + "grad_norm": 2.0078141689300537, + "learning_rate": 4.98156370852123e-05, + "loss": 6.0401, + "step": 6505 + }, + { + "epoch": 0.038693025026168046, + "grad_norm": 2.0243566036224365, + "learning_rate": 4.9815580458500184e-05, + "loss": 5.9111, + "step": 6506 + }, + { + "epoch": 0.03869897230944904, + "grad_norm": 2.3084707260131836, + "learning_rate": 4.98155238231252e-05, + "loss": 5.9865, + "step": 6507 + }, + { + "epoch": 0.03870491959273004, + "grad_norm": 1.8110517263412476, + "learning_rate": 4.981546717908738e-05, + "loss": 5.9132, + "step": 6508 + }, + { + "epoch": 0.03871086687601104, + "grad_norm": 2.2639706134796143, + "learning_rate": 4.981541052638673e-05, + "loss": 5.8195, + "step": 6509 + }, + { + "epoch": 0.03871681415929203, + "grad_norm": 2.2684152126312256, + "learning_rate": 4.981535386502327e-05, + "loss": 6.4894, + "step": 6510 + }, + { + "epoch": 0.038722761442573035, + "grad_norm": 2.363118886947632, + "learning_rate": 4.981529719499704e-05, + "loss": 6.1888, + "step": 6511 + }, + { + "epoch": 0.03872870872585403, + "grad_norm": 2.2158865928649902, + "learning_rate": 4.9815240516308045e-05, + "loss": 6.3361, + "step": 6512 + }, + { + "epoch": 0.038734656009135025, + "grad_norm": 2.096928834915161, + "learning_rate": 4.98151838289563e-05, + "loss": 5.8554, + "step": 6513 + }, + { + "epoch": 0.03874060329241603, + "grad_norm": 2.2228331565856934, + "learning_rate": 4.981512713294183e-05, + "loss": 5.9961, + "step": 6514 + }, + { + "epoch": 0.03874655057569702, + "grad_norm": 1.8646903038024902, + "learning_rate": 4.981507042826466e-05, + "loss": 6.1471, + "step": 6515 + }, + { + "epoch": 0.03875249785897802, + "grad_norm": 2.227267265319824, + "learning_rate": 4.98150137149248e-05, + "loss": 5.9655, + "step": 6516 + }, + { + "epoch": 0.03875844514225902, + "grad_norm": 2.6884701251983643, + "learning_rate": 4.981495699292228e-05, + "loss": 5.7958, + "step": 6517 + }, + { + "epoch": 0.038764392425540015, + "grad_norm": 2.953523635864258, + "learning_rate": 4.981490026225711e-05, + "loss": 5.8305, + "step": 6518 + }, + { + "epoch": 0.03877033970882101, + "grad_norm": 2.5009984970092773, + "learning_rate": 4.981484352292932e-05, + "loss": 5.7838, + "step": 6519 + }, + { + "epoch": 0.038776286992102005, + "grad_norm": 2.2291715145111084, + "learning_rate": 4.981478677493892e-05, + "loss": 5.7622, + "step": 6520 + }, + { + "epoch": 0.03878223427538301, + "grad_norm": 2.1492466926574707, + "learning_rate": 4.9814730018285935e-05, + "loss": 5.5379, + "step": 6521 + }, + { + "epoch": 0.038788181558664, + "grad_norm": 1.8914062976837158, + "learning_rate": 4.981467325297039e-05, + "loss": 5.8368, + "step": 6522 + }, + { + "epoch": 0.038794128841945, + "grad_norm": 2.301670789718628, + "learning_rate": 4.981461647899229e-05, + "loss": 5.9019, + "step": 6523 + }, + { + "epoch": 0.038800076125226, + "grad_norm": 2.2850520610809326, + "learning_rate": 4.981455969635167e-05, + "loss": 5.6616, + "step": 6524 + }, + { + "epoch": 0.038806023408506994, + "grad_norm": 2.4155313968658447, + "learning_rate": 4.9814502905048546e-05, + "loss": 5.7842, + "step": 6525 + }, + { + "epoch": 0.03881197069178799, + "grad_norm": 2.0731799602508545, + "learning_rate": 4.981444610508293e-05, + "loss": 6.084, + "step": 6526 + }, + { + "epoch": 0.03881791797506899, + "grad_norm": 2.990232229232788, + "learning_rate": 4.981438929645484e-05, + "loss": 5.2556, + "step": 6527 + }, + { + "epoch": 0.038823865258349986, + "grad_norm": 3.0814263820648193, + "learning_rate": 4.981433247916432e-05, + "loss": 5.1895, + "step": 6528 + }, + { + "epoch": 0.03882981254163098, + "grad_norm": 3.197000503540039, + "learning_rate": 4.9814275653211365e-05, + "loss": 4.9539, + "step": 6529 + }, + { + "epoch": 0.03883575982491198, + "grad_norm": 3.062098979949951, + "learning_rate": 4.9814218818596e-05, + "loss": 4.8417, + "step": 6530 + }, + { + "epoch": 0.03884170710819298, + "grad_norm": 3.092667579650879, + "learning_rate": 4.981416197531825e-05, + "loss": 5.0479, + "step": 6531 + }, + { + "epoch": 0.038847654391473974, + "grad_norm": 3.00508713722229, + "learning_rate": 4.981410512337813e-05, + "loss": 5.864, + "step": 6532 + }, + { + "epoch": 0.03885360167475497, + "grad_norm": 3.3760926723480225, + "learning_rate": 4.981404826277567e-05, + "loss": 6.5745, + "step": 6533 + }, + { + "epoch": 0.03885954895803597, + "grad_norm": 2.6170921325683594, + "learning_rate": 4.981399139351087e-05, + "loss": 5.7959, + "step": 6534 + }, + { + "epoch": 0.038865496241316966, + "grad_norm": 2.9855849742889404, + "learning_rate": 4.981393451558377e-05, + "loss": 4.9118, + "step": 6535 + }, + { + "epoch": 0.03887144352459796, + "grad_norm": 2.885373830795288, + "learning_rate": 4.981387762899438e-05, + "loss": 4.8342, + "step": 6536 + }, + { + "epoch": 0.03887739080787896, + "grad_norm": 2.6936960220336914, + "learning_rate": 4.981382073374272e-05, + "loss": 4.7323, + "step": 6537 + }, + { + "epoch": 0.03888333809115996, + "grad_norm": 2.7214853763580322, + "learning_rate": 4.981376382982882e-05, + "loss": 5.5414, + "step": 6538 + }, + { + "epoch": 0.03888928537444095, + "grad_norm": 2.449828863143921, + "learning_rate": 4.981370691725269e-05, + "loss": 5.6385, + "step": 6539 + }, + { + "epoch": 0.038895232657721955, + "grad_norm": 2.551046133041382, + "learning_rate": 4.981364999601434e-05, + "loss": 5.4699, + "step": 6540 + }, + { + "epoch": 0.03890117994100295, + "grad_norm": 2.1208136081695557, + "learning_rate": 4.981359306611381e-05, + "loss": 5.6674, + "step": 6541 + }, + { + "epoch": 0.038907127224283945, + "grad_norm": 2.4039392471313477, + "learning_rate": 4.9813536127551105e-05, + "loss": 6.1872, + "step": 6542 + }, + { + "epoch": 0.03891307450756495, + "grad_norm": 2.0119946002960205, + "learning_rate": 4.9813479180326256e-05, + "loss": 6.0917, + "step": 6543 + }, + { + "epoch": 0.03891902179084594, + "grad_norm": 3.2959303855895996, + "learning_rate": 4.9813422224439275e-05, + "loss": 5.5646, + "step": 6544 + }, + { + "epoch": 0.03892496907412694, + "grad_norm": 2.9011316299438477, + "learning_rate": 4.981336525989019e-05, + "loss": 5.5324, + "step": 6545 + }, + { + "epoch": 0.03893091635740794, + "grad_norm": 2.2984118461608887, + "learning_rate": 4.981330828667901e-05, + "loss": 5.4961, + "step": 6546 + }, + { + "epoch": 0.038936863640688935, + "grad_norm": 2.1745059490203857, + "learning_rate": 4.981325130480576e-05, + "loss": 5.6631, + "step": 6547 + }, + { + "epoch": 0.03894281092396993, + "grad_norm": 2.3001794815063477, + "learning_rate": 4.981319431427046e-05, + "loss": 5.5897, + "step": 6548 + }, + { + "epoch": 0.038948758207250925, + "grad_norm": 2.329446315765381, + "learning_rate": 4.9813137315073136e-05, + "loss": 5.4599, + "step": 6549 + }, + { + "epoch": 0.03895470549053193, + "grad_norm": 2.4700307846069336, + "learning_rate": 4.98130803072138e-05, + "loss": 5.2788, + "step": 6550 + }, + { + "epoch": 0.03896065277381292, + "grad_norm": 2.309767484664917, + "learning_rate": 4.9813023290692467e-05, + "loss": 5.3828, + "step": 6551 + }, + { + "epoch": 0.03896660005709392, + "grad_norm": 2.1923089027404785, + "learning_rate": 4.981296626550917e-05, + "loss": 5.225, + "step": 6552 + }, + { + "epoch": 0.03897254734037492, + "grad_norm": 2.424954652786255, + "learning_rate": 4.981290923166392e-05, + "loss": 5.2007, + "step": 6553 + }, + { + "epoch": 0.038978494623655914, + "grad_norm": 2.53446102142334, + "learning_rate": 4.981285218915674e-05, + "loss": 5.142, + "step": 6554 + }, + { + "epoch": 0.03898444190693691, + "grad_norm": 2.492788791656494, + "learning_rate": 4.9812795137987655e-05, + "loss": 5.5755, + "step": 6555 + }, + { + "epoch": 0.03899038919021791, + "grad_norm": 2.8081278800964355, + "learning_rate": 4.9812738078156674e-05, + "loss": 4.9815, + "step": 6556 + }, + { + "epoch": 0.038996336473498906, + "grad_norm": 2.535109758377075, + "learning_rate": 4.981268100966383e-05, + "loss": 5.3678, + "step": 6557 + }, + { + "epoch": 0.0390022837567799, + "grad_norm": 2.36004900932312, + "learning_rate": 4.981262393250913e-05, + "loss": 5.0422, + "step": 6558 + }, + { + "epoch": 0.0390082310400609, + "grad_norm": 2.2315657138824463, + "learning_rate": 4.98125668466926e-05, + "loss": 5.0345, + "step": 6559 + }, + { + "epoch": 0.0390141783233419, + "grad_norm": 2.293947696685791, + "learning_rate": 4.981250975221425e-05, + "loss": 4.9308, + "step": 6560 + }, + { + "epoch": 0.039020125606622894, + "grad_norm": 2.239915132522583, + "learning_rate": 4.9812452649074124e-05, + "loss": 5.3504, + "step": 6561 + }, + { + "epoch": 0.03902607288990389, + "grad_norm": 1.8740140199661255, + "learning_rate": 4.981239553727222e-05, + "loss": 5.9432, + "step": 6562 + }, + { + "epoch": 0.03903202017318489, + "grad_norm": 1.7221744060516357, + "learning_rate": 4.981233841680857e-05, + "loss": 5.8387, + "step": 6563 + }, + { + "epoch": 0.039037967456465886, + "grad_norm": 1.9648221731185913, + "learning_rate": 4.981228128768318e-05, + "loss": 5.7836, + "step": 6564 + }, + { + "epoch": 0.03904391473974688, + "grad_norm": 1.7790826559066772, + "learning_rate": 4.981222414989608e-05, + "loss": 5.842, + "step": 6565 + }, + { + "epoch": 0.03904986202302788, + "grad_norm": 2.039483070373535, + "learning_rate": 4.9812167003447296e-05, + "loss": 5.6509, + "step": 6566 + }, + { + "epoch": 0.03905580930630888, + "grad_norm": 2.1241865158081055, + "learning_rate": 4.981210984833684e-05, + "loss": 5.5626, + "step": 6567 + }, + { + "epoch": 0.03906175658958987, + "grad_norm": 2.1290524005889893, + "learning_rate": 4.981205268456473e-05, + "loss": 5.5114, + "step": 6568 + }, + { + "epoch": 0.039067703872870875, + "grad_norm": 2.181558132171631, + "learning_rate": 4.981199551213099e-05, + "loss": 5.5356, + "step": 6569 + }, + { + "epoch": 0.03907365115615187, + "grad_norm": 2.1696360111236572, + "learning_rate": 4.9811938331035635e-05, + "loss": 5.5684, + "step": 6570 + }, + { + "epoch": 0.039079598439432865, + "grad_norm": 1.8040674924850464, + "learning_rate": 4.98118811412787e-05, + "loss": 5.605, + "step": 6571 + }, + { + "epoch": 0.03908554572271387, + "grad_norm": 2.4475252628326416, + "learning_rate": 4.981182394286018e-05, + "loss": 6.4733, + "step": 6572 + }, + { + "epoch": 0.03909149300599486, + "grad_norm": 2.0800678730010986, + "learning_rate": 4.981176673578011e-05, + "loss": 5.5613, + "step": 6573 + }, + { + "epoch": 0.03909744028927586, + "grad_norm": 1.7632306814193726, + "learning_rate": 4.981170952003852e-05, + "loss": 5.5971, + "step": 6574 + }, + { + "epoch": 0.03910338757255686, + "grad_norm": 1.6671072244644165, + "learning_rate": 4.981165229563541e-05, + "loss": 5.4462, + "step": 6575 + }, + { + "epoch": 0.039109334855837855, + "grad_norm": 1.8972923755645752, + "learning_rate": 4.981159506257081e-05, + "loss": 5.7747, + "step": 6576 + }, + { + "epoch": 0.03911528213911885, + "grad_norm": 1.8343021869659424, + "learning_rate": 4.981153782084473e-05, + "loss": 5.7542, + "step": 6577 + }, + { + "epoch": 0.039121229422399845, + "grad_norm": 1.669877529144287, + "learning_rate": 4.9811480570457216e-05, + "loss": 5.6736, + "step": 6578 + }, + { + "epoch": 0.03912717670568085, + "grad_norm": 1.9555165767669678, + "learning_rate": 4.981142331140825e-05, + "loss": 5.2997, + "step": 6579 + }, + { + "epoch": 0.03913312398896184, + "grad_norm": 2.5131587982177734, + "learning_rate": 4.981136604369789e-05, + "loss": 5.2093, + "step": 6580 + }, + { + "epoch": 0.03913907127224284, + "grad_norm": 2.0637567043304443, + "learning_rate": 4.9811308767326134e-05, + "loss": 5.1671, + "step": 6581 + }, + { + "epoch": 0.03914501855552384, + "grad_norm": 2.140839099884033, + "learning_rate": 4.9811251482293e-05, + "loss": 5.3237, + "step": 6582 + }, + { + "epoch": 0.039150965838804834, + "grad_norm": 1.968489408493042, + "learning_rate": 4.981119418859852e-05, + "loss": 5.6015, + "step": 6583 + }, + { + "epoch": 0.03915691312208583, + "grad_norm": 1.873827338218689, + "learning_rate": 4.9811136886242705e-05, + "loss": 5.3316, + "step": 6584 + }, + { + "epoch": 0.03916286040536683, + "grad_norm": 1.9897359609603882, + "learning_rate": 4.981107957522558e-05, + "loss": 5.1548, + "step": 6585 + }, + { + "epoch": 0.039168807688647826, + "grad_norm": 2.004457950592041, + "learning_rate": 4.9811022255547165e-05, + "loss": 5.1977, + "step": 6586 + }, + { + "epoch": 0.03917475497192882, + "grad_norm": 2.1058437824249268, + "learning_rate": 4.9810964927207485e-05, + "loss": 5.0217, + "step": 6587 + }, + { + "epoch": 0.03918070225520982, + "grad_norm": 1.9846851825714111, + "learning_rate": 4.981090759020654e-05, + "loss": 5.1123, + "step": 6588 + }, + { + "epoch": 0.03918664953849082, + "grad_norm": 2.018026828765869, + "learning_rate": 4.981085024454437e-05, + "loss": 5.0516, + "step": 6589 + }, + { + "epoch": 0.039192596821771813, + "grad_norm": 1.7792260646820068, + "learning_rate": 4.9810792890220995e-05, + "loss": 5.5266, + "step": 6590 + }, + { + "epoch": 0.03919854410505281, + "grad_norm": 2.0855109691619873, + "learning_rate": 4.981073552723642e-05, + "loss": 5.5504, + "step": 6591 + }, + { + "epoch": 0.03920449138833381, + "grad_norm": 1.9998018741607666, + "learning_rate": 4.9810678155590676e-05, + "loss": 5.3447, + "step": 6592 + }, + { + "epoch": 0.039210438671614806, + "grad_norm": 2.332714557647705, + "learning_rate": 4.981062077528377e-05, + "loss": 5.6166, + "step": 6593 + }, + { + "epoch": 0.0392163859548958, + "grad_norm": 1.9647892713546753, + "learning_rate": 4.981056338631575e-05, + "loss": 5.0113, + "step": 6594 + }, + { + "epoch": 0.0392223332381768, + "grad_norm": 1.9961154460906982, + "learning_rate": 4.9810505988686604e-05, + "loss": 5.0143, + "step": 6595 + }, + { + "epoch": 0.0392282805214578, + "grad_norm": 1.9039133787155151, + "learning_rate": 4.981044858239637e-05, + "loss": 5.3602, + "step": 6596 + }, + { + "epoch": 0.03923422780473879, + "grad_norm": 1.9076604843139648, + "learning_rate": 4.981039116744507e-05, + "loss": 5.4165, + "step": 6597 + }, + { + "epoch": 0.039240175088019795, + "grad_norm": 1.6676216125488281, + "learning_rate": 4.981033374383272e-05, + "loss": 5.4018, + "step": 6598 + }, + { + "epoch": 0.03924612237130079, + "grad_norm": 1.7158783674240112, + "learning_rate": 4.981027631155933e-05, + "loss": 5.3233, + "step": 6599 + }, + { + "epoch": 0.039252069654581785, + "grad_norm": 1.6659481525421143, + "learning_rate": 4.9810218870624945e-05, + "loss": 5.4671, + "step": 6600 + }, + { + "epoch": 0.03925801693786279, + "grad_norm": 2.008171319961548, + "learning_rate": 4.981016142102956e-05, + "loss": 5.6424, + "step": 6601 + }, + { + "epoch": 0.03926396422114378, + "grad_norm": 2.213045835494995, + "learning_rate": 4.9810103962773204e-05, + "loss": 5.419, + "step": 6602 + }, + { + "epoch": 0.03926991150442478, + "grad_norm": 2.0159718990325928, + "learning_rate": 4.981004649585589e-05, + "loss": 5.4301, + "step": 6603 + }, + { + "epoch": 0.03927585878770578, + "grad_norm": 1.982701063156128, + "learning_rate": 4.9809989020277646e-05, + "loss": 5.6001, + "step": 6604 + }, + { + "epoch": 0.039281806070986774, + "grad_norm": 2.1933834552764893, + "learning_rate": 4.98099315360385e-05, + "loss": 5.6756, + "step": 6605 + }, + { + "epoch": 0.03928775335426777, + "grad_norm": 1.858798623085022, + "learning_rate": 4.980987404313846e-05, + "loss": 5.43, + "step": 6606 + }, + { + "epoch": 0.039293700637548765, + "grad_norm": 1.8233433961868286, + "learning_rate": 4.980981654157755e-05, + "loss": 5.4638, + "step": 6607 + }, + { + "epoch": 0.03929964792082977, + "grad_norm": 2.0368216037750244, + "learning_rate": 4.9809759031355784e-05, + "loss": 5.71, + "step": 6608 + }, + { + "epoch": 0.03930559520411076, + "grad_norm": 1.9923310279846191, + "learning_rate": 4.9809701512473196e-05, + "loss": 5.6443, + "step": 6609 + }, + { + "epoch": 0.03931154248739176, + "grad_norm": 2.391463279724121, + "learning_rate": 4.9809643984929785e-05, + "loss": 5.4701, + "step": 6610 + }, + { + "epoch": 0.03931748977067276, + "grad_norm": 1.8456658124923706, + "learning_rate": 4.98095864487256e-05, + "loss": 5.4346, + "step": 6611 + }, + { + "epoch": 0.039323437053953754, + "grad_norm": 1.7941107749938965, + "learning_rate": 4.980952890386063e-05, + "loss": 5.4198, + "step": 6612 + }, + { + "epoch": 0.03932938433723475, + "grad_norm": 1.8455369472503662, + "learning_rate": 4.980947135033492e-05, + "loss": 5.3915, + "step": 6613 + }, + { + "epoch": 0.03933533162051575, + "grad_norm": 1.8710846900939941, + "learning_rate": 4.980941378814847e-05, + "loss": 5.2744, + "step": 6614 + }, + { + "epoch": 0.039341278903796746, + "grad_norm": 2.203129768371582, + "learning_rate": 4.980935621730132e-05, + "loss": 5.4409, + "step": 6615 + }, + { + "epoch": 0.03934722618707774, + "grad_norm": 1.8944141864776611, + "learning_rate": 4.980929863779348e-05, + "loss": 5.4661, + "step": 6616 + }, + { + "epoch": 0.03935317347035874, + "grad_norm": 1.8268091678619385, + "learning_rate": 4.9809241049624966e-05, + "loss": 5.4088, + "step": 6617 + }, + { + "epoch": 0.03935912075363974, + "grad_norm": 1.838927984237671, + "learning_rate": 4.98091834527958e-05, + "loss": 5.5335, + "step": 6618 + }, + { + "epoch": 0.03936506803692073, + "grad_norm": 1.8441804647445679, + "learning_rate": 4.9809125847306e-05, + "loss": 5.4639, + "step": 6619 + }, + { + "epoch": 0.03937101532020173, + "grad_norm": 2.012754440307617, + "learning_rate": 4.980906823315561e-05, + "loss": 5.5606, + "step": 6620 + }, + { + "epoch": 0.03937696260348273, + "grad_norm": 1.8358973264694214, + "learning_rate": 4.980901061034461e-05, + "loss": 5.4217, + "step": 6621 + }, + { + "epoch": 0.039382909886763726, + "grad_norm": 2.0668959617614746, + "learning_rate": 4.980895297887305e-05, + "loss": 5.5164, + "step": 6622 + }, + { + "epoch": 0.03938885717004472, + "grad_norm": 2.032320976257324, + "learning_rate": 4.9808895338740934e-05, + "loss": 5.4914, + "step": 6623 + }, + { + "epoch": 0.03939480445332572, + "grad_norm": 1.8650145530700684, + "learning_rate": 4.980883768994829e-05, + "loss": 5.3718, + "step": 6624 + }, + { + "epoch": 0.03940075173660672, + "grad_norm": 4.494358539581299, + "learning_rate": 4.980878003249515e-05, + "loss": 5.5253, + "step": 6625 + }, + { + "epoch": 0.03940669901988771, + "grad_norm": 1.9295374155044556, + "learning_rate": 4.980872236638151e-05, + "loss": 5.3187, + "step": 6626 + }, + { + "epoch": 0.039412646303168715, + "grad_norm": 2.089717388153076, + "learning_rate": 4.980866469160741e-05, + "loss": 5.5311, + "step": 6627 + }, + { + "epoch": 0.03941859358644971, + "grad_norm": 1.701429843902588, + "learning_rate": 4.980860700817285e-05, + "loss": 5.4529, + "step": 6628 + }, + { + "epoch": 0.039424540869730705, + "grad_norm": 1.8336073160171509, + "learning_rate": 4.980854931607787e-05, + "loss": 5.2987, + "step": 6629 + }, + { + "epoch": 0.03943048815301171, + "grad_norm": 2.7922565937042236, + "learning_rate": 4.9808491615322475e-05, + "loss": 5.3492, + "step": 6630 + }, + { + "epoch": 0.0394364354362927, + "grad_norm": 1.8253742456436157, + "learning_rate": 4.980843390590669e-05, + "loss": 5.3928, + "step": 6631 + }, + { + "epoch": 0.0394423827195737, + "grad_norm": 2.646916151046753, + "learning_rate": 4.980837618783055e-05, + "loss": 5.4329, + "step": 6632 + }, + { + "epoch": 0.0394483300028547, + "grad_norm": 2.1956236362457275, + "learning_rate": 4.980831846109405e-05, + "loss": 5.4794, + "step": 6633 + }, + { + "epoch": 0.039454277286135694, + "grad_norm": 2.7274577617645264, + "learning_rate": 4.980826072569723e-05, + "loss": 5.9666, + "step": 6634 + }, + { + "epoch": 0.03946022456941669, + "grad_norm": 1.9890350103378296, + "learning_rate": 4.98082029816401e-05, + "loss": 5.5518, + "step": 6635 + }, + { + "epoch": 0.039466171852697685, + "grad_norm": 2.7760517597198486, + "learning_rate": 4.980814522892268e-05, + "loss": 5.2777, + "step": 6636 + }, + { + "epoch": 0.03947211913597869, + "grad_norm": 2.035254716873169, + "learning_rate": 4.9808087467544995e-05, + "loss": 5.5872, + "step": 6637 + }, + { + "epoch": 0.03947806641925968, + "grad_norm": 1.9728864431381226, + "learning_rate": 4.980802969750706e-05, + "loss": 5.3357, + "step": 6638 + }, + { + "epoch": 0.03948401370254068, + "grad_norm": 1.795480489730835, + "learning_rate": 4.98079719188089e-05, + "loss": 5.6414, + "step": 6639 + }, + { + "epoch": 0.03948996098582168, + "grad_norm": 1.7882109880447388, + "learning_rate": 4.980791413145054e-05, + "loss": 5.3499, + "step": 6640 + }, + { + "epoch": 0.039495908269102674, + "grad_norm": 1.8416422605514526, + "learning_rate": 4.9807856335431994e-05, + "loss": 5.3292, + "step": 6641 + }, + { + "epoch": 0.03950185555238367, + "grad_norm": 1.9525254964828491, + "learning_rate": 4.9807798530753266e-05, + "loss": 5.2782, + "step": 6642 + }, + { + "epoch": 0.03950780283566467, + "grad_norm": 1.5100830793380737, + "learning_rate": 4.9807740717414406e-05, + "loss": 5.2807, + "step": 6643 + }, + { + "epoch": 0.039513750118945666, + "grad_norm": 2.029430866241455, + "learning_rate": 4.9807682895415406e-05, + "loss": 5.4496, + "step": 6644 + }, + { + "epoch": 0.03951969740222666, + "grad_norm": 1.7976901531219482, + "learning_rate": 4.9807625064756315e-05, + "loss": 5.1021, + "step": 6645 + }, + { + "epoch": 0.03952564468550766, + "grad_norm": 1.5770336389541626, + "learning_rate": 4.980756722543714e-05, + "loss": 5.3946, + "step": 6646 + }, + { + "epoch": 0.03953159196878866, + "grad_norm": 1.8289496898651123, + "learning_rate": 4.980750937745788e-05, + "loss": 5.4821, + "step": 6647 + }, + { + "epoch": 0.03953753925206965, + "grad_norm": 1.7413506507873535, + "learning_rate": 4.980745152081859e-05, + "loss": 5.4827, + "step": 6648 + }, + { + "epoch": 0.03954348653535065, + "grad_norm": 2.048400402069092, + "learning_rate": 4.980739365551927e-05, + "loss": 5.2359, + "step": 6649 + }, + { + "epoch": 0.03954943381863165, + "grad_norm": 2.331897735595703, + "learning_rate": 4.980733578155995e-05, + "loss": 5.2988, + "step": 6650 + }, + { + "epoch": 0.039555381101912646, + "grad_norm": 2.1224608421325684, + "learning_rate": 4.980727789894065e-05, + "loss": 5.1228, + "step": 6651 + }, + { + "epoch": 0.03956132838519364, + "grad_norm": 1.5331578254699707, + "learning_rate": 4.9807220007661374e-05, + "loss": 5.184, + "step": 6652 + }, + { + "epoch": 0.03956727566847464, + "grad_norm": 1.773489236831665, + "learning_rate": 4.980716210772216e-05, + "loss": 5.1883, + "step": 6653 + }, + { + "epoch": 0.03957322295175564, + "grad_norm": 2.119302749633789, + "learning_rate": 4.9807104199123016e-05, + "loss": 5.5437, + "step": 6654 + }, + { + "epoch": 0.03957917023503663, + "grad_norm": 2.0695033073425293, + "learning_rate": 4.9807046281863974e-05, + "loss": 5.5951, + "step": 6655 + }, + { + "epoch": 0.039585117518317635, + "grad_norm": 2.0522243976593018, + "learning_rate": 4.980698835594505e-05, + "loss": 5.2736, + "step": 6656 + }, + { + "epoch": 0.03959106480159863, + "grad_norm": 2.3200113773345947, + "learning_rate": 4.980693042136626e-05, + "loss": 5.5701, + "step": 6657 + }, + { + "epoch": 0.039597012084879625, + "grad_norm": 1.8731193542480469, + "learning_rate": 4.980687247812762e-05, + "loss": 5.3929, + "step": 6658 + }, + { + "epoch": 0.03960295936816063, + "grad_norm": 1.8390223979949951, + "learning_rate": 4.980681452622916e-05, + "loss": 5.1684, + "step": 6659 + }, + { + "epoch": 0.03960890665144162, + "grad_norm": 2.24766206741333, + "learning_rate": 4.980675656567091e-05, + "loss": 5.0232, + "step": 6660 + }, + { + "epoch": 0.03961485393472262, + "grad_norm": 2.2592451572418213, + "learning_rate": 4.980669859645286e-05, + "loss": 4.9878, + "step": 6661 + }, + { + "epoch": 0.03962080121800362, + "grad_norm": 2.14709734916687, + "learning_rate": 4.9806640618575064e-05, + "loss": 5.1036, + "step": 6662 + }, + { + "epoch": 0.039626748501284614, + "grad_norm": 2.133910655975342, + "learning_rate": 4.9806582632037516e-05, + "loss": 5.0356, + "step": 6663 + }, + { + "epoch": 0.03963269578456561, + "grad_norm": 2.2513222694396973, + "learning_rate": 4.980652463684025e-05, + "loss": 5.2357, + "step": 6664 + }, + { + "epoch": 0.039638643067846605, + "grad_norm": 2.078355312347412, + "learning_rate": 4.980646663298328e-05, + "loss": 5.3857, + "step": 6665 + }, + { + "epoch": 0.03964459035112761, + "grad_norm": 2.3798105716705322, + "learning_rate": 4.980640862046663e-05, + "loss": 5.0888, + "step": 6666 + }, + { + "epoch": 0.0396505376344086, + "grad_norm": 2.241868019104004, + "learning_rate": 4.980635059929032e-05, + "loss": 5.1397, + "step": 6667 + }, + { + "epoch": 0.0396564849176896, + "grad_norm": 2.2053534984588623, + "learning_rate": 4.9806292569454365e-05, + "loss": 4.799, + "step": 6668 + }, + { + "epoch": 0.0396624322009706, + "grad_norm": 2.2996716499328613, + "learning_rate": 4.980623453095879e-05, + "loss": 4.9597, + "step": 6669 + }, + { + "epoch": 0.039668379484251594, + "grad_norm": 1.9892657995224, + "learning_rate": 4.9806176483803615e-05, + "loss": 5.0784, + "step": 6670 + }, + { + "epoch": 0.03967432676753259, + "grad_norm": 2.2087242603302, + "learning_rate": 4.980611842798887e-05, + "loss": 5.4099, + "step": 6671 + }, + { + "epoch": 0.03968027405081359, + "grad_norm": 2.215728521347046, + "learning_rate": 4.980606036351455e-05, + "loss": 5.2889, + "step": 6672 + }, + { + "epoch": 0.039686221334094586, + "grad_norm": 2.228073835372925, + "learning_rate": 4.9806002290380705e-05, + "loss": 5.3816, + "step": 6673 + }, + { + "epoch": 0.03969216861737558, + "grad_norm": 2.209808826446533, + "learning_rate": 4.980594420858733e-05, + "loss": 5.6233, + "step": 6674 + }, + { + "epoch": 0.03969811590065658, + "grad_norm": 1.8294177055358887, + "learning_rate": 4.980588611813446e-05, + "loss": 5.5756, + "step": 6675 + }, + { + "epoch": 0.03970406318393758, + "grad_norm": 2.236435890197754, + "learning_rate": 4.980582801902212e-05, + "loss": 5.4807, + "step": 6676 + }, + { + "epoch": 0.03971001046721857, + "grad_norm": 2.528804063796997, + "learning_rate": 4.980576991125031e-05, + "loss": 5.6503, + "step": 6677 + }, + { + "epoch": 0.03971595775049957, + "grad_norm": 2.312063217163086, + "learning_rate": 4.9805711794819065e-05, + "loss": 5.5517, + "step": 6678 + }, + { + "epoch": 0.03972190503378057, + "grad_norm": 2.336134672164917, + "learning_rate": 4.98056536697284e-05, + "loss": 5.5708, + "step": 6679 + }, + { + "epoch": 0.039727852317061566, + "grad_norm": 2.2809929847717285, + "learning_rate": 4.980559553597834e-05, + "loss": 5.453, + "step": 6680 + }, + { + "epoch": 0.03973379960034256, + "grad_norm": 2.0603368282318115, + "learning_rate": 4.98055373935689e-05, + "loss": 5.3482, + "step": 6681 + }, + { + "epoch": 0.03973974688362356, + "grad_norm": 1.9654933214187622, + "learning_rate": 4.980547924250011e-05, + "loss": 5.29, + "step": 6682 + }, + { + "epoch": 0.03974569416690456, + "grad_norm": 2.4211983680725098, + "learning_rate": 4.9805421082771985e-05, + "loss": 5.4261, + "step": 6683 + }, + { + "epoch": 0.03975164145018555, + "grad_norm": 2.129987955093384, + "learning_rate": 4.9805362914384533e-05, + "loss": 5.3551, + "step": 6684 + }, + { + "epoch": 0.039757588733466555, + "grad_norm": 2.127936601638794, + "learning_rate": 4.9805304737337796e-05, + "loss": 5.4647, + "step": 6685 + }, + { + "epoch": 0.03976353601674755, + "grad_norm": 2.303382158279419, + "learning_rate": 4.980524655163178e-05, + "loss": 5.1699, + "step": 6686 + }, + { + "epoch": 0.039769483300028545, + "grad_norm": 2.6889941692352295, + "learning_rate": 4.98051883572665e-05, + "loss": 5.2031, + "step": 6687 + }, + { + "epoch": 0.03977543058330955, + "grad_norm": 3.321950674057007, + "learning_rate": 4.9805130154242e-05, + "loss": 4.9815, + "step": 6688 + }, + { + "epoch": 0.03978137786659054, + "grad_norm": 3.1951568126678467, + "learning_rate": 4.980507194255827e-05, + "loss": 4.8946, + "step": 6689 + }, + { + "epoch": 0.03978732514987154, + "grad_norm": 2.355271816253662, + "learning_rate": 4.9805013722215355e-05, + "loss": 5.9223, + "step": 6690 + }, + { + "epoch": 0.03979327243315254, + "grad_norm": 2.3401644229888916, + "learning_rate": 4.9804955493213264e-05, + "loss": 6.1826, + "step": 6691 + }, + { + "epoch": 0.039799219716433534, + "grad_norm": 2.191997766494751, + "learning_rate": 4.980489725555202e-05, + "loss": 5.5617, + "step": 6692 + }, + { + "epoch": 0.03980516699971453, + "grad_norm": 2.377803087234497, + "learning_rate": 4.9804839009231644e-05, + "loss": 5.684, + "step": 6693 + }, + { + "epoch": 0.039811114282995524, + "grad_norm": 1.9084972143173218, + "learning_rate": 4.980478075425215e-05, + "loss": 6.0291, + "step": 6694 + }, + { + "epoch": 0.039817061566276526, + "grad_norm": 2.185628890991211, + "learning_rate": 4.9804722490613566e-05, + "loss": 5.5808, + "step": 6695 + }, + { + "epoch": 0.03982300884955752, + "grad_norm": 2.3253934383392334, + "learning_rate": 4.980466421831591e-05, + "loss": 5.7076, + "step": 6696 + }, + { + "epoch": 0.03982895613283852, + "grad_norm": 2.1599392890930176, + "learning_rate": 4.98046059373592e-05, + "loss": 5.9607, + "step": 6697 + }, + { + "epoch": 0.03983490341611952, + "grad_norm": 2.093137741088867, + "learning_rate": 4.980454764774346e-05, + "loss": 6.0014, + "step": 6698 + }, + { + "epoch": 0.039840850699400514, + "grad_norm": 2.4242093563079834, + "learning_rate": 4.980448934946871e-05, + "loss": 5.6255, + "step": 6699 + }, + { + "epoch": 0.03984679798268151, + "grad_norm": 2.523277521133423, + "learning_rate": 4.980443104253497e-05, + "loss": 5.5302, + "step": 6700 + }, + { + "epoch": 0.03985274526596251, + "grad_norm": 1.7926498651504517, + "learning_rate": 4.980437272694225e-05, + "loss": 5.6467, + "step": 6701 + }, + { + "epoch": 0.039858692549243506, + "grad_norm": 1.7630435228347778, + "learning_rate": 4.980431440269059e-05, + "loss": 5.9615, + "step": 6702 + }, + { + "epoch": 0.0398646398325245, + "grad_norm": 1.8051058053970337, + "learning_rate": 4.980425606978e-05, + "loss": 6.13, + "step": 6703 + }, + { + "epoch": 0.0398705871158055, + "grad_norm": 2.104901075363159, + "learning_rate": 4.98041977282105e-05, + "loss": 6.142, + "step": 6704 + }, + { + "epoch": 0.0398765343990865, + "grad_norm": 1.7022942304611206, + "learning_rate": 4.98041393779821e-05, + "loss": 5.6764, + "step": 6705 + }, + { + "epoch": 0.03988248168236749, + "grad_norm": 2.140230178833008, + "learning_rate": 4.980408101909485e-05, + "loss": 5.9796, + "step": 6706 + }, + { + "epoch": 0.03988842896564849, + "grad_norm": 1.9564754962921143, + "learning_rate": 4.9804022651548734e-05, + "loss": 6.005, + "step": 6707 + }, + { + "epoch": 0.03989437624892949, + "grad_norm": 1.9460588693618774, + "learning_rate": 4.9803964275343795e-05, + "loss": 5.9784, + "step": 6708 + }, + { + "epoch": 0.039900323532210485, + "grad_norm": 1.7314271926879883, + "learning_rate": 4.980390589048005e-05, + "loss": 5.7766, + "step": 6709 + }, + { + "epoch": 0.03990627081549148, + "grad_norm": 2.0168917179107666, + "learning_rate": 4.9803847496957524e-05, + "loss": 5.7386, + "step": 6710 + }, + { + "epoch": 0.03991221809877248, + "grad_norm": 2.3194711208343506, + "learning_rate": 4.980378909477622e-05, + "loss": 6.1324, + "step": 6711 + }, + { + "epoch": 0.03991816538205348, + "grad_norm": 2.3532958030700684, + "learning_rate": 4.980373068393618e-05, + "loss": 6.027, + "step": 6712 + }, + { + "epoch": 0.03992411266533447, + "grad_norm": 2.5944385528564453, + "learning_rate": 4.980367226443741e-05, + "loss": 6.2892, + "step": 6713 + }, + { + "epoch": 0.039930059948615475, + "grad_norm": 1.5707015991210938, + "learning_rate": 4.9803613836279926e-05, + "loss": 5.6525, + "step": 6714 + }, + { + "epoch": 0.03993600723189647, + "grad_norm": 2.022613286972046, + "learning_rate": 4.980355539946376e-05, + "loss": 5.8943, + "step": 6715 + }, + { + "epoch": 0.039941954515177465, + "grad_norm": 1.7783907651901245, + "learning_rate": 4.980349695398894e-05, + "loss": 5.6451, + "step": 6716 + }, + { + "epoch": 0.03994790179845847, + "grad_norm": 2.098841428756714, + "learning_rate": 4.980343849985547e-05, + "loss": 6.1143, + "step": 6717 + }, + { + "epoch": 0.03995384908173946, + "grad_norm": 2.045955181121826, + "learning_rate": 4.9803380037063374e-05, + "loss": 6.1802, + "step": 6718 + }, + { + "epoch": 0.03995979636502046, + "grad_norm": 1.7324507236480713, + "learning_rate": 4.980332156561267e-05, + "loss": 6.081, + "step": 6719 + }, + { + "epoch": 0.03996574364830146, + "grad_norm": 1.795184850692749, + "learning_rate": 4.9803263085503385e-05, + "loss": 5.6075, + "step": 6720 + }, + { + "epoch": 0.039971690931582454, + "grad_norm": 2.1466586589813232, + "learning_rate": 4.980320459673554e-05, + "loss": 6.045, + "step": 6721 + }, + { + "epoch": 0.03997763821486345, + "grad_norm": 2.1261258125305176, + "learning_rate": 4.980314609930915e-05, + "loss": 6.0589, + "step": 6722 + }, + { + "epoch": 0.039983585498144444, + "grad_norm": 2.559584617614746, + "learning_rate": 4.980308759322424e-05, + "loss": 6.3894, + "step": 6723 + }, + { + "epoch": 0.039989532781425446, + "grad_norm": 2.4580929279327393, + "learning_rate": 4.980302907848083e-05, + "loss": 6.3979, + "step": 6724 + }, + { + "epoch": 0.03999548006470644, + "grad_norm": 1.8877859115600586, + "learning_rate": 4.9802970555078934e-05, + "loss": 5.5076, + "step": 6725 + }, + { + "epoch": 0.04000142734798744, + "grad_norm": 2.145123243331909, + "learning_rate": 4.9802912023018585e-05, + "loss": 6.1913, + "step": 6726 + }, + { + "epoch": 0.04000737463126844, + "grad_norm": 1.9321368932724, + "learning_rate": 4.980285348229979e-05, + "loss": 5.9614, + "step": 6727 + }, + { + "epoch": 0.040013321914549434, + "grad_norm": 1.883589506149292, + "learning_rate": 4.9802794932922577e-05, + "loss": 5.4293, + "step": 6728 + }, + { + "epoch": 0.04001926919783043, + "grad_norm": 1.9066367149353027, + "learning_rate": 4.980273637488696e-05, + "loss": 5.4299, + "step": 6729 + }, + { + "epoch": 0.04002521648111143, + "grad_norm": 1.845290184020996, + "learning_rate": 4.9802677808192963e-05, + "loss": 5.596, + "step": 6730 + }, + { + "epoch": 0.040031163764392426, + "grad_norm": 2.3295016288757324, + "learning_rate": 4.980261923284062e-05, + "loss": 6.1266, + "step": 6731 + }, + { + "epoch": 0.04003711104767342, + "grad_norm": 2.451676368713379, + "learning_rate": 4.980256064882993e-05, + "loss": 6.0578, + "step": 6732 + }, + { + "epoch": 0.04004305833095442, + "grad_norm": 2.1317830085754395, + "learning_rate": 4.9802502056160915e-05, + "loss": 6.2627, + "step": 6733 + }, + { + "epoch": 0.04004900561423542, + "grad_norm": 2.223085641860962, + "learning_rate": 4.980244345483361e-05, + "loss": 5.5751, + "step": 6734 + }, + { + "epoch": 0.04005495289751641, + "grad_norm": 2.508385181427002, + "learning_rate": 4.9802384844848035e-05, + "loss": 5.572, + "step": 6735 + }, + { + "epoch": 0.04006090018079741, + "grad_norm": 2.5150837898254395, + "learning_rate": 4.98023262262042e-05, + "loss": 5.3443, + "step": 6736 + }, + { + "epoch": 0.04006684746407841, + "grad_norm": 2.293503761291504, + "learning_rate": 4.980226759890212e-05, + "loss": 5.37, + "step": 6737 + }, + { + "epoch": 0.040072794747359405, + "grad_norm": 1.8764920234680176, + "learning_rate": 4.9802208962941834e-05, + "loss": 5.3804, + "step": 6738 + }, + { + "epoch": 0.0400787420306404, + "grad_norm": 1.8443305492401123, + "learning_rate": 4.980215031832335e-05, + "loss": 5.7787, + "step": 6739 + }, + { + "epoch": 0.0400846893139214, + "grad_norm": 2.6707816123962402, + "learning_rate": 4.980209166504669e-05, + "loss": 6.2858, + "step": 6740 + }, + { + "epoch": 0.0400906365972024, + "grad_norm": 2.3520665168762207, + "learning_rate": 4.980203300311188e-05, + "loss": 5.8069, + "step": 6741 + }, + { + "epoch": 0.04009658388048339, + "grad_norm": 2.0564348697662354, + "learning_rate": 4.980197433251893e-05, + "loss": 6.1698, + "step": 6742 + }, + { + "epoch": 0.040102531163764395, + "grad_norm": 2.205469846725464, + "learning_rate": 4.9801915653267875e-05, + "loss": 5.8401, + "step": 6743 + }, + { + "epoch": 0.04010847844704539, + "grad_norm": 2.042363405227661, + "learning_rate": 4.980185696535873e-05, + "loss": 5.9673, + "step": 6744 + }, + { + "epoch": 0.040114425730326385, + "grad_norm": 1.7575644254684448, + "learning_rate": 4.98017982687915e-05, + "loss": 5.7852, + "step": 6745 + }, + { + "epoch": 0.04012037301360739, + "grad_norm": 1.968548059463501, + "learning_rate": 4.980173956356623e-05, + "loss": 6.2085, + "step": 6746 + }, + { + "epoch": 0.04012632029688838, + "grad_norm": 2.0365097522735596, + "learning_rate": 4.980168084968292e-05, + "loss": 6.4235, + "step": 6747 + }, + { + "epoch": 0.04013226758016938, + "grad_norm": 2.7265079021453857, + "learning_rate": 4.9801622127141605e-05, + "loss": 6.0804, + "step": 6748 + }, + { + "epoch": 0.04013821486345038, + "grad_norm": 2.1604299545288086, + "learning_rate": 4.98015633959423e-05, + "loss": 5.942, + "step": 6749 + }, + { + "epoch": 0.040144162146731374, + "grad_norm": 2.4122307300567627, + "learning_rate": 4.980150465608502e-05, + "loss": 6.2877, + "step": 6750 + }, + { + "epoch": 0.04015010943001237, + "grad_norm": 2.040780782699585, + "learning_rate": 4.98014459075698e-05, + "loss": 5.645, + "step": 6751 + }, + { + "epoch": 0.040156056713293364, + "grad_norm": 2.3660147190093994, + "learning_rate": 4.980138715039665e-05, + "loss": 5.975, + "step": 6752 + }, + { + "epoch": 0.040162003996574366, + "grad_norm": 2.2332143783569336, + "learning_rate": 4.980132838456558e-05, + "loss": 6.1383, + "step": 6753 + }, + { + "epoch": 0.04016795127985536, + "grad_norm": 2.7028262615203857, + "learning_rate": 4.9801269610076635e-05, + "loss": 6.3817, + "step": 6754 + }, + { + "epoch": 0.04017389856313636, + "grad_norm": 2.4653360843658447, + "learning_rate": 4.980121082692982e-05, + "loss": 6.3079, + "step": 6755 + }, + { + "epoch": 0.04017984584641736, + "grad_norm": 2.1470963954925537, + "learning_rate": 4.980115203512515e-05, + "loss": 6.063, + "step": 6756 + }, + { + "epoch": 0.040185793129698354, + "grad_norm": 2.3440990447998047, + "learning_rate": 4.9801093234662666e-05, + "loss": 5.818, + "step": 6757 + }, + { + "epoch": 0.04019174041297935, + "grad_norm": 2.120245933532715, + "learning_rate": 4.980103442554237e-05, + "loss": 5.5867, + "step": 6758 + }, + { + "epoch": 0.04019768769626035, + "grad_norm": 3.196829080581665, + "learning_rate": 4.980097560776429e-05, + "loss": 6.0369, + "step": 6759 + }, + { + "epoch": 0.040203634979541346, + "grad_norm": 2.247997522354126, + "learning_rate": 4.9800916781328456e-05, + "loss": 5.8383, + "step": 6760 + }, + { + "epoch": 0.04020958226282234, + "grad_norm": 2.26254940032959, + "learning_rate": 4.9800857946234866e-05, + "loss": 5.8477, + "step": 6761 + }, + { + "epoch": 0.04021552954610334, + "grad_norm": 2.200495958328247, + "learning_rate": 4.9800799102483556e-05, + "loss": 5.681, + "step": 6762 + }, + { + "epoch": 0.04022147682938434, + "grad_norm": 2.136009454727173, + "learning_rate": 4.980074025007454e-05, + "loss": 5.6453, + "step": 6763 + }, + { + "epoch": 0.04022742411266533, + "grad_norm": 2.3510351181030273, + "learning_rate": 4.980068138900785e-05, + "loss": 5.5735, + "step": 6764 + }, + { + "epoch": 0.040233371395946335, + "grad_norm": 2.249199628829956, + "learning_rate": 4.980062251928349e-05, + "loss": 5.9883, + "step": 6765 + }, + { + "epoch": 0.04023931867922733, + "grad_norm": 2.426816463470459, + "learning_rate": 4.9800563640901494e-05, + "loss": 6.1658, + "step": 6766 + }, + { + "epoch": 0.040245265962508325, + "grad_norm": 2.1044836044311523, + "learning_rate": 4.9800504753861874e-05, + "loss": 5.8627, + "step": 6767 + }, + { + "epoch": 0.04025121324578932, + "grad_norm": 1.9563783407211304, + "learning_rate": 4.9800445858164656e-05, + "loss": 5.9642, + "step": 6768 + }, + { + "epoch": 0.04025716052907032, + "grad_norm": 2.3810997009277344, + "learning_rate": 4.980038695380986e-05, + "loss": 5.2938, + "step": 6769 + }, + { + "epoch": 0.04026310781235132, + "grad_norm": 2.3180932998657227, + "learning_rate": 4.98003280407975e-05, + "loss": 5.7682, + "step": 6770 + }, + { + "epoch": 0.04026905509563231, + "grad_norm": 2.420954704284668, + "learning_rate": 4.980026911912761e-05, + "loss": 5.5724, + "step": 6771 + }, + { + "epoch": 0.040275002378913315, + "grad_norm": 2.447460651397705, + "learning_rate": 4.9800210188800193e-05, + "loss": 5.4844, + "step": 6772 + }, + { + "epoch": 0.04028094966219431, + "grad_norm": 2.4059863090515137, + "learning_rate": 4.980015124981529e-05, + "loss": 5.604, + "step": 6773 + }, + { + "epoch": 0.040286896945475305, + "grad_norm": 2.251492977142334, + "learning_rate": 4.9800092302172894e-05, + "loss": 5.4565, + "step": 6774 + }, + { + "epoch": 0.04029284422875631, + "grad_norm": 2.478682279586792, + "learning_rate": 4.980003334587305e-05, + "loss": 5.9416, + "step": 6775 + }, + { + "epoch": 0.0402987915120373, + "grad_norm": 2.2685835361480713, + "learning_rate": 4.9799974380915785e-05, + "loss": 5.9659, + "step": 6776 + }, + { + "epoch": 0.0403047387953183, + "grad_norm": 2.833101987838745, + "learning_rate": 4.979991540730108e-05, + "loss": 5.3406, + "step": 6777 + }, + { + "epoch": 0.0403106860785993, + "grad_norm": 3.0967416763305664, + "learning_rate": 4.9799856425029e-05, + "loss": 5.5848, + "step": 6778 + }, + { + "epoch": 0.040316633361880294, + "grad_norm": 2.3081796169281006, + "learning_rate": 4.9799797434099536e-05, + "loss": 5.5964, + "step": 6779 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 2.359531879425049, + "learning_rate": 4.9799738434512724e-05, + "loss": 5.6614, + "step": 6780 + }, + { + "epoch": 0.040328527928442284, + "grad_norm": 2.1566221714019775, + "learning_rate": 4.979967942626858e-05, + "loss": 6.0517, + "step": 6781 + }, + { + "epoch": 0.040334475211723286, + "grad_norm": 2.3964991569519043, + "learning_rate": 4.979962040936712e-05, + "loss": 5.9516, + "step": 6782 + }, + { + "epoch": 0.04034042249500428, + "grad_norm": 1.9913266897201538, + "learning_rate": 4.9799561383808365e-05, + "loss": 5.9144, + "step": 6783 + }, + { + "epoch": 0.040346369778285276, + "grad_norm": 1.7329169511795044, + "learning_rate": 4.979950234959235e-05, + "loss": 6.0393, + "step": 6784 + }, + { + "epoch": 0.04035231706156628, + "grad_norm": 1.8278034925460815, + "learning_rate": 4.979944330671908e-05, + "loss": 5.9318, + "step": 6785 + }, + { + "epoch": 0.040358264344847274, + "grad_norm": 2.089806318283081, + "learning_rate": 4.979938425518858e-05, + "loss": 5.5726, + "step": 6786 + }, + { + "epoch": 0.04036421162812827, + "grad_norm": 2.03664231300354, + "learning_rate": 4.9799325195000874e-05, + "loss": 5.8265, + "step": 6787 + }, + { + "epoch": 0.04037015891140927, + "grad_norm": 1.8801567554473877, + "learning_rate": 4.979926612615597e-05, + "loss": 5.7575, + "step": 6788 + }, + { + "epoch": 0.040376106194690266, + "grad_norm": 1.814959168434143, + "learning_rate": 4.979920704865391e-05, + "loss": 5.8737, + "step": 6789 + }, + { + "epoch": 0.04038205347797126, + "grad_norm": 1.7018035650253296, + "learning_rate": 4.97991479624947e-05, + "loss": 5.6768, + "step": 6790 + }, + { + "epoch": 0.04038800076125226, + "grad_norm": 2.21545147895813, + "learning_rate": 4.979908886767837e-05, + "loss": 5.4206, + "step": 6791 + }, + { + "epoch": 0.04039394804453326, + "grad_norm": 2.6184499263763428, + "learning_rate": 4.979902976420492e-05, + "loss": 5.0255, + "step": 6792 + }, + { + "epoch": 0.04039989532781425, + "grad_norm": 2.3914453983306885, + "learning_rate": 4.9798970652074396e-05, + "loss": 4.884, + "step": 6793 + }, + { + "epoch": 0.040405842611095255, + "grad_norm": 2.4367334842681885, + "learning_rate": 4.97989115312868e-05, + "loss": 4.7445, + "step": 6794 + }, + { + "epoch": 0.04041178989437625, + "grad_norm": 2.794490337371826, + "learning_rate": 4.9798852401842165e-05, + "loss": 4.9686, + "step": 6795 + }, + { + "epoch": 0.040417737177657245, + "grad_norm": 2.665395736694336, + "learning_rate": 4.979879326374051e-05, + "loss": 4.854, + "step": 6796 + }, + { + "epoch": 0.04042368446093824, + "grad_norm": 2.0832581520080566, + "learning_rate": 4.979873411698184e-05, + "loss": 5.0371, + "step": 6797 + }, + { + "epoch": 0.04042963174421924, + "grad_norm": 2.4604554176330566, + "learning_rate": 4.979867496156619e-05, + "loss": 4.7524, + "step": 6798 + }, + { + "epoch": 0.04043557902750024, + "grad_norm": 2.3760480880737305, + "learning_rate": 4.979861579749359e-05, + "loss": 4.7645, + "step": 6799 + }, + { + "epoch": 0.04044152631078123, + "grad_norm": 2.468043088912964, + "learning_rate": 4.979855662476405e-05, + "loss": 4.7791, + "step": 6800 + }, + { + "epoch": 0.040447473594062235, + "grad_norm": 2.516026258468628, + "learning_rate": 4.979849744337758e-05, + "loss": 4.7978, + "step": 6801 + }, + { + "epoch": 0.04045342087734323, + "grad_norm": 2.1882307529449463, + "learning_rate": 4.979843825333421e-05, + "loss": 5.002, + "step": 6802 + }, + { + "epoch": 0.040459368160624225, + "grad_norm": 2.423140525817871, + "learning_rate": 4.979837905463397e-05, + "loss": 5.0161, + "step": 6803 + }, + { + "epoch": 0.04046531544390523, + "grad_norm": 2.485739231109619, + "learning_rate": 4.979831984727687e-05, + "loss": 4.7613, + "step": 6804 + }, + { + "epoch": 0.04047126272718622, + "grad_norm": 2.267744302749634, + "learning_rate": 4.979826063126293e-05, + "loss": 4.7496, + "step": 6805 + }, + { + "epoch": 0.04047721001046722, + "grad_norm": 2.3172249794006348, + "learning_rate": 4.9798201406592176e-05, + "loss": 4.8153, + "step": 6806 + }, + { + "epoch": 0.04048315729374822, + "grad_norm": 2.309471607208252, + "learning_rate": 4.979814217326463e-05, + "loss": 4.9874, + "step": 6807 + }, + { + "epoch": 0.040489104577029214, + "grad_norm": 1.989372968673706, + "learning_rate": 4.97980829312803e-05, + "loss": 5.1254, + "step": 6808 + }, + { + "epoch": 0.04049505186031021, + "grad_norm": 2.4409830570220947, + "learning_rate": 4.9798023680639216e-05, + "loss": 4.6476, + "step": 6809 + }, + { + "epoch": 0.040500999143591204, + "grad_norm": 2.5192453861236572, + "learning_rate": 4.97979644213414e-05, + "loss": 4.6933, + "step": 6810 + }, + { + "epoch": 0.040506946426872206, + "grad_norm": 2.294718027114868, + "learning_rate": 4.979790515338688e-05, + "loss": 4.8266, + "step": 6811 + }, + { + "epoch": 0.0405128937101532, + "grad_norm": 2.294550657272339, + "learning_rate": 4.979784587677565e-05, + "loss": 4.6691, + "step": 6812 + }, + { + "epoch": 0.040518840993434196, + "grad_norm": 2.332326889038086, + "learning_rate": 4.979778659150776e-05, + "loss": 4.8366, + "step": 6813 + }, + { + "epoch": 0.0405247882767152, + "grad_norm": 2.325439929962158, + "learning_rate": 4.979772729758322e-05, + "loss": 4.8149, + "step": 6814 + }, + { + "epoch": 0.040530735559996194, + "grad_norm": 2.165926456451416, + "learning_rate": 4.979766799500204e-05, + "loss": 4.7309, + "step": 6815 + }, + { + "epoch": 0.04053668284327719, + "grad_norm": 2.3184943199157715, + "learning_rate": 4.9797608683764264e-05, + "loss": 4.7163, + "step": 6816 + }, + { + "epoch": 0.04054263012655819, + "grad_norm": 2.2161147594451904, + "learning_rate": 4.979754936386989e-05, + "loss": 4.5549, + "step": 6817 + }, + { + "epoch": 0.040548577409839186, + "grad_norm": 2.415496587753296, + "learning_rate": 4.979749003531895e-05, + "loss": 4.7676, + "step": 6818 + }, + { + "epoch": 0.04055452469312018, + "grad_norm": 2.1700618267059326, + "learning_rate": 4.979743069811146e-05, + "loss": 4.8448, + "step": 6819 + }, + { + "epoch": 0.04056047197640118, + "grad_norm": 2.4978747367858887, + "learning_rate": 4.9797371352247446e-05, + "loss": 6.363, + "step": 6820 + }, + { + "epoch": 0.04056641925968218, + "grad_norm": 1.9293922185897827, + "learning_rate": 4.979731199772693e-05, + "loss": 5.6502, + "step": 6821 + }, + { + "epoch": 0.04057236654296317, + "grad_norm": 2.5583136081695557, + "learning_rate": 4.9797252634549915e-05, + "loss": 4.874, + "step": 6822 + }, + { + "epoch": 0.040578313826244175, + "grad_norm": 2.263460159301758, + "learning_rate": 4.979719326271645e-05, + "loss": 5.8457, + "step": 6823 + }, + { + "epoch": 0.04058426110952517, + "grad_norm": 2.5630266666412354, + "learning_rate": 4.979713388222653e-05, + "loss": 4.8668, + "step": 6824 + }, + { + "epoch": 0.040590208392806165, + "grad_norm": 2.2965216636657715, + "learning_rate": 4.9797074493080186e-05, + "loss": 5.0049, + "step": 6825 + }, + { + "epoch": 0.04059615567608716, + "grad_norm": 2.222405433654785, + "learning_rate": 4.979701509527745e-05, + "loss": 5.0204, + "step": 6826 + }, + { + "epoch": 0.04060210295936816, + "grad_norm": 2.4425504207611084, + "learning_rate": 4.979695568881833e-05, + "loss": 5.687, + "step": 6827 + }, + { + "epoch": 0.04060805024264916, + "grad_norm": 2.329901933670044, + "learning_rate": 4.979689627370284e-05, + "loss": 5.9447, + "step": 6828 + }, + { + "epoch": 0.04061399752593015, + "grad_norm": 2.3041510581970215, + "learning_rate": 4.9796836849931015e-05, + "loss": 5.9277, + "step": 6829 + }, + { + "epoch": 0.040619944809211155, + "grad_norm": 2.3020026683807373, + "learning_rate": 4.979677741750287e-05, + "loss": 5.9675, + "step": 6830 + }, + { + "epoch": 0.04062589209249215, + "grad_norm": 2.1861371994018555, + "learning_rate": 4.9796717976418426e-05, + "loss": 6.1312, + "step": 6831 + }, + { + "epoch": 0.040631839375773145, + "grad_norm": 1.9544565677642822, + "learning_rate": 4.979665852667771e-05, + "loss": 5.9218, + "step": 6832 + }, + { + "epoch": 0.04063778665905415, + "grad_norm": 2.346431016921997, + "learning_rate": 4.979659906828073e-05, + "loss": 6.1668, + "step": 6833 + }, + { + "epoch": 0.04064373394233514, + "grad_norm": 2.0405263900756836, + "learning_rate": 4.979653960122751e-05, + "loss": 6.0501, + "step": 6834 + }, + { + "epoch": 0.04064968122561614, + "grad_norm": 1.7645004987716675, + "learning_rate": 4.979648012551809e-05, + "loss": 6.0299, + "step": 6835 + }, + { + "epoch": 0.04065562850889714, + "grad_norm": 2.284703016281128, + "learning_rate": 4.979642064115246e-05, + "loss": 5.5501, + "step": 6836 + }, + { + "epoch": 0.040661575792178134, + "grad_norm": 1.7246543169021606, + "learning_rate": 4.979636114813066e-05, + "loss": 5.5733, + "step": 6837 + }, + { + "epoch": 0.04066752307545913, + "grad_norm": 2.0958921909332275, + "learning_rate": 4.9796301646452705e-05, + "loss": 5.8998, + "step": 6838 + }, + { + "epoch": 0.040673470358740124, + "grad_norm": 2.2123169898986816, + "learning_rate": 4.979624213611862e-05, + "loss": 6.0322, + "step": 6839 + }, + { + "epoch": 0.040679417642021126, + "grad_norm": 1.9541656970977783, + "learning_rate": 4.9796182617128426e-05, + "loss": 5.9255, + "step": 6840 + }, + { + "epoch": 0.04068536492530212, + "grad_norm": 2.077601909637451, + "learning_rate": 4.979612308948213e-05, + "loss": 5.6975, + "step": 6841 + }, + { + "epoch": 0.040691312208583116, + "grad_norm": 2.0595803260803223, + "learning_rate": 4.979606355317977e-05, + "loss": 6.0696, + "step": 6842 + }, + { + "epoch": 0.04069725949186412, + "grad_norm": 1.9800641536712646, + "learning_rate": 4.979600400822136e-05, + "loss": 5.7357, + "step": 6843 + }, + { + "epoch": 0.040703206775145113, + "grad_norm": 2.26238751411438, + "learning_rate": 4.979594445460692e-05, + "loss": 5.9119, + "step": 6844 + }, + { + "epoch": 0.04070915405842611, + "grad_norm": 2.0941457748413086, + "learning_rate": 4.979588489233648e-05, + "loss": 5.945, + "step": 6845 + }, + { + "epoch": 0.04071510134170711, + "grad_norm": 2.1995291709899902, + "learning_rate": 4.979582532141005e-05, + "loss": 5.8406, + "step": 6846 + }, + { + "epoch": 0.040721048624988106, + "grad_norm": 2.0138349533081055, + "learning_rate": 4.9795765741827646e-05, + "loss": 5.7984, + "step": 6847 + }, + { + "epoch": 0.0407269959082691, + "grad_norm": 1.9314415454864502, + "learning_rate": 4.9795706153589304e-05, + "loss": 5.8686, + "step": 6848 + }, + { + "epoch": 0.0407329431915501, + "grad_norm": 2.1324212551116943, + "learning_rate": 4.979564655669503e-05, + "loss": 5.8477, + "step": 6849 + }, + { + "epoch": 0.0407388904748311, + "grad_norm": 1.9601761102676392, + "learning_rate": 4.979558695114486e-05, + "loss": 5.9078, + "step": 6850 + }, + { + "epoch": 0.04074483775811209, + "grad_norm": 2.004333734512329, + "learning_rate": 4.97955273369388e-05, + "loss": 5.9852, + "step": 6851 + }, + { + "epoch": 0.040750785041393095, + "grad_norm": 1.9015164375305176, + "learning_rate": 4.979546771407688e-05, + "loss": 5.6286, + "step": 6852 + }, + { + "epoch": 0.04075673232467409, + "grad_norm": 1.9674208164215088, + "learning_rate": 4.979540808255911e-05, + "loss": 5.8715, + "step": 6853 + }, + { + "epoch": 0.040762679607955085, + "grad_norm": 2.0473713874816895, + "learning_rate": 4.9795348442385534e-05, + "loss": 5.7488, + "step": 6854 + }, + { + "epoch": 0.04076862689123608, + "grad_norm": 1.9536950588226318, + "learning_rate": 4.979528879355615e-05, + "loss": 5.6755, + "step": 6855 + }, + { + "epoch": 0.04077457417451708, + "grad_norm": 2.189659595489502, + "learning_rate": 4.979522913607099e-05, + "loss": 5.7934, + "step": 6856 + }, + { + "epoch": 0.04078052145779808, + "grad_norm": 1.999742031097412, + "learning_rate": 4.9795169469930067e-05, + "loss": 5.7341, + "step": 6857 + }, + { + "epoch": 0.04078646874107907, + "grad_norm": 2.1212494373321533, + "learning_rate": 4.9795109795133414e-05, + "loss": 5.8465, + "step": 6858 + }, + { + "epoch": 0.040792416024360074, + "grad_norm": 1.966467261314392, + "learning_rate": 4.979505011168104e-05, + "loss": 5.8699, + "step": 6859 + }, + { + "epoch": 0.04079836330764107, + "grad_norm": 2.290205955505371, + "learning_rate": 4.979499041957297e-05, + "loss": 6.387, + "step": 6860 + }, + { + "epoch": 0.040804310590922065, + "grad_norm": 2.41827130317688, + "learning_rate": 4.979493071880923e-05, + "loss": 6.893, + "step": 6861 + }, + { + "epoch": 0.04081025787420307, + "grad_norm": 2.0652520656585693, + "learning_rate": 4.979487100938983e-05, + "loss": 6.6435, + "step": 6862 + }, + { + "epoch": 0.04081620515748406, + "grad_norm": 1.8594858646392822, + "learning_rate": 4.979481129131479e-05, + "loss": 5.7441, + "step": 6863 + }, + { + "epoch": 0.04082215244076506, + "grad_norm": 2.269240617752075, + "learning_rate": 4.979475156458415e-05, + "loss": 5.8468, + "step": 6864 + }, + { + "epoch": 0.04082809972404606, + "grad_norm": 2.2355518341064453, + "learning_rate": 4.979469182919792e-05, + "loss": 5.8717, + "step": 6865 + }, + { + "epoch": 0.040834047007327054, + "grad_norm": 1.9578050374984741, + "learning_rate": 4.9794632085156105e-05, + "loss": 5.6777, + "step": 6866 + }, + { + "epoch": 0.04083999429060805, + "grad_norm": 2.354609727859497, + "learning_rate": 4.979457233245875e-05, + "loss": 5.7993, + "step": 6867 + }, + { + "epoch": 0.040845941573889044, + "grad_norm": 1.978289008140564, + "learning_rate": 4.9794512571105865e-05, + "loss": 5.7429, + "step": 6868 + }, + { + "epoch": 0.040851888857170046, + "grad_norm": 1.9695252180099487, + "learning_rate": 4.979445280109747e-05, + "loss": 6.1322, + "step": 6869 + }, + { + "epoch": 0.04085783614045104, + "grad_norm": 2.172510862350464, + "learning_rate": 4.9794393022433586e-05, + "loss": 5.9443, + "step": 6870 + }, + { + "epoch": 0.040863783423732036, + "grad_norm": 2.1992416381835938, + "learning_rate": 4.9794333235114244e-05, + "loss": 6.4094, + "step": 6871 + }, + { + "epoch": 0.04086973070701304, + "grad_norm": 2.1804773807525635, + "learning_rate": 4.979427343913945e-05, + "loss": 6.3871, + "step": 6872 + }, + { + "epoch": 0.04087567799029403, + "grad_norm": 2.2877554893493652, + "learning_rate": 4.979421363450923e-05, + "loss": 6.2509, + "step": 6873 + }, + { + "epoch": 0.04088162527357503, + "grad_norm": 2.0697927474975586, + "learning_rate": 4.979415382122361e-05, + "loss": 5.9008, + "step": 6874 + }, + { + "epoch": 0.04088757255685603, + "grad_norm": 2.2907917499542236, + "learning_rate": 4.97940939992826e-05, + "loss": 5.6137, + "step": 6875 + }, + { + "epoch": 0.040893519840137026, + "grad_norm": 1.9960983991622925, + "learning_rate": 4.979403416868623e-05, + "loss": 5.7283, + "step": 6876 + }, + { + "epoch": 0.04089946712341802, + "grad_norm": 2.2767558097839355, + "learning_rate": 4.9793974329434525e-05, + "loss": 5.3632, + "step": 6877 + }, + { + "epoch": 0.04090541440669902, + "grad_norm": 2.295635461807251, + "learning_rate": 4.97939144815275e-05, + "loss": 5.4524, + "step": 6878 + }, + { + "epoch": 0.04091136168998002, + "grad_norm": 2.247194766998291, + "learning_rate": 4.9793854624965166e-05, + "loss": 5.7846, + "step": 6879 + }, + { + "epoch": 0.04091730897326101, + "grad_norm": 2.2641420364379883, + "learning_rate": 4.9793794759747565e-05, + "loss": 5.7479, + "step": 6880 + }, + { + "epoch": 0.040923256256542015, + "grad_norm": 2.002126455307007, + "learning_rate": 4.97937348858747e-05, + "loss": 5.2694, + "step": 6881 + }, + { + "epoch": 0.04092920353982301, + "grad_norm": 2.079157590866089, + "learning_rate": 4.9793675003346596e-05, + "loss": 6.2711, + "step": 6882 + }, + { + "epoch": 0.040935150823104005, + "grad_norm": 1.9030524492263794, + "learning_rate": 4.979361511216328e-05, + "loss": 5.7259, + "step": 6883 + }, + { + "epoch": 0.040941098106385, + "grad_norm": 1.9157373905181885, + "learning_rate": 4.9793555212324774e-05, + "loss": 6.086, + "step": 6884 + }, + { + "epoch": 0.040947045389666, + "grad_norm": 1.8622015714645386, + "learning_rate": 4.979349530383108e-05, + "loss": 6.1318, + "step": 6885 + }, + { + "epoch": 0.040952992672947, + "grad_norm": 2.3341257572174072, + "learning_rate": 4.9793435386682256e-05, + "loss": 5.9421, + "step": 6886 + }, + { + "epoch": 0.04095893995622799, + "grad_norm": 2.6894209384918213, + "learning_rate": 4.979337546087828e-05, + "loss": 5.5351, + "step": 6887 + }, + { + "epoch": 0.040964887239508994, + "grad_norm": 2.5316739082336426, + "learning_rate": 4.979331552641919e-05, + "loss": 5.5056, + "step": 6888 + }, + { + "epoch": 0.04097083452278999, + "grad_norm": 2.5129077434539795, + "learning_rate": 4.979325558330502e-05, + "loss": 5.3091, + "step": 6889 + }, + { + "epoch": 0.040976781806070985, + "grad_norm": 2.275536298751831, + "learning_rate": 4.979319563153578e-05, + "loss": 5.494, + "step": 6890 + }, + { + "epoch": 0.04098272908935199, + "grad_norm": 2.749375104904175, + "learning_rate": 4.9793135671111494e-05, + "loss": 6.0139, + "step": 6891 + }, + { + "epoch": 0.04098867637263298, + "grad_norm": 2.419163227081299, + "learning_rate": 4.9793075702032177e-05, + "loss": 6.1102, + "step": 6892 + }, + { + "epoch": 0.04099462365591398, + "grad_norm": 2.311450958251953, + "learning_rate": 4.9793015724297856e-05, + "loss": 5.9798, + "step": 6893 + }, + { + "epoch": 0.04100057093919498, + "grad_norm": 2.0522212982177734, + "learning_rate": 4.979295573790854e-05, + "loss": 5.9247, + "step": 6894 + }, + { + "epoch": 0.041006518222475974, + "grad_norm": 2.1928513050079346, + "learning_rate": 4.979289574286427e-05, + "loss": 5.8001, + "step": 6895 + }, + { + "epoch": 0.04101246550575697, + "grad_norm": 2.1945207118988037, + "learning_rate": 4.979283573916505e-05, + "loss": 5.9975, + "step": 6896 + }, + { + "epoch": 0.041018412789037964, + "grad_norm": 2.274843454360962, + "learning_rate": 4.979277572681091e-05, + "loss": 5.693, + "step": 6897 + }, + { + "epoch": 0.041024360072318966, + "grad_norm": 2.2715282440185547, + "learning_rate": 4.979271570580186e-05, + "loss": 5.9952, + "step": 6898 + }, + { + "epoch": 0.04103030735559996, + "grad_norm": 2.4459903240203857, + "learning_rate": 4.9792655676137943e-05, + "loss": 6.0305, + "step": 6899 + }, + { + "epoch": 0.041036254638880956, + "grad_norm": 2.8737339973449707, + "learning_rate": 4.9792595637819165e-05, + "loss": 6.0982, + "step": 6900 + }, + { + "epoch": 0.04104220192216196, + "grad_norm": 2.382143974304199, + "learning_rate": 4.979253559084553e-05, + "loss": 5.6122, + "step": 6901 + }, + { + "epoch": 0.04104814920544295, + "grad_norm": 2.4127237796783447, + "learning_rate": 4.97924755352171e-05, + "loss": 5.7723, + "step": 6902 + }, + { + "epoch": 0.04105409648872395, + "grad_norm": 2.3108956813812256, + "learning_rate": 4.979241547093386e-05, + "loss": 6.1655, + "step": 6903 + }, + { + "epoch": 0.04106004377200495, + "grad_norm": 2.250555992126465, + "learning_rate": 4.979235539799584e-05, + "loss": 6.0627, + "step": 6904 + }, + { + "epoch": 0.041065991055285946, + "grad_norm": 2.187957525253296, + "learning_rate": 4.979229531640307e-05, + "loss": 6.1438, + "step": 6905 + }, + { + "epoch": 0.04107193833856694, + "grad_norm": 1.9089539051055908, + "learning_rate": 4.979223522615557e-05, + "loss": 6.1431, + "step": 6906 + }, + { + "epoch": 0.04107788562184794, + "grad_norm": 2.343569040298462, + "learning_rate": 4.979217512725336e-05, + "loss": 5.9774, + "step": 6907 + }, + { + "epoch": 0.04108383290512894, + "grad_norm": 2.759631633758545, + "learning_rate": 4.979211501969645e-05, + "loss": 5.7982, + "step": 6908 + }, + { + "epoch": 0.04108978018840993, + "grad_norm": 2.295811414718628, + "learning_rate": 4.979205490348487e-05, + "loss": 6.0843, + "step": 6909 + }, + { + "epoch": 0.041095727471690935, + "grad_norm": 2.6259605884552, + "learning_rate": 4.979199477861864e-05, + "loss": 5.6498, + "step": 6910 + }, + { + "epoch": 0.04110167475497193, + "grad_norm": 2.396895408630371, + "learning_rate": 4.9791934645097785e-05, + "loss": 5.9936, + "step": 6911 + }, + { + "epoch": 0.041107622038252925, + "grad_norm": 2.020845651626587, + "learning_rate": 4.979187450292231e-05, + "loss": 5.4867, + "step": 6912 + }, + { + "epoch": 0.04111356932153392, + "grad_norm": 2.6473753452301025, + "learning_rate": 4.979181435209226e-05, + "loss": 5.3556, + "step": 6913 + }, + { + "epoch": 0.04111951660481492, + "grad_norm": 2.353158712387085, + "learning_rate": 4.9791754192607636e-05, + "loss": 6.3122, + "step": 6914 + }, + { + "epoch": 0.04112546388809592, + "grad_norm": 2.499817132949829, + "learning_rate": 4.9791694024468474e-05, + "loss": 5.816, + "step": 6915 + }, + { + "epoch": 0.04113141117137691, + "grad_norm": 2.009239673614502, + "learning_rate": 4.979163384767478e-05, + "loss": 5.5982, + "step": 6916 + }, + { + "epoch": 0.041137358454657914, + "grad_norm": 2.3885819911956787, + "learning_rate": 4.9791573662226586e-05, + "loss": 5.7403, + "step": 6917 + }, + { + "epoch": 0.04114330573793891, + "grad_norm": 2.3135135173797607, + "learning_rate": 4.979151346812391e-05, + "loss": 5.3151, + "step": 6918 + }, + { + "epoch": 0.041149253021219905, + "grad_norm": 1.9801241159439087, + "learning_rate": 4.979145326536677e-05, + "loss": 5.5148, + "step": 6919 + }, + { + "epoch": 0.04115520030450091, + "grad_norm": 2.0724904537200928, + "learning_rate": 4.979139305395519e-05, + "loss": 5.5355, + "step": 6920 + }, + { + "epoch": 0.0411611475877819, + "grad_norm": 1.8104170560836792, + "learning_rate": 4.97913328338892e-05, + "loss": 5.4861, + "step": 6921 + }, + { + "epoch": 0.0411670948710629, + "grad_norm": 1.81072998046875, + "learning_rate": 4.9791272605168804e-05, + "loss": 5.5075, + "step": 6922 + }, + { + "epoch": 0.0411730421543439, + "grad_norm": 1.709191083908081, + "learning_rate": 4.979121236779403e-05, + "loss": 6.1353, + "step": 6923 + }, + { + "epoch": 0.041178989437624894, + "grad_norm": 2.004974126815796, + "learning_rate": 4.9791152121764903e-05, + "loss": 5.478, + "step": 6924 + }, + { + "epoch": 0.04118493672090589, + "grad_norm": 1.937933325767517, + "learning_rate": 4.979109186708144e-05, + "loss": 5.4022, + "step": 6925 + }, + { + "epoch": 0.041190884004186884, + "grad_norm": 1.9453305006027222, + "learning_rate": 4.979103160374367e-05, + "loss": 5.243, + "step": 6926 + }, + { + "epoch": 0.041196831287467886, + "grad_norm": 1.8552072048187256, + "learning_rate": 4.979097133175159e-05, + "loss": 5.3104, + "step": 6927 + }, + { + "epoch": 0.04120277857074888, + "grad_norm": 1.9148203134536743, + "learning_rate": 4.9790911051105246e-05, + "loss": 5.5538, + "step": 6928 + }, + { + "epoch": 0.041208725854029876, + "grad_norm": 1.9658032655715942, + "learning_rate": 4.979085076180466e-05, + "loss": 5.5285, + "step": 6929 + }, + { + "epoch": 0.04121467313731088, + "grad_norm": 1.7332781553268433, + "learning_rate": 4.9790790463849835e-05, + "loss": 5.1959, + "step": 6930 + }, + { + "epoch": 0.04122062042059187, + "grad_norm": 1.5762557983398438, + "learning_rate": 4.9790730157240804e-05, + "loss": 5.3672, + "step": 6931 + }, + { + "epoch": 0.04122656770387287, + "grad_norm": 1.7899656295776367, + "learning_rate": 4.979066984197759e-05, + "loss": 5.3588, + "step": 6932 + }, + { + "epoch": 0.04123251498715387, + "grad_norm": 1.5992622375488281, + "learning_rate": 4.97906095180602e-05, + "loss": 5.275, + "step": 6933 + }, + { + "epoch": 0.041238462270434866, + "grad_norm": 1.875116229057312, + "learning_rate": 4.9790549185488666e-05, + "loss": 5.3428, + "step": 6934 + }, + { + "epoch": 0.04124440955371586, + "grad_norm": 1.8110510110855103, + "learning_rate": 4.979048884426301e-05, + "loss": 5.2416, + "step": 6935 + }, + { + "epoch": 0.04125035683699686, + "grad_norm": 1.5512267351150513, + "learning_rate": 4.979042849438325e-05, + "loss": 5.3643, + "step": 6936 + }, + { + "epoch": 0.04125630412027786, + "grad_norm": 1.8929630517959595, + "learning_rate": 4.979036813584941e-05, + "loss": 5.4232, + "step": 6937 + }, + { + "epoch": 0.04126225140355885, + "grad_norm": 1.8569291830062866, + "learning_rate": 4.9790307768661504e-05, + "loss": 5.2949, + "step": 6938 + }, + { + "epoch": 0.041268198686839855, + "grad_norm": 1.6058611869812012, + "learning_rate": 4.9790247392819564e-05, + "loss": 5.3736, + "step": 6939 + }, + { + "epoch": 0.04127414597012085, + "grad_norm": 1.8455227613449097, + "learning_rate": 4.97901870083236e-05, + "loss": 5.2768, + "step": 6940 + }, + { + "epoch": 0.041280093253401845, + "grad_norm": 1.9346935749053955, + "learning_rate": 4.979012661517364e-05, + "loss": 5.4316, + "step": 6941 + }, + { + "epoch": 0.04128604053668284, + "grad_norm": 1.8085594177246094, + "learning_rate": 4.97900662133697e-05, + "loss": 5.365, + "step": 6942 + }, + { + "epoch": 0.04129198781996384, + "grad_norm": 1.73456871509552, + "learning_rate": 4.9790005802911804e-05, + "loss": 5.2726, + "step": 6943 + }, + { + "epoch": 0.04129793510324484, + "grad_norm": 2.1071617603302, + "learning_rate": 4.978994538379997e-05, + "loss": 6.2313, + "step": 6944 + }, + { + "epoch": 0.04130388238652583, + "grad_norm": 1.7098963260650635, + "learning_rate": 4.978988495603423e-05, + "loss": 5.3162, + "step": 6945 + }, + { + "epoch": 0.041309829669806834, + "grad_norm": 1.8131905794143677, + "learning_rate": 4.978982451961459e-05, + "loss": 5.2486, + "step": 6946 + }, + { + "epoch": 0.04131577695308783, + "grad_norm": 1.8162381649017334, + "learning_rate": 4.978976407454109e-05, + "loss": 5.2806, + "step": 6947 + }, + { + "epoch": 0.041321724236368824, + "grad_norm": 1.9250297546386719, + "learning_rate": 4.9789703620813734e-05, + "loss": 5.1742, + "step": 6948 + }, + { + "epoch": 0.041327671519649826, + "grad_norm": 1.8263678550720215, + "learning_rate": 4.978964315843254e-05, + "loss": 5.1786, + "step": 6949 + }, + { + "epoch": 0.04133361880293082, + "grad_norm": 1.6751807928085327, + "learning_rate": 4.9789582687397546e-05, + "loss": 5.4798, + "step": 6950 + }, + { + "epoch": 0.04133956608621182, + "grad_norm": 1.7842947244644165, + "learning_rate": 4.9789522207708764e-05, + "loss": 5.201, + "step": 6951 + }, + { + "epoch": 0.04134551336949282, + "grad_norm": 1.6785067319869995, + "learning_rate": 4.978946171936621e-05, + "loss": 5.3852, + "step": 6952 + }, + { + "epoch": 0.041351460652773814, + "grad_norm": 1.5475291013717651, + "learning_rate": 4.978940122236992e-05, + "loss": 5.4083, + "step": 6953 + }, + { + "epoch": 0.04135740793605481, + "grad_norm": 1.7445106506347656, + "learning_rate": 4.97893407167199e-05, + "loss": 5.3125, + "step": 6954 + }, + { + "epoch": 0.041363355219335804, + "grad_norm": 1.7334082126617432, + "learning_rate": 4.9789280202416175e-05, + "loss": 5.5388, + "step": 6955 + }, + { + "epoch": 0.041369302502616806, + "grad_norm": 1.7267119884490967, + "learning_rate": 4.9789219679458774e-05, + "loss": 5.5175, + "step": 6956 + }, + { + "epoch": 0.0413752497858978, + "grad_norm": 1.8033246994018555, + "learning_rate": 4.978915914784771e-05, + "loss": 5.3523, + "step": 6957 + }, + { + "epoch": 0.041381197069178796, + "grad_norm": 1.9836528301239014, + "learning_rate": 4.978909860758301e-05, + "loss": 5.3808, + "step": 6958 + }, + { + "epoch": 0.0413871443524598, + "grad_norm": 1.6260416507720947, + "learning_rate": 4.978903805866469e-05, + "loss": 5.4642, + "step": 6959 + }, + { + "epoch": 0.04139309163574079, + "grad_norm": 1.7260626554489136, + "learning_rate": 4.978897750109277e-05, + "loss": 5.4975, + "step": 6960 + }, + { + "epoch": 0.04139903891902179, + "grad_norm": 1.6948668956756592, + "learning_rate": 4.978891693486728e-05, + "loss": 5.5768, + "step": 6961 + }, + { + "epoch": 0.04140498620230279, + "grad_norm": 1.7885476350784302, + "learning_rate": 4.978885635998824e-05, + "loss": 5.4156, + "step": 6962 + }, + { + "epoch": 0.041410933485583785, + "grad_norm": 1.8626813888549805, + "learning_rate": 4.978879577645565e-05, + "loss": 5.354, + "step": 6963 + }, + { + "epoch": 0.04141688076886478, + "grad_norm": 1.867090106010437, + "learning_rate": 4.9788735184269553e-05, + "loss": 5.2934, + "step": 6964 + }, + { + "epoch": 0.04142282805214578, + "grad_norm": 1.7208340167999268, + "learning_rate": 4.9788674583429974e-05, + "loss": 5.2116, + "step": 6965 + }, + { + "epoch": 0.04142877533542678, + "grad_norm": 1.934480905532837, + "learning_rate": 4.9788613973936916e-05, + "loss": 5.5801, + "step": 6966 + }, + { + "epoch": 0.04143472261870777, + "grad_norm": 1.6263724565505981, + "learning_rate": 4.978855335579041e-05, + "loss": 5.3835, + "step": 6967 + }, + { + "epoch": 0.041440669901988775, + "grad_norm": 1.743996262550354, + "learning_rate": 4.9788492728990474e-05, + "loss": 5.3281, + "step": 6968 + }, + { + "epoch": 0.04144661718526977, + "grad_norm": 1.5556843280792236, + "learning_rate": 4.978843209353714e-05, + "loss": 5.442, + "step": 6969 + }, + { + "epoch": 0.041452564468550765, + "grad_norm": 1.5540435314178467, + "learning_rate": 4.978837144943041e-05, + "loss": 5.3621, + "step": 6970 + }, + { + "epoch": 0.04145851175183176, + "grad_norm": 1.7884414196014404, + "learning_rate": 4.9788310796670326e-05, + "loss": 5.571, + "step": 6971 + }, + { + "epoch": 0.04146445903511276, + "grad_norm": 1.7550957202911377, + "learning_rate": 4.9788250135256886e-05, + "loss": 5.61, + "step": 6972 + }, + { + "epoch": 0.04147040631839376, + "grad_norm": 1.9336804151535034, + "learning_rate": 4.978818946519013e-05, + "loss": 5.6142, + "step": 6973 + }, + { + "epoch": 0.04147635360167475, + "grad_norm": 1.8888505697250366, + "learning_rate": 4.978812878647008e-05, + "loss": 5.4908, + "step": 6974 + }, + { + "epoch": 0.041482300884955754, + "grad_norm": 1.940371036529541, + "learning_rate": 4.978806809909674e-05, + "loss": 5.5407, + "step": 6975 + }, + { + "epoch": 0.04148824816823675, + "grad_norm": 2.0182151794433594, + "learning_rate": 4.9788007403070146e-05, + "loss": 5.3643, + "step": 6976 + }, + { + "epoch": 0.041494195451517744, + "grad_norm": 1.7960541248321533, + "learning_rate": 4.978794669839032e-05, + "loss": 5.4994, + "step": 6977 + }, + { + "epoch": 0.041500142734798746, + "grad_norm": 1.8403207063674927, + "learning_rate": 4.978788598505727e-05, + "loss": 5.4501, + "step": 6978 + }, + { + "epoch": 0.04150609001807974, + "grad_norm": 1.7232698202133179, + "learning_rate": 4.978782526307103e-05, + "loss": 5.5406, + "step": 6979 + }, + { + "epoch": 0.04151203730136074, + "grad_norm": 1.7003169059753418, + "learning_rate": 4.9787764532431615e-05, + "loss": 5.3427, + "step": 6980 + }, + { + "epoch": 0.04151798458464174, + "grad_norm": 2.041384696960449, + "learning_rate": 4.978770379313904e-05, + "loss": 5.5121, + "step": 6981 + }, + { + "epoch": 0.041523931867922734, + "grad_norm": 1.5773900747299194, + "learning_rate": 4.978764304519334e-05, + "loss": 5.4604, + "step": 6982 + }, + { + "epoch": 0.04152987915120373, + "grad_norm": 1.8834172487258911, + "learning_rate": 4.9787582288594535e-05, + "loss": 5.5141, + "step": 6983 + }, + { + "epoch": 0.04153582643448473, + "grad_norm": 1.7956576347351074, + "learning_rate": 4.978752152334264e-05, + "loss": 5.5664, + "step": 6984 + }, + { + "epoch": 0.041541773717765726, + "grad_norm": 1.8676495552062988, + "learning_rate": 4.978746074943767e-05, + "loss": 5.2846, + "step": 6985 + }, + { + "epoch": 0.04154772100104672, + "grad_norm": 1.7709665298461914, + "learning_rate": 4.9787399966879654e-05, + "loss": 5.3375, + "step": 6986 + }, + { + "epoch": 0.041553668284327716, + "grad_norm": 2.012941837310791, + "learning_rate": 4.978733917566862e-05, + "loss": 5.6973, + "step": 6987 + }, + { + "epoch": 0.04155961556760872, + "grad_norm": 1.8220570087432861, + "learning_rate": 4.978727837580458e-05, + "loss": 5.191, + "step": 6988 + }, + { + "epoch": 0.04156556285088971, + "grad_norm": 1.6511586904525757, + "learning_rate": 4.978721756728755e-05, + "loss": 5.2787, + "step": 6989 + }, + { + "epoch": 0.04157151013417071, + "grad_norm": 1.9026141166687012, + "learning_rate": 4.978715675011757e-05, + "loss": 5.4456, + "step": 6990 + }, + { + "epoch": 0.04157745741745171, + "grad_norm": 1.8649898767471313, + "learning_rate": 4.9787095924294633e-05, + "loss": 5.5013, + "step": 6991 + }, + { + "epoch": 0.041583404700732705, + "grad_norm": 1.8720741271972656, + "learning_rate": 4.978703508981879e-05, + "loss": 5.3952, + "step": 6992 + }, + { + "epoch": 0.0415893519840137, + "grad_norm": 1.817356824874878, + "learning_rate": 4.978697424669005e-05, + "loss": 5.4719, + "step": 6993 + }, + { + "epoch": 0.0415952992672947, + "grad_norm": 1.740702509880066, + "learning_rate": 4.978691339490843e-05, + "loss": 5.6484, + "step": 6994 + }, + { + "epoch": 0.0416012465505757, + "grad_norm": 1.8752427101135254, + "learning_rate": 4.978685253447395e-05, + "loss": 5.6394, + "step": 6995 + }, + { + "epoch": 0.04160719383385669, + "grad_norm": 1.8180509805679321, + "learning_rate": 4.978679166538665e-05, + "loss": 5.3401, + "step": 6996 + }, + { + "epoch": 0.041613141117137695, + "grad_norm": 1.9002251625061035, + "learning_rate": 4.9786730787646516e-05, + "loss": 5.3237, + "step": 6997 + }, + { + "epoch": 0.04161908840041869, + "grad_norm": 1.741176724433899, + "learning_rate": 4.978666990125361e-05, + "loss": 5.2311, + "step": 6998 + }, + { + "epoch": 0.041625035683699685, + "grad_norm": 2.0994246006011963, + "learning_rate": 4.9786609006207925e-05, + "loss": 5.3549, + "step": 6999 + }, + { + "epoch": 0.04163098296698068, + "grad_norm": 1.8438987731933594, + "learning_rate": 4.978654810250949e-05, + "loss": 5.4322, + "step": 7000 + }, + { + "epoch": 0.04163693025026168, + "grad_norm": 1.7411181926727295, + "learning_rate": 4.978648719015833e-05, + "loss": 5.455, + "step": 7001 + }, + { + "epoch": 0.04164287753354268, + "grad_norm": 1.6879174709320068, + "learning_rate": 4.978642626915446e-05, + "loss": 5.3676, + "step": 7002 + }, + { + "epoch": 0.04164882481682367, + "grad_norm": 1.8912461996078491, + "learning_rate": 4.9786365339497906e-05, + "loss": 5.6181, + "step": 7003 + }, + { + "epoch": 0.041654772100104674, + "grad_norm": 1.9234617948532104, + "learning_rate": 4.978630440118869e-05, + "loss": 5.5388, + "step": 7004 + }, + { + "epoch": 0.04166071938338567, + "grad_norm": 2.1059048175811768, + "learning_rate": 4.9786243454226824e-05, + "loss": 5.6856, + "step": 7005 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 2.1900687217712402, + "learning_rate": 4.9786182498612347e-05, + "loss": 6.2426, + "step": 7006 + }, + { + "epoch": 0.041672613949947666, + "grad_norm": 1.7580265998840332, + "learning_rate": 4.9786121534345265e-05, + "loss": 5.2342, + "step": 7007 + }, + { + "epoch": 0.04167856123322866, + "grad_norm": 1.4747200012207031, + "learning_rate": 4.97860605614256e-05, + "loss": 5.1977, + "step": 7008 + }, + { + "epoch": 0.04168450851650966, + "grad_norm": 1.8164165019989014, + "learning_rate": 4.978599957985338e-05, + "loss": 5.1362, + "step": 7009 + }, + { + "epoch": 0.04169045579979066, + "grad_norm": 1.468550443649292, + "learning_rate": 4.978593858962863e-05, + "loss": 5.1265, + "step": 7010 + }, + { + "epoch": 0.041696403083071654, + "grad_norm": 1.584343433380127, + "learning_rate": 4.9785877590751356e-05, + "loss": 5.2611, + "step": 7011 + }, + { + "epoch": 0.04170235036635265, + "grad_norm": 1.7864785194396973, + "learning_rate": 4.978581658322159e-05, + "loss": 5.5214, + "step": 7012 + }, + { + "epoch": 0.04170829764963365, + "grad_norm": 1.8359016180038452, + "learning_rate": 4.978575556703936e-05, + "loss": 5.3808, + "step": 7013 + }, + { + "epoch": 0.041714244932914646, + "grad_norm": 1.8298325538635254, + "learning_rate": 4.978569454220467e-05, + "loss": 5.5606, + "step": 7014 + }, + { + "epoch": 0.04172019221619564, + "grad_norm": 2.1555540561676025, + "learning_rate": 4.978563350871755e-05, + "loss": 5.6592, + "step": 7015 + }, + { + "epoch": 0.041726139499476636, + "grad_norm": 2.5251846313476562, + "learning_rate": 4.9785572466578026e-05, + "loss": 5.5771, + "step": 7016 + }, + { + "epoch": 0.04173208678275764, + "grad_norm": 1.7765661478042603, + "learning_rate": 4.9785511415786115e-05, + "loss": 5.5558, + "step": 7017 + }, + { + "epoch": 0.04173803406603863, + "grad_norm": 1.9711554050445557, + "learning_rate": 4.978545035634183e-05, + "loss": 5.5565, + "step": 7018 + }, + { + "epoch": 0.04174398134931963, + "grad_norm": 1.8080202341079712, + "learning_rate": 4.978538928824521e-05, + "loss": 5.5037, + "step": 7019 + }, + { + "epoch": 0.04174992863260063, + "grad_norm": 1.7506872415542603, + "learning_rate": 4.978532821149626e-05, + "loss": 5.3362, + "step": 7020 + }, + { + "epoch": 0.041755875915881625, + "grad_norm": 1.5606149435043335, + "learning_rate": 4.978526712609501e-05, + "loss": 5.3541, + "step": 7021 + }, + { + "epoch": 0.04176182319916262, + "grad_norm": 1.8840737342834473, + "learning_rate": 4.9785206032041476e-05, + "loss": 5.2315, + "step": 7022 + }, + { + "epoch": 0.04176777048244362, + "grad_norm": 2.118178606033325, + "learning_rate": 4.978514492933569e-05, + "loss": 5.6174, + "step": 7023 + }, + { + "epoch": 0.04177371776572462, + "grad_norm": 2.043907403945923, + "learning_rate": 4.978508381797766e-05, + "loss": 5.6272, + "step": 7024 + }, + { + "epoch": 0.04177966504900561, + "grad_norm": 1.764411211013794, + "learning_rate": 4.978502269796742e-05, + "loss": 5.6153, + "step": 7025 + }, + { + "epoch": 0.041785612332286615, + "grad_norm": 1.5760626792907715, + "learning_rate": 4.978496156930498e-05, + "loss": 5.5734, + "step": 7026 + }, + { + "epoch": 0.04179155961556761, + "grad_norm": 1.8857802152633667, + "learning_rate": 4.9784900431990366e-05, + "loss": 5.5295, + "step": 7027 + }, + { + "epoch": 0.041797506898848605, + "grad_norm": 1.7287275791168213, + "learning_rate": 4.97848392860236e-05, + "loss": 5.3175, + "step": 7028 + }, + { + "epoch": 0.0418034541821296, + "grad_norm": 1.915263295173645, + "learning_rate": 4.97847781314047e-05, + "loss": 5.4838, + "step": 7029 + }, + { + "epoch": 0.0418094014654106, + "grad_norm": 2.049435615539551, + "learning_rate": 4.97847169681337e-05, + "loss": 5.5508, + "step": 7030 + }, + { + "epoch": 0.0418153487486916, + "grad_norm": 1.8955415487289429, + "learning_rate": 4.97846557962106e-05, + "loss": 5.4618, + "step": 7031 + }, + { + "epoch": 0.04182129603197259, + "grad_norm": 1.8957183361053467, + "learning_rate": 4.978459461563543e-05, + "loss": 5.5293, + "step": 7032 + }, + { + "epoch": 0.041827243315253594, + "grad_norm": 2.050734043121338, + "learning_rate": 4.978453342640822e-05, + "loss": 5.8002, + "step": 7033 + }, + { + "epoch": 0.04183319059853459, + "grad_norm": 1.9867476224899292, + "learning_rate": 4.978447222852899e-05, + "loss": 5.466, + "step": 7034 + }, + { + "epoch": 0.041839137881815584, + "grad_norm": 1.7928507328033447, + "learning_rate": 4.978441102199775e-05, + "loss": 5.3312, + "step": 7035 + }, + { + "epoch": 0.041845085165096586, + "grad_norm": 1.7984018325805664, + "learning_rate": 4.978434980681453e-05, + "loss": 5.2936, + "step": 7036 + }, + { + "epoch": 0.04185103244837758, + "grad_norm": 1.8011672496795654, + "learning_rate": 4.9784288582979355e-05, + "loss": 5.484, + "step": 7037 + }, + { + "epoch": 0.041856979731658576, + "grad_norm": 1.9439928531646729, + "learning_rate": 4.9784227350492236e-05, + "loss": 5.4563, + "step": 7038 + }, + { + "epoch": 0.04186292701493958, + "grad_norm": 1.71321439743042, + "learning_rate": 4.97841661093532e-05, + "loss": 5.3909, + "step": 7039 + }, + { + "epoch": 0.041868874298220574, + "grad_norm": 1.629333734512329, + "learning_rate": 4.9784104859562266e-05, + "loss": 5.3112, + "step": 7040 + }, + { + "epoch": 0.04187482158150157, + "grad_norm": 1.5248417854309082, + "learning_rate": 4.9784043601119456e-05, + "loss": 5.3724, + "step": 7041 + }, + { + "epoch": 0.04188076886478257, + "grad_norm": 1.8886220455169678, + "learning_rate": 4.97839823340248e-05, + "loss": 5.443, + "step": 7042 + }, + { + "epoch": 0.041886716148063566, + "grad_norm": 1.5902595520019531, + "learning_rate": 4.9783921058278307e-05, + "loss": 5.4249, + "step": 7043 + }, + { + "epoch": 0.04189266343134456, + "grad_norm": 1.837579369544983, + "learning_rate": 4.978385977388e-05, + "loss": 5.3767, + "step": 7044 + }, + { + "epoch": 0.041898610714625556, + "grad_norm": 1.8306061029434204, + "learning_rate": 4.9783798480829905e-05, + "loss": 5.4206, + "step": 7045 + }, + { + "epoch": 0.04190455799790656, + "grad_norm": 1.6887965202331543, + "learning_rate": 4.9783737179128044e-05, + "loss": 5.5327, + "step": 7046 + }, + { + "epoch": 0.04191050528118755, + "grad_norm": 1.8081728219985962, + "learning_rate": 4.978367586877444e-05, + "loss": 5.4547, + "step": 7047 + }, + { + "epoch": 0.04191645256446855, + "grad_norm": 1.8341114521026611, + "learning_rate": 4.97836145497691e-05, + "loss": 5.4175, + "step": 7048 + }, + { + "epoch": 0.04192239984774955, + "grad_norm": 1.965240240097046, + "learning_rate": 4.978355322211207e-05, + "loss": 5.4253, + "step": 7049 + }, + { + "epoch": 0.041928347131030545, + "grad_norm": 1.7060484886169434, + "learning_rate": 4.9783491885803343e-05, + "loss": 5.3493, + "step": 7050 + }, + { + "epoch": 0.04193429441431154, + "grad_norm": 1.8203076124191284, + "learning_rate": 4.978343054084297e-05, + "loss": 5.4601, + "step": 7051 + }, + { + "epoch": 0.04194024169759254, + "grad_norm": 1.919954538345337, + "learning_rate": 4.9783369187230945e-05, + "loss": 5.4921, + "step": 7052 + }, + { + "epoch": 0.04194618898087354, + "grad_norm": 1.4519730806350708, + "learning_rate": 4.9783307824967306e-05, + "loss": 5.4922, + "step": 7053 + }, + { + "epoch": 0.04195213626415453, + "grad_norm": 1.8431898355484009, + "learning_rate": 4.9783246454052066e-05, + "loss": 5.384, + "step": 7054 + }, + { + "epoch": 0.041958083547435535, + "grad_norm": 1.5493370294570923, + "learning_rate": 4.978318507448526e-05, + "loss": 5.5294, + "step": 7055 + }, + { + "epoch": 0.04196403083071653, + "grad_norm": 1.6405844688415527, + "learning_rate": 4.97831236862669e-05, + "loss": 5.492, + "step": 7056 + }, + { + "epoch": 0.041969978113997525, + "grad_norm": 1.7830392122268677, + "learning_rate": 4.9783062289396996e-05, + "loss": 5.2977, + "step": 7057 + }, + { + "epoch": 0.04197592539727852, + "grad_norm": 1.8268102407455444, + "learning_rate": 4.9783000883875595e-05, + "loss": 5.3396, + "step": 7058 + }, + { + "epoch": 0.04198187268055952, + "grad_norm": 1.942901849746704, + "learning_rate": 4.9782939469702694e-05, + "loss": 5.3338, + "step": 7059 + }, + { + "epoch": 0.04198781996384052, + "grad_norm": 1.5793414115905762, + "learning_rate": 4.9782878046878334e-05, + "loss": 5.3286, + "step": 7060 + }, + { + "epoch": 0.04199376724712151, + "grad_norm": 1.5777463912963867, + "learning_rate": 4.9782816615402515e-05, + "loss": 5.2942, + "step": 7061 + }, + { + "epoch": 0.041999714530402514, + "grad_norm": 1.6393412351608276, + "learning_rate": 4.978275517527528e-05, + "loss": 5.2557, + "step": 7062 + }, + { + "epoch": 0.04200566181368351, + "grad_norm": 1.9657515287399292, + "learning_rate": 4.978269372649664e-05, + "loss": 5.3875, + "step": 7063 + }, + { + "epoch": 0.042011609096964504, + "grad_norm": 2.1419737339019775, + "learning_rate": 4.9782632269066623e-05, + "loss": 5.2014, + "step": 7064 + }, + { + "epoch": 0.042017556380245506, + "grad_norm": 2.0425620079040527, + "learning_rate": 4.978257080298523e-05, + "loss": 5.194, + "step": 7065 + }, + { + "epoch": 0.0420235036635265, + "grad_norm": 1.7248409986495972, + "learning_rate": 4.978250932825251e-05, + "loss": 5.1922, + "step": 7066 + }, + { + "epoch": 0.042029450946807496, + "grad_norm": 1.8265177011489868, + "learning_rate": 4.978244784486847e-05, + "loss": 5.4474, + "step": 7067 + }, + { + "epoch": 0.0420353982300885, + "grad_norm": 1.803701400756836, + "learning_rate": 4.9782386352833134e-05, + "loss": 6.2155, + "step": 7068 + }, + { + "epoch": 0.042041345513369494, + "grad_norm": 1.9970064163208008, + "learning_rate": 4.978232485214652e-05, + "loss": 5.3622, + "step": 7069 + }, + { + "epoch": 0.04204729279665049, + "grad_norm": 1.7449073791503906, + "learning_rate": 4.978226334280865e-05, + "loss": 5.3146, + "step": 7070 + }, + { + "epoch": 0.04205324007993149, + "grad_norm": 2.0284547805786133, + "learning_rate": 4.978220182481955e-05, + "loss": 5.0169, + "step": 7071 + }, + { + "epoch": 0.042059187363212486, + "grad_norm": 1.6801714897155762, + "learning_rate": 4.978214029817924e-05, + "loss": 5.1294, + "step": 7072 + }, + { + "epoch": 0.04206513464649348, + "grad_norm": 2.160585641860962, + "learning_rate": 4.978207876288774e-05, + "loss": 5.072, + "step": 7073 + }, + { + "epoch": 0.042071081929774476, + "grad_norm": 2.07739520072937, + "learning_rate": 4.978201721894508e-05, + "loss": 5.2065, + "step": 7074 + }, + { + "epoch": 0.04207702921305548, + "grad_norm": 2.1396286487579346, + "learning_rate": 4.978195566635127e-05, + "loss": 5.1066, + "step": 7075 + }, + { + "epoch": 0.04208297649633647, + "grad_norm": 1.883280634880066, + "learning_rate": 4.978189410510633e-05, + "loss": 5.2842, + "step": 7076 + }, + { + "epoch": 0.04208892377961747, + "grad_norm": 1.9917101860046387, + "learning_rate": 4.978183253521029e-05, + "loss": 5.0799, + "step": 7077 + }, + { + "epoch": 0.04209487106289847, + "grad_norm": 1.9387022256851196, + "learning_rate": 4.9781770956663164e-05, + "loss": 5.1898, + "step": 7078 + }, + { + "epoch": 0.042100818346179465, + "grad_norm": 1.9767060279846191, + "learning_rate": 4.978170936946498e-05, + "loss": 5.0692, + "step": 7079 + }, + { + "epoch": 0.04210676562946046, + "grad_norm": 2.0076138973236084, + "learning_rate": 4.978164777361576e-05, + "loss": 5.0255, + "step": 7080 + }, + { + "epoch": 0.04211271291274146, + "grad_norm": 1.8253445625305176, + "learning_rate": 4.978158616911552e-05, + "loss": 5.0111, + "step": 7081 + }, + { + "epoch": 0.04211866019602246, + "grad_norm": 1.6551930904388428, + "learning_rate": 4.978152455596429e-05, + "loss": 4.9849, + "step": 7082 + }, + { + "epoch": 0.04212460747930345, + "grad_norm": 1.8462406396865845, + "learning_rate": 4.9781462934162084e-05, + "loss": 5.0862, + "step": 7083 + }, + { + "epoch": 0.042130554762584455, + "grad_norm": 2.0828206539154053, + "learning_rate": 4.978140130370892e-05, + "loss": 5.031, + "step": 7084 + }, + { + "epoch": 0.04213650204586545, + "grad_norm": 1.7917357683181763, + "learning_rate": 4.978133966460483e-05, + "loss": 5.0028, + "step": 7085 + }, + { + "epoch": 0.042142449329146445, + "grad_norm": 1.7324126958847046, + "learning_rate": 4.9781278016849834e-05, + "loss": 4.9759, + "step": 7086 + }, + { + "epoch": 0.04214839661242744, + "grad_norm": 1.8673282861709595, + "learning_rate": 4.978121636044394e-05, + "loss": 5.3631, + "step": 7087 + }, + { + "epoch": 0.04215434389570844, + "grad_norm": 1.7723935842514038, + "learning_rate": 4.9781154695387186e-05, + "loss": 5.3427, + "step": 7088 + }, + { + "epoch": 0.04216029117898944, + "grad_norm": 1.4671146869659424, + "learning_rate": 4.978109302167958e-05, + "loss": 5.3003, + "step": 7089 + }, + { + "epoch": 0.04216623846227043, + "grad_norm": 1.9667481184005737, + "learning_rate": 4.9781031339321156e-05, + "loss": 5.0957, + "step": 7090 + }, + { + "epoch": 0.042172185745551434, + "grad_norm": 1.8162986040115356, + "learning_rate": 4.978096964831193e-05, + "loss": 5.1472, + "step": 7091 + }, + { + "epoch": 0.04217813302883243, + "grad_norm": 1.7793545722961426, + "learning_rate": 4.9780907948651926e-05, + "loss": 5.1771, + "step": 7092 + }, + { + "epoch": 0.042184080312113424, + "grad_norm": 1.8093308210372925, + "learning_rate": 4.9780846240341156e-05, + "loss": 5.1611, + "step": 7093 + }, + { + "epoch": 0.042190027595394426, + "grad_norm": 1.7010010480880737, + "learning_rate": 4.978078452337965e-05, + "loss": 5.4478, + "step": 7094 + }, + { + "epoch": 0.04219597487867542, + "grad_norm": 1.7978744506835938, + "learning_rate": 4.9780722797767434e-05, + "loss": 5.4443, + "step": 7095 + }, + { + "epoch": 0.042201922161956416, + "grad_norm": 1.4861794710159302, + "learning_rate": 4.9780661063504516e-05, + "loss": 5.3773, + "step": 7096 + }, + { + "epoch": 0.04220786944523742, + "grad_norm": 1.7805769443511963, + "learning_rate": 4.978059932059093e-05, + "loss": 5.0896, + "step": 7097 + }, + { + "epoch": 0.042213816728518413, + "grad_norm": 1.7392783164978027, + "learning_rate": 4.9780537569026695e-05, + "loss": 5.0602, + "step": 7098 + }, + { + "epoch": 0.04221976401179941, + "grad_norm": 1.8742554187774658, + "learning_rate": 4.978047580881182e-05, + "loss": 5.2595, + "step": 7099 + }, + { + "epoch": 0.04222571129508041, + "grad_norm": 1.6077641248703003, + "learning_rate": 4.978041403994635e-05, + "loss": 5.0925, + "step": 7100 + }, + { + "epoch": 0.042231658578361406, + "grad_norm": 1.7536481618881226, + "learning_rate": 4.9780352262430286e-05, + "loss": 5.2546, + "step": 7101 + }, + { + "epoch": 0.0422376058616424, + "grad_norm": 1.6404869556427002, + "learning_rate": 4.9780290476263656e-05, + "loss": 5.1349, + "step": 7102 + }, + { + "epoch": 0.042243553144923396, + "grad_norm": 1.7223635911941528, + "learning_rate": 4.978022868144649e-05, + "loss": 5.2894, + "step": 7103 + }, + { + "epoch": 0.0422495004282044, + "grad_norm": 1.7856663465499878, + "learning_rate": 4.9780166877978796e-05, + "loss": 5.384, + "step": 7104 + }, + { + "epoch": 0.04225544771148539, + "grad_norm": 1.6434816122055054, + "learning_rate": 4.978010506586061e-05, + "loss": 5.257, + "step": 7105 + }, + { + "epoch": 0.04226139499476639, + "grad_norm": 1.668371558189392, + "learning_rate": 4.9780043245091936e-05, + "loss": 5.2698, + "step": 7106 + }, + { + "epoch": 0.04226734227804739, + "grad_norm": 1.7553619146347046, + "learning_rate": 4.97799814156728e-05, + "loss": 5.1591, + "step": 7107 + }, + { + "epoch": 0.042273289561328385, + "grad_norm": 1.6918652057647705, + "learning_rate": 4.977991957760324e-05, + "loss": 5.2727, + "step": 7108 + }, + { + "epoch": 0.04227923684460938, + "grad_norm": 1.6634269952774048, + "learning_rate": 4.977985773088326e-05, + "loss": 5.3099, + "step": 7109 + }, + { + "epoch": 0.04228518412789038, + "grad_norm": 2.131647825241089, + "learning_rate": 4.977979587551289e-05, + "loss": 5.0885, + "step": 7110 + }, + { + "epoch": 0.04229113141117138, + "grad_norm": 1.6632722616195679, + "learning_rate": 4.977973401149215e-05, + "loss": 5.1546, + "step": 7111 + }, + { + "epoch": 0.04229707869445237, + "grad_norm": 1.762418270111084, + "learning_rate": 4.977967213882107e-05, + "loss": 5.0884, + "step": 7112 + }, + { + "epoch": 0.042303025977733374, + "grad_norm": 1.9325755834579468, + "learning_rate": 4.977961025749964e-05, + "loss": 5.1857, + "step": 7113 + }, + { + "epoch": 0.04230897326101437, + "grad_norm": 1.8359284400939941, + "learning_rate": 4.9779548367527926e-05, + "loss": 5.165, + "step": 7114 + }, + { + "epoch": 0.042314920544295365, + "grad_norm": 1.8305978775024414, + "learning_rate": 4.977948646890591e-05, + "loss": 5.1347, + "step": 7115 + }, + { + "epoch": 0.04232086782757636, + "grad_norm": 1.7374697923660278, + "learning_rate": 4.9779424561633644e-05, + "loss": 5.5219, + "step": 7116 + }, + { + "epoch": 0.04232681511085736, + "grad_norm": 1.9947689771652222, + "learning_rate": 4.9779362645711135e-05, + "loss": 5.4445, + "step": 7117 + }, + { + "epoch": 0.04233276239413836, + "grad_norm": 1.6639795303344727, + "learning_rate": 4.97793007211384e-05, + "loss": 5.3798, + "step": 7118 + }, + { + "epoch": 0.04233870967741935, + "grad_norm": 1.6983096599578857, + "learning_rate": 4.977923878791547e-05, + "loss": 5.2847, + "step": 7119 + }, + { + "epoch": 0.042344656960700354, + "grad_norm": 1.7397092580795288, + "learning_rate": 4.9779176846042366e-05, + "loss": 5.3175, + "step": 7120 + }, + { + "epoch": 0.04235060424398135, + "grad_norm": 1.5255639553070068, + "learning_rate": 4.977911489551911e-05, + "loss": 5.2735, + "step": 7121 + }, + { + "epoch": 0.042356551527262344, + "grad_norm": 1.5646785497665405, + "learning_rate": 4.9779052936345715e-05, + "loss": 5.3892, + "step": 7122 + }, + { + "epoch": 0.042362498810543346, + "grad_norm": 1.7479640245437622, + "learning_rate": 4.977899096852221e-05, + "loss": 5.4341, + "step": 7123 + }, + { + "epoch": 0.04236844609382434, + "grad_norm": 1.6275604963302612, + "learning_rate": 4.9778928992048615e-05, + "loss": 5.5209, + "step": 7124 + }, + { + "epoch": 0.042374393377105336, + "grad_norm": 1.6917749643325806, + "learning_rate": 4.977886700692496e-05, + "loss": 5.5779, + "step": 7125 + }, + { + "epoch": 0.04238034066038634, + "grad_norm": 1.683716058731079, + "learning_rate": 4.977880501315125e-05, + "loss": 5.475, + "step": 7126 + }, + { + "epoch": 0.04238628794366733, + "grad_norm": 1.7665706872940063, + "learning_rate": 4.977874301072751e-05, + "loss": 5.3666, + "step": 7127 + }, + { + "epoch": 0.04239223522694833, + "grad_norm": 1.715329885482788, + "learning_rate": 4.977868099965377e-05, + "loss": 5.407, + "step": 7128 + }, + { + "epoch": 0.04239818251022933, + "grad_norm": 1.8468618392944336, + "learning_rate": 4.977861897993006e-05, + "loss": 5.328, + "step": 7129 + }, + { + "epoch": 0.042404129793510326, + "grad_norm": 1.59178626537323, + "learning_rate": 4.977855695155638e-05, + "loss": 5.7797, + "step": 7130 + }, + { + "epoch": 0.04241007707679132, + "grad_norm": 1.4733757972717285, + "learning_rate": 4.977849491453277e-05, + "loss": 5.3019, + "step": 7131 + }, + { + "epoch": 0.042416024360072316, + "grad_norm": 1.4632091522216797, + "learning_rate": 4.977843286885923e-05, + "loss": 5.1754, + "step": 7132 + }, + { + "epoch": 0.04242197164335332, + "grad_norm": 1.530564308166504, + "learning_rate": 4.97783708145358e-05, + "loss": 5.3613, + "step": 7133 + }, + { + "epoch": 0.04242791892663431, + "grad_norm": 1.954219102859497, + "learning_rate": 4.97783087515625e-05, + "loss": 5.4013, + "step": 7134 + }, + { + "epoch": 0.04243386620991531, + "grad_norm": 1.8276890516281128, + "learning_rate": 4.977824667993935e-05, + "loss": 5.3611, + "step": 7135 + }, + { + "epoch": 0.04243981349319631, + "grad_norm": 2.1430561542510986, + "learning_rate": 4.977818459966637e-05, + "loss": 5.1501, + "step": 7136 + }, + { + "epoch": 0.042445760776477305, + "grad_norm": 1.9150115251541138, + "learning_rate": 4.977812251074357e-05, + "loss": 5.1778, + "step": 7137 + }, + { + "epoch": 0.0424517080597583, + "grad_norm": 1.6958523988723755, + "learning_rate": 4.9778060413171004e-05, + "loss": 5.5029, + "step": 7138 + }, + { + "epoch": 0.0424576553430393, + "grad_norm": 1.7183772325515747, + "learning_rate": 4.977799830694866e-05, + "loss": 5.4323, + "step": 7139 + }, + { + "epoch": 0.0424636026263203, + "grad_norm": 1.717731237411499, + "learning_rate": 4.977793619207657e-05, + "loss": 5.3418, + "step": 7140 + }, + { + "epoch": 0.04246954990960129, + "grad_norm": 1.8155564069747925, + "learning_rate": 4.9777874068554766e-05, + "loss": 5.2865, + "step": 7141 + }, + { + "epoch": 0.042475497192882294, + "grad_norm": 1.9890762567520142, + "learning_rate": 4.9777811936383254e-05, + "loss": 5.4101, + "step": 7142 + }, + { + "epoch": 0.04248144447616329, + "grad_norm": 1.8181748390197754, + "learning_rate": 4.977774979556207e-05, + "loss": 5.2719, + "step": 7143 + }, + { + "epoch": 0.042487391759444285, + "grad_norm": 1.7353019714355469, + "learning_rate": 4.9777687646091234e-05, + "loss": 5.4202, + "step": 7144 + }, + { + "epoch": 0.04249333904272528, + "grad_norm": 1.6121984720230103, + "learning_rate": 4.977762548797076e-05, + "loss": 5.3174, + "step": 7145 + }, + { + "epoch": 0.04249928632600628, + "grad_norm": 1.9579551219940186, + "learning_rate": 4.977756332120067e-05, + "loss": 5.135, + "step": 7146 + }, + { + "epoch": 0.04250523360928728, + "grad_norm": 1.9396319389343262, + "learning_rate": 4.977750114578099e-05, + "loss": 5.7521, + "step": 7147 + }, + { + "epoch": 0.04251118089256827, + "grad_norm": 1.8567198514938354, + "learning_rate": 4.977743896171173e-05, + "loss": 5.7521, + "step": 7148 + }, + { + "epoch": 0.042517128175849274, + "grad_norm": 2.139861583709717, + "learning_rate": 4.977737676899293e-05, + "loss": 5.472, + "step": 7149 + }, + { + "epoch": 0.04252307545913027, + "grad_norm": 1.6526445150375366, + "learning_rate": 4.977731456762461e-05, + "loss": 5.5557, + "step": 7150 + }, + { + "epoch": 0.042529022742411264, + "grad_norm": 1.7761725187301636, + "learning_rate": 4.9777252357606784e-05, + "loss": 5.1922, + "step": 7151 + }, + { + "epoch": 0.042534970025692266, + "grad_norm": 2.0894482135772705, + "learning_rate": 4.977719013893947e-05, + "loss": 5.5067, + "step": 7152 + }, + { + "epoch": 0.04254091730897326, + "grad_norm": 1.746470332145691, + "learning_rate": 4.97771279116227e-05, + "loss": 5.28, + "step": 7153 + }, + { + "epoch": 0.042546864592254256, + "grad_norm": 1.9258379936218262, + "learning_rate": 4.9777065675656484e-05, + "loss": 5.7223, + "step": 7154 + }, + { + "epoch": 0.04255281187553526, + "grad_norm": 1.9928748607635498, + "learning_rate": 4.977700343104086e-05, + "loss": 5.727, + "step": 7155 + }, + { + "epoch": 0.04255875915881625, + "grad_norm": 1.7435163259506226, + "learning_rate": 4.9776941177775824e-05, + "loss": 5.6636, + "step": 7156 + }, + { + "epoch": 0.04256470644209725, + "grad_norm": 1.6818004846572876, + "learning_rate": 4.977687891586143e-05, + "loss": 5.6589, + "step": 7157 + }, + { + "epoch": 0.04257065372537825, + "grad_norm": 1.812779426574707, + "learning_rate": 4.9776816645297676e-05, + "loss": 5.2705, + "step": 7158 + }, + { + "epoch": 0.042576601008659246, + "grad_norm": 1.7637232542037964, + "learning_rate": 4.977675436608459e-05, + "loss": 5.2872, + "step": 7159 + }, + { + "epoch": 0.04258254829194024, + "grad_norm": 1.9504014253616333, + "learning_rate": 4.97766920782222e-05, + "loss": 5.1324, + "step": 7160 + }, + { + "epoch": 0.042588495575221236, + "grad_norm": 1.7741994857788086, + "learning_rate": 4.9776629781710525e-05, + "loss": 5.4164, + "step": 7161 + }, + { + "epoch": 0.04259444285850224, + "grad_norm": 2.0005195140838623, + "learning_rate": 4.9776567476549576e-05, + "loss": 5.4667, + "step": 7162 + }, + { + "epoch": 0.04260039014178323, + "grad_norm": 2.256420612335205, + "learning_rate": 4.977650516273939e-05, + "loss": 5.1116, + "step": 7163 + }, + { + "epoch": 0.04260633742506423, + "grad_norm": 2.0806920528411865, + "learning_rate": 4.977644284027998e-05, + "loss": 5.2333, + "step": 7164 + }, + { + "epoch": 0.04261228470834523, + "grad_norm": 1.898760199546814, + "learning_rate": 4.9776380509171364e-05, + "loss": 5.4761, + "step": 7165 + }, + { + "epoch": 0.042618231991626225, + "grad_norm": 1.7251659631729126, + "learning_rate": 4.977631816941358e-05, + "loss": 5.5584, + "step": 7166 + }, + { + "epoch": 0.04262417927490722, + "grad_norm": 1.741645336151123, + "learning_rate": 4.977625582100664e-05, + "loss": 5.4133, + "step": 7167 + }, + { + "epoch": 0.04263012655818822, + "grad_norm": 1.921617031097412, + "learning_rate": 4.977619346395055e-05, + "loss": 5.1829, + "step": 7168 + }, + { + "epoch": 0.04263607384146922, + "grad_norm": 1.7597262859344482, + "learning_rate": 4.977613109824536e-05, + "loss": 5.1743, + "step": 7169 + }, + { + "epoch": 0.04264202112475021, + "grad_norm": 1.8069764375686646, + "learning_rate": 4.977606872389107e-05, + "loss": 5.4004, + "step": 7170 + }, + { + "epoch": 0.042647968408031214, + "grad_norm": 1.7694367170333862, + "learning_rate": 4.9776006340887714e-05, + "loss": 5.2018, + "step": 7171 + }, + { + "epoch": 0.04265391569131221, + "grad_norm": 1.8260759115219116, + "learning_rate": 4.9775943949235316e-05, + "loss": 5.4115, + "step": 7172 + }, + { + "epoch": 0.042659862974593205, + "grad_norm": 1.71034574508667, + "learning_rate": 4.9775881548933884e-05, + "loss": 5.2781, + "step": 7173 + }, + { + "epoch": 0.0426658102578742, + "grad_norm": 1.7208900451660156, + "learning_rate": 4.977581913998345e-05, + "loss": 5.4686, + "step": 7174 + }, + { + "epoch": 0.0426717575411552, + "grad_norm": 1.8545277118682861, + "learning_rate": 4.977575672238404e-05, + "loss": 5.4545, + "step": 7175 + }, + { + "epoch": 0.0426777048244362, + "grad_norm": 1.7892229557037354, + "learning_rate": 4.9775694296135656e-05, + "loss": 5.6612, + "step": 7176 + }, + { + "epoch": 0.04268365210771719, + "grad_norm": 1.8321889638900757, + "learning_rate": 4.9775631861238343e-05, + "loss": 5.5889, + "step": 7177 + }, + { + "epoch": 0.042689599390998194, + "grad_norm": 1.7925626039505005, + "learning_rate": 4.977556941769211e-05, + "loss": 5.6218, + "step": 7178 + }, + { + "epoch": 0.04269554667427919, + "grad_norm": 1.9650121927261353, + "learning_rate": 4.9775506965496984e-05, + "loss": 5.5228, + "step": 7179 + }, + { + "epoch": 0.042701493957560184, + "grad_norm": 1.9050647020339966, + "learning_rate": 4.977544450465298e-05, + "loss": 5.5547, + "step": 7180 + }, + { + "epoch": 0.042707441240841186, + "grad_norm": 1.8334670066833496, + "learning_rate": 4.977538203516013e-05, + "loss": 5.3895, + "step": 7181 + }, + { + "epoch": 0.04271338852412218, + "grad_norm": 1.803544521331787, + "learning_rate": 4.9775319557018444e-05, + "loss": 5.6288, + "step": 7182 + }, + { + "epoch": 0.042719335807403176, + "grad_norm": 1.823440432548523, + "learning_rate": 4.9775257070227956e-05, + "loss": 5.4996, + "step": 7183 + }, + { + "epoch": 0.04272528309068418, + "grad_norm": 1.9730159044265747, + "learning_rate": 4.977519457478868e-05, + "loss": 5.5004, + "step": 7184 + }, + { + "epoch": 0.04273123037396517, + "grad_norm": 1.9566004276275635, + "learning_rate": 4.977513207070064e-05, + "loss": 5.5496, + "step": 7185 + }, + { + "epoch": 0.04273717765724617, + "grad_norm": 2.0958995819091797, + "learning_rate": 4.977506955796385e-05, + "loss": 5.5256, + "step": 7186 + }, + { + "epoch": 0.04274312494052717, + "grad_norm": 1.8957890272140503, + "learning_rate": 4.977500703657835e-05, + "loss": 5.3337, + "step": 7187 + }, + { + "epoch": 0.042749072223808166, + "grad_norm": 1.8224141597747803, + "learning_rate": 4.977494450654414e-05, + "loss": 5.1362, + "step": 7188 + }, + { + "epoch": 0.04275501950708916, + "grad_norm": 1.648296594619751, + "learning_rate": 4.977488196786126e-05, + "loss": 5.3398, + "step": 7189 + }, + { + "epoch": 0.042760966790370156, + "grad_norm": 1.6238311529159546, + "learning_rate": 4.977481942052972e-05, + "loss": 5.2083, + "step": 7190 + }, + { + "epoch": 0.04276691407365116, + "grad_norm": 1.7399996519088745, + "learning_rate": 4.977475686454956e-05, + "loss": 5.2403, + "step": 7191 + }, + { + "epoch": 0.04277286135693215, + "grad_norm": 1.7260342836380005, + "learning_rate": 4.977469429992077e-05, + "loss": 5.2282, + "step": 7192 + }, + { + "epoch": 0.04277880864021315, + "grad_norm": 4.4954447746276855, + "learning_rate": 4.9774631726643396e-05, + "loss": 5.1044, + "step": 7193 + }, + { + "epoch": 0.04278475592349415, + "grad_norm": 1.879869818687439, + "learning_rate": 4.977456914471746e-05, + "loss": 5.3431, + "step": 7194 + }, + { + "epoch": 0.042790703206775145, + "grad_norm": 1.8826582431793213, + "learning_rate": 4.977450655414297e-05, + "loss": 5.2951, + "step": 7195 + }, + { + "epoch": 0.04279665049005614, + "grad_norm": 1.8973712921142578, + "learning_rate": 4.977444395491996e-05, + "loss": 5.343, + "step": 7196 + }, + { + "epoch": 0.04280259777333714, + "grad_norm": 1.6125551462173462, + "learning_rate": 4.977438134704845e-05, + "loss": 5.2849, + "step": 7197 + }, + { + "epoch": 0.04280854505661814, + "grad_norm": 1.441159963607788, + "learning_rate": 4.9774318730528456e-05, + "loss": 5.2955, + "step": 7198 + }, + { + "epoch": 0.04281449233989913, + "grad_norm": 1.9655884504318237, + "learning_rate": 4.9774256105360004e-05, + "loss": 5.2093, + "step": 7199 + }, + { + "epoch": 0.042820439623180134, + "grad_norm": 1.7824043035507202, + "learning_rate": 4.9774193471543116e-05, + "loss": 5.2105, + "step": 7200 + }, + { + "epoch": 0.04282638690646113, + "grad_norm": 1.8331031799316406, + "learning_rate": 4.977413082907781e-05, + "loss": 5.3359, + "step": 7201 + }, + { + "epoch": 0.042832334189742124, + "grad_norm": 1.8695242404937744, + "learning_rate": 4.977406817796412e-05, + "loss": 5.3686, + "step": 7202 + }, + { + "epoch": 0.042838281473023126, + "grad_norm": 1.70205557346344, + "learning_rate": 4.977400551820205e-05, + "loss": 5.2689, + "step": 7203 + }, + { + "epoch": 0.04284422875630412, + "grad_norm": 1.700307846069336, + "learning_rate": 4.9773942849791635e-05, + "loss": 5.3946, + "step": 7204 + }, + { + "epoch": 0.04285017603958512, + "grad_norm": 1.625637173652649, + "learning_rate": 4.977388017273288e-05, + "loss": 5.095, + "step": 7205 + }, + { + "epoch": 0.04285612332286611, + "grad_norm": 1.7689390182495117, + "learning_rate": 4.977381748702583e-05, + "loss": 5.0097, + "step": 7206 + }, + { + "epoch": 0.042862070606147114, + "grad_norm": 1.856493353843689, + "learning_rate": 4.97737547926705e-05, + "loss": 5.0551, + "step": 7207 + }, + { + "epoch": 0.04286801788942811, + "grad_norm": 1.6497242450714111, + "learning_rate": 4.97736920896669e-05, + "loss": 5.031, + "step": 7208 + }, + { + "epoch": 0.042873965172709104, + "grad_norm": 1.5884608030319214, + "learning_rate": 4.977362937801506e-05, + "loss": 5.0758, + "step": 7209 + }, + { + "epoch": 0.042879912455990106, + "grad_norm": 1.5206499099731445, + "learning_rate": 4.9773566657715006e-05, + "loss": 5.049, + "step": 7210 + }, + { + "epoch": 0.0428858597392711, + "grad_norm": 1.7026933431625366, + "learning_rate": 4.977350392876676e-05, + "loss": 5.001, + "step": 7211 + }, + { + "epoch": 0.042891807022552096, + "grad_norm": 1.4197289943695068, + "learning_rate": 4.977344119117034e-05, + "loss": 5.0446, + "step": 7212 + }, + { + "epoch": 0.0428977543058331, + "grad_norm": 1.498713731765747, + "learning_rate": 4.977337844492576e-05, + "loss": 5.0574, + "step": 7213 + }, + { + "epoch": 0.04290370158911409, + "grad_norm": 1.7583528757095337, + "learning_rate": 4.9773315690033054e-05, + "loss": 4.994, + "step": 7214 + }, + { + "epoch": 0.04290964887239509, + "grad_norm": 1.8511004447937012, + "learning_rate": 4.9773252926492236e-05, + "loss": 4.9888, + "step": 7215 + }, + { + "epoch": 0.04291559615567609, + "grad_norm": 1.5799078941345215, + "learning_rate": 4.9773190154303334e-05, + "loss": 5.0028, + "step": 7216 + }, + { + "epoch": 0.042921543438957085, + "grad_norm": 1.6737205982208252, + "learning_rate": 4.977312737346637e-05, + "loss": 5.0701, + "step": 7217 + }, + { + "epoch": 0.04292749072223808, + "grad_norm": 1.537049412727356, + "learning_rate": 4.977306458398136e-05, + "loss": 5.0747, + "step": 7218 + }, + { + "epoch": 0.042933438005519076, + "grad_norm": 1.7501899003982544, + "learning_rate": 4.977300178584833e-05, + "loss": 5.0172, + "step": 7219 + }, + { + "epoch": 0.04293938528880008, + "grad_norm": 1.5130890607833862, + "learning_rate": 4.9772938979067294e-05, + "loss": 5.0196, + "step": 7220 + }, + { + "epoch": 0.04294533257208107, + "grad_norm": 1.628053903579712, + "learning_rate": 4.977287616363829e-05, + "loss": 5.0526, + "step": 7221 + }, + { + "epoch": 0.04295127985536207, + "grad_norm": 1.6736811399459839, + "learning_rate": 4.977281333956133e-05, + "loss": 5.0093, + "step": 7222 + }, + { + "epoch": 0.04295722713864307, + "grad_norm": 1.6157552003860474, + "learning_rate": 4.977275050683643e-05, + "loss": 4.9562, + "step": 7223 + }, + { + "epoch": 0.042963174421924065, + "grad_norm": 1.6699459552764893, + "learning_rate": 4.9772687665463625e-05, + "loss": 4.9603, + "step": 7224 + }, + { + "epoch": 0.04296912170520506, + "grad_norm": 1.4698256254196167, + "learning_rate": 4.9772624815442925e-05, + "loss": 4.9908, + "step": 7225 + }, + { + "epoch": 0.04297506898848606, + "grad_norm": 1.5310906171798706, + "learning_rate": 4.9772561956774365e-05, + "loss": 5.0081, + "step": 7226 + }, + { + "epoch": 0.04298101627176706, + "grad_norm": 1.6135941743850708, + "learning_rate": 4.977249908945795e-05, + "loss": 5.1394, + "step": 7227 + }, + { + "epoch": 0.04298696355504805, + "grad_norm": 1.7632607221603394, + "learning_rate": 4.977243621349372e-05, + "loss": 4.9992, + "step": 7228 + }, + { + "epoch": 0.042992910838329054, + "grad_norm": 1.574826955795288, + "learning_rate": 4.977237332888168e-05, + "loss": 4.9361, + "step": 7229 + }, + { + "epoch": 0.04299885812161005, + "grad_norm": 1.6633859872817993, + "learning_rate": 4.9772310435621874e-05, + "loss": 4.9085, + "step": 7230 + }, + { + "epoch": 0.043004805404891044, + "grad_norm": 1.6180634498596191, + "learning_rate": 4.97722475337143e-05, + "loss": 4.939, + "step": 7231 + }, + { + "epoch": 0.043010752688172046, + "grad_norm": 1.959694266319275, + "learning_rate": 4.9772184623158996e-05, + "loss": 5.231, + "step": 7232 + }, + { + "epoch": 0.04301669997145304, + "grad_norm": 1.6264785528182983, + "learning_rate": 4.977212170395598e-05, + "loss": 5.3228, + "step": 7233 + }, + { + "epoch": 0.04302264725473404, + "grad_norm": 2.109292507171631, + "learning_rate": 4.9772058776105264e-05, + "loss": 5.4579, + "step": 7234 + }, + { + "epoch": 0.04302859453801503, + "grad_norm": 1.991877555847168, + "learning_rate": 4.977199583960688e-05, + "loss": 5.355, + "step": 7235 + }, + { + "epoch": 0.043034541821296034, + "grad_norm": 2.23330020904541, + "learning_rate": 4.977193289446085e-05, + "loss": 5.3233, + "step": 7236 + }, + { + "epoch": 0.04304048910457703, + "grad_norm": 2.077359914779663, + "learning_rate": 4.9771869940667194e-05, + "loss": 5.2003, + "step": 7237 + }, + { + "epoch": 0.043046436387858024, + "grad_norm": 1.652498722076416, + "learning_rate": 4.977180697822593e-05, + "loss": 5.0232, + "step": 7238 + }, + { + "epoch": 0.043052383671139026, + "grad_norm": 1.9277194738388062, + "learning_rate": 4.977174400713709e-05, + "loss": 5.3826, + "step": 7239 + }, + { + "epoch": 0.04305833095442002, + "grad_norm": 1.9263273477554321, + "learning_rate": 4.9771681027400694e-05, + "loss": 5.5258, + "step": 7240 + }, + { + "epoch": 0.043064278237701016, + "grad_norm": 2.066934108734131, + "learning_rate": 4.9771618039016756e-05, + "loss": 5.6398, + "step": 7241 + }, + { + "epoch": 0.04307022552098202, + "grad_norm": 1.7810741662979126, + "learning_rate": 4.9771555041985295e-05, + "loss": 5.3716, + "step": 7242 + }, + { + "epoch": 0.04307617280426301, + "grad_norm": 1.7068313360214233, + "learning_rate": 4.977149203630635e-05, + "loss": 5.4042, + "step": 7243 + }, + { + "epoch": 0.04308212008754401, + "grad_norm": 1.8587994575500488, + "learning_rate": 4.977142902197992e-05, + "loss": 5.3635, + "step": 7244 + }, + { + "epoch": 0.04308806737082501, + "grad_norm": 2.101649284362793, + "learning_rate": 4.9771365999006054e-05, + "loss": 5.5292, + "step": 7245 + }, + { + "epoch": 0.043094014654106005, + "grad_norm": 1.8571972846984863, + "learning_rate": 4.9771302967384756e-05, + "loss": 5.4577, + "step": 7246 + }, + { + "epoch": 0.043099961937387, + "grad_norm": 1.9837383031845093, + "learning_rate": 4.9771239927116045e-05, + "loss": 5.4976, + "step": 7247 + }, + { + "epoch": 0.043105909220667996, + "grad_norm": 1.7688343524932861, + "learning_rate": 4.977117687819996e-05, + "loss": 5.448, + "step": 7248 + }, + { + "epoch": 0.043111856503949, + "grad_norm": 1.923824429512024, + "learning_rate": 4.9771113820636505e-05, + "loss": 5.3436, + "step": 7249 + }, + { + "epoch": 0.04311780378722999, + "grad_norm": 1.4405949115753174, + "learning_rate": 4.9771050754425715e-05, + "loss": 5.2751, + "step": 7250 + }, + { + "epoch": 0.04312375107051099, + "grad_norm": 1.7337450981140137, + "learning_rate": 4.977098767956761e-05, + "loss": 5.4693, + "step": 7251 + }, + { + "epoch": 0.04312969835379199, + "grad_norm": 2.063887119293213, + "learning_rate": 4.977092459606221e-05, + "loss": 5.4576, + "step": 7252 + }, + { + "epoch": 0.043135645637072985, + "grad_norm": 1.576517105102539, + "learning_rate": 4.9770861503909524e-05, + "loss": 5.4052, + "step": 7253 + }, + { + "epoch": 0.04314159292035398, + "grad_norm": 1.8137834072113037, + "learning_rate": 4.9770798403109596e-05, + "loss": 5.5732, + "step": 7254 + }, + { + "epoch": 0.04314754020363498, + "grad_norm": 1.7954564094543457, + "learning_rate": 4.977073529366244e-05, + "loss": 5.4213, + "step": 7255 + }, + { + "epoch": 0.04315348748691598, + "grad_norm": 1.993961215019226, + "learning_rate": 4.977067217556807e-05, + "loss": 5.2909, + "step": 7256 + }, + { + "epoch": 0.04315943477019697, + "grad_norm": 1.6993632316589355, + "learning_rate": 4.977060904882651e-05, + "loss": 5.4523, + "step": 7257 + }, + { + "epoch": 0.043165382053477974, + "grad_norm": 1.8541932106018066, + "learning_rate": 4.977054591343779e-05, + "loss": 5.3182, + "step": 7258 + }, + { + "epoch": 0.04317132933675897, + "grad_norm": 1.7425625324249268, + "learning_rate": 4.9770482769401935e-05, + "loss": 5.2527, + "step": 7259 + }, + { + "epoch": 0.043177276620039964, + "grad_norm": 1.7028024196624756, + "learning_rate": 4.9770419616718955e-05, + "loss": 5.1305, + "step": 7260 + }, + { + "epoch": 0.043183223903320966, + "grad_norm": 1.745316982269287, + "learning_rate": 4.977035645538888e-05, + "loss": 5.0368, + "step": 7261 + }, + { + "epoch": 0.04318917118660196, + "grad_norm": 1.8373509645462036, + "learning_rate": 4.977029328541173e-05, + "loss": 5.353, + "step": 7262 + }, + { + "epoch": 0.04319511846988296, + "grad_norm": 1.9976449012756348, + "learning_rate": 4.9770230106787526e-05, + "loss": 5.363, + "step": 7263 + }, + { + "epoch": 0.04320106575316395, + "grad_norm": 1.7109822034835815, + "learning_rate": 4.977016691951629e-05, + "loss": 5.3462, + "step": 7264 + }, + { + "epoch": 0.043207013036444954, + "grad_norm": 1.8688478469848633, + "learning_rate": 4.9770103723598036e-05, + "loss": 5.3564, + "step": 7265 + }, + { + "epoch": 0.04321296031972595, + "grad_norm": 1.8680217266082764, + "learning_rate": 4.9770040519032804e-05, + "loss": 5.2713, + "step": 7266 + }, + { + "epoch": 0.043218907603006944, + "grad_norm": 1.8022522926330566, + "learning_rate": 4.976997730582061e-05, + "loss": 5.153, + "step": 7267 + }, + { + "epoch": 0.043224854886287946, + "grad_norm": 1.7128162384033203, + "learning_rate": 4.976991408396147e-05, + "loss": 5.3107, + "step": 7268 + }, + { + "epoch": 0.04323080216956894, + "grad_norm": 1.8222606182098389, + "learning_rate": 4.9769850853455404e-05, + "loss": 5.3599, + "step": 7269 + }, + { + "epoch": 0.043236749452849936, + "grad_norm": 1.829373836517334, + "learning_rate": 4.976978761430244e-05, + "loss": 5.3991, + "step": 7270 + }, + { + "epoch": 0.04324269673613094, + "grad_norm": 1.8270717859268188, + "learning_rate": 4.97697243665026e-05, + "loss": 5.2434, + "step": 7271 + }, + { + "epoch": 0.04324864401941193, + "grad_norm": 1.9759695529937744, + "learning_rate": 4.976966111005591e-05, + "loss": 5.4585, + "step": 7272 + }, + { + "epoch": 0.04325459130269293, + "grad_norm": 2.0235564708709717, + "learning_rate": 4.9769597844962376e-05, + "loss": 5.3996, + "step": 7273 + }, + { + "epoch": 0.04326053858597393, + "grad_norm": 1.9220880270004272, + "learning_rate": 4.976953457122204e-05, + "loss": 5.344, + "step": 7274 + }, + { + "epoch": 0.043266485869254925, + "grad_norm": 1.6257338523864746, + "learning_rate": 4.976947128883492e-05, + "loss": 5.4012, + "step": 7275 + }, + { + "epoch": 0.04327243315253592, + "grad_norm": 1.6390771865844727, + "learning_rate": 4.976940799780103e-05, + "loss": 5.3693, + "step": 7276 + }, + { + "epoch": 0.043278380435816916, + "grad_norm": 1.5769712924957275, + "learning_rate": 4.976934469812039e-05, + "loss": 5.3214, + "step": 7277 + }, + { + "epoch": 0.04328432771909792, + "grad_norm": 1.539920687675476, + "learning_rate": 4.9769281389793035e-05, + "loss": 5.2784, + "step": 7278 + }, + { + "epoch": 0.04329027500237891, + "grad_norm": 1.662835717201233, + "learning_rate": 4.976921807281897e-05, + "loss": 5.2717, + "step": 7279 + }, + { + "epoch": 0.04329622228565991, + "grad_norm": 1.3613345623016357, + "learning_rate": 4.9769154747198234e-05, + "loss": 5.4241, + "step": 7280 + }, + { + "epoch": 0.04330216956894091, + "grad_norm": 1.5267658233642578, + "learning_rate": 4.976909141293084e-05, + "loss": 5.454, + "step": 7281 + }, + { + "epoch": 0.043308116852221905, + "grad_norm": 1.5050435066223145, + "learning_rate": 4.976902807001681e-05, + "loss": 5.4975, + "step": 7282 + }, + { + "epoch": 0.0433140641355029, + "grad_norm": 1.292698621749878, + "learning_rate": 4.976896471845617e-05, + "loss": 5.4071, + "step": 7283 + }, + { + "epoch": 0.0433200114187839, + "grad_norm": 1.6818265914916992, + "learning_rate": 4.9768901358248946e-05, + "loss": 5.3561, + "step": 7284 + }, + { + "epoch": 0.0433259587020649, + "grad_norm": 1.5995383262634277, + "learning_rate": 4.976883798939515e-05, + "loss": 5.2623, + "step": 7285 + }, + { + "epoch": 0.04333190598534589, + "grad_norm": 1.6959342956542969, + "learning_rate": 4.976877461189481e-05, + "loss": 5.3193, + "step": 7286 + }, + { + "epoch": 0.043337853268626894, + "grad_norm": 1.6978071928024292, + "learning_rate": 4.976871122574794e-05, + "loss": 5.5653, + "step": 7287 + }, + { + "epoch": 0.04334380055190789, + "grad_norm": 1.7587183713912964, + "learning_rate": 4.976864783095457e-05, + "loss": 5.545, + "step": 7288 + }, + { + "epoch": 0.043349747835188884, + "grad_norm": 1.6225430965423584, + "learning_rate": 4.976858442751473e-05, + "loss": 5.5804, + "step": 7289 + }, + { + "epoch": 0.043355695118469886, + "grad_norm": 1.5895410776138306, + "learning_rate": 4.976852101542843e-05, + "loss": 5.4798, + "step": 7290 + }, + { + "epoch": 0.04336164240175088, + "grad_norm": 1.759022831916809, + "learning_rate": 4.976845759469569e-05, + "loss": 5.4794, + "step": 7291 + }, + { + "epoch": 0.043367589685031877, + "grad_norm": 1.483383059501648, + "learning_rate": 4.976839416531654e-05, + "loss": 5.2547, + "step": 7292 + }, + { + "epoch": 0.04337353696831287, + "grad_norm": 2.136172294616699, + "learning_rate": 4.9768330727291e-05, + "loss": 5.1655, + "step": 7293 + }, + { + "epoch": 0.043379484251593874, + "grad_norm": 1.9202553033828735, + "learning_rate": 4.9768267280619094e-05, + "loss": 5.1945, + "step": 7294 + }, + { + "epoch": 0.04338543153487487, + "grad_norm": 1.7927708625793457, + "learning_rate": 4.976820382530084e-05, + "loss": 5.4936, + "step": 7295 + }, + { + "epoch": 0.043391378818155864, + "grad_norm": 1.597887396812439, + "learning_rate": 4.976814036133626e-05, + "loss": 5.5516, + "step": 7296 + }, + { + "epoch": 0.043397326101436866, + "grad_norm": 1.493356466293335, + "learning_rate": 4.9768076888725376e-05, + "loss": 5.552, + "step": 7297 + }, + { + "epoch": 0.04340327338471786, + "grad_norm": 1.6748720407485962, + "learning_rate": 4.976801340746822e-05, + "loss": 5.3957, + "step": 7298 + }, + { + "epoch": 0.043409220667998856, + "grad_norm": 1.541945457458496, + "learning_rate": 4.9767949917564794e-05, + "loss": 5.5558, + "step": 7299 + }, + { + "epoch": 0.04341516795127986, + "grad_norm": 1.6436586380004883, + "learning_rate": 4.976788641901514e-05, + "loss": 5.4918, + "step": 7300 + }, + { + "epoch": 0.04342111523456085, + "grad_norm": 1.69910728931427, + "learning_rate": 4.9767822911819274e-05, + "loss": 5.4688, + "step": 7301 + }, + { + "epoch": 0.04342706251784185, + "grad_norm": 1.8294274806976318, + "learning_rate": 4.976775939597721e-05, + "loss": 5.505, + "step": 7302 + }, + { + "epoch": 0.04343300980112285, + "grad_norm": 1.720880389213562, + "learning_rate": 4.976769587148899e-05, + "loss": 5.3509, + "step": 7303 + }, + { + "epoch": 0.043438957084403845, + "grad_norm": 1.5898194313049316, + "learning_rate": 4.976763233835461e-05, + "loss": 5.2955, + "step": 7304 + }, + { + "epoch": 0.04344490436768484, + "grad_norm": 1.569218397140503, + "learning_rate": 4.976756879657412e-05, + "loss": 5.5695, + "step": 7305 + }, + { + "epoch": 0.043450851650965835, + "grad_norm": 1.5551841259002686, + "learning_rate": 4.976750524614752e-05, + "loss": 5.5313, + "step": 7306 + }, + { + "epoch": 0.04345679893424684, + "grad_norm": 1.5870057344436646, + "learning_rate": 4.9767441687074834e-05, + "loss": 5.7525, + "step": 7307 + }, + { + "epoch": 0.04346274621752783, + "grad_norm": 1.5421022176742554, + "learning_rate": 4.97673781193561e-05, + "loss": 5.6176, + "step": 7308 + }, + { + "epoch": 0.04346869350080883, + "grad_norm": 1.9368326663970947, + "learning_rate": 4.976731454299132e-05, + "loss": 5.4239, + "step": 7309 + }, + { + "epoch": 0.04347464078408983, + "grad_norm": 1.719084620475769, + "learning_rate": 4.976725095798053e-05, + "loss": 5.3526, + "step": 7310 + }, + { + "epoch": 0.043480588067370825, + "grad_norm": 1.8004268407821655, + "learning_rate": 4.9767187364323756e-05, + "loss": 5.7112, + "step": 7311 + }, + { + "epoch": 0.04348653535065182, + "grad_norm": 1.9922735691070557, + "learning_rate": 4.9767123762021003e-05, + "loss": 5.4993, + "step": 7312 + }, + { + "epoch": 0.04349248263393282, + "grad_norm": 1.6768959760665894, + "learning_rate": 4.976706015107231e-05, + "loss": 5.4713, + "step": 7313 + }, + { + "epoch": 0.04349842991721382, + "grad_norm": 1.6070122718811035, + "learning_rate": 4.976699653147768e-05, + "loss": 5.4695, + "step": 7314 + }, + { + "epoch": 0.04350437720049481, + "grad_norm": 1.5641200542449951, + "learning_rate": 4.976693290323716e-05, + "loss": 5.3596, + "step": 7315 + }, + { + "epoch": 0.043510324483775814, + "grad_norm": 3.0344419479370117, + "learning_rate": 4.976686926635076e-05, + "loss": 5.7371, + "step": 7316 + }, + { + "epoch": 0.04351627176705681, + "grad_norm": 1.8784242868423462, + "learning_rate": 4.9766805620818494e-05, + "loss": 5.5142, + "step": 7317 + }, + { + "epoch": 0.043522219050337804, + "grad_norm": 2.0644166469573975, + "learning_rate": 4.9766741966640394e-05, + "loss": 5.276, + "step": 7318 + }, + { + "epoch": 0.043528166333618806, + "grad_norm": 1.8128771781921387, + "learning_rate": 4.976667830381649e-05, + "loss": 5.3515, + "step": 7319 + }, + { + "epoch": 0.0435341136168998, + "grad_norm": 1.8899081945419312, + "learning_rate": 4.9766614632346786e-05, + "loss": 5.3981, + "step": 7320 + }, + { + "epoch": 0.043540060900180796, + "grad_norm": 1.89181649684906, + "learning_rate": 4.976655095223131e-05, + "loss": 5.4378, + "step": 7321 + }, + { + "epoch": 0.04354600818346179, + "grad_norm": 1.6332184076309204, + "learning_rate": 4.976648726347009e-05, + "loss": 5.4023, + "step": 7322 + }, + { + "epoch": 0.043551955466742794, + "grad_norm": 2.3459293842315674, + "learning_rate": 4.976642356606315e-05, + "loss": 5.8375, + "step": 7323 + }, + { + "epoch": 0.04355790275002379, + "grad_norm": 2.029244899749756, + "learning_rate": 4.97663598600105e-05, + "loss": 5.5617, + "step": 7324 + }, + { + "epoch": 0.043563850033304784, + "grad_norm": 2.138946056365967, + "learning_rate": 4.9766296145312175e-05, + "loss": 5.5076, + "step": 7325 + }, + { + "epoch": 0.043569797316585786, + "grad_norm": 1.8702884912490845, + "learning_rate": 4.9766232421968184e-05, + "loss": 5.123, + "step": 7326 + }, + { + "epoch": 0.04357574459986678, + "grad_norm": 1.8917137384414673, + "learning_rate": 4.976616868997856e-05, + "loss": 5.4809, + "step": 7327 + }, + { + "epoch": 0.043581691883147776, + "grad_norm": 2.2203474044799805, + "learning_rate": 4.976610494934333e-05, + "loss": 5.6359, + "step": 7328 + }, + { + "epoch": 0.04358763916642878, + "grad_norm": 2.4505302906036377, + "learning_rate": 4.976604120006251e-05, + "loss": 6.1423, + "step": 7329 + }, + { + "epoch": 0.04359358644970977, + "grad_norm": 2.4601128101348877, + "learning_rate": 4.976597744213611e-05, + "loss": 6.0908, + "step": 7330 + }, + { + "epoch": 0.04359953373299077, + "grad_norm": 1.9502687454223633, + "learning_rate": 4.976591367556417e-05, + "loss": 5.918, + "step": 7331 + }, + { + "epoch": 0.04360548101627177, + "grad_norm": 2.180250644683838, + "learning_rate": 4.9765849900346696e-05, + "loss": 5.7203, + "step": 7332 + }, + { + "epoch": 0.043611428299552765, + "grad_norm": 2.125669002532959, + "learning_rate": 4.9765786116483726e-05, + "loss": 5.7875, + "step": 7333 + }, + { + "epoch": 0.04361737558283376, + "grad_norm": 2.0372321605682373, + "learning_rate": 4.9765722323975286e-05, + "loss": 5.6777, + "step": 7334 + }, + { + "epoch": 0.043623322866114755, + "grad_norm": 2.5857362747192383, + "learning_rate": 4.976565852282137e-05, + "loss": 5.2989, + "step": 7335 + }, + { + "epoch": 0.04362927014939576, + "grad_norm": 2.5774800777435303, + "learning_rate": 4.976559471302203e-05, + "loss": 6.0479, + "step": 7336 + }, + { + "epoch": 0.04363521743267675, + "grad_norm": 2.0820937156677246, + "learning_rate": 4.976553089457727e-05, + "loss": 5.7636, + "step": 7337 + }, + { + "epoch": 0.04364116471595775, + "grad_norm": 2.287719964981079, + "learning_rate": 4.9765467067487126e-05, + "loss": 5.7706, + "step": 7338 + }, + { + "epoch": 0.04364711199923875, + "grad_norm": 2.6578378677368164, + "learning_rate": 4.9765403231751614e-05, + "loss": 6.1506, + "step": 7339 + }, + { + "epoch": 0.043653059282519745, + "grad_norm": 2.503955841064453, + "learning_rate": 4.976533938737075e-05, + "loss": 6.0658, + "step": 7340 + }, + { + "epoch": 0.04365900656580074, + "grad_norm": 2.28857684135437, + "learning_rate": 4.976527553434456e-05, + "loss": 5.833, + "step": 7341 + }, + { + "epoch": 0.04366495384908174, + "grad_norm": 2.327331781387329, + "learning_rate": 4.976521167267307e-05, + "loss": 5.934, + "step": 7342 + }, + { + "epoch": 0.04367090113236274, + "grad_norm": 1.7726761102676392, + "learning_rate": 4.976514780235631e-05, + "loss": 6.034, + "step": 7343 + }, + { + "epoch": 0.04367684841564373, + "grad_norm": 2.180790662765503, + "learning_rate": 4.9765083923394285e-05, + "loss": 6.1377, + "step": 7344 + }, + { + "epoch": 0.043682795698924734, + "grad_norm": 2.031378984451294, + "learning_rate": 4.9765020035787024e-05, + "loss": 5.7203, + "step": 7345 + }, + { + "epoch": 0.04368874298220573, + "grad_norm": 2.453611135482788, + "learning_rate": 4.9764956139534545e-05, + "loss": 5.9798, + "step": 7346 + }, + { + "epoch": 0.043694690265486724, + "grad_norm": 2.3802528381347656, + "learning_rate": 4.976489223463688e-05, + "loss": 5.9343, + "step": 7347 + }, + { + "epoch": 0.043700637548767726, + "grad_norm": 2.771704912185669, + "learning_rate": 4.976482832109406e-05, + "loss": 6.5202, + "step": 7348 + }, + { + "epoch": 0.04370658483204872, + "grad_norm": 1.9455180168151855, + "learning_rate": 4.9764764398906084e-05, + "loss": 6.1159, + "step": 7349 + }, + { + "epoch": 0.043712532115329716, + "grad_norm": 1.9527102708816528, + "learning_rate": 4.9764700468072976e-05, + "loss": 5.7773, + "step": 7350 + }, + { + "epoch": 0.04371847939861071, + "grad_norm": 1.9531358480453491, + "learning_rate": 4.976463652859478e-05, + "loss": 5.9918, + "step": 7351 + }, + { + "epoch": 0.043724426681891713, + "grad_norm": 2.375239849090576, + "learning_rate": 4.97645725804715e-05, + "loss": 5.5054, + "step": 7352 + }, + { + "epoch": 0.04373037396517271, + "grad_norm": 2.156553030014038, + "learning_rate": 4.9764508623703166e-05, + "loss": 5.664, + "step": 7353 + }, + { + "epoch": 0.043736321248453704, + "grad_norm": 2.317331075668335, + "learning_rate": 4.9764444658289796e-05, + "loss": 5.4473, + "step": 7354 + }, + { + "epoch": 0.043742268531734706, + "grad_norm": 2.1958348751068115, + "learning_rate": 4.976438068423141e-05, + "loss": 5.3584, + "step": 7355 + }, + { + "epoch": 0.0437482158150157, + "grad_norm": 2.152045249938965, + "learning_rate": 4.976431670152803e-05, + "loss": 5.4388, + "step": 7356 + }, + { + "epoch": 0.043754163098296696, + "grad_norm": 2.0661544799804688, + "learning_rate": 4.976425271017971e-05, + "loss": 5.3866, + "step": 7357 + }, + { + "epoch": 0.0437601103815777, + "grad_norm": 2.106480598449707, + "learning_rate": 4.976418871018642e-05, + "loss": 5.5928, + "step": 7358 + }, + { + "epoch": 0.04376605766485869, + "grad_norm": 2.5921759605407715, + "learning_rate": 4.976412470154821e-05, + "loss": 6.0133, + "step": 7359 + }, + { + "epoch": 0.04377200494813969, + "grad_norm": 2.4117794036865234, + "learning_rate": 4.97640606842651e-05, + "loss": 6.0988, + "step": 7360 + }, + { + "epoch": 0.04377795223142069, + "grad_norm": 1.9839050769805908, + "learning_rate": 4.976399665833712e-05, + "loss": 5.9568, + "step": 7361 + }, + { + "epoch": 0.043783899514701685, + "grad_norm": 2.166215419769287, + "learning_rate": 4.9763932623764285e-05, + "loss": 5.9205, + "step": 7362 + }, + { + "epoch": 0.04378984679798268, + "grad_norm": 2.8216545581817627, + "learning_rate": 4.9763868580546616e-05, + "loss": 5.792, + "step": 7363 + }, + { + "epoch": 0.043795794081263675, + "grad_norm": 2.907707929611206, + "learning_rate": 4.976380452868413e-05, + "loss": 5.5824, + "step": 7364 + }, + { + "epoch": 0.04380174136454468, + "grad_norm": 2.173025369644165, + "learning_rate": 4.976374046817686e-05, + "loss": 6.2752, + "step": 7365 + }, + { + "epoch": 0.04380768864782567, + "grad_norm": 2.1098685264587402, + "learning_rate": 4.9763676399024814e-05, + "loss": 5.8052, + "step": 7366 + }, + { + "epoch": 0.04381363593110667, + "grad_norm": 2.1980762481689453, + "learning_rate": 4.9763612321228035e-05, + "loss": 5.3456, + "step": 7367 + }, + { + "epoch": 0.04381958321438767, + "grad_norm": 2.091327667236328, + "learning_rate": 4.976354823478654e-05, + "loss": 5.211, + "step": 7368 + }, + { + "epoch": 0.043825530497668665, + "grad_norm": 2.37920880317688, + "learning_rate": 4.976348413970033e-05, + "loss": 5.8652, + "step": 7369 + }, + { + "epoch": 0.04383147778094966, + "grad_norm": 2.454202175140381, + "learning_rate": 4.976342003596946e-05, + "loss": 5.9654, + "step": 7370 + }, + { + "epoch": 0.04383742506423066, + "grad_norm": 2.04577898979187, + "learning_rate": 4.9763355923593927e-05, + "loss": 6.3042, + "step": 7371 + }, + { + "epoch": 0.04384337234751166, + "grad_norm": 2.358250141143799, + "learning_rate": 4.976329180257376e-05, + "loss": 6.1403, + "step": 7372 + }, + { + "epoch": 0.04384931963079265, + "grad_norm": 2.177819013595581, + "learning_rate": 4.9763227672909e-05, + "loss": 5.8993, + "step": 7373 + }, + { + "epoch": 0.043855266914073654, + "grad_norm": 2.24910569190979, + "learning_rate": 4.976316353459963e-05, + "loss": 5.9763, + "step": 7374 + }, + { + "epoch": 0.04386121419735465, + "grad_norm": 2.3985965251922607, + "learning_rate": 4.976309938764571e-05, + "loss": 6.2288, + "step": 7375 + }, + { + "epoch": 0.043867161480635644, + "grad_norm": 2.1250808238983154, + "learning_rate": 4.9763035232047244e-05, + "loss": 6.1588, + "step": 7376 + }, + { + "epoch": 0.043873108763916646, + "grad_norm": 1.9815669059753418, + "learning_rate": 4.976297106780426e-05, + "loss": 6.3202, + "step": 7377 + }, + { + "epoch": 0.04387905604719764, + "grad_norm": 2.181999683380127, + "learning_rate": 4.976290689491677e-05, + "loss": 5.9125, + "step": 7378 + }, + { + "epoch": 0.043885003330478636, + "grad_norm": 2.365546703338623, + "learning_rate": 4.9762842713384815e-05, + "loss": 6.0991, + "step": 7379 + }, + { + "epoch": 0.04389095061375963, + "grad_norm": 2.0843441486358643, + "learning_rate": 4.9762778523208406e-05, + "loss": 5.9675, + "step": 7380 + }, + { + "epoch": 0.04389689789704063, + "grad_norm": 2.271576404571533, + "learning_rate": 4.9762714324387566e-05, + "loss": 5.5703, + "step": 7381 + }, + { + "epoch": 0.04390284518032163, + "grad_norm": 2.244211435317993, + "learning_rate": 4.9762650116922314e-05, + "loss": 5.4674, + "step": 7382 + }, + { + "epoch": 0.043908792463602624, + "grad_norm": 1.728034257888794, + "learning_rate": 4.9762585900812684e-05, + "loss": 5.6264, + "step": 7383 + }, + { + "epoch": 0.043914739746883626, + "grad_norm": 2.400587320327759, + "learning_rate": 4.976252167605869e-05, + "loss": 6.052, + "step": 7384 + }, + { + "epoch": 0.04392068703016462, + "grad_norm": 1.9865821599960327, + "learning_rate": 4.9762457442660346e-05, + "loss": 5.8544, + "step": 7385 + }, + { + "epoch": 0.043926634313445616, + "grad_norm": 2.236527681350708, + "learning_rate": 4.97623932006177e-05, + "loss": 5.5033, + "step": 7386 + }, + { + "epoch": 0.04393258159672662, + "grad_norm": 2.0424020290374756, + "learning_rate": 4.9762328949930746e-05, + "loss": 5.4088, + "step": 7387 + }, + { + "epoch": 0.04393852888000761, + "grad_norm": 2.0601999759674072, + "learning_rate": 4.976226469059952e-05, + "loss": 5.8599, + "step": 7388 + }, + { + "epoch": 0.04394447616328861, + "grad_norm": 2.5052783489227295, + "learning_rate": 4.976220042262404e-05, + "loss": 5.8202, + "step": 7389 + }, + { + "epoch": 0.04395042344656961, + "grad_norm": 2.178549289703369, + "learning_rate": 4.9762136146004344e-05, + "loss": 5.4554, + "step": 7390 + }, + { + "epoch": 0.043956370729850605, + "grad_norm": 1.9407802820205688, + "learning_rate": 4.976207186074043e-05, + "loss": 5.4062, + "step": 7391 + }, + { + "epoch": 0.0439623180131316, + "grad_norm": 1.4814093112945557, + "learning_rate": 4.9762007566832336e-05, + "loss": 5.4662, + "step": 7392 + }, + { + "epoch": 0.043968265296412595, + "grad_norm": 1.8808835744857788, + "learning_rate": 4.9761943264280086e-05, + "loss": 6.1617, + "step": 7393 + }, + { + "epoch": 0.0439742125796936, + "grad_norm": 1.9318643808364868, + "learning_rate": 4.97618789530837e-05, + "loss": 6.1357, + "step": 7394 + }, + { + "epoch": 0.04398015986297459, + "grad_norm": 2.2515900135040283, + "learning_rate": 4.976181463324319e-05, + "loss": 6.11, + "step": 7395 + }, + { + "epoch": 0.04398610714625559, + "grad_norm": 2.375298023223877, + "learning_rate": 4.9761750304758584e-05, + "loss": 6.1121, + "step": 7396 + }, + { + "epoch": 0.04399205442953659, + "grad_norm": 2.2254321575164795, + "learning_rate": 4.9761685967629914e-05, + "loss": 6.0136, + "step": 7397 + }, + { + "epoch": 0.043998001712817585, + "grad_norm": 2.146164894104004, + "learning_rate": 4.976162162185719e-05, + "loss": 5.8391, + "step": 7398 + }, + { + "epoch": 0.04400394899609858, + "grad_norm": 2.3237650394439697, + "learning_rate": 4.976155726744044e-05, + "loss": 5.461, + "step": 7399 + }, + { + "epoch": 0.04400989627937958, + "grad_norm": 2.2263002395629883, + "learning_rate": 4.976149290437969e-05, + "loss": 5.5885, + "step": 7400 + }, + { + "epoch": 0.04401584356266058, + "grad_norm": 1.9597729444503784, + "learning_rate": 4.9761428532674956e-05, + "loss": 5.348, + "step": 7401 + }, + { + "epoch": 0.04402179084594157, + "grad_norm": 2.2215018272399902, + "learning_rate": 4.976136415232626e-05, + "loss": 5.933, + "step": 7402 + }, + { + "epoch": 0.044027738129222574, + "grad_norm": 2.258618116378784, + "learning_rate": 4.9761299763333635e-05, + "loss": 6.0685, + "step": 7403 + }, + { + "epoch": 0.04403368541250357, + "grad_norm": 2.3045873641967773, + "learning_rate": 4.976123536569709e-05, + "loss": 5.7277, + "step": 7404 + }, + { + "epoch": 0.044039632695784564, + "grad_norm": 2.546252489089966, + "learning_rate": 4.976117095941666e-05, + "loss": 5.8839, + "step": 7405 + }, + { + "epoch": 0.044045579979065566, + "grad_norm": 1.8963768482208252, + "learning_rate": 4.976110654449235e-05, + "loss": 6.1247, + "step": 7406 + }, + { + "epoch": 0.04405152726234656, + "grad_norm": 2.6287784576416016, + "learning_rate": 4.976104212092421e-05, + "loss": 5.9712, + "step": 7407 + }, + { + "epoch": 0.044057474545627556, + "grad_norm": 2.562612295150757, + "learning_rate": 4.976097768871223e-05, + "loss": 6.1226, + "step": 7408 + }, + { + "epoch": 0.04406342182890855, + "grad_norm": 2.2308688163757324, + "learning_rate": 4.976091324785645e-05, + "loss": 6.3235, + "step": 7409 + }, + { + "epoch": 0.04406936911218955, + "grad_norm": 2.4595553874969482, + "learning_rate": 4.976084879835691e-05, + "loss": 5.8164, + "step": 7410 + }, + { + "epoch": 0.04407531639547055, + "grad_norm": 2.3693978786468506, + "learning_rate": 4.97607843402136e-05, + "loss": 5.7727, + "step": 7411 + }, + { + "epoch": 0.044081263678751544, + "grad_norm": 4.144592761993408, + "learning_rate": 4.9760719873426546e-05, + "loss": 5.6382, + "step": 7412 + }, + { + "epoch": 0.044087210962032546, + "grad_norm": 2.5423779487609863, + "learning_rate": 4.9760655397995794e-05, + "loss": 5.7526, + "step": 7413 + }, + { + "epoch": 0.04409315824531354, + "grad_norm": 2.119281053543091, + "learning_rate": 4.976059091392135e-05, + "loss": 5.7246, + "step": 7414 + }, + { + "epoch": 0.044099105528594536, + "grad_norm": 2.177074432373047, + "learning_rate": 4.976052642120324e-05, + "loss": 5.7296, + "step": 7415 + }, + { + "epoch": 0.04410505281187554, + "grad_norm": 1.8897806406021118, + "learning_rate": 4.9760461919841486e-05, + "loss": 5.6349, + "step": 7416 + }, + { + "epoch": 0.04411100009515653, + "grad_norm": 2.445082187652588, + "learning_rate": 4.97603974098361e-05, + "loss": 5.7414, + "step": 7417 + }, + { + "epoch": 0.04411694737843753, + "grad_norm": 2.2564280033111572, + "learning_rate": 4.976033289118713e-05, + "loss": 5.6709, + "step": 7418 + }, + { + "epoch": 0.04412289466171853, + "grad_norm": 2.1907529830932617, + "learning_rate": 4.976026836389458e-05, + "loss": 5.6067, + "step": 7419 + }, + { + "epoch": 0.044128841944999525, + "grad_norm": 2.1872594356536865, + "learning_rate": 4.976020382795848e-05, + "loss": 5.5166, + "step": 7420 + }, + { + "epoch": 0.04413478922828052, + "grad_norm": 1.7740691900253296, + "learning_rate": 4.9760139283378835e-05, + "loss": 5.5833, + "step": 7421 + }, + { + "epoch": 0.044140736511561515, + "grad_norm": 2.128389358520508, + "learning_rate": 4.976007473015569e-05, + "loss": 5.6403, + "step": 7422 + }, + { + "epoch": 0.04414668379484252, + "grad_norm": 2.6193220615386963, + "learning_rate": 4.9760010168289053e-05, + "loss": 5.8139, + "step": 7423 + }, + { + "epoch": 0.04415263107812351, + "grad_norm": 2.727902412414551, + "learning_rate": 4.9759945597778955e-05, + "loss": 5.3286, + "step": 7424 + }, + { + "epoch": 0.04415857836140451, + "grad_norm": 2.4500436782836914, + "learning_rate": 4.975988101862542e-05, + "loss": 5.2647, + "step": 7425 + }, + { + "epoch": 0.04416452564468551, + "grad_norm": 2.1040356159210205, + "learning_rate": 4.975981643082846e-05, + "loss": 6.0935, + "step": 7426 + }, + { + "epoch": 0.044170472927966505, + "grad_norm": 1.9168792963027954, + "learning_rate": 4.975975183438811e-05, + "loss": 5.5147, + "step": 7427 + }, + { + "epoch": 0.0441764202112475, + "grad_norm": 2.0156469345092773, + "learning_rate": 4.9759687229304384e-05, + "loss": 6.2896, + "step": 7428 + }, + { + "epoch": 0.0441823674945285, + "grad_norm": 2.362933874130249, + "learning_rate": 4.975962261557731e-05, + "loss": 5.9514, + "step": 7429 + }, + { + "epoch": 0.0441883147778095, + "grad_norm": 2.2892727851867676, + "learning_rate": 4.9759557993206906e-05, + "loss": 5.5646, + "step": 7430 + }, + { + "epoch": 0.04419426206109049, + "grad_norm": 2.287722587585449, + "learning_rate": 4.97594933621932e-05, + "loss": 5.364, + "step": 7431 + }, + { + "epoch": 0.044200209344371494, + "grad_norm": 2.0421855449676514, + "learning_rate": 4.9759428722536194e-05, + "loss": 5.6838, + "step": 7432 + }, + { + "epoch": 0.04420615662765249, + "grad_norm": 2.2392499446868896, + "learning_rate": 4.9759364074235944e-05, + "loss": 6.0727, + "step": 7433 + }, + { + "epoch": 0.044212103910933484, + "grad_norm": 2.084768295288086, + "learning_rate": 4.975929941729245e-05, + "loss": 6.1208, + "step": 7434 + }, + { + "epoch": 0.044218051194214486, + "grad_norm": 1.817015528678894, + "learning_rate": 4.975923475170574e-05, + "loss": 6.3405, + "step": 7435 + }, + { + "epoch": 0.04422399847749548, + "grad_norm": 1.974926233291626, + "learning_rate": 4.9759170077475834e-05, + "loss": 5.9607, + "step": 7436 + }, + { + "epoch": 0.044229945760776476, + "grad_norm": 2.1244025230407715, + "learning_rate": 4.975910539460277e-05, + "loss": 6.2579, + "step": 7437 + }, + { + "epoch": 0.04423589304405747, + "grad_norm": 1.9459706544876099, + "learning_rate": 4.975904070308655e-05, + "loss": 5.5877, + "step": 7438 + }, + { + "epoch": 0.04424184032733847, + "grad_norm": 2.1891977787017822, + "learning_rate": 4.97589760029272e-05, + "loss": 5.9913, + "step": 7439 + }, + { + "epoch": 0.04424778761061947, + "grad_norm": 2.0368902683258057, + "learning_rate": 4.9758911294124756e-05, + "loss": 5.9478, + "step": 7440 + }, + { + "epoch": 0.044253734893900463, + "grad_norm": 2.2937796115875244, + "learning_rate": 4.975884657667922e-05, + "loss": 6.1529, + "step": 7441 + }, + { + "epoch": 0.044259682177181466, + "grad_norm": 2.601637125015259, + "learning_rate": 4.975878185059064e-05, + "loss": 5.4446, + "step": 7442 + }, + { + "epoch": 0.04426562946046246, + "grad_norm": 2.2025954723358154, + "learning_rate": 4.975871711585902e-05, + "loss": 5.8911, + "step": 7443 + }, + { + "epoch": 0.044271576743743456, + "grad_norm": 2.0498836040496826, + "learning_rate": 4.975865237248438e-05, + "loss": 6.0604, + "step": 7444 + }, + { + "epoch": 0.04427752402702446, + "grad_norm": 2.308239459991455, + "learning_rate": 4.975858762046676e-05, + "loss": 5.9599, + "step": 7445 + }, + { + "epoch": 0.04428347131030545, + "grad_norm": 2.286747455596924, + "learning_rate": 4.9758522859806165e-05, + "loss": 6.3528, + "step": 7446 + }, + { + "epoch": 0.04428941859358645, + "grad_norm": 2.2376902103424072, + "learning_rate": 4.975845809050264e-05, + "loss": 6.205, + "step": 7447 + }, + { + "epoch": 0.04429536587686745, + "grad_norm": 1.8052057027816772, + "learning_rate": 4.9758393312556176e-05, + "loss": 6.2188, + "step": 7448 + }, + { + "epoch": 0.044301313160148445, + "grad_norm": 1.9839476346969604, + "learning_rate": 4.975832852596682e-05, + "loss": 6.1479, + "step": 7449 + }, + { + "epoch": 0.04430726044342944, + "grad_norm": 1.8890517950057983, + "learning_rate": 4.975826373073459e-05, + "loss": 6.2524, + "step": 7450 + }, + { + "epoch": 0.04431320772671044, + "grad_norm": 2.049192428588867, + "learning_rate": 4.97581989268595e-05, + "loss": 5.5486, + "step": 7451 + }, + { + "epoch": 0.04431915500999144, + "grad_norm": 2.8271291255950928, + "learning_rate": 4.975813411434158e-05, + "loss": 5.1916, + "step": 7452 + }, + { + "epoch": 0.04432510229327243, + "grad_norm": 1.94833505153656, + "learning_rate": 4.975806929318085e-05, + "loss": 5.6747, + "step": 7453 + }, + { + "epoch": 0.04433104957655343, + "grad_norm": 2.14536190032959, + "learning_rate": 4.975800446337734e-05, + "loss": 5.4066, + "step": 7454 + }, + { + "epoch": 0.04433699685983443, + "grad_norm": 2.5557188987731934, + "learning_rate": 4.975793962493106e-05, + "loss": 5.2257, + "step": 7455 + }, + { + "epoch": 0.044342944143115424, + "grad_norm": 2.4718832969665527, + "learning_rate": 4.975787477784205e-05, + "loss": 6.0248, + "step": 7456 + }, + { + "epoch": 0.04434889142639642, + "grad_norm": 2.8627419471740723, + "learning_rate": 4.975780992211031e-05, + "loss": 5.3245, + "step": 7457 + }, + { + "epoch": 0.04435483870967742, + "grad_norm": 2.932990789413452, + "learning_rate": 4.9757745057735876e-05, + "loss": 4.8914, + "step": 7458 + }, + { + "epoch": 0.04436078599295842, + "grad_norm": 2.6231770515441895, + "learning_rate": 4.975768018471877e-05, + "loss": 5.3323, + "step": 7459 + }, + { + "epoch": 0.04436673327623941, + "grad_norm": 2.5591986179351807, + "learning_rate": 4.975761530305901e-05, + "loss": 5.4972, + "step": 7460 + }, + { + "epoch": 0.044372680559520414, + "grad_norm": 2.4060492515563965, + "learning_rate": 4.975755041275664e-05, + "loss": 5.5988, + "step": 7461 + }, + { + "epoch": 0.04437862784280141, + "grad_norm": 2.377260446548462, + "learning_rate": 4.975748551381164e-05, + "loss": 5.2137, + "step": 7462 + }, + { + "epoch": 0.044384575126082404, + "grad_norm": 2.171934127807617, + "learning_rate": 4.9757420606224076e-05, + "loss": 5.6313, + "step": 7463 + }, + { + "epoch": 0.044390522409363406, + "grad_norm": 2.1225788593292236, + "learning_rate": 4.975735568999394e-05, + "loss": 5.839, + "step": 7464 + }, + { + "epoch": 0.0443964696926444, + "grad_norm": 2.271127939224243, + "learning_rate": 4.975729076512128e-05, + "loss": 5.7111, + "step": 7465 + }, + { + "epoch": 0.044402416975925396, + "grad_norm": 2.7138264179229736, + "learning_rate": 4.975722583160609e-05, + "loss": 5.3169, + "step": 7466 + }, + { + "epoch": 0.04440836425920639, + "grad_norm": 2.8181982040405273, + "learning_rate": 4.9757160889448416e-05, + "loss": 5.3323, + "step": 7467 + }, + { + "epoch": 0.04441431154248739, + "grad_norm": 2.680816411972046, + "learning_rate": 4.975709593864828e-05, + "loss": 5.6924, + "step": 7468 + }, + { + "epoch": 0.04442025882576839, + "grad_norm": 2.3682074546813965, + "learning_rate": 4.975703097920569e-05, + "loss": 6.0049, + "step": 7469 + }, + { + "epoch": 0.04442620610904938, + "grad_norm": 2.3080508708953857, + "learning_rate": 4.9756966011120674e-05, + "loss": 6.4438, + "step": 7470 + }, + { + "epoch": 0.044432153392330385, + "grad_norm": 2.2631113529205322, + "learning_rate": 4.9756901034393265e-05, + "loss": 5.9296, + "step": 7471 + }, + { + "epoch": 0.04443810067561138, + "grad_norm": 2.283712148666382, + "learning_rate": 4.975683604902347e-05, + "loss": 5.831, + "step": 7472 + }, + { + "epoch": 0.044444047958892376, + "grad_norm": 2.2130608558654785, + "learning_rate": 4.975677105501132e-05, + "loss": 5.8757, + "step": 7473 + }, + { + "epoch": 0.04444999524217338, + "grad_norm": 1.9392763376235962, + "learning_rate": 4.975670605235684e-05, + "loss": 5.5836, + "step": 7474 + }, + { + "epoch": 0.04445594252545437, + "grad_norm": 2.097076416015625, + "learning_rate": 4.975664104106005e-05, + "loss": 6.0782, + "step": 7475 + }, + { + "epoch": 0.04446188980873537, + "grad_norm": 2.063021183013916, + "learning_rate": 4.975657602112097e-05, + "loss": 6.2171, + "step": 7476 + }, + { + "epoch": 0.04446783709201637, + "grad_norm": 2.4466049671173096, + "learning_rate": 4.9756510992539626e-05, + "loss": 5.8649, + "step": 7477 + }, + { + "epoch": 0.044473784375297365, + "grad_norm": 2.2160751819610596, + "learning_rate": 4.975644595531605e-05, + "loss": 5.9297, + "step": 7478 + }, + { + "epoch": 0.04447973165857836, + "grad_norm": 2.69352650642395, + "learning_rate": 4.975638090945024e-05, + "loss": 6.1062, + "step": 7479 + }, + { + "epoch": 0.04448567894185936, + "grad_norm": 2.2830610275268555, + "learning_rate": 4.975631585494224e-05, + "loss": 6.1663, + "step": 7480 + }, + { + "epoch": 0.04449162622514036, + "grad_norm": 2.936842203140259, + "learning_rate": 4.975625079179206e-05, + "loss": 5.9952, + "step": 7481 + }, + { + "epoch": 0.04449757350842135, + "grad_norm": 2.1398322582244873, + "learning_rate": 4.9756185719999725e-05, + "loss": 6.0005, + "step": 7482 + }, + { + "epoch": 0.04450352079170235, + "grad_norm": 2.2835536003112793, + "learning_rate": 4.9756120639565275e-05, + "loss": 5.7155, + "step": 7483 + }, + { + "epoch": 0.04450946807498335, + "grad_norm": 2.22917103767395, + "learning_rate": 4.975605555048871e-05, + "loss": 5.7134, + "step": 7484 + }, + { + "epoch": 0.044515415358264344, + "grad_norm": 2.0195605754852295, + "learning_rate": 4.975599045277006e-05, + "loss": 5.6369, + "step": 7485 + }, + { + "epoch": 0.04452136264154534, + "grad_norm": 1.8495477437973022, + "learning_rate": 4.975592534640936e-05, + "loss": 5.9035, + "step": 7486 + }, + { + "epoch": 0.04452730992482634, + "grad_norm": 2.4814226627349854, + "learning_rate": 4.9755860231406616e-05, + "loss": 6.1024, + "step": 7487 + }, + { + "epoch": 0.04453325720810734, + "grad_norm": 2.221820831298828, + "learning_rate": 4.975579510776186e-05, + "loss": 6.1193, + "step": 7488 + }, + { + "epoch": 0.04453920449138833, + "grad_norm": 1.935722827911377, + "learning_rate": 4.975572997547511e-05, + "loss": 6.1088, + "step": 7489 + }, + { + "epoch": 0.044545151774669334, + "grad_norm": 2.1287481784820557, + "learning_rate": 4.975566483454638e-05, + "loss": 6.1064, + "step": 7490 + }, + { + "epoch": 0.04455109905795033, + "grad_norm": 2.1914093494415283, + "learning_rate": 4.9755599684975716e-05, + "loss": 6.072, + "step": 7491 + }, + { + "epoch": 0.044557046341231324, + "grad_norm": 2.1979966163635254, + "learning_rate": 4.975553452676312e-05, + "loss": 6.1447, + "step": 7492 + }, + { + "epoch": 0.044562993624512326, + "grad_norm": 2.108259916305542, + "learning_rate": 4.975546935990863e-05, + "loss": 6.0109, + "step": 7493 + }, + { + "epoch": 0.04456894090779332, + "grad_norm": 2.2454450130462646, + "learning_rate": 4.975540418441226e-05, + "loss": 5.8627, + "step": 7494 + }, + { + "epoch": 0.044574888191074316, + "grad_norm": 2.151130437850952, + "learning_rate": 4.9755339000274027e-05, + "loss": 6.0241, + "step": 7495 + }, + { + "epoch": 0.04458083547435531, + "grad_norm": 1.9150489568710327, + "learning_rate": 4.975527380749397e-05, + "loss": 6.0179, + "step": 7496 + }, + { + "epoch": 0.04458678275763631, + "grad_norm": 1.9065133333206177, + "learning_rate": 4.97552086060721e-05, + "loss": 5.9991, + "step": 7497 + }, + { + "epoch": 0.04459273004091731, + "grad_norm": 1.9627622365951538, + "learning_rate": 4.975514339600844e-05, + "loss": 5.9633, + "step": 7498 + }, + { + "epoch": 0.0445986773241983, + "grad_norm": 1.7777502536773682, + "learning_rate": 4.975507817730302e-05, + "loss": 5.9426, + "step": 7499 + }, + { + "epoch": 0.044604624607479305, + "grad_norm": 1.6735023260116577, + "learning_rate": 4.9755012949955846e-05, + "loss": 5.9432, + "step": 7500 + }, + { + "epoch": 0.0446105718907603, + "grad_norm": 2.1570491790771484, + "learning_rate": 4.975494771396697e-05, + "loss": 6.2032, + "step": 7501 + }, + { + "epoch": 0.044616519174041296, + "grad_norm": 2.286522150039673, + "learning_rate": 4.9754882469336387e-05, + "loss": 5.7226, + "step": 7502 + }, + { + "epoch": 0.0446224664573223, + "grad_norm": 2.1940622329711914, + "learning_rate": 4.975481721606413e-05, + "loss": 6.2215, + "step": 7503 + }, + { + "epoch": 0.04462841374060329, + "grad_norm": 2.329263210296631, + "learning_rate": 4.9754751954150224e-05, + "loss": 5.5403, + "step": 7504 + }, + { + "epoch": 0.04463436102388429, + "grad_norm": 2.112712860107422, + "learning_rate": 4.975468668359469e-05, + "loss": 5.7581, + "step": 7505 + }, + { + "epoch": 0.04464030830716529, + "grad_norm": 2.2875239849090576, + "learning_rate": 4.975462140439755e-05, + "loss": 5.9593, + "step": 7506 + }, + { + "epoch": 0.044646255590446285, + "grad_norm": 2.282121419906616, + "learning_rate": 4.975455611655883e-05, + "loss": 5.8684, + "step": 7507 + }, + { + "epoch": 0.04465220287372728, + "grad_norm": 1.8482197523117065, + "learning_rate": 4.975449082007855e-05, + "loss": 5.753, + "step": 7508 + }, + { + "epoch": 0.04465815015700828, + "grad_norm": 2.6635684967041016, + "learning_rate": 4.9754425514956724e-05, + "loss": 5.0732, + "step": 7509 + }, + { + "epoch": 0.04466409744028928, + "grad_norm": 2.6632800102233887, + "learning_rate": 4.9754360201193395e-05, + "loss": 5.1644, + "step": 7510 + }, + { + "epoch": 0.04467004472357027, + "grad_norm": 2.630445718765259, + "learning_rate": 4.9754294878788574e-05, + "loss": 5.0322, + "step": 7511 + }, + { + "epoch": 0.04467599200685127, + "grad_norm": 2.4036223888397217, + "learning_rate": 4.975422954774228e-05, + "loss": 4.8949, + "step": 7512 + }, + { + "epoch": 0.04468193929013227, + "grad_norm": 2.381810426712036, + "learning_rate": 4.9754164208054535e-05, + "loss": 5.7921, + "step": 7513 + }, + { + "epoch": 0.044687886573413264, + "grad_norm": 2.570949077606201, + "learning_rate": 4.9754098859725377e-05, + "loss": 5.9612, + "step": 7514 + }, + { + "epoch": 0.04469383385669426, + "grad_norm": 2.510998010635376, + "learning_rate": 4.9754033502754815e-05, + "loss": 5.7273, + "step": 7515 + }, + { + "epoch": 0.04469978113997526, + "grad_norm": 2.6216115951538086, + "learning_rate": 4.975396813714288e-05, + "loss": 5.7601, + "step": 7516 + }, + { + "epoch": 0.04470572842325626, + "grad_norm": 2.5298542976379395, + "learning_rate": 4.975390276288958e-05, + "loss": 5.8007, + "step": 7517 + }, + { + "epoch": 0.04471167570653725, + "grad_norm": 2.6195290088653564, + "learning_rate": 4.975383737999496e-05, + "loss": 5.6071, + "step": 7518 + }, + { + "epoch": 0.044717622989818254, + "grad_norm": 2.5432629585266113, + "learning_rate": 4.975377198845902e-05, + "loss": 6.0224, + "step": 7519 + }, + { + "epoch": 0.04472357027309925, + "grad_norm": 2.2290337085723877, + "learning_rate": 4.97537065882818e-05, + "loss": 5.7141, + "step": 7520 + }, + { + "epoch": 0.044729517556380244, + "grad_norm": 2.627206802368164, + "learning_rate": 4.975364117946332e-05, + "loss": 6.2518, + "step": 7521 + }, + { + "epoch": 0.044735464839661246, + "grad_norm": 2.386993169784546, + "learning_rate": 4.975357576200359e-05, + "loss": 6.0494, + "step": 7522 + }, + { + "epoch": 0.04474141212294224, + "grad_norm": 2.20511794090271, + "learning_rate": 4.9753510335902656e-05, + "loss": 6.2563, + "step": 7523 + }, + { + "epoch": 0.044747359406223236, + "grad_norm": 2.5564749240875244, + "learning_rate": 4.975344490116052e-05, + "loss": 6.2498, + "step": 7524 + }, + { + "epoch": 0.04475330668950423, + "grad_norm": 2.6001932621002197, + "learning_rate": 4.975337945777721e-05, + "loss": 5.6721, + "step": 7525 + }, + { + "epoch": 0.04475925397278523, + "grad_norm": 2.6677772998809814, + "learning_rate": 4.975331400575275e-05, + "loss": 5.88, + "step": 7526 + }, + { + "epoch": 0.04476520125606623, + "grad_norm": 3.616734027862549, + "learning_rate": 4.975324854508716e-05, + "loss": 5.4835, + "step": 7527 + }, + { + "epoch": 0.04477114853934722, + "grad_norm": 3.0301461219787598, + "learning_rate": 4.975318307578048e-05, + "loss": 5.326, + "step": 7528 + }, + { + "epoch": 0.044777095822628225, + "grad_norm": 2.029836893081665, + "learning_rate": 4.975311759783271e-05, + "loss": 5.3516, + "step": 7529 + }, + { + "epoch": 0.04478304310590922, + "grad_norm": 1.9886969327926636, + "learning_rate": 4.9753052111243885e-05, + "loss": 5.3442, + "step": 7530 + }, + { + "epoch": 0.044788990389190216, + "grad_norm": 2.4227612018585205, + "learning_rate": 4.975298661601403e-05, + "loss": 5.4273, + "step": 7531 + }, + { + "epoch": 0.04479493767247122, + "grad_norm": 2.8426849842071533, + "learning_rate": 4.975292111214316e-05, + "loss": 5.6604, + "step": 7532 + }, + { + "epoch": 0.04480088495575221, + "grad_norm": 2.4818854331970215, + "learning_rate": 4.97528555996313e-05, + "loss": 6.4941, + "step": 7533 + }, + { + "epoch": 0.04480683223903321, + "grad_norm": 2.291642904281616, + "learning_rate": 4.9752790078478465e-05, + "loss": 6.404, + "step": 7534 + }, + { + "epoch": 0.04481277952231421, + "grad_norm": 2.4973669052124023, + "learning_rate": 4.9752724548684695e-05, + "loss": 5.6068, + "step": 7535 + }, + { + "epoch": 0.044818726805595205, + "grad_norm": 2.273130416870117, + "learning_rate": 4.975265901025001e-05, + "loss": 6.1689, + "step": 7536 + }, + { + "epoch": 0.0448246740888762, + "grad_norm": 3.362520456314087, + "learning_rate": 4.9752593463174424e-05, + "loss": 5.5346, + "step": 7537 + }, + { + "epoch": 0.0448306213721572, + "grad_norm": 5.170871257781982, + "learning_rate": 4.9752527907457956e-05, + "loss": 5.3831, + "step": 7538 + }, + { + "epoch": 0.0448365686554382, + "grad_norm": 4.224242687225342, + "learning_rate": 4.975246234310064e-05, + "loss": 5.2511, + "step": 7539 + }, + { + "epoch": 0.04484251593871919, + "grad_norm": 3.1753036975860596, + "learning_rate": 4.97523967701025e-05, + "loss": 5.06, + "step": 7540 + }, + { + "epoch": 0.04484846322200019, + "grad_norm": 2.4226467609405518, + "learning_rate": 4.975233118846355e-05, + "loss": 5.5225, + "step": 7541 + }, + { + "epoch": 0.04485441050528119, + "grad_norm": 2.5356781482696533, + "learning_rate": 4.9752265598183814e-05, + "loss": 5.5865, + "step": 7542 + }, + { + "epoch": 0.044860357788562184, + "grad_norm": 2.1505908966064453, + "learning_rate": 4.9752199999263326e-05, + "loss": 5.7436, + "step": 7543 + }, + { + "epoch": 0.04486630507184318, + "grad_norm": 2.675703763961792, + "learning_rate": 4.97521343917021e-05, + "loss": 5.3693, + "step": 7544 + }, + { + "epoch": 0.04487225235512418, + "grad_norm": 3.5228023529052734, + "learning_rate": 4.975206877550015e-05, + "loss": 4.8527, + "step": 7545 + }, + { + "epoch": 0.044878199638405177, + "grad_norm": 3.1165566444396973, + "learning_rate": 4.975200315065752e-05, + "loss": 4.7971, + "step": 7546 + }, + { + "epoch": 0.04488414692168617, + "grad_norm": 2.6216177940368652, + "learning_rate": 4.975193751717421e-05, + "loss": 4.9328, + "step": 7547 + }, + { + "epoch": 0.044890094204967174, + "grad_norm": 2.352031707763672, + "learning_rate": 4.975187187505026e-05, + "loss": 5.0021, + "step": 7548 + }, + { + "epoch": 0.04489604148824817, + "grad_norm": 1.8147127628326416, + "learning_rate": 4.975180622428569e-05, + "loss": 5.7009, + "step": 7549 + }, + { + "epoch": 0.044901988771529164, + "grad_norm": 2.1674726009368896, + "learning_rate": 4.9751740564880516e-05, + "loss": 5.2545, + "step": 7550 + }, + { + "epoch": 0.044907936054810166, + "grad_norm": 2.2935330867767334, + "learning_rate": 4.975167489683477e-05, + "loss": 5.2351, + "step": 7551 + }, + { + "epoch": 0.04491388333809116, + "grad_norm": 2.2964932918548584, + "learning_rate": 4.975160922014846e-05, + "loss": 5.483, + "step": 7552 + }, + { + "epoch": 0.044919830621372156, + "grad_norm": 1.8180936574935913, + "learning_rate": 4.9751543534821635e-05, + "loss": 5.668, + "step": 7553 + }, + { + "epoch": 0.04492577790465315, + "grad_norm": 1.906435251235962, + "learning_rate": 4.9751477840854286e-05, + "loss": 5.6664, + "step": 7554 + }, + { + "epoch": 0.04493172518793415, + "grad_norm": 2.459702253341675, + "learning_rate": 4.9751412138246455e-05, + "loss": 5.5272, + "step": 7555 + }, + { + "epoch": 0.04493767247121515, + "grad_norm": 2.1219170093536377, + "learning_rate": 4.975134642699817e-05, + "loss": 5.638, + "step": 7556 + }, + { + "epoch": 0.04494361975449614, + "grad_norm": 2.1492953300476074, + "learning_rate": 4.975128070710944e-05, + "loss": 5.9422, + "step": 7557 + }, + { + "epoch": 0.044949567037777145, + "grad_norm": 1.813988208770752, + "learning_rate": 4.97512149785803e-05, + "loss": 5.9875, + "step": 7558 + }, + { + "epoch": 0.04495551432105814, + "grad_norm": 1.6336817741394043, + "learning_rate": 4.975114924141075e-05, + "loss": 5.9245, + "step": 7559 + }, + { + "epoch": 0.044961461604339135, + "grad_norm": 1.9339455366134644, + "learning_rate": 4.9751083495600847e-05, + "loss": 5.3263, + "step": 7560 + }, + { + "epoch": 0.04496740888762014, + "grad_norm": 2.3459293842315674, + "learning_rate": 4.975101774115059e-05, + "loss": 5.4625, + "step": 7561 + }, + { + "epoch": 0.04497335617090113, + "grad_norm": 2.2994346618652344, + "learning_rate": 4.9750951978060004e-05, + "loss": 5.6327, + "step": 7562 + }, + { + "epoch": 0.04497930345418213, + "grad_norm": 2.1627299785614014, + "learning_rate": 4.975088620632912e-05, + "loss": 5.4882, + "step": 7563 + }, + { + "epoch": 0.04498525073746313, + "grad_norm": 2.763397693634033, + "learning_rate": 4.9750820425957954e-05, + "loss": 5.727, + "step": 7564 + }, + { + "epoch": 0.044991198020744125, + "grad_norm": 2.0107216835021973, + "learning_rate": 4.975075463694654e-05, + "loss": 5.3852, + "step": 7565 + }, + { + "epoch": 0.04499714530402512, + "grad_norm": 1.8424763679504395, + "learning_rate": 4.975068883929489e-05, + "loss": 5.3072, + "step": 7566 + }, + { + "epoch": 0.04500309258730612, + "grad_norm": 1.946702003479004, + "learning_rate": 4.975062303300303e-05, + "loss": 5.3184, + "step": 7567 + }, + { + "epoch": 0.04500903987058712, + "grad_norm": 2.1091182231903076, + "learning_rate": 4.9750557218070984e-05, + "loss": 5.0689, + "step": 7568 + }, + { + "epoch": 0.04501498715386811, + "grad_norm": 2.0064187049865723, + "learning_rate": 4.975049139449877e-05, + "loss": 4.8495, + "step": 7569 + }, + { + "epoch": 0.04502093443714911, + "grad_norm": 1.7544279098510742, + "learning_rate": 4.9750425562286416e-05, + "loss": 4.9524, + "step": 7570 + }, + { + "epoch": 0.04502688172043011, + "grad_norm": 2.0814568996429443, + "learning_rate": 4.9750359721433945e-05, + "loss": 4.798, + "step": 7571 + }, + { + "epoch": 0.045032829003711104, + "grad_norm": 2.1185543537139893, + "learning_rate": 4.975029387194139e-05, + "loss": 4.9313, + "step": 7572 + }, + { + "epoch": 0.0450387762869921, + "grad_norm": 2.3774518966674805, + "learning_rate": 4.975022801380875e-05, + "loss": 5.5954, + "step": 7573 + }, + { + "epoch": 0.0450447235702731, + "grad_norm": 2.261306047439575, + "learning_rate": 4.975016214703606e-05, + "loss": 5.5598, + "step": 7574 + }, + { + "epoch": 0.045050670853554096, + "grad_norm": 2.128244161605835, + "learning_rate": 4.975009627162335e-05, + "loss": 5.359, + "step": 7575 + }, + { + "epoch": 0.04505661813683509, + "grad_norm": 2.0767438411712646, + "learning_rate": 4.975003038757064e-05, + "loss": 5.6855, + "step": 7576 + }, + { + "epoch": 0.045062565420116094, + "grad_norm": 1.9789010286331177, + "learning_rate": 4.974996449487794e-05, + "loss": 5.1807, + "step": 7577 + }, + { + "epoch": 0.04506851270339709, + "grad_norm": 1.9136112928390503, + "learning_rate": 4.97498985935453e-05, + "loss": 5.3811, + "step": 7578 + }, + { + "epoch": 0.045074459986678084, + "grad_norm": 2.150641441345215, + "learning_rate": 4.974983268357271e-05, + "loss": 5.3281, + "step": 7579 + }, + { + "epoch": 0.045080407269959086, + "grad_norm": 1.9636656045913696, + "learning_rate": 4.9749766764960215e-05, + "loss": 5.5003, + "step": 7580 + }, + { + "epoch": 0.04508635455324008, + "grad_norm": 1.826335072517395, + "learning_rate": 4.974970083770783e-05, + "loss": 5.4687, + "step": 7581 + }, + { + "epoch": 0.045092301836521076, + "grad_norm": 1.9246041774749756, + "learning_rate": 4.974963490181558e-05, + "loss": 5.5373, + "step": 7582 + }, + { + "epoch": 0.04509824911980207, + "grad_norm": 1.8421686887741089, + "learning_rate": 4.974956895728349e-05, + "loss": 5.386, + "step": 7583 + }, + { + "epoch": 0.04510419640308307, + "grad_norm": 1.8685556650161743, + "learning_rate": 4.974950300411158e-05, + "loss": 5.5857, + "step": 7584 + }, + { + "epoch": 0.04511014368636407, + "grad_norm": 1.7022168636322021, + "learning_rate": 4.974943704229987e-05, + "loss": 5.2562, + "step": 7585 + }, + { + "epoch": 0.04511609096964506, + "grad_norm": 1.876855731010437, + "learning_rate": 4.97493710718484e-05, + "loss": 5.1359, + "step": 7586 + }, + { + "epoch": 0.045122038252926065, + "grad_norm": 1.8728361129760742, + "learning_rate": 4.974930509275717e-05, + "loss": 5.3124, + "step": 7587 + }, + { + "epoch": 0.04512798553620706, + "grad_norm": 1.930086612701416, + "learning_rate": 4.974923910502622e-05, + "loss": 5.3261, + "step": 7588 + }, + { + "epoch": 0.045133932819488055, + "grad_norm": 2.0309081077575684, + "learning_rate": 4.9749173108655564e-05, + "loss": 5.1138, + "step": 7589 + }, + { + "epoch": 0.04513988010276906, + "grad_norm": 2.042174816131592, + "learning_rate": 4.974910710364522e-05, + "loss": 5.3521, + "step": 7590 + }, + { + "epoch": 0.04514582738605005, + "grad_norm": 1.5278770923614502, + "learning_rate": 4.9749041089995224e-05, + "loss": 5.4075, + "step": 7591 + }, + { + "epoch": 0.04515177466933105, + "grad_norm": 1.7624976634979248, + "learning_rate": 4.974897506770559e-05, + "loss": 5.1698, + "step": 7592 + }, + { + "epoch": 0.04515772195261205, + "grad_norm": 1.9077380895614624, + "learning_rate": 4.974890903677635e-05, + "loss": 5.3973, + "step": 7593 + }, + { + "epoch": 0.045163669235893045, + "grad_norm": 1.5724380016326904, + "learning_rate": 4.974884299720752e-05, + "loss": 5.6325, + "step": 7594 + }, + { + "epoch": 0.04516961651917404, + "grad_norm": 1.9702832698822021, + "learning_rate": 4.974877694899913e-05, + "loss": 5.247, + "step": 7595 + }, + { + "epoch": 0.04517556380245504, + "grad_norm": 1.9913853406906128, + "learning_rate": 4.974871089215118e-05, + "loss": 5.6393, + "step": 7596 + }, + { + "epoch": 0.04518151108573604, + "grad_norm": 1.806470274925232, + "learning_rate": 4.974864482666372e-05, + "loss": 5.302, + "step": 7597 + }, + { + "epoch": 0.04518745836901703, + "grad_norm": 1.7056912183761597, + "learning_rate": 4.974857875253678e-05, + "loss": 5.4066, + "step": 7598 + }, + { + "epoch": 0.04519340565229803, + "grad_norm": 1.5990647077560425, + "learning_rate": 4.974851266977035e-05, + "loss": 5.4087, + "step": 7599 + }, + { + "epoch": 0.04519935293557903, + "grad_norm": 1.9233685731887817, + "learning_rate": 4.974844657836447e-05, + "loss": 5.4891, + "step": 7600 + }, + { + "epoch": 0.045205300218860024, + "grad_norm": 1.8654414415359497, + "learning_rate": 4.9748380478319165e-05, + "loss": 5.4955, + "step": 7601 + }, + { + "epoch": 0.04521124750214102, + "grad_norm": 1.7592424154281616, + "learning_rate": 4.974831436963446e-05, + "loss": 5.2298, + "step": 7602 + }, + { + "epoch": 0.04521719478542202, + "grad_norm": 1.8132792711257935, + "learning_rate": 4.974824825231037e-05, + "loss": 5.3487, + "step": 7603 + }, + { + "epoch": 0.045223142068703016, + "grad_norm": 1.8109947443008423, + "learning_rate": 4.974818212634692e-05, + "loss": 5.4511, + "step": 7604 + }, + { + "epoch": 0.04522908935198401, + "grad_norm": 1.96711266040802, + "learning_rate": 4.974811599174414e-05, + "loss": 5.3249, + "step": 7605 + }, + { + "epoch": 0.045235036635265014, + "grad_norm": 1.9123655557632446, + "learning_rate": 4.9748049848502054e-05, + "loss": 5.3681, + "step": 7606 + }, + { + "epoch": 0.04524098391854601, + "grad_norm": 1.7210376262664795, + "learning_rate": 4.974798369662067e-05, + "loss": 5.3441, + "step": 7607 + }, + { + "epoch": 0.045246931201827004, + "grad_norm": 1.590617060661316, + "learning_rate": 4.974791753610002e-05, + "loss": 5.5619, + "step": 7608 + }, + { + "epoch": 0.045252878485108006, + "grad_norm": 1.77785062789917, + "learning_rate": 4.974785136694013e-05, + "loss": 5.4717, + "step": 7609 + }, + { + "epoch": 0.045258825768389, + "grad_norm": 1.66475510597229, + "learning_rate": 4.9747785189141025e-05, + "loss": 5.3501, + "step": 7610 + }, + { + "epoch": 0.045264773051669996, + "grad_norm": 1.9176442623138428, + "learning_rate": 4.974771900270272e-05, + "loss": 5.1197, + "step": 7611 + }, + { + "epoch": 0.04527072033495099, + "grad_norm": 1.8143234252929688, + "learning_rate": 4.974765280762525e-05, + "loss": 5.3103, + "step": 7612 + }, + { + "epoch": 0.04527666761823199, + "grad_norm": 1.8954168558120728, + "learning_rate": 4.974758660390861e-05, + "loss": 5.2009, + "step": 7613 + }, + { + "epoch": 0.04528261490151299, + "grad_norm": 1.7779622077941895, + "learning_rate": 4.974752039155286e-05, + "loss": 5.519, + "step": 7614 + }, + { + "epoch": 0.04528856218479398, + "grad_norm": 1.8181761503219604, + "learning_rate": 4.9747454170558e-05, + "loss": 5.4967, + "step": 7615 + }, + { + "epoch": 0.045294509468074985, + "grad_norm": 1.657665491104126, + "learning_rate": 4.9747387940924064e-05, + "loss": 5.6437, + "step": 7616 + }, + { + "epoch": 0.04530045675135598, + "grad_norm": 1.7993237972259521, + "learning_rate": 4.974732170265107e-05, + "loss": 5.3094, + "step": 7617 + }, + { + "epoch": 0.045306404034636975, + "grad_norm": 1.8798805475234985, + "learning_rate": 4.974725545573904e-05, + "loss": 5.3268, + "step": 7618 + }, + { + "epoch": 0.04531235131791798, + "grad_norm": 1.9271420240402222, + "learning_rate": 4.974718920018799e-05, + "loss": 5.3405, + "step": 7619 + }, + { + "epoch": 0.04531829860119897, + "grad_norm": 1.9256294965744019, + "learning_rate": 4.9747122935997967e-05, + "loss": 5.3118, + "step": 7620 + }, + { + "epoch": 0.04532424588447997, + "grad_norm": 2.3345041275024414, + "learning_rate": 4.9747056663168965e-05, + "loss": 4.9813, + "step": 7621 + }, + { + "epoch": 0.04533019316776097, + "grad_norm": 1.7056258916854858, + "learning_rate": 4.974699038170103e-05, + "loss": 5.4725, + "step": 7622 + }, + { + "epoch": 0.045336140451041965, + "grad_norm": 2.075711250305176, + "learning_rate": 4.9746924091594174e-05, + "loss": 5.2215, + "step": 7623 + }, + { + "epoch": 0.04534208773432296, + "grad_norm": 1.818048357963562, + "learning_rate": 4.974685779284843e-05, + "loss": 5.0463, + "step": 7624 + }, + { + "epoch": 0.04534803501760396, + "grad_norm": 1.6590908765792847, + "learning_rate": 4.9746791485463806e-05, + "loss": 5.2476, + "step": 7625 + }, + { + "epoch": 0.04535398230088496, + "grad_norm": 2.2024991512298584, + "learning_rate": 4.974672516944033e-05, + "loss": 5.6437, + "step": 7626 + }, + { + "epoch": 0.04535992958416595, + "grad_norm": 1.71639883518219, + "learning_rate": 4.974665884477803e-05, + "loss": 5.2418, + "step": 7627 + }, + { + "epoch": 0.04536587686744695, + "grad_norm": 1.75436270236969, + "learning_rate": 4.974659251147693e-05, + "loss": 5.2209, + "step": 7628 + }, + { + "epoch": 0.04537182415072795, + "grad_norm": 2.577916383743286, + "learning_rate": 4.974652616953705e-05, + "loss": 5.2385, + "step": 7629 + }, + { + "epoch": 0.045377771434008944, + "grad_norm": 1.9784717559814453, + "learning_rate": 4.9746459818958416e-05, + "loss": 5.265, + "step": 7630 + }, + { + "epoch": 0.04538371871728994, + "grad_norm": 1.971383810043335, + "learning_rate": 4.974639345974104e-05, + "loss": 5.0548, + "step": 7631 + }, + { + "epoch": 0.04538966600057094, + "grad_norm": 2.096876621246338, + "learning_rate": 4.974632709188496e-05, + "loss": 5.1491, + "step": 7632 + }, + { + "epoch": 0.045395613283851936, + "grad_norm": 1.6079102754592896, + "learning_rate": 4.974626071539019e-05, + "loss": 5.1959, + "step": 7633 + }, + { + "epoch": 0.04540156056713293, + "grad_norm": 1.6881030797958374, + "learning_rate": 4.9746194330256755e-05, + "loss": 5.1772, + "step": 7634 + }, + { + "epoch": 0.04540750785041393, + "grad_norm": 1.7459675073623657, + "learning_rate": 4.974612793648469e-05, + "loss": 5.1885, + "step": 7635 + }, + { + "epoch": 0.04541345513369493, + "grad_norm": 1.739272117614746, + "learning_rate": 4.9746061534073993e-05, + "loss": 5.318, + "step": 7636 + }, + { + "epoch": 0.045419402416975924, + "grad_norm": 1.7761027812957764, + "learning_rate": 4.974599512302471e-05, + "loss": 5.1525, + "step": 7637 + }, + { + "epoch": 0.045425349700256926, + "grad_norm": 1.8695855140686035, + "learning_rate": 4.9745928703336854e-05, + "loss": 5.5754, + "step": 7638 + }, + { + "epoch": 0.04543129698353792, + "grad_norm": 1.8737404346466064, + "learning_rate": 4.9745862275010446e-05, + "loss": 5.2908, + "step": 7639 + }, + { + "epoch": 0.045437244266818916, + "grad_norm": 1.731676459312439, + "learning_rate": 4.9745795838045515e-05, + "loss": 5.2671, + "step": 7640 + }, + { + "epoch": 0.04544319155009991, + "grad_norm": 1.6687474250793457, + "learning_rate": 4.974572939244209e-05, + "loss": 5.1629, + "step": 7641 + }, + { + "epoch": 0.04544913883338091, + "grad_norm": 2.1376633644104004, + "learning_rate": 4.974566293820018e-05, + "loss": 5.2853, + "step": 7642 + }, + { + "epoch": 0.04545508611666191, + "grad_norm": 2.0989861488342285, + "learning_rate": 4.974559647531981e-05, + "loss": 5.1311, + "step": 7643 + }, + { + "epoch": 0.0454610333999429, + "grad_norm": 2.3433620929718018, + "learning_rate": 4.974553000380102e-05, + "loss": 4.9854, + "step": 7644 + }, + { + "epoch": 0.045466980683223905, + "grad_norm": 2.306170701980591, + "learning_rate": 4.974546352364381e-05, + "loss": 5.3152, + "step": 7645 + }, + { + "epoch": 0.0454729279665049, + "grad_norm": 1.9588537216186523, + "learning_rate": 4.974539703484822e-05, + "loss": 5.3903, + "step": 7646 + }, + { + "epoch": 0.045478875249785895, + "grad_norm": 1.7994736433029175, + "learning_rate": 4.9745330537414265e-05, + "loss": 5.2505, + "step": 7647 + }, + { + "epoch": 0.0454848225330669, + "grad_norm": 1.983175277709961, + "learning_rate": 4.974526403134197e-05, + "loss": 5.2607, + "step": 7648 + }, + { + "epoch": 0.04549076981634789, + "grad_norm": 1.8853832483291626, + "learning_rate": 4.974519751663136e-05, + "loss": 5.1475, + "step": 7649 + }, + { + "epoch": 0.04549671709962889, + "grad_norm": 1.9374700784683228, + "learning_rate": 4.9745130993282464e-05, + "loss": 5.2039, + "step": 7650 + }, + { + "epoch": 0.04550266438290989, + "grad_norm": 1.8200404644012451, + "learning_rate": 4.974506446129529e-05, + "loss": 5.2794, + "step": 7651 + }, + { + "epoch": 0.045508611666190885, + "grad_norm": 1.8375320434570312, + "learning_rate": 4.974499792066987e-05, + "loss": 5.1149, + "step": 7652 + }, + { + "epoch": 0.04551455894947188, + "grad_norm": 1.7842520475387573, + "learning_rate": 4.974493137140623e-05, + "loss": 5.0332, + "step": 7653 + }, + { + "epoch": 0.04552050623275288, + "grad_norm": 2.0220818519592285, + "learning_rate": 4.974486481350439e-05, + "loss": 5.0277, + "step": 7654 + }, + { + "epoch": 0.04552645351603388, + "grad_norm": 2.0787746906280518, + "learning_rate": 4.9744798246964375e-05, + "loss": 5.0587, + "step": 7655 + }, + { + "epoch": 0.04553240079931487, + "grad_norm": 1.7024985551834106, + "learning_rate": 4.97447316717862e-05, + "loss": 5.0184, + "step": 7656 + }, + { + "epoch": 0.04553834808259587, + "grad_norm": 1.9057540893554688, + "learning_rate": 4.97446650879699e-05, + "loss": 5.3945, + "step": 7657 + }, + { + "epoch": 0.04554429536587687, + "grad_norm": 1.7963287830352783, + "learning_rate": 4.974459849551549e-05, + "loss": 4.9869, + "step": 7658 + }, + { + "epoch": 0.045550242649157864, + "grad_norm": 2.027353286743164, + "learning_rate": 4.974453189442299e-05, + "loss": 5.1389, + "step": 7659 + }, + { + "epoch": 0.04555618993243886, + "grad_norm": 1.7137126922607422, + "learning_rate": 4.9744465284692445e-05, + "loss": 5.058, + "step": 7660 + }, + { + "epoch": 0.04556213721571986, + "grad_norm": 2.0363876819610596, + "learning_rate": 4.9744398666323854e-05, + "loss": 4.9174, + "step": 7661 + }, + { + "epoch": 0.045568084499000856, + "grad_norm": 2.1440837383270264, + "learning_rate": 4.9744332039317255e-05, + "loss": 4.8894, + "step": 7662 + }, + { + "epoch": 0.04557403178228185, + "grad_norm": 1.9582308530807495, + "learning_rate": 4.9744265403672655e-05, + "loss": 5.0666, + "step": 7663 + }, + { + "epoch": 0.04557997906556285, + "grad_norm": 1.9997116327285767, + "learning_rate": 4.97441987593901e-05, + "loss": 5.0804, + "step": 7664 + }, + { + "epoch": 0.04558592634884385, + "grad_norm": 2.067361831665039, + "learning_rate": 4.9744132106469586e-05, + "loss": 4.8655, + "step": 7665 + }, + { + "epoch": 0.045591873632124844, + "grad_norm": 1.7066930532455444, + "learning_rate": 4.9744065444911165e-05, + "loss": 4.792, + "step": 7666 + }, + { + "epoch": 0.045597820915405846, + "grad_norm": 1.8526182174682617, + "learning_rate": 4.974399877471484e-05, + "loss": 4.755, + "step": 7667 + }, + { + "epoch": 0.04560376819868684, + "grad_norm": 1.8744564056396484, + "learning_rate": 4.9743932095880644e-05, + "loss": 4.7732, + "step": 7668 + }, + { + "epoch": 0.045609715481967836, + "grad_norm": 1.849574327468872, + "learning_rate": 4.97438654084086e-05, + "loss": 4.7743, + "step": 7669 + }, + { + "epoch": 0.04561566276524884, + "grad_norm": 1.87284255027771, + "learning_rate": 4.9743798712298714e-05, + "loss": 5.0582, + "step": 7670 + }, + { + "epoch": 0.04562161004852983, + "grad_norm": 2.206273078918457, + "learning_rate": 4.974373200755104e-05, + "loss": 5.4683, + "step": 7671 + }, + { + "epoch": 0.04562755733181083, + "grad_norm": 1.9849058389663696, + "learning_rate": 4.974366529416557e-05, + "loss": 5.4087, + "step": 7672 + }, + { + "epoch": 0.04563350461509182, + "grad_norm": 1.9440083503723145, + "learning_rate": 4.974359857214235e-05, + "loss": 4.9607, + "step": 7673 + }, + { + "epoch": 0.045639451898372825, + "grad_norm": 1.7112319469451904, + "learning_rate": 4.974353184148139e-05, + "loss": 5.6589, + "step": 7674 + }, + { + "epoch": 0.04564539918165382, + "grad_norm": 1.921215295791626, + "learning_rate": 4.974346510218273e-05, + "loss": 5.4495, + "step": 7675 + }, + { + "epoch": 0.045651346464934815, + "grad_norm": 1.9582061767578125, + "learning_rate": 4.974339835424637e-05, + "loss": 5.2459, + "step": 7676 + }, + { + "epoch": 0.04565729374821582, + "grad_norm": 1.9781824350357056, + "learning_rate": 4.974333159767235e-05, + "loss": 5.3424, + "step": 7677 + }, + { + "epoch": 0.04566324103149681, + "grad_norm": 1.7183479070663452, + "learning_rate": 4.974326483246069e-05, + "loss": 5.3741, + "step": 7678 + }, + { + "epoch": 0.04566918831477781, + "grad_norm": 1.7942447662353516, + "learning_rate": 4.974319805861141e-05, + "loss": 5.4008, + "step": 7679 + }, + { + "epoch": 0.04567513559805881, + "grad_norm": 1.8255115747451782, + "learning_rate": 4.974313127612454e-05, + "loss": 5.1849, + "step": 7680 + }, + { + "epoch": 0.045681082881339805, + "grad_norm": 1.7907564640045166, + "learning_rate": 4.974306448500009e-05, + "loss": 5.1757, + "step": 7681 + }, + { + "epoch": 0.0456870301646208, + "grad_norm": 2.911489486694336, + "learning_rate": 4.97429976852381e-05, + "loss": 4.8909, + "step": 7682 + }, + { + "epoch": 0.0456929774479018, + "grad_norm": 2.849125623703003, + "learning_rate": 4.9742930876838576e-05, + "loss": 4.7733, + "step": 7683 + }, + { + "epoch": 0.0456989247311828, + "grad_norm": 2.4196949005126953, + "learning_rate": 4.9742864059801565e-05, + "loss": 4.8571, + "step": 7684 + }, + { + "epoch": 0.04570487201446379, + "grad_norm": 1.9430558681488037, + "learning_rate": 4.974279723412706e-05, + "loss": 5.1338, + "step": 7685 + }, + { + "epoch": 0.04571081929774479, + "grad_norm": 1.7538554668426514, + "learning_rate": 4.9742730399815105e-05, + "loss": 5.5524, + "step": 7686 + }, + { + "epoch": 0.04571676658102579, + "grad_norm": 2.006115198135376, + "learning_rate": 4.9742663556865724e-05, + "loss": 5.3343, + "step": 7687 + }, + { + "epoch": 0.045722713864306784, + "grad_norm": 2.554234027862549, + "learning_rate": 4.974259670527893e-05, + "loss": 5.8426, + "step": 7688 + }, + { + "epoch": 0.04572866114758778, + "grad_norm": 2.656747579574585, + "learning_rate": 4.974252984505475e-05, + "loss": 5.1578, + "step": 7689 + }, + { + "epoch": 0.04573460843086878, + "grad_norm": 2.800208568572998, + "learning_rate": 4.9742462976193216e-05, + "loss": 4.8019, + "step": 7690 + }, + { + "epoch": 0.045740555714149776, + "grad_norm": 2.674938201904297, + "learning_rate": 4.974239609869433e-05, + "loss": 4.7177, + "step": 7691 + }, + { + "epoch": 0.04574650299743077, + "grad_norm": 2.751533269882202, + "learning_rate": 4.974232921255815e-05, + "loss": 4.7568, + "step": 7692 + }, + { + "epoch": 0.04575245028071177, + "grad_norm": 2.623917818069458, + "learning_rate": 4.974226231778466e-05, + "loss": 4.5908, + "step": 7693 + }, + { + "epoch": 0.04575839756399277, + "grad_norm": 2.2248899936676025, + "learning_rate": 4.9742195414373904e-05, + "loss": 5.4066, + "step": 7694 + }, + { + "epoch": 0.045764344847273764, + "grad_norm": 1.7959388494491577, + "learning_rate": 4.974212850232591e-05, + "loss": 6.1414, + "step": 7695 + }, + { + "epoch": 0.045770292130554766, + "grad_norm": 2.0049352645874023, + "learning_rate": 4.974206158164069e-05, + "loss": 6.0106, + "step": 7696 + }, + { + "epoch": 0.04577623941383576, + "grad_norm": 2.4794270992279053, + "learning_rate": 4.9741994652318276e-05, + "loss": 5.8647, + "step": 7697 + }, + { + "epoch": 0.045782186697116756, + "grad_norm": 3.9380109310150146, + "learning_rate": 4.974192771435868e-05, + "loss": 5.719, + "step": 7698 + }, + { + "epoch": 0.04578813398039776, + "grad_norm": 2.564023017883301, + "learning_rate": 4.974186076776194e-05, + "loss": 4.7294, + "step": 7699 + }, + { + "epoch": 0.04579408126367875, + "grad_norm": 3.7082693576812744, + "learning_rate": 4.974179381252807e-05, + "loss": 5.1975, + "step": 7700 + }, + { + "epoch": 0.04580002854695975, + "grad_norm": 4.0067524909973145, + "learning_rate": 4.97417268486571e-05, + "loss": 5.4047, + "step": 7701 + }, + { + "epoch": 0.04580597583024074, + "grad_norm": 3.978787660598755, + "learning_rate": 4.974165987614904e-05, + "loss": 5.7023, + "step": 7702 + }, + { + "epoch": 0.045811923113521745, + "grad_norm": 4.597605228424072, + "learning_rate": 4.974159289500392e-05, + "loss": 6.5186, + "step": 7703 + }, + { + "epoch": 0.04581787039680274, + "grad_norm": 2.8793985843658447, + "learning_rate": 4.974152590522177e-05, + "loss": 6.1476, + "step": 7704 + }, + { + "epoch": 0.045823817680083735, + "grad_norm": 2.466089963912964, + "learning_rate": 4.974145890680262e-05, + "loss": 5.5154, + "step": 7705 + }, + { + "epoch": 0.04582976496336474, + "grad_norm": 2.937228202819824, + "learning_rate": 4.974139189974647e-05, + "loss": 5.5146, + "step": 7706 + }, + { + "epoch": 0.04583571224664573, + "grad_norm": 2.4580399990081787, + "learning_rate": 4.974132488405336e-05, + "loss": 6.214, + "step": 7707 + }, + { + "epoch": 0.04584165952992673, + "grad_norm": 4.910717010498047, + "learning_rate": 4.97412578597233e-05, + "loss": 5.819, + "step": 7708 + }, + { + "epoch": 0.04584760681320773, + "grad_norm": 5.372139930725098, + "learning_rate": 4.974119082675634e-05, + "loss": 5.3242, + "step": 7709 + }, + { + "epoch": 0.045853554096488724, + "grad_norm": 2.050492525100708, + "learning_rate": 4.9741123785152474e-05, + "loss": 6.0468, + "step": 7710 + }, + { + "epoch": 0.04585950137976972, + "grad_norm": 1.7090541124343872, + "learning_rate": 4.974105673491174e-05, + "loss": 5.7652, + "step": 7711 + }, + { + "epoch": 0.04586544866305072, + "grad_norm": 2.512538194656372, + "learning_rate": 4.974098967603415e-05, + "loss": 5.3184, + "step": 7712 + }, + { + "epoch": 0.04587139594633172, + "grad_norm": 3.311289072036743, + "learning_rate": 4.974092260851975e-05, + "loss": 5.5379, + "step": 7713 + }, + { + "epoch": 0.04587734322961271, + "grad_norm": 3.3318710327148438, + "learning_rate": 4.974085553236854e-05, + "loss": 5.5543, + "step": 7714 + }, + { + "epoch": 0.04588329051289371, + "grad_norm": 2.6384379863739014, + "learning_rate": 4.9740788447580555e-05, + "loss": 6.3475, + "step": 7715 + }, + { + "epoch": 0.04588923779617471, + "grad_norm": 2.0066304206848145, + "learning_rate": 4.974072135415582e-05, + "loss": 6.3685, + "step": 7716 + }, + { + "epoch": 0.045895185079455704, + "grad_norm": 2.4189116954803467, + "learning_rate": 4.9740654252094356e-05, + "loss": 5.4128, + "step": 7717 + }, + { + "epoch": 0.0459011323627367, + "grad_norm": 2.431011438369751, + "learning_rate": 4.974058714139618e-05, + "loss": 5.34, + "step": 7718 + }, + { + "epoch": 0.0459070796460177, + "grad_norm": 2.1997156143188477, + "learning_rate": 4.974052002206132e-05, + "loss": 5.4223, + "step": 7719 + }, + { + "epoch": 0.045913026929298696, + "grad_norm": 2.0700082778930664, + "learning_rate": 4.9740452894089806e-05, + "loss": 5.4255, + "step": 7720 + }, + { + "epoch": 0.04591897421257969, + "grad_norm": 2.3476040363311768, + "learning_rate": 4.974038575748165e-05, + "loss": 5.5055, + "step": 7721 + }, + { + "epoch": 0.04592492149586069, + "grad_norm": 4.2995524406433105, + "learning_rate": 4.974031861223688e-05, + "loss": 5.8869, + "step": 7722 + }, + { + "epoch": 0.04593086877914169, + "grad_norm": 4.690639495849609, + "learning_rate": 4.974025145835552e-05, + "loss": 6.0808, + "step": 7723 + }, + { + "epoch": 0.04593681606242268, + "grad_norm": 3.9823479652404785, + "learning_rate": 4.97401842958376e-05, + "loss": 6.0844, + "step": 7724 + }, + { + "epoch": 0.045942763345703685, + "grad_norm": 3.69808030128479, + "learning_rate": 4.9740117124683136e-05, + "loss": 5.9611, + "step": 7725 + }, + { + "epoch": 0.04594871062898468, + "grad_norm": 2.5912535190582275, + "learning_rate": 4.974004994489215e-05, + "loss": 5.9669, + "step": 7726 + }, + { + "epoch": 0.045954657912265676, + "grad_norm": 2.0894482135772705, + "learning_rate": 4.973998275646467e-05, + "loss": 5.6717, + "step": 7727 + }, + { + "epoch": 0.04596060519554668, + "grad_norm": 2.179302930831909, + "learning_rate": 4.973991555940072e-05, + "loss": 5.4077, + "step": 7728 + }, + { + "epoch": 0.04596655247882767, + "grad_norm": 2.4919214248657227, + "learning_rate": 4.973984835370031e-05, + "loss": 6.118, + "step": 7729 + }, + { + "epoch": 0.04597249976210867, + "grad_norm": 3.5036723613739014, + "learning_rate": 4.9739781139363485e-05, + "loss": 5.436, + "step": 7730 + }, + { + "epoch": 0.04597844704538966, + "grad_norm": 4.129561424255371, + "learning_rate": 4.973971391639026e-05, + "loss": 4.8414, + "step": 7731 + }, + { + "epoch": 0.045984394328670665, + "grad_norm": 2.867039203643799, + "learning_rate": 4.973964668478065e-05, + "loss": 4.7385, + "step": 7732 + }, + { + "epoch": 0.04599034161195166, + "grad_norm": 2.754023313522339, + "learning_rate": 4.973957944453469e-05, + "loss": 4.6063, + "step": 7733 + }, + { + "epoch": 0.045996288895232655, + "grad_norm": 2.1025235652923584, + "learning_rate": 4.973951219565239e-05, + "loss": 5.3233, + "step": 7734 + }, + { + "epoch": 0.04600223617851366, + "grad_norm": 2.352883815765381, + "learning_rate": 4.973944493813379e-05, + "loss": 5.5648, + "step": 7735 + }, + { + "epoch": 0.04600818346179465, + "grad_norm": 2.049377679824829, + "learning_rate": 4.97393776719789e-05, + "loss": 6.1241, + "step": 7736 + }, + { + "epoch": 0.04601413074507565, + "grad_norm": 1.7124110460281372, + "learning_rate": 4.9739310397187756e-05, + "loss": 6.1258, + "step": 7737 + }, + { + "epoch": 0.04602007802835665, + "grad_norm": 2.2592861652374268, + "learning_rate": 4.9739243113760364e-05, + "loss": 6.1972, + "step": 7738 + }, + { + "epoch": 0.046026025311637644, + "grad_norm": 2.3926188945770264, + "learning_rate": 4.973917582169677e-05, + "loss": 6.1681, + "step": 7739 + }, + { + "epoch": 0.04603197259491864, + "grad_norm": 1.9956084489822388, + "learning_rate": 4.973910852099698e-05, + "loss": 6.2068, + "step": 7740 + }, + { + "epoch": 0.04603791987819964, + "grad_norm": 1.924467921257019, + "learning_rate": 4.973904121166102e-05, + "loss": 6.4391, + "step": 7741 + }, + { + "epoch": 0.04604386716148064, + "grad_norm": 1.9410041570663452, + "learning_rate": 4.973897389368891e-05, + "loss": 5.9378, + "step": 7742 + }, + { + "epoch": 0.04604981444476163, + "grad_norm": 2.0418617725372314, + "learning_rate": 4.9738906567080686e-05, + "loss": 5.8823, + "step": 7743 + }, + { + "epoch": 0.04605576172804263, + "grad_norm": 2.696143627166748, + "learning_rate": 4.973883923183637e-05, + "loss": 5.8551, + "step": 7744 + }, + { + "epoch": 0.04606170901132363, + "grad_norm": 2.482703447341919, + "learning_rate": 4.973877188795598e-05, + "loss": 5.5752, + "step": 7745 + }, + { + "epoch": 0.046067656294604624, + "grad_norm": 2.520437240600586, + "learning_rate": 4.973870453543954e-05, + "loss": 5.571, + "step": 7746 + }, + { + "epoch": 0.04607360357788562, + "grad_norm": 2.568150758743286, + "learning_rate": 4.973863717428707e-05, + "loss": 5.9145, + "step": 7747 + }, + { + "epoch": 0.04607955086116662, + "grad_norm": 2.6373183727264404, + "learning_rate": 4.9738569804498605e-05, + "loss": 5.9414, + "step": 7748 + }, + { + "epoch": 0.046085498144447616, + "grad_norm": 2.1663565635681152, + "learning_rate": 4.973850242607415e-05, + "loss": 6.2316, + "step": 7749 + }, + { + "epoch": 0.04609144542772861, + "grad_norm": 2.044316053390503, + "learning_rate": 4.973843503901374e-05, + "loss": 5.7232, + "step": 7750 + }, + { + "epoch": 0.04609739271100961, + "grad_norm": 2.1740782260894775, + "learning_rate": 4.9738367643317405e-05, + "loss": 6.0388, + "step": 7751 + }, + { + "epoch": 0.04610333999429061, + "grad_norm": 2.0643458366394043, + "learning_rate": 4.973830023898516e-05, + "loss": 5.8201, + "step": 7752 + }, + { + "epoch": 0.0461092872775716, + "grad_norm": 1.7433217763900757, + "learning_rate": 4.973823282601703e-05, + "loss": 6.0464, + "step": 7753 + }, + { + "epoch": 0.046115234560852605, + "grad_norm": 2.657677412033081, + "learning_rate": 4.9738165404413037e-05, + "loss": 5.2849, + "step": 7754 + }, + { + "epoch": 0.0461211818441336, + "grad_norm": 1.7317034006118774, + "learning_rate": 4.9738097974173205e-05, + "loss": 6.0619, + "step": 7755 + }, + { + "epoch": 0.046127129127414596, + "grad_norm": 1.6109949350357056, + "learning_rate": 4.973803053529756e-05, + "loss": 5.7832, + "step": 7756 + }, + { + "epoch": 0.0461330764106956, + "grad_norm": 2.2980475425720215, + "learning_rate": 4.9737963087786125e-05, + "loss": 5.4346, + "step": 7757 + }, + { + "epoch": 0.04613902369397659, + "grad_norm": 2.5162737369537354, + "learning_rate": 4.973789563163892e-05, + "loss": 5.3723, + "step": 7758 + }, + { + "epoch": 0.04614497097725759, + "grad_norm": 2.3493261337280273, + "learning_rate": 4.973782816685597e-05, + "loss": 5.7474, + "step": 7759 + }, + { + "epoch": 0.04615091826053858, + "grad_norm": 2.1428544521331787, + "learning_rate": 4.9737760693437306e-05, + "loss": 5.6318, + "step": 7760 + }, + { + "epoch": 0.046156865543819585, + "grad_norm": 2.11627197265625, + "learning_rate": 4.973769321138294e-05, + "loss": 5.38, + "step": 7761 + }, + { + "epoch": 0.04616281282710058, + "grad_norm": 2.411957263946533, + "learning_rate": 4.9737625720692906e-05, + "loss": 5.1822, + "step": 7762 + }, + { + "epoch": 0.046168760110381575, + "grad_norm": 2.3566222190856934, + "learning_rate": 4.973755822136722e-05, + "loss": 5.0405, + "step": 7763 + }, + { + "epoch": 0.04617470739366258, + "grad_norm": 2.2235679626464844, + "learning_rate": 4.973749071340591e-05, + "loss": 5.4746, + "step": 7764 + }, + { + "epoch": 0.04618065467694357, + "grad_norm": 2.4175586700439453, + "learning_rate": 4.973742319680899e-05, + "loss": 5.7519, + "step": 7765 + }, + { + "epoch": 0.04618660196022457, + "grad_norm": 2.3386452198028564, + "learning_rate": 4.9737355671576496e-05, + "loss": 6.1765, + "step": 7766 + }, + { + "epoch": 0.04619254924350557, + "grad_norm": 2.084333658218384, + "learning_rate": 4.973728813770845e-05, + "loss": 6.1439, + "step": 7767 + }, + { + "epoch": 0.046198496526786564, + "grad_norm": 2.0523531436920166, + "learning_rate": 4.973722059520487e-05, + "loss": 6.294, + "step": 7768 + }, + { + "epoch": 0.04620444381006756, + "grad_norm": 2.1187572479248047, + "learning_rate": 4.973715304406578e-05, + "loss": 5.3679, + "step": 7769 + }, + { + "epoch": 0.04621039109334856, + "grad_norm": 2.5249836444854736, + "learning_rate": 4.9737085484291204e-05, + "loss": 5.9086, + "step": 7770 + }, + { + "epoch": 0.04621633837662956, + "grad_norm": 2.35662841796875, + "learning_rate": 4.973701791588117e-05, + "loss": 6.3135, + "step": 7771 + }, + { + "epoch": 0.04622228565991055, + "grad_norm": 2.070955276489258, + "learning_rate": 4.9736950338835695e-05, + "loss": 5.8748, + "step": 7772 + }, + { + "epoch": 0.04622823294319155, + "grad_norm": 2.151587963104248, + "learning_rate": 4.9736882753154814e-05, + "loss": 6.2053, + "step": 7773 + }, + { + "epoch": 0.04623418022647255, + "grad_norm": 2.2187843322753906, + "learning_rate": 4.9736815158838534e-05, + "loss": 5.762, + "step": 7774 + }, + { + "epoch": 0.046240127509753544, + "grad_norm": 1.8676223754882812, + "learning_rate": 4.973674755588689e-05, + "loss": 6.06, + "step": 7775 + }, + { + "epoch": 0.04624607479303454, + "grad_norm": 2.2110252380371094, + "learning_rate": 4.9736679944299906e-05, + "loss": 5.6474, + "step": 7776 + }, + { + "epoch": 0.04625202207631554, + "grad_norm": 2.0635151863098145, + "learning_rate": 4.9736612324077605e-05, + "loss": 5.5579, + "step": 7777 + }, + { + "epoch": 0.046257969359596536, + "grad_norm": 2.1654598712921143, + "learning_rate": 4.973654469522e-05, + "loss": 5.5388, + "step": 7778 + }, + { + "epoch": 0.04626391664287753, + "grad_norm": 2.3735673427581787, + "learning_rate": 4.973647705772713e-05, + "loss": 5.4383, + "step": 7779 + }, + { + "epoch": 0.04626986392615853, + "grad_norm": 2.344160318374634, + "learning_rate": 4.9736409411599e-05, + "loss": 5.6501, + "step": 7780 + }, + { + "epoch": 0.04627581120943953, + "grad_norm": 3.023350477218628, + "learning_rate": 4.973634175683566e-05, + "loss": 5.2688, + "step": 7781 + }, + { + "epoch": 0.04628175849272052, + "grad_norm": 2.8814494609832764, + "learning_rate": 4.973627409343711e-05, + "loss": 5.08, + "step": 7782 + }, + { + "epoch": 0.046287705776001525, + "grad_norm": 2.475191831588745, + "learning_rate": 4.973620642140339e-05, + "loss": 5.0761, + "step": 7783 + }, + { + "epoch": 0.04629365305928252, + "grad_norm": 2.5567755699157715, + "learning_rate": 4.9736138740734504e-05, + "loss": 5.46, + "step": 7784 + }, + { + "epoch": 0.046299600342563516, + "grad_norm": 2.9225175380706787, + "learning_rate": 4.973607105143049e-05, + "loss": 5.5219, + "step": 7785 + }, + { + "epoch": 0.04630554762584452, + "grad_norm": 2.3112781047821045, + "learning_rate": 4.973600335349138e-05, + "loss": 6.4204, + "step": 7786 + }, + { + "epoch": 0.04631149490912551, + "grad_norm": 2.228182554244995, + "learning_rate": 4.973593564691717e-05, + "loss": 6.3299, + "step": 7787 + }, + { + "epoch": 0.04631744219240651, + "grad_norm": 1.8612277507781982, + "learning_rate": 4.973586793170792e-05, + "loss": 5.994, + "step": 7788 + }, + { + "epoch": 0.0463233894756875, + "grad_norm": 1.9788155555725098, + "learning_rate": 4.9735800207863626e-05, + "loss": 6.1676, + "step": 7789 + }, + { + "epoch": 0.046329336758968505, + "grad_norm": 2.2335264682769775, + "learning_rate": 4.973573247538431e-05, + "loss": 6.3112, + "step": 7790 + }, + { + "epoch": 0.0463352840422495, + "grad_norm": 2.168656349182129, + "learning_rate": 4.973566473427001e-05, + "loss": 5.8326, + "step": 7791 + }, + { + "epoch": 0.046341231325530495, + "grad_norm": 1.9187591075897217, + "learning_rate": 4.9735596984520755e-05, + "loss": 5.8734, + "step": 7792 + }, + { + "epoch": 0.0463471786088115, + "grad_norm": 2.195242166519165, + "learning_rate": 4.973552922613655e-05, + "loss": 6.1325, + "step": 7793 + }, + { + "epoch": 0.04635312589209249, + "grad_norm": 1.9698888063430786, + "learning_rate": 4.973546145911743e-05, + "loss": 5.8586, + "step": 7794 + }, + { + "epoch": 0.04635907317537349, + "grad_norm": 2.2149972915649414, + "learning_rate": 4.973539368346342e-05, + "loss": 5.4087, + "step": 7795 + }, + { + "epoch": 0.04636502045865449, + "grad_norm": 1.8587820529937744, + "learning_rate": 4.973532589917453e-05, + "loss": 5.9956, + "step": 7796 + }, + { + "epoch": 0.046370967741935484, + "grad_norm": 2.022866725921631, + "learning_rate": 4.97352581062508e-05, + "loss": 6.0905, + "step": 7797 + }, + { + "epoch": 0.04637691502521648, + "grad_norm": 2.0257678031921387, + "learning_rate": 4.973519030469225e-05, + "loss": 6.02, + "step": 7798 + }, + { + "epoch": 0.04638286230849748, + "grad_norm": 1.6909089088439941, + "learning_rate": 4.973512249449889e-05, + "loss": 5.727, + "step": 7799 + }, + { + "epoch": 0.046388809591778477, + "grad_norm": 1.8882997035980225, + "learning_rate": 4.9735054675670754e-05, + "loss": 5.655, + "step": 7800 + }, + { + "epoch": 0.04639475687505947, + "grad_norm": 2.1775193214416504, + "learning_rate": 4.9734986848207876e-05, + "loss": 5.8067, + "step": 7801 + }, + { + "epoch": 0.04640070415834047, + "grad_norm": 2.136690139770508, + "learning_rate": 4.973491901211027e-05, + "loss": 5.5515, + "step": 7802 + }, + { + "epoch": 0.04640665144162147, + "grad_norm": 1.8036144971847534, + "learning_rate": 4.973485116737795e-05, + "loss": 5.8404, + "step": 7803 + }, + { + "epoch": 0.046412598724902464, + "grad_norm": 2.1350481510162354, + "learning_rate": 4.973478331401096e-05, + "loss": 6.1635, + "step": 7804 + }, + { + "epoch": 0.04641854600818346, + "grad_norm": 2.4152462482452393, + "learning_rate": 4.97347154520093e-05, + "loss": 5.9882, + "step": 7805 + }, + { + "epoch": 0.04642449329146446, + "grad_norm": 2.166402578353882, + "learning_rate": 4.9734647581373015e-05, + "loss": 5.8982, + "step": 7806 + }, + { + "epoch": 0.046430440574745456, + "grad_norm": 1.8684437274932861, + "learning_rate": 4.973457970210211e-05, + "loss": 5.9501, + "step": 7807 + }, + { + "epoch": 0.04643638785802645, + "grad_norm": 1.775829792022705, + "learning_rate": 4.973451181419663e-05, + "loss": 5.83, + "step": 7808 + }, + { + "epoch": 0.04644233514130745, + "grad_norm": 1.7500759363174438, + "learning_rate": 4.973444391765659e-05, + "loss": 6.0084, + "step": 7809 + }, + { + "epoch": 0.04644828242458845, + "grad_norm": 2.3920938968658447, + "learning_rate": 4.9734376012482e-05, + "loss": 5.559, + "step": 7810 + }, + { + "epoch": 0.04645422970786944, + "grad_norm": 2.7680983543395996, + "learning_rate": 4.97343080986729e-05, + "loss": 5.3521, + "step": 7811 + }, + { + "epoch": 0.046460176991150445, + "grad_norm": 2.6618781089782715, + "learning_rate": 4.9734240176229316e-05, + "loss": 5.6917, + "step": 7812 + }, + { + "epoch": 0.04646612427443144, + "grad_norm": 2.086775541305542, + "learning_rate": 4.9734172245151256e-05, + "loss": 5.582, + "step": 7813 + }, + { + "epoch": 0.046472071557712435, + "grad_norm": 2.190012216567993, + "learning_rate": 4.973410430543875e-05, + "loss": 5.9132, + "step": 7814 + }, + { + "epoch": 0.04647801884099344, + "grad_norm": 2.317610740661621, + "learning_rate": 4.973403635709183e-05, + "loss": 5.7055, + "step": 7815 + }, + { + "epoch": 0.04648396612427443, + "grad_norm": 2.1291167736053467, + "learning_rate": 4.973396840011051e-05, + "loss": 5.6711, + "step": 7816 + }, + { + "epoch": 0.04648991340755543, + "grad_norm": 1.5421113967895508, + "learning_rate": 4.9733900434494815e-05, + "loss": 5.6433, + "step": 7817 + }, + { + "epoch": 0.04649586069083642, + "grad_norm": 2.222355604171753, + "learning_rate": 4.973383246024477e-05, + "loss": 5.3685, + "step": 7818 + }, + { + "epoch": 0.046501807974117425, + "grad_norm": 2.097116708755493, + "learning_rate": 4.97337644773604e-05, + "loss": 5.6528, + "step": 7819 + }, + { + "epoch": 0.04650775525739842, + "grad_norm": 2.0224382877349854, + "learning_rate": 4.973369648584174e-05, + "loss": 5.8849, + "step": 7820 + }, + { + "epoch": 0.046513702540679415, + "grad_norm": 2.1581428050994873, + "learning_rate": 4.973362848568879e-05, + "loss": 5.985, + "step": 7821 + }, + { + "epoch": 0.04651964982396042, + "grad_norm": 2.43945574760437, + "learning_rate": 4.9733560476901584e-05, + "loss": 5.5682, + "step": 7822 + }, + { + "epoch": 0.04652559710724141, + "grad_norm": 3.174143075942993, + "learning_rate": 4.9733492459480157e-05, + "loss": 4.832, + "step": 7823 + }, + { + "epoch": 0.04653154439052241, + "grad_norm": 2.269339084625244, + "learning_rate": 4.973342443342452e-05, + "loss": 5.5804, + "step": 7824 + }, + { + "epoch": 0.04653749167380341, + "grad_norm": 2.3775289058685303, + "learning_rate": 4.9733356398734695e-05, + "loss": 5.8299, + "step": 7825 + }, + { + "epoch": 0.046543438957084404, + "grad_norm": 2.065579414367676, + "learning_rate": 4.9733288355410716e-05, + "loss": 5.6985, + "step": 7826 + }, + { + "epoch": 0.0465493862403654, + "grad_norm": 1.9699875116348267, + "learning_rate": 4.9733220303452604e-05, + "loss": 6.0161, + "step": 7827 + }, + { + "epoch": 0.0465553335236464, + "grad_norm": 2.1414806842803955, + "learning_rate": 4.9733152242860374e-05, + "loss": 6.2534, + "step": 7828 + }, + { + "epoch": 0.046561280806927396, + "grad_norm": 2.414738416671753, + "learning_rate": 4.973308417363406e-05, + "loss": 5.8402, + "step": 7829 + }, + { + "epoch": 0.04656722809020839, + "grad_norm": 2.4105031490325928, + "learning_rate": 4.973301609577368e-05, + "loss": 5.8728, + "step": 7830 + }, + { + "epoch": 0.04657317537348939, + "grad_norm": 2.7718660831451416, + "learning_rate": 4.9732948009279264e-05, + "loss": 5.637, + "step": 7831 + }, + { + "epoch": 0.04657912265677039, + "grad_norm": 2.205103874206543, + "learning_rate": 4.9732879914150824e-05, + "loss": 5.4119, + "step": 7832 + }, + { + "epoch": 0.046585069940051384, + "grad_norm": 1.9080390930175781, + "learning_rate": 4.9732811810388394e-05, + "loss": 5.3387, + "step": 7833 + }, + { + "epoch": 0.04659101722333238, + "grad_norm": 1.6600725650787354, + "learning_rate": 4.9732743697992e-05, + "loss": 5.3192, + "step": 7834 + }, + { + "epoch": 0.04659696450661338, + "grad_norm": 1.9428787231445312, + "learning_rate": 4.973267557696165e-05, + "loss": 5.3127, + "step": 7835 + }, + { + "epoch": 0.046602911789894376, + "grad_norm": 2.174811840057373, + "learning_rate": 4.973260744729738e-05, + "loss": 5.7181, + "step": 7836 + }, + { + "epoch": 0.04660885907317537, + "grad_norm": 2.5420422554016113, + "learning_rate": 4.9732539308999224e-05, + "loss": 5.934, + "step": 7837 + }, + { + "epoch": 0.04661480635645637, + "grad_norm": 2.079343795776367, + "learning_rate": 4.973247116206719e-05, + "loss": 5.236, + "step": 7838 + }, + { + "epoch": 0.04662075363973737, + "grad_norm": 1.7748003005981445, + "learning_rate": 4.97324030065013e-05, + "loss": 5.2929, + "step": 7839 + }, + { + "epoch": 0.04662670092301836, + "grad_norm": 2.2746875286102295, + "learning_rate": 4.973233484230159e-05, + "loss": 5.182, + "step": 7840 + }, + { + "epoch": 0.046632648206299365, + "grad_norm": 1.7846394777297974, + "learning_rate": 4.9732266669468074e-05, + "loss": 5.2682, + "step": 7841 + }, + { + "epoch": 0.04663859548958036, + "grad_norm": 2.078132152557373, + "learning_rate": 4.973219848800078e-05, + "loss": 5.3245, + "step": 7842 + }, + { + "epoch": 0.046644542772861355, + "grad_norm": 1.7784876823425293, + "learning_rate": 4.9732130297899726e-05, + "loss": 5.4582, + "step": 7843 + }, + { + "epoch": 0.04665049005614236, + "grad_norm": 1.8421920537948608, + "learning_rate": 4.973206209916495e-05, + "loss": 5.3504, + "step": 7844 + }, + { + "epoch": 0.04665643733942335, + "grad_norm": 1.9958820343017578, + "learning_rate": 4.9731993891796455e-05, + "loss": 5.2914, + "step": 7845 + }, + { + "epoch": 0.04666238462270435, + "grad_norm": 2.0615813732147217, + "learning_rate": 4.9731925675794286e-05, + "loss": 5.3318, + "step": 7846 + }, + { + "epoch": 0.04666833190598534, + "grad_norm": 1.7690422534942627, + "learning_rate": 4.973185745115846e-05, + "loss": 5.3169, + "step": 7847 + }, + { + "epoch": 0.046674279189266345, + "grad_norm": 1.7990578413009644, + "learning_rate": 4.9731789217888994e-05, + "loss": 5.3136, + "step": 7848 + }, + { + "epoch": 0.04668022647254734, + "grad_norm": 2.0028672218322754, + "learning_rate": 4.9731720975985905e-05, + "loss": 5.2115, + "step": 7849 + }, + { + "epoch": 0.046686173755828335, + "grad_norm": 2.0703940391540527, + "learning_rate": 4.973165272544924e-05, + "loss": 5.2439, + "step": 7850 + }, + { + "epoch": 0.04669212103910934, + "grad_norm": 2.1105704307556152, + "learning_rate": 4.973158446627901e-05, + "loss": 5.5812, + "step": 7851 + }, + { + "epoch": 0.04669806832239033, + "grad_norm": 1.7391036748886108, + "learning_rate": 4.9731516198475236e-05, + "loss": 5.229, + "step": 7852 + }, + { + "epoch": 0.04670401560567133, + "grad_norm": 1.6907505989074707, + "learning_rate": 4.973144792203795e-05, + "loss": 5.2674, + "step": 7853 + }, + { + "epoch": 0.04670996288895233, + "grad_norm": 1.608168125152588, + "learning_rate": 4.973137963696717e-05, + "loss": 5.389, + "step": 7854 + }, + { + "epoch": 0.046715910172233324, + "grad_norm": 1.7521610260009766, + "learning_rate": 4.9731311343262913e-05, + "loss": 5.2436, + "step": 7855 + }, + { + "epoch": 0.04672185745551432, + "grad_norm": 2.0182595252990723, + "learning_rate": 4.973124304092522e-05, + "loss": 5.2746, + "step": 7856 + }, + { + "epoch": 0.04672780473879532, + "grad_norm": 1.7990871667861938, + "learning_rate": 4.97311747299541e-05, + "loss": 5.4241, + "step": 7857 + }, + { + "epoch": 0.046733752022076316, + "grad_norm": 2.124717950820923, + "learning_rate": 4.973110641034958e-05, + "loss": 5.5133, + "step": 7858 + }, + { + "epoch": 0.04673969930535731, + "grad_norm": 2.066869020462036, + "learning_rate": 4.973103808211169e-05, + "loss": 5.252, + "step": 7859 + }, + { + "epoch": 0.04674564658863831, + "grad_norm": 1.8004878759384155, + "learning_rate": 4.9730969745240455e-05, + "loss": 5.483, + "step": 7860 + }, + { + "epoch": 0.04675159387191931, + "grad_norm": 1.6822713613510132, + "learning_rate": 4.9730901399735886e-05, + "loss": 5.3916, + "step": 7861 + }, + { + "epoch": 0.046757541155200304, + "grad_norm": 1.7024493217468262, + "learning_rate": 4.973083304559802e-05, + "loss": 5.3504, + "step": 7862 + }, + { + "epoch": 0.0467634884384813, + "grad_norm": 1.5939997434616089, + "learning_rate": 4.973076468282687e-05, + "loss": 5.4151, + "step": 7863 + }, + { + "epoch": 0.0467694357217623, + "grad_norm": 1.7603535652160645, + "learning_rate": 4.9730696311422475e-05, + "loss": 5.351, + "step": 7864 + }, + { + "epoch": 0.046775383005043296, + "grad_norm": 1.737897276878357, + "learning_rate": 4.973062793138484e-05, + "loss": 5.0834, + "step": 7865 + }, + { + "epoch": 0.04678133028832429, + "grad_norm": 2.4130520820617676, + "learning_rate": 4.973055954271401e-05, + "loss": 4.833, + "step": 7866 + }, + { + "epoch": 0.04678727757160529, + "grad_norm": 1.9712201356887817, + "learning_rate": 4.9730491145409987e-05, + "loss": 5.0048, + "step": 7867 + }, + { + "epoch": 0.04679322485488629, + "grad_norm": 1.808608055114746, + "learning_rate": 4.97304227394728e-05, + "loss": 5.3134, + "step": 7868 + }, + { + "epoch": 0.04679917213816728, + "grad_norm": 1.8121775388717651, + "learning_rate": 4.973035432490249e-05, + "loss": 5.2594, + "step": 7869 + }, + { + "epoch": 0.046805119421448285, + "grad_norm": 1.7191296815872192, + "learning_rate": 4.9730285901699064e-05, + "loss": 5.206, + "step": 7870 + }, + { + "epoch": 0.04681106670472928, + "grad_norm": 1.931894063949585, + "learning_rate": 4.973021746986255e-05, + "loss": 5.3349, + "step": 7871 + }, + { + "epoch": 0.046817013988010275, + "grad_norm": 2.5420172214508057, + "learning_rate": 4.973014902939297e-05, + "loss": 5.2894, + "step": 7872 + }, + { + "epoch": 0.04682296127129128, + "grad_norm": 2.5522336959838867, + "learning_rate": 4.973008058029036e-05, + "loss": 5.2144, + "step": 7873 + }, + { + "epoch": 0.04682890855457227, + "grad_norm": 3.1389801502227783, + "learning_rate": 4.973001212255472e-05, + "loss": 5.7229, + "step": 7874 + }, + { + "epoch": 0.04683485583785327, + "grad_norm": 1.8687554597854614, + "learning_rate": 4.97299436561861e-05, + "loss": 5.483, + "step": 7875 + }, + { + "epoch": 0.04684080312113426, + "grad_norm": 2.2526602745056152, + "learning_rate": 4.972987518118451e-05, + "loss": 5.4562, + "step": 7876 + }, + { + "epoch": 0.046846750404415265, + "grad_norm": 2.108677625656128, + "learning_rate": 4.972980669754997e-05, + "loss": 5.2005, + "step": 7877 + }, + { + "epoch": 0.04685269768769626, + "grad_norm": 2.023118019104004, + "learning_rate": 4.972973820528252e-05, + "loss": 5.3674, + "step": 7878 + }, + { + "epoch": 0.046858644970977255, + "grad_norm": 1.6553964614868164, + "learning_rate": 4.9729669704382165e-05, + "loss": 5.3256, + "step": 7879 + }, + { + "epoch": 0.04686459225425826, + "grad_norm": 1.8197314739227295, + "learning_rate": 4.972960119484894e-05, + "loss": 5.1738, + "step": 7880 + }, + { + "epoch": 0.04687053953753925, + "grad_norm": 1.6142289638519287, + "learning_rate": 4.972953267668287e-05, + "loss": 5.245, + "step": 7881 + }, + { + "epoch": 0.04687648682082025, + "grad_norm": 1.4962797164916992, + "learning_rate": 4.972946414988398e-05, + "loss": 5.3121, + "step": 7882 + }, + { + "epoch": 0.04688243410410125, + "grad_norm": 1.487801432609558, + "learning_rate": 4.972939561445228e-05, + "loss": 5.1828, + "step": 7883 + }, + { + "epoch": 0.046888381387382244, + "grad_norm": 1.9139772653579712, + "learning_rate": 4.972932707038781e-05, + "loss": 5.2432, + "step": 7884 + }, + { + "epoch": 0.04689432867066324, + "grad_norm": 1.7533615827560425, + "learning_rate": 4.972925851769058e-05, + "loss": 5.6451, + "step": 7885 + }, + { + "epoch": 0.04690027595394424, + "grad_norm": 1.8561608791351318, + "learning_rate": 4.972918995636062e-05, + "loss": 5.4293, + "step": 7886 + }, + { + "epoch": 0.046906223237225236, + "grad_norm": 1.6891844272613525, + "learning_rate": 4.972912138639797e-05, + "loss": 5.2736, + "step": 7887 + }, + { + "epoch": 0.04691217052050623, + "grad_norm": 1.9279890060424805, + "learning_rate": 4.972905280780262e-05, + "loss": 5.5733, + "step": 7888 + }, + { + "epoch": 0.04691811780378723, + "grad_norm": 1.7810181379318237, + "learning_rate": 4.9728984220574624e-05, + "loss": 5.2036, + "step": 7889 + }, + { + "epoch": 0.04692406508706823, + "grad_norm": 1.6455233097076416, + "learning_rate": 4.9728915624714004e-05, + "loss": 5.3493, + "step": 7890 + }, + { + "epoch": 0.046930012370349224, + "grad_norm": 1.5345048904418945, + "learning_rate": 4.9728847020220756e-05, + "loss": 5.2528, + "step": 7891 + }, + { + "epoch": 0.04693595965363022, + "grad_norm": 1.455165982246399, + "learning_rate": 4.9728778407094935e-05, + "loss": 5.2769, + "step": 7892 + }, + { + "epoch": 0.04694190693691122, + "grad_norm": 1.577910304069519, + "learning_rate": 4.972870978533655e-05, + "loss": 5.2182, + "step": 7893 + }, + { + "epoch": 0.046947854220192216, + "grad_norm": 1.728143334388733, + "learning_rate": 4.972864115494563e-05, + "loss": 5.3446, + "step": 7894 + }, + { + "epoch": 0.04695380150347321, + "grad_norm": 1.6157398223876953, + "learning_rate": 4.972857251592219e-05, + "loss": 5.4866, + "step": 7895 + }, + { + "epoch": 0.04695974878675421, + "grad_norm": 1.5386699438095093, + "learning_rate": 4.9728503868266266e-05, + "loss": 5.4626, + "step": 7896 + }, + { + "epoch": 0.04696569607003521, + "grad_norm": 1.874915599822998, + "learning_rate": 4.972843521197788e-05, + "loss": 5.4152, + "step": 7897 + }, + { + "epoch": 0.0469716433533162, + "grad_norm": 1.7093253135681152, + "learning_rate": 4.9728366547057046e-05, + "loss": 5.2852, + "step": 7898 + }, + { + "epoch": 0.046977590636597205, + "grad_norm": 1.6435173749923706, + "learning_rate": 4.9728297873503806e-05, + "loss": 5.3985, + "step": 7899 + }, + { + "epoch": 0.0469835379198782, + "grad_norm": 1.5776588916778564, + "learning_rate": 4.972822919131816e-05, + "loss": 5.2914, + "step": 7900 + }, + { + "epoch": 0.046989485203159195, + "grad_norm": 2.051072835922241, + "learning_rate": 4.972816050050015e-05, + "loss": 5.343, + "step": 7901 + }, + { + "epoch": 0.0469954324864402, + "grad_norm": 2.003816604614258, + "learning_rate": 4.972809180104979e-05, + "loss": 5.3577, + "step": 7902 + }, + { + "epoch": 0.04700137976972119, + "grad_norm": 1.9092657566070557, + "learning_rate": 4.9728023092967116e-05, + "loss": 5.551, + "step": 7903 + }, + { + "epoch": 0.04700732705300219, + "grad_norm": 1.763007640838623, + "learning_rate": 4.972795437625214e-05, + "loss": 5.5611, + "step": 7904 + }, + { + "epoch": 0.04701327433628318, + "grad_norm": 2.637850046157837, + "learning_rate": 4.9727885650904895e-05, + "loss": 5.937, + "step": 7905 + }, + { + "epoch": 0.047019221619564185, + "grad_norm": 1.6650307178497314, + "learning_rate": 4.9727816916925395e-05, + "loss": 5.6418, + "step": 7906 + }, + { + "epoch": 0.04702516890284518, + "grad_norm": 1.6943029165267944, + "learning_rate": 4.972774817431367e-05, + "loss": 5.4826, + "step": 7907 + }, + { + "epoch": 0.047031116186126175, + "grad_norm": 1.4689685106277466, + "learning_rate": 4.972767942306975e-05, + "loss": 5.4849, + "step": 7908 + }, + { + "epoch": 0.04703706346940718, + "grad_norm": 1.759244441986084, + "learning_rate": 4.9727610663193644e-05, + "loss": 5.3496, + "step": 7909 + }, + { + "epoch": 0.04704301075268817, + "grad_norm": 1.8706889152526855, + "learning_rate": 4.9727541894685395e-05, + "loss": 5.2836, + "step": 7910 + }, + { + "epoch": 0.04704895803596917, + "grad_norm": 1.486164927482605, + "learning_rate": 4.972747311754501e-05, + "loss": 5.4125, + "step": 7911 + }, + { + "epoch": 0.04705490531925017, + "grad_norm": 1.6479889154434204, + "learning_rate": 4.972740433177252e-05, + "loss": 5.1986, + "step": 7912 + }, + { + "epoch": 0.047060852602531164, + "grad_norm": 1.5741796493530273, + "learning_rate": 4.9727335537367944e-05, + "loss": 5.4761, + "step": 7913 + }, + { + "epoch": 0.04706679988581216, + "grad_norm": 1.5001682043075562, + "learning_rate": 4.972726673433131e-05, + "loss": 5.6267, + "step": 7914 + }, + { + "epoch": 0.04707274716909316, + "grad_norm": 1.774282455444336, + "learning_rate": 4.972719792266265e-05, + "loss": 5.5944, + "step": 7915 + }, + { + "epoch": 0.047078694452374156, + "grad_norm": 1.6656653881072998, + "learning_rate": 4.972712910236198e-05, + "loss": 5.4159, + "step": 7916 + }, + { + "epoch": 0.04708464173565515, + "grad_norm": 1.7174065113067627, + "learning_rate": 4.972706027342933e-05, + "loss": 5.4239, + "step": 7917 + }, + { + "epoch": 0.04709058901893615, + "grad_norm": 1.607878565788269, + "learning_rate": 4.9726991435864705e-05, + "loss": 5.4517, + "step": 7918 + }, + { + "epoch": 0.04709653630221715, + "grad_norm": 1.9639167785644531, + "learning_rate": 4.972692258966815e-05, + "loss": 5.5371, + "step": 7919 + }, + { + "epoch": 0.047102483585498144, + "grad_norm": 1.5418875217437744, + "learning_rate": 4.9726853734839684e-05, + "loss": 5.4798, + "step": 7920 + }, + { + "epoch": 0.04710843086877914, + "grad_norm": 1.54796302318573, + "learning_rate": 4.9726784871379326e-05, + "loss": 5.5329, + "step": 7921 + }, + { + "epoch": 0.04711437815206014, + "grad_norm": 1.8075921535491943, + "learning_rate": 4.97267159992871e-05, + "loss": 5.6049, + "step": 7922 + }, + { + "epoch": 0.047120325435341136, + "grad_norm": 1.4973857402801514, + "learning_rate": 4.972664711856304e-05, + "loss": 5.27, + "step": 7923 + }, + { + "epoch": 0.04712627271862213, + "grad_norm": 2.1028542518615723, + "learning_rate": 4.9726578229207155e-05, + "loss": 5.3626, + "step": 7924 + }, + { + "epoch": 0.04713222000190313, + "grad_norm": 2.2057480812072754, + "learning_rate": 4.9726509331219485e-05, + "loss": 5.1767, + "step": 7925 + }, + { + "epoch": 0.04713816728518413, + "grad_norm": 2.0549347400665283, + "learning_rate": 4.972644042460004e-05, + "loss": 5.3362, + "step": 7926 + }, + { + "epoch": 0.04714411456846512, + "grad_norm": 2.0960693359375, + "learning_rate": 4.972637150934885e-05, + "loss": 5.5162, + "step": 7927 + }, + { + "epoch": 0.047150061851746125, + "grad_norm": 2.2022509574890137, + "learning_rate": 4.9726302585465945e-05, + "loss": 5.3263, + "step": 7928 + }, + { + "epoch": 0.04715600913502712, + "grad_norm": 1.7065988779067993, + "learning_rate": 4.9726233652951335e-05, + "loss": 5.4349, + "step": 7929 + }, + { + "epoch": 0.047161956418308115, + "grad_norm": 1.742591142654419, + "learning_rate": 4.972616471180506e-05, + "loss": 5.2396, + "step": 7930 + }, + { + "epoch": 0.04716790370158912, + "grad_norm": 1.888846755027771, + "learning_rate": 4.972609576202713e-05, + "loss": 5.3453, + "step": 7931 + }, + { + "epoch": 0.04717385098487011, + "grad_norm": 1.6499360799789429, + "learning_rate": 4.972602680361758e-05, + "loss": 5.2819, + "step": 7932 + }, + { + "epoch": 0.04717979826815111, + "grad_norm": 1.8801236152648926, + "learning_rate": 4.9725957836576434e-05, + "loss": 5.2456, + "step": 7933 + }, + { + "epoch": 0.0471857455514321, + "grad_norm": 2.050522565841675, + "learning_rate": 4.97258888609037e-05, + "loss": 5.2069, + "step": 7934 + }, + { + "epoch": 0.047191692834713105, + "grad_norm": 2.0722391605377197, + "learning_rate": 4.972581987659942e-05, + "loss": 5.5057, + "step": 7935 + }, + { + "epoch": 0.0471976401179941, + "grad_norm": 2.728468179702759, + "learning_rate": 4.972575088366361e-05, + "loss": 5.5485, + "step": 7936 + }, + { + "epoch": 0.047203587401275095, + "grad_norm": 2.0293211936950684, + "learning_rate": 4.9725681882096295e-05, + "loss": 5.7126, + "step": 7937 + }, + { + "epoch": 0.0472095346845561, + "grad_norm": 2.1351194381713867, + "learning_rate": 4.97256128718975e-05, + "loss": 5.7313, + "step": 7938 + }, + { + "epoch": 0.04721548196783709, + "grad_norm": 1.9040015935897827, + "learning_rate": 4.972554385306726e-05, + "loss": 5.696, + "step": 7939 + }, + { + "epoch": 0.04722142925111809, + "grad_norm": 1.640110731124878, + "learning_rate": 4.9725474825605574e-05, + "loss": 5.2626, + "step": 7940 + }, + { + "epoch": 0.04722737653439909, + "grad_norm": 1.887408971786499, + "learning_rate": 4.972540578951249e-05, + "loss": 5.2734, + "step": 7941 + }, + { + "epoch": 0.047233323817680084, + "grad_norm": 1.8867583274841309, + "learning_rate": 4.972533674478801e-05, + "loss": 5.6811, + "step": 7942 + }, + { + "epoch": 0.04723927110096108, + "grad_norm": 1.811104655265808, + "learning_rate": 4.9725267691432174e-05, + "loss": 5.575, + "step": 7943 + }, + { + "epoch": 0.04724521838424208, + "grad_norm": 1.8644812107086182, + "learning_rate": 4.9725198629445014e-05, + "loss": 5.5718, + "step": 7944 + }, + { + "epoch": 0.047251165667523076, + "grad_norm": 1.693788766860962, + "learning_rate": 4.972512955882653e-05, + "loss": 5.5924, + "step": 7945 + }, + { + "epoch": 0.04725711295080407, + "grad_norm": 1.8305641412734985, + "learning_rate": 4.9725060479576766e-05, + "loss": 5.6529, + "step": 7946 + }, + { + "epoch": 0.04726306023408507, + "grad_norm": 1.7662039995193481, + "learning_rate": 4.9724991391695734e-05, + "loss": 5.6709, + "step": 7947 + }, + { + "epoch": 0.04726900751736607, + "grad_norm": 2.1799724102020264, + "learning_rate": 4.972492229518347e-05, + "loss": 5.6266, + "step": 7948 + }, + { + "epoch": 0.047274954800647064, + "grad_norm": 1.9300130605697632, + "learning_rate": 4.972485319003998e-05, + "loss": 5.6494, + "step": 7949 + }, + { + "epoch": 0.04728090208392806, + "grad_norm": 1.9196375608444214, + "learning_rate": 4.9724784076265307e-05, + "loss": 5.571, + "step": 7950 + }, + { + "epoch": 0.04728684936720906, + "grad_norm": 1.906616449356079, + "learning_rate": 4.972471495385947e-05, + "loss": 5.6537, + "step": 7951 + }, + { + "epoch": 0.047292796650490056, + "grad_norm": 1.826536774635315, + "learning_rate": 4.972464582282249e-05, + "loss": 5.6251, + "step": 7952 + }, + { + "epoch": 0.04729874393377105, + "grad_norm": 1.7790716886520386, + "learning_rate": 4.972457668315438e-05, + "loss": 5.3488, + "step": 7953 + }, + { + "epoch": 0.04730469121705205, + "grad_norm": 1.8892159461975098, + "learning_rate": 4.972450753485519e-05, + "loss": 5.4794, + "step": 7954 + }, + { + "epoch": 0.04731063850033305, + "grad_norm": 1.9409239292144775, + "learning_rate": 4.972443837792492e-05, + "loss": 5.6058, + "step": 7955 + }, + { + "epoch": 0.04731658578361404, + "grad_norm": 1.9935575723648071, + "learning_rate": 4.972436921236361e-05, + "loss": 5.6481, + "step": 7956 + }, + { + "epoch": 0.047322533066895045, + "grad_norm": 1.8507076501846313, + "learning_rate": 4.9724300038171276e-05, + "loss": 5.4723, + "step": 7957 + }, + { + "epoch": 0.04732848035017604, + "grad_norm": 1.9355841875076294, + "learning_rate": 4.972423085534794e-05, + "loss": 5.3843, + "step": 7958 + }, + { + "epoch": 0.047334427633457035, + "grad_norm": 1.9815531969070435, + "learning_rate": 4.972416166389363e-05, + "loss": 5.5635, + "step": 7959 + }, + { + "epoch": 0.04734037491673804, + "grad_norm": 1.7955007553100586, + "learning_rate": 4.972409246380838e-05, + "loss": 5.6002, + "step": 7960 + }, + { + "epoch": 0.04734632220001903, + "grad_norm": 2.0184547901153564, + "learning_rate": 4.97240232550922e-05, + "loss": 5.5458, + "step": 7961 + }, + { + "epoch": 0.04735226948330003, + "grad_norm": 1.7418156862258911, + "learning_rate": 4.972395403774512e-05, + "loss": 5.6443, + "step": 7962 + }, + { + "epoch": 0.04735821676658102, + "grad_norm": 1.9832762479782104, + "learning_rate": 4.972388481176716e-05, + "loss": 5.3799, + "step": 7963 + }, + { + "epoch": 0.047364164049862024, + "grad_norm": 1.8777718544006348, + "learning_rate": 4.972381557715835e-05, + "loss": 5.4349, + "step": 7964 + }, + { + "epoch": 0.04737011133314302, + "grad_norm": 1.519038438796997, + "learning_rate": 4.972374633391871e-05, + "loss": 5.2418, + "step": 7965 + }, + { + "epoch": 0.047376058616424015, + "grad_norm": 1.6425752639770508, + "learning_rate": 4.972367708204826e-05, + "loss": 5.1648, + "step": 7966 + }, + { + "epoch": 0.04738200589970502, + "grad_norm": 1.7461836338043213, + "learning_rate": 4.972360782154704e-05, + "loss": 5.1745, + "step": 7967 + }, + { + "epoch": 0.04738795318298601, + "grad_norm": 1.7991663217544556, + "learning_rate": 4.9723538552415064e-05, + "loss": 5.2268, + "step": 7968 + }, + { + "epoch": 0.04739390046626701, + "grad_norm": 1.9127873182296753, + "learning_rate": 4.9723469274652345e-05, + "loss": 5.5205, + "step": 7969 + }, + { + "epoch": 0.04739984774954801, + "grad_norm": 1.8836725950241089, + "learning_rate": 4.972339998825893e-05, + "loss": 5.3803, + "step": 7970 + }, + { + "epoch": 0.047405795032829004, + "grad_norm": 1.8391705751419067, + "learning_rate": 4.9723330693234825e-05, + "loss": 5.3084, + "step": 7971 + }, + { + "epoch": 0.04741174231611, + "grad_norm": 1.6707972288131714, + "learning_rate": 4.9723261389580063e-05, + "loss": 5.3275, + "step": 7972 + }, + { + "epoch": 0.047417689599391, + "grad_norm": 1.8807258605957031, + "learning_rate": 4.972319207729467e-05, + "loss": 5.0766, + "step": 7973 + }, + { + "epoch": 0.047423636882671996, + "grad_norm": 1.8980032205581665, + "learning_rate": 4.9723122756378655e-05, + "loss": 5.185, + "step": 7974 + }, + { + "epoch": 0.04742958416595299, + "grad_norm": 1.9011166095733643, + "learning_rate": 4.9723053426832055e-05, + "loss": 5.2494, + "step": 7975 + }, + { + "epoch": 0.04743553144923399, + "grad_norm": 1.6457782983779907, + "learning_rate": 4.97229840886549e-05, + "loss": 5.4205, + "step": 7976 + }, + { + "epoch": 0.04744147873251499, + "grad_norm": 1.558515191078186, + "learning_rate": 4.9722914741847206e-05, + "loss": 5.2111, + "step": 7977 + }, + { + "epoch": 0.04744742601579598, + "grad_norm": 1.4780910015106201, + "learning_rate": 4.9722845386409e-05, + "loss": 5.3365, + "step": 7978 + }, + { + "epoch": 0.04745337329907698, + "grad_norm": 1.529249668121338, + "learning_rate": 4.9722776022340296e-05, + "loss": 5.1323, + "step": 7979 + }, + { + "epoch": 0.04745932058235798, + "grad_norm": 1.66848886013031, + "learning_rate": 4.972270664964113e-05, + "loss": 5.2057, + "step": 7980 + }, + { + "epoch": 0.047465267865638976, + "grad_norm": 1.5645034313201904, + "learning_rate": 4.972263726831152e-05, + "loss": 5.1537, + "step": 7981 + }, + { + "epoch": 0.04747121514891997, + "grad_norm": 1.8793894052505493, + "learning_rate": 4.9722567878351496e-05, + "loss": 5.4403, + "step": 7982 + }, + { + "epoch": 0.04747716243220097, + "grad_norm": 1.7316640615463257, + "learning_rate": 4.972249847976108e-05, + "loss": 5.3642, + "step": 7983 + }, + { + "epoch": 0.04748310971548197, + "grad_norm": 1.7195171117782593, + "learning_rate": 4.972242907254029e-05, + "loss": 5.2603, + "step": 7984 + }, + { + "epoch": 0.04748905699876296, + "grad_norm": 1.6860026121139526, + "learning_rate": 4.972235965668916e-05, + "loss": 5.356, + "step": 7985 + }, + { + "epoch": 0.047495004282043965, + "grad_norm": 1.5396910905838013, + "learning_rate": 4.972229023220771e-05, + "loss": 5.2566, + "step": 7986 + }, + { + "epoch": 0.04750095156532496, + "grad_norm": 1.694547176361084, + "learning_rate": 4.9722220799095956e-05, + "loss": 5.0897, + "step": 7987 + }, + { + "epoch": 0.047506898848605955, + "grad_norm": 1.7608548402786255, + "learning_rate": 4.972215135735394e-05, + "loss": 5.4084, + "step": 7988 + }, + { + "epoch": 0.04751284613188696, + "grad_norm": 1.697198748588562, + "learning_rate": 4.9722081906981675e-05, + "loss": 5.4133, + "step": 7989 + }, + { + "epoch": 0.04751879341516795, + "grad_norm": 1.6107436418533325, + "learning_rate": 4.972201244797918e-05, + "loss": 5.2839, + "step": 7990 + }, + { + "epoch": 0.04752474069844895, + "grad_norm": 1.8178008794784546, + "learning_rate": 4.972194298034649e-05, + "loss": 5.3722, + "step": 7991 + }, + { + "epoch": 0.04753068798172994, + "grad_norm": 1.6542725563049316, + "learning_rate": 4.972187350408363e-05, + "loss": 5.3434, + "step": 7992 + }, + { + "epoch": 0.047536635265010944, + "grad_norm": 1.8194152116775513, + "learning_rate": 4.972180401919061e-05, + "loss": 5.3763, + "step": 7993 + }, + { + "epoch": 0.04754258254829194, + "grad_norm": 1.890317678451538, + "learning_rate": 4.9721734525667476e-05, + "loss": 5.529, + "step": 7994 + }, + { + "epoch": 0.047548529831572935, + "grad_norm": 1.813226342201233, + "learning_rate": 4.972166502351423e-05, + "loss": 5.0826, + "step": 7995 + }, + { + "epoch": 0.04755447711485394, + "grad_norm": 1.7679328918457031, + "learning_rate": 4.9721595512730905e-05, + "loss": 5.3589, + "step": 7996 + }, + { + "epoch": 0.04756042439813493, + "grad_norm": 1.8390278816223145, + "learning_rate": 4.972152599331753e-05, + "loss": 5.1568, + "step": 7997 + }, + { + "epoch": 0.04756637168141593, + "grad_norm": 2.9323909282684326, + "learning_rate": 4.972145646527413e-05, + "loss": 5.6457, + "step": 7998 + }, + { + "epoch": 0.04757231896469693, + "grad_norm": 1.8839350938796997, + "learning_rate": 4.972138692860072e-05, + "loss": 5.1204, + "step": 7999 + }, + { + "epoch": 0.047578266247977924, + "grad_norm": 1.9047685861587524, + "learning_rate": 4.972131738329733e-05, + "loss": 5.2741, + "step": 8000 + }, + { + "epoch": 0.04758421353125892, + "grad_norm": 2.39807391166687, + "learning_rate": 4.972124782936398e-05, + "loss": 5.0134, + "step": 8001 + }, + { + "epoch": 0.04759016081453992, + "grad_norm": 2.197404146194458, + "learning_rate": 4.972117826680071e-05, + "loss": 5.3012, + "step": 8002 + }, + { + "epoch": 0.047596108097820916, + "grad_norm": 2.2648651599884033, + "learning_rate": 4.9721108695607515e-05, + "loss": 5.7196, + "step": 8003 + }, + { + "epoch": 0.04760205538110191, + "grad_norm": 1.7686847448349, + "learning_rate": 4.972103911578444e-05, + "loss": 5.4261, + "step": 8004 + }, + { + "epoch": 0.04760800266438291, + "grad_norm": 1.726653814315796, + "learning_rate": 4.972096952733152e-05, + "loss": 5.33, + "step": 8005 + }, + { + "epoch": 0.04761394994766391, + "grad_norm": 1.6855807304382324, + "learning_rate": 4.972089993024875e-05, + "loss": 5.2382, + "step": 8006 + }, + { + "epoch": 0.0476198972309449, + "grad_norm": 1.644954800605774, + "learning_rate": 4.972083032453617e-05, + "loss": 5.3309, + "step": 8007 + }, + { + "epoch": 0.0476258445142259, + "grad_norm": 1.8630400896072388, + "learning_rate": 4.9720760710193816e-05, + "loss": 5.282, + "step": 8008 + }, + { + "epoch": 0.0476317917975069, + "grad_norm": 1.862716555595398, + "learning_rate": 4.972069108722168e-05, + "loss": 5.3307, + "step": 8009 + }, + { + "epoch": 0.047637739080787896, + "grad_norm": 1.8025259971618652, + "learning_rate": 4.972062145561982e-05, + "loss": 5.2236, + "step": 8010 + }, + { + "epoch": 0.04764368636406889, + "grad_norm": 1.7213356494903564, + "learning_rate": 4.972055181538825e-05, + "loss": 5.0635, + "step": 8011 + }, + { + "epoch": 0.04764963364734989, + "grad_norm": 1.5237104892730713, + "learning_rate": 4.9720482166526986e-05, + "loss": 5.3089, + "step": 8012 + }, + { + "epoch": 0.04765558093063089, + "grad_norm": 1.628957748413086, + "learning_rate": 4.972041250903605e-05, + "loss": 5.2299, + "step": 8013 + }, + { + "epoch": 0.04766152821391188, + "grad_norm": 1.9217725992202759, + "learning_rate": 4.972034284291548e-05, + "loss": 5.2504, + "step": 8014 + }, + { + "epoch": 0.047667475497192885, + "grad_norm": 2.114549160003662, + "learning_rate": 4.97202731681653e-05, + "loss": 5.219, + "step": 8015 + }, + { + "epoch": 0.04767342278047388, + "grad_norm": 1.9268896579742432, + "learning_rate": 4.9720203484785525e-05, + "loss": 5.145, + "step": 8016 + }, + { + "epoch": 0.047679370063754875, + "grad_norm": 2.04050874710083, + "learning_rate": 4.9720133792776166e-05, + "loss": 5.354, + "step": 8017 + }, + { + "epoch": 0.04768531734703588, + "grad_norm": 1.8002599477767944, + "learning_rate": 4.972006409213728e-05, + "loss": 5.0547, + "step": 8018 + }, + { + "epoch": 0.04769126463031687, + "grad_norm": 1.9655365943908691, + "learning_rate": 4.9719994382868876e-05, + "loss": 5.2188, + "step": 8019 + }, + { + "epoch": 0.04769721191359787, + "grad_norm": 1.7188535928726196, + "learning_rate": 4.971992466497097e-05, + "loss": 5.1792, + "step": 8020 + }, + { + "epoch": 0.04770315919687886, + "grad_norm": 1.582184910774231, + "learning_rate": 4.97198549384436e-05, + "loss": 5.2295, + "step": 8021 + }, + { + "epoch": 0.047709106480159864, + "grad_norm": 1.4490164518356323, + "learning_rate": 4.971978520328677e-05, + "loss": 5.1677, + "step": 8022 + }, + { + "epoch": 0.04771505376344086, + "grad_norm": 1.472896695137024, + "learning_rate": 4.971971545950054e-05, + "loss": 4.9954, + "step": 8023 + }, + { + "epoch": 0.047721001046721855, + "grad_norm": 1.5845187902450562, + "learning_rate": 4.97196457070849e-05, + "loss": 5.1273, + "step": 8024 + }, + { + "epoch": 0.04772694833000286, + "grad_norm": 1.6418551206588745, + "learning_rate": 4.9719575946039887e-05, + "loss": 5.0835, + "step": 8025 + }, + { + "epoch": 0.04773289561328385, + "grad_norm": 1.379805088043213, + "learning_rate": 4.971950617636553e-05, + "loss": 5.1058, + "step": 8026 + }, + { + "epoch": 0.04773884289656485, + "grad_norm": 1.7939400672912598, + "learning_rate": 4.9719436398061835e-05, + "loss": 5.0105, + "step": 8027 + }, + { + "epoch": 0.04774479017984585, + "grad_norm": 1.5610185861587524, + "learning_rate": 4.971936661112886e-05, + "loss": 5.032, + "step": 8028 + }, + { + "epoch": 0.047750737463126844, + "grad_norm": 1.524402379989624, + "learning_rate": 4.9719296815566594e-05, + "loss": 5.1376, + "step": 8029 + }, + { + "epoch": 0.04775668474640784, + "grad_norm": 1.7448087930679321, + "learning_rate": 4.971922701137509e-05, + "loss": 4.9496, + "step": 8030 + }, + { + "epoch": 0.04776263202968884, + "grad_norm": 1.7382763624191284, + "learning_rate": 4.971915719855435e-05, + "loss": 4.9755, + "step": 8031 + }, + { + "epoch": 0.047768579312969836, + "grad_norm": 1.6728250980377197, + "learning_rate": 4.971908737710441e-05, + "loss": 5.1436, + "step": 8032 + }, + { + "epoch": 0.04777452659625083, + "grad_norm": 1.4256306886672974, + "learning_rate": 4.971901754702529e-05, + "loss": 4.9739, + "step": 8033 + }, + { + "epoch": 0.04778047387953183, + "grad_norm": 1.660714864730835, + "learning_rate": 4.971894770831702e-05, + "loss": 5.1337, + "step": 8034 + }, + { + "epoch": 0.04778642116281283, + "grad_norm": 1.5240182876586914, + "learning_rate": 4.9718877860979615e-05, + "loss": 5.1143, + "step": 8035 + }, + { + "epoch": 0.04779236844609382, + "grad_norm": 1.478852391242981, + "learning_rate": 4.971880800501311e-05, + "loss": 4.968, + "step": 8036 + }, + { + "epoch": 0.04779831572937482, + "grad_norm": 1.5343812704086304, + "learning_rate": 4.971873814041752e-05, + "loss": 4.9393, + "step": 8037 + }, + { + "epoch": 0.04780426301265582, + "grad_norm": 1.6728276014328003, + "learning_rate": 4.971866826719288e-05, + "loss": 5.0535, + "step": 8038 + }, + { + "epoch": 0.047810210295936816, + "grad_norm": 1.4831758737564087, + "learning_rate": 4.971859838533921e-05, + "loss": 5.0705, + "step": 8039 + }, + { + "epoch": 0.04781615757921781, + "grad_norm": 1.7412161827087402, + "learning_rate": 4.971852849485653e-05, + "loss": 4.9338, + "step": 8040 + }, + { + "epoch": 0.04782210486249881, + "grad_norm": 1.4696041345596313, + "learning_rate": 4.971845859574487e-05, + "loss": 5.0643, + "step": 8041 + }, + { + "epoch": 0.04782805214577981, + "grad_norm": 1.4190481901168823, + "learning_rate": 4.9718388688004235e-05, + "loss": 5.0743, + "step": 8042 + }, + { + "epoch": 0.0478339994290608, + "grad_norm": 1.513454556465149, + "learning_rate": 4.9718318771634686e-05, + "loss": 4.8832, + "step": 8043 + }, + { + "epoch": 0.047839946712341805, + "grad_norm": 1.7310774326324463, + "learning_rate": 4.9718248846636216e-05, + "loss": 4.957, + "step": 8044 + }, + { + "epoch": 0.0478458939956228, + "grad_norm": 1.4895838499069214, + "learning_rate": 4.971817891300886e-05, + "loss": 4.9121, + "step": 8045 + }, + { + "epoch": 0.047851841278903795, + "grad_norm": 1.6848632097244263, + "learning_rate": 4.9718108970752656e-05, + "loss": 5.1337, + "step": 8046 + }, + { + "epoch": 0.0478577885621848, + "grad_norm": 1.7145766019821167, + "learning_rate": 4.97180390198676e-05, + "loss": 5.1827, + "step": 8047 + }, + { + "epoch": 0.04786373584546579, + "grad_norm": 1.668140172958374, + "learning_rate": 4.971796906035374e-05, + "loss": 5.4071, + "step": 8048 + }, + { + "epoch": 0.04786968312874679, + "grad_norm": 1.6927748918533325, + "learning_rate": 4.9717899092211094e-05, + "loss": 5.4319, + "step": 8049 + }, + { + "epoch": 0.04787563041202778, + "grad_norm": 1.6696170568466187, + "learning_rate": 4.971782911543968e-05, + "loss": 5.4137, + "step": 8050 + }, + { + "epoch": 0.047881577695308784, + "grad_norm": 1.9299427270889282, + "learning_rate": 4.971775913003953e-05, + "loss": 5.6676, + "step": 8051 + }, + { + "epoch": 0.04788752497858978, + "grad_norm": 1.7163755893707275, + "learning_rate": 4.971768913601066e-05, + "loss": 5.2916, + "step": 8052 + }, + { + "epoch": 0.047893472261870774, + "grad_norm": 1.7822209596633911, + "learning_rate": 4.971761913335311e-05, + "loss": 5.6364, + "step": 8053 + }, + { + "epoch": 0.047899419545151777, + "grad_norm": 1.725375771522522, + "learning_rate": 4.971754912206689e-05, + "loss": 5.045, + "step": 8054 + }, + { + "epoch": 0.04790536682843277, + "grad_norm": 1.5243995189666748, + "learning_rate": 4.9717479102152027e-05, + "loss": 5.4691, + "step": 8055 + }, + { + "epoch": 0.04791131411171377, + "grad_norm": 1.6673872470855713, + "learning_rate": 4.971740907360854e-05, + "loss": 5.4851, + "step": 8056 + }, + { + "epoch": 0.04791726139499477, + "grad_norm": 1.6378693580627441, + "learning_rate": 4.971733903643647e-05, + "loss": 5.2574, + "step": 8057 + }, + { + "epoch": 0.047923208678275764, + "grad_norm": 1.484250545501709, + "learning_rate": 4.9717268990635835e-05, + "loss": 5.2988, + "step": 8058 + }, + { + "epoch": 0.04792915596155676, + "grad_norm": 1.626955270767212, + "learning_rate": 4.971719893620665e-05, + "loss": 5.3502, + "step": 8059 + }, + { + "epoch": 0.04793510324483776, + "grad_norm": 2.1421375274658203, + "learning_rate": 4.9717128873148954e-05, + "loss": 5.3006, + "step": 8060 + }, + { + "epoch": 0.047941050528118756, + "grad_norm": 1.5175740718841553, + "learning_rate": 4.971705880146276e-05, + "loss": 5.4144, + "step": 8061 + }, + { + "epoch": 0.04794699781139975, + "grad_norm": 1.6170361042022705, + "learning_rate": 4.9716988721148095e-05, + "loss": 5.3635, + "step": 8062 + }, + { + "epoch": 0.04795294509468075, + "grad_norm": 1.7269384860992432, + "learning_rate": 4.971691863220499e-05, + "loss": 5.2813, + "step": 8063 + }, + { + "epoch": 0.04795889237796175, + "grad_norm": 1.5144844055175781, + "learning_rate": 4.971684853463345e-05, + "loss": 5.3242, + "step": 8064 + }, + { + "epoch": 0.04796483966124274, + "grad_norm": 1.7125827074050903, + "learning_rate": 4.971677842843353e-05, + "loss": 5.2968, + "step": 8065 + }, + { + "epoch": 0.04797078694452374, + "grad_norm": 1.6067146062850952, + "learning_rate": 4.9716708313605234e-05, + "loss": 5.4446, + "step": 8066 + }, + { + "epoch": 0.04797673422780474, + "grad_norm": 1.8911150693893433, + "learning_rate": 4.9716638190148585e-05, + "loss": 5.1875, + "step": 8067 + }, + { + "epoch": 0.047982681511085735, + "grad_norm": 1.6865830421447754, + "learning_rate": 4.971656805806362e-05, + "loss": 5.1909, + "step": 8068 + }, + { + "epoch": 0.04798862879436673, + "grad_norm": 2.009566068649292, + "learning_rate": 4.9716497917350345e-05, + "loss": 4.9392, + "step": 8069 + }, + { + "epoch": 0.04799457607764773, + "grad_norm": 1.8578897714614868, + "learning_rate": 4.97164277680088e-05, + "loss": 5.3101, + "step": 8070 + }, + { + "epoch": 0.04800052336092873, + "grad_norm": 1.8935741186141968, + "learning_rate": 4.971635761003901e-05, + "loss": 5.3952, + "step": 8071 + }, + { + "epoch": 0.04800647064420972, + "grad_norm": 2.0030407905578613, + "learning_rate": 4.9716287443440994e-05, + "loss": 5.1685, + "step": 8072 + }, + { + "epoch": 0.048012417927490725, + "grad_norm": 2.0079195499420166, + "learning_rate": 4.9716217268214775e-05, + "loss": 5.4942, + "step": 8073 + }, + { + "epoch": 0.04801836521077172, + "grad_norm": 1.7105878591537476, + "learning_rate": 4.971614708436038e-05, + "loss": 5.4124, + "step": 8074 + }, + { + "epoch": 0.048024312494052715, + "grad_norm": 1.7642161846160889, + "learning_rate": 4.971607689187784e-05, + "loss": 5.3187, + "step": 8075 + }, + { + "epoch": 0.04803025977733372, + "grad_norm": 1.7304610013961792, + "learning_rate": 4.9716006690767165e-05, + "loss": 5.308, + "step": 8076 + }, + { + "epoch": 0.04803620706061471, + "grad_norm": 1.6714746952056885, + "learning_rate": 4.971593648102839e-05, + "loss": 5.4581, + "step": 8077 + }, + { + "epoch": 0.04804215434389571, + "grad_norm": 1.8008997440338135, + "learning_rate": 4.971586626266154e-05, + "loss": 5.3266, + "step": 8078 + }, + { + "epoch": 0.0480481016271767, + "grad_norm": 1.8691446781158447, + "learning_rate": 4.971579603566663e-05, + "loss": 5.2847, + "step": 8079 + }, + { + "epoch": 0.048054048910457704, + "grad_norm": 1.7805777788162231, + "learning_rate": 4.97157258000437e-05, + "loss": 5.446, + "step": 8080 + }, + { + "epoch": 0.0480599961937387, + "grad_norm": 1.4973244667053223, + "learning_rate": 4.971565555579275e-05, + "loss": 5.412, + "step": 8081 + }, + { + "epoch": 0.048065943477019694, + "grad_norm": 1.5994775295257568, + "learning_rate": 4.971558530291384e-05, + "loss": 5.3285, + "step": 8082 + }, + { + "epoch": 0.048071890760300696, + "grad_norm": 1.7743935585021973, + "learning_rate": 4.971551504140696e-05, + "loss": 5.326, + "step": 8083 + }, + { + "epoch": 0.04807783804358169, + "grad_norm": 1.5922112464904785, + "learning_rate": 4.9715444771272154e-05, + "loss": 5.3338, + "step": 8084 + }, + { + "epoch": 0.04808378532686269, + "grad_norm": 1.5587191581726074, + "learning_rate": 4.971537449250944e-05, + "loss": 5.2437, + "step": 8085 + }, + { + "epoch": 0.04808973261014369, + "grad_norm": 1.4972636699676514, + "learning_rate": 4.971530420511884e-05, + "loss": 5.2271, + "step": 8086 + }, + { + "epoch": 0.048095679893424684, + "grad_norm": 1.6221843957901, + "learning_rate": 4.971523390910039e-05, + "loss": 5.3225, + "step": 8087 + }, + { + "epoch": 0.04810162717670568, + "grad_norm": 1.5826990604400635, + "learning_rate": 4.971516360445411e-05, + "loss": 5.2955, + "step": 8088 + }, + { + "epoch": 0.04810757445998668, + "grad_norm": 1.729963779449463, + "learning_rate": 4.971509329118001e-05, + "loss": 5.3263, + "step": 8089 + }, + { + "epoch": 0.048113521743267676, + "grad_norm": 1.680851697921753, + "learning_rate": 4.971502296927813e-05, + "loss": 5.3579, + "step": 8090 + }, + { + "epoch": 0.04811946902654867, + "grad_norm": 2.028024673461914, + "learning_rate": 4.9714952638748504e-05, + "loss": 5.3632, + "step": 8091 + }, + { + "epoch": 0.04812541630982967, + "grad_norm": 1.6236159801483154, + "learning_rate": 4.9714882299591127e-05, + "loss": 5.222, + "step": 8092 + }, + { + "epoch": 0.04813136359311067, + "grad_norm": 1.7522811889648438, + "learning_rate": 4.971481195180605e-05, + "loss": 5.3752, + "step": 8093 + }, + { + "epoch": 0.04813731087639166, + "grad_norm": 1.7108362913131714, + "learning_rate": 4.9714741595393274e-05, + "loss": 5.2994, + "step": 8094 + }, + { + "epoch": 0.04814325815967266, + "grad_norm": 1.7863954305648804, + "learning_rate": 4.971467123035285e-05, + "loss": 5.2386, + "step": 8095 + }, + { + "epoch": 0.04814920544295366, + "grad_norm": 2.0054473876953125, + "learning_rate": 4.971460085668479e-05, + "loss": 5.3565, + "step": 8096 + }, + { + "epoch": 0.048155152726234655, + "grad_norm": 1.6878743171691895, + "learning_rate": 4.971453047438911e-05, + "loss": 5.3448, + "step": 8097 + }, + { + "epoch": 0.04816110000951565, + "grad_norm": 1.8534557819366455, + "learning_rate": 4.971446008346585e-05, + "loss": 5.1446, + "step": 8098 + }, + { + "epoch": 0.04816704729279665, + "grad_norm": 1.8549425601959229, + "learning_rate": 4.9714389683915025e-05, + "loss": 5.2433, + "step": 8099 + }, + { + "epoch": 0.04817299457607765, + "grad_norm": 1.5624927282333374, + "learning_rate": 4.9714319275736666e-05, + "loss": 5.0645, + "step": 8100 + }, + { + "epoch": 0.04817894185935864, + "grad_norm": 1.670462965965271, + "learning_rate": 4.971424885893078e-05, + "loss": 5.1213, + "step": 8101 + }, + { + "epoch": 0.048184889142639645, + "grad_norm": 2.039595603942871, + "learning_rate": 4.9714178433497414e-05, + "loss": 5.1797, + "step": 8102 + }, + { + "epoch": 0.04819083642592064, + "grad_norm": 1.9546380043029785, + "learning_rate": 4.971410799943659e-05, + "loss": 5.2432, + "step": 8103 + }, + { + "epoch": 0.048196783709201635, + "grad_norm": 1.892397403717041, + "learning_rate": 4.971403755674832e-05, + "loss": 5.1775, + "step": 8104 + }, + { + "epoch": 0.04820273099248264, + "grad_norm": 1.7021955251693726, + "learning_rate": 4.971396710543263e-05, + "loss": 5.2242, + "step": 8105 + }, + { + "epoch": 0.04820867827576363, + "grad_norm": 1.7652686834335327, + "learning_rate": 4.9713896645489556e-05, + "loss": 5.1419, + "step": 8106 + }, + { + "epoch": 0.04821462555904463, + "grad_norm": 1.8669620752334595, + "learning_rate": 4.971382617691911e-05, + "loss": 5.1392, + "step": 8107 + }, + { + "epoch": 0.04822057284232562, + "grad_norm": 1.8774491548538208, + "learning_rate": 4.971375569972133e-05, + "loss": 5.1853, + "step": 8108 + }, + { + "epoch": 0.048226520125606624, + "grad_norm": 1.6108628511428833, + "learning_rate": 4.971368521389623e-05, + "loss": 5.4858, + "step": 8109 + }, + { + "epoch": 0.04823246740888762, + "grad_norm": 1.6839191913604736, + "learning_rate": 4.9713614719443835e-05, + "loss": 5.4217, + "step": 8110 + }, + { + "epoch": 0.048238414692168614, + "grad_norm": 1.9300925731658936, + "learning_rate": 4.9713544216364176e-05, + "loss": 5.2259, + "step": 8111 + }, + { + "epoch": 0.048244361975449616, + "grad_norm": 1.9142355918884277, + "learning_rate": 4.971347370465728e-05, + "loss": 5.2, + "step": 8112 + }, + { + "epoch": 0.04825030925873061, + "grad_norm": 1.8046603202819824, + "learning_rate": 4.971340318432315e-05, + "loss": 5.0951, + "step": 8113 + }, + { + "epoch": 0.04825625654201161, + "grad_norm": 1.9129396677017212, + "learning_rate": 4.971333265536184e-05, + "loss": 5.0376, + "step": 8114 + }, + { + "epoch": 0.04826220382529261, + "grad_norm": 1.6774524450302124, + "learning_rate": 4.971326211777335e-05, + "loss": 5.4313, + "step": 8115 + }, + { + "epoch": 0.048268151108573604, + "grad_norm": 1.8156472444534302, + "learning_rate": 4.971319157155773e-05, + "loss": 5.4336, + "step": 8116 + }, + { + "epoch": 0.0482740983918546, + "grad_norm": 1.5704171657562256, + "learning_rate": 4.9713121016714976e-05, + "loss": 5.6878, + "step": 8117 + }, + { + "epoch": 0.0482800456751356, + "grad_norm": 1.585528016090393, + "learning_rate": 4.9713050453245135e-05, + "loss": 5.6208, + "step": 8118 + }, + { + "epoch": 0.048285992958416596, + "grad_norm": 1.3975930213928223, + "learning_rate": 4.9712979881148215e-05, + "loss": 5.8001, + "step": 8119 + }, + { + "epoch": 0.04829194024169759, + "grad_norm": 1.8124761581420898, + "learning_rate": 4.971290930042426e-05, + "loss": 5.6006, + "step": 8120 + }, + { + "epoch": 0.04829788752497859, + "grad_norm": 1.8448232412338257, + "learning_rate": 4.971283871107327e-05, + "loss": 5.4324, + "step": 8121 + }, + { + "epoch": 0.04830383480825959, + "grad_norm": 1.772218108177185, + "learning_rate": 4.97127681130953e-05, + "loss": 6.0943, + "step": 8122 + }, + { + "epoch": 0.04830978209154058, + "grad_norm": 2.038703441619873, + "learning_rate": 4.9712697506490345e-05, + "loss": 5.4224, + "step": 8123 + }, + { + "epoch": 0.04831572937482158, + "grad_norm": 1.576430320739746, + "learning_rate": 4.971262689125845e-05, + "loss": 5.351, + "step": 8124 + }, + { + "epoch": 0.04832167665810258, + "grad_norm": 1.857021450996399, + "learning_rate": 4.971255626739963e-05, + "loss": 5.258, + "step": 8125 + }, + { + "epoch": 0.048327623941383575, + "grad_norm": 1.7989404201507568, + "learning_rate": 4.971248563491391e-05, + "loss": 5.3925, + "step": 8126 + }, + { + "epoch": 0.04833357122466457, + "grad_norm": 1.8104023933410645, + "learning_rate": 4.9712414993801314e-05, + "loss": 5.4326, + "step": 8127 + }, + { + "epoch": 0.04833951850794557, + "grad_norm": 1.898054838180542, + "learning_rate": 4.971234434406188e-05, + "loss": 5.2094, + "step": 8128 + }, + { + "epoch": 0.04834546579122657, + "grad_norm": 1.436633586883545, + "learning_rate": 4.971227368569561e-05, + "loss": 5.2994, + "step": 8129 + }, + { + "epoch": 0.04835141307450756, + "grad_norm": 1.4576120376586914, + "learning_rate": 4.971220301870255e-05, + "loss": 5.3504, + "step": 8130 + }, + { + "epoch": 0.048357360357788565, + "grad_norm": 1.7260229587554932, + "learning_rate": 4.971213234308271e-05, + "loss": 5.1083, + "step": 8131 + }, + { + "epoch": 0.04836330764106956, + "grad_norm": 1.8110415935516357, + "learning_rate": 4.971206165883612e-05, + "loss": 5.1298, + "step": 8132 + }, + { + "epoch": 0.048369254924350555, + "grad_norm": 2.1696786880493164, + "learning_rate": 4.9711990965962804e-05, + "loss": 5.8155, + "step": 8133 + }, + { + "epoch": 0.04837520220763156, + "grad_norm": 1.9905856847763062, + "learning_rate": 4.971192026446279e-05, + "loss": 5.5814, + "step": 8134 + }, + { + "epoch": 0.04838114949091255, + "grad_norm": 1.7459521293640137, + "learning_rate": 4.97118495543361e-05, + "loss": 5.4358, + "step": 8135 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 1.8495198488235474, + "learning_rate": 4.9711778835582756e-05, + "loss": 5.3652, + "step": 8136 + }, + { + "epoch": 0.04839304405747455, + "grad_norm": 1.782850742340088, + "learning_rate": 4.971170810820279e-05, + "loss": 5.2361, + "step": 8137 + }, + { + "epoch": 0.048398991340755544, + "grad_norm": 1.7327016592025757, + "learning_rate": 4.971163737219622e-05, + "loss": 5.0802, + "step": 8138 + }, + { + "epoch": 0.04840493862403654, + "grad_norm": 1.663620114326477, + "learning_rate": 4.9711566627563066e-05, + "loss": 5.1566, + "step": 8139 + }, + { + "epoch": 0.048410885907317534, + "grad_norm": 1.5109026432037354, + "learning_rate": 4.971149587430336e-05, + "loss": 5.1499, + "step": 8140 + }, + { + "epoch": 0.048416833190598536, + "grad_norm": 1.3494226932525635, + "learning_rate": 4.971142511241714e-05, + "loss": 5.1684, + "step": 8141 + }, + { + "epoch": 0.04842278047387953, + "grad_norm": 1.721880555152893, + "learning_rate": 4.97113543419044e-05, + "loss": 5.0199, + "step": 8142 + }, + { + "epoch": 0.048428727757160527, + "grad_norm": 1.7465516328811646, + "learning_rate": 4.971128356276519e-05, + "loss": 5.1181, + "step": 8143 + }, + { + "epoch": 0.04843467504044153, + "grad_norm": 1.8127025365829468, + "learning_rate": 4.971121277499953e-05, + "loss": 5.6514, + "step": 8144 + }, + { + "epoch": 0.048440622323722524, + "grad_norm": 1.6027450561523438, + "learning_rate": 4.971114197860743e-05, + "loss": 5.3408, + "step": 8145 + }, + { + "epoch": 0.04844656960700352, + "grad_norm": 1.6985208988189697, + "learning_rate": 4.971107117358894e-05, + "loss": 5.2002, + "step": 8146 + }, + { + "epoch": 0.04845251689028452, + "grad_norm": 1.681305170059204, + "learning_rate": 4.971100035994406e-05, + "loss": 5.1389, + "step": 8147 + }, + { + "epoch": 0.048458464173565516, + "grad_norm": 1.6053674221038818, + "learning_rate": 4.971092953767282e-05, + "loss": 5.0665, + "step": 8148 + }, + { + "epoch": 0.04846441145684651, + "grad_norm": 1.743134617805481, + "learning_rate": 4.9710858706775266e-05, + "loss": 5.1427, + "step": 8149 + }, + { + "epoch": 0.04847035874012751, + "grad_norm": 1.4901342391967773, + "learning_rate": 4.9710787867251396e-05, + "loss": 5.1957, + "step": 8150 + }, + { + "epoch": 0.04847630602340851, + "grad_norm": 1.6003857851028442, + "learning_rate": 4.971071701910125e-05, + "loss": 5.0658, + "step": 8151 + }, + { + "epoch": 0.0484822533066895, + "grad_norm": 1.7036428451538086, + "learning_rate": 4.971064616232484e-05, + "loss": 5.0823, + "step": 8152 + }, + { + "epoch": 0.0484882005899705, + "grad_norm": 1.5894789695739746, + "learning_rate": 4.97105752969222e-05, + "loss": 5.093, + "step": 8153 + }, + { + "epoch": 0.0484941478732515, + "grad_norm": 1.487648367881775, + "learning_rate": 4.9710504422893364e-05, + "loss": 5.0089, + "step": 8154 + }, + { + "epoch": 0.048500095156532495, + "grad_norm": 2.0251479148864746, + "learning_rate": 4.971043354023834e-05, + "loss": 5.0552, + "step": 8155 + }, + { + "epoch": 0.04850604243981349, + "grad_norm": 1.7097325325012207, + "learning_rate": 4.971036264895715e-05, + "loss": 5.2737, + "step": 8156 + }, + { + "epoch": 0.04851198972309449, + "grad_norm": 1.784836769104004, + "learning_rate": 4.971029174904984e-05, + "loss": 5.2863, + "step": 8157 + }, + { + "epoch": 0.04851793700637549, + "grad_norm": 1.4765781164169312, + "learning_rate": 4.9710220840516416e-05, + "loss": 5.4057, + "step": 8158 + }, + { + "epoch": 0.04852388428965648, + "grad_norm": 1.4173041582107544, + "learning_rate": 4.9710149923356915e-05, + "loss": 5.187, + "step": 8159 + }, + { + "epoch": 0.048529831572937485, + "grad_norm": 1.488173007965088, + "learning_rate": 4.971007899757135e-05, + "loss": 4.975, + "step": 8160 + }, + { + "epoch": 0.04853577885621848, + "grad_norm": 1.391435980796814, + "learning_rate": 4.9710008063159756e-05, + "loss": 5.0782, + "step": 8161 + }, + { + "epoch": 0.048541726139499475, + "grad_norm": 1.7100436687469482, + "learning_rate": 4.970993712012215e-05, + "loss": 5.4953, + "step": 8162 + }, + { + "epoch": 0.04854767342278048, + "grad_norm": 1.8748459815979004, + "learning_rate": 4.970986616845856e-05, + "loss": 5.4535, + "step": 8163 + }, + { + "epoch": 0.04855362070606147, + "grad_norm": 1.901802897453308, + "learning_rate": 4.970979520816902e-05, + "loss": 5.3619, + "step": 8164 + }, + { + "epoch": 0.04855956798934247, + "grad_norm": 1.9850586652755737, + "learning_rate": 4.970972423925354e-05, + "loss": 5.039, + "step": 8165 + }, + { + "epoch": 0.04856551527262347, + "grad_norm": 1.5195177793502808, + "learning_rate": 4.970965326171214e-05, + "loss": 5.1721, + "step": 8166 + }, + { + "epoch": 0.048571462555904464, + "grad_norm": 1.4180214405059814, + "learning_rate": 4.9709582275544866e-05, + "loss": 5.2319, + "step": 8167 + }, + { + "epoch": 0.04857740983918546, + "grad_norm": 1.3797354698181152, + "learning_rate": 4.970951128075173e-05, + "loss": 5.1813, + "step": 8168 + }, + { + "epoch": 0.048583357122466454, + "grad_norm": 1.6448336839675903, + "learning_rate": 4.970944027733276e-05, + "loss": 5.1968, + "step": 8169 + }, + { + "epoch": 0.048589304405747456, + "grad_norm": 1.6626337766647339, + "learning_rate": 4.9709369265287986e-05, + "loss": 5.1303, + "step": 8170 + }, + { + "epoch": 0.04859525168902845, + "grad_norm": 1.5715514421463013, + "learning_rate": 4.970929824461742e-05, + "loss": 5.1609, + "step": 8171 + }, + { + "epoch": 0.048601198972309446, + "grad_norm": 1.5971697568893433, + "learning_rate": 4.970922721532108e-05, + "loss": 5.1489, + "step": 8172 + }, + { + "epoch": 0.04860714625559045, + "grad_norm": 1.6784114837646484, + "learning_rate": 4.970915617739903e-05, + "loss": 5.2778, + "step": 8173 + }, + { + "epoch": 0.048613093538871444, + "grad_norm": 1.7507476806640625, + "learning_rate": 4.970908513085125e-05, + "loss": 5.5719, + "step": 8174 + }, + { + "epoch": 0.04861904082215244, + "grad_norm": 1.7017735242843628, + "learning_rate": 4.970901407567779e-05, + "loss": 5.5197, + "step": 8175 + }, + { + "epoch": 0.04862498810543344, + "grad_norm": 1.8569817543029785, + "learning_rate": 4.9708943011878674e-05, + "loss": 5.3823, + "step": 8176 + }, + { + "epoch": 0.048630935388714436, + "grad_norm": 1.5183817148208618, + "learning_rate": 4.970887193945391e-05, + "loss": 5.5518, + "step": 8177 + }, + { + "epoch": 0.04863688267199543, + "grad_norm": 1.4175498485565186, + "learning_rate": 4.970880085840354e-05, + "loss": 5.4526, + "step": 8178 + }, + { + "epoch": 0.04864282995527643, + "grad_norm": 1.7228561639785767, + "learning_rate": 4.970872976872758e-05, + "loss": 5.5162, + "step": 8179 + }, + { + "epoch": 0.04864877723855743, + "grad_norm": 2.043182849884033, + "learning_rate": 4.970865867042606e-05, + "loss": 5.4212, + "step": 8180 + }, + { + "epoch": 0.04865472452183842, + "grad_norm": 1.377565622329712, + "learning_rate": 4.970858756349901e-05, + "loss": 5.2817, + "step": 8181 + }, + { + "epoch": 0.04866067180511942, + "grad_norm": 1.6977208852767944, + "learning_rate": 4.970851644794643e-05, + "loss": 5.4081, + "step": 8182 + }, + { + "epoch": 0.04866661908840042, + "grad_norm": 1.3136184215545654, + "learning_rate": 4.970844532376838e-05, + "loss": 5.4272, + "step": 8183 + }, + { + "epoch": 0.048672566371681415, + "grad_norm": 1.8863121271133423, + "learning_rate": 4.9708374190964854e-05, + "loss": 5.441, + "step": 8184 + }, + { + "epoch": 0.04867851365496241, + "grad_norm": 1.6755374670028687, + "learning_rate": 4.97083030495359e-05, + "loss": 5.5045, + "step": 8185 + }, + { + "epoch": 0.04868446093824341, + "grad_norm": 1.8439961671829224, + "learning_rate": 4.970823189948153e-05, + "loss": 5.5252, + "step": 8186 + }, + { + "epoch": 0.04869040822152441, + "grad_norm": 1.9662889242172241, + "learning_rate": 4.9708160740801765e-05, + "loss": 5.4379, + "step": 8187 + }, + { + "epoch": 0.0486963555048054, + "grad_norm": 1.691857099533081, + "learning_rate": 4.970808957349664e-05, + "loss": 5.3652, + "step": 8188 + }, + { + "epoch": 0.048702302788086405, + "grad_norm": 1.7482357025146484, + "learning_rate": 4.970801839756618e-05, + "loss": 5.1436, + "step": 8189 + }, + { + "epoch": 0.0487082500713674, + "grad_norm": 1.9221199750900269, + "learning_rate": 4.9707947213010396e-05, + "loss": 5.1936, + "step": 8190 + }, + { + "epoch": 0.048714197354648395, + "grad_norm": 1.9124062061309814, + "learning_rate": 4.970787601982933e-05, + "loss": 5.28, + "step": 8191 + }, + { + "epoch": 0.0487201446379294, + "grad_norm": 1.8999123573303223, + "learning_rate": 4.9707804818023e-05, + "loss": 5.3262, + "step": 8192 + }, + { + "epoch": 0.04872609192121039, + "grad_norm": 1.7711995840072632, + "learning_rate": 4.970773360759143e-05, + "loss": 5.1764, + "step": 8193 + }, + { + "epoch": 0.04873203920449139, + "grad_norm": 2.122689962387085, + "learning_rate": 4.970766238853465e-05, + "loss": 5.4345, + "step": 8194 + }, + { + "epoch": 0.04873798648777239, + "grad_norm": 2.1027848720550537, + "learning_rate": 4.9707591160852675e-05, + "loss": 5.4547, + "step": 8195 + }, + { + "epoch": 0.048743933771053384, + "grad_norm": 1.6944631338119507, + "learning_rate": 4.970751992454553e-05, + "loss": 5.3638, + "step": 8196 + }, + { + "epoch": 0.04874988105433438, + "grad_norm": 1.7444918155670166, + "learning_rate": 4.9707448679613256e-05, + "loss": 5.2378, + "step": 8197 + }, + { + "epoch": 0.048755828337615374, + "grad_norm": 1.8864104747772217, + "learning_rate": 4.970737742605586e-05, + "loss": 5.3142, + "step": 8198 + }, + { + "epoch": 0.048761775620896376, + "grad_norm": 1.968748927116394, + "learning_rate": 4.970730616387338e-05, + "loss": 5.0824, + "step": 8199 + }, + { + "epoch": 0.04876772290417737, + "grad_norm": 2.166405439376831, + "learning_rate": 4.9707234893065824e-05, + "loss": 5.0999, + "step": 8200 + }, + { + "epoch": 0.048773670187458366, + "grad_norm": 1.9185746908187866, + "learning_rate": 4.970716361363323e-05, + "loss": 5.1465, + "step": 8201 + }, + { + "epoch": 0.04877961747073937, + "grad_norm": 1.9191651344299316, + "learning_rate": 4.9707092325575635e-05, + "loss": 5.0713, + "step": 8202 + }, + { + "epoch": 0.048785564754020364, + "grad_norm": 1.6470153331756592, + "learning_rate": 4.9707021028893034e-05, + "loss": 5.0816, + "step": 8203 + }, + { + "epoch": 0.04879151203730136, + "grad_norm": 1.6995042562484741, + "learning_rate": 4.9706949723585475e-05, + "loss": 5.0207, + "step": 8204 + }, + { + "epoch": 0.04879745932058236, + "grad_norm": 1.8208703994750977, + "learning_rate": 4.970687840965297e-05, + "loss": 4.9789, + "step": 8205 + }, + { + "epoch": 0.048803406603863356, + "grad_norm": 1.8558207750320435, + "learning_rate": 4.9706807087095555e-05, + "loss": 5.0655, + "step": 8206 + }, + { + "epoch": 0.04880935388714435, + "grad_norm": 1.6349478960037231, + "learning_rate": 4.9706735755913234e-05, + "loss": 5.2657, + "step": 8207 + }, + { + "epoch": 0.04881530117042535, + "grad_norm": 1.587143063545227, + "learning_rate": 4.9706664416106065e-05, + "loss": 5.0765, + "step": 8208 + }, + { + "epoch": 0.04882124845370635, + "grad_norm": 1.8467018604278564, + "learning_rate": 4.9706593067674047e-05, + "loss": 5.1458, + "step": 8209 + }, + { + "epoch": 0.04882719573698734, + "grad_norm": 1.8066186904907227, + "learning_rate": 4.9706521710617214e-05, + "loss": 5.0656, + "step": 8210 + }, + { + "epoch": 0.04883314302026834, + "grad_norm": 1.7981528043746948, + "learning_rate": 4.9706450344935586e-05, + "loss": 5.1448, + "step": 8211 + }, + { + "epoch": 0.04883909030354934, + "grad_norm": 1.8924201726913452, + "learning_rate": 4.97063789706292e-05, + "loss": 4.748, + "step": 8212 + }, + { + "epoch": 0.048845037586830335, + "grad_norm": 2.091324806213379, + "learning_rate": 4.9706307587698064e-05, + "loss": 5.6537, + "step": 8213 + }, + { + "epoch": 0.04885098487011133, + "grad_norm": 3.1737043857574463, + "learning_rate": 4.970623619614221e-05, + "loss": 5.6898, + "step": 8214 + }, + { + "epoch": 0.04885693215339233, + "grad_norm": 2.194577932357788, + "learning_rate": 4.970616479596167e-05, + "loss": 5.4958, + "step": 8215 + }, + { + "epoch": 0.04886287943667333, + "grad_norm": 2.2362759113311768, + "learning_rate": 4.970609338715646e-05, + "loss": 4.9919, + "step": 8216 + }, + { + "epoch": 0.04886882671995432, + "grad_norm": 1.703684687614441, + "learning_rate": 4.970602196972661e-05, + "loss": 4.8733, + "step": 8217 + }, + { + "epoch": 0.048874774003235325, + "grad_norm": 2.0205307006835938, + "learning_rate": 4.970595054367214e-05, + "loss": 5.1177, + "step": 8218 + }, + { + "epoch": 0.04888072128651632, + "grad_norm": 2.1270928382873535, + "learning_rate": 4.970587910899308e-05, + "loss": 5.6208, + "step": 8219 + }, + { + "epoch": 0.048886668569797315, + "grad_norm": 1.8992488384246826, + "learning_rate": 4.9705807665689455e-05, + "loss": 5.7754, + "step": 8220 + }, + { + "epoch": 0.04889261585307832, + "grad_norm": 2.279099225997925, + "learning_rate": 4.9705736213761286e-05, + "loss": 5.5924, + "step": 8221 + }, + { + "epoch": 0.04889856313635931, + "grad_norm": 1.9186346530914307, + "learning_rate": 4.9705664753208594e-05, + "loss": 5.9424, + "step": 8222 + }, + { + "epoch": 0.04890451041964031, + "grad_norm": 2.0286009311676025, + "learning_rate": 4.970559328403141e-05, + "loss": 5.8461, + "step": 8223 + }, + { + "epoch": 0.04891045770292131, + "grad_norm": 1.797555685043335, + "learning_rate": 4.970552180622977e-05, + "loss": 5.4929, + "step": 8224 + }, + { + "epoch": 0.048916404986202304, + "grad_norm": 2.4879684448242188, + "learning_rate": 4.970545031980368e-05, + "loss": 5.5253, + "step": 8225 + }, + { + "epoch": 0.0489223522694833, + "grad_norm": 2.749763011932373, + "learning_rate": 4.970537882475318e-05, + "loss": 5.6001, + "step": 8226 + }, + { + "epoch": 0.048928299552764294, + "grad_norm": 2.2076292037963867, + "learning_rate": 4.970530732107827e-05, + "loss": 5.5876, + "step": 8227 + }, + { + "epoch": 0.048934246836045296, + "grad_norm": 2.6566662788391113, + "learning_rate": 4.970523580877901e-05, + "loss": 5.7151, + "step": 8228 + }, + { + "epoch": 0.04894019411932629, + "grad_norm": 2.4873850345611572, + "learning_rate": 4.97051642878554e-05, + "loss": 5.7124, + "step": 8229 + }, + { + "epoch": 0.048946141402607286, + "grad_norm": 1.8365200757980347, + "learning_rate": 4.970509275830748e-05, + "loss": 5.292, + "step": 8230 + }, + { + "epoch": 0.04895208868588829, + "grad_norm": 2.064730644226074, + "learning_rate": 4.9705021220135254e-05, + "loss": 5.2854, + "step": 8231 + }, + { + "epoch": 0.04895803596916928, + "grad_norm": 1.969298005104065, + "learning_rate": 4.970494967333877e-05, + "loss": 5.2113, + "step": 8232 + }, + { + "epoch": 0.04896398325245028, + "grad_norm": 1.8438071012496948, + "learning_rate": 4.9704878117918044e-05, + "loss": 5.2281, + "step": 8233 + }, + { + "epoch": 0.04896993053573128, + "grad_norm": 1.9163525104522705, + "learning_rate": 4.97048065538731e-05, + "loss": 5.043, + "step": 8234 + }, + { + "epoch": 0.048975877819012276, + "grad_norm": 1.802356243133545, + "learning_rate": 4.970473498120395e-05, + "loss": 5.2079, + "step": 8235 + }, + { + "epoch": 0.04898182510229327, + "grad_norm": 1.7572704553604126, + "learning_rate": 4.9704663399910645e-05, + "loss": 5.1119, + "step": 8236 + }, + { + "epoch": 0.04898777238557427, + "grad_norm": 1.848747730255127, + "learning_rate": 4.970459180999319e-05, + "loss": 5.0233, + "step": 8237 + }, + { + "epoch": 0.04899371966885527, + "grad_norm": 2.023036003112793, + "learning_rate": 4.9704520211451624e-05, + "loss": 5.2793, + "step": 8238 + }, + { + "epoch": 0.04899966695213626, + "grad_norm": 1.6738852262496948, + "learning_rate": 4.9704448604285965e-05, + "loss": 5.5255, + "step": 8239 + }, + { + "epoch": 0.04900561423541726, + "grad_norm": 1.6676057577133179, + "learning_rate": 4.970437698849624e-05, + "loss": 5.4287, + "step": 8240 + }, + { + "epoch": 0.04901156151869826, + "grad_norm": 1.9960590600967407, + "learning_rate": 4.970430536408247e-05, + "loss": 5.2939, + "step": 8241 + }, + { + "epoch": 0.049017508801979255, + "grad_norm": 2.7218708992004395, + "learning_rate": 4.9704233731044675e-05, + "loss": 5.9019, + "step": 8242 + }, + { + "epoch": 0.04902345608526025, + "grad_norm": 2.385664224624634, + "learning_rate": 4.970416208938289e-05, + "loss": 5.9146, + "step": 8243 + }, + { + "epoch": 0.04902940336854125, + "grad_norm": 2.2598092555999756, + "learning_rate": 4.970409043909714e-05, + "loss": 5.7451, + "step": 8244 + }, + { + "epoch": 0.04903535065182225, + "grad_norm": 2.3063299655914307, + "learning_rate": 4.970401878018745e-05, + "loss": 5.8675, + "step": 8245 + }, + { + "epoch": 0.04904129793510324, + "grad_norm": 2.1543853282928467, + "learning_rate": 4.9703947112653836e-05, + "loss": 5.9136, + "step": 8246 + }, + { + "epoch": 0.049047245218384244, + "grad_norm": 2.267531633377075, + "learning_rate": 4.970387543649634e-05, + "loss": 5.6834, + "step": 8247 + }, + { + "epoch": 0.04905319250166524, + "grad_norm": 2.047351121902466, + "learning_rate": 4.970380375171496e-05, + "loss": 5.5754, + "step": 8248 + }, + { + "epoch": 0.049059139784946235, + "grad_norm": 2.2565114498138428, + "learning_rate": 4.9703732058309745e-05, + "loss": 5.7067, + "step": 8249 + }, + { + "epoch": 0.04906508706822724, + "grad_norm": 1.7584022283554077, + "learning_rate": 4.970366035628073e-05, + "loss": 5.3926, + "step": 8250 + }, + { + "epoch": 0.04907103435150823, + "grad_norm": 1.9898183345794678, + "learning_rate": 4.9703588645627896e-05, + "loss": 5.7163, + "step": 8251 + }, + { + "epoch": 0.04907698163478923, + "grad_norm": 2.4134786128997803, + "learning_rate": 4.970351692635131e-05, + "loss": 5.672, + "step": 8252 + }, + { + "epoch": 0.04908292891807023, + "grad_norm": 2.1059436798095703, + "learning_rate": 4.970344519845097e-05, + "loss": 5.7719, + "step": 8253 + }, + { + "epoch": 0.049088876201351224, + "grad_norm": 2.0731539726257324, + "learning_rate": 4.970337346192692e-05, + "loss": 5.7104, + "step": 8254 + }, + { + "epoch": 0.04909482348463222, + "grad_norm": 2.3058536052703857, + "learning_rate": 4.970330171677918e-05, + "loss": 5.7435, + "step": 8255 + }, + { + "epoch": 0.049100770767913214, + "grad_norm": 2.051424980163574, + "learning_rate": 4.970322996300777e-05, + "loss": 5.7371, + "step": 8256 + }, + { + "epoch": 0.049106718051194216, + "grad_norm": 2.1715517044067383, + "learning_rate": 4.970315820061271e-05, + "loss": 5.5805, + "step": 8257 + }, + { + "epoch": 0.04911266533447521, + "grad_norm": 2.136617422103882, + "learning_rate": 4.9703086429594034e-05, + "loss": 5.8689, + "step": 8258 + }, + { + "epoch": 0.049118612617756206, + "grad_norm": 1.7089059352874756, + "learning_rate": 4.970301464995178e-05, + "loss": 6.0614, + "step": 8259 + }, + { + "epoch": 0.04912455990103721, + "grad_norm": 2.410067319869995, + "learning_rate": 4.970294286168595e-05, + "loss": 5.8762, + "step": 8260 + }, + { + "epoch": 0.0491305071843182, + "grad_norm": 2.2186291217803955, + "learning_rate": 4.970287106479657e-05, + "loss": 5.4903, + "step": 8261 + }, + { + "epoch": 0.0491364544675992, + "grad_norm": 2.312793016433716, + "learning_rate": 4.970279925928368e-05, + "loss": 6.2488, + "step": 8262 + }, + { + "epoch": 0.0491424017508802, + "grad_norm": 2.127859354019165, + "learning_rate": 4.9702727445147305e-05, + "loss": 5.9976, + "step": 8263 + }, + { + "epoch": 0.049148349034161196, + "grad_norm": 2.604367733001709, + "learning_rate": 4.9702655622387454e-05, + "loss": 5.4153, + "step": 8264 + }, + { + "epoch": 0.04915429631744219, + "grad_norm": 1.7832142114639282, + "learning_rate": 4.9702583791004165e-05, + "loss": 5.4024, + "step": 8265 + }, + { + "epoch": 0.04916024360072319, + "grad_norm": 2.04298734664917, + "learning_rate": 4.970251195099746e-05, + "loss": 5.7034, + "step": 8266 + }, + { + "epoch": 0.04916619088400419, + "grad_norm": 2.1806769371032715, + "learning_rate": 4.970244010236736e-05, + "loss": 6.1212, + "step": 8267 + }, + { + "epoch": 0.04917213816728518, + "grad_norm": 1.8740427494049072, + "learning_rate": 4.970236824511389e-05, + "loss": 5.7562, + "step": 8268 + }, + { + "epoch": 0.04917808545056618, + "grad_norm": 1.7718658447265625, + "learning_rate": 4.970229637923709e-05, + "loss": 5.5126, + "step": 8269 + }, + { + "epoch": 0.04918403273384718, + "grad_norm": 1.4966565370559692, + "learning_rate": 4.970222450473696e-05, + "loss": 5.5422, + "step": 8270 + }, + { + "epoch": 0.049189980017128175, + "grad_norm": 1.8283390998840332, + "learning_rate": 4.970215262161355e-05, + "loss": 5.9333, + "step": 8271 + }, + { + "epoch": 0.04919592730040917, + "grad_norm": 2.087460517883301, + "learning_rate": 4.970208072986687e-05, + "loss": 5.5413, + "step": 8272 + }, + { + "epoch": 0.04920187458369017, + "grad_norm": 2.2952873706817627, + "learning_rate": 4.970200882949694e-05, + "loss": 5.7848, + "step": 8273 + }, + { + "epoch": 0.04920782186697117, + "grad_norm": 1.9511842727661133, + "learning_rate": 4.9701936920503804e-05, + "loss": 5.6172, + "step": 8274 + }, + { + "epoch": 0.04921376915025216, + "grad_norm": 1.992211937904358, + "learning_rate": 4.970186500288748e-05, + "loss": 5.48, + "step": 8275 + }, + { + "epoch": 0.049219716433533164, + "grad_norm": 1.739013910293579, + "learning_rate": 4.9701793076647984e-05, + "loss": 5.6351, + "step": 8276 + }, + { + "epoch": 0.04922566371681416, + "grad_norm": 2.150797128677368, + "learning_rate": 4.970172114178534e-05, + "loss": 5.5957, + "step": 8277 + }, + { + "epoch": 0.049231611000095155, + "grad_norm": 2.074070930480957, + "learning_rate": 4.9701649198299594e-05, + "loss": 5.4751, + "step": 8278 + }, + { + "epoch": 0.04923755828337616, + "grad_norm": 2.2276322841644287, + "learning_rate": 4.970157724619075e-05, + "loss": 5.4434, + "step": 8279 + }, + { + "epoch": 0.04924350556665715, + "grad_norm": 1.9707896709442139, + "learning_rate": 4.970150528545884e-05, + "loss": 5.6935, + "step": 8280 + }, + { + "epoch": 0.04924945284993815, + "grad_norm": 2.07774019241333, + "learning_rate": 4.9701433316103895e-05, + "loss": 6.0455, + "step": 8281 + }, + { + "epoch": 0.04925540013321915, + "grad_norm": 2.3262722492218018, + "learning_rate": 4.970136133812593e-05, + "loss": 5.6039, + "step": 8282 + }, + { + "epoch": 0.049261347416500144, + "grad_norm": 2.4353108406066895, + "learning_rate": 4.970128935152498e-05, + "loss": 5.3823, + "step": 8283 + }, + { + "epoch": 0.04926729469978114, + "grad_norm": 2.7383084297180176, + "learning_rate": 4.970121735630106e-05, + "loss": 5.4039, + "step": 8284 + }, + { + "epoch": 0.049273241983062134, + "grad_norm": 2.9022698402404785, + "learning_rate": 4.9701145352454205e-05, + "loss": 5.3571, + "step": 8285 + }, + { + "epoch": 0.049279189266343136, + "grad_norm": 2.314373731613159, + "learning_rate": 4.970107333998443e-05, + "loss": 5.4877, + "step": 8286 + }, + { + "epoch": 0.04928513654962413, + "grad_norm": 1.9494023323059082, + "learning_rate": 4.970100131889177e-05, + "loss": 5.5171, + "step": 8287 + }, + { + "epoch": 0.049291083832905126, + "grad_norm": 2.7892074584960938, + "learning_rate": 4.9700929289176245e-05, + "loss": 5.5347, + "step": 8288 + }, + { + "epoch": 0.04929703111618613, + "grad_norm": 2.305204391479492, + "learning_rate": 4.970085725083788e-05, + "loss": 5.8689, + "step": 8289 + }, + { + "epoch": 0.04930297839946712, + "grad_norm": 2.4212634563446045, + "learning_rate": 4.97007852038767e-05, + "loss": 5.8982, + "step": 8290 + }, + { + "epoch": 0.04930892568274812, + "grad_norm": 3.584625482559204, + "learning_rate": 4.9700713148292734e-05, + "loss": 5.2341, + "step": 8291 + }, + { + "epoch": 0.04931487296602912, + "grad_norm": 2.874703884124756, + "learning_rate": 4.9700641084086e-05, + "loss": 5.2312, + "step": 8292 + }, + { + "epoch": 0.049320820249310116, + "grad_norm": 2.113234519958496, + "learning_rate": 4.9700569011256524e-05, + "loss": 5.5779, + "step": 8293 + }, + { + "epoch": 0.04932676753259111, + "grad_norm": 3.027318000793457, + "learning_rate": 4.970049692980434e-05, + "loss": 5.3899, + "step": 8294 + }, + { + "epoch": 0.04933271481587211, + "grad_norm": 2.779520273208618, + "learning_rate": 4.970042483972947e-05, + "loss": 5.4023, + "step": 8295 + }, + { + "epoch": 0.04933866209915311, + "grad_norm": 2.4358251094818115, + "learning_rate": 4.970035274103193e-05, + "loss": 5.4932, + "step": 8296 + }, + { + "epoch": 0.0493446093824341, + "grad_norm": 1.926193118095398, + "learning_rate": 4.970028063371176e-05, + "loss": 5.4058, + "step": 8297 + }, + { + "epoch": 0.0493505566657151, + "grad_norm": 1.7216569185256958, + "learning_rate": 4.970020851776898e-05, + "loss": 5.3265, + "step": 8298 + }, + { + "epoch": 0.0493565039489961, + "grad_norm": 1.9850976467132568, + "learning_rate": 4.97001363932036e-05, + "loss": 5.1626, + "step": 8299 + }, + { + "epoch": 0.049362451232277095, + "grad_norm": 2.1380982398986816, + "learning_rate": 4.9700064260015666e-05, + "loss": 5.3285, + "step": 8300 + }, + { + "epoch": 0.04936839851555809, + "grad_norm": 2.118781566619873, + "learning_rate": 4.969999211820518e-05, + "loss": 5.3544, + "step": 8301 + }, + { + "epoch": 0.04937434579883909, + "grad_norm": 2.0255584716796875, + "learning_rate": 4.96999199677722e-05, + "loss": 5.4256, + "step": 8302 + }, + { + "epoch": 0.04938029308212009, + "grad_norm": 2.0269806385040283, + "learning_rate": 4.9699847808716724e-05, + "loss": 5.9744, + "step": 8303 + }, + { + "epoch": 0.04938624036540108, + "grad_norm": 2.60446834564209, + "learning_rate": 4.969977564103879e-05, + "loss": 5.3926, + "step": 8304 + }, + { + "epoch": 0.049392187648682084, + "grad_norm": 2.1011881828308105, + "learning_rate": 4.9699703464738426e-05, + "loss": 5.4278, + "step": 8305 + }, + { + "epoch": 0.04939813493196308, + "grad_norm": 1.9267319440841675, + "learning_rate": 4.969963127981564e-05, + "loss": 5.6232, + "step": 8306 + }, + { + "epoch": 0.049404082215244075, + "grad_norm": 2.1958322525024414, + "learning_rate": 4.969955908627048e-05, + "loss": 5.8577, + "step": 8307 + }, + { + "epoch": 0.049410029498525077, + "grad_norm": 2.392241954803467, + "learning_rate": 4.969948688410294e-05, + "loss": 5.8013, + "step": 8308 + }, + { + "epoch": 0.04941597678180607, + "grad_norm": 2.8284695148468018, + "learning_rate": 4.969941467331308e-05, + "loss": 6.1246, + "step": 8309 + }, + { + "epoch": 0.04942192406508707, + "grad_norm": 2.8590078353881836, + "learning_rate": 4.96993424539009e-05, + "loss": 6.1068, + "step": 8310 + }, + { + "epoch": 0.04942787134836807, + "grad_norm": 1.876207709312439, + "learning_rate": 4.969927022586644e-05, + "loss": 5.5493, + "step": 8311 + }, + { + "epoch": 0.049433818631649064, + "grad_norm": 1.988061547279358, + "learning_rate": 4.969919798920972e-05, + "loss": 5.7059, + "step": 8312 + }, + { + "epoch": 0.04943976591493006, + "grad_norm": 2.8230605125427246, + "learning_rate": 4.969912574393077e-05, + "loss": 5.9381, + "step": 8313 + }, + { + "epoch": 0.049445713198211054, + "grad_norm": 2.4622697830200195, + "learning_rate": 4.96990534900296e-05, + "loss": 6.0935, + "step": 8314 + }, + { + "epoch": 0.049451660481492056, + "grad_norm": 2.0811798572540283, + "learning_rate": 4.9698981227506254e-05, + "loss": 6.3475, + "step": 8315 + }, + { + "epoch": 0.04945760776477305, + "grad_norm": 2.099489212036133, + "learning_rate": 4.9698908956360745e-05, + "loss": 5.7266, + "step": 8316 + }, + { + "epoch": 0.049463555048054046, + "grad_norm": 2.1711854934692383, + "learning_rate": 4.9698836676593104e-05, + "loss": 5.6067, + "step": 8317 + }, + { + "epoch": 0.04946950233133505, + "grad_norm": 2.195296287536621, + "learning_rate": 4.969876438820335e-05, + "loss": 5.3896, + "step": 8318 + }, + { + "epoch": 0.04947544961461604, + "grad_norm": 2.114830255508423, + "learning_rate": 4.969869209119151e-05, + "loss": 5.6922, + "step": 8319 + }, + { + "epoch": 0.04948139689789704, + "grad_norm": 2.1534018516540527, + "learning_rate": 4.969861978555762e-05, + "loss": 6.1372, + "step": 8320 + }, + { + "epoch": 0.04948734418117804, + "grad_norm": 2.151495933532715, + "learning_rate": 4.9698547471301696e-05, + "loss": 6.0915, + "step": 8321 + }, + { + "epoch": 0.049493291464459035, + "grad_norm": 1.8232096433639526, + "learning_rate": 4.9698475148423764e-05, + "loss": 6.1492, + "step": 8322 + }, + { + "epoch": 0.04949923874774003, + "grad_norm": 2.1538467407226562, + "learning_rate": 4.9698402816923844e-05, + "loss": 5.6253, + "step": 8323 + }, + { + "epoch": 0.04950518603102103, + "grad_norm": 2.278797149658203, + "learning_rate": 4.969833047680197e-05, + "loss": 6.0055, + "step": 8324 + }, + { + "epoch": 0.04951113331430203, + "grad_norm": 2.479342460632324, + "learning_rate": 4.9698258128058164e-05, + "loss": 5.7909, + "step": 8325 + }, + { + "epoch": 0.04951708059758302, + "grad_norm": 2.2959346771240234, + "learning_rate": 4.969818577069245e-05, + "loss": 5.6888, + "step": 8326 + }, + { + "epoch": 0.04952302788086402, + "grad_norm": 1.841544270515442, + "learning_rate": 4.969811340470486e-05, + "loss": 5.5091, + "step": 8327 + }, + { + "epoch": 0.04952897516414502, + "grad_norm": 2.4512903690338135, + "learning_rate": 4.969804103009541e-05, + "loss": 5.7271, + "step": 8328 + }, + { + "epoch": 0.049534922447426015, + "grad_norm": 2.035473585128784, + "learning_rate": 4.969796864686413e-05, + "loss": 5.3056, + "step": 8329 + }, + { + "epoch": 0.04954086973070701, + "grad_norm": 2.030576705932617, + "learning_rate": 4.9697896255011046e-05, + "loss": 5.2765, + "step": 8330 + }, + { + "epoch": 0.04954681701398801, + "grad_norm": 1.680253505706787, + "learning_rate": 4.9697823854536175e-05, + "loss": 5.1968, + "step": 8331 + }, + { + "epoch": 0.04955276429726901, + "grad_norm": 1.962259292602539, + "learning_rate": 4.969775144543955e-05, + "loss": 5.0743, + "step": 8332 + }, + { + "epoch": 0.04955871158055, + "grad_norm": 2.499044895172119, + "learning_rate": 4.96976790277212e-05, + "loss": 5.5204, + "step": 8333 + }, + { + "epoch": 0.049564658863831004, + "grad_norm": 2.004849672317505, + "learning_rate": 4.969760660138114e-05, + "loss": 5.5714, + "step": 8334 + }, + { + "epoch": 0.049570606147112, + "grad_norm": 2.255171775817871, + "learning_rate": 4.9697534166419405e-05, + "loss": 5.0766, + "step": 8335 + }, + { + "epoch": 0.049576553430392994, + "grad_norm": 2.1219112873077393, + "learning_rate": 4.969746172283601e-05, + "loss": 5.0613, + "step": 8336 + }, + { + "epoch": 0.049582500713673996, + "grad_norm": 1.9718400239944458, + "learning_rate": 4.9697389270631004e-05, + "loss": 5.0007, + "step": 8337 + }, + { + "epoch": 0.04958844799695499, + "grad_norm": 1.87917160987854, + "learning_rate": 4.969731680980437e-05, + "loss": 4.9533, + "step": 8338 + }, + { + "epoch": 0.04959439528023599, + "grad_norm": 1.9610000848770142, + "learning_rate": 4.969724434035618e-05, + "loss": 4.9761, + "step": 8339 + }, + { + "epoch": 0.04960034256351699, + "grad_norm": 1.859434723854065, + "learning_rate": 4.969717186228642e-05, + "loss": 5.2373, + "step": 8340 + }, + { + "epoch": 0.049606289846797984, + "grad_norm": 1.9905357360839844, + "learning_rate": 4.9697099375595144e-05, + "loss": 4.8858, + "step": 8341 + }, + { + "epoch": 0.04961223713007898, + "grad_norm": 1.995355486869812, + "learning_rate": 4.969702688028236e-05, + "loss": 4.9468, + "step": 8342 + }, + { + "epoch": 0.049618184413359974, + "grad_norm": 1.9970706701278687, + "learning_rate": 4.96969543763481e-05, + "loss": 4.8891, + "step": 8343 + }, + { + "epoch": 0.049624131696640976, + "grad_norm": 1.9036997556686401, + "learning_rate": 4.9696881863792385e-05, + "loss": 4.7622, + "step": 8344 + }, + { + "epoch": 0.04963007897992197, + "grad_norm": 1.9532603025436401, + "learning_rate": 4.9696809342615245e-05, + "loss": 4.7832, + "step": 8345 + }, + { + "epoch": 0.049636026263202966, + "grad_norm": 1.9032143354415894, + "learning_rate": 4.969673681281671e-05, + "loss": 4.7569, + "step": 8346 + }, + { + "epoch": 0.04964197354648397, + "grad_norm": 3.4294323921203613, + "learning_rate": 4.96966642743968e-05, + "loss": 5.9381, + "step": 8347 + }, + { + "epoch": 0.04964792082976496, + "grad_norm": 4.137698173522949, + "learning_rate": 4.969659172735554e-05, + "loss": 6.4081, + "step": 8348 + }, + { + "epoch": 0.04965386811304596, + "grad_norm": 2.774838447570801, + "learning_rate": 4.969651917169295e-05, + "loss": 5.9888, + "step": 8349 + }, + { + "epoch": 0.04965981539632696, + "grad_norm": 2.4056432247161865, + "learning_rate": 4.9696446607409054e-05, + "loss": 6.1239, + "step": 8350 + }, + { + "epoch": 0.049665762679607955, + "grad_norm": 2.098475456237793, + "learning_rate": 4.969637403450389e-05, + "loss": 6.4226, + "step": 8351 + }, + { + "epoch": 0.04967170996288895, + "grad_norm": 2.1402597427368164, + "learning_rate": 4.9696301452977475e-05, + "loss": 5.8836, + "step": 8352 + }, + { + "epoch": 0.04967765724616995, + "grad_norm": 2.8023130893707275, + "learning_rate": 4.9696228862829844e-05, + "loss": 6.2452, + "step": 8353 + }, + { + "epoch": 0.04968360452945095, + "grad_norm": 2.7669503688812256, + "learning_rate": 4.9696156264061e-05, + "loss": 6.0093, + "step": 8354 + }, + { + "epoch": 0.04968955181273194, + "grad_norm": 2.2357375621795654, + "learning_rate": 4.9696083656671e-05, + "loss": 6.0614, + "step": 8355 + }, + { + "epoch": 0.049695499096012945, + "grad_norm": 2.1435539722442627, + "learning_rate": 4.969601104065984e-05, + "loss": 6.0718, + "step": 8356 + }, + { + "epoch": 0.04970144637929394, + "grad_norm": 2.6372897624969482, + "learning_rate": 4.969593841602757e-05, + "loss": 5.4878, + "step": 8357 + }, + { + "epoch": 0.049707393662574935, + "grad_norm": 1.9730110168457031, + "learning_rate": 4.9695865782774186e-05, + "loss": 5.8913, + "step": 8358 + }, + { + "epoch": 0.04971334094585593, + "grad_norm": 2.262437105178833, + "learning_rate": 4.9695793140899737e-05, + "loss": 5.0382, + "step": 8359 + }, + { + "epoch": 0.04971928822913693, + "grad_norm": 1.794268250465393, + "learning_rate": 4.9695720490404254e-05, + "loss": 5.784, + "step": 8360 + }, + { + "epoch": 0.04972523551241793, + "grad_norm": 1.9568414688110352, + "learning_rate": 4.969564783128773e-05, + "loss": 5.8939, + "step": 8361 + }, + { + "epoch": 0.04973118279569892, + "grad_norm": 2.0560479164123535, + "learning_rate": 4.969557516355022e-05, + "loss": 5.8806, + "step": 8362 + }, + { + "epoch": 0.049737130078979924, + "grad_norm": 1.9009175300598145, + "learning_rate": 4.9695502487191746e-05, + "loss": 5.5568, + "step": 8363 + }, + { + "epoch": 0.04974307736226092, + "grad_norm": 2.1240882873535156, + "learning_rate": 4.9695429802212325e-05, + "loss": 5.4514, + "step": 8364 + }, + { + "epoch": 0.049749024645541914, + "grad_norm": 2.0803675651550293, + "learning_rate": 4.969535710861198e-05, + "loss": 5.7679, + "step": 8365 + }, + { + "epoch": 0.049754971928822916, + "grad_norm": 1.9357428550720215, + "learning_rate": 4.969528440639074e-05, + "loss": 6.1658, + "step": 8366 + }, + { + "epoch": 0.04976091921210391, + "grad_norm": 1.89462411403656, + "learning_rate": 4.9695211695548635e-05, + "loss": 6.0559, + "step": 8367 + }, + { + "epoch": 0.04976686649538491, + "grad_norm": 1.5986123085021973, + "learning_rate": 4.969513897608569e-05, + "loss": 5.7787, + "step": 8368 + }, + { + "epoch": 0.04977281377866591, + "grad_norm": 2.0391738414764404, + "learning_rate": 4.969506624800192e-05, + "loss": 5.5559, + "step": 8369 + }, + { + "epoch": 0.049778761061946904, + "grad_norm": 2.1463794708251953, + "learning_rate": 4.969499351129736e-05, + "loss": 5.5734, + "step": 8370 + }, + { + "epoch": 0.0497847083452279, + "grad_norm": 2.1488826274871826, + "learning_rate": 4.969492076597203e-05, + "loss": 5.7502, + "step": 8371 + }, + { + "epoch": 0.049790655628508894, + "grad_norm": 2.214439868927002, + "learning_rate": 4.9694848012025966e-05, + "loss": 5.8829, + "step": 8372 + }, + { + "epoch": 0.049796602911789896, + "grad_norm": 2.366196632385254, + "learning_rate": 4.969477524945918e-05, + "loss": 5.3428, + "step": 8373 + }, + { + "epoch": 0.04980255019507089, + "grad_norm": 2.239044189453125, + "learning_rate": 4.96947024782717e-05, + "loss": 5.7258, + "step": 8374 + }, + { + "epoch": 0.049808497478351886, + "grad_norm": 2.315492868423462, + "learning_rate": 4.9694629698463554e-05, + "loss": 5.6542, + "step": 8375 + }, + { + "epoch": 0.04981444476163289, + "grad_norm": 2.340740919113159, + "learning_rate": 4.969455691003478e-05, + "loss": 5.0699, + "step": 8376 + }, + { + "epoch": 0.04982039204491388, + "grad_norm": 2.644800901412964, + "learning_rate": 4.9694484112985386e-05, + "loss": 5.3808, + "step": 8377 + }, + { + "epoch": 0.04982633932819488, + "grad_norm": 2.7073781490325928, + "learning_rate": 4.96944113073154e-05, + "loss": 5.5233, + "step": 8378 + }, + { + "epoch": 0.04983228661147588, + "grad_norm": 2.5480713844299316, + "learning_rate": 4.969433849302485e-05, + "loss": 5.3908, + "step": 8379 + }, + { + "epoch": 0.049838233894756875, + "grad_norm": 2.494356155395508, + "learning_rate": 4.969426567011376e-05, + "loss": 5.3528, + "step": 8380 + }, + { + "epoch": 0.04984418117803787, + "grad_norm": 2.4249942302703857, + "learning_rate": 4.9694192838582155e-05, + "loss": 5.2995, + "step": 8381 + }, + { + "epoch": 0.04985012846131887, + "grad_norm": 2.5930840969085693, + "learning_rate": 4.9694119998430066e-05, + "loss": 6.0202, + "step": 8382 + }, + { + "epoch": 0.04985607574459987, + "grad_norm": 2.391972541809082, + "learning_rate": 4.969404714965752e-05, + "loss": 6.0247, + "step": 8383 + }, + { + "epoch": 0.04986202302788086, + "grad_norm": 2.2849159240722656, + "learning_rate": 4.9693974292264535e-05, + "loss": 5.892, + "step": 8384 + }, + { + "epoch": 0.049867970311161865, + "grad_norm": 2.1887097358703613, + "learning_rate": 4.9693901426251134e-05, + "loss": 6.0196, + "step": 8385 + }, + { + "epoch": 0.04987391759444286, + "grad_norm": 2.3988685607910156, + "learning_rate": 4.969382855161735e-05, + "loss": 5.5596, + "step": 8386 + }, + { + "epoch": 0.049879864877723855, + "grad_norm": 2.675144910812378, + "learning_rate": 4.9693755668363204e-05, + "loss": 5.3495, + "step": 8387 + }, + { + "epoch": 0.04988581216100485, + "grad_norm": 2.3753585815429688, + "learning_rate": 4.969368277648873e-05, + "loss": 5.8823, + "step": 8388 + }, + { + "epoch": 0.04989175944428585, + "grad_norm": 2.3168766498565674, + "learning_rate": 4.969360987599394e-05, + "loss": 5.9768, + "step": 8389 + }, + { + "epoch": 0.04989770672756685, + "grad_norm": 2.427138566970825, + "learning_rate": 4.969353696687886e-05, + "loss": 6.1823, + "step": 8390 + }, + { + "epoch": 0.04990365401084784, + "grad_norm": 2.304731845855713, + "learning_rate": 4.9693464049143526e-05, + "loss": 5.8697, + "step": 8391 + }, + { + "epoch": 0.049909601294128844, + "grad_norm": 2.2139687538146973, + "learning_rate": 4.9693391122787966e-05, + "loss": 6.0274, + "step": 8392 + }, + { + "epoch": 0.04991554857740984, + "grad_norm": 2.1165316104888916, + "learning_rate": 4.9693318187812185e-05, + "loss": 5.2499, + "step": 8393 + }, + { + "epoch": 0.049921495860690834, + "grad_norm": 2.5213639736175537, + "learning_rate": 4.969324524421624e-05, + "loss": 4.9105, + "step": 8394 + }, + { + "epoch": 0.049927443143971836, + "grad_norm": 2.2188315391540527, + "learning_rate": 4.9693172292000125e-05, + "loss": 4.8652, + "step": 8395 + }, + { + "epoch": 0.04993339042725283, + "grad_norm": 2.393179416656494, + "learning_rate": 4.9693099331163886e-05, + "loss": 4.924, + "step": 8396 + }, + { + "epoch": 0.04993933771053383, + "grad_norm": 2.150264024734497, + "learning_rate": 4.969302636170753e-05, + "loss": 4.9168, + "step": 8397 + }, + { + "epoch": 0.04994528499381483, + "grad_norm": 2.252499580383301, + "learning_rate": 4.96929533836311e-05, + "loss": 4.7822, + "step": 8398 + }, + { + "epoch": 0.049951232277095824, + "grad_norm": 2.342132806777954, + "learning_rate": 4.969288039693461e-05, + "loss": 5.3691, + "step": 8399 + }, + { + "epoch": 0.04995717956037682, + "grad_norm": 2.3533523082733154, + "learning_rate": 4.96928074016181e-05, + "loss": 5.9989, + "step": 8400 + }, + { + "epoch": 0.049963126843657814, + "grad_norm": 2.185727834701538, + "learning_rate": 4.969273439768158e-05, + "loss": 5.6101, + "step": 8401 + }, + { + "epoch": 0.049969074126938816, + "grad_norm": 2.3396189212799072, + "learning_rate": 4.969266138512509e-05, + "loss": 5.845, + "step": 8402 + }, + { + "epoch": 0.04997502141021981, + "grad_norm": 2.2145371437072754, + "learning_rate": 4.969258836394864e-05, + "loss": 5.6657, + "step": 8403 + }, + { + "epoch": 0.049980968693500806, + "grad_norm": 2.2084364891052246, + "learning_rate": 4.969251533415226e-05, + "loss": 5.8823, + "step": 8404 + }, + { + "epoch": 0.04998691597678181, + "grad_norm": 1.7423903942108154, + "learning_rate": 4.9692442295735984e-05, + "loss": 5.8209, + "step": 8405 + }, + { + "epoch": 0.0499928632600628, + "grad_norm": 2.3057217597961426, + "learning_rate": 4.9692369248699824e-05, + "loss": 5.8352, + "step": 8406 + }, + { + "epoch": 0.0499988105433438, + "grad_norm": 2.1800148487091064, + "learning_rate": 4.969229619304382e-05, + "loss": 5.783, + "step": 8407 + }, + { + "epoch": 0.0500047578266248, + "grad_norm": 1.8594306707382202, + "learning_rate": 4.969222312876799e-05, + "loss": 6.01, + "step": 8408 + }, + { + "epoch": 0.050010705109905795, + "grad_norm": 2.119917392730713, + "learning_rate": 4.9692150055872355e-05, + "loss": 5.7282, + "step": 8409 + }, + { + "epoch": 0.05001665239318679, + "grad_norm": 2.5282747745513916, + "learning_rate": 4.969207697435695e-05, + "loss": 5.0853, + "step": 8410 + }, + { + "epoch": 0.05002259967646779, + "grad_norm": 2.5683388710021973, + "learning_rate": 4.969200388422179e-05, + "loss": 4.9841, + "step": 8411 + }, + { + "epoch": 0.05002854695974879, + "grad_norm": 2.649918794631958, + "learning_rate": 4.969193078546692e-05, + "loss": 5.6365, + "step": 8412 + }, + { + "epoch": 0.05003449424302978, + "grad_norm": 2.3040120601654053, + "learning_rate": 4.969185767809234e-05, + "loss": 5.8272, + "step": 8413 + }, + { + "epoch": 0.050040441526310785, + "grad_norm": 2.033600330352783, + "learning_rate": 4.9691784562098084e-05, + "loss": 5.9779, + "step": 8414 + }, + { + "epoch": 0.05004638880959178, + "grad_norm": 2.1903419494628906, + "learning_rate": 4.96917114374842e-05, + "loss": 5.8651, + "step": 8415 + }, + { + "epoch": 0.050052336092872775, + "grad_norm": 2.4431047439575195, + "learning_rate": 4.969163830425068e-05, + "loss": 4.7787, + "step": 8416 + }, + { + "epoch": 0.05005828337615377, + "grad_norm": 2.6652824878692627, + "learning_rate": 4.969156516239756e-05, + "loss": 4.7133, + "step": 8417 + }, + { + "epoch": 0.05006423065943477, + "grad_norm": 2.4090182781219482, + "learning_rate": 4.969149201192488e-05, + "loss": 4.4506, + "step": 8418 + }, + { + "epoch": 0.05007017794271577, + "grad_norm": 2.5310218334198, + "learning_rate": 4.969141885283265e-05, + "loss": 4.5286, + "step": 8419 + }, + { + "epoch": 0.05007612522599676, + "grad_norm": 2.5333101749420166, + "learning_rate": 4.9691345685120905e-05, + "loss": 4.6012, + "step": 8420 + }, + { + "epoch": 0.050082072509277764, + "grad_norm": 2.172724485397339, + "learning_rate": 4.9691272508789665e-05, + "loss": 4.9161, + "step": 8421 + }, + { + "epoch": 0.05008801979255876, + "grad_norm": 2.034684181213379, + "learning_rate": 4.969119932383896e-05, + "loss": 5.3105, + "step": 8422 + }, + { + "epoch": 0.050093967075839754, + "grad_norm": 1.9046155214309692, + "learning_rate": 4.969112613026881e-05, + "loss": 5.4308, + "step": 8423 + }, + { + "epoch": 0.050099914359120756, + "grad_norm": 1.7256773710250854, + "learning_rate": 4.9691052928079226e-05, + "loss": 5.2232, + "step": 8424 + }, + { + "epoch": 0.05010586164240175, + "grad_norm": 2.0075321197509766, + "learning_rate": 4.969097971727027e-05, + "loss": 6.1764, + "step": 8425 + }, + { + "epoch": 0.050111808925682746, + "grad_norm": 2.1523852348327637, + "learning_rate": 4.9690906497841946e-05, + "loss": 5.8419, + "step": 8426 + }, + { + "epoch": 0.05011775620896375, + "grad_norm": 1.9675406217575073, + "learning_rate": 4.969083326979428e-05, + "loss": 5.7919, + "step": 8427 + }, + { + "epoch": 0.050123703492244744, + "grad_norm": 2.0327789783477783, + "learning_rate": 4.9690760033127295e-05, + "loss": 5.0232, + "step": 8428 + }, + { + "epoch": 0.05012965077552574, + "grad_norm": 1.677471399307251, + "learning_rate": 4.969068678784102e-05, + "loss": 5.1106, + "step": 8429 + }, + { + "epoch": 0.050135598058806734, + "grad_norm": 1.727847933769226, + "learning_rate": 4.9690613533935496e-05, + "loss": 5.1589, + "step": 8430 + }, + { + "epoch": 0.050141545342087736, + "grad_norm": 1.8167927265167236, + "learning_rate": 4.9690540271410726e-05, + "loss": 5.1207, + "step": 8431 + }, + { + "epoch": 0.05014749262536873, + "grad_norm": 2.277425527572632, + "learning_rate": 4.969046700026674e-05, + "loss": 5.6614, + "step": 8432 + }, + { + "epoch": 0.050153439908649726, + "grad_norm": 1.6471065282821655, + "learning_rate": 4.969039372050356e-05, + "loss": 5.2065, + "step": 8433 + }, + { + "epoch": 0.05015938719193073, + "grad_norm": 1.9049899578094482, + "learning_rate": 4.9690320432121226e-05, + "loss": 5.7453, + "step": 8434 + }, + { + "epoch": 0.05016533447521172, + "grad_norm": 1.9145495891571045, + "learning_rate": 4.969024713511976e-05, + "loss": 6.2207, + "step": 8435 + }, + { + "epoch": 0.05017128175849272, + "grad_norm": 1.6634061336517334, + "learning_rate": 4.969017382949918e-05, + "loss": 6.1694, + "step": 8436 + }, + { + "epoch": 0.05017722904177372, + "grad_norm": 1.9804925918579102, + "learning_rate": 4.969010051525952e-05, + "loss": 6.2917, + "step": 8437 + }, + { + "epoch": 0.050183176325054715, + "grad_norm": 1.9674698114395142, + "learning_rate": 4.969002719240079e-05, + "loss": 6.3105, + "step": 8438 + }, + { + "epoch": 0.05018912360833571, + "grad_norm": 2.1540520191192627, + "learning_rate": 4.968995386092303e-05, + "loss": 5.964, + "step": 8439 + }, + { + "epoch": 0.05019507089161671, + "grad_norm": 1.8545453548431396, + "learning_rate": 4.9689880520826274e-05, + "loss": 5.8744, + "step": 8440 + }, + { + "epoch": 0.05020101817489771, + "grad_norm": 1.8022514581680298, + "learning_rate": 4.968980717211053e-05, + "loss": 6.1547, + "step": 8441 + }, + { + "epoch": 0.0502069654581787, + "grad_norm": 1.6297475099563599, + "learning_rate": 4.968973381477582e-05, + "loss": 6.1397, + "step": 8442 + }, + { + "epoch": 0.050212912741459705, + "grad_norm": 1.6256400346755981, + "learning_rate": 4.968966044882219e-05, + "loss": 6.0529, + "step": 8443 + }, + { + "epoch": 0.0502188600247407, + "grad_norm": 1.5988365411758423, + "learning_rate": 4.968958707424965e-05, + "loss": 6.0653, + "step": 8444 + }, + { + "epoch": 0.050224807308021695, + "grad_norm": 1.7062568664550781, + "learning_rate": 4.968951369105823e-05, + "loss": 5.6761, + "step": 8445 + }, + { + "epoch": 0.05023075459130269, + "grad_norm": 2.6108970642089844, + "learning_rate": 4.968944029924796e-05, + "loss": 5.7222, + "step": 8446 + }, + { + "epoch": 0.05023670187458369, + "grad_norm": 2.2341887950897217, + "learning_rate": 4.9689366898818854e-05, + "loss": 6.057, + "step": 8447 + }, + { + "epoch": 0.05024264915786469, + "grad_norm": 2.1819159984588623, + "learning_rate": 4.968929348977095e-05, + "loss": 6.0386, + "step": 8448 + }, + { + "epoch": 0.05024859644114568, + "grad_norm": 1.9941349029541016, + "learning_rate": 4.968922007210427e-05, + "loss": 6.132, + "step": 8449 + }, + { + "epoch": 0.050254543724426684, + "grad_norm": 1.7330418825149536, + "learning_rate": 4.968914664581883e-05, + "loss": 6.0834, + "step": 8450 + }, + { + "epoch": 0.05026049100770768, + "grad_norm": 1.8946608304977417, + "learning_rate": 4.968907321091467e-05, + "loss": 5.9147, + "step": 8451 + }, + { + "epoch": 0.050266438290988674, + "grad_norm": 2.314767599105835, + "learning_rate": 4.9688999767391815e-05, + "loss": 5.7087, + "step": 8452 + }, + { + "epoch": 0.050272385574269676, + "grad_norm": 2.604673147201538, + "learning_rate": 4.968892631525028e-05, + "loss": 5.7348, + "step": 8453 + }, + { + "epoch": 0.05027833285755067, + "grad_norm": 2.3386125564575195, + "learning_rate": 4.9688852854490097e-05, + "loss": 5.7509, + "step": 8454 + }, + { + "epoch": 0.050284280140831666, + "grad_norm": 2.3919529914855957, + "learning_rate": 4.968877938511129e-05, + "loss": 5.5851, + "step": 8455 + }, + { + "epoch": 0.05029022742411267, + "grad_norm": 2.0978026390075684, + "learning_rate": 4.9688705907113886e-05, + "loss": 5.3663, + "step": 8456 + }, + { + "epoch": 0.050296174707393664, + "grad_norm": 2.1700327396392822, + "learning_rate": 4.9688632420497904e-05, + "loss": 6.0197, + "step": 8457 + }, + { + "epoch": 0.05030212199067466, + "grad_norm": 2.1657676696777344, + "learning_rate": 4.968855892526338e-05, + "loss": 6.1721, + "step": 8458 + }, + { + "epoch": 0.050308069273955654, + "grad_norm": 2.434732437133789, + "learning_rate": 4.968848542141033e-05, + "loss": 6.0217, + "step": 8459 + }, + { + "epoch": 0.050314016557236656, + "grad_norm": 1.8453216552734375, + "learning_rate": 4.96884119089388e-05, + "loss": 6.4071, + "step": 8460 + }, + { + "epoch": 0.05031996384051765, + "grad_norm": 1.930168628692627, + "learning_rate": 4.9688338387848784e-05, + "loss": 6.5024, + "step": 8461 + }, + { + "epoch": 0.050325911123798646, + "grad_norm": 2.1785950660705566, + "learning_rate": 4.968826485814033e-05, + "loss": 5.803, + "step": 8462 + }, + { + "epoch": 0.05033185840707965, + "grad_norm": 2.003187894821167, + "learning_rate": 4.968819131981346e-05, + "loss": 6.2269, + "step": 8463 + }, + { + "epoch": 0.05033780569036064, + "grad_norm": 2.9522452354431152, + "learning_rate": 4.9688117772868195e-05, + "loss": 5.5603, + "step": 8464 + }, + { + "epoch": 0.05034375297364164, + "grad_norm": 1.9813052415847778, + "learning_rate": 4.968804421730457e-05, + "loss": 6.0101, + "step": 8465 + }, + { + "epoch": 0.05034970025692264, + "grad_norm": 2.370225667953491, + "learning_rate": 4.9687970653122596e-05, + "loss": 6.3236, + "step": 8466 + }, + { + "epoch": 0.050355647540203635, + "grad_norm": 1.9233943223953247, + "learning_rate": 4.968789708032231e-05, + "loss": 6.2962, + "step": 8467 + }, + { + "epoch": 0.05036159482348463, + "grad_norm": 1.8740222454071045, + "learning_rate": 4.968782349890373e-05, + "loss": 5.5454, + "step": 8468 + }, + { + "epoch": 0.05036754210676563, + "grad_norm": 1.8627724647521973, + "learning_rate": 4.968774990886689e-05, + "loss": 5.9242, + "step": 8469 + }, + { + "epoch": 0.05037348939004663, + "grad_norm": 1.7016552686691284, + "learning_rate": 4.968767631021181e-05, + "loss": 6.3302, + "step": 8470 + }, + { + "epoch": 0.05037943667332762, + "grad_norm": 1.8826018571853638, + "learning_rate": 4.9687602702938515e-05, + "loss": 6.3308, + "step": 8471 + }, + { + "epoch": 0.050385383956608625, + "grad_norm": 1.777480959892273, + "learning_rate": 4.9687529087047036e-05, + "loss": 6.3948, + "step": 8472 + }, + { + "epoch": 0.05039133123988962, + "grad_norm": 2.10075306892395, + "learning_rate": 4.9687455462537396e-05, + "loss": 6.1615, + "step": 8473 + }, + { + "epoch": 0.050397278523170615, + "grad_norm": 2.3484537601470947, + "learning_rate": 4.9687381829409616e-05, + "loss": 5.8286, + "step": 8474 + }, + { + "epoch": 0.05040322580645161, + "grad_norm": 1.8243837356567383, + "learning_rate": 4.968730818766373e-05, + "loss": 6.014, + "step": 8475 + }, + { + "epoch": 0.05040917308973261, + "grad_norm": 1.8149470090866089, + "learning_rate": 4.9687234537299765e-05, + "loss": 5.9723, + "step": 8476 + }, + { + "epoch": 0.05041512037301361, + "grad_norm": 2.400754451751709, + "learning_rate": 4.968716087831773e-05, + "loss": 5.237, + "step": 8477 + }, + { + "epoch": 0.0504210676562946, + "grad_norm": 2.4394338130950928, + "learning_rate": 4.968708721071767e-05, + "loss": 5.1106, + "step": 8478 + }, + { + "epoch": 0.050427014939575604, + "grad_norm": 2.210686445236206, + "learning_rate": 4.96870135344996e-05, + "loss": 5.0002, + "step": 8479 + }, + { + "epoch": 0.0504329622228566, + "grad_norm": 2.302997589111328, + "learning_rate": 4.968693984966355e-05, + "loss": 5.689, + "step": 8480 + }, + { + "epoch": 0.050438909506137594, + "grad_norm": 2.0761525630950928, + "learning_rate": 4.9686866156209546e-05, + "loss": 5.4452, + "step": 8481 + }, + { + "epoch": 0.050444856789418596, + "grad_norm": 2.3239383697509766, + "learning_rate": 4.968679245413761e-05, + "loss": 5.4427, + "step": 8482 + }, + { + "epoch": 0.05045080407269959, + "grad_norm": 3.2064802646636963, + "learning_rate": 4.9686718743447766e-05, + "loss": 5.2947, + "step": 8483 + }, + { + "epoch": 0.050456751355980586, + "grad_norm": 2.680786371231079, + "learning_rate": 4.968664502414004e-05, + "loss": 5.4776, + "step": 8484 + }, + { + "epoch": 0.05046269863926159, + "grad_norm": 2.107583522796631, + "learning_rate": 4.9686571296214476e-05, + "loss": 5.5172, + "step": 8485 + }, + { + "epoch": 0.050468645922542583, + "grad_norm": 1.939788579940796, + "learning_rate": 4.9686497559671075e-05, + "loss": 5.6056, + "step": 8486 + }, + { + "epoch": 0.05047459320582358, + "grad_norm": 1.883991003036499, + "learning_rate": 4.968642381450987e-05, + "loss": 5.6511, + "step": 8487 + }, + { + "epoch": 0.050480540489104574, + "grad_norm": 1.8518444299697876, + "learning_rate": 4.96863500607309e-05, + "loss": 5.5897, + "step": 8488 + }, + { + "epoch": 0.050486487772385576, + "grad_norm": 1.6704350709915161, + "learning_rate": 4.968627629833418e-05, + "loss": 5.5002, + "step": 8489 + }, + { + "epoch": 0.05049243505566657, + "grad_norm": 1.755231261253357, + "learning_rate": 4.968620252731972e-05, + "loss": 5.6012, + "step": 8490 + }, + { + "epoch": 0.050498382338947566, + "grad_norm": 1.8532077074050903, + "learning_rate": 4.968612874768758e-05, + "loss": 5.4443, + "step": 8491 + }, + { + "epoch": 0.05050432962222857, + "grad_norm": 1.787781000137329, + "learning_rate": 4.9686054959437756e-05, + "loss": 5.5623, + "step": 8492 + }, + { + "epoch": 0.05051027690550956, + "grad_norm": 1.6963365077972412, + "learning_rate": 4.9685981162570295e-05, + "loss": 5.5349, + "step": 8493 + }, + { + "epoch": 0.05051622418879056, + "grad_norm": 4.328898906707764, + "learning_rate": 4.96859073570852e-05, + "loss": 5.8026, + "step": 8494 + }, + { + "epoch": 0.05052217147207156, + "grad_norm": 1.6906582117080688, + "learning_rate": 4.968583354298252e-05, + "loss": 5.4804, + "step": 8495 + }, + { + "epoch": 0.050528118755352555, + "grad_norm": 1.5316333770751953, + "learning_rate": 4.968575972026227e-05, + "loss": 5.6005, + "step": 8496 + }, + { + "epoch": 0.05053406603863355, + "grad_norm": 1.6029349565505981, + "learning_rate": 4.968568588892447e-05, + "loss": 5.5991, + "step": 8497 + }, + { + "epoch": 0.05054001332191455, + "grad_norm": 2.246537685394287, + "learning_rate": 4.968561204896916e-05, + "loss": 5.8537, + "step": 8498 + }, + { + "epoch": 0.05054596060519555, + "grad_norm": 2.0347564220428467, + "learning_rate": 4.9685538200396355e-05, + "loss": 5.7968, + "step": 8499 + }, + { + "epoch": 0.05055190788847654, + "grad_norm": 1.7635436058044434, + "learning_rate": 4.968546434320608e-05, + "loss": 5.6324, + "step": 8500 + }, + { + "epoch": 0.050557855171757544, + "grad_norm": 2.415397882461548, + "learning_rate": 4.9685390477398363e-05, + "loss": 5.3795, + "step": 8501 + }, + { + "epoch": 0.05056380245503854, + "grad_norm": 2.1499149799346924, + "learning_rate": 4.9685316602973245e-05, + "loss": 5.5638, + "step": 8502 + }, + { + "epoch": 0.050569749738319535, + "grad_norm": 2.0479557514190674, + "learning_rate": 4.9685242719930725e-05, + "loss": 5.3902, + "step": 8503 + }, + { + "epoch": 0.05057569702160053, + "grad_norm": 1.874993085861206, + "learning_rate": 4.9685168828270845e-05, + "loss": 5.4607, + "step": 8504 + }, + { + "epoch": 0.05058164430488153, + "grad_norm": 1.6361217498779297, + "learning_rate": 4.9685094927993623e-05, + "loss": 5.4378, + "step": 8505 + }, + { + "epoch": 0.05058759158816253, + "grad_norm": 1.598026990890503, + "learning_rate": 4.9685021019099096e-05, + "loss": 5.4336, + "step": 8506 + }, + { + "epoch": 0.05059353887144352, + "grad_norm": 1.7636823654174805, + "learning_rate": 4.968494710158728e-05, + "loss": 5.4757, + "step": 8507 + }, + { + "epoch": 0.050599486154724524, + "grad_norm": 1.7823325395584106, + "learning_rate": 4.968487317545821e-05, + "loss": 5.4872, + "step": 8508 + }, + { + "epoch": 0.05060543343800552, + "grad_norm": 2.39149808883667, + "learning_rate": 4.9684799240711896e-05, + "loss": 5.039, + "step": 8509 + }, + { + "epoch": 0.050611380721286514, + "grad_norm": 2.0295841693878174, + "learning_rate": 4.968472529734838e-05, + "loss": 5.1086, + "step": 8510 + }, + { + "epoch": 0.050617328004567516, + "grad_norm": 2.6830973625183105, + "learning_rate": 4.9684651345367684e-05, + "loss": 4.8889, + "step": 8511 + }, + { + "epoch": 0.05062327528784851, + "grad_norm": 2.3600027561187744, + "learning_rate": 4.9684577384769825e-05, + "loss": 5.5305, + "step": 8512 + }, + { + "epoch": 0.050629222571129506, + "grad_norm": 2.1680233478546143, + "learning_rate": 4.968450341555484e-05, + "loss": 5.8196, + "step": 8513 + }, + { + "epoch": 0.05063516985441051, + "grad_norm": 1.800645351409912, + "learning_rate": 4.968442943772275e-05, + "loss": 5.2689, + "step": 8514 + }, + { + "epoch": 0.0506411171376915, + "grad_norm": 1.983245849609375, + "learning_rate": 4.9684355451273566e-05, + "loss": 4.7782, + "step": 8515 + }, + { + "epoch": 0.0506470644209725, + "grad_norm": 2.12082576751709, + "learning_rate": 4.968428145620735e-05, + "loss": 4.7946, + "step": 8516 + }, + { + "epoch": 0.050653011704253494, + "grad_norm": 1.7249135971069336, + "learning_rate": 4.968420745252409e-05, + "loss": 4.7055, + "step": 8517 + }, + { + "epoch": 0.050658958987534496, + "grad_norm": 1.971240758895874, + "learning_rate": 4.968413344022384e-05, + "loss": 4.7343, + "step": 8518 + }, + { + "epoch": 0.05066490627081549, + "grad_norm": 1.780387282371521, + "learning_rate": 4.968405941930661e-05, + "loss": 4.7502, + "step": 8519 + }, + { + "epoch": 0.050670853554096486, + "grad_norm": 1.772007942199707, + "learning_rate": 4.968398538977242e-05, + "loss": 4.7439, + "step": 8520 + }, + { + "epoch": 0.05067680083737749, + "grad_norm": 1.9167592525482178, + "learning_rate": 4.9683911351621324e-05, + "loss": 4.6393, + "step": 8521 + }, + { + "epoch": 0.05068274812065848, + "grad_norm": 2.0527031421661377, + "learning_rate": 4.968383730485331e-05, + "loss": 4.6379, + "step": 8522 + }, + { + "epoch": 0.05068869540393948, + "grad_norm": 2.0608508586883545, + "learning_rate": 4.968376324946844e-05, + "loss": 4.6128, + "step": 8523 + }, + { + "epoch": 0.05069464268722048, + "grad_norm": 1.984731674194336, + "learning_rate": 4.968368918546672e-05, + "loss": 4.5969, + "step": 8524 + }, + { + "epoch": 0.050700589970501475, + "grad_norm": 1.7904438972473145, + "learning_rate": 4.968361511284817e-05, + "loss": 4.6853, + "step": 8525 + }, + { + "epoch": 0.05070653725378247, + "grad_norm": 1.8095389604568481, + "learning_rate": 4.968354103161283e-05, + "loss": 4.5748, + "step": 8526 + }, + { + "epoch": 0.05071248453706347, + "grad_norm": 1.8565012216567993, + "learning_rate": 4.968346694176073e-05, + "loss": 4.5249, + "step": 8527 + }, + { + "epoch": 0.05071843182034447, + "grad_norm": 1.7721836566925049, + "learning_rate": 4.968339284329188e-05, + "loss": 4.6593, + "step": 8528 + }, + { + "epoch": 0.05072437910362546, + "grad_norm": 1.9470161199569702, + "learning_rate": 4.968331873620631e-05, + "loss": 4.5432, + "step": 8529 + }, + { + "epoch": 0.050730326386906464, + "grad_norm": 1.8639118671417236, + "learning_rate": 4.968324462050404e-05, + "loss": 4.4464, + "step": 8530 + }, + { + "epoch": 0.05073627367018746, + "grad_norm": 1.9226467609405518, + "learning_rate": 4.9683170496185114e-05, + "loss": 4.4364, + "step": 8531 + }, + { + "epoch": 0.050742220953468455, + "grad_norm": 1.988198161125183, + "learning_rate": 4.9683096363249545e-05, + "loss": 4.6614, + "step": 8532 + }, + { + "epoch": 0.05074816823674945, + "grad_norm": 1.903645396232605, + "learning_rate": 4.9683022221697374e-05, + "loss": 4.5168, + "step": 8533 + }, + { + "epoch": 0.05075411552003045, + "grad_norm": 1.903448224067688, + "learning_rate": 4.96829480715286e-05, + "loss": 4.5899, + "step": 8534 + }, + { + "epoch": 0.05076006280331145, + "grad_norm": 1.864522099494934, + "learning_rate": 4.9682873912743274e-05, + "loss": 4.5896, + "step": 8535 + }, + { + "epoch": 0.05076601008659244, + "grad_norm": 1.8760302066802979, + "learning_rate": 4.9682799745341406e-05, + "loss": 4.593, + "step": 8536 + }, + { + "epoch": 0.050771957369873444, + "grad_norm": 1.9024009704589844, + "learning_rate": 4.968272556932303e-05, + "loss": 4.9861, + "step": 8537 + }, + { + "epoch": 0.05077790465315444, + "grad_norm": 2.190634250640869, + "learning_rate": 4.9682651384688176e-05, + "loss": 5.6755, + "step": 8538 + }, + { + "epoch": 0.050783851936435434, + "grad_norm": 1.758934736251831, + "learning_rate": 4.9682577191436854e-05, + "loss": 5.4334, + "step": 8539 + }, + { + "epoch": 0.050789799219716436, + "grad_norm": 2.3531200885772705, + "learning_rate": 4.968250298956909e-05, + "loss": 4.9819, + "step": 8540 + }, + { + "epoch": 0.05079574650299743, + "grad_norm": 1.901681661605835, + "learning_rate": 4.968242877908494e-05, + "loss": 5.1642, + "step": 8541 + }, + { + "epoch": 0.050801693786278426, + "grad_norm": 1.7250633239746094, + "learning_rate": 4.96823545599844e-05, + "loss": 5.4847, + "step": 8542 + }, + { + "epoch": 0.05080764106955943, + "grad_norm": 1.7400966882705688, + "learning_rate": 4.968228033226751e-05, + "loss": 5.5902, + "step": 8543 + }, + { + "epoch": 0.05081358835284042, + "grad_norm": 1.5469578504562378, + "learning_rate": 4.968220609593428e-05, + "loss": 5.6432, + "step": 8544 + }, + { + "epoch": 0.05081953563612142, + "grad_norm": 1.8277182579040527, + "learning_rate": 4.968213185098475e-05, + "loss": 5.3296, + "step": 8545 + }, + { + "epoch": 0.050825482919402414, + "grad_norm": 2.0535261631011963, + "learning_rate": 4.9682057597418943e-05, + "loss": 5.5278, + "step": 8546 + }, + { + "epoch": 0.050831430202683416, + "grad_norm": 1.8631746768951416, + "learning_rate": 4.9681983335236894e-05, + "loss": 5.556, + "step": 8547 + }, + { + "epoch": 0.05083737748596441, + "grad_norm": 1.6663711071014404, + "learning_rate": 4.968190906443861e-05, + "loss": 5.4321, + "step": 8548 + }, + { + "epoch": 0.050843324769245406, + "grad_norm": 1.8302260637283325, + "learning_rate": 4.968183478502413e-05, + "loss": 5.4746, + "step": 8549 + }, + { + "epoch": 0.05084927205252641, + "grad_norm": 1.9203182458877563, + "learning_rate": 4.968176049699347e-05, + "loss": 5.4334, + "step": 8550 + }, + { + "epoch": 0.0508552193358074, + "grad_norm": 2.0406670570373535, + "learning_rate": 4.9681686200346674e-05, + "loss": 5.6509, + "step": 8551 + }, + { + "epoch": 0.0508611666190884, + "grad_norm": 2.3438572883605957, + "learning_rate": 4.968161189508374e-05, + "loss": 5.8662, + "step": 8552 + }, + { + "epoch": 0.0508671139023694, + "grad_norm": 1.9612985849380493, + "learning_rate": 4.968153758120473e-05, + "loss": 5.6813, + "step": 8553 + }, + { + "epoch": 0.050873061185650395, + "grad_norm": 1.4175993204116821, + "learning_rate": 4.968146325870964e-05, + "loss": 5.4593, + "step": 8554 + }, + { + "epoch": 0.05087900846893139, + "grad_norm": 1.3445212841033936, + "learning_rate": 4.96813889275985e-05, + "loss": 5.4195, + "step": 8555 + }, + { + "epoch": 0.05088495575221239, + "grad_norm": 1.9938427209854126, + "learning_rate": 4.968131458787135e-05, + "loss": 5.8791, + "step": 8556 + }, + { + "epoch": 0.05089090303549339, + "grad_norm": 1.7449276447296143, + "learning_rate": 4.9681240239528216e-05, + "loss": 5.3574, + "step": 8557 + }, + { + "epoch": 0.05089685031877438, + "grad_norm": 2.0117087364196777, + "learning_rate": 4.96811658825691e-05, + "loss": 5.3548, + "step": 8558 + }, + { + "epoch": 0.050902797602055384, + "grad_norm": 1.97372567653656, + "learning_rate": 4.968109151699406e-05, + "loss": 5.5281, + "step": 8559 + }, + { + "epoch": 0.05090874488533638, + "grad_norm": 1.8815237283706665, + "learning_rate": 4.9681017142803095e-05, + "loss": 5.4849, + "step": 8560 + }, + { + "epoch": 0.050914692168617375, + "grad_norm": 1.627252221107483, + "learning_rate": 4.968094275999624e-05, + "loss": 5.2125, + "step": 8561 + }, + { + "epoch": 0.05092063945189837, + "grad_norm": 1.4768601655960083, + "learning_rate": 4.968086836857353e-05, + "loss": 5.0817, + "step": 8562 + }, + { + "epoch": 0.05092658673517937, + "grad_norm": 2.0249485969543457, + "learning_rate": 4.968079396853498e-05, + "loss": 5.4025, + "step": 8563 + }, + { + "epoch": 0.05093253401846037, + "grad_norm": 2.0904550552368164, + "learning_rate": 4.968071955988062e-05, + "loss": 5.4404, + "step": 8564 + }, + { + "epoch": 0.05093848130174136, + "grad_norm": 1.935063123703003, + "learning_rate": 4.9680645142610475e-05, + "loss": 5.4961, + "step": 8565 + }, + { + "epoch": 0.050944428585022364, + "grad_norm": 1.9836292266845703, + "learning_rate": 4.968057071672457e-05, + "loss": 5.2469, + "step": 8566 + }, + { + "epoch": 0.05095037586830336, + "grad_norm": 1.8337205648422241, + "learning_rate": 4.9680496282222944e-05, + "loss": 5.4432, + "step": 8567 + }, + { + "epoch": 0.050956323151584354, + "grad_norm": 1.9169154167175293, + "learning_rate": 4.9680421839105604e-05, + "loss": 5.2606, + "step": 8568 + }, + { + "epoch": 0.050962270434865356, + "grad_norm": 1.5869332551956177, + "learning_rate": 4.968034738737258e-05, + "loss": 5.006, + "step": 8569 + }, + { + "epoch": 0.05096821771814635, + "grad_norm": 1.5824979543685913, + "learning_rate": 4.968027292702391e-05, + "loss": 5.2078, + "step": 8570 + }, + { + "epoch": 0.050974165001427346, + "grad_norm": 1.7121458053588867, + "learning_rate": 4.96801984580596e-05, + "loss": 5.3913, + "step": 8571 + }, + { + "epoch": 0.05098011228470835, + "grad_norm": 1.7111082077026367, + "learning_rate": 4.96801239804797e-05, + "loss": 5.3957, + "step": 8572 + }, + { + "epoch": 0.05098605956798934, + "grad_norm": 1.834083080291748, + "learning_rate": 4.968004949428421e-05, + "loss": 5.501, + "step": 8573 + }, + { + "epoch": 0.05099200685127034, + "grad_norm": 1.773421287536621, + "learning_rate": 4.967997499947318e-05, + "loss": 5.429, + "step": 8574 + }, + { + "epoch": 0.05099795413455134, + "grad_norm": 1.7471132278442383, + "learning_rate": 4.967990049604663e-05, + "loss": 5.4853, + "step": 8575 + }, + { + "epoch": 0.051003901417832335, + "grad_norm": 1.7264289855957031, + "learning_rate": 4.967982598400457e-05, + "loss": 5.4415, + "step": 8576 + }, + { + "epoch": 0.05100984870111333, + "grad_norm": 1.750982403755188, + "learning_rate": 4.9679751463347044e-05, + "loss": 5.1731, + "step": 8577 + }, + { + "epoch": 0.051015795984394326, + "grad_norm": 1.6106518507003784, + "learning_rate": 4.967967693407407e-05, + "loss": 5.2692, + "step": 8578 + }, + { + "epoch": 0.05102174326767533, + "grad_norm": 1.8728212118148804, + "learning_rate": 4.967960239618568e-05, + "loss": 5.2416, + "step": 8579 + }, + { + "epoch": 0.05102769055095632, + "grad_norm": 1.6410562992095947, + "learning_rate": 4.967952784968189e-05, + "loss": 5.1824, + "step": 8580 + }, + { + "epoch": 0.05103363783423732, + "grad_norm": 1.7119427919387817, + "learning_rate": 4.967945329456274e-05, + "loss": 5.2316, + "step": 8581 + }, + { + "epoch": 0.05103958511751832, + "grad_norm": 1.667602300643921, + "learning_rate": 4.967937873082824e-05, + "loss": 4.9599, + "step": 8582 + }, + { + "epoch": 0.051045532400799315, + "grad_norm": 1.9595974683761597, + "learning_rate": 4.967930415847842e-05, + "loss": 4.9613, + "step": 8583 + }, + { + "epoch": 0.05105147968408031, + "grad_norm": 1.70210862159729, + "learning_rate": 4.967922957751332e-05, + "loss": 5.3587, + "step": 8584 + }, + { + "epoch": 0.05105742696736131, + "grad_norm": 2.101145029067993, + "learning_rate": 4.967915498793295e-05, + "loss": 5.2782, + "step": 8585 + }, + { + "epoch": 0.05106337425064231, + "grad_norm": 1.8836926221847534, + "learning_rate": 4.9679080389737344e-05, + "loss": 5.3128, + "step": 8586 + }, + { + "epoch": 0.0510693215339233, + "grad_norm": 1.7542184591293335, + "learning_rate": 4.967900578292652e-05, + "loss": 5.2236, + "step": 8587 + }, + { + "epoch": 0.051075268817204304, + "grad_norm": 1.8415964841842651, + "learning_rate": 4.967893116750052e-05, + "loss": 5.1267, + "step": 8588 + }, + { + "epoch": 0.0510812161004853, + "grad_norm": 1.7702316045761108, + "learning_rate": 4.967885654345936e-05, + "loss": 5.6495, + "step": 8589 + }, + { + "epoch": 0.051087163383766294, + "grad_norm": 1.7790406942367554, + "learning_rate": 4.967878191080306e-05, + "loss": 5.2561, + "step": 8590 + }, + { + "epoch": 0.05109311066704729, + "grad_norm": 1.7282217741012573, + "learning_rate": 4.967870726953165e-05, + "loss": 5.2589, + "step": 8591 + }, + { + "epoch": 0.05109905795032829, + "grad_norm": 1.6590560674667358, + "learning_rate": 4.967863261964517e-05, + "loss": 5.1952, + "step": 8592 + }, + { + "epoch": 0.05110500523360929, + "grad_norm": 1.5948386192321777, + "learning_rate": 4.9678557961143625e-05, + "loss": 5.297, + "step": 8593 + }, + { + "epoch": 0.05111095251689028, + "grad_norm": 1.8219022750854492, + "learning_rate": 4.9678483294027046e-05, + "loss": 5.3391, + "step": 8594 + }, + { + "epoch": 0.051116899800171284, + "grad_norm": 1.547616720199585, + "learning_rate": 4.967840861829547e-05, + "loss": 5.4224, + "step": 8595 + }, + { + "epoch": 0.05112284708345228, + "grad_norm": 1.7924590110778809, + "learning_rate": 4.9678333933948914e-05, + "loss": 5.2371, + "step": 8596 + }, + { + "epoch": 0.051128794366733274, + "grad_norm": 1.7630747556686401, + "learning_rate": 4.9678259240987416e-05, + "loss": 5.4849, + "step": 8597 + }, + { + "epoch": 0.051134741650014276, + "grad_norm": 1.7853891849517822, + "learning_rate": 4.967818453941098e-05, + "loss": 5.1753, + "step": 8598 + }, + { + "epoch": 0.05114068893329527, + "grad_norm": 1.6572301387786865, + "learning_rate": 4.9678109829219654e-05, + "loss": 5.3747, + "step": 8599 + }, + { + "epoch": 0.051146636216576266, + "grad_norm": 1.6574329137802124, + "learning_rate": 4.9678035110413445e-05, + "loss": 5.417, + "step": 8600 + }, + { + "epoch": 0.05115258349985727, + "grad_norm": 1.7093894481658936, + "learning_rate": 4.9677960382992396e-05, + "loss": 5.4605, + "step": 8601 + }, + { + "epoch": 0.05115853078313826, + "grad_norm": 1.6304559707641602, + "learning_rate": 4.967788564695652e-05, + "loss": 5.6186, + "step": 8602 + }, + { + "epoch": 0.05116447806641926, + "grad_norm": 1.6134929656982422, + "learning_rate": 4.967781090230586e-05, + "loss": 5.5084, + "step": 8603 + }, + { + "epoch": 0.05117042534970026, + "grad_norm": 1.7007251977920532, + "learning_rate": 4.9677736149040426e-05, + "loss": 5.2542, + "step": 8604 + }, + { + "epoch": 0.051176372632981255, + "grad_norm": 1.6648818254470825, + "learning_rate": 4.967766138716025e-05, + "loss": 5.4136, + "step": 8605 + }, + { + "epoch": 0.05118231991626225, + "grad_norm": 1.5595816373825073, + "learning_rate": 4.967758661666535e-05, + "loss": 5.181, + "step": 8606 + }, + { + "epoch": 0.051188267199543246, + "grad_norm": 1.7358763217926025, + "learning_rate": 4.967751183755577e-05, + "loss": 5.3509, + "step": 8607 + }, + { + "epoch": 0.05119421448282425, + "grad_norm": 1.6836191415786743, + "learning_rate": 4.967743704983152e-05, + "loss": 5.4656, + "step": 8608 + }, + { + "epoch": 0.05120016176610524, + "grad_norm": 1.4641087055206299, + "learning_rate": 4.967736225349263e-05, + "loss": 5.5304, + "step": 8609 + }, + { + "epoch": 0.05120610904938624, + "grad_norm": 1.6273541450500488, + "learning_rate": 4.967728744853913e-05, + "loss": 5.4029, + "step": 8610 + }, + { + "epoch": 0.05121205633266724, + "grad_norm": 1.6471314430236816, + "learning_rate": 4.967721263497105e-05, + "loss": 5.4333, + "step": 8611 + }, + { + "epoch": 0.051218003615948235, + "grad_norm": 1.798155665397644, + "learning_rate": 4.96771378127884e-05, + "loss": 5.5214, + "step": 8612 + }, + { + "epoch": 0.05122395089922923, + "grad_norm": 1.8606700897216797, + "learning_rate": 4.967706298199122e-05, + "loss": 4.8808, + "step": 8613 + }, + { + "epoch": 0.05122989818251023, + "grad_norm": 1.7144849300384521, + "learning_rate": 4.967698814257953e-05, + "loss": 4.9451, + "step": 8614 + }, + { + "epoch": 0.05123584546579123, + "grad_norm": 1.7411640882492065, + "learning_rate": 4.9676913294553364e-05, + "loss": 4.9771, + "step": 8615 + }, + { + "epoch": 0.05124179274907222, + "grad_norm": 1.7012072801589966, + "learning_rate": 4.9676838437912736e-05, + "loss": 4.9028, + "step": 8616 + }, + { + "epoch": 0.051247740032353224, + "grad_norm": 1.8154243230819702, + "learning_rate": 4.967676357265768e-05, + "loss": 5.4115, + "step": 8617 + }, + { + "epoch": 0.05125368731563422, + "grad_norm": 2.7746822834014893, + "learning_rate": 4.967668869878823e-05, + "loss": 5.5487, + "step": 8618 + }, + { + "epoch": 0.051259634598915214, + "grad_norm": 1.8362152576446533, + "learning_rate": 4.9676613816304395e-05, + "loss": 5.486, + "step": 8619 + }, + { + "epoch": 0.05126558188219621, + "grad_norm": 1.975853681564331, + "learning_rate": 4.967653892520621e-05, + "loss": 5.4348, + "step": 8620 + }, + { + "epoch": 0.05127152916547721, + "grad_norm": 1.8126581907272339, + "learning_rate": 4.96764640254937e-05, + "loss": 5.4558, + "step": 8621 + }, + { + "epoch": 0.05127747644875821, + "grad_norm": 1.6068531274795532, + "learning_rate": 4.967638911716689e-05, + "loss": 5.4672, + "step": 8622 + }, + { + "epoch": 0.0512834237320392, + "grad_norm": 1.6384878158569336, + "learning_rate": 4.9676314200225804e-05, + "loss": 5.1591, + "step": 8623 + }, + { + "epoch": 0.051289371015320204, + "grad_norm": 2.0413742065429688, + "learning_rate": 4.9676239274670474e-05, + "loss": 4.8992, + "step": 8624 + }, + { + "epoch": 0.0512953182986012, + "grad_norm": 1.7591389417648315, + "learning_rate": 4.967616434050093e-05, + "loss": 5.3629, + "step": 8625 + }, + { + "epoch": 0.051301265581882194, + "grad_norm": 1.9222301244735718, + "learning_rate": 4.967608939771719e-05, + "loss": 5.5082, + "step": 8626 + }, + { + "epoch": 0.051307212865163196, + "grad_norm": 1.8040579557418823, + "learning_rate": 4.967601444631928e-05, + "loss": 5.4019, + "step": 8627 + }, + { + "epoch": 0.05131316014844419, + "grad_norm": 2.0685603618621826, + "learning_rate": 4.967593948630723e-05, + "loss": 5.1959, + "step": 8628 + }, + { + "epoch": 0.051319107431725186, + "grad_norm": 1.446341872215271, + "learning_rate": 4.967586451768106e-05, + "loss": 5.4233, + "step": 8629 + }, + { + "epoch": 0.05132505471500619, + "grad_norm": 1.4487289190292358, + "learning_rate": 4.9675789540440806e-05, + "loss": 5.4065, + "step": 8630 + }, + { + "epoch": 0.05133100199828718, + "grad_norm": 2.367469310760498, + "learning_rate": 4.967571455458648e-05, + "loss": 5.3512, + "step": 8631 + }, + { + "epoch": 0.05133694928156818, + "grad_norm": 2.7115249633789062, + "learning_rate": 4.967563956011812e-05, + "loss": 5.4494, + "step": 8632 + }, + { + "epoch": 0.05134289656484918, + "grad_norm": 2.6692097187042236, + "learning_rate": 4.967556455703576e-05, + "loss": 5.2747, + "step": 8633 + }, + { + "epoch": 0.051348843848130175, + "grad_norm": 2.516005754470825, + "learning_rate": 4.967548954533941e-05, + "loss": 5.2305, + "step": 8634 + }, + { + "epoch": 0.05135479113141117, + "grad_norm": 1.6234782934188843, + "learning_rate": 4.96754145250291e-05, + "loss": 5.5192, + "step": 8635 + }, + { + "epoch": 0.051360738414692166, + "grad_norm": 1.9273806810379028, + "learning_rate": 4.9675339496104855e-05, + "loss": 5.4479, + "step": 8636 + }, + { + "epoch": 0.05136668569797317, + "grad_norm": 2.510847568511963, + "learning_rate": 4.967526445856671e-05, + "loss": 4.9858, + "step": 8637 + }, + { + "epoch": 0.05137263298125416, + "grad_norm": 2.3722991943359375, + "learning_rate": 4.967518941241468e-05, + "loss": 5.2287, + "step": 8638 + }, + { + "epoch": 0.05137858026453516, + "grad_norm": 2.286569118499756, + "learning_rate": 4.96751143576488e-05, + "loss": 5.2643, + "step": 8639 + }, + { + "epoch": 0.05138452754781616, + "grad_norm": 2.493534803390503, + "learning_rate": 4.9675039294269086e-05, + "loss": 5.1207, + "step": 8640 + }, + { + "epoch": 0.051390474831097155, + "grad_norm": 2.622694969177246, + "learning_rate": 4.967496422227558e-05, + "loss": 4.9735, + "step": 8641 + }, + { + "epoch": 0.05139642211437815, + "grad_norm": 1.7518365383148193, + "learning_rate": 4.967488914166829e-05, + "loss": 5.8818, + "step": 8642 + }, + { + "epoch": 0.05140236939765915, + "grad_norm": 2.0281870365142822, + "learning_rate": 4.9674814052447256e-05, + "loss": 6.3773, + "step": 8643 + }, + { + "epoch": 0.05140831668094015, + "grad_norm": 1.880083441734314, + "learning_rate": 4.96747389546125e-05, + "loss": 5.831, + "step": 8644 + }, + { + "epoch": 0.05141426396422114, + "grad_norm": 2.0792593955993652, + "learning_rate": 4.967466384816404e-05, + "loss": 5.8799, + "step": 8645 + }, + { + "epoch": 0.051420211247502144, + "grad_norm": 2.4550280570983887, + "learning_rate": 4.967458873310192e-05, + "loss": 5.2983, + "step": 8646 + }, + { + "epoch": 0.05142615853078314, + "grad_norm": 2.5590765476226807, + "learning_rate": 4.967451360942615e-05, + "loss": 5.1157, + "step": 8647 + }, + { + "epoch": 0.051432105814064134, + "grad_norm": 2.2328450679779053, + "learning_rate": 4.967443847713677e-05, + "loss": 5.047, + "step": 8648 + }, + { + "epoch": 0.05143805309734513, + "grad_norm": 2.0624022483825684, + "learning_rate": 4.9674363336233786e-05, + "loss": 5.6819, + "step": 8649 + }, + { + "epoch": 0.05144400038062613, + "grad_norm": 2.075239658355713, + "learning_rate": 4.9674288186717246e-05, + "loss": 5.895, + "step": 8650 + }, + { + "epoch": 0.05144994766390713, + "grad_norm": 1.7228562831878662, + "learning_rate": 4.967421302858716e-05, + "loss": 5.9199, + "step": 8651 + }, + { + "epoch": 0.05145589494718812, + "grad_norm": 2.235020637512207, + "learning_rate": 4.967413786184356e-05, + "loss": 5.0644, + "step": 8652 + }, + { + "epoch": 0.051461842230469124, + "grad_norm": 1.8620972633361816, + "learning_rate": 4.967406268648648e-05, + "loss": 5.7956, + "step": 8653 + }, + { + "epoch": 0.05146778951375012, + "grad_norm": 1.7914378643035889, + "learning_rate": 4.967398750251594e-05, + "loss": 5.742, + "step": 8654 + }, + { + "epoch": 0.051473736797031114, + "grad_norm": 2.0010504722595215, + "learning_rate": 4.967391230993196e-05, + "loss": 5.7808, + "step": 8655 + }, + { + "epoch": 0.051479684080312116, + "grad_norm": 2.1851212978363037, + "learning_rate": 4.9673837108734575e-05, + "loss": 5.4217, + "step": 8656 + }, + { + "epoch": 0.05148563136359311, + "grad_norm": 1.6896641254425049, + "learning_rate": 4.967376189892382e-05, + "loss": 6.321, + "step": 8657 + }, + { + "epoch": 0.051491578646874106, + "grad_norm": 1.7083675861358643, + "learning_rate": 4.967368668049969e-05, + "loss": 5.495, + "step": 8658 + }, + { + "epoch": 0.05149752593015511, + "grad_norm": 2.537256956100464, + "learning_rate": 4.967361145346224e-05, + "loss": 5.4096, + "step": 8659 + }, + { + "epoch": 0.0515034732134361, + "grad_norm": 2.3463892936706543, + "learning_rate": 4.967353621781149e-05, + "loss": 6.2461, + "step": 8660 + }, + { + "epoch": 0.0515094204967171, + "grad_norm": 1.6834701299667358, + "learning_rate": 4.967346097354746e-05, + "loss": 6.1007, + "step": 8661 + }, + { + "epoch": 0.0515153677799981, + "grad_norm": 2.140557289123535, + "learning_rate": 4.9673385720670184e-05, + "loss": 5.9908, + "step": 8662 + }, + { + "epoch": 0.051521315063279095, + "grad_norm": 2.211639165878296, + "learning_rate": 4.9673310459179676e-05, + "loss": 6.4192, + "step": 8663 + }, + { + "epoch": 0.05152726234656009, + "grad_norm": 1.8421399593353271, + "learning_rate": 4.9673235189075975e-05, + "loss": 6.099, + "step": 8664 + }, + { + "epoch": 0.051533209629841085, + "grad_norm": 1.7775965929031372, + "learning_rate": 4.96731599103591e-05, + "loss": 5.9572, + "step": 8665 + }, + { + "epoch": 0.05153915691312209, + "grad_norm": 1.7500132322311401, + "learning_rate": 4.967308462302909e-05, + "loss": 6.0987, + "step": 8666 + }, + { + "epoch": 0.05154510419640308, + "grad_norm": 1.7952892780303955, + "learning_rate": 4.967300932708595e-05, + "loss": 6.0235, + "step": 8667 + }, + { + "epoch": 0.05155105147968408, + "grad_norm": 1.7696008682250977, + "learning_rate": 4.967293402252972e-05, + "loss": 5.8253, + "step": 8668 + }, + { + "epoch": 0.05155699876296508, + "grad_norm": 1.848975419998169, + "learning_rate": 4.967285870936042e-05, + "loss": 6.0942, + "step": 8669 + }, + { + "epoch": 0.051562946046246075, + "grad_norm": 2.412909507751465, + "learning_rate": 4.967278338757808e-05, + "loss": 5.5752, + "step": 8670 + }, + { + "epoch": 0.05156889332952707, + "grad_norm": 2.0214738845825195, + "learning_rate": 4.967270805718273e-05, + "loss": 5.5721, + "step": 8671 + }, + { + "epoch": 0.05157484061280807, + "grad_norm": 2.3830201625823975, + "learning_rate": 4.967263271817439e-05, + "loss": 6.034, + "step": 8672 + }, + { + "epoch": 0.05158078789608907, + "grad_norm": 2.213979959487915, + "learning_rate": 4.9672557370553094e-05, + "loss": 6.0169, + "step": 8673 + }, + { + "epoch": 0.05158673517937006, + "grad_norm": 1.9657354354858398, + "learning_rate": 4.967248201431887e-05, + "loss": 6.0159, + "step": 8674 + }, + { + "epoch": 0.051592682462651064, + "grad_norm": 2.0882673263549805, + "learning_rate": 4.967240664947172e-05, + "loss": 6.1088, + "step": 8675 + }, + { + "epoch": 0.05159862974593206, + "grad_norm": 2.291152000427246, + "learning_rate": 4.96723312760117e-05, + "loss": 5.4534, + "step": 8676 + }, + { + "epoch": 0.051604577029213054, + "grad_norm": 2.3495421409606934, + "learning_rate": 4.967225589393881e-05, + "loss": 5.5524, + "step": 8677 + }, + { + "epoch": 0.05161052431249405, + "grad_norm": 2.2665255069732666, + "learning_rate": 4.9672180503253106e-05, + "loss": 5.5208, + "step": 8678 + }, + { + "epoch": 0.05161647159577505, + "grad_norm": 2.1587207317352295, + "learning_rate": 4.9672105103954594e-05, + "loss": 5.7016, + "step": 8679 + }, + { + "epoch": 0.051622418879056046, + "grad_norm": 2.2260420322418213, + "learning_rate": 4.96720296960433e-05, + "loss": 5.6179, + "step": 8680 + }, + { + "epoch": 0.05162836616233704, + "grad_norm": 3.1678147315979004, + "learning_rate": 4.967195427951926e-05, + "loss": 5.4655, + "step": 8681 + }, + { + "epoch": 0.051634313445618044, + "grad_norm": 3.0126166343688965, + "learning_rate": 4.967187885438249e-05, + "loss": 5.5663, + "step": 8682 + }, + { + "epoch": 0.05164026072889904, + "grad_norm": 2.290069341659546, + "learning_rate": 4.9671803420633034e-05, + "loss": 5.7462, + "step": 8683 + }, + { + "epoch": 0.051646208012180034, + "grad_norm": 2.1958532333374023, + "learning_rate": 4.96717279782709e-05, + "loss": 5.8359, + "step": 8684 + }, + { + "epoch": 0.051652155295461036, + "grad_norm": 2.063312530517578, + "learning_rate": 4.967165252729611e-05, + "loss": 5.847, + "step": 8685 + }, + { + "epoch": 0.05165810257874203, + "grad_norm": 1.8041539192199707, + "learning_rate": 4.967157706770872e-05, + "loss": 5.9408, + "step": 8686 + }, + { + "epoch": 0.051664049862023026, + "grad_norm": 1.684831976890564, + "learning_rate": 4.967150159950873e-05, + "loss": 6.019, + "step": 8687 + }, + { + "epoch": 0.05166999714530403, + "grad_norm": 2.4915740489959717, + "learning_rate": 4.967142612269616e-05, + "loss": 5.357, + "step": 8688 + }, + { + "epoch": 0.05167594442858502, + "grad_norm": 2.2621138095855713, + "learning_rate": 4.967135063727106e-05, + "loss": 5.7726, + "step": 8689 + }, + { + "epoch": 0.05168189171186602, + "grad_norm": 1.9304747581481934, + "learning_rate": 4.967127514323345e-05, + "loss": 6.0958, + "step": 8690 + }, + { + "epoch": 0.05168783899514702, + "grad_norm": 1.7657890319824219, + "learning_rate": 4.9671199640583354e-05, + "loss": 6.1036, + "step": 8691 + }, + { + "epoch": 0.051693786278428015, + "grad_norm": 1.7449486255645752, + "learning_rate": 4.9671124129320794e-05, + "loss": 6.0843, + "step": 8692 + }, + { + "epoch": 0.05169973356170901, + "grad_norm": 2.0155117511749268, + "learning_rate": 4.96710486094458e-05, + "loss": 5.9626, + "step": 8693 + }, + { + "epoch": 0.051705680844990005, + "grad_norm": 2.1015188694000244, + "learning_rate": 4.967097308095839e-05, + "loss": 5.6053, + "step": 8694 + }, + { + "epoch": 0.05171162812827101, + "grad_norm": 1.9602909088134766, + "learning_rate": 4.967089754385861e-05, + "loss": 5.1988, + "step": 8695 + }, + { + "epoch": 0.051717575411552, + "grad_norm": 2.141657590866089, + "learning_rate": 4.9670821998146474e-05, + "loss": 5.2994, + "step": 8696 + }, + { + "epoch": 0.051723522694833, + "grad_norm": 2.1301774978637695, + "learning_rate": 4.9670746443822006e-05, + "loss": 5.7935, + "step": 8697 + }, + { + "epoch": 0.051729469978114, + "grad_norm": 1.9465678930282593, + "learning_rate": 4.9670670880885225e-05, + "loss": 5.1861, + "step": 8698 + }, + { + "epoch": 0.051735417261394995, + "grad_norm": 2.177234411239624, + "learning_rate": 4.967059530933618e-05, + "loss": 5.1114, + "step": 8699 + }, + { + "epoch": 0.05174136454467599, + "grad_norm": 2.0886077880859375, + "learning_rate": 4.967051972917488e-05, + "loss": 5.2905, + "step": 8700 + }, + { + "epoch": 0.05174731182795699, + "grad_norm": 1.8517125844955444, + "learning_rate": 4.967044414040136e-05, + "loss": 5.1672, + "step": 8701 + }, + { + "epoch": 0.05175325911123799, + "grad_norm": 1.7342808246612549, + "learning_rate": 4.967036854301564e-05, + "loss": 5.2767, + "step": 8702 + }, + { + "epoch": 0.05175920639451898, + "grad_norm": 1.7315362691879272, + "learning_rate": 4.9670292937017746e-05, + "loss": 5.2897, + "step": 8703 + }, + { + "epoch": 0.051765153677799984, + "grad_norm": 1.8794540166854858, + "learning_rate": 4.967021732240772e-05, + "loss": 5.3808, + "step": 8704 + }, + { + "epoch": 0.05177110096108098, + "grad_norm": 1.8047478199005127, + "learning_rate": 4.9670141699185565e-05, + "loss": 5.1074, + "step": 8705 + }, + { + "epoch": 0.051777048244361974, + "grad_norm": 1.699475884437561, + "learning_rate": 4.967006606735132e-05, + "loss": 5.8162, + "step": 8706 + }, + { + "epoch": 0.05178299552764297, + "grad_norm": 2.008352518081665, + "learning_rate": 4.966999042690501e-05, + "loss": 6.3593, + "step": 8707 + }, + { + "epoch": 0.05178894281092397, + "grad_norm": 1.8776370286941528, + "learning_rate": 4.966991477784667e-05, + "loss": 6.3419, + "step": 8708 + }, + { + "epoch": 0.051794890094204966, + "grad_norm": 2.018157720565796, + "learning_rate": 4.9669839120176306e-05, + "loss": 6.1927, + "step": 8709 + }, + { + "epoch": 0.05180083737748596, + "grad_norm": 1.833764910697937, + "learning_rate": 4.966976345389396e-05, + "loss": 5.0803, + "step": 8710 + }, + { + "epoch": 0.051806784660766964, + "grad_norm": 1.7809339761734009, + "learning_rate": 4.9669687778999655e-05, + "loss": 5.3891, + "step": 8711 + }, + { + "epoch": 0.05181273194404796, + "grad_norm": 1.9905017614364624, + "learning_rate": 4.966961209549341e-05, + "loss": 6.247, + "step": 8712 + }, + { + "epoch": 0.051818679227328954, + "grad_norm": 2.1396658420562744, + "learning_rate": 4.966953640337527e-05, + "loss": 6.2506, + "step": 8713 + }, + { + "epoch": 0.051824626510609956, + "grad_norm": 1.778996467590332, + "learning_rate": 4.9669460702645244e-05, + "loss": 6.1333, + "step": 8714 + }, + { + "epoch": 0.05183057379389095, + "grad_norm": 1.9936842918395996, + "learning_rate": 4.9669384993303366e-05, + "loss": 5.6486, + "step": 8715 + }, + { + "epoch": 0.051836521077171946, + "grad_norm": 1.8064475059509277, + "learning_rate": 4.9669309275349656e-05, + "loss": 6.1217, + "step": 8716 + }, + { + "epoch": 0.05184246836045295, + "grad_norm": 1.9532819986343384, + "learning_rate": 4.966923354878414e-05, + "loss": 5.5402, + "step": 8717 + }, + { + "epoch": 0.05184841564373394, + "grad_norm": 2.4843015670776367, + "learning_rate": 4.966915781360686e-05, + "loss": 4.7674, + "step": 8718 + }, + { + "epoch": 0.05185436292701494, + "grad_norm": 2.7453129291534424, + "learning_rate": 4.9669082069817835e-05, + "loss": 4.4489, + "step": 8719 + }, + { + "epoch": 0.05186031021029594, + "grad_norm": 3.0180628299713135, + "learning_rate": 4.9669006317417084e-05, + "loss": 4.1401, + "step": 8720 + }, + { + "epoch": 0.051866257493576935, + "grad_norm": 2.44638991355896, + "learning_rate": 4.966893055640464e-05, + "loss": 4.7241, + "step": 8721 + }, + { + "epoch": 0.05187220477685793, + "grad_norm": 2.0131804943084717, + "learning_rate": 4.9668854786780514e-05, + "loss": 5.6495, + "step": 8722 + }, + { + "epoch": 0.051878152060138925, + "grad_norm": 2.0331337451934814, + "learning_rate": 4.966877900854476e-05, + "loss": 5.6812, + "step": 8723 + }, + { + "epoch": 0.05188409934341993, + "grad_norm": 2.5784926414489746, + "learning_rate": 4.9668703221697385e-05, + "loss": 5.3617, + "step": 8724 + }, + { + "epoch": 0.05189004662670092, + "grad_norm": 2.599321126937866, + "learning_rate": 4.9668627426238425e-05, + "loss": 5.6273, + "step": 8725 + }, + { + "epoch": 0.05189599390998192, + "grad_norm": 2.53541898727417, + "learning_rate": 4.966855162216789e-05, + "loss": 5.2916, + "step": 8726 + }, + { + "epoch": 0.05190194119326292, + "grad_norm": 2.165160655975342, + "learning_rate": 4.9668475809485825e-05, + "loss": 5.6152, + "step": 8727 + }, + { + "epoch": 0.051907888476543915, + "grad_norm": 2.4488654136657715, + "learning_rate": 4.966839998819225e-05, + "loss": 5.4163, + "step": 8728 + }, + { + "epoch": 0.05191383575982491, + "grad_norm": 2.2756056785583496, + "learning_rate": 4.96683241582872e-05, + "loss": 5.9449, + "step": 8729 + }, + { + "epoch": 0.05191978304310591, + "grad_norm": 2.7889063358306885, + "learning_rate": 4.9668248319770683e-05, + "loss": 5.9502, + "step": 8730 + }, + { + "epoch": 0.05192573032638691, + "grad_norm": 2.620378255844116, + "learning_rate": 4.9668172472642735e-05, + "loss": 4.8344, + "step": 8731 + }, + { + "epoch": 0.0519316776096679, + "grad_norm": 2.2405688762664795, + "learning_rate": 4.9668096616903395e-05, + "loss": 5.598, + "step": 8732 + }, + { + "epoch": 0.051937624892948904, + "grad_norm": 2.3559701442718506, + "learning_rate": 4.9668020752552664e-05, + "loss": 5.7951, + "step": 8733 + }, + { + "epoch": 0.0519435721762299, + "grad_norm": 1.9856364727020264, + "learning_rate": 4.966794487959058e-05, + "loss": 5.3907, + "step": 8734 + }, + { + "epoch": 0.051949519459510894, + "grad_norm": 2.345541000366211, + "learning_rate": 4.966786899801718e-05, + "loss": 5.9875, + "step": 8735 + }, + { + "epoch": 0.05195546674279189, + "grad_norm": 2.4069056510925293, + "learning_rate": 4.9667793107832485e-05, + "loss": 6.0062, + "step": 8736 + }, + { + "epoch": 0.05196141402607289, + "grad_norm": 1.9191378355026245, + "learning_rate": 4.966771720903651e-05, + "loss": 6.1341, + "step": 8737 + }, + { + "epoch": 0.051967361309353886, + "grad_norm": 2.135986089706421, + "learning_rate": 4.9667641301629284e-05, + "loss": 5.6993, + "step": 8738 + }, + { + "epoch": 0.05197330859263488, + "grad_norm": 2.0774824619293213, + "learning_rate": 4.966756538561085e-05, + "loss": 5.9791, + "step": 8739 + }, + { + "epoch": 0.051979255875915883, + "grad_norm": 2.1451659202575684, + "learning_rate": 4.9667489460981224e-05, + "loss": 5.8181, + "step": 8740 + }, + { + "epoch": 0.05198520315919688, + "grad_norm": 2.2769901752471924, + "learning_rate": 4.966741352774043e-05, + "loss": 5.6799, + "step": 8741 + }, + { + "epoch": 0.051991150442477874, + "grad_norm": 2.22038197517395, + "learning_rate": 4.9667337585888494e-05, + "loss": 5.8781, + "step": 8742 + }, + { + "epoch": 0.051997097725758876, + "grad_norm": 2.417508125305176, + "learning_rate": 4.9667261635425446e-05, + "loss": 5.3458, + "step": 8743 + }, + { + "epoch": 0.05200304500903987, + "grad_norm": 2.0334360599517822, + "learning_rate": 4.966718567635131e-05, + "loss": 5.5241, + "step": 8744 + }, + { + "epoch": 0.052008992292320866, + "grad_norm": 2.3476316928863525, + "learning_rate": 4.9667109708666126e-05, + "loss": 5.8786, + "step": 8745 + }, + { + "epoch": 0.05201493957560187, + "grad_norm": 2.160106897354126, + "learning_rate": 4.96670337323699e-05, + "loss": 5.616, + "step": 8746 + }, + { + "epoch": 0.05202088685888286, + "grad_norm": 2.0048086643218994, + "learning_rate": 4.9666957747462665e-05, + "loss": 5.5787, + "step": 8747 + }, + { + "epoch": 0.05202683414216386, + "grad_norm": 2.9226925373077393, + "learning_rate": 4.966688175394446e-05, + "loss": 5.3708, + "step": 8748 + }, + { + "epoch": 0.05203278142544486, + "grad_norm": 1.9020568132400513, + "learning_rate": 4.9666805751815294e-05, + "loss": 5.6037, + "step": 8749 + }, + { + "epoch": 0.052038728708725855, + "grad_norm": 2.218637466430664, + "learning_rate": 4.966672974107519e-05, + "loss": 5.2983, + "step": 8750 + }, + { + "epoch": 0.05204467599200685, + "grad_norm": 2.906625270843506, + "learning_rate": 4.96666537217242e-05, + "loss": 5.1234, + "step": 8751 + }, + { + "epoch": 0.052050623275287845, + "grad_norm": 2.0095551013946533, + "learning_rate": 4.966657769376234e-05, + "loss": 5.2695, + "step": 8752 + }, + { + "epoch": 0.05205657055856885, + "grad_norm": 2.1369643211364746, + "learning_rate": 4.966650165718963e-05, + "loss": 5.5426, + "step": 8753 + }, + { + "epoch": 0.05206251784184984, + "grad_norm": 2.4762122631073, + "learning_rate": 4.966642561200608e-05, + "loss": 5.5595, + "step": 8754 + }, + { + "epoch": 0.05206846512513084, + "grad_norm": 2.199430227279663, + "learning_rate": 4.966634955821176e-05, + "loss": 5.5155, + "step": 8755 + }, + { + "epoch": 0.05207441240841184, + "grad_norm": 2.132460355758667, + "learning_rate": 4.966627349580666e-05, + "loss": 5.5344, + "step": 8756 + }, + { + "epoch": 0.052080359691692835, + "grad_norm": 2.4437100887298584, + "learning_rate": 4.966619742479082e-05, + "loss": 5.0135, + "step": 8757 + }, + { + "epoch": 0.05208630697497383, + "grad_norm": 1.5223499536514282, + "learning_rate": 4.9666121345164265e-05, + "loss": 5.5467, + "step": 8758 + }, + { + "epoch": 0.05209225425825483, + "grad_norm": 2.101797580718994, + "learning_rate": 4.966604525692702e-05, + "loss": 5.9493, + "step": 8759 + }, + { + "epoch": 0.05209820154153583, + "grad_norm": 1.9338927268981934, + "learning_rate": 4.966596916007912e-05, + "loss": 5.6625, + "step": 8760 + }, + { + "epoch": 0.05210414882481682, + "grad_norm": 2.1328654289245605, + "learning_rate": 4.966589305462058e-05, + "loss": 6.3202, + "step": 8761 + }, + { + "epoch": 0.052110096108097824, + "grad_norm": 1.963287115097046, + "learning_rate": 4.9665816940551434e-05, + "loss": 5.8885, + "step": 8762 + }, + { + "epoch": 0.05211604339137882, + "grad_norm": 2.124155282974243, + "learning_rate": 4.96657408178717e-05, + "loss": 5.6015, + "step": 8763 + }, + { + "epoch": 0.052121990674659814, + "grad_norm": 2.1011505126953125, + "learning_rate": 4.966566468658142e-05, + "loss": 5.7786, + "step": 8764 + }, + { + "epoch": 0.05212793795794081, + "grad_norm": 1.769573450088501, + "learning_rate": 4.966558854668061e-05, + "loss": 5.8229, + "step": 8765 + }, + { + "epoch": 0.05213388524122181, + "grad_norm": 1.7712751626968384, + "learning_rate": 4.966551239816929e-05, + "loss": 5.733, + "step": 8766 + }, + { + "epoch": 0.052139832524502806, + "grad_norm": 1.68185555934906, + "learning_rate": 4.9665436241047503e-05, + "loss": 6.015, + "step": 8767 + }, + { + "epoch": 0.0521457798077838, + "grad_norm": 1.8619519472122192, + "learning_rate": 4.966536007531526e-05, + "loss": 5.9545, + "step": 8768 + }, + { + "epoch": 0.0521517270910648, + "grad_norm": 1.6538097858428955, + "learning_rate": 4.96652839009726e-05, + "loss": 5.6138, + "step": 8769 + }, + { + "epoch": 0.0521576743743458, + "grad_norm": 1.721737027168274, + "learning_rate": 4.966520771801955e-05, + "loss": 6.0001, + "step": 8770 + }, + { + "epoch": 0.052163621657626794, + "grad_norm": 1.8449060916900635, + "learning_rate": 4.966513152645612e-05, + "loss": 5.6811, + "step": 8771 + }, + { + "epoch": 0.052169568940907796, + "grad_norm": 2.3810017108917236, + "learning_rate": 4.966505532628235e-05, + "loss": 5.4662, + "step": 8772 + }, + { + "epoch": 0.05217551622418879, + "grad_norm": 2.9262144565582275, + "learning_rate": 4.9664979117498265e-05, + "loss": 5.3555, + "step": 8773 + }, + { + "epoch": 0.052181463507469786, + "grad_norm": 2.1560001373291016, + "learning_rate": 4.966490290010389e-05, + "loss": 5.988, + "step": 8774 + }, + { + "epoch": 0.05218741079075079, + "grad_norm": 1.8220587968826294, + "learning_rate": 4.966482667409925e-05, + "loss": 5.8334, + "step": 8775 + }, + { + "epoch": 0.05219335807403178, + "grad_norm": 2.393651008605957, + "learning_rate": 4.9664750439484375e-05, + "loss": 5.5866, + "step": 8776 + }, + { + "epoch": 0.05219930535731278, + "grad_norm": 2.193864583969116, + "learning_rate": 4.966467419625929e-05, + "loss": 5.6642, + "step": 8777 + }, + { + "epoch": 0.05220525264059378, + "grad_norm": 2.24094820022583, + "learning_rate": 4.966459794442403e-05, + "loss": 5.7149, + "step": 8778 + }, + { + "epoch": 0.052211199923874775, + "grad_norm": 2.447439670562744, + "learning_rate": 4.9664521683978606e-05, + "loss": 5.4759, + "step": 8779 + }, + { + "epoch": 0.05221714720715577, + "grad_norm": 1.9538700580596924, + "learning_rate": 4.9664445414923055e-05, + "loss": 5.7, + "step": 8780 + }, + { + "epoch": 0.052223094490436765, + "grad_norm": 1.8960500955581665, + "learning_rate": 4.966436913725739e-05, + "loss": 5.7852, + "step": 8781 + }, + { + "epoch": 0.05222904177371777, + "grad_norm": 1.9234421253204346, + "learning_rate": 4.966429285098166e-05, + "loss": 5.9842, + "step": 8782 + }, + { + "epoch": 0.05223498905699876, + "grad_norm": 2.2879858016967773, + "learning_rate": 4.966421655609588e-05, + "loss": 5.6572, + "step": 8783 + }, + { + "epoch": 0.05224093634027976, + "grad_norm": 2.287932872772217, + "learning_rate": 4.966414025260008e-05, + "loss": 6.0675, + "step": 8784 + }, + { + "epoch": 0.05224688362356076, + "grad_norm": 1.6395118236541748, + "learning_rate": 4.9664063940494275e-05, + "loss": 5.6846, + "step": 8785 + }, + { + "epoch": 0.052252830906841755, + "grad_norm": 1.7121644020080566, + "learning_rate": 4.966398761977851e-05, + "loss": 5.7014, + "step": 8786 + }, + { + "epoch": 0.05225877819012275, + "grad_norm": 1.6225544214248657, + "learning_rate": 4.966391129045279e-05, + "loss": 5.6152, + "step": 8787 + }, + { + "epoch": 0.05226472547340375, + "grad_norm": 1.8484382629394531, + "learning_rate": 4.966383495251716e-05, + "loss": 5.8109, + "step": 8788 + }, + { + "epoch": 0.05227067275668475, + "grad_norm": 1.8225692510604858, + "learning_rate": 4.966375860597164e-05, + "loss": 6.0587, + "step": 8789 + }, + { + "epoch": 0.05227662003996574, + "grad_norm": 2.0333876609802246, + "learning_rate": 4.9663682250816255e-05, + "loss": 6.1406, + "step": 8790 + }, + { + "epoch": 0.052282567323246744, + "grad_norm": 2.0004124641418457, + "learning_rate": 4.9663605887051036e-05, + "loss": 5.6227, + "step": 8791 + }, + { + "epoch": 0.05228851460652774, + "grad_norm": 1.723655343055725, + "learning_rate": 4.9663529514676005e-05, + "loss": 5.5013, + "step": 8792 + }, + { + "epoch": 0.052294461889808734, + "grad_norm": 1.8351995944976807, + "learning_rate": 4.966345313369119e-05, + "loss": 5.3327, + "step": 8793 + }, + { + "epoch": 0.05230040917308973, + "grad_norm": 1.7514569759368896, + "learning_rate": 4.9663376744096615e-05, + "loss": 5.235, + "step": 8794 + }, + { + "epoch": 0.05230635645637073, + "grad_norm": 1.6678166389465332, + "learning_rate": 4.966330034589232e-05, + "loss": 5.2269, + "step": 8795 + }, + { + "epoch": 0.052312303739651726, + "grad_norm": 1.82132887840271, + "learning_rate": 4.9663223939078315e-05, + "loss": 5.0288, + "step": 8796 + }, + { + "epoch": 0.05231825102293272, + "grad_norm": 1.7815704345703125, + "learning_rate": 4.966314752365463e-05, + "loss": 5.4489, + "step": 8797 + }, + { + "epoch": 0.05232419830621372, + "grad_norm": 2.5268197059631348, + "learning_rate": 4.96630710996213e-05, + "loss": 5.0321, + "step": 8798 + }, + { + "epoch": 0.05233014558949472, + "grad_norm": 2.921208620071411, + "learning_rate": 4.9662994666978346e-05, + "loss": 5.0826, + "step": 8799 + }, + { + "epoch": 0.052336092872775714, + "grad_norm": 2.83243727684021, + "learning_rate": 4.9662918225725794e-05, + "loss": 4.9754, + "step": 8800 + }, + { + "epoch": 0.052342040156056716, + "grad_norm": 2.960346221923828, + "learning_rate": 4.966284177586368e-05, + "loss": 5.5808, + "step": 8801 + }, + { + "epoch": 0.05234798743933771, + "grad_norm": 2.479055643081665, + "learning_rate": 4.966276531739201e-05, + "loss": 5.3779, + "step": 8802 + }, + { + "epoch": 0.052353934722618706, + "grad_norm": 2.8753128051757812, + "learning_rate": 4.966268885031083e-05, + "loss": 5.4023, + "step": 8803 + }, + { + "epoch": 0.05235988200589971, + "grad_norm": 2.1152822971343994, + "learning_rate": 4.966261237462016e-05, + "loss": 6.1181, + "step": 8804 + }, + { + "epoch": 0.0523658292891807, + "grad_norm": 2.7178313732147217, + "learning_rate": 4.966253589032003e-05, + "loss": 5.1597, + "step": 8805 + }, + { + "epoch": 0.0523717765724617, + "grad_norm": 2.6567695140838623, + "learning_rate": 4.966245939741045e-05, + "loss": 5.0582, + "step": 8806 + }, + { + "epoch": 0.0523777238557427, + "grad_norm": 3.0211431980133057, + "learning_rate": 4.966238289589147e-05, + "loss": 4.8331, + "step": 8807 + }, + { + "epoch": 0.052383671139023695, + "grad_norm": 2.9341561794281006, + "learning_rate": 4.9662306385763114e-05, + "loss": 4.8482, + "step": 8808 + }, + { + "epoch": 0.05238961842230469, + "grad_norm": 2.781118631362915, + "learning_rate": 4.966222986702539e-05, + "loss": 4.9199, + "step": 8809 + }, + { + "epoch": 0.052395565705585685, + "grad_norm": 2.459233283996582, + "learning_rate": 4.9662153339678344e-05, + "loss": 5.4156, + "step": 8810 + }, + { + "epoch": 0.05240151298886669, + "grad_norm": 1.9862231016159058, + "learning_rate": 4.966207680372199e-05, + "loss": 5.3937, + "step": 8811 + }, + { + "epoch": 0.05240746027214768, + "grad_norm": 3.3698437213897705, + "learning_rate": 4.966200025915636e-05, + "loss": 4.6231, + "step": 8812 + }, + { + "epoch": 0.05241340755542868, + "grad_norm": 2.9254424571990967, + "learning_rate": 4.9661923705981486e-05, + "loss": 4.5612, + "step": 8813 + }, + { + "epoch": 0.05241935483870968, + "grad_norm": 2.684386968612671, + "learning_rate": 4.966184714419738e-05, + "loss": 4.8646, + "step": 8814 + }, + { + "epoch": 0.052425302121990675, + "grad_norm": 2.812406539916992, + "learning_rate": 4.966177057380409e-05, + "loss": 4.5116, + "step": 8815 + }, + { + "epoch": 0.05243124940527167, + "grad_norm": 2.1739046573638916, + "learning_rate": 4.966169399480162e-05, + "loss": 5.3369, + "step": 8816 + }, + { + "epoch": 0.05243719668855267, + "grad_norm": 2.408341407775879, + "learning_rate": 4.966161740719001e-05, + "loss": 5.0368, + "step": 8817 + }, + { + "epoch": 0.05244314397183367, + "grad_norm": 2.2844927310943604, + "learning_rate": 4.966154081096929e-05, + "loss": 5.0657, + "step": 8818 + }, + { + "epoch": 0.05244909125511466, + "grad_norm": 2.5329723358154297, + "learning_rate": 4.9661464206139475e-05, + "loss": 5.2006, + "step": 8819 + }, + { + "epoch": 0.052455038538395664, + "grad_norm": 2.154224395751953, + "learning_rate": 4.9661387592700595e-05, + "loss": 5.238, + "step": 8820 + }, + { + "epoch": 0.05246098582167666, + "grad_norm": 2.1069657802581787, + "learning_rate": 4.966131097065269e-05, + "loss": 5.0894, + "step": 8821 + }, + { + "epoch": 0.052466933104957654, + "grad_norm": 2.165954351425171, + "learning_rate": 4.9661234339995763e-05, + "loss": 5.1148, + "step": 8822 + }, + { + "epoch": 0.052472880388238656, + "grad_norm": 1.8859459161758423, + "learning_rate": 4.9661157700729866e-05, + "loss": 5.1703, + "step": 8823 + }, + { + "epoch": 0.05247882767151965, + "grad_norm": 1.9739452600479126, + "learning_rate": 4.9661081052855004e-05, + "loss": 5.3978, + "step": 8824 + }, + { + "epoch": 0.052484774954800646, + "grad_norm": 1.95566987991333, + "learning_rate": 4.966100439637122e-05, + "loss": 5.3592, + "step": 8825 + }, + { + "epoch": 0.05249072223808164, + "grad_norm": 1.8613550662994385, + "learning_rate": 4.966092773127853e-05, + "loss": 5.3746, + "step": 8826 + }, + { + "epoch": 0.05249666952136264, + "grad_norm": 2.001701831817627, + "learning_rate": 4.9660851057576966e-05, + "loss": 5.3269, + "step": 8827 + }, + { + "epoch": 0.05250261680464364, + "grad_norm": 1.8846383094787598, + "learning_rate": 4.9660774375266556e-05, + "loss": 5.7906, + "step": 8828 + }, + { + "epoch": 0.052508564087924633, + "grad_norm": 1.982998251914978, + "learning_rate": 4.966069768434732e-05, + "loss": 5.6609, + "step": 8829 + }, + { + "epoch": 0.052514511371205636, + "grad_norm": 2.3036038875579834, + "learning_rate": 4.9660620984819294e-05, + "loss": 5.6172, + "step": 8830 + }, + { + "epoch": 0.05252045865448663, + "grad_norm": 1.9227113723754883, + "learning_rate": 4.9660544276682496e-05, + "loss": 5.4734, + "step": 8831 + }, + { + "epoch": 0.052526405937767626, + "grad_norm": 2.038203716278076, + "learning_rate": 4.9660467559936964e-05, + "loss": 5.6484, + "step": 8832 + }, + { + "epoch": 0.05253235322104863, + "grad_norm": 2.217108964920044, + "learning_rate": 4.9660390834582704e-05, + "loss": 5.4064, + "step": 8833 + }, + { + "epoch": 0.05253830050432962, + "grad_norm": 2.4458765983581543, + "learning_rate": 4.966031410061976e-05, + "loss": 5.605, + "step": 8834 + }, + { + "epoch": 0.05254424778761062, + "grad_norm": 2.2767014503479004, + "learning_rate": 4.966023735804817e-05, + "loss": 5.4258, + "step": 8835 + }, + { + "epoch": 0.05255019507089162, + "grad_norm": 2.3594579696655273, + "learning_rate": 4.9660160606867936e-05, + "loss": 5.5138, + "step": 8836 + }, + { + "epoch": 0.052556142354172615, + "grad_norm": 1.8961461782455444, + "learning_rate": 4.966008384707909e-05, + "loss": 5.9879, + "step": 8837 + }, + { + "epoch": 0.05256208963745361, + "grad_norm": 1.824751615524292, + "learning_rate": 4.966000707868167e-05, + "loss": 5.4558, + "step": 8838 + }, + { + "epoch": 0.052568036920734605, + "grad_norm": 2.005291223526001, + "learning_rate": 4.9659930301675694e-05, + "loss": 5.821, + "step": 8839 + }, + { + "epoch": 0.05257398420401561, + "grad_norm": 2.0951414108276367, + "learning_rate": 4.965985351606119e-05, + "loss": 5.2816, + "step": 8840 + }, + { + "epoch": 0.0525799314872966, + "grad_norm": 2.236849069595337, + "learning_rate": 4.9659776721838194e-05, + "loss": 5.4734, + "step": 8841 + }, + { + "epoch": 0.0525858787705776, + "grad_norm": 1.8877390623092651, + "learning_rate": 4.965969991900671e-05, + "loss": 5.2445, + "step": 8842 + }, + { + "epoch": 0.0525918260538586, + "grad_norm": 2.726071834564209, + "learning_rate": 4.9659623107566785e-05, + "loss": 5.6059, + "step": 8843 + }, + { + "epoch": 0.052597773337139594, + "grad_norm": 2.279759168624878, + "learning_rate": 4.965954628751844e-05, + "loss": 5.6755, + "step": 8844 + }, + { + "epoch": 0.05260372062042059, + "grad_norm": 1.9941623210906982, + "learning_rate": 4.965946945886171e-05, + "loss": 5.5222, + "step": 8845 + }, + { + "epoch": 0.05260966790370159, + "grad_norm": 2.0556750297546387, + "learning_rate": 4.965939262159661e-05, + "loss": 5.6064, + "step": 8846 + }, + { + "epoch": 0.05261561518698259, + "grad_norm": 1.9260958433151245, + "learning_rate": 4.965931577572317e-05, + "loss": 5.6264, + "step": 8847 + }, + { + "epoch": 0.05262156247026358, + "grad_norm": 2.1252758502960205, + "learning_rate": 4.9659238921241413e-05, + "loss": 5.9832, + "step": 8848 + }, + { + "epoch": 0.052627509753544584, + "grad_norm": 1.8081480264663696, + "learning_rate": 4.9659162058151377e-05, + "loss": 5.4391, + "step": 8849 + }, + { + "epoch": 0.05263345703682558, + "grad_norm": 1.8439849615097046, + "learning_rate": 4.965908518645308e-05, + "loss": 5.5351, + "step": 8850 + }, + { + "epoch": 0.052639404320106574, + "grad_norm": 2.1782681941986084, + "learning_rate": 4.9659008306146556e-05, + "loss": 5.9692, + "step": 8851 + }, + { + "epoch": 0.052645351603387576, + "grad_norm": 2.0206944942474365, + "learning_rate": 4.965893141723182e-05, + "loss": 5.4736, + "step": 8852 + }, + { + "epoch": 0.05265129888666857, + "grad_norm": 2.283517360687256, + "learning_rate": 4.965885451970891e-05, + "loss": 5.4504, + "step": 8853 + }, + { + "epoch": 0.052657246169949566, + "grad_norm": 2.701608180999756, + "learning_rate": 4.965877761357784e-05, + "loss": 5.318, + "step": 8854 + }, + { + "epoch": 0.05266319345323056, + "grad_norm": 2.8494722843170166, + "learning_rate": 4.965870069883866e-05, + "loss": 4.9835, + "step": 8855 + }, + { + "epoch": 0.05266914073651156, + "grad_norm": 2.0555408000946045, + "learning_rate": 4.965862377549137e-05, + "loss": 5.7587, + "step": 8856 + }, + { + "epoch": 0.05267508801979256, + "grad_norm": 2.3476004600524902, + "learning_rate": 4.9658546843536014e-05, + "loss": 5.8775, + "step": 8857 + }, + { + "epoch": 0.05268103530307355, + "grad_norm": 1.8152700662612915, + "learning_rate": 4.965846990297262e-05, + "loss": 5.6274, + "step": 8858 + }, + { + "epoch": 0.052686982586354555, + "grad_norm": 2.1541671752929688, + "learning_rate": 4.965839295380119e-05, + "loss": 5.6786, + "step": 8859 + }, + { + "epoch": 0.05269292986963555, + "grad_norm": 2.1708984375, + "learning_rate": 4.965831599602179e-05, + "loss": 5.8817, + "step": 8860 + }, + { + "epoch": 0.052698877152916546, + "grad_norm": 1.6558966636657715, + "learning_rate": 4.9658239029634415e-05, + "loss": 5.5375, + "step": 8861 + }, + { + "epoch": 0.05270482443619755, + "grad_norm": 2.1165130138397217, + "learning_rate": 4.9658162054639115e-05, + "loss": 5.5936, + "step": 8862 + }, + { + "epoch": 0.05271077171947854, + "grad_norm": 2.4143176078796387, + "learning_rate": 4.9658085071035893e-05, + "loss": 5.71, + "step": 8863 + }, + { + "epoch": 0.05271671900275954, + "grad_norm": 1.9471622705459595, + "learning_rate": 4.965800807882479e-05, + "loss": 5.7588, + "step": 8864 + }, + { + "epoch": 0.05272266628604054, + "grad_norm": 2.2014408111572266, + "learning_rate": 4.9657931078005835e-05, + "loss": 5.7699, + "step": 8865 + }, + { + "epoch": 0.052728613569321535, + "grad_norm": 1.7588191032409668, + "learning_rate": 4.965785406857905e-05, + "loss": 5.3921, + "step": 8866 + }, + { + "epoch": 0.05273456085260253, + "grad_norm": 1.835635781288147, + "learning_rate": 4.965777705054446e-05, + "loss": 5.1531, + "step": 8867 + }, + { + "epoch": 0.052740508135883525, + "grad_norm": 2.3071937561035156, + "learning_rate": 4.96577000239021e-05, + "loss": 5.5926, + "step": 8868 + }, + { + "epoch": 0.05274645541916453, + "grad_norm": 2.195712089538574, + "learning_rate": 4.9657622988651995e-05, + "loss": 5.4579, + "step": 8869 + }, + { + "epoch": 0.05275240270244552, + "grad_norm": 2.273738145828247, + "learning_rate": 4.9657545944794156e-05, + "loss": 5.6138, + "step": 8870 + }, + { + "epoch": 0.05275834998572652, + "grad_norm": 2.208343982696533, + "learning_rate": 4.9657468892328626e-05, + "loss": 5.5508, + "step": 8871 + }, + { + "epoch": 0.05276429726900752, + "grad_norm": 2.2111566066741943, + "learning_rate": 4.965739183125544e-05, + "loss": 5.7044, + "step": 8872 + }, + { + "epoch": 0.052770244552288514, + "grad_norm": 1.7516666650772095, + "learning_rate": 4.96573147615746e-05, + "loss": 5.4357, + "step": 8873 + }, + { + "epoch": 0.05277619183556951, + "grad_norm": 2.0703322887420654, + "learning_rate": 4.9657237683286155e-05, + "loss": 5.5383, + "step": 8874 + }, + { + "epoch": 0.05278213911885051, + "grad_norm": 1.796243667602539, + "learning_rate": 4.965716059639012e-05, + "loss": 5.5024, + "step": 8875 + }, + { + "epoch": 0.05278808640213151, + "grad_norm": 2.322397232055664, + "learning_rate": 4.9657083500886526e-05, + "loss": 5.8814, + "step": 8876 + }, + { + "epoch": 0.0527940336854125, + "grad_norm": 2.6743311882019043, + "learning_rate": 4.96570063967754e-05, + "loss": 5.4989, + "step": 8877 + }, + { + "epoch": 0.052799980968693504, + "grad_norm": 2.4381649494171143, + "learning_rate": 4.965692928405676e-05, + "loss": 5.5807, + "step": 8878 + }, + { + "epoch": 0.0528059282519745, + "grad_norm": 2.3703296184539795, + "learning_rate": 4.9656852162730646e-05, + "loss": 5.5586, + "step": 8879 + }, + { + "epoch": 0.052811875535255494, + "grad_norm": 1.7828437089920044, + "learning_rate": 4.9656775032797075e-05, + "loss": 5.2553, + "step": 8880 + }, + { + "epoch": 0.052817822818536496, + "grad_norm": 1.730290412902832, + "learning_rate": 4.9656697894256085e-05, + "loss": 5.3558, + "step": 8881 + }, + { + "epoch": 0.05282377010181749, + "grad_norm": 1.6909739971160889, + "learning_rate": 4.9656620747107694e-05, + "loss": 5.4397, + "step": 8882 + }, + { + "epoch": 0.052829717385098486, + "grad_norm": 1.9772145748138428, + "learning_rate": 4.965654359135193e-05, + "loss": 5.5786, + "step": 8883 + }, + { + "epoch": 0.05283566466837948, + "grad_norm": 1.8624964952468872, + "learning_rate": 4.965646642698883e-05, + "loss": 5.5466, + "step": 8884 + }, + { + "epoch": 0.05284161195166048, + "grad_norm": 1.7061936855316162, + "learning_rate": 4.96563892540184e-05, + "loss": 5.3439, + "step": 8885 + }, + { + "epoch": 0.05284755923494148, + "grad_norm": 1.715483546257019, + "learning_rate": 4.965631207244069e-05, + "loss": 5.2732, + "step": 8886 + }, + { + "epoch": 0.05285350651822247, + "grad_norm": 1.7801883220672607, + "learning_rate": 4.965623488225571e-05, + "loss": 5.2427, + "step": 8887 + }, + { + "epoch": 0.052859453801503475, + "grad_norm": 1.5122452974319458, + "learning_rate": 4.9656157683463495e-05, + "loss": 5.2812, + "step": 8888 + }, + { + "epoch": 0.05286540108478447, + "grad_norm": 1.878077507019043, + "learning_rate": 4.965608047606407e-05, + "loss": 5.6385, + "step": 8889 + }, + { + "epoch": 0.052871348368065466, + "grad_norm": 2.0781304836273193, + "learning_rate": 4.965600326005746e-05, + "loss": 5.3345, + "step": 8890 + }, + { + "epoch": 0.05287729565134647, + "grad_norm": 1.953302264213562, + "learning_rate": 4.965592603544369e-05, + "loss": 5.2694, + "step": 8891 + }, + { + "epoch": 0.05288324293462746, + "grad_norm": 1.9993265867233276, + "learning_rate": 4.96558488022228e-05, + "loss": 5.3323, + "step": 8892 + }, + { + "epoch": 0.05288919021790846, + "grad_norm": 1.7653480768203735, + "learning_rate": 4.96557715603948e-05, + "loss": 5.389, + "step": 8893 + }, + { + "epoch": 0.05289513750118946, + "grad_norm": 1.8843438625335693, + "learning_rate": 4.965569430995973e-05, + "loss": 5.3334, + "step": 8894 + }, + { + "epoch": 0.052901084784470455, + "grad_norm": 1.6673407554626465, + "learning_rate": 4.9655617050917616e-05, + "loss": 5.4469, + "step": 8895 + }, + { + "epoch": 0.05290703206775145, + "grad_norm": 1.8208844661712646, + "learning_rate": 4.9655539783268476e-05, + "loss": 5.6288, + "step": 8896 + }, + { + "epoch": 0.052912979351032445, + "grad_norm": 1.755162000656128, + "learning_rate": 4.965546250701234e-05, + "loss": 5.4388, + "step": 8897 + }, + { + "epoch": 0.05291892663431345, + "grad_norm": 1.9435405731201172, + "learning_rate": 4.965538522214924e-05, + "loss": 5.5877, + "step": 8898 + }, + { + "epoch": 0.05292487391759444, + "grad_norm": 1.8579509258270264, + "learning_rate": 4.9655307928679196e-05, + "loss": 5.4405, + "step": 8899 + }, + { + "epoch": 0.05293082120087544, + "grad_norm": 1.8897236585617065, + "learning_rate": 4.9655230626602246e-05, + "loss": 5.2931, + "step": 8900 + }, + { + "epoch": 0.05293676848415644, + "grad_norm": 1.928133487701416, + "learning_rate": 4.9655153315918403e-05, + "loss": 5.2345, + "step": 8901 + }, + { + "epoch": 0.052942715767437434, + "grad_norm": 1.8830339908599854, + "learning_rate": 4.96550759966277e-05, + "loss": 5.3288, + "step": 8902 + }, + { + "epoch": 0.05294866305071843, + "grad_norm": 1.6774102449417114, + "learning_rate": 4.9654998668730167e-05, + "loss": 5.2939, + "step": 8903 + }, + { + "epoch": 0.05295461033399943, + "grad_norm": 1.7440418004989624, + "learning_rate": 4.9654921332225826e-05, + "loss": 5.4663, + "step": 8904 + }, + { + "epoch": 0.05296055761728043, + "grad_norm": 1.92295241355896, + "learning_rate": 4.965484398711471e-05, + "loss": 5.556, + "step": 8905 + }, + { + "epoch": 0.05296650490056142, + "grad_norm": 1.5319017171859741, + "learning_rate": 4.965476663339684e-05, + "loss": 5.5267, + "step": 8906 + }, + { + "epoch": 0.052972452183842424, + "grad_norm": 1.7626374959945679, + "learning_rate": 4.9654689271072255e-05, + "loss": 5.3774, + "step": 8907 + }, + { + "epoch": 0.05297839946712342, + "grad_norm": 1.745743989944458, + "learning_rate": 4.965461190014096e-05, + "loss": 5.4877, + "step": 8908 + }, + { + "epoch": 0.052984346750404414, + "grad_norm": 1.6091177463531494, + "learning_rate": 4.9654534520603e-05, + "loss": 5.2969, + "step": 8909 + }, + { + "epoch": 0.052990294033685416, + "grad_norm": 1.7392489910125732, + "learning_rate": 4.96544571324584e-05, + "loss": 5.4247, + "step": 8910 + }, + { + "epoch": 0.05299624131696641, + "grad_norm": 1.9275293350219727, + "learning_rate": 4.965437973570718e-05, + "loss": 5.2184, + "step": 8911 + }, + { + "epoch": 0.053002188600247406, + "grad_norm": 1.6901222467422485, + "learning_rate": 4.965430233034937e-05, + "loss": 5.1459, + "step": 8912 + }, + { + "epoch": 0.0530081358835284, + "grad_norm": 1.9212596416473389, + "learning_rate": 4.965422491638499e-05, + "loss": 5.2439, + "step": 8913 + }, + { + "epoch": 0.0530140831668094, + "grad_norm": 1.814706802368164, + "learning_rate": 4.965414749381409e-05, + "loss": 5.5608, + "step": 8914 + }, + { + "epoch": 0.0530200304500904, + "grad_norm": 1.7997081279754639, + "learning_rate": 4.965407006263668e-05, + "loss": 5.6099, + "step": 8915 + }, + { + "epoch": 0.05302597773337139, + "grad_norm": 1.8545546531677246, + "learning_rate": 4.9653992622852777e-05, + "loss": 5.5844, + "step": 8916 + }, + { + "epoch": 0.053031925016652395, + "grad_norm": 1.665958285331726, + "learning_rate": 4.965391517446243e-05, + "loss": 5.4967, + "step": 8917 + }, + { + "epoch": 0.05303787229993339, + "grad_norm": 1.6157240867614746, + "learning_rate": 4.9653837717465655e-05, + "loss": 5.2523, + "step": 8918 + }, + { + "epoch": 0.053043819583214386, + "grad_norm": 1.9782540798187256, + "learning_rate": 4.965376025186248e-05, + "loss": 5.2384, + "step": 8919 + }, + { + "epoch": 0.05304976686649539, + "grad_norm": 2.0229971408843994, + "learning_rate": 4.9653682777652925e-05, + "loss": 5.1703, + "step": 8920 + }, + { + "epoch": 0.05305571414977638, + "grad_norm": 1.8299061059951782, + "learning_rate": 4.965360529483703e-05, + "loss": 5.0257, + "step": 8921 + }, + { + "epoch": 0.05306166143305738, + "grad_norm": 1.9080857038497925, + "learning_rate": 4.965352780341482e-05, + "loss": 5.2516, + "step": 8922 + }, + { + "epoch": 0.05306760871633838, + "grad_norm": 1.9998538494110107, + "learning_rate": 4.965345030338631e-05, + "loss": 5.1991, + "step": 8923 + }, + { + "epoch": 0.053073555999619375, + "grad_norm": 1.7606618404388428, + "learning_rate": 4.965337279475154e-05, + "loss": 5.2194, + "step": 8924 + }, + { + "epoch": 0.05307950328290037, + "grad_norm": 1.9633625745773315, + "learning_rate": 4.9653295277510525e-05, + "loss": 5.2463, + "step": 8925 + }, + { + "epoch": 0.053085450566181365, + "grad_norm": 1.9879587888717651, + "learning_rate": 4.9653217751663306e-05, + "loss": 5.2737, + "step": 8926 + }, + { + "epoch": 0.05309139784946237, + "grad_norm": 1.836289405822754, + "learning_rate": 4.965314021720991e-05, + "loss": 5.1157, + "step": 8927 + }, + { + "epoch": 0.05309734513274336, + "grad_norm": 1.8526496887207031, + "learning_rate": 4.965306267415035e-05, + "loss": 5.6541, + "step": 8928 + }, + { + "epoch": 0.05310329241602436, + "grad_norm": 1.9928539991378784, + "learning_rate": 4.965298512248466e-05, + "loss": 5.194, + "step": 8929 + }, + { + "epoch": 0.05310923969930536, + "grad_norm": 1.601536512374878, + "learning_rate": 4.9652907562212867e-05, + "loss": 5.285, + "step": 8930 + }, + { + "epoch": 0.053115186982586354, + "grad_norm": 1.8940081596374512, + "learning_rate": 4.9652829993335e-05, + "loss": 5.1791, + "step": 8931 + }, + { + "epoch": 0.05312113426586735, + "grad_norm": 1.7984519004821777, + "learning_rate": 4.9652752415851085e-05, + "loss": 5.2225, + "step": 8932 + }, + { + "epoch": 0.05312708154914835, + "grad_norm": 1.7474113702774048, + "learning_rate": 4.965267482976115e-05, + "loss": 5.0099, + "step": 8933 + }, + { + "epoch": 0.053133028832429346, + "grad_norm": 1.7044427394866943, + "learning_rate": 4.9652597235065214e-05, + "loss": 5.1456, + "step": 8934 + }, + { + "epoch": 0.05313897611571034, + "grad_norm": 1.5422965288162231, + "learning_rate": 4.9652519631763316e-05, + "loss": 5.0714, + "step": 8935 + }, + { + "epoch": 0.053144923398991344, + "grad_norm": 1.6831375360488892, + "learning_rate": 4.965244201985548e-05, + "loss": 5.0742, + "step": 8936 + }, + { + "epoch": 0.05315087068227234, + "grad_norm": 1.7648097276687622, + "learning_rate": 4.9652364399341734e-05, + "loss": 5.1108, + "step": 8937 + }, + { + "epoch": 0.053156817965553334, + "grad_norm": 1.669393539428711, + "learning_rate": 4.965228677022209e-05, + "loss": 5.1801, + "step": 8938 + }, + { + "epoch": 0.053162765248834336, + "grad_norm": 2.0252909660339355, + "learning_rate": 4.96522091324966e-05, + "loss": 5.3955, + "step": 8939 + }, + { + "epoch": 0.05316871253211533, + "grad_norm": 1.686355710029602, + "learning_rate": 4.965213148616527e-05, + "loss": 5.2626, + "step": 8940 + }, + { + "epoch": 0.053174659815396326, + "grad_norm": 1.7601011991500854, + "learning_rate": 4.965205383122814e-05, + "loss": 5.1603, + "step": 8941 + }, + { + "epoch": 0.05318060709867732, + "grad_norm": 1.7249791622161865, + "learning_rate": 4.9651976167685235e-05, + "loss": 5.4245, + "step": 8942 + }, + { + "epoch": 0.05318655438195832, + "grad_norm": 1.869367003440857, + "learning_rate": 4.9651898495536574e-05, + "loss": 5.2269, + "step": 8943 + }, + { + "epoch": 0.05319250166523932, + "grad_norm": 1.8296380043029785, + "learning_rate": 4.965182081478219e-05, + "loss": 5.3236, + "step": 8944 + }, + { + "epoch": 0.05319844894852031, + "grad_norm": 1.8211008310317993, + "learning_rate": 4.9651743125422115e-05, + "loss": 5.269, + "step": 8945 + }, + { + "epoch": 0.053204396231801315, + "grad_norm": 1.868295431137085, + "learning_rate": 4.965166542745637e-05, + "loss": 5.2733, + "step": 8946 + }, + { + "epoch": 0.05321034351508231, + "grad_norm": 1.6603426933288574, + "learning_rate": 4.965158772088498e-05, + "loss": 5.2685, + "step": 8947 + }, + { + "epoch": 0.053216290798363305, + "grad_norm": 1.680565357208252, + "learning_rate": 4.965151000570798e-05, + "loss": 5.4452, + "step": 8948 + }, + { + "epoch": 0.05322223808164431, + "grad_norm": 1.6473147869110107, + "learning_rate": 4.9651432281925394e-05, + "loss": 5.4476, + "step": 8949 + }, + { + "epoch": 0.0532281853649253, + "grad_norm": 1.5291423797607422, + "learning_rate": 4.965135454953724e-05, + "loss": 5.4617, + "step": 8950 + }, + { + "epoch": 0.0532341326482063, + "grad_norm": 1.4708455801010132, + "learning_rate": 4.965127680854356e-05, + "loss": 5.5431, + "step": 8951 + }, + { + "epoch": 0.0532400799314873, + "grad_norm": 1.4297362565994263, + "learning_rate": 4.9651199058944366e-05, + "loss": 5.431, + "step": 8952 + }, + { + "epoch": 0.053246027214768295, + "grad_norm": 1.726123571395874, + "learning_rate": 4.96511213007397e-05, + "loss": 5.2801, + "step": 8953 + }, + { + "epoch": 0.05325197449804929, + "grad_norm": 1.7977174520492554, + "learning_rate": 4.9651043533929584e-05, + "loss": 5.3273, + "step": 8954 + }, + { + "epoch": 0.053257921781330285, + "grad_norm": 1.8125461339950562, + "learning_rate": 4.9650965758514034e-05, + "loss": 5.3135, + "step": 8955 + }, + { + "epoch": 0.05326386906461129, + "grad_norm": 1.4925352334976196, + "learning_rate": 4.965088797449309e-05, + "loss": 5.1454, + "step": 8956 + }, + { + "epoch": 0.05326981634789228, + "grad_norm": 1.6977181434631348, + "learning_rate": 4.965081018186678e-05, + "loss": 5.3207, + "step": 8957 + }, + { + "epoch": 0.05327576363117328, + "grad_norm": 1.7767595052719116, + "learning_rate": 4.965073238063512e-05, + "loss": 5.203, + "step": 8958 + }, + { + "epoch": 0.05328171091445428, + "grad_norm": 1.53665292263031, + "learning_rate": 4.965065457079815e-05, + "loss": 5.3088, + "step": 8959 + }, + { + "epoch": 0.053287658197735274, + "grad_norm": 1.724476933479309, + "learning_rate": 4.965057675235589e-05, + "loss": 5.2628, + "step": 8960 + }, + { + "epoch": 0.05329360548101627, + "grad_norm": 1.7339463233947754, + "learning_rate": 4.965049892530837e-05, + "loss": 5.3174, + "step": 8961 + }, + { + "epoch": 0.05329955276429727, + "grad_norm": 1.8414005041122437, + "learning_rate": 4.965042108965561e-05, + "loss": 5.2121, + "step": 8962 + }, + { + "epoch": 0.053305500047578266, + "grad_norm": 1.7969903945922852, + "learning_rate": 4.9650343245397655e-05, + "loss": 5.0947, + "step": 8963 + }, + { + "epoch": 0.05331144733085926, + "grad_norm": 1.573320746421814, + "learning_rate": 4.965026539253451e-05, + "loss": 5.0624, + "step": 8964 + }, + { + "epoch": 0.053317394614140264, + "grad_norm": 1.7296351194381714, + "learning_rate": 4.9650187531066204e-05, + "loss": 5.5497, + "step": 8965 + }, + { + "epoch": 0.05332334189742126, + "grad_norm": 1.931847095489502, + "learning_rate": 4.9650109660992784e-05, + "loss": 5.537, + "step": 8966 + }, + { + "epoch": 0.053329289180702254, + "grad_norm": 1.8911564350128174, + "learning_rate": 4.965003178231427e-05, + "loss": 5.4891, + "step": 8967 + }, + { + "epoch": 0.053335236463983256, + "grad_norm": 1.933401107788086, + "learning_rate": 4.964995389503067e-05, + "loss": 5.3157, + "step": 8968 + }, + { + "epoch": 0.05334118374726425, + "grad_norm": 1.8299031257629395, + "learning_rate": 4.964987599914204e-05, + "loss": 5.2955, + "step": 8969 + }, + { + "epoch": 0.053347131030545246, + "grad_norm": 1.5823233127593994, + "learning_rate": 4.964979809464838e-05, + "loss": 5.2708, + "step": 8970 + }, + { + "epoch": 0.05335307831382624, + "grad_norm": 1.602689504623413, + "learning_rate": 4.9649720181549737e-05, + "loss": 5.3646, + "step": 8971 + }, + { + "epoch": 0.05335902559710724, + "grad_norm": 2.2379884719848633, + "learning_rate": 4.964964225984613e-05, + "loss": 5.5453, + "step": 8972 + }, + { + "epoch": 0.05336497288038824, + "grad_norm": 2.2210440635681152, + "learning_rate": 4.964956432953759e-05, + "loss": 5.2123, + "step": 8973 + }, + { + "epoch": 0.05337092016366923, + "grad_norm": 2.4450249671936035, + "learning_rate": 4.964948639062413e-05, + "loss": 5.172, + "step": 8974 + }, + { + "epoch": 0.053376867446950235, + "grad_norm": 1.7727516889572144, + "learning_rate": 4.9649408443105806e-05, + "loss": 5.3447, + "step": 8975 + }, + { + "epoch": 0.05338281473023123, + "grad_norm": 1.8239831924438477, + "learning_rate": 4.964933048698262e-05, + "loss": 5.3628, + "step": 8976 + }, + { + "epoch": 0.053388762013512225, + "grad_norm": 1.9517360925674438, + "learning_rate": 4.964925252225461e-05, + "loss": 5.6118, + "step": 8977 + }, + { + "epoch": 0.05339470929679323, + "grad_norm": 2.1735262870788574, + "learning_rate": 4.9649174548921796e-05, + "loss": 5.7332, + "step": 8978 + }, + { + "epoch": 0.05340065658007422, + "grad_norm": 1.4132062196731567, + "learning_rate": 4.964909656698421e-05, + "loss": 5.8078, + "step": 8979 + }, + { + "epoch": 0.05340660386335522, + "grad_norm": 1.5568846464157104, + "learning_rate": 4.964901857644188e-05, + "loss": 5.6328, + "step": 8980 + }, + { + "epoch": 0.05341255114663622, + "grad_norm": 1.6015586853027344, + "learning_rate": 4.964894057729484e-05, + "loss": 5.3738, + "step": 8981 + }, + { + "epoch": 0.053418498429917215, + "grad_norm": 1.492748737335205, + "learning_rate": 4.9648862569543105e-05, + "loss": 5.4336, + "step": 8982 + }, + { + "epoch": 0.05342444571319821, + "grad_norm": 1.9008845090866089, + "learning_rate": 4.96487845531867e-05, + "loss": 5.455, + "step": 8983 + }, + { + "epoch": 0.053430392996479205, + "grad_norm": 1.9590948820114136, + "learning_rate": 4.9648706528225664e-05, + "loss": 5.3308, + "step": 8984 + }, + { + "epoch": 0.05343634027976021, + "grad_norm": 1.9980428218841553, + "learning_rate": 4.964862849466002e-05, + "loss": 5.3777, + "step": 8985 + }, + { + "epoch": 0.0534422875630412, + "grad_norm": 1.769711971282959, + "learning_rate": 4.964855045248979e-05, + "loss": 5.4451, + "step": 8986 + }, + { + "epoch": 0.0534482348463222, + "grad_norm": 1.769977331161499, + "learning_rate": 4.964847240171502e-05, + "loss": 5.277, + "step": 8987 + }, + { + "epoch": 0.0534541821296032, + "grad_norm": 1.6647396087646484, + "learning_rate": 4.9648394342335705e-05, + "loss": 5.4655, + "step": 8988 + }, + { + "epoch": 0.053460129412884194, + "grad_norm": 1.861554503440857, + "learning_rate": 4.9648316274351906e-05, + "loss": 5.308, + "step": 8989 + }, + { + "epoch": 0.05346607669616519, + "grad_norm": 1.9457745552062988, + "learning_rate": 4.964823819776362e-05, + "loss": 6.2361, + "step": 8990 + }, + { + "epoch": 0.05347202397944619, + "grad_norm": 1.7702157497406006, + "learning_rate": 4.9648160112570896e-05, + "loss": 5.366, + "step": 8991 + }, + { + "epoch": 0.053477971262727186, + "grad_norm": 2.0074565410614014, + "learning_rate": 4.964808201877375e-05, + "loss": 5.3598, + "step": 8992 + }, + { + "epoch": 0.05348391854600818, + "grad_norm": 1.8686721324920654, + "learning_rate": 4.964800391637222e-05, + "loss": 5.4607, + "step": 8993 + }, + { + "epoch": 0.053489865829289183, + "grad_norm": 1.9749736785888672, + "learning_rate": 4.964792580536632e-05, + "loss": 5.3734, + "step": 8994 + }, + { + "epoch": 0.05349581311257018, + "grad_norm": 1.8435015678405762, + "learning_rate": 4.964784768575609e-05, + "loss": 5.3815, + "step": 8995 + }, + { + "epoch": 0.053501760395851174, + "grad_norm": 2.01983380317688, + "learning_rate": 4.9647769557541546e-05, + "loss": 5.4089, + "step": 8996 + }, + { + "epoch": 0.053507707679132176, + "grad_norm": 2.014798402786255, + "learning_rate": 4.964769142072272e-05, + "loss": 5.3906, + "step": 8997 + }, + { + "epoch": 0.05351365496241317, + "grad_norm": 1.8822753429412842, + "learning_rate": 4.9647613275299644e-05, + "loss": 5.3598, + "step": 8998 + }, + { + "epoch": 0.053519602245694166, + "grad_norm": 1.6534459590911865, + "learning_rate": 4.9647535121272334e-05, + "loss": 5.4577, + "step": 8999 + }, + { + "epoch": 0.05352554952897516, + "grad_norm": 1.6497015953063965, + "learning_rate": 4.964745695864083e-05, + "loss": 5.3915, + "step": 9000 + }, + { + "epoch": 0.05353149681225616, + "grad_norm": 1.5535780191421509, + "learning_rate": 4.964737878740515e-05, + "loss": 5.2444, + "step": 9001 + }, + { + "epoch": 0.05353744409553716, + "grad_norm": 1.6840674877166748, + "learning_rate": 4.964730060756533e-05, + "loss": 5.3439, + "step": 9002 + }, + { + "epoch": 0.05354339137881815, + "grad_norm": 1.7857226133346558, + "learning_rate": 4.9647222419121384e-05, + "loss": 5.3231, + "step": 9003 + }, + { + "epoch": 0.053549338662099155, + "grad_norm": 1.6067994832992554, + "learning_rate": 4.964714422207335e-05, + "loss": 5.4019, + "step": 9004 + }, + { + "epoch": 0.05355528594538015, + "grad_norm": 1.7026724815368652, + "learning_rate": 4.964706601642125e-05, + "loss": 5.2716, + "step": 9005 + }, + { + "epoch": 0.053561233228661145, + "grad_norm": 1.632804036140442, + "learning_rate": 4.964698780216512e-05, + "loss": 5.4132, + "step": 9006 + }, + { + "epoch": 0.05356718051194215, + "grad_norm": 1.6569499969482422, + "learning_rate": 4.964690957930498e-05, + "loss": 5.294, + "step": 9007 + }, + { + "epoch": 0.05357312779522314, + "grad_norm": 1.8141810894012451, + "learning_rate": 4.964683134784086e-05, + "loss": 5.3365, + "step": 9008 + }, + { + "epoch": 0.05357907507850414, + "grad_norm": 1.6555678844451904, + "learning_rate": 4.964675310777278e-05, + "loss": 5.3488, + "step": 9009 + }, + { + "epoch": 0.05358502236178514, + "grad_norm": 1.8363603353500366, + "learning_rate": 4.964667485910078e-05, + "loss": 5.3679, + "step": 9010 + }, + { + "epoch": 0.053590969645066135, + "grad_norm": 1.7839024066925049, + "learning_rate": 4.9646596601824874e-05, + "loss": 5.2514, + "step": 9011 + }, + { + "epoch": 0.05359691692834713, + "grad_norm": 1.8712091445922852, + "learning_rate": 4.96465183359451e-05, + "loss": 5.4313, + "step": 9012 + }, + { + "epoch": 0.053602864211628125, + "grad_norm": 1.9677501916885376, + "learning_rate": 4.964644006146148e-05, + "loss": 5.2442, + "step": 9013 + }, + { + "epoch": 0.05360881149490913, + "grad_norm": 1.8567090034484863, + "learning_rate": 4.964636177837404e-05, + "loss": 5.105, + "step": 9014 + }, + { + "epoch": 0.05361475877819012, + "grad_norm": 1.7319908142089844, + "learning_rate": 4.964628348668281e-05, + "loss": 5.2962, + "step": 9015 + }, + { + "epoch": 0.05362070606147112, + "grad_norm": 1.6412272453308105, + "learning_rate": 4.9646205186387824e-05, + "loss": 5.2302, + "step": 9016 + }, + { + "epoch": 0.05362665334475212, + "grad_norm": 1.9401088953018188, + "learning_rate": 4.96461268774891e-05, + "loss": 5.4425, + "step": 9017 + }, + { + "epoch": 0.053632600628033114, + "grad_norm": 1.7045506238937378, + "learning_rate": 4.964604855998666e-05, + "loss": 5.2325, + "step": 9018 + }, + { + "epoch": 0.05363854791131411, + "grad_norm": 1.8232519626617432, + "learning_rate": 4.9645970233880545e-05, + "loss": 5.5047, + "step": 9019 + }, + { + "epoch": 0.05364449519459511, + "grad_norm": 1.718833327293396, + "learning_rate": 4.964589189917077e-05, + "loss": 5.3323, + "step": 9020 + }, + { + "epoch": 0.053650442477876106, + "grad_norm": 1.608774185180664, + "learning_rate": 4.9645813555857376e-05, + "loss": 5.2374, + "step": 9021 + }, + { + "epoch": 0.0536563897611571, + "grad_norm": 1.6789363622665405, + "learning_rate": 4.964573520394039e-05, + "loss": 5.3291, + "step": 9022 + }, + { + "epoch": 0.0536623370444381, + "grad_norm": 1.6596689224243164, + "learning_rate": 4.964565684341982e-05, + "loss": 5.308, + "step": 9023 + }, + { + "epoch": 0.0536682843277191, + "grad_norm": 1.8141522407531738, + "learning_rate": 4.9645578474295703e-05, + "loss": 5.2033, + "step": 9024 + }, + { + "epoch": 0.053674231611000094, + "grad_norm": 1.428606390953064, + "learning_rate": 4.964550009656808e-05, + "loss": 5.2441, + "step": 9025 + }, + { + "epoch": 0.053680178894281096, + "grad_norm": 1.5033652782440186, + "learning_rate": 4.9645421710236965e-05, + "loss": 5.2132, + "step": 9026 + }, + { + "epoch": 0.05368612617756209, + "grad_norm": 1.7123147249221802, + "learning_rate": 4.9645343315302385e-05, + "loss": 5.3145, + "step": 9027 + }, + { + "epoch": 0.053692073460843086, + "grad_norm": 1.5851943492889404, + "learning_rate": 4.9645264911764376e-05, + "loss": 5.353, + "step": 9028 + }, + { + "epoch": 0.05369802074412408, + "grad_norm": 1.6627084016799927, + "learning_rate": 4.964518649962295e-05, + "loss": 5.1049, + "step": 9029 + }, + { + "epoch": 0.05370396802740508, + "grad_norm": 1.51585853099823, + "learning_rate": 4.964510807887815e-05, + "loss": 4.9433, + "step": 9030 + }, + { + "epoch": 0.05370991531068608, + "grad_norm": 1.7350785732269287, + "learning_rate": 4.964502964952999e-05, + "loss": 5.1761, + "step": 9031 + }, + { + "epoch": 0.05371586259396707, + "grad_norm": 1.925410509109497, + "learning_rate": 4.964495121157852e-05, + "loss": 5.0528, + "step": 9032 + }, + { + "epoch": 0.053721809877248075, + "grad_norm": 1.794162631034851, + "learning_rate": 4.964487276502374e-05, + "loss": 5.2009, + "step": 9033 + }, + { + "epoch": 0.05372775716052907, + "grad_norm": 1.6729109287261963, + "learning_rate": 4.964479430986569e-05, + "loss": 5.16, + "step": 9034 + }, + { + "epoch": 0.053733704443810065, + "grad_norm": 1.8543394804000854, + "learning_rate": 4.9644715846104406e-05, + "loss": 5.3545, + "step": 9035 + }, + { + "epoch": 0.05373965172709107, + "grad_norm": 1.6876883506774902, + "learning_rate": 4.96446373737399e-05, + "loss": 5.2074, + "step": 9036 + }, + { + "epoch": 0.05374559901037206, + "grad_norm": 1.816701054573059, + "learning_rate": 4.9644558892772205e-05, + "loss": 5.154, + "step": 9037 + }, + { + "epoch": 0.05375154629365306, + "grad_norm": 1.471283197402954, + "learning_rate": 4.964448040320135e-05, + "loss": 5.2577, + "step": 9038 + }, + { + "epoch": 0.05375749357693406, + "grad_norm": 1.5764297246932983, + "learning_rate": 4.964440190502736e-05, + "loss": 5.0115, + "step": 9039 + }, + { + "epoch": 0.053763440860215055, + "grad_norm": 1.6854795217514038, + "learning_rate": 4.964432339825027e-05, + "loss": 5.1957, + "step": 9040 + }, + { + "epoch": 0.05376938814349605, + "grad_norm": 1.889570951461792, + "learning_rate": 4.964424488287009e-05, + "loss": 5.1229, + "step": 9041 + }, + { + "epoch": 0.05377533542677705, + "grad_norm": 1.7528218030929565, + "learning_rate": 4.964416635888687e-05, + "loss": 5.0002, + "step": 9042 + }, + { + "epoch": 0.05378128271005805, + "grad_norm": 1.68081796169281, + "learning_rate": 4.964408782630062e-05, + "loss": 5.0567, + "step": 9043 + }, + { + "epoch": 0.05378722999333904, + "grad_norm": 1.6083979606628418, + "learning_rate": 4.9644009285111384e-05, + "loss": 5.0775, + "step": 9044 + }, + { + "epoch": 0.05379317727662004, + "grad_norm": 1.676720380783081, + "learning_rate": 4.9643930735319164e-05, + "loss": 5.0446, + "step": 9045 + }, + { + "epoch": 0.05379912455990104, + "grad_norm": 1.6502453088760376, + "learning_rate": 4.964385217692401e-05, + "loss": 5.3751, + "step": 9046 + }, + { + "epoch": 0.053805071843182034, + "grad_norm": 1.9226343631744385, + "learning_rate": 4.9643773609925935e-05, + "loss": 5.2442, + "step": 9047 + }, + { + "epoch": 0.05381101912646303, + "grad_norm": 1.8054014444351196, + "learning_rate": 4.964369503432498e-05, + "loss": 5.4844, + "step": 9048 + }, + { + "epoch": 0.05381696640974403, + "grad_norm": 1.5151008367538452, + "learning_rate": 4.9643616450121166e-05, + "loss": 5.2834, + "step": 9049 + }, + { + "epoch": 0.053822913693025026, + "grad_norm": 2.0237820148468018, + "learning_rate": 4.964353785731452e-05, + "loss": 5.3166, + "step": 9050 + }, + { + "epoch": 0.05382886097630602, + "grad_norm": 2.145364999771118, + "learning_rate": 4.964345925590507e-05, + "loss": 5.3803, + "step": 9051 + }, + { + "epoch": 0.05383480825958702, + "grad_norm": 1.747369408607483, + "learning_rate": 4.964338064589284e-05, + "loss": 6.1041, + "step": 9052 + }, + { + "epoch": 0.05384075554286802, + "grad_norm": 1.9964301586151123, + "learning_rate": 4.964330202727786e-05, + "loss": 5.1707, + "step": 9053 + }, + { + "epoch": 0.053846702826149014, + "grad_norm": 1.630233645439148, + "learning_rate": 4.9643223400060155e-05, + "loss": 4.9385, + "step": 9054 + }, + { + "epoch": 0.053852650109430016, + "grad_norm": 1.5782960653305054, + "learning_rate": 4.9643144764239765e-05, + "loss": 4.9953, + "step": 9055 + }, + { + "epoch": 0.05385859739271101, + "grad_norm": 2.1511783599853516, + "learning_rate": 4.9643066119816706e-05, + "loss": 5.4329, + "step": 9056 + }, + { + "epoch": 0.053864544675992006, + "grad_norm": 2.2133493423461914, + "learning_rate": 4.9642987466791004e-05, + "loss": 5.7347, + "step": 9057 + }, + { + "epoch": 0.053870491959273, + "grad_norm": 1.7669782638549805, + "learning_rate": 4.9642908805162686e-05, + "loss": 5.4129, + "step": 9058 + }, + { + "epoch": 0.053876439242554, + "grad_norm": 1.8005794286727905, + "learning_rate": 4.9642830134931787e-05, + "loss": 5.2397, + "step": 9059 + }, + { + "epoch": 0.053882386525835, + "grad_norm": 1.697607398033142, + "learning_rate": 4.9642751456098325e-05, + "loss": 5.3388, + "step": 9060 + }, + { + "epoch": 0.05388833380911599, + "grad_norm": 1.4916869401931763, + "learning_rate": 4.9642672768662344e-05, + "loss": 5.2574, + "step": 9061 + }, + { + "epoch": 0.053894281092396995, + "grad_norm": 1.7112784385681152, + "learning_rate": 4.964259407262385e-05, + "loss": 4.9881, + "step": 9062 + }, + { + "epoch": 0.05390022837567799, + "grad_norm": 1.4831846952438354, + "learning_rate": 4.964251536798289e-05, + "loss": 5.3976, + "step": 9063 + }, + { + "epoch": 0.053906175658958985, + "grad_norm": 1.626370906829834, + "learning_rate": 4.9642436654739476e-05, + "loss": 5.2409, + "step": 9064 + }, + { + "epoch": 0.05391212294223999, + "grad_norm": 1.7369413375854492, + "learning_rate": 4.964235793289365e-05, + "loss": 5.2732, + "step": 9065 + }, + { + "epoch": 0.05391807022552098, + "grad_norm": 1.7028629779815674, + "learning_rate": 4.964227920244542e-05, + "loss": 5.3161, + "step": 9066 + }, + { + "epoch": 0.05392401750880198, + "grad_norm": 1.9031678438186646, + "learning_rate": 4.964220046339483e-05, + "loss": 5.2517, + "step": 9067 + }, + { + "epoch": 0.05392996479208298, + "grad_norm": 1.8210735321044922, + "learning_rate": 4.96421217157419e-05, + "loss": 5.2819, + "step": 9068 + }, + { + "epoch": 0.053935912075363975, + "grad_norm": 1.7334645986557007, + "learning_rate": 4.9642042959486666e-05, + "loss": 5.4296, + "step": 9069 + }, + { + "epoch": 0.05394185935864497, + "grad_norm": 1.732790231704712, + "learning_rate": 4.964196419462914e-05, + "loss": 5.3589, + "step": 9070 + }, + { + "epoch": 0.05394780664192597, + "grad_norm": 1.417751669883728, + "learning_rate": 4.964188542116937e-05, + "loss": 5.0958, + "step": 9071 + }, + { + "epoch": 0.05395375392520697, + "grad_norm": 1.8562361001968384, + "learning_rate": 4.964180663910737e-05, + "loss": 5.2622, + "step": 9072 + }, + { + "epoch": 0.05395970120848796, + "grad_norm": 1.7366154193878174, + "learning_rate": 4.9641727848443166e-05, + "loss": 5.2329, + "step": 9073 + }, + { + "epoch": 0.05396564849176896, + "grad_norm": 1.8587182760238647, + "learning_rate": 4.9641649049176785e-05, + "loss": 4.9392, + "step": 9074 + }, + { + "epoch": 0.05397159577504996, + "grad_norm": 1.6152398586273193, + "learning_rate": 4.964157024130827e-05, + "loss": 5.473, + "step": 9075 + }, + { + "epoch": 0.053977543058330954, + "grad_norm": 1.5967273712158203, + "learning_rate": 4.9641491424837626e-05, + "loss": 5.2877, + "step": 9076 + }, + { + "epoch": 0.05398349034161195, + "grad_norm": 1.4986391067504883, + "learning_rate": 4.96414125997649e-05, + "loss": 5.2163, + "step": 9077 + }, + { + "epoch": 0.05398943762489295, + "grad_norm": 1.563905119895935, + "learning_rate": 4.964133376609011e-05, + "loss": 5.2043, + "step": 9078 + }, + { + "epoch": 0.053995384908173946, + "grad_norm": 1.5690317153930664, + "learning_rate": 4.964125492381329e-05, + "loss": 5.2226, + "step": 9079 + }, + { + "epoch": 0.05400133219145494, + "grad_norm": 1.7732517719268799, + "learning_rate": 4.9641176072934446e-05, + "loss": 5.3123, + "step": 9080 + }, + { + "epoch": 0.05400727947473594, + "grad_norm": 1.7045226097106934, + "learning_rate": 4.964109721345364e-05, + "loss": 5.0872, + "step": 9081 + }, + { + "epoch": 0.05401322675801694, + "grad_norm": 1.6405664682388306, + "learning_rate": 4.964101834537087e-05, + "loss": 5.3863, + "step": 9082 + }, + { + "epoch": 0.054019174041297933, + "grad_norm": 1.7410979270935059, + "learning_rate": 4.964093946868618e-05, + "loss": 5.0952, + "step": 9083 + }, + { + "epoch": 0.054025121324578936, + "grad_norm": 2.0102951526641846, + "learning_rate": 4.964086058339959e-05, + "loss": 4.9484, + "step": 9084 + }, + { + "epoch": 0.05403106860785993, + "grad_norm": 1.8228510618209839, + "learning_rate": 4.9640781689511133e-05, + "loss": 5.1141, + "step": 9085 + }, + { + "epoch": 0.054037015891140926, + "grad_norm": 1.7363582849502563, + "learning_rate": 4.964070278702083e-05, + "loss": 5.1164, + "step": 9086 + }, + { + "epoch": 0.05404296317442192, + "grad_norm": 1.6060153245925903, + "learning_rate": 4.9640623875928714e-05, + "loss": 5.1746, + "step": 9087 + }, + { + "epoch": 0.05404891045770292, + "grad_norm": 1.6690374612808228, + "learning_rate": 4.9640544956234814e-05, + "loss": 5.0931, + "step": 9088 + }, + { + "epoch": 0.05405485774098392, + "grad_norm": 1.613527774810791, + "learning_rate": 4.964046602793916e-05, + "loss": 5.2224, + "step": 9089 + }, + { + "epoch": 0.05406080502426491, + "grad_norm": 1.6461642980575562, + "learning_rate": 4.964038709104176e-05, + "loss": 5.3175, + "step": 9090 + }, + { + "epoch": 0.054066752307545915, + "grad_norm": 1.839709758758545, + "learning_rate": 4.9640308145542664e-05, + "loss": 5.3247, + "step": 9091 + }, + { + "epoch": 0.05407269959082691, + "grad_norm": 1.8977348804473877, + "learning_rate": 4.9640229191441886e-05, + "loss": 5.4256, + "step": 9092 + }, + { + "epoch": 0.054078646874107905, + "grad_norm": 1.9805532693862915, + "learning_rate": 4.9640150228739454e-05, + "loss": 4.9413, + "step": 9093 + }, + { + "epoch": 0.05408459415738891, + "grad_norm": 2.0237114429473877, + "learning_rate": 4.964007125743542e-05, + "loss": 4.8808, + "step": 9094 + }, + { + "epoch": 0.0540905414406699, + "grad_norm": 1.9848511219024658, + "learning_rate": 4.963999227752977e-05, + "loss": 5.0295, + "step": 9095 + }, + { + "epoch": 0.0540964887239509, + "grad_norm": 1.925876498222351, + "learning_rate": 4.9639913289022564e-05, + "loss": 5.0129, + "step": 9096 + }, + { + "epoch": 0.0541024360072319, + "grad_norm": 1.4887725114822388, + "learning_rate": 4.963983429191382e-05, + "loss": 4.9706, + "step": 9097 + }, + { + "epoch": 0.054108383290512894, + "grad_norm": 1.615160584449768, + "learning_rate": 4.963975528620356e-05, + "loss": 5.0066, + "step": 9098 + }, + { + "epoch": 0.05411433057379389, + "grad_norm": 1.969086766242981, + "learning_rate": 4.9639676271891816e-05, + "loss": 4.9539, + "step": 9099 + }, + { + "epoch": 0.05412027785707489, + "grad_norm": 1.8290555477142334, + "learning_rate": 4.963959724897862e-05, + "loss": 5.2467, + "step": 9100 + }, + { + "epoch": 0.05412622514035589, + "grad_norm": 2.004157066345215, + "learning_rate": 4.963951821746399e-05, + "loss": 4.8, + "step": 9101 + }, + { + "epoch": 0.05413217242363688, + "grad_norm": 1.9732778072357178, + "learning_rate": 4.9639439177347955e-05, + "loss": 4.8828, + "step": 9102 + }, + { + "epoch": 0.05413811970691788, + "grad_norm": 1.8653557300567627, + "learning_rate": 4.963936012863056e-05, + "loss": 5.0591, + "step": 9103 + }, + { + "epoch": 0.05414406699019888, + "grad_norm": 1.7854375839233398, + "learning_rate": 4.9639281071311804e-05, + "loss": 5.0914, + "step": 9104 + }, + { + "epoch": 0.054150014273479874, + "grad_norm": 1.7956377267837524, + "learning_rate": 4.963920200539174e-05, + "loss": 5.3484, + "step": 9105 + }, + { + "epoch": 0.05415596155676087, + "grad_norm": 1.7851346731185913, + "learning_rate": 4.963912293087039e-05, + "loss": 5.3146, + "step": 9106 + }, + { + "epoch": 0.05416190884004187, + "grad_norm": 1.72859787940979, + "learning_rate": 4.9639043847747756e-05, + "loss": 5.1611, + "step": 9107 + }, + { + "epoch": 0.054167856123322866, + "grad_norm": 1.5961265563964844, + "learning_rate": 4.9638964756023904e-05, + "loss": 5.247, + "step": 9108 + }, + { + "epoch": 0.05417380340660386, + "grad_norm": 1.7507922649383545, + "learning_rate": 4.963888565569884e-05, + "loss": 5.2011, + "step": 9109 + }, + { + "epoch": 0.05417975068988486, + "grad_norm": 1.8338440656661987, + "learning_rate": 4.9638806546772594e-05, + "loss": 5.2413, + "step": 9110 + }, + { + "epoch": 0.05418569797316586, + "grad_norm": 1.8935306072235107, + "learning_rate": 4.963872742924519e-05, + "loss": 5.1042, + "step": 9111 + }, + { + "epoch": 0.05419164525644685, + "grad_norm": 1.6512808799743652, + "learning_rate": 4.963864830311667e-05, + "loss": 5.2437, + "step": 9112 + }, + { + "epoch": 0.054197592539727855, + "grad_norm": 1.6099332571029663, + "learning_rate": 4.963856916838705e-05, + "loss": 5.2828, + "step": 9113 + }, + { + "epoch": 0.05420353982300885, + "grad_norm": 2.114581823348999, + "learning_rate": 4.9638490025056355e-05, + "loss": 6.1534, + "step": 9114 + }, + { + "epoch": 0.054209487106289846, + "grad_norm": 1.762335181236267, + "learning_rate": 4.963841087312462e-05, + "loss": 5.1504, + "step": 9115 + }, + { + "epoch": 0.05421543438957084, + "grad_norm": 1.7669222354888916, + "learning_rate": 4.963833171259187e-05, + "loss": 5.0365, + "step": 9116 + }, + { + "epoch": 0.05422138167285184, + "grad_norm": 1.7319819927215576, + "learning_rate": 4.963825254345814e-05, + "loss": 5.0724, + "step": 9117 + }, + { + "epoch": 0.05422732895613284, + "grad_norm": 1.618116021156311, + "learning_rate": 4.9638173365723444e-05, + "loss": 5.0964, + "step": 9118 + }, + { + "epoch": 0.05423327623941383, + "grad_norm": 1.6506006717681885, + "learning_rate": 4.9638094179387814e-05, + "loss": 5.1189, + "step": 9119 + }, + { + "epoch": 0.054239223522694835, + "grad_norm": 1.7512328624725342, + "learning_rate": 4.963801498445129e-05, + "loss": 5.2732, + "step": 9120 + }, + { + "epoch": 0.05424517080597583, + "grad_norm": 1.5639985799789429, + "learning_rate": 4.963793578091388e-05, + "loss": 5.0718, + "step": 9121 + }, + { + "epoch": 0.054251118089256825, + "grad_norm": 1.7059093713760376, + "learning_rate": 4.963785656877562e-05, + "loss": 5.0744, + "step": 9122 + }, + { + "epoch": 0.05425706537253783, + "grad_norm": 1.574802279472351, + "learning_rate": 4.9637777348036546e-05, + "loss": 5.2663, + "step": 9123 + }, + { + "epoch": 0.05426301265581882, + "grad_norm": 1.7343204021453857, + "learning_rate": 4.9637698118696674e-05, + "loss": 5.0805, + "step": 9124 + }, + { + "epoch": 0.05426895993909982, + "grad_norm": 1.6154165267944336, + "learning_rate": 4.963761888075604e-05, + "loss": 5.1402, + "step": 9125 + }, + { + "epoch": 0.05427490722238082, + "grad_norm": 1.6474148035049438, + "learning_rate": 4.9637539634214666e-05, + "loss": 5.0601, + "step": 9126 + }, + { + "epoch": 0.054280854505661814, + "grad_norm": 1.7573519945144653, + "learning_rate": 4.963746037907258e-05, + "loss": 5.1846, + "step": 9127 + }, + { + "epoch": 0.05428680178894281, + "grad_norm": 1.4558652639389038, + "learning_rate": 4.963738111532981e-05, + "loss": 5.3132, + "step": 9128 + }, + { + "epoch": 0.05429274907222381, + "grad_norm": 1.6261000633239746, + "learning_rate": 4.963730184298639e-05, + "loss": 5.2843, + "step": 9129 + }, + { + "epoch": 0.05429869635550481, + "grad_norm": 1.4502191543579102, + "learning_rate": 4.963722256204234e-05, + "loss": 5.14, + "step": 9130 + }, + { + "epoch": 0.0543046436387858, + "grad_norm": 1.6366747617721558, + "learning_rate": 4.9637143272497686e-05, + "loss": 5.1496, + "step": 9131 + }, + { + "epoch": 0.0543105909220668, + "grad_norm": 1.603745698928833, + "learning_rate": 4.963706397435246e-05, + "loss": 5.0644, + "step": 9132 + }, + { + "epoch": 0.0543165382053478, + "grad_norm": 1.419536828994751, + "learning_rate": 4.963698466760669e-05, + "loss": 5.3182, + "step": 9133 + }, + { + "epoch": 0.054322485488628794, + "grad_norm": 1.511765480041504, + "learning_rate": 4.963690535226041e-05, + "loss": 5.2808, + "step": 9134 + }, + { + "epoch": 0.05432843277190979, + "grad_norm": 1.4999688863754272, + "learning_rate": 4.963682602831364e-05, + "loss": 4.9235, + "step": 9135 + }, + { + "epoch": 0.05433438005519079, + "grad_norm": 1.5918420553207397, + "learning_rate": 4.96367466957664e-05, + "loss": 4.9293, + "step": 9136 + }, + { + "epoch": 0.054340327338471786, + "grad_norm": 1.502748727798462, + "learning_rate": 4.963666735461874e-05, + "loss": 5.2692, + "step": 9137 + }, + { + "epoch": 0.05434627462175278, + "grad_norm": 1.6474169492721558, + "learning_rate": 4.963658800487066e-05, + "loss": 5.1638, + "step": 9138 + }, + { + "epoch": 0.05435222190503378, + "grad_norm": 2.0195884704589844, + "learning_rate": 4.9636508646522204e-05, + "loss": 5.1085, + "step": 9139 + }, + { + "epoch": 0.05435816918831478, + "grad_norm": 1.7266180515289307, + "learning_rate": 4.9636429279573406e-05, + "loss": 5.0747, + "step": 9140 + }, + { + "epoch": 0.05436411647159577, + "grad_norm": 1.6965065002441406, + "learning_rate": 4.963634990402428e-05, + "loss": 5.1246, + "step": 9141 + }, + { + "epoch": 0.054370063754876775, + "grad_norm": 1.7629759311676025, + "learning_rate": 4.9636270519874856e-05, + "loss": 5.274, + "step": 9142 + }, + { + "epoch": 0.05437601103815777, + "grad_norm": 1.6365042924880981, + "learning_rate": 4.9636191127125164e-05, + "loss": 5.2469, + "step": 9143 + }, + { + "epoch": 0.054381958321438766, + "grad_norm": 1.6777831315994263, + "learning_rate": 4.9636111725775235e-05, + "loss": 5.3041, + "step": 9144 + }, + { + "epoch": 0.05438790560471976, + "grad_norm": 1.5354039669036865, + "learning_rate": 4.9636032315825096e-05, + "loss": 5.1799, + "step": 9145 + }, + { + "epoch": 0.05439385288800076, + "grad_norm": 1.508083701133728, + "learning_rate": 4.9635952897274773e-05, + "loss": 5.0822, + "step": 9146 + }, + { + "epoch": 0.05439980017128176, + "grad_norm": 1.5960441827774048, + "learning_rate": 4.963587347012429e-05, + "loss": 5.1618, + "step": 9147 + }, + { + "epoch": 0.05440574745456275, + "grad_norm": 1.4927520751953125, + "learning_rate": 4.9635794034373675e-05, + "loss": 5.1464, + "step": 9148 + }, + { + "epoch": 0.054411694737843755, + "grad_norm": 1.7420401573181152, + "learning_rate": 4.9635714590022966e-05, + "loss": 5.2866, + "step": 9149 + }, + { + "epoch": 0.05441764202112475, + "grad_norm": 1.7907800674438477, + "learning_rate": 4.9635635137072176e-05, + "loss": 5.1042, + "step": 9150 + }, + { + "epoch": 0.054423589304405745, + "grad_norm": 1.7073547840118408, + "learning_rate": 4.963555567552135e-05, + "loss": 5.1986, + "step": 9151 + }, + { + "epoch": 0.05442953658768675, + "grad_norm": 1.894405484199524, + "learning_rate": 4.96354762053705e-05, + "loss": 5.225, + "step": 9152 + }, + { + "epoch": 0.05443548387096774, + "grad_norm": 1.5830878019332886, + "learning_rate": 4.9635396726619656e-05, + "loss": 5.2902, + "step": 9153 + }, + { + "epoch": 0.05444143115424874, + "grad_norm": 1.5435214042663574, + "learning_rate": 4.963531723926885e-05, + "loss": 5.0773, + "step": 9154 + }, + { + "epoch": 0.05444737843752974, + "grad_norm": 1.4262596368789673, + "learning_rate": 4.9635237743318117e-05, + "loss": 5.129, + "step": 9155 + }, + { + "epoch": 0.054453325720810734, + "grad_norm": 1.5793390274047852, + "learning_rate": 4.9635158238767475e-05, + "loss": 5.1693, + "step": 9156 + }, + { + "epoch": 0.05445927300409173, + "grad_norm": 1.767318606376648, + "learning_rate": 4.963507872561695e-05, + "loss": 5.2541, + "step": 9157 + }, + { + "epoch": 0.05446522028737273, + "grad_norm": 1.5084065198898315, + "learning_rate": 4.963499920386658e-05, + "loss": 5.2531, + "step": 9158 + }, + { + "epoch": 0.05447116757065373, + "grad_norm": 1.797877311706543, + "learning_rate": 4.963491967351638e-05, + "loss": 5.2278, + "step": 9159 + }, + { + "epoch": 0.05447711485393472, + "grad_norm": 1.7463361024856567, + "learning_rate": 4.963484013456639e-05, + "loss": 5.1005, + "step": 9160 + }, + { + "epoch": 0.05448306213721572, + "grad_norm": 1.8208277225494385, + "learning_rate": 4.9634760587016626e-05, + "loss": 5.1437, + "step": 9161 + }, + { + "epoch": 0.05448900942049672, + "grad_norm": 1.9020015001296997, + "learning_rate": 4.9634681030867116e-05, + "loss": 5.1554, + "step": 9162 + }, + { + "epoch": 0.054494956703777714, + "grad_norm": 1.8370200395584106, + "learning_rate": 4.9634601466117904e-05, + "loss": 5.2418, + "step": 9163 + }, + { + "epoch": 0.05450090398705871, + "grad_norm": 1.785875678062439, + "learning_rate": 4.9634521892769004e-05, + "loss": 5.1916, + "step": 9164 + }, + { + "epoch": 0.05450685127033971, + "grad_norm": 1.7501643896102905, + "learning_rate": 4.963444231082045e-05, + "loss": 5.0887, + "step": 9165 + }, + { + "epoch": 0.054512798553620706, + "grad_norm": 1.6924220323562622, + "learning_rate": 4.963436272027227e-05, + "loss": 5.2458, + "step": 9166 + }, + { + "epoch": 0.0545187458369017, + "grad_norm": 1.895605206489563, + "learning_rate": 4.963428312112447e-05, + "loss": 5.1286, + "step": 9167 + }, + { + "epoch": 0.0545246931201827, + "grad_norm": 1.842207908630371, + "learning_rate": 4.963420351337711e-05, + "loss": 5.1177, + "step": 9168 + }, + { + "epoch": 0.0545306404034637, + "grad_norm": 1.7467048168182373, + "learning_rate": 4.963412389703021e-05, + "loss": 5.1616, + "step": 9169 + }, + { + "epoch": 0.05453658768674469, + "grad_norm": 1.8047499656677246, + "learning_rate": 4.963404427208378e-05, + "loss": 5.0543, + "step": 9170 + }, + { + "epoch": 0.054542534970025695, + "grad_norm": 1.5830637216567993, + "learning_rate": 4.963396463853786e-05, + "loss": 5.0989, + "step": 9171 + }, + { + "epoch": 0.05454848225330669, + "grad_norm": 1.7481937408447266, + "learning_rate": 4.9633884996392485e-05, + "loss": 5.1686, + "step": 9172 + }, + { + "epoch": 0.054554429536587686, + "grad_norm": 1.7132925987243652, + "learning_rate": 4.9633805345647664e-05, + "loss": 4.9683, + "step": 9173 + }, + { + "epoch": 0.05456037681986868, + "grad_norm": 1.8369117975234985, + "learning_rate": 4.9633725686303445e-05, + "loss": 5.154, + "step": 9174 + }, + { + "epoch": 0.05456632410314968, + "grad_norm": 1.615011215209961, + "learning_rate": 4.963364601835985e-05, + "loss": 5.0982, + "step": 9175 + }, + { + "epoch": 0.05457227138643068, + "grad_norm": 1.853742003440857, + "learning_rate": 4.963356634181689e-05, + "loss": 6.0599, + "step": 9176 + }, + { + "epoch": 0.05457821866971167, + "grad_norm": 1.5529752969741821, + "learning_rate": 4.963348665667462e-05, + "loss": 5.1355, + "step": 9177 + }, + { + "epoch": 0.054584165952992675, + "grad_norm": 1.5113881826400757, + "learning_rate": 4.963340696293305e-05, + "loss": 5.1947, + "step": 9178 + }, + { + "epoch": 0.05459011323627367, + "grad_norm": 1.6840931177139282, + "learning_rate": 4.963332726059221e-05, + "loss": 5.2163, + "step": 9179 + }, + { + "epoch": 0.054596060519554665, + "grad_norm": 1.7720422744750977, + "learning_rate": 4.963324754965214e-05, + "loss": 5.4737, + "step": 9180 + }, + { + "epoch": 0.05460200780283567, + "grad_norm": 1.632574200630188, + "learning_rate": 4.963316783011285e-05, + "loss": 5.2274, + "step": 9181 + }, + { + "epoch": 0.05460795508611666, + "grad_norm": 1.5859557390213013, + "learning_rate": 4.963308810197437e-05, + "loss": 5.3503, + "step": 9182 + }, + { + "epoch": 0.05461390236939766, + "grad_norm": 1.8342604637145996, + "learning_rate": 4.963300836523674e-05, + "loss": 5.1967, + "step": 9183 + }, + { + "epoch": 0.05461984965267866, + "grad_norm": 1.7443957328796387, + "learning_rate": 4.963292861989998e-05, + "loss": 5.0935, + "step": 9184 + }, + { + "epoch": 0.054625796935959654, + "grad_norm": 1.9289584159851074, + "learning_rate": 4.963284886596412e-05, + "loss": 5.1817, + "step": 9185 + }, + { + "epoch": 0.05463174421924065, + "grad_norm": 1.8695822954177856, + "learning_rate": 4.9632769103429186e-05, + "loss": 5.4304, + "step": 9186 + }, + { + "epoch": 0.05463769150252165, + "grad_norm": 1.6274856328964233, + "learning_rate": 4.9632689332295206e-05, + "loss": 5.3924, + "step": 9187 + }, + { + "epoch": 0.054643638785802646, + "grad_norm": 1.6061500310897827, + "learning_rate": 4.963260955256221e-05, + "loss": 5.2309, + "step": 9188 + }, + { + "epoch": 0.05464958606908364, + "grad_norm": 1.5478893518447876, + "learning_rate": 4.963252976423022e-05, + "loss": 5.2615, + "step": 9189 + }, + { + "epoch": 0.05465553335236464, + "grad_norm": 1.4304052591323853, + "learning_rate": 4.9632449967299276e-05, + "loss": 5.2116, + "step": 9190 + }, + { + "epoch": 0.05466148063564564, + "grad_norm": 1.5438693761825562, + "learning_rate": 4.9632370161769395e-05, + "loss": 5.1176, + "step": 9191 + }, + { + "epoch": 0.054667427918926634, + "grad_norm": 1.6602065563201904, + "learning_rate": 4.9632290347640606e-05, + "loss": 5.1521, + "step": 9192 + }, + { + "epoch": 0.05467337520220763, + "grad_norm": 1.530038595199585, + "learning_rate": 4.9632210524912934e-05, + "loss": 5.1437, + "step": 9193 + }, + { + "epoch": 0.05467932248548863, + "grad_norm": 1.617691159248352, + "learning_rate": 4.963213069358643e-05, + "loss": 5.0601, + "step": 9194 + }, + { + "epoch": 0.054685269768769626, + "grad_norm": 1.722401738166809, + "learning_rate": 4.963205085366108e-05, + "loss": 5.2664, + "step": 9195 + }, + { + "epoch": 0.05469121705205062, + "grad_norm": 1.803673267364502, + "learning_rate": 4.963197100513696e-05, + "loss": 5.4164, + "step": 9196 + }, + { + "epoch": 0.05469716433533162, + "grad_norm": 1.8565739393234253, + "learning_rate": 4.963189114801405e-05, + "loss": 5.225, + "step": 9197 + }, + { + "epoch": 0.05470311161861262, + "grad_norm": 1.780698299407959, + "learning_rate": 4.963181128229242e-05, + "loss": 5.1694, + "step": 9198 + }, + { + "epoch": 0.05470905890189361, + "grad_norm": 1.820416808128357, + "learning_rate": 4.963173140797207e-05, + "loss": 5.3305, + "step": 9199 + }, + { + "epoch": 0.054715006185174615, + "grad_norm": 1.471983551979065, + "learning_rate": 4.963165152505304e-05, + "loss": 5.3217, + "step": 9200 + }, + { + "epoch": 0.05472095346845561, + "grad_norm": 1.504616141319275, + "learning_rate": 4.9631571633535354e-05, + "loss": 5.3349, + "step": 9201 + }, + { + "epoch": 0.054726900751736605, + "grad_norm": 1.5888862609863281, + "learning_rate": 4.963149173341903e-05, + "loss": 5.3431, + "step": 9202 + }, + { + "epoch": 0.0547328480350176, + "grad_norm": 1.6633155345916748, + "learning_rate": 4.963141182470412e-05, + "loss": 5.2678, + "step": 9203 + }, + { + "epoch": 0.0547387953182986, + "grad_norm": 1.7259690761566162, + "learning_rate": 4.9631331907390636e-05, + "loss": 5.348, + "step": 9204 + }, + { + "epoch": 0.0547447426015796, + "grad_norm": 1.703925371170044, + "learning_rate": 4.963125198147861e-05, + "loss": 5.4123, + "step": 9205 + }, + { + "epoch": 0.05475068988486059, + "grad_norm": 1.6619760990142822, + "learning_rate": 4.963117204696807e-05, + "loss": 5.1732, + "step": 9206 + }, + { + "epoch": 0.054756637168141595, + "grad_norm": 1.7368190288543701, + "learning_rate": 4.963109210385903e-05, + "loss": 5.0843, + "step": 9207 + }, + { + "epoch": 0.05476258445142259, + "grad_norm": 1.781179666519165, + "learning_rate": 4.9631012152151545e-05, + "loss": 5.1343, + "step": 9208 + }, + { + "epoch": 0.054768531734703585, + "grad_norm": 1.674793004989624, + "learning_rate": 4.9630932191845624e-05, + "loss": 5.4079, + "step": 9209 + }, + { + "epoch": 0.05477447901798459, + "grad_norm": 1.7708344459533691, + "learning_rate": 4.9630852222941296e-05, + "loss": 4.9702, + "step": 9210 + }, + { + "epoch": 0.05478042630126558, + "grad_norm": 1.684725046157837, + "learning_rate": 4.9630772245438594e-05, + "loss": 5.263, + "step": 9211 + }, + { + "epoch": 0.05478637358454658, + "grad_norm": 1.6064784526824951, + "learning_rate": 4.963069225933754e-05, + "loss": 5.3402, + "step": 9212 + }, + { + "epoch": 0.05479232086782758, + "grad_norm": 1.5189318656921387, + "learning_rate": 4.963061226463816e-05, + "loss": 5.1928, + "step": 9213 + }, + { + "epoch": 0.054798268151108574, + "grad_norm": 1.8095827102661133, + "learning_rate": 4.96305322613405e-05, + "loss": 5.262, + "step": 9214 + }, + { + "epoch": 0.05480421543438957, + "grad_norm": 1.8325434923171997, + "learning_rate": 4.963045224944458e-05, + "loss": 5.4975, + "step": 9215 + }, + { + "epoch": 0.05481016271767057, + "grad_norm": 1.6597868204116821, + "learning_rate": 4.963037222895042e-05, + "loss": 5.6232, + "step": 9216 + }, + { + "epoch": 0.054816110000951566, + "grad_norm": 1.6402417421340942, + "learning_rate": 4.9630292199858044e-05, + "loss": 5.5358, + "step": 9217 + }, + { + "epoch": 0.05482205728423256, + "grad_norm": 1.3956371545791626, + "learning_rate": 4.963021216216749e-05, + "loss": 5.2563, + "step": 9218 + }, + { + "epoch": 0.05482800456751356, + "grad_norm": 1.5958374738693237, + "learning_rate": 4.963013211587878e-05, + "loss": 5.1539, + "step": 9219 + }, + { + "epoch": 0.05483395185079456, + "grad_norm": 1.6152080297470093, + "learning_rate": 4.963005206099195e-05, + "loss": 5.4025, + "step": 9220 + }, + { + "epoch": 0.054839899134075554, + "grad_norm": 1.392427921295166, + "learning_rate": 4.962997199750702e-05, + "loss": 5.4149, + "step": 9221 + }, + { + "epoch": 0.05484584641735655, + "grad_norm": 1.5625338554382324, + "learning_rate": 4.962989192542403e-05, + "loss": 5.5837, + "step": 9222 + }, + { + "epoch": 0.05485179370063755, + "grad_norm": 1.6465163230895996, + "learning_rate": 4.962981184474299e-05, + "loss": 5.2934, + "step": 9223 + }, + { + "epoch": 0.054857740983918546, + "grad_norm": 1.5344611406326294, + "learning_rate": 4.962973175546394e-05, + "loss": 5.4734, + "step": 9224 + }, + { + "epoch": 0.05486368826719954, + "grad_norm": 1.2378648519515991, + "learning_rate": 4.962965165758691e-05, + "loss": 5.3368, + "step": 9225 + }, + { + "epoch": 0.05486963555048054, + "grad_norm": 1.396785020828247, + "learning_rate": 4.9629571551111915e-05, + "loss": 5.3163, + "step": 9226 + }, + { + "epoch": 0.05487558283376154, + "grad_norm": 1.639452338218689, + "learning_rate": 4.9629491436038994e-05, + "loss": 5.3933, + "step": 9227 + }, + { + "epoch": 0.05488153011704253, + "grad_norm": 1.5648834705352783, + "learning_rate": 4.9629411312368166e-05, + "loss": 5.3717, + "step": 9228 + }, + { + "epoch": 0.054887477400323535, + "grad_norm": 1.4774922132492065, + "learning_rate": 4.962933118009947e-05, + "loss": 5.1318, + "step": 9229 + }, + { + "epoch": 0.05489342468360453, + "grad_norm": 1.4987083673477173, + "learning_rate": 4.9629251039232935e-05, + "loss": 5.1436, + "step": 9230 + }, + { + "epoch": 0.054899371966885525, + "grad_norm": 1.660605788230896, + "learning_rate": 4.9629170889768586e-05, + "loss": 5.1841, + "step": 9231 + }, + { + "epoch": 0.05490531925016652, + "grad_norm": 1.4441273212432861, + "learning_rate": 4.962909073170643e-05, + "loss": 5.3108, + "step": 9232 + }, + { + "epoch": 0.05491126653344752, + "grad_norm": 1.3297922611236572, + "learning_rate": 4.962901056504653e-05, + "loss": 5.1441, + "step": 9233 + }, + { + "epoch": 0.05491721381672852, + "grad_norm": 1.2989814281463623, + "learning_rate": 4.9628930389788886e-05, + "loss": 5.5146, + "step": 9234 + }, + { + "epoch": 0.05492316110000951, + "grad_norm": 1.350948452949524, + "learning_rate": 4.962885020593354e-05, + "loss": 5.2832, + "step": 9235 + }, + { + "epoch": 0.054929108383290515, + "grad_norm": 1.5801438093185425, + "learning_rate": 4.962877001348052e-05, + "loss": 5.4251, + "step": 9236 + }, + { + "epoch": 0.05493505566657151, + "grad_norm": 1.4355653524398804, + "learning_rate": 4.9628689812429854e-05, + "loss": 5.4092, + "step": 9237 + }, + { + "epoch": 0.054941002949852505, + "grad_norm": 1.692746639251709, + "learning_rate": 4.962860960278156e-05, + "loss": 5.3858, + "step": 9238 + }, + { + "epoch": 0.05494695023313351, + "grad_norm": 1.5125641822814941, + "learning_rate": 4.962852938453567e-05, + "loss": 5.6584, + "step": 9239 + }, + { + "epoch": 0.0549528975164145, + "grad_norm": 1.4158848524093628, + "learning_rate": 4.962844915769221e-05, + "loss": 5.652, + "step": 9240 + }, + { + "epoch": 0.0549588447996955, + "grad_norm": 1.314286231994629, + "learning_rate": 4.9628368922251235e-05, + "loss": 5.501, + "step": 9241 + }, + { + "epoch": 0.0549647920829765, + "grad_norm": 1.4003247022628784, + "learning_rate": 4.962828867821273e-05, + "loss": 5.448, + "step": 9242 + }, + { + "epoch": 0.054970739366257494, + "grad_norm": 1.7670220136642456, + "learning_rate": 4.962820842557675e-05, + "loss": 5.4854, + "step": 9243 + }, + { + "epoch": 0.05497668664953849, + "grad_norm": 1.9435075521469116, + "learning_rate": 4.962812816434332e-05, + "loss": 5.3824, + "step": 9244 + }, + { + "epoch": 0.05498263393281949, + "grad_norm": 2.1733458042144775, + "learning_rate": 4.9628047894512466e-05, + "loss": 5.6771, + "step": 9245 + }, + { + "epoch": 0.054988581216100486, + "grad_norm": 1.5455420017242432, + "learning_rate": 4.962796761608421e-05, + "loss": 5.4634, + "step": 9246 + }, + { + "epoch": 0.05499452849938148, + "grad_norm": 1.623382806777954, + "learning_rate": 4.962788732905859e-05, + "loss": 5.8441, + "step": 9247 + }, + { + "epoch": 0.05500047578266248, + "grad_norm": 1.928788423538208, + "learning_rate": 4.962780703343563e-05, + "loss": 5.6553, + "step": 9248 + }, + { + "epoch": 0.05500642306594348, + "grad_norm": 1.660984992980957, + "learning_rate": 4.962772672921535e-05, + "loss": 5.5953, + "step": 9249 + }, + { + "epoch": 0.055012370349224474, + "grad_norm": 2.081026792526245, + "learning_rate": 4.962764641639779e-05, + "loss": 5.7065, + "step": 9250 + }, + { + "epoch": 0.05501831763250547, + "grad_norm": 1.8750234842300415, + "learning_rate": 4.962756609498297e-05, + "loss": 5.8814, + "step": 9251 + }, + { + "epoch": 0.05502426491578647, + "grad_norm": 1.9573127031326294, + "learning_rate": 4.9627485764970916e-05, + "loss": 5.7415, + "step": 9252 + }, + { + "epoch": 0.055030212199067466, + "grad_norm": 1.7536600828170776, + "learning_rate": 4.962740542636167e-05, + "loss": 5.5638, + "step": 9253 + }, + { + "epoch": 0.05503615948234846, + "grad_norm": 1.692557692527771, + "learning_rate": 4.962732507915525e-05, + "loss": 5.5362, + "step": 9254 + }, + { + "epoch": 0.05504210676562946, + "grad_norm": 1.9066821336746216, + "learning_rate": 4.962724472335168e-05, + "loss": 5.3094, + "step": 9255 + }, + { + "epoch": 0.05504805404891046, + "grad_norm": 2.069007158279419, + "learning_rate": 4.9627164358951e-05, + "loss": 5.766, + "step": 9256 + }, + { + "epoch": 0.05505400133219145, + "grad_norm": 2.0293545722961426, + "learning_rate": 4.9627083985953227e-05, + "loss": 5.7769, + "step": 9257 + }, + { + "epoch": 0.055059948615472455, + "grad_norm": 1.7953507900238037, + "learning_rate": 4.962700360435839e-05, + "loss": 5.8435, + "step": 9258 + }, + { + "epoch": 0.05506589589875345, + "grad_norm": 1.9281821250915527, + "learning_rate": 4.9626923214166535e-05, + "loss": 5.8342, + "step": 9259 + }, + { + "epoch": 0.055071843182034445, + "grad_norm": 1.4612617492675781, + "learning_rate": 4.962684281537766e-05, + "loss": 5.8273, + "step": 9260 + }, + { + "epoch": 0.05507779046531545, + "grad_norm": 1.8589900732040405, + "learning_rate": 4.9626762407991817e-05, + "loss": 5.7607, + "step": 9261 + }, + { + "epoch": 0.05508373774859644, + "grad_norm": 1.9395030736923218, + "learning_rate": 4.9626681992009025e-05, + "loss": 5.7573, + "step": 9262 + }, + { + "epoch": 0.05508968503187744, + "grad_norm": 1.7344708442687988, + "learning_rate": 4.962660156742931e-05, + "loss": 5.7999, + "step": 9263 + }, + { + "epoch": 0.05509563231515843, + "grad_norm": 1.7719827890396118, + "learning_rate": 4.9626521134252704e-05, + "loss": 5.7882, + "step": 9264 + }, + { + "epoch": 0.055101579598439435, + "grad_norm": 1.4955536127090454, + "learning_rate": 4.9626440692479236e-05, + "loss": 5.639, + "step": 9265 + }, + { + "epoch": 0.05510752688172043, + "grad_norm": 2.0087990760803223, + "learning_rate": 4.9626360242108925e-05, + "loss": 5.841, + "step": 9266 + }, + { + "epoch": 0.055113474165001425, + "grad_norm": 1.7334564924240112, + "learning_rate": 4.962627978314181e-05, + "loss": 5.4267, + "step": 9267 + }, + { + "epoch": 0.05511942144828243, + "grad_norm": 2.1204535961151123, + "learning_rate": 4.962619931557792e-05, + "loss": 5.4451, + "step": 9268 + }, + { + "epoch": 0.05512536873156342, + "grad_norm": 2.2374279499053955, + "learning_rate": 4.962611883941727e-05, + "loss": 5.5095, + "step": 9269 + }, + { + "epoch": 0.05513131601484442, + "grad_norm": 1.735070824623108, + "learning_rate": 4.9626038354659904e-05, + "loss": 5.3609, + "step": 9270 + }, + { + "epoch": 0.05513726329812542, + "grad_norm": 1.9748501777648926, + "learning_rate": 4.9625957861305837e-05, + "loss": 5.3366, + "step": 9271 + }, + { + "epoch": 0.055143210581406414, + "grad_norm": 1.8736618757247925, + "learning_rate": 4.96258773593551e-05, + "loss": 5.4706, + "step": 9272 + }, + { + "epoch": 0.05514915786468741, + "grad_norm": 2.571755886077881, + "learning_rate": 4.9625796848807736e-05, + "loss": 5.0393, + "step": 9273 + }, + { + "epoch": 0.05515510514796841, + "grad_norm": 2.1467013359069824, + "learning_rate": 4.962571632966375e-05, + "loss": 5.5798, + "step": 9274 + }, + { + "epoch": 0.055161052431249406, + "grad_norm": 2.4553916454315186, + "learning_rate": 4.962563580192319e-05, + "loss": 5.4323, + "step": 9275 + }, + { + "epoch": 0.0551669997145304, + "grad_norm": 2.4478797912597656, + "learning_rate": 4.962555526558607e-05, + "loss": 5.2591, + "step": 9276 + }, + { + "epoch": 0.055172946997811396, + "grad_norm": 2.2164270877838135, + "learning_rate": 4.9625474720652416e-05, + "loss": 5.3404, + "step": 9277 + }, + { + "epoch": 0.0551788942810924, + "grad_norm": 1.9161698818206787, + "learning_rate": 4.962539416712227e-05, + "loss": 5.2591, + "step": 9278 + }, + { + "epoch": 0.055184841564373394, + "grad_norm": 2.348734140396118, + "learning_rate": 4.962531360499565e-05, + "loss": 5.8132, + "step": 9279 + }, + { + "epoch": 0.05519078884765439, + "grad_norm": 2.400090456008911, + "learning_rate": 4.962523303427259e-05, + "loss": 5.7786, + "step": 9280 + }, + { + "epoch": 0.05519673613093539, + "grad_norm": 2.1626594066619873, + "learning_rate": 4.9625152454953115e-05, + "loss": 5.8488, + "step": 9281 + }, + { + "epoch": 0.055202683414216386, + "grad_norm": 1.7470853328704834, + "learning_rate": 4.962507186703725e-05, + "loss": 5.72, + "step": 9282 + }, + { + "epoch": 0.05520863069749738, + "grad_norm": 1.9191921949386597, + "learning_rate": 4.962499127052503e-05, + "loss": 5.6321, + "step": 9283 + }, + { + "epoch": 0.05521457798077838, + "grad_norm": 2.1550769805908203, + "learning_rate": 4.962491066541649e-05, + "loss": 5.4521, + "step": 9284 + }, + { + "epoch": 0.05522052526405938, + "grad_norm": 2.0529074668884277, + "learning_rate": 4.9624830051711634e-05, + "loss": 5.4108, + "step": 9285 + }, + { + "epoch": 0.05522647254734037, + "grad_norm": 1.7673834562301636, + "learning_rate": 4.962474942941051e-05, + "loss": 5.5955, + "step": 9286 + }, + { + "epoch": 0.055232419830621375, + "grad_norm": 1.9575849771499634, + "learning_rate": 4.9624668798513143e-05, + "loss": 5.6295, + "step": 9287 + }, + { + "epoch": 0.05523836711390237, + "grad_norm": 1.8054029941558838, + "learning_rate": 4.9624588159019546e-05, + "loss": 5.3372, + "step": 9288 + }, + { + "epoch": 0.055244314397183365, + "grad_norm": 1.8002424240112305, + "learning_rate": 4.962450751092978e-05, + "loss": 5.4404, + "step": 9289 + }, + { + "epoch": 0.05525026168046437, + "grad_norm": 2.052530527114868, + "learning_rate": 4.962442685424383e-05, + "loss": 5.4921, + "step": 9290 + }, + { + "epoch": 0.05525620896374536, + "grad_norm": 1.8559443950653076, + "learning_rate": 4.962434618896176e-05, + "loss": 5.5776, + "step": 9291 + }, + { + "epoch": 0.05526215624702636, + "grad_norm": 1.8794355392456055, + "learning_rate": 4.962426551508359e-05, + "loss": 5.5818, + "step": 9292 + }, + { + "epoch": 0.05526810353030735, + "grad_norm": 1.8995412588119507, + "learning_rate": 4.962418483260933e-05, + "loss": 5.6274, + "step": 9293 + }, + { + "epoch": 0.055274050813588355, + "grad_norm": 1.8608371019363403, + "learning_rate": 4.962410414153903e-05, + "loss": 5.4655, + "step": 9294 + }, + { + "epoch": 0.05527999809686935, + "grad_norm": 2.0378072261810303, + "learning_rate": 4.9624023441872715e-05, + "loss": 5.5579, + "step": 9295 + }, + { + "epoch": 0.055285945380150345, + "grad_norm": 2.2037017345428467, + "learning_rate": 4.9623942733610397e-05, + "loss": 5.6663, + "step": 9296 + }, + { + "epoch": 0.05529189266343135, + "grad_norm": 2.4487335681915283, + "learning_rate": 4.962386201675212e-05, + "loss": 5.6792, + "step": 9297 + }, + { + "epoch": 0.05529783994671234, + "grad_norm": 2.0460383892059326, + "learning_rate": 4.96237812912979e-05, + "loss": 5.917, + "step": 9298 + }, + { + "epoch": 0.05530378722999334, + "grad_norm": 2.4838030338287354, + "learning_rate": 4.962370055724778e-05, + "loss": 5.1067, + "step": 9299 + }, + { + "epoch": 0.05530973451327434, + "grad_norm": 1.9340513944625854, + "learning_rate": 4.962361981460178e-05, + "loss": 5.2529, + "step": 9300 + }, + { + "epoch": 0.055315681796555334, + "grad_norm": 2.201068878173828, + "learning_rate": 4.9623539063359925e-05, + "loss": 5.6055, + "step": 9301 + }, + { + "epoch": 0.05532162907983633, + "grad_norm": 2.0552330017089844, + "learning_rate": 4.962345830352225e-05, + "loss": 5.3531, + "step": 9302 + }, + { + "epoch": 0.05532757636311733, + "grad_norm": 2.611407995223999, + "learning_rate": 4.9623377535088785e-05, + "loss": 5.5829, + "step": 9303 + }, + { + "epoch": 0.055333523646398326, + "grad_norm": 2.2239346504211426, + "learning_rate": 4.962329675805955e-05, + "loss": 5.3558, + "step": 9304 + }, + { + "epoch": 0.05533947092967932, + "grad_norm": 2.3899872303009033, + "learning_rate": 4.9623215972434566e-05, + "loss": 5.7277, + "step": 9305 + }, + { + "epoch": 0.055345418212960316, + "grad_norm": 2.8471267223358154, + "learning_rate": 4.962313517821389e-05, + "loss": 6.1046, + "step": 9306 + }, + { + "epoch": 0.05535136549624132, + "grad_norm": 2.426400661468506, + "learning_rate": 4.962305437539752e-05, + "loss": 5.8942, + "step": 9307 + }, + { + "epoch": 0.055357312779522314, + "grad_norm": 2.3548812866210938, + "learning_rate": 4.962297356398549e-05, + "loss": 6.0552, + "step": 9308 + }, + { + "epoch": 0.05536326006280331, + "grad_norm": 1.8423515558242798, + "learning_rate": 4.9622892743977844e-05, + "loss": 5.9377, + "step": 9309 + }, + { + "epoch": 0.05536920734608431, + "grad_norm": 2.1509203910827637, + "learning_rate": 4.96228119153746e-05, + "loss": 5.7195, + "step": 9310 + }, + { + "epoch": 0.055375154629365306, + "grad_norm": 2.3096275329589844, + "learning_rate": 4.962273107817579e-05, + "loss": 5.3461, + "step": 9311 + }, + { + "epoch": 0.0553811019126463, + "grad_norm": 1.980205774307251, + "learning_rate": 4.962265023238143e-05, + "loss": 5.8851, + "step": 9312 + }, + { + "epoch": 0.0553870491959273, + "grad_norm": 1.8162591457366943, + "learning_rate": 4.962256937799156e-05, + "loss": 5.7092, + "step": 9313 + }, + { + "epoch": 0.0553929964792083, + "grad_norm": 1.873853087425232, + "learning_rate": 4.962248851500621e-05, + "loss": 5.8939, + "step": 9314 + }, + { + "epoch": 0.05539894376248929, + "grad_norm": 1.8039345741271973, + "learning_rate": 4.96224076434254e-05, + "loss": 5.9289, + "step": 9315 + }, + { + "epoch": 0.055404891045770295, + "grad_norm": 2.3106470108032227, + "learning_rate": 4.962232676324916e-05, + "loss": 5.9103, + "step": 9316 + }, + { + "epoch": 0.05541083832905129, + "grad_norm": 2.2209455966949463, + "learning_rate": 4.962224587447752e-05, + "loss": 6.0053, + "step": 9317 + }, + { + "epoch": 0.055416785612332285, + "grad_norm": 2.0624780654907227, + "learning_rate": 4.962216497711052e-05, + "loss": 5.9258, + "step": 9318 + }, + { + "epoch": 0.05542273289561329, + "grad_norm": 2.371662139892578, + "learning_rate": 4.962208407114817e-05, + "loss": 6.4127, + "step": 9319 + }, + { + "epoch": 0.05542868017889428, + "grad_norm": 2.7035610675811768, + "learning_rate": 4.96220031565905e-05, + "loss": 5.9742, + "step": 9320 + }, + { + "epoch": 0.05543462746217528, + "grad_norm": 2.060577392578125, + "learning_rate": 4.9621922233437544e-05, + "loss": 5.9729, + "step": 9321 + }, + { + "epoch": 0.05544057474545627, + "grad_norm": 1.7935984134674072, + "learning_rate": 4.962184130168933e-05, + "loss": 5.4077, + "step": 9322 + }, + { + "epoch": 0.055446522028737275, + "grad_norm": 1.8716622591018677, + "learning_rate": 4.9621760361345885e-05, + "loss": 5.4554, + "step": 9323 + }, + { + "epoch": 0.05545246931201827, + "grad_norm": 1.9150923490524292, + "learning_rate": 4.962167941240724e-05, + "loss": 5.8121, + "step": 9324 + }, + { + "epoch": 0.055458416595299265, + "grad_norm": 1.9207059144973755, + "learning_rate": 4.962159845487342e-05, + "loss": 5.8593, + "step": 9325 + }, + { + "epoch": 0.05546436387858027, + "grad_norm": 1.962039589881897, + "learning_rate": 4.9621517488744454e-05, + "loss": 6.0174, + "step": 9326 + }, + { + "epoch": 0.05547031116186126, + "grad_norm": 2.0445704460144043, + "learning_rate": 4.9621436514020376e-05, + "loss": 5.5782, + "step": 9327 + }, + { + "epoch": 0.05547625844514226, + "grad_norm": 2.0861823558807373, + "learning_rate": 4.9621355530701204e-05, + "loss": 5.6102, + "step": 9328 + }, + { + "epoch": 0.05548220572842326, + "grad_norm": 2.0184309482574463, + "learning_rate": 4.962127453878697e-05, + "loss": 5.8072, + "step": 9329 + }, + { + "epoch": 0.055488153011704254, + "grad_norm": 1.899994134902954, + "learning_rate": 4.962119353827771e-05, + "loss": 5.7361, + "step": 9330 + }, + { + "epoch": 0.05549410029498525, + "grad_norm": 1.8874105215072632, + "learning_rate": 4.962111252917344e-05, + "loss": 5.7988, + "step": 9331 + }, + { + "epoch": 0.05550004757826625, + "grad_norm": 2.046682119369507, + "learning_rate": 4.9621031511474194e-05, + "loss": 5.7037, + "step": 9332 + }, + { + "epoch": 0.055505994861547246, + "grad_norm": 2.2552926540374756, + "learning_rate": 4.962095048517999e-05, + "loss": 5.7556, + "step": 9333 + }, + { + "epoch": 0.05551194214482824, + "grad_norm": 2.1904358863830566, + "learning_rate": 4.962086945029089e-05, + "loss": 5.6529, + "step": 9334 + }, + { + "epoch": 0.055517889428109236, + "grad_norm": 2.03745698928833, + "learning_rate": 4.9620788406806883e-05, + "loss": 5.8504, + "step": 9335 + }, + { + "epoch": 0.05552383671139024, + "grad_norm": 1.81668221950531, + "learning_rate": 4.9620707354728017e-05, + "loss": 5.3275, + "step": 9336 + }, + { + "epoch": 0.055529783994671233, + "grad_norm": 2.570976734161377, + "learning_rate": 4.962062629405432e-05, + "loss": 5.666, + "step": 9337 + }, + { + "epoch": 0.05553573127795223, + "grad_norm": 2.6855766773223877, + "learning_rate": 4.962054522478581e-05, + "loss": 5.7798, + "step": 9338 + }, + { + "epoch": 0.05554167856123323, + "grad_norm": 2.329690933227539, + "learning_rate": 4.962046414692252e-05, + "loss": 5.9334, + "step": 9339 + }, + { + "epoch": 0.055547625844514226, + "grad_norm": 1.6809495687484741, + "learning_rate": 4.962038306046449e-05, + "loss": 5.8506, + "step": 9340 + }, + { + "epoch": 0.05555357312779522, + "grad_norm": 1.7170113325119019, + "learning_rate": 4.962030196541173e-05, + "loss": 6.0863, + "step": 9341 + }, + { + "epoch": 0.05555952041107622, + "grad_norm": 2.247680902481079, + "learning_rate": 4.962022086176428e-05, + "loss": 5.2188, + "step": 9342 + }, + { + "epoch": 0.05556546769435722, + "grad_norm": 2.680091381072998, + "learning_rate": 4.9620139749522165e-05, + "loss": 4.8506, + "step": 9343 + }, + { + "epoch": 0.05557141497763821, + "grad_norm": 2.1886465549468994, + "learning_rate": 4.962005862868542e-05, + "loss": 5.5164, + "step": 9344 + }, + { + "epoch": 0.055577362260919215, + "grad_norm": 2.061368227005005, + "learning_rate": 4.961997749925405e-05, + "loss": 5.4491, + "step": 9345 + }, + { + "epoch": 0.05558330954420021, + "grad_norm": 2.368156909942627, + "learning_rate": 4.961989636122812e-05, + "loss": 5.9053, + "step": 9346 + }, + { + "epoch": 0.055589256827481205, + "grad_norm": 2.562565803527832, + "learning_rate": 4.961981521460763e-05, + "loss": 5.7683, + "step": 9347 + }, + { + "epoch": 0.05559520411076221, + "grad_norm": 2.388779640197754, + "learning_rate": 4.961973405939262e-05, + "loss": 5.1235, + "step": 9348 + }, + { + "epoch": 0.0556011513940432, + "grad_norm": 2.546994686126709, + "learning_rate": 4.9619652895583104e-05, + "loss": 4.7793, + "step": 9349 + }, + { + "epoch": 0.0556070986773242, + "grad_norm": 2.379549026489258, + "learning_rate": 4.9619571723179135e-05, + "loss": 4.8949, + "step": 9350 + }, + { + "epoch": 0.05561304596060519, + "grad_norm": 2.1621344089508057, + "learning_rate": 4.961949054218072e-05, + "loss": 4.6824, + "step": 9351 + }, + { + "epoch": 0.055618993243886194, + "grad_norm": 2.136289119720459, + "learning_rate": 4.96194093525879e-05, + "loss": 4.834, + "step": 9352 + }, + { + "epoch": 0.05562494052716719, + "grad_norm": 2.3572680950164795, + "learning_rate": 4.9619328154400694e-05, + "loss": 4.9755, + "step": 9353 + }, + { + "epoch": 0.055630887810448185, + "grad_norm": 2.2439966201782227, + "learning_rate": 4.961924694761913e-05, + "loss": 5.7662, + "step": 9354 + }, + { + "epoch": 0.05563683509372919, + "grad_norm": 2.287597894668579, + "learning_rate": 4.961916573224326e-05, + "loss": 4.6108, + "step": 9355 + }, + { + "epoch": 0.05564278237701018, + "grad_norm": 2.1382369995117188, + "learning_rate": 4.961908450827308e-05, + "loss": 4.5993, + "step": 9356 + }, + { + "epoch": 0.05564872966029118, + "grad_norm": 2.112348794937134, + "learning_rate": 4.961900327570863e-05, + "loss": 4.6798, + "step": 9357 + }, + { + "epoch": 0.05565467694357218, + "grad_norm": 2.0453972816467285, + "learning_rate": 4.9618922034549946e-05, + "loss": 4.5424, + "step": 9358 + }, + { + "epoch": 0.055660624226853174, + "grad_norm": 2.0547754764556885, + "learning_rate": 4.961884078479705e-05, + "loss": 5.0661, + "step": 9359 + }, + { + "epoch": 0.05566657151013417, + "grad_norm": 2.5003650188446045, + "learning_rate": 4.9618759526449965e-05, + "loss": 5.3388, + "step": 9360 + }, + { + "epoch": 0.05567251879341517, + "grad_norm": 2.0582423210144043, + "learning_rate": 4.9618678259508736e-05, + "loss": 5.8437, + "step": 9361 + }, + { + "epoch": 0.055678466076696166, + "grad_norm": 1.7867279052734375, + "learning_rate": 4.9618596983973376e-05, + "loss": 5.369, + "step": 9362 + }, + { + "epoch": 0.05568441335997716, + "grad_norm": 2.03729248046875, + "learning_rate": 4.961851569984392e-05, + "loss": 5.9932, + "step": 9363 + }, + { + "epoch": 0.055690360643258156, + "grad_norm": 2.2527456283569336, + "learning_rate": 4.961843440712038e-05, + "loss": 5.893, + "step": 9364 + }, + { + "epoch": 0.05569630792653916, + "grad_norm": 2.0027201175689697, + "learning_rate": 4.9618353105802815e-05, + "loss": 5.8216, + "step": 9365 + }, + { + "epoch": 0.05570225520982015, + "grad_norm": 2.236548662185669, + "learning_rate": 4.961827179589124e-05, + "loss": 5.5371, + "step": 9366 + }, + { + "epoch": 0.05570820249310115, + "grad_norm": 2.4477334022521973, + "learning_rate": 4.9618190477385666e-05, + "loss": 5.6552, + "step": 9367 + }, + { + "epoch": 0.05571414977638215, + "grad_norm": 2.504549026489258, + "learning_rate": 4.9618109150286145e-05, + "loss": 5.5732, + "step": 9368 + }, + { + "epoch": 0.055720097059663146, + "grad_norm": 2.1413187980651855, + "learning_rate": 4.9618027814592695e-05, + "loss": 5.1792, + "step": 9369 + }, + { + "epoch": 0.05572604434294414, + "grad_norm": 2.1714866161346436, + "learning_rate": 4.9617946470305344e-05, + "loss": 5.3444, + "step": 9370 + }, + { + "epoch": 0.05573199162622514, + "grad_norm": 1.7478383779525757, + "learning_rate": 4.9617865117424126e-05, + "loss": 5.7151, + "step": 9371 + }, + { + "epoch": 0.05573793890950614, + "grad_norm": 2.0415220260620117, + "learning_rate": 4.9617783755949067e-05, + "loss": 5.8765, + "step": 9372 + }, + { + "epoch": 0.05574388619278713, + "grad_norm": 1.917108416557312, + "learning_rate": 4.961770238588019e-05, + "loss": 6.0797, + "step": 9373 + }, + { + "epoch": 0.055749833476068135, + "grad_norm": 1.9404850006103516, + "learning_rate": 4.961762100721753e-05, + "loss": 6.1376, + "step": 9374 + }, + { + "epoch": 0.05575578075934913, + "grad_norm": 1.7101916074752808, + "learning_rate": 4.9617539619961104e-05, + "loss": 5.9375, + "step": 9375 + }, + { + "epoch": 0.055761728042630125, + "grad_norm": 2.591960906982422, + "learning_rate": 4.9617458224110954e-05, + "loss": 5.3716, + "step": 9376 + }, + { + "epoch": 0.05576767532591113, + "grad_norm": 2.070600986480713, + "learning_rate": 4.961737681966711e-05, + "loss": 5.3822, + "step": 9377 + }, + { + "epoch": 0.05577362260919212, + "grad_norm": 2.100820302963257, + "learning_rate": 4.9617295406629594e-05, + "loss": 5.7703, + "step": 9378 + }, + { + "epoch": 0.05577956989247312, + "grad_norm": 2.2413878440856934, + "learning_rate": 4.961721398499843e-05, + "loss": 4.9197, + "step": 9379 + }, + { + "epoch": 0.05578551717575411, + "grad_norm": 1.9762401580810547, + "learning_rate": 4.961713255477365e-05, + "loss": 5.6705, + "step": 9380 + }, + { + "epoch": 0.055791464459035114, + "grad_norm": 2.22676420211792, + "learning_rate": 4.961705111595528e-05, + "loss": 5.0196, + "step": 9381 + }, + { + "epoch": 0.05579741174231611, + "grad_norm": 2.0652241706848145, + "learning_rate": 4.9616969668543364e-05, + "loss": 5.3894, + "step": 9382 + }, + { + "epoch": 0.055803359025597105, + "grad_norm": 2.156890630722046, + "learning_rate": 4.96168882125379e-05, + "loss": 5.3063, + "step": 9383 + }, + { + "epoch": 0.05580930630887811, + "grad_norm": 2.131964683532715, + "learning_rate": 4.961680674793895e-05, + "loss": 5.9304, + "step": 9384 + }, + { + "epoch": 0.0558152535921591, + "grad_norm": 2.2117621898651123, + "learning_rate": 4.9616725274746525e-05, + "loss": 5.9553, + "step": 9385 + }, + { + "epoch": 0.0558212008754401, + "grad_norm": 2.3511440753936768, + "learning_rate": 4.9616643792960654e-05, + "loss": 5.9911, + "step": 9386 + }, + { + "epoch": 0.0558271481587211, + "grad_norm": 1.7709077596664429, + "learning_rate": 4.961656230258136e-05, + "loss": 5.6291, + "step": 9387 + }, + { + "epoch": 0.055833095442002094, + "grad_norm": 1.838767170906067, + "learning_rate": 4.961648080360869e-05, + "loss": 6.0152, + "step": 9388 + }, + { + "epoch": 0.05583904272528309, + "grad_norm": 2.117058515548706, + "learning_rate": 4.9616399296042656e-05, + "loss": 4.8079, + "step": 9389 + }, + { + "epoch": 0.05584499000856409, + "grad_norm": 2.147491693496704, + "learning_rate": 4.9616317779883293e-05, + "loss": 4.6489, + "step": 9390 + }, + { + "epoch": 0.055850937291845086, + "grad_norm": 2.1025705337524414, + "learning_rate": 4.961623625513062e-05, + "loss": 4.4984, + "step": 9391 + }, + { + "epoch": 0.05585688457512608, + "grad_norm": 1.799986720085144, + "learning_rate": 4.961615472178468e-05, + "loss": 5.1008, + "step": 9392 + }, + { + "epoch": 0.055862831858407076, + "grad_norm": 2.2975053787231445, + "learning_rate": 4.961607317984549e-05, + "loss": 5.9754, + "step": 9393 + }, + { + "epoch": 0.05586877914168808, + "grad_norm": 1.9996155500411987, + "learning_rate": 4.961599162931309e-05, + "loss": 5.9255, + "step": 9394 + }, + { + "epoch": 0.05587472642496907, + "grad_norm": 1.7344794273376465, + "learning_rate": 4.9615910070187496e-05, + "loss": 6.0873, + "step": 9395 + }, + { + "epoch": 0.05588067370825007, + "grad_norm": 2.260706901550293, + "learning_rate": 4.961582850246875e-05, + "loss": 5.9454, + "step": 9396 + }, + { + "epoch": 0.05588662099153107, + "grad_norm": 2.1810765266418457, + "learning_rate": 4.961574692615686e-05, + "loss": 5.7548, + "step": 9397 + }, + { + "epoch": 0.055892568274812066, + "grad_norm": 2.0940003395080566, + "learning_rate": 4.961566534125188e-05, + "loss": 5.8184, + "step": 9398 + }, + { + "epoch": 0.05589851555809306, + "grad_norm": 2.066464900970459, + "learning_rate": 4.961558374775382e-05, + "loss": 5.7867, + "step": 9399 + }, + { + "epoch": 0.05590446284137406, + "grad_norm": 1.7197705507278442, + "learning_rate": 4.961550214566271e-05, + "loss": 5.9211, + "step": 9400 + }, + { + "epoch": 0.05591041012465506, + "grad_norm": 2.3055293560028076, + "learning_rate": 4.9615420534978583e-05, + "loss": 5.9531, + "step": 9401 + }, + { + "epoch": 0.05591635740793605, + "grad_norm": 2.0974669456481934, + "learning_rate": 4.961533891570147e-05, + "loss": 5.9347, + "step": 9402 + }, + { + "epoch": 0.055922304691217055, + "grad_norm": 2.5196354389190674, + "learning_rate": 4.96152572878314e-05, + "loss": 5.0729, + "step": 9403 + }, + { + "epoch": 0.05592825197449805, + "grad_norm": 2.157181978225708, + "learning_rate": 4.9615175651368395e-05, + "loss": 5.9513, + "step": 9404 + }, + { + "epoch": 0.055934199257779045, + "grad_norm": 1.94083833694458, + "learning_rate": 4.9615094006312485e-05, + "loss": 5.9239, + "step": 9405 + }, + { + "epoch": 0.05594014654106005, + "grad_norm": 2.2118191719055176, + "learning_rate": 4.9615012352663704e-05, + "loss": 5.6936, + "step": 9406 + }, + { + "epoch": 0.05594609382434104, + "grad_norm": 2.2255051136016846, + "learning_rate": 4.9614930690422065e-05, + "loss": 5.7475, + "step": 9407 + }, + { + "epoch": 0.05595204110762204, + "grad_norm": 2.1640844345092773, + "learning_rate": 4.961484901958762e-05, + "loss": 5.8138, + "step": 9408 + }, + { + "epoch": 0.05595798839090303, + "grad_norm": 2.2722928524017334, + "learning_rate": 4.961476734016038e-05, + "loss": 5.5784, + "step": 9409 + }, + { + "epoch": 0.055963935674184034, + "grad_norm": 2.0541749000549316, + "learning_rate": 4.961468565214039e-05, + "loss": 5.6871, + "step": 9410 + }, + { + "epoch": 0.05596988295746503, + "grad_norm": 2.3496010303497314, + "learning_rate": 4.9614603955527655e-05, + "loss": 5.4195, + "step": 9411 + }, + { + "epoch": 0.055975830240746025, + "grad_norm": 2.333435297012329, + "learning_rate": 4.9614522250322215e-05, + "loss": 5.4257, + "step": 9412 + }, + { + "epoch": 0.05598177752402703, + "grad_norm": 2.339057445526123, + "learning_rate": 4.9614440536524106e-05, + "loss": 5.4158, + "step": 9413 + }, + { + "epoch": 0.05598772480730802, + "grad_norm": 2.4383058547973633, + "learning_rate": 4.961435881413335e-05, + "loss": 5.4569, + "step": 9414 + }, + { + "epoch": 0.05599367209058902, + "grad_norm": 2.1405389308929443, + "learning_rate": 4.961427708314997e-05, + "loss": 5.6178, + "step": 9415 + }, + { + "epoch": 0.05599961937387002, + "grad_norm": 2.2082836627960205, + "learning_rate": 4.961419534357401e-05, + "loss": 5.386, + "step": 9416 + }, + { + "epoch": 0.056005566657151014, + "grad_norm": 2.0305027961730957, + "learning_rate": 4.961411359540548e-05, + "loss": 5.2822, + "step": 9417 + }, + { + "epoch": 0.05601151394043201, + "grad_norm": 2.606452226638794, + "learning_rate": 4.961403183864442e-05, + "loss": 5.2691, + "step": 9418 + }, + { + "epoch": 0.05601746122371301, + "grad_norm": 2.3506669998168945, + "learning_rate": 4.961395007329086e-05, + "loss": 5.3307, + "step": 9419 + }, + { + "epoch": 0.056023408506994006, + "grad_norm": 2.3472225666046143, + "learning_rate": 4.961386829934482e-05, + "loss": 5.2247, + "step": 9420 + }, + { + "epoch": 0.056029355790275, + "grad_norm": 2.1121721267700195, + "learning_rate": 4.961378651680633e-05, + "loss": 5.2857, + "step": 9421 + }, + { + "epoch": 0.056035303073555996, + "grad_norm": 2.4357142448425293, + "learning_rate": 4.9613704725675427e-05, + "loss": 5.3398, + "step": 9422 + }, + { + "epoch": 0.056041250356837, + "grad_norm": 2.639418125152588, + "learning_rate": 4.961362292595213e-05, + "loss": 5.3008, + "step": 9423 + }, + { + "epoch": 0.05604719764011799, + "grad_norm": 3.297189712524414, + "learning_rate": 4.961354111763647e-05, + "loss": 5.5908, + "step": 9424 + }, + { + "epoch": 0.05605314492339899, + "grad_norm": 2.095613718032837, + "learning_rate": 4.961345930072848e-05, + "loss": 5.2389, + "step": 9425 + }, + { + "epoch": 0.05605909220667999, + "grad_norm": 2.2495081424713135, + "learning_rate": 4.9613377475228186e-05, + "loss": 5.474, + "step": 9426 + }, + { + "epoch": 0.056065039489960986, + "grad_norm": 2.282697916030884, + "learning_rate": 4.961329564113562e-05, + "loss": 5.3253, + "step": 9427 + }, + { + "epoch": 0.05607098677324198, + "grad_norm": 2.515075206756592, + "learning_rate": 4.96132137984508e-05, + "loss": 5.238, + "step": 9428 + }, + { + "epoch": 0.05607693405652298, + "grad_norm": 2.072274684906006, + "learning_rate": 4.961313194717376e-05, + "loss": 5.3627, + "step": 9429 + }, + { + "epoch": 0.05608288133980398, + "grad_norm": 2.4552547931671143, + "learning_rate": 4.961305008730454e-05, + "loss": 6.1799, + "step": 9430 + }, + { + "epoch": 0.05608882862308497, + "grad_norm": 2.2289538383483887, + "learning_rate": 4.9612968218843146e-05, + "loss": 5.5477, + "step": 9431 + }, + { + "epoch": 0.056094775906365975, + "grad_norm": 2.6174185276031494, + "learning_rate": 4.9612886341789635e-05, + "loss": 5.1779, + "step": 9432 + }, + { + "epoch": 0.05610072318964697, + "grad_norm": 2.4489150047302246, + "learning_rate": 4.9612804456144005e-05, + "loss": 5.2067, + "step": 9433 + }, + { + "epoch": 0.056106670472927965, + "grad_norm": 2.2651829719543457, + "learning_rate": 4.96127225619063e-05, + "loss": 5.3582, + "step": 9434 + }, + { + "epoch": 0.05611261775620897, + "grad_norm": 2.1985251903533936, + "learning_rate": 4.9612640659076556e-05, + "loss": 5.2034, + "step": 9435 + }, + { + "epoch": 0.05611856503948996, + "grad_norm": 1.9510128498077393, + "learning_rate": 4.961255874765479e-05, + "loss": 5.2263, + "step": 9436 + }, + { + "epoch": 0.05612451232277096, + "grad_norm": 2.338815212249756, + "learning_rate": 4.961247682764104e-05, + "loss": 5.9091, + "step": 9437 + }, + { + "epoch": 0.05613045960605195, + "grad_norm": 2.097111225128174, + "learning_rate": 4.961239489903532e-05, + "loss": 6.3285, + "step": 9438 + }, + { + "epoch": 0.056136406889332954, + "grad_norm": 1.9965720176696777, + "learning_rate": 4.961231296183767e-05, + "loss": 6.3141, + "step": 9439 + }, + { + "epoch": 0.05614235417261395, + "grad_norm": 2.2406206130981445, + "learning_rate": 4.9612231016048114e-05, + "loss": 5.7335, + "step": 9440 + }, + { + "epoch": 0.056148301455894944, + "grad_norm": 2.2798993587493896, + "learning_rate": 4.961214906166668e-05, + "loss": 4.9959, + "step": 9441 + }, + { + "epoch": 0.056154248739175947, + "grad_norm": 2.482706069946289, + "learning_rate": 4.96120670986934e-05, + "loss": 5.295, + "step": 9442 + }, + { + "epoch": 0.05616019602245694, + "grad_norm": 2.398867607116699, + "learning_rate": 4.961198512712831e-05, + "loss": 4.9592, + "step": 9443 + }, + { + "epoch": 0.05616614330573794, + "grad_norm": 2.1979055404663086, + "learning_rate": 4.961190314697143e-05, + "loss": 5.1003, + "step": 9444 + }, + { + "epoch": 0.05617209058901894, + "grad_norm": 2.3249244689941406, + "learning_rate": 4.961182115822278e-05, + "loss": 5.1408, + "step": 9445 + }, + { + "epoch": 0.056178037872299934, + "grad_norm": 2.3679821491241455, + "learning_rate": 4.96117391608824e-05, + "loss": 5.4006, + "step": 9446 + }, + { + "epoch": 0.05618398515558093, + "grad_norm": 1.8706363439559937, + "learning_rate": 4.961165715495032e-05, + "loss": 6.1741, + "step": 9447 + }, + { + "epoch": 0.05618993243886193, + "grad_norm": 2.1825344562530518, + "learning_rate": 4.961157514042656e-05, + "loss": 6.0869, + "step": 9448 + }, + { + "epoch": 0.056195879722142926, + "grad_norm": 1.85076904296875, + "learning_rate": 4.961149311731116e-05, + "loss": 5.9252, + "step": 9449 + }, + { + "epoch": 0.05620182700542392, + "grad_norm": 1.9433631896972656, + "learning_rate": 4.961141108560413e-05, + "loss": 5.968, + "step": 9450 + }, + { + "epoch": 0.056207774288704916, + "grad_norm": 2.5718259811401367, + "learning_rate": 4.961132904530552e-05, + "loss": 5.4274, + "step": 9451 + }, + { + "epoch": 0.05621372157198592, + "grad_norm": 1.919552206993103, + "learning_rate": 4.961124699641535e-05, + "loss": 5.1943, + "step": 9452 + }, + { + "epoch": 0.05621966885526691, + "grad_norm": 2.1371817588806152, + "learning_rate": 4.961116493893364e-05, + "loss": 5.9949, + "step": 9453 + }, + { + "epoch": 0.05622561613854791, + "grad_norm": 2.5715489387512207, + "learning_rate": 4.961108287286044e-05, + "loss": 6.2061, + "step": 9454 + }, + { + "epoch": 0.05623156342182891, + "grad_norm": 2.1871471405029297, + "learning_rate": 4.961100079819575e-05, + "loss": 5.7872, + "step": 9455 + }, + { + "epoch": 0.056237510705109905, + "grad_norm": 2.011925220489502, + "learning_rate": 4.961091871493962e-05, + "loss": 5.7992, + "step": 9456 + }, + { + "epoch": 0.0562434579883909, + "grad_norm": 2.516580820083618, + "learning_rate": 4.9610836623092074e-05, + "loss": 5.9154, + "step": 9457 + }, + { + "epoch": 0.0562494052716719, + "grad_norm": 1.9336326122283936, + "learning_rate": 4.961075452265314e-05, + "loss": 5.7933, + "step": 9458 + }, + { + "epoch": 0.0562553525549529, + "grad_norm": 1.8404059410095215, + "learning_rate": 4.961067241362285e-05, + "loss": 6.1897, + "step": 9459 + }, + { + "epoch": 0.05626129983823389, + "grad_norm": 1.9757578372955322, + "learning_rate": 4.961059029600122e-05, + "loss": 6.0909, + "step": 9460 + }, + { + "epoch": 0.056267247121514895, + "grad_norm": 1.9767241477966309, + "learning_rate": 4.9610508169788294e-05, + "loss": 6.2212, + "step": 9461 + }, + { + "epoch": 0.05627319440479589, + "grad_norm": 1.9890403747558594, + "learning_rate": 4.961042603498409e-05, + "loss": 6.5071, + "step": 9462 + }, + { + "epoch": 0.056279141688076885, + "grad_norm": 1.9011937379837036, + "learning_rate": 4.961034389158864e-05, + "loss": 5.8098, + "step": 9463 + }, + { + "epoch": 0.05628508897135789, + "grad_norm": 2.236356735229492, + "learning_rate": 4.961026173960197e-05, + "loss": 4.8901, + "step": 9464 + }, + { + "epoch": 0.05629103625463888, + "grad_norm": 1.9147372245788574, + "learning_rate": 4.961017957902412e-05, + "loss": 5.1372, + "step": 9465 + }, + { + "epoch": 0.05629698353791988, + "grad_norm": 1.9628163576126099, + "learning_rate": 4.9610097409855106e-05, + "loss": 5.1161, + "step": 9466 + }, + { + "epoch": 0.05630293082120087, + "grad_norm": 2.0323991775512695, + "learning_rate": 4.961001523209496e-05, + "loss": 5.1493, + "step": 9467 + }, + { + "epoch": 0.056308878104481874, + "grad_norm": 1.7026360034942627, + "learning_rate": 4.9609933045743714e-05, + "loss": 5.2349, + "step": 9468 + }, + { + "epoch": 0.05631482538776287, + "grad_norm": 1.7758761644363403, + "learning_rate": 4.9609850850801394e-05, + "loss": 5.231, + "step": 9469 + }, + { + "epoch": 0.056320772671043864, + "grad_norm": 2.3305037021636963, + "learning_rate": 4.9609768647268026e-05, + "loss": 5.9209, + "step": 9470 + }, + { + "epoch": 0.056326719954324866, + "grad_norm": 2.2628681659698486, + "learning_rate": 4.960968643514365e-05, + "loss": 5.4753, + "step": 9471 + }, + { + "epoch": 0.05633266723760586, + "grad_norm": 2.4022347927093506, + "learning_rate": 4.9609604214428286e-05, + "loss": 4.8414, + "step": 9472 + }, + { + "epoch": 0.05633861452088686, + "grad_norm": 2.2767343521118164, + "learning_rate": 4.9609521985121955e-05, + "loss": 4.7178, + "step": 9473 + }, + { + "epoch": 0.05634456180416786, + "grad_norm": 2.547600507736206, + "learning_rate": 4.96094397472247e-05, + "loss": 4.7365, + "step": 9474 + }, + { + "epoch": 0.056350509087448854, + "grad_norm": 2.3546998500823975, + "learning_rate": 4.960935750073654e-05, + "loss": 5.4846, + "step": 9475 + }, + { + "epoch": 0.05635645637072985, + "grad_norm": 2.9641268253326416, + "learning_rate": 4.960927524565751e-05, + "loss": 5.7409, + "step": 9476 + }, + { + "epoch": 0.05636240365401085, + "grad_norm": 3.1727824211120605, + "learning_rate": 4.960919298198764e-05, + "loss": 5.8456, + "step": 9477 + }, + { + "epoch": 0.056368350937291846, + "grad_norm": 2.620507001876831, + "learning_rate": 4.960911070972695e-05, + "loss": 5.6295, + "step": 9478 + }, + { + "epoch": 0.05637429822057284, + "grad_norm": 2.6132571697235107, + "learning_rate": 4.960902842887548e-05, + "loss": 5.697, + "step": 9479 + }, + { + "epoch": 0.056380245503853836, + "grad_norm": 2.2931299209594727, + "learning_rate": 4.960894613943324e-05, + "loss": 5.4723, + "step": 9480 + }, + { + "epoch": 0.05638619278713484, + "grad_norm": 2.176729202270508, + "learning_rate": 4.9608863841400284e-05, + "loss": 5.7403, + "step": 9481 + }, + { + "epoch": 0.05639214007041583, + "grad_norm": 1.932180404663086, + "learning_rate": 4.9608781534776616e-05, + "loss": 5.9256, + "step": 9482 + }, + { + "epoch": 0.05639808735369683, + "grad_norm": 1.7315243482589722, + "learning_rate": 4.9608699219562286e-05, + "loss": 5.9176, + "step": 9483 + }, + { + "epoch": 0.05640403463697783, + "grad_norm": 1.6548408269882202, + "learning_rate": 4.9608616895757306e-05, + "loss": 5.7495, + "step": 9484 + }, + { + "epoch": 0.056409981920258825, + "grad_norm": 1.8549202680587769, + "learning_rate": 4.960853456336172e-05, + "loss": 5.5261, + "step": 9485 + }, + { + "epoch": 0.05641592920353982, + "grad_norm": 2.5990993976593018, + "learning_rate": 4.9608452222375544e-05, + "loss": 5.5934, + "step": 9486 + }, + { + "epoch": 0.05642187648682082, + "grad_norm": 1.705051302909851, + "learning_rate": 4.9608369872798815e-05, + "loss": 5.3613, + "step": 9487 + }, + { + "epoch": 0.05642782377010182, + "grad_norm": 1.6170406341552734, + "learning_rate": 4.960828751463156e-05, + "loss": 5.2743, + "step": 9488 + }, + { + "epoch": 0.05643377105338281, + "grad_norm": 1.6247482299804688, + "learning_rate": 4.9608205147873796e-05, + "loss": 5.2772, + "step": 9489 + }, + { + "epoch": 0.056439718336663815, + "grad_norm": 1.7574137449264526, + "learning_rate": 4.9608122772525575e-05, + "loss": 5.3464, + "step": 9490 + }, + { + "epoch": 0.05644566561994481, + "grad_norm": 1.8814537525177002, + "learning_rate": 4.960804038858691e-05, + "loss": 5.3092, + "step": 9491 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 2.0222842693328857, + "learning_rate": 4.9607957996057816e-05, + "loss": 4.8234, + "step": 9492 + }, + { + "epoch": 0.05645756018650681, + "grad_norm": 1.6224759817123413, + "learning_rate": 4.960787559493836e-05, + "loss": 5.3962, + "step": 9493 + }, + { + "epoch": 0.0564635074697878, + "grad_norm": 1.4097533226013184, + "learning_rate": 4.960779318522853e-05, + "loss": 5.8302, + "step": 9494 + }, + { + "epoch": 0.0564694547530688, + "grad_norm": 1.7296205759048462, + "learning_rate": 4.960771076692839e-05, + "loss": 5.5679, + "step": 9495 + }, + { + "epoch": 0.05647540203634979, + "grad_norm": 1.6300212144851685, + "learning_rate": 4.960762834003794e-05, + "loss": 5.4315, + "step": 9496 + }, + { + "epoch": 0.056481349319630794, + "grad_norm": 1.8587864637374878, + "learning_rate": 4.960754590455723e-05, + "loss": 5.5492, + "step": 9497 + }, + { + "epoch": 0.05648729660291179, + "grad_norm": 1.8136985301971436, + "learning_rate": 4.960746346048628e-05, + "loss": 5.6363, + "step": 9498 + }, + { + "epoch": 0.056493243886192784, + "grad_norm": 2.1277284622192383, + "learning_rate": 4.960738100782511e-05, + "loss": 5.593, + "step": 9499 + }, + { + "epoch": 0.056499191169473786, + "grad_norm": 2.0262863636016846, + "learning_rate": 4.960729854657377e-05, + "loss": 5.6396, + "step": 9500 + }, + { + "epoch": 0.05650513845275478, + "grad_norm": 1.7870309352874756, + "learning_rate": 4.9607216076732266e-05, + "loss": 5.6523, + "step": 9501 + }, + { + "epoch": 0.05651108573603578, + "grad_norm": 1.734782099723816, + "learning_rate": 4.9607133598300636e-05, + "loss": 5.5313, + "step": 9502 + }, + { + "epoch": 0.05651703301931678, + "grad_norm": 2.2485032081604004, + "learning_rate": 4.9607051111278914e-05, + "loss": 5.3814, + "step": 9503 + }, + { + "epoch": 0.056522980302597774, + "grad_norm": 1.5091774463653564, + "learning_rate": 4.9606968615667125e-05, + "loss": 5.5277, + "step": 9504 + }, + { + "epoch": 0.05652892758587877, + "grad_norm": 1.7117774486541748, + "learning_rate": 4.9606886111465303e-05, + "loss": 5.2649, + "step": 9505 + }, + { + "epoch": 0.05653487486915977, + "grad_norm": 1.7309353351593018, + "learning_rate": 4.960680359867346e-05, + "loss": 5.2276, + "step": 9506 + }, + { + "epoch": 0.056540822152440766, + "grad_norm": 1.7058963775634766, + "learning_rate": 4.960672107729164e-05, + "loss": 5.1848, + "step": 9507 + }, + { + "epoch": 0.05654676943572176, + "grad_norm": 1.7862296104431152, + "learning_rate": 4.960663854731987e-05, + "loss": 5.2424, + "step": 9508 + }, + { + "epoch": 0.05655271671900276, + "grad_norm": 1.8900794982910156, + "learning_rate": 4.960655600875818e-05, + "loss": 5.283, + "step": 9509 + }, + { + "epoch": 0.05655866400228376, + "grad_norm": 1.9991587400436401, + "learning_rate": 4.960647346160658e-05, + "loss": 5.3525, + "step": 9510 + }, + { + "epoch": 0.05656461128556475, + "grad_norm": 1.6889851093292236, + "learning_rate": 4.960639090586513e-05, + "loss": 5.0592, + "step": 9511 + }, + { + "epoch": 0.05657055856884575, + "grad_norm": 1.6314234733581543, + "learning_rate": 4.9606308341533844e-05, + "loss": 5.1733, + "step": 9512 + }, + { + "epoch": 0.05657650585212675, + "grad_norm": 1.7801847457885742, + "learning_rate": 4.960622576861275e-05, + "loss": 5.2358, + "step": 9513 + }, + { + "epoch": 0.056582453135407745, + "grad_norm": 1.6572017669677734, + "learning_rate": 4.9606143187101864e-05, + "loss": 5.2429, + "step": 9514 + }, + { + "epoch": 0.05658840041868874, + "grad_norm": 1.7574421167373657, + "learning_rate": 4.960606059700124e-05, + "loss": 5.0717, + "step": 9515 + }, + { + "epoch": 0.05659434770196974, + "grad_norm": 1.8162970542907715, + "learning_rate": 4.960597799831088e-05, + "loss": 5.1513, + "step": 9516 + }, + { + "epoch": 0.05660029498525074, + "grad_norm": 1.9231795072555542, + "learning_rate": 4.960589539103084e-05, + "loss": 5.1539, + "step": 9517 + }, + { + "epoch": 0.05660624226853173, + "grad_norm": 1.624566674232483, + "learning_rate": 4.9605812775161136e-05, + "loss": 5.0999, + "step": 9518 + }, + { + "epoch": 0.056612189551812735, + "grad_norm": 1.4293668270111084, + "learning_rate": 4.960573015070179e-05, + "loss": 5.2365, + "step": 9519 + }, + { + "epoch": 0.05661813683509373, + "grad_norm": 1.789515495300293, + "learning_rate": 4.960564751765284e-05, + "loss": 5.2233, + "step": 9520 + }, + { + "epoch": 0.056624084118374725, + "grad_norm": 1.7212306261062622, + "learning_rate": 4.960556487601432e-05, + "loss": 5.1902, + "step": 9521 + }, + { + "epoch": 0.05663003140165573, + "grad_norm": 1.7691519260406494, + "learning_rate": 4.960548222578625e-05, + "loss": 5.2136, + "step": 9522 + }, + { + "epoch": 0.05663597868493672, + "grad_norm": 1.5925794839859009, + "learning_rate": 4.960539956696866e-05, + "loss": 5.4808, + "step": 9523 + }, + { + "epoch": 0.05664192596821772, + "grad_norm": 1.7014095783233643, + "learning_rate": 4.960531689956157e-05, + "loss": 5.1934, + "step": 9524 + }, + { + "epoch": 0.05664787325149871, + "grad_norm": 1.3620802164077759, + "learning_rate": 4.960523422356502e-05, + "loss": 5.0169, + "step": 9525 + }, + { + "epoch": 0.056653820534779714, + "grad_norm": 1.4778205156326294, + "learning_rate": 4.960515153897904e-05, + "loss": 5.1535, + "step": 9526 + }, + { + "epoch": 0.05665976781806071, + "grad_norm": 1.6393300294876099, + "learning_rate": 4.960506884580366e-05, + "loss": 5.2494, + "step": 9527 + }, + { + "epoch": 0.056665715101341704, + "grad_norm": 1.6070711612701416, + "learning_rate": 4.96049861440389e-05, + "loss": 5.3117, + "step": 9528 + }, + { + "epoch": 0.056671662384622706, + "grad_norm": 1.6023461818695068, + "learning_rate": 4.96049034336848e-05, + "loss": 5.1554, + "step": 9529 + }, + { + "epoch": 0.0566776096679037, + "grad_norm": 1.6061514616012573, + "learning_rate": 4.9604820714741374e-05, + "loss": 5.4123, + "step": 9530 + }, + { + "epoch": 0.056683556951184697, + "grad_norm": 1.8043792247772217, + "learning_rate": 4.960473798720866e-05, + "loss": 5.2582, + "step": 9531 + }, + { + "epoch": 0.0566895042344657, + "grad_norm": 1.6002432107925415, + "learning_rate": 4.960465525108669e-05, + "loss": 5.211, + "step": 9532 + }, + { + "epoch": 0.056695451517746694, + "grad_norm": 1.851266622543335, + "learning_rate": 4.960457250637549e-05, + "loss": 5.0949, + "step": 9533 + }, + { + "epoch": 0.05670139880102769, + "grad_norm": 1.7806520462036133, + "learning_rate": 4.9604489753075085e-05, + "loss": 5.1178, + "step": 9534 + }, + { + "epoch": 0.05670734608430869, + "grad_norm": 1.9938620328903198, + "learning_rate": 4.9604406991185506e-05, + "loss": 5.098, + "step": 9535 + }, + { + "epoch": 0.056713293367589686, + "grad_norm": 1.7983622550964355, + "learning_rate": 4.960432422070679e-05, + "loss": 4.98, + "step": 9536 + }, + { + "epoch": 0.05671924065087068, + "grad_norm": 1.845821499824524, + "learning_rate": 4.960424144163895e-05, + "loss": 4.951, + "step": 9537 + }, + { + "epoch": 0.05672518793415168, + "grad_norm": 1.8922109603881836, + "learning_rate": 4.960415865398202e-05, + "loss": 5.0327, + "step": 9538 + }, + { + "epoch": 0.05673113521743268, + "grad_norm": 2.159832239151001, + "learning_rate": 4.960407585773604e-05, + "loss": 5.5287, + "step": 9539 + }, + { + "epoch": 0.05673708250071367, + "grad_norm": 1.9966739416122437, + "learning_rate": 4.960399305290103e-05, + "loss": 5.7114, + "step": 9540 + }, + { + "epoch": 0.05674302978399467, + "grad_norm": 1.8796072006225586, + "learning_rate": 4.9603910239477026e-05, + "loss": 5.4673, + "step": 9541 + }, + { + "epoch": 0.05674897706727567, + "grad_norm": 1.6589174270629883, + "learning_rate": 4.9603827417464045e-05, + "loss": 5.3755, + "step": 9542 + }, + { + "epoch": 0.056754924350556665, + "grad_norm": 1.975807547569275, + "learning_rate": 4.960374458686212e-05, + "loss": 5.0648, + "step": 9543 + }, + { + "epoch": 0.05676087163383766, + "grad_norm": 1.7437241077423096, + "learning_rate": 4.960366174767128e-05, + "loss": 5.2338, + "step": 9544 + }, + { + "epoch": 0.05676681891711866, + "grad_norm": 1.8508884906768799, + "learning_rate": 4.9603578899891564e-05, + "loss": 5.3432, + "step": 9545 + }, + { + "epoch": 0.05677276620039966, + "grad_norm": 2.2117562294006348, + "learning_rate": 4.960349604352299e-05, + "loss": 5.0623, + "step": 9546 + }, + { + "epoch": 0.05677871348368065, + "grad_norm": 1.7681034803390503, + "learning_rate": 4.9603413178565586e-05, + "loss": 5.1998, + "step": 9547 + }, + { + "epoch": 0.056784660766961655, + "grad_norm": 2.4477179050445557, + "learning_rate": 4.960333030501939e-05, + "loss": 5.3317, + "step": 9548 + }, + { + "epoch": 0.05679060805024265, + "grad_norm": 1.8297652006149292, + "learning_rate": 4.9603247422884426e-05, + "loss": 5.3608, + "step": 9549 + }, + { + "epoch": 0.056796555333523645, + "grad_norm": 1.8361153602600098, + "learning_rate": 4.9603164532160715e-05, + "loss": 5.3914, + "step": 9550 + }, + { + "epoch": 0.05680250261680465, + "grad_norm": 1.748226523399353, + "learning_rate": 4.96030816328483e-05, + "loss": 5.3436, + "step": 9551 + }, + { + "epoch": 0.05680844990008564, + "grad_norm": 1.744964599609375, + "learning_rate": 4.96029987249472e-05, + "loss": 5.4287, + "step": 9552 + }, + { + "epoch": 0.05681439718336664, + "grad_norm": 1.9512866735458374, + "learning_rate": 4.9602915808457454e-05, + "loss": 5.3601, + "step": 9553 + }, + { + "epoch": 0.05682034446664763, + "grad_norm": 1.5863629579544067, + "learning_rate": 4.9602832883379077e-05, + "loss": 5.5491, + "step": 9554 + }, + { + "epoch": 0.056826291749928634, + "grad_norm": 1.967677354812622, + "learning_rate": 4.96027499497121e-05, + "loss": 5.2402, + "step": 9555 + }, + { + "epoch": 0.05683223903320963, + "grad_norm": 2.277714252471924, + "learning_rate": 4.960266700745657e-05, + "loss": 5.5155, + "step": 9556 + }, + { + "epoch": 0.056838186316490624, + "grad_norm": 1.8371034860610962, + "learning_rate": 4.96025840566125e-05, + "loss": 5.2694, + "step": 9557 + }, + { + "epoch": 0.056844133599771626, + "grad_norm": 1.723008155822754, + "learning_rate": 4.9602501097179915e-05, + "loss": 5.4983, + "step": 9558 + }, + { + "epoch": 0.05685008088305262, + "grad_norm": 1.6955413818359375, + "learning_rate": 4.960241812915886e-05, + "loss": 5.6888, + "step": 9559 + }, + { + "epoch": 0.056856028166333616, + "grad_norm": 1.5899012088775635, + "learning_rate": 4.960233515254935e-05, + "loss": 5.4241, + "step": 9560 + }, + { + "epoch": 0.05686197544961462, + "grad_norm": 1.493268370628357, + "learning_rate": 4.9602252167351416e-05, + "loss": 5.1889, + "step": 9561 + }, + { + "epoch": 0.056867922732895614, + "grad_norm": 1.8037081956863403, + "learning_rate": 4.9602169173565094e-05, + "loss": 5.1785, + "step": 9562 + }, + { + "epoch": 0.05687387001617661, + "grad_norm": 1.6377664804458618, + "learning_rate": 4.960208617119041e-05, + "loss": 5.2593, + "step": 9563 + }, + { + "epoch": 0.05687981729945761, + "grad_norm": 2.077209234237671, + "learning_rate": 4.960200316022739e-05, + "loss": 5.1012, + "step": 9564 + }, + { + "epoch": 0.056885764582738606, + "grad_norm": 2.3584885597229004, + "learning_rate": 4.9601920140676064e-05, + "loss": 5.1141, + "step": 9565 + }, + { + "epoch": 0.0568917118660196, + "grad_norm": 1.990319013595581, + "learning_rate": 4.960183711253646e-05, + "loss": 4.9336, + "step": 9566 + }, + { + "epoch": 0.0568976591493006, + "grad_norm": 2.037742853164673, + "learning_rate": 4.960175407580861e-05, + "loss": 4.8494, + "step": 9567 + }, + { + "epoch": 0.0569036064325816, + "grad_norm": 1.8493839502334595, + "learning_rate": 4.9601671030492546e-05, + "loss": 5.337, + "step": 9568 + }, + { + "epoch": 0.05690955371586259, + "grad_norm": 1.9864604473114014, + "learning_rate": 4.960158797658829e-05, + "loss": 5.5684, + "step": 9569 + }, + { + "epoch": 0.05691550099914359, + "grad_norm": 1.9740629196166992, + "learning_rate": 4.960150491409587e-05, + "loss": 5.444, + "step": 9570 + }, + { + "epoch": 0.05692144828242459, + "grad_norm": 1.9429807662963867, + "learning_rate": 4.960142184301533e-05, + "loss": 5.277, + "step": 9571 + }, + { + "epoch": 0.056927395565705585, + "grad_norm": 1.8953512907028198, + "learning_rate": 4.960133876334668e-05, + "loss": 5.1694, + "step": 9572 + }, + { + "epoch": 0.05693334284898658, + "grad_norm": 1.7716888189315796, + "learning_rate": 4.960125567508996e-05, + "loss": 5.1383, + "step": 9573 + }, + { + "epoch": 0.05693929013226758, + "grad_norm": 1.8266246318817139, + "learning_rate": 4.9601172578245194e-05, + "loss": 5.4019, + "step": 9574 + }, + { + "epoch": 0.05694523741554858, + "grad_norm": 1.8929648399353027, + "learning_rate": 4.9601089472812414e-05, + "loss": 5.3948, + "step": 9575 + }, + { + "epoch": 0.05695118469882957, + "grad_norm": 1.9918208122253418, + "learning_rate": 4.960100635879165e-05, + "loss": 5.3195, + "step": 9576 + }, + { + "epoch": 0.056957131982110575, + "grad_norm": 1.4987989664077759, + "learning_rate": 4.960092323618292e-05, + "loss": 5.5292, + "step": 9577 + }, + { + "epoch": 0.05696307926539157, + "grad_norm": 1.683800220489502, + "learning_rate": 4.960084010498627e-05, + "loss": 5.5069, + "step": 9578 + }, + { + "epoch": 0.056969026548672565, + "grad_norm": 1.767561435699463, + "learning_rate": 4.960075696520171e-05, + "loss": 5.4134, + "step": 9579 + }, + { + "epoch": 0.05697497383195357, + "grad_norm": 2.077564239501953, + "learning_rate": 4.960067381682929e-05, + "loss": 5.3362, + "step": 9580 + }, + { + "epoch": 0.05698092111523456, + "grad_norm": 2.0167109966278076, + "learning_rate": 4.960059065986903e-05, + "loss": 5.4235, + "step": 9581 + }, + { + "epoch": 0.05698686839851556, + "grad_norm": 1.647669792175293, + "learning_rate": 4.9600507494320953e-05, + "loss": 5.3273, + "step": 9582 + }, + { + "epoch": 0.05699281568179655, + "grad_norm": 1.6051719188690186, + "learning_rate": 4.960042432018509e-05, + "loss": 5.2486, + "step": 9583 + }, + { + "epoch": 0.056998762965077554, + "grad_norm": 1.9283394813537598, + "learning_rate": 4.960034113746148e-05, + "loss": 5.233, + "step": 9584 + }, + { + "epoch": 0.05700471024835855, + "grad_norm": 1.6215802431106567, + "learning_rate": 4.960025794615014e-05, + "loss": 5.2322, + "step": 9585 + }, + { + "epoch": 0.057010657531639544, + "grad_norm": 1.8902918100357056, + "learning_rate": 4.960017474625111e-05, + "loss": 5.063, + "step": 9586 + }, + { + "epoch": 0.057016604814920546, + "grad_norm": 2.4694666862487793, + "learning_rate": 4.9600091537764415e-05, + "loss": 4.498, + "step": 9587 + }, + { + "epoch": 0.05702255209820154, + "grad_norm": 1.98915433883667, + "learning_rate": 4.960000832069007e-05, + "loss": 4.8781, + "step": 9588 + }, + { + "epoch": 0.057028499381482536, + "grad_norm": 2.0424818992614746, + "learning_rate": 4.9599925095028126e-05, + "loss": 5.5803, + "step": 9589 + }, + { + "epoch": 0.05703444666476354, + "grad_norm": 1.471275806427002, + "learning_rate": 4.95998418607786e-05, + "loss": 5.5604, + "step": 9590 + }, + { + "epoch": 0.057040393948044534, + "grad_norm": 1.6512761116027832, + "learning_rate": 4.959975861794152e-05, + "loss": 5.2147, + "step": 9591 + }, + { + "epoch": 0.05704634123132553, + "grad_norm": 1.6902865171432495, + "learning_rate": 4.959967536651693e-05, + "loss": 5.2654, + "step": 9592 + }, + { + "epoch": 0.05705228851460653, + "grad_norm": 1.5656665563583374, + "learning_rate": 4.9599592106504835e-05, + "loss": 5.1106, + "step": 9593 + }, + { + "epoch": 0.057058235797887526, + "grad_norm": 1.760901927947998, + "learning_rate": 4.959950883790528e-05, + "loss": 5.1833, + "step": 9594 + }, + { + "epoch": 0.05706418308116852, + "grad_norm": 1.5585325956344604, + "learning_rate": 4.9599425560718294e-05, + "loss": 5.202, + "step": 9595 + }, + { + "epoch": 0.05707013036444952, + "grad_norm": 1.5477479696273804, + "learning_rate": 4.959934227494389e-05, + "loss": 5.121, + "step": 9596 + }, + { + "epoch": 0.05707607764773052, + "grad_norm": 1.9299825429916382, + "learning_rate": 4.959925898058213e-05, + "loss": 5.0026, + "step": 9597 + }, + { + "epoch": 0.05708202493101151, + "grad_norm": 1.866237759590149, + "learning_rate": 4.959917567763301e-05, + "loss": 4.999, + "step": 9598 + }, + { + "epoch": 0.05708797221429251, + "grad_norm": 1.6670162677764893, + "learning_rate": 4.959909236609657e-05, + "loss": 5.4047, + "step": 9599 + }, + { + "epoch": 0.05709391949757351, + "grad_norm": 1.4666836261749268, + "learning_rate": 4.9599009045972844e-05, + "loss": 5.3598, + "step": 9600 + }, + { + "epoch": 0.057099866780854505, + "grad_norm": 1.928645372390747, + "learning_rate": 4.959892571726186e-05, + "loss": 5.7015, + "step": 9601 + }, + { + "epoch": 0.0571058140641355, + "grad_norm": 1.9761322736740112, + "learning_rate": 4.959884237996365e-05, + "loss": 4.8682, + "step": 9602 + }, + { + "epoch": 0.0571117613474165, + "grad_norm": 1.9823036193847656, + "learning_rate": 4.959875903407823e-05, + "loss": 4.8752, + "step": 9603 + }, + { + "epoch": 0.0571177086306975, + "grad_norm": 1.9242253303527832, + "learning_rate": 4.959867567960564e-05, + "loss": 4.9314, + "step": 9604 + }, + { + "epoch": 0.05712365591397849, + "grad_norm": 1.740980625152588, + "learning_rate": 4.9598592316545904e-05, + "loss": 4.9843, + "step": 9605 + }, + { + "epoch": 0.057129603197259494, + "grad_norm": 2.0768508911132812, + "learning_rate": 4.959850894489906e-05, + "loss": 4.8528, + "step": 9606 + }, + { + "epoch": 0.05713555048054049, + "grad_norm": 1.7417833805084229, + "learning_rate": 4.959842556466513e-05, + "loss": 5.1374, + "step": 9607 + }, + { + "epoch": 0.057141497763821485, + "grad_norm": 1.933691382408142, + "learning_rate": 4.959834217584414e-05, + "loss": 5.349, + "step": 9608 + }, + { + "epoch": 0.05714744504710249, + "grad_norm": 1.8035194873809814, + "learning_rate": 4.959825877843612e-05, + "loss": 5.0212, + "step": 9609 + }, + { + "epoch": 0.05715339233038348, + "grad_norm": 2.323709487915039, + "learning_rate": 4.9598175372441106e-05, + "loss": 5.5346, + "step": 9610 + }, + { + "epoch": 0.05715933961366448, + "grad_norm": 1.755983591079712, + "learning_rate": 4.959809195785912e-05, + "loss": 4.8425, + "step": 9611 + }, + { + "epoch": 0.05716528689694547, + "grad_norm": 1.6614432334899902, + "learning_rate": 4.95980085346902e-05, + "loss": 4.912, + "step": 9612 + }, + { + "epoch": 0.057171234180226474, + "grad_norm": 1.8319662809371948, + "learning_rate": 4.959792510293436e-05, + "loss": 5.0125, + "step": 9613 + }, + { + "epoch": 0.05717718146350747, + "grad_norm": 1.8528090715408325, + "learning_rate": 4.959784166259165e-05, + "loss": 4.898, + "step": 9614 + }, + { + "epoch": 0.057183128746788464, + "grad_norm": 2.163757562637329, + "learning_rate": 4.959775821366208e-05, + "loss": 5.2041, + "step": 9615 + }, + { + "epoch": 0.057189076030069466, + "grad_norm": 1.939430832862854, + "learning_rate": 4.959767475614569e-05, + "loss": 5.3337, + "step": 9616 + }, + { + "epoch": 0.05719502331335046, + "grad_norm": 1.7198511362075806, + "learning_rate": 4.959759129004251e-05, + "loss": 5.2682, + "step": 9617 + }, + { + "epoch": 0.057200970596631456, + "grad_norm": 1.7674570083618164, + "learning_rate": 4.959750781535255e-05, + "loss": 5.4188, + "step": 9618 + }, + { + "epoch": 0.05720691787991246, + "grad_norm": 1.7197433710098267, + "learning_rate": 4.959742433207587e-05, + "loss": 5.1725, + "step": 9619 + }, + { + "epoch": 0.05721286516319345, + "grad_norm": 1.6682969331741333, + "learning_rate": 4.959734084021248e-05, + "loss": 5.1349, + "step": 9620 + }, + { + "epoch": 0.05721881244647445, + "grad_norm": 1.3784568309783936, + "learning_rate": 4.959725733976241e-05, + "loss": 5.2408, + "step": 9621 + }, + { + "epoch": 0.05722475972975545, + "grad_norm": 1.690483808517456, + "learning_rate": 4.9597173830725686e-05, + "loss": 5.2616, + "step": 9622 + }, + { + "epoch": 0.057230707013036446, + "grad_norm": 1.5313903093338013, + "learning_rate": 4.959709031310235e-05, + "loss": 5.1481, + "step": 9623 + }, + { + "epoch": 0.05723665429631744, + "grad_norm": 1.6266121864318848, + "learning_rate": 4.959700678689242e-05, + "loss": 5.0192, + "step": 9624 + }, + { + "epoch": 0.05724260157959844, + "grad_norm": 2.3125410079956055, + "learning_rate": 4.959692325209593e-05, + "loss": 4.5513, + "step": 9625 + }, + { + "epoch": 0.05724854886287944, + "grad_norm": 1.6884924173355103, + "learning_rate": 4.9596839708712913e-05, + "loss": 5.1917, + "step": 9626 + }, + { + "epoch": 0.05725449614616043, + "grad_norm": 1.5797723531723022, + "learning_rate": 4.9596756156743385e-05, + "loss": 5.5674, + "step": 9627 + }, + { + "epoch": 0.05726044342944143, + "grad_norm": 1.6152269840240479, + "learning_rate": 4.959667259618739e-05, + "loss": 5.4566, + "step": 9628 + }, + { + "epoch": 0.05726639071272243, + "grad_norm": 1.611608624458313, + "learning_rate": 4.959658902704495e-05, + "loss": 5.3678, + "step": 9629 + }, + { + "epoch": 0.057272337996003425, + "grad_norm": 1.774327278137207, + "learning_rate": 4.9596505449316086e-05, + "loss": 5.2438, + "step": 9630 + }, + { + "epoch": 0.05727828527928442, + "grad_norm": 1.7961443662643433, + "learning_rate": 4.9596421863000856e-05, + "loss": 5.3061, + "step": 9631 + }, + { + "epoch": 0.05728423256256542, + "grad_norm": 1.709675669670105, + "learning_rate": 4.959633826809925e-05, + "loss": 5.0095, + "step": 9632 + }, + { + "epoch": 0.05729017984584642, + "grad_norm": 1.7140734195709229, + "learning_rate": 4.959625466461132e-05, + "loss": 5.313, + "step": 9633 + }, + { + "epoch": 0.05729612712912741, + "grad_norm": 1.8302016258239746, + "learning_rate": 4.95961710525371e-05, + "loss": 5.4008, + "step": 9634 + }, + { + "epoch": 0.057302074412408414, + "grad_norm": 1.8570395708084106, + "learning_rate": 4.95960874318766e-05, + "loss": 5.513, + "step": 9635 + }, + { + "epoch": 0.05730802169568941, + "grad_norm": 1.6907027959823608, + "learning_rate": 4.959600380262987e-05, + "loss": 5.1933, + "step": 9636 + }, + { + "epoch": 0.057313968978970405, + "grad_norm": 1.6505299806594849, + "learning_rate": 4.9595920164796926e-05, + "loss": 5.1537, + "step": 9637 + }, + { + "epoch": 0.05731991626225141, + "grad_norm": 1.5248258113861084, + "learning_rate": 4.95958365183778e-05, + "loss": 5.4232, + "step": 9638 + }, + { + "epoch": 0.0573258635455324, + "grad_norm": 1.4630048274993896, + "learning_rate": 4.9595752863372524e-05, + "loss": 5.565, + "step": 9639 + }, + { + "epoch": 0.0573318108288134, + "grad_norm": 1.5858573913574219, + "learning_rate": 4.959566919978112e-05, + "loss": 5.4364, + "step": 9640 + }, + { + "epoch": 0.05733775811209439, + "grad_norm": 1.7803694009780884, + "learning_rate": 4.9595585527603625e-05, + "loss": 5.1727, + "step": 9641 + }, + { + "epoch": 0.057343705395375394, + "grad_norm": 1.639163851737976, + "learning_rate": 4.959550184684007e-05, + "loss": 5.5538, + "step": 9642 + }, + { + "epoch": 0.05734965267865639, + "grad_norm": 1.5917890071868896, + "learning_rate": 4.959541815749046e-05, + "loss": 5.6788, + "step": 9643 + }, + { + "epoch": 0.057355599961937384, + "grad_norm": 1.5524990558624268, + "learning_rate": 4.959533445955487e-05, + "loss": 5.7832, + "step": 9644 + }, + { + "epoch": 0.057361547245218386, + "grad_norm": 1.7229019403457642, + "learning_rate": 4.959525075303328e-05, + "loss": 5.4417, + "step": 9645 + }, + { + "epoch": 0.05736749452849938, + "grad_norm": 1.5434623956680298, + "learning_rate": 4.959516703792575e-05, + "loss": 5.3629, + "step": 9646 + }, + { + "epoch": 0.057373441811780376, + "grad_norm": 1.4929866790771484, + "learning_rate": 4.9595083314232306e-05, + "loss": 5.8586, + "step": 9647 + }, + { + "epoch": 0.05737938909506138, + "grad_norm": 1.209796667098999, + "learning_rate": 4.959499958195297e-05, + "loss": 5.5001, + "step": 9648 + }, + { + "epoch": 0.05738533637834237, + "grad_norm": 2.703871488571167, + "learning_rate": 4.9594915841087775e-05, + "loss": 5.6564, + "step": 9649 + }, + { + "epoch": 0.05739128366162337, + "grad_norm": 1.9408828020095825, + "learning_rate": 4.959483209163674e-05, + "loss": 5.6683, + "step": 9650 + }, + { + "epoch": 0.05739723094490437, + "grad_norm": 1.8055803775787354, + "learning_rate": 4.9594748333599914e-05, + "loss": 5.3046, + "step": 9651 + }, + { + "epoch": 0.057403178228185366, + "grad_norm": 2.3453104496002197, + "learning_rate": 4.959466456697731e-05, + "loss": 6.1944, + "step": 9652 + }, + { + "epoch": 0.05740912551146636, + "grad_norm": 2.3799800872802734, + "learning_rate": 4.959458079176897e-05, + "loss": 5.6706, + "step": 9653 + }, + { + "epoch": 0.05741507279474736, + "grad_norm": 2.111069440841675, + "learning_rate": 4.959449700797491e-05, + "loss": 5.1808, + "step": 9654 + }, + { + "epoch": 0.05742102007802836, + "grad_norm": 2.237873077392578, + "learning_rate": 4.9594413215595164e-05, + "loss": 5.0609, + "step": 9655 + }, + { + "epoch": 0.05742696736130935, + "grad_norm": 1.956520438194275, + "learning_rate": 4.959432941462977e-05, + "loss": 5.1431, + "step": 9656 + }, + { + "epoch": 0.05743291464459035, + "grad_norm": 2.3761603832244873, + "learning_rate": 4.9594245605078735e-05, + "loss": 4.8722, + "step": 9657 + }, + { + "epoch": 0.05743886192787135, + "grad_norm": 1.820745825767517, + "learning_rate": 4.959416178694212e-05, + "loss": 5.0149, + "step": 9658 + }, + { + "epoch": 0.057444809211152345, + "grad_norm": 2.0804755687713623, + "learning_rate": 4.9594077960219924e-05, + "loss": 5.7698, + "step": 9659 + }, + { + "epoch": 0.05745075649443334, + "grad_norm": 1.9319117069244385, + "learning_rate": 4.9593994124912196e-05, + "loss": 5.3054, + "step": 9660 + }, + { + "epoch": 0.05745670377771434, + "grad_norm": 2.386338472366333, + "learning_rate": 4.959391028101896e-05, + "loss": 5.2093, + "step": 9661 + }, + { + "epoch": 0.05746265106099534, + "grad_norm": 1.852386474609375, + "learning_rate": 4.9593826428540244e-05, + "loss": 5.1943, + "step": 9662 + }, + { + "epoch": 0.05746859834427633, + "grad_norm": 1.9619694948196411, + "learning_rate": 4.959374256747607e-05, + "loss": 4.8275, + "step": 9663 + }, + { + "epoch": 0.057474545627557334, + "grad_norm": 2.4797024726867676, + "learning_rate": 4.9593658697826485e-05, + "loss": 5.5257, + "step": 9664 + }, + { + "epoch": 0.05748049291083833, + "grad_norm": 2.1713874340057373, + "learning_rate": 4.959357481959149e-05, + "loss": 5.4486, + "step": 9665 + }, + { + "epoch": 0.057486440194119325, + "grad_norm": 1.9605398178100586, + "learning_rate": 4.9593490932771145e-05, + "loss": 5.1512, + "step": 9666 + }, + { + "epoch": 0.05749238747740033, + "grad_norm": 1.9853549003601074, + "learning_rate": 4.959340703736547e-05, + "loss": 5.665, + "step": 9667 + }, + { + "epoch": 0.05749833476068132, + "grad_norm": 1.984279990196228, + "learning_rate": 4.9593323133374494e-05, + "loss": 5.7797, + "step": 9668 + }, + { + "epoch": 0.05750428204396232, + "grad_norm": 1.8343236446380615, + "learning_rate": 4.9593239220798225e-05, + "loss": 5.0261, + "step": 9669 + }, + { + "epoch": 0.05751022932724331, + "grad_norm": 1.8675687313079834, + "learning_rate": 4.959315529963673e-05, + "loss": 4.8754, + "step": 9670 + }, + { + "epoch": 0.057516176610524314, + "grad_norm": 1.9129834175109863, + "learning_rate": 4.959307136989e-05, + "loss": 5.1056, + "step": 9671 + }, + { + "epoch": 0.05752212389380531, + "grad_norm": 3.142893075942993, + "learning_rate": 4.95929874315581e-05, + "loss": 5.6029, + "step": 9672 + }, + { + "epoch": 0.057528071177086304, + "grad_norm": 1.80843985080719, + "learning_rate": 4.9592903484641026e-05, + "loss": 5.57, + "step": 9673 + }, + { + "epoch": 0.057534018460367306, + "grad_norm": 1.9195841550827026, + "learning_rate": 4.9592819529138835e-05, + "loss": 5.6964, + "step": 9674 + }, + { + "epoch": 0.0575399657436483, + "grad_norm": 2.026477813720703, + "learning_rate": 4.959273556505154e-05, + "loss": 5.8544, + "step": 9675 + }, + { + "epoch": 0.057545913026929296, + "grad_norm": 2.111274003982544, + "learning_rate": 4.959265159237918e-05, + "loss": 5.8014, + "step": 9676 + }, + { + "epoch": 0.0575518603102103, + "grad_norm": 1.9789505004882812, + "learning_rate": 4.9592567611121776e-05, + "loss": 5.7646, + "step": 9677 + }, + { + "epoch": 0.05755780759349129, + "grad_norm": 1.8776015043258667, + "learning_rate": 4.9592483621279365e-05, + "loss": 6.1603, + "step": 9678 + }, + { + "epoch": 0.05756375487677229, + "grad_norm": 2.135849714279175, + "learning_rate": 4.9592399622851956e-05, + "loss": 5.6372, + "step": 9679 + }, + { + "epoch": 0.05756970216005329, + "grad_norm": 2.3335585594177246, + "learning_rate": 4.959231561583961e-05, + "loss": 5.5515, + "step": 9680 + }, + { + "epoch": 0.057575649443334286, + "grad_norm": 1.9315869808197021, + "learning_rate": 4.9592231600242337e-05, + "loss": 5.9287, + "step": 9681 + }, + { + "epoch": 0.05758159672661528, + "grad_norm": 2.4559311866760254, + "learning_rate": 4.959214757606017e-05, + "loss": 5.6079, + "step": 9682 + }, + { + "epoch": 0.05758754400989628, + "grad_norm": 2.6558609008789062, + "learning_rate": 4.959206354329314e-05, + "loss": 5.5728, + "step": 9683 + }, + { + "epoch": 0.05759349129317728, + "grad_norm": 2.2376396656036377, + "learning_rate": 4.9591979501941274e-05, + "loss": 5.5318, + "step": 9684 + }, + { + "epoch": 0.05759943857645827, + "grad_norm": 1.8506240844726562, + "learning_rate": 4.95918954520046e-05, + "loss": 5.7957, + "step": 9685 + }, + { + "epoch": 0.05760538585973927, + "grad_norm": 2.2428138256073, + "learning_rate": 4.9591811393483144e-05, + "loss": 5.7223, + "step": 9686 + }, + { + "epoch": 0.05761133314302027, + "grad_norm": 2.5734875202178955, + "learning_rate": 4.9591727326376955e-05, + "loss": 5.3401, + "step": 9687 + }, + { + "epoch": 0.057617280426301265, + "grad_norm": 2.567263126373291, + "learning_rate": 4.959164325068604e-05, + "loss": 5.4853, + "step": 9688 + }, + { + "epoch": 0.05762322770958226, + "grad_norm": 2.4430556297302246, + "learning_rate": 4.959155916641043e-05, + "loss": 5.9845, + "step": 9689 + }, + { + "epoch": 0.05762917499286326, + "grad_norm": 2.039846181869507, + "learning_rate": 4.959147507355017e-05, + "loss": 6.0689, + "step": 9690 + }, + { + "epoch": 0.05763512227614426, + "grad_norm": 2.207920551300049, + "learning_rate": 4.959139097210528e-05, + "loss": 5.6658, + "step": 9691 + }, + { + "epoch": 0.05764106955942525, + "grad_norm": 1.7421616315841675, + "learning_rate": 4.959130686207578e-05, + "loss": 6.0915, + "step": 9692 + }, + { + "epoch": 0.057647016842706254, + "grad_norm": 1.7738968133926392, + "learning_rate": 4.9591222743461716e-05, + "loss": 6.2092, + "step": 9693 + }, + { + "epoch": 0.05765296412598725, + "grad_norm": 1.8665943145751953, + "learning_rate": 4.959113861626311e-05, + "loss": 6.0922, + "step": 9694 + }, + { + "epoch": 0.057658911409268244, + "grad_norm": 2.0272347927093506, + "learning_rate": 4.959105448047999e-05, + "loss": 5.8291, + "step": 9695 + }, + { + "epoch": 0.057664858692549247, + "grad_norm": 2.8527796268463135, + "learning_rate": 4.9590970336112395e-05, + "loss": 5.428, + "step": 9696 + }, + { + "epoch": 0.05767080597583024, + "grad_norm": 1.8518950939178467, + "learning_rate": 4.959088618316033e-05, + "loss": 5.4199, + "step": 9697 + }, + { + "epoch": 0.05767675325911124, + "grad_norm": 2.38712739944458, + "learning_rate": 4.959080202162386e-05, + "loss": 5.1627, + "step": 9698 + }, + { + "epoch": 0.05768270054239223, + "grad_norm": 1.8407059907913208, + "learning_rate": 4.959071785150298e-05, + "loss": 5.1827, + "step": 9699 + }, + { + "epoch": 0.057688647825673234, + "grad_norm": 2.431151866912842, + "learning_rate": 4.9590633672797744e-05, + "loss": 6.1722, + "step": 9700 + }, + { + "epoch": 0.05769459510895423, + "grad_norm": 2.498046398162842, + "learning_rate": 4.9590549485508165e-05, + "loss": 6.2321, + "step": 9701 + }, + { + "epoch": 0.057700542392235224, + "grad_norm": 1.8793575763702393, + "learning_rate": 4.959046528963428e-05, + "loss": 5.4019, + "step": 9702 + }, + { + "epoch": 0.057706489675516226, + "grad_norm": 2.137622117996216, + "learning_rate": 4.9590381085176115e-05, + "loss": 5.9118, + "step": 9703 + }, + { + "epoch": 0.05771243695879722, + "grad_norm": 1.9514268636703491, + "learning_rate": 4.959029687213371e-05, + "loss": 5.6651, + "step": 9704 + }, + { + "epoch": 0.057718384242078216, + "grad_norm": 2.3678367137908936, + "learning_rate": 4.9590212650507085e-05, + "loss": 5.2054, + "step": 9705 + }, + { + "epoch": 0.05772433152535922, + "grad_norm": 2.8808276653289795, + "learning_rate": 4.9590128420296266e-05, + "loss": 5.3066, + "step": 9706 + }, + { + "epoch": 0.05773027880864021, + "grad_norm": 2.2405474185943604, + "learning_rate": 4.9590044181501297e-05, + "loss": 5.2904, + "step": 9707 + }, + { + "epoch": 0.05773622609192121, + "grad_norm": 2.3762283325195312, + "learning_rate": 4.958995993412219e-05, + "loss": 5.5847, + "step": 9708 + }, + { + "epoch": 0.05774217337520221, + "grad_norm": 2.5258681774139404, + "learning_rate": 4.958987567815898e-05, + "loss": 5.4852, + "step": 9709 + }, + { + "epoch": 0.057748120658483205, + "grad_norm": 2.31478214263916, + "learning_rate": 4.9589791413611704e-05, + "loss": 5.5658, + "step": 9710 + }, + { + "epoch": 0.0577540679417642, + "grad_norm": 1.735771894454956, + "learning_rate": 4.958970714048038e-05, + "loss": 6.0311, + "step": 9711 + }, + { + "epoch": 0.0577600152250452, + "grad_norm": 2.2843849658966064, + "learning_rate": 4.958962285876505e-05, + "loss": 5.9535, + "step": 9712 + }, + { + "epoch": 0.0577659625083262, + "grad_norm": 2.3449392318725586, + "learning_rate": 4.958953856846573e-05, + "loss": 5.9835, + "step": 9713 + }, + { + "epoch": 0.05777190979160719, + "grad_norm": 2.319952964782715, + "learning_rate": 4.9589454269582456e-05, + "loss": 5.5318, + "step": 9714 + }, + { + "epoch": 0.05777785707488819, + "grad_norm": 2.6801493167877197, + "learning_rate": 4.958936996211526e-05, + "loss": 4.8672, + "step": 9715 + }, + { + "epoch": 0.05778380435816919, + "grad_norm": 2.622528553009033, + "learning_rate": 4.958928564606418e-05, + "loss": 6.0755, + "step": 9716 + }, + { + "epoch": 0.057789751641450185, + "grad_norm": 1.973480224609375, + "learning_rate": 4.9589201321429216e-05, + "loss": 5.8197, + "step": 9717 + }, + { + "epoch": 0.05779569892473118, + "grad_norm": 2.060497760772705, + "learning_rate": 4.958911698821043e-05, + "loss": 5.2838, + "step": 9718 + }, + { + "epoch": 0.05780164620801218, + "grad_norm": 2.068103551864624, + "learning_rate": 4.958903264640783e-05, + "loss": 5.4917, + "step": 9719 + }, + { + "epoch": 0.05780759349129318, + "grad_norm": 2.5899293422698975, + "learning_rate": 4.958894829602145e-05, + "loss": 5.1312, + "step": 9720 + }, + { + "epoch": 0.05781354077457417, + "grad_norm": 3.2153897285461426, + "learning_rate": 4.958886393705132e-05, + "loss": 4.7502, + "step": 9721 + }, + { + "epoch": 0.057819488057855174, + "grad_norm": 2.805802345275879, + "learning_rate": 4.9588779569497484e-05, + "loss": 4.6876, + "step": 9722 + }, + { + "epoch": 0.05782543534113617, + "grad_norm": 2.3670101165771484, + "learning_rate": 4.958869519335995e-05, + "loss": 4.6025, + "step": 9723 + }, + { + "epoch": 0.057831382624417164, + "grad_norm": 1.992903709411621, + "learning_rate": 4.9588610808638755e-05, + "loss": 5.3602, + "step": 9724 + }, + { + "epoch": 0.057837329907698166, + "grad_norm": 2.249572277069092, + "learning_rate": 4.958852641533394e-05, + "loss": 4.9574, + "step": 9725 + }, + { + "epoch": 0.05784327719097916, + "grad_norm": 2.500433921813965, + "learning_rate": 4.958844201344552e-05, + "loss": 5.3656, + "step": 9726 + }, + { + "epoch": 0.05784922447426016, + "grad_norm": 2.0277605056762695, + "learning_rate": 4.9588357602973526e-05, + "loss": 5.6467, + "step": 9727 + }, + { + "epoch": 0.05785517175754116, + "grad_norm": 2.1196112632751465, + "learning_rate": 4.958827318391799e-05, + "loss": 5.6257, + "step": 9728 + }, + { + "epoch": 0.057861119040822154, + "grad_norm": 3.160593271255493, + "learning_rate": 4.9588188756278945e-05, + "loss": 4.9618, + "step": 9729 + }, + { + "epoch": 0.05786706632410315, + "grad_norm": 1.90407395362854, + "learning_rate": 4.958810432005642e-05, + "loss": 5.4551, + "step": 9730 + }, + { + "epoch": 0.057873013607384144, + "grad_norm": 2.0096004009246826, + "learning_rate": 4.958801987525043e-05, + "loss": 5.6562, + "step": 9731 + }, + { + "epoch": 0.057878960890665146, + "grad_norm": 2.617847442626953, + "learning_rate": 4.958793542186103e-05, + "loss": 5.747, + "step": 9732 + }, + { + "epoch": 0.05788490817394614, + "grad_norm": 2.3982057571411133, + "learning_rate": 4.9587850959888226e-05, + "loss": 5.6146, + "step": 9733 + }, + { + "epoch": 0.057890855457227136, + "grad_norm": 2.0222113132476807, + "learning_rate": 4.9587766489332065e-05, + "loss": 6.0204, + "step": 9734 + }, + { + "epoch": 0.05789680274050814, + "grad_norm": 2.1110177040100098, + "learning_rate": 4.958768201019257e-05, + "loss": 5.2957, + "step": 9735 + }, + { + "epoch": 0.05790275002378913, + "grad_norm": 1.8278865814208984, + "learning_rate": 4.958759752246977e-05, + "loss": 5.9902, + "step": 9736 + }, + { + "epoch": 0.05790869730707013, + "grad_norm": 2.2461514472961426, + "learning_rate": 4.958751302616368e-05, + "loss": 5.8572, + "step": 9737 + }, + { + "epoch": 0.05791464459035113, + "grad_norm": 1.7453250885009766, + "learning_rate": 4.958742852127435e-05, + "loss": 5.6658, + "step": 9738 + }, + { + "epoch": 0.057920591873632125, + "grad_norm": 2.480726718902588, + "learning_rate": 4.95873440078018e-05, + "loss": 5.4231, + "step": 9739 + }, + { + "epoch": 0.05792653915691312, + "grad_norm": 2.2310776710510254, + "learning_rate": 4.958725948574607e-05, + "loss": 5.4768, + "step": 9740 + }, + { + "epoch": 0.05793248644019412, + "grad_norm": 1.9454891681671143, + "learning_rate": 4.958717495510718e-05, + "loss": 5.4503, + "step": 9741 + }, + { + "epoch": 0.05793843372347512, + "grad_norm": 2.196054458618164, + "learning_rate": 4.958709041588516e-05, + "loss": 5.1987, + "step": 9742 + }, + { + "epoch": 0.05794438100675611, + "grad_norm": 2.385000228881836, + "learning_rate": 4.958700586808004e-05, + "loss": 5.8413, + "step": 9743 + }, + { + "epoch": 0.05795032829003711, + "grad_norm": 2.0967705249786377, + "learning_rate": 4.958692131169185e-05, + "loss": 5.8531, + "step": 9744 + }, + { + "epoch": 0.05795627557331811, + "grad_norm": 2.186253309249878, + "learning_rate": 4.958683674672062e-05, + "loss": 5.8241, + "step": 9745 + }, + { + "epoch": 0.057962222856599105, + "grad_norm": 1.8932995796203613, + "learning_rate": 4.958675217316638e-05, + "loss": 5.8724, + "step": 9746 + }, + { + "epoch": 0.0579681701398801, + "grad_norm": 1.9706943035125732, + "learning_rate": 4.958666759102916e-05, + "loss": 5.6565, + "step": 9747 + }, + { + "epoch": 0.0579741174231611, + "grad_norm": 1.7686703205108643, + "learning_rate": 4.958658300030898e-05, + "loss": 5.6299, + "step": 9748 + }, + { + "epoch": 0.0579800647064421, + "grad_norm": 2.309403419494629, + "learning_rate": 4.958649840100589e-05, + "loss": 4.6907, + "step": 9749 + }, + { + "epoch": 0.05798601198972309, + "grad_norm": 2.139760971069336, + "learning_rate": 4.95864137931199e-05, + "loss": 4.7311, + "step": 9750 + }, + { + "epoch": 0.057991959273004094, + "grad_norm": 1.960402011871338, + "learning_rate": 4.958632917665105e-05, + "loss": 5.598, + "step": 9751 + }, + { + "epoch": 0.05799790655628509, + "grad_norm": 1.721853256225586, + "learning_rate": 4.958624455159936e-05, + "loss": 6.0519, + "step": 9752 + }, + { + "epoch": 0.058003853839566084, + "grad_norm": 1.8527748584747314, + "learning_rate": 4.958615991796487e-05, + "loss": 5.3347, + "step": 9753 + }, + { + "epoch": 0.058009801122847086, + "grad_norm": 2.070084810256958, + "learning_rate": 4.958607527574761e-05, + "loss": 4.6653, + "step": 9754 + }, + { + "epoch": 0.05801574840612808, + "grad_norm": 2.143115997314453, + "learning_rate": 4.9585990624947605e-05, + "loss": 4.6522, + "step": 9755 + }, + { + "epoch": 0.05802169568940908, + "grad_norm": 2.2870991230010986, + "learning_rate": 4.9585905965564884e-05, + "loss": 4.7037, + "step": 9756 + }, + { + "epoch": 0.05802764297269008, + "grad_norm": 2.0633544921875, + "learning_rate": 4.958582129759947e-05, + "loss": 4.689, + "step": 9757 + }, + { + "epoch": 0.058033590255971074, + "grad_norm": 1.8845857381820679, + "learning_rate": 4.95857366210514e-05, + "loss": 4.8077, + "step": 9758 + }, + { + "epoch": 0.05803953753925207, + "grad_norm": 1.7319310903549194, + "learning_rate": 4.9585651935920715e-05, + "loss": 5.3528, + "step": 9759 + }, + { + "epoch": 0.058045484822533064, + "grad_norm": 2.2369909286499023, + "learning_rate": 4.958556724220742e-05, + "loss": 4.6549, + "step": 9760 + }, + { + "epoch": 0.058051432105814066, + "grad_norm": 2.076901912689209, + "learning_rate": 4.9585482539911566e-05, + "loss": 4.4642, + "step": 9761 + }, + { + "epoch": 0.05805737938909506, + "grad_norm": 2.0487091541290283, + "learning_rate": 4.958539782903318e-05, + "loss": 4.6575, + "step": 9762 + }, + { + "epoch": 0.058063326672376056, + "grad_norm": 2.2116169929504395, + "learning_rate": 4.9585313109572274e-05, + "loss": 4.4866, + "step": 9763 + }, + { + "epoch": 0.05806927395565706, + "grad_norm": 1.9818168878555298, + "learning_rate": 4.958522838152889e-05, + "loss": 4.7502, + "step": 9764 + }, + { + "epoch": 0.05807522123893805, + "grad_norm": 2.1484010219573975, + "learning_rate": 4.958514364490306e-05, + "loss": 5.7809, + "step": 9765 + }, + { + "epoch": 0.05808116852221905, + "grad_norm": 2.4087398052215576, + "learning_rate": 4.958505889969481e-05, + "loss": 5.5236, + "step": 9766 + }, + { + "epoch": 0.05808711580550005, + "grad_norm": 2.000459909439087, + "learning_rate": 4.9584974145904165e-05, + "loss": 4.7356, + "step": 9767 + }, + { + "epoch": 0.058093063088781045, + "grad_norm": 2.3958399295806885, + "learning_rate": 4.958488938353116e-05, + "loss": 4.3695, + "step": 9768 + }, + { + "epoch": 0.05809901037206204, + "grad_norm": 2.039053440093994, + "learning_rate": 4.958480461257584e-05, + "loss": 4.6128, + "step": 9769 + }, + { + "epoch": 0.05810495765534304, + "grad_norm": 1.7663822174072266, + "learning_rate": 4.95847198330382e-05, + "loss": 4.8533, + "step": 9770 + }, + { + "epoch": 0.05811090493862404, + "grad_norm": 2.594289779663086, + "learning_rate": 4.9584635044918295e-05, + "loss": 5.3048, + "step": 9771 + }, + { + "epoch": 0.05811685222190503, + "grad_norm": 2.712372303009033, + "learning_rate": 4.958455024821615e-05, + "loss": 5.4435, + "step": 9772 + }, + { + "epoch": 0.05812279950518603, + "grad_norm": 2.4295241832733154, + "learning_rate": 4.9584465442931794e-05, + "loss": 5.2665, + "step": 9773 + }, + { + "epoch": 0.05812874678846703, + "grad_norm": 2.5820906162261963, + "learning_rate": 4.9584380629065245e-05, + "loss": 5.6227, + "step": 9774 + }, + { + "epoch": 0.058134694071748025, + "grad_norm": 2.140291213989258, + "learning_rate": 4.958429580661655e-05, + "loss": 5.1792, + "step": 9775 + }, + { + "epoch": 0.05814064135502902, + "grad_norm": 2.111551523208618, + "learning_rate": 4.9584210975585734e-05, + "loss": 5.7262, + "step": 9776 + }, + { + "epoch": 0.05814658863831002, + "grad_norm": 2.5887086391448975, + "learning_rate": 4.958412613597282e-05, + "loss": 5.1613, + "step": 9777 + }, + { + "epoch": 0.05815253592159102, + "grad_norm": 1.9678863286972046, + "learning_rate": 4.9584041287777835e-05, + "loss": 5.7693, + "step": 9778 + }, + { + "epoch": 0.05815848320487201, + "grad_norm": 2.000265121459961, + "learning_rate": 4.958395643100083e-05, + "loss": 5.654, + "step": 9779 + }, + { + "epoch": 0.058164430488153014, + "grad_norm": 1.8926239013671875, + "learning_rate": 4.958387156564181e-05, + "loss": 5.3004, + "step": 9780 + }, + { + "epoch": 0.05817037777143401, + "grad_norm": 2.3557002544403076, + "learning_rate": 4.958378669170082e-05, + "loss": 5.5437, + "step": 9781 + }, + { + "epoch": 0.058176325054715004, + "grad_norm": 1.9434150457382202, + "learning_rate": 4.958370180917787e-05, + "loss": 5.8442, + "step": 9782 + }, + { + "epoch": 0.058182272337996006, + "grad_norm": 1.875900387763977, + "learning_rate": 4.9583616918073026e-05, + "loss": 5.9312, + "step": 9783 + }, + { + "epoch": 0.058188219621277, + "grad_norm": 1.8945306539535522, + "learning_rate": 4.958353201838628e-05, + "loss": 5.7166, + "step": 9784 + }, + { + "epoch": 0.058194166904557997, + "grad_norm": 1.7081416845321655, + "learning_rate": 4.9583447110117684e-05, + "loss": 6.0803, + "step": 9785 + }, + { + "epoch": 0.058200114187839, + "grad_norm": 1.6520098447799683, + "learning_rate": 4.958336219326725e-05, + "loss": 6.0181, + "step": 9786 + }, + { + "epoch": 0.058206061471119994, + "grad_norm": 1.90665602684021, + "learning_rate": 4.9583277267835024e-05, + "loss": 5.586, + "step": 9787 + }, + { + "epoch": 0.05821200875440099, + "grad_norm": 1.8179740905761719, + "learning_rate": 4.958319233382104e-05, + "loss": 5.8637, + "step": 9788 + }, + { + "epoch": 0.058217956037681984, + "grad_norm": 1.8228380680084229, + "learning_rate": 4.95831073912253e-05, + "loss": 5.7406, + "step": 9789 + }, + { + "epoch": 0.058223903320962986, + "grad_norm": 1.691999912261963, + "learning_rate": 4.958302244004786e-05, + "loss": 5.8021, + "step": 9790 + }, + { + "epoch": 0.05822985060424398, + "grad_norm": 1.8590795993804932, + "learning_rate": 4.958293748028875e-05, + "loss": 5.5897, + "step": 9791 + }, + { + "epoch": 0.058235797887524976, + "grad_norm": 1.5923960208892822, + "learning_rate": 4.958285251194797e-05, + "loss": 5.7424, + "step": 9792 + }, + { + "epoch": 0.05824174517080598, + "grad_norm": 1.6928486824035645, + "learning_rate": 4.958276753502559e-05, + "loss": 5.905, + "step": 9793 + }, + { + "epoch": 0.05824769245408697, + "grad_norm": 2.120725393295288, + "learning_rate": 4.958268254952161e-05, + "loss": 5.9974, + "step": 9794 + }, + { + "epoch": 0.05825363973736797, + "grad_norm": 1.850441813468933, + "learning_rate": 4.9582597555436075e-05, + "loss": 5.7171, + "step": 9795 + }, + { + "epoch": 0.05825958702064897, + "grad_norm": 2.196037530899048, + "learning_rate": 4.9582512552769e-05, + "loss": 6.1243, + "step": 9796 + }, + { + "epoch": 0.058265534303929965, + "grad_norm": 1.9170193672180176, + "learning_rate": 4.9582427541520423e-05, + "loss": 5.8087, + "step": 9797 + }, + { + "epoch": 0.05827148158721096, + "grad_norm": 1.974478006362915, + "learning_rate": 4.958234252169039e-05, + "loss": 5.794, + "step": 9798 + }, + { + "epoch": 0.05827742887049196, + "grad_norm": 1.824965476989746, + "learning_rate": 4.9582257493278904e-05, + "loss": 5.6904, + "step": 9799 + }, + { + "epoch": 0.05828337615377296, + "grad_norm": 1.828037142753601, + "learning_rate": 4.9582172456286e-05, + "loss": 5.6793, + "step": 9800 + }, + { + "epoch": 0.05828932343705395, + "grad_norm": 1.8949617147445679, + "learning_rate": 4.9582087410711726e-05, + "loss": 5.6685, + "step": 9801 + }, + { + "epoch": 0.05829527072033495, + "grad_norm": 1.8183050155639648, + "learning_rate": 4.958200235655609e-05, + "loss": 5.7754, + "step": 9802 + }, + { + "epoch": 0.05830121800361595, + "grad_norm": 1.6816062927246094, + "learning_rate": 4.9581917293819135e-05, + "loss": 5.6931, + "step": 9803 + }, + { + "epoch": 0.058307165286896945, + "grad_norm": 1.875659465789795, + "learning_rate": 4.958183222250089e-05, + "loss": 5.7568, + "step": 9804 + }, + { + "epoch": 0.05831311257017794, + "grad_norm": 2.162404775619507, + "learning_rate": 4.958174714260137e-05, + "loss": 5.7969, + "step": 9805 + }, + { + "epoch": 0.05831905985345894, + "grad_norm": 2.2122790813446045, + "learning_rate": 4.958166205412064e-05, + "loss": 5.7301, + "step": 9806 + }, + { + "epoch": 0.05832500713673994, + "grad_norm": 1.8822424411773682, + "learning_rate": 4.9581576957058686e-05, + "loss": 5.7034, + "step": 9807 + }, + { + "epoch": 0.05833095442002093, + "grad_norm": 1.8780319690704346, + "learning_rate": 4.958149185141556e-05, + "loss": 5.6573, + "step": 9808 + }, + { + "epoch": 0.058336901703301934, + "grad_norm": 1.9177708625793457, + "learning_rate": 4.958140673719129e-05, + "loss": 5.6619, + "step": 9809 + }, + { + "epoch": 0.05834284898658293, + "grad_norm": 1.8662844896316528, + "learning_rate": 4.95813216143859e-05, + "loss": 5.5857, + "step": 9810 + }, + { + "epoch": 0.058348796269863924, + "grad_norm": 2.1798834800720215, + "learning_rate": 4.958123648299944e-05, + "loss": 5.5811, + "step": 9811 + }, + { + "epoch": 0.058354743553144926, + "grad_norm": 2.1575138568878174, + "learning_rate": 4.958115134303191e-05, + "loss": 5.6761, + "step": 9812 + }, + { + "epoch": 0.05836069083642592, + "grad_norm": 2.055314302444458, + "learning_rate": 4.958106619448336e-05, + "loss": 5.721, + "step": 9813 + }, + { + "epoch": 0.058366638119706916, + "grad_norm": 1.8962149620056152, + "learning_rate": 4.958098103735381e-05, + "loss": 5.6132, + "step": 9814 + }, + { + "epoch": 0.05837258540298792, + "grad_norm": 1.7715760469436646, + "learning_rate": 4.95808958716433e-05, + "loss": 5.6461, + "step": 9815 + }, + { + "epoch": 0.058378532686268914, + "grad_norm": 1.9166070222854614, + "learning_rate": 4.958081069735184e-05, + "loss": 5.5628, + "step": 9816 + }, + { + "epoch": 0.05838447996954991, + "grad_norm": 1.8872902393341064, + "learning_rate": 4.9580725514479484e-05, + "loss": 5.6476, + "step": 9817 + }, + { + "epoch": 0.058390427252830904, + "grad_norm": 1.8257521390914917, + "learning_rate": 4.9580640323026254e-05, + "loss": 5.6175, + "step": 9818 + }, + { + "epoch": 0.058396374536111906, + "grad_norm": 1.919291377067566, + "learning_rate": 4.958055512299217e-05, + "loss": 5.5954, + "step": 9819 + }, + { + "epoch": 0.0584023218193929, + "grad_norm": 1.8318076133728027, + "learning_rate": 4.958046991437726e-05, + "loss": 5.6255, + "step": 9820 + }, + { + "epoch": 0.058408269102673896, + "grad_norm": 1.9153858423233032, + "learning_rate": 4.958038469718158e-05, + "loss": 5.6787, + "step": 9821 + }, + { + "epoch": 0.0584142163859549, + "grad_norm": 1.967021107673645, + "learning_rate": 4.958029947140513e-05, + "loss": 5.6714, + "step": 9822 + }, + { + "epoch": 0.05842016366923589, + "grad_norm": 1.654997706413269, + "learning_rate": 4.958021423704795e-05, + "loss": 5.4809, + "step": 9823 + }, + { + "epoch": 0.05842611095251689, + "grad_norm": 1.8183335065841675, + "learning_rate": 4.9580128994110074e-05, + "loss": 5.5223, + "step": 9824 + }, + { + "epoch": 0.05843205823579789, + "grad_norm": 1.7665660381317139, + "learning_rate": 4.958004374259153e-05, + "loss": 5.5639, + "step": 9825 + }, + { + "epoch": 0.058438005519078885, + "grad_norm": 1.8233551979064941, + "learning_rate": 4.957995848249235e-05, + "loss": 5.6358, + "step": 9826 + }, + { + "epoch": 0.05844395280235988, + "grad_norm": 1.721301555633545, + "learning_rate": 4.957987321381256e-05, + "loss": 5.4989, + "step": 9827 + }, + { + "epoch": 0.05844990008564088, + "grad_norm": 1.6921659708023071, + "learning_rate": 4.957978793655218e-05, + "loss": 5.448, + "step": 9828 + }, + { + "epoch": 0.05845584736892188, + "grad_norm": 1.810354232788086, + "learning_rate": 4.957970265071126e-05, + "loss": 5.4501, + "step": 9829 + }, + { + "epoch": 0.05846179465220287, + "grad_norm": 1.7205116748809814, + "learning_rate": 4.957961735628982e-05, + "loss": 5.5222, + "step": 9830 + }, + { + "epoch": 0.05846774193548387, + "grad_norm": 1.9636965990066528, + "learning_rate": 4.957953205328788e-05, + "loss": 5.5894, + "step": 9831 + }, + { + "epoch": 0.05847368921876487, + "grad_norm": 1.9312820434570312, + "learning_rate": 4.9579446741705485e-05, + "loss": 5.6543, + "step": 9832 + }, + { + "epoch": 0.058479636502045865, + "grad_norm": 1.870448112487793, + "learning_rate": 4.9579361421542665e-05, + "loss": 5.6707, + "step": 9833 + }, + { + "epoch": 0.05848558378532686, + "grad_norm": 1.5943735837936401, + "learning_rate": 4.9579276092799435e-05, + "loss": 5.5184, + "step": 9834 + }, + { + "epoch": 0.05849153106860786, + "grad_norm": 1.6929852962493896, + "learning_rate": 4.957919075547584e-05, + "loss": 5.5188, + "step": 9835 + }, + { + "epoch": 0.05849747835188886, + "grad_norm": 2.0268075466156006, + "learning_rate": 4.95791054095719e-05, + "loss": 5.4909, + "step": 9836 + }, + { + "epoch": 0.05850342563516985, + "grad_norm": 2.047982931137085, + "learning_rate": 4.957902005508765e-05, + "loss": 5.6459, + "step": 9837 + }, + { + "epoch": 0.058509372918450854, + "grad_norm": 1.7938467264175415, + "learning_rate": 4.957893469202311e-05, + "loss": 5.4805, + "step": 9838 + }, + { + "epoch": 0.05851532020173185, + "grad_norm": 1.803093433380127, + "learning_rate": 4.957884932037833e-05, + "loss": 5.4092, + "step": 9839 + }, + { + "epoch": 0.058521267485012844, + "grad_norm": 1.8001232147216797, + "learning_rate": 4.957876394015333e-05, + "loss": 5.9168, + "step": 9840 + }, + { + "epoch": 0.058527214768293846, + "grad_norm": 1.9442622661590576, + "learning_rate": 4.9578678551348125e-05, + "loss": 6.0317, + "step": 9841 + }, + { + "epoch": 0.05853316205157484, + "grad_norm": 2.013845205307007, + "learning_rate": 4.957859315396276e-05, + "loss": 5.6855, + "step": 9842 + }, + { + "epoch": 0.058539109334855836, + "grad_norm": 2.7557523250579834, + "learning_rate": 4.9578507747997264e-05, + "loss": 5.3782, + "step": 9843 + }, + { + "epoch": 0.05854505661813684, + "grad_norm": 1.9822032451629639, + "learning_rate": 4.957842233345167e-05, + "loss": 6.22, + "step": 9844 + }, + { + "epoch": 0.058551003901417834, + "grad_norm": 1.7408699989318848, + "learning_rate": 4.9578336910326e-05, + "loss": 5.2347, + "step": 9845 + }, + { + "epoch": 0.05855695118469883, + "grad_norm": 3.2186660766601562, + "learning_rate": 4.957825147862028e-05, + "loss": 5.3282, + "step": 9846 + }, + { + "epoch": 0.058562898467979824, + "grad_norm": 3.3589892387390137, + "learning_rate": 4.957816603833455e-05, + "loss": 5.5689, + "step": 9847 + }, + { + "epoch": 0.058568845751260826, + "grad_norm": 3.4228861331939697, + "learning_rate": 4.957808058946883e-05, + "loss": 5.5797, + "step": 9848 + }, + { + "epoch": 0.05857479303454182, + "grad_norm": 2.420506238937378, + "learning_rate": 4.957799513202317e-05, + "loss": 5.735, + "step": 9849 + }, + { + "epoch": 0.058580740317822816, + "grad_norm": 1.8269212245941162, + "learning_rate": 4.957790966599758e-05, + "loss": 5.7571, + "step": 9850 + }, + { + "epoch": 0.05858668760110382, + "grad_norm": 2.011110305786133, + "learning_rate": 4.957782419139209e-05, + "loss": 5.9786, + "step": 9851 + }, + { + "epoch": 0.05859263488438481, + "grad_norm": 2.3139355182647705, + "learning_rate": 4.957773870820674e-05, + "loss": 5.8356, + "step": 9852 + }, + { + "epoch": 0.05859858216766581, + "grad_norm": 2.3406572341918945, + "learning_rate": 4.957765321644155e-05, + "loss": 5.8426, + "step": 9853 + }, + { + "epoch": 0.05860452945094681, + "grad_norm": 2.1194591522216797, + "learning_rate": 4.957756771609657e-05, + "loss": 5.6152, + "step": 9854 + }, + { + "epoch": 0.058610476734227805, + "grad_norm": 1.9966599941253662, + "learning_rate": 4.95774822071718e-05, + "loss": 5.8189, + "step": 9855 + }, + { + "epoch": 0.0586164240175088, + "grad_norm": 1.8953092098236084, + "learning_rate": 4.95773966896673e-05, + "loss": 5.8185, + "step": 9856 + }, + { + "epoch": 0.0586223713007898, + "grad_norm": 1.9035093784332275, + "learning_rate": 4.957731116358307e-05, + "loss": 5.6554, + "step": 9857 + }, + { + "epoch": 0.0586283185840708, + "grad_norm": 3.507546901702881, + "learning_rate": 4.9577225628919157e-05, + "loss": 5.8906, + "step": 9858 + }, + { + "epoch": 0.05863426586735179, + "grad_norm": 2.1840403079986572, + "learning_rate": 4.9577140085675586e-05, + "loss": 5.6084, + "step": 9859 + }, + { + "epoch": 0.05864021315063279, + "grad_norm": 2.008424758911133, + "learning_rate": 4.95770545338524e-05, + "loss": 5.8435, + "step": 9860 + }, + { + "epoch": 0.05864616043391379, + "grad_norm": 1.9004656076431274, + "learning_rate": 4.957696897344961e-05, + "loss": 5.5906, + "step": 9861 + }, + { + "epoch": 0.058652107717194785, + "grad_norm": 1.8043147325515747, + "learning_rate": 4.9576883404467255e-05, + "loss": 5.6057, + "step": 9862 + }, + { + "epoch": 0.05865805500047578, + "grad_norm": 1.6765285730361938, + "learning_rate": 4.957679782690537e-05, + "loss": 5.7246, + "step": 9863 + }, + { + "epoch": 0.05866400228375678, + "grad_norm": 2.0207018852233887, + "learning_rate": 4.9576712240763974e-05, + "loss": 5.8459, + "step": 9864 + }, + { + "epoch": 0.05866994956703778, + "grad_norm": 1.975874423980713, + "learning_rate": 4.95766266460431e-05, + "loss": 5.7313, + "step": 9865 + }, + { + "epoch": 0.05867589685031877, + "grad_norm": 2.085277557373047, + "learning_rate": 4.957654104274279e-05, + "loss": 5.1359, + "step": 9866 + }, + { + "epoch": 0.058681844133599774, + "grad_norm": 2.039437770843506, + "learning_rate": 4.957645543086305e-05, + "loss": 5.5673, + "step": 9867 + }, + { + "epoch": 0.05868779141688077, + "grad_norm": 2.0692098140716553, + "learning_rate": 4.9576369810403926e-05, + "loss": 5.6326, + "step": 9868 + }, + { + "epoch": 0.058693738700161764, + "grad_norm": 2.3873767852783203, + "learning_rate": 4.957628418136545e-05, + "loss": 5.5133, + "step": 9869 + }, + { + "epoch": 0.058699685983442766, + "grad_norm": 2.9347658157348633, + "learning_rate": 4.957619854374764e-05, + "loss": 5.5444, + "step": 9870 + }, + { + "epoch": 0.05870563326672376, + "grad_norm": 2.955348014831543, + "learning_rate": 4.957611289755054e-05, + "loss": 5.4883, + "step": 9871 + }, + { + "epoch": 0.058711580550004756, + "grad_norm": 2.147033214569092, + "learning_rate": 4.957602724277417e-05, + "loss": 5.4554, + "step": 9872 + }, + { + "epoch": 0.05871752783328576, + "grad_norm": 2.1422510147094727, + "learning_rate": 4.957594157941856e-05, + "loss": 5.56, + "step": 9873 + }, + { + "epoch": 0.05872347511656675, + "grad_norm": 2.018935203552246, + "learning_rate": 4.957585590748375e-05, + "loss": 5.5176, + "step": 9874 + }, + { + "epoch": 0.05872942239984775, + "grad_norm": 3.0146446228027344, + "learning_rate": 4.957577022696976e-05, + "loss": 5.2623, + "step": 9875 + }, + { + "epoch": 0.058735369683128744, + "grad_norm": 2.923011064529419, + "learning_rate": 4.957568453787662e-05, + "loss": 5.1828, + "step": 9876 + }, + { + "epoch": 0.058741316966409746, + "grad_norm": 2.7203526496887207, + "learning_rate": 4.9575598840204366e-05, + "loss": 5.1565, + "step": 9877 + }, + { + "epoch": 0.05874726424969074, + "grad_norm": 2.056260108947754, + "learning_rate": 4.9575513133953025e-05, + "loss": 5.1345, + "step": 9878 + }, + { + "epoch": 0.058753211532971736, + "grad_norm": 2.3120932579040527, + "learning_rate": 4.9575427419122616e-05, + "loss": 5.1792, + "step": 9879 + }, + { + "epoch": 0.05875915881625274, + "grad_norm": 2.1298701763153076, + "learning_rate": 4.9575341695713186e-05, + "loss": 5.1447, + "step": 9880 + }, + { + "epoch": 0.05876510609953373, + "grad_norm": 2.393869638442993, + "learning_rate": 4.9575255963724756e-05, + "loss": 5.2938, + "step": 9881 + }, + { + "epoch": 0.05877105338281473, + "grad_norm": 2.324061155319214, + "learning_rate": 4.9575170223157366e-05, + "loss": 5.1488, + "step": 9882 + }, + { + "epoch": 0.05877700066609573, + "grad_norm": 2.1416141986846924, + "learning_rate": 4.957508447401103e-05, + "loss": 5.0551, + "step": 9883 + }, + { + "epoch": 0.058782947949376725, + "grad_norm": 2.127350091934204, + "learning_rate": 4.9574998716285795e-05, + "loss": 5.03, + "step": 9884 + }, + { + "epoch": 0.05878889523265772, + "grad_norm": 2.317267417907715, + "learning_rate": 4.957491294998167e-05, + "loss": 5.049, + "step": 9885 + }, + { + "epoch": 0.05879484251593872, + "grad_norm": 2.3667004108428955, + "learning_rate": 4.9574827175098704e-05, + "loss": 5.009, + "step": 9886 + }, + { + "epoch": 0.05880078979921972, + "grad_norm": 2.4034934043884277, + "learning_rate": 4.9574741391636915e-05, + "loss": 4.9419, + "step": 9887 + }, + { + "epoch": 0.05880673708250071, + "grad_norm": 2.3792901039123535, + "learning_rate": 4.957465559959634e-05, + "loss": 4.8517, + "step": 9888 + }, + { + "epoch": 0.05881268436578171, + "grad_norm": 2.139249086380005, + "learning_rate": 4.957456979897701e-05, + "loss": 5.0767, + "step": 9889 + }, + { + "epoch": 0.05881863164906271, + "grad_norm": 2.5370614528656006, + "learning_rate": 4.957448398977894e-05, + "loss": 5.0243, + "step": 9890 + }, + { + "epoch": 0.058824578932343705, + "grad_norm": 2.0474746227264404, + "learning_rate": 4.957439817200218e-05, + "loss": 4.988, + "step": 9891 + }, + { + "epoch": 0.0588305262156247, + "grad_norm": 2.1323394775390625, + "learning_rate": 4.957431234564675e-05, + "loss": 5.7499, + "step": 9892 + }, + { + "epoch": 0.0588364734989057, + "grad_norm": 2.135988473892212, + "learning_rate": 4.957422651071269e-05, + "loss": 6.0197, + "step": 9893 + }, + { + "epoch": 0.0588424207821867, + "grad_norm": 2.4457356929779053, + "learning_rate": 4.957414066720001e-05, + "loss": 5.4461, + "step": 9894 + }, + { + "epoch": 0.05884836806546769, + "grad_norm": 2.3973019123077393, + "learning_rate": 4.957405481510876e-05, + "loss": 5.0372, + "step": 9895 + }, + { + "epoch": 0.058854315348748694, + "grad_norm": 2.5532052516937256, + "learning_rate": 4.957396895443896e-05, + "loss": 5.1462, + "step": 9896 + }, + { + "epoch": 0.05886026263202969, + "grad_norm": 2.3662166595458984, + "learning_rate": 4.9573883085190633e-05, + "loss": 5.1894, + "step": 9897 + }, + { + "epoch": 0.058866209915310684, + "grad_norm": 2.153883695602417, + "learning_rate": 4.9573797207363825e-05, + "loss": 5.6859, + "step": 9898 + }, + { + "epoch": 0.058872157198591686, + "grad_norm": 1.9541380405426025, + "learning_rate": 4.957371132095856e-05, + "loss": 5.5487, + "step": 9899 + }, + { + "epoch": 0.05887810448187268, + "grad_norm": 1.7920335531234741, + "learning_rate": 4.957362542597486e-05, + "loss": 5.4021, + "step": 9900 + }, + { + "epoch": 0.058884051765153676, + "grad_norm": 2.351090431213379, + "learning_rate": 4.9573539522412756e-05, + "loss": 4.9377, + "step": 9901 + }, + { + "epoch": 0.05888999904843468, + "grad_norm": 2.4780900478363037, + "learning_rate": 4.95734536102723e-05, + "loss": 5.04, + "step": 9902 + }, + { + "epoch": 0.05889594633171567, + "grad_norm": 1.7211192846298218, + "learning_rate": 4.957336768955349e-05, + "loss": 5.2959, + "step": 9903 + }, + { + "epoch": 0.05890189361499667, + "grad_norm": 1.9051212072372437, + "learning_rate": 4.957328176025638e-05, + "loss": 5.5587, + "step": 9904 + }, + { + "epoch": 0.058907840898277664, + "grad_norm": 2.009725332260132, + "learning_rate": 4.957319582238099e-05, + "loss": 5.5366, + "step": 9905 + }, + { + "epoch": 0.058913788181558666, + "grad_norm": 1.835423231124878, + "learning_rate": 4.957310987592735e-05, + "loss": 5.2522, + "step": 9906 + }, + { + "epoch": 0.05891973546483966, + "grad_norm": 1.6150819063186646, + "learning_rate": 4.957302392089549e-05, + "loss": 5.3935, + "step": 9907 + }, + { + "epoch": 0.058925682748120656, + "grad_norm": 1.825942873954773, + "learning_rate": 4.9572937957285435e-05, + "loss": 5.5435, + "step": 9908 + }, + { + "epoch": 0.05893163003140166, + "grad_norm": 1.5434985160827637, + "learning_rate": 4.957285198509724e-05, + "loss": 5.2508, + "step": 9909 + }, + { + "epoch": 0.05893757731468265, + "grad_norm": 1.7675530910491943, + "learning_rate": 4.9572766004330894e-05, + "loss": 5.2811, + "step": 9910 + }, + { + "epoch": 0.05894352459796365, + "grad_norm": 1.5196996927261353, + "learning_rate": 4.957268001498646e-05, + "loss": 5.1829, + "step": 9911 + }, + { + "epoch": 0.05894947188124465, + "grad_norm": 1.5598126649856567, + "learning_rate": 4.9572594017063964e-05, + "loss": 5.2067, + "step": 9912 + }, + { + "epoch": 0.058955419164525645, + "grad_norm": 1.6600217819213867, + "learning_rate": 4.957250801056342e-05, + "loss": 5.1591, + "step": 9913 + }, + { + "epoch": 0.05896136644780664, + "grad_norm": 2.040682315826416, + "learning_rate": 4.957242199548487e-05, + "loss": 4.8792, + "step": 9914 + }, + { + "epoch": 0.05896731373108764, + "grad_norm": 2.0122241973876953, + "learning_rate": 4.9572335971828346e-05, + "loss": 5.9489, + "step": 9915 + }, + { + "epoch": 0.05897326101436864, + "grad_norm": 2.4522452354431152, + "learning_rate": 4.957224993959386e-05, + "loss": 5.943, + "step": 9916 + }, + { + "epoch": 0.05897920829764963, + "grad_norm": 1.9101065397262573, + "learning_rate": 4.957216389878147e-05, + "loss": 5.858, + "step": 9917 + }, + { + "epoch": 0.05898515558093063, + "grad_norm": 1.6488839387893677, + "learning_rate": 4.957207784939118e-05, + "loss": 5.4935, + "step": 9918 + }, + { + "epoch": 0.05899110286421163, + "grad_norm": 1.7620775699615479, + "learning_rate": 4.957199179142303e-05, + "loss": 5.6067, + "step": 9919 + }, + { + "epoch": 0.058997050147492625, + "grad_norm": 2.6018314361572266, + "learning_rate": 4.957190572487707e-05, + "loss": 5.5249, + "step": 9920 + }, + { + "epoch": 0.05900299743077362, + "grad_norm": 1.810274600982666, + "learning_rate": 4.957181964975329e-05, + "loss": 5.4063, + "step": 9921 + }, + { + "epoch": 0.05900894471405462, + "grad_norm": 1.7467454671859741, + "learning_rate": 4.957173356605176e-05, + "loss": 5.4476, + "step": 9922 + }, + { + "epoch": 0.05901489199733562, + "grad_norm": 1.9074509143829346, + "learning_rate": 4.9571647473772483e-05, + "loss": 5.8014, + "step": 9923 + }, + { + "epoch": 0.05902083928061661, + "grad_norm": 1.6376137733459473, + "learning_rate": 4.9571561372915496e-05, + "loss": 5.6813, + "step": 9924 + }, + { + "epoch": 0.059026786563897614, + "grad_norm": 1.9984129667282104, + "learning_rate": 4.957147526348083e-05, + "loss": 5.9534, + "step": 9925 + }, + { + "epoch": 0.05903273384717861, + "grad_norm": 2.38493013381958, + "learning_rate": 4.957138914546852e-05, + "loss": 5.6903, + "step": 9926 + }, + { + "epoch": 0.059038681130459604, + "grad_norm": 1.86250901222229, + "learning_rate": 4.957130301887859e-05, + "loss": 5.1777, + "step": 9927 + }, + { + "epoch": 0.059044628413740606, + "grad_norm": 1.6241644620895386, + "learning_rate": 4.957121688371107e-05, + "loss": 5.1693, + "step": 9928 + }, + { + "epoch": 0.0590505756970216, + "grad_norm": 1.5627753734588623, + "learning_rate": 4.9571130739965996e-05, + "loss": 5.0313, + "step": 9929 + }, + { + "epoch": 0.059056522980302596, + "grad_norm": 1.6763062477111816, + "learning_rate": 4.957104458764339e-05, + "loss": 4.9973, + "step": 9930 + }, + { + "epoch": 0.0590624702635836, + "grad_norm": 1.6215085983276367, + "learning_rate": 4.957095842674329e-05, + "loss": 5.2216, + "step": 9931 + }, + { + "epoch": 0.05906841754686459, + "grad_norm": 1.5599844455718994, + "learning_rate": 4.957087225726572e-05, + "loss": 5.4525, + "step": 9932 + }, + { + "epoch": 0.05907436483014559, + "grad_norm": 1.3916441202163696, + "learning_rate": 4.957078607921072e-05, + "loss": 5.4434, + "step": 9933 + }, + { + "epoch": 0.059080312113426584, + "grad_norm": 1.524478554725647, + "learning_rate": 4.9570699892578295e-05, + "loss": 5.3979, + "step": 9934 + }, + { + "epoch": 0.059086259396707586, + "grad_norm": 1.264108657836914, + "learning_rate": 4.9570613697368505e-05, + "loss": 5.2892, + "step": 9935 + }, + { + "epoch": 0.05909220667998858, + "grad_norm": 1.7481588125228882, + "learning_rate": 4.957052749358137e-05, + "loss": 4.8539, + "step": 9936 + }, + { + "epoch": 0.059098153963269576, + "grad_norm": 1.675515055656433, + "learning_rate": 4.957044128121692e-05, + "loss": 5.4645, + "step": 9937 + }, + { + "epoch": 0.05910410124655058, + "grad_norm": 1.6560577154159546, + "learning_rate": 4.957035506027517e-05, + "loss": 4.9354, + "step": 9938 + }, + { + "epoch": 0.05911004852983157, + "grad_norm": 1.5030722618103027, + "learning_rate": 4.9570268830756174e-05, + "loss": 5.206, + "step": 9939 + }, + { + "epoch": 0.05911599581311257, + "grad_norm": 1.65435791015625, + "learning_rate": 4.957018259265994e-05, + "loss": 5.2132, + "step": 9940 + }, + { + "epoch": 0.05912194309639357, + "grad_norm": 1.6701000928878784, + "learning_rate": 4.9570096345986515e-05, + "loss": 5.2313, + "step": 9941 + }, + { + "epoch": 0.059127890379674565, + "grad_norm": 1.412954330444336, + "learning_rate": 4.957001009073593e-05, + "loss": 5.2511, + "step": 9942 + }, + { + "epoch": 0.05913383766295556, + "grad_norm": 1.4719784259796143, + "learning_rate": 4.95699238269082e-05, + "loss": 5.3646, + "step": 9943 + }, + { + "epoch": 0.05913978494623656, + "grad_norm": 1.6969150304794312, + "learning_rate": 4.9569837554503365e-05, + "loss": 5.3001, + "step": 9944 + }, + { + "epoch": 0.05914573222951756, + "grad_norm": 1.8579715490341187, + "learning_rate": 4.9569751273521454e-05, + "loss": 5.0944, + "step": 9945 + }, + { + "epoch": 0.05915167951279855, + "grad_norm": 1.6907633543014526, + "learning_rate": 4.956966498396249e-05, + "loss": 5.1447, + "step": 9946 + }, + { + "epoch": 0.059157626796079554, + "grad_norm": 1.7581912279129028, + "learning_rate": 4.9569578685826525e-05, + "loss": 5.2065, + "step": 9947 + }, + { + "epoch": 0.05916357407936055, + "grad_norm": 1.4447051286697388, + "learning_rate": 4.9569492379113555e-05, + "loss": 5.081, + "step": 9948 + }, + { + "epoch": 0.059169521362641544, + "grad_norm": 1.731697916984558, + "learning_rate": 4.9569406063823644e-05, + "loss": 5.241, + "step": 9949 + }, + { + "epoch": 0.05917546864592254, + "grad_norm": 1.6483672857284546, + "learning_rate": 4.956931973995681e-05, + "loss": 5.306, + "step": 9950 + }, + { + "epoch": 0.05918141592920354, + "grad_norm": 2.2123141288757324, + "learning_rate": 4.956923340751306e-05, + "loss": 5.6134, + "step": 9951 + }, + { + "epoch": 0.05918736321248454, + "grad_norm": 1.8569937944412231, + "learning_rate": 4.956914706649246e-05, + "loss": 5.4819, + "step": 9952 + }, + { + "epoch": 0.05919331049576553, + "grad_norm": 1.8417435884475708, + "learning_rate": 4.956906071689502e-05, + "loss": 5.4116, + "step": 9953 + }, + { + "epoch": 0.059199257779046534, + "grad_norm": 1.7050427198410034, + "learning_rate": 4.956897435872078e-05, + "loss": 5.238, + "step": 9954 + }, + { + "epoch": 0.05920520506232753, + "grad_norm": 1.6636401414871216, + "learning_rate": 4.956888799196976e-05, + "loss": 5.0962, + "step": 9955 + }, + { + "epoch": 0.059211152345608524, + "grad_norm": 1.9194599390029907, + "learning_rate": 4.9568801616642e-05, + "loss": 5.2078, + "step": 9956 + }, + { + "epoch": 0.059217099628889526, + "grad_norm": 1.6154237985610962, + "learning_rate": 4.956871523273752e-05, + "loss": 5.3562, + "step": 9957 + }, + { + "epoch": 0.05922304691217052, + "grad_norm": 1.4500404596328735, + "learning_rate": 4.956862884025636e-05, + "loss": 5.2061, + "step": 9958 + }, + { + "epoch": 0.059228994195451516, + "grad_norm": 1.6681636571884155, + "learning_rate": 4.956854243919854e-05, + "loss": 5.3455, + "step": 9959 + }, + { + "epoch": 0.05923494147873252, + "grad_norm": 1.7175511121749878, + "learning_rate": 4.9568456029564104e-05, + "loss": 5.2967, + "step": 9960 + }, + { + "epoch": 0.05924088876201351, + "grad_norm": 1.5013905763626099, + "learning_rate": 4.956836961135306e-05, + "loss": 4.9836, + "step": 9961 + }, + { + "epoch": 0.05924683604529451, + "grad_norm": 1.6521363258361816, + "learning_rate": 4.956828318456546e-05, + "loss": 5.0295, + "step": 9962 + }, + { + "epoch": 0.0592527833285755, + "grad_norm": 1.5945814847946167, + "learning_rate": 4.9568196749201326e-05, + "loss": 4.9511, + "step": 9963 + }, + { + "epoch": 0.059258730611856505, + "grad_norm": 1.508301854133606, + "learning_rate": 4.95681103052607e-05, + "loss": 4.9469, + "step": 9964 + }, + { + "epoch": 0.0592646778951375, + "grad_norm": 1.5902310609817505, + "learning_rate": 4.956802385274358e-05, + "loss": 4.9761, + "step": 9965 + }, + { + "epoch": 0.059270625178418496, + "grad_norm": 1.739424467086792, + "learning_rate": 4.956793739165003e-05, + "loss": 5.2443, + "step": 9966 + }, + { + "epoch": 0.0592765724616995, + "grad_norm": 1.8317997455596924, + "learning_rate": 4.9567850921980056e-05, + "loss": 5.0046, + "step": 9967 + }, + { + "epoch": 0.05928251974498049, + "grad_norm": 1.8073506355285645, + "learning_rate": 4.956776444373371e-05, + "loss": 5.1779, + "step": 9968 + }, + { + "epoch": 0.05928846702826149, + "grad_norm": 1.8806017637252808, + "learning_rate": 4.956767795691101e-05, + "loss": 5.2956, + "step": 9969 + }, + { + "epoch": 0.05929441431154249, + "grad_norm": 1.8397493362426758, + "learning_rate": 4.956759146151198e-05, + "loss": 5.1775, + "step": 9970 + }, + { + "epoch": 0.059300361594823485, + "grad_norm": 2.001387119293213, + "learning_rate": 4.9567504957536656e-05, + "loss": 5.2149, + "step": 9971 + }, + { + "epoch": 0.05930630887810448, + "grad_norm": 2.011504650115967, + "learning_rate": 4.956741844498508e-05, + "loss": 5.2384, + "step": 9972 + }, + { + "epoch": 0.05931225616138548, + "grad_norm": 1.7936465740203857, + "learning_rate": 4.956733192385727e-05, + "loss": 5.2297, + "step": 9973 + }, + { + "epoch": 0.05931820344466648, + "grad_norm": 1.7336666584014893, + "learning_rate": 4.9567245394153255e-05, + "loss": 5.1637, + "step": 9974 + }, + { + "epoch": 0.05932415072794747, + "grad_norm": 1.7429137229919434, + "learning_rate": 4.956715885587307e-05, + "loss": 5.1315, + "step": 9975 + }, + { + "epoch": 0.059330098011228474, + "grad_norm": 1.6609208583831787, + "learning_rate": 4.956707230901674e-05, + "loss": 5.1554, + "step": 9976 + }, + { + "epoch": 0.05933604529450947, + "grad_norm": 1.630026936531067, + "learning_rate": 4.95669857535843e-05, + "loss": 5.1569, + "step": 9977 + }, + { + "epoch": 0.059341992577790464, + "grad_norm": 1.6968966722488403, + "learning_rate": 4.956689918957579e-05, + "loss": 5.06, + "step": 9978 + }, + { + "epoch": 0.05934793986107146, + "grad_norm": 1.6973050832748413, + "learning_rate": 4.9566812616991214e-05, + "loss": 5.2044, + "step": 9979 + }, + { + "epoch": 0.05935388714435246, + "grad_norm": 1.436073899269104, + "learning_rate": 4.9566726035830624e-05, + "loss": 5.2638, + "step": 9980 + }, + { + "epoch": 0.05935983442763346, + "grad_norm": 1.7667059898376465, + "learning_rate": 4.956663944609404e-05, + "loss": 5.0912, + "step": 9981 + }, + { + "epoch": 0.05936578171091445, + "grad_norm": 2.277327060699463, + "learning_rate": 4.9566552847781504e-05, + "loss": 5.6089, + "step": 9982 + }, + { + "epoch": 0.059371728994195454, + "grad_norm": 1.521134376525879, + "learning_rate": 4.956646624089304e-05, + "loss": 5.0213, + "step": 9983 + }, + { + "epoch": 0.05937767627747645, + "grad_norm": 1.556511402130127, + "learning_rate": 4.956637962542867e-05, + "loss": 5.1126, + "step": 9984 + }, + { + "epoch": 0.059383623560757444, + "grad_norm": 1.6691070795059204, + "learning_rate": 4.9566293001388423e-05, + "loss": 5.1351, + "step": 9985 + }, + { + "epoch": 0.059389570844038446, + "grad_norm": 1.5213310718536377, + "learning_rate": 4.956620636877235e-05, + "loss": 5.2402, + "step": 9986 + }, + { + "epoch": 0.05939551812731944, + "grad_norm": 1.5169057846069336, + "learning_rate": 4.956611972758046e-05, + "loss": 5.214, + "step": 9987 + }, + { + "epoch": 0.059401465410600436, + "grad_norm": 1.6076115369796753, + "learning_rate": 4.956603307781279e-05, + "loss": 5.1081, + "step": 9988 + }, + { + "epoch": 0.05940741269388144, + "grad_norm": 1.7340706586837769, + "learning_rate": 4.9565946419469376e-05, + "loss": 5.1582, + "step": 9989 + }, + { + "epoch": 0.05941335997716243, + "grad_norm": 1.5118008852005005, + "learning_rate": 4.956585975255025e-05, + "loss": 5.0515, + "step": 9990 + }, + { + "epoch": 0.05941930726044343, + "grad_norm": 1.8852020502090454, + "learning_rate": 4.956577307705543e-05, + "loss": 5.3811, + "step": 9991 + }, + { + "epoch": 0.05942525454372442, + "grad_norm": 1.7066764831542969, + "learning_rate": 4.9565686392984955e-05, + "loss": 5.4599, + "step": 9992 + }, + { + "epoch": 0.059431201827005425, + "grad_norm": 1.5517010688781738, + "learning_rate": 4.956559970033885e-05, + "loss": 5.0728, + "step": 9993 + }, + { + "epoch": 0.05943714911028642, + "grad_norm": 1.508901596069336, + "learning_rate": 4.956551299911715e-05, + "loss": 5.1857, + "step": 9994 + }, + { + "epoch": 0.059443096393567416, + "grad_norm": 1.8867852687835693, + "learning_rate": 4.9565426289319874e-05, + "loss": 5.2223, + "step": 9995 + }, + { + "epoch": 0.05944904367684842, + "grad_norm": 1.4767159223556519, + "learning_rate": 4.9565339570947076e-05, + "loss": 5.1404, + "step": 9996 + }, + { + "epoch": 0.05945499096012941, + "grad_norm": 1.6351869106292725, + "learning_rate": 4.956525284399876e-05, + "loss": 5.3235, + "step": 9997 + }, + { + "epoch": 0.05946093824341041, + "grad_norm": 1.543565273284912, + "learning_rate": 4.956516610847497e-05, + "loss": 5.3365, + "step": 9998 + }, + { + "epoch": 0.05946688552669141, + "grad_norm": 1.4907768964767456, + "learning_rate": 4.9565079364375746e-05, + "loss": 5.4215, + "step": 9999 + }, + { + "epoch": 0.059472832809972405, + "grad_norm": 1.5810034275054932, + "learning_rate": 4.956499261170109e-05, + "loss": 5.3899, + "step": 10000 + }, + { + "epoch": 0.0594787800932534, + "grad_norm": 1.6342787742614746, + "learning_rate": 4.956490585045106e-05, + "loss": 5.4278, + "step": 10001 + }, + { + "epoch": 0.0594847273765344, + "grad_norm": 1.5474039316177368, + "learning_rate": 4.956481908062567e-05, + "loss": 5.1232, + "step": 10002 + }, + { + "epoch": 0.0594906746598154, + "grad_norm": 1.5679951906204224, + "learning_rate": 4.956473230222496e-05, + "loss": 5.3245, + "step": 10003 + }, + { + "epoch": 0.05949662194309639, + "grad_norm": 1.4851021766662598, + "learning_rate": 4.9564645515248955e-05, + "loss": 5.1806, + "step": 10004 + }, + { + "epoch": 0.059502569226377394, + "grad_norm": 1.8518844842910767, + "learning_rate": 4.956455871969768e-05, + "loss": 5.2543, + "step": 10005 + }, + { + "epoch": 0.05950851650965839, + "grad_norm": 1.7865514755249023, + "learning_rate": 4.956447191557118e-05, + "loss": 5.405, + "step": 10006 + }, + { + "epoch": 0.059514463792939384, + "grad_norm": 1.9051682949066162, + "learning_rate": 4.956438510286946e-05, + "loss": 5.0509, + "step": 10007 + }, + { + "epoch": 0.05952041107622038, + "grad_norm": 1.5150926113128662, + "learning_rate": 4.956429828159258e-05, + "loss": 5.0065, + "step": 10008 + }, + { + "epoch": 0.05952635835950138, + "grad_norm": 1.6085938215255737, + "learning_rate": 4.956421145174056e-05, + "loss": 5.2295, + "step": 10009 + }, + { + "epoch": 0.05953230564278238, + "grad_norm": 1.6337605714797974, + "learning_rate": 4.9564124613313424e-05, + "loss": 5.1666, + "step": 10010 + }, + { + "epoch": 0.05953825292606337, + "grad_norm": 1.5093178749084473, + "learning_rate": 4.9564037766311205e-05, + "loss": 5.2268, + "step": 10011 + }, + { + "epoch": 0.059544200209344374, + "grad_norm": 1.5047305822372437, + "learning_rate": 4.9563950910733936e-05, + "loss": 5.1065, + "step": 10012 + }, + { + "epoch": 0.05955014749262537, + "grad_norm": 1.6275629997253418, + "learning_rate": 4.9563864046581645e-05, + "loss": 5.2366, + "step": 10013 + }, + { + "epoch": 0.059556094775906364, + "grad_norm": 1.535582184791565, + "learning_rate": 4.956377717385436e-05, + "loss": 5.1799, + "step": 10014 + }, + { + "epoch": 0.059562042059187366, + "grad_norm": 1.448477864265442, + "learning_rate": 4.956369029255211e-05, + "loss": 5.2207, + "step": 10015 + }, + { + "epoch": 0.05956798934246836, + "grad_norm": 1.5288492441177368, + "learning_rate": 4.956360340267494e-05, + "loss": 5.3646, + "step": 10016 + }, + { + "epoch": 0.059573936625749356, + "grad_norm": 1.5746785402297974, + "learning_rate": 4.956351650422287e-05, + "loss": 5.1941, + "step": 10017 + }, + { + "epoch": 0.05957988390903036, + "grad_norm": 1.7088212966918945, + "learning_rate": 4.956342959719592e-05, + "loss": 5.1667, + "step": 10018 + }, + { + "epoch": 0.05958583119231135, + "grad_norm": 1.7666717767715454, + "learning_rate": 4.956334268159414e-05, + "loss": 5.1808, + "step": 10019 + }, + { + "epoch": 0.05959177847559235, + "grad_norm": 1.6472598314285278, + "learning_rate": 4.956325575741755e-05, + "loss": 5.3369, + "step": 10020 + }, + { + "epoch": 0.05959772575887334, + "grad_norm": 1.7340562343597412, + "learning_rate": 4.9563168824666174e-05, + "loss": 5.5623, + "step": 10021 + }, + { + "epoch": 0.059603673042154345, + "grad_norm": 1.9677515029907227, + "learning_rate": 4.9563081883340054e-05, + "loss": 4.7612, + "step": 10022 + }, + { + "epoch": 0.05960962032543534, + "grad_norm": 1.4823256731033325, + "learning_rate": 4.9562994933439215e-05, + "loss": 5.4504, + "step": 10023 + }, + { + "epoch": 0.059615567608716336, + "grad_norm": 1.5346739292144775, + "learning_rate": 4.956290797496369e-05, + "loss": 5.5455, + "step": 10024 + }, + { + "epoch": 0.05962151489199734, + "grad_norm": 1.5420036315917969, + "learning_rate": 4.956282100791351e-05, + "loss": 5.1363, + "step": 10025 + }, + { + "epoch": 0.05962746217527833, + "grad_norm": 1.7927091121673584, + "learning_rate": 4.956273403228869e-05, + "loss": 5.0768, + "step": 10026 + }, + { + "epoch": 0.05963340945855933, + "grad_norm": 1.7139612436294556, + "learning_rate": 4.9562647048089287e-05, + "loss": 5.2046, + "step": 10027 + }, + { + "epoch": 0.05963935674184033, + "grad_norm": 1.627684473991394, + "learning_rate": 4.956256005531531e-05, + "loss": 5.3844, + "step": 10028 + }, + { + "epoch": 0.059645304025121325, + "grad_norm": 1.5006085634231567, + "learning_rate": 4.9562473053966805e-05, + "loss": 5.4948, + "step": 10029 + }, + { + "epoch": 0.05965125130840232, + "grad_norm": 1.5670723915100098, + "learning_rate": 4.956238604404378e-05, + "loss": 5.5465, + "step": 10030 + }, + { + "epoch": 0.05965719859168332, + "grad_norm": 1.5671201944351196, + "learning_rate": 4.95622990255463e-05, + "loss": 5.1969, + "step": 10031 + }, + { + "epoch": 0.05966314587496432, + "grad_norm": 2.1628634929656982, + "learning_rate": 4.956221199847436e-05, + "loss": 5.0244, + "step": 10032 + }, + { + "epoch": 0.05966909315824531, + "grad_norm": 1.5766685009002686, + "learning_rate": 4.956212496282801e-05, + "loss": 5.4698, + "step": 10033 + }, + { + "epoch": 0.059675040441526314, + "grad_norm": 1.625812292098999, + "learning_rate": 4.956203791860728e-05, + "loss": 5.3825, + "step": 10034 + }, + { + "epoch": 0.05968098772480731, + "grad_norm": 1.4307054281234741, + "learning_rate": 4.956195086581219e-05, + "loss": 5.3576, + "step": 10035 + }, + { + "epoch": 0.059686935008088304, + "grad_norm": 1.4459644556045532, + "learning_rate": 4.9561863804442785e-05, + "loss": 5.3478, + "step": 10036 + }, + { + "epoch": 0.0596928822913693, + "grad_norm": 1.8038474321365356, + "learning_rate": 4.9561776734499075e-05, + "loss": 5.4967, + "step": 10037 + }, + { + "epoch": 0.0596988295746503, + "grad_norm": 1.41011381149292, + "learning_rate": 4.9561689655981115e-05, + "loss": 5.4224, + "step": 10038 + }, + { + "epoch": 0.059704776857931297, + "grad_norm": 1.6678937673568726, + "learning_rate": 4.956160256888891e-05, + "loss": 5.27, + "step": 10039 + }, + { + "epoch": 0.05971072414121229, + "grad_norm": 1.794647455215454, + "learning_rate": 4.956151547322251e-05, + "loss": 5.2822, + "step": 10040 + }, + { + "epoch": 0.059716671424493294, + "grad_norm": 1.5010912418365479, + "learning_rate": 4.9561428368981944e-05, + "loss": 5.3778, + "step": 10041 + }, + { + "epoch": 0.05972261870777429, + "grad_norm": 1.785395860671997, + "learning_rate": 4.9561341256167234e-05, + "loss": 5.4213, + "step": 10042 + }, + { + "epoch": 0.059728565991055284, + "grad_norm": 1.889667272567749, + "learning_rate": 4.956125413477841e-05, + "loss": 5.2795, + "step": 10043 + }, + { + "epoch": 0.059734513274336286, + "grad_norm": 2.209780216217041, + "learning_rate": 4.95611670048155e-05, + "loss": 5.6823, + "step": 10044 + }, + { + "epoch": 0.05974046055761728, + "grad_norm": 1.979069471359253, + "learning_rate": 4.956107986627855e-05, + "loss": 5.3437, + "step": 10045 + }, + { + "epoch": 0.059746407840898276, + "grad_norm": 1.8391239643096924, + "learning_rate": 4.9560992719167584e-05, + "loss": 5.2246, + "step": 10046 + }, + { + "epoch": 0.05975235512417928, + "grad_norm": 2.0196359157562256, + "learning_rate": 4.956090556348262e-05, + "loss": 5.3549, + "step": 10047 + }, + { + "epoch": 0.05975830240746027, + "grad_norm": 1.7103056907653809, + "learning_rate": 4.95608183992237e-05, + "loss": 5.4016, + "step": 10048 + }, + { + "epoch": 0.05976424969074127, + "grad_norm": 1.543308138847351, + "learning_rate": 4.956073122639085e-05, + "loss": 5.2628, + "step": 10049 + }, + { + "epoch": 0.05977019697402226, + "grad_norm": 2.0719797611236572, + "learning_rate": 4.956064404498411e-05, + "loss": 5.3149, + "step": 10050 + }, + { + "epoch": 0.059776144257303265, + "grad_norm": 1.9024063348770142, + "learning_rate": 4.95605568550035e-05, + "loss": 5.2804, + "step": 10051 + }, + { + "epoch": 0.05978209154058426, + "grad_norm": 1.6171611547470093, + "learning_rate": 4.9560469656449046e-05, + "loss": 5.2558, + "step": 10052 + }, + { + "epoch": 0.059788038823865255, + "grad_norm": 1.5416970252990723, + "learning_rate": 4.9560382449320795e-05, + "loss": 5.3164, + "step": 10053 + }, + { + "epoch": 0.05979398610714626, + "grad_norm": 1.6956002712249756, + "learning_rate": 4.956029523361877e-05, + "loss": 5.2123, + "step": 10054 + }, + { + "epoch": 0.05979993339042725, + "grad_norm": 1.6414602994918823, + "learning_rate": 4.956020800934299e-05, + "loss": 5.3302, + "step": 10055 + }, + { + "epoch": 0.05980588067370825, + "grad_norm": 1.6868051290512085, + "learning_rate": 4.95601207764935e-05, + "loss": 5.2076, + "step": 10056 + }, + { + "epoch": 0.05981182795698925, + "grad_norm": 1.7299697399139404, + "learning_rate": 4.956003353507033e-05, + "loss": 5.3502, + "step": 10057 + }, + { + "epoch": 0.059817775240270245, + "grad_norm": 1.4923878908157349, + "learning_rate": 4.95599462850735e-05, + "loss": 5.3081, + "step": 10058 + }, + { + "epoch": 0.05982372252355124, + "grad_norm": 1.571413516998291, + "learning_rate": 4.9559859026503045e-05, + "loss": 5.1434, + "step": 10059 + }, + { + "epoch": 0.05982966980683224, + "grad_norm": 1.6265422105789185, + "learning_rate": 4.9559771759359e-05, + "loss": 5.2455, + "step": 10060 + }, + { + "epoch": 0.05983561709011324, + "grad_norm": 1.7889208793640137, + "learning_rate": 4.9559684483641395e-05, + "loss": 5.2429, + "step": 10061 + }, + { + "epoch": 0.05984156437339423, + "grad_norm": 1.5957598686218262, + "learning_rate": 4.955959719935025e-05, + "loss": 5.2299, + "step": 10062 + }, + { + "epoch": 0.059847511656675234, + "grad_norm": 1.6366177797317505, + "learning_rate": 4.955950990648561e-05, + "loss": 5.366, + "step": 10063 + }, + { + "epoch": 0.05985345893995623, + "grad_norm": 1.6712719202041626, + "learning_rate": 4.95594226050475e-05, + "loss": 5.3602, + "step": 10064 + }, + { + "epoch": 0.059859406223237224, + "grad_norm": 1.8273069858551025, + "learning_rate": 4.955933529503595e-05, + "loss": 5.3586, + "step": 10065 + }, + { + "epoch": 0.05986535350651822, + "grad_norm": 1.6638576984405518, + "learning_rate": 4.955924797645098e-05, + "loss": 5.2359, + "step": 10066 + }, + { + "epoch": 0.05987130078979922, + "grad_norm": 1.8127614259719849, + "learning_rate": 4.955916064929264e-05, + "loss": 5.3815, + "step": 10067 + }, + { + "epoch": 0.059877248073080216, + "grad_norm": 1.7204198837280273, + "learning_rate": 4.955907331356095e-05, + "loss": 5.5576, + "step": 10068 + }, + { + "epoch": 0.05988319535636121, + "grad_norm": 1.9153103828430176, + "learning_rate": 4.9558985969255936e-05, + "loss": 5.4363, + "step": 10069 + }, + { + "epoch": 0.059889142639642214, + "grad_norm": 1.6427290439605713, + "learning_rate": 4.9558898616377634e-05, + "loss": 5.4497, + "step": 10070 + }, + { + "epoch": 0.05989508992292321, + "grad_norm": 1.660217046737671, + "learning_rate": 4.955881125492608e-05, + "loss": 5.4988, + "step": 10071 + }, + { + "epoch": 0.059901037206204204, + "grad_norm": 1.7776225805282593, + "learning_rate": 4.955872388490129e-05, + "loss": 5.2714, + "step": 10072 + }, + { + "epoch": 0.059906984489485206, + "grad_norm": 1.5099388360977173, + "learning_rate": 4.9558636506303314e-05, + "loss": 5.4714, + "step": 10073 + }, + { + "epoch": 0.0599129317727662, + "grad_norm": 1.523537039756775, + "learning_rate": 4.955854911913217e-05, + "loss": 5.3528, + "step": 10074 + }, + { + "epoch": 0.059918879056047196, + "grad_norm": 1.3424321413040161, + "learning_rate": 4.9558461723387885e-05, + "loss": 5.3385, + "step": 10075 + }, + { + "epoch": 0.0599248263393282, + "grad_norm": 1.3843169212341309, + "learning_rate": 4.955837431907049e-05, + "loss": 5.383, + "step": 10076 + }, + { + "epoch": 0.05993077362260919, + "grad_norm": 1.4927351474761963, + "learning_rate": 4.955828690618003e-05, + "loss": 5.3536, + "step": 10077 + }, + { + "epoch": 0.05993672090589019, + "grad_norm": 1.5207486152648926, + "learning_rate": 4.955819948471653e-05, + "loss": 5.3557, + "step": 10078 + }, + { + "epoch": 0.05994266818917118, + "grad_norm": 1.5589584112167358, + "learning_rate": 4.9558112054680004e-05, + "loss": 5.3747, + "step": 10079 + }, + { + "epoch": 0.059948615472452185, + "grad_norm": 1.436951756477356, + "learning_rate": 4.9558024616070496e-05, + "loss": 5.2807, + "step": 10080 + }, + { + "epoch": 0.05995456275573318, + "grad_norm": 1.4345866441726685, + "learning_rate": 4.955793716888804e-05, + "loss": 5.4, + "step": 10081 + }, + { + "epoch": 0.059960510039014175, + "grad_norm": 1.2811249494552612, + "learning_rate": 4.955784971313267e-05, + "loss": 5.2531, + "step": 10082 + }, + { + "epoch": 0.05996645732229518, + "grad_norm": 1.5558568239212036, + "learning_rate": 4.955776224880439e-05, + "loss": 5.1136, + "step": 10083 + }, + { + "epoch": 0.05997240460557617, + "grad_norm": 1.3918567895889282, + "learning_rate": 4.955767477590326e-05, + "loss": 5.2748, + "step": 10084 + }, + { + "epoch": 0.05997835188885717, + "grad_norm": 1.3277204036712646, + "learning_rate": 4.9557587294429295e-05, + "loss": 5.2346, + "step": 10085 + }, + { + "epoch": 0.05998429917213817, + "grad_norm": 1.2874623537063599, + "learning_rate": 4.955749980438253e-05, + "loss": 5.2616, + "step": 10086 + }, + { + "epoch": 0.059990246455419165, + "grad_norm": 1.7534229755401611, + "learning_rate": 4.9557412305763004e-05, + "loss": 5.2509, + "step": 10087 + }, + { + "epoch": 0.05999619373870016, + "grad_norm": 1.4560372829437256, + "learning_rate": 4.955732479857072e-05, + "loss": 5.2385, + "step": 10088 + }, + { + "epoch": 0.06000214102198116, + "grad_norm": 1.232779860496521, + "learning_rate": 4.955723728280575e-05, + "loss": 5.2726, + "step": 10089 + }, + { + "epoch": 0.06000808830526216, + "grad_norm": 1.6178683042526245, + "learning_rate": 4.955714975846809e-05, + "loss": 5.3816, + "step": 10090 + }, + { + "epoch": 0.06001403558854315, + "grad_norm": 1.5438450574874878, + "learning_rate": 4.955706222555779e-05, + "loss": 5.2706, + "step": 10091 + }, + { + "epoch": 0.060019982871824154, + "grad_norm": 1.5367876291275024, + "learning_rate": 4.955697468407486e-05, + "loss": 5.1955, + "step": 10092 + }, + { + "epoch": 0.06002593015510515, + "grad_norm": 1.2902512550354004, + "learning_rate": 4.955688713401936e-05, + "loss": 5.166, + "step": 10093 + }, + { + "epoch": 0.060031877438386144, + "grad_norm": 1.5516488552093506, + "learning_rate": 4.95567995753913e-05, + "loss": 5.1256, + "step": 10094 + }, + { + "epoch": 0.06003782472166714, + "grad_norm": 1.3104857206344604, + "learning_rate": 4.9556712008190706e-05, + "loss": 5.1604, + "step": 10095 + }, + { + "epoch": 0.06004377200494814, + "grad_norm": 1.6237741708755493, + "learning_rate": 4.955662443241762e-05, + "loss": 5.2686, + "step": 10096 + }, + { + "epoch": 0.060049719288229136, + "grad_norm": 1.6566027402877808, + "learning_rate": 4.955653684807208e-05, + "loss": 5.3376, + "step": 10097 + }, + { + "epoch": 0.06005566657151013, + "grad_norm": 1.4010981321334839, + "learning_rate": 4.9556449255154106e-05, + "loss": 5.4008, + "step": 10098 + }, + { + "epoch": 0.060061613854791134, + "grad_norm": 1.6399116516113281, + "learning_rate": 4.955636165366372e-05, + "loss": 5.2718, + "step": 10099 + }, + { + "epoch": 0.06006756113807213, + "grad_norm": 1.5371499061584473, + "learning_rate": 4.955627404360096e-05, + "loss": 5.2107, + "step": 10100 + }, + { + "epoch": 0.060073508421353124, + "grad_norm": 1.598186731338501, + "learning_rate": 4.955618642496587e-05, + "loss": 5.3482, + "step": 10101 + }, + { + "epoch": 0.060079455704634126, + "grad_norm": 1.526595115661621, + "learning_rate": 4.955609879775846e-05, + "loss": 5.2335, + "step": 10102 + }, + { + "epoch": 0.06008540298791512, + "grad_norm": 1.509990930557251, + "learning_rate": 4.955601116197877e-05, + "loss": 5.168, + "step": 10103 + }, + { + "epoch": 0.060091350271196116, + "grad_norm": 1.368203043937683, + "learning_rate": 4.9555923517626836e-05, + "loss": 5.2183, + "step": 10104 + }, + { + "epoch": 0.06009729755447712, + "grad_norm": 1.5153454542160034, + "learning_rate": 4.955583586470268e-05, + "loss": 5.2558, + "step": 10105 + }, + { + "epoch": 0.06010324483775811, + "grad_norm": 2.9330217838287354, + "learning_rate": 4.955574820320633e-05, + "loss": 5.6863, + "step": 10106 + }, + { + "epoch": 0.06010919212103911, + "grad_norm": 1.6096080541610718, + "learning_rate": 4.9555660533137825e-05, + "loss": 5.2243, + "step": 10107 + }, + { + "epoch": 0.0601151394043201, + "grad_norm": 1.5425163507461548, + "learning_rate": 4.95555728544972e-05, + "loss": 5.4308, + "step": 10108 + }, + { + "epoch": 0.060121086687601105, + "grad_norm": 1.4898573160171509, + "learning_rate": 4.955548516728447e-05, + "loss": 5.389, + "step": 10109 + }, + { + "epoch": 0.0601270339708821, + "grad_norm": 1.5746946334838867, + "learning_rate": 4.955539747149968e-05, + "loss": 5.1414, + "step": 10110 + }, + { + "epoch": 0.060132981254163095, + "grad_norm": 1.7621461153030396, + "learning_rate": 4.955530976714285e-05, + "loss": 5.4572, + "step": 10111 + }, + { + "epoch": 0.0601389285374441, + "grad_norm": 1.4524224996566772, + "learning_rate": 4.9555222054214015e-05, + "loss": 5.4577, + "step": 10112 + }, + { + "epoch": 0.06014487582072509, + "grad_norm": 1.5630146265029907, + "learning_rate": 4.95551343327132e-05, + "loss": 5.277, + "step": 10113 + }, + { + "epoch": 0.06015082310400609, + "grad_norm": 1.9279972314834595, + "learning_rate": 4.955504660264045e-05, + "loss": 5.1485, + "step": 10114 + }, + { + "epoch": 0.06015677038728709, + "grad_norm": 1.618775725364685, + "learning_rate": 4.9554958863995786e-05, + "loss": 5.1262, + "step": 10115 + }, + { + "epoch": 0.060162717670568085, + "grad_norm": 1.8578898906707764, + "learning_rate": 4.955487111677924e-05, + "loss": 5.3451, + "step": 10116 + }, + { + "epoch": 0.06016866495384908, + "grad_norm": 1.5652815103530884, + "learning_rate": 4.955478336099084e-05, + "loss": 5.2326, + "step": 10117 + }, + { + "epoch": 0.06017461223713008, + "grad_norm": 1.4957774877548218, + "learning_rate": 4.9554695596630616e-05, + "loss": 5.3332, + "step": 10118 + }, + { + "epoch": 0.06018055952041108, + "grad_norm": 1.428112506866455, + "learning_rate": 4.9554607823698606e-05, + "loss": 5.2647, + "step": 10119 + }, + { + "epoch": 0.06018650680369207, + "grad_norm": 1.9383279085159302, + "learning_rate": 4.955452004219484e-05, + "loss": 5.5897, + "step": 10120 + }, + { + "epoch": 0.060192454086973074, + "grad_norm": 1.8523132801055908, + "learning_rate": 4.955443225211934e-05, + "loss": 5.6204, + "step": 10121 + }, + { + "epoch": 0.06019840137025407, + "grad_norm": 1.7980049848556519, + "learning_rate": 4.955434445347214e-05, + "loss": 5.4383, + "step": 10122 + }, + { + "epoch": 0.060204348653535064, + "grad_norm": 1.7927988767623901, + "learning_rate": 4.9554256646253274e-05, + "loss": 5.6066, + "step": 10123 + }, + { + "epoch": 0.06021029593681606, + "grad_norm": 1.8549528121948242, + "learning_rate": 4.955416883046277e-05, + "loss": 5.2963, + "step": 10124 + }, + { + "epoch": 0.06021624322009706, + "grad_norm": 1.7140870094299316, + "learning_rate": 4.955408100610066e-05, + "loss": 5.4636, + "step": 10125 + }, + { + "epoch": 0.060222190503378056, + "grad_norm": 1.3744412660598755, + "learning_rate": 4.955399317316697e-05, + "loss": 5.2985, + "step": 10126 + }, + { + "epoch": 0.06022813778665905, + "grad_norm": 1.572782278060913, + "learning_rate": 4.9553905331661734e-05, + "loss": 5.2598, + "step": 10127 + }, + { + "epoch": 0.06023408506994005, + "grad_norm": 1.6485692262649536, + "learning_rate": 4.955381748158499e-05, + "loss": 5.3764, + "step": 10128 + }, + { + "epoch": 0.06024003235322105, + "grad_norm": 1.5442413091659546, + "learning_rate": 4.955372962293676e-05, + "loss": 5.2504, + "step": 10129 + }, + { + "epoch": 0.060245979636502044, + "grad_norm": 1.807518482208252, + "learning_rate": 4.9553641755717075e-05, + "loss": 5.2853, + "step": 10130 + }, + { + "epoch": 0.060251926919783046, + "grad_norm": 1.5858244895935059, + "learning_rate": 4.9553553879925965e-05, + "loss": 5.2645, + "step": 10131 + }, + { + "epoch": 0.06025787420306404, + "grad_norm": 1.596307396888733, + "learning_rate": 4.955346599556347e-05, + "loss": 5.4094, + "step": 10132 + }, + { + "epoch": 0.060263821486345036, + "grad_norm": 1.4624857902526855, + "learning_rate": 4.955337810262961e-05, + "loss": 5.4366, + "step": 10133 + }, + { + "epoch": 0.06026976876962604, + "grad_norm": 1.426866888999939, + "learning_rate": 4.955329020112442e-05, + "loss": 5.324, + "step": 10134 + }, + { + "epoch": 0.06027571605290703, + "grad_norm": 1.6577516794204712, + "learning_rate": 4.955320229104793e-05, + "loss": 5.2937, + "step": 10135 + }, + { + "epoch": 0.06028166333618803, + "grad_norm": 1.3958433866500854, + "learning_rate": 4.9553114372400166e-05, + "loss": 5.421, + "step": 10136 + }, + { + "epoch": 0.06028761061946902, + "grad_norm": 1.3242517709732056, + "learning_rate": 4.9553026445181173e-05, + "loss": 5.2697, + "step": 10137 + }, + { + "epoch": 0.060293557902750025, + "grad_norm": 1.519018530845642, + "learning_rate": 4.955293850939096e-05, + "loss": 5.1432, + "step": 10138 + }, + { + "epoch": 0.06029950518603102, + "grad_norm": 1.528515338897705, + "learning_rate": 4.955285056502958e-05, + "loss": 5.1388, + "step": 10139 + }, + { + "epoch": 0.060305452469312015, + "grad_norm": 1.4830992221832275, + "learning_rate": 4.955276261209705e-05, + "loss": 5.3222, + "step": 10140 + }, + { + "epoch": 0.06031139975259302, + "grad_norm": 1.4149411916732788, + "learning_rate": 4.95526746505934e-05, + "loss": 5.2706, + "step": 10141 + }, + { + "epoch": 0.06031734703587401, + "grad_norm": 1.4466478824615479, + "learning_rate": 4.9552586680518676e-05, + "loss": 5.2309, + "step": 10142 + }, + { + "epoch": 0.06032329431915501, + "grad_norm": 1.4246203899383545, + "learning_rate": 4.9552498701872884e-05, + "loss": 5.1539, + "step": 10143 + }, + { + "epoch": 0.06032924160243601, + "grad_norm": 1.632572889328003, + "learning_rate": 4.955241071465608e-05, + "loss": 5.3788, + "step": 10144 + }, + { + "epoch": 0.060335188885717005, + "grad_norm": 1.5974568128585815, + "learning_rate": 4.955232271886828e-05, + "loss": 5.3558, + "step": 10145 + }, + { + "epoch": 0.060341136168998, + "grad_norm": 1.6396468877792358, + "learning_rate": 4.9552234714509516e-05, + "loss": 5.2162, + "step": 10146 + }, + { + "epoch": 0.060347083452279, + "grad_norm": 1.5349491834640503, + "learning_rate": 4.9552146701579815e-05, + "loss": 5.212, + "step": 10147 + }, + { + "epoch": 0.06035303073556, + "grad_norm": 1.5236495733261108, + "learning_rate": 4.955205868007922e-05, + "loss": 5.2984, + "step": 10148 + }, + { + "epoch": 0.06035897801884099, + "grad_norm": 1.4593411684036255, + "learning_rate": 4.955197065000775e-05, + "loss": 5.268, + "step": 10149 + }, + { + "epoch": 0.060364925302121994, + "grad_norm": 1.4498536586761475, + "learning_rate": 4.955188261136545e-05, + "loss": 5.1437, + "step": 10150 + }, + { + "epoch": 0.06037087258540299, + "grad_norm": 1.5059176683425903, + "learning_rate": 4.9551794564152334e-05, + "loss": 5.3011, + "step": 10151 + }, + { + "epoch": 0.060376819868683984, + "grad_norm": 1.5773544311523438, + "learning_rate": 4.9551706508368445e-05, + "loss": 5.2066, + "step": 10152 + }, + { + "epoch": 0.06038276715196498, + "grad_norm": 1.4858072996139526, + "learning_rate": 4.95516184440138e-05, + "loss": 5.2757, + "step": 10153 + }, + { + "epoch": 0.06038871443524598, + "grad_norm": 1.486055612564087, + "learning_rate": 4.955153037108845e-05, + "loss": 5.1416, + "step": 10154 + }, + { + "epoch": 0.060394661718526976, + "grad_norm": 1.3411048650741577, + "learning_rate": 4.955144228959241e-05, + "loss": 5.1708, + "step": 10155 + }, + { + "epoch": 0.06040060900180797, + "grad_norm": 1.2979127168655396, + "learning_rate": 4.9551354199525714e-05, + "loss": 5.1421, + "step": 10156 + }, + { + "epoch": 0.06040655628508897, + "grad_norm": 1.4928209781646729, + "learning_rate": 4.9551266100888395e-05, + "loss": 5.2185, + "step": 10157 + }, + { + "epoch": 0.06041250356836997, + "grad_norm": 1.58747398853302, + "learning_rate": 4.955117799368048e-05, + "loss": 5.2587, + "step": 10158 + }, + { + "epoch": 0.060418450851650964, + "grad_norm": 1.1862558126449585, + "learning_rate": 4.9551089877902e-05, + "loss": 5.2405, + "step": 10159 + }, + { + "epoch": 0.060424398134931966, + "grad_norm": 1.5547248125076294, + "learning_rate": 4.955100175355299e-05, + "loss": 5.2326, + "step": 10160 + }, + { + "epoch": 0.06043034541821296, + "grad_norm": 1.6986664533615112, + "learning_rate": 4.955091362063349e-05, + "loss": 5.2261, + "step": 10161 + }, + { + "epoch": 0.060436292701493956, + "grad_norm": 1.531891107559204, + "learning_rate": 4.95508254791435e-05, + "loss": 5.4475, + "step": 10162 + }, + { + "epoch": 0.06044223998477496, + "grad_norm": 1.57411789894104, + "learning_rate": 4.955073732908309e-05, + "loss": 5.1346, + "step": 10163 + }, + { + "epoch": 0.06044818726805595, + "grad_norm": 1.548439383506775, + "learning_rate": 4.9550649170452255e-05, + "loss": 5.1953, + "step": 10164 + }, + { + "epoch": 0.06045413455133695, + "grad_norm": 1.645850419998169, + "learning_rate": 4.955056100325105e-05, + "loss": 5.2728, + "step": 10165 + }, + { + "epoch": 0.06046008183461794, + "grad_norm": 1.6308786869049072, + "learning_rate": 4.95504728274795e-05, + "loss": 5.3134, + "step": 10166 + }, + { + "epoch": 0.060466029117898945, + "grad_norm": 1.4754101037979126, + "learning_rate": 4.955038464313763e-05, + "loss": 5.3938, + "step": 10167 + }, + { + "epoch": 0.06047197640117994, + "grad_norm": 2.408869981765747, + "learning_rate": 4.955029645022548e-05, + "loss": 5.4687, + "step": 10168 + }, + { + "epoch": 0.060477923684460935, + "grad_norm": 1.6601638793945312, + "learning_rate": 4.955020824874307e-05, + "loss": 5.165, + "step": 10169 + }, + { + "epoch": 0.06048387096774194, + "grad_norm": 1.5239113569259644, + "learning_rate": 4.955012003869043e-05, + "loss": 5.133, + "step": 10170 + }, + { + "epoch": 0.06048981825102293, + "grad_norm": 1.6661083698272705, + "learning_rate": 4.955003182006761e-05, + "loss": 5.2033, + "step": 10171 + }, + { + "epoch": 0.06049576553430393, + "grad_norm": 1.4320698976516724, + "learning_rate": 4.9549943592874615e-05, + "loss": 5.1842, + "step": 10172 + }, + { + "epoch": 0.06050171281758493, + "grad_norm": 1.789302110671997, + "learning_rate": 4.95498553571115e-05, + "loss": 5.1052, + "step": 10173 + }, + { + "epoch": 0.060507660100865925, + "grad_norm": 1.598085880279541, + "learning_rate": 4.954976711277828e-05, + "loss": 5.3194, + "step": 10174 + }, + { + "epoch": 0.06051360738414692, + "grad_norm": 1.4569145441055298, + "learning_rate": 4.954967885987498e-05, + "loss": 5.2009, + "step": 10175 + }, + { + "epoch": 0.06051955466742792, + "grad_norm": 1.5980345010757446, + "learning_rate": 4.954959059840165e-05, + "loss": 5.1686, + "step": 10176 + }, + { + "epoch": 0.06052550195070892, + "grad_norm": 1.5382320880889893, + "learning_rate": 4.954950232835831e-05, + "loss": 5.303, + "step": 10177 + }, + { + "epoch": 0.06053144923398991, + "grad_norm": 1.5568296909332275, + "learning_rate": 4.954941404974499e-05, + "loss": 5.2044, + "step": 10178 + }, + { + "epoch": 0.060537396517270914, + "grad_norm": 1.6732075214385986, + "learning_rate": 4.954932576256173e-05, + "loss": 5.3133, + "step": 10179 + }, + { + "epoch": 0.06054334380055191, + "grad_norm": 1.6905434131622314, + "learning_rate": 4.954923746680855e-05, + "loss": 5.3868, + "step": 10180 + }, + { + "epoch": 0.060549291083832904, + "grad_norm": 1.4349027872085571, + "learning_rate": 4.954914916248549e-05, + "loss": 5.2215, + "step": 10181 + }, + { + "epoch": 0.0605552383671139, + "grad_norm": 1.5257092714309692, + "learning_rate": 4.9549060849592566e-05, + "loss": 5.2148, + "step": 10182 + }, + { + "epoch": 0.0605611856503949, + "grad_norm": 1.5402655601501465, + "learning_rate": 4.954897252812982e-05, + "loss": 5.3069, + "step": 10183 + }, + { + "epoch": 0.060567132933675896, + "grad_norm": 1.801798701286316, + "learning_rate": 4.954888419809729e-05, + "loss": 5.0786, + "step": 10184 + }, + { + "epoch": 0.06057308021695689, + "grad_norm": 1.4860090017318726, + "learning_rate": 4.954879585949499e-05, + "loss": 4.8878, + "step": 10185 + }, + { + "epoch": 0.06057902750023789, + "grad_norm": 1.7319056987762451, + "learning_rate": 4.954870751232296e-05, + "loss": 4.9013, + "step": 10186 + }, + { + "epoch": 0.06058497478351889, + "grad_norm": 1.4376243352890015, + "learning_rate": 4.954861915658123e-05, + "loss": 5.37, + "step": 10187 + }, + { + "epoch": 0.060590922066799884, + "grad_norm": 1.2903879880905151, + "learning_rate": 4.954853079226983e-05, + "loss": 5.5355, + "step": 10188 + }, + { + "epoch": 0.060596869350080886, + "grad_norm": 1.5223259925842285, + "learning_rate": 4.95484424193888e-05, + "loss": 5.3451, + "step": 10189 + }, + { + "epoch": 0.06060281663336188, + "grad_norm": 1.283892035484314, + "learning_rate": 4.954835403793815e-05, + "loss": 5.2245, + "step": 10190 + }, + { + "epoch": 0.060608763916642876, + "grad_norm": 1.5581207275390625, + "learning_rate": 4.9548265647917936e-05, + "loss": 5.303, + "step": 10191 + }, + { + "epoch": 0.06061471119992388, + "grad_norm": 1.4258673191070557, + "learning_rate": 4.9548177249328164e-05, + "loss": 5.4569, + "step": 10192 + }, + { + "epoch": 0.06062065848320487, + "grad_norm": 1.4326061010360718, + "learning_rate": 4.9548088842168886e-05, + "loss": 5.2761, + "step": 10193 + }, + { + "epoch": 0.06062660576648587, + "grad_norm": 1.9100563526153564, + "learning_rate": 4.9548000426440114e-05, + "loss": 4.9366, + "step": 10194 + }, + { + "epoch": 0.06063255304976687, + "grad_norm": 1.7059932947158813, + "learning_rate": 4.9547912002141895e-05, + "loss": 4.9135, + "step": 10195 + }, + { + "epoch": 0.060638500333047865, + "grad_norm": 1.6715087890625, + "learning_rate": 4.954782356927425e-05, + "loss": 5.0662, + "step": 10196 + }, + { + "epoch": 0.06064444761632886, + "grad_norm": 1.966430902481079, + "learning_rate": 4.9547735127837223e-05, + "loss": 4.7995, + "step": 10197 + }, + { + "epoch": 0.060650394899609855, + "grad_norm": 1.7138090133666992, + "learning_rate": 4.954764667783083e-05, + "loss": 4.9745, + "step": 10198 + }, + { + "epoch": 0.06065634218289086, + "grad_norm": 1.832889199256897, + "learning_rate": 4.95475582192551e-05, + "loss": 4.9795, + "step": 10199 + }, + { + "epoch": 0.06066228946617185, + "grad_norm": 1.883525013923645, + "learning_rate": 4.954746975211008e-05, + "loss": 4.8523, + "step": 10200 + }, + { + "epoch": 0.06066823674945285, + "grad_norm": 1.747101068496704, + "learning_rate": 4.954738127639579e-05, + "loss": 4.9402, + "step": 10201 + }, + { + "epoch": 0.06067418403273385, + "grad_norm": 1.583900809288025, + "learning_rate": 4.9547292792112256e-05, + "loss": 5.176, + "step": 10202 + }, + { + "epoch": 0.060680131316014845, + "grad_norm": 1.6390752792358398, + "learning_rate": 4.954720429925953e-05, + "loss": 5.1014, + "step": 10203 + }, + { + "epoch": 0.06068607859929584, + "grad_norm": 1.4499305486679077, + "learning_rate": 4.954711579783762e-05, + "loss": 5.1473, + "step": 10204 + }, + { + "epoch": 0.06069202588257684, + "grad_norm": 1.2734607458114624, + "learning_rate": 4.954702728784656e-05, + "loss": 5.0919, + "step": 10205 + }, + { + "epoch": 0.06069797316585784, + "grad_norm": 1.4447498321533203, + "learning_rate": 4.954693876928639e-05, + "loss": 5.0145, + "step": 10206 + }, + { + "epoch": 0.06070392044913883, + "grad_norm": 1.7052301168441772, + "learning_rate": 4.954685024215714e-05, + "loss": 5.109, + "step": 10207 + }, + { + "epoch": 0.060709867732419834, + "grad_norm": 1.6922130584716797, + "learning_rate": 4.9546761706458836e-05, + "loss": 5.2519, + "step": 10208 + }, + { + "epoch": 0.06071581501570083, + "grad_norm": 1.7998334169387817, + "learning_rate": 4.954667316219151e-05, + "loss": 5.2272, + "step": 10209 + }, + { + "epoch": 0.060721762298981824, + "grad_norm": 1.6331555843353271, + "learning_rate": 4.95465846093552e-05, + "loss": 5.1382, + "step": 10210 + }, + { + "epoch": 0.06072770958226282, + "grad_norm": 1.4777888059616089, + "learning_rate": 4.954649604794993e-05, + "loss": 5.0601, + "step": 10211 + }, + { + "epoch": 0.06073365686554382, + "grad_norm": 1.6776998043060303, + "learning_rate": 4.954640747797573e-05, + "loss": 5.0229, + "step": 10212 + }, + { + "epoch": 0.060739604148824816, + "grad_norm": 1.9567780494689941, + "learning_rate": 4.9546318899432634e-05, + "loss": 5.483, + "step": 10213 + }, + { + "epoch": 0.06074555143210581, + "grad_norm": 1.7381116151809692, + "learning_rate": 4.9546230312320664e-05, + "loss": 5.4088, + "step": 10214 + }, + { + "epoch": 0.06075149871538681, + "grad_norm": 2.290041446685791, + "learning_rate": 4.954614171663986e-05, + "loss": 5.0879, + "step": 10215 + }, + { + "epoch": 0.06075744599866781, + "grad_norm": 1.680309534072876, + "learning_rate": 4.9546053112390255e-05, + "loss": 5.1931, + "step": 10216 + }, + { + "epoch": 0.0607633932819488, + "grad_norm": 1.997379183769226, + "learning_rate": 4.9545964499571885e-05, + "loss": 5.0834, + "step": 10217 + }, + { + "epoch": 0.060769340565229805, + "grad_norm": 1.9145865440368652, + "learning_rate": 4.954587587818476e-05, + "loss": 5.3478, + "step": 10218 + }, + { + "epoch": 0.0607752878485108, + "grad_norm": 1.565874457359314, + "learning_rate": 4.954578724822893e-05, + "loss": 5.2579, + "step": 10219 + }, + { + "epoch": 0.060781235131791796, + "grad_norm": 1.5997511148452759, + "learning_rate": 4.9545698609704416e-05, + "loss": 5.233, + "step": 10220 + }, + { + "epoch": 0.0607871824150728, + "grad_norm": 2.205021619796753, + "learning_rate": 4.954560996261125e-05, + "loss": 5.227, + "step": 10221 + }, + { + "epoch": 0.06079312969835379, + "grad_norm": 1.5360487699508667, + "learning_rate": 4.954552130694947e-05, + "loss": 5.182, + "step": 10222 + }, + { + "epoch": 0.06079907698163479, + "grad_norm": 1.5571166276931763, + "learning_rate": 4.95454326427191e-05, + "loss": 5.3671, + "step": 10223 + }, + { + "epoch": 0.06080502426491579, + "grad_norm": 1.7289685010910034, + "learning_rate": 4.9545343969920175e-05, + "loss": 5.1256, + "step": 10224 + }, + { + "epoch": 0.060810971548196785, + "grad_norm": 1.7945314645767212, + "learning_rate": 4.954525528855272e-05, + "loss": 5.0339, + "step": 10225 + }, + { + "epoch": 0.06081691883147778, + "grad_norm": 1.7037841081619263, + "learning_rate": 4.954516659861678e-05, + "loss": 4.9308, + "step": 10226 + }, + { + "epoch": 0.060822866114758775, + "grad_norm": 1.8096303939819336, + "learning_rate": 4.954507790011237e-05, + "loss": 5.1173, + "step": 10227 + }, + { + "epoch": 0.06082881339803978, + "grad_norm": 1.7563896179199219, + "learning_rate": 4.954498919303952e-05, + "loss": 5.1713, + "step": 10228 + }, + { + "epoch": 0.06083476068132077, + "grad_norm": 1.8820421695709229, + "learning_rate": 4.954490047739827e-05, + "loss": 5.2372, + "step": 10229 + }, + { + "epoch": 0.06084070796460177, + "grad_norm": 2.7050085067749023, + "learning_rate": 4.954481175318865e-05, + "loss": 5.6108, + "step": 10230 + }, + { + "epoch": 0.06084665524788277, + "grad_norm": 1.6424611806869507, + "learning_rate": 4.954472302041069e-05, + "loss": 5.1423, + "step": 10231 + }, + { + "epoch": 0.060852602531163764, + "grad_norm": 1.7690013647079468, + "learning_rate": 4.954463427906443e-05, + "loss": 5.0232, + "step": 10232 + }, + { + "epoch": 0.06085854981444476, + "grad_norm": 1.8925920724868774, + "learning_rate": 4.9544545529149874e-05, + "loss": 4.8949, + "step": 10233 + }, + { + "epoch": 0.06086449709772576, + "grad_norm": 1.7629793882369995, + "learning_rate": 4.954445677066709e-05, + "loss": 4.8832, + "step": 10234 + }, + { + "epoch": 0.06087044438100676, + "grad_norm": 1.5553311109542847, + "learning_rate": 4.9544368003616084e-05, + "loss": 4.8787, + "step": 10235 + }, + { + "epoch": 0.06087639166428775, + "grad_norm": 1.6236152648925781, + "learning_rate": 4.9544279227996884e-05, + "loss": 4.8583, + "step": 10236 + }, + { + "epoch": 0.060882338947568754, + "grad_norm": 1.7591924667358398, + "learning_rate": 4.954419044380954e-05, + "loss": 5.1468, + "step": 10237 + }, + { + "epoch": 0.06088828623084975, + "grad_norm": 1.8084702491760254, + "learning_rate": 4.954410165105406e-05, + "loss": 5.3178, + "step": 10238 + }, + { + "epoch": 0.060894233514130744, + "grad_norm": 1.6629832983016968, + "learning_rate": 4.9544012849730495e-05, + "loss": 5.2955, + "step": 10239 + }, + { + "epoch": 0.06090018079741174, + "grad_norm": 1.6681956052780151, + "learning_rate": 4.954392403983887e-05, + "loss": 4.9919, + "step": 10240 + }, + { + "epoch": 0.06090612808069274, + "grad_norm": 1.7849150896072388, + "learning_rate": 4.954383522137922e-05, + "loss": 4.9667, + "step": 10241 + }, + { + "epoch": 0.060912075363973736, + "grad_norm": 1.6313222646713257, + "learning_rate": 4.954374639435157e-05, + "loss": 4.9842, + "step": 10242 + }, + { + "epoch": 0.06091802264725473, + "grad_norm": 1.3376604318618774, + "learning_rate": 4.954365755875594e-05, + "loss": 5.2643, + "step": 10243 + }, + { + "epoch": 0.06092396993053573, + "grad_norm": 1.5971726179122925, + "learning_rate": 4.954356871459238e-05, + "loss": 5.2225, + "step": 10244 + }, + { + "epoch": 0.06092991721381673, + "grad_norm": 1.638786792755127, + "learning_rate": 4.954347986186091e-05, + "loss": 5.2855, + "step": 10245 + }, + { + "epoch": 0.06093586449709772, + "grad_norm": 1.6273027658462524, + "learning_rate": 4.954339100056157e-05, + "loss": 5.3825, + "step": 10246 + }, + { + "epoch": 0.060941811780378725, + "grad_norm": 1.4666591882705688, + "learning_rate": 4.954330213069438e-05, + "loss": 5.3148, + "step": 10247 + }, + { + "epoch": 0.06094775906365972, + "grad_norm": 1.447332501411438, + "learning_rate": 4.954321325225938e-05, + "loss": 5.1907, + "step": 10248 + }, + { + "epoch": 0.060953706346940716, + "grad_norm": 1.7162379026412964, + "learning_rate": 4.95431243652566e-05, + "loss": 5.289, + "step": 10249 + }, + { + "epoch": 0.06095965363022172, + "grad_norm": 1.7236372232437134, + "learning_rate": 4.954303546968606e-05, + "loss": 5.1839, + "step": 10250 + }, + { + "epoch": 0.06096560091350271, + "grad_norm": 1.76384437084198, + "learning_rate": 4.954294656554781e-05, + "loss": 5.1665, + "step": 10251 + }, + { + "epoch": 0.06097154819678371, + "grad_norm": 1.595041275024414, + "learning_rate": 4.954285765284187e-05, + "loss": 5.2667, + "step": 10252 + }, + { + "epoch": 0.06097749548006471, + "grad_norm": 1.6735886335372925, + "learning_rate": 4.954276873156827e-05, + "loss": 5.3367, + "step": 10253 + }, + { + "epoch": 0.060983442763345705, + "grad_norm": 1.656801462173462, + "learning_rate": 4.9542679801727044e-05, + "loss": 5.3188, + "step": 10254 + }, + { + "epoch": 0.0609893900466267, + "grad_norm": 1.7149133682250977, + "learning_rate": 4.9542590863318214e-05, + "loss": 5.0618, + "step": 10255 + }, + { + "epoch": 0.060995337329907695, + "grad_norm": 1.715561032295227, + "learning_rate": 4.954250191634183e-05, + "loss": 5.2589, + "step": 10256 + }, + { + "epoch": 0.0610012846131887, + "grad_norm": 1.4005486965179443, + "learning_rate": 4.95424129607979e-05, + "loss": 5.1061, + "step": 10257 + }, + { + "epoch": 0.06100723189646969, + "grad_norm": 1.6608542203903198, + "learning_rate": 4.954232399668648e-05, + "loss": 5.3779, + "step": 10258 + }, + { + "epoch": 0.06101317917975069, + "grad_norm": 1.5471054315567017, + "learning_rate": 4.954223502400758e-05, + "loss": 5.448, + "step": 10259 + }, + { + "epoch": 0.06101912646303169, + "grad_norm": 1.6794294118881226, + "learning_rate": 4.9542146042761246e-05, + "loss": 5.1452, + "step": 10260 + }, + { + "epoch": 0.061025073746312684, + "grad_norm": 1.5416966676712036, + "learning_rate": 4.95420570529475e-05, + "loss": 5.2192, + "step": 10261 + }, + { + "epoch": 0.06103102102959368, + "grad_norm": 1.6667221784591675, + "learning_rate": 4.954196805456637e-05, + "loss": 5.3682, + "step": 10262 + }, + { + "epoch": 0.06103696831287468, + "grad_norm": 1.3199689388275146, + "learning_rate": 4.95418790476179e-05, + "loss": 5.1038, + "step": 10263 + }, + { + "epoch": 0.06104291559615568, + "grad_norm": 1.5326366424560547, + "learning_rate": 4.954179003210211e-05, + "loss": 5.3002, + "step": 10264 + }, + { + "epoch": 0.06104886287943667, + "grad_norm": 1.529453992843628, + "learning_rate": 4.954170100801904e-05, + "loss": 5.4515, + "step": 10265 + }, + { + "epoch": 0.061054810162717674, + "grad_norm": 1.719894528388977, + "learning_rate": 4.954161197536871e-05, + "loss": 5.4161, + "step": 10266 + }, + { + "epoch": 0.06106075744599867, + "grad_norm": 1.4632771015167236, + "learning_rate": 4.954152293415115e-05, + "loss": 5.4669, + "step": 10267 + }, + { + "epoch": 0.061066704729279664, + "grad_norm": 1.7698414325714111, + "learning_rate": 4.954143388436641e-05, + "loss": 5.4045, + "step": 10268 + }, + { + "epoch": 0.06107265201256066, + "grad_norm": 1.6944139003753662, + "learning_rate": 4.95413448260145e-05, + "loss": 5.3637, + "step": 10269 + }, + { + "epoch": 0.06107859929584166, + "grad_norm": 1.6832401752471924, + "learning_rate": 4.954125575909547e-05, + "loss": 5.2123, + "step": 10270 + }, + { + "epoch": 0.061084546579122656, + "grad_norm": 1.6782628297805786, + "learning_rate": 4.954116668360933e-05, + "loss": 5.3007, + "step": 10271 + }, + { + "epoch": 0.06109049386240365, + "grad_norm": 1.598941683769226, + "learning_rate": 4.954107759955613e-05, + "loss": 5.1452, + "step": 10272 + }, + { + "epoch": 0.06109644114568465, + "grad_norm": 1.4137005805969238, + "learning_rate": 4.954098850693589e-05, + "loss": 5.1348, + "step": 10273 + }, + { + "epoch": 0.06110238842896565, + "grad_norm": 1.388108730316162, + "learning_rate": 4.9540899405748646e-05, + "loss": 5.4108, + "step": 10274 + }, + { + "epoch": 0.06110833571224664, + "grad_norm": 1.5997217893600464, + "learning_rate": 4.954081029599443e-05, + "loss": 5.3727, + "step": 10275 + }, + { + "epoch": 0.061114282995527645, + "grad_norm": 1.5805003643035889, + "learning_rate": 4.954072117767327e-05, + "loss": 5.4151, + "step": 10276 + }, + { + "epoch": 0.06112023027880864, + "grad_norm": 1.402063250541687, + "learning_rate": 4.9540632050785194e-05, + "loss": 5.287, + "step": 10277 + }, + { + "epoch": 0.061126177562089636, + "grad_norm": 1.6100205183029175, + "learning_rate": 4.9540542915330236e-05, + "loss": 5.2047, + "step": 10278 + }, + { + "epoch": 0.06113212484537064, + "grad_norm": 1.6199030876159668, + "learning_rate": 4.9540453771308435e-05, + "loss": 5.2141, + "step": 10279 + }, + { + "epoch": 0.06113807212865163, + "grad_norm": 1.485408067703247, + "learning_rate": 4.95403646187198e-05, + "loss": 5.1893, + "step": 10280 + }, + { + "epoch": 0.06114401941193263, + "grad_norm": 1.5842605829238892, + "learning_rate": 4.9540275457564395e-05, + "loss": 5.1383, + "step": 10281 + }, + { + "epoch": 0.06114996669521363, + "grad_norm": 1.5824682712554932, + "learning_rate": 4.9540186287842225e-05, + "loss": 5.1754, + "step": 10282 + }, + { + "epoch": 0.061155913978494625, + "grad_norm": 1.7714753150939941, + "learning_rate": 4.954009710955333e-05, + "loss": 5.2951, + "step": 10283 + }, + { + "epoch": 0.06116186126177562, + "grad_norm": 1.6528159379959106, + "learning_rate": 4.954000792269774e-05, + "loss": 5.2391, + "step": 10284 + }, + { + "epoch": 0.061167808545056615, + "grad_norm": 1.54135262966156, + "learning_rate": 4.953991872727549e-05, + "loss": 5.3849, + "step": 10285 + }, + { + "epoch": 0.06117375582833762, + "grad_norm": 1.4225090742111206, + "learning_rate": 4.953982952328661e-05, + "loss": 5.2211, + "step": 10286 + }, + { + "epoch": 0.06117970311161861, + "grad_norm": 1.7174444198608398, + "learning_rate": 4.953974031073112e-05, + "loss": 5.2873, + "step": 10287 + }, + { + "epoch": 0.06118565039489961, + "grad_norm": 1.4754962921142578, + "learning_rate": 4.953965108960907e-05, + "loss": 5.3137, + "step": 10288 + }, + { + "epoch": 0.06119159767818061, + "grad_norm": 1.6911029815673828, + "learning_rate": 4.9539561859920475e-05, + "loss": 5.1914, + "step": 10289 + }, + { + "epoch": 0.061197544961461604, + "grad_norm": 1.5569958686828613, + "learning_rate": 4.953947262166537e-05, + "loss": 5.2141, + "step": 10290 + }, + { + "epoch": 0.0612034922447426, + "grad_norm": 1.5939570665359497, + "learning_rate": 4.9539383374843794e-05, + "loss": 5.2059, + "step": 10291 + }, + { + "epoch": 0.0612094395280236, + "grad_norm": 1.7220442295074463, + "learning_rate": 4.953929411945577e-05, + "loss": 5.3399, + "step": 10292 + }, + { + "epoch": 0.061215386811304597, + "grad_norm": 1.7158905267715454, + "learning_rate": 4.953920485550134e-05, + "loss": 5.3392, + "step": 10293 + }, + { + "epoch": 0.06122133409458559, + "grad_norm": 1.5761021375656128, + "learning_rate": 4.9539115582980525e-05, + "loss": 5.1523, + "step": 10294 + }, + { + "epoch": 0.061227281377866594, + "grad_norm": 1.7746198177337646, + "learning_rate": 4.953902630189335e-05, + "loss": 5.1577, + "step": 10295 + }, + { + "epoch": 0.06123322866114759, + "grad_norm": 1.9633466005325317, + "learning_rate": 4.953893701223986e-05, + "loss": 5.448, + "step": 10296 + }, + { + "epoch": 0.061239175944428584, + "grad_norm": 1.7086774110794067, + "learning_rate": 4.953884771402007e-05, + "loss": 5.2624, + "step": 10297 + }, + { + "epoch": 0.06124512322770958, + "grad_norm": 1.5247907638549805, + "learning_rate": 4.953875840723403e-05, + "loss": 5.1644, + "step": 10298 + }, + { + "epoch": 0.06125107051099058, + "grad_norm": 1.7014293670654297, + "learning_rate": 4.953866909188177e-05, + "loss": 5.2118, + "step": 10299 + }, + { + "epoch": 0.061257017794271576, + "grad_norm": 1.390368103981018, + "learning_rate": 4.9538579767963305e-05, + "loss": 5.3159, + "step": 10300 + }, + { + "epoch": 0.06126296507755257, + "grad_norm": 1.4748090505599976, + "learning_rate": 4.953849043547868e-05, + "loss": 5.5283, + "step": 10301 + }, + { + "epoch": 0.06126891236083357, + "grad_norm": 1.6433857679367065, + "learning_rate": 4.953840109442792e-05, + "loss": 5.3388, + "step": 10302 + }, + { + "epoch": 0.06127485964411457, + "grad_norm": 1.6636543273925781, + "learning_rate": 4.9538311744811056e-05, + "loss": 5.4523, + "step": 10303 + }, + { + "epoch": 0.06128080692739556, + "grad_norm": 1.6074668169021606, + "learning_rate": 4.953822238662812e-05, + "loss": 5.2963, + "step": 10304 + }, + { + "epoch": 0.061286754210676565, + "grad_norm": 1.8746674060821533, + "learning_rate": 4.9538133019879155e-05, + "loss": 5.359, + "step": 10305 + }, + { + "epoch": 0.06129270149395756, + "grad_norm": 1.5438963174819946, + "learning_rate": 4.953804364456417e-05, + "loss": 5.2039, + "step": 10306 + }, + { + "epoch": 0.061298648777238555, + "grad_norm": 1.5594170093536377, + "learning_rate": 4.9537954260683205e-05, + "loss": 5.3003, + "step": 10307 + }, + { + "epoch": 0.06130459606051956, + "grad_norm": 1.3331657648086548, + "learning_rate": 4.95378648682363e-05, + "loss": 5.3051, + "step": 10308 + }, + { + "epoch": 0.06131054334380055, + "grad_norm": 1.5514707565307617, + "learning_rate": 4.953777546722348e-05, + "loss": 5.3344, + "step": 10309 + }, + { + "epoch": 0.06131649062708155, + "grad_norm": 1.6396936178207397, + "learning_rate": 4.953768605764477e-05, + "loss": 5.1244, + "step": 10310 + }, + { + "epoch": 0.06132243791036255, + "grad_norm": 1.576407551765442, + "learning_rate": 4.953759663950022e-05, + "loss": 5.1908, + "step": 10311 + }, + { + "epoch": 0.061328385193643545, + "grad_norm": 1.5868182182312012, + "learning_rate": 4.953750721278984e-05, + "loss": 5.2538, + "step": 10312 + }, + { + "epoch": 0.06133433247692454, + "grad_norm": 1.7734450101852417, + "learning_rate": 4.9537417777513664e-05, + "loss": 5.3727, + "step": 10313 + }, + { + "epoch": 0.061340279760205535, + "grad_norm": 1.5105754137039185, + "learning_rate": 4.953732833367174e-05, + "loss": 5.3547, + "step": 10314 + }, + { + "epoch": 0.06134622704348654, + "grad_norm": 1.5607833862304688, + "learning_rate": 4.953723888126408e-05, + "loss": 5.2265, + "step": 10315 + }, + { + "epoch": 0.06135217432676753, + "grad_norm": 1.2882065773010254, + "learning_rate": 4.9537149420290726e-05, + "loss": 4.9719, + "step": 10316 + }, + { + "epoch": 0.06135812161004853, + "grad_norm": 1.4349958896636963, + "learning_rate": 4.953705995075171e-05, + "loss": 5.2773, + "step": 10317 + }, + { + "epoch": 0.06136406889332953, + "grad_norm": 2.3595380783081055, + "learning_rate": 4.953697047264706e-05, + "loss": 5.7403, + "step": 10318 + }, + { + "epoch": 0.061370016176610524, + "grad_norm": 1.6126785278320312, + "learning_rate": 4.9536880985976805e-05, + "loss": 5.5316, + "step": 10319 + }, + { + "epoch": 0.06137596345989152, + "grad_norm": 1.7738999128341675, + "learning_rate": 4.953679149074098e-05, + "loss": 5.602, + "step": 10320 + }, + { + "epoch": 0.06138191074317252, + "grad_norm": 1.9263441562652588, + "learning_rate": 4.953670198693961e-05, + "loss": 5.0669, + "step": 10321 + }, + { + "epoch": 0.061387858026453516, + "grad_norm": 1.6290051937103271, + "learning_rate": 4.953661247457273e-05, + "loss": 5.2163, + "step": 10322 + }, + { + "epoch": 0.06139380530973451, + "grad_norm": 1.6354936361312866, + "learning_rate": 4.9536522953640374e-05, + "loss": 5.1678, + "step": 10323 + }, + { + "epoch": 0.061399752593015514, + "grad_norm": 1.7600759267807007, + "learning_rate": 4.953643342414257e-05, + "loss": 5.946, + "step": 10324 + }, + { + "epoch": 0.06140569987629651, + "grad_norm": 2.0515828132629395, + "learning_rate": 4.9536343886079357e-05, + "loss": 5.463, + "step": 10325 + }, + { + "epoch": 0.061411647159577504, + "grad_norm": 1.9990586042404175, + "learning_rate": 4.9536254339450754e-05, + "loss": 5.3084, + "step": 10326 + }, + { + "epoch": 0.0614175944428585, + "grad_norm": 1.7596598863601685, + "learning_rate": 4.95361647842568e-05, + "loss": 5.9268, + "step": 10327 + }, + { + "epoch": 0.0614235417261395, + "grad_norm": 1.8702850341796875, + "learning_rate": 4.953607522049752e-05, + "loss": 5.4303, + "step": 10328 + }, + { + "epoch": 0.061429489009420496, + "grad_norm": 1.9598991870880127, + "learning_rate": 4.953598564817296e-05, + "loss": 5.1813, + "step": 10329 + }, + { + "epoch": 0.06143543629270149, + "grad_norm": 1.5180566310882568, + "learning_rate": 4.953589606728314e-05, + "loss": 5.6051, + "step": 10330 + }, + { + "epoch": 0.06144138357598249, + "grad_norm": 1.4654324054718018, + "learning_rate": 4.953580647782808e-05, + "loss": 5.7188, + "step": 10331 + }, + { + "epoch": 0.06144733085926349, + "grad_norm": 1.351413607597351, + "learning_rate": 4.9535716879807835e-05, + "loss": 5.6928, + "step": 10332 + }, + { + "epoch": 0.06145327814254448, + "grad_norm": 1.4495320320129395, + "learning_rate": 4.953562727322242e-05, + "loss": 5.5576, + "step": 10333 + }, + { + "epoch": 0.061459225425825485, + "grad_norm": 1.4851731061935425, + "learning_rate": 4.953553765807187e-05, + "loss": 5.31, + "step": 10334 + }, + { + "epoch": 0.06146517270910648, + "grad_norm": 1.9790018796920776, + "learning_rate": 4.953544803435622e-05, + "loss": 5.5375, + "step": 10335 + }, + { + "epoch": 0.061471119992387475, + "grad_norm": 1.6931076049804688, + "learning_rate": 4.953535840207549e-05, + "loss": 5.6863, + "step": 10336 + }, + { + "epoch": 0.06147706727566848, + "grad_norm": 1.7479010820388794, + "learning_rate": 4.9535268761229735e-05, + "loss": 5.571, + "step": 10337 + }, + { + "epoch": 0.06148301455894947, + "grad_norm": 2.0722434520721436, + "learning_rate": 4.953517911181896e-05, + "loss": 5.2462, + "step": 10338 + }, + { + "epoch": 0.06148896184223047, + "grad_norm": 2.125288486480713, + "learning_rate": 4.953508945384322e-05, + "loss": 5.6343, + "step": 10339 + }, + { + "epoch": 0.06149490912551147, + "grad_norm": 2.0187058448791504, + "learning_rate": 4.953499978730252e-05, + "loss": 5.8642, + "step": 10340 + }, + { + "epoch": 0.061500856408792465, + "grad_norm": 1.6849068403244019, + "learning_rate": 4.9534910112196906e-05, + "loss": 5.5534, + "step": 10341 + }, + { + "epoch": 0.06150680369207346, + "grad_norm": 2.008009433746338, + "learning_rate": 4.953482042852641e-05, + "loss": 5.464, + "step": 10342 + }, + { + "epoch": 0.061512750975354455, + "grad_norm": 1.7537699937820435, + "learning_rate": 4.953473073629107e-05, + "loss": 5.9052, + "step": 10343 + }, + { + "epoch": 0.06151869825863546, + "grad_norm": 1.5746090412139893, + "learning_rate": 4.95346410354909e-05, + "loss": 5.6898, + "step": 10344 + }, + { + "epoch": 0.06152464554191645, + "grad_norm": 2.027543783187866, + "learning_rate": 4.9534551326125944e-05, + "loss": 6.0481, + "step": 10345 + }, + { + "epoch": 0.06153059282519745, + "grad_norm": 1.6113003492355347, + "learning_rate": 4.9534461608196224e-05, + "loss": 5.4792, + "step": 10346 + }, + { + "epoch": 0.06153654010847845, + "grad_norm": 1.5709928274154663, + "learning_rate": 4.953437188170178e-05, + "loss": 5.7601, + "step": 10347 + }, + { + "epoch": 0.061542487391759444, + "grad_norm": 1.7116700410842896, + "learning_rate": 4.953428214664265e-05, + "loss": 5.7284, + "step": 10348 + }, + { + "epoch": 0.06154843467504044, + "grad_norm": 2.262103796005249, + "learning_rate": 4.953419240301884e-05, + "loss": 5.7247, + "step": 10349 + }, + { + "epoch": 0.06155438195832144, + "grad_norm": 1.8536508083343506, + "learning_rate": 4.9534102650830406e-05, + "loss": 5.7509, + "step": 10350 + }, + { + "epoch": 0.061560329241602436, + "grad_norm": 2.1372785568237305, + "learning_rate": 4.953401289007737e-05, + "loss": 5.8436, + "step": 10351 + }, + { + "epoch": 0.06156627652488343, + "grad_norm": 2.5555527210235596, + "learning_rate": 4.953392312075976e-05, + "loss": 5.6481, + "step": 10352 + }, + { + "epoch": 0.061572223808164434, + "grad_norm": 2.607111692428589, + "learning_rate": 4.953383334287761e-05, + "loss": 5.4822, + "step": 10353 + }, + { + "epoch": 0.06157817109144543, + "grad_norm": 2.728994369506836, + "learning_rate": 4.953374355643095e-05, + "loss": 5.4327, + "step": 10354 + }, + { + "epoch": 0.061584118374726424, + "grad_norm": 2.3375606536865234, + "learning_rate": 4.953365376141983e-05, + "loss": 5.537, + "step": 10355 + }, + { + "epoch": 0.06159006565800742, + "grad_norm": 2.4509146213531494, + "learning_rate": 4.953356395784425e-05, + "loss": 5.5717, + "step": 10356 + }, + { + "epoch": 0.06159601294128842, + "grad_norm": 2.412198781967163, + "learning_rate": 4.953347414570426e-05, + "loss": 5.5216, + "step": 10357 + }, + { + "epoch": 0.061601960224569416, + "grad_norm": 1.7105822563171387, + "learning_rate": 4.9533384324999886e-05, + "loss": 5.6661, + "step": 10358 + }, + { + "epoch": 0.06160790750785041, + "grad_norm": 2.2394793033599854, + "learning_rate": 4.953329449573116e-05, + "loss": 5.2062, + "step": 10359 + }, + { + "epoch": 0.06161385479113141, + "grad_norm": 2.1791203022003174, + "learning_rate": 4.9533204657898127e-05, + "loss": 5.1961, + "step": 10360 + }, + { + "epoch": 0.06161980207441241, + "grad_norm": 2.0430495738983154, + "learning_rate": 4.953311481150079e-05, + "loss": 5.1492, + "step": 10361 + }, + { + "epoch": 0.0616257493576934, + "grad_norm": 2.157975435256958, + "learning_rate": 4.9533024956539204e-05, + "loss": 4.9354, + "step": 10362 + }, + { + "epoch": 0.061631696640974405, + "grad_norm": 2.101484537124634, + "learning_rate": 4.953293509301339e-05, + "loss": 4.9212, + "step": 10363 + }, + { + "epoch": 0.0616376439242554, + "grad_norm": 1.740793228149414, + "learning_rate": 4.953284522092338e-05, + "loss": 5.1234, + "step": 10364 + }, + { + "epoch": 0.061643591207536395, + "grad_norm": 1.9694514274597168, + "learning_rate": 4.953275534026921e-05, + "loss": 5.3688, + "step": 10365 + }, + { + "epoch": 0.0616495384908174, + "grad_norm": 2.0250589847564697, + "learning_rate": 4.953266545105091e-05, + "loss": 4.7194, + "step": 10366 + }, + { + "epoch": 0.06165548577409839, + "grad_norm": 2.016284942626953, + "learning_rate": 4.95325755532685e-05, + "loss": 4.7397, + "step": 10367 + }, + { + "epoch": 0.06166143305737939, + "grad_norm": 2.3073251247406006, + "learning_rate": 4.9532485646922036e-05, + "loss": 4.59, + "step": 10368 + }, + { + "epoch": 0.06166738034066039, + "grad_norm": 2.265873670578003, + "learning_rate": 4.9532395732011524e-05, + "loss": 4.7713, + "step": 10369 + }, + { + "epoch": 0.061673327623941385, + "grad_norm": 1.8176212310791016, + "learning_rate": 4.953230580853701e-05, + "loss": 5.2288, + "step": 10370 + }, + { + "epoch": 0.06167927490722238, + "grad_norm": 2.3636794090270996, + "learning_rate": 4.953221587649852e-05, + "loss": 5.1683, + "step": 10371 + }, + { + "epoch": 0.061685222190503375, + "grad_norm": 1.8074215650558472, + "learning_rate": 4.953212593589609e-05, + "loss": 6.037, + "step": 10372 + }, + { + "epoch": 0.06169116947378438, + "grad_norm": 2.1368768215179443, + "learning_rate": 4.953203598672975e-05, + "loss": 5.8481, + "step": 10373 + }, + { + "epoch": 0.06169711675706537, + "grad_norm": 2.924474000930786, + "learning_rate": 4.953194602899952e-05, + "loss": 4.327, + "step": 10374 + }, + { + "epoch": 0.06170306404034637, + "grad_norm": 2.412336826324463, + "learning_rate": 4.953185606270545e-05, + "loss": 4.3885, + "step": 10375 + }, + { + "epoch": 0.06170901132362737, + "grad_norm": 1.9676904678344727, + "learning_rate": 4.953176608784756e-05, + "loss": 5.4581, + "step": 10376 + }, + { + "epoch": 0.061714958606908364, + "grad_norm": 2.1357827186584473, + "learning_rate": 4.953167610442588e-05, + "loss": 6.1762, + "step": 10377 + }, + { + "epoch": 0.06172090589018936, + "grad_norm": 1.912763237953186, + "learning_rate": 4.953158611244045e-05, + "loss": 6.3403, + "step": 10378 + }, + { + "epoch": 0.06172685317347036, + "grad_norm": 2.0528855323791504, + "learning_rate": 4.95314961118913e-05, + "loss": 6.1921, + "step": 10379 + }, + { + "epoch": 0.061732800456751356, + "grad_norm": 2.1858723163604736, + "learning_rate": 4.953140610277846e-05, + "loss": 5.1944, + "step": 10380 + }, + { + "epoch": 0.06173874774003235, + "grad_norm": 2.04040265083313, + "learning_rate": 4.9531316085101944e-05, + "loss": 5.1866, + "step": 10381 + }, + { + "epoch": 0.06174469502331335, + "grad_norm": 2.216113567352295, + "learning_rate": 4.953122605886181e-05, + "loss": 5.5625, + "step": 10382 + }, + { + "epoch": 0.06175064230659435, + "grad_norm": 1.7107234001159668, + "learning_rate": 4.9531136024058076e-05, + "loss": 5.917, + "step": 10383 + }, + { + "epoch": 0.061756589589875344, + "grad_norm": 1.983104944229126, + "learning_rate": 4.9531045980690776e-05, + "loss": 6.0113, + "step": 10384 + }, + { + "epoch": 0.06176253687315634, + "grad_norm": 2.0186147689819336, + "learning_rate": 4.9530955928759945e-05, + "loss": 6.5227, + "step": 10385 + }, + { + "epoch": 0.06176848415643734, + "grad_norm": 1.8337477445602417, + "learning_rate": 4.9530865868265605e-05, + "loss": 5.9586, + "step": 10386 + }, + { + "epoch": 0.061774431439718336, + "grad_norm": 1.6523345708847046, + "learning_rate": 4.9530775799207795e-05, + "loss": 5.7073, + "step": 10387 + }, + { + "epoch": 0.06178037872299933, + "grad_norm": 1.617838740348816, + "learning_rate": 4.953068572158654e-05, + "loss": 5.3771, + "step": 10388 + }, + { + "epoch": 0.06178632600628033, + "grad_norm": 1.7327697277069092, + "learning_rate": 4.953059563540189e-05, + "loss": 5.3021, + "step": 10389 + }, + { + "epoch": 0.06179227328956133, + "grad_norm": 2.726762294769287, + "learning_rate": 4.9530505540653856e-05, + "loss": 5.2568, + "step": 10390 + }, + { + "epoch": 0.06179822057284232, + "grad_norm": 2.540090560913086, + "learning_rate": 4.953041543734247e-05, + "loss": 5.114, + "step": 10391 + }, + { + "epoch": 0.061804167856123325, + "grad_norm": 2.26487135887146, + "learning_rate": 4.953032532546777e-05, + "loss": 5.2552, + "step": 10392 + }, + { + "epoch": 0.06181011513940432, + "grad_norm": 1.9986075162887573, + "learning_rate": 4.95302352050298e-05, + "loss": 5.3555, + "step": 10393 + }, + { + "epoch": 0.061816062422685315, + "grad_norm": 2.2121987342834473, + "learning_rate": 4.9530145076028564e-05, + "loss": 5.665, + "step": 10394 + }, + { + "epoch": 0.06182200970596632, + "grad_norm": 1.892927646636963, + "learning_rate": 4.953005493846411e-05, + "loss": 5.2536, + "step": 10395 + }, + { + "epoch": 0.06182795698924731, + "grad_norm": 2.1083126068115234, + "learning_rate": 4.952996479233647e-05, + "loss": 6.1748, + "step": 10396 + }, + { + "epoch": 0.06183390427252831, + "grad_norm": 2.2235448360443115, + "learning_rate": 4.9529874637645675e-05, + "loss": 6.0676, + "step": 10397 + }, + { + "epoch": 0.06183985155580931, + "grad_norm": 2.0888702869415283, + "learning_rate": 4.952978447439175e-05, + "loss": 5.2515, + "step": 10398 + }, + { + "epoch": 0.061845798839090305, + "grad_norm": 1.826622724533081, + "learning_rate": 4.9529694302574736e-05, + "loss": 5.6849, + "step": 10399 + }, + { + "epoch": 0.0618517461223713, + "grad_norm": 1.9772933721542358, + "learning_rate": 4.952960412219465e-05, + "loss": 5.7702, + "step": 10400 + }, + { + "epoch": 0.061857693405652295, + "grad_norm": 2.2230029106140137, + "learning_rate": 4.952951393325154e-05, + "loss": 5.5747, + "step": 10401 + }, + { + "epoch": 0.0618636406889333, + "grad_norm": 1.9372552633285522, + "learning_rate": 4.9529423735745425e-05, + "loss": 5.4728, + "step": 10402 + }, + { + "epoch": 0.06186958797221429, + "grad_norm": 2.2238845825195312, + "learning_rate": 4.952933352967635e-05, + "loss": 5.2462, + "step": 10403 + }, + { + "epoch": 0.06187553525549529, + "grad_norm": 1.7716748714447021, + "learning_rate": 4.952924331504433e-05, + "loss": 5.5651, + "step": 10404 + }, + { + "epoch": 0.06188148253877629, + "grad_norm": 2.2933645248413086, + "learning_rate": 4.9529153091849405e-05, + "loss": 5.8684, + "step": 10405 + }, + { + "epoch": 0.061887429822057284, + "grad_norm": 2.222883939743042, + "learning_rate": 4.9529062860091616e-05, + "loss": 5.8427, + "step": 10406 + }, + { + "epoch": 0.06189337710533828, + "grad_norm": 1.645338773727417, + "learning_rate": 4.9528972619770975e-05, + "loss": 5.7001, + "step": 10407 + }, + { + "epoch": 0.06189932438861928, + "grad_norm": 2.1029653549194336, + "learning_rate": 4.952888237088752e-05, + "loss": 5.728, + "step": 10408 + }, + { + "epoch": 0.061905271671900276, + "grad_norm": 2.2689831256866455, + "learning_rate": 4.952879211344129e-05, + "loss": 5.4678, + "step": 10409 + }, + { + "epoch": 0.06191121895518127, + "grad_norm": 1.908469557762146, + "learning_rate": 4.9528701847432315e-05, + "loss": 6.007, + "step": 10410 + }, + { + "epoch": 0.06191716623846227, + "grad_norm": 1.819381833076477, + "learning_rate": 4.952861157286062e-05, + "loss": 6.2041, + "step": 10411 + }, + { + "epoch": 0.06192311352174327, + "grad_norm": 2.16945743560791, + "learning_rate": 4.952852128972624e-05, + "loss": 5.7757, + "step": 10412 + }, + { + "epoch": 0.061929060805024264, + "grad_norm": 2.1671459674835205, + "learning_rate": 4.952843099802921e-05, + "loss": 5.5212, + "step": 10413 + }, + { + "epoch": 0.061935008088305266, + "grad_norm": 1.730073094367981, + "learning_rate": 4.952834069776956e-05, + "loss": 5.809, + "step": 10414 + }, + { + "epoch": 0.06194095537158626, + "grad_norm": 2.1048457622528076, + "learning_rate": 4.952825038894732e-05, + "loss": 5.7219, + "step": 10415 + }, + { + "epoch": 0.061946902654867256, + "grad_norm": 2.7438642978668213, + "learning_rate": 4.9528160071562516e-05, + "loss": 5.6367, + "step": 10416 + }, + { + "epoch": 0.06195284993814825, + "grad_norm": 2.0103960037231445, + "learning_rate": 4.952806974561518e-05, + "loss": 5.1429, + "step": 10417 + }, + { + "epoch": 0.06195879722142925, + "grad_norm": 2.1754884719848633, + "learning_rate": 4.9527979411105354e-05, + "loss": 5.9337, + "step": 10418 + }, + { + "epoch": 0.06196474450471025, + "grad_norm": 2.553421974182129, + "learning_rate": 4.9527889068033063e-05, + "loss": 5.7076, + "step": 10419 + }, + { + "epoch": 0.06197069178799124, + "grad_norm": 2.0601327419281006, + "learning_rate": 4.952779871639834e-05, + "loss": 5.7855, + "step": 10420 + }, + { + "epoch": 0.061976639071272245, + "grad_norm": 2.0958025455474854, + "learning_rate": 4.952770835620122e-05, + "loss": 5.8621, + "step": 10421 + }, + { + "epoch": 0.06198258635455324, + "grad_norm": 2.2658755779266357, + "learning_rate": 4.952761798744172e-05, + "loss": 5.9306, + "step": 10422 + }, + { + "epoch": 0.061988533637834235, + "grad_norm": 1.933090090751648, + "learning_rate": 4.9527527610119896e-05, + "loss": 5.1557, + "step": 10423 + }, + { + "epoch": 0.06199448092111524, + "grad_norm": 2.5761375427246094, + "learning_rate": 4.952743722423575e-05, + "loss": 5.4438, + "step": 10424 + }, + { + "epoch": 0.06200042820439623, + "grad_norm": 2.0499768257141113, + "learning_rate": 4.9527346829789344e-05, + "loss": 5.4153, + "step": 10425 + }, + { + "epoch": 0.06200637548767723, + "grad_norm": 1.970674991607666, + "learning_rate": 4.952725642678069e-05, + "loss": 5.8678, + "step": 10426 + }, + { + "epoch": 0.06201232277095823, + "grad_norm": 2.4563233852386475, + "learning_rate": 4.9527166015209814e-05, + "loss": 4.926, + "step": 10427 + }, + { + "epoch": 0.062018270054239225, + "grad_norm": 1.8380508422851562, + "learning_rate": 4.9527075595076763e-05, + "loss": 4.9619, + "step": 10428 + }, + { + "epoch": 0.06202421733752022, + "grad_norm": 1.8930846452713013, + "learning_rate": 4.9526985166381565e-05, + "loss": 4.8252, + "step": 10429 + }, + { + "epoch": 0.062030164620801215, + "grad_norm": 2.401026725769043, + "learning_rate": 4.952689472912426e-05, + "loss": 4.5023, + "step": 10430 + }, + { + "epoch": 0.06203611190408222, + "grad_norm": 2.2801949977874756, + "learning_rate": 4.952680428330486e-05, + "loss": 4.6461, + "step": 10431 + }, + { + "epoch": 0.06204205918736321, + "grad_norm": 2.2466189861297607, + "learning_rate": 4.95267138289234e-05, + "loss": 4.5946, + "step": 10432 + }, + { + "epoch": 0.06204800647064421, + "grad_norm": 2.1723902225494385, + "learning_rate": 4.952662336597993e-05, + "loss": 5.6417, + "step": 10433 + }, + { + "epoch": 0.06205395375392521, + "grad_norm": 1.9614545106887817, + "learning_rate": 4.952653289447446e-05, + "loss": 5.0758, + "step": 10434 + }, + { + "epoch": 0.062059901037206204, + "grad_norm": 2.465252637863159, + "learning_rate": 4.9526442414407036e-05, + "loss": 4.6159, + "step": 10435 + }, + { + "epoch": 0.0620658483204872, + "grad_norm": 2.2298080921173096, + "learning_rate": 4.9526351925777684e-05, + "loss": 5.24, + "step": 10436 + }, + { + "epoch": 0.0620717956037682, + "grad_norm": 2.1284472942352295, + "learning_rate": 4.952626142858643e-05, + "loss": 4.5255, + "step": 10437 + }, + { + "epoch": 0.062077742887049196, + "grad_norm": 2.1340067386627197, + "learning_rate": 4.9526170922833314e-05, + "loss": 4.5931, + "step": 10438 + }, + { + "epoch": 0.06208369017033019, + "grad_norm": 2.20354962348938, + "learning_rate": 4.952608040851837e-05, + "loss": 4.7688, + "step": 10439 + }, + { + "epoch": 0.06208963745361119, + "grad_norm": 1.5250015258789062, + "learning_rate": 4.952598988564162e-05, + "loss": 5.3292, + "step": 10440 + }, + { + "epoch": 0.06209558473689219, + "grad_norm": 2.1667168140411377, + "learning_rate": 4.95258993542031e-05, + "loss": 5.6216, + "step": 10441 + }, + { + "epoch": 0.062101532020173184, + "grad_norm": 1.8172663450241089, + "learning_rate": 4.9525808814202846e-05, + "loss": 5.5813, + "step": 10442 + }, + { + "epoch": 0.062107479303454186, + "grad_norm": 1.9832731485366821, + "learning_rate": 4.9525718265640884e-05, + "loss": 5.4444, + "step": 10443 + }, + { + "epoch": 0.06211342658673518, + "grad_norm": 2.051358699798584, + "learning_rate": 4.952562770851724e-05, + "loss": 5.3488, + "step": 10444 + }, + { + "epoch": 0.062119373870016176, + "grad_norm": 2.1487104892730713, + "learning_rate": 4.952553714283196e-05, + "loss": 5.3803, + "step": 10445 + }, + { + "epoch": 0.06212532115329717, + "grad_norm": 2.086853504180908, + "learning_rate": 4.952544656858507e-05, + "loss": 5.4585, + "step": 10446 + }, + { + "epoch": 0.06213126843657817, + "grad_norm": 2.1599764823913574, + "learning_rate": 4.95253559857766e-05, + "loss": 5.3728, + "step": 10447 + }, + { + "epoch": 0.06213721571985917, + "grad_norm": 1.877626657485962, + "learning_rate": 4.9525265394406576e-05, + "loss": 5.433, + "step": 10448 + }, + { + "epoch": 0.06214316300314016, + "grad_norm": 2.022185802459717, + "learning_rate": 4.952517479447504e-05, + "loss": 5.6472, + "step": 10449 + }, + { + "epoch": 0.062149110286421165, + "grad_norm": 2.1667773723602295, + "learning_rate": 4.9525084185982015e-05, + "loss": 5.3174, + "step": 10450 + }, + { + "epoch": 0.06215505756970216, + "grad_norm": 1.6227883100509644, + "learning_rate": 4.952499356892753e-05, + "loss": 5.3747, + "step": 10451 + }, + { + "epoch": 0.062161004852983155, + "grad_norm": 1.935307502746582, + "learning_rate": 4.952490294331164e-05, + "loss": 5.7716, + "step": 10452 + }, + { + "epoch": 0.06216695213626416, + "grad_norm": 2.6584694385528564, + "learning_rate": 4.952481230913435e-05, + "loss": 5.3525, + "step": 10453 + }, + { + "epoch": 0.06217289941954515, + "grad_norm": 2.626344919204712, + "learning_rate": 4.9524721666395705e-05, + "loss": 5.2118, + "step": 10454 + }, + { + "epoch": 0.06217884670282615, + "grad_norm": 2.525580644607544, + "learning_rate": 4.9524631015095735e-05, + "loss": 5.1231, + "step": 10455 + }, + { + "epoch": 0.06218479398610715, + "grad_norm": 2.274801015853882, + "learning_rate": 4.9524540355234464e-05, + "loss": 5.0637, + "step": 10456 + }, + { + "epoch": 0.062190741269388145, + "grad_norm": 1.9937769174575806, + "learning_rate": 4.952444968681193e-05, + "loss": 5.8196, + "step": 10457 + }, + { + "epoch": 0.06219668855266914, + "grad_norm": 2.124290943145752, + "learning_rate": 4.952435900982816e-05, + "loss": 5.5221, + "step": 10458 + }, + { + "epoch": 0.062202635835950135, + "grad_norm": 2.2544684410095215, + "learning_rate": 4.95242683242832e-05, + "loss": 5.6656, + "step": 10459 + }, + { + "epoch": 0.06220858311923114, + "grad_norm": 2.2626397609710693, + "learning_rate": 4.952417763017706e-05, + "loss": 5.5836, + "step": 10460 + }, + { + "epoch": 0.06221453040251213, + "grad_norm": 1.9299595355987549, + "learning_rate": 4.9524086927509796e-05, + "loss": 5.6637, + "step": 10461 + }, + { + "epoch": 0.06222047768579313, + "grad_norm": 1.769463062286377, + "learning_rate": 4.952399621628142e-05, + "loss": 5.4836, + "step": 10462 + }, + { + "epoch": 0.06222642496907413, + "grad_norm": 1.6773936748504639, + "learning_rate": 4.952390549649196e-05, + "loss": 5.2894, + "step": 10463 + }, + { + "epoch": 0.062232372252355124, + "grad_norm": 1.7612723112106323, + "learning_rate": 4.952381476814148e-05, + "loss": 5.5438, + "step": 10464 + }, + { + "epoch": 0.06223831953563612, + "grad_norm": 2.5255069732666016, + "learning_rate": 4.952372403122997e-05, + "loss": 5.7864, + "step": 10465 + }, + { + "epoch": 0.06224426681891712, + "grad_norm": 2.1128363609313965, + "learning_rate": 4.9523633285757486e-05, + "loss": 5.6207, + "step": 10466 + }, + { + "epoch": 0.062250214102198116, + "grad_norm": 1.8612544536590576, + "learning_rate": 4.952354253172407e-05, + "loss": 5.9177, + "step": 10467 + }, + { + "epoch": 0.06225616138547911, + "grad_norm": 2.092707633972168, + "learning_rate": 4.9523451769129715e-05, + "loss": 5.6047, + "step": 10468 + }, + { + "epoch": 0.06226210866876011, + "grad_norm": 2.6695668697357178, + "learning_rate": 4.952336099797449e-05, + "loss": 5.4931, + "step": 10469 + }, + { + "epoch": 0.06226805595204111, + "grad_norm": 2.2714614868164062, + "learning_rate": 4.9523270218258414e-05, + "loss": 5.4481, + "step": 10470 + }, + { + "epoch": 0.0622740032353221, + "grad_norm": 2.035304307937622, + "learning_rate": 4.952317942998151e-05, + "loss": 5.3609, + "step": 10471 + }, + { + "epoch": 0.062279950518603105, + "grad_norm": 2.295647144317627, + "learning_rate": 4.952308863314382e-05, + "loss": 5.5687, + "step": 10472 + }, + { + "epoch": 0.0622858978018841, + "grad_norm": 1.8365178108215332, + "learning_rate": 4.9522997827745375e-05, + "loss": 5.4207, + "step": 10473 + }, + { + "epoch": 0.062291845085165096, + "grad_norm": 1.6130415201187134, + "learning_rate": 4.9522907013786206e-05, + "loss": 5.1894, + "step": 10474 + }, + { + "epoch": 0.06229779236844609, + "grad_norm": 2.01560115814209, + "learning_rate": 4.952281619126634e-05, + "loss": 5.4956, + "step": 10475 + }, + { + "epoch": 0.06230373965172709, + "grad_norm": 2.7854549884796143, + "learning_rate": 4.952272536018582e-05, + "loss": 5.2341, + "step": 10476 + }, + { + "epoch": 0.06230968693500809, + "grad_norm": 2.7532944679260254, + "learning_rate": 4.9522634520544666e-05, + "loss": 5.1863, + "step": 10477 + }, + { + "epoch": 0.06231563421828908, + "grad_norm": 2.193084239959717, + "learning_rate": 4.952254367234291e-05, + "loss": 5.5187, + "step": 10478 + }, + { + "epoch": 0.062321581501570085, + "grad_norm": 2.245664119720459, + "learning_rate": 4.952245281558059e-05, + "loss": 5.1275, + "step": 10479 + }, + { + "epoch": 0.06232752878485108, + "grad_norm": 2.0522654056549072, + "learning_rate": 4.9522361950257734e-05, + "loss": 5.2887, + "step": 10480 + }, + { + "epoch": 0.062333476068132075, + "grad_norm": 2.132280111312866, + "learning_rate": 4.952227107637437e-05, + "loss": 5.8767, + "step": 10481 + }, + { + "epoch": 0.06233942335141308, + "grad_norm": 2.155574083328247, + "learning_rate": 4.952218019393055e-05, + "loss": 5.9499, + "step": 10482 + }, + { + "epoch": 0.06234537063469407, + "grad_norm": 2.3979780673980713, + "learning_rate": 4.952208930292627e-05, + "loss": 5.7622, + "step": 10483 + }, + { + "epoch": 0.06235131791797507, + "grad_norm": 2.444812297821045, + "learning_rate": 4.9521998403361595e-05, + "loss": 5.3332, + "step": 10484 + }, + { + "epoch": 0.06235726520125607, + "grad_norm": 2.369248867034912, + "learning_rate": 4.952190749523654e-05, + "loss": 5.109, + "step": 10485 + }, + { + "epoch": 0.062363212484537064, + "grad_norm": 1.9160844087600708, + "learning_rate": 4.952181657855114e-05, + "loss": 5.1783, + "step": 10486 + }, + { + "epoch": 0.06236915976781806, + "grad_norm": 2.1532788276672363, + "learning_rate": 4.952172565330543e-05, + "loss": 5.913, + "step": 10487 + }, + { + "epoch": 0.062375107051099055, + "grad_norm": 2.132382392883301, + "learning_rate": 4.9521634719499435e-05, + "loss": 5.7748, + "step": 10488 + }, + { + "epoch": 0.06238105433438006, + "grad_norm": 2.22267484664917, + "learning_rate": 4.9521543777133194e-05, + "loss": 5.6464, + "step": 10489 + }, + { + "epoch": 0.06238700161766105, + "grad_norm": 2.0619423389434814, + "learning_rate": 4.952145282620674e-05, + "loss": 5.4881, + "step": 10490 + }, + { + "epoch": 0.06239294890094205, + "grad_norm": 2.9574310779571533, + "learning_rate": 4.952136186672009e-05, + "loss": 5.4401, + "step": 10491 + }, + { + "epoch": 0.06239889618422305, + "grad_norm": 1.7362775802612305, + "learning_rate": 4.952127089867329e-05, + "loss": 6.0755, + "step": 10492 + }, + { + "epoch": 0.062404843467504044, + "grad_norm": 1.8244996070861816, + "learning_rate": 4.952117992206637e-05, + "loss": 6.2588, + "step": 10493 + }, + { + "epoch": 0.06241079075078504, + "grad_norm": 1.8556538820266724, + "learning_rate": 4.952108893689936e-05, + "loss": 6.0827, + "step": 10494 + }, + { + "epoch": 0.06241673803406604, + "grad_norm": 2.2471442222595215, + "learning_rate": 4.9520997943172285e-05, + "loss": 5.98, + "step": 10495 + }, + { + "epoch": 0.062422685317347036, + "grad_norm": 3.0217249393463135, + "learning_rate": 4.9520906940885186e-05, + "loss": 5.5116, + "step": 10496 + }, + { + "epoch": 0.06242863260062803, + "grad_norm": 2.02962064743042, + "learning_rate": 4.9520815930038086e-05, + "loss": 5.9341, + "step": 10497 + }, + { + "epoch": 0.06243457988390903, + "grad_norm": 1.6286019086837769, + "learning_rate": 4.9520724910631034e-05, + "loss": 5.1944, + "step": 10498 + }, + { + "epoch": 0.06244052716719003, + "grad_norm": 1.9963330030441284, + "learning_rate": 4.9520633882664044e-05, + "loss": 6.0584, + "step": 10499 + }, + { + "epoch": 0.06244647445047102, + "grad_norm": 1.884988784790039, + "learning_rate": 4.9520542846137155e-05, + "loss": 6.2744, + "step": 10500 + }, + { + "epoch": 0.062452421733752025, + "grad_norm": 1.9402821063995361, + "learning_rate": 4.95204518010504e-05, + "loss": 5.9201, + "step": 10501 + }, + { + "epoch": 0.06245836901703302, + "grad_norm": 1.9304310083389282, + "learning_rate": 4.9520360747403805e-05, + "loss": 5.7227, + "step": 10502 + }, + { + "epoch": 0.062464316300314016, + "grad_norm": 2.8199663162231445, + "learning_rate": 4.9520269685197405e-05, + "loss": 6.4819, + "step": 10503 + }, + { + "epoch": 0.06247026358359501, + "grad_norm": 1.456852912902832, + "learning_rate": 4.9520178614431236e-05, + "loss": 5.3169, + "step": 10504 + }, + { + "epoch": 0.06247621086687601, + "grad_norm": 2.3753762245178223, + "learning_rate": 4.9520087535105324e-05, + "loss": 5.9817, + "step": 10505 + }, + { + "epoch": 0.06248215815015701, + "grad_norm": 2.329932928085327, + "learning_rate": 4.951999644721971e-05, + "loss": 6.0266, + "step": 10506 + }, + { + "epoch": 0.062488105433438, + "grad_norm": 1.772615671157837, + "learning_rate": 4.951990535077441e-05, + "loss": 5.2548, + "step": 10507 + }, + { + "epoch": 0.062494052716719005, + "grad_norm": 2.1240997314453125, + "learning_rate": 4.951981424576946e-05, + "loss": 5.3991, + "step": 10508 + }, + { + "epoch": 0.0625, + "grad_norm": 1.7283856868743896, + "learning_rate": 4.9519723132204905e-05, + "loss": 5.2065, + "step": 10509 + }, + { + "epoch": 0.062505947283281, + "grad_norm": 2.197404384613037, + "learning_rate": 4.951963201008076e-05, + "loss": 5.7282, + "step": 10510 + }, + { + "epoch": 0.06251189456656199, + "grad_norm": 1.8550727367401123, + "learning_rate": 4.9519540879397075e-05, + "loss": 6.0125, + "step": 10511 + }, + { + "epoch": 0.06251784184984299, + "grad_norm": 1.5998154878616333, + "learning_rate": 4.951944974015387e-05, + "loss": 5.9371, + "step": 10512 + }, + { + "epoch": 0.062523789133124, + "grad_norm": 1.644454836845398, + "learning_rate": 4.951935859235117e-05, + "loss": 5.9315, + "step": 10513 + }, + { + "epoch": 0.06252973641640498, + "grad_norm": 1.9119540452957153, + "learning_rate": 4.951926743598902e-05, + "loss": 5.7104, + "step": 10514 + }, + { + "epoch": 0.06253568369968598, + "grad_norm": 1.8863649368286133, + "learning_rate": 4.951917627106745e-05, + "loss": 5.8639, + "step": 10515 + }, + { + "epoch": 0.06254163098296699, + "grad_norm": 2.1626899242401123, + "learning_rate": 4.951908509758648e-05, + "loss": 5.9727, + "step": 10516 + }, + { + "epoch": 0.06254757826624797, + "grad_norm": 1.9397778511047363, + "learning_rate": 4.9518993915546155e-05, + "loss": 5.9771, + "step": 10517 + }, + { + "epoch": 0.06255352554952898, + "grad_norm": 1.7723463773727417, + "learning_rate": 4.951890272494651e-05, + "loss": 5.8684, + "step": 10518 + }, + { + "epoch": 0.06255947283280998, + "grad_norm": 1.9191977977752686, + "learning_rate": 4.9518811525787565e-05, + "loss": 5.7242, + "step": 10519 + }, + { + "epoch": 0.06256542011609097, + "grad_norm": 1.7599314451217651, + "learning_rate": 4.951872031806935e-05, + "loss": 5.5234, + "step": 10520 + }, + { + "epoch": 0.06257136739937197, + "grad_norm": 1.6560989618301392, + "learning_rate": 4.951862910179191e-05, + "loss": 5.5907, + "step": 10521 + }, + { + "epoch": 0.06257731468265297, + "grad_norm": 1.9756556749343872, + "learning_rate": 4.9518537876955265e-05, + "loss": 6.0013, + "step": 10522 + }, + { + "epoch": 0.06258326196593396, + "grad_norm": 1.9012173414230347, + "learning_rate": 4.9518446643559454e-05, + "loss": 5.8073, + "step": 10523 + }, + { + "epoch": 0.06258920924921496, + "grad_norm": 1.8992196321487427, + "learning_rate": 4.951835540160451e-05, + "loss": 5.8571, + "step": 10524 + }, + { + "epoch": 0.06259515653249595, + "grad_norm": 1.8002395629882812, + "learning_rate": 4.9518264151090455e-05, + "loss": 5.7798, + "step": 10525 + }, + { + "epoch": 0.06260110381577695, + "grad_norm": 1.732063889503479, + "learning_rate": 4.9518172892017335e-05, + "loss": 5.8167, + "step": 10526 + }, + { + "epoch": 0.06260705109905795, + "grad_norm": 1.6961164474487305, + "learning_rate": 4.951808162438517e-05, + "loss": 5.8797, + "step": 10527 + }, + { + "epoch": 0.06261299838233894, + "grad_norm": 1.904102087020874, + "learning_rate": 4.9517990348193996e-05, + "loss": 5.7109, + "step": 10528 + }, + { + "epoch": 0.06261894566561994, + "grad_norm": 1.6908652782440186, + "learning_rate": 4.951789906344384e-05, + "loss": 5.8435, + "step": 10529 + }, + { + "epoch": 0.06262489294890095, + "grad_norm": 1.8550028800964355, + "learning_rate": 4.951780777013475e-05, + "loss": 5.6218, + "step": 10530 + }, + { + "epoch": 0.06263084023218193, + "grad_norm": 1.7106919288635254, + "learning_rate": 4.951771646826674e-05, + "loss": 5.6668, + "step": 10531 + }, + { + "epoch": 0.06263678751546294, + "grad_norm": 1.5522899627685547, + "learning_rate": 4.951762515783984e-05, + "loss": 5.418, + "step": 10532 + }, + { + "epoch": 0.06264273479874394, + "grad_norm": 1.7510137557983398, + "learning_rate": 4.9517533838854104e-05, + "loss": 5.6595, + "step": 10533 + }, + { + "epoch": 0.06264868208202493, + "grad_norm": 2.1222739219665527, + "learning_rate": 4.9517442511309544e-05, + "loss": 6.0008, + "step": 10534 + }, + { + "epoch": 0.06265462936530593, + "grad_norm": 1.977807641029358, + "learning_rate": 4.95173511752062e-05, + "loss": 5.8263, + "step": 10535 + }, + { + "epoch": 0.06266057664858693, + "grad_norm": 1.6423957347869873, + "learning_rate": 4.9517259830544105e-05, + "loss": 6.2078, + "step": 10536 + }, + { + "epoch": 0.06266652393186792, + "grad_norm": 1.9365674257278442, + "learning_rate": 4.9517168477323286e-05, + "loss": 6.0972, + "step": 10537 + }, + { + "epoch": 0.06267247121514892, + "grad_norm": 1.6738137006759644, + "learning_rate": 4.951707711554377e-05, + "loss": 5.7439, + "step": 10538 + }, + { + "epoch": 0.06267841849842992, + "grad_norm": 2.4281718730926514, + "learning_rate": 4.95169857452056e-05, + "loss": 5.4822, + "step": 10539 + }, + { + "epoch": 0.06268436578171091, + "grad_norm": 2.53411602973938, + "learning_rate": 4.951689436630881e-05, + "loss": 5.4883, + "step": 10540 + }, + { + "epoch": 0.06269031306499191, + "grad_norm": 2.116520643234253, + "learning_rate": 4.951680297885342e-05, + "loss": 5.6123, + "step": 10541 + }, + { + "epoch": 0.06269626034827291, + "grad_norm": 1.8546512126922607, + "learning_rate": 4.951671158283946e-05, + "loss": 5.443, + "step": 10542 + }, + { + "epoch": 0.0627022076315539, + "grad_norm": 2.0048365592956543, + "learning_rate": 4.9516620178266975e-05, + "loss": 5.7759, + "step": 10543 + }, + { + "epoch": 0.0627081549148349, + "grad_norm": 1.6800916194915771, + "learning_rate": 4.9516528765136e-05, + "loss": 5.6767, + "step": 10544 + }, + { + "epoch": 0.0627141021981159, + "grad_norm": 1.7444523572921753, + "learning_rate": 4.9516437343446544e-05, + "loss": 5.297, + "step": 10545 + }, + { + "epoch": 0.0627200494813969, + "grad_norm": 1.8653407096862793, + "learning_rate": 4.951634591319866e-05, + "loss": 5.6999, + "step": 10546 + }, + { + "epoch": 0.0627259967646779, + "grad_norm": 1.7988131046295166, + "learning_rate": 4.9516254474392376e-05, + "loss": 5.5244, + "step": 10547 + }, + { + "epoch": 0.0627319440479589, + "grad_norm": 1.7915012836456299, + "learning_rate": 4.951616302702772e-05, + "loss": 5.6766, + "step": 10548 + }, + { + "epoch": 0.06273789133123989, + "grad_norm": 1.8351629972457886, + "learning_rate": 4.951607157110471e-05, + "loss": 5.6332, + "step": 10549 + }, + { + "epoch": 0.06274383861452089, + "grad_norm": 1.6819947957992554, + "learning_rate": 4.951598010662341e-05, + "loss": 5.5773, + "step": 10550 + }, + { + "epoch": 0.06274978589780189, + "grad_norm": 2.2969119548797607, + "learning_rate": 4.951588863358383e-05, + "loss": 5.6847, + "step": 10551 + }, + { + "epoch": 0.06275573318108288, + "grad_norm": 2.346092939376831, + "learning_rate": 4.951579715198601e-05, + "loss": 5.404, + "step": 10552 + }, + { + "epoch": 0.06276168046436388, + "grad_norm": 1.8255709409713745, + "learning_rate": 4.951570566182997e-05, + "loss": 5.9009, + "step": 10553 + }, + { + "epoch": 0.06276762774764487, + "grad_norm": 2.4000492095947266, + "learning_rate": 4.951561416311575e-05, + "loss": 5.4395, + "step": 10554 + }, + { + "epoch": 0.06277357503092587, + "grad_norm": 2.1519010066986084, + "learning_rate": 4.951552265584339e-05, + "loss": 5.6447, + "step": 10555 + }, + { + "epoch": 0.06277952231420687, + "grad_norm": 1.7821810245513916, + "learning_rate": 4.9515431140012915e-05, + "loss": 5.3495, + "step": 10556 + }, + { + "epoch": 0.06278546959748786, + "grad_norm": 1.8359061479568481, + "learning_rate": 4.9515339615624356e-05, + "loss": 5.7258, + "step": 10557 + }, + { + "epoch": 0.06279141688076886, + "grad_norm": 1.899970293045044, + "learning_rate": 4.951524808267774e-05, + "loss": 5.9683, + "step": 10558 + }, + { + "epoch": 0.06279736416404987, + "grad_norm": 1.6407743692398071, + "learning_rate": 4.951515654117311e-05, + "loss": 6.001, + "step": 10559 + }, + { + "epoch": 0.06280331144733085, + "grad_norm": 1.5474567413330078, + "learning_rate": 4.9515064991110485e-05, + "loss": 5.673, + "step": 10560 + }, + { + "epoch": 0.06280925873061186, + "grad_norm": 1.7129321098327637, + "learning_rate": 4.951497343248991e-05, + "loss": 5.7232, + "step": 10561 + }, + { + "epoch": 0.06281520601389286, + "grad_norm": 1.948367953300476, + "learning_rate": 4.95148818653114e-05, + "loss": 5.9378, + "step": 10562 + }, + { + "epoch": 0.06282115329717385, + "grad_norm": 1.788724422454834, + "learning_rate": 4.951479028957501e-05, + "loss": 5.9077, + "step": 10563 + }, + { + "epoch": 0.06282710058045485, + "grad_norm": 1.7036423683166504, + "learning_rate": 4.951469870528076e-05, + "loss": 5.7688, + "step": 10564 + }, + { + "epoch": 0.06283304786373585, + "grad_norm": 1.6055458784103394, + "learning_rate": 4.9514607112428676e-05, + "loss": 5.7234, + "step": 10565 + }, + { + "epoch": 0.06283899514701684, + "grad_norm": 1.9353829622268677, + "learning_rate": 4.95145155110188e-05, + "loss": 6.1046, + "step": 10566 + }, + { + "epoch": 0.06284494243029784, + "grad_norm": 1.6070129871368408, + "learning_rate": 4.9514423901051157e-05, + "loss": 5.7379, + "step": 10567 + }, + { + "epoch": 0.06285088971357884, + "grad_norm": 1.447828769683838, + "learning_rate": 4.951433228252579e-05, + "loss": 5.2944, + "step": 10568 + }, + { + "epoch": 0.06285683699685983, + "grad_norm": 2.5256540775299072, + "learning_rate": 4.951424065544271e-05, + "loss": 5.1358, + "step": 10569 + }, + { + "epoch": 0.06286278428014083, + "grad_norm": 2.29848051071167, + "learning_rate": 4.951414901980197e-05, + "loss": 5.1967, + "step": 10570 + }, + { + "epoch": 0.06286873156342183, + "grad_norm": 1.9477180242538452, + "learning_rate": 4.951405737560359e-05, + "loss": 5.7509, + "step": 10571 + }, + { + "epoch": 0.06287467884670282, + "grad_norm": 1.9303146600723267, + "learning_rate": 4.951396572284761e-05, + "loss": 5.7052, + "step": 10572 + }, + { + "epoch": 0.06288062612998382, + "grad_norm": 1.5632199048995972, + "learning_rate": 4.951387406153405e-05, + "loss": 5.5001, + "step": 10573 + }, + { + "epoch": 0.06288657341326483, + "grad_norm": 1.6798962354660034, + "learning_rate": 4.951378239166296e-05, + "loss": 5.5537, + "step": 10574 + }, + { + "epoch": 0.06289252069654581, + "grad_norm": 1.7395051717758179, + "learning_rate": 4.9513690713234355e-05, + "loss": 5.736, + "step": 10575 + }, + { + "epoch": 0.06289846797982682, + "grad_norm": 1.726020097732544, + "learning_rate": 4.951359902624828e-05, + "loss": 5.6802, + "step": 10576 + }, + { + "epoch": 0.06290441526310782, + "grad_norm": 1.8063993453979492, + "learning_rate": 4.9513507330704755e-05, + "loss": 5.6077, + "step": 10577 + }, + { + "epoch": 0.0629103625463888, + "grad_norm": 1.6284246444702148, + "learning_rate": 4.951341562660382e-05, + "loss": 5.8327, + "step": 10578 + }, + { + "epoch": 0.06291630982966981, + "grad_norm": 2.635869026184082, + "learning_rate": 4.95133239139455e-05, + "loss": 5.8252, + "step": 10579 + }, + { + "epoch": 0.06292225711295081, + "grad_norm": 2.5127367973327637, + "learning_rate": 4.9513232192729845e-05, + "loss": 5.7431, + "step": 10580 + }, + { + "epoch": 0.0629282043962318, + "grad_norm": 2.0740721225738525, + "learning_rate": 4.951314046295686e-05, + "loss": 5.4582, + "step": 10581 + }, + { + "epoch": 0.0629341516795128, + "grad_norm": 2.32232666015625, + "learning_rate": 4.95130487246266e-05, + "loss": 5.2523, + "step": 10582 + }, + { + "epoch": 0.06294009896279379, + "grad_norm": 2.164407730102539, + "learning_rate": 4.951295697773908e-05, + "loss": 5.6436, + "step": 10583 + }, + { + "epoch": 0.06294604624607479, + "grad_norm": 1.7207856178283691, + "learning_rate": 4.951286522229435e-05, + "loss": 5.5333, + "step": 10584 + }, + { + "epoch": 0.0629519935293558, + "grad_norm": 2.025470733642578, + "learning_rate": 4.951277345829242e-05, + "loss": 5.5041, + "step": 10585 + }, + { + "epoch": 0.06295794081263678, + "grad_norm": 1.9415414333343506, + "learning_rate": 4.951268168573334e-05, + "loss": 5.2148, + "step": 10586 + }, + { + "epoch": 0.06296388809591778, + "grad_norm": 1.9229072332382202, + "learning_rate": 4.9512589904617135e-05, + "loss": 5.1461, + "step": 10587 + }, + { + "epoch": 0.06296983537919879, + "grad_norm": 2.414041757583618, + "learning_rate": 4.951249811494384e-05, + "loss": 5.5023, + "step": 10588 + }, + { + "epoch": 0.06297578266247977, + "grad_norm": 2.49826979637146, + "learning_rate": 4.9512406316713486e-05, + "loss": 5.3566, + "step": 10589 + }, + { + "epoch": 0.06298172994576078, + "grad_norm": 1.7222081422805786, + "learning_rate": 4.951231450992611e-05, + "loss": 5.3128, + "step": 10590 + }, + { + "epoch": 0.06298767722904178, + "grad_norm": 1.7181445360183716, + "learning_rate": 4.9512222694581725e-05, + "loss": 5.4598, + "step": 10591 + }, + { + "epoch": 0.06299362451232277, + "grad_norm": 1.547813892364502, + "learning_rate": 4.9512130870680385e-05, + "loss": 5.3997, + "step": 10592 + }, + { + "epoch": 0.06299957179560377, + "grad_norm": 1.6273536682128906, + "learning_rate": 4.95120390382221e-05, + "loss": 5.1668, + "step": 10593 + }, + { + "epoch": 0.06300551907888477, + "grad_norm": 1.6771745681762695, + "learning_rate": 4.9511947197206934e-05, + "loss": 5.2368, + "step": 10594 + }, + { + "epoch": 0.06301146636216576, + "grad_norm": 2.439664125442505, + "learning_rate": 4.951185534763489e-05, + "loss": 5.2178, + "step": 10595 + }, + { + "epoch": 0.06301741364544676, + "grad_norm": 2.194408655166626, + "learning_rate": 4.951176348950601e-05, + "loss": 5.3593, + "step": 10596 + }, + { + "epoch": 0.06302336092872776, + "grad_norm": 1.8977370262145996, + "learning_rate": 4.9511671622820334e-05, + "loss": 6.3141, + "step": 10597 + }, + { + "epoch": 0.06302930821200875, + "grad_norm": 1.9550800323486328, + "learning_rate": 4.951157974757789e-05, + "loss": 5.8944, + "step": 10598 + }, + { + "epoch": 0.06303525549528975, + "grad_norm": 1.764724612236023, + "learning_rate": 4.9511487863778693e-05, + "loss": 5.5796, + "step": 10599 + }, + { + "epoch": 0.06304120277857075, + "grad_norm": 1.7987425327301025, + "learning_rate": 4.951139597142279e-05, + "loss": 5.5231, + "step": 10600 + }, + { + "epoch": 0.06304715006185174, + "grad_norm": 1.495875358581543, + "learning_rate": 4.951130407051022e-05, + "loss": 5.5019, + "step": 10601 + }, + { + "epoch": 0.06305309734513274, + "grad_norm": 2.7586476802825928, + "learning_rate": 4.9511212161041e-05, + "loss": 5.7043, + "step": 10602 + }, + { + "epoch": 0.06305904462841375, + "grad_norm": 2.1746270656585693, + "learning_rate": 4.951112024301517e-05, + "loss": 5.351, + "step": 10603 + }, + { + "epoch": 0.06306499191169473, + "grad_norm": 1.8681105375289917, + "learning_rate": 4.951102831643277e-05, + "loss": 5.4847, + "step": 10604 + }, + { + "epoch": 0.06307093919497574, + "grad_norm": 1.772286057472229, + "learning_rate": 4.951093638129382e-05, + "loss": 5.767, + "step": 10605 + }, + { + "epoch": 0.06307688647825674, + "grad_norm": 1.847748875617981, + "learning_rate": 4.951084443759835e-05, + "loss": 5.7737, + "step": 10606 + }, + { + "epoch": 0.06308283376153773, + "grad_norm": 1.9219080209732056, + "learning_rate": 4.95107524853464e-05, + "loss": 5.9414, + "step": 10607 + }, + { + "epoch": 0.06308878104481873, + "grad_norm": 1.6497199535369873, + "learning_rate": 4.9510660524538e-05, + "loss": 5.7124, + "step": 10608 + }, + { + "epoch": 0.06309472832809973, + "grad_norm": 1.8772788047790527, + "learning_rate": 4.951056855517318e-05, + "loss": 5.6784, + "step": 10609 + }, + { + "epoch": 0.06310067561138072, + "grad_norm": 2.035104990005493, + "learning_rate": 4.951047657725197e-05, + "loss": 5.5975, + "step": 10610 + }, + { + "epoch": 0.06310662289466172, + "grad_norm": 2.000922918319702, + "learning_rate": 4.9510384590774414e-05, + "loss": 5.2133, + "step": 10611 + }, + { + "epoch": 0.06311257017794271, + "grad_norm": 2.2581655979156494, + "learning_rate": 4.9510292595740536e-05, + "loss": 5.468, + "step": 10612 + }, + { + "epoch": 0.06311851746122371, + "grad_norm": 2.0332419872283936, + "learning_rate": 4.9510200592150365e-05, + "loss": 5.4923, + "step": 10613 + }, + { + "epoch": 0.06312446474450471, + "grad_norm": 1.9499238729476929, + "learning_rate": 4.9510108580003934e-05, + "loss": 5.5535, + "step": 10614 + }, + { + "epoch": 0.0631304120277857, + "grad_norm": 2.017491579055786, + "learning_rate": 4.951001655930128e-05, + "loss": 5.3771, + "step": 10615 + }, + { + "epoch": 0.0631363593110667, + "grad_norm": 2.355508804321289, + "learning_rate": 4.950992453004243e-05, + "loss": 5.0035, + "step": 10616 + }, + { + "epoch": 0.0631423065943477, + "grad_norm": 2.0470683574676514, + "learning_rate": 4.9509832492227426e-05, + "loss": 5.6073, + "step": 10617 + }, + { + "epoch": 0.0631482538776287, + "grad_norm": 1.7955858707427979, + "learning_rate": 4.9509740445856284e-05, + "loss": 5.8097, + "step": 10618 + }, + { + "epoch": 0.0631542011609097, + "grad_norm": 2.0126395225524902, + "learning_rate": 4.9509648390929045e-05, + "loss": 5.5989, + "step": 10619 + }, + { + "epoch": 0.0631601484441907, + "grad_norm": 1.8632375001907349, + "learning_rate": 4.950955632744575e-05, + "loss": 5.5585, + "step": 10620 + }, + { + "epoch": 0.06316609572747169, + "grad_norm": 2.2190446853637695, + "learning_rate": 4.950946425540641e-05, + "loss": 5.5182, + "step": 10621 + }, + { + "epoch": 0.06317204301075269, + "grad_norm": 2.082871675491333, + "learning_rate": 4.9509372174811074e-05, + "loss": 5.7849, + "step": 10622 + }, + { + "epoch": 0.06317799029403369, + "grad_norm": 2.17744517326355, + "learning_rate": 4.9509280085659774e-05, + "loss": 5.2332, + "step": 10623 + }, + { + "epoch": 0.06318393757731468, + "grad_norm": 1.7662746906280518, + "learning_rate": 4.950918798795253e-05, + "loss": 5.4136, + "step": 10624 + }, + { + "epoch": 0.06318988486059568, + "grad_norm": 1.6879531145095825, + "learning_rate": 4.950909588168939e-05, + "loss": 5.3747, + "step": 10625 + }, + { + "epoch": 0.06319583214387668, + "grad_norm": 2.0174877643585205, + "learning_rate": 4.950900376687038e-05, + "loss": 5.2927, + "step": 10626 + }, + { + "epoch": 0.06320177942715767, + "grad_norm": 1.9052749872207642, + "learning_rate": 4.950891164349552e-05, + "loss": 5.1492, + "step": 10627 + }, + { + "epoch": 0.06320772671043867, + "grad_norm": 1.7647850513458252, + "learning_rate": 4.950881951156485e-05, + "loss": 5.4182, + "step": 10628 + }, + { + "epoch": 0.06321367399371967, + "grad_norm": 1.9794502258300781, + "learning_rate": 4.950872737107841e-05, + "loss": 5.3838, + "step": 10629 + }, + { + "epoch": 0.06321962127700066, + "grad_norm": 2.3403780460357666, + "learning_rate": 4.950863522203623e-05, + "loss": 5.4542, + "step": 10630 + }, + { + "epoch": 0.06322556856028166, + "grad_norm": 1.8747358322143555, + "learning_rate": 4.9508543064438336e-05, + "loss": 5.4949, + "step": 10631 + }, + { + "epoch": 0.06323151584356267, + "grad_norm": 1.9435046911239624, + "learning_rate": 4.950845089828476e-05, + "loss": 5.6136, + "step": 10632 + }, + { + "epoch": 0.06323746312684365, + "grad_norm": 2.095583438873291, + "learning_rate": 4.9508358723575544e-05, + "loss": 5.2864, + "step": 10633 + }, + { + "epoch": 0.06324341041012466, + "grad_norm": 1.8254145383834839, + "learning_rate": 4.9508266540310705e-05, + "loss": 5.4732, + "step": 10634 + }, + { + "epoch": 0.06324935769340566, + "grad_norm": 2.303638458251953, + "learning_rate": 4.950817434849029e-05, + "loss": 5.1501, + "step": 10635 + }, + { + "epoch": 0.06325530497668665, + "grad_norm": 2.5389420986175537, + "learning_rate": 4.950808214811432e-05, + "loss": 5.0723, + "step": 10636 + }, + { + "epoch": 0.06326125225996765, + "grad_norm": 2.1702539920806885, + "learning_rate": 4.950798993918283e-05, + "loss": 4.8838, + "step": 10637 + }, + { + "epoch": 0.06326719954324865, + "grad_norm": 1.921650767326355, + "learning_rate": 4.9507897721695855e-05, + "loss": 5.9958, + "step": 10638 + }, + { + "epoch": 0.06327314682652964, + "grad_norm": 2.2247352600097656, + "learning_rate": 4.950780549565343e-05, + "loss": 4.9319, + "step": 10639 + }, + { + "epoch": 0.06327909410981064, + "grad_norm": 2.3517649173736572, + "learning_rate": 4.950771326105558e-05, + "loss": 4.6033, + "step": 10640 + }, + { + "epoch": 0.06328504139309163, + "grad_norm": 2.053856134414673, + "learning_rate": 4.950762101790234e-05, + "loss": 4.3799, + "step": 10641 + }, + { + "epoch": 0.06329098867637263, + "grad_norm": 1.8055500984191895, + "learning_rate": 4.9507528766193746e-05, + "loss": 5.244, + "step": 10642 + }, + { + "epoch": 0.06329693595965363, + "grad_norm": 2.0694682598114014, + "learning_rate": 4.950743650592983e-05, + "loss": 5.1965, + "step": 10643 + }, + { + "epoch": 0.06330288324293462, + "grad_norm": 2.027399778366089, + "learning_rate": 4.950734423711061e-05, + "loss": 4.5576, + "step": 10644 + }, + { + "epoch": 0.06330883052621562, + "grad_norm": 2.22308087348938, + "learning_rate": 4.950725195973614e-05, + "loss": 4.4679, + "step": 10645 + }, + { + "epoch": 0.06331477780949663, + "grad_norm": 2.1807515621185303, + "learning_rate": 4.9507159673806436e-05, + "loss": 4.6147, + "step": 10646 + }, + { + "epoch": 0.06332072509277761, + "grad_norm": 2.0173258781433105, + "learning_rate": 4.9507067379321536e-05, + "loss": 4.5657, + "step": 10647 + }, + { + "epoch": 0.06332667237605862, + "grad_norm": 1.832610845565796, + "learning_rate": 4.9506975076281474e-05, + "loss": 4.7433, + "step": 10648 + }, + { + "epoch": 0.06333261965933962, + "grad_norm": 2.027352809906006, + "learning_rate": 4.950688276468628e-05, + "loss": 5.0426, + "step": 10649 + }, + { + "epoch": 0.0633385669426206, + "grad_norm": 1.856307864189148, + "learning_rate": 4.950679044453599e-05, + "loss": 5.2838, + "step": 10650 + }, + { + "epoch": 0.06334451422590161, + "grad_norm": 2.0875375270843506, + "learning_rate": 4.950669811583062e-05, + "loss": 4.5728, + "step": 10651 + }, + { + "epoch": 0.06335046150918261, + "grad_norm": 2.1067941188812256, + "learning_rate": 4.950660577857023e-05, + "loss": 4.5313, + "step": 10652 + }, + { + "epoch": 0.0633564087924636, + "grad_norm": 2.1747500896453857, + "learning_rate": 4.9506513432754825e-05, + "loss": 4.432, + "step": 10653 + }, + { + "epoch": 0.0633623560757446, + "grad_norm": 1.769059181213379, + "learning_rate": 4.950642107838446e-05, + "loss": 5.4667, + "step": 10654 + }, + { + "epoch": 0.0633683033590256, + "grad_norm": 2.2065072059631348, + "learning_rate": 4.9506328715459146e-05, + "loss": 5.9873, + "step": 10655 + }, + { + "epoch": 0.06337425064230659, + "grad_norm": 1.679431438446045, + "learning_rate": 4.950623634397893e-05, + "loss": 5.851, + "step": 10656 + }, + { + "epoch": 0.06338019792558759, + "grad_norm": 1.919668197631836, + "learning_rate": 4.950614396394384e-05, + "loss": 5.8613, + "step": 10657 + }, + { + "epoch": 0.0633861452088686, + "grad_norm": 1.5296612977981567, + "learning_rate": 4.9506051575353915e-05, + "loss": 5.7067, + "step": 10658 + }, + { + "epoch": 0.06339209249214958, + "grad_norm": 2.1283507347106934, + "learning_rate": 4.950595917820917e-05, + "loss": 5.1141, + "step": 10659 + }, + { + "epoch": 0.06339803977543058, + "grad_norm": 1.7011604309082031, + "learning_rate": 4.950586677250966e-05, + "loss": 6.0463, + "step": 10660 + }, + { + "epoch": 0.06340398705871159, + "grad_norm": 1.7479497194290161, + "learning_rate": 4.9505774358255396e-05, + "loss": 5.8942, + "step": 10661 + }, + { + "epoch": 0.06340993434199257, + "grad_norm": 1.939471960067749, + "learning_rate": 4.950568193544642e-05, + "loss": 5.562, + "step": 10662 + }, + { + "epoch": 0.06341588162527358, + "grad_norm": 1.871993899345398, + "learning_rate": 4.9505589504082764e-05, + "loss": 5.746, + "step": 10663 + }, + { + "epoch": 0.06342182890855458, + "grad_norm": 2.173109292984009, + "learning_rate": 4.950549706416446e-05, + "loss": 5.5927, + "step": 10664 + }, + { + "epoch": 0.06342777619183557, + "grad_norm": 1.809971809387207, + "learning_rate": 4.950540461569154e-05, + "loss": 5.8983, + "step": 10665 + }, + { + "epoch": 0.06343372347511657, + "grad_norm": 1.6344120502471924, + "learning_rate": 4.950531215866404e-05, + "loss": 5.5301, + "step": 10666 + }, + { + "epoch": 0.06343967075839757, + "grad_norm": 2.080425500869751, + "learning_rate": 4.9505219693081985e-05, + "loss": 6.0214, + "step": 10667 + }, + { + "epoch": 0.06344561804167856, + "grad_norm": 1.9382790327072144, + "learning_rate": 4.9505127218945415e-05, + "loss": 5.676, + "step": 10668 + }, + { + "epoch": 0.06345156532495956, + "grad_norm": 1.6945782899856567, + "learning_rate": 4.9505034736254354e-05, + "loss": 5.9337, + "step": 10669 + }, + { + "epoch": 0.06345751260824055, + "grad_norm": 1.6129313707351685, + "learning_rate": 4.9504942245008836e-05, + "loss": 5.6561, + "step": 10670 + }, + { + "epoch": 0.06346345989152155, + "grad_norm": 2.002903461456299, + "learning_rate": 4.95048497452089e-05, + "loss": 5.6302, + "step": 10671 + }, + { + "epoch": 0.06346940717480255, + "grad_norm": 1.6016403436660767, + "learning_rate": 4.950475723685457e-05, + "loss": 5.8275, + "step": 10672 + }, + { + "epoch": 0.06347535445808354, + "grad_norm": 1.7645297050476074, + "learning_rate": 4.9504664719945895e-05, + "loss": 5.5541, + "step": 10673 + }, + { + "epoch": 0.06348130174136454, + "grad_norm": 1.9627439975738525, + "learning_rate": 4.950457219448288e-05, + "loss": 5.6425, + "step": 10674 + }, + { + "epoch": 0.06348724902464555, + "grad_norm": 1.6297314167022705, + "learning_rate": 4.950447966046558e-05, + "loss": 5.5735, + "step": 10675 + }, + { + "epoch": 0.06349319630792653, + "grad_norm": 1.7911304235458374, + "learning_rate": 4.9504387117894014e-05, + "loss": 5.7736, + "step": 10676 + }, + { + "epoch": 0.06349914359120754, + "grad_norm": 1.627543330192566, + "learning_rate": 4.950429456676823e-05, + "loss": 5.736, + "step": 10677 + }, + { + "epoch": 0.06350509087448854, + "grad_norm": 1.9574320316314697, + "learning_rate": 4.950420200708824e-05, + "loss": 5.365, + "step": 10678 + }, + { + "epoch": 0.06351103815776953, + "grad_norm": 1.7698450088500977, + "learning_rate": 4.950410943885408e-05, + "loss": 5.5742, + "step": 10679 + }, + { + "epoch": 0.06351698544105053, + "grad_norm": 1.7660366296768188, + "learning_rate": 4.9504016862065806e-05, + "loss": 5.9064, + "step": 10680 + }, + { + "epoch": 0.06352293272433153, + "grad_norm": 2.0279083251953125, + "learning_rate": 4.9503924276723425e-05, + "loss": 5.7938, + "step": 10681 + }, + { + "epoch": 0.06352888000761252, + "grad_norm": 2.101827621459961, + "learning_rate": 4.9503831682826974e-05, + "loss": 5.4898, + "step": 10682 + }, + { + "epoch": 0.06353482729089352, + "grad_norm": 2.04978084564209, + "learning_rate": 4.9503739080376486e-05, + "loss": 5.3753, + "step": 10683 + }, + { + "epoch": 0.06354077457417452, + "grad_norm": 1.8539999723434448, + "learning_rate": 4.950364646937201e-05, + "loss": 5.5575, + "step": 10684 + }, + { + "epoch": 0.06354672185745551, + "grad_norm": 2.077073097229004, + "learning_rate": 4.9503553849813556e-05, + "loss": 5.4628, + "step": 10685 + }, + { + "epoch": 0.06355266914073651, + "grad_norm": 1.8130167722702026, + "learning_rate": 4.950346122170116e-05, + "loss": 5.1648, + "step": 10686 + }, + { + "epoch": 0.06355861642401751, + "grad_norm": 1.810944676399231, + "learning_rate": 4.950336858503486e-05, + "loss": 5.8371, + "step": 10687 + }, + { + "epoch": 0.0635645637072985, + "grad_norm": 2.0081756114959717, + "learning_rate": 4.950327593981469e-05, + "loss": 5.6933, + "step": 10688 + }, + { + "epoch": 0.0635705109905795, + "grad_norm": 1.5824620723724365, + "learning_rate": 4.950318328604068e-05, + "loss": 5.4494, + "step": 10689 + }, + { + "epoch": 0.0635764582738605, + "grad_norm": 1.6470626592636108, + "learning_rate": 4.950309062371286e-05, + "loss": 6.2401, + "step": 10690 + }, + { + "epoch": 0.0635824055571415, + "grad_norm": 1.799074649810791, + "learning_rate": 4.950299795283127e-05, + "loss": 6.1075, + "step": 10691 + }, + { + "epoch": 0.0635883528404225, + "grad_norm": 2.0551035404205322, + "learning_rate": 4.950290527339593e-05, + "loss": 5.6646, + "step": 10692 + }, + { + "epoch": 0.0635943001237035, + "grad_norm": 2.3543875217437744, + "learning_rate": 4.9502812585406875e-05, + "loss": 4.9341, + "step": 10693 + }, + { + "epoch": 0.06360024740698449, + "grad_norm": 2.0479071140289307, + "learning_rate": 4.950271988886415e-05, + "loss": 5.3351, + "step": 10694 + }, + { + "epoch": 0.06360619469026549, + "grad_norm": 1.9331302642822266, + "learning_rate": 4.950262718376778e-05, + "loss": 5.6269, + "step": 10695 + }, + { + "epoch": 0.06361214197354649, + "grad_norm": 1.9922640323638916, + "learning_rate": 4.950253447011779e-05, + "loss": 5.5113, + "step": 10696 + }, + { + "epoch": 0.06361808925682748, + "grad_norm": 1.769916296005249, + "learning_rate": 4.950244174791422e-05, + "loss": 5.5902, + "step": 10697 + }, + { + "epoch": 0.06362403654010848, + "grad_norm": 2.8808071613311768, + "learning_rate": 4.95023490171571e-05, + "loss": 4.9506, + "step": 10698 + }, + { + "epoch": 0.06362998382338947, + "grad_norm": 2.0609331130981445, + "learning_rate": 4.9502256277846466e-05, + "loss": 5.4256, + "step": 10699 + }, + { + "epoch": 0.06363593110667047, + "grad_norm": 2.0112223625183105, + "learning_rate": 4.950216352998234e-05, + "loss": 6.1121, + "step": 10700 + }, + { + "epoch": 0.06364187838995147, + "grad_norm": 1.5665667057037354, + "learning_rate": 4.9502070773564765e-05, + "loss": 5.1959, + "step": 10701 + }, + { + "epoch": 0.06364782567323246, + "grad_norm": 1.9731864929199219, + "learning_rate": 4.9501978008593774e-05, + "loss": 5.2887, + "step": 10702 + }, + { + "epoch": 0.06365377295651346, + "grad_norm": 1.7925242185592651, + "learning_rate": 4.9501885235069404e-05, + "loss": 5.7386, + "step": 10703 + }, + { + "epoch": 0.06365972023979447, + "grad_norm": 1.6686629056930542, + "learning_rate": 4.950179245299166e-05, + "loss": 5.7279, + "step": 10704 + }, + { + "epoch": 0.06366566752307545, + "grad_norm": 2.034392833709717, + "learning_rate": 4.95016996623606e-05, + "loss": 5.6148, + "step": 10705 + }, + { + "epoch": 0.06367161480635646, + "grad_norm": 2.1711995601654053, + "learning_rate": 4.9501606863176254e-05, + "loss": 5.7088, + "step": 10706 + }, + { + "epoch": 0.06367756208963746, + "grad_norm": 2.3276829719543457, + "learning_rate": 4.950151405543865e-05, + "loss": 5.3658, + "step": 10707 + }, + { + "epoch": 0.06368350937291845, + "grad_norm": 2.174130916595459, + "learning_rate": 4.9501421239147824e-05, + "loss": 5.3459, + "step": 10708 + }, + { + "epoch": 0.06368945665619945, + "grad_norm": 1.8721747398376465, + "learning_rate": 4.9501328414303794e-05, + "loss": 5.3375, + "step": 10709 + }, + { + "epoch": 0.06369540393948045, + "grad_norm": 1.8677324056625366, + "learning_rate": 4.9501235580906615e-05, + "loss": 5.8192, + "step": 10710 + }, + { + "epoch": 0.06370135122276144, + "grad_norm": 2.0901246070861816, + "learning_rate": 4.9501142738956294e-05, + "loss": 6.1188, + "step": 10711 + }, + { + "epoch": 0.06370729850604244, + "grad_norm": 1.7860997915267944, + "learning_rate": 4.9501049888452885e-05, + "loss": 5.4011, + "step": 10712 + }, + { + "epoch": 0.06371324578932344, + "grad_norm": 2.000946283340454, + "learning_rate": 4.950095702939642e-05, + "loss": 5.16, + "step": 10713 + }, + { + "epoch": 0.06371919307260443, + "grad_norm": 2.47086501121521, + "learning_rate": 4.950086416178691e-05, + "loss": 5.1543, + "step": 10714 + }, + { + "epoch": 0.06372514035588543, + "grad_norm": 1.8694473505020142, + "learning_rate": 4.9500771285624415e-05, + "loss": 5.3576, + "step": 10715 + }, + { + "epoch": 0.06373108763916643, + "grad_norm": 1.8921676874160767, + "learning_rate": 4.9500678400908946e-05, + "loss": 5.0827, + "step": 10716 + }, + { + "epoch": 0.06373703492244742, + "grad_norm": 1.8423974514007568, + "learning_rate": 4.950058550764054e-05, + "loss": 4.9912, + "step": 10717 + }, + { + "epoch": 0.06374298220572842, + "grad_norm": 1.6893757581710815, + "learning_rate": 4.950049260581924e-05, + "loss": 5.2792, + "step": 10718 + }, + { + "epoch": 0.06374892948900943, + "grad_norm": 1.720799446105957, + "learning_rate": 4.950039969544507e-05, + "loss": 5.4355, + "step": 10719 + }, + { + "epoch": 0.06375487677229041, + "grad_norm": 1.717527151107788, + "learning_rate": 4.9500306776518065e-05, + "loss": 5.2802, + "step": 10720 + }, + { + "epoch": 0.06376082405557142, + "grad_norm": 1.876207947731018, + "learning_rate": 4.950021384903825e-05, + "loss": 5.4667, + "step": 10721 + }, + { + "epoch": 0.06376677133885242, + "grad_norm": 1.7892308235168457, + "learning_rate": 4.9500120913005666e-05, + "loss": 5.6635, + "step": 10722 + }, + { + "epoch": 0.0637727186221334, + "grad_norm": 1.828092336654663, + "learning_rate": 4.950002796842034e-05, + "loss": 5.5301, + "step": 10723 + }, + { + "epoch": 0.06377866590541441, + "grad_norm": 1.5860785245895386, + "learning_rate": 4.949993501528232e-05, + "loss": 5.2337, + "step": 10724 + }, + { + "epoch": 0.06378461318869541, + "grad_norm": 1.731295108795166, + "learning_rate": 4.949984205359161e-05, + "loss": 5.4115, + "step": 10725 + }, + { + "epoch": 0.0637905604719764, + "grad_norm": 2.194288969039917, + "learning_rate": 4.949974908334827e-05, + "loss": 5.4736, + "step": 10726 + }, + { + "epoch": 0.0637965077552574, + "grad_norm": 1.6036415100097656, + "learning_rate": 4.949965610455231e-05, + "loss": 5.4563, + "step": 10727 + }, + { + "epoch": 0.06380245503853839, + "grad_norm": 1.6228232383728027, + "learning_rate": 4.949956311720378e-05, + "loss": 5.4695, + "step": 10728 + }, + { + "epoch": 0.06380840232181939, + "grad_norm": 1.3040069341659546, + "learning_rate": 4.94994701213027e-05, + "loss": 5.0126, + "step": 10729 + }, + { + "epoch": 0.06381434960510039, + "grad_norm": 1.5976930856704712, + "learning_rate": 4.9499377116849116e-05, + "loss": 5.0165, + "step": 10730 + }, + { + "epoch": 0.06382029688838138, + "grad_norm": 1.5877797603607178, + "learning_rate": 4.9499284103843046e-05, + "loss": 5.1634, + "step": 10731 + }, + { + "epoch": 0.06382624417166238, + "grad_norm": 1.6466439962387085, + "learning_rate": 4.949919108228453e-05, + "loss": 5.3954, + "step": 10732 + }, + { + "epoch": 0.06383219145494338, + "grad_norm": 1.5188345909118652, + "learning_rate": 4.949909805217361e-05, + "loss": 5.2876, + "step": 10733 + }, + { + "epoch": 0.06383813873822437, + "grad_norm": 1.836227297782898, + "learning_rate": 4.94990050135103e-05, + "loss": 5.4966, + "step": 10734 + }, + { + "epoch": 0.06384408602150538, + "grad_norm": 1.5542840957641602, + "learning_rate": 4.9498911966294635e-05, + "loss": 5.2188, + "step": 10735 + }, + { + "epoch": 0.06385003330478638, + "grad_norm": 1.3053034543991089, + "learning_rate": 4.9498818910526656e-05, + "loss": 5.3834, + "step": 10736 + }, + { + "epoch": 0.06385598058806737, + "grad_norm": 1.4250247478485107, + "learning_rate": 4.9498725846206395e-05, + "loss": 5.1852, + "step": 10737 + }, + { + "epoch": 0.06386192787134837, + "grad_norm": 1.5885393619537354, + "learning_rate": 4.9498632773333886e-05, + "loss": 5.2518, + "step": 10738 + }, + { + "epoch": 0.06386787515462937, + "grad_norm": 1.5664896965026855, + "learning_rate": 4.949853969190915e-05, + "loss": 5.1186, + "step": 10739 + }, + { + "epoch": 0.06387382243791036, + "grad_norm": 1.5156123638153076, + "learning_rate": 4.949844660193223e-05, + "loss": 5.1111, + "step": 10740 + }, + { + "epoch": 0.06387976972119136, + "grad_norm": 1.5308325290679932, + "learning_rate": 4.949835350340316e-05, + "loss": 5.1577, + "step": 10741 + }, + { + "epoch": 0.06388571700447236, + "grad_norm": 1.3338321447372437, + "learning_rate": 4.949826039632196e-05, + "loss": 5.2386, + "step": 10742 + }, + { + "epoch": 0.06389166428775335, + "grad_norm": 1.5307821035385132, + "learning_rate": 4.9498167280688676e-05, + "loss": 5.1173, + "step": 10743 + }, + { + "epoch": 0.06389761157103435, + "grad_norm": 1.607913613319397, + "learning_rate": 4.9498074156503325e-05, + "loss": 5.3077, + "step": 10744 + }, + { + "epoch": 0.06390355885431535, + "grad_norm": 1.6242469549179077, + "learning_rate": 4.949798102376596e-05, + "loss": 5.3319, + "step": 10745 + }, + { + "epoch": 0.06390950613759634, + "grad_norm": 1.62213134765625, + "learning_rate": 4.9497887882476604e-05, + "loss": 5.3494, + "step": 10746 + }, + { + "epoch": 0.06391545342087734, + "grad_norm": 1.4064897298812866, + "learning_rate": 4.949779473263528e-05, + "loss": 5.207, + "step": 10747 + }, + { + "epoch": 0.06392140070415835, + "grad_norm": 1.7431879043579102, + "learning_rate": 4.949770157424203e-05, + "loss": 5.4068, + "step": 10748 + }, + { + "epoch": 0.06392734798743933, + "grad_norm": 1.5815304517745972, + "learning_rate": 4.949760840729689e-05, + "loss": 5.3917, + "step": 10749 + }, + { + "epoch": 0.06393329527072034, + "grad_norm": 1.576541543006897, + "learning_rate": 4.949751523179988e-05, + "loss": 5.4123, + "step": 10750 + }, + { + "epoch": 0.06393924255400134, + "grad_norm": 1.6717814207077026, + "learning_rate": 4.9497422047751054e-05, + "loss": 5.3028, + "step": 10751 + }, + { + "epoch": 0.06394518983728233, + "grad_norm": 1.4091792106628418, + "learning_rate": 4.9497328855150424e-05, + "loss": 5.2231, + "step": 10752 + }, + { + "epoch": 0.06395113712056333, + "grad_norm": 1.4366726875305176, + "learning_rate": 4.949723565399803e-05, + "loss": 5.2908, + "step": 10753 + }, + { + "epoch": 0.06395708440384433, + "grad_norm": 1.6679248809814453, + "learning_rate": 4.9497142444293906e-05, + "loss": 5.1079, + "step": 10754 + }, + { + "epoch": 0.06396303168712532, + "grad_norm": 1.6619216203689575, + "learning_rate": 4.949704922603808e-05, + "loss": 5.1504, + "step": 10755 + }, + { + "epoch": 0.06396897897040632, + "grad_norm": 1.7149940729141235, + "learning_rate": 4.9496955999230586e-05, + "loss": 5.3031, + "step": 10756 + }, + { + "epoch": 0.06397492625368732, + "grad_norm": 1.711256504058838, + "learning_rate": 4.9496862763871456e-05, + "loss": 5.2146, + "step": 10757 + }, + { + "epoch": 0.06398087353696831, + "grad_norm": 1.654680609703064, + "learning_rate": 4.949676951996073e-05, + "loss": 5.2774, + "step": 10758 + }, + { + "epoch": 0.06398682082024931, + "grad_norm": 1.5115636587142944, + "learning_rate": 4.949667626749843e-05, + "loss": 5.2155, + "step": 10759 + }, + { + "epoch": 0.0639927681035303, + "grad_norm": 1.7153947353363037, + "learning_rate": 4.9496583006484596e-05, + "loss": 5.2711, + "step": 10760 + }, + { + "epoch": 0.0639987153868113, + "grad_norm": 1.8497945070266724, + "learning_rate": 4.949648973691926e-05, + "loss": 5.2864, + "step": 10761 + }, + { + "epoch": 0.0640046626700923, + "grad_norm": 1.5251562595367432, + "learning_rate": 4.9496396458802455e-05, + "loss": 5.2532, + "step": 10762 + }, + { + "epoch": 0.0640106099533733, + "grad_norm": 1.5916621685028076, + "learning_rate": 4.94963031721342e-05, + "loss": 5.2136, + "step": 10763 + }, + { + "epoch": 0.0640165572366543, + "grad_norm": 1.5781627893447876, + "learning_rate": 4.949620987691455e-05, + "loss": 5.3188, + "step": 10764 + }, + { + "epoch": 0.0640225045199353, + "grad_norm": 1.7783690690994263, + "learning_rate": 4.9496116573143515e-05, + "loss": 5.4196, + "step": 10765 + }, + { + "epoch": 0.06402845180321629, + "grad_norm": 1.5746928453445435, + "learning_rate": 4.949602326082115e-05, + "loss": 5.3724, + "step": 10766 + }, + { + "epoch": 0.06403439908649729, + "grad_norm": 1.677771806716919, + "learning_rate": 4.9495929939947475e-05, + "loss": 5.2894, + "step": 10767 + }, + { + "epoch": 0.06404034636977829, + "grad_norm": 1.7747725248336792, + "learning_rate": 4.949583661052252e-05, + "loss": 5.0527, + "step": 10768 + }, + { + "epoch": 0.06404629365305928, + "grad_norm": 1.6927893161773682, + "learning_rate": 4.9495743272546314e-05, + "loss": 5.0999, + "step": 10769 + }, + { + "epoch": 0.06405224093634028, + "grad_norm": 1.6289039850234985, + "learning_rate": 4.949564992601891e-05, + "loss": 5.4197, + "step": 10770 + }, + { + "epoch": 0.06405818821962128, + "grad_norm": 1.742658019065857, + "learning_rate": 4.9495556570940316e-05, + "loss": 5.2927, + "step": 10771 + }, + { + "epoch": 0.06406413550290227, + "grad_norm": 1.6643215417861938, + "learning_rate": 4.949546320731059e-05, + "loss": 5.3262, + "step": 10772 + }, + { + "epoch": 0.06407008278618327, + "grad_norm": 1.6400927305221558, + "learning_rate": 4.949536983512974e-05, + "loss": 5.1072, + "step": 10773 + }, + { + "epoch": 0.06407603006946427, + "grad_norm": 1.7093544006347656, + "learning_rate": 4.949527645439781e-05, + "loss": 5.1849, + "step": 10774 + }, + { + "epoch": 0.06408197735274526, + "grad_norm": 1.6980849504470825, + "learning_rate": 4.949518306511484e-05, + "loss": 5.3661, + "step": 10775 + }, + { + "epoch": 0.06408792463602626, + "grad_norm": 1.7241551876068115, + "learning_rate": 4.949508966728085e-05, + "loss": 5.3315, + "step": 10776 + }, + { + "epoch": 0.06409387191930727, + "grad_norm": 1.8421318531036377, + "learning_rate": 4.9494996260895874e-05, + "loss": 5.3506, + "step": 10777 + }, + { + "epoch": 0.06409981920258825, + "grad_norm": 1.835738182067871, + "learning_rate": 4.949490284595995e-05, + "loss": 5.2087, + "step": 10778 + }, + { + "epoch": 0.06410576648586926, + "grad_norm": 1.6622625589370728, + "learning_rate": 4.949480942247311e-05, + "loss": 5.0072, + "step": 10779 + }, + { + "epoch": 0.06411171376915026, + "grad_norm": 1.5437613725662231, + "learning_rate": 4.949471599043539e-05, + "loss": 5.182, + "step": 10780 + }, + { + "epoch": 0.06411766105243125, + "grad_norm": 1.620758295059204, + "learning_rate": 4.949462254984681e-05, + "loss": 5.2771, + "step": 10781 + }, + { + "epoch": 0.06412360833571225, + "grad_norm": 1.6143954992294312, + "learning_rate": 4.949452910070741e-05, + "loss": 5.1175, + "step": 10782 + }, + { + "epoch": 0.06412955561899325, + "grad_norm": 1.8173086643218994, + "learning_rate": 4.949443564301722e-05, + "loss": 5.175, + "step": 10783 + }, + { + "epoch": 0.06413550290227424, + "grad_norm": 1.75434148311615, + "learning_rate": 4.9494342176776284e-05, + "loss": 5.1133, + "step": 10784 + }, + { + "epoch": 0.06414145018555524, + "grad_norm": 1.7278660535812378, + "learning_rate": 4.949424870198462e-05, + "loss": 5.0704, + "step": 10785 + }, + { + "epoch": 0.06414739746883624, + "grad_norm": 1.793285608291626, + "learning_rate": 4.949415521864228e-05, + "loss": 5.1567, + "step": 10786 + }, + { + "epoch": 0.06415334475211723, + "grad_norm": 1.7892498970031738, + "learning_rate": 4.949406172674927e-05, + "loss": 5.201, + "step": 10787 + }, + { + "epoch": 0.06415929203539823, + "grad_norm": 2.276643991470337, + "learning_rate": 4.9493968226305645e-05, + "loss": 5.5555, + "step": 10788 + }, + { + "epoch": 0.06416523931867922, + "grad_norm": 1.5785993337631226, + "learning_rate": 4.9493874717311416e-05, + "loss": 5.2692, + "step": 10789 + }, + { + "epoch": 0.06417118660196022, + "grad_norm": 1.3982635736465454, + "learning_rate": 4.949378119976664e-05, + "loss": 5.24, + "step": 10790 + }, + { + "epoch": 0.06417713388524122, + "grad_norm": 1.4310967922210693, + "learning_rate": 4.949368767367133e-05, + "loss": 5.2032, + "step": 10791 + }, + { + "epoch": 0.06418308116852221, + "grad_norm": 1.5635451078414917, + "learning_rate": 4.949359413902554e-05, + "loss": 5.2589, + "step": 10792 + }, + { + "epoch": 0.06418902845180322, + "grad_norm": 1.5000566244125366, + "learning_rate": 4.949350059582927e-05, + "loss": 5.147, + "step": 10793 + }, + { + "epoch": 0.06419497573508422, + "grad_norm": 1.7782738208770752, + "learning_rate": 4.9493407044082585e-05, + "loss": 5.1987, + "step": 10794 + }, + { + "epoch": 0.0642009230183652, + "grad_norm": 1.5931564569473267, + "learning_rate": 4.94933134837855e-05, + "loss": 5.2591, + "step": 10795 + }, + { + "epoch": 0.06420687030164621, + "grad_norm": 1.619287371635437, + "learning_rate": 4.9493219914938055e-05, + "loss": 5.1041, + "step": 10796 + }, + { + "epoch": 0.06421281758492721, + "grad_norm": 1.5174281597137451, + "learning_rate": 4.949312633754028e-05, + "loss": 5.1798, + "step": 10797 + }, + { + "epoch": 0.0642187648682082, + "grad_norm": 1.6485828161239624, + "learning_rate": 4.9493032751592205e-05, + "loss": 5.1086, + "step": 10798 + }, + { + "epoch": 0.0642247121514892, + "grad_norm": 1.830984354019165, + "learning_rate": 4.949293915709386e-05, + "loss": 5.2241, + "step": 10799 + }, + { + "epoch": 0.0642306594347702, + "grad_norm": 1.9102944135665894, + "learning_rate": 4.94928455540453e-05, + "loss": 4.9652, + "step": 10800 + }, + { + "epoch": 0.06423660671805119, + "grad_norm": 1.6826778650283813, + "learning_rate": 4.949275194244653e-05, + "loss": 5.0479, + "step": 10801 + }, + { + "epoch": 0.06424255400133219, + "grad_norm": 1.7545628547668457, + "learning_rate": 4.9492658322297595e-05, + "loss": 4.9263, + "step": 10802 + }, + { + "epoch": 0.0642485012846132, + "grad_norm": 1.621121883392334, + "learning_rate": 4.949256469359852e-05, + "loss": 4.9095, + "step": 10803 + }, + { + "epoch": 0.06425444856789418, + "grad_norm": 1.727095603942871, + "learning_rate": 4.9492471056349356e-05, + "loss": 5.1913, + "step": 10804 + }, + { + "epoch": 0.06426039585117518, + "grad_norm": 1.749241590499878, + "learning_rate": 4.949237741055011e-05, + "loss": 5.4284, + "step": 10805 + }, + { + "epoch": 0.06426634313445619, + "grad_norm": 1.627784252166748, + "learning_rate": 4.9492283756200834e-05, + "loss": 5.547, + "step": 10806 + }, + { + "epoch": 0.06427229041773717, + "grad_norm": 1.8133957386016846, + "learning_rate": 4.949219009330155e-05, + "loss": 5.5841, + "step": 10807 + }, + { + "epoch": 0.06427823770101818, + "grad_norm": 1.6667630672454834, + "learning_rate": 4.949209642185231e-05, + "loss": 5.4091, + "step": 10808 + }, + { + "epoch": 0.06428418498429918, + "grad_norm": 1.601288914680481, + "learning_rate": 4.949200274185312e-05, + "loss": 4.9647, + "step": 10809 + }, + { + "epoch": 0.06429013226758017, + "grad_norm": 1.4544743299484253, + "learning_rate": 4.9491909053304025e-05, + "loss": 5.477, + "step": 10810 + }, + { + "epoch": 0.06429607955086117, + "grad_norm": 1.65786874294281, + "learning_rate": 4.949181535620506e-05, + "loss": 5.2401, + "step": 10811 + }, + { + "epoch": 0.06430202683414217, + "grad_norm": 1.561251163482666, + "learning_rate": 4.949172165055625e-05, + "loss": 5.7689, + "step": 10812 + }, + { + "epoch": 0.06430797411742316, + "grad_norm": 1.465378999710083, + "learning_rate": 4.949162793635764e-05, + "loss": 5.4109, + "step": 10813 + }, + { + "epoch": 0.06431392140070416, + "grad_norm": 1.3914259672164917, + "learning_rate": 4.949153421360926e-05, + "loss": 5.5144, + "step": 10814 + }, + { + "epoch": 0.06431986868398516, + "grad_norm": 1.6016005277633667, + "learning_rate": 4.949144048231113e-05, + "loss": 5.2708, + "step": 10815 + }, + { + "epoch": 0.06432581596726615, + "grad_norm": 1.4063479900360107, + "learning_rate": 4.94913467424633e-05, + "loss": 5.0303, + "step": 10816 + }, + { + "epoch": 0.06433176325054715, + "grad_norm": 1.5708017349243164, + "learning_rate": 4.9491252994065785e-05, + "loss": 5.3104, + "step": 10817 + }, + { + "epoch": 0.06433771053382814, + "grad_norm": 1.5542651414871216, + "learning_rate": 4.9491159237118626e-05, + "loss": 5.1308, + "step": 10818 + }, + { + "epoch": 0.06434365781710914, + "grad_norm": 1.3946558237075806, + "learning_rate": 4.9491065471621855e-05, + "loss": 5.243, + "step": 10819 + }, + { + "epoch": 0.06434960510039014, + "grad_norm": 1.3560529947280884, + "learning_rate": 4.9490971697575513e-05, + "loss": 4.9319, + "step": 10820 + }, + { + "epoch": 0.06435555238367113, + "grad_norm": 1.6921281814575195, + "learning_rate": 4.949087791497963e-05, + "loss": 5.2203, + "step": 10821 + }, + { + "epoch": 0.06436149966695213, + "grad_norm": 1.5226655006408691, + "learning_rate": 4.9490784123834225e-05, + "loss": 5.1879, + "step": 10822 + }, + { + "epoch": 0.06436744695023314, + "grad_norm": 1.5012669563293457, + "learning_rate": 4.9490690324139346e-05, + "loss": 5.2373, + "step": 10823 + }, + { + "epoch": 0.06437339423351413, + "grad_norm": 1.8050286769866943, + "learning_rate": 4.949059651589502e-05, + "loss": 5.0441, + "step": 10824 + }, + { + "epoch": 0.06437934151679513, + "grad_norm": 1.6800918579101562, + "learning_rate": 4.9490502699101274e-05, + "loss": 5.0871, + "step": 10825 + }, + { + "epoch": 0.06438528880007613, + "grad_norm": 1.4211550951004028, + "learning_rate": 4.949040887375814e-05, + "loss": 5.118, + "step": 10826 + }, + { + "epoch": 0.06439123608335712, + "grad_norm": 1.7064868211746216, + "learning_rate": 4.949031503986568e-05, + "loss": 5.2285, + "step": 10827 + }, + { + "epoch": 0.06439718336663812, + "grad_norm": 1.862491250038147, + "learning_rate": 4.949022119742388e-05, + "loss": 5.0958, + "step": 10828 + }, + { + "epoch": 0.06440313064991912, + "grad_norm": 1.933610200881958, + "learning_rate": 4.949012734643281e-05, + "loss": 5.1282, + "step": 10829 + }, + { + "epoch": 0.06440907793320011, + "grad_norm": 1.6140058040618896, + "learning_rate": 4.949003348689249e-05, + "loss": 4.9913, + "step": 10830 + }, + { + "epoch": 0.06441502521648111, + "grad_norm": 1.6881496906280518, + "learning_rate": 4.948993961880295e-05, + "loss": 5.1017, + "step": 10831 + }, + { + "epoch": 0.06442097249976211, + "grad_norm": 1.7887358665466309, + "learning_rate": 4.948984574216422e-05, + "loss": 5.1503, + "step": 10832 + }, + { + "epoch": 0.0644269197830431, + "grad_norm": 1.635720133781433, + "learning_rate": 4.948975185697634e-05, + "loss": 5.3381, + "step": 10833 + }, + { + "epoch": 0.0644328670663241, + "grad_norm": 1.6106109619140625, + "learning_rate": 4.9489657963239346e-05, + "loss": 5.0498, + "step": 10834 + }, + { + "epoch": 0.0644388143496051, + "grad_norm": 1.740438461303711, + "learning_rate": 4.9489564060953266e-05, + "loss": 5.0302, + "step": 10835 + }, + { + "epoch": 0.0644447616328861, + "grad_norm": 1.663994312286377, + "learning_rate": 4.9489470150118124e-05, + "loss": 5.1976, + "step": 10836 + }, + { + "epoch": 0.0644507089161671, + "grad_norm": 1.6748932600021362, + "learning_rate": 4.9489376230733965e-05, + "loss": 5.0055, + "step": 10837 + }, + { + "epoch": 0.0644566561994481, + "grad_norm": 1.7139437198638916, + "learning_rate": 4.948928230280082e-05, + "loss": 4.9617, + "step": 10838 + }, + { + "epoch": 0.06446260348272909, + "grad_norm": 1.698791742324829, + "learning_rate": 4.948918836631872e-05, + "loss": 4.9725, + "step": 10839 + }, + { + "epoch": 0.06446855076601009, + "grad_norm": 1.6961768865585327, + "learning_rate": 4.94890944212877e-05, + "loss": 4.9126, + "step": 10840 + }, + { + "epoch": 0.06447449804929109, + "grad_norm": 1.6551483869552612, + "learning_rate": 4.948900046770778e-05, + "loss": 5.0775, + "step": 10841 + }, + { + "epoch": 0.06448044533257208, + "grad_norm": 1.5863447189331055, + "learning_rate": 4.948890650557901e-05, + "loss": 5.0467, + "step": 10842 + }, + { + "epoch": 0.06448639261585308, + "grad_norm": 1.5629637241363525, + "learning_rate": 4.9488812534901414e-05, + "loss": 5.0012, + "step": 10843 + }, + { + "epoch": 0.06449233989913408, + "grad_norm": 1.5247453451156616, + "learning_rate": 4.948871855567503e-05, + "loss": 4.9928, + "step": 10844 + }, + { + "epoch": 0.06449828718241507, + "grad_norm": 1.7595921754837036, + "learning_rate": 4.948862456789988e-05, + "loss": 4.9256, + "step": 10845 + }, + { + "epoch": 0.06450423446569607, + "grad_norm": 1.6370458602905273, + "learning_rate": 4.948853057157601e-05, + "loss": 4.9499, + "step": 10846 + }, + { + "epoch": 0.06451018174897706, + "grad_norm": 1.7747406959533691, + "learning_rate": 4.948843656670345e-05, + "loss": 4.9246, + "step": 10847 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 1.6769739389419556, + "learning_rate": 4.948834255328222e-05, + "loss": 4.9561, + "step": 10848 + }, + { + "epoch": 0.06452207631553906, + "grad_norm": 1.60416841506958, + "learning_rate": 4.948824853131236e-05, + "loss": 5.0318, + "step": 10849 + }, + { + "epoch": 0.06452802359882005, + "grad_norm": 2.1050093173980713, + "learning_rate": 4.948815450079392e-05, + "loss": 5.5308, + "step": 10850 + }, + { + "epoch": 0.06453397088210105, + "grad_norm": 1.7474935054779053, + "learning_rate": 4.948806046172691e-05, + "loss": 5.0752, + "step": 10851 + }, + { + "epoch": 0.06453991816538206, + "grad_norm": 1.8992688655853271, + "learning_rate": 4.948796641411138e-05, + "loss": 5.3704, + "step": 10852 + }, + { + "epoch": 0.06454586544866305, + "grad_norm": 1.9632636308670044, + "learning_rate": 4.948787235794734e-05, + "loss": 5.4173, + "step": 10853 + }, + { + "epoch": 0.06455181273194405, + "grad_norm": 1.9034284353256226, + "learning_rate": 4.948777829323484e-05, + "loss": 5.2655, + "step": 10854 + }, + { + "epoch": 0.06455776001522505, + "grad_norm": 1.716711163520813, + "learning_rate": 4.9487684219973914e-05, + "loss": 5.4192, + "step": 10855 + }, + { + "epoch": 0.06456370729850604, + "grad_norm": 1.7886557579040527, + "learning_rate": 4.948759013816459e-05, + "loss": 5.2828, + "step": 10856 + }, + { + "epoch": 0.06456965458178704, + "grad_norm": 2.004117250442505, + "learning_rate": 4.9487496047806905e-05, + "loss": 4.9521, + "step": 10857 + }, + { + "epoch": 0.06457560186506804, + "grad_norm": 1.627955436706543, + "learning_rate": 4.948740194890088e-05, + "loss": 5.4288, + "step": 10858 + }, + { + "epoch": 0.06458154914834903, + "grad_norm": 2.2537145614624023, + "learning_rate": 4.948730784144656e-05, + "loss": 5.8176, + "step": 10859 + }, + { + "epoch": 0.06458749643163003, + "grad_norm": 2.216066837310791, + "learning_rate": 4.948721372544397e-05, + "loss": 5.4569, + "step": 10860 + }, + { + "epoch": 0.06459344371491103, + "grad_norm": 1.7641898393630981, + "learning_rate": 4.948711960089315e-05, + "loss": 5.659, + "step": 10861 + }, + { + "epoch": 0.06459939099819202, + "grad_norm": 1.9137814044952393, + "learning_rate": 4.948702546779413e-05, + "loss": 5.6275, + "step": 10862 + }, + { + "epoch": 0.06460533828147302, + "grad_norm": 2.2355434894561768, + "learning_rate": 4.948693132614694e-05, + "loss": 5.1712, + "step": 10863 + }, + { + "epoch": 0.06461128556475403, + "grad_norm": 1.780849814414978, + "learning_rate": 4.9486837175951616e-05, + "loss": 5.4521, + "step": 10864 + }, + { + "epoch": 0.06461723284803501, + "grad_norm": 1.8078423738479614, + "learning_rate": 4.948674301720819e-05, + "loss": 5.3609, + "step": 10865 + }, + { + "epoch": 0.06462318013131602, + "grad_norm": 1.590707540512085, + "learning_rate": 4.94866488499167e-05, + "loss": 5.4121, + "step": 10866 + }, + { + "epoch": 0.06462912741459702, + "grad_norm": 1.4369510412216187, + "learning_rate": 4.948655467407717e-05, + "loss": 5.418, + "step": 10867 + }, + { + "epoch": 0.064635074697878, + "grad_norm": 1.5800751447677612, + "learning_rate": 4.9486460489689634e-05, + "loss": 5.3492, + "step": 10868 + }, + { + "epoch": 0.06464102198115901, + "grad_norm": 1.5271484851837158, + "learning_rate": 4.948636629675413e-05, + "loss": 5.2758, + "step": 10869 + }, + { + "epoch": 0.06464696926444001, + "grad_norm": 1.7175722122192383, + "learning_rate": 4.948627209527069e-05, + "loss": 5.2939, + "step": 10870 + }, + { + "epoch": 0.064652916547721, + "grad_norm": 1.568851113319397, + "learning_rate": 4.948617788523935e-05, + "loss": 5.2559, + "step": 10871 + }, + { + "epoch": 0.064658863831002, + "grad_norm": 1.4012210369110107, + "learning_rate": 4.9486083666660135e-05, + "loss": 5.3195, + "step": 10872 + }, + { + "epoch": 0.064664811114283, + "grad_norm": 1.5386475324630737, + "learning_rate": 4.948598943953308e-05, + "loss": 5.293, + "step": 10873 + }, + { + "epoch": 0.06467075839756399, + "grad_norm": 1.4143292903900146, + "learning_rate": 4.948589520385821e-05, + "loss": 5.2181, + "step": 10874 + }, + { + "epoch": 0.06467670568084499, + "grad_norm": 1.392470121383667, + "learning_rate": 4.9485800959635576e-05, + "loss": 5.3074, + "step": 10875 + }, + { + "epoch": 0.06468265296412598, + "grad_norm": 1.7176567316055298, + "learning_rate": 4.94857067068652e-05, + "loss": 5.3024, + "step": 10876 + }, + { + "epoch": 0.06468860024740698, + "grad_norm": 1.5002285242080688, + "learning_rate": 4.9485612445547115e-05, + "loss": 5.1543, + "step": 10877 + }, + { + "epoch": 0.06469454753068798, + "grad_norm": 1.5615242719650269, + "learning_rate": 4.9485518175681364e-05, + "loss": 5.371, + "step": 10878 + }, + { + "epoch": 0.06470049481396897, + "grad_norm": 1.4294706583023071, + "learning_rate": 4.9485423897267966e-05, + "loss": 5.4151, + "step": 10879 + }, + { + "epoch": 0.06470644209724997, + "grad_norm": 2.0147571563720703, + "learning_rate": 4.948532961030695e-05, + "loss": 5.3082, + "step": 10880 + }, + { + "epoch": 0.06471238938053098, + "grad_norm": 1.5661358833312988, + "learning_rate": 4.948523531479837e-05, + "loss": 5.8232, + "step": 10881 + }, + { + "epoch": 0.06471833666381197, + "grad_norm": 1.5608779191970825, + "learning_rate": 4.9485141010742245e-05, + "loss": 5.5648, + "step": 10882 + }, + { + "epoch": 0.06472428394709297, + "grad_norm": 2.3148789405822754, + "learning_rate": 4.948504669813861e-05, + "loss": 4.8802, + "step": 10883 + }, + { + "epoch": 0.06473023123037397, + "grad_norm": 1.9495759010314941, + "learning_rate": 4.9484952376987504e-05, + "loss": 5.1985, + "step": 10884 + }, + { + "epoch": 0.06473617851365496, + "grad_norm": 2.031764268875122, + "learning_rate": 4.9484858047288944e-05, + "loss": 5.0772, + "step": 10885 + }, + { + "epoch": 0.06474212579693596, + "grad_norm": 1.6575301885604858, + "learning_rate": 4.948476370904298e-05, + "loss": 5.2157, + "step": 10886 + }, + { + "epoch": 0.06474807308021696, + "grad_norm": 1.6381278038024902, + "learning_rate": 4.948466936224964e-05, + "loss": 5.1168, + "step": 10887 + }, + { + "epoch": 0.06475402036349795, + "grad_norm": 1.672555923461914, + "learning_rate": 4.9484575006908945e-05, + "loss": 5.2839, + "step": 10888 + }, + { + "epoch": 0.06475996764677895, + "grad_norm": 1.8838026523590088, + "learning_rate": 4.9484480643020944e-05, + "loss": 5.301, + "step": 10889 + }, + { + "epoch": 0.06476591493005995, + "grad_norm": 1.935205101966858, + "learning_rate": 4.9484386270585656e-05, + "loss": 5.2898, + "step": 10890 + }, + { + "epoch": 0.06477186221334094, + "grad_norm": 1.630003809928894, + "learning_rate": 4.9484291889603134e-05, + "loss": 5.181, + "step": 10891 + }, + { + "epoch": 0.06477780949662194, + "grad_norm": 1.5095784664154053, + "learning_rate": 4.948419750007339e-05, + "loss": 5.3159, + "step": 10892 + }, + { + "epoch": 0.06478375677990295, + "grad_norm": 1.7217234373092651, + "learning_rate": 4.948410310199647e-05, + "loss": 5.3395, + "step": 10893 + }, + { + "epoch": 0.06478970406318393, + "grad_norm": 1.727953314781189, + "learning_rate": 4.94840086953724e-05, + "loss": 5.1374, + "step": 10894 + }, + { + "epoch": 0.06479565134646494, + "grad_norm": 1.7891777753829956, + "learning_rate": 4.9483914280201224e-05, + "loss": 5.2145, + "step": 10895 + }, + { + "epoch": 0.06480159862974594, + "grad_norm": 1.7402048110961914, + "learning_rate": 4.9483819856482956e-05, + "loss": 5.1723, + "step": 10896 + }, + { + "epoch": 0.06480754591302693, + "grad_norm": 1.6635658740997314, + "learning_rate": 4.9483725424217644e-05, + "loss": 5.0995, + "step": 10897 + }, + { + "epoch": 0.06481349319630793, + "grad_norm": 1.6190650463104248, + "learning_rate": 4.9483630983405317e-05, + "loss": 5.2062, + "step": 10898 + }, + { + "epoch": 0.06481944047958893, + "grad_norm": 1.6335800886154175, + "learning_rate": 4.9483536534046006e-05, + "loss": 5.4298, + "step": 10899 + }, + { + "epoch": 0.06482538776286992, + "grad_norm": 1.7549209594726562, + "learning_rate": 4.948344207613974e-05, + "loss": 5.1833, + "step": 10900 + }, + { + "epoch": 0.06483133504615092, + "grad_norm": 1.6011431217193604, + "learning_rate": 4.948334760968656e-05, + "loss": 5.2329, + "step": 10901 + }, + { + "epoch": 0.06483728232943192, + "grad_norm": 1.627424955368042, + "learning_rate": 4.9483253134686505e-05, + "loss": 5.3059, + "step": 10902 + }, + { + "epoch": 0.06484322961271291, + "grad_norm": 1.593361258506775, + "learning_rate": 4.948315865113959e-05, + "loss": 5.2711, + "step": 10903 + }, + { + "epoch": 0.06484917689599391, + "grad_norm": 1.5899426937103271, + "learning_rate": 4.9483064159045854e-05, + "loss": 5.2449, + "step": 10904 + }, + { + "epoch": 0.0648551241792749, + "grad_norm": 1.6572548151016235, + "learning_rate": 4.948296965840534e-05, + "loss": 5.18, + "step": 10905 + }, + { + "epoch": 0.0648610714625559, + "grad_norm": 1.649928092956543, + "learning_rate": 4.948287514921808e-05, + "loss": 5.2434, + "step": 10906 + }, + { + "epoch": 0.0648670187458369, + "grad_norm": 1.4546284675598145, + "learning_rate": 4.9482780631484094e-05, + "loss": 5.405, + "step": 10907 + }, + { + "epoch": 0.06487296602911789, + "grad_norm": 1.624617338180542, + "learning_rate": 4.9482686105203425e-05, + "loss": 5.3537, + "step": 10908 + }, + { + "epoch": 0.0648789133123989, + "grad_norm": 1.5108991861343384, + "learning_rate": 4.94825915703761e-05, + "loss": 5.1709, + "step": 10909 + }, + { + "epoch": 0.0648848605956799, + "grad_norm": 1.571028470993042, + "learning_rate": 4.948249702700215e-05, + "loss": 5.1374, + "step": 10910 + }, + { + "epoch": 0.06489080787896088, + "grad_norm": 1.3280094861984253, + "learning_rate": 4.948240247508162e-05, + "loss": 5.3469, + "step": 10911 + }, + { + "epoch": 0.06489675516224189, + "grad_norm": 1.8487119674682617, + "learning_rate": 4.948230791461454e-05, + "loss": 5.4673, + "step": 10912 + }, + { + "epoch": 0.06490270244552289, + "grad_norm": 1.6253544092178345, + "learning_rate": 4.9482213345600936e-05, + "loss": 5.2096, + "step": 10913 + }, + { + "epoch": 0.06490864972880388, + "grad_norm": 1.8487451076507568, + "learning_rate": 4.9482118768040844e-05, + "loss": 5.1452, + "step": 10914 + }, + { + "epoch": 0.06491459701208488, + "grad_norm": 1.6638668775558472, + "learning_rate": 4.948202418193429e-05, + "loss": 5.2382, + "step": 10915 + }, + { + "epoch": 0.06492054429536588, + "grad_norm": 1.662256121635437, + "learning_rate": 4.9481929587281326e-05, + "loss": 5.3125, + "step": 10916 + }, + { + "epoch": 0.06492649157864687, + "grad_norm": 1.5133339166641235, + "learning_rate": 4.948183498408197e-05, + "loss": 5.2494, + "step": 10917 + }, + { + "epoch": 0.06493243886192787, + "grad_norm": 1.5063300132751465, + "learning_rate": 4.9481740372336256e-05, + "loss": 5.1778, + "step": 10918 + }, + { + "epoch": 0.06493838614520887, + "grad_norm": 1.5223631858825684, + "learning_rate": 4.948164575204421e-05, + "loss": 5.1773, + "step": 10919 + }, + { + "epoch": 0.06494433342848986, + "grad_norm": 1.6163926124572754, + "learning_rate": 4.948155112320589e-05, + "loss": 5.2669, + "step": 10920 + }, + { + "epoch": 0.06495028071177086, + "grad_norm": 1.4077887535095215, + "learning_rate": 4.948145648582131e-05, + "loss": 5.1711, + "step": 10921 + }, + { + "epoch": 0.06495622799505187, + "grad_norm": 1.5710374116897583, + "learning_rate": 4.9481361839890505e-05, + "loss": 5.1687, + "step": 10922 + }, + { + "epoch": 0.06496217527833285, + "grad_norm": 1.5444159507751465, + "learning_rate": 4.9481267185413506e-05, + "loss": 5.2681, + "step": 10923 + }, + { + "epoch": 0.06496812256161386, + "grad_norm": 1.4816917181015015, + "learning_rate": 4.948117252239035e-05, + "loss": 5.2897, + "step": 10924 + }, + { + "epoch": 0.06497406984489486, + "grad_norm": 1.3373851776123047, + "learning_rate": 4.9481077850821075e-05, + "loss": 5.1607, + "step": 10925 + }, + { + "epoch": 0.06498001712817585, + "grad_norm": 1.7353702783584595, + "learning_rate": 4.948098317070571e-05, + "loss": 5.2546, + "step": 10926 + }, + { + "epoch": 0.06498596441145685, + "grad_norm": 1.4494054317474365, + "learning_rate": 4.948088848204428e-05, + "loss": 5.2244, + "step": 10927 + }, + { + "epoch": 0.06499191169473785, + "grad_norm": 1.6031813621520996, + "learning_rate": 4.9480793784836825e-05, + "loss": 5.2487, + "step": 10928 + }, + { + "epoch": 0.06499785897801884, + "grad_norm": 1.4134970903396606, + "learning_rate": 4.948069907908338e-05, + "loss": 5.2224, + "step": 10929 + }, + { + "epoch": 0.06500380626129984, + "grad_norm": 1.5790150165557861, + "learning_rate": 4.948060436478398e-05, + "loss": 5.3096, + "step": 10930 + }, + { + "epoch": 0.06500975354458084, + "grad_norm": 1.3925936222076416, + "learning_rate": 4.9480509641938644e-05, + "loss": 5.1823, + "step": 10931 + }, + { + "epoch": 0.06501570082786183, + "grad_norm": 1.40078866481781, + "learning_rate": 4.948041491054742e-05, + "loss": 5.1352, + "step": 10932 + }, + { + "epoch": 0.06502164811114283, + "grad_norm": 1.509726881980896, + "learning_rate": 4.948032017061034e-05, + "loss": 5.199, + "step": 10933 + }, + { + "epoch": 0.06502759539442382, + "grad_norm": 1.5671876668930054, + "learning_rate": 4.948022542212743e-05, + "loss": 5.2323, + "step": 10934 + }, + { + "epoch": 0.06503354267770482, + "grad_norm": 1.5019149780273438, + "learning_rate": 4.948013066509872e-05, + "loss": 5.244, + "step": 10935 + }, + { + "epoch": 0.06503948996098582, + "grad_norm": 1.576842188835144, + "learning_rate": 4.948003589952426e-05, + "loss": 5.153, + "step": 10936 + }, + { + "epoch": 0.06504543724426681, + "grad_norm": 1.4069315195083618, + "learning_rate": 4.9479941125404074e-05, + "loss": 5.3396, + "step": 10937 + }, + { + "epoch": 0.06505138452754781, + "grad_norm": 1.6663076877593994, + "learning_rate": 4.947984634273818e-05, + "loss": 5.223, + "step": 10938 + }, + { + "epoch": 0.06505733181082882, + "grad_norm": 1.5132073163986206, + "learning_rate": 4.947975155152663e-05, + "loss": 5.1335, + "step": 10939 + }, + { + "epoch": 0.0650632790941098, + "grad_norm": 1.59386146068573, + "learning_rate": 4.9479656751769455e-05, + "loss": 5.4893, + "step": 10940 + }, + { + "epoch": 0.06506922637739081, + "grad_norm": 1.3486778736114502, + "learning_rate": 4.9479561943466686e-05, + "loss": 5.2164, + "step": 10941 + }, + { + "epoch": 0.06507517366067181, + "grad_norm": 1.4107574224472046, + "learning_rate": 4.947946712661835e-05, + "loss": 5.2337, + "step": 10942 + }, + { + "epoch": 0.0650811209439528, + "grad_norm": 1.6905080080032349, + "learning_rate": 4.947937230122449e-05, + "loss": 5.1749, + "step": 10943 + }, + { + "epoch": 0.0650870682272338, + "grad_norm": 1.5062333345413208, + "learning_rate": 4.947927746728513e-05, + "loss": 5.2227, + "step": 10944 + }, + { + "epoch": 0.0650930155105148, + "grad_norm": 1.4318712949752808, + "learning_rate": 4.947918262480031e-05, + "loss": 5.1565, + "step": 10945 + }, + { + "epoch": 0.06509896279379579, + "grad_norm": 1.5121338367462158, + "learning_rate": 4.9479087773770055e-05, + "loss": 5.3718, + "step": 10946 + }, + { + "epoch": 0.06510491007707679, + "grad_norm": 1.2901450395584106, + "learning_rate": 4.947899291419441e-05, + "loss": 5.291, + "step": 10947 + }, + { + "epoch": 0.0651108573603578, + "grad_norm": 1.5350853204727173, + "learning_rate": 4.9478898046073394e-05, + "loss": 5.411, + "step": 10948 + }, + { + "epoch": 0.06511680464363878, + "grad_norm": 1.5083260536193848, + "learning_rate": 4.947880316940705e-05, + "loss": 4.9143, + "step": 10949 + }, + { + "epoch": 0.06512275192691978, + "grad_norm": 1.462415099143982, + "learning_rate": 4.947870828419541e-05, + "loss": 5.0059, + "step": 10950 + }, + { + "epoch": 0.06512869921020079, + "grad_norm": 1.9356911182403564, + "learning_rate": 4.947861339043851e-05, + "loss": 5.3886, + "step": 10951 + }, + { + "epoch": 0.06513464649348177, + "grad_norm": 1.4918417930603027, + "learning_rate": 4.947851848813637e-05, + "loss": 5.3456, + "step": 10952 + }, + { + "epoch": 0.06514059377676278, + "grad_norm": 1.8015687465667725, + "learning_rate": 4.9478423577289044e-05, + "loss": 5.4599, + "step": 10953 + }, + { + "epoch": 0.06514654106004378, + "grad_norm": 1.663827657699585, + "learning_rate": 4.947832865789654e-05, + "loss": 5.4448, + "step": 10954 + }, + { + "epoch": 0.06515248834332477, + "grad_norm": 1.7196985483169556, + "learning_rate": 4.947823372995891e-05, + "loss": 5.4799, + "step": 10955 + }, + { + "epoch": 0.06515843562660577, + "grad_norm": 1.341449499130249, + "learning_rate": 4.947813879347619e-05, + "loss": 5.0305, + "step": 10956 + }, + { + "epoch": 0.06516438290988677, + "grad_norm": 1.9917103052139282, + "learning_rate": 4.9478043848448394e-05, + "loss": 4.9911, + "step": 10957 + }, + { + "epoch": 0.06517033019316776, + "grad_norm": 1.8540695905685425, + "learning_rate": 4.947794889487557e-05, + "loss": 4.9725, + "step": 10958 + }, + { + "epoch": 0.06517627747644876, + "grad_norm": 1.6755226850509644, + "learning_rate": 4.9477853932757744e-05, + "loss": 5.1452, + "step": 10959 + }, + { + "epoch": 0.06518222475972976, + "grad_norm": 1.613694667816162, + "learning_rate": 4.9477758962094954e-05, + "loss": 5.1241, + "step": 10960 + }, + { + "epoch": 0.06518817204301075, + "grad_norm": 1.4891341924667358, + "learning_rate": 4.9477663982887235e-05, + "loss": 5.2139, + "step": 10961 + }, + { + "epoch": 0.06519411932629175, + "grad_norm": 1.451180100440979, + "learning_rate": 4.947756899513461e-05, + "loss": 5.216, + "step": 10962 + }, + { + "epoch": 0.06520006660957274, + "grad_norm": 1.7225643396377563, + "learning_rate": 4.947747399883712e-05, + "loss": 4.9342, + "step": 10963 + }, + { + "epoch": 0.06520601389285374, + "grad_norm": 1.5917341709136963, + "learning_rate": 4.94773789939948e-05, + "loss": 4.9196, + "step": 10964 + }, + { + "epoch": 0.06521196117613474, + "grad_norm": 1.3010936975479126, + "learning_rate": 4.947728398060768e-05, + "loss": 4.8165, + "step": 10965 + }, + { + "epoch": 0.06521790845941573, + "grad_norm": 1.6672911643981934, + "learning_rate": 4.947718895867579e-05, + "loss": 5.082, + "step": 10966 + }, + { + "epoch": 0.06522385574269673, + "grad_norm": 1.5662728548049927, + "learning_rate": 4.947709392819916e-05, + "loss": 5.1654, + "step": 10967 + }, + { + "epoch": 0.06522980302597774, + "grad_norm": 1.3455015420913696, + "learning_rate": 4.947699888917784e-05, + "loss": 4.6897, + "step": 10968 + }, + { + "epoch": 0.06523575030925872, + "grad_norm": 1.6042569875717163, + "learning_rate": 4.947690384161185e-05, + "loss": 4.6814, + "step": 10969 + }, + { + "epoch": 0.06524169759253973, + "grad_norm": 1.436345100402832, + "learning_rate": 4.947680878550123e-05, + "loss": 4.6052, + "step": 10970 + }, + { + "epoch": 0.06524764487582073, + "grad_norm": 1.3438220024108887, + "learning_rate": 4.9476713720846e-05, + "loss": 4.6385, + "step": 10971 + }, + { + "epoch": 0.06525359215910172, + "grad_norm": 1.378206729888916, + "learning_rate": 4.94766186476462e-05, + "loss": 4.5546, + "step": 10972 + }, + { + "epoch": 0.06525953944238272, + "grad_norm": 1.5776808261871338, + "learning_rate": 4.9476523565901874e-05, + "loss": 4.7728, + "step": 10973 + }, + { + "epoch": 0.06526548672566372, + "grad_norm": 1.8892265558242798, + "learning_rate": 4.947642847561305e-05, + "loss": 5.3423, + "step": 10974 + }, + { + "epoch": 0.06527143400894471, + "grad_norm": 1.279730200767517, + "learning_rate": 4.9476333376779746e-05, + "loss": 4.649, + "step": 10975 + }, + { + "epoch": 0.06527738129222571, + "grad_norm": 1.6268417835235596, + "learning_rate": 4.947623826940201e-05, + "loss": 4.6534, + "step": 10976 + }, + { + "epoch": 0.06528332857550671, + "grad_norm": 1.4456939697265625, + "learning_rate": 4.947614315347987e-05, + "loss": 4.6636, + "step": 10977 + }, + { + "epoch": 0.0652892758587877, + "grad_norm": 1.4848358631134033, + "learning_rate": 4.947604802901337e-05, + "loss": 4.6823, + "step": 10978 + }, + { + "epoch": 0.0652952231420687, + "grad_norm": 1.4143959283828735, + "learning_rate": 4.947595289600253e-05, + "loss": 4.546, + "step": 10979 + }, + { + "epoch": 0.0653011704253497, + "grad_norm": 1.7399781942367554, + "learning_rate": 4.947585775444739e-05, + "loss": 5.1456, + "step": 10980 + }, + { + "epoch": 0.0653071177086307, + "grad_norm": 1.9160579442977905, + "learning_rate": 4.947576260434797e-05, + "loss": 5.4101, + "step": 10981 + }, + { + "epoch": 0.0653130649919117, + "grad_norm": 1.9356415271759033, + "learning_rate": 4.947566744570433e-05, + "loss": 5.6235, + "step": 10982 + }, + { + "epoch": 0.0653190122751927, + "grad_norm": 1.756996512413025, + "learning_rate": 4.947557227851648e-05, + "loss": 5.6458, + "step": 10983 + }, + { + "epoch": 0.06532495955847369, + "grad_norm": 1.790447473526001, + "learning_rate": 4.947547710278446e-05, + "loss": 5.1529, + "step": 10984 + }, + { + "epoch": 0.06533090684175469, + "grad_norm": 1.8125256299972534, + "learning_rate": 4.94753819185083e-05, + "loss": 4.8824, + "step": 10985 + }, + { + "epoch": 0.06533685412503569, + "grad_norm": 1.72708261013031, + "learning_rate": 4.947528672568804e-05, + "loss": 5.1252, + "step": 10986 + }, + { + "epoch": 0.06534280140831668, + "grad_norm": 1.5867630243301392, + "learning_rate": 4.9475191524323714e-05, + "loss": 5.2007, + "step": 10987 + }, + { + "epoch": 0.06534874869159768, + "grad_norm": 1.8278383016586304, + "learning_rate": 4.9475096314415356e-05, + "loss": 5.1268, + "step": 10988 + }, + { + "epoch": 0.06535469597487868, + "grad_norm": 1.6850647926330566, + "learning_rate": 4.947500109596298e-05, + "loss": 5.0058, + "step": 10989 + }, + { + "epoch": 0.06536064325815967, + "grad_norm": 1.4993211030960083, + "learning_rate": 4.9474905868966645e-05, + "loss": 5.1911, + "step": 10990 + }, + { + "epoch": 0.06536659054144067, + "grad_norm": 1.4816709756851196, + "learning_rate": 4.947481063342637e-05, + "loss": 5.073, + "step": 10991 + }, + { + "epoch": 0.06537253782472166, + "grad_norm": 1.5394763946533203, + "learning_rate": 4.9474715389342194e-05, + "loss": 5.3133, + "step": 10992 + }, + { + "epoch": 0.06537848510800266, + "grad_norm": 1.6095061302185059, + "learning_rate": 4.9474620136714144e-05, + "loss": 5.1657, + "step": 10993 + }, + { + "epoch": 0.06538443239128366, + "grad_norm": 1.707533597946167, + "learning_rate": 4.947452487554226e-05, + "loss": 5.2022, + "step": 10994 + }, + { + "epoch": 0.06539037967456465, + "grad_norm": 1.6304863691329956, + "learning_rate": 4.947442960582657e-05, + "loss": 5.1454, + "step": 10995 + }, + { + "epoch": 0.06539632695784565, + "grad_norm": 1.5767943859100342, + "learning_rate": 4.9474334327567103e-05, + "loss": 5.0317, + "step": 10996 + }, + { + "epoch": 0.06540227424112666, + "grad_norm": 1.6779369115829468, + "learning_rate": 4.9474239040763916e-05, + "loss": 5.1932, + "step": 10997 + }, + { + "epoch": 0.06540822152440764, + "grad_norm": 1.6607457399368286, + "learning_rate": 4.947414374541701e-05, + "loss": 5.2488, + "step": 10998 + }, + { + "epoch": 0.06541416880768865, + "grad_norm": 1.5271342992782593, + "learning_rate": 4.947404844152644e-05, + "loss": 5.2225, + "step": 10999 + }, + { + "epoch": 0.06542011609096965, + "grad_norm": 1.3633404970169067, + "learning_rate": 4.947395312909223e-05, + "loss": 5.2228, + "step": 11000 + }, + { + "epoch": 0.06542606337425064, + "grad_norm": 1.4911702871322632, + "learning_rate": 4.9473857808114416e-05, + "loss": 5.3533, + "step": 11001 + }, + { + "epoch": 0.06543201065753164, + "grad_norm": 1.350714087486267, + "learning_rate": 4.947376247859303e-05, + "loss": 5.2553, + "step": 11002 + }, + { + "epoch": 0.06543795794081264, + "grad_norm": 1.531064510345459, + "learning_rate": 4.9473667140528116e-05, + "loss": 5.0982, + "step": 11003 + }, + { + "epoch": 0.06544390522409363, + "grad_norm": 1.4037193059921265, + "learning_rate": 4.947357179391968e-05, + "loss": 5.2129, + "step": 11004 + }, + { + "epoch": 0.06544985250737463, + "grad_norm": 1.5746560096740723, + "learning_rate": 4.9473476438767784e-05, + "loss": 5.2561, + "step": 11005 + }, + { + "epoch": 0.06545579979065563, + "grad_norm": 1.4906586408615112, + "learning_rate": 4.947338107507245e-05, + "loss": 5.2584, + "step": 11006 + }, + { + "epoch": 0.06546174707393662, + "grad_norm": 1.687965989112854, + "learning_rate": 4.947328570283371e-05, + "loss": 5.0578, + "step": 11007 + }, + { + "epoch": 0.06546769435721762, + "grad_norm": 1.6732810735702515, + "learning_rate": 4.94731903220516e-05, + "loss": 5.1301, + "step": 11008 + }, + { + "epoch": 0.06547364164049863, + "grad_norm": 1.465431809425354, + "learning_rate": 4.947309493272615e-05, + "loss": 5.2479, + "step": 11009 + }, + { + "epoch": 0.06547958892377961, + "grad_norm": 1.4699040651321411, + "learning_rate": 4.94729995348574e-05, + "loss": 5.263, + "step": 11010 + }, + { + "epoch": 0.06548553620706062, + "grad_norm": 1.5757801532745361, + "learning_rate": 4.947290412844537e-05, + "loss": 5.2938, + "step": 11011 + }, + { + "epoch": 0.06549148349034162, + "grad_norm": 1.5458070039749146, + "learning_rate": 4.947280871349011e-05, + "loss": 5.2755, + "step": 11012 + }, + { + "epoch": 0.0654974307736226, + "grad_norm": 1.4919404983520508, + "learning_rate": 4.9472713289991644e-05, + "loss": 5.1432, + "step": 11013 + }, + { + "epoch": 0.06550337805690361, + "grad_norm": 1.513539433479309, + "learning_rate": 4.947261785795001e-05, + "loss": 5.3262, + "step": 11014 + }, + { + "epoch": 0.06550932534018461, + "grad_norm": 1.610257863998413, + "learning_rate": 4.947252241736523e-05, + "loss": 5.1444, + "step": 11015 + }, + { + "epoch": 0.0655152726234656, + "grad_norm": 1.5597975254058838, + "learning_rate": 4.947242696823735e-05, + "loss": 5.1581, + "step": 11016 + }, + { + "epoch": 0.0655212199067466, + "grad_norm": 1.686418056488037, + "learning_rate": 4.94723315105664e-05, + "loss": 5.1608, + "step": 11017 + }, + { + "epoch": 0.0655271671900276, + "grad_norm": 1.5329445600509644, + "learning_rate": 4.94722360443524e-05, + "loss": 5.1716, + "step": 11018 + }, + { + "epoch": 0.06553311447330859, + "grad_norm": 1.4718917608261108, + "learning_rate": 4.94721405695954e-05, + "loss": 5.0924, + "step": 11019 + }, + { + "epoch": 0.06553906175658959, + "grad_norm": 1.4442907571792603, + "learning_rate": 4.947204508629544e-05, + "loss": 5.3967, + "step": 11020 + }, + { + "epoch": 0.06554500903987058, + "grad_norm": 1.523834466934204, + "learning_rate": 4.947194959445253e-05, + "loss": 5.2068, + "step": 11021 + }, + { + "epoch": 0.06555095632315158, + "grad_norm": 1.4898262023925781, + "learning_rate": 4.947185409406672e-05, + "loss": 5.1664, + "step": 11022 + }, + { + "epoch": 0.06555690360643258, + "grad_norm": 1.504695177078247, + "learning_rate": 4.947175858513804e-05, + "loss": 5.2349, + "step": 11023 + }, + { + "epoch": 0.06556285088971357, + "grad_norm": 1.3538787364959717, + "learning_rate": 4.9471663067666516e-05, + "loss": 5.1034, + "step": 11024 + }, + { + "epoch": 0.06556879817299457, + "grad_norm": 1.3748440742492676, + "learning_rate": 4.94715675416522e-05, + "loss": 4.9759, + "step": 11025 + }, + { + "epoch": 0.06557474545627558, + "grad_norm": 1.5980280637741089, + "learning_rate": 4.94714720070951e-05, + "loss": 5.3042, + "step": 11026 + }, + { + "epoch": 0.06558069273955656, + "grad_norm": 1.641076683998108, + "learning_rate": 4.9471376463995266e-05, + "loss": 5.3373, + "step": 11027 + }, + { + "epoch": 0.06558664002283757, + "grad_norm": 1.5320390462875366, + "learning_rate": 4.947128091235273e-05, + "loss": 5.2308, + "step": 11028 + }, + { + "epoch": 0.06559258730611857, + "grad_norm": 1.5777555704116821, + "learning_rate": 4.9471185352167514e-05, + "loss": 5.2242, + "step": 11029 + }, + { + "epoch": 0.06559853458939956, + "grad_norm": 1.5055029392242432, + "learning_rate": 4.947108978343967e-05, + "loss": 5.1974, + "step": 11030 + }, + { + "epoch": 0.06560448187268056, + "grad_norm": 1.3923927545547485, + "learning_rate": 4.947099420616922e-05, + "loss": 5.3244, + "step": 11031 + }, + { + "epoch": 0.06561042915596156, + "grad_norm": 1.40999174118042, + "learning_rate": 4.9470898620356186e-05, + "loss": 5.3315, + "step": 11032 + }, + { + "epoch": 0.06561637643924255, + "grad_norm": 1.418296456336975, + "learning_rate": 4.947080302600063e-05, + "loss": 5.3942, + "step": 11033 + }, + { + "epoch": 0.06562232372252355, + "grad_norm": 1.7927478551864624, + "learning_rate": 4.9470707423102566e-05, + "loss": 5.3084, + "step": 11034 + }, + { + "epoch": 0.06562827100580455, + "grad_norm": 1.385011911392212, + "learning_rate": 4.947061181166203e-05, + "loss": 5.2043, + "step": 11035 + }, + { + "epoch": 0.06563421828908554, + "grad_norm": 1.5702954530715942, + "learning_rate": 4.9470516191679054e-05, + "loss": 5.9851, + "step": 11036 + }, + { + "epoch": 0.06564016557236654, + "grad_norm": 1.4196525812149048, + "learning_rate": 4.947042056315367e-05, + "loss": 5.2592, + "step": 11037 + }, + { + "epoch": 0.06564611285564755, + "grad_norm": 1.8318798542022705, + "learning_rate": 4.947032492608592e-05, + "loss": 5.3181, + "step": 11038 + }, + { + "epoch": 0.06565206013892853, + "grad_norm": 1.615460991859436, + "learning_rate": 4.947022928047583e-05, + "loss": 5.4053, + "step": 11039 + }, + { + "epoch": 0.06565800742220954, + "grad_norm": 1.384602427482605, + "learning_rate": 4.947013362632344e-05, + "loss": 5.3955, + "step": 11040 + }, + { + "epoch": 0.06566395470549054, + "grad_norm": 1.5959913730621338, + "learning_rate": 4.947003796362878e-05, + "loss": 5.4737, + "step": 11041 + }, + { + "epoch": 0.06566990198877153, + "grad_norm": 1.483659029006958, + "learning_rate": 4.946994229239188e-05, + "loss": 5.3804, + "step": 11042 + }, + { + "epoch": 0.06567584927205253, + "grad_norm": 1.2752004861831665, + "learning_rate": 4.946984661261277e-05, + "loss": 5.3806, + "step": 11043 + }, + { + "epoch": 0.06568179655533353, + "grad_norm": 2.0671582221984863, + "learning_rate": 4.946975092429149e-05, + "loss": 5.3047, + "step": 11044 + }, + { + "epoch": 0.06568774383861452, + "grad_norm": 1.6126081943511963, + "learning_rate": 4.946965522742808e-05, + "loss": 5.1905, + "step": 11045 + }, + { + "epoch": 0.06569369112189552, + "grad_norm": 1.6867598295211792, + "learning_rate": 4.946955952202257e-05, + "loss": 5.1543, + "step": 11046 + }, + { + "epoch": 0.06569963840517652, + "grad_norm": 1.3493974208831787, + "learning_rate": 4.946946380807498e-05, + "loss": 5.1527, + "step": 11047 + }, + { + "epoch": 0.06570558568845751, + "grad_norm": 1.4694898128509521, + "learning_rate": 4.946936808558536e-05, + "loss": 5.238, + "step": 11048 + }, + { + "epoch": 0.06571153297173851, + "grad_norm": 1.7940189838409424, + "learning_rate": 4.946927235455373e-05, + "loss": 5.0666, + "step": 11049 + }, + { + "epoch": 0.0657174802550195, + "grad_norm": 1.7015198469161987, + "learning_rate": 4.946917661498013e-05, + "loss": 5.5182, + "step": 11050 + }, + { + "epoch": 0.0657234275383005, + "grad_norm": 2.214686632156372, + "learning_rate": 4.946908086686459e-05, + "loss": 5.9424, + "step": 11051 + }, + { + "epoch": 0.0657293748215815, + "grad_norm": 1.7855008840560913, + "learning_rate": 4.9468985110207154e-05, + "loss": 5.8496, + "step": 11052 + }, + { + "epoch": 0.06573532210486249, + "grad_norm": 1.8354082107543945, + "learning_rate": 4.946888934500785e-05, + "loss": 5.8044, + "step": 11053 + }, + { + "epoch": 0.0657412693881435, + "grad_norm": 2.0321154594421387, + "learning_rate": 4.9468793571266705e-05, + "loss": 5.9488, + "step": 11054 + }, + { + "epoch": 0.0657472166714245, + "grad_norm": 2.2285213470458984, + "learning_rate": 4.946869778898376e-05, + "loss": 5.1819, + "step": 11055 + }, + { + "epoch": 0.06575316395470548, + "grad_norm": 1.9831287860870361, + "learning_rate": 4.946860199815904e-05, + "loss": 5.2068, + "step": 11056 + }, + { + "epoch": 0.06575911123798649, + "grad_norm": 2.1150667667388916, + "learning_rate": 4.946850619879259e-05, + "loss": 5.1523, + "step": 11057 + }, + { + "epoch": 0.06576505852126749, + "grad_norm": 1.9136968851089478, + "learning_rate": 4.946841039088444e-05, + "loss": 5.0084, + "step": 11058 + }, + { + "epoch": 0.06577100580454848, + "grad_norm": 1.9802511930465698, + "learning_rate": 4.9468314574434604e-05, + "loss": 4.9223, + "step": 11059 + }, + { + "epoch": 0.06577695308782948, + "grad_norm": 1.940656065940857, + "learning_rate": 4.946821874944315e-05, + "loss": 4.9662, + "step": 11060 + }, + { + "epoch": 0.06578290037111048, + "grad_norm": 1.8476706743240356, + "learning_rate": 4.9468122915910084e-05, + "loss": 4.8863, + "step": 11061 + }, + { + "epoch": 0.06578884765439147, + "grad_norm": 2.0490243434906006, + "learning_rate": 4.946802707383546e-05, + "loss": 4.8459, + "step": 11062 + }, + { + "epoch": 0.06579479493767247, + "grad_norm": 1.8996137380599976, + "learning_rate": 4.946793122321928e-05, + "loss": 4.7574, + "step": 11063 + }, + { + "epoch": 0.06580074222095347, + "grad_norm": 1.8910033702850342, + "learning_rate": 4.946783536406161e-05, + "loss": 4.8808, + "step": 11064 + }, + { + "epoch": 0.06580668950423446, + "grad_norm": 2.123816967010498, + "learning_rate": 4.946773949636247e-05, + "loss": 4.8486, + "step": 11065 + }, + { + "epoch": 0.06581263678751546, + "grad_norm": 1.7508260011672974, + "learning_rate": 4.9467643620121906e-05, + "loss": 4.9856, + "step": 11066 + }, + { + "epoch": 0.06581858407079647, + "grad_norm": 1.728398084640503, + "learning_rate": 4.9467547735339926e-05, + "loss": 4.9634, + "step": 11067 + }, + { + "epoch": 0.06582453135407745, + "grad_norm": 2.1020689010620117, + "learning_rate": 4.946745184201659e-05, + "loss": 4.6133, + "step": 11068 + }, + { + "epoch": 0.06583047863735846, + "grad_norm": 2.106549024581909, + "learning_rate": 4.9467355940151904e-05, + "loss": 4.7124, + "step": 11069 + }, + { + "epoch": 0.06583642592063946, + "grad_norm": 2.078505039215088, + "learning_rate": 4.9467260029745924e-05, + "loss": 4.5828, + "step": 11070 + }, + { + "epoch": 0.06584237320392045, + "grad_norm": 1.987950325012207, + "learning_rate": 4.946716411079868e-05, + "loss": 4.5823, + "step": 11071 + }, + { + "epoch": 0.06584832048720145, + "grad_norm": 1.9027208089828491, + "learning_rate": 4.94670681833102e-05, + "loss": 4.8063, + "step": 11072 + }, + { + "epoch": 0.06585426777048245, + "grad_norm": 2.001823902130127, + "learning_rate": 4.946697224728052e-05, + "loss": 4.5405, + "step": 11073 + }, + { + "epoch": 0.06586021505376344, + "grad_norm": 2.1472394466400146, + "learning_rate": 4.946687630270967e-05, + "loss": 4.6565, + "step": 11074 + }, + { + "epoch": 0.06586616233704444, + "grad_norm": 2.0731146335601807, + "learning_rate": 4.946678034959769e-05, + "loss": 4.5022, + "step": 11075 + }, + { + "epoch": 0.06587210962032544, + "grad_norm": 2.0769810676574707, + "learning_rate": 4.946668438794461e-05, + "loss": 4.5248, + "step": 11076 + }, + { + "epoch": 0.06587805690360643, + "grad_norm": 2.183871269226074, + "learning_rate": 4.946658841775046e-05, + "loss": 4.5723, + "step": 11077 + }, + { + "epoch": 0.06588400418688743, + "grad_norm": 2.0304160118103027, + "learning_rate": 4.9466492439015275e-05, + "loss": 4.5928, + "step": 11078 + }, + { + "epoch": 0.06588995147016842, + "grad_norm": 1.9167170524597168, + "learning_rate": 4.94663964517391e-05, + "loss": 4.4162, + "step": 11079 + }, + { + "epoch": 0.06589589875344942, + "grad_norm": 2.1295299530029297, + "learning_rate": 4.9466300455921946e-05, + "loss": 4.6662, + "step": 11080 + }, + { + "epoch": 0.06590184603673042, + "grad_norm": 2.180253744125366, + "learning_rate": 4.946620445156386e-05, + "loss": 4.5101, + "step": 11081 + }, + { + "epoch": 0.06590779332001141, + "grad_norm": 1.887289047241211, + "learning_rate": 4.9466108438664885e-05, + "loss": 4.3611, + "step": 11082 + }, + { + "epoch": 0.06591374060329241, + "grad_norm": 1.8323948383331299, + "learning_rate": 4.946601241722504e-05, + "loss": 4.8711, + "step": 11083 + }, + { + "epoch": 0.06591968788657342, + "grad_norm": 1.944860577583313, + "learning_rate": 4.946591638724436e-05, + "loss": 4.5288, + "step": 11084 + }, + { + "epoch": 0.0659256351698544, + "grad_norm": 1.9748528003692627, + "learning_rate": 4.946582034872288e-05, + "loss": 4.3819, + "step": 11085 + }, + { + "epoch": 0.0659315824531354, + "grad_norm": 2.017582416534424, + "learning_rate": 4.9465724301660635e-05, + "loss": 4.4508, + "step": 11086 + }, + { + "epoch": 0.06593752973641641, + "grad_norm": 1.8043986558914185, + "learning_rate": 4.946562824605766e-05, + "loss": 4.5948, + "step": 11087 + }, + { + "epoch": 0.0659434770196974, + "grad_norm": 1.8695666790008545, + "learning_rate": 4.946553218191399e-05, + "loss": 4.2691, + "step": 11088 + }, + { + "epoch": 0.0659494243029784, + "grad_norm": 2.027717351913452, + "learning_rate": 4.9465436109229656e-05, + "loss": 4.4152, + "step": 11089 + }, + { + "epoch": 0.0659553715862594, + "grad_norm": 1.989127278327942, + "learning_rate": 4.946534002800469e-05, + "loss": 4.5155, + "step": 11090 + }, + { + "epoch": 0.06596131886954039, + "grad_norm": 1.9889907836914062, + "learning_rate": 4.9465243938239124e-05, + "loss": 4.4047, + "step": 11091 + }, + { + "epoch": 0.06596726615282139, + "grad_norm": 2.077021837234497, + "learning_rate": 4.946514783993299e-05, + "loss": 4.5199, + "step": 11092 + }, + { + "epoch": 0.0659732134361024, + "grad_norm": 1.9180271625518799, + "learning_rate": 4.946505173308633e-05, + "loss": 4.4511, + "step": 11093 + }, + { + "epoch": 0.06597916071938338, + "grad_norm": 2.120338201522827, + "learning_rate": 4.946495561769918e-05, + "loss": 4.3034, + "step": 11094 + }, + { + "epoch": 0.06598510800266438, + "grad_norm": 1.9632322788238525, + "learning_rate": 4.946485949377156e-05, + "loss": 5.2411, + "step": 11095 + }, + { + "epoch": 0.06599105528594539, + "grad_norm": 2.0921249389648438, + "learning_rate": 4.946476336130351e-05, + "loss": 4.5768, + "step": 11096 + }, + { + "epoch": 0.06599700256922637, + "grad_norm": 2.1472532749176025, + "learning_rate": 4.9464667220295066e-05, + "loss": 4.6279, + "step": 11097 + }, + { + "epoch": 0.06600294985250738, + "grad_norm": 2.472062349319458, + "learning_rate": 4.946457107074626e-05, + "loss": 5.703, + "step": 11098 + }, + { + "epoch": 0.06600889713578838, + "grad_norm": 1.8995217084884644, + "learning_rate": 4.946447491265712e-05, + "loss": 4.5265, + "step": 11099 + }, + { + "epoch": 0.06601484441906937, + "grad_norm": 2.173339605331421, + "learning_rate": 4.946437874602769e-05, + "loss": 4.5356, + "step": 11100 + }, + { + "epoch": 0.06602079170235037, + "grad_norm": 1.8179867267608643, + "learning_rate": 4.9464282570858e-05, + "loss": 4.3765, + "step": 11101 + }, + { + "epoch": 0.06602673898563137, + "grad_norm": 2.367713212966919, + "learning_rate": 4.946418638714808e-05, + "loss": 5.6831, + "step": 11102 + }, + { + "epoch": 0.06603268626891236, + "grad_norm": 2.3576571941375732, + "learning_rate": 4.9464090194897964e-05, + "loss": 5.563, + "step": 11103 + }, + { + "epoch": 0.06603863355219336, + "grad_norm": 2.0476090908050537, + "learning_rate": 4.946399399410768e-05, + "loss": 5.7503, + "step": 11104 + }, + { + "epoch": 0.06604458083547436, + "grad_norm": 2.104295253753662, + "learning_rate": 4.946389778477728e-05, + "loss": 5.669, + "step": 11105 + }, + { + "epoch": 0.06605052811875535, + "grad_norm": 2.1458580493927, + "learning_rate": 4.946380156690677e-05, + "loss": 5.5317, + "step": 11106 + }, + { + "epoch": 0.06605647540203635, + "grad_norm": 2.0373425483703613, + "learning_rate": 4.946370534049621e-05, + "loss": 5.5952, + "step": 11107 + }, + { + "epoch": 0.06606242268531734, + "grad_norm": 2.232574701309204, + "learning_rate": 4.946360910554563e-05, + "loss": 5.6076, + "step": 11108 + }, + { + "epoch": 0.06606836996859834, + "grad_norm": 2.1477861404418945, + "learning_rate": 4.946351286205505e-05, + "loss": 5.5862, + "step": 11109 + }, + { + "epoch": 0.06607431725187934, + "grad_norm": 2.105203866958618, + "learning_rate": 4.946341661002451e-05, + "loss": 5.5089, + "step": 11110 + }, + { + "epoch": 0.06608026453516033, + "grad_norm": 2.1524410247802734, + "learning_rate": 4.9463320349454047e-05, + "loss": 5.419, + "step": 11111 + }, + { + "epoch": 0.06608621181844133, + "grad_norm": 2.132504463195801, + "learning_rate": 4.946322408034369e-05, + "loss": 5.3421, + "step": 11112 + }, + { + "epoch": 0.06609215910172234, + "grad_norm": 1.7870386838912964, + "learning_rate": 4.9463127802693474e-05, + "loss": 5.1829, + "step": 11113 + }, + { + "epoch": 0.06609810638500332, + "grad_norm": 1.9586358070373535, + "learning_rate": 4.946303151650343e-05, + "loss": 5.228, + "step": 11114 + }, + { + "epoch": 0.06610405366828433, + "grad_norm": 2.092473030090332, + "learning_rate": 4.9462935221773594e-05, + "loss": 5.4616, + "step": 11115 + }, + { + "epoch": 0.06611000095156533, + "grad_norm": 2.204131603240967, + "learning_rate": 4.946283891850401e-05, + "loss": 5.4552, + "step": 11116 + }, + { + "epoch": 0.06611594823484632, + "grad_norm": 1.998795747756958, + "learning_rate": 4.946274260669469e-05, + "loss": 5.5193, + "step": 11117 + }, + { + "epoch": 0.06612189551812732, + "grad_norm": 1.9446638822555542, + "learning_rate": 4.9462646286345684e-05, + "loss": 5.3923, + "step": 11118 + }, + { + "epoch": 0.06612784280140832, + "grad_norm": 1.828114628791809, + "learning_rate": 4.946254995745702e-05, + "loss": 5.4306, + "step": 11119 + }, + { + "epoch": 0.06613379008468931, + "grad_norm": 2.1322944164276123, + "learning_rate": 4.946245362002873e-05, + "loss": 5.3831, + "step": 11120 + }, + { + "epoch": 0.06613973736797031, + "grad_norm": 2.1194324493408203, + "learning_rate": 4.9462357274060856e-05, + "loss": 5.2805, + "step": 11121 + }, + { + "epoch": 0.06614568465125131, + "grad_norm": 2.011417865753174, + "learning_rate": 4.946226091955342e-05, + "loss": 5.3052, + "step": 11122 + }, + { + "epoch": 0.0661516319345323, + "grad_norm": 2.202887773513794, + "learning_rate": 4.9462164556506464e-05, + "loss": 5.5263, + "step": 11123 + }, + { + "epoch": 0.0661575792178133, + "grad_norm": 2.075645685195923, + "learning_rate": 4.946206818492002e-05, + "loss": 5.1033, + "step": 11124 + }, + { + "epoch": 0.0661635265010943, + "grad_norm": 2.0723443031311035, + "learning_rate": 4.946197180479412e-05, + "loss": 4.8365, + "step": 11125 + }, + { + "epoch": 0.0661694737843753, + "grad_norm": 2.245961904525757, + "learning_rate": 4.94618754161288e-05, + "loss": 5.0123, + "step": 11126 + }, + { + "epoch": 0.0661754210676563, + "grad_norm": 2.0513699054718018, + "learning_rate": 4.9461779018924096e-05, + "loss": 4.9909, + "step": 11127 + }, + { + "epoch": 0.0661813683509373, + "grad_norm": 2.1552181243896484, + "learning_rate": 4.9461682613180024e-05, + "loss": 5.165, + "step": 11128 + }, + { + "epoch": 0.06618731563421829, + "grad_norm": 2.1207263469696045, + "learning_rate": 4.946158619889664e-05, + "loss": 5.3254, + "step": 11129 + }, + { + "epoch": 0.06619326291749929, + "grad_norm": 1.8278319835662842, + "learning_rate": 4.946148977607397e-05, + "loss": 5.2462, + "step": 11130 + }, + { + "epoch": 0.06619921020078029, + "grad_norm": 2.434661865234375, + "learning_rate": 4.9461393344712046e-05, + "loss": 5.28, + "step": 11131 + }, + { + "epoch": 0.06620515748406128, + "grad_norm": 2.3434953689575195, + "learning_rate": 4.9461296904810904e-05, + "loss": 5.112, + "step": 11132 + }, + { + "epoch": 0.06621110476734228, + "grad_norm": 2.010430335998535, + "learning_rate": 4.946120045637057e-05, + "loss": 5.1236, + "step": 11133 + }, + { + "epoch": 0.06621705205062328, + "grad_norm": 2.19608736038208, + "learning_rate": 4.946110399939109e-05, + "loss": 5.122, + "step": 11134 + }, + { + "epoch": 0.06622299933390427, + "grad_norm": 1.9471449851989746, + "learning_rate": 4.946100753387249e-05, + "loss": 5.2849, + "step": 11135 + }, + { + "epoch": 0.06622894661718527, + "grad_norm": 2.0541727542877197, + "learning_rate": 4.94609110598148e-05, + "loss": 5.4196, + "step": 11136 + }, + { + "epoch": 0.06623489390046626, + "grad_norm": 2.268826723098755, + "learning_rate": 4.946081457721806e-05, + "loss": 5.449, + "step": 11137 + }, + { + "epoch": 0.06624084118374726, + "grad_norm": 2.075227975845337, + "learning_rate": 4.9460718086082307e-05, + "loss": 5.5463, + "step": 11138 + }, + { + "epoch": 0.06624678846702826, + "grad_norm": 2.0949649810791016, + "learning_rate": 4.9460621586407567e-05, + "loss": 5.3737, + "step": 11139 + }, + { + "epoch": 0.06625273575030925, + "grad_norm": 2.1247878074645996, + "learning_rate": 4.9460525078193874e-05, + "loss": 5.2766, + "step": 11140 + }, + { + "epoch": 0.06625868303359025, + "grad_norm": 1.8304489850997925, + "learning_rate": 4.9460428561441276e-05, + "loss": 5.181, + "step": 11141 + }, + { + "epoch": 0.06626463031687126, + "grad_norm": 2.160853862762451, + "learning_rate": 4.946033203614978e-05, + "loss": 5.5222, + "step": 11142 + }, + { + "epoch": 0.06627057760015224, + "grad_norm": 1.9857962131500244, + "learning_rate": 4.9460235502319446e-05, + "loss": 5.574, + "step": 11143 + }, + { + "epoch": 0.06627652488343325, + "grad_norm": 2.016709804534912, + "learning_rate": 4.9460138959950294e-05, + "loss": 5.5255, + "step": 11144 + }, + { + "epoch": 0.06628247216671425, + "grad_norm": 1.8675861358642578, + "learning_rate": 4.946004240904235e-05, + "loss": 5.3604, + "step": 11145 + }, + { + "epoch": 0.06628841944999524, + "grad_norm": 1.9159897565841675, + "learning_rate": 4.945994584959567e-05, + "loss": 5.5348, + "step": 11146 + }, + { + "epoch": 0.06629436673327624, + "grad_norm": 2.0460150241851807, + "learning_rate": 4.945984928161027e-05, + "loss": 5.3267, + "step": 11147 + }, + { + "epoch": 0.06630031401655724, + "grad_norm": 1.8361427783966064, + "learning_rate": 4.9459752705086196e-05, + "loss": 5.3309, + "step": 11148 + }, + { + "epoch": 0.06630626129983823, + "grad_norm": 1.5448495149612427, + "learning_rate": 4.945965612002347e-05, + "loss": 5.0789, + "step": 11149 + }, + { + "epoch": 0.06631220858311923, + "grad_norm": 1.4580925703048706, + "learning_rate": 4.9459559526422125e-05, + "loss": 5.2011, + "step": 11150 + }, + { + "epoch": 0.06631815586640023, + "grad_norm": 1.606593370437622, + "learning_rate": 4.945946292428221e-05, + "loss": 5.2061, + "step": 11151 + }, + { + "epoch": 0.06632410314968122, + "grad_norm": 1.4270994663238525, + "learning_rate": 4.945936631360375e-05, + "loss": 5.089, + "step": 11152 + }, + { + "epoch": 0.06633005043296222, + "grad_norm": 1.6082873344421387, + "learning_rate": 4.9459269694386766e-05, + "loss": 5.2502, + "step": 11153 + }, + { + "epoch": 0.06633599771624323, + "grad_norm": 1.5378412008285522, + "learning_rate": 4.945917306663131e-05, + "loss": 5.4431, + "step": 11154 + }, + { + "epoch": 0.06634194499952421, + "grad_norm": 1.2726879119873047, + "learning_rate": 4.9459076430337416e-05, + "loss": 5.4568, + "step": 11155 + }, + { + "epoch": 0.06634789228280522, + "grad_norm": 1.6131432056427002, + "learning_rate": 4.94589797855051e-05, + "loss": 5.2507, + "step": 11156 + }, + { + "epoch": 0.06635383956608622, + "grad_norm": 1.5835362672805786, + "learning_rate": 4.945888313213442e-05, + "loss": 5.1122, + "step": 11157 + }, + { + "epoch": 0.0663597868493672, + "grad_norm": 1.5903444290161133, + "learning_rate": 4.945878647022539e-05, + "loss": 5.3236, + "step": 11158 + }, + { + "epoch": 0.06636573413264821, + "grad_norm": 1.7948551177978516, + "learning_rate": 4.945868979977805e-05, + "loss": 5.5939, + "step": 11159 + }, + { + "epoch": 0.06637168141592921, + "grad_norm": 2.1183457374572754, + "learning_rate": 4.945859312079243e-05, + "loss": 5.3639, + "step": 11160 + }, + { + "epoch": 0.0663776286992102, + "grad_norm": 1.5584137439727783, + "learning_rate": 4.945849643326857e-05, + "loss": 5.4302, + "step": 11161 + }, + { + "epoch": 0.0663835759824912, + "grad_norm": 1.5150829553604126, + "learning_rate": 4.9458399737206504e-05, + "loss": 5.2485, + "step": 11162 + }, + { + "epoch": 0.0663895232657722, + "grad_norm": 1.421235203742981, + "learning_rate": 4.9458303032606264e-05, + "loss": 5.2149, + "step": 11163 + }, + { + "epoch": 0.06639547054905319, + "grad_norm": 1.640207052230835, + "learning_rate": 4.945820631946788e-05, + "loss": 5.2807, + "step": 11164 + }, + { + "epoch": 0.06640141783233419, + "grad_norm": 1.5021215677261353, + "learning_rate": 4.945810959779139e-05, + "loss": 5.3684, + "step": 11165 + }, + { + "epoch": 0.06640736511561518, + "grad_norm": 1.802828073501587, + "learning_rate": 4.945801286757682e-05, + "loss": 5.2153, + "step": 11166 + }, + { + "epoch": 0.06641331239889618, + "grad_norm": 1.556386947631836, + "learning_rate": 4.945791612882422e-05, + "loss": 5.1908, + "step": 11167 + }, + { + "epoch": 0.06641925968217718, + "grad_norm": 1.5906118154525757, + "learning_rate": 4.9457819381533616e-05, + "loss": 5.2183, + "step": 11168 + }, + { + "epoch": 0.06642520696545817, + "grad_norm": 1.5778700113296509, + "learning_rate": 4.945772262570503e-05, + "loss": 5.2465, + "step": 11169 + }, + { + "epoch": 0.06643115424873917, + "grad_norm": 1.4705984592437744, + "learning_rate": 4.945762586133852e-05, + "loss": 5.1496, + "step": 11170 + }, + { + "epoch": 0.06643710153202018, + "grad_norm": 1.5118781328201294, + "learning_rate": 4.9457529088434093e-05, + "loss": 5.1764, + "step": 11171 + }, + { + "epoch": 0.06644304881530116, + "grad_norm": 1.5784192085266113, + "learning_rate": 4.94574323069918e-05, + "loss": 5.165, + "step": 11172 + }, + { + "epoch": 0.06644899609858217, + "grad_norm": 1.517220139503479, + "learning_rate": 4.9457335517011666e-05, + "loss": 5.1718, + "step": 11173 + }, + { + "epoch": 0.06645494338186317, + "grad_norm": 1.3823192119598389, + "learning_rate": 4.9457238718493734e-05, + "loss": 5.1945, + "step": 11174 + }, + { + "epoch": 0.06646089066514416, + "grad_norm": 1.4499212503433228, + "learning_rate": 4.945714191143803e-05, + "loss": 5.1044, + "step": 11175 + }, + { + "epoch": 0.06646683794842516, + "grad_norm": 1.4904807806015015, + "learning_rate": 4.945704509584459e-05, + "loss": 5.1781, + "step": 11176 + }, + { + "epoch": 0.06647278523170616, + "grad_norm": 1.6798325777053833, + "learning_rate": 4.945694827171345e-05, + "loss": 4.8879, + "step": 11177 + }, + { + "epoch": 0.06647873251498715, + "grad_norm": 1.3890799283981323, + "learning_rate": 4.945685143904464e-05, + "loss": 4.9941, + "step": 11178 + }, + { + "epoch": 0.06648467979826815, + "grad_norm": 1.4167201519012451, + "learning_rate": 4.94567545978382e-05, + "loss": 5.016, + "step": 11179 + }, + { + "epoch": 0.06649062708154915, + "grad_norm": 1.5122467279434204, + "learning_rate": 4.9456657748094145e-05, + "loss": 4.9937, + "step": 11180 + }, + { + "epoch": 0.06649657436483014, + "grad_norm": 1.4347165822982788, + "learning_rate": 4.9456560889812543e-05, + "loss": 5.0486, + "step": 11181 + }, + { + "epoch": 0.06650252164811114, + "grad_norm": 1.6328964233398438, + "learning_rate": 4.94564640229934e-05, + "loss": 5.1891, + "step": 11182 + }, + { + "epoch": 0.06650846893139215, + "grad_norm": 1.5832617282867432, + "learning_rate": 4.9456367147636765e-05, + "loss": 5.2947, + "step": 11183 + }, + { + "epoch": 0.06651441621467313, + "grad_norm": 1.6932839155197144, + "learning_rate": 4.9456270263742655e-05, + "loss": 5.0755, + "step": 11184 + }, + { + "epoch": 0.06652036349795414, + "grad_norm": 1.6238216161727905, + "learning_rate": 4.945617337131111e-05, + "loss": 5.1903, + "step": 11185 + }, + { + "epoch": 0.06652631078123514, + "grad_norm": 2.362353801727295, + "learning_rate": 4.945607647034218e-05, + "loss": 5.3641, + "step": 11186 + }, + { + "epoch": 0.06653225806451613, + "grad_norm": 1.6447978019714355, + "learning_rate": 4.9455979560835874e-05, + "loss": 5.0174, + "step": 11187 + }, + { + "epoch": 0.06653820534779713, + "grad_norm": 1.6059958934783936, + "learning_rate": 4.945588264279225e-05, + "loss": 4.884, + "step": 11188 + }, + { + "epoch": 0.06654415263107813, + "grad_norm": 1.6291608810424805, + "learning_rate": 4.9455785716211325e-05, + "loss": 4.9735, + "step": 11189 + }, + { + "epoch": 0.06655009991435912, + "grad_norm": 1.6926389932632446, + "learning_rate": 4.9455688781093135e-05, + "loss": 4.9294, + "step": 11190 + }, + { + "epoch": 0.06655604719764012, + "grad_norm": 1.5816938877105713, + "learning_rate": 4.945559183743772e-05, + "loss": 4.9161, + "step": 11191 + }, + { + "epoch": 0.06656199448092112, + "grad_norm": 1.5514836311340332, + "learning_rate": 4.9455494885245115e-05, + "loss": 4.9102, + "step": 11192 + }, + { + "epoch": 0.06656794176420211, + "grad_norm": 1.6787114143371582, + "learning_rate": 4.9455397924515346e-05, + "loss": 4.9628, + "step": 11193 + }, + { + "epoch": 0.06657388904748311, + "grad_norm": 1.5264941453933716, + "learning_rate": 4.945530095524844e-05, + "loss": 5.1685, + "step": 11194 + }, + { + "epoch": 0.06657983633076411, + "grad_norm": 1.80072820186615, + "learning_rate": 4.945520397744445e-05, + "loss": 4.8308, + "step": 11195 + }, + { + "epoch": 0.0665857836140451, + "grad_norm": 1.7497553825378418, + "learning_rate": 4.945510699110341e-05, + "loss": 4.8846, + "step": 11196 + }, + { + "epoch": 0.0665917308973261, + "grad_norm": 1.8938134908676147, + "learning_rate": 4.945500999622533e-05, + "loss": 4.8303, + "step": 11197 + }, + { + "epoch": 0.06659767818060709, + "grad_norm": 1.7286055088043213, + "learning_rate": 4.9454912992810264e-05, + "loss": 4.7686, + "step": 11198 + }, + { + "epoch": 0.0666036254638881, + "grad_norm": 1.7573840618133545, + "learning_rate": 4.945481598085824e-05, + "loss": 4.7527, + "step": 11199 + }, + { + "epoch": 0.0666095727471691, + "grad_norm": 1.9013001918792725, + "learning_rate": 4.94547189603693e-05, + "loss": 5.0987, + "step": 11200 + }, + { + "epoch": 0.06661552003045008, + "grad_norm": 1.5453308820724487, + "learning_rate": 4.945462193134346e-05, + "loss": 5.3799, + "step": 11201 + }, + { + "epoch": 0.06662146731373109, + "grad_norm": 1.763839602470398, + "learning_rate": 4.945452489378076e-05, + "loss": 5.2904, + "step": 11202 + }, + { + "epoch": 0.06662741459701209, + "grad_norm": 1.650407075881958, + "learning_rate": 4.945442784768125e-05, + "loss": 5.3007, + "step": 11203 + }, + { + "epoch": 0.06663336188029308, + "grad_norm": 1.6620690822601318, + "learning_rate": 4.945433079304495e-05, + "loss": 5.394, + "step": 11204 + }, + { + "epoch": 0.06663930916357408, + "grad_norm": 1.5000416040420532, + "learning_rate": 4.945423372987189e-05, + "loss": 5.0648, + "step": 11205 + }, + { + "epoch": 0.06664525644685508, + "grad_norm": 2.1791460514068604, + "learning_rate": 4.945413665816211e-05, + "loss": 5.5261, + "step": 11206 + }, + { + "epoch": 0.06665120373013607, + "grad_norm": 2.084258556365967, + "learning_rate": 4.945403957791565e-05, + "loss": 5.5796, + "step": 11207 + }, + { + "epoch": 0.06665715101341707, + "grad_norm": 1.9391356706619263, + "learning_rate": 4.945394248913253e-05, + "loss": 5.4855, + "step": 11208 + }, + { + "epoch": 0.06666309829669807, + "grad_norm": 1.8323030471801758, + "learning_rate": 4.9453845391812803e-05, + "loss": 5.5711, + "step": 11209 + }, + { + "epoch": 0.06666904557997906, + "grad_norm": 1.9193792343139648, + "learning_rate": 4.945374828595648e-05, + "loss": 5.2585, + "step": 11210 + }, + { + "epoch": 0.06667499286326006, + "grad_norm": 1.7111014127731323, + "learning_rate": 4.9453651171563606e-05, + "loss": 5.1965, + "step": 11211 + }, + { + "epoch": 0.06668094014654107, + "grad_norm": 1.8574761152267456, + "learning_rate": 4.9453554048634224e-05, + "loss": 5.2538, + "step": 11212 + }, + { + "epoch": 0.06668688742982205, + "grad_norm": 2.18009352684021, + "learning_rate": 4.945345691716835e-05, + "loss": 5.2486, + "step": 11213 + }, + { + "epoch": 0.06669283471310306, + "grad_norm": 2.167819023132324, + "learning_rate": 4.945335977716603e-05, + "loss": 5.1877, + "step": 11214 + }, + { + "epoch": 0.06669878199638406, + "grad_norm": 2.086603879928589, + "learning_rate": 4.9453262628627297e-05, + "loss": 5.32, + "step": 11215 + }, + { + "epoch": 0.06670472927966505, + "grad_norm": 2.239917039871216, + "learning_rate": 4.945316547155218e-05, + "loss": 5.5289, + "step": 11216 + }, + { + "epoch": 0.06671067656294605, + "grad_norm": 1.9402177333831787, + "learning_rate": 4.945306830594072e-05, + "loss": 5.5159, + "step": 11217 + }, + { + "epoch": 0.06671662384622705, + "grad_norm": 2.2730953693389893, + "learning_rate": 4.945297113179294e-05, + "loss": 5.5132, + "step": 11218 + }, + { + "epoch": 0.06672257112950804, + "grad_norm": 2.4021079540252686, + "learning_rate": 4.945287394910888e-05, + "loss": 5.7505, + "step": 11219 + }, + { + "epoch": 0.06672851841278904, + "grad_norm": 1.8272559642791748, + "learning_rate": 4.945277675788859e-05, + "loss": 5.7324, + "step": 11220 + }, + { + "epoch": 0.06673446569607004, + "grad_norm": 1.641192078590393, + "learning_rate": 4.945267955813206e-05, + "loss": 5.7665, + "step": 11221 + }, + { + "epoch": 0.06674041297935103, + "grad_norm": 2.1081202030181885, + "learning_rate": 4.945258234983938e-05, + "loss": 5.3633, + "step": 11222 + }, + { + "epoch": 0.06674636026263203, + "grad_norm": 1.7172397375106812, + "learning_rate": 4.945248513301054e-05, + "loss": 5.775, + "step": 11223 + }, + { + "epoch": 0.06675230754591303, + "grad_norm": 1.9968703985214233, + "learning_rate": 4.9452387907645594e-05, + "loss": 5.4817, + "step": 11224 + }, + { + "epoch": 0.06675825482919402, + "grad_norm": 1.9165494441986084, + "learning_rate": 4.9452290673744575e-05, + "loss": 5.6977, + "step": 11225 + }, + { + "epoch": 0.06676420211247502, + "grad_norm": 1.832783579826355, + "learning_rate": 4.945219343130751e-05, + "loss": 5.2065, + "step": 11226 + }, + { + "epoch": 0.06677014939575601, + "grad_norm": 2.073590040206909, + "learning_rate": 4.945209618033444e-05, + "loss": 5.0158, + "step": 11227 + }, + { + "epoch": 0.06677609667903701, + "grad_norm": 2.0305895805358887, + "learning_rate": 4.9451998920825395e-05, + "loss": 4.8452, + "step": 11228 + }, + { + "epoch": 0.06678204396231802, + "grad_norm": 1.8843696117401123, + "learning_rate": 4.945190165278041e-05, + "loss": 5.5082, + "step": 11229 + }, + { + "epoch": 0.066787991245599, + "grad_norm": 1.66866934299469, + "learning_rate": 4.945180437619951e-05, + "loss": 5.4151, + "step": 11230 + }, + { + "epoch": 0.06679393852888, + "grad_norm": 1.8018205165863037, + "learning_rate": 4.9451707091082746e-05, + "loss": 5.124, + "step": 11231 + }, + { + "epoch": 0.06679988581216101, + "grad_norm": 1.760339379310608, + "learning_rate": 4.9451609797430146e-05, + "loss": 4.9834, + "step": 11232 + }, + { + "epoch": 0.066805833095442, + "grad_norm": 1.609376072883606, + "learning_rate": 4.945151249524174e-05, + "loss": 5.0217, + "step": 11233 + }, + { + "epoch": 0.066811780378723, + "grad_norm": 1.5468369722366333, + "learning_rate": 4.9451415184517556e-05, + "loss": 5.1881, + "step": 11234 + }, + { + "epoch": 0.066817727662004, + "grad_norm": 1.2027482986450195, + "learning_rate": 4.945131786525764e-05, + "loss": 5.1014, + "step": 11235 + }, + { + "epoch": 0.06682367494528499, + "grad_norm": 1.6050941944122314, + "learning_rate": 4.945122053746203e-05, + "loss": 5.0314, + "step": 11236 + }, + { + "epoch": 0.06682962222856599, + "grad_norm": 1.4980865716934204, + "learning_rate": 4.9451123201130746e-05, + "loss": 4.9371, + "step": 11237 + }, + { + "epoch": 0.06683556951184699, + "grad_norm": 1.6754953861236572, + "learning_rate": 4.9451025856263824e-05, + "loss": 4.9733, + "step": 11238 + }, + { + "epoch": 0.06684151679512798, + "grad_norm": 1.5051567554473877, + "learning_rate": 4.9450928502861303e-05, + "loss": 4.8994, + "step": 11239 + }, + { + "epoch": 0.06684746407840898, + "grad_norm": 1.5211920738220215, + "learning_rate": 4.945083114092321e-05, + "loss": 4.8459, + "step": 11240 + }, + { + "epoch": 0.06685341136168998, + "grad_norm": 1.6717231273651123, + "learning_rate": 4.9450733770449596e-05, + "loss": 5.1029, + "step": 11241 + }, + { + "epoch": 0.06685935864497097, + "grad_norm": 1.4853429794311523, + "learning_rate": 4.945063639144048e-05, + "loss": 5.2199, + "step": 11242 + }, + { + "epoch": 0.06686530592825198, + "grad_norm": 1.6102755069732666, + "learning_rate": 4.9450539003895894e-05, + "loss": 5.1191, + "step": 11243 + }, + { + "epoch": 0.06687125321153298, + "grad_norm": 1.6091139316558838, + "learning_rate": 4.9450441607815876e-05, + "loss": 5.2492, + "step": 11244 + }, + { + "epoch": 0.06687720049481397, + "grad_norm": 1.5190162658691406, + "learning_rate": 4.945034420320047e-05, + "loss": 5.1763, + "step": 11245 + }, + { + "epoch": 0.06688314777809497, + "grad_norm": 1.636243462562561, + "learning_rate": 4.94502467900497e-05, + "loss": 5.4906, + "step": 11246 + }, + { + "epoch": 0.06688909506137597, + "grad_norm": 1.5214428901672363, + "learning_rate": 4.9450149368363594e-05, + "loss": 5.3554, + "step": 11247 + }, + { + "epoch": 0.06689504234465696, + "grad_norm": 1.696183681488037, + "learning_rate": 4.9450051938142205e-05, + "loss": 5.3185, + "step": 11248 + }, + { + "epoch": 0.06690098962793796, + "grad_norm": 1.5344911813735962, + "learning_rate": 4.944995449938555e-05, + "loss": 5.345, + "step": 11249 + }, + { + "epoch": 0.06690693691121896, + "grad_norm": 1.598035454750061, + "learning_rate": 4.944985705209366e-05, + "loss": 5.2271, + "step": 11250 + }, + { + "epoch": 0.06691288419449995, + "grad_norm": 1.501841425895691, + "learning_rate": 4.944975959626659e-05, + "loss": 5.1807, + "step": 11251 + }, + { + "epoch": 0.06691883147778095, + "grad_norm": 1.3818657398223877, + "learning_rate": 4.944966213190436e-05, + "loss": 5.2953, + "step": 11252 + }, + { + "epoch": 0.06692477876106195, + "grad_norm": 1.5480642318725586, + "learning_rate": 4.9449564659007e-05, + "loss": 5.3048, + "step": 11253 + }, + { + "epoch": 0.06693072604434294, + "grad_norm": 1.5553090572357178, + "learning_rate": 4.9449467177574546e-05, + "loss": 5.1365, + "step": 11254 + }, + { + "epoch": 0.06693667332762394, + "grad_norm": 1.581534743309021, + "learning_rate": 4.944936968760705e-05, + "loss": 5.1498, + "step": 11255 + }, + { + "epoch": 0.06694262061090493, + "grad_norm": 1.8294548988342285, + "learning_rate": 4.944927218910452e-05, + "loss": 5.1331, + "step": 11256 + }, + { + "epoch": 0.06694856789418593, + "grad_norm": 1.3404508829116821, + "learning_rate": 4.944917468206701e-05, + "loss": 5.5092, + "step": 11257 + }, + { + "epoch": 0.06695451517746694, + "grad_norm": 1.5146483182907104, + "learning_rate": 4.944907716649454e-05, + "loss": 5.2797, + "step": 11258 + }, + { + "epoch": 0.06696046246074792, + "grad_norm": 1.571393609046936, + "learning_rate": 4.944897964238715e-05, + "loss": 5.4528, + "step": 11259 + }, + { + "epoch": 0.06696640974402893, + "grad_norm": 1.640459656715393, + "learning_rate": 4.944888210974487e-05, + "loss": 5.1032, + "step": 11260 + }, + { + "epoch": 0.06697235702730993, + "grad_norm": 1.5397419929504395, + "learning_rate": 4.944878456856774e-05, + "loss": 5.2333, + "step": 11261 + }, + { + "epoch": 0.06697830431059092, + "grad_norm": 1.4423824548721313, + "learning_rate": 4.94486870188558e-05, + "loss": 5.1765, + "step": 11262 + }, + { + "epoch": 0.06698425159387192, + "grad_norm": 1.366347074508667, + "learning_rate": 4.9448589460609066e-05, + "loss": 5.2257, + "step": 11263 + }, + { + "epoch": 0.06699019887715292, + "grad_norm": 1.370089054107666, + "learning_rate": 4.944849189382759e-05, + "loss": 5.4681, + "step": 11264 + }, + { + "epoch": 0.06699614616043391, + "grad_norm": 1.3014042377471924, + "learning_rate": 4.9448394318511394e-05, + "loss": 5.3434, + "step": 11265 + }, + { + "epoch": 0.06700209344371491, + "grad_norm": 1.4719784259796143, + "learning_rate": 4.9448296734660516e-05, + "loss": 5.3064, + "step": 11266 + }, + { + "epoch": 0.06700804072699591, + "grad_norm": 1.6640921831130981, + "learning_rate": 4.944819914227499e-05, + "loss": 5.2896, + "step": 11267 + }, + { + "epoch": 0.0670139880102769, + "grad_norm": 1.4969593286514282, + "learning_rate": 4.9448101541354845e-05, + "loss": 5.1413, + "step": 11268 + }, + { + "epoch": 0.0670199352935579, + "grad_norm": 1.4021313190460205, + "learning_rate": 4.9448003931900126e-05, + "loss": 5.2609, + "step": 11269 + }, + { + "epoch": 0.0670258825768389, + "grad_norm": 1.6506398916244507, + "learning_rate": 4.9447906313910865e-05, + "loss": 5.3365, + "step": 11270 + }, + { + "epoch": 0.0670318298601199, + "grad_norm": 1.6469614505767822, + "learning_rate": 4.9447808687387084e-05, + "loss": 5.0384, + "step": 11271 + }, + { + "epoch": 0.0670377771434009, + "grad_norm": 1.5047974586486816, + "learning_rate": 4.944771105232883e-05, + "loss": 5.3565, + "step": 11272 + }, + { + "epoch": 0.0670437244266819, + "grad_norm": 1.4467194080352783, + "learning_rate": 4.9447613408736135e-05, + "loss": 5.5576, + "step": 11273 + }, + { + "epoch": 0.06704967170996289, + "grad_norm": 1.4636478424072266, + "learning_rate": 4.9447515756609034e-05, + "loss": 5.6407, + "step": 11274 + }, + { + "epoch": 0.06705561899324389, + "grad_norm": 1.373046875, + "learning_rate": 4.944741809594755e-05, + "loss": 5.4286, + "step": 11275 + }, + { + "epoch": 0.06706156627652489, + "grad_norm": 1.5114089250564575, + "learning_rate": 4.944732042675172e-05, + "loss": 5.6425, + "step": 11276 + }, + { + "epoch": 0.06706751355980588, + "grad_norm": 1.8263514041900635, + "learning_rate": 4.9447222749021596e-05, + "loss": 5.2469, + "step": 11277 + }, + { + "epoch": 0.06707346084308688, + "grad_norm": 1.780553936958313, + "learning_rate": 4.944712506275719e-05, + "loss": 5.3306, + "step": 11278 + }, + { + "epoch": 0.06707940812636788, + "grad_norm": 1.6208360195159912, + "learning_rate": 4.9447027367958556e-05, + "loss": 5.5365, + "step": 11279 + }, + { + "epoch": 0.06708535540964887, + "grad_norm": 1.336965560913086, + "learning_rate": 4.9446929664625705e-05, + "loss": 5.2694, + "step": 11280 + }, + { + "epoch": 0.06709130269292987, + "grad_norm": 1.6100155115127563, + "learning_rate": 4.9446831952758685e-05, + "loss": 5.5489, + "step": 11281 + }, + { + "epoch": 0.06709724997621087, + "grad_norm": 1.8020440340042114, + "learning_rate": 4.944673423235753e-05, + "loss": 5.3396, + "step": 11282 + }, + { + "epoch": 0.06710319725949186, + "grad_norm": 1.5315353870391846, + "learning_rate": 4.9446636503422276e-05, + "loss": 5.3687, + "step": 11283 + }, + { + "epoch": 0.06710914454277286, + "grad_norm": 2.2560019493103027, + "learning_rate": 4.9446538765952953e-05, + "loss": 5.4584, + "step": 11284 + }, + { + "epoch": 0.06711509182605385, + "grad_norm": 1.4653301239013672, + "learning_rate": 4.94464410199496e-05, + "loss": 5.3438, + "step": 11285 + }, + { + "epoch": 0.06712103910933485, + "grad_norm": 1.5931557416915894, + "learning_rate": 4.9446343265412243e-05, + "loss": 5.5802, + "step": 11286 + }, + { + "epoch": 0.06712698639261586, + "grad_norm": 1.5282461643218994, + "learning_rate": 4.944624550234092e-05, + "loss": 5.5634, + "step": 11287 + }, + { + "epoch": 0.06713293367589684, + "grad_norm": 1.7275618314743042, + "learning_rate": 4.944614773073566e-05, + "loss": 5.3797, + "step": 11288 + }, + { + "epoch": 0.06713888095917785, + "grad_norm": 1.6453620195388794, + "learning_rate": 4.944604995059651e-05, + "loss": 5.4693, + "step": 11289 + }, + { + "epoch": 0.06714482824245885, + "grad_norm": 1.870483636856079, + "learning_rate": 4.944595216192349e-05, + "loss": 5.4693, + "step": 11290 + }, + { + "epoch": 0.06715077552573984, + "grad_norm": 1.5478577613830566, + "learning_rate": 4.944585436471665e-05, + "loss": 5.694, + "step": 11291 + }, + { + "epoch": 0.06715672280902084, + "grad_norm": 1.9456945657730103, + "learning_rate": 4.944575655897601e-05, + "loss": 5.6687, + "step": 11292 + }, + { + "epoch": 0.06716267009230184, + "grad_norm": 1.808176875114441, + "learning_rate": 4.944565874470161e-05, + "loss": 5.7444, + "step": 11293 + }, + { + "epoch": 0.06716861737558283, + "grad_norm": 1.8066149950027466, + "learning_rate": 4.944556092189347e-05, + "loss": 5.5264, + "step": 11294 + }, + { + "epoch": 0.06717456465886383, + "grad_norm": 2.2896971702575684, + "learning_rate": 4.9445463090551656e-05, + "loss": 4.7624, + "step": 11295 + }, + { + "epoch": 0.06718051194214483, + "grad_norm": 1.7178759574890137, + "learning_rate": 4.9445365250676165e-05, + "loss": 5.79, + "step": 11296 + }, + { + "epoch": 0.06718645922542582, + "grad_norm": 1.8841933012008667, + "learning_rate": 4.944526740226707e-05, + "loss": 5.9792, + "step": 11297 + }, + { + "epoch": 0.06719240650870682, + "grad_norm": 1.8618090152740479, + "learning_rate": 4.944516954532437e-05, + "loss": 5.957, + "step": 11298 + }, + { + "epoch": 0.06719835379198782, + "grad_norm": 1.7545913457870483, + "learning_rate": 4.944507167984812e-05, + "loss": 5.4484, + "step": 11299 + }, + { + "epoch": 0.06720430107526881, + "grad_norm": 2.023158073425293, + "learning_rate": 4.9444973805838345e-05, + "loss": 5.0873, + "step": 11300 + }, + { + "epoch": 0.06721024835854982, + "grad_norm": 1.893340826034546, + "learning_rate": 4.944487592329509e-05, + "loss": 5.042, + "step": 11301 + }, + { + "epoch": 0.06721619564183082, + "grad_norm": 1.981518268585205, + "learning_rate": 4.944477803221837e-05, + "loss": 5.1463, + "step": 11302 + }, + { + "epoch": 0.0672221429251118, + "grad_norm": 2.47416090965271, + "learning_rate": 4.9444680132608236e-05, + "loss": 5.2885, + "step": 11303 + }, + { + "epoch": 0.06722809020839281, + "grad_norm": 2.3973519802093506, + "learning_rate": 4.944458222446472e-05, + "loss": 5.3321, + "step": 11304 + }, + { + "epoch": 0.06723403749167381, + "grad_norm": 1.9117941856384277, + "learning_rate": 4.9444484307787846e-05, + "loss": 5.2159, + "step": 11305 + }, + { + "epoch": 0.0672399847749548, + "grad_norm": 1.8732513189315796, + "learning_rate": 4.9444386382577656e-05, + "loss": 5.222, + "step": 11306 + }, + { + "epoch": 0.0672459320582358, + "grad_norm": 1.9202747344970703, + "learning_rate": 4.9444288448834184e-05, + "loss": 5.5766, + "step": 11307 + }, + { + "epoch": 0.0672518793415168, + "grad_norm": 1.8956191539764404, + "learning_rate": 4.944419050655747e-05, + "loss": 5.7129, + "step": 11308 + }, + { + "epoch": 0.06725782662479779, + "grad_norm": 2.7075235843658447, + "learning_rate": 4.9444092555747534e-05, + "loss": 5.2199, + "step": 11309 + }, + { + "epoch": 0.06726377390807879, + "grad_norm": 2.396125078201294, + "learning_rate": 4.944399459640442e-05, + "loss": 5.3548, + "step": 11310 + }, + { + "epoch": 0.0672697211913598, + "grad_norm": 2.6050171852111816, + "learning_rate": 4.9443896628528166e-05, + "loss": 5.616, + "step": 11311 + }, + { + "epoch": 0.06727566847464078, + "grad_norm": 2.512720823287964, + "learning_rate": 4.94437986521188e-05, + "loss": 5.3699, + "step": 11312 + }, + { + "epoch": 0.06728161575792178, + "grad_norm": 2.509716510772705, + "learning_rate": 4.9443700667176345e-05, + "loss": 5.431, + "step": 11313 + }, + { + "epoch": 0.06728756304120277, + "grad_norm": 2.2237601280212402, + "learning_rate": 4.944360267370085e-05, + "loss": 5.3985, + "step": 11314 + }, + { + "epoch": 0.06729351032448377, + "grad_norm": 1.982344627380371, + "learning_rate": 4.9443504671692356e-05, + "loss": 5.4849, + "step": 11315 + }, + { + "epoch": 0.06729945760776478, + "grad_norm": 2.1006124019622803, + "learning_rate": 4.9443406661150874e-05, + "loss": 5.227, + "step": 11316 + }, + { + "epoch": 0.06730540489104576, + "grad_norm": 2.0929529666900635, + "learning_rate": 4.9443308642076456e-05, + "loss": 5.524, + "step": 11317 + }, + { + "epoch": 0.06731135217432677, + "grad_norm": 1.9268262386322021, + "learning_rate": 4.944321061446914e-05, + "loss": 6.0622, + "step": 11318 + }, + { + "epoch": 0.06731729945760777, + "grad_norm": 2.257065773010254, + "learning_rate": 4.944311257832894e-05, + "loss": 4.9455, + "step": 11319 + }, + { + "epoch": 0.06732324674088876, + "grad_norm": 2.056244373321533, + "learning_rate": 4.944301453365591e-05, + "loss": 5.4157, + "step": 11320 + }, + { + "epoch": 0.06732919402416976, + "grad_norm": 2.1667540073394775, + "learning_rate": 4.944291648045007e-05, + "loss": 5.5767, + "step": 11321 + }, + { + "epoch": 0.06733514130745076, + "grad_norm": 1.9596853256225586, + "learning_rate": 4.944281841871146e-05, + "loss": 5.6532, + "step": 11322 + }, + { + "epoch": 0.06734108859073175, + "grad_norm": 1.7050867080688477, + "learning_rate": 4.9442720348440116e-05, + "loss": 5.8881, + "step": 11323 + }, + { + "epoch": 0.06734703587401275, + "grad_norm": 1.8681753873825073, + "learning_rate": 4.944262226963607e-05, + "loss": 5.9369, + "step": 11324 + }, + { + "epoch": 0.06735298315729375, + "grad_norm": 1.9432111978530884, + "learning_rate": 4.9442524182299365e-05, + "loss": 5.9163, + "step": 11325 + }, + { + "epoch": 0.06735893044057474, + "grad_norm": 1.8099175691604614, + "learning_rate": 4.9442426086430026e-05, + "loss": 5.809, + "step": 11326 + }, + { + "epoch": 0.06736487772385574, + "grad_norm": 1.6179800033569336, + "learning_rate": 4.944232798202808e-05, + "loss": 5.5609, + "step": 11327 + }, + { + "epoch": 0.06737082500713674, + "grad_norm": 2.303189992904663, + "learning_rate": 4.944222986909357e-05, + "loss": 5.9291, + "step": 11328 + }, + { + "epoch": 0.06737677229041773, + "grad_norm": 1.913813829421997, + "learning_rate": 4.944213174762654e-05, + "loss": 5.8672, + "step": 11329 + }, + { + "epoch": 0.06738271957369873, + "grad_norm": 2.1856813430786133, + "learning_rate": 4.944203361762701e-05, + "loss": 5.2632, + "step": 11330 + }, + { + "epoch": 0.06738866685697974, + "grad_norm": 2.019679069519043, + "learning_rate": 4.9441935479095016e-05, + "loss": 5.3707, + "step": 11331 + }, + { + "epoch": 0.06739461414026073, + "grad_norm": 1.8531097173690796, + "learning_rate": 4.944183733203059e-05, + "loss": 5.6689, + "step": 11332 + }, + { + "epoch": 0.06740056142354173, + "grad_norm": 2.068208694458008, + "learning_rate": 4.944173917643378e-05, + "loss": 5.6111, + "step": 11333 + }, + { + "epoch": 0.06740650870682273, + "grad_norm": 1.8021270036697388, + "learning_rate": 4.944164101230461e-05, + "loss": 6.0865, + "step": 11334 + }, + { + "epoch": 0.06741245599010372, + "grad_norm": 1.9051427841186523, + "learning_rate": 4.944154283964312e-05, + "loss": 5.5862, + "step": 11335 + }, + { + "epoch": 0.06741840327338472, + "grad_norm": 1.718483805656433, + "learning_rate": 4.944144465844933e-05, + "loss": 5.2505, + "step": 11336 + }, + { + "epoch": 0.06742435055666572, + "grad_norm": 2.205167531967163, + "learning_rate": 4.944134646872329e-05, + "loss": 5.3181, + "step": 11337 + }, + { + "epoch": 0.06743029783994671, + "grad_norm": 1.550945520401001, + "learning_rate": 4.944124827046502e-05, + "loss": 5.4129, + "step": 11338 + }, + { + "epoch": 0.06743624512322771, + "grad_norm": 2.08793044090271, + "learning_rate": 4.944115006367458e-05, + "loss": 5.9705, + "step": 11339 + }, + { + "epoch": 0.06744219240650871, + "grad_norm": 1.8955761194229126, + "learning_rate": 4.944105184835197e-05, + "loss": 4.9629, + "step": 11340 + }, + { + "epoch": 0.0674481396897897, + "grad_norm": 1.7287909984588623, + "learning_rate": 4.944095362449724e-05, + "loss": 5.1097, + "step": 11341 + }, + { + "epoch": 0.0674540869730707, + "grad_norm": 1.8718771934509277, + "learning_rate": 4.944085539211044e-05, + "loss": 5.6443, + "step": 11342 + }, + { + "epoch": 0.06746003425635169, + "grad_norm": 2.220863103866577, + "learning_rate": 4.9440757151191585e-05, + "loss": 5.5042, + "step": 11343 + }, + { + "epoch": 0.0674659815396327, + "grad_norm": 1.9501415491104126, + "learning_rate": 4.944065890174071e-05, + "loss": 5.6788, + "step": 11344 + }, + { + "epoch": 0.0674719288229137, + "grad_norm": 1.8566590547561646, + "learning_rate": 4.944056064375786e-05, + "loss": 5.6531, + "step": 11345 + }, + { + "epoch": 0.06747787610619468, + "grad_norm": 1.895409345626831, + "learning_rate": 4.9440462377243055e-05, + "loss": 5.6441, + "step": 11346 + }, + { + "epoch": 0.06748382338947569, + "grad_norm": 2.1746973991394043, + "learning_rate": 4.9440364102196345e-05, + "loss": 5.8624, + "step": 11347 + }, + { + "epoch": 0.06748977067275669, + "grad_norm": 1.9661751985549927, + "learning_rate": 4.944026581861775e-05, + "loss": 5.6075, + "step": 11348 + }, + { + "epoch": 0.06749571795603768, + "grad_norm": 1.8591458797454834, + "learning_rate": 4.944016752650731e-05, + "loss": 5.9115, + "step": 11349 + }, + { + "epoch": 0.06750166523931868, + "grad_norm": 1.6491025686264038, + "learning_rate": 4.9440069225865065e-05, + "loss": 6.0548, + "step": 11350 + }, + { + "epoch": 0.06750761252259968, + "grad_norm": 1.857928991317749, + "learning_rate": 4.9439970916691045e-05, + "loss": 5.4326, + "step": 11351 + }, + { + "epoch": 0.06751355980588067, + "grad_norm": 1.8189151287078857, + "learning_rate": 4.943987259898528e-05, + "loss": 5.7744, + "step": 11352 + }, + { + "epoch": 0.06751950708916167, + "grad_norm": 1.7486300468444824, + "learning_rate": 4.943977427274781e-05, + "loss": 5.7128, + "step": 11353 + }, + { + "epoch": 0.06752545437244267, + "grad_norm": 1.7272138595581055, + "learning_rate": 4.943967593797866e-05, + "loss": 5.9922, + "step": 11354 + }, + { + "epoch": 0.06753140165572366, + "grad_norm": 1.740860939025879, + "learning_rate": 4.9439577594677875e-05, + "loss": 5.8486, + "step": 11355 + }, + { + "epoch": 0.06753734893900466, + "grad_norm": 1.9054155349731445, + "learning_rate": 4.9439479242845494e-05, + "loss": 5.4694, + "step": 11356 + }, + { + "epoch": 0.06754329622228566, + "grad_norm": 1.9783501625061035, + "learning_rate": 4.943938088248154e-05, + "loss": 5.5185, + "step": 11357 + }, + { + "epoch": 0.06754924350556665, + "grad_norm": 1.8267238140106201, + "learning_rate": 4.943928251358605e-05, + "loss": 5.7589, + "step": 11358 + }, + { + "epoch": 0.06755519078884765, + "grad_norm": 1.6957738399505615, + "learning_rate": 4.943918413615906e-05, + "loss": 5.5716, + "step": 11359 + }, + { + "epoch": 0.06756113807212866, + "grad_norm": 2.0818982124328613, + "learning_rate": 4.94390857502006e-05, + "loss": 5.8969, + "step": 11360 + }, + { + "epoch": 0.06756708535540965, + "grad_norm": 1.8012073040008545, + "learning_rate": 4.9438987355710703e-05, + "loss": 6.1053, + "step": 11361 + }, + { + "epoch": 0.06757303263869065, + "grad_norm": 2.2209696769714355, + "learning_rate": 4.943888895268942e-05, + "loss": 5.9714, + "step": 11362 + }, + { + "epoch": 0.06757897992197165, + "grad_norm": 1.8006336688995361, + "learning_rate": 4.943879054113676e-05, + "loss": 5.6427, + "step": 11363 + }, + { + "epoch": 0.06758492720525264, + "grad_norm": 1.7628017663955688, + "learning_rate": 4.9438692121052775e-05, + "loss": 5.8639, + "step": 11364 + }, + { + "epoch": 0.06759087448853364, + "grad_norm": 1.8574492931365967, + "learning_rate": 4.94385936924375e-05, + "loss": 5.892, + "step": 11365 + }, + { + "epoch": 0.06759682177181464, + "grad_norm": 1.7926831245422363, + "learning_rate": 4.9438495255290964e-05, + "loss": 5.9024, + "step": 11366 + }, + { + "epoch": 0.06760276905509563, + "grad_norm": 2.503370761871338, + "learning_rate": 4.94383968096132e-05, + "loss": 5.994, + "step": 11367 + }, + { + "epoch": 0.06760871633837663, + "grad_norm": 1.7123390436172485, + "learning_rate": 4.943829835540424e-05, + "loss": 5.8052, + "step": 11368 + }, + { + "epoch": 0.06761466362165763, + "grad_norm": 2.0890092849731445, + "learning_rate": 4.943819989266413e-05, + "loss": 5.067, + "step": 11369 + }, + { + "epoch": 0.06762061090493862, + "grad_norm": 1.8000640869140625, + "learning_rate": 4.9438101421392894e-05, + "loss": 5.3562, + "step": 11370 + }, + { + "epoch": 0.06762655818821962, + "grad_norm": 2.254873514175415, + "learning_rate": 4.9438002941590564e-05, + "loss": 5.0557, + "step": 11371 + }, + { + "epoch": 0.06763250547150061, + "grad_norm": 1.8080449104309082, + "learning_rate": 4.943790445325719e-05, + "loss": 5.6702, + "step": 11372 + }, + { + "epoch": 0.06763845275478161, + "grad_norm": 2.0175933837890625, + "learning_rate": 4.943780595639279e-05, + "loss": 5.6227, + "step": 11373 + }, + { + "epoch": 0.06764440003806262, + "grad_norm": 1.9859650135040283, + "learning_rate": 4.943770745099741e-05, + "loss": 5.4437, + "step": 11374 + }, + { + "epoch": 0.0676503473213436, + "grad_norm": 1.975573182106018, + "learning_rate": 4.943760893707107e-05, + "loss": 5.3101, + "step": 11375 + }, + { + "epoch": 0.0676562946046246, + "grad_norm": 2.2590208053588867, + "learning_rate": 4.943751041461382e-05, + "loss": 5.2544, + "step": 11376 + }, + { + "epoch": 0.06766224188790561, + "grad_norm": 1.8615392446517944, + "learning_rate": 4.943741188362568e-05, + "loss": 5.5266, + "step": 11377 + }, + { + "epoch": 0.0676681891711866, + "grad_norm": 2.056810140609741, + "learning_rate": 4.943731334410669e-05, + "loss": 5.1994, + "step": 11378 + }, + { + "epoch": 0.0676741364544676, + "grad_norm": 2.0275685787200928, + "learning_rate": 4.94372147960569e-05, + "loss": 5.7385, + "step": 11379 + }, + { + "epoch": 0.0676800837377486, + "grad_norm": 2.082963466644287, + "learning_rate": 4.9437116239476325e-05, + "loss": 5.1531, + "step": 11380 + }, + { + "epoch": 0.06768603102102959, + "grad_norm": 2.176421642303467, + "learning_rate": 4.9437017674365004e-05, + "loss": 5.521, + "step": 11381 + }, + { + "epoch": 0.06769197830431059, + "grad_norm": 2.1424365043640137, + "learning_rate": 4.9436919100722964e-05, + "loss": 5.4543, + "step": 11382 + }, + { + "epoch": 0.06769792558759159, + "grad_norm": 2.07836651802063, + "learning_rate": 4.9436820518550266e-05, + "loss": 5.5166, + "step": 11383 + }, + { + "epoch": 0.06770387287087258, + "grad_norm": 1.9776746034622192, + "learning_rate": 4.9436721927846915e-05, + "loss": 5.4621, + "step": 11384 + }, + { + "epoch": 0.06770982015415358, + "grad_norm": 1.9985042810440063, + "learning_rate": 4.943662332861296e-05, + "loss": 5.3835, + "step": 11385 + }, + { + "epoch": 0.06771576743743458, + "grad_norm": 1.6877795457839966, + "learning_rate": 4.943652472084843e-05, + "loss": 5.185, + "step": 11386 + }, + { + "epoch": 0.06772171472071557, + "grad_norm": 1.8307565450668335, + "learning_rate": 4.943642610455336e-05, + "loss": 5.117, + "step": 11387 + }, + { + "epoch": 0.06772766200399657, + "grad_norm": 2.0381922721862793, + "learning_rate": 4.943632747972779e-05, + "loss": 5.6004, + "step": 11388 + }, + { + "epoch": 0.06773360928727758, + "grad_norm": 1.9554756879806519, + "learning_rate": 4.943622884637175e-05, + "loss": 5.9638, + "step": 11389 + }, + { + "epoch": 0.06773955657055857, + "grad_norm": 1.878861665725708, + "learning_rate": 4.9436130204485274e-05, + "loss": 5.7961, + "step": 11390 + }, + { + "epoch": 0.06774550385383957, + "grad_norm": 2.040012836456299, + "learning_rate": 4.94360315540684e-05, + "loss": 5.7175, + "step": 11391 + }, + { + "epoch": 0.06775145113712057, + "grad_norm": 2.262408494949341, + "learning_rate": 4.943593289512115e-05, + "loss": 4.8581, + "step": 11392 + }, + { + "epoch": 0.06775739842040156, + "grad_norm": 2.201751232147217, + "learning_rate": 4.943583422764358e-05, + "loss": 5.0647, + "step": 11393 + }, + { + "epoch": 0.06776334570368256, + "grad_norm": 1.9768764972686768, + "learning_rate": 4.943573555163571e-05, + "loss": 5.8836, + "step": 11394 + }, + { + "epoch": 0.06776929298696356, + "grad_norm": 2.1048574447631836, + "learning_rate": 4.9435636867097575e-05, + "loss": 5.9746, + "step": 11395 + }, + { + "epoch": 0.06777524027024455, + "grad_norm": 1.5297552347183228, + "learning_rate": 4.943553817402921e-05, + "loss": 4.912, + "step": 11396 + }, + { + "epoch": 0.06778118755352555, + "grad_norm": 1.5313429832458496, + "learning_rate": 4.943543947243066e-05, + "loss": 4.975, + "step": 11397 + }, + { + "epoch": 0.06778713483680655, + "grad_norm": 1.8882219791412354, + "learning_rate": 4.943534076230194e-05, + "loss": 5.2183, + "step": 11398 + }, + { + "epoch": 0.06779308212008754, + "grad_norm": 1.698997139930725, + "learning_rate": 4.9435242043643094e-05, + "loss": 5.8019, + "step": 11399 + }, + { + "epoch": 0.06779902940336854, + "grad_norm": 1.775140404701233, + "learning_rate": 4.943514331645417e-05, + "loss": 5.7451, + "step": 11400 + }, + { + "epoch": 0.06780497668664953, + "grad_norm": 2.273650884628296, + "learning_rate": 4.943504458073518e-05, + "loss": 4.7727, + "step": 11401 + }, + { + "epoch": 0.06781092396993053, + "grad_norm": 2.166961908340454, + "learning_rate": 4.943494583648617e-05, + "loss": 5.4537, + "step": 11402 + }, + { + "epoch": 0.06781687125321154, + "grad_norm": 2.147876024246216, + "learning_rate": 4.943484708370717e-05, + "loss": 5.2635, + "step": 11403 + }, + { + "epoch": 0.06782281853649252, + "grad_norm": 1.968397855758667, + "learning_rate": 4.943474832239822e-05, + "loss": 5.6591, + "step": 11404 + }, + { + "epoch": 0.06782876581977353, + "grad_norm": 1.8838316202163696, + "learning_rate": 4.943464955255935e-05, + "loss": 5.5462, + "step": 11405 + }, + { + "epoch": 0.06783471310305453, + "grad_norm": 2.4205315113067627, + "learning_rate": 4.94345507741906e-05, + "loss": 4.859, + "step": 11406 + }, + { + "epoch": 0.06784066038633552, + "grad_norm": 2.1272950172424316, + "learning_rate": 4.9434451987292e-05, + "loss": 5.1791, + "step": 11407 + }, + { + "epoch": 0.06784660766961652, + "grad_norm": 2.345055341720581, + "learning_rate": 4.9434353191863595e-05, + "loss": 5.1616, + "step": 11408 + }, + { + "epoch": 0.06785255495289752, + "grad_norm": 2.3967537879943848, + "learning_rate": 4.9434254387905395e-05, + "loss": 5.1805, + "step": 11409 + }, + { + "epoch": 0.06785850223617851, + "grad_norm": 2.2108283042907715, + "learning_rate": 4.943415557541745e-05, + "loss": 5.381, + "step": 11410 + }, + { + "epoch": 0.06786444951945951, + "grad_norm": 2.178776979446411, + "learning_rate": 4.94340567543998e-05, + "loss": 5.4016, + "step": 11411 + }, + { + "epoch": 0.06787039680274051, + "grad_norm": 2.003169059753418, + "learning_rate": 4.943395792485247e-05, + "loss": 5.5632, + "step": 11412 + }, + { + "epoch": 0.0678763440860215, + "grad_norm": 2.0337789058685303, + "learning_rate": 4.9433859086775506e-05, + "loss": 5.4476, + "step": 11413 + }, + { + "epoch": 0.0678822913693025, + "grad_norm": 1.784868836402893, + "learning_rate": 4.943376024016892e-05, + "loss": 5.3578, + "step": 11414 + }, + { + "epoch": 0.0678882386525835, + "grad_norm": 1.7282286882400513, + "learning_rate": 4.943366138503277e-05, + "loss": 5.6202, + "step": 11415 + }, + { + "epoch": 0.06789418593586449, + "grad_norm": 1.9716618061065674, + "learning_rate": 4.943356252136707e-05, + "loss": 4.9861, + "step": 11416 + }, + { + "epoch": 0.0679001332191455, + "grad_norm": 2.399317502975464, + "learning_rate": 4.943346364917188e-05, + "loss": 4.4494, + "step": 11417 + }, + { + "epoch": 0.0679060805024265, + "grad_norm": 2.142995834350586, + "learning_rate": 4.943336476844722e-05, + "loss": 4.5989, + "step": 11418 + }, + { + "epoch": 0.06791202778570748, + "grad_norm": 1.9394404888153076, + "learning_rate": 4.943326587919311e-05, + "loss": 4.4944, + "step": 11419 + }, + { + "epoch": 0.06791797506898849, + "grad_norm": 2.41937518119812, + "learning_rate": 4.9433166981409615e-05, + "loss": 5.1687, + "step": 11420 + }, + { + "epoch": 0.06792392235226949, + "grad_norm": 2.1686136722564697, + "learning_rate": 4.943306807509675e-05, + "loss": 6.2976, + "step": 11421 + }, + { + "epoch": 0.06792986963555048, + "grad_norm": 1.9649391174316406, + "learning_rate": 4.943296916025455e-05, + "loss": 6.0242, + "step": 11422 + }, + { + "epoch": 0.06793581691883148, + "grad_norm": 1.9251484870910645, + "learning_rate": 4.943287023688305e-05, + "loss": 5.9777, + "step": 11423 + }, + { + "epoch": 0.06794176420211248, + "grad_norm": 1.838348388671875, + "learning_rate": 4.9432771304982296e-05, + "loss": 5.8669, + "step": 11424 + }, + { + "epoch": 0.06794771148539347, + "grad_norm": 2.5417487621307373, + "learning_rate": 4.94326723645523e-05, + "loss": 5.5131, + "step": 11425 + }, + { + "epoch": 0.06795365876867447, + "grad_norm": 2.2175936698913574, + "learning_rate": 4.943257341559312e-05, + "loss": 5.4657, + "step": 11426 + }, + { + "epoch": 0.06795960605195547, + "grad_norm": 2.4474873542785645, + "learning_rate": 4.943247445810478e-05, + "loss": 5.2401, + "step": 11427 + }, + { + "epoch": 0.06796555333523646, + "grad_norm": 2.176483392715454, + "learning_rate": 4.9432375492087324e-05, + "loss": 5.7295, + "step": 11428 + }, + { + "epoch": 0.06797150061851746, + "grad_norm": 1.9311527013778687, + "learning_rate": 4.943227651754077e-05, + "loss": 5.8135, + "step": 11429 + }, + { + "epoch": 0.06797744790179845, + "grad_norm": 2.2462544441223145, + "learning_rate": 4.943217753446516e-05, + "loss": 6.0761, + "step": 11430 + }, + { + "epoch": 0.06798339518507945, + "grad_norm": 2.3158276081085205, + "learning_rate": 4.943207854286053e-05, + "loss": 6.0223, + "step": 11431 + }, + { + "epoch": 0.06798934246836046, + "grad_norm": 1.6222623586654663, + "learning_rate": 4.9431979542726914e-05, + "loss": 5.9417, + "step": 11432 + }, + { + "epoch": 0.06799528975164144, + "grad_norm": 1.9809083938598633, + "learning_rate": 4.9431880534064345e-05, + "loss": 5.7476, + "step": 11433 + }, + { + "epoch": 0.06800123703492245, + "grad_norm": 1.9575468301773071, + "learning_rate": 4.9431781516872865e-05, + "loss": 5.6169, + "step": 11434 + }, + { + "epoch": 0.06800718431820345, + "grad_norm": 2.1103882789611816, + "learning_rate": 4.9431682491152495e-05, + "loss": 5.5119, + "step": 11435 + }, + { + "epoch": 0.06801313160148444, + "grad_norm": 2.280287265777588, + "learning_rate": 4.943158345690328e-05, + "loss": 5.2622, + "step": 11436 + }, + { + "epoch": 0.06801907888476544, + "grad_norm": 2.582737684249878, + "learning_rate": 4.943148441412525e-05, + "loss": 5.2644, + "step": 11437 + }, + { + "epoch": 0.06802502616804644, + "grad_norm": 2.1919124126434326, + "learning_rate": 4.9431385362818446e-05, + "loss": 5.0717, + "step": 11438 + }, + { + "epoch": 0.06803097345132743, + "grad_norm": 2.3036141395568848, + "learning_rate": 4.9431286302982896e-05, + "loss": 5.0049, + "step": 11439 + }, + { + "epoch": 0.06803692073460843, + "grad_norm": 2.3675789833068848, + "learning_rate": 4.943118723461864e-05, + "loss": 5.4686, + "step": 11440 + }, + { + "epoch": 0.06804286801788943, + "grad_norm": 2.8305327892303467, + "learning_rate": 4.94310881577257e-05, + "loss": 5.3409, + "step": 11441 + }, + { + "epoch": 0.06804881530117042, + "grad_norm": 1.562173843383789, + "learning_rate": 4.9430989072304126e-05, + "loss": 5.6801, + "step": 11442 + }, + { + "epoch": 0.06805476258445142, + "grad_norm": 1.9728971719741821, + "learning_rate": 4.9430889978353945e-05, + "loss": 5.4252, + "step": 11443 + }, + { + "epoch": 0.06806070986773242, + "grad_norm": 2.054025173187256, + "learning_rate": 4.9430790875875185e-05, + "loss": 5.1155, + "step": 11444 + }, + { + "epoch": 0.06806665715101341, + "grad_norm": 1.8511056900024414, + "learning_rate": 4.9430691764867895e-05, + "loss": 5.102, + "step": 11445 + }, + { + "epoch": 0.06807260443429441, + "grad_norm": 1.9024226665496826, + "learning_rate": 4.943059264533211e-05, + "loss": 5.0761, + "step": 11446 + }, + { + "epoch": 0.06807855171757542, + "grad_norm": 2.4767966270446777, + "learning_rate": 4.9430493517267843e-05, + "loss": 4.9809, + "step": 11447 + }, + { + "epoch": 0.0680844990008564, + "grad_norm": 2.393517255783081, + "learning_rate": 4.943039438067515e-05, + "loss": 5.1191, + "step": 11448 + }, + { + "epoch": 0.06809044628413741, + "grad_norm": 1.9510548114776611, + "learning_rate": 4.9430295235554055e-05, + "loss": 5.7117, + "step": 11449 + }, + { + "epoch": 0.06809639356741841, + "grad_norm": 2.1002418994903564, + "learning_rate": 4.9430196081904605e-05, + "loss": 5.7003, + "step": 11450 + }, + { + "epoch": 0.0681023408506994, + "grad_norm": 2.5328590869903564, + "learning_rate": 4.943009691972682e-05, + "loss": 6.1835, + "step": 11451 + }, + { + "epoch": 0.0681082881339804, + "grad_norm": 1.9173791408538818, + "learning_rate": 4.9429997749020743e-05, + "loss": 5.9596, + "step": 11452 + }, + { + "epoch": 0.0681142354172614, + "grad_norm": 2.0781052112579346, + "learning_rate": 4.9429898569786406e-05, + "loss": 5.7335, + "step": 11453 + }, + { + "epoch": 0.06812018270054239, + "grad_norm": 2.4210550785064697, + "learning_rate": 4.942979938202384e-05, + "loss": 4.9888, + "step": 11454 + }, + { + "epoch": 0.06812612998382339, + "grad_norm": 1.8438634872436523, + "learning_rate": 4.942970018573309e-05, + "loss": 5.8027, + "step": 11455 + }, + { + "epoch": 0.0681320772671044, + "grad_norm": 2.122882843017578, + "learning_rate": 4.942960098091418e-05, + "loss": 5.8569, + "step": 11456 + }, + { + "epoch": 0.06813802455038538, + "grad_norm": 1.6002168655395508, + "learning_rate": 4.942950176756715e-05, + "loss": 5.7362, + "step": 11457 + }, + { + "epoch": 0.06814397183366638, + "grad_norm": 1.8086539506912231, + "learning_rate": 4.942940254569203e-05, + "loss": 5.7537, + "step": 11458 + }, + { + "epoch": 0.06814991911694737, + "grad_norm": 2.0441513061523438, + "learning_rate": 4.942930331528886e-05, + "loss": 5.8255, + "step": 11459 + }, + { + "epoch": 0.06815586640022837, + "grad_norm": 1.8272675275802612, + "learning_rate": 4.942920407635767e-05, + "loss": 5.6915, + "step": 11460 + }, + { + "epoch": 0.06816181368350938, + "grad_norm": 3.3902077674865723, + "learning_rate": 4.94291048288985e-05, + "loss": 4.719, + "step": 11461 + }, + { + "epoch": 0.06816776096679036, + "grad_norm": 3.1770875453948975, + "learning_rate": 4.9429005572911385e-05, + "loss": 4.401, + "step": 11462 + }, + { + "epoch": 0.06817370825007137, + "grad_norm": 1.9011846780776978, + "learning_rate": 4.9428906308396355e-05, + "loss": 5.4768, + "step": 11463 + }, + { + "epoch": 0.06817965553335237, + "grad_norm": 1.7608321905136108, + "learning_rate": 4.9428807035353443e-05, + "loss": 5.5755, + "step": 11464 + }, + { + "epoch": 0.06818560281663336, + "grad_norm": 1.8250397443771362, + "learning_rate": 4.9428707753782686e-05, + "loss": 5.7804, + "step": 11465 + }, + { + "epoch": 0.06819155009991436, + "grad_norm": 2.566436290740967, + "learning_rate": 4.942860846368412e-05, + "loss": 5.0442, + "step": 11466 + }, + { + "epoch": 0.06819749738319536, + "grad_norm": 3.336547613143921, + "learning_rate": 4.942850916505779e-05, + "loss": 4.5331, + "step": 11467 + }, + { + "epoch": 0.06820344466647635, + "grad_norm": 2.6383185386657715, + "learning_rate": 4.9428409857903714e-05, + "loss": 4.5301, + "step": 11468 + }, + { + "epoch": 0.06820939194975735, + "grad_norm": 2.3853955268859863, + "learning_rate": 4.9428310542221924e-05, + "loss": 4.3398, + "step": 11469 + }, + { + "epoch": 0.06821533923303835, + "grad_norm": 2.3954038619995117, + "learning_rate": 4.942821121801246e-05, + "loss": 5.0841, + "step": 11470 + }, + { + "epoch": 0.06822128651631934, + "grad_norm": 2.922161340713501, + "learning_rate": 4.942811188527537e-05, + "loss": 4.5573, + "step": 11471 + }, + { + "epoch": 0.06822723379960034, + "grad_norm": 2.7202560901641846, + "learning_rate": 4.942801254401068e-05, + "loss": 4.5047, + "step": 11472 + }, + { + "epoch": 0.06823318108288134, + "grad_norm": 2.2289440631866455, + "learning_rate": 4.9427913194218424e-05, + "loss": 5.4686, + "step": 11473 + }, + { + "epoch": 0.06823912836616233, + "grad_norm": 2.2033851146698, + "learning_rate": 4.9427813835898635e-05, + "loss": 5.3554, + "step": 11474 + }, + { + "epoch": 0.06824507564944333, + "grad_norm": 2.171147346496582, + "learning_rate": 4.9427714469051345e-05, + "loss": 5.504, + "step": 11475 + }, + { + "epoch": 0.06825102293272434, + "grad_norm": 2.0110602378845215, + "learning_rate": 4.9427615093676594e-05, + "loss": 5.6126, + "step": 11476 + }, + { + "epoch": 0.06825697021600532, + "grad_norm": 2.08642840385437, + "learning_rate": 4.942751570977441e-05, + "loss": 6.0948, + "step": 11477 + }, + { + "epoch": 0.06826291749928633, + "grad_norm": 2.12245774269104, + "learning_rate": 4.9427416317344835e-05, + "loss": 5.2845, + "step": 11478 + }, + { + "epoch": 0.06826886478256733, + "grad_norm": 1.9155166149139404, + "learning_rate": 4.942731691638791e-05, + "loss": 5.4674, + "step": 11479 + }, + { + "epoch": 0.06827481206584832, + "grad_norm": 2.3452367782592773, + "learning_rate": 4.942721750690365e-05, + "loss": 5.2368, + "step": 11480 + }, + { + "epoch": 0.06828075934912932, + "grad_norm": 2.1282498836517334, + "learning_rate": 4.9427118088892105e-05, + "loss": 5.348, + "step": 11481 + }, + { + "epoch": 0.06828670663241032, + "grad_norm": 1.9251933097839355, + "learning_rate": 4.9427018662353306e-05, + "loss": 5.2588, + "step": 11482 + }, + { + "epoch": 0.06829265391569131, + "grad_norm": 1.9481078386306763, + "learning_rate": 4.942691922728728e-05, + "loss": 5.2775, + "step": 11483 + }, + { + "epoch": 0.06829860119897231, + "grad_norm": 1.9506112337112427, + "learning_rate": 4.942681978369408e-05, + "loss": 5.6865, + "step": 11484 + }, + { + "epoch": 0.06830454848225331, + "grad_norm": 2.0636112689971924, + "learning_rate": 4.942672033157373e-05, + "loss": 6.218, + "step": 11485 + }, + { + "epoch": 0.0683104957655343, + "grad_norm": 1.8479397296905518, + "learning_rate": 4.9426620870926256e-05, + "loss": 6.1283, + "step": 11486 + }, + { + "epoch": 0.0683164430488153, + "grad_norm": 1.9079830646514893, + "learning_rate": 4.94265214017517e-05, + "loss": 6.127, + "step": 11487 + }, + { + "epoch": 0.06832239033209629, + "grad_norm": 2.1076481342315674, + "learning_rate": 4.9426421924050105e-05, + "loss": 5.9978, + "step": 11488 + }, + { + "epoch": 0.0683283376153773, + "grad_norm": 1.885231375694275, + "learning_rate": 4.942632243782149e-05, + "loss": 5.8269, + "step": 11489 + }, + { + "epoch": 0.0683342848986583, + "grad_norm": 1.968980073928833, + "learning_rate": 4.942622294306591e-05, + "loss": 5.899, + "step": 11490 + }, + { + "epoch": 0.06834023218193928, + "grad_norm": 1.9857345819473267, + "learning_rate": 4.9426123439783376e-05, + "loss": 5.9416, + "step": 11491 + }, + { + "epoch": 0.06834617946522029, + "grad_norm": 1.8433799743652344, + "learning_rate": 4.942602392797394e-05, + "loss": 6.0714, + "step": 11492 + }, + { + "epoch": 0.06835212674850129, + "grad_norm": 1.9299565553665161, + "learning_rate": 4.942592440763764e-05, + "loss": 6.14, + "step": 11493 + }, + { + "epoch": 0.06835807403178228, + "grad_norm": 1.5700571537017822, + "learning_rate": 4.9425824878774486e-05, + "loss": 6.0496, + "step": 11494 + }, + { + "epoch": 0.06836402131506328, + "grad_norm": 1.6914032697677612, + "learning_rate": 4.942572534138454e-05, + "loss": 5.8301, + "step": 11495 + }, + { + "epoch": 0.06836996859834428, + "grad_norm": 1.6765984296798706, + "learning_rate": 4.942562579546782e-05, + "loss": 6.0701, + "step": 11496 + }, + { + "epoch": 0.06837591588162527, + "grad_norm": 1.715425729751587, + "learning_rate": 4.9425526241024364e-05, + "loss": 5.9499, + "step": 11497 + }, + { + "epoch": 0.06838186316490627, + "grad_norm": 1.8849130868911743, + "learning_rate": 4.942542667805422e-05, + "loss": 5.7088, + "step": 11498 + }, + { + "epoch": 0.06838781044818727, + "grad_norm": 2.1290276050567627, + "learning_rate": 4.9425327106557405e-05, + "loss": 5.9329, + "step": 11499 + }, + { + "epoch": 0.06839375773146826, + "grad_norm": 1.9105192422866821, + "learning_rate": 4.942522752653396e-05, + "loss": 5.9068, + "step": 11500 + }, + { + "epoch": 0.06839970501474926, + "grad_norm": 1.9120036363601685, + "learning_rate": 4.9425127937983926e-05, + "loss": 5.8411, + "step": 11501 + }, + { + "epoch": 0.06840565229803026, + "grad_norm": 2.1045427322387695, + "learning_rate": 4.942502834090732e-05, + "loss": 6.1575, + "step": 11502 + }, + { + "epoch": 0.06841159958131125, + "grad_norm": 1.8271901607513428, + "learning_rate": 4.94249287353042e-05, + "loss": 6.0732, + "step": 11503 + }, + { + "epoch": 0.06841754686459225, + "grad_norm": 1.4770866632461548, + "learning_rate": 4.942482912117459e-05, + "loss": 6.0823, + "step": 11504 + }, + { + "epoch": 0.06842349414787326, + "grad_norm": 1.7055792808532715, + "learning_rate": 4.942472949851852e-05, + "loss": 6.0738, + "step": 11505 + }, + { + "epoch": 0.06842944143115424, + "grad_norm": 1.588705062866211, + "learning_rate": 4.942462986733602e-05, + "loss": 5.9731, + "step": 11506 + }, + { + "epoch": 0.06843538871443525, + "grad_norm": 2.662527561187744, + "learning_rate": 4.942453022762715e-05, + "loss": 5.7745, + "step": 11507 + }, + { + "epoch": 0.06844133599771625, + "grad_norm": 2.0649495124816895, + "learning_rate": 4.9424430579391925e-05, + "loss": 5.7173, + "step": 11508 + }, + { + "epoch": 0.06844728328099724, + "grad_norm": 1.647801160812378, + "learning_rate": 4.942433092263038e-05, + "loss": 6.1516, + "step": 11509 + }, + { + "epoch": 0.06845323056427824, + "grad_norm": 1.743788480758667, + "learning_rate": 4.942423125734256e-05, + "loss": 6.0211, + "step": 11510 + }, + { + "epoch": 0.06845917784755924, + "grad_norm": 1.898647665977478, + "learning_rate": 4.942413158352849e-05, + "loss": 6.0106, + "step": 11511 + }, + { + "epoch": 0.06846512513084023, + "grad_norm": 1.5159860849380493, + "learning_rate": 4.94240319011882e-05, + "loss": 5.8759, + "step": 11512 + }, + { + "epoch": 0.06847107241412123, + "grad_norm": 3.265730142593384, + "learning_rate": 4.9423932210321744e-05, + "loss": 4.7228, + "step": 11513 + }, + { + "epoch": 0.06847701969740223, + "grad_norm": 2.9290871620178223, + "learning_rate": 4.9423832510929136e-05, + "loss": 4.5315, + "step": 11514 + }, + { + "epoch": 0.06848296698068322, + "grad_norm": 2.4189975261688232, + "learning_rate": 4.942373280301042e-05, + "loss": 4.5803, + "step": 11515 + }, + { + "epoch": 0.06848891426396422, + "grad_norm": 2.4018993377685547, + "learning_rate": 4.9423633086565645e-05, + "loss": 5.1411, + "step": 11516 + }, + { + "epoch": 0.06849486154724521, + "grad_norm": 2.4697556495666504, + "learning_rate": 4.9423533361594824e-05, + "loss": 5.1523, + "step": 11517 + }, + { + "epoch": 0.06850080883052621, + "grad_norm": 2.1573715209960938, + "learning_rate": 4.942343362809799e-05, + "loss": 5.3488, + "step": 11518 + }, + { + "epoch": 0.06850675611380722, + "grad_norm": 1.9723131656646729, + "learning_rate": 4.9423333886075205e-05, + "loss": 5.2315, + "step": 11519 + }, + { + "epoch": 0.0685127033970882, + "grad_norm": 1.6925430297851562, + "learning_rate": 4.9423234135526475e-05, + "loss": 5.3055, + "step": 11520 + }, + { + "epoch": 0.0685186506803692, + "grad_norm": 2.8665122985839844, + "learning_rate": 4.942313437645185e-05, + "loss": 4.4905, + "step": 11521 + }, + { + "epoch": 0.06852459796365021, + "grad_norm": 2.7538015842437744, + "learning_rate": 4.942303460885136e-05, + "loss": 4.3863, + "step": 11522 + }, + { + "epoch": 0.0685305452469312, + "grad_norm": 2.335664987564087, + "learning_rate": 4.942293483272504e-05, + "loss": 4.4571, + "step": 11523 + }, + { + "epoch": 0.0685364925302122, + "grad_norm": 1.7987995147705078, + "learning_rate": 4.942283504807293e-05, + "loss": 5.1802, + "step": 11524 + }, + { + "epoch": 0.0685424398134932, + "grad_norm": 2.3286690711975098, + "learning_rate": 4.9422735254895056e-05, + "loss": 5.2883, + "step": 11525 + }, + { + "epoch": 0.06854838709677419, + "grad_norm": 2.093317747116089, + "learning_rate": 4.9422635453191466e-05, + "loss": 5.2589, + "step": 11526 + }, + { + "epoch": 0.06855433438005519, + "grad_norm": 1.914236307144165, + "learning_rate": 4.942253564296218e-05, + "loss": 5.4347, + "step": 11527 + }, + { + "epoch": 0.06856028166333619, + "grad_norm": 1.602265477180481, + "learning_rate": 4.942243582420724e-05, + "loss": 5.8021, + "step": 11528 + }, + { + "epoch": 0.06856622894661718, + "grad_norm": 1.4433797597885132, + "learning_rate": 4.9422335996926674e-05, + "loss": 5.7432, + "step": 11529 + }, + { + "epoch": 0.06857217622989818, + "grad_norm": 1.3481166362762451, + "learning_rate": 4.942223616112053e-05, + "loss": 5.2946, + "step": 11530 + }, + { + "epoch": 0.06857812351317918, + "grad_norm": 1.879550576210022, + "learning_rate": 4.942213631678883e-05, + "loss": 5.2669, + "step": 11531 + }, + { + "epoch": 0.06858407079646017, + "grad_norm": 2.7241995334625244, + "learning_rate": 4.942203646393162e-05, + "loss": 5.2248, + "step": 11532 + }, + { + "epoch": 0.06859001807974117, + "grad_norm": 1.9870814085006714, + "learning_rate": 4.942193660254892e-05, + "loss": 5.4025, + "step": 11533 + }, + { + "epoch": 0.06859596536302218, + "grad_norm": 1.89231276512146, + "learning_rate": 4.942183673264079e-05, + "loss": 5.6046, + "step": 11534 + }, + { + "epoch": 0.06860191264630316, + "grad_norm": 2.024684429168701, + "learning_rate": 4.9421736854207235e-05, + "loss": 5.4031, + "step": 11535 + }, + { + "epoch": 0.06860785992958417, + "grad_norm": 1.6764521598815918, + "learning_rate": 4.942163696724831e-05, + "loss": 5.702, + "step": 11536 + }, + { + "epoch": 0.06861380721286517, + "grad_norm": 1.7738621234893799, + "learning_rate": 4.942153707176405e-05, + "loss": 5.1491, + "step": 11537 + }, + { + "epoch": 0.06861975449614616, + "grad_norm": 1.416986346244812, + "learning_rate": 4.942143716775447e-05, + "loss": 5.3883, + "step": 11538 + }, + { + "epoch": 0.06862570177942716, + "grad_norm": 1.837067723274231, + "learning_rate": 4.942133725521963e-05, + "loss": 5.2945, + "step": 11539 + }, + { + "epoch": 0.06863164906270816, + "grad_norm": 1.995610237121582, + "learning_rate": 4.942123733415955e-05, + "loss": 5.2589, + "step": 11540 + }, + { + "epoch": 0.06863759634598915, + "grad_norm": 1.9689414501190186, + "learning_rate": 4.9421137404574264e-05, + "loss": 5.3715, + "step": 11541 + }, + { + "epoch": 0.06864354362927015, + "grad_norm": 1.6984235048294067, + "learning_rate": 4.942103746646382e-05, + "loss": 5.3987, + "step": 11542 + }, + { + "epoch": 0.06864949091255115, + "grad_norm": 1.2645832300186157, + "learning_rate": 4.9420937519828234e-05, + "loss": 5.2142, + "step": 11543 + }, + { + "epoch": 0.06865543819583214, + "grad_norm": 1.6830233335494995, + "learning_rate": 4.9420837564667556e-05, + "loss": 5.1172, + "step": 11544 + }, + { + "epoch": 0.06866138547911314, + "grad_norm": 1.5734926462173462, + "learning_rate": 4.9420737600981816e-05, + "loss": 5.3789, + "step": 11545 + }, + { + "epoch": 0.06866733276239413, + "grad_norm": 1.7375764846801758, + "learning_rate": 4.942063762877105e-05, + "loss": 5.5311, + "step": 11546 + }, + { + "epoch": 0.06867328004567513, + "grad_norm": 1.5421762466430664, + "learning_rate": 4.942053764803529e-05, + "loss": 5.1722, + "step": 11547 + }, + { + "epoch": 0.06867922732895614, + "grad_norm": 1.6282575130462646, + "learning_rate": 4.942043765877457e-05, + "loss": 5.4754, + "step": 11548 + }, + { + "epoch": 0.06868517461223712, + "grad_norm": 1.5595266819000244, + "learning_rate": 4.9420337660988936e-05, + "loss": 5.3516, + "step": 11549 + }, + { + "epoch": 0.06869112189551813, + "grad_norm": 1.5642317533493042, + "learning_rate": 4.9420237654678405e-05, + "loss": 5.2364, + "step": 11550 + }, + { + "epoch": 0.06869706917879913, + "grad_norm": 1.5491602420806885, + "learning_rate": 4.942013763984302e-05, + "loss": 5.1566, + "step": 11551 + }, + { + "epoch": 0.06870301646208012, + "grad_norm": 1.4256258010864258, + "learning_rate": 4.942003761648283e-05, + "loss": 5.1592, + "step": 11552 + }, + { + "epoch": 0.06870896374536112, + "grad_norm": 1.756016492843628, + "learning_rate": 4.9419937584597846e-05, + "loss": 5.012, + "step": 11553 + }, + { + "epoch": 0.06871491102864212, + "grad_norm": 2.5290040969848633, + "learning_rate": 4.941983754418812e-05, + "loss": 4.571, + "step": 11554 + }, + { + "epoch": 0.06872085831192311, + "grad_norm": 2.6146528720855713, + "learning_rate": 4.9419737495253685e-05, + "loss": 4.3515, + "step": 11555 + }, + { + "epoch": 0.06872680559520411, + "grad_norm": 2.3333144187927246, + "learning_rate": 4.941963743779456e-05, + "loss": 4.3032, + "step": 11556 + }, + { + "epoch": 0.06873275287848511, + "grad_norm": 2.342433452606201, + "learning_rate": 4.9419537371810795e-05, + "loss": 4.2942, + "step": 11557 + }, + { + "epoch": 0.0687387001617661, + "grad_norm": 2.423696517944336, + "learning_rate": 4.941943729730243e-05, + "loss": 4.4, + "step": 11558 + }, + { + "epoch": 0.0687446474450471, + "grad_norm": 2.3420050144195557, + "learning_rate": 4.941933721426948e-05, + "loss": 5.0466, + "step": 11559 + }, + { + "epoch": 0.0687505947283281, + "grad_norm": 2.7115821838378906, + "learning_rate": 4.9419237122712e-05, + "loss": 5.1197, + "step": 11560 + }, + { + "epoch": 0.06875654201160909, + "grad_norm": 2.7316489219665527, + "learning_rate": 4.9419137022630014e-05, + "loss": 5.2435, + "step": 11561 + }, + { + "epoch": 0.0687624892948901, + "grad_norm": 2.291551113128662, + "learning_rate": 4.941903691402356e-05, + "loss": 5.0345, + "step": 11562 + }, + { + "epoch": 0.0687684365781711, + "grad_norm": 2.4499049186706543, + "learning_rate": 4.941893679689267e-05, + "loss": 4.503, + "step": 11563 + }, + { + "epoch": 0.06877438386145208, + "grad_norm": 2.7120168209075928, + "learning_rate": 4.9418836671237385e-05, + "loss": 4.2954, + "step": 11564 + }, + { + "epoch": 0.06878033114473309, + "grad_norm": 2.8483526706695557, + "learning_rate": 4.941873653705774e-05, + "loss": 6.269, + "step": 11565 + }, + { + "epoch": 0.06878627842801409, + "grad_norm": 2.3191473484039307, + "learning_rate": 4.941863639435376e-05, + "loss": 6.1628, + "step": 11566 + }, + { + "epoch": 0.06879222571129508, + "grad_norm": 3.4622583389282227, + "learning_rate": 4.9418536243125486e-05, + "loss": 5.6115, + "step": 11567 + }, + { + "epoch": 0.06879817299457608, + "grad_norm": 1.7118897438049316, + "learning_rate": 4.941843608337295e-05, + "loss": 5.4801, + "step": 11568 + }, + { + "epoch": 0.06880412027785708, + "grad_norm": 2.876338243484497, + "learning_rate": 4.9418335915096195e-05, + "loss": 5.0806, + "step": 11569 + }, + { + "epoch": 0.06881006756113807, + "grad_norm": 2.2875587940216064, + "learning_rate": 4.941823573829525e-05, + "loss": 5.2833, + "step": 11570 + }, + { + "epoch": 0.06881601484441907, + "grad_norm": 1.797743320465088, + "learning_rate": 4.9418135552970155e-05, + "loss": 6.1407, + "step": 11571 + }, + { + "epoch": 0.06882196212770007, + "grad_norm": 1.957331895828247, + "learning_rate": 4.941803535912094e-05, + "loss": 5.8743, + "step": 11572 + }, + { + "epoch": 0.06882790941098106, + "grad_norm": 1.9552925825119019, + "learning_rate": 4.9417935156747644e-05, + "loss": 5.584, + "step": 11573 + }, + { + "epoch": 0.06883385669426206, + "grad_norm": 2.057610034942627, + "learning_rate": 4.94178349458503e-05, + "loss": 5.8445, + "step": 11574 + }, + { + "epoch": 0.06883980397754305, + "grad_norm": 1.7856727838516235, + "learning_rate": 4.941773472642893e-05, + "loss": 6.0133, + "step": 11575 + }, + { + "epoch": 0.06884575126082405, + "grad_norm": 1.4494417905807495, + "learning_rate": 4.941763449848359e-05, + "loss": 5.888, + "step": 11576 + }, + { + "epoch": 0.06885169854410506, + "grad_norm": 2.1377499103546143, + "learning_rate": 4.9417534262014306e-05, + "loss": 6.0604, + "step": 11577 + }, + { + "epoch": 0.06885764582738604, + "grad_norm": 1.769888162612915, + "learning_rate": 4.9417434017021105e-05, + "loss": 5.8815, + "step": 11578 + }, + { + "epoch": 0.06886359311066705, + "grad_norm": 1.933935523033142, + "learning_rate": 4.9417333763504036e-05, + "loss": 5.6601, + "step": 11579 + }, + { + "epoch": 0.06886954039394805, + "grad_norm": 1.8672062158584595, + "learning_rate": 4.941723350146313e-05, + "loss": 5.8143, + "step": 11580 + }, + { + "epoch": 0.06887548767722904, + "grad_norm": 1.9899057149887085, + "learning_rate": 4.941713323089842e-05, + "loss": 5.8465, + "step": 11581 + }, + { + "epoch": 0.06888143496051004, + "grad_norm": 2.1053643226623535, + "learning_rate": 4.941703295180994e-05, + "loss": 5.4582, + "step": 11582 + }, + { + "epoch": 0.06888738224379104, + "grad_norm": 1.9435245990753174, + "learning_rate": 4.9416932664197726e-05, + "loss": 5.8503, + "step": 11583 + }, + { + "epoch": 0.06889332952707203, + "grad_norm": 1.9407175779342651, + "learning_rate": 4.941683236806181e-05, + "loss": 5.706, + "step": 11584 + }, + { + "epoch": 0.06889927681035303, + "grad_norm": 2.0505893230438232, + "learning_rate": 4.941673206340224e-05, + "loss": 6.01, + "step": 11585 + }, + { + "epoch": 0.06890522409363403, + "grad_norm": 1.6713486909866333, + "learning_rate": 4.941663175021903e-05, + "loss": 5.8347, + "step": 11586 + }, + { + "epoch": 0.06891117137691502, + "grad_norm": 1.5333812236785889, + "learning_rate": 4.941653142851223e-05, + "loss": 5.8493, + "step": 11587 + }, + { + "epoch": 0.06891711866019602, + "grad_norm": 2.10982346534729, + "learning_rate": 4.9416431098281865e-05, + "loss": 5.4037, + "step": 11588 + }, + { + "epoch": 0.06892306594347702, + "grad_norm": 1.766663908958435, + "learning_rate": 4.9416330759527985e-05, + "loss": 5.0335, + "step": 11589 + }, + { + "epoch": 0.06892901322675801, + "grad_norm": 2.0600688457489014, + "learning_rate": 4.9416230412250615e-05, + "loss": 5.4017, + "step": 11590 + }, + { + "epoch": 0.06893496051003901, + "grad_norm": 1.6271671056747437, + "learning_rate": 4.941613005644979e-05, + "loss": 5.903, + "step": 11591 + }, + { + "epoch": 0.06894090779332002, + "grad_norm": 1.9222697019577026, + "learning_rate": 4.9416029692125544e-05, + "loss": 5.1666, + "step": 11592 + }, + { + "epoch": 0.068946855076601, + "grad_norm": 1.7405030727386475, + "learning_rate": 4.941592931927792e-05, + "loss": 5.0799, + "step": 11593 + }, + { + "epoch": 0.068952802359882, + "grad_norm": 1.7639994621276855, + "learning_rate": 4.941582893790694e-05, + "loss": 5.7596, + "step": 11594 + }, + { + "epoch": 0.06895874964316301, + "grad_norm": 1.9628292322158813, + "learning_rate": 4.941572854801265e-05, + "loss": 4.4573, + "step": 11595 + }, + { + "epoch": 0.068964696926444, + "grad_norm": 1.7616615295410156, + "learning_rate": 4.941562814959508e-05, + "loss": 4.6399, + "step": 11596 + }, + { + "epoch": 0.068970644209725, + "grad_norm": 1.8174281120300293, + "learning_rate": 4.9415527742654265e-05, + "loss": 5.6279, + "step": 11597 + }, + { + "epoch": 0.068976591493006, + "grad_norm": 1.563138723373413, + "learning_rate": 4.941542732719025e-05, + "loss": 5.8696, + "step": 11598 + }, + { + "epoch": 0.06898253877628699, + "grad_norm": 1.4704676866531372, + "learning_rate": 4.9415326903203055e-05, + "loss": 5.7129, + "step": 11599 + }, + { + "epoch": 0.06898848605956799, + "grad_norm": 2.484572410583496, + "learning_rate": 4.9415226470692724e-05, + "loss": 5.336, + "step": 11600 + }, + { + "epoch": 0.068994433342849, + "grad_norm": 1.882876992225647, + "learning_rate": 4.9415126029659284e-05, + "loss": 5.4273, + "step": 11601 + }, + { + "epoch": 0.06900038062612998, + "grad_norm": 1.7827874422073364, + "learning_rate": 4.941502558010278e-05, + "loss": 5.6699, + "step": 11602 + }, + { + "epoch": 0.06900632790941098, + "grad_norm": 1.5609276294708252, + "learning_rate": 4.941492512202325e-05, + "loss": 5.648, + "step": 11603 + }, + { + "epoch": 0.06901227519269197, + "grad_norm": 1.6941063404083252, + "learning_rate": 4.941482465542071e-05, + "loss": 5.633, + "step": 11604 + }, + { + "epoch": 0.06901822247597297, + "grad_norm": 1.768922209739685, + "learning_rate": 4.941472418029521e-05, + "loss": 5.6072, + "step": 11605 + }, + { + "epoch": 0.06902416975925398, + "grad_norm": 2.225846767425537, + "learning_rate": 4.941462369664679e-05, + "loss": 4.9314, + "step": 11606 + }, + { + "epoch": 0.06903011704253496, + "grad_norm": 2.4479281902313232, + "learning_rate": 4.941452320447546e-05, + "loss": 5.0563, + "step": 11607 + }, + { + "epoch": 0.06903606432581597, + "grad_norm": 2.358238935470581, + "learning_rate": 4.941442270378129e-05, + "loss": 4.9379, + "step": 11608 + }, + { + "epoch": 0.06904201160909697, + "grad_norm": 2.2679247856140137, + "learning_rate": 4.941432219456429e-05, + "loss": 5.0655, + "step": 11609 + }, + { + "epoch": 0.06904795889237796, + "grad_norm": 2.524176597595215, + "learning_rate": 4.94142216768245e-05, + "loss": 4.8694, + "step": 11610 + }, + { + "epoch": 0.06905390617565896, + "grad_norm": 2.1919515132904053, + "learning_rate": 4.9414121150561966e-05, + "loss": 5.0889, + "step": 11611 + }, + { + "epoch": 0.06905985345893996, + "grad_norm": 2.2838563919067383, + "learning_rate": 4.94140206157767e-05, + "loss": 4.9942, + "step": 11612 + }, + { + "epoch": 0.06906580074222095, + "grad_norm": 2.2270026206970215, + "learning_rate": 4.9413920072468764e-05, + "loss": 4.9885, + "step": 11613 + }, + { + "epoch": 0.06907174802550195, + "grad_norm": 2.175245761871338, + "learning_rate": 4.9413819520638176e-05, + "loss": 4.9829, + "step": 11614 + }, + { + "epoch": 0.06907769530878295, + "grad_norm": 2.128441572189331, + "learning_rate": 4.941371896028498e-05, + "loss": 4.9802, + "step": 11615 + }, + { + "epoch": 0.06908364259206394, + "grad_norm": 2.7656328678131104, + "learning_rate": 4.94136183914092e-05, + "loss": 5.1302, + "step": 11616 + }, + { + "epoch": 0.06908958987534494, + "grad_norm": 2.23917818069458, + "learning_rate": 4.941351781401088e-05, + "loss": 4.8766, + "step": 11617 + }, + { + "epoch": 0.06909553715862594, + "grad_norm": 1.861399531364441, + "learning_rate": 4.941341722809005e-05, + "loss": 5.8151, + "step": 11618 + }, + { + "epoch": 0.06910148444190693, + "grad_norm": 2.13590145111084, + "learning_rate": 4.9413316633646754e-05, + "loss": 5.6892, + "step": 11619 + }, + { + "epoch": 0.06910743172518793, + "grad_norm": 1.8261966705322266, + "learning_rate": 4.9413216030681024e-05, + "loss": 6.1387, + "step": 11620 + }, + { + "epoch": 0.06911337900846894, + "grad_norm": 2.5121877193450928, + "learning_rate": 4.941311541919289e-05, + "loss": 5.3217, + "step": 11621 + }, + { + "epoch": 0.06911932629174992, + "grad_norm": 2.1011979579925537, + "learning_rate": 4.941301479918239e-05, + "loss": 5.048, + "step": 11622 + }, + { + "epoch": 0.06912527357503093, + "grad_norm": 2.214597225189209, + "learning_rate": 4.941291417064956e-05, + "loss": 5.4312, + "step": 11623 + }, + { + "epoch": 0.06913122085831193, + "grad_norm": 2.6525864601135254, + "learning_rate": 4.941281353359443e-05, + "loss": 4.4151, + "step": 11624 + }, + { + "epoch": 0.06913716814159292, + "grad_norm": 1.9638911485671997, + "learning_rate": 4.941271288801704e-05, + "loss": 5.0091, + "step": 11625 + }, + { + "epoch": 0.06914311542487392, + "grad_norm": 2.062688112258911, + "learning_rate": 4.941261223391742e-05, + "loss": 5.503, + "step": 11626 + }, + { + "epoch": 0.06914906270815492, + "grad_norm": 2.219430685043335, + "learning_rate": 4.941251157129561e-05, + "loss": 4.984, + "step": 11627 + }, + { + "epoch": 0.06915500999143591, + "grad_norm": 2.0745718479156494, + "learning_rate": 4.941241090015165e-05, + "loss": 5.3094, + "step": 11628 + }, + { + "epoch": 0.06916095727471691, + "grad_norm": 1.8852496147155762, + "learning_rate": 4.941231022048557e-05, + "loss": 5.2424, + "step": 11629 + }, + { + "epoch": 0.06916690455799791, + "grad_norm": 2.335723400115967, + "learning_rate": 4.9412209532297404e-05, + "loss": 5.6031, + "step": 11630 + }, + { + "epoch": 0.0691728518412789, + "grad_norm": 2.167698621749878, + "learning_rate": 4.941210883558719e-05, + "loss": 5.3132, + "step": 11631 + }, + { + "epoch": 0.0691787991245599, + "grad_norm": 2.213068962097168, + "learning_rate": 4.941200813035495e-05, + "loss": 5.2049, + "step": 11632 + }, + { + "epoch": 0.06918474640784089, + "grad_norm": 1.9697870016098022, + "learning_rate": 4.941190741660075e-05, + "loss": 5.3118, + "step": 11633 + }, + { + "epoch": 0.0691906936911219, + "grad_norm": 1.7360777854919434, + "learning_rate": 4.941180669432458e-05, + "loss": 5.444, + "step": 11634 + }, + { + "epoch": 0.0691966409744029, + "grad_norm": 1.8400771617889404, + "learning_rate": 4.9411705963526514e-05, + "loss": 5.6975, + "step": 11635 + }, + { + "epoch": 0.06920258825768388, + "grad_norm": 1.492242693901062, + "learning_rate": 4.941160522420657e-05, + "loss": 5.5617, + "step": 11636 + }, + { + "epoch": 0.06920853554096489, + "grad_norm": 1.6014543771743774, + "learning_rate": 4.9411504476364794e-05, + "loss": 5.7317, + "step": 11637 + }, + { + "epoch": 0.06921448282424589, + "grad_norm": 1.7973628044128418, + "learning_rate": 4.9411403720001215e-05, + "loss": 5.3105, + "step": 11638 + }, + { + "epoch": 0.06922043010752688, + "grad_norm": 1.8314461708068848, + "learning_rate": 4.9411302955115853e-05, + "loss": 5.624, + "step": 11639 + }, + { + "epoch": 0.06922637739080788, + "grad_norm": 1.621315836906433, + "learning_rate": 4.941120218170877e-05, + "loss": 5.8243, + "step": 11640 + }, + { + "epoch": 0.06923232467408888, + "grad_norm": 2.0378596782684326, + "learning_rate": 4.941110139977998e-05, + "loss": 4.9275, + "step": 11641 + }, + { + "epoch": 0.06923827195736987, + "grad_norm": 1.8713582754135132, + "learning_rate": 4.941100060932954e-05, + "loss": 5.1218, + "step": 11642 + }, + { + "epoch": 0.06924421924065087, + "grad_norm": 1.878404140472412, + "learning_rate": 4.941089981035746e-05, + "loss": 5.4997, + "step": 11643 + }, + { + "epoch": 0.06925016652393187, + "grad_norm": 1.7230712175369263, + "learning_rate": 4.941079900286379e-05, + "loss": 5.5514, + "step": 11644 + }, + { + "epoch": 0.06925611380721286, + "grad_norm": 1.6272276639938354, + "learning_rate": 4.941069818684856e-05, + "loss": 5.7186, + "step": 11645 + }, + { + "epoch": 0.06926206109049386, + "grad_norm": 1.5610454082489014, + "learning_rate": 4.9410597362311814e-05, + "loss": 5.8929, + "step": 11646 + }, + { + "epoch": 0.06926800837377486, + "grad_norm": 1.7373837232589722, + "learning_rate": 4.941049652925358e-05, + "loss": 5.6428, + "step": 11647 + }, + { + "epoch": 0.06927395565705585, + "grad_norm": 1.9722628593444824, + "learning_rate": 4.9410395687673886e-05, + "loss": 5.9562, + "step": 11648 + }, + { + "epoch": 0.06927990294033685, + "grad_norm": 1.5603039264678955, + "learning_rate": 4.941029483757278e-05, + "loss": 6.031, + "step": 11649 + }, + { + "epoch": 0.06928585022361786, + "grad_norm": 1.6971800327301025, + "learning_rate": 4.941019397895029e-05, + "loss": 5.7527, + "step": 11650 + }, + { + "epoch": 0.06929179750689884, + "grad_norm": 1.9559118747711182, + "learning_rate": 4.9410093111806456e-05, + "loss": 5.0904, + "step": 11651 + }, + { + "epoch": 0.06929774479017985, + "grad_norm": 1.561122179031372, + "learning_rate": 4.9409992236141315e-05, + "loss": 5.7438, + "step": 11652 + }, + { + "epoch": 0.06930369207346085, + "grad_norm": 1.6071819067001343, + "learning_rate": 4.940989135195489e-05, + "loss": 5.8852, + "step": 11653 + }, + { + "epoch": 0.06930963935674184, + "grad_norm": 1.6804322004318237, + "learning_rate": 4.940979045924723e-05, + "loss": 5.7174, + "step": 11654 + }, + { + "epoch": 0.06931558664002284, + "grad_norm": 1.5802178382873535, + "learning_rate": 4.940968955801836e-05, + "loss": 5.8755, + "step": 11655 + }, + { + "epoch": 0.06932153392330384, + "grad_norm": 2.1002743244171143, + "learning_rate": 4.940958864826832e-05, + "loss": 5.6323, + "step": 11656 + }, + { + "epoch": 0.06932748120658483, + "grad_norm": 1.8874709606170654, + "learning_rate": 4.9409487729997144e-05, + "loss": 5.6798, + "step": 11657 + }, + { + "epoch": 0.06933342848986583, + "grad_norm": 1.6967203617095947, + "learning_rate": 4.940938680320487e-05, + "loss": 5.8461, + "step": 11658 + }, + { + "epoch": 0.06933937577314683, + "grad_norm": 1.9648679494857788, + "learning_rate": 4.9409285867891534e-05, + "loss": 5.842, + "step": 11659 + }, + { + "epoch": 0.06934532305642782, + "grad_norm": 1.8681408166885376, + "learning_rate": 4.940918492405716e-05, + "loss": 5.8859, + "step": 11660 + }, + { + "epoch": 0.06935127033970882, + "grad_norm": 2.0480551719665527, + "learning_rate": 4.9409083971701805e-05, + "loss": 5.6415, + "step": 11661 + }, + { + "epoch": 0.06935721762298983, + "grad_norm": 2.102832555770874, + "learning_rate": 4.940898301082548e-05, + "loss": 5.6163, + "step": 11662 + }, + { + "epoch": 0.06936316490627081, + "grad_norm": 1.7471407651901245, + "learning_rate": 4.940888204142824e-05, + "loss": 5.7973, + "step": 11663 + }, + { + "epoch": 0.06936911218955182, + "grad_norm": 1.9675641059875488, + "learning_rate": 4.94087810635101e-05, + "loss": 5.1125, + "step": 11664 + }, + { + "epoch": 0.0693750594728328, + "grad_norm": 1.6316107511520386, + "learning_rate": 4.940868007707111e-05, + "loss": 5.5067, + "step": 11665 + }, + { + "epoch": 0.0693810067561138, + "grad_norm": 1.8663619756698608, + "learning_rate": 4.940857908211131e-05, + "loss": 5.5552, + "step": 11666 + }, + { + "epoch": 0.06938695403939481, + "grad_norm": 2.155702590942383, + "learning_rate": 4.940847807863072e-05, + "loss": 6.0919, + "step": 11667 + }, + { + "epoch": 0.0693929013226758, + "grad_norm": 1.968467354774475, + "learning_rate": 4.9408377066629384e-05, + "loss": 5.8105, + "step": 11668 + }, + { + "epoch": 0.0693988486059568, + "grad_norm": 1.5245625972747803, + "learning_rate": 4.940827604610734e-05, + "loss": 5.8901, + "step": 11669 + }, + { + "epoch": 0.0694047958892378, + "grad_norm": 1.7377501726150513, + "learning_rate": 4.940817501706461e-05, + "loss": 5.5917, + "step": 11670 + }, + { + "epoch": 0.06941074317251879, + "grad_norm": 1.9668710231781006, + "learning_rate": 4.940807397950125e-05, + "loss": 5.6857, + "step": 11671 + }, + { + "epoch": 0.06941669045579979, + "grad_norm": 1.8168022632598877, + "learning_rate": 4.9407972933417266e-05, + "loss": 5.7032, + "step": 11672 + }, + { + "epoch": 0.06942263773908079, + "grad_norm": 2.4009077548980713, + "learning_rate": 4.940787187881273e-05, + "loss": 5.6767, + "step": 11673 + }, + { + "epoch": 0.06942858502236178, + "grad_norm": 1.8541746139526367, + "learning_rate": 4.940777081568765e-05, + "loss": 5.6327, + "step": 11674 + }, + { + "epoch": 0.06943453230564278, + "grad_norm": 2.028602361679077, + "learning_rate": 4.940766974404206e-05, + "loss": 5.0819, + "step": 11675 + }, + { + "epoch": 0.06944047958892378, + "grad_norm": 2.0870065689086914, + "learning_rate": 4.940756866387602e-05, + "loss": 5.1645, + "step": 11676 + }, + { + "epoch": 0.06944642687220477, + "grad_norm": 1.8009755611419678, + "learning_rate": 4.940746757518954e-05, + "loss": 4.9832, + "step": 11677 + }, + { + "epoch": 0.06945237415548577, + "grad_norm": 2.20975399017334, + "learning_rate": 4.9407366477982675e-05, + "loss": 4.9683, + "step": 11678 + }, + { + "epoch": 0.06945832143876678, + "grad_norm": 1.89133882522583, + "learning_rate": 4.940726537225544e-05, + "loss": 4.7736, + "step": 11679 + }, + { + "epoch": 0.06946426872204776, + "grad_norm": 1.7583657503128052, + "learning_rate": 4.940716425800789e-05, + "loss": 5.4275, + "step": 11680 + }, + { + "epoch": 0.06947021600532877, + "grad_norm": 2.1929352283477783, + "learning_rate": 4.940706313524004e-05, + "loss": 4.8441, + "step": 11681 + }, + { + "epoch": 0.06947616328860977, + "grad_norm": 2.1098999977111816, + "learning_rate": 4.940696200395194e-05, + "loss": 5.065, + "step": 11682 + }, + { + "epoch": 0.06948211057189076, + "grad_norm": 1.7651045322418213, + "learning_rate": 4.940686086414363e-05, + "loss": 5.7086, + "step": 11683 + }, + { + "epoch": 0.06948805785517176, + "grad_norm": 1.6675828695297241, + "learning_rate": 4.9406759715815134e-05, + "loss": 5.89, + "step": 11684 + }, + { + "epoch": 0.06949400513845276, + "grad_norm": 1.9754993915557861, + "learning_rate": 4.940665855896648e-05, + "loss": 5.7752, + "step": 11685 + }, + { + "epoch": 0.06949995242173375, + "grad_norm": 1.7652478218078613, + "learning_rate": 4.940655739359773e-05, + "loss": 5.6518, + "step": 11686 + }, + { + "epoch": 0.06950589970501475, + "grad_norm": 1.898997187614441, + "learning_rate": 4.940645621970889e-05, + "loss": 5.4579, + "step": 11687 + }, + { + "epoch": 0.06951184698829575, + "grad_norm": 2.1233060359954834, + "learning_rate": 4.940635503730001e-05, + "loss": 4.3979, + "step": 11688 + }, + { + "epoch": 0.06951779427157674, + "grad_norm": 2.0859549045562744, + "learning_rate": 4.940625384637113e-05, + "loss": 4.4309, + "step": 11689 + }, + { + "epoch": 0.06952374155485774, + "grad_norm": 2.051492929458618, + "learning_rate": 4.940615264692228e-05, + "loss": 4.4332, + "step": 11690 + }, + { + "epoch": 0.06952968883813875, + "grad_norm": 2.0359628200531006, + "learning_rate": 4.940605143895348e-05, + "loss": 4.29, + "step": 11691 + }, + { + "epoch": 0.06953563612141973, + "grad_norm": 2.0122604370117188, + "learning_rate": 4.940595022246479e-05, + "loss": 4.4391, + "step": 11692 + }, + { + "epoch": 0.06954158340470074, + "grad_norm": 2.059694290161133, + "learning_rate": 4.940584899745624e-05, + "loss": 4.3993, + "step": 11693 + }, + { + "epoch": 0.06954753068798172, + "grad_norm": 2.0355825424194336, + "learning_rate": 4.940574776392786e-05, + "loss": 4.2829, + "step": 11694 + }, + { + "epoch": 0.06955347797126273, + "grad_norm": 1.933385968208313, + "learning_rate": 4.940564652187967e-05, + "loss": 4.372, + "step": 11695 + }, + { + "epoch": 0.06955942525454373, + "grad_norm": 2.0848586559295654, + "learning_rate": 4.940554527131174e-05, + "loss": 4.3064, + "step": 11696 + }, + { + "epoch": 0.06956537253782472, + "grad_norm": 1.889845848083496, + "learning_rate": 4.940544401222407e-05, + "loss": 4.3811, + "step": 11697 + }, + { + "epoch": 0.06957131982110572, + "grad_norm": 2.0076160430908203, + "learning_rate": 4.9405342744616724e-05, + "loss": 4.3382, + "step": 11698 + }, + { + "epoch": 0.06957726710438672, + "grad_norm": 1.9708037376403809, + "learning_rate": 4.940524146848971e-05, + "loss": 4.4659, + "step": 11699 + }, + { + "epoch": 0.06958321438766771, + "grad_norm": 2.086454153060913, + "learning_rate": 4.940514018384309e-05, + "loss": 4.196, + "step": 11700 + }, + { + "epoch": 0.06958916167094871, + "grad_norm": 2.095062255859375, + "learning_rate": 4.940503889067689e-05, + "loss": 4.2062, + "step": 11701 + }, + { + "epoch": 0.06959510895422971, + "grad_norm": 2.0661754608154297, + "learning_rate": 4.940493758899114e-05, + "loss": 4.3468, + "step": 11702 + }, + { + "epoch": 0.0696010562375107, + "grad_norm": 2.073573350906372, + "learning_rate": 4.9404836278785875e-05, + "loss": 4.248, + "step": 11703 + }, + { + "epoch": 0.0696070035207917, + "grad_norm": 2.104018449783325, + "learning_rate": 4.940473496006114e-05, + "loss": 4.1523, + "step": 11704 + }, + { + "epoch": 0.0696129508040727, + "grad_norm": 2.067532777786255, + "learning_rate": 4.9404633632816954e-05, + "loss": 4.2721, + "step": 11705 + }, + { + "epoch": 0.06961889808735369, + "grad_norm": 2.036736249923706, + "learning_rate": 4.9404532297053376e-05, + "loss": 4.4057, + "step": 11706 + }, + { + "epoch": 0.0696248453706347, + "grad_norm": 1.9911088943481445, + "learning_rate": 4.940443095277042e-05, + "loss": 4.1875, + "step": 11707 + }, + { + "epoch": 0.0696307926539157, + "grad_norm": 2.017457962036133, + "learning_rate": 4.9404329599968124e-05, + "loss": 4.1506, + "step": 11708 + }, + { + "epoch": 0.06963673993719668, + "grad_norm": 1.8043596744537354, + "learning_rate": 4.940422823864654e-05, + "loss": 4.3937, + "step": 11709 + }, + { + "epoch": 0.06964268722047769, + "grad_norm": 2.0362250804901123, + "learning_rate": 4.9404126868805687e-05, + "loss": 3.8076, + "step": 11710 + }, + { + "epoch": 0.06964863450375869, + "grad_norm": 2.10723876953125, + "learning_rate": 4.940402549044561e-05, + "loss": 4.2487, + "step": 11711 + }, + { + "epoch": 0.06965458178703968, + "grad_norm": 2.1901967525482178, + "learning_rate": 4.940392410356632e-05, + "loss": 4.1183, + "step": 11712 + }, + { + "epoch": 0.06966052907032068, + "grad_norm": 2.196518659591675, + "learning_rate": 4.9403822708167896e-05, + "loss": 4.2959, + "step": 11713 + }, + { + "epoch": 0.06966647635360168, + "grad_norm": 2.1917595863342285, + "learning_rate": 4.940372130425034e-05, + "loss": 4.1011, + "step": 11714 + }, + { + "epoch": 0.06967242363688267, + "grad_norm": 2.14424991607666, + "learning_rate": 4.9403619891813696e-05, + "loss": 3.9033, + "step": 11715 + }, + { + "epoch": 0.06967837092016367, + "grad_norm": 1.9970608949661255, + "learning_rate": 4.9403518470858004e-05, + "loss": 3.9243, + "step": 11716 + }, + { + "epoch": 0.06968431820344467, + "grad_norm": 2.215721607208252, + "learning_rate": 4.9403417041383294e-05, + "loss": 4.0036, + "step": 11717 + }, + { + "epoch": 0.06969026548672566, + "grad_norm": 1.9153071641921997, + "learning_rate": 4.94033156033896e-05, + "loss": 5.6849, + "step": 11718 + }, + { + "epoch": 0.06969621277000666, + "grad_norm": 2.287951707839966, + "learning_rate": 4.9403214156876966e-05, + "loss": 4.3569, + "step": 11719 + }, + { + "epoch": 0.06970216005328767, + "grad_norm": 2.1257216930389404, + "learning_rate": 4.940311270184542e-05, + "loss": 4.1051, + "step": 11720 + }, + { + "epoch": 0.06970810733656865, + "grad_norm": 2.164879560470581, + "learning_rate": 4.9403011238295e-05, + "loss": 4.0754, + "step": 11721 + }, + { + "epoch": 0.06971405461984966, + "grad_norm": 2.2430567741394043, + "learning_rate": 4.940290976622574e-05, + "loss": 4.1251, + "step": 11722 + }, + { + "epoch": 0.06972000190313064, + "grad_norm": 2.2621891498565674, + "learning_rate": 4.940280828563768e-05, + "loss": 4.2302, + "step": 11723 + }, + { + "epoch": 0.06972594918641165, + "grad_norm": 2.0096445083618164, + "learning_rate": 4.940270679653085e-05, + "loss": 4.2853, + "step": 11724 + }, + { + "epoch": 0.06973189646969265, + "grad_norm": 2.211843729019165, + "learning_rate": 4.940260529890528e-05, + "loss": 3.6609, + "step": 11725 + }, + { + "epoch": 0.06973784375297364, + "grad_norm": 1.8500425815582275, + "learning_rate": 4.940250379276102e-05, + "loss": 3.8701, + "step": 11726 + }, + { + "epoch": 0.06974379103625464, + "grad_norm": 2.09136962890625, + "learning_rate": 4.94024022780981e-05, + "loss": 4.5569, + "step": 11727 + }, + { + "epoch": 0.06974973831953564, + "grad_norm": 1.9922528266906738, + "learning_rate": 4.940230075491655e-05, + "loss": 4.4055, + "step": 11728 + }, + { + "epoch": 0.06975568560281663, + "grad_norm": 2.253831624984741, + "learning_rate": 4.940219922321641e-05, + "loss": 4.114, + "step": 11729 + }, + { + "epoch": 0.06976163288609763, + "grad_norm": 2.0647006034851074, + "learning_rate": 4.94020976829977e-05, + "loss": 4.9004, + "step": 11730 + }, + { + "epoch": 0.06976758016937863, + "grad_norm": 2.5659384727478027, + "learning_rate": 4.940199613426049e-05, + "loss": 5.0852, + "step": 11731 + }, + { + "epoch": 0.06977352745265962, + "grad_norm": 2.227599859237671, + "learning_rate": 4.9401894577004796e-05, + "loss": 5.1603, + "step": 11732 + }, + { + "epoch": 0.06977947473594062, + "grad_norm": 1.8170785903930664, + "learning_rate": 4.940179301123063e-05, + "loss": 5.8334, + "step": 11733 + }, + { + "epoch": 0.06978542201922162, + "grad_norm": 2.1795544624328613, + "learning_rate": 4.940169143693807e-05, + "loss": 5.668, + "step": 11734 + }, + { + "epoch": 0.06979136930250261, + "grad_norm": 2.1248555183410645, + "learning_rate": 4.940158985412713e-05, + "loss": 5.7604, + "step": 11735 + }, + { + "epoch": 0.06979731658578361, + "grad_norm": 1.9677635431289673, + "learning_rate": 4.9401488262797845e-05, + "loss": 5.6568, + "step": 11736 + }, + { + "epoch": 0.06980326386906462, + "grad_norm": 1.9796242713928223, + "learning_rate": 4.940138666295025e-05, + "loss": 5.4303, + "step": 11737 + }, + { + "epoch": 0.0698092111523456, + "grad_norm": 1.7489395141601562, + "learning_rate": 4.9401285054584385e-05, + "loss": 6.1782, + "step": 11738 + }, + { + "epoch": 0.0698151584356266, + "grad_norm": 1.8067989349365234, + "learning_rate": 4.940118343770028e-05, + "loss": 6.0974, + "step": 11739 + }, + { + "epoch": 0.06982110571890761, + "grad_norm": 1.7377318143844604, + "learning_rate": 4.940108181229798e-05, + "loss": 5.8477, + "step": 11740 + }, + { + "epoch": 0.0698270530021886, + "grad_norm": 2.297499656677246, + "learning_rate": 4.940098017837751e-05, + "loss": 4.8027, + "step": 11741 + }, + { + "epoch": 0.0698330002854696, + "grad_norm": 1.7340888977050781, + "learning_rate": 4.940087853593891e-05, + "loss": 5.5897, + "step": 11742 + }, + { + "epoch": 0.0698389475687506, + "grad_norm": 2.019639730453491, + "learning_rate": 4.9400776884982216e-05, + "loss": 5.4493, + "step": 11743 + }, + { + "epoch": 0.06984489485203159, + "grad_norm": 1.7959356307983398, + "learning_rate": 4.9400675225507466e-05, + "loss": 5.5995, + "step": 11744 + }, + { + "epoch": 0.06985084213531259, + "grad_norm": 2.234757661819458, + "learning_rate": 4.940057355751468e-05, + "loss": 5.9542, + "step": 11745 + }, + { + "epoch": 0.06985678941859359, + "grad_norm": 2.047755241394043, + "learning_rate": 4.9400471881003925e-05, + "loss": 5.9125, + "step": 11746 + }, + { + "epoch": 0.06986273670187458, + "grad_norm": 1.9563192129135132, + "learning_rate": 4.940037019597521e-05, + "loss": 5.7298, + "step": 11747 + }, + { + "epoch": 0.06986868398515558, + "grad_norm": 2.7170934677124023, + "learning_rate": 4.940026850242857e-05, + "loss": 5.5172, + "step": 11748 + }, + { + "epoch": 0.06987463126843659, + "grad_norm": 2.326277494430542, + "learning_rate": 4.9400166800364056e-05, + "loss": 5.685, + "step": 11749 + }, + { + "epoch": 0.06988057855171757, + "grad_norm": 1.708383321762085, + "learning_rate": 4.94000650897817e-05, + "loss": 5.3879, + "step": 11750 + }, + { + "epoch": 0.06988652583499858, + "grad_norm": 1.897631049156189, + "learning_rate": 4.9399963370681527e-05, + "loss": 5.6856, + "step": 11751 + }, + { + "epoch": 0.06989247311827956, + "grad_norm": 2.227720260620117, + "learning_rate": 4.939986164306357e-05, + "loss": 5.4487, + "step": 11752 + }, + { + "epoch": 0.06989842040156057, + "grad_norm": 2.7821953296661377, + "learning_rate": 4.939975990692789e-05, + "loss": 5.7276, + "step": 11753 + }, + { + "epoch": 0.06990436768484157, + "grad_norm": 1.8389033079147339, + "learning_rate": 4.939965816227449e-05, + "loss": 5.6933, + "step": 11754 + }, + { + "epoch": 0.06991031496812256, + "grad_norm": 1.7653162479400635, + "learning_rate": 4.939955640910343e-05, + "loss": 5.6079, + "step": 11755 + }, + { + "epoch": 0.06991626225140356, + "grad_norm": 1.7504348754882812, + "learning_rate": 4.939945464741475e-05, + "loss": 6.0413, + "step": 11756 + }, + { + "epoch": 0.06992220953468456, + "grad_norm": 2.118326187133789, + "learning_rate": 4.939935287720845e-05, + "loss": 5.8937, + "step": 11757 + }, + { + "epoch": 0.06992815681796555, + "grad_norm": 1.9626812934875488, + "learning_rate": 4.93992510984846e-05, + "loss": 5.9564, + "step": 11758 + }, + { + "epoch": 0.06993410410124655, + "grad_norm": 1.9915722608566284, + "learning_rate": 4.939914931124322e-05, + "loss": 5.6851, + "step": 11759 + }, + { + "epoch": 0.06994005138452755, + "grad_norm": 1.7959195375442505, + "learning_rate": 4.939904751548435e-05, + "loss": 4.785, + "step": 11760 + }, + { + "epoch": 0.06994599866780854, + "grad_norm": 1.8472923040390015, + "learning_rate": 4.9398945711208025e-05, + "loss": 5.2683, + "step": 11761 + }, + { + "epoch": 0.06995194595108954, + "grad_norm": 1.4207996129989624, + "learning_rate": 4.9398843898414274e-05, + "loss": 5.5402, + "step": 11762 + }, + { + "epoch": 0.06995789323437054, + "grad_norm": 2.122070550918579, + "learning_rate": 4.9398742077103146e-05, + "loss": 5.5397, + "step": 11763 + }, + { + "epoch": 0.06996384051765153, + "grad_norm": 2.285970687866211, + "learning_rate": 4.939864024727467e-05, + "loss": 5.1401, + "step": 11764 + }, + { + "epoch": 0.06996978780093253, + "grad_norm": 2.1245667934417725, + "learning_rate": 4.9398538408928874e-05, + "loss": 5.2009, + "step": 11765 + }, + { + "epoch": 0.06997573508421354, + "grad_norm": 1.8151131868362427, + "learning_rate": 4.939843656206581e-05, + "loss": 4.8635, + "step": 11766 + }, + { + "epoch": 0.06998168236749452, + "grad_norm": 1.9139370918273926, + "learning_rate": 4.9398334706685494e-05, + "loss": 5.5998, + "step": 11767 + }, + { + "epoch": 0.06998762965077553, + "grad_norm": 1.6889853477478027, + "learning_rate": 4.9398232842787976e-05, + "loss": 5.6183, + "step": 11768 + }, + { + "epoch": 0.06999357693405653, + "grad_norm": 1.773409366607666, + "learning_rate": 4.939813097037329e-05, + "loss": 5.5083, + "step": 11769 + }, + { + "epoch": 0.06999952421733752, + "grad_norm": 2.195955991744995, + "learning_rate": 4.9398029089441465e-05, + "loss": 6.4436, + "step": 11770 + }, + { + "epoch": 0.07000547150061852, + "grad_norm": 2.058687448501587, + "learning_rate": 4.939792719999254e-05, + "loss": 6.2875, + "step": 11771 + }, + { + "epoch": 0.07001141878389952, + "grad_norm": 1.9074562788009644, + "learning_rate": 4.939782530202655e-05, + "loss": 5.8764, + "step": 11772 + }, + { + "epoch": 0.07001736606718051, + "grad_norm": 2.163663864135742, + "learning_rate": 4.9397723395543535e-05, + "loss": 5.4666, + "step": 11773 + }, + { + "epoch": 0.07002331335046151, + "grad_norm": 2.2188286781311035, + "learning_rate": 4.939762148054352e-05, + "loss": 6.0679, + "step": 11774 + }, + { + "epoch": 0.07002926063374251, + "grad_norm": 1.8202224969863892, + "learning_rate": 4.9397519557026553e-05, + "loss": 6.0465, + "step": 11775 + }, + { + "epoch": 0.0700352079170235, + "grad_norm": 1.9515994787216187, + "learning_rate": 4.939741762499266e-05, + "loss": 5.9634, + "step": 11776 + }, + { + "epoch": 0.0700411552003045, + "grad_norm": 1.772741675376892, + "learning_rate": 4.9397315684441886e-05, + "loss": 5.3117, + "step": 11777 + }, + { + "epoch": 0.0700471024835855, + "grad_norm": 1.7377926111221313, + "learning_rate": 4.9397213735374256e-05, + "loss": 5.7082, + "step": 11778 + }, + { + "epoch": 0.0700530497668665, + "grad_norm": 1.881205439567566, + "learning_rate": 4.939711177778982e-05, + "loss": 5.8463, + "step": 11779 + }, + { + "epoch": 0.0700589970501475, + "grad_norm": 1.893402099609375, + "learning_rate": 4.939700981168859e-05, + "loss": 5.8321, + "step": 11780 + }, + { + "epoch": 0.07006494433342848, + "grad_norm": 1.6830201148986816, + "learning_rate": 4.939690783707063e-05, + "loss": 5.8655, + "step": 11781 + }, + { + "epoch": 0.07007089161670949, + "grad_norm": 1.9164643287658691, + "learning_rate": 4.939680585393595e-05, + "loss": 5.7089, + "step": 11782 + }, + { + "epoch": 0.07007683889999049, + "grad_norm": 1.5564945936203003, + "learning_rate": 4.93967038622846e-05, + "loss": 5.8671, + "step": 11783 + }, + { + "epoch": 0.07008278618327148, + "grad_norm": 1.6557695865631104, + "learning_rate": 4.939660186211662e-05, + "loss": 5.7461, + "step": 11784 + }, + { + "epoch": 0.07008873346655248, + "grad_norm": 1.7161173820495605, + "learning_rate": 4.9396499853432035e-05, + "loss": 5.0569, + "step": 11785 + }, + { + "epoch": 0.07009468074983348, + "grad_norm": 1.6760550737380981, + "learning_rate": 4.939639783623088e-05, + "loss": 5.4683, + "step": 11786 + }, + { + "epoch": 0.07010062803311447, + "grad_norm": 1.818652629852295, + "learning_rate": 4.9396295810513196e-05, + "loss": 4.9676, + "step": 11787 + }, + { + "epoch": 0.07010657531639547, + "grad_norm": 2.016510009765625, + "learning_rate": 4.939619377627901e-05, + "loss": 5.255, + "step": 11788 + }, + { + "epoch": 0.07011252259967647, + "grad_norm": 2.1893560886383057, + "learning_rate": 4.939609173352838e-05, + "loss": 5.0798, + "step": 11789 + }, + { + "epoch": 0.07011846988295746, + "grad_norm": 1.8063241243362427, + "learning_rate": 4.939598968226132e-05, + "loss": 5.049, + "step": 11790 + }, + { + "epoch": 0.07012441716623846, + "grad_norm": 1.7766486406326294, + "learning_rate": 4.939588762247786e-05, + "loss": 4.8375, + "step": 11791 + }, + { + "epoch": 0.07013036444951946, + "grad_norm": 1.6848721504211426, + "learning_rate": 4.9395785554178066e-05, + "loss": 4.7944, + "step": 11792 + }, + { + "epoch": 0.07013631173280045, + "grad_norm": 1.5173190832138062, + "learning_rate": 4.939568347736195e-05, + "loss": 4.8558, + "step": 11793 + }, + { + "epoch": 0.07014225901608145, + "grad_norm": 1.9625753164291382, + "learning_rate": 4.939558139202955e-05, + "loss": 5.0129, + "step": 11794 + }, + { + "epoch": 0.07014820629936246, + "grad_norm": 2.1610453128814697, + "learning_rate": 4.93954792981809e-05, + "loss": 5.7208, + "step": 11795 + }, + { + "epoch": 0.07015415358264344, + "grad_norm": 2.272775411605835, + "learning_rate": 4.939537719581605e-05, + "loss": 5.3673, + "step": 11796 + }, + { + "epoch": 0.07016010086592445, + "grad_norm": 1.8652429580688477, + "learning_rate": 4.9395275084935025e-05, + "loss": 5.7692, + "step": 11797 + }, + { + "epoch": 0.07016604814920545, + "grad_norm": 1.6594206094741821, + "learning_rate": 4.939517296553786e-05, + "loss": 5.7201, + "step": 11798 + }, + { + "epoch": 0.07017199543248644, + "grad_norm": 1.7499476671218872, + "learning_rate": 4.939507083762459e-05, + "loss": 5.6471, + "step": 11799 + }, + { + "epoch": 0.07017794271576744, + "grad_norm": 2.050825834274292, + "learning_rate": 4.939496870119525e-05, + "loss": 5.4805, + "step": 11800 + }, + { + "epoch": 0.07018388999904844, + "grad_norm": 2.033815383911133, + "learning_rate": 4.939486655624988e-05, + "loss": 5.7465, + "step": 11801 + }, + { + "epoch": 0.07018983728232943, + "grad_norm": 1.7499231100082397, + "learning_rate": 4.939476440278852e-05, + "loss": 5.0271, + "step": 11802 + }, + { + "epoch": 0.07019578456561043, + "grad_norm": 2.331024646759033, + "learning_rate": 4.939466224081119e-05, + "loss": 5.0491, + "step": 11803 + }, + { + "epoch": 0.07020173184889143, + "grad_norm": 2.089859962463379, + "learning_rate": 4.939456007031794e-05, + "loss": 5.6678, + "step": 11804 + }, + { + "epoch": 0.07020767913217242, + "grad_norm": 2.0704381465911865, + "learning_rate": 4.93944578913088e-05, + "loss": 5.5128, + "step": 11805 + }, + { + "epoch": 0.07021362641545342, + "grad_norm": 2.3215534687042236, + "learning_rate": 4.939435570378381e-05, + "loss": 4.8886, + "step": 11806 + }, + { + "epoch": 0.07021957369873442, + "grad_norm": 2.2506353855133057, + "learning_rate": 4.9394253507743004e-05, + "loss": 4.8606, + "step": 11807 + }, + { + "epoch": 0.07022552098201541, + "grad_norm": 1.9065401554107666, + "learning_rate": 4.939415130318641e-05, + "loss": 5.4306, + "step": 11808 + }, + { + "epoch": 0.07023146826529642, + "grad_norm": 1.9229549169540405, + "learning_rate": 4.9394049090114076e-05, + "loss": 5.5586, + "step": 11809 + }, + { + "epoch": 0.0702374155485774, + "grad_norm": 1.857392430305481, + "learning_rate": 4.939394686852603e-05, + "loss": 5.382, + "step": 11810 + }, + { + "epoch": 0.0702433628318584, + "grad_norm": 2.0430874824523926, + "learning_rate": 4.939384463842231e-05, + "loss": 5.4362, + "step": 11811 + }, + { + "epoch": 0.07024931011513941, + "grad_norm": 1.839227318763733, + "learning_rate": 4.939374239980294e-05, + "loss": 5.0285, + "step": 11812 + }, + { + "epoch": 0.0702552573984204, + "grad_norm": 1.9690957069396973, + "learning_rate": 4.939364015266798e-05, + "loss": 5.5512, + "step": 11813 + }, + { + "epoch": 0.0702612046817014, + "grad_norm": 1.819841980934143, + "learning_rate": 4.939353789701745e-05, + "loss": 5.4886, + "step": 11814 + }, + { + "epoch": 0.0702671519649824, + "grad_norm": 1.7670280933380127, + "learning_rate": 4.939343563285138e-05, + "loss": 5.0925, + "step": 11815 + }, + { + "epoch": 0.07027309924826339, + "grad_norm": 1.478452444076538, + "learning_rate": 4.9393333360169824e-05, + "loss": 5.6562, + "step": 11816 + }, + { + "epoch": 0.07027904653154439, + "grad_norm": 1.7796739339828491, + "learning_rate": 4.93932310789728e-05, + "loss": 5.7462, + "step": 11817 + }, + { + "epoch": 0.07028499381482539, + "grad_norm": 1.425431728363037, + "learning_rate": 4.939312878926036e-05, + "loss": 5.6002, + "step": 11818 + }, + { + "epoch": 0.07029094109810638, + "grad_norm": 1.7066885232925415, + "learning_rate": 4.939302649103252e-05, + "loss": 5.3827, + "step": 11819 + }, + { + "epoch": 0.07029688838138738, + "grad_norm": 1.5144743919372559, + "learning_rate": 4.939292418428933e-05, + "loss": 5.094, + "step": 11820 + }, + { + "epoch": 0.07030283566466838, + "grad_norm": 1.5426355600357056, + "learning_rate": 4.939282186903082e-05, + "loss": 5.4808, + "step": 11821 + }, + { + "epoch": 0.07030878294794937, + "grad_norm": 1.5655393600463867, + "learning_rate": 4.9392719545257034e-05, + "loss": 5.5422, + "step": 11822 + }, + { + "epoch": 0.07031473023123037, + "grad_norm": 1.2810043096542358, + "learning_rate": 4.9392617212967995e-05, + "loss": 5.5069, + "step": 11823 + }, + { + "epoch": 0.07032067751451138, + "grad_norm": 1.534588098526001, + "learning_rate": 4.9392514872163754e-05, + "loss": 5.4887, + "step": 11824 + }, + { + "epoch": 0.07032662479779236, + "grad_norm": 1.6692357063293457, + "learning_rate": 4.9392412522844325e-05, + "loss": 5.4235, + "step": 11825 + }, + { + "epoch": 0.07033257208107337, + "grad_norm": 2.1246654987335205, + "learning_rate": 4.939231016500977e-05, + "loss": 5.4533, + "step": 11826 + }, + { + "epoch": 0.07033851936435437, + "grad_norm": 2.0235774517059326, + "learning_rate": 4.9392207798660106e-05, + "loss": 5.0393, + "step": 11827 + }, + { + "epoch": 0.07034446664763536, + "grad_norm": 1.7843154668807983, + "learning_rate": 4.939210542379537e-05, + "loss": 5.2501, + "step": 11828 + }, + { + "epoch": 0.07035041393091636, + "grad_norm": 2.1056478023529053, + "learning_rate": 4.939200304041561e-05, + "loss": 5.7809, + "step": 11829 + }, + { + "epoch": 0.07035636121419736, + "grad_norm": 2.0902159214019775, + "learning_rate": 4.939190064852085e-05, + "loss": 5.591, + "step": 11830 + }, + { + "epoch": 0.07036230849747835, + "grad_norm": 2.3349802494049072, + "learning_rate": 4.9391798248111134e-05, + "loss": 4.7641, + "step": 11831 + }, + { + "epoch": 0.07036825578075935, + "grad_norm": 1.6848636865615845, + "learning_rate": 4.939169583918648e-05, + "loss": 5.5082, + "step": 11832 + }, + { + "epoch": 0.07037420306404035, + "grad_norm": 1.958947777748108, + "learning_rate": 4.939159342174695e-05, + "loss": 5.433, + "step": 11833 + }, + { + "epoch": 0.07038015034732134, + "grad_norm": 1.7382566928863525, + "learning_rate": 4.939149099579256e-05, + "loss": 5.5014, + "step": 11834 + }, + { + "epoch": 0.07038609763060234, + "grad_norm": 2.469529867172241, + "learning_rate": 4.939138856132336e-05, + "loss": 4.6383, + "step": 11835 + }, + { + "epoch": 0.07039204491388334, + "grad_norm": 2.127711057662964, + "learning_rate": 4.939128611833937e-05, + "loss": 5.6088, + "step": 11836 + }, + { + "epoch": 0.07039799219716433, + "grad_norm": 2.252210855484009, + "learning_rate": 4.9391183666840636e-05, + "loss": 5.027, + "step": 11837 + }, + { + "epoch": 0.07040393948044534, + "grad_norm": 1.990277647972107, + "learning_rate": 4.9391081206827194e-05, + "loss": 5.6389, + "step": 11838 + }, + { + "epoch": 0.07040988676372632, + "grad_norm": 2.170099973678589, + "learning_rate": 4.939097873829908e-05, + "loss": 5.5588, + "step": 11839 + }, + { + "epoch": 0.07041583404700733, + "grad_norm": 2.4616951942443848, + "learning_rate": 4.939087626125632e-05, + "loss": 5.6505, + "step": 11840 + }, + { + "epoch": 0.07042178133028833, + "grad_norm": 1.9600075483322144, + "learning_rate": 4.9390773775698964e-05, + "loss": 5.1086, + "step": 11841 + }, + { + "epoch": 0.07042772861356932, + "grad_norm": 2.173632860183716, + "learning_rate": 4.939067128162703e-05, + "loss": 5.8069, + "step": 11842 + }, + { + "epoch": 0.07043367589685032, + "grad_norm": 1.9921432733535767, + "learning_rate": 4.939056877904058e-05, + "loss": 5.3222, + "step": 11843 + }, + { + "epoch": 0.07043962318013132, + "grad_norm": 2.1605379581451416, + "learning_rate": 4.939046626793962e-05, + "loss": 5.1565, + "step": 11844 + }, + { + "epoch": 0.07044557046341231, + "grad_norm": 2.2240231037139893, + "learning_rate": 4.9390363748324206e-05, + "loss": 5.3633, + "step": 11845 + }, + { + "epoch": 0.07045151774669331, + "grad_norm": 2.1935648918151855, + "learning_rate": 4.9390261220194374e-05, + "loss": 5.3715, + "step": 11846 + }, + { + "epoch": 0.07045746502997431, + "grad_norm": 2.3079628944396973, + "learning_rate": 4.9390158683550146e-05, + "loss": 5.4728, + "step": 11847 + }, + { + "epoch": 0.0704634123132553, + "grad_norm": 2.1652259826660156, + "learning_rate": 4.939005613839157e-05, + "loss": 5.276, + "step": 11848 + }, + { + "epoch": 0.0704693595965363, + "grad_norm": 1.75044846534729, + "learning_rate": 4.938995358471867e-05, + "loss": 5.3, + "step": 11849 + }, + { + "epoch": 0.0704753068798173, + "grad_norm": 2.11893892288208, + "learning_rate": 4.93898510225315e-05, + "loss": 5.3949, + "step": 11850 + }, + { + "epoch": 0.07048125416309829, + "grad_norm": 1.8546398878097534, + "learning_rate": 4.938974845183008e-05, + "loss": 5.3606, + "step": 11851 + }, + { + "epoch": 0.0704872014463793, + "grad_norm": 2.2334201335906982, + "learning_rate": 4.9389645872614456e-05, + "loss": 5.1987, + "step": 11852 + }, + { + "epoch": 0.0704931487296603, + "grad_norm": 2.0545856952667236, + "learning_rate": 4.938954328488465e-05, + "loss": 5.2742, + "step": 11853 + }, + { + "epoch": 0.07049909601294128, + "grad_norm": 2.011322498321533, + "learning_rate": 4.938944068864071e-05, + "loss": 5.3738, + "step": 11854 + }, + { + "epoch": 0.07050504329622229, + "grad_norm": 1.6539164781570435, + "learning_rate": 4.9389338083882664e-05, + "loss": 5.1915, + "step": 11855 + }, + { + "epoch": 0.07051099057950329, + "grad_norm": 1.9423818588256836, + "learning_rate": 4.9389235470610564e-05, + "loss": 5.4432, + "step": 11856 + }, + { + "epoch": 0.07051693786278428, + "grad_norm": 1.9459011554718018, + "learning_rate": 4.938913284882442e-05, + "loss": 5.2929, + "step": 11857 + }, + { + "epoch": 0.07052288514606528, + "grad_norm": 2.0341713428497314, + "learning_rate": 4.938903021852429e-05, + "loss": 5.1413, + "step": 11858 + }, + { + "epoch": 0.07052883242934628, + "grad_norm": 2.1413371562957764, + "learning_rate": 4.93889275797102e-05, + "loss": 5.0283, + "step": 11859 + }, + { + "epoch": 0.07053477971262727, + "grad_norm": 1.9965273141860962, + "learning_rate": 4.9388824932382185e-05, + "loss": 5.0919, + "step": 11860 + }, + { + "epoch": 0.07054072699590827, + "grad_norm": 1.9912536144256592, + "learning_rate": 4.938872227654028e-05, + "loss": 4.72, + "step": 11861 + }, + { + "epoch": 0.07054667427918927, + "grad_norm": 2.267775058746338, + "learning_rate": 4.9388619612184533e-05, + "loss": 5.3942, + "step": 11862 + }, + { + "epoch": 0.07055262156247026, + "grad_norm": 2.0529544353485107, + "learning_rate": 4.9388516939314965e-05, + "loss": 5.504, + "step": 11863 + }, + { + "epoch": 0.07055856884575126, + "grad_norm": 2.124903678894043, + "learning_rate": 4.938841425793162e-05, + "loss": 5.3684, + "step": 11864 + }, + { + "epoch": 0.07056451612903226, + "grad_norm": 2.2070152759552, + "learning_rate": 4.938831156803453e-05, + "loss": 5.1349, + "step": 11865 + }, + { + "epoch": 0.07057046341231325, + "grad_norm": 1.717877745628357, + "learning_rate": 4.9388208869623734e-05, + "loss": 5.2605, + "step": 11866 + }, + { + "epoch": 0.07057641069559425, + "grad_norm": 2.258847951889038, + "learning_rate": 4.9388106162699266e-05, + "loss": 4.9048, + "step": 11867 + }, + { + "epoch": 0.07058235797887524, + "grad_norm": 2.065905809402466, + "learning_rate": 4.938800344726117e-05, + "loss": 5.0523, + "step": 11868 + }, + { + "epoch": 0.07058830526215625, + "grad_norm": 2.13053035736084, + "learning_rate": 4.9387900723309455e-05, + "loss": 5.1551, + "step": 11869 + }, + { + "epoch": 0.07059425254543725, + "grad_norm": 2.0323257446289062, + "learning_rate": 4.938779799084419e-05, + "loss": 5.0807, + "step": 11870 + }, + { + "epoch": 0.07060019982871824, + "grad_norm": 2.0503158569335938, + "learning_rate": 4.9387695249865396e-05, + "loss": 5.1946, + "step": 11871 + }, + { + "epoch": 0.07060614711199924, + "grad_norm": 2.069227933883667, + "learning_rate": 4.9387592500373105e-05, + "loss": 5.0027, + "step": 11872 + }, + { + "epoch": 0.07061209439528024, + "grad_norm": 2.0208382606506348, + "learning_rate": 4.9387489742367354e-05, + "loss": 5.0877, + "step": 11873 + }, + { + "epoch": 0.07061804167856123, + "grad_norm": 2.0159859657287598, + "learning_rate": 4.9387386975848196e-05, + "loss": 4.864, + "step": 11874 + }, + { + "epoch": 0.07062398896184223, + "grad_norm": 1.9365311861038208, + "learning_rate": 4.9387284200815645e-05, + "loss": 4.7373, + "step": 11875 + }, + { + "epoch": 0.07062993624512323, + "grad_norm": 2.1024274826049805, + "learning_rate": 4.9387181417269736e-05, + "loss": 5.0155, + "step": 11876 + }, + { + "epoch": 0.07063588352840422, + "grad_norm": 2.5438032150268555, + "learning_rate": 4.938707862521052e-05, + "loss": 5.3267, + "step": 11877 + }, + { + "epoch": 0.07064183081168522, + "grad_norm": 2.129715919494629, + "learning_rate": 4.938697582463804e-05, + "loss": 5.104, + "step": 11878 + }, + { + "epoch": 0.07064777809496622, + "grad_norm": 2.237442970275879, + "learning_rate": 4.9386873015552303e-05, + "loss": 5.134, + "step": 11879 + }, + { + "epoch": 0.07065372537824721, + "grad_norm": 2.2773404121398926, + "learning_rate": 4.9386770197953366e-05, + "loss": 5.269, + "step": 11880 + }, + { + "epoch": 0.07065967266152821, + "grad_norm": 2.0882620811462402, + "learning_rate": 4.938666737184125e-05, + "loss": 4.8091, + "step": 11881 + }, + { + "epoch": 0.07066561994480922, + "grad_norm": 2.0649476051330566, + "learning_rate": 4.938656453721602e-05, + "loss": 4.9143, + "step": 11882 + }, + { + "epoch": 0.0706715672280902, + "grad_norm": 2.19030499458313, + "learning_rate": 4.938646169407768e-05, + "loss": 4.7439, + "step": 11883 + }, + { + "epoch": 0.0706775145113712, + "grad_norm": 2.8669347763061523, + "learning_rate": 4.938635884242628e-05, + "loss": 4.3684, + "step": 11884 + }, + { + "epoch": 0.07068346179465221, + "grad_norm": 2.3018336296081543, + "learning_rate": 4.9386255982261854e-05, + "loss": 4.8602, + "step": 11885 + }, + { + "epoch": 0.0706894090779332, + "grad_norm": 2.7775471210479736, + "learning_rate": 4.938615311358443e-05, + "loss": 5.2401, + "step": 11886 + }, + { + "epoch": 0.0706953563612142, + "grad_norm": 2.1075756549835205, + "learning_rate": 4.938605023639406e-05, + "loss": 5.1085, + "step": 11887 + }, + { + "epoch": 0.0707013036444952, + "grad_norm": 2.456530809402466, + "learning_rate": 4.9385947350690776e-05, + "loss": 5.0506, + "step": 11888 + }, + { + "epoch": 0.07070725092777619, + "grad_norm": 1.76799738407135, + "learning_rate": 4.9385844456474605e-05, + "loss": 4.8233, + "step": 11889 + }, + { + "epoch": 0.07071319821105719, + "grad_norm": 2.0819127559661865, + "learning_rate": 4.938574155374559e-05, + "loss": 4.4198, + "step": 11890 + }, + { + "epoch": 0.07071914549433819, + "grad_norm": 2.221586227416992, + "learning_rate": 4.9385638642503765e-05, + "loss": 4.2423, + "step": 11891 + }, + { + "epoch": 0.07072509277761918, + "grad_norm": 2.108182668685913, + "learning_rate": 4.938553572274916e-05, + "loss": 4.2564, + "step": 11892 + }, + { + "epoch": 0.07073104006090018, + "grad_norm": 1.9631624221801758, + "learning_rate": 4.938543279448182e-05, + "loss": 4.1641, + "step": 11893 + }, + { + "epoch": 0.07073698734418118, + "grad_norm": 1.9730273485183716, + "learning_rate": 4.938532985770178e-05, + "loss": 4.0728, + "step": 11894 + }, + { + "epoch": 0.07074293462746217, + "grad_norm": 1.9632551670074463, + "learning_rate": 4.9385226912409065e-05, + "loss": 4.2014, + "step": 11895 + }, + { + "epoch": 0.07074888191074317, + "grad_norm": 1.9986671209335327, + "learning_rate": 4.9385123958603726e-05, + "loss": 4.0299, + "step": 11896 + }, + { + "epoch": 0.07075482919402416, + "grad_norm": 2.2256031036376953, + "learning_rate": 4.9385020996285794e-05, + "loss": 4.1397, + "step": 11897 + }, + { + "epoch": 0.07076077647730517, + "grad_norm": 2.231462001800537, + "learning_rate": 4.9384918025455296e-05, + "loss": 4.0977, + "step": 11898 + }, + { + "epoch": 0.07076672376058617, + "grad_norm": 2.0946438312530518, + "learning_rate": 4.938481504611227e-05, + "loss": 3.9446, + "step": 11899 + }, + { + "epoch": 0.07077267104386716, + "grad_norm": 1.6953986883163452, + "learning_rate": 4.938471205825677e-05, + "loss": 4.6809, + "step": 11900 + }, + { + "epoch": 0.07077861832714816, + "grad_norm": 2.1963350772857666, + "learning_rate": 4.938460906188882e-05, + "loss": 4.3626, + "step": 11901 + }, + { + "epoch": 0.07078456561042916, + "grad_norm": 2.2069251537323, + "learning_rate": 4.938450605700845e-05, + "loss": 4.1057, + "step": 11902 + }, + { + "epoch": 0.07079051289371015, + "grad_norm": 2.1809592247009277, + "learning_rate": 4.9384403043615694e-05, + "loss": 3.5619, + "step": 11903 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 2.305171012878418, + "learning_rate": 4.938430002171061e-05, + "loss": 5.8033, + "step": 11904 + }, + { + "epoch": 0.07080240746027215, + "grad_norm": 2.1984407901763916, + "learning_rate": 4.9384196991293205e-05, + "loss": 3.5869, + "step": 11905 + }, + { + "epoch": 0.07080835474355314, + "grad_norm": 1.8870881795883179, + "learning_rate": 4.938409395236353e-05, + "loss": 4.8027, + "step": 11906 + }, + { + "epoch": 0.07081430202683414, + "grad_norm": 2.11314058303833, + "learning_rate": 4.938399090492163e-05, + "loss": 4.1942, + "step": 11907 + }, + { + "epoch": 0.07082024931011514, + "grad_norm": 2.143794298171997, + "learning_rate": 4.938388784896752e-05, + "loss": 3.8526, + "step": 11908 + }, + { + "epoch": 0.07082619659339613, + "grad_norm": 2.4311232566833496, + "learning_rate": 4.938378478450125e-05, + "loss": 3.8572, + "step": 11909 + }, + { + "epoch": 0.07083214387667713, + "grad_norm": 2.0959818363189697, + "learning_rate": 4.9383681711522855e-05, + "loss": 4.3465, + "step": 11910 + }, + { + "epoch": 0.07083809115995814, + "grad_norm": 1.9161559343338013, + "learning_rate": 4.938357863003237e-05, + "loss": 5.5608, + "step": 11911 + }, + { + "epoch": 0.07084403844323912, + "grad_norm": 1.8549482822418213, + "learning_rate": 4.9383475540029824e-05, + "loss": 5.9874, + "step": 11912 + }, + { + "epoch": 0.07084998572652013, + "grad_norm": 1.8600444793701172, + "learning_rate": 4.9383372441515255e-05, + "loss": 6.0579, + "step": 11913 + }, + { + "epoch": 0.07085593300980113, + "grad_norm": 1.6985594034194946, + "learning_rate": 4.938326933448871e-05, + "loss": 5.7963, + "step": 11914 + }, + { + "epoch": 0.07086188029308212, + "grad_norm": 2.06860613822937, + "learning_rate": 4.9383166218950216e-05, + "loss": 5.4789, + "step": 11915 + }, + { + "epoch": 0.07086782757636312, + "grad_norm": 2.8111190795898438, + "learning_rate": 4.938306309489982e-05, + "loss": 5.2546, + "step": 11916 + }, + { + "epoch": 0.07087377485964412, + "grad_norm": 2.700589895248413, + "learning_rate": 4.9382959962337536e-05, + "loss": 5.2021, + "step": 11917 + }, + { + "epoch": 0.07087972214292511, + "grad_norm": 2.364793539047241, + "learning_rate": 4.938285682126341e-05, + "loss": 4.9508, + "step": 11918 + }, + { + "epoch": 0.07088566942620611, + "grad_norm": 2.4212446212768555, + "learning_rate": 4.938275367167749e-05, + "loss": 5.1269, + "step": 11919 + }, + { + "epoch": 0.07089161670948711, + "grad_norm": 1.785733699798584, + "learning_rate": 4.93826505135798e-05, + "loss": 5.7357, + "step": 11920 + }, + { + "epoch": 0.0708975639927681, + "grad_norm": 1.6912823915481567, + "learning_rate": 4.9382547346970376e-05, + "loss": 5.4003, + "step": 11921 + }, + { + "epoch": 0.0709035112760491, + "grad_norm": 1.8408714532852173, + "learning_rate": 4.938244417184926e-05, + "loss": 5.3169, + "step": 11922 + }, + { + "epoch": 0.0709094585593301, + "grad_norm": 2.3245468139648438, + "learning_rate": 4.938234098821648e-05, + "loss": 4.9588, + "step": 11923 + }, + { + "epoch": 0.07091540584261109, + "grad_norm": 1.922179102897644, + "learning_rate": 4.938223779607208e-05, + "loss": 5.431, + "step": 11924 + }, + { + "epoch": 0.0709213531258921, + "grad_norm": 1.8331208229064941, + "learning_rate": 4.9382134595416094e-05, + "loss": 5.9121, + "step": 11925 + }, + { + "epoch": 0.07092730040917308, + "grad_norm": 2.15932297706604, + "learning_rate": 4.9382031386248556e-05, + "loss": 5.058, + "step": 11926 + }, + { + "epoch": 0.07093324769245409, + "grad_norm": 2.2255606651306152, + "learning_rate": 4.93819281685695e-05, + "loss": 4.9215, + "step": 11927 + }, + { + "epoch": 0.07093919497573509, + "grad_norm": 2.3665359020233154, + "learning_rate": 4.938182494237897e-05, + "loss": 4.8405, + "step": 11928 + }, + { + "epoch": 0.07094514225901608, + "grad_norm": 2.1564438343048096, + "learning_rate": 4.938172170767699e-05, + "loss": 4.9598, + "step": 11929 + }, + { + "epoch": 0.07095108954229708, + "grad_norm": 2.2083945274353027, + "learning_rate": 4.938161846446361e-05, + "loss": 4.8603, + "step": 11930 + }, + { + "epoch": 0.07095703682557808, + "grad_norm": 2.3422255516052246, + "learning_rate": 4.938151521273885e-05, + "loss": 4.8926, + "step": 11931 + }, + { + "epoch": 0.07096298410885907, + "grad_norm": 2.5269415378570557, + "learning_rate": 4.9381411952502764e-05, + "loss": 4.876, + "step": 11932 + }, + { + "epoch": 0.07096893139214007, + "grad_norm": 2.1761882305145264, + "learning_rate": 4.9381308683755376e-05, + "loss": 4.7533, + "step": 11933 + }, + { + "epoch": 0.07097487867542107, + "grad_norm": 2.078146457672119, + "learning_rate": 4.938120540649672e-05, + "loss": 4.9606, + "step": 11934 + }, + { + "epoch": 0.07098082595870206, + "grad_norm": 2.3086254596710205, + "learning_rate": 4.9381102120726846e-05, + "loss": 4.7763, + "step": 11935 + }, + { + "epoch": 0.07098677324198306, + "grad_norm": 1.8531124591827393, + "learning_rate": 4.938099882644578e-05, + "loss": 5.0218, + "step": 11936 + }, + { + "epoch": 0.07099272052526406, + "grad_norm": 2.2169790267944336, + "learning_rate": 4.938089552365355e-05, + "loss": 6.0072, + "step": 11937 + }, + { + "epoch": 0.07099866780854505, + "grad_norm": 1.8759880065917969, + "learning_rate": 4.938079221235021e-05, + "loss": 5.8259, + "step": 11938 + }, + { + "epoch": 0.07100461509182605, + "grad_norm": 2.026217222213745, + "learning_rate": 4.938068889253579e-05, + "loss": 5.4426, + "step": 11939 + }, + { + "epoch": 0.07101056237510706, + "grad_norm": 2.5047786235809326, + "learning_rate": 4.938058556421031e-05, + "loss": 4.7276, + "step": 11940 + }, + { + "epoch": 0.07101650965838804, + "grad_norm": 2.243281602859497, + "learning_rate": 4.938048222737383e-05, + "loss": 4.9284, + "step": 11941 + }, + { + "epoch": 0.07102245694166905, + "grad_norm": 1.989563226699829, + "learning_rate": 4.938037888202637e-05, + "loss": 5.7744, + "step": 11942 + }, + { + "epoch": 0.07102840422495005, + "grad_norm": 1.829290509223938, + "learning_rate": 4.9380275528167974e-05, + "loss": 5.6942, + "step": 11943 + }, + { + "epoch": 0.07103435150823104, + "grad_norm": 1.8001593351364136, + "learning_rate": 4.938017216579868e-05, + "loss": 5.6928, + "step": 11944 + }, + { + "epoch": 0.07104029879151204, + "grad_norm": 1.7705434560775757, + "learning_rate": 4.938006879491851e-05, + "loss": 5.6954, + "step": 11945 + }, + { + "epoch": 0.07104624607479304, + "grad_norm": 1.8746812343597412, + "learning_rate": 4.937996541552752e-05, + "loss": 5.7184, + "step": 11946 + }, + { + "epoch": 0.07105219335807403, + "grad_norm": 1.6931661367416382, + "learning_rate": 4.937986202762573e-05, + "loss": 5.398, + "step": 11947 + }, + { + "epoch": 0.07105814064135503, + "grad_norm": 2.0784003734588623, + "learning_rate": 4.937975863121318e-05, + "loss": 5.7164, + "step": 11948 + }, + { + "epoch": 0.07106408792463603, + "grad_norm": 1.8495618104934692, + "learning_rate": 4.937965522628991e-05, + "loss": 5.7093, + "step": 11949 + }, + { + "epoch": 0.07107003520791702, + "grad_norm": 1.7720533609390259, + "learning_rate": 4.9379551812855964e-05, + "loss": 5.7548, + "step": 11950 + }, + { + "epoch": 0.07107598249119802, + "grad_norm": 1.721205472946167, + "learning_rate": 4.937944839091135e-05, + "loss": 5.7496, + "step": 11951 + }, + { + "epoch": 0.07108192977447902, + "grad_norm": 1.896657109260559, + "learning_rate": 4.9379344960456145e-05, + "loss": 5.5989, + "step": 11952 + }, + { + "epoch": 0.07108787705776001, + "grad_norm": 1.4022153615951538, + "learning_rate": 4.9379241521490344e-05, + "loss": 5.5029, + "step": 11953 + }, + { + "epoch": 0.07109382434104101, + "grad_norm": 1.9068467617034912, + "learning_rate": 4.937913807401401e-05, + "loss": 5.6915, + "step": 11954 + }, + { + "epoch": 0.071099771624322, + "grad_norm": 1.6542187929153442, + "learning_rate": 4.9379034618027164e-05, + "loss": 5.6409, + "step": 11955 + }, + { + "epoch": 0.071105718907603, + "grad_norm": 1.5280201435089111, + "learning_rate": 4.937893115352986e-05, + "loss": 5.6264, + "step": 11956 + }, + { + "epoch": 0.07111166619088401, + "grad_norm": 1.767232060432434, + "learning_rate": 4.937882768052211e-05, + "loss": 5.4562, + "step": 11957 + }, + { + "epoch": 0.071117613474165, + "grad_norm": 1.571892261505127, + "learning_rate": 4.9378724199003975e-05, + "loss": 5.7949, + "step": 11958 + }, + { + "epoch": 0.071123560757446, + "grad_norm": 1.9400190114974976, + "learning_rate": 4.937862070897548e-05, + "loss": 5.5872, + "step": 11959 + }, + { + "epoch": 0.071129508040727, + "grad_norm": 1.7246766090393066, + "learning_rate": 4.937851721043665e-05, + "loss": 5.8455, + "step": 11960 + }, + { + "epoch": 0.07113545532400799, + "grad_norm": 1.937168002128601, + "learning_rate": 4.9378413703387534e-05, + "loss": 5.0864, + "step": 11961 + }, + { + "epoch": 0.07114140260728899, + "grad_norm": 2.3808209896087646, + "learning_rate": 4.937831018782817e-05, + "loss": 4.5918, + "step": 11962 + }, + { + "epoch": 0.07114734989056999, + "grad_norm": 2.567026138305664, + "learning_rate": 4.937820666375859e-05, + "loss": 4.7375, + "step": 11963 + }, + { + "epoch": 0.07115329717385098, + "grad_norm": 1.8941316604614258, + "learning_rate": 4.937810313117882e-05, + "loss": 5.811, + "step": 11964 + }, + { + "epoch": 0.07115924445713198, + "grad_norm": 1.9301189184188843, + "learning_rate": 4.9377999590088916e-05, + "loss": 5.7947, + "step": 11965 + }, + { + "epoch": 0.07116519174041298, + "grad_norm": 2.281784772872925, + "learning_rate": 4.93778960404889e-05, + "loss": 5.5993, + "step": 11966 + }, + { + "epoch": 0.07117113902369397, + "grad_norm": 1.7826297283172607, + "learning_rate": 4.937779248237882e-05, + "loss": 6.1836, + "step": 11967 + }, + { + "epoch": 0.07117708630697497, + "grad_norm": 2.8714182376861572, + "learning_rate": 4.9377688915758694e-05, + "loss": 5.3955, + "step": 11968 + }, + { + "epoch": 0.07118303359025598, + "grad_norm": 2.3284013271331787, + "learning_rate": 4.937758534062857e-05, + "loss": 5.3027, + "step": 11969 + }, + { + "epoch": 0.07118898087353696, + "grad_norm": 1.8880923986434937, + "learning_rate": 4.937748175698849e-05, + "loss": 5.8408, + "step": 11970 + }, + { + "epoch": 0.07119492815681797, + "grad_norm": 2.8952460289001465, + "learning_rate": 4.937737816483847e-05, + "loss": 4.7325, + "step": 11971 + }, + { + "epoch": 0.07120087544009897, + "grad_norm": 2.5028738975524902, + "learning_rate": 4.9377274564178574e-05, + "loss": 4.5854, + "step": 11972 + }, + { + "epoch": 0.07120682272337996, + "grad_norm": 1.8834285736083984, + "learning_rate": 4.9377170955008815e-05, + "loss": 5.5415, + "step": 11973 + }, + { + "epoch": 0.07121277000666096, + "grad_norm": 2.162062644958496, + "learning_rate": 4.937706733732924e-05, + "loss": 5.2187, + "step": 11974 + }, + { + "epoch": 0.07121871728994196, + "grad_norm": 2.1506881713867188, + "learning_rate": 4.937696371113988e-05, + "loss": 5.1746, + "step": 11975 + }, + { + "epoch": 0.07122466457322295, + "grad_norm": 2.0309176445007324, + "learning_rate": 4.937686007644078e-05, + "loss": 5.1708, + "step": 11976 + }, + { + "epoch": 0.07123061185650395, + "grad_norm": 2.251579523086548, + "learning_rate": 4.9376756433231966e-05, + "loss": 6.0623, + "step": 11977 + }, + { + "epoch": 0.07123655913978495, + "grad_norm": 2.161918878555298, + "learning_rate": 4.937665278151348e-05, + "loss": 6.2297, + "step": 11978 + }, + { + "epoch": 0.07124250642306594, + "grad_norm": 1.703783631324768, + "learning_rate": 4.937654912128535e-05, + "loss": 5.9388, + "step": 11979 + }, + { + "epoch": 0.07124845370634694, + "grad_norm": 1.7420361042022705, + "learning_rate": 4.937644545254763e-05, + "loss": 5.5426, + "step": 11980 + }, + { + "epoch": 0.07125440098962794, + "grad_norm": 1.8634297847747803, + "learning_rate": 4.937634177530033e-05, + "loss": 5.8412, + "step": 11981 + }, + { + "epoch": 0.07126034827290893, + "grad_norm": 1.8084121942520142, + "learning_rate": 4.937623808954351e-05, + "loss": 6.266, + "step": 11982 + }, + { + "epoch": 0.07126629555618993, + "grad_norm": 1.5925266742706299, + "learning_rate": 4.93761343952772e-05, + "loss": 5.7173, + "step": 11983 + }, + { + "epoch": 0.07127224283947092, + "grad_norm": 1.7778257131576538, + "learning_rate": 4.937603069250143e-05, + "loss": 5.8119, + "step": 11984 + }, + { + "epoch": 0.07127819012275192, + "grad_norm": 1.6839842796325684, + "learning_rate": 4.9375926981216235e-05, + "loss": 5.9446, + "step": 11985 + }, + { + "epoch": 0.07128413740603293, + "grad_norm": 1.7892810106277466, + "learning_rate": 4.937582326142166e-05, + "loss": 5.9564, + "step": 11986 + }, + { + "epoch": 0.07129008468931392, + "grad_norm": 1.7179774045944214, + "learning_rate": 4.9375719533117734e-05, + "loss": 6.1969, + "step": 11987 + }, + { + "epoch": 0.07129603197259492, + "grad_norm": 1.3788355588912964, + "learning_rate": 4.93756157963045e-05, + "loss": 6.0409, + "step": 11988 + }, + { + "epoch": 0.07130197925587592, + "grad_norm": 1.6451042890548706, + "learning_rate": 4.9375512050981986e-05, + "loss": 5.8116, + "step": 11989 + }, + { + "epoch": 0.07130792653915691, + "grad_norm": 1.8904451131820679, + "learning_rate": 4.937540829715024e-05, + "loss": 5.7952, + "step": 11990 + }, + { + "epoch": 0.07131387382243791, + "grad_norm": 1.4976747035980225, + "learning_rate": 4.9375304534809284e-05, + "loss": 5.7092, + "step": 11991 + }, + { + "epoch": 0.07131982110571891, + "grad_norm": 1.5585631132125854, + "learning_rate": 4.937520076395916e-05, + "loss": 6.0693, + "step": 11992 + }, + { + "epoch": 0.0713257683889999, + "grad_norm": 1.8329144716262817, + "learning_rate": 4.937509698459991e-05, + "loss": 5.5883, + "step": 11993 + }, + { + "epoch": 0.0713317156722809, + "grad_norm": 2.6030189990997314, + "learning_rate": 4.937499319673157e-05, + "loss": 5.1776, + "step": 11994 + }, + { + "epoch": 0.0713376629555619, + "grad_norm": 1.744042992591858, + "learning_rate": 4.9374889400354165e-05, + "loss": 5.4105, + "step": 11995 + }, + { + "epoch": 0.07134361023884289, + "grad_norm": 1.819018006324768, + "learning_rate": 4.937478559546774e-05, + "loss": 5.5695, + "step": 11996 + }, + { + "epoch": 0.0713495575221239, + "grad_norm": 1.754894733428955, + "learning_rate": 4.9374681782072325e-05, + "loss": 5.7519, + "step": 11997 + }, + { + "epoch": 0.0713555048054049, + "grad_norm": 2.132507085800171, + "learning_rate": 4.9374577960167964e-05, + "loss": 4.9783, + "step": 11998 + }, + { + "epoch": 0.07136145208868588, + "grad_norm": 2.0926709175109863, + "learning_rate": 4.937447412975469e-05, + "loss": 4.905, + "step": 11999 + }, + { + "epoch": 0.07136739937196689, + "grad_norm": 2.1235594749450684, + "learning_rate": 4.937437029083254e-05, + "loss": 4.7978, + "step": 12000 + }, + { + "epoch": 0.07137334665524789, + "grad_norm": 2.217911720275879, + "learning_rate": 4.937426644340154e-05, + "loss": 4.9506, + "step": 12001 + }, + { + "epoch": 0.07137929393852888, + "grad_norm": 2.0362601280212402, + "learning_rate": 4.937416258746175e-05, + "loss": 5.0299, + "step": 12002 + }, + { + "epoch": 0.07138524122180988, + "grad_norm": 2.2846896648406982, + "learning_rate": 4.937405872301318e-05, + "loss": 5.0606, + "step": 12003 + }, + { + "epoch": 0.07139118850509088, + "grad_norm": 2.2545530796051025, + "learning_rate": 4.937395485005588e-05, + "loss": 4.8651, + "step": 12004 + }, + { + "epoch": 0.07139713578837187, + "grad_norm": 2.32738995552063, + "learning_rate": 4.937385096858989e-05, + "loss": 4.7908, + "step": 12005 + }, + { + "epoch": 0.07140308307165287, + "grad_norm": 2.239215850830078, + "learning_rate": 4.9373747078615235e-05, + "loss": 4.7545, + "step": 12006 + }, + { + "epoch": 0.07140903035493387, + "grad_norm": 2.4766969680786133, + "learning_rate": 4.937364318013196e-05, + "loss": 5.0795, + "step": 12007 + }, + { + "epoch": 0.07141497763821486, + "grad_norm": 2.602111577987671, + "learning_rate": 4.937353927314009e-05, + "loss": 4.6898, + "step": 12008 + }, + { + "epoch": 0.07142092492149586, + "grad_norm": 2.8508496284484863, + "learning_rate": 4.937343535763968e-05, + "loss": 4.3136, + "step": 12009 + }, + { + "epoch": 0.07142687220477686, + "grad_norm": 2.4613311290740967, + "learning_rate": 4.9373331433630754e-05, + "loss": 4.4826, + "step": 12010 + }, + { + "epoch": 0.07143281948805785, + "grad_norm": 2.561643362045288, + "learning_rate": 4.937322750111334e-05, + "loss": 4.251, + "step": 12011 + }, + { + "epoch": 0.07143876677133885, + "grad_norm": 2.397507667541504, + "learning_rate": 4.93731235600875e-05, + "loss": 4.3018, + "step": 12012 + }, + { + "epoch": 0.07144471405461984, + "grad_norm": 2.250120162963867, + "learning_rate": 4.937301961055324e-05, + "loss": 4.1796, + "step": 12013 + }, + { + "epoch": 0.07145066133790084, + "grad_norm": 2.337451934814453, + "learning_rate": 4.9372915652510615e-05, + "loss": 4.2362, + "step": 12014 + }, + { + "epoch": 0.07145660862118185, + "grad_norm": 2.357034921646118, + "learning_rate": 4.937281168595966e-05, + "loss": 4.0961, + "step": 12015 + }, + { + "epoch": 0.07146255590446284, + "grad_norm": 2.0843617916107178, + "learning_rate": 4.93727077109004e-05, + "loss": 4.4584, + "step": 12016 + }, + { + "epoch": 0.07146850318774384, + "grad_norm": 2.149707317352295, + "learning_rate": 4.937260372733289e-05, + "loss": 4.2248, + "step": 12017 + }, + { + "epoch": 0.07147445047102484, + "grad_norm": 2.149765729904175, + "learning_rate": 4.937249973525715e-05, + "loss": 4.154, + "step": 12018 + }, + { + "epoch": 0.07148039775430583, + "grad_norm": 2.1572682857513428, + "learning_rate": 4.937239573467323e-05, + "loss": 4.2345, + "step": 12019 + }, + { + "epoch": 0.07148634503758683, + "grad_norm": 2.246751070022583, + "learning_rate": 4.9372291725581145e-05, + "loss": 3.9739, + "step": 12020 + }, + { + "epoch": 0.07149229232086783, + "grad_norm": 2.2735042572021484, + "learning_rate": 4.9372187707980955e-05, + "loss": 4.0442, + "step": 12021 + }, + { + "epoch": 0.07149823960414882, + "grad_norm": 2.2270023822784424, + "learning_rate": 4.9372083681872684e-05, + "loss": 4.0374, + "step": 12022 + }, + { + "epoch": 0.07150418688742982, + "grad_norm": 2.2228193283081055, + "learning_rate": 4.937197964725637e-05, + "loss": 4.0503, + "step": 12023 + }, + { + "epoch": 0.07151013417071082, + "grad_norm": 2.2630691528320312, + "learning_rate": 4.9371875604132046e-05, + "loss": 4.0431, + "step": 12024 + }, + { + "epoch": 0.07151608145399181, + "grad_norm": 2.2461886405944824, + "learning_rate": 4.937177155249976e-05, + "loss": 4.1164, + "step": 12025 + }, + { + "epoch": 0.07152202873727281, + "grad_norm": 1.9476062059402466, + "learning_rate": 4.937166749235953e-05, + "loss": 4.317, + "step": 12026 + }, + { + "epoch": 0.07152797602055382, + "grad_norm": 2.33138370513916, + "learning_rate": 4.937156342371141e-05, + "loss": 4.1309, + "step": 12027 + }, + { + "epoch": 0.0715339233038348, + "grad_norm": 3.3887436389923096, + "learning_rate": 4.937145934655543e-05, + "loss": 5.1713, + "step": 12028 + }, + { + "epoch": 0.0715398705871158, + "grad_norm": 2.499302625656128, + "learning_rate": 4.937135526089162e-05, + "loss": 4.0553, + "step": 12029 + }, + { + "epoch": 0.07154581787039681, + "grad_norm": 2.4269003868103027, + "learning_rate": 4.937125116672002e-05, + "loss": 4.0425, + "step": 12030 + }, + { + "epoch": 0.0715517651536778, + "grad_norm": 2.1819067001342773, + "learning_rate": 4.937114706404067e-05, + "loss": 4.0591, + "step": 12031 + }, + { + "epoch": 0.0715577124369588, + "grad_norm": 1.8021305799484253, + "learning_rate": 4.937104295285361e-05, + "loss": 4.9171, + "step": 12032 + }, + { + "epoch": 0.0715636597202398, + "grad_norm": 2.1833691596984863, + "learning_rate": 4.937093883315887e-05, + "loss": 4.053, + "step": 12033 + }, + { + "epoch": 0.07156960700352079, + "grad_norm": 2.1684465408325195, + "learning_rate": 4.9370834704956484e-05, + "loss": 4.0692, + "step": 12034 + }, + { + "epoch": 0.07157555428680179, + "grad_norm": 2.1576929092407227, + "learning_rate": 4.937073056824649e-05, + "loss": 3.9958, + "step": 12035 + }, + { + "epoch": 0.07158150157008279, + "grad_norm": 1.5627915859222412, + "learning_rate": 4.9370626423028924e-05, + "loss": 5.3373, + "step": 12036 + }, + { + "epoch": 0.07158744885336378, + "grad_norm": 1.6166819334030151, + "learning_rate": 4.937052226930383e-05, + "loss": 5.801, + "step": 12037 + }, + { + "epoch": 0.07159339613664478, + "grad_norm": 1.4187299013137817, + "learning_rate": 4.937041810707124e-05, + "loss": 5.5937, + "step": 12038 + }, + { + "epoch": 0.07159934341992578, + "grad_norm": 1.5857088565826416, + "learning_rate": 4.937031393633118e-05, + "loss": 5.6268, + "step": 12039 + }, + { + "epoch": 0.07160529070320677, + "grad_norm": 1.5691097974777222, + "learning_rate": 4.93702097570837e-05, + "loss": 5.7414, + "step": 12040 + }, + { + "epoch": 0.07161123798648777, + "grad_norm": 1.4723674058914185, + "learning_rate": 4.9370105569328835e-05, + "loss": 5.4711, + "step": 12041 + }, + { + "epoch": 0.07161718526976876, + "grad_norm": 1.686745047569275, + "learning_rate": 4.937000137306661e-05, + "loss": 5.4302, + "step": 12042 + }, + { + "epoch": 0.07162313255304976, + "grad_norm": 1.7394465208053589, + "learning_rate": 4.936989716829707e-05, + "loss": 5.1609, + "step": 12043 + }, + { + "epoch": 0.07162907983633077, + "grad_norm": 1.4348796606063843, + "learning_rate": 4.9369792955020264e-05, + "loss": 5.2468, + "step": 12044 + }, + { + "epoch": 0.07163502711961175, + "grad_norm": 1.674187421798706, + "learning_rate": 4.93696887332362e-05, + "loss": 5.2451, + "step": 12045 + }, + { + "epoch": 0.07164097440289276, + "grad_norm": 1.6606419086456299, + "learning_rate": 4.9369584502944934e-05, + "loss": 5.2744, + "step": 12046 + }, + { + "epoch": 0.07164692168617376, + "grad_norm": 1.4020198583602905, + "learning_rate": 4.93694802641465e-05, + "loss": 5.2914, + "step": 12047 + }, + { + "epoch": 0.07165286896945475, + "grad_norm": 1.4234102964401245, + "learning_rate": 4.936937601684093e-05, + "loss": 5.2405, + "step": 12048 + }, + { + "epoch": 0.07165881625273575, + "grad_norm": 1.261983036994934, + "learning_rate": 4.936927176102827e-05, + "loss": 5.1532, + "step": 12049 + }, + { + "epoch": 0.07166476353601675, + "grad_norm": 1.3787094354629517, + "learning_rate": 4.9369167496708534e-05, + "loss": 5.2033, + "step": 12050 + }, + { + "epoch": 0.07167071081929774, + "grad_norm": 1.405142068862915, + "learning_rate": 4.9369063223881786e-05, + "loss": 5.0391, + "step": 12051 + }, + { + "epoch": 0.07167665810257874, + "grad_norm": 1.513554573059082, + "learning_rate": 4.936895894254804e-05, + "loss": 5.0236, + "step": 12052 + }, + { + "epoch": 0.07168260538585974, + "grad_norm": 1.4279611110687256, + "learning_rate": 4.9368854652707355e-05, + "loss": 5.1429, + "step": 12053 + }, + { + "epoch": 0.07168855266914073, + "grad_norm": 1.4320182800292969, + "learning_rate": 4.936875035435974e-05, + "loss": 5.0519, + "step": 12054 + }, + { + "epoch": 0.07169449995242173, + "grad_norm": 1.415925145149231, + "learning_rate": 4.936864604750526e-05, + "loss": 4.9904, + "step": 12055 + }, + { + "epoch": 0.07170044723570274, + "grad_norm": 1.403998851776123, + "learning_rate": 4.936854173214393e-05, + "loss": 4.8988, + "step": 12056 + }, + { + "epoch": 0.07170639451898372, + "grad_norm": 1.744532585144043, + "learning_rate": 4.936843740827579e-05, + "loss": 4.9661, + "step": 12057 + }, + { + "epoch": 0.07171234180226473, + "grad_norm": 1.4900517463684082, + "learning_rate": 4.9368333075900884e-05, + "loss": 5.1887, + "step": 12058 + }, + { + "epoch": 0.07171828908554573, + "grad_norm": 1.454063057899475, + "learning_rate": 4.936822873501925e-05, + "loss": 5.2801, + "step": 12059 + }, + { + "epoch": 0.07172423636882672, + "grad_norm": 1.5426071882247925, + "learning_rate": 4.936812438563092e-05, + "loss": 5.1987, + "step": 12060 + }, + { + "epoch": 0.07173018365210772, + "grad_norm": 1.7365894317626953, + "learning_rate": 4.936802002773592e-05, + "loss": 5.1933, + "step": 12061 + }, + { + "epoch": 0.07173613093538872, + "grad_norm": 1.5046216249465942, + "learning_rate": 4.9367915661334295e-05, + "loss": 5.1688, + "step": 12062 + }, + { + "epoch": 0.07174207821866971, + "grad_norm": 1.6715713739395142, + "learning_rate": 4.936781128642609e-05, + "loss": 5.3649, + "step": 12063 + }, + { + "epoch": 0.07174802550195071, + "grad_norm": 1.6386772394180298, + "learning_rate": 4.936770690301134e-05, + "loss": 5.4107, + "step": 12064 + }, + { + "epoch": 0.07175397278523171, + "grad_norm": 1.604153037071228, + "learning_rate": 4.936760251109006e-05, + "loss": 5.2952, + "step": 12065 + }, + { + "epoch": 0.0717599200685127, + "grad_norm": 1.7100228071212769, + "learning_rate": 4.9367498110662306e-05, + "loss": 5.202, + "step": 12066 + }, + { + "epoch": 0.0717658673517937, + "grad_norm": 1.4062007665634155, + "learning_rate": 4.9367393701728116e-05, + "loss": 5.2246, + "step": 12067 + }, + { + "epoch": 0.0717718146350747, + "grad_norm": 1.4552310705184937, + "learning_rate": 4.9367289284287514e-05, + "loss": 5.5919, + "step": 12068 + }, + { + "epoch": 0.07177776191835569, + "grad_norm": 1.5134438276290894, + "learning_rate": 4.9367184858340546e-05, + "loss": 5.3921, + "step": 12069 + }, + { + "epoch": 0.0717837092016367, + "grad_norm": 1.724139928817749, + "learning_rate": 4.9367080423887246e-05, + "loss": 5.6409, + "step": 12070 + }, + { + "epoch": 0.07178965648491768, + "grad_norm": 1.7401317358016968, + "learning_rate": 4.9366975980927655e-05, + "loss": 4.8093, + "step": 12071 + }, + { + "epoch": 0.07179560376819868, + "grad_norm": 2.3226993083953857, + "learning_rate": 4.93668715294618e-05, + "loss": 4.2685, + "step": 12072 + }, + { + "epoch": 0.07180155105147969, + "grad_norm": 2.200608730316162, + "learning_rate": 4.9366767069489715e-05, + "loss": 4.1155, + "step": 12073 + }, + { + "epoch": 0.07180749833476067, + "grad_norm": 2.381131649017334, + "learning_rate": 4.936666260101145e-05, + "loss": 3.9837, + "step": 12074 + }, + { + "epoch": 0.07181344561804168, + "grad_norm": 2.2567548751831055, + "learning_rate": 4.936655812402704e-05, + "loss": 4.0642, + "step": 12075 + }, + { + "epoch": 0.07181939290132268, + "grad_norm": 2.253011703491211, + "learning_rate": 4.9366453638536506e-05, + "loss": 4.0683, + "step": 12076 + }, + { + "epoch": 0.07182534018460367, + "grad_norm": 2.3459978103637695, + "learning_rate": 4.93663491445399e-05, + "loss": 4.0525, + "step": 12077 + }, + { + "epoch": 0.07183128746788467, + "grad_norm": 2.3964619636535645, + "learning_rate": 4.9366244642037254e-05, + "loss": 4.0198, + "step": 12078 + }, + { + "epoch": 0.07183723475116567, + "grad_norm": 2.392293930053711, + "learning_rate": 4.93661401310286e-05, + "loss": 3.7765, + "step": 12079 + }, + { + "epoch": 0.07184318203444666, + "grad_norm": 2.3027987480163574, + "learning_rate": 4.936603561151398e-05, + "loss": 4.0315, + "step": 12080 + }, + { + "epoch": 0.07184912931772766, + "grad_norm": 2.3942925930023193, + "learning_rate": 4.936593108349343e-05, + "loss": 4.1308, + "step": 12081 + }, + { + "epoch": 0.07185507660100866, + "grad_norm": 2.183898687362671, + "learning_rate": 4.9365826546966984e-05, + "loss": 4.0779, + "step": 12082 + }, + { + "epoch": 0.07186102388428965, + "grad_norm": 2.3463728427886963, + "learning_rate": 4.936572200193468e-05, + "loss": 4.0035, + "step": 12083 + }, + { + "epoch": 0.07186697116757065, + "grad_norm": 2.3459651470184326, + "learning_rate": 4.9365617448396556e-05, + "loss": 4.0577, + "step": 12084 + }, + { + "epoch": 0.07187291845085166, + "grad_norm": 2.169189691543579, + "learning_rate": 4.936551288635264e-05, + "loss": 4.2678, + "step": 12085 + }, + { + "epoch": 0.07187886573413264, + "grad_norm": 2.3313188552856445, + "learning_rate": 4.936540831580299e-05, + "loss": 4.9956, + "step": 12086 + }, + { + "epoch": 0.07188481301741365, + "grad_norm": 2.431053400039673, + "learning_rate": 4.936530373674761e-05, + "loss": 5.2317, + "step": 12087 + }, + { + "epoch": 0.07189076030069465, + "grad_norm": 1.8984981775283813, + "learning_rate": 4.936519914918656e-05, + "loss": 5.4541, + "step": 12088 + }, + { + "epoch": 0.07189670758397564, + "grad_norm": 1.8862982988357544, + "learning_rate": 4.9365094553119877e-05, + "loss": 5.6448, + "step": 12089 + }, + { + "epoch": 0.07190265486725664, + "grad_norm": 1.7802925109863281, + "learning_rate": 4.936498994854759e-05, + "loss": 5.3182, + "step": 12090 + }, + { + "epoch": 0.07190860215053764, + "grad_norm": 1.7578701972961426, + "learning_rate": 4.9364885335469734e-05, + "loss": 6.0188, + "step": 12091 + }, + { + "epoch": 0.07191454943381863, + "grad_norm": 1.6750003099441528, + "learning_rate": 4.9364780713886345e-05, + "loss": 6.0822, + "step": 12092 + }, + { + "epoch": 0.07192049671709963, + "grad_norm": 1.4945881366729736, + "learning_rate": 4.936467608379747e-05, + "loss": 6.0554, + "step": 12093 + }, + { + "epoch": 0.07192644400038063, + "grad_norm": 1.5508134365081787, + "learning_rate": 4.936457144520313e-05, + "loss": 5.9712, + "step": 12094 + }, + { + "epoch": 0.07193239128366162, + "grad_norm": 1.4133291244506836, + "learning_rate": 4.936446679810337e-05, + "loss": 5.9137, + "step": 12095 + }, + { + "epoch": 0.07193833856694262, + "grad_norm": 1.415930986404419, + "learning_rate": 4.936436214249823e-05, + "loss": 5.9957, + "step": 12096 + }, + { + "epoch": 0.07194428585022362, + "grad_norm": 1.682356595993042, + "learning_rate": 4.936425747838774e-05, + "loss": 6.2381, + "step": 12097 + }, + { + "epoch": 0.07195023313350461, + "grad_norm": 1.693535566329956, + "learning_rate": 4.9364152805771946e-05, + "loss": 6.0523, + "step": 12098 + }, + { + "epoch": 0.07195618041678561, + "grad_norm": 1.7577873468399048, + "learning_rate": 4.9364048124650875e-05, + "loss": 5.8243, + "step": 12099 + }, + { + "epoch": 0.0719621277000666, + "grad_norm": 1.6486074924468994, + "learning_rate": 4.936394343502457e-05, + "loss": 5.8072, + "step": 12100 + }, + { + "epoch": 0.0719680749833476, + "grad_norm": 1.5245120525360107, + "learning_rate": 4.936383873689306e-05, + "loss": 5.9013, + "step": 12101 + }, + { + "epoch": 0.0719740222666286, + "grad_norm": 1.4771286249160767, + "learning_rate": 4.936373403025638e-05, + "loss": 6.1314, + "step": 12102 + }, + { + "epoch": 0.0719799695499096, + "grad_norm": 1.7547197341918945, + "learning_rate": 4.936362931511458e-05, + "loss": 5.9725, + "step": 12103 + }, + { + "epoch": 0.0719859168331906, + "grad_norm": 1.9942286014556885, + "learning_rate": 4.936352459146769e-05, + "loss": 5.82, + "step": 12104 + }, + { + "epoch": 0.0719918641164716, + "grad_norm": 1.8367860317230225, + "learning_rate": 4.936341985931574e-05, + "loss": 5.8653, + "step": 12105 + }, + { + "epoch": 0.07199781139975259, + "grad_norm": 1.8277100324630737, + "learning_rate": 4.936331511865877e-05, + "loss": 5.6998, + "step": 12106 + }, + { + "epoch": 0.07200375868303359, + "grad_norm": 1.5308998823165894, + "learning_rate": 4.936321036949683e-05, + "loss": 5.822, + "step": 12107 + }, + { + "epoch": 0.07200970596631459, + "grad_norm": 1.7100377082824707, + "learning_rate": 4.936310561182993e-05, + "loss": 5.991, + "step": 12108 + }, + { + "epoch": 0.07201565324959558, + "grad_norm": 1.8563333749771118, + "learning_rate": 4.936300084565813e-05, + "loss": 5.8438, + "step": 12109 + }, + { + "epoch": 0.07202160053287658, + "grad_norm": 1.9967303276062012, + "learning_rate": 4.936289607098146e-05, + "loss": 5.6786, + "step": 12110 + }, + { + "epoch": 0.07202754781615758, + "grad_norm": 2.1997451782226562, + "learning_rate": 4.9362791287799945e-05, + "loss": 5.2983, + "step": 12111 + }, + { + "epoch": 0.07203349509943857, + "grad_norm": 2.144521713256836, + "learning_rate": 4.9362686496113644e-05, + "loss": 5.2942, + "step": 12112 + }, + { + "epoch": 0.07203944238271957, + "grad_norm": 2.0747883319854736, + "learning_rate": 4.936258169592257e-05, + "loss": 5.473, + "step": 12113 + }, + { + "epoch": 0.07204538966600058, + "grad_norm": 2.0386881828308105, + "learning_rate": 4.9362476887226776e-05, + "loss": 5.2557, + "step": 12114 + }, + { + "epoch": 0.07205133694928156, + "grad_norm": 2.190687894821167, + "learning_rate": 4.93623720700263e-05, + "loss": 5.3251, + "step": 12115 + }, + { + "epoch": 0.07205728423256257, + "grad_norm": 1.9349397420883179, + "learning_rate": 4.936226724432116e-05, + "loss": 5.242, + "step": 12116 + }, + { + "epoch": 0.07206323151584357, + "grad_norm": 2.175943613052368, + "learning_rate": 4.93621624101114e-05, + "loss": 5.185, + "step": 12117 + }, + { + "epoch": 0.07206917879912456, + "grad_norm": 2.053994655609131, + "learning_rate": 4.936205756739708e-05, + "loss": 5.0755, + "step": 12118 + }, + { + "epoch": 0.07207512608240556, + "grad_norm": 2.0012362003326416, + "learning_rate": 4.93619527161782e-05, + "loss": 5.1797, + "step": 12119 + }, + { + "epoch": 0.07208107336568656, + "grad_norm": 1.9441219568252563, + "learning_rate": 4.936184785645482e-05, + "loss": 5.5583, + "step": 12120 + }, + { + "epoch": 0.07208702064896755, + "grad_norm": 2.990767002105713, + "learning_rate": 4.936174298822696e-05, + "loss": 4.8348, + "step": 12121 + }, + { + "epoch": 0.07209296793224855, + "grad_norm": 2.8385918140411377, + "learning_rate": 4.936163811149469e-05, + "loss": 4.7299, + "step": 12122 + }, + { + "epoch": 0.07209891521552955, + "grad_norm": 2.5228044986724854, + "learning_rate": 4.9361533226258006e-05, + "loss": 4.622, + "step": 12123 + }, + { + "epoch": 0.07210486249881054, + "grad_norm": 2.317598581314087, + "learning_rate": 4.936142833251697e-05, + "loss": 4.588, + "step": 12124 + }, + { + "epoch": 0.07211080978209154, + "grad_norm": 2.369335889816284, + "learning_rate": 4.936132343027161e-05, + "loss": 4.3843, + "step": 12125 + }, + { + "epoch": 0.07211675706537254, + "grad_norm": 2.4761011600494385, + "learning_rate": 4.936121851952196e-05, + "loss": 4.4101, + "step": 12126 + }, + { + "epoch": 0.07212270434865353, + "grad_norm": 2.3830130100250244, + "learning_rate": 4.9361113600268065e-05, + "loss": 4.5065, + "step": 12127 + }, + { + "epoch": 0.07212865163193453, + "grad_norm": 2.4977028369903564, + "learning_rate": 4.936100867250996e-05, + "loss": 4.4469, + "step": 12128 + }, + { + "epoch": 0.07213459891521554, + "grad_norm": 2.3377795219421387, + "learning_rate": 4.9360903736247663e-05, + "loss": 4.4045, + "step": 12129 + }, + { + "epoch": 0.07214054619849652, + "grad_norm": 2.268906831741333, + "learning_rate": 4.9360798791481245e-05, + "loss": 4.4224, + "step": 12130 + }, + { + "epoch": 0.07214649348177753, + "grad_norm": 2.316899538040161, + "learning_rate": 4.936069383821072e-05, + "loss": 4.3704, + "step": 12131 + }, + { + "epoch": 0.07215244076505851, + "grad_norm": 2.419618606567383, + "learning_rate": 4.936058887643612e-05, + "loss": 5.493, + "step": 12132 + }, + { + "epoch": 0.07215838804833952, + "grad_norm": 2.081756353378296, + "learning_rate": 4.93604839061575e-05, + "loss": 6.2328, + "step": 12133 + }, + { + "epoch": 0.07216433533162052, + "grad_norm": 2.1638660430908203, + "learning_rate": 4.936037892737487e-05, + "loss": 6.3089, + "step": 12134 + }, + { + "epoch": 0.07217028261490151, + "grad_norm": 1.7972848415374756, + "learning_rate": 4.93602739400883e-05, + "loss": 6.4013, + "step": 12135 + }, + { + "epoch": 0.07217622989818251, + "grad_norm": 1.7160871028900146, + "learning_rate": 4.93601689442978e-05, + "loss": 6.1717, + "step": 12136 + }, + { + "epoch": 0.07218217718146351, + "grad_norm": 2.0931475162506104, + "learning_rate": 4.936006394000342e-05, + "loss": 5.3515, + "step": 12137 + }, + { + "epoch": 0.0721881244647445, + "grad_norm": 2.2872977256774902, + "learning_rate": 4.93599589272052e-05, + "loss": 5.8342, + "step": 12138 + }, + { + "epoch": 0.0721940717480255, + "grad_norm": 2.4082720279693604, + "learning_rate": 4.9359853905903166e-05, + "loss": 6.1651, + "step": 12139 + }, + { + "epoch": 0.0722000190313065, + "grad_norm": 2.120962381362915, + "learning_rate": 4.935974887609735e-05, + "loss": 6.1182, + "step": 12140 + }, + { + "epoch": 0.07220596631458749, + "grad_norm": 2.0507090091705322, + "learning_rate": 4.9359643837787805e-05, + "loss": 5.7158, + "step": 12141 + }, + { + "epoch": 0.0722119135978685, + "grad_norm": 2.099963426589966, + "learning_rate": 4.9359538790974556e-05, + "loss": 5.6952, + "step": 12142 + }, + { + "epoch": 0.0722178608811495, + "grad_norm": 1.7631537914276123, + "learning_rate": 4.935943373565765e-05, + "loss": 5.6649, + "step": 12143 + }, + { + "epoch": 0.07222380816443048, + "grad_norm": 1.739601492881775, + "learning_rate": 4.9359328671837115e-05, + "loss": 5.7258, + "step": 12144 + }, + { + "epoch": 0.07222975544771149, + "grad_norm": 1.630116581916809, + "learning_rate": 4.9359223599512996e-05, + "loss": 5.7305, + "step": 12145 + }, + { + "epoch": 0.07223570273099249, + "grad_norm": 1.6106374263763428, + "learning_rate": 4.935911851868531e-05, + "loss": 5.6779, + "step": 12146 + }, + { + "epoch": 0.07224165001427348, + "grad_norm": 1.945662021636963, + "learning_rate": 4.935901342935412e-05, + "loss": 5.716, + "step": 12147 + }, + { + "epoch": 0.07224759729755448, + "grad_norm": 1.8601467609405518, + "learning_rate": 4.935890833151944e-05, + "loss": 5.7539, + "step": 12148 + }, + { + "epoch": 0.07225354458083548, + "grad_norm": 1.8324257135391235, + "learning_rate": 4.9358803225181324e-05, + "loss": 5.7309, + "step": 12149 + }, + { + "epoch": 0.07225949186411647, + "grad_norm": 2.0564095973968506, + "learning_rate": 4.93586981103398e-05, + "loss": 5.7201, + "step": 12150 + }, + { + "epoch": 0.07226543914739747, + "grad_norm": 1.925706386566162, + "learning_rate": 4.93585929869949e-05, + "loss": 5.5736, + "step": 12151 + }, + { + "epoch": 0.07227138643067847, + "grad_norm": 1.5965845584869385, + "learning_rate": 4.935848785514667e-05, + "loss": 5.4351, + "step": 12152 + }, + { + "epoch": 0.07227733371395946, + "grad_norm": 2.2522077560424805, + "learning_rate": 4.935838271479515e-05, + "loss": 5.8261, + "step": 12153 + }, + { + "epoch": 0.07228328099724046, + "grad_norm": 2.242398738861084, + "learning_rate": 4.935827756594036e-05, + "loss": 5.9923, + "step": 12154 + }, + { + "epoch": 0.07228922828052146, + "grad_norm": 2.043266534805298, + "learning_rate": 4.935817240858236e-05, + "loss": 5.6127, + "step": 12155 + }, + { + "epoch": 0.07229517556380245, + "grad_norm": 2.4922964572906494, + "learning_rate": 4.935806724272116e-05, + "loss": 5.3549, + "step": 12156 + }, + { + "epoch": 0.07230112284708345, + "grad_norm": 2.5241329669952393, + "learning_rate": 4.935796206835682e-05, + "loss": 5.2194, + "step": 12157 + }, + { + "epoch": 0.07230707013036446, + "grad_norm": 2.4680237770080566, + "learning_rate": 4.9357856885489365e-05, + "loss": 5.1154, + "step": 12158 + }, + { + "epoch": 0.07231301741364544, + "grad_norm": 2.1012492179870605, + "learning_rate": 4.9357751694118824e-05, + "loss": 4.8526, + "step": 12159 + }, + { + "epoch": 0.07231896469692645, + "grad_norm": 1.9997994899749756, + "learning_rate": 4.935764649424526e-05, + "loss": 4.9778, + "step": 12160 + }, + { + "epoch": 0.07232491198020743, + "grad_norm": 1.770112156867981, + "learning_rate": 4.935754128586868e-05, + "loss": 5.0855, + "step": 12161 + }, + { + "epoch": 0.07233085926348844, + "grad_norm": 2.0865485668182373, + "learning_rate": 4.935743606898914e-05, + "loss": 5.1566, + "step": 12162 + }, + { + "epoch": 0.07233680654676944, + "grad_norm": 2.0801351070404053, + "learning_rate": 4.9357330843606677e-05, + "loss": 5.0611, + "step": 12163 + }, + { + "epoch": 0.07234275383005043, + "grad_norm": 1.8675305843353271, + "learning_rate": 4.935722560972131e-05, + "loss": 4.9216, + "step": 12164 + }, + { + "epoch": 0.07234870111333143, + "grad_norm": 1.9125452041625977, + "learning_rate": 4.935712036733309e-05, + "loss": 4.8363, + "step": 12165 + }, + { + "epoch": 0.07235464839661243, + "grad_norm": 2.4954965114593506, + "learning_rate": 4.935701511644205e-05, + "loss": 4.9816, + "step": 12166 + }, + { + "epoch": 0.07236059567989342, + "grad_norm": 2.412381410598755, + "learning_rate": 4.935690985704823e-05, + "loss": 4.9616, + "step": 12167 + }, + { + "epoch": 0.07236654296317442, + "grad_norm": 2.356994152069092, + "learning_rate": 4.9356804589151665e-05, + "loss": 4.8326, + "step": 12168 + }, + { + "epoch": 0.07237249024645542, + "grad_norm": 2.2399415969848633, + "learning_rate": 4.93566993127524e-05, + "loss": 4.8955, + "step": 12169 + }, + { + "epoch": 0.07237843752973641, + "grad_norm": 2.691772222518921, + "learning_rate": 4.935659402785044e-05, + "loss": 5.6475, + "step": 12170 + }, + { + "epoch": 0.07238438481301741, + "grad_norm": 2.954955816268921, + "learning_rate": 4.9356488734445865e-05, + "loss": 6.2151, + "step": 12171 + }, + { + "epoch": 0.07239033209629842, + "grad_norm": 2.010998010635376, + "learning_rate": 4.935638343253869e-05, + "loss": 5.9124, + "step": 12172 + }, + { + "epoch": 0.0723962793795794, + "grad_norm": 2.2737836837768555, + "learning_rate": 4.935627812212894e-05, + "loss": 5.4068, + "step": 12173 + }, + { + "epoch": 0.0724022266628604, + "grad_norm": 2.2700793743133545, + "learning_rate": 4.9356172803216675e-05, + "loss": 4.8156, + "step": 12174 + }, + { + "epoch": 0.07240817394614141, + "grad_norm": 2.2795162200927734, + "learning_rate": 4.935606747580192e-05, + "loss": 4.7882, + "step": 12175 + }, + { + "epoch": 0.0724141212294224, + "grad_norm": 2.1849277019500732, + "learning_rate": 4.9355962139884715e-05, + "loss": 4.9914, + "step": 12176 + }, + { + "epoch": 0.0724200685127034, + "grad_norm": 2.5336532592773438, + "learning_rate": 4.935585679546509e-05, + "loss": 4.8487, + "step": 12177 + }, + { + "epoch": 0.0724260157959844, + "grad_norm": 2.624995708465576, + "learning_rate": 4.935575144254309e-05, + "loss": 4.9523, + "step": 12178 + }, + { + "epoch": 0.07243196307926539, + "grad_norm": 2.5450191497802734, + "learning_rate": 4.935564608111875e-05, + "loss": 4.9958, + "step": 12179 + }, + { + "epoch": 0.07243791036254639, + "grad_norm": 2.2714452743530273, + "learning_rate": 4.9355540711192107e-05, + "loss": 5.301, + "step": 12180 + }, + { + "epoch": 0.07244385764582739, + "grad_norm": 2.0173168182373047, + "learning_rate": 4.935543533276319e-05, + "loss": 5.7992, + "step": 12181 + }, + { + "epoch": 0.07244980492910838, + "grad_norm": 2.9326014518737793, + "learning_rate": 4.9355329945832054e-05, + "loss": 5.6065, + "step": 12182 + }, + { + "epoch": 0.07245575221238938, + "grad_norm": 2.142066478729248, + "learning_rate": 4.935522455039871e-05, + "loss": 5.5339, + "step": 12183 + }, + { + "epoch": 0.07246169949567038, + "grad_norm": 1.8901113271713257, + "learning_rate": 4.9355119146463214e-05, + "loss": 5.8829, + "step": 12184 + }, + { + "epoch": 0.07246764677895137, + "grad_norm": 1.996052622795105, + "learning_rate": 4.93550137340256e-05, + "loss": 6.2189, + "step": 12185 + }, + { + "epoch": 0.07247359406223237, + "grad_norm": 1.7420963048934937, + "learning_rate": 4.93549083130859e-05, + "loss": 5.9254, + "step": 12186 + }, + { + "epoch": 0.07247954134551338, + "grad_norm": 2.8487229347229004, + "learning_rate": 4.935480288364416e-05, + "loss": 5.8643, + "step": 12187 + }, + { + "epoch": 0.07248548862879436, + "grad_norm": 3.0168306827545166, + "learning_rate": 4.93546974457004e-05, + "loss": 5.811, + "step": 12188 + }, + { + "epoch": 0.07249143591207537, + "grad_norm": 2.841353416442871, + "learning_rate": 4.935459199925467e-05, + "loss": 5.6832, + "step": 12189 + }, + { + "epoch": 0.07249738319535635, + "grad_norm": 2.3517918586730957, + "learning_rate": 4.9354486544307e-05, + "loss": 4.3651, + "step": 12190 + }, + { + "epoch": 0.07250333047863736, + "grad_norm": 2.3511440753936768, + "learning_rate": 4.935438108085744e-05, + "loss": 4.2884, + "step": 12191 + }, + { + "epoch": 0.07250927776191836, + "grad_norm": 2.0812551975250244, + "learning_rate": 4.935427560890601e-05, + "loss": 4.168, + "step": 12192 + }, + { + "epoch": 0.07251522504519935, + "grad_norm": 2.0546631813049316, + "learning_rate": 4.935417012845275e-05, + "loss": 3.862, + "step": 12193 + }, + { + "epoch": 0.07252117232848035, + "grad_norm": 2.130612850189209, + "learning_rate": 4.935406463949771e-05, + "loss": 3.6729, + "step": 12194 + }, + { + "epoch": 0.07252711961176135, + "grad_norm": 2.35225510597229, + "learning_rate": 4.9353959142040917e-05, + "loss": 3.7075, + "step": 12195 + }, + { + "epoch": 0.07253306689504234, + "grad_norm": 2.418698310852051, + "learning_rate": 4.93538536360824e-05, + "loss": 3.679, + "step": 12196 + }, + { + "epoch": 0.07253901417832334, + "grad_norm": 2.4452991485595703, + "learning_rate": 4.9353748121622214e-05, + "loss": 3.7827, + "step": 12197 + }, + { + "epoch": 0.07254496146160434, + "grad_norm": 2.3787992000579834, + "learning_rate": 4.935364259866038e-05, + "loss": 3.7484, + "step": 12198 + }, + { + "epoch": 0.07255090874488533, + "grad_norm": 2.299149751663208, + "learning_rate": 4.935353706719694e-05, + "loss": 3.6186, + "step": 12199 + }, + { + "epoch": 0.07255685602816633, + "grad_norm": 2.666121244430542, + "learning_rate": 4.9353431527231944e-05, + "loss": 3.5323, + "step": 12200 + }, + { + "epoch": 0.07256280331144734, + "grad_norm": 2.4448325634002686, + "learning_rate": 4.9353325978765404e-05, + "loss": 3.8176, + "step": 12201 + }, + { + "epoch": 0.07256875059472832, + "grad_norm": 2.5082852840423584, + "learning_rate": 4.935322042179737e-05, + "loss": 3.7838, + "step": 12202 + }, + { + "epoch": 0.07257469787800933, + "grad_norm": 2.3247005939483643, + "learning_rate": 4.935311485632788e-05, + "loss": 3.8036, + "step": 12203 + }, + { + "epoch": 0.07258064516129033, + "grad_norm": 2.4917871952056885, + "learning_rate": 4.9353009282356974e-05, + "loss": 3.6734, + "step": 12204 + }, + { + "epoch": 0.07258659244457132, + "grad_norm": 2.2535903453826904, + "learning_rate": 4.935290369988468e-05, + "loss": 3.7451, + "step": 12205 + }, + { + "epoch": 0.07259253972785232, + "grad_norm": 2.355896472930908, + "learning_rate": 4.9352798108911036e-05, + "loss": 3.5963, + "step": 12206 + }, + { + "epoch": 0.07259848701113332, + "grad_norm": 2.21923828125, + "learning_rate": 4.935269250943609e-05, + "loss": 3.5492, + "step": 12207 + }, + { + "epoch": 0.07260443429441431, + "grad_norm": 2.3795714378356934, + "learning_rate": 4.935258690145986e-05, + "loss": 3.7146, + "step": 12208 + }, + { + "epoch": 0.07261038157769531, + "grad_norm": 2.3866682052612305, + "learning_rate": 4.93524812849824e-05, + "loss": 3.7359, + "step": 12209 + }, + { + "epoch": 0.07261632886097631, + "grad_norm": 2.411289691925049, + "learning_rate": 4.935237566000374e-05, + "loss": 3.6958, + "step": 12210 + }, + { + "epoch": 0.0726222761442573, + "grad_norm": 2.3831989765167236, + "learning_rate": 4.935227002652392e-05, + "loss": 3.6696, + "step": 12211 + }, + { + "epoch": 0.0726282234275383, + "grad_norm": 2.1831908226013184, + "learning_rate": 4.935216438454297e-05, + "loss": 3.905, + "step": 12212 + }, + { + "epoch": 0.0726341707108193, + "grad_norm": 2.1136345863342285, + "learning_rate": 4.9352058734060934e-05, + "loss": 5.0188, + "step": 12213 + }, + { + "epoch": 0.07264011799410029, + "grad_norm": 2.2617692947387695, + "learning_rate": 4.935195307507784e-05, + "loss": 5.1883, + "step": 12214 + }, + { + "epoch": 0.0726460652773813, + "grad_norm": 2.4442226886749268, + "learning_rate": 4.935184740759374e-05, + "loss": 5.1883, + "step": 12215 + }, + { + "epoch": 0.0726520125606623, + "grad_norm": 2.300234794616699, + "learning_rate": 4.935174173160865e-05, + "loss": 4.9925, + "step": 12216 + }, + { + "epoch": 0.07265795984394328, + "grad_norm": 2.1512858867645264, + "learning_rate": 4.935163604712263e-05, + "loss": 4.883, + "step": 12217 + }, + { + "epoch": 0.07266390712722429, + "grad_norm": 2.210825204849243, + "learning_rate": 4.93515303541357e-05, + "loss": 5.165, + "step": 12218 + }, + { + "epoch": 0.07266985441050527, + "grad_norm": 2.1589086055755615, + "learning_rate": 4.935142465264791e-05, + "loss": 4.931, + "step": 12219 + }, + { + "epoch": 0.07267580169378628, + "grad_norm": 2.0527892112731934, + "learning_rate": 4.935131894265927e-05, + "loss": 5.0566, + "step": 12220 + }, + { + "epoch": 0.07268174897706728, + "grad_norm": 2.202828884124756, + "learning_rate": 4.935121322416985e-05, + "loss": 4.9519, + "step": 12221 + }, + { + "epoch": 0.07268769626034827, + "grad_norm": 2.262834310531616, + "learning_rate": 4.935110749717967e-05, + "loss": 4.9596, + "step": 12222 + }, + { + "epoch": 0.07269364354362927, + "grad_norm": 2.169311761856079, + "learning_rate": 4.935100176168877e-05, + "loss": 4.8968, + "step": 12223 + }, + { + "epoch": 0.07269959082691027, + "grad_norm": 2.137746572494507, + "learning_rate": 4.935089601769719e-05, + "loss": 4.8535, + "step": 12224 + }, + { + "epoch": 0.07270553811019126, + "grad_norm": 2.060861587524414, + "learning_rate": 4.935079026520496e-05, + "loss": 5.0784, + "step": 12225 + }, + { + "epoch": 0.07271148539347226, + "grad_norm": 2.235352039337158, + "learning_rate": 4.935068450421213e-05, + "loss": 4.7351, + "step": 12226 + }, + { + "epoch": 0.07271743267675326, + "grad_norm": 2.3832550048828125, + "learning_rate": 4.935057873471872e-05, + "loss": 4.618, + "step": 12227 + }, + { + "epoch": 0.07272337996003425, + "grad_norm": 2.3591537475585938, + "learning_rate": 4.935047295672477e-05, + "loss": 4.7029, + "step": 12228 + }, + { + "epoch": 0.07272932724331525, + "grad_norm": 2.2797207832336426, + "learning_rate": 4.935036717023033e-05, + "loss": 4.9199, + "step": 12229 + }, + { + "epoch": 0.07273527452659626, + "grad_norm": 2.4931957721710205, + "learning_rate": 4.935026137523542e-05, + "loss": 4.5923, + "step": 12230 + }, + { + "epoch": 0.07274122180987724, + "grad_norm": 2.152064323425293, + "learning_rate": 4.9350155571740095e-05, + "loss": 5.1495, + "step": 12231 + }, + { + "epoch": 0.07274716909315825, + "grad_norm": 2.470526695251465, + "learning_rate": 4.935004975974438e-05, + "loss": 4.8257, + "step": 12232 + }, + { + "epoch": 0.07275311637643925, + "grad_norm": 2.262578248977661, + "learning_rate": 4.9349943939248304e-05, + "loss": 5.7004, + "step": 12233 + }, + { + "epoch": 0.07275906365972024, + "grad_norm": 2.0813188552856445, + "learning_rate": 4.934983811025192e-05, + "loss": 5.6048, + "step": 12234 + }, + { + "epoch": 0.07276501094300124, + "grad_norm": 2.4882686138153076, + "learning_rate": 4.934973227275527e-05, + "loss": 5.8121, + "step": 12235 + }, + { + "epoch": 0.07277095822628224, + "grad_norm": 2.5181429386138916, + "learning_rate": 4.9349626426758364e-05, + "loss": 4.5581, + "step": 12236 + }, + { + "epoch": 0.07277690550956323, + "grad_norm": 2.6369354724884033, + "learning_rate": 4.934952057226127e-05, + "loss": 4.7938, + "step": 12237 + }, + { + "epoch": 0.07278285279284423, + "grad_norm": 1.8615930080413818, + "learning_rate": 4.9349414709264e-05, + "loss": 5.2097, + "step": 12238 + }, + { + "epoch": 0.07278880007612523, + "grad_norm": 1.4905575513839722, + "learning_rate": 4.93493088377666e-05, + "loss": 5.5717, + "step": 12239 + }, + { + "epoch": 0.07279474735940622, + "grad_norm": 1.8339897394180298, + "learning_rate": 4.9349202957769106e-05, + "loss": 5.6908, + "step": 12240 + }, + { + "epoch": 0.07280069464268722, + "grad_norm": 1.5875110626220703, + "learning_rate": 4.934909706927156e-05, + "loss": 5.6246, + "step": 12241 + }, + { + "epoch": 0.07280664192596822, + "grad_norm": 1.8365919589996338, + "learning_rate": 4.934899117227399e-05, + "loss": 5.394, + "step": 12242 + }, + { + "epoch": 0.07281258920924921, + "grad_norm": 1.9548145532608032, + "learning_rate": 4.934888526677645e-05, + "loss": 5.2427, + "step": 12243 + }, + { + "epoch": 0.07281853649253021, + "grad_norm": 1.8174974918365479, + "learning_rate": 4.934877935277896e-05, + "loss": 5.5844, + "step": 12244 + }, + { + "epoch": 0.07282448377581122, + "grad_norm": 1.800117015838623, + "learning_rate": 4.934867343028157e-05, + "loss": 4.9386, + "step": 12245 + }, + { + "epoch": 0.0728304310590922, + "grad_norm": 2.0356900691986084, + "learning_rate": 4.93485674992843e-05, + "loss": 4.6911, + "step": 12246 + }, + { + "epoch": 0.0728363783423732, + "grad_norm": 2.009455442428589, + "learning_rate": 4.93484615597872e-05, + "loss": 4.6121, + "step": 12247 + }, + { + "epoch": 0.0728423256256542, + "grad_norm": 1.9252879619598389, + "learning_rate": 4.934835561179031e-05, + "loss": 4.737, + "step": 12248 + }, + { + "epoch": 0.0728482729089352, + "grad_norm": 2.3497977256774902, + "learning_rate": 4.934824965529365e-05, + "loss": 5.6921, + "step": 12249 + }, + { + "epoch": 0.0728542201922162, + "grad_norm": 2.0821962356567383, + "learning_rate": 4.934814369029727e-05, + "loss": 5.3845, + "step": 12250 + }, + { + "epoch": 0.07286016747549719, + "grad_norm": 1.9725046157836914, + "learning_rate": 4.934803771680121e-05, + "loss": 5.5557, + "step": 12251 + }, + { + "epoch": 0.07286611475877819, + "grad_norm": 2.290238618850708, + "learning_rate": 4.93479317348055e-05, + "loss": 5.4258, + "step": 12252 + }, + { + "epoch": 0.07287206204205919, + "grad_norm": 1.9502376317977905, + "learning_rate": 4.934782574431017e-05, + "loss": 5.0531, + "step": 12253 + }, + { + "epoch": 0.07287800932534018, + "grad_norm": 2.128431797027588, + "learning_rate": 4.9347719745315275e-05, + "loss": 5.0241, + "step": 12254 + }, + { + "epoch": 0.07288395660862118, + "grad_norm": 1.9173803329467773, + "learning_rate": 4.934761373782084e-05, + "loss": 5.7107, + "step": 12255 + }, + { + "epoch": 0.07288990389190218, + "grad_norm": 1.5167652368545532, + "learning_rate": 4.93475077218269e-05, + "loss": 5.2304, + "step": 12256 + }, + { + "epoch": 0.07289585117518317, + "grad_norm": 1.4125497341156006, + "learning_rate": 4.9347401697333505e-05, + "loss": 5.1099, + "step": 12257 + }, + { + "epoch": 0.07290179845846417, + "grad_norm": 2.384801149368286, + "learning_rate": 4.934729566434068e-05, + "loss": 5.0051, + "step": 12258 + }, + { + "epoch": 0.07290774574174518, + "grad_norm": 1.9343961477279663, + "learning_rate": 4.934718962284846e-05, + "loss": 5.3367, + "step": 12259 + }, + { + "epoch": 0.07291369302502616, + "grad_norm": 2.048220157623291, + "learning_rate": 4.93470835728569e-05, + "loss": 5.8502, + "step": 12260 + }, + { + "epoch": 0.07291964030830717, + "grad_norm": 2.037167549133301, + "learning_rate": 4.934697751436601e-05, + "loss": 5.1993, + "step": 12261 + }, + { + "epoch": 0.07292558759158817, + "grad_norm": 1.8141452074050903, + "learning_rate": 4.9346871447375854e-05, + "loss": 5.8308, + "step": 12262 + }, + { + "epoch": 0.07293153487486916, + "grad_norm": 1.7525955438613892, + "learning_rate": 4.934676537188645e-05, + "loss": 5.5946, + "step": 12263 + }, + { + "epoch": 0.07293748215815016, + "grad_norm": 1.9784163236618042, + "learning_rate": 4.9346659287897846e-05, + "loss": 5.7214, + "step": 12264 + }, + { + "epoch": 0.07294342944143116, + "grad_norm": 1.8948242664337158, + "learning_rate": 4.934655319541007e-05, + "loss": 5.7434, + "step": 12265 + }, + { + "epoch": 0.07294937672471215, + "grad_norm": 1.698625087738037, + "learning_rate": 4.934644709442317e-05, + "loss": 5.7828, + "step": 12266 + }, + { + "epoch": 0.07295532400799315, + "grad_norm": 1.6057854890823364, + "learning_rate": 4.934634098493717e-05, + "loss": 5.8815, + "step": 12267 + }, + { + "epoch": 0.07296127129127415, + "grad_norm": 1.4753777980804443, + "learning_rate": 4.9346234866952125e-05, + "loss": 5.8368, + "step": 12268 + }, + { + "epoch": 0.07296721857455514, + "grad_norm": 1.8265280723571777, + "learning_rate": 4.9346128740468046e-05, + "loss": 5.7511, + "step": 12269 + }, + { + "epoch": 0.07297316585783614, + "grad_norm": 1.7212530374526978, + "learning_rate": 4.9346022605485e-05, + "loss": 5.6741, + "step": 12270 + }, + { + "epoch": 0.07297911314111714, + "grad_norm": 1.8423148393630981, + "learning_rate": 4.9345916462002996e-05, + "loss": 5.5199, + "step": 12271 + }, + { + "epoch": 0.07298506042439813, + "grad_norm": 1.7754487991333008, + "learning_rate": 4.934581031002209e-05, + "loss": 5.9655, + "step": 12272 + }, + { + "epoch": 0.07299100770767913, + "grad_norm": 1.794704794883728, + "learning_rate": 4.9345704149542313e-05, + "loss": 5.886, + "step": 12273 + }, + { + "epoch": 0.07299695499096014, + "grad_norm": 1.807165503501892, + "learning_rate": 4.93455979805637e-05, + "loss": 5.5222, + "step": 12274 + }, + { + "epoch": 0.07300290227424112, + "grad_norm": 1.6476585865020752, + "learning_rate": 4.934549180308629e-05, + "loss": 5.6588, + "step": 12275 + }, + { + "epoch": 0.07300884955752213, + "grad_norm": 1.8332840204238892, + "learning_rate": 4.9345385617110125e-05, + "loss": 5.0781, + "step": 12276 + }, + { + "epoch": 0.07301479684080311, + "grad_norm": 1.837471842765808, + "learning_rate": 4.934527942263523e-05, + "loss": 5.8881, + "step": 12277 + }, + { + "epoch": 0.07302074412408412, + "grad_norm": 1.538299798965454, + "learning_rate": 4.934517321966165e-05, + "loss": 6.0547, + "step": 12278 + }, + { + "epoch": 0.07302669140736512, + "grad_norm": 1.9346814155578613, + "learning_rate": 4.934506700818943e-05, + "loss": 5.7853, + "step": 12279 + }, + { + "epoch": 0.0730326386906461, + "grad_norm": 1.9108514785766602, + "learning_rate": 4.93449607882186e-05, + "loss": 5.8034, + "step": 12280 + }, + { + "epoch": 0.07303858597392711, + "grad_norm": 2.0216846466064453, + "learning_rate": 4.934485455974919e-05, + "loss": 5.5127, + "step": 12281 + }, + { + "epoch": 0.07304453325720811, + "grad_norm": 2.2365148067474365, + "learning_rate": 4.9344748322781244e-05, + "loss": 5.5519, + "step": 12282 + }, + { + "epoch": 0.0730504805404891, + "grad_norm": 1.872934103012085, + "learning_rate": 4.934464207731479e-05, + "loss": 5.783, + "step": 12283 + }, + { + "epoch": 0.0730564278237701, + "grad_norm": 1.944606900215149, + "learning_rate": 4.934453582334988e-05, + "loss": 5.9803, + "step": 12284 + }, + { + "epoch": 0.0730623751070511, + "grad_norm": 1.765257477760315, + "learning_rate": 4.934442956088654e-05, + "loss": 5.8434, + "step": 12285 + }, + { + "epoch": 0.07306832239033209, + "grad_norm": 1.9726130962371826, + "learning_rate": 4.934432328992482e-05, + "loss": 5.6173, + "step": 12286 + }, + { + "epoch": 0.0730742696736131, + "grad_norm": 2.0510616302490234, + "learning_rate": 4.934421701046474e-05, + "loss": 5.4661, + "step": 12287 + }, + { + "epoch": 0.0730802169568941, + "grad_norm": 1.6038832664489746, + "learning_rate": 4.934411072250635e-05, + "loss": 5.2786, + "step": 12288 + }, + { + "epoch": 0.07308616424017508, + "grad_norm": 2.0088446140289307, + "learning_rate": 4.934400442604968e-05, + "loss": 4.9999, + "step": 12289 + }, + { + "epoch": 0.07309211152345609, + "grad_norm": 1.4760913848876953, + "learning_rate": 4.934389812109477e-05, + "loss": 4.785, + "step": 12290 + }, + { + "epoch": 0.07309805880673709, + "grad_norm": 2.2036757469177246, + "learning_rate": 4.934379180764166e-05, + "loss": 5.8303, + "step": 12291 + }, + { + "epoch": 0.07310400609001808, + "grad_norm": 2.0261359214782715, + "learning_rate": 4.9343685485690385e-05, + "loss": 5.6823, + "step": 12292 + }, + { + "epoch": 0.07310995337329908, + "grad_norm": 1.7493160963058472, + "learning_rate": 4.934357915524097e-05, + "loss": 5.6144, + "step": 12293 + }, + { + "epoch": 0.07311590065658008, + "grad_norm": 1.887373685836792, + "learning_rate": 4.934347281629347e-05, + "loss": 5.9405, + "step": 12294 + }, + { + "epoch": 0.07312184793986107, + "grad_norm": 1.6655008792877197, + "learning_rate": 4.9343366468847915e-05, + "loss": 5.8376, + "step": 12295 + }, + { + "epoch": 0.07312779522314207, + "grad_norm": 1.9241079092025757, + "learning_rate": 4.9343260112904345e-05, + "loss": 5.6072, + "step": 12296 + }, + { + "epoch": 0.07313374250642307, + "grad_norm": 1.7873997688293457, + "learning_rate": 4.934315374846279e-05, + "loss": 5.539, + "step": 12297 + }, + { + "epoch": 0.07313968978970406, + "grad_norm": 1.9266597032546997, + "learning_rate": 4.9343047375523296e-05, + "loss": 5.3921, + "step": 12298 + }, + { + "epoch": 0.07314563707298506, + "grad_norm": 1.9283325672149658, + "learning_rate": 4.934294099408589e-05, + "loss": 5.2326, + "step": 12299 + }, + { + "epoch": 0.07315158435626606, + "grad_norm": 1.739047884941101, + "learning_rate": 4.934283460415062e-05, + "loss": 5.4831, + "step": 12300 + }, + { + "epoch": 0.07315753163954705, + "grad_norm": 1.6729072332382202, + "learning_rate": 4.934272820571752e-05, + "loss": 5.633, + "step": 12301 + }, + { + "epoch": 0.07316347892282805, + "grad_norm": 1.6901992559432983, + "learning_rate": 4.9342621798786616e-05, + "loss": 5.6121, + "step": 12302 + }, + { + "epoch": 0.07316942620610906, + "grad_norm": 1.8640037775039673, + "learning_rate": 4.9342515383357956e-05, + "loss": 5.6498, + "step": 12303 + }, + { + "epoch": 0.07317537348939004, + "grad_norm": 1.9629018306732178, + "learning_rate": 4.9342408959431576e-05, + "loss": 5.9364, + "step": 12304 + }, + { + "epoch": 0.07318132077267105, + "grad_norm": 1.9370427131652832, + "learning_rate": 4.934230252700752e-05, + "loss": 5.8945, + "step": 12305 + }, + { + "epoch": 0.07318726805595203, + "grad_norm": 1.6541575193405151, + "learning_rate": 4.9342196086085814e-05, + "loss": 5.5826, + "step": 12306 + }, + { + "epoch": 0.07319321533923304, + "grad_norm": 1.6640154123306274, + "learning_rate": 4.934208963666649e-05, + "loss": 5.7065, + "step": 12307 + }, + { + "epoch": 0.07319916262251404, + "grad_norm": 1.596665620803833, + "learning_rate": 4.934198317874961e-05, + "loss": 5.6764, + "step": 12308 + }, + { + "epoch": 0.07320510990579503, + "grad_norm": 1.841260552406311, + "learning_rate": 4.9341876712335176e-05, + "loss": 5.624, + "step": 12309 + }, + { + "epoch": 0.07321105718907603, + "grad_norm": 1.921162724494934, + "learning_rate": 4.9341770237423254e-05, + "loss": 5.3177, + "step": 12310 + }, + { + "epoch": 0.07321700447235703, + "grad_norm": 1.844192624092102, + "learning_rate": 4.934166375401388e-05, + "loss": 5.6236, + "step": 12311 + }, + { + "epoch": 0.07322295175563802, + "grad_norm": 1.9088208675384521, + "learning_rate": 4.934155726210707e-05, + "loss": 5.7487, + "step": 12312 + }, + { + "epoch": 0.07322889903891902, + "grad_norm": 2.1057817935943604, + "learning_rate": 4.934145076170288e-05, + "loss": 5.3372, + "step": 12313 + }, + { + "epoch": 0.07323484632220002, + "grad_norm": 1.9507678747177124, + "learning_rate": 4.9341344252801335e-05, + "loss": 5.9318, + "step": 12314 + }, + { + "epoch": 0.07324079360548101, + "grad_norm": 1.9885265827178955, + "learning_rate": 4.934123773540249e-05, + "loss": 5.7724, + "step": 12315 + }, + { + "epoch": 0.07324674088876201, + "grad_norm": 1.81960129737854, + "learning_rate": 4.934113120950636e-05, + "loss": 5.7624, + "step": 12316 + }, + { + "epoch": 0.07325268817204302, + "grad_norm": 1.7848392724990845, + "learning_rate": 4.9341024675112994e-05, + "loss": 5.8135, + "step": 12317 + }, + { + "epoch": 0.073258635455324, + "grad_norm": 1.8326808214187622, + "learning_rate": 4.9340918132222436e-05, + "loss": 5.9725, + "step": 12318 + }, + { + "epoch": 0.073264582738605, + "grad_norm": 1.731719970703125, + "learning_rate": 4.93408115808347e-05, + "loss": 5.8932, + "step": 12319 + }, + { + "epoch": 0.07327053002188601, + "grad_norm": 1.7635269165039062, + "learning_rate": 4.934070502094985e-05, + "loss": 5.4953, + "step": 12320 + }, + { + "epoch": 0.073276477305167, + "grad_norm": 1.61715829372406, + "learning_rate": 4.934059845256791e-05, + "loss": 5.4043, + "step": 12321 + }, + { + "epoch": 0.073282424588448, + "grad_norm": 1.9188543558120728, + "learning_rate": 4.9340491875688914e-05, + "loss": 5.2762, + "step": 12322 + }, + { + "epoch": 0.073288371871729, + "grad_norm": 2.098680019378662, + "learning_rate": 4.9340385290312904e-05, + "loss": 5.4673, + "step": 12323 + }, + { + "epoch": 0.07329431915500999, + "grad_norm": 2.15560245513916, + "learning_rate": 4.934027869643992e-05, + "loss": 5.9124, + "step": 12324 + }, + { + "epoch": 0.07330026643829099, + "grad_norm": 1.9819902181625366, + "learning_rate": 4.934017209407e-05, + "loss": 5.5686, + "step": 12325 + }, + { + "epoch": 0.07330621372157199, + "grad_norm": 2.517003059387207, + "learning_rate": 4.934006548320317e-05, + "loss": 3.9751, + "step": 12326 + }, + { + "epoch": 0.07331216100485298, + "grad_norm": 2.458714723587036, + "learning_rate": 4.9339958863839474e-05, + "loss": 3.7976, + "step": 12327 + }, + { + "epoch": 0.07331810828813398, + "grad_norm": 2.2642102241516113, + "learning_rate": 4.9339852235978955e-05, + "loss": 3.8853, + "step": 12328 + }, + { + "epoch": 0.07332405557141498, + "grad_norm": 2.3097565174102783, + "learning_rate": 4.9339745599621645e-05, + "loss": 3.5699, + "step": 12329 + }, + { + "epoch": 0.07333000285469597, + "grad_norm": 2.312995195388794, + "learning_rate": 4.933963895476758e-05, + "loss": 3.8338, + "step": 12330 + }, + { + "epoch": 0.07333595013797697, + "grad_norm": 2.69657826423645, + "learning_rate": 4.93395323014168e-05, + "loss": 5.3459, + "step": 12331 + }, + { + "epoch": 0.07334189742125798, + "grad_norm": 2.263038396835327, + "learning_rate": 4.9339425639569336e-05, + "loss": 5.712, + "step": 12332 + }, + { + "epoch": 0.07334784470453896, + "grad_norm": 1.9429599046707153, + "learning_rate": 4.9339318969225235e-05, + "loss": 5.7465, + "step": 12333 + }, + { + "epoch": 0.07335379198781997, + "grad_norm": 2.07045841217041, + "learning_rate": 4.933921229038453e-05, + "loss": 5.6726, + "step": 12334 + }, + { + "epoch": 0.07335973927110095, + "grad_norm": 2.0304102897644043, + "learning_rate": 4.933910560304725e-05, + "loss": 5.8084, + "step": 12335 + }, + { + "epoch": 0.07336568655438196, + "grad_norm": 1.8316701650619507, + "learning_rate": 4.933899890721344e-05, + "loss": 5.3852, + "step": 12336 + }, + { + "epoch": 0.07337163383766296, + "grad_norm": 2.1406614780426025, + "learning_rate": 4.933889220288315e-05, + "loss": 5.1097, + "step": 12337 + }, + { + "epoch": 0.07337758112094395, + "grad_norm": 1.7518030405044556, + "learning_rate": 4.9338785490056395e-05, + "loss": 5.2038, + "step": 12338 + }, + { + "epoch": 0.07338352840422495, + "grad_norm": 1.8387973308563232, + "learning_rate": 4.933867876873322e-05, + "loss": 5.0847, + "step": 12339 + }, + { + "epoch": 0.07338947568750595, + "grad_norm": 1.692947506904602, + "learning_rate": 4.933857203891367e-05, + "loss": 5.6124, + "step": 12340 + }, + { + "epoch": 0.07339542297078694, + "grad_norm": 1.6367069482803345, + "learning_rate": 4.933846530059776e-05, + "loss": 5.7119, + "step": 12341 + }, + { + "epoch": 0.07340137025406794, + "grad_norm": 2.0395610332489014, + "learning_rate": 4.933835855378556e-05, + "loss": 5.4164, + "step": 12342 + }, + { + "epoch": 0.07340731753734894, + "grad_norm": 2.074073314666748, + "learning_rate": 4.933825179847709e-05, + "loss": 5.3952, + "step": 12343 + }, + { + "epoch": 0.07341326482062993, + "grad_norm": 2.2825684547424316, + "learning_rate": 4.9338145034672376e-05, + "loss": 5.4019, + "step": 12344 + }, + { + "epoch": 0.07341921210391093, + "grad_norm": 2.006591796875, + "learning_rate": 4.9338038262371476e-05, + "loss": 5.4422, + "step": 12345 + }, + { + "epoch": 0.07342515938719194, + "grad_norm": 2.10418701171875, + "learning_rate": 4.9337931481574415e-05, + "loss": 5.3801, + "step": 12346 + }, + { + "epoch": 0.07343110667047292, + "grad_norm": 1.9998257160186768, + "learning_rate": 4.9337824692281233e-05, + "loss": 5.1673, + "step": 12347 + }, + { + "epoch": 0.07343705395375393, + "grad_norm": 2.175896644592285, + "learning_rate": 4.933771789449197e-05, + "loss": 5.118, + "step": 12348 + }, + { + "epoch": 0.07344300123703493, + "grad_norm": 2.075164318084717, + "learning_rate": 4.933761108820666e-05, + "loss": 5.1662, + "step": 12349 + }, + { + "epoch": 0.07344894852031592, + "grad_norm": 2.0672569274902344, + "learning_rate": 4.933750427342534e-05, + "loss": 5.0957, + "step": 12350 + }, + { + "epoch": 0.07345489580359692, + "grad_norm": 2.0570287704467773, + "learning_rate": 4.9337397450148055e-05, + "loss": 5.2772, + "step": 12351 + }, + { + "epoch": 0.07346084308687792, + "grad_norm": 2.0653116703033447, + "learning_rate": 4.933729061837483e-05, + "loss": 5.4755, + "step": 12352 + }, + { + "epoch": 0.07346679037015891, + "grad_norm": 2.832578420639038, + "learning_rate": 4.933718377810571e-05, + "loss": 4.8128, + "step": 12353 + }, + { + "epoch": 0.07347273765343991, + "grad_norm": 2.378556251525879, + "learning_rate": 4.933707692934073e-05, + "loss": 5.109, + "step": 12354 + }, + { + "epoch": 0.07347868493672091, + "grad_norm": 2.1819205284118652, + "learning_rate": 4.933697007207993e-05, + "loss": 4.8603, + "step": 12355 + }, + { + "epoch": 0.0734846322200019, + "grad_norm": 2.104738473892212, + "learning_rate": 4.9336863206323345e-05, + "loss": 4.7806, + "step": 12356 + }, + { + "epoch": 0.0734905795032829, + "grad_norm": 1.8287266492843628, + "learning_rate": 4.933675633207101e-05, + "loss": 4.7082, + "step": 12357 + }, + { + "epoch": 0.0734965267865639, + "grad_norm": 2.0478014945983887, + "learning_rate": 4.933664944932297e-05, + "loss": 4.6145, + "step": 12358 + }, + { + "epoch": 0.07350247406984489, + "grad_norm": 2.208263397216797, + "learning_rate": 4.9336542558079244e-05, + "loss": 4.7523, + "step": 12359 + }, + { + "epoch": 0.0735084213531259, + "grad_norm": 2.1506083011627197, + "learning_rate": 4.93364356583399e-05, + "loss": 4.7444, + "step": 12360 + }, + { + "epoch": 0.0735143686364069, + "grad_norm": 2.04584002494812, + "learning_rate": 4.933632875010494e-05, + "loss": 4.6706, + "step": 12361 + }, + { + "epoch": 0.07352031591968788, + "grad_norm": 1.8598030805587769, + "learning_rate": 4.933622183337443e-05, + "loss": 4.6404, + "step": 12362 + }, + { + "epoch": 0.07352626320296889, + "grad_norm": 2.5650441646575928, + "learning_rate": 4.93361149081484e-05, + "loss": 5.382, + "step": 12363 + }, + { + "epoch": 0.07353221048624987, + "grad_norm": 2.1182446479797363, + "learning_rate": 4.933600797442688e-05, + "loss": 5.9041, + "step": 12364 + }, + { + "epoch": 0.07353815776953088, + "grad_norm": 1.8753353357315063, + "learning_rate": 4.933590103220991e-05, + "loss": 5.6615, + "step": 12365 + }, + { + "epoch": 0.07354410505281188, + "grad_norm": 1.9428893327713013, + "learning_rate": 4.933579408149752e-05, + "loss": 5.3549, + "step": 12366 + }, + { + "epoch": 0.07355005233609287, + "grad_norm": 1.809191346168518, + "learning_rate": 4.9335687122289766e-05, + "loss": 5.5603, + "step": 12367 + }, + { + "epoch": 0.07355599961937387, + "grad_norm": 1.7782649993896484, + "learning_rate": 4.933558015458667e-05, + "loss": 5.2848, + "step": 12368 + }, + { + "epoch": 0.07356194690265487, + "grad_norm": 1.71909499168396, + "learning_rate": 4.933547317838828e-05, + "loss": 5.3774, + "step": 12369 + }, + { + "epoch": 0.07356789418593586, + "grad_norm": 1.6399723291397095, + "learning_rate": 4.9335366193694625e-05, + "loss": 5.629, + "step": 12370 + }, + { + "epoch": 0.07357384146921686, + "grad_norm": 1.8646855354309082, + "learning_rate": 4.9335259200505746e-05, + "loss": 5.6297, + "step": 12371 + }, + { + "epoch": 0.07357978875249786, + "grad_norm": 1.5271104574203491, + "learning_rate": 4.9335152198821676e-05, + "loss": 5.6112, + "step": 12372 + }, + { + "epoch": 0.07358573603577885, + "grad_norm": 1.6217905282974243, + "learning_rate": 4.933504518864246e-05, + "loss": 5.2959, + "step": 12373 + }, + { + "epoch": 0.07359168331905985, + "grad_norm": 1.5774266719818115, + "learning_rate": 4.933493816996812e-05, + "loss": 5.4181, + "step": 12374 + }, + { + "epoch": 0.07359763060234085, + "grad_norm": 1.3641432523727417, + "learning_rate": 4.933483114279872e-05, + "loss": 5.3903, + "step": 12375 + }, + { + "epoch": 0.07360357788562184, + "grad_norm": 1.67635178565979, + "learning_rate": 4.933472410713428e-05, + "loss": 5.6771, + "step": 12376 + }, + { + "epoch": 0.07360952516890285, + "grad_norm": 1.6944624185562134, + "learning_rate": 4.933461706297483e-05, + "loss": 5.6008, + "step": 12377 + }, + { + "epoch": 0.07361547245218385, + "grad_norm": 1.3603699207305908, + "learning_rate": 4.933451001032042e-05, + "loss": 5.5396, + "step": 12378 + }, + { + "epoch": 0.07362141973546484, + "grad_norm": 1.6585369110107422, + "learning_rate": 4.9334402949171086e-05, + "loss": 5.5697, + "step": 12379 + }, + { + "epoch": 0.07362736701874584, + "grad_norm": 1.503786563873291, + "learning_rate": 4.9334295879526865e-05, + "loss": 5.4539, + "step": 12380 + }, + { + "epoch": 0.07363331430202684, + "grad_norm": 1.4761176109313965, + "learning_rate": 4.933418880138779e-05, + "loss": 5.4573, + "step": 12381 + }, + { + "epoch": 0.07363926158530783, + "grad_norm": 1.671972393989563, + "learning_rate": 4.93340817147539e-05, + "loss": 5.4143, + "step": 12382 + }, + { + "epoch": 0.07364520886858883, + "grad_norm": 1.5486379861831665, + "learning_rate": 4.9333974619625236e-05, + "loss": 5.4134, + "step": 12383 + }, + { + "epoch": 0.07365115615186983, + "grad_norm": 1.340108036994934, + "learning_rate": 4.933386751600183e-05, + "loss": 5.4587, + "step": 12384 + }, + { + "epoch": 0.07365710343515082, + "grad_norm": 1.3910952806472778, + "learning_rate": 4.933376040388372e-05, + "loss": 5.4129, + "step": 12385 + }, + { + "epoch": 0.07366305071843182, + "grad_norm": 1.5878056287765503, + "learning_rate": 4.9333653283270955e-05, + "loss": 5.3633, + "step": 12386 + }, + { + "epoch": 0.07366899800171282, + "grad_norm": 1.6040968894958496, + "learning_rate": 4.933354615416356e-05, + "loss": 5.2486, + "step": 12387 + }, + { + "epoch": 0.07367494528499381, + "grad_norm": 1.4824137687683105, + "learning_rate": 4.933343901656157e-05, + "loss": 5.2947, + "step": 12388 + }, + { + "epoch": 0.07368089256827481, + "grad_norm": 1.6114120483398438, + "learning_rate": 4.933333187046503e-05, + "loss": 5.2948, + "step": 12389 + }, + { + "epoch": 0.07368683985155582, + "grad_norm": 1.4269661903381348, + "learning_rate": 4.933322471587398e-05, + "loss": 5.1633, + "step": 12390 + }, + { + "epoch": 0.0736927871348368, + "grad_norm": 1.430588960647583, + "learning_rate": 4.933311755278844e-05, + "loss": 5.2846, + "step": 12391 + }, + { + "epoch": 0.0736987344181178, + "grad_norm": 1.3490641117095947, + "learning_rate": 4.9333010381208476e-05, + "loss": 5.2067, + "step": 12392 + }, + { + "epoch": 0.0737046817013988, + "grad_norm": 1.9292722940444946, + "learning_rate": 4.9332903201134104e-05, + "loss": 5.6196, + "step": 12393 + }, + { + "epoch": 0.0737106289846798, + "grad_norm": 1.8885586261749268, + "learning_rate": 4.933279601256536e-05, + "loss": 5.5225, + "step": 12394 + }, + { + "epoch": 0.0737165762679608, + "grad_norm": 1.5985313653945923, + "learning_rate": 4.93326888155023e-05, + "loss": 5.7447, + "step": 12395 + }, + { + "epoch": 0.07372252355124179, + "grad_norm": 2.819392681121826, + "learning_rate": 4.933258160994494e-05, + "loss": 6.002, + "step": 12396 + }, + { + "epoch": 0.07372847083452279, + "grad_norm": 2.006615161895752, + "learning_rate": 4.933247439589333e-05, + "loss": 5.7733, + "step": 12397 + }, + { + "epoch": 0.07373441811780379, + "grad_norm": 1.628408432006836, + "learning_rate": 4.933236717334751e-05, + "loss": 5.3899, + "step": 12398 + }, + { + "epoch": 0.07374036540108478, + "grad_norm": 1.5265247821807861, + "learning_rate": 4.93322599423075e-05, + "loss": 5.3891, + "step": 12399 + }, + { + "epoch": 0.07374631268436578, + "grad_norm": 1.6663800477981567, + "learning_rate": 4.933215270277336e-05, + "loss": 5.6172, + "step": 12400 + }, + { + "epoch": 0.07375225996764678, + "grad_norm": 1.7699551582336426, + "learning_rate": 4.933204545474511e-05, + "loss": 5.7088, + "step": 12401 + }, + { + "epoch": 0.07375820725092777, + "grad_norm": 1.5542314052581787, + "learning_rate": 4.93319381982228e-05, + "loss": 5.5925, + "step": 12402 + }, + { + "epoch": 0.07376415453420877, + "grad_norm": 1.5389710664749146, + "learning_rate": 4.933183093320646e-05, + "loss": 5.572, + "step": 12403 + }, + { + "epoch": 0.07377010181748977, + "grad_norm": 1.381242275238037, + "learning_rate": 4.9331723659696124e-05, + "loss": 5.4964, + "step": 12404 + }, + { + "epoch": 0.07377604910077076, + "grad_norm": 1.5536670684814453, + "learning_rate": 4.933161637769184e-05, + "loss": 5.3748, + "step": 12405 + }, + { + "epoch": 0.07378199638405177, + "grad_norm": 1.6656473875045776, + "learning_rate": 4.933150908719364e-05, + "loss": 5.3267, + "step": 12406 + }, + { + "epoch": 0.07378794366733277, + "grad_norm": 1.9200701713562012, + "learning_rate": 4.933140178820156e-05, + "loss": 5.2928, + "step": 12407 + }, + { + "epoch": 0.07379389095061376, + "grad_norm": 1.6290313005447388, + "learning_rate": 4.933129448071564e-05, + "loss": 5.4969, + "step": 12408 + }, + { + "epoch": 0.07379983823389476, + "grad_norm": 1.7247267961502075, + "learning_rate": 4.933118716473592e-05, + "loss": 5.564, + "step": 12409 + }, + { + "epoch": 0.07380578551717576, + "grad_norm": 1.4726417064666748, + "learning_rate": 4.933107984026243e-05, + "loss": 5.1759, + "step": 12410 + }, + { + "epoch": 0.07381173280045675, + "grad_norm": 1.4726674556732178, + "learning_rate": 4.933097250729522e-05, + "loss": 5.1731, + "step": 12411 + }, + { + "epoch": 0.07381768008373775, + "grad_norm": 1.4694938659667969, + "learning_rate": 4.93308651658343e-05, + "loss": 5.4539, + "step": 12412 + }, + { + "epoch": 0.07382362736701875, + "grad_norm": 1.5212653875350952, + "learning_rate": 4.9330757815879734e-05, + "loss": 5.5035, + "step": 12413 + }, + { + "epoch": 0.07382957465029974, + "grad_norm": 1.3731454610824585, + "learning_rate": 4.933065045743156e-05, + "loss": 5.415, + "step": 12414 + }, + { + "epoch": 0.07383552193358074, + "grad_norm": 1.5576610565185547, + "learning_rate": 4.93305430904898e-05, + "loss": 5.2776, + "step": 12415 + }, + { + "epoch": 0.07384146921686174, + "grad_norm": 1.72965407371521, + "learning_rate": 4.93304357150545e-05, + "loss": 5.3598, + "step": 12416 + }, + { + "epoch": 0.07384741650014273, + "grad_norm": 1.5218521356582642, + "learning_rate": 4.93303283311257e-05, + "loss": 5.295, + "step": 12417 + }, + { + "epoch": 0.07385336378342373, + "grad_norm": 1.5174230337142944, + "learning_rate": 4.933022093870343e-05, + "loss": 5.3506, + "step": 12418 + }, + { + "epoch": 0.07385931106670474, + "grad_norm": 1.3844187259674072, + "learning_rate": 4.933011353778773e-05, + "loss": 5.4345, + "step": 12419 + }, + { + "epoch": 0.07386525834998572, + "grad_norm": 1.5130188465118408, + "learning_rate": 4.9330006128378645e-05, + "loss": 5.4359, + "step": 12420 + }, + { + "epoch": 0.07387120563326673, + "grad_norm": 1.599004864692688, + "learning_rate": 4.93298987104762e-05, + "loss": 5.1631, + "step": 12421 + }, + { + "epoch": 0.07387715291654771, + "grad_norm": 1.6220343112945557, + "learning_rate": 4.932979128408044e-05, + "loss": 5.1244, + "step": 12422 + }, + { + "epoch": 0.07388310019982872, + "grad_norm": 1.5366616249084473, + "learning_rate": 4.93296838491914e-05, + "loss": 5.0368, + "step": 12423 + }, + { + "epoch": 0.07388904748310972, + "grad_norm": 1.5800726413726807, + "learning_rate": 4.932957640580912e-05, + "loss": 4.9906, + "step": 12424 + }, + { + "epoch": 0.0738949947663907, + "grad_norm": 1.6035537719726562, + "learning_rate": 4.9329468953933637e-05, + "loss": 5.0616, + "step": 12425 + }, + { + "epoch": 0.07390094204967171, + "grad_norm": 1.580127239227295, + "learning_rate": 4.932936149356499e-05, + "loss": 5.145, + "step": 12426 + }, + { + "epoch": 0.07390688933295271, + "grad_norm": 1.724788784980774, + "learning_rate": 4.932925402470321e-05, + "loss": 4.9589, + "step": 12427 + }, + { + "epoch": 0.0739128366162337, + "grad_norm": 1.5442367792129517, + "learning_rate": 4.932914654734834e-05, + "loss": 5.077, + "step": 12428 + }, + { + "epoch": 0.0739187838995147, + "grad_norm": 1.3692456483840942, + "learning_rate": 4.932903906150042e-05, + "loss": 5.1778, + "step": 12429 + }, + { + "epoch": 0.0739247311827957, + "grad_norm": 1.8229175806045532, + "learning_rate": 4.932893156715948e-05, + "loss": 5.4053, + "step": 12430 + }, + { + "epoch": 0.07393067846607669, + "grad_norm": 1.7769286632537842, + "learning_rate": 4.9328824064325566e-05, + "loss": 5.2541, + "step": 12431 + }, + { + "epoch": 0.07393662574935769, + "grad_norm": 1.7022631168365479, + "learning_rate": 4.93287165529987e-05, + "loss": 4.8555, + "step": 12432 + }, + { + "epoch": 0.0739425730326387, + "grad_norm": 1.5031015872955322, + "learning_rate": 4.932860903317894e-05, + "loss": 5.019, + "step": 12433 + }, + { + "epoch": 0.07394852031591968, + "grad_norm": 1.352550983428955, + "learning_rate": 4.932850150486631e-05, + "loss": 5.239, + "step": 12434 + }, + { + "epoch": 0.07395446759920069, + "grad_norm": 1.5571177005767822, + "learning_rate": 4.932839396806085e-05, + "loss": 5.2511, + "step": 12435 + }, + { + "epoch": 0.07396041488248169, + "grad_norm": 1.7673511505126953, + "learning_rate": 4.93282864227626e-05, + "loss": 5.1811, + "step": 12436 + }, + { + "epoch": 0.07396636216576268, + "grad_norm": 1.6385267972946167, + "learning_rate": 4.932817886897161e-05, + "loss": 5.1644, + "step": 12437 + }, + { + "epoch": 0.07397230944904368, + "grad_norm": 1.6142395734786987, + "learning_rate": 4.932807130668788e-05, + "loss": 5.173, + "step": 12438 + }, + { + "epoch": 0.07397825673232468, + "grad_norm": 1.6966745853424072, + "learning_rate": 4.932796373591149e-05, + "loss": 5.1495, + "step": 12439 + }, + { + "epoch": 0.07398420401560567, + "grad_norm": 1.6631567478179932, + "learning_rate": 4.932785615664245e-05, + "loss": 5.1787, + "step": 12440 + }, + { + "epoch": 0.07399015129888667, + "grad_norm": 1.7747845649719238, + "learning_rate": 4.9327748568880816e-05, + "loss": 5.1303, + "step": 12441 + }, + { + "epoch": 0.07399609858216767, + "grad_norm": 1.457535982131958, + "learning_rate": 4.932764097262661e-05, + "loss": 5.1573, + "step": 12442 + }, + { + "epoch": 0.07400204586544866, + "grad_norm": 1.602452039718628, + "learning_rate": 4.9327533367879875e-05, + "loss": 5.1039, + "step": 12443 + }, + { + "epoch": 0.07400799314872966, + "grad_norm": 1.644687294960022, + "learning_rate": 4.932742575464065e-05, + "loss": 5.3112, + "step": 12444 + }, + { + "epoch": 0.07401394043201066, + "grad_norm": 1.5873420238494873, + "learning_rate": 4.932731813290897e-05, + "loss": 5.1128, + "step": 12445 + }, + { + "epoch": 0.07401988771529165, + "grad_norm": 1.8046668767929077, + "learning_rate": 4.932721050268489e-05, + "loss": 4.9776, + "step": 12446 + }, + { + "epoch": 0.07402583499857265, + "grad_norm": 1.6964846849441528, + "learning_rate": 4.932710286396841e-05, + "loss": 5.0039, + "step": 12447 + }, + { + "epoch": 0.07403178228185366, + "grad_norm": 1.5332229137420654, + "learning_rate": 4.93269952167596e-05, + "loss": 4.9873, + "step": 12448 + }, + { + "epoch": 0.07403772956513464, + "grad_norm": 1.6128625869750977, + "learning_rate": 4.9326887561058485e-05, + "loss": 5.1139, + "step": 12449 + }, + { + "epoch": 0.07404367684841565, + "grad_norm": 1.5800291299819946, + "learning_rate": 4.932677989686511e-05, + "loss": 4.9687, + "step": 12450 + }, + { + "epoch": 0.07404962413169663, + "grad_norm": 1.6543092727661133, + "learning_rate": 4.932667222417951e-05, + "loss": 4.8345, + "step": 12451 + }, + { + "epoch": 0.07405557141497764, + "grad_norm": 1.4438380002975464, + "learning_rate": 4.932656454300171e-05, + "loss": 4.9677, + "step": 12452 + }, + { + "epoch": 0.07406151869825864, + "grad_norm": 1.6437597274780273, + "learning_rate": 4.932645685333176e-05, + "loss": 4.9016, + "step": 12453 + }, + { + "epoch": 0.07406746598153963, + "grad_norm": 1.5359379053115845, + "learning_rate": 4.932634915516969e-05, + "loss": 4.8357, + "step": 12454 + }, + { + "epoch": 0.07407341326482063, + "grad_norm": 1.6683440208435059, + "learning_rate": 4.9326241448515554e-05, + "loss": 4.8715, + "step": 12455 + }, + { + "epoch": 0.07407936054810163, + "grad_norm": 1.5654494762420654, + "learning_rate": 4.932613373336937e-05, + "loss": 4.8993, + "step": 12456 + }, + { + "epoch": 0.07408530783138262, + "grad_norm": 1.5333384275436401, + "learning_rate": 4.932602600973119e-05, + "loss": 4.9181, + "step": 12457 + }, + { + "epoch": 0.07409125511466362, + "grad_norm": 1.5674177408218384, + "learning_rate": 4.9325918277601046e-05, + "loss": 4.905, + "step": 12458 + }, + { + "epoch": 0.07409720239794462, + "grad_norm": 1.410294771194458, + "learning_rate": 4.9325810536978965e-05, + "loss": 4.8645, + "step": 12459 + }, + { + "epoch": 0.07410314968122561, + "grad_norm": 1.4950916767120361, + "learning_rate": 4.9325702787865006e-05, + "loss": 4.8289, + "step": 12460 + }, + { + "epoch": 0.07410909696450661, + "grad_norm": 1.7529935836791992, + "learning_rate": 4.9325595030259195e-05, + "loss": 4.8917, + "step": 12461 + }, + { + "epoch": 0.07411504424778761, + "grad_norm": 3.5575430393218994, + "learning_rate": 4.932548726416157e-05, + "loss": 5.5795, + "step": 12462 + }, + { + "epoch": 0.0741209915310686, + "grad_norm": 1.5091896057128906, + "learning_rate": 4.9325379489572165e-05, + "loss": 4.9864, + "step": 12463 + }, + { + "epoch": 0.0741269388143496, + "grad_norm": 1.6818382740020752, + "learning_rate": 4.932527170649102e-05, + "loss": 5.3386, + "step": 12464 + }, + { + "epoch": 0.07413288609763061, + "grad_norm": 1.7938569784164429, + "learning_rate": 4.932516391491818e-05, + "loss": 5.2668, + "step": 12465 + }, + { + "epoch": 0.0741388333809116, + "grad_norm": 1.89009428024292, + "learning_rate": 4.932505611485367e-05, + "loss": 5.1755, + "step": 12466 + }, + { + "epoch": 0.0741447806641926, + "grad_norm": 1.5277502536773682, + "learning_rate": 4.932494830629753e-05, + "loss": 5.3271, + "step": 12467 + }, + { + "epoch": 0.0741507279474736, + "grad_norm": 1.7720823287963867, + "learning_rate": 4.932484048924981e-05, + "loss": 5.7089, + "step": 12468 + }, + { + "epoch": 0.07415667523075459, + "grad_norm": 1.6797159910202026, + "learning_rate": 4.932473266371054e-05, + "loss": 5.5563, + "step": 12469 + }, + { + "epoch": 0.07416262251403559, + "grad_norm": 1.6536195278167725, + "learning_rate": 4.932462482967976e-05, + "loss": 5.4271, + "step": 12470 + }, + { + "epoch": 0.07416856979731659, + "grad_norm": 1.5667130947113037, + "learning_rate": 4.93245169871575e-05, + "loss": 5.3703, + "step": 12471 + }, + { + "epoch": 0.07417451708059758, + "grad_norm": 1.3659738302230835, + "learning_rate": 4.93244091361438e-05, + "loss": 5.4114, + "step": 12472 + }, + { + "epoch": 0.07418046436387858, + "grad_norm": 1.5106414556503296, + "learning_rate": 4.9324301276638705e-05, + "loss": 5.386, + "step": 12473 + }, + { + "epoch": 0.07418641164715958, + "grad_norm": 1.5054755210876465, + "learning_rate": 4.932419340864225e-05, + "loss": 5.3067, + "step": 12474 + }, + { + "epoch": 0.07419235893044057, + "grad_norm": 1.4413330554962158, + "learning_rate": 4.932408553215446e-05, + "loss": 5.358, + "step": 12475 + }, + { + "epoch": 0.07419830621372157, + "grad_norm": 1.3034652471542358, + "learning_rate": 4.932397764717539e-05, + "loss": 5.2942, + "step": 12476 + }, + { + "epoch": 0.07420425349700258, + "grad_norm": 1.494664192199707, + "learning_rate": 4.9323869753705074e-05, + "loss": 5.4243, + "step": 12477 + }, + { + "epoch": 0.07421020078028356, + "grad_norm": 1.2644178867340088, + "learning_rate": 4.932376185174354e-05, + "loss": 5.2212, + "step": 12478 + }, + { + "epoch": 0.07421614806356457, + "grad_norm": 1.5576590299606323, + "learning_rate": 4.9323653941290836e-05, + "loss": 5.2077, + "step": 12479 + }, + { + "epoch": 0.07422209534684555, + "grad_norm": 1.5699479579925537, + "learning_rate": 4.932354602234699e-05, + "loss": 5.3849, + "step": 12480 + }, + { + "epoch": 0.07422804263012656, + "grad_norm": 1.6582329273223877, + "learning_rate": 4.932343809491205e-05, + "loss": 5.3961, + "step": 12481 + }, + { + "epoch": 0.07423398991340756, + "grad_norm": 1.6159483194351196, + "learning_rate": 4.932333015898605e-05, + "loss": 5.3711, + "step": 12482 + }, + { + "epoch": 0.07423993719668855, + "grad_norm": 1.453933596611023, + "learning_rate": 4.932322221456902e-05, + "loss": 5.2899, + "step": 12483 + }, + { + "epoch": 0.07424588447996955, + "grad_norm": 1.3830047845840454, + "learning_rate": 4.9323114261661014e-05, + "loss": 5.3839, + "step": 12484 + }, + { + "epoch": 0.07425183176325055, + "grad_norm": 1.5541338920593262, + "learning_rate": 4.932300630026205e-05, + "loss": 5.257, + "step": 12485 + }, + { + "epoch": 0.07425777904653154, + "grad_norm": 1.5887267589569092, + "learning_rate": 4.932289833037219e-05, + "loss": 5.2079, + "step": 12486 + }, + { + "epoch": 0.07426372632981254, + "grad_norm": 1.6341818571090698, + "learning_rate": 4.932279035199144e-05, + "loss": 5.2529, + "step": 12487 + }, + { + "epoch": 0.07426967361309354, + "grad_norm": 1.5520392656326294, + "learning_rate": 4.9322682365119866e-05, + "loss": 5.2416, + "step": 12488 + }, + { + "epoch": 0.07427562089637453, + "grad_norm": 1.610711693763733, + "learning_rate": 4.93225743697575e-05, + "loss": 5.3172, + "step": 12489 + }, + { + "epoch": 0.07428156817965553, + "grad_norm": 1.5997258424758911, + "learning_rate": 4.932246636590436e-05, + "loss": 5.2343, + "step": 12490 + }, + { + "epoch": 0.07428751546293653, + "grad_norm": 1.5319284200668335, + "learning_rate": 4.932235835356051e-05, + "loss": 5.2021, + "step": 12491 + }, + { + "epoch": 0.07429346274621752, + "grad_norm": 1.6516488790512085, + "learning_rate": 4.932225033272597e-05, + "loss": 5.2678, + "step": 12492 + }, + { + "epoch": 0.07429941002949852, + "grad_norm": 1.9008166790008545, + "learning_rate": 4.9322142303400786e-05, + "loss": 5.1424, + "step": 12493 + }, + { + "epoch": 0.07430535731277953, + "grad_norm": 1.8372108936309814, + "learning_rate": 4.932203426558499e-05, + "loss": 5.321, + "step": 12494 + }, + { + "epoch": 0.07431130459606052, + "grad_norm": 1.4764071702957153, + "learning_rate": 4.932192621927863e-05, + "loss": 5.3627, + "step": 12495 + }, + { + "epoch": 0.07431725187934152, + "grad_norm": 1.6356589794158936, + "learning_rate": 4.932181816448173e-05, + "loss": 5.2061, + "step": 12496 + }, + { + "epoch": 0.07432319916262252, + "grad_norm": 1.6335545778274536, + "learning_rate": 4.932171010119434e-05, + "loss": 5.2283, + "step": 12497 + }, + { + "epoch": 0.07432914644590351, + "grad_norm": 1.499968409538269, + "learning_rate": 4.932160202941649e-05, + "loss": 5.4862, + "step": 12498 + }, + { + "epoch": 0.07433509372918451, + "grad_norm": 1.7292691469192505, + "learning_rate": 4.932149394914822e-05, + "loss": 5.4055, + "step": 12499 + }, + { + "epoch": 0.07434104101246551, + "grad_norm": 1.6818633079528809, + "learning_rate": 4.932138586038957e-05, + "loss": 5.5262, + "step": 12500 + }, + { + "epoch": 0.0743469882957465, + "grad_norm": 1.4048001766204834, + "learning_rate": 4.932127776314057e-05, + "loss": 5.1876, + "step": 12501 + }, + { + "epoch": 0.0743529355790275, + "grad_norm": 1.6041479110717773, + "learning_rate": 4.9321169657401264e-05, + "loss": 5.0791, + "step": 12502 + }, + { + "epoch": 0.0743588828623085, + "grad_norm": 1.3542897701263428, + "learning_rate": 4.932106154317169e-05, + "loss": 5.189, + "step": 12503 + }, + { + "epoch": 0.07436483014558949, + "grad_norm": 1.7782005071640015, + "learning_rate": 4.932095342045189e-05, + "loss": 5.2823, + "step": 12504 + }, + { + "epoch": 0.0743707774288705, + "grad_norm": 1.5981978178024292, + "learning_rate": 4.932084528924189e-05, + "loss": 5.3978, + "step": 12505 + }, + { + "epoch": 0.0743767247121515, + "grad_norm": 1.5224134922027588, + "learning_rate": 4.9320737149541734e-05, + "loss": 5.336, + "step": 12506 + }, + { + "epoch": 0.07438267199543248, + "grad_norm": 1.4827311038970947, + "learning_rate": 4.932062900135147e-05, + "loss": 5.2284, + "step": 12507 + }, + { + "epoch": 0.07438861927871349, + "grad_norm": 1.4394789934158325, + "learning_rate": 4.932052084467111e-05, + "loss": 5.1672, + "step": 12508 + }, + { + "epoch": 0.07439456656199447, + "grad_norm": 1.5112950801849365, + "learning_rate": 4.9320412679500715e-05, + "loss": 5.4069, + "step": 12509 + }, + { + "epoch": 0.07440051384527548, + "grad_norm": 1.4547615051269531, + "learning_rate": 4.932030450584032e-05, + "loss": 5.3317, + "step": 12510 + }, + { + "epoch": 0.07440646112855648, + "grad_norm": 1.5839279890060425, + "learning_rate": 4.9320196323689946e-05, + "loss": 5.2042, + "step": 12511 + }, + { + "epoch": 0.07441240841183747, + "grad_norm": 1.6392362117767334, + "learning_rate": 4.9320088133049655e-05, + "loss": 5.2595, + "step": 12512 + }, + { + "epoch": 0.07441835569511847, + "grad_norm": 1.530236840248108, + "learning_rate": 4.931997993391947e-05, + "loss": 5.4417, + "step": 12513 + }, + { + "epoch": 0.07442430297839947, + "grad_norm": 1.7665959596633911, + "learning_rate": 4.931987172629943e-05, + "loss": 5.5164, + "step": 12514 + }, + { + "epoch": 0.07443025026168046, + "grad_norm": 1.5256375074386597, + "learning_rate": 4.931976351018957e-05, + "loss": 5.3645, + "step": 12515 + }, + { + "epoch": 0.07443619754496146, + "grad_norm": 1.5948551893234253, + "learning_rate": 4.9319655285589937e-05, + "loss": 5.1964, + "step": 12516 + }, + { + "epoch": 0.07444214482824246, + "grad_norm": 1.451249361038208, + "learning_rate": 4.931954705250056e-05, + "loss": 5.3043, + "step": 12517 + }, + { + "epoch": 0.07444809211152345, + "grad_norm": 1.5874381065368652, + "learning_rate": 4.931943881092148e-05, + "loss": 5.3769, + "step": 12518 + }, + { + "epoch": 0.07445403939480445, + "grad_norm": 1.597102165222168, + "learning_rate": 4.931933056085274e-05, + "loss": 5.2909, + "step": 12519 + }, + { + "epoch": 0.07445998667808545, + "grad_norm": 1.3787156343460083, + "learning_rate": 4.9319222302294364e-05, + "loss": 5.5499, + "step": 12520 + }, + { + "epoch": 0.07446593396136644, + "grad_norm": 1.5816805362701416, + "learning_rate": 4.931911403524641e-05, + "loss": 5.255, + "step": 12521 + }, + { + "epoch": 0.07447188124464744, + "grad_norm": 1.636619210243225, + "learning_rate": 4.93190057597089e-05, + "loss": 5.3816, + "step": 12522 + }, + { + "epoch": 0.07447782852792845, + "grad_norm": 1.518872857093811, + "learning_rate": 4.931889747568187e-05, + "loss": 5.3376, + "step": 12523 + }, + { + "epoch": 0.07448377581120944, + "grad_norm": 1.9586291313171387, + "learning_rate": 4.931878918316537e-05, + "loss": 5.6678, + "step": 12524 + }, + { + "epoch": 0.07448972309449044, + "grad_norm": 1.5893887281417847, + "learning_rate": 4.9318680882159435e-05, + "loss": 5.266, + "step": 12525 + }, + { + "epoch": 0.07449567037777144, + "grad_norm": 1.5339915752410889, + "learning_rate": 4.93185725726641e-05, + "loss": 5.1891, + "step": 12526 + }, + { + "epoch": 0.07450161766105243, + "grad_norm": 1.730128288269043, + "learning_rate": 4.9318464254679396e-05, + "loss": 5.1534, + "step": 12527 + }, + { + "epoch": 0.07450756494433343, + "grad_norm": 1.691015362739563, + "learning_rate": 4.931835592820537e-05, + "loss": 5.2599, + "step": 12528 + }, + { + "epoch": 0.07451351222761443, + "grad_norm": 1.2936137914657593, + "learning_rate": 4.9318247593242056e-05, + "loss": 5.2432, + "step": 12529 + }, + { + "epoch": 0.07451945951089542, + "grad_norm": 1.4507200717926025, + "learning_rate": 4.93181392497895e-05, + "loss": 5.1539, + "step": 12530 + }, + { + "epoch": 0.07452540679417642, + "grad_norm": 1.6212667226791382, + "learning_rate": 4.931803089784772e-05, + "loss": 5.1212, + "step": 12531 + }, + { + "epoch": 0.07453135407745742, + "grad_norm": 1.48690927028656, + "learning_rate": 4.9317922537416775e-05, + "loss": 5.168, + "step": 12532 + }, + { + "epoch": 0.07453730136073841, + "grad_norm": 1.5102870464324951, + "learning_rate": 4.931781416849669e-05, + "loss": 5.2024, + "step": 12533 + }, + { + "epoch": 0.07454324864401941, + "grad_norm": 1.4186264276504517, + "learning_rate": 4.9317705791087516e-05, + "loss": 5.1154, + "step": 12534 + }, + { + "epoch": 0.07454919592730042, + "grad_norm": 1.623822569847107, + "learning_rate": 4.931759740518928e-05, + "loss": 5.0244, + "step": 12535 + }, + { + "epoch": 0.0745551432105814, + "grad_norm": 1.4694246053695679, + "learning_rate": 4.9317489010802015e-05, + "loss": 5.1737, + "step": 12536 + }, + { + "epoch": 0.0745610904938624, + "grad_norm": 1.553551435470581, + "learning_rate": 4.931738060792577e-05, + "loss": 5.1339, + "step": 12537 + }, + { + "epoch": 0.0745670377771434, + "grad_norm": 1.744367003440857, + "learning_rate": 4.9317272196560575e-05, + "loss": 5.1564, + "step": 12538 + }, + { + "epoch": 0.0745729850604244, + "grad_norm": 1.6584309339523315, + "learning_rate": 4.931716377670648e-05, + "loss": 5.1871, + "step": 12539 + }, + { + "epoch": 0.0745789323437054, + "grad_norm": 1.6894947290420532, + "learning_rate": 4.931705534836351e-05, + "loss": 5.1432, + "step": 12540 + }, + { + "epoch": 0.07458487962698639, + "grad_norm": 1.467315912246704, + "learning_rate": 4.93169469115317e-05, + "loss": 5.2072, + "step": 12541 + }, + { + "epoch": 0.07459082691026739, + "grad_norm": 1.478841781616211, + "learning_rate": 4.93168384662111e-05, + "loss": 5.3644, + "step": 12542 + }, + { + "epoch": 0.07459677419354839, + "grad_norm": 1.6001938581466675, + "learning_rate": 4.9316730012401745e-05, + "loss": 5.2031, + "step": 12543 + }, + { + "epoch": 0.07460272147682938, + "grad_norm": 1.480236530303955, + "learning_rate": 4.931662155010367e-05, + "loss": 5.0113, + "step": 12544 + }, + { + "epoch": 0.07460866876011038, + "grad_norm": 1.490511178970337, + "learning_rate": 4.9316513079316914e-05, + "loss": 5.0416, + "step": 12545 + }, + { + "epoch": 0.07461461604339138, + "grad_norm": 1.7327873706817627, + "learning_rate": 4.931640460004152e-05, + "loss": 5.0578, + "step": 12546 + }, + { + "epoch": 0.07462056332667237, + "grad_norm": 1.6410421133041382, + "learning_rate": 4.9316296112277514e-05, + "loss": 5.0239, + "step": 12547 + }, + { + "epoch": 0.07462651060995337, + "grad_norm": 1.5255141258239746, + "learning_rate": 4.9316187616024936e-05, + "loss": 5.1592, + "step": 12548 + }, + { + "epoch": 0.07463245789323437, + "grad_norm": 1.5555649995803833, + "learning_rate": 4.9316079111283835e-05, + "loss": 5.3981, + "step": 12549 + }, + { + "epoch": 0.07463840517651536, + "grad_norm": 1.4196929931640625, + "learning_rate": 4.931597059805424e-05, + "loss": 5.0682, + "step": 12550 + }, + { + "epoch": 0.07464435245979636, + "grad_norm": 1.562338948249817, + "learning_rate": 4.93158620763362e-05, + "loss": 5.3551, + "step": 12551 + }, + { + "epoch": 0.07465029974307737, + "grad_norm": 1.5955942869186401, + "learning_rate": 4.931575354612973e-05, + "loss": 5.3108, + "step": 12552 + }, + { + "epoch": 0.07465624702635835, + "grad_norm": 1.4173908233642578, + "learning_rate": 4.9315645007434885e-05, + "loss": 5.3793, + "step": 12553 + }, + { + "epoch": 0.07466219430963936, + "grad_norm": 1.4075239896774292, + "learning_rate": 4.93155364602517e-05, + "loss": 5.4409, + "step": 12554 + }, + { + "epoch": 0.07466814159292036, + "grad_norm": 1.3041841983795166, + "learning_rate": 4.9315427904580216e-05, + "loss": 5.5285, + "step": 12555 + }, + { + "epoch": 0.07467408887620135, + "grad_norm": 1.4277441501617432, + "learning_rate": 4.9315319340420465e-05, + "loss": 5.5017, + "step": 12556 + }, + { + "epoch": 0.07468003615948235, + "grad_norm": 1.407895803451538, + "learning_rate": 4.931521076777248e-05, + "loss": 5.3675, + "step": 12557 + }, + { + "epoch": 0.07468598344276335, + "grad_norm": 1.429131031036377, + "learning_rate": 4.931510218663632e-05, + "loss": 5.3712, + "step": 12558 + }, + { + "epoch": 0.07469193072604434, + "grad_norm": 1.7229793071746826, + "learning_rate": 4.9314993597011995e-05, + "loss": 5.4513, + "step": 12559 + }, + { + "epoch": 0.07469787800932534, + "grad_norm": 1.5961774587631226, + "learning_rate": 4.9314884998899565e-05, + "loss": 5.5478, + "step": 12560 + }, + { + "epoch": 0.07470382529260634, + "grad_norm": 1.4570807218551636, + "learning_rate": 4.931477639229906e-05, + "loss": 5.3973, + "step": 12561 + }, + { + "epoch": 0.07470977257588733, + "grad_norm": 1.6308903694152832, + "learning_rate": 4.931466777721052e-05, + "loss": 5.1951, + "step": 12562 + }, + { + "epoch": 0.07471571985916833, + "grad_norm": 1.438491940498352, + "learning_rate": 4.9314559153633974e-05, + "loss": 5.4237, + "step": 12563 + }, + { + "epoch": 0.07472166714244934, + "grad_norm": 1.7219120264053345, + "learning_rate": 4.931445052156947e-05, + "loss": 5.2303, + "step": 12564 + }, + { + "epoch": 0.07472761442573032, + "grad_norm": 1.557895302772522, + "learning_rate": 4.931434188101704e-05, + "loss": 5.2383, + "step": 12565 + }, + { + "epoch": 0.07473356170901133, + "grad_norm": 1.3585479259490967, + "learning_rate": 4.931423323197672e-05, + "loss": 5.2698, + "step": 12566 + }, + { + "epoch": 0.07473950899229233, + "grad_norm": 1.643608808517456, + "learning_rate": 4.931412457444857e-05, + "loss": 5.285, + "step": 12567 + }, + { + "epoch": 0.07474545627557332, + "grad_norm": 1.7847453355789185, + "learning_rate": 4.93140159084326e-05, + "loss": 5.413, + "step": 12568 + }, + { + "epoch": 0.07475140355885432, + "grad_norm": 1.5010985136032104, + "learning_rate": 4.931390723392886e-05, + "loss": 5.3665, + "step": 12569 + }, + { + "epoch": 0.0747573508421353, + "grad_norm": 1.3640403747558594, + "learning_rate": 4.931379855093738e-05, + "loss": 5.2253, + "step": 12570 + }, + { + "epoch": 0.07476329812541631, + "grad_norm": 1.4886012077331543, + "learning_rate": 4.9313689859458214e-05, + "loss": 5.5954, + "step": 12571 + }, + { + "epoch": 0.07476924540869731, + "grad_norm": 1.6626142263412476, + "learning_rate": 4.931358115949138e-05, + "loss": 5.3558, + "step": 12572 + }, + { + "epoch": 0.0747751926919783, + "grad_norm": 1.6350460052490234, + "learning_rate": 4.931347245103693e-05, + "loss": 5.3222, + "step": 12573 + }, + { + "epoch": 0.0747811399752593, + "grad_norm": 1.586182951927185, + "learning_rate": 4.93133637340949e-05, + "loss": 5.2056, + "step": 12574 + }, + { + "epoch": 0.0747870872585403, + "grad_norm": 1.6866692304611206, + "learning_rate": 4.931325500866532e-05, + "loss": 5.2698, + "step": 12575 + }, + { + "epoch": 0.07479303454182129, + "grad_norm": 1.4165509939193726, + "learning_rate": 4.9313146274748235e-05, + "loss": 5.2572, + "step": 12576 + }, + { + "epoch": 0.07479898182510229, + "grad_norm": 1.6259573698043823, + "learning_rate": 4.931303753234369e-05, + "loss": 5.2585, + "step": 12577 + }, + { + "epoch": 0.0748049291083833, + "grad_norm": 1.4159972667694092, + "learning_rate": 4.931292878145171e-05, + "loss": 5.1748, + "step": 12578 + }, + { + "epoch": 0.07481087639166428, + "grad_norm": 1.3880494832992554, + "learning_rate": 4.931282002207234e-05, + "loss": 5.2181, + "step": 12579 + }, + { + "epoch": 0.07481682367494528, + "grad_norm": 1.4466285705566406, + "learning_rate": 4.931271125420561e-05, + "loss": 5.2041, + "step": 12580 + }, + { + "epoch": 0.07482277095822629, + "grad_norm": 1.5111972093582153, + "learning_rate": 4.931260247785157e-05, + "loss": 5.2388, + "step": 12581 + }, + { + "epoch": 0.07482871824150727, + "grad_norm": 1.368296504020691, + "learning_rate": 4.9312493693010245e-05, + "loss": 5.0964, + "step": 12582 + }, + { + "epoch": 0.07483466552478828, + "grad_norm": 1.5604379177093506, + "learning_rate": 4.931238489968168e-05, + "loss": 5.2031, + "step": 12583 + }, + { + "epoch": 0.07484061280806928, + "grad_norm": 1.6104371547698975, + "learning_rate": 4.9312276097865916e-05, + "loss": 5.1122, + "step": 12584 + }, + { + "epoch": 0.07484656009135027, + "grad_norm": 1.5082486867904663, + "learning_rate": 4.931216728756299e-05, + "loss": 5.2092, + "step": 12585 + }, + { + "epoch": 0.07485250737463127, + "grad_norm": 2.1802000999450684, + "learning_rate": 4.931205846877293e-05, + "loss": 5.859, + "step": 12586 + }, + { + "epoch": 0.07485845465791227, + "grad_norm": 1.7069321870803833, + "learning_rate": 4.931194964149579e-05, + "loss": 4.9751, + "step": 12587 + }, + { + "epoch": 0.07486440194119326, + "grad_norm": 1.3614740371704102, + "learning_rate": 4.931184080573159e-05, + "loss": 5.2341, + "step": 12588 + }, + { + "epoch": 0.07487034922447426, + "grad_norm": 1.3952617645263672, + "learning_rate": 4.931173196148039e-05, + "loss": 5.0472, + "step": 12589 + }, + { + "epoch": 0.07487629650775526, + "grad_norm": 1.435829758644104, + "learning_rate": 4.9311623108742205e-05, + "loss": 5.0165, + "step": 12590 + }, + { + "epoch": 0.07488224379103625, + "grad_norm": 1.3875840902328491, + "learning_rate": 4.931151424751709e-05, + "loss": 5.5455, + "step": 12591 + }, + { + "epoch": 0.07488819107431725, + "grad_norm": 1.4364032745361328, + "learning_rate": 4.931140537780508e-05, + "loss": 5.5106, + "step": 12592 + }, + { + "epoch": 0.07489413835759826, + "grad_norm": 1.5878878831863403, + "learning_rate": 4.9311296499606194e-05, + "loss": 5.2372, + "step": 12593 + }, + { + "epoch": 0.07490008564087924, + "grad_norm": 1.5724025964736938, + "learning_rate": 4.9311187612920495e-05, + "loss": 5.3771, + "step": 12594 + }, + { + "epoch": 0.07490603292416025, + "grad_norm": 1.4630738496780396, + "learning_rate": 4.9311078717748014e-05, + "loss": 5.3378, + "step": 12595 + }, + { + "epoch": 0.07491198020744125, + "grad_norm": 1.4438437223434448, + "learning_rate": 4.931096981408878e-05, + "loss": 5.3019, + "step": 12596 + }, + { + "epoch": 0.07491792749072224, + "grad_norm": 1.674564242362976, + "learning_rate": 4.931086090194285e-05, + "loss": 5.2957, + "step": 12597 + }, + { + "epoch": 0.07492387477400324, + "grad_norm": 1.237748384475708, + "learning_rate": 4.9310751981310236e-05, + "loss": 5.1994, + "step": 12598 + }, + { + "epoch": 0.07492982205728423, + "grad_norm": 1.5828932523727417, + "learning_rate": 4.9310643052191e-05, + "loss": 5.2326, + "step": 12599 + }, + { + "epoch": 0.07493576934056523, + "grad_norm": 1.2774053812026978, + "learning_rate": 4.931053411458516e-05, + "loss": 5.2496, + "step": 12600 + }, + { + "epoch": 0.07494171662384623, + "grad_norm": 1.2986499071121216, + "learning_rate": 4.9310425168492766e-05, + "loss": 5.3061, + "step": 12601 + }, + { + "epoch": 0.07494766390712722, + "grad_norm": 1.3973673582077026, + "learning_rate": 4.931031621391386e-05, + "loss": 5.1437, + "step": 12602 + }, + { + "epoch": 0.07495361119040822, + "grad_norm": 1.4217787981033325, + "learning_rate": 4.9310207250848475e-05, + "loss": 5.1636, + "step": 12603 + }, + { + "epoch": 0.07495955847368922, + "grad_norm": 1.5062726736068726, + "learning_rate": 4.9310098279296634e-05, + "loss": 5.2944, + "step": 12604 + }, + { + "epoch": 0.07496550575697021, + "grad_norm": 1.4844671487808228, + "learning_rate": 4.9309989299258404e-05, + "loss": 5.1899, + "step": 12605 + }, + { + "epoch": 0.07497145304025121, + "grad_norm": 1.3542430400848389, + "learning_rate": 4.9309880310733805e-05, + "loss": 5.1636, + "step": 12606 + }, + { + "epoch": 0.07497740032353221, + "grad_norm": 1.58526611328125, + "learning_rate": 4.930977131372287e-05, + "loss": 5.5748, + "step": 12607 + }, + { + "epoch": 0.0749833476068132, + "grad_norm": 1.6003972291946411, + "learning_rate": 4.930966230822564e-05, + "loss": 5.3992, + "step": 12608 + }, + { + "epoch": 0.0749892948900942, + "grad_norm": 1.6475237607955933, + "learning_rate": 4.930955329424218e-05, + "loss": 5.4515, + "step": 12609 + }, + { + "epoch": 0.0749952421733752, + "grad_norm": 1.5395694971084595, + "learning_rate": 4.9309444271772486e-05, + "loss": 5.5117, + "step": 12610 + }, + { + "epoch": 0.0750011894566562, + "grad_norm": 1.3863389492034912, + "learning_rate": 4.930933524081663e-05, + "loss": 5.5771, + "step": 12611 + }, + { + "epoch": 0.0750071367399372, + "grad_norm": 1.431830644607544, + "learning_rate": 4.9309226201374626e-05, + "loss": 5.412, + "step": 12612 + }, + { + "epoch": 0.0750130840232182, + "grad_norm": 1.4647631645202637, + "learning_rate": 4.930911715344653e-05, + "loss": 5.1849, + "step": 12613 + }, + { + "epoch": 0.07501903130649919, + "grad_norm": 2.126068592071533, + "learning_rate": 4.930900809703237e-05, + "loss": 5.1712, + "step": 12614 + }, + { + "epoch": 0.07502497858978019, + "grad_norm": 1.3078912496566772, + "learning_rate": 4.9308899032132183e-05, + "loss": 5.3937, + "step": 12615 + }, + { + "epoch": 0.07503092587306119, + "grad_norm": 1.2535938024520874, + "learning_rate": 4.9308789958746016e-05, + "loss": 5.5708, + "step": 12616 + }, + { + "epoch": 0.07503687315634218, + "grad_norm": 1.3942710161209106, + "learning_rate": 4.9308680876873894e-05, + "loss": 5.5907, + "step": 12617 + }, + { + "epoch": 0.07504282043962318, + "grad_norm": 1.3061814308166504, + "learning_rate": 4.930857178651587e-05, + "loss": 5.2515, + "step": 12618 + }, + { + "epoch": 0.07504876772290418, + "grad_norm": 1.8493753671646118, + "learning_rate": 4.930846268767197e-05, + "loss": 4.9958, + "step": 12619 + }, + { + "epoch": 0.07505471500618517, + "grad_norm": 1.5966380834579468, + "learning_rate": 4.9308353580342234e-05, + "loss": 4.8784, + "step": 12620 + }, + { + "epoch": 0.07506066228946617, + "grad_norm": 1.6849051713943481, + "learning_rate": 4.930824446452671e-05, + "loss": 5.1549, + "step": 12621 + }, + { + "epoch": 0.07506660957274718, + "grad_norm": 1.5844405889511108, + "learning_rate": 4.9308135340225426e-05, + "loss": 4.9807, + "step": 12622 + }, + { + "epoch": 0.07507255685602816, + "grad_norm": 1.520621418952942, + "learning_rate": 4.9308026207438424e-05, + "loss": 5.2237, + "step": 12623 + }, + { + "epoch": 0.07507850413930917, + "grad_norm": 1.5273483991622925, + "learning_rate": 4.9307917066165744e-05, + "loss": 5.4053, + "step": 12624 + }, + { + "epoch": 0.07508445142259017, + "grad_norm": 1.7137775421142578, + "learning_rate": 4.9307807916407414e-05, + "loss": 5.0427, + "step": 12625 + }, + { + "epoch": 0.07509039870587116, + "grad_norm": 1.7140679359436035, + "learning_rate": 4.930769875816348e-05, + "loss": 5.0354, + "step": 12626 + }, + { + "epoch": 0.07509634598915216, + "grad_norm": 1.5592498779296875, + "learning_rate": 4.930758959143399e-05, + "loss": 4.9663, + "step": 12627 + }, + { + "epoch": 0.07510229327243315, + "grad_norm": 1.4611366987228394, + "learning_rate": 4.930748041621896e-05, + "loss": 4.9469, + "step": 12628 + }, + { + "epoch": 0.07510824055571415, + "grad_norm": 1.4682248830795288, + "learning_rate": 4.930737123251844e-05, + "loss": 5.0217, + "step": 12629 + }, + { + "epoch": 0.07511418783899515, + "grad_norm": 1.5643991231918335, + "learning_rate": 4.9307262040332474e-05, + "loss": 5.0488, + "step": 12630 + }, + { + "epoch": 0.07512013512227614, + "grad_norm": 1.680577278137207, + "learning_rate": 4.9307152839661094e-05, + "loss": 5.0813, + "step": 12631 + }, + { + "epoch": 0.07512608240555714, + "grad_norm": 1.9138245582580566, + "learning_rate": 4.9307043630504334e-05, + "loss": 5.0965, + "step": 12632 + }, + { + "epoch": 0.07513202968883814, + "grad_norm": 1.7382584810256958, + "learning_rate": 4.9306934412862236e-05, + "loss": 5.3726, + "step": 12633 + }, + { + "epoch": 0.07513797697211913, + "grad_norm": 1.684213638305664, + "learning_rate": 4.930682518673484e-05, + "loss": 5.2511, + "step": 12634 + }, + { + "epoch": 0.07514392425540013, + "grad_norm": 1.6976017951965332, + "learning_rate": 4.9306715952122185e-05, + "loss": 4.9669, + "step": 12635 + }, + { + "epoch": 0.07514987153868113, + "grad_norm": 1.526212453842163, + "learning_rate": 4.930660670902431e-05, + "loss": 4.9405, + "step": 12636 + }, + { + "epoch": 0.07515581882196212, + "grad_norm": 1.6616593599319458, + "learning_rate": 4.930649745744124e-05, + "loss": 5.0266, + "step": 12637 + }, + { + "epoch": 0.07516176610524312, + "grad_norm": 1.7911401987075806, + "learning_rate": 4.930638819737303e-05, + "loss": 4.8774, + "step": 12638 + }, + { + "epoch": 0.07516771338852413, + "grad_norm": 1.3613603115081787, + "learning_rate": 4.93062789288197e-05, + "loss": 5.4048, + "step": 12639 + }, + { + "epoch": 0.07517366067180511, + "grad_norm": 1.5945172309875488, + "learning_rate": 4.930616965178131e-05, + "loss": 5.1918, + "step": 12640 + }, + { + "epoch": 0.07517960795508612, + "grad_norm": 1.816091775894165, + "learning_rate": 4.930606036625789e-05, + "loss": 5.3138, + "step": 12641 + }, + { + "epoch": 0.07518555523836712, + "grad_norm": 1.642877459526062, + "learning_rate": 4.930595107224947e-05, + "loss": 5.2438, + "step": 12642 + }, + { + "epoch": 0.07519150252164811, + "grad_norm": 1.8904980421066284, + "learning_rate": 4.930584176975609e-05, + "loss": 5.1565, + "step": 12643 + }, + { + "epoch": 0.07519744980492911, + "grad_norm": 1.6247447729110718, + "learning_rate": 4.93057324587778e-05, + "loss": 5.1795, + "step": 12644 + }, + { + "epoch": 0.07520339708821011, + "grad_norm": 1.4699510335922241, + "learning_rate": 4.930562313931461e-05, + "loss": 5.3628, + "step": 12645 + }, + { + "epoch": 0.0752093443714911, + "grad_norm": 1.537920355796814, + "learning_rate": 4.93055138113666e-05, + "loss": 5.492, + "step": 12646 + }, + { + "epoch": 0.0752152916547721, + "grad_norm": 1.3268204927444458, + "learning_rate": 4.930540447493378e-05, + "loss": 5.2169, + "step": 12647 + }, + { + "epoch": 0.0752212389380531, + "grad_norm": 1.627005934715271, + "learning_rate": 4.930529513001619e-05, + "loss": 5.9358, + "step": 12648 + }, + { + "epoch": 0.07522718622133409, + "grad_norm": 1.445926547050476, + "learning_rate": 4.930518577661388e-05, + "loss": 5.0762, + "step": 12649 + }, + { + "epoch": 0.0752331335046151, + "grad_norm": 1.5958713293075562, + "learning_rate": 4.930507641472688e-05, + "loss": 5.2345, + "step": 12650 + }, + { + "epoch": 0.0752390807878961, + "grad_norm": 1.470540165901184, + "learning_rate": 4.9304967044355225e-05, + "loss": 5.1259, + "step": 12651 + }, + { + "epoch": 0.07524502807117708, + "grad_norm": 1.4679489135742188, + "learning_rate": 4.930485766549896e-05, + "loss": 5.1456, + "step": 12652 + }, + { + "epoch": 0.07525097535445809, + "grad_norm": 1.3032207489013672, + "learning_rate": 4.930474827815812e-05, + "loss": 5.1479, + "step": 12653 + }, + { + "epoch": 0.07525692263773909, + "grad_norm": 1.4676958322525024, + "learning_rate": 4.930463888233274e-05, + "loss": 5.173, + "step": 12654 + }, + { + "epoch": 0.07526286992102008, + "grad_norm": 1.5788590908050537, + "learning_rate": 4.930452947802286e-05, + "loss": 5.0608, + "step": 12655 + }, + { + "epoch": 0.07526881720430108, + "grad_norm": 1.4392722845077515, + "learning_rate": 4.9304420065228526e-05, + "loss": 5.1209, + "step": 12656 + }, + { + "epoch": 0.07527476448758207, + "grad_norm": 1.4725446701049805, + "learning_rate": 4.930431064394977e-05, + "loss": 5.1249, + "step": 12657 + }, + { + "epoch": 0.07528071177086307, + "grad_norm": 1.4239790439605713, + "learning_rate": 4.930420121418663e-05, + "loss": 5.0262, + "step": 12658 + }, + { + "epoch": 0.07528665905414407, + "grad_norm": 1.3037468194961548, + "learning_rate": 4.930409177593914e-05, + "loss": 5.1158, + "step": 12659 + }, + { + "epoch": 0.07529260633742506, + "grad_norm": 1.430015206336975, + "learning_rate": 4.930398232920734e-05, + "loss": 5.1362, + "step": 12660 + }, + { + "epoch": 0.07529855362070606, + "grad_norm": 1.2381033897399902, + "learning_rate": 4.930387287399127e-05, + "loss": 5.2351, + "step": 12661 + }, + { + "epoch": 0.07530450090398706, + "grad_norm": 1.4459912776947021, + "learning_rate": 4.930376341029098e-05, + "loss": 5.1413, + "step": 12662 + }, + { + "epoch": 0.07531044818726805, + "grad_norm": 1.4875576496124268, + "learning_rate": 4.93036539381065e-05, + "loss": 5.0556, + "step": 12663 + }, + { + "epoch": 0.07531639547054905, + "grad_norm": 1.1632124185562134, + "learning_rate": 4.930354445743785e-05, + "loss": 5.2317, + "step": 12664 + }, + { + "epoch": 0.07532234275383005, + "grad_norm": 1.324722170829773, + "learning_rate": 4.9303434968285096e-05, + "loss": 5.0562, + "step": 12665 + }, + { + "epoch": 0.07532829003711104, + "grad_norm": 1.4292213916778564, + "learning_rate": 4.9303325470648254e-05, + "loss": 5.0991, + "step": 12666 + }, + { + "epoch": 0.07533423732039204, + "grad_norm": 1.4528483152389526, + "learning_rate": 4.930321596452738e-05, + "loss": 5.0675, + "step": 12667 + }, + { + "epoch": 0.07534018460367305, + "grad_norm": 1.5489269495010376, + "learning_rate": 4.9303106449922504e-05, + "loss": 4.9073, + "step": 12668 + }, + { + "epoch": 0.07534613188695403, + "grad_norm": 1.440854787826538, + "learning_rate": 4.9302996926833664e-05, + "loss": 5.0401, + "step": 12669 + }, + { + "epoch": 0.07535207917023504, + "grad_norm": 1.4586740732192993, + "learning_rate": 4.9302887395260894e-05, + "loss": 5.0483, + "step": 12670 + }, + { + "epoch": 0.07535802645351604, + "grad_norm": 1.390376091003418, + "learning_rate": 4.930277785520424e-05, + "loss": 5.1417, + "step": 12671 + }, + { + "epoch": 0.07536397373679703, + "grad_norm": 1.296410083770752, + "learning_rate": 4.9302668306663736e-05, + "loss": 5.461, + "step": 12672 + }, + { + "epoch": 0.07536992102007803, + "grad_norm": 1.5190175771713257, + "learning_rate": 4.930255874963943e-05, + "loss": 5.4972, + "step": 12673 + }, + { + "epoch": 0.07537586830335903, + "grad_norm": 1.4567232131958008, + "learning_rate": 4.930244918413134e-05, + "loss": 5.1921, + "step": 12674 + }, + { + "epoch": 0.07538181558664002, + "grad_norm": 1.7850147485733032, + "learning_rate": 4.930233961013953e-05, + "loss": 5.0658, + "step": 12675 + }, + { + "epoch": 0.07538776286992102, + "grad_norm": 1.5736637115478516, + "learning_rate": 4.930223002766401e-05, + "loss": 5.6874, + "step": 12676 + }, + { + "epoch": 0.07539371015320202, + "grad_norm": 1.5202080011367798, + "learning_rate": 4.9302120436704836e-05, + "loss": 5.7279, + "step": 12677 + }, + { + "epoch": 0.07539965743648301, + "grad_norm": 1.4259493350982666, + "learning_rate": 4.930201083726205e-05, + "loss": 5.5445, + "step": 12678 + }, + { + "epoch": 0.07540560471976401, + "grad_norm": 1.5141973495483398, + "learning_rate": 4.9301901229335674e-05, + "loss": 5.5086, + "step": 12679 + }, + { + "epoch": 0.07541155200304502, + "grad_norm": 1.5044218301773071, + "learning_rate": 4.930179161292576e-05, + "loss": 5.4279, + "step": 12680 + }, + { + "epoch": 0.075417499286326, + "grad_norm": 1.5342620611190796, + "learning_rate": 4.930168198803234e-05, + "loss": 5.0885, + "step": 12681 + }, + { + "epoch": 0.075423446569607, + "grad_norm": 1.8139567375183105, + "learning_rate": 4.930157235465546e-05, + "loss": 5.5586, + "step": 12682 + }, + { + "epoch": 0.07542939385288801, + "grad_norm": 1.606778621673584, + "learning_rate": 4.9301462712795144e-05, + "loss": 5.4007, + "step": 12683 + }, + { + "epoch": 0.075435341136169, + "grad_norm": 1.6451623439788818, + "learning_rate": 4.930135306245144e-05, + "loss": 5.2882, + "step": 12684 + }, + { + "epoch": 0.07544128841945, + "grad_norm": 1.915991187095642, + "learning_rate": 4.9301243403624385e-05, + "loss": 5.0727, + "step": 12685 + }, + { + "epoch": 0.07544723570273099, + "grad_norm": 1.536456823348999, + "learning_rate": 4.930113373631402e-05, + "loss": 5.2154, + "step": 12686 + }, + { + "epoch": 0.07545318298601199, + "grad_norm": 1.5820670127868652, + "learning_rate": 4.9301024060520375e-05, + "loss": 5.0613, + "step": 12687 + }, + { + "epoch": 0.07545913026929299, + "grad_norm": 1.5905929803848267, + "learning_rate": 4.93009143762435e-05, + "loss": 5.08, + "step": 12688 + }, + { + "epoch": 0.07546507755257398, + "grad_norm": 1.5759062767028809, + "learning_rate": 4.9300804683483426e-05, + "loss": 5.0874, + "step": 12689 + }, + { + "epoch": 0.07547102483585498, + "grad_norm": 1.4619840383529663, + "learning_rate": 4.9300694982240186e-05, + "loss": 5.1803, + "step": 12690 + }, + { + "epoch": 0.07547697211913598, + "grad_norm": 1.2742846012115479, + "learning_rate": 4.930058527251383e-05, + "loss": 5.2721, + "step": 12691 + }, + { + "epoch": 0.07548291940241697, + "grad_norm": 1.4095741510391235, + "learning_rate": 4.930047555430439e-05, + "loss": 5.055, + "step": 12692 + }, + { + "epoch": 0.07548886668569797, + "grad_norm": 1.3399991989135742, + "learning_rate": 4.93003658276119e-05, + "loss": 5.0315, + "step": 12693 + }, + { + "epoch": 0.07549481396897897, + "grad_norm": 1.4075208902359009, + "learning_rate": 4.9300256092436407e-05, + "loss": 5.2634, + "step": 12694 + }, + { + "epoch": 0.07550076125225996, + "grad_norm": 1.681321144104004, + "learning_rate": 4.930014634877795e-05, + "loss": 4.9749, + "step": 12695 + }, + { + "epoch": 0.07550670853554096, + "grad_norm": 1.842136263847351, + "learning_rate": 4.9300036596636555e-05, + "loss": 4.797, + "step": 12696 + }, + { + "epoch": 0.07551265581882197, + "grad_norm": 1.8733257055282593, + "learning_rate": 4.929992683601228e-05, + "loss": 5.4726, + "step": 12697 + }, + { + "epoch": 0.07551860310210295, + "grad_norm": 1.747514009475708, + "learning_rate": 4.929981706690514e-05, + "loss": 5.1081, + "step": 12698 + }, + { + "epoch": 0.07552455038538396, + "grad_norm": 1.8107210397720337, + "learning_rate": 4.9299707289315187e-05, + "loss": 4.983, + "step": 12699 + }, + { + "epoch": 0.07553049766866496, + "grad_norm": 1.6319682598114014, + "learning_rate": 4.929959750324246e-05, + "loss": 4.9968, + "step": 12700 + }, + { + "epoch": 0.07553644495194595, + "grad_norm": 1.4653065204620361, + "learning_rate": 4.9299487708687e-05, + "loss": 5.3013, + "step": 12701 + }, + { + "epoch": 0.07554239223522695, + "grad_norm": 1.4665262699127197, + "learning_rate": 4.929937790564883e-05, + "loss": 5.4431, + "step": 12702 + }, + { + "epoch": 0.07554833951850795, + "grad_norm": 1.4962518215179443, + "learning_rate": 4.9299268094127996e-05, + "loss": 5.3692, + "step": 12703 + }, + { + "epoch": 0.07555428680178894, + "grad_norm": 1.7913219928741455, + "learning_rate": 4.929915827412454e-05, + "loss": 5.0082, + "step": 12704 + }, + { + "epoch": 0.07556023408506994, + "grad_norm": 1.5508856773376465, + "learning_rate": 4.929904844563851e-05, + "loss": 5.1501, + "step": 12705 + }, + { + "epoch": 0.07556618136835094, + "grad_norm": 1.5882935523986816, + "learning_rate": 4.929893860866993e-05, + "loss": 4.9579, + "step": 12706 + }, + { + "epoch": 0.07557212865163193, + "grad_norm": 1.4550399780273438, + "learning_rate": 4.9298828763218833e-05, + "loss": 5.0165, + "step": 12707 + }, + { + "epoch": 0.07557807593491293, + "grad_norm": 1.5075403451919556, + "learning_rate": 4.929871890928527e-05, + "loss": 4.933, + "step": 12708 + }, + { + "epoch": 0.07558402321819394, + "grad_norm": 1.7094134092330933, + "learning_rate": 4.929860904686928e-05, + "loss": 4.8842, + "step": 12709 + }, + { + "epoch": 0.07558997050147492, + "grad_norm": 1.5615170001983643, + "learning_rate": 4.929849917597089e-05, + "loss": 5.5301, + "step": 12710 + }, + { + "epoch": 0.07559591778475593, + "grad_norm": 1.6687208414077759, + "learning_rate": 4.929838929659015e-05, + "loss": 4.9325, + "step": 12711 + }, + { + "epoch": 0.07560186506803693, + "grad_norm": 1.3476423025131226, + "learning_rate": 4.9298279408727086e-05, + "loss": 5.1274, + "step": 12712 + }, + { + "epoch": 0.07560781235131792, + "grad_norm": 1.359786868095398, + "learning_rate": 4.929816951238175e-05, + "loss": 4.7549, + "step": 12713 + }, + { + "epoch": 0.07561375963459892, + "grad_norm": 1.305482029914856, + "learning_rate": 4.9298059607554184e-05, + "loss": 4.7371, + "step": 12714 + }, + { + "epoch": 0.0756197069178799, + "grad_norm": 1.408693790435791, + "learning_rate": 4.92979496942444e-05, + "loss": 5.0733, + "step": 12715 + }, + { + "epoch": 0.07562565420116091, + "grad_norm": 1.3604625463485718, + "learning_rate": 4.9297839772452456e-05, + "loss": 4.7947, + "step": 12716 + }, + { + "epoch": 0.07563160148444191, + "grad_norm": 1.4101814031600952, + "learning_rate": 4.929772984217839e-05, + "loss": 5.2003, + "step": 12717 + }, + { + "epoch": 0.0756375487677229, + "grad_norm": 1.4409375190734863, + "learning_rate": 4.929761990342224e-05, + "loss": 5.167, + "step": 12718 + }, + { + "epoch": 0.0756434960510039, + "grad_norm": 1.4309754371643066, + "learning_rate": 4.9297509956184044e-05, + "loss": 5.1499, + "step": 12719 + }, + { + "epoch": 0.0756494433342849, + "grad_norm": 1.6380341053009033, + "learning_rate": 4.929740000046382e-05, + "loss": 4.8282, + "step": 12720 + }, + { + "epoch": 0.07565539061756589, + "grad_norm": 1.6795456409454346, + "learning_rate": 4.929729003626164e-05, + "loss": 4.708, + "step": 12721 + }, + { + "epoch": 0.07566133790084689, + "grad_norm": 1.7367075681686401, + "learning_rate": 4.929718006357753e-05, + "loss": 5.3364, + "step": 12722 + }, + { + "epoch": 0.0756672851841279, + "grad_norm": 1.5842353105545044, + "learning_rate": 4.929707008241152e-05, + "loss": 5.2025, + "step": 12723 + }, + { + "epoch": 0.07567323246740888, + "grad_norm": 1.5129985809326172, + "learning_rate": 4.9296960092763657e-05, + "loss": 5.1788, + "step": 12724 + }, + { + "epoch": 0.07567917975068988, + "grad_norm": 1.4276295900344849, + "learning_rate": 4.929685009463397e-05, + "loss": 5.2597, + "step": 12725 + }, + { + "epoch": 0.07568512703397089, + "grad_norm": 1.499213457107544, + "learning_rate": 4.9296740088022506e-05, + "loss": 5.1778, + "step": 12726 + }, + { + "epoch": 0.07569107431725187, + "grad_norm": 1.4656083583831787, + "learning_rate": 4.92966300729293e-05, + "loss": 5.2689, + "step": 12727 + }, + { + "epoch": 0.07569702160053288, + "grad_norm": 1.6160268783569336, + "learning_rate": 4.9296520049354393e-05, + "loss": 5.1829, + "step": 12728 + }, + { + "epoch": 0.07570296888381388, + "grad_norm": 1.514891266822815, + "learning_rate": 4.929641001729782e-05, + "loss": 5.2586, + "step": 12729 + }, + { + "epoch": 0.07570891616709487, + "grad_norm": 1.4635345935821533, + "learning_rate": 4.929629997675963e-05, + "loss": 5.2159, + "step": 12730 + }, + { + "epoch": 0.07571486345037587, + "grad_norm": 1.704380750656128, + "learning_rate": 4.9296189927739846e-05, + "loss": 5.1068, + "step": 12731 + }, + { + "epoch": 0.07572081073365687, + "grad_norm": 1.5786374807357788, + "learning_rate": 4.929607987023851e-05, + "loss": 5.2306, + "step": 12732 + }, + { + "epoch": 0.07572675801693786, + "grad_norm": 1.5011721849441528, + "learning_rate": 4.929596980425567e-05, + "loss": 5.1594, + "step": 12733 + }, + { + "epoch": 0.07573270530021886, + "grad_norm": 1.4532456398010254, + "learning_rate": 4.9295859729791354e-05, + "loss": 5.0955, + "step": 12734 + }, + { + "epoch": 0.07573865258349986, + "grad_norm": 1.5734699964523315, + "learning_rate": 4.9295749646845604e-05, + "loss": 5.1523, + "step": 12735 + }, + { + "epoch": 0.07574459986678085, + "grad_norm": 1.578141450881958, + "learning_rate": 4.929563955541846e-05, + "loss": 5.0784, + "step": 12736 + }, + { + "epoch": 0.07575054715006185, + "grad_norm": 1.408524513244629, + "learning_rate": 4.929552945550996e-05, + "loss": 5.1411, + "step": 12737 + }, + { + "epoch": 0.07575649443334286, + "grad_norm": 1.4755773544311523, + "learning_rate": 4.929541934712014e-05, + "loss": 5.0666, + "step": 12738 + }, + { + "epoch": 0.07576244171662384, + "grad_norm": 1.5521161556243896, + "learning_rate": 4.929530923024904e-05, + "loss": 5.0938, + "step": 12739 + }, + { + "epoch": 0.07576838899990485, + "grad_norm": 1.4772706031799316, + "learning_rate": 4.929519910489671e-05, + "loss": 5.1178, + "step": 12740 + }, + { + "epoch": 0.07577433628318585, + "grad_norm": 1.2669662237167358, + "learning_rate": 4.9295088971063164e-05, + "loss": 5.2565, + "step": 12741 + }, + { + "epoch": 0.07578028356646684, + "grad_norm": 1.5846413373947144, + "learning_rate": 4.929497882874845e-05, + "loss": 5.2109, + "step": 12742 + }, + { + "epoch": 0.07578623084974784, + "grad_norm": 1.779228687286377, + "learning_rate": 4.929486867795262e-05, + "loss": 5.0196, + "step": 12743 + }, + { + "epoch": 0.07579217813302883, + "grad_norm": 1.6306418180465698, + "learning_rate": 4.92947585186757e-05, + "loss": 5.1982, + "step": 12744 + }, + { + "epoch": 0.07579812541630983, + "grad_norm": 1.5107831954956055, + "learning_rate": 4.9294648350917726e-05, + "loss": 5.0652, + "step": 12745 + }, + { + "epoch": 0.07580407269959083, + "grad_norm": 1.3846759796142578, + "learning_rate": 4.9294538174678744e-05, + "loss": 5.0322, + "step": 12746 + }, + { + "epoch": 0.07581001998287182, + "grad_norm": 1.4558676481246948, + "learning_rate": 4.9294427989958794e-05, + "loss": 4.9626, + "step": 12747 + }, + { + "epoch": 0.07581596726615282, + "grad_norm": 1.3155016899108887, + "learning_rate": 4.92943177967579e-05, + "loss": 4.9965, + "step": 12748 + }, + { + "epoch": 0.07582191454943382, + "grad_norm": 1.3237980604171753, + "learning_rate": 4.9294207595076125e-05, + "loss": 4.9697, + "step": 12749 + }, + { + "epoch": 0.07582786183271481, + "grad_norm": 1.4439423084259033, + "learning_rate": 4.929409738491349e-05, + "loss": 5.0636, + "step": 12750 + }, + { + "epoch": 0.07583380911599581, + "grad_norm": 1.4793460369110107, + "learning_rate": 4.9293987166270024e-05, + "loss": 5.1122, + "step": 12751 + }, + { + "epoch": 0.07583975639927681, + "grad_norm": 1.5353471040725708, + "learning_rate": 4.929387693914578e-05, + "loss": 5.174, + "step": 12752 + }, + { + "epoch": 0.0758457036825578, + "grad_norm": 1.690537452697754, + "learning_rate": 4.929376670354081e-05, + "loss": 5.1515, + "step": 12753 + }, + { + "epoch": 0.0758516509658388, + "grad_norm": 1.4602952003479004, + "learning_rate": 4.9293656459455124e-05, + "loss": 5.1244, + "step": 12754 + }, + { + "epoch": 0.0758575982491198, + "grad_norm": 1.5871785879135132, + "learning_rate": 4.929354620688878e-05, + "loss": 5.2856, + "step": 12755 + }, + { + "epoch": 0.0758635455324008, + "grad_norm": 1.588065505027771, + "learning_rate": 4.92934359458418e-05, + "loss": 5.3694, + "step": 12756 + }, + { + "epoch": 0.0758694928156818, + "grad_norm": 1.5489270687103271, + "learning_rate": 4.929332567631424e-05, + "loss": 5.3546, + "step": 12757 + }, + { + "epoch": 0.0758754400989628, + "grad_norm": 1.493815541267395, + "learning_rate": 4.9293215398306136e-05, + "loss": 5.0878, + "step": 12758 + }, + { + "epoch": 0.07588138738224379, + "grad_norm": 1.3329546451568604, + "learning_rate": 4.929310511181751e-05, + "loss": 5.2171, + "step": 12759 + }, + { + "epoch": 0.07588733466552479, + "grad_norm": 1.5299288034439087, + "learning_rate": 4.929299481684842e-05, + "loss": 5.1695, + "step": 12760 + }, + { + "epoch": 0.07589328194880579, + "grad_norm": 1.5130664110183716, + "learning_rate": 4.9292884513398894e-05, + "loss": 5.3169, + "step": 12761 + }, + { + "epoch": 0.07589922923208678, + "grad_norm": 1.420339584350586, + "learning_rate": 4.9292774201468974e-05, + "loss": 5.1995, + "step": 12762 + }, + { + "epoch": 0.07590517651536778, + "grad_norm": 1.4740930795669556, + "learning_rate": 4.9292663881058696e-05, + "loss": 5.3321, + "step": 12763 + }, + { + "epoch": 0.07591112379864878, + "grad_norm": 1.448968768119812, + "learning_rate": 4.92925535521681e-05, + "loss": 5.1292, + "step": 12764 + }, + { + "epoch": 0.07591707108192977, + "grad_norm": 1.3219209909439087, + "learning_rate": 4.929244321479722e-05, + "loss": 5.1873, + "step": 12765 + }, + { + "epoch": 0.07592301836521077, + "grad_norm": 1.3336325883865356, + "learning_rate": 4.929233286894611e-05, + "loss": 5.248, + "step": 12766 + }, + { + "epoch": 0.07592896564849178, + "grad_norm": 1.4230278730392456, + "learning_rate": 4.9292222514614795e-05, + "loss": 5.2072, + "step": 12767 + }, + { + "epoch": 0.07593491293177276, + "grad_norm": 1.4522627592086792, + "learning_rate": 4.929211215180331e-05, + "loss": 5.4323, + "step": 12768 + }, + { + "epoch": 0.07594086021505377, + "grad_norm": 1.4863537549972534, + "learning_rate": 4.929200178051171e-05, + "loss": 5.241, + "step": 12769 + }, + { + "epoch": 0.07594680749833477, + "grad_norm": 1.7619402408599854, + "learning_rate": 4.929189140074001e-05, + "loss": 5.4853, + "step": 12770 + }, + { + "epoch": 0.07595275478161576, + "grad_norm": 1.6116011142730713, + "learning_rate": 4.929178101248827e-05, + "loss": 5.4793, + "step": 12771 + }, + { + "epoch": 0.07595870206489676, + "grad_norm": 1.8669662475585938, + "learning_rate": 4.9291670615756516e-05, + "loss": 5.4062, + "step": 12772 + }, + { + "epoch": 0.07596464934817775, + "grad_norm": 1.6439383029937744, + "learning_rate": 4.9291560210544796e-05, + "loss": 5.148, + "step": 12773 + }, + { + "epoch": 0.07597059663145875, + "grad_norm": 1.4800657033920288, + "learning_rate": 4.929144979685314e-05, + "loss": 5.3895, + "step": 12774 + }, + { + "epoch": 0.07597654391473975, + "grad_norm": 1.4091606140136719, + "learning_rate": 4.929133937468159e-05, + "loss": 5.3307, + "step": 12775 + }, + { + "epoch": 0.07598249119802074, + "grad_norm": 1.3786438703536987, + "learning_rate": 4.9291228944030176e-05, + "loss": 5.0786, + "step": 12776 + }, + { + "epoch": 0.07598843848130174, + "grad_norm": 1.6039817333221436, + "learning_rate": 4.929111850489896e-05, + "loss": 5.0606, + "step": 12777 + }, + { + "epoch": 0.07599438576458274, + "grad_norm": 1.5277283191680908, + "learning_rate": 4.929100805728796e-05, + "loss": 5.1949, + "step": 12778 + }, + { + "epoch": 0.07600033304786373, + "grad_norm": 1.6756436824798584, + "learning_rate": 4.929089760119722e-05, + "loss": 5.125, + "step": 12779 + }, + { + "epoch": 0.07600628033114473, + "grad_norm": 1.7082979679107666, + "learning_rate": 4.929078713662677e-05, + "loss": 5.1984, + "step": 12780 + }, + { + "epoch": 0.07601222761442573, + "grad_norm": 1.607293963432312, + "learning_rate": 4.929067666357666e-05, + "loss": 5.1809, + "step": 12781 + }, + { + "epoch": 0.07601817489770672, + "grad_norm": 1.5133613348007202, + "learning_rate": 4.9290566182046936e-05, + "loss": 5.2602, + "step": 12782 + }, + { + "epoch": 0.07602412218098772, + "grad_norm": 1.6572481393814087, + "learning_rate": 4.9290455692037616e-05, + "loss": 5.0959, + "step": 12783 + }, + { + "epoch": 0.07603006946426873, + "grad_norm": 1.6593372821807861, + "learning_rate": 4.929034519354876e-05, + "loss": 5.1672, + "step": 12784 + }, + { + "epoch": 0.07603601674754971, + "grad_norm": 1.4214340448379517, + "learning_rate": 4.929023468658038e-05, + "loss": 5.1064, + "step": 12785 + }, + { + "epoch": 0.07604196403083072, + "grad_norm": 1.4875116348266602, + "learning_rate": 4.929012417113255e-05, + "loss": 5.0657, + "step": 12786 + }, + { + "epoch": 0.07604791131411172, + "grad_norm": 1.7354154586791992, + "learning_rate": 4.929001364720527e-05, + "loss": 5.0415, + "step": 12787 + }, + { + "epoch": 0.0760538585973927, + "grad_norm": 1.5597622394561768, + "learning_rate": 4.928990311479861e-05, + "loss": 5.1404, + "step": 12788 + }, + { + "epoch": 0.07605980588067371, + "grad_norm": 1.6819382905960083, + "learning_rate": 4.928979257391258e-05, + "loss": 4.9487, + "step": 12789 + }, + { + "epoch": 0.07606575316395471, + "grad_norm": 1.4722174406051636, + "learning_rate": 4.928968202454725e-05, + "loss": 5.1677, + "step": 12790 + }, + { + "epoch": 0.0760717004472357, + "grad_norm": 1.5145434141159058, + "learning_rate": 4.9289571466702635e-05, + "loss": 5.2197, + "step": 12791 + }, + { + "epoch": 0.0760776477305167, + "grad_norm": 1.6052699089050293, + "learning_rate": 4.9289460900378784e-05, + "loss": 5.2508, + "step": 12792 + }, + { + "epoch": 0.0760835950137977, + "grad_norm": 1.3738253116607666, + "learning_rate": 4.9289350325575734e-05, + "loss": 5.1253, + "step": 12793 + }, + { + "epoch": 0.07608954229707869, + "grad_norm": 1.2580832242965698, + "learning_rate": 4.9289239742293524e-05, + "loss": 5.2497, + "step": 12794 + }, + { + "epoch": 0.0760954895803597, + "grad_norm": 1.6756019592285156, + "learning_rate": 4.928912915053219e-05, + "loss": 5.2471, + "step": 12795 + }, + { + "epoch": 0.0761014368636407, + "grad_norm": 1.6785964965820312, + "learning_rate": 4.928901855029177e-05, + "loss": 4.9893, + "step": 12796 + }, + { + "epoch": 0.07610738414692168, + "grad_norm": 1.6926941871643066, + "learning_rate": 4.92889079415723e-05, + "loss": 5.1558, + "step": 12797 + }, + { + "epoch": 0.07611333143020269, + "grad_norm": 1.4381680488586426, + "learning_rate": 4.9288797324373835e-05, + "loss": 4.9754, + "step": 12798 + }, + { + "epoch": 0.07611927871348369, + "grad_norm": 1.4430698156356812, + "learning_rate": 4.9288686698696393e-05, + "loss": 5.0197, + "step": 12799 + }, + { + "epoch": 0.07612522599676468, + "grad_norm": 1.4745796918869019, + "learning_rate": 4.928857606454002e-05, + "loss": 4.8857, + "step": 12800 + }, + { + "epoch": 0.07613117328004568, + "grad_norm": 1.5430330038070679, + "learning_rate": 4.928846542190477e-05, + "loss": 5.0407, + "step": 12801 + }, + { + "epoch": 0.07613712056332667, + "grad_norm": 1.6061021089553833, + "learning_rate": 4.928835477079066e-05, + "loss": 5.068, + "step": 12802 + }, + { + "epoch": 0.07614306784660767, + "grad_norm": 1.699568510055542, + "learning_rate": 4.9288244111197734e-05, + "loss": 4.9067, + "step": 12803 + }, + { + "epoch": 0.07614901512988867, + "grad_norm": 1.4770212173461914, + "learning_rate": 4.928813344312603e-05, + "loss": 5.0807, + "step": 12804 + }, + { + "epoch": 0.07615496241316966, + "grad_norm": 1.4657871723175049, + "learning_rate": 4.928802276657559e-05, + "loss": 5.1982, + "step": 12805 + }, + { + "epoch": 0.07616090969645066, + "grad_norm": 1.7897653579711914, + "learning_rate": 4.928791208154646e-05, + "loss": 5.1154, + "step": 12806 + }, + { + "epoch": 0.07616685697973166, + "grad_norm": 1.6905261278152466, + "learning_rate": 4.928780138803866e-05, + "loss": 5.3129, + "step": 12807 + }, + { + "epoch": 0.07617280426301265, + "grad_norm": 1.4763284921646118, + "learning_rate": 4.928769068605225e-05, + "loss": 5.2104, + "step": 12808 + }, + { + "epoch": 0.07617875154629365, + "grad_norm": 1.38632333278656, + "learning_rate": 4.928757997558725e-05, + "loss": 5.0857, + "step": 12809 + }, + { + "epoch": 0.07618469882957465, + "grad_norm": 1.5099103450775146, + "learning_rate": 4.928746925664371e-05, + "loss": 5.1264, + "step": 12810 + }, + { + "epoch": 0.07619064611285564, + "grad_norm": 1.285243272781372, + "learning_rate": 4.928735852922167e-05, + "loss": 5.1177, + "step": 12811 + }, + { + "epoch": 0.07619659339613664, + "grad_norm": 1.2749274969100952, + "learning_rate": 4.928724779332116e-05, + "loss": 5.0831, + "step": 12812 + }, + { + "epoch": 0.07620254067941765, + "grad_norm": 2.413712978363037, + "learning_rate": 4.928713704894222e-05, + "loss": 5.2416, + "step": 12813 + }, + { + "epoch": 0.07620848796269863, + "grad_norm": 1.602721929550171, + "learning_rate": 4.9287026296084895e-05, + "loss": 4.9799, + "step": 12814 + }, + { + "epoch": 0.07621443524597964, + "grad_norm": 1.515821099281311, + "learning_rate": 4.928691553474921e-05, + "loss": 5.034, + "step": 12815 + }, + { + "epoch": 0.07622038252926064, + "grad_norm": 1.3245290517807007, + "learning_rate": 4.928680476493523e-05, + "loss": 4.9559, + "step": 12816 + }, + { + "epoch": 0.07622632981254163, + "grad_norm": 1.5383784770965576, + "learning_rate": 4.928669398664297e-05, + "loss": 4.9085, + "step": 12817 + }, + { + "epoch": 0.07623227709582263, + "grad_norm": 1.4406317472457886, + "learning_rate": 4.928658319987247e-05, + "loss": 5.0073, + "step": 12818 + }, + { + "epoch": 0.07623822437910363, + "grad_norm": 1.6843304634094238, + "learning_rate": 4.928647240462378e-05, + "loss": 5.0262, + "step": 12819 + }, + { + "epoch": 0.07624417166238462, + "grad_norm": 1.655497431755066, + "learning_rate": 4.928636160089693e-05, + "loss": 5.0633, + "step": 12820 + }, + { + "epoch": 0.07625011894566562, + "grad_norm": 1.4143035411834717, + "learning_rate": 4.9286250788691973e-05, + "loss": 5.1131, + "step": 12821 + }, + { + "epoch": 0.07625606622894662, + "grad_norm": 1.5316637754440308, + "learning_rate": 4.9286139968008926e-05, + "loss": 5.2727, + "step": 12822 + }, + { + "epoch": 0.07626201351222761, + "grad_norm": 1.6708348989486694, + "learning_rate": 4.9286029138847844e-05, + "loss": 5.1469, + "step": 12823 + }, + { + "epoch": 0.07626796079550861, + "grad_norm": 1.48544180393219, + "learning_rate": 4.928591830120876e-05, + "loss": 5.0916, + "step": 12824 + }, + { + "epoch": 0.07627390807878962, + "grad_norm": 1.3884835243225098, + "learning_rate": 4.9285807455091715e-05, + "loss": 5.1451, + "step": 12825 + }, + { + "epoch": 0.0762798553620706, + "grad_norm": 1.7265839576721191, + "learning_rate": 4.928569660049674e-05, + "loss": 5.0478, + "step": 12826 + }, + { + "epoch": 0.0762858026453516, + "grad_norm": 1.678852915763855, + "learning_rate": 4.9285585737423875e-05, + "loss": 5.2127, + "step": 12827 + }, + { + "epoch": 0.07629174992863261, + "grad_norm": 1.4907126426696777, + "learning_rate": 4.928547486587317e-05, + "loss": 4.9706, + "step": 12828 + }, + { + "epoch": 0.0762976972119136, + "grad_norm": 1.610822319984436, + "learning_rate": 4.928536398584466e-05, + "loss": 5.2416, + "step": 12829 + }, + { + "epoch": 0.0763036444951946, + "grad_norm": 1.5226528644561768, + "learning_rate": 4.9285253097338375e-05, + "loss": 5.2665, + "step": 12830 + }, + { + "epoch": 0.07630959177847559, + "grad_norm": 1.6021392345428467, + "learning_rate": 4.928514220035436e-05, + "loss": 5.2129, + "step": 12831 + }, + { + "epoch": 0.07631553906175659, + "grad_norm": 1.4113723039627075, + "learning_rate": 4.928503129489265e-05, + "loss": 5.3568, + "step": 12832 + }, + { + "epoch": 0.07632148634503759, + "grad_norm": 1.7851402759552002, + "learning_rate": 4.928492038095329e-05, + "loss": 5.2028, + "step": 12833 + }, + { + "epoch": 0.07632743362831858, + "grad_norm": 2.0881283283233643, + "learning_rate": 4.928480945853631e-05, + "loss": 5.2721, + "step": 12834 + }, + { + "epoch": 0.07633338091159958, + "grad_norm": 1.376695156097412, + "learning_rate": 4.928469852764176e-05, + "loss": 5.0203, + "step": 12835 + }, + { + "epoch": 0.07633932819488058, + "grad_norm": 1.585046648979187, + "learning_rate": 4.928458758826967e-05, + "loss": 5.4281, + "step": 12836 + }, + { + "epoch": 0.07634527547816157, + "grad_norm": 1.7124192714691162, + "learning_rate": 4.928447664042008e-05, + "loss": 5.4921, + "step": 12837 + }, + { + "epoch": 0.07635122276144257, + "grad_norm": 1.5693449974060059, + "learning_rate": 4.928436568409304e-05, + "loss": 5.5729, + "step": 12838 + }, + { + "epoch": 0.07635717004472357, + "grad_norm": 2.072880506515503, + "learning_rate": 4.928425471928857e-05, + "loss": 5.1023, + "step": 12839 + }, + { + "epoch": 0.07636311732800456, + "grad_norm": 1.674325704574585, + "learning_rate": 4.928414374600672e-05, + "loss": 5.5319, + "step": 12840 + }, + { + "epoch": 0.07636906461128556, + "grad_norm": 1.3941127061843872, + "learning_rate": 4.9284032764247523e-05, + "loss": 5.4425, + "step": 12841 + }, + { + "epoch": 0.07637501189456657, + "grad_norm": 1.670743703842163, + "learning_rate": 4.9283921774011025e-05, + "loss": 5.2595, + "step": 12842 + }, + { + "epoch": 0.07638095917784755, + "grad_norm": 2.852534294128418, + "learning_rate": 4.928381077529726e-05, + "loss": 5.321, + "step": 12843 + }, + { + "epoch": 0.07638690646112856, + "grad_norm": 1.930977463722229, + "learning_rate": 4.928369976810626e-05, + "loss": 5.2649, + "step": 12844 + }, + { + "epoch": 0.07639285374440956, + "grad_norm": 1.8886314630508423, + "learning_rate": 4.928358875243808e-05, + "loss": 5.1882, + "step": 12845 + }, + { + "epoch": 0.07639880102769055, + "grad_norm": 1.793514609336853, + "learning_rate": 4.9283477728292745e-05, + "loss": 5.0946, + "step": 12846 + }, + { + "epoch": 0.07640474831097155, + "grad_norm": 1.8616431951522827, + "learning_rate": 4.9283366695670304e-05, + "loss": 5.1097, + "step": 12847 + }, + { + "epoch": 0.07641069559425255, + "grad_norm": 1.9281915426254272, + "learning_rate": 4.9283255654570785e-05, + "loss": 5.0054, + "step": 12848 + }, + { + "epoch": 0.07641664287753354, + "grad_norm": 2.036522150039673, + "learning_rate": 4.9283144604994234e-05, + "loss": 4.9115, + "step": 12849 + }, + { + "epoch": 0.07642259016081454, + "grad_norm": 1.7962864637374878, + "learning_rate": 4.928303354694069e-05, + "loss": 4.8951, + "step": 12850 + }, + { + "epoch": 0.07642853744409554, + "grad_norm": 2.1671249866485596, + "learning_rate": 4.9282922480410195e-05, + "loss": 5.1393, + "step": 12851 + }, + { + "epoch": 0.07643448472737653, + "grad_norm": 1.9870150089263916, + "learning_rate": 4.9282811405402774e-05, + "loss": 5.5572, + "step": 12852 + }, + { + "epoch": 0.07644043201065753, + "grad_norm": 2.1498360633850098, + "learning_rate": 4.928270032191847e-05, + "loss": 5.7031, + "step": 12853 + }, + { + "epoch": 0.07644637929393854, + "grad_norm": 2.06821870803833, + "learning_rate": 4.928258922995734e-05, + "loss": 5.723, + "step": 12854 + }, + { + "epoch": 0.07645232657721952, + "grad_norm": 2.283720016479492, + "learning_rate": 4.92824781295194e-05, + "loss": 5.2129, + "step": 12855 + }, + { + "epoch": 0.07645827386050053, + "grad_norm": 2.1862099170684814, + "learning_rate": 4.9282367020604704e-05, + "loss": 4.7535, + "step": 12856 + }, + { + "epoch": 0.07646422114378153, + "grad_norm": 1.7297099828720093, + "learning_rate": 4.928225590321328e-05, + "loss": 5.1965, + "step": 12857 + }, + { + "epoch": 0.07647016842706252, + "grad_norm": 2.0406720638275146, + "learning_rate": 4.9282144777345176e-05, + "loss": 5.289, + "step": 12858 + }, + { + "epoch": 0.07647611571034352, + "grad_norm": 1.8368127346038818, + "learning_rate": 4.928203364300042e-05, + "loss": 5.5448, + "step": 12859 + }, + { + "epoch": 0.0764820629936245, + "grad_norm": 1.837804913520813, + "learning_rate": 4.9281922500179054e-05, + "loss": 5.5284, + "step": 12860 + }, + { + "epoch": 0.07648801027690551, + "grad_norm": 1.7191063165664673, + "learning_rate": 4.928181134888113e-05, + "loss": 5.8212, + "step": 12861 + }, + { + "epoch": 0.07649395756018651, + "grad_norm": 1.757323980331421, + "learning_rate": 4.928170018910667e-05, + "loss": 5.8421, + "step": 12862 + }, + { + "epoch": 0.0764999048434675, + "grad_norm": 1.9213273525238037, + "learning_rate": 4.928158902085572e-05, + "loss": 5.1923, + "step": 12863 + }, + { + "epoch": 0.0765058521267485, + "grad_norm": 1.888006567955017, + "learning_rate": 4.928147784412832e-05, + "loss": 5.4282, + "step": 12864 + }, + { + "epoch": 0.0765117994100295, + "grad_norm": 1.555870771408081, + "learning_rate": 4.9281366658924506e-05, + "loss": 5.8256, + "step": 12865 + }, + { + "epoch": 0.07651774669331049, + "grad_norm": 1.8194485902786255, + "learning_rate": 4.9281255465244314e-05, + "loss": 5.5886, + "step": 12866 + }, + { + "epoch": 0.07652369397659149, + "grad_norm": 1.7867372035980225, + "learning_rate": 4.9281144263087795e-05, + "loss": 5.4818, + "step": 12867 + }, + { + "epoch": 0.0765296412598725, + "grad_norm": 1.8511155843734741, + "learning_rate": 4.928103305245497e-05, + "loss": 5.519, + "step": 12868 + }, + { + "epoch": 0.07653558854315348, + "grad_norm": 2.728428602218628, + "learning_rate": 4.928092183334589e-05, + "loss": 5.0085, + "step": 12869 + }, + { + "epoch": 0.07654153582643448, + "grad_norm": 2.5393402576446533, + "learning_rate": 4.92808106057606e-05, + "loss": 5.0862, + "step": 12870 + }, + { + "epoch": 0.07654748310971549, + "grad_norm": 2.494248151779175, + "learning_rate": 4.928069936969912e-05, + "loss": 5.5557, + "step": 12871 + }, + { + "epoch": 0.07655343039299647, + "grad_norm": 2.4287991523742676, + "learning_rate": 4.9280588125161496e-05, + "loss": 5.6646, + "step": 12872 + }, + { + "epoch": 0.07655937767627748, + "grad_norm": 2.188556432723999, + "learning_rate": 4.928047687214778e-05, + "loss": 5.6618, + "step": 12873 + }, + { + "epoch": 0.07656532495955848, + "grad_norm": 2.7367382049560547, + "learning_rate": 4.9280365610657996e-05, + "loss": 4.6788, + "step": 12874 + }, + { + "epoch": 0.07657127224283947, + "grad_norm": 2.492922067642212, + "learning_rate": 4.9280254340692187e-05, + "loss": 4.4132, + "step": 12875 + }, + { + "epoch": 0.07657721952612047, + "grad_norm": 2.361133575439453, + "learning_rate": 4.928014306225039e-05, + "loss": 4.3957, + "step": 12876 + }, + { + "epoch": 0.07658316680940147, + "grad_norm": 2.652127742767334, + "learning_rate": 4.9280031775332646e-05, + "loss": 4.4568, + "step": 12877 + }, + { + "epoch": 0.07658911409268246, + "grad_norm": 2.40895938873291, + "learning_rate": 4.9279920479938995e-05, + "loss": 4.6276, + "step": 12878 + }, + { + "epoch": 0.07659506137596346, + "grad_norm": 1.9418548345565796, + "learning_rate": 4.927980917606948e-05, + "loss": 5.6008, + "step": 12879 + }, + { + "epoch": 0.07660100865924446, + "grad_norm": 1.7706143856048584, + "learning_rate": 4.9279697863724125e-05, + "loss": 5.4946, + "step": 12880 + }, + { + "epoch": 0.07660695594252545, + "grad_norm": 2.856342077255249, + "learning_rate": 4.9279586542902986e-05, + "loss": 4.9182, + "step": 12881 + }, + { + "epoch": 0.07661290322580645, + "grad_norm": 2.713515043258667, + "learning_rate": 4.927947521360608e-05, + "loss": 5.2341, + "step": 12882 + }, + { + "epoch": 0.07661885050908745, + "grad_norm": 2.186169147491455, + "learning_rate": 4.927936387583348e-05, + "loss": 5.1348, + "step": 12883 + }, + { + "epoch": 0.07662479779236844, + "grad_norm": 2.3114492893218994, + "learning_rate": 4.9279252529585195e-05, + "loss": 5.0016, + "step": 12884 + }, + { + "epoch": 0.07663074507564945, + "grad_norm": 2.256502866744995, + "learning_rate": 4.927914117486128e-05, + "loss": 5.1759, + "step": 12885 + }, + { + "epoch": 0.07663669235893045, + "grad_norm": 2.281243324279785, + "learning_rate": 4.927902981166176e-05, + "loss": 5.1437, + "step": 12886 + }, + { + "epoch": 0.07664263964221144, + "grad_norm": 2.3553836345672607, + "learning_rate": 4.927891843998668e-05, + "loss": 5.1622, + "step": 12887 + }, + { + "epoch": 0.07664858692549244, + "grad_norm": 2.420192003250122, + "learning_rate": 4.927880705983609e-05, + "loss": 4.994, + "step": 12888 + }, + { + "epoch": 0.07665453420877343, + "grad_norm": 2.3391306400299072, + "learning_rate": 4.927869567121001e-05, + "loss": 4.9445, + "step": 12889 + }, + { + "epoch": 0.07666048149205443, + "grad_norm": 2.2093355655670166, + "learning_rate": 4.9278584274108484e-05, + "loss": 5.05, + "step": 12890 + }, + { + "epoch": 0.07666642877533543, + "grad_norm": 2.3378305435180664, + "learning_rate": 4.927847286853157e-05, + "loss": 4.8694, + "step": 12891 + }, + { + "epoch": 0.07667237605861642, + "grad_norm": 2.2110583782196045, + "learning_rate": 4.927836145447928e-05, + "loss": 4.8622, + "step": 12892 + }, + { + "epoch": 0.07667832334189742, + "grad_norm": 2.2865991592407227, + "learning_rate": 4.927825003195167e-05, + "loss": 4.9485, + "step": 12893 + }, + { + "epoch": 0.07668427062517842, + "grad_norm": 2.343135356903076, + "learning_rate": 4.927813860094878e-05, + "loss": 4.8874, + "step": 12894 + }, + { + "epoch": 0.07669021790845941, + "grad_norm": 2.1939613819122314, + "learning_rate": 4.927802716147063e-05, + "loss": 4.8349, + "step": 12895 + }, + { + "epoch": 0.07669616519174041, + "grad_norm": 2.866560697555542, + "learning_rate": 4.927791571351728e-05, + "loss": 5.1409, + "step": 12896 + }, + { + "epoch": 0.07670211247502141, + "grad_norm": 2.1052801609039307, + "learning_rate": 4.927780425708876e-05, + "loss": 5.3716, + "step": 12897 + }, + { + "epoch": 0.0767080597583024, + "grad_norm": 2.141184091567993, + "learning_rate": 4.9277692792185106e-05, + "loss": 5.2985, + "step": 12898 + }, + { + "epoch": 0.0767140070415834, + "grad_norm": 1.93148934841156, + "learning_rate": 4.927758131880636e-05, + "loss": 5.6222, + "step": 12899 + }, + { + "epoch": 0.0767199543248644, + "grad_norm": 1.8454651832580566, + "learning_rate": 4.927746983695256e-05, + "loss": 5.6966, + "step": 12900 + }, + { + "epoch": 0.0767259016081454, + "grad_norm": 1.764281153678894, + "learning_rate": 4.9277358346623746e-05, + "loss": 5.4979, + "step": 12901 + }, + { + "epoch": 0.0767318488914264, + "grad_norm": 1.6969131231307983, + "learning_rate": 4.9277246847819965e-05, + "loss": 5.5221, + "step": 12902 + }, + { + "epoch": 0.0767377961747074, + "grad_norm": 1.7118967771530151, + "learning_rate": 4.927713534054124e-05, + "loss": 5.6067, + "step": 12903 + }, + { + "epoch": 0.07674374345798839, + "grad_norm": 2.1508536338806152, + "learning_rate": 4.9277023824787625e-05, + "loss": 5.8241, + "step": 12904 + }, + { + "epoch": 0.07674969074126939, + "grad_norm": 1.8613126277923584, + "learning_rate": 4.927691230055914e-05, + "loss": 5.7141, + "step": 12905 + }, + { + "epoch": 0.07675563802455039, + "grad_norm": 1.8942763805389404, + "learning_rate": 4.927680076785585e-05, + "loss": 5.6909, + "step": 12906 + }, + { + "epoch": 0.07676158530783138, + "grad_norm": 1.8824634552001953, + "learning_rate": 4.927668922667777e-05, + "loss": 5.5055, + "step": 12907 + }, + { + "epoch": 0.07676753259111238, + "grad_norm": 1.8920915126800537, + "learning_rate": 4.927657767702495e-05, + "loss": 5.1783, + "step": 12908 + }, + { + "epoch": 0.07677347987439338, + "grad_norm": 1.8226712942123413, + "learning_rate": 4.927646611889743e-05, + "loss": 5.7529, + "step": 12909 + }, + { + "epoch": 0.07677942715767437, + "grad_norm": 1.88478684425354, + "learning_rate": 4.9276354552295245e-05, + "loss": 5.7034, + "step": 12910 + }, + { + "epoch": 0.07678537444095537, + "grad_norm": 1.6312634944915771, + "learning_rate": 4.927624297721844e-05, + "loss": 5.6476, + "step": 12911 + }, + { + "epoch": 0.07679132172423637, + "grad_norm": 1.5183994770050049, + "learning_rate": 4.927613139366704e-05, + "loss": 5.8517, + "step": 12912 + }, + { + "epoch": 0.07679726900751736, + "grad_norm": 1.6718844175338745, + "learning_rate": 4.92760198016411e-05, + "loss": 5.9619, + "step": 12913 + }, + { + "epoch": 0.07680321629079837, + "grad_norm": 2.575932741165161, + "learning_rate": 4.9275908201140654e-05, + "loss": 5.6903, + "step": 12914 + }, + { + "epoch": 0.07680916357407937, + "grad_norm": 2.2863197326660156, + "learning_rate": 4.927579659216574e-05, + "loss": 5.7517, + "step": 12915 + }, + { + "epoch": 0.07681511085736036, + "grad_norm": 2.231417417526245, + "learning_rate": 4.9275684974716384e-05, + "loss": 5.2323, + "step": 12916 + }, + { + "epoch": 0.07682105814064136, + "grad_norm": 1.9159691333770752, + "learning_rate": 4.927557334879265e-05, + "loss": 5.2548, + "step": 12917 + }, + { + "epoch": 0.07682700542392235, + "grad_norm": 1.6682984828948975, + "learning_rate": 4.927546171439455e-05, + "loss": 5.4639, + "step": 12918 + }, + { + "epoch": 0.07683295270720335, + "grad_norm": 2.1923654079437256, + "learning_rate": 4.927535007152215e-05, + "loss": 5.6016, + "step": 12919 + }, + { + "epoch": 0.07683889999048435, + "grad_norm": 2.2393245697021484, + "learning_rate": 4.9275238420175474e-05, + "loss": 5.9433, + "step": 12920 + }, + { + "epoch": 0.07684484727376534, + "grad_norm": 1.8611164093017578, + "learning_rate": 4.9275126760354565e-05, + "loss": 5.3477, + "step": 12921 + }, + { + "epoch": 0.07685079455704634, + "grad_norm": 1.902567982673645, + "learning_rate": 4.927501509205945e-05, + "loss": 5.4417, + "step": 12922 + }, + { + "epoch": 0.07685674184032734, + "grad_norm": 1.7735011577606201, + "learning_rate": 4.9274903415290184e-05, + "loss": 5.652, + "step": 12923 + }, + { + "epoch": 0.07686268912360833, + "grad_norm": 1.886060357093811, + "learning_rate": 4.927479173004681e-05, + "loss": 5.5927, + "step": 12924 + }, + { + "epoch": 0.07686863640688933, + "grad_norm": 1.8315941095352173, + "learning_rate": 4.927468003632935e-05, + "loss": 5.6559, + "step": 12925 + }, + { + "epoch": 0.07687458369017033, + "grad_norm": 1.7790045738220215, + "learning_rate": 4.927456833413784e-05, + "loss": 5.463, + "step": 12926 + }, + { + "epoch": 0.07688053097345132, + "grad_norm": 1.9559917449951172, + "learning_rate": 4.927445662347234e-05, + "loss": 5.6154, + "step": 12927 + }, + { + "epoch": 0.07688647825673232, + "grad_norm": 1.7274752855300903, + "learning_rate": 4.927434490433287e-05, + "loss": 5.5621, + "step": 12928 + }, + { + "epoch": 0.07689242554001333, + "grad_norm": 1.594190001487732, + "learning_rate": 4.9274233176719486e-05, + "loss": 5.4674, + "step": 12929 + }, + { + "epoch": 0.07689837282329431, + "grad_norm": 1.79281485080719, + "learning_rate": 4.927412144063222e-05, + "loss": 5.5166, + "step": 12930 + }, + { + "epoch": 0.07690432010657532, + "grad_norm": 1.6584967374801636, + "learning_rate": 4.92740096960711e-05, + "loss": 5.4249, + "step": 12931 + }, + { + "epoch": 0.07691026738985632, + "grad_norm": 1.8458021879196167, + "learning_rate": 4.927389794303617e-05, + "loss": 5.6073, + "step": 12932 + }, + { + "epoch": 0.0769162146731373, + "grad_norm": 1.5526570081710815, + "learning_rate": 4.927378618152748e-05, + "loss": 5.3992, + "step": 12933 + }, + { + "epoch": 0.07692216195641831, + "grad_norm": 1.6043710708618164, + "learning_rate": 4.927367441154507e-05, + "loss": 5.3786, + "step": 12934 + }, + { + "epoch": 0.07692810923969931, + "grad_norm": 1.6580268144607544, + "learning_rate": 4.927356263308896e-05, + "loss": 5.5177, + "step": 12935 + }, + { + "epoch": 0.0769340565229803, + "grad_norm": 1.7199897766113281, + "learning_rate": 4.9273450846159194e-05, + "loss": 5.4281, + "step": 12936 + }, + { + "epoch": 0.0769400038062613, + "grad_norm": 1.6920559406280518, + "learning_rate": 4.9273339050755835e-05, + "loss": 5.562, + "step": 12937 + }, + { + "epoch": 0.0769459510895423, + "grad_norm": 1.8027700185775757, + "learning_rate": 4.9273227246878894e-05, + "loss": 5.5473, + "step": 12938 + }, + { + "epoch": 0.07695189837282329, + "grad_norm": 1.6055867671966553, + "learning_rate": 4.927311543452842e-05, + "loss": 5.4903, + "step": 12939 + }, + { + "epoch": 0.07695784565610429, + "grad_norm": 1.5789201259613037, + "learning_rate": 4.9273003613704456e-05, + "loss": 5.4514, + "step": 12940 + }, + { + "epoch": 0.0769637929393853, + "grad_norm": 1.6153863668441772, + "learning_rate": 4.9272891784407034e-05, + "loss": 5.4343, + "step": 12941 + }, + { + "epoch": 0.07696974022266628, + "grad_norm": 1.8802043199539185, + "learning_rate": 4.927277994663619e-05, + "loss": 5.4691, + "step": 12942 + }, + { + "epoch": 0.07697568750594729, + "grad_norm": 1.869836688041687, + "learning_rate": 4.9272668100391984e-05, + "loss": 5.5037, + "step": 12943 + }, + { + "epoch": 0.07698163478922829, + "grad_norm": 1.9082410335540771, + "learning_rate": 4.927255624567443e-05, + "loss": 5.4814, + "step": 12944 + }, + { + "epoch": 0.07698758207250928, + "grad_norm": 1.5890675783157349, + "learning_rate": 4.927244438248358e-05, + "loss": 5.4627, + "step": 12945 + }, + { + "epoch": 0.07699352935579028, + "grad_norm": 1.7432551383972168, + "learning_rate": 4.9272332510819475e-05, + "loss": 5.4301, + "step": 12946 + }, + { + "epoch": 0.07699947663907127, + "grad_norm": 1.7112667560577393, + "learning_rate": 4.927222063068214e-05, + "loss": 5.4028, + "step": 12947 + }, + { + "epoch": 0.07700542392235227, + "grad_norm": 1.7046465873718262, + "learning_rate": 4.9272108742071634e-05, + "loss": 5.4688, + "step": 12948 + }, + { + "epoch": 0.07701137120563327, + "grad_norm": 1.6928964853286743, + "learning_rate": 4.927199684498798e-05, + "loss": 5.4553, + "step": 12949 + }, + { + "epoch": 0.07701731848891426, + "grad_norm": 1.8731732368469238, + "learning_rate": 4.927188493943122e-05, + "loss": 5.3542, + "step": 12950 + }, + { + "epoch": 0.07702326577219526, + "grad_norm": 1.6586295366287231, + "learning_rate": 4.92717730254014e-05, + "loss": 5.2852, + "step": 12951 + }, + { + "epoch": 0.07702921305547626, + "grad_norm": 1.724252462387085, + "learning_rate": 4.927166110289855e-05, + "loss": 5.3982, + "step": 12952 + }, + { + "epoch": 0.07703516033875725, + "grad_norm": 1.7133373022079468, + "learning_rate": 4.9271549171922716e-05, + "loss": 5.3642, + "step": 12953 + }, + { + "epoch": 0.07704110762203825, + "grad_norm": 1.779291033744812, + "learning_rate": 4.927143723247394e-05, + "loss": 5.3949, + "step": 12954 + }, + { + "epoch": 0.07704705490531925, + "grad_norm": 1.8439239263534546, + "learning_rate": 4.927132528455225e-05, + "loss": 5.3829, + "step": 12955 + }, + { + "epoch": 0.07705300218860024, + "grad_norm": 1.7440255880355835, + "learning_rate": 4.927121332815769e-05, + "loss": 5.3881, + "step": 12956 + }, + { + "epoch": 0.07705894947188124, + "grad_norm": 1.8459028005599976, + "learning_rate": 4.927110136329031e-05, + "loss": 5.3575, + "step": 12957 + }, + { + "epoch": 0.07706489675516225, + "grad_norm": 2.8051815032958984, + "learning_rate": 4.927098938995013e-05, + "loss": 5.2814, + "step": 12958 + }, + { + "epoch": 0.07707084403844323, + "grad_norm": 1.8814127445220947, + "learning_rate": 4.9270877408137194e-05, + "loss": 5.3614, + "step": 12959 + }, + { + "epoch": 0.07707679132172424, + "grad_norm": 1.570408821105957, + "learning_rate": 4.927076541785156e-05, + "loss": 5.3453, + "step": 12960 + }, + { + "epoch": 0.07708273860500524, + "grad_norm": 1.607393741607666, + "learning_rate": 4.927065341909324e-05, + "loss": 5.4766, + "step": 12961 + }, + { + "epoch": 0.07708868588828623, + "grad_norm": 1.475420594215393, + "learning_rate": 4.927054141186229e-05, + "loss": 5.4511, + "step": 12962 + }, + { + "epoch": 0.07709463317156723, + "grad_norm": 1.7785848379135132, + "learning_rate": 4.927042939615875e-05, + "loss": 5.3839, + "step": 12963 + }, + { + "epoch": 0.07710058045484823, + "grad_norm": 1.7313402891159058, + "learning_rate": 4.9270317371982645e-05, + "loss": 5.3398, + "step": 12964 + }, + { + "epoch": 0.07710652773812922, + "grad_norm": 1.666938066482544, + "learning_rate": 4.927020533933403e-05, + "loss": 5.4462, + "step": 12965 + }, + { + "epoch": 0.07711247502141022, + "grad_norm": 1.5219112634658813, + "learning_rate": 4.9270093298212933e-05, + "loss": 5.7593, + "step": 12966 + }, + { + "epoch": 0.07711842230469122, + "grad_norm": 2.0760631561279297, + "learning_rate": 4.92699812486194e-05, + "loss": 5.5765, + "step": 12967 + }, + { + "epoch": 0.07712436958797221, + "grad_norm": 1.7648851871490479, + "learning_rate": 4.926986919055346e-05, + "loss": 5.8786, + "step": 12968 + }, + { + "epoch": 0.07713031687125321, + "grad_norm": 1.832141399383545, + "learning_rate": 4.926975712401517e-05, + "loss": 5.6695, + "step": 12969 + }, + { + "epoch": 0.07713626415453421, + "grad_norm": 1.9032765626907349, + "learning_rate": 4.926964504900455e-05, + "loss": 5.701, + "step": 12970 + }, + { + "epoch": 0.0771422114378152, + "grad_norm": 1.7294973134994507, + "learning_rate": 4.9269532965521656e-05, + "loss": 5.6569, + "step": 12971 + }, + { + "epoch": 0.0771481587210962, + "grad_norm": 1.927510142326355, + "learning_rate": 4.926942087356651e-05, + "loss": 5.1289, + "step": 12972 + }, + { + "epoch": 0.07715410600437721, + "grad_norm": 1.6945842504501343, + "learning_rate": 4.926930877313917e-05, + "loss": 5.5703, + "step": 12973 + }, + { + "epoch": 0.0771600532876582, + "grad_norm": 1.7665363550186157, + "learning_rate": 4.926919666423966e-05, + "loss": 5.822, + "step": 12974 + }, + { + "epoch": 0.0771660005709392, + "grad_norm": 1.5802277326583862, + "learning_rate": 4.926908454686801e-05, + "loss": 5.5438, + "step": 12975 + }, + { + "epoch": 0.07717194785422019, + "grad_norm": 1.9065684080123901, + "learning_rate": 4.9268972421024295e-05, + "loss": 5.5556, + "step": 12976 + }, + { + "epoch": 0.07717789513750119, + "grad_norm": 1.7630208730697632, + "learning_rate": 4.9268860286708526e-05, + "loss": 5.6079, + "step": 12977 + }, + { + "epoch": 0.07718384242078219, + "grad_norm": 1.6295850276947021, + "learning_rate": 4.9268748143920746e-05, + "loss": 5.6163, + "step": 12978 + }, + { + "epoch": 0.07718978970406318, + "grad_norm": 1.753202199935913, + "learning_rate": 4.926863599266099e-05, + "loss": 5.549, + "step": 12979 + }, + { + "epoch": 0.07719573698734418, + "grad_norm": 1.7823643684387207, + "learning_rate": 4.9268523832929314e-05, + "loss": 5.6917, + "step": 12980 + }, + { + "epoch": 0.07720168427062518, + "grad_norm": 1.7990792989730835, + "learning_rate": 4.926841166472574e-05, + "loss": 5.5897, + "step": 12981 + }, + { + "epoch": 0.07720763155390617, + "grad_norm": 1.7813109159469604, + "learning_rate": 4.926829948805033e-05, + "loss": 5.5953, + "step": 12982 + }, + { + "epoch": 0.07721357883718717, + "grad_norm": 1.7127541303634644, + "learning_rate": 4.926818730290309e-05, + "loss": 5.5476, + "step": 12983 + }, + { + "epoch": 0.07721952612046817, + "grad_norm": 2.0513558387756348, + "learning_rate": 4.9268075109284084e-05, + "loss": 5.5721, + "step": 12984 + }, + { + "epoch": 0.07722547340374916, + "grad_norm": 1.8053756952285767, + "learning_rate": 4.9267962907193346e-05, + "loss": 5.5344, + "step": 12985 + }, + { + "epoch": 0.07723142068703016, + "grad_norm": 1.7184503078460693, + "learning_rate": 4.9267850696630904e-05, + "loss": 5.602, + "step": 12986 + }, + { + "epoch": 0.07723736797031117, + "grad_norm": 1.8753174543380737, + "learning_rate": 4.926773847759682e-05, + "loss": 5.701, + "step": 12987 + }, + { + "epoch": 0.07724331525359215, + "grad_norm": 1.7761272192001343, + "learning_rate": 4.9267626250091106e-05, + "loss": 5.5026, + "step": 12988 + }, + { + "epoch": 0.07724926253687316, + "grad_norm": 1.6833654642105103, + "learning_rate": 4.926751401411381e-05, + "loss": 5.5615, + "step": 12989 + }, + { + "epoch": 0.07725520982015416, + "grad_norm": 1.8640247583389282, + "learning_rate": 4.926740176966499e-05, + "loss": 5.8367, + "step": 12990 + }, + { + "epoch": 0.07726115710343515, + "grad_norm": 2.036540985107422, + "learning_rate": 4.9267289516744665e-05, + "loss": 5.6258, + "step": 12991 + }, + { + "epoch": 0.07726710438671615, + "grad_norm": 2.0168917179107666, + "learning_rate": 4.926717725535288e-05, + "loss": 5.1961, + "step": 12992 + }, + { + "epoch": 0.07727305166999715, + "grad_norm": 2.149548292160034, + "learning_rate": 4.9267064985489674e-05, + "loss": 5.1735, + "step": 12993 + }, + { + "epoch": 0.07727899895327814, + "grad_norm": 1.7929832935333252, + "learning_rate": 4.926695270715508e-05, + "loss": 5.6889, + "step": 12994 + }, + { + "epoch": 0.07728494623655914, + "grad_norm": 1.7964575290679932, + "learning_rate": 4.926684042034916e-05, + "loss": 5.0576, + "step": 12995 + }, + { + "epoch": 0.07729089351984014, + "grad_norm": 1.8207305669784546, + "learning_rate": 4.926672812507192e-05, + "loss": 5.2703, + "step": 12996 + }, + { + "epoch": 0.07729684080312113, + "grad_norm": 1.6263490915298462, + "learning_rate": 4.9266615821323425e-05, + "loss": 5.5999, + "step": 12997 + }, + { + "epoch": 0.07730278808640213, + "grad_norm": 2.0018131732940674, + "learning_rate": 4.92665035091037e-05, + "loss": 4.9439, + "step": 12998 + }, + { + "epoch": 0.07730873536968313, + "grad_norm": 2.32818341255188, + "learning_rate": 4.926639118841279e-05, + "loss": 4.6071, + "step": 12999 + }, + { + "epoch": 0.07731468265296412, + "grad_norm": 2.3354949951171875, + "learning_rate": 4.926627885925074e-05, + "loss": 4.6642, + "step": 13000 + }, + { + "epoch": 0.07732062993624512, + "grad_norm": 1.71230149269104, + "learning_rate": 4.926616652161757e-05, + "loss": 5.161, + "step": 13001 + }, + { + "epoch": 0.07732657721952613, + "grad_norm": 1.4890326261520386, + "learning_rate": 4.9266054175513345e-05, + "loss": 5.1714, + "step": 13002 + }, + { + "epoch": 0.07733252450280712, + "grad_norm": 1.5844224691390991, + "learning_rate": 4.926594182093809e-05, + "loss": 4.869, + "step": 13003 + }, + { + "epoch": 0.07733847178608812, + "grad_norm": 2.328636884689331, + "learning_rate": 4.926582945789185e-05, + "loss": 5.1571, + "step": 13004 + }, + { + "epoch": 0.0773444190693691, + "grad_norm": 2.067760467529297, + "learning_rate": 4.926571708637464e-05, + "loss": 5.4416, + "step": 13005 + }, + { + "epoch": 0.07735036635265011, + "grad_norm": 1.7148468494415283, + "learning_rate": 4.926560470638653e-05, + "loss": 5.464, + "step": 13006 + }, + { + "epoch": 0.07735631363593111, + "grad_norm": 1.6869080066680908, + "learning_rate": 4.926549231792755e-05, + "loss": 5.5537, + "step": 13007 + }, + { + "epoch": 0.0773622609192121, + "grad_norm": 2.239408254623413, + "learning_rate": 4.9265379920997735e-05, + "loss": 5.1551, + "step": 13008 + }, + { + "epoch": 0.0773682082024931, + "grad_norm": 2.4059038162231445, + "learning_rate": 4.926526751559713e-05, + "loss": 5.2639, + "step": 13009 + }, + { + "epoch": 0.0773741554857741, + "grad_norm": 2.0787813663482666, + "learning_rate": 4.926515510172577e-05, + "loss": 5.3485, + "step": 13010 + }, + { + "epoch": 0.07738010276905509, + "grad_norm": 1.912137508392334, + "learning_rate": 4.9265042679383685e-05, + "loss": 5.551, + "step": 13011 + }, + { + "epoch": 0.07738605005233609, + "grad_norm": 2.0865983963012695, + "learning_rate": 4.926493024857094e-05, + "loss": 5.0343, + "step": 13012 + }, + { + "epoch": 0.0773919973356171, + "grad_norm": 1.9341247081756592, + "learning_rate": 4.926481780928754e-05, + "loss": 5.5904, + "step": 13013 + }, + { + "epoch": 0.07739794461889808, + "grad_norm": 1.7777684926986694, + "learning_rate": 4.926470536153356e-05, + "loss": 5.5396, + "step": 13014 + }, + { + "epoch": 0.07740389190217908, + "grad_norm": 1.7952098846435547, + "learning_rate": 4.926459290530902e-05, + "loss": 5.3212, + "step": 13015 + }, + { + "epoch": 0.07740983918546009, + "grad_norm": 1.7674907445907593, + "learning_rate": 4.926448044061396e-05, + "loss": 5.3316, + "step": 13016 + }, + { + "epoch": 0.07741578646874107, + "grad_norm": 1.8327823877334595, + "learning_rate": 4.926436796744841e-05, + "loss": 5.3129, + "step": 13017 + }, + { + "epoch": 0.07742173375202208, + "grad_norm": 1.613867998123169, + "learning_rate": 4.9264255485812425e-05, + "loss": 5.4935, + "step": 13018 + }, + { + "epoch": 0.07742768103530308, + "grad_norm": 1.7167906761169434, + "learning_rate": 4.9264142995706044e-05, + "loss": 5.3054, + "step": 13019 + }, + { + "epoch": 0.07743362831858407, + "grad_norm": 2.272038698196411, + "learning_rate": 4.92640304971293e-05, + "loss": 5.1327, + "step": 13020 + }, + { + "epoch": 0.07743957560186507, + "grad_norm": 1.6358660459518433, + "learning_rate": 4.926391799008223e-05, + "loss": 5.3285, + "step": 13021 + }, + { + "epoch": 0.07744552288514607, + "grad_norm": 2.166813373565674, + "learning_rate": 4.926380547456488e-05, + "loss": 5.2846, + "step": 13022 + }, + { + "epoch": 0.07745147016842706, + "grad_norm": 2.3251235485076904, + "learning_rate": 4.926369295057729e-05, + "loss": 5.2482, + "step": 13023 + }, + { + "epoch": 0.07745741745170806, + "grad_norm": 1.9402974843978882, + "learning_rate": 4.926358041811949e-05, + "loss": 5.3514, + "step": 13024 + }, + { + "epoch": 0.07746336473498906, + "grad_norm": 2.1346986293792725, + "learning_rate": 4.9263467877191525e-05, + "loss": 5.1912, + "step": 13025 + }, + { + "epoch": 0.07746931201827005, + "grad_norm": 2.0809762477874756, + "learning_rate": 4.926335532779344e-05, + "loss": 5.0547, + "step": 13026 + }, + { + "epoch": 0.07747525930155105, + "grad_norm": 2.110558032989502, + "learning_rate": 4.9263242769925256e-05, + "loss": 5.2177, + "step": 13027 + }, + { + "epoch": 0.07748120658483205, + "grad_norm": 2.3498575687408447, + "learning_rate": 4.926313020358704e-05, + "loss": 4.9997, + "step": 13028 + }, + { + "epoch": 0.07748715386811304, + "grad_norm": 2.4052765369415283, + "learning_rate": 4.92630176287788e-05, + "loss": 4.9736, + "step": 13029 + }, + { + "epoch": 0.07749310115139404, + "grad_norm": 2.3132238388061523, + "learning_rate": 4.9262905045500603e-05, + "loss": 4.9149, + "step": 13030 + }, + { + "epoch": 0.07749904843467505, + "grad_norm": 2.315483331680298, + "learning_rate": 4.926279245375247e-05, + "loss": 4.9096, + "step": 13031 + }, + { + "epoch": 0.07750499571795604, + "grad_norm": 2.0887367725372314, + "learning_rate": 4.926267985353445e-05, + "loss": 5.3274, + "step": 13032 + }, + { + "epoch": 0.07751094300123704, + "grad_norm": 2.3138368129730225, + "learning_rate": 4.926256724484658e-05, + "loss": 4.8627, + "step": 13033 + }, + { + "epoch": 0.07751689028451804, + "grad_norm": 2.348411798477173, + "learning_rate": 4.926245462768889e-05, + "loss": 4.9815, + "step": 13034 + }, + { + "epoch": 0.07752283756779903, + "grad_norm": 1.7357233762741089, + "learning_rate": 4.926234200206144e-05, + "loss": 5.2836, + "step": 13035 + }, + { + "epoch": 0.07752878485108003, + "grad_norm": 1.8633183240890503, + "learning_rate": 4.9262229367964255e-05, + "loss": 5.1838, + "step": 13036 + }, + { + "epoch": 0.07753473213436102, + "grad_norm": 1.736359715461731, + "learning_rate": 4.926211672539737e-05, + "loss": 5.6746, + "step": 13037 + }, + { + "epoch": 0.07754067941764202, + "grad_norm": 2.368511915206909, + "learning_rate": 4.9262004074360834e-05, + "loss": 4.5786, + "step": 13038 + }, + { + "epoch": 0.07754662670092302, + "grad_norm": 1.859297752380371, + "learning_rate": 4.926189141485468e-05, + "loss": 5.8459, + "step": 13039 + }, + { + "epoch": 0.07755257398420401, + "grad_norm": 2.2050845623016357, + "learning_rate": 4.9261778746878955e-05, + "loss": 5.8982, + "step": 13040 + }, + { + "epoch": 0.07755852126748501, + "grad_norm": 1.7485835552215576, + "learning_rate": 4.926166607043369e-05, + "loss": 5.789, + "step": 13041 + }, + { + "epoch": 0.07756446855076601, + "grad_norm": 1.7780888080596924, + "learning_rate": 4.9261553385518936e-05, + "loss": 5.48, + "step": 13042 + }, + { + "epoch": 0.077570415834047, + "grad_norm": 1.8764269351959229, + "learning_rate": 4.9261440692134716e-05, + "loss": 5.093, + "step": 13043 + }, + { + "epoch": 0.077576363117328, + "grad_norm": 1.784196376800537, + "learning_rate": 4.926132799028108e-05, + "loss": 5.4335, + "step": 13044 + }, + { + "epoch": 0.077582310400609, + "grad_norm": 2.173844337463379, + "learning_rate": 4.926121527995806e-05, + "loss": 4.5078, + "step": 13045 + }, + { + "epoch": 0.07758825768389, + "grad_norm": 2.410778045654297, + "learning_rate": 4.9261102561165705e-05, + "loss": 5.2113, + "step": 13046 + }, + { + "epoch": 0.077594204967171, + "grad_norm": 2.0470073223114014, + "learning_rate": 4.9260989833904057e-05, + "loss": 5.4695, + "step": 13047 + }, + { + "epoch": 0.077600152250452, + "grad_norm": 1.619314193725586, + "learning_rate": 4.926087709817314e-05, + "loss": 5.8778, + "step": 13048 + }, + { + "epoch": 0.07760609953373299, + "grad_norm": 2.2353031635284424, + "learning_rate": 4.9260764353973e-05, + "loss": 5.2482, + "step": 13049 + }, + { + "epoch": 0.07761204681701399, + "grad_norm": 2.0858941078186035, + "learning_rate": 4.926065160130369e-05, + "loss": 5.2752, + "step": 13050 + }, + { + "epoch": 0.07761799410029499, + "grad_norm": 2.275660514831543, + "learning_rate": 4.926053884016522e-05, + "loss": 5.004, + "step": 13051 + }, + { + "epoch": 0.07762394138357598, + "grad_norm": 1.9338358640670776, + "learning_rate": 4.926042607055765e-05, + "loss": 5.4688, + "step": 13052 + }, + { + "epoch": 0.07762988866685698, + "grad_norm": 1.7377573251724243, + "learning_rate": 4.926031329248103e-05, + "loss": 5.6429, + "step": 13053 + }, + { + "epoch": 0.07763583595013798, + "grad_norm": 1.8915661573410034, + "learning_rate": 4.9260200505935374e-05, + "loss": 5.543, + "step": 13054 + }, + { + "epoch": 0.07764178323341897, + "grad_norm": 1.7961910963058472, + "learning_rate": 4.926008771092073e-05, + "loss": 5.4245, + "step": 13055 + }, + { + "epoch": 0.07764773051669997, + "grad_norm": 1.9412139654159546, + "learning_rate": 4.9259974907437145e-05, + "loss": 5.5858, + "step": 13056 + }, + { + "epoch": 0.07765367779998097, + "grad_norm": 2.458508253097534, + "learning_rate": 4.925986209548466e-05, + "loss": 5.3307, + "step": 13057 + }, + { + "epoch": 0.07765962508326196, + "grad_norm": 2.23331880569458, + "learning_rate": 4.92597492750633e-05, + "loss": 5.6979, + "step": 13058 + }, + { + "epoch": 0.07766557236654296, + "grad_norm": 2.38264536857605, + "learning_rate": 4.9259636446173104e-05, + "loss": 5.5771, + "step": 13059 + }, + { + "epoch": 0.07767151964982397, + "grad_norm": 2.0892632007598877, + "learning_rate": 4.925952360881413e-05, + "loss": 5.8596, + "step": 13060 + }, + { + "epoch": 0.07767746693310495, + "grad_norm": 1.82732355594635, + "learning_rate": 4.92594107629864e-05, + "loss": 5.3724, + "step": 13061 + }, + { + "epoch": 0.07768341421638596, + "grad_norm": 1.821089506149292, + "learning_rate": 4.925929790868997e-05, + "loss": 5.6499, + "step": 13062 + }, + { + "epoch": 0.07768936149966696, + "grad_norm": 1.9662789106369019, + "learning_rate": 4.925918504592487e-05, + "loss": 5.5132, + "step": 13063 + }, + { + "epoch": 0.07769530878294795, + "grad_norm": 1.830101490020752, + "learning_rate": 4.925907217469113e-05, + "loss": 5.4492, + "step": 13064 + }, + { + "epoch": 0.07770125606622895, + "grad_norm": 1.8362375497817993, + "learning_rate": 4.9258959294988804e-05, + "loss": 5.8314, + "step": 13065 + }, + { + "epoch": 0.07770720334950994, + "grad_norm": 2.23861026763916, + "learning_rate": 4.9258846406817926e-05, + "loss": 6.2564, + "step": 13066 + }, + { + "epoch": 0.07771315063279094, + "grad_norm": 2.2672650814056396, + "learning_rate": 4.9258733510178536e-05, + "loss": 6.3396, + "step": 13067 + }, + { + "epoch": 0.07771909791607194, + "grad_norm": 1.8667620420455933, + "learning_rate": 4.9258620605070665e-05, + "loss": 5.8509, + "step": 13068 + }, + { + "epoch": 0.07772504519935293, + "grad_norm": 1.7386364936828613, + "learning_rate": 4.925850769149436e-05, + "loss": 5.567, + "step": 13069 + }, + { + "epoch": 0.07773099248263393, + "grad_norm": 1.3638315200805664, + "learning_rate": 4.9258394769449675e-05, + "loss": 5.6892, + "step": 13070 + }, + { + "epoch": 0.07773693976591493, + "grad_norm": 1.7117588520050049, + "learning_rate": 4.9258281838936624e-05, + "loss": 5.461, + "step": 13071 + }, + { + "epoch": 0.07774288704919592, + "grad_norm": 1.7597805261611938, + "learning_rate": 4.925816889995526e-05, + "loss": 5.6783, + "step": 13072 + }, + { + "epoch": 0.07774883433247692, + "grad_norm": 1.8734283447265625, + "learning_rate": 4.9258055952505624e-05, + "loss": 5.633, + "step": 13073 + }, + { + "epoch": 0.07775478161575793, + "grad_norm": 1.5552877187728882, + "learning_rate": 4.9257942996587744e-05, + "loss": 5.8804, + "step": 13074 + }, + { + "epoch": 0.07776072889903891, + "grad_norm": 1.2786669731140137, + "learning_rate": 4.925783003220167e-05, + "loss": 5.3208, + "step": 13075 + }, + { + "epoch": 0.07776667618231992, + "grad_norm": 1.558182954788208, + "learning_rate": 4.925771705934744e-05, + "loss": 5.4023, + "step": 13076 + }, + { + "epoch": 0.07777262346560092, + "grad_norm": 1.3482223749160767, + "learning_rate": 4.925760407802509e-05, + "loss": 5.3879, + "step": 13077 + }, + { + "epoch": 0.0777785707488819, + "grad_norm": 1.5111918449401855, + "learning_rate": 4.925749108823466e-05, + "loss": 5.329, + "step": 13078 + }, + { + "epoch": 0.07778451803216291, + "grad_norm": 1.7119463682174683, + "learning_rate": 4.925737808997619e-05, + "loss": 5.7282, + "step": 13079 + }, + { + "epoch": 0.07779046531544391, + "grad_norm": 1.7753342390060425, + "learning_rate": 4.925726508324972e-05, + "loss": 5.2677, + "step": 13080 + }, + { + "epoch": 0.0777964125987249, + "grad_norm": 1.8957557678222656, + "learning_rate": 4.925715206805529e-05, + "loss": 4.7193, + "step": 13081 + }, + { + "epoch": 0.0778023598820059, + "grad_norm": 2.503037214279175, + "learning_rate": 4.9257039044392935e-05, + "loss": 5.034, + "step": 13082 + }, + { + "epoch": 0.0778083071652869, + "grad_norm": 2.031312942504883, + "learning_rate": 4.92569260122627e-05, + "loss": 5.1982, + "step": 13083 + }, + { + "epoch": 0.07781425444856789, + "grad_norm": 1.8345115184783936, + "learning_rate": 4.9256812971664635e-05, + "loss": 5.6059, + "step": 13084 + }, + { + "epoch": 0.07782020173184889, + "grad_norm": 2.134131669998169, + "learning_rate": 4.925669992259875e-05, + "loss": 5.8174, + "step": 13085 + }, + { + "epoch": 0.0778261490151299, + "grad_norm": 1.9598990678787231, + "learning_rate": 4.9256586865065114e-05, + "loss": 5.76, + "step": 13086 + }, + { + "epoch": 0.07783209629841088, + "grad_norm": 1.8105463981628418, + "learning_rate": 4.925647379906375e-05, + "loss": 5.5112, + "step": 13087 + }, + { + "epoch": 0.07783804358169188, + "grad_norm": 1.5290614366531372, + "learning_rate": 4.9256360724594696e-05, + "loss": 5.7122, + "step": 13088 + }, + { + "epoch": 0.07784399086497289, + "grad_norm": 1.6188294887542725, + "learning_rate": 4.9256247641658005e-05, + "loss": 5.58, + "step": 13089 + }, + { + "epoch": 0.07784993814825387, + "grad_norm": 1.8662221431732178, + "learning_rate": 4.925613455025371e-05, + "loss": 5.4975, + "step": 13090 + }, + { + "epoch": 0.07785588543153488, + "grad_norm": 1.808813452720642, + "learning_rate": 4.925602145038184e-05, + "loss": 5.6704, + "step": 13091 + }, + { + "epoch": 0.07786183271481588, + "grad_norm": 1.776418924331665, + "learning_rate": 4.925590834204245e-05, + "loss": 5.7558, + "step": 13092 + }, + { + "epoch": 0.07786777999809687, + "grad_norm": 1.704537034034729, + "learning_rate": 4.925579522523557e-05, + "loss": 5.6667, + "step": 13093 + }, + { + "epoch": 0.07787372728137787, + "grad_norm": 2.115651845932007, + "learning_rate": 4.9255682099961246e-05, + "loss": 5.5823, + "step": 13094 + }, + { + "epoch": 0.07787967456465886, + "grad_norm": 1.851914882659912, + "learning_rate": 4.9255568966219504e-05, + "loss": 5.6749, + "step": 13095 + }, + { + "epoch": 0.07788562184793986, + "grad_norm": 1.8792526721954346, + "learning_rate": 4.92554558240104e-05, + "loss": 5.8539, + "step": 13096 + }, + { + "epoch": 0.07789156913122086, + "grad_norm": 1.805280327796936, + "learning_rate": 4.925534267333397e-05, + "loss": 5.8522, + "step": 13097 + }, + { + "epoch": 0.07789751641450185, + "grad_norm": 1.7457916736602783, + "learning_rate": 4.925522951419025e-05, + "loss": 5.9419, + "step": 13098 + }, + { + "epoch": 0.07790346369778285, + "grad_norm": 1.6427416801452637, + "learning_rate": 4.925511634657928e-05, + "loss": 5.8924, + "step": 13099 + }, + { + "epoch": 0.07790941098106385, + "grad_norm": 1.7034873962402344, + "learning_rate": 4.9255003170501095e-05, + "loss": 5.8701, + "step": 13100 + }, + { + "epoch": 0.07791535826434484, + "grad_norm": 1.6852953433990479, + "learning_rate": 4.925488998595574e-05, + "loss": 5.771, + "step": 13101 + }, + { + "epoch": 0.07792130554762584, + "grad_norm": 1.6478735208511353, + "learning_rate": 4.9254776792943255e-05, + "loss": 5.4274, + "step": 13102 + }, + { + "epoch": 0.07792725283090685, + "grad_norm": 1.5896925926208496, + "learning_rate": 4.925466359146368e-05, + "loss": 5.8217, + "step": 13103 + }, + { + "epoch": 0.07793320011418783, + "grad_norm": 1.649539828300476, + "learning_rate": 4.9254550381517054e-05, + "loss": 5.7899, + "step": 13104 + }, + { + "epoch": 0.07793914739746884, + "grad_norm": 1.5224459171295166, + "learning_rate": 4.925443716310341e-05, + "loss": 5.7931, + "step": 13105 + }, + { + "epoch": 0.07794509468074984, + "grad_norm": 2.009038209915161, + "learning_rate": 4.9254323936222796e-05, + "loss": 5.854, + "step": 13106 + }, + { + "epoch": 0.07795104196403083, + "grad_norm": 1.5545878410339355, + "learning_rate": 4.9254210700875245e-05, + "loss": 5.7212, + "step": 13107 + }, + { + "epoch": 0.07795698924731183, + "grad_norm": 2.0804193019866943, + "learning_rate": 4.92540974570608e-05, + "loss": 5.7195, + "step": 13108 + }, + { + "epoch": 0.07796293653059283, + "grad_norm": 1.940432071685791, + "learning_rate": 4.92539842047795e-05, + "loss": 5.4998, + "step": 13109 + }, + { + "epoch": 0.07796888381387382, + "grad_norm": 2.3788061141967773, + "learning_rate": 4.925387094403139e-05, + "loss": 5.5975, + "step": 13110 + }, + { + "epoch": 0.07797483109715482, + "grad_norm": 1.6193798780441284, + "learning_rate": 4.92537576748165e-05, + "loss": 5.4489, + "step": 13111 + }, + { + "epoch": 0.07798077838043582, + "grad_norm": 1.7056760787963867, + "learning_rate": 4.9253644397134866e-05, + "loss": 5.5584, + "step": 13112 + }, + { + "epoch": 0.07798672566371681, + "grad_norm": 1.2604116201400757, + "learning_rate": 4.925353111098655e-05, + "loss": 5.5681, + "step": 13113 + }, + { + "epoch": 0.07799267294699781, + "grad_norm": 1.305413842201233, + "learning_rate": 4.925341781637157e-05, + "loss": 5.6966, + "step": 13114 + }, + { + "epoch": 0.07799862023027881, + "grad_norm": 2.6248581409454346, + "learning_rate": 4.9253304513289975e-05, + "loss": 5.3666, + "step": 13115 + }, + { + "epoch": 0.0780045675135598, + "grad_norm": 1.687741994857788, + "learning_rate": 4.92531912017418e-05, + "loss": 5.5511, + "step": 13116 + }, + { + "epoch": 0.0780105147968408, + "grad_norm": 1.5827749967575073, + "learning_rate": 4.9253077881727086e-05, + "loss": 5.3363, + "step": 13117 + }, + { + "epoch": 0.0780164620801218, + "grad_norm": 1.5989108085632324, + "learning_rate": 4.925296455324587e-05, + "loss": 5.472, + "step": 13118 + }, + { + "epoch": 0.0780224093634028, + "grad_norm": 1.5687717199325562, + "learning_rate": 4.9252851216298194e-05, + "loss": 5.6894, + "step": 13119 + }, + { + "epoch": 0.0780283566466838, + "grad_norm": 1.312949538230896, + "learning_rate": 4.9252737870884106e-05, + "loss": 5.6735, + "step": 13120 + }, + { + "epoch": 0.0780343039299648, + "grad_norm": 1.5779353380203247, + "learning_rate": 4.925262451700363e-05, + "loss": 5.3281, + "step": 13121 + }, + { + "epoch": 0.07804025121324579, + "grad_norm": 1.6127909421920776, + "learning_rate": 4.9252511154656825e-05, + "loss": 5.27, + "step": 13122 + }, + { + "epoch": 0.07804619849652679, + "grad_norm": 1.6496199369430542, + "learning_rate": 4.925239778384371e-05, + "loss": 5.4913, + "step": 13123 + }, + { + "epoch": 0.07805214577980778, + "grad_norm": 2.394230842590332, + "learning_rate": 4.925228440456433e-05, + "loss": 5.1788, + "step": 13124 + }, + { + "epoch": 0.07805809306308878, + "grad_norm": 2.169250249862671, + "learning_rate": 4.925217101681873e-05, + "loss": 5.4087, + "step": 13125 + }, + { + "epoch": 0.07806404034636978, + "grad_norm": 2.150338649749756, + "learning_rate": 4.925205762060695e-05, + "loss": 5.5004, + "step": 13126 + }, + { + "epoch": 0.07806998762965077, + "grad_norm": 2.0131516456604004, + "learning_rate": 4.925194421592903e-05, + "loss": 5.5791, + "step": 13127 + }, + { + "epoch": 0.07807593491293177, + "grad_norm": 1.8154455423355103, + "learning_rate": 4.925183080278501e-05, + "loss": 5.5479, + "step": 13128 + }, + { + "epoch": 0.07808188219621277, + "grad_norm": 1.7489157915115356, + "learning_rate": 4.925171738117492e-05, + "loss": 5.7169, + "step": 13129 + }, + { + "epoch": 0.07808782947949376, + "grad_norm": 1.6712158918380737, + "learning_rate": 4.92516039510988e-05, + "loss": 6.0751, + "step": 13130 + }, + { + "epoch": 0.07809377676277476, + "grad_norm": 1.7542296648025513, + "learning_rate": 4.9251490512556706e-05, + "loss": 5.8998, + "step": 13131 + }, + { + "epoch": 0.07809972404605577, + "grad_norm": 1.5962193012237549, + "learning_rate": 4.9251377065548666e-05, + "loss": 5.7781, + "step": 13132 + }, + { + "epoch": 0.07810567132933675, + "grad_norm": 1.783756136894226, + "learning_rate": 4.9251263610074714e-05, + "loss": 5.8384, + "step": 13133 + }, + { + "epoch": 0.07811161861261776, + "grad_norm": 1.6608144044876099, + "learning_rate": 4.92511501461349e-05, + "loss": 5.7603, + "step": 13134 + }, + { + "epoch": 0.07811756589589876, + "grad_norm": 1.8659160137176514, + "learning_rate": 4.925103667372926e-05, + "loss": 5.5039, + "step": 13135 + }, + { + "epoch": 0.07812351317917975, + "grad_norm": 1.591565489768982, + "learning_rate": 4.925092319285783e-05, + "loss": 5.7034, + "step": 13136 + }, + { + "epoch": 0.07812946046246075, + "grad_norm": 1.5772358179092407, + "learning_rate": 4.925080970352066e-05, + "loss": 5.6347, + "step": 13137 + }, + { + "epoch": 0.07813540774574175, + "grad_norm": 1.7196561098098755, + "learning_rate": 4.925069620571778e-05, + "loss": 5.7086, + "step": 13138 + }, + { + "epoch": 0.07814135502902274, + "grad_norm": 1.9582041501998901, + "learning_rate": 4.9250582699449237e-05, + "loss": 5.9774, + "step": 13139 + }, + { + "epoch": 0.07814730231230374, + "grad_norm": 2.0566928386688232, + "learning_rate": 4.9250469184715064e-05, + "loss": 5.8527, + "step": 13140 + }, + { + "epoch": 0.07815324959558474, + "grad_norm": 1.9961296319961548, + "learning_rate": 4.92503556615153e-05, + "loss": 5.65, + "step": 13141 + }, + { + "epoch": 0.07815919687886573, + "grad_norm": 1.672601342201233, + "learning_rate": 4.925024212984999e-05, + "loss": 5.7242, + "step": 13142 + }, + { + "epoch": 0.07816514416214673, + "grad_norm": 1.6791996955871582, + "learning_rate": 4.9250128589719166e-05, + "loss": 5.7365, + "step": 13143 + }, + { + "epoch": 0.07817109144542773, + "grad_norm": 2.4464364051818848, + "learning_rate": 4.925001504112288e-05, + "loss": 4.9673, + "step": 13144 + }, + { + "epoch": 0.07817703872870872, + "grad_norm": 2.0053181648254395, + "learning_rate": 4.9249901484061156e-05, + "loss": 5.7916, + "step": 13145 + }, + { + "epoch": 0.07818298601198972, + "grad_norm": 2.512120246887207, + "learning_rate": 4.924978791853405e-05, + "loss": 5.914, + "step": 13146 + }, + { + "epoch": 0.07818893329527073, + "grad_norm": 2.2429497241973877, + "learning_rate": 4.924967434454159e-05, + "loss": 5.8806, + "step": 13147 + }, + { + "epoch": 0.07819488057855171, + "grad_norm": 1.9966307878494263, + "learning_rate": 4.924956076208381e-05, + "loss": 5.8883, + "step": 13148 + }, + { + "epoch": 0.07820082786183272, + "grad_norm": 2.492926836013794, + "learning_rate": 4.924944717116077e-05, + "loss": 5.361, + "step": 13149 + }, + { + "epoch": 0.07820677514511372, + "grad_norm": 2.050769090652466, + "learning_rate": 4.92493335717725e-05, + "loss": 5.5682, + "step": 13150 + }, + { + "epoch": 0.07821272242839471, + "grad_norm": 2.2797789573669434, + "learning_rate": 4.9249219963919037e-05, + "loss": 5.8695, + "step": 13151 + }, + { + "epoch": 0.07821866971167571, + "grad_norm": 2.1034891605377197, + "learning_rate": 4.924910634760041e-05, + "loss": 4.987, + "step": 13152 + }, + { + "epoch": 0.0782246169949567, + "grad_norm": 1.7718714475631714, + "learning_rate": 4.924899272281669e-05, + "loss": 5.112, + "step": 13153 + }, + { + "epoch": 0.0782305642782377, + "grad_norm": 1.730656385421753, + "learning_rate": 4.9248879089567884e-05, + "loss": 5.6589, + "step": 13154 + }, + { + "epoch": 0.0782365115615187, + "grad_norm": 1.7784979343414307, + "learning_rate": 4.9248765447854054e-05, + "loss": 5.6812, + "step": 13155 + }, + { + "epoch": 0.07824245884479969, + "grad_norm": 1.5646599531173706, + "learning_rate": 4.9248651797675213e-05, + "loss": 5.7598, + "step": 13156 + }, + { + "epoch": 0.07824840612808069, + "grad_norm": 2.6416964530944824, + "learning_rate": 4.924853813903144e-05, + "loss": 5.9888, + "step": 13157 + }, + { + "epoch": 0.0782543534113617, + "grad_norm": 1.978983998298645, + "learning_rate": 4.924842447192274e-05, + "loss": 5.8919, + "step": 13158 + }, + { + "epoch": 0.07826030069464268, + "grad_norm": 2.3622004985809326, + "learning_rate": 4.924831079634916e-05, + "loss": 5.706, + "step": 13159 + }, + { + "epoch": 0.07826624797792368, + "grad_norm": 2.4118547439575195, + "learning_rate": 4.9248197112310754e-05, + "loss": 5.529, + "step": 13160 + }, + { + "epoch": 0.07827219526120469, + "grad_norm": 1.9290462732315063, + "learning_rate": 4.9248083419807554e-05, + "loss": 5.6403, + "step": 13161 + }, + { + "epoch": 0.07827814254448567, + "grad_norm": 1.9591599702835083, + "learning_rate": 4.92479697188396e-05, + "loss": 5.3365, + "step": 13162 + }, + { + "epoch": 0.07828408982776668, + "grad_norm": 1.7800555229187012, + "learning_rate": 4.9247856009406924e-05, + "loss": 6.4051, + "step": 13163 + }, + { + "epoch": 0.07829003711104768, + "grad_norm": 1.8390953540802002, + "learning_rate": 4.924774229150958e-05, + "loss": 5.775, + "step": 13164 + }, + { + "epoch": 0.07829598439432867, + "grad_norm": 1.8265724182128906, + "learning_rate": 4.924762856514759e-05, + "loss": 6.1238, + "step": 13165 + }, + { + "epoch": 0.07830193167760967, + "grad_norm": 1.5573666095733643, + "learning_rate": 4.9247514830321005e-05, + "loss": 5.9823, + "step": 13166 + }, + { + "epoch": 0.07830787896089067, + "grad_norm": 2.2647573947906494, + "learning_rate": 4.924740108702987e-05, + "loss": 5.0975, + "step": 13167 + }, + { + "epoch": 0.07831382624417166, + "grad_norm": 2.509573459625244, + "learning_rate": 4.924728733527422e-05, + "loss": 5.1327, + "step": 13168 + }, + { + "epoch": 0.07831977352745266, + "grad_norm": 2.2974681854248047, + "learning_rate": 4.924717357505408e-05, + "loss": 5.1493, + "step": 13169 + }, + { + "epoch": 0.07832572081073366, + "grad_norm": 1.958938717842102, + "learning_rate": 4.924705980636951e-05, + "loss": 6.0291, + "step": 13170 + }, + { + "epoch": 0.07833166809401465, + "grad_norm": 1.7714133262634277, + "learning_rate": 4.924694602922054e-05, + "loss": 5.9623, + "step": 13171 + }, + { + "epoch": 0.07833761537729565, + "grad_norm": 1.7545043230056763, + "learning_rate": 4.924683224360721e-05, + "loss": 5.9123, + "step": 13172 + }, + { + "epoch": 0.07834356266057665, + "grad_norm": 1.4791491031646729, + "learning_rate": 4.924671844952957e-05, + "loss": 5.8959, + "step": 13173 + }, + { + "epoch": 0.07834950994385764, + "grad_norm": 1.783353567123413, + "learning_rate": 4.924660464698764e-05, + "loss": 5.732, + "step": 13174 + }, + { + "epoch": 0.07835545722713864, + "grad_norm": 1.9444235563278198, + "learning_rate": 4.9246490835981474e-05, + "loss": 5.5167, + "step": 13175 + }, + { + "epoch": 0.07836140451041965, + "grad_norm": 1.9656537771224976, + "learning_rate": 4.924637701651111e-05, + "loss": 5.4557, + "step": 13176 + }, + { + "epoch": 0.07836735179370063, + "grad_norm": 1.8164803981781006, + "learning_rate": 4.9246263188576594e-05, + "loss": 5.44, + "step": 13177 + }, + { + "epoch": 0.07837329907698164, + "grad_norm": 1.8245429992675781, + "learning_rate": 4.9246149352177946e-05, + "loss": 5.2164, + "step": 13178 + }, + { + "epoch": 0.07837924636026264, + "grad_norm": 1.76225745677948, + "learning_rate": 4.924603550731522e-05, + "loss": 5.2325, + "step": 13179 + }, + { + "epoch": 0.07838519364354363, + "grad_norm": 2.052314519882202, + "learning_rate": 4.924592165398846e-05, + "loss": 5.7905, + "step": 13180 + }, + { + "epoch": 0.07839114092682463, + "grad_norm": 1.63084077835083, + "learning_rate": 4.924580779219769e-05, + "loss": 5.2703, + "step": 13181 + }, + { + "epoch": 0.07839708821010562, + "grad_norm": 1.9269503355026245, + "learning_rate": 4.9245693921942965e-05, + "loss": 5.5974, + "step": 13182 + }, + { + "epoch": 0.07840303549338662, + "grad_norm": 2.201376438140869, + "learning_rate": 4.9245580043224315e-05, + "loss": 5.1298, + "step": 13183 + }, + { + "epoch": 0.07840898277666762, + "grad_norm": 2.3778293132781982, + "learning_rate": 4.924546615604179e-05, + "loss": 5.2289, + "step": 13184 + }, + { + "epoch": 0.07841493005994861, + "grad_norm": 2.5284171104431152, + "learning_rate": 4.9245352260395414e-05, + "loss": 5.0038, + "step": 13185 + }, + { + "epoch": 0.07842087734322961, + "grad_norm": 2.230825424194336, + "learning_rate": 4.9245238356285244e-05, + "loss": 5.0699, + "step": 13186 + }, + { + "epoch": 0.07842682462651061, + "grad_norm": 2.1288161277770996, + "learning_rate": 4.924512444371131e-05, + "loss": 5.1093, + "step": 13187 + }, + { + "epoch": 0.0784327719097916, + "grad_norm": 1.912685751914978, + "learning_rate": 4.924501052267365e-05, + "loss": 5.5926, + "step": 13188 + }, + { + "epoch": 0.0784387191930726, + "grad_norm": 2.394078254699707, + "learning_rate": 4.924489659317231e-05, + "loss": 5.129, + "step": 13189 + }, + { + "epoch": 0.0784446664763536, + "grad_norm": 2.7360801696777344, + "learning_rate": 4.924478265520733e-05, + "loss": 4.9682, + "step": 13190 + }, + { + "epoch": 0.0784506137596346, + "grad_norm": 2.4817416667938232, + "learning_rate": 4.924466870877874e-05, + "loss": 5.0193, + "step": 13191 + }, + { + "epoch": 0.0784565610429156, + "grad_norm": 2.5156679153442383, + "learning_rate": 4.92445547538866e-05, + "loss": 5.0044, + "step": 13192 + }, + { + "epoch": 0.0784625083261966, + "grad_norm": 2.519080638885498, + "learning_rate": 4.924444079053092e-05, + "loss": 5.0109, + "step": 13193 + }, + { + "epoch": 0.07846845560947759, + "grad_norm": 2.3944201469421387, + "learning_rate": 4.924432681871176e-05, + "loss": 5.0032, + "step": 13194 + }, + { + "epoch": 0.07847440289275859, + "grad_norm": 2.4199647903442383, + "learning_rate": 4.924421283842916e-05, + "loss": 4.8158, + "step": 13195 + }, + { + "epoch": 0.07848035017603959, + "grad_norm": 2.4517173767089844, + "learning_rate": 4.924409884968316e-05, + "loss": 4.8194, + "step": 13196 + }, + { + "epoch": 0.07848629745932058, + "grad_norm": 2.231703042984009, + "learning_rate": 4.924398485247379e-05, + "loss": 4.882, + "step": 13197 + }, + { + "epoch": 0.07849224474260158, + "grad_norm": 2.218252182006836, + "learning_rate": 4.924387084680109e-05, + "loss": 4.872, + "step": 13198 + }, + { + "epoch": 0.07849819202588258, + "grad_norm": 2.2126224040985107, + "learning_rate": 4.924375683266511e-05, + "loss": 5.019, + "step": 13199 + }, + { + "epoch": 0.07850413930916357, + "grad_norm": 2.197240114212036, + "learning_rate": 4.924364281006589e-05, + "loss": 4.9801, + "step": 13200 + }, + { + "epoch": 0.07851008659244457, + "grad_norm": 2.11427640914917, + "learning_rate": 4.9243528779003456e-05, + "loss": 4.992, + "step": 13201 + }, + { + "epoch": 0.07851603387572557, + "grad_norm": 1.9424201250076294, + "learning_rate": 4.9243414739477864e-05, + "loss": 4.9275, + "step": 13202 + }, + { + "epoch": 0.07852198115900656, + "grad_norm": 1.897208571434021, + "learning_rate": 4.9243300691489146e-05, + "loss": 5.0482, + "step": 13203 + }, + { + "epoch": 0.07852792844228756, + "grad_norm": 1.7149171829223633, + "learning_rate": 4.924318663503734e-05, + "loss": 5.4713, + "step": 13204 + }, + { + "epoch": 0.07853387572556857, + "grad_norm": 1.770279049873352, + "learning_rate": 4.924307257012248e-05, + "loss": 5.5565, + "step": 13205 + }, + { + "epoch": 0.07853982300884955, + "grad_norm": 2.043506145477295, + "learning_rate": 4.924295849674463e-05, + "loss": 4.9129, + "step": 13206 + }, + { + "epoch": 0.07854577029213056, + "grad_norm": 1.91255521774292, + "learning_rate": 4.92428444149038e-05, + "loss": 5.5405, + "step": 13207 + }, + { + "epoch": 0.07855171757541156, + "grad_norm": 2.371006965637207, + "learning_rate": 4.924273032460005e-05, + "loss": 5.8047, + "step": 13208 + }, + { + "epoch": 0.07855766485869255, + "grad_norm": 2.1126253604888916, + "learning_rate": 4.9242616225833416e-05, + "loss": 5.6397, + "step": 13209 + }, + { + "epoch": 0.07856361214197355, + "grad_norm": 1.9398634433746338, + "learning_rate": 4.9242502118603925e-05, + "loss": 5.7703, + "step": 13210 + }, + { + "epoch": 0.07856955942525454, + "grad_norm": 1.7660777568817139, + "learning_rate": 4.924238800291164e-05, + "loss": 5.6485, + "step": 13211 + }, + { + "epoch": 0.07857550670853554, + "grad_norm": 1.835633397102356, + "learning_rate": 4.924227387875658e-05, + "loss": 5.701, + "step": 13212 + }, + { + "epoch": 0.07858145399181654, + "grad_norm": 1.8192920684814453, + "learning_rate": 4.9242159746138796e-05, + "loss": 5.5682, + "step": 13213 + }, + { + "epoch": 0.07858740127509753, + "grad_norm": 1.8342156410217285, + "learning_rate": 4.924204560505832e-05, + "loss": 5.2546, + "step": 13214 + }, + { + "epoch": 0.07859334855837853, + "grad_norm": 1.855446696281433, + "learning_rate": 4.92419314555152e-05, + "loss": 5.7471, + "step": 13215 + }, + { + "epoch": 0.07859929584165953, + "grad_norm": 1.7786341905593872, + "learning_rate": 4.924181729750946e-05, + "loss": 5.8774, + "step": 13216 + }, + { + "epoch": 0.07860524312494052, + "grad_norm": 1.7919361591339111, + "learning_rate": 4.9241703131041175e-05, + "loss": 5.7796, + "step": 13217 + }, + { + "epoch": 0.07861119040822152, + "grad_norm": 2.1065824031829834, + "learning_rate": 4.924158895611034e-05, + "loss": 5.2471, + "step": 13218 + }, + { + "epoch": 0.07861713769150253, + "grad_norm": 2.18803334236145, + "learning_rate": 4.9241474772717036e-05, + "loss": 4.8654, + "step": 13219 + }, + { + "epoch": 0.07862308497478351, + "grad_norm": 2.156651020050049, + "learning_rate": 4.924136058086127e-05, + "loss": 4.7614, + "step": 13220 + }, + { + "epoch": 0.07862903225806452, + "grad_norm": 2.098242998123169, + "learning_rate": 4.9241246380543095e-05, + "loss": 4.8152, + "step": 13221 + }, + { + "epoch": 0.07863497954134552, + "grad_norm": 1.9857498407363892, + "learning_rate": 4.924113217176256e-05, + "loss": 4.7955, + "step": 13222 + }, + { + "epoch": 0.0786409268246265, + "grad_norm": 2.046926259994507, + "learning_rate": 4.9241017954519685e-05, + "loss": 4.9851, + "step": 13223 + }, + { + "epoch": 0.07864687410790751, + "grad_norm": 1.804005742073059, + "learning_rate": 4.924090372881454e-05, + "loss": 5.5084, + "step": 13224 + }, + { + "epoch": 0.07865282139118851, + "grad_norm": 1.8413509130477905, + "learning_rate": 4.924078949464713e-05, + "loss": 5.462, + "step": 13225 + }, + { + "epoch": 0.0786587686744695, + "grad_norm": 1.7599927186965942, + "learning_rate": 4.924067525201751e-05, + "loss": 5.4255, + "step": 13226 + }, + { + "epoch": 0.0786647159577505, + "grad_norm": 1.7645682096481323, + "learning_rate": 4.924056100092573e-05, + "loss": 5.4837, + "step": 13227 + }, + { + "epoch": 0.0786706632410315, + "grad_norm": 1.7478766441345215, + "learning_rate": 4.924044674137182e-05, + "loss": 5.2957, + "step": 13228 + }, + { + "epoch": 0.07867661052431249, + "grad_norm": 1.7865453958511353, + "learning_rate": 4.924033247335581e-05, + "loss": 5.1909, + "step": 13229 + }, + { + "epoch": 0.07868255780759349, + "grad_norm": 1.8167400360107422, + "learning_rate": 4.924021819687776e-05, + "loss": 5.2732, + "step": 13230 + }, + { + "epoch": 0.0786885050908745, + "grad_norm": 1.8745819330215454, + "learning_rate": 4.92401039119377e-05, + "loss": 5.3222, + "step": 13231 + }, + { + "epoch": 0.07869445237415548, + "grad_norm": 1.7355458736419678, + "learning_rate": 4.9239989618535665e-05, + "loss": 5.4142, + "step": 13232 + }, + { + "epoch": 0.07870039965743648, + "grad_norm": 1.7634247541427612, + "learning_rate": 4.9239875316671705e-05, + "loss": 5.3114, + "step": 13233 + }, + { + "epoch": 0.07870634694071749, + "grad_norm": 1.8516123294830322, + "learning_rate": 4.9239761006345845e-05, + "loss": 5.3014, + "step": 13234 + }, + { + "epoch": 0.07871229422399847, + "grad_norm": 1.8192317485809326, + "learning_rate": 4.9239646687558146e-05, + "loss": 5.407, + "step": 13235 + }, + { + "epoch": 0.07871824150727948, + "grad_norm": 1.6944139003753662, + "learning_rate": 4.923953236030863e-05, + "loss": 5.4235, + "step": 13236 + }, + { + "epoch": 0.07872418879056048, + "grad_norm": 1.681746006011963, + "learning_rate": 4.923941802459735e-05, + "loss": 5.3367, + "step": 13237 + }, + { + "epoch": 0.07873013607384147, + "grad_norm": 1.6417745351791382, + "learning_rate": 4.9239303680424334e-05, + "loss": 5.253, + "step": 13238 + }, + { + "epoch": 0.07873608335712247, + "grad_norm": 1.6522557735443115, + "learning_rate": 4.9239189327789626e-05, + "loss": 5.0855, + "step": 13239 + }, + { + "epoch": 0.07874203064040346, + "grad_norm": 1.7547293901443481, + "learning_rate": 4.9239074966693275e-05, + "loss": 5.9017, + "step": 13240 + }, + { + "epoch": 0.07874797792368446, + "grad_norm": 1.998478889465332, + "learning_rate": 4.923896059713531e-05, + "loss": 5.4774, + "step": 13241 + }, + { + "epoch": 0.07875392520696546, + "grad_norm": 1.869710922241211, + "learning_rate": 4.9238846219115774e-05, + "loss": 5.4591, + "step": 13242 + }, + { + "epoch": 0.07875987249024645, + "grad_norm": 1.8957170248031616, + "learning_rate": 4.923873183263471e-05, + "loss": 5.2823, + "step": 13243 + }, + { + "epoch": 0.07876581977352745, + "grad_norm": 1.9052289724349976, + "learning_rate": 4.9238617437692146e-05, + "loss": 5.4753, + "step": 13244 + }, + { + "epoch": 0.07877176705680845, + "grad_norm": 1.8786853551864624, + "learning_rate": 4.923850303428814e-05, + "loss": 5.2234, + "step": 13245 + }, + { + "epoch": 0.07877771434008944, + "grad_norm": 2.298356533050537, + "learning_rate": 4.923838862242271e-05, + "loss": 4.7138, + "step": 13246 + }, + { + "epoch": 0.07878366162337044, + "grad_norm": 2.1191911697387695, + "learning_rate": 4.923827420209592e-05, + "loss": 4.6354, + "step": 13247 + }, + { + "epoch": 0.07878960890665145, + "grad_norm": 2.1735050678253174, + "learning_rate": 4.923815977330781e-05, + "loss": 4.454, + "step": 13248 + }, + { + "epoch": 0.07879555618993243, + "grad_norm": 2.0126335620880127, + "learning_rate": 4.923804533605839e-05, + "loss": 4.3387, + "step": 13249 + }, + { + "epoch": 0.07880150347321344, + "grad_norm": 2.00081729888916, + "learning_rate": 4.9237930890347726e-05, + "loss": 4.4009, + "step": 13250 + }, + { + "epoch": 0.07880745075649444, + "grad_norm": 2.198625326156616, + "learning_rate": 4.923781643617586e-05, + "loss": 4.4334, + "step": 13251 + }, + { + "epoch": 0.07881339803977543, + "grad_norm": 2.0630993843078613, + "learning_rate": 4.923770197354281e-05, + "loss": 4.6349, + "step": 13252 + }, + { + "epoch": 0.07881934532305643, + "grad_norm": 1.7470935583114624, + "learning_rate": 4.923758750244863e-05, + "loss": 5.1363, + "step": 13253 + }, + { + "epoch": 0.07882529260633743, + "grad_norm": 1.5461190938949585, + "learning_rate": 4.923747302289335e-05, + "loss": 5.7365, + "step": 13254 + }, + { + "epoch": 0.07883123988961842, + "grad_norm": 1.800528645515442, + "learning_rate": 4.9237358534877036e-05, + "loss": 5.949, + "step": 13255 + }, + { + "epoch": 0.07883718717289942, + "grad_norm": 2.096055746078491, + "learning_rate": 4.923724403839971e-05, + "loss": 5.4203, + "step": 13256 + }, + { + "epoch": 0.07884313445618042, + "grad_norm": 2.0838513374328613, + "learning_rate": 4.92371295334614e-05, + "loss": 5.0542, + "step": 13257 + }, + { + "epoch": 0.07884908173946141, + "grad_norm": 1.711534023284912, + "learning_rate": 4.923701502006217e-05, + "loss": 5.7168, + "step": 13258 + }, + { + "epoch": 0.07885502902274241, + "grad_norm": 1.6610822677612305, + "learning_rate": 4.9236900498202035e-05, + "loss": 5.5605, + "step": 13259 + }, + { + "epoch": 0.07886097630602341, + "grad_norm": 1.549854040145874, + "learning_rate": 4.9236785967881064e-05, + "loss": 5.7792, + "step": 13260 + }, + { + "epoch": 0.0788669235893044, + "grad_norm": 1.9194339513778687, + "learning_rate": 4.923667142909927e-05, + "loss": 5.5481, + "step": 13261 + }, + { + "epoch": 0.0788728708725854, + "grad_norm": 1.6644178628921509, + "learning_rate": 4.923655688185671e-05, + "loss": 5.7271, + "step": 13262 + }, + { + "epoch": 0.0788788181558664, + "grad_norm": 1.820898175239563, + "learning_rate": 4.9236442326153414e-05, + "loss": 6.2458, + "step": 13263 + }, + { + "epoch": 0.0788847654391474, + "grad_norm": 1.732539176940918, + "learning_rate": 4.923632776198943e-05, + "loss": 5.5854, + "step": 13264 + }, + { + "epoch": 0.0788907127224284, + "grad_norm": 1.769140601158142, + "learning_rate": 4.923621318936479e-05, + "loss": 5.5511, + "step": 13265 + }, + { + "epoch": 0.0788966600057094, + "grad_norm": 1.728833556175232, + "learning_rate": 4.923609860827955e-05, + "loss": 5.6215, + "step": 13266 + }, + { + "epoch": 0.07890260728899039, + "grad_norm": 1.5940407514572144, + "learning_rate": 4.923598401873373e-05, + "loss": 5.6572, + "step": 13267 + }, + { + "epoch": 0.07890855457227139, + "grad_norm": 2.153200149536133, + "learning_rate": 4.923586942072737e-05, + "loss": 5.0235, + "step": 13268 + }, + { + "epoch": 0.07891450185555238, + "grad_norm": 1.6448415517807007, + "learning_rate": 4.9235754814260526e-05, + "loss": 5.5353, + "step": 13269 + }, + { + "epoch": 0.07892044913883338, + "grad_norm": 1.706984281539917, + "learning_rate": 4.9235640199333235e-05, + "loss": 5.5278, + "step": 13270 + }, + { + "epoch": 0.07892639642211438, + "grad_norm": 1.6129798889160156, + "learning_rate": 4.923552557594553e-05, + "loss": 5.4643, + "step": 13271 + }, + { + "epoch": 0.07893234370539537, + "grad_norm": 1.612748384475708, + "learning_rate": 4.923541094409745e-05, + "loss": 5.4994, + "step": 13272 + }, + { + "epoch": 0.07893829098867637, + "grad_norm": 1.6947647333145142, + "learning_rate": 4.923529630378904e-05, + "loss": 5.5117, + "step": 13273 + }, + { + "epoch": 0.07894423827195737, + "grad_norm": 1.629684567451477, + "learning_rate": 4.9235181655020336e-05, + "loss": 5.4266, + "step": 13274 + }, + { + "epoch": 0.07895018555523836, + "grad_norm": 1.6417474746704102, + "learning_rate": 4.923506699779139e-05, + "loss": 5.4803, + "step": 13275 + }, + { + "epoch": 0.07895613283851936, + "grad_norm": 1.5188243389129639, + "learning_rate": 4.9234952332102226e-05, + "loss": 5.4066, + "step": 13276 + }, + { + "epoch": 0.07896208012180037, + "grad_norm": 1.4906466007232666, + "learning_rate": 4.9234837657952885e-05, + "loss": 5.4622, + "step": 13277 + }, + { + "epoch": 0.07896802740508135, + "grad_norm": 1.745351791381836, + "learning_rate": 4.9234722975343414e-05, + "loss": 5.458, + "step": 13278 + }, + { + "epoch": 0.07897397468836236, + "grad_norm": 1.734399676322937, + "learning_rate": 4.9234608284273866e-05, + "loss": 5.3542, + "step": 13279 + }, + { + "epoch": 0.07897992197164336, + "grad_norm": 2.396031379699707, + "learning_rate": 4.9234493584744254e-05, + "loss": 5.0978, + "step": 13280 + }, + { + "epoch": 0.07898586925492435, + "grad_norm": 2.0151939392089844, + "learning_rate": 4.9234378876754626e-05, + "loss": 5.5051, + "step": 13281 + }, + { + "epoch": 0.07899181653820535, + "grad_norm": 2.1796762943267822, + "learning_rate": 4.9234264160305036e-05, + "loss": 5.2788, + "step": 13282 + }, + { + "epoch": 0.07899776382148635, + "grad_norm": 2.069291830062866, + "learning_rate": 4.923414943539552e-05, + "loss": 5.4454, + "step": 13283 + }, + { + "epoch": 0.07900371110476734, + "grad_norm": 2.034498929977417, + "learning_rate": 4.92340347020261e-05, + "loss": 5.3849, + "step": 13284 + }, + { + "epoch": 0.07900965838804834, + "grad_norm": 1.8353052139282227, + "learning_rate": 4.9233919960196835e-05, + "loss": 5.3975, + "step": 13285 + }, + { + "epoch": 0.07901560567132934, + "grad_norm": 1.9896777868270874, + "learning_rate": 4.923380520990776e-05, + "loss": 5.1199, + "step": 13286 + }, + { + "epoch": 0.07902155295461033, + "grad_norm": 1.9539830684661865, + "learning_rate": 4.923369045115891e-05, + "loss": 5.3908, + "step": 13287 + }, + { + "epoch": 0.07902750023789133, + "grad_norm": 1.682651162147522, + "learning_rate": 4.923357568395033e-05, + "loss": 5.4719, + "step": 13288 + }, + { + "epoch": 0.07903344752117233, + "grad_norm": 2.0095672607421875, + "learning_rate": 4.923346090828206e-05, + "loss": 5.9258, + "step": 13289 + }, + { + "epoch": 0.07903939480445332, + "grad_norm": 1.7949076890945435, + "learning_rate": 4.923334612415413e-05, + "loss": 5.646, + "step": 13290 + }, + { + "epoch": 0.07904534208773432, + "grad_norm": 2.1651079654693604, + "learning_rate": 4.92332313315666e-05, + "loss": 5.2527, + "step": 13291 + }, + { + "epoch": 0.07905128937101533, + "grad_norm": 2.0362184047698975, + "learning_rate": 4.92331165305195e-05, + "loss": 5.2671, + "step": 13292 + }, + { + "epoch": 0.07905723665429631, + "grad_norm": 1.5425541400909424, + "learning_rate": 4.923300172101287e-05, + "loss": 5.5149, + "step": 13293 + }, + { + "epoch": 0.07906318393757732, + "grad_norm": 2.13031005859375, + "learning_rate": 4.923288690304675e-05, + "loss": 5.9304, + "step": 13294 + }, + { + "epoch": 0.07906913122085832, + "grad_norm": 2.165199041366577, + "learning_rate": 4.923277207662117e-05, + "loss": 5.9153, + "step": 13295 + }, + { + "epoch": 0.0790750785041393, + "grad_norm": 2.1479499340057373, + "learning_rate": 4.923265724173619e-05, + "loss": 5.7215, + "step": 13296 + }, + { + "epoch": 0.07908102578742031, + "grad_norm": 1.8908145427703857, + "learning_rate": 4.923254239839183e-05, + "loss": 5.5801, + "step": 13297 + }, + { + "epoch": 0.0790869730707013, + "grad_norm": 1.7739901542663574, + "learning_rate": 4.9232427546588145e-05, + "loss": 5.283, + "step": 13298 + }, + { + "epoch": 0.0790929203539823, + "grad_norm": 1.8153715133666992, + "learning_rate": 4.9232312686325175e-05, + "loss": 5.4626, + "step": 13299 + }, + { + "epoch": 0.0790988676372633, + "grad_norm": 1.7070518732070923, + "learning_rate": 4.923219781760295e-05, + "loss": 5.5246, + "step": 13300 + }, + { + "epoch": 0.07910481492054429, + "grad_norm": 2.161536455154419, + "learning_rate": 4.923208294042152e-05, + "loss": 5.6865, + "step": 13301 + }, + { + "epoch": 0.07911076220382529, + "grad_norm": 2.5373623371124268, + "learning_rate": 4.9231968054780905e-05, + "loss": 5.8634, + "step": 13302 + }, + { + "epoch": 0.0791167094871063, + "grad_norm": 2.4957666397094727, + "learning_rate": 4.923185316068117e-05, + "loss": 4.9065, + "step": 13303 + }, + { + "epoch": 0.07912265677038728, + "grad_norm": 2.260540246963501, + "learning_rate": 4.923173825812235e-05, + "loss": 5.0815, + "step": 13304 + }, + { + "epoch": 0.07912860405366828, + "grad_norm": 2.406765937805176, + "learning_rate": 4.923162334710448e-05, + "loss": 4.8599, + "step": 13305 + }, + { + "epoch": 0.07913455133694929, + "grad_norm": 2.282153606414795, + "learning_rate": 4.923150842762759e-05, + "loss": 5.1024, + "step": 13306 + }, + { + "epoch": 0.07914049862023027, + "grad_norm": 1.8351432085037231, + "learning_rate": 4.9231393499691744e-05, + "loss": 5.3715, + "step": 13307 + }, + { + "epoch": 0.07914644590351128, + "grad_norm": 1.8290963172912598, + "learning_rate": 4.9231278563296965e-05, + "loss": 5.4456, + "step": 13308 + }, + { + "epoch": 0.07915239318679228, + "grad_norm": 1.7157766819000244, + "learning_rate": 4.923116361844329e-05, + "loss": 5.4952, + "step": 13309 + }, + { + "epoch": 0.07915834047007327, + "grad_norm": 2.051391124725342, + "learning_rate": 4.923104866513077e-05, + "loss": 5.7754, + "step": 13310 + }, + { + "epoch": 0.07916428775335427, + "grad_norm": 1.8714796304702759, + "learning_rate": 4.923093370335944e-05, + "loss": 5.4118, + "step": 13311 + }, + { + "epoch": 0.07917023503663527, + "grad_norm": 2.4251246452331543, + "learning_rate": 4.923081873312935e-05, + "loss": 4.9677, + "step": 13312 + }, + { + "epoch": 0.07917618231991626, + "grad_norm": 3.490328550338745, + "learning_rate": 4.923070375444052e-05, + "loss": 4.5336, + "step": 13313 + }, + { + "epoch": 0.07918212960319726, + "grad_norm": 2.820434331893921, + "learning_rate": 4.9230588767293004e-05, + "loss": 4.2865, + "step": 13314 + }, + { + "epoch": 0.07918807688647826, + "grad_norm": 2.3713653087615967, + "learning_rate": 4.923047377168685e-05, + "loss": 4.2558, + "step": 13315 + }, + { + "epoch": 0.07919402416975925, + "grad_norm": 2.484199285507202, + "learning_rate": 4.923035876762208e-05, + "loss": 3.9565, + "step": 13316 + }, + { + "epoch": 0.07919997145304025, + "grad_norm": 2.771982431411743, + "learning_rate": 4.9230243755098735e-05, + "loss": 3.9478, + "step": 13317 + }, + { + "epoch": 0.07920591873632125, + "grad_norm": 2.613006591796875, + "learning_rate": 4.9230128734116874e-05, + "loss": 4.0285, + "step": 13318 + }, + { + "epoch": 0.07921186601960224, + "grad_norm": 2.378276824951172, + "learning_rate": 4.923001370467653e-05, + "loss": 4.129, + "step": 13319 + }, + { + "epoch": 0.07921781330288324, + "grad_norm": 2.6948869228363037, + "learning_rate": 4.922989866677772e-05, + "loss": 5.7581, + "step": 13320 + }, + { + "epoch": 0.07922376058616425, + "grad_norm": 2.058387517929077, + "learning_rate": 4.922978362042051e-05, + "loss": 5.7589, + "step": 13321 + }, + { + "epoch": 0.07922970786944523, + "grad_norm": 2.2277138233184814, + "learning_rate": 4.9229668565604936e-05, + "loss": 5.691, + "step": 13322 + }, + { + "epoch": 0.07923565515272624, + "grad_norm": 1.827525019645691, + "learning_rate": 4.922955350233104e-05, + "loss": 5.6555, + "step": 13323 + }, + { + "epoch": 0.07924160243600724, + "grad_norm": 1.5456974506378174, + "learning_rate": 4.922943843059885e-05, + "loss": 5.445, + "step": 13324 + }, + { + "epoch": 0.07924754971928823, + "grad_norm": 1.859805703163147, + "learning_rate": 4.922932335040842e-05, + "loss": 5.5864, + "step": 13325 + }, + { + "epoch": 0.07925349700256923, + "grad_norm": 2.0083398818969727, + "learning_rate": 4.922920826175977e-05, + "loss": 5.7598, + "step": 13326 + }, + { + "epoch": 0.07925944428585022, + "grad_norm": 1.9759368896484375, + "learning_rate": 4.922909316465296e-05, + "loss": 5.7778, + "step": 13327 + }, + { + "epoch": 0.07926539156913122, + "grad_norm": 1.9937580823898315, + "learning_rate": 4.9228978059088035e-05, + "loss": 5.7291, + "step": 13328 + }, + { + "epoch": 0.07927133885241222, + "grad_norm": 2.6860668659210205, + "learning_rate": 4.922886294506501e-05, + "loss": 5.0277, + "step": 13329 + }, + { + "epoch": 0.07927728613569321, + "grad_norm": 2.03318190574646, + "learning_rate": 4.9228747822583945e-05, + "loss": 5.2387, + "step": 13330 + }, + { + "epoch": 0.07928323341897421, + "grad_norm": 2.250929117202759, + "learning_rate": 4.9228632691644874e-05, + "loss": 5.2348, + "step": 13331 + }, + { + "epoch": 0.07928918070225521, + "grad_norm": 2.0255093574523926, + "learning_rate": 4.922851755224784e-05, + "loss": 5.6585, + "step": 13332 + }, + { + "epoch": 0.0792951279855362, + "grad_norm": 1.9353551864624023, + "learning_rate": 4.922840240439288e-05, + "loss": 5.3989, + "step": 13333 + }, + { + "epoch": 0.0793010752688172, + "grad_norm": 1.9392589330673218, + "learning_rate": 4.922828724808003e-05, + "loss": 5.9127, + "step": 13334 + }, + { + "epoch": 0.0793070225520982, + "grad_norm": 2.312340021133423, + "learning_rate": 4.922817208330934e-05, + "loss": 5.656, + "step": 13335 + }, + { + "epoch": 0.0793129698353792, + "grad_norm": 2.1480720043182373, + "learning_rate": 4.9228056910080845e-05, + "loss": 5.4582, + "step": 13336 + }, + { + "epoch": 0.0793189171186602, + "grad_norm": 2.0460312366485596, + "learning_rate": 4.922794172839458e-05, + "loss": 5.5177, + "step": 13337 + }, + { + "epoch": 0.0793248644019412, + "grad_norm": 1.8319480419158936, + "learning_rate": 4.92278265382506e-05, + "loss": 5.5872, + "step": 13338 + }, + { + "epoch": 0.07933081168522219, + "grad_norm": 1.610379934310913, + "learning_rate": 4.922771133964893e-05, + "loss": 5.5398, + "step": 13339 + }, + { + "epoch": 0.07933675896850319, + "grad_norm": 1.767022728919983, + "learning_rate": 4.9227596132589616e-05, + "loss": 6.0004, + "step": 13340 + }, + { + "epoch": 0.07934270625178419, + "grad_norm": 2.108621835708618, + "learning_rate": 4.92274809170727e-05, + "loss": 5.1513, + "step": 13341 + }, + { + "epoch": 0.07934865353506518, + "grad_norm": 2.2562835216522217, + "learning_rate": 4.922736569309822e-05, + "loss": 4.7642, + "step": 13342 + }, + { + "epoch": 0.07935460081834618, + "grad_norm": 1.7953063249588013, + "learning_rate": 4.922725046066622e-05, + "loss": 5.2453, + "step": 13343 + }, + { + "epoch": 0.07936054810162718, + "grad_norm": 1.8957513570785522, + "learning_rate": 4.922713521977673e-05, + "loss": 5.0673, + "step": 13344 + }, + { + "epoch": 0.07936649538490817, + "grad_norm": 1.8375275135040283, + "learning_rate": 4.922701997042981e-05, + "loss": 5.0301, + "step": 13345 + }, + { + "epoch": 0.07937244266818917, + "grad_norm": 2.306138515472412, + "learning_rate": 4.9226904712625473e-05, + "loss": 4.7415, + "step": 13346 + }, + { + "epoch": 0.07937838995147017, + "grad_norm": 2.058403730392456, + "learning_rate": 4.922678944636379e-05, + "loss": 5.4454, + "step": 13347 + }, + { + "epoch": 0.07938433723475116, + "grad_norm": 1.9230997562408447, + "learning_rate": 4.922667417164477e-05, + "loss": 5.3755, + "step": 13348 + }, + { + "epoch": 0.07939028451803216, + "grad_norm": 1.9053308963775635, + "learning_rate": 4.922655888846848e-05, + "loss": 5.7708, + "step": 13349 + }, + { + "epoch": 0.07939623180131317, + "grad_norm": 1.8009783029556274, + "learning_rate": 4.922644359683494e-05, + "loss": 4.9939, + "step": 13350 + }, + { + "epoch": 0.07940217908459415, + "grad_norm": 1.6748642921447754, + "learning_rate": 4.92263282967442e-05, + "loss": 5.4869, + "step": 13351 + }, + { + "epoch": 0.07940812636787516, + "grad_norm": 1.532475471496582, + "learning_rate": 4.92262129881963e-05, + "loss": 5.755, + "step": 13352 + }, + { + "epoch": 0.07941407365115616, + "grad_norm": 1.513795018196106, + "learning_rate": 4.9226097671191284e-05, + "loss": 5.4083, + "step": 13353 + }, + { + "epoch": 0.07942002093443715, + "grad_norm": 1.66012442111969, + "learning_rate": 4.922598234572918e-05, + "loss": 5.5185, + "step": 13354 + }, + { + "epoch": 0.07942596821771815, + "grad_norm": 1.6519379615783691, + "learning_rate": 4.922586701181005e-05, + "loss": 5.3482, + "step": 13355 + }, + { + "epoch": 0.07943191550099914, + "grad_norm": 1.4444184303283691, + "learning_rate": 4.922575166943391e-05, + "loss": 5.4466, + "step": 13356 + }, + { + "epoch": 0.07943786278428014, + "grad_norm": 1.4603393077850342, + "learning_rate": 4.92256363186008e-05, + "loss": 5.4343, + "step": 13357 + }, + { + "epoch": 0.07944381006756114, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.922552095931078e-05, + "loss": 5.4224, + "step": 13358 + }, + { + "epoch": 0.07944975735084213, + "grad_norm": 1.3054184913635254, + "learning_rate": 4.922540559156389e-05, + "loss": 5.4801, + "step": 13359 + }, + { + "epoch": 0.07945570463412313, + "grad_norm": 1.6295130252838135, + "learning_rate": 4.922529021536015e-05, + "loss": 5.4593, + "step": 13360 + }, + { + "epoch": 0.07946165191740413, + "grad_norm": 1.6684668064117432, + "learning_rate": 4.922517483069962e-05, + "loss": 5.2817, + "step": 13361 + }, + { + "epoch": 0.07946759920068512, + "grad_norm": 1.580409049987793, + "learning_rate": 4.922505943758232e-05, + "loss": 5.4399, + "step": 13362 + }, + { + "epoch": 0.07947354648396612, + "grad_norm": 1.613756775856018, + "learning_rate": 4.922494403600831e-05, + "loss": 5.2646, + "step": 13363 + }, + { + "epoch": 0.07947949376724713, + "grad_norm": 1.4371063709259033, + "learning_rate": 4.9224828625977616e-05, + "loss": 5.2866, + "step": 13364 + }, + { + "epoch": 0.07948544105052811, + "grad_norm": 1.5926525592803955, + "learning_rate": 4.9224713207490294e-05, + "loss": 5.5958, + "step": 13365 + }, + { + "epoch": 0.07949138833380912, + "grad_norm": 1.5216618776321411, + "learning_rate": 4.9224597780546365e-05, + "loss": 5.6094, + "step": 13366 + }, + { + "epoch": 0.07949733561709012, + "grad_norm": 1.7261598110198975, + "learning_rate": 4.922448234514588e-05, + "loss": 5.2781, + "step": 13367 + }, + { + "epoch": 0.0795032829003711, + "grad_norm": 1.6909232139587402, + "learning_rate": 4.922436690128889e-05, + "loss": 5.3299, + "step": 13368 + }, + { + "epoch": 0.07950923018365211, + "grad_norm": 1.6486754417419434, + "learning_rate": 4.922425144897541e-05, + "loss": 5.2478, + "step": 13369 + }, + { + "epoch": 0.07951517746693311, + "grad_norm": 1.4019837379455566, + "learning_rate": 4.922413598820551e-05, + "loss": 5.2383, + "step": 13370 + }, + { + "epoch": 0.0795211247502141, + "grad_norm": 1.7588412761688232, + "learning_rate": 4.92240205189792e-05, + "loss": 5.3224, + "step": 13371 + }, + { + "epoch": 0.0795270720334951, + "grad_norm": 1.5354480743408203, + "learning_rate": 4.922390504129654e-05, + "loss": 5.1617, + "step": 13372 + }, + { + "epoch": 0.0795330193167761, + "grad_norm": 1.5183011293411255, + "learning_rate": 4.922378955515756e-05, + "loss": 5.3082, + "step": 13373 + }, + { + "epoch": 0.07953896660005709, + "grad_norm": 1.436281681060791, + "learning_rate": 4.922367406056232e-05, + "loss": 5.4446, + "step": 13374 + }, + { + "epoch": 0.07954491388333809, + "grad_norm": 1.526934266090393, + "learning_rate": 4.922355855751083e-05, + "loss": 5.3067, + "step": 13375 + }, + { + "epoch": 0.0795508611666191, + "grad_norm": 1.516784906387329, + "learning_rate": 4.922344304600315e-05, + "loss": 5.4982, + "step": 13376 + }, + { + "epoch": 0.07955680844990008, + "grad_norm": 1.5154777765274048, + "learning_rate": 4.922332752603932e-05, + "loss": 5.3459, + "step": 13377 + }, + { + "epoch": 0.07956275573318108, + "grad_norm": 1.542508840560913, + "learning_rate": 4.9223211997619376e-05, + "loss": 5.3677, + "step": 13378 + }, + { + "epoch": 0.07956870301646209, + "grad_norm": 1.3413010835647583, + "learning_rate": 4.922309646074336e-05, + "loss": 5.2684, + "step": 13379 + }, + { + "epoch": 0.07957465029974307, + "grad_norm": 1.6295002698898315, + "learning_rate": 4.9222980915411306e-05, + "loss": 5.2737, + "step": 13380 + }, + { + "epoch": 0.07958059758302408, + "grad_norm": 1.5810730457305908, + "learning_rate": 4.922286536162326e-05, + "loss": 5.2471, + "step": 13381 + }, + { + "epoch": 0.07958654486630508, + "grad_norm": 1.3186451196670532, + "learning_rate": 4.9222749799379266e-05, + "loss": 5.3081, + "step": 13382 + }, + { + "epoch": 0.07959249214958607, + "grad_norm": 1.3897243738174438, + "learning_rate": 4.922263422867936e-05, + "loss": 5.2658, + "step": 13383 + }, + { + "epoch": 0.07959843943286707, + "grad_norm": 1.3873858451843262, + "learning_rate": 4.922251864952358e-05, + "loss": 5.334, + "step": 13384 + }, + { + "epoch": 0.07960438671614806, + "grad_norm": 1.4205409288406372, + "learning_rate": 4.922240306191197e-05, + "loss": 5.3007, + "step": 13385 + }, + { + "epoch": 0.07961033399942906, + "grad_norm": 1.3726485967636108, + "learning_rate": 4.922228746584457e-05, + "loss": 5.1949, + "step": 13386 + }, + { + "epoch": 0.07961628128271006, + "grad_norm": 1.708837628364563, + "learning_rate": 4.922217186132142e-05, + "loss": 5.2061, + "step": 13387 + }, + { + "epoch": 0.07962222856599105, + "grad_norm": 1.7818368673324585, + "learning_rate": 4.9222056248342556e-05, + "loss": 5.1182, + "step": 13388 + }, + { + "epoch": 0.07962817584927205, + "grad_norm": 1.4941715002059937, + "learning_rate": 4.9221940626908024e-05, + "loss": 5.0899, + "step": 13389 + }, + { + "epoch": 0.07963412313255305, + "grad_norm": 1.3581326007843018, + "learning_rate": 4.922182499701787e-05, + "loss": 5.0551, + "step": 13390 + }, + { + "epoch": 0.07964007041583404, + "grad_norm": 1.5772393941879272, + "learning_rate": 4.922170935867212e-05, + "loss": 5.245, + "step": 13391 + }, + { + "epoch": 0.07964601769911504, + "grad_norm": 1.9635555744171143, + "learning_rate": 4.922159371187082e-05, + "loss": 5.2898, + "step": 13392 + }, + { + "epoch": 0.07965196498239605, + "grad_norm": 1.535050392150879, + "learning_rate": 4.922147805661402e-05, + "loss": 5.2043, + "step": 13393 + }, + { + "epoch": 0.07965791226567703, + "grad_norm": 1.4985787868499756, + "learning_rate": 4.922136239290175e-05, + "loss": 5.1682, + "step": 13394 + }, + { + "epoch": 0.07966385954895804, + "grad_norm": 1.5314218997955322, + "learning_rate": 4.922124672073405e-05, + "loss": 5.321, + "step": 13395 + }, + { + "epoch": 0.07966980683223904, + "grad_norm": 1.440621018409729, + "learning_rate": 4.9221131040110954e-05, + "loss": 5.3013, + "step": 13396 + }, + { + "epoch": 0.07967575411552003, + "grad_norm": 1.5103110074996948, + "learning_rate": 4.9221015351032527e-05, + "loss": 5.2825, + "step": 13397 + }, + { + "epoch": 0.07968170139880103, + "grad_norm": 1.3581254482269287, + "learning_rate": 4.9220899653498786e-05, + "loss": 5.2433, + "step": 13398 + }, + { + "epoch": 0.07968764868208203, + "grad_norm": 1.5673763751983643, + "learning_rate": 4.922078394750978e-05, + "loss": 5.2279, + "step": 13399 + }, + { + "epoch": 0.07969359596536302, + "grad_norm": 1.5550049543380737, + "learning_rate": 4.922066823306555e-05, + "loss": 5.0406, + "step": 13400 + }, + { + "epoch": 0.07969954324864402, + "grad_norm": 1.6366932392120361, + "learning_rate": 4.922055251016613e-05, + "loss": 5.1299, + "step": 13401 + }, + { + "epoch": 0.07970549053192502, + "grad_norm": 1.45979642868042, + "learning_rate": 4.922043677881157e-05, + "loss": 4.9527, + "step": 13402 + }, + { + "epoch": 0.07971143781520601, + "grad_norm": 1.594494104385376, + "learning_rate": 4.922032103900191e-05, + "loss": 5.6511, + "step": 13403 + }, + { + "epoch": 0.07971738509848701, + "grad_norm": 1.419045329093933, + "learning_rate": 4.9220205290737175e-05, + "loss": 5.0936, + "step": 13404 + }, + { + "epoch": 0.07972333238176801, + "grad_norm": 1.5998183488845825, + "learning_rate": 4.922008953401742e-05, + "loss": 5.2774, + "step": 13405 + }, + { + "epoch": 0.079729279665049, + "grad_norm": 1.3942409753799438, + "learning_rate": 4.9219973768842685e-05, + "loss": 5.5466, + "step": 13406 + }, + { + "epoch": 0.07973522694833, + "grad_norm": 1.4478344917297363, + "learning_rate": 4.9219857995213015e-05, + "loss": 5.5757, + "step": 13407 + }, + { + "epoch": 0.079741174231611, + "grad_norm": 1.4197556972503662, + "learning_rate": 4.921974221312843e-05, + "loss": 5.3194, + "step": 13408 + }, + { + "epoch": 0.079747121514892, + "grad_norm": 1.7690924406051636, + "learning_rate": 4.9219626422588996e-05, + "loss": 5.3551, + "step": 13409 + }, + { + "epoch": 0.079753068798173, + "grad_norm": 1.8233799934387207, + "learning_rate": 4.921951062359473e-05, + "loss": 5.3143, + "step": 13410 + }, + { + "epoch": 0.079759016081454, + "grad_norm": 1.738848090171814, + "learning_rate": 4.921939481614568e-05, + "loss": 5.0194, + "step": 13411 + }, + { + "epoch": 0.07976496336473499, + "grad_norm": 1.6401729583740234, + "learning_rate": 4.92192790002419e-05, + "loss": 5.3347, + "step": 13412 + }, + { + "epoch": 0.07977091064801599, + "grad_norm": 1.425485372543335, + "learning_rate": 4.921916317588341e-05, + "loss": 5.0384, + "step": 13413 + }, + { + "epoch": 0.07977685793129698, + "grad_norm": 1.6337133646011353, + "learning_rate": 4.921904734307027e-05, + "loss": 5.3213, + "step": 13414 + }, + { + "epoch": 0.07978280521457798, + "grad_norm": 1.561292052268982, + "learning_rate": 4.92189315018025e-05, + "loss": 5.1502, + "step": 13415 + }, + { + "epoch": 0.07978875249785898, + "grad_norm": 1.6225664615631104, + "learning_rate": 4.921881565208016e-05, + "loss": 5.2638, + "step": 13416 + }, + { + "epoch": 0.07979469978113997, + "grad_norm": 1.5074353218078613, + "learning_rate": 4.921869979390328e-05, + "loss": 5.0872, + "step": 13417 + }, + { + "epoch": 0.07980064706442097, + "grad_norm": 1.4769634008407593, + "learning_rate": 4.92185839272719e-05, + "loss": 5.1341, + "step": 13418 + }, + { + "epoch": 0.07980659434770197, + "grad_norm": 1.5929937362670898, + "learning_rate": 4.921846805218607e-05, + "loss": 5.2799, + "step": 13419 + }, + { + "epoch": 0.07981254163098296, + "grad_norm": 1.4583854675292969, + "learning_rate": 4.921835216864581e-05, + "loss": 5.0822, + "step": 13420 + }, + { + "epoch": 0.07981848891426396, + "grad_norm": 1.4904375076293945, + "learning_rate": 4.921823627665119e-05, + "loss": 5.055, + "step": 13421 + }, + { + "epoch": 0.07982443619754497, + "grad_norm": 1.6971831321716309, + "learning_rate": 4.921812037620221e-05, + "loss": 5.1968, + "step": 13422 + }, + { + "epoch": 0.07983038348082595, + "grad_norm": 1.5604689121246338, + "learning_rate": 4.9218004467298956e-05, + "loss": 4.9681, + "step": 13423 + }, + { + "epoch": 0.07983633076410696, + "grad_norm": 1.678427815437317, + "learning_rate": 4.9217888549941436e-05, + "loss": 5.2044, + "step": 13424 + }, + { + "epoch": 0.07984227804738796, + "grad_norm": 1.521996259689331, + "learning_rate": 4.921777262412971e-05, + "loss": 4.9741, + "step": 13425 + }, + { + "epoch": 0.07984822533066895, + "grad_norm": 1.5315868854522705, + "learning_rate": 4.92176566898638e-05, + "loss": 5.0064, + "step": 13426 + }, + { + "epoch": 0.07985417261394995, + "grad_norm": 1.465867280960083, + "learning_rate": 4.9217540747143765e-05, + "loss": 4.942, + "step": 13427 + }, + { + "epoch": 0.07986011989723095, + "grad_norm": 1.4323827028274536, + "learning_rate": 4.9217424795969634e-05, + "loss": 4.8934, + "step": 13428 + }, + { + "epoch": 0.07986606718051194, + "grad_norm": 1.4645717144012451, + "learning_rate": 4.921730883634145e-05, + "loss": 5.0473, + "step": 13429 + }, + { + "epoch": 0.07987201446379294, + "grad_norm": 1.5992658138275146, + "learning_rate": 4.9217192868259246e-05, + "loss": 4.8968, + "step": 13430 + }, + { + "epoch": 0.07987796174707394, + "grad_norm": 1.4294894933700562, + "learning_rate": 4.921707689172308e-05, + "loss": 5.0719, + "step": 13431 + }, + { + "epoch": 0.07988390903035493, + "grad_norm": 1.5885019302368164, + "learning_rate": 4.921696090673298e-05, + "loss": 5.1505, + "step": 13432 + }, + { + "epoch": 0.07988985631363593, + "grad_norm": 1.4929580688476562, + "learning_rate": 4.921684491328898e-05, + "loss": 5.016, + "step": 13433 + }, + { + "epoch": 0.07989580359691693, + "grad_norm": 1.4980381727218628, + "learning_rate": 4.921672891139114e-05, + "loss": 5.0601, + "step": 13434 + }, + { + "epoch": 0.07990175088019792, + "grad_norm": 1.5698089599609375, + "learning_rate": 4.9216612901039495e-05, + "loss": 5.0251, + "step": 13435 + }, + { + "epoch": 0.07990769816347892, + "grad_norm": 1.459037184715271, + "learning_rate": 4.921649688223407e-05, + "loss": 4.8417, + "step": 13436 + }, + { + "epoch": 0.07991364544675993, + "grad_norm": 1.5418161153793335, + "learning_rate": 4.921638085497492e-05, + "loss": 5.1989, + "step": 13437 + }, + { + "epoch": 0.07991959273004091, + "grad_norm": 1.546325922012329, + "learning_rate": 4.9216264819262084e-05, + "loss": 5.3004, + "step": 13438 + }, + { + "epoch": 0.07992554001332192, + "grad_norm": 1.5820508003234863, + "learning_rate": 4.9216148775095594e-05, + "loss": 5.3327, + "step": 13439 + }, + { + "epoch": 0.07993148729660292, + "grad_norm": 1.5077866315841675, + "learning_rate": 4.9216032722475504e-05, + "loss": 5.2423, + "step": 13440 + }, + { + "epoch": 0.0799374345798839, + "grad_norm": 1.3654597997665405, + "learning_rate": 4.921591666140184e-05, + "loss": 5.1563, + "step": 13441 + }, + { + "epoch": 0.07994338186316491, + "grad_norm": 1.6721473932266235, + "learning_rate": 4.921580059187466e-05, + "loss": 5.1848, + "step": 13442 + }, + { + "epoch": 0.0799493291464459, + "grad_norm": 1.5349076986312866, + "learning_rate": 4.921568451389398e-05, + "loss": 5.1836, + "step": 13443 + }, + { + "epoch": 0.0799552764297269, + "grad_norm": 1.6246919631958008, + "learning_rate": 4.921556842745987e-05, + "loss": 4.8715, + "step": 13444 + }, + { + "epoch": 0.0799612237130079, + "grad_norm": 1.5361920595169067, + "learning_rate": 4.921545233257234e-05, + "loss": 4.8203, + "step": 13445 + }, + { + "epoch": 0.07996717099628889, + "grad_norm": 1.6185765266418457, + "learning_rate": 4.921533622923146e-05, + "loss": 4.8039, + "step": 13446 + }, + { + "epoch": 0.07997311827956989, + "grad_norm": 1.402462363243103, + "learning_rate": 4.9215220117437246e-05, + "loss": 4.8524, + "step": 13447 + }, + { + "epoch": 0.07997906556285089, + "grad_norm": 1.5282337665557861, + "learning_rate": 4.921510399718975e-05, + "loss": 4.8081, + "step": 13448 + }, + { + "epoch": 0.07998501284613188, + "grad_norm": 1.336254596710205, + "learning_rate": 4.921498786848902e-05, + "loss": 4.8468, + "step": 13449 + }, + { + "epoch": 0.07999096012941288, + "grad_norm": 1.4701998233795166, + "learning_rate": 4.921487173133508e-05, + "loss": 4.6873, + "step": 13450 + }, + { + "epoch": 0.07999690741269389, + "grad_norm": 1.6340824365615845, + "learning_rate": 4.921475558572798e-05, + "loss": 4.6779, + "step": 13451 + }, + { + "epoch": 0.08000285469597487, + "grad_norm": 1.557027816772461, + "learning_rate": 4.921463943166775e-05, + "loss": 4.6467, + "step": 13452 + }, + { + "epoch": 0.08000880197925588, + "grad_norm": 1.6390316486358643, + "learning_rate": 4.9214523269154454e-05, + "loss": 4.7376, + "step": 13453 + }, + { + "epoch": 0.08001474926253688, + "grad_norm": 2.3929800987243652, + "learning_rate": 4.921440709818811e-05, + "loss": 5.2623, + "step": 13454 + }, + { + "epoch": 0.08002069654581787, + "grad_norm": 1.5896660089492798, + "learning_rate": 4.921429091876877e-05, + "loss": 4.6952, + "step": 13455 + }, + { + "epoch": 0.08002664382909887, + "grad_norm": 1.6705348491668701, + "learning_rate": 4.921417473089647e-05, + "loss": 4.7963, + "step": 13456 + }, + { + "epoch": 0.08003259111237987, + "grad_norm": 1.5925310850143433, + "learning_rate": 4.9214058534571253e-05, + "loss": 4.7398, + "step": 13457 + }, + { + "epoch": 0.08003853839566086, + "grad_norm": 1.5314396619796753, + "learning_rate": 4.921394232979316e-05, + "loss": 4.7578, + "step": 13458 + }, + { + "epoch": 0.08004448567894186, + "grad_norm": 1.6665661334991455, + "learning_rate": 4.921382611656222e-05, + "loss": 4.7767, + "step": 13459 + }, + { + "epoch": 0.08005043296222286, + "grad_norm": 1.5145021677017212, + "learning_rate": 4.9213709894878495e-05, + "loss": 4.7892, + "step": 13460 + }, + { + "epoch": 0.08005638024550385, + "grad_norm": 1.8332866430282593, + "learning_rate": 4.921359366474201e-05, + "loss": 4.6434, + "step": 13461 + }, + { + "epoch": 0.08006232752878485, + "grad_norm": 1.467970371246338, + "learning_rate": 4.921347742615281e-05, + "loss": 4.6611, + "step": 13462 + }, + { + "epoch": 0.08006827481206585, + "grad_norm": 1.5667515993118286, + "learning_rate": 4.9213361179110936e-05, + "loss": 4.5792, + "step": 13463 + }, + { + "epoch": 0.08007422209534684, + "grad_norm": 1.5370365381240845, + "learning_rate": 4.9213244923616434e-05, + "loss": 4.6724, + "step": 13464 + }, + { + "epoch": 0.08008016937862784, + "grad_norm": 1.7298029661178589, + "learning_rate": 4.921312865966933e-05, + "loss": 4.7808, + "step": 13465 + }, + { + "epoch": 0.08008611666190885, + "grad_norm": 1.5497710704803467, + "learning_rate": 4.921301238726966e-05, + "loss": 4.8228, + "step": 13466 + }, + { + "epoch": 0.08009206394518983, + "grad_norm": 1.4589923620224, + "learning_rate": 4.92128961064175e-05, + "loss": 4.757, + "step": 13467 + }, + { + "epoch": 0.08009801122847084, + "grad_norm": 1.6503071784973145, + "learning_rate": 4.921277981711286e-05, + "loss": 4.6074, + "step": 13468 + }, + { + "epoch": 0.08010395851175184, + "grad_norm": 1.621209979057312, + "learning_rate": 4.921266351935578e-05, + "loss": 4.6338, + "step": 13469 + }, + { + "epoch": 0.08010990579503283, + "grad_norm": 1.6513469219207764, + "learning_rate": 4.921254721314632e-05, + "loss": 4.7399, + "step": 13470 + }, + { + "epoch": 0.08011585307831383, + "grad_norm": 1.5691003799438477, + "learning_rate": 4.9212430898484505e-05, + "loss": 4.8002, + "step": 13471 + }, + { + "epoch": 0.08012180036159482, + "grad_norm": 1.6764090061187744, + "learning_rate": 4.921231457537039e-05, + "loss": 4.7913, + "step": 13472 + }, + { + "epoch": 0.08012774764487582, + "grad_norm": 1.5193006992340088, + "learning_rate": 4.9212198243804e-05, + "loss": 4.8346, + "step": 13473 + }, + { + "epoch": 0.08013369492815682, + "grad_norm": 1.722706913948059, + "learning_rate": 4.921208190378538e-05, + "loss": 4.6969, + "step": 13474 + }, + { + "epoch": 0.08013964221143781, + "grad_norm": 1.6551017761230469, + "learning_rate": 4.921196555531457e-05, + "loss": 4.6504, + "step": 13475 + }, + { + "epoch": 0.08014558949471881, + "grad_norm": 1.462902307510376, + "learning_rate": 4.921184919839162e-05, + "loss": 4.7678, + "step": 13476 + }, + { + "epoch": 0.08015153677799981, + "grad_norm": 1.4332460165023804, + "learning_rate": 4.9211732833016554e-05, + "loss": 4.7563, + "step": 13477 + }, + { + "epoch": 0.0801574840612808, + "grad_norm": 1.466042160987854, + "learning_rate": 4.9211616459189434e-05, + "loss": 4.7071, + "step": 13478 + }, + { + "epoch": 0.0801634313445618, + "grad_norm": 1.5814018249511719, + "learning_rate": 4.9211500076910275e-05, + "loss": 4.7497, + "step": 13479 + }, + { + "epoch": 0.0801693786278428, + "grad_norm": 1.5666007995605469, + "learning_rate": 4.921138368617915e-05, + "loss": 4.7757, + "step": 13480 + }, + { + "epoch": 0.0801753259111238, + "grad_norm": 1.6804678440093994, + "learning_rate": 4.9211267286996064e-05, + "loss": 4.6921, + "step": 13481 + }, + { + "epoch": 0.0801812731944048, + "grad_norm": 1.6126580238342285, + "learning_rate": 4.921115087936108e-05, + "loss": 4.746, + "step": 13482 + }, + { + "epoch": 0.0801872204776858, + "grad_norm": 1.5597195625305176, + "learning_rate": 4.9211034463274235e-05, + "loss": 4.8135, + "step": 13483 + }, + { + "epoch": 0.08019316776096679, + "grad_norm": 1.4779510498046875, + "learning_rate": 4.9210918038735565e-05, + "loss": 4.9011, + "step": 13484 + }, + { + "epoch": 0.08019911504424779, + "grad_norm": 1.449723243713379, + "learning_rate": 4.921080160574512e-05, + "loss": 4.648, + "step": 13485 + }, + { + "epoch": 0.08020506232752879, + "grad_norm": 1.609134554862976, + "learning_rate": 4.921068516430293e-05, + "loss": 4.6809, + "step": 13486 + }, + { + "epoch": 0.08021100961080978, + "grad_norm": 1.5483453273773193, + "learning_rate": 4.921056871440905e-05, + "loss": 4.7247, + "step": 13487 + }, + { + "epoch": 0.08021695689409078, + "grad_norm": 1.5850282907485962, + "learning_rate": 4.921045225606349e-05, + "loss": 4.6378, + "step": 13488 + }, + { + "epoch": 0.08022290417737178, + "grad_norm": 1.746030569076538, + "learning_rate": 4.9210335789266325e-05, + "loss": 4.6986, + "step": 13489 + }, + { + "epoch": 0.08022885146065277, + "grad_norm": 1.5930465459823608, + "learning_rate": 4.921021931401758e-05, + "loss": 4.6339, + "step": 13490 + }, + { + "epoch": 0.08023479874393377, + "grad_norm": 1.5435012578964233, + "learning_rate": 4.92101028303173e-05, + "loss": 4.5761, + "step": 13491 + }, + { + "epoch": 0.08024074602721477, + "grad_norm": 1.8166500329971313, + "learning_rate": 4.920998633816552e-05, + "loss": 4.5668, + "step": 13492 + }, + { + "epoch": 0.08024669331049576, + "grad_norm": 1.659976601600647, + "learning_rate": 4.920986983756228e-05, + "loss": 4.7431, + "step": 13493 + }, + { + "epoch": 0.08025264059377676, + "grad_norm": 1.6075677871704102, + "learning_rate": 4.920975332850762e-05, + "loss": 4.7744, + "step": 13494 + }, + { + "epoch": 0.08025858787705777, + "grad_norm": 1.6895835399627686, + "learning_rate": 4.9209636811001605e-05, + "loss": 4.638, + "step": 13495 + }, + { + "epoch": 0.08026453516033875, + "grad_norm": 1.4848902225494385, + "learning_rate": 4.9209520285044244e-05, + "loss": 4.7314, + "step": 13496 + }, + { + "epoch": 0.08027048244361976, + "grad_norm": 1.6041605472564697, + "learning_rate": 4.920940375063559e-05, + "loss": 4.7329, + "step": 13497 + }, + { + "epoch": 0.08027642972690076, + "grad_norm": 1.5055692195892334, + "learning_rate": 4.920928720777568e-05, + "loss": 4.721, + "step": 13498 + }, + { + "epoch": 0.08028237701018175, + "grad_norm": 1.3238314390182495, + "learning_rate": 4.920917065646456e-05, + "loss": 5.3071, + "step": 13499 + }, + { + "epoch": 0.08028832429346275, + "grad_norm": 1.463626742362976, + "learning_rate": 4.9209054096702266e-05, + "loss": 5.1885, + "step": 13500 + }, + { + "epoch": 0.08029427157674375, + "grad_norm": 1.4844539165496826, + "learning_rate": 4.9208937528488844e-05, + "loss": 5.2873, + "step": 13501 + }, + { + "epoch": 0.08030021886002474, + "grad_norm": 1.5207467079162598, + "learning_rate": 4.920882095182434e-05, + "loss": 5.1049, + "step": 13502 + }, + { + "epoch": 0.08030616614330574, + "grad_norm": 1.3113683462142944, + "learning_rate": 4.920870436670878e-05, + "loss": 5.1821, + "step": 13503 + }, + { + "epoch": 0.08031211342658673, + "grad_norm": 1.3822054862976074, + "learning_rate": 4.920858777314221e-05, + "loss": 5.1467, + "step": 13504 + }, + { + "epoch": 0.08031806070986773, + "grad_norm": 1.7611572742462158, + "learning_rate": 4.920847117112467e-05, + "loss": 5.0616, + "step": 13505 + }, + { + "epoch": 0.08032400799314873, + "grad_norm": 1.632802963256836, + "learning_rate": 4.920835456065621e-05, + "loss": 5.1535, + "step": 13506 + }, + { + "epoch": 0.08032995527642972, + "grad_norm": 1.6254185438156128, + "learning_rate": 4.920823794173686e-05, + "loss": 5.211, + "step": 13507 + }, + { + "epoch": 0.08033590255971072, + "grad_norm": 1.4769513607025146, + "learning_rate": 4.920812131436666e-05, + "loss": 5.0879, + "step": 13508 + }, + { + "epoch": 0.08034184984299172, + "grad_norm": 1.531504511833191, + "learning_rate": 4.920800467854566e-05, + "loss": 4.9068, + "step": 13509 + }, + { + "epoch": 0.08034779712627271, + "grad_norm": 1.6325825452804565, + "learning_rate": 4.9207888034273895e-05, + "loss": 5.0463, + "step": 13510 + }, + { + "epoch": 0.08035374440955372, + "grad_norm": 1.3797351121902466, + "learning_rate": 4.9207771381551406e-05, + "loss": 5.0644, + "step": 13511 + }, + { + "epoch": 0.08035969169283472, + "grad_norm": 1.7325141429901123, + "learning_rate": 4.920765472037823e-05, + "loss": 4.9095, + "step": 13512 + }, + { + "epoch": 0.0803656389761157, + "grad_norm": 1.3197063207626343, + "learning_rate": 4.920753805075442e-05, + "loss": 5.1837, + "step": 13513 + }, + { + "epoch": 0.08037158625939671, + "grad_norm": 1.532212734222412, + "learning_rate": 4.9207421372680006e-05, + "loss": 5.1011, + "step": 13514 + }, + { + "epoch": 0.08037753354267771, + "grad_norm": 1.2958672046661377, + "learning_rate": 4.9207304686155034e-05, + "loss": 5.1349, + "step": 13515 + }, + { + "epoch": 0.0803834808259587, + "grad_norm": 2.914010524749756, + "learning_rate": 4.9207187991179533e-05, + "loss": 5.4637, + "step": 13516 + }, + { + "epoch": 0.0803894281092397, + "grad_norm": 1.490577220916748, + "learning_rate": 4.920707128775356e-05, + "loss": 5.2322, + "step": 13517 + }, + { + "epoch": 0.0803953753925207, + "grad_norm": 1.5756994485855103, + "learning_rate": 4.920695457587714e-05, + "loss": 5.1501, + "step": 13518 + }, + { + "epoch": 0.08040132267580169, + "grad_norm": 1.7483723163604736, + "learning_rate": 4.920683785555033e-05, + "loss": 5.131, + "step": 13519 + }, + { + "epoch": 0.08040726995908269, + "grad_norm": 1.426866054534912, + "learning_rate": 4.920672112677316e-05, + "loss": 5.5304, + "step": 13520 + }, + { + "epoch": 0.0804132172423637, + "grad_norm": 1.3744142055511475, + "learning_rate": 4.920660438954568e-05, + "loss": 5.1042, + "step": 13521 + }, + { + "epoch": 0.08041916452564468, + "grad_norm": 1.5924170017242432, + "learning_rate": 4.9206487643867916e-05, + "loss": 5.261, + "step": 13522 + }, + { + "epoch": 0.08042511180892568, + "grad_norm": 1.566296935081482, + "learning_rate": 4.920637088973992e-05, + "loss": 5.0451, + "step": 13523 + }, + { + "epoch": 0.08043105909220669, + "grad_norm": 1.4542006254196167, + "learning_rate": 4.9206254127161734e-05, + "loss": 5.0351, + "step": 13524 + }, + { + "epoch": 0.08043700637548767, + "grad_norm": 1.4084336757659912, + "learning_rate": 4.920613735613339e-05, + "loss": 5.1177, + "step": 13525 + }, + { + "epoch": 0.08044295365876868, + "grad_norm": 1.5498062372207642, + "learning_rate": 4.920602057665493e-05, + "loss": 4.9068, + "step": 13526 + }, + { + "epoch": 0.08044890094204968, + "grad_norm": 1.4482768774032593, + "learning_rate": 4.920590378872641e-05, + "loss": 4.9393, + "step": 13527 + }, + { + "epoch": 0.08045484822533067, + "grad_norm": 1.4438153505325317, + "learning_rate": 4.920578699234785e-05, + "loss": 5.0109, + "step": 13528 + }, + { + "epoch": 0.08046079550861167, + "grad_norm": 1.5769532918930054, + "learning_rate": 4.9205670187519305e-05, + "loss": 4.916, + "step": 13529 + }, + { + "epoch": 0.08046674279189267, + "grad_norm": 1.6127451658248901, + "learning_rate": 4.9205553374240806e-05, + "loss": 5.0038, + "step": 13530 + }, + { + "epoch": 0.08047269007517366, + "grad_norm": 1.5733160972595215, + "learning_rate": 4.92054365525124e-05, + "loss": 5.2705, + "step": 13531 + }, + { + "epoch": 0.08047863735845466, + "grad_norm": 1.956769585609436, + "learning_rate": 4.920531972233413e-05, + "loss": 5.0572, + "step": 13532 + }, + { + "epoch": 0.08048458464173565, + "grad_norm": 1.614670753479004, + "learning_rate": 4.9205202883706025e-05, + "loss": 5.0323, + "step": 13533 + }, + { + "epoch": 0.08049053192501665, + "grad_norm": 1.3706777095794678, + "learning_rate": 4.920508603662814e-05, + "loss": 5.1335, + "step": 13534 + }, + { + "epoch": 0.08049647920829765, + "grad_norm": 1.5787118673324585, + "learning_rate": 4.9204969181100505e-05, + "loss": 4.9626, + "step": 13535 + }, + { + "epoch": 0.08050242649157864, + "grad_norm": 1.6258914470672607, + "learning_rate": 4.9204852317123175e-05, + "loss": 5.1592, + "step": 13536 + }, + { + "epoch": 0.08050837377485964, + "grad_norm": 1.662347435951233, + "learning_rate": 4.920473544469617e-05, + "loss": 5.053, + "step": 13537 + }, + { + "epoch": 0.08051432105814064, + "grad_norm": 1.8060719966888428, + "learning_rate": 4.920461856381955e-05, + "loss": 5.0823, + "step": 13538 + }, + { + "epoch": 0.08052026834142163, + "grad_norm": 1.7381904125213623, + "learning_rate": 4.920450167449334e-05, + "loss": 4.7485, + "step": 13539 + }, + { + "epoch": 0.08052621562470264, + "grad_norm": 1.838526964187622, + "learning_rate": 4.9204384776717594e-05, + "loss": 5.1404, + "step": 13540 + }, + { + "epoch": 0.08053216290798364, + "grad_norm": 1.8131240606307983, + "learning_rate": 4.920426787049234e-05, + "loss": 5.2337, + "step": 13541 + }, + { + "epoch": 0.08053811019126463, + "grad_norm": 1.7523903846740723, + "learning_rate": 4.9204150955817635e-05, + "loss": 5.2375, + "step": 13542 + }, + { + "epoch": 0.08054405747454563, + "grad_norm": 1.5962380170822144, + "learning_rate": 4.9204034032693505e-05, + "loss": 5.1667, + "step": 13543 + }, + { + "epoch": 0.08055000475782663, + "grad_norm": 1.566009283065796, + "learning_rate": 4.920391710112e-05, + "loss": 5.1105, + "step": 13544 + }, + { + "epoch": 0.08055595204110762, + "grad_norm": 1.6253767013549805, + "learning_rate": 4.920380016109716e-05, + "loss": 5.2942, + "step": 13545 + }, + { + "epoch": 0.08056189932438862, + "grad_norm": 1.538004994392395, + "learning_rate": 4.920368321262502e-05, + "loss": 5.1847, + "step": 13546 + }, + { + "epoch": 0.08056784660766962, + "grad_norm": 1.6407667398452759, + "learning_rate": 4.9203566255703625e-05, + "loss": 5.1368, + "step": 13547 + }, + { + "epoch": 0.08057379389095061, + "grad_norm": 1.5777368545532227, + "learning_rate": 4.9203449290333016e-05, + "loss": 5.1507, + "step": 13548 + }, + { + "epoch": 0.08057974117423161, + "grad_norm": 1.5601979494094849, + "learning_rate": 4.920333231651323e-05, + "loss": 5.0926, + "step": 13549 + }, + { + "epoch": 0.08058568845751261, + "grad_norm": 1.4342397451400757, + "learning_rate": 4.9203215334244315e-05, + "loss": 4.9536, + "step": 13550 + }, + { + "epoch": 0.0805916357407936, + "grad_norm": 1.6202988624572754, + "learning_rate": 4.9203098343526305e-05, + "loss": 4.9009, + "step": 13551 + }, + { + "epoch": 0.0805975830240746, + "grad_norm": 1.4504165649414062, + "learning_rate": 4.9202981344359243e-05, + "loss": 5.3843, + "step": 13552 + }, + { + "epoch": 0.0806035303073556, + "grad_norm": 1.6187599897384644, + "learning_rate": 4.920286433674317e-05, + "loss": 5.3396, + "step": 13553 + }, + { + "epoch": 0.0806094775906366, + "grad_norm": 1.6162225008010864, + "learning_rate": 4.920274732067813e-05, + "loss": 5.3163, + "step": 13554 + }, + { + "epoch": 0.0806154248739176, + "grad_norm": 1.6445814371109009, + "learning_rate": 4.920263029616416e-05, + "loss": 5.207, + "step": 13555 + }, + { + "epoch": 0.0806213721571986, + "grad_norm": 1.5133748054504395, + "learning_rate": 4.9202513263201296e-05, + "loss": 5.4284, + "step": 13556 + }, + { + "epoch": 0.08062731944047959, + "grad_norm": 1.5004390478134155, + "learning_rate": 4.920239622178959e-05, + "loss": 5.0013, + "step": 13557 + }, + { + "epoch": 0.08063326672376059, + "grad_norm": 1.6617141962051392, + "learning_rate": 4.920227917192908e-05, + "loss": 5.346, + "step": 13558 + }, + { + "epoch": 0.08063921400704159, + "grad_norm": 1.5505567789077759, + "learning_rate": 4.92021621136198e-05, + "loss": 5.2799, + "step": 13559 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 1.5264419317245483, + "learning_rate": 4.92020450468618e-05, + "loss": 5.1277, + "step": 13560 + }, + { + "epoch": 0.08065110857360358, + "grad_norm": 1.6758075952529907, + "learning_rate": 4.920192797165511e-05, + "loss": 5.2519, + "step": 13561 + }, + { + "epoch": 0.08065705585688457, + "grad_norm": 1.5858482122421265, + "learning_rate": 4.920181088799978e-05, + "loss": 5.3231, + "step": 13562 + }, + { + "epoch": 0.08066300314016557, + "grad_norm": 1.5122928619384766, + "learning_rate": 4.920169379589585e-05, + "loss": 5.1791, + "step": 13563 + }, + { + "epoch": 0.08066895042344657, + "grad_norm": 1.4593915939331055, + "learning_rate": 4.9201576695343354e-05, + "loss": 5.0555, + "step": 13564 + }, + { + "epoch": 0.08067489770672756, + "grad_norm": 1.6524077653884888, + "learning_rate": 4.9201459586342336e-05, + "loss": 5.1981, + "step": 13565 + }, + { + "epoch": 0.08068084499000856, + "grad_norm": 1.5063152313232422, + "learning_rate": 4.920134246889285e-05, + "loss": 5.0406, + "step": 13566 + }, + { + "epoch": 0.08068679227328956, + "grad_norm": 1.3544602394104004, + "learning_rate": 4.9201225342994914e-05, + "loss": 5.0385, + "step": 13567 + }, + { + "epoch": 0.08069273955657055, + "grad_norm": 1.5672118663787842, + "learning_rate": 4.920110820864858e-05, + "loss": 5.2393, + "step": 13568 + }, + { + "epoch": 0.08069868683985155, + "grad_norm": 1.5031840801239014, + "learning_rate": 4.92009910658539e-05, + "loss": 5.1584, + "step": 13569 + }, + { + "epoch": 0.08070463412313256, + "grad_norm": 1.682307243347168, + "learning_rate": 4.920087391461089e-05, + "loss": 4.8473, + "step": 13570 + }, + { + "epoch": 0.08071058140641355, + "grad_norm": 1.5047411918640137, + "learning_rate": 4.9200756754919616e-05, + "loss": 4.8286, + "step": 13571 + }, + { + "epoch": 0.08071652868969455, + "grad_norm": 1.4234607219696045, + "learning_rate": 4.920063958678011e-05, + "loss": 4.8309, + "step": 13572 + }, + { + "epoch": 0.08072247597297555, + "grad_norm": 1.5061196088790894, + "learning_rate": 4.920052241019239e-05, + "loss": 5.0132, + "step": 13573 + }, + { + "epoch": 0.08072842325625654, + "grad_norm": 1.5565897226333618, + "learning_rate": 4.920040522515654e-05, + "loss": 4.9357, + "step": 13574 + }, + { + "epoch": 0.08073437053953754, + "grad_norm": 1.442288875579834, + "learning_rate": 4.920028803167257e-05, + "loss": 4.7943, + "step": 13575 + }, + { + "epoch": 0.08074031782281854, + "grad_norm": 1.6255996227264404, + "learning_rate": 4.9200170829740534e-05, + "loss": 4.824, + "step": 13576 + }, + { + "epoch": 0.08074626510609953, + "grad_norm": 1.7027612924575806, + "learning_rate": 4.920005361936047e-05, + "loss": 5.1223, + "step": 13577 + }, + { + "epoch": 0.08075221238938053, + "grad_norm": 2.5931310653686523, + "learning_rate": 4.919993640053241e-05, + "loss": 5.3487, + "step": 13578 + }, + { + "epoch": 0.08075815967266153, + "grad_norm": 1.5481868982315063, + "learning_rate": 4.91998191732564e-05, + "loss": 5.0844, + "step": 13579 + }, + { + "epoch": 0.08076410695594252, + "grad_norm": 1.3663432598114014, + "learning_rate": 4.919970193753248e-05, + "loss": 5.2151, + "step": 13580 + }, + { + "epoch": 0.08077005423922352, + "grad_norm": 1.4602998495101929, + "learning_rate": 4.919958469336071e-05, + "loss": 5.3133, + "step": 13581 + }, + { + "epoch": 0.08077600152250453, + "grad_norm": 1.6350071430206299, + "learning_rate": 4.919946744074111e-05, + "loss": 5.5026, + "step": 13582 + }, + { + "epoch": 0.08078194880578551, + "grad_norm": 1.4492799043655396, + "learning_rate": 4.919935017967372e-05, + "loss": 5.4211, + "step": 13583 + }, + { + "epoch": 0.08078789608906652, + "grad_norm": 1.398373007774353, + "learning_rate": 4.919923291015859e-05, + "loss": 5.2947, + "step": 13584 + }, + { + "epoch": 0.08079384337234752, + "grad_norm": 1.543583869934082, + "learning_rate": 4.9199115632195755e-05, + "loss": 5.0361, + "step": 13585 + }, + { + "epoch": 0.0807997906556285, + "grad_norm": 1.7753655910491943, + "learning_rate": 4.9198998345785265e-05, + "loss": 5.1897, + "step": 13586 + }, + { + "epoch": 0.08080573793890951, + "grad_norm": 1.668168544769287, + "learning_rate": 4.919888105092715e-05, + "loss": 5.3786, + "step": 13587 + }, + { + "epoch": 0.08081168522219051, + "grad_norm": 1.3956975936889648, + "learning_rate": 4.919876374762145e-05, + "loss": 5.4662, + "step": 13588 + }, + { + "epoch": 0.0808176325054715, + "grad_norm": 1.3362425565719604, + "learning_rate": 4.9198646435868226e-05, + "loss": 5.4723, + "step": 13589 + }, + { + "epoch": 0.0808235797887525, + "grad_norm": 1.3419675827026367, + "learning_rate": 4.919852911566749e-05, + "loss": 5.3888, + "step": 13590 + }, + { + "epoch": 0.08082952707203349, + "grad_norm": 1.5144484043121338, + "learning_rate": 4.9198411787019304e-05, + "loss": 5.292, + "step": 13591 + }, + { + "epoch": 0.08083547435531449, + "grad_norm": 1.4561097621917725, + "learning_rate": 4.91982944499237e-05, + "loss": 5.3688, + "step": 13592 + }, + { + "epoch": 0.08084142163859549, + "grad_norm": 1.4536436796188354, + "learning_rate": 4.919817710438073e-05, + "loss": 5.3606, + "step": 13593 + }, + { + "epoch": 0.08084736892187648, + "grad_norm": 1.3266935348510742, + "learning_rate": 4.919805975039041e-05, + "loss": 5.3999, + "step": 13594 + }, + { + "epoch": 0.08085331620515748, + "grad_norm": 1.4032717943191528, + "learning_rate": 4.919794238795281e-05, + "loss": 5.3494, + "step": 13595 + }, + { + "epoch": 0.08085926348843848, + "grad_norm": 1.6235400438308716, + "learning_rate": 4.919782501706796e-05, + "loss": 5.1499, + "step": 13596 + }, + { + "epoch": 0.08086521077171947, + "grad_norm": 1.349752426147461, + "learning_rate": 4.919770763773589e-05, + "loss": 5.3599, + "step": 13597 + }, + { + "epoch": 0.08087115805500047, + "grad_norm": 1.9415758848190308, + "learning_rate": 4.919759024995666e-05, + "loss": 5.3427, + "step": 13598 + }, + { + "epoch": 0.08087710533828148, + "grad_norm": 1.688825249671936, + "learning_rate": 4.9197472853730296e-05, + "loss": 5.2918, + "step": 13599 + }, + { + "epoch": 0.08088305262156247, + "grad_norm": 1.55258309841156, + "learning_rate": 4.919735544905685e-05, + "loss": 5.3016, + "step": 13600 + }, + { + "epoch": 0.08088899990484347, + "grad_norm": 1.3860005140304565, + "learning_rate": 4.919723803593634e-05, + "loss": 5.3049, + "step": 13601 + }, + { + "epoch": 0.08089494718812447, + "grad_norm": 1.289819359779358, + "learning_rate": 4.919712061436884e-05, + "loss": 5.1657, + "step": 13602 + }, + { + "epoch": 0.08090089447140546, + "grad_norm": 1.5799275636672974, + "learning_rate": 4.9197003184354375e-05, + "loss": 5.2638, + "step": 13603 + }, + { + "epoch": 0.08090684175468646, + "grad_norm": 1.5292985439300537, + "learning_rate": 4.919688574589299e-05, + "loss": 5.2643, + "step": 13604 + }, + { + "epoch": 0.08091278903796746, + "grad_norm": 1.6338304281234741, + "learning_rate": 4.919676829898471e-05, + "loss": 5.2377, + "step": 13605 + }, + { + "epoch": 0.08091873632124845, + "grad_norm": 1.7117339372634888, + "learning_rate": 4.919665084362959e-05, + "loss": 5.262, + "step": 13606 + }, + { + "epoch": 0.08092468360452945, + "grad_norm": 1.606644868850708, + "learning_rate": 4.919653337982767e-05, + "loss": 5.2308, + "step": 13607 + }, + { + "epoch": 0.08093063088781045, + "grad_norm": 1.5751184225082397, + "learning_rate": 4.9196415907578994e-05, + "loss": 5.1455, + "step": 13608 + }, + { + "epoch": 0.08093657817109144, + "grad_norm": 1.7105200290679932, + "learning_rate": 4.9196298426883595e-05, + "loss": 5.2608, + "step": 13609 + }, + { + "epoch": 0.08094252545437244, + "grad_norm": 1.4504178762435913, + "learning_rate": 4.919618093774152e-05, + "loss": 5.3592, + "step": 13610 + }, + { + "epoch": 0.08094847273765345, + "grad_norm": 1.2036757469177246, + "learning_rate": 4.9196063440152804e-05, + "loss": 5.3256, + "step": 13611 + }, + { + "epoch": 0.08095442002093443, + "grad_norm": 1.4795072078704834, + "learning_rate": 4.9195945934117507e-05, + "loss": 5.2968, + "step": 13612 + }, + { + "epoch": 0.08096036730421544, + "grad_norm": 1.2796508073806763, + "learning_rate": 4.9195828419635644e-05, + "loss": 5.1288, + "step": 13613 + }, + { + "epoch": 0.08096631458749644, + "grad_norm": 1.4119127988815308, + "learning_rate": 4.9195710896707264e-05, + "loss": 5.3238, + "step": 13614 + }, + { + "epoch": 0.08097226187077743, + "grad_norm": 1.618862509727478, + "learning_rate": 4.919559336533241e-05, + "loss": 5.301, + "step": 13615 + }, + { + "epoch": 0.08097820915405843, + "grad_norm": 1.5049046277999878, + "learning_rate": 4.919547582551114e-05, + "loss": 5.3395, + "step": 13616 + }, + { + "epoch": 0.08098415643733943, + "grad_norm": 1.3821018934249878, + "learning_rate": 4.9195358277243464e-05, + "loss": 5.4033, + "step": 13617 + }, + { + "epoch": 0.08099010372062042, + "grad_norm": 1.4585113525390625, + "learning_rate": 4.9195240720529446e-05, + "loss": 5.3098, + "step": 13618 + }, + { + "epoch": 0.08099605100390142, + "grad_norm": 1.5766072273254395, + "learning_rate": 4.9195123155369114e-05, + "loss": 5.2672, + "step": 13619 + }, + { + "epoch": 0.08100199828718241, + "grad_norm": 1.5132715702056885, + "learning_rate": 4.919500558176252e-05, + "loss": 5.1707, + "step": 13620 + }, + { + "epoch": 0.08100794557046341, + "grad_norm": 1.594093918800354, + "learning_rate": 4.91948879997097e-05, + "loss": 5.2988, + "step": 13621 + }, + { + "epoch": 0.08101389285374441, + "grad_norm": 1.529877781867981, + "learning_rate": 4.919477040921069e-05, + "loss": 5.4418, + "step": 13622 + }, + { + "epoch": 0.0810198401370254, + "grad_norm": 1.4329211711883545, + "learning_rate": 4.919465281026554e-05, + "loss": 5.308, + "step": 13623 + }, + { + "epoch": 0.0810257874203064, + "grad_norm": 1.4308300018310547, + "learning_rate": 4.919453520287428e-05, + "loss": 5.259, + "step": 13624 + }, + { + "epoch": 0.0810317347035874, + "grad_norm": 1.248282790184021, + "learning_rate": 4.919441758703697e-05, + "loss": 5.2129, + "step": 13625 + }, + { + "epoch": 0.08103768198686839, + "grad_norm": 1.4535733461380005, + "learning_rate": 4.919429996275363e-05, + "loss": 5.1989, + "step": 13626 + }, + { + "epoch": 0.0810436292701494, + "grad_norm": 1.6055153608322144, + "learning_rate": 4.9194182330024306e-05, + "loss": 5.1669, + "step": 13627 + }, + { + "epoch": 0.0810495765534304, + "grad_norm": 1.6016899347305298, + "learning_rate": 4.919406468884905e-05, + "loss": 5.1958, + "step": 13628 + }, + { + "epoch": 0.08105552383671139, + "grad_norm": 1.4217112064361572, + "learning_rate": 4.91939470392279e-05, + "loss": 4.9775, + "step": 13629 + }, + { + "epoch": 0.08106147111999239, + "grad_norm": 1.4405405521392822, + "learning_rate": 4.919382938116088e-05, + "loss": 5.1865, + "step": 13630 + }, + { + "epoch": 0.08106741840327339, + "grad_norm": 1.3826597929000854, + "learning_rate": 4.919371171464805e-05, + "loss": 5.1909, + "step": 13631 + }, + { + "epoch": 0.08107336568655438, + "grad_norm": 1.942305088043213, + "learning_rate": 4.919359403968944e-05, + "loss": 5.227, + "step": 13632 + }, + { + "epoch": 0.08107931296983538, + "grad_norm": 1.8932685852050781, + "learning_rate": 4.919347635628511e-05, + "loss": 5.3257, + "step": 13633 + }, + { + "epoch": 0.08108526025311638, + "grad_norm": 1.8511128425598145, + "learning_rate": 4.9193358664435074e-05, + "loss": 5.4229, + "step": 13634 + }, + { + "epoch": 0.08109120753639737, + "grad_norm": 1.6317822933197021, + "learning_rate": 4.919324096413939e-05, + "loss": 5.3067, + "step": 13635 + }, + { + "epoch": 0.08109715481967837, + "grad_norm": 1.835503101348877, + "learning_rate": 4.91931232553981e-05, + "loss": 5.3246, + "step": 13636 + }, + { + "epoch": 0.08110310210295937, + "grad_norm": 1.8521870374679565, + "learning_rate": 4.919300553821124e-05, + "loss": 5.3367, + "step": 13637 + }, + { + "epoch": 0.08110904938624036, + "grad_norm": 1.7814146280288696, + "learning_rate": 4.9192887812578844e-05, + "loss": 5.2949, + "step": 13638 + }, + { + "epoch": 0.08111499666952136, + "grad_norm": 1.6024845838546753, + "learning_rate": 4.919277007850097e-05, + "loss": 5.3159, + "step": 13639 + }, + { + "epoch": 0.08112094395280237, + "grad_norm": 2.955554246902466, + "learning_rate": 4.919265233597765e-05, + "loss": 4.8802, + "step": 13640 + }, + { + "epoch": 0.08112689123608335, + "grad_norm": 1.7217108011245728, + "learning_rate": 4.919253458500892e-05, + "loss": 5.08, + "step": 13641 + }, + { + "epoch": 0.08113283851936436, + "grad_norm": 1.686672329902649, + "learning_rate": 4.9192416825594825e-05, + "loss": 5.1349, + "step": 13642 + }, + { + "epoch": 0.08113878580264536, + "grad_norm": 1.5377975702285767, + "learning_rate": 4.9192299057735416e-05, + "loss": 5.1327, + "step": 13643 + }, + { + "epoch": 0.08114473308592635, + "grad_norm": 1.7383031845092773, + "learning_rate": 4.9192181281430716e-05, + "loss": 5.0938, + "step": 13644 + }, + { + "epoch": 0.08115068036920735, + "grad_norm": 1.6174112558364868, + "learning_rate": 4.919206349668077e-05, + "loss": 5.0123, + "step": 13645 + }, + { + "epoch": 0.08115662765248835, + "grad_norm": 1.5967239141464233, + "learning_rate": 4.9191945703485646e-05, + "loss": 5.0334, + "step": 13646 + }, + { + "epoch": 0.08116257493576934, + "grad_norm": 1.5330301523208618, + "learning_rate": 4.919182790184534e-05, + "loss": 5.1615, + "step": 13647 + }, + { + "epoch": 0.08116852221905034, + "grad_norm": 1.5532622337341309, + "learning_rate": 4.919171009175993e-05, + "loss": 5.1565, + "step": 13648 + }, + { + "epoch": 0.08117446950233133, + "grad_norm": 1.4814139604568481, + "learning_rate": 4.919159227322945e-05, + "loss": 5.0991, + "step": 13649 + }, + { + "epoch": 0.08118041678561233, + "grad_norm": 1.2586545944213867, + "learning_rate": 4.919147444625392e-05, + "loss": 5.2482, + "step": 13650 + }, + { + "epoch": 0.08118636406889333, + "grad_norm": 1.5292212963104248, + "learning_rate": 4.91913566108334e-05, + "loss": 5.1787, + "step": 13651 + }, + { + "epoch": 0.08119231135217432, + "grad_norm": 1.5354405641555786, + "learning_rate": 4.919123876696793e-05, + "loss": 5.0046, + "step": 13652 + }, + { + "epoch": 0.08119825863545532, + "grad_norm": 1.3921040296554565, + "learning_rate": 4.919112091465755e-05, + "loss": 5.2199, + "step": 13653 + }, + { + "epoch": 0.08120420591873632, + "grad_norm": 1.471068263053894, + "learning_rate": 4.91910030539023e-05, + "loss": 5.0445, + "step": 13654 + }, + { + "epoch": 0.08121015320201731, + "grad_norm": 1.3318332433700562, + "learning_rate": 4.919088518470222e-05, + "loss": 5.1973, + "step": 13655 + }, + { + "epoch": 0.08121610048529831, + "grad_norm": 1.5445464849472046, + "learning_rate": 4.919076730705735e-05, + "loss": 5.4165, + "step": 13656 + }, + { + "epoch": 0.08122204776857932, + "grad_norm": 1.3854666948318481, + "learning_rate": 4.9190649420967735e-05, + "loss": 5.336, + "step": 13657 + }, + { + "epoch": 0.0812279950518603, + "grad_norm": 1.4703121185302734, + "learning_rate": 4.919053152643342e-05, + "loss": 5.4837, + "step": 13658 + }, + { + "epoch": 0.08123394233514131, + "grad_norm": 1.3189783096313477, + "learning_rate": 4.9190413623454425e-05, + "loss": 5.4163, + "step": 13659 + }, + { + "epoch": 0.08123988961842231, + "grad_norm": 1.469601035118103, + "learning_rate": 4.919029571203081e-05, + "loss": 5.2772, + "step": 13660 + }, + { + "epoch": 0.0812458369017033, + "grad_norm": 1.4215590953826904, + "learning_rate": 4.919017779216262e-05, + "loss": 5.5008, + "step": 13661 + }, + { + "epoch": 0.0812517841849843, + "grad_norm": 1.577255129814148, + "learning_rate": 4.919005986384989e-05, + "loss": 5.2565, + "step": 13662 + }, + { + "epoch": 0.0812577314682653, + "grad_norm": 1.5910719633102417, + "learning_rate": 4.918994192709265e-05, + "loss": 5.1143, + "step": 13663 + }, + { + "epoch": 0.08126367875154629, + "grad_norm": 1.5665141344070435, + "learning_rate": 4.9189823981890964e-05, + "loss": 5.1911, + "step": 13664 + }, + { + "epoch": 0.08126962603482729, + "grad_norm": 1.6348809003829956, + "learning_rate": 4.918970602824485e-05, + "loss": 5.2257, + "step": 13665 + }, + { + "epoch": 0.0812755733181083, + "grad_norm": 1.4213917255401611, + "learning_rate": 4.9189588066154365e-05, + "loss": 5.0528, + "step": 13666 + }, + { + "epoch": 0.08128152060138928, + "grad_norm": 1.497758388519287, + "learning_rate": 4.918947009561955e-05, + "loss": 5.2421, + "step": 13667 + }, + { + "epoch": 0.08128746788467028, + "grad_norm": 1.4052904844284058, + "learning_rate": 4.918935211664043e-05, + "loss": 5.5054, + "step": 13668 + }, + { + "epoch": 0.08129341516795129, + "grad_norm": 1.5615813732147217, + "learning_rate": 4.9189234129217064e-05, + "loss": 5.2711, + "step": 13669 + }, + { + "epoch": 0.08129936245123227, + "grad_norm": 1.2366914749145508, + "learning_rate": 4.9189116133349485e-05, + "loss": 5.4035, + "step": 13670 + }, + { + "epoch": 0.08130530973451328, + "grad_norm": 1.5328080654144287, + "learning_rate": 4.918899812903773e-05, + "loss": 5.3269, + "step": 13671 + }, + { + "epoch": 0.08131125701779428, + "grad_norm": 1.6515448093414307, + "learning_rate": 4.918888011628185e-05, + "loss": 5.1734, + "step": 13672 + }, + { + "epoch": 0.08131720430107527, + "grad_norm": 1.385549783706665, + "learning_rate": 4.918876209508188e-05, + "loss": 5.3769, + "step": 13673 + }, + { + "epoch": 0.08132315158435627, + "grad_norm": 1.4133338928222656, + "learning_rate": 4.9188644065437875e-05, + "loss": 5.2607, + "step": 13674 + }, + { + "epoch": 0.08132909886763727, + "grad_norm": 1.6652443408966064, + "learning_rate": 4.918852602734984e-05, + "loss": 5.3939, + "step": 13675 + }, + { + "epoch": 0.08133504615091826, + "grad_norm": 1.455493450164795, + "learning_rate": 4.918840798081786e-05, + "loss": 5.3051, + "step": 13676 + }, + { + "epoch": 0.08134099343419926, + "grad_norm": 1.5490756034851074, + "learning_rate": 4.918828992584196e-05, + "loss": 5.4309, + "step": 13677 + }, + { + "epoch": 0.08134694071748025, + "grad_norm": 1.5857222080230713, + "learning_rate": 4.918817186242216e-05, + "loss": 5.1158, + "step": 13678 + }, + { + "epoch": 0.08135288800076125, + "grad_norm": 1.6051661968231201, + "learning_rate": 4.918805379055853e-05, + "loss": 5.2668, + "step": 13679 + }, + { + "epoch": 0.08135883528404225, + "grad_norm": 1.6476162672042847, + "learning_rate": 4.91879357102511e-05, + "loss": 5.2367, + "step": 13680 + }, + { + "epoch": 0.08136478256732324, + "grad_norm": 1.4255136251449585, + "learning_rate": 4.918781762149991e-05, + "loss": 5.0348, + "step": 13681 + }, + { + "epoch": 0.08137072985060424, + "grad_norm": 1.4585214853286743, + "learning_rate": 4.9187699524305e-05, + "loss": 5.2323, + "step": 13682 + }, + { + "epoch": 0.08137667713388524, + "grad_norm": 1.3733863830566406, + "learning_rate": 4.9187581418666415e-05, + "loss": 5.0898, + "step": 13683 + }, + { + "epoch": 0.08138262441716623, + "grad_norm": 1.5789494514465332, + "learning_rate": 4.91874633045842e-05, + "loss": 5.0886, + "step": 13684 + }, + { + "epoch": 0.08138857170044723, + "grad_norm": 1.4390051364898682, + "learning_rate": 4.918734518205839e-05, + "loss": 5.4305, + "step": 13685 + }, + { + "epoch": 0.08139451898372824, + "grad_norm": 1.8984171152114868, + "learning_rate": 4.9187227051089025e-05, + "loss": 5.0593, + "step": 13686 + }, + { + "epoch": 0.08140046626700922, + "grad_norm": 1.940045714378357, + "learning_rate": 4.918710891167615e-05, + "loss": 5.3115, + "step": 13687 + }, + { + "epoch": 0.08140641355029023, + "grad_norm": 1.6479912996292114, + "learning_rate": 4.918699076381981e-05, + "loss": 5.1585, + "step": 13688 + }, + { + "epoch": 0.08141236083357123, + "grad_norm": 1.554114818572998, + "learning_rate": 4.918687260752003e-05, + "loss": 5.1581, + "step": 13689 + }, + { + "epoch": 0.08141830811685222, + "grad_norm": 1.6920353174209595, + "learning_rate": 4.9186754442776874e-05, + "loss": 5.2263, + "step": 13690 + }, + { + "epoch": 0.08142425540013322, + "grad_norm": 1.572787880897522, + "learning_rate": 4.9186636269590366e-05, + "loss": 5.1019, + "step": 13691 + }, + { + "epoch": 0.08143020268341422, + "grad_norm": 1.646004319190979, + "learning_rate": 4.918651808796055e-05, + "loss": 5.1426, + "step": 13692 + }, + { + "epoch": 0.08143614996669521, + "grad_norm": 1.578749179840088, + "learning_rate": 4.9186399897887475e-05, + "loss": 4.9682, + "step": 13693 + }, + { + "epoch": 0.08144209724997621, + "grad_norm": 1.7725828886032104, + "learning_rate": 4.918628169937118e-05, + "loss": 5.0772, + "step": 13694 + }, + { + "epoch": 0.08144804453325721, + "grad_norm": 1.808596134185791, + "learning_rate": 4.91861634924117e-05, + "loss": 5.077, + "step": 13695 + }, + { + "epoch": 0.0814539918165382, + "grad_norm": 1.8685991764068604, + "learning_rate": 4.9186045277009084e-05, + "loss": 5.1322, + "step": 13696 + }, + { + "epoch": 0.0814599390998192, + "grad_norm": 1.6144567728042603, + "learning_rate": 4.9185927053163366e-05, + "loss": 5.3354, + "step": 13697 + }, + { + "epoch": 0.0814658863831002, + "grad_norm": 1.767673373222351, + "learning_rate": 4.918580882087459e-05, + "loss": 5.0358, + "step": 13698 + }, + { + "epoch": 0.0814718336663812, + "grad_norm": 1.7151973247528076, + "learning_rate": 4.9185690580142805e-05, + "loss": 5.0371, + "step": 13699 + }, + { + "epoch": 0.0814777809496622, + "grad_norm": 1.710990071296692, + "learning_rate": 4.918557233096803e-05, + "loss": 4.9236, + "step": 13700 + }, + { + "epoch": 0.0814837282329432, + "grad_norm": 1.8118677139282227, + "learning_rate": 4.9185454073350335e-05, + "loss": 4.9112, + "step": 13701 + }, + { + "epoch": 0.08148967551622419, + "grad_norm": 2.0120832920074463, + "learning_rate": 4.918533580728974e-05, + "loss": 4.8201, + "step": 13702 + }, + { + "epoch": 0.08149562279950519, + "grad_norm": 1.742125153541565, + "learning_rate": 4.91852175327863e-05, + "loss": 5.0618, + "step": 13703 + }, + { + "epoch": 0.08150157008278619, + "grad_norm": 1.6496554613113403, + "learning_rate": 4.9185099249840054e-05, + "loss": 5.217, + "step": 13704 + }, + { + "epoch": 0.08150751736606718, + "grad_norm": 1.6782381534576416, + "learning_rate": 4.9184980958451034e-05, + "loss": 5.0362, + "step": 13705 + }, + { + "epoch": 0.08151346464934818, + "grad_norm": 1.8002519607543945, + "learning_rate": 4.918486265861929e-05, + "loss": 4.8812, + "step": 13706 + }, + { + "epoch": 0.08151941193262917, + "grad_norm": 1.5939546823501587, + "learning_rate": 4.918474435034486e-05, + "loss": 5.0571, + "step": 13707 + }, + { + "epoch": 0.08152535921591017, + "grad_norm": 1.6342964172363281, + "learning_rate": 4.918462603362778e-05, + "loss": 5.087, + "step": 13708 + }, + { + "epoch": 0.08153130649919117, + "grad_norm": 1.549822449684143, + "learning_rate": 4.91845077084681e-05, + "loss": 5.1654, + "step": 13709 + }, + { + "epoch": 0.08153725378247216, + "grad_norm": 1.5732479095458984, + "learning_rate": 4.9184389374865855e-05, + "loss": 4.9085, + "step": 13710 + }, + { + "epoch": 0.08154320106575316, + "grad_norm": 1.4182745218276978, + "learning_rate": 4.9184271032821094e-05, + "loss": 4.8846, + "step": 13711 + }, + { + "epoch": 0.08154914834903416, + "grad_norm": 1.3679918050765991, + "learning_rate": 4.918415268233385e-05, + "loss": 5.0263, + "step": 13712 + }, + { + "epoch": 0.08155509563231515, + "grad_norm": 1.4714219570159912, + "learning_rate": 4.918403432340418e-05, + "loss": 5.5169, + "step": 13713 + }, + { + "epoch": 0.08156104291559615, + "grad_norm": 1.8351292610168457, + "learning_rate": 4.91839159560321e-05, + "loss": 5.215, + "step": 13714 + }, + { + "epoch": 0.08156699019887716, + "grad_norm": 1.530781865119934, + "learning_rate": 4.918379758021767e-05, + "loss": 5.0882, + "step": 13715 + }, + { + "epoch": 0.08157293748215814, + "grad_norm": 1.799901008605957, + "learning_rate": 4.918367919596093e-05, + "loss": 5.2248, + "step": 13716 + }, + { + "epoch": 0.08157888476543915, + "grad_norm": 1.7563488483428955, + "learning_rate": 4.9183560803261915e-05, + "loss": 5.3192, + "step": 13717 + }, + { + "epoch": 0.08158483204872015, + "grad_norm": 1.7521497011184692, + "learning_rate": 4.918344240212066e-05, + "loss": 5.4841, + "step": 13718 + }, + { + "epoch": 0.08159077933200114, + "grad_norm": 1.7345610857009888, + "learning_rate": 4.918332399253722e-05, + "loss": 5.0716, + "step": 13719 + }, + { + "epoch": 0.08159672661528214, + "grad_norm": 1.4790915250778198, + "learning_rate": 4.918320557451164e-05, + "loss": 5.1833, + "step": 13720 + }, + { + "epoch": 0.08160267389856314, + "grad_norm": 1.4721198081970215, + "learning_rate": 4.918308714804395e-05, + "loss": 5.1355, + "step": 13721 + }, + { + "epoch": 0.08160862118184413, + "grad_norm": 1.4949108362197876, + "learning_rate": 4.918296871313419e-05, + "loss": 4.9666, + "step": 13722 + }, + { + "epoch": 0.08161456846512513, + "grad_norm": 1.3814501762390137, + "learning_rate": 4.91828502697824e-05, + "loss": 5.0575, + "step": 13723 + }, + { + "epoch": 0.08162051574840613, + "grad_norm": 1.4503964185714722, + "learning_rate": 4.918273181798864e-05, + "loss": 5.4112, + "step": 13724 + }, + { + "epoch": 0.08162646303168712, + "grad_norm": 1.5512415170669556, + "learning_rate": 4.9182613357752925e-05, + "loss": 5.1501, + "step": 13725 + }, + { + "epoch": 0.08163241031496812, + "grad_norm": 1.7429851293563843, + "learning_rate": 4.9182494889075315e-05, + "loss": 5.2736, + "step": 13726 + }, + { + "epoch": 0.08163835759824913, + "grad_norm": 1.325498104095459, + "learning_rate": 4.918237641195584e-05, + "loss": 5.3702, + "step": 13727 + }, + { + "epoch": 0.08164430488153011, + "grad_norm": 1.2677874565124512, + "learning_rate": 4.918225792639456e-05, + "loss": 5.2681, + "step": 13728 + }, + { + "epoch": 0.08165025216481112, + "grad_norm": 1.4957364797592163, + "learning_rate": 4.918213943239149e-05, + "loss": 5.4956, + "step": 13729 + }, + { + "epoch": 0.08165619944809212, + "grad_norm": 1.3380833864212036, + "learning_rate": 4.91820209299467e-05, + "loss": 5.3286, + "step": 13730 + }, + { + "epoch": 0.0816621467313731, + "grad_norm": 1.6803557872772217, + "learning_rate": 4.918190241906021e-05, + "loss": 5.3119, + "step": 13731 + }, + { + "epoch": 0.08166809401465411, + "grad_norm": 1.7933920621871948, + "learning_rate": 4.918178389973206e-05, + "loss": 5.139, + "step": 13732 + }, + { + "epoch": 0.08167404129793511, + "grad_norm": 1.5846813917160034, + "learning_rate": 4.91816653719623e-05, + "loss": 5.4431, + "step": 13733 + }, + { + "epoch": 0.0816799885812161, + "grad_norm": 1.9218448400497437, + "learning_rate": 4.918154683575098e-05, + "loss": 5.3245, + "step": 13734 + }, + { + "epoch": 0.0816859358644971, + "grad_norm": 1.4883100986480713, + "learning_rate": 4.918142829109813e-05, + "loss": 5.3007, + "step": 13735 + }, + { + "epoch": 0.08169188314777809, + "grad_norm": 1.4396723508834839, + "learning_rate": 4.918130973800379e-05, + "loss": 5.1956, + "step": 13736 + }, + { + "epoch": 0.08169783043105909, + "grad_norm": 1.4395633935928345, + "learning_rate": 4.918119117646801e-05, + "loss": 5.1637, + "step": 13737 + }, + { + "epoch": 0.08170377771434009, + "grad_norm": 1.540003776550293, + "learning_rate": 4.9181072606490816e-05, + "loss": 5.2278, + "step": 13738 + }, + { + "epoch": 0.08170972499762108, + "grad_norm": 1.446815848350525, + "learning_rate": 4.918095402807227e-05, + "loss": 5.1627, + "step": 13739 + }, + { + "epoch": 0.08171567228090208, + "grad_norm": 1.4501028060913086, + "learning_rate": 4.918083544121239e-05, + "loss": 5.0747, + "step": 13740 + }, + { + "epoch": 0.08172161956418308, + "grad_norm": 1.217608094215393, + "learning_rate": 4.9180716845911244e-05, + "loss": 5.0668, + "step": 13741 + }, + { + "epoch": 0.08172756684746407, + "grad_norm": 1.6321865320205688, + "learning_rate": 4.918059824216885e-05, + "loss": 5.2785, + "step": 13742 + }, + { + "epoch": 0.08173351413074507, + "grad_norm": 1.5838396549224854, + "learning_rate": 4.9180479629985265e-05, + "loss": 5.1675, + "step": 13743 + }, + { + "epoch": 0.08173946141402608, + "grad_norm": 1.7023003101348877, + "learning_rate": 4.918036100936052e-05, + "loss": 5.1664, + "step": 13744 + }, + { + "epoch": 0.08174540869730706, + "grad_norm": 1.767067790031433, + "learning_rate": 4.918024238029466e-05, + "loss": 5.0157, + "step": 13745 + }, + { + "epoch": 0.08175135598058807, + "grad_norm": 1.6058627367019653, + "learning_rate": 4.918012374278773e-05, + "loss": 5.1772, + "step": 13746 + }, + { + "epoch": 0.08175730326386907, + "grad_norm": 1.7853416204452515, + "learning_rate": 4.9180005096839766e-05, + "loss": 5.2678, + "step": 13747 + }, + { + "epoch": 0.08176325054715006, + "grad_norm": 1.4799201488494873, + "learning_rate": 4.917988644245082e-05, + "loss": 5.3153, + "step": 13748 + }, + { + "epoch": 0.08176919783043106, + "grad_norm": 1.4581291675567627, + "learning_rate": 4.917976777962092e-05, + "loss": 5.2755, + "step": 13749 + }, + { + "epoch": 0.08177514511371206, + "grad_norm": 1.7151737213134766, + "learning_rate": 4.917964910835011e-05, + "loss": 5.1761, + "step": 13750 + }, + { + "epoch": 0.08178109239699305, + "grad_norm": 1.5101522207260132, + "learning_rate": 4.917953042863843e-05, + "loss": 5.0003, + "step": 13751 + }, + { + "epoch": 0.08178703968027405, + "grad_norm": 1.4508110284805298, + "learning_rate": 4.9179411740485935e-05, + "loss": 5.1158, + "step": 13752 + }, + { + "epoch": 0.08179298696355505, + "grad_norm": 1.5012980699539185, + "learning_rate": 4.917929304389266e-05, + "loss": 5.2762, + "step": 13753 + }, + { + "epoch": 0.08179893424683604, + "grad_norm": 1.5914186239242554, + "learning_rate": 4.9179174338858635e-05, + "loss": 5.1422, + "step": 13754 + }, + { + "epoch": 0.08180488153011704, + "grad_norm": 1.5001139640808105, + "learning_rate": 4.9179055625383915e-05, + "loss": 5.2158, + "step": 13755 + }, + { + "epoch": 0.08181082881339805, + "grad_norm": 1.382815957069397, + "learning_rate": 4.917893690346853e-05, + "loss": 5.2562, + "step": 13756 + }, + { + "epoch": 0.08181677609667903, + "grad_norm": 1.3576865196228027, + "learning_rate": 4.9178818173112535e-05, + "loss": 5.221, + "step": 13757 + }, + { + "epoch": 0.08182272337996004, + "grad_norm": 1.5542206764221191, + "learning_rate": 4.917869943431596e-05, + "loss": 5.071, + "step": 13758 + }, + { + "epoch": 0.08182867066324104, + "grad_norm": 1.6010403633117676, + "learning_rate": 4.9178580687078855e-05, + "loss": 5.2052, + "step": 13759 + }, + { + "epoch": 0.08183461794652203, + "grad_norm": 1.3808842897415161, + "learning_rate": 4.9178461931401254e-05, + "loss": 5.3007, + "step": 13760 + }, + { + "epoch": 0.08184056522980303, + "grad_norm": 1.3584518432617188, + "learning_rate": 4.91783431672832e-05, + "loss": 5.3137, + "step": 13761 + }, + { + "epoch": 0.08184651251308403, + "grad_norm": 1.4467449188232422, + "learning_rate": 4.917822439472474e-05, + "loss": 5.2208, + "step": 13762 + }, + { + "epoch": 0.08185245979636502, + "grad_norm": 1.298618197441101, + "learning_rate": 4.917810561372591e-05, + "loss": 5.2161, + "step": 13763 + }, + { + "epoch": 0.08185840707964602, + "grad_norm": 2.5304789543151855, + "learning_rate": 4.9177986824286756e-05, + "loss": 4.6644, + "step": 13764 + }, + { + "epoch": 0.08186435436292701, + "grad_norm": 1.607969880104065, + "learning_rate": 4.917786802640732e-05, + "loss": 5.2116, + "step": 13765 + }, + { + "epoch": 0.08187030164620801, + "grad_norm": 1.401207685470581, + "learning_rate": 4.917774922008763e-05, + "loss": 5.2847, + "step": 13766 + }, + { + "epoch": 0.08187624892948901, + "grad_norm": 1.1652514934539795, + "learning_rate": 4.9177630405327746e-05, + "loss": 5.2939, + "step": 13767 + }, + { + "epoch": 0.08188219621277, + "grad_norm": 1.2998749017715454, + "learning_rate": 4.9177511582127694e-05, + "loss": 5.251, + "step": 13768 + }, + { + "epoch": 0.081888143496051, + "grad_norm": 1.33558988571167, + "learning_rate": 4.917739275048753e-05, + "loss": 5.2749, + "step": 13769 + }, + { + "epoch": 0.081894090779332, + "grad_norm": 1.1457966566085815, + "learning_rate": 4.917727391040728e-05, + "loss": 5.3153, + "step": 13770 + }, + { + "epoch": 0.08190003806261299, + "grad_norm": 1.493249773979187, + "learning_rate": 4.917715506188699e-05, + "loss": 5.3702, + "step": 13771 + }, + { + "epoch": 0.081905985345894, + "grad_norm": 1.2591760158538818, + "learning_rate": 4.917703620492672e-05, + "loss": 5.2019, + "step": 13772 + }, + { + "epoch": 0.081911932629175, + "grad_norm": 1.2480885982513428, + "learning_rate": 4.917691733952648e-05, + "loss": 5.1904, + "step": 13773 + }, + { + "epoch": 0.08191787991245598, + "grad_norm": 1.3278160095214844, + "learning_rate": 4.917679846568634e-05, + "loss": 5.0424, + "step": 13774 + }, + { + "epoch": 0.08192382719573699, + "grad_norm": 1.2930511236190796, + "learning_rate": 4.9176679583406325e-05, + "loss": 5.2437, + "step": 13775 + }, + { + "epoch": 0.08192977447901799, + "grad_norm": 1.39852774143219, + "learning_rate": 4.9176560692686485e-05, + "loss": 5.3683, + "step": 13776 + }, + { + "epoch": 0.08193572176229898, + "grad_norm": 1.3392889499664307, + "learning_rate": 4.917644179352685e-05, + "loss": 5.1894, + "step": 13777 + }, + { + "epoch": 0.08194166904557998, + "grad_norm": 1.318595051765442, + "learning_rate": 4.917632288592747e-05, + "loss": 5.382, + "step": 13778 + }, + { + "epoch": 0.08194761632886098, + "grad_norm": 1.0992580652236938, + "learning_rate": 4.9176203969888395e-05, + "loss": 5.1979, + "step": 13779 + }, + { + "epoch": 0.08195356361214197, + "grad_norm": 1.2092480659484863, + "learning_rate": 4.917608504540965e-05, + "loss": 5.2253, + "step": 13780 + }, + { + "epoch": 0.08195951089542297, + "grad_norm": 1.2495516538619995, + "learning_rate": 4.9175966112491286e-05, + "loss": 5.1951, + "step": 13781 + }, + { + "epoch": 0.08196545817870397, + "grad_norm": 1.642177700996399, + "learning_rate": 4.917584717113334e-05, + "loss": 4.9648, + "step": 13782 + }, + { + "epoch": 0.08197140546198496, + "grad_norm": 1.4849772453308105, + "learning_rate": 4.9175728221335856e-05, + "loss": 4.8231, + "step": 13783 + }, + { + "epoch": 0.08197735274526596, + "grad_norm": 1.1743687391281128, + "learning_rate": 4.917560926309888e-05, + "loss": 4.7685, + "step": 13784 + }, + { + "epoch": 0.08198330002854697, + "grad_norm": 1.2688218355178833, + "learning_rate": 4.9175490296422436e-05, + "loss": 5.3023, + "step": 13785 + }, + { + "epoch": 0.08198924731182795, + "grad_norm": 1.2325210571289062, + "learning_rate": 4.9175371321306584e-05, + "loss": 4.8373, + "step": 13786 + }, + { + "epoch": 0.08199519459510896, + "grad_norm": 1.5414066314697266, + "learning_rate": 4.9175252337751364e-05, + "loss": 5.005, + "step": 13787 + }, + { + "epoch": 0.08200114187838996, + "grad_norm": 2.1581833362579346, + "learning_rate": 4.917513334575681e-05, + "loss": 5.5065, + "step": 13788 + }, + { + "epoch": 0.08200708916167095, + "grad_norm": 2.0199508666992188, + "learning_rate": 4.917501434532297e-05, + "loss": 5.8826, + "step": 13789 + }, + { + "epoch": 0.08201303644495195, + "grad_norm": 1.727602481842041, + "learning_rate": 4.917489533644987e-05, + "loss": 5.6967, + "step": 13790 + }, + { + "epoch": 0.08201898372823295, + "grad_norm": 1.5649336576461792, + "learning_rate": 4.917477631913757e-05, + "loss": 5.783, + "step": 13791 + }, + { + "epoch": 0.08202493101151394, + "grad_norm": 1.7326582670211792, + "learning_rate": 4.9174657293386115e-05, + "loss": 5.6705, + "step": 13792 + }, + { + "epoch": 0.08203087829479494, + "grad_norm": 1.8611500263214111, + "learning_rate": 4.917453825919553e-05, + "loss": 5.4881, + "step": 13793 + }, + { + "epoch": 0.08203682557807593, + "grad_norm": 1.9762206077575684, + "learning_rate": 4.917441921656586e-05, + "loss": 5.4826, + "step": 13794 + }, + { + "epoch": 0.08204277286135693, + "grad_norm": 1.6816489696502686, + "learning_rate": 4.9174300165497154e-05, + "loss": 5.466, + "step": 13795 + }, + { + "epoch": 0.08204872014463793, + "grad_norm": 1.8922536373138428, + "learning_rate": 4.9174181105989445e-05, + "loss": 5.3603, + "step": 13796 + }, + { + "epoch": 0.08205466742791892, + "grad_norm": 2.094996213912964, + "learning_rate": 4.917406203804279e-05, + "loss": 5.8687, + "step": 13797 + }, + { + "epoch": 0.08206061471119992, + "grad_norm": 1.8656450510025024, + "learning_rate": 4.9173942961657215e-05, + "loss": 6.2551, + "step": 13798 + }, + { + "epoch": 0.08206656199448092, + "grad_norm": 1.871787428855896, + "learning_rate": 4.917382387683276e-05, + "loss": 5.6612, + "step": 13799 + }, + { + "epoch": 0.08207250927776191, + "grad_norm": 1.8721636533737183, + "learning_rate": 4.9173704783569475e-05, + "loss": 5.8918, + "step": 13800 + }, + { + "epoch": 0.08207845656104291, + "grad_norm": 2.0554919242858887, + "learning_rate": 4.917358568186741e-05, + "loss": 5.6398, + "step": 13801 + }, + { + "epoch": 0.08208440384432392, + "grad_norm": 1.9311691522598267, + "learning_rate": 4.917346657172658e-05, + "loss": 5.6507, + "step": 13802 + }, + { + "epoch": 0.0820903511276049, + "grad_norm": 1.7426981925964355, + "learning_rate": 4.917334745314705e-05, + "loss": 5.3193, + "step": 13803 + }, + { + "epoch": 0.0820962984108859, + "grad_norm": 1.783890724182129, + "learning_rate": 4.9173228326128856e-05, + "loss": 5.1274, + "step": 13804 + }, + { + "epoch": 0.08210224569416691, + "grad_norm": 1.8739385604858398, + "learning_rate": 4.917310919067203e-05, + "loss": 5.378, + "step": 13805 + }, + { + "epoch": 0.0821081929774479, + "grad_norm": 1.6748543977737427, + "learning_rate": 4.917299004677663e-05, + "loss": 5.4772, + "step": 13806 + }, + { + "epoch": 0.0821141402607289, + "grad_norm": 1.498864769935608, + "learning_rate": 4.917287089444269e-05, + "loss": 5.4485, + "step": 13807 + }, + { + "epoch": 0.0821200875440099, + "grad_norm": 1.6129908561706543, + "learning_rate": 4.917275173367024e-05, + "loss": 5.5245, + "step": 13808 + }, + { + "epoch": 0.08212603482729089, + "grad_norm": 1.4655383825302124, + "learning_rate": 4.917263256445934e-05, + "loss": 5.5513, + "step": 13809 + }, + { + "epoch": 0.08213198211057189, + "grad_norm": 1.765244483947754, + "learning_rate": 4.917251338681003e-05, + "loss": 5.5322, + "step": 13810 + }, + { + "epoch": 0.0821379293938529, + "grad_norm": 2.002889633178711, + "learning_rate": 4.917239420072233e-05, + "loss": 5.1273, + "step": 13811 + }, + { + "epoch": 0.08214387667713388, + "grad_norm": 2.4380993843078613, + "learning_rate": 4.917227500619631e-05, + "loss": 4.8983, + "step": 13812 + }, + { + "epoch": 0.08214982396041488, + "grad_norm": 2.0864169597625732, + "learning_rate": 4.917215580323199e-05, + "loss": 5.077, + "step": 13813 + }, + { + "epoch": 0.08215577124369589, + "grad_norm": 2.2942094802856445, + "learning_rate": 4.917203659182942e-05, + "loss": 5.4359, + "step": 13814 + }, + { + "epoch": 0.08216171852697687, + "grad_norm": 2.067659616470337, + "learning_rate": 4.917191737198865e-05, + "loss": 5.7409, + "step": 13815 + }, + { + "epoch": 0.08216766581025788, + "grad_norm": 2.010085344314575, + "learning_rate": 4.917179814370971e-05, + "loss": 5.2279, + "step": 13816 + }, + { + "epoch": 0.08217361309353888, + "grad_norm": 1.8540743589401245, + "learning_rate": 4.917167890699264e-05, + "loss": 5.6146, + "step": 13817 + }, + { + "epoch": 0.08217956037681987, + "grad_norm": 1.9126391410827637, + "learning_rate": 4.917155966183749e-05, + "loss": 5.7007, + "step": 13818 + }, + { + "epoch": 0.08218550766010087, + "grad_norm": 1.6382626295089722, + "learning_rate": 4.91714404082443e-05, + "loss": 5.3641, + "step": 13819 + }, + { + "epoch": 0.08219145494338187, + "grad_norm": 1.8019288778305054, + "learning_rate": 4.9171321146213105e-05, + "loss": 5.1853, + "step": 13820 + }, + { + "epoch": 0.08219740222666286, + "grad_norm": 1.681685447692871, + "learning_rate": 4.917120187574395e-05, + "loss": 5.4141, + "step": 13821 + }, + { + "epoch": 0.08220334950994386, + "grad_norm": 1.9356689453125, + "learning_rate": 4.9171082596836896e-05, + "loss": 5.5379, + "step": 13822 + }, + { + "epoch": 0.08220929679322485, + "grad_norm": 1.9538071155548096, + "learning_rate": 4.917096330949195e-05, + "loss": 5.5723, + "step": 13823 + }, + { + "epoch": 0.08221524407650585, + "grad_norm": 1.7350852489471436, + "learning_rate": 4.9170844013709175e-05, + "loss": 5.5622, + "step": 13824 + }, + { + "epoch": 0.08222119135978685, + "grad_norm": 1.790276050567627, + "learning_rate": 4.9170724709488606e-05, + "loss": 5.5194, + "step": 13825 + }, + { + "epoch": 0.08222713864306784, + "grad_norm": 2.2997219562530518, + "learning_rate": 4.917060539683028e-05, + "loss": 5.0646, + "step": 13826 + }, + { + "epoch": 0.08223308592634884, + "grad_norm": 1.729131817817688, + "learning_rate": 4.9170486075734254e-05, + "loss": 5.5588, + "step": 13827 + }, + { + "epoch": 0.08223903320962984, + "grad_norm": 1.8754487037658691, + "learning_rate": 4.9170366746200566e-05, + "loss": 5.5435, + "step": 13828 + }, + { + "epoch": 0.08224498049291083, + "grad_norm": 1.8330692052841187, + "learning_rate": 4.9170247408229244e-05, + "loss": 5.598, + "step": 13829 + }, + { + "epoch": 0.08225092777619183, + "grad_norm": 1.8318592309951782, + "learning_rate": 4.917012806182034e-05, + "loss": 5.5165, + "step": 13830 + }, + { + "epoch": 0.08225687505947284, + "grad_norm": 1.6818424463272095, + "learning_rate": 4.9170008706973895e-05, + "loss": 5.3377, + "step": 13831 + }, + { + "epoch": 0.08226282234275382, + "grad_norm": 1.7040458917617798, + "learning_rate": 4.916988934368995e-05, + "loss": 5.4644, + "step": 13832 + }, + { + "epoch": 0.08226876962603483, + "grad_norm": 1.8902777433395386, + "learning_rate": 4.916976997196855e-05, + "loss": 5.4526, + "step": 13833 + }, + { + "epoch": 0.08227471690931583, + "grad_norm": 1.7484904527664185, + "learning_rate": 4.9169650591809724e-05, + "loss": 5.3, + "step": 13834 + }, + { + "epoch": 0.08228066419259682, + "grad_norm": 1.726083517074585, + "learning_rate": 4.916953120321353e-05, + "loss": 5.4451, + "step": 13835 + }, + { + "epoch": 0.08228661147587782, + "grad_norm": 1.791942834854126, + "learning_rate": 4.916941180618e-05, + "loss": 5.444, + "step": 13836 + }, + { + "epoch": 0.08229255875915882, + "grad_norm": 1.9032018184661865, + "learning_rate": 4.916929240070918e-05, + "loss": 5.4411, + "step": 13837 + }, + { + "epoch": 0.08229850604243981, + "grad_norm": 1.6170588731765747, + "learning_rate": 4.91691729868011e-05, + "loss": 5.4293, + "step": 13838 + }, + { + "epoch": 0.08230445332572081, + "grad_norm": 1.3972853422164917, + "learning_rate": 4.9169053564455825e-05, + "loss": 5.2889, + "step": 13839 + }, + { + "epoch": 0.08231040060900181, + "grad_norm": 1.782913088798523, + "learning_rate": 4.916893413367338e-05, + "loss": 5.4092, + "step": 13840 + }, + { + "epoch": 0.0823163478922828, + "grad_norm": 1.83617103099823, + "learning_rate": 4.9168814694453807e-05, + "loss": 5.3997, + "step": 13841 + }, + { + "epoch": 0.0823222951755638, + "grad_norm": 1.92609703540802, + "learning_rate": 4.9168695246797146e-05, + "loss": 5.3469, + "step": 13842 + }, + { + "epoch": 0.0823282424588448, + "grad_norm": 2.20027756690979, + "learning_rate": 4.9168575790703454e-05, + "loss": 5.5999, + "step": 13843 + }, + { + "epoch": 0.0823341897421258, + "grad_norm": 3.096323251724243, + "learning_rate": 4.916845632617275e-05, + "loss": 5.3997, + "step": 13844 + }, + { + "epoch": 0.0823401370254068, + "grad_norm": 2.433900833129883, + "learning_rate": 4.91683368532051e-05, + "loss": 5.4937, + "step": 13845 + }, + { + "epoch": 0.0823460843086878, + "grad_norm": 2.371389389038086, + "learning_rate": 4.9168217371800526e-05, + "loss": 5.966, + "step": 13846 + }, + { + "epoch": 0.08235203159196879, + "grad_norm": 1.5628182888031006, + "learning_rate": 4.9168097881959076e-05, + "loss": 5.5971, + "step": 13847 + }, + { + "epoch": 0.08235797887524979, + "grad_norm": 2.733569622039795, + "learning_rate": 4.91679783836808e-05, + "loss": 5.2696, + "step": 13848 + }, + { + "epoch": 0.08236392615853079, + "grad_norm": 2.117197275161743, + "learning_rate": 4.916785887696572e-05, + "loss": 5.3729, + "step": 13849 + }, + { + "epoch": 0.08236987344181178, + "grad_norm": 2.040476083755493, + "learning_rate": 4.9167739361813905e-05, + "loss": 5.6568, + "step": 13850 + }, + { + "epoch": 0.08237582072509278, + "grad_norm": 2.127465009689331, + "learning_rate": 4.916761983822536e-05, + "loss": 5.9168, + "step": 13851 + }, + { + "epoch": 0.08238176800837377, + "grad_norm": 2.00907301902771, + "learning_rate": 4.916750030620017e-05, + "loss": 5.9104, + "step": 13852 + }, + { + "epoch": 0.08238771529165477, + "grad_norm": 1.721428394317627, + "learning_rate": 4.916738076573835e-05, + "loss": 5.8126, + "step": 13853 + }, + { + "epoch": 0.08239366257493577, + "grad_norm": 1.5760809183120728, + "learning_rate": 4.9167261216839946e-05, + "loss": 6.0134, + "step": 13854 + }, + { + "epoch": 0.08239960985821676, + "grad_norm": 1.648639440536499, + "learning_rate": 4.9167141659505e-05, + "loss": 5.3878, + "step": 13855 + }, + { + "epoch": 0.08240555714149776, + "grad_norm": 1.4113967418670654, + "learning_rate": 4.916702209373355e-05, + "loss": 5.8159, + "step": 13856 + }, + { + "epoch": 0.08241150442477876, + "grad_norm": 1.725477933883667, + "learning_rate": 4.916690251952565e-05, + "loss": 5.7185, + "step": 13857 + }, + { + "epoch": 0.08241745170805975, + "grad_norm": 1.8538665771484375, + "learning_rate": 4.9166782936881326e-05, + "loss": 5.1804, + "step": 13858 + }, + { + "epoch": 0.08242339899134075, + "grad_norm": 1.5203232765197754, + "learning_rate": 4.9166663345800635e-05, + "loss": 5.1486, + "step": 13859 + }, + { + "epoch": 0.08242934627462176, + "grad_norm": 1.8738161325454712, + "learning_rate": 4.916654374628361e-05, + "loss": 5.0062, + "step": 13860 + }, + { + "epoch": 0.08243529355790274, + "grad_norm": 1.689563512802124, + "learning_rate": 4.916642413833029e-05, + "loss": 4.9508, + "step": 13861 + }, + { + "epoch": 0.08244124084118375, + "grad_norm": 1.8749178647994995, + "learning_rate": 4.916630452194073e-05, + "loss": 5.4645, + "step": 13862 + }, + { + "epoch": 0.08244718812446475, + "grad_norm": 2.779536247253418, + "learning_rate": 4.9166184897114956e-05, + "loss": 5.9364, + "step": 13863 + }, + { + "epoch": 0.08245313540774574, + "grad_norm": 2.41239333152771, + "learning_rate": 4.9166065263853014e-05, + "loss": 5.9045, + "step": 13864 + }, + { + "epoch": 0.08245908269102674, + "grad_norm": 1.624475359916687, + "learning_rate": 4.916594562215495e-05, + "loss": 5.4222, + "step": 13865 + }, + { + "epoch": 0.08246502997430774, + "grad_norm": 1.6841174364089966, + "learning_rate": 4.916582597202081e-05, + "loss": 5.3455, + "step": 13866 + }, + { + "epoch": 0.08247097725758873, + "grad_norm": 1.6790028810501099, + "learning_rate": 4.916570631345062e-05, + "loss": 5.5397, + "step": 13867 + }, + { + "epoch": 0.08247692454086973, + "grad_norm": 1.87303626537323, + "learning_rate": 4.9165586646444436e-05, + "loss": 5.6022, + "step": 13868 + }, + { + "epoch": 0.08248287182415073, + "grad_norm": 1.7747167348861694, + "learning_rate": 4.91654669710023e-05, + "loss": 5.4631, + "step": 13869 + }, + { + "epoch": 0.08248881910743172, + "grad_norm": 1.694941759109497, + "learning_rate": 4.9165347287124244e-05, + "loss": 5.5634, + "step": 13870 + }, + { + "epoch": 0.08249476639071272, + "grad_norm": 1.8258243799209595, + "learning_rate": 4.9165227594810316e-05, + "loss": 5.526, + "step": 13871 + }, + { + "epoch": 0.08250071367399373, + "grad_norm": 1.708798885345459, + "learning_rate": 4.9165107894060556e-05, + "loss": 5.5127, + "step": 13872 + }, + { + "epoch": 0.08250666095727471, + "grad_norm": 1.7820818424224854, + "learning_rate": 4.916498818487501e-05, + "loss": 5.4169, + "step": 13873 + }, + { + "epoch": 0.08251260824055572, + "grad_norm": 2.38067626953125, + "learning_rate": 4.916486846725372e-05, + "loss": 5.8063, + "step": 13874 + }, + { + "epoch": 0.08251855552383672, + "grad_norm": 1.8507468700408936, + "learning_rate": 4.916474874119671e-05, + "loss": 5.4871, + "step": 13875 + }, + { + "epoch": 0.0825245028071177, + "grad_norm": 1.8866678476333618, + "learning_rate": 4.916462900670404e-05, + "loss": 5.5452, + "step": 13876 + }, + { + "epoch": 0.08253045009039871, + "grad_norm": 1.853668212890625, + "learning_rate": 4.916450926377576e-05, + "loss": 5.8262, + "step": 13877 + }, + { + "epoch": 0.08253639737367971, + "grad_norm": 1.7404545545578003, + "learning_rate": 4.916438951241189e-05, + "loss": 5.5978, + "step": 13878 + }, + { + "epoch": 0.0825423446569607, + "grad_norm": 1.844139814376831, + "learning_rate": 4.916426975261248e-05, + "loss": 5.765, + "step": 13879 + }, + { + "epoch": 0.0825482919402417, + "grad_norm": 1.9454487562179565, + "learning_rate": 4.916414998437758e-05, + "loss": 5.5458, + "step": 13880 + }, + { + "epoch": 0.08255423922352269, + "grad_norm": 1.317144751548767, + "learning_rate": 4.916403020770722e-05, + "loss": 5.7694, + "step": 13881 + }, + { + "epoch": 0.08256018650680369, + "grad_norm": 1.718024730682373, + "learning_rate": 4.916391042260145e-05, + "loss": 5.7369, + "step": 13882 + }, + { + "epoch": 0.08256613379008469, + "grad_norm": 1.4623572826385498, + "learning_rate": 4.9163790629060305e-05, + "loss": 5.72, + "step": 13883 + }, + { + "epoch": 0.08257208107336568, + "grad_norm": 1.908839225769043, + "learning_rate": 4.916367082708383e-05, + "loss": 5.7175, + "step": 13884 + }, + { + "epoch": 0.08257802835664668, + "grad_norm": 1.7910356521606445, + "learning_rate": 4.916355101667206e-05, + "loss": 5.4446, + "step": 13885 + }, + { + "epoch": 0.08258397563992768, + "grad_norm": 2.132512092590332, + "learning_rate": 4.9163431197825055e-05, + "loss": 5.2315, + "step": 13886 + }, + { + "epoch": 0.08258992292320867, + "grad_norm": 2.223329782485962, + "learning_rate": 4.9163311370542844e-05, + "loss": 5.2953, + "step": 13887 + }, + { + "epoch": 0.08259587020648967, + "grad_norm": 2.6441519260406494, + "learning_rate": 4.916319153482547e-05, + "loss": 5.2637, + "step": 13888 + }, + { + "epoch": 0.08260181748977068, + "grad_norm": 2.1528780460357666, + "learning_rate": 4.9163071690672973e-05, + "loss": 5.1602, + "step": 13889 + }, + { + "epoch": 0.08260776477305166, + "grad_norm": 2.6483633518218994, + "learning_rate": 4.91629518380854e-05, + "loss": 5.2487, + "step": 13890 + }, + { + "epoch": 0.08261371205633267, + "grad_norm": 2.276808738708496, + "learning_rate": 4.916283197706279e-05, + "loss": 5.064, + "step": 13891 + }, + { + "epoch": 0.08261965933961367, + "grad_norm": 1.8921101093292236, + "learning_rate": 4.9162712107605184e-05, + "loss": 5.3979, + "step": 13892 + }, + { + "epoch": 0.08262560662289466, + "grad_norm": 2.2009568214416504, + "learning_rate": 4.9162592229712625e-05, + "loss": 5.2434, + "step": 13893 + }, + { + "epoch": 0.08263155390617566, + "grad_norm": 2.199380874633789, + "learning_rate": 4.916247234338516e-05, + "loss": 4.7187, + "step": 13894 + }, + { + "epoch": 0.08263750118945666, + "grad_norm": 2.3620400428771973, + "learning_rate": 4.916235244862282e-05, + "loss": 4.7371, + "step": 13895 + }, + { + "epoch": 0.08264344847273765, + "grad_norm": 2.100086212158203, + "learning_rate": 4.9162232545425646e-05, + "loss": 4.5239, + "step": 13896 + }, + { + "epoch": 0.08264939575601865, + "grad_norm": 2.100106954574585, + "learning_rate": 4.91621126337937e-05, + "loss": 4.5555, + "step": 13897 + }, + { + "epoch": 0.08265534303929965, + "grad_norm": 2.005345344543457, + "learning_rate": 4.9161992713727e-05, + "loss": 4.397, + "step": 13898 + }, + { + "epoch": 0.08266129032258064, + "grad_norm": 1.9393454790115356, + "learning_rate": 4.91618727852256e-05, + "loss": 4.7327, + "step": 13899 + }, + { + "epoch": 0.08266723760586164, + "grad_norm": 2.0109846591949463, + "learning_rate": 4.916175284828955e-05, + "loss": 4.4987, + "step": 13900 + }, + { + "epoch": 0.08267318488914265, + "grad_norm": 2.0040533542633057, + "learning_rate": 4.916163290291886e-05, + "loss": 4.4703, + "step": 13901 + }, + { + "epoch": 0.08267913217242363, + "grad_norm": 2.014885902404785, + "learning_rate": 4.916151294911361e-05, + "loss": 4.374, + "step": 13902 + }, + { + "epoch": 0.08268507945570464, + "grad_norm": 1.9490050077438354, + "learning_rate": 4.916139298687382e-05, + "loss": 4.6281, + "step": 13903 + }, + { + "epoch": 0.08269102673898564, + "grad_norm": 2.0691943168640137, + "learning_rate": 4.916127301619954e-05, + "loss": 4.5008, + "step": 13904 + }, + { + "epoch": 0.08269697402226663, + "grad_norm": 2.1290805339813232, + "learning_rate": 4.916115303709081e-05, + "loss": 5.4876, + "step": 13905 + }, + { + "epoch": 0.08270292130554763, + "grad_norm": 1.981466293334961, + "learning_rate": 4.916103304954767e-05, + "loss": 5.7699, + "step": 13906 + }, + { + "epoch": 0.08270886858882863, + "grad_norm": 1.8898048400878906, + "learning_rate": 4.916091305357016e-05, + "loss": 5.7874, + "step": 13907 + }, + { + "epoch": 0.08271481587210962, + "grad_norm": 1.7809741497039795, + "learning_rate": 4.916079304915833e-05, + "loss": 5.6264, + "step": 13908 + }, + { + "epoch": 0.08272076315539062, + "grad_norm": 1.7516652345657349, + "learning_rate": 4.916067303631221e-05, + "loss": 5.5751, + "step": 13909 + }, + { + "epoch": 0.08272671043867161, + "grad_norm": 1.9051094055175781, + "learning_rate": 4.916055301503185e-05, + "loss": 5.7984, + "step": 13910 + }, + { + "epoch": 0.08273265772195261, + "grad_norm": 1.7115057706832886, + "learning_rate": 4.9160432985317295e-05, + "loss": 5.6187, + "step": 13911 + }, + { + "epoch": 0.08273860500523361, + "grad_norm": 1.790529727935791, + "learning_rate": 4.916031294716858e-05, + "loss": 5.6276, + "step": 13912 + }, + { + "epoch": 0.0827445522885146, + "grad_norm": 1.742039442062378, + "learning_rate": 4.9160192900585754e-05, + "loss": 5.3783, + "step": 13913 + }, + { + "epoch": 0.0827504995717956, + "grad_norm": 1.7544314861297607, + "learning_rate": 4.916007284556885e-05, + "loss": 5.5276, + "step": 13914 + }, + { + "epoch": 0.0827564468550766, + "grad_norm": 2.0135440826416016, + "learning_rate": 4.915995278211791e-05, + "loss": 5.5177, + "step": 13915 + }, + { + "epoch": 0.08276239413835759, + "grad_norm": 1.5759433507919312, + "learning_rate": 4.915983271023299e-05, + "loss": 5.4652, + "step": 13916 + }, + { + "epoch": 0.0827683414216386, + "grad_norm": 1.7974358797073364, + "learning_rate": 4.915971262991411e-05, + "loss": 5.4463, + "step": 13917 + }, + { + "epoch": 0.0827742887049196, + "grad_norm": 1.847692608833313, + "learning_rate": 4.9159592541161335e-05, + "loss": 5.4247, + "step": 13918 + }, + { + "epoch": 0.08278023598820058, + "grad_norm": 1.6701977252960205, + "learning_rate": 4.915947244397469e-05, + "loss": 5.3451, + "step": 13919 + }, + { + "epoch": 0.08278618327148159, + "grad_norm": 1.9226999282836914, + "learning_rate": 4.915935233835423e-05, + "loss": 5.1159, + "step": 13920 + }, + { + "epoch": 0.08279213055476259, + "grad_norm": 2.430760383605957, + "learning_rate": 4.915923222429998e-05, + "loss": 4.9746, + "step": 13921 + }, + { + "epoch": 0.08279807783804358, + "grad_norm": 1.7708054780960083, + "learning_rate": 4.915911210181199e-05, + "loss": 5.4986, + "step": 13922 + }, + { + "epoch": 0.08280402512132458, + "grad_norm": 1.7802354097366333, + "learning_rate": 4.915899197089031e-05, + "loss": 5.4283, + "step": 13923 + }, + { + "epoch": 0.08280997240460558, + "grad_norm": 2.347226142883301, + "learning_rate": 4.9158871831534984e-05, + "loss": 5.2917, + "step": 13924 + }, + { + "epoch": 0.08281591968788657, + "grad_norm": 2.5685782432556152, + "learning_rate": 4.915875168374603e-05, + "loss": 5.243, + "step": 13925 + }, + { + "epoch": 0.08282186697116757, + "grad_norm": 2.460383176803589, + "learning_rate": 4.915863152752351e-05, + "loss": 4.9241, + "step": 13926 + }, + { + "epoch": 0.08282781425444857, + "grad_norm": 2.2505056858062744, + "learning_rate": 4.915851136286747e-05, + "loss": 5.0951, + "step": 13927 + }, + { + "epoch": 0.08283376153772956, + "grad_norm": 2.517544984817505, + "learning_rate": 4.915839118977793e-05, + "loss": 5.151, + "step": 13928 + }, + { + "epoch": 0.08283970882101056, + "grad_norm": 2.445645809173584, + "learning_rate": 4.915827100825495e-05, + "loss": 5.1831, + "step": 13929 + }, + { + "epoch": 0.08284565610429157, + "grad_norm": 2.347383737564087, + "learning_rate": 4.9158150818298564e-05, + "loss": 5.0299, + "step": 13930 + }, + { + "epoch": 0.08285160338757255, + "grad_norm": 2.1791892051696777, + "learning_rate": 4.915803061990882e-05, + "loss": 5.4083, + "step": 13931 + }, + { + "epoch": 0.08285755067085356, + "grad_norm": 1.9959020614624023, + "learning_rate": 4.9157910413085764e-05, + "loss": 5.9036, + "step": 13932 + }, + { + "epoch": 0.08286349795413456, + "grad_norm": 2.3419620990753174, + "learning_rate": 4.915779019782942e-05, + "loss": 4.9082, + "step": 13933 + }, + { + "epoch": 0.08286944523741555, + "grad_norm": 2.452756643295288, + "learning_rate": 4.915766997413985e-05, + "loss": 4.8272, + "step": 13934 + }, + { + "epoch": 0.08287539252069655, + "grad_norm": 2.344353675842285, + "learning_rate": 4.915754974201708e-05, + "loss": 5.0269, + "step": 13935 + }, + { + "epoch": 0.08288133980397755, + "grad_norm": 2.366218090057373, + "learning_rate": 4.9157429501461175e-05, + "loss": 4.8898, + "step": 13936 + }, + { + "epoch": 0.08288728708725854, + "grad_norm": 1.7986581325531006, + "learning_rate": 4.915730925247214e-05, + "loss": 4.9316, + "step": 13937 + }, + { + "epoch": 0.08289323437053954, + "grad_norm": 2.059094190597534, + "learning_rate": 4.915718899505005e-05, + "loss": 5.1297, + "step": 13938 + }, + { + "epoch": 0.08289918165382054, + "grad_norm": 1.9630707502365112, + "learning_rate": 4.915706872919493e-05, + "loss": 5.4844, + "step": 13939 + }, + { + "epoch": 0.08290512893710153, + "grad_norm": 2.0281238555908203, + "learning_rate": 4.9156948454906825e-05, + "loss": 5.9276, + "step": 13940 + }, + { + "epoch": 0.08291107622038253, + "grad_norm": 1.8783270120620728, + "learning_rate": 4.9156828172185786e-05, + "loss": 5.7085, + "step": 13941 + }, + { + "epoch": 0.08291702350366352, + "grad_norm": 2.190317153930664, + "learning_rate": 4.915670788103184e-05, + "loss": 4.9619, + "step": 13942 + }, + { + "epoch": 0.08292297078694452, + "grad_norm": 2.2746498584747314, + "learning_rate": 4.915658758144505e-05, + "loss": 4.8965, + "step": 13943 + }, + { + "epoch": 0.08292891807022552, + "grad_norm": 1.940510630607605, + "learning_rate": 4.915646727342543e-05, + "loss": 5.0367, + "step": 13944 + }, + { + "epoch": 0.08293486535350651, + "grad_norm": 1.9016308784484863, + "learning_rate": 4.915634695697304e-05, + "loss": 5.5002, + "step": 13945 + }, + { + "epoch": 0.08294081263678751, + "grad_norm": 2.0041022300720215, + "learning_rate": 4.915622663208792e-05, + "loss": 5.4193, + "step": 13946 + }, + { + "epoch": 0.08294675992006852, + "grad_norm": 2.0117805004119873, + "learning_rate": 4.9156106298770115e-05, + "loss": 5.2697, + "step": 13947 + }, + { + "epoch": 0.0829527072033495, + "grad_norm": 1.864820957183838, + "learning_rate": 4.9155985957019654e-05, + "loss": 5.1594, + "step": 13948 + }, + { + "epoch": 0.0829586544866305, + "grad_norm": 1.7407771348953247, + "learning_rate": 4.91558656068366e-05, + "loss": 5.1189, + "step": 13949 + }, + { + "epoch": 0.08296460176991151, + "grad_norm": 2.027552366256714, + "learning_rate": 4.9155745248220976e-05, + "loss": 5.6257, + "step": 13950 + }, + { + "epoch": 0.0829705490531925, + "grad_norm": 1.6893701553344727, + "learning_rate": 4.9155624881172834e-05, + "loss": 5.1268, + "step": 13951 + }, + { + "epoch": 0.0829764963364735, + "grad_norm": 1.7216230630874634, + "learning_rate": 4.915550450569221e-05, + "loss": 5.2768, + "step": 13952 + }, + { + "epoch": 0.0829824436197545, + "grad_norm": 1.6723179817199707, + "learning_rate": 4.915538412177915e-05, + "loss": 5.7059, + "step": 13953 + }, + { + "epoch": 0.08298839090303549, + "grad_norm": 1.7645996809005737, + "learning_rate": 4.915526372943369e-05, + "loss": 5.6065, + "step": 13954 + }, + { + "epoch": 0.08299433818631649, + "grad_norm": 1.9206926822662354, + "learning_rate": 4.915514332865588e-05, + "loss": 4.9229, + "step": 13955 + }, + { + "epoch": 0.08300028546959749, + "grad_norm": 1.9269802570343018, + "learning_rate": 4.9155022919445766e-05, + "loss": 5.5678, + "step": 13956 + }, + { + "epoch": 0.08300623275287848, + "grad_norm": 2.378319501876831, + "learning_rate": 4.915490250180338e-05, + "loss": 4.7271, + "step": 13957 + }, + { + "epoch": 0.08301218003615948, + "grad_norm": 1.73631751537323, + "learning_rate": 4.915478207572876e-05, + "loss": 5.1302, + "step": 13958 + }, + { + "epoch": 0.08301812731944049, + "grad_norm": 1.6520816087722778, + "learning_rate": 4.915466164122196e-05, + "loss": 6.0497, + "step": 13959 + }, + { + "epoch": 0.08302407460272147, + "grad_norm": 1.7382736206054688, + "learning_rate": 4.915454119828302e-05, + "loss": 6.0155, + "step": 13960 + }, + { + "epoch": 0.08303002188600248, + "grad_norm": 1.6733272075653076, + "learning_rate": 4.915442074691197e-05, + "loss": 5.2624, + "step": 13961 + }, + { + "epoch": 0.08303596916928348, + "grad_norm": 2.0024397373199463, + "learning_rate": 4.915430028710887e-05, + "loss": 5.4794, + "step": 13962 + }, + { + "epoch": 0.08304191645256447, + "grad_norm": 1.9784339666366577, + "learning_rate": 4.915417981887375e-05, + "loss": 5.1546, + "step": 13963 + }, + { + "epoch": 0.08304786373584547, + "grad_norm": 1.7146525382995605, + "learning_rate": 4.915405934220666e-05, + "loss": 5.6269, + "step": 13964 + }, + { + "epoch": 0.08305381101912647, + "grad_norm": 1.7252057790756226, + "learning_rate": 4.9153938857107626e-05, + "loss": 5.7015, + "step": 13965 + }, + { + "epoch": 0.08305975830240746, + "grad_norm": 1.6623241901397705, + "learning_rate": 4.9153818363576715e-05, + "loss": 5.5249, + "step": 13966 + }, + { + "epoch": 0.08306570558568846, + "grad_norm": 2.0701472759246826, + "learning_rate": 4.9153697861613944e-05, + "loss": 5.3528, + "step": 13967 + }, + { + "epoch": 0.08307165286896946, + "grad_norm": 1.6600522994995117, + "learning_rate": 4.915357735121938e-05, + "loss": 5.3454, + "step": 13968 + }, + { + "epoch": 0.08307760015225045, + "grad_norm": 2.093092918395996, + "learning_rate": 4.915345683239304e-05, + "loss": 5.2417, + "step": 13969 + }, + { + "epoch": 0.08308354743553145, + "grad_norm": 1.9673899412155151, + "learning_rate": 4.915333630513498e-05, + "loss": 5.1908, + "step": 13970 + }, + { + "epoch": 0.08308949471881244, + "grad_norm": 1.8442246913909912, + "learning_rate": 4.915321576944524e-05, + "loss": 5.6287, + "step": 13971 + }, + { + "epoch": 0.08309544200209344, + "grad_norm": 1.5737566947937012, + "learning_rate": 4.9153095225323864e-05, + "loss": 5.7533, + "step": 13972 + }, + { + "epoch": 0.08310138928537444, + "grad_norm": 1.7948611974716187, + "learning_rate": 4.915297467277089e-05, + "loss": 5.5739, + "step": 13973 + }, + { + "epoch": 0.08310733656865543, + "grad_norm": 2.0080626010894775, + "learning_rate": 4.915285411178637e-05, + "loss": 5.5505, + "step": 13974 + }, + { + "epoch": 0.08311328385193643, + "grad_norm": 1.7838460206985474, + "learning_rate": 4.915273354237033e-05, + "loss": 6.0133, + "step": 13975 + }, + { + "epoch": 0.08311923113521744, + "grad_norm": 1.7599917650222778, + "learning_rate": 4.915261296452282e-05, + "loss": 5.6552, + "step": 13976 + }, + { + "epoch": 0.08312517841849842, + "grad_norm": 1.6211295127868652, + "learning_rate": 4.915249237824388e-05, + "loss": 5.6797, + "step": 13977 + }, + { + "epoch": 0.08313112570177943, + "grad_norm": 1.7404415607452393, + "learning_rate": 4.9152371783533565e-05, + "loss": 5.5134, + "step": 13978 + }, + { + "epoch": 0.08313707298506043, + "grad_norm": 1.8577871322631836, + "learning_rate": 4.9152251180391895e-05, + "loss": 5.5823, + "step": 13979 + }, + { + "epoch": 0.08314302026834142, + "grad_norm": 1.6060470342636108, + "learning_rate": 4.915213056881893e-05, + "loss": 5.5875, + "step": 13980 + }, + { + "epoch": 0.08314896755162242, + "grad_norm": 1.915451169013977, + "learning_rate": 4.91520099488147e-05, + "loss": 5.279, + "step": 13981 + }, + { + "epoch": 0.08315491483490342, + "grad_norm": 2.281404972076416, + "learning_rate": 4.9151889320379265e-05, + "loss": 5.0863, + "step": 13982 + }, + { + "epoch": 0.08316086211818441, + "grad_norm": 1.9069279432296753, + "learning_rate": 4.9151768683512646e-05, + "loss": 5.3055, + "step": 13983 + }, + { + "epoch": 0.08316680940146541, + "grad_norm": 1.810571312904358, + "learning_rate": 4.915164803821489e-05, + "loss": 5.4988, + "step": 13984 + }, + { + "epoch": 0.08317275668474641, + "grad_norm": 1.788197636604309, + "learning_rate": 4.915152738448605e-05, + "loss": 5.6627, + "step": 13985 + }, + { + "epoch": 0.0831787039680274, + "grad_norm": 2.294187545776367, + "learning_rate": 4.9151406722326165e-05, + "loss": 5.1977, + "step": 13986 + }, + { + "epoch": 0.0831846512513084, + "grad_norm": 2.584395170211792, + "learning_rate": 4.915128605173527e-05, + "loss": 5.1909, + "step": 13987 + }, + { + "epoch": 0.0831905985345894, + "grad_norm": 2.249406576156616, + "learning_rate": 4.9151165372713405e-05, + "loss": 5.1109, + "step": 13988 + }, + { + "epoch": 0.0831965458178704, + "grad_norm": 1.8678929805755615, + "learning_rate": 4.915104468526062e-05, + "loss": 5.1035, + "step": 13989 + }, + { + "epoch": 0.0832024931011514, + "grad_norm": 2.139711856842041, + "learning_rate": 4.915092398937696e-05, + "loss": 5.0151, + "step": 13990 + }, + { + "epoch": 0.0832084403844324, + "grad_norm": 2.1683461666107178, + "learning_rate": 4.915080328506246e-05, + "loss": 5.1097, + "step": 13991 + }, + { + "epoch": 0.08321438766771339, + "grad_norm": 2.1205332279205322, + "learning_rate": 4.9150682572317165e-05, + "loss": 4.9998, + "step": 13992 + }, + { + "epoch": 0.08322033495099439, + "grad_norm": 1.8642542362213135, + "learning_rate": 4.915056185114111e-05, + "loss": 5.8554, + "step": 13993 + }, + { + "epoch": 0.08322628223427539, + "grad_norm": 2.1150970458984375, + "learning_rate": 4.915044112153435e-05, + "loss": 5.5297, + "step": 13994 + }, + { + "epoch": 0.08323222951755638, + "grad_norm": 2.584157943725586, + "learning_rate": 4.9150320383496915e-05, + "loss": 5.0058, + "step": 13995 + }, + { + "epoch": 0.08323817680083738, + "grad_norm": 2.305853843688965, + "learning_rate": 4.9150199637028854e-05, + "loss": 5.0785, + "step": 13996 + }, + { + "epoch": 0.08324412408411838, + "grad_norm": 2.0386359691619873, + "learning_rate": 4.9150078882130214e-05, + "loss": 5.1104, + "step": 13997 + }, + { + "epoch": 0.08325007136739937, + "grad_norm": 1.6055399179458618, + "learning_rate": 4.914995811880102e-05, + "loss": 5.778, + "step": 13998 + }, + { + "epoch": 0.08325601865068037, + "grad_norm": 1.635704517364502, + "learning_rate": 4.9149837347041334e-05, + "loss": 6.1107, + "step": 13999 + }, + { + "epoch": 0.08326196593396136, + "grad_norm": 1.8098101615905762, + "learning_rate": 4.9149716566851184e-05, + "loss": 6.1197, + "step": 14000 + }, + { + "epoch": 0.08326791321724236, + "grad_norm": 1.5740363597869873, + "learning_rate": 4.914959577823062e-05, + "loss": 5.7821, + "step": 14001 + }, + { + "epoch": 0.08327386050052336, + "grad_norm": 1.4634822607040405, + "learning_rate": 4.914947498117968e-05, + "loss": 5.7062, + "step": 14002 + }, + { + "epoch": 0.08327980778380435, + "grad_norm": 1.7310374975204468, + "learning_rate": 4.914935417569841e-05, + "loss": 5.6689, + "step": 14003 + }, + { + "epoch": 0.08328575506708535, + "grad_norm": 1.5742056369781494, + "learning_rate": 4.914923336178685e-05, + "loss": 5.6529, + "step": 14004 + }, + { + "epoch": 0.08329170235036636, + "grad_norm": 1.6353307962417603, + "learning_rate": 4.914911253944504e-05, + "loss": 5.4564, + "step": 14005 + }, + { + "epoch": 0.08329764963364734, + "grad_norm": 1.8744231462478638, + "learning_rate": 4.9148991708673024e-05, + "loss": 5.305, + "step": 14006 + }, + { + "epoch": 0.08330359691692835, + "grad_norm": 1.9766863584518433, + "learning_rate": 4.914887086947085e-05, + "loss": 5.711, + "step": 14007 + }, + { + "epoch": 0.08330954420020935, + "grad_norm": 2.1832756996154785, + "learning_rate": 4.914875002183855e-05, + "loss": 4.9322, + "step": 14008 + }, + { + "epoch": 0.08331549148349034, + "grad_norm": 2.2370998859405518, + "learning_rate": 4.914862916577617e-05, + "loss": 4.512, + "step": 14009 + }, + { + "epoch": 0.08332143876677134, + "grad_norm": 2.2743804454803467, + "learning_rate": 4.914850830128376e-05, + "loss": 4.5716, + "step": 14010 + }, + { + "epoch": 0.08332738605005234, + "grad_norm": 2.3644347190856934, + "learning_rate": 4.914838742836134e-05, + "loss": 4.1288, + "step": 14011 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 3.1034274101257324, + "learning_rate": 4.9148266547008984e-05, + "loss": 5.2864, + "step": 14012 + }, + { + "epoch": 0.08333928061661433, + "grad_norm": 2.240302801132202, + "learning_rate": 4.914814565722671e-05, + "loss": 5.3452, + "step": 14013 + }, + { + "epoch": 0.08334522789989533, + "grad_norm": 2.0743885040283203, + "learning_rate": 4.9148024759014566e-05, + "loss": 5.4338, + "step": 14014 + }, + { + "epoch": 0.08335117518317632, + "grad_norm": 2.0169663429260254, + "learning_rate": 4.91479038523726e-05, + "loss": 5.5108, + "step": 14015 + }, + { + "epoch": 0.08335712246645732, + "grad_norm": 1.9730015993118286, + "learning_rate": 4.914778293730085e-05, + "loss": 5.6413, + "step": 14016 + }, + { + "epoch": 0.08336306974973832, + "grad_norm": 2.3047432899475098, + "learning_rate": 4.914766201379936e-05, + "loss": 5.4111, + "step": 14017 + }, + { + "epoch": 0.08336901703301931, + "grad_norm": 3.079416275024414, + "learning_rate": 4.914754108186816e-05, + "loss": 5.5591, + "step": 14018 + }, + { + "epoch": 0.08337496431630032, + "grad_norm": 1.9374867677688599, + "learning_rate": 4.9147420141507314e-05, + "loss": 5.9295, + "step": 14019 + }, + { + "epoch": 0.08338091159958132, + "grad_norm": 1.874292016029358, + "learning_rate": 4.9147299192716855e-05, + "loss": 5.6846, + "step": 14020 + }, + { + "epoch": 0.0833868588828623, + "grad_norm": 1.8852506875991821, + "learning_rate": 4.914717823549682e-05, + "loss": 5.621, + "step": 14021 + }, + { + "epoch": 0.08339280616614331, + "grad_norm": 1.9332367181777954, + "learning_rate": 4.914705726984725e-05, + "loss": 5.8584, + "step": 14022 + }, + { + "epoch": 0.08339875344942431, + "grad_norm": 1.6252962350845337, + "learning_rate": 4.91469362957682e-05, + "loss": 5.8173, + "step": 14023 + }, + { + "epoch": 0.0834047007327053, + "grad_norm": 1.6760259866714478, + "learning_rate": 4.9146815313259695e-05, + "loss": 5.5441, + "step": 14024 + }, + { + "epoch": 0.0834106480159863, + "grad_norm": 1.4979921579360962, + "learning_rate": 4.9146694322321785e-05, + "loss": 6.1467, + "step": 14025 + }, + { + "epoch": 0.0834165952992673, + "grad_norm": 1.4720534086227417, + "learning_rate": 4.914657332295453e-05, + "loss": 5.8626, + "step": 14026 + }, + { + "epoch": 0.08342254258254829, + "grad_norm": 1.6709620952606201, + "learning_rate": 4.914645231515794e-05, + "loss": 5.8468, + "step": 14027 + }, + { + "epoch": 0.08342848986582929, + "grad_norm": 1.6389116048812866, + "learning_rate": 4.9146331298932075e-05, + "loss": 5.9222, + "step": 14028 + }, + { + "epoch": 0.08343443714911028, + "grad_norm": 1.4344384670257568, + "learning_rate": 4.9146210274276974e-05, + "loss": 5.5457, + "step": 14029 + }, + { + "epoch": 0.08344038443239128, + "grad_norm": 1.472469449043274, + "learning_rate": 4.914608924119268e-05, + "loss": 5.608, + "step": 14030 + }, + { + "epoch": 0.08344633171567228, + "grad_norm": 1.6688710451126099, + "learning_rate": 4.914596819967925e-05, + "loss": 5.7982, + "step": 14031 + }, + { + "epoch": 0.08345227899895327, + "grad_norm": 1.6417087316513062, + "learning_rate": 4.9145847149736704e-05, + "loss": 5.6498, + "step": 14032 + }, + { + "epoch": 0.08345822628223427, + "grad_norm": 1.5726937055587769, + "learning_rate": 4.9145726091365084e-05, + "loss": 5.8723, + "step": 14033 + }, + { + "epoch": 0.08346417356551528, + "grad_norm": 1.7523616552352905, + "learning_rate": 4.914560502456444e-05, + "loss": 6.1967, + "step": 14034 + }, + { + "epoch": 0.08347012084879626, + "grad_norm": 1.8270281553268433, + "learning_rate": 4.914548394933483e-05, + "loss": 6.0493, + "step": 14035 + }, + { + "epoch": 0.08347606813207727, + "grad_norm": 1.8113981485366821, + "learning_rate": 4.914536286567627e-05, + "loss": 5.2815, + "step": 14036 + }, + { + "epoch": 0.08348201541535827, + "grad_norm": 1.7894388437271118, + "learning_rate": 4.914524177358881e-05, + "loss": 5.2606, + "step": 14037 + }, + { + "epoch": 0.08348796269863926, + "grad_norm": 1.7994349002838135, + "learning_rate": 4.9145120673072505e-05, + "loss": 5.025, + "step": 14038 + }, + { + "epoch": 0.08349390998192026, + "grad_norm": 1.6934137344360352, + "learning_rate": 4.914499956412738e-05, + "loss": 5.0455, + "step": 14039 + }, + { + "epoch": 0.08349985726520126, + "grad_norm": 1.549500823020935, + "learning_rate": 4.914487844675349e-05, + "loss": 5.3836, + "step": 14040 + }, + { + "epoch": 0.08350580454848225, + "grad_norm": 1.7452481985092163, + "learning_rate": 4.9144757320950873e-05, + "loss": 5.0175, + "step": 14041 + }, + { + "epoch": 0.08351175183176325, + "grad_norm": 1.9420257806777954, + "learning_rate": 4.914463618671957e-05, + "loss": 5.0146, + "step": 14042 + }, + { + "epoch": 0.08351769911504425, + "grad_norm": 1.798431158065796, + "learning_rate": 4.914451504405962e-05, + "loss": 4.7656, + "step": 14043 + }, + { + "epoch": 0.08352364639832524, + "grad_norm": 1.7167326211929321, + "learning_rate": 4.914439389297107e-05, + "loss": 4.7518, + "step": 14044 + }, + { + "epoch": 0.08352959368160624, + "grad_norm": 1.7150487899780273, + "learning_rate": 4.914427273345397e-05, + "loss": 4.8298, + "step": 14045 + }, + { + "epoch": 0.08353554096488724, + "grad_norm": 1.7048633098602295, + "learning_rate": 4.914415156550834e-05, + "loss": 5.0039, + "step": 14046 + }, + { + "epoch": 0.08354148824816823, + "grad_norm": 1.364012598991394, + "learning_rate": 4.914403038913425e-05, + "loss": 5.3718, + "step": 14047 + }, + { + "epoch": 0.08354743553144924, + "grad_norm": 2.29878830909729, + "learning_rate": 4.9143909204331716e-05, + "loss": 4.8874, + "step": 14048 + }, + { + "epoch": 0.08355338281473024, + "grad_norm": 2.1153953075408936, + "learning_rate": 4.91437880111008e-05, + "loss": 4.6646, + "step": 14049 + }, + { + "epoch": 0.08355933009801123, + "grad_norm": 2.289346218109131, + "learning_rate": 4.914366680944153e-05, + "loss": 4.7966, + "step": 14050 + }, + { + "epoch": 0.08356527738129223, + "grad_norm": 1.8394019603729248, + "learning_rate": 4.9143545599353965e-05, + "loss": 5.1788, + "step": 14051 + }, + { + "epoch": 0.08357122466457323, + "grad_norm": 2.192802667617798, + "learning_rate": 4.9143424380838136e-05, + "loss": 5.4549, + "step": 14052 + }, + { + "epoch": 0.08357717194785422, + "grad_norm": 2.128356695175171, + "learning_rate": 4.9143303153894085e-05, + "loss": 5.6652, + "step": 14053 + }, + { + "epoch": 0.08358311923113522, + "grad_norm": 2.0716452598571777, + "learning_rate": 4.914318191852186e-05, + "loss": 5.7013, + "step": 14054 + }, + { + "epoch": 0.08358906651441622, + "grad_norm": 2.298940658569336, + "learning_rate": 4.91430606747215e-05, + "loss": 5.565, + "step": 14055 + }, + { + "epoch": 0.08359501379769721, + "grad_norm": 2.250102996826172, + "learning_rate": 4.914293942249304e-05, + "loss": 5.6935, + "step": 14056 + }, + { + "epoch": 0.08360096108097821, + "grad_norm": 2.123037576675415, + "learning_rate": 4.914281816183653e-05, + "loss": 5.624, + "step": 14057 + }, + { + "epoch": 0.0836069083642592, + "grad_norm": 1.833024501800537, + "learning_rate": 4.9142696892752013e-05, + "loss": 5.4329, + "step": 14058 + }, + { + "epoch": 0.0836128556475402, + "grad_norm": 1.8438977003097534, + "learning_rate": 4.9142575615239526e-05, + "loss": 5.294, + "step": 14059 + }, + { + "epoch": 0.0836188029308212, + "grad_norm": 1.805525541305542, + "learning_rate": 4.914245432929913e-05, + "loss": 5.3778, + "step": 14060 + }, + { + "epoch": 0.08362475021410219, + "grad_norm": 1.5750529766082764, + "learning_rate": 4.9142333034930835e-05, + "loss": 5.357, + "step": 14061 + }, + { + "epoch": 0.0836306974973832, + "grad_norm": 1.3928825855255127, + "learning_rate": 4.914221173213471e-05, + "loss": 5.5141, + "step": 14062 + }, + { + "epoch": 0.0836366447806642, + "grad_norm": 1.6307804584503174, + "learning_rate": 4.914209042091079e-05, + "loss": 5.3687, + "step": 14063 + }, + { + "epoch": 0.08364259206394518, + "grad_norm": 1.533963680267334, + "learning_rate": 4.914196910125911e-05, + "loss": 5.7295, + "step": 14064 + }, + { + "epoch": 0.08364853934722619, + "grad_norm": 1.4950587749481201, + "learning_rate": 4.914184777317972e-05, + "loss": 5.816, + "step": 14065 + }, + { + "epoch": 0.08365448663050719, + "grad_norm": 1.3246190547943115, + "learning_rate": 4.914172643667266e-05, + "loss": 5.6925, + "step": 14066 + }, + { + "epoch": 0.08366043391378818, + "grad_norm": 1.4816724061965942, + "learning_rate": 4.9141605091737975e-05, + "loss": 5.6528, + "step": 14067 + }, + { + "epoch": 0.08366638119706918, + "grad_norm": 1.6656372547149658, + "learning_rate": 4.914148373837571e-05, + "loss": 5.4619, + "step": 14068 + }, + { + "epoch": 0.08367232848035018, + "grad_norm": 1.2973356246948242, + "learning_rate": 4.914136237658589e-05, + "loss": 5.5467, + "step": 14069 + }, + { + "epoch": 0.08367827576363117, + "grad_norm": 1.7669901847839355, + "learning_rate": 4.914124100636857e-05, + "loss": 5.2213, + "step": 14070 + }, + { + "epoch": 0.08368422304691217, + "grad_norm": 1.7352882623672485, + "learning_rate": 4.91411196277238e-05, + "loss": 5.2938, + "step": 14071 + }, + { + "epoch": 0.08369017033019317, + "grad_norm": 1.5912410020828247, + "learning_rate": 4.914099824065161e-05, + "loss": 5.4139, + "step": 14072 + }, + { + "epoch": 0.08369611761347416, + "grad_norm": 1.46699059009552, + "learning_rate": 4.914087684515205e-05, + "loss": 5.2317, + "step": 14073 + }, + { + "epoch": 0.08370206489675516, + "grad_norm": 3.0727121829986572, + "learning_rate": 4.914075544122516e-05, + "loss": 5.2324, + "step": 14074 + }, + { + "epoch": 0.08370801218003616, + "grad_norm": 1.4887278079986572, + "learning_rate": 4.914063402887098e-05, + "loss": 5.0331, + "step": 14075 + }, + { + "epoch": 0.08371395946331715, + "grad_norm": 1.4677956104278564, + "learning_rate": 4.9140512608089555e-05, + "loss": 5.0892, + "step": 14076 + }, + { + "epoch": 0.08371990674659816, + "grad_norm": 1.3760831356048584, + "learning_rate": 4.914039117888093e-05, + "loss": 5.3738, + "step": 14077 + }, + { + "epoch": 0.08372585402987916, + "grad_norm": 1.6125822067260742, + "learning_rate": 4.9140269741245135e-05, + "loss": 5.4629, + "step": 14078 + }, + { + "epoch": 0.08373180131316015, + "grad_norm": 1.6336333751678467, + "learning_rate": 4.9140148295182226e-05, + "loss": 5.2533, + "step": 14079 + }, + { + "epoch": 0.08373774859644115, + "grad_norm": 1.6296573877334595, + "learning_rate": 4.9140026840692247e-05, + "loss": 4.8288, + "step": 14080 + }, + { + "epoch": 0.08374369587972215, + "grad_norm": 1.6058591604232788, + "learning_rate": 4.913990537777522e-05, + "loss": 5.0549, + "step": 14081 + }, + { + "epoch": 0.08374964316300314, + "grad_norm": 1.6199642419815063, + "learning_rate": 4.9139783906431214e-05, + "loss": 5.2387, + "step": 14082 + }, + { + "epoch": 0.08375559044628414, + "grad_norm": 1.7537976503372192, + "learning_rate": 4.913966242666025e-05, + "loss": 5.2766, + "step": 14083 + }, + { + "epoch": 0.08376153772956514, + "grad_norm": 1.579128384590149, + "learning_rate": 4.9139540938462384e-05, + "loss": 5.2251, + "step": 14084 + }, + { + "epoch": 0.08376748501284613, + "grad_norm": 1.7070518732070923, + "learning_rate": 4.913941944183765e-05, + "loss": 5.0699, + "step": 14085 + }, + { + "epoch": 0.08377343229612713, + "grad_norm": 1.4739151000976562, + "learning_rate": 4.91392979367861e-05, + "loss": 5.229, + "step": 14086 + }, + { + "epoch": 0.08377937957940812, + "grad_norm": 1.6380045413970947, + "learning_rate": 4.9139176423307764e-05, + "loss": 5.0977, + "step": 14087 + }, + { + "epoch": 0.08378532686268912, + "grad_norm": 1.640865445137024, + "learning_rate": 4.91390549014027e-05, + "loss": 5.1106, + "step": 14088 + }, + { + "epoch": 0.08379127414597012, + "grad_norm": 1.7274518013000488, + "learning_rate": 4.913893337107093e-05, + "loss": 5.2093, + "step": 14089 + }, + { + "epoch": 0.08379722142925111, + "grad_norm": 1.7702603340148926, + "learning_rate": 4.913881183231251e-05, + "loss": 5.1314, + "step": 14090 + }, + { + "epoch": 0.08380316871253211, + "grad_norm": 1.766479253768921, + "learning_rate": 4.913869028512749e-05, + "loss": 5.1266, + "step": 14091 + }, + { + "epoch": 0.08380911599581312, + "grad_norm": 1.5863205194473267, + "learning_rate": 4.91385687295159e-05, + "loss": 5.1487, + "step": 14092 + }, + { + "epoch": 0.0838150632790941, + "grad_norm": 1.6770803928375244, + "learning_rate": 4.913844716547777e-05, + "loss": 5.2479, + "step": 14093 + }, + { + "epoch": 0.0838210105623751, + "grad_norm": 1.8650991916656494, + "learning_rate": 4.913832559301317e-05, + "loss": 5.2748, + "step": 14094 + }, + { + "epoch": 0.08382695784565611, + "grad_norm": 1.7304933071136475, + "learning_rate": 4.913820401212213e-05, + "loss": 5.2572, + "step": 14095 + }, + { + "epoch": 0.0838329051289371, + "grad_norm": 1.7103501558303833, + "learning_rate": 4.9138082422804695e-05, + "loss": 5.1145, + "step": 14096 + }, + { + "epoch": 0.0838388524122181, + "grad_norm": 1.8390073776245117, + "learning_rate": 4.91379608250609e-05, + "loss": 5.1171, + "step": 14097 + }, + { + "epoch": 0.0838447996954991, + "grad_norm": 1.815047264099121, + "learning_rate": 4.913783921889079e-05, + "loss": 5.2329, + "step": 14098 + }, + { + "epoch": 0.08385074697878009, + "grad_norm": 1.4381682872772217, + "learning_rate": 4.9137717604294415e-05, + "loss": 5.1098, + "step": 14099 + }, + { + "epoch": 0.08385669426206109, + "grad_norm": 1.6523853540420532, + "learning_rate": 4.9137595981271815e-05, + "loss": 5.1352, + "step": 14100 + }, + { + "epoch": 0.08386264154534209, + "grad_norm": 1.377199649810791, + "learning_rate": 4.913747434982302e-05, + "loss": 5.1191, + "step": 14101 + }, + { + "epoch": 0.08386858882862308, + "grad_norm": 1.5858699083328247, + "learning_rate": 4.913735270994809e-05, + "loss": 5.0569, + "step": 14102 + }, + { + "epoch": 0.08387453611190408, + "grad_norm": 1.608522891998291, + "learning_rate": 4.913723106164705e-05, + "loss": 4.8834, + "step": 14103 + }, + { + "epoch": 0.08388048339518508, + "grad_norm": 1.7063453197479248, + "learning_rate": 4.913710940491996e-05, + "loss": 4.9019, + "step": 14104 + }, + { + "epoch": 0.08388643067846607, + "grad_norm": 1.5008784532546997, + "learning_rate": 4.913698773976685e-05, + "loss": 4.8423, + "step": 14105 + }, + { + "epoch": 0.08389237796174707, + "grad_norm": 1.8743178844451904, + "learning_rate": 4.913686606618777e-05, + "loss": 4.9256, + "step": 14106 + }, + { + "epoch": 0.08389832524502808, + "grad_norm": 1.813094973564148, + "learning_rate": 4.9136744384182764e-05, + "loss": 4.9245, + "step": 14107 + }, + { + "epoch": 0.08390427252830907, + "grad_norm": 1.9561067819595337, + "learning_rate": 4.913662269375186e-05, + "loss": 4.8459, + "step": 14108 + }, + { + "epoch": 0.08391021981159007, + "grad_norm": 1.6159533262252808, + "learning_rate": 4.913650099489512e-05, + "loss": 4.8092, + "step": 14109 + }, + { + "epoch": 0.08391616709487107, + "grad_norm": 1.5819872617721558, + "learning_rate": 4.913637928761257e-05, + "loss": 4.9047, + "step": 14110 + }, + { + "epoch": 0.08392211437815206, + "grad_norm": 1.6294678449630737, + "learning_rate": 4.913625757190426e-05, + "loss": 4.6908, + "step": 14111 + }, + { + "epoch": 0.08392806166143306, + "grad_norm": 1.5048410892486572, + "learning_rate": 4.913613584777024e-05, + "loss": 5.2021, + "step": 14112 + }, + { + "epoch": 0.08393400894471406, + "grad_norm": 1.626280665397644, + "learning_rate": 4.9136014115210525e-05, + "loss": 5.4592, + "step": 14113 + }, + { + "epoch": 0.08393995622799505, + "grad_norm": 1.662269115447998, + "learning_rate": 4.91358923742252e-05, + "loss": 5.0027, + "step": 14114 + }, + { + "epoch": 0.08394590351127605, + "grad_norm": 1.5630388259887695, + "learning_rate": 4.913577062481427e-05, + "loss": 5.3327, + "step": 14115 + }, + { + "epoch": 0.08395185079455704, + "grad_norm": 1.4223047494888306, + "learning_rate": 4.913564886697779e-05, + "loss": 5.5081, + "step": 14116 + }, + { + "epoch": 0.08395779807783804, + "grad_norm": 1.3298295736312866, + "learning_rate": 4.9135527100715814e-05, + "loss": 5.3783, + "step": 14117 + }, + { + "epoch": 0.08396374536111904, + "grad_norm": 1.335779070854187, + "learning_rate": 4.913540532602837e-05, + "loss": 5.3901, + "step": 14118 + }, + { + "epoch": 0.08396969264440003, + "grad_norm": 1.5331017971038818, + "learning_rate": 4.913528354291551e-05, + "loss": 5.5643, + "step": 14119 + }, + { + "epoch": 0.08397563992768103, + "grad_norm": 1.703400731086731, + "learning_rate": 4.913516175137727e-05, + "loss": 5.4256, + "step": 14120 + }, + { + "epoch": 0.08398158721096204, + "grad_norm": 1.5330191850662231, + "learning_rate": 4.913503995141369e-05, + "loss": 5.2509, + "step": 14121 + }, + { + "epoch": 0.08398753449424302, + "grad_norm": 1.7405961751937866, + "learning_rate": 4.913491814302482e-05, + "loss": 5.4171, + "step": 14122 + }, + { + "epoch": 0.08399348177752403, + "grad_norm": 1.2550197839736938, + "learning_rate": 4.9134796326210696e-05, + "loss": 5.3908, + "step": 14123 + }, + { + "epoch": 0.08399942906080503, + "grad_norm": 1.2029253244400024, + "learning_rate": 4.9134674500971366e-05, + "loss": 5.5355, + "step": 14124 + }, + { + "epoch": 0.08400537634408602, + "grad_norm": 1.2968589067459106, + "learning_rate": 4.913455266730687e-05, + "loss": 5.4007, + "step": 14125 + }, + { + "epoch": 0.08401132362736702, + "grad_norm": 1.2636605501174927, + "learning_rate": 4.913443082521725e-05, + "loss": 5.2402, + "step": 14126 + }, + { + "epoch": 0.08401727091064802, + "grad_norm": 1.2112632989883423, + "learning_rate": 4.9134308974702554e-05, + "loss": 5.2595, + "step": 14127 + }, + { + "epoch": 0.08402321819392901, + "grad_norm": 1.447730302810669, + "learning_rate": 4.913418711576282e-05, + "loss": 5.2688, + "step": 14128 + }, + { + "epoch": 0.08402916547721001, + "grad_norm": 1.4328616857528687, + "learning_rate": 4.913406524839809e-05, + "loss": 5.2368, + "step": 14129 + }, + { + "epoch": 0.08403511276049101, + "grad_norm": 1.4782198667526245, + "learning_rate": 4.91339433726084e-05, + "loss": 5.2019, + "step": 14130 + }, + { + "epoch": 0.084041060043772, + "grad_norm": 1.499373197555542, + "learning_rate": 4.913382148839381e-05, + "loss": 5.3352, + "step": 14131 + }, + { + "epoch": 0.084047007327053, + "grad_norm": 1.37551748752594, + "learning_rate": 4.9133699595754346e-05, + "loss": 5.1566, + "step": 14132 + }, + { + "epoch": 0.084052954610334, + "grad_norm": 1.6400420665740967, + "learning_rate": 4.913357769469006e-05, + "loss": 5.5225, + "step": 14133 + }, + { + "epoch": 0.08405890189361499, + "grad_norm": 1.3855832815170288, + "learning_rate": 4.913345578520099e-05, + "loss": 5.4466, + "step": 14134 + }, + { + "epoch": 0.084064849176896, + "grad_norm": 1.783508062362671, + "learning_rate": 4.913333386728718e-05, + "loss": 5.1713, + "step": 14135 + }, + { + "epoch": 0.084070796460177, + "grad_norm": 2.435201406478882, + "learning_rate": 4.913321194094866e-05, + "loss": 4.9899, + "step": 14136 + }, + { + "epoch": 0.08407674374345799, + "grad_norm": 1.708850622177124, + "learning_rate": 4.91330900061855e-05, + "loss": 5.0808, + "step": 14137 + }, + { + "epoch": 0.08408269102673899, + "grad_norm": 1.583473801612854, + "learning_rate": 4.913296806299773e-05, + "loss": 5.0164, + "step": 14138 + }, + { + "epoch": 0.08408863831001999, + "grad_norm": 1.6990292072296143, + "learning_rate": 4.9132846111385386e-05, + "loss": 4.9476, + "step": 14139 + }, + { + "epoch": 0.08409458559330098, + "grad_norm": 1.6386258602142334, + "learning_rate": 4.913272415134851e-05, + "loss": 4.9357, + "step": 14140 + }, + { + "epoch": 0.08410053287658198, + "grad_norm": 1.258575439453125, + "learning_rate": 4.9132602182887156e-05, + "loss": 4.7666, + "step": 14141 + }, + { + "epoch": 0.08410648015986298, + "grad_norm": 1.3333406448364258, + "learning_rate": 4.913248020600135e-05, + "loss": 4.698, + "step": 14142 + }, + { + "epoch": 0.08411242744314397, + "grad_norm": 1.3663051128387451, + "learning_rate": 4.913235822069116e-05, + "loss": 4.9414, + "step": 14143 + }, + { + "epoch": 0.08411837472642497, + "grad_norm": 1.6906498670578003, + "learning_rate": 4.91322362269566e-05, + "loss": 5.281, + "step": 14144 + }, + { + "epoch": 0.08412432200970596, + "grad_norm": 1.2671558856964111, + "learning_rate": 4.9132114224797735e-05, + "loss": 5.2566, + "step": 14145 + }, + { + "epoch": 0.08413026929298696, + "grad_norm": 1.4022216796875, + "learning_rate": 4.9131992214214586e-05, + "loss": 5.128, + "step": 14146 + }, + { + "epoch": 0.08413621657626796, + "grad_norm": 1.4810549020767212, + "learning_rate": 4.913187019520722e-05, + "loss": 5.0172, + "step": 14147 + }, + { + "epoch": 0.08414216385954895, + "grad_norm": 1.2757905721664429, + "learning_rate": 4.913174816777566e-05, + "loss": 5.3796, + "step": 14148 + }, + { + "epoch": 0.08414811114282995, + "grad_norm": 1.4088176488876343, + "learning_rate": 4.913162613191996e-05, + "loss": 5.4586, + "step": 14149 + }, + { + "epoch": 0.08415405842611096, + "grad_norm": 1.5218896865844727, + "learning_rate": 4.9131504087640154e-05, + "loss": 5.1652, + "step": 14150 + }, + { + "epoch": 0.08416000570939194, + "grad_norm": 1.4234968423843384, + "learning_rate": 4.913138203493629e-05, + "loss": 5.1917, + "step": 14151 + }, + { + "epoch": 0.08416595299267295, + "grad_norm": 1.4841183423995972, + "learning_rate": 4.913125997380842e-05, + "loss": 5.2818, + "step": 14152 + }, + { + "epoch": 0.08417190027595395, + "grad_norm": 1.8631536960601807, + "learning_rate": 4.9131137904256564e-05, + "loss": 5.4848, + "step": 14153 + }, + { + "epoch": 0.08417784755923494, + "grad_norm": 1.5508880615234375, + "learning_rate": 4.913101582628078e-05, + "loss": 5.3698, + "step": 14154 + }, + { + "epoch": 0.08418379484251594, + "grad_norm": 1.2428319454193115, + "learning_rate": 4.913089373988111e-05, + "loss": 5.2071, + "step": 14155 + }, + { + "epoch": 0.08418974212579694, + "grad_norm": 1.405325174331665, + "learning_rate": 4.91307716450576e-05, + "loss": 5.1774, + "step": 14156 + }, + { + "epoch": 0.08419568940907793, + "grad_norm": 1.6800439357757568, + "learning_rate": 4.913064954181028e-05, + "loss": 5.3735, + "step": 14157 + }, + { + "epoch": 0.08420163669235893, + "grad_norm": 1.475174069404602, + "learning_rate": 4.9130527430139194e-05, + "loss": 5.3303, + "step": 14158 + }, + { + "epoch": 0.08420758397563993, + "grad_norm": 1.5441967248916626, + "learning_rate": 4.91304053100444e-05, + "loss": 5.3007, + "step": 14159 + }, + { + "epoch": 0.08421353125892092, + "grad_norm": 1.3798770904541016, + "learning_rate": 4.913028318152593e-05, + "loss": 5.287, + "step": 14160 + }, + { + "epoch": 0.08421947854220192, + "grad_norm": 1.4294620752334595, + "learning_rate": 4.913016104458382e-05, + "loss": 5.3159, + "step": 14161 + }, + { + "epoch": 0.08422542582548292, + "grad_norm": 1.4971884489059448, + "learning_rate": 4.913003889921812e-05, + "loss": 5.4701, + "step": 14162 + }, + { + "epoch": 0.08423137310876391, + "grad_norm": 1.447045922279358, + "learning_rate": 4.912991674542888e-05, + "loss": 5.306, + "step": 14163 + }, + { + "epoch": 0.08423732039204491, + "grad_norm": 1.7867134809494019, + "learning_rate": 4.9129794583216135e-05, + "loss": 4.8653, + "step": 14164 + }, + { + "epoch": 0.08424326767532592, + "grad_norm": 1.6931066513061523, + "learning_rate": 4.912967241257993e-05, + "loss": 4.7628, + "step": 14165 + }, + { + "epoch": 0.0842492149586069, + "grad_norm": 1.6567879915237427, + "learning_rate": 4.91295502335203e-05, + "loss": 4.7857, + "step": 14166 + }, + { + "epoch": 0.08425516224188791, + "grad_norm": 1.6891521215438843, + "learning_rate": 4.91294280460373e-05, + "loss": 4.7873, + "step": 14167 + }, + { + "epoch": 0.08426110952516891, + "grad_norm": 1.6237304210662842, + "learning_rate": 4.912930585013095e-05, + "loss": 4.8596, + "step": 14168 + }, + { + "epoch": 0.0842670568084499, + "grad_norm": 1.585802674293518, + "learning_rate": 4.912918364580132e-05, + "loss": 4.8226, + "step": 14169 + }, + { + "epoch": 0.0842730040917309, + "grad_norm": 1.6892811059951782, + "learning_rate": 4.912906143304844e-05, + "loss": 4.8307, + "step": 14170 + }, + { + "epoch": 0.0842789513750119, + "grad_norm": 1.8254313468933105, + "learning_rate": 4.912893921187236e-05, + "loss": 4.8508, + "step": 14171 + }, + { + "epoch": 0.08428489865829289, + "grad_norm": 1.5577294826507568, + "learning_rate": 4.912881698227311e-05, + "loss": 4.7303, + "step": 14172 + }, + { + "epoch": 0.08429084594157389, + "grad_norm": 1.5635697841644287, + "learning_rate": 4.912869474425074e-05, + "loss": 4.9597, + "step": 14173 + }, + { + "epoch": 0.08429679322485488, + "grad_norm": 1.6620457172393799, + "learning_rate": 4.9128572497805294e-05, + "loss": 5.1012, + "step": 14174 + }, + { + "epoch": 0.08430274050813588, + "grad_norm": 1.4082841873168945, + "learning_rate": 4.912845024293681e-05, + "loss": 5.1785, + "step": 14175 + }, + { + "epoch": 0.08430868779141688, + "grad_norm": 1.5914233922958374, + "learning_rate": 4.9128327979645336e-05, + "loss": 5.2035, + "step": 14176 + }, + { + "epoch": 0.08431463507469787, + "grad_norm": 1.3170946836471558, + "learning_rate": 4.912820570793091e-05, + "loss": 5.35, + "step": 14177 + }, + { + "epoch": 0.08432058235797887, + "grad_norm": 1.3059190511703491, + "learning_rate": 4.912808342779357e-05, + "loss": 5.1428, + "step": 14178 + }, + { + "epoch": 0.08432652964125988, + "grad_norm": 1.438844919204712, + "learning_rate": 4.912796113923337e-05, + "loss": 5.2154, + "step": 14179 + }, + { + "epoch": 0.08433247692454086, + "grad_norm": 1.401469349861145, + "learning_rate": 4.912783884225035e-05, + "loss": 5.0941, + "step": 14180 + }, + { + "epoch": 0.08433842420782187, + "grad_norm": 1.6718204021453857, + "learning_rate": 4.912771653684456e-05, + "loss": 5.3221, + "step": 14181 + }, + { + "epoch": 0.08434437149110287, + "grad_norm": 1.51036536693573, + "learning_rate": 4.912759422301602e-05, + "loss": 5.2619, + "step": 14182 + }, + { + "epoch": 0.08435031877438386, + "grad_norm": 1.6579569578170776, + "learning_rate": 4.9127471900764795e-05, + "loss": 5.1176, + "step": 14183 + }, + { + "epoch": 0.08435626605766486, + "grad_norm": 1.5300757884979248, + "learning_rate": 4.912734957009091e-05, + "loss": 5.1625, + "step": 14184 + }, + { + "epoch": 0.08436221334094586, + "grad_norm": 1.2839969396591187, + "learning_rate": 4.912722723099442e-05, + "loss": 5.0852, + "step": 14185 + }, + { + "epoch": 0.08436816062422685, + "grad_norm": 1.7074840068817139, + "learning_rate": 4.9127104883475364e-05, + "loss": 5.1611, + "step": 14186 + }, + { + "epoch": 0.08437410790750785, + "grad_norm": 1.790992021560669, + "learning_rate": 4.9126982527533797e-05, + "loss": 5.0386, + "step": 14187 + }, + { + "epoch": 0.08438005519078885, + "grad_norm": 1.5269246101379395, + "learning_rate": 4.912686016316973e-05, + "loss": 5.0272, + "step": 14188 + }, + { + "epoch": 0.08438600247406984, + "grad_norm": 1.510847806930542, + "learning_rate": 4.9126737790383234e-05, + "loss": 5.2073, + "step": 14189 + }, + { + "epoch": 0.08439194975735084, + "grad_norm": 1.6551074981689453, + "learning_rate": 4.912661540917435e-05, + "loss": 5.0436, + "step": 14190 + }, + { + "epoch": 0.08439789704063184, + "grad_norm": 1.3152271509170532, + "learning_rate": 4.91264930195431e-05, + "loss": 5.0981, + "step": 14191 + }, + { + "epoch": 0.08440384432391283, + "grad_norm": 1.478190302848816, + "learning_rate": 4.912637062148955e-05, + "loss": 5.1172, + "step": 14192 + }, + { + "epoch": 0.08440979160719383, + "grad_norm": 1.4574978351593018, + "learning_rate": 4.912624821501373e-05, + "loss": 4.9757, + "step": 14193 + }, + { + "epoch": 0.08441573889047484, + "grad_norm": 1.600182056427002, + "learning_rate": 4.912612580011568e-05, + "loss": 5.1763, + "step": 14194 + }, + { + "epoch": 0.08442168617375582, + "grad_norm": 1.5805768966674805, + "learning_rate": 4.912600337679546e-05, + "loss": 5.1949, + "step": 14195 + }, + { + "epoch": 0.08442763345703683, + "grad_norm": 1.465785264968872, + "learning_rate": 4.9125880945053106e-05, + "loss": 5.0695, + "step": 14196 + }, + { + "epoch": 0.08443358074031783, + "grad_norm": 1.6188615560531616, + "learning_rate": 4.912575850488864e-05, + "loss": 5.1263, + "step": 14197 + }, + { + "epoch": 0.08443952802359882, + "grad_norm": 2.4953408241271973, + "learning_rate": 4.9125636056302125e-05, + "loss": 5.6462, + "step": 14198 + }, + { + "epoch": 0.08444547530687982, + "grad_norm": 1.6779934167861938, + "learning_rate": 4.91255135992936e-05, + "loss": 5.1673, + "step": 14199 + }, + { + "epoch": 0.08445142259016082, + "grad_norm": 1.648706316947937, + "learning_rate": 4.912539113386312e-05, + "loss": 5.3792, + "step": 14200 + }, + { + "epoch": 0.08445736987344181, + "grad_norm": 1.4866549968719482, + "learning_rate": 4.91252686600107e-05, + "loss": 5.2828, + "step": 14201 + }, + { + "epoch": 0.08446331715672281, + "grad_norm": 1.6002475023269653, + "learning_rate": 4.912514617773641e-05, + "loss": 5.3255, + "step": 14202 + }, + { + "epoch": 0.0844692644400038, + "grad_norm": 1.4162862300872803, + "learning_rate": 4.912502368704027e-05, + "loss": 5.3363, + "step": 14203 + }, + { + "epoch": 0.0844752117232848, + "grad_norm": 1.4465757608413696, + "learning_rate": 4.912490118792234e-05, + "loss": 5.586, + "step": 14204 + }, + { + "epoch": 0.0844811590065658, + "grad_norm": 1.8178991079330444, + "learning_rate": 4.912477868038266e-05, + "loss": 5.3029, + "step": 14205 + }, + { + "epoch": 0.08448710628984679, + "grad_norm": 1.4270378351211548, + "learning_rate": 4.912465616442126e-05, + "loss": 5.3864, + "step": 14206 + }, + { + "epoch": 0.0844930535731278, + "grad_norm": 1.5574913024902344, + "learning_rate": 4.91245336400382e-05, + "loss": 5.7667, + "step": 14207 + }, + { + "epoch": 0.0844990008564088, + "grad_norm": 1.3866809606552124, + "learning_rate": 4.91244111072335e-05, + "loss": 5.683, + "step": 14208 + }, + { + "epoch": 0.08450494813968978, + "grad_norm": 1.3390960693359375, + "learning_rate": 4.912428856600722e-05, + "loss": 5.7286, + "step": 14209 + }, + { + "epoch": 0.08451089542297079, + "grad_norm": 1.4317498207092285, + "learning_rate": 4.912416601635942e-05, + "loss": 5.6913, + "step": 14210 + }, + { + "epoch": 0.08451684270625179, + "grad_norm": 1.3110778331756592, + "learning_rate": 4.91240434582901e-05, + "loss": 5.6325, + "step": 14211 + }, + { + "epoch": 0.08452278998953278, + "grad_norm": 1.3288872241973877, + "learning_rate": 4.9123920891799344e-05, + "loss": 5.6343, + "step": 14212 + }, + { + "epoch": 0.08452873727281378, + "grad_norm": 1.2967199087142944, + "learning_rate": 4.912379831688716e-05, + "loss": 5.6514, + "step": 14213 + }, + { + "epoch": 0.08453468455609478, + "grad_norm": 1.6022506952285767, + "learning_rate": 4.912367573355362e-05, + "loss": 5.4006, + "step": 14214 + }, + { + "epoch": 0.08454063183937577, + "grad_norm": 1.6698434352874756, + "learning_rate": 4.912355314179875e-05, + "loss": 5.1543, + "step": 14215 + }, + { + "epoch": 0.08454657912265677, + "grad_norm": 1.6759408712387085, + "learning_rate": 4.9123430541622594e-05, + "loss": 4.9744, + "step": 14216 + }, + { + "epoch": 0.08455252640593777, + "grad_norm": 2.470752239227295, + "learning_rate": 4.91233079330252e-05, + "loss": 5.7614, + "step": 14217 + }, + { + "epoch": 0.08455847368921876, + "grad_norm": 2.1985907554626465, + "learning_rate": 4.91231853160066e-05, + "loss": 6.037, + "step": 14218 + }, + { + "epoch": 0.08456442097249976, + "grad_norm": 2.079569101333618, + "learning_rate": 4.912306269056686e-05, + "loss": 5.4943, + "step": 14219 + }, + { + "epoch": 0.08457036825578076, + "grad_norm": 2.2941744327545166, + "learning_rate": 4.9122940056706e-05, + "loss": 5.3733, + "step": 14220 + }, + { + "epoch": 0.08457631553906175, + "grad_norm": 1.9538209438323975, + "learning_rate": 4.912281741442407e-05, + "loss": 5.6362, + "step": 14221 + }, + { + "epoch": 0.08458226282234275, + "grad_norm": 1.7498515844345093, + "learning_rate": 4.9122694763721124e-05, + "loss": 5.7129, + "step": 14222 + }, + { + "epoch": 0.08458821010562376, + "grad_norm": 2.1728787422180176, + "learning_rate": 4.912257210459718e-05, + "loss": 5.4633, + "step": 14223 + }, + { + "epoch": 0.08459415738890474, + "grad_norm": 2.2436587810516357, + "learning_rate": 4.91224494370523e-05, + "loss": 5.3996, + "step": 14224 + }, + { + "epoch": 0.08460010467218575, + "grad_norm": 2.400299549102783, + "learning_rate": 4.912232676108653e-05, + "loss": 5.3994, + "step": 14225 + }, + { + "epoch": 0.08460605195546675, + "grad_norm": 1.9408513307571411, + "learning_rate": 4.91222040766999e-05, + "loss": 5.4537, + "step": 14226 + }, + { + "epoch": 0.08461199923874774, + "grad_norm": 2.4801602363586426, + "learning_rate": 4.912208138389245e-05, + "loss": 4.6625, + "step": 14227 + }, + { + "epoch": 0.08461794652202874, + "grad_norm": 2.021916627883911, + "learning_rate": 4.912195868266424e-05, + "loss": 4.5642, + "step": 14228 + }, + { + "epoch": 0.08462389380530974, + "grad_norm": 1.9586929082870483, + "learning_rate": 4.91218359730153e-05, + "loss": 4.6361, + "step": 14229 + }, + { + "epoch": 0.08462984108859073, + "grad_norm": 1.8478419780731201, + "learning_rate": 4.912171325494568e-05, + "loss": 4.5632, + "step": 14230 + }, + { + "epoch": 0.08463578837187173, + "grad_norm": 1.7078584432601929, + "learning_rate": 4.9121590528455406e-05, + "loss": 4.7259, + "step": 14231 + }, + { + "epoch": 0.08464173565515272, + "grad_norm": 1.7676106691360474, + "learning_rate": 4.912146779354455e-05, + "loss": 5.2565, + "step": 14232 + }, + { + "epoch": 0.08464768293843372, + "grad_norm": 1.8230634927749634, + "learning_rate": 4.912134505021313e-05, + "loss": 5.7668, + "step": 14233 + }, + { + "epoch": 0.08465363022171472, + "grad_norm": 1.8570215702056885, + "learning_rate": 4.91212222984612e-05, + "loss": 6.1849, + "step": 14234 + }, + { + "epoch": 0.08465957750499571, + "grad_norm": 1.7698529958724976, + "learning_rate": 4.9121099538288805e-05, + "loss": 6.0298, + "step": 14235 + }, + { + "epoch": 0.08466552478827671, + "grad_norm": 1.9919711351394653, + "learning_rate": 4.912097676969597e-05, + "loss": 5.7423, + "step": 14236 + }, + { + "epoch": 0.08467147207155772, + "grad_norm": 1.9937268495559692, + "learning_rate": 4.912085399268277e-05, + "loss": 5.8415, + "step": 14237 + }, + { + "epoch": 0.0846774193548387, + "grad_norm": 1.9489192962646484, + "learning_rate": 4.912073120724921e-05, + "loss": 5.812, + "step": 14238 + }, + { + "epoch": 0.0846833666381197, + "grad_norm": 1.6114327907562256, + "learning_rate": 4.9120608413395366e-05, + "loss": 5.9458, + "step": 14239 + }, + { + "epoch": 0.08468931392140071, + "grad_norm": 1.5803523063659668, + "learning_rate": 4.9120485611121265e-05, + "loss": 5.8837, + "step": 14240 + }, + { + "epoch": 0.0846952612046817, + "grad_norm": 1.8166266679763794, + "learning_rate": 4.9120362800426946e-05, + "loss": 5.5997, + "step": 14241 + }, + { + "epoch": 0.0847012084879627, + "grad_norm": 2.2683627605438232, + "learning_rate": 4.912023998131246e-05, + "loss": 5.4089, + "step": 14242 + }, + { + "epoch": 0.0847071557712437, + "grad_norm": 1.959498405456543, + "learning_rate": 4.9120117153777846e-05, + "loss": 5.5651, + "step": 14243 + }, + { + "epoch": 0.08471310305452469, + "grad_norm": 2.2388527393341064, + "learning_rate": 4.9119994317823155e-05, + "loss": 6.1511, + "step": 14244 + }, + { + "epoch": 0.08471905033780569, + "grad_norm": 1.9563941955566406, + "learning_rate": 4.911987147344842e-05, + "loss": 6.0499, + "step": 14245 + }, + { + "epoch": 0.08472499762108669, + "grad_norm": 1.7460871934890747, + "learning_rate": 4.911974862065368e-05, + "loss": 5.8368, + "step": 14246 + }, + { + "epoch": 0.08473094490436768, + "grad_norm": 1.820356845855713, + "learning_rate": 4.911962575943899e-05, + "loss": 5.3679, + "step": 14247 + }, + { + "epoch": 0.08473689218764868, + "grad_norm": 2.2215917110443115, + "learning_rate": 4.911950288980439e-05, + "loss": 5.0686, + "step": 14248 + }, + { + "epoch": 0.08474283947092968, + "grad_norm": 1.7801320552825928, + "learning_rate": 4.9119380011749914e-05, + "loss": 5.7665, + "step": 14249 + }, + { + "epoch": 0.08474878675421067, + "grad_norm": 1.8713878393173218, + "learning_rate": 4.911925712527562e-05, + "loss": 5.7, + "step": 14250 + }, + { + "epoch": 0.08475473403749167, + "grad_norm": 1.9371087551116943, + "learning_rate": 4.911913423038154e-05, + "loss": 5.6707, + "step": 14251 + }, + { + "epoch": 0.08476068132077268, + "grad_norm": 2.2298929691314697, + "learning_rate": 4.9119011327067724e-05, + "loss": 5.7042, + "step": 14252 + }, + { + "epoch": 0.08476662860405366, + "grad_norm": 1.7787251472473145, + "learning_rate": 4.91188884153342e-05, + "loss": 5.9205, + "step": 14253 + }, + { + "epoch": 0.08477257588733467, + "grad_norm": 2.0264973640441895, + "learning_rate": 4.911876549518102e-05, + "loss": 5.2057, + "step": 14254 + }, + { + "epoch": 0.08477852317061567, + "grad_norm": 2.7479963302612305, + "learning_rate": 4.911864256660824e-05, + "loss": 4.3828, + "step": 14255 + }, + { + "epoch": 0.08478447045389666, + "grad_norm": 2.3911163806915283, + "learning_rate": 4.9118519629615886e-05, + "loss": 4.1959, + "step": 14256 + }, + { + "epoch": 0.08479041773717766, + "grad_norm": 2.5100319385528564, + "learning_rate": 4.9118396684204005e-05, + "loss": 4.3845, + "step": 14257 + }, + { + "epoch": 0.08479636502045866, + "grad_norm": 2.575680732727051, + "learning_rate": 4.911827373037264e-05, + "loss": 4.1927, + "step": 14258 + }, + { + "epoch": 0.08480231230373965, + "grad_norm": 2.64941143989563, + "learning_rate": 4.9118150768121837e-05, + "loss": 4.2398, + "step": 14259 + }, + { + "epoch": 0.08480825958702065, + "grad_norm": 3.4619154930114746, + "learning_rate": 4.911802779745163e-05, + "loss": 5.9141, + "step": 14260 + }, + { + "epoch": 0.08481420687030164, + "grad_norm": 2.5471723079681396, + "learning_rate": 4.911790481836208e-05, + "loss": 4.1887, + "step": 14261 + }, + { + "epoch": 0.08482015415358264, + "grad_norm": 2.9113502502441406, + "learning_rate": 4.911778183085321e-05, + "loss": 4.3556, + "step": 14262 + }, + { + "epoch": 0.08482610143686364, + "grad_norm": 2.5952084064483643, + "learning_rate": 4.9117658834925076e-05, + "loss": 5.0408, + "step": 14263 + }, + { + "epoch": 0.08483204872014463, + "grad_norm": 2.60726261138916, + "learning_rate": 4.911753583057771e-05, + "loss": 5.5094, + "step": 14264 + }, + { + "epoch": 0.08483799600342563, + "grad_norm": 1.9005889892578125, + "learning_rate": 4.911741281781117e-05, + "loss": 5.2637, + "step": 14265 + }, + { + "epoch": 0.08484394328670664, + "grad_norm": 1.6408629417419434, + "learning_rate": 4.911728979662549e-05, + "loss": 5.4722, + "step": 14266 + }, + { + "epoch": 0.08484989056998762, + "grad_norm": 1.840955376625061, + "learning_rate": 4.911716676702071e-05, + "loss": 5.5073, + "step": 14267 + }, + { + "epoch": 0.08485583785326863, + "grad_norm": 1.8430123329162598, + "learning_rate": 4.911704372899687e-05, + "loss": 6.0372, + "step": 14268 + }, + { + "epoch": 0.08486178513654963, + "grad_norm": 3.2100231647491455, + "learning_rate": 4.911692068255402e-05, + "loss": 5.0497, + "step": 14269 + }, + { + "epoch": 0.08486773241983062, + "grad_norm": 3.191558837890625, + "learning_rate": 4.911679762769221e-05, + "loss": 5.0467, + "step": 14270 + }, + { + "epoch": 0.08487367970311162, + "grad_norm": 3.04190731048584, + "learning_rate": 4.911667456441148e-05, + "loss": 4.8008, + "step": 14271 + }, + { + "epoch": 0.08487962698639262, + "grad_norm": 2.6688694953918457, + "learning_rate": 4.911655149271186e-05, + "loss": 4.722, + "step": 14272 + }, + { + "epoch": 0.08488557426967361, + "grad_norm": 2.1458704471588135, + "learning_rate": 4.9116428412593394e-05, + "loss": 4.788, + "step": 14273 + }, + { + "epoch": 0.08489152155295461, + "grad_norm": 2.345972776412964, + "learning_rate": 4.911630532405615e-05, + "loss": 4.7955, + "step": 14274 + }, + { + "epoch": 0.08489746883623561, + "grad_norm": 2.2022581100463867, + "learning_rate": 4.911618222710014e-05, + "loss": 4.815, + "step": 14275 + }, + { + "epoch": 0.0849034161195166, + "grad_norm": 2.311004877090454, + "learning_rate": 4.911605912172542e-05, + "loss": 4.8632, + "step": 14276 + }, + { + "epoch": 0.0849093634027976, + "grad_norm": 2.5007429122924805, + "learning_rate": 4.911593600793204e-05, + "loss": 4.7273, + "step": 14277 + }, + { + "epoch": 0.0849153106860786, + "grad_norm": 2.257115364074707, + "learning_rate": 4.9115812885720026e-05, + "loss": 4.9697, + "step": 14278 + }, + { + "epoch": 0.08492125796935959, + "grad_norm": 2.7667057514190674, + "learning_rate": 4.9115689755089436e-05, + "loss": 5.1607, + "step": 14279 + }, + { + "epoch": 0.0849272052526406, + "grad_norm": 2.4240612983703613, + "learning_rate": 4.911556661604031e-05, + "loss": 4.9873, + "step": 14280 + }, + { + "epoch": 0.0849331525359216, + "grad_norm": 1.9951629638671875, + "learning_rate": 4.911544346857269e-05, + "loss": 4.9961, + "step": 14281 + }, + { + "epoch": 0.08493909981920258, + "grad_norm": 1.8532124757766724, + "learning_rate": 4.9115320312686605e-05, + "loss": 4.9467, + "step": 14282 + }, + { + "epoch": 0.08494504710248359, + "grad_norm": 2.41200590133667, + "learning_rate": 4.9115197148382126e-05, + "loss": 4.9865, + "step": 14283 + }, + { + "epoch": 0.08495099438576459, + "grad_norm": 2.2735655307769775, + "learning_rate": 4.911507397565928e-05, + "loss": 4.9223, + "step": 14284 + }, + { + "epoch": 0.08495694166904558, + "grad_norm": 2.29052734375, + "learning_rate": 4.91149507945181e-05, + "loss": 4.9479, + "step": 14285 + }, + { + "epoch": 0.08496288895232658, + "grad_norm": 2.71832275390625, + "learning_rate": 4.911482760495865e-05, + "loss": 4.9537, + "step": 14286 + }, + { + "epoch": 0.08496883623560758, + "grad_norm": 2.1351630687713623, + "learning_rate": 4.911470440698096e-05, + "loss": 5.3776, + "step": 14287 + }, + { + "epoch": 0.08497478351888857, + "grad_norm": 2.514810085296631, + "learning_rate": 4.9114581200585066e-05, + "loss": 5.6067, + "step": 14288 + }, + { + "epoch": 0.08498073080216957, + "grad_norm": 1.787312626838684, + "learning_rate": 4.9114457985771036e-05, + "loss": 5.4929, + "step": 14289 + }, + { + "epoch": 0.08498667808545056, + "grad_norm": 1.7784658670425415, + "learning_rate": 4.911433476253889e-05, + "loss": 5.5471, + "step": 14290 + }, + { + "epoch": 0.08499262536873156, + "grad_norm": 1.6120775938034058, + "learning_rate": 4.9114211530888676e-05, + "loss": 5.5455, + "step": 14291 + }, + { + "epoch": 0.08499857265201256, + "grad_norm": 1.6809823513031006, + "learning_rate": 4.9114088290820446e-05, + "loss": 5.7674, + "step": 14292 + }, + { + "epoch": 0.08500451993529355, + "grad_norm": 1.784569501876831, + "learning_rate": 4.9113965042334234e-05, + "loss": 5.554, + "step": 14293 + }, + { + "epoch": 0.08501046721857455, + "grad_norm": 1.8622018098831177, + "learning_rate": 4.9113841785430094e-05, + "loss": 5.5718, + "step": 14294 + }, + { + "epoch": 0.08501641450185556, + "grad_norm": 1.8970091342926025, + "learning_rate": 4.911371852010805e-05, + "loss": 5.6398, + "step": 14295 + }, + { + "epoch": 0.08502236178513654, + "grad_norm": 1.9560039043426514, + "learning_rate": 4.911359524636816e-05, + "loss": 5.3627, + "step": 14296 + }, + { + "epoch": 0.08502830906841755, + "grad_norm": 1.7574408054351807, + "learning_rate": 4.911347196421046e-05, + "loss": 5.6245, + "step": 14297 + }, + { + "epoch": 0.08503425635169855, + "grad_norm": 2.0868546962738037, + "learning_rate": 4.9113348673635004e-05, + "loss": 5.6092, + "step": 14298 + }, + { + "epoch": 0.08504020363497954, + "grad_norm": 2.1157326698303223, + "learning_rate": 4.9113225374641816e-05, + "loss": 5.0796, + "step": 14299 + }, + { + "epoch": 0.08504615091826054, + "grad_norm": 1.7721058130264282, + "learning_rate": 4.911310206723096e-05, + "loss": 5.148, + "step": 14300 + }, + { + "epoch": 0.08505209820154154, + "grad_norm": 1.586799144744873, + "learning_rate": 4.911297875140246e-05, + "loss": 5.5425, + "step": 14301 + }, + { + "epoch": 0.08505804548482253, + "grad_norm": 1.9669803380966187, + "learning_rate": 4.9112855427156376e-05, + "loss": 5.1675, + "step": 14302 + }, + { + "epoch": 0.08506399276810353, + "grad_norm": 2.279446601867676, + "learning_rate": 4.911273209449274e-05, + "loss": 5.8068, + "step": 14303 + }, + { + "epoch": 0.08506994005138453, + "grad_norm": 2.036482572555542, + "learning_rate": 4.9112608753411605e-05, + "loss": 5.3995, + "step": 14304 + }, + { + "epoch": 0.08507588733466552, + "grad_norm": 1.833946704864502, + "learning_rate": 4.9112485403913e-05, + "loss": 6.069, + "step": 14305 + }, + { + "epoch": 0.08508183461794652, + "grad_norm": 1.6984084844589233, + "learning_rate": 4.9112362045996976e-05, + "loss": 5.7842, + "step": 14306 + }, + { + "epoch": 0.08508778190122752, + "grad_norm": 1.6729326248168945, + "learning_rate": 4.911223867966358e-05, + "loss": 5.5225, + "step": 14307 + }, + { + "epoch": 0.08509372918450851, + "grad_norm": 2.046747922897339, + "learning_rate": 4.911211530491284e-05, + "loss": 4.967, + "step": 14308 + }, + { + "epoch": 0.08509967646778951, + "grad_norm": 1.967058539390564, + "learning_rate": 4.911199192174482e-05, + "loss": 5.8046, + "step": 14309 + }, + { + "epoch": 0.08510562375107052, + "grad_norm": 1.8341583013534546, + "learning_rate": 4.911186853015955e-05, + "loss": 4.8317, + "step": 14310 + }, + { + "epoch": 0.0851115710343515, + "grad_norm": 1.9655890464782715, + "learning_rate": 4.911174513015707e-05, + "loss": 4.6122, + "step": 14311 + }, + { + "epoch": 0.0851175183176325, + "grad_norm": 1.7953969240188599, + "learning_rate": 4.9111621721737445e-05, + "loss": 5.3151, + "step": 14312 + }, + { + "epoch": 0.08512346560091351, + "grad_norm": 1.7074720859527588, + "learning_rate": 4.9111498304900684e-05, + "loss": 5.337, + "step": 14313 + }, + { + "epoch": 0.0851294128841945, + "grad_norm": 1.8258756399154663, + "learning_rate": 4.9111374879646854e-05, + "loss": 5.3245, + "step": 14314 + }, + { + "epoch": 0.0851353601674755, + "grad_norm": 1.731689691543579, + "learning_rate": 4.9111251445976e-05, + "loss": 5.149, + "step": 14315 + }, + { + "epoch": 0.0851413074507565, + "grad_norm": 1.9083631038665771, + "learning_rate": 4.9111128003888154e-05, + "loss": 5.2409, + "step": 14316 + }, + { + "epoch": 0.08514725473403749, + "grad_norm": 1.739311933517456, + "learning_rate": 4.911100455338336e-05, + "loss": 5.0946, + "step": 14317 + }, + { + "epoch": 0.08515320201731849, + "grad_norm": 1.6812219619750977, + "learning_rate": 4.9110881094461655e-05, + "loss": 5.3062, + "step": 14318 + }, + { + "epoch": 0.08515914930059948, + "grad_norm": 1.8215876817703247, + "learning_rate": 4.9110757627123096e-05, + "loss": 5.5774, + "step": 14319 + }, + { + "epoch": 0.08516509658388048, + "grad_norm": 1.9548031091690063, + "learning_rate": 4.9110634151367725e-05, + "loss": 5.7895, + "step": 14320 + }, + { + "epoch": 0.08517104386716148, + "grad_norm": 2.266925096511841, + "learning_rate": 4.911051066719558e-05, + "loss": 4.6526, + "step": 14321 + }, + { + "epoch": 0.08517699115044247, + "grad_norm": 2.304807424545288, + "learning_rate": 4.9110387174606695e-05, + "loss": 5.2573, + "step": 14322 + }, + { + "epoch": 0.08518293843372347, + "grad_norm": 2.019482135772705, + "learning_rate": 4.911026367360114e-05, + "loss": 5.2739, + "step": 14323 + }, + { + "epoch": 0.08518888571700448, + "grad_norm": 2.0559775829315186, + "learning_rate": 4.911014016417893e-05, + "loss": 5.7166, + "step": 14324 + }, + { + "epoch": 0.08519483300028546, + "grad_norm": 2.0565741062164307, + "learning_rate": 4.911001664634012e-05, + "loss": 5.6359, + "step": 14325 + }, + { + "epoch": 0.08520078028356647, + "grad_norm": 1.8766587972640991, + "learning_rate": 4.910989312008475e-05, + "loss": 5.2667, + "step": 14326 + }, + { + "epoch": 0.08520672756684747, + "grad_norm": 1.669317364692688, + "learning_rate": 4.910976958541287e-05, + "loss": 5.7565, + "step": 14327 + }, + { + "epoch": 0.08521267485012846, + "grad_norm": 1.9138641357421875, + "learning_rate": 4.910964604232452e-05, + "loss": 5.9362, + "step": 14328 + }, + { + "epoch": 0.08521862213340946, + "grad_norm": 1.740892767906189, + "learning_rate": 4.9109522490819734e-05, + "loss": 5.6964, + "step": 14329 + }, + { + "epoch": 0.08522456941669046, + "grad_norm": 1.788825511932373, + "learning_rate": 4.9109398930898576e-05, + "loss": 5.4266, + "step": 14330 + }, + { + "epoch": 0.08523051669997145, + "grad_norm": 2.035877227783203, + "learning_rate": 4.910927536256106e-05, + "loss": 5.5609, + "step": 14331 + }, + { + "epoch": 0.08523646398325245, + "grad_norm": 2.078150987625122, + "learning_rate": 4.9109151785807265e-05, + "loss": 5.0074, + "step": 14332 + }, + { + "epoch": 0.08524241126653345, + "grad_norm": 2.601290225982666, + "learning_rate": 4.91090282006372e-05, + "loss": 5.2021, + "step": 14333 + }, + { + "epoch": 0.08524835854981444, + "grad_norm": 1.7069159746170044, + "learning_rate": 4.910890460705092e-05, + "loss": 5.0313, + "step": 14334 + }, + { + "epoch": 0.08525430583309544, + "grad_norm": 1.8937885761260986, + "learning_rate": 4.9108781005048473e-05, + "loss": 4.6001, + "step": 14335 + }, + { + "epoch": 0.08526025311637644, + "grad_norm": 2.3120486736297607, + "learning_rate": 4.91086573946299e-05, + "loss": 4.4027, + "step": 14336 + }, + { + "epoch": 0.08526620039965743, + "grad_norm": 2.064420223236084, + "learning_rate": 4.910853377579524e-05, + "loss": 4.8853, + "step": 14337 + }, + { + "epoch": 0.08527214768293843, + "grad_norm": 1.80779230594635, + "learning_rate": 4.910841014854455e-05, + "loss": 5.5493, + "step": 14338 + }, + { + "epoch": 0.08527809496621944, + "grad_norm": 1.6364500522613525, + "learning_rate": 4.910828651287786e-05, + "loss": 5.6569, + "step": 14339 + }, + { + "epoch": 0.08528404224950042, + "grad_norm": 1.7472214698791504, + "learning_rate": 4.910816286879522e-05, + "loss": 5.4057, + "step": 14340 + }, + { + "epoch": 0.08528998953278143, + "grad_norm": 1.6311333179473877, + "learning_rate": 4.910803921629666e-05, + "loss": 5.8406, + "step": 14341 + }, + { + "epoch": 0.08529593681606243, + "grad_norm": 2.2367610931396484, + "learning_rate": 4.9107915555382236e-05, + "loss": 4.9339, + "step": 14342 + }, + { + "epoch": 0.08530188409934342, + "grad_norm": 2.033160924911499, + "learning_rate": 4.910779188605199e-05, + "loss": 4.8923, + "step": 14343 + }, + { + "epoch": 0.08530783138262442, + "grad_norm": 1.852645993232727, + "learning_rate": 4.910766820830596e-05, + "loss": 5.2208, + "step": 14344 + }, + { + "epoch": 0.08531377866590542, + "grad_norm": 1.9810596704483032, + "learning_rate": 4.910754452214419e-05, + "loss": 5.0119, + "step": 14345 + }, + { + "epoch": 0.08531972594918641, + "grad_norm": 1.92807137966156, + "learning_rate": 4.910742082756673e-05, + "loss": 5.6388, + "step": 14346 + }, + { + "epoch": 0.08532567323246741, + "grad_norm": 1.783923864364624, + "learning_rate": 4.910729712457361e-05, + "loss": 5.2831, + "step": 14347 + }, + { + "epoch": 0.0853316205157484, + "grad_norm": 2.008113145828247, + "learning_rate": 4.91071734131649e-05, + "loss": 5.085, + "step": 14348 + }, + { + "epoch": 0.0853375677990294, + "grad_norm": 2.2313408851623535, + "learning_rate": 4.910704969334061e-05, + "loss": 5.243, + "step": 14349 + }, + { + "epoch": 0.0853435150823104, + "grad_norm": 2.155491590499878, + "learning_rate": 4.9106925965100806e-05, + "loss": 6.0776, + "step": 14350 + }, + { + "epoch": 0.08534946236559139, + "grad_norm": 1.995848536491394, + "learning_rate": 4.910680222844551e-05, + "loss": 5.6763, + "step": 14351 + }, + { + "epoch": 0.0853554096488724, + "grad_norm": 2.033620595932007, + "learning_rate": 4.910667848337479e-05, + "loss": 4.4634, + "step": 14352 + }, + { + "epoch": 0.0853613569321534, + "grad_norm": 2.036668062210083, + "learning_rate": 4.910655472988868e-05, + "loss": 4.6367, + "step": 14353 + }, + { + "epoch": 0.08536730421543438, + "grad_norm": 1.9862895011901855, + "learning_rate": 4.910643096798721e-05, + "loss": 4.4623, + "step": 14354 + }, + { + "epoch": 0.08537325149871539, + "grad_norm": 1.9778163433074951, + "learning_rate": 4.910630719767044e-05, + "loss": 4.3706, + "step": 14355 + }, + { + "epoch": 0.08537919878199639, + "grad_norm": 1.984913945198059, + "learning_rate": 4.9106183418938404e-05, + "loss": 4.4573, + "step": 14356 + }, + { + "epoch": 0.08538514606527738, + "grad_norm": 2.0571017265319824, + "learning_rate": 4.910605963179116e-05, + "loss": 4.2782, + "step": 14357 + }, + { + "epoch": 0.08539109334855838, + "grad_norm": 2.028339147567749, + "learning_rate": 4.910593583622872e-05, + "loss": 4.3874, + "step": 14358 + }, + { + "epoch": 0.08539704063183938, + "grad_norm": 2.03485369682312, + "learning_rate": 4.9105812032251165e-05, + "loss": 4.5877, + "step": 14359 + }, + { + "epoch": 0.08540298791512037, + "grad_norm": 1.950490951538086, + "learning_rate": 4.910568821985851e-05, + "loss": 4.6547, + "step": 14360 + }, + { + "epoch": 0.08540893519840137, + "grad_norm": 2.1270785331726074, + "learning_rate": 4.910556439905081e-05, + "loss": 5.3685, + "step": 14361 + }, + { + "epoch": 0.08541488248168237, + "grad_norm": 2.094545364379883, + "learning_rate": 4.910544056982811e-05, + "loss": 6.1109, + "step": 14362 + }, + { + "epoch": 0.08542082976496336, + "grad_norm": 2.2988197803497314, + "learning_rate": 4.910531673219044e-05, + "loss": 5.4789, + "step": 14363 + }, + { + "epoch": 0.08542677704824436, + "grad_norm": 2.2927358150482178, + "learning_rate": 4.910519288613786e-05, + "loss": 5.3853, + "step": 14364 + }, + { + "epoch": 0.08543272433152536, + "grad_norm": 2.223668098449707, + "learning_rate": 4.910506903167041e-05, + "loss": 5.3572, + "step": 14365 + }, + { + "epoch": 0.08543867161480635, + "grad_norm": 2.0522570610046387, + "learning_rate": 4.910494516878813e-05, + "loss": 5.3581, + "step": 14366 + }, + { + "epoch": 0.08544461889808735, + "grad_norm": 2.4349021911621094, + "learning_rate": 4.910482129749106e-05, + "loss": 5.4082, + "step": 14367 + }, + { + "epoch": 0.08545056618136836, + "grad_norm": 1.976344347000122, + "learning_rate": 4.910469741777924e-05, + "loss": 5.6107, + "step": 14368 + }, + { + "epoch": 0.08545651346464934, + "grad_norm": 1.8476877212524414, + "learning_rate": 4.910457352965272e-05, + "loss": 5.5059, + "step": 14369 + }, + { + "epoch": 0.08546246074793035, + "grad_norm": 1.6204098463058472, + "learning_rate": 4.910444963311155e-05, + "loss": 5.6578, + "step": 14370 + }, + { + "epoch": 0.08546840803121135, + "grad_norm": 1.808021903038025, + "learning_rate": 4.910432572815576e-05, + "loss": 5.8263, + "step": 14371 + }, + { + "epoch": 0.08547435531449234, + "grad_norm": 1.4975682497024536, + "learning_rate": 4.91042018147854e-05, + "loss": 5.582, + "step": 14372 + }, + { + "epoch": 0.08548030259777334, + "grad_norm": 1.644845724105835, + "learning_rate": 4.910407789300051e-05, + "loss": 5.7127, + "step": 14373 + }, + { + "epoch": 0.08548624988105434, + "grad_norm": 1.5433874130249023, + "learning_rate": 4.910395396280114e-05, + "loss": 5.6941, + "step": 14374 + }, + { + "epoch": 0.08549219716433533, + "grad_norm": 1.7267838716506958, + "learning_rate": 4.910383002418732e-05, + "loss": 5.632, + "step": 14375 + }, + { + "epoch": 0.08549814444761633, + "grad_norm": 1.4142215251922607, + "learning_rate": 4.9103706077159116e-05, + "loss": 5.6108, + "step": 14376 + }, + { + "epoch": 0.08550409173089732, + "grad_norm": 1.8514180183410645, + "learning_rate": 4.9103582121716554e-05, + "loss": 5.828, + "step": 14377 + }, + { + "epoch": 0.08551003901417832, + "grad_norm": 1.633837103843689, + "learning_rate": 4.9103458157859674e-05, + "loss": 5.8585, + "step": 14378 + }, + { + "epoch": 0.08551598629745932, + "grad_norm": 1.9934178590774536, + "learning_rate": 4.910333418558853e-05, + "loss": 5.5907, + "step": 14379 + }, + { + "epoch": 0.08552193358074031, + "grad_norm": 1.8934741020202637, + "learning_rate": 4.910321020490316e-05, + "loss": 5.579, + "step": 14380 + }, + { + "epoch": 0.08552788086402131, + "grad_norm": 1.9341318607330322, + "learning_rate": 4.910308621580361e-05, + "loss": 5.8737, + "step": 14381 + }, + { + "epoch": 0.08553382814730232, + "grad_norm": 2.1566226482391357, + "learning_rate": 4.9102962218289915e-05, + "loss": 5.6105, + "step": 14382 + }, + { + "epoch": 0.0855397754305833, + "grad_norm": 1.707112431526184, + "learning_rate": 4.910283821236213e-05, + "loss": 5.6875, + "step": 14383 + }, + { + "epoch": 0.0855457227138643, + "grad_norm": 2.8415439128875732, + "learning_rate": 4.9102714198020296e-05, + "loss": 4.9292, + "step": 14384 + }, + { + "epoch": 0.08555166999714531, + "grad_norm": 2.2043650150299072, + "learning_rate": 4.9102590175264445e-05, + "loss": 5.7264, + "step": 14385 + }, + { + "epoch": 0.0855576172804263, + "grad_norm": 2.2063820362091064, + "learning_rate": 4.9102466144094636e-05, + "loss": 5.1616, + "step": 14386 + }, + { + "epoch": 0.0855635645637073, + "grad_norm": 1.9087328910827637, + "learning_rate": 4.9102342104510903e-05, + "loss": 5.1897, + "step": 14387 + }, + { + "epoch": 0.0855695118469883, + "grad_norm": 1.6418956518173218, + "learning_rate": 4.910221805651329e-05, + "loss": 5.0923, + "step": 14388 + }, + { + "epoch": 0.08557545913026929, + "grad_norm": 1.5215847492218018, + "learning_rate": 4.9102094000101836e-05, + "loss": 4.9602, + "step": 14389 + }, + { + "epoch": 0.08558140641355029, + "grad_norm": 2.249983072280884, + "learning_rate": 4.91019699352766e-05, + "loss": 5.1167, + "step": 14390 + }, + { + "epoch": 0.08558735369683129, + "grad_norm": 1.89960777759552, + "learning_rate": 4.9101845862037615e-05, + "loss": 6.1589, + "step": 14391 + }, + { + "epoch": 0.08559330098011228, + "grad_norm": 1.8243924379348755, + "learning_rate": 4.910172178038492e-05, + "loss": 5.8661, + "step": 14392 + }, + { + "epoch": 0.08559924826339328, + "grad_norm": 1.8313872814178467, + "learning_rate": 4.9101597690318567e-05, + "loss": 5.6129, + "step": 14393 + }, + { + "epoch": 0.08560519554667428, + "grad_norm": 1.8717663288116455, + "learning_rate": 4.9101473591838593e-05, + "loss": 5.6346, + "step": 14394 + }, + { + "epoch": 0.08561114282995527, + "grad_norm": 1.6444953680038452, + "learning_rate": 4.910134948494504e-05, + "loss": 5.7237, + "step": 14395 + }, + { + "epoch": 0.08561709011323627, + "grad_norm": 1.8138811588287354, + "learning_rate": 4.910122536963796e-05, + "loss": 5.7682, + "step": 14396 + }, + { + "epoch": 0.08562303739651728, + "grad_norm": 2.629892110824585, + "learning_rate": 4.9101101245917394e-05, + "loss": 5.89, + "step": 14397 + }, + { + "epoch": 0.08562898467979826, + "grad_norm": 1.8197498321533203, + "learning_rate": 4.910097711378337e-05, + "loss": 5.6768, + "step": 14398 + }, + { + "epoch": 0.08563493196307927, + "grad_norm": 2.1121623516082764, + "learning_rate": 4.9100852973235955e-05, + "loss": 5.672, + "step": 14399 + }, + { + "epoch": 0.08564087924636027, + "grad_norm": 1.8823927640914917, + "learning_rate": 4.910072882427518e-05, + "loss": 5.6717, + "step": 14400 + }, + { + "epoch": 0.08564682652964126, + "grad_norm": 2.602023124694824, + "learning_rate": 4.9100604666901084e-05, + "loss": 5.4193, + "step": 14401 + }, + { + "epoch": 0.08565277381292226, + "grad_norm": 2.420342445373535, + "learning_rate": 4.910048050111372e-05, + "loss": 5.2811, + "step": 14402 + }, + { + "epoch": 0.08565872109620326, + "grad_norm": 2.593797206878662, + "learning_rate": 4.910035632691313e-05, + "loss": 5.2942, + "step": 14403 + }, + { + "epoch": 0.08566466837948425, + "grad_norm": 1.9292038679122925, + "learning_rate": 4.910023214429935e-05, + "loss": 5.0231, + "step": 14404 + }, + { + "epoch": 0.08567061566276525, + "grad_norm": 2.159935712814331, + "learning_rate": 4.9100107953272434e-05, + "loss": 4.8778, + "step": 14405 + }, + { + "epoch": 0.08567656294604625, + "grad_norm": 2.2363314628601074, + "learning_rate": 4.9099983753832416e-05, + "loss": 4.8828, + "step": 14406 + }, + { + "epoch": 0.08568251022932724, + "grad_norm": 2.149986505508423, + "learning_rate": 4.909985954597934e-05, + "loss": 5.4351, + "step": 14407 + }, + { + "epoch": 0.08568845751260824, + "grad_norm": 2.05991268157959, + "learning_rate": 4.909973532971325e-05, + "loss": 5.3759, + "step": 14408 + }, + { + "epoch": 0.08569440479588923, + "grad_norm": 2.0030369758605957, + "learning_rate": 4.9099611105034196e-05, + "loss": 5.5126, + "step": 14409 + }, + { + "epoch": 0.08570035207917023, + "grad_norm": 1.7764592170715332, + "learning_rate": 4.9099486871942216e-05, + "loss": 5.1808, + "step": 14410 + }, + { + "epoch": 0.08570629936245124, + "grad_norm": 1.8827999830245972, + "learning_rate": 4.909936263043735e-05, + "loss": 5.5076, + "step": 14411 + }, + { + "epoch": 0.08571224664573222, + "grad_norm": 2.0153589248657227, + "learning_rate": 4.9099238380519655e-05, + "loss": 5.2955, + "step": 14412 + }, + { + "epoch": 0.08571819392901323, + "grad_norm": 2.0739622116088867, + "learning_rate": 4.909911412218916e-05, + "loss": 5.2463, + "step": 14413 + }, + { + "epoch": 0.08572414121229423, + "grad_norm": 2.4668188095092773, + "learning_rate": 4.909898985544591e-05, + "loss": 5.1859, + "step": 14414 + }, + { + "epoch": 0.08573008849557522, + "grad_norm": 2.245546340942383, + "learning_rate": 4.9098865580289956e-05, + "loss": 5.5472, + "step": 14415 + }, + { + "epoch": 0.08573603577885622, + "grad_norm": 2.244086980819702, + "learning_rate": 4.909874129672133e-05, + "loss": 5.5531, + "step": 14416 + }, + { + "epoch": 0.08574198306213722, + "grad_norm": 2.2983627319335938, + "learning_rate": 4.909861700474009e-05, + "loss": 5.6178, + "step": 14417 + }, + { + "epoch": 0.08574793034541821, + "grad_norm": 1.9792771339416504, + "learning_rate": 4.9098492704346265e-05, + "loss": 5.364, + "step": 14418 + }, + { + "epoch": 0.08575387762869921, + "grad_norm": 1.8312867879867554, + "learning_rate": 4.9098368395539914e-05, + "loss": 5.3105, + "step": 14419 + }, + { + "epoch": 0.08575982491198021, + "grad_norm": 1.8415101766586304, + "learning_rate": 4.909824407832107e-05, + "loss": 5.3182, + "step": 14420 + }, + { + "epoch": 0.0857657721952612, + "grad_norm": 1.965531349182129, + "learning_rate": 4.909811975268977e-05, + "loss": 5.496, + "step": 14421 + }, + { + "epoch": 0.0857717194785422, + "grad_norm": 1.9116218090057373, + "learning_rate": 4.909799541864607e-05, + "loss": 5.2531, + "step": 14422 + }, + { + "epoch": 0.0857776667618232, + "grad_norm": 1.863571286201477, + "learning_rate": 4.909787107619001e-05, + "loss": 5.535, + "step": 14423 + }, + { + "epoch": 0.08578361404510419, + "grad_norm": 1.966637372970581, + "learning_rate": 4.909774672532163e-05, + "loss": 5.5072, + "step": 14424 + }, + { + "epoch": 0.0857895613283852, + "grad_norm": 1.9251974821090698, + "learning_rate": 4.9097622366040974e-05, + "loss": 5.1989, + "step": 14425 + }, + { + "epoch": 0.0857955086116662, + "grad_norm": 1.6277741193771362, + "learning_rate": 4.90974979983481e-05, + "loss": 5.357, + "step": 14426 + }, + { + "epoch": 0.08580145589494718, + "grad_norm": 1.6832202672958374, + "learning_rate": 4.909737362224302e-05, + "loss": 5.3485, + "step": 14427 + }, + { + "epoch": 0.08580740317822819, + "grad_norm": 1.7656053304672241, + "learning_rate": 4.909724923772581e-05, + "loss": 5.3965, + "step": 14428 + }, + { + "epoch": 0.08581335046150919, + "grad_norm": 1.748529076576233, + "learning_rate": 4.909712484479649e-05, + "loss": 5.3895, + "step": 14429 + }, + { + "epoch": 0.08581929774479018, + "grad_norm": 2.1317241191864014, + "learning_rate": 4.909700044345511e-05, + "loss": 5.1703, + "step": 14430 + }, + { + "epoch": 0.08582524502807118, + "grad_norm": 2.6896255016326904, + "learning_rate": 4.909687603370172e-05, + "loss": 5.3942, + "step": 14431 + }, + { + "epoch": 0.08583119231135218, + "grad_norm": 2.1061718463897705, + "learning_rate": 4.909675161553637e-05, + "loss": 5.3545, + "step": 14432 + }, + { + "epoch": 0.08583713959463317, + "grad_norm": 2.7201108932495117, + "learning_rate": 4.9096627188959085e-05, + "loss": 4.9659, + "step": 14433 + }, + { + "epoch": 0.08584308687791417, + "grad_norm": 2.0352578163146973, + "learning_rate": 4.909650275396991e-05, + "loss": 5.2667, + "step": 14434 + }, + { + "epoch": 0.08584903416119517, + "grad_norm": 1.6980863809585571, + "learning_rate": 4.9096378310568905e-05, + "loss": 5.4036, + "step": 14435 + }, + { + "epoch": 0.08585498144447616, + "grad_norm": 1.677700161933899, + "learning_rate": 4.90962538587561e-05, + "loss": 5.3104, + "step": 14436 + }, + { + "epoch": 0.08586092872775716, + "grad_norm": 1.995198369026184, + "learning_rate": 4.9096129398531534e-05, + "loss": 5.4235, + "step": 14437 + }, + { + "epoch": 0.08586687601103815, + "grad_norm": 2.136059284210205, + "learning_rate": 4.909600492989527e-05, + "loss": 5.1867, + "step": 14438 + }, + { + "epoch": 0.08587282329431915, + "grad_norm": 1.9917269945144653, + "learning_rate": 4.909588045284733e-05, + "loss": 5.5507, + "step": 14439 + }, + { + "epoch": 0.08587877057760016, + "grad_norm": 1.7341989278793335, + "learning_rate": 4.909575596738777e-05, + "loss": 5.4782, + "step": 14440 + }, + { + "epoch": 0.08588471786088114, + "grad_norm": 2.058920383453369, + "learning_rate": 4.9095631473516635e-05, + "loss": 5.51, + "step": 14441 + }, + { + "epoch": 0.08589066514416215, + "grad_norm": 1.7856314182281494, + "learning_rate": 4.9095506971233965e-05, + "loss": 5.4189, + "step": 14442 + }, + { + "epoch": 0.08589661242744315, + "grad_norm": 1.5290231704711914, + "learning_rate": 4.90953824605398e-05, + "loss": 5.4398, + "step": 14443 + }, + { + "epoch": 0.08590255971072414, + "grad_norm": 1.6302571296691895, + "learning_rate": 4.909525794143418e-05, + "loss": 5.4468, + "step": 14444 + }, + { + "epoch": 0.08590850699400514, + "grad_norm": 1.9898178577423096, + "learning_rate": 4.909513341391716e-05, + "loss": 5.5514, + "step": 14445 + }, + { + "epoch": 0.08591445427728614, + "grad_norm": 2.539473533630371, + "learning_rate": 4.909500887798878e-05, + "loss": 5.0985, + "step": 14446 + }, + { + "epoch": 0.08592040156056713, + "grad_norm": 2.109477996826172, + "learning_rate": 4.909488433364907e-05, + "loss": 5.1304, + "step": 14447 + }, + { + "epoch": 0.08592634884384813, + "grad_norm": 1.627647042274475, + "learning_rate": 4.9094759780898096e-05, + "loss": 5.7772, + "step": 14448 + }, + { + "epoch": 0.08593229612712913, + "grad_norm": 1.7776944637298584, + "learning_rate": 4.909463521973588e-05, + "loss": 6.3219, + "step": 14449 + }, + { + "epoch": 0.08593824341041012, + "grad_norm": 1.8342489004135132, + "learning_rate": 4.909451065016249e-05, + "loss": 5.7136, + "step": 14450 + }, + { + "epoch": 0.08594419069369112, + "grad_norm": 2.109060764312744, + "learning_rate": 4.9094386072177945e-05, + "loss": 5.449, + "step": 14451 + }, + { + "epoch": 0.08595013797697212, + "grad_norm": 2.5615251064300537, + "learning_rate": 4.909426148578231e-05, + "loss": 4.7441, + "step": 14452 + }, + { + "epoch": 0.08595608526025311, + "grad_norm": 1.7670586109161377, + "learning_rate": 4.909413689097561e-05, + "loss": 5.4488, + "step": 14453 + }, + { + "epoch": 0.08596203254353411, + "grad_norm": 1.9190126657485962, + "learning_rate": 4.909401228775789e-05, + "loss": 5.3128, + "step": 14454 + }, + { + "epoch": 0.08596797982681512, + "grad_norm": 1.679866909980774, + "learning_rate": 4.90938876761292e-05, + "loss": 5.4575, + "step": 14455 + }, + { + "epoch": 0.0859739271100961, + "grad_norm": 1.6199991703033447, + "learning_rate": 4.909376305608959e-05, + "loss": 5.541, + "step": 14456 + }, + { + "epoch": 0.0859798743933771, + "grad_norm": 1.876761794090271, + "learning_rate": 4.9093638427639096e-05, + "loss": 5.7256, + "step": 14457 + }, + { + "epoch": 0.08598582167665811, + "grad_norm": 1.7833212614059448, + "learning_rate": 4.909351379077776e-05, + "loss": 5.6512, + "step": 14458 + }, + { + "epoch": 0.0859917689599391, + "grad_norm": 2.249696731567383, + "learning_rate": 4.909338914550562e-05, + "loss": 5.6517, + "step": 14459 + }, + { + "epoch": 0.0859977162432201, + "grad_norm": 1.8037621974945068, + "learning_rate": 4.909326449182273e-05, + "loss": 5.7564, + "step": 14460 + }, + { + "epoch": 0.0860036635265011, + "grad_norm": 1.4057918787002563, + "learning_rate": 4.909313982972914e-05, + "loss": 5.6259, + "step": 14461 + }, + { + "epoch": 0.08600961080978209, + "grad_norm": 1.5501145124435425, + "learning_rate": 4.9093015159224874e-05, + "loss": 5.6626, + "step": 14462 + }, + { + "epoch": 0.08601555809306309, + "grad_norm": 1.8189458847045898, + "learning_rate": 4.909289048030999e-05, + "loss": 5.4682, + "step": 14463 + }, + { + "epoch": 0.08602150537634409, + "grad_norm": 1.6819778680801392, + "learning_rate": 4.909276579298452e-05, + "loss": 5.3511, + "step": 14464 + }, + { + "epoch": 0.08602745265962508, + "grad_norm": 1.8401011228561401, + "learning_rate": 4.909264109724853e-05, + "loss": 5.531, + "step": 14465 + }, + { + "epoch": 0.08603339994290608, + "grad_norm": 1.6418116092681885, + "learning_rate": 4.909251639310203e-05, + "loss": 5.2885, + "step": 14466 + }, + { + "epoch": 0.08603934722618707, + "grad_norm": 1.4331059455871582, + "learning_rate": 4.909239168054509e-05, + "loss": 5.2792, + "step": 14467 + }, + { + "epoch": 0.08604529450946807, + "grad_norm": 1.4047703742980957, + "learning_rate": 4.9092266959577745e-05, + "loss": 5.2179, + "step": 14468 + }, + { + "epoch": 0.08605124179274908, + "grad_norm": 1.641930103302002, + "learning_rate": 4.909214223020003e-05, + "loss": 5.475, + "step": 14469 + }, + { + "epoch": 0.08605718907603006, + "grad_norm": 1.9879019260406494, + "learning_rate": 4.909201749241201e-05, + "loss": 5.3893, + "step": 14470 + }, + { + "epoch": 0.08606313635931107, + "grad_norm": 1.4790434837341309, + "learning_rate": 4.909189274621371e-05, + "loss": 5.3011, + "step": 14471 + }, + { + "epoch": 0.08606908364259207, + "grad_norm": 1.4283875226974487, + "learning_rate": 4.909176799160518e-05, + "loss": 5.4181, + "step": 14472 + }, + { + "epoch": 0.08607503092587306, + "grad_norm": 1.6676496267318726, + "learning_rate": 4.909164322858646e-05, + "loss": 5.4682, + "step": 14473 + }, + { + "epoch": 0.08608097820915406, + "grad_norm": 1.4858648777008057, + "learning_rate": 4.9091518457157605e-05, + "loss": 5.3073, + "step": 14474 + }, + { + "epoch": 0.08608692549243506, + "grad_norm": 1.5135246515274048, + "learning_rate": 4.909139367731864e-05, + "loss": 5.4039, + "step": 14475 + }, + { + "epoch": 0.08609287277571605, + "grad_norm": 1.353051781654358, + "learning_rate": 4.909126888906962e-05, + "loss": 5.5455, + "step": 14476 + }, + { + "epoch": 0.08609882005899705, + "grad_norm": 1.2824941873550415, + "learning_rate": 4.909114409241059e-05, + "loss": 5.6465, + "step": 14477 + }, + { + "epoch": 0.08610476734227805, + "grad_norm": 1.3398411273956299, + "learning_rate": 4.909101928734159e-05, + "loss": 5.5299, + "step": 14478 + }, + { + "epoch": 0.08611071462555904, + "grad_norm": 1.167169213294983, + "learning_rate": 4.909089447386266e-05, + "loss": 5.4376, + "step": 14479 + }, + { + "epoch": 0.08611666190884004, + "grad_norm": 1.2469842433929443, + "learning_rate": 4.9090769651973846e-05, + "loss": 5.4945, + "step": 14480 + }, + { + "epoch": 0.08612260919212104, + "grad_norm": 1.3025931119918823, + "learning_rate": 4.90906448216752e-05, + "loss": 5.3283, + "step": 14481 + }, + { + "epoch": 0.08612855647540203, + "grad_norm": 1.597223162651062, + "learning_rate": 4.909051998296675e-05, + "loss": 5.0729, + "step": 14482 + }, + { + "epoch": 0.08613450375868303, + "grad_norm": 1.53999662399292, + "learning_rate": 4.909039513584856e-05, + "loss": 5.2956, + "step": 14483 + }, + { + "epoch": 0.08614045104196404, + "grad_norm": 1.462623953819275, + "learning_rate": 4.909027028032066e-05, + "loss": 5.2748, + "step": 14484 + }, + { + "epoch": 0.08614639832524502, + "grad_norm": 1.380196452140808, + "learning_rate": 4.909014541638309e-05, + "loss": 5.4184, + "step": 14485 + }, + { + "epoch": 0.08615234560852603, + "grad_norm": 1.4531115293502808, + "learning_rate": 4.90900205440359e-05, + "loss": 5.2064, + "step": 14486 + }, + { + "epoch": 0.08615829289180703, + "grad_norm": 1.406848430633545, + "learning_rate": 4.9089895663279136e-05, + "loss": 5.2019, + "step": 14487 + }, + { + "epoch": 0.08616424017508802, + "grad_norm": 1.3956660032272339, + "learning_rate": 4.908977077411283e-05, + "loss": 5.128, + "step": 14488 + }, + { + "epoch": 0.08617018745836902, + "grad_norm": 1.4705348014831543, + "learning_rate": 4.9089645876537044e-05, + "loss": 5.3451, + "step": 14489 + }, + { + "epoch": 0.08617613474165002, + "grad_norm": 1.4385737180709839, + "learning_rate": 4.9089520970551804e-05, + "loss": 5.0668, + "step": 14490 + }, + { + "epoch": 0.08618208202493101, + "grad_norm": 1.584478735923767, + "learning_rate": 4.908939605615717e-05, + "loss": 4.9412, + "step": 14491 + }, + { + "epoch": 0.08618802930821201, + "grad_norm": 1.2740134000778198, + "learning_rate": 4.908927113335317e-05, + "loss": 4.8684, + "step": 14492 + }, + { + "epoch": 0.08619397659149301, + "grad_norm": 1.5669810771942139, + "learning_rate": 4.9089146202139856e-05, + "loss": 5.1903, + "step": 14493 + }, + { + "epoch": 0.086199923874774, + "grad_norm": 1.6113348007202148, + "learning_rate": 4.908902126251727e-05, + "loss": 5.1217, + "step": 14494 + }, + { + "epoch": 0.086205871158055, + "grad_norm": 1.6401634216308594, + "learning_rate": 4.908889631448546e-05, + "loss": 5.2241, + "step": 14495 + }, + { + "epoch": 0.08621181844133599, + "grad_norm": 1.522625207901001, + "learning_rate": 4.9088771358044456e-05, + "loss": 5.1858, + "step": 14496 + }, + { + "epoch": 0.086217765724617, + "grad_norm": 1.3802037239074707, + "learning_rate": 4.9088646393194316e-05, + "loss": 5.2349, + "step": 14497 + }, + { + "epoch": 0.086223713007898, + "grad_norm": 1.5226190090179443, + "learning_rate": 4.9088521419935076e-05, + "loss": 5.2612, + "step": 14498 + }, + { + "epoch": 0.08622966029117898, + "grad_norm": 1.3293451070785522, + "learning_rate": 4.9088396438266785e-05, + "loss": 5.169, + "step": 14499 + }, + { + "epoch": 0.08623560757445999, + "grad_norm": 1.334403157234192, + "learning_rate": 4.908827144818948e-05, + "loss": 5.1139, + "step": 14500 + }, + { + "epoch": 0.08624155485774099, + "grad_norm": 1.5195876359939575, + "learning_rate": 4.908814644970321e-05, + "loss": 5.1473, + "step": 14501 + }, + { + "epoch": 0.08624750214102198, + "grad_norm": 1.3367561101913452, + "learning_rate": 4.908802144280802e-05, + "loss": 5.1148, + "step": 14502 + }, + { + "epoch": 0.08625344942430298, + "grad_norm": 1.485002875328064, + "learning_rate": 4.908789642750395e-05, + "loss": 5.0796, + "step": 14503 + }, + { + "epoch": 0.08625939670758398, + "grad_norm": 1.3907506465911865, + "learning_rate": 4.9087771403791037e-05, + "loss": 5.1382, + "step": 14504 + }, + { + "epoch": 0.08626534399086497, + "grad_norm": 1.5129644870758057, + "learning_rate": 4.9087646371669336e-05, + "loss": 5.037, + "step": 14505 + }, + { + "epoch": 0.08627129127414597, + "grad_norm": 1.4666407108306885, + "learning_rate": 4.9087521331138896e-05, + "loss": 5.1877, + "step": 14506 + }, + { + "epoch": 0.08627723855742697, + "grad_norm": 1.5812102556228638, + "learning_rate": 4.9087396282199736e-05, + "loss": 5.2588, + "step": 14507 + }, + { + "epoch": 0.08628318584070796, + "grad_norm": 2.976067066192627, + "learning_rate": 4.908727122485193e-05, + "loss": 4.7477, + "step": 14508 + }, + { + "epoch": 0.08628913312398896, + "grad_norm": 1.5401511192321777, + "learning_rate": 4.90871461590955e-05, + "loss": 5.2242, + "step": 14509 + }, + { + "epoch": 0.08629508040726996, + "grad_norm": 1.3266774415969849, + "learning_rate": 4.9087021084930486e-05, + "loss": 5.2792, + "step": 14510 + }, + { + "epoch": 0.08630102769055095, + "grad_norm": 1.3292385339736938, + "learning_rate": 4.9086896002356956e-05, + "loss": 5.2434, + "step": 14511 + }, + { + "epoch": 0.08630697497383195, + "grad_norm": 1.237931489944458, + "learning_rate": 4.908677091137493e-05, + "loss": 5.2173, + "step": 14512 + }, + { + "epoch": 0.08631292225711296, + "grad_norm": 1.2488665580749512, + "learning_rate": 4.908664581198447e-05, + "loss": 5.1262, + "step": 14513 + }, + { + "epoch": 0.08631886954039394, + "grad_norm": 1.5126835107803345, + "learning_rate": 4.9086520704185604e-05, + "loss": 5.2258, + "step": 14514 + }, + { + "epoch": 0.08632481682367495, + "grad_norm": 1.3975410461425781, + "learning_rate": 4.908639558797839e-05, + "loss": 4.9266, + "step": 14515 + }, + { + "epoch": 0.08633076410695595, + "grad_norm": 1.2499217987060547, + "learning_rate": 4.908627046336285e-05, + "loss": 5.1564, + "step": 14516 + }, + { + "epoch": 0.08633671139023694, + "grad_norm": 1.6880254745483398, + "learning_rate": 4.908614533033905e-05, + "loss": 5.0906, + "step": 14517 + }, + { + "epoch": 0.08634265867351794, + "grad_norm": 1.498849630355835, + "learning_rate": 4.908602018890702e-05, + "loss": 5.0771, + "step": 14518 + }, + { + "epoch": 0.08634860595679894, + "grad_norm": 1.9192509651184082, + "learning_rate": 4.908589503906682e-05, + "loss": 5.2173, + "step": 14519 + }, + { + "epoch": 0.08635455324007993, + "grad_norm": 1.8038657903671265, + "learning_rate": 4.9085769880818475e-05, + "loss": 5.3003, + "step": 14520 + }, + { + "epoch": 0.08636050052336093, + "grad_norm": 1.3908354043960571, + "learning_rate": 4.9085644714162037e-05, + "loss": 5.1943, + "step": 14521 + }, + { + "epoch": 0.08636644780664193, + "grad_norm": 1.336630940437317, + "learning_rate": 4.9085519539097556e-05, + "loss": 5.2693, + "step": 14522 + }, + { + "epoch": 0.08637239508992292, + "grad_norm": 1.6008005142211914, + "learning_rate": 4.908539435562506e-05, + "loss": 5.2779, + "step": 14523 + }, + { + "epoch": 0.08637834237320392, + "grad_norm": 1.4620133638381958, + "learning_rate": 4.9085269163744605e-05, + "loss": 5.0467, + "step": 14524 + }, + { + "epoch": 0.08638428965648491, + "grad_norm": 1.5825145244598389, + "learning_rate": 4.9085143963456236e-05, + "loss": 4.9838, + "step": 14525 + }, + { + "epoch": 0.08639023693976591, + "grad_norm": 1.751550555229187, + "learning_rate": 4.9085018754759995e-05, + "loss": 5.0467, + "step": 14526 + }, + { + "epoch": 0.08639618422304692, + "grad_norm": 1.5967564582824707, + "learning_rate": 4.908489353765591e-05, + "loss": 5.0685, + "step": 14527 + }, + { + "epoch": 0.0864021315063279, + "grad_norm": 1.646323800086975, + "learning_rate": 4.908476831214405e-05, + "loss": 4.9341, + "step": 14528 + }, + { + "epoch": 0.0864080787896089, + "grad_norm": 1.482224464416504, + "learning_rate": 4.908464307822443e-05, + "loss": 4.9893, + "step": 14529 + }, + { + "epoch": 0.08641402607288991, + "grad_norm": 1.5190521478652954, + "learning_rate": 4.908451783589713e-05, + "loss": 5.0747, + "step": 14530 + }, + { + "epoch": 0.0864199733561709, + "grad_norm": 1.41251802444458, + "learning_rate": 4.908439258516215e-05, + "loss": 5.0098, + "step": 14531 + }, + { + "epoch": 0.0864259206394519, + "grad_norm": 1.678646445274353, + "learning_rate": 4.9084267326019576e-05, + "loss": 5.0224, + "step": 14532 + }, + { + "epoch": 0.0864318679227329, + "grad_norm": 1.5203865766525269, + "learning_rate": 4.908414205846943e-05, + "loss": 5.109, + "step": 14533 + }, + { + "epoch": 0.08643781520601389, + "grad_norm": 1.5437216758728027, + "learning_rate": 4.9084016782511754e-05, + "loss": 5.1168, + "step": 14534 + }, + { + "epoch": 0.08644376248929489, + "grad_norm": 1.3460302352905273, + "learning_rate": 4.90838914981466e-05, + "loss": 5.1038, + "step": 14535 + }, + { + "epoch": 0.08644970977257589, + "grad_norm": 1.4768339395523071, + "learning_rate": 4.908376620537401e-05, + "loss": 5.129, + "step": 14536 + }, + { + "epoch": 0.08645565705585688, + "grad_norm": 1.2669035196304321, + "learning_rate": 4.9083640904194025e-05, + "loss": 5.0856, + "step": 14537 + }, + { + "epoch": 0.08646160433913788, + "grad_norm": 1.5692600011825562, + "learning_rate": 4.9083515594606686e-05, + "loss": 5.0897, + "step": 14538 + }, + { + "epoch": 0.08646755162241888, + "grad_norm": 1.4857045412063599, + "learning_rate": 4.9083390276612044e-05, + "loss": 4.9654, + "step": 14539 + }, + { + "epoch": 0.08647349890569987, + "grad_norm": 1.5537325143814087, + "learning_rate": 4.908326495021014e-05, + "loss": 5.0431, + "step": 14540 + }, + { + "epoch": 0.08647944618898087, + "grad_norm": 1.483089566230774, + "learning_rate": 4.908313961540101e-05, + "loss": 5.0737, + "step": 14541 + }, + { + "epoch": 0.08648539347226188, + "grad_norm": 1.5829899311065674, + "learning_rate": 4.9083014272184716e-05, + "loss": 4.9844, + "step": 14542 + }, + { + "epoch": 0.08649134075554286, + "grad_norm": 1.3660348653793335, + "learning_rate": 4.908288892056128e-05, + "loss": 5.0384, + "step": 14543 + }, + { + "epoch": 0.08649728803882387, + "grad_norm": 1.3721328973770142, + "learning_rate": 4.9082763560530764e-05, + "loss": 5.0993, + "step": 14544 + }, + { + "epoch": 0.08650323532210487, + "grad_norm": 1.412381887435913, + "learning_rate": 4.90826381920932e-05, + "loss": 4.9359, + "step": 14545 + }, + { + "epoch": 0.08650918260538586, + "grad_norm": 1.5164285898208618, + "learning_rate": 4.9082512815248635e-05, + "loss": 5.0156, + "step": 14546 + }, + { + "epoch": 0.08651512988866686, + "grad_norm": 1.5244861841201782, + "learning_rate": 4.9082387429997117e-05, + "loss": 5.0719, + "step": 14547 + }, + { + "epoch": 0.08652107717194786, + "grad_norm": 1.304221510887146, + "learning_rate": 4.908226203633869e-05, + "loss": 4.9553, + "step": 14548 + }, + { + "epoch": 0.08652702445522885, + "grad_norm": 1.328220009803772, + "learning_rate": 4.908213663427338e-05, + "loss": 4.9761, + "step": 14549 + }, + { + "epoch": 0.08653297173850985, + "grad_norm": 1.4459906816482544, + "learning_rate": 4.908201122380126e-05, + "loss": 5.0422, + "step": 14550 + }, + { + "epoch": 0.08653891902179085, + "grad_norm": 1.5402530431747437, + "learning_rate": 4.908188580492235e-05, + "loss": 4.8856, + "step": 14551 + }, + { + "epoch": 0.08654486630507184, + "grad_norm": 1.6573606729507446, + "learning_rate": 4.90817603776367e-05, + "loss": 5.0958, + "step": 14552 + }, + { + "epoch": 0.08655081358835284, + "grad_norm": 1.5214189291000366, + "learning_rate": 4.9081634941944365e-05, + "loss": 4.9494, + "step": 14553 + }, + { + "epoch": 0.08655676087163383, + "grad_norm": 1.4977836608886719, + "learning_rate": 4.908150949784538e-05, + "loss": 4.9166, + "step": 14554 + }, + { + "epoch": 0.08656270815491483, + "grad_norm": 1.4952701330184937, + "learning_rate": 4.908138404533979e-05, + "loss": 4.9371, + "step": 14555 + }, + { + "epoch": 0.08656865543819584, + "grad_norm": 1.2652736902236938, + "learning_rate": 4.9081258584427626e-05, + "loss": 4.9424, + "step": 14556 + }, + { + "epoch": 0.08657460272147682, + "grad_norm": 1.4386261701583862, + "learning_rate": 4.908113311510895e-05, + "loss": 4.8909, + "step": 14557 + }, + { + "epoch": 0.08658055000475783, + "grad_norm": 1.4800533056259155, + "learning_rate": 4.90810076373838e-05, + "loss": 4.9226, + "step": 14558 + }, + { + "epoch": 0.08658649728803883, + "grad_norm": 1.4734489917755127, + "learning_rate": 4.908088215125222e-05, + "loss": 4.9774, + "step": 14559 + }, + { + "epoch": 0.08659244457131982, + "grad_norm": 1.47382390499115, + "learning_rate": 4.9080756656714245e-05, + "loss": 4.9001, + "step": 14560 + }, + { + "epoch": 0.08659839185460082, + "grad_norm": 1.4358749389648438, + "learning_rate": 4.908063115376994e-05, + "loss": 4.8537, + "step": 14561 + }, + { + "epoch": 0.08660433913788182, + "grad_norm": 1.3895947933197021, + "learning_rate": 4.908050564241933e-05, + "loss": 4.9445, + "step": 14562 + }, + { + "epoch": 0.08661028642116281, + "grad_norm": 1.6166354417800903, + "learning_rate": 4.908038012266246e-05, + "loss": 4.9447, + "step": 14563 + }, + { + "epoch": 0.08661623370444381, + "grad_norm": 1.4621998071670532, + "learning_rate": 4.908025459449938e-05, + "loss": 5.0405, + "step": 14564 + }, + { + "epoch": 0.08662218098772481, + "grad_norm": 1.4160699844360352, + "learning_rate": 4.908012905793013e-05, + "loss": 5.1246, + "step": 14565 + }, + { + "epoch": 0.0866281282710058, + "grad_norm": 1.3748950958251953, + "learning_rate": 4.9080003512954756e-05, + "loss": 5.0856, + "step": 14566 + }, + { + "epoch": 0.0866340755542868, + "grad_norm": 1.5496206283569336, + "learning_rate": 4.9079877959573303e-05, + "loss": 5.1539, + "step": 14567 + }, + { + "epoch": 0.0866400228375678, + "grad_norm": 1.2577475309371948, + "learning_rate": 4.9079752397785814e-05, + "loss": 5.033, + "step": 14568 + }, + { + "epoch": 0.08664597012084879, + "grad_norm": 1.3565775156021118, + "learning_rate": 4.9079626827592336e-05, + "loss": 4.977, + "step": 14569 + }, + { + "epoch": 0.0866519174041298, + "grad_norm": 1.869673252105713, + "learning_rate": 4.90795012489929e-05, + "loss": 5.0452, + "step": 14570 + }, + { + "epoch": 0.0866578646874108, + "grad_norm": 1.3931822776794434, + "learning_rate": 4.907937566198757e-05, + "loss": 5.0182, + "step": 14571 + }, + { + "epoch": 0.08666381197069178, + "grad_norm": 1.5796258449554443, + "learning_rate": 4.907925006657637e-05, + "loss": 5.0167, + "step": 14572 + }, + { + "epoch": 0.08666975925397279, + "grad_norm": 1.439174771308899, + "learning_rate": 4.9079124462759356e-05, + "loss": 5.0223, + "step": 14573 + }, + { + "epoch": 0.08667570653725379, + "grad_norm": 1.5269712209701538, + "learning_rate": 4.907899885053657e-05, + "loss": 5.0726, + "step": 14574 + }, + { + "epoch": 0.08668165382053478, + "grad_norm": 1.6334160566329956, + "learning_rate": 4.9078873229908054e-05, + "loss": 4.902, + "step": 14575 + }, + { + "epoch": 0.08668760110381578, + "grad_norm": 1.2883020639419556, + "learning_rate": 4.9078747600873846e-05, + "loss": 5.0168, + "step": 14576 + }, + { + "epoch": 0.08669354838709678, + "grad_norm": 1.3399035930633545, + "learning_rate": 4.9078621963434e-05, + "loss": 5.1285, + "step": 14577 + }, + { + "epoch": 0.08669949567037777, + "grad_norm": 1.6066272258758545, + "learning_rate": 4.9078496317588556e-05, + "loss": 5.1761, + "step": 14578 + }, + { + "epoch": 0.08670544295365877, + "grad_norm": 1.5316112041473389, + "learning_rate": 4.907837066333756e-05, + "loss": 4.9691, + "step": 14579 + }, + { + "epoch": 0.08671139023693977, + "grad_norm": 1.2680541276931763, + "learning_rate": 4.907824500068105e-05, + "loss": 4.984, + "step": 14580 + }, + { + "epoch": 0.08671733752022076, + "grad_norm": 1.3451861143112183, + "learning_rate": 4.9078119329619076e-05, + "loss": 5.1079, + "step": 14581 + }, + { + "epoch": 0.08672328480350176, + "grad_norm": 1.4813716411590576, + "learning_rate": 4.907799365015168e-05, + "loss": 5.0822, + "step": 14582 + }, + { + "epoch": 0.08672923208678275, + "grad_norm": 1.2526417970657349, + "learning_rate": 4.90778679622789e-05, + "loss": 5.0981, + "step": 14583 + }, + { + "epoch": 0.08673517937006375, + "grad_norm": 1.320970058441162, + "learning_rate": 4.907774226600079e-05, + "loss": 5.2046, + "step": 14584 + }, + { + "epoch": 0.08674112665334476, + "grad_norm": 1.4376531839370728, + "learning_rate": 4.907761656131739e-05, + "loss": 5.0422, + "step": 14585 + }, + { + "epoch": 0.08674707393662574, + "grad_norm": 1.3290382623672485, + "learning_rate": 4.907749084822873e-05, + "loss": 4.9587, + "step": 14586 + }, + { + "epoch": 0.08675302121990675, + "grad_norm": 1.4613630771636963, + "learning_rate": 4.907736512673489e-05, + "loss": 5.0141, + "step": 14587 + }, + { + "epoch": 0.08675896850318775, + "grad_norm": 1.2996604442596436, + "learning_rate": 4.907723939683587e-05, + "loss": 5.0881, + "step": 14588 + }, + { + "epoch": 0.08676491578646874, + "grad_norm": 1.5718237161636353, + "learning_rate": 4.907711365853174e-05, + "loss": 5.0104, + "step": 14589 + }, + { + "epoch": 0.08677086306974974, + "grad_norm": 1.5009227991104126, + "learning_rate": 4.907698791182255e-05, + "loss": 4.9257, + "step": 14590 + }, + { + "epoch": 0.08677681035303074, + "grad_norm": 1.4179331064224243, + "learning_rate": 4.907686215670831e-05, + "loss": 5.0209, + "step": 14591 + }, + { + "epoch": 0.08678275763631173, + "grad_norm": 1.3447542190551758, + "learning_rate": 4.9076736393189105e-05, + "loss": 5.0633, + "step": 14592 + }, + { + "epoch": 0.08678870491959273, + "grad_norm": 1.4221898317337036, + "learning_rate": 4.907661062126495e-05, + "loss": 4.907, + "step": 14593 + }, + { + "epoch": 0.08679465220287373, + "grad_norm": 1.5112396478652954, + "learning_rate": 4.907648484093591e-05, + "loss": 5.0703, + "step": 14594 + }, + { + "epoch": 0.08680059948615472, + "grad_norm": 1.3118572235107422, + "learning_rate": 4.907635905220201e-05, + "loss": 5.0089, + "step": 14595 + }, + { + "epoch": 0.08680654676943572, + "grad_norm": 1.6776518821716309, + "learning_rate": 4.90762332550633e-05, + "loss": 4.9705, + "step": 14596 + }, + { + "epoch": 0.08681249405271672, + "grad_norm": 1.467530608177185, + "learning_rate": 4.9076107449519824e-05, + "loss": 5.0596, + "step": 14597 + }, + { + "epoch": 0.08681844133599771, + "grad_norm": 1.5924569368362427, + "learning_rate": 4.907598163557163e-05, + "loss": 4.9904, + "step": 14598 + }, + { + "epoch": 0.08682438861927871, + "grad_norm": 1.1862461566925049, + "learning_rate": 4.907585581321877e-05, + "loss": 5.2065, + "step": 14599 + }, + { + "epoch": 0.08683033590255972, + "grad_norm": 1.5537490844726562, + "learning_rate": 4.9075729982461265e-05, + "loss": 4.9604, + "step": 14600 + }, + { + "epoch": 0.0868362831858407, + "grad_norm": 1.5608946084976196, + "learning_rate": 4.9075604143299176e-05, + "loss": 4.9951, + "step": 14601 + }, + { + "epoch": 0.0868422304691217, + "grad_norm": 1.3890982866287231, + "learning_rate": 4.907547829573254e-05, + "loss": 5.1994, + "step": 14602 + }, + { + "epoch": 0.08684817775240271, + "grad_norm": 1.5367194414138794, + "learning_rate": 4.907535243976141e-05, + "loss": 5.008, + "step": 14603 + }, + { + "epoch": 0.0868541250356837, + "grad_norm": 1.5362403392791748, + "learning_rate": 4.9075226575385814e-05, + "loss": 5.0239, + "step": 14604 + }, + { + "epoch": 0.0868600723189647, + "grad_norm": 1.3252228498458862, + "learning_rate": 4.9075100702605814e-05, + "loss": 4.9663, + "step": 14605 + }, + { + "epoch": 0.0868660196022457, + "grad_norm": 1.4381712675094604, + "learning_rate": 4.907497482142144e-05, + "loss": 5.1457, + "step": 14606 + }, + { + "epoch": 0.08687196688552669, + "grad_norm": 1.5137197971343994, + "learning_rate": 4.907484893183274e-05, + "loss": 4.9831, + "step": 14607 + }, + { + "epoch": 0.08687791416880769, + "grad_norm": 1.5544081926345825, + "learning_rate": 4.907472303383976e-05, + "loss": 5.0485, + "step": 14608 + }, + { + "epoch": 0.08688386145208869, + "grad_norm": 1.4613279104232788, + "learning_rate": 4.907459712744254e-05, + "loss": 5.3929, + "step": 14609 + }, + { + "epoch": 0.08688980873536968, + "grad_norm": 1.2830102443695068, + "learning_rate": 4.907447121264113e-05, + "loss": 5.4241, + "step": 14610 + }, + { + "epoch": 0.08689575601865068, + "grad_norm": 1.2168337106704712, + "learning_rate": 4.907434528943558e-05, + "loss": 5.4678, + "step": 14611 + }, + { + "epoch": 0.08690170330193167, + "grad_norm": 1.3995872735977173, + "learning_rate": 4.907421935782591e-05, + "loss": 5.2, + "step": 14612 + }, + { + "epoch": 0.08690765058521267, + "grad_norm": 1.4081990718841553, + "learning_rate": 4.907409341781219e-05, + "loss": 5.4356, + "step": 14613 + }, + { + "epoch": 0.08691359786849367, + "grad_norm": 1.4506621360778809, + "learning_rate": 4.9073967469394436e-05, + "loss": 5.3816, + "step": 14614 + }, + { + "epoch": 0.08691954515177466, + "grad_norm": 1.3564461469650269, + "learning_rate": 4.907384151257272e-05, + "loss": 5.2808, + "step": 14615 + }, + { + "epoch": 0.08692549243505567, + "grad_norm": 1.3663856983184814, + "learning_rate": 4.907371554734708e-05, + "loss": 5.4286, + "step": 14616 + }, + { + "epoch": 0.08693143971833667, + "grad_norm": 1.5905755758285522, + "learning_rate": 4.907358957371755e-05, + "loss": 5.3404, + "step": 14617 + }, + { + "epoch": 0.08693738700161766, + "grad_norm": 1.6172430515289307, + "learning_rate": 4.9073463591684175e-05, + "loss": 5.2511, + "step": 14618 + }, + { + "epoch": 0.08694333428489866, + "grad_norm": 1.362925410270691, + "learning_rate": 4.9073337601247e-05, + "loss": 5.3786, + "step": 14619 + }, + { + "epoch": 0.08694928156817966, + "grad_norm": 1.4276455640792847, + "learning_rate": 4.907321160240608e-05, + "loss": 5.1243, + "step": 14620 + }, + { + "epoch": 0.08695522885146065, + "grad_norm": 1.5211840867996216, + "learning_rate": 4.907308559516145e-05, + "loss": 5.1465, + "step": 14621 + }, + { + "epoch": 0.08696117613474165, + "grad_norm": 1.4728838205337524, + "learning_rate": 4.9072959579513146e-05, + "loss": 4.9585, + "step": 14622 + }, + { + "epoch": 0.08696712341802265, + "grad_norm": 1.5337111949920654, + "learning_rate": 4.907283355546123e-05, + "loss": 5.0553, + "step": 14623 + }, + { + "epoch": 0.08697307070130364, + "grad_norm": 1.3105639219284058, + "learning_rate": 4.907270752300573e-05, + "loss": 5.2724, + "step": 14624 + }, + { + "epoch": 0.08697901798458464, + "grad_norm": 1.4726678133010864, + "learning_rate": 4.90725814821467e-05, + "loss": 5.2771, + "step": 14625 + }, + { + "epoch": 0.08698496526786564, + "grad_norm": 1.5226463079452515, + "learning_rate": 4.907245543288418e-05, + "loss": 5.2294, + "step": 14626 + }, + { + "epoch": 0.08699091255114663, + "grad_norm": 1.4187650680541992, + "learning_rate": 4.9072329375218215e-05, + "loss": 5.0003, + "step": 14627 + }, + { + "epoch": 0.08699685983442763, + "grad_norm": 1.3565301895141602, + "learning_rate": 4.907220330914885e-05, + "loss": 5.0616, + "step": 14628 + }, + { + "epoch": 0.08700280711770864, + "grad_norm": 1.3763781785964966, + "learning_rate": 4.907207723467612e-05, + "loss": 5.1036, + "step": 14629 + }, + { + "epoch": 0.08700875440098962, + "grad_norm": 1.350926160812378, + "learning_rate": 4.907195115180009e-05, + "loss": 5.3433, + "step": 14630 + }, + { + "epoch": 0.08701470168427063, + "grad_norm": 1.4927095174789429, + "learning_rate": 4.907182506052078e-05, + "loss": 5.3726, + "step": 14631 + }, + { + "epoch": 0.08702064896755163, + "grad_norm": 1.9378905296325684, + "learning_rate": 4.907169896083824e-05, + "loss": 4.9942, + "step": 14632 + }, + { + "epoch": 0.08702659625083262, + "grad_norm": 1.2046253681182861, + "learning_rate": 4.907157285275253e-05, + "loss": 5.2877, + "step": 14633 + }, + { + "epoch": 0.08703254353411362, + "grad_norm": 1.352828025817871, + "learning_rate": 4.907144673626368e-05, + "loss": 5.264, + "step": 14634 + }, + { + "epoch": 0.08703849081739462, + "grad_norm": 1.4438698291778564, + "learning_rate": 4.907132061137173e-05, + "loss": 5.1767, + "step": 14635 + }, + { + "epoch": 0.08704443810067561, + "grad_norm": 1.4066534042358398, + "learning_rate": 4.9071194478076734e-05, + "loss": 5.0919, + "step": 14636 + }, + { + "epoch": 0.08705038538395661, + "grad_norm": 1.4313786029815674, + "learning_rate": 4.9071068336378736e-05, + "loss": 5.0307, + "step": 14637 + }, + { + "epoch": 0.08705633266723761, + "grad_norm": 1.3995366096496582, + "learning_rate": 4.907094218627778e-05, + "loss": 4.9508, + "step": 14638 + }, + { + "epoch": 0.0870622799505186, + "grad_norm": 1.395270824432373, + "learning_rate": 4.90708160277739e-05, + "loss": 5.1403, + "step": 14639 + }, + { + "epoch": 0.0870682272337996, + "grad_norm": 1.4280959367752075, + "learning_rate": 4.9070689860867144e-05, + "loss": 5.1675, + "step": 14640 + }, + { + "epoch": 0.08707417451708059, + "grad_norm": 1.5028926134109497, + "learning_rate": 4.907056368555757e-05, + "loss": 5.1178, + "step": 14641 + }, + { + "epoch": 0.08708012180036159, + "grad_norm": 1.480936884880066, + "learning_rate": 4.90704375018452e-05, + "loss": 5.1681, + "step": 14642 + }, + { + "epoch": 0.0870860690836426, + "grad_norm": 1.474708914756775, + "learning_rate": 4.907031130973009e-05, + "loss": 4.998, + "step": 14643 + }, + { + "epoch": 0.08709201636692358, + "grad_norm": 1.719551920890808, + "learning_rate": 4.907018510921229e-05, + "loss": 5.0486, + "step": 14644 + }, + { + "epoch": 0.08709796365020459, + "grad_norm": 1.6314032077789307, + "learning_rate": 4.907005890029184e-05, + "loss": 4.9233, + "step": 14645 + }, + { + "epoch": 0.08710391093348559, + "grad_norm": 1.635712742805481, + "learning_rate": 4.906993268296877e-05, + "loss": 4.7026, + "step": 14646 + }, + { + "epoch": 0.08710985821676658, + "grad_norm": 1.5682891607284546, + "learning_rate": 4.906980645724314e-05, + "loss": 4.7681, + "step": 14647 + }, + { + "epoch": 0.08711580550004758, + "grad_norm": 1.5149590969085693, + "learning_rate": 4.906968022311499e-05, + "loss": 4.6026, + "step": 14648 + }, + { + "epoch": 0.08712175278332858, + "grad_norm": 1.666756510734558, + "learning_rate": 4.906955398058436e-05, + "loss": 4.6652, + "step": 14649 + }, + { + "epoch": 0.08712770006660957, + "grad_norm": 1.563281536102295, + "learning_rate": 4.906942772965129e-05, + "loss": 4.8195, + "step": 14650 + }, + { + "epoch": 0.08713364734989057, + "grad_norm": 1.3730766773223877, + "learning_rate": 4.906930147031585e-05, + "loss": 5.3917, + "step": 14651 + }, + { + "epoch": 0.08713959463317157, + "grad_norm": 1.344741940498352, + "learning_rate": 4.906917520257805e-05, + "loss": 5.4866, + "step": 14652 + }, + { + "epoch": 0.08714554191645256, + "grad_norm": 1.4403667449951172, + "learning_rate": 4.906904892643796e-05, + "loss": 5.3869, + "step": 14653 + }, + { + "epoch": 0.08715148919973356, + "grad_norm": 1.4251221418380737, + "learning_rate": 4.906892264189561e-05, + "loss": 5.5564, + "step": 14654 + }, + { + "epoch": 0.08715743648301456, + "grad_norm": 1.0403032302856445, + "learning_rate": 4.9068796348951055e-05, + "loss": 5.3422, + "step": 14655 + }, + { + "epoch": 0.08716338376629555, + "grad_norm": 1.4933732748031616, + "learning_rate": 4.9068670047604313e-05, + "loss": 4.9035, + "step": 14656 + }, + { + "epoch": 0.08716933104957655, + "grad_norm": 1.820141315460205, + "learning_rate": 4.9068543737855466e-05, + "loss": 4.8447, + "step": 14657 + }, + { + "epoch": 0.08717527833285756, + "grad_norm": 1.5337603092193604, + "learning_rate": 4.9068417419704526e-05, + "loss": 4.7122, + "step": 14658 + }, + { + "epoch": 0.08718122561613854, + "grad_norm": 1.6933845281600952, + "learning_rate": 4.9068291093151555e-05, + "loss": 4.6246, + "step": 14659 + }, + { + "epoch": 0.08718717289941955, + "grad_norm": 1.607749342918396, + "learning_rate": 4.906816475819659e-05, + "loss": 4.5246, + "step": 14660 + }, + { + "epoch": 0.08719312018270055, + "grad_norm": 1.6468732357025146, + "learning_rate": 4.906803841483969e-05, + "loss": 4.5529, + "step": 14661 + }, + { + "epoch": 0.08719906746598154, + "grad_norm": 1.7252613306045532, + "learning_rate": 4.906791206308087e-05, + "loss": 4.5866, + "step": 14662 + }, + { + "epoch": 0.08720501474926254, + "grad_norm": 1.8178141117095947, + "learning_rate": 4.90677857029202e-05, + "loss": 4.6312, + "step": 14663 + }, + { + "epoch": 0.08721096203254354, + "grad_norm": 1.6173008680343628, + "learning_rate": 4.906765933435771e-05, + "loss": 4.5964, + "step": 14664 + }, + { + "epoch": 0.08721690931582453, + "grad_norm": 1.4914458990097046, + "learning_rate": 4.9067532957393444e-05, + "loss": 4.7123, + "step": 14665 + }, + { + "epoch": 0.08722285659910553, + "grad_norm": 1.5310544967651367, + "learning_rate": 4.9067406572027465e-05, + "loss": 4.6907, + "step": 14666 + }, + { + "epoch": 0.08722880388238653, + "grad_norm": 1.4311203956604004, + "learning_rate": 4.9067280178259794e-05, + "loss": 4.7749, + "step": 14667 + }, + { + "epoch": 0.08723475116566752, + "grad_norm": 1.6848034858703613, + "learning_rate": 4.9067153776090484e-05, + "loss": 5.1676, + "step": 14668 + }, + { + "epoch": 0.08724069844894852, + "grad_norm": 1.510909914970398, + "learning_rate": 4.906702736551958e-05, + "loss": 5.1237, + "step": 14669 + }, + { + "epoch": 0.08724664573222951, + "grad_norm": 1.4135887622833252, + "learning_rate": 4.906690094654713e-05, + "loss": 5.131, + "step": 14670 + }, + { + "epoch": 0.08725259301551051, + "grad_norm": 1.5739595890045166, + "learning_rate": 4.906677451917317e-05, + "loss": 5.2374, + "step": 14671 + }, + { + "epoch": 0.08725854029879151, + "grad_norm": 1.592644214630127, + "learning_rate": 4.9066648083397746e-05, + "loss": 5.0424, + "step": 14672 + }, + { + "epoch": 0.0872644875820725, + "grad_norm": 1.3842464685440063, + "learning_rate": 4.906652163922091e-05, + "loss": 5.106, + "step": 14673 + }, + { + "epoch": 0.0872704348653535, + "grad_norm": 1.4318630695343018, + "learning_rate": 4.906639518664269e-05, + "loss": 5.1223, + "step": 14674 + }, + { + "epoch": 0.08727638214863451, + "grad_norm": 1.5598502159118652, + "learning_rate": 4.906626872566314e-05, + "loss": 5.0363, + "step": 14675 + }, + { + "epoch": 0.0872823294319155, + "grad_norm": 1.9367897510528564, + "learning_rate": 4.9066142256282316e-05, + "loss": 4.8822, + "step": 14676 + }, + { + "epoch": 0.0872882767151965, + "grad_norm": 1.8134979009628296, + "learning_rate": 4.906601577850024e-05, + "loss": 4.7218, + "step": 14677 + }, + { + "epoch": 0.0872942239984775, + "grad_norm": 1.5139638185501099, + "learning_rate": 4.9065889292316976e-05, + "loss": 5.0311, + "step": 14678 + }, + { + "epoch": 0.08730017128175849, + "grad_norm": 1.5324028730392456, + "learning_rate": 4.906576279773255e-05, + "loss": 5.2366, + "step": 14679 + }, + { + "epoch": 0.08730611856503949, + "grad_norm": 1.4219286441802979, + "learning_rate": 4.906563629474702e-05, + "loss": 5.1362, + "step": 14680 + }, + { + "epoch": 0.08731206584832049, + "grad_norm": 1.4673584699630737, + "learning_rate": 4.906550978336042e-05, + "loss": 5.1336, + "step": 14681 + }, + { + "epoch": 0.08731801313160148, + "grad_norm": 1.2611639499664307, + "learning_rate": 4.906538326357281e-05, + "loss": 5.1791, + "step": 14682 + }, + { + "epoch": 0.08732396041488248, + "grad_norm": 1.283827543258667, + "learning_rate": 4.9065256735384205e-05, + "loss": 5.0889, + "step": 14683 + }, + { + "epoch": 0.08732990769816348, + "grad_norm": 1.4508111476898193, + "learning_rate": 4.906513019879468e-05, + "loss": 4.9832, + "step": 14684 + }, + { + "epoch": 0.08733585498144447, + "grad_norm": 1.3923978805541992, + "learning_rate": 4.906500365380427e-05, + "loss": 4.8147, + "step": 14685 + }, + { + "epoch": 0.08734180226472547, + "grad_norm": 1.3737010955810547, + "learning_rate": 4.906487710041301e-05, + "loss": 4.8448, + "step": 14686 + }, + { + "epoch": 0.08734774954800648, + "grad_norm": 1.4765465259552002, + "learning_rate": 4.906475053862095e-05, + "loss": 4.8601, + "step": 14687 + }, + { + "epoch": 0.08735369683128746, + "grad_norm": 1.527372121810913, + "learning_rate": 4.906462396842813e-05, + "loss": 4.8898, + "step": 14688 + }, + { + "epoch": 0.08735964411456847, + "grad_norm": 1.2455743551254272, + "learning_rate": 4.9064497389834604e-05, + "loss": 4.9954, + "step": 14689 + }, + { + "epoch": 0.08736559139784947, + "grad_norm": 1.3169753551483154, + "learning_rate": 4.906437080284041e-05, + "loss": 5.1384, + "step": 14690 + }, + { + "epoch": 0.08737153868113046, + "grad_norm": 1.3158196210861206, + "learning_rate": 4.906424420744559e-05, + "loss": 5.032, + "step": 14691 + }, + { + "epoch": 0.08737748596441146, + "grad_norm": 1.5421653985977173, + "learning_rate": 4.9064117603650197e-05, + "loss": 4.6448, + "step": 14692 + }, + { + "epoch": 0.08738343324769246, + "grad_norm": 1.4324442148208618, + "learning_rate": 4.906399099145427e-05, + "loss": 4.819, + "step": 14693 + }, + { + "epoch": 0.08738938053097345, + "grad_norm": 1.299877643585205, + "learning_rate": 4.9063864370857836e-05, + "loss": 5.4793, + "step": 14694 + }, + { + "epoch": 0.08739532781425445, + "grad_norm": 1.8289762735366821, + "learning_rate": 4.906373774186097e-05, + "loss": 5.0972, + "step": 14695 + }, + { + "epoch": 0.08740127509753545, + "grad_norm": 1.5460636615753174, + "learning_rate": 4.9063611104463705e-05, + "loss": 5.0992, + "step": 14696 + }, + { + "epoch": 0.08740722238081644, + "grad_norm": 1.4720163345336914, + "learning_rate": 4.9063484458666076e-05, + "loss": 5.0918, + "step": 14697 + }, + { + "epoch": 0.08741316966409744, + "grad_norm": 1.4653000831604004, + "learning_rate": 4.906335780446813e-05, + "loss": 5.1523, + "step": 14698 + }, + { + "epoch": 0.08741911694737843, + "grad_norm": 1.461012840270996, + "learning_rate": 4.9063231141869914e-05, + "loss": 5.1848, + "step": 14699 + }, + { + "epoch": 0.08742506423065943, + "grad_norm": 1.6757450103759766, + "learning_rate": 4.906310447087148e-05, + "loss": 4.9809, + "step": 14700 + }, + { + "epoch": 0.08743101151394043, + "grad_norm": 1.498402714729309, + "learning_rate": 4.906297779147286e-05, + "loss": 5.1451, + "step": 14701 + }, + { + "epoch": 0.08743695879722142, + "grad_norm": 1.341667652130127, + "learning_rate": 4.906285110367411e-05, + "loss": 5.1973, + "step": 14702 + }, + { + "epoch": 0.08744290608050242, + "grad_norm": 1.5008035898208618, + "learning_rate": 4.9062724407475255e-05, + "loss": 5.0961, + "step": 14703 + }, + { + "epoch": 0.08744885336378343, + "grad_norm": 1.6110866069793701, + "learning_rate": 4.9062597702876354e-05, + "loss": 4.7201, + "step": 14704 + }, + { + "epoch": 0.08745480064706442, + "grad_norm": 1.5154603719711304, + "learning_rate": 4.906247098987746e-05, + "loss": 4.6537, + "step": 14705 + }, + { + "epoch": 0.08746074793034542, + "grad_norm": 1.6169204711914062, + "learning_rate": 4.90623442684786e-05, + "loss": 4.512, + "step": 14706 + }, + { + "epoch": 0.08746669521362642, + "grad_norm": 1.4967073202133179, + "learning_rate": 4.9062217538679824e-05, + "loss": 4.7159, + "step": 14707 + }, + { + "epoch": 0.08747264249690741, + "grad_norm": 1.4621938467025757, + "learning_rate": 4.9062090800481174e-05, + "loss": 4.7553, + "step": 14708 + }, + { + "epoch": 0.08747858978018841, + "grad_norm": 1.694868564605713, + "learning_rate": 4.9061964053882694e-05, + "loss": 4.6801, + "step": 14709 + }, + { + "epoch": 0.08748453706346941, + "grad_norm": 1.6228396892547607, + "learning_rate": 4.906183729888444e-05, + "loss": 4.5402, + "step": 14710 + }, + { + "epoch": 0.0874904843467504, + "grad_norm": 1.388859748840332, + "learning_rate": 4.9061710535486435e-05, + "loss": 4.5645, + "step": 14711 + }, + { + "epoch": 0.0874964316300314, + "grad_norm": 1.546074390411377, + "learning_rate": 4.9061583763688746e-05, + "loss": 4.4146, + "step": 14712 + }, + { + "epoch": 0.0875023789133124, + "grad_norm": 1.5526363849639893, + "learning_rate": 4.90614569834914e-05, + "loss": 4.6027, + "step": 14713 + }, + { + "epoch": 0.08750832619659339, + "grad_norm": 1.6809604167938232, + "learning_rate": 4.9061330194894454e-05, + "loss": 4.4927, + "step": 14714 + }, + { + "epoch": 0.0875142734798744, + "grad_norm": 1.8013920783996582, + "learning_rate": 4.906120339789795e-05, + "loss": 4.6949, + "step": 14715 + }, + { + "epoch": 0.0875202207631554, + "grad_norm": 1.587863564491272, + "learning_rate": 4.906107659250192e-05, + "loss": 4.7255, + "step": 14716 + }, + { + "epoch": 0.08752616804643638, + "grad_norm": 1.4871174097061157, + "learning_rate": 4.9060949778706415e-05, + "loss": 4.6753, + "step": 14717 + }, + { + "epoch": 0.08753211532971739, + "grad_norm": 1.5521314144134521, + "learning_rate": 4.9060822956511485e-05, + "loss": 4.6963, + "step": 14718 + }, + { + "epoch": 0.08753806261299839, + "grad_norm": 1.5176832675933838, + "learning_rate": 4.906069612591717e-05, + "loss": 4.7475, + "step": 14719 + }, + { + "epoch": 0.08754400989627938, + "grad_norm": 1.7381534576416016, + "learning_rate": 4.906056928692352e-05, + "loss": 4.6952, + "step": 14720 + }, + { + "epoch": 0.08754995717956038, + "grad_norm": 1.604637622833252, + "learning_rate": 4.9060442439530564e-05, + "loss": 4.5792, + "step": 14721 + }, + { + "epoch": 0.08755590446284138, + "grad_norm": 1.6367937326431274, + "learning_rate": 4.9060315583738356e-05, + "loss": 4.6422, + "step": 14722 + }, + { + "epoch": 0.08756185174612237, + "grad_norm": 1.5177057981491089, + "learning_rate": 4.906018871954695e-05, + "loss": 4.5682, + "step": 14723 + }, + { + "epoch": 0.08756779902940337, + "grad_norm": 1.5539237260818481, + "learning_rate": 4.906006184695637e-05, + "loss": 4.5194, + "step": 14724 + }, + { + "epoch": 0.08757374631268437, + "grad_norm": 1.7041072845458984, + "learning_rate": 4.905993496596668e-05, + "loss": 4.6526, + "step": 14725 + }, + { + "epoch": 0.08757969359596536, + "grad_norm": 1.7187644243240356, + "learning_rate": 4.9059808076577914e-05, + "loss": 4.6251, + "step": 14726 + }, + { + "epoch": 0.08758564087924636, + "grad_norm": 1.6393675804138184, + "learning_rate": 4.905968117879012e-05, + "loss": 4.7242, + "step": 14727 + }, + { + "epoch": 0.08759158816252735, + "grad_norm": 1.6426397562026978, + "learning_rate": 4.905955427260333e-05, + "loss": 4.6272, + "step": 14728 + }, + { + "epoch": 0.08759753544580835, + "grad_norm": 1.3231829404830933, + "learning_rate": 4.9059427358017605e-05, + "loss": 4.621, + "step": 14729 + }, + { + "epoch": 0.08760348272908935, + "grad_norm": 1.3970234394073486, + "learning_rate": 4.905930043503298e-05, + "loss": 4.6356, + "step": 14730 + }, + { + "epoch": 0.08760943001237034, + "grad_norm": 1.511977195739746, + "learning_rate": 4.90591735036495e-05, + "loss": 4.7408, + "step": 14731 + }, + { + "epoch": 0.08761537729565134, + "grad_norm": 1.284788727760315, + "learning_rate": 4.9059046563867216e-05, + "loss": 5.2573, + "step": 14732 + }, + { + "epoch": 0.08762132457893235, + "grad_norm": 1.5148005485534668, + "learning_rate": 4.905891961568617e-05, + "loss": 5.0465, + "step": 14733 + }, + { + "epoch": 0.08762727186221334, + "grad_norm": 1.3727401494979858, + "learning_rate": 4.905879265910639e-05, + "loss": 5.0424, + "step": 14734 + }, + { + "epoch": 0.08763321914549434, + "grad_norm": 1.4994157552719116, + "learning_rate": 4.9058665694127945e-05, + "loss": 5.1662, + "step": 14735 + }, + { + "epoch": 0.08763916642877534, + "grad_norm": 1.5002670288085938, + "learning_rate": 4.905853872075087e-05, + "loss": 5.0872, + "step": 14736 + }, + { + "epoch": 0.08764511371205633, + "grad_norm": 1.580439567565918, + "learning_rate": 4.90584117389752e-05, + "loss": 5.1315, + "step": 14737 + }, + { + "epoch": 0.08765106099533733, + "grad_norm": 1.416154384613037, + "learning_rate": 4.9058284748801e-05, + "loss": 5.1066, + "step": 14738 + }, + { + "epoch": 0.08765700827861833, + "grad_norm": 1.5391058921813965, + "learning_rate": 4.905815775022828e-05, + "loss": 5.1724, + "step": 14739 + }, + { + "epoch": 0.08766295556189932, + "grad_norm": 1.20875883102417, + "learning_rate": 4.905803074325712e-05, + "loss": 5.152, + "step": 14740 + }, + { + "epoch": 0.08766890284518032, + "grad_norm": 1.27827787399292, + "learning_rate": 4.9057903727887556e-05, + "loss": 5.0271, + "step": 14741 + }, + { + "epoch": 0.08767485012846132, + "grad_norm": 1.1356613636016846, + "learning_rate": 4.9057776704119615e-05, + "loss": 5.0078, + "step": 14742 + }, + { + "epoch": 0.08768079741174231, + "grad_norm": 1.3931230306625366, + "learning_rate": 4.9057649671953355e-05, + "loss": 5.1253, + "step": 14743 + }, + { + "epoch": 0.08768674469502331, + "grad_norm": 1.553105115890503, + "learning_rate": 4.905752263138882e-05, + "loss": 5.1259, + "step": 14744 + }, + { + "epoch": 0.08769269197830432, + "grad_norm": 1.4004448652267456, + "learning_rate": 4.905739558242605e-05, + "loss": 5.1104, + "step": 14745 + }, + { + "epoch": 0.0876986392615853, + "grad_norm": 1.6295247077941895, + "learning_rate": 4.905726852506509e-05, + "loss": 5.0718, + "step": 14746 + }, + { + "epoch": 0.0877045865448663, + "grad_norm": 1.5966804027557373, + "learning_rate": 4.9057141459306e-05, + "loss": 5.1922, + "step": 14747 + }, + { + "epoch": 0.08771053382814731, + "grad_norm": 1.5448883771896362, + "learning_rate": 4.9057014385148795e-05, + "loss": 4.9715, + "step": 14748 + }, + { + "epoch": 0.0877164811114283, + "grad_norm": 1.5252676010131836, + "learning_rate": 4.905688730259354e-05, + "loss": 5.2128, + "step": 14749 + }, + { + "epoch": 0.0877224283947093, + "grad_norm": 1.387237310409546, + "learning_rate": 4.9056760211640274e-05, + "loss": 5.0933, + "step": 14750 + }, + { + "epoch": 0.0877283756779903, + "grad_norm": 1.3318862915039062, + "learning_rate": 4.905663311228904e-05, + "loss": 5.1849, + "step": 14751 + }, + { + "epoch": 0.08773432296127129, + "grad_norm": 1.4328356981277466, + "learning_rate": 4.905650600453989e-05, + "loss": 5.2287, + "step": 14752 + }, + { + "epoch": 0.08774027024455229, + "grad_norm": 1.4316518306732178, + "learning_rate": 4.905637888839285e-05, + "loss": 4.9774, + "step": 14753 + }, + { + "epoch": 0.08774621752783329, + "grad_norm": 1.1666837930679321, + "learning_rate": 4.9056251763847996e-05, + "loss": 5.2098, + "step": 14754 + }, + { + "epoch": 0.08775216481111428, + "grad_norm": 1.4383636713027954, + "learning_rate": 4.9056124630905333e-05, + "loss": 5.2438, + "step": 14755 + }, + { + "epoch": 0.08775811209439528, + "grad_norm": 2.6009883880615234, + "learning_rate": 4.9055997489564936e-05, + "loss": 5.7232, + "step": 14756 + }, + { + "epoch": 0.08776405937767627, + "grad_norm": 1.3072876930236816, + "learning_rate": 4.905587033982684e-05, + "loss": 5.1811, + "step": 14757 + }, + { + "epoch": 0.08777000666095727, + "grad_norm": 1.2538501024246216, + "learning_rate": 4.9055743181691084e-05, + "loss": 5.1557, + "step": 14758 + }, + { + "epoch": 0.08777595394423827, + "grad_norm": 1.2565419673919678, + "learning_rate": 4.905561601515771e-05, + "loss": 5.129, + "step": 14759 + }, + { + "epoch": 0.08778190122751926, + "grad_norm": 1.3041788339614868, + "learning_rate": 4.905548884022678e-05, + "loss": 5.2048, + "step": 14760 + }, + { + "epoch": 0.08778784851080026, + "grad_norm": 1.4548598527908325, + "learning_rate": 4.905536165689832e-05, + "loss": 5.2405, + "step": 14761 + }, + { + "epoch": 0.08779379579408127, + "grad_norm": 1.1748031377792358, + "learning_rate": 4.905523446517239e-05, + "loss": 5.1804, + "step": 14762 + }, + { + "epoch": 0.08779974307736226, + "grad_norm": 1.210534930229187, + "learning_rate": 4.905510726504902e-05, + "loss": 5.1383, + "step": 14763 + }, + { + "epoch": 0.08780569036064326, + "grad_norm": 1.2154903411865234, + "learning_rate": 4.9054980056528264e-05, + "loss": 5.2757, + "step": 14764 + }, + { + "epoch": 0.08781163764392426, + "grad_norm": 1.4123867750167847, + "learning_rate": 4.9054852839610166e-05, + "loss": 5.1268, + "step": 14765 + }, + { + "epoch": 0.08781758492720525, + "grad_norm": 1.3136295080184937, + "learning_rate": 4.905472561429476e-05, + "loss": 5.2186, + "step": 14766 + }, + { + "epoch": 0.08782353221048625, + "grad_norm": 1.2741068601608276, + "learning_rate": 4.905459838058209e-05, + "loss": 4.9737, + "step": 14767 + }, + { + "epoch": 0.08782947949376725, + "grad_norm": 1.2963054180145264, + "learning_rate": 4.9054471138472225e-05, + "loss": 5.1712, + "step": 14768 + }, + { + "epoch": 0.08783542677704824, + "grad_norm": 1.5352611541748047, + "learning_rate": 4.905434388796519e-05, + "loss": 4.9473, + "step": 14769 + }, + { + "epoch": 0.08784137406032924, + "grad_norm": 1.3399711847305298, + "learning_rate": 4.905421662906103e-05, + "loss": 5.2402, + "step": 14770 + }, + { + "epoch": 0.08784732134361024, + "grad_norm": 1.4278292655944824, + "learning_rate": 4.9054089361759794e-05, + "loss": 4.9331, + "step": 14771 + }, + { + "epoch": 0.08785326862689123, + "grad_norm": 1.5057200193405151, + "learning_rate": 4.905396208606151e-05, + "loss": 5.1553, + "step": 14772 + }, + { + "epoch": 0.08785921591017223, + "grad_norm": 1.4660797119140625, + "learning_rate": 4.905383480196625e-05, + "loss": 5.0792, + "step": 14773 + }, + { + "epoch": 0.08786516319345324, + "grad_norm": 1.4386217594146729, + "learning_rate": 4.905370750947405e-05, + "loss": 4.8363, + "step": 14774 + }, + { + "epoch": 0.08787111047673422, + "grad_norm": 1.4555455446243286, + "learning_rate": 4.905358020858493e-05, + "loss": 4.8934, + "step": 14775 + }, + { + "epoch": 0.08787705776001523, + "grad_norm": 1.5161443948745728, + "learning_rate": 4.905345289929897e-05, + "loss": 4.8227, + "step": 14776 + }, + { + "epoch": 0.08788300504329623, + "grad_norm": 1.2704185247421265, + "learning_rate": 4.9053325581616185e-05, + "loss": 4.9612, + "step": 14777 + }, + { + "epoch": 0.08788895232657722, + "grad_norm": 1.6396795511245728, + "learning_rate": 4.905319825553664e-05, + "loss": 4.8947, + "step": 14778 + }, + { + "epoch": 0.08789489960985822, + "grad_norm": 1.49285888671875, + "learning_rate": 4.905307092106037e-05, + "loss": 5.0814, + "step": 14779 + }, + { + "epoch": 0.08790084689313922, + "grad_norm": 1.3829785585403442, + "learning_rate": 4.9052943578187424e-05, + "loss": 5.3864, + "step": 14780 + }, + { + "epoch": 0.08790679417642021, + "grad_norm": 1.517054557800293, + "learning_rate": 4.905281622691784e-05, + "loss": 5.3053, + "step": 14781 + }, + { + "epoch": 0.08791274145970121, + "grad_norm": 1.491402506828308, + "learning_rate": 4.905268886725167e-05, + "loss": 5.3685, + "step": 14782 + }, + { + "epoch": 0.08791868874298221, + "grad_norm": 1.5034211874008179, + "learning_rate": 4.905256149918895e-05, + "loss": 5.2139, + "step": 14783 + }, + { + "epoch": 0.0879246360262632, + "grad_norm": 1.4021977186203003, + "learning_rate": 4.905243412272974e-05, + "loss": 5.301, + "step": 14784 + }, + { + "epoch": 0.0879305833095442, + "grad_norm": 1.44327974319458, + "learning_rate": 4.9052306737874064e-05, + "loss": 5.296, + "step": 14785 + }, + { + "epoch": 0.08793653059282519, + "grad_norm": 1.4733220338821411, + "learning_rate": 4.905217934462198e-05, + "loss": 5.3302, + "step": 14786 + }, + { + "epoch": 0.08794247787610619, + "grad_norm": 1.3308794498443604, + "learning_rate": 4.9052051942973533e-05, + "loss": 5.1835, + "step": 14787 + }, + { + "epoch": 0.0879484251593872, + "grad_norm": 1.2667236328125, + "learning_rate": 4.905192453292876e-05, + "loss": 5.1801, + "step": 14788 + }, + { + "epoch": 0.08795437244266818, + "grad_norm": 1.3284921646118164, + "learning_rate": 4.90517971144877e-05, + "loss": 5.106, + "step": 14789 + }, + { + "epoch": 0.08796031972594918, + "grad_norm": 1.4089261293411255, + "learning_rate": 4.9051669687650415e-05, + "loss": 5.133, + "step": 14790 + }, + { + "epoch": 0.08796626700923019, + "grad_norm": 1.1701233386993408, + "learning_rate": 4.905154225241694e-05, + "loss": 5.1602, + "step": 14791 + }, + { + "epoch": 0.08797221429251117, + "grad_norm": 1.169570803642273, + "learning_rate": 4.9051414808787324e-05, + "loss": 5.1231, + "step": 14792 + }, + { + "epoch": 0.08797816157579218, + "grad_norm": 1.5104409456253052, + "learning_rate": 4.90512873567616e-05, + "loss": 5.0774, + "step": 14793 + }, + { + "epoch": 0.08798410885907318, + "grad_norm": 1.3065992593765259, + "learning_rate": 4.9051159896339816e-05, + "loss": 4.9547, + "step": 14794 + }, + { + "epoch": 0.08799005614235417, + "grad_norm": 1.6417936086654663, + "learning_rate": 4.905103242752203e-05, + "loss": 5.2734, + "step": 14795 + }, + { + "epoch": 0.08799600342563517, + "grad_norm": 2.1529974937438965, + "learning_rate": 4.905090495030827e-05, + "loss": 5.1999, + "step": 14796 + }, + { + "epoch": 0.08800195070891617, + "grad_norm": 1.6746312379837036, + "learning_rate": 4.90507774646986e-05, + "loss": 4.959, + "step": 14797 + }, + { + "epoch": 0.08800789799219716, + "grad_norm": 1.4422825574874878, + "learning_rate": 4.905064997069304e-05, + "loss": 5.0581, + "step": 14798 + }, + { + "epoch": 0.08801384527547816, + "grad_norm": 1.658833622932434, + "learning_rate": 4.9050522468291646e-05, + "loss": 4.9591, + "step": 14799 + }, + { + "epoch": 0.08801979255875916, + "grad_norm": 1.4971596002578735, + "learning_rate": 4.9050394957494464e-05, + "loss": 5.2515, + "step": 14800 + }, + { + "epoch": 0.08802573984204015, + "grad_norm": 1.5866429805755615, + "learning_rate": 4.9050267438301546e-05, + "loss": 5.1084, + "step": 14801 + }, + { + "epoch": 0.08803168712532115, + "grad_norm": 1.5049015283584595, + "learning_rate": 4.9050139910712925e-05, + "loss": 5.1102, + "step": 14802 + }, + { + "epoch": 0.08803763440860216, + "grad_norm": 1.6711664199829102, + "learning_rate": 4.905001237472864e-05, + "loss": 5.0215, + "step": 14803 + }, + { + "epoch": 0.08804358169188314, + "grad_norm": 1.6390610933303833, + "learning_rate": 4.904988483034875e-05, + "loss": 4.978, + "step": 14804 + }, + { + "epoch": 0.08804952897516415, + "grad_norm": 1.5968292951583862, + "learning_rate": 4.9049757277573295e-05, + "loss": 5.0183, + "step": 14805 + }, + { + "epoch": 0.08805547625844515, + "grad_norm": 1.4864193201065063, + "learning_rate": 4.9049629716402325e-05, + "loss": 5.5199, + "step": 14806 + }, + { + "epoch": 0.08806142354172614, + "grad_norm": 1.5658420324325562, + "learning_rate": 4.904950214683587e-05, + "loss": 5.4906, + "step": 14807 + }, + { + "epoch": 0.08806737082500714, + "grad_norm": 1.5811707973480225, + "learning_rate": 4.9049374568873975e-05, + "loss": 5.5795, + "step": 14808 + }, + { + "epoch": 0.08807331810828814, + "grad_norm": 1.418641448020935, + "learning_rate": 4.90492469825167e-05, + "loss": 5.3616, + "step": 14809 + }, + { + "epoch": 0.08807926539156913, + "grad_norm": 1.323500633239746, + "learning_rate": 4.904911938776408e-05, + "loss": 5.2641, + "step": 14810 + }, + { + "epoch": 0.08808521267485013, + "grad_norm": 1.590867280960083, + "learning_rate": 4.904899178461616e-05, + "loss": 5.3782, + "step": 14811 + }, + { + "epoch": 0.08809115995813113, + "grad_norm": 1.243213176727295, + "learning_rate": 4.904886417307299e-05, + "loss": 5.4743, + "step": 14812 + }, + { + "epoch": 0.08809710724141212, + "grad_norm": 1.5051169395446777, + "learning_rate": 4.9048736553134614e-05, + "loss": 5.3046, + "step": 14813 + }, + { + "epoch": 0.08810305452469312, + "grad_norm": 1.334234356880188, + "learning_rate": 4.904860892480106e-05, + "loss": 5.2673, + "step": 14814 + }, + { + "epoch": 0.08810900180797411, + "grad_norm": 1.4352458715438843, + "learning_rate": 4.904848128807239e-05, + "loss": 5.3465, + "step": 14815 + }, + { + "epoch": 0.08811494909125511, + "grad_norm": 1.6878329515457153, + "learning_rate": 4.904835364294864e-05, + "loss": 5.3467, + "step": 14816 + }, + { + "epoch": 0.08812089637453611, + "grad_norm": 1.542100191116333, + "learning_rate": 4.904822598942986e-05, + "loss": 5.4147, + "step": 14817 + }, + { + "epoch": 0.0881268436578171, + "grad_norm": 1.5099046230316162, + "learning_rate": 4.90480983275161e-05, + "loss": 5.7198, + "step": 14818 + }, + { + "epoch": 0.0881327909410981, + "grad_norm": 1.6120097637176514, + "learning_rate": 4.9047970657207395e-05, + "loss": 5.4417, + "step": 14819 + }, + { + "epoch": 0.0881387382243791, + "grad_norm": 1.455407977104187, + "learning_rate": 4.904784297850379e-05, + "loss": 5.3028, + "step": 14820 + }, + { + "epoch": 0.0881446855076601, + "grad_norm": 1.589712381362915, + "learning_rate": 4.904771529140533e-05, + "loss": 5.2493, + "step": 14821 + }, + { + "epoch": 0.0881506327909411, + "grad_norm": 1.5051584243774414, + "learning_rate": 4.904758759591206e-05, + "loss": 5.2225, + "step": 14822 + }, + { + "epoch": 0.0881565800742221, + "grad_norm": 1.3623727560043335, + "learning_rate": 4.9047459892024026e-05, + "loss": 5.1738, + "step": 14823 + }, + { + "epoch": 0.08816252735750309, + "grad_norm": 1.4643206596374512, + "learning_rate": 4.9047332179741274e-05, + "loss": 5.123, + "step": 14824 + }, + { + "epoch": 0.08816847464078409, + "grad_norm": 1.4233453273773193, + "learning_rate": 4.904720445906384e-05, + "loss": 4.9263, + "step": 14825 + }, + { + "epoch": 0.08817442192406509, + "grad_norm": 1.6479318141937256, + "learning_rate": 4.9047076729991786e-05, + "loss": 4.9663, + "step": 14826 + }, + { + "epoch": 0.08818036920734608, + "grad_norm": 1.4759633541107178, + "learning_rate": 4.9046948992525145e-05, + "loss": 5.0326, + "step": 14827 + }, + { + "epoch": 0.08818631649062708, + "grad_norm": 1.435533046722412, + "learning_rate": 4.904682124666395e-05, + "loss": 5.0819, + "step": 14828 + }, + { + "epoch": 0.08819226377390808, + "grad_norm": 1.4540610313415527, + "learning_rate": 4.904669349240827e-05, + "loss": 5.391, + "step": 14829 + }, + { + "epoch": 0.08819821105718907, + "grad_norm": 1.6308038234710693, + "learning_rate": 4.904656572975814e-05, + "loss": 4.9723, + "step": 14830 + }, + { + "epoch": 0.08820415834047007, + "grad_norm": 1.453600287437439, + "learning_rate": 4.90464379587136e-05, + "loss": 5.1689, + "step": 14831 + }, + { + "epoch": 0.08821010562375108, + "grad_norm": 1.4876199960708618, + "learning_rate": 4.904631017927469e-05, + "loss": 5.1163, + "step": 14832 + }, + { + "epoch": 0.08821605290703206, + "grad_norm": 1.4240463972091675, + "learning_rate": 4.9046182391441466e-05, + "loss": 5.1154, + "step": 14833 + }, + { + "epoch": 0.08822200019031307, + "grad_norm": 1.4176205396652222, + "learning_rate": 4.904605459521397e-05, + "loss": 5.1587, + "step": 14834 + }, + { + "epoch": 0.08822794747359407, + "grad_norm": 1.302998423576355, + "learning_rate": 4.9045926790592244e-05, + "loss": 5.1302, + "step": 14835 + }, + { + "epoch": 0.08823389475687506, + "grad_norm": 1.4490020275115967, + "learning_rate": 4.904579897757633e-05, + "loss": 5.0817, + "step": 14836 + }, + { + "epoch": 0.08823984204015606, + "grad_norm": 1.4430203437805176, + "learning_rate": 4.9045671156166276e-05, + "loss": 5.1334, + "step": 14837 + }, + { + "epoch": 0.08824578932343706, + "grad_norm": 1.326277494430542, + "learning_rate": 4.9045543326362134e-05, + "loss": 5.3292, + "step": 14838 + }, + { + "epoch": 0.08825173660671805, + "grad_norm": 1.373415470123291, + "learning_rate": 4.9045415488163936e-05, + "loss": 5.454, + "step": 14839 + }, + { + "epoch": 0.08825768388999905, + "grad_norm": 1.4334250688552856, + "learning_rate": 4.904528764157173e-05, + "loss": 5.2735, + "step": 14840 + }, + { + "epoch": 0.08826363117328005, + "grad_norm": 1.4029041528701782, + "learning_rate": 4.904515978658556e-05, + "loss": 5.0549, + "step": 14841 + }, + { + "epoch": 0.08826957845656104, + "grad_norm": 1.355177879333496, + "learning_rate": 4.904503192320548e-05, + "loss": 5.2569, + "step": 14842 + }, + { + "epoch": 0.08827552573984204, + "grad_norm": 1.2063989639282227, + "learning_rate": 4.904490405143153e-05, + "loss": 5.2469, + "step": 14843 + }, + { + "epoch": 0.08828147302312303, + "grad_norm": 1.2290265560150146, + "learning_rate": 4.904477617126374e-05, + "loss": 5.255, + "step": 14844 + }, + { + "epoch": 0.08828742030640403, + "grad_norm": 1.0648494958877563, + "learning_rate": 4.904464828270218e-05, + "loss": 5.2423, + "step": 14845 + }, + { + "epoch": 0.08829336758968503, + "grad_norm": 1.362572431564331, + "learning_rate": 4.904452038574687e-05, + "loss": 5.3856, + "step": 14846 + }, + { + "epoch": 0.08829931487296602, + "grad_norm": 1.3004114627838135, + "learning_rate": 4.9044392480397886e-05, + "loss": 5.0672, + "step": 14847 + }, + { + "epoch": 0.08830526215624702, + "grad_norm": 1.4852789640426636, + "learning_rate": 4.904426456665523e-05, + "loss": 5.2145, + "step": 14848 + }, + { + "epoch": 0.08831120943952803, + "grad_norm": 1.4221493005752563, + "learning_rate": 4.9044136644518976e-05, + "loss": 5.4544, + "step": 14849 + }, + { + "epoch": 0.08831715672280901, + "grad_norm": 1.4444363117218018, + "learning_rate": 4.904400871398917e-05, + "loss": 5.3342, + "step": 14850 + }, + { + "epoch": 0.08832310400609002, + "grad_norm": 1.1723617315292358, + "learning_rate": 4.904388077506585e-05, + "loss": 5.3846, + "step": 14851 + }, + { + "epoch": 0.08832905128937102, + "grad_norm": 1.3458356857299805, + "learning_rate": 4.904375282774905e-05, + "loss": 5.3903, + "step": 14852 + }, + { + "epoch": 0.08833499857265201, + "grad_norm": 1.4839876890182495, + "learning_rate": 4.904362487203883e-05, + "loss": 5.0889, + "step": 14853 + }, + { + "epoch": 0.08834094585593301, + "grad_norm": 1.6487696170806885, + "learning_rate": 4.904349690793523e-05, + "loss": 5.0904, + "step": 14854 + }, + { + "epoch": 0.08834689313921401, + "grad_norm": 1.5201997756958008, + "learning_rate": 4.904336893543829e-05, + "loss": 4.9017, + "step": 14855 + }, + { + "epoch": 0.088352840422495, + "grad_norm": 1.5502886772155762, + "learning_rate": 4.904324095454806e-05, + "loss": 4.931, + "step": 14856 + }, + { + "epoch": 0.088358787705776, + "grad_norm": 1.4996228218078613, + "learning_rate": 4.904311296526458e-05, + "loss": 5.0773, + "step": 14857 + }, + { + "epoch": 0.088364734989057, + "grad_norm": 1.7004456520080566, + "learning_rate": 4.90429849675879e-05, + "loss": 4.9913, + "step": 14858 + }, + { + "epoch": 0.08837068227233799, + "grad_norm": 1.426007866859436, + "learning_rate": 4.904285696151806e-05, + "loss": 5.1312, + "step": 14859 + }, + { + "epoch": 0.088376629555619, + "grad_norm": 1.4049350023269653, + "learning_rate": 4.904272894705512e-05, + "loss": 5.0539, + "step": 14860 + }, + { + "epoch": 0.0883825768389, + "grad_norm": 1.558273434638977, + "learning_rate": 4.9042600924199096e-05, + "loss": 5.0822, + "step": 14861 + }, + { + "epoch": 0.08838852412218098, + "grad_norm": 1.6177934408187866, + "learning_rate": 4.9042472892950055e-05, + "loss": 5.1646, + "step": 14862 + }, + { + "epoch": 0.08839447140546199, + "grad_norm": 1.5152839422225952, + "learning_rate": 4.904234485330803e-05, + "loss": 5.0144, + "step": 14863 + }, + { + "epoch": 0.08840041868874299, + "grad_norm": 1.474231243133545, + "learning_rate": 4.904221680527308e-05, + "loss": 5.1063, + "step": 14864 + }, + { + "epoch": 0.08840636597202398, + "grad_norm": 1.5897177457809448, + "learning_rate": 4.904208874884523e-05, + "loss": 4.9724, + "step": 14865 + }, + { + "epoch": 0.08841231325530498, + "grad_norm": 1.604368805885315, + "learning_rate": 4.904196068402454e-05, + "loss": 4.8905, + "step": 14866 + }, + { + "epoch": 0.08841826053858598, + "grad_norm": 1.338458776473999, + "learning_rate": 4.904183261081105e-05, + "loss": 4.7829, + "step": 14867 + }, + { + "epoch": 0.08842420782186697, + "grad_norm": 1.62189781665802, + "learning_rate": 4.9041704529204806e-05, + "loss": 4.8025, + "step": 14868 + }, + { + "epoch": 0.08843015510514797, + "grad_norm": 1.555298089981079, + "learning_rate": 4.904157643920585e-05, + "loss": 4.9098, + "step": 14869 + }, + { + "epoch": 0.08843610238842897, + "grad_norm": 1.5110834836959839, + "learning_rate": 4.904144834081423e-05, + "loss": 4.8648, + "step": 14870 + }, + { + "epoch": 0.08844204967170996, + "grad_norm": 1.59073805809021, + "learning_rate": 4.904132023402999e-05, + "loss": 4.8997, + "step": 14871 + }, + { + "epoch": 0.08844799695499096, + "grad_norm": 1.5218732357025146, + "learning_rate": 4.904119211885316e-05, + "loss": 5.352, + "step": 14872 + }, + { + "epoch": 0.08845394423827196, + "grad_norm": 1.5263079404830933, + "learning_rate": 4.904106399528382e-05, + "loss": 4.8921, + "step": 14873 + }, + { + "epoch": 0.08845989152155295, + "grad_norm": 1.6151986122131348, + "learning_rate": 4.904093586332198e-05, + "loss": 5.0086, + "step": 14874 + }, + { + "epoch": 0.08846583880483395, + "grad_norm": 1.4971787929534912, + "learning_rate": 4.90408077229677e-05, + "loss": 5.0119, + "step": 14875 + }, + { + "epoch": 0.08847178608811494, + "grad_norm": 1.4897308349609375, + "learning_rate": 4.904067957422102e-05, + "loss": 5.0175, + "step": 14876 + }, + { + "epoch": 0.08847773337139594, + "grad_norm": 1.4023786783218384, + "learning_rate": 4.904055141708199e-05, + "loss": 5.0361, + "step": 14877 + }, + { + "epoch": 0.08848368065467695, + "grad_norm": 1.4664498567581177, + "learning_rate": 4.904042325155065e-05, + "loss": 4.9784, + "step": 14878 + }, + { + "epoch": 0.08848962793795793, + "grad_norm": 1.390824556350708, + "learning_rate": 4.904029507762704e-05, + "loss": 4.9922, + "step": 14879 + }, + { + "epoch": 0.08849557522123894, + "grad_norm": 1.9508315324783325, + "learning_rate": 4.904016689531122e-05, + "loss": 5.6352, + "step": 14880 + }, + { + "epoch": 0.08850152250451994, + "grad_norm": 1.4192322492599487, + "learning_rate": 4.904003870460323e-05, + "loss": 5.0654, + "step": 14881 + }, + { + "epoch": 0.08850746978780093, + "grad_norm": 1.5868372917175293, + "learning_rate": 4.903991050550311e-05, + "loss": 4.9631, + "step": 14882 + }, + { + "epoch": 0.08851341707108193, + "grad_norm": 1.405555009841919, + "learning_rate": 4.903978229801089e-05, + "loss": 5.1311, + "step": 14883 + }, + { + "epoch": 0.08851936435436293, + "grad_norm": 1.453817367553711, + "learning_rate": 4.9039654082126646e-05, + "loss": 5.0866, + "step": 14884 + }, + { + "epoch": 0.08852531163764392, + "grad_norm": 1.5051809549331665, + "learning_rate": 4.9039525857850404e-05, + "loss": 5.1606, + "step": 14885 + }, + { + "epoch": 0.08853125892092492, + "grad_norm": 1.5323255062103271, + "learning_rate": 4.9039397625182206e-05, + "loss": 5.1564, + "step": 14886 + }, + { + "epoch": 0.08853720620420592, + "grad_norm": 1.5018506050109863, + "learning_rate": 4.903926938412211e-05, + "loss": 4.9672, + "step": 14887 + }, + { + "epoch": 0.08854315348748691, + "grad_norm": 1.488289713859558, + "learning_rate": 4.903914113467015e-05, + "loss": 4.882, + "step": 14888 + }, + { + "epoch": 0.08854910077076791, + "grad_norm": 1.434045672416687, + "learning_rate": 4.903901287682637e-05, + "loss": 5.0748, + "step": 14889 + }, + { + "epoch": 0.08855504805404892, + "grad_norm": 1.5172244310379028, + "learning_rate": 4.903888461059083e-05, + "loss": 5.065, + "step": 14890 + }, + { + "epoch": 0.0885609953373299, + "grad_norm": 1.545283555984497, + "learning_rate": 4.903875633596356e-05, + "loss": 5.2187, + "step": 14891 + }, + { + "epoch": 0.0885669426206109, + "grad_norm": 1.3149688243865967, + "learning_rate": 4.90386280529446e-05, + "loss": 4.9977, + "step": 14892 + }, + { + "epoch": 0.08857288990389191, + "grad_norm": 1.4925106763839722, + "learning_rate": 4.903849976153401e-05, + "loss": 5.0622, + "step": 14893 + }, + { + "epoch": 0.0885788371871729, + "grad_norm": 1.6073296070098877, + "learning_rate": 4.903837146173183e-05, + "loss": 5.0823, + "step": 14894 + }, + { + "epoch": 0.0885847844704539, + "grad_norm": 1.2879148721694946, + "learning_rate": 4.9038243153538096e-05, + "loss": 5.1574, + "step": 14895 + }, + { + "epoch": 0.0885907317537349, + "grad_norm": 1.6396079063415527, + "learning_rate": 4.903811483695287e-05, + "loss": 5.1748, + "step": 14896 + }, + { + "epoch": 0.08859667903701589, + "grad_norm": 1.426180124282837, + "learning_rate": 4.903798651197618e-05, + "loss": 5.0374, + "step": 14897 + }, + { + "epoch": 0.08860262632029689, + "grad_norm": 1.3685684204101562, + "learning_rate": 4.9037858178608076e-05, + "loss": 4.9373, + "step": 14898 + }, + { + "epoch": 0.08860857360357789, + "grad_norm": 1.5495455265045166, + "learning_rate": 4.903772983684861e-05, + "loss": 5.0696, + "step": 14899 + }, + { + "epoch": 0.08861452088685888, + "grad_norm": 1.4423854351043701, + "learning_rate": 4.9037601486697815e-05, + "loss": 5.1359, + "step": 14900 + }, + { + "epoch": 0.08862046817013988, + "grad_norm": 1.4704400300979614, + "learning_rate": 4.9037473128155745e-05, + "loss": 5.0438, + "step": 14901 + }, + { + "epoch": 0.08862641545342088, + "grad_norm": 1.49704909324646, + "learning_rate": 4.903734476122244e-05, + "loss": 5.0305, + "step": 14902 + }, + { + "epoch": 0.08863236273670187, + "grad_norm": 1.3732075691223145, + "learning_rate": 4.903721638589795e-05, + "loss": 4.9659, + "step": 14903 + }, + { + "epoch": 0.08863831001998287, + "grad_norm": 1.5920335054397583, + "learning_rate": 4.903708800218231e-05, + "loss": 4.9936, + "step": 14904 + }, + { + "epoch": 0.08864425730326386, + "grad_norm": 1.6084437370300293, + "learning_rate": 4.9036959610075575e-05, + "loss": 5.0048, + "step": 14905 + }, + { + "epoch": 0.08865020458654486, + "grad_norm": 1.2329050302505493, + "learning_rate": 4.903683120957778e-05, + "loss": 4.9729, + "step": 14906 + }, + { + "epoch": 0.08865615186982587, + "grad_norm": 1.4001328945159912, + "learning_rate": 4.903670280068898e-05, + "loss": 4.9577, + "step": 14907 + }, + { + "epoch": 0.08866209915310685, + "grad_norm": 1.3499484062194824, + "learning_rate": 4.903657438340921e-05, + "loss": 4.8696, + "step": 14908 + }, + { + "epoch": 0.08866804643638786, + "grad_norm": 1.3606812953948975, + "learning_rate": 4.903644595773853e-05, + "loss": 4.9142, + "step": 14909 + }, + { + "epoch": 0.08867399371966886, + "grad_norm": 1.3275173902511597, + "learning_rate": 4.9036317523676964e-05, + "loss": 5.032, + "step": 14910 + }, + { + "epoch": 0.08867994100294985, + "grad_norm": 1.5485349893569946, + "learning_rate": 4.903618908122458e-05, + "loss": 4.9252, + "step": 14911 + }, + { + "epoch": 0.08868588828623085, + "grad_norm": 1.4325098991394043, + "learning_rate": 4.9036060630381395e-05, + "loss": 4.9971, + "step": 14912 + }, + { + "epoch": 0.08869183556951185, + "grad_norm": 1.4953216314315796, + "learning_rate": 4.903593217114748e-05, + "loss": 4.8228, + "step": 14913 + }, + { + "epoch": 0.08869778285279284, + "grad_norm": 1.4761654138565063, + "learning_rate": 4.9035803703522876e-05, + "loss": 4.8365, + "step": 14914 + }, + { + "epoch": 0.08870373013607384, + "grad_norm": 1.3572559356689453, + "learning_rate": 4.9035675227507615e-05, + "loss": 4.8409, + "step": 14915 + }, + { + "epoch": 0.08870967741935484, + "grad_norm": 1.3793766498565674, + "learning_rate": 4.903554674310175e-05, + "loss": 4.8748, + "step": 14916 + }, + { + "epoch": 0.08871562470263583, + "grad_norm": 1.2097266912460327, + "learning_rate": 4.9035418250305314e-05, + "loss": 4.9695, + "step": 14917 + }, + { + "epoch": 0.08872157198591683, + "grad_norm": 1.5097788572311401, + "learning_rate": 4.903528974911837e-05, + "loss": 4.9205, + "step": 14918 + }, + { + "epoch": 0.08872751926919784, + "grad_norm": 1.474219560623169, + "learning_rate": 4.903516123954095e-05, + "loss": 4.9382, + "step": 14919 + }, + { + "epoch": 0.08873346655247882, + "grad_norm": 1.4695779085159302, + "learning_rate": 4.903503272157311e-05, + "loss": 5.1486, + "step": 14920 + }, + { + "epoch": 0.08873941383575983, + "grad_norm": 1.6874669790267944, + "learning_rate": 4.903490419521488e-05, + "loss": 5.6441, + "step": 14921 + }, + { + "epoch": 0.08874536111904083, + "grad_norm": 1.5862348079681396, + "learning_rate": 4.903477566046632e-05, + "loss": 5.1457, + "step": 14922 + }, + { + "epoch": 0.08875130840232182, + "grad_norm": 1.5781593322753906, + "learning_rate": 4.903464711732747e-05, + "loss": 4.915, + "step": 14923 + }, + { + "epoch": 0.08875725568560282, + "grad_norm": 1.5252950191497803, + "learning_rate": 4.903451856579837e-05, + "loss": 5.0672, + "step": 14924 + }, + { + "epoch": 0.08876320296888382, + "grad_norm": 1.575958013534546, + "learning_rate": 4.9034390005879065e-05, + "loss": 4.9914, + "step": 14925 + }, + { + "epoch": 0.08876915025216481, + "grad_norm": 1.3837618827819824, + "learning_rate": 4.90342614375696e-05, + "loss": 5.1778, + "step": 14926 + }, + { + "epoch": 0.08877509753544581, + "grad_norm": 1.4716275930404663, + "learning_rate": 4.9034132860870036e-05, + "loss": 5.2625, + "step": 14927 + }, + { + "epoch": 0.08878104481872681, + "grad_norm": 1.2883623838424683, + "learning_rate": 4.90340042757804e-05, + "loss": 5.2357, + "step": 14928 + }, + { + "epoch": 0.0887869921020078, + "grad_norm": 1.521010398864746, + "learning_rate": 4.9033875682300736e-05, + "loss": 5.4941, + "step": 14929 + }, + { + "epoch": 0.0887929393852888, + "grad_norm": 1.5457875728607178, + "learning_rate": 4.903374708043109e-05, + "loss": 5.3108, + "step": 14930 + }, + { + "epoch": 0.0887988866685698, + "grad_norm": 1.4583250284194946, + "learning_rate": 4.903361847017152e-05, + "loss": 5.425, + "step": 14931 + }, + { + "epoch": 0.08880483395185079, + "grad_norm": 1.561854362487793, + "learning_rate": 4.903348985152206e-05, + "loss": 5.4267, + "step": 14932 + }, + { + "epoch": 0.0888107812351318, + "grad_norm": 1.6274350881576538, + "learning_rate": 4.9033361224482756e-05, + "loss": 5.3266, + "step": 14933 + }, + { + "epoch": 0.08881672851841278, + "grad_norm": 1.3476616144180298, + "learning_rate": 4.903323258905366e-05, + "loss": 5.248, + "step": 14934 + }, + { + "epoch": 0.08882267580169378, + "grad_norm": 1.3584541082382202, + "learning_rate": 4.90331039452348e-05, + "loss": 5.3101, + "step": 14935 + }, + { + "epoch": 0.08882862308497479, + "grad_norm": 1.5269302129745483, + "learning_rate": 4.903297529302624e-05, + "loss": 5.3451, + "step": 14936 + }, + { + "epoch": 0.08883457036825577, + "grad_norm": 1.5320923328399658, + "learning_rate": 4.903284663242801e-05, + "loss": 5.4289, + "step": 14937 + }, + { + "epoch": 0.08884051765153678, + "grad_norm": 1.5647650957107544, + "learning_rate": 4.9032717963440166e-05, + "loss": 5.2925, + "step": 14938 + }, + { + "epoch": 0.08884646493481778, + "grad_norm": 1.3379693031311035, + "learning_rate": 4.9032589286062744e-05, + "loss": 5.3314, + "step": 14939 + }, + { + "epoch": 0.08885241221809877, + "grad_norm": 1.5872068405151367, + "learning_rate": 4.90324606002958e-05, + "loss": 5.3521, + "step": 14940 + }, + { + "epoch": 0.08885835950137977, + "grad_norm": 1.473799228668213, + "learning_rate": 4.9032331906139373e-05, + "loss": 5.3697, + "step": 14941 + }, + { + "epoch": 0.08886430678466077, + "grad_norm": 2.2111928462982178, + "learning_rate": 4.90322032035935e-05, + "loss": 5.0139, + "step": 14942 + }, + { + "epoch": 0.08887025406794176, + "grad_norm": 1.386910319328308, + "learning_rate": 4.903207449265824e-05, + "loss": 5.3982, + "step": 14943 + }, + { + "epoch": 0.08887620135122276, + "grad_norm": 1.4972623586654663, + "learning_rate": 4.9031945773333624e-05, + "loss": 5.4207, + "step": 14944 + }, + { + "epoch": 0.08888214863450376, + "grad_norm": 1.6061536073684692, + "learning_rate": 4.903181704561971e-05, + "loss": 5.4265, + "step": 14945 + }, + { + "epoch": 0.08888809591778475, + "grad_norm": 1.5003243684768677, + "learning_rate": 4.903168830951653e-05, + "loss": 5.2323, + "step": 14946 + }, + { + "epoch": 0.08889404320106575, + "grad_norm": 1.4466320276260376, + "learning_rate": 4.9031559565024144e-05, + "loss": 5.3054, + "step": 14947 + }, + { + "epoch": 0.08889999048434676, + "grad_norm": 1.4495269060134888, + "learning_rate": 4.9031430812142584e-05, + "loss": 5.2725, + "step": 14948 + }, + { + "epoch": 0.08890593776762774, + "grad_norm": 1.2909798622131348, + "learning_rate": 4.9031302050871896e-05, + "loss": 5.13, + "step": 14949 + }, + { + "epoch": 0.08891188505090875, + "grad_norm": 1.368377685546875, + "learning_rate": 4.903117328121214e-05, + "loss": 5.0471, + "step": 14950 + }, + { + "epoch": 0.08891783233418975, + "grad_norm": 1.3496042490005493, + "learning_rate": 4.903104450316334e-05, + "loss": 5.1209, + "step": 14951 + }, + { + "epoch": 0.08892377961747074, + "grad_norm": 1.593047022819519, + "learning_rate": 4.9030915716725554e-05, + "loss": 5.2551, + "step": 14952 + }, + { + "epoch": 0.08892972690075174, + "grad_norm": 1.3550326824188232, + "learning_rate": 4.903078692189882e-05, + "loss": 5.2543, + "step": 14953 + }, + { + "epoch": 0.08893567418403274, + "grad_norm": 1.4302785396575928, + "learning_rate": 4.903065811868319e-05, + "loss": 5.2828, + "step": 14954 + }, + { + "epoch": 0.08894162146731373, + "grad_norm": 1.578244686126709, + "learning_rate": 4.903052930707871e-05, + "loss": 5.0593, + "step": 14955 + }, + { + "epoch": 0.08894756875059473, + "grad_norm": 1.248634696006775, + "learning_rate": 4.903040048708541e-05, + "loss": 5.0644, + "step": 14956 + }, + { + "epoch": 0.08895351603387573, + "grad_norm": 1.4040237665176392, + "learning_rate": 4.903027165870336e-05, + "loss": 5.0951, + "step": 14957 + }, + { + "epoch": 0.08895946331715672, + "grad_norm": 1.1941477060317993, + "learning_rate": 4.903014282193258e-05, + "loss": 5.0298, + "step": 14958 + }, + { + "epoch": 0.08896541060043772, + "grad_norm": 1.4292995929718018, + "learning_rate": 4.9030013976773125e-05, + "loss": 5.1567, + "step": 14959 + }, + { + "epoch": 0.08897135788371872, + "grad_norm": 1.4789859056472778, + "learning_rate": 4.902988512322505e-05, + "loss": 5.2172, + "step": 14960 + }, + { + "epoch": 0.08897730516699971, + "grad_norm": 2.160266876220703, + "learning_rate": 4.9029756261288376e-05, + "loss": 5.3458, + "step": 14961 + }, + { + "epoch": 0.08898325245028071, + "grad_norm": 1.8164606094360352, + "learning_rate": 4.902962739096317e-05, + "loss": 5.2795, + "step": 14962 + }, + { + "epoch": 0.0889891997335617, + "grad_norm": 2.0879664421081543, + "learning_rate": 4.902949851224947e-05, + "loss": 5.595, + "step": 14963 + }, + { + "epoch": 0.0889951470168427, + "grad_norm": 2.59543514251709, + "learning_rate": 4.9029369625147324e-05, + "loss": 5.3626, + "step": 14964 + }, + { + "epoch": 0.0890010943001237, + "grad_norm": 2.0679430961608887, + "learning_rate": 4.9029240729656764e-05, + "loss": 5.4222, + "step": 14965 + }, + { + "epoch": 0.0890070415834047, + "grad_norm": 1.90644109249115, + "learning_rate": 4.902911182577785e-05, + "loss": 6.1042, + "step": 14966 + }, + { + "epoch": 0.0890129888666857, + "grad_norm": 1.8565638065338135, + "learning_rate": 4.9028982913510626e-05, + "loss": 6.0312, + "step": 14967 + }, + { + "epoch": 0.0890189361499667, + "grad_norm": 1.717623233795166, + "learning_rate": 4.902885399285512e-05, + "loss": 5.794, + "step": 14968 + }, + { + "epoch": 0.08902488343324769, + "grad_norm": 2.2094457149505615, + "learning_rate": 4.90287250638114e-05, + "loss": 5.2517, + "step": 14969 + }, + { + "epoch": 0.08903083071652869, + "grad_norm": 2.2559561729431152, + "learning_rate": 4.9028596126379493e-05, + "loss": 5.2155, + "step": 14970 + }, + { + "epoch": 0.08903677799980969, + "grad_norm": 2.5394740104675293, + "learning_rate": 4.9028467180559455e-05, + "loss": 5.0829, + "step": 14971 + }, + { + "epoch": 0.08904272528309068, + "grad_norm": 1.9542546272277832, + "learning_rate": 4.902833822635133e-05, + "loss": 4.856, + "step": 14972 + }, + { + "epoch": 0.08904867256637168, + "grad_norm": 1.9541314840316772, + "learning_rate": 4.9028209263755154e-05, + "loss": 4.9858, + "step": 14973 + }, + { + "epoch": 0.08905461984965268, + "grad_norm": 1.8625229597091675, + "learning_rate": 4.9028080292770986e-05, + "loss": 4.976, + "step": 14974 + }, + { + "epoch": 0.08906056713293367, + "grad_norm": 2.254417657852173, + "learning_rate": 4.9027951313398855e-05, + "loss": 4.9765, + "step": 14975 + }, + { + "epoch": 0.08906651441621467, + "grad_norm": 2.3143160343170166, + "learning_rate": 4.902782232563882e-05, + "loss": 4.9562, + "step": 14976 + }, + { + "epoch": 0.08907246169949568, + "grad_norm": 2.320388078689575, + "learning_rate": 4.902769332949092e-05, + "loss": 4.9988, + "step": 14977 + }, + { + "epoch": 0.08907840898277666, + "grad_norm": 2.378101348876953, + "learning_rate": 4.90275643249552e-05, + "loss": 5.0869, + "step": 14978 + }, + { + "epoch": 0.08908435626605767, + "grad_norm": 2.5663437843322754, + "learning_rate": 4.90274353120317e-05, + "loss": 5.1124, + "step": 14979 + }, + { + "epoch": 0.08909030354933867, + "grad_norm": 2.2866733074188232, + "learning_rate": 4.902730629072048e-05, + "loss": 5.0564, + "step": 14980 + }, + { + "epoch": 0.08909625083261966, + "grad_norm": 2.060153007507324, + "learning_rate": 4.902717726102157e-05, + "loss": 4.9419, + "step": 14981 + }, + { + "epoch": 0.08910219811590066, + "grad_norm": 2.1555984020233154, + "learning_rate": 4.902704822293502e-05, + "loss": 4.6593, + "step": 14982 + }, + { + "epoch": 0.08910814539918166, + "grad_norm": 2.2045845985412598, + "learning_rate": 4.902691917646088e-05, + "loss": 4.6824, + "step": 14983 + }, + { + "epoch": 0.08911409268246265, + "grad_norm": 2.2891733646392822, + "learning_rate": 4.9026790121599185e-05, + "loss": 4.6378, + "step": 14984 + }, + { + "epoch": 0.08912003996574365, + "grad_norm": 2.0503318309783936, + "learning_rate": 4.902666105834999e-05, + "loss": 4.8051, + "step": 14985 + }, + { + "epoch": 0.08912598724902465, + "grad_norm": 2.2125399112701416, + "learning_rate": 4.9026531986713336e-05, + "loss": 5.0773, + "step": 14986 + }, + { + "epoch": 0.08913193453230564, + "grad_norm": 2.1177804470062256, + "learning_rate": 4.902640290668927e-05, + "loss": 5.0995, + "step": 14987 + }, + { + "epoch": 0.08913788181558664, + "grad_norm": 2.1028857231140137, + "learning_rate": 4.902627381827783e-05, + "loss": 4.3883, + "step": 14988 + }, + { + "epoch": 0.08914382909886764, + "grad_norm": 1.9426429271697998, + "learning_rate": 4.9026144721479065e-05, + "loss": 4.6539, + "step": 14989 + }, + { + "epoch": 0.08914977638214863, + "grad_norm": 2.2325892448425293, + "learning_rate": 4.902601561629302e-05, + "loss": 4.731, + "step": 14990 + }, + { + "epoch": 0.08915572366542963, + "grad_norm": 2.3903300762176514, + "learning_rate": 4.9025886502719756e-05, + "loss": 4.5786, + "step": 14991 + }, + { + "epoch": 0.08916167094871062, + "grad_norm": 2.368431806564331, + "learning_rate": 4.9025757380759284e-05, + "loss": 4.8904, + "step": 14992 + }, + { + "epoch": 0.08916761823199162, + "grad_norm": 2.1727442741394043, + "learning_rate": 4.902562825041168e-05, + "loss": 4.6276, + "step": 14993 + }, + { + "epoch": 0.08917356551527263, + "grad_norm": 2.2038626670837402, + "learning_rate": 4.9025499111676975e-05, + "loss": 4.7451, + "step": 14994 + }, + { + "epoch": 0.08917951279855361, + "grad_norm": 2.3933217525482178, + "learning_rate": 4.902536996455521e-05, + "loss": 4.8129, + "step": 14995 + }, + { + "epoch": 0.08918546008183462, + "grad_norm": 2.473212242126465, + "learning_rate": 4.902524080904645e-05, + "loss": 4.6171, + "step": 14996 + }, + { + "epoch": 0.08919140736511562, + "grad_norm": 2.2226645946502686, + "learning_rate": 4.902511164515071e-05, + "loss": 4.3847, + "step": 14997 + }, + { + "epoch": 0.0891973546483966, + "grad_norm": 2.0874104499816895, + "learning_rate": 4.9024982472868065e-05, + "loss": 4.801, + "step": 14998 + }, + { + "epoch": 0.08920330193167761, + "grad_norm": 1.9831374883651733, + "learning_rate": 4.902485329219854e-05, + "loss": 4.8995, + "step": 14999 + }, + { + "epoch": 0.08920924921495861, + "grad_norm": 2.1662073135375977, + "learning_rate": 4.9024724103142196e-05, + "loss": 4.7221, + "step": 15000 + }, + { + "epoch": 0.0892151964982396, + "grad_norm": 2.335336685180664, + "learning_rate": 4.902459490569906e-05, + "loss": 4.5051, + "step": 15001 + }, + { + "epoch": 0.0892211437815206, + "grad_norm": 2.2647337913513184, + "learning_rate": 4.902446569986919e-05, + "loss": 4.5274, + "step": 15002 + }, + { + "epoch": 0.0892270910648016, + "grad_norm": 2.1781129837036133, + "learning_rate": 4.9024336485652625e-05, + "loss": 4.5661, + "step": 15003 + }, + { + "epoch": 0.08923303834808259, + "grad_norm": 2.6452128887176514, + "learning_rate": 4.902420726304941e-05, + "loss": 5.0087, + "step": 15004 + }, + { + "epoch": 0.0892389856313636, + "grad_norm": 2.10276460647583, + "learning_rate": 4.90240780320596e-05, + "loss": 4.5003, + "step": 15005 + }, + { + "epoch": 0.0892449329146446, + "grad_norm": 2.1297876834869385, + "learning_rate": 4.902394879268323e-05, + "loss": 4.7603, + "step": 15006 + }, + { + "epoch": 0.08925088019792558, + "grad_norm": 2.288257122039795, + "learning_rate": 4.902381954492033e-05, + "loss": 4.7433, + "step": 15007 + }, + { + "epoch": 0.08925682748120659, + "grad_norm": 2.422492742538452, + "learning_rate": 4.902369028877098e-05, + "loss": 4.7823, + "step": 15008 + }, + { + "epoch": 0.08926277476448759, + "grad_norm": 2.4264109134674072, + "learning_rate": 4.9023561024235215e-05, + "loss": 4.9725, + "step": 15009 + }, + { + "epoch": 0.08926872204776858, + "grad_norm": 2.191776752471924, + "learning_rate": 4.902343175131307e-05, + "loss": 4.7893, + "step": 15010 + }, + { + "epoch": 0.08927466933104958, + "grad_norm": 2.0434861183166504, + "learning_rate": 4.9023302470004584e-05, + "loss": 5.3321, + "step": 15011 + }, + { + "epoch": 0.08928061661433058, + "grad_norm": 2.3108692169189453, + "learning_rate": 4.902317318030981e-05, + "loss": 4.848, + "step": 15012 + }, + { + "epoch": 0.08928656389761157, + "grad_norm": 1.8814477920532227, + "learning_rate": 4.9023043882228805e-05, + "loss": 4.9666, + "step": 15013 + }, + { + "epoch": 0.08929251118089257, + "grad_norm": 1.7109707593917847, + "learning_rate": 4.902291457576159e-05, + "loss": 5.0996, + "step": 15014 + }, + { + "epoch": 0.08929845846417357, + "grad_norm": 1.4246928691864014, + "learning_rate": 4.902278526090823e-05, + "loss": 5.1413, + "step": 15015 + }, + { + "epoch": 0.08930440574745456, + "grad_norm": 1.5714298486709595, + "learning_rate": 4.902265593766877e-05, + "loss": 5.4028, + "step": 15016 + }, + { + "epoch": 0.08931035303073556, + "grad_norm": 1.4553309679031372, + "learning_rate": 4.902252660604324e-05, + "loss": 5.1903, + "step": 15017 + }, + { + "epoch": 0.08931630031401656, + "grad_norm": 1.3266233205795288, + "learning_rate": 4.902239726603171e-05, + "loss": 5.1093, + "step": 15018 + }, + { + "epoch": 0.08932224759729755, + "grad_norm": 1.3145966529846191, + "learning_rate": 4.902226791763419e-05, + "loss": 5.0704, + "step": 15019 + }, + { + "epoch": 0.08932819488057855, + "grad_norm": 1.4367384910583496, + "learning_rate": 4.9022138560850754e-05, + "loss": 4.9669, + "step": 15020 + }, + { + "epoch": 0.08933414216385954, + "grad_norm": 1.4239497184753418, + "learning_rate": 4.902200919568144e-05, + "loss": 5.1035, + "step": 15021 + }, + { + "epoch": 0.08934008944714054, + "grad_norm": 1.323853611946106, + "learning_rate": 4.9021879822126284e-05, + "loss": 4.989, + "step": 15022 + }, + { + "epoch": 0.08934603673042155, + "grad_norm": 1.596498727798462, + "learning_rate": 4.9021750440185345e-05, + "loss": 5.0445, + "step": 15023 + }, + { + "epoch": 0.08935198401370253, + "grad_norm": 1.3866841793060303, + "learning_rate": 4.902162104985865e-05, + "loss": 4.9832, + "step": 15024 + }, + { + "epoch": 0.08935793129698354, + "grad_norm": 1.2495089769363403, + "learning_rate": 4.9021491651146265e-05, + "loss": 5.1337, + "step": 15025 + }, + { + "epoch": 0.08936387858026454, + "grad_norm": 1.2082443237304688, + "learning_rate": 4.902136224404822e-05, + "loss": 5.1038, + "step": 15026 + }, + { + "epoch": 0.08936982586354553, + "grad_norm": 1.5153082609176636, + "learning_rate": 4.9021232828564564e-05, + "loss": 5.122, + "step": 15027 + }, + { + "epoch": 0.08937577314682653, + "grad_norm": 1.5340677499771118, + "learning_rate": 4.902110340469536e-05, + "loss": 5.2675, + "step": 15028 + }, + { + "epoch": 0.08938172043010753, + "grad_norm": 1.9367091655731201, + "learning_rate": 4.9020973972440624e-05, + "loss": 5.4528, + "step": 15029 + }, + { + "epoch": 0.08938766771338852, + "grad_norm": 1.7637518644332886, + "learning_rate": 4.902084453180041e-05, + "loss": 5.4686, + "step": 15030 + }, + { + "epoch": 0.08939361499666952, + "grad_norm": 1.668220043182373, + "learning_rate": 4.902071508277477e-05, + "loss": 5.5889, + "step": 15031 + }, + { + "epoch": 0.08939956227995052, + "grad_norm": 2.0754151344299316, + "learning_rate": 4.902058562536375e-05, + "loss": 5.7398, + "step": 15032 + }, + { + "epoch": 0.08940550956323151, + "grad_norm": 1.9756910800933838, + "learning_rate": 4.902045615956739e-05, + "loss": 5.528, + "step": 15033 + }, + { + "epoch": 0.08941145684651251, + "grad_norm": 1.6614958047866821, + "learning_rate": 4.9020326685385735e-05, + "loss": 5.5761, + "step": 15034 + }, + { + "epoch": 0.08941740412979352, + "grad_norm": 2.0193135738372803, + "learning_rate": 4.902019720281884e-05, + "loss": 5.1836, + "step": 15035 + }, + { + "epoch": 0.0894233514130745, + "grad_norm": 2.164290428161621, + "learning_rate": 4.9020067711866735e-05, + "loss": 5.0216, + "step": 15036 + }, + { + "epoch": 0.0894292986963555, + "grad_norm": 2.3957648277282715, + "learning_rate": 4.901993821252947e-05, + "loss": 4.9631, + "step": 15037 + }, + { + "epoch": 0.08943524597963651, + "grad_norm": 2.204258680343628, + "learning_rate": 4.90198087048071e-05, + "loss": 4.774, + "step": 15038 + }, + { + "epoch": 0.0894411932629175, + "grad_norm": 1.7879102230072021, + "learning_rate": 4.9019679188699666e-05, + "loss": 5.716, + "step": 15039 + }, + { + "epoch": 0.0894471405461985, + "grad_norm": 1.6019984483718872, + "learning_rate": 4.9019549664207196e-05, + "loss": 5.3657, + "step": 15040 + }, + { + "epoch": 0.0894530878294795, + "grad_norm": 2.079514741897583, + "learning_rate": 4.901942013132976e-05, + "loss": 5.0526, + "step": 15041 + }, + { + "epoch": 0.08945903511276049, + "grad_norm": 1.9381201267242432, + "learning_rate": 4.901929059006739e-05, + "loss": 4.9585, + "step": 15042 + }, + { + "epoch": 0.08946498239604149, + "grad_norm": 1.6514472961425781, + "learning_rate": 4.9019161040420134e-05, + "loss": 5.4721, + "step": 15043 + }, + { + "epoch": 0.08947092967932249, + "grad_norm": 1.7294371128082275, + "learning_rate": 4.901903148238804e-05, + "loss": 5.4401, + "step": 15044 + }, + { + "epoch": 0.08947687696260348, + "grad_norm": 1.7769347429275513, + "learning_rate": 4.901890191597115e-05, + "loss": 5.4324, + "step": 15045 + }, + { + "epoch": 0.08948282424588448, + "grad_norm": 1.6517225503921509, + "learning_rate": 4.9018772341169505e-05, + "loss": 5.2967, + "step": 15046 + }, + { + "epoch": 0.08948877152916548, + "grad_norm": 1.5310052633285522, + "learning_rate": 4.901864275798316e-05, + "loss": 5.4017, + "step": 15047 + }, + { + "epoch": 0.08949471881244647, + "grad_norm": 1.9703199863433838, + "learning_rate": 4.9018513166412146e-05, + "loss": 4.9813, + "step": 15048 + }, + { + "epoch": 0.08950066609572747, + "grad_norm": 1.991087555885315, + "learning_rate": 4.901838356645652e-05, + "loss": 5.2911, + "step": 15049 + }, + { + "epoch": 0.08950661337900846, + "grad_norm": 1.7992926836013794, + "learning_rate": 4.9018253958116334e-05, + "loss": 5.2996, + "step": 15050 + }, + { + "epoch": 0.08951256066228946, + "grad_norm": 1.5164752006530762, + "learning_rate": 4.901812434139161e-05, + "loss": 5.8002, + "step": 15051 + }, + { + "epoch": 0.08951850794557047, + "grad_norm": 1.8143075704574585, + "learning_rate": 4.9017994716282415e-05, + "loss": 5.241, + "step": 15052 + }, + { + "epoch": 0.08952445522885145, + "grad_norm": 1.9806342124938965, + "learning_rate": 4.9017865082788785e-05, + "loss": 5.3656, + "step": 15053 + }, + { + "epoch": 0.08953040251213246, + "grad_norm": 2.403789520263672, + "learning_rate": 4.901773544091077e-05, + "loss": 5.1024, + "step": 15054 + }, + { + "epoch": 0.08953634979541346, + "grad_norm": 1.5903408527374268, + "learning_rate": 4.90176057906484e-05, + "loss": 5.3849, + "step": 15055 + }, + { + "epoch": 0.08954229707869445, + "grad_norm": 1.764125943183899, + "learning_rate": 4.901747613200175e-05, + "loss": 5.0757, + "step": 15056 + }, + { + "epoch": 0.08954824436197545, + "grad_norm": 2.1031241416931152, + "learning_rate": 4.901734646497084e-05, + "loss": 5.2114, + "step": 15057 + }, + { + "epoch": 0.08955419164525645, + "grad_norm": 1.9965282678604126, + "learning_rate": 4.901721678955571e-05, + "loss": 5.1136, + "step": 15058 + }, + { + "epoch": 0.08956013892853744, + "grad_norm": 1.9062676429748535, + "learning_rate": 4.9017087105756434e-05, + "loss": 4.9166, + "step": 15059 + }, + { + "epoch": 0.08956608621181844, + "grad_norm": 2.0963199138641357, + "learning_rate": 4.901695741357303e-05, + "loss": 4.7587, + "step": 15060 + }, + { + "epoch": 0.08957203349509944, + "grad_norm": 1.7062407732009888, + "learning_rate": 4.901682771300556e-05, + "loss": 5.3046, + "step": 15061 + }, + { + "epoch": 0.08957798077838043, + "grad_norm": 1.574013352394104, + "learning_rate": 4.9016698004054065e-05, + "loss": 5.3007, + "step": 15062 + }, + { + "epoch": 0.08958392806166143, + "grad_norm": 1.7540260553359985, + "learning_rate": 4.9016568286718586e-05, + "loss": 5.5824, + "step": 15063 + }, + { + "epoch": 0.08958987534494244, + "grad_norm": 1.4875624179840088, + "learning_rate": 4.901643856099917e-05, + "loss": 5.4569, + "step": 15064 + }, + { + "epoch": 0.08959582262822342, + "grad_norm": 1.6023603677749634, + "learning_rate": 4.901630882689586e-05, + "loss": 5.5397, + "step": 15065 + }, + { + "epoch": 0.08960176991150443, + "grad_norm": 2.1851913928985596, + "learning_rate": 4.9016179084408706e-05, + "loss": 4.9882, + "step": 15066 + }, + { + "epoch": 0.08960771719478543, + "grad_norm": 1.4636015892028809, + "learning_rate": 4.901604933353776e-05, + "loss": 5.4568, + "step": 15067 + }, + { + "epoch": 0.08961366447806642, + "grad_norm": 2.6841142177581787, + "learning_rate": 4.901591957428305e-05, + "loss": 5.8365, + "step": 15068 + }, + { + "epoch": 0.08961961176134742, + "grad_norm": 2.2015743255615234, + "learning_rate": 4.9015789806644643e-05, + "loss": 5.4798, + "step": 15069 + }, + { + "epoch": 0.08962555904462842, + "grad_norm": 2.3934903144836426, + "learning_rate": 4.901566003062256e-05, + "loss": 5.3355, + "step": 15070 + }, + { + "epoch": 0.08963150632790941, + "grad_norm": 2.418919801712036, + "learning_rate": 4.9015530246216866e-05, + "loss": 5.2546, + "step": 15071 + }, + { + "epoch": 0.08963745361119041, + "grad_norm": 2.2773303985595703, + "learning_rate": 4.90154004534276e-05, + "loss": 5.3306, + "step": 15072 + }, + { + "epoch": 0.08964340089447141, + "grad_norm": 2.09413743019104, + "learning_rate": 4.9015270652254796e-05, + "loss": 5.4715, + "step": 15073 + }, + { + "epoch": 0.0896493481777524, + "grad_norm": 1.8905339241027832, + "learning_rate": 4.901514084269852e-05, + "loss": 5.2248, + "step": 15074 + }, + { + "epoch": 0.0896552954610334, + "grad_norm": 1.7001872062683105, + "learning_rate": 4.9015011024758794e-05, + "loss": 5.2869, + "step": 15075 + }, + { + "epoch": 0.0896612427443144, + "grad_norm": 1.7953561544418335, + "learning_rate": 4.901488119843568e-05, + "loss": 5.2027, + "step": 15076 + }, + { + "epoch": 0.08966719002759539, + "grad_norm": 1.8996349573135376, + "learning_rate": 4.9014751363729225e-05, + "loss": 5.8168, + "step": 15077 + }, + { + "epoch": 0.0896731373108764, + "grad_norm": 1.6294323205947876, + "learning_rate": 4.901462152063946e-05, + "loss": 5.0331, + "step": 15078 + }, + { + "epoch": 0.08967908459415738, + "grad_norm": 1.4392082691192627, + "learning_rate": 4.901449166916645e-05, + "loss": 4.9094, + "step": 15079 + }, + { + "epoch": 0.08968503187743838, + "grad_norm": 1.6613532304763794, + "learning_rate": 4.9014361809310216e-05, + "loss": 5.1426, + "step": 15080 + }, + { + "epoch": 0.08969097916071939, + "grad_norm": 1.7502686977386475, + "learning_rate": 4.9014231941070823e-05, + "loss": 5.4298, + "step": 15081 + }, + { + "epoch": 0.08969692644400037, + "grad_norm": 1.9276418685913086, + "learning_rate": 4.9014102064448305e-05, + "loss": 5.8383, + "step": 15082 + }, + { + "epoch": 0.08970287372728138, + "grad_norm": 2.471407651901245, + "learning_rate": 4.901397217944272e-05, + "loss": 6.1879, + "step": 15083 + }, + { + "epoch": 0.08970882101056238, + "grad_norm": 2.0759341716766357, + "learning_rate": 4.90138422860541e-05, + "loss": 6.0929, + "step": 15084 + }, + { + "epoch": 0.08971476829384337, + "grad_norm": 1.6504180431365967, + "learning_rate": 4.9013712384282505e-05, + "loss": 6.0733, + "step": 15085 + }, + { + "epoch": 0.08972071557712437, + "grad_norm": 1.7268849611282349, + "learning_rate": 4.9013582474127965e-05, + "loss": 5.9707, + "step": 15086 + }, + { + "epoch": 0.08972666286040537, + "grad_norm": 1.8029861450195312, + "learning_rate": 4.901345255559053e-05, + "loss": 5.3645, + "step": 15087 + }, + { + "epoch": 0.08973261014368636, + "grad_norm": 1.8240137100219727, + "learning_rate": 4.9013322628670246e-05, + "loss": 5.4201, + "step": 15088 + }, + { + "epoch": 0.08973855742696736, + "grad_norm": 1.799771785736084, + "learning_rate": 4.901319269336716e-05, + "loss": 5.2043, + "step": 15089 + }, + { + "epoch": 0.08974450471024836, + "grad_norm": 1.6271024942398071, + "learning_rate": 4.901306274968131e-05, + "loss": 5.4118, + "step": 15090 + }, + { + "epoch": 0.08975045199352935, + "grad_norm": 1.4443042278289795, + "learning_rate": 4.9012932797612756e-05, + "loss": 5.5921, + "step": 15091 + }, + { + "epoch": 0.08975639927681035, + "grad_norm": 1.7174689769744873, + "learning_rate": 4.9012802837161535e-05, + "loss": 5.5233, + "step": 15092 + }, + { + "epoch": 0.08976234656009136, + "grad_norm": 1.7158472537994385, + "learning_rate": 4.901267286832769e-05, + "loss": 5.9171, + "step": 15093 + }, + { + "epoch": 0.08976829384337234, + "grad_norm": 1.691797137260437, + "learning_rate": 4.9012542891111275e-05, + "loss": 5.6207, + "step": 15094 + }, + { + "epoch": 0.08977424112665335, + "grad_norm": 1.7525362968444824, + "learning_rate": 4.901241290551233e-05, + "loss": 5.3468, + "step": 15095 + }, + { + "epoch": 0.08978018840993435, + "grad_norm": 1.6895235776901245, + "learning_rate": 4.901228291153089e-05, + "loss": 5.3567, + "step": 15096 + }, + { + "epoch": 0.08978613569321534, + "grad_norm": 1.6617051362991333, + "learning_rate": 4.9012152909167015e-05, + "loss": 5.6781, + "step": 15097 + }, + { + "epoch": 0.08979208297649634, + "grad_norm": 1.5234577655792236, + "learning_rate": 4.901202289842075e-05, + "loss": 5.6262, + "step": 15098 + }, + { + "epoch": 0.08979803025977734, + "grad_norm": 2.1545703411102295, + "learning_rate": 4.9011892879292125e-05, + "loss": 5.3112, + "step": 15099 + }, + { + "epoch": 0.08980397754305833, + "grad_norm": 2.246051073074341, + "learning_rate": 4.9011762851781204e-05, + "loss": 5.3783, + "step": 15100 + }, + { + "epoch": 0.08980992482633933, + "grad_norm": 2.000429630279541, + "learning_rate": 4.901163281588802e-05, + "loss": 5.2561, + "step": 15101 + }, + { + "epoch": 0.08981587210962033, + "grad_norm": 2.0881898403167725, + "learning_rate": 4.901150277161263e-05, + "loss": 5.3308, + "step": 15102 + }, + { + "epoch": 0.08982181939290132, + "grad_norm": 2.4498097896575928, + "learning_rate": 4.901137271895506e-05, + "loss": 5.8405, + "step": 15103 + }, + { + "epoch": 0.08982776667618232, + "grad_norm": 2.210160732269287, + "learning_rate": 4.901124265791538e-05, + "loss": 5.5462, + "step": 15104 + }, + { + "epoch": 0.08983371395946332, + "grad_norm": 2.366419553756714, + "learning_rate": 4.9011112588493625e-05, + "loss": 5.4069, + "step": 15105 + }, + { + "epoch": 0.08983966124274431, + "grad_norm": 1.812118649482727, + "learning_rate": 4.901098251068983e-05, + "loss": 5.9549, + "step": 15106 + }, + { + "epoch": 0.08984560852602531, + "grad_norm": 1.6506917476654053, + "learning_rate": 4.901085242450405e-05, + "loss": 5.762, + "step": 15107 + }, + { + "epoch": 0.0898515558093063, + "grad_norm": 1.8076404333114624, + "learning_rate": 4.901072232993633e-05, + "loss": 5.7841, + "step": 15108 + }, + { + "epoch": 0.0898575030925873, + "grad_norm": 2.51157546043396, + "learning_rate": 4.9010592226986716e-05, + "loss": 5.1544, + "step": 15109 + }, + { + "epoch": 0.0898634503758683, + "grad_norm": 1.9424755573272705, + "learning_rate": 4.901046211565526e-05, + "loss": 5.4587, + "step": 15110 + }, + { + "epoch": 0.0898693976591493, + "grad_norm": 1.998506784439087, + "learning_rate": 4.9010331995941995e-05, + "loss": 5.8242, + "step": 15111 + }, + { + "epoch": 0.0898753449424303, + "grad_norm": 1.8947205543518066, + "learning_rate": 4.901020186784697e-05, + "loss": 5.4488, + "step": 15112 + }, + { + "epoch": 0.0898812922257113, + "grad_norm": 1.905993938446045, + "learning_rate": 4.901007173137022e-05, + "loss": 5.3882, + "step": 15113 + }, + { + "epoch": 0.08988723950899229, + "grad_norm": 1.723973274230957, + "learning_rate": 4.900994158651182e-05, + "loss": 5.9411, + "step": 15114 + }, + { + "epoch": 0.08989318679227329, + "grad_norm": 1.747159719467163, + "learning_rate": 4.900981143327179e-05, + "loss": 5.8436, + "step": 15115 + }, + { + "epoch": 0.08989913407555429, + "grad_norm": 1.7400517463684082, + "learning_rate": 4.900968127165018e-05, + "loss": 5.7067, + "step": 15116 + }, + { + "epoch": 0.08990508135883528, + "grad_norm": 1.763750433921814, + "learning_rate": 4.900955110164704e-05, + "loss": 5.6198, + "step": 15117 + }, + { + "epoch": 0.08991102864211628, + "grad_norm": 1.9004894495010376, + "learning_rate": 4.9009420923262416e-05, + "loss": 5.0977, + "step": 15118 + }, + { + "epoch": 0.08991697592539728, + "grad_norm": 1.6853641271591187, + "learning_rate": 4.900929073649635e-05, + "loss": 5.5213, + "step": 15119 + }, + { + "epoch": 0.08992292320867827, + "grad_norm": 1.7032074928283691, + "learning_rate": 4.900916054134889e-05, + "loss": 5.3764, + "step": 15120 + }, + { + "epoch": 0.08992887049195927, + "grad_norm": 1.623089075088501, + "learning_rate": 4.9009030337820084e-05, + "loss": 5.525, + "step": 15121 + }, + { + "epoch": 0.08993481777524027, + "grad_norm": 1.6154295206069946, + "learning_rate": 4.900890012590996e-05, + "loss": 5.7378, + "step": 15122 + }, + { + "epoch": 0.08994076505852126, + "grad_norm": 1.8368462324142456, + "learning_rate": 4.900876990561859e-05, + "loss": 5.4768, + "step": 15123 + }, + { + "epoch": 0.08994671234180227, + "grad_norm": 1.7773829698562622, + "learning_rate": 4.9008639676946e-05, + "loss": 5.419, + "step": 15124 + }, + { + "epoch": 0.08995265962508327, + "grad_norm": 1.625287413597107, + "learning_rate": 4.9008509439892244e-05, + "loss": 5.4727, + "step": 15125 + }, + { + "epoch": 0.08995860690836426, + "grad_norm": 1.6234408617019653, + "learning_rate": 4.9008379194457364e-05, + "loss": 5.413, + "step": 15126 + }, + { + "epoch": 0.08996455419164526, + "grad_norm": 1.7441129684448242, + "learning_rate": 4.900824894064141e-05, + "loss": 5.2681, + "step": 15127 + }, + { + "epoch": 0.08997050147492626, + "grad_norm": 1.8756482601165771, + "learning_rate": 4.900811867844443e-05, + "loss": 5.5319, + "step": 15128 + }, + { + "epoch": 0.08997644875820725, + "grad_norm": 1.9200249910354614, + "learning_rate": 4.900798840786645e-05, + "loss": 4.7499, + "step": 15129 + }, + { + "epoch": 0.08998239604148825, + "grad_norm": 2.4838919639587402, + "learning_rate": 4.900785812890753e-05, + "loss": 5.0713, + "step": 15130 + }, + { + "epoch": 0.08998834332476925, + "grad_norm": 2.1441292762756348, + "learning_rate": 4.900772784156773e-05, + "loss": 4.9425, + "step": 15131 + }, + { + "epoch": 0.08999429060805024, + "grad_norm": 2.0838072299957275, + "learning_rate": 4.9007597545847066e-05, + "loss": 5.0632, + "step": 15132 + }, + { + "epoch": 0.09000023789133124, + "grad_norm": 1.630042314529419, + "learning_rate": 4.90074672417456e-05, + "loss": 5.2275, + "step": 15133 + }, + { + "epoch": 0.09000618517461224, + "grad_norm": 2.336031675338745, + "learning_rate": 4.900733692926338e-05, + "loss": 4.9596, + "step": 15134 + }, + { + "epoch": 0.09001213245789323, + "grad_norm": 2.414837598800659, + "learning_rate": 4.9007206608400446e-05, + "loss": 4.7405, + "step": 15135 + }, + { + "epoch": 0.09001807974117423, + "grad_norm": 2.2872564792633057, + "learning_rate": 4.900707627915684e-05, + "loss": 4.8294, + "step": 15136 + }, + { + "epoch": 0.09002402702445522, + "grad_norm": 2.474933624267578, + "learning_rate": 4.9006945941532615e-05, + "loss": 4.882, + "step": 15137 + }, + { + "epoch": 0.09002997430773622, + "grad_norm": 2.170109987258911, + "learning_rate": 4.900681559552781e-05, + "loss": 4.6778, + "step": 15138 + }, + { + "epoch": 0.09003592159101723, + "grad_norm": 2.1962943077087402, + "learning_rate": 4.900668524114248e-05, + "loss": 4.8201, + "step": 15139 + }, + { + "epoch": 0.09004186887429821, + "grad_norm": 2.46073317527771, + "learning_rate": 4.9006554878376656e-05, + "loss": 4.6929, + "step": 15140 + }, + { + "epoch": 0.09004781615757922, + "grad_norm": 2.4591431617736816, + "learning_rate": 4.90064245072304e-05, + "loss": 4.711, + "step": 15141 + }, + { + "epoch": 0.09005376344086022, + "grad_norm": 2.2225937843322754, + "learning_rate": 4.9006294127703745e-05, + "loss": 5.2556, + "step": 15142 + }, + { + "epoch": 0.0900597107241412, + "grad_norm": 2.3457517623901367, + "learning_rate": 4.900616373979674e-05, + "loss": 5.7773, + "step": 15143 + }, + { + "epoch": 0.09006565800742221, + "grad_norm": 2.226430892944336, + "learning_rate": 4.9006033343509436e-05, + "loss": 5.6364, + "step": 15144 + }, + { + "epoch": 0.09007160529070321, + "grad_norm": 2.1407759189605713, + "learning_rate": 4.900590293884186e-05, + "loss": 5.4202, + "step": 15145 + }, + { + "epoch": 0.0900775525739842, + "grad_norm": 1.7371548414230347, + "learning_rate": 4.9005772525794084e-05, + "loss": 5.5686, + "step": 15146 + }, + { + "epoch": 0.0900834998572652, + "grad_norm": 1.8759154081344604, + "learning_rate": 4.900564210436615e-05, + "loss": 5.4824, + "step": 15147 + }, + { + "epoch": 0.0900894471405462, + "grad_norm": 1.8595685958862305, + "learning_rate": 4.900551167455807e-05, + "loss": 5.6123, + "step": 15148 + }, + { + "epoch": 0.09009539442382719, + "grad_norm": 2.0119471549987793, + "learning_rate": 4.900538123636993e-05, + "loss": 5.5925, + "step": 15149 + }, + { + "epoch": 0.09010134170710819, + "grad_norm": 1.9375147819519043, + "learning_rate": 4.900525078980176e-05, + "loss": 5.5707, + "step": 15150 + }, + { + "epoch": 0.0901072889903892, + "grad_norm": 1.7323594093322754, + "learning_rate": 4.9005120334853595e-05, + "loss": 5.4133, + "step": 15151 + }, + { + "epoch": 0.09011323627367018, + "grad_norm": 1.7680727243423462, + "learning_rate": 4.90049898715255e-05, + "loss": 5.5954, + "step": 15152 + }, + { + "epoch": 0.09011918355695119, + "grad_norm": 1.8436721563339233, + "learning_rate": 4.9004859399817505e-05, + "loss": 5.5689, + "step": 15153 + }, + { + "epoch": 0.09012513084023219, + "grad_norm": 1.8080954551696777, + "learning_rate": 4.9004728919729664e-05, + "loss": 5.5266, + "step": 15154 + }, + { + "epoch": 0.09013107812351318, + "grad_norm": 2.2874748706817627, + "learning_rate": 4.900459843126202e-05, + "loss": 5.1985, + "step": 15155 + }, + { + "epoch": 0.09013702540679418, + "grad_norm": 1.8425899744033813, + "learning_rate": 4.900446793441462e-05, + "loss": 5.2856, + "step": 15156 + }, + { + "epoch": 0.09014297269007518, + "grad_norm": 1.6970654726028442, + "learning_rate": 4.900433742918751e-05, + "loss": 5.8597, + "step": 15157 + }, + { + "epoch": 0.09014891997335617, + "grad_norm": 2.3444008827209473, + "learning_rate": 4.9004206915580726e-05, + "loss": 4.4653, + "step": 15158 + }, + { + "epoch": 0.09015486725663717, + "grad_norm": 2.0390350818634033, + "learning_rate": 4.9004076393594325e-05, + "loss": 4.6565, + "step": 15159 + }, + { + "epoch": 0.09016081453991817, + "grad_norm": 2.0733320713043213, + "learning_rate": 4.900394586322835e-05, + "loss": 4.6052, + "step": 15160 + }, + { + "epoch": 0.09016676182319916, + "grad_norm": 1.9700855016708374, + "learning_rate": 4.9003815324482846e-05, + "loss": 4.7535, + "step": 15161 + }, + { + "epoch": 0.09017270910648016, + "grad_norm": 2.0294783115386963, + "learning_rate": 4.900368477735786e-05, + "loss": 5.4154, + "step": 15162 + }, + { + "epoch": 0.09017865638976116, + "grad_norm": 1.8937848806381226, + "learning_rate": 4.900355422185343e-05, + "loss": 5.3244, + "step": 15163 + }, + { + "epoch": 0.09018460367304215, + "grad_norm": 1.7404329776763916, + "learning_rate": 4.900342365796961e-05, + "loss": 5.887, + "step": 15164 + }, + { + "epoch": 0.09019055095632315, + "grad_norm": 1.5309412479400635, + "learning_rate": 4.9003293085706446e-05, + "loss": 5.4574, + "step": 15165 + }, + { + "epoch": 0.09019649823960414, + "grad_norm": 2.10003662109375, + "learning_rate": 4.9003162505063976e-05, + "loss": 5.2962, + "step": 15166 + }, + { + "epoch": 0.09020244552288514, + "grad_norm": 2.7704551219940186, + "learning_rate": 4.900303191604225e-05, + "loss": 4.6386, + "step": 15167 + }, + { + "epoch": 0.09020839280616615, + "grad_norm": 3.3551974296569824, + "learning_rate": 4.9002901318641314e-05, + "loss": 5.3348, + "step": 15168 + }, + { + "epoch": 0.09021434008944713, + "grad_norm": 2.8300132751464844, + "learning_rate": 4.9002770712861216e-05, + "loss": 5.2031, + "step": 15169 + }, + { + "epoch": 0.09022028737272814, + "grad_norm": 1.77587890625, + "learning_rate": 4.9002640098702005e-05, + "loss": 5.1371, + "step": 15170 + }, + { + "epoch": 0.09022623465600914, + "grad_norm": 1.694191575050354, + "learning_rate": 4.900250947616371e-05, + "loss": 5.7283, + "step": 15171 + }, + { + "epoch": 0.09023218193929013, + "grad_norm": 1.6392415761947632, + "learning_rate": 4.900237884524638e-05, + "loss": 5.3856, + "step": 15172 + }, + { + "epoch": 0.09023812922257113, + "grad_norm": 2.302626371383667, + "learning_rate": 4.900224820595008e-05, + "loss": 5.1007, + "step": 15173 + }, + { + "epoch": 0.09024407650585213, + "grad_norm": 2.296760082244873, + "learning_rate": 4.900211755827484e-05, + "loss": 5.0303, + "step": 15174 + }, + { + "epoch": 0.09025002378913312, + "grad_norm": 2.2914488315582275, + "learning_rate": 4.9001986902220706e-05, + "loss": 5.3176, + "step": 15175 + }, + { + "epoch": 0.09025597107241412, + "grad_norm": 2.084686756134033, + "learning_rate": 4.900185623778774e-05, + "loss": 5.2028, + "step": 15176 + }, + { + "epoch": 0.09026191835569512, + "grad_norm": 1.9465001821517944, + "learning_rate": 4.9001725564975953e-05, + "loss": 4.661, + "step": 15177 + }, + { + "epoch": 0.09026786563897611, + "grad_norm": 2.926347494125366, + "learning_rate": 4.900159488378542e-05, + "loss": 4.4579, + "step": 15178 + }, + { + "epoch": 0.09027381292225711, + "grad_norm": 2.6047539710998535, + "learning_rate": 4.900146419421619e-05, + "loss": 4.5486, + "step": 15179 + }, + { + "epoch": 0.09027976020553811, + "grad_norm": 2.4737868309020996, + "learning_rate": 4.9001333496268274e-05, + "loss": 4.3661, + "step": 15180 + }, + { + "epoch": 0.0902857074888191, + "grad_norm": 2.075547456741333, + "learning_rate": 4.900120278994176e-05, + "loss": 4.3157, + "step": 15181 + }, + { + "epoch": 0.0902916547721001, + "grad_norm": 2.509284019470215, + "learning_rate": 4.900107207523666e-05, + "loss": 4.2558, + "step": 15182 + }, + { + "epoch": 0.09029760205538111, + "grad_norm": 2.4345662593841553, + "learning_rate": 4.9000941352153046e-05, + "loss": 4.2932, + "step": 15183 + }, + { + "epoch": 0.0903035493386621, + "grad_norm": 2.214146137237549, + "learning_rate": 4.9000810620690945e-05, + "loss": 4.6953, + "step": 15184 + }, + { + "epoch": 0.0903094966219431, + "grad_norm": 2.197709083557129, + "learning_rate": 4.900067988085041e-05, + "loss": 4.7138, + "step": 15185 + }, + { + "epoch": 0.0903154439052241, + "grad_norm": 2.0381791591644287, + "learning_rate": 4.900054913263148e-05, + "loss": 6.1924, + "step": 15186 + }, + { + "epoch": 0.09032139118850509, + "grad_norm": 1.7017699480056763, + "learning_rate": 4.900041837603422e-05, + "loss": 6.1646, + "step": 15187 + }, + { + "epoch": 0.09032733847178609, + "grad_norm": 1.5804365873336792, + "learning_rate": 4.9000287611058645e-05, + "loss": 6.1757, + "step": 15188 + }, + { + "epoch": 0.09033328575506709, + "grad_norm": 1.6158896684646606, + "learning_rate": 4.9000156837704836e-05, + "loss": 6.1136, + "step": 15189 + }, + { + "epoch": 0.09033923303834808, + "grad_norm": 1.9524257183074951, + "learning_rate": 4.90000260559728e-05, + "loss": 5.43, + "step": 15190 + }, + { + "epoch": 0.09034518032162908, + "grad_norm": 1.835134744644165, + "learning_rate": 4.899989526586261e-05, + "loss": 6.0223, + "step": 15191 + }, + { + "epoch": 0.09035112760491008, + "grad_norm": 1.7213332653045654, + "learning_rate": 4.899976446737432e-05, + "loss": 5.7823, + "step": 15192 + }, + { + "epoch": 0.09035707488819107, + "grad_norm": 1.8744465112686157, + "learning_rate": 4.899963366050795e-05, + "loss": 5.0549, + "step": 15193 + }, + { + "epoch": 0.09036302217147207, + "grad_norm": 1.800979495048523, + "learning_rate": 4.899950284526355e-05, + "loss": 5.0726, + "step": 15194 + }, + { + "epoch": 0.09036896945475306, + "grad_norm": 1.7476063966751099, + "learning_rate": 4.899937202164118e-05, + "loss": 4.9177, + "step": 15195 + }, + { + "epoch": 0.09037491673803406, + "grad_norm": 1.5107455253601074, + "learning_rate": 4.899924118964087e-05, + "loss": 5.1873, + "step": 15196 + }, + { + "epoch": 0.09038086402131507, + "grad_norm": 1.4630497694015503, + "learning_rate": 4.899911034926267e-05, + "loss": 4.9166, + "step": 15197 + }, + { + "epoch": 0.09038681130459605, + "grad_norm": 1.519824743270874, + "learning_rate": 4.899897950050664e-05, + "loss": 4.9084, + "step": 15198 + }, + { + "epoch": 0.09039275858787706, + "grad_norm": 1.480298399925232, + "learning_rate": 4.899884864337281e-05, + "loss": 4.8724, + "step": 15199 + }, + { + "epoch": 0.09039870587115806, + "grad_norm": 1.549485445022583, + "learning_rate": 4.8998717777861224e-05, + "loss": 4.8378, + "step": 15200 + }, + { + "epoch": 0.09040465315443905, + "grad_norm": 1.6650373935699463, + "learning_rate": 4.8998586903971936e-05, + "loss": 4.9478, + "step": 15201 + }, + { + "epoch": 0.09041060043772005, + "grad_norm": 1.5880005359649658, + "learning_rate": 4.899845602170499e-05, + "loss": 4.7952, + "step": 15202 + }, + { + "epoch": 0.09041654772100105, + "grad_norm": 1.5553892850875854, + "learning_rate": 4.899832513106043e-05, + "loss": 4.9303, + "step": 15203 + }, + { + "epoch": 0.09042249500428204, + "grad_norm": 1.5907729864120483, + "learning_rate": 4.899819423203831e-05, + "loss": 4.7951, + "step": 15204 + }, + { + "epoch": 0.09042844228756304, + "grad_norm": 1.5885943174362183, + "learning_rate": 4.899806332463866e-05, + "loss": 4.8896, + "step": 15205 + }, + { + "epoch": 0.09043438957084404, + "grad_norm": 1.7483280897140503, + "learning_rate": 4.899793240886154e-05, + "loss": 5.6137, + "step": 15206 + }, + { + "epoch": 0.09044033685412503, + "grad_norm": 1.7883373498916626, + "learning_rate": 4.8997801484706984e-05, + "loss": 5.7183, + "step": 15207 + }, + { + "epoch": 0.09044628413740603, + "grad_norm": 1.7988712787628174, + "learning_rate": 4.8997670552175044e-05, + "loss": 5.7979, + "step": 15208 + }, + { + "epoch": 0.09045223142068703, + "grad_norm": 2.1793367862701416, + "learning_rate": 4.899753961126577e-05, + "loss": 5.3549, + "step": 15209 + }, + { + "epoch": 0.09045817870396802, + "grad_norm": 2.117983341217041, + "learning_rate": 4.8997408661979194e-05, + "loss": 5.1934, + "step": 15210 + }, + { + "epoch": 0.09046412598724902, + "grad_norm": 2.1799557209014893, + "learning_rate": 4.899727770431538e-05, + "loss": 5.2521, + "step": 15211 + }, + { + "epoch": 0.09047007327053003, + "grad_norm": 2.117403745651245, + "learning_rate": 4.8997146738274355e-05, + "loss": 5.3379, + "step": 15212 + }, + { + "epoch": 0.09047602055381102, + "grad_norm": 1.59669828414917, + "learning_rate": 4.899701576385619e-05, + "loss": 5.375, + "step": 15213 + }, + { + "epoch": 0.09048196783709202, + "grad_norm": 1.6929266452789307, + "learning_rate": 4.8996884781060907e-05, + "loss": 5.9243, + "step": 15214 + }, + { + "epoch": 0.09048791512037302, + "grad_norm": 1.8353838920593262, + "learning_rate": 4.899675378988855e-05, + "loss": 5.9216, + "step": 15215 + }, + { + "epoch": 0.09049386240365401, + "grad_norm": 1.6468323469161987, + "learning_rate": 4.899662279033918e-05, + "loss": 6.0171, + "step": 15216 + }, + { + "epoch": 0.09049980968693501, + "grad_norm": 1.4748890399932861, + "learning_rate": 4.899649178241284e-05, + "loss": 5.6775, + "step": 15217 + }, + { + "epoch": 0.09050575697021601, + "grad_norm": 1.8783589601516724, + "learning_rate": 4.8996360766109576e-05, + "loss": 5.7625, + "step": 15218 + }, + { + "epoch": 0.090511704253497, + "grad_norm": 1.7860721349716187, + "learning_rate": 4.8996229741429416e-05, + "loss": 5.7512, + "step": 15219 + }, + { + "epoch": 0.090517651536778, + "grad_norm": 1.7337830066680908, + "learning_rate": 4.899609870837243e-05, + "loss": 5.8233, + "step": 15220 + }, + { + "epoch": 0.090523598820059, + "grad_norm": 1.9256298542022705, + "learning_rate": 4.899596766693865e-05, + "loss": 5.8586, + "step": 15221 + }, + { + "epoch": 0.09052954610333999, + "grad_norm": 1.814205288887024, + "learning_rate": 4.8995836617128135e-05, + "loss": 5.4852, + "step": 15222 + }, + { + "epoch": 0.090535493386621, + "grad_norm": 1.8664608001708984, + "learning_rate": 4.899570555894091e-05, + "loss": 5.6847, + "step": 15223 + }, + { + "epoch": 0.09054144066990198, + "grad_norm": 1.8377459049224854, + "learning_rate": 4.899557449237704e-05, + "loss": 5.8869, + "step": 15224 + }, + { + "epoch": 0.09054738795318298, + "grad_norm": 1.788875937461853, + "learning_rate": 4.899544341743656e-05, + "loss": 5.4372, + "step": 15225 + }, + { + "epoch": 0.09055333523646399, + "grad_norm": 1.8490506410598755, + "learning_rate": 4.899531233411951e-05, + "loss": 6.1163, + "step": 15226 + }, + { + "epoch": 0.09055928251974497, + "grad_norm": 2.14841628074646, + "learning_rate": 4.8995181242425955e-05, + "loss": 6.1154, + "step": 15227 + }, + { + "epoch": 0.09056522980302598, + "grad_norm": 2.051154851913452, + "learning_rate": 4.899505014235593e-05, + "loss": 4.9326, + "step": 15228 + }, + { + "epoch": 0.09057117708630698, + "grad_norm": 2.071126937866211, + "learning_rate": 4.899491903390948e-05, + "loss": 4.8831, + "step": 15229 + }, + { + "epoch": 0.09057712436958797, + "grad_norm": 2.0155231952667236, + "learning_rate": 4.899478791708665e-05, + "loss": 4.87, + "step": 15230 + }, + { + "epoch": 0.09058307165286897, + "grad_norm": 1.946815013885498, + "learning_rate": 4.89946567918875e-05, + "loss": 4.8139, + "step": 15231 + }, + { + "epoch": 0.09058901893614997, + "grad_norm": 1.9526349306106567, + "learning_rate": 4.899452565831204e-05, + "loss": 4.7618, + "step": 15232 + }, + { + "epoch": 0.09059496621943096, + "grad_norm": 2.0434954166412354, + "learning_rate": 4.8994394516360355e-05, + "loss": 4.7617, + "step": 15233 + }, + { + "epoch": 0.09060091350271196, + "grad_norm": 2.0964083671569824, + "learning_rate": 4.8994263366032466e-05, + "loss": 4.6298, + "step": 15234 + }, + { + "epoch": 0.09060686078599296, + "grad_norm": 2.0333590507507324, + "learning_rate": 4.899413220732843e-05, + "loss": 4.6419, + "step": 15235 + }, + { + "epoch": 0.09061280806927395, + "grad_norm": 2.076993703842163, + "learning_rate": 4.89940010402483e-05, + "loss": 4.6163, + "step": 15236 + }, + { + "epoch": 0.09061875535255495, + "grad_norm": 1.767774224281311, + "learning_rate": 4.89938698647921e-05, + "loss": 5.2418, + "step": 15237 + }, + { + "epoch": 0.09062470263583595, + "grad_norm": 1.8380626440048218, + "learning_rate": 4.899373868095989e-05, + "loss": 5.3304, + "step": 15238 + }, + { + "epoch": 0.09063064991911694, + "grad_norm": 1.7332574129104614, + "learning_rate": 4.8993607488751716e-05, + "loss": 5.3528, + "step": 15239 + }, + { + "epoch": 0.09063659720239794, + "grad_norm": 1.8473124504089355, + "learning_rate": 4.8993476288167614e-05, + "loss": 5.5801, + "step": 15240 + }, + { + "epoch": 0.09064254448567895, + "grad_norm": 2.299206256866455, + "learning_rate": 4.899334507920765e-05, + "loss": 5.308, + "step": 15241 + }, + { + "epoch": 0.09064849176895994, + "grad_norm": 1.945417046546936, + "learning_rate": 4.899321386187185e-05, + "loss": 4.8894, + "step": 15242 + }, + { + "epoch": 0.09065443905224094, + "grad_norm": 2.328246831893921, + "learning_rate": 4.899308263616027e-05, + "loss": 5.0332, + "step": 15243 + }, + { + "epoch": 0.09066038633552194, + "grad_norm": 2.194546699523926, + "learning_rate": 4.899295140207295e-05, + "loss": 4.8891, + "step": 15244 + }, + { + "epoch": 0.09066633361880293, + "grad_norm": 2.078903913497925, + "learning_rate": 4.899282015960994e-05, + "loss": 5.0327, + "step": 15245 + }, + { + "epoch": 0.09067228090208393, + "grad_norm": 2.2129557132720947, + "learning_rate": 4.8992688908771285e-05, + "loss": 4.8806, + "step": 15246 + }, + { + "epoch": 0.09067822818536493, + "grad_norm": 2.3200979232788086, + "learning_rate": 4.8992557649557026e-05, + "loss": 4.9961, + "step": 15247 + }, + { + "epoch": 0.09068417546864592, + "grad_norm": 1.5829685926437378, + "learning_rate": 4.899242638196722e-05, + "loss": 5.4238, + "step": 15248 + }, + { + "epoch": 0.09069012275192692, + "grad_norm": 1.9085135459899902, + "learning_rate": 4.89922951060019e-05, + "loss": 5.0338, + "step": 15249 + }, + { + "epoch": 0.09069607003520792, + "grad_norm": 2.3000802993774414, + "learning_rate": 4.899216382166112e-05, + "loss": 4.9529, + "step": 15250 + }, + { + "epoch": 0.09070201731848891, + "grad_norm": 2.1610753536224365, + "learning_rate": 4.899203252894492e-05, + "loss": 4.9373, + "step": 15251 + }, + { + "epoch": 0.09070796460176991, + "grad_norm": 2.2821414470672607, + "learning_rate": 4.899190122785336e-05, + "loss": 5.2032, + "step": 15252 + }, + { + "epoch": 0.0907139118850509, + "grad_norm": 2.226741075515747, + "learning_rate": 4.899176991838646e-05, + "loss": 4.9354, + "step": 15253 + }, + { + "epoch": 0.0907198591683319, + "grad_norm": 2.0117716789245605, + "learning_rate": 4.899163860054429e-05, + "loss": 5.1179, + "step": 15254 + }, + { + "epoch": 0.0907258064516129, + "grad_norm": 1.6551730632781982, + "learning_rate": 4.8991507274326886e-05, + "loss": 5.6428, + "step": 15255 + }, + { + "epoch": 0.0907317537348939, + "grad_norm": 1.5236784219741821, + "learning_rate": 4.89913759397343e-05, + "loss": 5.4088, + "step": 15256 + }, + { + "epoch": 0.0907377010181749, + "grad_norm": 1.542356252670288, + "learning_rate": 4.899124459676656e-05, + "loss": 5.3383, + "step": 15257 + }, + { + "epoch": 0.0907436483014559, + "grad_norm": 1.5694434642791748, + "learning_rate": 4.899111324542374e-05, + "loss": 5.5202, + "step": 15258 + }, + { + "epoch": 0.09074959558473689, + "grad_norm": 1.459039568901062, + "learning_rate": 4.8990981885705856e-05, + "loss": 5.3481, + "step": 15259 + }, + { + "epoch": 0.09075554286801789, + "grad_norm": 1.4624565839767456, + "learning_rate": 4.899085051761297e-05, + "loss": 5.343, + "step": 15260 + }, + { + "epoch": 0.09076149015129889, + "grad_norm": 1.2748361825942993, + "learning_rate": 4.899071914114513e-05, + "loss": 5.1925, + "step": 15261 + }, + { + "epoch": 0.09076743743457988, + "grad_norm": 1.3813046216964722, + "learning_rate": 4.899058775630237e-05, + "loss": 4.9712, + "step": 15262 + }, + { + "epoch": 0.09077338471786088, + "grad_norm": 1.349108099937439, + "learning_rate": 4.8990456363084756e-05, + "loss": 4.9562, + "step": 15263 + }, + { + "epoch": 0.09077933200114188, + "grad_norm": 1.4744555950164795, + "learning_rate": 4.8990324961492316e-05, + "loss": 5.0014, + "step": 15264 + }, + { + "epoch": 0.09078527928442287, + "grad_norm": 1.4227643013000488, + "learning_rate": 4.8990193551525105e-05, + "loss": 5.076, + "step": 15265 + }, + { + "epoch": 0.09079122656770387, + "grad_norm": 1.4344059228897095, + "learning_rate": 4.8990062133183164e-05, + "loss": 5.2212, + "step": 15266 + }, + { + "epoch": 0.09079717385098487, + "grad_norm": 1.5858408212661743, + "learning_rate": 4.8989930706466534e-05, + "loss": 5.1893, + "step": 15267 + }, + { + "epoch": 0.09080312113426586, + "grad_norm": 1.6398282051086426, + "learning_rate": 4.898979927137527e-05, + "loss": 5.034, + "step": 15268 + }, + { + "epoch": 0.09080906841754686, + "grad_norm": 1.4295551776885986, + "learning_rate": 4.8989667827909416e-05, + "loss": 5.2761, + "step": 15269 + }, + { + "epoch": 0.09081501570082787, + "grad_norm": 1.4313840866088867, + "learning_rate": 4.898953637606902e-05, + "loss": 5.183, + "step": 15270 + }, + { + "epoch": 0.09082096298410886, + "grad_norm": 1.2977478504180908, + "learning_rate": 4.898940491585412e-05, + "loss": 5.1148, + "step": 15271 + }, + { + "epoch": 0.09082691026738986, + "grad_norm": 1.6052992343902588, + "learning_rate": 4.898927344726477e-05, + "loss": 5.3767, + "step": 15272 + }, + { + "epoch": 0.09083285755067086, + "grad_norm": 1.3184257745742798, + "learning_rate": 4.898914197030101e-05, + "loss": 5.3465, + "step": 15273 + }, + { + "epoch": 0.09083880483395185, + "grad_norm": 1.292985439300537, + "learning_rate": 4.898901048496289e-05, + "loss": 5.2478, + "step": 15274 + }, + { + "epoch": 0.09084475211723285, + "grad_norm": 1.1660702228546143, + "learning_rate": 4.898887899125045e-05, + "loss": 5.2655, + "step": 15275 + }, + { + "epoch": 0.09085069940051385, + "grad_norm": 1.2271296977996826, + "learning_rate": 4.8988747489163746e-05, + "loss": 5.2001, + "step": 15276 + }, + { + "epoch": 0.09085664668379484, + "grad_norm": 1.2237215042114258, + "learning_rate": 4.898861597870281e-05, + "loss": 5.213, + "step": 15277 + }, + { + "epoch": 0.09086259396707584, + "grad_norm": 1.3682539463043213, + "learning_rate": 4.898848445986771e-05, + "loss": 5.2174, + "step": 15278 + }, + { + "epoch": 0.09086854125035684, + "grad_norm": 1.2321406602859497, + "learning_rate": 4.8988352932658466e-05, + "loss": 5.1424, + "step": 15279 + }, + { + "epoch": 0.09087448853363783, + "grad_norm": 1.285792350769043, + "learning_rate": 4.898822139707514e-05, + "loss": 5.1438, + "step": 15280 + }, + { + "epoch": 0.09088043581691883, + "grad_norm": 1.137921690940857, + "learning_rate": 4.898808985311778e-05, + "loss": 5.159, + "step": 15281 + }, + { + "epoch": 0.09088638310019982, + "grad_norm": 1.2261563539505005, + "learning_rate": 4.898795830078641e-05, + "loss": 5.1176, + "step": 15282 + }, + { + "epoch": 0.09089233038348082, + "grad_norm": 1.1642104387283325, + "learning_rate": 4.89878267400811e-05, + "loss": 5.0887, + "step": 15283 + }, + { + "epoch": 0.09089827766676183, + "grad_norm": 1.3699917793273926, + "learning_rate": 4.898769517100189e-05, + "loss": 5.0048, + "step": 15284 + }, + { + "epoch": 0.09090422495004281, + "grad_norm": 1.6375452280044556, + "learning_rate": 4.898756359354882e-05, + "loss": 4.6914, + "step": 15285 + }, + { + "epoch": 0.09091017223332382, + "grad_norm": 1.5404956340789795, + "learning_rate": 4.8987432007721944e-05, + "loss": 4.8266, + "step": 15286 + }, + { + "epoch": 0.09091611951660482, + "grad_norm": 1.6747840642929077, + "learning_rate": 4.89873004135213e-05, + "loss": 4.697, + "step": 15287 + }, + { + "epoch": 0.0909220667998858, + "grad_norm": 1.3908432722091675, + "learning_rate": 4.8987168810946935e-05, + "loss": 4.9327, + "step": 15288 + }, + { + "epoch": 0.09092801408316681, + "grad_norm": 1.4933167695999146, + "learning_rate": 4.89870371999989e-05, + "loss": 4.6153, + "step": 15289 + }, + { + "epoch": 0.09093396136644781, + "grad_norm": 1.6259129047393799, + "learning_rate": 4.8986905580677234e-05, + "loss": 4.533, + "step": 15290 + }, + { + "epoch": 0.0909399086497288, + "grad_norm": 1.3692474365234375, + "learning_rate": 4.898677395298199e-05, + "loss": 4.6246, + "step": 15291 + }, + { + "epoch": 0.0909458559330098, + "grad_norm": 1.4951711893081665, + "learning_rate": 4.8986642316913214e-05, + "loss": 4.6677, + "step": 15292 + }, + { + "epoch": 0.0909518032162908, + "grad_norm": 1.5491467714309692, + "learning_rate": 4.8986510672470946e-05, + "loss": 4.9271, + "step": 15293 + }, + { + "epoch": 0.09095775049957179, + "grad_norm": 1.6902397871017456, + "learning_rate": 4.8986379019655235e-05, + "loss": 4.6467, + "step": 15294 + }, + { + "epoch": 0.09096369778285279, + "grad_norm": 1.5122796297073364, + "learning_rate": 4.898624735846613e-05, + "loss": 4.7103, + "step": 15295 + }, + { + "epoch": 0.0909696450661338, + "grad_norm": 1.5287622213363647, + "learning_rate": 4.898611568890367e-05, + "loss": 4.7461, + "step": 15296 + }, + { + "epoch": 0.09097559234941478, + "grad_norm": 1.4649391174316406, + "learning_rate": 4.898598401096791e-05, + "loss": 5.2472, + "step": 15297 + }, + { + "epoch": 0.09098153963269578, + "grad_norm": 1.7621572017669678, + "learning_rate": 4.898585232465889e-05, + "loss": 4.6864, + "step": 15298 + }, + { + "epoch": 0.09098748691597679, + "grad_norm": 1.6371783018112183, + "learning_rate": 4.898572062997665e-05, + "loss": 4.6091, + "step": 15299 + }, + { + "epoch": 0.09099343419925777, + "grad_norm": 1.28440523147583, + "learning_rate": 4.898558892692125e-05, + "loss": 5.0019, + "step": 15300 + }, + { + "epoch": 0.09099938148253878, + "grad_norm": 1.4753130674362183, + "learning_rate": 4.898545721549272e-05, + "loss": 5.3848, + "step": 15301 + }, + { + "epoch": 0.09100532876581978, + "grad_norm": 1.4267481565475464, + "learning_rate": 4.898532549569112e-05, + "loss": 5.1787, + "step": 15302 + }, + { + "epoch": 0.09101127604910077, + "grad_norm": 1.4724546670913696, + "learning_rate": 4.898519376751649e-05, + "loss": 5.2581, + "step": 15303 + }, + { + "epoch": 0.09101722333238177, + "grad_norm": 1.4417310953140259, + "learning_rate": 4.8985062030968875e-05, + "loss": 5.4829, + "step": 15304 + }, + { + "epoch": 0.09102317061566277, + "grad_norm": 1.1160683631896973, + "learning_rate": 4.898493028604833e-05, + "loss": 5.5287, + "step": 15305 + }, + { + "epoch": 0.09102911789894376, + "grad_norm": 1.2454899549484253, + "learning_rate": 4.8984798532754884e-05, + "loss": 5.2984, + "step": 15306 + }, + { + "epoch": 0.09103506518222476, + "grad_norm": 1.5732132196426392, + "learning_rate": 4.8984666771088596e-05, + "loss": 5.4998, + "step": 15307 + }, + { + "epoch": 0.09104101246550576, + "grad_norm": 1.6430423259735107, + "learning_rate": 4.8984535001049515e-05, + "loss": 5.4636, + "step": 15308 + }, + { + "epoch": 0.09104695974878675, + "grad_norm": 1.245288372039795, + "learning_rate": 4.898440322263768e-05, + "loss": 5.2874, + "step": 15309 + }, + { + "epoch": 0.09105290703206775, + "grad_norm": 1.4186644554138184, + "learning_rate": 4.898427143585312e-05, + "loss": 5.2275, + "step": 15310 + }, + { + "epoch": 0.09105885431534876, + "grad_norm": 1.3040757179260254, + "learning_rate": 4.8984139640695915e-05, + "loss": 5.2864, + "step": 15311 + }, + { + "epoch": 0.09106480159862974, + "grad_norm": 1.4106818437576294, + "learning_rate": 4.898400783716609e-05, + "loss": 5.5897, + "step": 15312 + }, + { + "epoch": 0.09107074888191075, + "grad_norm": 1.5596522092819214, + "learning_rate": 4.89838760252637e-05, + "loss": 5.4827, + "step": 15313 + }, + { + "epoch": 0.09107669616519173, + "grad_norm": 2.2576634883880615, + "learning_rate": 4.898374420498878e-05, + "loss": 5.1471, + "step": 15314 + }, + { + "epoch": 0.09108264344847274, + "grad_norm": 1.2749537229537964, + "learning_rate": 4.898361237634139e-05, + "loss": 5.2688, + "step": 15315 + }, + { + "epoch": 0.09108859073175374, + "grad_norm": 1.4171591997146606, + "learning_rate": 4.8983480539321566e-05, + "loss": 5.0796, + "step": 15316 + }, + { + "epoch": 0.09109453801503473, + "grad_norm": 1.2233314514160156, + "learning_rate": 4.898334869392936e-05, + "loss": 5.0992, + "step": 15317 + }, + { + "epoch": 0.09110048529831573, + "grad_norm": 1.4817143678665161, + "learning_rate": 4.8983216840164804e-05, + "loss": 5.2354, + "step": 15318 + }, + { + "epoch": 0.09110643258159673, + "grad_norm": 1.442088007926941, + "learning_rate": 4.898308497802796e-05, + "loss": 5.2177, + "step": 15319 + }, + { + "epoch": 0.09111237986487772, + "grad_norm": 1.3996042013168335, + "learning_rate": 4.898295310751887e-05, + "loss": 4.9938, + "step": 15320 + }, + { + "epoch": 0.09111832714815872, + "grad_norm": 1.3091521263122559, + "learning_rate": 4.8982821228637576e-05, + "loss": 4.9916, + "step": 15321 + }, + { + "epoch": 0.09112427443143972, + "grad_norm": 1.4807448387145996, + "learning_rate": 4.898268934138414e-05, + "loss": 4.9833, + "step": 15322 + }, + { + "epoch": 0.09113022171472071, + "grad_norm": 1.5992671251296997, + "learning_rate": 4.898255744575858e-05, + "loss": 5.1007, + "step": 15323 + }, + { + "epoch": 0.09113616899800171, + "grad_norm": 1.4472523927688599, + "learning_rate": 4.8982425541760954e-05, + "loss": 5.3123, + "step": 15324 + }, + { + "epoch": 0.09114211628128271, + "grad_norm": 1.2865816354751587, + "learning_rate": 4.898229362939132e-05, + "loss": 5.0817, + "step": 15325 + }, + { + "epoch": 0.0911480635645637, + "grad_norm": 1.477144479751587, + "learning_rate": 4.898216170864972e-05, + "loss": 5.1819, + "step": 15326 + }, + { + "epoch": 0.0911540108478447, + "grad_norm": 1.5831303596496582, + "learning_rate": 4.8982029779536184e-05, + "loss": 5.28, + "step": 15327 + }, + { + "epoch": 0.0911599581311257, + "grad_norm": 1.3366963863372803, + "learning_rate": 4.898189784205078e-05, + "loss": 5.3715, + "step": 15328 + }, + { + "epoch": 0.0911659054144067, + "grad_norm": 1.5603365898132324, + "learning_rate": 4.898176589619353e-05, + "loss": 5.2642, + "step": 15329 + }, + { + "epoch": 0.0911718526976877, + "grad_norm": 1.5105326175689697, + "learning_rate": 4.8981633941964506e-05, + "loss": 4.949, + "step": 15330 + }, + { + "epoch": 0.0911777999809687, + "grad_norm": 1.2074800729751587, + "learning_rate": 4.8981501979363734e-05, + "loss": 5.2847, + "step": 15331 + }, + { + "epoch": 0.09118374726424969, + "grad_norm": 1.4356200695037842, + "learning_rate": 4.898137000839127e-05, + "loss": 5.6169, + "step": 15332 + }, + { + "epoch": 0.09118969454753069, + "grad_norm": 1.5015919208526611, + "learning_rate": 4.8981238029047154e-05, + "loss": 5.1135, + "step": 15333 + }, + { + "epoch": 0.09119564183081169, + "grad_norm": 1.4902187585830688, + "learning_rate": 4.8981106041331434e-05, + "loss": 5.4406, + "step": 15334 + }, + { + "epoch": 0.09120158911409268, + "grad_norm": 1.2884581089019775, + "learning_rate": 4.898097404524416e-05, + "loss": 5.3493, + "step": 15335 + }, + { + "epoch": 0.09120753639737368, + "grad_norm": 1.4323054552078247, + "learning_rate": 4.898084204078539e-05, + "loss": 5.0939, + "step": 15336 + }, + { + "epoch": 0.09121348368065468, + "grad_norm": 1.6282861232757568, + "learning_rate": 4.898071002795514e-05, + "loss": 5.1857, + "step": 15337 + }, + { + "epoch": 0.09121943096393567, + "grad_norm": 1.3413678407669067, + "learning_rate": 4.898057800675347e-05, + "loss": 4.9581, + "step": 15338 + }, + { + "epoch": 0.09122537824721667, + "grad_norm": 1.5613822937011719, + "learning_rate": 4.898044597718044e-05, + "loss": 4.6401, + "step": 15339 + }, + { + "epoch": 0.09123132553049768, + "grad_norm": 1.4945799112319946, + "learning_rate": 4.898031393923608e-05, + "loss": 4.6649, + "step": 15340 + }, + { + "epoch": 0.09123727281377866, + "grad_norm": 1.6086750030517578, + "learning_rate": 4.898018189292043e-05, + "loss": 4.5996, + "step": 15341 + }, + { + "epoch": 0.09124322009705967, + "grad_norm": 1.3530272245407104, + "learning_rate": 4.898004983823355e-05, + "loss": 4.6511, + "step": 15342 + }, + { + "epoch": 0.09124916738034065, + "grad_norm": 1.5523587465286255, + "learning_rate": 4.897991777517549e-05, + "loss": 4.8099, + "step": 15343 + }, + { + "epoch": 0.09125511466362166, + "grad_norm": 1.6695882081985474, + "learning_rate": 4.8979785703746286e-05, + "loss": 5.2371, + "step": 15344 + }, + { + "epoch": 0.09126106194690266, + "grad_norm": 1.777717113494873, + "learning_rate": 4.897965362394599e-05, + "loss": 5.373, + "step": 15345 + }, + { + "epoch": 0.09126700923018365, + "grad_norm": 1.2890517711639404, + "learning_rate": 4.8979521535774636e-05, + "loss": 5.3851, + "step": 15346 + }, + { + "epoch": 0.09127295651346465, + "grad_norm": 1.3539687395095825, + "learning_rate": 4.897938943923228e-05, + "loss": 5.1218, + "step": 15347 + }, + { + "epoch": 0.09127890379674565, + "grad_norm": 1.4157010316848755, + "learning_rate": 4.8979257334318974e-05, + "loss": 4.9411, + "step": 15348 + }, + { + "epoch": 0.09128485108002664, + "grad_norm": 1.4856256246566772, + "learning_rate": 4.897912522103475e-05, + "loss": 5.1622, + "step": 15349 + }, + { + "epoch": 0.09129079836330764, + "grad_norm": 1.4729665517807007, + "learning_rate": 4.8978993099379666e-05, + "loss": 5.0901, + "step": 15350 + }, + { + "epoch": 0.09129674564658864, + "grad_norm": 1.376625895500183, + "learning_rate": 4.897886096935376e-05, + "loss": 4.8843, + "step": 15351 + }, + { + "epoch": 0.09130269292986963, + "grad_norm": 1.3019710779190063, + "learning_rate": 4.897872883095708e-05, + "loss": 4.9956, + "step": 15352 + }, + { + "epoch": 0.09130864021315063, + "grad_norm": 1.4751423597335815, + "learning_rate": 4.897859668418968e-05, + "loss": 5.4369, + "step": 15353 + }, + { + "epoch": 0.09131458749643163, + "grad_norm": 1.3563402891159058, + "learning_rate": 4.8978464529051595e-05, + "loss": 5.2071, + "step": 15354 + }, + { + "epoch": 0.09132053477971262, + "grad_norm": 1.7365561723709106, + "learning_rate": 4.8978332365542875e-05, + "loss": 4.8797, + "step": 15355 + }, + { + "epoch": 0.09132648206299362, + "grad_norm": 1.4001792669296265, + "learning_rate": 4.8978200193663565e-05, + "loss": 5.2549, + "step": 15356 + }, + { + "epoch": 0.09133242934627463, + "grad_norm": 1.5568649768829346, + "learning_rate": 4.897806801341371e-05, + "loss": 5.3805, + "step": 15357 + }, + { + "epoch": 0.09133837662955561, + "grad_norm": 1.4169847965240479, + "learning_rate": 4.897793582479337e-05, + "loss": 5.2655, + "step": 15358 + }, + { + "epoch": 0.09134432391283662, + "grad_norm": 1.3992067575454712, + "learning_rate": 4.897780362780258e-05, + "loss": 5.4284, + "step": 15359 + }, + { + "epoch": 0.09135027119611762, + "grad_norm": 1.2274264097213745, + "learning_rate": 4.8977671422441376e-05, + "loss": 5.2443, + "step": 15360 + }, + { + "epoch": 0.09135621847939861, + "grad_norm": 1.4754104614257812, + "learning_rate": 4.897753920870982e-05, + "loss": 5.3438, + "step": 15361 + }, + { + "epoch": 0.09136216576267961, + "grad_norm": 1.3993452787399292, + "learning_rate": 4.897740698660796e-05, + "loss": 5.2396, + "step": 15362 + }, + { + "epoch": 0.09136811304596061, + "grad_norm": 1.2840338945388794, + "learning_rate": 4.897727475613583e-05, + "loss": 5.2912, + "step": 15363 + }, + { + "epoch": 0.0913740603292416, + "grad_norm": 1.5234180688858032, + "learning_rate": 4.8977142517293474e-05, + "loss": 5.4197, + "step": 15364 + }, + { + "epoch": 0.0913800076125226, + "grad_norm": 1.6243525743484497, + "learning_rate": 4.897701027008095e-05, + "loss": 5.4358, + "step": 15365 + }, + { + "epoch": 0.0913859548958036, + "grad_norm": 1.277801513671875, + "learning_rate": 4.8976878014498306e-05, + "loss": 5.2801, + "step": 15366 + }, + { + "epoch": 0.09139190217908459, + "grad_norm": 1.5294082164764404, + "learning_rate": 4.897674575054557e-05, + "loss": 4.8257, + "step": 15367 + }, + { + "epoch": 0.0913978494623656, + "grad_norm": 1.7289122343063354, + "learning_rate": 4.897661347822281e-05, + "loss": 4.8155, + "step": 15368 + }, + { + "epoch": 0.0914037967456466, + "grad_norm": 1.5567346811294556, + "learning_rate": 4.897648119753006e-05, + "loss": 4.8245, + "step": 15369 + }, + { + "epoch": 0.09140974402892758, + "grad_norm": 1.4855397939682007, + "learning_rate": 4.8976348908467365e-05, + "loss": 4.7247, + "step": 15370 + }, + { + "epoch": 0.09141569131220859, + "grad_norm": 1.4355418682098389, + "learning_rate": 4.897621661103477e-05, + "loss": 5.0925, + "step": 15371 + }, + { + "epoch": 0.09142163859548957, + "grad_norm": 1.3165326118469238, + "learning_rate": 4.897608430523233e-05, + "loss": 5.3419, + "step": 15372 + }, + { + "epoch": 0.09142758587877058, + "grad_norm": 1.4930912256240845, + "learning_rate": 4.8975951991060084e-05, + "loss": 5.3267, + "step": 15373 + }, + { + "epoch": 0.09143353316205158, + "grad_norm": 1.2326771020889282, + "learning_rate": 4.897581966851809e-05, + "loss": 5.2902, + "step": 15374 + }, + { + "epoch": 0.09143948044533257, + "grad_norm": 1.1512086391448975, + "learning_rate": 4.897568733760638e-05, + "loss": 5.2362, + "step": 15375 + }, + { + "epoch": 0.09144542772861357, + "grad_norm": 2.2404119968414307, + "learning_rate": 4.8975554998325e-05, + "loss": 5.055, + "step": 15376 + }, + { + "epoch": 0.09145137501189457, + "grad_norm": 1.3026318550109863, + "learning_rate": 4.8975422650674005e-05, + "loss": 5.0192, + "step": 15377 + }, + { + "epoch": 0.09145732229517556, + "grad_norm": 1.5808472633361816, + "learning_rate": 4.897529029465344e-05, + "loss": 5.2429, + "step": 15378 + }, + { + "epoch": 0.09146326957845656, + "grad_norm": 1.5761525630950928, + "learning_rate": 4.897515793026335e-05, + "loss": 4.9123, + "step": 15379 + }, + { + "epoch": 0.09146921686173756, + "grad_norm": 1.488484501838684, + "learning_rate": 4.897502555750377e-05, + "loss": 4.8463, + "step": 15380 + }, + { + "epoch": 0.09147516414501855, + "grad_norm": 1.4662736654281616, + "learning_rate": 4.897489317637477e-05, + "loss": 5.3047, + "step": 15381 + }, + { + "epoch": 0.09148111142829955, + "grad_norm": 1.6454370021820068, + "learning_rate": 4.897476078687637e-05, + "loss": 5.2335, + "step": 15382 + }, + { + "epoch": 0.09148705871158055, + "grad_norm": 1.425868034362793, + "learning_rate": 4.8974628389008636e-05, + "loss": 5.2016, + "step": 15383 + }, + { + "epoch": 0.09149300599486154, + "grad_norm": 1.599349021911621, + "learning_rate": 4.8974495982771606e-05, + "loss": 5.4205, + "step": 15384 + }, + { + "epoch": 0.09149895327814254, + "grad_norm": 1.6200257539749146, + "learning_rate": 4.897436356816533e-05, + "loss": 5.5001, + "step": 15385 + }, + { + "epoch": 0.09150490056142355, + "grad_norm": 1.5314574241638184, + "learning_rate": 4.8974231145189844e-05, + "loss": 5.4711, + "step": 15386 + }, + { + "epoch": 0.09151084784470453, + "grad_norm": 1.507489562034607, + "learning_rate": 4.8974098713845206e-05, + "loss": 5.4001, + "step": 15387 + }, + { + "epoch": 0.09151679512798554, + "grad_norm": 1.4561303853988647, + "learning_rate": 4.897396627413146e-05, + "loss": 5.4566, + "step": 15388 + }, + { + "epoch": 0.09152274241126654, + "grad_norm": 1.3273184299468994, + "learning_rate": 4.897383382604865e-05, + "loss": 5.4665, + "step": 15389 + }, + { + "epoch": 0.09152868969454753, + "grad_norm": 1.370138168334961, + "learning_rate": 4.8973701369596814e-05, + "loss": 5.4319, + "step": 15390 + }, + { + "epoch": 0.09153463697782853, + "grad_norm": 1.4831699132919312, + "learning_rate": 4.897356890477601e-05, + "loss": 5.2734, + "step": 15391 + }, + { + "epoch": 0.09154058426110953, + "grad_norm": 1.3152328729629517, + "learning_rate": 4.897343643158629e-05, + "loss": 5.3573, + "step": 15392 + }, + { + "epoch": 0.09154653154439052, + "grad_norm": 1.635460376739502, + "learning_rate": 4.8973303950027684e-05, + "loss": 5.2433, + "step": 15393 + }, + { + "epoch": 0.09155247882767152, + "grad_norm": 1.5252761840820312, + "learning_rate": 4.897317146010024e-05, + "loss": 5.2164, + "step": 15394 + }, + { + "epoch": 0.09155842611095252, + "grad_norm": 1.600043773651123, + "learning_rate": 4.897303896180402e-05, + "loss": 5.4138, + "step": 15395 + }, + { + "epoch": 0.09156437339423351, + "grad_norm": 1.6243258714675903, + "learning_rate": 4.8972906455139056e-05, + "loss": 5.6129, + "step": 15396 + }, + { + "epoch": 0.09157032067751451, + "grad_norm": 1.2726150751113892, + "learning_rate": 4.89727739401054e-05, + "loss": 5.4639, + "step": 15397 + }, + { + "epoch": 0.09157626796079552, + "grad_norm": 2.1045331954956055, + "learning_rate": 4.897264141670309e-05, + "loss": 5.1875, + "step": 15398 + }, + { + "epoch": 0.0915822152440765, + "grad_norm": 2.1204488277435303, + "learning_rate": 4.897250888493218e-05, + "loss": 5.0401, + "step": 15399 + }, + { + "epoch": 0.0915881625273575, + "grad_norm": 1.794190526008606, + "learning_rate": 4.8972376344792716e-05, + "loss": 6.0581, + "step": 15400 + }, + { + "epoch": 0.0915941098106385, + "grad_norm": 2.050788402557373, + "learning_rate": 4.8972243796284746e-05, + "loss": 5.0138, + "step": 15401 + }, + { + "epoch": 0.0916000570939195, + "grad_norm": 2.1165850162506104, + "learning_rate": 4.897211123940831e-05, + "loss": 4.7077, + "step": 15402 + }, + { + "epoch": 0.0916060043772005, + "grad_norm": 1.9797117710113525, + "learning_rate": 4.8971978674163455e-05, + "loss": 4.8248, + "step": 15403 + }, + { + "epoch": 0.09161195166048149, + "grad_norm": 1.922232747077942, + "learning_rate": 4.8971846100550234e-05, + "loss": 4.7655, + "step": 15404 + }, + { + "epoch": 0.09161789894376249, + "grad_norm": 1.7310322523117065, + "learning_rate": 4.897171351856869e-05, + "loss": 5.425, + "step": 15405 + }, + { + "epoch": 0.09162384622704349, + "grad_norm": 1.9186078310012817, + "learning_rate": 4.897158092821887e-05, + "loss": 6.2449, + "step": 15406 + }, + { + "epoch": 0.09162979351032448, + "grad_norm": 1.7470628023147583, + "learning_rate": 4.897144832950081e-05, + "loss": 6.1586, + "step": 15407 + }, + { + "epoch": 0.09163574079360548, + "grad_norm": 1.7828420400619507, + "learning_rate": 4.897131572241457e-05, + "loss": 6.1068, + "step": 15408 + }, + { + "epoch": 0.09164168807688648, + "grad_norm": 1.8831984996795654, + "learning_rate": 4.897118310696019e-05, + "loss": 5.6989, + "step": 15409 + }, + { + "epoch": 0.09164763536016747, + "grad_norm": 1.6138192415237427, + "learning_rate": 4.8971050483137726e-05, + "loss": 5.8222, + "step": 15410 + }, + { + "epoch": 0.09165358264344847, + "grad_norm": 1.6921756267547607, + "learning_rate": 4.897091785094721e-05, + "loss": 5.8559, + "step": 15411 + }, + { + "epoch": 0.09165952992672947, + "grad_norm": 2.007937431335449, + "learning_rate": 4.8970785210388694e-05, + "loss": 5.4523, + "step": 15412 + }, + { + "epoch": 0.09166547721001046, + "grad_norm": 1.8820117712020874, + "learning_rate": 4.8970652561462224e-05, + "loss": 5.6293, + "step": 15413 + }, + { + "epoch": 0.09167142449329146, + "grad_norm": 2.0193300247192383, + "learning_rate": 4.897051990416785e-05, + "loss": 5.8481, + "step": 15414 + }, + { + "epoch": 0.09167737177657247, + "grad_norm": 2.3685405254364014, + "learning_rate": 4.897038723850561e-05, + "loss": 6.2884, + "step": 15415 + }, + { + "epoch": 0.09168331905985345, + "grad_norm": 2.001131534576416, + "learning_rate": 4.897025456447556e-05, + "loss": 5.6747, + "step": 15416 + }, + { + "epoch": 0.09168926634313446, + "grad_norm": 1.9729053974151611, + "learning_rate": 4.897012188207774e-05, + "loss": 5.9019, + "step": 15417 + }, + { + "epoch": 0.09169521362641546, + "grad_norm": 1.7620398998260498, + "learning_rate": 4.896998919131219e-05, + "loss": 5.9498, + "step": 15418 + }, + { + "epoch": 0.09170116090969645, + "grad_norm": 1.6993772983551025, + "learning_rate": 4.896985649217898e-05, + "loss": 5.973, + "step": 15419 + }, + { + "epoch": 0.09170710819297745, + "grad_norm": 1.6905665397644043, + "learning_rate": 4.896972378467813e-05, + "loss": 5.9729, + "step": 15420 + }, + { + "epoch": 0.09171305547625845, + "grad_norm": 1.710838794708252, + "learning_rate": 4.8969591068809706e-05, + "loss": 5.6661, + "step": 15421 + }, + { + "epoch": 0.09171900275953944, + "grad_norm": 1.9235612154006958, + "learning_rate": 4.896945834457374e-05, + "loss": 5.38, + "step": 15422 + }, + { + "epoch": 0.09172495004282044, + "grad_norm": 2.360656976699829, + "learning_rate": 4.896932561197028e-05, + "loss": 5.2199, + "step": 15423 + }, + { + "epoch": 0.09173089732610144, + "grad_norm": 2.403338670730591, + "learning_rate": 4.896919287099938e-05, + "loss": 5.1776, + "step": 15424 + }, + { + "epoch": 0.09173684460938243, + "grad_norm": 1.9474782943725586, + "learning_rate": 4.896906012166108e-05, + "loss": 5.0781, + "step": 15425 + }, + { + "epoch": 0.09174279189266343, + "grad_norm": 1.8974144458770752, + "learning_rate": 4.896892736395543e-05, + "loss": 5.1609, + "step": 15426 + }, + { + "epoch": 0.09174873917594444, + "grad_norm": 2.3854262828826904, + "learning_rate": 4.896879459788247e-05, + "loss": 5.2019, + "step": 15427 + }, + { + "epoch": 0.09175468645922542, + "grad_norm": 2.4181137084960938, + "learning_rate": 4.8968661823442264e-05, + "loss": 5.1216, + "step": 15428 + }, + { + "epoch": 0.09176063374250643, + "grad_norm": 2.266355514526367, + "learning_rate": 4.896852904063484e-05, + "loss": 5.0401, + "step": 15429 + }, + { + "epoch": 0.09176658102578741, + "grad_norm": 2.086296558380127, + "learning_rate": 4.896839624946025e-05, + "loss": 4.8601, + "step": 15430 + }, + { + "epoch": 0.09177252830906842, + "grad_norm": 1.943326473236084, + "learning_rate": 4.896826344991854e-05, + "loss": 4.9978, + "step": 15431 + }, + { + "epoch": 0.09177847559234942, + "grad_norm": 2.0165631771087646, + "learning_rate": 4.896813064200975e-05, + "loss": 5.0379, + "step": 15432 + }, + { + "epoch": 0.0917844228756304, + "grad_norm": 1.7142544984817505, + "learning_rate": 4.896799782573394e-05, + "loss": 5.7101, + "step": 15433 + }, + { + "epoch": 0.09179037015891141, + "grad_norm": 1.9000083208084106, + "learning_rate": 4.896786500109115e-05, + "loss": 5.9536, + "step": 15434 + }, + { + "epoch": 0.09179631744219241, + "grad_norm": 1.6976677179336548, + "learning_rate": 4.8967732168081426e-05, + "loss": 5.4408, + "step": 15435 + }, + { + "epoch": 0.0918022647254734, + "grad_norm": 1.7433068752288818, + "learning_rate": 4.8967599326704815e-05, + "loss": 5.831, + "step": 15436 + }, + { + "epoch": 0.0918082120087544, + "grad_norm": 1.484256625175476, + "learning_rate": 4.896746647696136e-05, + "loss": 5.943, + "step": 15437 + }, + { + "epoch": 0.0918141592920354, + "grad_norm": 2.2480883598327637, + "learning_rate": 4.8967333618851106e-05, + "loss": 5.6634, + "step": 15438 + }, + { + "epoch": 0.09182010657531639, + "grad_norm": 1.3530383110046387, + "learning_rate": 4.896720075237411e-05, + "loss": 5.8981, + "step": 15439 + }, + { + "epoch": 0.09182605385859739, + "grad_norm": 1.451636552810669, + "learning_rate": 4.896706787753041e-05, + "loss": 5.9803, + "step": 15440 + }, + { + "epoch": 0.0918320011418784, + "grad_norm": 1.5904042720794678, + "learning_rate": 4.896693499432006e-05, + "loss": 5.9692, + "step": 15441 + }, + { + "epoch": 0.09183794842515938, + "grad_norm": 1.3971885442733765, + "learning_rate": 4.896680210274309e-05, + "loss": 5.8612, + "step": 15442 + }, + { + "epoch": 0.09184389570844038, + "grad_norm": 1.325842022895813, + "learning_rate": 4.8966669202799564e-05, + "loss": 5.9081, + "step": 15443 + }, + { + "epoch": 0.09184984299172139, + "grad_norm": 1.4639033079147339, + "learning_rate": 4.8966536294489515e-05, + "loss": 5.8395, + "step": 15444 + }, + { + "epoch": 0.09185579027500237, + "grad_norm": 1.248425006866455, + "learning_rate": 4.896640337781301e-05, + "loss": 5.9016, + "step": 15445 + }, + { + "epoch": 0.09186173755828338, + "grad_norm": 1.4250134229660034, + "learning_rate": 4.896627045277007e-05, + "loss": 5.815, + "step": 15446 + }, + { + "epoch": 0.09186768484156438, + "grad_norm": 1.9178589582443237, + "learning_rate": 4.896613751936075e-05, + "loss": 5.9092, + "step": 15447 + }, + { + "epoch": 0.09187363212484537, + "grad_norm": 1.9218472242355347, + "learning_rate": 4.896600457758511e-05, + "loss": 5.7151, + "step": 15448 + }, + { + "epoch": 0.09187957940812637, + "grad_norm": 1.7698949575424194, + "learning_rate": 4.896587162744317e-05, + "loss": 5.709, + "step": 15449 + }, + { + "epoch": 0.09188552669140737, + "grad_norm": 2.5047290325164795, + "learning_rate": 4.8965738668935e-05, + "loss": 5.5417, + "step": 15450 + }, + { + "epoch": 0.09189147397468836, + "grad_norm": 1.9855560064315796, + "learning_rate": 4.896560570206065e-05, + "loss": 5.9572, + "step": 15451 + }, + { + "epoch": 0.09189742125796936, + "grad_norm": 1.8577516078948975, + "learning_rate": 4.896547272682014e-05, + "loss": 4.8775, + "step": 15452 + }, + { + "epoch": 0.09190336854125036, + "grad_norm": 1.8830385208129883, + "learning_rate": 4.896533974321353e-05, + "loss": 4.8617, + "step": 15453 + }, + { + "epoch": 0.09190931582453135, + "grad_norm": 1.5114052295684814, + "learning_rate": 4.896520675124087e-05, + "loss": 4.9485, + "step": 15454 + }, + { + "epoch": 0.09191526310781235, + "grad_norm": 1.6233285665512085, + "learning_rate": 4.8965073750902205e-05, + "loss": 5.1098, + "step": 15455 + }, + { + "epoch": 0.09192121039109336, + "grad_norm": 1.6900150775909424, + "learning_rate": 4.896494074219758e-05, + "loss": 6.025, + "step": 15456 + }, + { + "epoch": 0.09192715767437434, + "grad_norm": 1.3984570503234863, + "learning_rate": 4.8964807725127046e-05, + "loss": 5.888, + "step": 15457 + }, + { + "epoch": 0.09193310495765535, + "grad_norm": 1.7069528102874756, + "learning_rate": 4.896467469969064e-05, + "loss": 5.6435, + "step": 15458 + }, + { + "epoch": 0.09193905224093633, + "grad_norm": 1.641513705253601, + "learning_rate": 4.896454166588842e-05, + "loss": 5.5641, + "step": 15459 + }, + { + "epoch": 0.09194499952421734, + "grad_norm": 1.8448737859725952, + "learning_rate": 4.896440862372042e-05, + "loss": 5.5673, + "step": 15460 + }, + { + "epoch": 0.09195094680749834, + "grad_norm": 1.7696945667266846, + "learning_rate": 4.8964275573186694e-05, + "loss": 5.4383, + "step": 15461 + }, + { + "epoch": 0.09195689409077933, + "grad_norm": 2.7951743602752686, + "learning_rate": 4.8964142514287285e-05, + "loss": 4.2996, + "step": 15462 + }, + { + "epoch": 0.09196284137406033, + "grad_norm": 2.5503883361816406, + "learning_rate": 4.8964009447022246e-05, + "loss": 4.2864, + "step": 15463 + }, + { + "epoch": 0.09196878865734133, + "grad_norm": 2.2069225311279297, + "learning_rate": 4.896387637139161e-05, + "loss": 4.3818, + "step": 15464 + }, + { + "epoch": 0.09197473594062232, + "grad_norm": 2.34734845161438, + "learning_rate": 4.8963743287395444e-05, + "loss": 4.2951, + "step": 15465 + }, + { + "epoch": 0.09198068322390332, + "grad_norm": 2.2955567836761475, + "learning_rate": 4.896361019503378e-05, + "loss": 4.3349, + "step": 15466 + }, + { + "epoch": 0.09198663050718432, + "grad_norm": 2.3519480228424072, + "learning_rate": 4.8963477094306666e-05, + "loss": 4.2685, + "step": 15467 + }, + { + "epoch": 0.09199257779046531, + "grad_norm": 2.3862032890319824, + "learning_rate": 4.896334398521415e-05, + "loss": 4.1333, + "step": 15468 + }, + { + "epoch": 0.09199852507374631, + "grad_norm": 2.1290738582611084, + "learning_rate": 4.896321086775627e-05, + "loss": 4.7918, + "step": 15469 + }, + { + "epoch": 0.09200447235702731, + "grad_norm": 2.2130253314971924, + "learning_rate": 4.8963077741933095e-05, + "loss": 5.208, + "step": 15470 + }, + { + "epoch": 0.0920104196403083, + "grad_norm": 2.063810110092163, + "learning_rate": 4.896294460774464e-05, + "loss": 5.1891, + "step": 15471 + }, + { + "epoch": 0.0920163669235893, + "grad_norm": 2.068791627883911, + "learning_rate": 4.8962811465190984e-05, + "loss": 5.2855, + "step": 15472 + }, + { + "epoch": 0.0920223142068703, + "grad_norm": 1.8504056930541992, + "learning_rate": 4.896267831427215e-05, + "loss": 5.0159, + "step": 15473 + }, + { + "epoch": 0.0920282614901513, + "grad_norm": 2.150820255279541, + "learning_rate": 4.89625451549882e-05, + "loss": 5.7728, + "step": 15474 + }, + { + "epoch": 0.0920342087734323, + "grad_norm": 2.3655643463134766, + "learning_rate": 4.8962411987339165e-05, + "loss": 5.4863, + "step": 15475 + }, + { + "epoch": 0.0920401560567133, + "grad_norm": 1.509820818901062, + "learning_rate": 4.8962278811325105e-05, + "loss": 5.5682, + "step": 15476 + }, + { + "epoch": 0.09204610333999429, + "grad_norm": 1.8581949472427368, + "learning_rate": 4.896214562694605e-05, + "loss": 5.6875, + "step": 15477 + }, + { + "epoch": 0.09205205062327529, + "grad_norm": 2.028116464614868, + "learning_rate": 4.8962012434202075e-05, + "loss": 5.3495, + "step": 15478 + }, + { + "epoch": 0.09205799790655629, + "grad_norm": 1.9395058155059814, + "learning_rate": 4.89618792330932e-05, + "loss": 5.5616, + "step": 15479 + }, + { + "epoch": 0.09206394518983728, + "grad_norm": 1.9281854629516602, + "learning_rate": 4.896174602361948e-05, + "loss": 5.6449, + "step": 15480 + }, + { + "epoch": 0.09206989247311828, + "grad_norm": 1.7750074863433838, + "learning_rate": 4.896161280578097e-05, + "loss": 5.1178, + "step": 15481 + }, + { + "epoch": 0.09207583975639928, + "grad_norm": 2.0160205364227295, + "learning_rate": 4.89614795795777e-05, + "loss": 5.4698, + "step": 15482 + }, + { + "epoch": 0.09208178703968027, + "grad_norm": 2.0041770935058594, + "learning_rate": 4.896134634500972e-05, + "loss": 4.6989, + "step": 15483 + }, + { + "epoch": 0.09208773432296127, + "grad_norm": 1.9916999340057373, + "learning_rate": 4.896121310207708e-05, + "loss": 4.6296, + "step": 15484 + }, + { + "epoch": 0.09209368160624228, + "grad_norm": 1.62458336353302, + "learning_rate": 4.8961079850779845e-05, + "loss": 5.1147, + "step": 15485 + }, + { + "epoch": 0.09209962888952326, + "grad_norm": 1.8349764347076416, + "learning_rate": 4.8960946591118036e-05, + "loss": 5.3646, + "step": 15486 + }, + { + "epoch": 0.09210557617280427, + "grad_norm": 2.0250589847564697, + "learning_rate": 4.89608133230917e-05, + "loss": 5.7467, + "step": 15487 + }, + { + "epoch": 0.09211152345608525, + "grad_norm": 1.8945664167404175, + "learning_rate": 4.89606800467009e-05, + "loss": 5.5526, + "step": 15488 + }, + { + "epoch": 0.09211747073936626, + "grad_norm": 2.1056711673736572, + "learning_rate": 4.896054676194568e-05, + "loss": 4.8553, + "step": 15489 + }, + { + "epoch": 0.09212341802264726, + "grad_norm": 2.0394606590270996, + "learning_rate": 4.896041346882607e-05, + "loss": 5.4427, + "step": 15490 + }, + { + "epoch": 0.09212936530592825, + "grad_norm": 2.3078689575195312, + "learning_rate": 4.896028016734213e-05, + "loss": 5.3668, + "step": 15491 + }, + { + "epoch": 0.09213531258920925, + "grad_norm": 2.1227409839630127, + "learning_rate": 4.8960146857493904e-05, + "loss": 5.6314, + "step": 15492 + }, + { + "epoch": 0.09214125987249025, + "grad_norm": 2.156165838241577, + "learning_rate": 4.896001353928144e-05, + "loss": 5.5088, + "step": 15493 + }, + { + "epoch": 0.09214720715577124, + "grad_norm": 1.8915730714797974, + "learning_rate": 4.895988021270478e-05, + "loss": 5.5636, + "step": 15494 + }, + { + "epoch": 0.09215315443905224, + "grad_norm": 1.8041549921035767, + "learning_rate": 4.895974687776398e-05, + "loss": 5.5213, + "step": 15495 + }, + { + "epoch": 0.09215910172233324, + "grad_norm": 1.8982187509536743, + "learning_rate": 4.8959613534459074e-05, + "loss": 5.7038, + "step": 15496 + }, + { + "epoch": 0.09216504900561423, + "grad_norm": 1.9235600233078003, + "learning_rate": 4.895948018279012e-05, + "loss": 5.514, + "step": 15497 + }, + { + "epoch": 0.09217099628889523, + "grad_norm": 2.284212112426758, + "learning_rate": 4.895934682275715e-05, + "loss": 5.4624, + "step": 15498 + }, + { + "epoch": 0.09217694357217623, + "grad_norm": 2.770934820175171, + "learning_rate": 4.895921345436022e-05, + "loss": 4.7516, + "step": 15499 + }, + { + "epoch": 0.09218289085545722, + "grad_norm": 2.054158926010132, + "learning_rate": 4.895908007759939e-05, + "loss": 5.6444, + "step": 15500 + }, + { + "epoch": 0.09218883813873822, + "grad_norm": 2.352905511856079, + "learning_rate": 4.895894669247468e-05, + "loss": 4.7985, + "step": 15501 + }, + { + "epoch": 0.09219478542201923, + "grad_norm": 2.612039804458618, + "learning_rate": 4.895881329898615e-05, + "loss": 4.769, + "step": 15502 + }, + { + "epoch": 0.09220073270530021, + "grad_norm": 2.1274194717407227, + "learning_rate": 4.8958679897133854e-05, + "loss": 4.6185, + "step": 15503 + }, + { + "epoch": 0.09220667998858122, + "grad_norm": 2.2458853721618652, + "learning_rate": 4.895854648691782e-05, + "loss": 4.8576, + "step": 15504 + }, + { + "epoch": 0.09221262727186222, + "grad_norm": 2.415526866912842, + "learning_rate": 4.895841306833811e-05, + "loss": 4.999, + "step": 15505 + }, + { + "epoch": 0.0922185745551432, + "grad_norm": 1.8172876834869385, + "learning_rate": 4.8958279641394765e-05, + "loss": 5.1992, + "step": 15506 + }, + { + "epoch": 0.09222452183842421, + "grad_norm": 2.0568878650665283, + "learning_rate": 4.8958146206087826e-05, + "loss": 5.1348, + "step": 15507 + }, + { + "epoch": 0.09223046912170521, + "grad_norm": 2.152869701385498, + "learning_rate": 4.895801276241736e-05, + "loss": 4.9832, + "step": 15508 + }, + { + "epoch": 0.0922364164049862, + "grad_norm": 1.8191282749176025, + "learning_rate": 4.895787931038339e-05, + "loss": 5.3098, + "step": 15509 + }, + { + "epoch": 0.0922423636882672, + "grad_norm": 1.9511895179748535, + "learning_rate": 4.895774584998597e-05, + "loss": 5.5763, + "step": 15510 + }, + { + "epoch": 0.0922483109715482, + "grad_norm": 1.8735122680664062, + "learning_rate": 4.895761238122515e-05, + "loss": 5.3644, + "step": 15511 + }, + { + "epoch": 0.09225425825482919, + "grad_norm": 1.672721028327942, + "learning_rate": 4.895747890410098e-05, + "loss": 5.2794, + "step": 15512 + }, + { + "epoch": 0.0922602055381102, + "grad_norm": 1.5318527221679688, + "learning_rate": 4.89573454186135e-05, + "loss": 5.3575, + "step": 15513 + }, + { + "epoch": 0.0922661528213912, + "grad_norm": 1.8192704916000366, + "learning_rate": 4.895721192476275e-05, + "loss": 5.498, + "step": 15514 + }, + { + "epoch": 0.09227210010467218, + "grad_norm": 1.948249340057373, + "learning_rate": 4.895707842254879e-05, + "loss": 5.6955, + "step": 15515 + }, + { + "epoch": 0.09227804738795319, + "grad_norm": 2.1378414630889893, + "learning_rate": 4.895694491197166e-05, + "loss": 5.4999, + "step": 15516 + }, + { + "epoch": 0.09228399467123417, + "grad_norm": 2.057358980178833, + "learning_rate": 4.8956811393031414e-05, + "loss": 4.7234, + "step": 15517 + }, + { + "epoch": 0.09228994195451518, + "grad_norm": 1.9550749063491821, + "learning_rate": 4.895667786572809e-05, + "loss": 5.7611, + "step": 15518 + }, + { + "epoch": 0.09229588923779618, + "grad_norm": 2.120396852493286, + "learning_rate": 4.8956544330061734e-05, + "loss": 5.8707, + "step": 15519 + }, + { + "epoch": 0.09230183652107717, + "grad_norm": 1.8432284593582153, + "learning_rate": 4.8956410786032404e-05, + "loss": 5.7512, + "step": 15520 + }, + { + "epoch": 0.09230778380435817, + "grad_norm": 1.738993525505066, + "learning_rate": 4.895627723364013e-05, + "loss": 5.2099, + "step": 15521 + }, + { + "epoch": 0.09231373108763917, + "grad_norm": 1.4885916709899902, + "learning_rate": 4.895614367288497e-05, + "loss": 5.6817, + "step": 15522 + }, + { + "epoch": 0.09231967837092016, + "grad_norm": 1.9712351560592651, + "learning_rate": 4.895601010376697e-05, + "loss": 5.4247, + "step": 15523 + }, + { + "epoch": 0.09232562565420116, + "grad_norm": 1.6669690608978271, + "learning_rate": 4.895587652628617e-05, + "loss": 5.2189, + "step": 15524 + }, + { + "epoch": 0.09233157293748216, + "grad_norm": 2.1034297943115234, + "learning_rate": 4.895574294044262e-05, + "loss": 5.4772, + "step": 15525 + }, + { + "epoch": 0.09233752022076315, + "grad_norm": 2.3692588806152344, + "learning_rate": 4.895560934623637e-05, + "loss": 5.002, + "step": 15526 + }, + { + "epoch": 0.09234346750404415, + "grad_norm": 2.708406686782837, + "learning_rate": 4.8955475743667464e-05, + "loss": 4.9923, + "step": 15527 + }, + { + "epoch": 0.09234941478732515, + "grad_norm": 2.4986281394958496, + "learning_rate": 4.895534213273595e-05, + "loss": 4.7859, + "step": 15528 + }, + { + "epoch": 0.09235536207060614, + "grad_norm": 2.4715240001678467, + "learning_rate": 4.895520851344187e-05, + "loss": 5.2135, + "step": 15529 + }, + { + "epoch": 0.09236130935388714, + "grad_norm": 1.77085280418396, + "learning_rate": 4.895507488578528e-05, + "loss": 5.4675, + "step": 15530 + }, + { + "epoch": 0.09236725663716815, + "grad_norm": 1.4845975637435913, + "learning_rate": 4.8954941249766225e-05, + "loss": 5.8627, + "step": 15531 + }, + { + "epoch": 0.09237320392044913, + "grad_norm": 2.0753140449523926, + "learning_rate": 4.8954807605384734e-05, + "loss": 5.8246, + "step": 15532 + }, + { + "epoch": 0.09237915120373014, + "grad_norm": 1.5671929121017456, + "learning_rate": 4.895467395264088e-05, + "loss": 5.8189, + "step": 15533 + }, + { + "epoch": 0.09238509848701114, + "grad_norm": 1.749223232269287, + "learning_rate": 4.895454029153469e-05, + "loss": 5.9183, + "step": 15534 + }, + { + "epoch": 0.09239104577029213, + "grad_norm": 1.7186611890792847, + "learning_rate": 4.895440662206622e-05, + "loss": 5.84, + "step": 15535 + }, + { + "epoch": 0.09239699305357313, + "grad_norm": 1.654483437538147, + "learning_rate": 4.895427294423551e-05, + "loss": 5.4055, + "step": 15536 + }, + { + "epoch": 0.09240294033685413, + "grad_norm": 1.7109687328338623, + "learning_rate": 4.895413925804261e-05, + "loss": 5.3028, + "step": 15537 + }, + { + "epoch": 0.09240888762013512, + "grad_norm": 1.9221105575561523, + "learning_rate": 4.895400556348757e-05, + "loss": 5.2911, + "step": 15538 + }, + { + "epoch": 0.09241483490341612, + "grad_norm": 1.9464010000228882, + "learning_rate": 4.895387186057044e-05, + "loss": 5.5883, + "step": 15539 + }, + { + "epoch": 0.09242078218669712, + "grad_norm": 1.9429137706756592, + "learning_rate": 4.8953738149291254e-05, + "loss": 5.7164, + "step": 15540 + }, + { + "epoch": 0.09242672946997811, + "grad_norm": 1.7792669534683228, + "learning_rate": 4.8953604429650065e-05, + "loss": 5.7924, + "step": 15541 + }, + { + "epoch": 0.09243267675325911, + "grad_norm": 2.2124290466308594, + "learning_rate": 4.895347070164692e-05, + "loss": 5.4432, + "step": 15542 + }, + { + "epoch": 0.09243862403654012, + "grad_norm": 1.6349585056304932, + "learning_rate": 4.8953336965281873e-05, + "loss": 5.6975, + "step": 15543 + }, + { + "epoch": 0.0924445713198211, + "grad_norm": 2.01434063911438, + "learning_rate": 4.895320322055496e-05, + "loss": 5.3564, + "step": 15544 + }, + { + "epoch": 0.0924505186031021, + "grad_norm": 1.8110109567642212, + "learning_rate": 4.895306946746623e-05, + "loss": 5.3061, + "step": 15545 + }, + { + "epoch": 0.0924564658863831, + "grad_norm": 1.6687593460083008, + "learning_rate": 4.895293570601573e-05, + "loss": 5.4061, + "step": 15546 + }, + { + "epoch": 0.0924624131696641, + "grad_norm": 1.7488101720809937, + "learning_rate": 4.895280193620351e-05, + "loss": 5.4726, + "step": 15547 + }, + { + "epoch": 0.0924683604529451, + "grad_norm": 1.9059126377105713, + "learning_rate": 4.895266815802961e-05, + "loss": 5.9665, + "step": 15548 + }, + { + "epoch": 0.09247430773622609, + "grad_norm": 1.9732307195663452, + "learning_rate": 4.8952534371494084e-05, + "loss": 6.007, + "step": 15549 + }, + { + "epoch": 0.09248025501950709, + "grad_norm": 1.792325496673584, + "learning_rate": 4.895240057659697e-05, + "loss": 5.9466, + "step": 15550 + }, + { + "epoch": 0.09248620230278809, + "grad_norm": 1.7282743453979492, + "learning_rate": 4.895226677333833e-05, + "loss": 5.456, + "step": 15551 + }, + { + "epoch": 0.09249214958606908, + "grad_norm": 1.5014616250991821, + "learning_rate": 4.89521329617182e-05, + "loss": 5.0257, + "step": 15552 + }, + { + "epoch": 0.09249809686935008, + "grad_norm": 1.5420494079589844, + "learning_rate": 4.8951999141736624e-05, + "loss": 5.0657, + "step": 15553 + }, + { + "epoch": 0.09250404415263108, + "grad_norm": 1.4273606538772583, + "learning_rate": 4.895186531339365e-05, + "loss": 5.3431, + "step": 15554 + }, + { + "epoch": 0.09250999143591207, + "grad_norm": 1.9525657892227173, + "learning_rate": 4.895173147668933e-05, + "loss": 5.514, + "step": 15555 + }, + { + "epoch": 0.09251593871919307, + "grad_norm": 2.7004175186157227, + "learning_rate": 4.895159763162371e-05, + "loss": 5.3548, + "step": 15556 + }, + { + "epoch": 0.09252188600247407, + "grad_norm": 2.5703442096710205, + "learning_rate": 4.8951463778196835e-05, + "loss": 5.4275, + "step": 15557 + }, + { + "epoch": 0.09252783328575506, + "grad_norm": 2.4033594131469727, + "learning_rate": 4.895132991640875e-05, + "loss": 5.285, + "step": 15558 + }, + { + "epoch": 0.09253378056903606, + "grad_norm": 2.0295355319976807, + "learning_rate": 4.89511960462595e-05, + "loss": 5.1196, + "step": 15559 + }, + { + "epoch": 0.09253972785231707, + "grad_norm": 2.0739188194274902, + "learning_rate": 4.895106216774914e-05, + "loss": 4.7362, + "step": 15560 + }, + { + "epoch": 0.09254567513559805, + "grad_norm": 2.2429590225219727, + "learning_rate": 4.895092828087771e-05, + "loss": 5.0749, + "step": 15561 + }, + { + "epoch": 0.09255162241887906, + "grad_norm": 1.9738318920135498, + "learning_rate": 4.895079438564526e-05, + "loss": 5.6755, + "step": 15562 + }, + { + "epoch": 0.09255756970216006, + "grad_norm": 2.692275047302246, + "learning_rate": 4.895066048205183e-05, + "loss": 5.3146, + "step": 15563 + }, + { + "epoch": 0.09256351698544105, + "grad_norm": 2.774864912033081, + "learning_rate": 4.895052657009748e-05, + "loss": 5.1116, + "step": 15564 + }, + { + "epoch": 0.09256946426872205, + "grad_norm": 2.5513851642608643, + "learning_rate": 4.895039264978224e-05, + "loss": 5.0464, + "step": 15565 + }, + { + "epoch": 0.09257541155200305, + "grad_norm": 2.2035319805145264, + "learning_rate": 4.895025872110617e-05, + "loss": 5.1499, + "step": 15566 + }, + { + "epoch": 0.09258135883528404, + "grad_norm": 1.669402837753296, + "learning_rate": 4.8950124784069305e-05, + "loss": 5.5006, + "step": 15567 + }, + { + "epoch": 0.09258730611856504, + "grad_norm": 1.9433900117874146, + "learning_rate": 4.894999083867171e-05, + "loss": 5.1423, + "step": 15568 + }, + { + "epoch": 0.09259325340184604, + "grad_norm": 2.2401936054229736, + "learning_rate": 4.8949856884913416e-05, + "loss": 4.8937, + "step": 15569 + }, + { + "epoch": 0.09259920068512703, + "grad_norm": 2.094503164291382, + "learning_rate": 4.894972292279447e-05, + "loss": 4.8554, + "step": 15570 + }, + { + "epoch": 0.09260514796840803, + "grad_norm": 2.1677212715148926, + "learning_rate": 4.894958895231493e-05, + "loss": 4.7446, + "step": 15571 + }, + { + "epoch": 0.09261109525168904, + "grad_norm": 2.0262231826782227, + "learning_rate": 4.894945497347483e-05, + "loss": 4.8282, + "step": 15572 + }, + { + "epoch": 0.09261704253497002, + "grad_norm": 1.9491705894470215, + "learning_rate": 4.894932098627423e-05, + "loss": 4.9579, + "step": 15573 + }, + { + "epoch": 0.09262298981825103, + "grad_norm": 2.0898170471191406, + "learning_rate": 4.8949186990713165e-05, + "loss": 4.8197, + "step": 15574 + }, + { + "epoch": 0.09262893710153201, + "grad_norm": 1.8452088832855225, + "learning_rate": 4.894905298679169e-05, + "loss": 4.8359, + "step": 15575 + }, + { + "epoch": 0.09263488438481302, + "grad_norm": 2.1573541164398193, + "learning_rate": 4.894891897450984e-05, + "loss": 4.5882, + "step": 15576 + }, + { + "epoch": 0.09264083166809402, + "grad_norm": 2.1609156131744385, + "learning_rate": 4.894878495386768e-05, + "loss": 4.7556, + "step": 15577 + }, + { + "epoch": 0.092646778951375, + "grad_norm": 1.9062503576278687, + "learning_rate": 4.894865092486524e-05, + "loss": 4.6933, + "step": 15578 + }, + { + "epoch": 0.09265272623465601, + "grad_norm": 1.8876394033432007, + "learning_rate": 4.894851688750257e-05, + "loss": 4.7317, + "step": 15579 + }, + { + "epoch": 0.09265867351793701, + "grad_norm": 1.9106816053390503, + "learning_rate": 4.894838284177972e-05, + "loss": 4.7597, + "step": 15580 + }, + { + "epoch": 0.092664620801218, + "grad_norm": 1.8116264343261719, + "learning_rate": 4.894824878769674e-05, + "loss": 4.8865, + "step": 15581 + }, + { + "epoch": 0.092670568084499, + "grad_norm": 1.8492180109024048, + "learning_rate": 4.894811472525368e-05, + "loss": 4.7282, + "step": 15582 + }, + { + "epoch": 0.09267651536778, + "grad_norm": 1.9450536966323853, + "learning_rate": 4.894798065445058e-05, + "loss": 5.0777, + "step": 15583 + }, + { + "epoch": 0.09268246265106099, + "grad_norm": 2.2099180221557617, + "learning_rate": 4.894784657528748e-05, + "loss": 5.421, + "step": 15584 + }, + { + "epoch": 0.09268840993434199, + "grad_norm": 2.2239253520965576, + "learning_rate": 4.8947712487764436e-05, + "loss": 5.8346, + "step": 15585 + }, + { + "epoch": 0.092694357217623, + "grad_norm": 1.7867511510849, + "learning_rate": 4.894757839188149e-05, + "loss": 5.9306, + "step": 15586 + }, + { + "epoch": 0.09270030450090398, + "grad_norm": 1.6986007690429688, + "learning_rate": 4.89474442876387e-05, + "loss": 5.0704, + "step": 15587 + }, + { + "epoch": 0.09270625178418498, + "grad_norm": 1.7906185388565063, + "learning_rate": 4.89473101750361e-05, + "loss": 5.1951, + "step": 15588 + }, + { + "epoch": 0.09271219906746599, + "grad_norm": 1.7287026643753052, + "learning_rate": 4.894717605407374e-05, + "loss": 5.1736, + "step": 15589 + }, + { + "epoch": 0.09271814635074697, + "grad_norm": 1.6170624494552612, + "learning_rate": 4.8947041924751665e-05, + "loss": 5.5399, + "step": 15590 + }, + { + "epoch": 0.09272409363402798, + "grad_norm": 1.7556488513946533, + "learning_rate": 4.894690778706994e-05, + "loss": 5.574, + "step": 15591 + }, + { + "epoch": 0.09273004091730898, + "grad_norm": 2.346484899520874, + "learning_rate": 4.894677364102859e-05, + "loss": 5.0062, + "step": 15592 + }, + { + "epoch": 0.09273598820058997, + "grad_norm": 2.1376540660858154, + "learning_rate": 4.894663948662766e-05, + "loss": 5.1377, + "step": 15593 + }, + { + "epoch": 0.09274193548387097, + "grad_norm": 2.2489631175994873, + "learning_rate": 4.894650532386721e-05, + "loss": 5.1058, + "step": 15594 + }, + { + "epoch": 0.09274788276715197, + "grad_norm": 1.984281063079834, + "learning_rate": 4.8946371152747285e-05, + "loss": 5.1223, + "step": 15595 + }, + { + "epoch": 0.09275383005043296, + "grad_norm": 1.9387162923812866, + "learning_rate": 4.8946236973267935e-05, + "loss": 5.5121, + "step": 15596 + }, + { + "epoch": 0.09275977733371396, + "grad_norm": 1.8052873611450195, + "learning_rate": 4.894610278542919e-05, + "loss": 5.2101, + "step": 15597 + }, + { + "epoch": 0.09276572461699496, + "grad_norm": 2.558525562286377, + "learning_rate": 4.894596858923111e-05, + "loss": 4.6659, + "step": 15598 + }, + { + "epoch": 0.09277167190027595, + "grad_norm": 1.700897455215454, + "learning_rate": 4.8945834384673746e-05, + "loss": 5.4634, + "step": 15599 + }, + { + "epoch": 0.09277761918355695, + "grad_norm": 1.4691836833953857, + "learning_rate": 4.8945700171757134e-05, + "loss": 5.3873, + "step": 15600 + }, + { + "epoch": 0.09278356646683796, + "grad_norm": 1.4673740863800049, + "learning_rate": 4.894556595048132e-05, + "loss": 5.3917, + "step": 15601 + }, + { + "epoch": 0.09278951375011894, + "grad_norm": 1.6252011060714722, + "learning_rate": 4.894543172084637e-05, + "loss": 5.2003, + "step": 15602 + }, + { + "epoch": 0.09279546103339995, + "grad_norm": 1.6320288181304932, + "learning_rate": 4.89452974828523e-05, + "loss": 5.4821, + "step": 15603 + }, + { + "epoch": 0.09280140831668093, + "grad_norm": 2.1444239616394043, + "learning_rate": 4.8945163236499194e-05, + "loss": 5.9926, + "step": 15604 + }, + { + "epoch": 0.09280735559996194, + "grad_norm": 2.3000271320343018, + "learning_rate": 4.894502898178707e-05, + "loss": 4.7545, + "step": 15605 + }, + { + "epoch": 0.09281330288324294, + "grad_norm": 2.259962797164917, + "learning_rate": 4.894489471871597e-05, + "loss": 5.1292, + "step": 15606 + }, + { + "epoch": 0.09281925016652393, + "grad_norm": 2.5522921085357666, + "learning_rate": 4.8944760447285977e-05, + "loss": 5.1226, + "step": 15607 + }, + { + "epoch": 0.09282519744980493, + "grad_norm": 1.7621963024139404, + "learning_rate": 4.8944626167497096e-05, + "loss": 5.5405, + "step": 15608 + }, + { + "epoch": 0.09283114473308593, + "grad_norm": 1.6631364822387695, + "learning_rate": 4.894449187934941e-05, + "loss": 5.4332, + "step": 15609 + }, + { + "epoch": 0.09283709201636692, + "grad_norm": 1.695904016494751, + "learning_rate": 4.894435758284294e-05, + "loss": 5.4989, + "step": 15610 + }, + { + "epoch": 0.09284303929964792, + "grad_norm": 2.0772507190704346, + "learning_rate": 4.894422327797774e-05, + "loss": 5.0412, + "step": 15611 + }, + { + "epoch": 0.09284898658292892, + "grad_norm": 1.959685206413269, + "learning_rate": 4.894408896475386e-05, + "loss": 5.2749, + "step": 15612 + }, + { + "epoch": 0.09285493386620991, + "grad_norm": 2.0305607318878174, + "learning_rate": 4.894395464317135e-05, + "loss": 5.6227, + "step": 15613 + }, + { + "epoch": 0.09286088114949091, + "grad_norm": 1.7631112337112427, + "learning_rate": 4.894382031323026e-05, + "loss": 5.4396, + "step": 15614 + }, + { + "epoch": 0.09286682843277191, + "grad_norm": 1.8171305656433105, + "learning_rate": 4.894368597493062e-05, + "loss": 5.2498, + "step": 15615 + }, + { + "epoch": 0.0928727757160529, + "grad_norm": 2.123805522918701, + "learning_rate": 4.894355162827249e-05, + "loss": 5.8113, + "step": 15616 + }, + { + "epoch": 0.0928787229993339, + "grad_norm": 1.840071201324463, + "learning_rate": 4.894341727325591e-05, + "loss": 5.6394, + "step": 15617 + }, + { + "epoch": 0.0928846702826149, + "grad_norm": 1.7636733055114746, + "learning_rate": 4.8943282909880935e-05, + "loss": 5.5515, + "step": 15618 + }, + { + "epoch": 0.0928906175658959, + "grad_norm": 1.956026315689087, + "learning_rate": 4.89431485381476e-05, + "loss": 5.1716, + "step": 15619 + }, + { + "epoch": 0.0928965648491769, + "grad_norm": 2.2381720542907715, + "learning_rate": 4.894301415805597e-05, + "loss": 4.9692, + "step": 15620 + }, + { + "epoch": 0.0929025121324579, + "grad_norm": 2.178999423980713, + "learning_rate": 4.894287976960607e-05, + "loss": 4.9732, + "step": 15621 + }, + { + "epoch": 0.09290845941573889, + "grad_norm": 2.1932144165039062, + "learning_rate": 4.894274537279796e-05, + "loss": 4.9497, + "step": 15622 + }, + { + "epoch": 0.09291440669901989, + "grad_norm": 2.093252182006836, + "learning_rate": 4.894261096763169e-05, + "loss": 4.7642, + "step": 15623 + }, + { + "epoch": 0.09292035398230089, + "grad_norm": 1.785686731338501, + "learning_rate": 4.89424765541073e-05, + "loss": 5.1449, + "step": 15624 + }, + { + "epoch": 0.09292630126558188, + "grad_norm": 2.250986099243164, + "learning_rate": 4.894234213222484e-05, + "loss": 4.8503, + "step": 15625 + }, + { + "epoch": 0.09293224854886288, + "grad_norm": 1.8585362434387207, + "learning_rate": 4.8942207701984355e-05, + "loss": 4.582, + "step": 15626 + }, + { + "epoch": 0.09293819583214388, + "grad_norm": 2.080742597579956, + "learning_rate": 4.894207326338589e-05, + "loss": 4.4912, + "step": 15627 + }, + { + "epoch": 0.09294414311542487, + "grad_norm": 2.422774076461792, + "learning_rate": 4.8941938816429495e-05, + "loss": 4.4227, + "step": 15628 + }, + { + "epoch": 0.09295009039870587, + "grad_norm": 2.3304965496063232, + "learning_rate": 4.8941804361115215e-05, + "loss": 4.2265, + "step": 15629 + }, + { + "epoch": 0.09295603768198687, + "grad_norm": 2.619837522506714, + "learning_rate": 4.8941669897443105e-05, + "loss": 4.6812, + "step": 15630 + }, + { + "epoch": 0.09296198496526786, + "grad_norm": 2.4924118518829346, + "learning_rate": 4.89415354254132e-05, + "loss": 4.5081, + "step": 15631 + }, + { + "epoch": 0.09296793224854887, + "grad_norm": 2.5034751892089844, + "learning_rate": 4.894140094502556e-05, + "loss": 4.3356, + "step": 15632 + }, + { + "epoch": 0.09297387953182985, + "grad_norm": 2.599963665008545, + "learning_rate": 4.894126645628021e-05, + "loss": 4.6952, + "step": 15633 + }, + { + "epoch": 0.09297982681511086, + "grad_norm": 2.189516544342041, + "learning_rate": 4.894113195917722e-05, + "loss": 5.75, + "step": 15634 + }, + { + "epoch": 0.09298577409839186, + "grad_norm": 2.5768351554870605, + "learning_rate": 4.894099745371663e-05, + "loss": 5.9257, + "step": 15635 + }, + { + "epoch": 0.09299172138167285, + "grad_norm": 2.2909457683563232, + "learning_rate": 4.894086293989848e-05, + "loss": 5.484, + "step": 15636 + }, + { + "epoch": 0.09299766866495385, + "grad_norm": 2.0447487831115723, + "learning_rate": 4.894072841772282e-05, + "loss": 5.2952, + "step": 15637 + }, + { + "epoch": 0.09300361594823485, + "grad_norm": 1.8934963941574097, + "learning_rate": 4.894059388718971e-05, + "loss": 5.3498, + "step": 15638 + }, + { + "epoch": 0.09300956323151584, + "grad_norm": 1.9989632368087769, + "learning_rate": 4.894045934829919e-05, + "loss": 5.55, + "step": 15639 + }, + { + "epoch": 0.09301551051479684, + "grad_norm": 1.4955580234527588, + "learning_rate": 4.8940324801051285e-05, + "loss": 5.1978, + "step": 15640 + }, + { + "epoch": 0.09302145779807784, + "grad_norm": 1.8308879137039185, + "learning_rate": 4.8940190245446074e-05, + "loss": 5.5448, + "step": 15641 + }, + { + "epoch": 0.09302740508135883, + "grad_norm": 1.4997726678848267, + "learning_rate": 4.8940055681483576e-05, + "loss": 5.353, + "step": 15642 + }, + { + "epoch": 0.09303335236463983, + "grad_norm": 1.5643866062164307, + "learning_rate": 4.8939921109163864e-05, + "loss": 5.1456, + "step": 15643 + }, + { + "epoch": 0.09303929964792083, + "grad_norm": 1.8125799894332886, + "learning_rate": 4.8939786528486967e-05, + "loss": 5.3456, + "step": 15644 + }, + { + "epoch": 0.09304524693120182, + "grad_norm": 1.6802864074707031, + "learning_rate": 4.893965193945294e-05, + "loss": 5.279, + "step": 15645 + }, + { + "epoch": 0.09305119421448282, + "grad_norm": 1.4397536516189575, + "learning_rate": 4.893951734206182e-05, + "loss": 5.9849, + "step": 15646 + }, + { + "epoch": 0.09305714149776383, + "grad_norm": 1.618416428565979, + "learning_rate": 4.893938273631368e-05, + "loss": 5.231, + "step": 15647 + }, + { + "epoch": 0.09306308878104481, + "grad_norm": 1.4833893775939941, + "learning_rate": 4.8939248122208537e-05, + "loss": 5.2883, + "step": 15648 + }, + { + "epoch": 0.09306903606432582, + "grad_norm": 1.2709630727767944, + "learning_rate": 4.8939113499746446e-05, + "loss": 5.1042, + "step": 15649 + }, + { + "epoch": 0.09307498334760682, + "grad_norm": 1.2770884037017822, + "learning_rate": 4.893897886892747e-05, + "loss": 5.0682, + "step": 15650 + }, + { + "epoch": 0.0930809306308878, + "grad_norm": 1.4511629343032837, + "learning_rate": 4.893884422975163e-05, + "loss": 5.0904, + "step": 15651 + }, + { + "epoch": 0.09308687791416881, + "grad_norm": 1.7428641319274902, + "learning_rate": 4.8938709582219e-05, + "loss": 5.2569, + "step": 15652 + }, + { + "epoch": 0.09309282519744981, + "grad_norm": 1.5430729389190674, + "learning_rate": 4.89385749263296e-05, + "loss": 5.1698, + "step": 15653 + }, + { + "epoch": 0.0930987724807308, + "grad_norm": 1.6689143180847168, + "learning_rate": 4.8938440262083495e-05, + "loss": 5.1866, + "step": 15654 + }, + { + "epoch": 0.0931047197640118, + "grad_norm": 1.505698323249817, + "learning_rate": 4.8938305589480734e-05, + "loss": 5.1574, + "step": 15655 + }, + { + "epoch": 0.0931106670472928, + "grad_norm": 1.496547818183899, + "learning_rate": 4.8938170908521356e-05, + "loss": 5.1175, + "step": 15656 + }, + { + "epoch": 0.09311661433057379, + "grad_norm": 1.5257115364074707, + "learning_rate": 4.893803621920541e-05, + "loss": 5.1796, + "step": 15657 + }, + { + "epoch": 0.09312256161385479, + "grad_norm": 1.5880948305130005, + "learning_rate": 4.893790152153294e-05, + "loss": 5.1864, + "step": 15658 + }, + { + "epoch": 0.0931285088971358, + "grad_norm": 1.632869839668274, + "learning_rate": 4.8937766815503994e-05, + "loss": 5.1126, + "step": 15659 + }, + { + "epoch": 0.09313445618041678, + "grad_norm": 1.5902632474899292, + "learning_rate": 4.893763210111862e-05, + "loss": 5.0661, + "step": 15660 + }, + { + "epoch": 0.09314040346369779, + "grad_norm": 1.2780532836914062, + "learning_rate": 4.893749737837687e-05, + "loss": 5.2189, + "step": 15661 + }, + { + "epoch": 0.09314635074697877, + "grad_norm": 1.604551076889038, + "learning_rate": 4.8937362647278786e-05, + "loss": 5.4624, + "step": 15662 + }, + { + "epoch": 0.09315229803025978, + "grad_norm": 1.3654263019561768, + "learning_rate": 4.8937227907824424e-05, + "loss": 5.3875, + "step": 15663 + }, + { + "epoch": 0.09315824531354078, + "grad_norm": 1.3098255395889282, + "learning_rate": 4.893709316001381e-05, + "loss": 5.2158, + "step": 15664 + }, + { + "epoch": 0.09316419259682177, + "grad_norm": 1.4036632776260376, + "learning_rate": 4.893695840384701e-05, + "loss": 5.3808, + "step": 15665 + }, + { + "epoch": 0.09317013988010277, + "grad_norm": 1.772504210472107, + "learning_rate": 4.893682363932407e-05, + "loss": 5.4599, + "step": 15666 + }, + { + "epoch": 0.09317608716338377, + "grad_norm": 1.8509577512741089, + "learning_rate": 4.893668886644503e-05, + "loss": 5.223, + "step": 15667 + }, + { + "epoch": 0.09318203444666476, + "grad_norm": 1.7572264671325684, + "learning_rate": 4.893655408520993e-05, + "loss": 5.3276, + "step": 15668 + }, + { + "epoch": 0.09318798172994576, + "grad_norm": 1.7149637937545776, + "learning_rate": 4.8936419295618835e-05, + "loss": 5.3093, + "step": 15669 + }, + { + "epoch": 0.09319392901322676, + "grad_norm": 1.441741943359375, + "learning_rate": 4.893628449767178e-05, + "loss": 5.2237, + "step": 15670 + }, + { + "epoch": 0.09319987629650775, + "grad_norm": 1.4929050207138062, + "learning_rate": 4.893614969136882e-05, + "loss": 5.22, + "step": 15671 + }, + { + "epoch": 0.09320582357978875, + "grad_norm": 1.251057505607605, + "learning_rate": 4.893601487670999e-05, + "loss": 5.2417, + "step": 15672 + }, + { + "epoch": 0.09321177086306975, + "grad_norm": 1.313826560974121, + "learning_rate": 4.893588005369535e-05, + "loss": 5.1841, + "step": 15673 + }, + { + "epoch": 0.09321771814635074, + "grad_norm": 1.1993061304092407, + "learning_rate": 4.8935745222324935e-05, + "loss": 5.1649, + "step": 15674 + }, + { + "epoch": 0.09322366542963174, + "grad_norm": 1.4086672067642212, + "learning_rate": 4.8935610382598806e-05, + "loss": 5.1463, + "step": 15675 + }, + { + "epoch": 0.09322961271291275, + "grad_norm": 1.3089197874069214, + "learning_rate": 4.893547553451701e-05, + "loss": 5.1505, + "step": 15676 + }, + { + "epoch": 0.09323555999619373, + "grad_norm": 1.3332446813583374, + "learning_rate": 4.893534067807957e-05, + "loss": 5.1267, + "step": 15677 + }, + { + "epoch": 0.09324150727947474, + "grad_norm": 1.433020830154419, + "learning_rate": 4.893520581328656e-05, + "loss": 5.1689, + "step": 15678 + }, + { + "epoch": 0.09324745456275574, + "grad_norm": 1.4111361503601074, + "learning_rate": 4.893507094013801e-05, + "loss": 5.1288, + "step": 15679 + }, + { + "epoch": 0.09325340184603673, + "grad_norm": 1.551698923110962, + "learning_rate": 4.893493605863398e-05, + "loss": 5.0919, + "step": 15680 + }, + { + "epoch": 0.09325934912931773, + "grad_norm": 1.5479143857955933, + "learning_rate": 4.893480116877451e-05, + "loss": 4.9749, + "step": 15681 + }, + { + "epoch": 0.09326529641259873, + "grad_norm": 1.3716951608657837, + "learning_rate": 4.893466627055964e-05, + "loss": 5.2221, + "step": 15682 + }, + { + "epoch": 0.09327124369587972, + "grad_norm": 1.409462571144104, + "learning_rate": 4.893453136398943e-05, + "loss": 5.2131, + "step": 15683 + }, + { + "epoch": 0.09327719097916072, + "grad_norm": 1.3185720443725586, + "learning_rate": 4.8934396449063935e-05, + "loss": 5.094, + "step": 15684 + }, + { + "epoch": 0.09328313826244172, + "grad_norm": 1.5027118921279907, + "learning_rate": 4.8934261525783176e-05, + "loss": 5.0889, + "step": 15685 + }, + { + "epoch": 0.09328908554572271, + "grad_norm": 2.147268772125244, + "learning_rate": 4.8934126594147216e-05, + "loss": 4.9404, + "step": 15686 + }, + { + "epoch": 0.09329503282900371, + "grad_norm": 1.3361799716949463, + "learning_rate": 4.8933991654156096e-05, + "loss": 5.0744, + "step": 15687 + }, + { + "epoch": 0.09330098011228471, + "grad_norm": 1.6436421871185303, + "learning_rate": 4.893385670580988e-05, + "loss": 5.0633, + "step": 15688 + }, + { + "epoch": 0.0933069273955657, + "grad_norm": 1.5499234199523926, + "learning_rate": 4.8933721749108586e-05, + "loss": 4.8445, + "step": 15689 + }, + { + "epoch": 0.0933128746788467, + "grad_norm": 1.363355278968811, + "learning_rate": 4.893358678405229e-05, + "loss": 5.1135, + "step": 15690 + }, + { + "epoch": 0.0933188219621277, + "grad_norm": 1.4172797203063965, + "learning_rate": 4.893345181064102e-05, + "loss": 5.056, + "step": 15691 + }, + { + "epoch": 0.0933247692454087, + "grad_norm": 1.546329140663147, + "learning_rate": 4.893331682887483e-05, + "loss": 4.9756, + "step": 15692 + }, + { + "epoch": 0.0933307165286897, + "grad_norm": 1.5151170492172241, + "learning_rate": 4.893318183875376e-05, + "loss": 4.991, + "step": 15693 + }, + { + "epoch": 0.09333666381197069, + "grad_norm": 1.1936514377593994, + "learning_rate": 4.893304684027787e-05, + "loss": 5.0454, + "step": 15694 + }, + { + "epoch": 0.09334261109525169, + "grad_norm": 1.4055380821228027, + "learning_rate": 4.893291183344721e-05, + "loss": 5.0673, + "step": 15695 + }, + { + "epoch": 0.09334855837853269, + "grad_norm": 1.4087036848068237, + "learning_rate": 4.89327768182618e-05, + "loss": 4.9748, + "step": 15696 + }, + { + "epoch": 0.09335450566181368, + "grad_norm": 1.251237392425537, + "learning_rate": 4.893264179472171e-05, + "loss": 5.158, + "step": 15697 + }, + { + "epoch": 0.09336045294509468, + "grad_norm": 1.3806357383728027, + "learning_rate": 4.893250676282699e-05, + "loss": 5.2027, + "step": 15698 + }, + { + "epoch": 0.09336640022837568, + "grad_norm": 1.3959203958511353, + "learning_rate": 4.893237172257767e-05, + "loss": 5.1854, + "step": 15699 + }, + { + "epoch": 0.09337234751165667, + "grad_norm": 1.4886810779571533, + "learning_rate": 4.893223667397381e-05, + "loss": 5.2363, + "step": 15700 + }, + { + "epoch": 0.09337829479493767, + "grad_norm": 1.2987968921661377, + "learning_rate": 4.893210161701546e-05, + "loss": 5.2931, + "step": 15701 + }, + { + "epoch": 0.09338424207821867, + "grad_norm": 1.2594645023345947, + "learning_rate": 4.8931966551702644e-05, + "loss": 5.1346, + "step": 15702 + }, + { + "epoch": 0.09339018936149966, + "grad_norm": 1.5101357698440552, + "learning_rate": 4.893183147803544e-05, + "loss": 5.0369, + "step": 15703 + }, + { + "epoch": 0.09339613664478066, + "grad_norm": 1.4388933181762695, + "learning_rate": 4.8931696396013876e-05, + "loss": 5.0427, + "step": 15704 + }, + { + "epoch": 0.09340208392806167, + "grad_norm": 1.2890875339508057, + "learning_rate": 4.8931561305638006e-05, + "loss": 5.1602, + "step": 15705 + }, + { + "epoch": 0.09340803121134265, + "grad_norm": 1.3310670852661133, + "learning_rate": 4.893142620690787e-05, + "loss": 5.4886, + "step": 15706 + }, + { + "epoch": 0.09341397849462366, + "grad_norm": 1.0935169458389282, + "learning_rate": 4.893129109982353e-05, + "loss": 5.4634, + "step": 15707 + }, + { + "epoch": 0.09341992577790466, + "grad_norm": 1.4718440771102905, + "learning_rate": 4.893115598438501e-05, + "loss": 5.4917, + "step": 15708 + }, + { + "epoch": 0.09342587306118565, + "grad_norm": 1.4053934812545776, + "learning_rate": 4.8931020860592384e-05, + "loss": 5.1588, + "step": 15709 + }, + { + "epoch": 0.09343182034446665, + "grad_norm": 1.3130263090133667, + "learning_rate": 4.893088572844568e-05, + "loss": 5.0464, + "step": 15710 + }, + { + "epoch": 0.09343776762774765, + "grad_norm": 1.3342580795288086, + "learning_rate": 4.8930750587944955e-05, + "loss": 5.1464, + "step": 15711 + }, + { + "epoch": 0.09344371491102864, + "grad_norm": 1.3214285373687744, + "learning_rate": 4.893061543909024e-05, + "loss": 5.0867, + "step": 15712 + }, + { + "epoch": 0.09344966219430964, + "grad_norm": 1.2091466188430786, + "learning_rate": 4.893048028188161e-05, + "loss": 5.1403, + "step": 15713 + }, + { + "epoch": 0.09345560947759064, + "grad_norm": 1.421499490737915, + "learning_rate": 4.893034511631909e-05, + "loss": 5.1853, + "step": 15714 + }, + { + "epoch": 0.09346155676087163, + "grad_norm": 1.2093148231506348, + "learning_rate": 4.893020994240273e-05, + "loss": 5.0892, + "step": 15715 + }, + { + "epoch": 0.09346750404415263, + "grad_norm": 1.361080288887024, + "learning_rate": 4.893007476013258e-05, + "loss": 5.0855, + "step": 15716 + }, + { + "epoch": 0.09347345132743363, + "grad_norm": 1.31247079372406, + "learning_rate": 4.89299395695087e-05, + "loss": 5.1667, + "step": 15717 + }, + { + "epoch": 0.09347939861071462, + "grad_norm": 1.4052191972732544, + "learning_rate": 4.892980437053112e-05, + "loss": 4.9256, + "step": 15718 + }, + { + "epoch": 0.09348534589399562, + "grad_norm": 1.409225344657898, + "learning_rate": 4.8929669163199886e-05, + "loss": 4.7722, + "step": 15719 + }, + { + "epoch": 0.09349129317727661, + "grad_norm": 1.54015052318573, + "learning_rate": 4.892953394751505e-05, + "loss": 4.9331, + "step": 15720 + }, + { + "epoch": 0.09349724046055762, + "grad_norm": 1.313596487045288, + "learning_rate": 4.892939872347667e-05, + "loss": 5.0221, + "step": 15721 + }, + { + "epoch": 0.09350318774383862, + "grad_norm": 1.5266852378845215, + "learning_rate": 4.8929263491084785e-05, + "loss": 5.0261, + "step": 15722 + }, + { + "epoch": 0.0935091350271196, + "grad_norm": 1.409408450126648, + "learning_rate": 4.892912825033944e-05, + "loss": 5.1319, + "step": 15723 + }, + { + "epoch": 0.09351508231040061, + "grad_norm": 1.444326639175415, + "learning_rate": 4.892899300124067e-05, + "loss": 5.0043, + "step": 15724 + }, + { + "epoch": 0.09352102959368161, + "grad_norm": 1.6662111282348633, + "learning_rate": 4.8928857743788556e-05, + "loss": 5.22, + "step": 15725 + }, + { + "epoch": 0.0935269768769626, + "grad_norm": 1.5927739143371582, + "learning_rate": 4.8928722477983116e-05, + "loss": 5.1532, + "step": 15726 + }, + { + "epoch": 0.0935329241602436, + "grad_norm": 1.5560848712921143, + "learning_rate": 4.892858720382441e-05, + "loss": 4.8893, + "step": 15727 + }, + { + "epoch": 0.0935388714435246, + "grad_norm": 1.450135588645935, + "learning_rate": 4.892845192131247e-05, + "loss": 4.8116, + "step": 15728 + }, + { + "epoch": 0.09354481872680559, + "grad_norm": 1.3629002571105957, + "learning_rate": 4.892831663044736e-05, + "loss": 4.9439, + "step": 15729 + }, + { + "epoch": 0.09355076601008659, + "grad_norm": 1.5293892621994019, + "learning_rate": 4.892818133122913e-05, + "loss": 5.1726, + "step": 15730 + }, + { + "epoch": 0.0935567132933676, + "grad_norm": 1.193088412284851, + "learning_rate": 4.892804602365781e-05, + "loss": 5.3199, + "step": 15731 + }, + { + "epoch": 0.09356266057664858, + "grad_norm": 1.5575615167617798, + "learning_rate": 4.8927910707733456e-05, + "loss": 5.3426, + "step": 15732 + }, + { + "epoch": 0.09356860785992958, + "grad_norm": 1.4177138805389404, + "learning_rate": 4.892777538345612e-05, + "loss": 5.4028, + "step": 15733 + }, + { + "epoch": 0.09357455514321059, + "grad_norm": 1.4139392375946045, + "learning_rate": 4.892764005082584e-05, + "loss": 5.3854, + "step": 15734 + }, + { + "epoch": 0.09358050242649157, + "grad_norm": 1.5129605531692505, + "learning_rate": 4.892750470984267e-05, + "loss": 5.3614, + "step": 15735 + }, + { + "epoch": 0.09358644970977258, + "grad_norm": 1.23565673828125, + "learning_rate": 4.8927369360506665e-05, + "loss": 5.2379, + "step": 15736 + }, + { + "epoch": 0.09359239699305358, + "grad_norm": 1.4861465692520142, + "learning_rate": 4.892723400281785e-05, + "loss": 5.0968, + "step": 15737 + }, + { + "epoch": 0.09359834427633457, + "grad_norm": 1.4061464071273804, + "learning_rate": 4.892709863677629e-05, + "loss": 5.2947, + "step": 15738 + }, + { + "epoch": 0.09360429155961557, + "grad_norm": 1.2175462245941162, + "learning_rate": 4.892696326238203e-05, + "loss": 5.2828, + "step": 15739 + }, + { + "epoch": 0.09361023884289657, + "grad_norm": 1.398414969444275, + "learning_rate": 4.8926827879635104e-05, + "loss": 5.3281, + "step": 15740 + }, + { + "epoch": 0.09361618612617756, + "grad_norm": 1.438428282737732, + "learning_rate": 4.892669248853558e-05, + "loss": 5.2483, + "step": 15741 + }, + { + "epoch": 0.09362213340945856, + "grad_norm": 1.6579184532165527, + "learning_rate": 4.8926557089083494e-05, + "loss": 5.1275, + "step": 15742 + }, + { + "epoch": 0.09362808069273956, + "grad_norm": 1.2637989521026611, + "learning_rate": 4.892642168127889e-05, + "loss": 5.2276, + "step": 15743 + }, + { + "epoch": 0.09363402797602055, + "grad_norm": 1.383898377418518, + "learning_rate": 4.892628626512182e-05, + "loss": 5.3406, + "step": 15744 + }, + { + "epoch": 0.09363997525930155, + "grad_norm": 1.3794132471084595, + "learning_rate": 4.8926150840612325e-05, + "loss": 5.2309, + "step": 15745 + }, + { + "epoch": 0.09364592254258255, + "grad_norm": 1.3234885931015015, + "learning_rate": 4.8926015407750466e-05, + "loss": 5.3171, + "step": 15746 + }, + { + "epoch": 0.09365186982586354, + "grad_norm": 1.4807502031326294, + "learning_rate": 4.892587996653629e-05, + "loss": 5.3362, + "step": 15747 + }, + { + "epoch": 0.09365781710914454, + "grad_norm": 2.380307912826538, + "learning_rate": 4.892574451696982e-05, + "loss": 5.3103, + "step": 15748 + }, + { + "epoch": 0.09366376439242553, + "grad_norm": 1.5202600955963135, + "learning_rate": 4.892560905905113e-05, + "loss": 5.2225, + "step": 15749 + }, + { + "epoch": 0.09366971167570654, + "grad_norm": 1.34883451461792, + "learning_rate": 4.892547359278025e-05, + "loss": 5.1794, + "step": 15750 + }, + { + "epoch": 0.09367565895898754, + "grad_norm": 1.7073168754577637, + "learning_rate": 4.8925338118157235e-05, + "loss": 5.101, + "step": 15751 + }, + { + "epoch": 0.09368160624226853, + "grad_norm": 1.2718127965927124, + "learning_rate": 4.892520263518214e-05, + "loss": 5.3492, + "step": 15752 + }, + { + "epoch": 0.09368755352554953, + "grad_norm": 1.2247645854949951, + "learning_rate": 4.8925067143854993e-05, + "loss": 5.0841, + "step": 15753 + }, + { + "epoch": 0.09369350080883053, + "grad_norm": 1.4443535804748535, + "learning_rate": 4.892493164417586e-05, + "loss": 5.2866, + "step": 15754 + }, + { + "epoch": 0.09369944809211152, + "grad_norm": 1.2206883430480957, + "learning_rate": 4.8924796136144776e-05, + "loss": 5.116, + "step": 15755 + }, + { + "epoch": 0.09370539537539252, + "grad_norm": 1.4597479104995728, + "learning_rate": 4.89246606197618e-05, + "loss": 5.1501, + "step": 15756 + }, + { + "epoch": 0.09371134265867352, + "grad_norm": 1.4129786491394043, + "learning_rate": 4.892452509502697e-05, + "loss": 5.2618, + "step": 15757 + }, + { + "epoch": 0.09371728994195451, + "grad_norm": 1.382739543914795, + "learning_rate": 4.892438956194033e-05, + "loss": 5.2191, + "step": 15758 + }, + { + "epoch": 0.09372323722523551, + "grad_norm": 1.3665072917938232, + "learning_rate": 4.8924254020501934e-05, + "loss": 4.9739, + "step": 15759 + }, + { + "epoch": 0.09372918450851651, + "grad_norm": 1.3109017610549927, + "learning_rate": 4.892411847071183e-05, + "loss": 5.0648, + "step": 15760 + }, + { + "epoch": 0.0937351317917975, + "grad_norm": 1.5278202295303345, + "learning_rate": 4.892398291257007e-05, + "loss": 5.0215, + "step": 15761 + }, + { + "epoch": 0.0937410790750785, + "grad_norm": 1.4676958322525024, + "learning_rate": 4.8923847346076686e-05, + "loss": 5.442, + "step": 15762 + }, + { + "epoch": 0.0937470263583595, + "grad_norm": 1.4718897342681885, + "learning_rate": 4.892371177123174e-05, + "loss": 5.1484, + "step": 15763 + }, + { + "epoch": 0.0937529736416405, + "grad_norm": 1.2358952760696411, + "learning_rate": 4.8923576188035264e-05, + "loss": 5.3594, + "step": 15764 + }, + { + "epoch": 0.0937589209249215, + "grad_norm": 1.59844172000885, + "learning_rate": 4.8923440596487326e-05, + "loss": 5.221, + "step": 15765 + }, + { + "epoch": 0.0937648682082025, + "grad_norm": 1.4293478727340698, + "learning_rate": 4.892330499658795e-05, + "loss": 5.2211, + "step": 15766 + }, + { + "epoch": 0.09377081549148349, + "grad_norm": 1.167673110961914, + "learning_rate": 4.8923169388337204e-05, + "loss": 5.1274, + "step": 15767 + }, + { + "epoch": 0.09377676277476449, + "grad_norm": 1.4637590646743774, + "learning_rate": 4.892303377173512e-05, + "loss": 5.0781, + "step": 15768 + }, + { + "epoch": 0.09378271005804549, + "grad_norm": 1.383498191833496, + "learning_rate": 4.892289814678176e-05, + "loss": 5.003, + "step": 15769 + }, + { + "epoch": 0.09378865734132648, + "grad_norm": 1.5803290605545044, + "learning_rate": 4.892276251347716e-05, + "loss": 4.9609, + "step": 15770 + }, + { + "epoch": 0.09379460462460748, + "grad_norm": 1.5272483825683594, + "learning_rate": 4.892262687182137e-05, + "loss": 5.074, + "step": 15771 + }, + { + "epoch": 0.09380055190788848, + "grad_norm": 1.377105951309204, + "learning_rate": 4.8922491221814436e-05, + "loss": 5.011, + "step": 15772 + }, + { + "epoch": 0.09380649919116947, + "grad_norm": 1.2150218486785889, + "learning_rate": 4.8922355563456414e-05, + "loss": 5.172, + "step": 15773 + }, + { + "epoch": 0.09381244647445047, + "grad_norm": 1.379515290260315, + "learning_rate": 4.892221989674734e-05, + "loss": 5.229, + "step": 15774 + }, + { + "epoch": 0.09381839375773147, + "grad_norm": 1.5256911516189575, + "learning_rate": 4.892208422168727e-05, + "loss": 5.0163, + "step": 15775 + }, + { + "epoch": 0.09382434104101246, + "grad_norm": 1.645808458328247, + "learning_rate": 4.892194853827624e-05, + "loss": 5.1382, + "step": 15776 + }, + { + "epoch": 0.09383028832429346, + "grad_norm": 1.7437238693237305, + "learning_rate": 4.8921812846514315e-05, + "loss": 4.8078, + "step": 15777 + }, + { + "epoch": 0.09383623560757447, + "grad_norm": 1.384291410446167, + "learning_rate": 4.892167714640152e-05, + "loss": 5.1645, + "step": 15778 + }, + { + "epoch": 0.09384218289085546, + "grad_norm": 1.6412228345870972, + "learning_rate": 4.892154143793792e-05, + "loss": 5.0472, + "step": 15779 + }, + { + "epoch": 0.09384813017413646, + "grad_norm": 1.5364267826080322, + "learning_rate": 4.8921405721123555e-05, + "loss": 5.1357, + "step": 15780 + }, + { + "epoch": 0.09385407745741745, + "grad_norm": 1.4579834938049316, + "learning_rate": 4.892126999595849e-05, + "loss": 5.2047, + "step": 15781 + }, + { + "epoch": 0.09386002474069845, + "grad_norm": 1.4087393283843994, + "learning_rate": 4.8921134262442745e-05, + "loss": 5.3224, + "step": 15782 + }, + { + "epoch": 0.09386597202397945, + "grad_norm": 1.4741411209106445, + "learning_rate": 4.8920998520576376e-05, + "loss": 4.9882, + "step": 15783 + }, + { + "epoch": 0.09387191930726044, + "grad_norm": 1.488578200340271, + "learning_rate": 4.8920862770359434e-05, + "loss": 4.8698, + "step": 15784 + }, + { + "epoch": 0.09387786659054144, + "grad_norm": 1.4695780277252197, + "learning_rate": 4.892072701179197e-05, + "loss": 4.6841, + "step": 15785 + }, + { + "epoch": 0.09388381387382244, + "grad_norm": 1.2468496561050415, + "learning_rate": 4.892059124487402e-05, + "loss": 5.0962, + "step": 15786 + }, + { + "epoch": 0.09388976115710343, + "grad_norm": 1.1099787950515747, + "learning_rate": 4.8920455469605654e-05, + "loss": 5.0883, + "step": 15787 + }, + { + "epoch": 0.09389570844038443, + "grad_norm": 1.3954483270645142, + "learning_rate": 4.892031968598689e-05, + "loss": 4.9554, + "step": 15788 + }, + { + "epoch": 0.09390165572366543, + "grad_norm": 1.3176839351654053, + "learning_rate": 4.892018389401779e-05, + "loss": 5.1638, + "step": 15789 + }, + { + "epoch": 0.09390760300694642, + "grad_norm": 1.2406723499298096, + "learning_rate": 4.892004809369841e-05, + "loss": 5.0569, + "step": 15790 + }, + { + "epoch": 0.09391355029022742, + "grad_norm": 1.395556926727295, + "learning_rate": 4.891991228502878e-05, + "loss": 4.9179, + "step": 15791 + }, + { + "epoch": 0.09391949757350843, + "grad_norm": 1.3977546691894531, + "learning_rate": 4.891977646800896e-05, + "loss": 5.0045, + "step": 15792 + }, + { + "epoch": 0.09392544485678941, + "grad_norm": 1.5089846849441528, + "learning_rate": 4.891964064263899e-05, + "loss": 5.176, + "step": 15793 + }, + { + "epoch": 0.09393139214007042, + "grad_norm": 1.260077953338623, + "learning_rate": 4.891950480891893e-05, + "loss": 5.3789, + "step": 15794 + }, + { + "epoch": 0.09393733942335142, + "grad_norm": 1.3587939739227295, + "learning_rate": 4.891936896684881e-05, + "loss": 5.308, + "step": 15795 + }, + { + "epoch": 0.0939432867066324, + "grad_norm": 1.4004688262939453, + "learning_rate": 4.8919233116428684e-05, + "loss": 5.5232, + "step": 15796 + }, + { + "epoch": 0.09394923398991341, + "grad_norm": 1.3308182954788208, + "learning_rate": 4.89190972576586e-05, + "loss": 5.3944, + "step": 15797 + }, + { + "epoch": 0.09395518127319441, + "grad_norm": 1.3078187704086304, + "learning_rate": 4.891896139053861e-05, + "loss": 5.3146, + "step": 15798 + }, + { + "epoch": 0.0939611285564754, + "grad_norm": 1.3268121480941772, + "learning_rate": 4.891882551506875e-05, + "loss": 5.2966, + "step": 15799 + }, + { + "epoch": 0.0939670758397564, + "grad_norm": 1.424813985824585, + "learning_rate": 4.8918689631249095e-05, + "loss": 5.132, + "step": 15800 + }, + { + "epoch": 0.0939730231230374, + "grad_norm": 1.2917978763580322, + "learning_rate": 4.8918553739079656e-05, + "loss": 5.1889, + "step": 15801 + }, + { + "epoch": 0.09397897040631839, + "grad_norm": 1.377146601676941, + "learning_rate": 4.8918417838560506e-05, + "loss": 5.2749, + "step": 15802 + }, + { + "epoch": 0.09398491768959939, + "grad_norm": 1.2476272583007812, + "learning_rate": 4.891828192969167e-05, + "loss": 5.1367, + "step": 15803 + }, + { + "epoch": 0.0939908649728804, + "grad_norm": 1.423923373222351, + "learning_rate": 4.891814601247322e-05, + "loss": 5.1657, + "step": 15804 + }, + { + "epoch": 0.09399681225616138, + "grad_norm": 1.2762609720230103, + "learning_rate": 4.891801008690518e-05, + "loss": 5.2245, + "step": 15805 + }, + { + "epoch": 0.09400275953944238, + "grad_norm": 1.3098403215408325, + "learning_rate": 4.891787415298763e-05, + "loss": 5.1452, + "step": 15806 + }, + { + "epoch": 0.09400870682272339, + "grad_norm": 1.2892425060272217, + "learning_rate": 4.8917738210720586e-05, + "loss": 5.268, + "step": 15807 + }, + { + "epoch": 0.09401465410600438, + "grad_norm": 1.4667305946350098, + "learning_rate": 4.8917602260104105e-05, + "loss": 5.1666, + "step": 15808 + }, + { + "epoch": 0.09402060138928538, + "grad_norm": 1.289933204650879, + "learning_rate": 4.891746630113824e-05, + "loss": 5.1772, + "step": 15809 + }, + { + "epoch": 0.09402654867256637, + "grad_norm": 2.3923516273498535, + "learning_rate": 4.891733033382303e-05, + "loss": 5.0732, + "step": 15810 + }, + { + "epoch": 0.09403249595584737, + "grad_norm": 1.223607063293457, + "learning_rate": 4.8917194358158534e-05, + "loss": 5.1025, + "step": 15811 + }, + { + "epoch": 0.09403844323912837, + "grad_norm": 1.5959491729736328, + "learning_rate": 4.8917058374144785e-05, + "loss": 5.3244, + "step": 15812 + }, + { + "epoch": 0.09404439052240936, + "grad_norm": 1.2359555959701538, + "learning_rate": 4.8916922381781845e-05, + "loss": 4.8643, + "step": 15813 + }, + { + "epoch": 0.09405033780569036, + "grad_norm": 1.3971196413040161, + "learning_rate": 4.891678638106974e-05, + "loss": 5.0362, + "step": 15814 + }, + { + "epoch": 0.09405628508897136, + "grad_norm": 1.3501266241073608, + "learning_rate": 4.891665037200855e-05, + "loss": 4.8705, + "step": 15815 + }, + { + "epoch": 0.09406223237225235, + "grad_norm": 1.3506006002426147, + "learning_rate": 4.89165143545983e-05, + "loss": 4.9122, + "step": 15816 + }, + { + "epoch": 0.09406817965553335, + "grad_norm": 1.4444037675857544, + "learning_rate": 4.891637832883904e-05, + "loss": 4.8428, + "step": 15817 + }, + { + "epoch": 0.09407412693881435, + "grad_norm": 1.4757333993911743, + "learning_rate": 4.891624229473082e-05, + "loss": 5.1774, + "step": 15818 + }, + { + "epoch": 0.09408007422209534, + "grad_norm": 1.3660651445388794, + "learning_rate": 4.891610625227369e-05, + "loss": 5.2998, + "step": 15819 + }, + { + "epoch": 0.09408602150537634, + "grad_norm": 1.625279426574707, + "learning_rate": 4.891597020146769e-05, + "loss": 5.1365, + "step": 15820 + }, + { + "epoch": 0.09409196878865735, + "grad_norm": 1.5202007293701172, + "learning_rate": 4.891583414231287e-05, + "loss": 5.287, + "step": 15821 + }, + { + "epoch": 0.09409791607193833, + "grad_norm": 1.5217576026916504, + "learning_rate": 4.891569807480928e-05, + "loss": 5.3599, + "step": 15822 + }, + { + "epoch": 0.09410386335521934, + "grad_norm": 1.5446710586547852, + "learning_rate": 4.891556199895696e-05, + "loss": 5.1332, + "step": 15823 + }, + { + "epoch": 0.09410981063850034, + "grad_norm": 1.2877990007400513, + "learning_rate": 4.8915425914755973e-05, + "loss": 5.0756, + "step": 15824 + }, + { + "epoch": 0.09411575792178133, + "grad_norm": 1.3024258613586426, + "learning_rate": 4.891528982220636e-05, + "loss": 5.3293, + "step": 15825 + }, + { + "epoch": 0.09412170520506233, + "grad_norm": 1.3039882183074951, + "learning_rate": 4.8915153721308166e-05, + "loss": 5.1406, + "step": 15826 + }, + { + "epoch": 0.09412765248834333, + "grad_norm": 1.2524348497390747, + "learning_rate": 4.8915017612061435e-05, + "loss": 5.3044, + "step": 15827 + }, + { + "epoch": 0.09413359977162432, + "grad_norm": 1.2522565126419067, + "learning_rate": 4.8914881494466226e-05, + "loss": 5.1776, + "step": 15828 + }, + { + "epoch": 0.09413954705490532, + "grad_norm": 1.3882638216018677, + "learning_rate": 4.8914745368522566e-05, + "loss": 5.2296, + "step": 15829 + }, + { + "epoch": 0.09414549433818632, + "grad_norm": 1.5169535875320435, + "learning_rate": 4.891460923423052e-05, + "loss": 5.2058, + "step": 15830 + }, + { + "epoch": 0.09415144162146731, + "grad_norm": 1.2045719623565674, + "learning_rate": 4.891447309159014e-05, + "loss": 5.256, + "step": 15831 + }, + { + "epoch": 0.09415738890474831, + "grad_norm": 1.4639356136322021, + "learning_rate": 4.891433694060146e-05, + "loss": 5.1781, + "step": 15832 + }, + { + "epoch": 0.09416333618802931, + "grad_norm": 1.498923420906067, + "learning_rate": 4.891420078126453e-05, + "loss": 5.1777, + "step": 15833 + }, + { + "epoch": 0.0941692834713103, + "grad_norm": 1.163977861404419, + "learning_rate": 4.89140646135794e-05, + "loss": 4.9302, + "step": 15834 + }, + { + "epoch": 0.0941752307545913, + "grad_norm": 1.502808690071106, + "learning_rate": 4.8913928437546113e-05, + "loss": 5.1053, + "step": 15835 + }, + { + "epoch": 0.0941811780378723, + "grad_norm": 1.401517391204834, + "learning_rate": 4.891379225316473e-05, + "loss": 5.3156, + "step": 15836 + }, + { + "epoch": 0.0941871253211533, + "grad_norm": 1.328116774559021, + "learning_rate": 4.891365606043528e-05, + "loss": 5.2333, + "step": 15837 + }, + { + "epoch": 0.0941930726044343, + "grad_norm": 1.160243272781372, + "learning_rate": 4.891351985935782e-05, + "loss": 5.2575, + "step": 15838 + }, + { + "epoch": 0.09419901988771529, + "grad_norm": 1.1748963594436646, + "learning_rate": 4.8913383649932404e-05, + "loss": 5.0673, + "step": 15839 + }, + { + "epoch": 0.09420496717099629, + "grad_norm": 1.2916535139083862, + "learning_rate": 4.891324743215907e-05, + "loss": 5.135, + "step": 15840 + }, + { + "epoch": 0.09421091445427729, + "grad_norm": 1.302393913269043, + "learning_rate": 4.8913111206037865e-05, + "loss": 4.9814, + "step": 15841 + }, + { + "epoch": 0.09421686173755828, + "grad_norm": 1.273445963859558, + "learning_rate": 4.891297497156885e-05, + "loss": 4.9163, + "step": 15842 + }, + { + "epoch": 0.09422280902083928, + "grad_norm": 1.444884181022644, + "learning_rate": 4.8912838728752055e-05, + "loss": 4.9316, + "step": 15843 + }, + { + "epoch": 0.09422875630412028, + "grad_norm": 1.411985993385315, + "learning_rate": 4.891270247758753e-05, + "loss": 4.9222, + "step": 15844 + }, + { + "epoch": 0.09423470358740127, + "grad_norm": 1.3697528839111328, + "learning_rate": 4.891256621807533e-05, + "loss": 4.8398, + "step": 15845 + }, + { + "epoch": 0.09424065087068227, + "grad_norm": 1.385298728942871, + "learning_rate": 4.891242995021551e-05, + "loss": 4.8869, + "step": 15846 + }, + { + "epoch": 0.09424659815396327, + "grad_norm": 1.821768879890442, + "learning_rate": 4.8912293674008094e-05, + "loss": 5.178, + "step": 15847 + }, + { + "epoch": 0.09425254543724426, + "grad_norm": 1.8198026418685913, + "learning_rate": 4.891215738945315e-05, + "loss": 5.2892, + "step": 15848 + }, + { + "epoch": 0.09425849272052526, + "grad_norm": 1.4373536109924316, + "learning_rate": 4.891202109655072e-05, + "loss": 5.1203, + "step": 15849 + }, + { + "epoch": 0.09426444000380627, + "grad_norm": 1.2086896896362305, + "learning_rate": 4.8911884795300855e-05, + "loss": 4.8603, + "step": 15850 + }, + { + "epoch": 0.09427038728708725, + "grad_norm": 1.3166700601577759, + "learning_rate": 4.891174848570359e-05, + "loss": 4.917, + "step": 15851 + }, + { + "epoch": 0.09427633457036826, + "grad_norm": 1.5753637552261353, + "learning_rate": 4.891161216775898e-05, + "loss": 5.0197, + "step": 15852 + }, + { + "epoch": 0.09428228185364926, + "grad_norm": 1.5428698062896729, + "learning_rate": 4.891147584146708e-05, + "loss": 5.2048, + "step": 15853 + }, + { + "epoch": 0.09428822913693025, + "grad_norm": 1.3760755062103271, + "learning_rate": 4.8911339506827924e-05, + "loss": 5.2568, + "step": 15854 + }, + { + "epoch": 0.09429417642021125, + "grad_norm": 1.6683621406555176, + "learning_rate": 4.891120316384157e-05, + "loss": 4.8976, + "step": 15855 + }, + { + "epoch": 0.09430012370349225, + "grad_norm": 1.4224987030029297, + "learning_rate": 4.891106681250807e-05, + "loss": 4.9538, + "step": 15856 + }, + { + "epoch": 0.09430607098677324, + "grad_norm": 1.2851178646087646, + "learning_rate": 4.8910930452827454e-05, + "loss": 4.8972, + "step": 15857 + }, + { + "epoch": 0.09431201827005424, + "grad_norm": 1.6412112712860107, + "learning_rate": 4.891079408479978e-05, + "loss": 5.124, + "step": 15858 + }, + { + "epoch": 0.09431796555333524, + "grad_norm": 1.380089282989502, + "learning_rate": 4.891065770842509e-05, + "loss": 5.1155, + "step": 15859 + }, + { + "epoch": 0.09432391283661623, + "grad_norm": 1.3117294311523438, + "learning_rate": 4.891052132370344e-05, + "loss": 5.1968, + "step": 15860 + }, + { + "epoch": 0.09432986011989723, + "grad_norm": 1.5171841382980347, + "learning_rate": 4.891038493063488e-05, + "loss": 5.1029, + "step": 15861 + }, + { + "epoch": 0.09433580740317823, + "grad_norm": 1.4801427125930786, + "learning_rate": 4.8910248529219446e-05, + "loss": 5.1533, + "step": 15862 + }, + { + "epoch": 0.09434175468645922, + "grad_norm": 1.672522783279419, + "learning_rate": 4.8910112119457196e-05, + "loss": 5.3259, + "step": 15863 + }, + { + "epoch": 0.09434770196974022, + "grad_norm": 1.5151952505111694, + "learning_rate": 4.890997570134816e-05, + "loss": 5.2654, + "step": 15864 + }, + { + "epoch": 0.09435364925302123, + "grad_norm": 1.4178684949874878, + "learning_rate": 4.890983927489242e-05, + "loss": 5.2369, + "step": 15865 + }, + { + "epoch": 0.09435959653630221, + "grad_norm": 1.3673019409179688, + "learning_rate": 4.890970284008999e-05, + "loss": 5.2176, + "step": 15866 + }, + { + "epoch": 0.09436554381958322, + "grad_norm": 1.4063305854797363, + "learning_rate": 4.8909566396940934e-05, + "loss": 5.1189, + "step": 15867 + }, + { + "epoch": 0.0943714911028642, + "grad_norm": 1.277815818786621, + "learning_rate": 4.890942994544528e-05, + "loss": 5.2204, + "step": 15868 + }, + { + "epoch": 0.09437743838614521, + "grad_norm": 1.5394912958145142, + "learning_rate": 4.890929348560311e-05, + "loss": 5.1147, + "step": 15869 + }, + { + "epoch": 0.09438338566942621, + "grad_norm": 1.4091798067092896, + "learning_rate": 4.890915701741444e-05, + "loss": 5.1367, + "step": 15870 + }, + { + "epoch": 0.0943893329527072, + "grad_norm": 1.367828369140625, + "learning_rate": 4.8909020540879336e-05, + "loss": 5.1871, + "step": 15871 + }, + { + "epoch": 0.0943952802359882, + "grad_norm": 2.2413175106048584, + "learning_rate": 4.890888405599784e-05, + "loss": 5.0571, + "step": 15872 + }, + { + "epoch": 0.0944012275192692, + "grad_norm": 1.392906904220581, + "learning_rate": 4.8908747562769995e-05, + "loss": 4.9885, + "step": 15873 + }, + { + "epoch": 0.09440717480255019, + "grad_norm": 1.4517099857330322, + "learning_rate": 4.8908611061195865e-05, + "loss": 5.1596, + "step": 15874 + }, + { + "epoch": 0.09441312208583119, + "grad_norm": 1.663919448852539, + "learning_rate": 4.890847455127547e-05, + "loss": 5.0029, + "step": 15875 + }, + { + "epoch": 0.0944190693691122, + "grad_norm": 1.5252666473388672, + "learning_rate": 4.8908338033008885e-05, + "loss": 4.9596, + "step": 15876 + }, + { + "epoch": 0.09442501665239318, + "grad_norm": 1.613261103630066, + "learning_rate": 4.8908201506396143e-05, + "loss": 4.91, + "step": 15877 + }, + { + "epoch": 0.09443096393567418, + "grad_norm": 1.5182253122329712, + "learning_rate": 4.8908064971437295e-05, + "loss": 5.0564, + "step": 15878 + }, + { + "epoch": 0.09443691121895519, + "grad_norm": 1.4765241146087646, + "learning_rate": 4.8907928428132386e-05, + "loss": 5.0863, + "step": 15879 + }, + { + "epoch": 0.09444285850223617, + "grad_norm": 1.6401035785675049, + "learning_rate": 4.890779187648147e-05, + "loss": 4.9876, + "step": 15880 + }, + { + "epoch": 0.09444880578551718, + "grad_norm": 1.4818077087402344, + "learning_rate": 4.8907655316484594e-05, + "loss": 4.9361, + "step": 15881 + }, + { + "epoch": 0.09445475306879818, + "grad_norm": 1.4490398168563843, + "learning_rate": 4.89075187481418e-05, + "loss": 4.8991, + "step": 15882 + }, + { + "epoch": 0.09446070035207917, + "grad_norm": 1.2799785137176514, + "learning_rate": 4.890738217145313e-05, + "loss": 5.0147, + "step": 15883 + }, + { + "epoch": 0.09446664763536017, + "grad_norm": 1.416590929031372, + "learning_rate": 4.890724558641865e-05, + "loss": 5.0255, + "step": 15884 + }, + { + "epoch": 0.09447259491864117, + "grad_norm": 1.4365648031234741, + "learning_rate": 4.8907108993038395e-05, + "loss": 5.0262, + "step": 15885 + }, + { + "epoch": 0.09447854220192216, + "grad_norm": 1.367490530014038, + "learning_rate": 4.890697239131241e-05, + "loss": 4.9478, + "step": 15886 + }, + { + "epoch": 0.09448448948520316, + "grad_norm": 1.3645575046539307, + "learning_rate": 4.8906835781240754e-05, + "loss": 5.0751, + "step": 15887 + }, + { + "epoch": 0.09449043676848416, + "grad_norm": 1.4014960527420044, + "learning_rate": 4.8906699162823464e-05, + "loss": 4.9789, + "step": 15888 + }, + { + "epoch": 0.09449638405176515, + "grad_norm": 1.2261216640472412, + "learning_rate": 4.8906562536060596e-05, + "loss": 4.9619, + "step": 15889 + }, + { + "epoch": 0.09450233133504615, + "grad_norm": 1.3241546154022217, + "learning_rate": 4.890642590095219e-05, + "loss": 4.9947, + "step": 15890 + }, + { + "epoch": 0.09450827861832715, + "grad_norm": 1.337372899055481, + "learning_rate": 4.89062892574983e-05, + "loss": 4.9817, + "step": 15891 + }, + { + "epoch": 0.09451422590160814, + "grad_norm": 1.47610604763031, + "learning_rate": 4.8906152605698974e-05, + "loss": 4.9467, + "step": 15892 + }, + { + "epoch": 0.09452017318488914, + "grad_norm": 1.3533576726913452, + "learning_rate": 4.890601594555425e-05, + "loss": 4.9819, + "step": 15893 + }, + { + "epoch": 0.09452612046817015, + "grad_norm": 1.4445271492004395, + "learning_rate": 4.890587927706419e-05, + "loss": 4.9566, + "step": 15894 + }, + { + "epoch": 0.09453206775145113, + "grad_norm": 1.4600121974945068, + "learning_rate": 4.8905742600228834e-05, + "loss": 4.9341, + "step": 15895 + }, + { + "epoch": 0.09453801503473214, + "grad_norm": 1.2824327945709229, + "learning_rate": 4.8905605915048224e-05, + "loss": 5.0945, + "step": 15896 + }, + { + "epoch": 0.09454396231801313, + "grad_norm": 1.4806164503097534, + "learning_rate": 4.890546922152242e-05, + "loss": 5.1312, + "step": 15897 + }, + { + "epoch": 0.09454990960129413, + "grad_norm": 1.3514155149459839, + "learning_rate": 4.890533251965146e-05, + "loss": 4.9596, + "step": 15898 + }, + { + "epoch": 0.09455585688457513, + "grad_norm": 1.332749843597412, + "learning_rate": 4.89051958094354e-05, + "loss": 5.0649, + "step": 15899 + }, + { + "epoch": 0.09456180416785612, + "grad_norm": 1.310562014579773, + "learning_rate": 4.8905059090874284e-05, + "loss": 5.0977, + "step": 15900 + }, + { + "epoch": 0.09456775145113712, + "grad_norm": 1.342310905456543, + "learning_rate": 4.8904922363968153e-05, + "loss": 5.115, + "step": 15901 + }, + { + "epoch": 0.09457369873441812, + "grad_norm": 1.4810988903045654, + "learning_rate": 4.890478562871706e-05, + "loss": 5.1305, + "step": 15902 + }, + { + "epoch": 0.09457964601769911, + "grad_norm": 1.3064900636672974, + "learning_rate": 4.890464888512106e-05, + "loss": 5.1387, + "step": 15903 + }, + { + "epoch": 0.09458559330098011, + "grad_norm": 1.4571950435638428, + "learning_rate": 4.890451213318019e-05, + "loss": 5.1235, + "step": 15904 + }, + { + "epoch": 0.09459154058426111, + "grad_norm": 1.3964077234268188, + "learning_rate": 4.89043753728945e-05, + "loss": 5.0854, + "step": 15905 + }, + { + "epoch": 0.0945974878675421, + "grad_norm": 1.4404022693634033, + "learning_rate": 4.8904238604264044e-05, + "loss": 5.0991, + "step": 15906 + }, + { + "epoch": 0.0946034351508231, + "grad_norm": 1.3269283771514893, + "learning_rate": 4.890410182728886e-05, + "loss": 4.9299, + "step": 15907 + }, + { + "epoch": 0.0946093824341041, + "grad_norm": 1.4588782787322998, + "learning_rate": 4.8903965041969e-05, + "loss": 5.0992, + "step": 15908 + }, + { + "epoch": 0.0946153297173851, + "grad_norm": 1.2911858558654785, + "learning_rate": 4.8903828248304525e-05, + "loss": 5.0639, + "step": 15909 + }, + { + "epoch": 0.0946212770006661, + "grad_norm": 1.336695909500122, + "learning_rate": 4.8903691446295466e-05, + "loss": 5.1479, + "step": 15910 + }, + { + "epoch": 0.0946272242839471, + "grad_norm": 1.3052904605865479, + "learning_rate": 4.890355463594186e-05, + "loss": 5.049, + "step": 15911 + }, + { + "epoch": 0.09463317156722809, + "grad_norm": 1.3744491338729858, + "learning_rate": 4.890341781724379e-05, + "loss": 5.0709, + "step": 15912 + }, + { + "epoch": 0.09463911885050909, + "grad_norm": 1.5727102756500244, + "learning_rate": 4.890328099020127e-05, + "loss": 4.9857, + "step": 15913 + }, + { + "epoch": 0.09464506613379009, + "grad_norm": 1.5804322957992554, + "learning_rate": 4.890314415481437e-05, + "loss": 5.133, + "step": 15914 + }, + { + "epoch": 0.09465101341707108, + "grad_norm": 1.228421926498413, + "learning_rate": 4.8903007311083124e-05, + "loss": 4.9561, + "step": 15915 + }, + { + "epoch": 0.09465696070035208, + "grad_norm": 1.4680207967758179, + "learning_rate": 4.890287045900759e-05, + "loss": 5.0502, + "step": 15916 + }, + { + "epoch": 0.09466290798363308, + "grad_norm": 1.3447710275650024, + "learning_rate": 4.89027335985878e-05, + "loss": 5.1255, + "step": 15917 + }, + { + "epoch": 0.09466885526691407, + "grad_norm": 1.3510375022888184, + "learning_rate": 4.8902596729823825e-05, + "loss": 5.0936, + "step": 15918 + }, + { + "epoch": 0.09467480255019507, + "grad_norm": 1.3805617094039917, + "learning_rate": 4.89024598527157e-05, + "loss": 5.1146, + "step": 15919 + }, + { + "epoch": 0.09468074983347607, + "grad_norm": 1.568036437034607, + "learning_rate": 4.890232296726347e-05, + "loss": 5.0032, + "step": 15920 + }, + { + "epoch": 0.09468669711675706, + "grad_norm": 1.6060000658035278, + "learning_rate": 4.890218607346718e-05, + "loss": 5.017, + "step": 15921 + }, + { + "epoch": 0.09469264440003806, + "grad_norm": 1.498241901397705, + "learning_rate": 4.890204917132689e-05, + "loss": 5.1265, + "step": 15922 + }, + { + "epoch": 0.09469859168331907, + "grad_norm": 1.418135643005371, + "learning_rate": 4.8901912260842644e-05, + "loss": 5.1458, + "step": 15923 + }, + { + "epoch": 0.09470453896660005, + "grad_norm": 1.3306639194488525, + "learning_rate": 4.890177534201448e-05, + "loss": 5.1672, + "step": 15924 + }, + { + "epoch": 0.09471048624988106, + "grad_norm": 1.542938470840454, + "learning_rate": 4.890163841484246e-05, + "loss": 5.1511, + "step": 15925 + }, + { + "epoch": 0.09471643353316204, + "grad_norm": 1.3050166368484497, + "learning_rate": 4.890150147932662e-05, + "loss": 5.2615, + "step": 15926 + }, + { + "epoch": 0.09472238081644305, + "grad_norm": 1.3447345495224, + "learning_rate": 4.890136453546702e-05, + "loss": 5.2957, + "step": 15927 + }, + { + "epoch": 0.09472832809972405, + "grad_norm": 1.3270481824874878, + "learning_rate": 4.8901227583263695e-05, + "loss": 5.2751, + "step": 15928 + }, + { + "epoch": 0.09473427538300504, + "grad_norm": 1.3909003734588623, + "learning_rate": 4.890109062271669e-05, + "loss": 5.1162, + "step": 15929 + }, + { + "epoch": 0.09474022266628604, + "grad_norm": 1.4668915271759033, + "learning_rate": 4.890095365382608e-05, + "loss": 5.0313, + "step": 15930 + }, + { + "epoch": 0.09474616994956704, + "grad_norm": 1.2651780843734741, + "learning_rate": 4.890081667659188e-05, + "loss": 5.0576, + "step": 15931 + }, + { + "epoch": 0.09475211723284803, + "grad_norm": 1.5086911916732788, + "learning_rate": 4.8900679691014154e-05, + "loss": 4.9508, + "step": 15932 + }, + { + "epoch": 0.09475806451612903, + "grad_norm": 1.2698594331741333, + "learning_rate": 4.8900542697092956e-05, + "loss": 5.0183, + "step": 15933 + }, + { + "epoch": 0.09476401179941003, + "grad_norm": 2.691392183303833, + "learning_rate": 4.8900405694828313e-05, + "loss": 5.0997, + "step": 15934 + }, + { + "epoch": 0.09476995908269102, + "grad_norm": 1.3395452499389648, + "learning_rate": 4.8900268684220295e-05, + "loss": 5.2219, + "step": 15935 + }, + { + "epoch": 0.09477590636597202, + "grad_norm": 1.3485181331634521, + "learning_rate": 4.8900131665268934e-05, + "loss": 4.9594, + "step": 15936 + }, + { + "epoch": 0.09478185364925303, + "grad_norm": 1.2990431785583496, + "learning_rate": 4.889999463797429e-05, + "loss": 4.9492, + "step": 15937 + }, + { + "epoch": 0.09478780093253401, + "grad_norm": 1.2848893404006958, + "learning_rate": 4.8899857602336396e-05, + "loss": 4.9819, + "step": 15938 + }, + { + "epoch": 0.09479374821581502, + "grad_norm": 1.4666554927825928, + "learning_rate": 4.889972055835531e-05, + "loss": 4.9672, + "step": 15939 + }, + { + "epoch": 0.09479969549909602, + "grad_norm": 1.3356142044067383, + "learning_rate": 4.8899583506031085e-05, + "loss": 5.029, + "step": 15940 + }, + { + "epoch": 0.094805642782377, + "grad_norm": 1.561786413192749, + "learning_rate": 4.8899446445363765e-05, + "loss": 4.9071, + "step": 15941 + }, + { + "epoch": 0.09481159006565801, + "grad_norm": 1.4906450510025024, + "learning_rate": 4.889930937635339e-05, + "loss": 5.0832, + "step": 15942 + }, + { + "epoch": 0.09481753734893901, + "grad_norm": 1.5042341947555542, + "learning_rate": 4.889917229900001e-05, + "loss": 5.1069, + "step": 15943 + }, + { + "epoch": 0.09482348463222, + "grad_norm": 1.6562377214431763, + "learning_rate": 4.889903521330368e-05, + "loss": 5.0532, + "step": 15944 + }, + { + "epoch": 0.094829431915501, + "grad_norm": 1.1881135702133179, + "learning_rate": 4.889889811926445e-05, + "loss": 5.1159, + "step": 15945 + }, + { + "epoch": 0.094835379198782, + "grad_norm": 1.3550158739089966, + "learning_rate": 4.889876101688234e-05, + "loss": 5.0754, + "step": 15946 + }, + { + "epoch": 0.09484132648206299, + "grad_norm": 1.403874158859253, + "learning_rate": 4.8898623906157435e-05, + "loss": 5.405, + "step": 15947 + }, + { + "epoch": 0.09484727376534399, + "grad_norm": 1.4460557699203491, + "learning_rate": 4.889848678708977e-05, + "loss": 5.041, + "step": 15948 + }, + { + "epoch": 0.094853221048625, + "grad_norm": 1.4151064157485962, + "learning_rate": 4.889834965967939e-05, + "loss": 5.368, + "step": 15949 + }, + { + "epoch": 0.09485916833190598, + "grad_norm": 1.3388437032699585, + "learning_rate": 4.889821252392633e-05, + "loss": 5.2905, + "step": 15950 + }, + { + "epoch": 0.09486511561518698, + "grad_norm": 1.1941900253295898, + "learning_rate": 4.8898075379830665e-05, + "loss": 5.1499, + "step": 15951 + }, + { + "epoch": 0.09487106289846799, + "grad_norm": 1.4840821027755737, + "learning_rate": 4.889793822739243e-05, + "loss": 5.0461, + "step": 15952 + }, + { + "epoch": 0.09487701018174897, + "grad_norm": 1.4021552801132202, + "learning_rate": 4.889780106661166e-05, + "loss": 4.89, + "step": 15953 + }, + { + "epoch": 0.09488295746502998, + "grad_norm": 1.4893288612365723, + "learning_rate": 4.889766389748842e-05, + "loss": 4.9719, + "step": 15954 + }, + { + "epoch": 0.09488890474831096, + "grad_norm": 1.4530198574066162, + "learning_rate": 4.889752672002275e-05, + "loss": 5.3931, + "step": 15955 + }, + { + "epoch": 0.09489485203159197, + "grad_norm": 1.468037724494934, + "learning_rate": 4.88973895342147e-05, + "loss": 5.271, + "step": 15956 + }, + { + "epoch": 0.09490079931487297, + "grad_norm": 1.3074537515640259, + "learning_rate": 4.889725234006433e-05, + "loss": 5.202, + "step": 15957 + }, + { + "epoch": 0.09490674659815396, + "grad_norm": 1.3678735494613647, + "learning_rate": 4.889711513757166e-05, + "loss": 5.0821, + "step": 15958 + }, + { + "epoch": 0.09491269388143496, + "grad_norm": 1.3922240734100342, + "learning_rate": 4.889697792673676e-05, + "loss": 4.8938, + "step": 15959 + }, + { + "epoch": 0.09491864116471596, + "grad_norm": 1.3895872831344604, + "learning_rate": 4.8896840707559674e-05, + "loss": 4.8293, + "step": 15960 + }, + { + "epoch": 0.09492458844799695, + "grad_norm": 1.223599910736084, + "learning_rate": 4.889670348004045e-05, + "loss": 4.8528, + "step": 15961 + }, + { + "epoch": 0.09493053573127795, + "grad_norm": 1.4488904476165771, + "learning_rate": 4.889656624417913e-05, + "loss": 5.0107, + "step": 15962 + }, + { + "epoch": 0.09493648301455895, + "grad_norm": 1.5250918865203857, + "learning_rate": 4.889642899997576e-05, + "loss": 4.9114, + "step": 15963 + }, + { + "epoch": 0.09494243029783994, + "grad_norm": 1.4656517505645752, + "learning_rate": 4.88962917474304e-05, + "loss": 5.2163, + "step": 15964 + }, + { + "epoch": 0.09494837758112094, + "grad_norm": 1.316635251045227, + "learning_rate": 4.889615448654309e-05, + "loss": 5.1904, + "step": 15965 + }, + { + "epoch": 0.09495432486440195, + "grad_norm": 1.5920292139053345, + "learning_rate": 4.8896017217313886e-05, + "loss": 5.0858, + "step": 15966 + }, + { + "epoch": 0.09496027214768293, + "grad_norm": 1.5263009071350098, + "learning_rate": 4.889587993974282e-05, + "loss": 5.0594, + "step": 15967 + }, + { + "epoch": 0.09496621943096394, + "grad_norm": 1.4230486154556274, + "learning_rate": 4.889574265382996e-05, + "loss": 5.0712, + "step": 15968 + }, + { + "epoch": 0.09497216671424494, + "grad_norm": 1.9315528869628906, + "learning_rate": 4.889560535957533e-05, + "loss": 4.8489, + "step": 15969 + }, + { + "epoch": 0.09497811399752593, + "grad_norm": 1.3432739973068237, + "learning_rate": 4.8895468056979e-05, + "loss": 4.9722, + "step": 15970 + }, + { + "epoch": 0.09498406128080693, + "grad_norm": 1.191886067390442, + "learning_rate": 4.8895330746041e-05, + "loss": 4.9384, + "step": 15971 + }, + { + "epoch": 0.09499000856408793, + "grad_norm": 1.4204323291778564, + "learning_rate": 4.8895193426761396e-05, + "loss": 5.1063, + "step": 15972 + }, + { + "epoch": 0.09499595584736892, + "grad_norm": 1.319189429283142, + "learning_rate": 4.8895056099140224e-05, + "loss": 5.0643, + "step": 15973 + }, + { + "epoch": 0.09500190313064992, + "grad_norm": 1.2905625104904175, + "learning_rate": 4.8894918763177533e-05, + "loss": 5.0806, + "step": 15974 + }, + { + "epoch": 0.09500785041393092, + "grad_norm": 1.6914581060409546, + "learning_rate": 4.889478141887338e-05, + "loss": 4.9209, + "step": 15975 + }, + { + "epoch": 0.09501379769721191, + "grad_norm": 1.390061378479004, + "learning_rate": 4.8894644066227797e-05, + "loss": 5.1376, + "step": 15976 + }, + { + "epoch": 0.09501974498049291, + "grad_norm": 1.2711600065231323, + "learning_rate": 4.889450670524084e-05, + "loss": 5.2344, + "step": 15977 + }, + { + "epoch": 0.09502569226377391, + "grad_norm": 1.472398042678833, + "learning_rate": 4.889436933591256e-05, + "loss": 5.0605, + "step": 15978 + }, + { + "epoch": 0.0950316395470549, + "grad_norm": 1.483567714691162, + "learning_rate": 4.889423195824301e-05, + "loss": 4.9827, + "step": 15979 + }, + { + "epoch": 0.0950375868303359, + "grad_norm": 1.706921935081482, + "learning_rate": 4.889409457223222e-05, + "loss": 5.0692, + "step": 15980 + }, + { + "epoch": 0.0950435341136169, + "grad_norm": 1.7719398736953735, + "learning_rate": 4.889395717788026e-05, + "loss": 5.0985, + "step": 15981 + }, + { + "epoch": 0.0950494813968979, + "grad_norm": 1.6768114566802979, + "learning_rate": 4.889381977518715e-05, + "loss": 4.8838, + "step": 15982 + }, + { + "epoch": 0.0950554286801789, + "grad_norm": 1.5722233057022095, + "learning_rate": 4.889368236415296e-05, + "loss": 4.824, + "step": 15983 + }, + { + "epoch": 0.09506137596345988, + "grad_norm": 1.5722928047180176, + "learning_rate": 4.889354494477773e-05, + "loss": 5.3027, + "step": 15984 + }, + { + "epoch": 0.09506732324674089, + "grad_norm": 2.0003905296325684, + "learning_rate": 4.8893407517061526e-05, + "loss": 5.2216, + "step": 15985 + }, + { + "epoch": 0.09507327053002189, + "grad_norm": 1.390168309211731, + "learning_rate": 4.889327008100437e-05, + "loss": 5.358, + "step": 15986 + }, + { + "epoch": 0.09507921781330288, + "grad_norm": 1.545292854309082, + "learning_rate": 4.889313263660632e-05, + "loss": 5.5124, + "step": 15987 + }, + { + "epoch": 0.09508516509658388, + "grad_norm": 1.4416158199310303, + "learning_rate": 4.889299518386742e-05, + "loss": 5.0929, + "step": 15988 + }, + { + "epoch": 0.09509111237986488, + "grad_norm": 1.8936892747879028, + "learning_rate": 4.889285772278773e-05, + "loss": 4.9407, + "step": 15989 + }, + { + "epoch": 0.09509705966314587, + "grad_norm": 1.4762251377105713, + "learning_rate": 4.889272025336729e-05, + "loss": 5.05, + "step": 15990 + }, + { + "epoch": 0.09510300694642687, + "grad_norm": 1.4513001441955566, + "learning_rate": 4.8892582775606146e-05, + "loss": 5.2386, + "step": 15991 + }, + { + "epoch": 0.09510895422970787, + "grad_norm": 1.8999260663986206, + "learning_rate": 4.8892445289504345e-05, + "loss": 5.1524, + "step": 15992 + }, + { + "epoch": 0.09511490151298886, + "grad_norm": 1.5721614360809326, + "learning_rate": 4.8892307795061945e-05, + "loss": 5.2276, + "step": 15993 + }, + { + "epoch": 0.09512084879626986, + "grad_norm": 1.754425287246704, + "learning_rate": 4.889217029227898e-05, + "loss": 5.118, + "step": 15994 + }, + { + "epoch": 0.09512679607955087, + "grad_norm": 1.6336870193481445, + "learning_rate": 4.889203278115551e-05, + "loss": 5.2065, + "step": 15995 + }, + { + "epoch": 0.09513274336283185, + "grad_norm": 2.721186876296997, + "learning_rate": 4.889189526169157e-05, + "loss": 5.3698, + "step": 15996 + }, + { + "epoch": 0.09513869064611286, + "grad_norm": 1.3870679140090942, + "learning_rate": 4.889175773388722e-05, + "loss": 5.294, + "step": 15997 + }, + { + "epoch": 0.09514463792939386, + "grad_norm": 1.4010889530181885, + "learning_rate": 4.889162019774252e-05, + "loss": 5.2313, + "step": 15998 + }, + { + "epoch": 0.09515058521267485, + "grad_norm": 1.6322177648544312, + "learning_rate": 4.889148265325748e-05, + "loss": 5.2871, + "step": 15999 + }, + { + "epoch": 0.09515653249595585, + "grad_norm": 1.5373196601867676, + "learning_rate": 4.889134510043218e-05, + "loss": 5.4748, + "step": 16000 + }, + { + "epoch": 0.09516247977923685, + "grad_norm": 1.572461724281311, + "learning_rate": 4.889120753926666e-05, + "loss": 5.3634, + "step": 16001 + }, + { + "epoch": 0.09516842706251784, + "grad_norm": 1.3587132692337036, + "learning_rate": 4.889106996976096e-05, + "loss": 5.1399, + "step": 16002 + }, + { + "epoch": 0.09517437434579884, + "grad_norm": 1.1270248889923096, + "learning_rate": 4.889093239191514e-05, + "loss": 5.1845, + "step": 16003 + }, + { + "epoch": 0.09518032162907984, + "grad_norm": 1.5456722974777222, + "learning_rate": 4.889079480572924e-05, + "loss": 5.4895, + "step": 16004 + }, + { + "epoch": 0.09518626891236083, + "grad_norm": 1.2772669792175293, + "learning_rate": 4.8890657211203307e-05, + "loss": 5.5415, + "step": 16005 + }, + { + "epoch": 0.09519221619564183, + "grad_norm": 1.5249123573303223, + "learning_rate": 4.88905196083374e-05, + "loss": 5.2731, + "step": 16006 + }, + { + "epoch": 0.09519816347892283, + "grad_norm": 1.137450098991394, + "learning_rate": 4.889038199713155e-05, + "loss": 5.2232, + "step": 16007 + }, + { + "epoch": 0.09520411076220382, + "grad_norm": 1.4076485633850098, + "learning_rate": 4.889024437758582e-05, + "loss": 5.3428, + "step": 16008 + }, + { + "epoch": 0.09521005804548482, + "grad_norm": 1.3883590698242188, + "learning_rate": 4.889010674970026e-05, + "loss": 5.328, + "step": 16009 + }, + { + "epoch": 0.09521600532876583, + "grad_norm": 1.4320605993270874, + "learning_rate": 4.88899691134749e-05, + "loss": 5.1469, + "step": 16010 + }, + { + "epoch": 0.09522195261204681, + "grad_norm": 1.5601880550384521, + "learning_rate": 4.8889831468909795e-05, + "loss": 5.1063, + "step": 16011 + }, + { + "epoch": 0.09522789989532782, + "grad_norm": 1.4243980646133423, + "learning_rate": 4.8889693816005014e-05, + "loss": 5.067, + "step": 16012 + }, + { + "epoch": 0.0952338471786088, + "grad_norm": 1.3901020288467407, + "learning_rate": 4.8889556154760577e-05, + "loss": 4.9954, + "step": 16013 + }, + { + "epoch": 0.0952397944618898, + "grad_norm": 1.2067557573318481, + "learning_rate": 4.8889418485176544e-05, + "loss": 5.5485, + "step": 16014 + }, + { + "epoch": 0.09524574174517081, + "grad_norm": 1.6004818677902222, + "learning_rate": 4.888928080725296e-05, + "loss": 5.0334, + "step": 16015 + }, + { + "epoch": 0.0952516890284518, + "grad_norm": 1.42451810836792, + "learning_rate": 4.8889143120989864e-05, + "loss": 4.9913, + "step": 16016 + }, + { + "epoch": 0.0952576363117328, + "grad_norm": 1.528438925743103, + "learning_rate": 4.888900542638734e-05, + "loss": 4.9749, + "step": 16017 + }, + { + "epoch": 0.0952635835950138, + "grad_norm": 1.2179231643676758, + "learning_rate": 4.888886772344539e-05, + "loss": 5.0631, + "step": 16018 + }, + { + "epoch": 0.09526953087829479, + "grad_norm": 1.5069763660430908, + "learning_rate": 4.8888730012164085e-05, + "loss": 5.0739, + "step": 16019 + }, + { + "epoch": 0.09527547816157579, + "grad_norm": 1.3587465286254883, + "learning_rate": 4.888859229254348e-05, + "loss": 5.0924, + "step": 16020 + }, + { + "epoch": 0.0952814254448568, + "grad_norm": 1.412811517715454, + "learning_rate": 4.888845456458361e-05, + "loss": 5.0228, + "step": 16021 + }, + { + "epoch": 0.09528737272813778, + "grad_norm": 1.5316507816314697, + "learning_rate": 4.888831682828453e-05, + "loss": 4.9514, + "step": 16022 + }, + { + "epoch": 0.09529332001141878, + "grad_norm": 1.4402068853378296, + "learning_rate": 4.888817908364628e-05, + "loss": 4.9404, + "step": 16023 + }, + { + "epoch": 0.09529926729469979, + "grad_norm": 1.353027582168579, + "learning_rate": 4.888804133066892e-05, + "loss": 5.0359, + "step": 16024 + }, + { + "epoch": 0.09530521457798077, + "grad_norm": 1.4211509227752686, + "learning_rate": 4.8887903569352486e-05, + "loss": 5.2472, + "step": 16025 + }, + { + "epoch": 0.09531116186126178, + "grad_norm": 1.3640077114105225, + "learning_rate": 4.888776579969704e-05, + "loss": 5.4126, + "step": 16026 + }, + { + "epoch": 0.09531710914454278, + "grad_norm": 1.5627541542053223, + "learning_rate": 4.8887628021702616e-05, + "loss": 5.1019, + "step": 16027 + }, + { + "epoch": 0.09532305642782377, + "grad_norm": 1.788611650466919, + "learning_rate": 4.888749023536927e-05, + "loss": 4.9395, + "step": 16028 + }, + { + "epoch": 0.09532900371110477, + "grad_norm": 1.3194786310195923, + "learning_rate": 4.8887352440697044e-05, + "loss": 4.9888, + "step": 16029 + }, + { + "epoch": 0.09533495099438577, + "grad_norm": 1.3091423511505127, + "learning_rate": 4.888721463768598e-05, + "loss": 5.1328, + "step": 16030 + }, + { + "epoch": 0.09534089827766676, + "grad_norm": 1.2864805459976196, + "learning_rate": 4.8887076826336154e-05, + "loss": 5.2569, + "step": 16031 + }, + { + "epoch": 0.09534684556094776, + "grad_norm": 1.3800050020217896, + "learning_rate": 4.888693900664759e-05, + "loss": 5.0698, + "step": 16032 + }, + { + "epoch": 0.09535279284422876, + "grad_norm": 1.2338416576385498, + "learning_rate": 4.8886801178620347e-05, + "loss": 5.227, + "step": 16033 + }, + { + "epoch": 0.09535874012750975, + "grad_norm": 1.4023356437683105, + "learning_rate": 4.888666334225446e-05, + "loss": 5.2976, + "step": 16034 + }, + { + "epoch": 0.09536468741079075, + "grad_norm": 1.4695215225219727, + "learning_rate": 4.8886525497549994e-05, + "loss": 5.1062, + "step": 16035 + }, + { + "epoch": 0.09537063469407175, + "grad_norm": 1.3647410869598389, + "learning_rate": 4.888638764450698e-05, + "loss": 5.2613, + "step": 16036 + }, + { + "epoch": 0.09537658197735274, + "grad_norm": 1.3059413433074951, + "learning_rate": 4.8886249783125484e-05, + "loss": 5.1593, + "step": 16037 + }, + { + "epoch": 0.09538252926063374, + "grad_norm": 1.3861093521118164, + "learning_rate": 4.8886111913405544e-05, + "loss": 4.9149, + "step": 16038 + }, + { + "epoch": 0.09538847654391475, + "grad_norm": 1.4214578866958618, + "learning_rate": 4.88859740353472e-05, + "loss": 5.0443, + "step": 16039 + }, + { + "epoch": 0.09539442382719573, + "grad_norm": 1.3835242986679077, + "learning_rate": 4.888583614895052e-05, + "loss": 4.9516, + "step": 16040 + }, + { + "epoch": 0.09540037111047674, + "grad_norm": 1.47120201587677, + "learning_rate": 4.8885698254215526e-05, + "loss": 4.9673, + "step": 16041 + }, + { + "epoch": 0.09540631839375772, + "grad_norm": 1.4861125946044922, + "learning_rate": 4.8885560351142295e-05, + "loss": 4.8283, + "step": 16042 + }, + { + "epoch": 0.09541226567703873, + "grad_norm": 1.2469282150268555, + "learning_rate": 4.888542243973086e-05, + "loss": 5.164, + "step": 16043 + }, + { + "epoch": 0.09541821296031973, + "grad_norm": 1.2372372150421143, + "learning_rate": 4.888528451998127e-05, + "loss": 5.2986, + "step": 16044 + }, + { + "epoch": 0.09542416024360072, + "grad_norm": 1.370978593826294, + "learning_rate": 4.888514659189357e-05, + "loss": 5.2353, + "step": 16045 + }, + { + "epoch": 0.09543010752688172, + "grad_norm": 1.4328222274780273, + "learning_rate": 4.888500865546781e-05, + "loss": 5.3482, + "step": 16046 + }, + { + "epoch": 0.09543605481016272, + "grad_norm": 1.2651796340942383, + "learning_rate": 4.888487071070405e-05, + "loss": 5.3276, + "step": 16047 + }, + { + "epoch": 0.09544200209344371, + "grad_norm": 1.34639310836792, + "learning_rate": 4.8884732757602325e-05, + "loss": 5.108, + "step": 16048 + }, + { + "epoch": 0.09544794937672471, + "grad_norm": 1.2254658937454224, + "learning_rate": 4.888459479616269e-05, + "loss": 5.1569, + "step": 16049 + }, + { + "epoch": 0.09545389666000571, + "grad_norm": 1.2902439832687378, + "learning_rate": 4.888445682638518e-05, + "loss": 5.2215, + "step": 16050 + }, + { + "epoch": 0.0954598439432867, + "grad_norm": 1.572160243988037, + "learning_rate": 4.888431884826986e-05, + "loss": 5.1288, + "step": 16051 + }, + { + "epoch": 0.0954657912265677, + "grad_norm": 1.266427993774414, + "learning_rate": 4.888418086181676e-05, + "loss": 5.231, + "step": 16052 + }, + { + "epoch": 0.0954717385098487, + "grad_norm": 1.2186620235443115, + "learning_rate": 4.888404286702595e-05, + "loss": 5.113, + "step": 16053 + }, + { + "epoch": 0.0954776857931297, + "grad_norm": 1.386727213859558, + "learning_rate": 4.888390486389747e-05, + "loss": 5.0559, + "step": 16054 + }, + { + "epoch": 0.0954836330764107, + "grad_norm": 1.3253827095031738, + "learning_rate": 4.8883766852431354e-05, + "loss": 5.2569, + "step": 16055 + }, + { + "epoch": 0.0954895803596917, + "grad_norm": 1.219800591468811, + "learning_rate": 4.888362883262767e-05, + "loss": 5.0805, + "step": 16056 + }, + { + "epoch": 0.09549552764297269, + "grad_norm": 1.2425061464309692, + "learning_rate": 4.888349080448646e-05, + "loss": 5.1447, + "step": 16057 + }, + { + "epoch": 0.09550147492625369, + "grad_norm": 2.619645833969116, + "learning_rate": 4.888335276800777e-05, + "loss": 5.2419, + "step": 16058 + }, + { + "epoch": 0.09550742220953469, + "grad_norm": 1.3087180852890015, + "learning_rate": 4.888321472319164e-05, + "loss": 5.1895, + "step": 16059 + }, + { + "epoch": 0.09551336949281568, + "grad_norm": 1.1865695714950562, + "learning_rate": 4.888307667003813e-05, + "loss": 5.1791, + "step": 16060 + }, + { + "epoch": 0.09551931677609668, + "grad_norm": 1.2647303342819214, + "learning_rate": 4.8882938608547294e-05, + "loss": 5.1928, + "step": 16061 + }, + { + "epoch": 0.09552526405937768, + "grad_norm": 1.2161632776260376, + "learning_rate": 4.888280053871916e-05, + "loss": 5.1431, + "step": 16062 + }, + { + "epoch": 0.09553121134265867, + "grad_norm": 1.3904309272766113, + "learning_rate": 4.8882662460553784e-05, + "loss": 5.0658, + "step": 16063 + }, + { + "epoch": 0.09553715862593967, + "grad_norm": 1.4302258491516113, + "learning_rate": 4.888252437405123e-05, + "loss": 5.1838, + "step": 16064 + }, + { + "epoch": 0.09554310590922067, + "grad_norm": 1.4313236474990845, + "learning_rate": 4.888238627921152e-05, + "loss": 5.2108, + "step": 16065 + }, + { + "epoch": 0.09554905319250166, + "grad_norm": 1.485170602798462, + "learning_rate": 4.8882248176034726e-05, + "loss": 5.179, + "step": 16066 + }, + { + "epoch": 0.09555500047578266, + "grad_norm": 1.3742952346801758, + "learning_rate": 4.888211006452088e-05, + "loss": 5.0416, + "step": 16067 + }, + { + "epoch": 0.09556094775906367, + "grad_norm": 1.2600523233413696, + "learning_rate": 4.888197194467005e-05, + "loss": 5.0891, + "step": 16068 + }, + { + "epoch": 0.09556689504234465, + "grad_norm": 1.2905696630477905, + "learning_rate": 4.888183381648225e-05, + "loss": 5.1004, + "step": 16069 + }, + { + "epoch": 0.09557284232562566, + "grad_norm": 1.2373219728469849, + "learning_rate": 4.8881695679957565e-05, + "loss": 5.1549, + "step": 16070 + }, + { + "epoch": 0.09557878960890664, + "grad_norm": 1.43118155002594, + "learning_rate": 4.8881557535096014e-05, + "loss": 5.067, + "step": 16071 + }, + { + "epoch": 0.09558473689218765, + "grad_norm": 1.201025366783142, + "learning_rate": 4.888141938189767e-05, + "loss": 5.1304, + "step": 16072 + }, + { + "epoch": 0.09559068417546865, + "grad_norm": 1.3497222661972046, + "learning_rate": 4.888128122036256e-05, + "loss": 5.0802, + "step": 16073 + }, + { + "epoch": 0.09559663145874964, + "grad_norm": 1.3429580926895142, + "learning_rate": 4.888114305049074e-05, + "loss": 5.1033, + "step": 16074 + }, + { + "epoch": 0.09560257874203064, + "grad_norm": 1.212725281715393, + "learning_rate": 4.888100487228227e-05, + "loss": 5.0627, + "step": 16075 + }, + { + "epoch": 0.09560852602531164, + "grad_norm": 1.258507490158081, + "learning_rate": 4.8880866685737174e-05, + "loss": 5.1215, + "step": 16076 + }, + { + "epoch": 0.09561447330859263, + "grad_norm": 1.4401910305023193, + "learning_rate": 4.888072849085552e-05, + "loss": 4.9619, + "step": 16077 + }, + { + "epoch": 0.09562042059187363, + "grad_norm": 1.240682601928711, + "learning_rate": 4.888059028763735e-05, + "loss": 4.8384, + "step": 16078 + }, + { + "epoch": 0.09562636787515463, + "grad_norm": 1.5701509714126587, + "learning_rate": 4.888045207608272e-05, + "loss": 5.0756, + "step": 16079 + }, + { + "epoch": 0.09563231515843562, + "grad_norm": 2.0408403873443604, + "learning_rate": 4.888031385619166e-05, + "loss": 5.1615, + "step": 16080 + }, + { + "epoch": 0.09563826244171662, + "grad_norm": 1.8134169578552246, + "learning_rate": 4.8880175627964245e-05, + "loss": 5.2383, + "step": 16081 + }, + { + "epoch": 0.09564420972499763, + "grad_norm": 1.4934067726135254, + "learning_rate": 4.888003739140049e-05, + "loss": 5.1512, + "step": 16082 + }, + { + "epoch": 0.09565015700827861, + "grad_norm": 1.6359374523162842, + "learning_rate": 4.887989914650047e-05, + "loss": 5.1245, + "step": 16083 + }, + { + "epoch": 0.09565610429155962, + "grad_norm": 1.5446397066116333, + "learning_rate": 4.887976089326422e-05, + "loss": 4.9806, + "step": 16084 + }, + { + "epoch": 0.09566205157484062, + "grad_norm": 1.845180869102478, + "learning_rate": 4.8879622631691794e-05, + "loss": 5.0474, + "step": 16085 + }, + { + "epoch": 0.0956679988581216, + "grad_norm": 1.8755276203155518, + "learning_rate": 4.887948436178324e-05, + "loss": 5.0674, + "step": 16086 + }, + { + "epoch": 0.09567394614140261, + "grad_norm": 1.5596239566802979, + "learning_rate": 4.88793460835386e-05, + "loss": 5.0699, + "step": 16087 + }, + { + "epoch": 0.09567989342468361, + "grad_norm": 1.6092095375061035, + "learning_rate": 4.8879207796957935e-05, + "loss": 5.1184, + "step": 16088 + }, + { + "epoch": 0.0956858407079646, + "grad_norm": 1.6217916011810303, + "learning_rate": 4.887906950204127e-05, + "loss": 4.9607, + "step": 16089 + }, + { + "epoch": 0.0956917879912456, + "grad_norm": 1.5006567239761353, + "learning_rate": 4.8878931198788694e-05, + "loss": 4.7948, + "step": 16090 + }, + { + "epoch": 0.0956977352745266, + "grad_norm": 1.397647738456726, + "learning_rate": 4.887879288720021e-05, + "loss": 5.1067, + "step": 16091 + }, + { + "epoch": 0.09570368255780759, + "grad_norm": 1.5627835988998413, + "learning_rate": 4.8878654567275886e-05, + "loss": 4.9138, + "step": 16092 + }, + { + "epoch": 0.09570962984108859, + "grad_norm": 1.4590591192245483, + "learning_rate": 4.8878516239015784e-05, + "loss": 4.9132, + "step": 16093 + }, + { + "epoch": 0.0957155771243696, + "grad_norm": 1.347569465637207, + "learning_rate": 4.887837790241992e-05, + "loss": 4.9732, + "step": 16094 + }, + { + "epoch": 0.09572152440765058, + "grad_norm": 1.547169804573059, + "learning_rate": 4.887823955748838e-05, + "loss": 5.1336, + "step": 16095 + }, + { + "epoch": 0.09572747169093158, + "grad_norm": 1.3920515775680542, + "learning_rate": 4.887810120422118e-05, + "loss": 5.0738, + "step": 16096 + }, + { + "epoch": 0.09573341897421259, + "grad_norm": 1.4531773328781128, + "learning_rate": 4.8877962842618386e-05, + "loss": 5.0517, + "step": 16097 + }, + { + "epoch": 0.09573936625749357, + "grad_norm": 1.458679437637329, + "learning_rate": 4.887782447268004e-05, + "loss": 4.9291, + "step": 16098 + }, + { + "epoch": 0.09574531354077458, + "grad_norm": 1.6293518543243408, + "learning_rate": 4.8877686094406196e-05, + "loss": 4.7676, + "step": 16099 + }, + { + "epoch": 0.09575126082405556, + "grad_norm": 1.6756728887557983, + "learning_rate": 4.8877547707796895e-05, + "loss": 4.7426, + "step": 16100 + }, + { + "epoch": 0.09575720810733657, + "grad_norm": 1.7573354244232178, + "learning_rate": 4.8877409312852194e-05, + "loss": 4.6344, + "step": 16101 + }, + { + "epoch": 0.09576315539061757, + "grad_norm": 1.701581597328186, + "learning_rate": 4.8877270909572126e-05, + "loss": 4.8023, + "step": 16102 + }, + { + "epoch": 0.09576910267389856, + "grad_norm": 1.4811267852783203, + "learning_rate": 4.887713249795676e-05, + "loss": 4.9964, + "step": 16103 + }, + { + "epoch": 0.09577504995717956, + "grad_norm": 1.4324437379837036, + "learning_rate": 4.887699407800612e-05, + "loss": 4.9657, + "step": 16104 + }, + { + "epoch": 0.09578099724046056, + "grad_norm": 1.6630572080612183, + "learning_rate": 4.8876855649720285e-05, + "loss": 4.8689, + "step": 16105 + }, + { + "epoch": 0.09578694452374155, + "grad_norm": 1.8548660278320312, + "learning_rate": 4.887671721309928e-05, + "loss": 4.8775, + "step": 16106 + }, + { + "epoch": 0.09579289180702255, + "grad_norm": 1.5234023332595825, + "learning_rate": 4.887657876814316e-05, + "loss": 5.1495, + "step": 16107 + }, + { + "epoch": 0.09579883909030355, + "grad_norm": 1.5281673669815063, + "learning_rate": 4.8876440314851967e-05, + "loss": 4.8887, + "step": 16108 + }, + { + "epoch": 0.09580478637358454, + "grad_norm": 1.6189017295837402, + "learning_rate": 4.887630185322576e-05, + "loss": 4.7103, + "step": 16109 + }, + { + "epoch": 0.09581073365686554, + "grad_norm": 1.8149834871292114, + "learning_rate": 4.8876163383264584e-05, + "loss": 4.5674, + "step": 16110 + }, + { + "epoch": 0.09581668094014655, + "grad_norm": 1.6370511054992676, + "learning_rate": 4.887602490496848e-05, + "loss": 4.6307, + "step": 16111 + }, + { + "epoch": 0.09582262822342753, + "grad_norm": 1.603553056716919, + "learning_rate": 4.887588641833751e-05, + "loss": 4.597, + "step": 16112 + }, + { + "epoch": 0.09582857550670854, + "grad_norm": 1.6511812210083008, + "learning_rate": 4.887574792337171e-05, + "loss": 4.604, + "step": 16113 + }, + { + "epoch": 0.09583452278998954, + "grad_norm": 1.6924868822097778, + "learning_rate": 4.887560942007113e-05, + "loss": 4.6674, + "step": 16114 + }, + { + "epoch": 0.09584047007327053, + "grad_norm": 1.6445999145507812, + "learning_rate": 4.887547090843583e-05, + "loss": 4.492, + "step": 16115 + }, + { + "epoch": 0.09584641735655153, + "grad_norm": 2.282087564468384, + "learning_rate": 4.887533238846585e-05, + "loss": 5.7458, + "step": 16116 + }, + { + "epoch": 0.09585236463983253, + "grad_norm": 1.8790422677993774, + "learning_rate": 4.887519386016123e-05, + "loss": 5.6642, + "step": 16117 + }, + { + "epoch": 0.09585831192311352, + "grad_norm": 1.887954592704773, + "learning_rate": 4.887505532352203e-05, + "loss": 5.8485, + "step": 16118 + }, + { + "epoch": 0.09586425920639452, + "grad_norm": 1.8805441856384277, + "learning_rate": 4.88749167785483e-05, + "loss": 5.5941, + "step": 16119 + }, + { + "epoch": 0.09587020648967552, + "grad_norm": 2.141098976135254, + "learning_rate": 4.8874778225240076e-05, + "loss": 5.1748, + "step": 16120 + }, + { + "epoch": 0.09587615377295651, + "grad_norm": 1.560094952583313, + "learning_rate": 4.887463966359741e-05, + "loss": 5.625, + "step": 16121 + }, + { + "epoch": 0.09588210105623751, + "grad_norm": 1.6463109254837036, + "learning_rate": 4.887450109362036e-05, + "loss": 5.6568, + "step": 16122 + }, + { + "epoch": 0.09588804833951851, + "grad_norm": 1.5389329195022583, + "learning_rate": 4.887436251530898e-05, + "loss": 5.6461, + "step": 16123 + }, + { + "epoch": 0.0958939956227995, + "grad_norm": 1.4973753690719604, + "learning_rate": 4.8874223928663284e-05, + "loss": 5.3542, + "step": 16124 + }, + { + "epoch": 0.0958999429060805, + "grad_norm": 1.4039745330810547, + "learning_rate": 4.8874085333683364e-05, + "loss": 5.506, + "step": 16125 + }, + { + "epoch": 0.0959058901893615, + "grad_norm": 1.819114089012146, + "learning_rate": 4.8873946730369235e-05, + "loss": 5.2586, + "step": 16126 + }, + { + "epoch": 0.0959118374726425, + "grad_norm": 1.9034372568130493, + "learning_rate": 4.887380811872095e-05, + "loss": 5.1818, + "step": 16127 + }, + { + "epoch": 0.0959177847559235, + "grad_norm": 1.8390016555786133, + "learning_rate": 4.8873669498738584e-05, + "loss": 5.8263, + "step": 16128 + }, + { + "epoch": 0.09592373203920448, + "grad_norm": 1.780961275100708, + "learning_rate": 4.887353087042216e-05, + "loss": 5.801, + "step": 16129 + }, + { + "epoch": 0.09592967932248549, + "grad_norm": 1.8105396032333374, + "learning_rate": 4.887339223377173e-05, + "loss": 5.3426, + "step": 16130 + }, + { + "epoch": 0.09593562660576649, + "grad_norm": 1.9126670360565186, + "learning_rate": 4.887325358878735e-05, + "loss": 5.404, + "step": 16131 + }, + { + "epoch": 0.09594157388904748, + "grad_norm": 1.4767181873321533, + "learning_rate": 4.887311493546906e-05, + "loss": 5.5631, + "step": 16132 + }, + { + "epoch": 0.09594752117232848, + "grad_norm": 1.4779311418533325, + "learning_rate": 4.8872976273816904e-05, + "loss": 5.6407, + "step": 16133 + }, + { + "epoch": 0.09595346845560948, + "grad_norm": 1.9026421308517456, + "learning_rate": 4.8872837603830955e-05, + "loss": 5.4299, + "step": 16134 + }, + { + "epoch": 0.09595941573889047, + "grad_norm": 1.845184326171875, + "learning_rate": 4.887269892551123e-05, + "loss": 5.4873, + "step": 16135 + }, + { + "epoch": 0.09596536302217147, + "grad_norm": 2.49023175239563, + "learning_rate": 4.88725602388578e-05, + "loss": 4.1458, + "step": 16136 + }, + { + "epoch": 0.09597131030545247, + "grad_norm": 2.0831515789031982, + "learning_rate": 4.887242154387071e-05, + "loss": 5.0316, + "step": 16137 + }, + { + "epoch": 0.09597725758873346, + "grad_norm": 1.6316094398498535, + "learning_rate": 4.887228284055e-05, + "loss": 5.1289, + "step": 16138 + }, + { + "epoch": 0.09598320487201446, + "grad_norm": 2.025193214416504, + "learning_rate": 4.8872144128895724e-05, + "loss": 5.3065, + "step": 16139 + }, + { + "epoch": 0.09598915215529547, + "grad_norm": 2.077871322631836, + "learning_rate": 4.887200540890793e-05, + "loss": 5.1163, + "step": 16140 + }, + { + "epoch": 0.09599509943857645, + "grad_norm": 1.8450415134429932, + "learning_rate": 4.8871866680586666e-05, + "loss": 5.2638, + "step": 16141 + }, + { + "epoch": 0.09600104672185746, + "grad_norm": 1.676255464553833, + "learning_rate": 4.8871727943931974e-05, + "loss": 4.8191, + "step": 16142 + }, + { + "epoch": 0.09600699400513846, + "grad_norm": 1.6484187841415405, + "learning_rate": 4.8871589198943914e-05, + "loss": 5.3993, + "step": 16143 + }, + { + "epoch": 0.09601294128841945, + "grad_norm": 1.7061866521835327, + "learning_rate": 4.887145044562253e-05, + "loss": 5.2941, + "step": 16144 + }, + { + "epoch": 0.09601888857170045, + "grad_norm": 1.7628071308135986, + "learning_rate": 4.887131168396786e-05, + "loss": 5.2736, + "step": 16145 + }, + { + "epoch": 0.09602483585498145, + "grad_norm": 2.0107390880584717, + "learning_rate": 4.887117291397997e-05, + "loss": 5.1561, + "step": 16146 + }, + { + "epoch": 0.09603078313826244, + "grad_norm": 1.7889841794967651, + "learning_rate": 4.887103413565889e-05, + "loss": 6.0519, + "step": 16147 + }, + { + "epoch": 0.09603673042154344, + "grad_norm": 1.7982914447784424, + "learning_rate": 4.8870895349004686e-05, + "loss": 5.4913, + "step": 16148 + }, + { + "epoch": 0.09604267770482444, + "grad_norm": 1.8263020515441895, + "learning_rate": 4.88707565540174e-05, + "loss": 5.8516, + "step": 16149 + }, + { + "epoch": 0.09604862498810543, + "grad_norm": 1.642863392829895, + "learning_rate": 4.887061775069708e-05, + "loss": 5.5714, + "step": 16150 + }, + { + "epoch": 0.09605457227138643, + "grad_norm": 1.5696642398834229, + "learning_rate": 4.887047893904377e-05, + "loss": 5.4624, + "step": 16151 + }, + { + "epoch": 0.09606051955466743, + "grad_norm": 1.8895677328109741, + "learning_rate": 4.8870340119057536e-05, + "loss": 5.621, + "step": 16152 + }, + { + "epoch": 0.09606646683794842, + "grad_norm": 1.772875428199768, + "learning_rate": 4.8870201290738395e-05, + "loss": 5.5371, + "step": 16153 + }, + { + "epoch": 0.09607241412122942, + "grad_norm": 1.6763731241226196, + "learning_rate": 4.8870062454086415e-05, + "loss": 5.966, + "step": 16154 + }, + { + "epoch": 0.09607836140451043, + "grad_norm": 1.5911294221878052, + "learning_rate": 4.886992360910165e-05, + "loss": 5.3707, + "step": 16155 + }, + { + "epoch": 0.09608430868779141, + "grad_norm": 1.7060188055038452, + "learning_rate": 4.886978475578414e-05, + "loss": 5.5278, + "step": 16156 + }, + { + "epoch": 0.09609025597107242, + "grad_norm": 1.6456331014633179, + "learning_rate": 4.886964589413394e-05, + "loss": 5.5132, + "step": 16157 + }, + { + "epoch": 0.0960962032543534, + "grad_norm": 1.6736609935760498, + "learning_rate": 4.886950702415109e-05, + "loss": 5.245, + "step": 16158 + }, + { + "epoch": 0.0961021505376344, + "grad_norm": 1.5359262228012085, + "learning_rate": 4.886936814583564e-05, + "loss": 5.3893, + "step": 16159 + }, + { + "epoch": 0.09610809782091541, + "grad_norm": 1.5430463552474976, + "learning_rate": 4.886922925918763e-05, + "loss": 5.4257, + "step": 16160 + }, + { + "epoch": 0.0961140451041964, + "grad_norm": 1.940909743309021, + "learning_rate": 4.886909036420714e-05, + "loss": 5.0744, + "step": 16161 + }, + { + "epoch": 0.0961199923874774, + "grad_norm": 1.869372844696045, + "learning_rate": 4.886895146089418e-05, + "loss": 5.4901, + "step": 16162 + }, + { + "epoch": 0.0961259396707584, + "grad_norm": 1.794975996017456, + "learning_rate": 4.886881254924882e-05, + "loss": 5.5174, + "step": 16163 + }, + { + "epoch": 0.09613188695403939, + "grad_norm": 1.6314165592193604, + "learning_rate": 4.8868673629271105e-05, + "loss": 5.5883, + "step": 16164 + }, + { + "epoch": 0.09613783423732039, + "grad_norm": 1.7309901714324951, + "learning_rate": 4.886853470096108e-05, + "loss": 5.3881, + "step": 16165 + }, + { + "epoch": 0.09614378152060139, + "grad_norm": 1.7356623411178589, + "learning_rate": 4.88683957643188e-05, + "loss": 5.3578, + "step": 16166 + }, + { + "epoch": 0.09614972880388238, + "grad_norm": 2.302006244659424, + "learning_rate": 4.886825681934431e-05, + "loss": 5.7811, + "step": 16167 + }, + { + "epoch": 0.09615567608716338, + "grad_norm": 2.282381534576416, + "learning_rate": 4.8868117866037656e-05, + "loss": 5.8847, + "step": 16168 + }, + { + "epoch": 0.09616162337044439, + "grad_norm": 1.9158310890197754, + "learning_rate": 4.886797890439889e-05, + "loss": 5.7663, + "step": 16169 + }, + { + "epoch": 0.09616757065372537, + "grad_norm": 1.6491609811782837, + "learning_rate": 4.886783993442806e-05, + "loss": 5.9077, + "step": 16170 + }, + { + "epoch": 0.09617351793700638, + "grad_norm": 1.739547848701477, + "learning_rate": 4.886770095612521e-05, + "loss": 5.5126, + "step": 16171 + }, + { + "epoch": 0.09617946522028738, + "grad_norm": 1.534516453742981, + "learning_rate": 4.88675619694904e-05, + "loss": 5.372, + "step": 16172 + }, + { + "epoch": 0.09618541250356837, + "grad_norm": 1.8228504657745361, + "learning_rate": 4.8867422974523657e-05, + "loss": 5.4673, + "step": 16173 + }, + { + "epoch": 0.09619135978684937, + "grad_norm": 1.8887168169021606, + "learning_rate": 4.886728397122505e-05, + "loss": 5.5699, + "step": 16174 + }, + { + "epoch": 0.09619730707013037, + "grad_norm": 1.6889835596084595, + "learning_rate": 4.8867144959594626e-05, + "loss": 5.6244, + "step": 16175 + }, + { + "epoch": 0.09620325435341136, + "grad_norm": 1.7387192249298096, + "learning_rate": 4.8867005939632424e-05, + "loss": 5.7735, + "step": 16176 + }, + { + "epoch": 0.09620920163669236, + "grad_norm": 1.9036939144134521, + "learning_rate": 4.8866866911338494e-05, + "loss": 5.8873, + "step": 16177 + }, + { + "epoch": 0.09621514891997336, + "grad_norm": 1.6884106397628784, + "learning_rate": 4.886672787471289e-05, + "loss": 5.1366, + "step": 16178 + }, + { + "epoch": 0.09622109620325435, + "grad_norm": 1.5132830142974854, + "learning_rate": 4.886658882975566e-05, + "loss": 5.2964, + "step": 16179 + }, + { + "epoch": 0.09622704348653535, + "grad_norm": 1.7039000988006592, + "learning_rate": 4.886644977646685e-05, + "loss": 5.2287, + "step": 16180 + }, + { + "epoch": 0.09623299076981635, + "grad_norm": 1.6894882917404175, + "learning_rate": 4.886631071484651e-05, + "loss": 5.3205, + "step": 16181 + }, + { + "epoch": 0.09623893805309734, + "grad_norm": 2.303013324737549, + "learning_rate": 4.8866171644894684e-05, + "loss": 5.2701, + "step": 16182 + }, + { + "epoch": 0.09624488533637834, + "grad_norm": 1.6158491373062134, + "learning_rate": 4.886603256661142e-05, + "loss": 5.522, + "step": 16183 + }, + { + "epoch": 0.09625083261965935, + "grad_norm": 1.5886715650558472, + "learning_rate": 4.8865893479996776e-05, + "loss": 5.7498, + "step": 16184 + }, + { + "epoch": 0.09625677990294033, + "grad_norm": 2.007570505142212, + "learning_rate": 4.88657543850508e-05, + "loss": 5.3746, + "step": 16185 + }, + { + "epoch": 0.09626272718622134, + "grad_norm": 2.8191232681274414, + "learning_rate": 4.886561528177352e-05, + "loss": 4.9794, + "step": 16186 + }, + { + "epoch": 0.09626867446950232, + "grad_norm": 2.5193052291870117, + "learning_rate": 4.886547617016501e-05, + "loss": 4.982, + "step": 16187 + }, + { + "epoch": 0.09627462175278333, + "grad_norm": 1.8875666856765747, + "learning_rate": 4.8865337050225316e-05, + "loss": 5.1801, + "step": 16188 + }, + { + "epoch": 0.09628056903606433, + "grad_norm": 1.441834568977356, + "learning_rate": 4.8865197921954475e-05, + "loss": 5.2723, + "step": 16189 + }, + { + "epoch": 0.09628651631934532, + "grad_norm": 2.0356223583221436, + "learning_rate": 4.8865058785352536e-05, + "loss": 5.4185, + "step": 16190 + }, + { + "epoch": 0.09629246360262632, + "grad_norm": 2.03885817527771, + "learning_rate": 4.8864919640419554e-05, + "loss": 5.1636, + "step": 16191 + }, + { + "epoch": 0.09629841088590732, + "grad_norm": 2.118439197540283, + "learning_rate": 4.8864780487155576e-05, + "loss": 5.4012, + "step": 16192 + }, + { + "epoch": 0.09630435816918831, + "grad_norm": 1.8266710042953491, + "learning_rate": 4.886464132556064e-05, + "loss": 4.9442, + "step": 16193 + }, + { + "epoch": 0.09631030545246931, + "grad_norm": 1.646341323852539, + "learning_rate": 4.886450215563482e-05, + "loss": 5.1368, + "step": 16194 + }, + { + "epoch": 0.09631625273575031, + "grad_norm": 1.8833272457122803, + "learning_rate": 4.886436297737814e-05, + "loss": 5.279, + "step": 16195 + }, + { + "epoch": 0.0963222000190313, + "grad_norm": 1.9521067142486572, + "learning_rate": 4.8864223790790666e-05, + "loss": 5.6571, + "step": 16196 + }, + { + "epoch": 0.0963281473023123, + "grad_norm": 1.8902586698532104, + "learning_rate": 4.8864084595872427e-05, + "loss": 5.632, + "step": 16197 + }, + { + "epoch": 0.0963340945855933, + "grad_norm": 1.7994412183761597, + "learning_rate": 4.886394539262349e-05, + "loss": 5.574, + "step": 16198 + }, + { + "epoch": 0.0963400418688743, + "grad_norm": 1.751780390739441, + "learning_rate": 4.8863806181043895e-05, + "loss": 5.691, + "step": 16199 + }, + { + "epoch": 0.0963459891521553, + "grad_norm": 2.30880069732666, + "learning_rate": 4.8863666961133684e-05, + "loss": 5.7477, + "step": 16200 + }, + { + "epoch": 0.0963519364354363, + "grad_norm": 2.351921319961548, + "learning_rate": 4.8863527732892924e-05, + "loss": 5.8162, + "step": 16201 + }, + { + "epoch": 0.09635788371871729, + "grad_norm": 1.6124454736709595, + "learning_rate": 4.8863388496321636e-05, + "loss": 5.8105, + "step": 16202 + }, + { + "epoch": 0.09636383100199829, + "grad_norm": 1.4927148818969727, + "learning_rate": 4.886324925141991e-05, + "loss": 5.8246, + "step": 16203 + }, + { + "epoch": 0.09636977828527929, + "grad_norm": 1.71438729763031, + "learning_rate": 4.886310999818775e-05, + "loss": 5.798, + "step": 16204 + }, + { + "epoch": 0.09637572556856028, + "grad_norm": 1.9519150257110596, + "learning_rate": 4.886297073662523e-05, + "loss": 5.2815, + "step": 16205 + }, + { + "epoch": 0.09638167285184128, + "grad_norm": 1.7694860696792603, + "learning_rate": 4.88628314667324e-05, + "loss": 5.7564, + "step": 16206 + }, + { + "epoch": 0.09638762013512228, + "grad_norm": 1.658252477645874, + "learning_rate": 4.88626921885093e-05, + "loss": 5.6586, + "step": 16207 + }, + { + "epoch": 0.09639356741840327, + "grad_norm": 2.310295581817627, + "learning_rate": 4.886255290195598e-05, + "loss": 4.9317, + "step": 16208 + }, + { + "epoch": 0.09639951470168427, + "grad_norm": 2.239964246749878, + "learning_rate": 4.886241360707249e-05, + "loss": 5.3794, + "step": 16209 + }, + { + "epoch": 0.09640546198496527, + "grad_norm": 2.470205307006836, + "learning_rate": 4.886227430385887e-05, + "loss": 5.1755, + "step": 16210 + }, + { + "epoch": 0.09641140926824626, + "grad_norm": 2.208298683166504, + "learning_rate": 4.8862134992315185e-05, + "loss": 5.1296, + "step": 16211 + }, + { + "epoch": 0.09641735655152726, + "grad_norm": 2.112288475036621, + "learning_rate": 4.886199567244147e-05, + "loss": 5.0888, + "step": 16212 + }, + { + "epoch": 0.09642330383480827, + "grad_norm": 2.3725969791412354, + "learning_rate": 4.886185634423778e-05, + "loss": 5.0256, + "step": 16213 + }, + { + "epoch": 0.09642925111808925, + "grad_norm": 2.3314402103424072, + "learning_rate": 4.8861717007704164e-05, + "loss": 5.012, + "step": 16214 + }, + { + "epoch": 0.09643519840137026, + "grad_norm": 2.1015000343322754, + "learning_rate": 4.8861577662840676e-05, + "loss": 4.7244, + "step": 16215 + }, + { + "epoch": 0.09644114568465124, + "grad_norm": 2.335218906402588, + "learning_rate": 4.8861438309647344e-05, + "loss": 4.8442, + "step": 16216 + }, + { + "epoch": 0.09644709296793225, + "grad_norm": 2.249216079711914, + "learning_rate": 4.886129894812424e-05, + "loss": 5.2573, + "step": 16217 + }, + { + "epoch": 0.09645304025121325, + "grad_norm": 2.228283166885376, + "learning_rate": 4.8861159578271406e-05, + "loss": 4.7297, + "step": 16218 + }, + { + "epoch": 0.09645898753449424, + "grad_norm": 1.7820645570755005, + "learning_rate": 4.886102020008888e-05, + "loss": 4.8427, + "step": 16219 + }, + { + "epoch": 0.09646493481777524, + "grad_norm": 2.1911120414733887, + "learning_rate": 4.886088081357672e-05, + "loss": 4.9677, + "step": 16220 + }, + { + "epoch": 0.09647088210105624, + "grad_norm": 2.453758716583252, + "learning_rate": 4.8860741418734976e-05, + "loss": 4.9039, + "step": 16221 + }, + { + "epoch": 0.09647682938433723, + "grad_norm": 2.488105058670044, + "learning_rate": 4.886060201556369e-05, + "loss": 5.0211, + "step": 16222 + }, + { + "epoch": 0.09648277666761823, + "grad_norm": 2.2040843963623047, + "learning_rate": 4.8860462604062915e-05, + "loss": 5.1067, + "step": 16223 + }, + { + "epoch": 0.09648872395089923, + "grad_norm": 2.0934717655181885, + "learning_rate": 4.8860323184232695e-05, + "loss": 4.9648, + "step": 16224 + }, + { + "epoch": 0.09649467123418022, + "grad_norm": 2.3775415420532227, + "learning_rate": 4.886018375607309e-05, + "loss": 4.9459, + "step": 16225 + }, + { + "epoch": 0.09650061851746122, + "grad_norm": 2.4042131900787354, + "learning_rate": 4.886004431958414e-05, + "loss": 4.7845, + "step": 16226 + }, + { + "epoch": 0.09650656580074223, + "grad_norm": 2.34424090385437, + "learning_rate": 4.885990487476589e-05, + "loss": 5.012, + "step": 16227 + }, + { + "epoch": 0.09651251308402321, + "grad_norm": 2.2711172103881836, + "learning_rate": 4.8859765421618395e-05, + "loss": 4.906, + "step": 16228 + }, + { + "epoch": 0.09651846036730422, + "grad_norm": 2.4021360874176025, + "learning_rate": 4.8859625960141706e-05, + "loss": 4.916, + "step": 16229 + }, + { + "epoch": 0.09652440765058522, + "grad_norm": 1.9205279350280762, + "learning_rate": 4.885948649033587e-05, + "loss": 5.0469, + "step": 16230 + }, + { + "epoch": 0.0965303549338662, + "grad_norm": 2.226362466812134, + "learning_rate": 4.885934701220093e-05, + "loss": 4.9439, + "step": 16231 + }, + { + "epoch": 0.09653630221714721, + "grad_norm": 2.288909673690796, + "learning_rate": 4.885920752573694e-05, + "loss": 4.8271, + "step": 16232 + }, + { + "epoch": 0.09654224950042821, + "grad_norm": 2.132235050201416, + "learning_rate": 4.8859068030943943e-05, + "loss": 5.1891, + "step": 16233 + }, + { + "epoch": 0.0965481967837092, + "grad_norm": 2.080244541168213, + "learning_rate": 4.8858928527822e-05, + "loss": 4.9055, + "step": 16234 + }, + { + "epoch": 0.0965541440669902, + "grad_norm": 2.324211359024048, + "learning_rate": 4.8858789016371145e-05, + "loss": 5.2614, + "step": 16235 + }, + { + "epoch": 0.0965600913502712, + "grad_norm": 1.827802062034607, + "learning_rate": 4.8858649496591437e-05, + "loss": 4.8874, + "step": 16236 + }, + { + "epoch": 0.09656603863355219, + "grad_norm": 1.8670811653137207, + "learning_rate": 4.885850996848292e-05, + "loss": 5.2402, + "step": 16237 + }, + { + "epoch": 0.09657198591683319, + "grad_norm": 2.046444892883301, + "learning_rate": 4.885837043204564e-05, + "loss": 4.7029, + "step": 16238 + }, + { + "epoch": 0.0965779332001142, + "grad_norm": 2.007894992828369, + "learning_rate": 4.885823088727965e-05, + "loss": 5.6706, + "step": 16239 + }, + { + "epoch": 0.09658388048339518, + "grad_norm": 2.24422025680542, + "learning_rate": 4.8858091334185005e-05, + "loss": 5.9666, + "step": 16240 + }, + { + "epoch": 0.09658982776667618, + "grad_norm": 1.7045838832855225, + "learning_rate": 4.885795177276174e-05, + "loss": 5.3021, + "step": 16241 + }, + { + "epoch": 0.09659577504995719, + "grad_norm": 1.7880860567092896, + "learning_rate": 4.885781220300991e-05, + "loss": 4.9151, + "step": 16242 + }, + { + "epoch": 0.09660172233323817, + "grad_norm": 2.3720862865448, + "learning_rate": 4.885767262492957e-05, + "loss": 5.0868, + "step": 16243 + }, + { + "epoch": 0.09660766961651918, + "grad_norm": 1.8655211925506592, + "learning_rate": 4.8857533038520756e-05, + "loss": 5.5072, + "step": 16244 + }, + { + "epoch": 0.09661361689980018, + "grad_norm": 1.8259748220443726, + "learning_rate": 4.885739344378353e-05, + "loss": 5.5992, + "step": 16245 + }, + { + "epoch": 0.09661956418308117, + "grad_norm": 1.667145013809204, + "learning_rate": 4.885725384071793e-05, + "loss": 5.2069, + "step": 16246 + }, + { + "epoch": 0.09662551146636217, + "grad_norm": 1.8004356622695923, + "learning_rate": 4.8857114229324015e-05, + "loss": 5.232, + "step": 16247 + }, + { + "epoch": 0.09663145874964316, + "grad_norm": 1.8246740102767944, + "learning_rate": 4.8856974609601825e-05, + "loss": 5.185, + "step": 16248 + }, + { + "epoch": 0.09663740603292416, + "grad_norm": 1.7453134059906006, + "learning_rate": 4.885683498155141e-05, + "loss": 4.9118, + "step": 16249 + }, + { + "epoch": 0.09664335331620516, + "grad_norm": 1.76914381980896, + "learning_rate": 4.885669534517282e-05, + "loss": 4.6679, + "step": 16250 + }, + { + "epoch": 0.09664930059948615, + "grad_norm": 2.0119516849517822, + "learning_rate": 4.88565557004661e-05, + "loss": 4.6495, + "step": 16251 + }, + { + "epoch": 0.09665524788276715, + "grad_norm": 1.7628357410430908, + "learning_rate": 4.885641604743131e-05, + "loss": 4.7581, + "step": 16252 + }, + { + "epoch": 0.09666119516604815, + "grad_norm": 1.6456751823425293, + "learning_rate": 4.8856276386068486e-05, + "loss": 4.9539, + "step": 16253 + }, + { + "epoch": 0.09666714244932914, + "grad_norm": 1.8474618196487427, + "learning_rate": 4.885613671637769e-05, + "loss": 5.9248, + "step": 16254 + }, + { + "epoch": 0.09667308973261014, + "grad_norm": 2.1205222606658936, + "learning_rate": 4.885599703835896e-05, + "loss": 5.2783, + "step": 16255 + }, + { + "epoch": 0.09667903701589114, + "grad_norm": 1.7559815645217896, + "learning_rate": 4.885585735201235e-05, + "loss": 5.6276, + "step": 16256 + }, + { + "epoch": 0.09668498429917213, + "grad_norm": 1.5784190893173218, + "learning_rate": 4.885571765733789e-05, + "loss": 5.5933, + "step": 16257 + }, + { + "epoch": 0.09669093158245314, + "grad_norm": 1.7377841472625732, + "learning_rate": 4.885557795433567e-05, + "loss": 5.1234, + "step": 16258 + }, + { + "epoch": 0.09669687886573414, + "grad_norm": 1.6517775058746338, + "learning_rate": 4.88554382430057e-05, + "loss": 5.6291, + "step": 16259 + }, + { + "epoch": 0.09670282614901513, + "grad_norm": 1.8474104404449463, + "learning_rate": 4.885529852334805e-05, + "loss": 6.0357, + "step": 16260 + }, + { + "epoch": 0.09670877343229613, + "grad_norm": 1.6555463075637817, + "learning_rate": 4.8855158795362756e-05, + "loss": 5.9828, + "step": 16261 + }, + { + "epoch": 0.09671472071557713, + "grad_norm": 1.6003193855285645, + "learning_rate": 4.8855019059049876e-05, + "loss": 5.9705, + "step": 16262 + }, + { + "epoch": 0.09672066799885812, + "grad_norm": 1.4992772340774536, + "learning_rate": 4.885487931440945e-05, + "loss": 5.8604, + "step": 16263 + }, + { + "epoch": 0.09672661528213912, + "grad_norm": 1.8667478561401367, + "learning_rate": 4.885473956144154e-05, + "loss": 6.1141, + "step": 16264 + }, + { + "epoch": 0.09673256256542012, + "grad_norm": 1.7311911582946777, + "learning_rate": 4.8854599800146186e-05, + "loss": 5.4142, + "step": 16265 + }, + { + "epoch": 0.09673850984870111, + "grad_norm": 2.0519683361053467, + "learning_rate": 4.885446003052343e-05, + "loss": 5.4321, + "step": 16266 + }, + { + "epoch": 0.09674445713198211, + "grad_norm": 2.02132248878479, + "learning_rate": 4.8854320252573325e-05, + "loss": 5.4957, + "step": 16267 + }, + { + "epoch": 0.09675040441526311, + "grad_norm": 1.7282330989837646, + "learning_rate": 4.885418046629594e-05, + "loss": 5.4486, + "step": 16268 + }, + { + "epoch": 0.0967563516985441, + "grad_norm": 1.909114122390747, + "learning_rate": 4.885404067169129e-05, + "loss": 5.4782, + "step": 16269 + }, + { + "epoch": 0.0967622989818251, + "grad_norm": 1.897161602973938, + "learning_rate": 4.885390086875945e-05, + "loss": 5.8678, + "step": 16270 + }, + { + "epoch": 0.0967682462651061, + "grad_norm": 2.0866503715515137, + "learning_rate": 4.885376105750046e-05, + "loss": 5.0869, + "step": 16271 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 1.6914600133895874, + "learning_rate": 4.885362123791437e-05, + "loss": 5.3385, + "step": 16272 + }, + { + "epoch": 0.0967801408316681, + "grad_norm": 1.4390329122543335, + "learning_rate": 4.885348141000122e-05, + "loss": 5.8069, + "step": 16273 + }, + { + "epoch": 0.0967860881149491, + "grad_norm": 1.5077629089355469, + "learning_rate": 4.885334157376107e-05, + "loss": 5.6679, + "step": 16274 + }, + { + "epoch": 0.09679203539823009, + "grad_norm": 1.4550343751907349, + "learning_rate": 4.885320172919397e-05, + "loss": 5.7548, + "step": 16275 + }, + { + "epoch": 0.09679798268151109, + "grad_norm": 2.068070650100708, + "learning_rate": 4.8853061876299956e-05, + "loss": 4.9706, + "step": 16276 + }, + { + "epoch": 0.09680392996479208, + "grad_norm": 1.3487659692764282, + "learning_rate": 4.885292201507909e-05, + "loss": 5.6918, + "step": 16277 + }, + { + "epoch": 0.09680987724807308, + "grad_norm": 1.4306180477142334, + "learning_rate": 4.885278214553141e-05, + "loss": 5.6196, + "step": 16278 + }, + { + "epoch": 0.09681582453135408, + "grad_norm": 1.6410231590270996, + "learning_rate": 4.885264226765698e-05, + "loss": 5.0523, + "step": 16279 + }, + { + "epoch": 0.09682177181463507, + "grad_norm": 2.4701485633850098, + "learning_rate": 4.8852502381455825e-05, + "loss": 4.6255, + "step": 16280 + }, + { + "epoch": 0.09682771909791607, + "grad_norm": 2.5248069763183594, + "learning_rate": 4.885236248692802e-05, + "loss": 4.5055, + "step": 16281 + }, + { + "epoch": 0.09683366638119707, + "grad_norm": 2.1913154125213623, + "learning_rate": 4.8852222584073595e-05, + "loss": 4.748, + "step": 16282 + }, + { + "epoch": 0.09683961366447806, + "grad_norm": 1.951987385749817, + "learning_rate": 4.8852082672892606e-05, + "loss": 5.3871, + "step": 16283 + }, + { + "epoch": 0.09684556094775906, + "grad_norm": 2.007020950317383, + "learning_rate": 4.885194275338511e-05, + "loss": 6.1075, + "step": 16284 + }, + { + "epoch": 0.09685150823104006, + "grad_norm": 1.9821717739105225, + "learning_rate": 4.885180282555113e-05, + "loss": 5.1719, + "step": 16285 + }, + { + "epoch": 0.09685745551432105, + "grad_norm": 2.339564800262451, + "learning_rate": 4.885166288939074e-05, + "loss": 4.9518, + "step": 16286 + }, + { + "epoch": 0.09686340279760206, + "grad_norm": 2.1785504817962646, + "learning_rate": 4.8851522944903984e-05, + "loss": 4.9656, + "step": 16287 + }, + { + "epoch": 0.09686935008088306, + "grad_norm": 1.7723946571350098, + "learning_rate": 4.885138299209091e-05, + "loss": 6.1572, + "step": 16288 + }, + { + "epoch": 0.09687529736416405, + "grad_norm": 1.702458381652832, + "learning_rate": 4.885124303095156e-05, + "loss": 5.9616, + "step": 16289 + }, + { + "epoch": 0.09688124464744505, + "grad_norm": 2.279836893081665, + "learning_rate": 4.885110306148599e-05, + "loss": 5.4305, + "step": 16290 + }, + { + "epoch": 0.09688719193072605, + "grad_norm": 1.8569501638412476, + "learning_rate": 4.8850963083694244e-05, + "loss": 5.8019, + "step": 16291 + }, + { + "epoch": 0.09689313921400704, + "grad_norm": 1.8126327991485596, + "learning_rate": 4.885082309757637e-05, + "loss": 5.7076, + "step": 16292 + }, + { + "epoch": 0.09689908649728804, + "grad_norm": 1.7170337438583374, + "learning_rate": 4.8850683103132424e-05, + "loss": 5.9862, + "step": 16293 + }, + { + "epoch": 0.09690503378056904, + "grad_norm": 1.7631909847259521, + "learning_rate": 4.8850543100362454e-05, + "loss": 5.917, + "step": 16294 + }, + { + "epoch": 0.09691098106385003, + "grad_norm": 1.9938957691192627, + "learning_rate": 4.88504030892665e-05, + "loss": 5.5773, + "step": 16295 + }, + { + "epoch": 0.09691692834713103, + "grad_norm": 1.9459222555160522, + "learning_rate": 4.8850263069844623e-05, + "loss": 5.2847, + "step": 16296 + }, + { + "epoch": 0.09692287563041203, + "grad_norm": 1.8420277833938599, + "learning_rate": 4.8850123042096865e-05, + "loss": 5.5691, + "step": 16297 + }, + { + "epoch": 0.09692882291369302, + "grad_norm": 2.2592809200286865, + "learning_rate": 4.8849983006023267e-05, + "loss": 5.4666, + "step": 16298 + }, + { + "epoch": 0.09693477019697402, + "grad_norm": 2.080939292907715, + "learning_rate": 4.884984296162389e-05, + "loss": 5.243, + "step": 16299 + }, + { + "epoch": 0.09694071748025503, + "grad_norm": 1.648836374282837, + "learning_rate": 4.884970290889879e-05, + "loss": 5.8331, + "step": 16300 + }, + { + "epoch": 0.09694666476353601, + "grad_norm": 1.668505311012268, + "learning_rate": 4.884956284784799e-05, + "loss": 5.7523, + "step": 16301 + }, + { + "epoch": 0.09695261204681702, + "grad_norm": 1.5473688840866089, + "learning_rate": 4.8849422778471567e-05, + "loss": 5.5379, + "step": 16302 + }, + { + "epoch": 0.09695855933009802, + "grad_norm": 1.9258644580841064, + "learning_rate": 4.8849282700769545e-05, + "loss": 5.6405, + "step": 16303 + }, + { + "epoch": 0.096964506613379, + "grad_norm": 1.5651416778564453, + "learning_rate": 4.884914261474199e-05, + "loss": 6.1487, + "step": 16304 + }, + { + "epoch": 0.09697045389666001, + "grad_norm": 1.5289270877838135, + "learning_rate": 4.884900252038894e-05, + "loss": 5.6653, + "step": 16305 + }, + { + "epoch": 0.096976401179941, + "grad_norm": 1.8394510746002197, + "learning_rate": 4.8848862417710464e-05, + "loss": 4.9243, + "step": 16306 + }, + { + "epoch": 0.096982348463222, + "grad_norm": 1.7624824047088623, + "learning_rate": 4.8848722306706584e-05, + "loss": 5.7712, + "step": 16307 + }, + { + "epoch": 0.096988295746503, + "grad_norm": 1.7294182777404785, + "learning_rate": 4.8848582187377365e-05, + "loss": 5.5197, + "step": 16308 + }, + { + "epoch": 0.09699424302978399, + "grad_norm": 1.69902765750885, + "learning_rate": 4.8848442059722856e-05, + "loss": 5.6485, + "step": 16309 + }, + { + "epoch": 0.09700019031306499, + "grad_norm": 1.7867447137832642, + "learning_rate": 4.88483019237431e-05, + "loss": 5.4422, + "step": 16310 + }, + { + "epoch": 0.09700613759634599, + "grad_norm": 1.6588819026947021, + "learning_rate": 4.884816177943814e-05, + "loss": 5.4282, + "step": 16311 + }, + { + "epoch": 0.09701208487962698, + "grad_norm": 1.504918098449707, + "learning_rate": 4.884802162680804e-05, + "loss": 5.508, + "step": 16312 + }, + { + "epoch": 0.09701803216290798, + "grad_norm": 1.5852895975112915, + "learning_rate": 4.8847881465852846e-05, + "loss": 5.5567, + "step": 16313 + }, + { + "epoch": 0.09702397944618898, + "grad_norm": 1.5719797611236572, + "learning_rate": 4.88477412965726e-05, + "loss": 5.6284, + "step": 16314 + }, + { + "epoch": 0.09702992672946997, + "grad_norm": 1.4208050966262817, + "learning_rate": 4.884760111896735e-05, + "loss": 5.5653, + "step": 16315 + }, + { + "epoch": 0.09703587401275098, + "grad_norm": 1.567555546760559, + "learning_rate": 4.8847460933037156e-05, + "loss": 5.5144, + "step": 16316 + }, + { + "epoch": 0.09704182129603198, + "grad_norm": 1.9179699420928955, + "learning_rate": 4.884732073878205e-05, + "loss": 4.7947, + "step": 16317 + }, + { + "epoch": 0.09704776857931297, + "grad_norm": 2.5346062183380127, + "learning_rate": 4.88471805362021e-05, + "loss": 3.8315, + "step": 16318 + }, + { + "epoch": 0.09705371586259397, + "grad_norm": 2.585686683654785, + "learning_rate": 4.884704032529734e-05, + "loss": 3.7288, + "step": 16319 + }, + { + "epoch": 0.09705966314587497, + "grad_norm": 2.133723020553589, + "learning_rate": 4.8846900106067825e-05, + "loss": 3.6369, + "step": 16320 + }, + { + "epoch": 0.09706561042915596, + "grad_norm": 2.4039080142974854, + "learning_rate": 4.884675987851361e-05, + "loss": 3.9068, + "step": 16321 + }, + { + "epoch": 0.09707155771243696, + "grad_norm": 2.643489360809326, + "learning_rate": 4.884661964263473e-05, + "loss": 3.7793, + "step": 16322 + }, + { + "epoch": 0.09707750499571796, + "grad_norm": 2.485727071762085, + "learning_rate": 4.8846479398431244e-05, + "loss": 4.9789, + "step": 16323 + }, + { + "epoch": 0.09708345227899895, + "grad_norm": 2.8592441082000732, + "learning_rate": 4.8846339145903194e-05, + "loss": 4.0196, + "step": 16324 + }, + { + "epoch": 0.09708939956227995, + "grad_norm": 2.470813035964966, + "learning_rate": 4.884619888505064e-05, + "loss": 5.2308, + "step": 16325 + }, + { + "epoch": 0.09709534684556095, + "grad_norm": 2.3255081176757812, + "learning_rate": 4.884605861587362e-05, + "loss": 5.3535, + "step": 16326 + }, + { + "epoch": 0.09710129412884194, + "grad_norm": 2.1462676525115967, + "learning_rate": 4.8845918338372195e-05, + "loss": 5.2611, + "step": 16327 + }, + { + "epoch": 0.09710724141212294, + "grad_norm": 1.8838989734649658, + "learning_rate": 4.88457780525464e-05, + "loss": 5.8104, + "step": 16328 + }, + { + "epoch": 0.09711318869540395, + "grad_norm": 2.137746572494507, + "learning_rate": 4.884563775839629e-05, + "loss": 5.4702, + "step": 16329 + }, + { + "epoch": 0.09711913597868493, + "grad_norm": 1.8934431076049805, + "learning_rate": 4.884549745592192e-05, + "loss": 4.9703, + "step": 16330 + }, + { + "epoch": 0.09712508326196594, + "grad_norm": 2.409020185470581, + "learning_rate": 4.884535714512333e-05, + "loss": 5.6793, + "step": 16331 + }, + { + "epoch": 0.09713103054524694, + "grad_norm": 2.039520263671875, + "learning_rate": 4.884521682600056e-05, + "loss": 5.7809, + "step": 16332 + }, + { + "epoch": 0.09713697782852793, + "grad_norm": 3.1211516857147217, + "learning_rate": 4.884507649855369e-05, + "loss": 5.6195, + "step": 16333 + }, + { + "epoch": 0.09714292511180893, + "grad_norm": 1.9474505186080933, + "learning_rate": 4.884493616278274e-05, + "loss": 5.3064, + "step": 16334 + }, + { + "epoch": 0.09714887239508992, + "grad_norm": 1.7586307525634766, + "learning_rate": 4.884479581868777e-05, + "loss": 4.9531, + "step": 16335 + }, + { + "epoch": 0.09715481967837092, + "grad_norm": 1.6352753639221191, + "learning_rate": 4.884465546626883e-05, + "loss": 5.304, + "step": 16336 + }, + { + "epoch": 0.09716076696165192, + "grad_norm": 1.681362271308899, + "learning_rate": 4.884451510552597e-05, + "loss": 5.9167, + "step": 16337 + }, + { + "epoch": 0.09716671424493291, + "grad_norm": 1.7970985174179077, + "learning_rate": 4.8844374736459225e-05, + "loss": 6.122, + "step": 16338 + }, + { + "epoch": 0.09717266152821391, + "grad_norm": 1.5312799215316772, + "learning_rate": 4.8844234359068666e-05, + "loss": 4.903, + "step": 16339 + }, + { + "epoch": 0.09717860881149491, + "grad_norm": 1.7024787664413452, + "learning_rate": 4.884409397335432e-05, + "loss": 5.3306, + "step": 16340 + }, + { + "epoch": 0.0971845560947759, + "grad_norm": 3.000169515609741, + "learning_rate": 4.884395357931626e-05, + "loss": 4.9682, + "step": 16341 + }, + { + "epoch": 0.0971905033780569, + "grad_norm": 2.910048484802246, + "learning_rate": 4.884381317695452e-05, + "loss": 5.2385, + "step": 16342 + }, + { + "epoch": 0.0971964506613379, + "grad_norm": 2.1094155311584473, + "learning_rate": 4.8843672766269147e-05, + "loss": 5.1025, + "step": 16343 + }, + { + "epoch": 0.09720239794461889, + "grad_norm": 1.7918319702148438, + "learning_rate": 4.884353234726019e-05, + "loss": 5.2822, + "step": 16344 + }, + { + "epoch": 0.0972083452278999, + "grad_norm": 1.574461579322815, + "learning_rate": 4.884339191992771e-05, + "loss": 5.6254, + "step": 16345 + }, + { + "epoch": 0.0972142925111809, + "grad_norm": 2.0780746936798096, + "learning_rate": 4.884325148427175e-05, + "loss": 5.0641, + "step": 16346 + }, + { + "epoch": 0.09722023979446189, + "grad_norm": 2.30399227142334, + "learning_rate": 4.884311104029235e-05, + "loss": 4.9591, + "step": 16347 + }, + { + "epoch": 0.09722618707774289, + "grad_norm": 2.087993621826172, + "learning_rate": 4.884297058798957e-05, + "loss": 5.0514, + "step": 16348 + }, + { + "epoch": 0.09723213436102389, + "grad_norm": 2.0179786682128906, + "learning_rate": 4.884283012736345e-05, + "loss": 4.9632, + "step": 16349 + }, + { + "epoch": 0.09723808164430488, + "grad_norm": 2.4394171237945557, + "learning_rate": 4.8842689658414054e-05, + "loss": 4.6517, + "step": 16350 + }, + { + "epoch": 0.09724402892758588, + "grad_norm": 2.6895275115966797, + "learning_rate": 4.884254918114142e-05, + "loss": 4.726, + "step": 16351 + }, + { + "epoch": 0.09724997621086688, + "grad_norm": 1.5181125402450562, + "learning_rate": 4.884240869554559e-05, + "loss": 5.679, + "step": 16352 + }, + { + "epoch": 0.09725592349414787, + "grad_norm": 1.758475422859192, + "learning_rate": 4.884226820162662e-05, + "loss": 5.2323, + "step": 16353 + }, + { + "epoch": 0.09726187077742887, + "grad_norm": 2.0166938304901123, + "learning_rate": 4.884212769938457e-05, + "loss": 4.6912, + "step": 16354 + }, + { + "epoch": 0.09726781806070987, + "grad_norm": 2.1366612911224365, + "learning_rate": 4.8841987188819475e-05, + "loss": 4.4761, + "step": 16355 + }, + { + "epoch": 0.09727376534399086, + "grad_norm": 1.9595547914505005, + "learning_rate": 4.884184666993139e-05, + "loss": 4.5343, + "step": 16356 + }, + { + "epoch": 0.09727971262727186, + "grad_norm": 1.896043300628662, + "learning_rate": 4.884170614272037e-05, + "loss": 4.465, + "step": 16357 + }, + { + "epoch": 0.09728565991055287, + "grad_norm": 2.062506675720215, + "learning_rate": 4.884156560718645e-05, + "loss": 4.301, + "step": 16358 + }, + { + "epoch": 0.09729160719383385, + "grad_norm": 2.0816612243652344, + "learning_rate": 4.884142506332968e-05, + "loss": 4.5414, + "step": 16359 + }, + { + "epoch": 0.09729755447711486, + "grad_norm": 2.0095489025115967, + "learning_rate": 4.884128451115012e-05, + "loss": 4.3779, + "step": 16360 + }, + { + "epoch": 0.09730350176039586, + "grad_norm": 2.0766615867614746, + "learning_rate": 4.884114395064781e-05, + "loss": 4.3999, + "step": 16361 + }, + { + "epoch": 0.09730944904367685, + "grad_norm": 2.0266785621643066, + "learning_rate": 4.8841003381822805e-05, + "loss": 4.5122, + "step": 16362 + }, + { + "epoch": 0.09731539632695785, + "grad_norm": 1.9631284475326538, + "learning_rate": 4.884086280467516e-05, + "loss": 4.3061, + "step": 16363 + }, + { + "epoch": 0.09732134361023884, + "grad_norm": 2.2965009212493896, + "learning_rate": 4.8840722219204905e-05, + "loss": 4.3387, + "step": 16364 + }, + { + "epoch": 0.09732729089351984, + "grad_norm": 2.036365509033203, + "learning_rate": 4.8840581625412105e-05, + "loss": 4.3242, + "step": 16365 + }, + { + "epoch": 0.09733323817680084, + "grad_norm": 2.186131477355957, + "learning_rate": 4.88404410232968e-05, + "loss": 4.2517, + "step": 16366 + }, + { + "epoch": 0.09733918546008183, + "grad_norm": 2.2000489234924316, + "learning_rate": 4.884030041285905e-05, + "loss": 4.274, + "step": 16367 + }, + { + "epoch": 0.09734513274336283, + "grad_norm": 3.2708849906921387, + "learning_rate": 4.884015979409889e-05, + "loss": 4.9575, + "step": 16368 + }, + { + "epoch": 0.09735108002664383, + "grad_norm": 1.7634176015853882, + "learning_rate": 4.884001916701639e-05, + "loss": 4.63, + "step": 16369 + }, + { + "epoch": 0.09735702730992482, + "grad_norm": 2.297611713409424, + "learning_rate": 4.883987853161157e-05, + "loss": 4.3009, + "step": 16370 + }, + { + "epoch": 0.09736297459320582, + "grad_norm": 2.1840944290161133, + "learning_rate": 4.8839737887884507e-05, + "loss": 4.2232, + "step": 16371 + }, + { + "epoch": 0.09736892187648682, + "grad_norm": 2.1925270557403564, + "learning_rate": 4.8839597235835234e-05, + "loss": 4.1824, + "step": 16372 + }, + { + "epoch": 0.09737486915976781, + "grad_norm": 2.175720453262329, + "learning_rate": 4.88394565754638e-05, + "loss": 4.2619, + "step": 16373 + }, + { + "epoch": 0.09738081644304881, + "grad_norm": 2.282804489135742, + "learning_rate": 4.883931590677026e-05, + "loss": 4.2207, + "step": 16374 + }, + { + "epoch": 0.09738676372632982, + "grad_norm": 1.674668788909912, + "learning_rate": 4.883917522975466e-05, + "loss": 5.3627, + "step": 16375 + }, + { + "epoch": 0.0973927110096108, + "grad_norm": 1.6538902521133423, + "learning_rate": 4.883903454441705e-05, + "loss": 5.302, + "step": 16376 + }, + { + "epoch": 0.09739865829289181, + "grad_norm": 1.4267115592956543, + "learning_rate": 4.8838893850757485e-05, + "loss": 5.2545, + "step": 16377 + }, + { + "epoch": 0.09740460557617281, + "grad_norm": 1.3086082935333252, + "learning_rate": 4.8838753148776e-05, + "loss": 5.1538, + "step": 16378 + }, + { + "epoch": 0.0974105528594538, + "grad_norm": 1.4384034872055054, + "learning_rate": 4.883861243847266e-05, + "loss": 5.3925, + "step": 16379 + }, + { + "epoch": 0.0974165001427348, + "grad_norm": 1.4971977472305298, + "learning_rate": 4.88384717198475e-05, + "loss": 5.3966, + "step": 16380 + }, + { + "epoch": 0.0974224474260158, + "grad_norm": 1.517468810081482, + "learning_rate": 4.8838330992900584e-05, + "loss": 5.1097, + "step": 16381 + }, + { + "epoch": 0.09742839470929679, + "grad_norm": 1.388852596282959, + "learning_rate": 4.8838190257631944e-05, + "loss": 5.1066, + "step": 16382 + }, + { + "epoch": 0.09743434199257779, + "grad_norm": 1.2972341775894165, + "learning_rate": 4.8838049514041646e-05, + "loss": 5.0383, + "step": 16383 + }, + { + "epoch": 0.0974402892758588, + "grad_norm": 1.338291049003601, + "learning_rate": 4.883790876212972e-05, + "loss": 5.1339, + "step": 16384 + }, + { + "epoch": 0.09744623655913978, + "grad_norm": 1.4399670362472534, + "learning_rate": 4.883776800189624e-05, + "loss": 5.0542, + "step": 16385 + }, + { + "epoch": 0.09745218384242078, + "grad_norm": 1.5091251134872437, + "learning_rate": 4.8837627233341235e-05, + "loss": 4.9303, + "step": 16386 + }, + { + "epoch": 0.09745813112570179, + "grad_norm": 1.4728022813796997, + "learning_rate": 4.8837486456464764e-05, + "loss": 5.0902, + "step": 16387 + }, + { + "epoch": 0.09746407840898277, + "grad_norm": 1.454509973526001, + "learning_rate": 4.8837345671266865e-05, + "loss": 4.9227, + "step": 16388 + }, + { + "epoch": 0.09747002569226378, + "grad_norm": 1.431118130683899, + "learning_rate": 4.88372048777476e-05, + "loss": 5.0128, + "step": 16389 + }, + { + "epoch": 0.09747597297554478, + "grad_norm": 1.434967041015625, + "learning_rate": 4.8837064075907015e-05, + "loss": 5.1793, + "step": 16390 + }, + { + "epoch": 0.09748192025882577, + "grad_norm": 1.5077275037765503, + "learning_rate": 4.883692326574515e-05, + "loss": 5.1573, + "step": 16391 + }, + { + "epoch": 0.09748786754210677, + "grad_norm": 1.44413161277771, + "learning_rate": 4.883678244726208e-05, + "loss": 5.2297, + "step": 16392 + }, + { + "epoch": 0.09749381482538776, + "grad_norm": 1.606898546218872, + "learning_rate": 4.883664162045781e-05, + "loss": 4.9409, + "step": 16393 + }, + { + "epoch": 0.09749976210866876, + "grad_norm": 1.649034857749939, + "learning_rate": 4.883650078533243e-05, + "loss": 5.1519, + "step": 16394 + }, + { + "epoch": 0.09750570939194976, + "grad_norm": 1.5309730768203735, + "learning_rate": 4.883635994188597e-05, + "loss": 4.9568, + "step": 16395 + }, + { + "epoch": 0.09751165667523075, + "grad_norm": 1.8033829927444458, + "learning_rate": 4.883621909011848e-05, + "loss": 4.7442, + "step": 16396 + }, + { + "epoch": 0.09751760395851175, + "grad_norm": 1.653501272201538, + "learning_rate": 4.8836078230030016e-05, + "loss": 4.5672, + "step": 16397 + }, + { + "epoch": 0.09752355124179275, + "grad_norm": 1.686077356338501, + "learning_rate": 4.8835937361620624e-05, + "loss": 4.5819, + "step": 16398 + }, + { + "epoch": 0.09752949852507374, + "grad_norm": 1.5233088731765747, + "learning_rate": 4.883579648489035e-05, + "loss": 4.5191, + "step": 16399 + }, + { + "epoch": 0.09753544580835474, + "grad_norm": 1.6472907066345215, + "learning_rate": 4.883565559983925e-05, + "loss": 4.6418, + "step": 16400 + }, + { + "epoch": 0.09754139309163574, + "grad_norm": 1.817649483680725, + "learning_rate": 4.8835514706467364e-05, + "loss": 4.806, + "step": 16401 + }, + { + "epoch": 0.09754734037491673, + "grad_norm": 1.8404059410095215, + "learning_rate": 4.8835373804774754e-05, + "loss": 4.8169, + "step": 16402 + }, + { + "epoch": 0.09755328765819773, + "grad_norm": 1.5510175228118896, + "learning_rate": 4.883523289476145e-05, + "loss": 4.7987, + "step": 16403 + }, + { + "epoch": 0.09755923494147874, + "grad_norm": 1.4557734727859497, + "learning_rate": 4.8835091976427514e-05, + "loss": 4.7322, + "step": 16404 + }, + { + "epoch": 0.09756518222475973, + "grad_norm": 1.528123140335083, + "learning_rate": 4.8834951049773006e-05, + "loss": 4.7376, + "step": 16405 + }, + { + "epoch": 0.09757112950804073, + "grad_norm": 1.6215547323226929, + "learning_rate": 4.8834810114797944e-05, + "loss": 4.7679, + "step": 16406 + }, + { + "epoch": 0.09757707679132173, + "grad_norm": 1.4554566144943237, + "learning_rate": 4.883466917150241e-05, + "loss": 4.6452, + "step": 16407 + }, + { + "epoch": 0.09758302407460272, + "grad_norm": 1.5100599527359009, + "learning_rate": 4.883452821988644e-05, + "loss": 4.6957, + "step": 16408 + }, + { + "epoch": 0.09758897135788372, + "grad_norm": 1.7057833671569824, + "learning_rate": 4.8834387259950074e-05, + "loss": 4.7888, + "step": 16409 + }, + { + "epoch": 0.09759491864116472, + "grad_norm": 1.4016892910003662, + "learning_rate": 4.883424629169337e-05, + "loss": 4.769, + "step": 16410 + }, + { + "epoch": 0.09760086592444571, + "grad_norm": 1.5257891416549683, + "learning_rate": 4.883410531511638e-05, + "loss": 4.7443, + "step": 16411 + }, + { + "epoch": 0.09760681320772671, + "grad_norm": 1.3904502391815186, + "learning_rate": 4.883396433021916e-05, + "loss": 4.786, + "step": 16412 + }, + { + "epoch": 0.09761276049100771, + "grad_norm": 1.6081106662750244, + "learning_rate": 4.883382333700174e-05, + "loss": 4.5321, + "step": 16413 + }, + { + "epoch": 0.0976187077742887, + "grad_norm": 1.4291402101516724, + "learning_rate": 4.883368233546417e-05, + "loss": 4.5898, + "step": 16414 + }, + { + "epoch": 0.0976246550575697, + "grad_norm": 1.5700920820236206, + "learning_rate": 4.8833541325606524e-05, + "loss": 5.2177, + "step": 16415 + }, + { + "epoch": 0.0976306023408507, + "grad_norm": 1.5503007173538208, + "learning_rate": 4.8833400307428825e-05, + "loss": 5.3911, + "step": 16416 + }, + { + "epoch": 0.0976365496241317, + "grad_norm": 1.5890953540802002, + "learning_rate": 4.8833259280931135e-05, + "loss": 4.9426, + "step": 16417 + }, + { + "epoch": 0.0976424969074127, + "grad_norm": 1.5032304525375366, + "learning_rate": 4.8833118246113494e-05, + "loss": 4.6124, + "step": 16418 + }, + { + "epoch": 0.0976484441906937, + "grad_norm": 1.5300242900848389, + "learning_rate": 4.8832977202975964e-05, + "loss": 4.9323, + "step": 16419 + }, + { + "epoch": 0.09765439147397469, + "grad_norm": 1.7094424962997437, + "learning_rate": 4.883283615151859e-05, + "loss": 5.3205, + "step": 16420 + }, + { + "epoch": 0.09766033875725569, + "grad_norm": 1.8231004476547241, + "learning_rate": 4.883269509174142e-05, + "loss": 5.0414, + "step": 16421 + }, + { + "epoch": 0.09766628604053668, + "grad_norm": 1.7779520750045776, + "learning_rate": 4.8832554023644496e-05, + "loss": 4.9106, + "step": 16422 + }, + { + "epoch": 0.09767223332381768, + "grad_norm": 1.5394103527069092, + "learning_rate": 4.8832412947227875e-05, + "loss": 4.998, + "step": 16423 + }, + { + "epoch": 0.09767818060709868, + "grad_norm": 1.3814078569412231, + "learning_rate": 4.883227186249161e-05, + "loss": 4.9109, + "step": 16424 + }, + { + "epoch": 0.09768412789037967, + "grad_norm": 1.291040301322937, + "learning_rate": 4.8832130769435735e-05, + "loss": 5.3617, + "step": 16425 + }, + { + "epoch": 0.09769007517366067, + "grad_norm": 1.561249017715454, + "learning_rate": 4.883198966806032e-05, + "loss": 5.3041, + "step": 16426 + }, + { + "epoch": 0.09769602245694167, + "grad_norm": 1.7411010265350342, + "learning_rate": 4.883184855836539e-05, + "loss": 5.0816, + "step": 16427 + }, + { + "epoch": 0.09770196974022266, + "grad_norm": 1.6507155895233154, + "learning_rate": 4.8831707440351024e-05, + "loss": 5.1089, + "step": 16428 + }, + { + "epoch": 0.09770791702350366, + "grad_norm": 1.5242364406585693, + "learning_rate": 4.8831566314017254e-05, + "loss": 4.9718, + "step": 16429 + }, + { + "epoch": 0.09771386430678466, + "grad_norm": 2.3768868446350098, + "learning_rate": 4.883142517936412e-05, + "loss": 4.9333, + "step": 16430 + }, + { + "epoch": 0.09771981159006565, + "grad_norm": 1.2830429077148438, + "learning_rate": 4.8831284036391684e-05, + "loss": 4.9238, + "step": 16431 + }, + { + "epoch": 0.09772575887334665, + "grad_norm": 1.5065499544143677, + "learning_rate": 4.883114288509999e-05, + "loss": 5.0151, + "step": 16432 + }, + { + "epoch": 0.09773170615662766, + "grad_norm": 1.5989798307418823, + "learning_rate": 4.88310017254891e-05, + "loss": 5.0081, + "step": 16433 + }, + { + "epoch": 0.09773765343990864, + "grad_norm": 1.391644835472107, + "learning_rate": 4.883086055755905e-05, + "loss": 4.8942, + "step": 16434 + }, + { + "epoch": 0.09774360072318965, + "grad_norm": 1.4952952861785889, + "learning_rate": 4.883071938130989e-05, + "loss": 5.0018, + "step": 16435 + }, + { + "epoch": 0.09774954800647065, + "grad_norm": 1.522814393043518, + "learning_rate": 4.883057819674168e-05, + "loss": 5.2591, + "step": 16436 + }, + { + "epoch": 0.09775549528975164, + "grad_norm": 1.3879649639129639, + "learning_rate": 4.8830437003854454e-05, + "loss": 4.9136, + "step": 16437 + }, + { + "epoch": 0.09776144257303264, + "grad_norm": 1.3485056161880493, + "learning_rate": 4.883029580264827e-05, + "loss": 5.5159, + "step": 16438 + }, + { + "epoch": 0.09776738985631364, + "grad_norm": 1.475131869316101, + "learning_rate": 4.883015459312317e-05, + "loss": 5.4397, + "step": 16439 + }, + { + "epoch": 0.09777333713959463, + "grad_norm": 1.2736895084381104, + "learning_rate": 4.8830013375279215e-05, + "loss": 5.2867, + "step": 16440 + }, + { + "epoch": 0.09777928442287563, + "grad_norm": 1.456312656402588, + "learning_rate": 4.882987214911645e-05, + "loss": 5.3351, + "step": 16441 + }, + { + "epoch": 0.09778523170615663, + "grad_norm": 1.5312397480010986, + "learning_rate": 4.882973091463492e-05, + "loss": 5.3233, + "step": 16442 + }, + { + "epoch": 0.09779117898943762, + "grad_norm": 1.5735961198806763, + "learning_rate": 4.882958967183468e-05, + "loss": 4.9878, + "step": 16443 + }, + { + "epoch": 0.09779712627271862, + "grad_norm": 1.337172508239746, + "learning_rate": 4.882944842071577e-05, + "loss": 5.121, + "step": 16444 + }, + { + "epoch": 0.09780307355599963, + "grad_norm": 1.47593355178833, + "learning_rate": 4.882930716127826e-05, + "loss": 5.4733, + "step": 16445 + }, + { + "epoch": 0.09780902083928061, + "grad_norm": 1.4311164617538452, + "learning_rate": 4.882916589352217e-05, + "loss": 5.2215, + "step": 16446 + }, + { + "epoch": 0.09781496812256162, + "grad_norm": 1.3628556728363037, + "learning_rate": 4.882902461744757e-05, + "loss": 5.3611, + "step": 16447 + }, + { + "epoch": 0.09782091540584262, + "grad_norm": 1.5621687173843384, + "learning_rate": 4.882888333305451e-05, + "loss": 5.4407, + "step": 16448 + }, + { + "epoch": 0.0978268626891236, + "grad_norm": 1.570478081703186, + "learning_rate": 4.8828742040343024e-05, + "loss": 5.533, + "step": 16449 + }, + { + "epoch": 0.09783280997240461, + "grad_norm": 1.3725816011428833, + "learning_rate": 4.8828600739313174e-05, + "loss": 5.1467, + "step": 16450 + }, + { + "epoch": 0.0978387572556856, + "grad_norm": 1.4899497032165527, + "learning_rate": 4.8828459429965e-05, + "loss": 5.233, + "step": 16451 + }, + { + "epoch": 0.0978447045389666, + "grad_norm": 1.380609154701233, + "learning_rate": 4.882831811229857e-05, + "loss": 5.1484, + "step": 16452 + }, + { + "epoch": 0.0978506518222476, + "grad_norm": 1.2167932987213135, + "learning_rate": 4.882817678631391e-05, + "loss": 5.1687, + "step": 16453 + }, + { + "epoch": 0.09785659910552859, + "grad_norm": 1.5250643491744995, + "learning_rate": 4.882803545201108e-05, + "loss": 5.2395, + "step": 16454 + }, + { + "epoch": 0.09786254638880959, + "grad_norm": 1.4288511276245117, + "learning_rate": 4.882789410939013e-05, + "loss": 5.0532, + "step": 16455 + }, + { + "epoch": 0.09786849367209059, + "grad_norm": 1.6325379610061646, + "learning_rate": 4.8827752758451105e-05, + "loss": 5.2077, + "step": 16456 + }, + { + "epoch": 0.09787444095537158, + "grad_norm": 1.4227756261825562, + "learning_rate": 4.882761139919406e-05, + "loss": 5.0431, + "step": 16457 + }, + { + "epoch": 0.09788038823865258, + "grad_norm": 1.355039358139038, + "learning_rate": 4.8827470031619046e-05, + "loss": 4.9062, + "step": 16458 + }, + { + "epoch": 0.09788633552193358, + "grad_norm": 1.5071823596954346, + "learning_rate": 4.8827328655726113e-05, + "loss": 5.2632, + "step": 16459 + }, + { + "epoch": 0.09789228280521457, + "grad_norm": 1.411828637123108, + "learning_rate": 4.88271872715153e-05, + "loss": 5.343, + "step": 16460 + }, + { + "epoch": 0.09789823008849557, + "grad_norm": 1.419164776802063, + "learning_rate": 4.882704587898666e-05, + "loss": 5.1643, + "step": 16461 + }, + { + "epoch": 0.09790417737177658, + "grad_norm": 1.4997645616531372, + "learning_rate": 4.882690447814024e-05, + "loss": 5.1701, + "step": 16462 + }, + { + "epoch": 0.09791012465505756, + "grad_norm": 1.4251139163970947, + "learning_rate": 4.88267630689761e-05, + "loss": 5.0228, + "step": 16463 + }, + { + "epoch": 0.09791607193833857, + "grad_norm": 1.289102554321289, + "learning_rate": 4.882662165149429e-05, + "loss": 5.1934, + "step": 16464 + }, + { + "epoch": 0.09792201922161957, + "grad_norm": 1.1589713096618652, + "learning_rate": 4.882648022569484e-05, + "loss": 5.3388, + "step": 16465 + }, + { + "epoch": 0.09792796650490056, + "grad_norm": 1.1682082414627075, + "learning_rate": 4.8826338791577816e-05, + "loss": 5.2062, + "step": 16466 + }, + { + "epoch": 0.09793391378818156, + "grad_norm": 1.2263107299804688, + "learning_rate": 4.882619734914326e-05, + "loss": 5.414, + "step": 16467 + }, + { + "epoch": 0.09793986107146256, + "grad_norm": 1.2873631715774536, + "learning_rate": 4.882605589839123e-05, + "loss": 5.4286, + "step": 16468 + }, + { + "epoch": 0.09794580835474355, + "grad_norm": 1.2950979471206665, + "learning_rate": 4.882591443932177e-05, + "loss": 5.1603, + "step": 16469 + }, + { + "epoch": 0.09795175563802455, + "grad_norm": 1.5623066425323486, + "learning_rate": 4.882577297193493e-05, + "loss": 5.0778, + "step": 16470 + }, + { + "epoch": 0.09795770292130555, + "grad_norm": 1.5446339845657349, + "learning_rate": 4.882563149623076e-05, + "loss": 5.1451, + "step": 16471 + }, + { + "epoch": 0.09796365020458654, + "grad_norm": 1.599387526512146, + "learning_rate": 4.882549001220931e-05, + "loss": 5.4596, + "step": 16472 + }, + { + "epoch": 0.09796959748786754, + "grad_norm": 1.325596809387207, + "learning_rate": 4.882534851987062e-05, + "loss": 5.4639, + "step": 16473 + }, + { + "epoch": 0.09797554477114855, + "grad_norm": 1.3077852725982666, + "learning_rate": 4.8825207019214746e-05, + "loss": 5.3654, + "step": 16474 + }, + { + "epoch": 0.09798149205442953, + "grad_norm": 1.5500328540802002, + "learning_rate": 4.882506551024174e-05, + "loss": 4.946, + "step": 16475 + }, + { + "epoch": 0.09798743933771054, + "grad_norm": 1.6101415157318115, + "learning_rate": 4.8824923992951656e-05, + "loss": 4.9618, + "step": 16476 + }, + { + "epoch": 0.09799338662099154, + "grad_norm": 1.542837381362915, + "learning_rate": 4.882478246734453e-05, + "loss": 4.9959, + "step": 16477 + }, + { + "epoch": 0.09799933390427253, + "grad_norm": 1.5618165731430054, + "learning_rate": 4.8824640933420424e-05, + "loss": 5.1221, + "step": 16478 + }, + { + "epoch": 0.09800528118755353, + "grad_norm": 1.4425160884857178, + "learning_rate": 4.882449939117938e-05, + "loss": 5.1689, + "step": 16479 + }, + { + "epoch": 0.09801122847083452, + "grad_norm": 1.3621004819869995, + "learning_rate": 4.8824357840621445e-05, + "loss": 4.9975, + "step": 16480 + }, + { + "epoch": 0.09801717575411552, + "grad_norm": 1.5944523811340332, + "learning_rate": 4.882421628174668e-05, + "loss": 5.0296, + "step": 16481 + }, + { + "epoch": 0.09802312303739652, + "grad_norm": 1.391321063041687, + "learning_rate": 4.8824074714555125e-05, + "loss": 5.0139, + "step": 16482 + }, + { + "epoch": 0.09802907032067751, + "grad_norm": 1.2085964679718018, + "learning_rate": 4.882393313904683e-05, + "loss": 5.1125, + "step": 16483 + }, + { + "epoch": 0.09803501760395851, + "grad_norm": 1.391383409500122, + "learning_rate": 4.882379155522185e-05, + "loss": 5.2999, + "step": 16484 + }, + { + "epoch": 0.09804096488723951, + "grad_norm": 1.3748564720153809, + "learning_rate": 4.882364996308023e-05, + "loss": 5.3096, + "step": 16485 + }, + { + "epoch": 0.0980469121705205, + "grad_norm": 1.825728416442871, + "learning_rate": 4.8823508362622014e-05, + "loss": 5.3318, + "step": 16486 + }, + { + "epoch": 0.0980528594538015, + "grad_norm": 1.6402180194854736, + "learning_rate": 4.882336675384726e-05, + "loss": 5.155, + "step": 16487 + }, + { + "epoch": 0.0980588067370825, + "grad_norm": 1.343284249305725, + "learning_rate": 4.882322513675601e-05, + "loss": 4.9341, + "step": 16488 + }, + { + "epoch": 0.09806475402036349, + "grad_norm": 1.3958711624145508, + "learning_rate": 4.882308351134833e-05, + "loss": 4.9595, + "step": 16489 + }, + { + "epoch": 0.0980707013036445, + "grad_norm": 1.572996735572815, + "learning_rate": 4.882294187762425e-05, + "loss": 4.9666, + "step": 16490 + }, + { + "epoch": 0.0980766485869255, + "grad_norm": 1.6167391538619995, + "learning_rate": 4.882280023558383e-05, + "loss": 4.7387, + "step": 16491 + }, + { + "epoch": 0.09808259587020648, + "grad_norm": 2.474092483520508, + "learning_rate": 4.882265858522711e-05, + "loss": 5.1476, + "step": 16492 + }, + { + "epoch": 0.09808854315348749, + "grad_norm": 1.5375875234603882, + "learning_rate": 4.8822516926554155e-05, + "loss": 4.5832, + "step": 16493 + }, + { + "epoch": 0.09809449043676849, + "grad_norm": 1.6802133321762085, + "learning_rate": 4.8822375259565e-05, + "loss": 4.615, + "step": 16494 + }, + { + "epoch": 0.09810043772004948, + "grad_norm": 1.6709486246109009, + "learning_rate": 4.8822233584259703e-05, + "loss": 4.6586, + "step": 16495 + }, + { + "epoch": 0.09810638500333048, + "grad_norm": 1.5207875967025757, + "learning_rate": 4.882209190063831e-05, + "loss": 4.6748, + "step": 16496 + }, + { + "epoch": 0.09811233228661148, + "grad_norm": 1.4980802536010742, + "learning_rate": 4.882195020870087e-05, + "loss": 4.5326, + "step": 16497 + }, + { + "epoch": 0.09811827956989247, + "grad_norm": 1.473092794418335, + "learning_rate": 4.882180850844743e-05, + "loss": 4.6126, + "step": 16498 + }, + { + "epoch": 0.09812422685317347, + "grad_norm": 1.521147608757019, + "learning_rate": 4.8821666799878055e-05, + "loss": 4.6269, + "step": 16499 + }, + { + "epoch": 0.09813017413645447, + "grad_norm": 1.7371230125427246, + "learning_rate": 4.882152508299277e-05, + "loss": 4.6847, + "step": 16500 + }, + { + "epoch": 0.09813612141973546, + "grad_norm": 1.7222683429718018, + "learning_rate": 4.8821383357791636e-05, + "loss": 5.3943, + "step": 16501 + }, + { + "epoch": 0.09814206870301646, + "grad_norm": 1.523373007774353, + "learning_rate": 4.8821241624274705e-05, + "loss": 5.2822, + "step": 16502 + }, + { + "epoch": 0.09814801598629747, + "grad_norm": 1.365224838256836, + "learning_rate": 4.882109988244203e-05, + "loss": 5.1923, + "step": 16503 + }, + { + "epoch": 0.09815396326957845, + "grad_norm": 1.503907322883606, + "learning_rate": 4.882095813229365e-05, + "loss": 5.128, + "step": 16504 + }, + { + "epoch": 0.09815991055285946, + "grad_norm": 1.5996166467666626, + "learning_rate": 4.8820816373829625e-05, + "loss": 4.9296, + "step": 16505 + }, + { + "epoch": 0.09816585783614046, + "grad_norm": 1.373089075088501, + "learning_rate": 4.8820674607049994e-05, + "loss": 5.0614, + "step": 16506 + }, + { + "epoch": 0.09817180511942145, + "grad_norm": 1.3730735778808594, + "learning_rate": 4.882053283195481e-05, + "loss": 5.0374, + "step": 16507 + }, + { + "epoch": 0.09817775240270245, + "grad_norm": 1.2357912063598633, + "learning_rate": 4.882039104854413e-05, + "loss": 5.1513, + "step": 16508 + }, + { + "epoch": 0.09818369968598344, + "grad_norm": 1.402327299118042, + "learning_rate": 4.8820249256817995e-05, + "loss": 5.7344, + "step": 16509 + }, + { + "epoch": 0.09818964696926444, + "grad_norm": 1.3152369260787964, + "learning_rate": 4.882010745677645e-05, + "loss": 5.6755, + "step": 16510 + }, + { + "epoch": 0.09819559425254544, + "grad_norm": 1.409428358078003, + "learning_rate": 4.8819965648419565e-05, + "loss": 5.3562, + "step": 16511 + }, + { + "epoch": 0.09820154153582643, + "grad_norm": 1.3278082609176636, + "learning_rate": 4.881982383174737e-05, + "loss": 5.2401, + "step": 16512 + }, + { + "epoch": 0.09820748881910743, + "grad_norm": 1.287716269493103, + "learning_rate": 4.881968200675991e-05, + "loss": 4.9961, + "step": 16513 + }, + { + "epoch": 0.09821343610238843, + "grad_norm": 1.3444676399230957, + "learning_rate": 4.881954017345727e-05, + "loss": 5.5592, + "step": 16514 + }, + { + "epoch": 0.09821938338566942, + "grad_norm": 1.4815365076065063, + "learning_rate": 4.881939833183945e-05, + "loss": 5.5342, + "step": 16515 + }, + { + "epoch": 0.09822533066895042, + "grad_norm": 1.210050344467163, + "learning_rate": 4.8819256481906536e-05, + "loss": 5.5375, + "step": 16516 + }, + { + "epoch": 0.09823127795223142, + "grad_norm": 2.041801691055298, + "learning_rate": 4.881911462365857e-05, + "loss": 4.601, + "step": 16517 + }, + { + "epoch": 0.09823722523551241, + "grad_norm": 2.196315050125122, + "learning_rate": 4.881897275709558e-05, + "loss": 4.2376, + "step": 16518 + }, + { + "epoch": 0.09824317251879341, + "grad_norm": 2.1649539470672607, + "learning_rate": 4.881883088221765e-05, + "loss": 4.4159, + "step": 16519 + }, + { + "epoch": 0.09824911980207442, + "grad_norm": 2.02476167678833, + "learning_rate": 4.881868899902481e-05, + "loss": 4.4091, + "step": 16520 + }, + { + "epoch": 0.0982550670853554, + "grad_norm": 1.9262346029281616, + "learning_rate": 4.88185471075171e-05, + "loss": 4.4326, + "step": 16521 + }, + { + "epoch": 0.0982610143686364, + "grad_norm": 1.8461369276046753, + "learning_rate": 4.881840520769459e-05, + "loss": 4.1563, + "step": 16522 + }, + { + "epoch": 0.09826696165191741, + "grad_norm": 1.8261640071868896, + "learning_rate": 4.881826329955732e-05, + "loss": 4.3518, + "step": 16523 + }, + { + "epoch": 0.0982729089351984, + "grad_norm": 2.1533737182617188, + "learning_rate": 4.881812138310534e-05, + "loss": 4.292, + "step": 16524 + }, + { + "epoch": 0.0982788562184794, + "grad_norm": 2.11578369140625, + "learning_rate": 4.8817979458338705e-05, + "loss": 4.5411, + "step": 16525 + }, + { + "epoch": 0.0982848035017604, + "grad_norm": 1.8681827783584595, + "learning_rate": 4.881783752525745e-05, + "loss": 5.7264, + "step": 16526 + }, + { + "epoch": 0.09829075078504139, + "grad_norm": 1.98794424533844, + "learning_rate": 4.881769558386163e-05, + "loss": 5.4694, + "step": 16527 + }, + { + "epoch": 0.09829669806832239, + "grad_norm": 2.6389517784118652, + "learning_rate": 4.881755363415131e-05, + "loss": 5.0086, + "step": 16528 + }, + { + "epoch": 0.0983026453516034, + "grad_norm": 2.2565221786499023, + "learning_rate": 4.881741167612653e-05, + "loss": 4.9219, + "step": 16529 + }, + { + "epoch": 0.09830859263488438, + "grad_norm": 1.8296940326690674, + "learning_rate": 4.881726970978733e-05, + "loss": 4.9185, + "step": 16530 + }, + { + "epoch": 0.09831453991816538, + "grad_norm": 2.031334638595581, + "learning_rate": 4.8817127735133774e-05, + "loss": 4.8589, + "step": 16531 + }, + { + "epoch": 0.09832048720144639, + "grad_norm": 1.5883747339248657, + "learning_rate": 4.8816985752165904e-05, + "loss": 5.2695, + "step": 16532 + }, + { + "epoch": 0.09832643448472737, + "grad_norm": 1.4946906566619873, + "learning_rate": 4.8816843760883755e-05, + "loss": 5.6835, + "step": 16533 + }, + { + "epoch": 0.09833238176800838, + "grad_norm": 1.7901808023452759, + "learning_rate": 4.881670176128741e-05, + "loss": 6.1753, + "step": 16534 + }, + { + "epoch": 0.09833832905128938, + "grad_norm": 1.7249737977981567, + "learning_rate": 4.881655975337689e-05, + "loss": 5.86, + "step": 16535 + }, + { + "epoch": 0.09834427633457037, + "grad_norm": 1.8257695436477661, + "learning_rate": 4.8816417737152264e-05, + "loss": 5.1969, + "step": 16536 + }, + { + "epoch": 0.09835022361785137, + "grad_norm": 1.3712751865386963, + "learning_rate": 4.881627571261357e-05, + "loss": 5.7666, + "step": 16537 + }, + { + "epoch": 0.09835617090113236, + "grad_norm": 1.8865090608596802, + "learning_rate": 4.881613367976086e-05, + "loss": 4.8832, + "step": 16538 + }, + { + "epoch": 0.09836211818441336, + "grad_norm": 1.7155808210372925, + "learning_rate": 4.8815991638594175e-05, + "loss": 4.7248, + "step": 16539 + }, + { + "epoch": 0.09836806546769436, + "grad_norm": 1.6654868125915527, + "learning_rate": 4.8815849589113585e-05, + "loss": 4.7095, + "step": 16540 + }, + { + "epoch": 0.09837401275097535, + "grad_norm": 1.6152902841567993, + "learning_rate": 4.881570753131912e-05, + "loss": 5.2894, + "step": 16541 + }, + { + "epoch": 0.09837996003425635, + "grad_norm": 2.1657047271728516, + "learning_rate": 4.8815565465210835e-05, + "loss": 5.9782, + "step": 16542 + }, + { + "epoch": 0.09838590731753735, + "grad_norm": 1.801346778869629, + "learning_rate": 4.88154233907888e-05, + "loss": 5.6683, + "step": 16543 + }, + { + "epoch": 0.09839185460081834, + "grad_norm": 1.7916477918624878, + "learning_rate": 4.881528130805303e-05, + "loss": 5.7056, + "step": 16544 + }, + { + "epoch": 0.09839780188409934, + "grad_norm": 2.1006147861480713, + "learning_rate": 4.881513921700359e-05, + "loss": 5.6315, + "step": 16545 + }, + { + "epoch": 0.09840374916738034, + "grad_norm": 2.3291585445404053, + "learning_rate": 4.8814997117640535e-05, + "loss": 4.8996, + "step": 16546 + }, + { + "epoch": 0.09840969645066133, + "grad_norm": 1.9543695449829102, + "learning_rate": 4.8814855009963916e-05, + "loss": 5.1839, + "step": 16547 + }, + { + "epoch": 0.09841564373394233, + "grad_norm": 2.7100865840911865, + "learning_rate": 4.881471289397378e-05, + "loss": 5.1445, + "step": 16548 + }, + { + "epoch": 0.09842159101722334, + "grad_norm": 2.5749876499176025, + "learning_rate": 4.8814570769670165e-05, + "loss": 5.2023, + "step": 16549 + }, + { + "epoch": 0.09842753830050432, + "grad_norm": 2.079770088195801, + "learning_rate": 4.881442863705313e-05, + "loss": 5.1197, + "step": 16550 + }, + { + "epoch": 0.09843348558378533, + "grad_norm": 1.9495431184768677, + "learning_rate": 4.881428649612272e-05, + "loss": 4.8669, + "step": 16551 + }, + { + "epoch": 0.09843943286706633, + "grad_norm": 2.0918610095977783, + "learning_rate": 4.8814144346879e-05, + "loss": 5.0413, + "step": 16552 + }, + { + "epoch": 0.09844538015034732, + "grad_norm": 2.326662302017212, + "learning_rate": 4.8814002189322e-05, + "loss": 5.0085, + "step": 16553 + }, + { + "epoch": 0.09845132743362832, + "grad_norm": 2.3819150924682617, + "learning_rate": 4.881386002345178e-05, + "loss": 4.8364, + "step": 16554 + }, + { + "epoch": 0.09845727471690932, + "grad_norm": 2.6585230827331543, + "learning_rate": 4.881371784926839e-05, + "loss": 5.1722, + "step": 16555 + }, + { + "epoch": 0.09846322200019031, + "grad_norm": 2.209075689315796, + "learning_rate": 4.881357566677187e-05, + "loss": 5.0474, + "step": 16556 + }, + { + "epoch": 0.09846916928347131, + "grad_norm": 1.9725440740585327, + "learning_rate": 4.881343347596229e-05, + "loss": 5.0361, + "step": 16557 + }, + { + "epoch": 0.09847511656675231, + "grad_norm": 2.0074071884155273, + "learning_rate": 4.881329127683968e-05, + "loss": 5.5143, + "step": 16558 + }, + { + "epoch": 0.0984810638500333, + "grad_norm": 1.8329545259475708, + "learning_rate": 4.8813149069404093e-05, + "loss": 5.8843, + "step": 16559 + }, + { + "epoch": 0.0984870111333143, + "grad_norm": 2.2991678714752197, + "learning_rate": 4.881300685365558e-05, + "loss": 4.6178, + "step": 16560 + }, + { + "epoch": 0.0984929584165953, + "grad_norm": 2.7643637657165527, + "learning_rate": 4.881286462959419e-05, + "loss": 4.1381, + "step": 16561 + }, + { + "epoch": 0.0984989056998763, + "grad_norm": 2.5811941623687744, + "learning_rate": 4.8812722397219985e-05, + "loss": 3.8026, + "step": 16562 + }, + { + "epoch": 0.0985048529831573, + "grad_norm": 2.1111907958984375, + "learning_rate": 4.8812580156533e-05, + "loss": 4.0149, + "step": 16563 + }, + { + "epoch": 0.0985108002664383, + "grad_norm": 2.229973793029785, + "learning_rate": 4.8812437907533294e-05, + "loss": 4.24, + "step": 16564 + }, + { + "epoch": 0.09851674754971929, + "grad_norm": 1.6310914754867554, + "learning_rate": 4.8812295650220905e-05, + "loss": 5.9476, + "step": 16565 + }, + { + "epoch": 0.09852269483300029, + "grad_norm": 1.7397875785827637, + "learning_rate": 4.881215338459589e-05, + "loss": 5.8527, + "step": 16566 + }, + { + "epoch": 0.09852864211628128, + "grad_norm": 1.8279019594192505, + "learning_rate": 4.88120111106583e-05, + "loss": 5.5869, + "step": 16567 + }, + { + "epoch": 0.09853458939956228, + "grad_norm": 1.6956331729888916, + "learning_rate": 4.881186882840818e-05, + "loss": 5.6508, + "step": 16568 + }, + { + "epoch": 0.09854053668284328, + "grad_norm": 1.619205355644226, + "learning_rate": 4.881172653784559e-05, + "loss": 5.6502, + "step": 16569 + }, + { + "epoch": 0.09854648396612427, + "grad_norm": 1.4612733125686646, + "learning_rate": 4.881158423897057e-05, + "loss": 5.5937, + "step": 16570 + }, + { + "epoch": 0.09855243124940527, + "grad_norm": 1.4997358322143555, + "learning_rate": 4.8811441931783165e-05, + "loss": 5.5865, + "step": 16571 + }, + { + "epoch": 0.09855837853268627, + "grad_norm": 1.6516716480255127, + "learning_rate": 4.8811299616283434e-05, + "loss": 5.4031, + "step": 16572 + }, + { + "epoch": 0.09856432581596726, + "grad_norm": 1.5714633464813232, + "learning_rate": 4.881115729247143e-05, + "loss": 5.4543, + "step": 16573 + }, + { + "epoch": 0.09857027309924826, + "grad_norm": 1.4891443252563477, + "learning_rate": 4.881101496034719e-05, + "loss": 5.5687, + "step": 16574 + }, + { + "epoch": 0.09857622038252926, + "grad_norm": 1.3504915237426758, + "learning_rate": 4.8810872619910773e-05, + "loss": 5.5777, + "step": 16575 + }, + { + "epoch": 0.09858216766581025, + "grad_norm": 1.5825836658477783, + "learning_rate": 4.881073027116223e-05, + "loss": 5.547, + "step": 16576 + }, + { + "epoch": 0.09858811494909125, + "grad_norm": 1.4398233890533447, + "learning_rate": 4.8810587914101607e-05, + "loss": 5.4707, + "step": 16577 + }, + { + "epoch": 0.09859406223237226, + "grad_norm": 1.6776020526885986, + "learning_rate": 4.881044554872895e-05, + "loss": 5.4879, + "step": 16578 + }, + { + "epoch": 0.09860000951565324, + "grad_norm": 1.417771339416504, + "learning_rate": 4.8810303175044316e-05, + "loss": 5.5362, + "step": 16579 + }, + { + "epoch": 0.09860595679893425, + "grad_norm": 1.4919921159744263, + "learning_rate": 4.881016079304775e-05, + "loss": 5.5289, + "step": 16580 + }, + { + "epoch": 0.09861190408221525, + "grad_norm": 1.6195905208587646, + "learning_rate": 4.88100184027393e-05, + "loss": 5.467, + "step": 16581 + }, + { + "epoch": 0.09861785136549624, + "grad_norm": 1.5255846977233887, + "learning_rate": 4.880987600411902e-05, + "loss": 6.268, + "step": 16582 + }, + { + "epoch": 0.09862379864877724, + "grad_norm": 1.5051823854446411, + "learning_rate": 4.880973359718696e-05, + "loss": 6.024, + "step": 16583 + }, + { + "epoch": 0.09862974593205824, + "grad_norm": 2.455932378768921, + "learning_rate": 4.880959118194317e-05, + "loss": 5.0881, + "step": 16584 + }, + { + "epoch": 0.09863569321533923, + "grad_norm": 2.3916566371917725, + "learning_rate": 4.880944875838769e-05, + "loss": 5.0897, + "step": 16585 + }, + { + "epoch": 0.09864164049862023, + "grad_norm": 2.0487334728240967, + "learning_rate": 4.880930632652058e-05, + "loss": 5.603, + "step": 16586 + }, + { + "epoch": 0.09864758778190123, + "grad_norm": 1.9195282459259033, + "learning_rate": 4.880916388634189e-05, + "loss": 5.6492, + "step": 16587 + }, + { + "epoch": 0.09865353506518222, + "grad_norm": 1.743602991104126, + "learning_rate": 4.880902143785166e-05, + "loss": 5.7378, + "step": 16588 + }, + { + "epoch": 0.09865948234846322, + "grad_norm": 1.913156509399414, + "learning_rate": 4.880887898104996e-05, + "loss": 5.6267, + "step": 16589 + }, + { + "epoch": 0.09866542963174423, + "grad_norm": 1.8759669065475464, + "learning_rate": 4.880873651593681e-05, + "loss": 5.5593, + "step": 16590 + }, + { + "epoch": 0.09867137691502521, + "grad_norm": 1.8475536108016968, + "learning_rate": 4.880859404251229e-05, + "loss": 5.5021, + "step": 16591 + }, + { + "epoch": 0.09867732419830622, + "grad_norm": 1.5235642194747925, + "learning_rate": 4.880845156077643e-05, + "loss": 5.4692, + "step": 16592 + }, + { + "epoch": 0.09868327148158722, + "grad_norm": 1.8132069110870361, + "learning_rate": 4.8808309070729294e-05, + "loss": 5.6067, + "step": 16593 + }, + { + "epoch": 0.0986892187648682, + "grad_norm": 1.8001697063446045, + "learning_rate": 4.880816657237091e-05, + "loss": 5.749, + "step": 16594 + }, + { + "epoch": 0.09869516604814921, + "grad_norm": 1.8349007368087769, + "learning_rate": 4.8808024065701354e-05, + "loss": 5.6596, + "step": 16595 + }, + { + "epoch": 0.0987011133314302, + "grad_norm": 1.5677918195724487, + "learning_rate": 4.880788155072065e-05, + "loss": 5.725, + "step": 16596 + }, + { + "epoch": 0.0987070606147112, + "grad_norm": 1.8379719257354736, + "learning_rate": 4.880773902742887e-05, + "loss": 5.4325, + "step": 16597 + }, + { + "epoch": 0.0987130078979922, + "grad_norm": 1.8847566843032837, + "learning_rate": 4.880759649582605e-05, + "loss": 5.5737, + "step": 16598 + }, + { + "epoch": 0.09871895518127319, + "grad_norm": 2.398552417755127, + "learning_rate": 4.8807453955912244e-05, + "loss": 5.4192, + "step": 16599 + }, + { + "epoch": 0.09872490246455419, + "grad_norm": 1.990404486656189, + "learning_rate": 4.8807311407687494e-05, + "loss": 5.4624, + "step": 16600 + }, + { + "epoch": 0.09873084974783519, + "grad_norm": 1.533575177192688, + "learning_rate": 4.880716885115187e-05, + "loss": 5.8242, + "step": 16601 + }, + { + "epoch": 0.09873679703111618, + "grad_norm": 1.7357563972473145, + "learning_rate": 4.88070262863054e-05, + "loss": 5.9343, + "step": 16602 + }, + { + "epoch": 0.09874274431439718, + "grad_norm": 1.8504372835159302, + "learning_rate": 4.880688371314816e-05, + "loss": 5.6685, + "step": 16603 + }, + { + "epoch": 0.09874869159767818, + "grad_norm": 2.5040910243988037, + "learning_rate": 4.880674113168016e-05, + "loss": 5.1591, + "step": 16604 + }, + { + "epoch": 0.09875463888095917, + "grad_norm": 2.7820568084716797, + "learning_rate": 4.880659854190148e-05, + "loss": 5.0528, + "step": 16605 + }, + { + "epoch": 0.09876058616424017, + "grad_norm": 2.004427909851074, + "learning_rate": 4.8806455943812165e-05, + "loss": 5.6251, + "step": 16606 + }, + { + "epoch": 0.09876653344752118, + "grad_norm": 1.8053330183029175, + "learning_rate": 4.880631333741227e-05, + "loss": 5.5293, + "step": 16607 + }, + { + "epoch": 0.09877248073080216, + "grad_norm": 1.6708273887634277, + "learning_rate": 4.8806170722701824e-05, + "loss": 6.1215, + "step": 16608 + }, + { + "epoch": 0.09877842801408317, + "grad_norm": 1.6344959735870361, + "learning_rate": 4.88060280996809e-05, + "loss": 6.191, + "step": 16609 + }, + { + "epoch": 0.09878437529736417, + "grad_norm": 1.68915593624115, + "learning_rate": 4.880588546834953e-05, + "loss": 5.9302, + "step": 16610 + }, + { + "epoch": 0.09879032258064516, + "grad_norm": 2.108917236328125, + "learning_rate": 4.8805742828707777e-05, + "loss": 5.5227, + "step": 16611 + }, + { + "epoch": 0.09879626986392616, + "grad_norm": 1.7772480249404907, + "learning_rate": 4.8805600180755685e-05, + "loss": 5.5694, + "step": 16612 + }, + { + "epoch": 0.09880221714720716, + "grad_norm": 1.629629135131836, + "learning_rate": 4.8805457524493305e-05, + "loss": 5.7881, + "step": 16613 + }, + { + "epoch": 0.09880816443048815, + "grad_norm": 1.8985555171966553, + "learning_rate": 4.880531485992068e-05, + "loss": 5.5357, + "step": 16614 + }, + { + "epoch": 0.09881411171376915, + "grad_norm": 2.5329599380493164, + "learning_rate": 4.880517218703786e-05, + "loss": 4.8959, + "step": 16615 + }, + { + "epoch": 0.09882005899705015, + "grad_norm": 2.408377170562744, + "learning_rate": 4.8805029505844915e-05, + "loss": 4.9581, + "step": 16616 + }, + { + "epoch": 0.09882600628033114, + "grad_norm": 2.125190258026123, + "learning_rate": 4.880488681634187e-05, + "loss": 4.4116, + "step": 16617 + }, + { + "epoch": 0.09883195356361214, + "grad_norm": 2.153186082839966, + "learning_rate": 4.880474411852879e-05, + "loss": 4.2887, + "step": 16618 + }, + { + "epoch": 0.09883790084689315, + "grad_norm": 2.3961498737335205, + "learning_rate": 4.880460141240571e-05, + "loss": 4.6521, + "step": 16619 + }, + { + "epoch": 0.09884384813017413, + "grad_norm": 2.4282264709472656, + "learning_rate": 4.880445869797271e-05, + "loss": 4.6307, + "step": 16620 + }, + { + "epoch": 0.09884979541345514, + "grad_norm": 2.461005687713623, + "learning_rate": 4.88043159752298e-05, + "loss": 4.4234, + "step": 16621 + }, + { + "epoch": 0.09885574269673614, + "grad_norm": 2.5483081340789795, + "learning_rate": 4.8804173244177056e-05, + "loss": 4.2688, + "step": 16622 + }, + { + "epoch": 0.09886168998001713, + "grad_norm": 2.370413303375244, + "learning_rate": 4.8804030504814524e-05, + "loss": 4.4887, + "step": 16623 + }, + { + "epoch": 0.09886763726329813, + "grad_norm": 2.681118965148926, + "learning_rate": 4.880388775714225e-05, + "loss": 4.2941, + "step": 16624 + }, + { + "epoch": 0.09887358454657912, + "grad_norm": 2.1210896968841553, + "learning_rate": 4.8803745001160284e-05, + "loss": 5.1994, + "step": 16625 + }, + { + "epoch": 0.09887953182986012, + "grad_norm": 1.703626275062561, + "learning_rate": 4.880360223686867e-05, + "loss": 5.5578, + "step": 16626 + }, + { + "epoch": 0.09888547911314112, + "grad_norm": 1.5515342950820923, + "learning_rate": 4.8803459464267475e-05, + "loss": 5.6636, + "step": 16627 + }, + { + "epoch": 0.09889142639642211, + "grad_norm": 1.2145434617996216, + "learning_rate": 4.880331668335673e-05, + "loss": 5.3634, + "step": 16628 + }, + { + "epoch": 0.09889737367970311, + "grad_norm": 1.2893304824829102, + "learning_rate": 4.88031738941365e-05, + "loss": 5.5383, + "step": 16629 + }, + { + "epoch": 0.09890332096298411, + "grad_norm": 3.1206297874450684, + "learning_rate": 4.880303109660682e-05, + "loss": 4.9313, + "step": 16630 + }, + { + "epoch": 0.0989092682462651, + "grad_norm": 3.382498264312744, + "learning_rate": 4.8802888290767756e-05, + "loss": 4.4475, + "step": 16631 + }, + { + "epoch": 0.0989152155295461, + "grad_norm": 1.8280858993530273, + "learning_rate": 4.880274547661934e-05, + "loss": 5.6722, + "step": 16632 + }, + { + "epoch": 0.0989211628128271, + "grad_norm": 2.0412793159484863, + "learning_rate": 4.880260265416164e-05, + "loss": 5.3952, + "step": 16633 + }, + { + "epoch": 0.09892711009610809, + "grad_norm": 2.0702524185180664, + "learning_rate": 4.880245982339469e-05, + "loss": 5.2754, + "step": 16634 + }, + { + "epoch": 0.0989330573793891, + "grad_norm": 1.7081348896026611, + "learning_rate": 4.880231698431855e-05, + "loss": 5.8414, + "step": 16635 + }, + { + "epoch": 0.0989390046626701, + "grad_norm": 1.7762012481689453, + "learning_rate": 4.880217413693328e-05, + "loss": 6.0106, + "step": 16636 + }, + { + "epoch": 0.09894495194595108, + "grad_norm": 1.815253496170044, + "learning_rate": 4.8802031281238895e-05, + "loss": 5.9715, + "step": 16637 + }, + { + "epoch": 0.09895089922923209, + "grad_norm": 1.8652589321136475, + "learning_rate": 4.880188841723548e-05, + "loss": 5.9437, + "step": 16638 + }, + { + "epoch": 0.09895684651251309, + "grad_norm": 1.687664270401001, + "learning_rate": 4.8801745544923075e-05, + "loss": 6.0776, + "step": 16639 + }, + { + "epoch": 0.09896279379579408, + "grad_norm": 1.579231858253479, + "learning_rate": 4.880160266430171e-05, + "loss": 6.0486, + "step": 16640 + }, + { + "epoch": 0.09896874107907508, + "grad_norm": 1.711932897567749, + "learning_rate": 4.8801459775371464e-05, + "loss": 5.7954, + "step": 16641 + }, + { + "epoch": 0.09897468836235608, + "grad_norm": 2.022918939590454, + "learning_rate": 4.880131687813237e-05, + "loss": 5.4453, + "step": 16642 + }, + { + "epoch": 0.09898063564563707, + "grad_norm": 2.4682674407958984, + "learning_rate": 4.880117397258449e-05, + "loss": 5.2084, + "step": 16643 + }, + { + "epoch": 0.09898658292891807, + "grad_norm": 2.7558486461639404, + "learning_rate": 4.880103105872786e-05, + "loss": 4.8931, + "step": 16644 + }, + { + "epoch": 0.09899253021219907, + "grad_norm": 1.8757295608520508, + "learning_rate": 4.880088813656253e-05, + "loss": 5.4484, + "step": 16645 + }, + { + "epoch": 0.09899847749548006, + "grad_norm": 2.0811331272125244, + "learning_rate": 4.880074520608857e-05, + "loss": 5.8003, + "step": 16646 + }, + { + "epoch": 0.09900442477876106, + "grad_norm": 1.9147615432739258, + "learning_rate": 4.880060226730601e-05, + "loss": 5.869, + "step": 16647 + }, + { + "epoch": 0.09901037206204207, + "grad_norm": 1.974865436553955, + "learning_rate": 4.88004593202149e-05, + "loss": 5.5896, + "step": 16648 + }, + { + "epoch": 0.09901631934532305, + "grad_norm": 1.8365596532821655, + "learning_rate": 4.88003163648153e-05, + "loss": 5.5321, + "step": 16649 + }, + { + "epoch": 0.09902226662860406, + "grad_norm": 1.5927996635437012, + "learning_rate": 4.8800173401107255e-05, + "loss": 5.49, + "step": 16650 + }, + { + "epoch": 0.09902821391188506, + "grad_norm": 1.7566391229629517, + "learning_rate": 4.880003042909081e-05, + "loss": 5.49, + "step": 16651 + }, + { + "epoch": 0.09903416119516605, + "grad_norm": 1.718018651008606, + "learning_rate": 4.879988744876602e-05, + "loss": 5.4515, + "step": 16652 + }, + { + "epoch": 0.09904010847844705, + "grad_norm": 1.8946046829223633, + "learning_rate": 4.879974446013295e-05, + "loss": 4.9902, + "step": 16653 + }, + { + "epoch": 0.09904605576172804, + "grad_norm": 1.939060926437378, + "learning_rate": 4.879960146319162e-05, + "loss": 5.2067, + "step": 16654 + }, + { + "epoch": 0.09905200304500904, + "grad_norm": 1.6621825695037842, + "learning_rate": 4.8799458457942106e-05, + "loss": 5.0041, + "step": 16655 + }, + { + "epoch": 0.09905795032829004, + "grad_norm": 1.8790650367736816, + "learning_rate": 4.879931544438444e-05, + "loss": 4.6893, + "step": 16656 + }, + { + "epoch": 0.09906389761157103, + "grad_norm": 2.20035982131958, + "learning_rate": 4.879917242251868e-05, + "loss": 4.4463, + "step": 16657 + }, + { + "epoch": 0.09906984489485203, + "grad_norm": 1.4379361867904663, + "learning_rate": 4.879902939234487e-05, + "loss": 4.993, + "step": 16658 + }, + { + "epoch": 0.09907579217813303, + "grad_norm": 2.2738726139068604, + "learning_rate": 4.879888635386307e-05, + "loss": 5.108, + "step": 16659 + }, + { + "epoch": 0.09908173946141402, + "grad_norm": 2.0921952724456787, + "learning_rate": 4.8798743307073325e-05, + "loss": 5.3023, + "step": 16660 + }, + { + "epoch": 0.09908768674469502, + "grad_norm": 1.894437313079834, + "learning_rate": 4.8798600251975684e-05, + "loss": 5.2797, + "step": 16661 + }, + { + "epoch": 0.09909363402797602, + "grad_norm": 1.6831610202789307, + "learning_rate": 4.87984571885702e-05, + "loss": 5.3342, + "step": 16662 + }, + { + "epoch": 0.09909958131125701, + "grad_norm": 1.9177473783493042, + "learning_rate": 4.879831411685691e-05, + "loss": 5.2245, + "step": 16663 + }, + { + "epoch": 0.09910552859453801, + "grad_norm": 1.8289183378219604, + "learning_rate": 4.879817103683589e-05, + "loss": 5.2411, + "step": 16664 + }, + { + "epoch": 0.09911147587781902, + "grad_norm": 1.7047971487045288, + "learning_rate": 4.8798027948507166e-05, + "loss": 5.1896, + "step": 16665 + }, + { + "epoch": 0.0991174231611, + "grad_norm": 1.5395535230636597, + "learning_rate": 4.87978848518708e-05, + "loss": 5.0688, + "step": 16666 + }, + { + "epoch": 0.099123370444381, + "grad_norm": 1.652870535850525, + "learning_rate": 4.879774174692683e-05, + "loss": 5.1786, + "step": 16667 + }, + { + "epoch": 0.09912931772766201, + "grad_norm": 1.7581889629364014, + "learning_rate": 4.8797598633675326e-05, + "loss": 5.0549, + "step": 16668 + }, + { + "epoch": 0.099135265010943, + "grad_norm": 1.6056864261627197, + "learning_rate": 4.8797455512116315e-05, + "loss": 5.0516, + "step": 16669 + }, + { + "epoch": 0.099141212294224, + "grad_norm": 1.8067295551300049, + "learning_rate": 4.879731238224986e-05, + "loss": 5.0642, + "step": 16670 + }, + { + "epoch": 0.099147159577505, + "grad_norm": 1.7332173585891724, + "learning_rate": 4.8797169244076016e-05, + "loss": 5.0361, + "step": 16671 + }, + { + "epoch": 0.09915310686078599, + "grad_norm": 1.64972984790802, + "learning_rate": 4.879702609759482e-05, + "loss": 5.0521, + "step": 16672 + }, + { + "epoch": 0.09915905414406699, + "grad_norm": 1.8066579103469849, + "learning_rate": 4.879688294280633e-05, + "loss": 5.1431, + "step": 16673 + }, + { + "epoch": 0.09916500142734799, + "grad_norm": 2.093921661376953, + "learning_rate": 4.879673977971059e-05, + "loss": 5.4831, + "step": 16674 + }, + { + "epoch": 0.09917094871062898, + "grad_norm": 2.1563215255737305, + "learning_rate": 4.879659660830766e-05, + "loss": 5.4992, + "step": 16675 + }, + { + "epoch": 0.09917689599390998, + "grad_norm": 1.9041906595230103, + "learning_rate": 4.8796453428597585e-05, + "loss": 6.0952, + "step": 16676 + }, + { + "epoch": 0.09918284327719099, + "grad_norm": 1.7259836196899414, + "learning_rate": 4.879631024058041e-05, + "loss": 5.9602, + "step": 16677 + }, + { + "epoch": 0.09918879056047197, + "grad_norm": 2.075324058532715, + "learning_rate": 4.879616704425619e-05, + "loss": 5.1186, + "step": 16678 + }, + { + "epoch": 0.09919473784375298, + "grad_norm": 2.243378162384033, + "learning_rate": 4.8796023839624975e-05, + "loss": 4.8764, + "step": 16679 + }, + { + "epoch": 0.09920068512703398, + "grad_norm": 1.8717987537384033, + "learning_rate": 4.879588062668681e-05, + "loss": 5.6084, + "step": 16680 + }, + { + "epoch": 0.09920663241031497, + "grad_norm": 1.8316127061843872, + "learning_rate": 4.879573740544175e-05, + "loss": 5.5613, + "step": 16681 + }, + { + "epoch": 0.09921257969359597, + "grad_norm": 1.7016340494155884, + "learning_rate": 4.879559417588985e-05, + "loss": 5.5577, + "step": 16682 + }, + { + "epoch": 0.09921852697687697, + "grad_norm": 2.2173359394073486, + "learning_rate": 4.879545093803115e-05, + "loss": 4.9591, + "step": 16683 + }, + { + "epoch": 0.09922447426015796, + "grad_norm": 1.9507017135620117, + "learning_rate": 4.87953076918657e-05, + "loss": 5.6648, + "step": 16684 + }, + { + "epoch": 0.09923042154343896, + "grad_norm": 1.6124898195266724, + "learning_rate": 4.879516443739356e-05, + "loss": 6.0163, + "step": 16685 + }, + { + "epoch": 0.09923636882671995, + "grad_norm": 1.5823163986206055, + "learning_rate": 4.879502117461477e-05, + "loss": 5.868, + "step": 16686 + }, + { + "epoch": 0.09924231611000095, + "grad_norm": 1.608522653579712, + "learning_rate": 4.879487790352938e-05, + "loss": 5.7482, + "step": 16687 + }, + { + "epoch": 0.09924826339328195, + "grad_norm": 1.783008337020874, + "learning_rate": 4.879473462413745e-05, + "loss": 5.2352, + "step": 16688 + }, + { + "epoch": 0.09925421067656294, + "grad_norm": 1.8089349269866943, + "learning_rate": 4.8794591336439024e-05, + "loss": 5.1793, + "step": 16689 + }, + { + "epoch": 0.09926015795984394, + "grad_norm": 1.5393356084823608, + "learning_rate": 4.879444804043415e-05, + "loss": 5.4802, + "step": 16690 + }, + { + "epoch": 0.09926610524312494, + "grad_norm": 1.7046642303466797, + "learning_rate": 4.8794304736122886e-05, + "loss": 5.8368, + "step": 16691 + }, + { + "epoch": 0.09927205252640593, + "grad_norm": 1.7474054098129272, + "learning_rate": 4.879416142350527e-05, + "loss": 5.7578, + "step": 16692 + }, + { + "epoch": 0.09927799980968693, + "grad_norm": 1.9804757833480835, + "learning_rate": 4.879401810258136e-05, + "loss": 5.691, + "step": 16693 + }, + { + "epoch": 0.09928394709296794, + "grad_norm": 1.7752422094345093, + "learning_rate": 4.87938747733512e-05, + "loss": 5.2478, + "step": 16694 + }, + { + "epoch": 0.09928989437624892, + "grad_norm": 1.8842644691467285, + "learning_rate": 4.879373143581485e-05, + "loss": 5.2061, + "step": 16695 + }, + { + "epoch": 0.09929584165952993, + "grad_norm": 1.6537442207336426, + "learning_rate": 4.8793588089972355e-05, + "loss": 5.215, + "step": 16696 + }, + { + "epoch": 0.09930178894281093, + "grad_norm": 1.5108014345169067, + "learning_rate": 4.8793444735823755e-05, + "loss": 5.2327, + "step": 16697 + }, + { + "epoch": 0.09930773622609192, + "grad_norm": 1.4653078317642212, + "learning_rate": 4.8793301373369116e-05, + "loss": 5.219, + "step": 16698 + }, + { + "epoch": 0.09931368350937292, + "grad_norm": 1.3908593654632568, + "learning_rate": 4.879315800260848e-05, + "loss": 5.1597, + "step": 16699 + }, + { + "epoch": 0.09931963079265392, + "grad_norm": 1.3809629678726196, + "learning_rate": 4.87930146235419e-05, + "loss": 5.2364, + "step": 16700 + }, + { + "epoch": 0.09932557807593491, + "grad_norm": 1.741685152053833, + "learning_rate": 4.879287123616943e-05, + "loss": 5.7777, + "step": 16701 + }, + { + "epoch": 0.09933152535921591, + "grad_norm": 1.7733122110366821, + "learning_rate": 4.879272784049111e-05, + "loss": 5.4035, + "step": 16702 + }, + { + "epoch": 0.09933747264249691, + "grad_norm": 1.4871195554733276, + "learning_rate": 4.8792584436506985e-05, + "loss": 4.961, + "step": 16703 + }, + { + "epoch": 0.0993434199257779, + "grad_norm": 1.6865509748458862, + "learning_rate": 4.8792441024217115e-05, + "loss": 4.9876, + "step": 16704 + }, + { + "epoch": 0.0993493672090589, + "grad_norm": 1.6606428623199463, + "learning_rate": 4.879229760362156e-05, + "loss": 5.1431, + "step": 16705 + }, + { + "epoch": 0.0993553144923399, + "grad_norm": 1.6394522190093994, + "learning_rate": 4.879215417472036e-05, + "loss": 5.223, + "step": 16706 + }, + { + "epoch": 0.0993612617756209, + "grad_norm": 1.6220464706420898, + "learning_rate": 4.879201073751356e-05, + "loss": 5.322, + "step": 16707 + }, + { + "epoch": 0.0993672090589019, + "grad_norm": 1.4539369344711304, + "learning_rate": 4.879186729200121e-05, + "loss": 5.1935, + "step": 16708 + }, + { + "epoch": 0.0993731563421829, + "grad_norm": 1.7421495914459229, + "learning_rate": 4.8791723838183376e-05, + "loss": 5.0639, + "step": 16709 + }, + { + "epoch": 0.09937910362546389, + "grad_norm": 1.5782475471496582, + "learning_rate": 4.8791580376060085e-05, + "loss": 5.8221, + "step": 16710 + }, + { + "epoch": 0.09938505090874489, + "grad_norm": 1.6991766691207886, + "learning_rate": 4.879143690563141e-05, + "loss": 5.9037, + "step": 16711 + }, + { + "epoch": 0.09939099819202589, + "grad_norm": 1.7815147638320923, + "learning_rate": 4.879129342689739e-05, + "loss": 5.668, + "step": 16712 + }, + { + "epoch": 0.09939694547530688, + "grad_norm": 1.6047189235687256, + "learning_rate": 4.879114993985806e-05, + "loss": 5.3005, + "step": 16713 + }, + { + "epoch": 0.09940289275858788, + "grad_norm": 1.8050780296325684, + "learning_rate": 4.87910064445135e-05, + "loss": 5.4931, + "step": 16714 + }, + { + "epoch": 0.09940884004186887, + "grad_norm": 2.010920286178589, + "learning_rate": 4.8790862940863744e-05, + "loss": 5.6301, + "step": 16715 + }, + { + "epoch": 0.09941478732514987, + "grad_norm": 1.443099856376648, + "learning_rate": 4.879071942890884e-05, + "loss": 5.9498, + "step": 16716 + }, + { + "epoch": 0.09942073460843087, + "grad_norm": 1.777207612991333, + "learning_rate": 4.879057590864885e-05, + "loss": 5.2754, + "step": 16717 + }, + { + "epoch": 0.09942668189171186, + "grad_norm": 2.314602851867676, + "learning_rate": 4.87904323800838e-05, + "loss": 5.1447, + "step": 16718 + }, + { + "epoch": 0.09943262917499286, + "grad_norm": 1.4886807203292847, + "learning_rate": 4.879028884321377e-05, + "loss": 5.5389, + "step": 16719 + }, + { + "epoch": 0.09943857645827386, + "grad_norm": 1.4403626918792725, + "learning_rate": 4.879014529803879e-05, + "loss": 5.5377, + "step": 16720 + }, + { + "epoch": 0.09944452374155485, + "grad_norm": 1.570827841758728, + "learning_rate": 4.8790001744558916e-05, + "loss": 5.2541, + "step": 16721 + }, + { + "epoch": 0.09945047102483585, + "grad_norm": 1.6352084875106812, + "learning_rate": 4.87898581827742e-05, + "loss": 4.9031, + "step": 16722 + }, + { + "epoch": 0.09945641830811686, + "grad_norm": 1.864465594291687, + "learning_rate": 4.878971461268469e-05, + "loss": 4.8689, + "step": 16723 + }, + { + "epoch": 0.09946236559139784, + "grad_norm": 1.5618411302566528, + "learning_rate": 4.878957103429044e-05, + "loss": 5.4576, + "step": 16724 + }, + { + "epoch": 0.09946831287467885, + "grad_norm": 1.6910091638565063, + "learning_rate": 4.8789427447591486e-05, + "loss": 5.557, + "step": 16725 + }, + { + "epoch": 0.09947426015795985, + "grad_norm": 1.708056926727295, + "learning_rate": 4.8789283852587895e-05, + "loss": 5.5343, + "step": 16726 + }, + { + "epoch": 0.09948020744124084, + "grad_norm": 1.5828802585601807, + "learning_rate": 4.878914024927971e-05, + "loss": 5.3913, + "step": 16727 + }, + { + "epoch": 0.09948615472452184, + "grad_norm": 1.6802269220352173, + "learning_rate": 4.878899663766698e-05, + "loss": 5.4407, + "step": 16728 + }, + { + "epoch": 0.09949210200780284, + "grad_norm": 2.0542306900024414, + "learning_rate": 4.8788853017749766e-05, + "loss": 4.9265, + "step": 16729 + }, + { + "epoch": 0.09949804929108383, + "grad_norm": 2.035903215408325, + "learning_rate": 4.87887093895281e-05, + "loss": 5.1802, + "step": 16730 + }, + { + "epoch": 0.09950399657436483, + "grad_norm": 1.7885538339614868, + "learning_rate": 4.8788565753002044e-05, + "loss": 5.5238, + "step": 16731 + }, + { + "epoch": 0.09950994385764583, + "grad_norm": 1.606881022453308, + "learning_rate": 4.878842210817165e-05, + "loss": 5.805, + "step": 16732 + }, + { + "epoch": 0.09951589114092682, + "grad_norm": 1.6354256868362427, + "learning_rate": 4.8788278455036956e-05, + "loss": 5.7968, + "step": 16733 + }, + { + "epoch": 0.09952183842420782, + "grad_norm": 1.7537651062011719, + "learning_rate": 4.8788134793598024e-05, + "loss": 5.5945, + "step": 16734 + }, + { + "epoch": 0.09952778570748883, + "grad_norm": 2.149411678314209, + "learning_rate": 4.8787991123854895e-05, + "loss": 4.7458, + "step": 16735 + }, + { + "epoch": 0.09953373299076981, + "grad_norm": 1.9956060647964478, + "learning_rate": 4.878784744580763e-05, + "loss": 4.9471, + "step": 16736 + }, + { + "epoch": 0.09953968027405082, + "grad_norm": 2.0445396900177, + "learning_rate": 4.878770375945627e-05, + "loss": 4.9063, + "step": 16737 + }, + { + "epoch": 0.09954562755733182, + "grad_norm": 1.8563852310180664, + "learning_rate": 4.878756006480088e-05, + "loss": 5.8788, + "step": 16738 + }, + { + "epoch": 0.0995515748406128, + "grad_norm": 1.8931719064712524, + "learning_rate": 4.8787416361841474e-05, + "loss": 6.0917, + "step": 16739 + }, + { + "epoch": 0.09955752212389381, + "grad_norm": 2.062368869781494, + "learning_rate": 4.878727265057814e-05, + "loss": 5.0113, + "step": 16740 + }, + { + "epoch": 0.09956346940717481, + "grad_norm": 1.7274762392044067, + "learning_rate": 4.878712893101092e-05, + "loss": 5.7383, + "step": 16741 + }, + { + "epoch": 0.0995694166904558, + "grad_norm": 1.7377746105194092, + "learning_rate": 4.878698520313986e-05, + "loss": 5.5545, + "step": 16742 + }, + { + "epoch": 0.0995753639737368, + "grad_norm": 1.8383115530014038, + "learning_rate": 4.8786841466965e-05, + "loss": 5.2297, + "step": 16743 + }, + { + "epoch": 0.09958131125701779, + "grad_norm": 1.7715762853622437, + "learning_rate": 4.8786697722486405e-05, + "loss": 5.4735, + "step": 16744 + }, + { + "epoch": 0.09958725854029879, + "grad_norm": 1.8447803258895874, + "learning_rate": 4.878655396970412e-05, + "loss": 5.25, + "step": 16745 + }, + { + "epoch": 0.09959320582357979, + "grad_norm": 2.215622663497925, + "learning_rate": 4.878641020861819e-05, + "loss": 4.8387, + "step": 16746 + }, + { + "epoch": 0.09959915310686078, + "grad_norm": 1.71353018283844, + "learning_rate": 4.878626643922867e-05, + "loss": 5.6831, + "step": 16747 + }, + { + "epoch": 0.09960510039014178, + "grad_norm": 1.8424171209335327, + "learning_rate": 4.8786122661535616e-05, + "loss": 5.5785, + "step": 16748 + }, + { + "epoch": 0.09961104767342278, + "grad_norm": 1.8796172142028809, + "learning_rate": 4.8785978875539065e-05, + "loss": 5.5921, + "step": 16749 + }, + { + "epoch": 0.09961699495670377, + "grad_norm": 1.820435881614685, + "learning_rate": 4.878583508123908e-05, + "loss": 5.7645, + "step": 16750 + }, + { + "epoch": 0.09962294223998477, + "grad_norm": 1.9210152626037598, + "learning_rate": 4.87856912786357e-05, + "loss": 5.0471, + "step": 16751 + }, + { + "epoch": 0.09962888952326578, + "grad_norm": 1.4372605085372925, + "learning_rate": 4.878554746772899e-05, + "loss": 5.3131, + "step": 16752 + }, + { + "epoch": 0.09963483680654676, + "grad_norm": 1.8078817129135132, + "learning_rate": 4.878540364851898e-05, + "loss": 5.266, + "step": 16753 + }, + { + "epoch": 0.09964078408982777, + "grad_norm": 2.068875551223755, + "learning_rate": 4.878525982100575e-05, + "loss": 4.714, + "step": 16754 + }, + { + "epoch": 0.09964673137310877, + "grad_norm": 2.0813167095184326, + "learning_rate": 4.878511598518931e-05, + "loss": 4.5889, + "step": 16755 + }, + { + "epoch": 0.09965267865638976, + "grad_norm": 2.3035426139831543, + "learning_rate": 4.878497214106974e-05, + "loss": 4.8549, + "step": 16756 + }, + { + "epoch": 0.09965862593967076, + "grad_norm": 1.7791129350662231, + "learning_rate": 4.878482828864709e-05, + "loss": 5.2515, + "step": 16757 + }, + { + "epoch": 0.09966457322295176, + "grad_norm": 1.7512277364730835, + "learning_rate": 4.878468442792139e-05, + "loss": 5.8079, + "step": 16758 + }, + { + "epoch": 0.09967052050623275, + "grad_norm": 1.789523720741272, + "learning_rate": 4.878454055889271e-05, + "loss": 5.4302, + "step": 16759 + }, + { + "epoch": 0.09967646778951375, + "grad_norm": 1.72003173828125, + "learning_rate": 4.8784396681561086e-05, + "loss": 5.6425, + "step": 16760 + }, + { + "epoch": 0.09968241507279475, + "grad_norm": 2.0497727394104004, + "learning_rate": 4.878425279592658e-05, + "loss": 5.6608, + "step": 16761 + }, + { + "epoch": 0.09968836235607574, + "grad_norm": 1.7305432558059692, + "learning_rate": 4.878410890198923e-05, + "loss": 5.5431, + "step": 16762 + }, + { + "epoch": 0.09969430963935674, + "grad_norm": 1.708824634552002, + "learning_rate": 4.878396499974911e-05, + "loss": 5.1754, + "step": 16763 + }, + { + "epoch": 0.09970025692263774, + "grad_norm": 1.9238412380218506, + "learning_rate": 4.878382108920624e-05, + "loss": 5.0595, + "step": 16764 + }, + { + "epoch": 0.09970620420591873, + "grad_norm": 1.7634879350662231, + "learning_rate": 4.878367717036069e-05, + "loss": 5.5733, + "step": 16765 + }, + { + "epoch": 0.09971215148919974, + "grad_norm": 1.7330491542816162, + "learning_rate": 4.8783533243212495e-05, + "loss": 5.4314, + "step": 16766 + }, + { + "epoch": 0.09971809877248074, + "grad_norm": 1.4424408674240112, + "learning_rate": 4.878338930776172e-05, + "loss": 5.3059, + "step": 16767 + }, + { + "epoch": 0.09972404605576173, + "grad_norm": 1.4692374467849731, + "learning_rate": 4.878324536400841e-05, + "loss": 5.2838, + "step": 16768 + }, + { + "epoch": 0.09972999333904273, + "grad_norm": 1.3602346181869507, + "learning_rate": 4.878310141195262e-05, + "loss": 5.5587, + "step": 16769 + }, + { + "epoch": 0.09973594062232373, + "grad_norm": 1.3222168684005737, + "learning_rate": 4.878295745159438e-05, + "loss": 5.61, + "step": 16770 + }, + { + "epoch": 0.09974188790560472, + "grad_norm": 1.398383378982544, + "learning_rate": 4.878281348293377e-05, + "loss": 5.5348, + "step": 16771 + }, + { + "epoch": 0.09974783518888572, + "grad_norm": 1.4184808731079102, + "learning_rate": 4.878266950597081e-05, + "loss": 5.4425, + "step": 16772 + }, + { + "epoch": 0.09975378247216671, + "grad_norm": 1.2451627254486084, + "learning_rate": 4.878252552070558e-05, + "loss": 5.5105, + "step": 16773 + }, + { + "epoch": 0.09975972975544771, + "grad_norm": 1.4243760108947754, + "learning_rate": 4.878238152713811e-05, + "loss": 5.5839, + "step": 16774 + }, + { + "epoch": 0.09976567703872871, + "grad_norm": 1.1774061918258667, + "learning_rate": 4.878223752526846e-05, + "loss": 5.4785, + "step": 16775 + }, + { + "epoch": 0.0997716243220097, + "grad_norm": 1.2542285919189453, + "learning_rate": 4.8782093515096676e-05, + "loss": 5.4994, + "step": 16776 + }, + { + "epoch": 0.0997775716052907, + "grad_norm": 1.486611008644104, + "learning_rate": 4.878194949662281e-05, + "loss": 5.347, + "step": 16777 + }, + { + "epoch": 0.0997835188885717, + "grad_norm": 1.391717791557312, + "learning_rate": 4.878180546984691e-05, + "loss": 5.3397, + "step": 16778 + }, + { + "epoch": 0.09978946617185269, + "grad_norm": 1.819778323173523, + "learning_rate": 4.878166143476902e-05, + "loss": 5.4217, + "step": 16779 + }, + { + "epoch": 0.0997954134551337, + "grad_norm": 1.549660563468933, + "learning_rate": 4.8781517391389205e-05, + "loss": 5.5044, + "step": 16780 + }, + { + "epoch": 0.0998013607384147, + "grad_norm": 1.4923075437545776, + "learning_rate": 4.878137333970751e-05, + "loss": 5.4779, + "step": 16781 + }, + { + "epoch": 0.09980730802169568, + "grad_norm": 1.3846399784088135, + "learning_rate": 4.878122927972398e-05, + "loss": 5.8974, + "step": 16782 + }, + { + "epoch": 0.09981325530497669, + "grad_norm": 1.325563669204712, + "learning_rate": 4.878108521143867e-05, + "loss": 5.516, + "step": 16783 + }, + { + "epoch": 0.09981920258825769, + "grad_norm": 1.3482844829559326, + "learning_rate": 4.878094113485162e-05, + "loss": 5.4661, + "step": 16784 + }, + { + "epoch": 0.09982514987153868, + "grad_norm": 1.4238206148147583, + "learning_rate": 4.87807970499629e-05, + "loss": 5.5551, + "step": 16785 + }, + { + "epoch": 0.09983109715481968, + "grad_norm": 1.1277439594268799, + "learning_rate": 4.8780652956772544e-05, + "loss": 5.3611, + "step": 16786 + }, + { + "epoch": 0.09983704443810068, + "grad_norm": 1.2312495708465576, + "learning_rate": 4.878050885528061e-05, + "loss": 5.4233, + "step": 16787 + }, + { + "epoch": 0.09984299172138167, + "grad_norm": 1.3811876773834229, + "learning_rate": 4.878036474548715e-05, + "loss": 5.4336, + "step": 16788 + }, + { + "epoch": 0.09984893900466267, + "grad_norm": 1.211362361907959, + "learning_rate": 4.87802206273922e-05, + "loss": 4.9956, + "step": 16789 + }, + { + "epoch": 0.09985488628794367, + "grad_norm": 1.0385311841964722, + "learning_rate": 4.878007650099583e-05, + "loss": 5.4416, + "step": 16790 + }, + { + "epoch": 0.09986083357122466, + "grad_norm": 1.2311192750930786, + "learning_rate": 4.8779932366298074e-05, + "loss": 5.4814, + "step": 16791 + }, + { + "epoch": 0.09986678085450566, + "grad_norm": 1.6310219764709473, + "learning_rate": 4.8779788223299e-05, + "loss": 5.1746, + "step": 16792 + }, + { + "epoch": 0.09987272813778666, + "grad_norm": 1.4695444107055664, + "learning_rate": 4.877964407199864e-05, + "loss": 5.3724, + "step": 16793 + }, + { + "epoch": 0.09987867542106765, + "grad_norm": 1.8295196294784546, + "learning_rate": 4.877949991239705e-05, + "loss": 5.1085, + "step": 16794 + }, + { + "epoch": 0.09988462270434866, + "grad_norm": 1.5845080614089966, + "learning_rate": 4.877935574449428e-05, + "loss": 5.027, + "step": 16795 + }, + { + "epoch": 0.09989056998762966, + "grad_norm": 1.3743692636489868, + "learning_rate": 4.8779211568290395e-05, + "loss": 5.0717, + "step": 16796 + }, + { + "epoch": 0.09989651727091065, + "grad_norm": 1.3857053518295288, + "learning_rate": 4.877906738378542e-05, + "loss": 4.9698, + "step": 16797 + }, + { + "epoch": 0.09990246455419165, + "grad_norm": 1.3818373680114746, + "learning_rate": 4.8778923190979425e-05, + "loss": 4.8686, + "step": 16798 + }, + { + "epoch": 0.09990841183747265, + "grad_norm": 1.563095211982727, + "learning_rate": 4.877877898987245e-05, + "loss": 4.6804, + "step": 16799 + }, + { + "epoch": 0.09991435912075364, + "grad_norm": 1.3965919017791748, + "learning_rate": 4.877863478046455e-05, + "loss": 5.141, + "step": 16800 + }, + { + "epoch": 0.09992030640403464, + "grad_norm": 1.5473159551620483, + "learning_rate": 4.8778490562755775e-05, + "loss": 5.0796, + "step": 16801 + }, + { + "epoch": 0.09992625368731563, + "grad_norm": 2.548140525817871, + "learning_rate": 4.877834633674618e-05, + "loss": 4.9149, + "step": 16802 + }, + { + "epoch": 0.09993220097059663, + "grad_norm": 1.59461510181427, + "learning_rate": 4.87782021024358e-05, + "loss": 4.9048, + "step": 16803 + }, + { + "epoch": 0.09993814825387763, + "grad_norm": 1.49467134475708, + "learning_rate": 4.87780578598247e-05, + "loss": 5.2484, + "step": 16804 + }, + { + "epoch": 0.09994409553715862, + "grad_norm": 1.5844218730926514, + "learning_rate": 4.8777913608912926e-05, + "loss": 5.2107, + "step": 16805 + }, + { + "epoch": 0.09995004282043962, + "grad_norm": 1.465334415435791, + "learning_rate": 4.877776934970053e-05, + "loss": 5.4002, + "step": 16806 + }, + { + "epoch": 0.09995599010372062, + "grad_norm": 1.5409786701202393, + "learning_rate": 4.877762508218756e-05, + "loss": 5.6233, + "step": 16807 + }, + { + "epoch": 0.09996193738700161, + "grad_norm": 1.3813812732696533, + "learning_rate": 4.877748080637406e-05, + "loss": 5.3072, + "step": 16808 + }, + { + "epoch": 0.09996788467028261, + "grad_norm": 1.3815702199935913, + "learning_rate": 4.8777336522260095e-05, + "loss": 5.0923, + "step": 16809 + }, + { + "epoch": 0.09997383195356362, + "grad_norm": 1.6513910293579102, + "learning_rate": 4.87771922298457e-05, + "loss": 5.0482, + "step": 16810 + }, + { + "epoch": 0.0999797792368446, + "grad_norm": 1.6680731773376465, + "learning_rate": 4.8777047929130944e-05, + "loss": 4.984, + "step": 16811 + }, + { + "epoch": 0.0999857265201256, + "grad_norm": 1.4342384338378906, + "learning_rate": 4.8776903620115855e-05, + "loss": 5.2745, + "step": 16812 + }, + { + "epoch": 0.09999167380340661, + "grad_norm": 1.564255714416504, + "learning_rate": 4.87767593028005e-05, + "loss": 5.398, + "step": 16813 + }, + { + "epoch": 0.0999976210866876, + "grad_norm": 1.2767013311386108, + "learning_rate": 4.877661497718493e-05, + "loss": 5.0663, + "step": 16814 + }, + { + "epoch": 0.1000035683699686, + "grad_norm": 1.35418701171875, + "learning_rate": 4.877647064326918e-05, + "loss": 5.064, + "step": 16815 + }, + { + "epoch": 0.1000095156532496, + "grad_norm": 1.5754468441009521, + "learning_rate": 4.877632630105331e-05, + "loss": 5.1525, + "step": 16816 + }, + { + "epoch": 0.10001546293653059, + "grad_norm": 1.8457043170928955, + "learning_rate": 4.877618195053737e-05, + "loss": 5.3074, + "step": 16817 + }, + { + "epoch": 0.10002141021981159, + "grad_norm": 1.7238751649856567, + "learning_rate": 4.877603759172141e-05, + "loss": 5.3408, + "step": 16818 + }, + { + "epoch": 0.10002735750309259, + "grad_norm": 1.5342493057250977, + "learning_rate": 4.8775893224605486e-05, + "loss": 5.3495, + "step": 16819 + }, + { + "epoch": 0.10003330478637358, + "grad_norm": 1.4931390285491943, + "learning_rate": 4.877574884918964e-05, + "loss": 5.2617, + "step": 16820 + }, + { + "epoch": 0.10003925206965458, + "grad_norm": 1.5503534078598022, + "learning_rate": 4.877560446547393e-05, + "loss": 5.0805, + "step": 16821 + }, + { + "epoch": 0.10004519935293558, + "grad_norm": 1.480191707611084, + "learning_rate": 4.87754600734584e-05, + "loss": 5.1405, + "step": 16822 + }, + { + "epoch": 0.10005114663621657, + "grad_norm": 1.371559977531433, + "learning_rate": 4.87753156731431e-05, + "loss": 5.2313, + "step": 16823 + }, + { + "epoch": 0.10005709391949758, + "grad_norm": 1.2534080743789673, + "learning_rate": 4.8775171264528085e-05, + "loss": 5.3029, + "step": 16824 + }, + { + "epoch": 0.10006304120277858, + "grad_norm": 1.4513366222381592, + "learning_rate": 4.8775026847613406e-05, + "loss": 5.2663, + "step": 16825 + }, + { + "epoch": 0.10006898848605957, + "grad_norm": 1.4045735597610474, + "learning_rate": 4.8774882422399105e-05, + "loss": 5.2358, + "step": 16826 + }, + { + "epoch": 0.10007493576934057, + "grad_norm": 1.469664216041565, + "learning_rate": 4.877473798888524e-05, + "loss": 5.0215, + "step": 16827 + }, + { + "epoch": 0.10008088305262157, + "grad_norm": 1.4306927919387817, + "learning_rate": 4.8774593547071855e-05, + "loss": 4.8262, + "step": 16828 + }, + { + "epoch": 0.10008683033590256, + "grad_norm": 1.5118143558502197, + "learning_rate": 4.877444909695902e-05, + "loss": 4.8248, + "step": 16829 + }, + { + "epoch": 0.10009277761918356, + "grad_norm": 1.3022321462631226, + "learning_rate": 4.8774304638546754e-05, + "loss": 4.7268, + "step": 16830 + }, + { + "epoch": 0.10009872490246455, + "grad_norm": 1.468758463859558, + "learning_rate": 4.877416017183513e-05, + "loss": 4.8686, + "step": 16831 + }, + { + "epoch": 0.10010467218574555, + "grad_norm": 1.4958772659301758, + "learning_rate": 4.8774015696824196e-05, + "loss": 5.084, + "step": 16832 + }, + { + "epoch": 0.10011061946902655, + "grad_norm": 1.5816160440444946, + "learning_rate": 4.877387121351399e-05, + "loss": 5.1009, + "step": 16833 + }, + { + "epoch": 0.10011656675230754, + "grad_norm": 1.4751555919647217, + "learning_rate": 4.877372672190458e-05, + "loss": 5.1875, + "step": 16834 + }, + { + "epoch": 0.10012251403558854, + "grad_norm": 1.380433201789856, + "learning_rate": 4.8773582221996006e-05, + "loss": 5.3213, + "step": 16835 + }, + { + "epoch": 0.10012846131886954, + "grad_norm": 1.566112756729126, + "learning_rate": 4.877343771378832e-05, + "loss": 4.9251, + "step": 16836 + }, + { + "epoch": 0.10013440860215053, + "grad_norm": 1.4834301471710205, + "learning_rate": 4.8773293197281566e-05, + "loss": 4.7936, + "step": 16837 + }, + { + "epoch": 0.10014035588543153, + "grad_norm": 1.6053043603897095, + "learning_rate": 4.877314867247581e-05, + "loss": 4.8611, + "step": 16838 + }, + { + "epoch": 0.10014630316871254, + "grad_norm": 1.420598030090332, + "learning_rate": 4.877300413937109e-05, + "loss": 5.0481, + "step": 16839 + }, + { + "epoch": 0.10015225045199352, + "grad_norm": 1.474554181098938, + "learning_rate": 4.877285959796746e-05, + "loss": 5.0342, + "step": 16840 + }, + { + "epoch": 0.10015819773527453, + "grad_norm": 1.6535485982894897, + "learning_rate": 4.877271504826496e-05, + "loss": 5.4624, + "step": 16841 + }, + { + "epoch": 0.10016414501855553, + "grad_norm": 1.3873733282089233, + "learning_rate": 4.877257049026367e-05, + "loss": 5.1673, + "step": 16842 + }, + { + "epoch": 0.10017009230183652, + "grad_norm": 1.3890115022659302, + "learning_rate": 4.8772425923963606e-05, + "loss": 4.938, + "step": 16843 + }, + { + "epoch": 0.10017603958511752, + "grad_norm": 1.443969964981079, + "learning_rate": 4.8772281349364846e-05, + "loss": 4.8525, + "step": 16844 + }, + { + "epoch": 0.10018198686839852, + "grad_norm": 1.545344591140747, + "learning_rate": 4.877213676646742e-05, + "loss": 4.8682, + "step": 16845 + }, + { + "epoch": 0.10018793415167951, + "grad_norm": 1.6065396070480347, + "learning_rate": 4.877199217527138e-05, + "loss": 4.7394, + "step": 16846 + }, + { + "epoch": 0.10019388143496051, + "grad_norm": 1.444199800491333, + "learning_rate": 4.877184757577679e-05, + "loss": 4.7775, + "step": 16847 + }, + { + "epoch": 0.10019982871824151, + "grad_norm": 1.5434626340866089, + "learning_rate": 4.87717029679837e-05, + "loss": 4.6714, + "step": 16848 + }, + { + "epoch": 0.1002057760015225, + "grad_norm": 1.502533197402954, + "learning_rate": 4.877155835189215e-05, + "loss": 4.7591, + "step": 16849 + }, + { + "epoch": 0.1002117232848035, + "grad_norm": 1.6330854892730713, + "learning_rate": 4.877141372750219e-05, + "loss": 4.7426, + "step": 16850 + }, + { + "epoch": 0.1002176705680845, + "grad_norm": 1.658887267112732, + "learning_rate": 4.877126909481388e-05, + "loss": 4.7558, + "step": 16851 + }, + { + "epoch": 0.10022361785136549, + "grad_norm": 1.4569580554962158, + "learning_rate": 4.877112445382727e-05, + "loss": 4.7797, + "step": 16852 + }, + { + "epoch": 0.1002295651346465, + "grad_norm": 1.4903759956359863, + "learning_rate": 4.8770979804542394e-05, + "loss": 4.7895, + "step": 16853 + }, + { + "epoch": 0.1002355124179275, + "grad_norm": 1.638406753540039, + "learning_rate": 4.877083514695933e-05, + "loss": 4.7197, + "step": 16854 + }, + { + "epoch": 0.10024145970120849, + "grad_norm": 1.4558868408203125, + "learning_rate": 4.87706904810781e-05, + "loss": 4.7159, + "step": 16855 + }, + { + "epoch": 0.10024740698448949, + "grad_norm": 1.5545023679733276, + "learning_rate": 4.877054580689877e-05, + "loss": 4.7387, + "step": 16856 + }, + { + "epoch": 0.10025335426777049, + "grad_norm": 1.3767842054367065, + "learning_rate": 4.877040112442139e-05, + "loss": 4.7149, + "step": 16857 + }, + { + "epoch": 0.10025930155105148, + "grad_norm": 1.4483342170715332, + "learning_rate": 4.877025643364601e-05, + "loss": 4.7756, + "step": 16858 + }, + { + "epoch": 0.10026524883433248, + "grad_norm": 1.1949654817581177, + "learning_rate": 4.8770111734572673e-05, + "loss": 4.7883, + "step": 16859 + }, + { + "epoch": 0.10027119611761347, + "grad_norm": 1.430977463722229, + "learning_rate": 4.876996702720144e-05, + "loss": 5.0236, + "step": 16860 + }, + { + "epoch": 0.10027714340089447, + "grad_norm": 1.4976351261138916, + "learning_rate": 4.876982231153236e-05, + "loss": 5.1242, + "step": 16861 + }, + { + "epoch": 0.10028309068417547, + "grad_norm": 1.6913431882858276, + "learning_rate": 4.876967758756547e-05, + "loss": 5.3454, + "step": 16862 + }, + { + "epoch": 0.10028903796745646, + "grad_norm": 1.5901557207107544, + "learning_rate": 4.876953285530084e-05, + "loss": 5.2313, + "step": 16863 + }, + { + "epoch": 0.10029498525073746, + "grad_norm": 2.483757257461548, + "learning_rate": 4.8769388114738515e-05, + "loss": 4.9951, + "step": 16864 + }, + { + "epoch": 0.10030093253401846, + "grad_norm": 1.5647902488708496, + "learning_rate": 4.8769243365878536e-05, + "loss": 5.1029, + "step": 16865 + }, + { + "epoch": 0.10030687981729945, + "grad_norm": 1.5830740928649902, + "learning_rate": 4.8769098608720954e-05, + "loss": 5.1918, + "step": 16866 + }, + { + "epoch": 0.10031282710058045, + "grad_norm": 1.5231165885925293, + "learning_rate": 4.876895384326584e-05, + "loss": 5.0817, + "step": 16867 + }, + { + "epoch": 0.10031877438386146, + "grad_norm": 1.5266731977462769, + "learning_rate": 4.876880906951321e-05, + "loss": 4.9117, + "step": 16868 + }, + { + "epoch": 0.10032472166714244, + "grad_norm": 1.9662569761276245, + "learning_rate": 4.876866428746315e-05, + "loss": 4.8381, + "step": 16869 + }, + { + "epoch": 0.10033066895042345, + "grad_norm": 1.34932279586792, + "learning_rate": 4.876851949711569e-05, + "loss": 5.0781, + "step": 16870 + }, + { + "epoch": 0.10033661623370445, + "grad_norm": 1.3333275318145752, + "learning_rate": 4.876837469847089e-05, + "loss": 5.0527, + "step": 16871 + }, + { + "epoch": 0.10034256351698544, + "grad_norm": 1.3569806814193726, + "learning_rate": 4.876822989152879e-05, + "loss": 5.0854, + "step": 16872 + }, + { + "epoch": 0.10034851080026644, + "grad_norm": 1.4417848587036133, + "learning_rate": 4.876808507628945e-05, + "loss": 4.885, + "step": 16873 + }, + { + "epoch": 0.10035445808354744, + "grad_norm": 1.453704833984375, + "learning_rate": 4.876794025275292e-05, + "loss": 4.8919, + "step": 16874 + }, + { + "epoch": 0.10036040536682843, + "grad_norm": 1.392701268196106, + "learning_rate": 4.876779542091924e-05, + "loss": 5.0682, + "step": 16875 + }, + { + "epoch": 0.10036635265010943, + "grad_norm": 1.5623222589492798, + "learning_rate": 4.876765058078847e-05, + "loss": 5.0369, + "step": 16876 + }, + { + "epoch": 0.10037229993339043, + "grad_norm": 1.4053794145584106, + "learning_rate": 4.876750573236066e-05, + "loss": 4.9932, + "step": 16877 + }, + { + "epoch": 0.10037824721667142, + "grad_norm": 1.3282443284988403, + "learning_rate": 4.876736087563586e-05, + "loss": 5.0678, + "step": 16878 + }, + { + "epoch": 0.10038419449995242, + "grad_norm": 1.3737441301345825, + "learning_rate": 4.876721601061412e-05, + "loss": 5.1292, + "step": 16879 + }, + { + "epoch": 0.10039014178323342, + "grad_norm": 1.3209916353225708, + "learning_rate": 4.876707113729549e-05, + "loss": 5.0717, + "step": 16880 + }, + { + "epoch": 0.10039608906651441, + "grad_norm": 1.2051011323928833, + "learning_rate": 4.8766926255680026e-05, + "loss": 5.0075, + "step": 16881 + }, + { + "epoch": 0.10040203634979541, + "grad_norm": 1.260746955871582, + "learning_rate": 4.876678136576777e-05, + "loss": 4.8419, + "step": 16882 + }, + { + "epoch": 0.10040798363307642, + "grad_norm": 1.3981266021728516, + "learning_rate": 4.876663646755877e-05, + "loss": 4.8558, + "step": 16883 + }, + { + "epoch": 0.1004139309163574, + "grad_norm": 1.3491755723953247, + "learning_rate": 4.876649156105309e-05, + "loss": 4.7809, + "step": 16884 + }, + { + "epoch": 0.10041987819963841, + "grad_norm": 1.3315166234970093, + "learning_rate": 4.8766346646250774e-05, + "loss": 4.9221, + "step": 16885 + }, + { + "epoch": 0.10042582548291941, + "grad_norm": 1.250731348991394, + "learning_rate": 4.876620172315186e-05, + "loss": 4.8344, + "step": 16886 + }, + { + "epoch": 0.1004317727662004, + "grad_norm": 1.249316692352295, + "learning_rate": 4.876605679175642e-05, + "loss": 4.8441, + "step": 16887 + }, + { + "epoch": 0.1004377200494814, + "grad_norm": 1.3112961053848267, + "learning_rate": 4.87659118520645e-05, + "loss": 4.834, + "step": 16888 + }, + { + "epoch": 0.10044366733276239, + "grad_norm": 1.4331620931625366, + "learning_rate": 4.876576690407614e-05, + "loss": 4.9801, + "step": 16889 + }, + { + "epoch": 0.10044961461604339, + "grad_norm": 1.5304386615753174, + "learning_rate": 4.8765621947791396e-05, + "loss": 5.1799, + "step": 16890 + }, + { + "epoch": 0.10045556189932439, + "grad_norm": 1.3581719398498535, + "learning_rate": 4.8765476983210326e-05, + "loss": 5.1517, + "step": 16891 + }, + { + "epoch": 0.10046150918260538, + "grad_norm": 1.2568892240524292, + "learning_rate": 4.876533201033296e-05, + "loss": 5.0663, + "step": 16892 + }, + { + "epoch": 0.10046745646588638, + "grad_norm": 1.3863126039505005, + "learning_rate": 4.876518702915936e-05, + "loss": 4.9666, + "step": 16893 + }, + { + "epoch": 0.10047340374916738, + "grad_norm": 1.328078031539917, + "learning_rate": 4.87650420396896e-05, + "loss": 5.0049, + "step": 16894 + }, + { + "epoch": 0.10047935103244837, + "grad_norm": 1.252009630203247, + "learning_rate": 4.8764897041923696e-05, + "loss": 5.0709, + "step": 16895 + }, + { + "epoch": 0.10048529831572937, + "grad_norm": 1.4895809888839722, + "learning_rate": 4.876475203586171e-05, + "loss": 5.0922, + "step": 16896 + }, + { + "epoch": 0.10049124559901038, + "grad_norm": 1.363641619682312, + "learning_rate": 4.8764607021503696e-05, + "loss": 5.0233, + "step": 16897 + }, + { + "epoch": 0.10049719288229136, + "grad_norm": 1.5323866605758667, + "learning_rate": 4.876446199884971e-05, + "loss": 4.8705, + "step": 16898 + }, + { + "epoch": 0.10050314016557237, + "grad_norm": 1.4069478511810303, + "learning_rate": 4.8764316967899786e-05, + "loss": 5.0136, + "step": 16899 + }, + { + "epoch": 0.10050908744885337, + "grad_norm": 1.4166046380996704, + "learning_rate": 4.876417192865399e-05, + "loss": 5.0047, + "step": 16900 + }, + { + "epoch": 0.10051503473213436, + "grad_norm": 1.5298703908920288, + "learning_rate": 4.876402688111237e-05, + "loss": 5.0046, + "step": 16901 + }, + { + "epoch": 0.10052098201541536, + "grad_norm": 1.340071678161621, + "learning_rate": 4.876388182527497e-05, + "loss": 5.107, + "step": 16902 + }, + { + "epoch": 0.10052692929869636, + "grad_norm": 1.367415189743042, + "learning_rate": 4.876373676114184e-05, + "loss": 4.9292, + "step": 16903 + }, + { + "epoch": 0.10053287658197735, + "grad_norm": 1.3535525798797607, + "learning_rate": 4.876359168871304e-05, + "loss": 4.9801, + "step": 16904 + }, + { + "epoch": 0.10053882386525835, + "grad_norm": 1.2370539903640747, + "learning_rate": 4.8763446607988615e-05, + "loss": 4.9598, + "step": 16905 + }, + { + "epoch": 0.10054477114853935, + "grad_norm": 1.251837968826294, + "learning_rate": 4.876330151896862e-05, + "loss": 5.0506, + "step": 16906 + }, + { + "epoch": 0.10055071843182034, + "grad_norm": 1.3221372365951538, + "learning_rate": 4.8763156421653097e-05, + "loss": 5.4094, + "step": 16907 + }, + { + "epoch": 0.10055666571510134, + "grad_norm": 1.34721040725708, + "learning_rate": 4.87630113160421e-05, + "loss": 5.4361, + "step": 16908 + }, + { + "epoch": 0.10056261299838234, + "grad_norm": 1.2884198427200317, + "learning_rate": 4.876286620213568e-05, + "loss": 5.3518, + "step": 16909 + }, + { + "epoch": 0.10056856028166333, + "grad_norm": 1.259414553642273, + "learning_rate": 4.87627210799339e-05, + "loss": 5.2298, + "step": 16910 + }, + { + "epoch": 0.10057450756494433, + "grad_norm": 1.482032299041748, + "learning_rate": 4.8762575949436796e-05, + "loss": 5.3625, + "step": 16911 + }, + { + "epoch": 0.10058045484822534, + "grad_norm": 1.2673801183700562, + "learning_rate": 4.876243081064441e-05, + "loss": 5.2678, + "step": 16912 + }, + { + "epoch": 0.10058640213150633, + "grad_norm": 1.3014607429504395, + "learning_rate": 4.876228566355682e-05, + "loss": 5.2762, + "step": 16913 + }, + { + "epoch": 0.10059234941478733, + "grad_norm": 1.2084840536117554, + "learning_rate": 4.876214050817405e-05, + "loss": 5.1128, + "step": 16914 + }, + { + "epoch": 0.10059829669806833, + "grad_norm": 1.3497353792190552, + "learning_rate": 4.876199534449617e-05, + "loss": 5.1666, + "step": 16915 + }, + { + "epoch": 0.10060424398134932, + "grad_norm": 1.4095430374145508, + "learning_rate": 4.876185017252322e-05, + "loss": 5.0055, + "step": 16916 + }, + { + "epoch": 0.10061019126463032, + "grad_norm": 1.319938063621521, + "learning_rate": 4.876170499225525e-05, + "loss": 5.0628, + "step": 16917 + }, + { + "epoch": 0.10061613854791131, + "grad_norm": 1.2126001119613647, + "learning_rate": 4.876155980369232e-05, + "loss": 5.4244, + "step": 16918 + }, + { + "epoch": 0.10062208583119231, + "grad_norm": 1.0456511974334717, + "learning_rate": 4.876141460683448e-05, + "loss": 5.2556, + "step": 16919 + }, + { + "epoch": 0.10062803311447331, + "grad_norm": 1.2545825242996216, + "learning_rate": 4.8761269401681765e-05, + "loss": 5.1549, + "step": 16920 + }, + { + "epoch": 0.1006339803977543, + "grad_norm": 1.3613678216934204, + "learning_rate": 4.876112418823424e-05, + "loss": 5.0592, + "step": 16921 + }, + { + "epoch": 0.1006399276810353, + "grad_norm": 1.4963204860687256, + "learning_rate": 4.876097896649196e-05, + "loss": 5.1025, + "step": 16922 + }, + { + "epoch": 0.1006458749643163, + "grad_norm": 1.3221436738967896, + "learning_rate": 4.876083373645495e-05, + "loss": 5.2534, + "step": 16923 + }, + { + "epoch": 0.10065182224759729, + "grad_norm": 1.6041839122772217, + "learning_rate": 4.8760688498123294e-05, + "loss": 5.3351, + "step": 16924 + }, + { + "epoch": 0.1006577695308783, + "grad_norm": 1.4891480207443237, + "learning_rate": 4.876054325149702e-05, + "loss": 5.4782, + "step": 16925 + }, + { + "epoch": 0.1006637168141593, + "grad_norm": 2.101271867752075, + "learning_rate": 4.876039799657619e-05, + "loss": 5.3844, + "step": 16926 + }, + { + "epoch": 0.10066966409744028, + "grad_norm": 1.5637247562408447, + "learning_rate": 4.8760252733360845e-05, + "loss": 5.4488, + "step": 16927 + }, + { + "epoch": 0.10067561138072129, + "grad_norm": 1.5939668416976929, + "learning_rate": 4.8760107461851044e-05, + "loss": 5.3429, + "step": 16928 + }, + { + "epoch": 0.10068155866400229, + "grad_norm": 1.509945273399353, + "learning_rate": 4.875996218204684e-05, + "loss": 5.4501, + "step": 16929 + }, + { + "epoch": 0.10068750594728328, + "grad_norm": 1.553009271621704, + "learning_rate": 4.875981689394827e-05, + "loss": 5.4183, + "step": 16930 + }, + { + "epoch": 0.10069345323056428, + "grad_norm": 1.5002714395523071, + "learning_rate": 4.875967159755539e-05, + "loss": 5.2343, + "step": 16931 + }, + { + "epoch": 0.10069940051384528, + "grad_norm": 1.5027118921279907, + "learning_rate": 4.8759526292868266e-05, + "loss": 5.4414, + "step": 16932 + }, + { + "epoch": 0.10070534779712627, + "grad_norm": 1.38532555103302, + "learning_rate": 4.875938097988694e-05, + "loss": 5.4026, + "step": 16933 + }, + { + "epoch": 0.10071129508040727, + "grad_norm": 1.4190242290496826, + "learning_rate": 4.8759235658611445e-05, + "loss": 5.346, + "step": 16934 + }, + { + "epoch": 0.10071724236368827, + "grad_norm": 1.291375756263733, + "learning_rate": 4.875909032904186e-05, + "loss": 5.3715, + "step": 16935 + }, + { + "epoch": 0.10072318964696926, + "grad_norm": 1.5563501119613647, + "learning_rate": 4.8758944991178214e-05, + "loss": 5.2474, + "step": 16936 + }, + { + "epoch": 0.10072913693025026, + "grad_norm": 1.2936631441116333, + "learning_rate": 4.875879964502056e-05, + "loss": 5.2627, + "step": 16937 + }, + { + "epoch": 0.10073508421353126, + "grad_norm": 1.5020617246627808, + "learning_rate": 4.875865429056896e-05, + "loss": 5.2166, + "step": 16938 + }, + { + "epoch": 0.10074103149681225, + "grad_norm": 1.4830302000045776, + "learning_rate": 4.8758508927823464e-05, + "loss": 5.2558, + "step": 16939 + }, + { + "epoch": 0.10074697878009325, + "grad_norm": 1.4259967803955078, + "learning_rate": 4.8758363556784114e-05, + "loss": 5.3117, + "step": 16940 + }, + { + "epoch": 0.10075292606337426, + "grad_norm": 1.5735303163528442, + "learning_rate": 4.875821817745096e-05, + "loss": 5.2993, + "step": 16941 + }, + { + "epoch": 0.10075887334665524, + "grad_norm": 1.6409742832183838, + "learning_rate": 4.875807278982407e-05, + "loss": 5.4337, + "step": 16942 + }, + { + "epoch": 0.10076482062993625, + "grad_norm": 1.5159885883331299, + "learning_rate": 4.875792739390347e-05, + "loss": 5.4222, + "step": 16943 + }, + { + "epoch": 0.10077076791321725, + "grad_norm": 1.704200029373169, + "learning_rate": 4.875778198968923e-05, + "loss": 5.5248, + "step": 16944 + }, + { + "epoch": 0.10077671519649824, + "grad_norm": 1.8533267974853516, + "learning_rate": 4.875763657718139e-05, + "loss": 5.2155, + "step": 16945 + }, + { + "epoch": 0.10078266247977924, + "grad_norm": 1.3260399103164673, + "learning_rate": 4.8757491156380006e-05, + "loss": 5.3239, + "step": 16946 + }, + { + "epoch": 0.10078860976306023, + "grad_norm": 1.317050814628601, + "learning_rate": 4.875734572728513e-05, + "loss": 5.2346, + "step": 16947 + }, + { + "epoch": 0.10079455704634123, + "grad_norm": 1.5583351850509644, + "learning_rate": 4.875720028989681e-05, + "loss": 5.194, + "step": 16948 + }, + { + "epoch": 0.10080050432962223, + "grad_norm": 1.3424546718597412, + "learning_rate": 4.8757054844215094e-05, + "loss": 5.3616, + "step": 16949 + }, + { + "epoch": 0.10080645161290322, + "grad_norm": 1.3151681423187256, + "learning_rate": 4.875690939024004e-05, + "loss": 5.2183, + "step": 16950 + }, + { + "epoch": 0.10081239889618422, + "grad_norm": 1.441724419593811, + "learning_rate": 4.875676392797168e-05, + "loss": 5.3292, + "step": 16951 + }, + { + "epoch": 0.10081834617946522, + "grad_norm": 1.3751790523529053, + "learning_rate": 4.87566184574101e-05, + "loss": 5.1747, + "step": 16952 + }, + { + "epoch": 0.10082429346274621, + "grad_norm": 1.5188177824020386, + "learning_rate": 4.8756472978555314e-05, + "loss": 5.2291, + "step": 16953 + }, + { + "epoch": 0.10083024074602721, + "grad_norm": 1.2834105491638184, + "learning_rate": 4.87563274914074e-05, + "loss": 5.1655, + "step": 16954 + }, + { + "epoch": 0.10083618802930822, + "grad_norm": 1.3950659036636353, + "learning_rate": 4.8756181995966385e-05, + "loss": 5.2318, + "step": 16955 + }, + { + "epoch": 0.1008421353125892, + "grad_norm": 1.3544670343399048, + "learning_rate": 4.875603649223234e-05, + "loss": 5.026, + "step": 16956 + }, + { + "epoch": 0.1008480825958702, + "grad_norm": 1.4849059581756592, + "learning_rate": 4.875589098020531e-05, + "loss": 5.2139, + "step": 16957 + }, + { + "epoch": 0.10085402987915121, + "grad_norm": 1.2032678127288818, + "learning_rate": 4.875574545988534e-05, + "loss": 5.3103, + "step": 16958 + }, + { + "epoch": 0.1008599771624322, + "grad_norm": 1.4803698062896729, + "learning_rate": 4.875559993127249e-05, + "loss": 5.2546, + "step": 16959 + }, + { + "epoch": 0.1008659244457132, + "grad_norm": 1.374115228652954, + "learning_rate": 4.8755454394366795e-05, + "loss": 5.1654, + "step": 16960 + }, + { + "epoch": 0.1008718717289942, + "grad_norm": 1.420754075050354, + "learning_rate": 4.875530884916832e-05, + "loss": 5.3368, + "step": 16961 + }, + { + "epoch": 0.10087781901227519, + "grad_norm": 1.3919636011123657, + "learning_rate": 4.875516329567712e-05, + "loss": 5.3053, + "step": 16962 + }, + { + "epoch": 0.10088376629555619, + "grad_norm": 1.2697970867156982, + "learning_rate": 4.8755017733893235e-05, + "loss": 5.1771, + "step": 16963 + }, + { + "epoch": 0.10088971357883719, + "grad_norm": 1.3521144390106201, + "learning_rate": 4.8754872163816714e-05, + "loss": 5.3226, + "step": 16964 + }, + { + "epoch": 0.10089566086211818, + "grad_norm": 1.4171572923660278, + "learning_rate": 4.875472658544761e-05, + "loss": 5.17, + "step": 16965 + }, + { + "epoch": 0.10090160814539918, + "grad_norm": 1.1771302223205566, + "learning_rate": 4.875458099878598e-05, + "loss": 5.2938, + "step": 16966 + }, + { + "epoch": 0.10090755542868018, + "grad_norm": 1.3881202936172485, + "learning_rate": 4.875443540383188e-05, + "loss": 5.2567, + "step": 16967 + }, + { + "epoch": 0.10091350271196117, + "grad_norm": 1.3272387981414795, + "learning_rate": 4.875428980058534e-05, + "loss": 5.2459, + "step": 16968 + }, + { + "epoch": 0.10091944999524217, + "grad_norm": 1.227569341659546, + "learning_rate": 4.875414418904643e-05, + "loss": 5.4037, + "step": 16969 + }, + { + "epoch": 0.10092539727852318, + "grad_norm": 1.6725070476531982, + "learning_rate": 4.875399856921519e-05, + "loss": 4.957, + "step": 16970 + }, + { + "epoch": 0.10093134456180416, + "grad_norm": 1.2896990776062012, + "learning_rate": 4.8753852941091676e-05, + "loss": 5.0245, + "step": 16971 + }, + { + "epoch": 0.10093729184508517, + "grad_norm": 1.4771101474761963, + "learning_rate": 4.8753707304675935e-05, + "loss": 5.007, + "step": 16972 + }, + { + "epoch": 0.10094323912836617, + "grad_norm": 1.5898420810699463, + "learning_rate": 4.8753561659968025e-05, + "loss": 5.2144, + "step": 16973 + }, + { + "epoch": 0.10094918641164716, + "grad_norm": 1.3972615003585815, + "learning_rate": 4.875341600696799e-05, + "loss": 5.0019, + "step": 16974 + }, + { + "epoch": 0.10095513369492816, + "grad_norm": 1.3663748502731323, + "learning_rate": 4.875327034567588e-05, + "loss": 5.3281, + "step": 16975 + }, + { + "epoch": 0.10096108097820915, + "grad_norm": 1.4441343545913696, + "learning_rate": 4.875312467609175e-05, + "loss": 5.3224, + "step": 16976 + }, + { + "epoch": 0.10096702826149015, + "grad_norm": 1.409233570098877, + "learning_rate": 4.875297899821565e-05, + "loss": 5.1244, + "step": 16977 + }, + { + "epoch": 0.10097297554477115, + "grad_norm": 1.286838412284851, + "learning_rate": 4.875283331204763e-05, + "loss": 5.187, + "step": 16978 + }, + { + "epoch": 0.10097892282805214, + "grad_norm": 1.3722141981124878, + "learning_rate": 4.8752687617587744e-05, + "loss": 5.1052, + "step": 16979 + }, + { + "epoch": 0.10098487011133314, + "grad_norm": 1.464938998222351, + "learning_rate": 4.8752541914836034e-05, + "loss": 5.2428, + "step": 16980 + }, + { + "epoch": 0.10099081739461414, + "grad_norm": 1.5051358938217163, + "learning_rate": 4.875239620379256e-05, + "loss": 5.204, + "step": 16981 + }, + { + "epoch": 0.10099676467789513, + "grad_norm": 1.374108076095581, + "learning_rate": 4.875225048445737e-05, + "loss": 5.4567, + "step": 16982 + }, + { + "epoch": 0.10100271196117613, + "grad_norm": 1.482023000717163, + "learning_rate": 4.875210475683052e-05, + "loss": 5.3605, + "step": 16983 + }, + { + "epoch": 0.10100865924445714, + "grad_norm": 1.429819107055664, + "learning_rate": 4.8751959020912056e-05, + "loss": 5.3351, + "step": 16984 + }, + { + "epoch": 0.10101460652773812, + "grad_norm": 1.3165935277938843, + "learning_rate": 4.875181327670202e-05, + "loss": 5.2705, + "step": 16985 + }, + { + "epoch": 0.10102055381101913, + "grad_norm": 1.4560794830322266, + "learning_rate": 4.8751667524200474e-05, + "loss": 5.313, + "step": 16986 + }, + { + "epoch": 0.10102650109430013, + "grad_norm": 1.5268526077270508, + "learning_rate": 4.875152176340747e-05, + "loss": 5.2432, + "step": 16987 + }, + { + "epoch": 0.10103244837758112, + "grad_norm": 1.8486063480377197, + "learning_rate": 4.875137599432305e-05, + "loss": 5.4951, + "step": 16988 + }, + { + "epoch": 0.10103839566086212, + "grad_norm": 1.5344970226287842, + "learning_rate": 4.875123021694727e-05, + "loss": 4.7321, + "step": 16989 + }, + { + "epoch": 0.10104434294414312, + "grad_norm": 1.5000940561294556, + "learning_rate": 4.8751084431280186e-05, + "loss": 5.1539, + "step": 16990 + }, + { + "epoch": 0.10105029022742411, + "grad_norm": 1.3047879934310913, + "learning_rate": 4.875093863732184e-05, + "loss": 5.1549, + "step": 16991 + }, + { + "epoch": 0.10105623751070511, + "grad_norm": 1.3496383428573608, + "learning_rate": 4.875079283507229e-05, + "loss": 5.0896, + "step": 16992 + }, + { + "epoch": 0.10106218479398611, + "grad_norm": 1.3492714166641235, + "learning_rate": 4.875064702453158e-05, + "loss": 5.0242, + "step": 16993 + }, + { + "epoch": 0.1010681320772671, + "grad_norm": 1.3479794263839722, + "learning_rate": 4.8750501205699766e-05, + "loss": 4.9653, + "step": 16994 + }, + { + "epoch": 0.1010740793605481, + "grad_norm": 1.4737683534622192, + "learning_rate": 4.87503553785769e-05, + "loss": 5.0082, + "step": 16995 + }, + { + "epoch": 0.1010800266438291, + "grad_norm": 1.335184931755066, + "learning_rate": 4.8750209543163026e-05, + "loss": 5.0068, + "step": 16996 + }, + { + "epoch": 0.10108597392711009, + "grad_norm": 1.3982423543930054, + "learning_rate": 4.87500636994582e-05, + "loss": 4.9958, + "step": 16997 + }, + { + "epoch": 0.1010919212103911, + "grad_norm": 1.4706374406814575, + "learning_rate": 4.874991784746248e-05, + "loss": 4.9776, + "step": 16998 + }, + { + "epoch": 0.1010978684936721, + "grad_norm": 1.4456995725631714, + "learning_rate": 4.8749771987175896e-05, + "loss": 5.1226, + "step": 16999 + }, + { + "epoch": 0.10110381577695308, + "grad_norm": 1.3827359676361084, + "learning_rate": 4.874962611859853e-05, + "loss": 5.0648, + "step": 17000 + }, + { + "epoch": 0.10110976306023409, + "grad_norm": 1.4089758396148682, + "learning_rate": 4.874948024173039e-05, + "loss": 5.0511, + "step": 17001 + }, + { + "epoch": 0.10111571034351509, + "grad_norm": 1.5135823488235474, + "learning_rate": 4.874933435657157e-05, + "loss": 5.1586, + "step": 17002 + }, + { + "epoch": 0.10112165762679608, + "grad_norm": 1.3575700521469116, + "learning_rate": 4.87491884631221e-05, + "loss": 5.4172, + "step": 17003 + }, + { + "epoch": 0.10112760491007708, + "grad_norm": 1.6240919828414917, + "learning_rate": 4.874904256138203e-05, + "loss": 4.8663, + "step": 17004 + }, + { + "epoch": 0.10113355219335807, + "grad_norm": 1.517287254333496, + "learning_rate": 4.8748896651351415e-05, + "loss": 5.2746, + "step": 17005 + }, + { + "epoch": 0.10113949947663907, + "grad_norm": 1.359541893005371, + "learning_rate": 4.87487507330303e-05, + "loss": 5.2497, + "step": 17006 + }, + { + "epoch": 0.10114544675992007, + "grad_norm": 1.608406901359558, + "learning_rate": 4.8748604806418755e-05, + "loss": 5.2789, + "step": 17007 + }, + { + "epoch": 0.10115139404320106, + "grad_norm": 1.5752578973770142, + "learning_rate": 4.874845887151681e-05, + "loss": 5.1583, + "step": 17008 + }, + { + "epoch": 0.10115734132648206, + "grad_norm": 1.5864077806472778, + "learning_rate": 4.8748312928324524e-05, + "loss": 5.2091, + "step": 17009 + }, + { + "epoch": 0.10116328860976306, + "grad_norm": 1.4714727401733398, + "learning_rate": 4.874816697684195e-05, + "loss": 5.2404, + "step": 17010 + }, + { + "epoch": 0.10116923589304405, + "grad_norm": 1.4676539897918701, + "learning_rate": 4.874802101706913e-05, + "loss": 5.3318, + "step": 17011 + }, + { + "epoch": 0.10117518317632505, + "grad_norm": 1.3290908336639404, + "learning_rate": 4.874787504900612e-05, + "loss": 5.0484, + "step": 17012 + }, + { + "epoch": 0.10118113045960606, + "grad_norm": 1.2661367654800415, + "learning_rate": 4.8747729072652984e-05, + "loss": 5.1857, + "step": 17013 + }, + { + "epoch": 0.10118707774288704, + "grad_norm": 1.2540318965911865, + "learning_rate": 4.874758308800975e-05, + "loss": 5.3025, + "step": 17014 + }, + { + "epoch": 0.10119302502616805, + "grad_norm": 1.2353893518447876, + "learning_rate": 4.874743709507649e-05, + "loss": 5.3613, + "step": 17015 + }, + { + "epoch": 0.10119897230944905, + "grad_norm": 1.2193371057510376, + "learning_rate": 4.874729109385323e-05, + "loss": 5.3029, + "step": 17016 + }, + { + "epoch": 0.10120491959273004, + "grad_norm": 1.2443112134933472, + "learning_rate": 4.874714508434005e-05, + "loss": 5.3667, + "step": 17017 + }, + { + "epoch": 0.10121086687601104, + "grad_norm": 1.4194598197937012, + "learning_rate": 4.874699906653698e-05, + "loss": 5.5583, + "step": 17018 + }, + { + "epoch": 0.10121681415929204, + "grad_norm": 1.4791369438171387, + "learning_rate": 4.874685304044408e-05, + "loss": 5.2797, + "step": 17019 + }, + { + "epoch": 0.10122276144257303, + "grad_norm": 1.4528671503067017, + "learning_rate": 4.87467070060614e-05, + "loss": 5.1261, + "step": 17020 + }, + { + "epoch": 0.10122870872585403, + "grad_norm": 1.2694898843765259, + "learning_rate": 4.8746560963388985e-05, + "loss": 5.3817, + "step": 17021 + }, + { + "epoch": 0.10123465600913503, + "grad_norm": 1.6012862920761108, + "learning_rate": 4.8746414912426896e-05, + "loss": 4.962, + "step": 17022 + }, + { + "epoch": 0.10124060329241602, + "grad_norm": 1.6179730892181396, + "learning_rate": 4.874626885317518e-05, + "loss": 4.6365, + "step": 17023 + }, + { + "epoch": 0.10124655057569702, + "grad_norm": 1.4522144794464111, + "learning_rate": 4.8746122785633885e-05, + "loss": 4.8943, + "step": 17024 + }, + { + "epoch": 0.10125249785897802, + "grad_norm": 1.6087841987609863, + "learning_rate": 4.8745976709803064e-05, + "loss": 4.81, + "step": 17025 + }, + { + "epoch": 0.10125844514225901, + "grad_norm": 1.424810767173767, + "learning_rate": 4.8745830625682766e-05, + "loss": 4.8699, + "step": 17026 + }, + { + "epoch": 0.10126439242554001, + "grad_norm": 1.3316916227340698, + "learning_rate": 4.874568453327304e-05, + "loss": 5.0084, + "step": 17027 + }, + { + "epoch": 0.10127033970882102, + "grad_norm": 1.549833059310913, + "learning_rate": 4.8745538432573946e-05, + "loss": 4.748, + "step": 17028 + }, + { + "epoch": 0.101276286992102, + "grad_norm": 1.294263482093811, + "learning_rate": 4.874539232358553e-05, + "loss": 4.8004, + "step": 17029 + }, + { + "epoch": 0.101282234275383, + "grad_norm": 1.5209519863128662, + "learning_rate": 4.8745246206307845e-05, + "loss": 4.8187, + "step": 17030 + }, + { + "epoch": 0.10128818155866401, + "grad_norm": 1.5805583000183105, + "learning_rate": 4.874510008074094e-05, + "loss": 4.7126, + "step": 17031 + }, + { + "epoch": 0.101294128841945, + "grad_norm": 1.473693609237671, + "learning_rate": 4.8744953946884864e-05, + "loss": 4.86, + "step": 17032 + }, + { + "epoch": 0.101300076125226, + "grad_norm": 1.6662403345108032, + "learning_rate": 4.8744807804739664e-05, + "loss": 4.8903, + "step": 17033 + }, + { + "epoch": 0.10130602340850699, + "grad_norm": 1.5269529819488525, + "learning_rate": 4.87446616543054e-05, + "loss": 5.1061, + "step": 17034 + }, + { + "epoch": 0.10131197069178799, + "grad_norm": 1.3940715789794922, + "learning_rate": 4.8744515495582127e-05, + "loss": 5.3221, + "step": 17035 + }, + { + "epoch": 0.10131791797506899, + "grad_norm": 1.4603626728057861, + "learning_rate": 4.874436932856988e-05, + "loss": 5.2562, + "step": 17036 + }, + { + "epoch": 0.10132386525834998, + "grad_norm": 1.4601393938064575, + "learning_rate": 4.874422315326873e-05, + "loss": 5.1297, + "step": 17037 + }, + { + "epoch": 0.10132981254163098, + "grad_norm": 1.3284024000167847, + "learning_rate": 4.874407696967871e-05, + "loss": 5.2209, + "step": 17038 + }, + { + "epoch": 0.10133575982491198, + "grad_norm": 1.1924611330032349, + "learning_rate": 4.874393077779987e-05, + "loss": 5.265, + "step": 17039 + }, + { + "epoch": 0.10134170710819297, + "grad_norm": 1.1306421756744385, + "learning_rate": 4.874378457763228e-05, + "loss": 5.1637, + "step": 17040 + }, + { + "epoch": 0.10134765439147397, + "grad_norm": 1.414591908454895, + "learning_rate": 4.874363836917598e-05, + "loss": 5.1238, + "step": 17041 + }, + { + "epoch": 0.10135360167475498, + "grad_norm": 1.245263934135437, + "learning_rate": 4.8743492152431016e-05, + "loss": 5.1779, + "step": 17042 + }, + { + "epoch": 0.10135954895803596, + "grad_norm": 1.363484501838684, + "learning_rate": 4.874334592739745e-05, + "loss": 5.1328, + "step": 17043 + }, + { + "epoch": 0.10136549624131697, + "grad_norm": 1.3666833639144897, + "learning_rate": 4.8743199694075326e-05, + "loss": 5.2547, + "step": 17044 + }, + { + "epoch": 0.10137144352459797, + "grad_norm": 1.3848010301589966, + "learning_rate": 4.8743053452464694e-05, + "loss": 5.2745, + "step": 17045 + }, + { + "epoch": 0.10137739080787896, + "grad_norm": 1.4478403329849243, + "learning_rate": 4.87429072025656e-05, + "loss": 5.2069, + "step": 17046 + }, + { + "epoch": 0.10138333809115996, + "grad_norm": 1.5361924171447754, + "learning_rate": 4.8742760944378115e-05, + "loss": 5.1721, + "step": 17047 + }, + { + "epoch": 0.10138928537444096, + "grad_norm": 1.549049973487854, + "learning_rate": 4.874261467790227e-05, + "loss": 5.2525, + "step": 17048 + }, + { + "epoch": 0.10139523265772195, + "grad_norm": 1.484999656677246, + "learning_rate": 4.874246840313813e-05, + "loss": 5.2433, + "step": 17049 + }, + { + "epoch": 0.10140117994100295, + "grad_norm": 1.58607017993927, + "learning_rate": 4.8742322120085734e-05, + "loss": 4.9631, + "step": 17050 + }, + { + "epoch": 0.10140712722428395, + "grad_norm": 1.1922807693481445, + "learning_rate": 4.874217582874514e-05, + "loss": 5.1917, + "step": 17051 + }, + { + "epoch": 0.10141307450756494, + "grad_norm": 1.1538786888122559, + "learning_rate": 4.87420295291164e-05, + "loss": 5.0231, + "step": 17052 + }, + { + "epoch": 0.10141902179084594, + "grad_norm": 1.302758812904358, + "learning_rate": 4.874188322119956e-05, + "loss": 5.0292, + "step": 17053 + }, + { + "epoch": 0.10142496907412694, + "grad_norm": 1.2432395219802856, + "learning_rate": 4.874173690499467e-05, + "loss": 5.1671, + "step": 17054 + }, + { + "epoch": 0.10143091635740793, + "grad_norm": 1.3793164491653442, + "learning_rate": 4.8741590580501786e-05, + "loss": 5.2231, + "step": 17055 + }, + { + "epoch": 0.10143686364068893, + "grad_norm": 1.3487818241119385, + "learning_rate": 4.8741444247720966e-05, + "loss": 5.0464, + "step": 17056 + }, + { + "epoch": 0.10144281092396994, + "grad_norm": 1.512860894203186, + "learning_rate": 4.874129790665225e-05, + "loss": 4.8973, + "step": 17057 + }, + { + "epoch": 0.10144875820725092, + "grad_norm": 1.6202374696731567, + "learning_rate": 4.874115155729569e-05, + "loss": 5.0055, + "step": 17058 + }, + { + "epoch": 0.10145470549053193, + "grad_norm": 1.3453385829925537, + "learning_rate": 4.874100519965134e-05, + "loss": 4.7808, + "step": 17059 + }, + { + "epoch": 0.10146065277381293, + "grad_norm": 1.4613635540008545, + "learning_rate": 4.874085883371925e-05, + "loss": 4.8073, + "step": 17060 + }, + { + "epoch": 0.10146660005709392, + "grad_norm": 1.3086074590682983, + "learning_rate": 4.874071245949947e-05, + "loss": 4.9751, + "step": 17061 + }, + { + "epoch": 0.10147254734037492, + "grad_norm": 1.454784631729126, + "learning_rate": 4.8740566076992055e-05, + "loss": 5.2422, + "step": 17062 + }, + { + "epoch": 0.10147849462365591, + "grad_norm": 1.3406941890716553, + "learning_rate": 4.8740419686197054e-05, + "loss": 5.2342, + "step": 17063 + }, + { + "epoch": 0.10148444190693691, + "grad_norm": 1.3241393566131592, + "learning_rate": 4.8740273287114514e-05, + "loss": 5.2168, + "step": 17064 + }, + { + "epoch": 0.10149038919021791, + "grad_norm": 1.2292134761810303, + "learning_rate": 4.8740126879744495e-05, + "loss": 5.171, + "step": 17065 + }, + { + "epoch": 0.1014963364734989, + "grad_norm": 1.395484209060669, + "learning_rate": 4.8739980464087044e-05, + "loss": 5.1782, + "step": 17066 + }, + { + "epoch": 0.1015022837567799, + "grad_norm": 1.8667857646942139, + "learning_rate": 4.87398340401422e-05, + "loss": 5.7113, + "step": 17067 + }, + { + "epoch": 0.1015082310400609, + "grad_norm": 1.4775335788726807, + "learning_rate": 4.873968760791003e-05, + "loss": 5.2518, + "step": 17068 + }, + { + "epoch": 0.10151417832334189, + "grad_norm": 1.5058828592300415, + "learning_rate": 4.873954116739059e-05, + "loss": 5.3249, + "step": 17069 + }, + { + "epoch": 0.1015201256066229, + "grad_norm": 1.4806468486785889, + "learning_rate": 4.873939471858391e-05, + "loss": 5.1119, + "step": 17070 + }, + { + "epoch": 0.1015260728899039, + "grad_norm": 1.3866868019104004, + "learning_rate": 4.873924826149006e-05, + "loss": 5.1709, + "step": 17071 + }, + { + "epoch": 0.10153202017318488, + "grad_norm": 1.2337566614151, + "learning_rate": 4.8739101796109074e-05, + "loss": 5.2346, + "step": 17072 + }, + { + "epoch": 0.10153796745646589, + "grad_norm": 1.5977396965026855, + "learning_rate": 4.873895532244103e-05, + "loss": 5.4213, + "step": 17073 + }, + { + "epoch": 0.10154391473974689, + "grad_norm": 1.343363642692566, + "learning_rate": 4.873880884048595e-05, + "loss": 5.2865, + "step": 17074 + }, + { + "epoch": 0.10154986202302788, + "grad_norm": 1.4759324789047241, + "learning_rate": 4.87386623502439e-05, + "loss": 5.1743, + "step": 17075 + }, + { + "epoch": 0.10155580930630888, + "grad_norm": 1.2113150358200073, + "learning_rate": 4.873851585171493e-05, + "loss": 5.2218, + "step": 17076 + }, + { + "epoch": 0.10156175658958988, + "grad_norm": 1.3962153196334839, + "learning_rate": 4.873836934489908e-05, + "loss": 5.1031, + "step": 17077 + }, + { + "epoch": 0.10156770387287087, + "grad_norm": 1.410144329071045, + "learning_rate": 4.8738222829796424e-05, + "loss": 5.0662, + "step": 17078 + }, + { + "epoch": 0.10157365115615187, + "grad_norm": 1.224947452545166, + "learning_rate": 4.873807630640699e-05, + "loss": 5.1583, + "step": 17079 + }, + { + "epoch": 0.10157959843943287, + "grad_norm": 1.401877522468567, + "learning_rate": 4.873792977473084e-05, + "loss": 5.2688, + "step": 17080 + }, + { + "epoch": 0.10158554572271386, + "grad_norm": 1.3576874732971191, + "learning_rate": 4.873778323476802e-05, + "loss": 5.037, + "step": 17081 + }, + { + "epoch": 0.10159149300599486, + "grad_norm": 1.226619839668274, + "learning_rate": 4.8737636686518595e-05, + "loss": 5.0502, + "step": 17082 + }, + { + "epoch": 0.10159744028927586, + "grad_norm": 1.2307099103927612, + "learning_rate": 4.87374901299826e-05, + "loss": 5.0855, + "step": 17083 + }, + { + "epoch": 0.10160338757255685, + "grad_norm": 1.1481422185897827, + "learning_rate": 4.873734356516009e-05, + "loss": 5.2114, + "step": 17084 + }, + { + "epoch": 0.10160933485583785, + "grad_norm": 1.4645094871520996, + "learning_rate": 4.873719699205113e-05, + "loss": 5.1432, + "step": 17085 + }, + { + "epoch": 0.10161528213911886, + "grad_norm": 1.3309158086776733, + "learning_rate": 4.873705041065575e-05, + "loss": 5.1557, + "step": 17086 + }, + { + "epoch": 0.10162122942239984, + "grad_norm": 1.2546007633209229, + "learning_rate": 4.873690382097401e-05, + "loss": 5.324, + "step": 17087 + }, + { + "epoch": 0.10162717670568085, + "grad_norm": 1.33823561668396, + "learning_rate": 4.873675722300597e-05, + "loss": 5.1773, + "step": 17088 + }, + { + "epoch": 0.10163312398896185, + "grad_norm": 1.3027381896972656, + "learning_rate": 4.873661061675166e-05, + "loss": 5.4172, + "step": 17089 + }, + { + "epoch": 0.10163907127224284, + "grad_norm": 1.3852121829986572, + "learning_rate": 4.873646400221116e-05, + "loss": 5.1655, + "step": 17090 + }, + { + "epoch": 0.10164501855552384, + "grad_norm": 1.4345825910568237, + "learning_rate": 4.87363173793845e-05, + "loss": 4.9941, + "step": 17091 + }, + { + "epoch": 0.10165096583880483, + "grad_norm": 1.4016261100769043, + "learning_rate": 4.873617074827173e-05, + "loss": 4.9657, + "step": 17092 + }, + { + "epoch": 0.10165691312208583, + "grad_norm": 1.339082956314087, + "learning_rate": 4.8736024108872914e-05, + "loss": 5.0075, + "step": 17093 + }, + { + "epoch": 0.10166286040536683, + "grad_norm": 1.3223985433578491, + "learning_rate": 4.8735877461188094e-05, + "loss": 4.9656, + "step": 17094 + }, + { + "epoch": 0.10166880768864782, + "grad_norm": 1.4618138074874878, + "learning_rate": 4.8735730805217326e-05, + "loss": 5.0158, + "step": 17095 + }, + { + "epoch": 0.10167475497192882, + "grad_norm": 1.4075788259506226, + "learning_rate": 4.8735584140960666e-05, + "loss": 5.3668, + "step": 17096 + }, + { + "epoch": 0.10168070225520982, + "grad_norm": 1.2219016551971436, + "learning_rate": 4.873543746841815e-05, + "loss": 5.3549, + "step": 17097 + }, + { + "epoch": 0.10168664953849081, + "grad_norm": 1.4344584941864014, + "learning_rate": 4.873529078758985e-05, + "loss": 5.2044, + "step": 17098 + }, + { + "epoch": 0.10169259682177181, + "grad_norm": 1.3579001426696777, + "learning_rate": 4.8735144098475794e-05, + "loss": 5.1071, + "step": 17099 + }, + { + "epoch": 0.10169854410505282, + "grad_norm": 1.4645969867706299, + "learning_rate": 4.873499740107604e-05, + "loss": 5.0359, + "step": 17100 + }, + { + "epoch": 0.1017044913883338, + "grad_norm": 1.6800013780593872, + "learning_rate": 4.8734850695390654e-05, + "loss": 5.2085, + "step": 17101 + }, + { + "epoch": 0.1017104386716148, + "grad_norm": 1.678339958190918, + "learning_rate": 4.873470398141968e-05, + "loss": 5.1671, + "step": 17102 + }, + { + "epoch": 0.10171638595489581, + "grad_norm": 1.6498647928237915, + "learning_rate": 4.873455725916316e-05, + "loss": 5.2105, + "step": 17103 + }, + { + "epoch": 0.1017223332381768, + "grad_norm": 1.522147297859192, + "learning_rate": 4.873441052862115e-05, + "loss": 5.1215, + "step": 17104 + }, + { + "epoch": 0.1017282805214578, + "grad_norm": 1.3335652351379395, + "learning_rate": 4.87342637897937e-05, + "loss": 5.2504, + "step": 17105 + }, + { + "epoch": 0.1017342278047388, + "grad_norm": 1.1647717952728271, + "learning_rate": 4.873411704268087e-05, + "loss": 5.3183, + "step": 17106 + }, + { + "epoch": 0.10174017508801979, + "grad_norm": 1.3210188150405884, + "learning_rate": 4.8733970287282706e-05, + "loss": 5.399, + "step": 17107 + }, + { + "epoch": 0.10174612237130079, + "grad_norm": 1.2331137657165527, + "learning_rate": 4.873382352359925e-05, + "loss": 5.2521, + "step": 17108 + }, + { + "epoch": 0.10175206965458179, + "grad_norm": 1.245252251625061, + "learning_rate": 4.873367675163056e-05, + "loss": 5.2092, + "step": 17109 + }, + { + "epoch": 0.10175801693786278, + "grad_norm": 1.3423751592636108, + "learning_rate": 4.87335299713767e-05, + "loss": 4.918, + "step": 17110 + }, + { + "epoch": 0.10176396422114378, + "grad_norm": 1.8670060634613037, + "learning_rate": 4.87333831828377e-05, + "loss": 4.6559, + "step": 17111 + }, + { + "epoch": 0.10176991150442478, + "grad_norm": 1.54763925075531, + "learning_rate": 4.873323638601363e-05, + "loss": 5.2565, + "step": 17112 + }, + { + "epoch": 0.10177585878770577, + "grad_norm": 1.134102702140808, + "learning_rate": 4.8733089580904525e-05, + "loss": 5.2119, + "step": 17113 + }, + { + "epoch": 0.10178180607098677, + "grad_norm": 1.395027756690979, + "learning_rate": 4.873294276751045e-05, + "loss": 5.0732, + "step": 17114 + }, + { + "epoch": 0.10178775335426778, + "grad_norm": 1.104973554611206, + "learning_rate": 4.873279594583144e-05, + "loss": 5.0807, + "step": 17115 + }, + { + "epoch": 0.10179370063754876, + "grad_norm": 1.0554969310760498, + "learning_rate": 4.873264911586757e-05, + "loss": 5.0831, + "step": 17116 + }, + { + "epoch": 0.10179964792082977, + "grad_norm": 1.0598722696304321, + "learning_rate": 4.873250227761887e-05, + "loss": 5.1264, + "step": 17117 + }, + { + "epoch": 0.10180559520411077, + "grad_norm": 1.1047697067260742, + "learning_rate": 4.8732355431085395e-05, + "loss": 5.0687, + "step": 17118 + }, + { + "epoch": 0.10181154248739176, + "grad_norm": 1.5564457178115845, + "learning_rate": 4.87322085762672e-05, + "loss": 5.0063, + "step": 17119 + }, + { + "epoch": 0.10181748977067276, + "grad_norm": 1.5218400955200195, + "learning_rate": 4.8732061713164344e-05, + "loss": 5.3785, + "step": 17120 + }, + { + "epoch": 0.10182343705395375, + "grad_norm": 1.3067396879196167, + "learning_rate": 4.873191484177686e-05, + "loss": 5.4108, + "step": 17121 + }, + { + "epoch": 0.10182938433723475, + "grad_norm": 1.4401333332061768, + "learning_rate": 4.873176796210482e-05, + "loss": 5.5251, + "step": 17122 + }, + { + "epoch": 0.10183533162051575, + "grad_norm": 1.0483810901641846, + "learning_rate": 4.873162107414826e-05, + "loss": 5.4983, + "step": 17123 + }, + { + "epoch": 0.10184127890379674, + "grad_norm": 1.2637344598770142, + "learning_rate": 4.8731474177907244e-05, + "loss": 5.4487, + "step": 17124 + }, + { + "epoch": 0.10184722618707774, + "grad_norm": 1.314834475517273, + "learning_rate": 4.873132727338181e-05, + "loss": 5.228, + "step": 17125 + }, + { + "epoch": 0.10185317347035874, + "grad_norm": 1.354665756225586, + "learning_rate": 4.8731180360572e-05, + "loss": 5.3908, + "step": 17126 + }, + { + "epoch": 0.10185912075363973, + "grad_norm": 1.3690662384033203, + "learning_rate": 4.87310334394779e-05, + "loss": 5.0955, + "step": 17127 + }, + { + "epoch": 0.10186506803692073, + "grad_norm": 1.5240978002548218, + "learning_rate": 4.873088651009954e-05, + "loss": 5.2838, + "step": 17128 + }, + { + "epoch": 0.10187101532020174, + "grad_norm": 1.147658109664917, + "learning_rate": 4.8730739572436966e-05, + "loss": 5.3074, + "step": 17129 + }, + { + "epoch": 0.10187696260348272, + "grad_norm": 1.3384162187576294, + "learning_rate": 4.8730592626490235e-05, + "loss": 5.3677, + "step": 17130 + }, + { + "epoch": 0.10188290988676373, + "grad_norm": 1.3388500213623047, + "learning_rate": 4.87304456722594e-05, + "loss": 5.3151, + "step": 17131 + }, + { + "epoch": 0.10188885717004473, + "grad_norm": 1.215617060661316, + "learning_rate": 4.873029870974452e-05, + "loss": 4.9182, + "step": 17132 + }, + { + "epoch": 0.10189480445332572, + "grad_norm": 1.2983050346374512, + "learning_rate": 4.873015173894563e-05, + "loss": 5.142, + "step": 17133 + }, + { + "epoch": 0.10190075173660672, + "grad_norm": 1.3918750286102295, + "learning_rate": 4.873000475986279e-05, + "loss": 5.0548, + "step": 17134 + }, + { + "epoch": 0.10190669901988772, + "grad_norm": 1.3934828042984009, + "learning_rate": 4.8729857772496045e-05, + "loss": 5.1319, + "step": 17135 + }, + { + "epoch": 0.10191264630316871, + "grad_norm": 1.32583487033844, + "learning_rate": 4.872971077684546e-05, + "loss": 5.2762, + "step": 17136 + }, + { + "epoch": 0.10191859358644971, + "grad_norm": 1.295102834701538, + "learning_rate": 4.872956377291108e-05, + "loss": 5.2338, + "step": 17137 + }, + { + "epoch": 0.10192454086973071, + "grad_norm": 1.2840588092803955, + "learning_rate": 4.8729416760692946e-05, + "loss": 5.3957, + "step": 17138 + }, + { + "epoch": 0.1019304881530117, + "grad_norm": 1.371270775794983, + "learning_rate": 4.872926974019112e-05, + "loss": 5.5933, + "step": 17139 + }, + { + "epoch": 0.1019364354362927, + "grad_norm": 1.380387783050537, + "learning_rate": 4.872912271140565e-05, + "loss": 5.6628, + "step": 17140 + }, + { + "epoch": 0.1019423827195737, + "grad_norm": 1.3120551109313965, + "learning_rate": 4.8728975674336596e-05, + "loss": 5.6424, + "step": 17141 + }, + { + "epoch": 0.10194833000285469, + "grad_norm": 1.3965035676956177, + "learning_rate": 4.8728828628984003e-05, + "loss": 5.5413, + "step": 17142 + }, + { + "epoch": 0.1019542772861357, + "grad_norm": 1.5870885848999023, + "learning_rate": 4.872868157534791e-05, + "loss": 5.1952, + "step": 17143 + }, + { + "epoch": 0.1019602245694167, + "grad_norm": 1.584633231163025, + "learning_rate": 4.872853451342839e-05, + "loss": 5.1045, + "step": 17144 + }, + { + "epoch": 0.10196617185269768, + "grad_norm": 1.5781641006469727, + "learning_rate": 4.872838744322548e-05, + "loss": 4.9581, + "step": 17145 + }, + { + "epoch": 0.10197211913597869, + "grad_norm": 1.3683301210403442, + "learning_rate": 4.872824036473923e-05, + "loss": 4.9931, + "step": 17146 + }, + { + "epoch": 0.10197806641925969, + "grad_norm": 1.4182472229003906, + "learning_rate": 4.87280932779697e-05, + "loss": 4.7815, + "step": 17147 + }, + { + "epoch": 0.10198401370254068, + "grad_norm": 1.464609146118164, + "learning_rate": 4.872794618291694e-05, + "loss": 4.9158, + "step": 17148 + }, + { + "epoch": 0.10198996098582168, + "grad_norm": 1.4733667373657227, + "learning_rate": 4.872779907958099e-05, + "loss": 5.069, + "step": 17149 + }, + { + "epoch": 0.10199590826910268, + "grad_norm": 1.4454584121704102, + "learning_rate": 4.872765196796192e-05, + "loss": 5.1131, + "step": 17150 + }, + { + "epoch": 0.10200185555238367, + "grad_norm": 1.6175665855407715, + "learning_rate": 4.872750484805977e-05, + "loss": 4.9432, + "step": 17151 + }, + { + "epoch": 0.10200780283566467, + "grad_norm": 1.378569483757019, + "learning_rate": 4.872735771987459e-05, + "loss": 4.9243, + "step": 17152 + }, + { + "epoch": 0.10201375011894566, + "grad_norm": 1.452481985092163, + "learning_rate": 4.872721058340644e-05, + "loss": 4.8421, + "step": 17153 + }, + { + "epoch": 0.10201969740222666, + "grad_norm": 1.8265782594680786, + "learning_rate": 4.872706343865536e-05, + "loss": 5.2555, + "step": 17154 + }, + { + "epoch": 0.10202564468550766, + "grad_norm": 1.6913262605667114, + "learning_rate": 4.8726916285621414e-05, + "loss": 5.3829, + "step": 17155 + }, + { + "epoch": 0.10203159196878865, + "grad_norm": 1.6480923891067505, + "learning_rate": 4.8726769124304644e-05, + "loss": 5.4168, + "step": 17156 + }, + { + "epoch": 0.10203753925206965, + "grad_norm": 1.702602744102478, + "learning_rate": 4.8726621954705105e-05, + "loss": 5.4045, + "step": 17157 + }, + { + "epoch": 0.10204348653535066, + "grad_norm": 1.749205470085144, + "learning_rate": 4.8726474776822844e-05, + "loss": 5.5886, + "step": 17158 + }, + { + "epoch": 0.10204943381863164, + "grad_norm": 1.927309274673462, + "learning_rate": 4.8726327590657916e-05, + "loss": 5.5547, + "step": 17159 + }, + { + "epoch": 0.10205538110191265, + "grad_norm": 1.6493511199951172, + "learning_rate": 4.8726180396210374e-05, + "loss": 5.6764, + "step": 17160 + }, + { + "epoch": 0.10206132838519365, + "grad_norm": 1.7083081007003784, + "learning_rate": 4.8726033193480266e-05, + "loss": 5.5823, + "step": 17161 + }, + { + "epoch": 0.10206727566847464, + "grad_norm": 1.7882472276687622, + "learning_rate": 4.872588598246765e-05, + "loss": 5.4388, + "step": 17162 + }, + { + "epoch": 0.10207322295175564, + "grad_norm": 1.6043784618377686, + "learning_rate": 4.872573876317257e-05, + "loss": 5.6816, + "step": 17163 + }, + { + "epoch": 0.10207917023503664, + "grad_norm": 1.3449418544769287, + "learning_rate": 4.872559153559507e-05, + "loss": 5.5661, + "step": 17164 + }, + { + "epoch": 0.10208511751831763, + "grad_norm": 1.7593882083892822, + "learning_rate": 4.8725444299735226e-05, + "loss": 4.95, + "step": 17165 + }, + { + "epoch": 0.10209106480159863, + "grad_norm": 1.8593993186950684, + "learning_rate": 4.872529705559307e-05, + "loss": 5.3296, + "step": 17166 + }, + { + "epoch": 0.10209701208487963, + "grad_norm": 1.7530159950256348, + "learning_rate": 4.872514980316865e-05, + "loss": 5.4378, + "step": 17167 + }, + { + "epoch": 0.10210295936816062, + "grad_norm": 1.7487550973892212, + "learning_rate": 4.872500254246203e-05, + "loss": 5.3435, + "step": 17168 + }, + { + "epoch": 0.10210890665144162, + "grad_norm": 1.7868090867996216, + "learning_rate": 4.8724855273473256e-05, + "loss": 5.2266, + "step": 17169 + }, + { + "epoch": 0.10211485393472262, + "grad_norm": 1.6116459369659424, + "learning_rate": 4.872470799620238e-05, + "loss": 5.2394, + "step": 17170 + }, + { + "epoch": 0.10212080121800361, + "grad_norm": 1.6221721172332764, + "learning_rate": 4.872456071064946e-05, + "loss": 5.823, + "step": 17171 + }, + { + "epoch": 0.10212674850128461, + "grad_norm": 1.462540626525879, + "learning_rate": 4.872441341681454e-05, + "loss": 5.8816, + "step": 17172 + }, + { + "epoch": 0.10213269578456562, + "grad_norm": 1.3804352283477783, + "learning_rate": 4.872426611469766e-05, + "loss": 5.7982, + "step": 17173 + }, + { + "epoch": 0.1021386430678466, + "grad_norm": 1.7873106002807617, + "learning_rate": 4.872411880429889e-05, + "loss": 5.0282, + "step": 17174 + }, + { + "epoch": 0.1021445903511276, + "grad_norm": 1.9154506921768188, + "learning_rate": 4.8723971485618284e-05, + "loss": 4.8535, + "step": 17175 + }, + { + "epoch": 0.10215053763440861, + "grad_norm": 1.865502953529358, + "learning_rate": 4.872382415865587e-05, + "loss": 5.5282, + "step": 17176 + }, + { + "epoch": 0.1021564849176896, + "grad_norm": 1.8683371543884277, + "learning_rate": 4.872367682341173e-05, + "loss": 5.2973, + "step": 17177 + }, + { + "epoch": 0.1021624322009706, + "grad_norm": 1.8488374948501587, + "learning_rate": 4.872352947988589e-05, + "loss": 5.4094, + "step": 17178 + }, + { + "epoch": 0.1021683794842516, + "grad_norm": 1.6702567338943481, + "learning_rate": 4.872338212807841e-05, + "loss": 5.5705, + "step": 17179 + }, + { + "epoch": 0.10217432676753259, + "grad_norm": 1.6559606790542603, + "learning_rate": 4.8723234767989345e-05, + "loss": 5.6637, + "step": 17180 + }, + { + "epoch": 0.10218027405081359, + "grad_norm": 1.523253321647644, + "learning_rate": 4.872308739961875e-05, + "loss": 5.4033, + "step": 17181 + }, + { + "epoch": 0.10218622133409458, + "grad_norm": 1.4300789833068848, + "learning_rate": 4.8722940022966665e-05, + "loss": 5.7568, + "step": 17182 + }, + { + "epoch": 0.10219216861737558, + "grad_norm": 1.5076279640197754, + "learning_rate": 4.872279263803314e-05, + "loss": 4.9469, + "step": 17183 + }, + { + "epoch": 0.10219811590065658, + "grad_norm": 1.721596598625183, + "learning_rate": 4.872264524481824e-05, + "loss": 5.1595, + "step": 17184 + }, + { + "epoch": 0.10220406318393757, + "grad_norm": 1.5876305103302002, + "learning_rate": 4.872249784332201e-05, + "loss": 4.9964, + "step": 17185 + }, + { + "epoch": 0.10221001046721857, + "grad_norm": 1.6709486246109009, + "learning_rate": 4.87223504335445e-05, + "loss": 5.0299, + "step": 17186 + }, + { + "epoch": 0.10221595775049958, + "grad_norm": 1.586411952972412, + "learning_rate": 4.872220301548576e-05, + "loss": 4.9945, + "step": 17187 + }, + { + "epoch": 0.10222190503378056, + "grad_norm": 1.541045069694519, + "learning_rate": 4.872205558914585e-05, + "loss": 4.8789, + "step": 17188 + }, + { + "epoch": 0.10222785231706157, + "grad_norm": 1.8977370262145996, + "learning_rate": 4.872190815452481e-05, + "loss": 4.849, + "step": 17189 + }, + { + "epoch": 0.10223379960034257, + "grad_norm": 1.7448357343673706, + "learning_rate": 4.87217607116227e-05, + "loss": 4.7961, + "step": 17190 + }, + { + "epoch": 0.10223974688362356, + "grad_norm": 1.7249553203582764, + "learning_rate": 4.872161326043957e-05, + "loss": 4.7988, + "step": 17191 + }, + { + "epoch": 0.10224569416690456, + "grad_norm": 1.6894437074661255, + "learning_rate": 4.8721465800975465e-05, + "loss": 4.6713, + "step": 17192 + }, + { + "epoch": 0.10225164145018556, + "grad_norm": 1.5226197242736816, + "learning_rate": 4.8721318333230446e-05, + "loss": 4.8233, + "step": 17193 + }, + { + "epoch": 0.10225758873346655, + "grad_norm": 1.6511256694793701, + "learning_rate": 4.8721170857204554e-05, + "loss": 5.177, + "step": 17194 + }, + { + "epoch": 0.10226353601674755, + "grad_norm": 1.8213993310928345, + "learning_rate": 4.872102337289785e-05, + "loss": 5.2472, + "step": 17195 + }, + { + "epoch": 0.10226948330002855, + "grad_norm": 1.6683803796768188, + "learning_rate": 4.872087588031038e-05, + "loss": 4.7902, + "step": 17196 + }, + { + "epoch": 0.10227543058330954, + "grad_norm": 1.5809015035629272, + "learning_rate": 4.8720728379442204e-05, + "loss": 4.6288, + "step": 17197 + }, + { + "epoch": 0.10228137786659054, + "grad_norm": 1.7978498935699463, + "learning_rate": 4.872058087029336e-05, + "loss": 4.6638, + "step": 17198 + }, + { + "epoch": 0.10228732514987154, + "grad_norm": 1.74656081199646, + "learning_rate": 4.87204333528639e-05, + "loss": 5.652, + "step": 17199 + }, + { + "epoch": 0.10229327243315253, + "grad_norm": 1.6222811937332153, + "learning_rate": 4.87202858271539e-05, + "loss": 5.3951, + "step": 17200 + }, + { + "epoch": 0.10229921971643353, + "grad_norm": 1.8816531896591187, + "learning_rate": 4.8720138293163374e-05, + "loss": 5.728, + "step": 17201 + }, + { + "epoch": 0.10230516699971454, + "grad_norm": 1.5618531703948975, + "learning_rate": 4.871999075089241e-05, + "loss": 5.7162, + "step": 17202 + }, + { + "epoch": 0.10231111428299552, + "grad_norm": 1.4562182426452637, + "learning_rate": 4.871984320034103e-05, + "loss": 5.7563, + "step": 17203 + }, + { + "epoch": 0.10231706156627653, + "grad_norm": 1.8649898767471313, + "learning_rate": 4.87196956415093e-05, + "loss": 5.6333, + "step": 17204 + }, + { + "epoch": 0.10232300884955753, + "grad_norm": 1.7934935092926025, + "learning_rate": 4.871954807439727e-05, + "loss": 5.5804, + "step": 17205 + }, + { + "epoch": 0.10232895613283852, + "grad_norm": 1.5005213022232056, + "learning_rate": 4.8719400499005e-05, + "loss": 5.2471, + "step": 17206 + }, + { + "epoch": 0.10233490341611952, + "grad_norm": 1.5418996810913086, + "learning_rate": 4.871925291533252e-05, + "loss": 6.0574, + "step": 17207 + }, + { + "epoch": 0.10234085069940052, + "grad_norm": 1.3919132947921753, + "learning_rate": 4.87191053233799e-05, + "loss": 6.0048, + "step": 17208 + }, + { + "epoch": 0.10234679798268151, + "grad_norm": 1.9565762281417847, + "learning_rate": 4.8718957723147184e-05, + "loss": 4.9914, + "step": 17209 + }, + { + "epoch": 0.10235274526596251, + "grad_norm": 2.3950796127319336, + "learning_rate": 4.871881011463442e-05, + "loss": 5.7963, + "step": 17210 + }, + { + "epoch": 0.1023586925492435, + "grad_norm": 2.0693960189819336, + "learning_rate": 4.871866249784167e-05, + "loss": 5.4641, + "step": 17211 + }, + { + "epoch": 0.1023646398325245, + "grad_norm": 2.105893850326538, + "learning_rate": 4.871851487276898e-05, + "loss": 5.3983, + "step": 17212 + }, + { + "epoch": 0.1023705871158055, + "grad_norm": 2.171363115310669, + "learning_rate": 4.8718367239416404e-05, + "loss": 5.6619, + "step": 17213 + }, + { + "epoch": 0.10237653439908649, + "grad_norm": 2.141611099243164, + "learning_rate": 4.8718219597783984e-05, + "loss": 5.5488, + "step": 17214 + }, + { + "epoch": 0.1023824816823675, + "grad_norm": 1.8755214214324951, + "learning_rate": 4.871807194787178e-05, + "loss": 5.4888, + "step": 17215 + }, + { + "epoch": 0.1023884289656485, + "grad_norm": 2.0865023136138916, + "learning_rate": 4.871792428967984e-05, + "loss": 5.4645, + "step": 17216 + }, + { + "epoch": 0.10239437624892948, + "grad_norm": 1.9486721754074097, + "learning_rate": 4.871777662320823e-05, + "loss": 5.4057, + "step": 17217 + }, + { + "epoch": 0.10240032353221049, + "grad_norm": 2.109412670135498, + "learning_rate": 4.8717628948456976e-05, + "loss": 5.3768, + "step": 17218 + }, + { + "epoch": 0.10240627081549149, + "grad_norm": 2.202826499938965, + "learning_rate": 4.871748126542615e-05, + "loss": 5.4996, + "step": 17219 + }, + { + "epoch": 0.10241221809877248, + "grad_norm": 1.8646687269210815, + "learning_rate": 4.87173335741158e-05, + "loss": 5.5151, + "step": 17220 + }, + { + "epoch": 0.10241816538205348, + "grad_norm": 1.7966501712799072, + "learning_rate": 4.8717185874525964e-05, + "loss": 5.5548, + "step": 17221 + }, + { + "epoch": 0.10242411266533448, + "grad_norm": 1.9538966417312622, + "learning_rate": 4.8717038166656706e-05, + "loss": 5.6221, + "step": 17222 + }, + { + "epoch": 0.10243005994861547, + "grad_norm": 1.6085959672927856, + "learning_rate": 4.871689045050808e-05, + "loss": 5.2468, + "step": 17223 + }, + { + "epoch": 0.10243600723189647, + "grad_norm": 1.7573461532592773, + "learning_rate": 4.871674272608012e-05, + "loss": 5.5835, + "step": 17224 + }, + { + "epoch": 0.10244195451517747, + "grad_norm": 1.8237701654434204, + "learning_rate": 4.87165949933729e-05, + "loss": 5.3537, + "step": 17225 + }, + { + "epoch": 0.10244790179845846, + "grad_norm": 1.963970422744751, + "learning_rate": 4.8716447252386465e-05, + "loss": 5.5714, + "step": 17226 + }, + { + "epoch": 0.10245384908173946, + "grad_norm": 2.0216476917266846, + "learning_rate": 4.871629950312086e-05, + "loss": 5.4889, + "step": 17227 + }, + { + "epoch": 0.10245979636502046, + "grad_norm": 2.0271217823028564, + "learning_rate": 4.871615174557614e-05, + "loss": 5.5903, + "step": 17228 + }, + { + "epoch": 0.10246574364830145, + "grad_norm": 1.7717560529708862, + "learning_rate": 4.871600397975236e-05, + "loss": 5.3989, + "step": 17229 + }, + { + "epoch": 0.10247169093158245, + "grad_norm": 1.722076416015625, + "learning_rate": 4.8715856205649556e-05, + "loss": 5.526, + "step": 17230 + }, + { + "epoch": 0.10247763821486346, + "grad_norm": 2.124905586242676, + "learning_rate": 4.8715708423267805e-05, + "loss": 5.3835, + "step": 17231 + }, + { + "epoch": 0.10248358549814444, + "grad_norm": 2.2088522911071777, + "learning_rate": 4.8715560632607135e-05, + "loss": 5.5228, + "step": 17232 + }, + { + "epoch": 0.10248953278142545, + "grad_norm": 2.0236847400665283, + "learning_rate": 4.871541283366761e-05, + "loss": 5.3851, + "step": 17233 + }, + { + "epoch": 0.10249548006470645, + "grad_norm": 1.7546913623809814, + "learning_rate": 4.871526502644928e-05, + "loss": 5.2, + "step": 17234 + }, + { + "epoch": 0.10250142734798744, + "grad_norm": 1.9796072244644165, + "learning_rate": 4.87151172109522e-05, + "loss": 5.3873, + "step": 17235 + }, + { + "epoch": 0.10250737463126844, + "grad_norm": 1.5305960178375244, + "learning_rate": 4.8714969387176414e-05, + "loss": 5.1888, + "step": 17236 + }, + { + "epoch": 0.10251332191454944, + "grad_norm": 2.007124185562134, + "learning_rate": 4.871482155512198e-05, + "loss": 5.4024, + "step": 17237 + }, + { + "epoch": 0.10251926919783043, + "grad_norm": 1.8268414735794067, + "learning_rate": 4.871467371478894e-05, + "loss": 5.4289, + "step": 17238 + }, + { + "epoch": 0.10252521648111143, + "grad_norm": 1.9826276302337646, + "learning_rate": 4.871452586617736e-05, + "loss": 5.3222, + "step": 17239 + }, + { + "epoch": 0.10253116376439242, + "grad_norm": 1.7642468214035034, + "learning_rate": 4.8714378009287285e-05, + "loss": 5.3858, + "step": 17240 + }, + { + "epoch": 0.10253711104767342, + "grad_norm": 1.9604185819625854, + "learning_rate": 4.8714230144118764e-05, + "loss": 5.4142, + "step": 17241 + }, + { + "epoch": 0.10254305833095442, + "grad_norm": 2.333829402923584, + "learning_rate": 4.8714082270671844e-05, + "loss": 5.2124, + "step": 17242 + }, + { + "epoch": 0.10254900561423541, + "grad_norm": 1.996928095817566, + "learning_rate": 4.8713934388946593e-05, + "loss": 5.5055, + "step": 17243 + }, + { + "epoch": 0.10255495289751641, + "grad_norm": 2.2702581882476807, + "learning_rate": 4.871378649894304e-05, + "loss": 5.3477, + "step": 17244 + }, + { + "epoch": 0.10256090018079742, + "grad_norm": 1.9696896076202393, + "learning_rate": 4.871363860066126e-05, + "loss": 5.39, + "step": 17245 + }, + { + "epoch": 0.1025668474640784, + "grad_norm": 1.7752536535263062, + "learning_rate": 4.871349069410129e-05, + "loss": 5.326, + "step": 17246 + }, + { + "epoch": 0.1025727947473594, + "grad_norm": 1.798829197883606, + "learning_rate": 4.8713342779263184e-05, + "loss": 5.4066, + "step": 17247 + }, + { + "epoch": 0.10257874203064041, + "grad_norm": 1.975467562675476, + "learning_rate": 4.871319485614699e-05, + "loss": 5.4183, + "step": 17248 + }, + { + "epoch": 0.1025846893139214, + "grad_norm": 2.4021782875061035, + "learning_rate": 4.871304692475277e-05, + "loss": 5.3949, + "step": 17249 + }, + { + "epoch": 0.1025906365972024, + "grad_norm": 1.8973580598831177, + "learning_rate": 4.871289898508058e-05, + "loss": 5.437, + "step": 17250 + }, + { + "epoch": 0.1025965838804834, + "grad_norm": 2.3427937030792236, + "learning_rate": 4.8712751037130446e-05, + "loss": 5.4347, + "step": 17251 + }, + { + "epoch": 0.10260253116376439, + "grad_norm": 1.8699359893798828, + "learning_rate": 4.871260308090245e-05, + "loss": 5.3404, + "step": 17252 + }, + { + "epoch": 0.10260847844704539, + "grad_norm": 2.146106719970703, + "learning_rate": 4.871245511639661e-05, + "loss": 5.3664, + "step": 17253 + }, + { + "epoch": 0.10261442573032639, + "grad_norm": 2.0223419666290283, + "learning_rate": 4.871230714361302e-05, + "loss": 5.4117, + "step": 17254 + }, + { + "epoch": 0.10262037301360738, + "grad_norm": 2.036025047302246, + "learning_rate": 4.871215916255169e-05, + "loss": 5.4349, + "step": 17255 + }, + { + "epoch": 0.10262632029688838, + "grad_norm": 2.0085432529449463, + "learning_rate": 4.87120111732127e-05, + "loss": 5.4896, + "step": 17256 + }, + { + "epoch": 0.10263226758016938, + "grad_norm": 2.088165521621704, + "learning_rate": 4.871186317559609e-05, + "loss": 5.2516, + "step": 17257 + }, + { + "epoch": 0.10263821486345037, + "grad_norm": 1.7493584156036377, + "learning_rate": 4.871171516970191e-05, + "loss": 5.0744, + "step": 17258 + }, + { + "epoch": 0.10264416214673137, + "grad_norm": 1.9395314455032349, + "learning_rate": 4.8711567155530224e-05, + "loss": 5.2783, + "step": 17259 + }, + { + "epoch": 0.10265010943001238, + "grad_norm": 2.057565689086914, + "learning_rate": 4.871141913308107e-05, + "loss": 5.2501, + "step": 17260 + }, + { + "epoch": 0.10265605671329336, + "grad_norm": 2.159641742706299, + "learning_rate": 4.87112711023545e-05, + "loss": 5.2844, + "step": 17261 + }, + { + "epoch": 0.10266200399657437, + "grad_norm": 1.8931914567947388, + "learning_rate": 4.8711123063350575e-05, + "loss": 5.4454, + "step": 17262 + }, + { + "epoch": 0.10266795127985537, + "grad_norm": 1.9728927612304688, + "learning_rate": 4.871097501606934e-05, + "loss": 5.3719, + "step": 17263 + }, + { + "epoch": 0.10267389856313636, + "grad_norm": 1.8770530223846436, + "learning_rate": 4.8710826960510845e-05, + "loss": 5.4244, + "step": 17264 + }, + { + "epoch": 0.10267984584641736, + "grad_norm": 2.072201728820801, + "learning_rate": 4.871067889667516e-05, + "loss": 5.3282, + "step": 17265 + }, + { + "epoch": 0.10268579312969836, + "grad_norm": 2.16689133644104, + "learning_rate": 4.8710530824562304e-05, + "loss": 5.4205, + "step": 17266 + }, + { + "epoch": 0.10269174041297935, + "grad_norm": 2.017695903778076, + "learning_rate": 4.8710382744172354e-05, + "loss": 5.1803, + "step": 17267 + }, + { + "epoch": 0.10269768769626035, + "grad_norm": 1.8181023597717285, + "learning_rate": 4.871023465550535e-05, + "loss": 5.3418, + "step": 17268 + }, + { + "epoch": 0.10270363497954134, + "grad_norm": 1.9661909341812134, + "learning_rate": 4.871008655856136e-05, + "loss": 5.115, + "step": 17269 + }, + { + "epoch": 0.10270958226282234, + "grad_norm": 1.9482250213623047, + "learning_rate": 4.870993845334041e-05, + "loss": 5.0172, + "step": 17270 + }, + { + "epoch": 0.10271552954610334, + "grad_norm": 2.0916497707366943, + "learning_rate": 4.870979033984257e-05, + "loss": 5.4317, + "step": 17271 + }, + { + "epoch": 0.10272147682938433, + "grad_norm": 1.919918417930603, + "learning_rate": 4.8709642218067894e-05, + "loss": 5.3986, + "step": 17272 + }, + { + "epoch": 0.10272742411266533, + "grad_norm": 1.8286259174346924, + "learning_rate": 4.870949408801642e-05, + "loss": 5.1301, + "step": 17273 + }, + { + "epoch": 0.10273337139594634, + "grad_norm": 2.2312278747558594, + "learning_rate": 4.870934594968821e-05, + "loss": 5.0839, + "step": 17274 + }, + { + "epoch": 0.10273931867922732, + "grad_norm": 2.2795724868774414, + "learning_rate": 4.870919780308331e-05, + "loss": 5.3578, + "step": 17275 + }, + { + "epoch": 0.10274526596250833, + "grad_norm": 2.253885269165039, + "learning_rate": 4.870904964820178e-05, + "loss": 5.2482, + "step": 17276 + }, + { + "epoch": 0.10275121324578933, + "grad_norm": 1.9351953268051147, + "learning_rate": 4.870890148504366e-05, + "loss": 5.3657, + "step": 17277 + }, + { + "epoch": 0.10275716052907032, + "grad_norm": 2.072274923324585, + "learning_rate": 4.8708753313609004e-05, + "loss": 5.2433, + "step": 17278 + }, + { + "epoch": 0.10276310781235132, + "grad_norm": 2.0419273376464844, + "learning_rate": 4.8708605133897874e-05, + "loss": 5.27, + "step": 17279 + }, + { + "epoch": 0.10276905509563232, + "grad_norm": 2.156855821609497, + "learning_rate": 4.870845694591031e-05, + "loss": 5.1727, + "step": 17280 + }, + { + "epoch": 0.10277500237891331, + "grad_norm": 1.6552194356918335, + "learning_rate": 4.870830874964637e-05, + "loss": 5.0872, + "step": 17281 + }, + { + "epoch": 0.10278094966219431, + "grad_norm": 1.8167924880981445, + "learning_rate": 4.870816054510611e-05, + "loss": 5.2827, + "step": 17282 + }, + { + "epoch": 0.10278689694547531, + "grad_norm": 2.1617610454559326, + "learning_rate": 4.870801233228956e-05, + "loss": 5.1375, + "step": 17283 + }, + { + "epoch": 0.1027928442287563, + "grad_norm": 1.918817162513733, + "learning_rate": 4.87078641111968e-05, + "loss": 5.2945, + "step": 17284 + }, + { + "epoch": 0.1027987915120373, + "grad_norm": 1.5282881259918213, + "learning_rate": 4.870771588182788e-05, + "loss": 5.6653, + "step": 17285 + }, + { + "epoch": 0.1028047387953183, + "grad_norm": 1.7902590036392212, + "learning_rate": 4.8707567644182825e-05, + "loss": 5.6262, + "step": 17286 + }, + { + "epoch": 0.10281068607859929, + "grad_norm": 1.9451625347137451, + "learning_rate": 4.87074193982617e-05, + "loss": 5.1153, + "step": 17287 + }, + { + "epoch": 0.1028166333618803, + "grad_norm": 1.832401156425476, + "learning_rate": 4.870727114406457e-05, + "loss": 5.2928, + "step": 17288 + }, + { + "epoch": 0.1028225806451613, + "grad_norm": 1.645761251449585, + "learning_rate": 4.870712288159147e-05, + "loss": 5.649, + "step": 17289 + }, + { + "epoch": 0.10282852792844228, + "grad_norm": 1.6721855401992798, + "learning_rate": 4.8706974610842474e-05, + "loss": 5.7568, + "step": 17290 + }, + { + "epoch": 0.10283447521172329, + "grad_norm": 1.7489598989486694, + "learning_rate": 4.87068263318176e-05, + "loss": 5.6752, + "step": 17291 + }, + { + "epoch": 0.10284042249500429, + "grad_norm": 1.505332112312317, + "learning_rate": 4.870667804451693e-05, + "loss": 5.2993, + "step": 17292 + }, + { + "epoch": 0.10284636977828528, + "grad_norm": 1.3620814085006714, + "learning_rate": 4.870652974894049e-05, + "loss": 4.7225, + "step": 17293 + }, + { + "epoch": 0.10285231706156628, + "grad_norm": 2.1685922145843506, + "learning_rate": 4.8706381445088356e-05, + "loss": 4.8737, + "step": 17294 + }, + { + "epoch": 0.10285826434484728, + "grad_norm": 2.219942331314087, + "learning_rate": 4.8706233132960566e-05, + "loss": 5.7529, + "step": 17295 + }, + { + "epoch": 0.10286421162812827, + "grad_norm": 1.928809404373169, + "learning_rate": 4.8706084812557176e-05, + "loss": 5.803, + "step": 17296 + }, + { + "epoch": 0.10287015891140927, + "grad_norm": 1.8534711599349976, + "learning_rate": 4.870593648387823e-05, + "loss": 5.9403, + "step": 17297 + }, + { + "epoch": 0.10287610619469026, + "grad_norm": 2.2624459266662598, + "learning_rate": 4.87057881469238e-05, + "loss": 5.1227, + "step": 17298 + }, + { + "epoch": 0.10288205347797126, + "grad_norm": 2.4320240020751953, + "learning_rate": 4.870563980169391e-05, + "loss": 4.9701, + "step": 17299 + }, + { + "epoch": 0.10288800076125226, + "grad_norm": 2.664921760559082, + "learning_rate": 4.870549144818864e-05, + "loss": 4.8771, + "step": 17300 + }, + { + "epoch": 0.10289394804453325, + "grad_norm": 2.2558987140655518, + "learning_rate": 4.870534308640802e-05, + "loss": 5.0682, + "step": 17301 + }, + { + "epoch": 0.10289989532781425, + "grad_norm": 2.291553258895874, + "learning_rate": 4.870519471635211e-05, + "loss": 4.8481, + "step": 17302 + }, + { + "epoch": 0.10290584261109526, + "grad_norm": 1.9109137058258057, + "learning_rate": 4.870504633802096e-05, + "loss": 5.377, + "step": 17303 + }, + { + "epoch": 0.10291178989437624, + "grad_norm": 1.6809476613998413, + "learning_rate": 4.870489795141463e-05, + "loss": 5.5337, + "step": 17304 + }, + { + "epoch": 0.10291773717765725, + "grad_norm": 1.6410505771636963, + "learning_rate": 4.870474955653316e-05, + "loss": 5.5353, + "step": 17305 + }, + { + "epoch": 0.10292368446093825, + "grad_norm": 1.6310313940048218, + "learning_rate": 4.87046011533766e-05, + "loss": 5.4727, + "step": 17306 + }, + { + "epoch": 0.10292963174421924, + "grad_norm": 1.6450475454330444, + "learning_rate": 4.8704452741945015e-05, + "loss": 5.3677, + "step": 17307 + }, + { + "epoch": 0.10293557902750024, + "grad_norm": 1.7327302694320679, + "learning_rate": 4.870430432223846e-05, + "loss": 5.2964, + "step": 17308 + }, + { + "epoch": 0.10294152631078124, + "grad_norm": 2.837498426437378, + "learning_rate": 4.870415589425696e-05, + "loss": 4.7407, + "step": 17309 + }, + { + "epoch": 0.10294747359406223, + "grad_norm": 2.326399803161621, + "learning_rate": 4.8704007458000593e-05, + "loss": 4.8998, + "step": 17310 + }, + { + "epoch": 0.10295342087734323, + "grad_norm": 1.9505521059036255, + "learning_rate": 4.87038590134694e-05, + "loss": 5.438, + "step": 17311 + }, + { + "epoch": 0.10295936816062423, + "grad_norm": 1.690581202507019, + "learning_rate": 4.870371056066344e-05, + "loss": 5.4291, + "step": 17312 + }, + { + "epoch": 0.10296531544390522, + "grad_norm": 1.9977236986160278, + "learning_rate": 4.870356209958276e-05, + "loss": 5.81, + "step": 17313 + }, + { + "epoch": 0.10297126272718622, + "grad_norm": 1.7996702194213867, + "learning_rate": 4.8703413630227405e-05, + "loss": 5.7569, + "step": 17314 + }, + { + "epoch": 0.10297721001046722, + "grad_norm": 1.7594531774520874, + "learning_rate": 4.870326515259743e-05, + "loss": 5.9367, + "step": 17315 + }, + { + "epoch": 0.10298315729374821, + "grad_norm": 1.8434146642684937, + "learning_rate": 4.870311666669289e-05, + "loss": 5.1578, + "step": 17316 + }, + { + "epoch": 0.10298910457702921, + "grad_norm": 2.531515598297119, + "learning_rate": 4.870296817251385e-05, + "loss": 5.0574, + "step": 17317 + }, + { + "epoch": 0.10299505186031022, + "grad_norm": 2.2126452922821045, + "learning_rate": 4.870281967006034e-05, + "loss": 4.9034, + "step": 17318 + }, + { + "epoch": 0.1030009991435912, + "grad_norm": 2.391558885574341, + "learning_rate": 4.870267115933242e-05, + "loss": 4.9584, + "step": 17319 + }, + { + "epoch": 0.1030069464268722, + "grad_norm": 1.9653453826904297, + "learning_rate": 4.8702522640330145e-05, + "loss": 4.9569, + "step": 17320 + }, + { + "epoch": 0.10301289371015321, + "grad_norm": 2.0124504566192627, + "learning_rate": 4.870237411305356e-05, + "loss": 4.9237, + "step": 17321 + }, + { + "epoch": 0.1030188409934342, + "grad_norm": 1.9120689630508423, + "learning_rate": 4.8702225577502724e-05, + "loss": 4.9637, + "step": 17322 + }, + { + "epoch": 0.1030247882767152, + "grad_norm": 2.108009099960327, + "learning_rate": 4.8702077033677684e-05, + "loss": 4.9479, + "step": 17323 + }, + { + "epoch": 0.1030307355599962, + "grad_norm": 2.211385488510132, + "learning_rate": 4.8701928481578494e-05, + "loss": 4.9553, + "step": 17324 + }, + { + "epoch": 0.10303668284327719, + "grad_norm": 2.1452252864837646, + "learning_rate": 4.8701779921205215e-05, + "loss": 4.7809, + "step": 17325 + }, + { + "epoch": 0.10304263012655819, + "grad_norm": 2.126650810241699, + "learning_rate": 4.8701631352557874e-05, + "loss": 4.7027, + "step": 17326 + }, + { + "epoch": 0.10304857740983918, + "grad_norm": 1.9753129482269287, + "learning_rate": 4.870148277563655e-05, + "loss": 4.8073, + "step": 17327 + }, + { + "epoch": 0.10305452469312018, + "grad_norm": 2.013455867767334, + "learning_rate": 4.8701334190441284e-05, + "loss": 4.7989, + "step": 17328 + }, + { + "epoch": 0.10306047197640118, + "grad_norm": 2.2819676399230957, + "learning_rate": 4.8701185596972124e-05, + "loss": 4.7784, + "step": 17329 + }, + { + "epoch": 0.10306641925968217, + "grad_norm": 2.050511360168457, + "learning_rate": 4.870103699522912e-05, + "loss": 4.9621, + "step": 17330 + }, + { + "epoch": 0.10307236654296317, + "grad_norm": 2.422591209411621, + "learning_rate": 4.870088838521233e-05, + "loss": 4.7558, + "step": 17331 + }, + { + "epoch": 0.10307831382624418, + "grad_norm": 2.2109572887420654, + "learning_rate": 4.870073976692181e-05, + "loss": 4.7162, + "step": 17332 + }, + { + "epoch": 0.10308426110952516, + "grad_norm": 2.070526123046875, + "learning_rate": 4.8700591140357596e-05, + "loss": 4.9765, + "step": 17333 + }, + { + "epoch": 0.10309020839280617, + "grad_norm": 1.610152244567871, + "learning_rate": 4.870044250551976e-05, + "loss": 5.9361, + "step": 17334 + }, + { + "epoch": 0.10309615567608717, + "grad_norm": 1.8921641111373901, + "learning_rate": 4.870029386240834e-05, + "loss": 4.9423, + "step": 17335 + }, + { + "epoch": 0.10310210295936816, + "grad_norm": 2.07476806640625, + "learning_rate": 4.870014521102339e-05, + "loss": 4.7742, + "step": 17336 + }, + { + "epoch": 0.10310805024264916, + "grad_norm": 2.021850824356079, + "learning_rate": 4.869999655136498e-05, + "loss": 4.8182, + "step": 17337 + }, + { + "epoch": 0.10311399752593016, + "grad_norm": 1.5896223783493042, + "learning_rate": 4.869984788343314e-05, + "loss": 5.5694, + "step": 17338 + }, + { + "epoch": 0.10311994480921115, + "grad_norm": 1.1907202005386353, + "learning_rate": 4.869969920722792e-05, + "loss": 5.4427, + "step": 17339 + }, + { + "epoch": 0.10312589209249215, + "grad_norm": 1.56050443649292, + "learning_rate": 4.869955052274938e-05, + "loss": 5.2405, + "step": 17340 + }, + { + "epoch": 0.10313183937577315, + "grad_norm": 1.6611580848693848, + "learning_rate": 4.869940182999757e-05, + "loss": 5.1457, + "step": 17341 + }, + { + "epoch": 0.10313778665905414, + "grad_norm": 1.4664785861968994, + "learning_rate": 4.869925312897256e-05, + "loss": 5.2846, + "step": 17342 + }, + { + "epoch": 0.10314373394233514, + "grad_norm": 1.9751476049423218, + "learning_rate": 4.8699104419674366e-05, + "loss": 5.0283, + "step": 17343 + }, + { + "epoch": 0.10314968122561614, + "grad_norm": 1.715144157409668, + "learning_rate": 4.869895570210307e-05, + "loss": 4.8856, + "step": 17344 + }, + { + "epoch": 0.10315562850889713, + "grad_norm": 1.7803713083267212, + "learning_rate": 4.8698806976258704e-05, + "loss": 5.5573, + "step": 17345 + }, + { + "epoch": 0.10316157579217813, + "grad_norm": 1.4687060117721558, + "learning_rate": 4.8698658242141336e-05, + "loss": 5.2287, + "step": 17346 + }, + { + "epoch": 0.10316752307545914, + "grad_norm": 1.6236404180526733, + "learning_rate": 4.869850949975101e-05, + "loss": 5.1, + "step": 17347 + }, + { + "epoch": 0.10317347035874012, + "grad_norm": 1.6414464712142944, + "learning_rate": 4.869836074908778e-05, + "loss": 5.0884, + "step": 17348 + }, + { + "epoch": 0.10317941764202113, + "grad_norm": 1.5938411951065063, + "learning_rate": 4.86982119901517e-05, + "loss": 5.9405, + "step": 17349 + }, + { + "epoch": 0.10318536492530213, + "grad_norm": 1.7434169054031372, + "learning_rate": 4.869806322294282e-05, + "loss": 6.3698, + "step": 17350 + }, + { + "epoch": 0.10319131220858312, + "grad_norm": 1.4999836683273315, + "learning_rate": 4.8697914447461185e-05, + "loss": 5.4169, + "step": 17351 + }, + { + "epoch": 0.10319725949186412, + "grad_norm": 1.768048644065857, + "learning_rate": 4.869776566370686e-05, + "loss": 5.6703, + "step": 17352 + }, + { + "epoch": 0.10320320677514512, + "grad_norm": 1.734729528427124, + "learning_rate": 4.869761687167988e-05, + "loss": 5.6454, + "step": 17353 + }, + { + "epoch": 0.10320915405842611, + "grad_norm": 1.848308801651001, + "learning_rate": 4.869746807138031e-05, + "loss": 5.742, + "step": 17354 + }, + { + "epoch": 0.10321510134170711, + "grad_norm": 1.628144383430481, + "learning_rate": 4.8697319262808205e-05, + "loss": 5.6099, + "step": 17355 + }, + { + "epoch": 0.1032210486249881, + "grad_norm": 1.5005884170532227, + "learning_rate": 4.86971704459636e-05, + "loss": 5.5419, + "step": 17356 + }, + { + "epoch": 0.1032269959082691, + "grad_norm": 1.5255531072616577, + "learning_rate": 4.869702162084657e-05, + "loss": 5.4757, + "step": 17357 + }, + { + "epoch": 0.1032329431915501, + "grad_norm": 1.549132227897644, + "learning_rate": 4.869687278745715e-05, + "loss": 5.4757, + "step": 17358 + }, + { + "epoch": 0.10323889047483109, + "grad_norm": 1.6518296003341675, + "learning_rate": 4.869672394579539e-05, + "loss": 5.5803, + "step": 17359 + }, + { + "epoch": 0.10324483775811209, + "grad_norm": 2.3987839221954346, + "learning_rate": 4.869657509586136e-05, + "loss": 5.0978, + "step": 17360 + }, + { + "epoch": 0.1032507850413931, + "grad_norm": 1.7290594577789307, + "learning_rate": 4.869642623765509e-05, + "loss": 5.4998, + "step": 17361 + }, + { + "epoch": 0.10325673232467408, + "grad_norm": 1.6334084272384644, + "learning_rate": 4.869627737117665e-05, + "loss": 5.4695, + "step": 17362 + }, + { + "epoch": 0.10326267960795509, + "grad_norm": 1.609734296798706, + "learning_rate": 4.8696128496426074e-05, + "loss": 5.4406, + "step": 17363 + }, + { + "epoch": 0.10326862689123609, + "grad_norm": 1.7579066753387451, + "learning_rate": 4.869597961340343e-05, + "loss": 5.6412, + "step": 17364 + }, + { + "epoch": 0.10327457417451708, + "grad_norm": 1.8831701278686523, + "learning_rate": 4.869583072210877e-05, + "loss": 5.444, + "step": 17365 + }, + { + "epoch": 0.10328052145779808, + "grad_norm": 1.9597128629684448, + "learning_rate": 4.869568182254214e-05, + "loss": 5.2228, + "step": 17366 + }, + { + "epoch": 0.10328646874107908, + "grad_norm": 1.8867931365966797, + "learning_rate": 4.8695532914703584e-05, + "loss": 4.9979, + "step": 17367 + }, + { + "epoch": 0.10329241602436007, + "grad_norm": 1.5480263233184814, + "learning_rate": 4.869538399859317e-05, + "loss": 5.6457, + "step": 17368 + }, + { + "epoch": 0.10329836330764107, + "grad_norm": 1.6710255146026611, + "learning_rate": 4.869523507421093e-05, + "loss": 5.774, + "step": 17369 + }, + { + "epoch": 0.10330431059092207, + "grad_norm": 1.6559721231460571, + "learning_rate": 4.869508614155695e-05, + "loss": 5.5643, + "step": 17370 + }, + { + "epoch": 0.10331025787420306, + "grad_norm": 1.4451355934143066, + "learning_rate": 4.869493720063124e-05, + "loss": 5.4598, + "step": 17371 + }, + { + "epoch": 0.10331620515748406, + "grad_norm": 1.8376599550247192, + "learning_rate": 4.869478825143388e-05, + "loss": 4.7552, + "step": 17372 + }, + { + "epoch": 0.10332215244076506, + "grad_norm": 2.0193891525268555, + "learning_rate": 4.869463929396491e-05, + "loss": 4.5671, + "step": 17373 + }, + { + "epoch": 0.10332809972404605, + "grad_norm": 2.07692551612854, + "learning_rate": 4.869449032822439e-05, + "loss": 4.4776, + "step": 17374 + }, + { + "epoch": 0.10333404700732705, + "grad_norm": 1.820893406867981, + "learning_rate": 4.869434135421237e-05, + "loss": 5.4705, + "step": 17375 + }, + { + "epoch": 0.10333999429060806, + "grad_norm": 1.7207856178283691, + "learning_rate": 4.86941923719289e-05, + "loss": 4.8619, + "step": 17376 + }, + { + "epoch": 0.10334594157388904, + "grad_norm": 1.9348174333572388, + "learning_rate": 4.8694043381374026e-05, + "loss": 4.3723, + "step": 17377 + }, + { + "epoch": 0.10335188885717005, + "grad_norm": 1.8993666172027588, + "learning_rate": 4.869389438254781e-05, + "loss": 4.5442, + "step": 17378 + }, + { + "epoch": 0.10335783614045105, + "grad_norm": 1.9089124202728271, + "learning_rate": 4.869374537545031e-05, + "loss": 4.3347, + "step": 17379 + }, + { + "epoch": 0.10336378342373204, + "grad_norm": 1.8560502529144287, + "learning_rate": 4.869359636008155e-05, + "loss": 4.312, + "step": 17380 + }, + { + "epoch": 0.10336973070701304, + "grad_norm": 1.909680962562561, + "learning_rate": 4.8693447336441614e-05, + "loss": 4.3109, + "step": 17381 + }, + { + "epoch": 0.10337567799029404, + "grad_norm": 1.7769371271133423, + "learning_rate": 4.8693298304530535e-05, + "loss": 4.4442, + "step": 17382 + }, + { + "epoch": 0.10338162527357503, + "grad_norm": 2.080097198486328, + "learning_rate": 4.869314926434837e-05, + "loss": 4.339, + "step": 17383 + }, + { + "epoch": 0.10338757255685603, + "grad_norm": 1.8703278303146362, + "learning_rate": 4.8693000215895176e-05, + "loss": 4.4124, + "step": 17384 + }, + { + "epoch": 0.10339351984013702, + "grad_norm": 1.9553934335708618, + "learning_rate": 4.869285115917099e-05, + "loss": 4.3571, + "step": 17385 + }, + { + "epoch": 0.10339946712341802, + "grad_norm": 1.8989006280899048, + "learning_rate": 4.869270209417588e-05, + "loss": 4.4108, + "step": 17386 + }, + { + "epoch": 0.10340541440669902, + "grad_norm": 1.8347021341323853, + "learning_rate": 4.8692553020909896e-05, + "loss": 4.1529, + "step": 17387 + }, + { + "epoch": 0.10341136168998001, + "grad_norm": 1.9458621740341187, + "learning_rate": 4.869240393937309e-05, + "loss": 4.2392, + "step": 17388 + }, + { + "epoch": 0.10341730897326101, + "grad_norm": 1.8578664064407349, + "learning_rate": 4.86922548495655e-05, + "loss": 4.3238, + "step": 17389 + }, + { + "epoch": 0.10342325625654201, + "grad_norm": 1.9359874725341797, + "learning_rate": 4.869210575148719e-05, + "loss": 4.56, + "step": 17390 + }, + { + "epoch": 0.103429203539823, + "grad_norm": 2.0030486583709717, + "learning_rate": 4.869195664513822e-05, + "loss": 4.1571, + "step": 17391 + }, + { + "epoch": 0.103435150823104, + "grad_norm": 1.9431639909744263, + "learning_rate": 4.869180753051863e-05, + "loss": 4.2181, + "step": 17392 + }, + { + "epoch": 0.10344109810638501, + "grad_norm": 1.9171335697174072, + "learning_rate": 4.869165840762847e-05, + "loss": 4.3139, + "step": 17393 + }, + { + "epoch": 0.103447045389666, + "grad_norm": 1.9467666149139404, + "learning_rate": 4.86915092764678e-05, + "loss": 4.3906, + "step": 17394 + }, + { + "epoch": 0.103452992672947, + "grad_norm": 2.1354262828826904, + "learning_rate": 4.8691360137036666e-05, + "loss": 4.3407, + "step": 17395 + }, + { + "epoch": 0.103458939956228, + "grad_norm": 1.7994540929794312, + "learning_rate": 4.8691210989335126e-05, + "loss": 4.5767, + "step": 17396 + }, + { + "epoch": 0.10346488723950899, + "grad_norm": 1.8322330713272095, + "learning_rate": 4.869106183336323e-05, + "loss": 4.62, + "step": 17397 + }, + { + "epoch": 0.10347083452278999, + "grad_norm": 1.9874459505081177, + "learning_rate": 4.869091266912102e-05, + "loss": 4.2579, + "step": 17398 + }, + { + "epoch": 0.10347678180607099, + "grad_norm": 1.8300455808639526, + "learning_rate": 4.869076349660856e-05, + "loss": 4.3049, + "step": 17399 + }, + { + "epoch": 0.10348272908935198, + "grad_norm": 1.8731672763824463, + "learning_rate": 4.8690614315825914e-05, + "loss": 4.3241, + "step": 17400 + }, + { + "epoch": 0.10348867637263298, + "grad_norm": 1.8587061166763306, + "learning_rate": 4.86904651267731e-05, + "loss": 4.2513, + "step": 17401 + }, + { + "epoch": 0.10349462365591398, + "grad_norm": 1.8614505529403687, + "learning_rate": 4.86903159294502e-05, + "loss": 4.2877, + "step": 17402 + }, + { + "epoch": 0.10350057093919497, + "grad_norm": 1.7118782997131348, + "learning_rate": 4.869016672385725e-05, + "loss": 5.951, + "step": 17403 + }, + { + "epoch": 0.10350651822247597, + "grad_norm": 1.6701730489730835, + "learning_rate": 4.869001750999431e-05, + "loss": 5.8099, + "step": 17404 + }, + { + "epoch": 0.10351246550575698, + "grad_norm": 1.4960297346115112, + "learning_rate": 4.868986828786143e-05, + "loss": 5.7589, + "step": 17405 + }, + { + "epoch": 0.10351841278903796, + "grad_norm": 1.3732372522354126, + "learning_rate": 4.868971905745866e-05, + "loss": 5.8552, + "step": 17406 + }, + { + "epoch": 0.10352436007231897, + "grad_norm": 1.5108624696731567, + "learning_rate": 4.868956981878606e-05, + "loss": 5.82, + "step": 17407 + }, + { + "epoch": 0.10353030735559997, + "grad_norm": 1.8640809059143066, + "learning_rate": 4.868942057184367e-05, + "loss": 5.4388, + "step": 17408 + }, + { + "epoch": 0.10353625463888096, + "grad_norm": 2.082534074783325, + "learning_rate": 4.868927131663154e-05, + "loss": 4.3796, + "step": 17409 + }, + { + "epoch": 0.10354220192216196, + "grad_norm": 1.8963665962219238, + "learning_rate": 4.868912205314975e-05, + "loss": 5.6469, + "step": 17410 + }, + { + "epoch": 0.10354814920544296, + "grad_norm": 1.7797149419784546, + "learning_rate": 4.868897278139832e-05, + "loss": 5.6187, + "step": 17411 + }, + { + "epoch": 0.10355409648872395, + "grad_norm": 1.8464981317520142, + "learning_rate": 4.868882350137732e-05, + "loss": 4.8464, + "step": 17412 + }, + { + "epoch": 0.10356004377200495, + "grad_norm": 1.5401747226715088, + "learning_rate": 4.8688674213086794e-05, + "loss": 5.3547, + "step": 17413 + }, + { + "epoch": 0.10356599105528594, + "grad_norm": 1.4159618616104126, + "learning_rate": 4.868852491652679e-05, + "loss": 5.4428, + "step": 17414 + }, + { + "epoch": 0.10357193833856694, + "grad_norm": 1.6561527252197266, + "learning_rate": 4.868837561169738e-05, + "loss": 5.6467, + "step": 17415 + }, + { + "epoch": 0.10357788562184794, + "grad_norm": 1.659527063369751, + "learning_rate": 4.8688226298598586e-05, + "loss": 5.8631, + "step": 17416 + }, + { + "epoch": 0.10358383290512893, + "grad_norm": 1.8206923007965088, + "learning_rate": 4.868807697723049e-05, + "loss": 5.6475, + "step": 17417 + }, + { + "epoch": 0.10358978018840993, + "grad_norm": 1.9741102457046509, + "learning_rate": 4.868792764759312e-05, + "loss": 4.633, + "step": 17418 + }, + { + "epoch": 0.10359572747169093, + "grad_norm": 1.9505152702331543, + "learning_rate": 4.8687778309686546e-05, + "loss": 4.4024, + "step": 17419 + }, + { + "epoch": 0.10360167475497192, + "grad_norm": 1.7461168766021729, + "learning_rate": 4.868762896351082e-05, + "loss": 5.6505, + "step": 17420 + }, + { + "epoch": 0.10360762203825293, + "grad_norm": 1.6750074625015259, + "learning_rate": 4.868747960906598e-05, + "loss": 5.7747, + "step": 17421 + }, + { + "epoch": 0.10361356932153393, + "grad_norm": 1.5986868143081665, + "learning_rate": 4.8687330246352085e-05, + "loss": 5.2086, + "step": 17422 + }, + { + "epoch": 0.10361951660481492, + "grad_norm": 1.5743950605392456, + "learning_rate": 4.868718087536919e-05, + "loss": 5.6462, + "step": 17423 + }, + { + "epoch": 0.10362546388809592, + "grad_norm": 1.5192588567733765, + "learning_rate": 4.868703149611734e-05, + "loss": 5.5579, + "step": 17424 + }, + { + "epoch": 0.10363141117137692, + "grad_norm": 1.7356244325637817, + "learning_rate": 4.86868821085966e-05, + "loss": 5.5978, + "step": 17425 + }, + { + "epoch": 0.10363735845465791, + "grad_norm": 1.7366925477981567, + "learning_rate": 4.868673271280701e-05, + "loss": 5.3812, + "step": 17426 + }, + { + "epoch": 0.10364330573793891, + "grad_norm": 2.016662836074829, + "learning_rate": 4.868658330874862e-05, + "loss": 5.4003, + "step": 17427 + }, + { + "epoch": 0.10364925302121991, + "grad_norm": 2.022550582885742, + "learning_rate": 4.86864338964215e-05, + "loss": 5.191, + "step": 17428 + }, + { + "epoch": 0.1036552003045009, + "grad_norm": 1.8406000137329102, + "learning_rate": 4.868628447582568e-05, + "loss": 5.9494, + "step": 17429 + }, + { + "epoch": 0.1036611475877819, + "grad_norm": 1.7836806774139404, + "learning_rate": 4.868613504696123e-05, + "loss": 5.4606, + "step": 17430 + }, + { + "epoch": 0.1036670948710629, + "grad_norm": 1.6688835620880127, + "learning_rate": 4.86859856098282e-05, + "loss": 5.2287, + "step": 17431 + }, + { + "epoch": 0.10367304215434389, + "grad_norm": 1.7083512544631958, + "learning_rate": 4.868583616442663e-05, + "loss": 4.7133, + "step": 17432 + }, + { + "epoch": 0.1036789894376249, + "grad_norm": 1.8784829378128052, + "learning_rate": 4.8685686710756576e-05, + "loss": 4.8341, + "step": 17433 + }, + { + "epoch": 0.1036849367209059, + "grad_norm": 2.380962610244751, + "learning_rate": 4.8685537248818105e-05, + "loss": 4.6553, + "step": 17434 + }, + { + "epoch": 0.10369088400418688, + "grad_norm": 1.936126470565796, + "learning_rate": 4.868538777861125e-05, + "loss": 5.0645, + "step": 17435 + }, + { + "epoch": 0.10369683128746789, + "grad_norm": 1.9400380849838257, + "learning_rate": 4.8685238300136065e-05, + "loss": 4.9022, + "step": 17436 + }, + { + "epoch": 0.10370277857074889, + "grad_norm": 2.0275371074676514, + "learning_rate": 4.868508881339261e-05, + "loss": 4.8918, + "step": 17437 + }, + { + "epoch": 0.10370872585402988, + "grad_norm": 1.8734835386276245, + "learning_rate": 4.868493931838094e-05, + "loss": 4.9889, + "step": 17438 + }, + { + "epoch": 0.10371467313731088, + "grad_norm": 2.346519947052002, + "learning_rate": 4.868478981510111e-05, + "loss": 4.4857, + "step": 17439 + }, + { + "epoch": 0.10372062042059188, + "grad_norm": 2.4242961406707764, + "learning_rate": 4.868464030355315e-05, + "loss": 4.034, + "step": 17440 + }, + { + "epoch": 0.10372656770387287, + "grad_norm": 2.3877294063568115, + "learning_rate": 4.8684490783737133e-05, + "loss": 4.2761, + "step": 17441 + }, + { + "epoch": 0.10373251498715387, + "grad_norm": 1.832585096359253, + "learning_rate": 4.8684341255653107e-05, + "loss": 5.1485, + "step": 17442 + }, + { + "epoch": 0.10373846227043486, + "grad_norm": 2.0385608673095703, + "learning_rate": 4.868419171930112e-05, + "loss": 5.7793, + "step": 17443 + }, + { + "epoch": 0.10374440955371586, + "grad_norm": 1.8885849714279175, + "learning_rate": 4.8684042174681225e-05, + "loss": 5.9304, + "step": 17444 + }, + { + "epoch": 0.10375035683699686, + "grad_norm": 1.8748784065246582, + "learning_rate": 4.868389262179348e-05, + "loss": 5.3722, + "step": 17445 + }, + { + "epoch": 0.10375630412027785, + "grad_norm": 1.9851447343826294, + "learning_rate": 4.8683743060637924e-05, + "loss": 5.4734, + "step": 17446 + }, + { + "epoch": 0.10376225140355885, + "grad_norm": 2.387681245803833, + "learning_rate": 4.868359349121463e-05, + "loss": 4.7244, + "step": 17447 + }, + { + "epoch": 0.10376819868683985, + "grad_norm": 1.8236793279647827, + "learning_rate": 4.868344391352363e-05, + "loss": 5.0094, + "step": 17448 + }, + { + "epoch": 0.10377414597012084, + "grad_norm": 1.3649673461914062, + "learning_rate": 4.868329432756498e-05, + "loss": 5.3295, + "step": 17449 + }, + { + "epoch": 0.10378009325340184, + "grad_norm": 1.8916471004486084, + "learning_rate": 4.8683144733338746e-05, + "loss": 5.9443, + "step": 17450 + }, + { + "epoch": 0.10378604053668285, + "grad_norm": 1.8541333675384521, + "learning_rate": 4.868299513084497e-05, + "loss": 5.425, + "step": 17451 + }, + { + "epoch": 0.10379198781996384, + "grad_norm": 1.9708364009857178, + "learning_rate": 4.8682845520083695e-05, + "loss": 5.3254, + "step": 17452 + }, + { + "epoch": 0.10379793510324484, + "grad_norm": 1.7171103954315186, + "learning_rate": 4.8682695901054995e-05, + "loss": 5.3498, + "step": 17453 + }, + { + "epoch": 0.10380388238652584, + "grad_norm": 1.6002514362335205, + "learning_rate": 4.868254627375891e-05, + "loss": 5.1611, + "step": 17454 + }, + { + "epoch": 0.10380982966980683, + "grad_norm": 1.9245331287384033, + "learning_rate": 4.8682396638195486e-05, + "loss": 5.3348, + "step": 17455 + }, + { + "epoch": 0.10381577695308783, + "grad_norm": 1.4742863178253174, + "learning_rate": 4.8682246994364786e-05, + "loss": 5.7573, + "step": 17456 + }, + { + "epoch": 0.10382172423636883, + "grad_norm": 1.929343581199646, + "learning_rate": 4.8682097342266855e-05, + "loss": 5.8469, + "step": 17457 + }, + { + "epoch": 0.10382767151964982, + "grad_norm": 1.6212769746780396, + "learning_rate": 4.8681947681901754e-05, + "loss": 5.9121, + "step": 17458 + }, + { + "epoch": 0.10383361880293082, + "grad_norm": 1.6550590991973877, + "learning_rate": 4.868179801326952e-05, + "loss": 5.7114, + "step": 17459 + }, + { + "epoch": 0.10383956608621182, + "grad_norm": 1.671628475189209, + "learning_rate": 4.868164833637023e-05, + "loss": 5.3988, + "step": 17460 + }, + { + "epoch": 0.10384551336949281, + "grad_norm": 1.5833921432495117, + "learning_rate": 4.868149865120391e-05, + "loss": 5.1952, + "step": 17461 + }, + { + "epoch": 0.10385146065277381, + "grad_norm": 1.8280199766159058, + "learning_rate": 4.868134895777063e-05, + "loss": 5.4812, + "step": 17462 + }, + { + "epoch": 0.10385740793605482, + "grad_norm": 1.7413616180419922, + "learning_rate": 4.868119925607043e-05, + "loss": 5.4119, + "step": 17463 + }, + { + "epoch": 0.1038633552193358, + "grad_norm": 1.6645252704620361, + "learning_rate": 4.868104954610337e-05, + "loss": 5.3546, + "step": 17464 + }, + { + "epoch": 0.1038693025026168, + "grad_norm": 1.634175181388855, + "learning_rate": 4.86808998278695e-05, + "loss": 5.3119, + "step": 17465 + }, + { + "epoch": 0.10387524978589781, + "grad_norm": 1.5220096111297607, + "learning_rate": 4.868075010136887e-05, + "loss": 5.1345, + "step": 17466 + }, + { + "epoch": 0.1038811970691788, + "grad_norm": 1.3279895782470703, + "learning_rate": 4.8680600366601534e-05, + "loss": 5.0071, + "step": 17467 + }, + { + "epoch": 0.1038871443524598, + "grad_norm": 1.4460431337356567, + "learning_rate": 4.8680450623567555e-05, + "loss": 4.8219, + "step": 17468 + }, + { + "epoch": 0.1038930916357408, + "grad_norm": 1.7028027772903442, + "learning_rate": 4.868030087226697e-05, + "loss": 5.2679, + "step": 17469 + }, + { + "epoch": 0.10389903891902179, + "grad_norm": 1.7697324752807617, + "learning_rate": 4.8680151112699835e-05, + "loss": 5.504, + "step": 17470 + }, + { + "epoch": 0.10390498620230279, + "grad_norm": 1.4549357891082764, + "learning_rate": 4.86800013448662e-05, + "loss": 5.4475, + "step": 17471 + }, + { + "epoch": 0.10391093348558378, + "grad_norm": 1.7069107294082642, + "learning_rate": 4.867985156876613e-05, + "loss": 5.5878, + "step": 17472 + }, + { + "epoch": 0.10391688076886478, + "grad_norm": 1.8917819261550903, + "learning_rate": 4.867970178439967e-05, + "loss": 5.4449, + "step": 17473 + }, + { + "epoch": 0.10392282805214578, + "grad_norm": 1.7132060527801514, + "learning_rate": 4.8679551991766856e-05, + "loss": 5.7547, + "step": 17474 + }, + { + "epoch": 0.10392877533542677, + "grad_norm": 1.6535362005233765, + "learning_rate": 4.867940219086777e-05, + "loss": 5.9603, + "step": 17475 + }, + { + "epoch": 0.10393472261870777, + "grad_norm": 1.6559079885482788, + "learning_rate": 4.8679252381702443e-05, + "loss": 5.9673, + "step": 17476 + }, + { + "epoch": 0.10394066990198877, + "grad_norm": 1.5295041799545288, + "learning_rate": 4.867910256427093e-05, + "loss": 5.4502, + "step": 17477 + }, + { + "epoch": 0.10394661718526976, + "grad_norm": 1.8571394681930542, + "learning_rate": 4.8678952738573294e-05, + "loss": 6.1838, + "step": 17478 + }, + { + "epoch": 0.10395256446855076, + "grad_norm": 1.7148513793945312, + "learning_rate": 4.8678802904609576e-05, + "loss": 5.9624, + "step": 17479 + }, + { + "epoch": 0.10395851175183177, + "grad_norm": 1.7191139459609985, + "learning_rate": 4.867865306237983e-05, + "loss": 5.8591, + "step": 17480 + }, + { + "epoch": 0.10396445903511276, + "grad_norm": 1.526285171508789, + "learning_rate": 4.867850321188412e-05, + "loss": 5.988, + "step": 17481 + }, + { + "epoch": 0.10397040631839376, + "grad_norm": 1.5284392833709717, + "learning_rate": 4.867835335312249e-05, + "loss": 5.7212, + "step": 17482 + }, + { + "epoch": 0.10397635360167476, + "grad_norm": 1.5675333738327026, + "learning_rate": 4.8678203486094975e-05, + "loss": 5.5921, + "step": 17483 + }, + { + "epoch": 0.10398230088495575, + "grad_norm": 1.7697393894195557, + "learning_rate": 4.8678053610801654e-05, + "loss": 5.1748, + "step": 17484 + }, + { + "epoch": 0.10398824816823675, + "grad_norm": 1.5940029621124268, + "learning_rate": 4.867790372724257e-05, + "loss": 5.7108, + "step": 17485 + }, + { + "epoch": 0.10399419545151775, + "grad_norm": 2.0347743034362793, + "learning_rate": 4.867775383541777e-05, + "loss": 5.4253, + "step": 17486 + }, + { + "epoch": 0.10400014273479874, + "grad_norm": 2.1038641929626465, + "learning_rate": 4.867760393532732e-05, + "loss": 5.2362, + "step": 17487 + }, + { + "epoch": 0.10400609001807974, + "grad_norm": 2.2253377437591553, + "learning_rate": 4.867745402697126e-05, + "loss": 5.0801, + "step": 17488 + }, + { + "epoch": 0.10401203730136074, + "grad_norm": 1.8215906620025635, + "learning_rate": 4.867730411034964e-05, + "loss": 5.1438, + "step": 17489 + }, + { + "epoch": 0.10401798458464173, + "grad_norm": 1.5428386926651, + "learning_rate": 4.867715418546252e-05, + "loss": 5.0664, + "step": 17490 + }, + { + "epoch": 0.10402393186792273, + "grad_norm": 1.3886137008666992, + "learning_rate": 4.867700425230995e-05, + "loss": 4.992, + "step": 17491 + }, + { + "epoch": 0.10402987915120374, + "grad_norm": 1.4177032709121704, + "learning_rate": 4.867685431089199e-05, + "loss": 4.9245, + "step": 17492 + }, + { + "epoch": 0.10403582643448472, + "grad_norm": 1.2621585130691528, + "learning_rate": 4.867670436120867e-05, + "loss": 4.8902, + "step": 17493 + }, + { + "epoch": 0.10404177371776573, + "grad_norm": 1.4095661640167236, + "learning_rate": 4.867655440326007e-05, + "loss": 4.871, + "step": 17494 + }, + { + "epoch": 0.10404772100104673, + "grad_norm": 1.3117374181747437, + "learning_rate": 4.867640443704622e-05, + "loss": 4.9351, + "step": 17495 + }, + { + "epoch": 0.10405366828432772, + "grad_norm": 1.6237322092056274, + "learning_rate": 4.867625446256719e-05, + "loss": 5.4253, + "step": 17496 + }, + { + "epoch": 0.10405961556760872, + "grad_norm": 2.095696210861206, + "learning_rate": 4.867610447982302e-05, + "loss": 5.1793, + "step": 17497 + }, + { + "epoch": 0.10406556285088972, + "grad_norm": 3.627516508102417, + "learning_rate": 4.867595448881377e-05, + "loss": 5.1206, + "step": 17498 + }, + { + "epoch": 0.10407151013417071, + "grad_norm": 2.0525522232055664, + "learning_rate": 4.8675804489539477e-05, + "loss": 5.5922, + "step": 17499 + }, + { + "epoch": 0.10407745741745171, + "grad_norm": 1.6003656387329102, + "learning_rate": 4.867565448200022e-05, + "loss": 6.0267, + "step": 17500 + }, + { + "epoch": 0.1040834047007327, + "grad_norm": 1.4709582328796387, + "learning_rate": 4.8675504466196034e-05, + "loss": 5.55, + "step": 17501 + }, + { + "epoch": 0.1040893519840137, + "grad_norm": 1.5550457239151, + "learning_rate": 4.8675354442126966e-05, + "loss": 5.6857, + "step": 17502 + }, + { + "epoch": 0.1040952992672947, + "grad_norm": 1.6180169582366943, + "learning_rate": 4.8675204409793085e-05, + "loss": 5.3079, + "step": 17503 + }, + { + "epoch": 0.10410124655057569, + "grad_norm": 1.5625691413879395, + "learning_rate": 4.8675054369194426e-05, + "loss": 5.5965, + "step": 17504 + }, + { + "epoch": 0.10410719383385669, + "grad_norm": 1.4117538928985596, + "learning_rate": 4.8674904320331064e-05, + "loss": 5.7337, + "step": 17505 + }, + { + "epoch": 0.1041131411171377, + "grad_norm": 1.5518572330474854, + "learning_rate": 4.867475426320302e-05, + "loss": 5.5802, + "step": 17506 + }, + { + "epoch": 0.10411908840041868, + "grad_norm": 1.3276773691177368, + "learning_rate": 4.867460419781037e-05, + "loss": 6.0462, + "step": 17507 + }, + { + "epoch": 0.10412503568369968, + "grad_norm": 1.3660519123077393, + "learning_rate": 4.867445412415317e-05, + "loss": 6.0382, + "step": 17508 + }, + { + "epoch": 0.10413098296698069, + "grad_norm": 1.2959636449813843, + "learning_rate": 4.867430404223146e-05, + "loss": 5.8823, + "step": 17509 + }, + { + "epoch": 0.10413693025026168, + "grad_norm": 2.009265899658203, + "learning_rate": 4.867415395204528e-05, + "loss": 4.9889, + "step": 17510 + }, + { + "epoch": 0.10414287753354268, + "grad_norm": 1.3692728281021118, + "learning_rate": 4.8674003853594705e-05, + "loss": 5.2382, + "step": 17511 + }, + { + "epoch": 0.10414882481682368, + "grad_norm": 1.4074095487594604, + "learning_rate": 4.8673853746879785e-05, + "loss": 5.8241, + "step": 17512 + }, + { + "epoch": 0.10415477210010467, + "grad_norm": 1.2155077457427979, + "learning_rate": 4.867370363190057e-05, + "loss": 5.762, + "step": 17513 + }, + { + "epoch": 0.10416071938338567, + "grad_norm": 1.1142069101333618, + "learning_rate": 4.86735535086571e-05, + "loss": 5.7591, + "step": 17514 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 1.1758382320404053, + "learning_rate": 4.867340337714944e-05, + "loss": 5.6534, + "step": 17515 + }, + { + "epoch": 0.10417261394994766, + "grad_norm": 1.2154567241668701, + "learning_rate": 4.867325323737765e-05, + "loss": 5.7465, + "step": 17516 + }, + { + "epoch": 0.10417856123322866, + "grad_norm": 1.3033171892166138, + "learning_rate": 4.867310308934176e-05, + "loss": 5.7701, + "step": 17517 + }, + { + "epoch": 0.10418450851650966, + "grad_norm": 1.3491926193237305, + "learning_rate": 4.867295293304184e-05, + "loss": 5.7883, + "step": 17518 + }, + { + "epoch": 0.10419045579979065, + "grad_norm": 1.223988652229309, + "learning_rate": 4.867280276847793e-05, + "loss": 5.7065, + "step": 17519 + }, + { + "epoch": 0.10419640308307165, + "grad_norm": 1.3885700702667236, + "learning_rate": 4.867265259565009e-05, + "loss": 5.6934, + "step": 17520 + }, + { + "epoch": 0.10420235036635266, + "grad_norm": 1.1616452932357788, + "learning_rate": 4.867250241455837e-05, + "loss": 5.6958, + "step": 17521 + }, + { + "epoch": 0.10420829764963364, + "grad_norm": 1.2696330547332764, + "learning_rate": 4.867235222520283e-05, + "loss": 5.5534, + "step": 17522 + }, + { + "epoch": 0.10421424493291465, + "grad_norm": 1.3539372682571411, + "learning_rate": 4.8672202027583516e-05, + "loss": 5.8028, + "step": 17523 + }, + { + "epoch": 0.10422019221619565, + "grad_norm": 2.547095775604248, + "learning_rate": 4.867205182170048e-05, + "loss": 5.0223, + "step": 17524 + }, + { + "epoch": 0.10422613949947664, + "grad_norm": 1.7378231287002563, + "learning_rate": 4.8671901607553775e-05, + "loss": 5.8356, + "step": 17525 + }, + { + "epoch": 0.10423208678275764, + "grad_norm": 1.9287587404251099, + "learning_rate": 4.867175138514346e-05, + "loss": 5.9694, + "step": 17526 + }, + { + "epoch": 0.10423803406603864, + "grad_norm": 1.685260534286499, + "learning_rate": 4.867160115446957e-05, + "loss": 5.6962, + "step": 17527 + }, + { + "epoch": 0.10424398134931963, + "grad_norm": 1.594699501991272, + "learning_rate": 4.8671450915532176e-05, + "loss": 5.6139, + "step": 17528 + }, + { + "epoch": 0.10424992863260063, + "grad_norm": 1.5966441631317139, + "learning_rate": 4.867130066833132e-05, + "loss": 5.6369, + "step": 17529 + }, + { + "epoch": 0.10425587591588162, + "grad_norm": 1.701524019241333, + "learning_rate": 4.867115041286706e-05, + "loss": 5.6487, + "step": 17530 + }, + { + "epoch": 0.10426182319916262, + "grad_norm": 1.575536847114563, + "learning_rate": 4.8671000149139444e-05, + "loss": 5.5935, + "step": 17531 + }, + { + "epoch": 0.10426777048244362, + "grad_norm": 1.6812626123428345, + "learning_rate": 4.867084987714853e-05, + "loss": 5.4343, + "step": 17532 + }, + { + "epoch": 0.10427371776572461, + "grad_norm": 1.6122568845748901, + "learning_rate": 4.867069959689435e-05, + "loss": 5.5194, + "step": 17533 + }, + { + "epoch": 0.10427966504900561, + "grad_norm": 1.5337659120559692, + "learning_rate": 4.8670549308376996e-05, + "loss": 5.5248, + "step": 17534 + }, + { + "epoch": 0.10428561233228661, + "grad_norm": 1.45541250705719, + "learning_rate": 4.867039901159649e-05, + "loss": 5.6301, + "step": 17535 + }, + { + "epoch": 0.1042915596155676, + "grad_norm": 1.6674455404281616, + "learning_rate": 4.867024870655289e-05, + "loss": 6.1182, + "step": 17536 + }, + { + "epoch": 0.1042975068988486, + "grad_norm": 1.4686870574951172, + "learning_rate": 4.867009839324624e-05, + "loss": 5.9761, + "step": 17537 + }, + { + "epoch": 0.1043034541821296, + "grad_norm": 1.6447898149490356, + "learning_rate": 4.866994807167662e-05, + "loss": 5.4559, + "step": 17538 + }, + { + "epoch": 0.1043094014654106, + "grad_norm": 1.4841620922088623, + "learning_rate": 4.866979774184406e-05, + "loss": 5.4441, + "step": 17539 + }, + { + "epoch": 0.1043153487486916, + "grad_norm": 1.8813121318817139, + "learning_rate": 4.8669647403748616e-05, + "loss": 5.348, + "step": 17540 + }, + { + "epoch": 0.1043212960319726, + "grad_norm": 4.018791198730469, + "learning_rate": 4.866949705739035e-05, + "loss": 5.457, + "step": 17541 + }, + { + "epoch": 0.10432724331525359, + "grad_norm": 2.9932172298431396, + "learning_rate": 4.86693467027693e-05, + "loss": 5.2345, + "step": 17542 + }, + { + "epoch": 0.10433319059853459, + "grad_norm": 1.4329689741134644, + "learning_rate": 4.866919633988553e-05, + "loss": 5.8491, + "step": 17543 + }, + { + "epoch": 0.10433913788181559, + "grad_norm": 1.7308731079101562, + "learning_rate": 4.866904596873909e-05, + "loss": 5.5858, + "step": 17544 + }, + { + "epoch": 0.10434508516509658, + "grad_norm": 2.2066311836242676, + "learning_rate": 4.866889558933002e-05, + "loss": 4.7702, + "step": 17545 + }, + { + "epoch": 0.10435103244837758, + "grad_norm": 1.528171181678772, + "learning_rate": 4.866874520165839e-05, + "loss": 5.1622, + "step": 17546 + }, + { + "epoch": 0.10435697973165858, + "grad_norm": 1.8969347476959229, + "learning_rate": 4.866859480572424e-05, + "loss": 5.0091, + "step": 17547 + }, + { + "epoch": 0.10436292701493957, + "grad_norm": 1.6737502813339233, + "learning_rate": 4.8668444401527644e-05, + "loss": 5.7552, + "step": 17548 + }, + { + "epoch": 0.10436887429822057, + "grad_norm": 1.793411374092102, + "learning_rate": 4.8668293989068626e-05, + "loss": 5.7963, + "step": 17549 + }, + { + "epoch": 0.10437482158150158, + "grad_norm": 1.8675566911697388, + "learning_rate": 4.866814356834725e-05, + "loss": 4.7389, + "step": 17550 + }, + { + "epoch": 0.10438076886478256, + "grad_norm": 1.9145622253417969, + "learning_rate": 4.8667993139363574e-05, + "loss": 5.0921, + "step": 17551 + }, + { + "epoch": 0.10438671614806357, + "grad_norm": 1.6751158237457275, + "learning_rate": 4.866784270211764e-05, + "loss": 5.5547, + "step": 17552 + }, + { + "epoch": 0.10439266343134457, + "grad_norm": 1.754550576210022, + "learning_rate": 4.866769225660951e-05, + "loss": 5.6077, + "step": 17553 + }, + { + "epoch": 0.10439861071462556, + "grad_norm": 2.0323402881622314, + "learning_rate": 4.866754180283924e-05, + "loss": 5.1191, + "step": 17554 + }, + { + "epoch": 0.10440455799790656, + "grad_norm": 1.8000339269638062, + "learning_rate": 4.866739134080687e-05, + "loss": 5.1533, + "step": 17555 + }, + { + "epoch": 0.10441050528118756, + "grad_norm": 2.053093671798706, + "learning_rate": 4.866724087051245e-05, + "loss": 4.9985, + "step": 17556 + }, + { + "epoch": 0.10441645256446855, + "grad_norm": 1.6764185428619385, + "learning_rate": 4.866709039195605e-05, + "loss": 4.9674, + "step": 17557 + }, + { + "epoch": 0.10442239984774955, + "grad_norm": 1.6942695379257202, + "learning_rate": 4.866693990513772e-05, + "loss": 4.9319, + "step": 17558 + }, + { + "epoch": 0.10442834713103054, + "grad_norm": 1.5124322175979614, + "learning_rate": 4.8666789410057496e-05, + "loss": 5.1371, + "step": 17559 + }, + { + "epoch": 0.10443429441431154, + "grad_norm": 1.925757646560669, + "learning_rate": 4.866663890671545e-05, + "loss": 4.6366, + "step": 17560 + }, + { + "epoch": 0.10444024169759254, + "grad_norm": 2.0077321529388428, + "learning_rate": 4.866648839511161e-05, + "loss": 4.9993, + "step": 17561 + }, + { + "epoch": 0.10444618898087353, + "grad_norm": 2.1986982822418213, + "learning_rate": 4.866633787524605e-05, + "loss": 4.814, + "step": 17562 + }, + { + "epoch": 0.10445213626415453, + "grad_norm": 1.9967917203903198, + "learning_rate": 4.866618734711882e-05, + "loss": 4.5182, + "step": 17563 + }, + { + "epoch": 0.10445808354743553, + "grad_norm": 1.7663863897323608, + "learning_rate": 4.8666036810729965e-05, + "loss": 4.5589, + "step": 17564 + }, + { + "epoch": 0.10446403083071652, + "grad_norm": 1.7784098386764526, + "learning_rate": 4.8665886266079537e-05, + "loss": 4.6739, + "step": 17565 + }, + { + "epoch": 0.10446997811399752, + "grad_norm": 1.7143903970718384, + "learning_rate": 4.8665735713167596e-05, + "loss": 4.8434, + "step": 17566 + }, + { + "epoch": 0.10447592539727853, + "grad_norm": 2.018825054168701, + "learning_rate": 4.866558515199419e-05, + "loss": 4.5235, + "step": 17567 + }, + { + "epoch": 0.10448187268055951, + "grad_norm": 2.1135973930358887, + "learning_rate": 4.8665434582559374e-05, + "loss": 4.5048, + "step": 17568 + }, + { + "epoch": 0.10448781996384052, + "grad_norm": 2.097177028656006, + "learning_rate": 4.86652840048632e-05, + "loss": 4.7811, + "step": 17569 + }, + { + "epoch": 0.10449376724712152, + "grad_norm": 2.054049015045166, + "learning_rate": 4.866513341890572e-05, + "loss": 4.5964, + "step": 17570 + }, + { + "epoch": 0.10449971453040251, + "grad_norm": 1.9631117582321167, + "learning_rate": 4.866498282468699e-05, + "loss": 4.4055, + "step": 17571 + }, + { + "epoch": 0.10450566181368351, + "grad_norm": 2.079071521759033, + "learning_rate": 4.8664832222207055e-05, + "loss": 4.3743, + "step": 17572 + }, + { + "epoch": 0.10451160909696451, + "grad_norm": 1.8425450325012207, + "learning_rate": 4.8664681611465966e-05, + "loss": 4.411, + "step": 17573 + }, + { + "epoch": 0.1045175563802455, + "grad_norm": 1.812538743019104, + "learning_rate": 4.866453099246379e-05, + "loss": 4.3496, + "step": 17574 + }, + { + "epoch": 0.1045235036635265, + "grad_norm": 1.8823848962783813, + "learning_rate": 4.8664380365200566e-05, + "loss": 4.3613, + "step": 17575 + }, + { + "epoch": 0.1045294509468075, + "grad_norm": 1.6085865497589111, + "learning_rate": 4.8664229729676356e-05, + "loss": 4.5187, + "step": 17576 + }, + { + "epoch": 0.10453539823008849, + "grad_norm": 1.8719606399536133, + "learning_rate": 4.8664079085891204e-05, + "loss": 4.7276, + "step": 17577 + }, + { + "epoch": 0.1045413455133695, + "grad_norm": 1.7630116939544678, + "learning_rate": 4.866392843384517e-05, + "loss": 4.3749, + "step": 17578 + }, + { + "epoch": 0.1045472927966505, + "grad_norm": 1.8641449213027954, + "learning_rate": 4.86637777735383e-05, + "loss": 4.5781, + "step": 17579 + }, + { + "epoch": 0.10455324007993148, + "grad_norm": 1.8178362846374512, + "learning_rate": 4.8663627104970645e-05, + "loss": 4.3217, + "step": 17580 + }, + { + "epoch": 0.10455918736321249, + "grad_norm": 1.7655141353607178, + "learning_rate": 4.866347642814228e-05, + "loss": 4.4972, + "step": 17581 + }, + { + "epoch": 0.10456513464649349, + "grad_norm": 1.843266248703003, + "learning_rate": 4.8663325743053216e-05, + "loss": 4.5214, + "step": 17582 + }, + { + "epoch": 0.10457108192977448, + "grad_norm": 1.8023161888122559, + "learning_rate": 4.866317504970354e-05, + "loss": 4.3205, + "step": 17583 + }, + { + "epoch": 0.10457702921305548, + "grad_norm": 1.7845708131790161, + "learning_rate": 4.8663024348093296e-05, + "loss": 4.1439, + "step": 17584 + }, + { + "epoch": 0.10458297649633648, + "grad_norm": 2.0029754638671875, + "learning_rate": 4.866287363822253e-05, + "loss": 4.4627, + "step": 17585 + }, + { + "epoch": 0.10458892377961747, + "grad_norm": 1.6008789539337158, + "learning_rate": 4.8662722920091305e-05, + "loss": 4.5539, + "step": 17586 + }, + { + "epoch": 0.10459487106289847, + "grad_norm": 1.884207844734192, + "learning_rate": 4.8662572193699664e-05, + "loss": 4.1132, + "step": 17587 + }, + { + "epoch": 0.10460081834617946, + "grad_norm": 1.7014282941818237, + "learning_rate": 4.866242145904767e-05, + "loss": 4.9612, + "step": 17588 + }, + { + "epoch": 0.10460676562946046, + "grad_norm": 1.7388410568237305, + "learning_rate": 4.8662270716135364e-05, + "loss": 5.3079, + "step": 17589 + }, + { + "epoch": 0.10461271291274146, + "grad_norm": 1.6414510011672974, + "learning_rate": 4.8662119964962805e-05, + "loss": 5.5816, + "step": 17590 + }, + { + "epoch": 0.10461866019602245, + "grad_norm": 1.4039387702941895, + "learning_rate": 4.866196920553004e-05, + "loss": 5.0036, + "step": 17591 + }, + { + "epoch": 0.10462460747930345, + "grad_norm": 1.7621723413467407, + "learning_rate": 4.866181843783712e-05, + "loss": 5.3461, + "step": 17592 + }, + { + "epoch": 0.10463055476258445, + "grad_norm": 1.4525210857391357, + "learning_rate": 4.866166766188412e-05, + "loss": 5.2897, + "step": 17593 + }, + { + "epoch": 0.10463650204586544, + "grad_norm": 1.4203788042068481, + "learning_rate": 4.866151687767107e-05, + "loss": 5.2506, + "step": 17594 + }, + { + "epoch": 0.10464244932914644, + "grad_norm": 1.419097900390625, + "learning_rate": 4.866136608519803e-05, + "loss": 5.246, + "step": 17595 + }, + { + "epoch": 0.10464839661242745, + "grad_norm": 1.8866242170333862, + "learning_rate": 4.8661215284465047e-05, + "loss": 5.5259, + "step": 17596 + }, + { + "epoch": 0.10465434389570843, + "grad_norm": 1.5161887407302856, + "learning_rate": 4.866106447547218e-05, + "loss": 5.2219, + "step": 17597 + }, + { + "epoch": 0.10466029117898944, + "grad_norm": 1.3552051782608032, + "learning_rate": 4.866091365821948e-05, + "loss": 4.9473, + "step": 17598 + }, + { + "epoch": 0.10466623846227044, + "grad_norm": 1.3443762063980103, + "learning_rate": 4.8660762832707e-05, + "loss": 5.0027, + "step": 17599 + }, + { + "epoch": 0.10467218574555143, + "grad_norm": 1.5657448768615723, + "learning_rate": 4.866061199893479e-05, + "loss": 5.3873, + "step": 17600 + }, + { + "epoch": 0.10467813302883243, + "grad_norm": 1.177984595298767, + "learning_rate": 4.866046115690291e-05, + "loss": 4.8628, + "step": 17601 + }, + { + "epoch": 0.10468408031211343, + "grad_norm": 1.1911925077438354, + "learning_rate": 4.8660310306611405e-05, + "loss": 4.7862, + "step": 17602 + }, + { + "epoch": 0.10469002759539442, + "grad_norm": 1.238619327545166, + "learning_rate": 4.866015944806033e-05, + "loss": 4.6844, + "step": 17603 + }, + { + "epoch": 0.10469597487867542, + "grad_norm": 1.4151804447174072, + "learning_rate": 4.8660008581249736e-05, + "loss": 4.7824, + "step": 17604 + }, + { + "epoch": 0.10470192216195642, + "grad_norm": 1.1852803230285645, + "learning_rate": 4.8659857706179676e-05, + "loss": 4.8358, + "step": 17605 + }, + { + "epoch": 0.10470786944523741, + "grad_norm": 1.2641617059707642, + "learning_rate": 4.865970682285022e-05, + "loss": 4.688, + "step": 17606 + }, + { + "epoch": 0.10471381672851841, + "grad_norm": 1.3711220026016235, + "learning_rate": 4.865955593126138e-05, + "loss": 4.6552, + "step": 17607 + }, + { + "epoch": 0.10471976401179942, + "grad_norm": 1.5641502141952515, + "learning_rate": 4.865940503141325e-05, + "loss": 5.0781, + "step": 17608 + }, + { + "epoch": 0.1047257112950804, + "grad_norm": 1.5290453433990479, + "learning_rate": 4.865925412330586e-05, + "loss": 5.1347, + "step": 17609 + }, + { + "epoch": 0.1047316585783614, + "grad_norm": 1.6220836639404297, + "learning_rate": 4.8659103206939275e-05, + "loss": 5.2943, + "step": 17610 + }, + { + "epoch": 0.10473760586164241, + "grad_norm": 1.4212614297866821, + "learning_rate": 4.865895228231353e-05, + "loss": 5.2939, + "step": 17611 + }, + { + "epoch": 0.1047435531449234, + "grad_norm": 1.4920703172683716, + "learning_rate": 4.8658801349428696e-05, + "loss": 5.3314, + "step": 17612 + }, + { + "epoch": 0.1047495004282044, + "grad_norm": 1.4596521854400635, + "learning_rate": 4.865865040828482e-05, + "loss": 5.3082, + "step": 17613 + }, + { + "epoch": 0.1047554477114854, + "grad_norm": 1.2887258529663086, + "learning_rate": 4.865849945888195e-05, + "loss": 5.1002, + "step": 17614 + }, + { + "epoch": 0.10476139499476639, + "grad_norm": 1.3587419986724854, + "learning_rate": 4.8658348501220145e-05, + "loss": 4.9773, + "step": 17615 + }, + { + "epoch": 0.10476734227804739, + "grad_norm": 1.5476746559143066, + "learning_rate": 4.865819753529945e-05, + "loss": 5.0726, + "step": 17616 + }, + { + "epoch": 0.10477328956132839, + "grad_norm": 1.2820343971252441, + "learning_rate": 4.865804656111993e-05, + "loss": 5.0708, + "step": 17617 + }, + { + "epoch": 0.10477923684460938, + "grad_norm": 1.5396101474761963, + "learning_rate": 4.8657895578681634e-05, + "loss": 5.087, + "step": 17618 + }, + { + "epoch": 0.10478518412789038, + "grad_norm": 1.9199161529541016, + "learning_rate": 4.86577445879846e-05, + "loss": 4.9402, + "step": 17619 + }, + { + "epoch": 0.10479113141117137, + "grad_norm": 1.6283903121948242, + "learning_rate": 4.8657593589028894e-05, + "loss": 5.2045, + "step": 17620 + }, + { + "epoch": 0.10479707869445237, + "grad_norm": 1.350632905960083, + "learning_rate": 4.865744258181457e-05, + "loss": 5.2314, + "step": 17621 + }, + { + "epoch": 0.10480302597773337, + "grad_norm": 1.5528992414474487, + "learning_rate": 4.865729156634168e-05, + "loss": 4.9361, + "step": 17622 + }, + { + "epoch": 0.10480897326101436, + "grad_norm": 1.4698718786239624, + "learning_rate": 4.865714054261027e-05, + "loss": 5.6547, + "step": 17623 + }, + { + "epoch": 0.10481492054429536, + "grad_norm": 1.2905457019805908, + "learning_rate": 4.86569895106204e-05, + "loss": 5.5628, + "step": 17624 + }, + { + "epoch": 0.10482086782757637, + "grad_norm": 1.2559312582015991, + "learning_rate": 4.8656838470372116e-05, + "loss": 5.3106, + "step": 17625 + }, + { + "epoch": 0.10482681511085735, + "grad_norm": 1.2229273319244385, + "learning_rate": 4.8656687421865466e-05, + "loss": 5.1566, + "step": 17626 + }, + { + "epoch": 0.10483276239413836, + "grad_norm": 1.4148969650268555, + "learning_rate": 4.8656536365100524e-05, + "loss": 5.1785, + "step": 17627 + }, + { + "epoch": 0.10483870967741936, + "grad_norm": 1.4109671115875244, + "learning_rate": 4.865638530007732e-05, + "loss": 4.922, + "step": 17628 + }, + { + "epoch": 0.10484465696070035, + "grad_norm": 1.526160478591919, + "learning_rate": 4.865623422679593e-05, + "loss": 5.0734, + "step": 17629 + }, + { + "epoch": 0.10485060424398135, + "grad_norm": 1.5093508958816528, + "learning_rate": 4.865608314525638e-05, + "loss": 5.1926, + "step": 17630 + }, + { + "epoch": 0.10485655152726235, + "grad_norm": 1.4625009298324585, + "learning_rate": 4.8655932055458734e-05, + "loss": 5.1372, + "step": 17631 + }, + { + "epoch": 0.10486249881054334, + "grad_norm": 1.348502516746521, + "learning_rate": 4.865578095740305e-05, + "loss": 5.0275, + "step": 17632 + }, + { + "epoch": 0.10486844609382434, + "grad_norm": 1.4530283212661743, + "learning_rate": 4.865562985108938e-05, + "loss": 5.093, + "step": 17633 + }, + { + "epoch": 0.10487439337710534, + "grad_norm": 1.4871639013290405, + "learning_rate": 4.865547873651778e-05, + "loss": 5.0789, + "step": 17634 + }, + { + "epoch": 0.10488034066038633, + "grad_norm": 1.2314977645874023, + "learning_rate": 4.865532761368828e-05, + "loss": 5.0966, + "step": 17635 + }, + { + "epoch": 0.10488628794366733, + "grad_norm": 1.3988053798675537, + "learning_rate": 4.865517648260097e-05, + "loss": 5.4284, + "step": 17636 + }, + { + "epoch": 0.10489223522694834, + "grad_norm": 1.3434901237487793, + "learning_rate": 4.865502534325587e-05, + "loss": 5.3563, + "step": 17637 + }, + { + "epoch": 0.10489818251022932, + "grad_norm": 1.3380807638168335, + "learning_rate": 4.865487419565305e-05, + "loss": 5.3628, + "step": 17638 + }, + { + "epoch": 0.10490412979351033, + "grad_norm": 1.5222781896591187, + "learning_rate": 4.865472303979255e-05, + "loss": 5.2164, + "step": 17639 + }, + { + "epoch": 0.10491007707679133, + "grad_norm": 1.2916938066482544, + "learning_rate": 4.865457187567444e-05, + "loss": 5.1248, + "step": 17640 + }, + { + "epoch": 0.10491602436007232, + "grad_norm": 1.4988411664962769, + "learning_rate": 4.8654420703298755e-05, + "loss": 5.0932, + "step": 17641 + }, + { + "epoch": 0.10492197164335332, + "grad_norm": 1.2529023885726929, + "learning_rate": 4.8654269522665564e-05, + "loss": 5.1465, + "step": 17642 + }, + { + "epoch": 0.10492791892663432, + "grad_norm": 1.3913809061050415, + "learning_rate": 4.86541183337749e-05, + "loss": 5.0039, + "step": 17643 + }, + { + "epoch": 0.10493386620991531, + "grad_norm": 1.5128841400146484, + "learning_rate": 4.8653967136626836e-05, + "loss": 4.9937, + "step": 17644 + }, + { + "epoch": 0.10493981349319631, + "grad_norm": 1.3300340175628662, + "learning_rate": 4.865381593122142e-05, + "loss": 5.0521, + "step": 17645 + }, + { + "epoch": 0.10494576077647731, + "grad_norm": 1.6548517942428589, + "learning_rate": 4.86536647175587e-05, + "loss": 5.1361, + "step": 17646 + }, + { + "epoch": 0.1049517080597583, + "grad_norm": 1.2479137182235718, + "learning_rate": 4.865351349563873e-05, + "loss": 5.3129, + "step": 17647 + }, + { + "epoch": 0.1049576553430393, + "grad_norm": 1.3804575204849243, + "learning_rate": 4.8653362265461556e-05, + "loss": 4.9891, + "step": 17648 + }, + { + "epoch": 0.10496360262632029, + "grad_norm": 1.2821561098098755, + "learning_rate": 4.865321102702724e-05, + "loss": 5.0255, + "step": 17649 + }, + { + "epoch": 0.10496954990960129, + "grad_norm": 1.5715882778167725, + "learning_rate": 4.865305978033583e-05, + "loss": 4.9897, + "step": 17650 + }, + { + "epoch": 0.1049754971928823, + "grad_norm": 1.5910687446594238, + "learning_rate": 4.865290852538738e-05, + "loss": 5.1387, + "step": 17651 + }, + { + "epoch": 0.10498144447616328, + "grad_norm": 1.4188683032989502, + "learning_rate": 4.865275726218196e-05, + "loss": 5.3502, + "step": 17652 + }, + { + "epoch": 0.10498739175944428, + "grad_norm": 1.6032958030700684, + "learning_rate": 4.8652605990719594e-05, + "loss": 5.2716, + "step": 17653 + }, + { + "epoch": 0.10499333904272529, + "grad_norm": 1.4894942045211792, + "learning_rate": 4.8652454711000353e-05, + "loss": 5.237, + "step": 17654 + }, + { + "epoch": 0.10499928632600627, + "grad_norm": 1.5370794534683228, + "learning_rate": 4.8652303423024276e-05, + "loss": 5.0227, + "step": 17655 + }, + { + "epoch": 0.10500523360928728, + "grad_norm": 1.4100168943405151, + "learning_rate": 4.865215212679143e-05, + "loss": 5.0713, + "step": 17656 + }, + { + "epoch": 0.10501118089256828, + "grad_norm": 1.6180533170700073, + "learning_rate": 4.8652000822301856e-05, + "loss": 5.2041, + "step": 17657 + }, + { + "epoch": 0.10501712817584927, + "grad_norm": 1.2447609901428223, + "learning_rate": 4.865184950955562e-05, + "loss": 5.1073, + "step": 17658 + }, + { + "epoch": 0.10502307545913027, + "grad_norm": 1.4866548776626587, + "learning_rate": 4.865169818855277e-05, + "loss": 5.1287, + "step": 17659 + }, + { + "epoch": 0.10502902274241127, + "grad_norm": 1.33426034450531, + "learning_rate": 4.865154685929335e-05, + "loss": 5.1343, + "step": 17660 + }, + { + "epoch": 0.10503497002569226, + "grad_norm": 1.122551679611206, + "learning_rate": 4.865139552177742e-05, + "loss": 5.1267, + "step": 17661 + }, + { + "epoch": 0.10504091730897326, + "grad_norm": 1.787278175354004, + "learning_rate": 4.865124417600504e-05, + "loss": 5.4828, + "step": 17662 + }, + { + "epoch": 0.10504686459225426, + "grad_norm": 1.4937405586242676, + "learning_rate": 4.8651092821976246e-05, + "loss": 5.3467, + "step": 17663 + }, + { + "epoch": 0.10505281187553525, + "grad_norm": 1.395286202430725, + "learning_rate": 4.86509414596911e-05, + "loss": 5.1552, + "step": 17664 + }, + { + "epoch": 0.10505875915881625, + "grad_norm": 1.5284260511398315, + "learning_rate": 4.865079008914965e-05, + "loss": 5.2718, + "step": 17665 + }, + { + "epoch": 0.10506470644209726, + "grad_norm": 2.0051753520965576, + "learning_rate": 4.865063871035197e-05, + "loss": 5.1121, + "step": 17666 + }, + { + "epoch": 0.10507065372537824, + "grad_norm": 1.690699577331543, + "learning_rate": 4.8650487323298085e-05, + "loss": 5.1091, + "step": 17667 + }, + { + "epoch": 0.10507660100865925, + "grad_norm": 1.5275843143463135, + "learning_rate": 4.865033592798807e-05, + "loss": 5.3064, + "step": 17668 + }, + { + "epoch": 0.10508254829194025, + "grad_norm": 1.584038496017456, + "learning_rate": 4.865018452442195e-05, + "loss": 5.2598, + "step": 17669 + }, + { + "epoch": 0.10508849557522124, + "grad_norm": 1.8086310625076294, + "learning_rate": 4.865003311259981e-05, + "loss": 5.2229, + "step": 17670 + }, + { + "epoch": 0.10509444285850224, + "grad_norm": 1.805972695350647, + "learning_rate": 4.864988169252168e-05, + "loss": 5.1051, + "step": 17671 + }, + { + "epoch": 0.10510039014178324, + "grad_norm": 1.6209838390350342, + "learning_rate": 4.864973026418762e-05, + "loss": 5.1808, + "step": 17672 + }, + { + "epoch": 0.10510633742506423, + "grad_norm": 1.3997793197631836, + "learning_rate": 4.8649578827597684e-05, + "loss": 4.9167, + "step": 17673 + }, + { + "epoch": 0.10511228470834523, + "grad_norm": 1.368037462234497, + "learning_rate": 4.8649427382751925e-05, + "loss": 4.98, + "step": 17674 + }, + { + "epoch": 0.10511823199162623, + "grad_norm": 1.3904718160629272, + "learning_rate": 4.864927592965039e-05, + "loss": 4.8101, + "step": 17675 + }, + { + "epoch": 0.10512417927490722, + "grad_norm": 1.3237133026123047, + "learning_rate": 4.864912446829315e-05, + "loss": 5.1427, + "step": 17676 + }, + { + "epoch": 0.10513012655818822, + "grad_norm": 1.2642048597335815, + "learning_rate": 4.864897299868024e-05, + "loss": 5.2961, + "step": 17677 + }, + { + "epoch": 0.10513607384146921, + "grad_norm": 1.4357531070709229, + "learning_rate": 4.864882152081172e-05, + "loss": 5.4811, + "step": 17678 + }, + { + "epoch": 0.10514202112475021, + "grad_norm": 1.652321696281433, + "learning_rate": 4.864867003468763e-05, + "loss": 5.2172, + "step": 17679 + }, + { + "epoch": 0.10514796840803121, + "grad_norm": 1.6143925189971924, + "learning_rate": 4.864851854030804e-05, + "loss": 4.9856, + "step": 17680 + }, + { + "epoch": 0.1051539156913122, + "grad_norm": 1.637320637702942, + "learning_rate": 4.8648367037673e-05, + "loss": 4.9458, + "step": 17681 + }, + { + "epoch": 0.1051598629745932, + "grad_norm": 1.650970458984375, + "learning_rate": 4.864821552678256e-05, + "loss": 4.714, + "step": 17682 + }, + { + "epoch": 0.1051658102578742, + "grad_norm": 1.616098403930664, + "learning_rate": 4.864806400763676e-05, + "loss": 4.7064, + "step": 17683 + }, + { + "epoch": 0.1051717575411552, + "grad_norm": 1.6400461196899414, + "learning_rate": 4.864791248023568e-05, + "loss": 4.5955, + "step": 17684 + }, + { + "epoch": 0.1051777048244362, + "grad_norm": 1.3815523386001587, + "learning_rate": 4.8647760944579344e-05, + "loss": 4.7491, + "step": 17685 + }, + { + "epoch": 0.1051836521077172, + "grad_norm": 1.5695693492889404, + "learning_rate": 4.864760940066783e-05, + "loss": 4.6242, + "step": 17686 + }, + { + "epoch": 0.10518959939099819, + "grad_norm": 1.5861409902572632, + "learning_rate": 4.8647457848501174e-05, + "loss": 4.5859, + "step": 17687 + }, + { + "epoch": 0.10519554667427919, + "grad_norm": 1.637741208076477, + "learning_rate": 4.864730628807944e-05, + "loss": 4.6572, + "step": 17688 + }, + { + "epoch": 0.10520149395756019, + "grad_norm": 1.5806957483291626, + "learning_rate": 4.864715471940268e-05, + "loss": 4.8879, + "step": 17689 + }, + { + "epoch": 0.10520744124084118, + "grad_norm": 2.0158286094665527, + "learning_rate": 4.864700314247093e-05, + "loss": 5.5019, + "step": 17690 + }, + { + "epoch": 0.10521338852412218, + "grad_norm": 1.5022921562194824, + "learning_rate": 4.8646851557284256e-05, + "loss": 5.2029, + "step": 17691 + }, + { + "epoch": 0.10521933580740318, + "grad_norm": 1.8164446353912354, + "learning_rate": 4.864669996384272e-05, + "loss": 4.9258, + "step": 17692 + }, + { + "epoch": 0.10522528309068417, + "grad_norm": 1.6789724826812744, + "learning_rate": 4.864654836214636e-05, + "loss": 5.0876, + "step": 17693 + }, + { + "epoch": 0.10523123037396517, + "grad_norm": 1.778971552848816, + "learning_rate": 4.864639675219523e-05, + "loss": 5.1052, + "step": 17694 + }, + { + "epoch": 0.10523717765724618, + "grad_norm": 1.2401436567306519, + "learning_rate": 4.8646245133989396e-05, + "loss": 5.2536, + "step": 17695 + }, + { + "epoch": 0.10524312494052716, + "grad_norm": 1.6509275436401367, + "learning_rate": 4.8646093507528904e-05, + "loss": 4.9215, + "step": 17696 + }, + { + "epoch": 0.10524907222380817, + "grad_norm": 1.3725727796554565, + "learning_rate": 4.864594187281379e-05, + "loss": 5.5578, + "step": 17697 + }, + { + "epoch": 0.10525501950708917, + "grad_norm": 1.481040358543396, + "learning_rate": 4.864579022984413e-05, + "loss": 5.4683, + "step": 17698 + }, + { + "epoch": 0.10526096679037016, + "grad_norm": 1.4682444334030151, + "learning_rate": 4.864563857861998e-05, + "loss": 5.5076, + "step": 17699 + }, + { + "epoch": 0.10526691407365116, + "grad_norm": 1.2660551071166992, + "learning_rate": 4.864548691914137e-05, + "loss": 5.6092, + "step": 17700 + }, + { + "epoch": 0.10527286135693216, + "grad_norm": 1.266858458518982, + "learning_rate": 4.8645335251408366e-05, + "loss": 5.4373, + "step": 17701 + }, + { + "epoch": 0.10527880864021315, + "grad_norm": 1.5075262784957886, + "learning_rate": 4.8645183575421024e-05, + "loss": 5.3651, + "step": 17702 + }, + { + "epoch": 0.10528475592349415, + "grad_norm": 1.6108607053756714, + "learning_rate": 4.864503189117939e-05, + "loss": 5.3372, + "step": 17703 + }, + { + "epoch": 0.10529070320677515, + "grad_norm": 1.677874207496643, + "learning_rate": 4.8644880198683515e-05, + "loss": 4.9378, + "step": 17704 + }, + { + "epoch": 0.10529665049005614, + "grad_norm": 1.5847524404525757, + "learning_rate": 4.864472849793346e-05, + "loss": 5.2918, + "step": 17705 + }, + { + "epoch": 0.10530259777333714, + "grad_norm": 1.598244309425354, + "learning_rate": 4.864457678892927e-05, + "loss": 5.2408, + "step": 17706 + }, + { + "epoch": 0.10530854505661813, + "grad_norm": 1.4147340059280396, + "learning_rate": 4.8644425071671015e-05, + "loss": 5.2856, + "step": 17707 + }, + { + "epoch": 0.10531449233989913, + "grad_norm": 1.6057299375534058, + "learning_rate": 4.8644273346158734e-05, + "loss": 5.343, + "step": 17708 + }, + { + "epoch": 0.10532043962318013, + "grad_norm": 1.3503344058990479, + "learning_rate": 4.864412161239247e-05, + "loss": 5.4081, + "step": 17709 + }, + { + "epoch": 0.10532638690646112, + "grad_norm": 1.8316742181777954, + "learning_rate": 4.8643969870372295e-05, + "loss": 4.7925, + "step": 17710 + }, + { + "epoch": 0.10533233418974212, + "grad_norm": 2.1429593563079834, + "learning_rate": 4.864381812009825e-05, + "loss": 4.3519, + "step": 17711 + }, + { + "epoch": 0.10533828147302313, + "grad_norm": 1.9665764570236206, + "learning_rate": 4.8643666361570396e-05, + "loss": 4.388, + "step": 17712 + }, + { + "epoch": 0.10534422875630411, + "grad_norm": 1.7851755619049072, + "learning_rate": 4.864351459478878e-05, + "loss": 4.5242, + "step": 17713 + }, + { + "epoch": 0.10535017603958512, + "grad_norm": 1.8347305059432983, + "learning_rate": 4.864336281975346e-05, + "loss": 4.166, + "step": 17714 + }, + { + "epoch": 0.10535612332286612, + "grad_norm": 1.9413511753082275, + "learning_rate": 4.864321103646449e-05, + "loss": 4.0937, + "step": 17715 + }, + { + "epoch": 0.1053620706061471, + "grad_norm": 1.8122237920761108, + "learning_rate": 4.8643059244921904e-05, + "loss": 4.3812, + "step": 17716 + }, + { + "epoch": 0.10536801788942811, + "grad_norm": 2.0114996433258057, + "learning_rate": 4.864290744512578e-05, + "loss": 4.0728, + "step": 17717 + }, + { + "epoch": 0.10537396517270911, + "grad_norm": 1.8565599918365479, + "learning_rate": 4.8642755637076165e-05, + "loss": 4.2625, + "step": 17718 + }, + { + "epoch": 0.1053799124559901, + "grad_norm": 1.9136046171188354, + "learning_rate": 4.8642603820773105e-05, + "loss": 4.4933, + "step": 17719 + }, + { + "epoch": 0.1053858597392711, + "grad_norm": 1.8930033445358276, + "learning_rate": 4.864245199621666e-05, + "loss": 4.3249, + "step": 17720 + }, + { + "epoch": 0.1053918070225521, + "grad_norm": 1.7729578018188477, + "learning_rate": 4.864230016340687e-05, + "loss": 4.4736, + "step": 17721 + }, + { + "epoch": 0.10539775430583309, + "grad_norm": 2.1663360595703125, + "learning_rate": 4.864214832234381e-05, + "loss": 4.7505, + "step": 17722 + }, + { + "epoch": 0.1054037015891141, + "grad_norm": 1.9864879846572876, + "learning_rate": 4.864199647302751e-05, + "loss": 4.7233, + "step": 17723 + }, + { + "epoch": 0.1054096488723951, + "grad_norm": 2.031329870223999, + "learning_rate": 4.8641844615458035e-05, + "loss": 4.8218, + "step": 17724 + }, + { + "epoch": 0.10541559615567608, + "grad_norm": 2.0325984954833984, + "learning_rate": 4.864169274963544e-05, + "loss": 4.9383, + "step": 17725 + }, + { + "epoch": 0.10542154343895709, + "grad_norm": 1.9482324123382568, + "learning_rate": 4.864154087555977e-05, + "loss": 5.0849, + "step": 17726 + }, + { + "epoch": 0.10542749072223809, + "grad_norm": 1.6887640953063965, + "learning_rate": 4.864138899323108e-05, + "loss": 5.0216, + "step": 17727 + }, + { + "epoch": 0.10543343800551908, + "grad_norm": 2.0226924419403076, + "learning_rate": 4.864123710264944e-05, + "loss": 4.9241, + "step": 17728 + }, + { + "epoch": 0.10543938528880008, + "grad_norm": 1.647629976272583, + "learning_rate": 4.8641085203814873e-05, + "loss": 5.0318, + "step": 17729 + }, + { + "epoch": 0.10544533257208108, + "grad_norm": 1.766290545463562, + "learning_rate": 4.864093329672745e-05, + "loss": 4.9034, + "step": 17730 + }, + { + "epoch": 0.10545127985536207, + "grad_norm": 1.7573658227920532, + "learning_rate": 4.864078138138723e-05, + "loss": 4.7783, + "step": 17731 + }, + { + "epoch": 0.10545722713864307, + "grad_norm": 1.5503767728805542, + "learning_rate": 4.864062945779425e-05, + "loss": 5.1085, + "step": 17732 + }, + { + "epoch": 0.10546317442192407, + "grad_norm": 1.7276320457458496, + "learning_rate": 4.864047752594857e-05, + "loss": 4.8028, + "step": 17733 + }, + { + "epoch": 0.10546912170520506, + "grad_norm": 1.9654134511947632, + "learning_rate": 4.864032558585024e-05, + "loss": 5.1221, + "step": 17734 + }, + { + "epoch": 0.10547506898848606, + "grad_norm": 1.9654512405395508, + "learning_rate": 4.864017363749933e-05, + "loss": 5.0463, + "step": 17735 + }, + { + "epoch": 0.10548101627176705, + "grad_norm": 1.9071869850158691, + "learning_rate": 4.864002168089587e-05, + "loss": 5.0822, + "step": 17736 + }, + { + "epoch": 0.10548696355504805, + "grad_norm": 2.4190056324005127, + "learning_rate": 4.863986971603993e-05, + "loss": 5.7404, + "step": 17737 + }, + { + "epoch": 0.10549291083832905, + "grad_norm": 2.2098371982574463, + "learning_rate": 4.863971774293155e-05, + "loss": 5.9282, + "step": 17738 + }, + { + "epoch": 0.10549885812161004, + "grad_norm": 2.569831132888794, + "learning_rate": 4.8639565761570784e-05, + "loss": 4.3309, + "step": 17739 + }, + { + "epoch": 0.10550480540489104, + "grad_norm": 2.252847909927368, + "learning_rate": 4.8639413771957696e-05, + "loss": 4.185, + "step": 17740 + }, + { + "epoch": 0.10551075268817205, + "grad_norm": 2.3022215366363525, + "learning_rate": 4.8639261774092325e-05, + "loss": 4.3537, + "step": 17741 + }, + { + "epoch": 0.10551669997145303, + "grad_norm": 2.2695138454437256, + "learning_rate": 4.8639109767974745e-05, + "loss": 3.9806, + "step": 17742 + }, + { + "epoch": 0.10552264725473404, + "grad_norm": 2.1722588539123535, + "learning_rate": 4.8638957753604985e-05, + "loss": 3.9803, + "step": 17743 + }, + { + "epoch": 0.10552859453801504, + "grad_norm": 2.4385933876037598, + "learning_rate": 4.863880573098312e-05, + "loss": 4.0148, + "step": 17744 + }, + { + "epoch": 0.10553454182129603, + "grad_norm": 2.3186235427856445, + "learning_rate": 4.8638653700109184e-05, + "loss": 3.979, + "step": 17745 + }, + { + "epoch": 0.10554048910457703, + "grad_norm": 2.4591264724731445, + "learning_rate": 4.863850166098324e-05, + "loss": 3.9258, + "step": 17746 + }, + { + "epoch": 0.10554643638785803, + "grad_norm": 2.2619590759277344, + "learning_rate": 4.8638349613605336e-05, + "loss": 4.0571, + "step": 17747 + }, + { + "epoch": 0.10555238367113902, + "grad_norm": 2.393226146697998, + "learning_rate": 4.863819755797553e-05, + "loss": 4.0036, + "step": 17748 + }, + { + "epoch": 0.10555833095442002, + "grad_norm": 2.281846046447754, + "learning_rate": 4.8638045494093875e-05, + "loss": 3.9382, + "step": 17749 + }, + { + "epoch": 0.10556427823770102, + "grad_norm": 2.165407657623291, + "learning_rate": 4.8637893421960425e-05, + "loss": 4.0204, + "step": 17750 + }, + { + "epoch": 0.10557022552098201, + "grad_norm": 2.131829261779785, + "learning_rate": 4.863774134157523e-05, + "loss": 4.8661, + "step": 17751 + }, + { + "epoch": 0.10557617280426301, + "grad_norm": 2.0619029998779297, + "learning_rate": 4.863758925293834e-05, + "loss": 5.5522, + "step": 17752 + }, + { + "epoch": 0.10558212008754402, + "grad_norm": 1.6535427570343018, + "learning_rate": 4.863743715604981e-05, + "loss": 5.3463, + "step": 17753 + }, + { + "epoch": 0.105588067370825, + "grad_norm": 1.903904676437378, + "learning_rate": 4.86372850509097e-05, + "loss": 5.7202, + "step": 17754 + }, + { + "epoch": 0.105594014654106, + "grad_norm": 1.649357557296753, + "learning_rate": 4.863713293751806e-05, + "loss": 5.577, + "step": 17755 + }, + { + "epoch": 0.10559996193738701, + "grad_norm": 2.0812721252441406, + "learning_rate": 4.8636980815874936e-05, + "loss": 5.3164, + "step": 17756 + }, + { + "epoch": 0.105605909220668, + "grad_norm": 2.312357187271118, + "learning_rate": 4.8636828685980384e-05, + "loss": 5.3018, + "step": 17757 + }, + { + "epoch": 0.105611856503949, + "grad_norm": 2.1815388202667236, + "learning_rate": 4.863667654783447e-05, + "loss": 5.1509, + "step": 17758 + }, + { + "epoch": 0.10561780378723, + "grad_norm": 1.7500512599945068, + "learning_rate": 4.8636524401437225e-05, + "loss": 5.492, + "step": 17759 + }, + { + "epoch": 0.10562375107051099, + "grad_norm": 1.6850415468215942, + "learning_rate": 4.863637224678872e-05, + "loss": 5.5086, + "step": 17760 + }, + { + "epoch": 0.10562969835379199, + "grad_norm": 1.7222185134887695, + "learning_rate": 4.8636220083889e-05, + "loss": 5.4139, + "step": 17761 + }, + { + "epoch": 0.10563564563707299, + "grad_norm": 1.627914309501648, + "learning_rate": 4.8636067912738116e-05, + "loss": 5.5763, + "step": 17762 + }, + { + "epoch": 0.10564159292035398, + "grad_norm": 1.5884100198745728, + "learning_rate": 4.863591573333613e-05, + "loss": 5.544, + "step": 17763 + }, + { + "epoch": 0.10564754020363498, + "grad_norm": 1.4660178422927856, + "learning_rate": 4.8635763545683085e-05, + "loss": 5.4913, + "step": 17764 + }, + { + "epoch": 0.10565348748691597, + "grad_norm": 1.5240764617919922, + "learning_rate": 4.863561134977904e-05, + "loss": 5.4757, + "step": 17765 + }, + { + "epoch": 0.10565943477019697, + "grad_norm": 1.3686332702636719, + "learning_rate": 4.863545914562406e-05, + "loss": 5.4934, + "step": 17766 + }, + { + "epoch": 0.10566538205347797, + "grad_norm": 1.5429164171218872, + "learning_rate": 4.863530693321817e-05, + "loss": 5.3654, + "step": 17767 + }, + { + "epoch": 0.10567132933675896, + "grad_norm": 1.4237322807312012, + "learning_rate": 4.863515471256145e-05, + "loss": 5.4128, + "step": 17768 + }, + { + "epoch": 0.10567727662003996, + "grad_norm": 1.6438677310943604, + "learning_rate": 4.863500248365393e-05, + "loss": 5.3129, + "step": 17769 + }, + { + "epoch": 0.10568322390332097, + "grad_norm": 1.9208921194076538, + "learning_rate": 4.8634850246495675e-05, + "loss": 5.4889, + "step": 17770 + }, + { + "epoch": 0.10568917118660195, + "grad_norm": 1.6967288255691528, + "learning_rate": 4.863469800108675e-05, + "loss": 5.5301, + "step": 17771 + }, + { + "epoch": 0.10569511846988296, + "grad_norm": 1.5820802450180054, + "learning_rate": 4.8634545747427185e-05, + "loss": 5.4126, + "step": 17772 + }, + { + "epoch": 0.10570106575316396, + "grad_norm": 1.8280025720596313, + "learning_rate": 4.8634393485517046e-05, + "loss": 6.1201, + "step": 17773 + }, + { + "epoch": 0.10570701303644495, + "grad_norm": 1.809193730354309, + "learning_rate": 4.8634241215356394e-05, + "loss": 5.4123, + "step": 17774 + }, + { + "epoch": 0.10571296031972595, + "grad_norm": 1.596528172492981, + "learning_rate": 4.863408893694527e-05, + "loss": 5.6865, + "step": 17775 + }, + { + "epoch": 0.10571890760300695, + "grad_norm": 1.7726397514343262, + "learning_rate": 4.8633936650283715e-05, + "loss": 5.7298, + "step": 17776 + }, + { + "epoch": 0.10572485488628794, + "grad_norm": 1.5804529190063477, + "learning_rate": 4.863378435537182e-05, + "loss": 5.6051, + "step": 17777 + }, + { + "epoch": 0.10573080216956894, + "grad_norm": 1.5244919061660767, + "learning_rate": 4.8633632052209595e-05, + "loss": 5.7402, + "step": 17778 + }, + { + "epoch": 0.10573674945284994, + "grad_norm": 1.5003318786621094, + "learning_rate": 4.8633479740797117e-05, + "loss": 5.6978, + "step": 17779 + }, + { + "epoch": 0.10574269673613093, + "grad_norm": 1.7325289249420166, + "learning_rate": 4.863332742113444e-05, + "loss": 5.8616, + "step": 17780 + }, + { + "epoch": 0.10574864401941193, + "grad_norm": 1.8214267492294312, + "learning_rate": 4.863317509322161e-05, + "loss": 5.9213, + "step": 17781 + }, + { + "epoch": 0.10575459130269294, + "grad_norm": 1.7067787647247314, + "learning_rate": 4.863302275705869e-05, + "loss": 5.5518, + "step": 17782 + }, + { + "epoch": 0.10576053858597392, + "grad_norm": 1.8018234968185425, + "learning_rate": 4.863287041264571e-05, + "loss": 5.5241, + "step": 17783 + }, + { + "epoch": 0.10576648586925493, + "grad_norm": 1.7645032405853271, + "learning_rate": 4.863271805998275e-05, + "loss": 5.6471, + "step": 17784 + }, + { + "epoch": 0.10577243315253593, + "grad_norm": 1.6891655921936035, + "learning_rate": 4.8632565699069854e-05, + "loss": 5.9138, + "step": 17785 + }, + { + "epoch": 0.10577838043581692, + "grad_norm": 1.6546204090118408, + "learning_rate": 4.8632413329907076e-05, + "loss": 5.8511, + "step": 17786 + }, + { + "epoch": 0.10578432771909792, + "grad_norm": 1.864680528640747, + "learning_rate": 4.863226095249446e-05, + "loss": 5.7665, + "step": 17787 + }, + { + "epoch": 0.10579027500237892, + "grad_norm": 1.9052486419677734, + "learning_rate": 4.863210856683207e-05, + "loss": 5.6528, + "step": 17788 + }, + { + "epoch": 0.10579622228565991, + "grad_norm": 2.212982416152954, + "learning_rate": 4.8631956172919944e-05, + "loss": 5.2294, + "step": 17789 + }, + { + "epoch": 0.10580216956894091, + "grad_norm": 2.0703213214874268, + "learning_rate": 4.863180377075816e-05, + "loss": 4.9963, + "step": 17790 + }, + { + "epoch": 0.10580811685222191, + "grad_norm": 2.1718661785125732, + "learning_rate": 4.863165136034675e-05, + "loss": 5.1047, + "step": 17791 + }, + { + "epoch": 0.1058140641355029, + "grad_norm": 2.2078070640563965, + "learning_rate": 4.8631498941685774e-05, + "loss": 5.2682, + "step": 17792 + }, + { + "epoch": 0.1058200114187839, + "grad_norm": 2.187614917755127, + "learning_rate": 4.863134651477529e-05, + "loss": 4.9008, + "step": 17793 + }, + { + "epoch": 0.10582595870206489, + "grad_norm": 1.7202839851379395, + "learning_rate": 4.863119407961535e-05, + "loss": 5.1006, + "step": 17794 + }, + { + "epoch": 0.10583190598534589, + "grad_norm": 2.3109450340270996, + "learning_rate": 4.8631041636206e-05, + "loss": 4.8489, + "step": 17795 + }, + { + "epoch": 0.1058378532686269, + "grad_norm": 2.2688632011413574, + "learning_rate": 4.8630889184547295e-05, + "loss": 4.953, + "step": 17796 + }, + { + "epoch": 0.10584380055190788, + "grad_norm": 2.0636980533599854, + "learning_rate": 4.863073672463929e-05, + "loss": 4.9537, + "step": 17797 + }, + { + "epoch": 0.10584974783518888, + "grad_norm": 1.9752720594406128, + "learning_rate": 4.863058425648205e-05, + "loss": 4.8646, + "step": 17798 + }, + { + "epoch": 0.10585569511846989, + "grad_norm": 1.9784966707229614, + "learning_rate": 4.86304317800756e-05, + "loss": 5.1245, + "step": 17799 + }, + { + "epoch": 0.10586164240175087, + "grad_norm": 1.812218427658081, + "learning_rate": 4.863027929542002e-05, + "loss": 5.4367, + "step": 17800 + }, + { + "epoch": 0.10586758968503188, + "grad_norm": 1.8048956394195557, + "learning_rate": 4.863012680251536e-05, + "loss": 5.6052, + "step": 17801 + }, + { + "epoch": 0.10587353696831288, + "grad_norm": 1.9246432781219482, + "learning_rate": 4.862997430136166e-05, + "loss": 5.9335, + "step": 17802 + }, + { + "epoch": 0.10587948425159387, + "grad_norm": 1.5138533115386963, + "learning_rate": 4.862982179195897e-05, + "loss": 5.8785, + "step": 17803 + }, + { + "epoch": 0.10588543153487487, + "grad_norm": 1.4948742389678955, + "learning_rate": 4.862966927430737e-05, + "loss": 5.7478, + "step": 17804 + }, + { + "epoch": 0.10589137881815587, + "grad_norm": 1.4670746326446533, + "learning_rate": 4.862951674840689e-05, + "loss": 5.7397, + "step": 17805 + }, + { + "epoch": 0.10589732610143686, + "grad_norm": 1.4234925508499146, + "learning_rate": 4.862936421425759e-05, + "loss": 5.9919, + "step": 17806 + }, + { + "epoch": 0.10590327338471786, + "grad_norm": 1.8313277959823608, + "learning_rate": 4.862921167185953e-05, + "loss": 5.7289, + "step": 17807 + }, + { + "epoch": 0.10590922066799886, + "grad_norm": 1.7373311519622803, + "learning_rate": 4.8629059121212745e-05, + "loss": 5.7652, + "step": 17808 + }, + { + "epoch": 0.10591516795127985, + "grad_norm": 1.7706129550933838, + "learning_rate": 4.86289065623173e-05, + "loss": 5.4623, + "step": 17809 + }, + { + "epoch": 0.10592111523456085, + "grad_norm": 1.7332470417022705, + "learning_rate": 4.862875399517325e-05, + "loss": 5.5546, + "step": 17810 + }, + { + "epoch": 0.10592706251784186, + "grad_norm": 1.7493473291397095, + "learning_rate": 4.862860141978065e-05, + "loss": 5.2762, + "step": 17811 + }, + { + "epoch": 0.10593300980112284, + "grad_norm": 1.8064602613449097, + "learning_rate": 4.862844883613955e-05, + "loss": 5.2969, + "step": 17812 + }, + { + "epoch": 0.10593895708440385, + "grad_norm": 1.6318674087524414, + "learning_rate": 4.862829624425e-05, + "loss": 5.3229, + "step": 17813 + }, + { + "epoch": 0.10594490436768485, + "grad_norm": 1.7438777685165405, + "learning_rate": 4.8628143644112056e-05, + "loss": 5.3167, + "step": 17814 + }, + { + "epoch": 0.10595085165096584, + "grad_norm": 1.8095386028289795, + "learning_rate": 4.8627991035725774e-05, + "loss": 5.2744, + "step": 17815 + }, + { + "epoch": 0.10595679893424684, + "grad_norm": 1.8095691204071045, + "learning_rate": 4.86278384190912e-05, + "loss": 5.5105, + "step": 17816 + }, + { + "epoch": 0.10596274621752784, + "grad_norm": 1.858776569366455, + "learning_rate": 4.862768579420839e-05, + "loss": 5.4338, + "step": 17817 + }, + { + "epoch": 0.10596869350080883, + "grad_norm": 1.8224806785583496, + "learning_rate": 4.86275331610774e-05, + "loss": 5.6273, + "step": 17818 + }, + { + "epoch": 0.10597464078408983, + "grad_norm": 1.6850696802139282, + "learning_rate": 4.8627380519698284e-05, + "loss": 5.9963, + "step": 17819 + }, + { + "epoch": 0.10598058806737083, + "grad_norm": 1.4804600477218628, + "learning_rate": 4.86272278700711e-05, + "loss": 5.726, + "step": 17820 + }, + { + "epoch": 0.10598653535065182, + "grad_norm": 1.721027135848999, + "learning_rate": 4.862707521219589e-05, + "loss": 5.191, + "step": 17821 + }, + { + "epoch": 0.10599248263393282, + "grad_norm": 1.8109691143035889, + "learning_rate": 4.862692254607271e-05, + "loss": 4.926, + "step": 17822 + }, + { + "epoch": 0.10599842991721381, + "grad_norm": 1.7531434297561646, + "learning_rate": 4.862676987170162e-05, + "loss": 5.0376, + "step": 17823 + }, + { + "epoch": 0.10600437720049481, + "grad_norm": 1.6847648620605469, + "learning_rate": 4.8626617189082656e-05, + "loss": 5.0376, + "step": 17824 + }, + { + "epoch": 0.10601032448377581, + "grad_norm": 1.6512411832809448, + "learning_rate": 4.86264644982159e-05, + "loss": 5.087, + "step": 17825 + }, + { + "epoch": 0.1060162717670568, + "grad_norm": 1.6410924196243286, + "learning_rate": 4.8626311799101375e-05, + "loss": 5.6917, + "step": 17826 + }, + { + "epoch": 0.1060222190503378, + "grad_norm": 2.1565957069396973, + "learning_rate": 4.862615909173916e-05, + "loss": 4.619, + "step": 17827 + }, + { + "epoch": 0.1060281663336188, + "grad_norm": 1.8235310316085815, + "learning_rate": 4.86260063761293e-05, + "loss": 5.1155, + "step": 17828 + }, + { + "epoch": 0.1060341136168998, + "grad_norm": 1.7710633277893066, + "learning_rate": 4.862585365227184e-05, + "loss": 4.7845, + "step": 17829 + }, + { + "epoch": 0.1060400609001808, + "grad_norm": 2.174832820892334, + "learning_rate": 4.862570092016683e-05, + "loss": 4.6384, + "step": 17830 + }, + { + "epoch": 0.1060460081834618, + "grad_norm": 2.359682321548462, + "learning_rate": 4.862554817981434e-05, + "loss": 4.2191, + "step": 17831 + }, + { + "epoch": 0.10605195546674279, + "grad_norm": 2.4251585006713867, + "learning_rate": 4.8625395431214414e-05, + "loss": 4.0982, + "step": 17832 + }, + { + "epoch": 0.10605790275002379, + "grad_norm": 2.543009042739868, + "learning_rate": 4.86252426743671e-05, + "loss": 4.0773, + "step": 17833 + }, + { + "epoch": 0.10606385003330479, + "grad_norm": 2.6991419792175293, + "learning_rate": 4.862508990927247e-05, + "loss": 4.0209, + "step": 17834 + }, + { + "epoch": 0.10606979731658578, + "grad_norm": 2.354445695877075, + "learning_rate": 4.862493713593056e-05, + "loss": 3.9223, + "step": 17835 + }, + { + "epoch": 0.10607574459986678, + "grad_norm": 2.5119223594665527, + "learning_rate": 4.8624784354341426e-05, + "loss": 3.9006, + "step": 17836 + }, + { + "epoch": 0.10608169188314778, + "grad_norm": 2.717792272567749, + "learning_rate": 4.862463156450513e-05, + "loss": 4.3295, + "step": 17837 + }, + { + "epoch": 0.10608763916642877, + "grad_norm": 3.1779162883758545, + "learning_rate": 4.862447876642171e-05, + "loss": 4.3483, + "step": 17838 + }, + { + "epoch": 0.10609358644970977, + "grad_norm": 2.272994041442871, + "learning_rate": 4.8624325960091235e-05, + "loss": 4.2826, + "step": 17839 + }, + { + "epoch": 0.10609953373299078, + "grad_norm": 2.4689860343933105, + "learning_rate": 4.862417314551375e-05, + "loss": 4.9144, + "step": 17840 + }, + { + "epoch": 0.10610548101627176, + "grad_norm": 1.8101458549499512, + "learning_rate": 4.862402032268931e-05, + "loss": 5.9325, + "step": 17841 + }, + { + "epoch": 0.10611142829955277, + "grad_norm": 1.9994734525680542, + "learning_rate": 4.862386749161797e-05, + "loss": 5.5438, + "step": 17842 + }, + { + "epoch": 0.10611737558283377, + "grad_norm": 2.5475401878356934, + "learning_rate": 4.8623714652299786e-05, + "loss": 5.2262, + "step": 17843 + }, + { + "epoch": 0.10612332286611476, + "grad_norm": 2.286040782928467, + "learning_rate": 4.86235618047348e-05, + "loss": 5.065, + "step": 17844 + }, + { + "epoch": 0.10612927014939576, + "grad_norm": 1.788761854171753, + "learning_rate": 4.862340894892308e-05, + "loss": 5.5053, + "step": 17845 + }, + { + "epoch": 0.10613521743267676, + "grad_norm": 2.2951841354370117, + "learning_rate": 4.8623256084864663e-05, + "loss": 5.1262, + "step": 17846 + }, + { + "epoch": 0.10614116471595775, + "grad_norm": 1.962814211845398, + "learning_rate": 4.862310321255962e-05, + "loss": 5.8084, + "step": 17847 + }, + { + "epoch": 0.10614711199923875, + "grad_norm": 1.7888414859771729, + "learning_rate": 4.862295033200799e-05, + "loss": 5.2409, + "step": 17848 + }, + { + "epoch": 0.10615305928251975, + "grad_norm": 1.7108670473098755, + "learning_rate": 4.862279744320983e-05, + "loss": 5.6138, + "step": 17849 + }, + { + "epoch": 0.10615900656580074, + "grad_norm": 1.7636443376541138, + "learning_rate": 4.8622644546165196e-05, + "loss": 5.5664, + "step": 17850 + }, + { + "epoch": 0.10616495384908174, + "grad_norm": 1.7193186283111572, + "learning_rate": 4.8622491640874147e-05, + "loss": 5.7852, + "step": 17851 + }, + { + "epoch": 0.10617090113236273, + "grad_norm": 1.817215919494629, + "learning_rate": 4.8622338727336723e-05, + "loss": 5.5478, + "step": 17852 + }, + { + "epoch": 0.10617684841564373, + "grad_norm": 1.547817349433899, + "learning_rate": 4.8622185805552994e-05, + "loss": 5.5249, + "step": 17853 + }, + { + "epoch": 0.10618279569892473, + "grad_norm": 1.577528953552246, + "learning_rate": 4.862203287552299e-05, + "loss": 5.7268, + "step": 17854 + }, + { + "epoch": 0.10618874298220572, + "grad_norm": 1.4524853229522705, + "learning_rate": 4.862187993724679e-05, + "loss": 5.8539, + "step": 17855 + }, + { + "epoch": 0.10619469026548672, + "grad_norm": 1.6361198425292969, + "learning_rate": 4.8621726990724437e-05, + "loss": 5.0815, + "step": 17856 + }, + { + "epoch": 0.10620063754876773, + "grad_norm": 1.65043044090271, + "learning_rate": 4.862157403595598e-05, + "loss": 5.1938, + "step": 17857 + }, + { + "epoch": 0.10620658483204871, + "grad_norm": 1.6236746311187744, + "learning_rate": 4.8621421072941476e-05, + "loss": 5.5602, + "step": 17858 + }, + { + "epoch": 0.10621253211532972, + "grad_norm": 1.4648228883743286, + "learning_rate": 4.862126810168097e-05, + "loss": 5.3728, + "step": 17859 + }, + { + "epoch": 0.10621847939861072, + "grad_norm": 1.4803123474121094, + "learning_rate": 4.862111512217453e-05, + "loss": 5.58, + "step": 17860 + }, + { + "epoch": 0.1062244266818917, + "grad_norm": 1.320387840270996, + "learning_rate": 4.862096213442221e-05, + "loss": 5.0337, + "step": 17861 + }, + { + "epoch": 0.10623037396517271, + "grad_norm": 1.8309158086776733, + "learning_rate": 4.862080913842405e-05, + "loss": 4.3603, + "step": 17862 + }, + { + "epoch": 0.10623632124845371, + "grad_norm": 1.79231595993042, + "learning_rate": 4.86206561341801e-05, + "loss": 4.401, + "step": 17863 + }, + { + "epoch": 0.1062422685317347, + "grad_norm": 1.7894480228424072, + "learning_rate": 4.862050312169043e-05, + "loss": 4.4592, + "step": 17864 + }, + { + "epoch": 0.1062482158150157, + "grad_norm": 1.8271396160125732, + "learning_rate": 4.8620350100955095e-05, + "loss": 4.2442, + "step": 17865 + }, + { + "epoch": 0.1062541630982967, + "grad_norm": 2.03336238861084, + "learning_rate": 4.862019707197413e-05, + "loss": 4.6245, + "step": 17866 + }, + { + "epoch": 0.10626011038157769, + "grad_norm": 1.8034088611602783, + "learning_rate": 4.86200440347476e-05, + "loss": 4.5798, + "step": 17867 + }, + { + "epoch": 0.10626605766485869, + "grad_norm": 1.366013765335083, + "learning_rate": 4.861989098927556e-05, + "loss": 5.2409, + "step": 17868 + }, + { + "epoch": 0.1062720049481397, + "grad_norm": 1.603281855583191, + "learning_rate": 4.8619737935558054e-05, + "loss": 5.6699, + "step": 17869 + }, + { + "epoch": 0.10627795223142068, + "grad_norm": 1.6720329523086548, + "learning_rate": 4.861958487359515e-05, + "loss": 5.2162, + "step": 17870 + }, + { + "epoch": 0.10628389951470169, + "grad_norm": 2.5577762126922607, + "learning_rate": 4.861943180338689e-05, + "loss": 3.9116, + "step": 17871 + }, + { + "epoch": 0.10628984679798269, + "grad_norm": 2.6489310264587402, + "learning_rate": 4.861927872493332e-05, + "loss": 4.232, + "step": 17872 + }, + { + "epoch": 0.10629579408126368, + "grad_norm": 2.481381893157959, + "learning_rate": 4.861912563823451e-05, + "loss": 4.374, + "step": 17873 + }, + { + "epoch": 0.10630174136454468, + "grad_norm": 2.444721221923828, + "learning_rate": 4.861897254329052e-05, + "loss": 4.504, + "step": 17874 + }, + { + "epoch": 0.10630768864782568, + "grad_norm": 2.529085636138916, + "learning_rate": 4.8618819440101373e-05, + "loss": 4.1305, + "step": 17875 + }, + { + "epoch": 0.10631363593110667, + "grad_norm": 3.966379404067993, + "learning_rate": 4.861866632866715e-05, + "loss": 3.9104, + "step": 17876 + }, + { + "epoch": 0.10631958321438767, + "grad_norm": 2.408405065536499, + "learning_rate": 4.8618513208987895e-05, + "loss": 3.8762, + "step": 17877 + }, + { + "epoch": 0.10632553049766867, + "grad_norm": 2.41780686378479, + "learning_rate": 4.8618360081063654e-05, + "loss": 3.7665, + "step": 17878 + }, + { + "epoch": 0.10633147778094966, + "grad_norm": 2.60262393951416, + "learning_rate": 4.861820694489448e-05, + "loss": 4.067, + "step": 17879 + }, + { + "epoch": 0.10633742506423066, + "grad_norm": 2.624938726425171, + "learning_rate": 4.8618053800480456e-05, + "loss": 4.5653, + "step": 17880 + }, + { + "epoch": 0.10634337234751165, + "grad_norm": 2.783202886581421, + "learning_rate": 4.86179006478216e-05, + "loss": 4.4091, + "step": 17881 + }, + { + "epoch": 0.10634931963079265, + "grad_norm": 2.8269615173339844, + "learning_rate": 4.861774748691798e-05, + "loss": 3.949, + "step": 17882 + }, + { + "epoch": 0.10635526691407365, + "grad_norm": 2.82108998298645, + "learning_rate": 4.861759431776965e-05, + "loss": 3.8479, + "step": 17883 + }, + { + "epoch": 0.10636121419735464, + "grad_norm": 2.8543620109558105, + "learning_rate": 4.861744114037666e-05, + "loss": 3.4358, + "step": 17884 + }, + { + "epoch": 0.10636716148063564, + "grad_norm": 2.6492035388946533, + "learning_rate": 4.861728795473907e-05, + "loss": 3.6298, + "step": 17885 + }, + { + "epoch": 0.10637310876391665, + "grad_norm": 2.834181785583496, + "learning_rate": 4.861713476085693e-05, + "loss": 3.4125, + "step": 17886 + }, + { + "epoch": 0.10637905604719763, + "grad_norm": 3.447075605392456, + "learning_rate": 4.861698155873028e-05, + "loss": 3.5416, + "step": 17887 + }, + { + "epoch": 0.10638500333047864, + "grad_norm": 3.6009531021118164, + "learning_rate": 4.86168283483592e-05, + "loss": 4.1912, + "step": 17888 + }, + { + "epoch": 0.10639095061375964, + "grad_norm": 4.086645126342773, + "learning_rate": 4.861667512974372e-05, + "loss": 4.3999, + "step": 17889 + }, + { + "epoch": 0.10639689789704063, + "grad_norm": 3.673405408859253, + "learning_rate": 4.86165219028839e-05, + "loss": 4.3731, + "step": 17890 + }, + { + "epoch": 0.10640284518032163, + "grad_norm": 2.2896664142608643, + "learning_rate": 4.861636866777981e-05, + "loss": 5.5963, + "step": 17891 + }, + { + "epoch": 0.10640879246360263, + "grad_norm": 2.0481069087982178, + "learning_rate": 4.861621542443148e-05, + "loss": 5.7909, + "step": 17892 + }, + { + "epoch": 0.10641473974688362, + "grad_norm": 1.9108741283416748, + "learning_rate": 4.861606217283897e-05, + "loss": 5.3044, + "step": 17893 + }, + { + "epoch": 0.10642068703016462, + "grad_norm": 1.7842040061950684, + "learning_rate": 4.861590891300235e-05, + "loss": 5.3071, + "step": 17894 + }, + { + "epoch": 0.10642663431344562, + "grad_norm": 1.854777455329895, + "learning_rate": 4.861575564492164e-05, + "loss": 5.386, + "step": 17895 + }, + { + "epoch": 0.10643258159672661, + "grad_norm": 1.7286109924316406, + "learning_rate": 4.861560236859693e-05, + "loss": 5.5609, + "step": 17896 + }, + { + "epoch": 0.10643852888000761, + "grad_norm": 1.709408164024353, + "learning_rate": 4.861544908402825e-05, + "loss": 5.6772, + "step": 17897 + }, + { + "epoch": 0.10644447616328861, + "grad_norm": 1.9251428842544556, + "learning_rate": 4.861529579121567e-05, + "loss": 5.6114, + "step": 17898 + }, + { + "epoch": 0.1064504234465696, + "grad_norm": 1.6568808555603027, + "learning_rate": 4.8615142490159226e-05, + "loss": 5.4648, + "step": 17899 + }, + { + "epoch": 0.1064563707298506, + "grad_norm": 1.7793960571289062, + "learning_rate": 4.861498918085898e-05, + "loss": 5.4987, + "step": 17900 + }, + { + "epoch": 0.10646231801313161, + "grad_norm": 1.9044899940490723, + "learning_rate": 4.861483586331499e-05, + "loss": 5.7757, + "step": 17901 + }, + { + "epoch": 0.1064682652964126, + "grad_norm": 2.215278387069702, + "learning_rate": 4.86146825375273e-05, + "loss": 6.2767, + "step": 17902 + }, + { + "epoch": 0.1064742125796936, + "grad_norm": 1.8699604272842407, + "learning_rate": 4.861452920349597e-05, + "loss": 6.2987, + "step": 17903 + }, + { + "epoch": 0.1064801598629746, + "grad_norm": 1.634887456893921, + "learning_rate": 4.861437586122105e-05, + "loss": 6.2596, + "step": 17904 + }, + { + "epoch": 0.10648610714625559, + "grad_norm": 1.54149329662323, + "learning_rate": 4.86142225107026e-05, + "loss": 6.1988, + "step": 17905 + }, + { + "epoch": 0.10649205442953659, + "grad_norm": 1.5954409837722778, + "learning_rate": 4.861406915194067e-05, + "loss": 6.1052, + "step": 17906 + }, + { + "epoch": 0.10649800171281759, + "grad_norm": 1.8810808658599854, + "learning_rate": 4.86139157849353e-05, + "loss": 6.0318, + "step": 17907 + }, + { + "epoch": 0.10650394899609858, + "grad_norm": 1.4983458518981934, + "learning_rate": 4.861376240968656e-05, + "loss": 5.8614, + "step": 17908 + }, + { + "epoch": 0.10650989627937958, + "grad_norm": 1.5446088314056396, + "learning_rate": 4.8613609026194504e-05, + "loss": 5.623, + "step": 17909 + }, + { + "epoch": 0.10651584356266057, + "grad_norm": 1.7121042013168335, + "learning_rate": 4.861345563445918e-05, + "loss": 4.9258, + "step": 17910 + }, + { + "epoch": 0.10652179084594157, + "grad_norm": 2.002478837966919, + "learning_rate": 4.861330223448065e-05, + "loss": 5.285, + "step": 17911 + }, + { + "epoch": 0.10652773812922257, + "grad_norm": 1.7703490257263184, + "learning_rate": 4.8613148826258944e-05, + "loss": 5.2279, + "step": 17912 + }, + { + "epoch": 0.10653368541250356, + "grad_norm": 1.7763222455978394, + "learning_rate": 4.861299540979415e-05, + "loss": 4.8737, + "step": 17913 + }, + { + "epoch": 0.10653963269578456, + "grad_norm": 1.5921473503112793, + "learning_rate": 4.8612841985086296e-05, + "loss": 5.3756, + "step": 17914 + }, + { + "epoch": 0.10654557997906557, + "grad_norm": 1.810085654258728, + "learning_rate": 4.8612688552135435e-05, + "loss": 5.3784, + "step": 17915 + }, + { + "epoch": 0.10655152726234655, + "grad_norm": 2.2289364337921143, + "learning_rate": 4.8612535110941636e-05, + "loss": 5.0258, + "step": 17916 + }, + { + "epoch": 0.10655747454562756, + "grad_norm": 1.9337642192840576, + "learning_rate": 4.8612381661504946e-05, + "loss": 4.9943, + "step": 17917 + }, + { + "epoch": 0.10656342182890856, + "grad_norm": 1.5772477388381958, + "learning_rate": 4.861222820382542e-05, + "loss": 5.1188, + "step": 17918 + }, + { + "epoch": 0.10656936911218955, + "grad_norm": 1.6176950931549072, + "learning_rate": 4.8612074737903097e-05, + "loss": 5.0973, + "step": 17919 + }, + { + "epoch": 0.10657531639547055, + "grad_norm": 1.7878233194351196, + "learning_rate": 4.8611921263738045e-05, + "loss": 5.0342, + "step": 17920 + }, + { + "epoch": 0.10658126367875155, + "grad_norm": 1.7473089694976807, + "learning_rate": 4.861176778133033e-05, + "loss": 5.2844, + "step": 17921 + }, + { + "epoch": 0.10658721096203254, + "grad_norm": 2.472464084625244, + "learning_rate": 4.8611614290679975e-05, + "loss": 4.9654, + "step": 17922 + }, + { + "epoch": 0.10659315824531354, + "grad_norm": 2.5256218910217285, + "learning_rate": 4.861146079178706e-05, + "loss": 4.7885, + "step": 17923 + }, + { + "epoch": 0.10659910552859454, + "grad_norm": 2.2665674686431885, + "learning_rate": 4.861130728465162e-05, + "loss": 5.0838, + "step": 17924 + }, + { + "epoch": 0.10660505281187553, + "grad_norm": 1.6795161962509155, + "learning_rate": 4.861115376927372e-05, + "loss": 5.3174, + "step": 17925 + }, + { + "epoch": 0.10661100009515653, + "grad_norm": 1.5786751508712769, + "learning_rate": 4.8611000245653405e-05, + "loss": 5.1831, + "step": 17926 + }, + { + "epoch": 0.10661694737843753, + "grad_norm": 2.0238442420959473, + "learning_rate": 4.861084671379074e-05, + "loss": 5.7967, + "step": 17927 + }, + { + "epoch": 0.10662289466171852, + "grad_norm": 1.5760328769683838, + "learning_rate": 4.861069317368577e-05, + "loss": 5.5692, + "step": 17928 + }, + { + "epoch": 0.10662884194499953, + "grad_norm": 1.7190479040145874, + "learning_rate": 4.861053962533855e-05, + "loss": 5.4248, + "step": 17929 + }, + { + "epoch": 0.10663478922828053, + "grad_norm": 1.987444519996643, + "learning_rate": 4.861038606874914e-05, + "loss": 5.3845, + "step": 17930 + }, + { + "epoch": 0.10664073651156152, + "grad_norm": 2.3603975772857666, + "learning_rate": 4.8610232503917585e-05, + "loss": 4.9948, + "step": 17931 + }, + { + "epoch": 0.10664668379484252, + "grad_norm": 2.560696601867676, + "learning_rate": 4.861007893084394e-05, + "loss": 4.797, + "step": 17932 + }, + { + "epoch": 0.10665263107812352, + "grad_norm": 2.3494272232055664, + "learning_rate": 4.860992534952826e-05, + "loss": 4.81, + "step": 17933 + }, + { + "epoch": 0.10665857836140451, + "grad_norm": 2.1878998279571533, + "learning_rate": 4.86097717599706e-05, + "loss": 4.7863, + "step": 17934 + }, + { + "epoch": 0.10666452564468551, + "grad_norm": 2.123789072036743, + "learning_rate": 4.8609618162171016e-05, + "loss": 4.7846, + "step": 17935 + }, + { + "epoch": 0.10667047292796651, + "grad_norm": 2.307370662689209, + "learning_rate": 4.8609464556129555e-05, + "loss": 4.3901, + "step": 17936 + }, + { + "epoch": 0.1066764202112475, + "grad_norm": 1.8189514875411987, + "learning_rate": 4.8609310941846274e-05, + "loss": 5.2722, + "step": 17937 + }, + { + "epoch": 0.1066823674945285, + "grad_norm": 1.4699981212615967, + "learning_rate": 4.860915731932123e-05, + "loss": 5.7501, + "step": 17938 + }, + { + "epoch": 0.10668831477780949, + "grad_norm": 1.5624393224716187, + "learning_rate": 4.860900368855447e-05, + "loss": 5.6963, + "step": 17939 + }, + { + "epoch": 0.10669426206109049, + "grad_norm": 1.8463138341903687, + "learning_rate": 4.860885004954605e-05, + "loss": 5.3627, + "step": 17940 + }, + { + "epoch": 0.1067002093443715, + "grad_norm": 1.7627042531967163, + "learning_rate": 4.8608696402296025e-05, + "loss": 5.6548, + "step": 17941 + }, + { + "epoch": 0.10670615662765248, + "grad_norm": 1.631505012512207, + "learning_rate": 4.860854274680444e-05, + "loss": 5.7926, + "step": 17942 + }, + { + "epoch": 0.10671210391093348, + "grad_norm": 1.4491498470306396, + "learning_rate": 4.860838908307137e-05, + "loss": 5.5395, + "step": 17943 + }, + { + "epoch": 0.10671805119421449, + "grad_norm": 1.6210049390792847, + "learning_rate": 4.8608235411096845e-05, + "loss": 5.2768, + "step": 17944 + }, + { + "epoch": 0.10672399847749547, + "grad_norm": 1.4522534608840942, + "learning_rate": 4.860808173088094e-05, + "loss": 5.7723, + "step": 17945 + }, + { + "epoch": 0.10672994576077648, + "grad_norm": 2.0779013633728027, + "learning_rate": 4.860792804242369e-05, + "loss": 5.4679, + "step": 17946 + }, + { + "epoch": 0.10673589304405748, + "grad_norm": 2.248556137084961, + "learning_rate": 4.860777434572515e-05, + "loss": 5.5089, + "step": 17947 + }, + { + "epoch": 0.10674184032733847, + "grad_norm": 2.2192306518554688, + "learning_rate": 4.86076206407854e-05, + "loss": 5.4098, + "step": 17948 + }, + { + "epoch": 0.10674778761061947, + "grad_norm": 1.7523053884506226, + "learning_rate": 4.8607466927604455e-05, + "loss": 5.3223, + "step": 17949 + }, + { + "epoch": 0.10675373489390047, + "grad_norm": 1.8636107444763184, + "learning_rate": 4.8607313206182395e-05, + "loss": 5.339, + "step": 17950 + }, + { + "epoch": 0.10675968217718146, + "grad_norm": 1.9067093133926392, + "learning_rate": 4.860715947651926e-05, + "loss": 5.3779, + "step": 17951 + }, + { + "epoch": 0.10676562946046246, + "grad_norm": 1.850948452949524, + "learning_rate": 4.860700573861512e-05, + "loss": 5.3474, + "step": 17952 + }, + { + "epoch": 0.10677157674374346, + "grad_norm": 2.144895076751709, + "learning_rate": 4.8606851992470005e-05, + "loss": 5.3089, + "step": 17953 + }, + { + "epoch": 0.10677752402702445, + "grad_norm": 2.054420232772827, + "learning_rate": 4.860669823808399e-05, + "loss": 5.3653, + "step": 17954 + }, + { + "epoch": 0.10678347131030545, + "grad_norm": 1.94870126247406, + "learning_rate": 4.860654447545711e-05, + "loss": 5.2514, + "step": 17955 + }, + { + "epoch": 0.10678941859358645, + "grad_norm": 1.8006596565246582, + "learning_rate": 4.860639070458945e-05, + "loss": 5.2357, + "step": 17956 + }, + { + "epoch": 0.10679536587686744, + "grad_norm": 2.309035301208496, + "learning_rate": 4.860623692548103e-05, + "loss": 5.2681, + "step": 17957 + }, + { + "epoch": 0.10680131316014845, + "grad_norm": 2.402949571609497, + "learning_rate": 4.860608313813192e-05, + "loss": 5.549, + "step": 17958 + }, + { + "epoch": 0.10680726044342945, + "grad_norm": 1.724307894706726, + "learning_rate": 4.8605929342542164e-05, + "loss": 5.5283, + "step": 17959 + }, + { + "epoch": 0.10681320772671044, + "grad_norm": 1.8566054105758667, + "learning_rate": 4.860577553871183e-05, + "loss": 5.834, + "step": 17960 + }, + { + "epoch": 0.10681915500999144, + "grad_norm": 1.8882628679275513, + "learning_rate": 4.860562172664096e-05, + "loss": 5.7954, + "step": 17961 + }, + { + "epoch": 0.10682510229327244, + "grad_norm": 1.694075345993042, + "learning_rate": 4.860546790632961e-05, + "loss": 5.7573, + "step": 17962 + }, + { + "epoch": 0.10683104957655343, + "grad_norm": 1.8312102556228638, + "learning_rate": 4.860531407777783e-05, + "loss": 5.4479, + "step": 17963 + }, + { + "epoch": 0.10683699685983443, + "grad_norm": 1.6124730110168457, + "learning_rate": 4.860516024098569e-05, + "loss": 5.5356, + "step": 17964 + }, + { + "epoch": 0.10684294414311543, + "grad_norm": 2.3505187034606934, + "learning_rate": 4.8605006395953225e-05, + "loss": 5.6543, + "step": 17965 + }, + { + "epoch": 0.10684889142639642, + "grad_norm": 2.69331431388855, + "learning_rate": 4.86048525426805e-05, + "loss": 5.5359, + "step": 17966 + }, + { + "epoch": 0.10685483870967742, + "grad_norm": 2.095374822616577, + "learning_rate": 4.860469868116756e-05, + "loss": 5.5514, + "step": 17967 + }, + { + "epoch": 0.10686078599295841, + "grad_norm": 1.8596038818359375, + "learning_rate": 4.8604544811414465e-05, + "loss": 5.5171, + "step": 17968 + }, + { + "epoch": 0.10686673327623941, + "grad_norm": 2.215549945831299, + "learning_rate": 4.860439093342127e-05, + "loss": 5.3824, + "step": 17969 + }, + { + "epoch": 0.10687268055952041, + "grad_norm": 1.9737238883972168, + "learning_rate": 4.860423704718803e-05, + "loss": 5.4159, + "step": 17970 + }, + { + "epoch": 0.1068786278428014, + "grad_norm": 1.8673701286315918, + "learning_rate": 4.860408315271479e-05, + "loss": 5.421, + "step": 17971 + }, + { + "epoch": 0.1068845751260824, + "grad_norm": 1.905371069908142, + "learning_rate": 4.86039292500016e-05, + "loss": 5.4003, + "step": 17972 + }, + { + "epoch": 0.1068905224093634, + "grad_norm": 1.7888939380645752, + "learning_rate": 4.8603775339048534e-05, + "loss": 5.1581, + "step": 17973 + }, + { + "epoch": 0.1068964696926444, + "grad_norm": 1.7499796152114868, + "learning_rate": 4.8603621419855625e-05, + "loss": 5.1334, + "step": 17974 + }, + { + "epoch": 0.1069024169759254, + "grad_norm": 1.6159700155258179, + "learning_rate": 4.860346749242295e-05, + "loss": 5.1999, + "step": 17975 + }, + { + "epoch": 0.1069083642592064, + "grad_norm": 1.7355921268463135, + "learning_rate": 4.860331355675053e-05, + "loss": 5.3899, + "step": 17976 + }, + { + "epoch": 0.10691431154248739, + "grad_norm": 1.760110855102539, + "learning_rate": 4.860315961283846e-05, + "loss": 5.5386, + "step": 17977 + }, + { + "epoch": 0.10692025882576839, + "grad_norm": 1.605482816696167, + "learning_rate": 4.860300566068675e-05, + "loss": 5.5486, + "step": 17978 + }, + { + "epoch": 0.10692620610904939, + "grad_norm": 2.1792690753936768, + "learning_rate": 4.860285170029548e-05, + "loss": 4.8871, + "step": 17979 + }, + { + "epoch": 0.10693215339233038, + "grad_norm": 1.4513617753982544, + "learning_rate": 4.86026977316647e-05, + "loss": 5.1944, + "step": 17980 + }, + { + "epoch": 0.10693810067561138, + "grad_norm": 2.560112476348877, + "learning_rate": 4.860254375479446e-05, + "loss": 4.2504, + "step": 17981 + }, + { + "epoch": 0.10694404795889238, + "grad_norm": 2.035403251647949, + "learning_rate": 4.8602389769684816e-05, + "loss": 5.4479, + "step": 17982 + }, + { + "epoch": 0.10694999524217337, + "grad_norm": 1.8496562242507935, + "learning_rate": 4.8602235776335826e-05, + "loss": 5.4981, + "step": 17983 + }, + { + "epoch": 0.10695594252545437, + "grad_norm": 1.9541285037994385, + "learning_rate": 4.8602081774747536e-05, + "loss": 5.5772, + "step": 17984 + }, + { + "epoch": 0.10696188980873537, + "grad_norm": 1.674981951713562, + "learning_rate": 4.860192776492001e-05, + "loss": 5.3656, + "step": 17985 + }, + { + "epoch": 0.10696783709201636, + "grad_norm": 1.675601601600647, + "learning_rate": 4.860177374685328e-05, + "loss": 5.3382, + "step": 17986 + }, + { + "epoch": 0.10697378437529736, + "grad_norm": 1.8874675035476685, + "learning_rate": 4.860161972054743e-05, + "loss": 5.1908, + "step": 17987 + }, + { + "epoch": 0.10697973165857837, + "grad_norm": 2.267000675201416, + "learning_rate": 4.860146568600249e-05, + "loss": 5.4437, + "step": 17988 + }, + { + "epoch": 0.10698567894185936, + "grad_norm": 1.8062045574188232, + "learning_rate": 4.8601311643218526e-05, + "loss": 5.2315, + "step": 17989 + }, + { + "epoch": 0.10699162622514036, + "grad_norm": 1.9503196477890015, + "learning_rate": 4.8601157592195584e-05, + "loss": 5.3999, + "step": 17990 + }, + { + "epoch": 0.10699757350842136, + "grad_norm": 1.8589918613433838, + "learning_rate": 4.860100353293372e-05, + "loss": 5.694, + "step": 17991 + }, + { + "epoch": 0.10700352079170235, + "grad_norm": 1.69667649269104, + "learning_rate": 4.8600849465432995e-05, + "loss": 5.6146, + "step": 17992 + }, + { + "epoch": 0.10700946807498335, + "grad_norm": 1.6006754636764526, + "learning_rate": 4.8600695389693455e-05, + "loss": 5.2849, + "step": 17993 + }, + { + "epoch": 0.10701541535826435, + "grad_norm": 1.7502506971359253, + "learning_rate": 4.860054130571516e-05, + "loss": 4.9652, + "step": 17994 + }, + { + "epoch": 0.10702136264154534, + "grad_norm": 1.6936286687850952, + "learning_rate": 4.860038721349816e-05, + "loss": 5.2192, + "step": 17995 + }, + { + "epoch": 0.10702730992482634, + "grad_norm": 1.4757579565048218, + "learning_rate": 4.8600233113042496e-05, + "loss": 5.3917, + "step": 17996 + }, + { + "epoch": 0.10703325720810733, + "grad_norm": 1.4602460861206055, + "learning_rate": 4.8600079004348245e-05, + "loss": 5.5418, + "step": 17997 + }, + { + "epoch": 0.10703920449138833, + "grad_norm": 1.4150431156158447, + "learning_rate": 4.859992488741545e-05, + "loss": 5.6592, + "step": 17998 + }, + { + "epoch": 0.10704515177466933, + "grad_norm": 1.385908842086792, + "learning_rate": 4.859977076224416e-05, + "loss": 5.2818, + "step": 17999 + }, + { + "epoch": 0.10705109905795032, + "grad_norm": 1.3683747053146362, + "learning_rate": 4.8599616628834446e-05, + "loss": 5.2743, + "step": 18000 + }, + { + "epoch": 0.10705704634123132, + "grad_norm": 1.2521027326583862, + "learning_rate": 4.859946248718634e-05, + "loss": 5.1564, + "step": 18001 + }, + { + "epoch": 0.10706299362451233, + "grad_norm": 1.445575475692749, + "learning_rate": 4.8599308337299906e-05, + "loss": 5.0108, + "step": 18002 + }, + { + "epoch": 0.10706894090779331, + "grad_norm": 1.3680258989334106, + "learning_rate": 4.859915417917519e-05, + "loss": 5.2649, + "step": 18003 + }, + { + "epoch": 0.10707488819107432, + "grad_norm": 1.2142491340637207, + "learning_rate": 4.859900001281227e-05, + "loss": 5.1143, + "step": 18004 + }, + { + "epoch": 0.10708083547435532, + "grad_norm": 1.244157314300537, + "learning_rate": 4.859884583821117e-05, + "loss": 5.2321, + "step": 18005 + }, + { + "epoch": 0.1070867827576363, + "grad_norm": 1.4057670831680298, + "learning_rate": 4.859869165537196e-05, + "loss": 5.3419, + "step": 18006 + }, + { + "epoch": 0.10709273004091731, + "grad_norm": 1.3243392705917358, + "learning_rate": 4.859853746429469e-05, + "loss": 5.0217, + "step": 18007 + }, + { + "epoch": 0.10709867732419831, + "grad_norm": 1.3227713108062744, + "learning_rate": 4.8598383264979416e-05, + "loss": 5.055, + "step": 18008 + }, + { + "epoch": 0.1071046246074793, + "grad_norm": 1.3313336372375488, + "learning_rate": 4.8598229057426195e-05, + "loss": 5.1319, + "step": 18009 + }, + { + "epoch": 0.1071105718907603, + "grad_norm": 1.385715126991272, + "learning_rate": 4.8598074841635064e-05, + "loss": 4.9349, + "step": 18010 + }, + { + "epoch": 0.1071165191740413, + "grad_norm": 1.3244850635528564, + "learning_rate": 4.85979206176061e-05, + "loss": 4.9055, + "step": 18011 + }, + { + "epoch": 0.10712246645732229, + "grad_norm": 1.2922260761260986, + "learning_rate": 4.859776638533934e-05, + "loss": 5.0518, + "step": 18012 + }, + { + "epoch": 0.10712841374060329, + "grad_norm": 1.3371012210845947, + "learning_rate": 4.8597612144834845e-05, + "loss": 5.234, + "step": 18013 + }, + { + "epoch": 0.1071343610238843, + "grad_norm": 1.3367552757263184, + "learning_rate": 4.859745789609267e-05, + "loss": 4.9765, + "step": 18014 + }, + { + "epoch": 0.10714030830716528, + "grad_norm": 1.5067929029464722, + "learning_rate": 4.859730363911286e-05, + "loss": 5.235, + "step": 18015 + }, + { + "epoch": 0.10714625559044628, + "grad_norm": 1.3660157918930054, + "learning_rate": 4.859714937389548e-05, + "loss": 5.4104, + "step": 18016 + }, + { + "epoch": 0.10715220287372729, + "grad_norm": 1.3999029397964478, + "learning_rate": 4.859699510044057e-05, + "loss": 5.1603, + "step": 18017 + }, + { + "epoch": 0.10715815015700828, + "grad_norm": 1.6147737503051758, + "learning_rate": 4.8596840818748204e-05, + "loss": 5.0506, + "step": 18018 + }, + { + "epoch": 0.10716409744028928, + "grad_norm": 1.5618371963500977, + "learning_rate": 4.859668652881843e-05, + "loss": 5.1564, + "step": 18019 + }, + { + "epoch": 0.10717004472357028, + "grad_norm": 1.3786426782608032, + "learning_rate": 4.859653223065128e-05, + "loss": 5.1884, + "step": 18020 + }, + { + "epoch": 0.10717599200685127, + "grad_norm": 1.429489016532898, + "learning_rate": 4.859637792424683e-05, + "loss": 5.1556, + "step": 18021 + }, + { + "epoch": 0.10718193929013227, + "grad_norm": 1.3347980976104736, + "learning_rate": 4.859622360960513e-05, + "loss": 5.008, + "step": 18022 + }, + { + "epoch": 0.10718788657341327, + "grad_norm": 1.3850064277648926, + "learning_rate": 4.859606928672623e-05, + "loss": 5.0719, + "step": 18023 + }, + { + "epoch": 0.10719383385669426, + "grad_norm": 1.3279672861099243, + "learning_rate": 4.859591495561019e-05, + "loss": 5.0793, + "step": 18024 + }, + { + "epoch": 0.10719978113997526, + "grad_norm": 1.5108927488327026, + "learning_rate": 4.8595760616257056e-05, + "loss": 5.1067, + "step": 18025 + }, + { + "epoch": 0.10720572842325625, + "grad_norm": 1.2342565059661865, + "learning_rate": 4.859560626866689e-05, + "loss": 5.0298, + "step": 18026 + }, + { + "epoch": 0.10721167570653725, + "grad_norm": 1.2821179628372192, + "learning_rate": 4.859545191283974e-05, + "loss": 5.2185, + "step": 18027 + }, + { + "epoch": 0.10721762298981825, + "grad_norm": 1.11893630027771, + "learning_rate": 4.859529754877566e-05, + "loss": 5.1911, + "step": 18028 + }, + { + "epoch": 0.10722357027309924, + "grad_norm": 1.2202814817428589, + "learning_rate": 4.859514317647471e-05, + "loss": 5.028, + "step": 18029 + }, + { + "epoch": 0.10722951755638024, + "grad_norm": 1.3898543119430542, + "learning_rate": 4.859498879593694e-05, + "loss": 5.4019, + "step": 18030 + }, + { + "epoch": 0.10723546483966125, + "grad_norm": 1.2810478210449219, + "learning_rate": 4.859483440716239e-05, + "loss": 5.0634, + "step": 18031 + }, + { + "epoch": 0.10724141212294223, + "grad_norm": 1.4424680471420288, + "learning_rate": 4.859468001015114e-05, + "loss": 5.0058, + "step": 18032 + }, + { + "epoch": 0.10724735940622324, + "grad_norm": 1.4053739309310913, + "learning_rate": 4.859452560490323e-05, + "loss": 5.0174, + "step": 18033 + }, + { + "epoch": 0.10725330668950424, + "grad_norm": 1.2552763223648071, + "learning_rate": 4.859437119141871e-05, + "loss": 5.0222, + "step": 18034 + }, + { + "epoch": 0.10725925397278523, + "grad_norm": 1.3694052696228027, + "learning_rate": 4.859421676969764e-05, + "loss": 4.9663, + "step": 18035 + }, + { + "epoch": 0.10726520125606623, + "grad_norm": 1.3814043998718262, + "learning_rate": 4.859406233974007e-05, + "loss": 5.01, + "step": 18036 + }, + { + "epoch": 0.10727114853934723, + "grad_norm": 1.5185308456420898, + "learning_rate": 4.859390790154606e-05, + "loss": 4.9698, + "step": 18037 + }, + { + "epoch": 0.10727709582262822, + "grad_norm": 1.2509820461273193, + "learning_rate": 4.859375345511566e-05, + "loss": 5.1034, + "step": 18038 + }, + { + "epoch": 0.10728304310590922, + "grad_norm": 1.3478872776031494, + "learning_rate": 4.8593599000448926e-05, + "loss": 5.2459, + "step": 18039 + }, + { + "epoch": 0.10728899038919022, + "grad_norm": 1.3720686435699463, + "learning_rate": 4.859344453754591e-05, + "loss": 5.1671, + "step": 18040 + }, + { + "epoch": 0.10729493767247121, + "grad_norm": 1.3953602313995361, + "learning_rate": 4.859329006640666e-05, + "loss": 5.3221, + "step": 18041 + }, + { + "epoch": 0.10730088495575221, + "grad_norm": 1.4901010990142822, + "learning_rate": 4.859313558703125e-05, + "loss": 5.1694, + "step": 18042 + }, + { + "epoch": 0.10730683223903321, + "grad_norm": 1.4153228998184204, + "learning_rate": 4.859298109941971e-05, + "loss": 5.2721, + "step": 18043 + }, + { + "epoch": 0.1073127795223142, + "grad_norm": 1.34188711643219, + "learning_rate": 4.859282660357211e-05, + "loss": 5.3048, + "step": 18044 + }, + { + "epoch": 0.1073187268055952, + "grad_norm": 1.355832576751709, + "learning_rate": 4.859267209948849e-05, + "loss": 5.2908, + "step": 18045 + }, + { + "epoch": 0.1073246740888762, + "grad_norm": 1.1551882028579712, + "learning_rate": 4.859251758716891e-05, + "loss": 5.1681, + "step": 18046 + }, + { + "epoch": 0.1073306213721572, + "grad_norm": 1.1728358268737793, + "learning_rate": 4.8592363066613434e-05, + "loss": 5.1535, + "step": 18047 + }, + { + "epoch": 0.1073365686554382, + "grad_norm": 1.4180268049240112, + "learning_rate": 4.859220853782211e-05, + "loss": 4.6467, + "step": 18048 + }, + { + "epoch": 0.1073425159387192, + "grad_norm": 1.4042308330535889, + "learning_rate": 4.8592054000794984e-05, + "loss": 4.7348, + "step": 18049 + }, + { + "epoch": 0.10734846322200019, + "grad_norm": 1.2508533000946045, + "learning_rate": 4.859189945553211e-05, + "loss": 4.7797, + "step": 18050 + }, + { + "epoch": 0.10735441050528119, + "grad_norm": 1.2266274690628052, + "learning_rate": 4.859174490203355e-05, + "loss": 4.7223, + "step": 18051 + }, + { + "epoch": 0.10736035778856219, + "grad_norm": 1.3217378854751587, + "learning_rate": 4.8591590340299366e-05, + "loss": 4.82, + "step": 18052 + }, + { + "epoch": 0.10736630507184318, + "grad_norm": 1.3789056539535522, + "learning_rate": 4.8591435770329594e-05, + "loss": 5.3133, + "step": 18053 + }, + { + "epoch": 0.10737225235512418, + "grad_norm": 1.6090314388275146, + "learning_rate": 4.85912811921243e-05, + "loss": 5.2263, + "step": 18054 + }, + { + "epoch": 0.10737819963840518, + "grad_norm": 1.3780972957611084, + "learning_rate": 4.859112660568353e-05, + "loss": 5.3081, + "step": 18055 + }, + { + "epoch": 0.10738414692168617, + "grad_norm": 1.3518953323364258, + "learning_rate": 4.859097201100734e-05, + "loss": 5.3423, + "step": 18056 + }, + { + "epoch": 0.10739009420496717, + "grad_norm": 1.4160034656524658, + "learning_rate": 4.859081740809579e-05, + "loss": 5.3082, + "step": 18057 + }, + { + "epoch": 0.10739604148824816, + "grad_norm": 1.1970654726028442, + "learning_rate": 4.8590662796948924e-05, + "loss": 5.254, + "step": 18058 + }, + { + "epoch": 0.10740198877152916, + "grad_norm": 1.3175582885742188, + "learning_rate": 4.859050817756681e-05, + "loss": 5.2823, + "step": 18059 + }, + { + "epoch": 0.10740793605481017, + "grad_norm": 1.5136942863464355, + "learning_rate": 4.859035354994948e-05, + "loss": 5.2238, + "step": 18060 + }, + { + "epoch": 0.10741388333809115, + "grad_norm": 1.2552412748336792, + "learning_rate": 4.859019891409701e-05, + "loss": 5.0492, + "step": 18061 + }, + { + "epoch": 0.10741983062137216, + "grad_norm": 1.2873655557632446, + "learning_rate": 4.859004427000945e-05, + "loss": 4.9162, + "step": 18062 + }, + { + "epoch": 0.10742577790465316, + "grad_norm": 1.2441788911819458, + "learning_rate": 4.8589889617686834e-05, + "loss": 4.9769, + "step": 18063 + }, + { + "epoch": 0.10743172518793415, + "grad_norm": 1.4254180192947388, + "learning_rate": 4.8589734957129246e-05, + "loss": 4.9917, + "step": 18064 + }, + { + "epoch": 0.10743767247121515, + "grad_norm": 1.3922675848007202, + "learning_rate": 4.858958028833672e-05, + "loss": 4.9705, + "step": 18065 + }, + { + "epoch": 0.10744361975449615, + "grad_norm": 1.430801510810852, + "learning_rate": 4.858942561130932e-05, + "loss": 5.0772, + "step": 18066 + }, + { + "epoch": 0.10744956703777714, + "grad_norm": 1.3651894330978394, + "learning_rate": 4.8589270926047085e-05, + "loss": 4.8844, + "step": 18067 + }, + { + "epoch": 0.10745551432105814, + "grad_norm": 1.4133042097091675, + "learning_rate": 4.858911623255008e-05, + "loss": 4.9397, + "step": 18068 + }, + { + "epoch": 0.10746146160433914, + "grad_norm": 1.4437615871429443, + "learning_rate": 4.858896153081837e-05, + "loss": 4.9977, + "step": 18069 + }, + { + "epoch": 0.10746740888762013, + "grad_norm": 1.3420813083648682, + "learning_rate": 4.858880682085199e-05, + "loss": 4.9295, + "step": 18070 + }, + { + "epoch": 0.10747335617090113, + "grad_norm": 1.2613091468811035, + "learning_rate": 4.8588652102651e-05, + "loss": 5.3186, + "step": 18071 + }, + { + "epoch": 0.10747930345418213, + "grad_norm": 1.2117836475372314, + "learning_rate": 4.858849737621545e-05, + "loss": 5.207, + "step": 18072 + }, + { + "epoch": 0.10748525073746312, + "grad_norm": 1.3153164386749268, + "learning_rate": 4.85883426415454e-05, + "loss": 4.9786, + "step": 18073 + }, + { + "epoch": 0.10749119802074412, + "grad_norm": 1.2437881231307983, + "learning_rate": 4.858818789864091e-05, + "loss": 4.8748, + "step": 18074 + }, + { + "epoch": 0.10749714530402513, + "grad_norm": 1.2477847337722778, + "learning_rate": 4.858803314750203e-05, + "loss": 4.8874, + "step": 18075 + }, + { + "epoch": 0.10750309258730611, + "grad_norm": 1.342822790145874, + "learning_rate": 4.858787838812881e-05, + "loss": 4.8244, + "step": 18076 + }, + { + "epoch": 0.10750903987058712, + "grad_norm": 1.4947394132614136, + "learning_rate": 4.8587723620521306e-05, + "loss": 4.9091, + "step": 18077 + }, + { + "epoch": 0.10751498715386812, + "grad_norm": 1.388978362083435, + "learning_rate": 4.8587568844679566e-05, + "loss": 4.9075, + "step": 18078 + }, + { + "epoch": 0.10752093443714911, + "grad_norm": 1.5932878255844116, + "learning_rate": 4.8587414060603656e-05, + "loss": 4.8712, + "step": 18079 + }, + { + "epoch": 0.10752688172043011, + "grad_norm": 1.3746308088302612, + "learning_rate": 4.8587259268293616e-05, + "loss": 4.9187, + "step": 18080 + }, + { + "epoch": 0.10753282900371111, + "grad_norm": 1.2811295986175537, + "learning_rate": 4.858710446774951e-05, + "loss": 4.8643, + "step": 18081 + }, + { + "epoch": 0.1075387762869921, + "grad_norm": 1.4154548645019531, + "learning_rate": 4.858694965897139e-05, + "loss": 4.8802, + "step": 18082 + }, + { + "epoch": 0.1075447235702731, + "grad_norm": 1.3216148614883423, + "learning_rate": 4.8586794841959305e-05, + "loss": 5.0356, + "step": 18083 + }, + { + "epoch": 0.1075506708535541, + "grad_norm": 1.0971577167510986, + "learning_rate": 4.858664001671332e-05, + "loss": 5.2085, + "step": 18084 + }, + { + "epoch": 0.10755661813683509, + "grad_norm": 1.3257287740707397, + "learning_rate": 4.858648518323348e-05, + "loss": 5.1728, + "step": 18085 + }, + { + "epoch": 0.1075625654201161, + "grad_norm": 1.2429475784301758, + "learning_rate": 4.858633034151985e-05, + "loss": 5.1053, + "step": 18086 + }, + { + "epoch": 0.10756851270339708, + "grad_norm": 1.1196707487106323, + "learning_rate": 4.858617549157246e-05, + "loss": 5.074, + "step": 18087 + }, + { + "epoch": 0.10757445998667808, + "grad_norm": 1.1981266736984253, + "learning_rate": 4.858602063339139e-05, + "loss": 5.0093, + "step": 18088 + }, + { + "epoch": 0.10758040726995909, + "grad_norm": 1.3818682432174683, + "learning_rate": 4.858586576697668e-05, + "loss": 5.0184, + "step": 18089 + }, + { + "epoch": 0.10758635455324007, + "grad_norm": 1.303539752960205, + "learning_rate": 4.85857108923284e-05, + "loss": 5.1778, + "step": 18090 + }, + { + "epoch": 0.10759230183652108, + "grad_norm": 1.3990812301635742, + "learning_rate": 4.8585556009446576e-05, + "loss": 4.9785, + "step": 18091 + }, + { + "epoch": 0.10759824911980208, + "grad_norm": 1.2507104873657227, + "learning_rate": 4.858540111833129e-05, + "loss": 4.9024, + "step": 18092 + }, + { + "epoch": 0.10760419640308307, + "grad_norm": 1.2867792844772339, + "learning_rate": 4.858524621898257e-05, + "loss": 4.8847, + "step": 18093 + }, + { + "epoch": 0.10761014368636407, + "grad_norm": 1.1816591024398804, + "learning_rate": 4.8585091311400495e-05, + "loss": 4.9431, + "step": 18094 + }, + { + "epoch": 0.10761609096964507, + "grad_norm": 1.292284607887268, + "learning_rate": 4.85849363955851e-05, + "loss": 5.2273, + "step": 18095 + }, + { + "epoch": 0.10762203825292606, + "grad_norm": 1.3242478370666504, + "learning_rate": 4.8584781471536456e-05, + "loss": 5.093, + "step": 18096 + }, + { + "epoch": 0.10762798553620706, + "grad_norm": 1.211534857749939, + "learning_rate": 4.858462653925461e-05, + "loss": 5.0928, + "step": 18097 + }, + { + "epoch": 0.10763393281948806, + "grad_norm": 1.0469262599945068, + "learning_rate": 4.858447159873961e-05, + "loss": 5.0435, + "step": 18098 + }, + { + "epoch": 0.10763988010276905, + "grad_norm": 1.2352322340011597, + "learning_rate": 4.8584316649991514e-05, + "loss": 5.1899, + "step": 18099 + }, + { + "epoch": 0.10764582738605005, + "grad_norm": 1.2135246992111206, + "learning_rate": 4.8584161693010375e-05, + "loss": 5.1028, + "step": 18100 + }, + { + "epoch": 0.10765177466933105, + "grad_norm": 1.3525876998901367, + "learning_rate": 4.858400672779625e-05, + "loss": 5.0422, + "step": 18101 + }, + { + "epoch": 0.10765772195261204, + "grad_norm": 1.3221076726913452, + "learning_rate": 4.85838517543492e-05, + "loss": 5.1329, + "step": 18102 + }, + { + "epoch": 0.10766366923589304, + "grad_norm": 1.4856393337249756, + "learning_rate": 4.858369677266926e-05, + "loss": 4.6795, + "step": 18103 + }, + { + "epoch": 0.10766961651917405, + "grad_norm": 1.4690982103347778, + "learning_rate": 4.8583541782756495e-05, + "loss": 5.1234, + "step": 18104 + }, + { + "epoch": 0.10767556380245503, + "grad_norm": 1.2535064220428467, + "learning_rate": 4.8583386784610964e-05, + "loss": 5.1344, + "step": 18105 + }, + { + "epoch": 0.10768151108573604, + "grad_norm": 1.3537837266921997, + "learning_rate": 4.858323177823272e-05, + "loss": 5.228, + "step": 18106 + }, + { + "epoch": 0.10768745836901704, + "grad_norm": 1.2927895784378052, + "learning_rate": 4.8583076763621805e-05, + "loss": 5.2371, + "step": 18107 + }, + { + "epoch": 0.10769340565229803, + "grad_norm": 1.2356709241867065, + "learning_rate": 4.8582921740778284e-05, + "loss": 4.9056, + "step": 18108 + }, + { + "epoch": 0.10769935293557903, + "grad_norm": 1.266918420791626, + "learning_rate": 4.858276670970221e-05, + "loss": 5.2142, + "step": 18109 + }, + { + "epoch": 0.10770530021886003, + "grad_norm": 1.1703591346740723, + "learning_rate": 4.858261167039364e-05, + "loss": 5.1237, + "step": 18110 + }, + { + "epoch": 0.10771124750214102, + "grad_norm": 1.2324700355529785, + "learning_rate": 4.858245662285262e-05, + "loss": 5.1391, + "step": 18111 + }, + { + "epoch": 0.10771719478542202, + "grad_norm": 1.2764140367507935, + "learning_rate": 4.85823015670792e-05, + "loss": 5.1368, + "step": 18112 + }, + { + "epoch": 0.10772314206870302, + "grad_norm": 1.254909634590149, + "learning_rate": 4.8582146503073456e-05, + "loss": 5.002, + "step": 18113 + }, + { + "epoch": 0.10772908935198401, + "grad_norm": 1.3368279933929443, + "learning_rate": 4.858199143083542e-05, + "loss": 5.1365, + "step": 18114 + }, + { + "epoch": 0.10773503663526501, + "grad_norm": 1.3550091981887817, + "learning_rate": 4.8581836350365165e-05, + "loss": 5.1722, + "step": 18115 + }, + { + "epoch": 0.107740983918546, + "grad_norm": 1.6306661367416382, + "learning_rate": 4.858168126166272e-05, + "loss": 5.0883, + "step": 18116 + }, + { + "epoch": 0.107746931201827, + "grad_norm": 1.5143946409225464, + "learning_rate": 4.858152616472816e-05, + "loss": 5.1258, + "step": 18117 + }, + { + "epoch": 0.107752878485108, + "grad_norm": 1.6553763151168823, + "learning_rate": 4.858137105956153e-05, + "loss": 4.9596, + "step": 18118 + }, + { + "epoch": 0.107758825768389, + "grad_norm": 1.920473337173462, + "learning_rate": 4.8581215946162896e-05, + "loss": 5.2206, + "step": 18119 + }, + { + "epoch": 0.10776477305167, + "grad_norm": 1.8482425212860107, + "learning_rate": 4.85810608245323e-05, + "loss": 5.1515, + "step": 18120 + }, + { + "epoch": 0.107770720334951, + "grad_norm": 1.6005665063858032, + "learning_rate": 4.8580905694669794e-05, + "loss": 5.1383, + "step": 18121 + }, + { + "epoch": 0.10777666761823199, + "grad_norm": 1.2169783115386963, + "learning_rate": 4.858075055657544e-05, + "loss": 5.3538, + "step": 18122 + }, + { + "epoch": 0.10778261490151299, + "grad_norm": 1.3251442909240723, + "learning_rate": 4.858059541024929e-05, + "loss": 5.3116, + "step": 18123 + }, + { + "epoch": 0.10778856218479399, + "grad_norm": 1.2065789699554443, + "learning_rate": 4.858044025569139e-05, + "loss": 5.2334, + "step": 18124 + }, + { + "epoch": 0.10779450946807498, + "grad_norm": 1.5847411155700684, + "learning_rate": 4.858028509290181e-05, + "loss": 4.9114, + "step": 18125 + }, + { + "epoch": 0.10780045675135598, + "grad_norm": 1.373826503753662, + "learning_rate": 4.85801299218806e-05, + "loss": 5.0748, + "step": 18126 + }, + { + "epoch": 0.10780640403463698, + "grad_norm": 1.7349494695663452, + "learning_rate": 4.85799747426278e-05, + "loss": 5.0888, + "step": 18127 + }, + { + "epoch": 0.10781235131791797, + "grad_norm": 1.3385915756225586, + "learning_rate": 4.857981955514349e-05, + "loss": 5.1472, + "step": 18128 + }, + { + "epoch": 0.10781829860119897, + "grad_norm": 1.3666753768920898, + "learning_rate": 4.857966435942769e-05, + "loss": 5.0881, + "step": 18129 + }, + { + "epoch": 0.10782424588447997, + "grad_norm": 1.39078688621521, + "learning_rate": 4.857950915548048e-05, + "loss": 5.3867, + "step": 18130 + }, + { + "epoch": 0.10783019316776096, + "grad_norm": 1.4484905004501343, + "learning_rate": 4.857935394330192e-05, + "loss": 5.0516, + "step": 18131 + }, + { + "epoch": 0.10783614045104196, + "grad_norm": 1.526084542274475, + "learning_rate": 4.8579198722892034e-05, + "loss": 5.0424, + "step": 18132 + }, + { + "epoch": 0.10784208773432297, + "grad_norm": 1.4617003202438354, + "learning_rate": 4.8579043494250895e-05, + "loss": 5.0245, + "step": 18133 + }, + { + "epoch": 0.10784803501760395, + "grad_norm": 1.3335559368133545, + "learning_rate": 4.857888825737856e-05, + "loss": 4.9398, + "step": 18134 + }, + { + "epoch": 0.10785398230088496, + "grad_norm": 1.1473711729049683, + "learning_rate": 4.857873301227508e-05, + "loss": 5.1818, + "step": 18135 + }, + { + "epoch": 0.10785992958416596, + "grad_norm": 1.5986409187316895, + "learning_rate": 4.8578577758940504e-05, + "loss": 5.3518, + "step": 18136 + }, + { + "epoch": 0.10786587686744695, + "grad_norm": 1.6430408954620361, + "learning_rate": 4.857842249737489e-05, + "loss": 5.3052, + "step": 18137 + }, + { + "epoch": 0.10787182415072795, + "grad_norm": 1.5069605112075806, + "learning_rate": 4.8578267227578303e-05, + "loss": 5.3491, + "step": 18138 + }, + { + "epoch": 0.10787777143400895, + "grad_norm": 1.3385566473007202, + "learning_rate": 4.857811194955077e-05, + "loss": 5.3864, + "step": 18139 + }, + { + "epoch": 0.10788371871728994, + "grad_norm": 1.1956936120986938, + "learning_rate": 4.857795666329237e-05, + "loss": 5.1304, + "step": 18140 + }, + { + "epoch": 0.10788966600057094, + "grad_norm": 1.3437196016311646, + "learning_rate": 4.857780136880315e-05, + "loss": 5.1872, + "step": 18141 + }, + { + "epoch": 0.10789561328385194, + "grad_norm": 1.4649217128753662, + "learning_rate": 4.857764606608316e-05, + "loss": 5.4178, + "step": 18142 + }, + { + "epoch": 0.10790156056713293, + "grad_norm": 1.2196028232574463, + "learning_rate": 4.857749075513246e-05, + "loss": 5.1782, + "step": 18143 + }, + { + "epoch": 0.10790750785041393, + "grad_norm": 1.2016780376434326, + "learning_rate": 4.8577335435951096e-05, + "loss": 5.2293, + "step": 18144 + }, + { + "epoch": 0.10791345513369492, + "grad_norm": 1.3034183979034424, + "learning_rate": 4.857718010853914e-05, + "loss": 5.2886, + "step": 18145 + }, + { + "epoch": 0.10791940241697592, + "grad_norm": 1.1815390586853027, + "learning_rate": 4.857702477289663e-05, + "loss": 5.2637, + "step": 18146 + }, + { + "epoch": 0.10792534970025693, + "grad_norm": 1.328203558921814, + "learning_rate": 4.857686942902362e-05, + "loss": 5.3154, + "step": 18147 + }, + { + "epoch": 0.10793129698353791, + "grad_norm": 1.2995961904525757, + "learning_rate": 4.857671407692016e-05, + "loss": 5.3313, + "step": 18148 + }, + { + "epoch": 0.10793724426681892, + "grad_norm": 1.181191325187683, + "learning_rate": 4.8576558716586326e-05, + "loss": 5.2589, + "step": 18149 + }, + { + "epoch": 0.10794319155009992, + "grad_norm": 1.266570806503296, + "learning_rate": 4.8576403348022154e-05, + "loss": 5.1694, + "step": 18150 + }, + { + "epoch": 0.1079491388333809, + "grad_norm": 1.4107643365859985, + "learning_rate": 4.857624797122771e-05, + "loss": 5.1784, + "step": 18151 + }, + { + "epoch": 0.10795508611666191, + "grad_norm": 1.1809200048446655, + "learning_rate": 4.8576092586203024e-05, + "loss": 5.3081, + "step": 18152 + }, + { + "epoch": 0.10796103339994291, + "grad_norm": 1.179453730583191, + "learning_rate": 4.857593719294818e-05, + "loss": 5.2534, + "step": 18153 + }, + { + "epoch": 0.1079669806832239, + "grad_norm": 1.3677690029144287, + "learning_rate": 4.857578179146323e-05, + "loss": 5.4021, + "step": 18154 + }, + { + "epoch": 0.1079729279665049, + "grad_norm": 1.3077856302261353, + "learning_rate": 4.8575626381748196e-05, + "loss": 5.1766, + "step": 18155 + }, + { + "epoch": 0.1079788752497859, + "grad_norm": 1.075791835784912, + "learning_rate": 4.857547096380317e-05, + "loss": 5.163, + "step": 18156 + }, + { + "epoch": 0.10798482253306689, + "grad_norm": 1.2855931520462036, + "learning_rate": 4.8575315537628186e-05, + "loss": 5.157, + "step": 18157 + }, + { + "epoch": 0.10799076981634789, + "grad_norm": 1.1961009502410889, + "learning_rate": 4.8575160103223303e-05, + "loss": 5.1632, + "step": 18158 + }, + { + "epoch": 0.1079967170996289, + "grad_norm": 1.6419997215270996, + "learning_rate": 4.8575004660588574e-05, + "loss": 5.1575, + "step": 18159 + }, + { + "epoch": 0.10800266438290988, + "grad_norm": 1.5928575992584229, + "learning_rate": 4.857484920972405e-05, + "loss": 5.0818, + "step": 18160 + }, + { + "epoch": 0.10800861166619088, + "grad_norm": 1.3492580652236938, + "learning_rate": 4.85746937506298e-05, + "loss": 5.1529, + "step": 18161 + }, + { + "epoch": 0.10801455894947189, + "grad_norm": 1.543717861175537, + "learning_rate": 4.857453828330587e-05, + "loss": 5.6192, + "step": 18162 + }, + { + "epoch": 0.10802050623275287, + "grad_norm": 1.5657880306243896, + "learning_rate": 4.85743828077523e-05, + "loss": 5.6619, + "step": 18163 + }, + { + "epoch": 0.10802645351603388, + "grad_norm": 1.3861533403396606, + "learning_rate": 4.8574227323969164e-05, + "loss": 5.2147, + "step": 18164 + }, + { + "epoch": 0.10803240079931488, + "grad_norm": 1.3780323266983032, + "learning_rate": 4.85740718319565e-05, + "loss": 5.1112, + "step": 18165 + }, + { + "epoch": 0.10803834808259587, + "grad_norm": 1.5768086910247803, + "learning_rate": 4.857391633171438e-05, + "loss": 5.011, + "step": 18166 + }, + { + "epoch": 0.10804429536587687, + "grad_norm": 1.4504894018173218, + "learning_rate": 4.857376082324285e-05, + "loss": 4.9349, + "step": 18167 + }, + { + "epoch": 0.10805024264915787, + "grad_norm": 1.5084949731826782, + "learning_rate": 4.857360530654196e-05, + "loss": 4.9861, + "step": 18168 + }, + { + "epoch": 0.10805618993243886, + "grad_norm": 1.4052237272262573, + "learning_rate": 4.857344978161177e-05, + "loss": 5.0447, + "step": 18169 + }, + { + "epoch": 0.10806213721571986, + "grad_norm": 1.5666663646697998, + "learning_rate": 4.857329424845233e-05, + "loss": 5.3537, + "step": 18170 + }, + { + "epoch": 0.10806808449900086, + "grad_norm": 1.251293420791626, + "learning_rate": 4.8573138707063695e-05, + "loss": 5.0139, + "step": 18171 + }, + { + "epoch": 0.10807403178228185, + "grad_norm": 1.2570216655731201, + "learning_rate": 4.8572983157445926e-05, + "loss": 4.9959, + "step": 18172 + }, + { + "epoch": 0.10807997906556285, + "grad_norm": 1.5116729736328125, + "learning_rate": 4.857282759959907e-05, + "loss": 5.1592, + "step": 18173 + }, + { + "epoch": 0.10808592634884384, + "grad_norm": 1.518898367881775, + "learning_rate": 4.857267203352318e-05, + "loss": 5.3541, + "step": 18174 + }, + { + "epoch": 0.10809187363212484, + "grad_norm": 1.314247965812683, + "learning_rate": 4.857251645921832e-05, + "loss": 5.2249, + "step": 18175 + }, + { + "epoch": 0.10809782091540585, + "grad_norm": 1.378150224685669, + "learning_rate": 4.857236087668453e-05, + "loss": 5.0004, + "step": 18176 + }, + { + "epoch": 0.10810376819868683, + "grad_norm": 1.4453868865966797, + "learning_rate": 4.8572205285921876e-05, + "loss": 5.2717, + "step": 18177 + }, + { + "epoch": 0.10810971548196784, + "grad_norm": 1.3493587970733643, + "learning_rate": 4.857204968693041e-05, + "loss": 5.4044, + "step": 18178 + }, + { + "epoch": 0.10811566276524884, + "grad_norm": 1.3819094896316528, + "learning_rate": 4.857189407971019e-05, + "loss": 5.0641, + "step": 18179 + }, + { + "epoch": 0.10812161004852983, + "grad_norm": 1.337969422340393, + "learning_rate": 4.857173846426126e-05, + "loss": 4.9078, + "step": 18180 + }, + { + "epoch": 0.10812755733181083, + "grad_norm": 1.655778408050537, + "learning_rate": 4.857158284058367e-05, + "loss": 4.9192, + "step": 18181 + }, + { + "epoch": 0.10813350461509183, + "grad_norm": 1.3867977857589722, + "learning_rate": 4.85714272086775e-05, + "loss": 4.86, + "step": 18182 + }, + { + "epoch": 0.10813945189837282, + "grad_norm": 1.5444231033325195, + "learning_rate": 4.8571271568542786e-05, + "loss": 4.9745, + "step": 18183 + }, + { + "epoch": 0.10814539918165382, + "grad_norm": 1.470123052597046, + "learning_rate": 4.8571115920179576e-05, + "loss": 5.1311, + "step": 18184 + }, + { + "epoch": 0.10815134646493482, + "grad_norm": 1.3052124977111816, + "learning_rate": 4.8570960263587936e-05, + "loss": 5.0657, + "step": 18185 + }, + { + "epoch": 0.10815729374821581, + "grad_norm": 1.4197286367416382, + "learning_rate": 4.857080459876792e-05, + "loss": 5.0798, + "step": 18186 + }, + { + "epoch": 0.10816324103149681, + "grad_norm": 1.5119234323501587, + "learning_rate": 4.857064892571958e-05, + "loss": 5.2842, + "step": 18187 + }, + { + "epoch": 0.10816918831477781, + "grad_norm": 1.6037629842758179, + "learning_rate": 4.8570493244442974e-05, + "loss": 4.8785, + "step": 18188 + }, + { + "epoch": 0.1081751355980588, + "grad_norm": 1.6456643342971802, + "learning_rate": 4.857033755493814e-05, + "loss": 5.2566, + "step": 18189 + }, + { + "epoch": 0.1081810828813398, + "grad_norm": 1.5777020454406738, + "learning_rate": 4.8570181857205155e-05, + "loss": 4.9856, + "step": 18190 + }, + { + "epoch": 0.1081870301646208, + "grad_norm": 1.6042171716690063, + "learning_rate": 4.857002615124405e-05, + "loss": 4.9179, + "step": 18191 + }, + { + "epoch": 0.1081929774479018, + "grad_norm": 1.2339718341827393, + "learning_rate": 4.856987043705491e-05, + "loss": 4.9144, + "step": 18192 + }, + { + "epoch": 0.1081989247311828, + "grad_norm": 1.4531115293502808, + "learning_rate": 4.856971471463776e-05, + "loss": 5.0296, + "step": 18193 + }, + { + "epoch": 0.1082048720144638, + "grad_norm": 1.4179781675338745, + "learning_rate": 4.856955898399267e-05, + "loss": 5.268, + "step": 18194 + }, + { + "epoch": 0.10821081929774479, + "grad_norm": 1.5291078090667725, + "learning_rate": 4.856940324511969e-05, + "loss": 5.2433, + "step": 18195 + }, + { + "epoch": 0.10821676658102579, + "grad_norm": 1.5799169540405273, + "learning_rate": 4.856924749801888e-05, + "loss": 5.1906, + "step": 18196 + }, + { + "epoch": 0.10822271386430679, + "grad_norm": 1.4068591594696045, + "learning_rate": 4.8569091742690276e-05, + "loss": 5.2152, + "step": 18197 + }, + { + "epoch": 0.10822866114758778, + "grad_norm": 1.3728901147842407, + "learning_rate": 4.8568935979133953e-05, + "loss": 5.1717, + "step": 18198 + }, + { + "epoch": 0.10823460843086878, + "grad_norm": 1.524344563484192, + "learning_rate": 4.856878020734996e-05, + "loss": 5.0635, + "step": 18199 + }, + { + "epoch": 0.10824055571414978, + "grad_norm": 1.4725397825241089, + "learning_rate": 4.856862442733835e-05, + "loss": 5.2382, + "step": 18200 + }, + { + "epoch": 0.10824650299743077, + "grad_norm": 1.3467813730239868, + "learning_rate": 4.856846863909917e-05, + "loss": 5.0823, + "step": 18201 + }, + { + "epoch": 0.10825245028071177, + "grad_norm": 1.264833927154541, + "learning_rate": 4.856831284263249e-05, + "loss": 5.1763, + "step": 18202 + }, + { + "epoch": 0.10825839756399276, + "grad_norm": 1.2883045673370361, + "learning_rate": 4.856815703793836e-05, + "loss": 5.1207, + "step": 18203 + }, + { + "epoch": 0.10826434484727376, + "grad_norm": 1.309486746788025, + "learning_rate": 4.856800122501681e-05, + "loss": 5.0648, + "step": 18204 + }, + { + "epoch": 0.10827029213055477, + "grad_norm": 1.4473057985305786, + "learning_rate": 4.856784540386793e-05, + "loss": 4.9615, + "step": 18205 + }, + { + "epoch": 0.10827623941383575, + "grad_norm": 1.5151125192642212, + "learning_rate": 4.856768957449175e-05, + "loss": 5.2847, + "step": 18206 + }, + { + "epoch": 0.10828218669711676, + "grad_norm": 1.4859318733215332, + "learning_rate": 4.8567533736888336e-05, + "loss": 4.931, + "step": 18207 + }, + { + "epoch": 0.10828813398039776, + "grad_norm": 1.6516517400741577, + "learning_rate": 4.8567377891057745e-05, + "loss": 5.05, + "step": 18208 + }, + { + "epoch": 0.10829408126367875, + "grad_norm": 1.679347276687622, + "learning_rate": 4.8567222037000024e-05, + "loss": 5.2281, + "step": 18209 + }, + { + "epoch": 0.10830002854695975, + "grad_norm": 1.5119515657424927, + "learning_rate": 4.856706617471523e-05, + "loss": 4.9572, + "step": 18210 + }, + { + "epoch": 0.10830597583024075, + "grad_norm": 1.6819381713867188, + "learning_rate": 4.8566910304203404e-05, + "loss": 4.6228, + "step": 18211 + }, + { + "epoch": 0.10831192311352174, + "grad_norm": 1.7754294872283936, + "learning_rate": 4.856675442546462e-05, + "loss": 4.6851, + "step": 18212 + }, + { + "epoch": 0.10831787039680274, + "grad_norm": 1.455660343170166, + "learning_rate": 4.856659853849893e-05, + "loss": 5.059, + "step": 18213 + }, + { + "epoch": 0.10832381768008374, + "grad_norm": 1.358823299407959, + "learning_rate": 4.856644264330639e-05, + "loss": 5.0354, + "step": 18214 + }, + { + "epoch": 0.10832976496336473, + "grad_norm": 1.465482473373413, + "learning_rate": 4.856628673988703e-05, + "loss": 5.0441, + "step": 18215 + }, + { + "epoch": 0.10833571224664573, + "grad_norm": 1.3863260746002197, + "learning_rate": 4.8566130828240936e-05, + "loss": 5.0445, + "step": 18216 + }, + { + "epoch": 0.10834165952992673, + "grad_norm": 1.556997299194336, + "learning_rate": 4.856597490836815e-05, + "loss": 5.0629, + "step": 18217 + }, + { + "epoch": 0.10834760681320772, + "grad_norm": 1.3784066438674927, + "learning_rate": 4.856581898026872e-05, + "loss": 5.1894, + "step": 18218 + }, + { + "epoch": 0.10835355409648872, + "grad_norm": 1.4675719738006592, + "learning_rate": 4.856566304394271e-05, + "loss": 5.008, + "step": 18219 + }, + { + "epoch": 0.10835950137976973, + "grad_norm": 1.634920597076416, + "learning_rate": 4.856550709939016e-05, + "loss": 4.7707, + "step": 18220 + }, + { + "epoch": 0.10836544866305071, + "grad_norm": 1.83092200756073, + "learning_rate": 4.856535114661115e-05, + "loss": 4.8947, + "step": 18221 + }, + { + "epoch": 0.10837139594633172, + "grad_norm": 1.497359037399292, + "learning_rate": 4.856519518560571e-05, + "loss": 4.9656, + "step": 18222 + }, + { + "epoch": 0.10837734322961272, + "grad_norm": 1.3194255828857422, + "learning_rate": 4.856503921637391e-05, + "loss": 5.2374, + "step": 18223 + }, + { + "epoch": 0.1083832905128937, + "grad_norm": 1.3584619760513306, + "learning_rate": 4.8564883238915794e-05, + "loss": 5.1154, + "step": 18224 + }, + { + "epoch": 0.10838923779617471, + "grad_norm": 1.4173928499221802, + "learning_rate": 4.8564727253231416e-05, + "loss": 5.173, + "step": 18225 + }, + { + "epoch": 0.10839518507945571, + "grad_norm": 1.4110074043273926, + "learning_rate": 4.8564571259320844e-05, + "loss": 5.2409, + "step": 18226 + }, + { + "epoch": 0.1084011323627367, + "grad_norm": 1.4481827020645142, + "learning_rate": 4.856441525718412e-05, + "loss": 4.8533, + "step": 18227 + }, + { + "epoch": 0.1084070796460177, + "grad_norm": 1.4017881155014038, + "learning_rate": 4.85642592468213e-05, + "loss": 5.0483, + "step": 18228 + }, + { + "epoch": 0.1084130269292987, + "grad_norm": 1.3940458297729492, + "learning_rate": 4.8564103228232445e-05, + "loss": 5.0983, + "step": 18229 + }, + { + "epoch": 0.10841897421257969, + "grad_norm": 1.4414485692977905, + "learning_rate": 4.8563947201417604e-05, + "loss": 5.1561, + "step": 18230 + }, + { + "epoch": 0.1084249214958607, + "grad_norm": 1.3622056245803833, + "learning_rate": 4.856379116637683e-05, + "loss": 5.1773, + "step": 18231 + }, + { + "epoch": 0.10843086877914168, + "grad_norm": 1.3298035860061646, + "learning_rate": 4.856363512311019e-05, + "loss": 5.0742, + "step": 18232 + }, + { + "epoch": 0.10843681606242268, + "grad_norm": 1.3110575675964355, + "learning_rate": 4.856347907161771e-05, + "loss": 5.044, + "step": 18233 + }, + { + "epoch": 0.10844276334570369, + "grad_norm": 1.309591293334961, + "learning_rate": 4.856332301189948e-05, + "loss": 5.1313, + "step": 18234 + }, + { + "epoch": 0.10844871062898467, + "grad_norm": 1.2283830642700195, + "learning_rate": 4.856316694395552e-05, + "loss": 5.0777, + "step": 18235 + }, + { + "epoch": 0.10845465791226568, + "grad_norm": 1.1523172855377197, + "learning_rate": 4.856301086778592e-05, + "loss": 5.1245, + "step": 18236 + }, + { + "epoch": 0.10846060519554668, + "grad_norm": 1.3058217763900757, + "learning_rate": 4.85628547833907e-05, + "loss": 4.9649, + "step": 18237 + }, + { + "epoch": 0.10846655247882767, + "grad_norm": 1.239734172821045, + "learning_rate": 4.856269869076994e-05, + "loss": 5.0736, + "step": 18238 + }, + { + "epoch": 0.10847249976210867, + "grad_norm": 1.2624062299728394, + "learning_rate": 4.856254258992369e-05, + "loss": 5.0538, + "step": 18239 + }, + { + "epoch": 0.10847844704538967, + "grad_norm": 1.2172342538833618, + "learning_rate": 4.856238648085199e-05, + "loss": 5.0781, + "step": 18240 + }, + { + "epoch": 0.10848439432867066, + "grad_norm": 1.2534043788909912, + "learning_rate": 4.8562230363554906e-05, + "loss": 5.2148, + "step": 18241 + }, + { + "epoch": 0.10849034161195166, + "grad_norm": 1.3765602111816406, + "learning_rate": 4.85620742380325e-05, + "loss": 5.1274, + "step": 18242 + }, + { + "epoch": 0.10849628889523266, + "grad_norm": 1.4610897302627563, + "learning_rate": 4.856191810428481e-05, + "loss": 5.0356, + "step": 18243 + }, + { + "epoch": 0.10850223617851365, + "grad_norm": 1.4103399515151978, + "learning_rate": 4.8561761962311895e-05, + "loss": 5.0198, + "step": 18244 + }, + { + "epoch": 0.10850818346179465, + "grad_norm": 1.5159040689468384, + "learning_rate": 4.856160581211382e-05, + "loss": 5.0139, + "step": 18245 + }, + { + "epoch": 0.10851413074507565, + "grad_norm": 1.5071041584014893, + "learning_rate": 4.856144965369063e-05, + "loss": 4.9644, + "step": 18246 + }, + { + "epoch": 0.10852007802835664, + "grad_norm": 1.4504464864730835, + "learning_rate": 4.856129348704237e-05, + "loss": 5.041, + "step": 18247 + }, + { + "epoch": 0.10852602531163764, + "grad_norm": 1.2327022552490234, + "learning_rate": 4.856113731216911e-05, + "loss": 4.9775, + "step": 18248 + }, + { + "epoch": 0.10853197259491865, + "grad_norm": 2.013401508331299, + "learning_rate": 4.8560981129070914e-05, + "loss": 4.5814, + "step": 18249 + }, + { + "epoch": 0.10853791987819963, + "grad_norm": 1.7224215269088745, + "learning_rate": 4.8560824937747814e-05, + "loss": 5.3439, + "step": 18250 + }, + { + "epoch": 0.10854386716148064, + "grad_norm": 1.6198631525039673, + "learning_rate": 4.856066873819987e-05, + "loss": 5.0878, + "step": 18251 + }, + { + "epoch": 0.10854981444476164, + "grad_norm": 1.3257763385772705, + "learning_rate": 4.8560512530427146e-05, + "loss": 5.4697, + "step": 18252 + }, + { + "epoch": 0.10855576172804263, + "grad_norm": 1.6341005563735962, + "learning_rate": 4.856035631442969e-05, + "loss": 5.1383, + "step": 18253 + }, + { + "epoch": 0.10856170901132363, + "grad_norm": 1.4148058891296387, + "learning_rate": 4.8560200090207555e-05, + "loss": 5.3053, + "step": 18254 + }, + { + "epoch": 0.10856765629460463, + "grad_norm": 1.4810155630111694, + "learning_rate": 4.8560043857760796e-05, + "loss": 5.1222, + "step": 18255 + }, + { + "epoch": 0.10857360357788562, + "grad_norm": 1.4345650672912598, + "learning_rate": 4.8559887617089476e-05, + "loss": 5.2331, + "step": 18256 + }, + { + "epoch": 0.10857955086116662, + "grad_norm": 1.7319680452346802, + "learning_rate": 4.855973136819363e-05, + "loss": 4.6762, + "step": 18257 + }, + { + "epoch": 0.10858549814444762, + "grad_norm": 1.3632503747940063, + "learning_rate": 4.855957511107333e-05, + "loss": 4.8047, + "step": 18258 + }, + { + "epoch": 0.10859144542772861, + "grad_norm": 1.2798017263412476, + "learning_rate": 4.8559418845728636e-05, + "loss": 4.9368, + "step": 18259 + }, + { + "epoch": 0.10859739271100961, + "grad_norm": 1.539689540863037, + "learning_rate": 4.855926257215958e-05, + "loss": 4.8178, + "step": 18260 + }, + { + "epoch": 0.1086033399942906, + "grad_norm": 1.2351077795028687, + "learning_rate": 4.855910629036623e-05, + "loss": 5.0983, + "step": 18261 + }, + { + "epoch": 0.1086092872775716, + "grad_norm": 1.582154393196106, + "learning_rate": 4.855895000034865e-05, + "loss": 5.0563, + "step": 18262 + }, + { + "epoch": 0.1086152345608526, + "grad_norm": 1.3505899906158447, + "learning_rate": 4.855879370210688e-05, + "loss": 5.4024, + "step": 18263 + }, + { + "epoch": 0.1086211818441336, + "grad_norm": 1.236626148223877, + "learning_rate": 4.855863739564097e-05, + "loss": 5.4412, + "step": 18264 + }, + { + "epoch": 0.1086271291274146, + "grad_norm": 1.1207302808761597, + "learning_rate": 4.855848108095099e-05, + "loss": 5.3498, + "step": 18265 + }, + { + "epoch": 0.1086330764106956, + "grad_norm": 1.3238142728805542, + "learning_rate": 4.855832475803698e-05, + "loss": 4.9028, + "step": 18266 + }, + { + "epoch": 0.10863902369397659, + "grad_norm": 1.4837650060653687, + "learning_rate": 4.8558168426899006e-05, + "loss": 5.354, + "step": 18267 + }, + { + "epoch": 0.10864497097725759, + "grad_norm": 1.55657160282135, + "learning_rate": 4.8558012087537126e-05, + "loss": 5.4629, + "step": 18268 + }, + { + "epoch": 0.10865091826053859, + "grad_norm": 1.4918092489242554, + "learning_rate": 4.855785573995138e-05, + "loss": 5.046, + "step": 18269 + }, + { + "epoch": 0.10865686554381958, + "grad_norm": 1.5374544858932495, + "learning_rate": 4.855769938414183e-05, + "loss": 4.9571, + "step": 18270 + }, + { + "epoch": 0.10866281282710058, + "grad_norm": 1.360386610031128, + "learning_rate": 4.8557543020108537e-05, + "loss": 4.9482, + "step": 18271 + }, + { + "epoch": 0.10866876011038158, + "grad_norm": 1.2835793495178223, + "learning_rate": 4.855738664785154e-05, + "loss": 4.8301, + "step": 18272 + }, + { + "epoch": 0.10867470739366257, + "grad_norm": 1.453478217124939, + "learning_rate": 4.8557230267370915e-05, + "loss": 4.7873, + "step": 18273 + }, + { + "epoch": 0.10868065467694357, + "grad_norm": 1.4986752271652222, + "learning_rate": 4.855707387866669e-05, + "loss": 5.4533, + "step": 18274 + }, + { + "epoch": 0.10868660196022457, + "grad_norm": 1.574263572692871, + "learning_rate": 4.855691748173894e-05, + "loss": 5.0576, + "step": 18275 + }, + { + "epoch": 0.10869254924350556, + "grad_norm": 1.6014435291290283, + "learning_rate": 4.855676107658772e-05, + "loss": 4.8039, + "step": 18276 + }, + { + "epoch": 0.10869849652678656, + "grad_norm": 1.3822481632232666, + "learning_rate": 4.855660466321307e-05, + "loss": 4.9241, + "step": 18277 + }, + { + "epoch": 0.10870444381006757, + "grad_norm": 1.3199692964553833, + "learning_rate": 4.855644824161506e-05, + "loss": 4.842, + "step": 18278 + }, + { + "epoch": 0.10871039109334855, + "grad_norm": 1.340505599975586, + "learning_rate": 4.855629181179373e-05, + "loss": 4.8217, + "step": 18279 + }, + { + "epoch": 0.10871633837662956, + "grad_norm": 1.32645845413208, + "learning_rate": 4.8556135373749144e-05, + "loss": 4.9701, + "step": 18280 + }, + { + "epoch": 0.10872228565991056, + "grad_norm": 1.3629400730133057, + "learning_rate": 4.855597892748135e-05, + "loss": 5.2129, + "step": 18281 + }, + { + "epoch": 0.10872823294319155, + "grad_norm": 1.504604458808899, + "learning_rate": 4.8555822472990415e-05, + "loss": 4.988, + "step": 18282 + }, + { + "epoch": 0.10873418022647255, + "grad_norm": 1.514352560043335, + "learning_rate": 4.855566601027638e-05, + "loss": 4.8909, + "step": 18283 + }, + { + "epoch": 0.10874012750975355, + "grad_norm": 1.35514235496521, + "learning_rate": 4.85555095393393e-05, + "loss": 4.9441, + "step": 18284 + }, + { + "epoch": 0.10874607479303454, + "grad_norm": 1.1690728664398193, + "learning_rate": 4.8555353060179256e-05, + "loss": 5.3733, + "step": 18285 + }, + { + "epoch": 0.10875202207631554, + "grad_norm": 1.3280658721923828, + "learning_rate": 4.855519657279626e-05, + "loss": 5.4406, + "step": 18286 + }, + { + "epoch": 0.10875796935959654, + "grad_norm": 1.5852582454681396, + "learning_rate": 4.85550400771904e-05, + "loss": 5.176, + "step": 18287 + }, + { + "epoch": 0.10876391664287753, + "grad_norm": 1.233869194984436, + "learning_rate": 4.855488357336172e-05, + "loss": 5.2879, + "step": 18288 + }, + { + "epoch": 0.10876986392615853, + "grad_norm": 1.365251064300537, + "learning_rate": 4.855472706131027e-05, + "loss": 5.1592, + "step": 18289 + }, + { + "epoch": 0.10877581120943952, + "grad_norm": 1.6119641065597534, + "learning_rate": 4.8554570541036104e-05, + "loss": 5.0079, + "step": 18290 + }, + { + "epoch": 0.10878175849272052, + "grad_norm": 1.3233095407485962, + "learning_rate": 4.855441401253928e-05, + "loss": 5.3579, + "step": 18291 + }, + { + "epoch": 0.10878770577600153, + "grad_norm": 1.3345812559127808, + "learning_rate": 4.855425747581986e-05, + "loss": 5.1435, + "step": 18292 + }, + { + "epoch": 0.10879365305928251, + "grad_norm": 1.6694916486740112, + "learning_rate": 4.855410093087789e-05, + "loss": 5.0007, + "step": 18293 + }, + { + "epoch": 0.10879960034256352, + "grad_norm": 1.5835634469985962, + "learning_rate": 4.855394437771342e-05, + "loss": 4.9706, + "step": 18294 + }, + { + "epoch": 0.10880554762584452, + "grad_norm": 1.5465360879898071, + "learning_rate": 4.8553787816326526e-05, + "loss": 4.8983, + "step": 18295 + }, + { + "epoch": 0.1088114949091255, + "grad_norm": 1.4393326044082642, + "learning_rate": 4.855363124671723e-05, + "loss": 4.9365, + "step": 18296 + }, + { + "epoch": 0.10881744219240651, + "grad_norm": 1.5096935033798218, + "learning_rate": 4.8553474668885626e-05, + "loss": 4.8343, + "step": 18297 + }, + { + "epoch": 0.10882338947568751, + "grad_norm": 1.422397255897522, + "learning_rate": 4.8553318082831735e-05, + "loss": 4.9229, + "step": 18298 + }, + { + "epoch": 0.1088293367589685, + "grad_norm": 1.6444910764694214, + "learning_rate": 4.855316148855562e-05, + "loss": 5.0403, + "step": 18299 + }, + { + "epoch": 0.1088352840422495, + "grad_norm": 1.3621931076049805, + "learning_rate": 4.855300488605734e-05, + "loss": 4.9027, + "step": 18300 + }, + { + "epoch": 0.1088412313255305, + "grad_norm": 1.5086915493011475, + "learning_rate": 4.855284827533696e-05, + "loss": 4.95, + "step": 18301 + }, + { + "epoch": 0.10884717860881149, + "grad_norm": 1.7021756172180176, + "learning_rate": 4.855269165639451e-05, + "loss": 4.8245, + "step": 18302 + }, + { + "epoch": 0.10885312589209249, + "grad_norm": 1.6745699644088745, + "learning_rate": 4.855253502923007e-05, + "loss": 4.7832, + "step": 18303 + }, + { + "epoch": 0.1088590731753735, + "grad_norm": 1.2379045486450195, + "learning_rate": 4.8552378393843676e-05, + "loss": 5.0438, + "step": 18304 + }, + { + "epoch": 0.10886502045865448, + "grad_norm": 1.3999474048614502, + "learning_rate": 4.85522217502354e-05, + "loss": 5.0123, + "step": 18305 + }, + { + "epoch": 0.10887096774193548, + "grad_norm": 1.3539077043533325, + "learning_rate": 4.8552065098405276e-05, + "loss": 5.0722, + "step": 18306 + }, + { + "epoch": 0.10887691502521649, + "grad_norm": 1.3992128372192383, + "learning_rate": 4.8551908438353374e-05, + "loss": 4.9449, + "step": 18307 + }, + { + "epoch": 0.10888286230849747, + "grad_norm": 1.617443323135376, + "learning_rate": 4.8551751770079744e-05, + "loss": 5.1081, + "step": 18308 + }, + { + "epoch": 0.10888880959177848, + "grad_norm": 1.6027116775512695, + "learning_rate": 4.8551595093584446e-05, + "loss": 5.06, + "step": 18309 + }, + { + "epoch": 0.10889475687505948, + "grad_norm": 1.1488780975341797, + "learning_rate": 4.855143840886752e-05, + "loss": 5.1771, + "step": 18310 + }, + { + "epoch": 0.10890070415834047, + "grad_norm": 1.5683537721633911, + "learning_rate": 4.855128171592903e-05, + "loss": 5.1402, + "step": 18311 + }, + { + "epoch": 0.10890665144162147, + "grad_norm": 1.2840538024902344, + "learning_rate": 4.855112501476904e-05, + "loss": 5.2887, + "step": 18312 + }, + { + "epoch": 0.10891259872490247, + "grad_norm": 1.2311303615570068, + "learning_rate": 4.855096830538759e-05, + "loss": 5.2057, + "step": 18313 + }, + { + "epoch": 0.10891854600818346, + "grad_norm": 1.3655261993408203, + "learning_rate": 4.855081158778474e-05, + "loss": 5.3298, + "step": 18314 + }, + { + "epoch": 0.10892449329146446, + "grad_norm": 1.3405102491378784, + "learning_rate": 4.855065486196055e-05, + "loss": 5.3249, + "step": 18315 + }, + { + "epoch": 0.10893044057474546, + "grad_norm": 1.3816508054733276, + "learning_rate": 4.855049812791506e-05, + "loss": 5.2829, + "step": 18316 + }, + { + "epoch": 0.10893638785802645, + "grad_norm": 1.1929587125778198, + "learning_rate": 4.855034138564835e-05, + "loss": 5.5317, + "step": 18317 + }, + { + "epoch": 0.10894233514130745, + "grad_norm": 1.2426830530166626, + "learning_rate": 4.855018463516045e-05, + "loss": 5.263, + "step": 18318 + }, + { + "epoch": 0.10894828242458844, + "grad_norm": 1.3385604619979858, + "learning_rate": 4.855002787645141e-05, + "loss": 5.2531, + "step": 18319 + }, + { + "epoch": 0.10895422970786944, + "grad_norm": 1.2306677103042603, + "learning_rate": 4.8549871109521314e-05, + "loss": 5.245, + "step": 18320 + }, + { + "epoch": 0.10896017699115045, + "grad_norm": 1.3108047246932983, + "learning_rate": 4.85497143343702e-05, + "loss": 5.3063, + "step": 18321 + }, + { + "epoch": 0.10896612427443143, + "grad_norm": 1.3951044082641602, + "learning_rate": 4.8549557550998126e-05, + "loss": 5.4842, + "step": 18322 + }, + { + "epoch": 0.10897207155771244, + "grad_norm": 1.4618322849273682, + "learning_rate": 4.854940075940514e-05, + "loss": 5.5703, + "step": 18323 + }, + { + "epoch": 0.10897801884099344, + "grad_norm": 1.3512097597122192, + "learning_rate": 4.8549243959591304e-05, + "loss": 5.2615, + "step": 18324 + }, + { + "epoch": 0.10898396612427443, + "grad_norm": 1.261428713798523, + "learning_rate": 4.8549087151556675e-05, + "loss": 5.2617, + "step": 18325 + }, + { + "epoch": 0.10898991340755543, + "grad_norm": 1.5647974014282227, + "learning_rate": 4.854893033530129e-05, + "loss": 5.0529, + "step": 18326 + }, + { + "epoch": 0.10899586069083643, + "grad_norm": 1.3635188341140747, + "learning_rate": 4.8548773510825226e-05, + "loss": 5.1029, + "step": 18327 + }, + { + "epoch": 0.10900180797411742, + "grad_norm": 1.2746639251708984, + "learning_rate": 4.854861667812852e-05, + "loss": 5.1788, + "step": 18328 + }, + { + "epoch": 0.10900775525739842, + "grad_norm": 1.3292982578277588, + "learning_rate": 4.854845983721125e-05, + "loss": 5.2442, + "step": 18329 + }, + { + "epoch": 0.10901370254067942, + "grad_norm": 1.3015047311782837, + "learning_rate": 4.854830298807345e-05, + "loss": 5.2234, + "step": 18330 + }, + { + "epoch": 0.10901964982396041, + "grad_norm": 1.2642244100570679, + "learning_rate": 4.854814613071518e-05, + "loss": 5.1501, + "step": 18331 + }, + { + "epoch": 0.10902559710724141, + "grad_norm": 1.191630482673645, + "learning_rate": 4.8547989265136484e-05, + "loss": 5.1618, + "step": 18332 + }, + { + "epoch": 0.10903154439052241, + "grad_norm": 1.4171391725540161, + "learning_rate": 4.8547832391337445e-05, + "loss": 5.1431, + "step": 18333 + }, + { + "epoch": 0.1090374916738034, + "grad_norm": 1.3901907205581665, + "learning_rate": 4.854767550931809e-05, + "loss": 5.1464, + "step": 18334 + }, + { + "epoch": 0.1090434389570844, + "grad_norm": 1.5166548490524292, + "learning_rate": 4.854751861907849e-05, + "loss": 5.0841, + "step": 18335 + }, + { + "epoch": 0.1090493862403654, + "grad_norm": 1.3555935621261597, + "learning_rate": 4.854736172061869e-05, + "loss": 5.2947, + "step": 18336 + }, + { + "epoch": 0.1090553335236464, + "grad_norm": 1.1348215341567993, + "learning_rate": 4.854720481393875e-05, + "loss": 5.2813, + "step": 18337 + }, + { + "epoch": 0.1090612808069274, + "grad_norm": 1.3353219032287598, + "learning_rate": 4.8547047899038734e-05, + "loss": 5.2473, + "step": 18338 + }, + { + "epoch": 0.1090672280902084, + "grad_norm": 1.550512671470642, + "learning_rate": 4.854689097591868e-05, + "loss": 5.1364, + "step": 18339 + }, + { + "epoch": 0.10907317537348939, + "grad_norm": 1.5353589057922363, + "learning_rate": 4.8546734044578646e-05, + "loss": 5.0105, + "step": 18340 + }, + { + "epoch": 0.10907912265677039, + "grad_norm": 1.4025498628616333, + "learning_rate": 4.85465771050187e-05, + "loss": 5.0779, + "step": 18341 + }, + { + "epoch": 0.10908506994005139, + "grad_norm": 1.220438838005066, + "learning_rate": 4.8546420157238874e-05, + "loss": 5.0732, + "step": 18342 + }, + { + "epoch": 0.10909101722333238, + "grad_norm": 1.4058369398117065, + "learning_rate": 4.8546263201239245e-05, + "loss": 5.0838, + "step": 18343 + }, + { + "epoch": 0.10909696450661338, + "grad_norm": 1.4438905715942383, + "learning_rate": 4.854610623701986e-05, + "loss": 5.0449, + "step": 18344 + }, + { + "epoch": 0.10910291178989438, + "grad_norm": 1.536890983581543, + "learning_rate": 4.854594926458076e-05, + "loss": 4.9601, + "step": 18345 + }, + { + "epoch": 0.10910885907317537, + "grad_norm": 1.3566638231277466, + "learning_rate": 4.8545792283922025e-05, + "loss": 4.9283, + "step": 18346 + }, + { + "epoch": 0.10911480635645637, + "grad_norm": 1.3086943626403809, + "learning_rate": 4.8545635295043694e-05, + "loss": 5.0638, + "step": 18347 + }, + { + "epoch": 0.10912075363973736, + "grad_norm": 1.330124020576477, + "learning_rate": 4.854547829794582e-05, + "loss": 5.0944, + "step": 18348 + }, + { + "epoch": 0.10912670092301836, + "grad_norm": 1.4076783657073975, + "learning_rate": 4.854532129262848e-05, + "loss": 4.9725, + "step": 18349 + }, + { + "epoch": 0.10913264820629937, + "grad_norm": 1.380814552307129, + "learning_rate": 4.854516427909169e-05, + "loss": 5.0551, + "step": 18350 + }, + { + "epoch": 0.10913859548958035, + "grad_norm": 1.4243587255477905, + "learning_rate": 4.854500725733554e-05, + "loss": 5.103, + "step": 18351 + }, + { + "epoch": 0.10914454277286136, + "grad_norm": 1.438328742980957, + "learning_rate": 4.854485022736006e-05, + "loss": 5.1153, + "step": 18352 + }, + { + "epoch": 0.10915049005614236, + "grad_norm": 1.4602978229522705, + "learning_rate": 4.8544693189165324e-05, + "loss": 4.8916, + "step": 18353 + }, + { + "epoch": 0.10915643733942335, + "grad_norm": 1.548378586769104, + "learning_rate": 4.8544536142751385e-05, + "loss": 5.0205, + "step": 18354 + }, + { + "epoch": 0.10916238462270435, + "grad_norm": 1.33285653591156, + "learning_rate": 4.854437908811828e-05, + "loss": 4.9558, + "step": 18355 + }, + { + "epoch": 0.10916833190598535, + "grad_norm": 1.442918300628662, + "learning_rate": 4.854422202526609e-05, + "loss": 4.9119, + "step": 18356 + }, + { + "epoch": 0.10917427918926634, + "grad_norm": 1.498830795288086, + "learning_rate": 4.8544064954194836e-05, + "loss": 4.9787, + "step": 18357 + }, + { + "epoch": 0.10918022647254734, + "grad_norm": 1.422012209892273, + "learning_rate": 4.85439078749046e-05, + "loss": 5.0013, + "step": 18358 + }, + { + "epoch": 0.10918617375582834, + "grad_norm": 1.4635952711105347, + "learning_rate": 4.854375078739543e-05, + "loss": 4.8389, + "step": 18359 + }, + { + "epoch": 0.10919212103910933, + "grad_norm": 1.3973792791366577, + "learning_rate": 4.854359369166738e-05, + "loss": 4.9503, + "step": 18360 + }, + { + "epoch": 0.10919806832239033, + "grad_norm": 1.4016454219818115, + "learning_rate": 4.8543436587720504e-05, + "loss": 4.8533, + "step": 18361 + }, + { + "epoch": 0.10920401560567133, + "grad_norm": 1.215690016746521, + "learning_rate": 4.854327947555486e-05, + "loss": 5.0961, + "step": 18362 + }, + { + "epoch": 0.10920996288895232, + "grad_norm": 1.1589696407318115, + "learning_rate": 4.85431223551705e-05, + "loss": 4.8991, + "step": 18363 + }, + { + "epoch": 0.10921591017223332, + "grad_norm": 1.2894245386123657, + "learning_rate": 4.854296522656748e-05, + "loss": 5.0622, + "step": 18364 + }, + { + "epoch": 0.10922185745551433, + "grad_norm": 1.3525546789169312, + "learning_rate": 4.854280808974585e-05, + "loss": 5.1679, + "step": 18365 + }, + { + "epoch": 0.10922780473879531, + "grad_norm": 1.2055712938308716, + "learning_rate": 4.854265094470567e-05, + "loss": 5.2706, + "step": 18366 + }, + { + "epoch": 0.10923375202207632, + "grad_norm": 1.3646256923675537, + "learning_rate": 4.8542493791447e-05, + "loss": 5.2381, + "step": 18367 + }, + { + "epoch": 0.10923969930535732, + "grad_norm": 1.535840630531311, + "learning_rate": 4.8542336629969875e-05, + "loss": 5.0133, + "step": 18368 + }, + { + "epoch": 0.1092456465886383, + "grad_norm": 1.3226375579833984, + "learning_rate": 4.854217946027437e-05, + "loss": 4.9518, + "step": 18369 + }, + { + "epoch": 0.10925159387191931, + "grad_norm": 1.4403883218765259, + "learning_rate": 4.854202228236054e-05, + "loss": 5.1958, + "step": 18370 + }, + { + "epoch": 0.10925754115520031, + "grad_norm": 1.3661396503448486, + "learning_rate": 4.8541865096228426e-05, + "loss": 5.297, + "step": 18371 + }, + { + "epoch": 0.1092634884384813, + "grad_norm": 1.1291767358779907, + "learning_rate": 4.8541707901878096e-05, + "loss": 5.0954, + "step": 18372 + }, + { + "epoch": 0.1092694357217623, + "grad_norm": 1.414288878440857, + "learning_rate": 4.854155069930959e-05, + "loss": 5.0499, + "step": 18373 + }, + { + "epoch": 0.1092753830050433, + "grad_norm": 1.405760407447815, + "learning_rate": 4.8541393488522976e-05, + "loss": 5.004, + "step": 18374 + }, + { + "epoch": 0.10928133028832429, + "grad_norm": 1.2152272462844849, + "learning_rate": 4.854123626951831e-05, + "loss": 4.9798, + "step": 18375 + }, + { + "epoch": 0.10928727757160529, + "grad_norm": 1.3401811122894287, + "learning_rate": 4.854107904229564e-05, + "loss": 5.1179, + "step": 18376 + }, + { + "epoch": 0.10929322485488628, + "grad_norm": 1.036811113357544, + "learning_rate": 4.854092180685502e-05, + "loss": 5.129, + "step": 18377 + }, + { + "epoch": 0.10929917213816728, + "grad_norm": 1.380259394645691, + "learning_rate": 4.8540764563196506e-05, + "loss": 5.163, + "step": 18378 + }, + { + "epoch": 0.10930511942144829, + "grad_norm": 1.3078418970108032, + "learning_rate": 4.8540607311320156e-05, + "loss": 4.9882, + "step": 18379 + }, + { + "epoch": 0.10931106670472927, + "grad_norm": 1.2273530960083008, + "learning_rate": 4.854045005122603e-05, + "loss": 5.0736, + "step": 18380 + }, + { + "epoch": 0.10931701398801028, + "grad_norm": 1.1997276544570923, + "learning_rate": 4.8540292782914164e-05, + "loss": 4.9193, + "step": 18381 + }, + { + "epoch": 0.10932296127129128, + "grad_norm": 1.2119728326797485, + "learning_rate": 4.854013550638463e-05, + "loss": 4.9752, + "step": 18382 + }, + { + "epoch": 0.10932890855457227, + "grad_norm": 1.1508461236953735, + "learning_rate": 4.853997822163748e-05, + "loss": 4.8432, + "step": 18383 + }, + { + "epoch": 0.10933485583785327, + "grad_norm": 1.2142893075942993, + "learning_rate": 4.853982092867276e-05, + "loss": 5.0771, + "step": 18384 + }, + { + "epoch": 0.10934080312113427, + "grad_norm": 1.1016231775283813, + "learning_rate": 4.8539663627490536e-05, + "loss": 5.0918, + "step": 18385 + }, + { + "epoch": 0.10934675040441526, + "grad_norm": 1.2202482223510742, + "learning_rate": 4.8539506318090865e-05, + "loss": 5.1181, + "step": 18386 + }, + { + "epoch": 0.10935269768769626, + "grad_norm": 1.3560340404510498, + "learning_rate": 4.853934900047379e-05, + "loss": 5.1007, + "step": 18387 + }, + { + "epoch": 0.10935864497097726, + "grad_norm": 1.350473165512085, + "learning_rate": 4.8539191674639374e-05, + "loss": 5.1084, + "step": 18388 + }, + { + "epoch": 0.10936459225425825, + "grad_norm": 1.5102394819259644, + "learning_rate": 4.853903434058766e-05, + "loss": 5.0825, + "step": 18389 + }, + { + "epoch": 0.10937053953753925, + "grad_norm": 1.3704886436462402, + "learning_rate": 4.853887699831872e-05, + "loss": 5.1083, + "step": 18390 + }, + { + "epoch": 0.10937648682082025, + "grad_norm": 1.315167784690857, + "learning_rate": 4.8538719647832606e-05, + "loss": 4.9786, + "step": 18391 + }, + { + "epoch": 0.10938243410410124, + "grad_norm": 1.5208832025527954, + "learning_rate": 4.8538562289129356e-05, + "loss": 4.9011, + "step": 18392 + }, + { + "epoch": 0.10938838138738224, + "grad_norm": 1.3259782791137695, + "learning_rate": 4.8538404922209046e-05, + "loss": 4.9368, + "step": 18393 + }, + { + "epoch": 0.10939432867066325, + "grad_norm": 1.3342556953430176, + "learning_rate": 4.853824754707172e-05, + "loss": 4.9858, + "step": 18394 + }, + { + "epoch": 0.10940027595394423, + "grad_norm": 1.2291737794876099, + "learning_rate": 4.853809016371743e-05, + "loss": 5.0289, + "step": 18395 + }, + { + "epoch": 0.10940622323722524, + "grad_norm": 1.1539384126663208, + "learning_rate": 4.8537932772146245e-05, + "loss": 4.9444, + "step": 18396 + }, + { + "epoch": 0.10941217052050624, + "grad_norm": 1.2171412706375122, + "learning_rate": 4.8537775372358204e-05, + "loss": 4.9818, + "step": 18397 + }, + { + "epoch": 0.10941811780378723, + "grad_norm": 1.2133311033248901, + "learning_rate": 4.8537617964353374e-05, + "loss": 5.2647, + "step": 18398 + }, + { + "epoch": 0.10942406508706823, + "grad_norm": 1.2499877214431763, + "learning_rate": 4.8537460548131796e-05, + "loss": 5.4893, + "step": 18399 + }, + { + "epoch": 0.10943001237034923, + "grad_norm": 1.2127736806869507, + "learning_rate": 4.8537303123693545e-05, + "loss": 5.3607, + "step": 18400 + }, + { + "epoch": 0.10943595965363022, + "grad_norm": 1.3051133155822754, + "learning_rate": 4.853714569103865e-05, + "loss": 5.4531, + "step": 18401 + }, + { + "epoch": 0.10944190693691122, + "grad_norm": 1.3183389902114868, + "learning_rate": 4.85369882501672e-05, + "loss": 5.1784, + "step": 18402 + }, + { + "epoch": 0.10944785422019222, + "grad_norm": 1.5276503562927246, + "learning_rate": 4.853683080107922e-05, + "loss": 4.9092, + "step": 18403 + }, + { + "epoch": 0.10945380150347321, + "grad_norm": 1.519415259361267, + "learning_rate": 4.853667334377478e-05, + "loss": 4.7973, + "step": 18404 + }, + { + "epoch": 0.10945974878675421, + "grad_norm": 1.4063026905059814, + "learning_rate": 4.853651587825392e-05, + "loss": 4.7771, + "step": 18405 + }, + { + "epoch": 0.1094656960700352, + "grad_norm": 1.2753932476043701, + "learning_rate": 4.8536358404516715e-05, + "loss": 4.7902, + "step": 18406 + }, + { + "epoch": 0.1094716433533162, + "grad_norm": 1.5203404426574707, + "learning_rate": 4.8536200922563205e-05, + "loss": 4.961, + "step": 18407 + }, + { + "epoch": 0.1094775906365972, + "grad_norm": 1.4700336456298828, + "learning_rate": 4.8536043432393455e-05, + "loss": 5.0276, + "step": 18408 + }, + { + "epoch": 0.1094835379198782, + "grad_norm": 1.3945552110671997, + "learning_rate": 4.8535885934007506e-05, + "loss": 4.9641, + "step": 18409 + }, + { + "epoch": 0.1094894852031592, + "grad_norm": 1.1885923147201538, + "learning_rate": 4.853572842740544e-05, + "loss": 4.9162, + "step": 18410 + }, + { + "epoch": 0.1094954324864402, + "grad_norm": 1.414090871810913, + "learning_rate": 4.853557091258728e-05, + "loss": 4.9317, + "step": 18411 + }, + { + "epoch": 0.10950137976972119, + "grad_norm": 1.4395371675491333, + "learning_rate": 4.85354133895531e-05, + "loss": 4.7658, + "step": 18412 + }, + { + "epoch": 0.10950732705300219, + "grad_norm": 1.351665735244751, + "learning_rate": 4.8535255858302944e-05, + "loss": 4.9385, + "step": 18413 + }, + { + "epoch": 0.10951327433628319, + "grad_norm": 1.5085922479629517, + "learning_rate": 4.853509831883688e-05, + "loss": 5.0192, + "step": 18414 + }, + { + "epoch": 0.10951922161956418, + "grad_norm": 1.3413939476013184, + "learning_rate": 4.8534940771154954e-05, + "loss": 4.9193, + "step": 18415 + }, + { + "epoch": 0.10952516890284518, + "grad_norm": 1.532934546470642, + "learning_rate": 4.853478321525723e-05, + "loss": 4.9137, + "step": 18416 + }, + { + "epoch": 0.10953111618612618, + "grad_norm": 1.388016700744629, + "learning_rate": 4.8534625651143754e-05, + "loss": 4.9381, + "step": 18417 + }, + { + "epoch": 0.10953706346940717, + "grad_norm": 1.551255702972412, + "learning_rate": 4.853446807881458e-05, + "loss": 5.0973, + "step": 18418 + }, + { + "epoch": 0.10954301075268817, + "grad_norm": 1.4487138986587524, + "learning_rate": 4.853431049826976e-05, + "loss": 5.1313, + "step": 18419 + }, + { + "epoch": 0.10954895803596917, + "grad_norm": 1.467703104019165, + "learning_rate": 4.853415290950936e-05, + "loss": 5.0381, + "step": 18420 + }, + { + "epoch": 0.10955490531925016, + "grad_norm": 1.4529845714569092, + "learning_rate": 4.853399531253343e-05, + "loss": 4.9945, + "step": 18421 + }, + { + "epoch": 0.10956085260253116, + "grad_norm": 1.230872631072998, + "learning_rate": 4.8533837707342036e-05, + "loss": 5.0579, + "step": 18422 + }, + { + "epoch": 0.10956679988581217, + "grad_norm": 1.3668066263198853, + "learning_rate": 4.8533680093935206e-05, + "loss": 5.2567, + "step": 18423 + }, + { + "epoch": 0.10957274716909315, + "grad_norm": 1.3560447692871094, + "learning_rate": 4.853352247231302e-05, + "loss": 5.0152, + "step": 18424 + }, + { + "epoch": 0.10957869445237416, + "grad_norm": 1.4296886920928955, + "learning_rate": 4.8533364842475524e-05, + "loss": 5.1132, + "step": 18425 + }, + { + "epoch": 0.10958464173565516, + "grad_norm": 1.4232845306396484, + "learning_rate": 4.853320720442277e-05, + "loss": 5.0427, + "step": 18426 + }, + { + "epoch": 0.10959058901893615, + "grad_norm": 1.4019423723220825, + "learning_rate": 4.8533049558154826e-05, + "loss": 5.2369, + "step": 18427 + }, + { + "epoch": 0.10959653630221715, + "grad_norm": 1.5423427820205688, + "learning_rate": 4.853289190367173e-05, + "loss": 5.1053, + "step": 18428 + }, + { + "epoch": 0.10960248358549815, + "grad_norm": 1.5049951076507568, + "learning_rate": 4.8532734240973545e-05, + "loss": 5.3784, + "step": 18429 + }, + { + "epoch": 0.10960843086877914, + "grad_norm": 1.678328037261963, + "learning_rate": 4.853257657006033e-05, + "loss": 5.3021, + "step": 18430 + }, + { + "epoch": 0.10961437815206014, + "grad_norm": 1.5986173152923584, + "learning_rate": 4.853241889093213e-05, + "loss": 5.1686, + "step": 18431 + }, + { + "epoch": 0.10962032543534114, + "grad_norm": 1.5304551124572754, + "learning_rate": 4.853226120358901e-05, + "loss": 5.2319, + "step": 18432 + }, + { + "epoch": 0.10962627271862213, + "grad_norm": 1.609595775604248, + "learning_rate": 4.853210350803102e-05, + "loss": 5.0256, + "step": 18433 + }, + { + "epoch": 0.10963222000190313, + "grad_norm": 1.3506170511245728, + "learning_rate": 4.853194580425821e-05, + "loss": 5.0792, + "step": 18434 + }, + { + "epoch": 0.10963816728518412, + "grad_norm": 1.2946768999099731, + "learning_rate": 4.853178809227065e-05, + "loss": 5.0155, + "step": 18435 + }, + { + "epoch": 0.10964411456846512, + "grad_norm": 1.5691487789154053, + "learning_rate": 4.853163037206838e-05, + "loss": 5.1302, + "step": 18436 + }, + { + "epoch": 0.10965006185174613, + "grad_norm": 1.6740599870681763, + "learning_rate": 4.853147264365146e-05, + "loss": 5.2371, + "step": 18437 + }, + { + "epoch": 0.10965600913502711, + "grad_norm": 1.4822674989700317, + "learning_rate": 4.853131490701995e-05, + "loss": 5.0194, + "step": 18438 + }, + { + "epoch": 0.10966195641830812, + "grad_norm": 1.385177493095398, + "learning_rate": 4.853115716217389e-05, + "loss": 4.9444, + "step": 18439 + }, + { + "epoch": 0.10966790370158912, + "grad_norm": 1.3696002960205078, + "learning_rate": 4.853099940911337e-05, + "loss": 5.0557, + "step": 18440 + }, + { + "epoch": 0.1096738509848701, + "grad_norm": 1.6609543561935425, + "learning_rate": 4.8530841647838396e-05, + "loss": 4.9032, + "step": 18441 + }, + { + "epoch": 0.10967979826815111, + "grad_norm": 1.5938438177108765, + "learning_rate": 4.8530683878349056e-05, + "loss": 4.8639, + "step": 18442 + }, + { + "epoch": 0.10968574555143211, + "grad_norm": 1.4565002918243408, + "learning_rate": 4.85305261006454e-05, + "loss": 5.0483, + "step": 18443 + }, + { + "epoch": 0.1096916928347131, + "grad_norm": 1.5930250883102417, + "learning_rate": 4.853036831472749e-05, + "loss": 5.0751, + "step": 18444 + }, + { + "epoch": 0.1096976401179941, + "grad_norm": 1.5648735761642456, + "learning_rate": 4.853021052059536e-05, + "loss": 5.0991, + "step": 18445 + }, + { + "epoch": 0.1097035874012751, + "grad_norm": 1.4230155944824219, + "learning_rate": 4.8530052718249076e-05, + "loss": 5.098, + "step": 18446 + }, + { + "epoch": 0.10970953468455609, + "grad_norm": 1.4366841316223145, + "learning_rate": 4.85298949076887e-05, + "loss": 5.0975, + "step": 18447 + }, + { + "epoch": 0.10971548196783709, + "grad_norm": 1.437514066696167, + "learning_rate": 4.852973708891427e-05, + "loss": 5.0325, + "step": 18448 + }, + { + "epoch": 0.1097214292511181, + "grad_norm": 2.0367636680603027, + "learning_rate": 4.852957926192586e-05, + "loss": 5.2064, + "step": 18449 + }, + { + "epoch": 0.10972737653439908, + "grad_norm": 2.16357684135437, + "learning_rate": 4.852942142672352e-05, + "loss": 5.1532, + "step": 18450 + }, + { + "epoch": 0.10973332381768008, + "grad_norm": 1.6931402683258057, + "learning_rate": 4.8529263583307296e-05, + "loss": 5.2128, + "step": 18451 + }, + { + "epoch": 0.10973927110096109, + "grad_norm": 2.4651196002960205, + "learning_rate": 4.852910573167725e-05, + "loss": 4.798, + "step": 18452 + }, + { + "epoch": 0.10974521838424207, + "grad_norm": 1.7160784006118774, + "learning_rate": 4.852894787183344e-05, + "loss": 5.5087, + "step": 18453 + }, + { + "epoch": 0.10975116566752308, + "grad_norm": 1.478097915649414, + "learning_rate": 4.852879000377591e-05, + "loss": 5.6876, + "step": 18454 + }, + { + "epoch": 0.10975711295080408, + "grad_norm": 1.8612531423568726, + "learning_rate": 4.852863212750474e-05, + "loss": 5.2259, + "step": 18455 + }, + { + "epoch": 0.10976306023408507, + "grad_norm": 1.6869621276855469, + "learning_rate": 4.852847424301995e-05, + "loss": 5.5294, + "step": 18456 + }, + { + "epoch": 0.10976900751736607, + "grad_norm": 1.7378077507019043, + "learning_rate": 4.852831635032161e-05, + "loss": 5.4568, + "step": 18457 + }, + { + "epoch": 0.10977495480064707, + "grad_norm": 1.7788033485412598, + "learning_rate": 4.852815844940979e-05, + "loss": 5.2331, + "step": 18458 + }, + { + "epoch": 0.10978090208392806, + "grad_norm": 1.8730370998382568, + "learning_rate": 4.852800054028453e-05, + "loss": 4.9792, + "step": 18459 + }, + { + "epoch": 0.10978684936720906, + "grad_norm": 1.5126397609710693, + "learning_rate": 4.852784262294588e-05, + "loss": 5.3134, + "step": 18460 + }, + { + "epoch": 0.10979279665049006, + "grad_norm": 1.6687992811203003, + "learning_rate": 4.8527684697393914e-05, + "loss": 5.3296, + "step": 18461 + }, + { + "epoch": 0.10979874393377105, + "grad_norm": 1.6268471479415894, + "learning_rate": 4.852752676362867e-05, + "loss": 4.9804, + "step": 18462 + }, + { + "epoch": 0.10980469121705205, + "grad_norm": 1.7055017948150635, + "learning_rate": 4.8527368821650214e-05, + "loss": 5.0289, + "step": 18463 + }, + { + "epoch": 0.10981063850033304, + "grad_norm": 1.489247441291809, + "learning_rate": 4.852721087145859e-05, + "loss": 5.0428, + "step": 18464 + }, + { + "epoch": 0.10981658578361404, + "grad_norm": 1.7411161661148071, + "learning_rate": 4.8527052913053874e-05, + "loss": 5.1142, + "step": 18465 + }, + { + "epoch": 0.10982253306689505, + "grad_norm": 1.5776443481445312, + "learning_rate": 4.8526894946436094e-05, + "loss": 5.2881, + "step": 18466 + }, + { + "epoch": 0.10982848035017603, + "grad_norm": 1.342997431755066, + "learning_rate": 4.852673697160532e-05, + "loss": 5.0295, + "step": 18467 + }, + { + "epoch": 0.10983442763345704, + "grad_norm": 1.1686962842941284, + "learning_rate": 4.8526578988561606e-05, + "loss": 5.0607, + "step": 18468 + }, + { + "epoch": 0.10984037491673804, + "grad_norm": 1.578697681427002, + "learning_rate": 4.8526420997305006e-05, + "loss": 5.3291, + "step": 18469 + }, + { + "epoch": 0.10984632220001903, + "grad_norm": 1.5248758792877197, + "learning_rate": 4.8526262997835575e-05, + "loss": 5.1206, + "step": 18470 + }, + { + "epoch": 0.10985226948330003, + "grad_norm": 1.1425076723098755, + "learning_rate": 4.852610499015337e-05, + "loss": 5.1892, + "step": 18471 + }, + { + "epoch": 0.10985821676658103, + "grad_norm": 1.356423020362854, + "learning_rate": 4.852594697425844e-05, + "loss": 4.9477, + "step": 18472 + }, + { + "epoch": 0.10986416404986202, + "grad_norm": 1.3905398845672607, + "learning_rate": 4.852578895015085e-05, + "loss": 4.9084, + "step": 18473 + }, + { + "epoch": 0.10987011133314302, + "grad_norm": 1.3447619676589966, + "learning_rate": 4.8525630917830655e-05, + "loss": 4.9042, + "step": 18474 + }, + { + "epoch": 0.10987605861642402, + "grad_norm": 1.2110105752944946, + "learning_rate": 4.8525472877297893e-05, + "loss": 4.9669, + "step": 18475 + }, + { + "epoch": 0.10988200589970501, + "grad_norm": 1.480750560760498, + "learning_rate": 4.8525314828552646e-05, + "loss": 5.1071, + "step": 18476 + }, + { + "epoch": 0.10988795318298601, + "grad_norm": 1.2497118711471558, + "learning_rate": 4.852515677159495e-05, + "loss": 4.8868, + "step": 18477 + }, + { + "epoch": 0.10989390046626701, + "grad_norm": 1.4057846069335938, + "learning_rate": 4.8524998706424856e-05, + "loss": 5.1173, + "step": 18478 + }, + { + "epoch": 0.109899847749548, + "grad_norm": 1.3325163125991821, + "learning_rate": 4.8524840633042436e-05, + "loss": 5.1066, + "step": 18479 + }, + { + "epoch": 0.109905795032829, + "grad_norm": 1.333720326423645, + "learning_rate": 4.852468255144773e-05, + "loss": 5.1404, + "step": 18480 + }, + { + "epoch": 0.10991174231611, + "grad_norm": 1.3484537601470947, + "learning_rate": 4.852452446164081e-05, + "loss": 5.1284, + "step": 18481 + }, + { + "epoch": 0.109917689599391, + "grad_norm": 1.3348337411880493, + "learning_rate": 4.8524366363621716e-05, + "loss": 5.2056, + "step": 18482 + }, + { + "epoch": 0.109923636882672, + "grad_norm": 1.1838293075561523, + "learning_rate": 4.8524208257390504e-05, + "loss": 5.0488, + "step": 18483 + }, + { + "epoch": 0.109929584165953, + "grad_norm": 1.2820385694503784, + "learning_rate": 4.852405014294724e-05, + "loss": 5.1329, + "step": 18484 + }, + { + "epoch": 0.10993553144923399, + "grad_norm": 1.3892844915390015, + "learning_rate": 4.852389202029198e-05, + "loss": 5.0263, + "step": 18485 + }, + { + "epoch": 0.10994147873251499, + "grad_norm": 1.4780217409133911, + "learning_rate": 4.852373388942476e-05, + "loss": 5.0866, + "step": 18486 + }, + { + "epoch": 0.10994742601579599, + "grad_norm": 1.4181870222091675, + "learning_rate": 4.852357575034565e-05, + "loss": 5.1436, + "step": 18487 + }, + { + "epoch": 0.10995337329907698, + "grad_norm": 1.4174554347991943, + "learning_rate": 4.852341760305471e-05, + "loss": 5.132, + "step": 18488 + }, + { + "epoch": 0.10995932058235798, + "grad_norm": 1.2727283239364624, + "learning_rate": 4.852325944755198e-05, + "loss": 5.0171, + "step": 18489 + }, + { + "epoch": 0.10996526786563898, + "grad_norm": 1.2102142572402954, + "learning_rate": 4.852310128383753e-05, + "loss": 5.0183, + "step": 18490 + }, + { + "epoch": 0.10997121514891997, + "grad_norm": 1.254946231842041, + "learning_rate": 4.85229431119114e-05, + "loss": 5.105, + "step": 18491 + }, + { + "epoch": 0.10997716243220097, + "grad_norm": 1.4097338914871216, + "learning_rate": 4.8522784931773666e-05, + "loss": 4.953, + "step": 18492 + }, + { + "epoch": 0.10998310971548196, + "grad_norm": 1.368314504623413, + "learning_rate": 4.852262674342436e-05, + "loss": 4.9527, + "step": 18493 + }, + { + "epoch": 0.10998905699876296, + "grad_norm": 1.3907700777053833, + "learning_rate": 4.8522468546863554e-05, + "loss": 4.9416, + "step": 18494 + }, + { + "epoch": 0.10999500428204396, + "grad_norm": 1.2113755941390991, + "learning_rate": 4.852231034209129e-05, + "loss": 4.8552, + "step": 18495 + }, + { + "epoch": 0.11000095156532495, + "grad_norm": 1.3752022981643677, + "learning_rate": 4.852215212910763e-05, + "loss": 4.9314, + "step": 18496 + }, + { + "epoch": 0.11000689884860596, + "grad_norm": 1.243531584739685, + "learning_rate": 4.852199390791264e-05, + "loss": 4.925, + "step": 18497 + }, + { + "epoch": 0.11001284613188696, + "grad_norm": 1.3528475761413574, + "learning_rate": 4.852183567850636e-05, + "loss": 4.8643, + "step": 18498 + }, + { + "epoch": 0.11001879341516795, + "grad_norm": 1.4653394222259521, + "learning_rate": 4.8521677440888845e-05, + "loss": 4.8894, + "step": 18499 + }, + { + "epoch": 0.11002474069844895, + "grad_norm": 1.3524682521820068, + "learning_rate": 4.852151919506016e-05, + "loss": 4.7458, + "step": 18500 + }, + { + "epoch": 0.11003068798172995, + "grad_norm": 1.3654247522354126, + "learning_rate": 4.852136094102036e-05, + "loss": 4.7971, + "step": 18501 + }, + { + "epoch": 0.11003663526501094, + "grad_norm": 1.395735740661621, + "learning_rate": 4.85212026787695e-05, + "loss": 4.7677, + "step": 18502 + }, + { + "epoch": 0.11004258254829194, + "grad_norm": 1.4467344284057617, + "learning_rate": 4.8521044408307616e-05, + "loss": 4.726, + "step": 18503 + }, + { + "epoch": 0.11004852983157294, + "grad_norm": 1.276580572128296, + "learning_rate": 4.852088612963478e-05, + "loss": 4.8145, + "step": 18504 + }, + { + "epoch": 0.11005447711485393, + "grad_norm": 1.4406812191009521, + "learning_rate": 4.852072784275106e-05, + "loss": 4.7942, + "step": 18505 + }, + { + "epoch": 0.11006042439813493, + "grad_norm": 1.4281691312789917, + "learning_rate": 4.8520569547656483e-05, + "loss": 4.9745, + "step": 18506 + }, + { + "epoch": 0.11006637168141593, + "grad_norm": 1.3521541357040405, + "learning_rate": 4.852041124435112e-05, + "loss": 4.8335, + "step": 18507 + }, + { + "epoch": 0.11007231896469692, + "grad_norm": 1.2510555982589722, + "learning_rate": 4.852025293283503e-05, + "loss": 4.8868, + "step": 18508 + }, + { + "epoch": 0.11007826624797792, + "grad_norm": 1.3792724609375, + "learning_rate": 4.852009461310826e-05, + "loss": 4.9388, + "step": 18509 + }, + { + "epoch": 0.11008421353125893, + "grad_norm": 1.3494830131530762, + "learning_rate": 4.851993628517086e-05, + "loss": 4.8536, + "step": 18510 + }, + { + "epoch": 0.11009016081453991, + "grad_norm": 1.2981318235397339, + "learning_rate": 4.851977794902291e-05, + "loss": 4.8479, + "step": 18511 + }, + { + "epoch": 0.11009610809782092, + "grad_norm": 1.3305935859680176, + "learning_rate": 4.851961960466444e-05, + "loss": 4.9893, + "step": 18512 + }, + { + "epoch": 0.11010205538110192, + "grad_norm": 1.3141270875930786, + "learning_rate": 4.851946125209551e-05, + "loss": 4.8349, + "step": 18513 + }, + { + "epoch": 0.1101080026643829, + "grad_norm": 1.2411303520202637, + "learning_rate": 4.851930289131619e-05, + "loss": 4.8698, + "step": 18514 + }, + { + "epoch": 0.11011394994766391, + "grad_norm": 1.520176887512207, + "learning_rate": 4.851914452232651e-05, + "loss": 4.7576, + "step": 18515 + }, + { + "epoch": 0.11011989723094491, + "grad_norm": 1.3073054552078247, + "learning_rate": 4.851898614512655e-05, + "loss": 4.8974, + "step": 18516 + }, + { + "epoch": 0.1101258445142259, + "grad_norm": 1.4703196287155151, + "learning_rate": 4.8518827759716354e-05, + "loss": 5.0947, + "step": 18517 + }, + { + "epoch": 0.1101317917975069, + "grad_norm": 1.3140865564346313, + "learning_rate": 4.851866936609597e-05, + "loss": 5.4125, + "step": 18518 + }, + { + "epoch": 0.1101377390807879, + "grad_norm": 1.2075819969177246, + "learning_rate": 4.8518510964265465e-05, + "loss": 5.2993, + "step": 18519 + }, + { + "epoch": 0.11014368636406889, + "grad_norm": 1.6519954204559326, + "learning_rate": 4.85183525542249e-05, + "loss": 5.6638, + "step": 18520 + }, + { + "epoch": 0.11014963364734989, + "grad_norm": 2.118663787841797, + "learning_rate": 4.851819413597432e-05, + "loss": 5.5422, + "step": 18521 + }, + { + "epoch": 0.1101555809306309, + "grad_norm": 1.902429461479187, + "learning_rate": 4.851803570951377e-05, + "loss": 5.3244, + "step": 18522 + }, + { + "epoch": 0.11016152821391188, + "grad_norm": 2.593628406524658, + "learning_rate": 4.8517877274843315e-05, + "loss": 5.0554, + "step": 18523 + }, + { + "epoch": 0.11016747549719288, + "grad_norm": 2.6404380798339844, + "learning_rate": 4.851771883196302e-05, + "loss": 4.9789, + "step": 18524 + }, + { + "epoch": 0.11017342278047387, + "grad_norm": 2.08564829826355, + "learning_rate": 4.8517560380872934e-05, + "loss": 4.9616, + "step": 18525 + }, + { + "epoch": 0.11017937006375488, + "grad_norm": 2.306739091873169, + "learning_rate": 4.8517401921573114e-05, + "loss": 4.9368, + "step": 18526 + }, + { + "epoch": 0.11018531734703588, + "grad_norm": 3.0212862491607666, + "learning_rate": 4.85172434540636e-05, + "loss": 4.6379, + "step": 18527 + }, + { + "epoch": 0.11019126463031687, + "grad_norm": 2.554163694381714, + "learning_rate": 4.851708497834446e-05, + "loss": 4.6958, + "step": 18528 + }, + { + "epoch": 0.11019721191359787, + "grad_norm": 2.354631185531616, + "learning_rate": 4.851692649441576e-05, + "loss": 4.7904, + "step": 18529 + }, + { + "epoch": 0.11020315919687887, + "grad_norm": 1.5072609186172485, + "learning_rate": 4.851676800227754e-05, + "loss": 5.5862, + "step": 18530 + }, + { + "epoch": 0.11020910648015986, + "grad_norm": 1.5677906274795532, + "learning_rate": 4.851660950192986e-05, + "loss": 5.8712, + "step": 18531 + }, + { + "epoch": 0.11021505376344086, + "grad_norm": 1.7329411506652832, + "learning_rate": 4.851645099337276e-05, + "loss": 5.4559, + "step": 18532 + }, + { + "epoch": 0.11022100104672186, + "grad_norm": 2.187192916870117, + "learning_rate": 4.851629247660633e-05, + "loss": 5.2172, + "step": 18533 + }, + { + "epoch": 0.11022694833000285, + "grad_norm": 2.5248184204101562, + "learning_rate": 4.851613395163059e-05, + "loss": 4.7283, + "step": 18534 + }, + { + "epoch": 0.11023289561328385, + "grad_norm": 1.897926926612854, + "learning_rate": 4.8515975418445625e-05, + "loss": 5.0609, + "step": 18535 + }, + { + "epoch": 0.11023884289656485, + "grad_norm": 1.6827658414840698, + "learning_rate": 4.851581687705147e-05, + "loss": 5.2637, + "step": 18536 + }, + { + "epoch": 0.11024479017984584, + "grad_norm": 1.6638895273208618, + "learning_rate": 4.8515658327448184e-05, + "loss": 5.3758, + "step": 18537 + }, + { + "epoch": 0.11025073746312684, + "grad_norm": 1.3794528245925903, + "learning_rate": 4.8515499769635824e-05, + "loss": 5.1398, + "step": 18538 + }, + { + "epoch": 0.11025668474640785, + "grad_norm": 1.7829253673553467, + "learning_rate": 4.8515341203614454e-05, + "loss": 5.8449, + "step": 18539 + }, + { + "epoch": 0.11026263202968883, + "grad_norm": 1.9193391799926758, + "learning_rate": 4.85151826293841e-05, + "loss": 5.6113, + "step": 18540 + }, + { + "epoch": 0.11026857931296984, + "grad_norm": 1.9315286874771118, + "learning_rate": 4.851502404694486e-05, + "loss": 5.4341, + "step": 18541 + }, + { + "epoch": 0.11027452659625084, + "grad_norm": 1.8884371519088745, + "learning_rate": 4.851486545629677e-05, + "loss": 5.0711, + "step": 18542 + }, + { + "epoch": 0.11028047387953183, + "grad_norm": 2.104315996170044, + "learning_rate": 4.8514706857439866e-05, + "loss": 4.7431, + "step": 18543 + }, + { + "epoch": 0.11028642116281283, + "grad_norm": 1.9781455993652344, + "learning_rate": 4.8514548250374234e-05, + "loss": 4.9088, + "step": 18544 + }, + { + "epoch": 0.11029236844609383, + "grad_norm": 2.0802392959594727, + "learning_rate": 4.851438963509991e-05, + "loss": 4.8418, + "step": 18545 + }, + { + "epoch": 0.11029831572937482, + "grad_norm": 2.1856627464294434, + "learning_rate": 4.851423101161696e-05, + "loss": 5.5758, + "step": 18546 + }, + { + "epoch": 0.11030426301265582, + "grad_norm": 1.578050971031189, + "learning_rate": 4.851407237992543e-05, + "loss": 5.2795, + "step": 18547 + }, + { + "epoch": 0.11031021029593682, + "grad_norm": 2.241647720336914, + "learning_rate": 4.8513913740025376e-05, + "loss": 4.7807, + "step": 18548 + }, + { + "epoch": 0.11031615757921781, + "grad_norm": 2.102911949157715, + "learning_rate": 4.851375509191687e-05, + "loss": 5.1933, + "step": 18549 + }, + { + "epoch": 0.11032210486249881, + "grad_norm": 1.7198251485824585, + "learning_rate": 4.851359643559995e-05, + "loss": 5.273, + "step": 18550 + }, + { + "epoch": 0.11032805214577981, + "grad_norm": 1.6389858722686768, + "learning_rate": 4.8513437771074675e-05, + "loss": 5.7741, + "step": 18551 + }, + { + "epoch": 0.1103339994290608, + "grad_norm": 1.3120185136795044, + "learning_rate": 4.8513279098341106e-05, + "loss": 5.6433, + "step": 18552 + }, + { + "epoch": 0.1103399467123418, + "grad_norm": 2.6182525157928467, + "learning_rate": 4.8513120417399286e-05, + "loss": 5.2905, + "step": 18553 + }, + { + "epoch": 0.11034589399562279, + "grad_norm": 2.8740553855895996, + "learning_rate": 4.851296172824928e-05, + "loss": 5.0364, + "step": 18554 + }, + { + "epoch": 0.1103518412789038, + "grad_norm": 2.126779794692993, + "learning_rate": 4.851280303089115e-05, + "loss": 4.8801, + "step": 18555 + }, + { + "epoch": 0.1103577885621848, + "grad_norm": 2.2658486366271973, + "learning_rate": 4.851264432532493e-05, + "loss": 5.0411, + "step": 18556 + }, + { + "epoch": 0.11036373584546579, + "grad_norm": 2.2387850284576416, + "learning_rate": 4.8512485611550706e-05, + "loss": 5.048, + "step": 18557 + }, + { + "epoch": 0.11036968312874679, + "grad_norm": 2.5402557849884033, + "learning_rate": 4.851232688956851e-05, + "loss": 5.2581, + "step": 18558 + }, + { + "epoch": 0.11037563041202779, + "grad_norm": 1.9275699853897095, + "learning_rate": 4.8512168159378396e-05, + "loss": 5.765, + "step": 18559 + }, + { + "epoch": 0.11038157769530878, + "grad_norm": 1.6632050275802612, + "learning_rate": 4.8512009420980434e-05, + "loss": 5.9928, + "step": 18560 + }, + { + "epoch": 0.11038752497858978, + "grad_norm": 1.9383779764175415, + "learning_rate": 4.851185067437467e-05, + "loss": 5.306, + "step": 18561 + }, + { + "epoch": 0.11039347226187078, + "grad_norm": 1.6358258724212646, + "learning_rate": 4.851169191956117e-05, + "loss": 5.4039, + "step": 18562 + }, + { + "epoch": 0.11039941954515177, + "grad_norm": 1.625636339187622, + "learning_rate": 4.851153315653997e-05, + "loss": 5.5028, + "step": 18563 + }, + { + "epoch": 0.11040536682843277, + "grad_norm": 1.8142133951187134, + "learning_rate": 4.8511374385311134e-05, + "loss": 5.3636, + "step": 18564 + }, + { + "epoch": 0.11041131411171377, + "grad_norm": 1.778742790222168, + "learning_rate": 4.8511215605874724e-05, + "loss": 5.9869, + "step": 18565 + }, + { + "epoch": 0.11041726139499476, + "grad_norm": 1.7027266025543213, + "learning_rate": 4.8511056818230795e-05, + "loss": 5.9855, + "step": 18566 + }, + { + "epoch": 0.11042320867827576, + "grad_norm": 1.8098080158233643, + "learning_rate": 4.85108980223794e-05, + "loss": 5.3241, + "step": 18567 + }, + { + "epoch": 0.11042915596155677, + "grad_norm": 2.058525562286377, + "learning_rate": 4.851073921832059e-05, + "loss": 5.3369, + "step": 18568 + }, + { + "epoch": 0.11043510324483775, + "grad_norm": 1.6393969058990479, + "learning_rate": 4.851058040605443e-05, + "loss": 5.234, + "step": 18569 + }, + { + "epoch": 0.11044105052811876, + "grad_norm": 1.7245092391967773, + "learning_rate": 4.8510421585580954e-05, + "loss": 5.3252, + "step": 18570 + }, + { + "epoch": 0.11044699781139976, + "grad_norm": 1.7108781337738037, + "learning_rate": 4.851026275690025e-05, + "loss": 5.342, + "step": 18571 + }, + { + "epoch": 0.11045294509468075, + "grad_norm": 1.6860250234603882, + "learning_rate": 4.8510103920012354e-05, + "loss": 5.1265, + "step": 18572 + }, + { + "epoch": 0.11045889237796175, + "grad_norm": 1.4939595460891724, + "learning_rate": 4.850994507491731e-05, + "loss": 4.995, + "step": 18573 + }, + { + "epoch": 0.11046483966124275, + "grad_norm": 1.6137492656707764, + "learning_rate": 4.85097862216152e-05, + "loss": 5.0099, + "step": 18574 + }, + { + "epoch": 0.11047078694452374, + "grad_norm": 1.8155491352081299, + "learning_rate": 4.850962736010606e-05, + "loss": 4.965, + "step": 18575 + }, + { + "epoch": 0.11047673422780474, + "grad_norm": 1.6313834190368652, + "learning_rate": 4.8509468490389955e-05, + "loss": 5.1881, + "step": 18576 + }, + { + "epoch": 0.11048268151108574, + "grad_norm": 1.9885855913162231, + "learning_rate": 4.850930961246694e-05, + "loss": 4.9172, + "step": 18577 + }, + { + "epoch": 0.11048862879436673, + "grad_norm": 1.7815529108047485, + "learning_rate": 4.850915072633706e-05, + "loss": 5.2431, + "step": 18578 + }, + { + "epoch": 0.11049457607764773, + "grad_norm": 1.496060848236084, + "learning_rate": 4.8508991832000384e-05, + "loss": 5.0222, + "step": 18579 + }, + { + "epoch": 0.11050052336092873, + "grad_norm": 1.76019287109375, + "learning_rate": 4.850883292945696e-05, + "loss": 5.1522, + "step": 18580 + }, + { + "epoch": 0.11050647064420972, + "grad_norm": 1.6975457668304443, + "learning_rate": 4.8508674018706845e-05, + "loss": 5.0687, + "step": 18581 + }, + { + "epoch": 0.11051241792749072, + "grad_norm": 2.056002378463745, + "learning_rate": 4.85085150997501e-05, + "loss": 5.0267, + "step": 18582 + }, + { + "epoch": 0.11051836521077171, + "grad_norm": 1.8109005689620972, + "learning_rate": 4.850835617258677e-05, + "loss": 5.7661, + "step": 18583 + }, + { + "epoch": 0.11052431249405271, + "grad_norm": 1.762326717376709, + "learning_rate": 4.850819723721692e-05, + "loss": 5.8038, + "step": 18584 + }, + { + "epoch": 0.11053025977733372, + "grad_norm": 1.5169013738632202, + "learning_rate": 4.85080382936406e-05, + "loss": 5.7988, + "step": 18585 + }, + { + "epoch": 0.1105362070606147, + "grad_norm": 1.7740446329116821, + "learning_rate": 4.850787934185786e-05, + "loss": 5.5388, + "step": 18586 + }, + { + "epoch": 0.11054215434389571, + "grad_norm": 1.560950756072998, + "learning_rate": 4.850772038186877e-05, + "loss": 5.406, + "step": 18587 + }, + { + "epoch": 0.11054810162717671, + "grad_norm": 1.6391148567199707, + "learning_rate": 4.850756141367338e-05, + "loss": 5.4669, + "step": 18588 + }, + { + "epoch": 0.1105540489104577, + "grad_norm": 1.5571023225784302, + "learning_rate": 4.8507402437271734e-05, + "loss": 5.6556, + "step": 18589 + }, + { + "epoch": 0.1105599961937387, + "grad_norm": 1.5374432802200317, + "learning_rate": 4.85072434526639e-05, + "loss": 5.7617, + "step": 18590 + }, + { + "epoch": 0.1105659434770197, + "grad_norm": 1.4683212041854858, + "learning_rate": 4.850708445984993e-05, + "loss": 5.5074, + "step": 18591 + }, + { + "epoch": 0.11057189076030069, + "grad_norm": 1.6689101457595825, + "learning_rate": 4.850692545882988e-05, + "loss": 5.3259, + "step": 18592 + }, + { + "epoch": 0.11057783804358169, + "grad_norm": 1.394108533859253, + "learning_rate": 4.85067664496038e-05, + "loss": 5.1686, + "step": 18593 + }, + { + "epoch": 0.1105837853268627, + "grad_norm": 1.7093585729599, + "learning_rate": 4.850660743217176e-05, + "loss": 5.6622, + "step": 18594 + }, + { + "epoch": 0.11058973261014368, + "grad_norm": 1.6189805269241333, + "learning_rate": 4.85064484065338e-05, + "loss": 5.6855, + "step": 18595 + }, + { + "epoch": 0.11059567989342468, + "grad_norm": 1.5303481817245483, + "learning_rate": 4.850628937268999e-05, + "loss": 5.8242, + "step": 18596 + }, + { + "epoch": 0.11060162717670569, + "grad_norm": 1.6557955741882324, + "learning_rate": 4.850613033064037e-05, + "loss": 5.4924, + "step": 18597 + }, + { + "epoch": 0.11060757445998667, + "grad_norm": 1.5280576944351196, + "learning_rate": 4.8505971280385e-05, + "loss": 5.6122, + "step": 18598 + }, + { + "epoch": 0.11061352174326768, + "grad_norm": 1.3656830787658691, + "learning_rate": 4.8505812221923945e-05, + "loss": 5.5282, + "step": 18599 + }, + { + "epoch": 0.11061946902654868, + "grad_norm": 1.3605096340179443, + "learning_rate": 4.850565315525725e-05, + "loss": 5.0747, + "step": 18600 + }, + { + "epoch": 0.11062541630982967, + "grad_norm": 2.120056390762329, + "learning_rate": 4.850549408038498e-05, + "loss": 5.1559, + "step": 18601 + }, + { + "epoch": 0.11063136359311067, + "grad_norm": 2.14626145362854, + "learning_rate": 4.850533499730718e-05, + "loss": 4.9778, + "step": 18602 + }, + { + "epoch": 0.11063731087639167, + "grad_norm": 2.1857240200042725, + "learning_rate": 4.8505175906023916e-05, + "loss": 4.8555, + "step": 18603 + }, + { + "epoch": 0.11064325815967266, + "grad_norm": 1.6636399030685425, + "learning_rate": 4.850501680653523e-05, + "loss": 5.3488, + "step": 18604 + }, + { + "epoch": 0.11064920544295366, + "grad_norm": 1.669511079788208, + "learning_rate": 4.8504857698841185e-05, + "loss": 5.2697, + "step": 18605 + }, + { + "epoch": 0.11065515272623466, + "grad_norm": 2.1935081481933594, + "learning_rate": 4.850469858294184e-05, + "loss": 4.4319, + "step": 18606 + }, + { + "epoch": 0.11066110000951565, + "grad_norm": 2.2359724044799805, + "learning_rate": 4.850453945883725e-05, + "loss": 4.2343, + "step": 18607 + }, + { + "epoch": 0.11066704729279665, + "grad_norm": 2.278247594833374, + "learning_rate": 4.850438032652747e-05, + "loss": 4.4955, + "step": 18608 + }, + { + "epoch": 0.11067299457607765, + "grad_norm": 2.3036160469055176, + "learning_rate": 4.850422118601254e-05, + "loss": 4.9122, + "step": 18609 + }, + { + "epoch": 0.11067894185935864, + "grad_norm": 2.3913469314575195, + "learning_rate": 4.850406203729254e-05, + "loss": 4.4703, + "step": 18610 + }, + { + "epoch": 0.11068488914263964, + "grad_norm": 1.9795238971710205, + "learning_rate": 4.8503902880367516e-05, + "loss": 4.7099, + "step": 18611 + }, + { + "epoch": 0.11069083642592063, + "grad_norm": 2.3990728855133057, + "learning_rate": 4.850374371523752e-05, + "loss": 4.3833, + "step": 18612 + }, + { + "epoch": 0.11069678370920163, + "grad_norm": 2.429461717605591, + "learning_rate": 4.850358454190261e-05, + "loss": 4.4279, + "step": 18613 + }, + { + "epoch": 0.11070273099248264, + "grad_norm": 2.598304271697998, + "learning_rate": 4.8503425360362845e-05, + "loss": 4.4376, + "step": 18614 + }, + { + "epoch": 0.11070867827576363, + "grad_norm": 2.3201403617858887, + "learning_rate": 4.850326617061827e-05, + "loss": 4.6822, + "step": 18615 + }, + { + "epoch": 0.11071462555904463, + "grad_norm": 1.8401033878326416, + "learning_rate": 4.8503106972668956e-05, + "loss": 5.1109, + "step": 18616 + }, + { + "epoch": 0.11072057284232563, + "grad_norm": 1.772309422492981, + "learning_rate": 4.850294776651494e-05, + "loss": 5.7237, + "step": 18617 + }, + { + "epoch": 0.11072652012560662, + "grad_norm": 1.7160669565200806, + "learning_rate": 4.8502788552156295e-05, + "loss": 5.7218, + "step": 18618 + }, + { + "epoch": 0.11073246740888762, + "grad_norm": 1.5467272996902466, + "learning_rate": 4.850262932959306e-05, + "loss": 5.4169, + "step": 18619 + }, + { + "epoch": 0.11073841469216862, + "grad_norm": 1.3382668495178223, + "learning_rate": 4.8502470098825316e-05, + "loss": 5.1243, + "step": 18620 + }, + { + "epoch": 0.11074436197544961, + "grad_norm": 1.3461776971817017, + "learning_rate": 4.850231085985309e-05, + "loss": 4.9412, + "step": 18621 + }, + { + "epoch": 0.11075030925873061, + "grad_norm": 1.4207700490951538, + "learning_rate": 4.850215161267646e-05, + "loss": 5.4449, + "step": 18622 + }, + { + "epoch": 0.11075625654201161, + "grad_norm": 1.7271502017974854, + "learning_rate": 4.8501992357295454e-05, + "loss": 5.4579, + "step": 18623 + }, + { + "epoch": 0.1107622038252926, + "grad_norm": 1.753090500831604, + "learning_rate": 4.8501833093710156e-05, + "loss": 5.7577, + "step": 18624 + }, + { + "epoch": 0.1107681511085736, + "grad_norm": 1.3730309009552002, + "learning_rate": 4.850167382192062e-05, + "loss": 5.3646, + "step": 18625 + }, + { + "epoch": 0.1107740983918546, + "grad_norm": 1.4723306894302368, + "learning_rate": 4.8501514541926883e-05, + "loss": 4.8234, + "step": 18626 + }, + { + "epoch": 0.1107800456751356, + "grad_norm": 1.3944339752197266, + "learning_rate": 4.850135525372901e-05, + "loss": 4.805, + "step": 18627 + }, + { + "epoch": 0.1107859929584166, + "grad_norm": 1.1402732133865356, + "learning_rate": 4.850119595732706e-05, + "loss": 4.9865, + "step": 18628 + }, + { + "epoch": 0.1107919402416976, + "grad_norm": 1.0595287084579468, + "learning_rate": 4.850103665272108e-05, + "loss": 4.9961, + "step": 18629 + }, + { + "epoch": 0.11079788752497859, + "grad_norm": 1.445143699645996, + "learning_rate": 4.8500877339911136e-05, + "loss": 5.2089, + "step": 18630 + }, + { + "epoch": 0.11080383480825959, + "grad_norm": 2.2014050483703613, + "learning_rate": 4.8500718018897275e-05, + "loss": 4.7445, + "step": 18631 + }, + { + "epoch": 0.11080978209154059, + "grad_norm": 2.117194890975952, + "learning_rate": 4.850055868967956e-05, + "loss": 4.8755, + "step": 18632 + }, + { + "epoch": 0.11081572937482158, + "grad_norm": 1.82968008518219, + "learning_rate": 4.850039935225804e-05, + "loss": 4.8852, + "step": 18633 + }, + { + "epoch": 0.11082167665810258, + "grad_norm": 1.613770842552185, + "learning_rate": 4.8500240006632766e-05, + "loss": 5.1053, + "step": 18634 + }, + { + "epoch": 0.11082762394138358, + "grad_norm": 1.8672553300857544, + "learning_rate": 4.850008065280381e-05, + "loss": 4.7134, + "step": 18635 + }, + { + "epoch": 0.11083357122466457, + "grad_norm": 1.9933403730392456, + "learning_rate": 4.849992129077122e-05, + "loss": 4.7544, + "step": 18636 + }, + { + "epoch": 0.11083951850794557, + "grad_norm": 1.8642876148223877, + "learning_rate": 4.849976192053505e-05, + "loss": 4.6598, + "step": 18637 + }, + { + "epoch": 0.11084546579122657, + "grad_norm": 1.8983674049377441, + "learning_rate": 4.849960254209536e-05, + "loss": 4.7403, + "step": 18638 + }, + { + "epoch": 0.11085141307450756, + "grad_norm": 1.9882328510284424, + "learning_rate": 4.849944315545219e-05, + "loss": 5.0105, + "step": 18639 + }, + { + "epoch": 0.11085736035778856, + "grad_norm": 1.7971723079681396, + "learning_rate": 4.8499283760605614e-05, + "loss": 5.6138, + "step": 18640 + }, + { + "epoch": 0.11086330764106955, + "grad_norm": 1.5002641677856445, + "learning_rate": 4.849912435755568e-05, + "loss": 5.7336, + "step": 18641 + }, + { + "epoch": 0.11086925492435055, + "grad_norm": 1.412880778312683, + "learning_rate": 4.8498964946302436e-05, + "loss": 5.532, + "step": 18642 + }, + { + "epoch": 0.11087520220763156, + "grad_norm": 1.6482197046279907, + "learning_rate": 4.849880552684596e-05, + "loss": 5.5432, + "step": 18643 + }, + { + "epoch": 0.11088114949091255, + "grad_norm": 1.5852200984954834, + "learning_rate": 4.849864609918629e-05, + "loss": 5.3577, + "step": 18644 + }, + { + "epoch": 0.11088709677419355, + "grad_norm": 1.540536642074585, + "learning_rate": 4.849848666332348e-05, + "loss": 5.4983, + "step": 18645 + }, + { + "epoch": 0.11089304405747455, + "grad_norm": 1.7822679281234741, + "learning_rate": 4.849832721925759e-05, + "loss": 5.1427, + "step": 18646 + }, + { + "epoch": 0.11089899134075554, + "grad_norm": 1.722977638244629, + "learning_rate": 4.8498167766988685e-05, + "loss": 5.2759, + "step": 18647 + }, + { + "epoch": 0.11090493862403654, + "grad_norm": 1.7543476819992065, + "learning_rate": 4.8498008306516806e-05, + "loss": 5.2616, + "step": 18648 + }, + { + "epoch": 0.11091088590731754, + "grad_norm": 1.4882584810256958, + "learning_rate": 4.8497848837842016e-05, + "loss": 5.3781, + "step": 18649 + }, + { + "epoch": 0.11091683319059853, + "grad_norm": 1.7358192205429077, + "learning_rate": 4.849768936096437e-05, + "loss": 5.5262, + "step": 18650 + }, + { + "epoch": 0.11092278047387953, + "grad_norm": 1.6070705652236938, + "learning_rate": 4.849752987588393e-05, + "loss": 5.0576, + "step": 18651 + }, + { + "epoch": 0.11092872775716053, + "grad_norm": 1.7641521692276, + "learning_rate": 4.8497370382600736e-05, + "loss": 5.21, + "step": 18652 + }, + { + "epoch": 0.11093467504044152, + "grad_norm": 1.8225789070129395, + "learning_rate": 4.849721088111485e-05, + "loss": 6.2734, + "step": 18653 + }, + { + "epoch": 0.11094062232372252, + "grad_norm": 1.8502428531646729, + "learning_rate": 4.849705137142634e-05, + "loss": 5.8298, + "step": 18654 + }, + { + "epoch": 0.11094656960700353, + "grad_norm": 1.4959850311279297, + "learning_rate": 4.8496891853535255e-05, + "loss": 5.4667, + "step": 18655 + }, + { + "epoch": 0.11095251689028451, + "grad_norm": 1.7957161664962769, + "learning_rate": 4.849673232744164e-05, + "loss": 5.3483, + "step": 18656 + }, + { + "epoch": 0.11095846417356552, + "grad_norm": 1.448737382888794, + "learning_rate": 4.8496572793145554e-05, + "loss": 5.4568, + "step": 18657 + }, + { + "epoch": 0.11096441145684652, + "grad_norm": 1.5068676471710205, + "learning_rate": 4.8496413250647065e-05, + "loss": 5.7089, + "step": 18658 + }, + { + "epoch": 0.1109703587401275, + "grad_norm": 1.5162447690963745, + "learning_rate": 4.849625369994622e-05, + "loss": 5.6042, + "step": 18659 + }, + { + "epoch": 0.11097630602340851, + "grad_norm": 1.81594979763031, + "learning_rate": 4.8496094141043076e-05, + "loss": 5.5301, + "step": 18660 + }, + { + "epoch": 0.11098225330668951, + "grad_norm": 1.9147114753723145, + "learning_rate": 4.8495934573937684e-05, + "loss": 4.6335, + "step": 18661 + }, + { + "epoch": 0.1109882005899705, + "grad_norm": 1.4161462783813477, + "learning_rate": 4.8495774998630106e-05, + "loss": 4.9868, + "step": 18662 + }, + { + "epoch": 0.1109941478732515, + "grad_norm": 1.5652790069580078, + "learning_rate": 4.8495615415120396e-05, + "loss": 5.6954, + "step": 18663 + }, + { + "epoch": 0.1110000951565325, + "grad_norm": 1.5217374563217163, + "learning_rate": 4.8495455823408616e-05, + "loss": 5.4338, + "step": 18664 + }, + { + "epoch": 0.11100604243981349, + "grad_norm": 1.3335540294647217, + "learning_rate": 4.8495296223494805e-05, + "loss": 5.4751, + "step": 18665 + }, + { + "epoch": 0.11101198972309449, + "grad_norm": 1.8903460502624512, + "learning_rate": 4.849513661537903e-05, + "loss": 4.9481, + "step": 18666 + }, + { + "epoch": 0.1110179370063755, + "grad_norm": 1.814666748046875, + "learning_rate": 4.849497699906135e-05, + "loss": 5.1422, + "step": 18667 + }, + { + "epoch": 0.11102388428965648, + "grad_norm": 1.7838057279586792, + "learning_rate": 4.8494817374541816e-05, + "loss": 5.3991, + "step": 18668 + }, + { + "epoch": 0.11102983157293748, + "grad_norm": 1.665671944618225, + "learning_rate": 4.849465774182048e-05, + "loss": 5.5362, + "step": 18669 + }, + { + "epoch": 0.11103577885621847, + "grad_norm": 2.255326509475708, + "learning_rate": 4.8494498100897415e-05, + "loss": 5.3161, + "step": 18670 + }, + { + "epoch": 0.11104172613949947, + "grad_norm": 1.7641721963882446, + "learning_rate": 4.849433845177265e-05, + "loss": 5.0422, + "step": 18671 + }, + { + "epoch": 0.11104767342278048, + "grad_norm": 1.4214074611663818, + "learning_rate": 4.8494178794446256e-05, + "loss": 5.2417, + "step": 18672 + }, + { + "epoch": 0.11105362070606146, + "grad_norm": 1.6417256593704224, + "learning_rate": 4.849401912891829e-05, + "loss": 5.262, + "step": 18673 + }, + { + "epoch": 0.11105956798934247, + "grad_norm": 1.4238179922103882, + "learning_rate": 4.84938594551888e-05, + "loss": 5.9754, + "step": 18674 + }, + { + "epoch": 0.11106551527262347, + "grad_norm": 1.9513673782348633, + "learning_rate": 4.849369977325785e-05, + "loss": 5.8917, + "step": 18675 + }, + { + "epoch": 0.11107146255590446, + "grad_norm": 1.625225305557251, + "learning_rate": 4.849354008312549e-05, + "loss": 5.7142, + "step": 18676 + }, + { + "epoch": 0.11107740983918546, + "grad_norm": 1.5306450128555298, + "learning_rate": 4.849338038479178e-05, + "loss": 5.3206, + "step": 18677 + }, + { + "epoch": 0.11108335712246646, + "grad_norm": 2.7895541191101074, + "learning_rate": 4.849322067825677e-05, + "loss": 4.3585, + "step": 18678 + }, + { + "epoch": 0.11108930440574745, + "grad_norm": 2.2688374519348145, + "learning_rate": 4.849306096352052e-05, + "loss": 4.4967, + "step": 18679 + }, + { + "epoch": 0.11109525168902845, + "grad_norm": 2.1710267066955566, + "learning_rate": 4.849290124058309e-05, + "loss": 4.0673, + "step": 18680 + }, + { + "epoch": 0.11110119897230945, + "grad_norm": 2.235142707824707, + "learning_rate": 4.849274150944453e-05, + "loss": 3.8198, + "step": 18681 + }, + { + "epoch": 0.11110714625559044, + "grad_norm": 2.328324317932129, + "learning_rate": 4.849258177010489e-05, + "loss": 4.008, + "step": 18682 + }, + { + "epoch": 0.11111309353887144, + "grad_norm": 2.2681312561035156, + "learning_rate": 4.849242202256424e-05, + "loss": 4.1541, + "step": 18683 + }, + { + "epoch": 0.11111904082215245, + "grad_norm": 2.5430855751037598, + "learning_rate": 4.849226226682262e-05, + "loss": 4.3177, + "step": 18684 + }, + { + "epoch": 0.11112498810543343, + "grad_norm": 2.1995978355407715, + "learning_rate": 4.84921025028801e-05, + "loss": 4.5792, + "step": 18685 + }, + { + "epoch": 0.11113093538871444, + "grad_norm": 1.9515454769134521, + "learning_rate": 4.849194273073673e-05, + "loss": 4.8759, + "step": 18686 + }, + { + "epoch": 0.11113688267199544, + "grad_norm": 2.484431028366089, + "learning_rate": 4.849178295039257e-05, + "loss": 4.1916, + "step": 18687 + }, + { + "epoch": 0.11114282995527643, + "grad_norm": 2.356790065765381, + "learning_rate": 4.8491623161847665e-05, + "loss": 4.38, + "step": 18688 + }, + { + "epoch": 0.11114877723855743, + "grad_norm": 2.414517879486084, + "learning_rate": 4.849146336510207e-05, + "loss": 4.3739, + "step": 18689 + }, + { + "epoch": 0.11115472452183843, + "grad_norm": 2.4129765033721924, + "learning_rate": 4.849130356015587e-05, + "loss": 4.0384, + "step": 18690 + }, + { + "epoch": 0.11116067180511942, + "grad_norm": 2.146932363510132, + "learning_rate": 4.8491143747009074e-05, + "loss": 4.4045, + "step": 18691 + }, + { + "epoch": 0.11116661908840042, + "grad_norm": 2.1945905685424805, + "learning_rate": 4.8490983925661776e-05, + "loss": 5.1674, + "step": 18692 + }, + { + "epoch": 0.11117256637168142, + "grad_norm": 2.2188448905944824, + "learning_rate": 4.849082409611402e-05, + "loss": 4.628, + "step": 18693 + }, + { + "epoch": 0.11117851365496241, + "grad_norm": 1.7684906721115112, + "learning_rate": 4.8490664258365847e-05, + "loss": 5.236, + "step": 18694 + }, + { + "epoch": 0.11118446093824341, + "grad_norm": 2.0367350578308105, + "learning_rate": 4.849050441241734e-05, + "loss": 5.6408, + "step": 18695 + }, + { + "epoch": 0.11119040822152441, + "grad_norm": 2.0829811096191406, + "learning_rate": 4.849034455826853e-05, + "loss": 5.5519, + "step": 18696 + }, + { + "epoch": 0.1111963555048054, + "grad_norm": 1.7884539365768433, + "learning_rate": 4.8490184695919486e-05, + "loss": 5.2345, + "step": 18697 + }, + { + "epoch": 0.1112023027880864, + "grad_norm": 1.8792423009872437, + "learning_rate": 4.849002482537026e-05, + "loss": 4.7622, + "step": 18698 + }, + { + "epoch": 0.11120825007136739, + "grad_norm": 1.7493008375167847, + "learning_rate": 4.8489864946620914e-05, + "loss": 5.295, + "step": 18699 + }, + { + "epoch": 0.1112141973546484, + "grad_norm": 1.60455322265625, + "learning_rate": 4.84897050596715e-05, + "loss": 5.5708, + "step": 18700 + }, + { + "epoch": 0.1112201446379294, + "grad_norm": 1.4326173067092896, + "learning_rate": 4.848954516452206e-05, + "loss": 5.9185, + "step": 18701 + }, + { + "epoch": 0.11122609192121038, + "grad_norm": 1.6318118572235107, + "learning_rate": 4.8489385261172685e-05, + "loss": 5.6545, + "step": 18702 + }, + { + "epoch": 0.11123203920449139, + "grad_norm": 1.4083906412124634, + "learning_rate": 4.848922534962339e-05, + "loss": 5.4776, + "step": 18703 + }, + { + "epoch": 0.11123798648777239, + "grad_norm": 1.222609519958496, + "learning_rate": 4.8489065429874256e-05, + "loss": 5.5094, + "step": 18704 + }, + { + "epoch": 0.11124393377105338, + "grad_norm": 1.6955020427703857, + "learning_rate": 4.848890550192533e-05, + "loss": 5.0516, + "step": 18705 + }, + { + "epoch": 0.11124988105433438, + "grad_norm": 1.3875632286071777, + "learning_rate": 4.848874556577667e-05, + "loss": 5.5321, + "step": 18706 + }, + { + "epoch": 0.11125582833761538, + "grad_norm": 1.2538158893585205, + "learning_rate": 4.848858562142833e-05, + "loss": 5.464, + "step": 18707 + }, + { + "epoch": 0.11126177562089637, + "grad_norm": 1.7350475788116455, + "learning_rate": 4.8488425668880366e-05, + "loss": 5.2815, + "step": 18708 + }, + { + "epoch": 0.11126772290417737, + "grad_norm": 1.543989658355713, + "learning_rate": 4.848826570813284e-05, + "loss": 5.4817, + "step": 18709 + }, + { + "epoch": 0.11127367018745837, + "grad_norm": 1.3931440114974976, + "learning_rate": 4.8488105739185807e-05, + "loss": 5.7652, + "step": 18710 + }, + { + "epoch": 0.11127961747073936, + "grad_norm": 1.4630471467971802, + "learning_rate": 4.8487945762039314e-05, + "loss": 5.4886, + "step": 18711 + }, + { + "epoch": 0.11128556475402036, + "grad_norm": 1.338161826133728, + "learning_rate": 4.848778577669342e-05, + "loss": 5.2021, + "step": 18712 + }, + { + "epoch": 0.11129151203730137, + "grad_norm": 1.4282599687576294, + "learning_rate": 4.8487625783148186e-05, + "loss": 5.2767, + "step": 18713 + }, + { + "epoch": 0.11129745932058235, + "grad_norm": 1.4386523962020874, + "learning_rate": 4.848746578140366e-05, + "loss": 5.7286, + "step": 18714 + }, + { + "epoch": 0.11130340660386336, + "grad_norm": 1.2272754907608032, + "learning_rate": 4.84873057714599e-05, + "loss": 5.3609, + "step": 18715 + }, + { + "epoch": 0.11130935388714436, + "grad_norm": 1.8362592458724976, + "learning_rate": 4.848714575331697e-05, + "loss": 5.0494, + "step": 18716 + }, + { + "epoch": 0.11131530117042535, + "grad_norm": 2.098970651626587, + "learning_rate": 4.848698572697492e-05, + "loss": 4.8282, + "step": 18717 + }, + { + "epoch": 0.11132124845370635, + "grad_norm": 2.2145583629608154, + "learning_rate": 4.84868256924338e-05, + "loss": 4.4621, + "step": 18718 + }, + { + "epoch": 0.11132719573698735, + "grad_norm": 1.8036415576934814, + "learning_rate": 4.848666564969368e-05, + "loss": 5.374, + "step": 18719 + }, + { + "epoch": 0.11133314302026834, + "grad_norm": 1.5794750452041626, + "learning_rate": 4.8486505598754605e-05, + "loss": 5.6246, + "step": 18720 + }, + { + "epoch": 0.11133909030354934, + "grad_norm": 1.637068510055542, + "learning_rate": 4.848634553961664e-05, + "loss": 5.4506, + "step": 18721 + }, + { + "epoch": 0.11134503758683034, + "grad_norm": 1.6928807497024536, + "learning_rate": 4.8486185472279824e-05, + "loss": 5.2405, + "step": 18722 + }, + { + "epoch": 0.11135098487011133, + "grad_norm": 2.0931332111358643, + "learning_rate": 4.848602539674422e-05, + "loss": 4.9366, + "step": 18723 + }, + { + "epoch": 0.11135693215339233, + "grad_norm": 1.4645583629608154, + "learning_rate": 4.848586531300989e-05, + "loss": 5.0677, + "step": 18724 + }, + { + "epoch": 0.11136287943667333, + "grad_norm": 1.7817938327789307, + "learning_rate": 4.8485705221076896e-05, + "loss": 5.5975, + "step": 18725 + }, + { + "epoch": 0.11136882671995432, + "grad_norm": 1.7167946100234985, + "learning_rate": 4.848554512094528e-05, + "loss": 5.829, + "step": 18726 + }, + { + "epoch": 0.11137477400323532, + "grad_norm": 1.723574161529541, + "learning_rate": 4.8485385012615106e-05, + "loss": 5.2702, + "step": 18727 + }, + { + "epoch": 0.11138072128651631, + "grad_norm": 1.4848002195358276, + "learning_rate": 4.848522489608642e-05, + "loss": 5.6739, + "step": 18728 + }, + { + "epoch": 0.11138666856979731, + "grad_norm": 1.798085331916809, + "learning_rate": 4.848506477135929e-05, + "loss": 5.7314, + "step": 18729 + }, + { + "epoch": 0.11139261585307832, + "grad_norm": 1.7033846378326416, + "learning_rate": 4.848490463843376e-05, + "loss": 5.531, + "step": 18730 + }, + { + "epoch": 0.1113985631363593, + "grad_norm": 1.64686119556427, + "learning_rate": 4.8484744497309896e-05, + "loss": 5.8325, + "step": 18731 + }, + { + "epoch": 0.1114045104196403, + "grad_norm": 1.9923123121261597, + "learning_rate": 4.8484584347987755e-05, + "loss": 5.9614, + "step": 18732 + }, + { + "epoch": 0.11141045770292131, + "grad_norm": 1.768896460533142, + "learning_rate": 4.8484424190467385e-05, + "loss": 5.9892, + "step": 18733 + }, + { + "epoch": 0.1114164049862023, + "grad_norm": 1.5981477499008179, + "learning_rate": 4.848426402474885e-05, + "loss": 5.6239, + "step": 18734 + }, + { + "epoch": 0.1114223522694833, + "grad_norm": 1.8919446468353271, + "learning_rate": 4.848410385083219e-05, + "loss": 5.7437, + "step": 18735 + }, + { + "epoch": 0.1114282995527643, + "grad_norm": 2.2705752849578857, + "learning_rate": 4.848394366871748e-05, + "loss": 4.5999, + "step": 18736 + }, + { + "epoch": 0.11143424683604529, + "grad_norm": 1.8626762628555298, + "learning_rate": 4.848378347840476e-05, + "loss": 5.5706, + "step": 18737 + }, + { + "epoch": 0.11144019411932629, + "grad_norm": 1.5893161296844482, + "learning_rate": 4.84836232798941e-05, + "loss": 5.4011, + "step": 18738 + }, + { + "epoch": 0.1114461414026073, + "grad_norm": 1.3441518545150757, + "learning_rate": 4.8483463073185554e-05, + "loss": 5.2412, + "step": 18739 + }, + { + "epoch": 0.11145208868588828, + "grad_norm": 1.6281975507736206, + "learning_rate": 4.848330285827917e-05, + "loss": 5.4281, + "step": 18740 + }, + { + "epoch": 0.11145803596916928, + "grad_norm": 2.1942298412323, + "learning_rate": 4.8483142635175e-05, + "loss": 5.6202, + "step": 18741 + }, + { + "epoch": 0.11146398325245029, + "grad_norm": 2.086764097213745, + "learning_rate": 4.848298240387311e-05, + "loss": 5.665, + "step": 18742 + }, + { + "epoch": 0.11146993053573127, + "grad_norm": 2.0656285285949707, + "learning_rate": 4.848282216437356e-05, + "loss": 5.5196, + "step": 18743 + }, + { + "epoch": 0.11147587781901228, + "grad_norm": 1.5579513311386108, + "learning_rate": 4.84826619166764e-05, + "loss": 5.7366, + "step": 18744 + }, + { + "epoch": 0.11148182510229328, + "grad_norm": 1.7952065467834473, + "learning_rate": 4.848250166078168e-05, + "loss": 5.8041, + "step": 18745 + }, + { + "epoch": 0.11148777238557427, + "grad_norm": 1.3523657321929932, + "learning_rate": 4.848234139668947e-05, + "loss": 5.6628, + "step": 18746 + }, + { + "epoch": 0.11149371966885527, + "grad_norm": 1.6833933591842651, + "learning_rate": 4.848218112439981e-05, + "loss": 5.5285, + "step": 18747 + }, + { + "epoch": 0.11149966695213627, + "grad_norm": 1.308733344078064, + "learning_rate": 4.848202084391276e-05, + "loss": 5.9953, + "step": 18748 + }, + { + "epoch": 0.11150561423541726, + "grad_norm": 1.3434252738952637, + "learning_rate": 4.848186055522838e-05, + "loss": 5.8267, + "step": 18749 + }, + { + "epoch": 0.11151156151869826, + "grad_norm": 1.6250263452529907, + "learning_rate": 4.848170025834673e-05, + "loss": 4.964, + "step": 18750 + }, + { + "epoch": 0.11151750880197926, + "grad_norm": 1.4924334287643433, + "learning_rate": 4.848153995326786e-05, + "loss": 4.9072, + "step": 18751 + }, + { + "epoch": 0.11152345608526025, + "grad_norm": 1.5650702714920044, + "learning_rate": 4.8481379639991826e-05, + "loss": 5.8793, + "step": 18752 + }, + { + "epoch": 0.11152940336854125, + "grad_norm": 1.488553762435913, + "learning_rate": 4.848121931851868e-05, + "loss": 5.823, + "step": 18753 + }, + { + "epoch": 0.11153535065182225, + "grad_norm": 1.5356508493423462, + "learning_rate": 4.848105898884849e-05, + "loss": 5.7632, + "step": 18754 + }, + { + "epoch": 0.11154129793510324, + "grad_norm": 1.5389797687530518, + "learning_rate": 4.8480898650981296e-05, + "loss": 5.8662, + "step": 18755 + }, + { + "epoch": 0.11154724521838424, + "grad_norm": 1.3963713645935059, + "learning_rate": 4.848073830491717e-05, + "loss": 5.5647, + "step": 18756 + }, + { + "epoch": 0.11155319250166523, + "grad_norm": 1.3739324808120728, + "learning_rate": 4.848057795065617e-05, + "loss": 5.6686, + "step": 18757 + }, + { + "epoch": 0.11155913978494623, + "grad_norm": 1.2932708263397217, + "learning_rate": 4.848041758819833e-05, + "loss": 5.6567, + "step": 18758 + }, + { + "epoch": 0.11156508706822724, + "grad_norm": 1.3388581275939941, + "learning_rate": 4.848025721754372e-05, + "loss": 5.6111, + "step": 18759 + }, + { + "epoch": 0.11157103435150822, + "grad_norm": 1.28604257106781, + "learning_rate": 4.84800968386924e-05, + "loss": 5.633, + "step": 18760 + }, + { + "epoch": 0.11157698163478923, + "grad_norm": 2.0710771083831787, + "learning_rate": 4.847993645164441e-05, + "loss": 5.1686, + "step": 18761 + }, + { + "epoch": 0.11158292891807023, + "grad_norm": 1.8022092580795288, + "learning_rate": 4.847977605639983e-05, + "loss": 5.6373, + "step": 18762 + }, + { + "epoch": 0.11158887620135122, + "grad_norm": 1.7080397605895996, + "learning_rate": 4.84796156529587e-05, + "loss": 5.5389, + "step": 18763 + }, + { + "epoch": 0.11159482348463222, + "grad_norm": 1.3582305908203125, + "learning_rate": 4.847945524132107e-05, + "loss": 5.5574, + "step": 18764 + }, + { + "epoch": 0.11160077076791322, + "grad_norm": 1.9037936925888062, + "learning_rate": 4.8479294821487015e-05, + "loss": 5.2108, + "step": 18765 + }, + { + "epoch": 0.11160671805119421, + "grad_norm": 1.6884709596633911, + "learning_rate": 4.8479134393456576e-05, + "loss": 5.2462, + "step": 18766 + }, + { + "epoch": 0.11161266533447521, + "grad_norm": 1.720261812210083, + "learning_rate": 4.8478973957229813e-05, + "loss": 5.5132, + "step": 18767 + }, + { + "epoch": 0.11161861261775621, + "grad_norm": 2.1769275665283203, + "learning_rate": 4.847881351280679e-05, + "loss": 5.1169, + "step": 18768 + }, + { + "epoch": 0.1116245599010372, + "grad_norm": 1.8593683242797852, + "learning_rate": 4.847865306018754e-05, + "loss": 4.8812, + "step": 18769 + }, + { + "epoch": 0.1116305071843182, + "grad_norm": 1.9496150016784668, + "learning_rate": 4.8478492599372147e-05, + "loss": 4.8244, + "step": 18770 + }, + { + "epoch": 0.1116364544675992, + "grad_norm": 1.584330677986145, + "learning_rate": 4.8478332130360655e-05, + "loss": 4.769, + "step": 18771 + }, + { + "epoch": 0.1116424017508802, + "grad_norm": 1.5987087488174438, + "learning_rate": 4.8478171653153116e-05, + "loss": 4.8385, + "step": 18772 + }, + { + "epoch": 0.1116483490341612, + "grad_norm": 1.919463038444519, + "learning_rate": 4.847801116774959e-05, + "loss": 4.7365, + "step": 18773 + }, + { + "epoch": 0.1116542963174422, + "grad_norm": 1.8708561658859253, + "learning_rate": 4.847785067415014e-05, + "loss": 4.9067, + "step": 18774 + }, + { + "epoch": 0.11166024360072319, + "grad_norm": 1.778316617012024, + "learning_rate": 4.8477690172354804e-05, + "loss": 4.8213, + "step": 18775 + }, + { + "epoch": 0.11166619088400419, + "grad_norm": 1.7170525789260864, + "learning_rate": 4.8477529662363655e-05, + "loss": 4.7115, + "step": 18776 + }, + { + "epoch": 0.11167213816728519, + "grad_norm": 1.6704293489456177, + "learning_rate": 4.847736914417674e-05, + "loss": 4.5814, + "step": 18777 + }, + { + "epoch": 0.11167808545056618, + "grad_norm": 1.7422312498092651, + "learning_rate": 4.847720861779412e-05, + "loss": 4.6206, + "step": 18778 + }, + { + "epoch": 0.11168403273384718, + "grad_norm": 1.7162894010543823, + "learning_rate": 4.8477048083215845e-05, + "loss": 4.6421, + "step": 18779 + }, + { + "epoch": 0.11168998001712818, + "grad_norm": 1.7825870513916016, + "learning_rate": 4.847688754044199e-05, + "loss": 4.6899, + "step": 18780 + }, + { + "epoch": 0.11169592730040917, + "grad_norm": 1.8103221654891968, + "learning_rate": 4.8476726989472577e-05, + "loss": 4.5619, + "step": 18781 + }, + { + "epoch": 0.11170187458369017, + "grad_norm": 1.8276532888412476, + "learning_rate": 4.847656643030769e-05, + "loss": 4.3429, + "step": 18782 + }, + { + "epoch": 0.11170782186697117, + "grad_norm": 1.7625696659088135, + "learning_rate": 4.847640586294737e-05, + "loss": 4.4154, + "step": 18783 + }, + { + "epoch": 0.11171376915025216, + "grad_norm": 1.842450499534607, + "learning_rate": 4.8476245287391684e-05, + "loss": 4.6279, + "step": 18784 + }, + { + "epoch": 0.11171971643353316, + "grad_norm": 1.879961371421814, + "learning_rate": 4.847608470364069e-05, + "loss": 4.4906, + "step": 18785 + }, + { + "epoch": 0.11172566371681415, + "grad_norm": 1.5556871891021729, + "learning_rate": 4.847592411169443e-05, + "loss": 5.0258, + "step": 18786 + }, + { + "epoch": 0.11173161100009515, + "grad_norm": 1.8000839948654175, + "learning_rate": 4.8475763511552965e-05, + "loss": 4.4746, + "step": 18787 + }, + { + "epoch": 0.11173755828337616, + "grad_norm": 1.4234516620635986, + "learning_rate": 4.847560290321636e-05, + "loss": 5.4744, + "step": 18788 + }, + { + "epoch": 0.11174350556665714, + "grad_norm": 1.5717182159423828, + "learning_rate": 4.847544228668466e-05, + "loss": 5.4368, + "step": 18789 + }, + { + "epoch": 0.11174945284993815, + "grad_norm": 1.3514728546142578, + "learning_rate": 4.847528166195793e-05, + "loss": 5.3036, + "step": 18790 + }, + { + "epoch": 0.11175540013321915, + "grad_norm": 1.4620373249053955, + "learning_rate": 4.847512102903621e-05, + "loss": 5.2206, + "step": 18791 + }, + { + "epoch": 0.11176134741650014, + "grad_norm": 1.3034706115722656, + "learning_rate": 4.847496038791958e-05, + "loss": 5.3359, + "step": 18792 + }, + { + "epoch": 0.11176729469978114, + "grad_norm": 1.599876046180725, + "learning_rate": 4.847479973860808e-05, + "loss": 5.1282, + "step": 18793 + }, + { + "epoch": 0.11177324198306214, + "grad_norm": 1.4783935546875, + "learning_rate": 4.847463908110177e-05, + "loss": 5.1958, + "step": 18794 + }, + { + "epoch": 0.11177918926634313, + "grad_norm": 1.5132538080215454, + "learning_rate": 4.84744784154007e-05, + "loss": 5.0166, + "step": 18795 + }, + { + "epoch": 0.11178513654962413, + "grad_norm": 1.9335131645202637, + "learning_rate": 4.847431774150495e-05, + "loss": 4.8899, + "step": 18796 + }, + { + "epoch": 0.11179108383290513, + "grad_norm": 1.5765737295150757, + "learning_rate": 4.847415705941454e-05, + "loss": 5.2848, + "step": 18797 + }, + { + "epoch": 0.11179703111618612, + "grad_norm": 1.7239350080490112, + "learning_rate": 4.847399636912955e-05, + "loss": 5.0606, + "step": 18798 + }, + { + "epoch": 0.11180297839946712, + "grad_norm": 1.5246455669403076, + "learning_rate": 4.847383567065004e-05, + "loss": 5.0829, + "step": 18799 + }, + { + "epoch": 0.11180892568274813, + "grad_norm": 1.3902997970581055, + "learning_rate": 4.847367496397604e-05, + "loss": 5.2729, + "step": 18800 + }, + { + "epoch": 0.11181487296602911, + "grad_norm": 1.426282286643982, + "learning_rate": 4.8473514249107634e-05, + "loss": 5.2259, + "step": 18801 + }, + { + "epoch": 0.11182082024931012, + "grad_norm": 1.4425853490829468, + "learning_rate": 4.847335352604486e-05, + "loss": 4.923, + "step": 18802 + }, + { + "epoch": 0.11182676753259112, + "grad_norm": 1.26097571849823, + "learning_rate": 4.8473192794787786e-05, + "loss": 4.9122, + "step": 18803 + }, + { + "epoch": 0.1118327148158721, + "grad_norm": 1.4102699756622314, + "learning_rate": 4.847303205533646e-05, + "loss": 4.9641, + "step": 18804 + }, + { + "epoch": 0.11183866209915311, + "grad_norm": 1.3965771198272705, + "learning_rate": 4.847287130769094e-05, + "loss": 4.9832, + "step": 18805 + }, + { + "epoch": 0.11184460938243411, + "grad_norm": 1.3588200807571411, + "learning_rate": 4.8472710551851284e-05, + "loss": 5.0502, + "step": 18806 + }, + { + "epoch": 0.1118505566657151, + "grad_norm": 1.394020676612854, + "learning_rate": 4.847254978781755e-05, + "loss": 4.9699, + "step": 18807 + }, + { + "epoch": 0.1118565039489961, + "grad_norm": 1.4548087120056152, + "learning_rate": 4.8472389015589794e-05, + "loss": 4.9112, + "step": 18808 + }, + { + "epoch": 0.1118624512322771, + "grad_norm": 1.4359081983566284, + "learning_rate": 4.847222823516806e-05, + "loss": 4.9284, + "step": 18809 + }, + { + "epoch": 0.11186839851555809, + "grad_norm": 1.3159685134887695, + "learning_rate": 4.847206744655242e-05, + "loss": 4.9661, + "step": 18810 + }, + { + "epoch": 0.11187434579883909, + "grad_norm": 1.5037652254104614, + "learning_rate": 4.847190664974292e-05, + "loss": 5.0318, + "step": 18811 + }, + { + "epoch": 0.1118802930821201, + "grad_norm": 1.7603816986083984, + "learning_rate": 4.8471745844739624e-05, + "loss": 5.0486, + "step": 18812 + }, + { + "epoch": 0.11188624036540108, + "grad_norm": 1.6205053329467773, + "learning_rate": 4.847158503154259e-05, + "loss": 5.0587, + "step": 18813 + }, + { + "epoch": 0.11189218764868208, + "grad_norm": 1.559334635734558, + "learning_rate": 4.847142421015185e-05, + "loss": 5.1514, + "step": 18814 + }, + { + "epoch": 0.11189813493196307, + "grad_norm": 1.4896910190582275, + "learning_rate": 4.8471263380567495e-05, + "loss": 5.2103, + "step": 18815 + }, + { + "epoch": 0.11190408221524407, + "grad_norm": 1.43007493019104, + "learning_rate": 4.847110254278956e-05, + "loss": 5.0152, + "step": 18816 + }, + { + "epoch": 0.11191002949852508, + "grad_norm": 1.3567081689834595, + "learning_rate": 4.84709416968181e-05, + "loss": 4.7193, + "step": 18817 + }, + { + "epoch": 0.11191597678180606, + "grad_norm": 1.3283864259719849, + "learning_rate": 4.8470780842653186e-05, + "loss": 4.8559, + "step": 18818 + }, + { + "epoch": 0.11192192406508707, + "grad_norm": 1.5427826642990112, + "learning_rate": 4.8470619980294854e-05, + "loss": 5.1406, + "step": 18819 + }, + { + "epoch": 0.11192787134836807, + "grad_norm": 1.4549115896224976, + "learning_rate": 4.847045910974318e-05, + "loss": 5.0377, + "step": 18820 + }, + { + "epoch": 0.11193381863164906, + "grad_norm": 1.3822715282440186, + "learning_rate": 4.84702982309982e-05, + "loss": 4.9279, + "step": 18821 + }, + { + "epoch": 0.11193976591493006, + "grad_norm": 1.290756106376648, + "learning_rate": 4.8470137344059996e-05, + "loss": 4.9631, + "step": 18822 + }, + { + "epoch": 0.11194571319821106, + "grad_norm": 1.8070625066757202, + "learning_rate": 4.84699764489286e-05, + "loss": 5.0103, + "step": 18823 + }, + { + "epoch": 0.11195166048149205, + "grad_norm": 1.6692131757736206, + "learning_rate": 4.846981554560408e-05, + "loss": 5.1265, + "step": 18824 + }, + { + "epoch": 0.11195760776477305, + "grad_norm": 1.7644426822662354, + "learning_rate": 4.8469654634086495e-05, + "loss": 5.0712, + "step": 18825 + }, + { + "epoch": 0.11196355504805405, + "grad_norm": 1.5689074993133545, + "learning_rate": 4.8469493714375893e-05, + "loss": 5.0551, + "step": 18826 + }, + { + "epoch": 0.11196950233133504, + "grad_norm": 1.610300064086914, + "learning_rate": 4.846933278647233e-05, + "loss": 5.0746, + "step": 18827 + }, + { + "epoch": 0.11197544961461604, + "grad_norm": 1.2828009128570557, + "learning_rate": 4.846917185037586e-05, + "loss": 5.0645, + "step": 18828 + }, + { + "epoch": 0.11198139689789705, + "grad_norm": 1.386265516281128, + "learning_rate": 4.846901090608655e-05, + "loss": 5.1885, + "step": 18829 + }, + { + "epoch": 0.11198734418117803, + "grad_norm": 1.446359634399414, + "learning_rate": 4.846884995360446e-05, + "loss": 5.3245, + "step": 18830 + }, + { + "epoch": 0.11199329146445904, + "grad_norm": 1.4347827434539795, + "learning_rate": 4.846868899292962e-05, + "loss": 5.379, + "step": 18831 + }, + { + "epoch": 0.11199923874774004, + "grad_norm": 1.7589528560638428, + "learning_rate": 4.846852802406212e-05, + "loss": 5.2726, + "step": 18832 + }, + { + "epoch": 0.11200518603102103, + "grad_norm": 1.4316980838775635, + "learning_rate": 4.846836704700199e-05, + "loss": 5.5424, + "step": 18833 + }, + { + "epoch": 0.11201113331430203, + "grad_norm": 1.202364444732666, + "learning_rate": 4.84682060617493e-05, + "loss": 5.4271, + "step": 18834 + }, + { + "epoch": 0.11201708059758303, + "grad_norm": 1.282231330871582, + "learning_rate": 4.8468045068304094e-05, + "loss": 5.4895, + "step": 18835 + }, + { + "epoch": 0.11202302788086402, + "grad_norm": 1.8428497314453125, + "learning_rate": 4.846788406666644e-05, + "loss": 4.9924, + "step": 18836 + }, + { + "epoch": 0.11202897516414502, + "grad_norm": 1.8442119359970093, + "learning_rate": 4.846772305683639e-05, + "loss": 4.6735, + "step": 18837 + }, + { + "epoch": 0.11203492244742602, + "grad_norm": 1.7083659172058105, + "learning_rate": 4.846756203881401e-05, + "loss": 4.8064, + "step": 18838 + }, + { + "epoch": 0.11204086973070701, + "grad_norm": 1.5663195848464966, + "learning_rate": 4.8467401012599336e-05, + "loss": 5.095, + "step": 18839 + }, + { + "epoch": 0.11204681701398801, + "grad_norm": 1.7466095685958862, + "learning_rate": 4.846723997819244e-05, + "loss": 4.7633, + "step": 18840 + }, + { + "epoch": 0.11205276429726901, + "grad_norm": 1.73336660861969, + "learning_rate": 4.846707893559336e-05, + "loss": 4.8776, + "step": 18841 + }, + { + "epoch": 0.11205871158055, + "grad_norm": 1.726456880569458, + "learning_rate": 4.8466917884802175e-05, + "loss": 4.845, + "step": 18842 + }, + { + "epoch": 0.112064658863831, + "grad_norm": 1.733583927154541, + "learning_rate": 4.8466756825818934e-05, + "loss": 4.8272, + "step": 18843 + }, + { + "epoch": 0.11207060614711199, + "grad_norm": 1.8252346515655518, + "learning_rate": 4.8466595758643684e-05, + "loss": 4.7088, + "step": 18844 + }, + { + "epoch": 0.112076553430393, + "grad_norm": 1.6071163415908813, + "learning_rate": 4.8466434683276495e-05, + "loss": 4.7085, + "step": 18845 + }, + { + "epoch": 0.112082500713674, + "grad_norm": 1.8407503366470337, + "learning_rate": 4.846627359971741e-05, + "loss": 4.6885, + "step": 18846 + }, + { + "epoch": 0.11208844799695498, + "grad_norm": 1.5426356792449951, + "learning_rate": 4.84661125079665e-05, + "loss": 4.7252, + "step": 18847 + }, + { + "epoch": 0.11209439528023599, + "grad_norm": 1.8290139436721802, + "learning_rate": 4.84659514080238e-05, + "loss": 4.9314, + "step": 18848 + }, + { + "epoch": 0.11210034256351699, + "grad_norm": 1.73724365234375, + "learning_rate": 4.846579029988939e-05, + "loss": 4.7618, + "step": 18849 + }, + { + "epoch": 0.11210628984679798, + "grad_norm": 2.0577304363250732, + "learning_rate": 4.8465629183563314e-05, + "loss": 4.8118, + "step": 18850 + }, + { + "epoch": 0.11211223713007898, + "grad_norm": 1.8696433305740356, + "learning_rate": 4.846546805904562e-05, + "loss": 4.6813, + "step": 18851 + }, + { + "epoch": 0.11211818441335998, + "grad_norm": 1.6597977876663208, + "learning_rate": 4.846530692633638e-05, + "loss": 4.5187, + "step": 18852 + }, + { + "epoch": 0.11212413169664097, + "grad_norm": 1.6595630645751953, + "learning_rate": 4.846514578543564e-05, + "loss": 5.012, + "step": 18853 + }, + { + "epoch": 0.11213007897992197, + "grad_norm": 2.2116329669952393, + "learning_rate": 4.846498463634347e-05, + "loss": 5.1757, + "step": 18854 + }, + { + "epoch": 0.11213602626320297, + "grad_norm": 1.8592875003814697, + "learning_rate": 4.846482347905991e-05, + "loss": 6.0403, + "step": 18855 + }, + { + "epoch": 0.11214197354648396, + "grad_norm": 1.7812080383300781, + "learning_rate": 4.846466231358502e-05, + "loss": 5.974, + "step": 18856 + }, + { + "epoch": 0.11214792082976496, + "grad_norm": 1.8986600637435913, + "learning_rate": 4.846450113991886e-05, + "loss": 5.3866, + "step": 18857 + }, + { + "epoch": 0.11215386811304597, + "grad_norm": 2.4542179107666016, + "learning_rate": 4.846433995806148e-05, + "loss": 4.863, + "step": 18858 + }, + { + "epoch": 0.11215981539632695, + "grad_norm": 2.1604816913604736, + "learning_rate": 4.846417876801295e-05, + "loss": 5.219, + "step": 18859 + }, + { + "epoch": 0.11216576267960796, + "grad_norm": 2.325782060623169, + "learning_rate": 4.846401756977331e-05, + "loss": 5.1454, + "step": 18860 + }, + { + "epoch": 0.11217170996288896, + "grad_norm": 2.3508334159851074, + "learning_rate": 4.846385636334263e-05, + "loss": 5.1318, + "step": 18861 + }, + { + "epoch": 0.11217765724616995, + "grad_norm": 2.2381060123443604, + "learning_rate": 4.846369514872096e-05, + "loss": 5.0676, + "step": 18862 + }, + { + "epoch": 0.11218360452945095, + "grad_norm": 2.3624770641326904, + "learning_rate": 4.8463533925908355e-05, + "loss": 5.0251, + "step": 18863 + }, + { + "epoch": 0.11218955181273195, + "grad_norm": 1.9950919151306152, + "learning_rate": 4.846337269490487e-05, + "loss": 5.0396, + "step": 18864 + }, + { + "epoch": 0.11219549909601294, + "grad_norm": 1.829410433769226, + "learning_rate": 4.8463211455710574e-05, + "loss": 4.9327, + "step": 18865 + }, + { + "epoch": 0.11220144637929394, + "grad_norm": 1.8879605531692505, + "learning_rate": 4.846305020832551e-05, + "loss": 4.8902, + "step": 18866 + }, + { + "epoch": 0.11220739366257494, + "grad_norm": 1.89055335521698, + "learning_rate": 4.846288895274973e-05, + "loss": 4.9219, + "step": 18867 + }, + { + "epoch": 0.11221334094585593, + "grad_norm": 2.224971055984497, + "learning_rate": 4.84627276889833e-05, + "loss": 5.0164, + "step": 18868 + }, + { + "epoch": 0.11221928822913693, + "grad_norm": 2.1675336360931396, + "learning_rate": 4.8462566417026276e-05, + "loss": 5.0082, + "step": 18869 + }, + { + "epoch": 0.11222523551241793, + "grad_norm": 1.885236144065857, + "learning_rate": 4.8462405136878714e-05, + "loss": 5.1484, + "step": 18870 + }, + { + "epoch": 0.11223118279569892, + "grad_norm": 1.3037774562835693, + "learning_rate": 4.846224384854067e-05, + "loss": 5.64, + "step": 18871 + }, + { + "epoch": 0.11223713007897992, + "grad_norm": 1.6506762504577637, + "learning_rate": 4.846208255201219e-05, + "loss": 5.6067, + "step": 18872 + }, + { + "epoch": 0.11224307736226091, + "grad_norm": 1.4294368028640747, + "learning_rate": 4.8461921247293344e-05, + "loss": 5.67, + "step": 18873 + }, + { + "epoch": 0.11224902464554191, + "grad_norm": 1.6201854944229126, + "learning_rate": 4.846175993438419e-05, + "loss": 5.6093, + "step": 18874 + }, + { + "epoch": 0.11225497192882292, + "grad_norm": 1.5683603286743164, + "learning_rate": 4.846159861328478e-05, + "loss": 5.6129, + "step": 18875 + }, + { + "epoch": 0.1122609192121039, + "grad_norm": 1.5446193218231201, + "learning_rate": 4.8461437283995156e-05, + "loss": 5.6063, + "step": 18876 + }, + { + "epoch": 0.1122668664953849, + "grad_norm": 1.477872371673584, + "learning_rate": 4.846127594651539e-05, + "loss": 5.6291, + "step": 18877 + }, + { + "epoch": 0.11227281377866591, + "grad_norm": 1.477872371673584, + "learning_rate": 4.846111460084554e-05, + "loss": 5.6282, + "step": 18878 + }, + { + "epoch": 0.1122787610619469, + "grad_norm": 1.4379156827926636, + "learning_rate": 4.846095324698565e-05, + "loss": 5.5451, + "step": 18879 + }, + { + "epoch": 0.1122847083452279, + "grad_norm": 1.4940646886825562, + "learning_rate": 4.8460791884935785e-05, + "loss": 5.4705, + "step": 18880 + }, + { + "epoch": 0.1122906556285089, + "grad_norm": 1.4625567197799683, + "learning_rate": 4.8460630514696e-05, + "loss": 5.5428, + "step": 18881 + }, + { + "epoch": 0.11229660291178989, + "grad_norm": 1.7899153232574463, + "learning_rate": 4.846046913626636e-05, + "loss": 5.7665, + "step": 18882 + }, + { + "epoch": 0.11230255019507089, + "grad_norm": 2.1002516746520996, + "learning_rate": 4.8460307749646906e-05, + "loss": 6.1132, + "step": 18883 + }, + { + "epoch": 0.11230849747835189, + "grad_norm": 1.8406580686569214, + "learning_rate": 4.84601463548377e-05, + "loss": 5.5207, + "step": 18884 + }, + { + "epoch": 0.11231444476163288, + "grad_norm": 1.6287425756454468, + "learning_rate": 4.84599849518388e-05, + "loss": 5.931, + "step": 18885 + }, + { + "epoch": 0.11232039204491388, + "grad_norm": 1.4447002410888672, + "learning_rate": 4.845982354065027e-05, + "loss": 5.6181, + "step": 18886 + }, + { + "epoch": 0.11232633932819489, + "grad_norm": 1.6555171012878418, + "learning_rate": 4.845966212127215e-05, + "loss": 5.1448, + "step": 18887 + }, + { + "epoch": 0.11233228661147587, + "grad_norm": 2.0948448181152344, + "learning_rate": 4.84595006937045e-05, + "loss": 5.3695, + "step": 18888 + }, + { + "epoch": 0.11233823389475688, + "grad_norm": 1.6369346380233765, + "learning_rate": 4.845933925794739e-05, + "loss": 5.5859, + "step": 18889 + }, + { + "epoch": 0.11234418117803788, + "grad_norm": 1.4660474061965942, + "learning_rate": 4.845917781400086e-05, + "loss": 5.6121, + "step": 18890 + }, + { + "epoch": 0.11235012846131887, + "grad_norm": 1.6739449501037598, + "learning_rate": 4.845901636186497e-05, + "loss": 5.6874, + "step": 18891 + }, + { + "epoch": 0.11235607574459987, + "grad_norm": 1.4542694091796875, + "learning_rate": 4.8458854901539794e-05, + "loss": 5.5956, + "step": 18892 + }, + { + "epoch": 0.11236202302788087, + "grad_norm": 1.3305023908615112, + "learning_rate": 4.8458693433025365e-05, + "loss": 5.658, + "step": 18893 + }, + { + "epoch": 0.11236797031116186, + "grad_norm": 1.8081300258636475, + "learning_rate": 4.845853195632175e-05, + "loss": 4.8563, + "step": 18894 + }, + { + "epoch": 0.11237391759444286, + "grad_norm": 1.8959764242172241, + "learning_rate": 4.8458370471429e-05, + "loss": 5.3051, + "step": 18895 + }, + { + "epoch": 0.11237986487772386, + "grad_norm": 1.9471427202224731, + "learning_rate": 4.845820897834718e-05, + "loss": 5.8181, + "step": 18896 + }, + { + "epoch": 0.11238581216100485, + "grad_norm": 1.6311548948287964, + "learning_rate": 4.845804747707634e-05, + "loss": 5.7714, + "step": 18897 + }, + { + "epoch": 0.11239175944428585, + "grad_norm": 1.830788493156433, + "learning_rate": 4.845788596761653e-05, + "loss": 5.9535, + "step": 18898 + }, + { + "epoch": 0.11239770672756685, + "grad_norm": 1.7896127700805664, + "learning_rate": 4.8457724449967836e-05, + "loss": 5.5385, + "step": 18899 + }, + { + "epoch": 0.11240365401084784, + "grad_norm": 1.5098718404769897, + "learning_rate": 4.845756292413027e-05, + "loss": 5.4067, + "step": 18900 + }, + { + "epoch": 0.11240960129412884, + "grad_norm": 1.9224756956100464, + "learning_rate": 4.845740139010392e-05, + "loss": 5.4863, + "step": 18901 + }, + { + "epoch": 0.11241554857740983, + "grad_norm": 2.1158740520477295, + "learning_rate": 4.845723984788884e-05, + "loss": 5.0745, + "step": 18902 + }, + { + "epoch": 0.11242149586069083, + "grad_norm": 2.292292594909668, + "learning_rate": 4.845707829748507e-05, + "loss": 4.9248, + "step": 18903 + }, + { + "epoch": 0.11242744314397184, + "grad_norm": 2.312593698501587, + "learning_rate": 4.8456916738892675e-05, + "loss": 4.9712, + "step": 18904 + }, + { + "epoch": 0.11243339042725282, + "grad_norm": 1.7302945852279663, + "learning_rate": 4.8456755172111725e-05, + "loss": 5.0814, + "step": 18905 + }, + { + "epoch": 0.11243933771053383, + "grad_norm": 1.3441206216812134, + "learning_rate": 4.845659359714225e-05, + "loss": 5.6563, + "step": 18906 + }, + { + "epoch": 0.11244528499381483, + "grad_norm": 1.5126272439956665, + "learning_rate": 4.845643201398433e-05, + "loss": 5.607, + "step": 18907 + }, + { + "epoch": 0.11245123227709582, + "grad_norm": 1.438795804977417, + "learning_rate": 4.845627042263801e-05, + "loss": 5.5287, + "step": 18908 + }, + { + "epoch": 0.11245717956037682, + "grad_norm": 1.6724447011947632, + "learning_rate": 4.845610882310335e-05, + "loss": 5.361, + "step": 18909 + }, + { + "epoch": 0.11246312684365782, + "grad_norm": 1.7267217636108398, + "learning_rate": 4.845594721538041e-05, + "loss": 5.6361, + "step": 18910 + }, + { + "epoch": 0.11246907412693881, + "grad_norm": 1.7616380453109741, + "learning_rate": 4.845578559946923e-05, + "loss": 5.2538, + "step": 18911 + }, + { + "epoch": 0.11247502141021981, + "grad_norm": 1.8318467140197754, + "learning_rate": 4.845562397536988e-05, + "loss": 4.8236, + "step": 18912 + }, + { + "epoch": 0.11248096869350081, + "grad_norm": 2.4882378578186035, + "learning_rate": 4.8455462343082415e-05, + "loss": 4.5624, + "step": 18913 + }, + { + "epoch": 0.1124869159767818, + "grad_norm": 2.5109870433807373, + "learning_rate": 4.845530070260689e-05, + "loss": 4.7906, + "step": 18914 + }, + { + "epoch": 0.1124928632600628, + "grad_norm": 2.2084672451019287, + "learning_rate": 4.845513905394336e-05, + "loss": 4.5304, + "step": 18915 + }, + { + "epoch": 0.1124988105433438, + "grad_norm": 2.4276058673858643, + "learning_rate": 4.8454977397091885e-05, + "loss": 4.3753, + "step": 18916 + }, + { + "epoch": 0.1125047578266248, + "grad_norm": 2.5022165775299072, + "learning_rate": 4.845481573205252e-05, + "loss": 4.1849, + "step": 18917 + }, + { + "epoch": 0.1125107051099058, + "grad_norm": 2.511643171310425, + "learning_rate": 4.845465405882532e-05, + "loss": 4.4007, + "step": 18918 + }, + { + "epoch": 0.1125166523931868, + "grad_norm": 2.598860263824463, + "learning_rate": 4.845449237741034e-05, + "loss": 4.6015, + "step": 18919 + }, + { + "epoch": 0.11252259967646779, + "grad_norm": 2.339555263519287, + "learning_rate": 4.845433068780765e-05, + "loss": 4.4123, + "step": 18920 + }, + { + "epoch": 0.11252854695974879, + "grad_norm": 2.286858320236206, + "learning_rate": 4.845416899001729e-05, + "loss": 4.3709, + "step": 18921 + }, + { + "epoch": 0.11253449424302979, + "grad_norm": 2.431622266769409, + "learning_rate": 4.845400728403932e-05, + "loss": 4.2162, + "step": 18922 + }, + { + "epoch": 0.11254044152631078, + "grad_norm": 2.7147364616394043, + "learning_rate": 4.8453845569873796e-05, + "loss": 4.3949, + "step": 18923 + }, + { + "epoch": 0.11254638880959178, + "grad_norm": 2.4738264083862305, + "learning_rate": 4.8453683847520784e-05, + "loss": 4.2671, + "step": 18924 + }, + { + "epoch": 0.11255233609287278, + "grad_norm": 2.007298707962036, + "learning_rate": 4.8453522116980325e-05, + "loss": 4.9317, + "step": 18925 + }, + { + "epoch": 0.11255828337615377, + "grad_norm": 1.8057860136032104, + "learning_rate": 4.8453360378252486e-05, + "loss": 5.4763, + "step": 18926 + }, + { + "epoch": 0.11256423065943477, + "grad_norm": 1.913892149925232, + "learning_rate": 4.845319863133733e-05, + "loss": 5.3112, + "step": 18927 + }, + { + "epoch": 0.11257017794271577, + "grad_norm": 1.6226540803909302, + "learning_rate": 4.845303687623489e-05, + "loss": 5.7164, + "step": 18928 + }, + { + "epoch": 0.11257612522599676, + "grad_norm": 1.7885600328445435, + "learning_rate": 4.8452875112945253e-05, + "loss": 5.7746, + "step": 18929 + }, + { + "epoch": 0.11258207250927776, + "grad_norm": 1.5598177909851074, + "learning_rate": 4.8452713341468444e-05, + "loss": 5.7843, + "step": 18930 + }, + { + "epoch": 0.11258801979255875, + "grad_norm": 1.517059564590454, + "learning_rate": 4.845255156180455e-05, + "loss": 5.7777, + "step": 18931 + }, + { + "epoch": 0.11259396707583975, + "grad_norm": 1.2515442371368408, + "learning_rate": 4.84523897739536e-05, + "loss": 5.7443, + "step": 18932 + }, + { + "epoch": 0.11259991435912076, + "grad_norm": 1.4970554113388062, + "learning_rate": 4.845222797791566e-05, + "loss": 5.6157, + "step": 18933 + }, + { + "epoch": 0.11260586164240174, + "grad_norm": 1.632620930671692, + "learning_rate": 4.8452066173690804e-05, + "loss": 5.0715, + "step": 18934 + }, + { + "epoch": 0.11261180892568275, + "grad_norm": 1.9634324312210083, + "learning_rate": 4.845190436127907e-05, + "loss": 5.3624, + "step": 18935 + }, + { + "epoch": 0.11261775620896375, + "grad_norm": 1.663560152053833, + "learning_rate": 4.8451742540680514e-05, + "loss": 5.4324, + "step": 18936 + }, + { + "epoch": 0.11262370349224474, + "grad_norm": 1.560684323310852, + "learning_rate": 4.84515807118952e-05, + "loss": 4.8426, + "step": 18937 + }, + { + "epoch": 0.11262965077552574, + "grad_norm": 1.5759334564208984, + "learning_rate": 4.8451418874923185e-05, + "loss": 5.6239, + "step": 18938 + }, + { + "epoch": 0.11263559805880674, + "grad_norm": 1.8501811027526855, + "learning_rate": 4.8451257029764504e-05, + "loss": 5.1734, + "step": 18939 + }, + { + "epoch": 0.11264154534208773, + "grad_norm": 1.811924934387207, + "learning_rate": 4.845109517641925e-05, + "loss": 5.2778, + "step": 18940 + }, + { + "epoch": 0.11264749262536873, + "grad_norm": 1.9684933423995972, + "learning_rate": 4.845093331488746e-05, + "loss": 5.3673, + "step": 18941 + }, + { + "epoch": 0.11265343990864973, + "grad_norm": 2.1155457496643066, + "learning_rate": 4.8450771445169185e-05, + "loss": 4.6955, + "step": 18942 + }, + { + "epoch": 0.11265938719193072, + "grad_norm": 2.117941379547119, + "learning_rate": 4.8450609567264495e-05, + "loss": 4.4051, + "step": 18943 + }, + { + "epoch": 0.11266533447521172, + "grad_norm": 1.9649946689605713, + "learning_rate": 4.845044768117343e-05, + "loss": 5.0204, + "step": 18944 + }, + { + "epoch": 0.11267128175849273, + "grad_norm": 1.898119568824768, + "learning_rate": 4.845028578689606e-05, + "loss": 4.9994, + "step": 18945 + }, + { + "epoch": 0.11267722904177371, + "grad_norm": 2.4376771450042725, + "learning_rate": 4.845012388443244e-05, + "loss": 4.6852, + "step": 18946 + }, + { + "epoch": 0.11268317632505472, + "grad_norm": 2.593094825744629, + "learning_rate": 4.844996197378262e-05, + "loss": 4.3845, + "step": 18947 + }, + { + "epoch": 0.11268912360833572, + "grad_norm": 2.6004302501678467, + "learning_rate": 4.844980005494666e-05, + "loss": 4.2989, + "step": 18948 + }, + { + "epoch": 0.1126950708916167, + "grad_norm": 2.4045653343200684, + "learning_rate": 4.844963812792462e-05, + "loss": 4.411, + "step": 18949 + }, + { + "epoch": 0.11270101817489771, + "grad_norm": 2.2256572246551514, + "learning_rate": 4.8449476192716555e-05, + "loss": 4.423, + "step": 18950 + }, + { + "epoch": 0.11270696545817871, + "grad_norm": 2.110077142715454, + "learning_rate": 4.844931424932252e-05, + "loss": 4.2971, + "step": 18951 + }, + { + "epoch": 0.1127129127414597, + "grad_norm": 1.8960111141204834, + "learning_rate": 4.844915229774257e-05, + "loss": 5.0758, + "step": 18952 + }, + { + "epoch": 0.1127188600247407, + "grad_norm": 1.998542308807373, + "learning_rate": 4.844899033797676e-05, + "loss": 4.8565, + "step": 18953 + }, + { + "epoch": 0.1127248073080217, + "grad_norm": 1.7070491313934326, + "learning_rate": 4.8448828370025156e-05, + "loss": 5.4684, + "step": 18954 + }, + { + "epoch": 0.11273075459130269, + "grad_norm": 2.062570095062256, + "learning_rate": 4.8448666393887806e-05, + "loss": 5.5384, + "step": 18955 + }, + { + "epoch": 0.11273670187458369, + "grad_norm": 1.8782148361206055, + "learning_rate": 4.844850440956476e-05, + "loss": 5.0373, + "step": 18956 + }, + { + "epoch": 0.1127426491578647, + "grad_norm": 2.3674817085266113, + "learning_rate": 4.8448342417056096e-05, + "loss": 5.1999, + "step": 18957 + }, + { + "epoch": 0.11274859644114568, + "grad_norm": 2.2243809700012207, + "learning_rate": 4.844818041636186e-05, + "loss": 5.3275, + "step": 18958 + }, + { + "epoch": 0.11275454372442668, + "grad_norm": 2.2929039001464844, + "learning_rate": 4.8448018407482096e-05, + "loss": 5.3958, + "step": 18959 + }, + { + "epoch": 0.11276049100770767, + "grad_norm": 2.0325045585632324, + "learning_rate": 4.844785639041688e-05, + "loss": 4.6686, + "step": 18960 + }, + { + "epoch": 0.11276643829098867, + "grad_norm": 1.8510624170303345, + "learning_rate": 4.8447694365166255e-05, + "loss": 4.9134, + "step": 18961 + }, + { + "epoch": 0.11277238557426968, + "grad_norm": 1.7537583112716675, + "learning_rate": 4.844753233173027e-05, + "loss": 5.0618, + "step": 18962 + }, + { + "epoch": 0.11277833285755066, + "grad_norm": 1.9293370246887207, + "learning_rate": 4.844737029010901e-05, + "loss": 4.8716, + "step": 18963 + }, + { + "epoch": 0.11278428014083167, + "grad_norm": 1.6931575536727905, + "learning_rate": 4.844720824030251e-05, + "loss": 5.4606, + "step": 18964 + }, + { + "epoch": 0.11279022742411267, + "grad_norm": 1.970825433731079, + "learning_rate": 4.8447046182310836e-05, + "loss": 5.2482, + "step": 18965 + }, + { + "epoch": 0.11279617470739366, + "grad_norm": 1.4842323064804077, + "learning_rate": 4.844688411613404e-05, + "loss": 5.972, + "step": 18966 + }, + { + "epoch": 0.11280212199067466, + "grad_norm": 1.84175705909729, + "learning_rate": 4.8446722041772174e-05, + "loss": 4.7696, + "step": 18967 + }, + { + "epoch": 0.11280806927395566, + "grad_norm": 1.8980286121368408, + "learning_rate": 4.84465599592253e-05, + "loss": 4.5125, + "step": 18968 + }, + { + "epoch": 0.11281401655723665, + "grad_norm": 1.7349838018417358, + "learning_rate": 4.844639786849348e-05, + "loss": 4.581, + "step": 18969 + }, + { + "epoch": 0.11281996384051765, + "grad_norm": 1.5894320011138916, + "learning_rate": 4.844623576957675e-05, + "loss": 4.9205, + "step": 18970 + }, + { + "epoch": 0.11282591112379865, + "grad_norm": 1.8740227222442627, + "learning_rate": 4.84460736624752e-05, + "loss": 4.938, + "step": 18971 + }, + { + "epoch": 0.11283185840707964, + "grad_norm": 1.744537591934204, + "learning_rate": 4.8445911547188854e-05, + "loss": 5.5215, + "step": 18972 + }, + { + "epoch": 0.11283780569036064, + "grad_norm": 1.5465041399002075, + "learning_rate": 4.844574942371779e-05, + "loss": 5.3607, + "step": 18973 + }, + { + "epoch": 0.11284375297364165, + "grad_norm": 1.8417413234710693, + "learning_rate": 4.8445587292062056e-05, + "loss": 5.632, + "step": 18974 + }, + { + "epoch": 0.11284970025692263, + "grad_norm": 1.7401045560836792, + "learning_rate": 4.8445425152221704e-05, + "loss": 5.5514, + "step": 18975 + }, + { + "epoch": 0.11285564754020364, + "grad_norm": 1.6192666292190552, + "learning_rate": 4.8445263004196805e-05, + "loss": 5.2694, + "step": 18976 + }, + { + "epoch": 0.11286159482348464, + "grad_norm": 1.842510461807251, + "learning_rate": 4.84451008479874e-05, + "loss": 5.3429, + "step": 18977 + }, + { + "epoch": 0.11286754210676563, + "grad_norm": 1.4824966192245483, + "learning_rate": 4.8444938683593554e-05, + "loss": 5.5212, + "step": 18978 + }, + { + "epoch": 0.11287348939004663, + "grad_norm": 1.7926548719406128, + "learning_rate": 4.8444776511015324e-05, + "loss": 4.8687, + "step": 18979 + }, + { + "epoch": 0.11287943667332763, + "grad_norm": 1.7114008665084839, + "learning_rate": 4.844461433025277e-05, + "loss": 4.7459, + "step": 18980 + }, + { + "epoch": 0.11288538395660862, + "grad_norm": 1.8884011507034302, + "learning_rate": 4.844445214130594e-05, + "loss": 5.1957, + "step": 18981 + }, + { + "epoch": 0.11289133123988962, + "grad_norm": 1.6901582479476929, + "learning_rate": 4.844428994417489e-05, + "loss": 5.3349, + "step": 18982 + }, + { + "epoch": 0.11289727852317062, + "grad_norm": 1.7148336172103882, + "learning_rate": 4.844412773885968e-05, + "loss": 5.4903, + "step": 18983 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 1.478767991065979, + "learning_rate": 4.844396552536037e-05, + "loss": 5.6339, + "step": 18984 + }, + { + "epoch": 0.11290917308973261, + "grad_norm": 1.5679733753204346, + "learning_rate": 4.844380330367701e-05, + "loss": 5.4722, + "step": 18985 + }, + { + "epoch": 0.11291512037301361, + "grad_norm": 1.718564510345459, + "learning_rate": 4.844364107380966e-05, + "loss": 5.2826, + "step": 18986 + }, + { + "epoch": 0.1129210676562946, + "grad_norm": 1.6757621765136719, + "learning_rate": 4.844347883575839e-05, + "loss": 5.7454, + "step": 18987 + }, + { + "epoch": 0.1129270149395756, + "grad_norm": 1.9370322227478027, + "learning_rate": 4.844331658952324e-05, + "loss": 4.6631, + "step": 18988 + }, + { + "epoch": 0.1129329622228566, + "grad_norm": 1.9932162761688232, + "learning_rate": 4.844315433510426e-05, + "loss": 4.7486, + "step": 18989 + }, + { + "epoch": 0.1129389095061376, + "grad_norm": 2.0191309452056885, + "learning_rate": 4.844299207250152e-05, + "loss": 4.6999, + "step": 18990 + }, + { + "epoch": 0.1129448567894186, + "grad_norm": 1.971913456916809, + "learning_rate": 4.8442829801715074e-05, + "loss": 4.7345, + "step": 18991 + }, + { + "epoch": 0.11295080407269958, + "grad_norm": 1.8503371477127075, + "learning_rate": 4.844266752274498e-05, + "loss": 4.5352, + "step": 18992 + }, + { + "epoch": 0.11295675135598059, + "grad_norm": 2.0024712085723877, + "learning_rate": 4.8442505235591294e-05, + "loss": 4.6513, + "step": 18993 + }, + { + "epoch": 0.11296269863926159, + "grad_norm": 1.645996332168579, + "learning_rate": 4.844234294025407e-05, + "loss": 4.816, + "step": 18994 + }, + { + "epoch": 0.11296864592254258, + "grad_norm": 1.6649290323257446, + "learning_rate": 4.844218063673337e-05, + "loss": 5.1471, + "step": 18995 + }, + { + "epoch": 0.11297459320582358, + "grad_norm": 1.4211794137954712, + "learning_rate": 4.844201832502924e-05, + "loss": 5.0807, + "step": 18996 + }, + { + "epoch": 0.11298054048910458, + "grad_norm": 1.6982463598251343, + "learning_rate": 4.844185600514175e-05, + "loss": 4.9912, + "step": 18997 + }, + { + "epoch": 0.11298648777238557, + "grad_norm": 1.5852501392364502, + "learning_rate": 4.844169367707095e-05, + "loss": 5.4541, + "step": 18998 + }, + { + "epoch": 0.11299243505566657, + "grad_norm": 1.787331223487854, + "learning_rate": 4.844153134081689e-05, + "loss": 5.4295, + "step": 18999 + }, + { + "epoch": 0.11299838233894757, + "grad_norm": 1.5758492946624756, + "learning_rate": 4.844136899637964e-05, + "loss": 5.2601, + "step": 19000 + }, + { + "epoch": 0.11300432962222856, + "grad_norm": 1.5441172122955322, + "learning_rate": 4.844120664375925e-05, + "loss": 4.882, + "step": 19001 + }, + { + "epoch": 0.11301027690550956, + "grad_norm": 1.6587432622909546, + "learning_rate": 4.8441044282955774e-05, + "loss": 4.8311, + "step": 19002 + }, + { + "epoch": 0.11301622418879056, + "grad_norm": 1.6563838720321655, + "learning_rate": 4.844088191396927e-05, + "loss": 4.87, + "step": 19003 + }, + { + "epoch": 0.11302217147207155, + "grad_norm": 1.7367866039276123, + "learning_rate": 4.84407195367998e-05, + "loss": 5.2984, + "step": 19004 + }, + { + "epoch": 0.11302811875535256, + "grad_norm": 2.3307883739471436, + "learning_rate": 4.844055715144742e-05, + "loss": 4.8798, + "step": 19005 + }, + { + "epoch": 0.11303406603863356, + "grad_norm": 2.601762294769287, + "learning_rate": 4.844039475791218e-05, + "loss": 4.8156, + "step": 19006 + }, + { + "epoch": 0.11304001332191455, + "grad_norm": 2.372610330581665, + "learning_rate": 4.844023235619414e-05, + "loss": 4.0715, + "step": 19007 + }, + { + "epoch": 0.11304596060519555, + "grad_norm": 2.16119384765625, + "learning_rate": 4.8440069946293356e-05, + "loss": 4.2701, + "step": 19008 + }, + { + "epoch": 0.11305190788847655, + "grad_norm": 2.1576502323150635, + "learning_rate": 4.843990752820989e-05, + "loss": 4.1302, + "step": 19009 + }, + { + "epoch": 0.11305785517175754, + "grad_norm": 2.122025489807129, + "learning_rate": 4.843974510194379e-05, + "loss": 4.0969, + "step": 19010 + }, + { + "epoch": 0.11306380245503854, + "grad_norm": 2.1929194927215576, + "learning_rate": 4.843958266749512e-05, + "loss": 4.2054, + "step": 19011 + }, + { + "epoch": 0.11306974973831954, + "grad_norm": 2.6305301189422607, + "learning_rate": 4.843942022486393e-05, + "loss": 4.3942, + "step": 19012 + }, + { + "epoch": 0.11307569702160053, + "grad_norm": 2.5355119705200195, + "learning_rate": 4.843925777405028e-05, + "loss": 4.4392, + "step": 19013 + }, + { + "epoch": 0.11308164430488153, + "grad_norm": 2.5040411949157715, + "learning_rate": 4.843909531505424e-05, + "loss": 4.221, + "step": 19014 + }, + { + "epoch": 0.11308759158816253, + "grad_norm": 2.15824556350708, + "learning_rate": 4.843893284787584e-05, + "loss": 4.8255, + "step": 19015 + }, + { + "epoch": 0.11309353887144352, + "grad_norm": 1.6300889253616333, + "learning_rate": 4.8438770372515155e-05, + "loss": 5.3668, + "step": 19016 + }, + { + "epoch": 0.11309948615472452, + "grad_norm": 1.745676875114441, + "learning_rate": 4.8438607888972245e-05, + "loss": 5.2858, + "step": 19017 + }, + { + "epoch": 0.11310543343800553, + "grad_norm": 1.6511434316635132, + "learning_rate": 4.8438445397247146e-05, + "loss": 5.2856, + "step": 19018 + }, + { + "epoch": 0.11311138072128651, + "grad_norm": 1.6282720565795898, + "learning_rate": 4.843828289733994e-05, + "loss": 5.7748, + "step": 19019 + }, + { + "epoch": 0.11311732800456752, + "grad_norm": 1.6303821802139282, + "learning_rate": 4.843812038925066e-05, + "loss": 5.3627, + "step": 19020 + }, + { + "epoch": 0.1131232752878485, + "grad_norm": 1.5684829950332642, + "learning_rate": 4.843795787297938e-05, + "loss": 5.6563, + "step": 19021 + }, + { + "epoch": 0.1131292225711295, + "grad_norm": 1.9084935188293457, + "learning_rate": 4.843779534852615e-05, + "loss": 5.7084, + "step": 19022 + }, + { + "epoch": 0.11313516985441051, + "grad_norm": 1.5176855325698853, + "learning_rate": 4.843763281589103e-05, + "loss": 5.7602, + "step": 19023 + }, + { + "epoch": 0.1131411171376915, + "grad_norm": 1.3877767324447632, + "learning_rate": 4.843747027507407e-05, + "loss": 5.4914, + "step": 19024 + }, + { + "epoch": 0.1131470644209725, + "grad_norm": 2.0801119804382324, + "learning_rate": 4.843730772607533e-05, + "loss": 4.8814, + "step": 19025 + }, + { + "epoch": 0.1131530117042535, + "grad_norm": 1.9673620462417603, + "learning_rate": 4.8437145168894874e-05, + "loss": 4.9423, + "step": 19026 + }, + { + "epoch": 0.11315895898753449, + "grad_norm": 1.5284085273742676, + "learning_rate": 4.8436982603532755e-05, + "loss": 5.0471, + "step": 19027 + }, + { + "epoch": 0.11316490627081549, + "grad_norm": 1.870762825012207, + "learning_rate": 4.8436820029989023e-05, + "loss": 4.9376, + "step": 19028 + }, + { + "epoch": 0.11317085355409649, + "grad_norm": 1.9094692468643188, + "learning_rate": 4.843665744826374e-05, + "loss": 4.8677, + "step": 19029 + }, + { + "epoch": 0.11317680083737748, + "grad_norm": 1.6463623046875, + "learning_rate": 4.8436494858356964e-05, + "loss": 5.3397, + "step": 19030 + }, + { + "epoch": 0.11318274812065848, + "grad_norm": 1.8127562999725342, + "learning_rate": 4.8436332260268745e-05, + "loss": 5.1626, + "step": 19031 + }, + { + "epoch": 0.11318869540393948, + "grad_norm": 1.5196025371551514, + "learning_rate": 4.8436169653999144e-05, + "loss": 5.1213, + "step": 19032 + }, + { + "epoch": 0.11319464268722047, + "grad_norm": 1.8930630683898926, + "learning_rate": 4.843600703954823e-05, + "loss": 4.8268, + "step": 19033 + }, + { + "epoch": 0.11320058997050148, + "grad_norm": 2.1579136848449707, + "learning_rate": 4.843584441691603e-05, + "loss": 5.6111, + "step": 19034 + }, + { + "epoch": 0.11320653725378248, + "grad_norm": 1.7644915580749512, + "learning_rate": 4.8435681786102624e-05, + "loss": 5.5762, + "step": 19035 + }, + { + "epoch": 0.11321248453706347, + "grad_norm": 1.5442852973937988, + "learning_rate": 4.843551914710808e-05, + "loss": 5.6486, + "step": 19036 + }, + { + "epoch": 0.11321843182034447, + "grad_norm": 1.823852777481079, + "learning_rate": 4.843535649993242e-05, + "loss": 5.6581, + "step": 19037 + }, + { + "epoch": 0.11322437910362547, + "grad_norm": 1.5850268602371216, + "learning_rate": 4.8435193844575726e-05, + "loss": 5.6351, + "step": 19038 + }, + { + "epoch": 0.11323032638690646, + "grad_norm": 1.6234556436538696, + "learning_rate": 4.843503118103805e-05, + "loss": 5.5462, + "step": 19039 + }, + { + "epoch": 0.11323627367018746, + "grad_norm": 1.602618932723999, + "learning_rate": 4.843486850931944e-05, + "loss": 5.2935, + "step": 19040 + }, + { + "epoch": 0.11324222095346846, + "grad_norm": 1.6808282136917114, + "learning_rate": 4.843470582941997e-05, + "loss": 5.2254, + "step": 19041 + }, + { + "epoch": 0.11324816823674945, + "grad_norm": 1.6311568021774292, + "learning_rate": 4.8434543141339674e-05, + "loss": 5.1894, + "step": 19042 + }, + { + "epoch": 0.11325411552003045, + "grad_norm": 1.5836867094039917, + "learning_rate": 4.843438044507863e-05, + "loss": 5.6344, + "step": 19043 + }, + { + "epoch": 0.11326006280331145, + "grad_norm": 1.5654397010803223, + "learning_rate": 4.843421774063688e-05, + "loss": 5.2902, + "step": 19044 + }, + { + "epoch": 0.11326601008659244, + "grad_norm": 2.3957626819610596, + "learning_rate": 4.843405502801449e-05, + "loss": 4.812, + "step": 19045 + }, + { + "epoch": 0.11327195736987344, + "grad_norm": 2.123473644256592, + "learning_rate": 4.843389230721151e-05, + "loss": 4.6399, + "step": 19046 + }, + { + "epoch": 0.11327790465315445, + "grad_norm": 1.6691471338272095, + "learning_rate": 4.8433729578228007e-05, + "loss": 4.9337, + "step": 19047 + }, + { + "epoch": 0.11328385193643543, + "grad_norm": 1.6179373264312744, + "learning_rate": 4.8433566841064025e-05, + "loss": 5.1002, + "step": 19048 + }, + { + "epoch": 0.11328979921971644, + "grad_norm": 1.658995270729065, + "learning_rate": 4.843340409571963e-05, + "loss": 5.0397, + "step": 19049 + }, + { + "epoch": 0.11329574650299742, + "grad_norm": 2.0216362476348877, + "learning_rate": 4.843324134219488e-05, + "loss": 5.3112, + "step": 19050 + }, + { + "epoch": 0.11330169378627843, + "grad_norm": 2.0376546382904053, + "learning_rate": 4.843307858048982e-05, + "loss": 5.087, + "step": 19051 + }, + { + "epoch": 0.11330764106955943, + "grad_norm": 2.2038021087646484, + "learning_rate": 4.8432915810604516e-05, + "loss": 4.951, + "step": 19052 + }, + { + "epoch": 0.11331358835284042, + "grad_norm": 1.8985834121704102, + "learning_rate": 4.843275303253903e-05, + "loss": 5.522, + "step": 19053 + }, + { + "epoch": 0.11331953563612142, + "grad_norm": 1.9047077894210815, + "learning_rate": 4.8432590246293404e-05, + "loss": 5.8387, + "step": 19054 + }, + { + "epoch": 0.11332548291940242, + "grad_norm": 1.508352279663086, + "learning_rate": 4.8432427451867704e-05, + "loss": 5.7969, + "step": 19055 + }, + { + "epoch": 0.11333143020268341, + "grad_norm": 1.631695032119751, + "learning_rate": 4.8432264649261984e-05, + "loss": 5.3562, + "step": 19056 + }, + { + "epoch": 0.11333737748596441, + "grad_norm": 1.673411250114441, + "learning_rate": 4.8432101838476305e-05, + "loss": 5.3286, + "step": 19057 + }, + { + "epoch": 0.11334332476924541, + "grad_norm": 2.697946071624756, + "learning_rate": 4.843193901951072e-05, + "loss": 5.0634, + "step": 19058 + }, + { + "epoch": 0.1133492720525264, + "grad_norm": 2.5914673805236816, + "learning_rate": 4.843177619236529e-05, + "loss": 4.8294, + "step": 19059 + }, + { + "epoch": 0.1133552193358074, + "grad_norm": 1.8503727912902832, + "learning_rate": 4.843161335704007e-05, + "loss": 5.1436, + "step": 19060 + }, + { + "epoch": 0.1133611666190884, + "grad_norm": 1.7629435062408447, + "learning_rate": 4.843145051353511e-05, + "loss": 5.1822, + "step": 19061 + }, + { + "epoch": 0.11336711390236939, + "grad_norm": 1.826360821723938, + "learning_rate": 4.843128766185048e-05, + "loss": 5.5151, + "step": 19062 + }, + { + "epoch": 0.1133730611856504, + "grad_norm": 2.0347046852111816, + "learning_rate": 4.843112480198623e-05, + "loss": 4.7732, + "step": 19063 + }, + { + "epoch": 0.1133790084689314, + "grad_norm": 2.037482738494873, + "learning_rate": 4.843096193394241e-05, + "loss": 4.6475, + "step": 19064 + }, + { + "epoch": 0.11338495575221239, + "grad_norm": 2.1152050495147705, + "learning_rate": 4.8430799057719076e-05, + "loss": 4.531, + "step": 19065 + }, + { + "epoch": 0.11339090303549339, + "grad_norm": 2.303982734680176, + "learning_rate": 4.8430636173316306e-05, + "loss": 4.6317, + "step": 19066 + }, + { + "epoch": 0.11339685031877439, + "grad_norm": 2.3326570987701416, + "learning_rate": 4.843047328073414e-05, + "loss": 4.736, + "step": 19067 + }, + { + "epoch": 0.11340279760205538, + "grad_norm": 2.371316909790039, + "learning_rate": 4.8430310379972634e-05, + "loss": 4.806, + "step": 19068 + }, + { + "epoch": 0.11340874488533638, + "grad_norm": 2.5370912551879883, + "learning_rate": 4.8430147471031855e-05, + "loss": 4.7867, + "step": 19069 + }, + { + "epoch": 0.11341469216861738, + "grad_norm": 2.456982135772705, + "learning_rate": 4.842998455391185e-05, + "loss": 4.6942, + "step": 19070 + }, + { + "epoch": 0.11342063945189837, + "grad_norm": 2.526287078857422, + "learning_rate": 4.842982162861268e-05, + "loss": 4.7333, + "step": 19071 + }, + { + "epoch": 0.11342658673517937, + "grad_norm": 2.2763514518737793, + "learning_rate": 4.84296586951344e-05, + "loss": 4.712, + "step": 19072 + }, + { + "epoch": 0.11343253401846037, + "grad_norm": 2.330958366394043, + "learning_rate": 4.842949575347707e-05, + "loss": 4.5875, + "step": 19073 + }, + { + "epoch": 0.11343848130174136, + "grad_norm": 2.390018939971924, + "learning_rate": 4.8429332803640745e-05, + "loss": 4.6941, + "step": 19074 + }, + { + "epoch": 0.11344442858502236, + "grad_norm": 2.279719829559326, + "learning_rate": 4.842916984562548e-05, + "loss": 4.6216, + "step": 19075 + }, + { + "epoch": 0.11345037586830337, + "grad_norm": 2.2815043926239014, + "learning_rate": 4.842900687943133e-05, + "loss": 4.5667, + "step": 19076 + }, + { + "epoch": 0.11345632315158435, + "grad_norm": 2.301231861114502, + "learning_rate": 4.842884390505836e-05, + "loss": 4.5451, + "step": 19077 + }, + { + "epoch": 0.11346227043486536, + "grad_norm": 2.1763200759887695, + "learning_rate": 4.842868092250662e-05, + "loss": 4.5937, + "step": 19078 + }, + { + "epoch": 0.11346821771814634, + "grad_norm": 2.2151448726654053, + "learning_rate": 4.842851793177618e-05, + "loss": 4.8341, + "step": 19079 + }, + { + "epoch": 0.11347416500142735, + "grad_norm": 2.3094639778137207, + "learning_rate": 4.8428354932867085e-05, + "loss": 4.7308, + "step": 19080 + }, + { + "epoch": 0.11348011228470835, + "grad_norm": 1.5218987464904785, + "learning_rate": 4.8428191925779385e-05, + "loss": 5.2701, + "step": 19081 + }, + { + "epoch": 0.11348605956798934, + "grad_norm": 1.3781639337539673, + "learning_rate": 4.842802891051315e-05, + "loss": 5.6873, + "step": 19082 + }, + { + "epoch": 0.11349200685127034, + "grad_norm": 1.814702033996582, + "learning_rate": 4.842786588706842e-05, + "loss": 5.7713, + "step": 19083 + }, + { + "epoch": 0.11349795413455134, + "grad_norm": 1.5691754817962646, + "learning_rate": 4.842770285544528e-05, + "loss": 5.7115, + "step": 19084 + }, + { + "epoch": 0.11350390141783233, + "grad_norm": 1.962762713432312, + "learning_rate": 4.8427539815643766e-05, + "loss": 5.4159, + "step": 19085 + }, + { + "epoch": 0.11350984870111333, + "grad_norm": 1.6766527891159058, + "learning_rate": 4.842737676766393e-05, + "loss": 5.6007, + "step": 19086 + }, + { + "epoch": 0.11351579598439433, + "grad_norm": 1.782934308052063, + "learning_rate": 4.8427213711505844e-05, + "loss": 5.982, + "step": 19087 + }, + { + "epoch": 0.11352174326767532, + "grad_norm": 1.5706422328948975, + "learning_rate": 4.842705064716957e-05, + "loss": 5.5125, + "step": 19088 + }, + { + "epoch": 0.11352769055095632, + "grad_norm": 2.4957141876220703, + "learning_rate": 4.842688757465515e-05, + "loss": 4.5386, + "step": 19089 + }, + { + "epoch": 0.11353363783423732, + "grad_norm": 2.1444833278656006, + "learning_rate": 4.842672449396264e-05, + "loss": 4.6108, + "step": 19090 + }, + { + "epoch": 0.11353958511751831, + "grad_norm": 2.4586305618286133, + "learning_rate": 4.8426561405092106e-05, + "loss": 4.7453, + "step": 19091 + }, + { + "epoch": 0.11354553240079931, + "grad_norm": 2.228759765625, + "learning_rate": 4.8426398308043605e-05, + "loss": 4.662, + "step": 19092 + }, + { + "epoch": 0.11355147968408032, + "grad_norm": 2.029172420501709, + "learning_rate": 4.8426235202817184e-05, + "loss": 4.6389, + "step": 19093 + }, + { + "epoch": 0.1135574269673613, + "grad_norm": 2.1887340545654297, + "learning_rate": 4.842607208941291e-05, + "loss": 4.6852, + "step": 19094 + }, + { + "epoch": 0.11356337425064231, + "grad_norm": 1.7664849758148193, + "learning_rate": 4.842590896783084e-05, + "loss": 5.2435, + "step": 19095 + }, + { + "epoch": 0.11356932153392331, + "grad_norm": 1.5581247806549072, + "learning_rate": 4.8425745838071016e-05, + "loss": 5.6828, + "step": 19096 + }, + { + "epoch": 0.1135752688172043, + "grad_norm": 1.570602297782898, + "learning_rate": 4.842558270013352e-05, + "loss": 5.7011, + "step": 19097 + }, + { + "epoch": 0.1135812161004853, + "grad_norm": 1.4669830799102783, + "learning_rate": 4.842541955401838e-05, + "loss": 5.4361, + "step": 19098 + }, + { + "epoch": 0.1135871633837663, + "grad_norm": 1.199173927307129, + "learning_rate": 4.842525639972568e-05, + "loss": 5.5198, + "step": 19099 + }, + { + "epoch": 0.11359311066704729, + "grad_norm": 1.1747777462005615, + "learning_rate": 4.842509323725546e-05, + "loss": 5.6252, + "step": 19100 + }, + { + "epoch": 0.11359905795032829, + "grad_norm": 1.4497981071472168, + "learning_rate": 4.8424930066607784e-05, + "loss": 5.4295, + "step": 19101 + }, + { + "epoch": 0.1136050052336093, + "grad_norm": 1.485688328742981, + "learning_rate": 4.8424766887782704e-05, + "loss": 5.1248, + "step": 19102 + }, + { + "epoch": 0.11361095251689028, + "grad_norm": 1.419149398803711, + "learning_rate": 4.842460370078028e-05, + "loss": 5.0604, + "step": 19103 + }, + { + "epoch": 0.11361689980017128, + "grad_norm": 1.622096300125122, + "learning_rate": 4.842444050560058e-05, + "loss": 5.4429, + "step": 19104 + }, + { + "epoch": 0.11362284708345229, + "grad_norm": 1.2471072673797607, + "learning_rate": 4.8424277302243636e-05, + "loss": 5.3636, + "step": 19105 + }, + { + "epoch": 0.11362879436673327, + "grad_norm": 1.3416316509246826, + "learning_rate": 4.842411409070952e-05, + "loss": 5.1415, + "step": 19106 + }, + { + "epoch": 0.11363474165001428, + "grad_norm": 1.3691420555114746, + "learning_rate": 4.8423950870998293e-05, + "loss": 5.3286, + "step": 19107 + }, + { + "epoch": 0.11364068893329526, + "grad_norm": 1.2382487058639526, + "learning_rate": 4.842378764311e-05, + "loss": 5.4391, + "step": 19108 + }, + { + "epoch": 0.11364663621657627, + "grad_norm": 1.1729276180267334, + "learning_rate": 4.842362440704471e-05, + "loss": 5.4158, + "step": 19109 + }, + { + "epoch": 0.11365258349985727, + "grad_norm": 1.2451897859573364, + "learning_rate": 4.842346116280247e-05, + "loss": 5.2487, + "step": 19110 + }, + { + "epoch": 0.11365853078313826, + "grad_norm": 1.255652666091919, + "learning_rate": 4.8423297910383354e-05, + "loss": 5.2759, + "step": 19111 + }, + { + "epoch": 0.11366447806641926, + "grad_norm": 1.170296549797058, + "learning_rate": 4.8423134649787394e-05, + "loss": 5.1508, + "step": 19112 + }, + { + "epoch": 0.11367042534970026, + "grad_norm": 1.3954061269760132, + "learning_rate": 4.842297138101467e-05, + "loss": 5.3102, + "step": 19113 + }, + { + "epoch": 0.11367637263298125, + "grad_norm": 1.2746593952178955, + "learning_rate": 4.842280810406522e-05, + "loss": 5.2587, + "step": 19114 + }, + { + "epoch": 0.11368231991626225, + "grad_norm": 1.3224173784255981, + "learning_rate": 4.8422644818939114e-05, + "loss": 5.1927, + "step": 19115 + }, + { + "epoch": 0.11368826719954325, + "grad_norm": 1.0930812358856201, + "learning_rate": 4.84224815256364e-05, + "loss": 5.1676, + "step": 19116 + }, + { + "epoch": 0.11369421448282424, + "grad_norm": 1.3805547952651978, + "learning_rate": 4.842231822415715e-05, + "loss": 5.066, + "step": 19117 + }, + { + "epoch": 0.11370016176610524, + "grad_norm": 1.3455450534820557, + "learning_rate": 4.84221549145014e-05, + "loss": 4.9656, + "step": 19118 + }, + { + "epoch": 0.11370610904938624, + "grad_norm": 1.442218542098999, + "learning_rate": 4.842199159666922e-05, + "loss": 4.9094, + "step": 19119 + }, + { + "epoch": 0.11371205633266723, + "grad_norm": 1.435941457748413, + "learning_rate": 4.8421828270660665e-05, + "loss": 5.1035, + "step": 19120 + }, + { + "epoch": 0.11371800361594823, + "grad_norm": 1.2507586479187012, + "learning_rate": 4.84216649364758e-05, + "loss": 5.2395, + "step": 19121 + }, + { + "epoch": 0.11372395089922924, + "grad_norm": 1.3616739511489868, + "learning_rate": 4.842150159411466e-05, + "loss": 5.2082, + "step": 19122 + }, + { + "epoch": 0.11372989818251023, + "grad_norm": 1.2988322973251343, + "learning_rate": 4.842133824357732e-05, + "loss": 5.1271, + "step": 19123 + }, + { + "epoch": 0.11373584546579123, + "grad_norm": 1.2761636972427368, + "learning_rate": 4.842117488486384e-05, + "loss": 5.1724, + "step": 19124 + }, + { + "epoch": 0.11374179274907223, + "grad_norm": 1.2834585905075073, + "learning_rate": 4.842101151797426e-05, + "loss": 5.2256, + "step": 19125 + }, + { + "epoch": 0.11374774003235322, + "grad_norm": 1.2074506282806396, + "learning_rate": 4.8420848142908655e-05, + "loss": 5.2704, + "step": 19126 + }, + { + "epoch": 0.11375368731563422, + "grad_norm": 1.355292797088623, + "learning_rate": 4.842068475966707e-05, + "loss": 5.1109, + "step": 19127 + }, + { + "epoch": 0.11375963459891522, + "grad_norm": 1.1144691705703735, + "learning_rate": 4.8420521368249565e-05, + "loss": 5.0903, + "step": 19128 + }, + { + "epoch": 0.11376558188219621, + "grad_norm": 1.3889878988265991, + "learning_rate": 4.84203579686562e-05, + "loss": 5.1289, + "step": 19129 + }, + { + "epoch": 0.11377152916547721, + "grad_norm": 1.1302597522735596, + "learning_rate": 4.8420194560887035e-05, + "loss": 4.9211, + "step": 19130 + }, + { + "epoch": 0.11377747644875821, + "grad_norm": 1.1715654134750366, + "learning_rate": 4.8420031144942115e-05, + "loss": 5.2239, + "step": 19131 + }, + { + "epoch": 0.1137834237320392, + "grad_norm": 1.327021837234497, + "learning_rate": 4.84198677208215e-05, + "loss": 5.2941, + "step": 19132 + }, + { + "epoch": 0.1137893710153202, + "grad_norm": 1.3442116975784302, + "learning_rate": 4.841970428852526e-05, + "loss": 5.1752, + "step": 19133 + }, + { + "epoch": 0.1137953182986012, + "grad_norm": 1.207207202911377, + "learning_rate": 4.841954084805344e-05, + "loss": 4.9607, + "step": 19134 + }, + { + "epoch": 0.1138012655818822, + "grad_norm": 1.1609065532684326, + "learning_rate": 4.8419377399406104e-05, + "loss": 5.0458, + "step": 19135 + }, + { + "epoch": 0.1138072128651632, + "grad_norm": 1.365605115890503, + "learning_rate": 4.84192139425833e-05, + "loss": 5.0884, + "step": 19136 + }, + { + "epoch": 0.11381316014844418, + "grad_norm": 1.5192269086837769, + "learning_rate": 4.8419050477585096e-05, + "loss": 5.4803, + "step": 19137 + }, + { + "epoch": 0.11381910743172519, + "grad_norm": 1.187456488609314, + "learning_rate": 4.841888700441153e-05, + "loss": 5.4595, + "step": 19138 + }, + { + "epoch": 0.11382505471500619, + "grad_norm": 1.1836395263671875, + "learning_rate": 4.841872352306268e-05, + "loss": 5.27, + "step": 19139 + }, + { + "epoch": 0.11383100199828718, + "grad_norm": 1.353762149810791, + "learning_rate": 4.841856003353861e-05, + "loss": 5.4646, + "step": 19140 + }, + { + "epoch": 0.11383694928156818, + "grad_norm": 1.4854416847229004, + "learning_rate": 4.8418396535839344e-05, + "loss": 5.2894, + "step": 19141 + }, + { + "epoch": 0.11384289656484918, + "grad_norm": 1.3731143474578857, + "learning_rate": 4.841823302996496e-05, + "loss": 4.7512, + "step": 19142 + }, + { + "epoch": 0.11384884384813017, + "grad_norm": 1.3945658206939697, + "learning_rate": 4.841806951591552e-05, + "loss": 4.9625, + "step": 19143 + }, + { + "epoch": 0.11385479113141117, + "grad_norm": 1.2692869901657104, + "learning_rate": 4.841790599369107e-05, + "loss": 5.2245, + "step": 19144 + }, + { + "epoch": 0.11386073841469217, + "grad_norm": 1.3667423725128174, + "learning_rate": 4.8417742463291674e-05, + "loss": 5.202, + "step": 19145 + }, + { + "epoch": 0.11386668569797316, + "grad_norm": 1.2639939785003662, + "learning_rate": 4.8417578924717377e-05, + "loss": 5.4378, + "step": 19146 + }, + { + "epoch": 0.11387263298125416, + "grad_norm": 1.327867865562439, + "learning_rate": 4.8417415377968255e-05, + "loss": 5.1632, + "step": 19147 + }, + { + "epoch": 0.11387858026453516, + "grad_norm": 1.2095093727111816, + "learning_rate": 4.841725182304435e-05, + "loss": 4.9969, + "step": 19148 + }, + { + "epoch": 0.11388452754781615, + "grad_norm": 1.3395425081253052, + "learning_rate": 4.841708825994573e-05, + "loss": 5.1797, + "step": 19149 + }, + { + "epoch": 0.11389047483109715, + "grad_norm": 1.4817496538162231, + "learning_rate": 4.841692468867244e-05, + "loss": 5.1126, + "step": 19150 + }, + { + "epoch": 0.11389642211437816, + "grad_norm": 1.3066308498382568, + "learning_rate": 4.8416761109224547e-05, + "loss": 5.2692, + "step": 19151 + }, + { + "epoch": 0.11390236939765915, + "grad_norm": 1.444701075553894, + "learning_rate": 4.84165975216021e-05, + "loss": 5.0525, + "step": 19152 + }, + { + "epoch": 0.11390831668094015, + "grad_norm": 1.2720032930374146, + "learning_rate": 4.8416433925805165e-05, + "loss": 5.138, + "step": 19153 + }, + { + "epoch": 0.11391426396422115, + "grad_norm": 1.2228437662124634, + "learning_rate": 4.84162703218338e-05, + "loss": 5.028, + "step": 19154 + }, + { + "epoch": 0.11392021124750214, + "grad_norm": 1.1950013637542725, + "learning_rate": 4.841610670968805e-05, + "loss": 5.0873, + "step": 19155 + }, + { + "epoch": 0.11392615853078314, + "grad_norm": 1.3538236618041992, + "learning_rate": 4.8415943089367976e-05, + "loss": 5.0039, + "step": 19156 + }, + { + "epoch": 0.11393210581406414, + "grad_norm": 1.3344488143920898, + "learning_rate": 4.841577946087364e-05, + "loss": 5.0215, + "step": 19157 + }, + { + "epoch": 0.11393805309734513, + "grad_norm": 1.7098866701126099, + "learning_rate": 4.841561582420511e-05, + "loss": 5.5719, + "step": 19158 + }, + { + "epoch": 0.11394400038062613, + "grad_norm": 1.3574185371398926, + "learning_rate": 4.841545217936241e-05, + "loss": 4.8491, + "step": 19159 + }, + { + "epoch": 0.11394994766390713, + "grad_norm": 1.447292447090149, + "learning_rate": 4.8415288526345634e-05, + "loss": 4.8632, + "step": 19160 + }, + { + "epoch": 0.11395589494718812, + "grad_norm": 1.6439673900604248, + "learning_rate": 4.841512486515481e-05, + "loss": 5.282, + "step": 19161 + }, + { + "epoch": 0.11396184223046912, + "grad_norm": 1.3063132762908936, + "learning_rate": 4.841496119579002e-05, + "loss": 5.0399, + "step": 19162 + }, + { + "epoch": 0.11396778951375013, + "grad_norm": 1.4244173765182495, + "learning_rate": 4.8414797518251296e-05, + "loss": 4.7731, + "step": 19163 + }, + { + "epoch": 0.11397373679703111, + "grad_norm": 1.225203514099121, + "learning_rate": 4.841463383253872e-05, + "loss": 4.8294, + "step": 19164 + }, + { + "epoch": 0.11397968408031212, + "grad_norm": 1.2978007793426514, + "learning_rate": 4.8414470138652334e-05, + "loss": 4.6336, + "step": 19165 + }, + { + "epoch": 0.1139856313635931, + "grad_norm": 1.306591272354126, + "learning_rate": 4.8414306436592194e-05, + "loss": 4.8267, + "step": 19166 + }, + { + "epoch": 0.1139915786468741, + "grad_norm": 1.1227960586547852, + "learning_rate": 4.841414272635837e-05, + "loss": 4.7438, + "step": 19167 + }, + { + "epoch": 0.11399752593015511, + "grad_norm": 1.3674911260604858, + "learning_rate": 4.8413979007950905e-05, + "loss": 4.8127, + "step": 19168 + }, + { + "epoch": 0.1140034732134361, + "grad_norm": 1.3923397064208984, + "learning_rate": 4.841381528136986e-05, + "loss": 5.1568, + "step": 19169 + }, + { + "epoch": 0.1140094204967171, + "grad_norm": 1.2014738321304321, + "learning_rate": 4.84136515466153e-05, + "loss": 5.0116, + "step": 19170 + }, + { + "epoch": 0.1140153677799981, + "grad_norm": 1.3564008474349976, + "learning_rate": 4.841348780368726e-05, + "loss": 5.1181, + "step": 19171 + }, + { + "epoch": 0.11402131506327909, + "grad_norm": 1.1918834447860718, + "learning_rate": 4.841332405258583e-05, + "loss": 5.0854, + "step": 19172 + }, + { + "epoch": 0.11402726234656009, + "grad_norm": 1.2056841850280762, + "learning_rate": 4.8413160293311047e-05, + "loss": 4.825, + "step": 19173 + }, + { + "epoch": 0.11403320962984109, + "grad_norm": 1.3841508626937866, + "learning_rate": 4.841299652586298e-05, + "loss": 4.7543, + "step": 19174 + }, + { + "epoch": 0.11403915691312208, + "grad_norm": 1.511307716369629, + "learning_rate": 4.841283275024166e-05, + "loss": 4.9821, + "step": 19175 + }, + { + "epoch": 0.11404510419640308, + "grad_norm": 1.2577831745147705, + "learning_rate": 4.8412668966447175e-05, + "loss": 5.0138, + "step": 19176 + }, + { + "epoch": 0.11405105147968408, + "grad_norm": 1.442159652709961, + "learning_rate": 4.841250517447956e-05, + "loss": 5.0066, + "step": 19177 + }, + { + "epoch": 0.11405699876296507, + "grad_norm": 1.3029484748840332, + "learning_rate": 4.841234137433889e-05, + "loss": 4.9229, + "step": 19178 + }, + { + "epoch": 0.11406294604624607, + "grad_norm": 1.3138917684555054, + "learning_rate": 4.841217756602521e-05, + "loss": 4.6262, + "step": 19179 + }, + { + "epoch": 0.11406889332952708, + "grad_norm": 1.2164885997772217, + "learning_rate": 4.841201374953857e-05, + "loss": 4.7952, + "step": 19180 + }, + { + "epoch": 0.11407484061280806, + "grad_norm": 1.4247347116470337, + "learning_rate": 4.8411849924879046e-05, + "loss": 5.0066, + "step": 19181 + }, + { + "epoch": 0.11408078789608907, + "grad_norm": 1.236006736755371, + "learning_rate": 4.8411686092046695e-05, + "loss": 4.6585, + "step": 19182 + }, + { + "epoch": 0.11408673517937007, + "grad_norm": 1.2381118535995483, + "learning_rate": 4.841152225104156e-05, + "loss": 5.0935, + "step": 19183 + }, + { + "epoch": 0.11409268246265106, + "grad_norm": 1.3557883501052856, + "learning_rate": 4.84113584018637e-05, + "loss": 5.1536, + "step": 19184 + }, + { + "epoch": 0.11409862974593206, + "grad_norm": 1.3191505670547485, + "learning_rate": 4.8411194544513184e-05, + "loss": 5.2857, + "step": 19185 + }, + { + "epoch": 0.11410457702921306, + "grad_norm": 1.2058855295181274, + "learning_rate": 4.841103067899006e-05, + "loss": 5.142, + "step": 19186 + }, + { + "epoch": 0.11411052431249405, + "grad_norm": 1.163136601448059, + "learning_rate": 4.8410866805294384e-05, + "loss": 5.1891, + "step": 19187 + }, + { + "epoch": 0.11411647159577505, + "grad_norm": 1.3245770931243896, + "learning_rate": 4.841070292342622e-05, + "loss": 5.0629, + "step": 19188 + }, + { + "epoch": 0.11412241887905605, + "grad_norm": 1.13837730884552, + "learning_rate": 4.841053903338562e-05, + "loss": 5.1045, + "step": 19189 + }, + { + "epoch": 0.11412836616233704, + "grad_norm": 1.4724907875061035, + "learning_rate": 4.8410375135172646e-05, + "loss": 5.01, + "step": 19190 + }, + { + "epoch": 0.11413431344561804, + "grad_norm": 1.3786016702651978, + "learning_rate": 4.841021122878735e-05, + "loss": 5.0188, + "step": 19191 + }, + { + "epoch": 0.11414026072889905, + "grad_norm": 1.2996101379394531, + "learning_rate": 4.841004731422979e-05, + "loss": 4.954, + "step": 19192 + }, + { + "epoch": 0.11414620801218003, + "grad_norm": 1.297892451286316, + "learning_rate": 4.840988339150002e-05, + "loss": 4.9841, + "step": 19193 + }, + { + "epoch": 0.11415215529546104, + "grad_norm": 1.3011624813079834, + "learning_rate": 4.84097194605981e-05, + "loss": 4.8547, + "step": 19194 + }, + { + "epoch": 0.11415810257874202, + "grad_norm": 1.2169194221496582, + "learning_rate": 4.8409555521524096e-05, + "loss": 4.8801, + "step": 19195 + }, + { + "epoch": 0.11416404986202303, + "grad_norm": 1.4189658164978027, + "learning_rate": 4.8409391574278065e-05, + "loss": 4.9521, + "step": 19196 + }, + { + "epoch": 0.11416999714530403, + "grad_norm": 1.4178590774536133, + "learning_rate": 4.840922761886004e-05, + "loss": 4.7847, + "step": 19197 + }, + { + "epoch": 0.11417594442858502, + "grad_norm": 1.395585536956787, + "learning_rate": 4.8409063655270105e-05, + "loss": 5.0404, + "step": 19198 + }, + { + "epoch": 0.11418189171186602, + "grad_norm": 1.4803121089935303, + "learning_rate": 4.840889968350831e-05, + "loss": 4.8851, + "step": 19199 + }, + { + "epoch": 0.11418783899514702, + "grad_norm": 1.4736177921295166, + "learning_rate": 4.84087357035747e-05, + "loss": 4.9127, + "step": 19200 + }, + { + "epoch": 0.11419378627842801, + "grad_norm": 1.2947148084640503, + "learning_rate": 4.8408571715469354e-05, + "loss": 4.9169, + "step": 19201 + }, + { + "epoch": 0.11419973356170901, + "grad_norm": 1.2428392171859741, + "learning_rate": 4.840840771919232e-05, + "loss": 5.2759, + "step": 19202 + }, + { + "epoch": 0.11420568084499001, + "grad_norm": 1.2743968963623047, + "learning_rate": 4.840824371474364e-05, + "loss": 5.2273, + "step": 19203 + }, + { + "epoch": 0.114211628128271, + "grad_norm": 1.3068950176239014, + "learning_rate": 4.840807970212339e-05, + "loss": 5.3455, + "step": 19204 + }, + { + "epoch": 0.114217575411552, + "grad_norm": 1.2238211631774902, + "learning_rate": 4.8407915681331614e-05, + "loss": 5.024, + "step": 19205 + }, + { + "epoch": 0.114223522694833, + "grad_norm": 1.1461126804351807, + "learning_rate": 4.8407751652368384e-05, + "loss": 5.2113, + "step": 19206 + }, + { + "epoch": 0.11422946997811399, + "grad_norm": 1.2286972999572754, + "learning_rate": 4.840758761523375e-05, + "loss": 5.006, + "step": 19207 + }, + { + "epoch": 0.114235417261395, + "grad_norm": 1.3054790496826172, + "learning_rate": 4.840742356992777e-05, + "loss": 5.0592, + "step": 19208 + }, + { + "epoch": 0.114241364544676, + "grad_norm": 1.2426046133041382, + "learning_rate": 4.84072595164505e-05, + "loss": 5.1058, + "step": 19209 + }, + { + "epoch": 0.11424731182795698, + "grad_norm": 1.325263261795044, + "learning_rate": 4.840709545480199e-05, + "loss": 5.0528, + "step": 19210 + }, + { + "epoch": 0.11425325911123799, + "grad_norm": 1.1753286123275757, + "learning_rate": 4.840693138498231e-05, + "loss": 5.2193, + "step": 19211 + }, + { + "epoch": 0.11425920639451899, + "grad_norm": 1.486204743385315, + "learning_rate": 4.8406767306991515e-05, + "loss": 5.0389, + "step": 19212 + }, + { + "epoch": 0.11426515367779998, + "grad_norm": 1.344887614250183, + "learning_rate": 4.8406603220829655e-05, + "loss": 5.0072, + "step": 19213 + }, + { + "epoch": 0.11427110096108098, + "grad_norm": 1.270340919494629, + "learning_rate": 4.840643912649679e-05, + "loss": 5.0154, + "step": 19214 + }, + { + "epoch": 0.11427704824436198, + "grad_norm": 1.390960454940796, + "learning_rate": 4.8406275023992983e-05, + "loss": 5.0803, + "step": 19215 + }, + { + "epoch": 0.11428299552764297, + "grad_norm": 1.2927583456039429, + "learning_rate": 4.8406110913318294e-05, + "loss": 5.04, + "step": 19216 + }, + { + "epoch": 0.11428894281092397, + "grad_norm": 1.3101180791854858, + "learning_rate": 4.840594679447275e-05, + "loss": 4.9988, + "step": 19217 + }, + { + "epoch": 0.11429489009420497, + "grad_norm": 1.2187588214874268, + "learning_rate": 4.8405782667456454e-05, + "loss": 5.1006, + "step": 19218 + }, + { + "epoch": 0.11430083737748596, + "grad_norm": 1.3578346967697144, + "learning_rate": 4.840561853226944e-05, + "loss": 5.0528, + "step": 19219 + }, + { + "epoch": 0.11430678466076696, + "grad_norm": 1.8960474729537964, + "learning_rate": 4.840545438891176e-05, + "loss": 5.323, + "step": 19220 + }, + { + "epoch": 0.11431273194404797, + "grad_norm": 1.3410239219665527, + "learning_rate": 4.840529023738348e-05, + "loss": 5.1488, + "step": 19221 + }, + { + "epoch": 0.11431867922732895, + "grad_norm": 1.381373405456543, + "learning_rate": 4.840512607768465e-05, + "loss": 5.1477, + "step": 19222 + }, + { + "epoch": 0.11432462651060996, + "grad_norm": 1.4095546007156372, + "learning_rate": 4.8404961909815336e-05, + "loss": 5.1515, + "step": 19223 + }, + { + "epoch": 0.11433057379389094, + "grad_norm": 1.254451870918274, + "learning_rate": 4.840479773377559e-05, + "loss": 5.1276, + "step": 19224 + }, + { + "epoch": 0.11433652107717195, + "grad_norm": 1.3001519441604614, + "learning_rate": 4.840463354956548e-05, + "loss": 5.1561, + "step": 19225 + }, + { + "epoch": 0.11434246836045295, + "grad_norm": 1.231469750404358, + "learning_rate": 4.840446935718505e-05, + "loss": 4.963, + "step": 19226 + }, + { + "epoch": 0.11434841564373394, + "grad_norm": 1.323225736618042, + "learning_rate": 4.840430515663435e-05, + "loss": 5.0998, + "step": 19227 + }, + { + "epoch": 0.11435436292701494, + "grad_norm": 1.2244281768798828, + "learning_rate": 4.8404140947913456e-05, + "loss": 5.0727, + "step": 19228 + }, + { + "epoch": 0.11436031021029594, + "grad_norm": 1.2634974718093872, + "learning_rate": 4.840397673102242e-05, + "loss": 5.2049, + "step": 19229 + }, + { + "epoch": 0.11436625749357693, + "grad_norm": 1.5431766510009766, + "learning_rate": 4.84038125059613e-05, + "loss": 5.1387, + "step": 19230 + }, + { + "epoch": 0.11437220477685793, + "grad_norm": 1.485696792602539, + "learning_rate": 4.8403648272730145e-05, + "loss": 4.7971, + "step": 19231 + }, + { + "epoch": 0.11437815206013893, + "grad_norm": 1.4774583578109741, + "learning_rate": 4.840348403132902e-05, + "loss": 4.8967, + "step": 19232 + }, + { + "epoch": 0.11438409934341992, + "grad_norm": 1.1903584003448486, + "learning_rate": 4.840331978175798e-05, + "loss": 4.8827, + "step": 19233 + }, + { + "epoch": 0.11439004662670092, + "grad_norm": 1.3851109743118286, + "learning_rate": 4.840315552401708e-05, + "loss": 4.8348, + "step": 19234 + }, + { + "epoch": 0.11439599390998192, + "grad_norm": 1.3834025859832764, + "learning_rate": 4.840299125810639e-05, + "loss": 4.9392, + "step": 19235 + }, + { + "epoch": 0.11440194119326291, + "grad_norm": 1.2576985359191895, + "learning_rate": 4.840282698402595e-05, + "loss": 4.9092, + "step": 19236 + }, + { + "epoch": 0.11440788847654391, + "grad_norm": 1.2408863306045532, + "learning_rate": 4.840266270177583e-05, + "loss": 4.9041, + "step": 19237 + }, + { + "epoch": 0.11441383575982492, + "grad_norm": 1.4397286176681519, + "learning_rate": 4.840249841135608e-05, + "loss": 4.9588, + "step": 19238 + }, + { + "epoch": 0.1144197830431059, + "grad_norm": 1.3446424007415771, + "learning_rate": 4.840233411276676e-05, + "loss": 4.9757, + "step": 19239 + }, + { + "epoch": 0.1144257303263869, + "grad_norm": 1.2520800828933716, + "learning_rate": 4.840216980600793e-05, + "loss": 4.9746, + "step": 19240 + }, + { + "epoch": 0.11443167760966791, + "grad_norm": 1.2509692907333374, + "learning_rate": 4.840200549107963e-05, + "loss": 5.063, + "step": 19241 + }, + { + "epoch": 0.1144376248929489, + "grad_norm": 1.3295235633850098, + "learning_rate": 4.840184116798194e-05, + "loss": 5.02, + "step": 19242 + }, + { + "epoch": 0.1144435721762299, + "grad_norm": 1.3346072435379028, + "learning_rate": 4.8401676836714916e-05, + "loss": 5.0393, + "step": 19243 + }, + { + "epoch": 0.1144495194595109, + "grad_norm": 1.6711392402648926, + "learning_rate": 4.84015124972786e-05, + "loss": 5.0856, + "step": 19244 + }, + { + "epoch": 0.11445546674279189, + "grad_norm": 1.2785863876342773, + "learning_rate": 4.8401348149673065e-05, + "loss": 5.1181, + "step": 19245 + }, + { + "epoch": 0.11446141402607289, + "grad_norm": 1.4998282194137573, + "learning_rate": 4.8401183793898354e-05, + "loss": 5.0101, + "step": 19246 + }, + { + "epoch": 0.1144673613093539, + "grad_norm": 1.4768141508102417, + "learning_rate": 4.840101942995454e-05, + "loss": 4.8256, + "step": 19247 + }, + { + "epoch": 0.11447330859263488, + "grad_norm": 1.3829854726791382, + "learning_rate": 4.840085505784167e-05, + "loss": 4.8298, + "step": 19248 + }, + { + "epoch": 0.11447925587591588, + "grad_norm": 1.2079180479049683, + "learning_rate": 4.840069067755979e-05, + "loss": 4.9054, + "step": 19249 + }, + { + "epoch": 0.11448520315919689, + "grad_norm": 1.464245080947876, + "learning_rate": 4.8400526289108984e-05, + "loss": 4.8943, + "step": 19250 + }, + { + "epoch": 0.11449115044247787, + "grad_norm": 1.400992512702942, + "learning_rate": 4.840036189248929e-05, + "loss": 4.754, + "step": 19251 + }, + { + "epoch": 0.11449709772575888, + "grad_norm": 1.41909921169281, + "learning_rate": 4.840019748770077e-05, + "loss": 4.9179, + "step": 19252 + }, + { + "epoch": 0.11450304500903986, + "grad_norm": 1.3990073204040527, + "learning_rate": 4.840003307474349e-05, + "loss": 4.7989, + "step": 19253 + }, + { + "epoch": 0.11450899229232087, + "grad_norm": 1.2858465909957886, + "learning_rate": 4.8399868653617497e-05, + "loss": 4.7556, + "step": 19254 + }, + { + "epoch": 0.11451493957560187, + "grad_norm": 1.2721470594406128, + "learning_rate": 4.8399704224322854e-05, + "loss": 4.8441, + "step": 19255 + }, + { + "epoch": 0.11452088685888286, + "grad_norm": 1.2352218627929688, + "learning_rate": 4.839953978685961e-05, + "loss": 4.753, + "step": 19256 + }, + { + "epoch": 0.11452683414216386, + "grad_norm": 1.3000402450561523, + "learning_rate": 4.8399375341227834e-05, + "loss": 4.7634, + "step": 19257 + }, + { + "epoch": 0.11453278142544486, + "grad_norm": 1.2934285402297974, + "learning_rate": 4.839921088742757e-05, + "loss": 4.8047, + "step": 19258 + }, + { + "epoch": 0.11453872870872585, + "grad_norm": 1.5773643255233765, + "learning_rate": 4.839904642545889e-05, + "loss": 4.8588, + "step": 19259 + }, + { + "epoch": 0.11454467599200685, + "grad_norm": 1.3872511386871338, + "learning_rate": 4.8398881955321844e-05, + "loss": 5.0781, + "step": 19260 + }, + { + "epoch": 0.11455062327528785, + "grad_norm": 1.403011679649353, + "learning_rate": 4.839871747701649e-05, + "loss": 5.1375, + "step": 19261 + }, + { + "epoch": 0.11455657055856884, + "grad_norm": 1.2086342573165894, + "learning_rate": 4.839855299054289e-05, + "loss": 5.1052, + "step": 19262 + }, + { + "epoch": 0.11456251784184984, + "grad_norm": 1.3916890621185303, + "learning_rate": 4.8398388495901085e-05, + "loss": 5.0687, + "step": 19263 + }, + { + "epoch": 0.11456846512513084, + "grad_norm": 1.4591625928878784, + "learning_rate": 4.839822399309115e-05, + "loss": 5.0098, + "step": 19264 + }, + { + "epoch": 0.11457441240841183, + "grad_norm": 1.3421653509140015, + "learning_rate": 4.839805948211314e-05, + "loss": 4.9511, + "step": 19265 + }, + { + "epoch": 0.11458035969169283, + "grad_norm": 1.3959892988204956, + "learning_rate": 4.83978949629671e-05, + "loss": 5.0206, + "step": 19266 + }, + { + "epoch": 0.11458630697497384, + "grad_norm": 1.3058884143829346, + "learning_rate": 4.839773043565311e-05, + "loss": 5.0885, + "step": 19267 + }, + { + "epoch": 0.11459225425825482, + "grad_norm": 1.452760100364685, + "learning_rate": 4.839756590017121e-05, + "loss": 4.9945, + "step": 19268 + }, + { + "epoch": 0.11459820154153583, + "grad_norm": 1.4445050954818726, + "learning_rate": 4.8397401356521454e-05, + "loss": 4.8128, + "step": 19269 + }, + { + "epoch": 0.11460414882481683, + "grad_norm": 1.2491203546524048, + "learning_rate": 4.8397236804703916e-05, + "loss": 4.7355, + "step": 19270 + }, + { + "epoch": 0.11461009610809782, + "grad_norm": 1.3198809623718262, + "learning_rate": 4.839707224471864e-05, + "loss": 4.7621, + "step": 19271 + }, + { + "epoch": 0.11461604339137882, + "grad_norm": 1.4831585884094238, + "learning_rate": 4.8396907676565686e-05, + "loss": 4.7393, + "step": 19272 + }, + { + "epoch": 0.11462199067465982, + "grad_norm": 1.2767844200134277, + "learning_rate": 4.839674310024512e-05, + "loss": 4.8063, + "step": 19273 + }, + { + "epoch": 0.11462793795794081, + "grad_norm": 1.4342589378356934, + "learning_rate": 4.839657851575698e-05, + "loss": 4.7615, + "step": 19274 + }, + { + "epoch": 0.11463388524122181, + "grad_norm": 1.30052649974823, + "learning_rate": 4.839641392310135e-05, + "loss": 4.7389, + "step": 19275 + }, + { + "epoch": 0.11463983252450281, + "grad_norm": 1.3592944145202637, + "learning_rate": 4.8396249322278266e-05, + "loss": 4.704, + "step": 19276 + }, + { + "epoch": 0.1146457798077838, + "grad_norm": 1.1905149221420288, + "learning_rate": 4.83960847132878e-05, + "loss": 4.7189, + "step": 19277 + }, + { + "epoch": 0.1146517270910648, + "grad_norm": 1.4920209646224976, + "learning_rate": 4.8395920096129996e-05, + "loss": 4.8844, + "step": 19278 + }, + { + "epoch": 0.1146576743743458, + "grad_norm": 1.486556887626648, + "learning_rate": 4.839575547080491e-05, + "loss": 4.9462, + "step": 19279 + }, + { + "epoch": 0.1146636216576268, + "grad_norm": 1.500434160232544, + "learning_rate": 4.839559083731262e-05, + "loss": 4.9118, + "step": 19280 + }, + { + "epoch": 0.1146695689409078, + "grad_norm": 1.5061683654785156, + "learning_rate": 4.839542619565317e-05, + "loss": 4.7921, + "step": 19281 + }, + { + "epoch": 0.11467551622418878, + "grad_norm": 1.587161660194397, + "learning_rate": 4.839526154582662e-05, + "loss": 5.1129, + "step": 19282 + }, + { + "epoch": 0.11468146350746979, + "grad_norm": 1.3225055932998657, + "learning_rate": 4.839509688783302e-05, + "loss": 4.8538, + "step": 19283 + }, + { + "epoch": 0.11468741079075079, + "grad_norm": 1.3121862411499023, + "learning_rate": 4.839493222167244e-05, + "loss": 4.8695, + "step": 19284 + }, + { + "epoch": 0.11469335807403178, + "grad_norm": 1.4202474355697632, + "learning_rate": 4.839476754734492e-05, + "loss": 4.8628, + "step": 19285 + }, + { + "epoch": 0.11469930535731278, + "grad_norm": 1.283316969871521, + "learning_rate": 4.8394602864850534e-05, + "loss": 4.8431, + "step": 19286 + }, + { + "epoch": 0.11470525264059378, + "grad_norm": 1.3255420923233032, + "learning_rate": 4.839443817418934e-05, + "loss": 4.9993, + "step": 19287 + }, + { + "epoch": 0.11471119992387477, + "grad_norm": 1.3569047451019287, + "learning_rate": 4.8394273475361386e-05, + "loss": 4.9478, + "step": 19288 + }, + { + "epoch": 0.11471714720715577, + "grad_norm": 1.2374382019042969, + "learning_rate": 4.839410876836673e-05, + "loss": 5.1119, + "step": 19289 + }, + { + "epoch": 0.11472309449043677, + "grad_norm": 1.3518184423446655, + "learning_rate": 4.839394405320543e-05, + "loss": 5.2506, + "step": 19290 + }, + { + "epoch": 0.11472904177371776, + "grad_norm": 1.2599278688430786, + "learning_rate": 4.839377932987755e-05, + "loss": 5.208, + "step": 19291 + }, + { + "epoch": 0.11473498905699876, + "grad_norm": 1.3122080564498901, + "learning_rate": 4.839361459838314e-05, + "loss": 5.2356, + "step": 19292 + }, + { + "epoch": 0.11474093634027976, + "grad_norm": 1.1587629318237305, + "learning_rate": 4.839344985872226e-05, + "loss": 5.2469, + "step": 19293 + }, + { + "epoch": 0.11474688362356075, + "grad_norm": 1.2733700275421143, + "learning_rate": 4.839328511089498e-05, + "loss": 5.2365, + "step": 19294 + }, + { + "epoch": 0.11475283090684175, + "grad_norm": 1.3206977844238281, + "learning_rate": 4.8393120354901334e-05, + "loss": 5.2242, + "step": 19295 + }, + { + "epoch": 0.11475877819012276, + "grad_norm": 1.1924374103546143, + "learning_rate": 4.83929555907414e-05, + "loss": 5.2916, + "step": 19296 + }, + { + "epoch": 0.11476472547340374, + "grad_norm": 1.2989557981491089, + "learning_rate": 4.8392790818415215e-05, + "loss": 5.173, + "step": 19297 + }, + { + "epoch": 0.11477067275668475, + "grad_norm": 1.3470929861068726, + "learning_rate": 4.839262603792286e-05, + "loss": 5.2309, + "step": 19298 + }, + { + "epoch": 0.11477662003996575, + "grad_norm": 1.1529438495635986, + "learning_rate": 4.8392461249264376e-05, + "loss": 5.2373, + "step": 19299 + }, + { + "epoch": 0.11478256732324674, + "grad_norm": 1.1988370418548584, + "learning_rate": 4.839229645243982e-05, + "loss": 5.2067, + "step": 19300 + }, + { + "epoch": 0.11478851460652774, + "grad_norm": 1.3069959878921509, + "learning_rate": 4.839213164744926e-05, + "loss": 5.1413, + "step": 19301 + }, + { + "epoch": 0.11479446188980874, + "grad_norm": 1.230211615562439, + "learning_rate": 4.839196683429275e-05, + "loss": 5.2076, + "step": 19302 + }, + { + "epoch": 0.11480040917308973, + "grad_norm": 1.3232944011688232, + "learning_rate": 4.839180201297034e-05, + "loss": 5.2077, + "step": 19303 + }, + { + "epoch": 0.11480635645637073, + "grad_norm": 1.2436466217041016, + "learning_rate": 4.839163718348211e-05, + "loss": 5.1646, + "step": 19304 + }, + { + "epoch": 0.11481230373965173, + "grad_norm": 1.160416841506958, + "learning_rate": 4.8391472345828085e-05, + "loss": 5.0582, + "step": 19305 + }, + { + "epoch": 0.11481825102293272, + "grad_norm": 1.3895483016967773, + "learning_rate": 4.8391307500008344e-05, + "loss": 5.2516, + "step": 19306 + }, + { + "epoch": 0.11482419830621372, + "grad_norm": 1.5018577575683594, + "learning_rate": 4.8391142646022935e-05, + "loss": 5.4308, + "step": 19307 + }, + { + "epoch": 0.11483014558949473, + "grad_norm": 1.5278204679489136, + "learning_rate": 4.8390977783871925e-05, + "loss": 5.2238, + "step": 19308 + }, + { + "epoch": 0.11483609287277571, + "grad_norm": 1.5735019445419312, + "learning_rate": 4.839081291355536e-05, + "loss": 5.4874, + "step": 19309 + }, + { + "epoch": 0.11484204015605672, + "grad_norm": 1.4098745584487915, + "learning_rate": 4.839064803507332e-05, + "loss": 5.082, + "step": 19310 + }, + { + "epoch": 0.1148479874393377, + "grad_norm": 1.47605299949646, + "learning_rate": 4.8390483148425824e-05, + "loss": 5.0869, + "step": 19311 + }, + { + "epoch": 0.1148539347226187, + "grad_norm": 1.442550778388977, + "learning_rate": 4.8390318253612966e-05, + "loss": 5.1232, + "step": 19312 + }, + { + "epoch": 0.11485988200589971, + "grad_norm": 1.1225110292434692, + "learning_rate": 4.8390153350634785e-05, + "loss": 5.0782, + "step": 19313 + }, + { + "epoch": 0.1148658292891807, + "grad_norm": 1.329656720161438, + "learning_rate": 4.838998843949135e-05, + "loss": 4.9912, + "step": 19314 + }, + { + "epoch": 0.1148717765724617, + "grad_norm": 1.6484954357147217, + "learning_rate": 4.8389823520182704e-05, + "loss": 4.785, + "step": 19315 + }, + { + "epoch": 0.1148777238557427, + "grad_norm": 1.46773099899292, + "learning_rate": 4.838965859270891e-05, + "loss": 4.7835, + "step": 19316 + }, + { + "epoch": 0.11488367113902369, + "grad_norm": 1.717592477798462, + "learning_rate": 4.838949365707004e-05, + "loss": 5.1603, + "step": 19317 + }, + { + "epoch": 0.11488961842230469, + "grad_norm": 1.7265046834945679, + "learning_rate": 4.838932871326613e-05, + "loss": 4.9057, + "step": 19318 + }, + { + "epoch": 0.11489556570558569, + "grad_norm": 1.6203346252441406, + "learning_rate": 4.838916376129725e-05, + "loss": 4.8206, + "step": 19319 + }, + { + "epoch": 0.11490151298886668, + "grad_norm": 1.2972123622894287, + "learning_rate": 4.838899880116345e-05, + "loss": 4.7026, + "step": 19320 + }, + { + "epoch": 0.11490746027214768, + "grad_norm": 1.4215303659439087, + "learning_rate": 4.838883383286479e-05, + "loss": 4.7032, + "step": 19321 + }, + { + "epoch": 0.11491340755542868, + "grad_norm": 1.442439317703247, + "learning_rate": 4.838866885640134e-05, + "loss": 4.6853, + "step": 19322 + }, + { + "epoch": 0.11491935483870967, + "grad_norm": 1.3752079010009766, + "learning_rate": 4.838850387177315e-05, + "loss": 4.6842, + "step": 19323 + }, + { + "epoch": 0.11492530212199067, + "grad_norm": 1.4834825992584229, + "learning_rate": 4.838833887898026e-05, + "loss": 4.6455, + "step": 19324 + }, + { + "epoch": 0.11493124940527168, + "grad_norm": 1.3493545055389404, + "learning_rate": 4.8388173878022743e-05, + "loss": 4.5489, + "step": 19325 + }, + { + "epoch": 0.11493719668855266, + "grad_norm": 1.5903066396713257, + "learning_rate": 4.838800886890067e-05, + "loss": 4.5574, + "step": 19326 + }, + { + "epoch": 0.11494314397183367, + "grad_norm": 1.3842332363128662, + "learning_rate": 4.8387843851614076e-05, + "loss": 4.7516, + "step": 19327 + }, + { + "epoch": 0.11494909125511467, + "grad_norm": 1.5355647802352905, + "learning_rate": 4.838767882616303e-05, + "loss": 4.5984, + "step": 19328 + }, + { + "epoch": 0.11495503853839566, + "grad_norm": 1.6534103155136108, + "learning_rate": 4.838751379254759e-05, + "loss": 4.7761, + "step": 19329 + }, + { + "epoch": 0.11496098582167666, + "grad_norm": 1.7028656005859375, + "learning_rate": 4.83873487507678e-05, + "loss": 5.0164, + "step": 19330 + }, + { + "epoch": 0.11496693310495766, + "grad_norm": 1.7165244817733765, + "learning_rate": 4.838718370082374e-05, + "loss": 5.1044, + "step": 19331 + }, + { + "epoch": 0.11497288038823865, + "grad_norm": 1.3272297382354736, + "learning_rate": 4.838701864271545e-05, + "loss": 5.0072, + "step": 19332 + }, + { + "epoch": 0.11497882767151965, + "grad_norm": 1.553613543510437, + "learning_rate": 4.8386853576442994e-05, + "loss": 4.945, + "step": 19333 + }, + { + "epoch": 0.11498477495480065, + "grad_norm": 1.4403818845748901, + "learning_rate": 4.8386688502006425e-05, + "loss": 5.0661, + "step": 19334 + }, + { + "epoch": 0.11499072223808164, + "grad_norm": 1.5347598791122437, + "learning_rate": 4.8386523419405814e-05, + "loss": 5.0603, + "step": 19335 + }, + { + "epoch": 0.11499666952136264, + "grad_norm": 1.3777856826782227, + "learning_rate": 4.83863583286412e-05, + "loss": 5.112, + "step": 19336 + }, + { + "epoch": 0.11500261680464365, + "grad_norm": 1.794287919998169, + "learning_rate": 4.8386193229712654e-05, + "loss": 5.1972, + "step": 19337 + }, + { + "epoch": 0.11500856408792463, + "grad_norm": 1.3142359256744385, + "learning_rate": 4.8386028122620234e-05, + "loss": 5.3577, + "step": 19338 + }, + { + "epoch": 0.11501451137120564, + "grad_norm": 1.0925400257110596, + "learning_rate": 4.838586300736399e-05, + "loss": 5.2094, + "step": 19339 + }, + { + "epoch": 0.11502045865448662, + "grad_norm": 1.6456180810928345, + "learning_rate": 4.838569788394398e-05, + "loss": 4.8287, + "step": 19340 + }, + { + "epoch": 0.11502640593776763, + "grad_norm": 1.2811404466629028, + "learning_rate": 4.8385532752360265e-05, + "loss": 5.0659, + "step": 19341 + }, + { + "epoch": 0.11503235322104863, + "grad_norm": 1.392863154411316, + "learning_rate": 4.83853676126129e-05, + "loss": 5.2655, + "step": 19342 + }, + { + "epoch": 0.11503830050432962, + "grad_norm": 1.2255772352218628, + "learning_rate": 4.838520246470195e-05, + "loss": 5.0422, + "step": 19343 + }, + { + "epoch": 0.11504424778761062, + "grad_norm": 1.735661506652832, + "learning_rate": 4.8385037308627465e-05, + "loss": 6.0562, + "step": 19344 + }, + { + "epoch": 0.11505019507089162, + "grad_norm": 1.2034478187561035, + "learning_rate": 4.838487214438951e-05, + "loss": 4.9773, + "step": 19345 + }, + { + "epoch": 0.11505614235417261, + "grad_norm": 1.2786695957183838, + "learning_rate": 4.838470697198813e-05, + "loss": 4.8771, + "step": 19346 + }, + { + "epoch": 0.11506208963745361, + "grad_norm": 1.2345244884490967, + "learning_rate": 4.8384541791423394e-05, + "loss": 5.0098, + "step": 19347 + }, + { + "epoch": 0.11506803692073461, + "grad_norm": 1.3156319856643677, + "learning_rate": 4.838437660269536e-05, + "loss": 5.1089, + "step": 19348 + }, + { + "epoch": 0.1150739842040156, + "grad_norm": 1.3406500816345215, + "learning_rate": 4.838421140580407e-05, + "loss": 4.8374, + "step": 19349 + }, + { + "epoch": 0.1150799314872966, + "grad_norm": 1.412318468093872, + "learning_rate": 4.83840462007496e-05, + "loss": 4.9074, + "step": 19350 + }, + { + "epoch": 0.1150858787705776, + "grad_norm": 1.3075577020645142, + "learning_rate": 4.8383880987532004e-05, + "loss": 4.9694, + "step": 19351 + }, + { + "epoch": 0.11509182605385859, + "grad_norm": 1.178300380706787, + "learning_rate": 4.838371576615134e-05, + "loss": 4.9863, + "step": 19352 + }, + { + "epoch": 0.1150977733371396, + "grad_norm": 1.5120453834533691, + "learning_rate": 4.838355053660765e-05, + "loss": 4.8766, + "step": 19353 + }, + { + "epoch": 0.1151037206204206, + "grad_norm": 1.4834094047546387, + "learning_rate": 4.8383385298901014e-05, + "loss": 4.9724, + "step": 19354 + }, + { + "epoch": 0.11510966790370158, + "grad_norm": 1.561998724937439, + "learning_rate": 4.8383220053031475e-05, + "loss": 4.9239, + "step": 19355 + }, + { + "epoch": 0.11511561518698259, + "grad_norm": 1.4366774559020996, + "learning_rate": 4.83830547989991e-05, + "loss": 4.8052, + "step": 19356 + }, + { + "epoch": 0.11512156247026359, + "grad_norm": 1.2530354261398315, + "learning_rate": 4.8382889536803936e-05, + "loss": 5.0115, + "step": 19357 + }, + { + "epoch": 0.11512750975354458, + "grad_norm": 1.4827991724014282, + "learning_rate": 4.838272426644606e-05, + "loss": 5.1592, + "step": 19358 + }, + { + "epoch": 0.11513345703682558, + "grad_norm": 1.5874660015106201, + "learning_rate": 4.83825589879255e-05, + "loss": 5.0255, + "step": 19359 + }, + { + "epoch": 0.11513940432010658, + "grad_norm": 1.4771748781204224, + "learning_rate": 4.8382393701242335e-05, + "loss": 5.1537, + "step": 19360 + }, + { + "epoch": 0.11514535160338757, + "grad_norm": 1.4980419874191284, + "learning_rate": 4.8382228406396625e-05, + "loss": 5.0109, + "step": 19361 + }, + { + "epoch": 0.11515129888666857, + "grad_norm": 1.5008245706558228, + "learning_rate": 4.8382063103388405e-05, + "loss": 5.1644, + "step": 19362 + }, + { + "epoch": 0.11515724616994957, + "grad_norm": 1.425648808479309, + "learning_rate": 4.838189779221777e-05, + "loss": 4.8298, + "step": 19363 + }, + { + "epoch": 0.11516319345323056, + "grad_norm": 1.4478559494018555, + "learning_rate": 4.8381732472884744e-05, + "loss": 5.2984, + "step": 19364 + }, + { + "epoch": 0.11516914073651156, + "grad_norm": 1.5071446895599365, + "learning_rate": 4.83815671453894e-05, + "loss": 4.9557, + "step": 19365 + }, + { + "epoch": 0.11517508801979257, + "grad_norm": 1.6358442306518555, + "learning_rate": 4.8381401809731785e-05, + "loss": 4.7956, + "step": 19366 + }, + { + "epoch": 0.11518103530307355, + "grad_norm": 1.5035837888717651, + "learning_rate": 4.838123646591197e-05, + "loss": 4.816, + "step": 19367 + }, + { + "epoch": 0.11518698258635456, + "grad_norm": 1.4265867471694946, + "learning_rate": 4.838107111393e-05, + "loss": 4.7911, + "step": 19368 + }, + { + "epoch": 0.11519292986963554, + "grad_norm": 1.489668369293213, + "learning_rate": 4.838090575378595e-05, + "loss": 4.8403, + "step": 19369 + }, + { + "epoch": 0.11519887715291655, + "grad_norm": 1.4454714059829712, + "learning_rate": 4.838074038547986e-05, + "loss": 4.8848, + "step": 19370 + }, + { + "epoch": 0.11520482443619755, + "grad_norm": 1.42531418800354, + "learning_rate": 4.83805750090118e-05, + "loss": 5.0249, + "step": 19371 + }, + { + "epoch": 0.11521077171947854, + "grad_norm": 1.4370076656341553, + "learning_rate": 4.8380409624381826e-05, + "loss": 4.9219, + "step": 19372 + }, + { + "epoch": 0.11521671900275954, + "grad_norm": 1.543291449546814, + "learning_rate": 4.838024423158999e-05, + "loss": 4.9835, + "step": 19373 + }, + { + "epoch": 0.11522266628604054, + "grad_norm": 1.2460718154907227, + "learning_rate": 4.838007883063634e-05, + "loss": 5.0426, + "step": 19374 + }, + { + "epoch": 0.11522861356932153, + "grad_norm": 1.5159900188446045, + "learning_rate": 4.837991342152096e-05, + "loss": 5.0214, + "step": 19375 + }, + { + "epoch": 0.11523456085260253, + "grad_norm": 1.3800876140594482, + "learning_rate": 4.837974800424389e-05, + "loss": 4.7606, + "step": 19376 + }, + { + "epoch": 0.11524050813588353, + "grad_norm": 1.509788155555725, + "learning_rate": 4.8379582578805197e-05, + "loss": 4.9886, + "step": 19377 + }, + { + "epoch": 0.11524645541916452, + "grad_norm": 1.292523741722107, + "learning_rate": 4.837941714520492e-05, + "loss": 5.1574, + "step": 19378 + }, + { + "epoch": 0.11525240270244552, + "grad_norm": 1.351827621459961, + "learning_rate": 4.837925170344314e-05, + "loss": 5.3133, + "step": 19379 + }, + { + "epoch": 0.11525834998572652, + "grad_norm": 1.4871753454208374, + "learning_rate": 4.83790862535199e-05, + "loss": 4.843, + "step": 19380 + }, + { + "epoch": 0.11526429726900751, + "grad_norm": 1.6031657457351685, + "learning_rate": 4.8378920795435264e-05, + "loss": 4.8244, + "step": 19381 + }, + { + "epoch": 0.11527024455228851, + "grad_norm": 1.3754857778549194, + "learning_rate": 4.8378755329189294e-05, + "loss": 4.8421, + "step": 19382 + }, + { + "epoch": 0.11527619183556952, + "grad_norm": 1.5428962707519531, + "learning_rate": 4.837858985478203e-05, + "loss": 4.9472, + "step": 19383 + }, + { + "epoch": 0.1152821391188505, + "grad_norm": 1.45586097240448, + "learning_rate": 4.837842437221356e-05, + "loss": 4.874, + "step": 19384 + }, + { + "epoch": 0.1152880864021315, + "grad_norm": 1.5139529705047607, + "learning_rate": 4.837825888148391e-05, + "loss": 4.8867, + "step": 19385 + }, + { + "epoch": 0.11529403368541251, + "grad_norm": 1.6341979503631592, + "learning_rate": 4.837809338259315e-05, + "loss": 4.8476, + "step": 19386 + }, + { + "epoch": 0.1152999809686935, + "grad_norm": 1.45046865940094, + "learning_rate": 4.837792787554134e-05, + "loss": 5.0273, + "step": 19387 + }, + { + "epoch": 0.1153059282519745, + "grad_norm": 1.2840397357940674, + "learning_rate": 4.8377762360328547e-05, + "loss": 5.1717, + "step": 19388 + }, + { + "epoch": 0.1153118755352555, + "grad_norm": 1.4211467504501343, + "learning_rate": 4.8377596836954805e-05, + "loss": 5.021, + "step": 19389 + }, + { + "epoch": 0.11531782281853649, + "grad_norm": 1.3885877132415771, + "learning_rate": 4.837743130542019e-05, + "loss": 5.2158, + "step": 19390 + }, + { + "epoch": 0.11532377010181749, + "grad_norm": 1.2344088554382324, + "learning_rate": 4.837726576572476e-05, + "loss": 5.212, + "step": 19391 + }, + { + "epoch": 0.11532971738509849, + "grad_norm": 1.1903822422027588, + "learning_rate": 4.837710021786857e-05, + "loss": 5.3071, + "step": 19392 + }, + { + "epoch": 0.11533566466837948, + "grad_norm": 1.4263699054718018, + "learning_rate": 4.837693466185167e-05, + "loss": 5.1472, + "step": 19393 + }, + { + "epoch": 0.11534161195166048, + "grad_norm": 1.201027512550354, + "learning_rate": 4.837676909767412e-05, + "loss": 5.1779, + "step": 19394 + }, + { + "epoch": 0.11534755923494149, + "grad_norm": 1.2903262376785278, + "learning_rate": 4.8376603525335995e-05, + "loss": 5.038, + "step": 19395 + }, + { + "epoch": 0.11535350651822247, + "grad_norm": 1.3125475645065308, + "learning_rate": 4.837643794483733e-05, + "loss": 4.8948, + "step": 19396 + }, + { + "epoch": 0.11535945380150348, + "grad_norm": 1.1773933172225952, + "learning_rate": 4.837627235617819e-05, + "loss": 5.0854, + "step": 19397 + }, + { + "epoch": 0.11536540108478446, + "grad_norm": 1.2542996406555176, + "learning_rate": 4.837610675935864e-05, + "loss": 5.1329, + "step": 19398 + }, + { + "epoch": 0.11537134836806547, + "grad_norm": 1.1876561641693115, + "learning_rate": 4.837594115437873e-05, + "loss": 4.9757, + "step": 19399 + }, + { + "epoch": 0.11537729565134647, + "grad_norm": 1.2957814931869507, + "learning_rate": 4.837577554123852e-05, + "loss": 5.1203, + "step": 19400 + }, + { + "epoch": 0.11538324293462746, + "grad_norm": 1.2537682056427002, + "learning_rate": 4.837560991993807e-05, + "loss": 4.975, + "step": 19401 + }, + { + "epoch": 0.11538919021790846, + "grad_norm": 1.1898986101150513, + "learning_rate": 4.837544429047743e-05, + "loss": 4.9028, + "step": 19402 + }, + { + "epoch": 0.11539513750118946, + "grad_norm": 1.4129477739334106, + "learning_rate": 4.837527865285667e-05, + "loss": 4.7576, + "step": 19403 + }, + { + "epoch": 0.11540108478447045, + "grad_norm": 1.5386319160461426, + "learning_rate": 4.837511300707585e-05, + "loss": 4.9332, + "step": 19404 + }, + { + "epoch": 0.11540703206775145, + "grad_norm": 1.3597557544708252, + "learning_rate": 4.8374947353135e-05, + "loss": 4.8007, + "step": 19405 + }, + { + "epoch": 0.11541297935103245, + "grad_norm": 1.8251479864120483, + "learning_rate": 4.837478169103421e-05, + "loss": 5.048, + "step": 19406 + }, + { + "epoch": 0.11541892663431344, + "grad_norm": 1.488844871520996, + "learning_rate": 4.8374616020773523e-05, + "loss": 4.855, + "step": 19407 + }, + { + "epoch": 0.11542487391759444, + "grad_norm": 1.1640641689300537, + "learning_rate": 4.8374450342352996e-05, + "loss": 4.7714, + "step": 19408 + }, + { + "epoch": 0.11543082120087544, + "grad_norm": 1.1133109331130981, + "learning_rate": 4.8374284655772696e-05, + "loss": 4.849, + "step": 19409 + }, + { + "epoch": 0.11543676848415643, + "grad_norm": 1.2767143249511719, + "learning_rate": 4.837411896103266e-05, + "loss": 4.8078, + "step": 19410 + }, + { + "epoch": 0.11544271576743743, + "grad_norm": 1.2564034461975098, + "learning_rate": 4.837395325813298e-05, + "loss": 4.8602, + "step": 19411 + }, + { + "epoch": 0.11544866305071844, + "grad_norm": 1.2702561616897583, + "learning_rate": 4.837378754707369e-05, + "loss": 4.9148, + "step": 19412 + }, + { + "epoch": 0.11545461033399942, + "grad_norm": 1.1960140466690063, + "learning_rate": 4.8373621827854845e-05, + "loss": 4.9242, + "step": 19413 + }, + { + "epoch": 0.11546055761728043, + "grad_norm": 1.3663053512573242, + "learning_rate": 4.837345610047651e-05, + "loss": 4.9837, + "step": 19414 + }, + { + "epoch": 0.11546650490056143, + "grad_norm": 1.340897560119629, + "learning_rate": 4.837329036493875e-05, + "loss": 4.8059, + "step": 19415 + }, + { + "epoch": 0.11547245218384242, + "grad_norm": 1.326195478439331, + "learning_rate": 4.8373124621241616e-05, + "loss": 4.7115, + "step": 19416 + }, + { + "epoch": 0.11547839946712342, + "grad_norm": 1.2291951179504395, + "learning_rate": 4.837295886938516e-05, + "loss": 5.0075, + "step": 19417 + }, + { + "epoch": 0.11548434675040442, + "grad_norm": 1.3071776628494263, + "learning_rate": 4.837279310936945e-05, + "loss": 4.7839, + "step": 19418 + }, + { + "epoch": 0.11549029403368541, + "grad_norm": 1.4331681728363037, + "learning_rate": 4.837262734119453e-05, + "loss": 4.7494, + "step": 19419 + }, + { + "epoch": 0.11549624131696641, + "grad_norm": 1.4209895133972168, + "learning_rate": 4.837246156486048e-05, + "loss": 4.8538, + "step": 19420 + }, + { + "epoch": 0.11550218860024741, + "grad_norm": 1.2397242784500122, + "learning_rate": 4.837229578036734e-05, + "loss": 4.7616, + "step": 19421 + }, + { + "epoch": 0.1155081358835284, + "grad_norm": 1.2271560430526733, + "learning_rate": 4.837212998771517e-05, + "loss": 4.7361, + "step": 19422 + }, + { + "epoch": 0.1155140831668094, + "grad_norm": 1.3334344625473022, + "learning_rate": 4.837196418690403e-05, + "loss": 4.8971, + "step": 19423 + }, + { + "epoch": 0.1155200304500904, + "grad_norm": 1.3195756673812866, + "learning_rate": 4.837179837793398e-05, + "loss": 4.8944, + "step": 19424 + }, + { + "epoch": 0.1155259777333714, + "grad_norm": 1.4583542346954346, + "learning_rate": 4.837163256080508e-05, + "loss": 4.7857, + "step": 19425 + }, + { + "epoch": 0.1155319250166524, + "grad_norm": 1.5155558586120605, + "learning_rate": 4.837146673551739e-05, + "loss": 4.7728, + "step": 19426 + }, + { + "epoch": 0.1155378722999334, + "grad_norm": 1.3582627773284912, + "learning_rate": 4.837130090207095e-05, + "loss": 4.7065, + "step": 19427 + }, + { + "epoch": 0.11554381958321439, + "grad_norm": 1.2635151147842407, + "learning_rate": 4.837113506046584e-05, + "loss": 4.882, + "step": 19428 + }, + { + "epoch": 0.11554976686649539, + "grad_norm": 1.417083501815796, + "learning_rate": 4.83709692107021e-05, + "loss": 4.8928, + "step": 19429 + }, + { + "epoch": 0.11555571414977638, + "grad_norm": 1.4780973196029663, + "learning_rate": 4.8370803352779806e-05, + "loss": 4.9458, + "step": 19430 + }, + { + "epoch": 0.11556166143305738, + "grad_norm": 1.2949103116989136, + "learning_rate": 4.8370637486699e-05, + "loss": 4.8753, + "step": 19431 + }, + { + "epoch": 0.11556760871633838, + "grad_norm": 1.4755308628082275, + "learning_rate": 4.8370471612459744e-05, + "loss": 4.7886, + "step": 19432 + }, + { + "epoch": 0.11557355599961937, + "grad_norm": 1.4527158737182617, + "learning_rate": 4.8370305730062095e-05, + "loss": 4.8442, + "step": 19433 + }, + { + "epoch": 0.11557950328290037, + "grad_norm": 1.3422110080718994, + "learning_rate": 4.8370139839506124e-05, + "loss": 4.9745, + "step": 19434 + }, + { + "epoch": 0.11558545056618137, + "grad_norm": 1.5843584537506104, + "learning_rate": 4.836997394079187e-05, + "loss": 4.8432, + "step": 19435 + }, + { + "epoch": 0.11559139784946236, + "grad_norm": 1.3267780542373657, + "learning_rate": 4.836980803391941e-05, + "loss": 4.7816, + "step": 19436 + }, + { + "epoch": 0.11559734513274336, + "grad_norm": 1.3092966079711914, + "learning_rate": 4.836964211888878e-05, + "loss": 5.0283, + "step": 19437 + }, + { + "epoch": 0.11560329241602436, + "grad_norm": 1.4653512239456177, + "learning_rate": 4.836947619570005e-05, + "loss": 4.9265, + "step": 19438 + }, + { + "epoch": 0.11560923969930535, + "grad_norm": 1.344672441482544, + "learning_rate": 4.836931026435328e-05, + "loss": 5.0426, + "step": 19439 + }, + { + "epoch": 0.11561518698258635, + "grad_norm": 1.3949403762817383, + "learning_rate": 4.836914432484853e-05, + "loss": 5.1539, + "step": 19440 + }, + { + "epoch": 0.11562113426586736, + "grad_norm": 1.3876662254333496, + "learning_rate": 4.836897837718585e-05, + "loss": 4.9346, + "step": 19441 + }, + { + "epoch": 0.11562708154914834, + "grad_norm": 1.3399412631988525, + "learning_rate": 4.83688124213653e-05, + "loss": 4.8688, + "step": 19442 + }, + { + "epoch": 0.11563302883242935, + "grad_norm": 1.3819881677627563, + "learning_rate": 4.836864645738694e-05, + "loss": 4.9527, + "step": 19443 + }, + { + "epoch": 0.11563897611571035, + "grad_norm": 1.509074091911316, + "learning_rate": 4.8368480485250825e-05, + "loss": 4.9273, + "step": 19444 + }, + { + "epoch": 0.11564492339899134, + "grad_norm": 1.2591453790664673, + "learning_rate": 4.836831450495701e-05, + "loss": 4.9065, + "step": 19445 + }, + { + "epoch": 0.11565087068227234, + "grad_norm": 1.4065910577774048, + "learning_rate": 4.836814851650557e-05, + "loss": 4.9699, + "step": 19446 + }, + { + "epoch": 0.11565681796555334, + "grad_norm": 1.3355581760406494, + "learning_rate": 4.836798251989655e-05, + "loss": 5.1639, + "step": 19447 + }, + { + "epoch": 0.11566276524883433, + "grad_norm": 1.3715496063232422, + "learning_rate": 4.836781651513e-05, + "loss": 4.855, + "step": 19448 + }, + { + "epoch": 0.11566871253211533, + "grad_norm": 1.569305658340454, + "learning_rate": 4.836765050220599e-05, + "loss": 4.6329, + "step": 19449 + }, + { + "epoch": 0.11567465981539633, + "grad_norm": 1.3613293170928955, + "learning_rate": 4.836748448112458e-05, + "loss": 4.9897, + "step": 19450 + }, + { + "epoch": 0.11568060709867732, + "grad_norm": 1.2653577327728271, + "learning_rate": 4.836731845188581e-05, + "loss": 4.9819, + "step": 19451 + }, + { + "epoch": 0.11568655438195832, + "grad_norm": 1.5030022859573364, + "learning_rate": 4.836715241448976e-05, + "loss": 4.8387, + "step": 19452 + }, + { + "epoch": 0.11569250166523933, + "grad_norm": 1.2560715675354004, + "learning_rate": 4.836698636893647e-05, + "loss": 5.0862, + "step": 19453 + }, + { + "epoch": 0.11569844894852031, + "grad_norm": 1.1981379985809326, + "learning_rate": 4.836682031522602e-05, + "loss": 4.7682, + "step": 19454 + }, + { + "epoch": 0.11570439623180132, + "grad_norm": 1.3572615385055542, + "learning_rate": 4.8366654253358444e-05, + "loss": 4.9008, + "step": 19455 + }, + { + "epoch": 0.11571034351508232, + "grad_norm": 1.2542002201080322, + "learning_rate": 4.8366488183333816e-05, + "loss": 4.911, + "step": 19456 + }, + { + "epoch": 0.1157162907983633, + "grad_norm": 1.4759174585342407, + "learning_rate": 4.8366322105152186e-05, + "loss": 4.789, + "step": 19457 + }, + { + "epoch": 0.11572223808164431, + "grad_norm": 1.2307411432266235, + "learning_rate": 4.8366156018813616e-05, + "loss": 4.9556, + "step": 19458 + }, + { + "epoch": 0.1157281853649253, + "grad_norm": 1.240334153175354, + "learning_rate": 4.836598992431816e-05, + "loss": 4.9996, + "step": 19459 + }, + { + "epoch": 0.1157341326482063, + "grad_norm": 1.3100368976593018, + "learning_rate": 4.8365823821665876e-05, + "loss": 5.0693, + "step": 19460 + }, + { + "epoch": 0.1157400799314873, + "grad_norm": 1.0904709100723267, + "learning_rate": 4.8365657710856835e-05, + "loss": 5.0327, + "step": 19461 + }, + { + "epoch": 0.11574602721476829, + "grad_norm": 1.3847914934158325, + "learning_rate": 4.836549159189108e-05, + "loss": 5.0512, + "step": 19462 + }, + { + "epoch": 0.11575197449804929, + "grad_norm": 1.2307064533233643, + "learning_rate": 4.836532546476866e-05, + "loss": 5.0687, + "step": 19463 + }, + { + "epoch": 0.11575792178133029, + "grad_norm": 1.3900285959243774, + "learning_rate": 4.836515932948966e-05, + "loss": 5.1044, + "step": 19464 + }, + { + "epoch": 0.11576386906461128, + "grad_norm": 1.2194246053695679, + "learning_rate": 4.836499318605412e-05, + "loss": 5.0412, + "step": 19465 + }, + { + "epoch": 0.11576981634789228, + "grad_norm": 1.3460240364074707, + "learning_rate": 4.83648270344621e-05, + "loss": 5.14, + "step": 19466 + }, + { + "epoch": 0.11577576363117328, + "grad_norm": 1.2739115953445435, + "learning_rate": 4.8364660874713664e-05, + "loss": 5.0782, + "step": 19467 + }, + { + "epoch": 0.11578171091445427, + "grad_norm": 1.987092137336731, + "learning_rate": 4.836449470680887e-05, + "loss": 4.8106, + "step": 19468 + }, + { + "epoch": 0.11578765819773527, + "grad_norm": 1.3820792436599731, + "learning_rate": 4.8364328530747765e-05, + "loss": 5.3549, + "step": 19469 + }, + { + "epoch": 0.11579360548101628, + "grad_norm": 1.5276916027069092, + "learning_rate": 4.836416234653042e-05, + "loss": 5.3479, + "step": 19470 + }, + { + "epoch": 0.11579955276429726, + "grad_norm": 1.5292818546295166, + "learning_rate": 4.836399615415688e-05, + "loss": 5.2627, + "step": 19471 + }, + { + "epoch": 0.11580550004757827, + "grad_norm": 1.5759434700012207, + "learning_rate": 4.836382995362722e-05, + "loss": 5.2925, + "step": 19472 + }, + { + "epoch": 0.11581144733085927, + "grad_norm": 1.3807876110076904, + "learning_rate": 4.836366374494148e-05, + "loss": 5.0794, + "step": 19473 + }, + { + "epoch": 0.11581739461414026, + "grad_norm": 1.3631199598312378, + "learning_rate": 4.836349752809973e-05, + "loss": 5.0606, + "step": 19474 + }, + { + "epoch": 0.11582334189742126, + "grad_norm": 1.5250667333602905, + "learning_rate": 4.836333130310202e-05, + "loss": 5.1799, + "step": 19475 + }, + { + "epoch": 0.11582928918070226, + "grad_norm": 1.4191410541534424, + "learning_rate": 4.836316506994842e-05, + "loss": 5.2812, + "step": 19476 + }, + { + "epoch": 0.11583523646398325, + "grad_norm": 1.5502076148986816, + "learning_rate": 4.8362998828638975e-05, + "loss": 5.3503, + "step": 19477 + }, + { + "epoch": 0.11584118374726425, + "grad_norm": 1.441786766052246, + "learning_rate": 4.836283257917375e-05, + "loss": 5.1526, + "step": 19478 + }, + { + "epoch": 0.11584713103054525, + "grad_norm": 1.3994730710983276, + "learning_rate": 4.83626663215528e-05, + "loss": 5.1969, + "step": 19479 + }, + { + "epoch": 0.11585307831382624, + "grad_norm": 1.5141762495040894, + "learning_rate": 4.836250005577619e-05, + "loss": 5.099, + "step": 19480 + }, + { + "epoch": 0.11585902559710724, + "grad_norm": 1.4504029750823975, + "learning_rate": 4.836233378184397e-05, + "loss": 5.5225, + "step": 19481 + }, + { + "epoch": 0.11586497288038825, + "grad_norm": 1.3617264032363892, + "learning_rate": 4.8362167499756194e-05, + "loss": 5.3426, + "step": 19482 + }, + { + "epoch": 0.11587092016366923, + "grad_norm": 1.3681023120880127, + "learning_rate": 4.8362001209512934e-05, + "loss": 5.3476, + "step": 19483 + }, + { + "epoch": 0.11587686744695024, + "grad_norm": 1.050550937652588, + "learning_rate": 4.836183491111424e-05, + "loss": 5.1338, + "step": 19484 + }, + { + "epoch": 0.11588281473023124, + "grad_norm": 1.386715054512024, + "learning_rate": 4.836166860456017e-05, + "loss": 5.2761, + "step": 19485 + }, + { + "epoch": 0.11588876201351223, + "grad_norm": 1.2128262519836426, + "learning_rate": 4.836150228985078e-05, + "loss": 5.165, + "step": 19486 + }, + { + "epoch": 0.11589470929679323, + "grad_norm": 1.224721074104309, + "learning_rate": 4.836133596698614e-05, + "loss": 5.1631, + "step": 19487 + }, + { + "epoch": 0.11590065658007422, + "grad_norm": 1.2348668575286865, + "learning_rate": 4.8361169635966285e-05, + "loss": 5.3206, + "step": 19488 + }, + { + "epoch": 0.11590660386335522, + "grad_norm": 1.1665185689926147, + "learning_rate": 4.836100329679129e-05, + "loss": 5.3162, + "step": 19489 + }, + { + "epoch": 0.11591255114663622, + "grad_norm": 1.2063257694244385, + "learning_rate": 4.836083694946122e-05, + "loss": 5.0348, + "step": 19490 + }, + { + "epoch": 0.11591849842991721, + "grad_norm": 1.5199745893478394, + "learning_rate": 4.836067059397612e-05, + "loss": 5.0793, + "step": 19491 + }, + { + "epoch": 0.11592444571319821, + "grad_norm": 1.2285770177841187, + "learning_rate": 4.8360504230336044e-05, + "loss": 5.1478, + "step": 19492 + }, + { + "epoch": 0.11593039299647921, + "grad_norm": 1.3429020643234253, + "learning_rate": 4.836033785854107e-05, + "loss": 5.3225, + "step": 19493 + }, + { + "epoch": 0.1159363402797602, + "grad_norm": 1.3870415687561035, + "learning_rate": 4.836017147859123e-05, + "loss": 5.2711, + "step": 19494 + }, + { + "epoch": 0.1159422875630412, + "grad_norm": 1.3311539888381958, + "learning_rate": 4.8360005090486603e-05, + "loss": 5.1778, + "step": 19495 + }, + { + "epoch": 0.1159482348463222, + "grad_norm": 1.1331884860992432, + "learning_rate": 4.8359838694227236e-05, + "loss": 5.1435, + "step": 19496 + }, + { + "epoch": 0.11595418212960319, + "grad_norm": 1.427506685256958, + "learning_rate": 4.83596722898132e-05, + "loss": 5.2153, + "step": 19497 + }, + { + "epoch": 0.1159601294128842, + "grad_norm": 1.4716016054153442, + "learning_rate": 4.835950587724453e-05, + "loss": 4.9599, + "step": 19498 + }, + { + "epoch": 0.1159660766961652, + "grad_norm": 1.073724389076233, + "learning_rate": 4.8359339456521305e-05, + "loss": 5.3481, + "step": 19499 + }, + { + "epoch": 0.11597202397944618, + "grad_norm": 1.1965457201004028, + "learning_rate": 4.835917302764358e-05, + "loss": 5.128, + "step": 19500 + }, + { + "epoch": 0.11597797126272719, + "grad_norm": 1.2589031457901, + "learning_rate": 4.83590065906114e-05, + "loss": 5.1952, + "step": 19501 + }, + { + "epoch": 0.11598391854600819, + "grad_norm": 1.5062520503997803, + "learning_rate": 4.8358840145424835e-05, + "loss": 5.3431, + "step": 19502 + }, + { + "epoch": 0.11598986582928918, + "grad_norm": 1.3464981317520142, + "learning_rate": 4.8358673692083944e-05, + "loss": 5.187, + "step": 19503 + }, + { + "epoch": 0.11599581311257018, + "grad_norm": 1.195157766342163, + "learning_rate": 4.8358507230588776e-05, + "loss": 5.4018, + "step": 19504 + }, + { + "epoch": 0.11600176039585118, + "grad_norm": 1.185371994972229, + "learning_rate": 4.83583407609394e-05, + "loss": 5.3204, + "step": 19505 + }, + { + "epoch": 0.11600770767913217, + "grad_norm": 1.1011184453964233, + "learning_rate": 4.835817428313586e-05, + "loss": 5.2426, + "step": 19506 + }, + { + "epoch": 0.11601365496241317, + "grad_norm": 1.2706186771392822, + "learning_rate": 4.835800779717823e-05, + "loss": 5.3277, + "step": 19507 + }, + { + "epoch": 0.11601960224569417, + "grad_norm": 1.23444664478302, + "learning_rate": 4.8357841303066564e-05, + "loss": 5.304, + "step": 19508 + }, + { + "epoch": 0.11602554952897516, + "grad_norm": 1.3166215419769287, + "learning_rate": 4.8357674800800915e-05, + "loss": 5.1755, + "step": 19509 + }, + { + "epoch": 0.11603149681225616, + "grad_norm": 1.0634559392929077, + "learning_rate": 4.835750829038134e-05, + "loss": 5.2188, + "step": 19510 + }, + { + "epoch": 0.11603744409553716, + "grad_norm": 1.0847052335739136, + "learning_rate": 4.8357341771807894e-05, + "loss": 5.1993, + "step": 19511 + }, + { + "epoch": 0.11604339137881815, + "grad_norm": 1.2893394231796265, + "learning_rate": 4.8357175245080645e-05, + "loss": 5.278, + "step": 19512 + }, + { + "epoch": 0.11604933866209916, + "grad_norm": 1.1346744298934937, + "learning_rate": 4.8357008710199653e-05, + "loss": 5.0915, + "step": 19513 + }, + { + "epoch": 0.11605528594538016, + "grad_norm": 1.2405723333358765, + "learning_rate": 4.835684216716497e-05, + "loss": 5.3274, + "step": 19514 + }, + { + "epoch": 0.11606123322866115, + "grad_norm": 1.2367215156555176, + "learning_rate": 4.8356675615976646e-05, + "loss": 5.3145, + "step": 19515 + }, + { + "epoch": 0.11606718051194215, + "grad_norm": 1.23695969581604, + "learning_rate": 4.835650905663476e-05, + "loss": 5.1454, + "step": 19516 + }, + { + "epoch": 0.11607312779522314, + "grad_norm": 1.649644136428833, + "learning_rate": 4.835634248913935e-05, + "loss": 4.9684, + "step": 19517 + }, + { + "epoch": 0.11607907507850414, + "grad_norm": 1.3828257322311401, + "learning_rate": 4.835617591349049e-05, + "loss": 4.8913, + "step": 19518 + }, + { + "epoch": 0.11608502236178514, + "grad_norm": 1.4446587562561035, + "learning_rate": 4.8356009329688215e-05, + "loss": 4.9248, + "step": 19519 + }, + { + "epoch": 0.11609096964506613, + "grad_norm": 1.4149401187896729, + "learning_rate": 4.835584273773261e-05, + "loss": 5.0446, + "step": 19520 + }, + { + "epoch": 0.11609691692834713, + "grad_norm": 1.4073368310928345, + "learning_rate": 4.835567613762372e-05, + "loss": 5.1451, + "step": 19521 + }, + { + "epoch": 0.11610286421162813, + "grad_norm": 1.438539743423462, + "learning_rate": 4.835550952936161e-05, + "loss": 5.3629, + "step": 19522 + }, + { + "epoch": 0.11610881149490912, + "grad_norm": 1.4686654806137085, + "learning_rate": 4.835534291294632e-05, + "loss": 5.4386, + "step": 19523 + }, + { + "epoch": 0.11611475877819012, + "grad_norm": 1.3416131734848022, + "learning_rate": 4.835517628837793e-05, + "loss": 5.4625, + "step": 19524 + }, + { + "epoch": 0.11612070606147112, + "grad_norm": 1.38942551612854, + "learning_rate": 4.835500965565649e-05, + "loss": 5.2164, + "step": 19525 + }, + { + "epoch": 0.11612665334475211, + "grad_norm": 1.157583475112915, + "learning_rate": 4.835484301478205e-05, + "loss": 4.931, + "step": 19526 + }, + { + "epoch": 0.11613260062803311, + "grad_norm": 1.1182529926300049, + "learning_rate": 4.835467636575468e-05, + "loss": 5.0804, + "step": 19527 + }, + { + "epoch": 0.11613854791131412, + "grad_norm": 1.1087690591812134, + "learning_rate": 4.835450970857444e-05, + "loss": 4.9112, + "step": 19528 + }, + { + "epoch": 0.1161444951945951, + "grad_norm": 1.1217858791351318, + "learning_rate": 4.8354343043241374e-05, + "loss": 4.8775, + "step": 19529 + }, + { + "epoch": 0.1161504424778761, + "grad_norm": 1.703722596168518, + "learning_rate": 4.8354176369755556e-05, + "loss": 5.0991, + "step": 19530 + }, + { + "epoch": 0.11615638976115711, + "grad_norm": 1.5027599334716797, + "learning_rate": 4.8354009688117026e-05, + "loss": 5.3486, + "step": 19531 + }, + { + "epoch": 0.1161623370444381, + "grad_norm": 1.3976017236709595, + "learning_rate": 4.835384299832586e-05, + "loss": 5.3045, + "step": 19532 + }, + { + "epoch": 0.1161682843277191, + "grad_norm": 1.4341175556182861, + "learning_rate": 4.83536763003821e-05, + "loss": 5.2463, + "step": 19533 + }, + { + "epoch": 0.1161742316110001, + "grad_norm": 1.248632550239563, + "learning_rate": 4.835350959428582e-05, + "loss": 5.1573, + "step": 19534 + }, + { + "epoch": 0.11618017889428109, + "grad_norm": 1.2873725891113281, + "learning_rate": 4.835334288003707e-05, + "loss": 5.3115, + "step": 19535 + }, + { + "epoch": 0.11618612617756209, + "grad_norm": 1.4359512329101562, + "learning_rate": 4.835317615763591e-05, + "loss": 5.1134, + "step": 19536 + }, + { + "epoch": 0.11619207346084309, + "grad_norm": 1.3092215061187744, + "learning_rate": 4.8353009427082395e-05, + "loss": 5.2955, + "step": 19537 + }, + { + "epoch": 0.11619802074412408, + "grad_norm": 1.292256474494934, + "learning_rate": 4.8352842688376585e-05, + "loss": 5.2163, + "step": 19538 + }, + { + "epoch": 0.11620396802740508, + "grad_norm": 1.2327983379364014, + "learning_rate": 4.8352675941518545e-05, + "loss": 5.2785, + "step": 19539 + }, + { + "epoch": 0.11620991531068608, + "grad_norm": 1.3402459621429443, + "learning_rate": 4.835250918650832e-05, + "loss": 5.2474, + "step": 19540 + }, + { + "epoch": 0.11621586259396707, + "grad_norm": 1.4312702417373657, + "learning_rate": 4.835234242334598e-05, + "loss": 5.1451, + "step": 19541 + }, + { + "epoch": 0.11622180987724808, + "grad_norm": 1.4165308475494385, + "learning_rate": 4.8352175652031576e-05, + "loss": 5.2241, + "step": 19542 + }, + { + "epoch": 0.11622775716052908, + "grad_norm": 1.1984010934829712, + "learning_rate": 4.835200887256517e-05, + "loss": 5.2084, + "step": 19543 + }, + { + "epoch": 0.11623370444381007, + "grad_norm": 1.277029275894165, + "learning_rate": 4.835184208494682e-05, + "loss": 5.1136, + "step": 19544 + }, + { + "epoch": 0.11623965172709107, + "grad_norm": 1.4002219438552856, + "learning_rate": 4.8351675289176586e-05, + "loss": 5.1313, + "step": 19545 + }, + { + "epoch": 0.11624559901037206, + "grad_norm": 1.397129774093628, + "learning_rate": 4.835150848525452e-05, + "loss": 5.2001, + "step": 19546 + }, + { + "epoch": 0.11625154629365306, + "grad_norm": 1.3968653678894043, + "learning_rate": 4.8351341673180686e-05, + "loss": 5.1292, + "step": 19547 + }, + { + "epoch": 0.11625749357693406, + "grad_norm": 1.298600435256958, + "learning_rate": 4.8351174852955125e-05, + "loss": 5.1185, + "step": 19548 + }, + { + "epoch": 0.11626344086021505, + "grad_norm": 1.119382619857788, + "learning_rate": 4.835100802457793e-05, + "loss": 5.2052, + "step": 19549 + }, + { + "epoch": 0.11626938814349605, + "grad_norm": 1.2555358409881592, + "learning_rate": 4.835084118804913e-05, + "loss": 5.2604, + "step": 19550 + }, + { + "epoch": 0.11627533542677705, + "grad_norm": 1.293525218963623, + "learning_rate": 4.835067434336879e-05, + "loss": 5.1402, + "step": 19551 + }, + { + "epoch": 0.11628128271005804, + "grad_norm": 1.3321988582611084, + "learning_rate": 4.8350507490536976e-05, + "loss": 5.0959, + "step": 19552 + }, + { + "epoch": 0.11628722999333904, + "grad_norm": 1.3231252431869507, + "learning_rate": 4.835034062955374e-05, + "loss": 5.0461, + "step": 19553 + }, + { + "epoch": 0.11629317727662004, + "grad_norm": 1.2743831872940063, + "learning_rate": 4.835017376041914e-05, + "loss": 5.1215, + "step": 19554 + }, + { + "epoch": 0.11629912455990103, + "grad_norm": 1.3750208616256714, + "learning_rate": 4.835000688313323e-05, + "loss": 5.0459, + "step": 19555 + }, + { + "epoch": 0.11630507184318203, + "grad_norm": 1.394209861755371, + "learning_rate": 4.834983999769609e-05, + "loss": 5.1577, + "step": 19556 + }, + { + "epoch": 0.11631101912646304, + "grad_norm": 1.2393178939819336, + "learning_rate": 4.834967310410775e-05, + "loss": 5.1217, + "step": 19557 + }, + { + "epoch": 0.11631696640974402, + "grad_norm": 1.2668427228927612, + "learning_rate": 4.834950620236829e-05, + "loss": 5.0266, + "step": 19558 + }, + { + "epoch": 0.11632291369302503, + "grad_norm": 1.4088828563690186, + "learning_rate": 4.834933929247775e-05, + "loss": 4.8089, + "step": 19559 + }, + { + "epoch": 0.11632886097630603, + "grad_norm": 1.2668780088424683, + "learning_rate": 4.83491723744362e-05, + "loss": 5.2791, + "step": 19560 + }, + { + "epoch": 0.11633480825958702, + "grad_norm": 1.3243741989135742, + "learning_rate": 4.834900544824369e-05, + "loss": 5.1743, + "step": 19561 + }, + { + "epoch": 0.11634075554286802, + "grad_norm": 1.497856616973877, + "learning_rate": 4.834883851390029e-05, + "loss": 4.8667, + "step": 19562 + }, + { + "epoch": 0.11634670282614902, + "grad_norm": 1.426867961883545, + "learning_rate": 4.834867157140605e-05, + "loss": 4.9758, + "step": 19563 + }, + { + "epoch": 0.11635265010943001, + "grad_norm": 1.4427236318588257, + "learning_rate": 4.834850462076103e-05, + "loss": 5.45, + "step": 19564 + }, + { + "epoch": 0.11635859739271101, + "grad_norm": 1.4465901851654053, + "learning_rate": 4.834833766196528e-05, + "loss": 5.0877, + "step": 19565 + }, + { + "epoch": 0.11636454467599201, + "grad_norm": 1.76282799243927, + "learning_rate": 4.834817069501888e-05, + "loss": 5.0607, + "step": 19566 + }, + { + "epoch": 0.116370491959273, + "grad_norm": 1.4688469171524048, + "learning_rate": 4.8348003719921864e-05, + "loss": 4.9929, + "step": 19567 + }, + { + "epoch": 0.116376439242554, + "grad_norm": 1.576390266418457, + "learning_rate": 4.834783673667431e-05, + "loss": 5.7283, + "step": 19568 + }, + { + "epoch": 0.116382386525835, + "grad_norm": 1.517745852470398, + "learning_rate": 4.834766974527626e-05, + "loss": 5.3711, + "step": 19569 + }, + { + "epoch": 0.11638833380911599, + "grad_norm": 1.5122108459472656, + "learning_rate": 4.834750274572778e-05, + "loss": 5.6297, + "step": 19570 + }, + { + "epoch": 0.116394281092397, + "grad_norm": 1.9188055992126465, + "learning_rate": 4.8347335738028934e-05, + "loss": 5.0911, + "step": 19571 + }, + { + "epoch": 0.116400228375678, + "grad_norm": 1.7408324480056763, + "learning_rate": 4.834716872217977e-05, + "loss": 5.1396, + "step": 19572 + }, + { + "epoch": 0.11640617565895899, + "grad_norm": 1.7669044733047485, + "learning_rate": 4.834700169818035e-05, + "loss": 5.1463, + "step": 19573 + }, + { + "epoch": 0.11641212294223999, + "grad_norm": 1.7838845252990723, + "learning_rate": 4.834683466603074e-05, + "loss": 5.3486, + "step": 19574 + }, + { + "epoch": 0.11641807022552098, + "grad_norm": 1.8427141904830933, + "learning_rate": 4.834666762573098e-05, + "loss": 5.1454, + "step": 19575 + }, + { + "epoch": 0.11642401750880198, + "grad_norm": 1.8620864152908325, + "learning_rate": 4.8346500577281145e-05, + "loss": 4.9462, + "step": 19576 + }, + { + "epoch": 0.11642996479208298, + "grad_norm": 1.7334544658660889, + "learning_rate": 4.834633352068129e-05, + "loss": 4.9012, + "step": 19577 + }, + { + "epoch": 0.11643591207536397, + "grad_norm": 1.7202188968658447, + "learning_rate": 4.834616645593147e-05, + "loss": 5.2577, + "step": 19578 + }, + { + "epoch": 0.11644185935864497, + "grad_norm": 1.5666993856430054, + "learning_rate": 4.834599938303174e-05, + "loss": 4.9502, + "step": 19579 + }, + { + "epoch": 0.11644780664192597, + "grad_norm": 1.5880829095840454, + "learning_rate": 4.834583230198217e-05, + "loss": 5.1193, + "step": 19580 + }, + { + "epoch": 0.11645375392520696, + "grad_norm": 1.7851444482803345, + "learning_rate": 4.834566521278281e-05, + "loss": 5.1411, + "step": 19581 + }, + { + "epoch": 0.11645970120848796, + "grad_norm": 1.8817992210388184, + "learning_rate": 4.834549811543371e-05, + "loss": 5.1773, + "step": 19582 + }, + { + "epoch": 0.11646564849176896, + "grad_norm": 1.8055325746536255, + "learning_rate": 4.834533100993495e-05, + "loss": 4.8526, + "step": 19583 + }, + { + "epoch": 0.11647159577504995, + "grad_norm": 1.501705527305603, + "learning_rate": 4.834516389628657e-05, + "loss": 4.9943, + "step": 19584 + }, + { + "epoch": 0.11647754305833095, + "grad_norm": 1.8224765062332153, + "learning_rate": 4.8344996774488635e-05, + "loss": 5.3321, + "step": 19585 + }, + { + "epoch": 0.11648349034161196, + "grad_norm": 1.7806826829910278, + "learning_rate": 4.83448296445412e-05, + "loss": 5.1565, + "step": 19586 + }, + { + "epoch": 0.11648943762489294, + "grad_norm": 1.64619779586792, + "learning_rate": 4.8344662506444334e-05, + "loss": 4.9259, + "step": 19587 + }, + { + "epoch": 0.11649538490817395, + "grad_norm": 1.7176555395126343, + "learning_rate": 4.834449536019808e-05, + "loss": 4.9173, + "step": 19588 + }, + { + "epoch": 0.11650133219145495, + "grad_norm": 1.7485530376434326, + "learning_rate": 4.834432820580251e-05, + "loss": 4.9548, + "step": 19589 + }, + { + "epoch": 0.11650727947473594, + "grad_norm": 1.8407695293426514, + "learning_rate": 4.834416104325767e-05, + "loss": 5.5323, + "step": 19590 + }, + { + "epoch": 0.11651322675801694, + "grad_norm": 1.37450110912323, + "learning_rate": 4.834399387256363e-05, + "loss": 5.0058, + "step": 19591 + }, + { + "epoch": 0.11651917404129794, + "grad_norm": 1.6784085035324097, + "learning_rate": 4.834382669372044e-05, + "loss": 5.0886, + "step": 19592 + }, + { + "epoch": 0.11652512132457893, + "grad_norm": 1.9228695631027222, + "learning_rate": 4.834365950672816e-05, + "loss": 5.5382, + "step": 19593 + }, + { + "epoch": 0.11653106860785993, + "grad_norm": 1.7998968362808228, + "learning_rate": 4.834349231158685e-05, + "loss": 5.3286, + "step": 19594 + }, + { + "epoch": 0.11653701589114093, + "grad_norm": 1.9077783823013306, + "learning_rate": 4.8343325108296574e-05, + "loss": 4.9033, + "step": 19595 + }, + { + "epoch": 0.11654296317442192, + "grad_norm": 1.3677197694778442, + "learning_rate": 4.834315789685738e-05, + "loss": 5.4146, + "step": 19596 + }, + { + "epoch": 0.11654891045770292, + "grad_norm": 1.5490330457687378, + "learning_rate": 4.834299067726933e-05, + "loss": 5.8435, + "step": 19597 + }, + { + "epoch": 0.11655485774098392, + "grad_norm": 1.7260395288467407, + "learning_rate": 4.8342823449532484e-05, + "loss": 4.9687, + "step": 19598 + }, + { + "epoch": 0.11656080502426491, + "grad_norm": 1.5140855312347412, + "learning_rate": 4.83426562136469e-05, + "loss": 4.8185, + "step": 19599 + }, + { + "epoch": 0.11656675230754591, + "grad_norm": 1.7183781862258911, + "learning_rate": 4.834248896961263e-05, + "loss": 4.954, + "step": 19600 + }, + { + "epoch": 0.11657269959082692, + "grad_norm": 1.3909941911697388, + "learning_rate": 4.834232171742975e-05, + "loss": 5.3393, + "step": 19601 + }, + { + "epoch": 0.1165786468741079, + "grad_norm": 1.437046766281128, + "learning_rate": 4.83421544570983e-05, + "loss": 5.5486, + "step": 19602 + }, + { + "epoch": 0.11658459415738891, + "grad_norm": 1.4513304233551025, + "learning_rate": 4.8341987188618344e-05, + "loss": 5.6754, + "step": 19603 + }, + { + "epoch": 0.1165905414406699, + "grad_norm": 1.7366830110549927, + "learning_rate": 4.8341819911989936e-05, + "loss": 5.5651, + "step": 19604 + }, + { + "epoch": 0.1165964887239509, + "grad_norm": 1.7084081172943115, + "learning_rate": 4.834165262721315e-05, + "loss": 5.5237, + "step": 19605 + }, + { + "epoch": 0.1166024360072319, + "grad_norm": 1.588749647140503, + "learning_rate": 4.834148533428803e-05, + "loss": 5.5371, + "step": 19606 + }, + { + "epoch": 0.11660838329051289, + "grad_norm": 1.6907262802124023, + "learning_rate": 4.834131803321464e-05, + "loss": 5.3998, + "step": 19607 + }, + { + "epoch": 0.11661433057379389, + "grad_norm": 1.676530122756958, + "learning_rate": 4.834115072399304e-05, + "loss": 5.1636, + "step": 19608 + }, + { + "epoch": 0.11662027785707489, + "grad_norm": 1.6379070281982422, + "learning_rate": 4.834098340662327e-05, + "loss": 5.4196, + "step": 19609 + }, + { + "epoch": 0.11662622514035588, + "grad_norm": 1.6794102191925049, + "learning_rate": 4.8340816081105424e-05, + "loss": 5.3671, + "step": 19610 + }, + { + "epoch": 0.11663217242363688, + "grad_norm": 1.7833147048950195, + "learning_rate": 4.834064874743953e-05, + "loss": 5.3417, + "step": 19611 + }, + { + "epoch": 0.11663811970691788, + "grad_norm": 1.649409532546997, + "learning_rate": 4.834048140562566e-05, + "loss": 5.2781, + "step": 19612 + }, + { + "epoch": 0.11664406699019887, + "grad_norm": 1.6082829236984253, + "learning_rate": 4.834031405566387e-05, + "loss": 5.1188, + "step": 19613 + }, + { + "epoch": 0.11665001427347987, + "grad_norm": 1.6651804447174072, + "learning_rate": 4.834014669755421e-05, + "loss": 5.1683, + "step": 19614 + }, + { + "epoch": 0.11665596155676088, + "grad_norm": 1.715795636177063, + "learning_rate": 4.8339979331296755e-05, + "loss": 5.2491, + "step": 19615 + }, + { + "epoch": 0.11666190884004186, + "grad_norm": 1.6809749603271484, + "learning_rate": 4.8339811956891546e-05, + "loss": 5.0614, + "step": 19616 + }, + { + "epoch": 0.11666785612332287, + "grad_norm": 1.563790202140808, + "learning_rate": 4.833964457433865e-05, + "loss": 5.231, + "step": 19617 + }, + { + "epoch": 0.11667380340660387, + "grad_norm": 1.464647650718689, + "learning_rate": 4.8339477183638136e-05, + "loss": 5.0405, + "step": 19618 + }, + { + "epoch": 0.11667975068988486, + "grad_norm": 1.989701509475708, + "learning_rate": 4.8339309784790043e-05, + "loss": 5.4454, + "step": 19619 + }, + { + "epoch": 0.11668569797316586, + "grad_norm": 2.438558340072632, + "learning_rate": 4.833914237779444e-05, + "loss": 5.7298, + "step": 19620 + }, + { + "epoch": 0.11669164525644686, + "grad_norm": 1.7590994834899902, + "learning_rate": 4.833897496265139e-05, + "loss": 5.4473, + "step": 19621 + }, + { + "epoch": 0.11669759253972785, + "grad_norm": 2.1040074825286865, + "learning_rate": 4.833880753936093e-05, + "loss": 5.2399, + "step": 19622 + }, + { + "epoch": 0.11670353982300885, + "grad_norm": 1.7136433124542236, + "learning_rate": 4.8338640107923146e-05, + "loss": 5.21, + "step": 19623 + }, + { + "epoch": 0.11670948710628985, + "grad_norm": 1.5797784328460693, + "learning_rate": 4.8338472668338074e-05, + "loss": 5.3555, + "step": 19624 + }, + { + "epoch": 0.11671543438957084, + "grad_norm": 1.512645959854126, + "learning_rate": 4.833830522060579e-05, + "loss": 5.4964, + "step": 19625 + }, + { + "epoch": 0.11672138167285184, + "grad_norm": 1.9328651428222656, + "learning_rate": 4.833813776472634e-05, + "loss": 5.9072, + "step": 19626 + }, + { + "epoch": 0.11672732895613284, + "grad_norm": 1.882068395614624, + "learning_rate": 4.8337970300699795e-05, + "loss": 5.4304, + "step": 19627 + }, + { + "epoch": 0.11673327623941383, + "grad_norm": 2.1347815990448, + "learning_rate": 4.83378028285262e-05, + "loss": 5.1286, + "step": 19628 + }, + { + "epoch": 0.11673922352269483, + "grad_norm": 2.0237247943878174, + "learning_rate": 4.833763534820562e-05, + "loss": 5.113, + "step": 19629 + }, + { + "epoch": 0.11674517080597584, + "grad_norm": 1.5656205415725708, + "learning_rate": 4.833746785973811e-05, + "loss": 4.8452, + "step": 19630 + }, + { + "epoch": 0.11675111808925683, + "grad_norm": 2.268324613571167, + "learning_rate": 4.833730036312374e-05, + "loss": 5.7184, + "step": 19631 + }, + { + "epoch": 0.11675706537253783, + "grad_norm": 2.1705756187438965, + "learning_rate": 4.833713285836255e-05, + "loss": 5.6489, + "step": 19632 + }, + { + "epoch": 0.11676301265581882, + "grad_norm": 1.7976182699203491, + "learning_rate": 4.833696534545461e-05, + "loss": 5.7016, + "step": 19633 + }, + { + "epoch": 0.11676895993909982, + "grad_norm": 1.2853381633758545, + "learning_rate": 4.8336797824399976e-05, + "loss": 5.654, + "step": 19634 + }, + { + "epoch": 0.11677490722238082, + "grad_norm": 1.8741413354873657, + "learning_rate": 4.833663029519871e-05, + "loss": 5.6735, + "step": 19635 + }, + { + "epoch": 0.11678085450566181, + "grad_norm": 1.4911704063415527, + "learning_rate": 4.8336462757850864e-05, + "loss": 5.3877, + "step": 19636 + }, + { + "epoch": 0.11678680178894281, + "grad_norm": 1.7979151010513306, + "learning_rate": 4.8336295212356506e-05, + "loss": 5.5677, + "step": 19637 + }, + { + "epoch": 0.11679274907222381, + "grad_norm": 2.036970376968384, + "learning_rate": 4.8336127658715677e-05, + "loss": 5.4768, + "step": 19638 + }, + { + "epoch": 0.1167986963555048, + "grad_norm": 1.9423377513885498, + "learning_rate": 4.833596009692846e-05, + "loss": 5.4021, + "step": 19639 + }, + { + "epoch": 0.1168046436387858, + "grad_norm": 1.5860786437988281, + "learning_rate": 4.8335792526994894e-05, + "loss": 5.3363, + "step": 19640 + }, + { + "epoch": 0.1168105909220668, + "grad_norm": 1.5712209939956665, + "learning_rate": 4.833562494891504e-05, + "loss": 5.432, + "step": 19641 + }, + { + "epoch": 0.11681653820534779, + "grad_norm": 1.3889914751052856, + "learning_rate": 4.833545736268897e-05, + "loss": 5.3272, + "step": 19642 + }, + { + "epoch": 0.1168224854886288, + "grad_norm": 1.607134461402893, + "learning_rate": 4.8335289768316726e-05, + "loss": 5.9617, + "step": 19643 + }, + { + "epoch": 0.1168284327719098, + "grad_norm": 1.6738252639770508, + "learning_rate": 4.8335122165798376e-05, + "loss": 5.6361, + "step": 19644 + }, + { + "epoch": 0.11683438005519078, + "grad_norm": 1.6006174087524414, + "learning_rate": 4.8334954555133974e-05, + "loss": 5.7384, + "step": 19645 + }, + { + "epoch": 0.11684032733847179, + "grad_norm": 1.7018747329711914, + "learning_rate": 4.833478693632358e-05, + "loss": 5.0784, + "step": 19646 + }, + { + "epoch": 0.11684627462175279, + "grad_norm": 1.7542921304702759, + "learning_rate": 4.833461930936726e-05, + "loss": 5.2674, + "step": 19647 + }, + { + "epoch": 0.11685222190503378, + "grad_norm": 1.6434245109558105, + "learning_rate": 4.8334451674265055e-05, + "loss": 4.7117, + "step": 19648 + }, + { + "epoch": 0.11685816918831478, + "grad_norm": 1.7878485918045044, + "learning_rate": 4.8334284031017044e-05, + "loss": 4.8068, + "step": 19649 + }, + { + "epoch": 0.11686411647159578, + "grad_norm": 1.7029922008514404, + "learning_rate": 4.833411637962327e-05, + "loss": 4.9168, + "step": 19650 + }, + { + "epoch": 0.11687006375487677, + "grad_norm": 1.8004266023635864, + "learning_rate": 4.83339487200838e-05, + "loss": 4.9931, + "step": 19651 + }, + { + "epoch": 0.11687601103815777, + "grad_norm": 1.7843881845474243, + "learning_rate": 4.833378105239869e-05, + "loss": 5.0786, + "step": 19652 + }, + { + "epoch": 0.11688195832143877, + "grad_norm": 1.697993278503418, + "learning_rate": 4.833361337656799e-05, + "loss": 5.188, + "step": 19653 + }, + { + "epoch": 0.11688790560471976, + "grad_norm": 1.8484392166137695, + "learning_rate": 4.833344569259177e-05, + "loss": 5.4858, + "step": 19654 + }, + { + "epoch": 0.11689385288800076, + "grad_norm": 1.6850509643554688, + "learning_rate": 4.833327800047009e-05, + "loss": 5.7946, + "step": 19655 + }, + { + "epoch": 0.11689980017128176, + "grad_norm": 1.709845781326294, + "learning_rate": 4.8333110300203e-05, + "loss": 6.0674, + "step": 19656 + }, + { + "epoch": 0.11690574745456275, + "grad_norm": 1.6634660959243774, + "learning_rate": 4.833294259179057e-05, + "loss": 5.8038, + "step": 19657 + }, + { + "epoch": 0.11691169473784375, + "grad_norm": 1.6274930238723755, + "learning_rate": 4.833277487523283e-05, + "loss": 5.6752, + "step": 19658 + }, + { + "epoch": 0.11691764202112476, + "grad_norm": 1.5415219068527222, + "learning_rate": 4.833260715052988e-05, + "loss": 5.4002, + "step": 19659 + }, + { + "epoch": 0.11692358930440575, + "grad_norm": 1.6023998260498047, + "learning_rate": 4.833243941768175e-05, + "loss": 5.2429, + "step": 19660 + }, + { + "epoch": 0.11692953658768675, + "grad_norm": 1.4608384370803833, + "learning_rate": 4.8332271676688515e-05, + "loss": 5.5144, + "step": 19661 + }, + { + "epoch": 0.11693548387096774, + "grad_norm": 1.700076937675476, + "learning_rate": 4.833210392755021e-05, + "loss": 5.6356, + "step": 19662 + }, + { + "epoch": 0.11694143115424874, + "grad_norm": 1.415705919265747, + "learning_rate": 4.833193617026692e-05, + "loss": 5.6977, + "step": 19663 + }, + { + "epoch": 0.11694737843752974, + "grad_norm": 1.620815634727478, + "learning_rate": 4.833176840483868e-05, + "loss": 5.8967, + "step": 19664 + }, + { + "epoch": 0.11695332572081073, + "grad_norm": 1.4221736192703247, + "learning_rate": 4.833160063126558e-05, + "loss": 5.5351, + "step": 19665 + }, + { + "epoch": 0.11695927300409173, + "grad_norm": 1.460254192352295, + "learning_rate": 4.833143284954764e-05, + "loss": 5.327, + "step": 19666 + }, + { + "epoch": 0.11696522028737273, + "grad_norm": 1.8340283632278442, + "learning_rate": 4.833126505968495e-05, + "loss": 5.199, + "step": 19667 + }, + { + "epoch": 0.11697116757065372, + "grad_norm": 1.4036595821380615, + "learning_rate": 4.8331097261677555e-05, + "loss": 5.185, + "step": 19668 + }, + { + "epoch": 0.11697711485393472, + "grad_norm": 1.5454041957855225, + "learning_rate": 4.833092945552551e-05, + "loss": 5.3545, + "step": 19669 + }, + { + "epoch": 0.11698306213721572, + "grad_norm": 1.4965288639068604, + "learning_rate": 4.8330761641228886e-05, + "loss": 5.2993, + "step": 19670 + }, + { + "epoch": 0.11698900942049671, + "grad_norm": 2.4290192127227783, + "learning_rate": 4.833059381878773e-05, + "loss": 5.2738, + "step": 19671 + }, + { + "epoch": 0.11699495670377771, + "grad_norm": 2.502086877822876, + "learning_rate": 4.8330425988202097e-05, + "loss": 5.3218, + "step": 19672 + }, + { + "epoch": 0.11700090398705872, + "grad_norm": 2.1629221439361572, + "learning_rate": 4.833025814947206e-05, + "loss": 5.304, + "step": 19673 + }, + { + "epoch": 0.1170068512703397, + "grad_norm": 2.096604824066162, + "learning_rate": 4.8330090302597675e-05, + "loss": 5.3423, + "step": 19674 + }, + { + "epoch": 0.1170127985536207, + "grad_norm": 2.2843055725097656, + "learning_rate": 4.832992244757899e-05, + "loss": 5.2463, + "step": 19675 + }, + { + "epoch": 0.11701874583690171, + "grad_norm": 2.1538522243499756, + "learning_rate": 4.8329754584416074e-05, + "loss": 5.0529, + "step": 19676 + }, + { + "epoch": 0.1170246931201827, + "grad_norm": 1.763832688331604, + "learning_rate": 4.832958671310898e-05, + "loss": 5.105, + "step": 19677 + }, + { + "epoch": 0.1170306404034637, + "grad_norm": 2.048945426940918, + "learning_rate": 4.832941883365777e-05, + "loss": 5.1724, + "step": 19678 + }, + { + "epoch": 0.1170365876867447, + "grad_norm": 2.324202537536621, + "learning_rate": 4.83292509460625e-05, + "loss": 5.1574, + "step": 19679 + }, + { + "epoch": 0.11704253497002569, + "grad_norm": 2.447587728500366, + "learning_rate": 4.8329083050323235e-05, + "loss": 5.2401, + "step": 19680 + }, + { + "epoch": 0.11704848225330669, + "grad_norm": 2.212921380996704, + "learning_rate": 4.832891514644002e-05, + "loss": 5.1122, + "step": 19681 + }, + { + "epoch": 0.11705442953658769, + "grad_norm": 2.1183717250823975, + "learning_rate": 4.832874723441292e-05, + "loss": 4.985, + "step": 19682 + }, + { + "epoch": 0.11706037681986868, + "grad_norm": 2.1509101390838623, + "learning_rate": 4.8328579314242006e-05, + "loss": 5.1369, + "step": 19683 + }, + { + "epoch": 0.11706632410314968, + "grad_norm": 1.9071851968765259, + "learning_rate": 4.832841138592732e-05, + "loss": 5.0454, + "step": 19684 + }, + { + "epoch": 0.11707227138643068, + "grad_norm": 2.262612819671631, + "learning_rate": 4.8328243449468926e-05, + "loss": 5.0763, + "step": 19685 + }, + { + "epoch": 0.11707821866971167, + "grad_norm": 2.073665142059326, + "learning_rate": 4.8328075504866874e-05, + "loss": 5.0779, + "step": 19686 + }, + { + "epoch": 0.11708416595299267, + "grad_norm": 1.9270633459091187, + "learning_rate": 4.832790755212124e-05, + "loss": 4.8148, + "step": 19687 + }, + { + "epoch": 0.11709011323627368, + "grad_norm": 1.9167968034744263, + "learning_rate": 4.832773959123208e-05, + "loss": 4.8027, + "step": 19688 + }, + { + "epoch": 0.11709606051955466, + "grad_norm": 2.0495805740356445, + "learning_rate": 4.8327571622199444e-05, + "loss": 4.9483, + "step": 19689 + }, + { + "epoch": 0.11710200780283567, + "grad_norm": 2.203997850418091, + "learning_rate": 4.83274036450234e-05, + "loss": 5.1086, + "step": 19690 + }, + { + "epoch": 0.11710795508611666, + "grad_norm": 2.0023131370544434, + "learning_rate": 4.8327235659703984e-05, + "loss": 5.0601, + "step": 19691 + }, + { + "epoch": 0.11711390236939766, + "grad_norm": 2.3212523460388184, + "learning_rate": 4.832706766624128e-05, + "loss": 4.9391, + "step": 19692 + }, + { + "epoch": 0.11711984965267866, + "grad_norm": 2.2633869647979736, + "learning_rate": 4.8326899664635336e-05, + "loss": 5.0262, + "step": 19693 + }, + { + "epoch": 0.11712579693595965, + "grad_norm": 2.2608723640441895, + "learning_rate": 4.832673165488622e-05, + "loss": 4.9814, + "step": 19694 + }, + { + "epoch": 0.11713174421924065, + "grad_norm": 2.0270745754241943, + "learning_rate": 4.8326563636993975e-05, + "loss": 4.9321, + "step": 19695 + }, + { + "epoch": 0.11713769150252165, + "grad_norm": 2.1299290657043457, + "learning_rate": 4.832639561095867e-05, + "loss": 4.8248, + "step": 19696 + }, + { + "epoch": 0.11714363878580264, + "grad_norm": 2.1891887187957764, + "learning_rate": 4.8326227576780355e-05, + "loss": 4.963, + "step": 19697 + }, + { + "epoch": 0.11714958606908364, + "grad_norm": 2.35532546043396, + "learning_rate": 4.8326059534459114e-05, + "loss": 4.8617, + "step": 19698 + }, + { + "epoch": 0.11715553335236464, + "grad_norm": 2.215864658355713, + "learning_rate": 4.8325891483994964e-05, + "loss": 5.1467, + "step": 19699 + }, + { + "epoch": 0.11716148063564563, + "grad_norm": 1.7004871368408203, + "learning_rate": 4.8325723425387996e-05, + "loss": 4.8682, + "step": 19700 + }, + { + "epoch": 0.11716742791892663, + "grad_norm": 2.537426471710205, + "learning_rate": 4.832555535863826e-05, + "loss": 5.0373, + "step": 19701 + }, + { + "epoch": 0.11717337520220764, + "grad_norm": 2.3324837684631348, + "learning_rate": 4.832538728374581e-05, + "loss": 4.9261, + "step": 19702 + }, + { + "epoch": 0.11717932248548862, + "grad_norm": 2.107374906539917, + "learning_rate": 4.832521920071071e-05, + "loss": 5.0036, + "step": 19703 + }, + { + "epoch": 0.11718526976876963, + "grad_norm": 2.0933899879455566, + "learning_rate": 4.8325051109533024e-05, + "loss": 5.086, + "step": 19704 + }, + { + "epoch": 0.11719121705205063, + "grad_norm": 1.9250128269195557, + "learning_rate": 4.8324883010212794e-05, + "loss": 4.9056, + "step": 19705 + }, + { + "epoch": 0.11719716433533162, + "grad_norm": 2.0679538249969482, + "learning_rate": 4.832471490275009e-05, + "loss": 5.0291, + "step": 19706 + }, + { + "epoch": 0.11720311161861262, + "grad_norm": 2.1115055084228516, + "learning_rate": 4.8324546787144974e-05, + "loss": 4.8649, + "step": 19707 + }, + { + "epoch": 0.11720905890189362, + "grad_norm": 2.123899459838867, + "learning_rate": 4.832437866339749e-05, + "loss": 4.9011, + "step": 19708 + }, + { + "epoch": 0.11721500618517461, + "grad_norm": 2.2809536457061768, + "learning_rate": 4.832421053150772e-05, + "loss": 5.1844, + "step": 19709 + }, + { + "epoch": 0.11722095346845561, + "grad_norm": 2.04567551612854, + "learning_rate": 4.83240423914757e-05, + "loss": 4.8685, + "step": 19710 + }, + { + "epoch": 0.11722690075173661, + "grad_norm": 1.5762519836425781, + "learning_rate": 4.8323874243301495e-05, + "loss": 5.4069, + "step": 19711 + }, + { + "epoch": 0.1172328480350176, + "grad_norm": 1.719250202178955, + "learning_rate": 4.832370608698518e-05, + "loss": 5.6127, + "step": 19712 + }, + { + "epoch": 0.1172387953182986, + "grad_norm": 1.6808120012283325, + "learning_rate": 4.8323537922526785e-05, + "loss": 5.5401, + "step": 19713 + }, + { + "epoch": 0.1172447426015796, + "grad_norm": 1.6794480085372925, + "learning_rate": 4.832336974992639e-05, + "loss": 5.6679, + "step": 19714 + }, + { + "epoch": 0.11725068988486059, + "grad_norm": 1.7805535793304443, + "learning_rate": 4.832320156918405e-05, + "loss": 5.5025, + "step": 19715 + }, + { + "epoch": 0.1172566371681416, + "grad_norm": 2.1433472633361816, + "learning_rate": 4.832303338029982e-05, + "loss": 5.2425, + "step": 19716 + }, + { + "epoch": 0.1172625844514226, + "grad_norm": 1.5449565649032593, + "learning_rate": 4.832286518327376e-05, + "loss": 5.3278, + "step": 19717 + }, + { + "epoch": 0.11726853173470358, + "grad_norm": 1.7341786623001099, + "learning_rate": 4.832269697810592e-05, + "loss": 5.3393, + "step": 19718 + }, + { + "epoch": 0.11727447901798459, + "grad_norm": 1.4936028718948364, + "learning_rate": 4.832252876479638e-05, + "loss": 5.0499, + "step": 19719 + }, + { + "epoch": 0.11728042630126558, + "grad_norm": 1.7648371458053589, + "learning_rate": 4.832236054334518e-05, + "loss": 5.3585, + "step": 19720 + }, + { + "epoch": 0.11728637358454658, + "grad_norm": 1.8131940364837646, + "learning_rate": 4.832219231375238e-05, + "loss": 5.2496, + "step": 19721 + }, + { + "epoch": 0.11729232086782758, + "grad_norm": 1.5939579010009766, + "learning_rate": 4.832202407601806e-05, + "loss": 5.2294, + "step": 19722 + }, + { + "epoch": 0.11729826815110857, + "grad_norm": 1.6752222776412964, + "learning_rate": 4.832185583014225e-05, + "loss": 5.2679, + "step": 19723 + }, + { + "epoch": 0.11730421543438957, + "grad_norm": 1.4784640073776245, + "learning_rate": 4.832168757612502e-05, + "loss": 5.1567, + "step": 19724 + }, + { + "epoch": 0.11731016271767057, + "grad_norm": 1.5112851858139038, + "learning_rate": 4.8321519313966436e-05, + "loss": 5.0304, + "step": 19725 + }, + { + "epoch": 0.11731611000095156, + "grad_norm": 1.5895473957061768, + "learning_rate": 4.832135104366654e-05, + "loss": 5.0681, + "step": 19726 + }, + { + "epoch": 0.11732205728423256, + "grad_norm": 1.510641098022461, + "learning_rate": 4.832118276522541e-05, + "loss": 5.0667, + "step": 19727 + }, + { + "epoch": 0.11732800456751356, + "grad_norm": 1.7403017282485962, + "learning_rate": 4.83210144786431e-05, + "loss": 4.9199, + "step": 19728 + }, + { + "epoch": 0.11733395185079455, + "grad_norm": 2.239452600479126, + "learning_rate": 4.832084618391966e-05, + "loss": 5.2846, + "step": 19729 + }, + { + "epoch": 0.11733989913407555, + "grad_norm": 1.977001428604126, + "learning_rate": 4.8320677881055154e-05, + "loss": 4.9573, + "step": 19730 + }, + { + "epoch": 0.11734584641735656, + "grad_norm": 2.2819485664367676, + "learning_rate": 4.8320509570049633e-05, + "loss": 4.6549, + "step": 19731 + }, + { + "epoch": 0.11735179370063754, + "grad_norm": 2.3943941593170166, + "learning_rate": 4.832034125090317e-05, + "loss": 4.8411, + "step": 19732 + }, + { + "epoch": 0.11735774098391855, + "grad_norm": 2.5439767837524414, + "learning_rate": 4.832017292361582e-05, + "loss": 4.7305, + "step": 19733 + }, + { + "epoch": 0.11736368826719955, + "grad_norm": 2.21797251701355, + "learning_rate": 4.8320004588187636e-05, + "loss": 4.8963, + "step": 19734 + }, + { + "epoch": 0.11736963555048054, + "grad_norm": 1.9822254180908203, + "learning_rate": 4.831983624461868e-05, + "loss": 4.8062, + "step": 19735 + }, + { + "epoch": 0.11737558283376154, + "grad_norm": 2.56172513961792, + "learning_rate": 4.8319667892909004e-05, + "loss": 4.6495, + "step": 19736 + }, + { + "epoch": 0.11738153011704254, + "grad_norm": 2.3328988552093506, + "learning_rate": 4.831949953305868e-05, + "loss": 4.3587, + "step": 19737 + }, + { + "epoch": 0.11738747740032353, + "grad_norm": 2.4720728397369385, + "learning_rate": 4.831933116506775e-05, + "loss": 4.5648, + "step": 19738 + }, + { + "epoch": 0.11739342468360453, + "grad_norm": 2.3738696575164795, + "learning_rate": 4.831916278893629e-05, + "loss": 4.391, + "step": 19739 + }, + { + "epoch": 0.11739937196688553, + "grad_norm": 2.400050640106201, + "learning_rate": 4.831899440466435e-05, + "loss": 4.5792, + "step": 19740 + }, + { + "epoch": 0.11740531925016652, + "grad_norm": 1.7596909999847412, + "learning_rate": 4.831882601225199e-05, + "loss": 4.8026, + "step": 19741 + }, + { + "epoch": 0.11741126653344752, + "grad_norm": 2.2190558910369873, + "learning_rate": 4.831865761169927e-05, + "loss": 4.578, + "step": 19742 + }, + { + "epoch": 0.11741721381672852, + "grad_norm": 2.468982458114624, + "learning_rate": 4.831848920300624e-05, + "loss": 4.3132, + "step": 19743 + }, + { + "epoch": 0.11742316110000951, + "grad_norm": 2.1495306491851807, + "learning_rate": 4.831832078617298e-05, + "loss": 4.5307, + "step": 19744 + }, + { + "epoch": 0.11742910838329051, + "grad_norm": 2.2298312187194824, + "learning_rate": 4.831815236119953e-05, + "loss": 4.3435, + "step": 19745 + }, + { + "epoch": 0.11743505566657152, + "grad_norm": 2.0968551635742188, + "learning_rate": 4.831798392808595e-05, + "loss": 4.4348, + "step": 19746 + }, + { + "epoch": 0.1174410029498525, + "grad_norm": 2.2520592212677, + "learning_rate": 4.831781548683231e-05, + "loss": 4.4347, + "step": 19747 + }, + { + "epoch": 0.1174469502331335, + "grad_norm": 2.5319058895111084, + "learning_rate": 4.8317647037438655e-05, + "loss": 4.3817, + "step": 19748 + }, + { + "epoch": 0.1174528975164145, + "grad_norm": 2.186539649963379, + "learning_rate": 4.8317478579905054e-05, + "loss": 4.6415, + "step": 19749 + }, + { + "epoch": 0.1174588447996955, + "grad_norm": 2.472963571548462, + "learning_rate": 4.8317310114231554e-05, + "loss": 4.4495, + "step": 19750 + }, + { + "epoch": 0.1174647920829765, + "grad_norm": 2.3692901134490967, + "learning_rate": 4.831714164041823e-05, + "loss": 4.3571, + "step": 19751 + }, + { + "epoch": 0.11747073936625749, + "grad_norm": 1.8001717329025269, + "learning_rate": 4.831697315846513e-05, + "loss": 5.3843, + "step": 19752 + }, + { + "epoch": 0.11747668664953849, + "grad_norm": 1.6087725162506104, + "learning_rate": 4.8316804668372315e-05, + "loss": 5.7155, + "step": 19753 + }, + { + "epoch": 0.11748263393281949, + "grad_norm": 1.5348961353302002, + "learning_rate": 4.8316636170139845e-05, + "loss": 4.8697, + "step": 19754 + }, + { + "epoch": 0.11748858121610048, + "grad_norm": 1.790076494216919, + "learning_rate": 4.831646766376778e-05, + "loss": 5.708, + "step": 19755 + }, + { + "epoch": 0.11749452849938148, + "grad_norm": 1.8615236282348633, + "learning_rate": 4.831629914925617e-05, + "loss": 5.3669, + "step": 19756 + }, + { + "epoch": 0.11750047578266248, + "grad_norm": 1.5969476699829102, + "learning_rate": 4.8316130626605096e-05, + "loss": 5.4041, + "step": 19757 + }, + { + "epoch": 0.11750642306594347, + "grad_norm": 1.5471712350845337, + "learning_rate": 4.8315962095814584e-05, + "loss": 5.5293, + "step": 19758 + }, + { + "epoch": 0.11751237034922447, + "grad_norm": 1.6281818151474, + "learning_rate": 4.831579355688472e-05, + "loss": 5.51, + "step": 19759 + }, + { + "epoch": 0.11751831763250548, + "grad_norm": 1.5264689922332764, + "learning_rate": 4.831562500981555e-05, + "loss": 4.9906, + "step": 19760 + }, + { + "epoch": 0.11752426491578646, + "grad_norm": 1.8446382284164429, + "learning_rate": 4.8315456454607145e-05, + "loss": 4.8351, + "step": 19761 + }, + { + "epoch": 0.11753021219906747, + "grad_norm": 2.0462918281555176, + "learning_rate": 4.8315287891259545e-05, + "loss": 4.7906, + "step": 19762 + }, + { + "epoch": 0.11753615948234847, + "grad_norm": 1.664975643157959, + "learning_rate": 4.831511931977282e-05, + "loss": 5.4149, + "step": 19763 + }, + { + "epoch": 0.11754210676562946, + "grad_norm": 1.8824998140335083, + "learning_rate": 4.831495074014703e-05, + "loss": 5.2587, + "step": 19764 + }, + { + "epoch": 0.11754805404891046, + "grad_norm": 1.6167455911636353, + "learning_rate": 4.8314782152382235e-05, + "loss": 5.3213, + "step": 19765 + }, + { + "epoch": 0.11755400133219146, + "grad_norm": 1.686562180519104, + "learning_rate": 4.831461355647848e-05, + "loss": 5.3497, + "step": 19766 + }, + { + "epoch": 0.11755994861547245, + "grad_norm": 1.7332249879837036, + "learning_rate": 4.831444495243584e-05, + "loss": 5.3139, + "step": 19767 + }, + { + "epoch": 0.11756589589875345, + "grad_norm": 1.6482213735580444, + "learning_rate": 4.8314276340254375e-05, + "loss": 5.5488, + "step": 19768 + }, + { + "epoch": 0.11757184318203445, + "grad_norm": 1.6714067459106445, + "learning_rate": 4.8314107719934134e-05, + "loss": 4.7354, + "step": 19769 + }, + { + "epoch": 0.11757779046531544, + "grad_norm": 1.5826655626296997, + "learning_rate": 4.8313939091475166e-05, + "loss": 5.5232, + "step": 19770 + }, + { + "epoch": 0.11758373774859644, + "grad_norm": 1.4177565574645996, + "learning_rate": 4.831377045487756e-05, + "loss": 5.4262, + "step": 19771 + }, + { + "epoch": 0.11758968503187744, + "grad_norm": 1.4056715965270996, + "learning_rate": 4.831360181014135e-05, + "loss": 5.6306, + "step": 19772 + }, + { + "epoch": 0.11759563231515843, + "grad_norm": 1.7903814315795898, + "learning_rate": 4.83134331572666e-05, + "loss": 4.5016, + "step": 19773 + }, + { + "epoch": 0.11760157959843943, + "grad_norm": 1.8719782829284668, + "learning_rate": 4.831326449625337e-05, + "loss": 4.3561, + "step": 19774 + }, + { + "epoch": 0.11760752688172044, + "grad_norm": 2.0182130336761475, + "learning_rate": 4.831309582710173e-05, + "loss": 4.3988, + "step": 19775 + }, + { + "epoch": 0.11761347416500142, + "grad_norm": 1.828475832939148, + "learning_rate": 4.8312927149811726e-05, + "loss": 4.4127, + "step": 19776 + }, + { + "epoch": 0.11761942144828243, + "grad_norm": 1.8332375288009644, + "learning_rate": 4.831275846438341e-05, + "loss": 4.3285, + "step": 19777 + }, + { + "epoch": 0.11762536873156341, + "grad_norm": 1.7542626857757568, + "learning_rate": 4.831258977081686e-05, + "loss": 5.4412, + "step": 19778 + }, + { + "epoch": 0.11763131601484442, + "grad_norm": 1.9277591705322266, + "learning_rate": 4.831242106911212e-05, + "loss": 4.1537, + "step": 19779 + }, + { + "epoch": 0.11763726329812542, + "grad_norm": 1.943296194076538, + "learning_rate": 4.8312252359269265e-05, + "loss": 4.448, + "step": 19780 + }, + { + "epoch": 0.11764321058140641, + "grad_norm": 1.8032363653182983, + "learning_rate": 4.831208364128834e-05, + "loss": 4.9847, + "step": 19781 + }, + { + "epoch": 0.11764915786468741, + "grad_norm": 1.9383130073547363, + "learning_rate": 4.83119149151694e-05, + "loss": 4.7231, + "step": 19782 + }, + { + "epoch": 0.11765510514796841, + "grad_norm": 1.8854987621307373, + "learning_rate": 4.831174618091252e-05, + "loss": 4.1493, + "step": 19783 + }, + { + "epoch": 0.1176610524312494, + "grad_norm": 1.932180404663086, + "learning_rate": 4.831157743851775e-05, + "loss": 4.0519, + "step": 19784 + }, + { + "epoch": 0.1176669997145304, + "grad_norm": 1.885292887687683, + "learning_rate": 4.831140868798514e-05, + "loss": 4.1593, + "step": 19785 + }, + { + "epoch": 0.1176729469978114, + "grad_norm": 1.8257746696472168, + "learning_rate": 4.8311239929314764e-05, + "loss": 4.3896, + "step": 19786 + }, + { + "epoch": 0.11767889428109239, + "grad_norm": 1.9383732080459595, + "learning_rate": 4.831107116250667e-05, + "loss": 4.1973, + "step": 19787 + }, + { + "epoch": 0.1176848415643734, + "grad_norm": 1.9942466020584106, + "learning_rate": 4.831090238756093e-05, + "loss": 4.3542, + "step": 19788 + }, + { + "epoch": 0.1176907888476544, + "grad_norm": 1.5551074743270874, + "learning_rate": 4.831073360447759e-05, + "loss": 4.9338, + "step": 19789 + }, + { + "epoch": 0.11769673613093538, + "grad_norm": 1.5898525714874268, + "learning_rate": 4.831056481325672e-05, + "loss": 4.8582, + "step": 19790 + }, + { + "epoch": 0.11770268341421639, + "grad_norm": 1.7175228595733643, + "learning_rate": 4.831039601389836e-05, + "loss": 4.6618, + "step": 19791 + }, + { + "epoch": 0.11770863069749739, + "grad_norm": 2.3165528774261475, + "learning_rate": 4.8310227206402594e-05, + "loss": 4.8579, + "step": 19792 + }, + { + "epoch": 0.11771457798077838, + "grad_norm": 1.4406440258026123, + "learning_rate": 4.8310058390769464e-05, + "loss": 5.6443, + "step": 19793 + }, + { + "epoch": 0.11772052526405938, + "grad_norm": 1.6670812368392944, + "learning_rate": 4.8309889566999037e-05, + "loss": 5.2096, + "step": 19794 + }, + { + "epoch": 0.11772647254734038, + "grad_norm": 1.6150201559066772, + "learning_rate": 4.8309720735091354e-05, + "loss": 5.2055, + "step": 19795 + }, + { + "epoch": 0.11773241983062137, + "grad_norm": 1.7714163064956665, + "learning_rate": 4.83095518950465e-05, + "loss": 5.9145, + "step": 19796 + }, + { + "epoch": 0.11773836711390237, + "grad_norm": 1.3608043193817139, + "learning_rate": 4.8309383046864526e-05, + "loss": 5.1546, + "step": 19797 + }, + { + "epoch": 0.11774431439718337, + "grad_norm": 1.2962807416915894, + "learning_rate": 4.830921419054548e-05, + "loss": 5.3574, + "step": 19798 + }, + { + "epoch": 0.11775026168046436, + "grad_norm": 2.0007364749908447, + "learning_rate": 4.8309045326089434e-05, + "loss": 5.0939, + "step": 19799 + }, + { + "epoch": 0.11775620896374536, + "grad_norm": 1.6526695489883423, + "learning_rate": 4.830887645349644e-05, + "loss": 5.7498, + "step": 19800 + }, + { + "epoch": 0.11776215624702636, + "grad_norm": 1.4990460872650146, + "learning_rate": 4.830870757276655e-05, + "loss": 5.2728, + "step": 19801 + }, + { + "epoch": 0.11776810353030735, + "grad_norm": 2.182511806488037, + "learning_rate": 4.830853868389984e-05, + "loss": 5.1598, + "step": 19802 + }, + { + "epoch": 0.11777405081358835, + "grad_norm": 2.515284538269043, + "learning_rate": 4.8308369786896354e-05, + "loss": 5.1378, + "step": 19803 + }, + { + "epoch": 0.11777999809686936, + "grad_norm": 1.9783490896224976, + "learning_rate": 4.830820088175616e-05, + "loss": 4.9242, + "step": 19804 + }, + { + "epoch": 0.11778594538015034, + "grad_norm": 1.790901780128479, + "learning_rate": 4.8308031968479315e-05, + "loss": 5.1156, + "step": 19805 + }, + { + "epoch": 0.11779189266343135, + "grad_norm": 1.751846432685852, + "learning_rate": 4.830786304706587e-05, + "loss": 5.2306, + "step": 19806 + }, + { + "epoch": 0.11779783994671233, + "grad_norm": 1.588497519493103, + "learning_rate": 4.83076941175159e-05, + "loss": 5.3987, + "step": 19807 + }, + { + "epoch": 0.11780378722999334, + "grad_norm": 1.9150582551956177, + "learning_rate": 4.830752517982945e-05, + "loss": 4.977, + "step": 19808 + }, + { + "epoch": 0.11780973451327434, + "grad_norm": 1.706708312034607, + "learning_rate": 4.8307356234006584e-05, + "loss": 5.0455, + "step": 19809 + }, + { + "epoch": 0.11781568179655533, + "grad_norm": 1.9373780488967896, + "learning_rate": 4.830718728004736e-05, + "loss": 5.0547, + "step": 19810 + }, + { + "epoch": 0.11782162907983633, + "grad_norm": 1.6948046684265137, + "learning_rate": 4.830701831795184e-05, + "loss": 5.0943, + "step": 19811 + }, + { + "epoch": 0.11782757636311733, + "grad_norm": 1.630083680152893, + "learning_rate": 4.8306849347720087e-05, + "loss": 5.6369, + "step": 19812 + }, + { + "epoch": 0.11783352364639832, + "grad_norm": 1.4906461238861084, + "learning_rate": 4.830668036935214e-05, + "loss": 5.2921, + "step": 19813 + }, + { + "epoch": 0.11783947092967932, + "grad_norm": 1.6434717178344727, + "learning_rate": 4.8306511382848076e-05, + "loss": 5.3473, + "step": 19814 + }, + { + "epoch": 0.11784541821296032, + "grad_norm": 1.5606834888458252, + "learning_rate": 4.8306342388207956e-05, + "loss": 5.3031, + "step": 19815 + }, + { + "epoch": 0.11785136549624131, + "grad_norm": 2.157352924346924, + "learning_rate": 4.830617338543183e-05, + "loss": 4.4939, + "step": 19816 + }, + { + "epoch": 0.11785731277952231, + "grad_norm": 2.49686598777771, + "learning_rate": 4.830600437451975e-05, + "loss": 4.506, + "step": 19817 + }, + { + "epoch": 0.11786326006280332, + "grad_norm": 1.943969964981079, + "learning_rate": 4.830583535547179e-05, + "loss": 4.411, + "step": 19818 + }, + { + "epoch": 0.1178692073460843, + "grad_norm": 1.9092329740524292, + "learning_rate": 4.830566632828801e-05, + "loss": 4.4121, + "step": 19819 + }, + { + "epoch": 0.1178751546293653, + "grad_norm": 1.7568551301956177, + "learning_rate": 4.830549729296846e-05, + "loss": 4.317, + "step": 19820 + }, + { + "epoch": 0.11788110191264631, + "grad_norm": 1.788150429725647, + "learning_rate": 4.83053282495132e-05, + "loss": 4.2928, + "step": 19821 + }, + { + "epoch": 0.1178870491959273, + "grad_norm": 1.9792863130569458, + "learning_rate": 4.830515919792229e-05, + "loss": 4.3219, + "step": 19822 + }, + { + "epoch": 0.1178929964792083, + "grad_norm": 2.2407681941986084, + "learning_rate": 4.8304990138195795e-05, + "loss": 4.296, + "step": 19823 + }, + { + "epoch": 0.1178989437624893, + "grad_norm": 1.993288516998291, + "learning_rate": 4.830482107033377e-05, + "loss": 4.2922, + "step": 19824 + }, + { + "epoch": 0.11790489104577029, + "grad_norm": 2.1966097354888916, + "learning_rate": 4.8304651994336264e-05, + "loss": 4.1215, + "step": 19825 + }, + { + "epoch": 0.11791083832905129, + "grad_norm": 1.569989562034607, + "learning_rate": 4.8304482910203345e-05, + "loss": 5.5432, + "step": 19826 + }, + { + "epoch": 0.11791678561233229, + "grad_norm": 1.522828459739685, + "learning_rate": 4.8304313817935075e-05, + "loss": 5.465, + "step": 19827 + }, + { + "epoch": 0.11792273289561328, + "grad_norm": 1.9455969333648682, + "learning_rate": 4.830414471753151e-05, + "loss": 5.1462, + "step": 19828 + }, + { + "epoch": 0.11792868017889428, + "grad_norm": 1.8587162494659424, + "learning_rate": 4.830397560899271e-05, + "loss": 5.1987, + "step": 19829 + }, + { + "epoch": 0.11793462746217528, + "grad_norm": 2.1671674251556396, + "learning_rate": 4.830380649231873e-05, + "loss": 5.3333, + "step": 19830 + }, + { + "epoch": 0.11794057474545627, + "grad_norm": 1.8267066478729248, + "learning_rate": 4.8303637367509636e-05, + "loss": 5.5306, + "step": 19831 + }, + { + "epoch": 0.11794652202873727, + "grad_norm": 1.80419921875, + "learning_rate": 4.830346823456548e-05, + "loss": 5.3077, + "step": 19832 + }, + { + "epoch": 0.11795246931201828, + "grad_norm": 1.9116721153259277, + "learning_rate": 4.830329909348632e-05, + "loss": 4.8531, + "step": 19833 + }, + { + "epoch": 0.11795841659529926, + "grad_norm": 1.9208347797393799, + "learning_rate": 4.830312994427223e-05, + "loss": 4.9645, + "step": 19834 + }, + { + "epoch": 0.11796436387858027, + "grad_norm": 1.8385374546051025, + "learning_rate": 4.8302960786923246e-05, + "loss": 4.7095, + "step": 19835 + }, + { + "epoch": 0.11797031116186125, + "grad_norm": 1.9271587133407593, + "learning_rate": 4.830279162143945e-05, + "loss": 4.5788, + "step": 19836 + }, + { + "epoch": 0.11797625844514226, + "grad_norm": 2.0168333053588867, + "learning_rate": 4.8302622447820885e-05, + "loss": 4.7595, + "step": 19837 + }, + { + "epoch": 0.11798220572842326, + "grad_norm": 1.9674837589263916, + "learning_rate": 4.8302453266067616e-05, + "loss": 4.674, + "step": 19838 + }, + { + "epoch": 0.11798815301170425, + "grad_norm": 1.944601058959961, + "learning_rate": 4.830228407617969e-05, + "loss": 4.6683, + "step": 19839 + }, + { + "epoch": 0.11799410029498525, + "grad_norm": 1.8970340490341187, + "learning_rate": 4.83021148781572e-05, + "loss": 5.2577, + "step": 19840 + }, + { + "epoch": 0.11800004757826625, + "grad_norm": 2.035505533218384, + "learning_rate": 4.8301945672000164e-05, + "loss": 4.7872, + "step": 19841 + }, + { + "epoch": 0.11800599486154724, + "grad_norm": 2.4211058616638184, + "learning_rate": 4.830177645770867e-05, + "loss": 4.9424, + "step": 19842 + }, + { + "epoch": 0.11801194214482824, + "grad_norm": 2.080132484436035, + "learning_rate": 4.830160723528276e-05, + "loss": 4.7908, + "step": 19843 + }, + { + "epoch": 0.11801788942810924, + "grad_norm": 3.5975728034973145, + "learning_rate": 4.83014380047225e-05, + "loss": 5.3434, + "step": 19844 + }, + { + "epoch": 0.11802383671139023, + "grad_norm": 1.6917449235916138, + "learning_rate": 4.830126876602795e-05, + "loss": 5.2593, + "step": 19845 + }, + { + "epoch": 0.11802978399467123, + "grad_norm": 1.8179433345794678, + "learning_rate": 4.8301099519199173e-05, + "loss": 5.9407, + "step": 19846 + }, + { + "epoch": 0.11803573127795224, + "grad_norm": 1.652653694152832, + "learning_rate": 4.8300930264236216e-05, + "loss": 5.505, + "step": 19847 + }, + { + "epoch": 0.11804167856123322, + "grad_norm": 1.6400798559188843, + "learning_rate": 4.830076100113915e-05, + "loss": 5.7281, + "step": 19848 + }, + { + "epoch": 0.11804762584451423, + "grad_norm": 1.865049123764038, + "learning_rate": 4.830059172990802e-05, + "loss": 5.4562, + "step": 19849 + }, + { + "epoch": 0.11805357312779523, + "grad_norm": 1.68345308303833, + "learning_rate": 4.8300422450542906e-05, + "loss": 5.3027, + "step": 19850 + }, + { + "epoch": 0.11805952041107622, + "grad_norm": 2.1790804862976074, + "learning_rate": 4.8300253163043855e-05, + "loss": 4.5531, + "step": 19851 + }, + { + "epoch": 0.11806546769435722, + "grad_norm": 2.63421368598938, + "learning_rate": 4.8300083867410915e-05, + "loss": 4.0978, + "step": 19852 + }, + { + "epoch": 0.11807141497763822, + "grad_norm": 1.8692448139190674, + "learning_rate": 4.829991456364417e-05, + "loss": 5.5482, + "step": 19853 + }, + { + "epoch": 0.11807736226091921, + "grad_norm": 1.684128761291504, + "learning_rate": 4.829974525174365e-05, + "loss": 5.5612, + "step": 19854 + }, + { + "epoch": 0.11808330954420021, + "grad_norm": 1.5720278024673462, + "learning_rate": 4.829957593170944e-05, + "loss": 5.6787, + "step": 19855 + }, + { + "epoch": 0.11808925682748121, + "grad_norm": 1.834423303604126, + "learning_rate": 4.829940660354159e-05, + "loss": 4.5591, + "step": 19856 + }, + { + "epoch": 0.1180952041107622, + "grad_norm": 1.7370680570602417, + "learning_rate": 4.829923726724015e-05, + "loss": 5.1643, + "step": 19857 + }, + { + "epoch": 0.1181011513940432, + "grad_norm": 2.1546318531036377, + "learning_rate": 4.829906792280519e-05, + "loss": 4.5788, + "step": 19858 + }, + { + "epoch": 0.1181070986773242, + "grad_norm": 2.5604169368743896, + "learning_rate": 4.829889857023677e-05, + "loss": 3.1948, + "step": 19859 + }, + { + "epoch": 0.11811304596060519, + "grad_norm": 2.072169780731201, + "learning_rate": 4.829872920953494e-05, + "loss": 3.9707, + "step": 19860 + }, + { + "epoch": 0.1181189932438862, + "grad_norm": 1.7981303930282593, + "learning_rate": 4.829855984069976e-05, + "loss": 5.8413, + "step": 19861 + }, + { + "epoch": 0.1181249405271672, + "grad_norm": 1.621327519416809, + "learning_rate": 4.8298390463731305e-05, + "loss": 5.4867, + "step": 19862 + }, + { + "epoch": 0.11813088781044818, + "grad_norm": 1.5245294570922852, + "learning_rate": 4.829822107862962e-05, + "loss": 5.7148, + "step": 19863 + }, + { + "epoch": 0.11813683509372919, + "grad_norm": 2.2656896114349365, + "learning_rate": 4.8298051685394765e-05, + "loss": 5.6678, + "step": 19864 + }, + { + "epoch": 0.11814278237701017, + "grad_norm": 1.8529094457626343, + "learning_rate": 4.8297882284026805e-05, + "loss": 5.4445, + "step": 19865 + }, + { + "epoch": 0.11814872966029118, + "grad_norm": 1.5151565074920654, + "learning_rate": 4.829771287452579e-05, + "loss": 5.2794, + "step": 19866 + }, + { + "epoch": 0.11815467694357218, + "grad_norm": 1.8492248058319092, + "learning_rate": 4.829754345689178e-05, + "loss": 5.0797, + "step": 19867 + }, + { + "epoch": 0.11816062422685317, + "grad_norm": 2.7612802982330322, + "learning_rate": 4.829737403112484e-05, + "loss": 5.1486, + "step": 19868 + }, + { + "epoch": 0.11816657151013417, + "grad_norm": 1.9457459449768066, + "learning_rate": 4.8297204597225035e-05, + "loss": 5.6507, + "step": 19869 + }, + { + "epoch": 0.11817251879341517, + "grad_norm": 1.6429107189178467, + "learning_rate": 4.829703515519242e-05, + "loss": 5.8414, + "step": 19870 + }, + { + "epoch": 0.11817846607669616, + "grad_norm": 1.556187391281128, + "learning_rate": 4.829686570502704e-05, + "loss": 5.9028, + "step": 19871 + }, + { + "epoch": 0.11818441335997716, + "grad_norm": 1.451532006263733, + "learning_rate": 4.8296696246728965e-05, + "loss": 5.8497, + "step": 19872 + }, + { + "epoch": 0.11819036064325816, + "grad_norm": 1.7325583696365356, + "learning_rate": 4.8296526780298256e-05, + "loss": 5.3531, + "step": 19873 + }, + { + "epoch": 0.11819630792653915, + "grad_norm": 1.784332275390625, + "learning_rate": 4.829635730573497e-05, + "loss": 5.6025, + "step": 19874 + }, + { + "epoch": 0.11820225520982015, + "grad_norm": 1.6109933853149414, + "learning_rate": 4.829618782303917e-05, + "loss": 5.5626, + "step": 19875 + }, + { + "epoch": 0.11820820249310116, + "grad_norm": 1.6639639139175415, + "learning_rate": 4.8296018332210905e-05, + "loss": 5.5679, + "step": 19876 + }, + { + "epoch": 0.11821414977638214, + "grad_norm": 1.8205533027648926, + "learning_rate": 4.829584883325025e-05, + "loss": 5.448, + "step": 19877 + }, + { + "epoch": 0.11822009705966315, + "grad_norm": 1.6450576782226562, + "learning_rate": 4.829567932615725e-05, + "loss": 5.5966, + "step": 19878 + }, + { + "epoch": 0.11822604434294415, + "grad_norm": 1.456151008605957, + "learning_rate": 4.829550981093196e-05, + "loss": 5.5194, + "step": 19879 + }, + { + "epoch": 0.11823199162622514, + "grad_norm": 1.6064491271972656, + "learning_rate": 4.829534028757446e-05, + "loss": 5.6929, + "step": 19880 + }, + { + "epoch": 0.11823793890950614, + "grad_norm": 1.438132405281067, + "learning_rate": 4.829517075608479e-05, + "loss": 5.6738, + "step": 19881 + }, + { + "epoch": 0.11824388619278714, + "grad_norm": 2.503048896789551, + "learning_rate": 4.8295001216463024e-05, + "loss": 4.9929, + "step": 19882 + }, + { + "epoch": 0.11824983347606813, + "grad_norm": 2.3379812240600586, + "learning_rate": 4.829483166870921e-05, + "loss": 4.7947, + "step": 19883 + }, + { + "epoch": 0.11825578075934913, + "grad_norm": 2.055328130722046, + "learning_rate": 4.829466211282341e-05, + "loss": 5.3265, + "step": 19884 + }, + { + "epoch": 0.11826172804263013, + "grad_norm": 1.7393126487731934, + "learning_rate": 4.829449254880569e-05, + "loss": 5.0483, + "step": 19885 + }, + { + "epoch": 0.11826767532591112, + "grad_norm": 2.3054347038269043, + "learning_rate": 4.829432297665609e-05, + "loss": 4.9002, + "step": 19886 + }, + { + "epoch": 0.11827362260919212, + "grad_norm": 2.434323310852051, + "learning_rate": 4.82941533963747e-05, + "loss": 4.8013, + "step": 19887 + }, + { + "epoch": 0.11827956989247312, + "grad_norm": 2.0834875106811523, + "learning_rate": 4.829398380796155e-05, + "loss": 4.786, + "step": 19888 + }, + { + "epoch": 0.11828551717575411, + "grad_norm": 1.6682358980178833, + "learning_rate": 4.829381421141671e-05, + "loss": 5.6843, + "step": 19889 + }, + { + "epoch": 0.11829146445903511, + "grad_norm": 1.8787375688552856, + "learning_rate": 4.829364460674025e-05, + "loss": 5.5191, + "step": 19890 + }, + { + "epoch": 0.11829741174231612, + "grad_norm": 1.7496438026428223, + "learning_rate": 4.829347499393221e-05, + "loss": 5.6968, + "step": 19891 + }, + { + "epoch": 0.1183033590255971, + "grad_norm": 1.5585973262786865, + "learning_rate": 4.829330537299266e-05, + "loss": 5.5588, + "step": 19892 + }, + { + "epoch": 0.1183093063088781, + "grad_norm": 1.8294848203659058, + "learning_rate": 4.8293135743921664e-05, + "loss": 5.2407, + "step": 19893 + }, + { + "epoch": 0.11831525359215911, + "grad_norm": 1.4877654314041138, + "learning_rate": 4.829296610671927e-05, + "loss": 5.5383, + "step": 19894 + }, + { + "epoch": 0.1183212008754401, + "grad_norm": 1.5250638723373413, + "learning_rate": 4.829279646138554e-05, + "loss": 5.6443, + "step": 19895 + }, + { + "epoch": 0.1183271481587211, + "grad_norm": 1.5662062168121338, + "learning_rate": 4.829262680792054e-05, + "loss": 5.5409, + "step": 19896 + }, + { + "epoch": 0.11833309544200209, + "grad_norm": 1.1783791780471802, + "learning_rate": 4.829245714632432e-05, + "loss": 5.6169, + "step": 19897 + }, + { + "epoch": 0.11833904272528309, + "grad_norm": 1.4960299730300903, + "learning_rate": 4.829228747659695e-05, + "loss": 5.7195, + "step": 19898 + }, + { + "epoch": 0.11834499000856409, + "grad_norm": 1.437047004699707, + "learning_rate": 4.829211779873848e-05, + "loss": 5.7229, + "step": 19899 + }, + { + "epoch": 0.11835093729184508, + "grad_norm": 1.4095619916915894, + "learning_rate": 4.829194811274897e-05, + "loss": 5.7227, + "step": 19900 + }, + { + "epoch": 0.11835688457512608, + "grad_norm": 1.5694538354873657, + "learning_rate": 4.829177841862849e-05, + "loss": 5.356, + "step": 19901 + }, + { + "epoch": 0.11836283185840708, + "grad_norm": 1.7124476432800293, + "learning_rate": 4.829160871637708e-05, + "loss": 4.9185, + "step": 19902 + }, + { + "epoch": 0.11836877914168807, + "grad_norm": 2.2423064708709717, + "learning_rate": 4.829143900599481e-05, + "loss": 5.4345, + "step": 19903 + }, + { + "epoch": 0.11837472642496907, + "grad_norm": 1.8333791494369507, + "learning_rate": 4.829126928748175e-05, + "loss": 5.3666, + "step": 19904 + }, + { + "epoch": 0.11838067370825008, + "grad_norm": 1.5184969902038574, + "learning_rate": 4.8291099560837936e-05, + "loss": 5.4372, + "step": 19905 + }, + { + "epoch": 0.11838662099153106, + "grad_norm": 1.628544807434082, + "learning_rate": 4.829092982606345e-05, + "loss": 5.2682, + "step": 19906 + }, + { + "epoch": 0.11839256827481207, + "grad_norm": 1.5791584253311157, + "learning_rate": 4.829076008315834e-05, + "loss": 5.2149, + "step": 19907 + }, + { + "epoch": 0.11839851555809307, + "grad_norm": 1.299560546875, + "learning_rate": 4.8290590332122656e-05, + "loss": 5.1735, + "step": 19908 + }, + { + "epoch": 0.11840446284137406, + "grad_norm": 1.343913197517395, + "learning_rate": 4.829042057295647e-05, + "loss": 5.2344, + "step": 19909 + }, + { + "epoch": 0.11841041012465506, + "grad_norm": 1.2621396780014038, + "learning_rate": 4.829025080565985e-05, + "loss": 5.2982, + "step": 19910 + }, + { + "epoch": 0.11841635740793606, + "grad_norm": 1.2189174890518188, + "learning_rate": 4.829008103023284e-05, + "loss": 5.3347, + "step": 19911 + }, + { + "epoch": 0.11842230469121705, + "grad_norm": 1.2917883396148682, + "learning_rate": 4.82899112466755e-05, + "loss": 5.0745, + "step": 19912 + }, + { + "epoch": 0.11842825197449805, + "grad_norm": 1.2382320165634155, + "learning_rate": 4.828974145498789e-05, + "loss": 5.1999, + "step": 19913 + }, + { + "epoch": 0.11843419925777905, + "grad_norm": 1.398218035697937, + "learning_rate": 4.828957165517007e-05, + "loss": 5.4944, + "step": 19914 + }, + { + "epoch": 0.11844014654106004, + "grad_norm": 1.448901653289795, + "learning_rate": 4.8289401847222115e-05, + "loss": 5.4645, + "step": 19915 + }, + { + "epoch": 0.11844609382434104, + "grad_norm": 1.4628182649612427, + "learning_rate": 4.828923203114406e-05, + "loss": 5.003, + "step": 19916 + }, + { + "epoch": 0.11845204110762204, + "grad_norm": 1.3390740156173706, + "learning_rate": 4.828906220693598e-05, + "loss": 5.3482, + "step": 19917 + }, + { + "epoch": 0.11845798839090303, + "grad_norm": 1.539097547531128, + "learning_rate": 4.8288892374597925e-05, + "loss": 5.304, + "step": 19918 + }, + { + "epoch": 0.11846393567418403, + "grad_norm": 1.4011404514312744, + "learning_rate": 4.828872253412996e-05, + "loss": 5.2073, + "step": 19919 + }, + { + "epoch": 0.11846988295746504, + "grad_norm": 1.4064414501190186, + "learning_rate": 4.828855268553214e-05, + "loss": 5.2316, + "step": 19920 + }, + { + "epoch": 0.11847583024074602, + "grad_norm": 1.5808193683624268, + "learning_rate": 4.828838282880452e-05, + "loss": 5.211, + "step": 19921 + }, + { + "epoch": 0.11848177752402703, + "grad_norm": 1.5043809413909912, + "learning_rate": 4.828821296394718e-05, + "loss": 5.0564, + "step": 19922 + }, + { + "epoch": 0.11848772480730803, + "grad_norm": 1.2494529485702515, + "learning_rate": 4.828804309096016e-05, + "loss": 5.1523, + "step": 19923 + }, + { + "epoch": 0.11849367209058902, + "grad_norm": 1.4186055660247803, + "learning_rate": 4.8287873209843524e-05, + "loss": 4.9103, + "step": 19924 + }, + { + "epoch": 0.11849961937387002, + "grad_norm": 1.6093229055404663, + "learning_rate": 4.828770332059733e-05, + "loss": 4.9215, + "step": 19925 + }, + { + "epoch": 0.118505566657151, + "grad_norm": 1.5125865936279297, + "learning_rate": 4.8287533423221643e-05, + "loss": 5.0515, + "step": 19926 + }, + { + "epoch": 0.11851151394043201, + "grad_norm": 1.5410135984420776, + "learning_rate": 4.828736351771652e-05, + "loss": 4.9576, + "step": 19927 + }, + { + "epoch": 0.11851746122371301, + "grad_norm": 1.5431303977966309, + "learning_rate": 4.828719360408201e-05, + "loss": 5.1606, + "step": 19928 + }, + { + "epoch": 0.118523408506994, + "grad_norm": 1.4709242582321167, + "learning_rate": 4.828702368231819e-05, + "loss": 4.7685, + "step": 19929 + }, + { + "epoch": 0.118529355790275, + "grad_norm": 1.173568606376648, + "learning_rate": 4.828685375242511e-05, + "loss": 4.7591, + "step": 19930 + }, + { + "epoch": 0.118535303073556, + "grad_norm": 1.3113515377044678, + "learning_rate": 4.828668381440283e-05, + "loss": 4.786, + "step": 19931 + }, + { + "epoch": 0.11854125035683699, + "grad_norm": 1.4658124446868896, + "learning_rate": 4.828651386825141e-05, + "loss": 4.7776, + "step": 19932 + }, + { + "epoch": 0.118547197640118, + "grad_norm": 1.3406554460525513, + "learning_rate": 4.828634391397091e-05, + "loss": 5.0733, + "step": 19933 + }, + { + "epoch": 0.118553144923399, + "grad_norm": 1.2102482318878174, + "learning_rate": 4.828617395156138e-05, + "loss": 5.0069, + "step": 19934 + }, + { + "epoch": 0.11855909220667998, + "grad_norm": 0.989989697933197, + "learning_rate": 4.828600398102289e-05, + "loss": 4.759, + "step": 19935 + }, + { + "epoch": 0.11856503948996099, + "grad_norm": 1.2296501398086548, + "learning_rate": 4.82858340023555e-05, + "loss": 4.6269, + "step": 19936 + }, + { + "epoch": 0.11857098677324199, + "grad_norm": 1.5649582147598267, + "learning_rate": 4.828566401555926e-05, + "loss": 5.0196, + "step": 19937 + }, + { + "epoch": 0.11857693405652298, + "grad_norm": 1.2393609285354614, + "learning_rate": 4.8285494020634245e-05, + "loss": 5.059, + "step": 19938 + }, + { + "epoch": 0.11858288133980398, + "grad_norm": 1.450697422027588, + "learning_rate": 4.82853240175805e-05, + "loss": 5.1143, + "step": 19939 + }, + { + "epoch": 0.11858882862308498, + "grad_norm": 1.4795258045196533, + "learning_rate": 4.8285154006398084e-05, + "loss": 5.075, + "step": 19940 + }, + { + "epoch": 0.11859477590636597, + "grad_norm": 1.5858484506607056, + "learning_rate": 4.828498398708707e-05, + "loss": 5.0665, + "step": 19941 + }, + { + "epoch": 0.11860072318964697, + "grad_norm": 1.3411937952041626, + "learning_rate": 4.82848139596475e-05, + "loss": 4.9864, + "step": 19942 + }, + { + "epoch": 0.11860667047292797, + "grad_norm": 1.4348468780517578, + "learning_rate": 4.828464392407945e-05, + "loss": 4.904, + "step": 19943 + }, + { + "epoch": 0.11861261775620896, + "grad_norm": 1.4753068685531616, + "learning_rate": 4.8284473880382967e-05, + "loss": 5.0784, + "step": 19944 + }, + { + "epoch": 0.11861856503948996, + "grad_norm": 1.379059076309204, + "learning_rate": 4.828430382855811e-05, + "loss": 4.9782, + "step": 19945 + }, + { + "epoch": 0.11862451232277096, + "grad_norm": 1.444729208946228, + "learning_rate": 4.828413376860495e-05, + "loss": 5.5804, + "step": 19946 + }, + { + "epoch": 0.11863045960605195, + "grad_norm": 1.3467416763305664, + "learning_rate": 4.8283963700523535e-05, + "loss": 5.3278, + "step": 19947 + }, + { + "epoch": 0.11863640688933295, + "grad_norm": 1.5206544399261475, + "learning_rate": 4.8283793624313936e-05, + "loss": 5.01, + "step": 19948 + }, + { + "epoch": 0.11864235417261396, + "grad_norm": 1.394729733467102, + "learning_rate": 4.8283623539976195e-05, + "loss": 5.2139, + "step": 19949 + }, + { + "epoch": 0.11864830145589494, + "grad_norm": 1.3675029277801514, + "learning_rate": 4.8283453447510394e-05, + "loss": 5.4559, + "step": 19950 + }, + { + "epoch": 0.11865424873917595, + "grad_norm": 1.1950232982635498, + "learning_rate": 4.828328334691657e-05, + "loss": 5.2233, + "step": 19951 + }, + { + "epoch": 0.11866019602245695, + "grad_norm": 1.3517179489135742, + "learning_rate": 4.82831132381948e-05, + "loss": 5.0519, + "step": 19952 + }, + { + "epoch": 0.11866614330573794, + "grad_norm": 1.4184643030166626, + "learning_rate": 4.828294312134512e-05, + "loss": 4.8722, + "step": 19953 + }, + { + "epoch": 0.11867209058901894, + "grad_norm": 1.4558582305908203, + "learning_rate": 4.828277299636762e-05, + "loss": 5.3876, + "step": 19954 + }, + { + "epoch": 0.11867803787229993, + "grad_norm": 1.4617977142333984, + "learning_rate": 4.8282602863262345e-05, + "loss": 5.4784, + "step": 19955 + }, + { + "epoch": 0.11868398515558093, + "grad_norm": 1.4997669458389282, + "learning_rate": 4.828243272202935e-05, + "loss": 5.2556, + "step": 19956 + }, + { + "epoch": 0.11868993243886193, + "grad_norm": 1.2730913162231445, + "learning_rate": 4.8282262572668696e-05, + "loss": 5.3194, + "step": 19957 + }, + { + "epoch": 0.11869587972214292, + "grad_norm": 1.4149047136306763, + "learning_rate": 4.8282092415180444e-05, + "loss": 5.5139, + "step": 19958 + }, + { + "epoch": 0.11870182700542392, + "grad_norm": 1.2510145902633667, + "learning_rate": 4.828192224956466e-05, + "loss": 5.2486, + "step": 19959 + }, + { + "epoch": 0.11870777428870492, + "grad_norm": 1.2229409217834473, + "learning_rate": 4.828175207582139e-05, + "loss": 5.2391, + "step": 19960 + }, + { + "epoch": 0.11871372157198591, + "grad_norm": 1.3316899538040161, + "learning_rate": 4.828158189395071e-05, + "loss": 5.2928, + "step": 19961 + }, + { + "epoch": 0.11871966885526691, + "grad_norm": 1.4331640005111694, + "learning_rate": 4.828141170395266e-05, + "loss": 5.3311, + "step": 19962 + }, + { + "epoch": 0.11872561613854792, + "grad_norm": 1.3313428163528442, + "learning_rate": 4.828124150582732e-05, + "loss": 5.2203, + "step": 19963 + }, + { + "epoch": 0.1187315634218289, + "grad_norm": 1.6505075693130493, + "learning_rate": 4.828107129957473e-05, + "loss": 4.8604, + "step": 19964 + }, + { + "epoch": 0.1187375107051099, + "grad_norm": 1.3544394969940186, + "learning_rate": 4.828090108519496e-05, + "loss": 5.17, + "step": 19965 + }, + { + "epoch": 0.11874345798839091, + "grad_norm": 1.3194384574890137, + "learning_rate": 4.828073086268808e-05, + "loss": 5.2197, + "step": 19966 + }, + { + "epoch": 0.1187494052716719, + "grad_norm": 1.4014582633972168, + "learning_rate": 4.8280560632054126e-05, + "loss": 5.2865, + "step": 19967 + }, + { + "epoch": 0.1187553525549529, + "grad_norm": 1.5148218870162964, + "learning_rate": 4.828039039329317e-05, + "loss": 5.3765, + "step": 19968 + }, + { + "epoch": 0.1187612998382339, + "grad_norm": 1.3657969236373901, + "learning_rate": 4.828022014640527e-05, + "loss": 4.9787, + "step": 19969 + }, + { + "epoch": 0.11876724712151489, + "grad_norm": 1.547717571258545, + "learning_rate": 4.828004989139049e-05, + "loss": 5.0538, + "step": 19970 + }, + { + "epoch": 0.11877319440479589, + "grad_norm": 1.5132863521575928, + "learning_rate": 4.827987962824888e-05, + "loss": 5.0301, + "step": 19971 + }, + { + "epoch": 0.11877914168807689, + "grad_norm": 1.4020887613296509, + "learning_rate": 4.827970935698051e-05, + "loss": 4.9646, + "step": 19972 + }, + { + "epoch": 0.11878508897135788, + "grad_norm": 1.4983519315719604, + "learning_rate": 4.8279539077585424e-05, + "loss": 5.2266, + "step": 19973 + }, + { + "epoch": 0.11879103625463888, + "grad_norm": 1.3545745611190796, + "learning_rate": 4.82793687900637e-05, + "loss": 5.108, + "step": 19974 + }, + { + "epoch": 0.11879698353791988, + "grad_norm": 1.4865717887878418, + "learning_rate": 4.827919849441539e-05, + "loss": 5.257, + "step": 19975 + }, + { + "epoch": 0.11880293082120087, + "grad_norm": 1.4389182329177856, + "learning_rate": 4.8279028190640546e-05, + "loss": 4.976, + "step": 19976 + }, + { + "epoch": 0.11880887810448187, + "grad_norm": 1.2823866605758667, + "learning_rate": 4.827885787873924e-05, + "loss": 4.7617, + "step": 19977 + }, + { + "epoch": 0.11881482538776288, + "grad_norm": 1.369992971420288, + "learning_rate": 4.8278687558711525e-05, + "loss": 4.7165, + "step": 19978 + }, + { + "epoch": 0.11882077267104386, + "grad_norm": 1.2873594760894775, + "learning_rate": 4.827851723055745e-05, + "loss": 4.6705, + "step": 19979 + }, + { + "epoch": 0.11882671995432487, + "grad_norm": 1.3779295682907104, + "learning_rate": 4.827834689427709e-05, + "loss": 4.9752, + "step": 19980 + }, + { + "epoch": 0.11883266723760587, + "grad_norm": 1.5264688730239868, + "learning_rate": 4.82781765498705e-05, + "loss": 5.0295, + "step": 19981 + }, + { + "epoch": 0.11883861452088686, + "grad_norm": 1.6745606660842896, + "learning_rate": 4.827800619733774e-05, + "loss": 5.4265, + "step": 19982 + }, + { + "epoch": 0.11884456180416786, + "grad_norm": 1.5993295907974243, + "learning_rate": 4.8277835836678874e-05, + "loss": 5.0611, + "step": 19983 + }, + { + "epoch": 0.11885050908744885, + "grad_norm": 1.6451520919799805, + "learning_rate": 4.827766546789395e-05, + "loss": 4.9504, + "step": 19984 + }, + { + "epoch": 0.11885645637072985, + "grad_norm": 1.4769519567489624, + "learning_rate": 4.827749509098304e-05, + "loss": 5.1324, + "step": 19985 + }, + { + "epoch": 0.11886240365401085, + "grad_norm": 1.6930506229400635, + "learning_rate": 4.827732470594619e-05, + "loss": 5.134, + "step": 19986 + }, + { + "epoch": 0.11886835093729184, + "grad_norm": 1.1951912641525269, + "learning_rate": 4.827715431278347e-05, + "loss": 5.2521, + "step": 19987 + }, + { + "epoch": 0.11887429822057284, + "grad_norm": 1.3520997762680054, + "learning_rate": 4.827698391149493e-05, + "loss": 5.1791, + "step": 19988 + }, + { + "epoch": 0.11888024550385384, + "grad_norm": 1.3710130453109741, + "learning_rate": 4.8276813502080644e-05, + "loss": 5.1179, + "step": 19989 + }, + { + "epoch": 0.11888619278713483, + "grad_norm": 1.4977210760116577, + "learning_rate": 4.827664308454066e-05, + "loss": 5.1492, + "step": 19990 + }, + { + "epoch": 0.11889214007041583, + "grad_norm": 1.2681607007980347, + "learning_rate": 4.8276472658875035e-05, + "loss": 5.1178, + "step": 19991 + }, + { + "epoch": 0.11889808735369684, + "grad_norm": 1.2606865167617798, + "learning_rate": 4.827630222508385e-05, + "loss": 5.2796, + "step": 19992 + }, + { + "epoch": 0.11890403463697782, + "grad_norm": 1.477273941040039, + "learning_rate": 4.827613178316713e-05, + "loss": 5.251, + "step": 19993 + }, + { + "epoch": 0.11890998192025883, + "grad_norm": 1.4194386005401611, + "learning_rate": 4.8275961333124956e-05, + "loss": 5.157, + "step": 19994 + }, + { + "epoch": 0.11891592920353983, + "grad_norm": 1.2693103551864624, + "learning_rate": 4.8275790874957396e-05, + "loss": 5.2037, + "step": 19995 + }, + { + "epoch": 0.11892187648682082, + "grad_norm": 1.2035702466964722, + "learning_rate": 4.8275620408664487e-05, + "loss": 5.1613, + "step": 19996 + }, + { + "epoch": 0.11892782377010182, + "grad_norm": 1.1674199104309082, + "learning_rate": 4.8275449934246295e-05, + "loss": 5.2415, + "step": 19997 + }, + { + "epoch": 0.11893377105338282, + "grad_norm": 1.5064369440078735, + "learning_rate": 4.8275279451702895e-05, + "loss": 5.2025, + "step": 19998 + }, + { + "epoch": 0.11893971833666381, + "grad_norm": 1.3770934343338013, + "learning_rate": 4.827510896103433e-05, + "loss": 5.0804, + "step": 19999 + }, + { + "epoch": 0.11894566561994481, + "grad_norm": 1.4852590560913086, + "learning_rate": 4.827493846224067e-05, + "loss": 5.0169, + "step": 20000 + }, + { + "epoch": 0.11895161290322581, + "grad_norm": 1.3760627508163452, + "learning_rate": 4.8274767955321966e-05, + "loss": 5.245, + "step": 20001 + }, + { + "epoch": 0.1189575601865068, + "grad_norm": 1.4135125875473022, + "learning_rate": 4.827459744027828e-05, + "loss": 5.1599, + "step": 20002 + }, + { + "epoch": 0.1189635074697878, + "grad_norm": 1.352949857711792, + "learning_rate": 4.8274426917109675e-05, + "loss": 5.187, + "step": 20003 + }, + { + "epoch": 0.1189694547530688, + "grad_norm": 1.279439091682434, + "learning_rate": 4.82742563858162e-05, + "loss": 5.1369, + "step": 20004 + }, + { + "epoch": 0.11897540203634979, + "grad_norm": 1.6078580617904663, + "learning_rate": 4.8274085846397935e-05, + "loss": 5.097, + "step": 20005 + }, + { + "epoch": 0.1189813493196308, + "grad_norm": 1.4414268732070923, + "learning_rate": 4.827391529885492e-05, + "loss": 5.1412, + "step": 20006 + }, + { + "epoch": 0.1189872966029118, + "grad_norm": 1.249731421470642, + "learning_rate": 4.827374474318722e-05, + "loss": 5.002, + "step": 20007 + }, + { + "epoch": 0.11899324388619278, + "grad_norm": 1.5977002382278442, + "learning_rate": 4.82735741793949e-05, + "loss": 5.0387, + "step": 20008 + }, + { + "epoch": 0.11899919116947379, + "grad_norm": 1.5115478038787842, + "learning_rate": 4.8273403607478016e-05, + "loss": 4.9497, + "step": 20009 + }, + { + "epoch": 0.11900513845275479, + "grad_norm": 1.433825135231018, + "learning_rate": 4.8273233027436625e-05, + "loss": 4.9818, + "step": 20010 + }, + { + "epoch": 0.11901108573603578, + "grad_norm": 1.51628839969635, + "learning_rate": 4.827306243927079e-05, + "loss": 4.8819, + "step": 20011 + }, + { + "epoch": 0.11901703301931678, + "grad_norm": 1.3780534267425537, + "learning_rate": 4.8272891842980564e-05, + "loss": 5.18, + "step": 20012 + }, + { + "epoch": 0.11902298030259777, + "grad_norm": 1.2616275548934937, + "learning_rate": 4.8272721238566023e-05, + "loss": 5.549, + "step": 20013 + }, + { + "epoch": 0.11902892758587877, + "grad_norm": 1.2978616952896118, + "learning_rate": 4.8272550626027204e-05, + "loss": 5.4608, + "step": 20014 + }, + { + "epoch": 0.11903487486915977, + "grad_norm": 1.2539299726486206, + "learning_rate": 4.827238000536418e-05, + "loss": 5.5612, + "step": 20015 + }, + { + "epoch": 0.11904082215244076, + "grad_norm": 1.4023045301437378, + "learning_rate": 4.827220937657702e-05, + "loss": 5.2669, + "step": 20016 + }, + { + "epoch": 0.11904676943572176, + "grad_norm": 1.4386683702468872, + "learning_rate": 4.827203873966576e-05, + "loss": 5.0703, + "step": 20017 + }, + { + "epoch": 0.11905271671900276, + "grad_norm": 1.5248057842254639, + "learning_rate": 4.827186809463048e-05, + "loss": 5.0376, + "step": 20018 + }, + { + "epoch": 0.11905866400228375, + "grad_norm": 1.4410630464553833, + "learning_rate": 4.827169744147122e-05, + "loss": 5.1396, + "step": 20019 + }, + { + "epoch": 0.11906461128556475, + "grad_norm": 1.7917122840881348, + "learning_rate": 4.827152678018806e-05, + "loss": 5.1673, + "step": 20020 + }, + { + "epoch": 0.11907055856884576, + "grad_norm": 1.739169716835022, + "learning_rate": 4.827135611078105e-05, + "loss": 5.6848, + "step": 20021 + }, + { + "epoch": 0.11907650585212674, + "grad_norm": 1.6629457473754883, + "learning_rate": 4.827118543325024e-05, + "loss": 5.7335, + "step": 20022 + }, + { + "epoch": 0.11908245313540775, + "grad_norm": 1.634628176689148, + "learning_rate": 4.827101474759571e-05, + "loss": 5.7718, + "step": 20023 + }, + { + "epoch": 0.11908840041868875, + "grad_norm": 1.299861192703247, + "learning_rate": 4.827084405381751e-05, + "loss": 5.6917, + "step": 20024 + }, + { + "epoch": 0.11909434770196974, + "grad_norm": 1.3863619565963745, + "learning_rate": 4.82706733519157e-05, + "loss": 5.7363, + "step": 20025 + }, + { + "epoch": 0.11910029498525074, + "grad_norm": 2.3500845432281494, + "learning_rate": 4.827050264189033e-05, + "loss": 5.192, + "step": 20026 + }, + { + "epoch": 0.11910624226853174, + "grad_norm": 1.426633358001709, + "learning_rate": 4.827033192374147e-05, + "loss": 5.5643, + "step": 20027 + }, + { + "epoch": 0.11911218955181273, + "grad_norm": 1.4728987216949463, + "learning_rate": 4.8270161197469175e-05, + "loss": 5.6323, + "step": 20028 + }, + { + "epoch": 0.11911813683509373, + "grad_norm": 1.66750168800354, + "learning_rate": 4.826999046307352e-05, + "loss": 5.4327, + "step": 20029 + }, + { + "epoch": 0.11912408411837473, + "grad_norm": 1.4894248247146606, + "learning_rate": 4.8269819720554545e-05, + "loss": 5.4332, + "step": 20030 + }, + { + "epoch": 0.11913003140165572, + "grad_norm": 1.5166181325912476, + "learning_rate": 4.826964896991231e-05, + "loss": 5.5467, + "step": 20031 + }, + { + "epoch": 0.11913597868493672, + "grad_norm": 1.2947237491607666, + "learning_rate": 4.826947821114689e-05, + "loss": 5.5116, + "step": 20032 + }, + { + "epoch": 0.11914192596821772, + "grad_norm": 1.3890970945358276, + "learning_rate": 4.8269307444258326e-05, + "loss": 5.5459, + "step": 20033 + }, + { + "epoch": 0.11914787325149871, + "grad_norm": 1.496099591255188, + "learning_rate": 4.8269136669246695e-05, + "loss": 5.5533, + "step": 20034 + }, + { + "epoch": 0.11915382053477971, + "grad_norm": 1.4115175008773804, + "learning_rate": 4.8268965886112045e-05, + "loss": 5.4898, + "step": 20035 + }, + { + "epoch": 0.11915976781806072, + "grad_norm": 1.3803601264953613, + "learning_rate": 4.826879509485444e-05, + "loss": 5.598, + "step": 20036 + }, + { + "epoch": 0.1191657151013417, + "grad_norm": 1.7235617637634277, + "learning_rate": 4.826862429547394e-05, + "loss": 5.5489, + "step": 20037 + }, + { + "epoch": 0.1191716623846227, + "grad_norm": 1.726289987564087, + "learning_rate": 4.82684534879706e-05, + "loss": 5.5461, + "step": 20038 + }, + { + "epoch": 0.11917760966790371, + "grad_norm": 1.593349814414978, + "learning_rate": 4.826828267234449e-05, + "loss": 5.3594, + "step": 20039 + }, + { + "epoch": 0.1191835569511847, + "grad_norm": 2.3147101402282715, + "learning_rate": 4.826811184859566e-05, + "loss": 4.6888, + "step": 20040 + }, + { + "epoch": 0.1191895042344657, + "grad_norm": 2.1485888957977295, + "learning_rate": 4.826794101672417e-05, + "loss": 4.6874, + "step": 20041 + }, + { + "epoch": 0.11919545151774669, + "grad_norm": 2.5710601806640625, + "learning_rate": 4.826777017673009e-05, + "loss": 4.6524, + "step": 20042 + }, + { + "epoch": 0.11920139880102769, + "grad_norm": 2.314556121826172, + "learning_rate": 4.826759932861346e-05, + "loss": 4.3273, + "step": 20043 + }, + { + "epoch": 0.11920734608430869, + "grad_norm": 2.060617208480835, + "learning_rate": 4.826742847237436e-05, + "loss": 4.6601, + "step": 20044 + }, + { + "epoch": 0.11921329336758968, + "grad_norm": 1.9709726572036743, + "learning_rate": 4.826725760801284e-05, + "loss": 6.1007, + "step": 20045 + }, + { + "epoch": 0.11921924065087068, + "grad_norm": 2.0907840728759766, + "learning_rate": 4.826708673552895e-05, + "loss": 6.0386, + "step": 20046 + }, + { + "epoch": 0.11922518793415168, + "grad_norm": 2.02783203125, + "learning_rate": 4.826691585492278e-05, + "loss": 5.4651, + "step": 20047 + }, + { + "epoch": 0.11923113521743267, + "grad_norm": 1.8326990604400635, + "learning_rate": 4.826674496619435e-05, + "loss": 5.7342, + "step": 20048 + }, + { + "epoch": 0.11923708250071367, + "grad_norm": 1.8395801782608032, + "learning_rate": 4.8266574069343753e-05, + "loss": 5.657, + "step": 20049 + }, + { + "epoch": 0.11924302978399468, + "grad_norm": 1.5144078731536865, + "learning_rate": 4.826640316437103e-05, + "loss": 5.6856, + "step": 20050 + }, + { + "epoch": 0.11924897706727566, + "grad_norm": 1.6133313179016113, + "learning_rate": 4.826623225127626e-05, + "loss": 5.114, + "step": 20051 + }, + { + "epoch": 0.11925492435055667, + "grad_norm": 2.0678884983062744, + "learning_rate": 4.826606133005947e-05, + "loss": 5.6642, + "step": 20052 + }, + { + "epoch": 0.11926087163383767, + "grad_norm": 1.7214683294296265, + "learning_rate": 4.8265890400720744e-05, + "loss": 5.8689, + "step": 20053 + }, + { + "epoch": 0.11926681891711866, + "grad_norm": 1.7670868635177612, + "learning_rate": 4.826571946326014e-05, + "loss": 5.6504, + "step": 20054 + }, + { + "epoch": 0.11927276620039966, + "grad_norm": 1.6336724758148193, + "learning_rate": 4.82655485176777e-05, + "loss": 5.7624, + "step": 20055 + }, + { + "epoch": 0.11927871348368066, + "grad_norm": 1.6147593259811401, + "learning_rate": 4.8265377563973514e-05, + "loss": 5.8398, + "step": 20056 + }, + { + "epoch": 0.11928466076696165, + "grad_norm": 1.6203758716583252, + "learning_rate": 4.8265206602147614e-05, + "loss": 5.3793, + "step": 20057 + }, + { + "epoch": 0.11929060805024265, + "grad_norm": 1.8295884132385254, + "learning_rate": 4.8265035632200084e-05, + "loss": 5.0185, + "step": 20058 + }, + { + "epoch": 0.11929655533352365, + "grad_norm": 1.6802337169647217, + "learning_rate": 4.826486465413096e-05, + "loss": 5.8104, + "step": 20059 + }, + { + "epoch": 0.11930250261680464, + "grad_norm": 1.9276031255722046, + "learning_rate": 4.826469366794031e-05, + "loss": 5.2106, + "step": 20060 + }, + { + "epoch": 0.11930844990008564, + "grad_norm": 1.9589072465896606, + "learning_rate": 4.8264522673628205e-05, + "loss": 5.2336, + "step": 20061 + }, + { + "epoch": 0.11931439718336664, + "grad_norm": 3.45713472366333, + "learning_rate": 4.826435167119469e-05, + "loss": 5.7015, + "step": 20062 + }, + { + "epoch": 0.11932034446664763, + "grad_norm": 3.057732343673706, + "learning_rate": 4.826418066063983e-05, + "loss": 4.2376, + "step": 20063 + }, + { + "epoch": 0.11932629174992863, + "grad_norm": 2.9540810585021973, + "learning_rate": 4.8264009641963684e-05, + "loss": 4.1357, + "step": 20064 + }, + { + "epoch": 0.11933223903320964, + "grad_norm": 2.707113027572632, + "learning_rate": 4.826383861516632e-05, + "loss": 3.7255, + "step": 20065 + }, + { + "epoch": 0.11933818631649062, + "grad_norm": 2.488718032836914, + "learning_rate": 4.8263667580247784e-05, + "loss": 3.7309, + "step": 20066 + }, + { + "epoch": 0.11934413359977163, + "grad_norm": 2.6351873874664307, + "learning_rate": 4.826349653720814e-05, + "loss": 3.5953, + "step": 20067 + }, + { + "epoch": 0.11935008088305263, + "grad_norm": 2.866333246231079, + "learning_rate": 4.826332548604745e-05, + "loss": 3.8627, + "step": 20068 + }, + { + "epoch": 0.11935602816633362, + "grad_norm": 1.5446399450302124, + "learning_rate": 4.8263154426765777e-05, + "loss": 5.3014, + "step": 20069 + }, + { + "epoch": 0.11936197544961462, + "grad_norm": 1.7273021936416626, + "learning_rate": 4.8262983359363176e-05, + "loss": 5.6102, + "step": 20070 + }, + { + "epoch": 0.1193679227328956, + "grad_norm": 1.4169118404388428, + "learning_rate": 4.826281228383971e-05, + "loss": 5.6831, + "step": 20071 + }, + { + "epoch": 0.11937387001617661, + "grad_norm": 1.7140129804611206, + "learning_rate": 4.826264120019544e-05, + "loss": 5.6609, + "step": 20072 + }, + { + "epoch": 0.11937981729945761, + "grad_norm": 1.4560796022415161, + "learning_rate": 4.8262470108430414e-05, + "loss": 5.6279, + "step": 20073 + }, + { + "epoch": 0.1193857645827386, + "grad_norm": 1.6894809007644653, + "learning_rate": 4.8262299008544697e-05, + "loss": 5.192, + "step": 20074 + }, + { + "epoch": 0.1193917118660196, + "grad_norm": 2.995307683944702, + "learning_rate": 4.826212790053836e-05, + "loss": 4.9009, + "step": 20075 + }, + { + "epoch": 0.1193976591493006, + "grad_norm": 2.9559946060180664, + "learning_rate": 4.826195678441145e-05, + "loss": 4.8801, + "step": 20076 + }, + { + "epoch": 0.11940360643258159, + "grad_norm": 2.550973653793335, + "learning_rate": 4.826178566016403e-05, + "loss": 4.7061, + "step": 20077 + }, + { + "epoch": 0.11940955371586259, + "grad_norm": 2.0249550342559814, + "learning_rate": 4.826161452779617e-05, + "loss": 5.0315, + "step": 20078 + }, + { + "epoch": 0.1194155009991436, + "grad_norm": 1.6208853721618652, + "learning_rate": 4.826144338730791e-05, + "loss": 5.3685, + "step": 20079 + }, + { + "epoch": 0.11942144828242458, + "grad_norm": 1.6138144731521606, + "learning_rate": 4.826127223869933e-05, + "loss": 5.3098, + "step": 20080 + }, + { + "epoch": 0.11942739556570559, + "grad_norm": 1.6347969770431519, + "learning_rate": 4.8261101081970476e-05, + "loss": 5.7519, + "step": 20081 + }, + { + "epoch": 0.11943334284898659, + "grad_norm": 1.6273889541625977, + "learning_rate": 4.8260929917121403e-05, + "loss": 5.5083, + "step": 20082 + }, + { + "epoch": 0.11943929013226758, + "grad_norm": 1.7236882448196411, + "learning_rate": 4.826075874415219e-05, + "loss": 5.3613, + "step": 20083 + }, + { + "epoch": 0.11944523741554858, + "grad_norm": 1.5177632570266724, + "learning_rate": 4.826058756306289e-05, + "loss": 5.4234, + "step": 20084 + }, + { + "epoch": 0.11945118469882958, + "grad_norm": 1.9017301797866821, + "learning_rate": 4.826041637385354e-05, + "loss": 4.6868, + "step": 20085 + }, + { + "epoch": 0.11945713198211057, + "grad_norm": 1.8880805969238281, + "learning_rate": 4.826024517652425e-05, + "loss": 4.4478, + "step": 20086 + }, + { + "epoch": 0.11946307926539157, + "grad_norm": 1.5617226362228394, + "learning_rate": 4.826007397107503e-05, + "loss": 5.3775, + "step": 20087 + }, + { + "epoch": 0.11946902654867257, + "grad_norm": 1.836101770401001, + "learning_rate": 4.825990275750595e-05, + "loss": 5.33, + "step": 20088 + }, + { + "epoch": 0.11947497383195356, + "grad_norm": 1.6876533031463623, + "learning_rate": 4.825973153581709e-05, + "loss": 5.3164, + "step": 20089 + }, + { + "epoch": 0.11948092111523456, + "grad_norm": 1.7182306051254272, + "learning_rate": 4.82595603060085e-05, + "loss": 5.3545, + "step": 20090 + }, + { + "epoch": 0.11948686839851556, + "grad_norm": 2.160414934158325, + "learning_rate": 4.825938906808023e-05, + "loss": 4.3744, + "step": 20091 + }, + { + "epoch": 0.11949281568179655, + "grad_norm": 1.4865752458572388, + "learning_rate": 4.825921782203236e-05, + "loss": 5.455, + "step": 20092 + }, + { + "epoch": 0.11949876296507755, + "grad_norm": 1.550986409187317, + "learning_rate": 4.825904656786492e-05, + "loss": 5.4879, + "step": 20093 + }, + { + "epoch": 0.11950471024835856, + "grad_norm": 1.473037838935852, + "learning_rate": 4.8258875305577996e-05, + "loss": 5.3964, + "step": 20094 + }, + { + "epoch": 0.11951065753163954, + "grad_norm": 1.6714228391647339, + "learning_rate": 4.825870403517164e-05, + "loss": 5.0215, + "step": 20095 + }, + { + "epoch": 0.11951660481492055, + "grad_norm": 1.7555420398712158, + "learning_rate": 4.8258532756645905e-05, + "loss": 4.9852, + "step": 20096 + }, + { + "epoch": 0.11952255209820155, + "grad_norm": 1.562729835510254, + "learning_rate": 4.825836147000086e-05, + "loss": 4.5928, + "step": 20097 + }, + { + "epoch": 0.11952849938148254, + "grad_norm": 1.7901209592819214, + "learning_rate": 4.825819017523656e-05, + "loss": 5.3176, + "step": 20098 + }, + { + "epoch": 0.11953444666476354, + "grad_norm": 1.605578064918518, + "learning_rate": 4.825801887235307e-05, + "loss": 5.3162, + "step": 20099 + }, + { + "epoch": 0.11954039394804453, + "grad_norm": 1.9077202081680298, + "learning_rate": 4.8257847561350445e-05, + "loss": 5.3378, + "step": 20100 + }, + { + "epoch": 0.11954634123132553, + "grad_norm": 1.9171262979507446, + "learning_rate": 4.825767624222875e-05, + "loss": 5.2585, + "step": 20101 + }, + { + "epoch": 0.11955228851460653, + "grad_norm": 1.5661342144012451, + "learning_rate": 4.825750491498803e-05, + "loss": 5.3421, + "step": 20102 + }, + { + "epoch": 0.11955823579788752, + "grad_norm": 2.188962697982788, + "learning_rate": 4.825733357962836e-05, + "loss": 4.8925, + "step": 20103 + }, + { + "epoch": 0.11956418308116852, + "grad_norm": 1.4218099117279053, + "learning_rate": 4.82571622361498e-05, + "loss": 5.3497, + "step": 20104 + }, + { + "epoch": 0.11957013036444952, + "grad_norm": 1.6142303943634033, + "learning_rate": 4.82569908845524e-05, + "loss": 5.1657, + "step": 20105 + }, + { + "epoch": 0.11957607764773051, + "grad_norm": 1.9385474920272827, + "learning_rate": 4.8256819524836224e-05, + "loss": 5.0509, + "step": 20106 + }, + { + "epoch": 0.11958202493101151, + "grad_norm": 2.077528953552246, + "learning_rate": 4.825664815700134e-05, + "loss": 5.1879, + "step": 20107 + }, + { + "epoch": 0.11958797221429252, + "grad_norm": 2.158764123916626, + "learning_rate": 4.825647678104779e-05, + "loss": 4.9595, + "step": 20108 + }, + { + "epoch": 0.1195939194975735, + "grad_norm": 2.0398664474487305, + "learning_rate": 4.825630539697565e-05, + "loss": 4.9156, + "step": 20109 + }, + { + "epoch": 0.1195998667808545, + "grad_norm": 2.0280275344848633, + "learning_rate": 4.825613400478497e-05, + "loss": 4.8655, + "step": 20110 + }, + { + "epoch": 0.11960581406413551, + "grad_norm": 2.0311338901519775, + "learning_rate": 4.8255962604475816e-05, + "loss": 4.8953, + "step": 20111 + }, + { + "epoch": 0.1196117613474165, + "grad_norm": 2.334346055984497, + "learning_rate": 4.825579119604825e-05, + "loss": 5.0044, + "step": 20112 + }, + { + "epoch": 0.1196177086306975, + "grad_norm": 2.272148847579956, + "learning_rate": 4.825561977950233e-05, + "loss": 4.8911, + "step": 20113 + }, + { + "epoch": 0.1196236559139785, + "grad_norm": 2.0724244117736816, + "learning_rate": 4.8255448354838104e-05, + "loss": 5.3492, + "step": 20114 + }, + { + "epoch": 0.11962960319725949, + "grad_norm": 1.7691513299942017, + "learning_rate": 4.8255276922055644e-05, + "loss": 5.5727, + "step": 20115 + }, + { + "epoch": 0.11963555048054049, + "grad_norm": 1.9434363842010498, + "learning_rate": 4.8255105481155004e-05, + "loss": 5.4564, + "step": 20116 + }, + { + "epoch": 0.11964149776382149, + "grad_norm": 1.623660683631897, + "learning_rate": 4.825493403213626e-05, + "loss": 5.2862, + "step": 20117 + }, + { + "epoch": 0.11964744504710248, + "grad_norm": 1.6246039867401123, + "learning_rate": 4.8254762574999446e-05, + "loss": 5.3627, + "step": 20118 + }, + { + "epoch": 0.11965339233038348, + "grad_norm": 1.689290165901184, + "learning_rate": 4.825459110974464e-05, + "loss": 4.6902, + "step": 20119 + }, + { + "epoch": 0.11965933961366448, + "grad_norm": 1.487697720527649, + "learning_rate": 4.825441963637189e-05, + "loss": 4.7598, + "step": 20120 + }, + { + "epoch": 0.11966528689694547, + "grad_norm": 1.7388331890106201, + "learning_rate": 4.825424815488126e-05, + "loss": 4.709, + "step": 20121 + }, + { + "epoch": 0.11967123418022647, + "grad_norm": 1.9586225748062134, + "learning_rate": 4.8254076665272826e-05, + "loss": 4.4625, + "step": 20122 + }, + { + "epoch": 0.11967718146350748, + "grad_norm": 1.9228769540786743, + "learning_rate": 4.825390516754662e-05, + "loss": 4.1447, + "step": 20123 + }, + { + "epoch": 0.11968312874678846, + "grad_norm": 1.8852907419204712, + "learning_rate": 4.825373366170273e-05, + "loss": 4.2618, + "step": 20124 + }, + { + "epoch": 0.11968907603006947, + "grad_norm": 1.8267028331756592, + "learning_rate": 4.825356214774119e-05, + "loss": 4.4095, + "step": 20125 + }, + { + "epoch": 0.11969502331335047, + "grad_norm": 1.8847311735153198, + "learning_rate": 4.825339062566208e-05, + "loss": 4.1904, + "step": 20126 + }, + { + "epoch": 0.11970097059663146, + "grad_norm": 2.0036990642547607, + "learning_rate": 4.825321909546545e-05, + "loss": 4.2348, + "step": 20127 + }, + { + "epoch": 0.11970691787991246, + "grad_norm": 1.8992520570755005, + "learning_rate": 4.825304755715136e-05, + "loss": 4.3038, + "step": 20128 + }, + { + "epoch": 0.11971286516319345, + "grad_norm": 1.8314359188079834, + "learning_rate": 4.8252876010719874e-05, + "loss": 4.102, + "step": 20129 + }, + { + "epoch": 0.11971881244647445, + "grad_norm": 1.9093595743179321, + "learning_rate": 4.825270445617104e-05, + "loss": 4.0307, + "step": 20130 + }, + { + "epoch": 0.11972475972975545, + "grad_norm": 2.1645400524139404, + "learning_rate": 4.8252532893504936e-05, + "loss": 4.2032, + "step": 20131 + }, + { + "epoch": 0.11973070701303644, + "grad_norm": 2.0268661975860596, + "learning_rate": 4.8252361322721605e-05, + "loss": 4.7705, + "step": 20132 + }, + { + "epoch": 0.11973665429631744, + "grad_norm": 1.8852148056030273, + "learning_rate": 4.825218974382113e-05, + "loss": 4.8969, + "step": 20133 + }, + { + "epoch": 0.11974260157959844, + "grad_norm": 1.9107592105865479, + "learning_rate": 4.825201815680354e-05, + "loss": 5.2587, + "step": 20134 + }, + { + "epoch": 0.11974854886287943, + "grad_norm": 1.6433600187301636, + "learning_rate": 4.825184656166892e-05, + "loss": 5.1954, + "step": 20135 + }, + { + "epoch": 0.11975449614616043, + "grad_norm": 1.4135210514068604, + "learning_rate": 4.825167495841731e-05, + "loss": 5.0398, + "step": 20136 + }, + { + "epoch": 0.11976044342944143, + "grad_norm": 1.9514580965042114, + "learning_rate": 4.825150334704879e-05, + "loss": 4.3527, + "step": 20137 + }, + { + "epoch": 0.11976639071272242, + "grad_norm": 1.8811348676681519, + "learning_rate": 4.825133172756341e-05, + "loss": 4.2798, + "step": 20138 + }, + { + "epoch": 0.11977233799600343, + "grad_norm": 1.8210500478744507, + "learning_rate": 4.825116009996123e-05, + "loss": 4.666, + "step": 20139 + }, + { + "epoch": 0.11977828527928443, + "grad_norm": 1.8773581981658936, + "learning_rate": 4.825098846424231e-05, + "loss": 4.9104, + "step": 20140 + }, + { + "epoch": 0.11978423256256542, + "grad_norm": 1.517233967781067, + "learning_rate": 4.825081682040671e-05, + "loss": 5.5915, + "step": 20141 + }, + { + "epoch": 0.11979017984584642, + "grad_norm": 1.6219067573547363, + "learning_rate": 4.825064516845449e-05, + "loss": 5.6538, + "step": 20142 + }, + { + "epoch": 0.11979612712912742, + "grad_norm": 1.4977927207946777, + "learning_rate": 4.8250473508385707e-05, + "loss": 5.3499, + "step": 20143 + }, + { + "epoch": 0.11980207441240841, + "grad_norm": 1.5381087064743042, + "learning_rate": 4.8250301840200424e-05, + "loss": 5.6666, + "step": 20144 + }, + { + "epoch": 0.11980802169568941, + "grad_norm": 1.5895806550979614, + "learning_rate": 4.82501301638987e-05, + "loss": 5.2099, + "step": 20145 + }, + { + "epoch": 0.11981396897897041, + "grad_norm": 1.7511320114135742, + "learning_rate": 4.8249958479480603e-05, + "loss": 4.622, + "step": 20146 + }, + { + "epoch": 0.1198199162622514, + "grad_norm": 1.8109928369522095, + "learning_rate": 4.824978678694618e-05, + "loss": 4.4156, + "step": 20147 + }, + { + "epoch": 0.1198258635455324, + "grad_norm": 1.474926471710205, + "learning_rate": 4.8249615086295494e-05, + "loss": 5.4845, + "step": 20148 + }, + { + "epoch": 0.1198318108288134, + "grad_norm": 1.8301719427108765, + "learning_rate": 4.824944337752861e-05, + "loss": 5.1814, + "step": 20149 + }, + { + "epoch": 0.11983775811209439, + "grad_norm": 1.8549950122833252, + "learning_rate": 4.824927166064559e-05, + "loss": 5.2944, + "step": 20150 + }, + { + "epoch": 0.1198437053953754, + "grad_norm": 1.7832791805267334, + "learning_rate": 4.8249099935646494e-05, + "loss": 5.7594, + "step": 20151 + }, + { + "epoch": 0.1198496526786564, + "grad_norm": 1.5706509351730347, + "learning_rate": 4.8248928202531366e-05, + "loss": 5.4607, + "step": 20152 + }, + { + "epoch": 0.11985559996193738, + "grad_norm": 1.6395286321640015, + "learning_rate": 4.824875646130028e-05, + "loss": 5.3338, + "step": 20153 + }, + { + "epoch": 0.11986154724521839, + "grad_norm": 1.9523805379867554, + "learning_rate": 4.824858471195329e-05, + "loss": 5.1205, + "step": 20154 + }, + { + "epoch": 0.11986749452849939, + "grad_norm": 2.45190691947937, + "learning_rate": 4.824841295449047e-05, + "loss": 4.5387, + "step": 20155 + }, + { + "epoch": 0.11987344181178038, + "grad_norm": 2.2806150913238525, + "learning_rate": 4.8248241188911856e-05, + "loss": 4.8134, + "step": 20156 + }, + { + "epoch": 0.11987938909506138, + "grad_norm": 2.230710029602051, + "learning_rate": 4.8248069415217534e-05, + "loss": 4.7386, + "step": 20157 + }, + { + "epoch": 0.11988533637834237, + "grad_norm": 2.13611102104187, + "learning_rate": 4.8247897633407546e-05, + "loss": 4.6519, + "step": 20158 + }, + { + "epoch": 0.11989128366162337, + "grad_norm": 1.7644202709197998, + "learning_rate": 4.824772584348196e-05, + "loss": 5.5343, + "step": 20159 + }, + { + "epoch": 0.11989723094490437, + "grad_norm": 1.8997445106506348, + "learning_rate": 4.824755404544083e-05, + "loss": 5.2135, + "step": 20160 + }, + { + "epoch": 0.11990317822818536, + "grad_norm": 1.8288135528564453, + "learning_rate": 4.824738223928421e-05, + "loss": 4.9554, + "step": 20161 + }, + { + "epoch": 0.11990912551146636, + "grad_norm": 1.795866847038269, + "learning_rate": 4.824721042501218e-05, + "loss": 5.6791, + "step": 20162 + }, + { + "epoch": 0.11991507279474736, + "grad_norm": 2.3721072673797607, + "learning_rate": 4.824703860262479e-05, + "loss": 5.4931, + "step": 20163 + }, + { + "epoch": 0.11992102007802835, + "grad_norm": 2.415207862854004, + "learning_rate": 4.824686677212209e-05, + "loss": 5.3801, + "step": 20164 + }, + { + "epoch": 0.11992696736130935, + "grad_norm": 2.411116600036621, + "learning_rate": 4.824669493350415e-05, + "loss": 5.1122, + "step": 20165 + }, + { + "epoch": 0.11993291464459035, + "grad_norm": 1.928256869316101, + "learning_rate": 4.824652308677104e-05, + "loss": 5.1627, + "step": 20166 + }, + { + "epoch": 0.11993886192787134, + "grad_norm": 1.9031376838684082, + "learning_rate": 4.8246351231922803e-05, + "loss": 5.014, + "step": 20167 + }, + { + "epoch": 0.11994480921115235, + "grad_norm": 1.8143563270568848, + "learning_rate": 4.82461793689595e-05, + "loss": 4.8921, + "step": 20168 + }, + { + "epoch": 0.11995075649443335, + "grad_norm": 1.7218538522720337, + "learning_rate": 4.824600749788121e-05, + "loss": 4.83, + "step": 20169 + }, + { + "epoch": 0.11995670377771434, + "grad_norm": 1.8235888481140137, + "learning_rate": 4.824583561868796e-05, + "loss": 5.0709, + "step": 20170 + }, + { + "epoch": 0.11996265106099534, + "grad_norm": 2.404656410217285, + "learning_rate": 4.8245663731379845e-05, + "loss": 4.7555, + "step": 20171 + }, + { + "epoch": 0.11996859834427634, + "grad_norm": 2.0463438034057617, + "learning_rate": 4.82454918359569e-05, + "loss": 5.2582, + "step": 20172 + }, + { + "epoch": 0.11997454562755733, + "grad_norm": 1.9073017835617065, + "learning_rate": 4.82453199324192e-05, + "loss": 5.794, + "step": 20173 + }, + { + "epoch": 0.11998049291083833, + "grad_norm": 1.856101632118225, + "learning_rate": 4.8245148020766796e-05, + "loss": 5.8569, + "step": 20174 + }, + { + "epoch": 0.11998644019411933, + "grad_norm": 1.6862335205078125, + "learning_rate": 4.8244976100999745e-05, + "loss": 5.7762, + "step": 20175 + }, + { + "epoch": 0.11999238747740032, + "grad_norm": 1.8727613687515259, + "learning_rate": 4.824480417311812e-05, + "loss": 5.5417, + "step": 20176 + }, + { + "epoch": 0.11999833476068132, + "grad_norm": 2.2967453002929688, + "learning_rate": 4.8244632237121964e-05, + "loss": 5.3268, + "step": 20177 + }, + { + "epoch": 0.12000428204396232, + "grad_norm": 2.1443405151367188, + "learning_rate": 4.824446029301136e-05, + "loss": 5.1333, + "step": 20178 + }, + { + "epoch": 0.12001022932724331, + "grad_norm": 1.7855141162872314, + "learning_rate": 4.824428834078635e-05, + "loss": 5.2781, + "step": 20179 + }, + { + "epoch": 0.12001617661052431, + "grad_norm": 1.880510926246643, + "learning_rate": 4.8244116380447e-05, + "loss": 5.1012, + "step": 20180 + }, + { + "epoch": 0.12002212389380532, + "grad_norm": 1.6733261346817017, + "learning_rate": 4.824394441199337e-05, + "loss": 5.3, + "step": 20181 + }, + { + "epoch": 0.1200280711770863, + "grad_norm": 1.781132459640503, + "learning_rate": 4.824377243542552e-05, + "loss": 5.7102, + "step": 20182 + }, + { + "epoch": 0.1200340184603673, + "grad_norm": 1.779144287109375, + "learning_rate": 4.82436004507435e-05, + "loss": 5.694, + "step": 20183 + }, + { + "epoch": 0.12003996574364831, + "grad_norm": 1.6547144651412964, + "learning_rate": 4.824342845794739e-05, + "loss": 5.4852, + "step": 20184 + }, + { + "epoch": 0.1200459130269293, + "grad_norm": 1.8403137922286987, + "learning_rate": 4.824325645703723e-05, + "loss": 5.9584, + "step": 20185 + }, + { + "epoch": 0.1200518603102103, + "grad_norm": 1.738139271736145, + "learning_rate": 4.8243084448013095e-05, + "loss": 5.903, + "step": 20186 + }, + { + "epoch": 0.12005780759349129, + "grad_norm": 1.7819492816925049, + "learning_rate": 4.824291243087504e-05, + "loss": 5.587, + "step": 20187 + }, + { + "epoch": 0.12006375487677229, + "grad_norm": 1.5876322984695435, + "learning_rate": 4.824274040562313e-05, + "loss": 5.1007, + "step": 20188 + }, + { + "epoch": 0.12006970216005329, + "grad_norm": 1.6465766429901123, + "learning_rate": 4.824256837225741e-05, + "loss": 4.9674, + "step": 20189 + }, + { + "epoch": 0.12007564944333428, + "grad_norm": 1.5593008995056152, + "learning_rate": 4.824239633077795e-05, + "loss": 4.8428, + "step": 20190 + }, + { + "epoch": 0.12008159672661528, + "grad_norm": 1.9153317213058472, + "learning_rate": 4.8242224281184814e-05, + "loss": 5.7613, + "step": 20191 + }, + { + "epoch": 0.12008754400989628, + "grad_norm": 1.727364182472229, + "learning_rate": 4.8242052223478055e-05, + "loss": 5.8612, + "step": 20192 + }, + { + "epoch": 0.12009349129317727, + "grad_norm": 1.567190408706665, + "learning_rate": 4.8241880157657736e-05, + "loss": 5.9975, + "step": 20193 + }, + { + "epoch": 0.12009943857645827, + "grad_norm": 1.549182415008545, + "learning_rate": 4.824170808372391e-05, + "loss": 5.9723, + "step": 20194 + }, + { + "epoch": 0.12010538585973927, + "grad_norm": 1.6152268648147583, + "learning_rate": 4.824153600167666e-05, + "loss": 5.9953, + "step": 20195 + }, + { + "epoch": 0.12011133314302026, + "grad_norm": 1.5206012725830078, + "learning_rate": 4.824136391151602e-05, + "loss": 5.7435, + "step": 20196 + }, + { + "epoch": 0.12011728042630127, + "grad_norm": 1.719746470451355, + "learning_rate": 4.824119181324206e-05, + "loss": 5.6181, + "step": 20197 + }, + { + "epoch": 0.12012322770958227, + "grad_norm": 1.53969407081604, + "learning_rate": 4.824101970685484e-05, + "loss": 5.2699, + "step": 20198 + }, + { + "epoch": 0.12012917499286326, + "grad_norm": 1.6543430089950562, + "learning_rate": 4.824084759235442e-05, + "loss": 5.3316, + "step": 20199 + }, + { + "epoch": 0.12013512227614426, + "grad_norm": 1.8182042837142944, + "learning_rate": 4.8240675469740856e-05, + "loss": 5.4494, + "step": 20200 + }, + { + "epoch": 0.12014106955942526, + "grad_norm": 1.5531221628189087, + "learning_rate": 4.824050333901422e-05, + "loss": 5.3292, + "step": 20201 + }, + { + "epoch": 0.12014701684270625, + "grad_norm": 1.4964851140975952, + "learning_rate": 4.8240331200174564e-05, + "loss": 5.391, + "step": 20202 + }, + { + "epoch": 0.12015296412598725, + "grad_norm": 1.5492072105407715, + "learning_rate": 4.824015905322195e-05, + "loss": 5.373, + "step": 20203 + }, + { + "epoch": 0.12015891140926825, + "grad_norm": 1.733115792274475, + "learning_rate": 4.823998689815643e-05, + "loss": 5.6997, + "step": 20204 + }, + { + "epoch": 0.12016485869254924, + "grad_norm": 1.8122310638427734, + "learning_rate": 4.8239814734978074e-05, + "loss": 5.4116, + "step": 20205 + }, + { + "epoch": 0.12017080597583024, + "grad_norm": 1.9058727025985718, + "learning_rate": 4.8239642563686934e-05, + "loss": 4.9749, + "step": 20206 + }, + { + "epoch": 0.12017675325911124, + "grad_norm": 1.5442882776260376, + "learning_rate": 4.823947038428308e-05, + "loss": 5.6342, + "step": 20207 + }, + { + "epoch": 0.12018270054239223, + "grad_norm": 1.5593653917312622, + "learning_rate": 4.823929819676657e-05, + "loss": 5.7084, + "step": 20208 + }, + { + "epoch": 0.12018864782567323, + "grad_norm": 1.5067681074142456, + "learning_rate": 4.823912600113746e-05, + "loss": 5.2455, + "step": 20209 + }, + { + "epoch": 0.12019459510895424, + "grad_norm": 1.7560538053512573, + "learning_rate": 4.82389537973958e-05, + "loss": 5.5733, + "step": 20210 + }, + { + "epoch": 0.12020054239223522, + "grad_norm": 1.6941232681274414, + "learning_rate": 4.823878158554167e-05, + "loss": 5.4642, + "step": 20211 + }, + { + "epoch": 0.12020648967551623, + "grad_norm": 1.531043529510498, + "learning_rate": 4.8238609365575124e-05, + "loss": 5.1859, + "step": 20212 + }, + { + "epoch": 0.12021243695879723, + "grad_norm": 1.8201080560684204, + "learning_rate": 4.823843713749622e-05, + "loss": 5.1331, + "step": 20213 + }, + { + "epoch": 0.12021838424207822, + "grad_norm": 1.6585347652435303, + "learning_rate": 4.823826490130501e-05, + "loss": 5.6017, + "step": 20214 + }, + { + "epoch": 0.12022433152535922, + "grad_norm": 1.7156457901000977, + "learning_rate": 4.8238092657001566e-05, + "loss": 5.4022, + "step": 20215 + }, + { + "epoch": 0.1202302788086402, + "grad_norm": 1.474266529083252, + "learning_rate": 4.823792040458595e-05, + "loss": 5.6352, + "step": 20216 + }, + { + "epoch": 0.12023622609192121, + "grad_norm": 1.4047836065292358, + "learning_rate": 4.8237748144058206e-05, + "loss": 5.7834, + "step": 20217 + }, + { + "epoch": 0.12024217337520221, + "grad_norm": 1.4172712564468384, + "learning_rate": 4.823757587541841e-05, + "loss": 5.7711, + "step": 20218 + }, + { + "epoch": 0.1202481206584832, + "grad_norm": 1.6180040836334229, + "learning_rate": 4.823740359866661e-05, + "loss": 4.9208, + "step": 20219 + }, + { + "epoch": 0.1202540679417642, + "grad_norm": 1.917434573173523, + "learning_rate": 4.8237231313802875e-05, + "loss": 5.0108, + "step": 20220 + }, + { + "epoch": 0.1202600152250452, + "grad_norm": 1.6807219982147217, + "learning_rate": 4.823705902082727e-05, + "loss": 4.8156, + "step": 20221 + }, + { + "epoch": 0.12026596250832619, + "grad_norm": 1.7759804725646973, + "learning_rate": 4.823688671973984e-05, + "loss": 4.9253, + "step": 20222 + }, + { + "epoch": 0.12027190979160719, + "grad_norm": 1.667723536491394, + "learning_rate": 4.8236714410540664e-05, + "loss": 5.3166, + "step": 20223 + }, + { + "epoch": 0.1202778570748882, + "grad_norm": 2.089888334274292, + "learning_rate": 4.823654209322977e-05, + "loss": 4.5147, + "step": 20224 + }, + { + "epoch": 0.12028380435816918, + "grad_norm": 1.878585934638977, + "learning_rate": 4.823636976780725e-05, + "loss": 5.2102, + "step": 20225 + }, + { + "epoch": 0.12028975164145018, + "grad_norm": 1.758644461631775, + "learning_rate": 4.8236197434273164e-05, + "loss": 5.7388, + "step": 20226 + }, + { + "epoch": 0.12029569892473119, + "grad_norm": 1.8373035192489624, + "learning_rate": 4.823602509262755e-05, + "loss": 5.0102, + "step": 20227 + }, + { + "epoch": 0.12030164620801218, + "grad_norm": 1.697994589805603, + "learning_rate": 4.8235852742870486e-05, + "loss": 4.8272, + "step": 20228 + }, + { + "epoch": 0.12030759349129318, + "grad_norm": 1.8276288509368896, + "learning_rate": 4.823568038500202e-05, + "loss": 5.2316, + "step": 20229 + }, + { + "epoch": 0.12031354077457418, + "grad_norm": 1.691236972808838, + "learning_rate": 4.823550801902222e-05, + "loss": 5.2957, + "step": 20230 + }, + { + "epoch": 0.12031948805785517, + "grad_norm": 1.5625227689743042, + "learning_rate": 4.823533564493115e-05, + "loss": 5.0525, + "step": 20231 + }, + { + "epoch": 0.12032543534113617, + "grad_norm": 1.927823543548584, + "learning_rate": 4.823516326272886e-05, + "loss": 5.1367, + "step": 20232 + }, + { + "epoch": 0.12033138262441717, + "grad_norm": 1.649434208869934, + "learning_rate": 4.823499087241541e-05, + "loss": 4.8151, + "step": 20233 + }, + { + "epoch": 0.12033732990769816, + "grad_norm": 1.660487413406372, + "learning_rate": 4.8234818473990866e-05, + "loss": 4.8875, + "step": 20234 + }, + { + "epoch": 0.12034327719097916, + "grad_norm": 1.584165096282959, + "learning_rate": 4.823464606745529e-05, + "loss": 5.4909, + "step": 20235 + }, + { + "epoch": 0.12034922447426016, + "grad_norm": 1.6812808513641357, + "learning_rate": 4.823447365280874e-05, + "loss": 5.1194, + "step": 20236 + }, + { + "epoch": 0.12035517175754115, + "grad_norm": 1.6096045970916748, + "learning_rate": 4.823430123005127e-05, + "loss": 4.974, + "step": 20237 + }, + { + "epoch": 0.12036111904082215, + "grad_norm": 1.9969391822814941, + "learning_rate": 4.8234128799182954e-05, + "loss": 4.4403, + "step": 20238 + }, + { + "epoch": 0.12036706632410316, + "grad_norm": 1.7902976274490356, + "learning_rate": 4.8233956360203836e-05, + "loss": 5.0718, + "step": 20239 + }, + { + "epoch": 0.12037301360738414, + "grad_norm": 1.7156457901000977, + "learning_rate": 4.8233783913113985e-05, + "loss": 5.0892, + "step": 20240 + }, + { + "epoch": 0.12037896089066515, + "grad_norm": 2.1590521335601807, + "learning_rate": 4.823361145791346e-05, + "loss": 5.3385, + "step": 20241 + }, + { + "epoch": 0.12038490817394615, + "grad_norm": 1.7091206312179565, + "learning_rate": 4.8233438994602325e-05, + "loss": 5.1961, + "step": 20242 + }, + { + "epoch": 0.12039085545722714, + "grad_norm": 1.3705766201019287, + "learning_rate": 4.823326652318063e-05, + "loss": 5.023, + "step": 20243 + }, + { + "epoch": 0.12039680274050814, + "grad_norm": 1.2733731269836426, + "learning_rate": 4.8233094043648456e-05, + "loss": 5.2236, + "step": 20244 + }, + { + "epoch": 0.12040275002378913, + "grad_norm": 1.3697882890701294, + "learning_rate": 4.823292155600583e-05, + "loss": 5.3146, + "step": 20245 + }, + { + "epoch": 0.12040869730707013, + "grad_norm": 1.4292283058166504, + "learning_rate": 4.8232749060252846e-05, + "loss": 5.2777, + "step": 20246 + }, + { + "epoch": 0.12041464459035113, + "grad_norm": 1.5285491943359375, + "learning_rate": 4.823257655638954e-05, + "loss": 5.3465, + "step": 20247 + }, + { + "epoch": 0.12042059187363212, + "grad_norm": 1.6307164430618286, + "learning_rate": 4.823240404441598e-05, + "loss": 5.2863, + "step": 20248 + }, + { + "epoch": 0.12042653915691312, + "grad_norm": 1.4112886190414429, + "learning_rate": 4.823223152433224e-05, + "loss": 5.3082, + "step": 20249 + }, + { + "epoch": 0.12043248644019412, + "grad_norm": 1.4699361324310303, + "learning_rate": 4.823205899613836e-05, + "loss": 5.2161, + "step": 20250 + }, + { + "epoch": 0.12043843372347511, + "grad_norm": 1.3991621732711792, + "learning_rate": 4.823188645983441e-05, + "loss": 5.2493, + "step": 20251 + }, + { + "epoch": 0.12044438100675611, + "grad_norm": 1.4673911333084106, + "learning_rate": 4.8231713915420446e-05, + "loss": 5.1592, + "step": 20252 + }, + { + "epoch": 0.12045032829003711, + "grad_norm": 1.3782176971435547, + "learning_rate": 4.8231541362896534e-05, + "loss": 5.3296, + "step": 20253 + }, + { + "epoch": 0.1204562755733181, + "grad_norm": 1.5209922790527344, + "learning_rate": 4.823136880226272e-05, + "loss": 5.4215, + "step": 20254 + }, + { + "epoch": 0.1204622228565991, + "grad_norm": 1.3906199932098389, + "learning_rate": 4.823119623351909e-05, + "loss": 5.2263, + "step": 20255 + }, + { + "epoch": 0.1204681701398801, + "grad_norm": 1.4061380624771118, + "learning_rate": 4.823102365666568e-05, + "loss": 5.2252, + "step": 20256 + }, + { + "epoch": 0.1204741174231611, + "grad_norm": 1.3005892038345337, + "learning_rate": 4.8230851071702564e-05, + "loss": 5.2015, + "step": 20257 + }, + { + "epoch": 0.1204800647064421, + "grad_norm": 1.4949315786361694, + "learning_rate": 4.8230678478629796e-05, + "loss": 4.9753, + "step": 20258 + }, + { + "epoch": 0.1204860119897231, + "grad_norm": 1.5322837829589844, + "learning_rate": 4.823050587744744e-05, + "loss": 5.1862, + "step": 20259 + }, + { + "epoch": 0.12049195927300409, + "grad_norm": 1.379016637802124, + "learning_rate": 4.8230333268155556e-05, + "loss": 5.0689, + "step": 20260 + }, + { + "epoch": 0.12049790655628509, + "grad_norm": 1.2959635257720947, + "learning_rate": 4.8230160650754205e-05, + "loss": 5.1079, + "step": 20261 + }, + { + "epoch": 0.12050385383956609, + "grad_norm": 1.3587706089019775, + "learning_rate": 4.8229988025243436e-05, + "loss": 5.2024, + "step": 20262 + }, + { + "epoch": 0.12050980112284708, + "grad_norm": 1.3031280040740967, + "learning_rate": 4.822981539162332e-05, + "loss": 5.1008, + "step": 20263 + }, + { + "epoch": 0.12051574840612808, + "grad_norm": 1.315364956855774, + "learning_rate": 4.822964274989392e-05, + "loss": 4.8122, + "step": 20264 + }, + { + "epoch": 0.12052169568940908, + "grad_norm": 1.3627794981002808, + "learning_rate": 4.8229470100055293e-05, + "loss": 5.0851, + "step": 20265 + }, + { + "epoch": 0.12052764297269007, + "grad_norm": 1.4490907192230225, + "learning_rate": 4.822929744210749e-05, + "loss": 4.7956, + "step": 20266 + }, + { + "epoch": 0.12053359025597107, + "grad_norm": 1.1658390760421753, + "learning_rate": 4.8229124776050584e-05, + "loss": 5.0365, + "step": 20267 + }, + { + "epoch": 0.12053953753925208, + "grad_norm": 1.2844047546386719, + "learning_rate": 4.822895210188463e-05, + "loss": 5.3005, + "step": 20268 + }, + { + "epoch": 0.12054548482253306, + "grad_norm": 1.5759227275848389, + "learning_rate": 4.822877941960969e-05, + "loss": 5.0768, + "step": 20269 + }, + { + "epoch": 0.12055143210581407, + "grad_norm": 1.457592248916626, + "learning_rate": 4.822860672922582e-05, + "loss": 5.1662, + "step": 20270 + }, + { + "epoch": 0.12055737938909507, + "grad_norm": 1.2711186408996582, + "learning_rate": 4.8228434030733086e-05, + "loss": 5.3703, + "step": 20271 + }, + { + "epoch": 0.12056332667237606, + "grad_norm": 1.300824522972107, + "learning_rate": 4.822826132413155e-05, + "loss": 5.2529, + "step": 20272 + }, + { + "epoch": 0.12056927395565706, + "grad_norm": 1.2395694255828857, + "learning_rate": 4.822808860942126e-05, + "loss": 5.3225, + "step": 20273 + }, + { + "epoch": 0.12057522123893805, + "grad_norm": 1.491053581237793, + "learning_rate": 4.822791588660229e-05, + "loss": 5.5039, + "step": 20274 + }, + { + "epoch": 0.12058116852221905, + "grad_norm": 1.4981472492218018, + "learning_rate": 4.8227743155674684e-05, + "loss": 4.8774, + "step": 20275 + }, + { + "epoch": 0.12058711580550005, + "grad_norm": 1.4627505540847778, + "learning_rate": 4.822757041663852e-05, + "loss": 4.9165, + "step": 20276 + }, + { + "epoch": 0.12059306308878104, + "grad_norm": 1.5328632593154907, + "learning_rate": 4.8227397669493856e-05, + "loss": 4.8773, + "step": 20277 + }, + { + "epoch": 0.12059901037206204, + "grad_norm": 1.314146876335144, + "learning_rate": 4.822722491424074e-05, + "loss": 5.0159, + "step": 20278 + }, + { + "epoch": 0.12060495765534304, + "grad_norm": 1.435636043548584, + "learning_rate": 4.822705215087925e-05, + "loss": 5.1621, + "step": 20279 + }, + { + "epoch": 0.12061090493862403, + "grad_norm": 1.3141332864761353, + "learning_rate": 4.822687937940943e-05, + "loss": 5.3143, + "step": 20280 + }, + { + "epoch": 0.12061685222190503, + "grad_norm": 1.3140829801559448, + "learning_rate": 4.822670659983134e-05, + "loss": 5.3171, + "step": 20281 + }, + { + "epoch": 0.12062279950518603, + "grad_norm": 1.5490076541900635, + "learning_rate": 4.8226533812145056e-05, + "loss": 5.1932, + "step": 20282 + }, + { + "epoch": 0.12062874678846702, + "grad_norm": 1.4878573417663574, + "learning_rate": 4.822636101635063e-05, + "loss": 5.1662, + "step": 20283 + }, + { + "epoch": 0.12063469407174802, + "grad_norm": 1.519872784614563, + "learning_rate": 4.822618821244811e-05, + "loss": 5.0641, + "step": 20284 + }, + { + "epoch": 0.12064064135502903, + "grad_norm": 1.430929183959961, + "learning_rate": 4.822601540043757e-05, + "loss": 4.9086, + "step": 20285 + }, + { + "epoch": 0.12064658863831002, + "grad_norm": 1.483995795249939, + "learning_rate": 4.822584258031908e-05, + "loss": 4.992, + "step": 20286 + }, + { + "epoch": 0.12065253592159102, + "grad_norm": 1.3074853420257568, + "learning_rate": 4.822566975209269e-05, + "loss": 4.9514, + "step": 20287 + }, + { + "epoch": 0.12065848320487202, + "grad_norm": 1.6032319068908691, + "learning_rate": 4.822549691575844e-05, + "loss": 4.8495, + "step": 20288 + }, + { + "epoch": 0.12066443048815301, + "grad_norm": 1.2918034791946411, + "learning_rate": 4.822532407131641e-05, + "loss": 5.0728, + "step": 20289 + }, + { + "epoch": 0.12067037777143401, + "grad_norm": 1.3000357151031494, + "learning_rate": 4.8225151218766675e-05, + "loss": 5.0898, + "step": 20290 + }, + { + "epoch": 0.12067632505471501, + "grad_norm": 1.3674614429473877, + "learning_rate": 4.8224978358109274e-05, + "loss": 4.8252, + "step": 20291 + }, + { + "epoch": 0.120682272337996, + "grad_norm": 1.1932893991470337, + "learning_rate": 4.822480548934427e-05, + "loss": 4.9946, + "step": 20292 + }, + { + "epoch": 0.120688219621277, + "grad_norm": 1.1052628755569458, + "learning_rate": 4.822463261247173e-05, + "loss": 5.0293, + "step": 20293 + }, + { + "epoch": 0.120694166904558, + "grad_norm": 1.1658306121826172, + "learning_rate": 4.82244597274917e-05, + "loss": 4.9417, + "step": 20294 + }, + { + "epoch": 0.12070011418783899, + "grad_norm": 1.1357192993164062, + "learning_rate": 4.822428683440426e-05, + "loss": 4.9448, + "step": 20295 + }, + { + "epoch": 0.12070606147112, + "grad_norm": 1.0769197940826416, + "learning_rate": 4.822411393320946e-05, + "loss": 4.8676, + "step": 20296 + }, + { + "epoch": 0.120712008754401, + "grad_norm": 1.4339419603347778, + "learning_rate": 4.8223941023907366e-05, + "loss": 5.0648, + "step": 20297 + }, + { + "epoch": 0.12071795603768198, + "grad_norm": 1.6009191274642944, + "learning_rate": 4.822376810649803e-05, + "loss": 5.2228, + "step": 20298 + }, + { + "epoch": 0.12072390332096299, + "grad_norm": 1.5266865491867065, + "learning_rate": 4.8223595180981515e-05, + "loss": 5.1399, + "step": 20299 + }, + { + "epoch": 0.12072985060424399, + "grad_norm": 1.6861037015914917, + "learning_rate": 4.822342224735788e-05, + "loss": 4.9326, + "step": 20300 + }, + { + "epoch": 0.12073579788752498, + "grad_norm": 1.4925029277801514, + "learning_rate": 4.8223249305627204e-05, + "loss": 4.9586, + "step": 20301 + }, + { + "epoch": 0.12074174517080598, + "grad_norm": 1.3088650703430176, + "learning_rate": 4.822307635578952e-05, + "loss": 5.1486, + "step": 20302 + }, + { + "epoch": 0.12074769245408697, + "grad_norm": 1.5702837705612183, + "learning_rate": 4.82229033978449e-05, + "loss": 4.788, + "step": 20303 + }, + { + "epoch": 0.12075363973736797, + "grad_norm": 1.5717079639434814, + "learning_rate": 4.8222730431793406e-05, + "loss": 4.6354, + "step": 20304 + }, + { + "epoch": 0.12075958702064897, + "grad_norm": 1.4520710706710815, + "learning_rate": 4.822255745763509e-05, + "loss": 4.6995, + "step": 20305 + }, + { + "epoch": 0.12076553430392996, + "grad_norm": 1.57894766330719, + "learning_rate": 4.822238447537003e-05, + "loss": 4.6355, + "step": 20306 + }, + { + "epoch": 0.12077148158721096, + "grad_norm": 1.5820640325546265, + "learning_rate": 4.822221148499827e-05, + "loss": 4.6993, + "step": 20307 + }, + { + "epoch": 0.12077742887049196, + "grad_norm": 1.5759177207946777, + "learning_rate": 4.822203848651987e-05, + "loss": 4.5678, + "step": 20308 + }, + { + "epoch": 0.12078337615377295, + "grad_norm": 1.5758824348449707, + "learning_rate": 4.822186547993491e-05, + "loss": 4.547, + "step": 20309 + }, + { + "epoch": 0.12078932343705395, + "grad_norm": 1.6604961156845093, + "learning_rate": 4.822169246524343e-05, + "loss": 4.6418, + "step": 20310 + }, + { + "epoch": 0.12079527072033495, + "grad_norm": 1.6913725137710571, + "learning_rate": 4.8221519442445496e-05, + "loss": 4.5329, + "step": 20311 + }, + { + "epoch": 0.12080121800361594, + "grad_norm": 1.6500364542007446, + "learning_rate": 4.822134641154117e-05, + "loss": 4.6701, + "step": 20312 + }, + { + "epoch": 0.12080716528689694, + "grad_norm": 1.6819617748260498, + "learning_rate": 4.822117337253051e-05, + "loss": 4.619, + "step": 20313 + }, + { + "epoch": 0.12081311257017795, + "grad_norm": 1.27179753780365, + "learning_rate": 4.8221000325413576e-05, + "loss": 5.091, + "step": 20314 + }, + { + "epoch": 0.12081905985345893, + "grad_norm": 1.357703447341919, + "learning_rate": 4.822082727019044e-05, + "loss": 4.9313, + "step": 20315 + }, + { + "epoch": 0.12082500713673994, + "grad_norm": 1.2419538497924805, + "learning_rate": 4.8220654206861144e-05, + "loss": 4.9511, + "step": 20316 + }, + { + "epoch": 0.12083095442002094, + "grad_norm": 1.4506672620773315, + "learning_rate": 4.822048113542576e-05, + "loss": 5.1608, + "step": 20317 + }, + { + "epoch": 0.12083690170330193, + "grad_norm": 1.597922921180725, + "learning_rate": 4.8220308055884345e-05, + "loss": 5.1663, + "step": 20318 + }, + { + "epoch": 0.12084284898658293, + "grad_norm": 1.2692219018936157, + "learning_rate": 4.822013496823696e-05, + "loss": 5.0838, + "step": 20319 + }, + { + "epoch": 0.12084879626986393, + "grad_norm": 1.427439570426941, + "learning_rate": 4.8219961872483674e-05, + "loss": 4.9863, + "step": 20320 + }, + { + "epoch": 0.12085474355314492, + "grad_norm": 1.3992658853530884, + "learning_rate": 4.821978876862453e-05, + "loss": 5.1907, + "step": 20321 + }, + { + "epoch": 0.12086069083642592, + "grad_norm": 1.3777414560317993, + "learning_rate": 4.8219615656659605e-05, + "loss": 5.137, + "step": 20322 + }, + { + "epoch": 0.12086663811970692, + "grad_norm": 1.3394333124160767, + "learning_rate": 4.821944253658895e-05, + "loss": 5.1222, + "step": 20323 + }, + { + "epoch": 0.12087258540298791, + "grad_norm": 1.3054091930389404, + "learning_rate": 4.8219269408412625e-05, + "loss": 4.9626, + "step": 20324 + }, + { + "epoch": 0.12087853268626891, + "grad_norm": 1.3209751844406128, + "learning_rate": 4.8219096272130696e-05, + "loss": 5.0408, + "step": 20325 + }, + { + "epoch": 0.12088447996954992, + "grad_norm": 1.3860117197036743, + "learning_rate": 4.821892312774322e-05, + "loss": 4.9667, + "step": 20326 + }, + { + "epoch": 0.1208904272528309, + "grad_norm": 1.2468161582946777, + "learning_rate": 4.821874997525025e-05, + "loss": 5.1203, + "step": 20327 + }, + { + "epoch": 0.1208963745361119, + "grad_norm": 1.221932291984558, + "learning_rate": 4.821857681465186e-05, + "loss": 4.9117, + "step": 20328 + }, + { + "epoch": 0.12090232181939291, + "grad_norm": 1.2188096046447754, + "learning_rate": 4.8218403645948105e-05, + "loss": 4.7647, + "step": 20329 + }, + { + "epoch": 0.1209082691026739, + "grad_norm": 1.4023007154464722, + "learning_rate": 4.8218230469139044e-05, + "loss": 4.9038, + "step": 20330 + }, + { + "epoch": 0.1209142163859549, + "grad_norm": 1.4733843803405762, + "learning_rate": 4.821805728422474e-05, + "loss": 4.9782, + "step": 20331 + }, + { + "epoch": 0.12092016366923589, + "grad_norm": 1.405462384223938, + "learning_rate": 4.821788409120525e-05, + "loss": 5.0028, + "step": 20332 + }, + { + "epoch": 0.12092611095251689, + "grad_norm": 1.4103752374649048, + "learning_rate": 4.821771089008064e-05, + "loss": 4.8219, + "step": 20333 + }, + { + "epoch": 0.12093205823579789, + "grad_norm": 1.403225064277649, + "learning_rate": 4.821753768085096e-05, + "loss": 4.9024, + "step": 20334 + }, + { + "epoch": 0.12093800551907888, + "grad_norm": 1.3480467796325684, + "learning_rate": 4.821736446351629e-05, + "loss": 4.9341, + "step": 20335 + }, + { + "epoch": 0.12094395280235988, + "grad_norm": 1.4869621992111206, + "learning_rate": 4.821719123807667e-05, + "loss": 5.6448, + "step": 20336 + }, + { + "epoch": 0.12094990008564088, + "grad_norm": 1.3473197221755981, + "learning_rate": 4.821701800453217e-05, + "loss": 4.9512, + "step": 20337 + }, + { + "epoch": 0.12095584736892187, + "grad_norm": 1.378721833229065, + "learning_rate": 4.821684476288285e-05, + "loss": 5.0146, + "step": 20338 + }, + { + "epoch": 0.12096179465220287, + "grad_norm": 1.2590171098709106, + "learning_rate": 4.821667151312876e-05, + "loss": 4.8453, + "step": 20339 + }, + { + "epoch": 0.12096774193548387, + "grad_norm": 1.3700976371765137, + "learning_rate": 4.821649825526998e-05, + "loss": 4.989, + "step": 20340 + }, + { + "epoch": 0.12097368921876486, + "grad_norm": 1.2956781387329102, + "learning_rate": 4.821632498930656e-05, + "loss": 5.1885, + "step": 20341 + }, + { + "epoch": 0.12097963650204586, + "grad_norm": 1.5004302263259888, + "learning_rate": 4.821615171523856e-05, + "loss": 4.8978, + "step": 20342 + }, + { + "epoch": 0.12098558378532687, + "grad_norm": 1.4427284002304077, + "learning_rate": 4.821597843306603e-05, + "loss": 5.0771, + "step": 20343 + }, + { + "epoch": 0.12099153106860785, + "grad_norm": 1.2329649925231934, + "learning_rate": 4.8215805142789054e-05, + "loss": 5.1695, + "step": 20344 + }, + { + "epoch": 0.12099747835188886, + "grad_norm": 1.521106243133545, + "learning_rate": 4.8215631844407685e-05, + "loss": 4.8117, + "step": 20345 + }, + { + "epoch": 0.12100342563516986, + "grad_norm": 1.4634170532226562, + "learning_rate": 4.8215458537921966e-05, + "loss": 4.8144, + "step": 20346 + }, + { + "epoch": 0.12100937291845085, + "grad_norm": 1.2921918630599976, + "learning_rate": 4.821528522333197e-05, + "loss": 5.0769, + "step": 20347 + }, + { + "epoch": 0.12101532020173185, + "grad_norm": 1.5667484998703003, + "learning_rate": 4.821511190063777e-05, + "loss": 4.7748, + "step": 20348 + }, + { + "epoch": 0.12102126748501285, + "grad_norm": 1.3546236753463745, + "learning_rate": 4.8214938569839405e-05, + "loss": 5.0496, + "step": 20349 + }, + { + "epoch": 0.12102721476829384, + "grad_norm": 1.354236125946045, + "learning_rate": 4.821476523093695e-05, + "loss": 4.9173, + "step": 20350 + }, + { + "epoch": 0.12103316205157484, + "grad_norm": 1.3883708715438843, + "learning_rate": 4.821459188393046e-05, + "loss": 5.0093, + "step": 20351 + }, + { + "epoch": 0.12103910933485584, + "grad_norm": 1.5914138555526733, + "learning_rate": 4.8214418528819995e-05, + "loss": 4.7995, + "step": 20352 + }, + { + "epoch": 0.12104505661813683, + "grad_norm": 1.3804936408996582, + "learning_rate": 4.821424516560561e-05, + "loss": 5.0071, + "step": 20353 + }, + { + "epoch": 0.12105100390141783, + "grad_norm": 1.4783899784088135, + "learning_rate": 4.8214071794287376e-05, + "loss": 4.9744, + "step": 20354 + }, + { + "epoch": 0.12105695118469884, + "grad_norm": 1.480790376663208, + "learning_rate": 4.821389841486535e-05, + "loss": 4.9975, + "step": 20355 + }, + { + "epoch": 0.12106289846797982, + "grad_norm": 1.852853536605835, + "learning_rate": 4.82137250273396e-05, + "loss": 5.069, + "step": 20356 + }, + { + "epoch": 0.12106884575126083, + "grad_norm": 1.623017430305481, + "learning_rate": 4.821355163171016e-05, + "loss": 4.9939, + "step": 20357 + }, + { + "epoch": 0.12107479303454183, + "grad_norm": 1.526219367980957, + "learning_rate": 4.8213378227977123e-05, + "loss": 5.1281, + "step": 20358 + }, + { + "epoch": 0.12108074031782282, + "grad_norm": 1.574321985244751, + "learning_rate": 4.8213204816140536e-05, + "loss": 5.0241, + "step": 20359 + }, + { + "epoch": 0.12108668760110382, + "grad_norm": 1.5978455543518066, + "learning_rate": 4.8213031396200446e-05, + "loss": 5.0107, + "step": 20360 + }, + { + "epoch": 0.12109263488438482, + "grad_norm": 1.509109616279602, + "learning_rate": 4.821285796815694e-05, + "loss": 5.0056, + "step": 20361 + }, + { + "epoch": 0.12109858216766581, + "grad_norm": 1.4923186302185059, + "learning_rate": 4.8212684532010054e-05, + "loss": 5.0412, + "step": 20362 + }, + { + "epoch": 0.12110452945094681, + "grad_norm": 1.7046619653701782, + "learning_rate": 4.8212511087759874e-05, + "loss": 4.8996, + "step": 20363 + }, + { + "epoch": 0.1211104767342278, + "grad_norm": 1.7599172592163086, + "learning_rate": 4.8212337635406435e-05, + "loss": 4.9979, + "step": 20364 + }, + { + "epoch": 0.1211164240175088, + "grad_norm": 1.6309099197387695, + "learning_rate": 4.821216417494982e-05, + "loss": 4.9639, + "step": 20365 + }, + { + "epoch": 0.1211223713007898, + "grad_norm": 1.7311389446258545, + "learning_rate": 4.821199070639006e-05, + "loss": 4.9296, + "step": 20366 + }, + { + "epoch": 0.12112831858407079, + "grad_norm": 1.480536699295044, + "learning_rate": 4.8211817229727246e-05, + "loss": 4.9338, + "step": 20367 + }, + { + "epoch": 0.12113426586735179, + "grad_norm": 1.4267778396606445, + "learning_rate": 4.821164374496143e-05, + "loss": 4.8954, + "step": 20368 + }, + { + "epoch": 0.1211402131506328, + "grad_norm": 1.3726919889450073, + "learning_rate": 4.821147025209266e-05, + "loss": 4.8362, + "step": 20369 + }, + { + "epoch": 0.12114616043391378, + "grad_norm": 1.5158253908157349, + "learning_rate": 4.821129675112101e-05, + "loss": 5.0629, + "step": 20370 + }, + { + "epoch": 0.12115210771719478, + "grad_norm": 1.2002782821655273, + "learning_rate": 4.8211123242046535e-05, + "loss": 4.7668, + "step": 20371 + }, + { + "epoch": 0.12115805500047579, + "grad_norm": 1.123113751411438, + "learning_rate": 4.821094972486929e-05, + "loss": 5.0103, + "step": 20372 + }, + { + "epoch": 0.12116400228375677, + "grad_norm": 1.360532283782959, + "learning_rate": 4.821077619958936e-05, + "loss": 5.0503, + "step": 20373 + }, + { + "epoch": 0.12116994956703778, + "grad_norm": 1.3912672996520996, + "learning_rate": 4.821060266620677e-05, + "loss": 4.9326, + "step": 20374 + }, + { + "epoch": 0.12117589685031878, + "grad_norm": 1.2644896507263184, + "learning_rate": 4.821042912472161e-05, + "loss": 4.9584, + "step": 20375 + }, + { + "epoch": 0.12118184413359977, + "grad_norm": 1.1967086791992188, + "learning_rate": 4.821025557513392e-05, + "loss": 4.8954, + "step": 20376 + }, + { + "epoch": 0.12118779141688077, + "grad_norm": 1.353725552558899, + "learning_rate": 4.821008201744378e-05, + "loss": 4.8438, + "step": 20377 + }, + { + "epoch": 0.12119373870016177, + "grad_norm": 1.239682912826538, + "learning_rate": 4.820990845165123e-05, + "loss": 4.9624, + "step": 20378 + }, + { + "epoch": 0.12119968598344276, + "grad_norm": 1.1952159404754639, + "learning_rate": 4.820973487775634e-05, + "loss": 4.9254, + "step": 20379 + }, + { + "epoch": 0.12120563326672376, + "grad_norm": 1.4531627893447876, + "learning_rate": 4.820956129575918e-05, + "loss": 4.9487, + "step": 20380 + }, + { + "epoch": 0.12121158055000476, + "grad_norm": 1.2653759717941284, + "learning_rate": 4.8209387705659805e-05, + "loss": 4.7916, + "step": 20381 + }, + { + "epoch": 0.12121752783328575, + "grad_norm": 1.3156383037567139, + "learning_rate": 4.820921410745826e-05, + "loss": 5.0585, + "step": 20382 + }, + { + "epoch": 0.12122347511656675, + "grad_norm": 1.536216139793396, + "learning_rate": 4.820904050115462e-05, + "loss": 4.849, + "step": 20383 + }, + { + "epoch": 0.12122942239984776, + "grad_norm": 1.6567318439483643, + "learning_rate": 4.820886688674895e-05, + "loss": 4.6508, + "step": 20384 + }, + { + "epoch": 0.12123536968312874, + "grad_norm": 1.4173903465270996, + "learning_rate": 4.82086932642413e-05, + "loss": 4.8919, + "step": 20385 + }, + { + "epoch": 0.12124131696640975, + "grad_norm": 1.4352593421936035, + "learning_rate": 4.820851963363174e-05, + "loss": 4.7546, + "step": 20386 + }, + { + "epoch": 0.12124726424969075, + "grad_norm": 1.538988471031189, + "learning_rate": 4.8208345994920326e-05, + "loss": 4.7707, + "step": 20387 + }, + { + "epoch": 0.12125321153297174, + "grad_norm": 1.3959681987762451, + "learning_rate": 4.820817234810711e-05, + "loss": 4.5633, + "step": 20388 + }, + { + "epoch": 0.12125915881625274, + "grad_norm": 1.3972582817077637, + "learning_rate": 4.820799869319217e-05, + "loss": 4.5165, + "step": 20389 + }, + { + "epoch": 0.12126510609953374, + "grad_norm": 1.770070195198059, + "learning_rate": 4.820782503017555e-05, + "loss": 4.9679, + "step": 20390 + }, + { + "epoch": 0.12127105338281473, + "grad_norm": 1.6822887659072876, + "learning_rate": 4.820765135905732e-05, + "loss": 4.9589, + "step": 20391 + }, + { + "epoch": 0.12127700066609573, + "grad_norm": 1.6352055072784424, + "learning_rate": 4.820747767983754e-05, + "loss": 5.0389, + "step": 20392 + }, + { + "epoch": 0.12128294794937672, + "grad_norm": 1.4803529977798462, + "learning_rate": 4.8207303992516274e-05, + "loss": 5.1071, + "step": 20393 + }, + { + "epoch": 0.12128889523265772, + "grad_norm": 1.5575767755508423, + "learning_rate": 4.820713029709357e-05, + "loss": 5.2072, + "step": 20394 + }, + { + "epoch": 0.12129484251593872, + "grad_norm": 1.3417006731033325, + "learning_rate": 4.82069565935695e-05, + "loss": 5.1991, + "step": 20395 + }, + { + "epoch": 0.12130078979921971, + "grad_norm": 1.3577461242675781, + "learning_rate": 4.820678288194412e-05, + "loss": 5.3225, + "step": 20396 + }, + { + "epoch": 0.12130673708250071, + "grad_norm": 1.3763153553009033, + "learning_rate": 4.8206609162217494e-05, + "loss": 5.0247, + "step": 20397 + }, + { + "epoch": 0.12131268436578171, + "grad_norm": 1.7175389528274536, + "learning_rate": 4.8206435434389675e-05, + "loss": 5.2964, + "step": 20398 + }, + { + "epoch": 0.1213186316490627, + "grad_norm": 1.4921340942382812, + "learning_rate": 4.820626169846073e-05, + "loss": 4.781, + "step": 20399 + }, + { + "epoch": 0.1213245789323437, + "grad_norm": 1.3149629831314087, + "learning_rate": 4.8206087954430725e-05, + "loss": 5.2148, + "step": 20400 + }, + { + "epoch": 0.1213305262156247, + "grad_norm": 1.5960938930511475, + "learning_rate": 4.8205914202299715e-05, + "loss": 5.4152, + "step": 20401 + }, + { + "epoch": 0.1213364734989057, + "grad_norm": 1.4101301431655884, + "learning_rate": 4.8205740442067757e-05, + "loss": 5.2033, + "step": 20402 + }, + { + "epoch": 0.1213424207821867, + "grad_norm": 1.2584593296051025, + "learning_rate": 4.820556667373492e-05, + "loss": 5.1183, + "step": 20403 + }, + { + "epoch": 0.1213483680654677, + "grad_norm": 1.365639090538025, + "learning_rate": 4.820539289730125e-05, + "loss": 4.9446, + "step": 20404 + }, + { + "epoch": 0.12135431534874869, + "grad_norm": 1.6515495777130127, + "learning_rate": 4.820521911276682e-05, + "loss": 4.9002, + "step": 20405 + }, + { + "epoch": 0.12136026263202969, + "grad_norm": 1.2481954097747803, + "learning_rate": 4.8205045320131684e-05, + "loss": 5.3332, + "step": 20406 + }, + { + "epoch": 0.12136620991531069, + "grad_norm": 1.4952552318572998, + "learning_rate": 4.820487151939591e-05, + "loss": 4.7492, + "step": 20407 + }, + { + "epoch": 0.12137215719859168, + "grad_norm": 1.4472140073776245, + "learning_rate": 4.8204697710559556e-05, + "loss": 5.158, + "step": 20408 + }, + { + "epoch": 0.12137810448187268, + "grad_norm": 1.2544384002685547, + "learning_rate": 4.8204523893622685e-05, + "loss": 5.2041, + "step": 20409 + }, + { + "epoch": 0.12138405176515368, + "grad_norm": 1.1283172369003296, + "learning_rate": 4.820435006858535e-05, + "loss": 5.246, + "step": 20410 + }, + { + "epoch": 0.12138999904843467, + "grad_norm": 1.1113736629486084, + "learning_rate": 4.8204176235447617e-05, + "loss": 5.2116, + "step": 20411 + }, + { + "epoch": 0.12139594633171567, + "grad_norm": 1.2103666067123413, + "learning_rate": 4.820400239420955e-05, + "loss": 5.4421, + "step": 20412 + }, + { + "epoch": 0.12140189361499668, + "grad_norm": 1.2054588794708252, + "learning_rate": 4.82038285448712e-05, + "loss": 5.2503, + "step": 20413 + }, + { + "epoch": 0.12140784089827766, + "grad_norm": 1.568247675895691, + "learning_rate": 4.820365468743263e-05, + "loss": 4.9009, + "step": 20414 + }, + { + "epoch": 0.12141378818155867, + "grad_norm": 1.7106029987335205, + "learning_rate": 4.820348082189391e-05, + "loss": 4.7826, + "step": 20415 + }, + { + "epoch": 0.12141973546483967, + "grad_norm": 1.4479119777679443, + "learning_rate": 4.8203306948255095e-05, + "loss": 5.0084, + "step": 20416 + }, + { + "epoch": 0.12142568274812066, + "grad_norm": 1.467880368232727, + "learning_rate": 4.820313306651624e-05, + "loss": 5.2477, + "step": 20417 + }, + { + "epoch": 0.12143163003140166, + "grad_norm": 1.4088891744613647, + "learning_rate": 4.820295917667742e-05, + "loss": 5.1151, + "step": 20418 + }, + { + "epoch": 0.12143757731468266, + "grad_norm": 1.2838404178619385, + "learning_rate": 4.820278527873868e-05, + "loss": 5.6774, + "step": 20419 + }, + { + "epoch": 0.12144352459796365, + "grad_norm": 1.2146633863449097, + "learning_rate": 4.820261137270009e-05, + "loss": 5.0487, + "step": 20420 + }, + { + "epoch": 0.12144947188124465, + "grad_norm": 1.5603777170181274, + "learning_rate": 4.820243745856171e-05, + "loss": 5.0569, + "step": 20421 + }, + { + "epoch": 0.12145541916452564, + "grad_norm": 1.4454957246780396, + "learning_rate": 4.8202263536323586e-05, + "loss": 4.9556, + "step": 20422 + }, + { + "epoch": 0.12146136644780664, + "grad_norm": 1.4732788801193237, + "learning_rate": 4.820208960598581e-05, + "loss": 5.0095, + "step": 20423 + }, + { + "epoch": 0.12146731373108764, + "grad_norm": 1.4078243970870972, + "learning_rate": 4.820191566754841e-05, + "loss": 5.2642, + "step": 20424 + }, + { + "epoch": 0.12147326101436863, + "grad_norm": 1.2497223615646362, + "learning_rate": 4.820174172101147e-05, + "loss": 5.0792, + "step": 20425 + }, + { + "epoch": 0.12147920829764963, + "grad_norm": 1.5479954481124878, + "learning_rate": 4.8201567766375034e-05, + "loss": 4.9157, + "step": 20426 + }, + { + "epoch": 0.12148515558093063, + "grad_norm": 1.4266546964645386, + "learning_rate": 4.8201393803639175e-05, + "loss": 4.8776, + "step": 20427 + }, + { + "epoch": 0.12149110286421162, + "grad_norm": 1.3757798671722412, + "learning_rate": 4.8201219832803946e-05, + "loss": 4.8253, + "step": 20428 + }, + { + "epoch": 0.12149705014749262, + "grad_norm": 1.3386640548706055, + "learning_rate": 4.8201045853869416e-05, + "loss": 4.7895, + "step": 20429 + }, + { + "epoch": 0.12150299743077363, + "grad_norm": 1.3968008756637573, + "learning_rate": 4.820087186683564e-05, + "loss": 4.7238, + "step": 20430 + }, + { + "epoch": 0.12150894471405461, + "grad_norm": 1.4070801734924316, + "learning_rate": 4.820069787170267e-05, + "loss": 4.9614, + "step": 20431 + }, + { + "epoch": 0.12151489199733562, + "grad_norm": 1.4280625581741333, + "learning_rate": 4.820052386847059e-05, + "loss": 4.6765, + "step": 20432 + }, + { + "epoch": 0.12152083928061662, + "grad_norm": 1.3841910362243652, + "learning_rate": 4.820034985713944e-05, + "loss": 4.8008, + "step": 20433 + }, + { + "epoch": 0.1215267865638976, + "grad_norm": 1.3555341958999634, + "learning_rate": 4.820017583770928e-05, + "loss": 4.7967, + "step": 20434 + }, + { + "epoch": 0.12153273384717861, + "grad_norm": 1.3651773929595947, + "learning_rate": 4.820000181018019e-05, + "loss": 4.9003, + "step": 20435 + }, + { + "epoch": 0.12153868113045961, + "grad_norm": 1.364749789237976, + "learning_rate": 4.8199827774552215e-05, + "loss": 4.9834, + "step": 20436 + }, + { + "epoch": 0.1215446284137406, + "grad_norm": 1.384041428565979, + "learning_rate": 4.8199653730825424e-05, + "loss": 4.9997, + "step": 20437 + }, + { + "epoch": 0.1215505756970216, + "grad_norm": 1.544909954071045, + "learning_rate": 4.8199479678999867e-05, + "loss": 4.7909, + "step": 20438 + }, + { + "epoch": 0.1215565229803026, + "grad_norm": 1.4012216329574585, + "learning_rate": 4.819930561907561e-05, + "loss": 4.7359, + "step": 20439 + }, + { + "epoch": 0.12156247026358359, + "grad_norm": 1.2876297235488892, + "learning_rate": 4.819913155105272e-05, + "loss": 4.5025, + "step": 20440 + }, + { + "epoch": 0.1215684175468646, + "grad_norm": 1.5008763074874878, + "learning_rate": 4.819895747493125e-05, + "loss": 4.4486, + "step": 20441 + }, + { + "epoch": 0.1215743648301456, + "grad_norm": 1.3206987380981445, + "learning_rate": 4.8198783390711264e-05, + "loss": 4.4767, + "step": 20442 + }, + { + "epoch": 0.12158031211342658, + "grad_norm": 1.3569231033325195, + "learning_rate": 4.819860929839283e-05, + "loss": 4.6665, + "step": 20443 + }, + { + "epoch": 0.12158625939670759, + "grad_norm": 1.3377808332443237, + "learning_rate": 4.8198435197975986e-05, + "loss": 4.6109, + "step": 20444 + }, + { + "epoch": 0.12159220667998859, + "grad_norm": 1.5400346517562866, + "learning_rate": 4.8198261089460824e-05, + "loss": 4.2635, + "step": 20445 + }, + { + "epoch": 0.12159815396326958, + "grad_norm": 1.6329059600830078, + "learning_rate": 4.8198086972847376e-05, + "loss": 4.3745, + "step": 20446 + }, + { + "epoch": 0.12160410124655058, + "grad_norm": 1.4274183511734009, + "learning_rate": 4.819791284813573e-05, + "loss": 4.4103, + "step": 20447 + }, + { + "epoch": 0.12161004852983158, + "grad_norm": 1.4671530723571777, + "learning_rate": 4.8197738715325916e-05, + "loss": 4.3995, + "step": 20448 + }, + { + "epoch": 0.12161599581311257, + "grad_norm": 1.3783891201019287, + "learning_rate": 4.819756457441802e-05, + "loss": 4.3874, + "step": 20449 + }, + { + "epoch": 0.12162194309639357, + "grad_norm": 1.4054951667785645, + "learning_rate": 4.819739042541209e-05, + "loss": 4.3307, + "step": 20450 + }, + { + "epoch": 0.12162789037967456, + "grad_norm": 1.5449576377868652, + "learning_rate": 4.81972162683082e-05, + "loss": 4.8499, + "step": 20451 + }, + { + "epoch": 0.12163383766295556, + "grad_norm": 1.3887544870376587, + "learning_rate": 4.8197042103106394e-05, + "loss": 4.622, + "step": 20452 + }, + { + "epoch": 0.12163978494623656, + "grad_norm": 1.319422960281372, + "learning_rate": 4.819686792980673e-05, + "loss": 4.5172, + "step": 20453 + }, + { + "epoch": 0.12164573222951755, + "grad_norm": 1.3681663274765015, + "learning_rate": 4.8196693748409296e-05, + "loss": 4.8121, + "step": 20454 + }, + { + "epoch": 0.12165167951279855, + "grad_norm": 1.250482439994812, + "learning_rate": 4.819651955891413e-05, + "loss": 4.8792, + "step": 20455 + }, + { + "epoch": 0.12165762679607955, + "grad_norm": 1.3297876119613647, + "learning_rate": 4.819634536132129e-05, + "loss": 5.1069, + "step": 20456 + }, + { + "epoch": 0.12166357407936054, + "grad_norm": 1.3733534812927246, + "learning_rate": 4.819617115563086e-05, + "loss": 4.6061, + "step": 20457 + }, + { + "epoch": 0.12166952136264154, + "grad_norm": 1.287663459777832, + "learning_rate": 4.819599694184288e-05, + "loss": 4.9407, + "step": 20458 + }, + { + "epoch": 0.12167546864592255, + "grad_norm": 1.4198147058486938, + "learning_rate": 4.8195822719957416e-05, + "loss": 4.5361, + "step": 20459 + }, + { + "epoch": 0.12168141592920353, + "grad_norm": 1.7429990768432617, + "learning_rate": 4.819564848997453e-05, + "loss": 4.6604, + "step": 20460 + }, + { + "epoch": 0.12168736321248454, + "grad_norm": 1.4298913478851318, + "learning_rate": 4.819547425189429e-05, + "loss": 4.7415, + "step": 20461 + }, + { + "epoch": 0.12169331049576554, + "grad_norm": 1.3519923686981201, + "learning_rate": 4.8195300005716736e-05, + "loss": 5.3706, + "step": 20462 + }, + { + "epoch": 0.12169925777904653, + "grad_norm": 1.1476925611495972, + "learning_rate": 4.819512575144195e-05, + "loss": 5.4474, + "step": 20463 + }, + { + "epoch": 0.12170520506232753, + "grad_norm": 1.2756370306015015, + "learning_rate": 4.819495148906999e-05, + "loss": 4.9747, + "step": 20464 + }, + { + "epoch": 0.12171115234560853, + "grad_norm": 1.3161675930023193, + "learning_rate": 4.8194777218600906e-05, + "loss": 4.7093, + "step": 20465 + }, + { + "epoch": 0.12171709962888952, + "grad_norm": 1.4928854703903198, + "learning_rate": 4.8194602940034766e-05, + "loss": 4.7517, + "step": 20466 + }, + { + "epoch": 0.12172304691217052, + "grad_norm": 1.426684856414795, + "learning_rate": 4.819442865337163e-05, + "loss": 4.8639, + "step": 20467 + }, + { + "epoch": 0.12172899419545152, + "grad_norm": 1.368988037109375, + "learning_rate": 4.819425435861156e-05, + "loss": 4.8532, + "step": 20468 + }, + { + "epoch": 0.12173494147873251, + "grad_norm": 1.492031455039978, + "learning_rate": 4.819408005575461e-05, + "loss": 4.5139, + "step": 20469 + }, + { + "epoch": 0.12174088876201351, + "grad_norm": 1.6340793371200562, + "learning_rate": 4.819390574480085e-05, + "loss": 4.4042, + "step": 20470 + }, + { + "epoch": 0.12174683604529452, + "grad_norm": 1.5353302955627441, + "learning_rate": 4.819373142575034e-05, + "loss": 5.1097, + "step": 20471 + }, + { + "epoch": 0.1217527833285755, + "grad_norm": 1.5314761400222778, + "learning_rate": 4.8193557098603134e-05, + "loss": 4.7689, + "step": 20472 + }, + { + "epoch": 0.1217587306118565, + "grad_norm": 1.4626027345657349, + "learning_rate": 4.8193382763359295e-05, + "loss": 4.434, + "step": 20473 + }, + { + "epoch": 0.12176467789513751, + "grad_norm": 1.621871829032898, + "learning_rate": 4.8193208420018885e-05, + "loss": 4.5098, + "step": 20474 + }, + { + "epoch": 0.1217706251784185, + "grad_norm": 1.5429425239562988, + "learning_rate": 4.819303406858198e-05, + "loss": 4.4547, + "step": 20475 + }, + { + "epoch": 0.1217765724616995, + "grad_norm": 1.5002613067626953, + "learning_rate": 4.819285970904861e-05, + "loss": 4.6906, + "step": 20476 + }, + { + "epoch": 0.1217825197449805, + "grad_norm": 1.2322206497192383, + "learning_rate": 4.819268534141886e-05, + "loss": 5.049, + "step": 20477 + }, + { + "epoch": 0.12178846702826149, + "grad_norm": 1.2598546743392944, + "learning_rate": 4.819251096569278e-05, + "loss": 5.2906, + "step": 20478 + }, + { + "epoch": 0.12179441431154249, + "grad_norm": 1.2702369689941406, + "learning_rate": 4.8192336581870436e-05, + "loss": 5.1828, + "step": 20479 + }, + { + "epoch": 0.12180036159482348, + "grad_norm": 1.3816938400268555, + "learning_rate": 4.819216218995189e-05, + "loss": 5.1083, + "step": 20480 + }, + { + "epoch": 0.12180630887810448, + "grad_norm": 1.2958251237869263, + "learning_rate": 4.819198778993719e-05, + "loss": 5.1715, + "step": 20481 + }, + { + "epoch": 0.12181225616138548, + "grad_norm": 1.2317209243774414, + "learning_rate": 4.819181338182641e-05, + "loss": 5.1969, + "step": 20482 + }, + { + "epoch": 0.12181820344466647, + "grad_norm": 1.362483263015747, + "learning_rate": 4.819163896561961e-05, + "loss": 5.0893, + "step": 20483 + }, + { + "epoch": 0.12182415072794747, + "grad_norm": 1.1019991636276245, + "learning_rate": 4.819146454131685e-05, + "loss": 5.411, + "step": 20484 + }, + { + "epoch": 0.12183009801122847, + "grad_norm": 1.3575057983398438, + "learning_rate": 4.8191290108918184e-05, + "loss": 5.1797, + "step": 20485 + }, + { + "epoch": 0.12183604529450946, + "grad_norm": 1.4110307693481445, + "learning_rate": 4.8191115668423685e-05, + "loss": 5.3108, + "step": 20486 + }, + { + "epoch": 0.12184199257779046, + "grad_norm": 1.3322244882583618, + "learning_rate": 4.819094121983341e-05, + "loss": 5.238, + "step": 20487 + }, + { + "epoch": 0.12184793986107147, + "grad_norm": 1.3466796875, + "learning_rate": 4.819076676314741e-05, + "loss": 5.2786, + "step": 20488 + }, + { + "epoch": 0.12185388714435245, + "grad_norm": 1.4118572473526, + "learning_rate": 4.819059229836575e-05, + "loss": 5.0254, + "step": 20489 + }, + { + "epoch": 0.12185983442763346, + "grad_norm": 1.6264641284942627, + "learning_rate": 4.81904178254885e-05, + "loss": 4.8822, + "step": 20490 + }, + { + "epoch": 0.12186578171091446, + "grad_norm": 1.325591802597046, + "learning_rate": 4.8190243344515705e-05, + "loss": 5.5997, + "step": 20491 + }, + { + "epoch": 0.12187172899419545, + "grad_norm": 1.5424168109893799, + "learning_rate": 4.8190068855447444e-05, + "loss": 5.2096, + "step": 20492 + }, + { + "epoch": 0.12187767627747645, + "grad_norm": 1.3096263408660889, + "learning_rate": 4.818989435828377e-05, + "loss": 5.1026, + "step": 20493 + }, + { + "epoch": 0.12188362356075745, + "grad_norm": 1.3479657173156738, + "learning_rate": 4.8189719853024746e-05, + "loss": 5.0403, + "step": 20494 + }, + { + "epoch": 0.12188957084403844, + "grad_norm": 1.1970547437667847, + "learning_rate": 4.818954533967043e-05, + "loss": 5.06, + "step": 20495 + }, + { + "epoch": 0.12189551812731944, + "grad_norm": 1.3364722728729248, + "learning_rate": 4.818937081822088e-05, + "loss": 5.0216, + "step": 20496 + }, + { + "epoch": 0.12190146541060044, + "grad_norm": 1.2553714513778687, + "learning_rate": 4.818919628867615e-05, + "loss": 4.9662, + "step": 20497 + }, + { + "epoch": 0.12190741269388143, + "grad_norm": 1.270330786705017, + "learning_rate": 4.818902175103633e-05, + "loss": 4.8526, + "step": 20498 + }, + { + "epoch": 0.12191335997716243, + "grad_norm": 1.4872468709945679, + "learning_rate": 4.818884720530145e-05, + "loss": 4.9435, + "step": 20499 + }, + { + "epoch": 0.12191930726044344, + "grad_norm": 1.3152670860290527, + "learning_rate": 4.818867265147159e-05, + "loss": 5.1301, + "step": 20500 + }, + { + "epoch": 0.12192525454372442, + "grad_norm": 1.210864543914795, + "learning_rate": 4.8188498089546794e-05, + "loss": 5.1465, + "step": 20501 + }, + { + "epoch": 0.12193120182700543, + "grad_norm": 1.276159644126892, + "learning_rate": 4.818832351952715e-05, + "loss": 5.0847, + "step": 20502 + }, + { + "epoch": 0.12193714911028643, + "grad_norm": 1.449988842010498, + "learning_rate": 4.8188148941412684e-05, + "loss": 5.1143, + "step": 20503 + }, + { + "epoch": 0.12194309639356742, + "grad_norm": 1.241921305656433, + "learning_rate": 4.818797435520348e-05, + "loss": 5.067, + "step": 20504 + }, + { + "epoch": 0.12194904367684842, + "grad_norm": 1.3087794780731201, + "learning_rate": 4.81877997608996e-05, + "loss": 5.121, + "step": 20505 + }, + { + "epoch": 0.12195499096012942, + "grad_norm": 1.2226066589355469, + "learning_rate": 4.8187625158501095e-05, + "loss": 5.1879, + "step": 20506 + }, + { + "epoch": 0.12196093824341041, + "grad_norm": 1.2744648456573486, + "learning_rate": 4.8187450548008025e-05, + "loss": 5.1308, + "step": 20507 + }, + { + "epoch": 0.12196688552669141, + "grad_norm": 1.3409245014190674, + "learning_rate": 4.8187275929420464e-05, + "loss": 5.0914, + "step": 20508 + }, + { + "epoch": 0.1219728328099724, + "grad_norm": 1.2840641736984253, + "learning_rate": 4.818710130273846e-05, + "loss": 5.0818, + "step": 20509 + }, + { + "epoch": 0.1219787800932534, + "grad_norm": 1.4204998016357422, + "learning_rate": 4.818692666796207e-05, + "loss": 5.4553, + "step": 20510 + }, + { + "epoch": 0.1219847273765344, + "grad_norm": 1.3061211109161377, + "learning_rate": 4.818675202509137e-05, + "loss": 5.1777, + "step": 20511 + }, + { + "epoch": 0.12199067465981539, + "grad_norm": 1.3137598037719727, + "learning_rate": 4.818657737412642e-05, + "loss": 5.1156, + "step": 20512 + }, + { + "epoch": 0.12199662194309639, + "grad_norm": 1.1616209745407104, + "learning_rate": 4.818640271506727e-05, + "loss": 5.3169, + "step": 20513 + }, + { + "epoch": 0.1220025692263774, + "grad_norm": 1.270844578742981, + "learning_rate": 4.8186228047914e-05, + "loss": 5.3005, + "step": 20514 + }, + { + "epoch": 0.12200851650965838, + "grad_norm": 1.4955285787582397, + "learning_rate": 4.818605337266664e-05, + "loss": 5.1762, + "step": 20515 + }, + { + "epoch": 0.12201446379293938, + "grad_norm": 1.3431698083877563, + "learning_rate": 4.818587868932527e-05, + "loss": 4.9477, + "step": 20516 + }, + { + "epoch": 0.12202041107622039, + "grad_norm": 1.3437286615371704, + "learning_rate": 4.818570399788995e-05, + "loss": 4.7787, + "step": 20517 + }, + { + "epoch": 0.12202635835950137, + "grad_norm": 1.3840901851654053, + "learning_rate": 4.818552929836074e-05, + "loss": 5.0749, + "step": 20518 + }, + { + "epoch": 0.12203230564278238, + "grad_norm": 1.3907465934753418, + "learning_rate": 4.8185354590737707e-05, + "loss": 4.9084, + "step": 20519 + }, + { + "epoch": 0.12203825292606338, + "grad_norm": 1.360065221786499, + "learning_rate": 4.818517987502091e-05, + "loss": 4.9323, + "step": 20520 + }, + { + "epoch": 0.12204420020934437, + "grad_norm": 1.1924186944961548, + "learning_rate": 4.818500515121039e-05, + "loss": 4.8237, + "step": 20521 + }, + { + "epoch": 0.12205014749262537, + "grad_norm": 1.6362069845199585, + "learning_rate": 4.818483041930624e-05, + "loss": 4.6073, + "step": 20522 + }, + { + "epoch": 0.12205609477590637, + "grad_norm": 1.4413504600524902, + "learning_rate": 4.81846556793085e-05, + "loss": 4.7733, + "step": 20523 + }, + { + "epoch": 0.12206204205918736, + "grad_norm": 1.5076016187667847, + "learning_rate": 4.818448093121723e-05, + "loss": 5.4376, + "step": 20524 + }, + { + "epoch": 0.12206798934246836, + "grad_norm": 1.5311039686203003, + "learning_rate": 4.818430617503251e-05, + "loss": 5.1398, + "step": 20525 + }, + { + "epoch": 0.12207393662574936, + "grad_norm": 1.4373403787612915, + "learning_rate": 4.818413141075438e-05, + "loss": 4.897, + "step": 20526 + }, + { + "epoch": 0.12207988390903035, + "grad_norm": 1.4221818447113037, + "learning_rate": 4.818395663838291e-05, + "loss": 5.223, + "step": 20527 + }, + { + "epoch": 0.12208583119231135, + "grad_norm": 1.2606967687606812, + "learning_rate": 4.818378185791817e-05, + "loss": 4.7242, + "step": 20528 + }, + { + "epoch": 0.12209177847559236, + "grad_norm": 1.2508289813995361, + "learning_rate": 4.818360706936019e-05, + "loss": 4.623, + "step": 20529 + }, + { + "epoch": 0.12209772575887334, + "grad_norm": 1.3701050281524658, + "learning_rate": 4.8183432272709065e-05, + "loss": 4.6716, + "step": 20530 + }, + { + "epoch": 0.12210367304215435, + "grad_norm": 1.5785399675369263, + "learning_rate": 4.818325746796485e-05, + "loss": 4.5495, + "step": 20531 + }, + { + "epoch": 0.12210962032543535, + "grad_norm": 1.4542807340621948, + "learning_rate": 4.8183082655127584e-05, + "loss": 4.6848, + "step": 20532 + }, + { + "epoch": 0.12211556760871634, + "grad_norm": 1.2740551233291626, + "learning_rate": 4.818290783419736e-05, + "loss": 4.7792, + "step": 20533 + }, + { + "epoch": 0.12212151489199734, + "grad_norm": 1.2965741157531738, + "learning_rate": 4.8182733005174205e-05, + "loss": 4.7552, + "step": 20534 + }, + { + "epoch": 0.12212746217527834, + "grad_norm": 1.3440501689910889, + "learning_rate": 4.8182558168058215e-05, + "loss": 5.0506, + "step": 20535 + }, + { + "epoch": 0.12213340945855933, + "grad_norm": 1.3767000436782837, + "learning_rate": 4.8182383322849415e-05, + "loss": 5.0523, + "step": 20536 + }, + { + "epoch": 0.12213935674184033, + "grad_norm": 1.4770883321762085, + "learning_rate": 4.81822084695479e-05, + "loss": 5.117, + "step": 20537 + }, + { + "epoch": 0.12214530402512132, + "grad_norm": 1.4463403224945068, + "learning_rate": 4.818203360815371e-05, + "loss": 5.0566, + "step": 20538 + }, + { + "epoch": 0.12215125130840232, + "grad_norm": 1.5590862035751343, + "learning_rate": 4.8181858738666905e-05, + "loss": 5.1184, + "step": 20539 + }, + { + "epoch": 0.12215719859168332, + "grad_norm": 1.2578922510147095, + "learning_rate": 4.818168386108756e-05, + "loss": 5.0364, + "step": 20540 + }, + { + "epoch": 0.12216314587496431, + "grad_norm": 1.363750696182251, + "learning_rate": 4.8181508975415727e-05, + "loss": 5.1133, + "step": 20541 + }, + { + "epoch": 0.12216909315824531, + "grad_norm": 1.5973013639450073, + "learning_rate": 4.8181334081651474e-05, + "loss": 4.9659, + "step": 20542 + }, + { + "epoch": 0.12217504044152631, + "grad_norm": 1.4429646730422974, + "learning_rate": 4.818115917979485e-05, + "loss": 5.1669, + "step": 20543 + }, + { + "epoch": 0.1221809877248073, + "grad_norm": 1.4704759120941162, + "learning_rate": 4.818098426984592e-05, + "loss": 5.1613, + "step": 20544 + }, + { + "epoch": 0.1221869350080883, + "grad_norm": 1.3613824844360352, + "learning_rate": 4.8180809351804756e-05, + "loss": 5.2524, + "step": 20545 + }, + { + "epoch": 0.1221928822913693, + "grad_norm": 1.199265480041504, + "learning_rate": 4.8180634425671404e-05, + "loss": 5.1596, + "step": 20546 + }, + { + "epoch": 0.1221988295746503, + "grad_norm": 1.3537240028381348, + "learning_rate": 4.818045949144594e-05, + "loss": 5.1456, + "step": 20547 + }, + { + "epoch": 0.1222047768579313, + "grad_norm": 1.4804584980010986, + "learning_rate": 4.818028454912841e-05, + "loss": 5.0443, + "step": 20548 + }, + { + "epoch": 0.1222107241412123, + "grad_norm": 1.3245832920074463, + "learning_rate": 4.8180109598718884e-05, + "loss": 4.9495, + "step": 20549 + }, + { + "epoch": 0.12221667142449329, + "grad_norm": 1.5168079137802124, + "learning_rate": 4.817993464021742e-05, + "loss": 4.8094, + "step": 20550 + }, + { + "epoch": 0.12222261870777429, + "grad_norm": 1.4146143198013306, + "learning_rate": 4.817975967362408e-05, + "loss": 5.0319, + "step": 20551 + }, + { + "epoch": 0.12222856599105529, + "grad_norm": 1.30800199508667, + "learning_rate": 4.817958469893893e-05, + "loss": 4.6641, + "step": 20552 + }, + { + "epoch": 0.12223451327433628, + "grad_norm": 1.1652897596359253, + "learning_rate": 4.8179409716162026e-05, + "loss": 4.8978, + "step": 20553 + }, + { + "epoch": 0.12224046055761728, + "grad_norm": 1.4594627618789673, + "learning_rate": 4.817923472529343e-05, + "loss": 5.0124, + "step": 20554 + }, + { + "epoch": 0.12224640784089828, + "grad_norm": 1.2955336570739746, + "learning_rate": 4.81790597263332e-05, + "loss": 5.0336, + "step": 20555 + }, + { + "epoch": 0.12225235512417927, + "grad_norm": 1.3508485555648804, + "learning_rate": 4.8178884719281395e-05, + "loss": 4.8695, + "step": 20556 + }, + { + "epoch": 0.12225830240746027, + "grad_norm": 1.363410472869873, + "learning_rate": 4.8178709704138094e-05, + "loss": 4.9162, + "step": 20557 + }, + { + "epoch": 0.12226424969074128, + "grad_norm": 1.4330451488494873, + "learning_rate": 4.817853468090333e-05, + "loss": 4.8993, + "step": 20558 + }, + { + "epoch": 0.12227019697402226, + "grad_norm": 1.3630226850509644, + "learning_rate": 4.817835964957719e-05, + "loss": 4.9196, + "step": 20559 + }, + { + "epoch": 0.12227614425730327, + "grad_norm": 1.4265079498291016, + "learning_rate": 4.817818461015972e-05, + "loss": 4.8966, + "step": 20560 + }, + { + "epoch": 0.12228209154058427, + "grad_norm": 1.4709514379501343, + "learning_rate": 4.817800956265098e-05, + "loss": 4.7685, + "step": 20561 + }, + { + "epoch": 0.12228803882386526, + "grad_norm": 1.1047412157058716, + "learning_rate": 4.8177834507051044e-05, + "loss": 4.8495, + "step": 20562 + }, + { + "epoch": 0.12229398610714626, + "grad_norm": 1.302027940750122, + "learning_rate": 4.817765944335996e-05, + "loss": 4.9414, + "step": 20563 + }, + { + "epoch": 0.12229993339042726, + "grad_norm": 1.2321425676345825, + "learning_rate": 4.8177484371577796e-05, + "loss": 4.8089, + "step": 20564 + }, + { + "epoch": 0.12230588067370825, + "grad_norm": 1.5107663869857788, + "learning_rate": 4.8177309291704616e-05, + "loss": 4.8964, + "step": 20565 + }, + { + "epoch": 0.12231182795698925, + "grad_norm": 1.4476573467254639, + "learning_rate": 4.817713420374047e-05, + "loss": 5.1385, + "step": 20566 + }, + { + "epoch": 0.12231777524027024, + "grad_norm": 1.7367160320281982, + "learning_rate": 4.817695910768544e-05, + "loss": 4.7051, + "step": 20567 + }, + { + "epoch": 0.12232372252355124, + "grad_norm": 1.7436206340789795, + "learning_rate": 4.817678400353955e-05, + "loss": 5.0161, + "step": 20568 + }, + { + "epoch": 0.12232966980683224, + "grad_norm": 1.667702317237854, + "learning_rate": 4.8176608891302905e-05, + "loss": 4.7507, + "step": 20569 + }, + { + "epoch": 0.12233561709011323, + "grad_norm": 1.3754125833511353, + "learning_rate": 4.817643377097554e-05, + "loss": 4.9623, + "step": 20570 + }, + { + "epoch": 0.12234156437339423, + "grad_norm": 1.539730191230774, + "learning_rate": 4.817625864255751e-05, + "loss": 4.9798, + "step": 20571 + }, + { + "epoch": 0.12234751165667523, + "grad_norm": 1.2995619773864746, + "learning_rate": 4.81760835060489e-05, + "loss": 4.9225, + "step": 20572 + }, + { + "epoch": 0.12235345893995622, + "grad_norm": 1.4950238466262817, + "learning_rate": 4.817590836144975e-05, + "loss": 5.0578, + "step": 20573 + }, + { + "epoch": 0.12235940622323722, + "grad_norm": 1.5506999492645264, + "learning_rate": 4.8175733208760144e-05, + "loss": 4.7418, + "step": 20574 + }, + { + "epoch": 0.12236535350651823, + "grad_norm": 2.153271198272705, + "learning_rate": 4.817555804798012e-05, + "loss": 4.8025, + "step": 20575 + }, + { + "epoch": 0.12237130078979921, + "grad_norm": 1.4991137981414795, + "learning_rate": 4.817538287910974e-05, + "loss": 4.9943, + "step": 20576 + }, + { + "epoch": 0.12237724807308022, + "grad_norm": 1.3596469163894653, + "learning_rate": 4.8175207702149085e-05, + "loss": 5.4109, + "step": 20577 + }, + { + "epoch": 0.12238319535636122, + "grad_norm": 1.182950735092163, + "learning_rate": 4.81750325170982e-05, + "loss": 5.4844, + "step": 20578 + }, + { + "epoch": 0.1223891426396422, + "grad_norm": 1.2713780403137207, + "learning_rate": 4.817485732395715e-05, + "loss": 5.3333, + "step": 20579 + }, + { + "epoch": 0.12239508992292321, + "grad_norm": 1.396163821220398, + "learning_rate": 4.8174682122726e-05, + "loss": 5.1666, + "step": 20580 + }, + { + "epoch": 0.12240103720620421, + "grad_norm": 1.3530118465423584, + "learning_rate": 4.81745069134048e-05, + "loss": 5.055, + "step": 20581 + }, + { + "epoch": 0.1224069844894852, + "grad_norm": 1.1625109910964966, + "learning_rate": 4.8174331695993626e-05, + "loss": 5.2553, + "step": 20582 + }, + { + "epoch": 0.1224129317727662, + "grad_norm": 1.4428709745407104, + "learning_rate": 4.817415647049253e-05, + "loss": 5.1255, + "step": 20583 + }, + { + "epoch": 0.1224188790560472, + "grad_norm": 1.674591064453125, + "learning_rate": 4.8173981236901574e-05, + "loss": 4.7623, + "step": 20584 + }, + { + "epoch": 0.12242482633932819, + "grad_norm": 1.4691076278686523, + "learning_rate": 4.817380599522083e-05, + "loss": 5.1077, + "step": 20585 + }, + { + "epoch": 0.12243077362260919, + "grad_norm": 1.0224462747573853, + "learning_rate": 4.817363074545034e-05, + "loss": 5.1022, + "step": 20586 + }, + { + "epoch": 0.1224367209058902, + "grad_norm": 1.3090193271636963, + "learning_rate": 4.817345548759018e-05, + "loss": 5.121, + "step": 20587 + }, + { + "epoch": 0.12244266818917118, + "grad_norm": 1.028120756149292, + "learning_rate": 4.81732802216404e-05, + "loss": 5.2709, + "step": 20588 + }, + { + "epoch": 0.12244861547245219, + "grad_norm": 1.3667192459106445, + "learning_rate": 4.817310494760107e-05, + "loss": 5.075, + "step": 20589 + }, + { + "epoch": 0.12245456275573319, + "grad_norm": 1.3145662546157837, + "learning_rate": 4.8172929665472255e-05, + "loss": 5.1258, + "step": 20590 + }, + { + "epoch": 0.12246051003901418, + "grad_norm": 1.2744371891021729, + "learning_rate": 4.8172754375254e-05, + "loss": 5.0155, + "step": 20591 + }, + { + "epoch": 0.12246645732229518, + "grad_norm": 1.4647456407546997, + "learning_rate": 4.817257907694638e-05, + "loss": 5.0325, + "step": 20592 + }, + { + "epoch": 0.12247240460557618, + "grad_norm": 1.1393789052963257, + "learning_rate": 4.817240377054945e-05, + "loss": 5.1304, + "step": 20593 + }, + { + "epoch": 0.12247835188885717, + "grad_norm": 1.3927806615829468, + "learning_rate": 4.817222845606328e-05, + "loss": 5.0588, + "step": 20594 + }, + { + "epoch": 0.12248429917213817, + "grad_norm": 1.3344571590423584, + "learning_rate": 4.817205313348792e-05, + "loss": 5.0428, + "step": 20595 + }, + { + "epoch": 0.12249024645541916, + "grad_norm": 0.9816542267799377, + "learning_rate": 4.817187780282343e-05, + "loss": 5.0046, + "step": 20596 + }, + { + "epoch": 0.12249619373870016, + "grad_norm": 1.1602904796600342, + "learning_rate": 4.817170246406989e-05, + "loss": 5.0372, + "step": 20597 + }, + { + "epoch": 0.12250214102198116, + "grad_norm": 1.2147279977798462, + "learning_rate": 4.817152711722733e-05, + "loss": 4.999, + "step": 20598 + }, + { + "epoch": 0.12250808830526215, + "grad_norm": 1.3654884099960327, + "learning_rate": 4.817135176229585e-05, + "loss": 5.0635, + "step": 20599 + }, + { + "epoch": 0.12251403558854315, + "grad_norm": 1.3051310777664185, + "learning_rate": 4.817117639927547e-05, + "loss": 5.0137, + "step": 20600 + }, + { + "epoch": 0.12251998287182415, + "grad_norm": 1.2217040061950684, + "learning_rate": 4.8171001028166284e-05, + "loss": 4.7167, + "step": 20601 + }, + { + "epoch": 0.12252593015510514, + "grad_norm": 1.3541781902313232, + "learning_rate": 4.8170825648968345e-05, + "loss": 4.9244, + "step": 20602 + }, + { + "epoch": 0.12253187743838614, + "grad_norm": 1.2899030447006226, + "learning_rate": 4.81706502616817e-05, + "loss": 5.0452, + "step": 20603 + }, + { + "epoch": 0.12253782472166715, + "grad_norm": 1.4059736728668213, + "learning_rate": 4.817047486630643e-05, + "loss": 4.9318, + "step": 20604 + }, + { + "epoch": 0.12254377200494813, + "grad_norm": 1.6990517377853394, + "learning_rate": 4.817029946284257e-05, + "loss": 4.5067, + "step": 20605 + }, + { + "epoch": 0.12254971928822914, + "grad_norm": 1.4028486013412476, + "learning_rate": 4.817012405129021e-05, + "loss": 5.0994, + "step": 20606 + }, + { + "epoch": 0.12255566657151014, + "grad_norm": 1.5692994594573975, + "learning_rate": 4.8169948631649395e-05, + "loss": 4.742, + "step": 20607 + }, + { + "epoch": 0.12256161385479113, + "grad_norm": 1.4501662254333496, + "learning_rate": 4.81697732039202e-05, + "loss": 4.9951, + "step": 20608 + }, + { + "epoch": 0.12256756113807213, + "grad_norm": 1.2898585796356201, + "learning_rate": 4.816959776810267e-05, + "loss": 5.2756, + "step": 20609 + }, + { + "epoch": 0.12257350842135313, + "grad_norm": 1.2808797359466553, + "learning_rate": 4.8169422324196867e-05, + "loss": 5.043, + "step": 20610 + }, + { + "epoch": 0.12257945570463412, + "grad_norm": 1.6888319253921509, + "learning_rate": 4.816924687220287e-05, + "loss": 4.6803, + "step": 20611 + }, + { + "epoch": 0.12258540298791512, + "grad_norm": 1.6619288921356201, + "learning_rate": 4.8169071412120716e-05, + "loss": 4.7334, + "step": 20612 + }, + { + "epoch": 0.12259135027119612, + "grad_norm": 1.4474331140518188, + "learning_rate": 4.816889594395049e-05, + "loss": 4.8519, + "step": 20613 + }, + { + "epoch": 0.12259729755447711, + "grad_norm": 1.519037127494812, + "learning_rate": 4.816872046769223e-05, + "loss": 4.7864, + "step": 20614 + }, + { + "epoch": 0.12260324483775811, + "grad_norm": 1.4860186576843262, + "learning_rate": 4.816854498334602e-05, + "loss": 4.7542, + "step": 20615 + }, + { + "epoch": 0.12260919212103912, + "grad_norm": 1.3120838403701782, + "learning_rate": 4.81683694909119e-05, + "loss": 4.6539, + "step": 20616 + }, + { + "epoch": 0.1226151394043201, + "grad_norm": 1.4509785175323486, + "learning_rate": 4.816819399038995e-05, + "loss": 5.105, + "step": 20617 + }, + { + "epoch": 0.1226210866876011, + "grad_norm": 1.428066372871399, + "learning_rate": 4.816801848178022e-05, + "loss": 5.1138, + "step": 20618 + }, + { + "epoch": 0.12262703397088211, + "grad_norm": 1.3920371532440186, + "learning_rate": 4.816784296508277e-05, + "loss": 5.0398, + "step": 20619 + }, + { + "epoch": 0.1226329812541631, + "grad_norm": 1.258225679397583, + "learning_rate": 4.816766744029767e-05, + "loss": 4.7204, + "step": 20620 + }, + { + "epoch": 0.1226389285374441, + "grad_norm": 1.4209269285202026, + "learning_rate": 4.816749190742498e-05, + "loss": 4.6532, + "step": 20621 + }, + { + "epoch": 0.1226448758207251, + "grad_norm": 1.6276925802230835, + "learning_rate": 4.816731636646475e-05, + "loss": 4.7025, + "step": 20622 + }, + { + "epoch": 0.12265082310400609, + "grad_norm": 1.3714722394943237, + "learning_rate": 4.8167140817417055e-05, + "loss": 5.1781, + "step": 20623 + }, + { + "epoch": 0.12265677038728709, + "grad_norm": 1.397017240524292, + "learning_rate": 4.816696526028195e-05, + "loss": 5.2097, + "step": 20624 + }, + { + "epoch": 0.12266271767056808, + "grad_norm": 1.2807291746139526, + "learning_rate": 4.8166789695059486e-05, + "loss": 5.1588, + "step": 20625 + }, + { + "epoch": 0.12266866495384908, + "grad_norm": 1.301222562789917, + "learning_rate": 4.816661412174976e-05, + "loss": 5.0906, + "step": 20626 + }, + { + "epoch": 0.12267461223713008, + "grad_norm": 1.6813510656356812, + "learning_rate": 4.816643854035279e-05, + "loss": 4.4956, + "step": 20627 + }, + { + "epoch": 0.12268055952041107, + "grad_norm": 1.7415688037872314, + "learning_rate": 4.816626295086865e-05, + "loss": 4.4246, + "step": 20628 + }, + { + "epoch": 0.12268650680369207, + "grad_norm": 1.9389246702194214, + "learning_rate": 4.816608735329742e-05, + "loss": 4.4231, + "step": 20629 + }, + { + "epoch": 0.12269245408697307, + "grad_norm": 1.7021642923355103, + "learning_rate": 4.816591174763914e-05, + "loss": 4.5314, + "step": 20630 + }, + { + "epoch": 0.12269840137025406, + "grad_norm": 1.889491081237793, + "learning_rate": 4.8165736133893876e-05, + "loss": 4.384, + "step": 20631 + }, + { + "epoch": 0.12270434865353506, + "grad_norm": 1.8447821140289307, + "learning_rate": 4.816556051206171e-05, + "loss": 4.5086, + "step": 20632 + }, + { + "epoch": 0.12271029593681607, + "grad_norm": 1.7669256925582886, + "learning_rate": 4.8165384882142674e-05, + "loss": 4.4537, + "step": 20633 + }, + { + "epoch": 0.12271624322009705, + "grad_norm": 1.8175028562545776, + "learning_rate": 4.8165209244136846e-05, + "loss": 4.4478, + "step": 20634 + }, + { + "epoch": 0.12272219050337806, + "grad_norm": 1.7047181129455566, + "learning_rate": 4.816503359804427e-05, + "loss": 4.7366, + "step": 20635 + }, + { + "epoch": 0.12272813778665906, + "grad_norm": 1.4321893453598022, + "learning_rate": 4.816485794386504e-05, + "loss": 4.9958, + "step": 20636 + }, + { + "epoch": 0.12273408506994005, + "grad_norm": 1.3354036808013916, + "learning_rate": 4.816468228159918e-05, + "loss": 4.906, + "step": 20637 + }, + { + "epoch": 0.12274003235322105, + "grad_norm": 1.281680703163147, + "learning_rate": 4.8164506611246784e-05, + "loss": 4.884, + "step": 20638 + }, + { + "epoch": 0.12274597963650205, + "grad_norm": 1.32127046585083, + "learning_rate": 4.8164330932807885e-05, + "loss": 4.8039, + "step": 20639 + }, + { + "epoch": 0.12275192691978304, + "grad_norm": 1.2233742475509644, + "learning_rate": 4.816415524628257e-05, + "loss": 4.8872, + "step": 20640 + }, + { + "epoch": 0.12275787420306404, + "grad_norm": 1.4896177053451538, + "learning_rate": 4.816397955167088e-05, + "loss": 5.0379, + "step": 20641 + }, + { + "epoch": 0.12276382148634504, + "grad_norm": 1.389992594718933, + "learning_rate": 4.8163803848972886e-05, + "loss": 5.1364, + "step": 20642 + }, + { + "epoch": 0.12276976876962603, + "grad_norm": 1.4248872995376587, + "learning_rate": 4.8163628138188645e-05, + "loss": 5.3152, + "step": 20643 + }, + { + "epoch": 0.12277571605290703, + "grad_norm": 1.3105376958847046, + "learning_rate": 4.816345241931822e-05, + "loss": 4.9878, + "step": 20644 + }, + { + "epoch": 0.12278166333618803, + "grad_norm": 1.3307970762252808, + "learning_rate": 4.816327669236167e-05, + "loss": 4.9105, + "step": 20645 + }, + { + "epoch": 0.12278761061946902, + "grad_norm": 1.9464685916900635, + "learning_rate": 4.816310095731907e-05, + "loss": 5.2259, + "step": 20646 + }, + { + "epoch": 0.12279355790275003, + "grad_norm": 1.4600616693496704, + "learning_rate": 4.816292521419046e-05, + "loss": 4.7044, + "step": 20647 + }, + { + "epoch": 0.12279950518603103, + "grad_norm": 1.202574610710144, + "learning_rate": 4.816274946297592e-05, + "loss": 5.1854, + "step": 20648 + }, + { + "epoch": 0.12280545246931202, + "grad_norm": 1.5569230318069458, + "learning_rate": 4.81625737036755e-05, + "loss": 4.8316, + "step": 20649 + }, + { + "epoch": 0.12281139975259302, + "grad_norm": 1.3303078413009644, + "learning_rate": 4.8162397936289264e-05, + "loss": 4.891, + "step": 20650 + }, + { + "epoch": 0.12281734703587402, + "grad_norm": 1.2397204637527466, + "learning_rate": 4.816222216081728e-05, + "loss": 4.8077, + "step": 20651 + }, + { + "epoch": 0.12282329431915501, + "grad_norm": 1.29647696018219, + "learning_rate": 4.8162046377259594e-05, + "loss": 4.7518, + "step": 20652 + }, + { + "epoch": 0.12282924160243601, + "grad_norm": 1.4492244720458984, + "learning_rate": 4.816187058561629e-05, + "loss": 4.6352, + "step": 20653 + }, + { + "epoch": 0.122835188885717, + "grad_norm": 1.2785146236419678, + "learning_rate": 4.81616947858874e-05, + "loss": 4.9128, + "step": 20654 + }, + { + "epoch": 0.122841136168998, + "grad_norm": 1.2652465105056763, + "learning_rate": 4.8161518978073016e-05, + "loss": 5.1555, + "step": 20655 + }, + { + "epoch": 0.122847083452279, + "grad_norm": 1.5048694610595703, + "learning_rate": 4.816134316217318e-05, + "loss": 5.0648, + "step": 20656 + }, + { + "epoch": 0.12285303073555999, + "grad_norm": 1.3626654148101807, + "learning_rate": 4.816116733818795e-05, + "loss": 5.0668, + "step": 20657 + }, + { + "epoch": 0.12285897801884099, + "grad_norm": 1.614112377166748, + "learning_rate": 4.816099150611741e-05, + "loss": 4.9234, + "step": 20658 + }, + { + "epoch": 0.122864925302122, + "grad_norm": 1.9453253746032715, + "learning_rate": 4.81608156659616e-05, + "loss": 4.7709, + "step": 20659 + }, + { + "epoch": 0.12287087258540298, + "grad_norm": 1.7604261636734009, + "learning_rate": 4.816063981772059e-05, + "loss": 4.8153, + "step": 20660 + }, + { + "epoch": 0.12287681986868398, + "grad_norm": 1.473319172859192, + "learning_rate": 4.8160463961394436e-05, + "loss": 4.9552, + "step": 20661 + }, + { + "epoch": 0.12288276715196499, + "grad_norm": 1.332900881767273, + "learning_rate": 4.8160288096983207e-05, + "loss": 5.1753, + "step": 20662 + }, + { + "epoch": 0.12288871443524597, + "grad_norm": 1.438464641571045, + "learning_rate": 4.816011222448696e-05, + "loss": 5.0386, + "step": 20663 + }, + { + "epoch": 0.12289466171852698, + "grad_norm": 1.4369616508483887, + "learning_rate": 4.8159936343905756e-05, + "loss": 5.1144, + "step": 20664 + }, + { + "epoch": 0.12290060900180798, + "grad_norm": 1.307914137840271, + "learning_rate": 4.8159760455239656e-05, + "loss": 5.0308, + "step": 20665 + }, + { + "epoch": 0.12290655628508897, + "grad_norm": 1.4199682474136353, + "learning_rate": 4.815958455848872e-05, + "loss": 4.9803, + "step": 20666 + }, + { + "epoch": 0.12291250356836997, + "grad_norm": 1.2451025247573853, + "learning_rate": 4.815940865365303e-05, + "loss": 5.0328, + "step": 20667 + }, + { + "epoch": 0.12291845085165097, + "grad_norm": 1.2542675733566284, + "learning_rate": 4.8159232740732615e-05, + "loss": 5.0961, + "step": 20668 + }, + { + "epoch": 0.12292439813493196, + "grad_norm": 1.4102520942687988, + "learning_rate": 4.815905681972756e-05, + "loss": 5.1512, + "step": 20669 + }, + { + "epoch": 0.12293034541821296, + "grad_norm": 1.7003612518310547, + "learning_rate": 4.81588808906379e-05, + "loss": 5.6308, + "step": 20670 + }, + { + "epoch": 0.12293629270149396, + "grad_norm": 1.7957112789154053, + "learning_rate": 4.815870495346373e-05, + "loss": 5.2033, + "step": 20671 + }, + { + "epoch": 0.12294223998477495, + "grad_norm": 1.8667526245117188, + "learning_rate": 4.815852900820509e-05, + "loss": 5.3148, + "step": 20672 + }, + { + "epoch": 0.12294818726805595, + "grad_norm": 1.5151188373565674, + "learning_rate": 4.815835305486205e-05, + "loss": 5.1791, + "step": 20673 + }, + { + "epoch": 0.12295413455133695, + "grad_norm": 1.842624545097351, + "learning_rate": 4.8158177093434666e-05, + "loss": 4.7996, + "step": 20674 + }, + { + "epoch": 0.12296008183461794, + "grad_norm": 1.6197025775909424, + "learning_rate": 4.815800112392299e-05, + "loss": 4.9929, + "step": 20675 + }, + { + "epoch": 0.12296602911789895, + "grad_norm": 1.4609524011611938, + "learning_rate": 4.8157825146327113e-05, + "loss": 4.961, + "step": 20676 + }, + { + "epoch": 0.12297197640117995, + "grad_norm": 1.479789137840271, + "learning_rate": 4.8157649160647065e-05, + "loss": 5.3686, + "step": 20677 + }, + { + "epoch": 0.12297792368446094, + "grad_norm": 2.120084524154663, + "learning_rate": 4.815747316688293e-05, + "loss": 4.8741, + "step": 20678 + }, + { + "epoch": 0.12298387096774194, + "grad_norm": 1.2068350315093994, + "learning_rate": 4.815729716503476e-05, + "loss": 5.5907, + "step": 20679 + }, + { + "epoch": 0.12298981825102294, + "grad_norm": 1.9006667137145996, + "learning_rate": 4.815712115510261e-05, + "loss": 5.0154, + "step": 20680 + }, + { + "epoch": 0.12299576553430393, + "grad_norm": 1.7368868589401245, + "learning_rate": 4.815694513708656e-05, + "loss": 5.1994, + "step": 20681 + }, + { + "epoch": 0.12300171281758493, + "grad_norm": 1.8622910976409912, + "learning_rate": 4.815676911098665e-05, + "loss": 4.7889, + "step": 20682 + }, + { + "epoch": 0.12300766010086592, + "grad_norm": 1.7475686073303223, + "learning_rate": 4.815659307680295e-05, + "loss": 5.1067, + "step": 20683 + }, + { + "epoch": 0.12301360738414692, + "grad_norm": 1.7088334560394287, + "learning_rate": 4.815641703453553e-05, + "loss": 4.8665, + "step": 20684 + }, + { + "epoch": 0.12301955466742792, + "grad_norm": 1.4785330295562744, + "learning_rate": 4.815624098418444e-05, + "loss": 5.417, + "step": 20685 + }, + { + "epoch": 0.12302550195070891, + "grad_norm": 1.5346219539642334, + "learning_rate": 4.8156064925749745e-05, + "loss": 5.4747, + "step": 20686 + }, + { + "epoch": 0.12303144923398991, + "grad_norm": 1.7572461366653442, + "learning_rate": 4.815588885923151e-05, + "loss": 5.021, + "step": 20687 + }, + { + "epoch": 0.12303739651727091, + "grad_norm": 1.57370126247406, + "learning_rate": 4.815571278462979e-05, + "loss": 5.5248, + "step": 20688 + }, + { + "epoch": 0.1230433438005519, + "grad_norm": 1.7549457550048828, + "learning_rate": 4.815553670194465e-05, + "loss": 5.346, + "step": 20689 + }, + { + "epoch": 0.1230492910838329, + "grad_norm": 1.7188549041748047, + "learning_rate": 4.8155360611176156e-05, + "loss": 5.4671, + "step": 20690 + }, + { + "epoch": 0.1230552383671139, + "grad_norm": 2.358586311340332, + "learning_rate": 4.815518451232436e-05, + "loss": 4.4753, + "step": 20691 + }, + { + "epoch": 0.1230611856503949, + "grad_norm": 2.2453999519348145, + "learning_rate": 4.815500840538933e-05, + "loss": 4.5065, + "step": 20692 + }, + { + "epoch": 0.1230671329336759, + "grad_norm": 1.505689263343811, + "learning_rate": 4.8154832290371123e-05, + "loss": 5.2223, + "step": 20693 + }, + { + "epoch": 0.1230730802169569, + "grad_norm": 1.5649336576461792, + "learning_rate": 4.8154656167269804e-05, + "loss": 5.3686, + "step": 20694 + }, + { + "epoch": 0.12307902750023789, + "grad_norm": 1.8131600618362427, + "learning_rate": 4.815448003608544e-05, + "loss": 5.5532, + "step": 20695 + }, + { + "epoch": 0.12308497478351889, + "grad_norm": 1.7565428018569946, + "learning_rate": 4.815430389681808e-05, + "loss": 5.4619, + "step": 20696 + }, + { + "epoch": 0.12309092206679989, + "grad_norm": 1.708799958229065, + "learning_rate": 4.815412774946779e-05, + "loss": 5.5746, + "step": 20697 + }, + { + "epoch": 0.12309686935008088, + "grad_norm": 1.6220203638076782, + "learning_rate": 4.815395159403464e-05, + "loss": 5.1071, + "step": 20698 + }, + { + "epoch": 0.12310281663336188, + "grad_norm": 1.5516228675842285, + "learning_rate": 4.8153775430518676e-05, + "loss": 5.3921, + "step": 20699 + }, + { + "epoch": 0.12310876391664288, + "grad_norm": 1.7192966938018799, + "learning_rate": 4.815359925891998e-05, + "loss": 5.2339, + "step": 20700 + }, + { + "epoch": 0.12311471119992387, + "grad_norm": 1.3066575527191162, + "learning_rate": 4.815342307923859e-05, + "loss": 4.998, + "step": 20701 + }, + { + "epoch": 0.12312065848320487, + "grad_norm": 1.49882173538208, + "learning_rate": 4.815324689147459e-05, + "loss": 5.0493, + "step": 20702 + }, + { + "epoch": 0.12312660576648587, + "grad_norm": 1.5100362300872803, + "learning_rate": 4.815307069562802e-05, + "loss": 5.7113, + "step": 20703 + }, + { + "epoch": 0.12313255304976686, + "grad_norm": 1.7987116575241089, + "learning_rate": 4.815289449169896e-05, + "loss": 4.3582, + "step": 20704 + }, + { + "epoch": 0.12313850033304787, + "grad_norm": 1.7036083936691284, + "learning_rate": 4.815271827968746e-05, + "loss": 5.0769, + "step": 20705 + }, + { + "epoch": 0.12314444761632887, + "grad_norm": 1.8392287492752075, + "learning_rate": 4.8152542059593584e-05, + "loss": 4.6458, + "step": 20706 + }, + { + "epoch": 0.12315039489960986, + "grad_norm": 1.7489079236984253, + "learning_rate": 4.81523658314174e-05, + "loss": 4.9117, + "step": 20707 + }, + { + "epoch": 0.12315634218289086, + "grad_norm": 2.2490482330322266, + "learning_rate": 4.8152189595158965e-05, + "loss": 5.2912, + "step": 20708 + }, + { + "epoch": 0.12316228946617186, + "grad_norm": 1.6101025342941284, + "learning_rate": 4.815201335081834e-05, + "loss": 4.9382, + "step": 20709 + }, + { + "epoch": 0.12316823674945285, + "grad_norm": 1.7892024517059326, + "learning_rate": 4.815183709839558e-05, + "loss": 5.0046, + "step": 20710 + }, + { + "epoch": 0.12317418403273385, + "grad_norm": 1.5614895820617676, + "learning_rate": 4.815166083789076e-05, + "loss": 5.5325, + "step": 20711 + }, + { + "epoch": 0.12318013131601484, + "grad_norm": 1.4775935411453247, + "learning_rate": 4.815148456930392e-05, + "loss": 5.0981, + "step": 20712 + }, + { + "epoch": 0.12318607859929584, + "grad_norm": 1.3652704954147339, + "learning_rate": 4.815130829263515e-05, + "loss": 4.9632, + "step": 20713 + }, + { + "epoch": 0.12319202588257684, + "grad_norm": 1.7767298221588135, + "learning_rate": 4.815113200788449e-05, + "loss": 4.5071, + "step": 20714 + }, + { + "epoch": 0.12319797316585783, + "grad_norm": 1.8673535585403442, + "learning_rate": 4.815095571505202e-05, + "loss": 4.3313, + "step": 20715 + }, + { + "epoch": 0.12320392044913883, + "grad_norm": 1.6682900190353394, + "learning_rate": 4.8150779414137775e-05, + "loss": 5.2341, + "step": 20716 + }, + { + "epoch": 0.12320986773241983, + "grad_norm": 1.6456630229949951, + "learning_rate": 4.815060310514184e-05, + "loss": 5.3823, + "step": 20717 + }, + { + "epoch": 0.12321581501570082, + "grad_norm": 1.9971877336502075, + "learning_rate": 4.8150426788064265e-05, + "loss": 5.1093, + "step": 20718 + }, + { + "epoch": 0.12322176229898182, + "grad_norm": 1.6881333589553833, + "learning_rate": 4.815025046290512e-05, + "loss": 5.1788, + "step": 20719 + }, + { + "epoch": 0.12322770958226283, + "grad_norm": 1.6873126029968262, + "learning_rate": 4.815007412966446e-05, + "loss": 5.4508, + "step": 20720 + }, + { + "epoch": 0.12323365686554381, + "grad_norm": 1.5401923656463623, + "learning_rate": 4.814989778834235e-05, + "loss": 5.3638, + "step": 20721 + }, + { + "epoch": 0.12323960414882482, + "grad_norm": 1.3972458839416504, + "learning_rate": 4.814972143893885e-05, + "loss": 5.3096, + "step": 20722 + }, + { + "epoch": 0.12324555143210582, + "grad_norm": 1.7662227153778076, + "learning_rate": 4.8149545081454015e-05, + "loss": 5.7959, + "step": 20723 + }, + { + "epoch": 0.1232514987153868, + "grad_norm": 1.5072314739227295, + "learning_rate": 4.814936871588792e-05, + "loss": 5.6857, + "step": 20724 + }, + { + "epoch": 0.12325744599866781, + "grad_norm": 1.6628614664077759, + "learning_rate": 4.814919234224062e-05, + "loss": 5.4054, + "step": 20725 + }, + { + "epoch": 0.12326339328194881, + "grad_norm": 1.7059345245361328, + "learning_rate": 4.814901596051217e-05, + "loss": 5.3205, + "step": 20726 + }, + { + "epoch": 0.1232693405652298, + "grad_norm": 1.5989772081375122, + "learning_rate": 4.814883957070264e-05, + "loss": 5.0841, + "step": 20727 + }, + { + "epoch": 0.1232752878485108, + "grad_norm": 1.3816654682159424, + "learning_rate": 4.814866317281209e-05, + "loss": 4.9146, + "step": 20728 + }, + { + "epoch": 0.1232812351317918, + "grad_norm": 1.3992705345153809, + "learning_rate": 4.814848676684058e-05, + "loss": 4.8416, + "step": 20729 + }, + { + "epoch": 0.12328718241507279, + "grad_norm": 1.7377054691314697, + "learning_rate": 4.814831035278818e-05, + "loss": 5.3636, + "step": 20730 + }, + { + "epoch": 0.12329312969835379, + "grad_norm": 2.1461470127105713, + "learning_rate": 4.814813393065494e-05, + "loss": 5.7162, + "step": 20731 + }, + { + "epoch": 0.1232990769816348, + "grad_norm": 1.7310097217559814, + "learning_rate": 4.814795750044092e-05, + "loss": 5.7005, + "step": 20732 + }, + { + "epoch": 0.12330502426491578, + "grad_norm": 1.678813099861145, + "learning_rate": 4.814778106214619e-05, + "loss": 5.8184, + "step": 20733 + }, + { + "epoch": 0.12331097154819678, + "grad_norm": 1.7520476579666138, + "learning_rate": 4.814760461577081e-05, + "loss": 5.5746, + "step": 20734 + }, + { + "epoch": 0.12331691883147779, + "grad_norm": 1.6140379905700684, + "learning_rate": 4.8147428161314846e-05, + "loss": 5.4311, + "step": 20735 + }, + { + "epoch": 0.12332286611475878, + "grad_norm": 1.5862205028533936, + "learning_rate": 4.814725169877834e-05, + "loss": 5.5008, + "step": 20736 + }, + { + "epoch": 0.12332881339803978, + "grad_norm": 1.5568691492080688, + "learning_rate": 4.814707522816138e-05, + "loss": 5.5164, + "step": 20737 + }, + { + "epoch": 0.12333476068132078, + "grad_norm": 1.245606780052185, + "learning_rate": 4.814689874946401e-05, + "loss": 5.4217, + "step": 20738 + }, + { + "epoch": 0.12334070796460177, + "grad_norm": 1.3054754734039307, + "learning_rate": 4.8146722262686294e-05, + "loss": 5.4749, + "step": 20739 + }, + { + "epoch": 0.12334665524788277, + "grad_norm": 1.5772032737731934, + "learning_rate": 4.81465457678283e-05, + "loss": 5.7249, + "step": 20740 + }, + { + "epoch": 0.12335260253116376, + "grad_norm": 1.469688057899475, + "learning_rate": 4.814636926489009e-05, + "loss": 5.8515, + "step": 20741 + }, + { + "epoch": 0.12335854981444476, + "grad_norm": 2.3438186645507812, + "learning_rate": 4.814619275387172e-05, + "loss": 4.7599, + "step": 20742 + }, + { + "epoch": 0.12336449709772576, + "grad_norm": 2.4038238525390625, + "learning_rate": 4.814601623477325e-05, + "loss": 4.5717, + "step": 20743 + }, + { + "epoch": 0.12337044438100675, + "grad_norm": 2.773898124694824, + "learning_rate": 4.8145839707594745e-05, + "loss": 4.4889, + "step": 20744 + }, + { + "epoch": 0.12337639166428775, + "grad_norm": 2.863701820373535, + "learning_rate": 4.814566317233626e-05, + "loss": 4.5076, + "step": 20745 + }, + { + "epoch": 0.12338233894756875, + "grad_norm": 2.066301107406616, + "learning_rate": 4.8145486628997875e-05, + "loss": 4.8112, + "step": 20746 + }, + { + "epoch": 0.12338828623084974, + "grad_norm": 2.307910680770874, + "learning_rate": 4.814531007757963e-05, + "loss": 4.3896, + "step": 20747 + }, + { + "epoch": 0.12339423351413074, + "grad_norm": 2.2435505390167236, + "learning_rate": 4.81451335180816e-05, + "loss": 4.6403, + "step": 20748 + }, + { + "epoch": 0.12340018079741175, + "grad_norm": 2.4653170108795166, + "learning_rate": 4.814495695050385e-05, + "loss": 4.4737, + "step": 20749 + }, + { + "epoch": 0.12340612808069273, + "grad_norm": 2.3770196437835693, + "learning_rate": 4.814478037484643e-05, + "loss": 4.4951, + "step": 20750 + }, + { + "epoch": 0.12341207536397374, + "grad_norm": 1.8455066680908203, + "learning_rate": 4.81446037911094e-05, + "loss": 5.2646, + "step": 20751 + }, + { + "epoch": 0.12341802264725474, + "grad_norm": 1.6683069467544556, + "learning_rate": 4.814442719929283e-05, + "loss": 5.4287, + "step": 20752 + }, + { + "epoch": 0.12342396993053573, + "grad_norm": 1.4904793500900269, + "learning_rate": 4.814425059939679e-05, + "loss": 4.9993, + "step": 20753 + }, + { + "epoch": 0.12342991721381673, + "grad_norm": 1.5601847171783447, + "learning_rate": 4.8144073991421326e-05, + "loss": 5.1637, + "step": 20754 + }, + { + "epoch": 0.12343586449709773, + "grad_norm": 1.8937057256698608, + "learning_rate": 4.8143897375366496e-05, + "loss": 4.6928, + "step": 20755 + }, + { + "epoch": 0.12344181178037872, + "grad_norm": 1.8150557279586792, + "learning_rate": 4.814372075123238e-05, + "loss": 5.8257, + "step": 20756 + }, + { + "epoch": 0.12344775906365972, + "grad_norm": 1.537091612815857, + "learning_rate": 4.814354411901902e-05, + "loss": 5.0506, + "step": 20757 + }, + { + "epoch": 0.12345370634694072, + "grad_norm": 1.9722800254821777, + "learning_rate": 4.8143367478726495e-05, + "loss": 4.2019, + "step": 20758 + }, + { + "epoch": 0.12345965363022171, + "grad_norm": 1.9497390985488892, + "learning_rate": 4.8143190830354865e-05, + "loss": 4.2974, + "step": 20759 + }, + { + "epoch": 0.12346560091350271, + "grad_norm": 1.877036690711975, + "learning_rate": 4.814301417390418e-05, + "loss": 4.1039, + "step": 20760 + }, + { + "epoch": 0.12347154819678371, + "grad_norm": 1.932218313217163, + "learning_rate": 4.814283750937451e-05, + "loss": 4.3427, + "step": 20761 + }, + { + "epoch": 0.1234774954800647, + "grad_norm": 2.175657272338867, + "learning_rate": 4.814266083676591e-05, + "loss": 4.6891, + "step": 20762 + }, + { + "epoch": 0.1234834427633457, + "grad_norm": 1.7364848852157593, + "learning_rate": 4.8142484156078456e-05, + "loss": 4.4825, + "step": 20763 + }, + { + "epoch": 0.1234893900466267, + "grad_norm": 1.7598278522491455, + "learning_rate": 4.8142307467312184e-05, + "loss": 4.0782, + "step": 20764 + }, + { + "epoch": 0.1234953373299077, + "grad_norm": 1.9056943655014038, + "learning_rate": 4.814213077046719e-05, + "loss": 4.245, + "step": 20765 + }, + { + "epoch": 0.1235012846131887, + "grad_norm": 1.8974699974060059, + "learning_rate": 4.8141954065543506e-05, + "loss": 4.0707, + "step": 20766 + }, + { + "epoch": 0.1235072318964697, + "grad_norm": 1.9884151220321655, + "learning_rate": 4.814177735254121e-05, + "loss": 4.1443, + "step": 20767 + }, + { + "epoch": 0.12351317917975069, + "grad_norm": 1.952216625213623, + "learning_rate": 4.814160063146035e-05, + "loss": 4.6248, + "step": 20768 + }, + { + "epoch": 0.12351912646303169, + "grad_norm": 2.537240743637085, + "learning_rate": 4.814142390230101e-05, + "loss": 4.8936, + "step": 20769 + }, + { + "epoch": 0.12352507374631268, + "grad_norm": 1.6106029748916626, + "learning_rate": 4.814124716506322e-05, + "loss": 5.9498, + "step": 20770 + }, + { + "epoch": 0.12353102102959368, + "grad_norm": 2.3211259841918945, + "learning_rate": 4.814107041974707e-05, + "loss": 4.634, + "step": 20771 + }, + { + "epoch": 0.12353696831287468, + "grad_norm": 2.1425933837890625, + "learning_rate": 4.814089366635261e-05, + "loss": 4.9106, + "step": 20772 + }, + { + "epoch": 0.12354291559615567, + "grad_norm": 1.9194071292877197, + "learning_rate": 4.814071690487991e-05, + "loss": 4.9044, + "step": 20773 + }, + { + "epoch": 0.12354886287943667, + "grad_norm": 2.2048282623291016, + "learning_rate": 4.814054013532902e-05, + "loss": 4.7123, + "step": 20774 + }, + { + "epoch": 0.12355481016271767, + "grad_norm": 2.1015446186065674, + "learning_rate": 4.8140363357700004e-05, + "loss": 4.6005, + "step": 20775 + }, + { + "epoch": 0.12356075744599866, + "grad_norm": 2.133510112762451, + "learning_rate": 4.814018657199293e-05, + "loss": 5.1534, + "step": 20776 + }, + { + "epoch": 0.12356670472927966, + "grad_norm": 2.050220012664795, + "learning_rate": 4.814000977820785e-05, + "loss": 4.8997, + "step": 20777 + }, + { + "epoch": 0.12357265201256067, + "grad_norm": 2.0189473628997803, + "learning_rate": 4.8139832976344836e-05, + "loss": 4.6096, + "step": 20778 + }, + { + "epoch": 0.12357859929584165, + "grad_norm": 2.515733242034912, + "learning_rate": 4.813965616640395e-05, + "loss": 4.7096, + "step": 20779 + }, + { + "epoch": 0.12358454657912266, + "grad_norm": 2.062140941619873, + "learning_rate": 4.813947934838524e-05, + "loss": 4.8037, + "step": 20780 + }, + { + "epoch": 0.12359049386240366, + "grad_norm": 2.0707905292510986, + "learning_rate": 4.8139302522288776e-05, + "loss": 5.3148, + "step": 20781 + }, + { + "epoch": 0.12359644114568465, + "grad_norm": 2.0126004219055176, + "learning_rate": 4.813912568811463e-05, + "loss": 5.522, + "step": 20782 + }, + { + "epoch": 0.12360238842896565, + "grad_norm": 1.9760699272155762, + "learning_rate": 4.8138948845862855e-05, + "loss": 5.2751, + "step": 20783 + }, + { + "epoch": 0.12360833571224665, + "grad_norm": 1.6164956092834473, + "learning_rate": 4.81387719955335e-05, + "loss": 5.4444, + "step": 20784 + }, + { + "epoch": 0.12361428299552764, + "grad_norm": 1.7360550165176392, + "learning_rate": 4.8138595137126645e-05, + "loss": 4.7908, + "step": 20785 + }, + { + "epoch": 0.12362023027880864, + "grad_norm": 1.691304087638855, + "learning_rate": 4.813841827064235e-05, + "loss": 5.4206, + "step": 20786 + }, + { + "epoch": 0.12362617756208964, + "grad_norm": 1.685165524482727, + "learning_rate": 4.813824139608066e-05, + "loss": 4.457, + "step": 20787 + }, + { + "epoch": 0.12363212484537063, + "grad_norm": 2.114884376525879, + "learning_rate": 4.813806451344166e-05, + "loss": 4.8126, + "step": 20788 + }, + { + "epoch": 0.12363807212865163, + "grad_norm": 2.084394693374634, + "learning_rate": 4.81378876227254e-05, + "loss": 4.6486, + "step": 20789 + }, + { + "epoch": 0.12364401941193263, + "grad_norm": 1.901607871055603, + "learning_rate": 4.813771072393194e-05, + "loss": 4.3079, + "step": 20790 + }, + { + "epoch": 0.12364996669521362, + "grad_norm": 1.8139945268630981, + "learning_rate": 4.8137533817061345e-05, + "loss": 4.2445, + "step": 20791 + }, + { + "epoch": 0.12365591397849462, + "grad_norm": 1.8131442070007324, + "learning_rate": 4.8137356902113674e-05, + "loss": 4.1701, + "step": 20792 + }, + { + "epoch": 0.12366186126177563, + "grad_norm": 1.7977681159973145, + "learning_rate": 4.8137179979088995e-05, + "loss": 4.1976, + "step": 20793 + }, + { + "epoch": 0.12366780854505662, + "grad_norm": 1.78773832321167, + "learning_rate": 4.813700304798736e-05, + "loss": 4.0982, + "step": 20794 + }, + { + "epoch": 0.12367375582833762, + "grad_norm": 1.9300304651260376, + "learning_rate": 4.8136826108808844e-05, + "loss": 4.0887, + "step": 20795 + }, + { + "epoch": 0.12367970311161862, + "grad_norm": 1.8883346319198608, + "learning_rate": 4.813664916155349e-05, + "loss": 5.0699, + "step": 20796 + }, + { + "epoch": 0.12368565039489961, + "grad_norm": 1.9141865968704224, + "learning_rate": 4.813647220622137e-05, + "loss": 4.6133, + "step": 20797 + }, + { + "epoch": 0.12369159767818061, + "grad_norm": 2.074240207672119, + "learning_rate": 4.813629524281256e-05, + "loss": 4.2272, + "step": 20798 + }, + { + "epoch": 0.12369754496146161, + "grad_norm": 1.9218412637710571, + "learning_rate": 4.81361182713271e-05, + "loss": 4.2612, + "step": 20799 + }, + { + "epoch": 0.1237034922447426, + "grad_norm": 2.3334543704986572, + "learning_rate": 4.8135941291765066e-05, + "loss": 5.4561, + "step": 20800 + }, + { + "epoch": 0.1237094395280236, + "grad_norm": 2.1329383850097656, + "learning_rate": 4.8135764304126504e-05, + "loss": 4.8373, + "step": 20801 + }, + { + "epoch": 0.12371538681130459, + "grad_norm": 2.2241666316986084, + "learning_rate": 4.81355873084115e-05, + "loss": 4.5995, + "step": 20802 + }, + { + "epoch": 0.12372133409458559, + "grad_norm": 1.448601245880127, + "learning_rate": 4.8135410304620086e-05, + "loss": 6.0327, + "step": 20803 + }, + { + "epoch": 0.1237272813778666, + "grad_norm": 2.05168080329895, + "learning_rate": 4.8135233292752344e-05, + "loss": 4.8944, + "step": 20804 + }, + { + "epoch": 0.12373322866114758, + "grad_norm": 1.9282878637313843, + "learning_rate": 4.813505627280834e-05, + "loss": 5.1704, + "step": 20805 + }, + { + "epoch": 0.12373917594442858, + "grad_norm": 1.892562747001648, + "learning_rate": 4.813487924478812e-05, + "loss": 5.3674, + "step": 20806 + }, + { + "epoch": 0.12374512322770959, + "grad_norm": 1.866495132446289, + "learning_rate": 4.813470220869175e-05, + "loss": 5.3585, + "step": 20807 + }, + { + "epoch": 0.12375107051099057, + "grad_norm": 1.8725072145462036, + "learning_rate": 4.81345251645193e-05, + "loss": 5.0933, + "step": 20808 + }, + { + "epoch": 0.12375701779427158, + "grad_norm": 1.486983299255371, + "learning_rate": 4.8134348112270825e-05, + "loss": 5.1869, + "step": 20809 + }, + { + "epoch": 0.12376296507755258, + "grad_norm": 1.5050567388534546, + "learning_rate": 4.813417105194639e-05, + "loss": 5.1382, + "step": 20810 + }, + { + "epoch": 0.12376891236083357, + "grad_norm": 1.629869818687439, + "learning_rate": 4.813399398354605e-05, + "loss": 5.3847, + "step": 20811 + }, + { + "epoch": 0.12377485964411457, + "grad_norm": 1.749213695526123, + "learning_rate": 4.813381690706987e-05, + "loss": 4.8655, + "step": 20812 + }, + { + "epoch": 0.12378080692739557, + "grad_norm": 1.734803318977356, + "learning_rate": 4.813363982251792e-05, + "loss": 5.2059, + "step": 20813 + }, + { + "epoch": 0.12378675421067656, + "grad_norm": 1.8050858974456787, + "learning_rate": 4.813346272989024e-05, + "loss": 5.1364, + "step": 20814 + }, + { + "epoch": 0.12379270149395756, + "grad_norm": 1.6926177740097046, + "learning_rate": 4.813328562918692e-05, + "loss": 4.969, + "step": 20815 + }, + { + "epoch": 0.12379864877723856, + "grad_norm": 1.9767627716064453, + "learning_rate": 4.813310852040801e-05, + "loss": 5.1043, + "step": 20816 + }, + { + "epoch": 0.12380459606051955, + "grad_norm": 1.5432230234146118, + "learning_rate": 4.813293140355357e-05, + "loss": 5.0858, + "step": 20817 + }, + { + "epoch": 0.12381054334380055, + "grad_norm": 1.5301191806793213, + "learning_rate": 4.813275427862366e-05, + "loss": 5.2312, + "step": 20818 + }, + { + "epoch": 0.12381649062708155, + "grad_norm": 1.6347124576568604, + "learning_rate": 4.813257714561835e-05, + "loss": 5.1701, + "step": 20819 + }, + { + "epoch": 0.12382243791036254, + "grad_norm": 2.1260578632354736, + "learning_rate": 4.813240000453769e-05, + "loss": 5.3055, + "step": 20820 + }, + { + "epoch": 0.12382838519364354, + "grad_norm": 2.0905344486236572, + "learning_rate": 4.813222285538175e-05, + "loss": 5.1265, + "step": 20821 + }, + { + "epoch": 0.12383433247692455, + "grad_norm": 1.8773592710494995, + "learning_rate": 4.81320456981506e-05, + "loss": 5.1409, + "step": 20822 + }, + { + "epoch": 0.12384027976020553, + "grad_norm": 1.9149075746536255, + "learning_rate": 4.8131868532844275e-05, + "loss": 5.1855, + "step": 20823 + }, + { + "epoch": 0.12384622704348654, + "grad_norm": 2.0494494438171387, + "learning_rate": 4.813169135946286e-05, + "loss": 5.2561, + "step": 20824 + }, + { + "epoch": 0.12385217432676754, + "grad_norm": 1.9590463638305664, + "learning_rate": 4.8131514178006417e-05, + "loss": 5.0764, + "step": 20825 + }, + { + "epoch": 0.12385812161004853, + "grad_norm": 2.5940022468566895, + "learning_rate": 4.8131336988475e-05, + "loss": 4.42, + "step": 20826 + }, + { + "epoch": 0.12386406889332953, + "grad_norm": 2.135793924331665, + "learning_rate": 4.8131159790868665e-05, + "loss": 4.653, + "step": 20827 + }, + { + "epoch": 0.12387001617661053, + "grad_norm": 2.1380679607391357, + "learning_rate": 4.813098258518748e-05, + "loss": 4.7332, + "step": 20828 + }, + { + "epoch": 0.12387596345989152, + "grad_norm": 2.264723300933838, + "learning_rate": 4.8130805371431513e-05, + "loss": 4.8735, + "step": 20829 + }, + { + "epoch": 0.12388191074317252, + "grad_norm": 2.4449269771575928, + "learning_rate": 4.813062814960082e-05, + "loss": 3.6335, + "step": 20830 + }, + { + "epoch": 0.12388785802645351, + "grad_norm": 2.5718894004821777, + "learning_rate": 4.813045091969547e-05, + "loss": 3.8212, + "step": 20831 + }, + { + "epoch": 0.12389380530973451, + "grad_norm": 1.9600555896759033, + "learning_rate": 4.813027368171551e-05, + "loss": 5.7456, + "step": 20832 + }, + { + "epoch": 0.12389975259301551, + "grad_norm": 2.032362699508667, + "learning_rate": 4.813009643566101e-05, + "loss": 5.3087, + "step": 20833 + }, + { + "epoch": 0.1239056998762965, + "grad_norm": 2.0349206924438477, + "learning_rate": 4.8129919181532036e-05, + "loss": 5.0988, + "step": 20834 + }, + { + "epoch": 0.1239116471595775, + "grad_norm": 2.811582565307617, + "learning_rate": 4.812974191932864e-05, + "loss": 4.4085, + "step": 20835 + }, + { + "epoch": 0.1239175944428585, + "grad_norm": 1.8748958110809326, + "learning_rate": 4.8129564649050904e-05, + "loss": 5.3469, + "step": 20836 + }, + { + "epoch": 0.1239235417261395, + "grad_norm": 2.162895917892456, + "learning_rate": 4.8129387370698865e-05, + "loss": 5.4258, + "step": 20837 + }, + { + "epoch": 0.1239294890094205, + "grad_norm": 1.60780668258667, + "learning_rate": 4.8129210084272596e-05, + "loss": 5.4865, + "step": 20838 + }, + { + "epoch": 0.1239354362927015, + "grad_norm": 1.8906630277633667, + "learning_rate": 4.812903278977216e-05, + "loss": 5.3286, + "step": 20839 + }, + { + "epoch": 0.12394138357598249, + "grad_norm": 1.5469995737075806, + "learning_rate": 4.812885548719762e-05, + "loss": 5.4524, + "step": 20840 + }, + { + "epoch": 0.12394733085926349, + "grad_norm": 1.632104754447937, + "learning_rate": 4.8128678176549034e-05, + "loss": 5.4239, + "step": 20841 + }, + { + "epoch": 0.12395327814254449, + "grad_norm": 1.9250766038894653, + "learning_rate": 4.812850085782646e-05, + "loss": 5.3333, + "step": 20842 + }, + { + "epoch": 0.12395922542582548, + "grad_norm": 1.5831308364868164, + "learning_rate": 4.8128323531029974e-05, + "loss": 5.38, + "step": 20843 + }, + { + "epoch": 0.12396517270910648, + "grad_norm": 1.8450974225997925, + "learning_rate": 4.812814619615963e-05, + "loss": 5.1909, + "step": 20844 + }, + { + "epoch": 0.12397111999238748, + "grad_norm": 1.990018367767334, + "learning_rate": 4.8127968853215485e-05, + "loss": 5.2392, + "step": 20845 + }, + { + "epoch": 0.12397706727566847, + "grad_norm": 1.7380045652389526, + "learning_rate": 4.812779150219761e-05, + "loss": 5.4486, + "step": 20846 + }, + { + "epoch": 0.12398301455894947, + "grad_norm": 1.6080845594406128, + "learning_rate": 4.812761414310605e-05, + "loss": 6.0048, + "step": 20847 + }, + { + "epoch": 0.12398896184223047, + "grad_norm": 1.2336721420288086, + "learning_rate": 4.8127436775940884e-05, + "loss": 5.8988, + "step": 20848 + }, + { + "epoch": 0.12399490912551146, + "grad_norm": 1.3851333856582642, + "learning_rate": 4.8127259400702173e-05, + "loss": 6.0162, + "step": 20849 + }, + { + "epoch": 0.12400085640879246, + "grad_norm": 1.3938422203063965, + "learning_rate": 4.8127082017389965e-05, + "loss": 5.9186, + "step": 20850 + }, + { + "epoch": 0.12400680369207347, + "grad_norm": 1.6463207006454468, + "learning_rate": 4.812690462600434e-05, + "loss": 5.9684, + "step": 20851 + }, + { + "epoch": 0.12401275097535445, + "grad_norm": 1.4180574417114258, + "learning_rate": 4.8126727226545353e-05, + "loss": 5.9383, + "step": 20852 + }, + { + "epoch": 0.12401869825863546, + "grad_norm": 1.3431847095489502, + "learning_rate": 4.8126549819013065e-05, + "loss": 5.862, + "step": 20853 + }, + { + "epoch": 0.12402464554191646, + "grad_norm": 1.3493611812591553, + "learning_rate": 4.812637240340753e-05, + "loss": 5.8796, + "step": 20854 + }, + { + "epoch": 0.12403059282519745, + "grad_norm": 1.2833929061889648, + "learning_rate": 4.812619497972882e-05, + "loss": 5.7322, + "step": 20855 + }, + { + "epoch": 0.12403654010847845, + "grad_norm": 1.4494770765304565, + "learning_rate": 4.8126017547977e-05, + "loss": 5.6871, + "step": 20856 + }, + { + "epoch": 0.12404248739175945, + "grad_norm": 1.9750009775161743, + "learning_rate": 4.812584010815212e-05, + "loss": 5.4744, + "step": 20857 + }, + { + "epoch": 0.12404843467504044, + "grad_norm": 2.2873501777648926, + "learning_rate": 4.812566266025425e-05, + "loss": 4.7326, + "step": 20858 + }, + { + "epoch": 0.12405438195832144, + "grad_norm": 2.3699395656585693, + "learning_rate": 4.8125485204283446e-05, + "loss": 5.1084, + "step": 20859 + }, + { + "epoch": 0.12406032924160243, + "grad_norm": 2.3874311447143555, + "learning_rate": 4.812530774023978e-05, + "loss": 4.7226, + "step": 20860 + }, + { + "epoch": 0.12406627652488343, + "grad_norm": 1.6285946369171143, + "learning_rate": 4.8125130268123305e-05, + "loss": 5.4695, + "step": 20861 + }, + { + "epoch": 0.12407222380816443, + "grad_norm": 1.5346466302871704, + "learning_rate": 4.8124952787934096e-05, + "loss": 5.5105, + "step": 20862 + }, + { + "epoch": 0.12407817109144542, + "grad_norm": 1.7935290336608887, + "learning_rate": 4.8124775299672195e-05, + "loss": 5.2028, + "step": 20863 + }, + { + "epoch": 0.12408411837472642, + "grad_norm": 1.7893015146255493, + "learning_rate": 4.812459780333767e-05, + "loss": 5.1571, + "step": 20864 + }, + { + "epoch": 0.12409006565800743, + "grad_norm": 1.6904758214950562, + "learning_rate": 4.8124420298930596e-05, + "loss": 6.0024, + "step": 20865 + }, + { + "epoch": 0.12409601294128841, + "grad_norm": 1.7721166610717773, + "learning_rate": 4.812424278645102e-05, + "loss": 5.8716, + "step": 20866 + }, + { + "epoch": 0.12410196022456942, + "grad_norm": 1.5822969675064087, + "learning_rate": 4.812406526589901e-05, + "loss": 5.7984, + "step": 20867 + }, + { + "epoch": 0.12410790750785042, + "grad_norm": 1.713592290878296, + "learning_rate": 4.8123887737274634e-05, + "loss": 5.7348, + "step": 20868 + }, + { + "epoch": 0.1241138547911314, + "grad_norm": 1.54501473903656, + "learning_rate": 4.812371020057794e-05, + "loss": 5.7012, + "step": 20869 + }, + { + "epoch": 0.12411980207441241, + "grad_norm": 1.2782925367355347, + "learning_rate": 4.8123532655809e-05, + "loss": 5.6171, + "step": 20870 + }, + { + "epoch": 0.12412574935769341, + "grad_norm": 1.357879638671875, + "learning_rate": 4.812335510296787e-05, + "loss": 5.7021, + "step": 20871 + }, + { + "epoch": 0.1241316966409744, + "grad_norm": 1.468440294265747, + "learning_rate": 4.812317754205462e-05, + "loss": 5.6863, + "step": 20872 + }, + { + "epoch": 0.1241376439242554, + "grad_norm": 2.965566396713257, + "learning_rate": 4.812299997306931e-05, + "loss": 5.3282, + "step": 20873 + }, + { + "epoch": 0.1241435912075364, + "grad_norm": 3.3760321140289307, + "learning_rate": 4.8122822396012e-05, + "loss": 5.0464, + "step": 20874 + }, + { + "epoch": 0.12414953849081739, + "grad_norm": 2.340055465698242, + "learning_rate": 4.8122644810882746e-05, + "loss": 4.6466, + "step": 20875 + }, + { + "epoch": 0.12415548577409839, + "grad_norm": 1.5659359693527222, + "learning_rate": 4.8122467217681615e-05, + "loss": 5.5262, + "step": 20876 + }, + { + "epoch": 0.1241614330573794, + "grad_norm": 1.9036263227462769, + "learning_rate": 4.812228961640868e-05, + "loss": 5.7474, + "step": 20877 + }, + { + "epoch": 0.12416738034066038, + "grad_norm": 1.8488661050796509, + "learning_rate": 4.812211200706398e-05, + "loss": 5.6901, + "step": 20878 + }, + { + "epoch": 0.12417332762394138, + "grad_norm": 1.7501896619796753, + "learning_rate": 4.8121934389647594e-05, + "loss": 5.9729, + "step": 20879 + }, + { + "epoch": 0.12417927490722239, + "grad_norm": 1.7495286464691162, + "learning_rate": 4.812175676415957e-05, + "loss": 5.4282, + "step": 20880 + }, + { + "epoch": 0.12418522219050337, + "grad_norm": 1.8494720458984375, + "learning_rate": 4.8121579130600005e-05, + "loss": 5.6148, + "step": 20881 + }, + { + "epoch": 0.12419116947378438, + "grad_norm": 1.860341191291809, + "learning_rate": 4.812140148896892e-05, + "loss": 5.6192, + "step": 20882 + }, + { + "epoch": 0.12419711675706538, + "grad_norm": 1.845438003540039, + "learning_rate": 4.8121223839266386e-05, + "loss": 5.4989, + "step": 20883 + }, + { + "epoch": 0.12420306404034637, + "grad_norm": 1.7625926733016968, + "learning_rate": 4.812104618149248e-05, + "loss": 5.4833, + "step": 20884 + }, + { + "epoch": 0.12420901132362737, + "grad_norm": 1.4869773387908936, + "learning_rate": 4.812086851564725e-05, + "loss": 5.6437, + "step": 20885 + }, + { + "epoch": 0.12421495860690837, + "grad_norm": 1.528306245803833, + "learning_rate": 4.812069084173077e-05, + "loss": 5.4938, + "step": 20886 + }, + { + "epoch": 0.12422090589018936, + "grad_norm": 1.28203284740448, + "learning_rate": 4.81205131597431e-05, + "loss": 5.5411, + "step": 20887 + }, + { + "epoch": 0.12422685317347036, + "grad_norm": 1.9413608312606812, + "learning_rate": 4.8120335469684285e-05, + "loss": 5.4842, + "step": 20888 + }, + { + "epoch": 0.12423280045675135, + "grad_norm": 1.8776315450668335, + "learning_rate": 4.812015777155441e-05, + "loss": 5.495, + "step": 20889 + }, + { + "epoch": 0.12423874774003235, + "grad_norm": 1.941171646118164, + "learning_rate": 4.8119980065353524e-05, + "loss": 5.7711, + "step": 20890 + }, + { + "epoch": 0.12424469502331335, + "grad_norm": 1.8312263488769531, + "learning_rate": 4.811980235108169e-05, + "loss": 5.5998, + "step": 20891 + }, + { + "epoch": 0.12425064230659434, + "grad_norm": 1.6940878629684448, + "learning_rate": 4.811962462873897e-05, + "loss": 5.9089, + "step": 20892 + }, + { + "epoch": 0.12425658958987534, + "grad_norm": 1.8769567012786865, + "learning_rate": 4.811944689832543e-05, + "loss": 5.5854, + "step": 20893 + }, + { + "epoch": 0.12426253687315635, + "grad_norm": 1.8289974927902222, + "learning_rate": 4.811926915984113e-05, + "loss": 5.4698, + "step": 20894 + }, + { + "epoch": 0.12426848415643733, + "grad_norm": 2.343961000442505, + "learning_rate": 4.811909141328613e-05, + "loss": 4.4474, + "step": 20895 + }, + { + "epoch": 0.12427443143971834, + "grad_norm": 1.9822384119033813, + "learning_rate": 4.8118913658660504e-05, + "loss": 4.9353, + "step": 20896 + }, + { + "epoch": 0.12428037872299934, + "grad_norm": 2.3056247234344482, + "learning_rate": 4.811873589596429e-05, + "loss": 4.7128, + "step": 20897 + }, + { + "epoch": 0.12428632600628033, + "grad_norm": 2.205653667449951, + "learning_rate": 4.811855812519758e-05, + "loss": 4.08, + "step": 20898 + }, + { + "epoch": 0.12429227328956133, + "grad_norm": 2.0141141414642334, + "learning_rate": 4.81183803463604e-05, + "loss": 4.2903, + "step": 20899 + }, + { + "epoch": 0.12429822057284233, + "grad_norm": 2.2912099361419678, + "learning_rate": 4.811820255945285e-05, + "loss": 4.7582, + "step": 20900 + }, + { + "epoch": 0.12430416785612332, + "grad_norm": 2.1577751636505127, + "learning_rate": 4.8118024764474965e-05, + "loss": 4.757, + "step": 20901 + }, + { + "epoch": 0.12431011513940432, + "grad_norm": 2.2851569652557373, + "learning_rate": 4.811784696142682e-05, + "loss": 3.9403, + "step": 20902 + }, + { + "epoch": 0.12431606242268532, + "grad_norm": 2.256500720977783, + "learning_rate": 4.8117669150308474e-05, + "loss": 4.3498, + "step": 20903 + }, + { + "epoch": 0.12432200970596631, + "grad_norm": 2.1631035804748535, + "learning_rate": 4.811749133111999e-05, + "loss": 4.6171, + "step": 20904 + }, + { + "epoch": 0.12432795698924731, + "grad_norm": 2.360530138015747, + "learning_rate": 4.811731350386142e-05, + "loss": 4.5958, + "step": 20905 + }, + { + "epoch": 0.12433390427252831, + "grad_norm": 2.031268835067749, + "learning_rate": 4.8117135668532845e-05, + "loss": 4.4466, + "step": 20906 + }, + { + "epoch": 0.1243398515558093, + "grad_norm": 1.7367441654205322, + "learning_rate": 4.811695782513431e-05, + "loss": 4.8605, + "step": 20907 + }, + { + "epoch": 0.1243457988390903, + "grad_norm": 2.5067267417907715, + "learning_rate": 4.8116779973665886e-05, + "loss": 4.0849, + "step": 20908 + }, + { + "epoch": 0.1243517461223713, + "grad_norm": 1.5404255390167236, + "learning_rate": 4.811660211412763e-05, + "loss": 4.4511, + "step": 20909 + }, + { + "epoch": 0.1243576934056523, + "grad_norm": 1.4191818237304688, + "learning_rate": 4.8116424246519606e-05, + "loss": 4.4274, + "step": 20910 + }, + { + "epoch": 0.1243636406889333, + "grad_norm": 1.4610079526901245, + "learning_rate": 4.811624637084189e-05, + "loss": 4.4112, + "step": 20911 + }, + { + "epoch": 0.1243695879722143, + "grad_norm": 1.3842167854309082, + "learning_rate": 4.811606848709452e-05, + "loss": 4.3019, + "step": 20912 + }, + { + "epoch": 0.12437553525549529, + "grad_norm": 1.4025331735610962, + "learning_rate": 4.811589059527757e-05, + "loss": 4.251, + "step": 20913 + }, + { + "epoch": 0.12438148253877629, + "grad_norm": 1.5034327507019043, + "learning_rate": 4.81157126953911e-05, + "loss": 4.1553, + "step": 20914 + }, + { + "epoch": 0.12438742982205729, + "grad_norm": 1.5153253078460693, + "learning_rate": 4.811553478743518e-05, + "loss": 4.1264, + "step": 20915 + }, + { + "epoch": 0.12439337710533828, + "grad_norm": 1.4300923347473145, + "learning_rate": 4.811535687140987e-05, + "loss": 4.2653, + "step": 20916 + }, + { + "epoch": 0.12439932438861928, + "grad_norm": 1.4667567014694214, + "learning_rate": 4.811517894731521e-05, + "loss": 4.2216, + "step": 20917 + }, + { + "epoch": 0.12440527167190027, + "grad_norm": 1.6324750185012817, + "learning_rate": 4.81150010151513e-05, + "loss": 4.3083, + "step": 20918 + }, + { + "epoch": 0.12441121895518127, + "grad_norm": 1.507516622543335, + "learning_rate": 4.8114823074918165e-05, + "loss": 4.1369, + "step": 20919 + }, + { + "epoch": 0.12441716623846227, + "grad_norm": 1.5365220308303833, + "learning_rate": 4.8114645126615886e-05, + "loss": 4.061, + "step": 20920 + }, + { + "epoch": 0.12442311352174326, + "grad_norm": 1.3880743980407715, + "learning_rate": 4.811446717024453e-05, + "loss": 4.2464, + "step": 20921 + }, + { + "epoch": 0.12442906080502426, + "grad_norm": 1.619391918182373, + "learning_rate": 4.8114289205804155e-05, + "loss": 4.0032, + "step": 20922 + }, + { + "epoch": 0.12443500808830527, + "grad_norm": 1.5912760496139526, + "learning_rate": 4.811411123329481e-05, + "loss": 3.9996, + "step": 20923 + }, + { + "epoch": 0.12444095537158625, + "grad_norm": 1.6042509078979492, + "learning_rate": 4.811393325271657e-05, + "loss": 3.9225, + "step": 20924 + }, + { + "epoch": 0.12444690265486726, + "grad_norm": 1.4620057344436646, + "learning_rate": 4.8113755264069505e-05, + "loss": 4.4391, + "step": 20925 + }, + { + "epoch": 0.12445284993814826, + "grad_norm": 1.6154197454452515, + "learning_rate": 4.811357726735366e-05, + "loss": 4.1254, + "step": 20926 + }, + { + "epoch": 0.12445879722142925, + "grad_norm": 1.520150065422058, + "learning_rate": 4.8113399262569104e-05, + "loss": 4.7638, + "step": 20927 + }, + { + "epoch": 0.12446474450471025, + "grad_norm": 1.5869375467300415, + "learning_rate": 4.81132212497159e-05, + "loss": 4.047, + "step": 20928 + }, + { + "epoch": 0.12447069178799125, + "grad_norm": 1.610819697380066, + "learning_rate": 4.8113043228794105e-05, + "loss": 4.0823, + "step": 20929 + }, + { + "epoch": 0.12447663907127224, + "grad_norm": 1.4962780475616455, + "learning_rate": 4.811286519980379e-05, + "loss": 5.4004, + "step": 20930 + }, + { + "epoch": 0.12448258635455324, + "grad_norm": 1.382641077041626, + "learning_rate": 4.811268716274501e-05, + "loss": 5.3129, + "step": 20931 + }, + { + "epoch": 0.12448853363783424, + "grad_norm": 1.3323496580123901, + "learning_rate": 4.811250911761783e-05, + "loss": 5.2123, + "step": 20932 + }, + { + "epoch": 0.12449448092111523, + "grad_norm": 1.4375461339950562, + "learning_rate": 4.811233106442231e-05, + "loss": 5.4249, + "step": 20933 + }, + { + "epoch": 0.12450042820439623, + "grad_norm": 1.6861125230789185, + "learning_rate": 4.811215300315852e-05, + "loss": 5.0697, + "step": 20934 + }, + { + "epoch": 0.12450637548767723, + "grad_norm": 1.52859365940094, + "learning_rate": 4.811197493382651e-05, + "loss": 5.5925, + "step": 20935 + }, + { + "epoch": 0.12451232277095822, + "grad_norm": 1.4931366443634033, + "learning_rate": 4.811179685642635e-05, + "loss": 5.4442, + "step": 20936 + }, + { + "epoch": 0.12451827005423922, + "grad_norm": 1.3825764656066895, + "learning_rate": 4.8111618770958104e-05, + "loss": 5.5773, + "step": 20937 + }, + { + "epoch": 0.12452421733752023, + "grad_norm": 1.3441286087036133, + "learning_rate": 4.811144067742183e-05, + "loss": 5.5421, + "step": 20938 + }, + { + "epoch": 0.12453016462080121, + "grad_norm": 1.2910594940185547, + "learning_rate": 4.811126257581758e-05, + "loss": 5.3507, + "step": 20939 + }, + { + "epoch": 0.12453611190408222, + "grad_norm": 1.3505282402038574, + "learning_rate": 4.811108446614544e-05, + "loss": 5.5285, + "step": 20940 + }, + { + "epoch": 0.12454205918736322, + "grad_norm": 1.4562500715255737, + "learning_rate": 4.811090634840546e-05, + "loss": 5.3592, + "step": 20941 + }, + { + "epoch": 0.1245480064706442, + "grad_norm": 1.4702924489974976, + "learning_rate": 4.8110728222597694e-05, + "loss": 5.2603, + "step": 20942 + }, + { + "epoch": 0.12455395375392521, + "grad_norm": 1.6397823095321655, + "learning_rate": 4.811055008872222e-05, + "loss": 5.222, + "step": 20943 + }, + { + "epoch": 0.12455990103720621, + "grad_norm": 1.5603538751602173, + "learning_rate": 4.811037194677908e-05, + "loss": 5.2075, + "step": 20944 + }, + { + "epoch": 0.1245658483204872, + "grad_norm": 1.3349683284759521, + "learning_rate": 4.811019379676835e-05, + "loss": 5.2903, + "step": 20945 + }, + { + "epoch": 0.1245717956037682, + "grad_norm": 1.348935842514038, + "learning_rate": 4.8110015638690096e-05, + "loss": 5.4688, + "step": 20946 + }, + { + "epoch": 0.12457774288704919, + "grad_norm": 1.4173049926757812, + "learning_rate": 4.810983747254437e-05, + "loss": 5.0299, + "step": 20947 + }, + { + "epoch": 0.12458369017033019, + "grad_norm": 1.3553805351257324, + "learning_rate": 4.8109659298331244e-05, + "loss": 5.0798, + "step": 20948 + }, + { + "epoch": 0.1245896374536112, + "grad_norm": 1.3770824670791626, + "learning_rate": 4.810948111605077e-05, + "loss": 4.807, + "step": 20949 + }, + { + "epoch": 0.12459558473689218, + "grad_norm": 1.3450689315795898, + "learning_rate": 4.810930292570302e-05, + "loss": 4.8061, + "step": 20950 + }, + { + "epoch": 0.12460153202017318, + "grad_norm": 1.4118422269821167, + "learning_rate": 4.8109124727288044e-05, + "loss": 5.203, + "step": 20951 + }, + { + "epoch": 0.12460747930345419, + "grad_norm": 1.4127706289291382, + "learning_rate": 4.810894652080592e-05, + "loss": 5.104, + "step": 20952 + }, + { + "epoch": 0.12461342658673517, + "grad_norm": 1.2636264562606812, + "learning_rate": 4.810876830625669e-05, + "loss": 4.9306, + "step": 20953 + }, + { + "epoch": 0.12461937387001618, + "grad_norm": 1.3846913576126099, + "learning_rate": 4.810859008364044e-05, + "loss": 4.8095, + "step": 20954 + }, + { + "epoch": 0.12462532115329718, + "grad_norm": 1.6017072200775146, + "learning_rate": 4.8108411852957216e-05, + "loss": 4.9926, + "step": 20955 + }, + { + "epoch": 0.12463126843657817, + "grad_norm": 1.5098768472671509, + "learning_rate": 4.8108233614207075e-05, + "loss": 5.3204, + "step": 20956 + }, + { + "epoch": 0.12463721571985917, + "grad_norm": 1.1792641878128052, + "learning_rate": 4.8108055367390097e-05, + "loss": 4.7596, + "step": 20957 + }, + { + "epoch": 0.12464316300314017, + "grad_norm": 1.3787871599197388, + "learning_rate": 4.8107877112506336e-05, + "loss": 5.0914, + "step": 20958 + }, + { + "epoch": 0.12464911028642116, + "grad_norm": 1.3097307682037354, + "learning_rate": 4.8107698849555846e-05, + "loss": 4.8154, + "step": 20959 + }, + { + "epoch": 0.12465505756970216, + "grad_norm": 1.4452660083770752, + "learning_rate": 4.810752057853871e-05, + "loss": 5.1395, + "step": 20960 + }, + { + "epoch": 0.12466100485298316, + "grad_norm": 1.4970120191574097, + "learning_rate": 4.8107342299454974e-05, + "loss": 4.8164, + "step": 20961 + }, + { + "epoch": 0.12466695213626415, + "grad_norm": 1.4092109203338623, + "learning_rate": 4.810716401230469e-05, + "loss": 4.9219, + "step": 20962 + }, + { + "epoch": 0.12467289941954515, + "grad_norm": 1.5558546781539917, + "learning_rate": 4.810698571708795e-05, + "loss": 4.8639, + "step": 20963 + }, + { + "epoch": 0.12467884670282615, + "grad_norm": 1.3631898164749146, + "learning_rate": 4.810680741380479e-05, + "loss": 5.2145, + "step": 20964 + }, + { + "epoch": 0.12468479398610714, + "grad_norm": 1.608810544013977, + "learning_rate": 4.8106629102455286e-05, + "loss": 5.2486, + "step": 20965 + }, + { + "epoch": 0.12469074126938814, + "grad_norm": 1.573190689086914, + "learning_rate": 4.81064507830395e-05, + "loss": 5.2476, + "step": 20966 + }, + { + "epoch": 0.12469668855266915, + "grad_norm": 1.5032795667648315, + "learning_rate": 4.810627245555748e-05, + "loss": 5.1557, + "step": 20967 + }, + { + "epoch": 0.12470263583595013, + "grad_norm": 1.3919012546539307, + "learning_rate": 4.810609412000931e-05, + "loss": 5.2812, + "step": 20968 + }, + { + "epoch": 0.12470858311923114, + "grad_norm": 1.417431354522705, + "learning_rate": 4.810591577639504e-05, + "loss": 5.3173, + "step": 20969 + }, + { + "epoch": 0.12471453040251214, + "grad_norm": 1.2135869264602661, + "learning_rate": 4.8105737424714724e-05, + "loss": 5.3511, + "step": 20970 + }, + { + "epoch": 0.12472047768579313, + "grad_norm": 1.3142472505569458, + "learning_rate": 4.810555906496844e-05, + "loss": 5.225, + "step": 20971 + }, + { + "epoch": 0.12472642496907413, + "grad_norm": 1.4344936609268188, + "learning_rate": 4.810538069715625e-05, + "loss": 5.5032, + "step": 20972 + }, + { + "epoch": 0.12473237225235513, + "grad_norm": 1.214281439781189, + "learning_rate": 4.81052023212782e-05, + "loss": 5.4466, + "step": 20973 + }, + { + "epoch": 0.12473831953563612, + "grad_norm": 1.5831886529922485, + "learning_rate": 4.810502393733437e-05, + "loss": 4.6211, + "step": 20974 + }, + { + "epoch": 0.12474426681891712, + "grad_norm": 1.6281508207321167, + "learning_rate": 4.8104845545324816e-05, + "loss": 4.6212, + "step": 20975 + }, + { + "epoch": 0.12475021410219811, + "grad_norm": 1.5753840208053589, + "learning_rate": 4.810466714524959e-05, + "loss": 4.7089, + "step": 20976 + }, + { + "epoch": 0.12475616138547911, + "grad_norm": 1.355692744255066, + "learning_rate": 4.810448873710877e-05, + "loss": 5.0399, + "step": 20977 + }, + { + "epoch": 0.12476210866876011, + "grad_norm": 1.27257239818573, + "learning_rate": 4.810431032090241e-05, + "loss": 4.7091, + "step": 20978 + }, + { + "epoch": 0.1247680559520411, + "grad_norm": 1.532210350036621, + "learning_rate": 4.810413189663058e-05, + "loss": 4.6682, + "step": 20979 + }, + { + "epoch": 0.1247740032353221, + "grad_norm": 1.4075580835342407, + "learning_rate": 4.810395346429333e-05, + "loss": 4.5135, + "step": 20980 + }, + { + "epoch": 0.1247799505186031, + "grad_norm": 1.3797897100448608, + "learning_rate": 4.810377502389073e-05, + "loss": 4.5548, + "step": 20981 + }, + { + "epoch": 0.1247858978018841, + "grad_norm": 1.4484235048294067, + "learning_rate": 4.810359657542284e-05, + "loss": 4.5336, + "step": 20982 + }, + { + "epoch": 0.1247918450851651, + "grad_norm": 1.4712706804275513, + "learning_rate": 4.810341811888972e-05, + "loss": 4.6805, + "step": 20983 + }, + { + "epoch": 0.1247977923684461, + "grad_norm": 1.548684000968933, + "learning_rate": 4.8103239654291444e-05, + "loss": 4.6239, + "step": 20984 + }, + { + "epoch": 0.12480373965172709, + "grad_norm": 1.481542944908142, + "learning_rate": 4.810306118162806e-05, + "loss": 4.981, + "step": 20985 + }, + { + "epoch": 0.12480968693500809, + "grad_norm": 1.423977017402649, + "learning_rate": 4.810288270089963e-05, + "loss": 5.1813, + "step": 20986 + }, + { + "epoch": 0.12481563421828909, + "grad_norm": 1.2712557315826416, + "learning_rate": 4.810270421210623e-05, + "loss": 5.1499, + "step": 20987 + }, + { + "epoch": 0.12482158150157008, + "grad_norm": 1.4444210529327393, + "learning_rate": 4.810252571524791e-05, + "loss": 5.1801, + "step": 20988 + }, + { + "epoch": 0.12482752878485108, + "grad_norm": 1.2743985652923584, + "learning_rate": 4.810234721032475e-05, + "loss": 5.1433, + "step": 20989 + }, + { + "epoch": 0.12483347606813208, + "grad_norm": 1.4066376686096191, + "learning_rate": 4.810216869733679e-05, + "loss": 5.1821, + "step": 20990 + }, + { + "epoch": 0.12483942335141307, + "grad_norm": 1.362889051437378, + "learning_rate": 4.81019901762841e-05, + "loss": 5.2135, + "step": 20991 + }, + { + "epoch": 0.12484537063469407, + "grad_norm": 1.2178412675857544, + "learning_rate": 4.810181164716674e-05, + "loss": 5.3131, + "step": 20992 + }, + { + "epoch": 0.12485131791797507, + "grad_norm": 1.7444922924041748, + "learning_rate": 4.8101633109984786e-05, + "loss": 4.8666, + "step": 20993 + }, + { + "epoch": 0.12485726520125606, + "grad_norm": 1.4151227474212646, + "learning_rate": 4.810145456473828e-05, + "loss": 5.0585, + "step": 20994 + }, + { + "epoch": 0.12486321248453706, + "grad_norm": 1.2906028032302856, + "learning_rate": 4.81012760114273e-05, + "loss": 5.1402, + "step": 20995 + }, + { + "epoch": 0.12486915976781807, + "grad_norm": 1.4265183210372925, + "learning_rate": 4.8101097450051906e-05, + "loss": 5.184, + "step": 20996 + }, + { + "epoch": 0.12487510705109905, + "grad_norm": 1.499804139137268, + "learning_rate": 4.8100918880612154e-05, + "loss": 4.9952, + "step": 20997 + }, + { + "epoch": 0.12488105433438006, + "grad_norm": 1.5296711921691895, + "learning_rate": 4.810074030310812e-05, + "loss": 4.9743, + "step": 20998 + }, + { + "epoch": 0.12488700161766106, + "grad_norm": 1.4345946311950684, + "learning_rate": 4.810056171753984e-05, + "loss": 4.9107, + "step": 20999 + }, + { + "epoch": 0.12489294890094205, + "grad_norm": 1.501966953277588, + "learning_rate": 4.81003831239074e-05, + "loss": 4.8123, + "step": 21000 + }, + { + "epoch": 0.12489889618422305, + "grad_norm": 1.1865864992141724, + "learning_rate": 4.810020452221086e-05, + "loss": 5.1614, + "step": 21001 + }, + { + "epoch": 0.12490484346750405, + "grad_norm": 1.345996379852295, + "learning_rate": 4.810002591245027e-05, + "loss": 4.9784, + "step": 21002 + }, + { + "epoch": 0.12491079075078504, + "grad_norm": 1.2252000570297241, + "learning_rate": 4.80998472946257e-05, + "loss": 4.9433, + "step": 21003 + }, + { + "epoch": 0.12491673803406604, + "grad_norm": 1.4540387392044067, + "learning_rate": 4.809966866873722e-05, + "loss": 4.8608, + "step": 21004 + }, + { + "epoch": 0.12492268531734703, + "grad_norm": 1.382969617843628, + "learning_rate": 4.809949003478488e-05, + "loss": 4.8168, + "step": 21005 + }, + { + "epoch": 0.12492863260062803, + "grad_norm": 1.3642408847808838, + "learning_rate": 4.809931139276874e-05, + "loss": 4.9262, + "step": 21006 + }, + { + "epoch": 0.12493457988390903, + "grad_norm": 1.1903620958328247, + "learning_rate": 4.809913274268887e-05, + "loss": 5.1817, + "step": 21007 + }, + { + "epoch": 0.12494052716719002, + "grad_norm": 1.3020774126052856, + "learning_rate": 4.809895408454534e-05, + "loss": 4.956, + "step": 21008 + }, + { + "epoch": 0.12494647445047102, + "grad_norm": 1.3209398984909058, + "learning_rate": 4.80987754183382e-05, + "loss": 4.9542, + "step": 21009 + }, + { + "epoch": 0.12495242173375203, + "grad_norm": 1.2684825658798218, + "learning_rate": 4.809859674406752e-05, + "loss": 5.2919, + "step": 21010 + }, + { + "epoch": 0.12495836901703301, + "grad_norm": 1.271053671836853, + "learning_rate": 4.809841806173335e-05, + "loss": 5.1397, + "step": 21011 + }, + { + "epoch": 0.12496431630031402, + "grad_norm": 1.2137185335159302, + "learning_rate": 4.809823937133576e-05, + "loss": 5.1874, + "step": 21012 + }, + { + "epoch": 0.12497026358359502, + "grad_norm": 1.2429122924804688, + "learning_rate": 4.8098060672874825e-05, + "loss": 5.0626, + "step": 21013 + }, + { + "epoch": 0.124976210866876, + "grad_norm": 1.3292062282562256, + "learning_rate": 4.809788196635058e-05, + "loss": 4.9019, + "step": 21014 + }, + { + "epoch": 0.12498215815015701, + "grad_norm": 1.3801854848861694, + "learning_rate": 4.8097703251763115e-05, + "loss": 4.8948, + "step": 21015 + }, + { + "epoch": 0.12498810543343801, + "grad_norm": 1.1259671449661255, + "learning_rate": 4.8097524529112484e-05, + "loss": 4.8041, + "step": 21016 + }, + { + "epoch": 0.124994052716719, + "grad_norm": 1.145451307296753, + "learning_rate": 4.809734579839873e-05, + "loss": 5.0012, + "step": 21017 + }, + { + "epoch": 0.125, + "grad_norm": 2.0128631591796875, + "learning_rate": 4.8097167059621945e-05, + "loss": 5.5174, + "step": 21018 + }, + { + "epoch": 0.125005947283281, + "grad_norm": 1.2371736764907837, + "learning_rate": 4.8096988312782174e-05, + "loss": 4.9491, + "step": 21019 + }, + { + "epoch": 0.125011894566562, + "grad_norm": 1.4009771347045898, + "learning_rate": 4.809680955787948e-05, + "loss": 4.8699, + "step": 21020 + }, + { + "epoch": 0.125017841849843, + "grad_norm": 1.2181386947631836, + "learning_rate": 4.809663079491393e-05, + "loss": 4.8258, + "step": 21021 + }, + { + "epoch": 0.12502378913312398, + "grad_norm": 1.3663759231567383, + "learning_rate": 4.809645202388559e-05, + "loss": 5.085, + "step": 21022 + }, + { + "epoch": 0.125029736416405, + "grad_norm": 1.4783004522323608, + "learning_rate": 4.809627324479451e-05, + "loss": 5.0309, + "step": 21023 + }, + { + "epoch": 0.12503568369968598, + "grad_norm": 1.5568218231201172, + "learning_rate": 4.809609445764076e-05, + "loss": 5.217, + "step": 21024 + }, + { + "epoch": 0.12504163098296697, + "grad_norm": 1.42091965675354, + "learning_rate": 4.80959156624244e-05, + "loss": 5.1213, + "step": 21025 + }, + { + "epoch": 0.125047578266248, + "grad_norm": 1.5361231565475464, + "learning_rate": 4.8095736859145504e-05, + "loss": 5.1539, + "step": 21026 + }, + { + "epoch": 0.12505352554952898, + "grad_norm": 1.4799479246139526, + "learning_rate": 4.809555804780411e-05, + "loss": 5.0524, + "step": 21027 + }, + { + "epoch": 0.12505947283280996, + "grad_norm": 1.379309892654419, + "learning_rate": 4.809537922840031e-05, + "loss": 4.8477, + "step": 21028 + }, + { + "epoch": 0.12506542011609098, + "grad_norm": 1.3503345251083374, + "learning_rate": 4.809520040093415e-05, + "loss": 5.3253, + "step": 21029 + }, + { + "epoch": 0.12507136739937197, + "grad_norm": 1.1925950050354004, + "learning_rate": 4.8095021565405684e-05, + "loss": 5.2129, + "step": 21030 + }, + { + "epoch": 0.12507731468265296, + "grad_norm": 1.433516025543213, + "learning_rate": 4.809484272181499e-05, + "loss": 5.1091, + "step": 21031 + }, + { + "epoch": 0.12508326196593397, + "grad_norm": 1.3334667682647705, + "learning_rate": 4.809466387016213e-05, + "loss": 5.3445, + "step": 21032 + }, + { + "epoch": 0.12508920924921496, + "grad_norm": 1.270871877670288, + "learning_rate": 4.809448501044715e-05, + "loss": 5.1455, + "step": 21033 + }, + { + "epoch": 0.12509515653249595, + "grad_norm": 1.2028634548187256, + "learning_rate": 4.8094306142670145e-05, + "loss": 5.1721, + "step": 21034 + }, + { + "epoch": 0.12510110381577697, + "grad_norm": 1.537757396697998, + "learning_rate": 4.809412726683114e-05, + "loss": 5.1853, + "step": 21035 + }, + { + "epoch": 0.12510705109905795, + "grad_norm": 1.3350294828414917, + "learning_rate": 4.809394838293021e-05, + "loss": 5.0725, + "step": 21036 + }, + { + "epoch": 0.12511299838233894, + "grad_norm": 1.3986246585845947, + "learning_rate": 4.8093769490967434e-05, + "loss": 5.1176, + "step": 21037 + }, + { + "epoch": 0.12511894566561996, + "grad_norm": 1.3993934392929077, + "learning_rate": 4.809359059094285e-05, + "loss": 5.1085, + "step": 21038 + }, + { + "epoch": 0.12512489294890095, + "grad_norm": 1.6875231266021729, + "learning_rate": 4.8093411682856535e-05, + "loss": 5.134, + "step": 21039 + }, + { + "epoch": 0.12513084023218193, + "grad_norm": 1.2966142892837524, + "learning_rate": 4.809323276670855e-05, + "loss": 5.1509, + "step": 21040 + }, + { + "epoch": 0.12513678751546295, + "grad_norm": 1.3994536399841309, + "learning_rate": 4.8093053842498956e-05, + "loss": 4.8962, + "step": 21041 + }, + { + "epoch": 0.12514273479874394, + "grad_norm": 1.3936022520065308, + "learning_rate": 4.809287491022782e-05, + "loss": 4.908, + "step": 21042 + }, + { + "epoch": 0.12514868208202493, + "grad_norm": 1.9262713193893433, + "learning_rate": 4.80926959698952e-05, + "loss": 5.0856, + "step": 21043 + }, + { + "epoch": 0.12515462936530594, + "grad_norm": 1.3765772581100464, + "learning_rate": 4.809251702150115e-05, + "loss": 5.0438, + "step": 21044 + }, + { + "epoch": 0.12516057664858693, + "grad_norm": 1.4509775638580322, + "learning_rate": 4.809233806504575e-05, + "loss": 5.2001, + "step": 21045 + }, + { + "epoch": 0.12516652393186792, + "grad_norm": 1.6581740379333496, + "learning_rate": 4.809215910052904e-05, + "loss": 4.7155, + "step": 21046 + }, + { + "epoch": 0.12517247121514893, + "grad_norm": 1.5386825799942017, + "learning_rate": 4.8091980127951115e-05, + "loss": 4.6354, + "step": 21047 + }, + { + "epoch": 0.12517841849842992, + "grad_norm": 1.3021749258041382, + "learning_rate": 4.8091801147312e-05, + "loss": 5.2241, + "step": 21048 + }, + { + "epoch": 0.1251843657817109, + "grad_norm": 1.3396178483963013, + "learning_rate": 4.809162215861179e-05, + "loss": 5.2361, + "step": 21049 + }, + { + "epoch": 0.1251903130649919, + "grad_norm": 1.381496548652649, + "learning_rate": 4.809144316185052e-05, + "loss": 5.3347, + "step": 21050 + }, + { + "epoch": 0.12519626034827291, + "grad_norm": 1.4430748224258423, + "learning_rate": 4.809126415702828e-05, + "loss": 4.895, + "step": 21051 + }, + { + "epoch": 0.1252022076315539, + "grad_norm": 1.2426742315292358, + "learning_rate": 4.809108514414511e-05, + "loss": 4.9085, + "step": 21052 + }, + { + "epoch": 0.1252081549148349, + "grad_norm": 1.224529504776001, + "learning_rate": 4.8090906123201085e-05, + "loss": 5.1997, + "step": 21053 + }, + { + "epoch": 0.1252141021981159, + "grad_norm": 1.295866847038269, + "learning_rate": 4.809072709419626e-05, + "loss": 5.5419, + "step": 21054 + }, + { + "epoch": 0.1252200494813969, + "grad_norm": 1.7327667474746704, + "learning_rate": 4.80905480571307e-05, + "loss": 5.1902, + "step": 21055 + }, + { + "epoch": 0.12522599676467788, + "grad_norm": 1.4727381467819214, + "learning_rate": 4.809036901200447e-05, + "loss": 4.9909, + "step": 21056 + }, + { + "epoch": 0.1252319440479589, + "grad_norm": 1.5449626445770264, + "learning_rate": 4.8090189958817626e-05, + "loss": 4.8721, + "step": 21057 + }, + { + "epoch": 0.1252378913312399, + "grad_norm": 1.563591718673706, + "learning_rate": 4.809001089757024e-05, + "loss": 5.0417, + "step": 21058 + }, + { + "epoch": 0.12524383861452087, + "grad_norm": 1.3692893981933594, + "learning_rate": 4.808983182826237e-05, + "loss": 4.9748, + "step": 21059 + }, + { + "epoch": 0.1252497858978019, + "grad_norm": 1.3994625806808472, + "learning_rate": 4.8089652750894074e-05, + "loss": 5.1823, + "step": 21060 + }, + { + "epoch": 0.12525573318108288, + "grad_norm": 1.3998682498931885, + "learning_rate": 4.8089473665465425e-05, + "loss": 5.2272, + "step": 21061 + }, + { + "epoch": 0.12526168046436387, + "grad_norm": 1.4436434507369995, + "learning_rate": 4.808929457197647e-05, + "loss": 5.4049, + "step": 21062 + }, + { + "epoch": 0.12526762774764488, + "grad_norm": 1.2826770544052124, + "learning_rate": 4.8089115470427294e-05, + "loss": 5.2065, + "step": 21063 + }, + { + "epoch": 0.12527357503092587, + "grad_norm": 1.4545691013336182, + "learning_rate": 4.808893636081794e-05, + "loss": 5.1212, + "step": 21064 + }, + { + "epoch": 0.12527952231420686, + "grad_norm": 1.70439875125885, + "learning_rate": 4.808875724314847e-05, + "loss": 4.9993, + "step": 21065 + }, + { + "epoch": 0.12528546959748788, + "grad_norm": 1.5612056255340576, + "learning_rate": 4.8088578117418965e-05, + "loss": 5.1109, + "step": 21066 + }, + { + "epoch": 0.12529141688076886, + "grad_norm": 1.3385684490203857, + "learning_rate": 4.808839898362947e-05, + "loss": 5.3485, + "step": 21067 + }, + { + "epoch": 0.12529736416404985, + "grad_norm": 1.4440029859542847, + "learning_rate": 4.808821984178006e-05, + "loss": 5.3289, + "step": 21068 + }, + { + "epoch": 0.12530331144733087, + "grad_norm": 1.4780069589614868, + "learning_rate": 4.808804069187078e-05, + "loss": 5.4379, + "step": 21069 + }, + { + "epoch": 0.12530925873061186, + "grad_norm": 1.4137150049209595, + "learning_rate": 4.808786153390171e-05, + "loss": 5.4666, + "step": 21070 + }, + { + "epoch": 0.12531520601389284, + "grad_norm": 1.3870670795440674, + "learning_rate": 4.80876823678729e-05, + "loss": 5.4342, + "step": 21071 + }, + { + "epoch": 0.12532115329717386, + "grad_norm": 1.3641326427459717, + "learning_rate": 4.808750319378442e-05, + "loss": 5.148, + "step": 21072 + }, + { + "epoch": 0.12532710058045485, + "grad_norm": 1.3099322319030762, + "learning_rate": 4.808732401163634e-05, + "loss": 5.1237, + "step": 21073 + }, + { + "epoch": 0.12533304786373584, + "grad_norm": 1.4198615550994873, + "learning_rate": 4.808714482142871e-05, + "loss": 5.5755, + "step": 21074 + }, + { + "epoch": 0.12533899514701685, + "grad_norm": 1.1760785579681396, + "learning_rate": 4.80869656231616e-05, + "loss": 5.5684, + "step": 21075 + }, + { + "epoch": 0.12534494243029784, + "grad_norm": 1.2611156702041626, + "learning_rate": 4.8086786416835054e-05, + "loss": 5.3834, + "step": 21076 + }, + { + "epoch": 0.12535088971357883, + "grad_norm": 1.085659384727478, + "learning_rate": 4.808660720244916e-05, + "loss": 5.2553, + "step": 21077 + }, + { + "epoch": 0.12535683699685984, + "grad_norm": 1.2537906169891357, + "learning_rate": 4.808642798000397e-05, + "loss": 5.3423, + "step": 21078 + }, + { + "epoch": 0.12536278428014083, + "grad_norm": 1.0891891717910767, + "learning_rate": 4.808624874949954e-05, + "loss": 5.4889, + "step": 21079 + }, + { + "epoch": 0.12536873156342182, + "grad_norm": 1.976110577583313, + "learning_rate": 4.808606951093595e-05, + "loss": 5.6103, + "step": 21080 + }, + { + "epoch": 0.12537467884670284, + "grad_norm": 1.3253698348999023, + "learning_rate": 4.808589026431324e-05, + "loss": 5.4673, + "step": 21081 + }, + { + "epoch": 0.12538062612998382, + "grad_norm": 1.4394372701644897, + "learning_rate": 4.808571100963149e-05, + "loss": 5.5256, + "step": 21082 + }, + { + "epoch": 0.1253865734132648, + "grad_norm": 1.45836341381073, + "learning_rate": 4.808553174689076e-05, + "loss": 4.5206, + "step": 21083 + }, + { + "epoch": 0.12539252069654583, + "grad_norm": 1.5719448328018188, + "learning_rate": 4.8085352476091105e-05, + "loss": 4.0577, + "step": 21084 + }, + { + "epoch": 0.12539846797982682, + "grad_norm": 1.3744319677352905, + "learning_rate": 4.808517319723259e-05, + "loss": 4.3965, + "step": 21085 + }, + { + "epoch": 0.1254044152631078, + "grad_norm": 1.4404634237289429, + "learning_rate": 4.8084993910315286e-05, + "loss": 4.3534, + "step": 21086 + }, + { + "epoch": 0.12541036254638882, + "grad_norm": 1.696215033531189, + "learning_rate": 4.8084814615339244e-05, + "loss": 5.4743, + "step": 21087 + }, + { + "epoch": 0.1254163098296698, + "grad_norm": 2.3401246070861816, + "learning_rate": 4.808463531230454e-05, + "loss": 4.3249, + "step": 21088 + }, + { + "epoch": 0.1254222571129508, + "grad_norm": 2.673963785171509, + "learning_rate": 4.808445600121122e-05, + "loss": 4.0038, + "step": 21089 + }, + { + "epoch": 0.1254282043962318, + "grad_norm": 2.551712989807129, + "learning_rate": 4.808427668205935e-05, + "loss": 4.0593, + "step": 21090 + }, + { + "epoch": 0.1254341516795128, + "grad_norm": 2.224776029586792, + "learning_rate": 4.8084097354849004e-05, + "loss": 4.4923, + "step": 21091 + }, + { + "epoch": 0.1254400989627938, + "grad_norm": 2.8964626789093018, + "learning_rate": 4.808391801958024e-05, + "loss": 4.8955, + "step": 21092 + }, + { + "epoch": 0.1254460462460748, + "grad_norm": 2.647202491760254, + "learning_rate": 4.808373867625312e-05, + "loss": 4.315, + "step": 21093 + }, + { + "epoch": 0.1254519935293558, + "grad_norm": 2.852851152420044, + "learning_rate": 4.80835593248677e-05, + "loss": 4.6153, + "step": 21094 + }, + { + "epoch": 0.12545794081263678, + "grad_norm": 1.5732487440109253, + "learning_rate": 4.808337996542405e-05, + "loss": 5.7685, + "step": 21095 + }, + { + "epoch": 0.1254638880959178, + "grad_norm": 1.764635682106018, + "learning_rate": 4.808320059792223e-05, + "loss": 5.8056, + "step": 21096 + }, + { + "epoch": 0.12546983537919879, + "grad_norm": 3.040402889251709, + "learning_rate": 4.80830212223623e-05, + "loss": 4.3029, + "step": 21097 + }, + { + "epoch": 0.12547578266247977, + "grad_norm": 2.3675732612609863, + "learning_rate": 4.8082841838744335e-05, + "loss": 4.2356, + "step": 21098 + }, + { + "epoch": 0.1254817299457608, + "grad_norm": 2.153254747390747, + "learning_rate": 4.808266244706838e-05, + "loss": 4.1071, + "step": 21099 + }, + { + "epoch": 0.12548767722904178, + "grad_norm": 2.181788921356201, + "learning_rate": 4.808248304733451e-05, + "loss": 4.1941, + "step": 21100 + }, + { + "epoch": 0.12549362451232277, + "grad_norm": 2.416555881500244, + "learning_rate": 4.808230363954278e-05, + "loss": 4.0926, + "step": 21101 + }, + { + "epoch": 0.12549957179560378, + "grad_norm": 1.7010666131973267, + "learning_rate": 4.808212422369327e-05, + "loss": 5.3639, + "step": 21102 + }, + { + "epoch": 0.12550551907888477, + "grad_norm": 1.4592742919921875, + "learning_rate": 4.808194479978601e-05, + "loss": 5.5641, + "step": 21103 + }, + { + "epoch": 0.12551146636216576, + "grad_norm": 1.5593754053115845, + "learning_rate": 4.808176536782109e-05, + "loss": 5.4008, + "step": 21104 + }, + { + "epoch": 0.12551741364544677, + "grad_norm": 1.7061179876327515, + "learning_rate": 4.8081585927798565e-05, + "loss": 5.6922, + "step": 21105 + }, + { + "epoch": 0.12552336092872776, + "grad_norm": 1.8220082521438599, + "learning_rate": 4.808140647971849e-05, + "loss": 5.4052, + "step": 21106 + }, + { + "epoch": 0.12552930821200875, + "grad_norm": 1.5218451023101807, + "learning_rate": 4.808122702358095e-05, + "loss": 5.4067, + "step": 21107 + }, + { + "epoch": 0.12553525549528974, + "grad_norm": 1.6590322256088257, + "learning_rate": 4.808104755938598e-05, + "loss": 5.5558, + "step": 21108 + }, + { + "epoch": 0.12554120277857075, + "grad_norm": 1.751290202140808, + "learning_rate": 4.808086808713366e-05, + "loss": 5.5584, + "step": 21109 + }, + { + "epoch": 0.12554715006185174, + "grad_norm": 1.6635403633117676, + "learning_rate": 4.8080688606824035e-05, + "loss": 5.4828, + "step": 21110 + }, + { + "epoch": 0.12555309734513273, + "grad_norm": 1.4710462093353271, + "learning_rate": 4.80805091184572e-05, + "loss": 5.4251, + "step": 21111 + }, + { + "epoch": 0.12555904462841375, + "grad_norm": 1.7598154544830322, + "learning_rate": 4.808032962203318e-05, + "loss": 5.5093, + "step": 21112 + }, + { + "epoch": 0.12556499191169473, + "grad_norm": 1.5128235816955566, + "learning_rate": 4.8080150117552057e-05, + "loss": 5.5069, + "step": 21113 + }, + { + "epoch": 0.12557093919497572, + "grad_norm": 1.5336002111434937, + "learning_rate": 4.80799706050139e-05, + "loss": 5.461, + "step": 21114 + }, + { + "epoch": 0.12557688647825674, + "grad_norm": 1.80903160572052, + "learning_rate": 4.807979108441876e-05, + "loss": 5.5894, + "step": 21115 + }, + { + "epoch": 0.12558283376153773, + "grad_norm": 1.8075919151306152, + "learning_rate": 4.8079611555766706e-05, + "loss": 5.4132, + "step": 21116 + }, + { + "epoch": 0.12558878104481871, + "grad_norm": 1.8319743871688843, + "learning_rate": 4.8079432019057794e-05, + "loss": 5.4409, + "step": 21117 + }, + { + "epoch": 0.12559472832809973, + "grad_norm": 1.7753643989562988, + "learning_rate": 4.8079252474292095e-05, + "loss": 5.425, + "step": 21118 + }, + { + "epoch": 0.12560067561138072, + "grad_norm": 1.614693522453308, + "learning_rate": 4.807907292146967e-05, + "loss": 5.2583, + "step": 21119 + }, + { + "epoch": 0.1256066228946617, + "grad_norm": 1.7520705461502075, + "learning_rate": 4.807889336059057e-05, + "loss": 5.5297, + "step": 21120 + }, + { + "epoch": 0.12561257017794272, + "grad_norm": 1.478826642036438, + "learning_rate": 4.8078713791654875e-05, + "loss": 5.8051, + "step": 21121 + }, + { + "epoch": 0.1256185174612237, + "grad_norm": 1.5645164251327515, + "learning_rate": 4.807853421466263e-05, + "loss": 5.6658, + "step": 21122 + }, + { + "epoch": 0.1256244647445047, + "grad_norm": 1.6254135370254517, + "learning_rate": 4.807835462961392e-05, + "loss": 5.2885, + "step": 21123 + }, + { + "epoch": 0.12563041202778572, + "grad_norm": 1.4290140867233276, + "learning_rate": 4.807817503650879e-05, + "loss": 5.6284, + "step": 21124 + }, + { + "epoch": 0.1256363593110667, + "grad_norm": 1.541447401046753, + "learning_rate": 4.8077995435347304e-05, + "loss": 5.8538, + "step": 21125 + }, + { + "epoch": 0.1256423065943477, + "grad_norm": 1.4778785705566406, + "learning_rate": 4.8077815826129526e-05, + "loss": 5.7019, + "step": 21126 + }, + { + "epoch": 0.1256482538776287, + "grad_norm": 1.5369840860366821, + "learning_rate": 4.807763620885552e-05, + "loss": 5.7164, + "step": 21127 + }, + { + "epoch": 0.1256542011609097, + "grad_norm": 1.5266817808151245, + "learning_rate": 4.807745658352536e-05, + "loss": 5.6203, + "step": 21128 + }, + { + "epoch": 0.12566014844419068, + "grad_norm": 1.4452829360961914, + "learning_rate": 4.8077276950139085e-05, + "loss": 5.7994, + "step": 21129 + }, + { + "epoch": 0.1256660957274717, + "grad_norm": 1.3619974851608276, + "learning_rate": 4.8077097308696786e-05, + "loss": 5.6703, + "step": 21130 + }, + { + "epoch": 0.1256720430107527, + "grad_norm": 1.1146374940872192, + "learning_rate": 4.80769176591985e-05, + "loss": 5.6631, + "step": 21131 + }, + { + "epoch": 0.12567799029403368, + "grad_norm": 1.2224622964859009, + "learning_rate": 4.8076738001644305e-05, + "loss": 5.5511, + "step": 21132 + }, + { + "epoch": 0.1256839375773147, + "grad_norm": 1.530564308166504, + "learning_rate": 4.807655833603426e-05, + "loss": 5.6201, + "step": 21133 + }, + { + "epoch": 0.12568988486059568, + "grad_norm": 1.5123308897018433, + "learning_rate": 4.807637866236842e-05, + "loss": 5.3411, + "step": 21134 + }, + { + "epoch": 0.12569583214387667, + "grad_norm": 1.4682310819625854, + "learning_rate": 4.807619898064686e-05, + "loss": 5.7009, + "step": 21135 + }, + { + "epoch": 0.12570177942715768, + "grad_norm": 1.7714731693267822, + "learning_rate": 4.8076019290869634e-05, + "loss": 5.8286, + "step": 21136 + }, + { + "epoch": 0.12570772671043867, + "grad_norm": 1.6663479804992676, + "learning_rate": 4.8075839593036814e-05, + "loss": 5.8158, + "step": 21137 + }, + { + "epoch": 0.12571367399371966, + "grad_norm": 1.458070158958435, + "learning_rate": 4.8075659887148454e-05, + "loss": 5.6954, + "step": 21138 + }, + { + "epoch": 0.12571962127700068, + "grad_norm": 2.572174072265625, + "learning_rate": 4.807548017320462e-05, + "loss": 4.715, + "step": 21139 + }, + { + "epoch": 0.12572556856028166, + "grad_norm": 2.4615628719329834, + "learning_rate": 4.8075300451205375e-05, + "loss": 4.8458, + "step": 21140 + }, + { + "epoch": 0.12573151584356265, + "grad_norm": 2.193739175796509, + "learning_rate": 4.807512072115078e-05, + "loss": 4.8746, + "step": 21141 + }, + { + "epoch": 0.12573746312684367, + "grad_norm": 1.9279803037643433, + "learning_rate": 4.80749409830409e-05, + "loss": 5.3174, + "step": 21142 + }, + { + "epoch": 0.12574341041012466, + "grad_norm": 2.0332345962524414, + "learning_rate": 4.807476123687579e-05, + "loss": 4.6696, + "step": 21143 + }, + { + "epoch": 0.12574935769340564, + "grad_norm": 2.1900224685668945, + "learning_rate": 4.8074581482655525e-05, + "loss": 4.7911, + "step": 21144 + }, + { + "epoch": 0.12575530497668666, + "grad_norm": 2.1232707500457764, + "learning_rate": 4.807440172038016e-05, + "loss": 4.4891, + "step": 21145 + }, + { + "epoch": 0.12576125225996765, + "grad_norm": 2.2046613693237305, + "learning_rate": 4.807422195004976e-05, + "loss": 5.1136, + "step": 21146 + }, + { + "epoch": 0.12576719954324864, + "grad_norm": 1.9693876504898071, + "learning_rate": 4.807404217166439e-05, + "loss": 5.7068, + "step": 21147 + }, + { + "epoch": 0.12577314682652965, + "grad_norm": 1.8561034202575684, + "learning_rate": 4.807386238522411e-05, + "loss": 5.6435, + "step": 21148 + }, + { + "epoch": 0.12577909410981064, + "grad_norm": 1.7676606178283691, + "learning_rate": 4.8073682590728974e-05, + "loss": 5.0934, + "step": 21149 + }, + { + "epoch": 0.12578504139309163, + "grad_norm": 1.729425311088562, + "learning_rate": 4.8073502788179064e-05, + "loss": 5.4891, + "step": 21150 + }, + { + "epoch": 0.12579098867637264, + "grad_norm": 1.5410076379776, + "learning_rate": 4.807332297757443e-05, + "loss": 5.919, + "step": 21151 + }, + { + "epoch": 0.12579693595965363, + "grad_norm": 1.5089081525802612, + "learning_rate": 4.8073143158915134e-05, + "loss": 5.9701, + "step": 21152 + }, + { + "epoch": 0.12580288324293462, + "grad_norm": 1.476559042930603, + "learning_rate": 4.807296333220125e-05, + "loss": 5.7351, + "step": 21153 + }, + { + "epoch": 0.12580883052621564, + "grad_norm": 2.055143117904663, + "learning_rate": 4.807278349743283e-05, + "loss": 5.4949, + "step": 21154 + }, + { + "epoch": 0.12581477780949663, + "grad_norm": 1.5232601165771484, + "learning_rate": 4.807260365460994e-05, + "loss": 5.3052, + "step": 21155 + }, + { + "epoch": 0.1258207250927776, + "grad_norm": 1.832310676574707, + "learning_rate": 4.807242380373264e-05, + "loss": 5.2832, + "step": 21156 + }, + { + "epoch": 0.12582667237605863, + "grad_norm": 1.8327937126159668, + "learning_rate": 4.807224394480099e-05, + "loss": 5.482, + "step": 21157 + }, + { + "epoch": 0.12583261965933962, + "grad_norm": 1.7728074789047241, + "learning_rate": 4.8072064077815065e-05, + "loss": 5.2636, + "step": 21158 + }, + { + "epoch": 0.1258385669426206, + "grad_norm": 1.6927982568740845, + "learning_rate": 4.8071884202774916e-05, + "loss": 5.369, + "step": 21159 + }, + { + "epoch": 0.12584451422590162, + "grad_norm": 1.8296928405761719, + "learning_rate": 4.8071704319680616e-05, + "loss": 5.4939, + "step": 21160 + }, + { + "epoch": 0.1258504615091826, + "grad_norm": 1.5497393608093262, + "learning_rate": 4.8071524428532224e-05, + "loss": 5.1909, + "step": 21161 + }, + { + "epoch": 0.1258564087924636, + "grad_norm": 1.8332972526550293, + "learning_rate": 4.807134452932979e-05, + "loss": 5.1555, + "step": 21162 + }, + { + "epoch": 0.1258623560757446, + "grad_norm": 1.856772780418396, + "learning_rate": 4.80711646220734e-05, + "loss": 5.1182, + "step": 21163 + }, + { + "epoch": 0.1258683033590256, + "grad_norm": 1.6313568353652954, + "learning_rate": 4.80709847067631e-05, + "loss": 5.0921, + "step": 21164 + }, + { + "epoch": 0.1258742506423066, + "grad_norm": 1.6753991842269897, + "learning_rate": 4.807080478339896e-05, + "loss": 5.1176, + "step": 21165 + }, + { + "epoch": 0.12588019792558758, + "grad_norm": 1.554154396057129, + "learning_rate": 4.807062485198104e-05, + "loss": 5.0849, + "step": 21166 + }, + { + "epoch": 0.1258861452088686, + "grad_norm": 1.9408693313598633, + "learning_rate": 4.8070444912509394e-05, + "loss": 4.9181, + "step": 21167 + }, + { + "epoch": 0.12589209249214958, + "grad_norm": 1.7222824096679688, + "learning_rate": 4.80702649649841e-05, + "loss": 5.6235, + "step": 21168 + }, + { + "epoch": 0.12589803977543057, + "grad_norm": 1.8301146030426025, + "learning_rate": 4.807008500940522e-05, + "loss": 5.3885, + "step": 21169 + }, + { + "epoch": 0.1259039870587116, + "grad_norm": 1.7527635097503662, + "learning_rate": 4.806990504577281e-05, + "loss": 5.3772, + "step": 21170 + }, + { + "epoch": 0.12590993434199257, + "grad_norm": 1.7983075380325317, + "learning_rate": 4.806972507408693e-05, + "loss": 5.7616, + "step": 21171 + }, + { + "epoch": 0.12591588162527356, + "grad_norm": 1.6842983961105347, + "learning_rate": 4.8069545094347653e-05, + "loss": 5.8808, + "step": 21172 + }, + { + "epoch": 0.12592182890855458, + "grad_norm": 1.8382412195205688, + "learning_rate": 4.806936510655503e-05, + "loss": 5.4304, + "step": 21173 + }, + { + "epoch": 0.12592777619183557, + "grad_norm": 1.833301305770874, + "learning_rate": 4.8069185110709133e-05, + "loss": 5.4221, + "step": 21174 + }, + { + "epoch": 0.12593372347511655, + "grad_norm": 1.52051842212677, + "learning_rate": 4.8069005106810025e-05, + "loss": 5.4133, + "step": 21175 + }, + { + "epoch": 0.12593967075839757, + "grad_norm": 1.5269474983215332, + "learning_rate": 4.806882509485776e-05, + "loss": 5.5549, + "step": 21176 + }, + { + "epoch": 0.12594561804167856, + "grad_norm": 1.8116832971572876, + "learning_rate": 4.806864507485241e-05, + "loss": 5.2989, + "step": 21177 + }, + { + "epoch": 0.12595156532495955, + "grad_norm": 1.7355883121490479, + "learning_rate": 4.806846504679403e-05, + "loss": 5.3839, + "step": 21178 + }, + { + "epoch": 0.12595751260824056, + "grad_norm": 1.7445424795150757, + "learning_rate": 4.806828501068269e-05, + "loss": 4.982, + "step": 21179 + }, + { + "epoch": 0.12596345989152155, + "grad_norm": 2.445030689239502, + "learning_rate": 4.806810496651845e-05, + "loss": 4.2665, + "step": 21180 + }, + { + "epoch": 0.12596940717480254, + "grad_norm": 2.6840837001800537, + "learning_rate": 4.8067924914301377e-05, + "loss": 3.9739, + "step": 21181 + }, + { + "epoch": 0.12597535445808355, + "grad_norm": 2.431506872177124, + "learning_rate": 4.806774485403153e-05, + "loss": 3.9235, + "step": 21182 + }, + { + "epoch": 0.12598130174136454, + "grad_norm": 3.124319076538086, + "learning_rate": 4.806756478570896e-05, + "loss": 3.7692, + "step": 21183 + }, + { + "epoch": 0.12598724902464553, + "grad_norm": 2.8702549934387207, + "learning_rate": 4.806738470933375e-05, + "loss": 3.6848, + "step": 21184 + }, + { + "epoch": 0.12599319630792655, + "grad_norm": 2.6687517166137695, + "learning_rate": 4.8067204624905954e-05, + "loss": 3.5655, + "step": 21185 + }, + { + "epoch": 0.12599914359120754, + "grad_norm": 2.3944084644317627, + "learning_rate": 4.806702453242563e-05, + "loss": 3.6176, + "step": 21186 + }, + { + "epoch": 0.12600509087448852, + "grad_norm": 2.565718173980713, + "learning_rate": 4.8066844431892856e-05, + "loss": 3.6557, + "step": 21187 + }, + { + "epoch": 0.12601103815776954, + "grad_norm": 2.9165117740631104, + "learning_rate": 4.806666432330768e-05, + "loss": 3.4013, + "step": 21188 + }, + { + "epoch": 0.12601698544105053, + "grad_norm": 3.232210397720337, + "learning_rate": 4.806648420667017e-05, + "loss": 4.8954, + "step": 21189 + }, + { + "epoch": 0.12602293272433152, + "grad_norm": 3.2784297466278076, + "learning_rate": 4.8066304081980384e-05, + "loss": 4.7801, + "step": 21190 + }, + { + "epoch": 0.12602888000761253, + "grad_norm": 2.8707523345947266, + "learning_rate": 4.8066123949238396e-05, + "loss": 4.7461, + "step": 21191 + }, + { + "epoch": 0.12603482729089352, + "grad_norm": 2.3808538913726807, + "learning_rate": 4.8065943808444255e-05, + "loss": 4.5148, + "step": 21192 + }, + { + "epoch": 0.1260407745741745, + "grad_norm": 2.2710814476013184, + "learning_rate": 4.806576365959804e-05, + "loss": 4.522, + "step": 21193 + }, + { + "epoch": 0.12604672185745552, + "grad_norm": 2.2108187675476074, + "learning_rate": 4.80655835026998e-05, + "loss": 4.7575, + "step": 21194 + }, + { + "epoch": 0.1260526691407365, + "grad_norm": 2.1496641635894775, + "learning_rate": 4.80654033377496e-05, + "loss": 4.6543, + "step": 21195 + }, + { + "epoch": 0.1260586164240175, + "grad_norm": 1.9770373106002808, + "learning_rate": 4.806522316474752e-05, + "loss": 4.59, + "step": 21196 + }, + { + "epoch": 0.12606456370729852, + "grad_norm": 1.8799597024917603, + "learning_rate": 4.80650429836936e-05, + "loss": 4.598, + "step": 21197 + }, + { + "epoch": 0.1260705109905795, + "grad_norm": 1.846724510192871, + "learning_rate": 4.8064862794587903e-05, + "loss": 4.4912, + "step": 21198 + }, + { + "epoch": 0.1260764582738605, + "grad_norm": 1.7821966409683228, + "learning_rate": 4.806468259743051e-05, + "loss": 4.4898, + "step": 21199 + }, + { + "epoch": 0.1260824055571415, + "grad_norm": 1.7804360389709473, + "learning_rate": 4.806450239222148e-05, + "loss": 4.5324, + "step": 21200 + }, + { + "epoch": 0.1260883528404225, + "grad_norm": 1.705761194229126, + "learning_rate": 4.8064322178960864e-05, + "loss": 4.7046, + "step": 21201 + }, + { + "epoch": 0.12609430012370348, + "grad_norm": 2.41103458404541, + "learning_rate": 4.8064141957648726e-05, + "loss": 5.1943, + "step": 21202 + }, + { + "epoch": 0.1261002474069845, + "grad_norm": 2.3028182983398438, + "learning_rate": 4.806396172828515e-05, + "loss": 5.0494, + "step": 21203 + }, + { + "epoch": 0.1261061946902655, + "grad_norm": 2.1674535274505615, + "learning_rate": 4.806378149087016e-05, + "loss": 5.3104, + "step": 21204 + }, + { + "epoch": 0.12611214197354648, + "grad_norm": 1.9217156171798706, + "learning_rate": 4.8063601245403864e-05, + "loss": 5.2403, + "step": 21205 + }, + { + "epoch": 0.1261180892568275, + "grad_norm": 2.097116231918335, + "learning_rate": 4.806342099188629e-05, + "loss": 5.3471, + "step": 21206 + }, + { + "epoch": 0.12612403654010848, + "grad_norm": 1.8356170654296875, + "learning_rate": 4.806324073031751e-05, + "loss": 5.2168, + "step": 21207 + }, + { + "epoch": 0.12612998382338947, + "grad_norm": 2.2306652069091797, + "learning_rate": 4.806306046069761e-05, + "loss": 5.1406, + "step": 21208 + }, + { + "epoch": 0.12613593110667048, + "grad_norm": 1.8946762084960938, + "learning_rate": 4.8062880183026624e-05, + "loss": 5.072, + "step": 21209 + }, + { + "epoch": 0.12614187838995147, + "grad_norm": 2.0963854789733887, + "learning_rate": 4.806269989730462e-05, + "loss": 5.2702, + "step": 21210 + }, + { + "epoch": 0.12614782567323246, + "grad_norm": 1.859677791595459, + "learning_rate": 4.806251960353167e-05, + "loss": 5.1133, + "step": 21211 + }, + { + "epoch": 0.12615377295651348, + "grad_norm": 1.9993607997894287, + "learning_rate": 4.806233930170783e-05, + "loss": 5.1201, + "step": 21212 + }, + { + "epoch": 0.12615972023979447, + "grad_norm": 1.7218701839447021, + "learning_rate": 4.8062158991833176e-05, + "loss": 5.0055, + "step": 21213 + }, + { + "epoch": 0.12616566752307545, + "grad_norm": 1.9172027111053467, + "learning_rate": 4.806197867390775e-05, + "loss": 4.955, + "step": 21214 + }, + { + "epoch": 0.12617161480635647, + "grad_norm": 2.0665276050567627, + "learning_rate": 4.8061798347931627e-05, + "loss": 4.842, + "step": 21215 + }, + { + "epoch": 0.12617756208963746, + "grad_norm": 1.932822346687317, + "learning_rate": 4.806161801390486e-05, + "loss": 4.5687, + "step": 21216 + }, + { + "epoch": 0.12618350937291845, + "grad_norm": 1.7978770732879639, + "learning_rate": 4.806143767182754e-05, + "loss": 4.6994, + "step": 21217 + }, + { + "epoch": 0.12618945665619946, + "grad_norm": 1.9298393726348877, + "learning_rate": 4.80612573216997e-05, + "loss": 4.8935, + "step": 21218 + }, + { + "epoch": 0.12619540393948045, + "grad_norm": 1.8706467151641846, + "learning_rate": 4.806107696352141e-05, + "loss": 4.699, + "step": 21219 + }, + { + "epoch": 0.12620135122276144, + "grad_norm": 1.946582317352295, + "learning_rate": 4.806089659729274e-05, + "loss": 4.9519, + "step": 21220 + }, + { + "epoch": 0.12620729850604245, + "grad_norm": 2.1021311283111572, + "learning_rate": 4.806071622301375e-05, + "loss": 4.8315, + "step": 21221 + }, + { + "epoch": 0.12621324578932344, + "grad_norm": 2.110234022140503, + "learning_rate": 4.8060535840684504e-05, + "loss": 4.6524, + "step": 21222 + }, + { + "epoch": 0.12621919307260443, + "grad_norm": 2.1723785400390625, + "learning_rate": 4.806035545030506e-05, + "loss": 4.7154, + "step": 21223 + }, + { + "epoch": 0.12622514035588542, + "grad_norm": 1.8978101015090942, + "learning_rate": 4.806017505187548e-05, + "loss": 4.6743, + "step": 21224 + }, + { + "epoch": 0.12623108763916643, + "grad_norm": 2.0092225074768066, + "learning_rate": 4.8059994645395833e-05, + "loss": 4.9198, + "step": 21225 + }, + { + "epoch": 0.12623703492244742, + "grad_norm": 1.935624122619629, + "learning_rate": 4.8059814230866184e-05, + "loss": 4.7253, + "step": 21226 + }, + { + "epoch": 0.1262429822057284, + "grad_norm": 1.9758509397506714, + "learning_rate": 4.80596338082866e-05, + "loss": 4.6388, + "step": 21227 + }, + { + "epoch": 0.12624892948900943, + "grad_norm": 2.0389976501464844, + "learning_rate": 4.805945337765712e-05, + "loss": 4.7527, + "step": 21228 + }, + { + "epoch": 0.12625487677229041, + "grad_norm": 2.0781445503234863, + "learning_rate": 4.805927293897783e-05, + "loss": 4.7985, + "step": 21229 + }, + { + "epoch": 0.1262608240555714, + "grad_norm": 2.0403099060058594, + "learning_rate": 4.8059092492248786e-05, + "loss": 5.1442, + "step": 21230 + }, + { + "epoch": 0.12626677133885242, + "grad_norm": 2.141681432723999, + "learning_rate": 4.805891203747005e-05, + "loss": 5.1191, + "step": 21231 + }, + { + "epoch": 0.1262727186221334, + "grad_norm": 2.159761905670166, + "learning_rate": 4.805873157464169e-05, + "loss": 5.2995, + "step": 21232 + }, + { + "epoch": 0.1262786659054144, + "grad_norm": 2.568081855773926, + "learning_rate": 4.805855110376376e-05, + "loss": 5.4263, + "step": 21233 + }, + { + "epoch": 0.1262846131886954, + "grad_norm": 1.8911200761795044, + "learning_rate": 4.8058370624836336e-05, + "loss": 5.3457, + "step": 21234 + }, + { + "epoch": 0.1262905604719764, + "grad_norm": 2.3370580673217773, + "learning_rate": 4.805819013785946e-05, + "loss": 4.8342, + "step": 21235 + }, + { + "epoch": 0.1262965077552574, + "grad_norm": 2.669029474258423, + "learning_rate": 4.805800964283322e-05, + "loss": 4.9175, + "step": 21236 + }, + { + "epoch": 0.1263024550385384, + "grad_norm": 1.9824459552764893, + "learning_rate": 4.8057829139757657e-05, + "loss": 4.6509, + "step": 21237 + }, + { + "epoch": 0.1263084023218194, + "grad_norm": 1.9576833248138428, + "learning_rate": 4.805764862863286e-05, + "loss": 5.4197, + "step": 21238 + }, + { + "epoch": 0.12631434960510038, + "grad_norm": 1.9594717025756836, + "learning_rate": 4.805746810945886e-05, + "loss": 5.7506, + "step": 21239 + }, + { + "epoch": 0.1263202968883814, + "grad_norm": 2.063676357269287, + "learning_rate": 4.8057287582235746e-05, + "loss": 5.6675, + "step": 21240 + }, + { + "epoch": 0.12632624417166238, + "grad_norm": 1.9354885816574097, + "learning_rate": 4.805710704696356e-05, + "loss": 5.1697, + "step": 21241 + }, + { + "epoch": 0.12633219145494337, + "grad_norm": 1.9859137535095215, + "learning_rate": 4.8056926503642384e-05, + "loss": 4.9055, + "step": 21242 + }, + { + "epoch": 0.1263381387382244, + "grad_norm": 2.1015024185180664, + "learning_rate": 4.805674595227228e-05, + "loss": 4.4961, + "step": 21243 + }, + { + "epoch": 0.12634408602150538, + "grad_norm": 2.225673198699951, + "learning_rate": 4.805656539285329e-05, + "loss": 4.2943, + "step": 21244 + }, + { + "epoch": 0.12635003330478636, + "grad_norm": 1.9753731489181519, + "learning_rate": 4.8056384825385495e-05, + "loss": 4.401, + "step": 21245 + }, + { + "epoch": 0.12635598058806738, + "grad_norm": 1.693865180015564, + "learning_rate": 4.805620424986896e-05, + "loss": 4.2992, + "step": 21246 + }, + { + "epoch": 0.12636192787134837, + "grad_norm": 2.0757269859313965, + "learning_rate": 4.805602366630374e-05, + "loss": 4.4564, + "step": 21247 + }, + { + "epoch": 0.12636787515462936, + "grad_norm": 1.559611201286316, + "learning_rate": 4.80558430746899e-05, + "loss": 5.95, + "step": 21248 + }, + { + "epoch": 0.12637382243791037, + "grad_norm": 1.7863824367523193, + "learning_rate": 4.80556624750275e-05, + "loss": 5.2208, + "step": 21249 + }, + { + "epoch": 0.12637976972119136, + "grad_norm": 1.7766302824020386, + "learning_rate": 4.805548186731661e-05, + "loss": 4.9666, + "step": 21250 + }, + { + "epoch": 0.12638571700447235, + "grad_norm": 1.5633225440979004, + "learning_rate": 4.805530125155728e-05, + "loss": 4.7051, + "step": 21251 + }, + { + "epoch": 0.12639166428775336, + "grad_norm": 1.795332431793213, + "learning_rate": 4.80551206277496e-05, + "loss": 4.624, + "step": 21252 + }, + { + "epoch": 0.12639761157103435, + "grad_norm": 2.2065796852111816, + "learning_rate": 4.805493999589361e-05, + "loss": 4.2034, + "step": 21253 + }, + { + "epoch": 0.12640355885431534, + "grad_norm": 2.0833165645599365, + "learning_rate": 4.805475935598937e-05, + "loss": 4.3267, + "step": 21254 + }, + { + "epoch": 0.12640950613759636, + "grad_norm": 2.591543436050415, + "learning_rate": 4.8054578708036954e-05, + "loss": 4.5015, + "step": 21255 + }, + { + "epoch": 0.12641545342087734, + "grad_norm": 1.7929967641830444, + "learning_rate": 4.805439805203643e-05, + "loss": 5.1193, + "step": 21256 + }, + { + "epoch": 0.12642140070415833, + "grad_norm": 1.632691740989685, + "learning_rate": 4.805421738798785e-05, + "loss": 4.728, + "step": 21257 + }, + { + "epoch": 0.12642734798743935, + "grad_norm": 1.844673752784729, + "learning_rate": 4.8054036715891284e-05, + "loss": 4.8617, + "step": 21258 + }, + { + "epoch": 0.12643329527072034, + "grad_norm": 1.7764726877212524, + "learning_rate": 4.805385603574678e-05, + "loss": 5.0102, + "step": 21259 + }, + { + "epoch": 0.12643924255400132, + "grad_norm": 1.7257095575332642, + "learning_rate": 4.8053675347554425e-05, + "loss": 5.4136, + "step": 21260 + }, + { + "epoch": 0.12644518983728234, + "grad_norm": 1.9378974437713623, + "learning_rate": 4.805349465131427e-05, + "loss": 4.8102, + "step": 21261 + }, + { + "epoch": 0.12645113712056333, + "grad_norm": 2.1207330226898193, + "learning_rate": 4.805331394702637e-05, + "loss": 5.137, + "step": 21262 + }, + { + "epoch": 0.12645708440384432, + "grad_norm": 2.630957841873169, + "learning_rate": 4.8053133234690806e-05, + "loss": 3.9948, + "step": 21263 + }, + { + "epoch": 0.12646303168712533, + "grad_norm": 2.5051863193511963, + "learning_rate": 4.805295251430762e-05, + "loss": 3.7358, + "step": 21264 + }, + { + "epoch": 0.12646897897040632, + "grad_norm": 2.4558019638061523, + "learning_rate": 4.805277178587689e-05, + "loss": 4.1314, + "step": 21265 + }, + { + "epoch": 0.1264749262536873, + "grad_norm": 2.1878461837768555, + "learning_rate": 4.805259104939869e-05, + "loss": 5.1189, + "step": 21266 + }, + { + "epoch": 0.12648087353696832, + "grad_norm": 2.303126811981201, + "learning_rate": 4.805241030487305e-05, + "loss": 4.4202, + "step": 21267 + }, + { + "epoch": 0.1264868208202493, + "grad_norm": 2.4533417224884033, + "learning_rate": 4.805222955230006e-05, + "loss": 4.4752, + "step": 21268 + }, + { + "epoch": 0.1264927681035303, + "grad_norm": 2.4850356578826904, + "learning_rate": 4.805204879167977e-05, + "loss": 4.0938, + "step": 21269 + }, + { + "epoch": 0.12649871538681132, + "grad_norm": 2.622119665145874, + "learning_rate": 4.805186802301226e-05, + "loss": 3.5693, + "step": 21270 + }, + { + "epoch": 0.1265046626700923, + "grad_norm": 2.5546908378601074, + "learning_rate": 4.8051687246297574e-05, + "loss": 4.1895, + "step": 21271 + }, + { + "epoch": 0.1265106099533733, + "grad_norm": 2.6318092346191406, + "learning_rate": 4.805150646153578e-05, + "loss": 4.5214, + "step": 21272 + }, + { + "epoch": 0.1265165572366543, + "grad_norm": 2.380413770675659, + "learning_rate": 4.805132566872694e-05, + "loss": 4.601, + "step": 21273 + }, + { + "epoch": 0.1265225045199353, + "grad_norm": 2.652449369430542, + "learning_rate": 4.805114486787112e-05, + "loss": 4.7164, + "step": 21274 + }, + { + "epoch": 0.12652845180321629, + "grad_norm": 2.6453335285186768, + "learning_rate": 4.8050964058968394e-05, + "loss": 4.8007, + "step": 21275 + }, + { + "epoch": 0.1265343990864973, + "grad_norm": 2.226515054702759, + "learning_rate": 4.8050783242018805e-05, + "loss": 4.7653, + "step": 21276 + }, + { + "epoch": 0.1265403463697783, + "grad_norm": 2.678157091140747, + "learning_rate": 4.805060241702243e-05, + "loss": 4.8511, + "step": 21277 + }, + { + "epoch": 0.12654629365305928, + "grad_norm": 2.2161943912506104, + "learning_rate": 4.8050421583979324e-05, + "loss": 4.6734, + "step": 21278 + }, + { + "epoch": 0.1265522409363403, + "grad_norm": 2.242539882659912, + "learning_rate": 4.805024074288956e-05, + "loss": 4.5445, + "step": 21279 + }, + { + "epoch": 0.12655818821962128, + "grad_norm": 1.9599577188491821, + "learning_rate": 4.805005989375319e-05, + "loss": 4.7331, + "step": 21280 + }, + { + "epoch": 0.12656413550290227, + "grad_norm": 2.1399378776550293, + "learning_rate": 4.8049879036570286e-05, + "loss": 4.1747, + "step": 21281 + }, + { + "epoch": 0.12657008278618326, + "grad_norm": 2.202322244644165, + "learning_rate": 4.8049698171340904e-05, + "loss": 4.3195, + "step": 21282 + }, + { + "epoch": 0.12657603006946427, + "grad_norm": 2.071727991104126, + "learning_rate": 4.8049517298065115e-05, + "loss": 4.3142, + "step": 21283 + }, + { + "epoch": 0.12658197735274526, + "grad_norm": 1.8801134824752808, + "learning_rate": 4.8049336416742974e-05, + "loss": 4.2353, + "step": 21284 + }, + { + "epoch": 0.12658792463602625, + "grad_norm": 1.8937469720840454, + "learning_rate": 4.804915552737455e-05, + "loss": 4.1141, + "step": 21285 + }, + { + "epoch": 0.12659387191930727, + "grad_norm": 1.8500044345855713, + "learning_rate": 4.8048974629959906e-05, + "loss": 4.0509, + "step": 21286 + }, + { + "epoch": 0.12659981920258825, + "grad_norm": 1.8931934833526611, + "learning_rate": 4.8048793724499095e-05, + "loss": 4.1905, + "step": 21287 + }, + { + "epoch": 0.12660576648586924, + "grad_norm": 1.6579469442367554, + "learning_rate": 4.8048612810992196e-05, + "loss": 4.8032, + "step": 21288 + }, + { + "epoch": 0.12661171376915026, + "grad_norm": 1.7402268648147583, + "learning_rate": 4.804843188943926e-05, + "loss": 5.363, + "step": 21289 + }, + { + "epoch": 0.12661766105243125, + "grad_norm": 1.6550151109695435, + "learning_rate": 4.804825095984036e-05, + "loss": 5.4504, + "step": 21290 + }, + { + "epoch": 0.12662360833571223, + "grad_norm": 1.5498002767562866, + "learning_rate": 4.8048070022195546e-05, + "loss": 5.2858, + "step": 21291 + }, + { + "epoch": 0.12662955561899325, + "grad_norm": 1.6577101945877075, + "learning_rate": 4.804788907650489e-05, + "loss": 5.0535, + "step": 21292 + }, + { + "epoch": 0.12663550290227424, + "grad_norm": 1.5144888162612915, + "learning_rate": 4.804770812276845e-05, + "loss": 5.0564, + "step": 21293 + }, + { + "epoch": 0.12664145018555523, + "grad_norm": 1.7675977945327759, + "learning_rate": 4.804752716098631e-05, + "loss": 4.9044, + "step": 21294 + }, + { + "epoch": 0.12664739746883624, + "grad_norm": 1.6419012546539307, + "learning_rate": 4.8047346191158506e-05, + "loss": 5.1735, + "step": 21295 + }, + { + "epoch": 0.12665334475211723, + "grad_norm": 1.9034998416900635, + "learning_rate": 4.8047165213285106e-05, + "loss": 5.762, + "step": 21296 + }, + { + "epoch": 0.12665929203539822, + "grad_norm": 2.2357866764068604, + "learning_rate": 4.8046984227366186e-05, + "loss": 5.0351, + "step": 21297 + }, + { + "epoch": 0.12666523931867923, + "grad_norm": 1.528701663017273, + "learning_rate": 4.8046803233401796e-05, + "loss": 5.3659, + "step": 21298 + }, + { + "epoch": 0.12667118660196022, + "grad_norm": 1.5450912714004517, + "learning_rate": 4.8046622231392015e-05, + "loss": 5.4961, + "step": 21299 + }, + { + "epoch": 0.1266771338852412, + "grad_norm": 2.459630012512207, + "learning_rate": 4.804644122133689e-05, + "loss": 4.6308, + "step": 21300 + }, + { + "epoch": 0.12668308116852223, + "grad_norm": 1.8703144788742065, + "learning_rate": 4.8046260203236494e-05, + "loss": 4.6424, + "step": 21301 + }, + { + "epoch": 0.12668902845180322, + "grad_norm": 1.4294613599777222, + "learning_rate": 4.804607917709088e-05, + "loss": 5.5703, + "step": 21302 + }, + { + "epoch": 0.1266949757350842, + "grad_norm": 1.6063963174819946, + "learning_rate": 4.804589814290012e-05, + "loss": 5.6344, + "step": 21303 + }, + { + "epoch": 0.12670092301836522, + "grad_norm": 2.1621460914611816, + "learning_rate": 4.8045717100664275e-05, + "loss": 5.1798, + "step": 21304 + }, + { + "epoch": 0.1267068703016462, + "grad_norm": 2.187513828277588, + "learning_rate": 4.804553605038341e-05, + "loss": 4.4837, + "step": 21305 + }, + { + "epoch": 0.1267128175849272, + "grad_norm": 2.5205118656158447, + "learning_rate": 4.804535499205759e-05, + "loss": 4.5554, + "step": 21306 + }, + { + "epoch": 0.1267187648682082, + "grad_norm": 2.196026563644409, + "learning_rate": 4.804517392568687e-05, + "loss": 4.5849, + "step": 21307 + }, + { + "epoch": 0.1267247121514892, + "grad_norm": 2.152150869369507, + "learning_rate": 4.804499285127132e-05, + "loss": 4.4153, + "step": 21308 + }, + { + "epoch": 0.1267306594347702, + "grad_norm": 2.398475170135498, + "learning_rate": 4.8044811768811e-05, + "loss": 4.1129, + "step": 21309 + }, + { + "epoch": 0.1267366067180512, + "grad_norm": 2.4291298389434814, + "learning_rate": 4.8044630678305976e-05, + "loss": 4.4199, + "step": 21310 + }, + { + "epoch": 0.1267425540013322, + "grad_norm": 2.6893248558044434, + "learning_rate": 4.80444495797563e-05, + "loss": 4.419, + "step": 21311 + }, + { + "epoch": 0.12674850128461318, + "grad_norm": 2.369361400604248, + "learning_rate": 4.804426847316206e-05, + "loss": 4.3434, + "step": 21312 + }, + { + "epoch": 0.1267544485678942, + "grad_norm": 2.206676721572876, + "learning_rate": 4.804408735852329e-05, + "loss": 4.2195, + "step": 21313 + }, + { + "epoch": 0.12676039585117518, + "grad_norm": 2.3347322940826416, + "learning_rate": 4.8043906235840074e-05, + "loss": 4.352, + "step": 21314 + }, + { + "epoch": 0.12676634313445617, + "grad_norm": 2.4026732444763184, + "learning_rate": 4.804372510511247e-05, + "loss": 4.0351, + "step": 21315 + }, + { + "epoch": 0.1267722904177372, + "grad_norm": 2.3547754287719727, + "learning_rate": 4.8043543966340546e-05, + "loss": 4.1292, + "step": 21316 + }, + { + "epoch": 0.12677823770101818, + "grad_norm": 2.3924174308776855, + "learning_rate": 4.804336281952434e-05, + "loss": 4.138, + "step": 21317 + }, + { + "epoch": 0.12678418498429916, + "grad_norm": 2.063361883163452, + "learning_rate": 4.804318166466395e-05, + "loss": 4.1288, + "step": 21318 + }, + { + "epoch": 0.12679013226758018, + "grad_norm": 2.1719813346862793, + "learning_rate": 4.8043000501759415e-05, + "loss": 4.3262, + "step": 21319 + }, + { + "epoch": 0.12679607955086117, + "grad_norm": 2.3787803649902344, + "learning_rate": 4.8042819330810803e-05, + "loss": 4.448, + "step": 21320 + }, + { + "epoch": 0.12680202683414216, + "grad_norm": 2.369344472885132, + "learning_rate": 4.80426381518182e-05, + "loss": 4.4237, + "step": 21321 + }, + { + "epoch": 0.12680797411742317, + "grad_norm": 1.9213550090789795, + "learning_rate": 4.804245696478163e-05, + "loss": 4.8805, + "step": 21322 + }, + { + "epoch": 0.12681392140070416, + "grad_norm": 2.1709017753601074, + "learning_rate": 4.804227576970118e-05, + "loss": 5.7745, + "step": 21323 + }, + { + "epoch": 0.12681986868398515, + "grad_norm": 2.1823856830596924, + "learning_rate": 4.8042094566576925e-05, + "loss": 5.561, + "step": 21324 + }, + { + "epoch": 0.12682581596726616, + "grad_norm": 2.403367519378662, + "learning_rate": 4.80419133554089e-05, + "loss": 5.6699, + "step": 21325 + }, + { + "epoch": 0.12683176325054715, + "grad_norm": 1.8335449695587158, + "learning_rate": 4.8041732136197184e-05, + "loss": 5.5058, + "step": 21326 + }, + { + "epoch": 0.12683771053382814, + "grad_norm": 1.7406642436981201, + "learning_rate": 4.804155090894183e-05, + "loss": 5.6536, + "step": 21327 + }, + { + "epoch": 0.12684365781710916, + "grad_norm": 2.160098075866699, + "learning_rate": 4.804136967364291e-05, + "loss": 5.4742, + "step": 21328 + }, + { + "epoch": 0.12684960510039014, + "grad_norm": 1.5187212228775024, + "learning_rate": 4.804118843030049e-05, + "loss": 5.2908, + "step": 21329 + }, + { + "epoch": 0.12685555238367113, + "grad_norm": 1.387417197227478, + "learning_rate": 4.804100717891463e-05, + "loss": 5.3319, + "step": 21330 + }, + { + "epoch": 0.12686149966695215, + "grad_norm": 1.3029687404632568, + "learning_rate": 4.80408259194854e-05, + "loss": 5.4069, + "step": 21331 + }, + { + "epoch": 0.12686744695023314, + "grad_norm": 1.7097088098526, + "learning_rate": 4.804064465201284e-05, + "loss": 4.8422, + "step": 21332 + }, + { + "epoch": 0.12687339423351413, + "grad_norm": 1.7519829273223877, + "learning_rate": 4.804046337649704e-05, + "loss": 5.4513, + "step": 21333 + }, + { + "epoch": 0.12687934151679514, + "grad_norm": 1.5313260555267334, + "learning_rate": 4.8040282092938046e-05, + "loss": 4.8656, + "step": 21334 + }, + { + "epoch": 0.12688528880007613, + "grad_norm": 1.629780888557434, + "learning_rate": 4.804010080133593e-05, + "loss": 4.8751, + "step": 21335 + }, + { + "epoch": 0.12689123608335712, + "grad_norm": 1.7247028350830078, + "learning_rate": 4.8039919501690756e-05, + "loss": 4.7207, + "step": 21336 + }, + { + "epoch": 0.12689718336663813, + "grad_norm": 1.517016887664795, + "learning_rate": 4.803973819400258e-05, + "loss": 5.0604, + "step": 21337 + }, + { + "epoch": 0.12690313064991912, + "grad_norm": 1.4583669900894165, + "learning_rate": 4.8039556878271475e-05, + "loss": 5.0638, + "step": 21338 + }, + { + "epoch": 0.1269090779332001, + "grad_norm": 1.725014567375183, + "learning_rate": 4.803937555449749e-05, + "loss": 5.5831, + "step": 21339 + }, + { + "epoch": 0.1269150252164811, + "grad_norm": 1.4144753217697144, + "learning_rate": 4.803919422268071e-05, + "loss": 5.3899, + "step": 21340 + }, + { + "epoch": 0.1269209724997621, + "grad_norm": 1.4197511672973633, + "learning_rate": 4.803901288282117e-05, + "loss": 5.4904, + "step": 21341 + }, + { + "epoch": 0.1269269197830431, + "grad_norm": 1.5491420030593872, + "learning_rate": 4.803883153491896e-05, + "loss": 5.5008, + "step": 21342 + }, + { + "epoch": 0.1269328670663241, + "grad_norm": 1.4152858257293701, + "learning_rate": 4.803865017897412e-05, + "loss": 5.5328, + "step": 21343 + }, + { + "epoch": 0.1269388143496051, + "grad_norm": 1.6931630373001099, + "learning_rate": 4.803846881498674e-05, + "loss": 5.4435, + "step": 21344 + }, + { + "epoch": 0.1269447616328861, + "grad_norm": 1.4955002069473267, + "learning_rate": 4.803828744295686e-05, + "loss": 5.3631, + "step": 21345 + }, + { + "epoch": 0.12695070891616708, + "grad_norm": 1.5340615510940552, + "learning_rate": 4.803810606288455e-05, + "loss": 5.4711, + "step": 21346 + }, + { + "epoch": 0.1269566561994481, + "grad_norm": 1.4584442377090454, + "learning_rate": 4.803792467476988e-05, + "loss": 5.512, + "step": 21347 + }, + { + "epoch": 0.1269626034827291, + "grad_norm": 1.663875699043274, + "learning_rate": 4.803774327861291e-05, + "loss": 5.5867, + "step": 21348 + }, + { + "epoch": 0.12696855076601007, + "grad_norm": 1.4865331649780273, + "learning_rate": 4.8037561874413696e-05, + "loss": 5.0047, + "step": 21349 + }, + { + "epoch": 0.1269744980492911, + "grad_norm": 1.5889533758163452, + "learning_rate": 4.803738046217231e-05, + "loss": 4.9325, + "step": 21350 + }, + { + "epoch": 0.12698044533257208, + "grad_norm": 1.7473856210708618, + "learning_rate": 4.8037199041888814e-05, + "loss": 4.9296, + "step": 21351 + }, + { + "epoch": 0.12698639261585307, + "grad_norm": 1.9395428895950317, + "learning_rate": 4.8037017613563265e-05, + "loss": 5.5787, + "step": 21352 + }, + { + "epoch": 0.12699233989913408, + "grad_norm": 1.8723230361938477, + "learning_rate": 4.8036836177195734e-05, + "loss": 5.2864, + "step": 21353 + }, + { + "epoch": 0.12699828718241507, + "grad_norm": 1.8751366138458252, + "learning_rate": 4.8036654732786276e-05, + "loss": 4.9116, + "step": 21354 + }, + { + "epoch": 0.12700423446569606, + "grad_norm": 1.6620196104049683, + "learning_rate": 4.803647328033497e-05, + "loss": 5.1592, + "step": 21355 + }, + { + "epoch": 0.12701018174897707, + "grad_norm": 2.01167631149292, + "learning_rate": 4.803629181984187e-05, + "loss": 5.2254, + "step": 21356 + }, + { + "epoch": 0.12701612903225806, + "grad_norm": 1.6565442085266113, + "learning_rate": 4.803611035130703e-05, + "loss": 5.2454, + "step": 21357 + }, + { + "epoch": 0.12702207631553905, + "grad_norm": 1.3379613161087036, + "learning_rate": 4.803592887473053e-05, + "loss": 5.3203, + "step": 21358 + }, + { + "epoch": 0.12702802359882007, + "grad_norm": 1.580633282661438, + "learning_rate": 4.8035747390112415e-05, + "loss": 5.2555, + "step": 21359 + }, + { + "epoch": 0.12703397088210105, + "grad_norm": 1.9735597372055054, + "learning_rate": 4.803556589745276e-05, + "loss": 5.6899, + "step": 21360 + }, + { + "epoch": 0.12703991816538204, + "grad_norm": 1.6550042629241943, + "learning_rate": 4.8035384396751636e-05, + "loss": 4.8188, + "step": 21361 + }, + { + "epoch": 0.12704586544866306, + "grad_norm": 1.598645567893982, + "learning_rate": 4.803520288800909e-05, + "loss": 5.0498, + "step": 21362 + }, + { + "epoch": 0.12705181273194405, + "grad_norm": 1.5990798473358154, + "learning_rate": 4.80350213712252e-05, + "loss": 5.0563, + "step": 21363 + }, + { + "epoch": 0.12705776001522504, + "grad_norm": 1.5130763053894043, + "learning_rate": 4.803483984640001e-05, + "loss": 5.2562, + "step": 21364 + }, + { + "epoch": 0.12706370729850605, + "grad_norm": 1.5498485565185547, + "learning_rate": 4.803465831353361e-05, + "loss": 5.551, + "step": 21365 + }, + { + "epoch": 0.12706965458178704, + "grad_norm": 1.819954752922058, + "learning_rate": 4.803447677262603e-05, + "loss": 4.5888, + "step": 21366 + }, + { + "epoch": 0.12707560186506803, + "grad_norm": 1.5863771438598633, + "learning_rate": 4.8034295223677374e-05, + "loss": 5.108, + "step": 21367 + }, + { + "epoch": 0.12708154914834904, + "grad_norm": 1.6637874841690063, + "learning_rate": 4.803411366668767e-05, + "loss": 5.3476, + "step": 21368 + }, + { + "epoch": 0.12708749643163003, + "grad_norm": 1.5182580947875977, + "learning_rate": 4.8033932101657e-05, + "loss": 5.6559, + "step": 21369 + }, + { + "epoch": 0.12709344371491102, + "grad_norm": 1.725801706314087, + "learning_rate": 4.803375052858542e-05, + "loss": 4.6643, + "step": 21370 + }, + { + "epoch": 0.12709939099819204, + "grad_norm": 1.6476885080337524, + "learning_rate": 4.803356894747299e-05, + "loss": 4.6574, + "step": 21371 + }, + { + "epoch": 0.12710533828147302, + "grad_norm": 1.520213007926941, + "learning_rate": 4.803338735831979e-05, + "loss": 5.3691, + "step": 21372 + }, + { + "epoch": 0.127111285564754, + "grad_norm": 1.4914368391036987, + "learning_rate": 4.803320576112586e-05, + "loss": 5.2913, + "step": 21373 + }, + { + "epoch": 0.12711723284803503, + "grad_norm": 1.254329800605774, + "learning_rate": 4.803302415589128e-05, + "loss": 5.3926, + "step": 21374 + }, + { + "epoch": 0.12712318013131602, + "grad_norm": 1.909441351890564, + "learning_rate": 4.8032842542616116e-05, + "loss": 4.6179, + "step": 21375 + }, + { + "epoch": 0.127129127414597, + "grad_norm": 1.7123392820358276, + "learning_rate": 4.803266092130042e-05, + "loss": 5.1276, + "step": 21376 + }, + { + "epoch": 0.12713507469787802, + "grad_norm": 1.717854380607605, + "learning_rate": 4.8032479291944265e-05, + "loss": 5.3377, + "step": 21377 + }, + { + "epoch": 0.127141021981159, + "grad_norm": 1.7636181116104126, + "learning_rate": 4.80322976545477e-05, + "loss": 5.3434, + "step": 21378 + }, + { + "epoch": 0.12714696926444, + "grad_norm": 1.6754179000854492, + "learning_rate": 4.80321160091108e-05, + "loss": 5.3604, + "step": 21379 + }, + { + "epoch": 0.127152916547721, + "grad_norm": 1.4759787321090698, + "learning_rate": 4.803193435563364e-05, + "loss": 5.267, + "step": 21380 + }, + { + "epoch": 0.127158863831002, + "grad_norm": 1.8769867420196533, + "learning_rate": 4.803175269411625e-05, + "loss": 5.2666, + "step": 21381 + }, + { + "epoch": 0.127164811114283, + "grad_norm": 1.7843588590621948, + "learning_rate": 4.803157102455873e-05, + "loss": 5.1529, + "step": 21382 + }, + { + "epoch": 0.127170758397564, + "grad_norm": 1.7799369096755981, + "learning_rate": 4.803138934696111e-05, + "loss": 4.9332, + "step": 21383 + }, + { + "epoch": 0.127176705680845, + "grad_norm": 1.8240329027175903, + "learning_rate": 4.803120766132348e-05, + "loss": 4.8369, + "step": 21384 + }, + { + "epoch": 0.12718265296412598, + "grad_norm": 1.7379107475280762, + "learning_rate": 4.8031025967645895e-05, + "loss": 4.6134, + "step": 21385 + }, + { + "epoch": 0.127188600247407, + "grad_norm": 1.9912395477294922, + "learning_rate": 4.8030844265928414e-05, + "loss": 4.5456, + "step": 21386 + }, + { + "epoch": 0.12719454753068798, + "grad_norm": 1.762600302696228, + "learning_rate": 4.80306625561711e-05, + "loss": 5.4269, + "step": 21387 + }, + { + "epoch": 0.12720049481396897, + "grad_norm": 1.9208531379699707, + "learning_rate": 4.8030480838374027e-05, + "loss": 5.542, + "step": 21388 + }, + { + "epoch": 0.12720644209725, + "grad_norm": 1.8121410608291626, + "learning_rate": 4.803029911253725e-05, + "loss": 5.7218, + "step": 21389 + }, + { + "epoch": 0.12721238938053098, + "grad_norm": 2.0130512714385986, + "learning_rate": 4.803011737866082e-05, + "loss": 5.4736, + "step": 21390 + }, + { + "epoch": 0.12721833666381197, + "grad_norm": 1.4087759256362915, + "learning_rate": 4.802993563674483e-05, + "loss": 5.5634, + "step": 21391 + }, + { + "epoch": 0.12722428394709298, + "grad_norm": 1.640550971031189, + "learning_rate": 4.8029753886789316e-05, + "loss": 5.6422, + "step": 21392 + }, + { + "epoch": 0.12723023123037397, + "grad_norm": 1.58751380443573, + "learning_rate": 4.802957212879436e-05, + "loss": 5.2661, + "step": 21393 + }, + { + "epoch": 0.12723617851365496, + "grad_norm": 1.536847472190857, + "learning_rate": 4.802939036276002e-05, + "loss": 5.475, + "step": 21394 + }, + { + "epoch": 0.12724212579693597, + "grad_norm": 1.8386236429214478, + "learning_rate": 4.802920858868635e-05, + "loss": 5.4889, + "step": 21395 + }, + { + "epoch": 0.12724807308021696, + "grad_norm": 1.7268786430358887, + "learning_rate": 4.802902680657343e-05, + "loss": 5.2129, + "step": 21396 + }, + { + "epoch": 0.12725402036349795, + "grad_norm": 1.5081709623336792, + "learning_rate": 4.8028845016421306e-05, + "loss": 5.0437, + "step": 21397 + }, + { + "epoch": 0.12725996764677894, + "grad_norm": 1.3470754623413086, + "learning_rate": 4.802866321823006e-05, + "loss": 5.2242, + "step": 21398 + }, + { + "epoch": 0.12726591493005995, + "grad_norm": 1.2352057695388794, + "learning_rate": 4.802848141199974e-05, + "loss": 4.6926, + "step": 21399 + }, + { + "epoch": 0.12727186221334094, + "grad_norm": 1.4411710500717163, + "learning_rate": 4.802829959773041e-05, + "loss": 5.098, + "step": 21400 + }, + { + "epoch": 0.12727780949662193, + "grad_norm": 1.3453952074050903, + "learning_rate": 4.802811777542214e-05, + "loss": 5.0484, + "step": 21401 + }, + { + "epoch": 0.12728375677990295, + "grad_norm": 1.4602265357971191, + "learning_rate": 4.8027935945074995e-05, + "loss": 5.167, + "step": 21402 + }, + { + "epoch": 0.12728970406318393, + "grad_norm": 1.4542255401611328, + "learning_rate": 4.802775410668904e-05, + "loss": 5.0701, + "step": 21403 + }, + { + "epoch": 0.12729565134646492, + "grad_norm": 1.4398037195205688, + "learning_rate": 4.802757226026433e-05, + "loss": 5.0809, + "step": 21404 + }, + { + "epoch": 0.12730159862974594, + "grad_norm": 1.3027135133743286, + "learning_rate": 4.8027390405800935e-05, + "loss": 5.1283, + "step": 21405 + }, + { + "epoch": 0.12730754591302693, + "grad_norm": 1.3704328536987305, + "learning_rate": 4.802720854329891e-05, + "loss": 5.0886, + "step": 21406 + }, + { + "epoch": 0.12731349319630791, + "grad_norm": 1.2771658897399902, + "learning_rate": 4.802702667275833e-05, + "loss": 4.968, + "step": 21407 + }, + { + "epoch": 0.12731944047958893, + "grad_norm": 1.3370757102966309, + "learning_rate": 4.802684479417925e-05, + "loss": 5.2742, + "step": 21408 + }, + { + "epoch": 0.12732538776286992, + "grad_norm": 1.2101991176605225, + "learning_rate": 4.802666290756174e-05, + "loss": 5.3125, + "step": 21409 + }, + { + "epoch": 0.1273313350461509, + "grad_norm": 1.327354907989502, + "learning_rate": 4.8026481012905854e-05, + "loss": 5.0784, + "step": 21410 + }, + { + "epoch": 0.12733728232943192, + "grad_norm": 1.2267961502075195, + "learning_rate": 4.802629911021166e-05, + "loss": 5.0666, + "step": 21411 + }, + { + "epoch": 0.1273432296127129, + "grad_norm": 1.2195243835449219, + "learning_rate": 4.8026117199479224e-05, + "loss": 5.1941, + "step": 21412 + }, + { + "epoch": 0.1273491768959939, + "grad_norm": 1.1964733600616455, + "learning_rate": 4.8025935280708616e-05, + "loss": 5.0561, + "step": 21413 + }, + { + "epoch": 0.12735512417927491, + "grad_norm": 1.148831844329834, + "learning_rate": 4.802575335389989e-05, + "loss": 4.9592, + "step": 21414 + }, + { + "epoch": 0.1273610714625559, + "grad_norm": 1.2319111824035645, + "learning_rate": 4.802557141905311e-05, + "loss": 5.0165, + "step": 21415 + }, + { + "epoch": 0.1273670187458369, + "grad_norm": 1.324744462966919, + "learning_rate": 4.802538947616834e-05, + "loss": 4.9402, + "step": 21416 + }, + { + "epoch": 0.1273729660291179, + "grad_norm": 1.1551966667175293, + "learning_rate": 4.802520752524564e-05, + "loss": 5.1849, + "step": 21417 + }, + { + "epoch": 0.1273789133123989, + "grad_norm": 1.2087135314941406, + "learning_rate": 4.802502556628508e-05, + "loss": 5.1082, + "step": 21418 + }, + { + "epoch": 0.12738486059567988, + "grad_norm": 1.1568787097930908, + "learning_rate": 4.8024843599286726e-05, + "loss": 5.1379, + "step": 21419 + }, + { + "epoch": 0.1273908078789609, + "grad_norm": 1.2819747924804688, + "learning_rate": 4.802466162425063e-05, + "loss": 5.2054, + "step": 21420 + }, + { + "epoch": 0.1273967551622419, + "grad_norm": 1.3548219203948975, + "learning_rate": 4.8024479641176866e-05, + "loss": 4.8277, + "step": 21421 + }, + { + "epoch": 0.12740270244552288, + "grad_norm": 1.3331178426742554, + "learning_rate": 4.80242976500655e-05, + "loss": 4.991, + "step": 21422 + }, + { + "epoch": 0.1274086497288039, + "grad_norm": 1.3595576286315918, + "learning_rate": 4.8024115650916584e-05, + "loss": 4.8734, + "step": 21423 + }, + { + "epoch": 0.12741459701208488, + "grad_norm": 1.310585856437683, + "learning_rate": 4.802393364373019e-05, + "loss": 4.9281, + "step": 21424 + }, + { + "epoch": 0.12742054429536587, + "grad_norm": 1.3193553686141968, + "learning_rate": 4.8023751628506374e-05, + "loss": 4.9819, + "step": 21425 + }, + { + "epoch": 0.12742649157864688, + "grad_norm": 1.2952460050582886, + "learning_rate": 4.8023569605245204e-05, + "loss": 4.9577, + "step": 21426 + }, + { + "epoch": 0.12743243886192787, + "grad_norm": 1.376548409461975, + "learning_rate": 4.802338757394674e-05, + "loss": 5.2219, + "step": 21427 + }, + { + "epoch": 0.12743838614520886, + "grad_norm": 1.1417921781539917, + "learning_rate": 4.802320553461106e-05, + "loss": 5.0234, + "step": 21428 + }, + { + "epoch": 0.12744433342848988, + "grad_norm": 1.2543314695358276, + "learning_rate": 4.8023023487238214e-05, + "loss": 4.9921, + "step": 21429 + }, + { + "epoch": 0.12745028071177086, + "grad_norm": 1.4437085390090942, + "learning_rate": 4.802284143182827e-05, + "loss": 4.8699, + "step": 21430 + }, + { + "epoch": 0.12745622799505185, + "grad_norm": 1.137539267539978, + "learning_rate": 4.802265936838128e-05, + "loss": 5.1073, + "step": 21431 + }, + { + "epoch": 0.12746217527833287, + "grad_norm": 1.4179331064224243, + "learning_rate": 4.802247729689733e-05, + "loss": 5.0073, + "step": 21432 + }, + { + "epoch": 0.12746812256161386, + "grad_norm": 1.5519764423370361, + "learning_rate": 4.802229521737646e-05, + "loss": 4.9426, + "step": 21433 + }, + { + "epoch": 0.12747406984489484, + "grad_norm": 1.440847396850586, + "learning_rate": 4.8022113129818754e-05, + "loss": 5.2137, + "step": 21434 + }, + { + "epoch": 0.12748001712817586, + "grad_norm": 1.2741557359695435, + "learning_rate": 4.802193103422426e-05, + "loss": 4.966, + "step": 21435 + }, + { + "epoch": 0.12748596441145685, + "grad_norm": 1.5297214984893799, + "learning_rate": 4.8021748930593045e-05, + "loss": 5.006, + "step": 21436 + }, + { + "epoch": 0.12749191169473784, + "grad_norm": 1.2509713172912598, + "learning_rate": 4.802156681892518e-05, + "loss": 5.0719, + "step": 21437 + }, + { + "epoch": 0.12749785897801885, + "grad_norm": 1.2376511096954346, + "learning_rate": 4.802138469922073e-05, + "loss": 4.8896, + "step": 21438 + }, + { + "epoch": 0.12750380626129984, + "grad_norm": 1.311804175376892, + "learning_rate": 4.802120257147974e-05, + "loss": 5.0292, + "step": 21439 + }, + { + "epoch": 0.12750975354458083, + "grad_norm": 1.2717031240463257, + "learning_rate": 4.802102043570229e-05, + "loss": 5.157, + "step": 21440 + }, + { + "epoch": 0.12751570082786184, + "grad_norm": 1.2967960834503174, + "learning_rate": 4.8020838291888445e-05, + "loss": 5.1289, + "step": 21441 + }, + { + "epoch": 0.12752164811114283, + "grad_norm": 1.2796543836593628, + "learning_rate": 4.802065614003826e-05, + "loss": 5.0702, + "step": 21442 + }, + { + "epoch": 0.12752759539442382, + "grad_norm": 1.4490569829940796, + "learning_rate": 4.80204739801518e-05, + "loss": 5.1, + "step": 21443 + }, + { + "epoch": 0.12753354267770484, + "grad_norm": 1.1721242666244507, + "learning_rate": 4.8020291812229136e-05, + "loss": 5.1237, + "step": 21444 + }, + { + "epoch": 0.12753948996098582, + "grad_norm": 1.3185924291610718, + "learning_rate": 4.8020109636270316e-05, + "loss": 5.0208, + "step": 21445 + }, + { + "epoch": 0.1275454372442668, + "grad_norm": 1.4432177543640137, + "learning_rate": 4.801992745227543e-05, + "loss": 5.0235, + "step": 21446 + }, + { + "epoch": 0.12755138452754783, + "grad_norm": 1.3810619115829468, + "learning_rate": 4.801974526024451e-05, + "loss": 4.8893, + "step": 21447 + }, + { + "epoch": 0.12755733181082882, + "grad_norm": 1.3421547412872314, + "learning_rate": 4.8019563060177634e-05, + "loss": 4.9605, + "step": 21448 + }, + { + "epoch": 0.1275632790941098, + "grad_norm": 1.304095983505249, + "learning_rate": 4.8019380852074875e-05, + "loss": 4.9489, + "step": 21449 + }, + { + "epoch": 0.12756922637739082, + "grad_norm": 1.3935438394546509, + "learning_rate": 4.801919863593629e-05, + "loss": 4.9097, + "step": 21450 + }, + { + "epoch": 0.1275751736606718, + "grad_norm": 1.1719253063201904, + "learning_rate": 4.801901641176193e-05, + "loss": 4.9922, + "step": 21451 + }, + { + "epoch": 0.1275811209439528, + "grad_norm": 1.8718456029891968, + "learning_rate": 4.801883417955188e-05, + "loss": 5.409, + "step": 21452 + }, + { + "epoch": 0.1275870682272338, + "grad_norm": 1.1837137937545776, + "learning_rate": 4.801865193930618e-05, + "loss": 4.967, + "step": 21453 + }, + { + "epoch": 0.1275930155105148, + "grad_norm": 1.2643749713897705, + "learning_rate": 4.801846969102491e-05, + "loss": 4.7932, + "step": 21454 + }, + { + "epoch": 0.1275989627937958, + "grad_norm": 1.2207399606704712, + "learning_rate": 4.801828743470814e-05, + "loss": 4.9634, + "step": 21455 + }, + { + "epoch": 0.12760491007707678, + "grad_norm": 1.2489538192749023, + "learning_rate": 4.801810517035592e-05, + "loss": 5.1077, + "step": 21456 + }, + { + "epoch": 0.1276108573603578, + "grad_norm": 1.3879250288009644, + "learning_rate": 4.801792289796832e-05, + "loss": 5.225, + "step": 21457 + }, + { + "epoch": 0.12761680464363878, + "grad_norm": 1.4891397953033447, + "learning_rate": 4.8017740617545385e-05, + "loss": 5.1288, + "step": 21458 + }, + { + "epoch": 0.12762275192691977, + "grad_norm": 1.555528998374939, + "learning_rate": 4.801755832908721e-05, + "loss": 5.1875, + "step": 21459 + }, + { + "epoch": 0.12762869921020079, + "grad_norm": 1.287625789642334, + "learning_rate": 4.8017376032593834e-05, + "loss": 5.1934, + "step": 21460 + }, + { + "epoch": 0.12763464649348177, + "grad_norm": 1.4907346963882446, + "learning_rate": 4.801719372806533e-05, + "loss": 5.169, + "step": 21461 + }, + { + "epoch": 0.12764059377676276, + "grad_norm": 1.2776025533676147, + "learning_rate": 4.801701141550177e-05, + "loss": 5.2178, + "step": 21462 + }, + { + "epoch": 0.12764654106004378, + "grad_norm": 1.4319080114364624, + "learning_rate": 4.80168290949032e-05, + "loss": 5.159, + "step": 21463 + }, + { + "epoch": 0.12765248834332477, + "grad_norm": 1.4323997497558594, + "learning_rate": 4.80166467662697e-05, + "loss": 5.227, + "step": 21464 + }, + { + "epoch": 0.12765843562660575, + "grad_norm": 1.409071445465088, + "learning_rate": 4.8016464429601326e-05, + "loss": 5.0025, + "step": 21465 + }, + { + "epoch": 0.12766438290988677, + "grad_norm": 1.42705500125885, + "learning_rate": 4.801628208489814e-05, + "loss": 5.0332, + "step": 21466 + }, + { + "epoch": 0.12767033019316776, + "grad_norm": 1.2235654592514038, + "learning_rate": 4.801609973216021e-05, + "loss": 5.0734, + "step": 21467 + }, + { + "epoch": 0.12767627747644875, + "grad_norm": 1.2238860130310059, + "learning_rate": 4.8015917371387595e-05, + "loss": 4.9804, + "step": 21468 + }, + { + "epoch": 0.12768222475972976, + "grad_norm": 1.4584438800811768, + "learning_rate": 4.801573500258036e-05, + "loss": 5.162, + "step": 21469 + }, + { + "epoch": 0.12768817204301075, + "grad_norm": 1.236396074295044, + "learning_rate": 4.8015552625738566e-05, + "loss": 5.1374, + "step": 21470 + }, + { + "epoch": 0.12769411932629174, + "grad_norm": 1.472617745399475, + "learning_rate": 4.801537024086229e-05, + "loss": 5.0376, + "step": 21471 + }, + { + "epoch": 0.12770006660957275, + "grad_norm": 1.2870211601257324, + "learning_rate": 4.801518784795158e-05, + "loss": 4.9798, + "step": 21472 + }, + { + "epoch": 0.12770601389285374, + "grad_norm": 1.3299795389175415, + "learning_rate": 4.801500544700651e-05, + "loss": 4.9588, + "step": 21473 + }, + { + "epoch": 0.12771196117613473, + "grad_norm": 1.474135398864746, + "learning_rate": 4.8014823038027134e-05, + "loss": 5.015, + "step": 21474 + }, + { + "epoch": 0.12771790845941575, + "grad_norm": 1.6452490091323853, + "learning_rate": 4.8014640621013524e-05, + "loss": 5.0075, + "step": 21475 + }, + { + "epoch": 0.12772385574269673, + "grad_norm": 1.3577489852905273, + "learning_rate": 4.801445819596574e-05, + "loss": 4.9675, + "step": 21476 + }, + { + "epoch": 0.12772980302597772, + "grad_norm": 1.2642143964767456, + "learning_rate": 4.801427576288384e-05, + "loss": 5.0593, + "step": 21477 + }, + { + "epoch": 0.12773575030925874, + "grad_norm": 1.5256940126419067, + "learning_rate": 4.801409332176791e-05, + "loss": 4.8987, + "step": 21478 + }, + { + "epoch": 0.12774169759253973, + "grad_norm": 1.667886734008789, + "learning_rate": 4.801391087261798e-05, + "loss": 4.7562, + "step": 21479 + }, + { + "epoch": 0.12774764487582072, + "grad_norm": 1.3564702272415161, + "learning_rate": 4.801372841543415e-05, + "loss": 5.2975, + "step": 21480 + }, + { + "epoch": 0.12775359215910173, + "grad_norm": 1.607532262802124, + "learning_rate": 4.801354595021645e-05, + "loss": 4.9578, + "step": 21481 + }, + { + "epoch": 0.12775953944238272, + "grad_norm": 1.2633382081985474, + "learning_rate": 4.801336347696496e-05, + "loss": 5.1104, + "step": 21482 + }, + { + "epoch": 0.1277654867256637, + "grad_norm": 1.4292182922363281, + "learning_rate": 4.801318099567975e-05, + "loss": 4.9637, + "step": 21483 + }, + { + "epoch": 0.12777143400894472, + "grad_norm": 1.1797621250152588, + "learning_rate": 4.8012998506360874e-05, + "loss": 5.403, + "step": 21484 + }, + { + "epoch": 0.1277773812922257, + "grad_norm": 1.3704683780670166, + "learning_rate": 4.801281600900839e-05, + "loss": 4.9852, + "step": 21485 + }, + { + "epoch": 0.1277833285755067, + "grad_norm": 1.4775960445404053, + "learning_rate": 4.8012633503622384e-05, + "loss": 5.2049, + "step": 21486 + }, + { + "epoch": 0.12778927585878772, + "grad_norm": 1.5056041479110718, + "learning_rate": 4.801245099020289e-05, + "loss": 4.9782, + "step": 21487 + }, + { + "epoch": 0.1277952231420687, + "grad_norm": 1.3562772274017334, + "learning_rate": 4.801226846875e-05, + "loss": 5.0427, + "step": 21488 + }, + { + "epoch": 0.1278011704253497, + "grad_norm": 1.346339464187622, + "learning_rate": 4.801208593926376e-05, + "loss": 5.2215, + "step": 21489 + }, + { + "epoch": 0.1278071177086307, + "grad_norm": 1.3189916610717773, + "learning_rate": 4.801190340174424e-05, + "loss": 5.2097, + "step": 21490 + }, + { + "epoch": 0.1278130649919117, + "grad_norm": 1.466374397277832, + "learning_rate": 4.80117208561915e-05, + "loss": 4.8106, + "step": 21491 + }, + { + "epoch": 0.12781901227519268, + "grad_norm": 1.4882310628890991, + "learning_rate": 4.801153830260561e-05, + "loss": 5.1702, + "step": 21492 + }, + { + "epoch": 0.1278249595584737, + "grad_norm": 1.4080910682678223, + "learning_rate": 4.801135574098662e-05, + "loss": 5.0508, + "step": 21493 + }, + { + "epoch": 0.1278309068417547, + "grad_norm": 1.366672396659851, + "learning_rate": 4.801117317133461e-05, + "loss": 4.8692, + "step": 21494 + }, + { + "epoch": 0.12783685412503568, + "grad_norm": 1.3347315788269043, + "learning_rate": 4.801099059364963e-05, + "loss": 5.2327, + "step": 21495 + }, + { + "epoch": 0.1278428014083167, + "grad_norm": 1.434276819229126, + "learning_rate": 4.8010808007931765e-05, + "loss": 4.7217, + "step": 21496 + }, + { + "epoch": 0.12784874869159768, + "grad_norm": 1.2148855924606323, + "learning_rate": 4.801062541418105e-05, + "loss": 5.2082, + "step": 21497 + }, + { + "epoch": 0.12785469597487867, + "grad_norm": 1.4282805919647217, + "learning_rate": 4.801044281239758e-05, + "loss": 4.8627, + "step": 21498 + }, + { + "epoch": 0.12786064325815968, + "grad_norm": 1.309984564781189, + "learning_rate": 4.8010260202581394e-05, + "loss": 5.0809, + "step": 21499 + }, + { + "epoch": 0.12786659054144067, + "grad_norm": 1.2769159078598022, + "learning_rate": 4.801007758473256e-05, + "loss": 5.0357, + "step": 21500 + }, + { + "epoch": 0.12787253782472166, + "grad_norm": 1.4789204597473145, + "learning_rate": 4.800989495885115e-05, + "loss": 5.0572, + "step": 21501 + }, + { + "epoch": 0.12787848510800268, + "grad_norm": 1.2763663530349731, + "learning_rate": 4.8009712324937216e-05, + "loss": 5.2331, + "step": 21502 + }, + { + "epoch": 0.12788443239128366, + "grad_norm": 1.237911581993103, + "learning_rate": 4.800952968299084e-05, + "loss": 5.1217, + "step": 21503 + }, + { + "epoch": 0.12789037967456465, + "grad_norm": 1.3204708099365234, + "learning_rate": 4.800934703301206e-05, + "loss": 5.0503, + "step": 21504 + }, + { + "epoch": 0.12789632695784567, + "grad_norm": 1.2918440103530884, + "learning_rate": 4.800916437500097e-05, + "loss": 5.1229, + "step": 21505 + }, + { + "epoch": 0.12790227424112666, + "grad_norm": 1.2793703079223633, + "learning_rate": 4.8008981708957614e-05, + "loss": 4.9075, + "step": 21506 + }, + { + "epoch": 0.12790822152440764, + "grad_norm": 1.177607536315918, + "learning_rate": 4.8008799034882054e-05, + "loss": 4.89, + "step": 21507 + }, + { + "epoch": 0.12791416880768866, + "grad_norm": 0.9703904986381531, + "learning_rate": 4.800861635277437e-05, + "loss": 5.0141, + "step": 21508 + }, + { + "epoch": 0.12792011609096965, + "grad_norm": 1.2512762546539307, + "learning_rate": 4.800843366263461e-05, + "loss": 4.953, + "step": 21509 + }, + { + "epoch": 0.12792606337425064, + "grad_norm": 1.3279083967208862, + "learning_rate": 4.8008250964462846e-05, + "loss": 5.0179, + "step": 21510 + }, + { + "epoch": 0.12793201065753165, + "grad_norm": 1.3790103197097778, + "learning_rate": 4.8008068258259144e-05, + "loss": 4.9531, + "step": 21511 + }, + { + "epoch": 0.12793795794081264, + "grad_norm": 1.2640241384506226, + "learning_rate": 4.800788554402355e-05, + "loss": 5.0281, + "step": 21512 + }, + { + "epoch": 0.12794390522409363, + "grad_norm": 1.2616617679595947, + "learning_rate": 4.800770282175615e-05, + "loss": 5.1131, + "step": 21513 + }, + { + "epoch": 0.12794985250737465, + "grad_norm": 1.7765449285507202, + "learning_rate": 4.800752009145699e-05, + "loss": 5.3388, + "step": 21514 + }, + { + "epoch": 0.12795579979065563, + "grad_norm": 1.4468929767608643, + "learning_rate": 4.800733735312615e-05, + "loss": 4.9308, + "step": 21515 + }, + { + "epoch": 0.12796174707393662, + "grad_norm": 1.286733865737915, + "learning_rate": 4.800715460676369e-05, + "loss": 5.0407, + "step": 21516 + }, + { + "epoch": 0.1279676943572176, + "grad_norm": 1.3074883222579956, + "learning_rate": 4.8006971852369665e-05, + "loss": 5.0364, + "step": 21517 + }, + { + "epoch": 0.12797364164049863, + "grad_norm": 1.2966744899749756, + "learning_rate": 4.8006789089944144e-05, + "loss": 5.0411, + "step": 21518 + }, + { + "epoch": 0.1279795889237796, + "grad_norm": 1.4764792919158936, + "learning_rate": 4.800660631948719e-05, + "loss": 5.0178, + "step": 21519 + }, + { + "epoch": 0.1279855362070606, + "grad_norm": 1.3073668479919434, + "learning_rate": 4.800642354099887e-05, + "loss": 4.8384, + "step": 21520 + }, + { + "epoch": 0.12799148349034162, + "grad_norm": 1.433164119720459, + "learning_rate": 4.800624075447924e-05, + "loss": 4.4844, + "step": 21521 + }, + { + "epoch": 0.1279974307736226, + "grad_norm": 1.435656189918518, + "learning_rate": 4.8006057959928375e-05, + "loss": 4.7067, + "step": 21522 + }, + { + "epoch": 0.1280033780569036, + "grad_norm": 1.2541238069534302, + "learning_rate": 4.800587515734632e-05, + "loss": 4.76, + "step": 21523 + }, + { + "epoch": 0.1280093253401846, + "grad_norm": 1.3341822624206543, + "learning_rate": 4.8005692346733166e-05, + "loss": 4.9485, + "step": 21524 + }, + { + "epoch": 0.1280152726234656, + "grad_norm": 1.1761771440505981, + "learning_rate": 4.8005509528088963e-05, + "loss": 4.9416, + "step": 21525 + }, + { + "epoch": 0.1280212199067466, + "grad_norm": 1.490059494972229, + "learning_rate": 4.8005326701413764e-05, + "loss": 4.5864, + "step": 21526 + }, + { + "epoch": 0.1280271671900276, + "grad_norm": 1.4474053382873535, + "learning_rate": 4.8005143866707656e-05, + "loss": 4.3612, + "step": 21527 + }, + { + "epoch": 0.1280331144733086, + "grad_norm": 1.4138057231903076, + "learning_rate": 4.800496102397068e-05, + "loss": 4.7795, + "step": 21528 + }, + { + "epoch": 0.12803906175658958, + "grad_norm": 1.3671265840530396, + "learning_rate": 4.8004778173202915e-05, + "loss": 4.8096, + "step": 21529 + }, + { + "epoch": 0.1280450090398706, + "grad_norm": 1.3463077545166016, + "learning_rate": 4.800459531440441e-05, + "loss": 4.4858, + "step": 21530 + }, + { + "epoch": 0.12805095632315158, + "grad_norm": 1.2250823974609375, + "learning_rate": 4.800441244757525e-05, + "loss": 4.7394, + "step": 21531 + }, + { + "epoch": 0.12805690360643257, + "grad_norm": 1.4103713035583496, + "learning_rate": 4.800422957271548e-05, + "loss": 4.8084, + "step": 21532 + }, + { + "epoch": 0.1280628508897136, + "grad_norm": 1.3920261859893799, + "learning_rate": 4.800404668982518e-05, + "loss": 4.9744, + "step": 21533 + }, + { + "epoch": 0.12806879817299457, + "grad_norm": 1.2541594505310059, + "learning_rate": 4.8003863798904395e-05, + "loss": 5.024, + "step": 21534 + }, + { + "epoch": 0.12807474545627556, + "grad_norm": 1.2717599868774414, + "learning_rate": 4.80036808999532e-05, + "loss": 4.9402, + "step": 21535 + }, + { + "epoch": 0.12808069273955658, + "grad_norm": 1.168628215789795, + "learning_rate": 4.8003497992971656e-05, + "loss": 4.8391, + "step": 21536 + }, + { + "epoch": 0.12808664002283757, + "grad_norm": 1.2135813236236572, + "learning_rate": 4.800331507795984e-05, + "loss": 4.9725, + "step": 21537 + }, + { + "epoch": 0.12809258730611855, + "grad_norm": 1.2984068393707275, + "learning_rate": 4.8003132154917795e-05, + "loss": 4.8946, + "step": 21538 + }, + { + "epoch": 0.12809853458939957, + "grad_norm": 1.1610583066940308, + "learning_rate": 4.8002949223845595e-05, + "loss": 4.9362, + "step": 21539 + }, + { + "epoch": 0.12810448187268056, + "grad_norm": 1.357981562614441, + "learning_rate": 4.8002766284743306e-05, + "loss": 4.6936, + "step": 21540 + }, + { + "epoch": 0.12811042915596155, + "grad_norm": 1.437784194946289, + "learning_rate": 4.800258333761098e-05, + "loss": 4.7942, + "step": 21541 + }, + { + "epoch": 0.12811637643924256, + "grad_norm": 1.364261507987976, + "learning_rate": 4.8002400382448704e-05, + "loss": 4.763, + "step": 21542 + }, + { + "epoch": 0.12812232372252355, + "grad_norm": 1.3244688510894775, + "learning_rate": 4.800221741925652e-05, + "loss": 4.8804, + "step": 21543 + }, + { + "epoch": 0.12812827100580454, + "grad_norm": 1.5480263233184814, + "learning_rate": 4.80020344480345e-05, + "loss": 4.6523, + "step": 21544 + }, + { + "epoch": 0.12813421828908556, + "grad_norm": 1.2875494956970215, + "learning_rate": 4.800185146878271e-05, + "loss": 4.6137, + "step": 21545 + }, + { + "epoch": 0.12814016557236654, + "grad_norm": 1.1969667673110962, + "learning_rate": 4.80016684815012e-05, + "loss": 5.1034, + "step": 21546 + }, + { + "epoch": 0.12814611285564753, + "grad_norm": 1.3188492059707642, + "learning_rate": 4.8001485486190064e-05, + "loss": 5.078, + "step": 21547 + }, + { + "epoch": 0.12815206013892855, + "grad_norm": 1.2246590852737427, + "learning_rate": 4.800130248284934e-05, + "loss": 5.0404, + "step": 21548 + }, + { + "epoch": 0.12815800742220954, + "grad_norm": 1.2853569984436035, + "learning_rate": 4.800111947147909e-05, + "loss": 4.9271, + "step": 21549 + }, + { + "epoch": 0.12816395470549052, + "grad_norm": 1.1865004301071167, + "learning_rate": 4.8000936452079395e-05, + "loss": 4.8657, + "step": 21550 + }, + { + "epoch": 0.12816990198877154, + "grad_norm": 1.4134557247161865, + "learning_rate": 4.8000753424650306e-05, + "loss": 4.5964, + "step": 21551 + }, + { + "epoch": 0.12817584927205253, + "grad_norm": 1.3943791389465332, + "learning_rate": 4.8000570389191894e-05, + "loss": 4.7792, + "step": 21552 + }, + { + "epoch": 0.12818179655533352, + "grad_norm": 1.5506455898284912, + "learning_rate": 4.8000387345704225e-05, + "loss": 4.767, + "step": 21553 + }, + { + "epoch": 0.12818774383861453, + "grad_norm": 1.516860008239746, + "learning_rate": 4.8000204294187356e-05, + "loss": 4.8412, + "step": 21554 + }, + { + "epoch": 0.12819369112189552, + "grad_norm": 1.3515304327011108, + "learning_rate": 4.8000021234641345e-05, + "loss": 4.7443, + "step": 21555 + }, + { + "epoch": 0.1281996384051765, + "grad_norm": 1.4094910621643066, + "learning_rate": 4.7999838167066276e-05, + "loss": 4.8343, + "step": 21556 + }, + { + "epoch": 0.12820558568845752, + "grad_norm": 1.3746453523635864, + "learning_rate": 4.7999655091462195e-05, + "loss": 4.6913, + "step": 21557 + }, + { + "epoch": 0.1282115329717385, + "grad_norm": 1.4625654220581055, + "learning_rate": 4.799947200782917e-05, + "loss": 4.8412, + "step": 21558 + }, + { + "epoch": 0.1282174802550195, + "grad_norm": 1.3790411949157715, + "learning_rate": 4.7999288916167275e-05, + "loss": 4.5777, + "step": 21559 + }, + { + "epoch": 0.12822342753830052, + "grad_norm": 1.4020804166793823, + "learning_rate": 4.799910581647656e-05, + "loss": 4.8728, + "step": 21560 + }, + { + "epoch": 0.1282293748215815, + "grad_norm": 1.2850565910339355, + "learning_rate": 4.799892270875709e-05, + "loss": 4.9687, + "step": 21561 + }, + { + "epoch": 0.1282353221048625, + "grad_norm": 1.4895892143249512, + "learning_rate": 4.799873959300894e-05, + "loss": 4.9786, + "step": 21562 + }, + { + "epoch": 0.1282412693881435, + "grad_norm": 1.149808406829834, + "learning_rate": 4.799855646923217e-05, + "loss": 4.9924, + "step": 21563 + }, + { + "epoch": 0.1282472166714245, + "grad_norm": 1.3952314853668213, + "learning_rate": 4.799837333742684e-05, + "loss": 4.9225, + "step": 21564 + }, + { + "epoch": 0.12825316395470548, + "grad_norm": 1.271844744682312, + "learning_rate": 4.799819019759301e-05, + "loss": 4.9967, + "step": 21565 + }, + { + "epoch": 0.1282591112379865, + "grad_norm": 1.3351553678512573, + "learning_rate": 4.799800704973075e-05, + "loss": 4.9089, + "step": 21566 + }, + { + "epoch": 0.1282650585212675, + "grad_norm": 1.2077351808547974, + "learning_rate": 4.799782389384013e-05, + "loss": 4.8948, + "step": 21567 + }, + { + "epoch": 0.12827100580454848, + "grad_norm": 1.6159747838974, + "learning_rate": 4.79976407299212e-05, + "loss": 4.6636, + "step": 21568 + }, + { + "epoch": 0.1282769530878295, + "grad_norm": 1.4904805421829224, + "learning_rate": 4.7997457557974035e-05, + "loss": 4.8164, + "step": 21569 + }, + { + "epoch": 0.12828290037111048, + "grad_norm": 1.2312726974487305, + "learning_rate": 4.79972743779987e-05, + "loss": 4.8022, + "step": 21570 + }, + { + "epoch": 0.12828884765439147, + "grad_norm": 1.3150570392608643, + "learning_rate": 4.799709118999525e-05, + "loss": 4.7237, + "step": 21571 + }, + { + "epoch": 0.12829479493767248, + "grad_norm": 1.441749930381775, + "learning_rate": 4.799690799396375e-05, + "loss": 4.8704, + "step": 21572 + }, + { + "epoch": 0.12830074222095347, + "grad_norm": 1.4237558841705322, + "learning_rate": 4.799672478990427e-05, + "loss": 4.9428, + "step": 21573 + }, + { + "epoch": 0.12830668950423446, + "grad_norm": 1.5440024137496948, + "learning_rate": 4.7996541577816867e-05, + "loss": 4.7546, + "step": 21574 + }, + { + "epoch": 0.12831263678751545, + "grad_norm": 1.2962610721588135, + "learning_rate": 4.799635835770161e-05, + "loss": 4.9324, + "step": 21575 + }, + { + "epoch": 0.12831858407079647, + "grad_norm": 2.1041312217712402, + "learning_rate": 4.799617512955857e-05, + "loss": 5.2894, + "step": 21576 + }, + { + "epoch": 0.12832453135407745, + "grad_norm": 1.3591945171356201, + "learning_rate": 4.7995991893387796e-05, + "loss": 4.6942, + "step": 21577 + }, + { + "epoch": 0.12833047863735844, + "grad_norm": 1.2474287748336792, + "learning_rate": 4.799580864918936e-05, + "loss": 5.0003, + "step": 21578 + }, + { + "epoch": 0.12833642592063946, + "grad_norm": 1.4604638814926147, + "learning_rate": 4.7995625396963326e-05, + "loss": 4.8608, + "step": 21579 + }, + { + "epoch": 0.12834237320392045, + "grad_norm": 1.5033100843429565, + "learning_rate": 4.7995442136709755e-05, + "loss": 4.9221, + "step": 21580 + }, + { + "epoch": 0.12834832048720143, + "grad_norm": 1.4712806940078735, + "learning_rate": 4.799525886842872e-05, + "loss": 4.9657, + "step": 21581 + }, + { + "epoch": 0.12835426777048245, + "grad_norm": 1.4505717754364014, + "learning_rate": 4.799507559212026e-05, + "loss": 4.7913, + "step": 21582 + }, + { + "epoch": 0.12836021505376344, + "grad_norm": 1.6151630878448486, + "learning_rate": 4.7994892307784466e-05, + "loss": 4.6494, + "step": 21583 + }, + { + "epoch": 0.12836616233704443, + "grad_norm": 1.5356489419937134, + "learning_rate": 4.79947090154214e-05, + "loss": 4.5596, + "step": 21584 + }, + { + "epoch": 0.12837210962032544, + "grad_norm": 1.5046836137771606, + "learning_rate": 4.7994525715031114e-05, + "loss": 4.6486, + "step": 21585 + }, + { + "epoch": 0.12837805690360643, + "grad_norm": 1.413750171661377, + "learning_rate": 4.799434240661367e-05, + "loss": 4.8878, + "step": 21586 + }, + { + "epoch": 0.12838400418688742, + "grad_norm": 1.3955304622650146, + "learning_rate": 4.799415909016915e-05, + "loss": 5.1577, + "step": 21587 + }, + { + "epoch": 0.12838995147016843, + "grad_norm": 1.5791069269180298, + "learning_rate": 4.79939757656976e-05, + "loss": 5.1712, + "step": 21588 + }, + { + "epoch": 0.12839589875344942, + "grad_norm": 1.3384202718734741, + "learning_rate": 4.799379243319909e-05, + "loss": 5.1534, + "step": 21589 + }, + { + "epoch": 0.1284018460367304, + "grad_norm": 1.4390661716461182, + "learning_rate": 4.7993609092673684e-05, + "loss": 5.3616, + "step": 21590 + }, + { + "epoch": 0.12840779332001143, + "grad_norm": 1.3923462629318237, + "learning_rate": 4.799342574412145e-05, + "loss": 5.2225, + "step": 21591 + }, + { + "epoch": 0.12841374060329241, + "grad_norm": 1.2241096496582031, + "learning_rate": 4.799324238754245e-05, + "loss": 5.2419, + "step": 21592 + }, + { + "epoch": 0.1284196878865734, + "grad_norm": 1.3041672706604004, + "learning_rate": 4.799305902293674e-05, + "loss": 5.0903, + "step": 21593 + }, + { + "epoch": 0.12842563516985442, + "grad_norm": 1.2822580337524414, + "learning_rate": 4.799287565030439e-05, + "loss": 5.1304, + "step": 21594 + }, + { + "epoch": 0.1284315824531354, + "grad_norm": 1.4155261516571045, + "learning_rate": 4.7992692269645475e-05, + "loss": 5.2332, + "step": 21595 + }, + { + "epoch": 0.1284375297364164, + "grad_norm": 1.4972230195999146, + "learning_rate": 4.799250888096004e-05, + "loss": 5.0588, + "step": 21596 + }, + { + "epoch": 0.1284434770196974, + "grad_norm": 1.3301728963851929, + "learning_rate": 4.799232548424816e-05, + "loss": 5.0401, + "step": 21597 + }, + { + "epoch": 0.1284494243029784, + "grad_norm": 1.2775028944015503, + "learning_rate": 4.799214207950989e-05, + "loss": 4.877, + "step": 21598 + }, + { + "epoch": 0.1284553715862594, + "grad_norm": 1.1996419429779053, + "learning_rate": 4.799195866674532e-05, + "loss": 4.9223, + "step": 21599 + }, + { + "epoch": 0.1284613188695404, + "grad_norm": 1.1330626010894775, + "learning_rate": 4.7991775245954477e-05, + "loss": 4.9224, + "step": 21600 + }, + { + "epoch": 0.1284672661528214, + "grad_norm": 1.3013830184936523, + "learning_rate": 4.7991591817137446e-05, + "loss": 5.1005, + "step": 21601 + }, + { + "epoch": 0.12847321343610238, + "grad_norm": 1.2901992797851562, + "learning_rate": 4.79914083802943e-05, + "loss": 4.9554, + "step": 21602 + }, + { + "epoch": 0.1284791607193834, + "grad_norm": 1.4342957735061646, + "learning_rate": 4.799122493542507e-05, + "loss": 4.9685, + "step": 21603 + }, + { + "epoch": 0.12848510800266438, + "grad_norm": 1.2227423191070557, + "learning_rate": 4.7991041482529856e-05, + "loss": 4.9219, + "step": 21604 + }, + { + "epoch": 0.12849105528594537, + "grad_norm": 1.2947163581848145, + "learning_rate": 4.7990858021608705e-05, + "loss": 4.9747, + "step": 21605 + }, + { + "epoch": 0.1284970025692264, + "grad_norm": 1.2928695678710938, + "learning_rate": 4.799067455266168e-05, + "loss": 5.0456, + "step": 21606 + }, + { + "epoch": 0.12850294985250738, + "grad_norm": 1.461930513381958, + "learning_rate": 4.799049107568885e-05, + "loss": 4.8518, + "step": 21607 + }, + { + "epoch": 0.12850889713578836, + "grad_norm": 1.4009983539581299, + "learning_rate": 4.799030759069028e-05, + "loss": 4.8761, + "step": 21608 + }, + { + "epoch": 0.12851484441906938, + "grad_norm": 1.2762218713760376, + "learning_rate": 4.799012409766602e-05, + "loss": 4.8551, + "step": 21609 + }, + { + "epoch": 0.12852079170235037, + "grad_norm": 1.3359547853469849, + "learning_rate": 4.7989940596616156e-05, + "loss": 4.7933, + "step": 21610 + }, + { + "epoch": 0.12852673898563136, + "grad_norm": 1.4515223503112793, + "learning_rate": 4.7989757087540735e-05, + "loss": 4.8432, + "step": 21611 + }, + { + "epoch": 0.12853268626891237, + "grad_norm": 1.445410966873169, + "learning_rate": 4.7989573570439825e-05, + "loss": 5.0115, + "step": 21612 + }, + { + "epoch": 0.12853863355219336, + "grad_norm": 1.4424355030059814, + "learning_rate": 4.79893900453135e-05, + "loss": 4.9, + "step": 21613 + }, + { + "epoch": 0.12854458083547435, + "grad_norm": 1.2938885688781738, + "learning_rate": 4.798920651216182e-05, + "loss": 4.7918, + "step": 21614 + }, + { + "epoch": 0.12855052811875536, + "grad_norm": 1.3097805976867676, + "learning_rate": 4.798902297098484e-05, + "loss": 4.7449, + "step": 21615 + }, + { + "epoch": 0.12855647540203635, + "grad_norm": 1.5416840314865112, + "learning_rate": 4.798883942178263e-05, + "loss": 5.3092, + "step": 21616 + }, + { + "epoch": 0.12856242268531734, + "grad_norm": 1.339882493019104, + "learning_rate": 4.798865586455525e-05, + "loss": 5.2832, + "step": 21617 + }, + { + "epoch": 0.12856836996859836, + "grad_norm": 1.2793277502059937, + "learning_rate": 4.7988472299302764e-05, + "loss": 4.9532, + "step": 21618 + }, + { + "epoch": 0.12857431725187934, + "grad_norm": 1.3368133306503296, + "learning_rate": 4.7988288726025254e-05, + "loss": 5.0795, + "step": 21619 + }, + { + "epoch": 0.12858026453516033, + "grad_norm": 1.4083633422851562, + "learning_rate": 4.7988105144722764e-05, + "loss": 5.3231, + "step": 21620 + }, + { + "epoch": 0.12858621181844135, + "grad_norm": 1.4018146991729736, + "learning_rate": 4.7987921555395356e-05, + "loss": 5.0031, + "step": 21621 + }, + { + "epoch": 0.12859215910172234, + "grad_norm": 1.2982511520385742, + "learning_rate": 4.798773795804311e-05, + "loss": 4.9553, + "step": 21622 + }, + { + "epoch": 0.12859810638500332, + "grad_norm": 1.2939512729644775, + "learning_rate": 4.798755435266607e-05, + "loss": 4.9096, + "step": 21623 + }, + { + "epoch": 0.12860405366828434, + "grad_norm": 1.2920591831207275, + "learning_rate": 4.7987370739264334e-05, + "loss": 4.8198, + "step": 21624 + }, + { + "epoch": 0.12861000095156533, + "grad_norm": 1.537635326385498, + "learning_rate": 4.798718711783793e-05, + "loss": 4.9656, + "step": 21625 + }, + { + "epoch": 0.12861594823484632, + "grad_norm": 1.4374878406524658, + "learning_rate": 4.798700348838694e-05, + "loss": 5.022, + "step": 21626 + }, + { + "epoch": 0.12862189551812733, + "grad_norm": 1.4768397808074951, + "learning_rate": 4.798681985091142e-05, + "loss": 5.1965, + "step": 21627 + }, + { + "epoch": 0.12862784280140832, + "grad_norm": 1.370009183883667, + "learning_rate": 4.798663620541145e-05, + "loss": 5.049, + "step": 21628 + }, + { + "epoch": 0.1286337900846893, + "grad_norm": 1.309531569480896, + "learning_rate": 4.7986452551887076e-05, + "loss": 4.9583, + "step": 21629 + }, + { + "epoch": 0.12863973736797032, + "grad_norm": 1.3303570747375488, + "learning_rate": 4.7986268890338365e-05, + "loss": 5.0708, + "step": 21630 + }, + { + "epoch": 0.1286456846512513, + "grad_norm": 1.389640212059021, + "learning_rate": 4.7986085220765385e-05, + "loss": 5.0744, + "step": 21631 + }, + { + "epoch": 0.1286516319345323, + "grad_norm": 1.198508620262146, + "learning_rate": 4.798590154316821e-05, + "loss": 5.0152, + "step": 21632 + }, + { + "epoch": 0.1286575792178133, + "grad_norm": 1.3534667491912842, + "learning_rate": 4.7985717857546886e-05, + "loss": 5.0292, + "step": 21633 + }, + { + "epoch": 0.1286635265010943, + "grad_norm": 1.4618093967437744, + "learning_rate": 4.798553416390149e-05, + "loss": 5.0733, + "step": 21634 + }, + { + "epoch": 0.1286694737843753, + "grad_norm": 1.4006026983261108, + "learning_rate": 4.798535046223207e-05, + "loss": 5.0071, + "step": 21635 + }, + { + "epoch": 0.12867542106765628, + "grad_norm": 1.4667402505874634, + "learning_rate": 4.7985166752538714e-05, + "loss": 4.8829, + "step": 21636 + }, + { + "epoch": 0.1286813683509373, + "grad_norm": 1.2916743755340576, + "learning_rate": 4.798498303482147e-05, + "loss": 4.9049, + "step": 21637 + }, + { + "epoch": 0.12868731563421829, + "grad_norm": 1.400270700454712, + "learning_rate": 4.798479930908041e-05, + "loss": 5.1051, + "step": 21638 + }, + { + "epoch": 0.12869326291749927, + "grad_norm": 1.3317632675170898, + "learning_rate": 4.798461557531558e-05, + "loss": 4.7864, + "step": 21639 + }, + { + "epoch": 0.1286992102007803, + "grad_norm": 1.1226558685302734, + "learning_rate": 4.7984431833527074e-05, + "loss": 4.8598, + "step": 21640 + }, + { + "epoch": 0.12870515748406128, + "grad_norm": 1.2921690940856934, + "learning_rate": 4.7984248083714934e-05, + "loss": 4.8687, + "step": 21641 + }, + { + "epoch": 0.12871110476734227, + "grad_norm": 1.2811640501022339, + "learning_rate": 4.798406432587923e-05, + "loss": 4.7438, + "step": 21642 + }, + { + "epoch": 0.12871705205062328, + "grad_norm": 1.1892732381820679, + "learning_rate": 4.7983880560020026e-05, + "loss": 4.681, + "step": 21643 + }, + { + "epoch": 0.12872299933390427, + "grad_norm": 1.3800525665283203, + "learning_rate": 4.7983696786137386e-05, + "loss": 4.9215, + "step": 21644 + }, + { + "epoch": 0.12872894661718526, + "grad_norm": 1.2753770351409912, + "learning_rate": 4.7983513004231385e-05, + "loss": 5.0006, + "step": 21645 + }, + { + "epoch": 0.12873489390046627, + "grad_norm": 1.494894027709961, + "learning_rate": 4.7983329214302064e-05, + "loss": 4.9356, + "step": 21646 + }, + { + "epoch": 0.12874084118374726, + "grad_norm": 1.3660098314285278, + "learning_rate": 4.7983145416349505e-05, + "loss": 5.3071, + "step": 21647 + }, + { + "epoch": 0.12874678846702825, + "grad_norm": 1.3494385480880737, + "learning_rate": 4.798296161037377e-05, + "loss": 5.3493, + "step": 21648 + }, + { + "epoch": 0.12875273575030927, + "grad_norm": 1.2632153034210205, + "learning_rate": 4.798277779637492e-05, + "loss": 4.9825, + "step": 21649 + }, + { + "epoch": 0.12875868303359025, + "grad_norm": 1.3519765138626099, + "learning_rate": 4.7982593974353015e-05, + "loss": 4.9032, + "step": 21650 + }, + { + "epoch": 0.12876463031687124, + "grad_norm": 1.3728691339492798, + "learning_rate": 4.798241014430813e-05, + "loss": 5.0458, + "step": 21651 + }, + { + "epoch": 0.12877057760015226, + "grad_norm": 1.326675295829773, + "learning_rate": 4.798222630624032e-05, + "loss": 4.9129, + "step": 21652 + }, + { + "epoch": 0.12877652488343325, + "grad_norm": 1.4878405332565308, + "learning_rate": 4.798204246014965e-05, + "loss": 5.1253, + "step": 21653 + }, + { + "epoch": 0.12878247216671423, + "grad_norm": 1.322288990020752, + "learning_rate": 4.798185860603619e-05, + "loss": 5.1333, + "step": 21654 + }, + { + "epoch": 0.12878841944999525, + "grad_norm": 1.496812343597412, + "learning_rate": 4.7981674743899995e-05, + "loss": 5.0263, + "step": 21655 + }, + { + "epoch": 0.12879436673327624, + "grad_norm": 1.4336779117584229, + "learning_rate": 4.7981490873741144e-05, + "loss": 5.1177, + "step": 21656 + }, + { + "epoch": 0.12880031401655723, + "grad_norm": 1.380751132965088, + "learning_rate": 4.7981306995559684e-05, + "loss": 5.0884, + "step": 21657 + }, + { + "epoch": 0.12880626129983824, + "grad_norm": 1.3929660320281982, + "learning_rate": 4.798112310935569e-05, + "loss": 5.3662, + "step": 21658 + }, + { + "epoch": 0.12881220858311923, + "grad_norm": 1.2857346534729004, + "learning_rate": 4.798093921512923e-05, + "loss": 5.2264, + "step": 21659 + }, + { + "epoch": 0.12881815586640022, + "grad_norm": 1.2468816041946411, + "learning_rate": 4.798075531288035e-05, + "loss": 4.8248, + "step": 21660 + }, + { + "epoch": 0.12882410314968123, + "grad_norm": 1.43264901638031, + "learning_rate": 4.798057140260913e-05, + "loss": 5.3999, + "step": 21661 + }, + { + "epoch": 0.12883005043296222, + "grad_norm": 1.3590344190597534, + "learning_rate": 4.798038748431563e-05, + "loss": 5.1312, + "step": 21662 + }, + { + "epoch": 0.1288359977162432, + "grad_norm": 1.4812084436416626, + "learning_rate": 4.7980203557999915e-05, + "loss": 4.7615, + "step": 21663 + }, + { + "epoch": 0.12884194499952423, + "grad_norm": 1.4256600141525269, + "learning_rate": 4.798001962366205e-05, + "loss": 4.8678, + "step": 21664 + }, + { + "epoch": 0.12884789228280522, + "grad_norm": 1.1849418878555298, + "learning_rate": 4.7979835681302095e-05, + "loss": 4.8823, + "step": 21665 + }, + { + "epoch": 0.1288538395660862, + "grad_norm": 1.395228385925293, + "learning_rate": 4.7979651730920116e-05, + "loss": 4.682, + "step": 21666 + }, + { + "epoch": 0.12885978684936722, + "grad_norm": 1.2800064086914062, + "learning_rate": 4.7979467772516186e-05, + "loss": 4.7797, + "step": 21667 + }, + { + "epoch": 0.1288657341326482, + "grad_norm": 1.3429536819458008, + "learning_rate": 4.7979283806090346e-05, + "loss": 4.7517, + "step": 21668 + }, + { + "epoch": 0.1288716814159292, + "grad_norm": 1.359732747077942, + "learning_rate": 4.797909983164269e-05, + "loss": 4.7123, + "step": 21669 + }, + { + "epoch": 0.1288776286992102, + "grad_norm": 1.2731539011001587, + "learning_rate": 4.7978915849173254e-05, + "loss": 4.7211, + "step": 21670 + }, + { + "epoch": 0.1288835759824912, + "grad_norm": 1.3688287734985352, + "learning_rate": 4.797873185868213e-05, + "loss": 4.7257, + "step": 21671 + }, + { + "epoch": 0.1288895232657722, + "grad_norm": 1.4043165445327759, + "learning_rate": 4.797854786016936e-05, + "loss": 4.8099, + "step": 21672 + }, + { + "epoch": 0.1288954705490532, + "grad_norm": 1.3721412420272827, + "learning_rate": 4.797836385363502e-05, + "loss": 4.7698, + "step": 21673 + }, + { + "epoch": 0.1289014178323342, + "grad_norm": 1.4348787069320679, + "learning_rate": 4.797817983907917e-05, + "loss": 4.7587, + "step": 21674 + }, + { + "epoch": 0.12890736511561518, + "grad_norm": 1.133793592453003, + "learning_rate": 4.797799581650187e-05, + "loss": 4.8101, + "step": 21675 + }, + { + "epoch": 0.1289133123988962, + "grad_norm": 1.3624104261398315, + "learning_rate": 4.797781178590319e-05, + "loss": 4.7416, + "step": 21676 + }, + { + "epoch": 0.12891925968217718, + "grad_norm": 1.5194214582443237, + "learning_rate": 4.7977627747283196e-05, + "loss": 4.6894, + "step": 21677 + }, + { + "epoch": 0.12892520696545817, + "grad_norm": 1.3625789880752563, + "learning_rate": 4.7977443700641954e-05, + "loss": 4.8029, + "step": 21678 + }, + { + "epoch": 0.1289311542487392, + "grad_norm": 1.2961907386779785, + "learning_rate": 4.797725964597952e-05, + "loss": 4.718, + "step": 21679 + }, + { + "epoch": 0.12893710153202018, + "grad_norm": 1.4091925621032715, + "learning_rate": 4.797707558329596e-05, + "loss": 4.7604, + "step": 21680 + }, + { + "epoch": 0.12894304881530116, + "grad_norm": 1.2274402379989624, + "learning_rate": 4.797689151259134e-05, + "loss": 4.8241, + "step": 21681 + }, + { + "epoch": 0.12894899609858218, + "grad_norm": 1.3694384098052979, + "learning_rate": 4.797670743386573e-05, + "loss": 4.7724, + "step": 21682 + }, + { + "epoch": 0.12895494338186317, + "grad_norm": 1.3621066808700562, + "learning_rate": 4.7976523347119184e-05, + "loss": 4.685, + "step": 21683 + }, + { + "epoch": 0.12896089066514416, + "grad_norm": 1.418641209602356, + "learning_rate": 4.7976339252351766e-05, + "loss": 4.7379, + "step": 21684 + }, + { + "epoch": 0.12896683794842517, + "grad_norm": 1.3113913536071777, + "learning_rate": 4.797615514956355e-05, + "loss": 4.7922, + "step": 21685 + }, + { + "epoch": 0.12897278523170616, + "grad_norm": 1.3266078233718872, + "learning_rate": 4.79759710387546e-05, + "loss": 4.7116, + "step": 21686 + }, + { + "epoch": 0.12897873251498715, + "grad_norm": 1.5212455987930298, + "learning_rate": 4.7975786919924975e-05, + "loss": 4.8422, + "step": 21687 + }, + { + "epoch": 0.12898467979826816, + "grad_norm": 1.225883960723877, + "learning_rate": 4.797560279307473e-05, + "loss": 4.8641, + "step": 21688 + }, + { + "epoch": 0.12899062708154915, + "grad_norm": 1.451951026916504, + "learning_rate": 4.797541865820395e-05, + "loss": 4.7685, + "step": 21689 + }, + { + "epoch": 0.12899657436483014, + "grad_norm": 1.3755689859390259, + "learning_rate": 4.7975234515312694e-05, + "loss": 4.7828, + "step": 21690 + }, + { + "epoch": 0.12900252164811113, + "grad_norm": 1.2667524814605713, + "learning_rate": 4.797505036440101e-05, + "loss": 4.6897, + "step": 21691 + }, + { + "epoch": 0.12900846893139215, + "grad_norm": 1.4491240978240967, + "learning_rate": 4.797486620546898e-05, + "loss": 4.8052, + "step": 21692 + }, + { + "epoch": 0.12901441621467313, + "grad_norm": 1.21664559841156, + "learning_rate": 4.797468203851665e-05, + "loss": 4.712, + "step": 21693 + }, + { + "epoch": 0.12902036349795412, + "grad_norm": 1.3836992979049683, + "learning_rate": 4.797449786354411e-05, + "loss": 4.6642, + "step": 21694 + }, + { + "epoch": 0.12902631078123514, + "grad_norm": 1.4487723112106323, + "learning_rate": 4.79743136805514e-05, + "loss": 4.7088, + "step": 21695 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 1.2507479190826416, + "learning_rate": 4.79741294895386e-05, + "loss": 4.8429, + "step": 21696 + }, + { + "epoch": 0.1290382053477971, + "grad_norm": 1.231549620628357, + "learning_rate": 4.7973945290505766e-05, + "loss": 4.9336, + "step": 21697 + }, + { + "epoch": 0.12904415263107813, + "grad_norm": 1.2317709922790527, + "learning_rate": 4.797376108345297e-05, + "loss": 4.6885, + "step": 21698 + }, + { + "epoch": 0.12905009991435912, + "grad_norm": 1.2158896923065186, + "learning_rate": 4.797357686838026e-05, + "loss": 4.7068, + "step": 21699 + }, + { + "epoch": 0.1290560471976401, + "grad_norm": 1.5054548978805542, + "learning_rate": 4.7973392645287726e-05, + "loss": 4.8568, + "step": 21700 + }, + { + "epoch": 0.12906199448092112, + "grad_norm": 1.1551764011383057, + "learning_rate": 4.7973208414175406e-05, + "loss": 4.6746, + "step": 21701 + }, + { + "epoch": 0.1290679417642021, + "grad_norm": 1.3304046392440796, + "learning_rate": 4.7973024175043386e-05, + "loss": 4.8012, + "step": 21702 + }, + { + "epoch": 0.1290738890474831, + "grad_norm": 1.4763063192367554, + "learning_rate": 4.797283992789172e-05, + "loss": 4.7412, + "step": 21703 + }, + { + "epoch": 0.12907983633076411, + "grad_norm": 1.247894287109375, + "learning_rate": 4.797265567272047e-05, + "loss": 4.7786, + "step": 21704 + }, + { + "epoch": 0.1290857836140451, + "grad_norm": 1.3668314218521118, + "learning_rate": 4.79724714095297e-05, + "loss": 4.7728, + "step": 21705 + }, + { + "epoch": 0.1290917308973261, + "grad_norm": 1.3727326393127441, + "learning_rate": 4.7972287138319477e-05, + "loss": 4.8493, + "step": 21706 + }, + { + "epoch": 0.1290976781806071, + "grad_norm": 1.3531663417816162, + "learning_rate": 4.797210285908987e-05, + "loss": 4.7598, + "step": 21707 + }, + { + "epoch": 0.1291036254638881, + "grad_norm": 1.4899832010269165, + "learning_rate": 4.797191857184094e-05, + "loss": 4.7274, + "step": 21708 + }, + { + "epoch": 0.12910957274716908, + "grad_norm": 1.3908995389938354, + "learning_rate": 4.7971734276572744e-05, + "loss": 4.9911, + "step": 21709 + }, + { + "epoch": 0.1291155200304501, + "grad_norm": 1.621774435043335, + "learning_rate": 4.7971549973285357e-05, + "loss": 5.0285, + "step": 21710 + }, + { + "epoch": 0.1291214673137311, + "grad_norm": 1.415650725364685, + "learning_rate": 4.797136566197884e-05, + "loss": 5.0959, + "step": 21711 + }, + { + "epoch": 0.12912741459701207, + "grad_norm": 1.4947463274002075, + "learning_rate": 4.797118134265326e-05, + "loss": 4.9473, + "step": 21712 + }, + { + "epoch": 0.1291333618802931, + "grad_norm": 1.45017409324646, + "learning_rate": 4.7970997015308674e-05, + "loss": 4.9928, + "step": 21713 + }, + { + "epoch": 0.12913930916357408, + "grad_norm": 1.2527333498001099, + "learning_rate": 4.7970812679945145e-05, + "loss": 4.9771, + "step": 21714 + }, + { + "epoch": 0.12914525644685507, + "grad_norm": 1.457526683807373, + "learning_rate": 4.797062833656275e-05, + "loss": 5.0143, + "step": 21715 + }, + { + "epoch": 0.12915120373013608, + "grad_norm": 1.1785821914672852, + "learning_rate": 4.7970443985161546e-05, + "loss": 5.1313, + "step": 21716 + }, + { + "epoch": 0.12915715101341707, + "grad_norm": 1.3593906164169312, + "learning_rate": 4.79702596257416e-05, + "loss": 5.0906, + "step": 21717 + }, + { + "epoch": 0.12916309829669806, + "grad_norm": 1.3789628744125366, + "learning_rate": 4.797007525830296e-05, + "loss": 5.0352, + "step": 21718 + }, + { + "epoch": 0.12916904557997907, + "grad_norm": 1.282631278038025, + "learning_rate": 4.796989088284571e-05, + "loss": 4.9973, + "step": 21719 + }, + { + "epoch": 0.12917499286326006, + "grad_norm": 1.2933098077774048, + "learning_rate": 4.796970649936991e-05, + "loss": 5.0783, + "step": 21720 + }, + { + "epoch": 0.12918094014654105, + "grad_norm": 1.3595205545425415, + "learning_rate": 4.796952210787563e-05, + "loss": 5.158, + "step": 21721 + }, + { + "epoch": 0.12918688742982207, + "grad_norm": 1.3962088823318481, + "learning_rate": 4.796933770836293e-05, + "loss": 4.9939, + "step": 21722 + }, + { + "epoch": 0.12919283471310306, + "grad_norm": 1.382554292678833, + "learning_rate": 4.796915330083186e-05, + "loss": 4.8864, + "step": 21723 + }, + { + "epoch": 0.12919878199638404, + "grad_norm": 1.3807674646377563, + "learning_rate": 4.7968968885282495e-05, + "loss": 5.0454, + "step": 21724 + }, + { + "epoch": 0.12920472927966506, + "grad_norm": 1.276809811592102, + "learning_rate": 4.7968784461714905e-05, + "loss": 5.1221, + "step": 21725 + }, + { + "epoch": 0.12921067656294605, + "grad_norm": 1.230714201927185, + "learning_rate": 4.796860003012915e-05, + "loss": 5.0382, + "step": 21726 + }, + { + "epoch": 0.12921662384622704, + "grad_norm": 1.2899225950241089, + "learning_rate": 4.796841559052529e-05, + "loss": 4.8591, + "step": 21727 + }, + { + "epoch": 0.12922257112950805, + "grad_norm": 1.3561869859695435, + "learning_rate": 4.79682311429034e-05, + "loss": 4.8811, + "step": 21728 + }, + { + "epoch": 0.12922851841278904, + "grad_norm": 1.600656270980835, + "learning_rate": 4.796804668726353e-05, + "loss": 4.9317, + "step": 21729 + }, + { + "epoch": 0.12923446569607003, + "grad_norm": 1.4110677242279053, + "learning_rate": 4.7967862223605756e-05, + "loss": 5.0964, + "step": 21730 + }, + { + "epoch": 0.12924041297935104, + "grad_norm": 1.2293707132339478, + "learning_rate": 4.796767775193014e-05, + "loss": 5.2952, + "step": 21731 + }, + { + "epoch": 0.12924636026263203, + "grad_norm": 1.4413278102874756, + "learning_rate": 4.796749327223674e-05, + "loss": 4.9628, + "step": 21732 + }, + { + "epoch": 0.12925230754591302, + "grad_norm": 1.4178003072738647, + "learning_rate": 4.7967308784525635e-05, + "loss": 4.7142, + "step": 21733 + }, + { + "epoch": 0.12925825482919404, + "grad_norm": 1.2427667379379272, + "learning_rate": 4.7967124288796875e-05, + "loss": 5.2655, + "step": 21734 + }, + { + "epoch": 0.12926420211247502, + "grad_norm": 1.3278542757034302, + "learning_rate": 4.796693978505052e-05, + "loss": 5.0156, + "step": 21735 + }, + { + "epoch": 0.129270149395756, + "grad_norm": 1.3728119134902954, + "learning_rate": 4.7966755273286656e-05, + "loss": 5.4176, + "step": 21736 + }, + { + "epoch": 0.12927609667903703, + "grad_norm": 1.344072937965393, + "learning_rate": 4.796657075350533e-05, + "loss": 4.8808, + "step": 21737 + }, + { + "epoch": 0.12928204396231802, + "grad_norm": 1.2877874374389648, + "learning_rate": 4.796638622570661e-05, + "loss": 5.0312, + "step": 21738 + }, + { + "epoch": 0.129287991245599, + "grad_norm": 1.3147602081298828, + "learning_rate": 4.7966201689890566e-05, + "loss": 5.0241, + "step": 21739 + }, + { + "epoch": 0.12929393852888002, + "grad_norm": 1.3858917951583862, + "learning_rate": 4.796601714605726e-05, + "loss": 4.468, + "step": 21740 + }, + { + "epoch": 0.129299885812161, + "grad_norm": 1.4089725017547607, + "learning_rate": 4.7965832594206747e-05, + "loss": 4.587, + "step": 21741 + }, + { + "epoch": 0.129305833095442, + "grad_norm": 1.4754424095153809, + "learning_rate": 4.796564803433911e-05, + "loss": 4.8697, + "step": 21742 + }, + { + "epoch": 0.129311780378723, + "grad_norm": 1.557544231414795, + "learning_rate": 4.796546346645439e-05, + "loss": 5.058, + "step": 21743 + }, + { + "epoch": 0.129317727662004, + "grad_norm": 1.3962191343307495, + "learning_rate": 4.7965278890552666e-05, + "loss": 5.172, + "step": 21744 + }, + { + "epoch": 0.129323674945285, + "grad_norm": 1.4976222515106201, + "learning_rate": 4.796509430663401e-05, + "loss": 5.2878, + "step": 21745 + }, + { + "epoch": 0.129329622228566, + "grad_norm": 1.3315789699554443, + "learning_rate": 4.796490971469847e-05, + "loss": 5.0468, + "step": 21746 + }, + { + "epoch": 0.129335569511847, + "grad_norm": 1.3718360662460327, + "learning_rate": 4.796472511474611e-05, + "loss": 4.9696, + "step": 21747 + }, + { + "epoch": 0.12934151679512798, + "grad_norm": 1.4873707294464111, + "learning_rate": 4.7964540506777014e-05, + "loss": 4.9281, + "step": 21748 + }, + { + "epoch": 0.12934746407840897, + "grad_norm": 1.3806785345077515, + "learning_rate": 4.7964355890791226e-05, + "loss": 5.1646, + "step": 21749 + }, + { + "epoch": 0.12935341136168998, + "grad_norm": 1.4873976707458496, + "learning_rate": 4.796417126678883e-05, + "loss": 5.1125, + "step": 21750 + }, + { + "epoch": 0.12935935864497097, + "grad_norm": 1.3314671516418457, + "learning_rate": 4.7963986634769864e-05, + "loss": 5.0819, + "step": 21751 + }, + { + "epoch": 0.12936530592825196, + "grad_norm": 1.2392772436141968, + "learning_rate": 4.796380199473442e-05, + "loss": 5.0049, + "step": 21752 + }, + { + "epoch": 0.12937125321153298, + "grad_norm": 1.4799960851669312, + "learning_rate": 4.7963617346682544e-05, + "loss": 4.8518, + "step": 21753 + }, + { + "epoch": 0.12937720049481397, + "grad_norm": 1.5646624565124512, + "learning_rate": 4.796343269061431e-05, + "loss": 4.5612, + "step": 21754 + }, + { + "epoch": 0.12938314777809495, + "grad_norm": 1.5001260042190552, + "learning_rate": 4.796324802652977e-05, + "loss": 4.8736, + "step": 21755 + }, + { + "epoch": 0.12938909506137597, + "grad_norm": 1.4235304594039917, + "learning_rate": 4.7963063354429004e-05, + "loss": 4.9256, + "step": 21756 + }, + { + "epoch": 0.12939504234465696, + "grad_norm": 1.3335869312286377, + "learning_rate": 4.7962878674312075e-05, + "loss": 4.7066, + "step": 21757 + }, + { + "epoch": 0.12940098962793795, + "grad_norm": 1.2664694786071777, + "learning_rate": 4.7962693986179036e-05, + "loss": 4.7202, + "step": 21758 + }, + { + "epoch": 0.12940693691121896, + "grad_norm": 1.2120671272277832, + "learning_rate": 4.7962509290029954e-05, + "loss": 4.8417, + "step": 21759 + }, + { + "epoch": 0.12941288419449995, + "grad_norm": 1.3657382726669312, + "learning_rate": 4.7962324585864906e-05, + "loss": 4.6566, + "step": 21760 + }, + { + "epoch": 0.12941883147778094, + "grad_norm": 1.3212461471557617, + "learning_rate": 4.7962139873683944e-05, + "loss": 4.8251, + "step": 21761 + }, + { + "epoch": 0.12942477876106195, + "grad_norm": 1.9045685529708862, + "learning_rate": 4.7961955153487137e-05, + "loss": 4.5268, + "step": 21762 + }, + { + "epoch": 0.12943072604434294, + "grad_norm": 1.536188006401062, + "learning_rate": 4.7961770425274545e-05, + "loss": 4.8356, + "step": 21763 + }, + { + "epoch": 0.12943667332762393, + "grad_norm": 1.4966436624526978, + "learning_rate": 4.796158568904624e-05, + "loss": 4.485, + "step": 21764 + }, + { + "epoch": 0.12944262061090495, + "grad_norm": 1.377543568611145, + "learning_rate": 4.796140094480228e-05, + "loss": 4.7828, + "step": 21765 + }, + { + "epoch": 0.12944856789418593, + "grad_norm": 1.6093590259552002, + "learning_rate": 4.796121619254273e-05, + "loss": 4.6621, + "step": 21766 + }, + { + "epoch": 0.12945451517746692, + "grad_norm": 1.4633464813232422, + "learning_rate": 4.796103143226767e-05, + "loss": 4.7979, + "step": 21767 + }, + { + "epoch": 0.12946046246074794, + "grad_norm": 1.332219123840332, + "learning_rate": 4.7960846663977136e-05, + "loss": 4.8313, + "step": 21768 + }, + { + "epoch": 0.12946640974402893, + "grad_norm": 1.2190324068069458, + "learning_rate": 4.796066188767121e-05, + "loss": 4.6559, + "step": 21769 + }, + { + "epoch": 0.12947235702730991, + "grad_norm": 1.4958453178405762, + "learning_rate": 4.796047710334996e-05, + "loss": 4.7633, + "step": 21770 + }, + { + "epoch": 0.12947830431059093, + "grad_norm": 1.2693027257919312, + "learning_rate": 4.796029231101344e-05, + "loss": 4.7291, + "step": 21771 + }, + { + "epoch": 0.12948425159387192, + "grad_norm": 1.2988125085830688, + "learning_rate": 4.7960107510661725e-05, + "loss": 4.7817, + "step": 21772 + }, + { + "epoch": 0.1294901988771529, + "grad_norm": 1.355332374572754, + "learning_rate": 4.7959922702294866e-05, + "loss": 4.6112, + "step": 21773 + }, + { + "epoch": 0.12949614616043392, + "grad_norm": 1.3531986474990845, + "learning_rate": 4.7959737885912934e-05, + "loss": 4.7711, + "step": 21774 + }, + { + "epoch": 0.1295020934437149, + "grad_norm": 1.275888204574585, + "learning_rate": 4.7959553061516004e-05, + "loss": 4.9089, + "step": 21775 + }, + { + "epoch": 0.1295080407269959, + "grad_norm": 1.4016762971878052, + "learning_rate": 4.795936822910413e-05, + "loss": 4.8768, + "step": 21776 + }, + { + "epoch": 0.12951398801027691, + "grad_norm": 1.5274311304092407, + "learning_rate": 4.795918338867737e-05, + "loss": 4.7434, + "step": 21777 + }, + { + "epoch": 0.1295199352935579, + "grad_norm": 1.4976401329040527, + "learning_rate": 4.79589985402358e-05, + "loss": 4.992, + "step": 21778 + }, + { + "epoch": 0.1295258825768389, + "grad_norm": 1.5180116891860962, + "learning_rate": 4.795881368377948e-05, + "loss": 5.1312, + "step": 21779 + }, + { + "epoch": 0.1295318298601199, + "grad_norm": 1.3271901607513428, + "learning_rate": 4.795862881930848e-05, + "loss": 5.1021, + "step": 21780 + }, + { + "epoch": 0.1295377771434009, + "grad_norm": 1.5069388151168823, + "learning_rate": 4.795844394682286e-05, + "loss": 4.8872, + "step": 21781 + }, + { + "epoch": 0.12954372442668188, + "grad_norm": 1.4247567653656006, + "learning_rate": 4.795825906632267e-05, + "loss": 5.0028, + "step": 21782 + }, + { + "epoch": 0.1295496717099629, + "grad_norm": 1.4976978302001953, + "learning_rate": 4.795807417780801e-05, + "loss": 5.0181, + "step": 21783 + }, + { + "epoch": 0.1295556189932439, + "grad_norm": 1.291518211364746, + "learning_rate": 4.7957889281278913e-05, + "loss": 4.8314, + "step": 21784 + }, + { + "epoch": 0.12956156627652488, + "grad_norm": 1.352803349494934, + "learning_rate": 4.7957704376735455e-05, + "loss": 4.916, + "step": 21785 + }, + { + "epoch": 0.1295675135598059, + "grad_norm": 1.3911688327789307, + "learning_rate": 4.7957519464177695e-05, + "loss": 5.1256, + "step": 21786 + }, + { + "epoch": 0.12957346084308688, + "grad_norm": 1.2493035793304443, + "learning_rate": 4.795733454360571e-05, + "loss": 4.8268, + "step": 21787 + }, + { + "epoch": 0.12957940812636787, + "grad_norm": 1.4249591827392578, + "learning_rate": 4.7957149615019547e-05, + "loss": 4.8414, + "step": 21788 + }, + { + "epoch": 0.12958535540964888, + "grad_norm": 1.5388774871826172, + "learning_rate": 4.795696467841929e-05, + "loss": 4.6288, + "step": 21789 + }, + { + "epoch": 0.12959130269292987, + "grad_norm": 1.1780091524124146, + "learning_rate": 4.795677973380499e-05, + "loss": 4.5712, + "step": 21790 + }, + { + "epoch": 0.12959724997621086, + "grad_norm": 1.2415392398834229, + "learning_rate": 4.7956594781176716e-05, + "loss": 4.8536, + "step": 21791 + }, + { + "epoch": 0.12960319725949188, + "grad_norm": 1.2828611135482788, + "learning_rate": 4.795640982053453e-05, + "loss": 5.1549, + "step": 21792 + }, + { + "epoch": 0.12960914454277286, + "grad_norm": 1.5143916606903076, + "learning_rate": 4.79562248518785e-05, + "loss": 5.2302, + "step": 21793 + }, + { + "epoch": 0.12961509182605385, + "grad_norm": 1.3260207176208496, + "learning_rate": 4.795603987520869e-05, + "loss": 4.9272, + "step": 21794 + }, + { + "epoch": 0.12962103910933487, + "grad_norm": 1.2133897542953491, + "learning_rate": 4.795585489052516e-05, + "loss": 4.8229, + "step": 21795 + }, + { + "epoch": 0.12962698639261586, + "grad_norm": 1.5181169509887695, + "learning_rate": 4.795566989782798e-05, + "loss": 4.8024, + "step": 21796 + }, + { + "epoch": 0.12963293367589684, + "grad_norm": 1.3889726400375366, + "learning_rate": 4.795548489711722e-05, + "loss": 4.5859, + "step": 21797 + }, + { + "epoch": 0.12963888095917786, + "grad_norm": 1.543861985206604, + "learning_rate": 4.7955299888392924e-05, + "loss": 4.7782, + "step": 21798 + }, + { + "epoch": 0.12964482824245885, + "grad_norm": 1.4648151397705078, + "learning_rate": 4.795511487165518e-05, + "loss": 4.9949, + "step": 21799 + }, + { + "epoch": 0.12965077552573984, + "grad_norm": 1.2487531900405884, + "learning_rate": 4.795492984690404e-05, + "loss": 5.0329, + "step": 21800 + }, + { + "epoch": 0.12965672280902085, + "grad_norm": 1.503164529800415, + "learning_rate": 4.795474481413957e-05, + "loss": 4.7723, + "step": 21801 + }, + { + "epoch": 0.12966267009230184, + "grad_norm": 1.3406294584274292, + "learning_rate": 4.795455977336184e-05, + "loss": 4.9541, + "step": 21802 + }, + { + "epoch": 0.12966861737558283, + "grad_norm": 1.4314171075820923, + "learning_rate": 4.795437472457091e-05, + "loss": 5.018, + "step": 21803 + }, + { + "epoch": 0.12967456465886384, + "grad_norm": 1.3255850076675415, + "learning_rate": 4.795418966776683e-05, + "loss": 4.7675, + "step": 21804 + }, + { + "epoch": 0.12968051194214483, + "grad_norm": 1.6132442951202393, + "learning_rate": 4.7954004602949697e-05, + "loss": 4.8068, + "step": 21805 + }, + { + "epoch": 0.12968645922542582, + "grad_norm": 1.25650954246521, + "learning_rate": 4.7953819530119555e-05, + "loss": 4.8709, + "step": 21806 + }, + { + "epoch": 0.1296924065087068, + "grad_norm": 1.3686168193817139, + "learning_rate": 4.795363444927646e-05, + "loss": 4.8815, + "step": 21807 + }, + { + "epoch": 0.12969835379198782, + "grad_norm": 1.250143051147461, + "learning_rate": 4.79534493604205e-05, + "loss": 4.9077, + "step": 21808 + }, + { + "epoch": 0.1297043010752688, + "grad_norm": 1.421834111213684, + "learning_rate": 4.795326426355173e-05, + "loss": 4.806, + "step": 21809 + }, + { + "epoch": 0.1297102483585498, + "grad_norm": 1.3038170337677002, + "learning_rate": 4.795307915867021e-05, + "loss": 5.0142, + "step": 21810 + }, + { + "epoch": 0.12971619564183082, + "grad_norm": 1.390637993812561, + "learning_rate": 4.7952894045776e-05, + "loss": 4.8802, + "step": 21811 + }, + { + "epoch": 0.1297221429251118, + "grad_norm": 1.3310891389846802, + "learning_rate": 4.7952708924869184e-05, + "loss": 4.7995, + "step": 21812 + }, + { + "epoch": 0.1297280902083928, + "grad_norm": 1.243156909942627, + "learning_rate": 4.79525237959498e-05, + "loss": 4.6147, + "step": 21813 + }, + { + "epoch": 0.1297340374916738, + "grad_norm": 1.522707223892212, + "learning_rate": 4.7952338659017934e-05, + "loss": 4.6666, + "step": 21814 + }, + { + "epoch": 0.1297399847749548, + "grad_norm": 1.3331211805343628, + "learning_rate": 4.795215351407365e-05, + "loss": 4.7236, + "step": 21815 + }, + { + "epoch": 0.12974593205823579, + "grad_norm": 1.3704382181167603, + "learning_rate": 4.7951968361116996e-05, + "loss": 5.299, + "step": 21816 + }, + { + "epoch": 0.1297518793415168, + "grad_norm": 1.4870846271514893, + "learning_rate": 4.7951783200148055e-05, + "loss": 5.2623, + "step": 21817 + }, + { + "epoch": 0.1297578266247978, + "grad_norm": 1.4282408952713013, + "learning_rate": 4.795159803116688e-05, + "loss": 5.075, + "step": 21818 + }, + { + "epoch": 0.12976377390807878, + "grad_norm": 1.408409595489502, + "learning_rate": 4.795141285417354e-05, + "loss": 4.7274, + "step": 21819 + }, + { + "epoch": 0.1297697211913598, + "grad_norm": 1.4432475566864014, + "learning_rate": 4.79512276691681e-05, + "loss": 4.8196, + "step": 21820 + }, + { + "epoch": 0.12977566847464078, + "grad_norm": 1.6136623620986938, + "learning_rate": 4.7951042476150624e-05, + "loss": 4.7634, + "step": 21821 + }, + { + "epoch": 0.12978161575792177, + "grad_norm": 1.13461434841156, + "learning_rate": 4.795085727512117e-05, + "loss": 4.9421, + "step": 21822 + }, + { + "epoch": 0.12978756304120279, + "grad_norm": 1.2107611894607544, + "learning_rate": 4.795067206607981e-05, + "loss": 5.1572, + "step": 21823 + }, + { + "epoch": 0.12979351032448377, + "grad_norm": 1.8843787908554077, + "learning_rate": 4.795048684902661e-05, + "loss": 5.4081, + "step": 21824 + }, + { + "epoch": 0.12979945760776476, + "grad_norm": 1.192597508430481, + "learning_rate": 4.7950301623961633e-05, + "loss": 4.9609, + "step": 21825 + }, + { + "epoch": 0.12980540489104578, + "grad_norm": 1.4349040985107422, + "learning_rate": 4.795011639088495e-05, + "loss": 4.72, + "step": 21826 + }, + { + "epoch": 0.12981135217432677, + "grad_norm": 1.8054217100143433, + "learning_rate": 4.79499311497966e-05, + "loss": 5.5003, + "step": 21827 + }, + { + "epoch": 0.12981729945760775, + "grad_norm": 1.521070122718811, + "learning_rate": 4.794974590069669e-05, + "loss": 5.5325, + "step": 21828 + }, + { + "epoch": 0.12982324674088877, + "grad_norm": 1.936892032623291, + "learning_rate": 4.794956064358524e-05, + "loss": 4.6644, + "step": 21829 + }, + { + "epoch": 0.12982919402416976, + "grad_norm": 1.9401378631591797, + "learning_rate": 4.794937537846234e-05, + "loss": 4.7442, + "step": 21830 + }, + { + "epoch": 0.12983514130745075, + "grad_norm": 1.3924851417541504, + "learning_rate": 4.794919010532806e-05, + "loss": 4.9434, + "step": 21831 + }, + { + "epoch": 0.12984108859073176, + "grad_norm": 1.3180463314056396, + "learning_rate": 4.794900482418244e-05, + "loss": 4.9098, + "step": 21832 + }, + { + "epoch": 0.12984703587401275, + "grad_norm": 1.3872355222702026, + "learning_rate": 4.7948819535025565e-05, + "loss": 4.8212, + "step": 21833 + }, + { + "epoch": 0.12985298315729374, + "grad_norm": 1.2868075370788574, + "learning_rate": 4.79486342378575e-05, + "loss": 4.7609, + "step": 21834 + }, + { + "epoch": 0.12985893044057475, + "grad_norm": 1.4286006689071655, + "learning_rate": 4.79484489326783e-05, + "loss": 4.828, + "step": 21835 + }, + { + "epoch": 0.12986487772385574, + "grad_norm": 1.3485580682754517, + "learning_rate": 4.794826361948804e-05, + "loss": 4.7596, + "step": 21836 + }, + { + "epoch": 0.12987082500713673, + "grad_norm": 1.469319224357605, + "learning_rate": 4.794807829828677e-05, + "loss": 4.8431, + "step": 21837 + }, + { + "epoch": 0.12987677229041775, + "grad_norm": 1.4626957178115845, + "learning_rate": 4.794789296907457e-05, + "loss": 4.7884, + "step": 21838 + }, + { + "epoch": 0.12988271957369873, + "grad_norm": 1.2266536951065063, + "learning_rate": 4.794770763185149e-05, + "loss": 4.8359, + "step": 21839 + }, + { + "epoch": 0.12988866685697972, + "grad_norm": 1.2295827865600586, + "learning_rate": 4.794752228661761e-05, + "loss": 4.6327, + "step": 21840 + }, + { + "epoch": 0.12989461414026074, + "grad_norm": 1.4784702062606812, + "learning_rate": 4.794733693337298e-05, + "loss": 4.8363, + "step": 21841 + }, + { + "epoch": 0.12990056142354173, + "grad_norm": 1.6527009010314941, + "learning_rate": 4.794715157211767e-05, + "loss": 5.0696, + "step": 21842 + }, + { + "epoch": 0.12990650870682272, + "grad_norm": 1.7082421779632568, + "learning_rate": 4.7946966202851754e-05, + "loss": 4.8249, + "step": 21843 + }, + { + "epoch": 0.12991245599010373, + "grad_norm": 1.5493143796920776, + "learning_rate": 4.794678082557529e-05, + "loss": 4.9604, + "step": 21844 + }, + { + "epoch": 0.12991840327338472, + "grad_norm": 1.631940245628357, + "learning_rate": 4.7946595440288335e-05, + "loss": 4.6672, + "step": 21845 + }, + { + "epoch": 0.1299243505566657, + "grad_norm": 1.3021342754364014, + "learning_rate": 4.794641004699096e-05, + "loss": 4.821, + "step": 21846 + }, + { + "epoch": 0.12993029783994672, + "grad_norm": 1.331272006034851, + "learning_rate": 4.794622464568324e-05, + "loss": 5.1398, + "step": 21847 + }, + { + "epoch": 0.1299362451232277, + "grad_norm": 1.5635039806365967, + "learning_rate": 4.794603923636522e-05, + "loss": 5.0405, + "step": 21848 + }, + { + "epoch": 0.1299421924065087, + "grad_norm": 1.412961721420288, + "learning_rate": 4.794585381903698e-05, + "loss": 5.1334, + "step": 21849 + }, + { + "epoch": 0.12994813968978972, + "grad_norm": 1.0943198204040527, + "learning_rate": 4.794566839369857e-05, + "loss": 5.1978, + "step": 21850 + }, + { + "epoch": 0.1299540869730707, + "grad_norm": 1.6458427906036377, + "learning_rate": 4.794548296035007e-05, + "loss": 4.6475, + "step": 21851 + }, + { + "epoch": 0.1299600342563517, + "grad_norm": 1.37641179561615, + "learning_rate": 4.794529751899155e-05, + "loss": 5.0094, + "step": 21852 + }, + { + "epoch": 0.1299659815396327, + "grad_norm": 1.6493875980377197, + "learning_rate": 4.7945112069623054e-05, + "loss": 4.9748, + "step": 21853 + }, + { + "epoch": 0.1299719288229137, + "grad_norm": 1.4612071514129639, + "learning_rate": 4.794492661224466e-05, + "loss": 5.1217, + "step": 21854 + }, + { + "epoch": 0.12997787610619468, + "grad_norm": 1.4929149150848389, + "learning_rate": 4.7944741146856425e-05, + "loss": 4.916, + "step": 21855 + }, + { + "epoch": 0.1299838233894757, + "grad_norm": 1.5030015707015991, + "learning_rate": 4.794455567345842e-05, + "loss": 5.1206, + "step": 21856 + }, + { + "epoch": 0.1299897706727567, + "grad_norm": 1.3132811784744263, + "learning_rate": 4.79443701920507e-05, + "loss": 5.1996, + "step": 21857 + }, + { + "epoch": 0.12999571795603768, + "grad_norm": 1.3515914678573608, + "learning_rate": 4.794418470263335e-05, + "loss": 4.8565, + "step": 21858 + }, + { + "epoch": 0.1300016652393187, + "grad_norm": 1.3780977725982666, + "learning_rate": 4.7943999205206414e-05, + "loss": 4.9207, + "step": 21859 + }, + { + "epoch": 0.13000761252259968, + "grad_norm": 1.3044095039367676, + "learning_rate": 4.794381369976997e-05, + "loss": 5.0898, + "step": 21860 + }, + { + "epoch": 0.13001355980588067, + "grad_norm": 1.3406704664230347, + "learning_rate": 4.7943628186324076e-05, + "loss": 4.942, + "step": 21861 + }, + { + "epoch": 0.13001950708916168, + "grad_norm": 1.2654430866241455, + "learning_rate": 4.7943442664868795e-05, + "loss": 5.2096, + "step": 21862 + }, + { + "epoch": 0.13002545437244267, + "grad_norm": 1.313717007637024, + "learning_rate": 4.79432571354042e-05, + "loss": 4.9946, + "step": 21863 + }, + { + "epoch": 0.13003140165572366, + "grad_norm": 1.0787066221237183, + "learning_rate": 4.794307159793035e-05, + "loss": 4.9556, + "step": 21864 + }, + { + "epoch": 0.13003734893900465, + "grad_norm": 1.3731575012207031, + "learning_rate": 4.794288605244731e-05, + "loss": 4.904, + "step": 21865 + }, + { + "epoch": 0.13004329622228566, + "grad_norm": 1.4843237400054932, + "learning_rate": 4.794270049895514e-05, + "loss": 5.1451, + "step": 21866 + }, + { + "epoch": 0.13004924350556665, + "grad_norm": 1.3293545246124268, + "learning_rate": 4.794251493745392e-05, + "loss": 5.1794, + "step": 21867 + }, + { + "epoch": 0.13005519078884764, + "grad_norm": 1.6757280826568604, + "learning_rate": 4.79423293679437e-05, + "loss": 4.9797, + "step": 21868 + }, + { + "epoch": 0.13006113807212866, + "grad_norm": 1.7158734798431396, + "learning_rate": 4.794214379042456e-05, + "loss": 4.7833, + "step": 21869 + }, + { + "epoch": 0.13006708535540965, + "grad_norm": 2.164602756500244, + "learning_rate": 4.794195820489654e-05, + "loss": 4.4662, + "step": 21870 + }, + { + "epoch": 0.13007303263869063, + "grad_norm": 1.5726985931396484, + "learning_rate": 4.794177261135972e-05, + "loss": 5.3064, + "step": 21871 + }, + { + "epoch": 0.13007897992197165, + "grad_norm": 1.3667716979980469, + "learning_rate": 4.794158700981417e-05, + "loss": 5.0881, + "step": 21872 + }, + { + "epoch": 0.13008492720525264, + "grad_norm": 1.5155465602874756, + "learning_rate": 4.794140140025994e-05, + "loss": 4.95, + "step": 21873 + }, + { + "epoch": 0.13009087448853363, + "grad_norm": 1.4024773836135864, + "learning_rate": 4.794121578269712e-05, + "loss": 5.1932, + "step": 21874 + }, + { + "epoch": 0.13009682177181464, + "grad_norm": 1.3104946613311768, + "learning_rate": 4.7941030157125746e-05, + "loss": 5.1143, + "step": 21875 + }, + { + "epoch": 0.13010276905509563, + "grad_norm": 1.3269513845443726, + "learning_rate": 4.79408445235459e-05, + "loss": 5.1411, + "step": 21876 + }, + { + "epoch": 0.13010871633837662, + "grad_norm": 1.3147937059402466, + "learning_rate": 4.7940658881957645e-05, + "loss": 5.0444, + "step": 21877 + }, + { + "epoch": 0.13011466362165763, + "grad_norm": 1.125897765159607, + "learning_rate": 4.794047323236104e-05, + "loss": 5.0522, + "step": 21878 + }, + { + "epoch": 0.13012061090493862, + "grad_norm": 1.331945776939392, + "learning_rate": 4.794028757475615e-05, + "loss": 5.1433, + "step": 21879 + }, + { + "epoch": 0.1301265581882196, + "grad_norm": 1.206411361694336, + "learning_rate": 4.794010190914304e-05, + "loss": 4.7293, + "step": 21880 + }, + { + "epoch": 0.13013250547150063, + "grad_norm": 1.6212915182113647, + "learning_rate": 4.793991623552179e-05, + "loss": 4.5976, + "step": 21881 + }, + { + "epoch": 0.13013845275478161, + "grad_norm": 1.4009672403335571, + "learning_rate": 4.793973055389244e-05, + "loss": 4.8846, + "step": 21882 + }, + { + "epoch": 0.1301444000380626, + "grad_norm": 1.5049399137496948, + "learning_rate": 4.793954486425507e-05, + "loss": 4.7785, + "step": 21883 + }, + { + "epoch": 0.13015034732134362, + "grad_norm": 1.496751070022583, + "learning_rate": 4.7939359166609746e-05, + "loss": 4.5957, + "step": 21884 + }, + { + "epoch": 0.1301562946046246, + "grad_norm": 1.7572035789489746, + "learning_rate": 4.7939173460956525e-05, + "loss": 4.8929, + "step": 21885 + }, + { + "epoch": 0.1301622418879056, + "grad_norm": 1.593353271484375, + "learning_rate": 4.793898774729548e-05, + "loss": 5.6704, + "step": 21886 + }, + { + "epoch": 0.1301681891711866, + "grad_norm": 1.4550076723098755, + "learning_rate": 4.7938802025626665e-05, + "loss": 5.6588, + "step": 21887 + }, + { + "epoch": 0.1301741364544676, + "grad_norm": 1.6618671417236328, + "learning_rate": 4.793861629595015e-05, + "loss": 5.6571, + "step": 21888 + }, + { + "epoch": 0.1301800837377486, + "grad_norm": 1.4493645429611206, + "learning_rate": 4.793843055826601e-05, + "loss": 5.4406, + "step": 21889 + }, + { + "epoch": 0.1301860310210296, + "grad_norm": 1.5164732933044434, + "learning_rate": 4.793824481257429e-05, + "loss": 5.4872, + "step": 21890 + }, + { + "epoch": 0.1301919783043106, + "grad_norm": 1.5956424474716187, + "learning_rate": 4.793805905887508e-05, + "loss": 4.7702, + "step": 21891 + }, + { + "epoch": 0.13019792558759158, + "grad_norm": 1.850864291191101, + "learning_rate": 4.7937873297168425e-05, + "loss": 4.6842, + "step": 21892 + }, + { + "epoch": 0.1302038728708726, + "grad_norm": 1.637451171875, + "learning_rate": 4.793768752745439e-05, + "loss": 5.2488, + "step": 21893 + }, + { + "epoch": 0.13020982015415358, + "grad_norm": 1.5980913639068604, + "learning_rate": 4.793750174973305e-05, + "loss": 5.4026, + "step": 21894 + }, + { + "epoch": 0.13021576743743457, + "grad_norm": 1.7420471906661987, + "learning_rate": 4.793731596400446e-05, + "loss": 5.2409, + "step": 21895 + }, + { + "epoch": 0.1302217147207156, + "grad_norm": 2.749483346939087, + "learning_rate": 4.7937130170268694e-05, + "loss": 5.3401, + "step": 21896 + }, + { + "epoch": 0.13022766200399657, + "grad_norm": 2.610828399658203, + "learning_rate": 4.793694436852581e-05, + "loss": 5.0967, + "step": 21897 + }, + { + "epoch": 0.13023360928727756, + "grad_norm": 2.5725367069244385, + "learning_rate": 4.793675855877588e-05, + "loss": 5.1184, + "step": 21898 + }, + { + "epoch": 0.13023955657055858, + "grad_norm": 2.438526153564453, + "learning_rate": 4.793657274101896e-05, + "loss": 5.1315, + "step": 21899 + }, + { + "epoch": 0.13024550385383957, + "grad_norm": 2.2574191093444824, + "learning_rate": 4.793638691525513e-05, + "loss": 4.9999, + "step": 21900 + }, + { + "epoch": 0.13025145113712056, + "grad_norm": 1.9024723768234253, + "learning_rate": 4.7936201081484434e-05, + "loss": 5.1766, + "step": 21901 + }, + { + "epoch": 0.13025739842040157, + "grad_norm": 2.2040951251983643, + "learning_rate": 4.793601523970695e-05, + "loss": 4.9261, + "step": 21902 + }, + { + "epoch": 0.13026334570368256, + "grad_norm": 2.333158016204834, + "learning_rate": 4.7935829389922736e-05, + "loss": 4.9423, + "step": 21903 + }, + { + "epoch": 0.13026929298696355, + "grad_norm": 2.2712838649749756, + "learning_rate": 4.793564353213187e-05, + "loss": 4.7511, + "step": 21904 + }, + { + "epoch": 0.13027524027024456, + "grad_norm": 2.119046211242676, + "learning_rate": 4.79354576663344e-05, + "loss": 4.7284, + "step": 21905 + }, + { + "epoch": 0.13028118755352555, + "grad_norm": 2.3056483268737793, + "learning_rate": 4.79352717925304e-05, + "loss": 4.8627, + "step": 21906 + }, + { + "epoch": 0.13028713483680654, + "grad_norm": 2.2767837047576904, + "learning_rate": 4.793508591071993e-05, + "loss": 4.7924, + "step": 21907 + }, + { + "epoch": 0.13029308212008756, + "grad_norm": 2.138441324234009, + "learning_rate": 4.793490002090306e-05, + "loss": 4.747, + "step": 21908 + }, + { + "epoch": 0.13029902940336854, + "grad_norm": 1.9595372676849365, + "learning_rate": 4.793471412307986e-05, + "loss": 4.6861, + "step": 21909 + }, + { + "epoch": 0.13030497668664953, + "grad_norm": 2.207357883453369, + "learning_rate": 4.793452821725039e-05, + "loss": 4.4727, + "step": 21910 + }, + { + "epoch": 0.13031092396993055, + "grad_norm": 1.9506596326828003, + "learning_rate": 4.7934342303414704e-05, + "loss": 4.4445, + "step": 21911 + }, + { + "epoch": 0.13031687125321154, + "grad_norm": 2.0946574211120605, + "learning_rate": 4.793415638157288e-05, + "loss": 4.4556, + "step": 21912 + }, + { + "epoch": 0.13032281853649252, + "grad_norm": 2.7089650630950928, + "learning_rate": 4.793397045172497e-05, + "loss": 4.3106, + "step": 21913 + }, + { + "epoch": 0.13032876581977354, + "grad_norm": 2.6837174892425537, + "learning_rate": 4.793378451387106e-05, + "loss": 4.4133, + "step": 21914 + }, + { + "epoch": 0.13033471310305453, + "grad_norm": 2.28702712059021, + "learning_rate": 4.7933598568011207e-05, + "loss": 4.4326, + "step": 21915 + }, + { + "epoch": 0.13034066038633552, + "grad_norm": 2.172691583633423, + "learning_rate": 4.793341261414546e-05, + "loss": 4.6047, + "step": 21916 + }, + { + "epoch": 0.13034660766961653, + "grad_norm": 2.202906608581543, + "learning_rate": 4.79332266522739e-05, + "loss": 4.6857, + "step": 21917 + }, + { + "epoch": 0.13035255495289752, + "grad_norm": 1.7617685794830322, + "learning_rate": 4.793304068239658e-05, + "loss": 4.4888, + "step": 21918 + }, + { + "epoch": 0.1303585022361785, + "grad_norm": 2.2866454124450684, + "learning_rate": 4.7932854704513586e-05, + "loss": 4.5558, + "step": 21919 + }, + { + "epoch": 0.13036444951945952, + "grad_norm": 2.0338642597198486, + "learning_rate": 4.793266871862496e-05, + "loss": 5.2769, + "step": 21920 + }, + { + "epoch": 0.1303703968027405, + "grad_norm": 2.0302703380584717, + "learning_rate": 4.793248272473078e-05, + "loss": 4.5903, + "step": 21921 + }, + { + "epoch": 0.1303763440860215, + "grad_norm": 2.1618101596832275, + "learning_rate": 4.793229672283111e-05, + "loss": 4.9971, + "step": 21922 + }, + { + "epoch": 0.1303822913693025, + "grad_norm": 2.0446085929870605, + "learning_rate": 4.7932110712926004e-05, + "loss": 5.286, + "step": 21923 + }, + { + "epoch": 0.1303882386525835, + "grad_norm": 1.544705867767334, + "learning_rate": 4.793192469501554e-05, + "loss": 5.5509, + "step": 21924 + }, + { + "epoch": 0.1303941859358645, + "grad_norm": 1.5994058847427368, + "learning_rate": 4.7931738669099776e-05, + "loss": 5.5891, + "step": 21925 + }, + { + "epoch": 0.13040013321914548, + "grad_norm": 1.5866730213165283, + "learning_rate": 4.793155263517878e-05, + "loss": 5.3539, + "step": 21926 + }, + { + "epoch": 0.1304060805024265, + "grad_norm": 1.5843631029129028, + "learning_rate": 4.793136659325262e-05, + "loss": 5.5528, + "step": 21927 + }, + { + "epoch": 0.13041202778570748, + "grad_norm": 1.8037461042404175, + "learning_rate": 4.7931180543321354e-05, + "loss": 4.9484, + "step": 21928 + }, + { + "epoch": 0.13041797506898847, + "grad_norm": 1.8021430969238281, + "learning_rate": 4.793099448538505e-05, + "loss": 5.2239, + "step": 21929 + }, + { + "epoch": 0.1304239223522695, + "grad_norm": 1.9063239097595215, + "learning_rate": 4.793080841944377e-05, + "loss": 5.0627, + "step": 21930 + }, + { + "epoch": 0.13042986963555048, + "grad_norm": 1.8546555042266846, + "learning_rate": 4.7930622345497575e-05, + "loss": 4.8691, + "step": 21931 + }, + { + "epoch": 0.13043581691883147, + "grad_norm": 1.7901126146316528, + "learning_rate": 4.793043626354655e-05, + "loss": 4.8975, + "step": 21932 + }, + { + "epoch": 0.13044176420211248, + "grad_norm": 1.7083008289337158, + "learning_rate": 4.793025017359074e-05, + "loss": 4.8176, + "step": 21933 + }, + { + "epoch": 0.13044771148539347, + "grad_norm": 1.7584604024887085, + "learning_rate": 4.793006407563022e-05, + "loss": 5.2551, + "step": 21934 + }, + { + "epoch": 0.13045365876867446, + "grad_norm": 1.6731703281402588, + "learning_rate": 4.792987796966505e-05, + "loss": 5.0456, + "step": 21935 + }, + { + "epoch": 0.13045960605195547, + "grad_norm": 1.6340082883834839, + "learning_rate": 4.7929691855695294e-05, + "loss": 5.5061, + "step": 21936 + }, + { + "epoch": 0.13046555333523646, + "grad_norm": 1.7354822158813477, + "learning_rate": 4.792950573372102e-05, + "loss": 5.7164, + "step": 21937 + }, + { + "epoch": 0.13047150061851745, + "grad_norm": 1.6100409030914307, + "learning_rate": 4.79293196037423e-05, + "loss": 5.2427, + "step": 21938 + }, + { + "epoch": 0.13047744790179847, + "grad_norm": 2.603156328201294, + "learning_rate": 4.7929133465759184e-05, + "loss": 4.1146, + "step": 21939 + }, + { + "epoch": 0.13048339518507945, + "grad_norm": 2.518183946609497, + "learning_rate": 4.7928947319771746e-05, + "loss": 4.2918, + "step": 21940 + }, + { + "epoch": 0.13048934246836044, + "grad_norm": 1.7518165111541748, + "learning_rate": 4.792876116578004e-05, + "loss": 5.9257, + "step": 21941 + }, + { + "epoch": 0.13049528975164146, + "grad_norm": 1.8118661642074585, + "learning_rate": 4.792857500378416e-05, + "loss": 5.8985, + "step": 21942 + }, + { + "epoch": 0.13050123703492245, + "grad_norm": 1.5877163410186768, + "learning_rate": 4.792838883378414e-05, + "loss": 6.0572, + "step": 21943 + }, + { + "epoch": 0.13050718431820343, + "grad_norm": 1.313362956047058, + "learning_rate": 4.7928202655780055e-05, + "loss": 5.7739, + "step": 21944 + }, + { + "epoch": 0.13051313160148445, + "grad_norm": 1.5902273654937744, + "learning_rate": 4.792801646977198e-05, + "loss": 6.021, + "step": 21945 + }, + { + "epoch": 0.13051907888476544, + "grad_norm": 1.8784877061843872, + "learning_rate": 4.792783027575996e-05, + "loss": 5.0933, + "step": 21946 + }, + { + "epoch": 0.13052502616804643, + "grad_norm": 1.7743972539901733, + "learning_rate": 4.7927644073744076e-05, + "loss": 5.1168, + "step": 21947 + }, + { + "epoch": 0.13053097345132744, + "grad_norm": 2.0093095302581787, + "learning_rate": 4.792745786372439e-05, + "loss": 5.7441, + "step": 21948 + }, + { + "epoch": 0.13053692073460843, + "grad_norm": 2.0483853816986084, + "learning_rate": 4.7927271645700966e-05, + "loss": 5.4851, + "step": 21949 + }, + { + "epoch": 0.13054286801788942, + "grad_norm": 1.7858600616455078, + "learning_rate": 4.792708541967386e-05, + "loss": 5.4308, + "step": 21950 + }, + { + "epoch": 0.13054881530117043, + "grad_norm": 1.578202247619629, + "learning_rate": 4.7926899185643155e-05, + "loss": 5.4409, + "step": 21951 + }, + { + "epoch": 0.13055476258445142, + "grad_norm": 1.5763752460479736, + "learning_rate": 4.7926712943608895e-05, + "loss": 5.438, + "step": 21952 + }, + { + "epoch": 0.1305607098677324, + "grad_norm": 1.4117366075515747, + "learning_rate": 4.792652669357117e-05, + "loss": 5.3256, + "step": 21953 + }, + { + "epoch": 0.13056665715101343, + "grad_norm": 1.8186451196670532, + "learning_rate": 4.792634043553003e-05, + "loss": 5.4336, + "step": 21954 + }, + { + "epoch": 0.13057260443429441, + "grad_norm": 1.8576366901397705, + "learning_rate": 4.7926154169485536e-05, + "loss": 5.5133, + "step": 21955 + }, + { + "epoch": 0.1305785517175754, + "grad_norm": 1.81550931930542, + "learning_rate": 4.7925967895437754e-05, + "loss": 5.3673, + "step": 21956 + }, + { + "epoch": 0.13058449900085642, + "grad_norm": 1.5518393516540527, + "learning_rate": 4.7925781613386765e-05, + "loss": 5.3788, + "step": 21957 + }, + { + "epoch": 0.1305904462841374, + "grad_norm": 1.726492166519165, + "learning_rate": 4.7925595323332615e-05, + "loss": 5.4759, + "step": 21958 + }, + { + "epoch": 0.1305963935674184, + "grad_norm": 1.6105836629867554, + "learning_rate": 4.792540902527538e-05, + "loss": 5.3339, + "step": 21959 + }, + { + "epoch": 0.1306023408506994, + "grad_norm": 1.6900887489318848, + "learning_rate": 4.792522271921512e-05, + "loss": 5.457, + "step": 21960 + }, + { + "epoch": 0.1306082881339804, + "grad_norm": 1.6158493757247925, + "learning_rate": 4.79250364051519e-05, + "loss": 5.4049, + "step": 21961 + }, + { + "epoch": 0.1306142354172614, + "grad_norm": 1.5123624801635742, + "learning_rate": 4.792485008308579e-05, + "loss": 5.3611, + "step": 21962 + }, + { + "epoch": 0.1306201827005424, + "grad_norm": 1.4421589374542236, + "learning_rate": 4.792466375301685e-05, + "loss": 5.3816, + "step": 21963 + }, + { + "epoch": 0.1306261299838234, + "grad_norm": 1.6167370080947876, + "learning_rate": 4.792447741494514e-05, + "loss": 5.3484, + "step": 21964 + }, + { + "epoch": 0.13063207726710438, + "grad_norm": 1.5235882997512817, + "learning_rate": 4.7924291068870745e-05, + "loss": 5.4756, + "step": 21965 + }, + { + "epoch": 0.1306380245503854, + "grad_norm": 1.5585761070251465, + "learning_rate": 4.7924104714793705e-05, + "loss": 4.9743, + "step": 21966 + }, + { + "epoch": 0.13064397183366638, + "grad_norm": 1.6565943956375122, + "learning_rate": 4.79239183527141e-05, + "loss": 4.9801, + "step": 21967 + }, + { + "epoch": 0.13064991911694737, + "grad_norm": 1.449012041091919, + "learning_rate": 4.7923731982631993e-05, + "loss": 5.2166, + "step": 21968 + }, + { + "epoch": 0.1306558664002284, + "grad_norm": 1.7511426210403442, + "learning_rate": 4.792354560454745e-05, + "loss": 4.7892, + "step": 21969 + }, + { + "epoch": 0.13066181368350938, + "grad_norm": 1.8433175086975098, + "learning_rate": 4.7923359218460535e-05, + "loss": 5.1481, + "step": 21970 + }, + { + "epoch": 0.13066776096679036, + "grad_norm": 1.4407368898391724, + "learning_rate": 4.792317282437131e-05, + "loss": 5.3282, + "step": 21971 + }, + { + "epoch": 0.13067370825007138, + "grad_norm": 1.7756870985031128, + "learning_rate": 4.7922986422279836e-05, + "loss": 4.9934, + "step": 21972 + }, + { + "epoch": 0.13067965553335237, + "grad_norm": 1.6745517253875732, + "learning_rate": 4.7922800012186197e-05, + "loss": 4.9524, + "step": 21973 + }, + { + "epoch": 0.13068560281663336, + "grad_norm": 1.6869374513626099, + "learning_rate": 4.792261359409044e-05, + "loss": 5.0163, + "step": 21974 + }, + { + "epoch": 0.13069155009991437, + "grad_norm": 1.810007929801941, + "learning_rate": 4.7922427167992635e-05, + "loss": 5.7507, + "step": 21975 + }, + { + "epoch": 0.13069749738319536, + "grad_norm": 1.438236951828003, + "learning_rate": 4.792224073389284e-05, + "loss": 5.6271, + "step": 21976 + }, + { + "epoch": 0.13070344466647635, + "grad_norm": 1.7424002885818481, + "learning_rate": 4.7922054291791135e-05, + "loss": 5.4101, + "step": 21977 + }, + { + "epoch": 0.13070939194975736, + "grad_norm": 1.6832276582717896, + "learning_rate": 4.7921867841687576e-05, + "loss": 5.5323, + "step": 21978 + }, + { + "epoch": 0.13071533923303835, + "grad_norm": 1.4542639255523682, + "learning_rate": 4.792168138358223e-05, + "loss": 5.6003, + "step": 21979 + }, + { + "epoch": 0.13072128651631934, + "grad_norm": 1.5791352987289429, + "learning_rate": 4.7921494917475164e-05, + "loss": 4.448, + "step": 21980 + }, + { + "epoch": 0.13072723379960036, + "grad_norm": 1.7216298580169678, + "learning_rate": 4.792130844336644e-05, + "loss": 5.2205, + "step": 21981 + }, + { + "epoch": 0.13073318108288134, + "grad_norm": 1.7315418720245361, + "learning_rate": 4.792112196125612e-05, + "loss": 5.617, + "step": 21982 + }, + { + "epoch": 0.13073912836616233, + "grad_norm": 1.6149991750717163, + "learning_rate": 4.792093547114428e-05, + "loss": 5.1341, + "step": 21983 + }, + { + "epoch": 0.13074507564944332, + "grad_norm": 1.8531928062438965, + "learning_rate": 4.792074897303097e-05, + "loss": 5.384, + "step": 21984 + }, + { + "epoch": 0.13075102293272434, + "grad_norm": 1.869070053100586, + "learning_rate": 4.792056246691627e-05, + "loss": 5.428, + "step": 21985 + }, + { + "epoch": 0.13075697021600532, + "grad_norm": 1.715179204940796, + "learning_rate": 4.792037595280024e-05, + "loss": 5.5358, + "step": 21986 + }, + { + "epoch": 0.1307629174992863, + "grad_norm": 2.155991315841675, + "learning_rate": 4.792018943068294e-05, + "loss": 4.9676, + "step": 21987 + }, + { + "epoch": 0.13076886478256733, + "grad_norm": 1.9201817512512207, + "learning_rate": 4.7920002900564434e-05, + "loss": 5.1021, + "step": 21988 + }, + { + "epoch": 0.13077481206584832, + "grad_norm": 1.8021970987319946, + "learning_rate": 4.79198163624448e-05, + "loss": 5.233, + "step": 21989 + }, + { + "epoch": 0.1307807593491293, + "grad_norm": 2.034694194793701, + "learning_rate": 4.7919629816324093e-05, + "loss": 5.7133, + "step": 21990 + }, + { + "epoch": 0.13078670663241032, + "grad_norm": 1.7929306030273438, + "learning_rate": 4.791944326220238e-05, + "loss": 5.1922, + "step": 21991 + }, + { + "epoch": 0.1307926539156913, + "grad_norm": 1.6092936992645264, + "learning_rate": 4.791925670007972e-05, + "loss": 4.8169, + "step": 21992 + }, + { + "epoch": 0.1307986011989723, + "grad_norm": 1.6994092464447021, + "learning_rate": 4.791907012995619e-05, + "loss": 4.7869, + "step": 21993 + }, + { + "epoch": 0.1308045484822533, + "grad_norm": 1.7823549509048462, + "learning_rate": 4.791888355183185e-05, + "loss": 5.1608, + "step": 21994 + }, + { + "epoch": 0.1308104957655343, + "grad_norm": 1.9024605751037598, + "learning_rate": 4.7918696965706764e-05, + "loss": 4.016, + "step": 21995 + }, + { + "epoch": 0.1308164430488153, + "grad_norm": 1.8696129322052002, + "learning_rate": 4.7918510371580993e-05, + "loss": 4.3457, + "step": 21996 + }, + { + "epoch": 0.1308223903320963, + "grad_norm": 1.8359664678573608, + "learning_rate": 4.791832376945461e-05, + "loss": 4.1822, + "step": 21997 + }, + { + "epoch": 0.1308283376153773, + "grad_norm": 1.867409586906433, + "learning_rate": 4.791813715932768e-05, + "loss": 4.0156, + "step": 21998 + }, + { + "epoch": 0.13083428489865828, + "grad_norm": 1.729768991470337, + "learning_rate": 4.7917950541200264e-05, + "loss": 5.4221, + "step": 21999 + }, + { + "epoch": 0.1308402321819393, + "grad_norm": 1.8171114921569824, + "learning_rate": 4.791776391507242e-05, + "loss": 4.1685, + "step": 22000 + }, + { + "epoch": 0.13084617946522029, + "grad_norm": 1.8626638650894165, + "learning_rate": 4.7917577280944234e-05, + "loss": 4.1981, + "step": 22001 + }, + { + "epoch": 0.13085212674850127, + "grad_norm": 1.9804152250289917, + "learning_rate": 4.791739063881575e-05, + "loss": 4.1258, + "step": 22002 + }, + { + "epoch": 0.1308580740317823, + "grad_norm": 2.6114773750305176, + "learning_rate": 4.791720398868704e-05, + "loss": 4.0207, + "step": 22003 + }, + { + "epoch": 0.13086402131506328, + "grad_norm": 2.1169519424438477, + "learning_rate": 4.791701733055818e-05, + "loss": 4.0134, + "step": 22004 + }, + { + "epoch": 0.13086996859834427, + "grad_norm": 2.318971872329712, + "learning_rate": 4.791683066442922e-05, + "loss": 4.1341, + "step": 22005 + }, + { + "epoch": 0.13087591588162528, + "grad_norm": 2.1771652698516846, + "learning_rate": 4.7916643990300234e-05, + "loss": 4.5816, + "step": 22006 + }, + { + "epoch": 0.13088186316490627, + "grad_norm": 2.327596426010132, + "learning_rate": 4.791645730817128e-05, + "loss": 5.3562, + "step": 22007 + }, + { + "epoch": 0.13088781044818726, + "grad_norm": 2.3558785915374756, + "learning_rate": 4.7916270618042434e-05, + "loss": 4.055, + "step": 22008 + }, + { + "epoch": 0.13089375773146827, + "grad_norm": 2.07840633392334, + "learning_rate": 4.791608391991374e-05, + "loss": 4.4366, + "step": 22009 + }, + { + "epoch": 0.13089970501474926, + "grad_norm": 2.4755849838256836, + "learning_rate": 4.79158972137853e-05, + "loss": 5.5616, + "step": 22010 + }, + { + "epoch": 0.13090565229803025, + "grad_norm": 1.8745293617248535, + "learning_rate": 4.791571049965714e-05, + "loss": 5.1908, + "step": 22011 + }, + { + "epoch": 0.13091159958131127, + "grad_norm": 1.8463020324707031, + "learning_rate": 4.791552377752935e-05, + "loss": 5.64, + "step": 22012 + }, + { + "epoch": 0.13091754686459225, + "grad_norm": 1.7283350229263306, + "learning_rate": 4.791533704740199e-05, + "loss": 5.191, + "step": 22013 + }, + { + "epoch": 0.13092349414787324, + "grad_norm": 2.290731191635132, + "learning_rate": 4.7915150309275115e-05, + "loss": 4.7131, + "step": 22014 + }, + { + "epoch": 0.13092944143115426, + "grad_norm": 2.1718969345092773, + "learning_rate": 4.7914963563148794e-05, + "loss": 4.6983, + "step": 22015 + }, + { + "epoch": 0.13093538871443525, + "grad_norm": 2.179349184036255, + "learning_rate": 4.791477680902311e-05, + "loss": 4.7265, + "step": 22016 + }, + { + "epoch": 0.13094133599771623, + "grad_norm": 1.7619205713272095, + "learning_rate": 4.79145900468981e-05, + "loss": 5.3916, + "step": 22017 + }, + { + "epoch": 0.13094728328099725, + "grad_norm": 1.827709674835205, + "learning_rate": 4.7914403276773855e-05, + "loss": 5.4988, + "step": 22018 + }, + { + "epoch": 0.13095323056427824, + "grad_norm": 1.768192172050476, + "learning_rate": 4.7914216498650424e-05, + "loss": 5.3605, + "step": 22019 + }, + { + "epoch": 0.13095917784755923, + "grad_norm": 1.6903995275497437, + "learning_rate": 4.791402971252788e-05, + "loss": 5.3919, + "step": 22020 + }, + { + "epoch": 0.13096512513084024, + "grad_norm": 1.5048458576202393, + "learning_rate": 4.791384291840628e-05, + "loss": 5.43, + "step": 22021 + }, + { + "epoch": 0.13097107241412123, + "grad_norm": 1.6317448616027832, + "learning_rate": 4.7913656116285685e-05, + "loss": 5.4964, + "step": 22022 + }, + { + "epoch": 0.13097701969740222, + "grad_norm": 1.775623083114624, + "learning_rate": 4.791346930616619e-05, + "loss": 5.4068, + "step": 22023 + }, + { + "epoch": 0.13098296698068324, + "grad_norm": 1.7148652076721191, + "learning_rate": 4.7913282488047826e-05, + "loss": 5.4362, + "step": 22024 + }, + { + "epoch": 0.13098891426396422, + "grad_norm": 1.6784619092941284, + "learning_rate": 4.7913095661930675e-05, + "loss": 5.3668, + "step": 22025 + }, + { + "epoch": 0.1309948615472452, + "grad_norm": 1.671555757522583, + "learning_rate": 4.79129088278148e-05, + "loss": 5.264, + "step": 22026 + }, + { + "epoch": 0.13100080883052623, + "grad_norm": 1.5523961782455444, + "learning_rate": 4.791272198570027e-05, + "loss": 5.1395, + "step": 22027 + }, + { + "epoch": 0.13100675611380722, + "grad_norm": 1.8762462139129639, + "learning_rate": 4.7912535135587134e-05, + "loss": 5.1099, + "step": 22028 + }, + { + "epoch": 0.1310127033970882, + "grad_norm": 1.7621192932128906, + "learning_rate": 4.7912348277475474e-05, + "loss": 5.0033, + "step": 22029 + }, + { + "epoch": 0.13101865068036922, + "grad_norm": 1.6044316291809082, + "learning_rate": 4.791216141136535e-05, + "loss": 5.2646, + "step": 22030 + }, + { + "epoch": 0.1310245979636502, + "grad_norm": 2.3852479457855225, + "learning_rate": 4.791197453725683e-05, + "loss": 4.7932, + "step": 22031 + }, + { + "epoch": 0.1310305452469312, + "grad_norm": 2.259331703186035, + "learning_rate": 4.7911787655149975e-05, + "loss": 4.8083, + "step": 22032 + }, + { + "epoch": 0.1310364925302122, + "grad_norm": 2.167745351791382, + "learning_rate": 4.791160076504485e-05, + "loss": 4.852, + "step": 22033 + }, + { + "epoch": 0.1310424398134932, + "grad_norm": 1.8246276378631592, + "learning_rate": 4.791141386694152e-05, + "loss": 5.1364, + "step": 22034 + }, + { + "epoch": 0.1310483870967742, + "grad_norm": 1.820461630821228, + "learning_rate": 4.791122696084006e-05, + "loss": 4.9647, + "step": 22035 + }, + { + "epoch": 0.1310543343800552, + "grad_norm": 1.6964235305786133, + "learning_rate": 4.791104004674052e-05, + "loss": 5.4281, + "step": 22036 + }, + { + "epoch": 0.1310602816633362, + "grad_norm": 1.8432056903839111, + "learning_rate": 4.791085312464297e-05, + "loss": 5.1905, + "step": 22037 + }, + { + "epoch": 0.13106622894661718, + "grad_norm": 1.9929230213165283, + "learning_rate": 4.7910666194547485e-05, + "loss": 5.0115, + "step": 22038 + }, + { + "epoch": 0.1310721762298982, + "grad_norm": 1.70926034450531, + "learning_rate": 4.791047925645412e-05, + "loss": 5.299, + "step": 22039 + }, + { + "epoch": 0.13107812351317918, + "grad_norm": 1.5090575218200684, + "learning_rate": 4.791029231036295e-05, + "loss": 5.4832, + "step": 22040 + }, + { + "epoch": 0.13108407079646017, + "grad_norm": 1.9068914651870728, + "learning_rate": 4.7910105356274025e-05, + "loss": 4.6246, + "step": 22041 + }, + { + "epoch": 0.13109001807974116, + "grad_norm": 1.9232919216156006, + "learning_rate": 4.7909918394187425e-05, + "loss": 4.7151, + "step": 22042 + }, + { + "epoch": 0.13109596536302218, + "grad_norm": 1.973927617073059, + "learning_rate": 4.790973142410321e-05, + "loss": 4.4912, + "step": 22043 + }, + { + "epoch": 0.13110191264630316, + "grad_norm": 1.554721474647522, + "learning_rate": 4.7909544446021434e-05, + "loss": 5.211, + "step": 22044 + }, + { + "epoch": 0.13110785992958415, + "grad_norm": 1.8059271574020386, + "learning_rate": 4.7909357459942185e-05, + "loss": 5.2998, + "step": 22045 + }, + { + "epoch": 0.13111380721286517, + "grad_norm": 1.7360923290252686, + "learning_rate": 4.79091704658655e-05, + "loss": 5.58, + "step": 22046 + }, + { + "epoch": 0.13111975449614616, + "grad_norm": 1.627770185470581, + "learning_rate": 4.790898346379148e-05, + "loss": 5.7186, + "step": 22047 + }, + { + "epoch": 0.13112570177942715, + "grad_norm": 1.6354387998580933, + "learning_rate": 4.790879645372016e-05, + "loss": 5.5099, + "step": 22048 + }, + { + "epoch": 0.13113164906270816, + "grad_norm": 1.6667500734329224, + "learning_rate": 4.790860943565161e-05, + "loss": 5.4328, + "step": 22049 + }, + { + "epoch": 0.13113759634598915, + "grad_norm": 1.7549245357513428, + "learning_rate": 4.790842240958591e-05, + "loss": 5.4191, + "step": 22050 + }, + { + "epoch": 0.13114354362927014, + "grad_norm": 1.5705612897872925, + "learning_rate": 4.790823537552311e-05, + "loss": 5.254, + "step": 22051 + }, + { + "epoch": 0.13114949091255115, + "grad_norm": 1.438839316368103, + "learning_rate": 4.790804833346329e-05, + "loss": 5.4708, + "step": 22052 + }, + { + "epoch": 0.13115543819583214, + "grad_norm": 1.8666369915008545, + "learning_rate": 4.790786128340651e-05, + "loss": 5.8635, + "step": 22053 + }, + { + "epoch": 0.13116138547911313, + "grad_norm": 2.1541588306427, + "learning_rate": 4.7907674225352815e-05, + "loss": 5.4732, + "step": 22054 + }, + { + "epoch": 0.13116733276239415, + "grad_norm": 1.6082664728164673, + "learning_rate": 4.79074871593023e-05, + "loss": 5.3902, + "step": 22055 + }, + { + "epoch": 0.13117328004567513, + "grad_norm": 1.7293864488601685, + "learning_rate": 4.790730008525502e-05, + "loss": 5.3317, + "step": 22056 + }, + { + "epoch": 0.13117922732895612, + "grad_norm": 1.830518126487732, + "learning_rate": 4.790711300321104e-05, + "loss": 5.3786, + "step": 22057 + }, + { + "epoch": 0.13118517461223714, + "grad_norm": 2.368182897567749, + "learning_rate": 4.790692591317041e-05, + "loss": 5.8, + "step": 22058 + }, + { + "epoch": 0.13119112189551813, + "grad_norm": 2.27848482131958, + "learning_rate": 4.7906738815133216e-05, + "loss": 5.4954, + "step": 22059 + }, + { + "epoch": 0.13119706917879911, + "grad_norm": 1.6672909259796143, + "learning_rate": 4.790655170909952e-05, + "loss": 5.2937, + "step": 22060 + }, + { + "epoch": 0.13120301646208013, + "grad_norm": 1.9788751602172852, + "learning_rate": 4.790636459506938e-05, + "loss": 5.1761, + "step": 22061 + }, + { + "epoch": 0.13120896374536112, + "grad_norm": 2.8215107917785645, + "learning_rate": 4.7906177473042865e-05, + "loss": 4.9236, + "step": 22062 + }, + { + "epoch": 0.1312149110286421, + "grad_norm": 2.0486905574798584, + "learning_rate": 4.790599034302004e-05, + "loss": 5.2273, + "step": 22063 + }, + { + "epoch": 0.13122085831192312, + "grad_norm": 1.9029892683029175, + "learning_rate": 4.790580320500097e-05, + "loss": 4.7737, + "step": 22064 + }, + { + "epoch": 0.1312268055952041, + "grad_norm": 2.052060842514038, + "learning_rate": 4.790561605898572e-05, + "loss": 4.7055, + "step": 22065 + }, + { + "epoch": 0.1312327528784851, + "grad_norm": 2.3215537071228027, + "learning_rate": 4.790542890497436e-05, + "loss": 4.6687, + "step": 22066 + }, + { + "epoch": 0.13123870016176611, + "grad_norm": 1.9903185367584229, + "learning_rate": 4.790524174296694e-05, + "loss": 4.5768, + "step": 22067 + }, + { + "epoch": 0.1312446474450471, + "grad_norm": 1.9112823009490967, + "learning_rate": 4.790505457296355e-05, + "loss": 4.664, + "step": 22068 + }, + { + "epoch": 0.1312505947283281, + "grad_norm": 2.09714412689209, + "learning_rate": 4.790486739496424e-05, + "loss": 4.4941, + "step": 22069 + }, + { + "epoch": 0.1312565420116091, + "grad_norm": 1.986820936203003, + "learning_rate": 4.7904680208969073e-05, + "loss": 4.8173, + "step": 22070 + }, + { + "epoch": 0.1312624892948901, + "grad_norm": 1.8170347213745117, + "learning_rate": 4.790449301497812e-05, + "loss": 4.78, + "step": 22071 + }, + { + "epoch": 0.13126843657817108, + "grad_norm": 1.7738579511642456, + "learning_rate": 4.790430581299145e-05, + "loss": 5.3492, + "step": 22072 + }, + { + "epoch": 0.1312743838614521, + "grad_norm": 1.9075175523757935, + "learning_rate": 4.7904118603009115e-05, + "loss": 4.4672, + "step": 22073 + }, + { + "epoch": 0.1312803311447331, + "grad_norm": 1.9848250150680542, + "learning_rate": 4.790393138503119e-05, + "loss": 4.2157, + "step": 22074 + }, + { + "epoch": 0.13128627842801407, + "grad_norm": 1.7980430126190186, + "learning_rate": 4.7903744159057745e-05, + "loss": 4.2482, + "step": 22075 + }, + { + "epoch": 0.1312922257112951, + "grad_norm": 1.8066810369491577, + "learning_rate": 4.7903556925088835e-05, + "loss": 4.0731, + "step": 22076 + }, + { + "epoch": 0.13129817299457608, + "grad_norm": 1.901912808418274, + "learning_rate": 4.790336968312453e-05, + "loss": 4.0677, + "step": 22077 + }, + { + "epoch": 0.13130412027785707, + "grad_norm": 1.8650418519973755, + "learning_rate": 4.79031824331649e-05, + "loss": 4.0593, + "step": 22078 + }, + { + "epoch": 0.13131006756113808, + "grad_norm": 1.8098959922790527, + "learning_rate": 4.7902995175210003e-05, + "loss": 4.1248, + "step": 22079 + }, + { + "epoch": 0.13131601484441907, + "grad_norm": 1.7840689420700073, + "learning_rate": 4.790280790925991e-05, + "loss": 4.1299, + "step": 22080 + }, + { + "epoch": 0.13132196212770006, + "grad_norm": 1.847676157951355, + "learning_rate": 4.7902620635314676e-05, + "loss": 3.9775, + "step": 22081 + }, + { + "epoch": 0.13132790941098108, + "grad_norm": 1.970070719718933, + "learning_rate": 4.7902433353374374e-05, + "loss": 3.9744, + "step": 22082 + }, + { + "epoch": 0.13133385669426206, + "grad_norm": 1.7709019184112549, + "learning_rate": 4.790224606343908e-05, + "loss": 3.9691, + "step": 22083 + }, + { + "epoch": 0.13133980397754305, + "grad_norm": 2.0055277347564697, + "learning_rate": 4.790205876550884e-05, + "loss": 4.0181, + "step": 22084 + }, + { + "epoch": 0.13134575126082407, + "grad_norm": 1.8686769008636475, + "learning_rate": 4.790187145958372e-05, + "loss": 3.9445, + "step": 22085 + }, + { + "epoch": 0.13135169854410506, + "grad_norm": 1.8052544593811035, + "learning_rate": 4.790168414566381e-05, + "loss": 4.3716, + "step": 22086 + }, + { + "epoch": 0.13135764582738604, + "grad_norm": 1.730320692062378, + "learning_rate": 4.790149682374915e-05, + "loss": 5.8462, + "step": 22087 + }, + { + "epoch": 0.13136359311066706, + "grad_norm": 1.8372067213058472, + "learning_rate": 4.790130949383982e-05, + "loss": 6.0599, + "step": 22088 + }, + { + "epoch": 0.13136954039394805, + "grad_norm": 1.505204200744629, + "learning_rate": 4.7901122155935874e-05, + "loss": 5.9626, + "step": 22089 + }, + { + "epoch": 0.13137548767722904, + "grad_norm": 2.126800537109375, + "learning_rate": 4.790093481003738e-05, + "loss": 5.3673, + "step": 22090 + }, + { + "epoch": 0.13138143496051005, + "grad_norm": 1.5778108835220337, + "learning_rate": 4.7900747456144415e-05, + "loss": 5.4421, + "step": 22091 + }, + { + "epoch": 0.13138738224379104, + "grad_norm": 1.4741785526275635, + "learning_rate": 4.7900560094257024e-05, + "loss": 5.5546, + "step": 22092 + }, + { + "epoch": 0.13139332952707203, + "grad_norm": 1.3331834077835083, + "learning_rate": 4.7900372724375295e-05, + "loss": 5.592, + "step": 22093 + }, + { + "epoch": 0.13139927681035304, + "grad_norm": 2.421566963195801, + "learning_rate": 4.790018534649927e-05, + "loss": 5.1022, + "step": 22094 + }, + { + "epoch": 0.13140522409363403, + "grad_norm": 1.761720895767212, + "learning_rate": 4.789999796062904e-05, + "loss": 5.2071, + "step": 22095 + }, + { + "epoch": 0.13141117137691502, + "grad_norm": 1.5059387683868408, + "learning_rate": 4.789981056676465e-05, + "loss": 5.3767, + "step": 22096 + }, + { + "epoch": 0.13141711866019604, + "grad_norm": 1.5319740772247314, + "learning_rate": 4.7899623164906176e-05, + "loss": 5.6233, + "step": 22097 + }, + { + "epoch": 0.13142306594347702, + "grad_norm": 1.7106443643569946, + "learning_rate": 4.789943575505368e-05, + "loss": 5.5583, + "step": 22098 + }, + { + "epoch": 0.131429013226758, + "grad_norm": 1.4288161993026733, + "learning_rate": 4.7899248337207227e-05, + "loss": 5.4574, + "step": 22099 + }, + { + "epoch": 0.131434960510039, + "grad_norm": 1.7327675819396973, + "learning_rate": 4.789906091136688e-05, + "loss": 5.3935, + "step": 22100 + }, + { + "epoch": 0.13144090779332002, + "grad_norm": 1.7318532466888428, + "learning_rate": 4.7898873477532716e-05, + "loss": 5.0156, + "step": 22101 + }, + { + "epoch": 0.131446855076601, + "grad_norm": 1.4947113990783691, + "learning_rate": 4.789868603570478e-05, + "loss": 5.2255, + "step": 22102 + }, + { + "epoch": 0.131452802359882, + "grad_norm": 2.454650402069092, + "learning_rate": 4.789849858588316e-05, + "loss": 5.0697, + "step": 22103 + }, + { + "epoch": 0.131458749643163, + "grad_norm": 2.0269839763641357, + "learning_rate": 4.789831112806791e-05, + "loss": 5.3687, + "step": 22104 + }, + { + "epoch": 0.131464696926444, + "grad_norm": 1.89911687374115, + "learning_rate": 4.7898123662259084e-05, + "loss": 5.1816, + "step": 22105 + }, + { + "epoch": 0.13147064420972498, + "grad_norm": 1.7952163219451904, + "learning_rate": 4.789793618845677e-05, + "loss": 5.1441, + "step": 22106 + }, + { + "epoch": 0.131476591493006, + "grad_norm": 1.458935022354126, + "learning_rate": 4.789774870666102e-05, + "loss": 4.8489, + "step": 22107 + }, + { + "epoch": 0.131482538776287, + "grad_norm": 1.5516583919525146, + "learning_rate": 4.78975612168719e-05, + "loss": 4.9763, + "step": 22108 + }, + { + "epoch": 0.13148848605956798, + "grad_norm": 1.525307297706604, + "learning_rate": 4.789737371908948e-05, + "loss": 5.5826, + "step": 22109 + }, + { + "epoch": 0.131494433342849, + "grad_norm": 1.516675353050232, + "learning_rate": 4.7897186213313824e-05, + "loss": 5.7384, + "step": 22110 + }, + { + "epoch": 0.13150038062612998, + "grad_norm": 1.3918993473052979, + "learning_rate": 4.7896998699545e-05, + "loss": 5.9798, + "step": 22111 + }, + { + "epoch": 0.13150632790941097, + "grad_norm": 1.7346227169036865, + "learning_rate": 4.789681117778307e-05, + "loss": 5.4939, + "step": 22112 + }, + { + "epoch": 0.13151227519269199, + "grad_norm": 1.784882664680481, + "learning_rate": 4.7896623648028094e-05, + "loss": 5.5369, + "step": 22113 + }, + { + "epoch": 0.13151822247597297, + "grad_norm": 1.5360532999038696, + "learning_rate": 4.789643611028015e-05, + "loss": 5.5539, + "step": 22114 + }, + { + "epoch": 0.13152416975925396, + "grad_norm": 1.3865541219711304, + "learning_rate": 4.789624856453929e-05, + "loss": 5.6192, + "step": 22115 + }, + { + "epoch": 0.13153011704253498, + "grad_norm": 1.8362021446228027, + "learning_rate": 4.7896061010805596e-05, + "loss": 5.6915, + "step": 22116 + }, + { + "epoch": 0.13153606432581597, + "grad_norm": 1.607771635055542, + "learning_rate": 4.789587344907911e-05, + "loss": 5.4442, + "step": 22117 + }, + { + "epoch": 0.13154201160909695, + "grad_norm": 1.5097888708114624, + "learning_rate": 4.789568587935992e-05, + "loss": 5.84, + "step": 22118 + }, + { + "epoch": 0.13154795889237797, + "grad_norm": 1.4404877424240112, + "learning_rate": 4.789549830164809e-05, + "loss": 5.7407, + "step": 22119 + }, + { + "epoch": 0.13155390617565896, + "grad_norm": 1.5682063102722168, + "learning_rate": 4.7895310715943665e-05, + "loss": 5.3026, + "step": 22120 + }, + { + "epoch": 0.13155985345893995, + "grad_norm": 1.6435290575027466, + "learning_rate": 4.789512312224672e-05, + "loss": 5.7749, + "step": 22121 + }, + { + "epoch": 0.13156580074222096, + "grad_norm": 1.7454910278320312, + "learning_rate": 4.7894935520557335e-05, + "loss": 5.5817, + "step": 22122 + }, + { + "epoch": 0.13157174802550195, + "grad_norm": 1.9168800115585327, + "learning_rate": 4.789474791087556e-05, + "loss": 4.3752, + "step": 22123 + }, + { + "epoch": 0.13157769530878294, + "grad_norm": 2.1051509380340576, + "learning_rate": 4.789456029320147e-05, + "loss": 3.6253, + "step": 22124 + }, + { + "epoch": 0.13158364259206395, + "grad_norm": 2.0902812480926514, + "learning_rate": 4.789437266753512e-05, + "loss": 4.039, + "step": 22125 + }, + { + "epoch": 0.13158958987534494, + "grad_norm": 1.804121971130371, + "learning_rate": 4.789418503387658e-05, + "loss": 3.6551, + "step": 22126 + }, + { + "epoch": 0.13159553715862593, + "grad_norm": 1.992370367050171, + "learning_rate": 4.789399739222592e-05, + "loss": 3.6387, + "step": 22127 + }, + { + "epoch": 0.13160148444190695, + "grad_norm": 2.0625061988830566, + "learning_rate": 4.7893809742583204e-05, + "loss": 3.943, + "step": 22128 + }, + { + "epoch": 0.13160743172518793, + "grad_norm": 2.021989107131958, + "learning_rate": 4.789362208494849e-05, + "loss": 4.0269, + "step": 22129 + }, + { + "epoch": 0.13161337900846892, + "grad_norm": 2.037161350250244, + "learning_rate": 4.7893434419321856e-05, + "loss": 5.3085, + "step": 22130 + }, + { + "epoch": 0.13161932629174994, + "grad_norm": 1.8836485147476196, + "learning_rate": 4.7893246745703355e-05, + "loss": 4.7337, + "step": 22131 + }, + { + "epoch": 0.13162527357503093, + "grad_norm": 1.5900107622146606, + "learning_rate": 4.789305906409306e-05, + "loss": 5.0772, + "step": 22132 + }, + { + "epoch": 0.13163122085831191, + "grad_norm": 1.627558946609497, + "learning_rate": 4.789287137449103e-05, + "loss": 5.1703, + "step": 22133 + }, + { + "epoch": 0.13163716814159293, + "grad_norm": 1.8517992496490479, + "learning_rate": 4.7892683676897344e-05, + "loss": 5.173, + "step": 22134 + }, + { + "epoch": 0.13164311542487392, + "grad_norm": 1.2436500787734985, + "learning_rate": 4.789249597131205e-05, + "loss": 4.956, + "step": 22135 + }, + { + "epoch": 0.1316490627081549, + "grad_norm": 1.5156265497207642, + "learning_rate": 4.789230825773523e-05, + "loss": 5.6121, + "step": 22136 + }, + { + "epoch": 0.13165500999143592, + "grad_norm": 1.3742187023162842, + "learning_rate": 4.789212053616694e-05, + "loss": 5.2186, + "step": 22137 + }, + { + "epoch": 0.1316609572747169, + "grad_norm": 1.3079794645309448, + "learning_rate": 4.7891932806607245e-05, + "loss": 5.4108, + "step": 22138 + }, + { + "epoch": 0.1316669045579979, + "grad_norm": 1.5291730165481567, + "learning_rate": 4.789174506905621e-05, + "loss": 5.1516, + "step": 22139 + }, + { + "epoch": 0.13167285184127892, + "grad_norm": 1.3465576171875, + "learning_rate": 4.7891557323513904e-05, + "loss": 4.9797, + "step": 22140 + }, + { + "epoch": 0.1316787991245599, + "grad_norm": 1.228513479232788, + "learning_rate": 4.789136956998039e-05, + "loss": 5.0119, + "step": 22141 + }, + { + "epoch": 0.1316847464078409, + "grad_norm": 1.4027810096740723, + "learning_rate": 4.789118180845574e-05, + "loss": 5.2781, + "step": 22142 + }, + { + "epoch": 0.1316906936911219, + "grad_norm": 1.371072769165039, + "learning_rate": 4.789099403894002e-05, + "loss": 5.1414, + "step": 22143 + }, + { + "epoch": 0.1316966409744029, + "grad_norm": 1.264255404472351, + "learning_rate": 4.7890806261433286e-05, + "loss": 4.9926, + "step": 22144 + }, + { + "epoch": 0.13170258825768388, + "grad_norm": 1.351501226425171, + "learning_rate": 4.78906184759356e-05, + "loss": 5.1473, + "step": 22145 + }, + { + "epoch": 0.1317085355409649, + "grad_norm": 1.4877911806106567, + "learning_rate": 4.7890430682447046e-05, + "loss": 5.2634, + "step": 22146 + }, + { + "epoch": 0.1317144828242459, + "grad_norm": 1.3446416854858398, + "learning_rate": 4.7890242880967675e-05, + "loss": 5.197, + "step": 22147 + }, + { + "epoch": 0.13172043010752688, + "grad_norm": 1.2246133089065552, + "learning_rate": 4.789005507149756e-05, + "loss": 5.1262, + "step": 22148 + }, + { + "epoch": 0.1317263773908079, + "grad_norm": 1.3092166185379028, + "learning_rate": 4.7889867254036755e-05, + "loss": 5.0157, + "step": 22149 + }, + { + "epoch": 0.13173232467408888, + "grad_norm": 1.3076307773590088, + "learning_rate": 4.788967942858534e-05, + "loss": 5.159, + "step": 22150 + }, + { + "epoch": 0.13173827195736987, + "grad_norm": 1.3207625150680542, + "learning_rate": 4.788949159514338e-05, + "loss": 5.1559, + "step": 22151 + }, + { + "epoch": 0.13174421924065088, + "grad_norm": 1.4235469102859497, + "learning_rate": 4.788930375371092e-05, + "loss": 4.9426, + "step": 22152 + }, + { + "epoch": 0.13175016652393187, + "grad_norm": 1.4294525384902954, + "learning_rate": 4.7889115904288054e-05, + "loss": 5.0116, + "step": 22153 + }, + { + "epoch": 0.13175611380721286, + "grad_norm": 1.3456943035125732, + "learning_rate": 4.788892804687483e-05, + "loss": 4.9962, + "step": 22154 + }, + { + "epoch": 0.13176206109049388, + "grad_norm": 1.368545651435852, + "learning_rate": 4.788874018147132e-05, + "loss": 5.1523, + "step": 22155 + }, + { + "epoch": 0.13176800837377486, + "grad_norm": 1.2844034433364868, + "learning_rate": 4.788855230807758e-05, + "loss": 4.879, + "step": 22156 + }, + { + "epoch": 0.13177395565705585, + "grad_norm": 1.3061450719833374, + "learning_rate": 4.788836442669369e-05, + "loss": 4.9011, + "step": 22157 + }, + { + "epoch": 0.13177990294033684, + "grad_norm": 1.4233042001724243, + "learning_rate": 4.788817653731971e-05, + "loss": 4.8821, + "step": 22158 + }, + { + "epoch": 0.13178585022361786, + "grad_norm": 1.4013172388076782, + "learning_rate": 4.788798863995569e-05, + "loss": 4.8431, + "step": 22159 + }, + { + "epoch": 0.13179179750689884, + "grad_norm": 1.2786699533462524, + "learning_rate": 4.7887800734601716e-05, + "loss": 4.6884, + "step": 22160 + }, + { + "epoch": 0.13179774479017983, + "grad_norm": 1.408245325088501, + "learning_rate": 4.7887612821257855e-05, + "loss": 5.2191, + "step": 22161 + }, + { + "epoch": 0.13180369207346085, + "grad_norm": 1.5876145362854004, + "learning_rate": 4.788742489992416e-05, + "loss": 5.459, + "step": 22162 + }, + { + "epoch": 0.13180963935674184, + "grad_norm": 1.4462308883666992, + "learning_rate": 4.7887236970600705e-05, + "loss": 5.2757, + "step": 22163 + }, + { + "epoch": 0.13181558664002282, + "grad_norm": 1.288514494895935, + "learning_rate": 4.7887049033287546e-05, + "loss": 5.1, + "step": 22164 + }, + { + "epoch": 0.13182153392330384, + "grad_norm": 1.387949824333191, + "learning_rate": 4.788686108798476e-05, + "loss": 4.9212, + "step": 22165 + }, + { + "epoch": 0.13182748120658483, + "grad_norm": 1.534636378288269, + "learning_rate": 4.7886673134692404e-05, + "loss": 4.7585, + "step": 22166 + }, + { + "epoch": 0.13183342848986582, + "grad_norm": 1.464815378189087, + "learning_rate": 4.788648517341054e-05, + "loss": 5.121, + "step": 22167 + }, + { + "epoch": 0.13183937577314683, + "grad_norm": 1.2842152118682861, + "learning_rate": 4.788629720413925e-05, + "loss": 5.1032, + "step": 22168 + }, + { + "epoch": 0.13184532305642782, + "grad_norm": 1.5626686811447144, + "learning_rate": 4.7886109226878595e-05, + "loss": 4.9001, + "step": 22169 + }, + { + "epoch": 0.1318512703397088, + "grad_norm": 1.4019660949707031, + "learning_rate": 4.788592124162863e-05, + "loss": 5.2157, + "step": 22170 + }, + { + "epoch": 0.13185721762298983, + "grad_norm": 1.1018543243408203, + "learning_rate": 4.788573324838942e-05, + "loss": 5.5623, + "step": 22171 + }, + { + "epoch": 0.1318631649062708, + "grad_norm": 1.4074633121490479, + "learning_rate": 4.788554524716105e-05, + "loss": 5.0306, + "step": 22172 + }, + { + "epoch": 0.1318691121895518, + "grad_norm": 1.4724953174591064, + "learning_rate": 4.788535723794356e-05, + "loss": 5.033, + "step": 22173 + }, + { + "epoch": 0.13187505947283282, + "grad_norm": 1.359288215637207, + "learning_rate": 4.788516922073703e-05, + "loss": 4.918, + "step": 22174 + }, + { + "epoch": 0.1318810067561138, + "grad_norm": 1.3733046054840088, + "learning_rate": 4.788498119554152e-05, + "loss": 4.9631, + "step": 22175 + }, + { + "epoch": 0.1318869540393948, + "grad_norm": 1.1926368474960327, + "learning_rate": 4.7884793162357114e-05, + "loss": 4.8628, + "step": 22176 + }, + { + "epoch": 0.1318929013226758, + "grad_norm": 1.1444061994552612, + "learning_rate": 4.788460512118386e-05, + "loss": 4.8978, + "step": 22177 + }, + { + "epoch": 0.1318988486059568, + "grad_norm": 1.3945989608764648, + "learning_rate": 4.7884417072021814e-05, + "loss": 4.9901, + "step": 22178 + }, + { + "epoch": 0.13190479588923779, + "grad_norm": 1.4278130531311035, + "learning_rate": 4.7884229014871063e-05, + "loss": 4.8705, + "step": 22179 + }, + { + "epoch": 0.1319107431725188, + "grad_norm": 1.4391251802444458, + "learning_rate": 4.788404094973167e-05, + "loss": 4.8575, + "step": 22180 + }, + { + "epoch": 0.1319166904557998, + "grad_norm": 1.435241460800171, + "learning_rate": 4.788385287660369e-05, + "loss": 4.8571, + "step": 22181 + }, + { + "epoch": 0.13192263773908078, + "grad_norm": 1.2841169834136963, + "learning_rate": 4.788366479548718e-05, + "loss": 4.8738, + "step": 22182 + }, + { + "epoch": 0.1319285850223618, + "grad_norm": 1.318769931793213, + "learning_rate": 4.7883476706382236e-05, + "loss": 5.1381, + "step": 22183 + }, + { + "epoch": 0.13193453230564278, + "grad_norm": 1.398940920829773, + "learning_rate": 4.78832886092889e-05, + "loss": 4.8094, + "step": 22184 + }, + { + "epoch": 0.13194047958892377, + "grad_norm": 1.373937726020813, + "learning_rate": 4.788310050420725e-05, + "loss": 5.0183, + "step": 22185 + }, + { + "epoch": 0.1319464268722048, + "grad_norm": 1.2899675369262695, + "learning_rate": 4.788291239113734e-05, + "loss": 5.3211, + "step": 22186 + }, + { + "epoch": 0.13195237415548577, + "grad_norm": 1.2992362976074219, + "learning_rate": 4.788272427007924e-05, + "loss": 5.2411, + "step": 22187 + }, + { + "epoch": 0.13195832143876676, + "grad_norm": 1.3528488874435425, + "learning_rate": 4.7882536141033025e-05, + "loss": 5.272, + "step": 22188 + }, + { + "epoch": 0.13196426872204778, + "grad_norm": 1.0530016422271729, + "learning_rate": 4.7882348003998746e-05, + "loss": 5.1516, + "step": 22189 + }, + { + "epoch": 0.13197021600532877, + "grad_norm": 1.3447175025939941, + "learning_rate": 4.7882159858976486e-05, + "loss": 5.0007, + "step": 22190 + }, + { + "epoch": 0.13197616328860975, + "grad_norm": 1.531227946281433, + "learning_rate": 4.788197170596629e-05, + "loss": 5.0506, + "step": 22191 + }, + { + "epoch": 0.13198211057189077, + "grad_norm": 1.3458744287490845, + "learning_rate": 4.788178354496823e-05, + "loss": 4.931, + "step": 22192 + }, + { + "epoch": 0.13198805785517176, + "grad_norm": 1.380890965461731, + "learning_rate": 4.788159537598239e-05, + "loss": 5.2813, + "step": 22193 + }, + { + "epoch": 0.13199400513845275, + "grad_norm": 1.387640118598938, + "learning_rate": 4.788140719900881e-05, + "loss": 5.1234, + "step": 22194 + }, + { + "epoch": 0.13199995242173376, + "grad_norm": 1.304620623588562, + "learning_rate": 4.788121901404757e-05, + "loss": 4.988, + "step": 22195 + }, + { + "epoch": 0.13200589970501475, + "grad_norm": 1.3828579187393188, + "learning_rate": 4.7881030821098736e-05, + "loss": 5.2552, + "step": 22196 + }, + { + "epoch": 0.13201184698829574, + "grad_norm": 1.4819931983947754, + "learning_rate": 4.788084262016237e-05, + "loss": 4.9094, + "step": 22197 + }, + { + "epoch": 0.13201779427157675, + "grad_norm": 1.4570109844207764, + "learning_rate": 4.788065441123853e-05, + "loss": 5.0518, + "step": 22198 + }, + { + "epoch": 0.13202374155485774, + "grad_norm": 1.4303123950958252, + "learning_rate": 4.7880466194327305e-05, + "loss": 4.773, + "step": 22199 + }, + { + "epoch": 0.13202968883813873, + "grad_norm": 1.5727583169937134, + "learning_rate": 4.788027796942874e-05, + "loss": 4.458, + "step": 22200 + }, + { + "epoch": 0.13203563612141975, + "grad_norm": 1.5693985223770142, + "learning_rate": 4.78800897365429e-05, + "loss": 4.4378, + "step": 22201 + }, + { + "epoch": 0.13204158340470074, + "grad_norm": 1.4328757524490356, + "learning_rate": 4.787990149566987e-05, + "loss": 4.3503, + "step": 22202 + }, + { + "epoch": 0.13204753068798172, + "grad_norm": 1.4490034580230713, + "learning_rate": 4.787971324680969e-05, + "loss": 4.3476, + "step": 22203 + }, + { + "epoch": 0.13205347797126274, + "grad_norm": 1.4600367546081543, + "learning_rate": 4.7879524989962446e-05, + "loss": 4.3052, + "step": 22204 + }, + { + "epoch": 0.13205942525454373, + "grad_norm": 1.5479463338851929, + "learning_rate": 4.787933672512819e-05, + "loss": 4.3291, + "step": 22205 + }, + { + "epoch": 0.13206537253782472, + "grad_norm": 1.6317998170852661, + "learning_rate": 4.7879148452306986e-05, + "loss": 4.2697, + "step": 22206 + }, + { + "epoch": 0.13207131982110573, + "grad_norm": 1.5387004613876343, + "learning_rate": 4.787896017149892e-05, + "loss": 4.3413, + "step": 22207 + }, + { + "epoch": 0.13207726710438672, + "grad_norm": 1.5556374788284302, + "learning_rate": 4.7878771882704046e-05, + "loss": 4.2002, + "step": 22208 + }, + { + "epoch": 0.1320832143876677, + "grad_norm": 1.626752495765686, + "learning_rate": 4.787858358592243e-05, + "loss": 4.2729, + "step": 22209 + }, + { + "epoch": 0.13208916167094872, + "grad_norm": 1.3982586860656738, + "learning_rate": 4.7878395281154134e-05, + "loss": 4.2138, + "step": 22210 + }, + { + "epoch": 0.1320951089542297, + "grad_norm": 1.5739530324935913, + "learning_rate": 4.787820696839922e-05, + "loss": 4.1526, + "step": 22211 + }, + { + "epoch": 0.1321010562375107, + "grad_norm": 1.458217978477478, + "learning_rate": 4.787801864765777e-05, + "loss": 4.2584, + "step": 22212 + }, + { + "epoch": 0.13210700352079172, + "grad_norm": 1.4696205854415894, + "learning_rate": 4.787783031892984e-05, + "loss": 4.2042, + "step": 22213 + }, + { + "epoch": 0.1321129508040727, + "grad_norm": 1.729152798652649, + "learning_rate": 4.7877641982215485e-05, + "loss": 4.4817, + "step": 22214 + }, + { + "epoch": 0.1321188980873537, + "grad_norm": 1.7412737607955933, + "learning_rate": 4.787745363751479e-05, + "loss": 4.4568, + "step": 22215 + }, + { + "epoch": 0.13212484537063468, + "grad_norm": 1.6463770866394043, + "learning_rate": 4.787726528482781e-05, + "loss": 4.4503, + "step": 22216 + }, + { + "epoch": 0.1321307926539157, + "grad_norm": 1.5496896505355835, + "learning_rate": 4.7877076924154617e-05, + "loss": 4.3863, + "step": 22217 + }, + { + "epoch": 0.13213673993719668, + "grad_norm": 1.6521345376968384, + "learning_rate": 4.787688855549527e-05, + "loss": 4.3847, + "step": 22218 + }, + { + "epoch": 0.13214268722047767, + "grad_norm": 1.6477288007736206, + "learning_rate": 4.7876700178849836e-05, + "loss": 4.3939, + "step": 22219 + }, + { + "epoch": 0.1321486345037587, + "grad_norm": 1.6795778274536133, + "learning_rate": 4.787651179421838e-05, + "loss": 4.1722, + "step": 22220 + }, + { + "epoch": 0.13215458178703968, + "grad_norm": 1.5795823335647583, + "learning_rate": 4.787632340160098e-05, + "loss": 4.2125, + "step": 22221 + }, + { + "epoch": 0.13216052907032066, + "grad_norm": 1.6583930253982544, + "learning_rate": 4.7876135000997686e-05, + "loss": 4.2013, + "step": 22222 + }, + { + "epoch": 0.13216647635360168, + "grad_norm": 1.4495878219604492, + "learning_rate": 4.7875946592408575e-05, + "loss": 4.1335, + "step": 22223 + }, + { + "epoch": 0.13217242363688267, + "grad_norm": 1.5657227039337158, + "learning_rate": 4.78757581758337e-05, + "loss": 4.1514, + "step": 22224 + }, + { + "epoch": 0.13217837092016366, + "grad_norm": 1.7183332443237305, + "learning_rate": 4.787556975127313e-05, + "loss": 4.7715, + "step": 22225 + }, + { + "epoch": 0.13218431820344467, + "grad_norm": 2.1822710037231445, + "learning_rate": 4.7875381318726945e-05, + "loss": 4.9383, + "step": 22226 + }, + { + "epoch": 0.13219026548672566, + "grad_norm": 1.9633662700653076, + "learning_rate": 4.787519287819519e-05, + "loss": 4.9601, + "step": 22227 + }, + { + "epoch": 0.13219621277000665, + "grad_norm": 1.6858619451522827, + "learning_rate": 4.787500442967795e-05, + "loss": 5.0091, + "step": 22228 + }, + { + "epoch": 0.13220216005328767, + "grad_norm": 1.5447601079940796, + "learning_rate": 4.787481597317528e-05, + "loss": 4.8372, + "step": 22229 + }, + { + "epoch": 0.13220810733656865, + "grad_norm": 1.4934616088867188, + "learning_rate": 4.787462750868725e-05, + "loss": 4.9812, + "step": 22230 + }, + { + "epoch": 0.13221405461984964, + "grad_norm": 1.4039883613586426, + "learning_rate": 4.787443903621393e-05, + "loss": 4.829, + "step": 22231 + }, + { + "epoch": 0.13222000190313066, + "grad_norm": 1.5184186697006226, + "learning_rate": 4.787425055575536e-05, + "loss": 4.8379, + "step": 22232 + }, + { + "epoch": 0.13222594918641165, + "grad_norm": 1.3783762454986572, + "learning_rate": 4.787406206731164e-05, + "loss": 4.9209, + "step": 22233 + }, + { + "epoch": 0.13223189646969263, + "grad_norm": 1.360772967338562, + "learning_rate": 4.787387357088282e-05, + "loss": 5.0036, + "step": 22234 + }, + { + "epoch": 0.13223784375297365, + "grad_norm": 1.4753018617630005, + "learning_rate": 4.787368506646897e-05, + "loss": 5.3268, + "step": 22235 + }, + { + "epoch": 0.13224379103625464, + "grad_norm": 1.3295317888259888, + "learning_rate": 4.787349655407014e-05, + "loss": 5.3096, + "step": 22236 + }, + { + "epoch": 0.13224973831953563, + "grad_norm": 1.4120566844940186, + "learning_rate": 4.787330803368642e-05, + "loss": 4.9041, + "step": 22237 + }, + { + "epoch": 0.13225568560281664, + "grad_norm": 1.3822401762008667, + "learning_rate": 4.787311950531787e-05, + "loss": 5.0089, + "step": 22238 + }, + { + "epoch": 0.13226163288609763, + "grad_norm": 1.0574642419815063, + "learning_rate": 4.7872930968964535e-05, + "loss": 5.528, + "step": 22239 + }, + { + "epoch": 0.13226758016937862, + "grad_norm": 1.4523993730545044, + "learning_rate": 4.78727424246265e-05, + "loss": 5.3844, + "step": 22240 + }, + { + "epoch": 0.13227352745265963, + "grad_norm": 1.283956527709961, + "learning_rate": 4.787255387230383e-05, + "loss": 5.226, + "step": 22241 + }, + { + "epoch": 0.13227947473594062, + "grad_norm": 1.621275782585144, + "learning_rate": 4.7872365311996594e-05, + "loss": 4.7797, + "step": 22242 + }, + { + "epoch": 0.1322854220192216, + "grad_norm": 1.327376365661621, + "learning_rate": 4.787217674370484e-05, + "loss": 4.9057, + "step": 22243 + }, + { + "epoch": 0.13229136930250263, + "grad_norm": 1.5311939716339111, + "learning_rate": 4.787198816742865e-05, + "loss": 5.0076, + "step": 22244 + }, + { + "epoch": 0.13229731658578361, + "grad_norm": 1.3926832675933838, + "learning_rate": 4.7871799583168085e-05, + "loss": 4.9328, + "step": 22245 + }, + { + "epoch": 0.1323032638690646, + "grad_norm": 1.2381867170333862, + "learning_rate": 4.787161099092321e-05, + "loss": 5.1678, + "step": 22246 + }, + { + "epoch": 0.13230921115234562, + "grad_norm": 1.1969068050384521, + "learning_rate": 4.78714223906941e-05, + "loss": 5.5106, + "step": 22247 + }, + { + "epoch": 0.1323151584356266, + "grad_norm": 1.2368844747543335, + "learning_rate": 4.7871233782480804e-05, + "loss": 5.4105, + "step": 22248 + }, + { + "epoch": 0.1323211057189076, + "grad_norm": 1.45974862575531, + "learning_rate": 4.78710451662834e-05, + "loss": 4.9328, + "step": 22249 + }, + { + "epoch": 0.1323270530021886, + "grad_norm": 1.2457060813903809, + "learning_rate": 4.787085654210195e-05, + "loss": 5.225, + "step": 22250 + }, + { + "epoch": 0.1323330002854696, + "grad_norm": 1.4274303913116455, + "learning_rate": 4.787066790993652e-05, + "loss": 4.8785, + "step": 22251 + }, + { + "epoch": 0.1323389475687506, + "grad_norm": 1.3072400093078613, + "learning_rate": 4.7870479269787174e-05, + "loss": 4.871, + "step": 22252 + }, + { + "epoch": 0.1323448948520316, + "grad_norm": 1.2442991733551025, + "learning_rate": 4.787029062165398e-05, + "loss": 4.8374, + "step": 22253 + }, + { + "epoch": 0.1323508421353126, + "grad_norm": 1.3584920167922974, + "learning_rate": 4.787010196553701e-05, + "loss": 5.2427, + "step": 22254 + }, + { + "epoch": 0.13235678941859358, + "grad_norm": 1.560067892074585, + "learning_rate": 4.786991330143632e-05, + "loss": 4.8689, + "step": 22255 + }, + { + "epoch": 0.1323627367018746, + "grad_norm": 1.3197054862976074, + "learning_rate": 4.786972462935198e-05, + "loss": 4.8326, + "step": 22256 + }, + { + "epoch": 0.13236868398515558, + "grad_norm": 1.2790191173553467, + "learning_rate": 4.786953594928405e-05, + "loss": 4.7454, + "step": 22257 + }, + { + "epoch": 0.13237463126843657, + "grad_norm": 1.6187344789505005, + "learning_rate": 4.7869347261232606e-05, + "loss": 5.5456, + "step": 22258 + }, + { + "epoch": 0.1323805785517176, + "grad_norm": 1.3327410221099854, + "learning_rate": 4.786915856519771e-05, + "loss": 4.834, + "step": 22259 + }, + { + "epoch": 0.13238652583499858, + "grad_norm": 1.2602509260177612, + "learning_rate": 4.786896986117943e-05, + "loss": 5.1677, + "step": 22260 + }, + { + "epoch": 0.13239247311827956, + "grad_norm": 1.4382299184799194, + "learning_rate": 4.786878114917782e-05, + "loss": 5.0591, + "step": 22261 + }, + { + "epoch": 0.13239842040156058, + "grad_norm": 1.4061304330825806, + "learning_rate": 4.786859242919296e-05, + "loss": 5.0161, + "step": 22262 + }, + { + "epoch": 0.13240436768484157, + "grad_norm": 1.4143967628479004, + "learning_rate": 4.7868403701224905e-05, + "loss": 4.7625, + "step": 22263 + }, + { + "epoch": 0.13241031496812256, + "grad_norm": 1.4221394062042236, + "learning_rate": 4.786821496527374e-05, + "loss": 4.8579, + "step": 22264 + }, + { + "epoch": 0.13241626225140357, + "grad_norm": 1.3852332830429077, + "learning_rate": 4.78680262213395e-05, + "loss": 4.6081, + "step": 22265 + }, + { + "epoch": 0.13242220953468456, + "grad_norm": 1.2698066234588623, + "learning_rate": 4.786783746942228e-05, + "loss": 4.7903, + "step": 22266 + }, + { + "epoch": 0.13242815681796555, + "grad_norm": 1.2313082218170166, + "learning_rate": 4.7867648709522136e-05, + "loss": 4.8353, + "step": 22267 + }, + { + "epoch": 0.13243410410124656, + "grad_norm": 1.3578218221664429, + "learning_rate": 4.7867459941639124e-05, + "loss": 5.2778, + "step": 22268 + }, + { + "epoch": 0.13244005138452755, + "grad_norm": 1.5034034252166748, + "learning_rate": 4.786727116577332e-05, + "loss": 5.2208, + "step": 22269 + }, + { + "epoch": 0.13244599866780854, + "grad_norm": 1.621207356452942, + "learning_rate": 4.786708238192479e-05, + "loss": 4.8394, + "step": 22270 + }, + { + "epoch": 0.13245194595108956, + "grad_norm": 1.471311092376709, + "learning_rate": 4.7866893590093595e-05, + "loss": 4.8942, + "step": 22271 + }, + { + "epoch": 0.13245789323437054, + "grad_norm": 1.3276898860931396, + "learning_rate": 4.7866704790279806e-05, + "loss": 4.833, + "step": 22272 + }, + { + "epoch": 0.13246384051765153, + "grad_norm": 1.484650731086731, + "learning_rate": 4.786651598248349e-05, + "loss": 5.0415, + "step": 22273 + }, + { + "epoch": 0.13246978780093252, + "grad_norm": 1.3327105045318604, + "learning_rate": 4.7866327166704703e-05, + "loss": 5.2227, + "step": 22274 + }, + { + "epoch": 0.13247573508421354, + "grad_norm": 1.4387754201889038, + "learning_rate": 4.7866138342943525e-05, + "loss": 5.1764, + "step": 22275 + }, + { + "epoch": 0.13248168236749452, + "grad_norm": 1.3406511545181274, + "learning_rate": 4.786594951120001e-05, + "loss": 5.2711, + "step": 22276 + }, + { + "epoch": 0.1324876296507755, + "grad_norm": 1.3859505653381348, + "learning_rate": 4.7865760671474224e-05, + "loss": 5.1102, + "step": 22277 + }, + { + "epoch": 0.13249357693405653, + "grad_norm": 1.517545461654663, + "learning_rate": 4.7865571823766245e-05, + "loss": 5.1275, + "step": 22278 + }, + { + "epoch": 0.13249952421733752, + "grad_norm": 1.720278263092041, + "learning_rate": 4.7865382968076125e-05, + "loss": 5.0902, + "step": 22279 + }, + { + "epoch": 0.1325054715006185, + "grad_norm": 1.543717622756958, + "learning_rate": 4.786519410440394e-05, + "loss": 5.1094, + "step": 22280 + }, + { + "epoch": 0.13251141878389952, + "grad_norm": 1.2068023681640625, + "learning_rate": 4.786500523274975e-05, + "loss": 5.1791, + "step": 22281 + }, + { + "epoch": 0.1325173660671805, + "grad_norm": 1.426169991493225, + "learning_rate": 4.786481635311362e-05, + "loss": 5.2155, + "step": 22282 + }, + { + "epoch": 0.1325233133504615, + "grad_norm": 1.4624898433685303, + "learning_rate": 4.7864627465495626e-05, + "loss": 4.8741, + "step": 22283 + }, + { + "epoch": 0.1325292606337425, + "grad_norm": 1.2942382097244263, + "learning_rate": 4.786443856989582e-05, + "loss": 5.4888, + "step": 22284 + }, + { + "epoch": 0.1325352079170235, + "grad_norm": 1.2372108697891235, + "learning_rate": 4.786424966631428e-05, + "loss": 5.1907, + "step": 22285 + }, + { + "epoch": 0.1325411552003045, + "grad_norm": 1.368546962738037, + "learning_rate": 4.7864060754751064e-05, + "loss": 5.1653, + "step": 22286 + }, + { + "epoch": 0.1325471024835855, + "grad_norm": 1.6052632331848145, + "learning_rate": 4.786387183520624e-05, + "loss": 5.2139, + "step": 22287 + }, + { + "epoch": 0.1325530497668665, + "grad_norm": 1.4893959760665894, + "learning_rate": 4.7863682907679874e-05, + "loss": 4.9972, + "step": 22288 + }, + { + "epoch": 0.13255899705014748, + "grad_norm": 1.370919942855835, + "learning_rate": 4.786349397217204e-05, + "loss": 5.315, + "step": 22289 + }, + { + "epoch": 0.1325649443334285, + "grad_norm": 1.7138948440551758, + "learning_rate": 4.786330502868279e-05, + "loss": 5.4063, + "step": 22290 + }, + { + "epoch": 0.13257089161670949, + "grad_norm": 1.4117851257324219, + "learning_rate": 4.786311607721219e-05, + "loss": 5.3601, + "step": 22291 + }, + { + "epoch": 0.13257683889999047, + "grad_norm": 2.5631167888641357, + "learning_rate": 4.786292711776033e-05, + "loss": 3.8547, + "step": 22292 + }, + { + "epoch": 0.1325827861832715, + "grad_norm": 2.4507203102111816, + "learning_rate": 4.786273815032724e-05, + "loss": 3.9096, + "step": 22293 + }, + { + "epoch": 0.13258873346655248, + "grad_norm": 2.384136915206909, + "learning_rate": 4.7862549174913014e-05, + "loss": 4.0437, + "step": 22294 + }, + { + "epoch": 0.13259468074983347, + "grad_norm": 2.215449094772339, + "learning_rate": 4.786236019151771e-05, + "loss": 3.9703, + "step": 22295 + }, + { + "epoch": 0.13260062803311448, + "grad_norm": 2.1639139652252197, + "learning_rate": 4.786217120014138e-05, + "loss": 3.5108, + "step": 22296 + }, + { + "epoch": 0.13260657531639547, + "grad_norm": 2.2001569271087646, + "learning_rate": 4.786198220078412e-05, + "loss": 3.3189, + "step": 22297 + }, + { + "epoch": 0.13261252259967646, + "grad_norm": 2.1637179851531982, + "learning_rate": 4.7861793193445964e-05, + "loss": 3.3301, + "step": 22298 + }, + { + "epoch": 0.13261846988295747, + "grad_norm": 2.12546443939209, + "learning_rate": 4.7861604178127e-05, + "loss": 3.4002, + "step": 22299 + }, + { + "epoch": 0.13262441716623846, + "grad_norm": 1.632663369178772, + "learning_rate": 4.7861415154827285e-05, + "loss": 5.6516, + "step": 22300 + }, + { + "epoch": 0.13263036444951945, + "grad_norm": 1.6801213026046753, + "learning_rate": 4.786122612354688e-05, + "loss": 5.5013, + "step": 22301 + }, + { + "epoch": 0.13263631173280047, + "grad_norm": 1.5306708812713623, + "learning_rate": 4.7861037084285866e-05, + "loss": 5.6885, + "step": 22302 + }, + { + "epoch": 0.13264225901608145, + "grad_norm": 1.553322196006775, + "learning_rate": 4.7860848037044294e-05, + "loss": 5.499, + "step": 22303 + }, + { + "epoch": 0.13264820629936244, + "grad_norm": 1.5508325099945068, + "learning_rate": 4.7860658981822234e-05, + "loss": 5.522, + "step": 22304 + }, + { + "epoch": 0.13265415358264346, + "grad_norm": 1.4522117376327515, + "learning_rate": 4.786046991861976e-05, + "loss": 5.616, + "step": 22305 + }, + { + "epoch": 0.13266010086592445, + "grad_norm": 1.5596072673797607, + "learning_rate": 4.7860280847436926e-05, + "loss": 5.5323, + "step": 22306 + }, + { + "epoch": 0.13266604814920543, + "grad_norm": 1.8776074647903442, + "learning_rate": 4.7860091768273806e-05, + "loss": 5.4604, + "step": 22307 + }, + { + "epoch": 0.13267199543248645, + "grad_norm": 1.97171151638031, + "learning_rate": 4.785990268113048e-05, + "loss": 5.2305, + "step": 22308 + }, + { + "epoch": 0.13267794271576744, + "grad_norm": 1.35499107837677, + "learning_rate": 4.785971358600698e-05, + "loss": 4.8288, + "step": 22309 + }, + { + "epoch": 0.13268388999904843, + "grad_norm": 1.5026946067810059, + "learning_rate": 4.785952448290339e-05, + "loss": 4.6641, + "step": 22310 + }, + { + "epoch": 0.13268983728232944, + "grad_norm": 1.6728490591049194, + "learning_rate": 4.785933537181978e-05, + "loss": 4.8855, + "step": 22311 + }, + { + "epoch": 0.13269578456561043, + "grad_norm": 1.834144115447998, + "learning_rate": 4.7859146252756213e-05, + "loss": 4.5688, + "step": 22312 + }, + { + "epoch": 0.13270173184889142, + "grad_norm": 2.314073085784912, + "learning_rate": 4.7858957125712753e-05, + "loss": 5.3503, + "step": 22313 + }, + { + "epoch": 0.13270767913217243, + "grad_norm": 1.7270644903182983, + "learning_rate": 4.785876799068947e-05, + "loss": 5.6763, + "step": 22314 + }, + { + "epoch": 0.13271362641545342, + "grad_norm": 1.929304599761963, + "learning_rate": 4.785857884768643e-05, + "loss": 5.1659, + "step": 22315 + }, + { + "epoch": 0.1327195736987344, + "grad_norm": 1.8507132530212402, + "learning_rate": 4.785838969670369e-05, + "loss": 5.0806, + "step": 22316 + }, + { + "epoch": 0.13272552098201543, + "grad_norm": 1.6761378049850464, + "learning_rate": 4.785820053774133e-05, + "loss": 5.2008, + "step": 22317 + }, + { + "epoch": 0.13273146826529642, + "grad_norm": 1.521119475364685, + "learning_rate": 4.785801137079939e-05, + "loss": 5.0448, + "step": 22318 + }, + { + "epoch": 0.1327374155485774, + "grad_norm": 1.6237796545028687, + "learning_rate": 4.785782219587797e-05, + "loss": 5.0451, + "step": 22319 + }, + { + "epoch": 0.13274336283185842, + "grad_norm": 1.4166826009750366, + "learning_rate": 4.785763301297712e-05, + "loss": 5.0055, + "step": 22320 + }, + { + "epoch": 0.1327493101151394, + "grad_norm": 1.7093290090560913, + "learning_rate": 4.7857443822096905e-05, + "loss": 4.9528, + "step": 22321 + }, + { + "epoch": 0.1327552573984204, + "grad_norm": 1.7715668678283691, + "learning_rate": 4.785725462323739e-05, + "loss": 5.1638, + "step": 22322 + }, + { + "epoch": 0.1327612046817014, + "grad_norm": 1.8321062326431274, + "learning_rate": 4.785706541639865e-05, + "loss": 5.1916, + "step": 22323 + }, + { + "epoch": 0.1327671519649824, + "grad_norm": 1.6878079175949097, + "learning_rate": 4.7856876201580736e-05, + "loss": 5.1106, + "step": 22324 + }, + { + "epoch": 0.1327730992482634, + "grad_norm": 1.5275590419769287, + "learning_rate": 4.7856686978783725e-05, + "loss": 5.1073, + "step": 22325 + }, + { + "epoch": 0.1327790465315444, + "grad_norm": 1.6648119688034058, + "learning_rate": 4.7856497748007684e-05, + "loss": 5.3244, + "step": 22326 + }, + { + "epoch": 0.1327849938148254, + "grad_norm": 1.693325400352478, + "learning_rate": 4.7856308509252674e-05, + "loss": 5.596, + "step": 22327 + }, + { + "epoch": 0.13279094109810638, + "grad_norm": 2.6629621982574463, + "learning_rate": 4.785611926251876e-05, + "loss": 4.1305, + "step": 22328 + }, + { + "epoch": 0.1327968883813874, + "grad_norm": 2.4292843341827393, + "learning_rate": 4.785593000780602e-05, + "loss": 4.5656, + "step": 22329 + }, + { + "epoch": 0.13280283566466838, + "grad_norm": 1.5317484140396118, + "learning_rate": 4.78557407451145e-05, + "loss": 5.6828, + "step": 22330 + }, + { + "epoch": 0.13280878294794937, + "grad_norm": 1.59109365940094, + "learning_rate": 4.7855551474444285e-05, + "loss": 5.7914, + "step": 22331 + }, + { + "epoch": 0.13281473023123036, + "grad_norm": 1.359665036201477, + "learning_rate": 4.7855362195795425e-05, + "loss": 5.6294, + "step": 22332 + }, + { + "epoch": 0.13282067751451138, + "grad_norm": 1.327269196510315, + "learning_rate": 4.7855172909168003e-05, + "loss": 5.7178, + "step": 22333 + }, + { + "epoch": 0.13282662479779236, + "grad_norm": 1.4080103635787964, + "learning_rate": 4.785498361456207e-05, + "loss": 5.8786, + "step": 22334 + }, + { + "epoch": 0.13283257208107335, + "grad_norm": 1.393926978111267, + "learning_rate": 4.78547943119777e-05, + "loss": 5.4177, + "step": 22335 + }, + { + "epoch": 0.13283851936435437, + "grad_norm": 1.6050227880477905, + "learning_rate": 4.785460500141495e-05, + "loss": 5.5235, + "step": 22336 + }, + { + "epoch": 0.13284446664763536, + "grad_norm": 1.5462367534637451, + "learning_rate": 4.785441568287391e-05, + "loss": 6.1101, + "step": 22337 + }, + { + "epoch": 0.13285041393091634, + "grad_norm": 1.5062382221221924, + "learning_rate": 4.785422635635462e-05, + "loss": 5.8075, + "step": 22338 + }, + { + "epoch": 0.13285636121419736, + "grad_norm": 1.7419465780258179, + "learning_rate": 4.785403702185716e-05, + "loss": 5.8189, + "step": 22339 + }, + { + "epoch": 0.13286230849747835, + "grad_norm": 1.754164218902588, + "learning_rate": 4.785384767938158e-05, + "loss": 5.6446, + "step": 22340 + }, + { + "epoch": 0.13286825578075934, + "grad_norm": 1.3769707679748535, + "learning_rate": 4.785365832892797e-05, + "loss": 5.7689, + "step": 22341 + }, + { + "epoch": 0.13287420306404035, + "grad_norm": 1.6358861923217773, + "learning_rate": 4.7853468970496386e-05, + "loss": 5.4568, + "step": 22342 + }, + { + "epoch": 0.13288015034732134, + "grad_norm": 1.567083477973938, + "learning_rate": 4.7853279604086883e-05, + "loss": 5.4124, + "step": 22343 + }, + { + "epoch": 0.13288609763060233, + "grad_norm": 1.3793751001358032, + "learning_rate": 4.785309022969954e-05, + "loss": 5.5976, + "step": 22344 + }, + { + "epoch": 0.13289204491388334, + "grad_norm": 1.5371218919754028, + "learning_rate": 4.7852900847334414e-05, + "loss": 5.2898, + "step": 22345 + }, + { + "epoch": 0.13289799219716433, + "grad_norm": 2.1502809524536133, + "learning_rate": 4.785271145699158e-05, + "loss": 4.1536, + "step": 22346 + }, + { + "epoch": 0.13290393948044532, + "grad_norm": 1.9648473262786865, + "learning_rate": 4.785252205867111e-05, + "loss": 4.1755, + "step": 22347 + }, + { + "epoch": 0.13290988676372634, + "grad_norm": 1.874877691268921, + "learning_rate": 4.785233265237305e-05, + "loss": 4.1043, + "step": 22348 + }, + { + "epoch": 0.13291583404700733, + "grad_norm": 1.924109935760498, + "learning_rate": 4.785214323809748e-05, + "loss": 4.0551, + "step": 22349 + }, + { + "epoch": 0.1329217813302883, + "grad_norm": 1.8653898239135742, + "learning_rate": 4.785195381584446e-05, + "loss": 4.0712, + "step": 22350 + }, + { + "epoch": 0.13292772861356933, + "grad_norm": 1.8480240106582642, + "learning_rate": 4.785176438561406e-05, + "loss": 4.0729, + "step": 22351 + }, + { + "epoch": 0.13293367589685032, + "grad_norm": 1.7229113578796387, + "learning_rate": 4.785157494740635e-05, + "loss": 3.9822, + "step": 22352 + }, + { + "epoch": 0.1329396231801313, + "grad_norm": 1.9756056070327759, + "learning_rate": 4.7851385501221385e-05, + "loss": 3.8667, + "step": 22353 + }, + { + "epoch": 0.13294557046341232, + "grad_norm": 1.9121302366256714, + "learning_rate": 4.785119604705924e-05, + "loss": 4.0157, + "step": 22354 + }, + { + "epoch": 0.1329515177466933, + "grad_norm": 1.999444842338562, + "learning_rate": 4.785100658491998e-05, + "loss": 4.0511, + "step": 22355 + }, + { + "epoch": 0.1329574650299743, + "grad_norm": 1.8992079496383667, + "learning_rate": 4.785081711480367e-05, + "loss": 3.9595, + "step": 22356 + }, + { + "epoch": 0.1329634123132553, + "grad_norm": 1.8835148811340332, + "learning_rate": 4.785062763671037e-05, + "loss": 3.9891, + "step": 22357 + }, + { + "epoch": 0.1329693595965363, + "grad_norm": 1.8938409090042114, + "learning_rate": 4.785043815064015e-05, + "loss": 3.927, + "step": 22358 + }, + { + "epoch": 0.1329753068798173, + "grad_norm": 1.8824357986450195, + "learning_rate": 4.785024865659309e-05, + "loss": 4.0438, + "step": 22359 + }, + { + "epoch": 0.1329812541630983, + "grad_norm": 1.9158250093460083, + "learning_rate": 4.785005915456924e-05, + "loss": 4.0448, + "step": 22360 + }, + { + "epoch": 0.1329872014463793, + "grad_norm": 1.7421679496765137, + "learning_rate": 4.784986964456867e-05, + "loss": 3.9869, + "step": 22361 + }, + { + "epoch": 0.13299314872966028, + "grad_norm": 1.7917057275772095, + "learning_rate": 4.784968012659145e-05, + "loss": 3.9976, + "step": 22362 + }, + { + "epoch": 0.1329990960129413, + "grad_norm": 1.9387284517288208, + "learning_rate": 4.784949060063764e-05, + "loss": 4.3383, + "step": 22363 + }, + { + "epoch": 0.1330050432962223, + "grad_norm": 2.60548996925354, + "learning_rate": 4.78493010667073e-05, + "loss": 4.5527, + "step": 22364 + }, + { + "epoch": 0.13301099057950327, + "grad_norm": 2.440361976623535, + "learning_rate": 4.784911152480051e-05, + "loss": 4.7931, + "step": 22365 + }, + { + "epoch": 0.1330169378627843, + "grad_norm": 2.4233226776123047, + "learning_rate": 4.784892197491734e-05, + "loss": 4.5482, + "step": 22366 + }, + { + "epoch": 0.13302288514606528, + "grad_norm": 2.3421928882598877, + "learning_rate": 4.7848732417057836e-05, + "loss": 4.6708, + "step": 22367 + }, + { + "epoch": 0.13302883242934627, + "grad_norm": 1.9476850032806396, + "learning_rate": 4.784854285122208e-05, + "loss": 4.5518, + "step": 22368 + }, + { + "epoch": 0.13303477971262728, + "grad_norm": 2.015965223312378, + "learning_rate": 4.784835327741013e-05, + "loss": 4.5258, + "step": 22369 + }, + { + "epoch": 0.13304072699590827, + "grad_norm": 2.28434157371521, + "learning_rate": 4.784816369562206e-05, + "loss": 4.6413, + "step": 22370 + }, + { + "epoch": 0.13304667427918926, + "grad_norm": 1.9141323566436768, + "learning_rate": 4.784797410585794e-05, + "loss": 4.7134, + "step": 22371 + }, + { + "epoch": 0.13305262156247027, + "grad_norm": 2.2627341747283936, + "learning_rate": 4.7847784508117815e-05, + "loss": 4.512, + "step": 22372 + }, + { + "epoch": 0.13305856884575126, + "grad_norm": 2.2111268043518066, + "learning_rate": 4.784759490240177e-05, + "loss": 4.6105, + "step": 22373 + }, + { + "epoch": 0.13306451612903225, + "grad_norm": 2.4321610927581787, + "learning_rate": 4.7847405288709864e-05, + "loss": 5.1333, + "step": 22374 + }, + { + "epoch": 0.13307046341231327, + "grad_norm": 2.49605131149292, + "learning_rate": 4.7847215667042165e-05, + "loss": 5.2355, + "step": 22375 + }, + { + "epoch": 0.13307641069559425, + "grad_norm": 2.2517080307006836, + "learning_rate": 4.784702603739874e-05, + "loss": 5.3007, + "step": 22376 + }, + { + "epoch": 0.13308235797887524, + "grad_norm": 1.807502269744873, + "learning_rate": 4.784683639977966e-05, + "loss": 5.2645, + "step": 22377 + }, + { + "epoch": 0.13308830526215626, + "grad_norm": 1.9133596420288086, + "learning_rate": 4.784664675418497e-05, + "loss": 5.3313, + "step": 22378 + }, + { + "epoch": 0.13309425254543725, + "grad_norm": 1.823691725730896, + "learning_rate": 4.7846457100614774e-05, + "loss": 5.5637, + "step": 22379 + }, + { + "epoch": 0.13310019982871824, + "grad_norm": 1.769579291343689, + "learning_rate": 4.78462674390691e-05, + "loss": 5.3217, + "step": 22380 + }, + { + "epoch": 0.13310614711199925, + "grad_norm": 1.576685905456543, + "learning_rate": 4.784607776954804e-05, + "loss": 5.5387, + "step": 22381 + }, + { + "epoch": 0.13311209439528024, + "grad_norm": 1.5737719535827637, + "learning_rate": 4.784588809205164e-05, + "loss": 5.269, + "step": 22382 + }, + { + "epoch": 0.13311804167856123, + "grad_norm": 1.6323963403701782, + "learning_rate": 4.784569840657998e-05, + "loss": 5.156, + "step": 22383 + }, + { + "epoch": 0.13312398896184224, + "grad_norm": 2.5943386554718018, + "learning_rate": 4.784550871313312e-05, + "loss": 5.0882, + "step": 22384 + }, + { + "epoch": 0.13312993624512323, + "grad_norm": 1.5392063856124878, + "learning_rate": 4.784531901171113e-05, + "loss": 5.0303, + "step": 22385 + }, + { + "epoch": 0.13313588352840422, + "grad_norm": 1.7257198095321655, + "learning_rate": 4.784512930231408e-05, + "loss": 5.3784, + "step": 22386 + }, + { + "epoch": 0.13314183081168524, + "grad_norm": 1.7736787796020508, + "learning_rate": 4.784493958494203e-05, + "loss": 5.256, + "step": 22387 + }, + { + "epoch": 0.13314777809496622, + "grad_norm": 1.575386643409729, + "learning_rate": 4.784474985959505e-05, + "loss": 5.1247, + "step": 22388 + }, + { + "epoch": 0.1331537253782472, + "grad_norm": 1.6164257526397705, + "learning_rate": 4.7844560126273195e-05, + "loss": 5.553, + "step": 22389 + }, + { + "epoch": 0.13315967266152823, + "grad_norm": 1.515674114227295, + "learning_rate": 4.7844370384976546e-05, + "loss": 5.556, + "step": 22390 + }, + { + "epoch": 0.13316561994480922, + "grad_norm": 1.5831459760665894, + "learning_rate": 4.784418063570516e-05, + "loss": 5.2649, + "step": 22391 + }, + { + "epoch": 0.1331715672280902, + "grad_norm": 1.5372157096862793, + "learning_rate": 4.7843990878459114e-05, + "loss": 5.1961, + "step": 22392 + }, + { + "epoch": 0.1331775145113712, + "grad_norm": 1.5881307125091553, + "learning_rate": 4.784380111323846e-05, + "loss": 5.5521, + "step": 22393 + }, + { + "epoch": 0.1331834617946522, + "grad_norm": 1.7717739343643188, + "learning_rate": 4.784361134004327e-05, + "loss": 5.4407, + "step": 22394 + }, + { + "epoch": 0.1331894090779332, + "grad_norm": 1.7472600936889648, + "learning_rate": 4.784342155887362e-05, + "loss": 5.1055, + "step": 22395 + }, + { + "epoch": 0.13319535636121418, + "grad_norm": 1.8296018838882446, + "learning_rate": 4.784323176972956e-05, + "loss": 4.596, + "step": 22396 + }, + { + "epoch": 0.1332013036444952, + "grad_norm": 1.6303856372833252, + "learning_rate": 4.784304197261117e-05, + "loss": 5.4028, + "step": 22397 + }, + { + "epoch": 0.1332072509277762, + "grad_norm": 1.4000413417816162, + "learning_rate": 4.78428521675185e-05, + "loss": 5.8166, + "step": 22398 + }, + { + "epoch": 0.13321319821105718, + "grad_norm": 1.4396088123321533, + "learning_rate": 4.7842662354451634e-05, + "loss": 5.4439, + "step": 22399 + }, + { + "epoch": 0.1332191454943382, + "grad_norm": 1.580919623374939, + "learning_rate": 4.7842472533410635e-05, + "loss": 5.3089, + "step": 22400 + }, + { + "epoch": 0.13322509277761918, + "grad_norm": 1.7976210117340088, + "learning_rate": 4.7842282704395545e-05, + "loss": 5.1538, + "step": 22401 + }, + { + "epoch": 0.13323104006090017, + "grad_norm": 1.7573418617248535, + "learning_rate": 4.784209286740647e-05, + "loss": 5.3701, + "step": 22402 + }, + { + "epoch": 0.13323698734418118, + "grad_norm": 1.6944206953048706, + "learning_rate": 4.784190302244345e-05, + "loss": 4.8349, + "step": 22403 + }, + { + "epoch": 0.13324293462746217, + "grad_norm": 1.9255948066711426, + "learning_rate": 4.7841713169506555e-05, + "loss": 5.2077, + "step": 22404 + }, + { + "epoch": 0.13324888191074316, + "grad_norm": 1.7583602666854858, + "learning_rate": 4.784152330859586e-05, + "loss": 4.9968, + "step": 22405 + }, + { + "epoch": 0.13325482919402418, + "grad_norm": 1.6917812824249268, + "learning_rate": 4.784133343971142e-05, + "loss": 5.3295, + "step": 22406 + }, + { + "epoch": 0.13326077647730517, + "grad_norm": 1.5531493425369263, + "learning_rate": 4.784114356285331e-05, + "loss": 5.2978, + "step": 22407 + }, + { + "epoch": 0.13326672376058615, + "grad_norm": 1.5347543954849243, + "learning_rate": 4.7840953678021586e-05, + "loss": 5.2922, + "step": 22408 + }, + { + "epoch": 0.13327267104386717, + "grad_norm": 1.3059866428375244, + "learning_rate": 4.7840763785216323e-05, + "loss": 5.2255, + "step": 22409 + }, + { + "epoch": 0.13327861832714816, + "grad_norm": 1.2207573652267456, + "learning_rate": 4.784057388443759e-05, + "loss": 4.9595, + "step": 22410 + }, + { + "epoch": 0.13328456561042915, + "grad_norm": 1.9115726947784424, + "learning_rate": 4.784038397568545e-05, + "loss": 5.0465, + "step": 22411 + }, + { + "epoch": 0.13329051289371016, + "grad_norm": 1.907443642616272, + "learning_rate": 4.7840194058959965e-05, + "loss": 4.5429, + "step": 22412 + }, + { + "epoch": 0.13329646017699115, + "grad_norm": 1.7891590595245361, + "learning_rate": 4.78400041342612e-05, + "loss": 4.5718, + "step": 22413 + }, + { + "epoch": 0.13330240746027214, + "grad_norm": 1.7904539108276367, + "learning_rate": 4.7839814201589234e-05, + "loss": 4.7077, + "step": 22414 + }, + { + "epoch": 0.13330835474355315, + "grad_norm": 1.8562805652618408, + "learning_rate": 4.783962426094411e-05, + "loss": 4.8559, + "step": 22415 + }, + { + "epoch": 0.13331430202683414, + "grad_norm": 1.7840648889541626, + "learning_rate": 4.7839434312325924e-05, + "loss": 4.5559, + "step": 22416 + }, + { + "epoch": 0.13332024931011513, + "grad_norm": 1.8956695795059204, + "learning_rate": 4.783924435573472e-05, + "loss": 4.6933, + "step": 22417 + }, + { + "epoch": 0.13332619659339615, + "grad_norm": 1.798685073852539, + "learning_rate": 4.783905439117058e-05, + "loss": 4.5131, + "step": 22418 + }, + { + "epoch": 0.13333214387667713, + "grad_norm": 1.8377288579940796, + "learning_rate": 4.7838864418633554e-05, + "loss": 4.4986, + "step": 22419 + }, + { + "epoch": 0.13333809115995812, + "grad_norm": 1.8382439613342285, + "learning_rate": 4.783867443812372e-05, + "loss": 5.1565, + "step": 22420 + }, + { + "epoch": 0.13334403844323914, + "grad_norm": 2.030796766281128, + "learning_rate": 4.783848444964114e-05, + "loss": 5.4532, + "step": 22421 + }, + { + "epoch": 0.13334998572652013, + "grad_norm": 2.020561695098877, + "learning_rate": 4.7838294453185886e-05, + "loss": 5.4529, + "step": 22422 + }, + { + "epoch": 0.13335593300980111, + "grad_norm": 1.8092904090881348, + "learning_rate": 4.783810444875801e-05, + "loss": 5.4092, + "step": 22423 + }, + { + "epoch": 0.13336188029308213, + "grad_norm": 1.7571618556976318, + "learning_rate": 4.78379144363576e-05, + "loss": 5.5134, + "step": 22424 + }, + { + "epoch": 0.13336782757636312, + "grad_norm": 1.8572049140930176, + "learning_rate": 4.7837724415984694e-05, + "loss": 5.1786, + "step": 22425 + }, + { + "epoch": 0.1333737748596441, + "grad_norm": 2.3944039344787598, + "learning_rate": 4.783753438763938e-05, + "loss": 4.7667, + "step": 22426 + }, + { + "epoch": 0.13337972214292512, + "grad_norm": 1.9377988576889038, + "learning_rate": 4.7837344351321725e-05, + "loss": 5.6523, + "step": 22427 + }, + { + "epoch": 0.1333856694262061, + "grad_norm": 1.7981183528900146, + "learning_rate": 4.783715430703178e-05, + "loss": 5.5374, + "step": 22428 + }, + { + "epoch": 0.1333916167094871, + "grad_norm": 1.6658248901367188, + "learning_rate": 4.783696425476963e-05, + "loss": 5.5128, + "step": 22429 + }, + { + "epoch": 0.13339756399276811, + "grad_norm": 1.6594502925872803, + "learning_rate": 4.783677419453533e-05, + "loss": 5.5225, + "step": 22430 + }, + { + "epoch": 0.1334035112760491, + "grad_norm": 1.6250741481781006, + "learning_rate": 4.7836584126328945e-05, + "loss": 5.4027, + "step": 22431 + }, + { + "epoch": 0.1334094585593301, + "grad_norm": 1.633254885673523, + "learning_rate": 4.783639405015054e-05, + "loss": 5.3856, + "step": 22432 + }, + { + "epoch": 0.1334154058426111, + "grad_norm": 1.5948752164840698, + "learning_rate": 4.783620396600019e-05, + "loss": 5.5501, + "step": 22433 + }, + { + "epoch": 0.1334213531258921, + "grad_norm": 2.007847547531128, + "learning_rate": 4.783601387387796e-05, + "loss": 4.878, + "step": 22434 + }, + { + "epoch": 0.13342730040917308, + "grad_norm": 2.4036359786987305, + "learning_rate": 4.783582377378391e-05, + "loss": 3.8348, + "step": 22435 + }, + { + "epoch": 0.1334332476924541, + "grad_norm": 2.7686264514923096, + "learning_rate": 4.783563366571811e-05, + "loss": 3.13, + "step": 22436 + }, + { + "epoch": 0.1334391949757351, + "grad_norm": 2.4651095867156982, + "learning_rate": 4.7835443549680625e-05, + "loss": 2.9104, + "step": 22437 + }, + { + "epoch": 0.13344514225901608, + "grad_norm": 2.57837176322937, + "learning_rate": 4.7835253425671526e-05, + "loss": 3.1145, + "step": 22438 + }, + { + "epoch": 0.1334510895422971, + "grad_norm": 2.804194688796997, + "learning_rate": 4.783506329369087e-05, + "loss": 3.7685, + "step": 22439 + }, + { + "epoch": 0.13345703682557808, + "grad_norm": 2.5836985111236572, + "learning_rate": 4.783487315373874e-05, + "loss": 3.383, + "step": 22440 + }, + { + "epoch": 0.13346298410885907, + "grad_norm": 2.5800416469573975, + "learning_rate": 4.7834683005815184e-05, + "loss": 3.345, + "step": 22441 + }, + { + "epoch": 0.13346893139214008, + "grad_norm": 2.695234775543213, + "learning_rate": 4.7834492849920275e-05, + "loss": 3.7905, + "step": 22442 + }, + { + "epoch": 0.13347487867542107, + "grad_norm": 2.075918436050415, + "learning_rate": 4.783430268605409e-05, + "loss": 4.3114, + "step": 22443 + }, + { + "epoch": 0.13348082595870206, + "grad_norm": 2.221691131591797, + "learning_rate": 4.7834112514216676e-05, + "loss": 5.5658, + "step": 22444 + }, + { + "epoch": 0.13348677324198308, + "grad_norm": 1.9432377815246582, + "learning_rate": 4.783392233440811e-05, + "loss": 5.2566, + "step": 22445 + }, + { + "epoch": 0.13349272052526406, + "grad_norm": 1.9735411405563354, + "learning_rate": 4.783373214662846e-05, + "loss": 4.2656, + "step": 22446 + }, + { + "epoch": 0.13349866780854505, + "grad_norm": 1.8616423606872559, + "learning_rate": 4.783354195087779e-05, + "loss": 4.2018, + "step": 22447 + }, + { + "epoch": 0.13350461509182607, + "grad_norm": 1.9751770496368408, + "learning_rate": 4.783335174715617e-05, + "loss": 4.1716, + "step": 22448 + }, + { + "epoch": 0.13351056237510706, + "grad_norm": 2.053149461746216, + "learning_rate": 4.7833161535463656e-05, + "loss": 4.0603, + "step": 22449 + }, + { + "epoch": 0.13351650965838804, + "grad_norm": 1.8129456043243408, + "learning_rate": 4.7832971315800325e-05, + "loss": 4.098, + "step": 22450 + }, + { + "epoch": 0.13352245694166903, + "grad_norm": 1.8842658996582031, + "learning_rate": 4.783278108816624e-05, + "loss": 4.1225, + "step": 22451 + }, + { + "epoch": 0.13352840422495005, + "grad_norm": 1.9037132263183594, + "learning_rate": 4.783259085256146e-05, + "loss": 4.0953, + "step": 22452 + }, + { + "epoch": 0.13353435150823104, + "grad_norm": 1.8058161735534668, + "learning_rate": 4.7832400608986074e-05, + "loss": 3.9189, + "step": 22453 + }, + { + "epoch": 0.13354029879151202, + "grad_norm": 1.899573564529419, + "learning_rate": 4.7832210357440124e-05, + "loss": 4.063, + "step": 22454 + }, + { + "epoch": 0.13354624607479304, + "grad_norm": 1.8507969379425049, + "learning_rate": 4.783202009792368e-05, + "loss": 4.1139, + "step": 22455 + }, + { + "epoch": 0.13355219335807403, + "grad_norm": 1.861315369606018, + "learning_rate": 4.783182983043681e-05, + "loss": 4.1063, + "step": 22456 + }, + { + "epoch": 0.13355814064135502, + "grad_norm": 1.9481399059295654, + "learning_rate": 4.7831639554979603e-05, + "loss": 4.1103, + "step": 22457 + }, + { + "epoch": 0.13356408792463603, + "grad_norm": 1.9315237998962402, + "learning_rate": 4.7831449271552086e-05, + "loss": 4.0723, + "step": 22458 + }, + { + "epoch": 0.13357003520791702, + "grad_norm": 1.951989769935608, + "learning_rate": 4.783125898015436e-05, + "loss": 4.3063, + "step": 22459 + }, + { + "epoch": 0.133575982491198, + "grad_norm": 1.8107032775878906, + "learning_rate": 4.783106868078647e-05, + "loss": 4.1869, + "step": 22460 + }, + { + "epoch": 0.13358192977447902, + "grad_norm": 1.8079946041107178, + "learning_rate": 4.7830878373448495e-05, + "loss": 4.2569, + "step": 22461 + }, + { + "epoch": 0.13358787705776, + "grad_norm": 1.9094295501708984, + "learning_rate": 4.7830688058140494e-05, + "loss": 4.8144, + "step": 22462 + }, + { + "epoch": 0.133593824341041, + "grad_norm": 1.9410862922668457, + "learning_rate": 4.7830497734862536e-05, + "loss": 4.6606, + "step": 22463 + }, + { + "epoch": 0.13359977162432202, + "grad_norm": 1.832387089729309, + "learning_rate": 4.783030740361469e-05, + "loss": 4.774, + "step": 22464 + }, + { + "epoch": 0.133605718907603, + "grad_norm": 1.8661162853240967, + "learning_rate": 4.783011706439701e-05, + "loss": 5.0414, + "step": 22465 + }, + { + "epoch": 0.133611666190884, + "grad_norm": 1.6019399166107178, + "learning_rate": 4.782992671720958e-05, + "loss": 5.1333, + "step": 22466 + }, + { + "epoch": 0.133617613474165, + "grad_norm": 1.539556860923767, + "learning_rate": 4.7829736362052455e-05, + "loss": 5.5576, + "step": 22467 + }, + { + "epoch": 0.133623560757446, + "grad_norm": 1.6988813877105713, + "learning_rate": 4.7829545998925704e-05, + "loss": 5.5953, + "step": 22468 + }, + { + "epoch": 0.13362950804072699, + "grad_norm": 1.77605140209198, + "learning_rate": 4.78293556278294e-05, + "loss": 5.1917, + "step": 22469 + }, + { + "epoch": 0.133635455324008, + "grad_norm": 1.958486557006836, + "learning_rate": 4.78291652487636e-05, + "loss": 5.141, + "step": 22470 + }, + { + "epoch": 0.133641402607289, + "grad_norm": 1.4875729084014893, + "learning_rate": 4.7828974861728374e-05, + "loss": 5.551, + "step": 22471 + }, + { + "epoch": 0.13364734989056998, + "grad_norm": 1.5118046998977661, + "learning_rate": 4.7828784466723795e-05, + "loss": 5.8965, + "step": 22472 + }, + { + "epoch": 0.133653297173851, + "grad_norm": 1.7107024192810059, + "learning_rate": 4.7828594063749924e-05, + "loss": 5.444, + "step": 22473 + }, + { + "epoch": 0.13365924445713198, + "grad_norm": 2.211569309234619, + "learning_rate": 4.7828403652806814e-05, + "loss": 4.6709, + "step": 22474 + }, + { + "epoch": 0.13366519174041297, + "grad_norm": 1.5755807161331177, + "learning_rate": 4.782821323389455e-05, + "loss": 5.481, + "step": 22475 + }, + { + "epoch": 0.13367113902369399, + "grad_norm": 1.5715577602386475, + "learning_rate": 4.782802280701319e-05, + "loss": 5.4475, + "step": 22476 + }, + { + "epoch": 0.13367708630697497, + "grad_norm": 1.483229160308838, + "learning_rate": 4.782783237216281e-05, + "loss": 5.287, + "step": 22477 + }, + { + "epoch": 0.13368303359025596, + "grad_norm": 1.6031765937805176, + "learning_rate": 4.782764192934347e-05, + "loss": 4.9328, + "step": 22478 + }, + { + "epoch": 0.13368898087353698, + "grad_norm": 1.5472909212112427, + "learning_rate": 4.782745147855523e-05, + "loss": 5.4962, + "step": 22479 + }, + { + "epoch": 0.13369492815681797, + "grad_norm": 1.5153834819793701, + "learning_rate": 4.7827261019798164e-05, + "loss": 5.2488, + "step": 22480 + }, + { + "epoch": 0.13370087544009895, + "grad_norm": 1.8485814332962036, + "learning_rate": 4.782707055307233e-05, + "loss": 4.6998, + "step": 22481 + }, + { + "epoch": 0.13370682272337997, + "grad_norm": 1.6526838541030884, + "learning_rate": 4.782688007837781e-05, + "loss": 4.7843, + "step": 22482 + }, + { + "epoch": 0.13371277000666096, + "grad_norm": 1.6769697666168213, + "learning_rate": 4.782668959571467e-05, + "loss": 4.8344, + "step": 22483 + }, + { + "epoch": 0.13371871728994195, + "grad_norm": 1.6509302854537964, + "learning_rate": 4.782649910508296e-05, + "loss": 5.0646, + "step": 22484 + }, + { + "epoch": 0.13372466457322296, + "grad_norm": 1.58712637424469, + "learning_rate": 4.782630860648275e-05, + "loss": 4.841, + "step": 22485 + }, + { + "epoch": 0.13373061185650395, + "grad_norm": 1.7171813249588013, + "learning_rate": 4.782611809991412e-05, + "loss": 5.5934, + "step": 22486 + }, + { + "epoch": 0.13373655913978494, + "grad_norm": 1.598689079284668, + "learning_rate": 4.782592758537712e-05, + "loss": 5.5131, + "step": 22487 + }, + { + "epoch": 0.13374250642306595, + "grad_norm": 1.652279019355774, + "learning_rate": 4.782573706287183e-05, + "loss": 4.9244, + "step": 22488 + }, + { + "epoch": 0.13374845370634694, + "grad_norm": 1.733337163925171, + "learning_rate": 4.782554653239831e-05, + "loss": 5.1153, + "step": 22489 + }, + { + "epoch": 0.13375440098962793, + "grad_norm": 1.3961280584335327, + "learning_rate": 4.782535599395662e-05, + "loss": 5.1146, + "step": 22490 + }, + { + "epoch": 0.13376034827290895, + "grad_norm": 1.371650218963623, + "learning_rate": 4.782516544754685e-05, + "loss": 4.9608, + "step": 22491 + }, + { + "epoch": 0.13376629555618993, + "grad_norm": 1.738678216934204, + "learning_rate": 4.782497489316904e-05, + "loss": 5.384, + "step": 22492 + }, + { + "epoch": 0.13377224283947092, + "grad_norm": 1.899530291557312, + "learning_rate": 4.7824784330823266e-05, + "loss": 5.479, + "step": 22493 + }, + { + "epoch": 0.13377819012275194, + "grad_norm": 1.6108837127685547, + "learning_rate": 4.782459376050959e-05, + "loss": 5.4919, + "step": 22494 + }, + { + "epoch": 0.13378413740603293, + "grad_norm": 1.688045859336853, + "learning_rate": 4.78244031822281e-05, + "loss": 5.5093, + "step": 22495 + }, + { + "epoch": 0.13379008468931392, + "grad_norm": 1.526538610458374, + "learning_rate": 4.782421259597884e-05, + "loss": 5.4022, + "step": 22496 + }, + { + "epoch": 0.13379603197259493, + "grad_norm": 1.5651198625564575, + "learning_rate": 4.7824022001761884e-05, + "loss": 5.3737, + "step": 22497 + }, + { + "epoch": 0.13380197925587592, + "grad_norm": 1.6090896129608154, + "learning_rate": 4.7823831399577296e-05, + "loss": 5.3482, + "step": 22498 + }, + { + "epoch": 0.1338079265391569, + "grad_norm": 1.5139176845550537, + "learning_rate": 4.782364078942514e-05, + "loss": 5.2195, + "step": 22499 + }, + { + "epoch": 0.13381387382243792, + "grad_norm": 1.468328833580017, + "learning_rate": 4.782345017130549e-05, + "loss": 5.4421, + "step": 22500 + }, + { + "epoch": 0.1338198211057189, + "grad_norm": 1.4803540706634521, + "learning_rate": 4.782325954521841e-05, + "loss": 5.8645, + "step": 22501 + }, + { + "epoch": 0.1338257683889999, + "grad_norm": 1.5472211837768555, + "learning_rate": 4.782306891116397e-05, + "loss": 5.5739, + "step": 22502 + }, + { + "epoch": 0.13383171567228092, + "grad_norm": 1.5523242950439453, + "learning_rate": 4.782287826914223e-05, + "loss": 5.4971, + "step": 22503 + }, + { + "epoch": 0.1338376629555619, + "grad_norm": 1.6459407806396484, + "learning_rate": 4.7822687619153264e-05, + "loss": 5.5006, + "step": 22504 + }, + { + "epoch": 0.1338436102388429, + "grad_norm": 1.9664801359176636, + "learning_rate": 4.782249696119712e-05, + "loss": 4.908, + "step": 22505 + }, + { + "epoch": 0.1338495575221239, + "grad_norm": 1.757797360420227, + "learning_rate": 4.782230629527389e-05, + "loss": 5.3259, + "step": 22506 + }, + { + "epoch": 0.1338555048054049, + "grad_norm": 1.734212040901184, + "learning_rate": 4.7822115621383626e-05, + "loss": 4.9526, + "step": 22507 + }, + { + "epoch": 0.13386145208868588, + "grad_norm": 1.7347631454467773, + "learning_rate": 4.7821924939526386e-05, + "loss": 4.9416, + "step": 22508 + }, + { + "epoch": 0.13386739937196687, + "grad_norm": 1.6283304691314697, + "learning_rate": 4.782173424970226e-05, + "loss": 5.1706, + "step": 22509 + }, + { + "epoch": 0.1338733466552479, + "grad_norm": 1.6665587425231934, + "learning_rate": 4.7821543551911294e-05, + "loss": 5.6977, + "step": 22510 + }, + { + "epoch": 0.13387929393852888, + "grad_norm": 1.5051319599151611, + "learning_rate": 4.7821352846153576e-05, + "loss": 5.7575, + "step": 22511 + }, + { + "epoch": 0.13388524122180986, + "grad_norm": 1.966944932937622, + "learning_rate": 4.7821162132429154e-05, + "loss": 4.8996, + "step": 22512 + }, + { + "epoch": 0.13389118850509088, + "grad_norm": 2.669949769973755, + "learning_rate": 4.782097141073809e-05, + "loss": 3.7917, + "step": 22513 + }, + { + "epoch": 0.13389713578837187, + "grad_norm": 2.743389844894409, + "learning_rate": 4.782078068108048e-05, + "loss": 3.658, + "step": 22514 + }, + { + "epoch": 0.13390308307165286, + "grad_norm": 2.8011279106140137, + "learning_rate": 4.782058994345635e-05, + "loss": 3.4269, + "step": 22515 + }, + { + "epoch": 0.13390903035493387, + "grad_norm": 2.332318067550659, + "learning_rate": 4.78203991978658e-05, + "loss": 3.7318, + "step": 22516 + }, + { + "epoch": 0.13391497763821486, + "grad_norm": 2.1522371768951416, + "learning_rate": 4.782020844430888e-05, + "loss": 3.912, + "step": 22517 + }, + { + "epoch": 0.13392092492149585, + "grad_norm": 1.7325389385223389, + "learning_rate": 4.782001768278567e-05, + "loss": 5.2602, + "step": 22518 + }, + { + "epoch": 0.13392687220477686, + "grad_norm": 1.872207522392273, + "learning_rate": 4.7819826913296216e-05, + "loss": 5.3663, + "step": 22519 + }, + { + "epoch": 0.13393281948805785, + "grad_norm": 1.86244535446167, + "learning_rate": 4.78196361358406e-05, + "loss": 5.382, + "step": 22520 + }, + { + "epoch": 0.13393876677133884, + "grad_norm": 1.6984341144561768, + "learning_rate": 4.781944535041889e-05, + "loss": 5.2243, + "step": 22521 + }, + { + "epoch": 0.13394471405461986, + "grad_norm": 1.7697153091430664, + "learning_rate": 4.781925455703114e-05, + "loss": 5.2368, + "step": 22522 + }, + { + "epoch": 0.13395066133790084, + "grad_norm": 2.323636293411255, + "learning_rate": 4.781906375567743e-05, + "loss": 4.7709, + "step": 22523 + }, + { + "epoch": 0.13395660862118183, + "grad_norm": 2.2196481227874756, + "learning_rate": 4.781887294635782e-05, + "loss": 4.8089, + "step": 22524 + }, + { + "epoch": 0.13396255590446285, + "grad_norm": 1.8148611783981323, + "learning_rate": 4.7818682129072365e-05, + "loss": 4.673, + "step": 22525 + }, + { + "epoch": 0.13396850318774384, + "grad_norm": 1.9306626319885254, + "learning_rate": 4.7818491303821155e-05, + "loss": 5.3217, + "step": 22526 + }, + { + "epoch": 0.13397445047102483, + "grad_norm": 1.9646215438842773, + "learning_rate": 4.781830047060425e-05, + "loss": 4.9239, + "step": 22527 + }, + { + "epoch": 0.13398039775430584, + "grad_norm": 1.7711313962936401, + "learning_rate": 4.7818109629421706e-05, + "loss": 5.1977, + "step": 22528 + }, + { + "epoch": 0.13398634503758683, + "grad_norm": 1.5714713335037231, + "learning_rate": 4.781791878027359e-05, + "loss": 5.1759, + "step": 22529 + }, + { + "epoch": 0.13399229232086782, + "grad_norm": 1.573440670967102, + "learning_rate": 4.781772792315998e-05, + "loss": 5.2892, + "step": 22530 + }, + { + "epoch": 0.13399823960414883, + "grad_norm": 1.484643816947937, + "learning_rate": 4.781753705808094e-05, + "loss": 5.2751, + "step": 22531 + }, + { + "epoch": 0.13400418688742982, + "grad_norm": 1.484236240386963, + "learning_rate": 4.781734618503653e-05, + "loss": 5.1928, + "step": 22532 + }, + { + "epoch": 0.1340101341707108, + "grad_norm": 1.6469415426254272, + "learning_rate": 4.781715530402682e-05, + "loss": 4.9161, + "step": 22533 + }, + { + "epoch": 0.13401608145399183, + "grad_norm": 1.736928939819336, + "learning_rate": 4.781696441505188e-05, + "loss": 5.2132, + "step": 22534 + }, + { + "epoch": 0.1340220287372728, + "grad_norm": 1.6927560567855835, + "learning_rate": 4.781677351811177e-05, + "loss": 5.1001, + "step": 22535 + }, + { + "epoch": 0.1340279760205538, + "grad_norm": 1.4961135387420654, + "learning_rate": 4.7816582613206564e-05, + "loss": 4.8025, + "step": 22536 + }, + { + "epoch": 0.13403392330383482, + "grad_norm": 1.6069209575653076, + "learning_rate": 4.7816391700336315e-05, + "loss": 5.1449, + "step": 22537 + }, + { + "epoch": 0.1340398705871158, + "grad_norm": 1.9168766736984253, + "learning_rate": 4.781620077950111e-05, + "loss": 5.1479, + "step": 22538 + }, + { + "epoch": 0.1340458178703968, + "grad_norm": 1.545693278312683, + "learning_rate": 4.7816009850701e-05, + "loss": 5.1445, + "step": 22539 + }, + { + "epoch": 0.1340517651536778, + "grad_norm": 2.524106740951538, + "learning_rate": 4.781581891393606e-05, + "loss": 4.3988, + "step": 22540 + }, + { + "epoch": 0.1340577124369588, + "grad_norm": 3.073733329772949, + "learning_rate": 4.781562796920635e-05, + "loss": 4.8931, + "step": 22541 + }, + { + "epoch": 0.1340636597202398, + "grad_norm": 2.1566405296325684, + "learning_rate": 4.7815437016511936e-05, + "loss": 4.9778, + "step": 22542 + }, + { + "epoch": 0.1340696070035208, + "grad_norm": 1.6103532314300537, + "learning_rate": 4.78152460558529e-05, + "loss": 5.0521, + "step": 22543 + }, + { + "epoch": 0.1340755542868018, + "grad_norm": 2.068673849105835, + "learning_rate": 4.781505508722929e-05, + "loss": 4.4481, + "step": 22544 + }, + { + "epoch": 0.13408150157008278, + "grad_norm": 2.2658448219299316, + "learning_rate": 4.7814864110641175e-05, + "loss": 4.5904, + "step": 22545 + }, + { + "epoch": 0.1340874488533638, + "grad_norm": 1.6960278749465942, + "learning_rate": 4.781467312608864e-05, + "loss": 5.4661, + "step": 22546 + }, + { + "epoch": 0.13409339613664478, + "grad_norm": 1.7006616592407227, + "learning_rate": 4.781448213357173e-05, + "loss": 5.338, + "step": 22547 + }, + { + "epoch": 0.13409934341992577, + "grad_norm": 1.6810702085494995, + "learning_rate": 4.7814291133090515e-05, + "loss": 5.6328, + "step": 22548 + }, + { + "epoch": 0.1341052907032068, + "grad_norm": 1.788943886756897, + "learning_rate": 4.781410012464508e-05, + "loss": 4.7265, + "step": 22549 + }, + { + "epoch": 0.13411123798648777, + "grad_norm": 1.8539581298828125, + "learning_rate": 4.781390910823547e-05, + "loss": 5.0821, + "step": 22550 + }, + { + "epoch": 0.13411718526976876, + "grad_norm": 1.548677682876587, + "learning_rate": 4.781371808386176e-05, + "loss": 5.4704, + "step": 22551 + }, + { + "epoch": 0.13412313255304978, + "grad_norm": 1.4806692600250244, + "learning_rate": 4.781352705152402e-05, + "loss": 5.5085, + "step": 22552 + }, + { + "epoch": 0.13412907983633077, + "grad_norm": 1.5281784534454346, + "learning_rate": 4.781333601122231e-05, + "loss": 5.0698, + "step": 22553 + }, + { + "epoch": 0.13413502711961175, + "grad_norm": 1.681803822517395, + "learning_rate": 4.78131449629567e-05, + "loss": 4.6259, + "step": 22554 + }, + { + "epoch": 0.13414097440289277, + "grad_norm": 1.9039119482040405, + "learning_rate": 4.781295390672726e-05, + "loss": 4.967, + "step": 22555 + }, + { + "epoch": 0.13414692168617376, + "grad_norm": 1.3885890245437622, + "learning_rate": 4.781276284253405e-05, + "loss": 4.5992, + "step": 22556 + }, + { + "epoch": 0.13415286896945475, + "grad_norm": 1.5828464031219482, + "learning_rate": 4.781257177037714e-05, + "loss": 4.6859, + "step": 22557 + }, + { + "epoch": 0.13415881625273576, + "grad_norm": 1.6242060661315918, + "learning_rate": 4.78123806902566e-05, + "loss": 4.7105, + "step": 22558 + }, + { + "epoch": 0.13416476353601675, + "grad_norm": 1.6682454347610474, + "learning_rate": 4.781218960217249e-05, + "loss": 4.8545, + "step": 22559 + }, + { + "epoch": 0.13417071081929774, + "grad_norm": 1.8982216119766235, + "learning_rate": 4.781199850612489e-05, + "loss": 5.4946, + "step": 22560 + }, + { + "epoch": 0.13417665810257876, + "grad_norm": 1.916904330253601, + "learning_rate": 4.781180740211384e-05, + "loss": 5.7877, + "step": 22561 + }, + { + "epoch": 0.13418260538585974, + "grad_norm": 2.1762099266052246, + "learning_rate": 4.781161629013944e-05, + "loss": 5.7918, + "step": 22562 + }, + { + "epoch": 0.13418855266914073, + "grad_norm": 1.7190003395080566, + "learning_rate": 4.7811425170201726e-05, + "loss": 5.5881, + "step": 22563 + }, + { + "epoch": 0.13419449995242175, + "grad_norm": 1.5587143898010254, + "learning_rate": 4.781123404230079e-05, + "loss": 5.5391, + "step": 22564 + }, + { + "epoch": 0.13420044723570274, + "grad_norm": 1.8347082138061523, + "learning_rate": 4.7811042906436684e-05, + "loss": 5.7366, + "step": 22565 + }, + { + "epoch": 0.13420639451898372, + "grad_norm": 1.5644575357437134, + "learning_rate": 4.7810851762609484e-05, + "loss": 5.6529, + "step": 22566 + }, + { + "epoch": 0.1342123418022647, + "grad_norm": 1.6571894884109497, + "learning_rate": 4.7810660610819246e-05, + "loss": 5.1555, + "step": 22567 + }, + { + "epoch": 0.13421828908554573, + "grad_norm": 1.8291380405426025, + "learning_rate": 4.7810469451066045e-05, + "loss": 5.878, + "step": 22568 + }, + { + "epoch": 0.13422423636882672, + "grad_norm": 1.8254185914993286, + "learning_rate": 4.781027828334994e-05, + "loss": 5.0244, + "step": 22569 + }, + { + "epoch": 0.1342301836521077, + "grad_norm": 1.5728260278701782, + "learning_rate": 4.7810087107671e-05, + "loss": 5.8684, + "step": 22570 + }, + { + "epoch": 0.13423613093538872, + "grad_norm": 1.4518792629241943, + "learning_rate": 4.7809895924029303e-05, + "loss": 6.0868, + "step": 22571 + }, + { + "epoch": 0.1342420782186697, + "grad_norm": 1.5205591917037964, + "learning_rate": 4.7809704732424905e-05, + "loss": 5.3721, + "step": 22572 + }, + { + "epoch": 0.1342480255019507, + "grad_norm": 1.7081562280654907, + "learning_rate": 4.7809513532857876e-05, + "loss": 4.9758, + "step": 22573 + }, + { + "epoch": 0.1342539727852317, + "grad_norm": 1.4048930406570435, + "learning_rate": 4.7809322325328275e-05, + "loss": 5.4701, + "step": 22574 + }, + { + "epoch": 0.1342599200685127, + "grad_norm": 1.5663319826126099, + "learning_rate": 4.780913110983618e-05, + "loss": 5.1094, + "step": 22575 + }, + { + "epoch": 0.1342658673517937, + "grad_norm": 1.6008634567260742, + "learning_rate": 4.780893988638165e-05, + "loss": 5.2138, + "step": 22576 + }, + { + "epoch": 0.1342718146350747, + "grad_norm": 1.5711628198623657, + "learning_rate": 4.780874865496475e-05, + "loss": 5.7172, + "step": 22577 + }, + { + "epoch": 0.1342777619183557, + "grad_norm": 1.799984335899353, + "learning_rate": 4.7808557415585566e-05, + "loss": 4.8959, + "step": 22578 + }, + { + "epoch": 0.13428370920163668, + "grad_norm": 1.7693933248519897, + "learning_rate": 4.7808366168244137e-05, + "loss": 5.376, + "step": 22579 + }, + { + "epoch": 0.1342896564849177, + "grad_norm": 2.1041815280914307, + "learning_rate": 4.780817491294055e-05, + "loss": 5.4672, + "step": 22580 + }, + { + "epoch": 0.13429560376819868, + "grad_norm": 1.8219122886657715, + "learning_rate": 4.780798364967486e-05, + "loss": 5.6201, + "step": 22581 + }, + { + "epoch": 0.13430155105147967, + "grad_norm": 1.5907140970230103, + "learning_rate": 4.780779237844715e-05, + "loss": 5.2499, + "step": 22582 + }, + { + "epoch": 0.1343074983347607, + "grad_norm": 1.388074278831482, + "learning_rate": 4.780760109925746e-05, + "loss": 5.535, + "step": 22583 + }, + { + "epoch": 0.13431344561804168, + "grad_norm": 1.4996978044509888, + "learning_rate": 4.780740981210588e-05, + "loss": 5.2713, + "step": 22584 + }, + { + "epoch": 0.13431939290132267, + "grad_norm": 1.591178059577942, + "learning_rate": 4.780721851699247e-05, + "loss": 5.2211, + "step": 22585 + }, + { + "epoch": 0.13432534018460368, + "grad_norm": 1.5548349618911743, + "learning_rate": 4.780702721391729e-05, + "loss": 5.2867, + "step": 22586 + }, + { + "epoch": 0.13433128746788467, + "grad_norm": 1.5549981594085693, + "learning_rate": 4.780683590288042e-05, + "loss": 5.3627, + "step": 22587 + }, + { + "epoch": 0.13433723475116566, + "grad_norm": 1.4587602615356445, + "learning_rate": 4.780664458388191e-05, + "loss": 5.2031, + "step": 22588 + }, + { + "epoch": 0.13434318203444667, + "grad_norm": 1.836823582649231, + "learning_rate": 4.7806453256921846e-05, + "loss": 4.9802, + "step": 22589 + }, + { + "epoch": 0.13434912931772766, + "grad_norm": 1.5445985794067383, + "learning_rate": 4.780626192200027e-05, + "loss": 4.8789, + "step": 22590 + }, + { + "epoch": 0.13435507660100865, + "grad_norm": 1.5032085180282593, + "learning_rate": 4.780607057911728e-05, + "loss": 4.936, + "step": 22591 + }, + { + "epoch": 0.13436102388428967, + "grad_norm": 1.5628653764724731, + "learning_rate": 4.780587922827292e-05, + "loss": 4.9026, + "step": 22592 + }, + { + "epoch": 0.13436697116757065, + "grad_norm": 2.011505126953125, + "learning_rate": 4.7805687869467265e-05, + "loss": 4.5883, + "step": 22593 + }, + { + "epoch": 0.13437291845085164, + "grad_norm": 1.824877142906189, + "learning_rate": 4.780549650270038e-05, + "loss": 4.7637, + "step": 22594 + }, + { + "epoch": 0.13437886573413266, + "grad_norm": 1.3882604837417603, + "learning_rate": 4.780530512797232e-05, + "loss": 5.1455, + "step": 22595 + }, + { + "epoch": 0.13438481301741365, + "grad_norm": 1.6364738941192627, + "learning_rate": 4.780511374528318e-05, + "loss": 4.7607, + "step": 22596 + }, + { + "epoch": 0.13439076030069463, + "grad_norm": 1.6384764909744263, + "learning_rate": 4.7804922354633004e-05, + "loss": 4.8959, + "step": 22597 + }, + { + "epoch": 0.13439670758397565, + "grad_norm": 1.53514564037323, + "learning_rate": 4.780473095602186e-05, + "loss": 4.9072, + "step": 22598 + }, + { + "epoch": 0.13440265486725664, + "grad_norm": 1.5599232912063599, + "learning_rate": 4.780453954944983e-05, + "loss": 5.0727, + "step": 22599 + }, + { + "epoch": 0.13440860215053763, + "grad_norm": 1.6296029090881348, + "learning_rate": 4.780434813491696e-05, + "loss": 5.1448, + "step": 22600 + }, + { + "epoch": 0.13441454943381864, + "grad_norm": 1.8083057403564453, + "learning_rate": 4.780415671242334e-05, + "loss": 5.0841, + "step": 22601 + }, + { + "epoch": 0.13442049671709963, + "grad_norm": 1.668716311454773, + "learning_rate": 4.780396528196902e-05, + "loss": 5.0684, + "step": 22602 + }, + { + "epoch": 0.13442644400038062, + "grad_norm": 1.5879114866256714, + "learning_rate": 4.7803773843554065e-05, + "loss": 5.3685, + "step": 22603 + }, + { + "epoch": 0.13443239128366163, + "grad_norm": 1.6570247411727905, + "learning_rate": 4.780358239717855e-05, + "loss": 5.2864, + "step": 22604 + }, + { + "epoch": 0.13443833856694262, + "grad_norm": 1.5763763189315796, + "learning_rate": 4.780339094284254e-05, + "loss": 5.1896, + "step": 22605 + }, + { + "epoch": 0.1344442858502236, + "grad_norm": 1.6956191062927246, + "learning_rate": 4.7803199480546105e-05, + "loss": 5.1213, + "step": 22606 + }, + { + "epoch": 0.13445023313350463, + "grad_norm": 1.64959716796875, + "learning_rate": 4.780300801028931e-05, + "loss": 4.8764, + "step": 22607 + }, + { + "epoch": 0.13445618041678561, + "grad_norm": 1.7988736629486084, + "learning_rate": 4.7802816532072216e-05, + "loss": 4.7578, + "step": 22608 + }, + { + "epoch": 0.1344621277000666, + "grad_norm": 1.6349395513534546, + "learning_rate": 4.78026250458949e-05, + "loss": 5.5973, + "step": 22609 + }, + { + "epoch": 0.13446807498334762, + "grad_norm": 1.7561520338058472, + "learning_rate": 4.7802433551757416e-05, + "loss": 4.5933, + "step": 22610 + }, + { + "epoch": 0.1344740222666286, + "grad_norm": 1.7918694019317627, + "learning_rate": 4.780224204965984e-05, + "loss": 4.6726, + "step": 22611 + }, + { + "epoch": 0.1344799695499096, + "grad_norm": 1.6543810367584229, + "learning_rate": 4.780205053960224e-05, + "loss": 5.0966, + "step": 22612 + }, + { + "epoch": 0.1344859168331906, + "grad_norm": 1.4896337985992432, + "learning_rate": 4.7801859021584685e-05, + "loss": 4.9243, + "step": 22613 + }, + { + "epoch": 0.1344918641164716, + "grad_norm": 1.6509222984313965, + "learning_rate": 4.780166749560723e-05, + "loss": 5.0023, + "step": 22614 + }, + { + "epoch": 0.1344978113997526, + "grad_norm": 1.7909302711486816, + "learning_rate": 4.7801475961669944e-05, + "loss": 4.8274, + "step": 22615 + }, + { + "epoch": 0.1345037586830336, + "grad_norm": 1.7640331983566284, + "learning_rate": 4.780128441977291e-05, + "loss": 4.7262, + "step": 22616 + }, + { + "epoch": 0.1345097059663146, + "grad_norm": 1.6381694078445435, + "learning_rate": 4.780109286991617e-05, + "loss": 4.9027, + "step": 22617 + }, + { + "epoch": 0.13451565324959558, + "grad_norm": 1.830243468284607, + "learning_rate": 4.780090131209981e-05, + "loss": 4.837, + "step": 22618 + }, + { + "epoch": 0.1345216005328766, + "grad_norm": 1.6413569450378418, + "learning_rate": 4.780070974632389e-05, + "loss": 4.6675, + "step": 22619 + }, + { + "epoch": 0.13452754781615758, + "grad_norm": 1.7041996717453003, + "learning_rate": 4.780051817258848e-05, + "loss": 4.4556, + "step": 22620 + }, + { + "epoch": 0.13453349509943857, + "grad_norm": 1.6706191301345825, + "learning_rate": 4.780032659089364e-05, + "loss": 5.29, + "step": 22621 + }, + { + "epoch": 0.1345394423827196, + "grad_norm": 1.6883933544158936, + "learning_rate": 4.780013500123945e-05, + "loss": 5.2777, + "step": 22622 + }, + { + "epoch": 0.13454538966600058, + "grad_norm": 1.6006532907485962, + "learning_rate": 4.779994340362596e-05, + "loss": 5.1652, + "step": 22623 + }, + { + "epoch": 0.13455133694928156, + "grad_norm": 1.5645374059677124, + "learning_rate": 4.779975179805325e-05, + "loss": 5.0746, + "step": 22624 + }, + { + "epoch": 0.13455728423256255, + "grad_norm": 1.4294723272323608, + "learning_rate": 4.7799560184521384e-05, + "loss": 5.1747, + "step": 22625 + }, + { + "epoch": 0.13456323151584357, + "grad_norm": 1.5289671421051025, + "learning_rate": 4.7799368563030424e-05, + "loss": 5.0096, + "step": 22626 + }, + { + "epoch": 0.13456917879912456, + "grad_norm": 1.4476962089538574, + "learning_rate": 4.779917693358044e-05, + "loss": 5.1043, + "step": 22627 + }, + { + "epoch": 0.13457512608240554, + "grad_norm": 1.647494912147522, + "learning_rate": 4.7798985296171494e-05, + "loss": 5.2014, + "step": 22628 + }, + { + "epoch": 0.13458107336568656, + "grad_norm": 1.6972601413726807, + "learning_rate": 4.7798793650803665e-05, + "loss": 5.1526, + "step": 22629 + }, + { + "epoch": 0.13458702064896755, + "grad_norm": 1.7442299127578735, + "learning_rate": 4.779860199747701e-05, + "loss": 5.3699, + "step": 22630 + }, + { + "epoch": 0.13459296793224854, + "grad_norm": 1.5356593132019043, + "learning_rate": 4.77984103361916e-05, + "loss": 5.164, + "step": 22631 + }, + { + "epoch": 0.13459891521552955, + "grad_norm": 1.4700989723205566, + "learning_rate": 4.77982186669475e-05, + "loss": 4.7305, + "step": 22632 + }, + { + "epoch": 0.13460486249881054, + "grad_norm": 1.4296282529830933, + "learning_rate": 4.779802698974477e-05, + "loss": 4.7196, + "step": 22633 + }, + { + "epoch": 0.13461080978209153, + "grad_norm": 1.4722986221313477, + "learning_rate": 4.7797835304583494e-05, + "loss": 4.763, + "step": 22634 + }, + { + "epoch": 0.13461675706537254, + "grad_norm": 1.4767835140228271, + "learning_rate": 4.779764361146373e-05, + "loss": 4.6168, + "step": 22635 + }, + { + "epoch": 0.13462270434865353, + "grad_norm": 1.5353070497512817, + "learning_rate": 4.779745191038554e-05, + "loss": 4.8458, + "step": 22636 + }, + { + "epoch": 0.13462865163193452, + "grad_norm": 1.6942658424377441, + "learning_rate": 4.779726020134899e-05, + "loss": 4.8253, + "step": 22637 + }, + { + "epoch": 0.13463459891521554, + "grad_norm": 1.3153749704360962, + "learning_rate": 4.779706848435416e-05, + "loss": 4.6095, + "step": 22638 + }, + { + "epoch": 0.13464054619849652, + "grad_norm": 1.5381252765655518, + "learning_rate": 4.779687675940111e-05, + "loss": 4.202, + "step": 22639 + }, + { + "epoch": 0.1346464934817775, + "grad_norm": 1.5490522384643555, + "learning_rate": 4.779668502648989e-05, + "loss": 4.9204, + "step": 22640 + }, + { + "epoch": 0.13465244076505853, + "grad_norm": 1.518019676208496, + "learning_rate": 4.7796493285620604e-05, + "loss": 5.3894, + "step": 22641 + }, + { + "epoch": 0.13465838804833952, + "grad_norm": 1.635918378829956, + "learning_rate": 4.7796301536793284e-05, + "loss": 4.3345, + "step": 22642 + }, + { + "epoch": 0.1346643353316205, + "grad_norm": 1.7409108877182007, + "learning_rate": 4.779610978000802e-05, + "loss": 4.2783, + "step": 22643 + }, + { + "epoch": 0.13467028261490152, + "grad_norm": 1.7899144887924194, + "learning_rate": 4.7795918015264865e-05, + "loss": 4.8578, + "step": 22644 + }, + { + "epoch": 0.1346762298981825, + "grad_norm": 1.6725822687149048, + "learning_rate": 4.779572624256389e-05, + "loss": 4.7902, + "step": 22645 + }, + { + "epoch": 0.1346821771814635, + "grad_norm": 1.8630287647247314, + "learning_rate": 4.7795534461905165e-05, + "loss": 4.5775, + "step": 22646 + }, + { + "epoch": 0.1346881244647445, + "grad_norm": 1.6607400178909302, + "learning_rate": 4.779534267328875e-05, + "loss": 4.7948, + "step": 22647 + }, + { + "epoch": 0.1346940717480255, + "grad_norm": 1.5015220642089844, + "learning_rate": 4.7795150876714726e-05, + "loss": 4.3331, + "step": 22648 + }, + { + "epoch": 0.1347000190313065, + "grad_norm": 1.5176305770874023, + "learning_rate": 4.779495907218314e-05, + "loss": 4.7168, + "step": 22649 + }, + { + "epoch": 0.1347059663145875, + "grad_norm": 1.8669017553329468, + "learning_rate": 4.7794767259694076e-05, + "loss": 4.6268, + "step": 22650 + }, + { + "epoch": 0.1347119135978685, + "grad_norm": 1.795281171798706, + "learning_rate": 4.7794575439247586e-05, + "loss": 4.6233, + "step": 22651 + }, + { + "epoch": 0.13471786088114948, + "grad_norm": 1.9019118547439575, + "learning_rate": 4.779438361084375e-05, + "loss": 4.9087, + "step": 22652 + }, + { + "epoch": 0.1347238081644305, + "grad_norm": 1.8863301277160645, + "learning_rate": 4.779419177448263e-05, + "loss": 4.6571, + "step": 22653 + }, + { + "epoch": 0.13472975544771149, + "grad_norm": 1.7758681774139404, + "learning_rate": 4.779399993016429e-05, + "loss": 4.7445, + "step": 22654 + }, + { + "epoch": 0.13473570273099247, + "grad_norm": 1.8668162822723389, + "learning_rate": 4.7793808077888804e-05, + "loss": 4.8334, + "step": 22655 + }, + { + "epoch": 0.1347416500142735, + "grad_norm": 1.8495571613311768, + "learning_rate": 4.7793616217656235e-05, + "loss": 4.7865, + "step": 22656 + }, + { + "epoch": 0.13474759729755448, + "grad_norm": 2.0655038356781006, + "learning_rate": 4.779342434946665e-05, + "loss": 4.6479, + "step": 22657 + }, + { + "epoch": 0.13475354458083547, + "grad_norm": 1.8008273839950562, + "learning_rate": 4.7793232473320116e-05, + "loss": 4.8482, + "step": 22658 + }, + { + "epoch": 0.13475949186411648, + "grad_norm": 1.8431730270385742, + "learning_rate": 4.7793040589216695e-05, + "loss": 4.5315, + "step": 22659 + }, + { + "epoch": 0.13476543914739747, + "grad_norm": 1.7335654497146606, + "learning_rate": 4.779284869715647e-05, + "loss": 5.2788, + "step": 22660 + }, + { + "epoch": 0.13477138643067846, + "grad_norm": 1.6339887380599976, + "learning_rate": 4.779265679713949e-05, + "loss": 4.9113, + "step": 22661 + }, + { + "epoch": 0.13477733371395947, + "grad_norm": 1.746029019355774, + "learning_rate": 4.7792464889165825e-05, + "loss": 5.3739, + "step": 22662 + }, + { + "epoch": 0.13478328099724046, + "grad_norm": 1.6831165552139282, + "learning_rate": 4.7792272973235554e-05, + "loss": 5.2394, + "step": 22663 + }, + { + "epoch": 0.13478922828052145, + "grad_norm": 1.629170298576355, + "learning_rate": 4.7792081049348737e-05, + "loss": 5.0894, + "step": 22664 + }, + { + "epoch": 0.13479517556380247, + "grad_norm": 1.71427321434021, + "learning_rate": 4.779188911750543e-05, + "loss": 4.9391, + "step": 22665 + }, + { + "epoch": 0.13480112284708345, + "grad_norm": 1.6911921501159668, + "learning_rate": 4.779169717770572e-05, + "loss": 4.965, + "step": 22666 + }, + { + "epoch": 0.13480707013036444, + "grad_norm": 1.6597939729690552, + "learning_rate": 4.779150522994965e-05, + "loss": 5.1885, + "step": 22667 + }, + { + "epoch": 0.13481301741364546, + "grad_norm": 1.8732246160507202, + "learning_rate": 4.779131327423732e-05, + "loss": 4.7274, + "step": 22668 + }, + { + "epoch": 0.13481896469692645, + "grad_norm": 1.6462973356246948, + "learning_rate": 4.7791121310568765e-05, + "loss": 5.0614, + "step": 22669 + }, + { + "epoch": 0.13482491198020743, + "grad_norm": 1.5832293033599854, + "learning_rate": 4.7790929338944065e-05, + "loss": 5.4794, + "step": 22670 + }, + { + "epoch": 0.13483085926348845, + "grad_norm": 1.8505337238311768, + "learning_rate": 4.7790737359363293e-05, + "loss": 5.3381, + "step": 22671 + }, + { + "epoch": 0.13483680654676944, + "grad_norm": 1.4535889625549316, + "learning_rate": 4.7790545371826504e-05, + "loss": 5.1247, + "step": 22672 + }, + { + "epoch": 0.13484275383005043, + "grad_norm": 2.478214979171753, + "learning_rate": 4.779035337633377e-05, + "loss": 5.2909, + "step": 22673 + }, + { + "epoch": 0.13484870111333144, + "grad_norm": 1.3034166097640991, + "learning_rate": 4.7790161372885176e-05, + "loss": 5.36, + "step": 22674 + }, + { + "epoch": 0.13485464839661243, + "grad_norm": 1.6429485082626343, + "learning_rate": 4.778996936148076e-05, + "loss": 5.5559, + "step": 22675 + }, + { + "epoch": 0.13486059567989342, + "grad_norm": 1.7537177801132202, + "learning_rate": 4.77897773421206e-05, + "loss": 5.3665, + "step": 22676 + }, + { + "epoch": 0.13486654296317444, + "grad_norm": 1.7982977628707886, + "learning_rate": 4.778958531480476e-05, + "loss": 5.5078, + "step": 22677 + }, + { + "epoch": 0.13487249024645542, + "grad_norm": 1.5147206783294678, + "learning_rate": 4.7789393279533315e-05, + "loss": 5.6726, + "step": 22678 + }, + { + "epoch": 0.1348784375297364, + "grad_norm": 1.405532956123352, + "learning_rate": 4.778920123630634e-05, + "loss": 5.4188, + "step": 22679 + }, + { + "epoch": 0.13488438481301743, + "grad_norm": 1.4880021810531616, + "learning_rate": 4.778900918512387e-05, + "loss": 5.4478, + "step": 22680 + }, + { + "epoch": 0.13489033209629842, + "grad_norm": 1.4672034978866577, + "learning_rate": 4.7788817125986006e-05, + "loss": 5.2975, + "step": 22681 + }, + { + "epoch": 0.1348962793795794, + "grad_norm": 1.5284076929092407, + "learning_rate": 4.77886250588928e-05, + "loss": 5.008, + "step": 22682 + }, + { + "epoch": 0.1349022266628604, + "grad_norm": 1.6853814125061035, + "learning_rate": 4.778843298384431e-05, + "loss": 4.5719, + "step": 22683 + }, + { + "epoch": 0.1349081739461414, + "grad_norm": 1.8264626264572144, + "learning_rate": 4.778824090084063e-05, + "loss": 4.7764, + "step": 22684 + }, + { + "epoch": 0.1349141212294224, + "grad_norm": 1.3100756406784058, + "learning_rate": 4.77880488098818e-05, + "loss": 4.9967, + "step": 22685 + }, + { + "epoch": 0.13492006851270338, + "grad_norm": 1.5330268144607544, + "learning_rate": 4.7787856710967895e-05, + "loss": 4.6979, + "step": 22686 + }, + { + "epoch": 0.1349260157959844, + "grad_norm": 1.5872783660888672, + "learning_rate": 4.778766460409899e-05, + "loss": 4.9115, + "step": 22687 + }, + { + "epoch": 0.1349319630792654, + "grad_norm": 1.7895172834396362, + "learning_rate": 4.778747248927515e-05, + "loss": 4.9802, + "step": 22688 + }, + { + "epoch": 0.13493791036254638, + "grad_norm": 1.7277544736862183, + "learning_rate": 4.778728036649643e-05, + "loss": 5.2551, + "step": 22689 + }, + { + "epoch": 0.1349438576458274, + "grad_norm": 1.6623975038528442, + "learning_rate": 4.778708823576291e-05, + "loss": 5.4733, + "step": 22690 + }, + { + "epoch": 0.13494980492910838, + "grad_norm": 1.5472412109375, + "learning_rate": 4.7786896097074655e-05, + "loss": 5.3827, + "step": 22691 + }, + { + "epoch": 0.13495575221238937, + "grad_norm": 1.5824527740478516, + "learning_rate": 4.778670395043173e-05, + "loss": 5.1529, + "step": 22692 + }, + { + "epoch": 0.13496169949567038, + "grad_norm": 1.702009916305542, + "learning_rate": 4.77865117958342e-05, + "loss": 4.9916, + "step": 22693 + }, + { + "epoch": 0.13496764677895137, + "grad_norm": 1.653401255607605, + "learning_rate": 4.778631963328214e-05, + "loss": 5.3644, + "step": 22694 + }, + { + "epoch": 0.13497359406223236, + "grad_norm": 1.7365010976791382, + "learning_rate": 4.7786127462775604e-05, + "loss": 5.6488, + "step": 22695 + }, + { + "epoch": 0.13497954134551338, + "grad_norm": 1.749050498008728, + "learning_rate": 4.778593528431467e-05, + "loss": 5.6256, + "step": 22696 + }, + { + "epoch": 0.13498548862879436, + "grad_norm": 1.8504292964935303, + "learning_rate": 4.7785743097899394e-05, + "loss": 5.3972, + "step": 22697 + }, + { + "epoch": 0.13499143591207535, + "grad_norm": 1.6481549739837646, + "learning_rate": 4.7785550903529864e-05, + "loss": 5.2532, + "step": 22698 + }, + { + "epoch": 0.13499738319535637, + "grad_norm": 1.6081243753433228, + "learning_rate": 4.778535870120612e-05, + "loss": 5.2455, + "step": 22699 + }, + { + "epoch": 0.13500333047863736, + "grad_norm": 1.7087515592575073, + "learning_rate": 4.7785166490928246e-05, + "loss": 5.3115, + "step": 22700 + }, + { + "epoch": 0.13500927776191834, + "grad_norm": 1.626558780670166, + "learning_rate": 4.7784974272696314e-05, + "loss": 4.9586, + "step": 22701 + }, + { + "epoch": 0.13501522504519936, + "grad_norm": 1.5453464984893799, + "learning_rate": 4.778478204651038e-05, + "loss": 5.3882, + "step": 22702 + }, + { + "epoch": 0.13502117232848035, + "grad_norm": 1.602817416191101, + "learning_rate": 4.778458981237051e-05, + "loss": 5.1293, + "step": 22703 + }, + { + "epoch": 0.13502711961176134, + "grad_norm": 1.642824411392212, + "learning_rate": 4.778439757027677e-05, + "loss": 5.25, + "step": 22704 + }, + { + "epoch": 0.13503306689504235, + "grad_norm": 1.544092059135437, + "learning_rate": 4.7784205320229245e-05, + "loss": 5.4593, + "step": 22705 + }, + { + "epoch": 0.13503901417832334, + "grad_norm": 1.5194666385650635, + "learning_rate": 4.778401306222798e-05, + "loss": 5.1281, + "step": 22706 + }, + { + "epoch": 0.13504496146160433, + "grad_norm": 1.5252684354782104, + "learning_rate": 4.778382079627305e-05, + "loss": 5.2614, + "step": 22707 + }, + { + "epoch": 0.13505090874488535, + "grad_norm": 1.3341602087020874, + "learning_rate": 4.778362852236453e-05, + "loss": 5.6714, + "step": 22708 + }, + { + "epoch": 0.13505685602816633, + "grad_norm": 1.4264339208602905, + "learning_rate": 4.7783436240502475e-05, + "loss": 5.5506, + "step": 22709 + }, + { + "epoch": 0.13506280331144732, + "grad_norm": 1.7837181091308594, + "learning_rate": 4.778324395068696e-05, + "loss": 5.4757, + "step": 22710 + }, + { + "epoch": 0.13506875059472834, + "grad_norm": 1.6878288984298706, + "learning_rate": 4.7783051652918054e-05, + "loss": 5.4745, + "step": 22711 + }, + { + "epoch": 0.13507469787800933, + "grad_norm": 1.4143346548080444, + "learning_rate": 4.778285934719582e-05, + "loss": 5.5602, + "step": 22712 + }, + { + "epoch": 0.1350806451612903, + "grad_norm": 1.4829423427581787, + "learning_rate": 4.778266703352032e-05, + "loss": 5.4767, + "step": 22713 + }, + { + "epoch": 0.13508659244457133, + "grad_norm": 1.5431561470031738, + "learning_rate": 4.778247471189163e-05, + "loss": 5.532, + "step": 22714 + }, + { + "epoch": 0.13509253972785232, + "grad_norm": 1.6398223638534546, + "learning_rate": 4.7782282382309814e-05, + "loss": 5.4421, + "step": 22715 + }, + { + "epoch": 0.1350984870111333, + "grad_norm": 1.7385345697402954, + "learning_rate": 4.778209004477494e-05, + "loss": 4.9767, + "step": 22716 + }, + { + "epoch": 0.13510443429441432, + "grad_norm": 1.659159541130066, + "learning_rate": 4.7781897699287066e-05, + "loss": 5.2567, + "step": 22717 + }, + { + "epoch": 0.1351103815776953, + "grad_norm": 1.665582299232483, + "learning_rate": 4.7781705345846274e-05, + "loss": 4.9557, + "step": 22718 + }, + { + "epoch": 0.1351163288609763, + "grad_norm": 1.603225827217102, + "learning_rate": 4.7781512984452614e-05, + "loss": 5.3373, + "step": 22719 + }, + { + "epoch": 0.13512227614425731, + "grad_norm": 2.11853289604187, + "learning_rate": 4.7781320615106176e-05, + "loss": 4.9767, + "step": 22720 + }, + { + "epoch": 0.1351282234275383, + "grad_norm": 1.463710069656372, + "learning_rate": 4.7781128237807006e-05, + "loss": 5.0996, + "step": 22721 + }, + { + "epoch": 0.1351341707108193, + "grad_norm": 1.785783290863037, + "learning_rate": 4.7780935852555186e-05, + "loss": 5.0664, + "step": 22722 + }, + { + "epoch": 0.1351401179941003, + "grad_norm": 1.6467021703720093, + "learning_rate": 4.778074345935078e-05, + "loss": 5.0879, + "step": 22723 + }, + { + "epoch": 0.1351460652773813, + "grad_norm": 1.7273554801940918, + "learning_rate": 4.7780551058193834e-05, + "loss": 5.1165, + "step": 22724 + }, + { + "epoch": 0.13515201256066228, + "grad_norm": 1.7785577774047852, + "learning_rate": 4.7780358649084443e-05, + "loss": 4.9459, + "step": 22725 + }, + { + "epoch": 0.1351579598439433, + "grad_norm": 1.6499429941177368, + "learning_rate": 4.7780166232022674e-05, + "loss": 5.3581, + "step": 22726 + }, + { + "epoch": 0.1351639071272243, + "grad_norm": 1.651881217956543, + "learning_rate": 4.777997380700857e-05, + "loss": 5.215, + "step": 22727 + }, + { + "epoch": 0.13516985441050527, + "grad_norm": 1.726369857788086, + "learning_rate": 4.7779781374042215e-05, + "loss": 4.8891, + "step": 22728 + }, + { + "epoch": 0.1351758016937863, + "grad_norm": 1.5628979206085205, + "learning_rate": 4.7779588933123675e-05, + "loss": 5.0173, + "step": 22729 + }, + { + "epoch": 0.13518174897706728, + "grad_norm": 2.179954767227173, + "learning_rate": 4.777939648425302e-05, + "loss": 5.0088, + "step": 22730 + }, + { + "epoch": 0.13518769626034827, + "grad_norm": 1.5813510417938232, + "learning_rate": 4.777920402743031e-05, + "loss": 5.064, + "step": 22731 + }, + { + "epoch": 0.13519364354362928, + "grad_norm": 1.4100569486618042, + "learning_rate": 4.7779011562655616e-05, + "loss": 5.5696, + "step": 22732 + }, + { + "epoch": 0.13519959082691027, + "grad_norm": 1.4252601861953735, + "learning_rate": 4.7778819089929e-05, + "loss": 5.4797, + "step": 22733 + }, + { + "epoch": 0.13520553811019126, + "grad_norm": 1.5482890605926514, + "learning_rate": 4.7778626609250546e-05, + "loss": 5.7168, + "step": 22734 + }, + { + "epoch": 0.13521148539347227, + "grad_norm": 1.7441178560256958, + "learning_rate": 4.77784341206203e-05, + "loss": 5.5385, + "step": 22735 + }, + { + "epoch": 0.13521743267675326, + "grad_norm": 1.5903903245925903, + "learning_rate": 4.777824162403833e-05, + "loss": 5.4181, + "step": 22736 + }, + { + "epoch": 0.13522337996003425, + "grad_norm": 1.6240642070770264, + "learning_rate": 4.777804911950472e-05, + "loss": 5.5071, + "step": 22737 + }, + { + "epoch": 0.13522932724331527, + "grad_norm": 1.4418225288391113, + "learning_rate": 4.7777856607019536e-05, + "loss": 5.6326, + "step": 22738 + }, + { + "epoch": 0.13523527452659626, + "grad_norm": 1.618449330329895, + "learning_rate": 4.7777664086582823e-05, + "loss": 5.4445, + "step": 22739 + }, + { + "epoch": 0.13524122180987724, + "grad_norm": 1.7598767280578613, + "learning_rate": 4.777747155819467e-05, + "loss": 5.3207, + "step": 22740 + }, + { + "epoch": 0.13524716909315823, + "grad_norm": 1.707531213760376, + "learning_rate": 4.7777279021855134e-05, + "loss": 5.2888, + "step": 22741 + }, + { + "epoch": 0.13525311637643925, + "grad_norm": 1.8292144536972046, + "learning_rate": 4.777708647756429e-05, + "loss": 4.897, + "step": 22742 + }, + { + "epoch": 0.13525906365972024, + "grad_norm": 1.893703818321228, + "learning_rate": 4.77768939253222e-05, + "loss": 4.8088, + "step": 22743 + }, + { + "epoch": 0.13526501094300122, + "grad_norm": 1.6884989738464355, + "learning_rate": 4.777670136512893e-05, + "loss": 5.183, + "step": 22744 + }, + { + "epoch": 0.13527095822628224, + "grad_norm": 1.8513271808624268, + "learning_rate": 4.777650879698454e-05, + "loss": 4.6775, + "step": 22745 + }, + { + "epoch": 0.13527690550956323, + "grad_norm": 1.5597106218338013, + "learning_rate": 4.777631622088912e-05, + "loss": 5.268, + "step": 22746 + }, + { + "epoch": 0.13528285279284422, + "grad_norm": 1.6159777641296387, + "learning_rate": 4.777612363684272e-05, + "loss": 5.223, + "step": 22747 + }, + { + "epoch": 0.13528880007612523, + "grad_norm": 1.6712334156036377, + "learning_rate": 4.777593104484541e-05, + "loss": 5.1676, + "step": 22748 + }, + { + "epoch": 0.13529474735940622, + "grad_norm": 1.4349523782730103, + "learning_rate": 4.7775738444897253e-05, + "loss": 5.3066, + "step": 22749 + }, + { + "epoch": 0.1353006946426872, + "grad_norm": 1.6191719770431519, + "learning_rate": 4.7775545836998324e-05, + "loss": 5.2426, + "step": 22750 + }, + { + "epoch": 0.13530664192596822, + "grad_norm": 1.8324687480926514, + "learning_rate": 4.777535322114869e-05, + "loss": 5.2352, + "step": 22751 + }, + { + "epoch": 0.1353125892092492, + "grad_norm": 1.5355842113494873, + "learning_rate": 4.777516059734841e-05, + "loss": 5.5875, + "step": 22752 + }, + { + "epoch": 0.1353185364925302, + "grad_norm": 1.6957530975341797, + "learning_rate": 4.777496796559756e-05, + "loss": 5.4624, + "step": 22753 + }, + { + "epoch": 0.13532448377581122, + "grad_norm": 1.6195729970932007, + "learning_rate": 4.7774775325896205e-05, + "loss": 5.2686, + "step": 22754 + }, + { + "epoch": 0.1353304310590922, + "grad_norm": 1.429439663887024, + "learning_rate": 4.7774582678244406e-05, + "loss": 5.3407, + "step": 22755 + }, + { + "epoch": 0.1353363783423732, + "grad_norm": 1.4609668254852295, + "learning_rate": 4.777439002264225e-05, + "loss": 5.4332, + "step": 22756 + }, + { + "epoch": 0.1353423256256542, + "grad_norm": 1.3537366390228271, + "learning_rate": 4.7774197359089765e-05, + "loss": 5.4353, + "step": 22757 + }, + { + "epoch": 0.1353482729089352, + "grad_norm": 1.6953861713409424, + "learning_rate": 4.7774004687587057e-05, + "loss": 5.1824, + "step": 22758 + }, + { + "epoch": 0.13535422019221618, + "grad_norm": 1.3835570812225342, + "learning_rate": 4.7773812008134186e-05, + "loss": 5.1748, + "step": 22759 + }, + { + "epoch": 0.1353601674754972, + "grad_norm": 1.94771146774292, + "learning_rate": 4.7773619320731206e-05, + "loss": 4.7599, + "step": 22760 + }, + { + "epoch": 0.1353661147587782, + "grad_norm": 1.56703782081604, + "learning_rate": 4.777342662537819e-05, + "loss": 5.4686, + "step": 22761 + }, + { + "epoch": 0.13537206204205918, + "grad_norm": 1.627790093421936, + "learning_rate": 4.77732339220752e-05, + "loss": 5.4504, + "step": 22762 + }, + { + "epoch": 0.1353780093253402, + "grad_norm": 1.5668286085128784, + "learning_rate": 4.777304121082232e-05, + "loss": 5.5147, + "step": 22763 + }, + { + "epoch": 0.13538395660862118, + "grad_norm": 1.7350172996520996, + "learning_rate": 4.7772848491619606e-05, + "loss": 5.1803, + "step": 22764 + }, + { + "epoch": 0.13538990389190217, + "grad_norm": 1.700966715812683, + "learning_rate": 4.7772655764467124e-05, + "loss": 5.1222, + "step": 22765 + }, + { + "epoch": 0.13539585117518319, + "grad_norm": 1.7613048553466797, + "learning_rate": 4.777246302936494e-05, + "loss": 5.1391, + "step": 22766 + }, + { + "epoch": 0.13540179845846417, + "grad_norm": 1.7095452547073364, + "learning_rate": 4.777227028631312e-05, + "loss": 5.112, + "step": 22767 + }, + { + "epoch": 0.13540774574174516, + "grad_norm": 1.8310586214065552, + "learning_rate": 4.7772077535311744e-05, + "loss": 5.0404, + "step": 22768 + }, + { + "epoch": 0.13541369302502618, + "grad_norm": 1.7058879137039185, + "learning_rate": 4.777188477636087e-05, + "loss": 5.1165, + "step": 22769 + }, + { + "epoch": 0.13541964030830717, + "grad_norm": 1.7806624174118042, + "learning_rate": 4.7771692009460565e-05, + "loss": 5.0711, + "step": 22770 + }, + { + "epoch": 0.13542558759158815, + "grad_norm": 1.8086166381835938, + "learning_rate": 4.777149923461089e-05, + "loss": 4.7757, + "step": 22771 + }, + { + "epoch": 0.13543153487486917, + "grad_norm": 1.9984580278396606, + "learning_rate": 4.777130645181194e-05, + "loss": 4.918, + "step": 22772 + }, + { + "epoch": 0.13543748215815016, + "grad_norm": 1.6648451089859009, + "learning_rate": 4.777111366106375e-05, + "loss": 5.0051, + "step": 22773 + }, + { + "epoch": 0.13544342944143115, + "grad_norm": 1.6590383052825928, + "learning_rate": 4.77709208623664e-05, + "loss": 5.6166, + "step": 22774 + }, + { + "epoch": 0.13544937672471216, + "grad_norm": 1.4530583620071411, + "learning_rate": 4.777072805571995e-05, + "loss": 5.6772, + "step": 22775 + }, + { + "epoch": 0.13545532400799315, + "grad_norm": 1.5310078859329224, + "learning_rate": 4.777053524112448e-05, + "loss": 4.9965, + "step": 22776 + }, + { + "epoch": 0.13546127129127414, + "grad_norm": 1.5363576412200928, + "learning_rate": 4.777034241858005e-05, + "loss": 5.2144, + "step": 22777 + }, + { + "epoch": 0.13546721857455515, + "grad_norm": 1.7318395376205444, + "learning_rate": 4.7770149588086735e-05, + "loss": 5.2367, + "step": 22778 + }, + { + "epoch": 0.13547316585783614, + "grad_norm": 1.567736268043518, + "learning_rate": 4.776995674964459e-05, + "loss": 5.4778, + "step": 22779 + }, + { + "epoch": 0.13547911314111713, + "grad_norm": 1.879223108291626, + "learning_rate": 4.7769763903253685e-05, + "loss": 4.8963, + "step": 22780 + }, + { + "epoch": 0.13548506042439815, + "grad_norm": 1.6292016506195068, + "learning_rate": 4.77695710489141e-05, + "loss": 5.2529, + "step": 22781 + }, + { + "epoch": 0.13549100770767913, + "grad_norm": 1.4838228225708008, + "learning_rate": 4.7769378186625885e-05, + "loss": 5.5594, + "step": 22782 + }, + { + "epoch": 0.13549695499096012, + "grad_norm": 1.4567928314208984, + "learning_rate": 4.776918531638912e-05, + "loss": 5.5789, + "step": 22783 + }, + { + "epoch": 0.13550290227424114, + "grad_norm": 1.6464484930038452, + "learning_rate": 4.776899243820386e-05, + "loss": 5.4319, + "step": 22784 + }, + { + "epoch": 0.13550884955752213, + "grad_norm": 1.501028060913086, + "learning_rate": 4.776879955207019e-05, + "loss": 5.5543, + "step": 22785 + }, + { + "epoch": 0.13551479684080311, + "grad_norm": 1.6811163425445557, + "learning_rate": 4.776860665798816e-05, + "loss": 5.4512, + "step": 22786 + }, + { + "epoch": 0.13552074412408413, + "grad_norm": 1.762147068977356, + "learning_rate": 4.7768413755957854e-05, + "loss": 5.6262, + "step": 22787 + }, + { + "epoch": 0.13552669140736512, + "grad_norm": 1.846987009048462, + "learning_rate": 4.7768220845979315e-05, + "loss": 5.4735, + "step": 22788 + }, + { + "epoch": 0.1355326386906461, + "grad_norm": 1.9326568841934204, + "learning_rate": 4.776802792805264e-05, + "loss": 5.3295, + "step": 22789 + }, + { + "epoch": 0.13553858597392712, + "grad_norm": 1.5496313571929932, + "learning_rate": 4.7767835002177874e-05, + "loss": 5.4742, + "step": 22790 + }, + { + "epoch": 0.1355445332572081, + "grad_norm": 1.3328933715820312, + "learning_rate": 4.776764206835509e-05, + "loss": 5.5611, + "step": 22791 + }, + { + "epoch": 0.1355504805404891, + "grad_norm": 1.3349891901016235, + "learning_rate": 4.776744912658437e-05, + "loss": 5.5732, + "step": 22792 + }, + { + "epoch": 0.13555642782377011, + "grad_norm": 1.510608434677124, + "learning_rate": 4.776725617686576e-05, + "loss": 5.4108, + "step": 22793 + }, + { + "epoch": 0.1355623751070511, + "grad_norm": 1.4556225538253784, + "learning_rate": 4.776706321919934e-05, + "loss": 5.5154, + "step": 22794 + }, + { + "epoch": 0.1355683223903321, + "grad_norm": 1.7231537103652954, + "learning_rate": 4.776687025358516e-05, + "loss": 5.4437, + "step": 22795 + }, + { + "epoch": 0.1355742696736131, + "grad_norm": 1.6234036684036255, + "learning_rate": 4.7766677280023314e-05, + "loss": 5.2642, + "step": 22796 + }, + { + "epoch": 0.1355802169568941, + "grad_norm": 1.6550066471099854, + "learning_rate": 4.776648429851385e-05, + "loss": 5.3577, + "step": 22797 + }, + { + "epoch": 0.13558616424017508, + "grad_norm": 1.5199332237243652, + "learning_rate": 4.776629130905684e-05, + "loss": 4.9679, + "step": 22798 + }, + { + "epoch": 0.13559211152345607, + "grad_norm": 1.5900238752365112, + "learning_rate": 4.776609831165236e-05, + "loss": 5.5357, + "step": 22799 + }, + { + "epoch": 0.1355980588067371, + "grad_norm": 1.4585398435592651, + "learning_rate": 4.776590530630047e-05, + "loss": 5.4191, + "step": 22800 + }, + { + "epoch": 0.13560400609001808, + "grad_norm": 1.4049118757247925, + "learning_rate": 4.7765712293001234e-05, + "loss": 5.4423, + "step": 22801 + }, + { + "epoch": 0.13560995337329906, + "grad_norm": 1.5287877321243286, + "learning_rate": 4.7765519271754726e-05, + "loss": 5.4635, + "step": 22802 + }, + { + "epoch": 0.13561590065658008, + "grad_norm": 1.4761078357696533, + "learning_rate": 4.776532624256101e-05, + "loss": 5.394, + "step": 22803 + }, + { + "epoch": 0.13562184793986107, + "grad_norm": 1.523536205291748, + "learning_rate": 4.776513320542015e-05, + "loss": 5.4171, + "step": 22804 + }, + { + "epoch": 0.13562779522314206, + "grad_norm": 1.701953411102295, + "learning_rate": 4.7764940160332214e-05, + "loss": 5.336, + "step": 22805 + }, + { + "epoch": 0.13563374250642307, + "grad_norm": 1.5426260232925415, + "learning_rate": 4.7764747107297284e-05, + "loss": 5.5175, + "step": 22806 + }, + { + "epoch": 0.13563968978970406, + "grad_norm": 1.5670596361160278, + "learning_rate": 4.776455404631541e-05, + "loss": 5.4254, + "step": 22807 + }, + { + "epoch": 0.13564563707298505, + "grad_norm": 1.4388494491577148, + "learning_rate": 4.7764360977386666e-05, + "loss": 5.3282, + "step": 22808 + }, + { + "epoch": 0.13565158435626606, + "grad_norm": 1.4222092628479004, + "learning_rate": 4.776416790051111e-05, + "loss": 5.5187, + "step": 22809 + }, + { + "epoch": 0.13565753163954705, + "grad_norm": 1.604407787322998, + "learning_rate": 4.776397481568883e-05, + "loss": 5.3026, + "step": 22810 + }, + { + "epoch": 0.13566347892282804, + "grad_norm": 1.4160562753677368, + "learning_rate": 4.776378172291988e-05, + "loss": 5.2925, + "step": 22811 + }, + { + "epoch": 0.13566942620610906, + "grad_norm": 1.543260931968689, + "learning_rate": 4.776358862220433e-05, + "loss": 5.4234, + "step": 22812 + }, + { + "epoch": 0.13567537348939004, + "grad_norm": 1.6589266061782837, + "learning_rate": 4.776339551354224e-05, + "loss": 5.0677, + "step": 22813 + }, + { + "epoch": 0.13568132077267103, + "grad_norm": 1.5909267663955688, + "learning_rate": 4.7763202396933696e-05, + "loss": 5.145, + "step": 22814 + }, + { + "epoch": 0.13568726805595205, + "grad_norm": 1.4697500467300415, + "learning_rate": 4.776300927237873e-05, + "loss": 5.2856, + "step": 22815 + }, + { + "epoch": 0.13569321533923304, + "grad_norm": 1.895766019821167, + "learning_rate": 4.7762816139877456e-05, + "loss": 5.3554, + "step": 22816 + }, + { + "epoch": 0.13569916262251402, + "grad_norm": 1.8093748092651367, + "learning_rate": 4.7762622999429905e-05, + "loss": 4.9482, + "step": 22817 + }, + { + "epoch": 0.13570510990579504, + "grad_norm": 1.6899988651275635, + "learning_rate": 4.776242985103616e-05, + "loss": 5.1788, + "step": 22818 + }, + { + "epoch": 0.13571105718907603, + "grad_norm": 1.8199821710586548, + "learning_rate": 4.7762236694696294e-05, + "loss": 4.9181, + "step": 22819 + }, + { + "epoch": 0.13571700447235702, + "grad_norm": 1.7687036991119385, + "learning_rate": 4.776204353041036e-05, + "loss": 4.9925, + "step": 22820 + }, + { + "epoch": 0.13572295175563803, + "grad_norm": 1.705419659614563, + "learning_rate": 4.776185035817843e-05, + "loss": 5.0644, + "step": 22821 + }, + { + "epoch": 0.13572889903891902, + "grad_norm": 1.7805287837982178, + "learning_rate": 4.7761657178000575e-05, + "loss": 5.1567, + "step": 22822 + }, + { + "epoch": 0.1357348463222, + "grad_norm": 1.4791945219039917, + "learning_rate": 4.776146398987686e-05, + "loss": 5.2834, + "step": 22823 + }, + { + "epoch": 0.13574079360548102, + "grad_norm": 1.546128749847412, + "learning_rate": 4.776127079380735e-05, + "loss": 4.8066, + "step": 22824 + }, + { + "epoch": 0.135746740888762, + "grad_norm": 1.6163334846496582, + "learning_rate": 4.776107758979212e-05, + "loss": 5.1771, + "step": 22825 + }, + { + "epoch": 0.135752688172043, + "grad_norm": 1.6902676820755005, + "learning_rate": 4.776088437783123e-05, + "loss": 4.9249, + "step": 22826 + }, + { + "epoch": 0.13575863545532402, + "grad_norm": 1.4966270923614502, + "learning_rate": 4.776069115792475e-05, + "loss": 5.6609, + "step": 22827 + }, + { + "epoch": 0.135764582738605, + "grad_norm": 1.6107707023620605, + "learning_rate": 4.7760497930072754e-05, + "loss": 5.4167, + "step": 22828 + }, + { + "epoch": 0.135770530021886, + "grad_norm": 1.5773305892944336, + "learning_rate": 4.77603046942753e-05, + "loss": 5.4044, + "step": 22829 + }, + { + "epoch": 0.135776477305167, + "grad_norm": 1.6871259212493896, + "learning_rate": 4.7760111450532454e-05, + "loss": 5.5288, + "step": 22830 + }, + { + "epoch": 0.135782424588448, + "grad_norm": 1.4027100801467896, + "learning_rate": 4.77599181988443e-05, + "loss": 5.265, + "step": 22831 + }, + { + "epoch": 0.13578837187172899, + "grad_norm": 1.7435009479522705, + "learning_rate": 4.775972493921088e-05, + "loss": 5.3546, + "step": 22832 + }, + { + "epoch": 0.13579431915501, + "grad_norm": 1.4834927320480347, + "learning_rate": 4.7759531671632286e-05, + "loss": 5.168, + "step": 22833 + }, + { + "epoch": 0.135800266438291, + "grad_norm": 1.6468613147735596, + "learning_rate": 4.775933839610857e-05, + "loss": 5.0984, + "step": 22834 + }, + { + "epoch": 0.13580621372157198, + "grad_norm": 1.6906235218048096, + "learning_rate": 4.77591451126398e-05, + "loss": 5.0563, + "step": 22835 + }, + { + "epoch": 0.135812161004853, + "grad_norm": 1.2667183876037598, + "learning_rate": 4.775895182122605e-05, + "loss": 5.7256, + "step": 22836 + }, + { + "epoch": 0.13581810828813398, + "grad_norm": 1.381974697113037, + "learning_rate": 4.775875852186739e-05, + "loss": 5.6773, + "step": 22837 + }, + { + "epoch": 0.13582405557141497, + "grad_norm": 1.395326018333435, + "learning_rate": 4.775856521456388e-05, + "loss": 5.4884, + "step": 22838 + }, + { + "epoch": 0.13583000285469599, + "grad_norm": 1.4601794481277466, + "learning_rate": 4.775837189931559e-05, + "loss": 5.6866, + "step": 22839 + }, + { + "epoch": 0.13583595013797697, + "grad_norm": 1.3722656965255737, + "learning_rate": 4.7758178576122584e-05, + "loss": 5.7885, + "step": 22840 + }, + { + "epoch": 0.13584189742125796, + "grad_norm": 1.5126278400421143, + "learning_rate": 4.775798524498494e-05, + "loss": 5.5806, + "step": 22841 + }, + { + "epoch": 0.13584784470453898, + "grad_norm": 1.465306282043457, + "learning_rate": 4.7757791905902714e-05, + "loss": 5.5597, + "step": 22842 + }, + { + "epoch": 0.13585379198781997, + "grad_norm": 1.7111048698425293, + "learning_rate": 4.775759855887598e-05, + "loss": 5.3431, + "step": 22843 + }, + { + "epoch": 0.13585973927110095, + "grad_norm": 1.7369952201843262, + "learning_rate": 4.7757405203904796e-05, + "loss": 5.4373, + "step": 22844 + }, + { + "epoch": 0.13586568655438197, + "grad_norm": 1.571898341178894, + "learning_rate": 4.7757211840989246e-05, + "loss": 5.4751, + "step": 22845 + }, + { + "epoch": 0.13587163383766296, + "grad_norm": 1.6752384901046753, + "learning_rate": 4.775701847012938e-05, + "loss": 5.3411, + "step": 22846 + }, + { + "epoch": 0.13587758112094395, + "grad_norm": 1.3036680221557617, + "learning_rate": 4.775682509132529e-05, + "loss": 5.6136, + "step": 22847 + }, + { + "epoch": 0.13588352840422496, + "grad_norm": 1.60060453414917, + "learning_rate": 4.775663170457701e-05, + "loss": 5.3134, + "step": 22848 + }, + { + "epoch": 0.13588947568750595, + "grad_norm": 1.746317982673645, + "learning_rate": 4.775643830988463e-05, + "loss": 5.1176, + "step": 22849 + }, + { + "epoch": 0.13589542297078694, + "grad_norm": 1.5190258026123047, + "learning_rate": 4.775624490724822e-05, + "loss": 5.2673, + "step": 22850 + }, + { + "epoch": 0.13590137025406795, + "grad_norm": 1.5572645664215088, + "learning_rate": 4.775605149666783e-05, + "loss": 5.7732, + "step": 22851 + }, + { + "epoch": 0.13590731753734894, + "grad_norm": 1.6563985347747803, + "learning_rate": 4.775585807814354e-05, + "loss": 5.3757, + "step": 22852 + }, + { + "epoch": 0.13591326482062993, + "grad_norm": 1.583486795425415, + "learning_rate": 4.775566465167541e-05, + "loss": 5.5406, + "step": 22853 + }, + { + "epoch": 0.13591921210391095, + "grad_norm": 1.9212104082107544, + "learning_rate": 4.7755471217263525e-05, + "loss": 5.5629, + "step": 22854 + }, + { + "epoch": 0.13592515938719194, + "grad_norm": 1.5397447347640991, + "learning_rate": 4.775527777490793e-05, + "loss": 5.5745, + "step": 22855 + }, + { + "epoch": 0.13593110667047292, + "grad_norm": 1.4469612836837769, + "learning_rate": 4.775508432460871e-05, + "loss": 5.5762, + "step": 22856 + }, + { + "epoch": 0.13593705395375394, + "grad_norm": 1.6050552129745483, + "learning_rate": 4.775489086636592e-05, + "loss": 5.2207, + "step": 22857 + }, + { + "epoch": 0.13594300123703493, + "grad_norm": 1.5991270542144775, + "learning_rate": 4.7754697400179636e-05, + "loss": 5.3331, + "step": 22858 + }, + { + "epoch": 0.13594894852031592, + "grad_norm": 1.8474901914596558, + "learning_rate": 4.775450392604992e-05, + "loss": 5.3208, + "step": 22859 + }, + { + "epoch": 0.1359548958035969, + "grad_norm": 1.6865973472595215, + "learning_rate": 4.7754310443976844e-05, + "loss": 5.2557, + "step": 22860 + }, + { + "epoch": 0.13596084308687792, + "grad_norm": 1.9411492347717285, + "learning_rate": 4.775411695396047e-05, + "loss": 5.2765, + "step": 22861 + }, + { + "epoch": 0.1359667903701589, + "grad_norm": 1.6263481378555298, + "learning_rate": 4.775392345600087e-05, + "loss": 5.2767, + "step": 22862 + }, + { + "epoch": 0.1359727376534399, + "grad_norm": 1.7159794569015503, + "learning_rate": 4.7753729950098116e-05, + "loss": 5.5175, + "step": 22863 + }, + { + "epoch": 0.1359786849367209, + "grad_norm": 1.6026562452316284, + "learning_rate": 4.7753536436252266e-05, + "loss": 5.3517, + "step": 22864 + }, + { + "epoch": 0.1359846322200019, + "grad_norm": 1.4052190780639648, + "learning_rate": 4.775334291446339e-05, + "loss": 5.3153, + "step": 22865 + }, + { + "epoch": 0.1359905795032829, + "grad_norm": 1.4030534029006958, + "learning_rate": 4.7753149384731556e-05, + "loss": 5.3798, + "step": 22866 + }, + { + "epoch": 0.1359965267865639, + "grad_norm": 1.5234447717666626, + "learning_rate": 4.775295584705683e-05, + "loss": 5.2717, + "step": 22867 + }, + { + "epoch": 0.1360024740698449, + "grad_norm": 1.6578015089035034, + "learning_rate": 4.775276230143929e-05, + "loss": 5.2482, + "step": 22868 + }, + { + "epoch": 0.13600842135312588, + "grad_norm": 1.427674651145935, + "learning_rate": 4.775256874787899e-05, + "loss": 5.3303, + "step": 22869 + }, + { + "epoch": 0.1360143686364069, + "grad_norm": 1.610268473625183, + "learning_rate": 4.7752375186376006e-05, + "loss": 5.4775, + "step": 22870 + }, + { + "epoch": 0.13602031591968788, + "grad_norm": 1.7097511291503906, + "learning_rate": 4.7752181616930404e-05, + "loss": 5.2721, + "step": 22871 + }, + { + "epoch": 0.13602626320296887, + "grad_norm": 1.6628022193908691, + "learning_rate": 4.775198803954225e-05, + "loss": 5.2049, + "step": 22872 + }, + { + "epoch": 0.1360322104862499, + "grad_norm": 1.6983882188796997, + "learning_rate": 4.7751794454211615e-05, + "loss": 5.1596, + "step": 22873 + }, + { + "epoch": 0.13603815776953088, + "grad_norm": 1.6148128509521484, + "learning_rate": 4.775160086093856e-05, + "loss": 5.3958, + "step": 22874 + }, + { + "epoch": 0.13604410505281186, + "grad_norm": 1.6220009326934814, + "learning_rate": 4.7751407259723155e-05, + "loss": 5.2774, + "step": 22875 + }, + { + "epoch": 0.13605005233609288, + "grad_norm": 1.5017454624176025, + "learning_rate": 4.7751213650565464e-05, + "loss": 5.303, + "step": 22876 + }, + { + "epoch": 0.13605599961937387, + "grad_norm": 1.6734380722045898, + "learning_rate": 4.7751020033465566e-05, + "loss": 5.3784, + "step": 22877 + }, + { + "epoch": 0.13606194690265486, + "grad_norm": 1.8177162408828735, + "learning_rate": 4.775082640842352e-05, + "loss": 5.4498, + "step": 22878 + }, + { + "epoch": 0.13606789418593587, + "grad_norm": 1.6287364959716797, + "learning_rate": 4.7750632775439396e-05, + "loss": 5.3252, + "step": 22879 + }, + { + "epoch": 0.13607384146921686, + "grad_norm": 1.5242222547531128, + "learning_rate": 4.7750439134513267e-05, + "loss": 5.2287, + "step": 22880 + }, + { + "epoch": 0.13607978875249785, + "grad_norm": 1.4447482824325562, + "learning_rate": 4.775024548564519e-05, + "loss": 5.3725, + "step": 22881 + }, + { + "epoch": 0.13608573603577886, + "grad_norm": 1.4994373321533203, + "learning_rate": 4.775005182883523e-05, + "loss": 5.4844, + "step": 22882 + }, + { + "epoch": 0.13609168331905985, + "grad_norm": 1.541668176651001, + "learning_rate": 4.774985816408347e-05, + "loss": 5.4171, + "step": 22883 + }, + { + "epoch": 0.13609763060234084, + "grad_norm": 1.4670990705490112, + "learning_rate": 4.7749664491389965e-05, + "loss": 5.4372, + "step": 22884 + }, + { + "epoch": 0.13610357788562186, + "grad_norm": 1.686318039894104, + "learning_rate": 4.7749470810754796e-05, + "loss": 5.1164, + "step": 22885 + }, + { + "epoch": 0.13610952516890285, + "grad_norm": 1.4744656085968018, + "learning_rate": 4.7749277122178015e-05, + "loss": 5.3787, + "step": 22886 + }, + { + "epoch": 0.13611547245218383, + "grad_norm": 1.498948574066162, + "learning_rate": 4.77490834256597e-05, + "loss": 5.2837, + "step": 22887 + }, + { + "epoch": 0.13612141973546485, + "grad_norm": 1.4990612268447876, + "learning_rate": 4.774888972119991e-05, + "loss": 5.3503, + "step": 22888 + }, + { + "epoch": 0.13612736701874584, + "grad_norm": 1.6973026990890503, + "learning_rate": 4.774869600879872e-05, + "loss": 5.2776, + "step": 22889 + }, + { + "epoch": 0.13613331430202683, + "grad_norm": 1.5271309614181519, + "learning_rate": 4.7748502288456193e-05, + "loss": 5.3318, + "step": 22890 + }, + { + "epoch": 0.13613926158530784, + "grad_norm": 1.5284117460250854, + "learning_rate": 4.7748308560172406e-05, + "loss": 5.2975, + "step": 22891 + }, + { + "epoch": 0.13614520886858883, + "grad_norm": 1.45162034034729, + "learning_rate": 4.774811482394741e-05, + "loss": 5.1825, + "step": 22892 + }, + { + "epoch": 0.13615115615186982, + "grad_norm": 1.558273434638977, + "learning_rate": 4.774792107978129e-05, + "loss": 5.1004, + "step": 22893 + }, + { + "epoch": 0.13615710343515083, + "grad_norm": 1.576781988143921, + "learning_rate": 4.77477273276741e-05, + "loss": 5.4028, + "step": 22894 + }, + { + "epoch": 0.13616305071843182, + "grad_norm": 1.3964447975158691, + "learning_rate": 4.7747533567625916e-05, + "loss": 5.4402, + "step": 22895 + }, + { + "epoch": 0.1361689980017128, + "grad_norm": 1.7266137599945068, + "learning_rate": 4.77473397996368e-05, + "loss": 4.9304, + "step": 22896 + }, + { + "epoch": 0.13617494528499383, + "grad_norm": 1.573444128036499, + "learning_rate": 4.774714602370683e-05, + "loss": 4.9736, + "step": 22897 + }, + { + "epoch": 0.13618089256827481, + "grad_norm": 1.7123498916625977, + "learning_rate": 4.774695223983606e-05, + "loss": 5.3678, + "step": 22898 + }, + { + "epoch": 0.1361868398515558, + "grad_norm": 1.8102420568466187, + "learning_rate": 4.7746758448024566e-05, + "loss": 5.2433, + "step": 22899 + }, + { + "epoch": 0.13619278713483682, + "grad_norm": 1.5984879732131958, + "learning_rate": 4.774656464827242e-05, + "loss": 5.2601, + "step": 22900 + }, + { + "epoch": 0.1361987344181178, + "grad_norm": 1.8117280006408691, + "learning_rate": 4.7746370840579666e-05, + "loss": 5.1488, + "step": 22901 + }, + { + "epoch": 0.1362046817013988, + "grad_norm": 1.6972469091415405, + "learning_rate": 4.7746177024946405e-05, + "loss": 5.337, + "step": 22902 + }, + { + "epoch": 0.1362106289846798, + "grad_norm": 1.4006030559539795, + "learning_rate": 4.7745983201372685e-05, + "loss": 5.4563, + "step": 22903 + }, + { + "epoch": 0.1362165762679608, + "grad_norm": 1.7627719640731812, + "learning_rate": 4.774578936985857e-05, + "loss": 5.0125, + "step": 22904 + }, + { + "epoch": 0.1362225235512418, + "grad_norm": 1.3935896158218384, + "learning_rate": 4.774559553040415e-05, + "loss": 5.2413, + "step": 22905 + }, + { + "epoch": 0.1362284708345228, + "grad_norm": 1.3300725221633911, + "learning_rate": 4.7745401683009464e-05, + "loss": 5.391, + "step": 22906 + }, + { + "epoch": 0.1362344181178038, + "grad_norm": 1.5094577074050903, + "learning_rate": 4.7745207827674596e-05, + "loss": 6.0553, + "step": 22907 + }, + { + "epoch": 0.13624036540108478, + "grad_norm": 1.3816832304000854, + "learning_rate": 4.774501396439961e-05, + "loss": 5.9914, + "step": 22908 + }, + { + "epoch": 0.1362463126843658, + "grad_norm": 1.5488735437393188, + "learning_rate": 4.774482009318458e-05, + "loss": 5.5686, + "step": 22909 + }, + { + "epoch": 0.13625225996764678, + "grad_norm": 1.7096377611160278, + "learning_rate": 4.774462621402957e-05, + "loss": 5.0948, + "step": 22910 + }, + { + "epoch": 0.13625820725092777, + "grad_norm": 1.8099161386489868, + "learning_rate": 4.7744432326934644e-05, + "loss": 5.3055, + "step": 22911 + }, + { + "epoch": 0.1362641545342088, + "grad_norm": 1.5320358276367188, + "learning_rate": 4.7744238431899864e-05, + "loss": 5.467, + "step": 22912 + }, + { + "epoch": 0.13627010181748977, + "grad_norm": 1.928933024406433, + "learning_rate": 4.774404452892531e-05, + "loss": 4.9311, + "step": 22913 + }, + { + "epoch": 0.13627604910077076, + "grad_norm": 1.912596344947815, + "learning_rate": 4.7743850618011046e-05, + "loss": 5.1982, + "step": 22914 + }, + { + "epoch": 0.13628199638405178, + "grad_norm": 1.6227478981018066, + "learning_rate": 4.774365669915714e-05, + "loss": 5.3649, + "step": 22915 + }, + { + "epoch": 0.13628794366733277, + "grad_norm": 1.8333683013916016, + "learning_rate": 4.7743462772363656e-05, + "loss": 4.7404, + "step": 22916 + }, + { + "epoch": 0.13629389095061376, + "grad_norm": 1.6802351474761963, + "learning_rate": 4.7743268837630665e-05, + "loss": 5.2044, + "step": 22917 + }, + { + "epoch": 0.13629983823389474, + "grad_norm": 1.76273775100708, + "learning_rate": 4.774307489495823e-05, + "loss": 4.7032, + "step": 22918 + }, + { + "epoch": 0.13630578551717576, + "grad_norm": 1.8272813558578491, + "learning_rate": 4.7742880944346427e-05, + "loss": 4.6324, + "step": 22919 + }, + { + "epoch": 0.13631173280045675, + "grad_norm": 2.327012777328491, + "learning_rate": 4.7742686985795316e-05, + "loss": 4.3851, + "step": 22920 + }, + { + "epoch": 0.13631768008373774, + "grad_norm": 2.035224199295044, + "learning_rate": 4.7742493019304965e-05, + "loss": 4.2965, + "step": 22921 + }, + { + "epoch": 0.13632362736701875, + "grad_norm": 2.3920044898986816, + "learning_rate": 4.774229904487546e-05, + "loss": 4.237, + "step": 22922 + }, + { + "epoch": 0.13632957465029974, + "grad_norm": 2.3279507160186768, + "learning_rate": 4.7742105062506835e-05, + "loss": 4.3676, + "step": 22923 + }, + { + "epoch": 0.13633552193358073, + "grad_norm": 2.360509157180786, + "learning_rate": 4.7741911072199185e-05, + "loss": 4.1116, + "step": 22924 + }, + { + "epoch": 0.13634146921686174, + "grad_norm": 2.3977739810943604, + "learning_rate": 4.7741717073952573e-05, + "loss": 4.4254, + "step": 22925 + }, + { + "epoch": 0.13634741650014273, + "grad_norm": 2.2043890953063965, + "learning_rate": 4.774152306776706e-05, + "loss": 4.3602, + "step": 22926 + }, + { + "epoch": 0.13635336378342372, + "grad_norm": 2.264444589614868, + "learning_rate": 4.7741329053642714e-05, + "loss": 4.3561, + "step": 22927 + }, + { + "epoch": 0.13635931106670474, + "grad_norm": 1.9636424779891968, + "learning_rate": 4.7741135031579596e-05, + "loss": 4.9631, + "step": 22928 + }, + { + "epoch": 0.13636525834998572, + "grad_norm": 1.9803466796875, + "learning_rate": 4.77409410015778e-05, + "loss": 4.4919, + "step": 22929 + }, + { + "epoch": 0.1363712056332667, + "grad_norm": 2.3046467304229736, + "learning_rate": 4.774074696363736e-05, + "loss": 4.7812, + "step": 22930 + }, + { + "epoch": 0.13637715291654773, + "grad_norm": 1.8447179794311523, + "learning_rate": 4.774055291775837e-05, + "loss": 4.7631, + "step": 22931 + }, + { + "epoch": 0.13638310019982872, + "grad_norm": 1.7349412441253662, + "learning_rate": 4.774035886394089e-05, + "loss": 4.7341, + "step": 22932 + }, + { + "epoch": 0.1363890474831097, + "grad_norm": 1.751775860786438, + "learning_rate": 4.774016480218498e-05, + "loss": 4.9051, + "step": 22933 + }, + { + "epoch": 0.13639499476639072, + "grad_norm": 1.6568492650985718, + "learning_rate": 4.773997073249071e-05, + "loss": 4.9236, + "step": 22934 + }, + { + "epoch": 0.1364009420496717, + "grad_norm": 1.6315816640853882, + "learning_rate": 4.773977665485816e-05, + "loss": 5.0631, + "step": 22935 + }, + { + "epoch": 0.1364068893329527, + "grad_norm": 1.7680082321166992, + "learning_rate": 4.773958256928739e-05, + "loss": 4.7632, + "step": 22936 + }, + { + "epoch": 0.1364128366162337, + "grad_norm": 1.656140923500061, + "learning_rate": 4.773938847577846e-05, + "loss": 4.7978, + "step": 22937 + }, + { + "epoch": 0.1364187838995147, + "grad_norm": 1.9236876964569092, + "learning_rate": 4.773919437433144e-05, + "loss": 4.5575, + "step": 22938 + }, + { + "epoch": 0.1364247311827957, + "grad_norm": 1.98481023311615, + "learning_rate": 4.773900026494641e-05, + "loss": 4.4456, + "step": 22939 + }, + { + "epoch": 0.1364306784660767, + "grad_norm": 1.494399070739746, + "learning_rate": 4.773880614762343e-05, + "loss": 5.3057, + "step": 22940 + }, + { + "epoch": 0.1364366257493577, + "grad_norm": 1.972229242324829, + "learning_rate": 4.773861202236257e-05, + "loss": 4.3849, + "step": 22941 + }, + { + "epoch": 0.13644257303263868, + "grad_norm": 2.0766615867614746, + "learning_rate": 4.773841788916389e-05, + "loss": 4.4249, + "step": 22942 + }, + { + "epoch": 0.1364485203159197, + "grad_norm": 1.9418238401412964, + "learning_rate": 4.773822374802747e-05, + "loss": 4.577, + "step": 22943 + }, + { + "epoch": 0.13645446759920069, + "grad_norm": 2.066725254058838, + "learning_rate": 4.773802959895336e-05, + "loss": 4.3563, + "step": 22944 + }, + { + "epoch": 0.13646041488248167, + "grad_norm": 2.948639154434204, + "learning_rate": 4.773783544194165e-05, + "loss": 3.2644, + "step": 22945 + }, + { + "epoch": 0.1364663621657627, + "grad_norm": 2.065586805343628, + "learning_rate": 4.7737641276992385e-05, + "loss": 5.9715, + "step": 22946 + }, + { + "epoch": 0.13647230944904368, + "grad_norm": 2.169130325317383, + "learning_rate": 4.7737447104105645e-05, + "loss": 4.9516, + "step": 22947 + }, + { + "epoch": 0.13647825673232467, + "grad_norm": 2.4133553504943848, + "learning_rate": 4.773725292328151e-05, + "loss": 5.2266, + "step": 22948 + }, + { + "epoch": 0.13648420401560568, + "grad_norm": 2.4718146324157715, + "learning_rate": 4.773705873452002e-05, + "loss": 5.1842, + "step": 22949 + }, + { + "epoch": 0.13649015129888667, + "grad_norm": 1.8822194337844849, + "learning_rate": 4.773686453782127e-05, + "loss": 4.9297, + "step": 22950 + }, + { + "epoch": 0.13649609858216766, + "grad_norm": 1.8627861738204956, + "learning_rate": 4.773667033318531e-05, + "loss": 4.682, + "step": 22951 + }, + { + "epoch": 0.13650204586544867, + "grad_norm": 2.1915957927703857, + "learning_rate": 4.773647612061222e-05, + "loss": 4.5292, + "step": 22952 + }, + { + "epoch": 0.13650799314872966, + "grad_norm": 2.182401657104492, + "learning_rate": 4.773628190010205e-05, + "loss": 4.6416, + "step": 22953 + }, + { + "epoch": 0.13651394043201065, + "grad_norm": 2.020988941192627, + "learning_rate": 4.773608767165488e-05, + "loss": 4.5698, + "step": 22954 + }, + { + "epoch": 0.13651988771529167, + "grad_norm": 1.5788037776947021, + "learning_rate": 4.773589343527078e-05, + "loss": 5.0962, + "step": 22955 + }, + { + "epoch": 0.13652583499857265, + "grad_norm": 1.929002285003662, + "learning_rate": 4.773569919094982e-05, + "loss": 4.7789, + "step": 22956 + }, + { + "epoch": 0.13653178228185364, + "grad_norm": 1.4314018487930298, + "learning_rate": 4.773550493869206e-05, + "loss": 5.1814, + "step": 22957 + }, + { + "epoch": 0.13653772956513466, + "grad_norm": 1.3779473304748535, + "learning_rate": 4.7735310678497566e-05, + "loss": 5.3468, + "step": 22958 + }, + { + "epoch": 0.13654367684841565, + "grad_norm": 1.543843150138855, + "learning_rate": 4.773511641036641e-05, + "loss": 5.2539, + "step": 22959 + }, + { + "epoch": 0.13654962413169663, + "grad_norm": 1.3671090602874756, + "learning_rate": 4.773492213429866e-05, + "loss": 5.2174, + "step": 22960 + }, + { + "epoch": 0.13655557141497765, + "grad_norm": 1.6130348443984985, + "learning_rate": 4.7734727850294386e-05, + "loss": 5.3554, + "step": 22961 + }, + { + "epoch": 0.13656151869825864, + "grad_norm": 1.4536763429641724, + "learning_rate": 4.773453355835365e-05, + "loss": 5.1686, + "step": 22962 + }, + { + "epoch": 0.13656746598153963, + "grad_norm": 1.4020705223083496, + "learning_rate": 4.773433925847652e-05, + "loss": 5.1832, + "step": 22963 + }, + { + "epoch": 0.13657341326482064, + "grad_norm": 1.5963356494903564, + "learning_rate": 4.773414495066308e-05, + "loss": 5.2799, + "step": 22964 + }, + { + "epoch": 0.13657936054810163, + "grad_norm": 1.235477328300476, + "learning_rate": 4.773395063491338e-05, + "loss": 5.3078, + "step": 22965 + }, + { + "epoch": 0.13658530783138262, + "grad_norm": 1.658551812171936, + "learning_rate": 4.7733756311227484e-05, + "loss": 4.8935, + "step": 22966 + }, + { + "epoch": 0.13659125511466363, + "grad_norm": 1.3750555515289307, + "learning_rate": 4.773356197960548e-05, + "loss": 5.4716, + "step": 22967 + }, + { + "epoch": 0.13659720239794462, + "grad_norm": 1.368320107460022, + "learning_rate": 4.773336764004742e-05, + "loss": 5.3549, + "step": 22968 + }, + { + "epoch": 0.1366031496812256, + "grad_norm": 1.6175824403762817, + "learning_rate": 4.773317329255337e-05, + "loss": 5.6482, + "step": 22969 + }, + { + "epoch": 0.13660909696450663, + "grad_norm": 1.5855069160461426, + "learning_rate": 4.7732978937123404e-05, + "loss": 4.8048, + "step": 22970 + }, + { + "epoch": 0.13661504424778761, + "grad_norm": 1.2763618230819702, + "learning_rate": 4.77327845737576e-05, + "loss": 5.3114, + "step": 22971 + }, + { + "epoch": 0.1366209915310686, + "grad_norm": 1.296797275543213, + "learning_rate": 4.773259020245601e-05, + "loss": 5.2154, + "step": 22972 + }, + { + "epoch": 0.13662693881434962, + "grad_norm": 1.6255276203155518, + "learning_rate": 4.7732395823218714e-05, + "loss": 4.7173, + "step": 22973 + }, + { + "epoch": 0.1366328860976306, + "grad_norm": 1.6712839603424072, + "learning_rate": 4.7732201436045764e-05, + "loss": 4.7129, + "step": 22974 + }, + { + "epoch": 0.1366388333809116, + "grad_norm": 1.3639626502990723, + "learning_rate": 4.773200704093724e-05, + "loss": 5.3616, + "step": 22975 + }, + { + "epoch": 0.13664478066419258, + "grad_norm": 1.5322916507720947, + "learning_rate": 4.773181263789321e-05, + "loss": 4.9117, + "step": 22976 + }, + { + "epoch": 0.1366507279474736, + "grad_norm": 1.5231655836105347, + "learning_rate": 4.7731618226913735e-05, + "loss": 5.3278, + "step": 22977 + }, + { + "epoch": 0.1366566752307546, + "grad_norm": 1.610016942024231, + "learning_rate": 4.7731423807998896e-05, + "loss": 4.8782, + "step": 22978 + }, + { + "epoch": 0.13666262251403558, + "grad_norm": 1.578951358795166, + "learning_rate": 4.773122938114875e-05, + "loss": 5.4874, + "step": 22979 + }, + { + "epoch": 0.1366685697973166, + "grad_norm": 1.7087042331695557, + "learning_rate": 4.773103494636335e-05, + "loss": 5.4259, + "step": 22980 + }, + { + "epoch": 0.13667451708059758, + "grad_norm": 1.4179787635803223, + "learning_rate": 4.773084050364279e-05, + "loss": 5.3227, + "step": 22981 + }, + { + "epoch": 0.13668046436387857, + "grad_norm": 1.6982066631317139, + "learning_rate": 4.773064605298714e-05, + "loss": 4.9789, + "step": 22982 + }, + { + "epoch": 0.13668641164715958, + "grad_norm": 1.6331787109375, + "learning_rate": 4.773045159439644e-05, + "loss": 5.3524, + "step": 22983 + }, + { + "epoch": 0.13669235893044057, + "grad_norm": 1.5722705125808716, + "learning_rate": 4.773025712787078e-05, + "loss": 5.2852, + "step": 22984 + }, + { + "epoch": 0.13669830621372156, + "grad_norm": 1.553524136543274, + "learning_rate": 4.773006265341023e-05, + "loss": 5.3803, + "step": 22985 + }, + { + "epoch": 0.13670425349700258, + "grad_norm": 1.6696399450302124, + "learning_rate": 4.772986817101484e-05, + "loss": 5.1719, + "step": 22986 + }, + { + "epoch": 0.13671020078028356, + "grad_norm": 1.468403935432434, + "learning_rate": 4.772967368068469e-05, + "loss": 5.3468, + "step": 22987 + }, + { + "epoch": 0.13671614806356455, + "grad_norm": 1.5586446523666382, + "learning_rate": 4.772947918241985e-05, + "loss": 5.3733, + "step": 22988 + }, + { + "epoch": 0.13672209534684557, + "grad_norm": 1.549392819404602, + "learning_rate": 4.7729284676220385e-05, + "loss": 5.4622, + "step": 22989 + }, + { + "epoch": 0.13672804263012656, + "grad_norm": 1.4469774961471558, + "learning_rate": 4.772909016208636e-05, + "loss": 5.3998, + "step": 22990 + }, + { + "epoch": 0.13673398991340754, + "grad_norm": 1.3361252546310425, + "learning_rate": 4.7728895640017833e-05, + "loss": 5.1723, + "step": 22991 + }, + { + "epoch": 0.13673993719668856, + "grad_norm": 1.5584652423858643, + "learning_rate": 4.7728701110014894e-05, + "loss": 5.03, + "step": 22992 + }, + { + "epoch": 0.13674588447996955, + "grad_norm": 1.319245457649231, + "learning_rate": 4.7728506572077594e-05, + "loss": 5.0349, + "step": 22993 + }, + { + "epoch": 0.13675183176325054, + "grad_norm": 1.6574468612670898, + "learning_rate": 4.7728312026206015e-05, + "loss": 5.3401, + "step": 22994 + }, + { + "epoch": 0.13675777904653155, + "grad_norm": 1.564598560333252, + "learning_rate": 4.772811747240022e-05, + "loss": 5.3047, + "step": 22995 + }, + { + "epoch": 0.13676372632981254, + "grad_norm": 1.5692095756530762, + "learning_rate": 4.772792291066026e-05, + "loss": 5.1632, + "step": 22996 + }, + { + "epoch": 0.13676967361309353, + "grad_norm": 1.3904811143875122, + "learning_rate": 4.772772834098622e-05, + "loss": 5.2429, + "step": 22997 + }, + { + "epoch": 0.13677562089637454, + "grad_norm": 1.6455345153808594, + "learning_rate": 4.7727533763378175e-05, + "loss": 5.164, + "step": 22998 + }, + { + "epoch": 0.13678156817965553, + "grad_norm": 1.384092092514038, + "learning_rate": 4.772733917783618e-05, + "loss": 4.9753, + "step": 22999 + }, + { + "epoch": 0.13678751546293652, + "grad_norm": 1.5056332349777222, + "learning_rate": 4.77271445843603e-05, + "loss": 5.008, + "step": 23000 + }, + { + "epoch": 0.13679346274621754, + "grad_norm": 1.6766334772109985, + "learning_rate": 4.772694998295061e-05, + "loss": 5.2156, + "step": 23001 + }, + { + "epoch": 0.13679941002949852, + "grad_norm": 1.517899513244629, + "learning_rate": 4.772675537360718e-05, + "loss": 5.4637, + "step": 23002 + }, + { + "epoch": 0.1368053573127795, + "grad_norm": 1.539090633392334, + "learning_rate": 4.772656075633007e-05, + "loss": 4.9678, + "step": 23003 + }, + { + "epoch": 0.13681130459606053, + "grad_norm": 1.5403459072113037, + "learning_rate": 4.772636613111936e-05, + "loss": 5.1884, + "step": 23004 + }, + { + "epoch": 0.13681725187934152, + "grad_norm": 1.4680373668670654, + "learning_rate": 4.7726171497975106e-05, + "loss": 5.118, + "step": 23005 + }, + { + "epoch": 0.1368231991626225, + "grad_norm": 1.6800905466079712, + "learning_rate": 4.7725976856897376e-05, + "loss": 5.5796, + "step": 23006 + }, + { + "epoch": 0.13682914644590352, + "grad_norm": 1.6708084344863892, + "learning_rate": 4.7725782207886246e-05, + "loss": 4.8021, + "step": 23007 + }, + { + "epoch": 0.1368350937291845, + "grad_norm": 1.3744218349456787, + "learning_rate": 4.772558755094177e-05, + "loss": 5.2993, + "step": 23008 + }, + { + "epoch": 0.1368410410124655, + "grad_norm": 1.6822494268417358, + "learning_rate": 4.772539288606405e-05, + "loss": 4.8643, + "step": 23009 + }, + { + "epoch": 0.1368469882957465, + "grad_norm": 1.7003953456878662, + "learning_rate": 4.772519821325311e-05, + "loss": 5.0189, + "step": 23010 + }, + { + "epoch": 0.1368529355790275, + "grad_norm": 1.5518492460250854, + "learning_rate": 4.772500353250905e-05, + "loss": 5.2159, + "step": 23011 + }, + { + "epoch": 0.1368588828623085, + "grad_norm": 1.64122474193573, + "learning_rate": 4.772480884383191e-05, + "loss": 4.8965, + "step": 23012 + }, + { + "epoch": 0.1368648301455895, + "grad_norm": 1.6162265539169312, + "learning_rate": 4.772461414722179e-05, + "loss": 5.1521, + "step": 23013 + }, + { + "epoch": 0.1368707774288705, + "grad_norm": 1.7200851440429688, + "learning_rate": 4.7724419442678736e-05, + "loss": 5.1694, + "step": 23014 + }, + { + "epoch": 0.13687672471215148, + "grad_norm": 1.4717456102371216, + "learning_rate": 4.772422473020283e-05, + "loss": 5.1999, + "step": 23015 + }, + { + "epoch": 0.1368826719954325, + "grad_norm": 1.6320459842681885, + "learning_rate": 4.772403000979412e-05, + "loss": 4.9127, + "step": 23016 + }, + { + "epoch": 0.13688861927871349, + "grad_norm": 1.5466574430465698, + "learning_rate": 4.772383528145269e-05, + "loss": 5.0589, + "step": 23017 + }, + { + "epoch": 0.13689456656199447, + "grad_norm": 1.7745109796524048, + "learning_rate": 4.77236405451786e-05, + "loss": 4.9933, + "step": 23018 + }, + { + "epoch": 0.1369005138452755, + "grad_norm": 1.4493471384048462, + "learning_rate": 4.772344580097193e-05, + "loss": 5.0996, + "step": 23019 + }, + { + "epoch": 0.13690646112855648, + "grad_norm": 1.4859240055084229, + "learning_rate": 4.7723251048832734e-05, + "loss": 5.2686, + "step": 23020 + }, + { + "epoch": 0.13691240841183747, + "grad_norm": 1.6349207162857056, + "learning_rate": 4.7723056288761084e-05, + "loss": 5.1644, + "step": 23021 + }, + { + "epoch": 0.13691835569511848, + "grad_norm": 1.5266002416610718, + "learning_rate": 4.772286152075706e-05, + "loss": 4.988, + "step": 23022 + }, + { + "epoch": 0.13692430297839947, + "grad_norm": 1.592774748802185, + "learning_rate": 4.772266674482071e-05, + "loss": 5.2701, + "step": 23023 + }, + { + "epoch": 0.13693025026168046, + "grad_norm": 1.4789998531341553, + "learning_rate": 4.772247196095211e-05, + "loss": 5.1181, + "step": 23024 + }, + { + "epoch": 0.13693619754496147, + "grad_norm": 1.4374842643737793, + "learning_rate": 4.772227716915134e-05, + "loss": 4.5882, + "step": 23025 + }, + { + "epoch": 0.13694214482824246, + "grad_norm": 1.682689905166626, + "learning_rate": 4.772208236941845e-05, + "loss": 5.5035, + "step": 23026 + }, + { + "epoch": 0.13694809211152345, + "grad_norm": 1.5549851655960083, + "learning_rate": 4.772188756175352e-05, + "loss": 5.5484, + "step": 23027 + }, + { + "epoch": 0.13695403939480447, + "grad_norm": 1.5018965005874634, + "learning_rate": 4.772169274615661e-05, + "loss": 5.0517, + "step": 23028 + }, + { + "epoch": 0.13695998667808545, + "grad_norm": 1.648807168006897, + "learning_rate": 4.77214979226278e-05, + "loss": 5.1527, + "step": 23029 + }, + { + "epoch": 0.13696593396136644, + "grad_norm": 1.6059378385543823, + "learning_rate": 4.772130309116714e-05, + "loss": 5.0003, + "step": 23030 + }, + { + "epoch": 0.13697188124464746, + "grad_norm": 1.368412971496582, + "learning_rate": 4.772110825177472e-05, + "loss": 5.1025, + "step": 23031 + }, + { + "epoch": 0.13697782852792845, + "grad_norm": 1.627031922340393, + "learning_rate": 4.7720913404450576e-05, + "loss": 5.1206, + "step": 23032 + }, + { + "epoch": 0.13698377581120944, + "grad_norm": 1.654307246208191, + "learning_rate": 4.772071854919481e-05, + "loss": 4.8403, + "step": 23033 + }, + { + "epoch": 0.13698972309449042, + "grad_norm": 1.658847451210022, + "learning_rate": 4.772052368600748e-05, + "loss": 5.2089, + "step": 23034 + }, + { + "epoch": 0.13699567037777144, + "grad_norm": 1.6999456882476807, + "learning_rate": 4.772032881488864e-05, + "loss": 5.2022, + "step": 23035 + }, + { + "epoch": 0.13700161766105243, + "grad_norm": 1.2880116701126099, + "learning_rate": 4.772013393583837e-05, + "loss": 5.4331, + "step": 23036 + }, + { + "epoch": 0.13700756494433342, + "grad_norm": 1.4780634641647339, + "learning_rate": 4.7719939048856735e-05, + "loss": 5.034, + "step": 23037 + }, + { + "epoch": 0.13701351222761443, + "grad_norm": 1.5058658123016357, + "learning_rate": 4.771974415394381e-05, + "loss": 5.4403, + "step": 23038 + }, + { + "epoch": 0.13701945951089542, + "grad_norm": 1.4378021955490112, + "learning_rate": 4.771954925109965e-05, + "loss": 5.1769, + "step": 23039 + }, + { + "epoch": 0.1370254067941764, + "grad_norm": 1.6010862588882446, + "learning_rate": 4.7719354340324337e-05, + "loss": 5.4728, + "step": 23040 + }, + { + "epoch": 0.13703135407745742, + "grad_norm": 1.6916764974594116, + "learning_rate": 4.7719159421617924e-05, + "loss": 4.9489, + "step": 23041 + }, + { + "epoch": 0.1370373013607384, + "grad_norm": 1.4737353324890137, + "learning_rate": 4.771896449498049e-05, + "loss": 4.8795, + "step": 23042 + }, + { + "epoch": 0.1370432486440194, + "grad_norm": 1.5808194875717163, + "learning_rate": 4.7718769560412105e-05, + "loss": 4.8375, + "step": 23043 + }, + { + "epoch": 0.13704919592730042, + "grad_norm": 1.3700640201568604, + "learning_rate": 4.771857461791283e-05, + "loss": 4.8135, + "step": 23044 + }, + { + "epoch": 0.1370551432105814, + "grad_norm": 1.1938998699188232, + "learning_rate": 4.7718379667482735e-05, + "loss": 4.8199, + "step": 23045 + }, + { + "epoch": 0.1370610904938624, + "grad_norm": 1.3598859310150146, + "learning_rate": 4.7718184709121885e-05, + "loss": 4.6871, + "step": 23046 + }, + { + "epoch": 0.1370670377771434, + "grad_norm": 1.2303695678710938, + "learning_rate": 4.7717989742830354e-05, + "loss": 4.7421, + "step": 23047 + }, + { + "epoch": 0.1370729850604244, + "grad_norm": 1.2872051000595093, + "learning_rate": 4.77177947686082e-05, + "loss": 4.6669, + "step": 23048 + }, + { + "epoch": 0.13707893234370538, + "grad_norm": 1.2523759603500366, + "learning_rate": 4.771759978645551e-05, + "loss": 4.6359, + "step": 23049 + }, + { + "epoch": 0.1370848796269864, + "grad_norm": 1.2552485466003418, + "learning_rate": 4.771740479637234e-05, + "loss": 4.7362, + "step": 23050 + }, + { + "epoch": 0.1370908269102674, + "grad_norm": 1.434870958328247, + "learning_rate": 4.771720979835875e-05, + "loss": 4.812, + "step": 23051 + }, + { + "epoch": 0.13709677419354838, + "grad_norm": 1.6004719734191895, + "learning_rate": 4.771701479241483e-05, + "loss": 5.1579, + "step": 23052 + }, + { + "epoch": 0.1371027214768294, + "grad_norm": 1.5982462167739868, + "learning_rate": 4.7716819778540625e-05, + "loss": 4.8038, + "step": 23053 + }, + { + "epoch": 0.13710866876011038, + "grad_norm": 1.7509288787841797, + "learning_rate": 4.7716624756736215e-05, + "loss": 5.091, + "step": 23054 + }, + { + "epoch": 0.13711461604339137, + "grad_norm": 1.729748010635376, + "learning_rate": 4.7716429727001665e-05, + "loss": 5.0755, + "step": 23055 + }, + { + "epoch": 0.13712056332667238, + "grad_norm": 1.6167495250701904, + "learning_rate": 4.7716234689337044e-05, + "loss": 5.0602, + "step": 23056 + }, + { + "epoch": 0.13712651060995337, + "grad_norm": 1.7035753726959229, + "learning_rate": 4.771603964374242e-05, + "loss": 5.1877, + "step": 23057 + }, + { + "epoch": 0.13713245789323436, + "grad_norm": 1.5923055410385132, + "learning_rate": 4.7715844590217865e-05, + "loss": 4.6043, + "step": 23058 + }, + { + "epoch": 0.13713840517651538, + "grad_norm": 1.551894187927246, + "learning_rate": 4.771564952876344e-05, + "loss": 5.0746, + "step": 23059 + }, + { + "epoch": 0.13714435245979636, + "grad_norm": 1.8965848684310913, + "learning_rate": 4.771545445937921e-05, + "loss": 4.6152, + "step": 23060 + }, + { + "epoch": 0.13715029974307735, + "grad_norm": 1.630903720855713, + "learning_rate": 4.771525938206527e-05, + "loss": 5.3948, + "step": 23061 + }, + { + "epoch": 0.13715624702635837, + "grad_norm": 1.7285772562026978, + "learning_rate": 4.771506429682166e-05, + "loss": 5.2535, + "step": 23062 + }, + { + "epoch": 0.13716219430963936, + "grad_norm": 1.789049506187439, + "learning_rate": 4.771486920364844e-05, + "loss": 4.7232, + "step": 23063 + }, + { + "epoch": 0.13716814159292035, + "grad_norm": 1.6774955987930298, + "learning_rate": 4.7714674102545706e-05, + "loss": 5.424, + "step": 23064 + }, + { + "epoch": 0.13717408887620136, + "grad_norm": 1.9038479328155518, + "learning_rate": 4.771447899351351e-05, + "loss": 4.7868, + "step": 23065 + }, + { + "epoch": 0.13718003615948235, + "grad_norm": 1.906087875366211, + "learning_rate": 4.771428387655192e-05, + "loss": 4.7115, + "step": 23066 + }, + { + "epoch": 0.13718598344276334, + "grad_norm": 1.786908745765686, + "learning_rate": 4.771408875166103e-05, + "loss": 4.6741, + "step": 23067 + }, + { + "epoch": 0.13719193072604435, + "grad_norm": 1.8421779870986938, + "learning_rate": 4.771389361884086e-05, + "loss": 4.9338, + "step": 23068 + }, + { + "epoch": 0.13719787800932534, + "grad_norm": 1.8146562576293945, + "learning_rate": 4.7713698478091516e-05, + "loss": 4.5556, + "step": 23069 + }, + { + "epoch": 0.13720382529260633, + "grad_norm": 1.4694918394088745, + "learning_rate": 4.7713503329413056e-05, + "loss": 5.611, + "step": 23070 + }, + { + "epoch": 0.13720977257588735, + "grad_norm": 1.553694248199463, + "learning_rate": 4.771330817280554e-05, + "loss": 5.6062, + "step": 23071 + }, + { + "epoch": 0.13721571985916833, + "grad_norm": 1.293204426765442, + "learning_rate": 4.771311300826905e-05, + "loss": 5.7157, + "step": 23072 + }, + { + "epoch": 0.13722166714244932, + "grad_norm": 1.369480013847351, + "learning_rate": 4.771291783580364e-05, + "loss": 5.754, + "step": 23073 + }, + { + "epoch": 0.13722761442573034, + "grad_norm": 1.7480628490447998, + "learning_rate": 4.771272265540939e-05, + "loss": 5.4179, + "step": 23074 + }, + { + "epoch": 0.13723356170901133, + "grad_norm": 1.604788064956665, + "learning_rate": 4.771252746708636e-05, + "loss": 5.3766, + "step": 23075 + }, + { + "epoch": 0.13723950899229231, + "grad_norm": 1.721793532371521, + "learning_rate": 4.7712332270834626e-05, + "loss": 4.9839, + "step": 23076 + }, + { + "epoch": 0.13724545627557333, + "grad_norm": 1.528327226638794, + "learning_rate": 4.771213706665425e-05, + "loss": 5.427, + "step": 23077 + }, + { + "epoch": 0.13725140355885432, + "grad_norm": 1.425625205039978, + "learning_rate": 4.7711941854545295e-05, + "loss": 5.6246, + "step": 23078 + }, + { + "epoch": 0.1372573508421353, + "grad_norm": 1.9369326829910278, + "learning_rate": 4.771174663450784e-05, + "loss": 5.5038, + "step": 23079 + }, + { + "epoch": 0.13726329812541632, + "grad_norm": 1.906792163848877, + "learning_rate": 4.771155140654195e-05, + "loss": 5.5361, + "step": 23080 + }, + { + "epoch": 0.1372692454086973, + "grad_norm": 1.7495099306106567, + "learning_rate": 4.7711356170647694e-05, + "loss": 4.9809, + "step": 23081 + }, + { + "epoch": 0.1372751926919783, + "grad_norm": 1.5589921474456787, + "learning_rate": 4.771116092682514e-05, + "loss": 4.9627, + "step": 23082 + }, + { + "epoch": 0.13728113997525931, + "grad_norm": 1.7177824974060059, + "learning_rate": 4.771096567507435e-05, + "loss": 5.403, + "step": 23083 + }, + { + "epoch": 0.1372870872585403, + "grad_norm": 1.5470298528671265, + "learning_rate": 4.7710770415395395e-05, + "loss": 6.0237, + "step": 23084 + }, + { + "epoch": 0.1372930345418213, + "grad_norm": 1.5613659620285034, + "learning_rate": 4.771057514778835e-05, + "loss": 5.7272, + "step": 23085 + }, + { + "epoch": 0.1372989818251023, + "grad_norm": 1.7003729343414307, + "learning_rate": 4.771037987225328e-05, + "loss": 4.9305, + "step": 23086 + }, + { + "epoch": 0.1373049291083833, + "grad_norm": 2.587393283843994, + "learning_rate": 4.771018458879023e-05, + "loss": 4.9075, + "step": 23087 + }, + { + "epoch": 0.13731087639166428, + "grad_norm": 2.208308696746826, + "learning_rate": 4.770998929739931e-05, + "loss": 4.9141, + "step": 23088 + }, + { + "epoch": 0.1373168236749453, + "grad_norm": 2.0532326698303223, + "learning_rate": 4.770979399808057e-05, + "loss": 5.0574, + "step": 23089 + }, + { + "epoch": 0.1373227709582263, + "grad_norm": 1.86672043800354, + "learning_rate": 4.770959869083406e-05, + "loss": 4.9269, + "step": 23090 + }, + { + "epoch": 0.13732871824150727, + "grad_norm": 1.8310163021087646, + "learning_rate": 4.7709403375659874e-05, + "loss": 4.901, + "step": 23091 + }, + { + "epoch": 0.13733466552478826, + "grad_norm": 1.8886011838912964, + "learning_rate": 4.7709208052558065e-05, + "loss": 4.8325, + "step": 23092 + }, + { + "epoch": 0.13734061280806928, + "grad_norm": 1.9192320108413696, + "learning_rate": 4.770901272152871e-05, + "loss": 4.8783, + "step": 23093 + }, + { + "epoch": 0.13734656009135027, + "grad_norm": 2.0797886848449707, + "learning_rate": 4.770881738257187e-05, + "loss": 4.6473, + "step": 23094 + }, + { + "epoch": 0.13735250737463126, + "grad_norm": 2.2008140087127686, + "learning_rate": 4.770862203568762e-05, + "loss": 4.7291, + "step": 23095 + }, + { + "epoch": 0.13735845465791227, + "grad_norm": 2.002549886703491, + "learning_rate": 4.770842668087602e-05, + "loss": 4.5471, + "step": 23096 + }, + { + "epoch": 0.13736440194119326, + "grad_norm": 1.7748942375183105, + "learning_rate": 4.770823131813714e-05, + "loss": 4.5844, + "step": 23097 + }, + { + "epoch": 0.13737034922447425, + "grad_norm": 2.128469467163086, + "learning_rate": 4.7708035947471065e-05, + "loss": 4.7365, + "step": 23098 + }, + { + "epoch": 0.13737629650775526, + "grad_norm": 1.9279344081878662, + "learning_rate": 4.770784056887784e-05, + "loss": 4.5673, + "step": 23099 + }, + { + "epoch": 0.13738224379103625, + "grad_norm": 1.896638035774231, + "learning_rate": 4.770764518235754e-05, + "loss": 4.5956, + "step": 23100 + }, + { + "epoch": 0.13738819107431724, + "grad_norm": 2.4768176078796387, + "learning_rate": 4.770744978791024e-05, + "loss": 4.5071, + "step": 23101 + }, + { + "epoch": 0.13739413835759826, + "grad_norm": 2.0828697681427, + "learning_rate": 4.7707254385536e-05, + "loss": 4.5681, + "step": 23102 + }, + { + "epoch": 0.13740008564087924, + "grad_norm": 2.197688579559326, + "learning_rate": 4.7707058975234895e-05, + "loss": 4.5111, + "step": 23103 + }, + { + "epoch": 0.13740603292416023, + "grad_norm": 2.0053935050964355, + "learning_rate": 4.7706863557007e-05, + "loss": 4.5441, + "step": 23104 + }, + { + "epoch": 0.13741198020744125, + "grad_norm": 2.247901439666748, + "learning_rate": 4.770666813085236e-05, + "loss": 4.5538, + "step": 23105 + }, + { + "epoch": 0.13741792749072224, + "grad_norm": 2.1666789054870605, + "learning_rate": 4.770647269677106e-05, + "loss": 4.7712, + "step": 23106 + }, + { + "epoch": 0.13742387477400322, + "grad_norm": 2.0191304683685303, + "learning_rate": 4.770627725476317e-05, + "loss": 4.5244, + "step": 23107 + }, + { + "epoch": 0.13742982205728424, + "grad_norm": 1.9388200044631958, + "learning_rate": 4.770608180482874e-05, + "loss": 4.6272, + "step": 23108 + }, + { + "epoch": 0.13743576934056523, + "grad_norm": 2.0467464923858643, + "learning_rate": 4.7705886346967865e-05, + "loss": 4.5852, + "step": 23109 + }, + { + "epoch": 0.13744171662384622, + "grad_norm": 2.0310070514678955, + "learning_rate": 4.770569088118059e-05, + "loss": 4.3915, + "step": 23110 + }, + { + "epoch": 0.13744766390712723, + "grad_norm": 2.1216657161712646, + "learning_rate": 4.770549540746701e-05, + "loss": 4.4549, + "step": 23111 + }, + { + "epoch": 0.13745361119040822, + "grad_norm": 1.9715701341629028, + "learning_rate": 4.770529992582715e-05, + "loss": 4.8822, + "step": 23112 + }, + { + "epoch": 0.1374595584736892, + "grad_norm": 2.0956320762634277, + "learning_rate": 4.7705104436261124e-05, + "loss": 5.3927, + "step": 23113 + }, + { + "epoch": 0.13746550575697022, + "grad_norm": 1.6396405696868896, + "learning_rate": 4.770490893876898e-05, + "loss": 5.5089, + "step": 23114 + }, + { + "epoch": 0.1374714530402512, + "grad_norm": 1.8379572629928589, + "learning_rate": 4.7704713433350777e-05, + "loss": 5.9133, + "step": 23115 + }, + { + "epoch": 0.1374774003235322, + "grad_norm": 1.6787012815475464, + "learning_rate": 4.7704517920006594e-05, + "loss": 5.4497, + "step": 23116 + }, + { + "epoch": 0.13748334760681322, + "grad_norm": 1.6657997369766235, + "learning_rate": 4.77043223987365e-05, + "loss": 5.2093, + "step": 23117 + }, + { + "epoch": 0.1374892948900942, + "grad_norm": 1.7581418752670288, + "learning_rate": 4.7704126869540565e-05, + "loss": 6.4119, + "step": 23118 + }, + { + "epoch": 0.1374952421733752, + "grad_norm": 1.4436302185058594, + "learning_rate": 4.770393133241885e-05, + "loss": 6.3299, + "step": 23119 + }, + { + "epoch": 0.1375011894566562, + "grad_norm": 1.6737406253814697, + "learning_rate": 4.7703735787371434e-05, + "loss": 5.8634, + "step": 23120 + }, + { + "epoch": 0.1375071367399372, + "grad_norm": 1.5715806484222412, + "learning_rate": 4.7703540234398375e-05, + "loss": 5.7896, + "step": 23121 + }, + { + "epoch": 0.13751308402321819, + "grad_norm": 1.8452152013778687, + "learning_rate": 4.7703344673499744e-05, + "loss": 5.8868, + "step": 23122 + }, + { + "epoch": 0.1375190313064992, + "grad_norm": 1.6291402578353882, + "learning_rate": 4.770314910467561e-05, + "loss": 5.8256, + "step": 23123 + }, + { + "epoch": 0.1375249785897802, + "grad_norm": 1.4301279783248901, + "learning_rate": 4.770295352792604e-05, + "loss": 5.7982, + "step": 23124 + }, + { + "epoch": 0.13753092587306118, + "grad_norm": 1.5949046611785889, + "learning_rate": 4.770275794325111e-05, + "loss": 5.5606, + "step": 23125 + }, + { + "epoch": 0.1375368731563422, + "grad_norm": 1.572860598564148, + "learning_rate": 4.770256235065087e-05, + "loss": 5.1636, + "step": 23126 + }, + { + "epoch": 0.13754282043962318, + "grad_norm": 1.4339121580123901, + "learning_rate": 4.7702366750125405e-05, + "loss": 5.1374, + "step": 23127 + }, + { + "epoch": 0.13754876772290417, + "grad_norm": 1.4290729761123657, + "learning_rate": 4.770217114167478e-05, + "loss": 5.7268, + "step": 23128 + }, + { + "epoch": 0.13755471500618519, + "grad_norm": 1.1300958395004272, + "learning_rate": 4.7701975525299066e-05, + "loss": 5.6887, + "step": 23129 + }, + { + "epoch": 0.13756066228946617, + "grad_norm": 1.1974701881408691, + "learning_rate": 4.7701779900998325e-05, + "loss": 5.6763, + "step": 23130 + }, + { + "epoch": 0.13756660957274716, + "grad_norm": 1.3675005435943604, + "learning_rate": 4.7701584268772614e-05, + "loss": 5.6558, + "step": 23131 + }, + { + "epoch": 0.13757255685602818, + "grad_norm": 1.3302583694458008, + "learning_rate": 4.770138862862203e-05, + "loss": 5.6915, + "step": 23132 + }, + { + "epoch": 0.13757850413930917, + "grad_norm": 1.3415045738220215, + "learning_rate": 4.770119298054662e-05, + "loss": 5.6922, + "step": 23133 + }, + { + "epoch": 0.13758445142259015, + "grad_norm": 1.229663372039795, + "learning_rate": 4.770099732454646e-05, + "loss": 5.7799, + "step": 23134 + }, + { + "epoch": 0.13759039870587117, + "grad_norm": 1.3245000839233398, + "learning_rate": 4.7700801660621614e-05, + "loss": 5.7848, + "step": 23135 + }, + { + "epoch": 0.13759634598915216, + "grad_norm": 1.2835783958435059, + "learning_rate": 4.770060598877215e-05, + "loss": 5.5999, + "step": 23136 + }, + { + "epoch": 0.13760229327243315, + "grad_norm": 1.9270732402801514, + "learning_rate": 4.770041030899814e-05, + "loss": 4.8701, + "step": 23137 + }, + { + "epoch": 0.13760824055571416, + "grad_norm": 1.8123419284820557, + "learning_rate": 4.7700214621299656e-05, + "loss": 5.3828, + "step": 23138 + }, + { + "epoch": 0.13761418783899515, + "grad_norm": 2.0436434745788574, + "learning_rate": 4.770001892567676e-05, + "loss": 4.6098, + "step": 23139 + }, + { + "epoch": 0.13762013512227614, + "grad_norm": 1.4343012571334839, + "learning_rate": 4.769982322212953e-05, + "loss": 5.5587, + "step": 23140 + }, + { + "epoch": 0.13762608240555715, + "grad_norm": 1.266640067100525, + "learning_rate": 4.769962751065801e-05, + "loss": 5.626, + "step": 23141 + }, + { + "epoch": 0.13763202968883814, + "grad_norm": 1.9386495351791382, + "learning_rate": 4.7699431791262296e-05, + "loss": 4.7212, + "step": 23142 + }, + { + "epoch": 0.13763797697211913, + "grad_norm": 2.270129919052124, + "learning_rate": 4.769923606394244e-05, + "loss": 4.7609, + "step": 23143 + }, + { + "epoch": 0.13764392425540015, + "grad_norm": 2.0305488109588623, + "learning_rate": 4.7699040328698516e-05, + "loss": 4.8083, + "step": 23144 + }, + { + "epoch": 0.13764987153868113, + "grad_norm": 2.1791486740112305, + "learning_rate": 4.769884458553059e-05, + "loss": 4.834, + "step": 23145 + }, + { + "epoch": 0.13765581882196212, + "grad_norm": 2.152580738067627, + "learning_rate": 4.769864883443873e-05, + "loss": 4.5418, + "step": 23146 + }, + { + "epoch": 0.13766176610524314, + "grad_norm": 2.2850470542907715, + "learning_rate": 4.769845307542301e-05, + "loss": 4.9344, + "step": 23147 + }, + { + "epoch": 0.13766771338852413, + "grad_norm": 1.745813012123108, + "learning_rate": 4.76982573084835e-05, + "loss": 4.9631, + "step": 23148 + }, + { + "epoch": 0.13767366067180511, + "grad_norm": 1.5848993062973022, + "learning_rate": 4.769806153362025e-05, + "loss": 5.3936, + "step": 23149 + }, + { + "epoch": 0.1376796079550861, + "grad_norm": 1.5276480913162231, + "learning_rate": 4.7697865750833356e-05, + "loss": 5.7806, + "step": 23150 + }, + { + "epoch": 0.13768555523836712, + "grad_norm": 1.3464304208755493, + "learning_rate": 4.769766996012286e-05, + "loss": 5.5572, + "step": 23151 + }, + { + "epoch": 0.1376915025216481, + "grad_norm": 1.375168800354004, + "learning_rate": 4.769747416148885e-05, + "loss": 5.6109, + "step": 23152 + }, + { + "epoch": 0.1376974498049291, + "grad_norm": 1.3537193536758423, + "learning_rate": 4.769727835493138e-05, + "loss": 5.5257, + "step": 23153 + }, + { + "epoch": 0.1377033970882101, + "grad_norm": 1.6656006574630737, + "learning_rate": 4.769708254045053e-05, + "loss": 5.3327, + "step": 23154 + }, + { + "epoch": 0.1377093443714911, + "grad_norm": 1.6092736721038818, + "learning_rate": 4.769688671804635e-05, + "loss": 5.7785, + "step": 23155 + }, + { + "epoch": 0.1377152916547721, + "grad_norm": 1.5005303621292114, + "learning_rate": 4.7696690887718934e-05, + "loss": 5.4944, + "step": 23156 + }, + { + "epoch": 0.1377212389380531, + "grad_norm": 1.6100717782974243, + "learning_rate": 4.7696495049468336e-05, + "loss": 5.3767, + "step": 23157 + }, + { + "epoch": 0.1377271862213341, + "grad_norm": 1.5637480020523071, + "learning_rate": 4.7696299203294626e-05, + "loss": 5.3981, + "step": 23158 + }, + { + "epoch": 0.13773313350461508, + "grad_norm": 1.6407819986343384, + "learning_rate": 4.769610334919787e-05, + "loss": 5.4328, + "step": 23159 + }, + { + "epoch": 0.1377390807878961, + "grad_norm": 1.8828953504562378, + "learning_rate": 4.7695907487178146e-05, + "loss": 5.5127, + "step": 23160 + }, + { + "epoch": 0.13774502807117708, + "grad_norm": 1.5160561800003052, + "learning_rate": 4.7695711617235506e-05, + "loss": 5.3309, + "step": 23161 + }, + { + "epoch": 0.13775097535445807, + "grad_norm": 1.4901509284973145, + "learning_rate": 4.769551573937003e-05, + "loss": 5.4584, + "step": 23162 + }, + { + "epoch": 0.1377569226377391, + "grad_norm": 1.3983137607574463, + "learning_rate": 4.769531985358179e-05, + "loss": 5.6738, + "step": 23163 + }, + { + "epoch": 0.13776286992102008, + "grad_norm": 1.7664490938186646, + "learning_rate": 4.7695123959870834e-05, + "loss": 5.513, + "step": 23164 + }, + { + "epoch": 0.13776881720430106, + "grad_norm": 1.4650641679763794, + "learning_rate": 4.7694928058237255e-05, + "loss": 4.9959, + "step": 23165 + }, + { + "epoch": 0.13777476448758208, + "grad_norm": 1.5515252351760864, + "learning_rate": 4.7694732148681106e-05, + "loss": 5.1419, + "step": 23166 + }, + { + "epoch": 0.13778071177086307, + "grad_norm": 1.459083914756775, + "learning_rate": 4.769453623120247e-05, + "loss": 5.3639, + "step": 23167 + }, + { + "epoch": 0.13778665905414406, + "grad_norm": 1.6032545566558838, + "learning_rate": 4.76943403058014e-05, + "loss": 5.3822, + "step": 23168 + }, + { + "epoch": 0.13779260633742507, + "grad_norm": 1.5436428785324097, + "learning_rate": 4.769414437247797e-05, + "loss": 5.0313, + "step": 23169 + }, + { + "epoch": 0.13779855362070606, + "grad_norm": 1.2577800750732422, + "learning_rate": 4.769394843123225e-05, + "loss": 4.8907, + "step": 23170 + }, + { + "epoch": 0.13780450090398705, + "grad_norm": 1.4654191732406616, + "learning_rate": 4.769375248206431e-05, + "loss": 5.0346, + "step": 23171 + }, + { + "epoch": 0.13781044818726806, + "grad_norm": 1.9576739072799683, + "learning_rate": 4.769355652497421e-05, + "loss": 5.4, + "step": 23172 + }, + { + "epoch": 0.13781639547054905, + "grad_norm": 1.7060799598693848, + "learning_rate": 4.7693360559962027e-05, + "loss": 4.9668, + "step": 23173 + }, + { + "epoch": 0.13782234275383004, + "grad_norm": 1.4705651998519897, + "learning_rate": 4.769316458702782e-05, + "loss": 5.2053, + "step": 23174 + }, + { + "epoch": 0.13782829003711106, + "grad_norm": 1.806314468383789, + "learning_rate": 4.769296860617167e-05, + "loss": 5.5297, + "step": 23175 + }, + { + "epoch": 0.13783423732039204, + "grad_norm": 1.7741440534591675, + "learning_rate": 4.769277261739364e-05, + "loss": 5.569, + "step": 23176 + }, + { + "epoch": 0.13784018460367303, + "grad_norm": 1.4956278800964355, + "learning_rate": 4.7692576620693796e-05, + "loss": 5.2616, + "step": 23177 + }, + { + "epoch": 0.13784613188695405, + "grad_norm": 1.4668684005737305, + "learning_rate": 4.7692380616072205e-05, + "loss": 5.551, + "step": 23178 + }, + { + "epoch": 0.13785207917023504, + "grad_norm": 1.9172862768173218, + "learning_rate": 4.769218460352894e-05, + "loss": 5.072, + "step": 23179 + }, + { + "epoch": 0.13785802645351602, + "grad_norm": 2.3610761165618896, + "learning_rate": 4.769198858306407e-05, + "loss": 4.5511, + "step": 23180 + }, + { + "epoch": 0.13786397373679704, + "grad_norm": 2.099209785461426, + "learning_rate": 4.769179255467766e-05, + "loss": 5.1829, + "step": 23181 + }, + { + "epoch": 0.13786992102007803, + "grad_norm": 1.8222076892852783, + "learning_rate": 4.7691596518369776e-05, + "loss": 5.1451, + "step": 23182 + }, + { + "epoch": 0.13787586830335902, + "grad_norm": 2.129558563232422, + "learning_rate": 4.769140047414049e-05, + "loss": 4.574, + "step": 23183 + }, + { + "epoch": 0.13788181558664003, + "grad_norm": 2.3188533782958984, + "learning_rate": 4.7691204421989876e-05, + "loss": 4.4604, + "step": 23184 + }, + { + "epoch": 0.13788776286992102, + "grad_norm": 2.2996792793273926, + "learning_rate": 4.7691008361918e-05, + "loss": 4.6119, + "step": 23185 + }, + { + "epoch": 0.137893710153202, + "grad_norm": 2.164652109146118, + "learning_rate": 4.769081229392492e-05, + "loss": 4.6286, + "step": 23186 + }, + { + "epoch": 0.13789965743648303, + "grad_norm": 1.9271842241287231, + "learning_rate": 4.769061621801071e-05, + "loss": 4.947, + "step": 23187 + }, + { + "epoch": 0.137905604719764, + "grad_norm": 1.8559855222702026, + "learning_rate": 4.769042013417545e-05, + "loss": 5.1969, + "step": 23188 + }, + { + "epoch": 0.137911552003045, + "grad_norm": 1.8955408334732056, + "learning_rate": 4.769022404241919e-05, + "loss": 5.0117, + "step": 23189 + }, + { + "epoch": 0.13791749928632602, + "grad_norm": 2.333242177963257, + "learning_rate": 4.769002794274201e-05, + "loss": 4.4839, + "step": 23190 + }, + { + "epoch": 0.137923446569607, + "grad_norm": 1.6732560396194458, + "learning_rate": 4.768983183514397e-05, + "loss": 5.2458, + "step": 23191 + }, + { + "epoch": 0.137929393852888, + "grad_norm": 1.6078556776046753, + "learning_rate": 4.768963571962516e-05, + "loss": 5.616, + "step": 23192 + }, + { + "epoch": 0.137935341136169, + "grad_norm": 1.7516095638275146, + "learning_rate": 4.768943959618562e-05, + "loss": 5.3052, + "step": 23193 + }, + { + "epoch": 0.13794128841945, + "grad_norm": 1.5200318098068237, + "learning_rate": 4.7689243464825425e-05, + "loss": 5.664, + "step": 23194 + }, + { + "epoch": 0.13794723570273099, + "grad_norm": 1.3212077617645264, + "learning_rate": 4.7689047325544664e-05, + "loss": 5.4562, + "step": 23195 + }, + { + "epoch": 0.137953182986012, + "grad_norm": 1.3307675123214722, + "learning_rate": 4.7688851178343386e-05, + "loss": 5.2517, + "step": 23196 + }, + { + "epoch": 0.137959130269293, + "grad_norm": 1.5186207294464111, + "learning_rate": 4.768865502322166e-05, + "loss": 5.654, + "step": 23197 + }, + { + "epoch": 0.13796507755257398, + "grad_norm": 1.6482549905776978, + "learning_rate": 4.7688458860179564e-05, + "loss": 5.3282, + "step": 23198 + }, + { + "epoch": 0.137971024835855, + "grad_norm": 1.4418150186538696, + "learning_rate": 4.768826268921717e-05, + "loss": 5.5913, + "step": 23199 + }, + { + "epoch": 0.13797697211913598, + "grad_norm": 1.5591225624084473, + "learning_rate": 4.768806651033452e-05, + "loss": 5.8459, + "step": 23200 + }, + { + "epoch": 0.13798291940241697, + "grad_norm": 1.3476347923278809, + "learning_rate": 4.768787032353171e-05, + "loss": 5.3597, + "step": 23201 + }, + { + "epoch": 0.137988866685698, + "grad_norm": 1.4543404579162598, + "learning_rate": 4.76876741288088e-05, + "loss": 5.4525, + "step": 23202 + }, + { + "epoch": 0.13799481396897897, + "grad_norm": 1.3845150470733643, + "learning_rate": 4.7687477926165846e-05, + "loss": 5.6559, + "step": 23203 + }, + { + "epoch": 0.13800076125225996, + "grad_norm": 1.303808569908142, + "learning_rate": 4.768728171560294e-05, + "loss": 5.8732, + "step": 23204 + }, + { + "epoch": 0.13800670853554098, + "grad_norm": 1.422867774963379, + "learning_rate": 4.768708549712013e-05, + "loss": 5.217, + "step": 23205 + }, + { + "epoch": 0.13801265581882197, + "grad_norm": 1.558089017868042, + "learning_rate": 4.7686889270717506e-05, + "loss": 5.6403, + "step": 23206 + }, + { + "epoch": 0.13801860310210295, + "grad_norm": 1.5510298013687134, + "learning_rate": 4.7686693036395115e-05, + "loss": 5.6199, + "step": 23207 + }, + { + "epoch": 0.13802455038538394, + "grad_norm": 1.2693150043487549, + "learning_rate": 4.768649679415303e-05, + "loss": 5.7368, + "step": 23208 + }, + { + "epoch": 0.13803049766866496, + "grad_norm": 1.5053805112838745, + "learning_rate": 4.768630054399132e-05, + "loss": 5.4941, + "step": 23209 + }, + { + "epoch": 0.13803644495194595, + "grad_norm": 2.5151054859161377, + "learning_rate": 4.768610428591007e-05, + "loss": 4.5744, + "step": 23210 + }, + { + "epoch": 0.13804239223522694, + "grad_norm": 2.1085267066955566, + "learning_rate": 4.768590801990933e-05, + "loss": 4.5849, + "step": 23211 + }, + { + "epoch": 0.13804833951850795, + "grad_norm": 2.0741498470306396, + "learning_rate": 4.7685711745989174e-05, + "loss": 4.5745, + "step": 23212 + }, + { + "epoch": 0.13805428680178894, + "grad_norm": 2.0066654682159424, + "learning_rate": 4.7685515464149664e-05, + "loss": 4.6388, + "step": 23213 + }, + { + "epoch": 0.13806023408506993, + "grad_norm": 1.9224933385849, + "learning_rate": 4.7685319174390885e-05, + "loss": 4.5382, + "step": 23214 + }, + { + "epoch": 0.13806618136835094, + "grad_norm": 2.2363088130950928, + "learning_rate": 4.7685122876712896e-05, + "loss": 4.5825, + "step": 23215 + }, + { + "epoch": 0.13807212865163193, + "grad_norm": 2.1900362968444824, + "learning_rate": 4.768492657111576e-05, + "loss": 4.5519, + "step": 23216 + }, + { + "epoch": 0.13807807593491292, + "grad_norm": 2.0702250003814697, + "learning_rate": 4.768473025759955e-05, + "loss": 4.5917, + "step": 23217 + }, + { + "epoch": 0.13808402321819394, + "grad_norm": 2.000380277633667, + "learning_rate": 4.768453393616433e-05, + "loss": 4.8847, + "step": 23218 + }, + { + "epoch": 0.13808997050147492, + "grad_norm": 2.0710175037384033, + "learning_rate": 4.768433760681018e-05, + "loss": 4.5455, + "step": 23219 + }, + { + "epoch": 0.1380959177847559, + "grad_norm": 2.1148219108581543, + "learning_rate": 4.7684141269537165e-05, + "loss": 4.5109, + "step": 23220 + }, + { + "epoch": 0.13810186506803693, + "grad_norm": 1.7681657075881958, + "learning_rate": 4.768394492434535e-05, + "loss": 4.8899, + "step": 23221 + }, + { + "epoch": 0.13810781235131792, + "grad_norm": 2.032696008682251, + "learning_rate": 4.76837485712348e-05, + "loss": 5.2375, + "step": 23222 + }, + { + "epoch": 0.1381137596345989, + "grad_norm": 2.0016825199127197, + "learning_rate": 4.7683552210205585e-05, + "loss": 4.9066, + "step": 23223 + }, + { + "epoch": 0.13811970691787992, + "grad_norm": 2.1309103965759277, + "learning_rate": 4.7683355841257784e-05, + "loss": 4.4317, + "step": 23224 + }, + { + "epoch": 0.1381256542011609, + "grad_norm": 1.9037781953811646, + "learning_rate": 4.768315946439145e-05, + "loss": 5.0218, + "step": 23225 + }, + { + "epoch": 0.1381316014844419, + "grad_norm": 2.3080644607543945, + "learning_rate": 4.768296307960666e-05, + "loss": 5.2226, + "step": 23226 + }, + { + "epoch": 0.1381375487677229, + "grad_norm": 2.1073081493377686, + "learning_rate": 4.7682766686903494e-05, + "loss": 5.2403, + "step": 23227 + }, + { + "epoch": 0.1381434960510039, + "grad_norm": 1.7865220308303833, + "learning_rate": 4.768257028628199e-05, + "loss": 5.1642, + "step": 23228 + }, + { + "epoch": 0.1381494433342849, + "grad_norm": 1.7039834260940552, + "learning_rate": 4.768237387774225e-05, + "loss": 5.1943, + "step": 23229 + }, + { + "epoch": 0.1381553906175659, + "grad_norm": 1.714506983757019, + "learning_rate": 4.768217746128432e-05, + "loss": 5.0952, + "step": 23230 + }, + { + "epoch": 0.1381613379008469, + "grad_norm": 1.7183910608291626, + "learning_rate": 4.768198103690827e-05, + "loss": 5.0447, + "step": 23231 + }, + { + "epoch": 0.13816728518412788, + "grad_norm": 1.776077151298523, + "learning_rate": 4.768178460461419e-05, + "loss": 5.1296, + "step": 23232 + }, + { + "epoch": 0.1381732324674089, + "grad_norm": 1.7849907875061035, + "learning_rate": 4.7681588164402124e-05, + "loss": 4.7961, + "step": 23233 + }, + { + "epoch": 0.13817917975068988, + "grad_norm": 1.403860330581665, + "learning_rate": 4.768139171627216e-05, + "loss": 5.4794, + "step": 23234 + }, + { + "epoch": 0.13818512703397087, + "grad_norm": 1.5944229364395142, + "learning_rate": 4.7681195260224344e-05, + "loss": 4.973, + "step": 23235 + }, + { + "epoch": 0.1381910743172519, + "grad_norm": 2.196274518966675, + "learning_rate": 4.7680998796258764e-05, + "loss": 5.1835, + "step": 23236 + }, + { + "epoch": 0.13819702160053288, + "grad_norm": 1.5403459072113037, + "learning_rate": 4.768080232437548e-05, + "loss": 5.828, + "step": 23237 + }, + { + "epoch": 0.13820296888381386, + "grad_norm": 1.9711260795593262, + "learning_rate": 4.768060584457456e-05, + "loss": 5.4937, + "step": 23238 + }, + { + "epoch": 0.13820891616709488, + "grad_norm": 1.6869981288909912, + "learning_rate": 4.7680409356856075e-05, + "loss": 5.3298, + "step": 23239 + }, + { + "epoch": 0.13821486345037587, + "grad_norm": 2.4224069118499756, + "learning_rate": 4.7680212861220096e-05, + "loss": 4.9544, + "step": 23240 + }, + { + "epoch": 0.13822081073365686, + "grad_norm": 1.905261754989624, + "learning_rate": 4.768001635766669e-05, + "loss": 4.852, + "step": 23241 + }, + { + "epoch": 0.13822675801693787, + "grad_norm": 1.7081589698791504, + "learning_rate": 4.7679819846195925e-05, + "loss": 5.2201, + "step": 23242 + }, + { + "epoch": 0.13823270530021886, + "grad_norm": 1.5893620252609253, + "learning_rate": 4.767962332680786e-05, + "loss": 4.9691, + "step": 23243 + }, + { + "epoch": 0.13823865258349985, + "grad_norm": 1.7598754167556763, + "learning_rate": 4.767942679950258e-05, + "loss": 4.9661, + "step": 23244 + }, + { + "epoch": 0.13824459986678087, + "grad_norm": 1.6882308721542358, + "learning_rate": 4.767923026428015e-05, + "loss": 5.3529, + "step": 23245 + }, + { + "epoch": 0.13825054715006185, + "grad_norm": 1.6711715459823608, + "learning_rate": 4.767903372114063e-05, + "loss": 5.3288, + "step": 23246 + }, + { + "epoch": 0.13825649443334284, + "grad_norm": 1.5780813694000244, + "learning_rate": 4.76788371700841e-05, + "loss": 5.5583, + "step": 23247 + }, + { + "epoch": 0.13826244171662386, + "grad_norm": 1.9719429016113281, + "learning_rate": 4.767864061111061e-05, + "loss": 5.2821, + "step": 23248 + }, + { + "epoch": 0.13826838899990485, + "grad_norm": 1.6447231769561768, + "learning_rate": 4.767844404422025e-05, + "loss": 6.0166, + "step": 23249 + }, + { + "epoch": 0.13827433628318583, + "grad_norm": 1.6587456464767456, + "learning_rate": 4.767824746941307e-05, + "loss": 5.4081, + "step": 23250 + }, + { + "epoch": 0.13828028356646685, + "grad_norm": 1.9438105821609497, + "learning_rate": 4.767805088668916e-05, + "loss": 5.4436, + "step": 23251 + }, + { + "epoch": 0.13828623084974784, + "grad_norm": 2.1185503005981445, + "learning_rate": 4.767785429604857e-05, + "loss": 4.8413, + "step": 23252 + }, + { + "epoch": 0.13829217813302883, + "grad_norm": 2.176520347595215, + "learning_rate": 4.767765769749138e-05, + "loss": 4.9092, + "step": 23253 + }, + { + "epoch": 0.13829812541630984, + "grad_norm": 2.020982503890991, + "learning_rate": 4.767746109101765e-05, + "loss": 4.9179, + "step": 23254 + }, + { + "epoch": 0.13830407269959083, + "grad_norm": 1.6086227893829346, + "learning_rate": 4.767726447662746e-05, + "loss": 5.1998, + "step": 23255 + }, + { + "epoch": 0.13831001998287182, + "grad_norm": 1.8750804662704468, + "learning_rate": 4.767706785432087e-05, + "loss": 4.6858, + "step": 23256 + }, + { + "epoch": 0.13831596726615283, + "grad_norm": 1.7748466730117798, + "learning_rate": 4.767687122409794e-05, + "loss": 4.5468, + "step": 23257 + }, + { + "epoch": 0.13832191454943382, + "grad_norm": 1.94595205783844, + "learning_rate": 4.767667458595875e-05, + "loss": 4.6902, + "step": 23258 + }, + { + "epoch": 0.1383278618327148, + "grad_norm": 1.7588400840759277, + "learning_rate": 4.7676477939903375e-05, + "loss": 5.8701, + "step": 23259 + }, + { + "epoch": 0.13833380911599583, + "grad_norm": 1.8222272396087646, + "learning_rate": 4.7676281285931866e-05, + "loss": 4.6879, + "step": 23260 + }, + { + "epoch": 0.13833975639927681, + "grad_norm": 1.7244281768798828, + "learning_rate": 4.767608462404431e-05, + "loss": 5.0215, + "step": 23261 + }, + { + "epoch": 0.1383457036825578, + "grad_norm": 1.5756913423538208, + "learning_rate": 4.767588795424077e-05, + "loss": 5.9537, + "step": 23262 + }, + { + "epoch": 0.13835165096583882, + "grad_norm": 1.6441105604171753, + "learning_rate": 4.767569127652131e-05, + "loss": 5.9245, + "step": 23263 + }, + { + "epoch": 0.1383575982491198, + "grad_norm": 1.5573482513427734, + "learning_rate": 4.767549459088599e-05, + "loss": 5.6705, + "step": 23264 + }, + { + "epoch": 0.1383635455324008, + "grad_norm": 1.65425705909729, + "learning_rate": 4.767529789733489e-05, + "loss": 5.8664, + "step": 23265 + }, + { + "epoch": 0.13836949281568178, + "grad_norm": 1.665283441543579, + "learning_rate": 4.767510119586809e-05, + "loss": 5.7634, + "step": 23266 + }, + { + "epoch": 0.1383754400989628, + "grad_norm": 1.4760838747024536, + "learning_rate": 4.767490448648564e-05, + "loss": 5.7739, + "step": 23267 + }, + { + "epoch": 0.1383813873822438, + "grad_norm": 1.649942398071289, + "learning_rate": 4.7674707769187616e-05, + "loss": 5.7518, + "step": 23268 + }, + { + "epoch": 0.13838733466552477, + "grad_norm": 1.5092672109603882, + "learning_rate": 4.7674511043974084e-05, + "loss": 5.7706, + "step": 23269 + }, + { + "epoch": 0.1383932819488058, + "grad_norm": 2.5008256435394287, + "learning_rate": 4.767431431084512e-05, + "loss": 4.6023, + "step": 23270 + }, + { + "epoch": 0.13839922923208678, + "grad_norm": 2.4018449783325195, + "learning_rate": 4.767411756980078e-05, + "loss": 4.7872, + "step": 23271 + }, + { + "epoch": 0.13840517651536777, + "grad_norm": 1.7928224802017212, + "learning_rate": 4.7673920820841136e-05, + "loss": 5.2731, + "step": 23272 + }, + { + "epoch": 0.13841112379864878, + "grad_norm": 1.844249963760376, + "learning_rate": 4.767372406396627e-05, + "loss": 5.2441, + "step": 23273 + }, + { + "epoch": 0.13841707108192977, + "grad_norm": 2.160876989364624, + "learning_rate": 4.7673527299176236e-05, + "loss": 4.5445, + "step": 23274 + }, + { + "epoch": 0.13842301836521076, + "grad_norm": 1.6312650442123413, + "learning_rate": 4.767333052647112e-05, + "loss": 5.0418, + "step": 23275 + }, + { + "epoch": 0.13842896564849178, + "grad_norm": 1.6567429304122925, + "learning_rate": 4.7673133745850965e-05, + "loss": 5.2882, + "step": 23276 + }, + { + "epoch": 0.13843491293177276, + "grad_norm": 1.8484638929367065, + "learning_rate": 4.767293695731585e-05, + "loss": 5.3432, + "step": 23277 + }, + { + "epoch": 0.13844086021505375, + "grad_norm": 1.8447157144546509, + "learning_rate": 4.767274016086586e-05, + "loss": 5.3307, + "step": 23278 + }, + { + "epoch": 0.13844680749833477, + "grad_norm": 1.6714428663253784, + "learning_rate": 4.767254335650104e-05, + "loss": 5.3053, + "step": 23279 + }, + { + "epoch": 0.13845275478161576, + "grad_norm": 1.7423646450042725, + "learning_rate": 4.7672346544221474e-05, + "loss": 5.3129, + "step": 23280 + }, + { + "epoch": 0.13845870206489674, + "grad_norm": 1.5770469903945923, + "learning_rate": 4.7672149724027224e-05, + "loss": 5.2806, + "step": 23281 + }, + { + "epoch": 0.13846464934817776, + "grad_norm": 1.5982024669647217, + "learning_rate": 4.7671952895918365e-05, + "loss": 5.4873, + "step": 23282 + }, + { + "epoch": 0.13847059663145875, + "grad_norm": 1.9240913391113281, + "learning_rate": 4.767175605989496e-05, + "loss": 5.8309, + "step": 23283 + }, + { + "epoch": 0.13847654391473974, + "grad_norm": 1.612429141998291, + "learning_rate": 4.7671559215957075e-05, + "loss": 5.4479, + "step": 23284 + }, + { + "epoch": 0.13848249119802075, + "grad_norm": 1.5843868255615234, + "learning_rate": 4.7671362364104785e-05, + "loss": 5.5509, + "step": 23285 + }, + { + "epoch": 0.13848843848130174, + "grad_norm": 2.3811614513397217, + "learning_rate": 4.767116550433816e-05, + "loss": 5.4695, + "step": 23286 + }, + { + "epoch": 0.13849438576458273, + "grad_norm": 2.6257996559143066, + "learning_rate": 4.767096863665726e-05, + "loss": 5.0195, + "step": 23287 + }, + { + "epoch": 0.13850033304786374, + "grad_norm": 1.8920071125030518, + "learning_rate": 4.7670771761062164e-05, + "loss": 5.2023, + "step": 23288 + }, + { + "epoch": 0.13850628033114473, + "grad_norm": 1.52253258228302, + "learning_rate": 4.767057487755293e-05, + "loss": 5.6985, + "step": 23289 + }, + { + "epoch": 0.13851222761442572, + "grad_norm": 2.240440845489502, + "learning_rate": 4.767037798612964e-05, + "loss": 5.1073, + "step": 23290 + }, + { + "epoch": 0.13851817489770674, + "grad_norm": 2.127216100692749, + "learning_rate": 4.7670181086792354e-05, + "loss": 5.1885, + "step": 23291 + }, + { + "epoch": 0.13852412218098772, + "grad_norm": 2.128519058227539, + "learning_rate": 4.766998417954114e-05, + "loss": 4.9388, + "step": 23292 + }, + { + "epoch": 0.1385300694642687, + "grad_norm": 1.87863290309906, + "learning_rate": 4.7669787264376066e-05, + "loss": 4.8293, + "step": 23293 + }, + { + "epoch": 0.13853601674754973, + "grad_norm": 2.03975510597229, + "learning_rate": 4.766959034129721e-05, + "loss": 4.9168, + "step": 23294 + }, + { + "epoch": 0.13854196403083072, + "grad_norm": 2.0336341857910156, + "learning_rate": 4.766939341030463e-05, + "loss": 4.9715, + "step": 23295 + }, + { + "epoch": 0.1385479113141117, + "grad_norm": 1.943743348121643, + "learning_rate": 4.7669196471398396e-05, + "loss": 4.7709, + "step": 23296 + }, + { + "epoch": 0.13855385859739272, + "grad_norm": 2.1629462242126465, + "learning_rate": 4.766899952457858e-05, + "loss": 4.7499, + "step": 23297 + }, + { + "epoch": 0.1385598058806737, + "grad_norm": 2.200531005859375, + "learning_rate": 4.7668802569845256e-05, + "loss": 4.8418, + "step": 23298 + }, + { + "epoch": 0.1385657531639547, + "grad_norm": 2.038649797439575, + "learning_rate": 4.766860560719849e-05, + "loss": 5.2351, + "step": 23299 + }, + { + "epoch": 0.1385717004472357, + "grad_norm": 1.8091388940811157, + "learning_rate": 4.766840863663834e-05, + "loss": 5.3526, + "step": 23300 + }, + { + "epoch": 0.1385776477305167, + "grad_norm": 1.9351911544799805, + "learning_rate": 4.7668211658164884e-05, + "loss": 4.813, + "step": 23301 + }, + { + "epoch": 0.1385835950137977, + "grad_norm": 2.0985751152038574, + "learning_rate": 4.766801467177819e-05, + "loss": 4.7762, + "step": 23302 + }, + { + "epoch": 0.1385895422970787, + "grad_norm": 2.023658275604248, + "learning_rate": 4.766781767747833e-05, + "loss": 4.8076, + "step": 23303 + }, + { + "epoch": 0.1385954895803597, + "grad_norm": 1.7464020252227783, + "learning_rate": 4.7667620675265364e-05, + "loss": 5.2537, + "step": 23304 + }, + { + "epoch": 0.13860143686364068, + "grad_norm": 1.7812929153442383, + "learning_rate": 4.7667423665139364e-05, + "loss": 4.8896, + "step": 23305 + }, + { + "epoch": 0.1386073841469217, + "grad_norm": 2.0042948722839355, + "learning_rate": 4.76672266471004e-05, + "loss": 4.7254, + "step": 23306 + }, + { + "epoch": 0.13861333143020269, + "grad_norm": 1.8378963470458984, + "learning_rate": 4.7667029621148554e-05, + "loss": 4.9849, + "step": 23307 + }, + { + "epoch": 0.13861927871348367, + "grad_norm": 2.1476621627807617, + "learning_rate": 4.7666832587283873e-05, + "loss": 4.5167, + "step": 23308 + }, + { + "epoch": 0.1386252259967647, + "grad_norm": 1.8289295434951782, + "learning_rate": 4.7666635545506434e-05, + "loss": 4.8841, + "step": 23309 + }, + { + "epoch": 0.13863117328004568, + "grad_norm": 1.7215977907180786, + "learning_rate": 4.766643849581631e-05, + "loss": 5.0148, + "step": 23310 + }, + { + "epoch": 0.13863712056332667, + "grad_norm": 1.464308261871338, + "learning_rate": 4.7666241438213566e-05, + "loss": 5.2551, + "step": 23311 + }, + { + "epoch": 0.13864306784660768, + "grad_norm": 1.655523657798767, + "learning_rate": 4.766604437269827e-05, + "loss": 5.604, + "step": 23312 + }, + { + "epoch": 0.13864901512988867, + "grad_norm": 1.9533252716064453, + "learning_rate": 4.766584729927049e-05, + "loss": 5.6238, + "step": 23313 + }, + { + "epoch": 0.13865496241316966, + "grad_norm": 1.8174513578414917, + "learning_rate": 4.7665650217930305e-05, + "loss": 5.6806, + "step": 23314 + }, + { + "epoch": 0.13866090969645067, + "grad_norm": 1.58940851688385, + "learning_rate": 4.766545312867776e-05, + "loss": 5.5066, + "step": 23315 + }, + { + "epoch": 0.13866685697973166, + "grad_norm": 1.5862720012664795, + "learning_rate": 4.766525603151295e-05, + "loss": 5.352, + "step": 23316 + }, + { + "epoch": 0.13867280426301265, + "grad_norm": 1.7878305912017822, + "learning_rate": 4.7665058926435934e-05, + "loss": 5.4043, + "step": 23317 + }, + { + "epoch": 0.13867875154629367, + "grad_norm": 1.3984423875808716, + "learning_rate": 4.766486181344678e-05, + "loss": 5.8719, + "step": 23318 + }, + { + "epoch": 0.13868469882957465, + "grad_norm": 1.6912389993667603, + "learning_rate": 4.7664664692545555e-05, + "loss": 5.6587, + "step": 23319 + }, + { + "epoch": 0.13869064611285564, + "grad_norm": 1.593245506286621, + "learning_rate": 4.766446756373233e-05, + "loss": 5.424, + "step": 23320 + }, + { + "epoch": 0.13869659339613666, + "grad_norm": 1.5353487730026245, + "learning_rate": 4.766427042700717e-05, + "loss": 5.7179, + "step": 23321 + }, + { + "epoch": 0.13870254067941765, + "grad_norm": 1.4989358186721802, + "learning_rate": 4.766407328237016e-05, + "loss": 6.1919, + "step": 23322 + }, + { + "epoch": 0.13870848796269863, + "grad_norm": 1.292460322380066, + "learning_rate": 4.766387612982134e-05, + "loss": 5.8265, + "step": 23323 + }, + { + "epoch": 0.13871443524597965, + "grad_norm": 1.4890642166137695, + "learning_rate": 4.766367896936081e-05, + "loss": 5.1671, + "step": 23324 + }, + { + "epoch": 0.13872038252926064, + "grad_norm": 1.7513198852539062, + "learning_rate": 4.766348180098861e-05, + "loss": 4.908, + "step": 23325 + }, + { + "epoch": 0.13872632981254163, + "grad_norm": 1.503311038017273, + "learning_rate": 4.766328462470483e-05, + "loss": 5.661, + "step": 23326 + }, + { + "epoch": 0.13873227709582261, + "grad_norm": 2.333216667175293, + "learning_rate": 4.766308744050953e-05, + "loss": 4.5921, + "step": 23327 + }, + { + "epoch": 0.13873822437910363, + "grad_norm": 2.1495418548583984, + "learning_rate": 4.7662890248402786e-05, + "loss": 4.8017, + "step": 23328 + }, + { + "epoch": 0.13874417166238462, + "grad_norm": 1.4922517538070679, + "learning_rate": 4.766269304838466e-05, + "loss": 5.3407, + "step": 23329 + }, + { + "epoch": 0.1387501189456656, + "grad_norm": 1.5760530233383179, + "learning_rate": 4.7662495840455214e-05, + "loss": 5.1536, + "step": 23330 + }, + { + "epoch": 0.13875606622894662, + "grad_norm": 1.432483434677124, + "learning_rate": 4.7662298624614524e-05, + "loss": 4.405, + "step": 23331 + }, + { + "epoch": 0.1387620135122276, + "grad_norm": 1.5221575498580933, + "learning_rate": 4.766210140086267e-05, + "loss": 4.5132, + "step": 23332 + }, + { + "epoch": 0.1387679607955086, + "grad_norm": 1.7520684003829956, + "learning_rate": 4.76619041691997e-05, + "loss": 4.5229, + "step": 23333 + }, + { + "epoch": 0.13877390807878962, + "grad_norm": 1.8210954666137695, + "learning_rate": 4.76617069296257e-05, + "loss": 4.7207, + "step": 23334 + }, + { + "epoch": 0.1387798553620706, + "grad_norm": 1.5682491064071655, + "learning_rate": 4.7661509682140734e-05, + "loss": 4.5045, + "step": 23335 + }, + { + "epoch": 0.1387858026453516, + "grad_norm": 1.7219401597976685, + "learning_rate": 4.7661312426744865e-05, + "loss": 4.4846, + "step": 23336 + }, + { + "epoch": 0.1387917499286326, + "grad_norm": 1.590681791305542, + "learning_rate": 4.766111516343816e-05, + "loss": 4.2617, + "step": 23337 + }, + { + "epoch": 0.1387976972119136, + "grad_norm": 1.533359408378601, + "learning_rate": 4.76609178922207e-05, + "loss": 4.4746, + "step": 23338 + }, + { + "epoch": 0.13880364449519458, + "grad_norm": 1.5994545221328735, + "learning_rate": 4.7660720613092555e-05, + "loss": 4.5712, + "step": 23339 + }, + { + "epoch": 0.1388095917784756, + "grad_norm": 1.472655177116394, + "learning_rate": 4.766052332605377e-05, + "loss": 4.3592, + "step": 23340 + }, + { + "epoch": 0.1388155390617566, + "grad_norm": 1.5625941753387451, + "learning_rate": 4.7660326031104445e-05, + "loss": 4.2859, + "step": 23341 + }, + { + "epoch": 0.13882148634503758, + "grad_norm": 2.1194114685058594, + "learning_rate": 4.766012872824464e-05, + "loss": 5.0237, + "step": 23342 + }, + { + "epoch": 0.1388274336283186, + "grad_norm": 1.699491262435913, + "learning_rate": 4.7659931417474404e-05, + "loss": 5.4558, + "step": 23343 + }, + { + "epoch": 0.13883338091159958, + "grad_norm": 1.7734466791152954, + "learning_rate": 4.765973409879382e-05, + "loss": 4.5118, + "step": 23344 + }, + { + "epoch": 0.13883932819488057, + "grad_norm": 1.7193443775177002, + "learning_rate": 4.765953677220296e-05, + "loss": 5.7915, + "step": 23345 + }, + { + "epoch": 0.13884527547816158, + "grad_norm": 1.6994706392288208, + "learning_rate": 4.765933943770189e-05, + "loss": 5.2722, + "step": 23346 + }, + { + "epoch": 0.13885122276144257, + "grad_norm": 2.1580300331115723, + "learning_rate": 4.765914209529068e-05, + "loss": 5.2697, + "step": 23347 + }, + { + "epoch": 0.13885717004472356, + "grad_norm": 2.437685012817383, + "learning_rate": 4.765894474496939e-05, + "loss": 5.2533, + "step": 23348 + }, + { + "epoch": 0.13886311732800458, + "grad_norm": 2.2965760231018066, + "learning_rate": 4.7658747386738113e-05, + "loss": 5.3419, + "step": 23349 + }, + { + "epoch": 0.13886906461128556, + "grad_norm": 2.0520517826080322, + "learning_rate": 4.765855002059689e-05, + "loss": 5.1966, + "step": 23350 + }, + { + "epoch": 0.13887501189456655, + "grad_norm": 2.043931484222412, + "learning_rate": 4.76583526465458e-05, + "loss": 5.1984, + "step": 23351 + }, + { + "epoch": 0.13888095917784757, + "grad_norm": 1.9283409118652344, + "learning_rate": 4.765815526458491e-05, + "loss": 4.6806, + "step": 23352 + }, + { + "epoch": 0.13888690646112856, + "grad_norm": 1.8964955806732178, + "learning_rate": 4.76579578747143e-05, + "loss": 4.9367, + "step": 23353 + }, + { + "epoch": 0.13889285374440954, + "grad_norm": 1.8109381198883057, + "learning_rate": 4.765776047693403e-05, + "loss": 4.7777, + "step": 23354 + }, + { + "epoch": 0.13889880102769056, + "grad_norm": 2.0096335411071777, + "learning_rate": 4.765756307124417e-05, + "loss": 4.9217, + "step": 23355 + }, + { + "epoch": 0.13890474831097155, + "grad_norm": 1.8210729360580444, + "learning_rate": 4.765736565764479e-05, + "loss": 4.8393, + "step": 23356 + }, + { + "epoch": 0.13891069559425254, + "grad_norm": 2.1033902168273926, + "learning_rate": 4.7657168236135954e-05, + "loss": 5.043, + "step": 23357 + }, + { + "epoch": 0.13891664287753355, + "grad_norm": 2.0610570907592773, + "learning_rate": 4.7656970806717736e-05, + "loss": 5.0493, + "step": 23358 + }, + { + "epoch": 0.13892259016081454, + "grad_norm": 2.169670343399048, + "learning_rate": 4.765677336939021e-05, + "loss": 5.2321, + "step": 23359 + }, + { + "epoch": 0.13892853744409553, + "grad_norm": 2.198686122894287, + "learning_rate": 4.7656575924153426e-05, + "loss": 5.2698, + "step": 23360 + }, + { + "epoch": 0.13893448472737654, + "grad_norm": 1.9425220489501953, + "learning_rate": 4.7656378471007476e-05, + "loss": 4.9435, + "step": 23361 + }, + { + "epoch": 0.13894043201065753, + "grad_norm": 1.936712384223938, + "learning_rate": 4.765618100995241e-05, + "loss": 4.6584, + "step": 23362 + }, + { + "epoch": 0.13894637929393852, + "grad_norm": 1.7941532135009766, + "learning_rate": 4.765598354098831e-05, + "loss": 4.6791, + "step": 23363 + }, + { + "epoch": 0.13895232657721954, + "grad_norm": 2.0149965286254883, + "learning_rate": 4.765578606411524e-05, + "loss": 5.1019, + "step": 23364 + }, + { + "epoch": 0.13895827386050053, + "grad_norm": 1.9302345514297485, + "learning_rate": 4.7655588579333265e-05, + "loss": 5.1168, + "step": 23365 + }, + { + "epoch": 0.1389642211437815, + "grad_norm": 2.0851333141326904, + "learning_rate": 4.7655391086642465e-05, + "loss": 5.0517, + "step": 23366 + }, + { + "epoch": 0.13897016842706253, + "grad_norm": 1.9221385717391968, + "learning_rate": 4.7655193586042904e-05, + "loss": 5.1486, + "step": 23367 + }, + { + "epoch": 0.13897611571034352, + "grad_norm": 1.9929136037826538, + "learning_rate": 4.765499607753464e-05, + "loss": 5.1288, + "step": 23368 + }, + { + "epoch": 0.1389820629936245, + "grad_norm": 1.8818596601486206, + "learning_rate": 4.765479856111775e-05, + "loss": 4.8252, + "step": 23369 + }, + { + "epoch": 0.13898801027690552, + "grad_norm": 1.748961091041565, + "learning_rate": 4.765460103679231e-05, + "loss": 4.7829, + "step": 23370 + }, + { + "epoch": 0.1389939575601865, + "grad_norm": 1.8021109104156494, + "learning_rate": 4.765440350455838e-05, + "loss": 4.7424, + "step": 23371 + }, + { + "epoch": 0.1389999048434675, + "grad_norm": 2.1486730575561523, + "learning_rate": 4.765420596441603e-05, + "loss": 4.6696, + "step": 23372 + }, + { + "epoch": 0.1390058521267485, + "grad_norm": 1.9908959865570068, + "learning_rate": 4.765400841636534e-05, + "loss": 4.5644, + "step": 23373 + }, + { + "epoch": 0.1390117994100295, + "grad_norm": 2.021198272705078, + "learning_rate": 4.765381086040636e-05, + "loss": 5.2841, + "step": 23374 + }, + { + "epoch": 0.1390177466933105, + "grad_norm": 2.0757644176483154, + "learning_rate": 4.765361329653918e-05, + "loss": 5.0479, + "step": 23375 + }, + { + "epoch": 0.1390236939765915, + "grad_norm": 2.6452016830444336, + "learning_rate": 4.7653415724763844e-05, + "loss": 4.5668, + "step": 23376 + }, + { + "epoch": 0.1390296412598725, + "grad_norm": 1.8536683320999146, + "learning_rate": 4.7653218145080436e-05, + "loss": 4.6049, + "step": 23377 + }, + { + "epoch": 0.13903558854315348, + "grad_norm": 2.1392767429351807, + "learning_rate": 4.765302055748903e-05, + "loss": 4.5307, + "step": 23378 + }, + { + "epoch": 0.1390415358264345, + "grad_norm": 2.0592446327209473, + "learning_rate": 4.765282296198968e-05, + "loss": 4.7421, + "step": 23379 + }, + { + "epoch": 0.1390474831097155, + "grad_norm": 1.9982407093048096, + "learning_rate": 4.765262535858248e-05, + "loss": 4.5699, + "step": 23380 + }, + { + "epoch": 0.13905343039299647, + "grad_norm": 1.6928536891937256, + "learning_rate": 4.765242774726747e-05, + "loss": 5.0689, + "step": 23381 + }, + { + "epoch": 0.1390593776762775, + "grad_norm": 2.1993813514709473, + "learning_rate": 4.765223012804474e-05, + "loss": 4.8268, + "step": 23382 + }, + { + "epoch": 0.13906532495955848, + "grad_norm": 1.711241364479065, + "learning_rate": 4.765203250091434e-05, + "loss": 5.7443, + "step": 23383 + }, + { + "epoch": 0.13907127224283947, + "grad_norm": 1.862398386001587, + "learning_rate": 4.765183486587636e-05, + "loss": 5.3367, + "step": 23384 + }, + { + "epoch": 0.13907721952612045, + "grad_norm": 1.95891273021698, + "learning_rate": 4.765163722293084e-05, + "loss": 5.6618, + "step": 23385 + }, + { + "epoch": 0.13908316680940147, + "grad_norm": 2.362205743789673, + "learning_rate": 4.765143957207789e-05, + "loss": 5.1168, + "step": 23386 + }, + { + "epoch": 0.13908911409268246, + "grad_norm": 1.7440927028656006, + "learning_rate": 4.7651241913317545e-05, + "loss": 4.858, + "step": 23387 + }, + { + "epoch": 0.13909506137596345, + "grad_norm": 1.7432098388671875, + "learning_rate": 4.765104424664989e-05, + "loss": 4.9096, + "step": 23388 + }, + { + "epoch": 0.13910100865924446, + "grad_norm": 1.7505769729614258, + "learning_rate": 4.765084657207498e-05, + "loss": 5.0255, + "step": 23389 + }, + { + "epoch": 0.13910695594252545, + "grad_norm": 1.5105990171432495, + "learning_rate": 4.76506488895929e-05, + "loss": 5.2811, + "step": 23390 + }, + { + "epoch": 0.13911290322580644, + "grad_norm": 1.6876368522644043, + "learning_rate": 4.765045119920372e-05, + "loss": 5.6723, + "step": 23391 + }, + { + "epoch": 0.13911885050908745, + "grad_norm": 1.6542494297027588, + "learning_rate": 4.7650253500907494e-05, + "loss": 5.1409, + "step": 23392 + }, + { + "epoch": 0.13912479779236844, + "grad_norm": 2.0412867069244385, + "learning_rate": 4.76500557947043e-05, + "loss": 4.8772, + "step": 23393 + }, + { + "epoch": 0.13913074507564943, + "grad_norm": 1.8121492862701416, + "learning_rate": 4.76498580805942e-05, + "loss": 5.1079, + "step": 23394 + }, + { + "epoch": 0.13913669235893045, + "grad_norm": 1.576653003692627, + "learning_rate": 4.764966035857727e-05, + "loss": 4.9576, + "step": 23395 + }, + { + "epoch": 0.13914263964221144, + "grad_norm": 1.5891642570495605, + "learning_rate": 4.764946262865358e-05, + "loss": 4.8846, + "step": 23396 + }, + { + "epoch": 0.13914858692549242, + "grad_norm": 1.7079927921295166, + "learning_rate": 4.7649264890823195e-05, + "loss": 5.0182, + "step": 23397 + }, + { + "epoch": 0.13915453420877344, + "grad_norm": 1.6532564163208008, + "learning_rate": 4.764906714508619e-05, + "loss": 4.8068, + "step": 23398 + }, + { + "epoch": 0.13916048149205443, + "grad_norm": 1.5107650756835938, + "learning_rate": 4.764886939144263e-05, + "loss": 5.3482, + "step": 23399 + }, + { + "epoch": 0.13916642877533542, + "grad_norm": 1.666096806526184, + "learning_rate": 4.764867162989258e-05, + "loss": 5.1747, + "step": 23400 + }, + { + "epoch": 0.13917237605861643, + "grad_norm": 1.864372730255127, + "learning_rate": 4.764847386043611e-05, + "loss": 4.3209, + "step": 23401 + }, + { + "epoch": 0.13917832334189742, + "grad_norm": 2.2691080570220947, + "learning_rate": 4.7648276083073295e-05, + "loss": 4.5254, + "step": 23402 + }, + { + "epoch": 0.1391842706251784, + "grad_norm": 2.0673935413360596, + "learning_rate": 4.76480782978042e-05, + "loss": 4.639, + "step": 23403 + }, + { + "epoch": 0.13919021790845942, + "grad_norm": 1.9274605512619019, + "learning_rate": 4.76478805046289e-05, + "loss": 4.579, + "step": 23404 + }, + { + "epoch": 0.1391961651917404, + "grad_norm": 1.5076278448104858, + "learning_rate": 4.7647682703547455e-05, + "loss": 4.9522, + "step": 23405 + }, + { + "epoch": 0.1392021124750214, + "grad_norm": 2.005662202835083, + "learning_rate": 4.7647484894559936e-05, + "loss": 4.3399, + "step": 23406 + }, + { + "epoch": 0.13920805975830242, + "grad_norm": 1.9292556047439575, + "learning_rate": 4.7647287077666414e-05, + "loss": 4.4166, + "step": 23407 + }, + { + "epoch": 0.1392140070415834, + "grad_norm": 1.7474818229675293, + "learning_rate": 4.764708925286696e-05, + "loss": 4.3355, + "step": 23408 + }, + { + "epoch": 0.1392199543248644, + "grad_norm": 1.9833084344863892, + "learning_rate": 4.764689142016164e-05, + "loss": 4.3388, + "step": 23409 + }, + { + "epoch": 0.1392259016081454, + "grad_norm": 1.7962874174118042, + "learning_rate": 4.764669357955053e-05, + "loss": 5.3199, + "step": 23410 + }, + { + "epoch": 0.1392318488914264, + "grad_norm": 1.6865921020507812, + "learning_rate": 4.764649573103368e-05, + "loss": 5.3787, + "step": 23411 + }, + { + "epoch": 0.13923779617470738, + "grad_norm": 1.2966182231903076, + "learning_rate": 4.7646297874611185e-05, + "loss": 5.0989, + "step": 23412 + }, + { + "epoch": 0.1392437434579884, + "grad_norm": 1.732437252998352, + "learning_rate": 4.76461000102831e-05, + "loss": 5.6207, + "step": 23413 + }, + { + "epoch": 0.1392496907412694, + "grad_norm": 1.567841649055481, + "learning_rate": 4.7645902138049494e-05, + "loss": 5.3921, + "step": 23414 + }, + { + "epoch": 0.13925563802455038, + "grad_norm": 1.7841026782989502, + "learning_rate": 4.764570425791043e-05, + "loss": 5.7206, + "step": 23415 + }, + { + "epoch": 0.1392615853078314, + "grad_norm": 2.0582776069641113, + "learning_rate": 4.764550636986599e-05, + "loss": 4.7812, + "step": 23416 + }, + { + "epoch": 0.13926753259111238, + "grad_norm": 1.5891739130020142, + "learning_rate": 4.764530847391624e-05, + "loss": 5.3211, + "step": 23417 + }, + { + "epoch": 0.13927347987439337, + "grad_norm": 1.4662810564041138, + "learning_rate": 4.764511057006125e-05, + "loss": 5.6385, + "step": 23418 + }, + { + "epoch": 0.13927942715767438, + "grad_norm": 1.6601322889328003, + "learning_rate": 4.764491265830108e-05, + "loss": 5.7947, + "step": 23419 + }, + { + "epoch": 0.13928537444095537, + "grad_norm": 1.5726239681243896, + "learning_rate": 4.7644714738635796e-05, + "loss": 5.6488, + "step": 23420 + }, + { + "epoch": 0.13929132172423636, + "grad_norm": 2.0315866470336914, + "learning_rate": 4.7644516811065494e-05, + "loss": 5.3196, + "step": 23421 + }, + { + "epoch": 0.13929726900751738, + "grad_norm": 2.3560190200805664, + "learning_rate": 4.764431887559022e-05, + "loss": 5.0898, + "step": 23422 + }, + { + "epoch": 0.13930321629079837, + "grad_norm": 1.6240613460540771, + "learning_rate": 4.764412093221004e-05, + "loss": 4.9766, + "step": 23423 + }, + { + "epoch": 0.13930916357407935, + "grad_norm": 1.9657840728759766, + "learning_rate": 4.764392298092504e-05, + "loss": 5.5328, + "step": 23424 + }, + { + "epoch": 0.13931511085736037, + "grad_norm": 1.8219939470291138, + "learning_rate": 4.764372502173527e-05, + "loss": 5.3713, + "step": 23425 + }, + { + "epoch": 0.13932105814064136, + "grad_norm": 1.6808767318725586, + "learning_rate": 4.764352705464082e-05, + "loss": 5.4753, + "step": 23426 + }, + { + "epoch": 0.13932700542392235, + "grad_norm": 1.6270160675048828, + "learning_rate": 4.764332907964175e-05, + "loss": 5.6609, + "step": 23427 + }, + { + "epoch": 0.13933295270720336, + "grad_norm": 1.5609904527664185, + "learning_rate": 4.764313109673812e-05, + "loss": 5.6954, + "step": 23428 + }, + { + "epoch": 0.13933889999048435, + "grad_norm": 1.5029795169830322, + "learning_rate": 4.764293310593001e-05, + "loss": 5.6655, + "step": 23429 + }, + { + "epoch": 0.13934484727376534, + "grad_norm": 1.6427209377288818, + "learning_rate": 4.7642735107217484e-05, + "loss": 4.9946, + "step": 23430 + }, + { + "epoch": 0.13935079455704635, + "grad_norm": 1.5815205574035645, + "learning_rate": 4.764253710060062e-05, + "loss": 5.4891, + "step": 23431 + }, + { + "epoch": 0.13935674184032734, + "grad_norm": 1.7551064491271973, + "learning_rate": 4.764233908607947e-05, + "loss": 5.4036, + "step": 23432 + }, + { + "epoch": 0.13936268912360833, + "grad_norm": 1.62980055809021, + "learning_rate": 4.7642141063654114e-05, + "loss": 5.4836, + "step": 23433 + }, + { + "epoch": 0.13936863640688935, + "grad_norm": 1.836366891860962, + "learning_rate": 4.7641943033324634e-05, + "loss": 5.4079, + "step": 23434 + }, + { + "epoch": 0.13937458369017033, + "grad_norm": 1.710744857788086, + "learning_rate": 4.764174499509107e-05, + "loss": 5.2859, + "step": 23435 + }, + { + "epoch": 0.13938053097345132, + "grad_norm": 1.6887309551239014, + "learning_rate": 4.7641546948953515e-05, + "loss": 5.4671, + "step": 23436 + }, + { + "epoch": 0.13938647825673234, + "grad_norm": 1.6997935771942139, + "learning_rate": 4.764134889491203e-05, + "loss": 5.2601, + "step": 23437 + }, + { + "epoch": 0.13939242554001333, + "grad_norm": 1.560526967048645, + "learning_rate": 4.764115083296668e-05, + "loss": 5.795, + "step": 23438 + }, + { + "epoch": 0.13939837282329431, + "grad_norm": 1.4518390893936157, + "learning_rate": 4.7640952763117544e-05, + "loss": 5.3885, + "step": 23439 + }, + { + "epoch": 0.13940432010657533, + "grad_norm": 1.698185920715332, + "learning_rate": 4.7640754685364675e-05, + "loss": 5.053, + "step": 23440 + }, + { + "epoch": 0.13941026738985632, + "grad_norm": 1.7422363758087158, + "learning_rate": 4.764055659970816e-05, + "loss": 5.1586, + "step": 23441 + }, + { + "epoch": 0.1394162146731373, + "grad_norm": 1.7014398574829102, + "learning_rate": 4.7640358506148065e-05, + "loss": 5.2313, + "step": 23442 + }, + { + "epoch": 0.1394221619564183, + "grad_norm": 1.6611777544021606, + "learning_rate": 4.764016040468444e-05, + "loss": 5.1691, + "step": 23443 + }, + { + "epoch": 0.1394281092396993, + "grad_norm": 1.6166971921920776, + "learning_rate": 4.763996229531739e-05, + "loss": 5.2217, + "step": 23444 + }, + { + "epoch": 0.1394340565229803, + "grad_norm": 1.9434369802474976, + "learning_rate": 4.763976417804694e-05, + "loss": 4.4322, + "step": 23445 + }, + { + "epoch": 0.1394400038062613, + "grad_norm": 3.2407455444335938, + "learning_rate": 4.7639566052873197e-05, + "loss": 3.3762, + "step": 23446 + }, + { + "epoch": 0.1394459510895423, + "grad_norm": 1.8475316762924194, + "learning_rate": 4.7639367919796215e-05, + "loss": 5.2435, + "step": 23447 + }, + { + "epoch": 0.1394518983728233, + "grad_norm": 1.7297134399414062, + "learning_rate": 4.763916977881606e-05, + "loss": 5.2485, + "step": 23448 + }, + { + "epoch": 0.13945784565610428, + "grad_norm": 1.720375657081604, + "learning_rate": 4.76389716299328e-05, + "loss": 5.1242, + "step": 23449 + }, + { + "epoch": 0.1394637929393853, + "grad_norm": 1.729045033454895, + "learning_rate": 4.763877347314652e-05, + "loss": 5.0312, + "step": 23450 + }, + { + "epoch": 0.13946974022266628, + "grad_norm": 1.817941427230835, + "learning_rate": 4.7638575308457266e-05, + "loss": 4.5856, + "step": 23451 + }, + { + "epoch": 0.13947568750594727, + "grad_norm": 2.7483971118927, + "learning_rate": 4.763837713586513e-05, + "loss": 3.3044, + "step": 23452 + }, + { + "epoch": 0.1394816347892283, + "grad_norm": 2.3746731281280518, + "learning_rate": 4.763817895537017e-05, + "loss": 3.0149, + "step": 23453 + }, + { + "epoch": 0.13948758207250928, + "grad_norm": 2.6971354484558105, + "learning_rate": 4.763798076697244e-05, + "loss": 3.7174, + "step": 23454 + }, + { + "epoch": 0.13949352935579026, + "grad_norm": 2.457082986831665, + "learning_rate": 4.763778257067205e-05, + "loss": 2.8548, + "step": 23455 + }, + { + "epoch": 0.13949947663907128, + "grad_norm": 2.4862163066864014, + "learning_rate": 4.7637584366469024e-05, + "loss": 2.6084, + "step": 23456 + }, + { + "epoch": 0.13950542392235227, + "grad_norm": 2.847895622253418, + "learning_rate": 4.763738615436346e-05, + "loss": 4.1775, + "step": 23457 + }, + { + "epoch": 0.13951137120563326, + "grad_norm": 2.827467918395996, + "learning_rate": 4.763718793435541e-05, + "loss": 4.0248, + "step": 23458 + }, + { + "epoch": 0.13951731848891427, + "grad_norm": 2.9717519283294678, + "learning_rate": 4.763698970644496e-05, + "loss": 3.8032, + "step": 23459 + }, + { + "epoch": 0.13952326577219526, + "grad_norm": 2.6418726444244385, + "learning_rate": 4.7636791470632166e-05, + "loss": 3.7307, + "step": 23460 + }, + { + "epoch": 0.13952921305547625, + "grad_norm": 2.789552927017212, + "learning_rate": 4.763659322691711e-05, + "loss": 3.458, + "step": 23461 + }, + { + "epoch": 0.13953516033875726, + "grad_norm": 2.3144681453704834, + "learning_rate": 4.7636394975299845e-05, + "loss": 4.1631, + "step": 23462 + }, + { + "epoch": 0.13954110762203825, + "grad_norm": 3.1292171478271484, + "learning_rate": 4.7636196715780454e-05, + "loss": 3.3234, + "step": 23463 + }, + { + "epoch": 0.13954705490531924, + "grad_norm": 3.2646241188049316, + "learning_rate": 4.763599844835899e-05, + "loss": 3.4951, + "step": 23464 + }, + { + "epoch": 0.13955300218860026, + "grad_norm": 3.3047688007354736, + "learning_rate": 4.7635800173035545e-05, + "loss": 3.6349, + "step": 23465 + }, + { + "epoch": 0.13955894947188124, + "grad_norm": 2.6160805225372314, + "learning_rate": 4.763560188981017e-05, + "loss": 3.8286, + "step": 23466 + }, + { + "epoch": 0.13956489675516223, + "grad_norm": 2.5719079971313477, + "learning_rate": 4.763540359868294e-05, + "loss": 3.7716, + "step": 23467 + }, + { + "epoch": 0.13957084403844325, + "grad_norm": 2.6471214294433594, + "learning_rate": 4.763520529965393e-05, + "loss": 3.4606, + "step": 23468 + }, + { + "epoch": 0.13957679132172424, + "grad_norm": 2.581679344177246, + "learning_rate": 4.7635006992723194e-05, + "loss": 3.5469, + "step": 23469 + }, + { + "epoch": 0.13958273860500522, + "grad_norm": 2.3326828479766846, + "learning_rate": 4.763480867789082e-05, + "loss": 3.7371, + "step": 23470 + }, + { + "epoch": 0.13958868588828624, + "grad_norm": 2.46588397026062, + "learning_rate": 4.763461035515686e-05, + "loss": 3.5972, + "step": 23471 + }, + { + "epoch": 0.13959463317156723, + "grad_norm": 2.3971428871154785, + "learning_rate": 4.76344120245214e-05, + "loss": 3.9445, + "step": 23472 + }, + { + "epoch": 0.13960058045484822, + "grad_norm": 1.8938592672348022, + "learning_rate": 4.7634213685984494e-05, + "loss": 5.1934, + "step": 23473 + }, + { + "epoch": 0.13960652773812923, + "grad_norm": 1.4792579412460327, + "learning_rate": 4.763401533954622e-05, + "loss": 5.5867, + "step": 23474 + }, + { + "epoch": 0.13961247502141022, + "grad_norm": 1.9039497375488281, + "learning_rate": 4.763381698520665e-05, + "loss": 4.9615, + "step": 23475 + }, + { + "epoch": 0.1396184223046912, + "grad_norm": 2.2181084156036377, + "learning_rate": 4.7633618622965845e-05, + "loss": 5.107, + "step": 23476 + }, + { + "epoch": 0.13962436958797222, + "grad_norm": 1.618551254272461, + "learning_rate": 4.7633420252823876e-05, + "loss": 4.8326, + "step": 23477 + }, + { + "epoch": 0.1396303168712532, + "grad_norm": 1.7516298294067383, + "learning_rate": 4.763322187478081e-05, + "loss": 5.0812, + "step": 23478 + }, + { + "epoch": 0.1396362641545342, + "grad_norm": 2.385951042175293, + "learning_rate": 4.7633023488836726e-05, + "loss": 4.2155, + "step": 23479 + }, + { + "epoch": 0.13964221143781522, + "grad_norm": 2.1702630519866943, + "learning_rate": 4.7632825094991686e-05, + "loss": 4.1257, + "step": 23480 + }, + { + "epoch": 0.1396481587210962, + "grad_norm": 1.9801292419433594, + "learning_rate": 4.763262669324576e-05, + "loss": 3.7386, + "step": 23481 + }, + { + "epoch": 0.1396541060043772, + "grad_norm": 2.250795602798462, + "learning_rate": 4.7632428283599016e-05, + "loss": 3.7169, + "step": 23482 + }, + { + "epoch": 0.1396600532876582, + "grad_norm": 2.124037027359009, + "learning_rate": 4.763222986605153e-05, + "loss": 3.7271, + "step": 23483 + }, + { + "epoch": 0.1396660005709392, + "grad_norm": 1.7976130247116089, + "learning_rate": 4.763203144060336e-05, + "loss": 3.9943, + "step": 23484 + }, + { + "epoch": 0.13967194785422019, + "grad_norm": 1.8421905040740967, + "learning_rate": 4.763183300725459e-05, + "loss": 4.1526, + "step": 23485 + }, + { + "epoch": 0.1396778951375012, + "grad_norm": 2.166212797164917, + "learning_rate": 4.763163456600527e-05, + "loss": 4.0225, + "step": 23486 + }, + { + "epoch": 0.1396838424207822, + "grad_norm": 2.2913808822631836, + "learning_rate": 4.763143611685549e-05, + "loss": 4.1125, + "step": 23487 + }, + { + "epoch": 0.13968978970406318, + "grad_norm": 2.20432448387146, + "learning_rate": 4.7631237659805307e-05, + "loss": 3.8297, + "step": 23488 + }, + { + "epoch": 0.1396957369873442, + "grad_norm": 2.323784351348877, + "learning_rate": 4.7631039194854785e-05, + "loss": 3.9128, + "step": 23489 + }, + { + "epoch": 0.13970168427062518, + "grad_norm": 2.22320294380188, + "learning_rate": 4.7630840722004014e-05, + "loss": 3.2834, + "step": 23490 + }, + { + "epoch": 0.13970763155390617, + "grad_norm": 2.2063205242156982, + "learning_rate": 4.763064224125304e-05, + "loss": 3.2472, + "step": 23491 + }, + { + "epoch": 0.13971357883718719, + "grad_norm": 2.1124684810638428, + "learning_rate": 4.763044375260195e-05, + "loss": 3.4765, + "step": 23492 + }, + { + "epoch": 0.13971952612046817, + "grad_norm": 2.2450273036956787, + "learning_rate": 4.7630245256050796e-05, + "loss": 3.586, + "step": 23493 + }, + { + "epoch": 0.13972547340374916, + "grad_norm": 2.821563243865967, + "learning_rate": 4.7630046751599665e-05, + "loss": 3.9152, + "step": 23494 + }, + { + "epoch": 0.13973142068703018, + "grad_norm": 2.623655319213867, + "learning_rate": 4.762984823924862e-05, + "loss": 5.2159, + "step": 23495 + }, + { + "epoch": 0.13973736797031117, + "grad_norm": 2.5610146522521973, + "learning_rate": 4.762964971899773e-05, + "loss": 5.0813, + "step": 23496 + }, + { + "epoch": 0.13974331525359215, + "grad_norm": 2.434995651245117, + "learning_rate": 4.7629451190847055e-05, + "loss": 4.651, + "step": 23497 + }, + { + "epoch": 0.13974926253687317, + "grad_norm": 2.0094375610351562, + "learning_rate": 4.7629252654796675e-05, + "loss": 5.6776, + "step": 23498 + }, + { + "epoch": 0.13975520982015416, + "grad_norm": 2.568547248840332, + "learning_rate": 4.7629054110846664e-05, + "loss": 3.2979, + "step": 23499 + }, + { + "epoch": 0.13976115710343515, + "grad_norm": 1.9725669622421265, + "learning_rate": 4.7628855558997074e-05, + "loss": 5.6269, + "step": 23500 + }, + { + "epoch": 0.13976710438671613, + "grad_norm": 1.6308250427246094, + "learning_rate": 4.7628656999247986e-05, + "loss": 5.7476, + "step": 23501 + }, + { + "epoch": 0.13977305166999715, + "grad_norm": 2.4110774993896484, + "learning_rate": 4.762845843159947e-05, + "loss": 4.8208, + "step": 23502 + }, + { + "epoch": 0.13977899895327814, + "grad_norm": 2.9670233726501465, + "learning_rate": 4.762825985605159e-05, + "loss": 3.3216, + "step": 23503 + }, + { + "epoch": 0.13978494623655913, + "grad_norm": 2.9264471530914307, + "learning_rate": 4.762806127260443e-05, + "loss": 3.12, + "step": 23504 + }, + { + "epoch": 0.13979089351984014, + "grad_norm": 2.983513593673706, + "learning_rate": 4.7627862681258037e-05, + "loss": 3.2355, + "step": 23505 + }, + { + "epoch": 0.13979684080312113, + "grad_norm": 2.5023698806762695, + "learning_rate": 4.7627664082012494e-05, + "loss": 3.6619, + "step": 23506 + }, + { + "epoch": 0.13980278808640212, + "grad_norm": 2.691542625427246, + "learning_rate": 4.762746547486786e-05, + "loss": 2.9562, + "step": 23507 + }, + { + "epoch": 0.13980873536968313, + "grad_norm": 2.487741470336914, + "learning_rate": 4.762726685982421e-05, + "loss": 3.6212, + "step": 23508 + }, + { + "epoch": 0.13981468265296412, + "grad_norm": 2.5798730850219727, + "learning_rate": 4.762706823688163e-05, + "loss": 3.6246, + "step": 23509 + }, + { + "epoch": 0.1398206299362451, + "grad_norm": 2.8465988636016846, + "learning_rate": 4.762686960604017e-05, + "loss": 3.3039, + "step": 23510 + }, + { + "epoch": 0.13982657721952613, + "grad_norm": 2.70969820022583, + "learning_rate": 4.7626670967299897e-05, + "loss": 2.3823, + "step": 23511 + }, + { + "epoch": 0.13983252450280712, + "grad_norm": 2.3834662437438965, + "learning_rate": 4.762647232066089e-05, + "loss": 2.8856, + "step": 23512 + }, + { + "epoch": 0.1398384717860881, + "grad_norm": 2.694798231124878, + "learning_rate": 4.762627366612321e-05, + "loss": 4.3653, + "step": 23513 + }, + { + "epoch": 0.13984441906936912, + "grad_norm": 2.6196436882019043, + "learning_rate": 4.7626075003686944e-05, + "loss": 4.5615, + "step": 23514 + }, + { + "epoch": 0.1398503663526501, + "grad_norm": 2.6196036338806152, + "learning_rate": 4.7625876333352136e-05, + "loss": 3.4767, + "step": 23515 + }, + { + "epoch": 0.1398563136359311, + "grad_norm": 2.32704496383667, + "learning_rate": 4.762567765511888e-05, + "loss": 3.7236, + "step": 23516 + }, + { + "epoch": 0.1398622609192121, + "grad_norm": 2.7415919303894043, + "learning_rate": 4.7625478968987226e-05, + "loss": 3.2248, + "step": 23517 + }, + { + "epoch": 0.1398682082024931, + "grad_norm": 2.402270555496216, + "learning_rate": 4.7625280274957254e-05, + "loss": 3.5112, + "step": 23518 + }, + { + "epoch": 0.1398741554857741, + "grad_norm": 2.722087860107422, + "learning_rate": 4.762508157302903e-05, + "loss": 3.5728, + "step": 23519 + }, + { + "epoch": 0.1398801027690551, + "grad_norm": 2.2336719036102295, + "learning_rate": 4.7624882863202626e-05, + "loss": 4.361, + "step": 23520 + }, + { + "epoch": 0.1398860500523361, + "grad_norm": 1.687203288078308, + "learning_rate": 4.7624684145478106e-05, + "loss": 5.2352, + "step": 23521 + }, + { + "epoch": 0.13989199733561708, + "grad_norm": 2.0672800540924072, + "learning_rate": 4.762448541985553e-05, + "loss": 5.0935, + "step": 23522 + }, + { + "epoch": 0.1398979446188981, + "grad_norm": 1.9521383047103882, + "learning_rate": 4.7624286686335e-05, + "loss": 5.1912, + "step": 23523 + }, + { + "epoch": 0.13990389190217908, + "grad_norm": 1.8050906658172607, + "learning_rate": 4.762408794491656e-05, + "loss": 5.2494, + "step": 23524 + }, + { + "epoch": 0.13990983918546007, + "grad_norm": 1.7029122114181519, + "learning_rate": 4.762388919560028e-05, + "loss": 5.2882, + "step": 23525 + }, + { + "epoch": 0.1399157864687411, + "grad_norm": 2.089055299758911, + "learning_rate": 4.7623690438386234e-05, + "loss": 5.1689, + "step": 23526 + }, + { + "epoch": 0.13992173375202208, + "grad_norm": 1.8083282709121704, + "learning_rate": 4.7623491673274503e-05, + "loss": 5.2078, + "step": 23527 + }, + { + "epoch": 0.13992768103530306, + "grad_norm": 1.6455740928649902, + "learning_rate": 4.7623292900265126e-05, + "loss": 4.6492, + "step": 23528 + }, + { + "epoch": 0.13993362831858408, + "grad_norm": 1.7084187269210815, + "learning_rate": 4.76230941193582e-05, + "loss": 4.5537, + "step": 23529 + }, + { + "epoch": 0.13993957560186507, + "grad_norm": 1.5048147439956665, + "learning_rate": 4.762289533055379e-05, + "loss": 4.3823, + "step": 23530 + }, + { + "epoch": 0.13994552288514606, + "grad_norm": 1.6451318264007568, + "learning_rate": 4.762269653385196e-05, + "loss": 4.4546, + "step": 23531 + }, + { + "epoch": 0.13995147016842707, + "grad_norm": 1.4565141201019287, + "learning_rate": 4.762249772925278e-05, + "loss": 4.5148, + "step": 23532 + }, + { + "epoch": 0.13995741745170806, + "grad_norm": 1.4664920568466187, + "learning_rate": 4.7622298916756316e-05, + "loss": 4.4532, + "step": 23533 + }, + { + "epoch": 0.13996336473498905, + "grad_norm": 1.5902373790740967, + "learning_rate": 4.762210009636264e-05, + "loss": 4.4744, + "step": 23534 + }, + { + "epoch": 0.13996931201827006, + "grad_norm": 1.6029250621795654, + "learning_rate": 4.762190126807182e-05, + "loss": 4.4635, + "step": 23535 + }, + { + "epoch": 0.13997525930155105, + "grad_norm": 1.49099862575531, + "learning_rate": 4.7621702431883943e-05, + "loss": 4.4079, + "step": 23536 + }, + { + "epoch": 0.13998120658483204, + "grad_norm": 1.5527629852294922, + "learning_rate": 4.762150358779905e-05, + "loss": 4.4034, + "step": 23537 + }, + { + "epoch": 0.13998715386811306, + "grad_norm": 1.4014298915863037, + "learning_rate": 4.762130473581723e-05, + "loss": 4.5512, + "step": 23538 + }, + { + "epoch": 0.13999310115139404, + "grad_norm": 1.4211797714233398, + "learning_rate": 4.762110587593854e-05, + "loss": 4.3554, + "step": 23539 + }, + { + "epoch": 0.13999904843467503, + "grad_norm": 1.305879831314087, + "learning_rate": 4.762090700816306e-05, + "loss": 4.5469, + "step": 23540 + }, + { + "epoch": 0.14000499571795605, + "grad_norm": 1.6035869121551514, + "learning_rate": 4.762070813249085e-05, + "loss": 4.2506, + "step": 23541 + }, + { + "epoch": 0.14001094300123704, + "grad_norm": 2.48470139503479, + "learning_rate": 4.7620509248922e-05, + "loss": 4.4341, + "step": 23542 + }, + { + "epoch": 0.14001689028451803, + "grad_norm": 2.1328017711639404, + "learning_rate": 4.7620310357456546e-05, + "loss": 4.8064, + "step": 23543 + }, + { + "epoch": 0.14002283756779904, + "grad_norm": 2.631490707397461, + "learning_rate": 4.7620111458094586e-05, + "loss": 4.9828, + "step": 23544 + }, + { + "epoch": 0.14002878485108003, + "grad_norm": 2.4217545986175537, + "learning_rate": 4.761991255083617e-05, + "loss": 3.7975, + "step": 23545 + }, + { + "epoch": 0.14003473213436102, + "grad_norm": 2.1837475299835205, + "learning_rate": 4.7619713635681384e-05, + "loss": 3.7627, + "step": 23546 + }, + { + "epoch": 0.14004067941764203, + "grad_norm": 2.188026189804077, + "learning_rate": 4.7619514712630284e-05, + "loss": 3.6425, + "step": 23547 + }, + { + "epoch": 0.14004662670092302, + "grad_norm": 2.157501697540283, + "learning_rate": 4.761931578168295e-05, + "loss": 3.2671, + "step": 23548 + }, + { + "epoch": 0.140052573984204, + "grad_norm": 2.28362774848938, + "learning_rate": 4.7619116842839446e-05, + "loss": 3.9765, + "step": 23549 + }, + { + "epoch": 0.14005852126748503, + "grad_norm": 2.1072418689727783, + "learning_rate": 4.7618917896099844e-05, + "loss": 3.8694, + "step": 23550 + }, + { + "epoch": 0.140064468550766, + "grad_norm": 2.061612367630005, + "learning_rate": 4.76187189414642e-05, + "loss": 3.6775, + "step": 23551 + }, + { + "epoch": 0.140070415834047, + "grad_norm": 2.153618812561035, + "learning_rate": 4.761851997893261e-05, + "loss": 3.2189, + "step": 23552 + }, + { + "epoch": 0.14007636311732802, + "grad_norm": 2.211912155151367, + "learning_rate": 4.761832100850512e-05, + "loss": 4.0855, + "step": 23553 + }, + { + "epoch": 0.140082310400609, + "grad_norm": 2.109023094177246, + "learning_rate": 4.761812203018181e-05, + "loss": 3.1532, + "step": 23554 + }, + { + "epoch": 0.14008825768389, + "grad_norm": 2.056579113006592, + "learning_rate": 4.7617923043962745e-05, + "loss": 3.3965, + "step": 23555 + }, + { + "epoch": 0.140094204967171, + "grad_norm": 2.6552531719207764, + "learning_rate": 4.761772404984799e-05, + "loss": 4.8136, + "step": 23556 + }, + { + "epoch": 0.140100152250452, + "grad_norm": 2.873891592025757, + "learning_rate": 4.7617525047837634e-05, + "loss": 5.1014, + "step": 23557 + }, + { + "epoch": 0.140106099533733, + "grad_norm": 2.9486472606658936, + "learning_rate": 4.761732603793173e-05, + "loss": 4.9751, + "step": 23558 + }, + { + "epoch": 0.14011204681701397, + "grad_norm": 1.6354721784591675, + "learning_rate": 4.761712702013035e-05, + "loss": 5.6091, + "step": 23559 + }, + { + "epoch": 0.140117994100295, + "grad_norm": 1.766449213027954, + "learning_rate": 4.761692799443357e-05, + "loss": 5.6621, + "step": 23560 + }, + { + "epoch": 0.14012394138357598, + "grad_norm": 1.9253995418548584, + "learning_rate": 4.7616728960841444e-05, + "loss": 5.0477, + "step": 23561 + }, + { + "epoch": 0.14012988866685697, + "grad_norm": 1.5569409132003784, + "learning_rate": 4.761652991935406e-05, + "loss": 5.2989, + "step": 23562 + }, + { + "epoch": 0.14013583595013798, + "grad_norm": 1.395662784576416, + "learning_rate": 4.761633086997147e-05, + "loss": 5.2249, + "step": 23563 + }, + { + "epoch": 0.14014178323341897, + "grad_norm": 1.9045140743255615, + "learning_rate": 4.761613181269376e-05, + "loss": 5.5549, + "step": 23564 + }, + { + "epoch": 0.14014773051669996, + "grad_norm": 2.0041518211364746, + "learning_rate": 4.761593274752099e-05, + "loss": 5.7419, + "step": 23565 + }, + { + "epoch": 0.14015367779998097, + "grad_norm": 1.983040452003479, + "learning_rate": 4.761573367445323e-05, + "loss": 5.761, + "step": 23566 + }, + { + "epoch": 0.14015962508326196, + "grad_norm": 1.6701973676681519, + "learning_rate": 4.761553459349055e-05, + "loss": 5.8376, + "step": 23567 + }, + { + "epoch": 0.14016557236654295, + "grad_norm": 1.3928866386413574, + "learning_rate": 4.761533550463303e-05, + "loss": 5.7623, + "step": 23568 + }, + { + "epoch": 0.14017151964982397, + "grad_norm": 1.5971790552139282, + "learning_rate": 4.761513640788072e-05, + "loss": 5.6896, + "step": 23569 + }, + { + "epoch": 0.14017746693310495, + "grad_norm": 1.655540943145752, + "learning_rate": 4.76149373032337e-05, + "loss": 5.7311, + "step": 23570 + }, + { + "epoch": 0.14018341421638594, + "grad_norm": 1.6018282175064087, + "learning_rate": 4.761473819069204e-05, + "loss": 5.6966, + "step": 23571 + }, + { + "epoch": 0.14018936149966696, + "grad_norm": 2.0446600914001465, + "learning_rate": 4.7614539070255816e-05, + "loss": 4.7235, + "step": 23572 + }, + { + "epoch": 0.14019530878294795, + "grad_norm": 1.6043277978897095, + "learning_rate": 4.761433994192508e-05, + "loss": 5.1602, + "step": 23573 + }, + { + "epoch": 0.14020125606622894, + "grad_norm": 1.7339102029800415, + "learning_rate": 4.761414080569992e-05, + "loss": 4.6082, + "step": 23574 + }, + { + "epoch": 0.14020720334950995, + "grad_norm": 1.9234665632247925, + "learning_rate": 4.761394166158039e-05, + "loss": 5.1365, + "step": 23575 + }, + { + "epoch": 0.14021315063279094, + "grad_norm": 1.7816582918167114, + "learning_rate": 4.7613742509566574e-05, + "loss": 5.4685, + "step": 23576 + }, + { + "epoch": 0.14021909791607193, + "grad_norm": 2.230858564376831, + "learning_rate": 4.7613543349658526e-05, + "loss": 4.0433, + "step": 23577 + }, + { + "epoch": 0.14022504519935294, + "grad_norm": 2.088791847229004, + "learning_rate": 4.761334418185633e-05, + "loss": 4.0262, + "step": 23578 + }, + { + "epoch": 0.14023099248263393, + "grad_norm": 2.2880146503448486, + "learning_rate": 4.761314500616004e-05, + "loss": 3.8526, + "step": 23579 + }, + { + "epoch": 0.14023693976591492, + "grad_norm": 1.428227186203003, + "learning_rate": 4.7612945822569744e-05, + "loss": 5.538, + "step": 23580 + }, + { + "epoch": 0.14024288704919594, + "grad_norm": 1.5487463474273682, + "learning_rate": 4.76127466310855e-05, + "loss": 4.9206, + "step": 23581 + }, + { + "epoch": 0.14024883433247692, + "grad_norm": 1.7598581314086914, + "learning_rate": 4.761254743170738e-05, + "loss": 4.8871, + "step": 23582 + }, + { + "epoch": 0.1402547816157579, + "grad_norm": 1.8421943187713623, + "learning_rate": 4.7612348224435457e-05, + "loss": 4.9177, + "step": 23583 + }, + { + "epoch": 0.14026072889903893, + "grad_norm": 1.9214147329330444, + "learning_rate": 4.761214900926979e-05, + "loss": 5.148, + "step": 23584 + }, + { + "epoch": 0.14026667618231992, + "grad_norm": 1.8675332069396973, + "learning_rate": 4.761194978621045e-05, + "loss": 5.455, + "step": 23585 + }, + { + "epoch": 0.1402726234656009, + "grad_norm": 1.7940279245376587, + "learning_rate": 4.761175055525753e-05, + "loss": 5.3608, + "step": 23586 + }, + { + "epoch": 0.14027857074888192, + "grad_norm": 1.526066541671753, + "learning_rate": 4.761155131641107e-05, + "loss": 5.3672, + "step": 23587 + }, + { + "epoch": 0.1402845180321629, + "grad_norm": 1.7407697439193726, + "learning_rate": 4.761135206967115e-05, + "loss": 5.4809, + "step": 23588 + }, + { + "epoch": 0.1402904653154439, + "grad_norm": 1.8562800884246826, + "learning_rate": 4.761115281503784e-05, + "loss": 5.3086, + "step": 23589 + }, + { + "epoch": 0.1402964125987249, + "grad_norm": 1.7709288597106934, + "learning_rate": 4.7610953552511216e-05, + "loss": 4.8511, + "step": 23590 + }, + { + "epoch": 0.1403023598820059, + "grad_norm": 1.6407638788223267, + "learning_rate": 4.761075428209134e-05, + "loss": 4.7137, + "step": 23591 + }, + { + "epoch": 0.1403083071652869, + "grad_norm": 1.8322784900665283, + "learning_rate": 4.761055500377828e-05, + "loss": 5.1288, + "step": 23592 + }, + { + "epoch": 0.1403142544485679, + "grad_norm": 2.5631179809570312, + "learning_rate": 4.761035571757211e-05, + "loss": 3.9808, + "step": 23593 + }, + { + "epoch": 0.1403202017318489, + "grad_norm": 2.5823936462402344, + "learning_rate": 4.7610156423472895e-05, + "loss": 4.0532, + "step": 23594 + }, + { + "epoch": 0.14032614901512988, + "grad_norm": 3.3013498783111572, + "learning_rate": 4.760995712148072e-05, + "loss": 3.5222, + "step": 23595 + }, + { + "epoch": 0.1403320962984109, + "grad_norm": 2.8877291679382324, + "learning_rate": 4.760975781159563e-05, + "loss": 3.4662, + "step": 23596 + }, + { + "epoch": 0.14033804358169188, + "grad_norm": 2.757053852081299, + "learning_rate": 4.760955849381771e-05, + "loss": 2.9554, + "step": 23597 + }, + { + "epoch": 0.14034399086497287, + "grad_norm": 2.611163854598999, + "learning_rate": 4.760935916814703e-05, + "loss": 3.0722, + "step": 23598 + }, + { + "epoch": 0.1403499381482539, + "grad_norm": 2.5141069889068604, + "learning_rate": 4.760915983458366e-05, + "loss": 2.9377, + "step": 23599 + }, + { + "epoch": 0.14035588543153488, + "grad_norm": 2.88659930229187, + "learning_rate": 4.7608960493127655e-05, + "loss": 2.7086, + "step": 23600 + }, + { + "epoch": 0.14036183271481587, + "grad_norm": 1.4970325231552124, + "learning_rate": 4.7608761143779103e-05, + "loss": 5.279, + "step": 23601 + }, + { + "epoch": 0.14036777999809688, + "grad_norm": 1.883097767829895, + "learning_rate": 4.760856178653806e-05, + "loss": 4.9675, + "step": 23602 + }, + { + "epoch": 0.14037372728137787, + "grad_norm": 1.8045644760131836, + "learning_rate": 4.760836242140461e-05, + "loss": 4.9739, + "step": 23603 + }, + { + "epoch": 0.14037967456465886, + "grad_norm": 2.2752342224121094, + "learning_rate": 4.760816304837881e-05, + "loss": 5.1278, + "step": 23604 + }, + { + "epoch": 0.14038562184793987, + "grad_norm": 1.8345577716827393, + "learning_rate": 4.760796366746074e-05, + "loss": 5.232, + "step": 23605 + }, + { + "epoch": 0.14039156913122086, + "grad_norm": 1.6739290952682495, + "learning_rate": 4.760776427865046e-05, + "loss": 5.1867, + "step": 23606 + }, + { + "epoch": 0.14039751641450185, + "grad_norm": 1.8607251644134521, + "learning_rate": 4.760756488194803e-05, + "loss": 5.1918, + "step": 23607 + }, + { + "epoch": 0.14040346369778287, + "grad_norm": 1.852330207824707, + "learning_rate": 4.760736547735355e-05, + "loss": 5.1462, + "step": 23608 + }, + { + "epoch": 0.14040941098106385, + "grad_norm": 1.738235354423523, + "learning_rate": 4.760716606486706e-05, + "loss": 5.1607, + "step": 23609 + }, + { + "epoch": 0.14041535826434484, + "grad_norm": 1.7101359367370605, + "learning_rate": 4.760696664448865e-05, + "loss": 5.1047, + "step": 23610 + }, + { + "epoch": 0.14042130554762586, + "grad_norm": 1.618538737297058, + "learning_rate": 4.760676721621838e-05, + "loss": 5.034, + "step": 23611 + }, + { + "epoch": 0.14042725283090685, + "grad_norm": 1.5971029996871948, + "learning_rate": 4.760656778005632e-05, + "loss": 5.0689, + "step": 23612 + }, + { + "epoch": 0.14043320011418783, + "grad_norm": 1.7599228620529175, + "learning_rate": 4.760636833600254e-05, + "loss": 5.0584, + "step": 23613 + }, + { + "epoch": 0.14043914739746885, + "grad_norm": 1.7093656063079834, + "learning_rate": 4.7606168884057114e-05, + "loss": 5.0887, + "step": 23614 + }, + { + "epoch": 0.14044509468074984, + "grad_norm": 1.77159583568573, + "learning_rate": 4.760596942422011e-05, + "loss": 4.9885, + "step": 23615 + }, + { + "epoch": 0.14045104196403083, + "grad_norm": 1.6793224811553955, + "learning_rate": 4.7605769956491586e-05, + "loss": 5.7858, + "step": 23616 + }, + { + "epoch": 0.14045698924731181, + "grad_norm": 2.0000784397125244, + "learning_rate": 4.7605570480871624e-05, + "loss": 5.1434, + "step": 23617 + }, + { + "epoch": 0.14046293653059283, + "grad_norm": 1.777692437171936, + "learning_rate": 4.760537099736029e-05, + "loss": 5.237, + "step": 23618 + }, + { + "epoch": 0.14046888381387382, + "grad_norm": 1.7709475755691528, + "learning_rate": 4.760517150595766e-05, + "loss": 5.1844, + "step": 23619 + }, + { + "epoch": 0.1404748310971548, + "grad_norm": 1.5300654172897339, + "learning_rate": 4.76049720066638e-05, + "loss": 5.4657, + "step": 23620 + }, + { + "epoch": 0.14048077838043582, + "grad_norm": 1.5757399797439575, + "learning_rate": 4.7604772499478767e-05, + "loss": 5.7018, + "step": 23621 + }, + { + "epoch": 0.1404867256637168, + "grad_norm": 1.572698712348938, + "learning_rate": 4.760457298440265e-05, + "loss": 5.5974, + "step": 23622 + }, + { + "epoch": 0.1404926729469978, + "grad_norm": 1.7017083168029785, + "learning_rate": 4.760437346143551e-05, + "loss": 5.6591, + "step": 23623 + }, + { + "epoch": 0.14049862023027881, + "grad_norm": 1.496193528175354, + "learning_rate": 4.760417393057741e-05, + "loss": 5.603, + "step": 23624 + }, + { + "epoch": 0.1405045675135598, + "grad_norm": 1.5156760215759277, + "learning_rate": 4.760397439182843e-05, + "loss": 5.5561, + "step": 23625 + }, + { + "epoch": 0.1405105147968408, + "grad_norm": 1.520276665687561, + "learning_rate": 4.760377484518864e-05, + "loss": 5.6208, + "step": 23626 + }, + { + "epoch": 0.1405164620801218, + "grad_norm": 1.6519960165023804, + "learning_rate": 4.760357529065811e-05, + "loss": 5.6191, + "step": 23627 + }, + { + "epoch": 0.1405224093634028, + "grad_norm": 1.6115814447402954, + "learning_rate": 4.760337572823689e-05, + "loss": 5.6622, + "step": 23628 + }, + { + "epoch": 0.14052835664668378, + "grad_norm": 1.6744813919067383, + "learning_rate": 4.760317615792508e-05, + "loss": 4.9525, + "step": 23629 + }, + { + "epoch": 0.1405343039299648, + "grad_norm": 1.8949360847473145, + "learning_rate": 4.7602976579722725e-05, + "loss": 5.2284, + "step": 23630 + }, + { + "epoch": 0.1405402512132458, + "grad_norm": 1.7098066806793213, + "learning_rate": 4.760277699362991e-05, + "loss": 5.6612, + "step": 23631 + }, + { + "epoch": 0.14054619849652678, + "grad_norm": 2.258535861968994, + "learning_rate": 4.76025773996467e-05, + "loss": 5.3049, + "step": 23632 + }, + { + "epoch": 0.1405521457798078, + "grad_norm": 1.713905692100525, + "learning_rate": 4.760237779777316e-05, + "loss": 6.081, + "step": 23633 + }, + { + "epoch": 0.14055809306308878, + "grad_norm": 1.744905710220337, + "learning_rate": 4.760217818800936e-05, + "loss": 5.6269, + "step": 23634 + }, + { + "epoch": 0.14056404034636977, + "grad_norm": 2.032653570175171, + "learning_rate": 4.760197857035538e-05, + "loss": 4.8417, + "step": 23635 + }, + { + "epoch": 0.14056998762965078, + "grad_norm": 1.9457743167877197, + "learning_rate": 4.7601778944811275e-05, + "loss": 4.6145, + "step": 23636 + }, + { + "epoch": 0.14057593491293177, + "grad_norm": 2.0428082942962646, + "learning_rate": 4.760157931137713e-05, + "loss": 4.7341, + "step": 23637 + }, + { + "epoch": 0.14058188219621276, + "grad_norm": 1.8817776441574097, + "learning_rate": 4.7601379670053006e-05, + "loss": 4.4932, + "step": 23638 + }, + { + "epoch": 0.14058782947949378, + "grad_norm": 1.9882752895355225, + "learning_rate": 4.760118002083897e-05, + "loss": 4.5001, + "step": 23639 + }, + { + "epoch": 0.14059377676277476, + "grad_norm": 1.6730908155441284, + "learning_rate": 4.760098036373509e-05, + "loss": 4.2396, + "step": 23640 + }, + { + "epoch": 0.14059972404605575, + "grad_norm": 1.9490888118743896, + "learning_rate": 4.760078069874145e-05, + "loss": 4.2708, + "step": 23641 + }, + { + "epoch": 0.14060567132933677, + "grad_norm": 1.8162645101547241, + "learning_rate": 4.7600581025858114e-05, + "loss": 4.2507, + "step": 23642 + }, + { + "epoch": 0.14061161861261776, + "grad_norm": 1.9260125160217285, + "learning_rate": 4.760038134508514e-05, + "loss": 4.4647, + "step": 23643 + }, + { + "epoch": 0.14061756589589874, + "grad_norm": 1.892685055732727, + "learning_rate": 4.7600181656422616e-05, + "loss": 4.1241, + "step": 23644 + }, + { + "epoch": 0.14062351317917976, + "grad_norm": 1.625123143196106, + "learning_rate": 4.75999819598706e-05, + "loss": 4.3582, + "step": 23645 + }, + { + "epoch": 0.14062946046246075, + "grad_norm": 1.841758131980896, + "learning_rate": 4.759978225542916e-05, + "loss": 4.3403, + "step": 23646 + }, + { + "epoch": 0.14063540774574174, + "grad_norm": 1.8946552276611328, + "learning_rate": 4.759958254309837e-05, + "loss": 4.5008, + "step": 23647 + }, + { + "epoch": 0.14064135502902275, + "grad_norm": 1.7985520362854004, + "learning_rate": 4.75993828228783e-05, + "loss": 4.4869, + "step": 23648 + }, + { + "epoch": 0.14064730231230374, + "grad_norm": 1.823662519454956, + "learning_rate": 4.759918309476902e-05, + "loss": 4.6177, + "step": 23649 + }, + { + "epoch": 0.14065324959558473, + "grad_norm": 1.94038724899292, + "learning_rate": 4.75989833587706e-05, + "loss": 4.4979, + "step": 23650 + }, + { + "epoch": 0.14065919687886574, + "grad_norm": 1.9023078680038452, + "learning_rate": 4.75987836148831e-05, + "loss": 4.3507, + "step": 23651 + }, + { + "epoch": 0.14066514416214673, + "grad_norm": 1.917851448059082, + "learning_rate": 4.7598583863106606e-05, + "loss": 4.1841, + "step": 23652 + }, + { + "epoch": 0.14067109144542772, + "grad_norm": 1.8332593441009521, + "learning_rate": 4.759838410344117e-05, + "loss": 4.4705, + "step": 23653 + }, + { + "epoch": 0.14067703872870874, + "grad_norm": 1.7567338943481445, + "learning_rate": 4.759818433588689e-05, + "loss": 4.5008, + "step": 23654 + }, + { + "epoch": 0.14068298601198972, + "grad_norm": 1.9399288892745972, + "learning_rate": 4.75979845604438e-05, + "loss": 4.3969, + "step": 23655 + }, + { + "epoch": 0.1406889332952707, + "grad_norm": 1.7779430150985718, + "learning_rate": 4.7597784777112e-05, + "loss": 4.3292, + "step": 23656 + }, + { + "epoch": 0.14069488057855173, + "grad_norm": 1.802742600440979, + "learning_rate": 4.759758498589153e-05, + "loss": 5.0038, + "step": 23657 + }, + { + "epoch": 0.14070082786183272, + "grad_norm": 2.5247714519500732, + "learning_rate": 4.759738518678249e-05, + "loss": 5.0153, + "step": 23658 + }, + { + "epoch": 0.1407067751451137, + "grad_norm": 3.0549800395965576, + "learning_rate": 4.759718537978494e-05, + "loss": 4.6653, + "step": 23659 + }, + { + "epoch": 0.14071272242839472, + "grad_norm": 2.7805356979370117, + "learning_rate": 4.7596985564898935e-05, + "loss": 4.4669, + "step": 23660 + }, + { + "epoch": 0.1407186697116757, + "grad_norm": 2.404932737350464, + "learning_rate": 4.759678574212456e-05, + "loss": 4.6932, + "step": 23661 + }, + { + "epoch": 0.1407246169949567, + "grad_norm": 2.2168543338775635, + "learning_rate": 4.7596585911461875e-05, + "loss": 4.397, + "step": 23662 + }, + { + "epoch": 0.1407305642782377, + "grad_norm": 2.423726797103882, + "learning_rate": 4.759638607291097e-05, + "loss": 4.3534, + "step": 23663 + }, + { + "epoch": 0.1407365115615187, + "grad_norm": 2.1283328533172607, + "learning_rate": 4.759618622647188e-05, + "loss": 4.9248, + "step": 23664 + }, + { + "epoch": 0.1407424588447997, + "grad_norm": 1.6989446878433228, + "learning_rate": 4.7595986372144716e-05, + "loss": 5.4656, + "step": 23665 + }, + { + "epoch": 0.1407484061280807, + "grad_norm": 1.7057443857192993, + "learning_rate": 4.759578650992951e-05, + "loss": 5.193, + "step": 23666 + }, + { + "epoch": 0.1407543534113617, + "grad_norm": 2.3968324661254883, + "learning_rate": 4.7595586639826364e-05, + "loss": 5.132, + "step": 23667 + }, + { + "epoch": 0.14076030069464268, + "grad_norm": 1.7770966291427612, + "learning_rate": 4.7595386761835314e-05, + "loss": 4.8487, + "step": 23668 + }, + { + "epoch": 0.1407662479779237, + "grad_norm": 1.8165397644042969, + "learning_rate": 4.759518687595646e-05, + "loss": 4.9981, + "step": 23669 + }, + { + "epoch": 0.14077219526120469, + "grad_norm": 1.4801784753799438, + "learning_rate": 4.759498698218986e-05, + "loss": 5.0204, + "step": 23670 + }, + { + "epoch": 0.14077814254448567, + "grad_norm": 1.6488209962844849, + "learning_rate": 4.759478708053557e-05, + "loss": 4.9349, + "step": 23671 + }, + { + "epoch": 0.1407840898277667, + "grad_norm": 1.5207561254501343, + "learning_rate": 4.759458717099369e-05, + "loss": 4.9986, + "step": 23672 + }, + { + "epoch": 0.14079003711104768, + "grad_norm": 1.5029826164245605, + "learning_rate": 4.7594387253564263e-05, + "loss": 4.9708, + "step": 23673 + }, + { + "epoch": 0.14079598439432867, + "grad_norm": 1.6697144508361816, + "learning_rate": 4.7594187328247375e-05, + "loss": 4.9915, + "step": 23674 + }, + { + "epoch": 0.14080193167760965, + "grad_norm": 1.7437782287597656, + "learning_rate": 4.7593987395043085e-05, + "loss": 5.068, + "step": 23675 + }, + { + "epoch": 0.14080787896089067, + "grad_norm": 1.8639456033706665, + "learning_rate": 4.7593787453951475e-05, + "loss": 4.9861, + "step": 23676 + }, + { + "epoch": 0.14081382624417166, + "grad_norm": 1.7246698141098022, + "learning_rate": 4.75935875049726e-05, + "loss": 4.9547, + "step": 23677 + }, + { + "epoch": 0.14081977352745265, + "grad_norm": 1.764772891998291, + "learning_rate": 4.759338754810654e-05, + "loss": 4.7823, + "step": 23678 + }, + { + "epoch": 0.14082572081073366, + "grad_norm": 1.3609477281570435, + "learning_rate": 4.759318758335336e-05, + "loss": 4.9039, + "step": 23679 + }, + { + "epoch": 0.14083166809401465, + "grad_norm": 1.4477577209472656, + "learning_rate": 4.759298761071313e-05, + "loss": 4.7816, + "step": 23680 + }, + { + "epoch": 0.14083761537729564, + "grad_norm": 1.6295807361602783, + "learning_rate": 4.759278763018592e-05, + "loss": 4.641, + "step": 23681 + }, + { + "epoch": 0.14084356266057665, + "grad_norm": 1.7831028699874878, + "learning_rate": 4.7592587641771806e-05, + "loss": 4.8989, + "step": 23682 + }, + { + "epoch": 0.14084950994385764, + "grad_norm": 1.7806429862976074, + "learning_rate": 4.7592387645470845e-05, + "loss": 4.9344, + "step": 23683 + }, + { + "epoch": 0.14085545722713863, + "grad_norm": 2.0284979343414307, + "learning_rate": 4.759218764128313e-05, + "loss": 5.7399, + "step": 23684 + }, + { + "epoch": 0.14086140451041965, + "grad_norm": 1.853495717048645, + "learning_rate": 4.7591987629208706e-05, + "loss": 4.8495, + "step": 23685 + }, + { + "epoch": 0.14086735179370063, + "grad_norm": 1.6907382011413574, + "learning_rate": 4.759178760924765e-05, + "loss": 4.8365, + "step": 23686 + }, + { + "epoch": 0.14087329907698162, + "grad_norm": 1.7131983041763306, + "learning_rate": 4.7591587581400045e-05, + "loss": 4.8217, + "step": 23687 + }, + { + "epoch": 0.14087924636026264, + "grad_norm": 1.6896579265594482, + "learning_rate": 4.759138754566595e-05, + "loss": 5.4568, + "step": 23688 + }, + { + "epoch": 0.14088519364354363, + "grad_norm": 1.7312794923782349, + "learning_rate": 4.759118750204542e-05, + "loss": 5.7501, + "step": 23689 + }, + { + "epoch": 0.14089114092682462, + "grad_norm": 1.494137167930603, + "learning_rate": 4.759098745053855e-05, + "loss": 5.526, + "step": 23690 + }, + { + "epoch": 0.14089708821010563, + "grad_norm": 2.2159650325775146, + "learning_rate": 4.75907873911454e-05, + "loss": 5.3686, + "step": 23691 + }, + { + "epoch": 0.14090303549338662, + "grad_norm": 2.0564072132110596, + "learning_rate": 4.759058732386603e-05, + "loss": 5.2311, + "step": 23692 + }, + { + "epoch": 0.1409089827766676, + "grad_norm": 2.5233311653137207, + "learning_rate": 4.759038724870053e-05, + "loss": 4.7775, + "step": 23693 + }, + { + "epoch": 0.14091493005994862, + "grad_norm": 2.180325984954834, + "learning_rate": 4.7590187165648956e-05, + "loss": 4.8106, + "step": 23694 + }, + { + "epoch": 0.1409208773432296, + "grad_norm": 2.1391143798828125, + "learning_rate": 4.758998707471138e-05, + "loss": 4.741, + "step": 23695 + }, + { + "epoch": 0.1409268246265106, + "grad_norm": 1.9628124237060547, + "learning_rate": 4.758978697588787e-05, + "loss": 4.7177, + "step": 23696 + }, + { + "epoch": 0.14093277190979162, + "grad_norm": 2.1324729919433594, + "learning_rate": 4.7589586869178506e-05, + "loss": 4.8006, + "step": 23697 + }, + { + "epoch": 0.1409387191930726, + "grad_norm": 1.9791810512542725, + "learning_rate": 4.758938675458335e-05, + "loss": 4.6171, + "step": 23698 + }, + { + "epoch": 0.1409446664763536, + "grad_norm": 1.8566325902938843, + "learning_rate": 4.758918663210247e-05, + "loss": 5.0375, + "step": 23699 + }, + { + "epoch": 0.1409506137596346, + "grad_norm": 2.3218674659729004, + "learning_rate": 4.758898650173593e-05, + "loss": 5.2169, + "step": 23700 + }, + { + "epoch": 0.1409565610429156, + "grad_norm": 2.0162737369537354, + "learning_rate": 4.7588786363483816e-05, + "loss": 4.8988, + "step": 23701 + }, + { + "epoch": 0.14096250832619658, + "grad_norm": 2.1534879207611084, + "learning_rate": 4.7588586217346197e-05, + "loss": 4.9911, + "step": 23702 + }, + { + "epoch": 0.1409684556094776, + "grad_norm": 2.16445255279541, + "learning_rate": 4.7588386063323134e-05, + "loss": 4.9501, + "step": 23703 + }, + { + "epoch": 0.1409744028927586, + "grad_norm": 1.9189707040786743, + "learning_rate": 4.7588185901414684e-05, + "loss": 4.9125, + "step": 23704 + }, + { + "epoch": 0.14098035017603958, + "grad_norm": 2.1000189781188965, + "learning_rate": 4.7587985731620945e-05, + "loss": 5.002, + "step": 23705 + }, + { + "epoch": 0.1409862974593206, + "grad_norm": 2.0911948680877686, + "learning_rate": 4.7587785553941974e-05, + "loss": 5.0206, + "step": 23706 + }, + { + "epoch": 0.14099224474260158, + "grad_norm": 1.9519456624984741, + "learning_rate": 4.758758536837783e-05, + "loss": 4.5715, + "step": 23707 + }, + { + "epoch": 0.14099819202588257, + "grad_norm": 2.1036672592163086, + "learning_rate": 4.75873851749286e-05, + "loss": 4.7427, + "step": 23708 + }, + { + "epoch": 0.14100413930916358, + "grad_norm": 1.6662368774414062, + "learning_rate": 4.7587184973594354e-05, + "loss": 5.1132, + "step": 23709 + }, + { + "epoch": 0.14101008659244457, + "grad_norm": 1.5314775705337524, + "learning_rate": 4.758698476437514e-05, + "loss": 5.6674, + "step": 23710 + }, + { + "epoch": 0.14101603387572556, + "grad_norm": 1.7167651653289795, + "learning_rate": 4.7586784547271056e-05, + "loss": 5.74, + "step": 23711 + }, + { + "epoch": 0.14102198115900658, + "grad_norm": 1.6126611232757568, + "learning_rate": 4.758658432228216e-05, + "loss": 5.7798, + "step": 23712 + }, + { + "epoch": 0.14102792844228756, + "grad_norm": 1.5236903429031372, + "learning_rate": 4.758638408940851e-05, + "loss": 5.3924, + "step": 23713 + }, + { + "epoch": 0.14103387572556855, + "grad_norm": 1.7352653741836548, + "learning_rate": 4.758618384865019e-05, + "loss": 5.3551, + "step": 23714 + }, + { + "epoch": 0.14103982300884957, + "grad_norm": 2.1185758113861084, + "learning_rate": 4.758598360000727e-05, + "loss": 4.5986, + "step": 23715 + }, + { + "epoch": 0.14104577029213056, + "grad_norm": 2.0252137184143066, + "learning_rate": 4.758578334347981e-05, + "loss": 5.5963, + "step": 23716 + }, + { + "epoch": 0.14105171757541154, + "grad_norm": 2.1225454807281494, + "learning_rate": 4.75855830790679e-05, + "loss": 5.1949, + "step": 23717 + }, + { + "epoch": 0.14105766485869256, + "grad_norm": 2.7703025341033936, + "learning_rate": 4.7585382806771585e-05, + "loss": 4.4741, + "step": 23718 + }, + { + "epoch": 0.14106361214197355, + "grad_norm": 1.6570090055465698, + "learning_rate": 4.758518252659094e-05, + "loss": 4.8543, + "step": 23719 + }, + { + "epoch": 0.14106955942525454, + "grad_norm": 1.759743571281433, + "learning_rate": 4.7584982238526053e-05, + "loss": 4.7901, + "step": 23720 + }, + { + "epoch": 0.14107550670853555, + "grad_norm": 1.562591314315796, + "learning_rate": 4.7584781942576976e-05, + "loss": 5.351, + "step": 23721 + }, + { + "epoch": 0.14108145399181654, + "grad_norm": 1.279597520828247, + "learning_rate": 4.758458163874379e-05, + "loss": 6.0303, + "step": 23722 + }, + { + "epoch": 0.14108740127509753, + "grad_norm": 1.3173538446426392, + "learning_rate": 4.758438132702656e-05, + "loss": 6.015, + "step": 23723 + }, + { + "epoch": 0.14109334855837855, + "grad_norm": 1.4862935543060303, + "learning_rate": 4.7584181007425354e-05, + "loss": 5.6649, + "step": 23724 + }, + { + "epoch": 0.14109929584165953, + "grad_norm": 1.8398306369781494, + "learning_rate": 4.7583980679940244e-05, + "loss": 5.3897, + "step": 23725 + }, + { + "epoch": 0.14110524312494052, + "grad_norm": 2.02359676361084, + "learning_rate": 4.758378034457129e-05, + "loss": 5.8195, + "step": 23726 + }, + { + "epoch": 0.14111119040822154, + "grad_norm": 2.131068706512451, + "learning_rate": 4.758358000131858e-05, + "loss": 5.693, + "step": 23727 + }, + { + "epoch": 0.14111713769150253, + "grad_norm": 2.144928455352783, + "learning_rate": 4.7583379650182184e-05, + "loss": 5.4745, + "step": 23728 + }, + { + "epoch": 0.1411230849747835, + "grad_norm": 2.043093681335449, + "learning_rate": 4.758317929116215e-05, + "loss": 5.5877, + "step": 23729 + }, + { + "epoch": 0.14112903225806453, + "grad_norm": 1.7879455089569092, + "learning_rate": 4.758297892425857e-05, + "loss": 5.5822, + "step": 23730 + }, + { + "epoch": 0.14113497954134552, + "grad_norm": 1.6113840341567993, + "learning_rate": 4.7582778549471494e-05, + "loss": 5.2861, + "step": 23731 + }, + { + "epoch": 0.1411409268246265, + "grad_norm": 1.6712645292282104, + "learning_rate": 4.7582578166801015e-05, + "loss": 5.1185, + "step": 23732 + }, + { + "epoch": 0.1411468741079075, + "grad_norm": 1.6905531883239746, + "learning_rate": 4.758237777624719e-05, + "loss": 5.3339, + "step": 23733 + }, + { + "epoch": 0.1411528213911885, + "grad_norm": 2.058136224746704, + "learning_rate": 4.758217737781009e-05, + "loss": 4.6243, + "step": 23734 + }, + { + "epoch": 0.1411587686744695, + "grad_norm": 1.9609389305114746, + "learning_rate": 4.758197697148978e-05, + "loss": 4.7675, + "step": 23735 + }, + { + "epoch": 0.1411647159577505, + "grad_norm": 1.947270154953003, + "learning_rate": 4.758177655728634e-05, + "loss": 4.6854, + "step": 23736 + }, + { + "epoch": 0.1411706632410315, + "grad_norm": 2.0735461711883545, + "learning_rate": 4.7581576135199834e-05, + "loss": 4.9539, + "step": 23737 + }, + { + "epoch": 0.1411766105243125, + "grad_norm": 2.0236589908599854, + "learning_rate": 4.758137570523033e-05, + "loss": 5.0488, + "step": 23738 + }, + { + "epoch": 0.14118255780759348, + "grad_norm": 2.1183953285217285, + "learning_rate": 4.7581175267377906e-05, + "loss": 4.9358, + "step": 23739 + }, + { + "epoch": 0.1411885050908745, + "grad_norm": 2.0142831802368164, + "learning_rate": 4.758097482164262e-05, + "loss": 4.8333, + "step": 23740 + }, + { + "epoch": 0.14119445237415548, + "grad_norm": 2.204681634902954, + "learning_rate": 4.758077436802455e-05, + "loss": 4.8852, + "step": 23741 + }, + { + "epoch": 0.14120039965743647, + "grad_norm": 2.216187000274658, + "learning_rate": 4.7580573906523774e-05, + "loss": 5.0268, + "step": 23742 + }, + { + "epoch": 0.1412063469407175, + "grad_norm": 2.1434781551361084, + "learning_rate": 4.7580373437140343e-05, + "loss": 4.9048, + "step": 23743 + }, + { + "epoch": 0.14121229422399847, + "grad_norm": 1.8260117769241333, + "learning_rate": 4.758017295987435e-05, + "loss": 5.0481, + "step": 23744 + }, + { + "epoch": 0.14121824150727946, + "grad_norm": 2.2184064388275146, + "learning_rate": 4.757997247472584e-05, + "loss": 4.8967, + "step": 23745 + }, + { + "epoch": 0.14122418879056048, + "grad_norm": 1.8644381761550903, + "learning_rate": 4.75797719816949e-05, + "loss": 5.1945, + "step": 23746 + }, + { + "epoch": 0.14123013607384147, + "grad_norm": 2.0591354370117188, + "learning_rate": 4.757957148078159e-05, + "loss": 4.8916, + "step": 23747 + }, + { + "epoch": 0.14123608335712245, + "grad_norm": 2.429004669189453, + "learning_rate": 4.7579370971985986e-05, + "loss": 4.555, + "step": 23748 + }, + { + "epoch": 0.14124203064040347, + "grad_norm": 2.451037883758545, + "learning_rate": 4.757917045530816e-05, + "loss": 4.663, + "step": 23749 + }, + { + "epoch": 0.14124797792368446, + "grad_norm": 1.8227989673614502, + "learning_rate": 4.7578969930748176e-05, + "loss": 5.6976, + "step": 23750 + }, + { + "epoch": 0.14125392520696545, + "grad_norm": 1.8706707954406738, + "learning_rate": 4.757876939830611e-05, + "loss": 6.0974, + "step": 23751 + }, + { + "epoch": 0.14125987249024646, + "grad_norm": 1.7714571952819824, + "learning_rate": 4.7578568857982025e-05, + "loss": 5.5516, + "step": 23752 + }, + { + "epoch": 0.14126581977352745, + "grad_norm": 2.067776679992676, + "learning_rate": 4.7578368309776e-05, + "loss": 5.296, + "step": 23753 + }, + { + "epoch": 0.14127176705680844, + "grad_norm": 1.9231433868408203, + "learning_rate": 4.7578167753688095e-05, + "loss": 5.1286, + "step": 23754 + }, + { + "epoch": 0.14127771434008946, + "grad_norm": 2.0858731269836426, + "learning_rate": 4.7577967189718386e-05, + "loss": 4.717, + "step": 23755 + }, + { + "epoch": 0.14128366162337044, + "grad_norm": 2.173215627670288, + "learning_rate": 4.757776661786694e-05, + "loss": 4.6995, + "step": 23756 + }, + { + "epoch": 0.14128960890665143, + "grad_norm": 2.008244037628174, + "learning_rate": 4.7577566038133834e-05, + "loss": 4.4147, + "step": 23757 + }, + { + "epoch": 0.14129555618993245, + "grad_norm": 1.9767186641693115, + "learning_rate": 4.757736545051913e-05, + "loss": 4.9901, + "step": 23758 + }, + { + "epoch": 0.14130150347321344, + "grad_norm": 1.860136866569519, + "learning_rate": 4.7577164855022905e-05, + "loss": 4.7252, + "step": 23759 + }, + { + "epoch": 0.14130745075649442, + "grad_norm": 1.9243319034576416, + "learning_rate": 4.757696425164522e-05, + "loss": 4.6387, + "step": 23760 + }, + { + "epoch": 0.14131339803977544, + "grad_norm": 1.9811434745788574, + "learning_rate": 4.7576763640386155e-05, + "loss": 4.7365, + "step": 23761 + }, + { + "epoch": 0.14131934532305643, + "grad_norm": 2.1552014350891113, + "learning_rate": 4.757656302124577e-05, + "loss": 4.4764, + "step": 23762 + }, + { + "epoch": 0.14132529260633742, + "grad_norm": 1.8660786151885986, + "learning_rate": 4.757636239422414e-05, + "loss": 4.6108, + "step": 23763 + }, + { + "epoch": 0.14133123988961843, + "grad_norm": 2.0548014640808105, + "learning_rate": 4.757616175932134e-05, + "loss": 4.3871, + "step": 23764 + }, + { + "epoch": 0.14133718717289942, + "grad_norm": 2.107966184616089, + "learning_rate": 4.757596111653743e-05, + "loss": 4.3013, + "step": 23765 + }, + { + "epoch": 0.1413431344561804, + "grad_norm": 2.062649726867676, + "learning_rate": 4.757576046587249e-05, + "loss": 4.3352, + "step": 23766 + }, + { + "epoch": 0.14134908173946142, + "grad_norm": 1.9424866437911987, + "learning_rate": 4.7575559807326584e-05, + "loss": 4.5538, + "step": 23767 + }, + { + "epoch": 0.1413550290227424, + "grad_norm": 1.9787993431091309, + "learning_rate": 4.757535914089978e-05, + "loss": 4.7105, + "step": 23768 + }, + { + "epoch": 0.1413609763060234, + "grad_norm": 2.3590548038482666, + "learning_rate": 4.7575158466592154e-05, + "loss": 4.5962, + "step": 23769 + }, + { + "epoch": 0.14136692358930442, + "grad_norm": 2.3521318435668945, + "learning_rate": 4.757495778440377e-05, + "loss": 4.8107, + "step": 23770 + }, + { + "epoch": 0.1413728708725854, + "grad_norm": 2.079169273376465, + "learning_rate": 4.7574757094334696e-05, + "loss": 4.6617, + "step": 23771 + }, + { + "epoch": 0.1413788181558664, + "grad_norm": 2.020505428314209, + "learning_rate": 4.757455639638502e-05, + "loss": 4.9402, + "step": 23772 + }, + { + "epoch": 0.1413847654391474, + "grad_norm": 1.8023982048034668, + "learning_rate": 4.75743556905548e-05, + "loss": 5.7173, + "step": 23773 + }, + { + "epoch": 0.1413907127224284, + "grad_norm": 1.471612572669983, + "learning_rate": 4.75741549768441e-05, + "loss": 5.6359, + "step": 23774 + }, + { + "epoch": 0.14139666000570938, + "grad_norm": 1.691918969154358, + "learning_rate": 4.7573954255252996e-05, + "loss": 5.6043, + "step": 23775 + }, + { + "epoch": 0.1414026072889904, + "grad_norm": 1.5347981452941895, + "learning_rate": 4.757375352578156e-05, + "loss": 5.9488, + "step": 23776 + }, + { + "epoch": 0.1414085545722714, + "grad_norm": 1.6003544330596924, + "learning_rate": 4.757355278842985e-05, + "loss": 5.4831, + "step": 23777 + }, + { + "epoch": 0.14141450185555238, + "grad_norm": 1.868674397468567, + "learning_rate": 4.757335204319796e-05, + "loss": 5.3372, + "step": 23778 + }, + { + "epoch": 0.1414204491388334, + "grad_norm": 1.827628254890442, + "learning_rate": 4.7573151290085935e-05, + "loss": 5.2977, + "step": 23779 + }, + { + "epoch": 0.14142639642211438, + "grad_norm": 1.80328369140625, + "learning_rate": 4.757295052909386e-05, + "loss": 5.2484, + "step": 23780 + }, + { + "epoch": 0.14143234370539537, + "grad_norm": 1.7244900465011597, + "learning_rate": 4.7572749760221815e-05, + "loss": 5.341, + "step": 23781 + }, + { + "epoch": 0.14143829098867639, + "grad_norm": 1.6203787326812744, + "learning_rate": 4.757254898346984e-05, + "loss": 5.1993, + "step": 23782 + }, + { + "epoch": 0.14144423827195737, + "grad_norm": 1.7411043643951416, + "learning_rate": 4.7572348198838026e-05, + "loss": 5.177, + "step": 23783 + }, + { + "epoch": 0.14145018555523836, + "grad_norm": 1.6770362854003906, + "learning_rate": 4.7572147406326435e-05, + "loss": 5.2169, + "step": 23784 + }, + { + "epoch": 0.14145613283851938, + "grad_norm": 1.6283633708953857, + "learning_rate": 4.7571946605935146e-05, + "loss": 5.1338, + "step": 23785 + }, + { + "epoch": 0.14146208012180037, + "grad_norm": 1.601276159286499, + "learning_rate": 4.7571745797664215e-05, + "loss": 5.0783, + "step": 23786 + }, + { + "epoch": 0.14146802740508135, + "grad_norm": 1.7484774589538574, + "learning_rate": 4.757154498151373e-05, + "loss": 5.106, + "step": 23787 + }, + { + "epoch": 0.14147397468836237, + "grad_norm": 1.8326083421707153, + "learning_rate": 4.7571344157483744e-05, + "loss": 5.0202, + "step": 23788 + }, + { + "epoch": 0.14147992197164336, + "grad_norm": 1.7564448118209839, + "learning_rate": 4.757114332557434e-05, + "loss": 5.0854, + "step": 23789 + }, + { + "epoch": 0.14148586925492435, + "grad_norm": 1.776414394378662, + "learning_rate": 4.757094248578558e-05, + "loss": 5.049, + "step": 23790 + }, + { + "epoch": 0.14149181653820536, + "grad_norm": 1.6053420305252075, + "learning_rate": 4.757074163811754e-05, + "loss": 5.1644, + "step": 23791 + }, + { + "epoch": 0.14149776382148635, + "grad_norm": 1.9419928789138794, + "learning_rate": 4.7570540782570295e-05, + "loss": 5.6868, + "step": 23792 + }, + { + "epoch": 0.14150371110476734, + "grad_norm": 1.8629308938980103, + "learning_rate": 4.757033991914389e-05, + "loss": 5.6614, + "step": 23793 + }, + { + "epoch": 0.14150965838804833, + "grad_norm": 1.745348572731018, + "learning_rate": 4.757013904783842e-05, + "loss": 5.6742, + "step": 23794 + }, + { + "epoch": 0.14151560567132934, + "grad_norm": 1.8093681335449219, + "learning_rate": 4.756993816865396e-05, + "loss": 5.8902, + "step": 23795 + }, + { + "epoch": 0.14152155295461033, + "grad_norm": 1.8000177145004272, + "learning_rate": 4.7569737281590554e-05, + "loss": 5.7025, + "step": 23796 + }, + { + "epoch": 0.14152750023789132, + "grad_norm": 1.7782033681869507, + "learning_rate": 4.756953638664829e-05, + "loss": 5.492, + "step": 23797 + }, + { + "epoch": 0.14153344752117233, + "grad_norm": 1.7651612758636475, + "learning_rate": 4.756933548382723e-05, + "loss": 4.8989, + "step": 23798 + }, + { + "epoch": 0.14153939480445332, + "grad_norm": 2.0286474227905273, + "learning_rate": 4.756913457312745e-05, + "loss": 4.5672, + "step": 23799 + }, + { + "epoch": 0.1415453420877343, + "grad_norm": 2.361325740814209, + "learning_rate": 4.756893365454902e-05, + "loss": 4.6471, + "step": 23800 + }, + { + "epoch": 0.14155128937101533, + "grad_norm": 1.8565771579742432, + "learning_rate": 4.756873272809202e-05, + "loss": 4.589, + "step": 23801 + }, + { + "epoch": 0.14155723665429631, + "grad_norm": 1.895958662033081, + "learning_rate": 4.756853179375649e-05, + "loss": 4.4608, + "step": 23802 + }, + { + "epoch": 0.1415631839375773, + "grad_norm": 2.103283166885376, + "learning_rate": 4.756833085154252e-05, + "loss": 4.3885, + "step": 23803 + }, + { + "epoch": 0.14156913122085832, + "grad_norm": 2.0823607444763184, + "learning_rate": 4.756812990145019e-05, + "loss": 4.307, + "step": 23804 + }, + { + "epoch": 0.1415750785041393, + "grad_norm": 1.852010726928711, + "learning_rate": 4.7567928943479546e-05, + "loss": 4.7289, + "step": 23805 + }, + { + "epoch": 0.1415810257874203, + "grad_norm": 1.6223875284194946, + "learning_rate": 4.7567727977630685e-05, + "loss": 5.5772, + "step": 23806 + }, + { + "epoch": 0.1415869730707013, + "grad_norm": 1.9508872032165527, + "learning_rate": 4.756752700390366e-05, + "loss": 5.3001, + "step": 23807 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 1.6098484992980957, + "learning_rate": 4.756732602229853e-05, + "loss": 5.2318, + "step": 23808 + }, + { + "epoch": 0.1415988676372633, + "grad_norm": 1.4197050333023071, + "learning_rate": 4.7567125032815394e-05, + "loss": 4.9752, + "step": 23809 + }, + { + "epoch": 0.1416048149205443, + "grad_norm": 1.5325055122375488, + "learning_rate": 4.7566924035454305e-05, + "loss": 5.0156, + "step": 23810 + }, + { + "epoch": 0.1416107622038253, + "grad_norm": 1.7188338041305542, + "learning_rate": 4.7566723030215335e-05, + "loss": 5.3756, + "step": 23811 + }, + { + "epoch": 0.14161670948710628, + "grad_norm": 1.779646396636963, + "learning_rate": 4.756652201709856e-05, + "loss": 5.3844, + "step": 23812 + }, + { + "epoch": 0.1416226567703873, + "grad_norm": 1.913001298904419, + "learning_rate": 4.756632099610404e-05, + "loss": 5.2604, + "step": 23813 + }, + { + "epoch": 0.14162860405366828, + "grad_norm": 1.5379444360733032, + "learning_rate": 4.7566119967231846e-05, + "loss": 5.4184, + "step": 23814 + }, + { + "epoch": 0.14163455133694927, + "grad_norm": 2.5433242321014404, + "learning_rate": 4.756591893048206e-05, + "loss": 5.076, + "step": 23815 + }, + { + "epoch": 0.1416404986202303, + "grad_norm": 2.0431840419769287, + "learning_rate": 4.756571788585474e-05, + "loss": 5.0766, + "step": 23816 + }, + { + "epoch": 0.14164644590351128, + "grad_norm": 2.5103769302368164, + "learning_rate": 4.7565516833349964e-05, + "loss": 4.9539, + "step": 23817 + }, + { + "epoch": 0.14165239318679226, + "grad_norm": 1.563063383102417, + "learning_rate": 4.75653157729678e-05, + "loss": 5.4752, + "step": 23818 + }, + { + "epoch": 0.14165834047007328, + "grad_norm": 1.8695935010910034, + "learning_rate": 4.756511470470832e-05, + "loss": 5.4486, + "step": 23819 + }, + { + "epoch": 0.14166428775335427, + "grad_norm": 2.092947244644165, + "learning_rate": 4.756491362857158e-05, + "loss": 5.1404, + "step": 23820 + }, + { + "epoch": 0.14167023503663526, + "grad_norm": 1.8582149744033813, + "learning_rate": 4.756471254455768e-05, + "loss": 5.0814, + "step": 23821 + }, + { + "epoch": 0.14167618231991627, + "grad_norm": 3.3430545330047607, + "learning_rate": 4.756451145266666e-05, + "loss": 5.2346, + "step": 23822 + }, + { + "epoch": 0.14168212960319726, + "grad_norm": 2.023859977722168, + "learning_rate": 4.75643103528986e-05, + "loss": 5.1639, + "step": 23823 + }, + { + "epoch": 0.14168807688647825, + "grad_norm": 2.0848581790924072, + "learning_rate": 4.756410924525358e-05, + "loss": 5.1314, + "step": 23824 + }, + { + "epoch": 0.14169402416975926, + "grad_norm": 2.2708516120910645, + "learning_rate": 4.7563908129731663e-05, + "loss": 5.1218, + "step": 23825 + }, + { + "epoch": 0.14169997145304025, + "grad_norm": 1.9105170965194702, + "learning_rate": 4.7563707006332905e-05, + "loss": 5.0428, + "step": 23826 + }, + { + "epoch": 0.14170591873632124, + "grad_norm": 1.9914016723632812, + "learning_rate": 4.75635058750574e-05, + "loss": 5.0497, + "step": 23827 + }, + { + "epoch": 0.14171186601960226, + "grad_norm": 1.9820994138717651, + "learning_rate": 4.756330473590521e-05, + "loss": 5.1161, + "step": 23828 + }, + { + "epoch": 0.14171781330288324, + "grad_norm": 1.7676537036895752, + "learning_rate": 4.75631035888764e-05, + "loss": 5.0291, + "step": 23829 + }, + { + "epoch": 0.14172376058616423, + "grad_norm": 1.9614083766937256, + "learning_rate": 4.7562902433971046e-05, + "loss": 5.3574, + "step": 23830 + }, + { + "epoch": 0.14172970786944525, + "grad_norm": 1.4212971925735474, + "learning_rate": 4.756270127118921e-05, + "loss": 5.8053, + "step": 23831 + }, + { + "epoch": 0.14173565515272624, + "grad_norm": 1.6015945672988892, + "learning_rate": 4.7562500100530984e-05, + "loss": 5.9339, + "step": 23832 + }, + { + "epoch": 0.14174160243600722, + "grad_norm": 1.6133309602737427, + "learning_rate": 4.7562298921996405e-05, + "loss": 5.4939, + "step": 23833 + }, + { + "epoch": 0.14174754971928824, + "grad_norm": 1.514958381652832, + "learning_rate": 4.7562097735585565e-05, + "loss": 5.649, + "step": 23834 + }, + { + "epoch": 0.14175349700256923, + "grad_norm": 1.912479281425476, + "learning_rate": 4.756189654129853e-05, + "loss": 5.5304, + "step": 23835 + }, + { + "epoch": 0.14175944428585022, + "grad_norm": 2.149765968322754, + "learning_rate": 4.756169533913538e-05, + "loss": 5.4228, + "step": 23836 + }, + { + "epoch": 0.14176539156913123, + "grad_norm": 1.8468290567398071, + "learning_rate": 4.756149412909616e-05, + "loss": 5.4605, + "step": 23837 + }, + { + "epoch": 0.14177133885241222, + "grad_norm": 1.670300841331482, + "learning_rate": 4.756129291118097e-05, + "loss": 5.4537, + "step": 23838 + }, + { + "epoch": 0.1417772861356932, + "grad_norm": 1.8857238292694092, + "learning_rate": 4.756109168538985e-05, + "loss": 5.2654, + "step": 23839 + }, + { + "epoch": 0.14178323341897422, + "grad_norm": 1.9114692211151123, + "learning_rate": 4.7560890451722894e-05, + "loss": 5.3255, + "step": 23840 + }, + { + "epoch": 0.1417891807022552, + "grad_norm": 1.654356598854065, + "learning_rate": 4.7560689210180164e-05, + "loss": 5.2983, + "step": 23841 + }, + { + "epoch": 0.1417951279855362, + "grad_norm": 1.9302277565002441, + "learning_rate": 4.7560487960761734e-05, + "loss": 5.7902, + "step": 23842 + }, + { + "epoch": 0.14180107526881722, + "grad_norm": 1.8009575605392456, + "learning_rate": 4.7560286703467674e-05, + "loss": 5.7359, + "step": 23843 + }, + { + "epoch": 0.1418070225520982, + "grad_norm": 1.4472894668579102, + "learning_rate": 4.7560085438298043e-05, + "loss": 5.813, + "step": 23844 + }, + { + "epoch": 0.1418129698353792, + "grad_norm": 1.6131559610366821, + "learning_rate": 4.755988416525292e-05, + "loss": 5.7525, + "step": 23845 + }, + { + "epoch": 0.1418189171186602, + "grad_norm": 1.4684244394302368, + "learning_rate": 4.755968288433237e-05, + "loss": 5.7649, + "step": 23846 + }, + { + "epoch": 0.1418248644019412, + "grad_norm": 1.369974970817566, + "learning_rate": 4.755948159553647e-05, + "loss": 5.666, + "step": 23847 + }, + { + "epoch": 0.14183081168522219, + "grad_norm": 1.6687818765640259, + "learning_rate": 4.755928029886529e-05, + "loss": 5.5685, + "step": 23848 + }, + { + "epoch": 0.1418367589685032, + "grad_norm": 2.011798858642578, + "learning_rate": 4.755907899431891e-05, + "loss": 6.0011, + "step": 23849 + }, + { + "epoch": 0.1418427062517842, + "grad_norm": 2.1938908100128174, + "learning_rate": 4.7558877681897376e-05, + "loss": 5.4987, + "step": 23850 + }, + { + "epoch": 0.14184865353506518, + "grad_norm": 1.9103244543075562, + "learning_rate": 4.7558676361600774e-05, + "loss": 5.5061, + "step": 23851 + }, + { + "epoch": 0.14185460081834617, + "grad_norm": 1.850809097290039, + "learning_rate": 4.7558475033429165e-05, + "loss": 5.4346, + "step": 23852 + }, + { + "epoch": 0.14186054810162718, + "grad_norm": 1.6861615180969238, + "learning_rate": 4.755827369738263e-05, + "loss": 5.5082, + "step": 23853 + }, + { + "epoch": 0.14186649538490817, + "grad_norm": 1.532423496246338, + "learning_rate": 4.7558072353461236e-05, + "loss": 5.704, + "step": 23854 + }, + { + "epoch": 0.14187244266818916, + "grad_norm": 1.6446877717971802, + "learning_rate": 4.755787100166506e-05, + "loss": 5.7046, + "step": 23855 + }, + { + "epoch": 0.14187838995147017, + "grad_norm": 1.599294662475586, + "learning_rate": 4.7557669641994144e-05, + "loss": 5.7324, + "step": 23856 + }, + { + "epoch": 0.14188433723475116, + "grad_norm": 1.8838186264038086, + "learning_rate": 4.7557468274448594e-05, + "loss": 5.5496, + "step": 23857 + }, + { + "epoch": 0.14189028451803215, + "grad_norm": 1.8579468727111816, + "learning_rate": 4.7557266899028464e-05, + "loss": 5.6645, + "step": 23858 + }, + { + "epoch": 0.14189623180131317, + "grad_norm": 2.02162766456604, + "learning_rate": 4.7557065515733815e-05, + "loss": 5.7992, + "step": 23859 + }, + { + "epoch": 0.14190217908459415, + "grad_norm": 1.559417486190796, + "learning_rate": 4.755686412456474e-05, + "loss": 5.6176, + "step": 23860 + }, + { + "epoch": 0.14190812636787514, + "grad_norm": 1.5074375867843628, + "learning_rate": 4.755666272552129e-05, + "loss": 5.3933, + "step": 23861 + }, + { + "epoch": 0.14191407365115616, + "grad_norm": 1.521987795829773, + "learning_rate": 4.755646131860354e-05, + "loss": 5.834, + "step": 23862 + }, + { + "epoch": 0.14192002093443715, + "grad_norm": 1.7396782636642456, + "learning_rate": 4.755625990381157e-05, + "loss": 5.149, + "step": 23863 + }, + { + "epoch": 0.14192596821771813, + "grad_norm": 1.7040945291519165, + "learning_rate": 4.755605848114544e-05, + "loss": 5.1569, + "step": 23864 + }, + { + "epoch": 0.14193191550099915, + "grad_norm": 1.7336739301681519, + "learning_rate": 4.7555857050605217e-05, + "loss": 5.1509, + "step": 23865 + }, + { + "epoch": 0.14193786278428014, + "grad_norm": 1.6548901796340942, + "learning_rate": 4.755565561219099e-05, + "loss": 4.9829, + "step": 23866 + }, + { + "epoch": 0.14194381006756113, + "grad_norm": 1.9203529357910156, + "learning_rate": 4.7555454165902804e-05, + "loss": 4.8946, + "step": 23867 + }, + { + "epoch": 0.14194975735084214, + "grad_norm": 1.8711525201797485, + "learning_rate": 4.755525271174074e-05, + "loss": 4.9691, + "step": 23868 + }, + { + "epoch": 0.14195570463412313, + "grad_norm": 1.8115698099136353, + "learning_rate": 4.755505124970488e-05, + "loss": 4.7342, + "step": 23869 + }, + { + "epoch": 0.14196165191740412, + "grad_norm": 1.996324896812439, + "learning_rate": 4.7554849779795284e-05, + "loss": 4.8892, + "step": 23870 + }, + { + "epoch": 0.14196759920068514, + "grad_norm": 1.7132238149642944, + "learning_rate": 4.7554648302012015e-05, + "loss": 4.7785, + "step": 23871 + }, + { + "epoch": 0.14197354648396612, + "grad_norm": 1.8130909204483032, + "learning_rate": 4.755444681635516e-05, + "loss": 4.9106, + "step": 23872 + }, + { + "epoch": 0.1419794937672471, + "grad_norm": 1.8058964014053345, + "learning_rate": 4.755424532282478e-05, + "loss": 4.7486, + "step": 23873 + }, + { + "epoch": 0.14198544105052813, + "grad_norm": 3.171724557876587, + "learning_rate": 4.755404382142094e-05, + "loss": 4.7696, + "step": 23874 + }, + { + "epoch": 0.14199138833380912, + "grad_norm": 1.99362313747406, + "learning_rate": 4.755384231214372e-05, + "loss": 4.6704, + "step": 23875 + }, + { + "epoch": 0.1419973356170901, + "grad_norm": 1.3904173374176025, + "learning_rate": 4.755364079499318e-05, + "loss": 5.6621, + "step": 23876 + }, + { + "epoch": 0.14200328290037112, + "grad_norm": 1.4735981225967407, + "learning_rate": 4.7553439269969415e-05, + "loss": 5.5464, + "step": 23877 + }, + { + "epoch": 0.1420092301836521, + "grad_norm": 1.3085891008377075, + "learning_rate": 4.755323773707246e-05, + "loss": 5.4913, + "step": 23878 + }, + { + "epoch": 0.1420151774669331, + "grad_norm": 1.627657175064087, + "learning_rate": 4.755303619630241e-05, + "loss": 5.4001, + "step": 23879 + }, + { + "epoch": 0.1420211247502141, + "grad_norm": 1.8672151565551758, + "learning_rate": 4.755283464765933e-05, + "loss": 5.5518, + "step": 23880 + }, + { + "epoch": 0.1420270720334951, + "grad_norm": 1.8344969749450684, + "learning_rate": 4.755263309114328e-05, + "loss": 5.2819, + "step": 23881 + }, + { + "epoch": 0.1420330193167761, + "grad_norm": 1.8662999868392944, + "learning_rate": 4.755243152675434e-05, + "loss": 5.3128, + "step": 23882 + }, + { + "epoch": 0.1420389666000571, + "grad_norm": 1.6729795932769775, + "learning_rate": 4.755222995449259e-05, + "loss": 5.1282, + "step": 23883 + }, + { + "epoch": 0.1420449138833381, + "grad_norm": 2.925039529800415, + "learning_rate": 4.7552028374358074e-05, + "loss": 4.9187, + "step": 23884 + }, + { + "epoch": 0.14205086116661908, + "grad_norm": 2.414885997772217, + "learning_rate": 4.755182678635089e-05, + "loss": 5.219, + "step": 23885 + }, + { + "epoch": 0.1420568084499001, + "grad_norm": 1.7273744344711304, + "learning_rate": 4.7551625190471095e-05, + "loss": 5.1296, + "step": 23886 + }, + { + "epoch": 0.14206275573318108, + "grad_norm": 1.691588044166565, + "learning_rate": 4.755142358671876e-05, + "loss": 5.3328, + "step": 23887 + }, + { + "epoch": 0.14206870301646207, + "grad_norm": 1.6644389629364014, + "learning_rate": 4.755122197509395e-05, + "loss": 6.162, + "step": 23888 + }, + { + "epoch": 0.1420746502997431, + "grad_norm": 1.7232459783554077, + "learning_rate": 4.7551020355596744e-05, + "loss": 6.1469, + "step": 23889 + }, + { + "epoch": 0.14208059758302408, + "grad_norm": 1.4883437156677246, + "learning_rate": 4.7550818728227206e-05, + "loss": 6.1803, + "step": 23890 + }, + { + "epoch": 0.14208654486630506, + "grad_norm": 1.4301148653030396, + "learning_rate": 4.7550617092985425e-05, + "loss": 6.0918, + "step": 23891 + }, + { + "epoch": 0.14209249214958608, + "grad_norm": 1.4922714233398438, + "learning_rate": 4.755041544987144e-05, + "loss": 5.8328, + "step": 23892 + }, + { + "epoch": 0.14209843943286707, + "grad_norm": 1.9683314561843872, + "learning_rate": 4.7550213798885345e-05, + "loss": 5.3362, + "step": 23893 + }, + { + "epoch": 0.14210438671614806, + "grad_norm": 1.841512680053711, + "learning_rate": 4.755001214002721e-05, + "loss": 5.1776, + "step": 23894 + }, + { + "epoch": 0.14211033399942907, + "grad_norm": 1.615190863609314, + "learning_rate": 4.7549810473297085e-05, + "loss": 5.4266, + "step": 23895 + }, + { + "epoch": 0.14211628128271006, + "grad_norm": 1.728252649307251, + "learning_rate": 4.7549608798695065e-05, + "loss": 5.5736, + "step": 23896 + }, + { + "epoch": 0.14212222856599105, + "grad_norm": 1.5590336322784424, + "learning_rate": 4.75494071162212e-05, + "loss": 5.4725, + "step": 23897 + }, + { + "epoch": 0.14212817584927206, + "grad_norm": 1.5246217250823975, + "learning_rate": 4.7549205425875585e-05, + "loss": 5.3707, + "step": 23898 + }, + { + "epoch": 0.14213412313255305, + "grad_norm": 1.4803682565689087, + "learning_rate": 4.754900372765826e-05, + "loss": 5.5735, + "step": 23899 + }, + { + "epoch": 0.14214007041583404, + "grad_norm": 1.633510947227478, + "learning_rate": 4.7548802021569315e-05, + "loss": 5.3334, + "step": 23900 + }, + { + "epoch": 0.14214601769911506, + "grad_norm": 1.9321861267089844, + "learning_rate": 4.754860030760882e-05, + "loss": 5.3384, + "step": 23901 + }, + { + "epoch": 0.14215196498239605, + "grad_norm": 1.858965516090393, + "learning_rate": 4.7548398585776844e-05, + "loss": 5.4072, + "step": 23902 + }, + { + "epoch": 0.14215791226567703, + "grad_norm": 1.7266136407852173, + "learning_rate": 4.754819685607345e-05, + "loss": 5.3865, + "step": 23903 + }, + { + "epoch": 0.14216385954895805, + "grad_norm": 1.579783320426941, + "learning_rate": 4.754799511849871e-05, + "loss": 5.3524, + "step": 23904 + }, + { + "epoch": 0.14216980683223904, + "grad_norm": 1.5112273693084717, + "learning_rate": 4.7547793373052704e-05, + "loss": 5.3411, + "step": 23905 + }, + { + "epoch": 0.14217575411552003, + "grad_norm": 1.5031278133392334, + "learning_rate": 4.754759161973549e-05, + "loss": 5.3782, + "step": 23906 + }, + { + "epoch": 0.14218170139880104, + "grad_norm": 1.581784963607788, + "learning_rate": 4.7547389858547155e-05, + "loss": 5.2722, + "step": 23907 + }, + { + "epoch": 0.14218764868208203, + "grad_norm": 1.350386619567871, + "learning_rate": 4.754718808948775e-05, + "loss": 5.5733, + "step": 23908 + }, + { + "epoch": 0.14219359596536302, + "grad_norm": 1.5469433069229126, + "learning_rate": 4.754698631255736e-05, + "loss": 5.7556, + "step": 23909 + }, + { + "epoch": 0.142199543248644, + "grad_norm": 1.5234500169754028, + "learning_rate": 4.754678452775604e-05, + "loss": 5.9086, + "step": 23910 + }, + { + "epoch": 0.14220549053192502, + "grad_norm": 1.4361084699630737, + "learning_rate": 4.754658273508388e-05, + "loss": 5.7659, + "step": 23911 + }, + { + "epoch": 0.142211437815206, + "grad_norm": 1.5128140449523926, + "learning_rate": 4.754638093454094e-05, + "loss": 5.7307, + "step": 23912 + }, + { + "epoch": 0.142217385098487, + "grad_norm": 1.4324685335159302, + "learning_rate": 4.754617912612729e-05, + "loss": 5.4717, + "step": 23913 + }, + { + "epoch": 0.14222333238176801, + "grad_norm": 1.8225339651107788, + "learning_rate": 4.7545977309843004e-05, + "loss": 5.3876, + "step": 23914 + }, + { + "epoch": 0.142229279665049, + "grad_norm": 1.6822171211242676, + "learning_rate": 4.754577548568815e-05, + "loss": 5.5243, + "step": 23915 + }, + { + "epoch": 0.14223522694833, + "grad_norm": 1.7231889963150024, + "learning_rate": 4.754557365366279e-05, + "loss": 5.9398, + "step": 23916 + }, + { + "epoch": 0.142241174231611, + "grad_norm": 1.6815425157546997, + "learning_rate": 4.754537181376702e-05, + "loss": 6.0264, + "step": 23917 + }, + { + "epoch": 0.142247121514892, + "grad_norm": 1.599161148071289, + "learning_rate": 4.754516996600088e-05, + "loss": 6.0783, + "step": 23918 + }, + { + "epoch": 0.14225306879817298, + "grad_norm": 1.565960168838501, + "learning_rate": 4.7544968110364455e-05, + "loss": 6.2248, + "step": 23919 + }, + { + "epoch": 0.142259016081454, + "grad_norm": 1.5778778791427612, + "learning_rate": 4.754476624685782e-05, + "loss": 6.1216, + "step": 23920 + }, + { + "epoch": 0.142264963364735, + "grad_norm": 1.6303963661193848, + "learning_rate": 4.754456437548104e-05, + "loss": 5.9956, + "step": 23921 + }, + { + "epoch": 0.14227091064801597, + "grad_norm": 1.6119714975357056, + "learning_rate": 4.754436249623418e-05, + "loss": 5.4221, + "step": 23922 + }, + { + "epoch": 0.142276857931297, + "grad_norm": 1.9543877840042114, + "learning_rate": 4.754416060911732e-05, + "loss": 5.3631, + "step": 23923 + }, + { + "epoch": 0.14228280521457798, + "grad_norm": 1.90111243724823, + "learning_rate": 4.754395871413052e-05, + "loss": 5.3828, + "step": 23924 + }, + { + "epoch": 0.14228875249785897, + "grad_norm": 1.6575809717178345, + "learning_rate": 4.754375681127386e-05, + "loss": 5.1258, + "step": 23925 + }, + { + "epoch": 0.14229469978113998, + "grad_norm": 1.5518983602523804, + "learning_rate": 4.7543554900547416e-05, + "loss": 5.2144, + "step": 23926 + }, + { + "epoch": 0.14230064706442097, + "grad_norm": 1.604325532913208, + "learning_rate": 4.754335298195124e-05, + "loss": 5.1447, + "step": 23927 + }, + { + "epoch": 0.14230659434770196, + "grad_norm": 1.6287504434585571, + "learning_rate": 4.754315105548542e-05, + "loss": 5.1267, + "step": 23928 + }, + { + "epoch": 0.14231254163098297, + "grad_norm": 1.5111888647079468, + "learning_rate": 4.7542949121150014e-05, + "loss": 5.1122, + "step": 23929 + }, + { + "epoch": 0.14231848891426396, + "grad_norm": 1.4685728549957275, + "learning_rate": 4.75427471789451e-05, + "loss": 5.5366, + "step": 23930 + }, + { + "epoch": 0.14232443619754495, + "grad_norm": 2.1167118549346924, + "learning_rate": 4.754254522887074e-05, + "loss": 5.0426, + "step": 23931 + }, + { + "epoch": 0.14233038348082597, + "grad_norm": 1.7412205934524536, + "learning_rate": 4.754234327092702e-05, + "loss": 5.1454, + "step": 23932 + }, + { + "epoch": 0.14233633076410696, + "grad_norm": 2.290722608566284, + "learning_rate": 4.754214130511399e-05, + "loss": 4.7253, + "step": 23933 + }, + { + "epoch": 0.14234227804738794, + "grad_norm": 2.460817813873291, + "learning_rate": 4.754193933143174e-05, + "loss": 4.762, + "step": 23934 + }, + { + "epoch": 0.14234822533066896, + "grad_norm": 2.2080838680267334, + "learning_rate": 4.754173734988032e-05, + "loss": 4.6405, + "step": 23935 + }, + { + "epoch": 0.14235417261394995, + "grad_norm": 2.475855588912964, + "learning_rate": 4.7541535360459825e-05, + "loss": 4.6213, + "step": 23936 + }, + { + "epoch": 0.14236011989723094, + "grad_norm": 2.1748647689819336, + "learning_rate": 4.754133336317031e-05, + "loss": 4.5461, + "step": 23937 + }, + { + "epoch": 0.14236606718051195, + "grad_norm": 2.1339731216430664, + "learning_rate": 4.754113135801185e-05, + "loss": 4.6366, + "step": 23938 + }, + { + "epoch": 0.14237201446379294, + "grad_norm": 2.142465353012085, + "learning_rate": 4.754092934498451e-05, + "loss": 4.6129, + "step": 23939 + }, + { + "epoch": 0.14237796174707393, + "grad_norm": 2.1925458908081055, + "learning_rate": 4.754072732408836e-05, + "loss": 4.6171, + "step": 23940 + }, + { + "epoch": 0.14238390903035494, + "grad_norm": 2.1470870971679688, + "learning_rate": 4.7540525295323483e-05, + "loss": 4.4577, + "step": 23941 + }, + { + "epoch": 0.14238985631363593, + "grad_norm": 1.7223306894302368, + "learning_rate": 4.754032325868994e-05, + "loss": 5.7355, + "step": 23942 + }, + { + "epoch": 0.14239580359691692, + "grad_norm": 1.8489956855773926, + "learning_rate": 4.7540121214187805e-05, + "loss": 5.9877, + "step": 23943 + }, + { + "epoch": 0.14240175088019794, + "grad_norm": 1.8920329809188843, + "learning_rate": 4.7539919161817134e-05, + "loss": 5.6751, + "step": 23944 + }, + { + "epoch": 0.14240769816347892, + "grad_norm": 1.642392635345459, + "learning_rate": 4.753971710157802e-05, + "loss": 5.3404, + "step": 23945 + }, + { + "epoch": 0.1424136454467599, + "grad_norm": 1.681997537612915, + "learning_rate": 4.753951503347053e-05, + "loss": 5.2964, + "step": 23946 + }, + { + "epoch": 0.14241959273004093, + "grad_norm": 1.767589807510376, + "learning_rate": 4.753931295749472e-05, + "loss": 5.2843, + "step": 23947 + }, + { + "epoch": 0.14242554001332192, + "grad_norm": 1.7100127935409546, + "learning_rate": 4.7539110873650674e-05, + "loss": 5.3869, + "step": 23948 + }, + { + "epoch": 0.1424314872966029, + "grad_norm": 1.5660570859909058, + "learning_rate": 4.7538908781938453e-05, + "loss": 5.3994, + "step": 23949 + }, + { + "epoch": 0.14243743457988392, + "grad_norm": 1.8509501218795776, + "learning_rate": 4.7538706682358124e-05, + "loss": 5.8575, + "step": 23950 + }, + { + "epoch": 0.1424433818631649, + "grad_norm": 1.5773848295211792, + "learning_rate": 4.753850457490978e-05, + "loss": 5.8548, + "step": 23951 + }, + { + "epoch": 0.1424493291464459, + "grad_norm": 1.4020990133285522, + "learning_rate": 4.753830245959347e-05, + "loss": 5.6696, + "step": 23952 + }, + { + "epoch": 0.1424552764297269, + "grad_norm": 1.7756813764572144, + "learning_rate": 4.753810033640928e-05, + "loss": 5.3623, + "step": 23953 + }, + { + "epoch": 0.1424612237130079, + "grad_norm": 1.9046579599380493, + "learning_rate": 4.7537898205357255e-05, + "loss": 5.4078, + "step": 23954 + }, + { + "epoch": 0.1424671709962889, + "grad_norm": 1.6977450847625732, + "learning_rate": 4.753769606643749e-05, + "loss": 5.4418, + "step": 23955 + }, + { + "epoch": 0.1424731182795699, + "grad_norm": 1.6306700706481934, + "learning_rate": 4.753749391965005e-05, + "loss": 5.6299, + "step": 23956 + }, + { + "epoch": 0.1424790655628509, + "grad_norm": 1.8286629915237427, + "learning_rate": 4.7537291764995006e-05, + "loss": 5.7271, + "step": 23957 + }, + { + "epoch": 0.14248501284613188, + "grad_norm": 1.5603896379470825, + "learning_rate": 4.753708960247242e-05, + "loss": 5.645, + "step": 23958 + }, + { + "epoch": 0.1424909601294129, + "grad_norm": 1.6031434535980225, + "learning_rate": 4.7536887432082375e-05, + "loss": 5.6604, + "step": 23959 + }, + { + "epoch": 0.14249690741269389, + "grad_norm": 1.6950321197509766, + "learning_rate": 4.753668525382493e-05, + "loss": 5.7467, + "step": 23960 + }, + { + "epoch": 0.14250285469597487, + "grad_norm": 1.367156744003296, + "learning_rate": 4.753648306770017e-05, + "loss": 5.8554, + "step": 23961 + }, + { + "epoch": 0.1425088019792559, + "grad_norm": 1.6769720315933228, + "learning_rate": 4.753628087370815e-05, + "loss": 5.7408, + "step": 23962 + }, + { + "epoch": 0.14251474926253688, + "grad_norm": 2.3092730045318604, + "learning_rate": 4.753607867184894e-05, + "loss": 4.3284, + "step": 23963 + }, + { + "epoch": 0.14252069654581787, + "grad_norm": 1.8199213743209839, + "learning_rate": 4.753587646212263e-05, + "loss": 4.9928, + "step": 23964 + }, + { + "epoch": 0.14252664382909888, + "grad_norm": 1.5818908214569092, + "learning_rate": 4.753567424452927e-05, + "loss": 5.4382, + "step": 23965 + }, + { + "epoch": 0.14253259111237987, + "grad_norm": 1.6112592220306396, + "learning_rate": 4.753547201906895e-05, + "loss": 5.6344, + "step": 23966 + }, + { + "epoch": 0.14253853839566086, + "grad_norm": 1.530733585357666, + "learning_rate": 4.753526978574172e-05, + "loss": 5.6788, + "step": 23967 + }, + { + "epoch": 0.14254448567894185, + "grad_norm": 1.4186383485794067, + "learning_rate": 4.7535067544547664e-05, + "loss": 5.5129, + "step": 23968 + }, + { + "epoch": 0.14255043296222286, + "grad_norm": 1.3288373947143555, + "learning_rate": 4.753486529548684e-05, + "loss": 5.4413, + "step": 23969 + }, + { + "epoch": 0.14255638024550385, + "grad_norm": 1.3416498899459839, + "learning_rate": 4.7534663038559335e-05, + "loss": 5.6757, + "step": 23970 + }, + { + "epoch": 0.14256232752878484, + "grad_norm": 1.2552043199539185, + "learning_rate": 4.7534460773765215e-05, + "loss": 5.4015, + "step": 23971 + }, + { + "epoch": 0.14256827481206585, + "grad_norm": 1.7393593788146973, + "learning_rate": 4.7534258501104544e-05, + "loss": 5.8824, + "step": 23972 + }, + { + "epoch": 0.14257422209534684, + "grad_norm": 1.5608609914779663, + "learning_rate": 4.75340562205774e-05, + "loss": 5.7623, + "step": 23973 + }, + { + "epoch": 0.14258016937862783, + "grad_norm": 1.484365463256836, + "learning_rate": 4.753385393218384e-05, + "loss": 5.6563, + "step": 23974 + }, + { + "epoch": 0.14258611666190885, + "grad_norm": 1.5432020425796509, + "learning_rate": 4.753365163592395e-05, + "loss": 5.6214, + "step": 23975 + }, + { + "epoch": 0.14259206394518983, + "grad_norm": 1.3963783979415894, + "learning_rate": 4.7533449331797797e-05, + "loss": 5.5315, + "step": 23976 + }, + { + "epoch": 0.14259801122847082, + "grad_norm": 1.778178095817566, + "learning_rate": 4.753324701980545e-05, + "loss": 5.8467, + "step": 23977 + }, + { + "epoch": 0.14260395851175184, + "grad_norm": 1.717940330505371, + "learning_rate": 4.753304469994698e-05, + "loss": 5.6369, + "step": 23978 + }, + { + "epoch": 0.14260990579503283, + "grad_norm": 1.7598493099212646, + "learning_rate": 4.753284237222245e-05, + "loss": 5.2906, + "step": 23979 + }, + { + "epoch": 0.14261585307831381, + "grad_norm": 2.1206471920013428, + "learning_rate": 4.753264003663194e-05, + "loss": 4.5855, + "step": 23980 + }, + { + "epoch": 0.14262180036159483, + "grad_norm": 2.1312971115112305, + "learning_rate": 4.7532437693175525e-05, + "loss": 4.6795, + "step": 23981 + }, + { + "epoch": 0.14262774764487582, + "grad_norm": 2.6566877365112305, + "learning_rate": 4.753223534185326e-05, + "loss": 4.6831, + "step": 23982 + }, + { + "epoch": 0.1426336949281568, + "grad_norm": 2.5692079067230225, + "learning_rate": 4.753203298266523e-05, + "loss": 4.3662, + "step": 23983 + }, + { + "epoch": 0.14263964221143782, + "grad_norm": 2.2617204189300537, + "learning_rate": 4.75318306156115e-05, + "loss": 4.5077, + "step": 23984 + }, + { + "epoch": 0.1426455894947188, + "grad_norm": 2.3445560932159424, + "learning_rate": 4.753162824069214e-05, + "loss": 4.3449, + "step": 23985 + }, + { + "epoch": 0.1426515367779998, + "grad_norm": 2.193120002746582, + "learning_rate": 4.7531425857907216e-05, + "loss": 4.3601, + "step": 23986 + }, + { + "epoch": 0.14265748406128081, + "grad_norm": 2.3515334129333496, + "learning_rate": 4.753122346725681e-05, + "loss": 4.411, + "step": 23987 + }, + { + "epoch": 0.1426634313445618, + "grad_norm": 2.286971092224121, + "learning_rate": 4.7531021068740986e-05, + "loss": 4.4801, + "step": 23988 + }, + { + "epoch": 0.1426693786278428, + "grad_norm": 2.30155873298645, + "learning_rate": 4.7530818662359814e-05, + "loss": 4.4121, + "step": 23989 + }, + { + "epoch": 0.1426753259111238, + "grad_norm": 2.151796340942383, + "learning_rate": 4.7530616248113364e-05, + "loss": 4.4185, + "step": 23990 + }, + { + "epoch": 0.1426812731944048, + "grad_norm": 2.6092782020568848, + "learning_rate": 4.7530413826001706e-05, + "loss": 4.5183, + "step": 23991 + }, + { + "epoch": 0.14268722047768578, + "grad_norm": 2.3881771564483643, + "learning_rate": 4.7530211396024926e-05, + "loss": 4.5246, + "step": 23992 + }, + { + "epoch": 0.1426931677609668, + "grad_norm": 2.921297550201416, + "learning_rate": 4.753000895818307e-05, + "loss": 4.5855, + "step": 23993 + }, + { + "epoch": 0.1426991150442478, + "grad_norm": 2.039461135864258, + "learning_rate": 4.752980651247623e-05, + "loss": 5.3866, + "step": 23994 + }, + { + "epoch": 0.14270506232752878, + "grad_norm": 2.6810874938964844, + "learning_rate": 4.752960405890446e-05, + "loss": 4.3992, + "step": 23995 + }, + { + "epoch": 0.1427110096108098, + "grad_norm": 2.366675615310669, + "learning_rate": 4.752940159746784e-05, + "loss": 4.3981, + "step": 23996 + }, + { + "epoch": 0.14271695689409078, + "grad_norm": 2.446672201156616, + "learning_rate": 4.7529199128166435e-05, + "loss": 4.3428, + "step": 23997 + }, + { + "epoch": 0.14272290417737177, + "grad_norm": 2.686692476272583, + "learning_rate": 4.7528996651000325e-05, + "loss": 4.4006, + "step": 23998 + }, + { + "epoch": 0.14272885146065278, + "grad_norm": 2.577341318130493, + "learning_rate": 4.752879416596957e-05, + "loss": 4.3635, + "step": 23999 + }, + { + "epoch": 0.14273479874393377, + "grad_norm": 2.0183050632476807, + "learning_rate": 4.752859167307425e-05, + "loss": 4.402, + "step": 24000 + }, + { + "epoch": 0.14274074602721476, + "grad_norm": 2.062704563140869, + "learning_rate": 4.7528389172314434e-05, + "loss": 4.3103, + "step": 24001 + }, + { + "epoch": 0.14274669331049578, + "grad_norm": 2.3112356662750244, + "learning_rate": 4.752818666369019e-05, + "loss": 4.5129, + "step": 24002 + }, + { + "epoch": 0.14275264059377676, + "grad_norm": 2.3484156131744385, + "learning_rate": 4.752798414720158e-05, + "loss": 4.2367, + "step": 24003 + }, + { + "epoch": 0.14275858787705775, + "grad_norm": 2.142179250717163, + "learning_rate": 4.752778162284869e-05, + "loss": 4.8016, + "step": 24004 + }, + { + "epoch": 0.14276453516033877, + "grad_norm": 2.076201915740967, + "learning_rate": 4.752757909063158e-05, + "loss": 5.2754, + "step": 24005 + }, + { + "epoch": 0.14277048244361976, + "grad_norm": 1.7873663902282715, + "learning_rate": 4.752737655055033e-05, + "loss": 5.3064, + "step": 24006 + }, + { + "epoch": 0.14277642972690074, + "grad_norm": 1.863776445388794, + "learning_rate": 4.7527174002605e-05, + "loss": 5.045, + "step": 24007 + }, + { + "epoch": 0.14278237701018176, + "grad_norm": 1.9370598793029785, + "learning_rate": 4.752697144679567e-05, + "loss": 5.037, + "step": 24008 + }, + { + "epoch": 0.14278832429346275, + "grad_norm": 1.967492938041687, + "learning_rate": 4.7526768883122405e-05, + "loss": 4.9898, + "step": 24009 + }, + { + "epoch": 0.14279427157674374, + "grad_norm": 1.6309136152267456, + "learning_rate": 4.7526566311585285e-05, + "loss": 5.0752, + "step": 24010 + }, + { + "epoch": 0.14280021886002475, + "grad_norm": 1.6783781051635742, + "learning_rate": 4.7526363732184365e-05, + "loss": 4.7746, + "step": 24011 + }, + { + "epoch": 0.14280616614330574, + "grad_norm": 1.4897167682647705, + "learning_rate": 4.752616114491972e-05, + "loss": 5.1681, + "step": 24012 + }, + { + "epoch": 0.14281211342658673, + "grad_norm": 1.4138036966323853, + "learning_rate": 4.752595854979144e-05, + "loss": 5.351, + "step": 24013 + }, + { + "epoch": 0.14281806070986774, + "grad_norm": 1.4653584957122803, + "learning_rate": 4.7525755946799566e-05, + "loss": 5.1754, + "step": 24014 + }, + { + "epoch": 0.14282400799314873, + "grad_norm": 1.7669284343719482, + "learning_rate": 4.752555333594419e-05, + "loss": 5.2409, + "step": 24015 + }, + { + "epoch": 0.14282995527642972, + "grad_norm": 2.478325366973877, + "learning_rate": 4.752535071722538e-05, + "loss": 5.7027, + "step": 24016 + }, + { + "epoch": 0.14283590255971074, + "grad_norm": 1.3903100490570068, + "learning_rate": 4.75251480906432e-05, + "loss": 5.371, + "step": 24017 + }, + { + "epoch": 0.14284184984299172, + "grad_norm": 1.5938868522644043, + "learning_rate": 4.752494545619772e-05, + "loss": 5.0741, + "step": 24018 + }, + { + "epoch": 0.1428477971262727, + "grad_norm": 1.4633463621139526, + "learning_rate": 4.752474281388901e-05, + "loss": 5.2562, + "step": 24019 + }, + { + "epoch": 0.14285374440955373, + "grad_norm": 1.5575978755950928, + "learning_rate": 4.7524540163717155e-05, + "loss": 5.7142, + "step": 24020 + }, + { + "epoch": 0.14285969169283472, + "grad_norm": 1.857527732849121, + "learning_rate": 4.7524337505682216e-05, + "loss": 5.6595, + "step": 24021 + }, + { + "epoch": 0.1428656389761157, + "grad_norm": 1.6097089052200317, + "learning_rate": 4.752413483978426e-05, + "loss": 5.2562, + "step": 24022 + }, + { + "epoch": 0.14287158625939672, + "grad_norm": 1.8765082359313965, + "learning_rate": 4.752393216602335e-05, + "loss": 4.511, + "step": 24023 + }, + { + "epoch": 0.1428775335426777, + "grad_norm": 1.5626455545425415, + "learning_rate": 4.752372948439959e-05, + "loss": 4.8816, + "step": 24024 + }, + { + "epoch": 0.1428834808259587, + "grad_norm": 1.4234426021575928, + "learning_rate": 4.7523526794913015e-05, + "loss": 5.1271, + "step": 24025 + }, + { + "epoch": 0.14288942810923969, + "grad_norm": 1.4709553718566895, + "learning_rate": 4.7523324097563706e-05, + "loss": 5.2034, + "step": 24026 + }, + { + "epoch": 0.1428953753925207, + "grad_norm": 1.7568445205688477, + "learning_rate": 4.752312139235175e-05, + "loss": 4.7914, + "step": 24027 + }, + { + "epoch": 0.1429013226758017, + "grad_norm": 1.711824893951416, + "learning_rate": 4.752291867927719e-05, + "loss": 4.6601, + "step": 24028 + }, + { + "epoch": 0.14290726995908268, + "grad_norm": 1.6301651000976562, + "learning_rate": 4.752271595834012e-05, + "loss": 4.9326, + "step": 24029 + }, + { + "epoch": 0.1429132172423637, + "grad_norm": 1.5549229383468628, + "learning_rate": 4.752251322954061e-05, + "loss": 5.1706, + "step": 24030 + }, + { + "epoch": 0.14291916452564468, + "grad_norm": 1.5638782978057861, + "learning_rate": 4.752231049287871e-05, + "loss": 4.9079, + "step": 24031 + }, + { + "epoch": 0.14292511180892567, + "grad_norm": 1.6099932193756104, + "learning_rate": 4.752210774835451e-05, + "loss": 4.7565, + "step": 24032 + }, + { + "epoch": 0.14293105909220669, + "grad_norm": 1.5388545989990234, + "learning_rate": 4.752190499596808e-05, + "loss": 4.792, + "step": 24033 + }, + { + "epoch": 0.14293700637548767, + "grad_norm": 1.4083584547042847, + "learning_rate": 4.752170223571948e-05, + "loss": 4.8608, + "step": 24034 + }, + { + "epoch": 0.14294295365876866, + "grad_norm": 1.5718214511871338, + "learning_rate": 4.752149946760879e-05, + "loss": 4.7874, + "step": 24035 + }, + { + "epoch": 0.14294890094204968, + "grad_norm": 1.5951184034347534, + "learning_rate": 4.752129669163607e-05, + "loss": 4.7581, + "step": 24036 + }, + { + "epoch": 0.14295484822533067, + "grad_norm": 1.5525321960449219, + "learning_rate": 4.7521093907801404e-05, + "loss": 4.5684, + "step": 24037 + }, + { + "epoch": 0.14296079550861165, + "grad_norm": 1.6149049997329712, + "learning_rate": 4.7520891116104856e-05, + "loss": 4.4343, + "step": 24038 + }, + { + "epoch": 0.14296674279189267, + "grad_norm": 1.624150037765503, + "learning_rate": 4.752068831654649e-05, + "loss": 4.4697, + "step": 24039 + }, + { + "epoch": 0.14297269007517366, + "grad_norm": 1.3906975984573364, + "learning_rate": 4.75204855091264e-05, + "loss": 4.4062, + "step": 24040 + }, + { + "epoch": 0.14297863735845465, + "grad_norm": 1.6626862287521362, + "learning_rate": 4.7520282693844623e-05, + "loss": 4.9593, + "step": 24041 + }, + { + "epoch": 0.14298458464173566, + "grad_norm": 1.8431484699249268, + "learning_rate": 4.752007987070126e-05, + "loss": 5.3581, + "step": 24042 + }, + { + "epoch": 0.14299053192501665, + "grad_norm": 1.7550246715545654, + "learning_rate": 4.751987703969637e-05, + "loss": 5.3909, + "step": 24043 + }, + { + "epoch": 0.14299647920829764, + "grad_norm": 1.6016278266906738, + "learning_rate": 4.7519674200830015e-05, + "loss": 5.1732, + "step": 24044 + }, + { + "epoch": 0.14300242649157865, + "grad_norm": 1.4594265222549438, + "learning_rate": 4.7519471354102285e-05, + "loss": 5.0859, + "step": 24045 + }, + { + "epoch": 0.14300837377485964, + "grad_norm": 1.7040293216705322, + "learning_rate": 4.751926849951323e-05, + "loss": 5.1476, + "step": 24046 + }, + { + "epoch": 0.14301432105814063, + "grad_norm": 1.4739158153533936, + "learning_rate": 4.7519065637062934e-05, + "loss": 5.3691, + "step": 24047 + }, + { + "epoch": 0.14302026834142165, + "grad_norm": 1.5245054960250854, + "learning_rate": 4.751886276675147e-05, + "loss": 5.4395, + "step": 24048 + }, + { + "epoch": 0.14302621562470264, + "grad_norm": 1.678786039352417, + "learning_rate": 4.75186598885789e-05, + "loss": 4.826, + "step": 24049 + }, + { + "epoch": 0.14303216290798362, + "grad_norm": 1.9114538431167603, + "learning_rate": 4.7518457002545305e-05, + "loss": 5.1483, + "step": 24050 + }, + { + "epoch": 0.14303811019126464, + "grad_norm": 1.5139118432998657, + "learning_rate": 4.751825410865074e-05, + "loss": 5.1349, + "step": 24051 + }, + { + "epoch": 0.14304405747454563, + "grad_norm": 1.4199074506759644, + "learning_rate": 4.7518051206895286e-05, + "loss": 5.0579, + "step": 24052 + }, + { + "epoch": 0.14305000475782662, + "grad_norm": 1.570027470588684, + "learning_rate": 4.751784829727902e-05, + "loss": 4.9915, + "step": 24053 + }, + { + "epoch": 0.14305595204110763, + "grad_norm": 1.476340651512146, + "learning_rate": 4.7517645379802e-05, + "loss": 5.4808, + "step": 24054 + }, + { + "epoch": 0.14306189932438862, + "grad_norm": 1.7526558637619019, + "learning_rate": 4.75174424544643e-05, + "loss": 5.3816, + "step": 24055 + }, + { + "epoch": 0.1430678466076696, + "grad_norm": 1.846692681312561, + "learning_rate": 4.7517239521266e-05, + "loss": 5.6713, + "step": 24056 + }, + { + "epoch": 0.14307379389095062, + "grad_norm": 1.5340349674224854, + "learning_rate": 4.751703658020716e-05, + "loss": 5.6456, + "step": 24057 + }, + { + "epoch": 0.1430797411742316, + "grad_norm": 1.6693123579025269, + "learning_rate": 4.751683363128786e-05, + "loss": 5.5229, + "step": 24058 + }, + { + "epoch": 0.1430856884575126, + "grad_norm": 1.7673590183258057, + "learning_rate": 4.751663067450816e-05, + "loss": 4.9188, + "step": 24059 + }, + { + "epoch": 0.14309163574079362, + "grad_norm": 1.8243883848190308, + "learning_rate": 4.751642770986814e-05, + "loss": 4.5658, + "step": 24060 + }, + { + "epoch": 0.1430975830240746, + "grad_norm": 2.394139051437378, + "learning_rate": 4.7516224737367866e-05, + "loss": 4.101, + "step": 24061 + }, + { + "epoch": 0.1431035303073556, + "grad_norm": 2.0918843746185303, + "learning_rate": 4.7516021757007414e-05, + "loss": 4.03, + "step": 24062 + }, + { + "epoch": 0.1431094775906366, + "grad_norm": 2.129743814468384, + "learning_rate": 4.751581876878685e-05, + "loss": 4.1339, + "step": 24063 + }, + { + "epoch": 0.1431154248739176, + "grad_norm": 2.1546170711517334, + "learning_rate": 4.751561577270624e-05, + "loss": 4.4471, + "step": 24064 + }, + { + "epoch": 0.14312137215719858, + "grad_norm": 1.9738941192626953, + "learning_rate": 4.751541276876567e-05, + "loss": 5.8276, + "step": 24065 + }, + { + "epoch": 0.1431273194404796, + "grad_norm": 1.9925949573516846, + "learning_rate": 4.7515209756965196e-05, + "loss": 5.2116, + "step": 24066 + }, + { + "epoch": 0.1431332667237606, + "grad_norm": 1.761315941810608, + "learning_rate": 4.75150067373049e-05, + "loss": 5.0048, + "step": 24067 + }, + { + "epoch": 0.14313921400704158, + "grad_norm": 1.7744289636611938, + "learning_rate": 4.751480370978485e-05, + "loss": 5.2451, + "step": 24068 + }, + { + "epoch": 0.1431451612903226, + "grad_norm": 1.4490324258804321, + "learning_rate": 4.7514600674405106e-05, + "loss": 5.704, + "step": 24069 + }, + { + "epoch": 0.14315110857360358, + "grad_norm": 1.4389432668685913, + "learning_rate": 4.751439763116575e-05, + "loss": 5.6274, + "step": 24070 + }, + { + "epoch": 0.14315705585688457, + "grad_norm": 2.0219969749450684, + "learning_rate": 4.751419458006685e-05, + "loss": 4.2387, + "step": 24071 + }, + { + "epoch": 0.14316300314016558, + "grad_norm": 1.6722300052642822, + "learning_rate": 4.751399152110848e-05, + "loss": 4.7426, + "step": 24072 + }, + { + "epoch": 0.14316895042344657, + "grad_norm": 1.461065411567688, + "learning_rate": 4.751378845429071e-05, + "loss": 5.4895, + "step": 24073 + }, + { + "epoch": 0.14317489770672756, + "grad_norm": 1.3877815008163452, + "learning_rate": 4.75135853796136e-05, + "loss": 5.6264, + "step": 24074 + }, + { + "epoch": 0.14318084499000858, + "grad_norm": 1.3981953859329224, + "learning_rate": 4.751338229707724e-05, + "loss": 5.4467, + "step": 24075 + }, + { + "epoch": 0.14318679227328956, + "grad_norm": 1.3032608032226562, + "learning_rate": 4.751317920668169e-05, + "loss": 5.5902, + "step": 24076 + }, + { + "epoch": 0.14319273955657055, + "grad_norm": 1.477534532546997, + "learning_rate": 4.751297610842701e-05, + "loss": 5.6286, + "step": 24077 + }, + { + "epoch": 0.14319868683985157, + "grad_norm": 1.5056313276290894, + "learning_rate": 4.75127730023133e-05, + "loss": 5.5233, + "step": 24078 + }, + { + "epoch": 0.14320463412313256, + "grad_norm": 1.6936917304992676, + "learning_rate": 4.75125698883406e-05, + "loss": 4.9877, + "step": 24079 + }, + { + "epoch": 0.14321058140641355, + "grad_norm": 1.5967860221862793, + "learning_rate": 4.7512366766509004e-05, + "loss": 5.1782, + "step": 24080 + }, + { + "epoch": 0.14321652868969456, + "grad_norm": 1.4995664358139038, + "learning_rate": 4.751216363681857e-05, + "loss": 5.3016, + "step": 24081 + }, + { + "epoch": 0.14322247597297555, + "grad_norm": 1.6829060316085815, + "learning_rate": 4.751196049926937e-05, + "loss": 5.228, + "step": 24082 + }, + { + "epoch": 0.14322842325625654, + "grad_norm": 2.151371955871582, + "learning_rate": 4.7511757353861475e-05, + "loss": 5.1807, + "step": 24083 + }, + { + "epoch": 0.14323437053953753, + "grad_norm": 2.1892330646514893, + "learning_rate": 4.751155420059497e-05, + "loss": 5.3542, + "step": 24084 + }, + { + "epoch": 0.14324031782281854, + "grad_norm": 2.0016772747039795, + "learning_rate": 4.75113510394699e-05, + "loss": 4.9516, + "step": 24085 + }, + { + "epoch": 0.14324626510609953, + "grad_norm": 1.8935182094573975, + "learning_rate": 4.751114787048635e-05, + "loss": 5.0342, + "step": 24086 + }, + { + "epoch": 0.14325221238938052, + "grad_norm": 2.004809617996216, + "learning_rate": 4.75109446936444e-05, + "loss": 4.2826, + "step": 24087 + }, + { + "epoch": 0.14325815967266153, + "grad_norm": 1.8340208530426025, + "learning_rate": 4.7510741508944115e-05, + "loss": 4.9323, + "step": 24088 + }, + { + "epoch": 0.14326410695594252, + "grad_norm": 1.769805908203125, + "learning_rate": 4.7510538316385545e-05, + "loss": 5.3595, + "step": 24089 + }, + { + "epoch": 0.1432700542392235, + "grad_norm": 1.5973625183105469, + "learning_rate": 4.75103351159688e-05, + "loss": 5.6195, + "step": 24090 + }, + { + "epoch": 0.14327600152250453, + "grad_norm": 1.5248761177062988, + "learning_rate": 4.751013190769391e-05, + "loss": 5.3578, + "step": 24091 + }, + { + "epoch": 0.14328194880578551, + "grad_norm": 1.5317707061767578, + "learning_rate": 4.750992869156098e-05, + "loss": 5.2791, + "step": 24092 + }, + { + "epoch": 0.1432878960890665, + "grad_norm": 1.9778176546096802, + "learning_rate": 4.750972546757005e-05, + "loss": 5.1077, + "step": 24093 + }, + { + "epoch": 0.14329384337234752, + "grad_norm": 1.7787549495697021, + "learning_rate": 4.750952223572123e-05, + "loss": 5.1073, + "step": 24094 + }, + { + "epoch": 0.1432997906556285, + "grad_norm": 1.6317193508148193, + "learning_rate": 4.750931899601455e-05, + "loss": 5.3686, + "step": 24095 + }, + { + "epoch": 0.1433057379389095, + "grad_norm": 1.7646535634994507, + "learning_rate": 4.7509115748450106e-05, + "loss": 5.4542, + "step": 24096 + }, + { + "epoch": 0.1433116852221905, + "grad_norm": 1.679877519607544, + "learning_rate": 4.750891249302796e-05, + "loss": 5.7126, + "step": 24097 + }, + { + "epoch": 0.1433176325054715, + "grad_norm": 1.3325512409210205, + "learning_rate": 4.750870922974819e-05, + "loss": 5.512, + "step": 24098 + }, + { + "epoch": 0.1433235797887525, + "grad_norm": 1.443447470664978, + "learning_rate": 4.750850595861086e-05, + "loss": 5.4712, + "step": 24099 + }, + { + "epoch": 0.1433295270720335, + "grad_norm": 1.5300956964492798, + "learning_rate": 4.7508302679616044e-05, + "loss": 5.2247, + "step": 24100 + }, + { + "epoch": 0.1433354743553145, + "grad_norm": 1.4438292980194092, + "learning_rate": 4.750809939276381e-05, + "loss": 5.3292, + "step": 24101 + }, + { + "epoch": 0.14334142163859548, + "grad_norm": 1.5861626863479614, + "learning_rate": 4.750789609805423e-05, + "loss": 5.1881, + "step": 24102 + }, + { + "epoch": 0.1433473689218765, + "grad_norm": 1.4352222681045532, + "learning_rate": 4.750769279548738e-05, + "loss": 5.3461, + "step": 24103 + }, + { + "epoch": 0.14335331620515748, + "grad_norm": 1.4064099788665771, + "learning_rate": 4.750748948506332e-05, + "loss": 5.1699, + "step": 24104 + }, + { + "epoch": 0.14335926348843847, + "grad_norm": 1.2421483993530273, + "learning_rate": 4.7507286166782136e-05, + "loss": 5.3811, + "step": 24105 + }, + { + "epoch": 0.1433652107717195, + "grad_norm": 1.430109977722168, + "learning_rate": 4.750708284064389e-05, + "loss": 5.3169, + "step": 24106 + }, + { + "epoch": 0.14337115805500047, + "grad_norm": 1.4107475280761719, + "learning_rate": 4.750687950664865e-05, + "loss": 5.1744, + "step": 24107 + }, + { + "epoch": 0.14337710533828146, + "grad_norm": 1.4888633489608765, + "learning_rate": 4.750667616479649e-05, + "loss": 5.0892, + "step": 24108 + }, + { + "epoch": 0.14338305262156248, + "grad_norm": 1.5325970649719238, + "learning_rate": 4.7506472815087486e-05, + "loss": 4.8421, + "step": 24109 + }, + { + "epoch": 0.14338899990484347, + "grad_norm": 1.806287407875061, + "learning_rate": 4.75062694575217e-05, + "loss": 5.459, + "step": 24110 + }, + { + "epoch": 0.14339494718812446, + "grad_norm": 1.8281558752059937, + "learning_rate": 4.750606609209921e-05, + "loss": 4.7275, + "step": 24111 + }, + { + "epoch": 0.14340089447140547, + "grad_norm": 1.3527547121047974, + "learning_rate": 4.750586271882009e-05, + "loss": 5.4797, + "step": 24112 + }, + { + "epoch": 0.14340684175468646, + "grad_norm": 1.719956874847412, + "learning_rate": 4.75056593376844e-05, + "loss": 5.1069, + "step": 24113 + }, + { + "epoch": 0.14341278903796745, + "grad_norm": 1.484231948852539, + "learning_rate": 4.750545594869222e-05, + "loss": 5.2246, + "step": 24114 + }, + { + "epoch": 0.14341873632124846, + "grad_norm": 1.7525322437286377, + "learning_rate": 4.7505252551843615e-05, + "loss": 5.2036, + "step": 24115 + }, + { + "epoch": 0.14342468360452945, + "grad_norm": 1.6943596601486206, + "learning_rate": 4.7505049147138656e-05, + "loss": 5.6783, + "step": 24116 + }, + { + "epoch": 0.14343063088781044, + "grad_norm": 1.619377851486206, + "learning_rate": 4.750484573457743e-05, + "loss": 5.4861, + "step": 24117 + }, + { + "epoch": 0.14343657817109146, + "grad_norm": 1.9882891178131104, + "learning_rate": 4.750464231415998e-05, + "loss": 5.1085, + "step": 24118 + }, + { + "epoch": 0.14344252545437244, + "grad_norm": 1.4033042192459106, + "learning_rate": 4.75044388858864e-05, + "loss": 5.2776, + "step": 24119 + }, + { + "epoch": 0.14344847273765343, + "grad_norm": 1.2633885145187378, + "learning_rate": 4.750423544975675e-05, + "loss": 5.3406, + "step": 24120 + }, + { + "epoch": 0.14345442002093445, + "grad_norm": 1.4787468910217285, + "learning_rate": 4.7504032005771105e-05, + "loss": 5.5417, + "step": 24121 + }, + { + "epoch": 0.14346036730421544, + "grad_norm": 1.6677738428115845, + "learning_rate": 4.750382855392953e-05, + "loss": 5.39, + "step": 24122 + }, + { + "epoch": 0.14346631458749642, + "grad_norm": 1.6277536153793335, + "learning_rate": 4.750362509423211e-05, + "loss": 5.443, + "step": 24123 + }, + { + "epoch": 0.14347226187077744, + "grad_norm": 1.7157353162765503, + "learning_rate": 4.75034216266789e-05, + "loss": 5.6696, + "step": 24124 + }, + { + "epoch": 0.14347820915405843, + "grad_norm": 1.6321076154708862, + "learning_rate": 4.750321815126998e-05, + "loss": 5.4125, + "step": 24125 + }, + { + "epoch": 0.14348415643733942, + "grad_norm": 1.3769804239273071, + "learning_rate": 4.750301466800542e-05, + "loss": 5.5333, + "step": 24126 + }, + { + "epoch": 0.14349010372062043, + "grad_norm": 1.6320770978927612, + "learning_rate": 4.7502811176885286e-05, + "loss": 5.062, + "step": 24127 + }, + { + "epoch": 0.14349605100390142, + "grad_norm": 1.8570098876953125, + "learning_rate": 4.750260767790966e-05, + "loss": 4.8349, + "step": 24128 + }, + { + "epoch": 0.1435019982871824, + "grad_norm": 1.6399726867675781, + "learning_rate": 4.7502404171078604e-05, + "loss": 5.0899, + "step": 24129 + }, + { + "epoch": 0.14350794557046342, + "grad_norm": 1.6327539682388306, + "learning_rate": 4.7502200656392184e-05, + "loss": 5.4722, + "step": 24130 + }, + { + "epoch": 0.1435138928537444, + "grad_norm": 1.887136697769165, + "learning_rate": 4.750199713385048e-05, + "loss": 5.2569, + "step": 24131 + }, + { + "epoch": 0.1435198401370254, + "grad_norm": 1.8090238571166992, + "learning_rate": 4.750179360345357e-05, + "loss": 5.252, + "step": 24132 + }, + { + "epoch": 0.14352578742030642, + "grad_norm": 1.7913198471069336, + "learning_rate": 4.750159006520152e-05, + "loss": 5.2661, + "step": 24133 + }, + { + "epoch": 0.1435317347035874, + "grad_norm": 2.239309310913086, + "learning_rate": 4.7501386519094385e-05, + "loss": 5.1478, + "step": 24134 + }, + { + "epoch": 0.1435376819868684, + "grad_norm": 2.179140090942383, + "learning_rate": 4.750118296513225e-05, + "loss": 4.9088, + "step": 24135 + }, + { + "epoch": 0.1435436292701494, + "grad_norm": 1.629287838935852, + "learning_rate": 4.7500979403315186e-05, + "loss": 5.0642, + "step": 24136 + }, + { + "epoch": 0.1435495765534304, + "grad_norm": 1.598783254623413, + "learning_rate": 4.750077583364326e-05, + "loss": 5.7616, + "step": 24137 + }, + { + "epoch": 0.14355552383671139, + "grad_norm": 1.792859435081482, + "learning_rate": 4.750057225611656e-05, + "loss": 6.1022, + "step": 24138 + }, + { + "epoch": 0.1435614711199924, + "grad_norm": 1.728210687637329, + "learning_rate": 4.750036867073513e-05, + "loss": 5.904, + "step": 24139 + }, + { + "epoch": 0.1435674184032734, + "grad_norm": 1.9541816711425781, + "learning_rate": 4.7500165077499056e-05, + "loss": 5.3199, + "step": 24140 + }, + { + "epoch": 0.14357336568655438, + "grad_norm": 1.6042431592941284, + "learning_rate": 4.7499961476408405e-05, + "loss": 5.5277, + "step": 24141 + }, + { + "epoch": 0.14357931296983537, + "grad_norm": 1.50521719455719, + "learning_rate": 4.749975786746325e-05, + "loss": 5.4995, + "step": 24142 + }, + { + "epoch": 0.14358526025311638, + "grad_norm": 1.2425066232681274, + "learning_rate": 4.749955425066366e-05, + "loss": 5.6135, + "step": 24143 + }, + { + "epoch": 0.14359120753639737, + "grad_norm": 1.3020912408828735, + "learning_rate": 4.749935062600971e-05, + "loss": 5.5885, + "step": 24144 + }, + { + "epoch": 0.14359715481967836, + "grad_norm": 1.8732852935791016, + "learning_rate": 4.749914699350148e-05, + "loss": 5.3004, + "step": 24145 + }, + { + "epoch": 0.14360310210295937, + "grad_norm": 1.5296770334243774, + "learning_rate": 4.749894335313901e-05, + "loss": 5.5932, + "step": 24146 + }, + { + "epoch": 0.14360904938624036, + "grad_norm": 1.6563706398010254, + "learning_rate": 4.749873970492241e-05, + "loss": 5.4436, + "step": 24147 + }, + { + "epoch": 0.14361499666952135, + "grad_norm": 1.5168625116348267, + "learning_rate": 4.749853604885172e-05, + "loss": 5.5198, + "step": 24148 + }, + { + "epoch": 0.14362094395280237, + "grad_norm": 1.8161656856536865, + "learning_rate": 4.749833238492703e-05, + "loss": 5.3261, + "step": 24149 + }, + { + "epoch": 0.14362689123608335, + "grad_norm": 1.6286919116973877, + "learning_rate": 4.749812871314841e-05, + "loss": 5.3505, + "step": 24150 + }, + { + "epoch": 0.14363283851936434, + "grad_norm": 1.6236040592193604, + "learning_rate": 4.749792503351591e-05, + "loss": 5.4271, + "step": 24151 + }, + { + "epoch": 0.14363878580264536, + "grad_norm": 1.8177775144577026, + "learning_rate": 4.749772134602963e-05, + "loss": 5.2076, + "step": 24152 + }, + { + "epoch": 0.14364473308592635, + "grad_norm": 1.8818564414978027, + "learning_rate": 4.7497517650689616e-05, + "loss": 5.2685, + "step": 24153 + }, + { + "epoch": 0.14365068036920733, + "grad_norm": 1.7166740894317627, + "learning_rate": 4.749731394749596e-05, + "loss": 5.0742, + "step": 24154 + }, + { + "epoch": 0.14365662765248835, + "grad_norm": 1.6446893215179443, + "learning_rate": 4.749711023644873e-05, + "loss": 5.0406, + "step": 24155 + }, + { + "epoch": 0.14366257493576934, + "grad_norm": 1.5812546014785767, + "learning_rate": 4.749690651754798e-05, + "loss": 5.1155, + "step": 24156 + }, + { + "epoch": 0.14366852221905033, + "grad_norm": 1.8002673387527466, + "learning_rate": 4.749670279079379e-05, + "loss": 4.8509, + "step": 24157 + }, + { + "epoch": 0.14367446950233134, + "grad_norm": 1.6835267543792725, + "learning_rate": 4.749649905618624e-05, + "loss": 4.8694, + "step": 24158 + }, + { + "epoch": 0.14368041678561233, + "grad_norm": 1.605454683303833, + "learning_rate": 4.74962953137254e-05, + "loss": 4.926, + "step": 24159 + }, + { + "epoch": 0.14368636406889332, + "grad_norm": 1.6154637336730957, + "learning_rate": 4.749609156341133e-05, + "loss": 5.0548, + "step": 24160 + }, + { + "epoch": 0.14369231135217433, + "grad_norm": 1.7472615242004395, + "learning_rate": 4.74958878052441e-05, + "loss": 5.2218, + "step": 24161 + }, + { + "epoch": 0.14369825863545532, + "grad_norm": 1.80000901222229, + "learning_rate": 4.7495684039223795e-05, + "loss": 5.5268, + "step": 24162 + }, + { + "epoch": 0.1437042059187363, + "grad_norm": 1.6673831939697266, + "learning_rate": 4.749548026535048e-05, + "loss": 4.9823, + "step": 24163 + }, + { + "epoch": 0.14371015320201733, + "grad_norm": 1.5900602340698242, + "learning_rate": 4.749527648362422e-05, + "loss": 4.9122, + "step": 24164 + }, + { + "epoch": 0.14371610048529831, + "grad_norm": 1.538674235343933, + "learning_rate": 4.74950726940451e-05, + "loss": 4.887, + "step": 24165 + }, + { + "epoch": 0.1437220477685793, + "grad_norm": 1.5512803792953491, + "learning_rate": 4.749486889661318e-05, + "loss": 5.106, + "step": 24166 + }, + { + "epoch": 0.14372799505186032, + "grad_norm": 1.6589990854263306, + "learning_rate": 4.7494665091328524e-05, + "loss": 5.1019, + "step": 24167 + }, + { + "epoch": 0.1437339423351413, + "grad_norm": 1.3078352212905884, + "learning_rate": 4.7494461278191225e-05, + "loss": 5.5803, + "step": 24168 + }, + { + "epoch": 0.1437398896184223, + "grad_norm": 1.2839313745498657, + "learning_rate": 4.7494257457201333e-05, + "loss": 5.2538, + "step": 24169 + }, + { + "epoch": 0.1437458369017033, + "grad_norm": 1.6686280965805054, + "learning_rate": 4.749405362835894e-05, + "loss": 4.6737, + "step": 24170 + }, + { + "epoch": 0.1437517841849843, + "grad_norm": 1.6385589838027954, + "learning_rate": 4.7493849791664094e-05, + "loss": 5.3224, + "step": 24171 + }, + { + "epoch": 0.1437577314682653, + "grad_norm": 1.5661671161651611, + "learning_rate": 4.749364594711688e-05, + "loss": 5.4675, + "step": 24172 + }, + { + "epoch": 0.1437636787515463, + "grad_norm": 1.481903314590454, + "learning_rate": 4.749344209471737e-05, + "loss": 5.6801, + "step": 24173 + }, + { + "epoch": 0.1437696260348273, + "grad_norm": 1.6317354440689087, + "learning_rate": 4.749323823446562e-05, + "loss": 5.2531, + "step": 24174 + }, + { + "epoch": 0.14377557331810828, + "grad_norm": 1.7542403936386108, + "learning_rate": 4.749303436636173e-05, + "loss": 4.9242, + "step": 24175 + }, + { + "epoch": 0.1437815206013893, + "grad_norm": 1.7798454761505127, + "learning_rate": 4.7492830490405746e-05, + "loss": 5.0939, + "step": 24176 + }, + { + "epoch": 0.14378746788467028, + "grad_norm": 1.3787469863891602, + "learning_rate": 4.7492626606597744e-05, + "loss": 5.2257, + "step": 24177 + }, + { + "epoch": 0.14379341516795127, + "grad_norm": 1.7178335189819336, + "learning_rate": 4.7492422714937806e-05, + "loss": 4.5083, + "step": 24178 + }, + { + "epoch": 0.1437993624512323, + "grad_norm": 1.559964656829834, + "learning_rate": 4.7492218815425996e-05, + "loss": 5.3788, + "step": 24179 + }, + { + "epoch": 0.14380530973451328, + "grad_norm": 3.269479990005493, + "learning_rate": 4.749201490806238e-05, + "loss": 4.0238, + "step": 24180 + }, + { + "epoch": 0.14381125701779426, + "grad_norm": 1.696169137954712, + "learning_rate": 4.749181099284703e-05, + "loss": 5.7992, + "step": 24181 + }, + { + "epoch": 0.14381720430107528, + "grad_norm": 1.563265085220337, + "learning_rate": 4.749160706978003e-05, + "loss": 5.2459, + "step": 24182 + }, + { + "epoch": 0.14382315158435627, + "grad_norm": 1.6364827156066895, + "learning_rate": 4.7491403138861435e-05, + "loss": 5.2826, + "step": 24183 + }, + { + "epoch": 0.14382909886763726, + "grad_norm": 1.82567298412323, + "learning_rate": 4.749119920009132e-05, + "loss": 4.8079, + "step": 24184 + }, + { + "epoch": 0.14383504615091827, + "grad_norm": 1.3982584476470947, + "learning_rate": 4.7490995253469774e-05, + "loss": 5.4093, + "step": 24185 + }, + { + "epoch": 0.14384099343419926, + "grad_norm": 1.349155068397522, + "learning_rate": 4.749079129899684e-05, + "loss": 5.3707, + "step": 24186 + }, + { + "epoch": 0.14384694071748025, + "grad_norm": 1.4101881980895996, + "learning_rate": 4.749058733667261e-05, + "loss": 4.9554, + "step": 24187 + }, + { + "epoch": 0.14385288800076126, + "grad_norm": 1.1910806894302368, + "learning_rate": 4.749038336649715e-05, + "loss": 5.0658, + "step": 24188 + }, + { + "epoch": 0.14385883528404225, + "grad_norm": 1.5315760374069214, + "learning_rate": 4.749017938847052e-05, + "loss": 5.4716, + "step": 24189 + }, + { + "epoch": 0.14386478256732324, + "grad_norm": 1.1762129068374634, + "learning_rate": 4.7489975402592814e-05, + "loss": 5.6235, + "step": 24190 + }, + { + "epoch": 0.14387072985060426, + "grad_norm": 1.2317709922790527, + "learning_rate": 4.748977140886408e-05, + "loss": 5.8842, + "step": 24191 + }, + { + "epoch": 0.14387667713388524, + "grad_norm": 1.439610481262207, + "learning_rate": 4.7489567407284405e-05, + "loss": 5.4157, + "step": 24192 + }, + { + "epoch": 0.14388262441716623, + "grad_norm": 1.842933177947998, + "learning_rate": 4.7489363397853854e-05, + "loss": 5.1555, + "step": 24193 + }, + { + "epoch": 0.14388857170044725, + "grad_norm": 1.887911081314087, + "learning_rate": 4.748915938057249e-05, + "loss": 5.5591, + "step": 24194 + }, + { + "epoch": 0.14389451898372824, + "grad_norm": 1.7697376012802124, + "learning_rate": 4.7488955355440404e-05, + "loss": 5.5617, + "step": 24195 + }, + { + "epoch": 0.14390046626700922, + "grad_norm": 1.5946240425109863, + "learning_rate": 4.7488751322457655e-05, + "loss": 5.3901, + "step": 24196 + }, + { + "epoch": 0.14390641355029024, + "grad_norm": 1.7462904453277588, + "learning_rate": 4.7488547281624306e-05, + "loss": 5.3187, + "step": 24197 + }, + { + "epoch": 0.14391236083357123, + "grad_norm": 1.7388325929641724, + "learning_rate": 4.7488343232940445e-05, + "loss": 5.0042, + "step": 24198 + }, + { + "epoch": 0.14391830811685222, + "grad_norm": 1.5990902185440063, + "learning_rate": 4.7488139176406135e-05, + "loss": 5.1336, + "step": 24199 + }, + { + "epoch": 0.1439242554001332, + "grad_norm": 1.7063771486282349, + "learning_rate": 4.748793511202145e-05, + "loss": 5.6073, + "step": 24200 + }, + { + "epoch": 0.14393020268341422, + "grad_norm": 1.5042674541473389, + "learning_rate": 4.748773103978645e-05, + "loss": 5.6617, + "step": 24201 + }, + { + "epoch": 0.1439361499666952, + "grad_norm": 1.4366991519927979, + "learning_rate": 4.7487526959701225e-05, + "loss": 5.3679, + "step": 24202 + }, + { + "epoch": 0.1439420972499762, + "grad_norm": 1.571524977684021, + "learning_rate": 4.748732287176584e-05, + "loss": 5.5487, + "step": 24203 + }, + { + "epoch": 0.1439480445332572, + "grad_norm": 1.3584872484207153, + "learning_rate": 4.748711877598036e-05, + "loss": 5.3332, + "step": 24204 + }, + { + "epoch": 0.1439539918165382, + "grad_norm": 1.4718894958496094, + "learning_rate": 4.748691467234484e-05, + "loss": 5.3985, + "step": 24205 + }, + { + "epoch": 0.1439599390998192, + "grad_norm": 1.5978455543518066, + "learning_rate": 4.748671056085939e-05, + "loss": 5.6351, + "step": 24206 + }, + { + "epoch": 0.1439658863831002, + "grad_norm": 2.2037017345428467, + "learning_rate": 4.748650644152406e-05, + "loss": 4.9972, + "step": 24207 + }, + { + "epoch": 0.1439718336663812, + "grad_norm": 1.7493484020233154, + "learning_rate": 4.748630231433891e-05, + "loss": 4.8863, + "step": 24208 + }, + { + "epoch": 0.14397778094966218, + "grad_norm": 1.7967579364776611, + "learning_rate": 4.748609817930405e-05, + "loss": 5.5271, + "step": 24209 + }, + { + "epoch": 0.1439837282329432, + "grad_norm": 1.3049358129501343, + "learning_rate": 4.7485894036419505e-05, + "loss": 5.631, + "step": 24210 + }, + { + "epoch": 0.14398967551622419, + "grad_norm": 2.1333138942718506, + "learning_rate": 4.7485689885685366e-05, + "loss": 4.3777, + "step": 24211 + }, + { + "epoch": 0.14399562279950517, + "grad_norm": 1.7402033805847168, + "learning_rate": 4.748548572710172e-05, + "loss": 5.0069, + "step": 24212 + }, + { + "epoch": 0.1440015700827862, + "grad_norm": 1.5663232803344727, + "learning_rate": 4.748528156066861e-05, + "loss": 5.8514, + "step": 24213 + }, + { + "epoch": 0.14400751736606718, + "grad_norm": 1.5079457759857178, + "learning_rate": 4.748507738638612e-05, + "loss": 5.771, + "step": 24214 + }, + { + "epoch": 0.14401346464934817, + "grad_norm": 1.407939076423645, + "learning_rate": 4.7484873204254334e-05, + "loss": 5.405, + "step": 24215 + }, + { + "epoch": 0.14401941193262918, + "grad_norm": 1.6172797679901123, + "learning_rate": 4.7484669014273296e-05, + "loss": 5.3918, + "step": 24216 + }, + { + "epoch": 0.14402535921591017, + "grad_norm": 1.52508544921875, + "learning_rate": 4.74844648164431e-05, + "loss": 5.3287, + "step": 24217 + }, + { + "epoch": 0.14403130649919116, + "grad_norm": 1.6615005731582642, + "learning_rate": 4.7484260610763806e-05, + "loss": 5.3211, + "step": 24218 + }, + { + "epoch": 0.14403725378247217, + "grad_norm": 1.7896537780761719, + "learning_rate": 4.74840563972355e-05, + "loss": 5.3131, + "step": 24219 + }, + { + "epoch": 0.14404320106575316, + "grad_norm": 1.665890097618103, + "learning_rate": 4.748385217585823e-05, + "loss": 5.4934, + "step": 24220 + }, + { + "epoch": 0.14404914834903415, + "grad_norm": 1.9217110872268677, + "learning_rate": 4.7483647946632085e-05, + "loss": 4.9057, + "step": 24221 + }, + { + "epoch": 0.14405509563231517, + "grad_norm": 1.3658103942871094, + "learning_rate": 4.748344370955713e-05, + "loss": 5.3585, + "step": 24222 + }, + { + "epoch": 0.14406104291559615, + "grad_norm": 1.3099697828292847, + "learning_rate": 4.748323946463343e-05, + "loss": 5.7427, + "step": 24223 + }, + { + "epoch": 0.14406699019887714, + "grad_norm": 1.5619271993637085, + "learning_rate": 4.7483035211861075e-05, + "loss": 5.4217, + "step": 24224 + }, + { + "epoch": 0.14407293748215816, + "grad_norm": 1.6359944343566895, + "learning_rate": 4.748283095124012e-05, + "loss": 5.0194, + "step": 24225 + }, + { + "epoch": 0.14407888476543915, + "grad_norm": 1.5773736238479614, + "learning_rate": 4.748262668277064e-05, + "loss": 5.0422, + "step": 24226 + }, + { + "epoch": 0.14408483204872014, + "grad_norm": 1.4909980297088623, + "learning_rate": 4.748242240645271e-05, + "loss": 5.6089, + "step": 24227 + }, + { + "epoch": 0.14409077933200115, + "grad_norm": 1.3489822149276733, + "learning_rate": 4.74822181222864e-05, + "loss": 5.6137, + "step": 24228 + }, + { + "epoch": 0.14409672661528214, + "grad_norm": 1.3335795402526855, + "learning_rate": 4.748201383027178e-05, + "loss": 5.4704, + "step": 24229 + }, + { + "epoch": 0.14410267389856313, + "grad_norm": 1.2519936561584473, + "learning_rate": 4.748180953040891e-05, + "loss": 5.5211, + "step": 24230 + }, + { + "epoch": 0.14410862118184414, + "grad_norm": 1.3223121166229248, + "learning_rate": 4.748160522269788e-05, + "loss": 5.897, + "step": 24231 + }, + { + "epoch": 0.14411456846512513, + "grad_norm": 1.3471014499664307, + "learning_rate": 4.748140090713876e-05, + "loss": 5.5012, + "step": 24232 + }, + { + "epoch": 0.14412051574840612, + "grad_norm": 1.7432321310043335, + "learning_rate": 4.74811965837316e-05, + "loss": 5.5286, + "step": 24233 + }, + { + "epoch": 0.14412646303168714, + "grad_norm": 1.4858758449554443, + "learning_rate": 4.74809922524765e-05, + "loss": 5.0719, + "step": 24234 + }, + { + "epoch": 0.14413241031496812, + "grad_norm": 1.3750518560409546, + "learning_rate": 4.7480787913373515e-05, + "loss": 5.63, + "step": 24235 + }, + { + "epoch": 0.1441383575982491, + "grad_norm": 1.3795223236083984, + "learning_rate": 4.7480583566422723e-05, + "loss": 5.5985, + "step": 24236 + }, + { + "epoch": 0.14414430488153013, + "grad_norm": 1.5779204368591309, + "learning_rate": 4.7480379211624185e-05, + "loss": 5.4503, + "step": 24237 + }, + { + "epoch": 0.14415025216481112, + "grad_norm": 1.5513705015182495, + "learning_rate": 4.7480174848977974e-05, + "loss": 5.6559, + "step": 24238 + }, + { + "epoch": 0.1441561994480921, + "grad_norm": 1.3171751499176025, + "learning_rate": 4.747997047848417e-05, + "loss": 5.7664, + "step": 24239 + }, + { + "epoch": 0.14416214673137312, + "grad_norm": 1.4049638509750366, + "learning_rate": 4.7479766100142855e-05, + "loss": 5.7167, + "step": 24240 + }, + { + "epoch": 0.1441680940146541, + "grad_norm": 1.5657798051834106, + "learning_rate": 4.747956171395407e-05, + "loss": 5.3544, + "step": 24241 + }, + { + "epoch": 0.1441740412979351, + "grad_norm": 1.7015857696533203, + "learning_rate": 4.747935731991791e-05, + "loss": 5.2192, + "step": 24242 + }, + { + "epoch": 0.1441799885812161, + "grad_norm": 1.396626591682434, + "learning_rate": 4.7479152918034433e-05, + "loss": 5.6169, + "step": 24243 + }, + { + "epoch": 0.1441859358644971, + "grad_norm": 1.5319141149520874, + "learning_rate": 4.7478948508303714e-05, + "loss": 5.5103, + "step": 24244 + }, + { + "epoch": 0.1441918831477781, + "grad_norm": 1.878131628036499, + "learning_rate": 4.747874409072583e-05, + "loss": 5.0926, + "step": 24245 + }, + { + "epoch": 0.1441978304310591, + "grad_norm": 1.3702614307403564, + "learning_rate": 4.7478539665300845e-05, + "loss": 5.5891, + "step": 24246 + }, + { + "epoch": 0.1442037777143401, + "grad_norm": 1.729227066040039, + "learning_rate": 4.7478335232028845e-05, + "loss": 5.4893, + "step": 24247 + }, + { + "epoch": 0.14420972499762108, + "grad_norm": 1.356343150138855, + "learning_rate": 4.747813079090988e-05, + "loss": 5.3913, + "step": 24248 + }, + { + "epoch": 0.1442156722809021, + "grad_norm": 1.6735188961029053, + "learning_rate": 4.7477926341944036e-05, + "loss": 5.1161, + "step": 24249 + }, + { + "epoch": 0.14422161956418308, + "grad_norm": 1.6281756162643433, + "learning_rate": 4.7477721885131376e-05, + "loss": 5.0971, + "step": 24250 + }, + { + "epoch": 0.14422756684746407, + "grad_norm": 1.789338231086731, + "learning_rate": 4.747751742047199e-05, + "loss": 5.0477, + "step": 24251 + }, + { + "epoch": 0.1442335141307451, + "grad_norm": 2.3384926319122314, + "learning_rate": 4.7477312947965915e-05, + "loss": 4.5108, + "step": 24252 + }, + { + "epoch": 0.14423946141402608, + "grad_norm": 2.1642465591430664, + "learning_rate": 4.7477108467613255e-05, + "loss": 4.6503, + "step": 24253 + }, + { + "epoch": 0.14424540869730706, + "grad_norm": 2.0242364406585693, + "learning_rate": 4.747690397941406e-05, + "loss": 4.7346, + "step": 24254 + }, + { + "epoch": 0.14425135598058808, + "grad_norm": 2.543030023574829, + "learning_rate": 4.7476699483368414e-05, + "loss": 4.4076, + "step": 24255 + }, + { + "epoch": 0.14425730326386907, + "grad_norm": 2.274937391281128, + "learning_rate": 4.747649497947638e-05, + "loss": 4.5464, + "step": 24256 + }, + { + "epoch": 0.14426325054715006, + "grad_norm": 2.695321798324585, + "learning_rate": 4.747629046773805e-05, + "loss": 4.5794, + "step": 24257 + }, + { + "epoch": 0.14426919783043107, + "grad_norm": 2.2838776111602783, + "learning_rate": 4.7476085948153465e-05, + "loss": 4.6079, + "step": 24258 + }, + { + "epoch": 0.14427514511371206, + "grad_norm": 2.1405718326568604, + "learning_rate": 4.7475881420722714e-05, + "loss": 4.4428, + "step": 24259 + }, + { + "epoch": 0.14428109239699305, + "grad_norm": 2.17814302444458, + "learning_rate": 4.747567688544586e-05, + "loss": 4.3945, + "step": 24260 + }, + { + "epoch": 0.14428703968027404, + "grad_norm": 2.24731183052063, + "learning_rate": 4.747547234232299e-05, + "loss": 4.4622, + "step": 24261 + }, + { + "epoch": 0.14429298696355505, + "grad_norm": 2.2340478897094727, + "learning_rate": 4.747526779135416e-05, + "loss": 4.3968, + "step": 24262 + }, + { + "epoch": 0.14429893424683604, + "grad_norm": 2.1889898777008057, + "learning_rate": 4.747506323253944e-05, + "loss": 4.4357, + "step": 24263 + }, + { + "epoch": 0.14430488153011703, + "grad_norm": 2.30887770652771, + "learning_rate": 4.747485866587891e-05, + "loss": 4.3798, + "step": 24264 + }, + { + "epoch": 0.14431082881339805, + "grad_norm": 1.8898377418518066, + "learning_rate": 4.7474654091372645e-05, + "loss": 4.759, + "step": 24265 + }, + { + "epoch": 0.14431677609667903, + "grad_norm": 1.8610650300979614, + "learning_rate": 4.747444950902071e-05, + "loss": 5.2619, + "step": 24266 + }, + { + "epoch": 0.14432272337996002, + "grad_norm": 2.0524682998657227, + "learning_rate": 4.747424491882317e-05, + "loss": 5.1975, + "step": 24267 + }, + { + "epoch": 0.14432867066324104, + "grad_norm": 1.9053709506988525, + "learning_rate": 4.7474040320780114e-05, + "loss": 4.9233, + "step": 24268 + }, + { + "epoch": 0.14433461794652203, + "grad_norm": 1.8127448558807373, + "learning_rate": 4.747383571489159e-05, + "loss": 5.4335, + "step": 24269 + }, + { + "epoch": 0.14434056522980301, + "grad_norm": 1.6836609840393066, + "learning_rate": 4.747363110115769e-05, + "loss": 5.3978, + "step": 24270 + }, + { + "epoch": 0.14434651251308403, + "grad_norm": 1.5606380701065063, + "learning_rate": 4.747342647957848e-05, + "loss": 5.4756, + "step": 24271 + }, + { + "epoch": 0.14435245979636502, + "grad_norm": 1.5684814453125, + "learning_rate": 4.747322185015402e-05, + "loss": 5.2942, + "step": 24272 + }, + { + "epoch": 0.144358407079646, + "grad_norm": 1.4253596067428589, + "learning_rate": 4.7473017212884395e-05, + "loss": 5.3061, + "step": 24273 + }, + { + "epoch": 0.14436435436292702, + "grad_norm": 1.5249817371368408, + "learning_rate": 4.747281256776968e-05, + "loss": 5.2824, + "step": 24274 + }, + { + "epoch": 0.144370301646208, + "grad_norm": 1.7111622095108032, + "learning_rate": 4.747260791480992e-05, + "loss": 5.3591, + "step": 24275 + }, + { + "epoch": 0.144376248929489, + "grad_norm": 1.6259697675704956, + "learning_rate": 4.7472403254005216e-05, + "loss": 5.6083, + "step": 24276 + }, + { + "epoch": 0.14438219621277001, + "grad_norm": 1.7138687372207642, + "learning_rate": 4.7472198585355634e-05, + "loss": 5.45, + "step": 24277 + }, + { + "epoch": 0.144388143496051, + "grad_norm": 1.55049729347229, + "learning_rate": 4.7471993908861226e-05, + "loss": 5.413, + "step": 24278 + }, + { + "epoch": 0.144394090779332, + "grad_norm": 1.619774580001831, + "learning_rate": 4.7471789224522086e-05, + "loss": 5.4499, + "step": 24279 + }, + { + "epoch": 0.144400038062613, + "grad_norm": 1.4726954698562622, + "learning_rate": 4.747158453233828e-05, + "loss": 5.3787, + "step": 24280 + }, + { + "epoch": 0.144405985345894, + "grad_norm": 1.5688132047653198, + "learning_rate": 4.7471379832309865e-05, + "loss": 5.0952, + "step": 24281 + }, + { + "epoch": 0.14441193262917498, + "grad_norm": 1.5431749820709229, + "learning_rate": 4.747117512443693e-05, + "loss": 5.4646, + "step": 24282 + }, + { + "epoch": 0.144417879912456, + "grad_norm": 1.5271220207214355, + "learning_rate": 4.747097040871954e-05, + "loss": 4.7074, + "step": 24283 + }, + { + "epoch": 0.144423827195737, + "grad_norm": 1.49335777759552, + "learning_rate": 4.7470765685157765e-05, + "loss": 5.1271, + "step": 24284 + }, + { + "epoch": 0.14442977447901797, + "grad_norm": 1.624834418296814, + "learning_rate": 4.7470560953751675e-05, + "loss": 4.7448, + "step": 24285 + }, + { + "epoch": 0.144435721762299, + "grad_norm": 1.4151476621627808, + "learning_rate": 4.7470356214501355e-05, + "loss": 5.2011, + "step": 24286 + }, + { + "epoch": 0.14444166904557998, + "grad_norm": 1.4529035091400146, + "learning_rate": 4.747015146740685e-05, + "loss": 5.2849, + "step": 24287 + }, + { + "epoch": 0.14444761632886097, + "grad_norm": 1.43472158908844, + "learning_rate": 4.746994671246826e-05, + "loss": 5.2655, + "step": 24288 + }, + { + "epoch": 0.14445356361214198, + "grad_norm": 1.2202403545379639, + "learning_rate": 4.7469741949685645e-05, + "loss": 5.3629, + "step": 24289 + }, + { + "epoch": 0.14445951089542297, + "grad_norm": 1.5001815557479858, + "learning_rate": 4.746953717905906e-05, + "loss": 5.3728, + "step": 24290 + }, + { + "epoch": 0.14446545817870396, + "grad_norm": 1.3214131593704224, + "learning_rate": 4.7469332400588604e-05, + "loss": 5.2877, + "step": 24291 + }, + { + "epoch": 0.14447140546198498, + "grad_norm": 1.5443751811981201, + "learning_rate": 4.7469127614274334e-05, + "loss": 5.2852, + "step": 24292 + }, + { + "epoch": 0.14447735274526596, + "grad_norm": 1.63779616355896, + "learning_rate": 4.746892282011632e-05, + "loss": 5.1985, + "step": 24293 + }, + { + "epoch": 0.14448330002854695, + "grad_norm": 1.4744620323181152, + "learning_rate": 4.7468718018114644e-05, + "loss": 5.4607, + "step": 24294 + }, + { + "epoch": 0.14448924731182797, + "grad_norm": 1.6099250316619873, + "learning_rate": 4.7468513208269366e-05, + "loss": 5.3546, + "step": 24295 + }, + { + "epoch": 0.14449519459510896, + "grad_norm": 1.692960262298584, + "learning_rate": 4.746830839058056e-05, + "loss": 5.2117, + "step": 24296 + }, + { + "epoch": 0.14450114187838994, + "grad_norm": 2.379516363143921, + "learning_rate": 4.746810356504831e-05, + "loss": 4.3924, + "step": 24297 + }, + { + "epoch": 0.14450708916167096, + "grad_norm": 1.5348504781723022, + "learning_rate": 4.7467898731672665e-05, + "loss": 5.556, + "step": 24298 + }, + { + "epoch": 0.14451303644495195, + "grad_norm": 1.65830397605896, + "learning_rate": 4.746769389045371e-05, + "loss": 5.26, + "step": 24299 + }, + { + "epoch": 0.14451898372823294, + "grad_norm": 1.6785426139831543, + "learning_rate": 4.746748904139152e-05, + "loss": 5.6459, + "step": 24300 + }, + { + "epoch": 0.14452493101151395, + "grad_norm": 1.8990434408187866, + "learning_rate": 4.746728418448616e-05, + "loss": 5.7021, + "step": 24301 + }, + { + "epoch": 0.14453087829479494, + "grad_norm": 1.5564841032028198, + "learning_rate": 4.7467079319737706e-05, + "loss": 5.1878, + "step": 24302 + }, + { + "epoch": 0.14453682557807593, + "grad_norm": 1.5522741079330444, + "learning_rate": 4.7466874447146226e-05, + "loss": 5.356, + "step": 24303 + }, + { + "epoch": 0.14454277286135694, + "grad_norm": 1.5835893154144287, + "learning_rate": 4.746666956671179e-05, + "loss": 5.1861, + "step": 24304 + }, + { + "epoch": 0.14454872014463793, + "grad_norm": 1.5514174699783325, + "learning_rate": 4.746646467843447e-05, + "loss": 4.9673, + "step": 24305 + }, + { + "epoch": 0.14455466742791892, + "grad_norm": 1.5682575702667236, + "learning_rate": 4.746625978231435e-05, + "loss": 4.8175, + "step": 24306 + }, + { + "epoch": 0.14456061471119994, + "grad_norm": 1.7369959354400635, + "learning_rate": 4.746605487835148e-05, + "loss": 4.8891, + "step": 24307 + }, + { + "epoch": 0.14456656199448092, + "grad_norm": 1.5230990648269653, + "learning_rate": 4.7465849966545945e-05, + "loss": 4.7425, + "step": 24308 + }, + { + "epoch": 0.1445725092777619, + "grad_norm": 1.696858525276184, + "learning_rate": 4.7465645046897814e-05, + "loss": 5.2665, + "step": 24309 + }, + { + "epoch": 0.14457845656104293, + "grad_norm": 1.3940263986587524, + "learning_rate": 4.7465440119407153e-05, + "loss": 4.9574, + "step": 24310 + }, + { + "epoch": 0.14458440384432392, + "grad_norm": 1.6118072271347046, + "learning_rate": 4.7465235184074046e-05, + "loss": 4.6531, + "step": 24311 + }, + { + "epoch": 0.1445903511276049, + "grad_norm": 1.671982765197754, + "learning_rate": 4.746503024089856e-05, + "loss": 4.6481, + "step": 24312 + }, + { + "epoch": 0.14459629841088592, + "grad_norm": 1.74351167678833, + "learning_rate": 4.746482528988076e-05, + "loss": 4.6964, + "step": 24313 + }, + { + "epoch": 0.1446022456941669, + "grad_norm": 1.8614739179611206, + "learning_rate": 4.746462033102072e-05, + "loss": 4.6784, + "step": 24314 + }, + { + "epoch": 0.1446081929774479, + "grad_norm": 1.4908361434936523, + "learning_rate": 4.746441536431851e-05, + "loss": 4.5367, + "step": 24315 + }, + { + "epoch": 0.1446141402607289, + "grad_norm": 1.6088496446609497, + "learning_rate": 4.746421038977421e-05, + "loss": 4.6425, + "step": 24316 + }, + { + "epoch": 0.1446200875440099, + "grad_norm": 1.674081563949585, + "learning_rate": 4.746400540738789e-05, + "loss": 4.4158, + "step": 24317 + }, + { + "epoch": 0.1446260348272909, + "grad_norm": 1.8151460886001587, + "learning_rate": 4.746380041715961e-05, + "loss": 4.6386, + "step": 24318 + }, + { + "epoch": 0.14463198211057188, + "grad_norm": 1.9019746780395508, + "learning_rate": 4.7463595419089456e-05, + "loss": 5.501, + "step": 24319 + }, + { + "epoch": 0.1446379293938529, + "grad_norm": 1.4574391841888428, + "learning_rate": 4.746339041317749e-05, + "loss": 5.4025, + "step": 24320 + }, + { + "epoch": 0.14464387667713388, + "grad_norm": 1.6762443780899048, + "learning_rate": 4.746318539942378e-05, + "loss": 5.4696, + "step": 24321 + }, + { + "epoch": 0.14464982396041487, + "grad_norm": 1.6373367309570312, + "learning_rate": 4.746298037782841e-05, + "loss": 5.3375, + "step": 24322 + }, + { + "epoch": 0.14465577124369589, + "grad_norm": 2.50252103805542, + "learning_rate": 4.7462775348391455e-05, + "loss": 4.5236, + "step": 24323 + }, + { + "epoch": 0.14466171852697687, + "grad_norm": 2.569896936416626, + "learning_rate": 4.7462570311112965e-05, + "loss": 4.5617, + "step": 24324 + }, + { + "epoch": 0.14466766581025786, + "grad_norm": 2.6712963581085205, + "learning_rate": 4.7462365265993024e-05, + "loss": 4.552, + "step": 24325 + }, + { + "epoch": 0.14467361309353888, + "grad_norm": 2.3401951789855957, + "learning_rate": 4.7462160213031705e-05, + "loss": 4.306, + "step": 24326 + }, + { + "epoch": 0.14467956037681987, + "grad_norm": 2.5915024280548096, + "learning_rate": 4.746195515222908e-05, + "loss": 4.2392, + "step": 24327 + }, + { + "epoch": 0.14468550766010085, + "grad_norm": 1.6202619075775146, + "learning_rate": 4.746175008358522e-05, + "loss": 5.2185, + "step": 24328 + }, + { + "epoch": 0.14469145494338187, + "grad_norm": 1.3534679412841797, + "learning_rate": 4.746154500710019e-05, + "loss": 5.3462, + "step": 24329 + }, + { + "epoch": 0.14469740222666286, + "grad_norm": 1.6344646215438843, + "learning_rate": 4.746133992277407e-05, + "loss": 5.2465, + "step": 24330 + }, + { + "epoch": 0.14470334950994385, + "grad_norm": 1.4203686714172363, + "learning_rate": 4.7461134830606924e-05, + "loss": 5.3623, + "step": 24331 + }, + { + "epoch": 0.14470929679322486, + "grad_norm": 1.3993933200836182, + "learning_rate": 4.7460929730598834e-05, + "loss": 5.3726, + "step": 24332 + }, + { + "epoch": 0.14471524407650585, + "grad_norm": 1.804283618927002, + "learning_rate": 4.746072462274986e-05, + "loss": 4.8216, + "step": 24333 + }, + { + "epoch": 0.14472119135978684, + "grad_norm": 1.5801303386688232, + "learning_rate": 4.746051950706009e-05, + "loss": 5.1036, + "step": 24334 + }, + { + "epoch": 0.14472713864306785, + "grad_norm": 1.525907278060913, + "learning_rate": 4.746031438352957e-05, + "loss": 4.743, + "step": 24335 + }, + { + "epoch": 0.14473308592634884, + "grad_norm": 1.6091197729110718, + "learning_rate": 4.746010925215839e-05, + "loss": 5.347, + "step": 24336 + }, + { + "epoch": 0.14473903320962983, + "grad_norm": 1.6794999837875366, + "learning_rate": 4.7459904112946626e-05, + "loss": 4.7244, + "step": 24337 + }, + { + "epoch": 0.14474498049291085, + "grad_norm": 1.5076384544372559, + "learning_rate": 4.745969896589434e-05, + "loss": 4.591, + "step": 24338 + }, + { + "epoch": 0.14475092777619183, + "grad_norm": 1.6222561597824097, + "learning_rate": 4.74594938110016e-05, + "loss": 4.7175, + "step": 24339 + }, + { + "epoch": 0.14475687505947282, + "grad_norm": 1.6383036375045776, + "learning_rate": 4.745928864826848e-05, + "loss": 5.5165, + "step": 24340 + }, + { + "epoch": 0.14476282234275384, + "grad_norm": 1.4812443256378174, + "learning_rate": 4.745908347769507e-05, + "loss": 5.4227, + "step": 24341 + }, + { + "epoch": 0.14476876962603483, + "grad_norm": 1.4673051834106445, + "learning_rate": 4.7458878299281406e-05, + "loss": 5.1107, + "step": 24342 + }, + { + "epoch": 0.14477471690931581, + "grad_norm": 1.3475501537322998, + "learning_rate": 4.745867311302759e-05, + "loss": 4.9491, + "step": 24343 + }, + { + "epoch": 0.14478066419259683, + "grad_norm": 1.437537670135498, + "learning_rate": 4.745846791893368e-05, + "loss": 4.985, + "step": 24344 + }, + { + "epoch": 0.14478661147587782, + "grad_norm": 1.3732200860977173, + "learning_rate": 4.745826271699975e-05, + "loss": 4.9058, + "step": 24345 + }, + { + "epoch": 0.1447925587591588, + "grad_norm": 1.2727693319320679, + "learning_rate": 4.7458057507225875e-05, + "loss": 4.9508, + "step": 24346 + }, + { + "epoch": 0.14479850604243982, + "grad_norm": 1.0993971824645996, + "learning_rate": 4.7457852289612125e-05, + "loss": 5.125, + "step": 24347 + }, + { + "epoch": 0.1448044533257208, + "grad_norm": 1.325086236000061, + "learning_rate": 4.745764706415857e-05, + "loss": 5.4091, + "step": 24348 + }, + { + "epoch": 0.1448104006090018, + "grad_norm": 1.378989815711975, + "learning_rate": 4.745744183086528e-05, + "loss": 5.472, + "step": 24349 + }, + { + "epoch": 0.14481634789228282, + "grad_norm": 1.347360372543335, + "learning_rate": 4.745723658973233e-05, + "loss": 5.4071, + "step": 24350 + }, + { + "epoch": 0.1448222951755638, + "grad_norm": 1.367734670639038, + "learning_rate": 4.74570313407598e-05, + "loss": 5.3895, + "step": 24351 + }, + { + "epoch": 0.1448282424588448, + "grad_norm": 1.4136337041854858, + "learning_rate": 4.745682608394774e-05, + "loss": 5.4637, + "step": 24352 + }, + { + "epoch": 0.1448341897421258, + "grad_norm": 1.475825548171997, + "learning_rate": 4.745662081929624e-05, + "loss": 5.3135, + "step": 24353 + }, + { + "epoch": 0.1448401370254068, + "grad_norm": 1.6745150089263916, + "learning_rate": 4.745641554680537e-05, + "loss": 4.9959, + "step": 24354 + }, + { + "epoch": 0.14484608430868778, + "grad_norm": 1.7860320806503296, + "learning_rate": 4.7456210266475185e-05, + "loss": 5.054, + "step": 24355 + }, + { + "epoch": 0.1448520315919688, + "grad_norm": 1.456579327583313, + "learning_rate": 4.745600497830577e-05, + "loss": 5.2742, + "step": 24356 + }, + { + "epoch": 0.1448579788752498, + "grad_norm": 1.5492240190505981, + "learning_rate": 4.745579968229721e-05, + "loss": 5.0763, + "step": 24357 + }, + { + "epoch": 0.14486392615853078, + "grad_norm": 1.5172940492630005, + "learning_rate": 4.7455594378449554e-05, + "loss": 5.3951, + "step": 24358 + }, + { + "epoch": 0.1448698734418118, + "grad_norm": 1.5349613428115845, + "learning_rate": 4.7455389066762876e-05, + "loss": 5.5627, + "step": 24359 + }, + { + "epoch": 0.14487582072509278, + "grad_norm": 1.8341866731643677, + "learning_rate": 4.745518374723726e-05, + "loss": 5.2127, + "step": 24360 + }, + { + "epoch": 0.14488176800837377, + "grad_norm": 1.4852558374404907, + "learning_rate": 4.745497841987277e-05, + "loss": 5.2764, + "step": 24361 + }, + { + "epoch": 0.14488771529165478, + "grad_norm": 1.6629048585891724, + "learning_rate": 4.745477308466948e-05, + "loss": 5.0675, + "step": 24362 + }, + { + "epoch": 0.14489366257493577, + "grad_norm": 1.7459721565246582, + "learning_rate": 4.745456774162746e-05, + "loss": 5.0865, + "step": 24363 + }, + { + "epoch": 0.14489960985821676, + "grad_norm": 1.9257551431655884, + "learning_rate": 4.745436239074678e-05, + "loss": 4.9022, + "step": 24364 + }, + { + "epoch": 0.14490555714149778, + "grad_norm": 1.9146925210952759, + "learning_rate": 4.745415703202752e-05, + "loss": 5.3591, + "step": 24365 + }, + { + "epoch": 0.14491150442477876, + "grad_norm": 1.5624557733535767, + "learning_rate": 4.7453951665469744e-05, + "loss": 5.2383, + "step": 24366 + }, + { + "epoch": 0.14491745170805975, + "grad_norm": 1.4265660047531128, + "learning_rate": 4.745374629107352e-05, + "loss": 5.5559, + "step": 24367 + }, + { + "epoch": 0.14492339899134077, + "grad_norm": 2.072206497192383, + "learning_rate": 4.7453540908838934e-05, + "loss": 4.6001, + "step": 24368 + }, + { + "epoch": 0.14492934627462176, + "grad_norm": 2.144535779953003, + "learning_rate": 4.745333551876604e-05, + "loss": 4.6874, + "step": 24369 + }, + { + "epoch": 0.14493529355790274, + "grad_norm": 2.311624050140381, + "learning_rate": 4.745313012085492e-05, + "loss": 5.2824, + "step": 24370 + }, + { + "epoch": 0.14494124084118376, + "grad_norm": 1.6523234844207764, + "learning_rate": 4.745292471510565e-05, + "loss": 5.447, + "step": 24371 + }, + { + "epoch": 0.14494718812446475, + "grad_norm": 1.480470895767212, + "learning_rate": 4.745271930151829e-05, + "loss": 5.4511, + "step": 24372 + }, + { + "epoch": 0.14495313540774574, + "grad_norm": 1.6797810792922974, + "learning_rate": 4.7452513880092917e-05, + "loss": 5.258, + "step": 24373 + }, + { + "epoch": 0.14495908269102675, + "grad_norm": 1.541110873222351, + "learning_rate": 4.7452308450829615e-05, + "loss": 5.4877, + "step": 24374 + }, + { + "epoch": 0.14496502997430774, + "grad_norm": 1.8961621522903442, + "learning_rate": 4.745210301372843e-05, + "loss": 5.5844, + "step": 24375 + }, + { + "epoch": 0.14497097725758873, + "grad_norm": 1.8623143434524536, + "learning_rate": 4.745189756878945e-05, + "loss": 5.6454, + "step": 24376 + }, + { + "epoch": 0.14497692454086972, + "grad_norm": 1.6899166107177734, + "learning_rate": 4.745169211601276e-05, + "loss": 5.3369, + "step": 24377 + }, + { + "epoch": 0.14498287182415073, + "grad_norm": 1.7222342491149902, + "learning_rate": 4.74514866553984e-05, + "loss": 5.5805, + "step": 24378 + }, + { + "epoch": 0.14498881910743172, + "grad_norm": 1.7649940252304077, + "learning_rate": 4.745128118694646e-05, + "loss": 5.5568, + "step": 24379 + }, + { + "epoch": 0.1449947663907127, + "grad_norm": 1.9492725133895874, + "learning_rate": 4.745107571065701e-05, + "loss": 5.2019, + "step": 24380 + }, + { + "epoch": 0.14500071367399373, + "grad_norm": 1.6403963565826416, + "learning_rate": 4.745087022653013e-05, + "loss": 5.0867, + "step": 24381 + }, + { + "epoch": 0.1450066609572747, + "grad_norm": 1.6921762228012085, + "learning_rate": 4.7450664734565875e-05, + "loss": 4.823, + "step": 24382 + }, + { + "epoch": 0.1450126082405557, + "grad_norm": 1.7539616823196411, + "learning_rate": 4.745045923476432e-05, + "loss": 5.0268, + "step": 24383 + }, + { + "epoch": 0.14501855552383672, + "grad_norm": 1.7073962688446045, + "learning_rate": 4.745025372712555e-05, + "loss": 5.3082, + "step": 24384 + }, + { + "epoch": 0.1450245028071177, + "grad_norm": 1.457963228225708, + "learning_rate": 4.7450048211649626e-05, + "loss": 5.1926, + "step": 24385 + }, + { + "epoch": 0.1450304500903987, + "grad_norm": 1.7305623292922974, + "learning_rate": 4.744984268833662e-05, + "loss": 5.3563, + "step": 24386 + }, + { + "epoch": 0.1450363973736797, + "grad_norm": 1.4888592958450317, + "learning_rate": 4.744963715718661e-05, + "loss": 5.3478, + "step": 24387 + }, + { + "epoch": 0.1450423446569607, + "grad_norm": 1.7059639692306519, + "learning_rate": 4.744943161819966e-05, + "loss": 5.3782, + "step": 24388 + }, + { + "epoch": 0.14504829194024169, + "grad_norm": 1.539562463760376, + "learning_rate": 4.744922607137585e-05, + "loss": 5.4259, + "step": 24389 + }, + { + "epoch": 0.1450542392235227, + "grad_norm": 1.6427409648895264, + "learning_rate": 4.7449020516715245e-05, + "loss": 5.2505, + "step": 24390 + }, + { + "epoch": 0.1450601865068037, + "grad_norm": 1.5506988763809204, + "learning_rate": 4.7448814954217916e-05, + "loss": 5.214, + "step": 24391 + }, + { + "epoch": 0.14506613379008468, + "grad_norm": 1.7298414707183838, + "learning_rate": 4.744860938388395e-05, + "loss": 5.4361, + "step": 24392 + }, + { + "epoch": 0.1450720810733657, + "grad_norm": 1.6383203268051147, + "learning_rate": 4.744840380571339e-05, + "loss": 5.2703, + "step": 24393 + }, + { + "epoch": 0.14507802835664668, + "grad_norm": 1.6193071603775024, + "learning_rate": 4.744819821970633e-05, + "loss": 5.1414, + "step": 24394 + }, + { + "epoch": 0.14508397563992767, + "grad_norm": 1.6779429912567139, + "learning_rate": 4.7447992625862835e-05, + "loss": 5.1886, + "step": 24395 + }, + { + "epoch": 0.1450899229232087, + "grad_norm": 1.7938344478607178, + "learning_rate": 4.7447787024182975e-05, + "loss": 5.4733, + "step": 24396 + }, + { + "epoch": 0.14509587020648967, + "grad_norm": 1.7248293161392212, + "learning_rate": 4.7447581414666834e-05, + "loss": 5.484, + "step": 24397 + }, + { + "epoch": 0.14510181748977066, + "grad_norm": 1.7749347686767578, + "learning_rate": 4.744737579731447e-05, + "loss": 5.3896, + "step": 24398 + }, + { + "epoch": 0.14510776477305168, + "grad_norm": 1.6471116542816162, + "learning_rate": 4.744717017212595e-05, + "loss": 5.4622, + "step": 24399 + }, + { + "epoch": 0.14511371205633267, + "grad_norm": 1.7347856760025024, + "learning_rate": 4.7446964539101366e-05, + "loss": 5.5045, + "step": 24400 + }, + { + "epoch": 0.14511965933961365, + "grad_norm": 1.7716923952102661, + "learning_rate": 4.744675889824078e-05, + "loss": 5.3604, + "step": 24401 + }, + { + "epoch": 0.14512560662289467, + "grad_norm": 1.8484928607940674, + "learning_rate": 4.7446553249544253e-05, + "loss": 5.4746, + "step": 24402 + }, + { + "epoch": 0.14513155390617566, + "grad_norm": 1.7685359716415405, + "learning_rate": 4.7446347593011875e-05, + "loss": 5.4244, + "step": 24403 + }, + { + "epoch": 0.14513750118945665, + "grad_norm": 1.6140607595443726, + "learning_rate": 4.74461419286437e-05, + "loss": 5.4337, + "step": 24404 + }, + { + "epoch": 0.14514344847273766, + "grad_norm": 1.8425545692443848, + "learning_rate": 4.744593625643982e-05, + "loss": 4.7602, + "step": 24405 + }, + { + "epoch": 0.14514939575601865, + "grad_norm": 1.7787073850631714, + "learning_rate": 4.7445730576400284e-05, + "loss": 4.7792, + "step": 24406 + }, + { + "epoch": 0.14515534303929964, + "grad_norm": 1.7401658296585083, + "learning_rate": 4.7445524888525185e-05, + "loss": 5.1436, + "step": 24407 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 1.9028658866882324, + "learning_rate": 4.744531919281457e-05, + "loss": 5.2477, + "step": 24408 + }, + { + "epoch": 0.14516723760586164, + "grad_norm": 1.86017644405365, + "learning_rate": 4.7445113489268544e-05, + "loss": 5.593, + "step": 24409 + }, + { + "epoch": 0.14517318488914263, + "grad_norm": 1.551146149635315, + "learning_rate": 4.744490777788716e-05, + "loss": 5.7147, + "step": 24410 + }, + { + "epoch": 0.14517913217242365, + "grad_norm": 1.689828634262085, + "learning_rate": 4.744470205867048e-05, + "loss": 5.7174, + "step": 24411 + }, + { + "epoch": 0.14518507945570464, + "grad_norm": 1.6940490007400513, + "learning_rate": 4.744449633161859e-05, + "loss": 5.4586, + "step": 24412 + }, + { + "epoch": 0.14519102673898562, + "grad_norm": 1.6582127809524536, + "learning_rate": 4.7444290596731555e-05, + "loss": 5.4499, + "step": 24413 + }, + { + "epoch": 0.14519697402226664, + "grad_norm": 1.5289736986160278, + "learning_rate": 4.7444084854009454e-05, + "loss": 5.3323, + "step": 24414 + }, + { + "epoch": 0.14520292130554763, + "grad_norm": 1.597364068031311, + "learning_rate": 4.744387910345235e-05, + "loss": 5.2472, + "step": 24415 + }, + { + "epoch": 0.14520886858882862, + "grad_norm": 1.567718505859375, + "learning_rate": 4.7443673345060325e-05, + "loss": 5.1505, + "step": 24416 + }, + { + "epoch": 0.14521481587210963, + "grad_norm": 1.6296337842941284, + "learning_rate": 4.7443467578833446e-05, + "loss": 5.5358, + "step": 24417 + }, + { + "epoch": 0.14522076315539062, + "grad_norm": 1.5341614484786987, + "learning_rate": 4.744326180477179e-05, + "loss": 5.4139, + "step": 24418 + }, + { + "epoch": 0.1452267104386716, + "grad_norm": 1.6611801385879517, + "learning_rate": 4.744305602287541e-05, + "loss": 5.3999, + "step": 24419 + }, + { + "epoch": 0.14523265772195262, + "grad_norm": 1.4712778329849243, + "learning_rate": 4.74428502331444e-05, + "loss": 5.5498, + "step": 24420 + }, + { + "epoch": 0.1452386050052336, + "grad_norm": 1.6814862489700317, + "learning_rate": 4.744264443557882e-05, + "loss": 5.3511, + "step": 24421 + }, + { + "epoch": 0.1452445522885146, + "grad_norm": 1.512871265411377, + "learning_rate": 4.7442438630178746e-05, + "loss": 5.2377, + "step": 24422 + }, + { + "epoch": 0.14525049957179562, + "grad_norm": 1.4311738014221191, + "learning_rate": 4.744223281694424e-05, + "loss": 5.49, + "step": 24423 + }, + { + "epoch": 0.1452564468550766, + "grad_norm": 1.4469417333602905, + "learning_rate": 4.744202699587539e-05, + "loss": 5.2427, + "step": 24424 + }, + { + "epoch": 0.1452623941383576, + "grad_norm": 1.4444100856781006, + "learning_rate": 4.744182116697226e-05, + "loss": 5.263, + "step": 24425 + }, + { + "epoch": 0.1452683414216386, + "grad_norm": 1.4034851789474487, + "learning_rate": 4.744161533023492e-05, + "loss": 5.2735, + "step": 24426 + }, + { + "epoch": 0.1452742887049196, + "grad_norm": 1.637856364250183, + "learning_rate": 4.7441409485663444e-05, + "loss": 5.0982, + "step": 24427 + }, + { + "epoch": 0.14528023598820058, + "grad_norm": 1.7255091667175293, + "learning_rate": 4.7441203633257915e-05, + "loss": 4.9104, + "step": 24428 + }, + { + "epoch": 0.1452861832714816, + "grad_norm": 2.115915536880493, + "learning_rate": 4.744099777301838e-05, + "loss": 4.9661, + "step": 24429 + }, + { + "epoch": 0.1452921305547626, + "grad_norm": 1.8747011423110962, + "learning_rate": 4.7440791904944926e-05, + "loss": 5.2122, + "step": 24430 + }, + { + "epoch": 0.14529807783804358, + "grad_norm": 1.7300605773925781, + "learning_rate": 4.744058602903763e-05, + "loss": 5.1689, + "step": 24431 + }, + { + "epoch": 0.1453040251213246, + "grad_norm": 1.4435160160064697, + "learning_rate": 4.744038014529655e-05, + "loss": 5.2636, + "step": 24432 + }, + { + "epoch": 0.14530997240460558, + "grad_norm": 1.6441041231155396, + "learning_rate": 4.744017425372177e-05, + "loss": 5.2737, + "step": 24433 + }, + { + "epoch": 0.14531591968788657, + "grad_norm": 1.5537841320037842, + "learning_rate": 4.743996835431336e-05, + "loss": 5.1661, + "step": 24434 + }, + { + "epoch": 0.14532186697116756, + "grad_norm": 1.5431783199310303, + "learning_rate": 4.743976244707138e-05, + "loss": 5.0257, + "step": 24435 + }, + { + "epoch": 0.14532781425444857, + "grad_norm": 1.6137834787368774, + "learning_rate": 4.7439556531995914e-05, + "loss": 4.9459, + "step": 24436 + }, + { + "epoch": 0.14533376153772956, + "grad_norm": 1.6870076656341553, + "learning_rate": 4.743935060908703e-05, + "loss": 5.0615, + "step": 24437 + }, + { + "epoch": 0.14533970882101055, + "grad_norm": 1.7536146640777588, + "learning_rate": 4.74391446783448e-05, + "loss": 5.041, + "step": 24438 + }, + { + "epoch": 0.14534565610429157, + "grad_norm": 1.8259520530700684, + "learning_rate": 4.7438938739769304e-05, + "loss": 5.0222, + "step": 24439 + }, + { + "epoch": 0.14535160338757255, + "grad_norm": 1.9656455516815186, + "learning_rate": 4.74387327933606e-05, + "loss": 5.3352, + "step": 24440 + }, + { + "epoch": 0.14535755067085354, + "grad_norm": 2.096452236175537, + "learning_rate": 4.743852683911877e-05, + "loss": 5.4241, + "step": 24441 + }, + { + "epoch": 0.14536349795413456, + "grad_norm": 1.6562155485153198, + "learning_rate": 4.743832087704388e-05, + "loss": 6.0049, + "step": 24442 + }, + { + "epoch": 0.14536944523741555, + "grad_norm": 1.538763165473938, + "learning_rate": 4.7438114907136e-05, + "loss": 5.4588, + "step": 24443 + }, + { + "epoch": 0.14537539252069653, + "grad_norm": 1.835303783416748, + "learning_rate": 4.7437908929395216e-05, + "loss": 5.1866, + "step": 24444 + }, + { + "epoch": 0.14538133980397755, + "grad_norm": 1.6841330528259277, + "learning_rate": 4.743770294382158e-05, + "loss": 5.51, + "step": 24445 + }, + { + "epoch": 0.14538728708725854, + "grad_norm": 1.775283694267273, + "learning_rate": 4.743749695041517e-05, + "loss": 5.5482, + "step": 24446 + }, + { + "epoch": 0.14539323437053953, + "grad_norm": 1.5169485807418823, + "learning_rate": 4.7437290949176074e-05, + "loss": 5.5175, + "step": 24447 + }, + { + "epoch": 0.14539918165382054, + "grad_norm": 1.3337781429290771, + "learning_rate": 4.743708494010435e-05, + "loss": 5.8864, + "step": 24448 + }, + { + "epoch": 0.14540512893710153, + "grad_norm": 1.5488650798797607, + "learning_rate": 4.743687892320006e-05, + "loss": 5.9374, + "step": 24449 + }, + { + "epoch": 0.14541107622038252, + "grad_norm": 1.7683097124099731, + "learning_rate": 4.74366728984633e-05, + "loss": 5.2741, + "step": 24450 + }, + { + "epoch": 0.14541702350366353, + "grad_norm": 1.750689148902893, + "learning_rate": 4.743646686589413e-05, + "loss": 5.5179, + "step": 24451 + }, + { + "epoch": 0.14542297078694452, + "grad_norm": 1.8411931991577148, + "learning_rate": 4.7436260825492604e-05, + "loss": 5.2341, + "step": 24452 + }, + { + "epoch": 0.1454289180702255, + "grad_norm": 1.8112800121307373, + "learning_rate": 4.7436054777258824e-05, + "loss": 5.2025, + "step": 24453 + }, + { + "epoch": 0.14543486535350653, + "grad_norm": 1.5593929290771484, + "learning_rate": 4.743584872119285e-05, + "loss": 5.4906, + "step": 24454 + }, + { + "epoch": 0.14544081263678751, + "grad_norm": 1.683072805404663, + "learning_rate": 4.743564265729475e-05, + "loss": 5.279, + "step": 24455 + }, + { + "epoch": 0.1454467599200685, + "grad_norm": 1.6395639181137085, + "learning_rate": 4.74354365855646e-05, + "loss": 5.9672, + "step": 24456 + }, + { + "epoch": 0.14545270720334952, + "grad_norm": 1.5672929286956787, + "learning_rate": 4.743523050600247e-05, + "loss": 5.3588, + "step": 24457 + }, + { + "epoch": 0.1454586544866305, + "grad_norm": 1.7329927682876587, + "learning_rate": 4.7435024418608434e-05, + "loss": 5.1456, + "step": 24458 + }, + { + "epoch": 0.1454646017699115, + "grad_norm": 1.7443114519119263, + "learning_rate": 4.7434818323382554e-05, + "loss": 5.0256, + "step": 24459 + }, + { + "epoch": 0.1454705490531925, + "grad_norm": 1.6770588159561157, + "learning_rate": 4.7434612220324926e-05, + "loss": 5.0028, + "step": 24460 + }, + { + "epoch": 0.1454764963364735, + "grad_norm": 1.7134469747543335, + "learning_rate": 4.74344061094356e-05, + "loss": 5.0299, + "step": 24461 + }, + { + "epoch": 0.1454824436197545, + "grad_norm": 1.55935537815094, + "learning_rate": 4.743419999071465e-05, + "loss": 5.0422, + "step": 24462 + }, + { + "epoch": 0.1454883909030355, + "grad_norm": 1.722185730934143, + "learning_rate": 4.743399386416216e-05, + "loss": 4.9558, + "step": 24463 + }, + { + "epoch": 0.1454943381863165, + "grad_norm": 1.6128919124603271, + "learning_rate": 4.743378772977819e-05, + "loss": 4.903, + "step": 24464 + }, + { + "epoch": 0.14550028546959748, + "grad_norm": 1.6574269533157349, + "learning_rate": 4.7433581587562816e-05, + "loss": 4.9092, + "step": 24465 + }, + { + "epoch": 0.1455062327528785, + "grad_norm": 1.6132055521011353, + "learning_rate": 4.7433375437516116e-05, + "loss": 4.8561, + "step": 24466 + }, + { + "epoch": 0.14551218003615948, + "grad_norm": 1.7846872806549072, + "learning_rate": 4.743316927963814e-05, + "loss": 5.3115, + "step": 24467 + }, + { + "epoch": 0.14551812731944047, + "grad_norm": 1.787424087524414, + "learning_rate": 4.7432963113929e-05, + "loss": 5.2607, + "step": 24468 + }, + { + "epoch": 0.1455240746027215, + "grad_norm": 1.9011743068695068, + "learning_rate": 4.743275694038873e-05, + "loss": 4.989, + "step": 24469 + }, + { + "epoch": 0.14553002188600248, + "grad_norm": 1.7853960990905762, + "learning_rate": 4.7432550759017415e-05, + "loss": 5.066, + "step": 24470 + }, + { + "epoch": 0.14553596916928346, + "grad_norm": 2.131143569946289, + "learning_rate": 4.7432344569815134e-05, + "loss": 5.0322, + "step": 24471 + }, + { + "epoch": 0.14554191645256448, + "grad_norm": 1.7870924472808838, + "learning_rate": 4.743213837278195e-05, + "loss": 4.8767, + "step": 24472 + }, + { + "epoch": 0.14554786373584547, + "grad_norm": 1.8804802894592285, + "learning_rate": 4.743193216791795e-05, + "loss": 5.0155, + "step": 24473 + }, + { + "epoch": 0.14555381101912646, + "grad_norm": 2.4177560806274414, + "learning_rate": 4.7431725955223175e-05, + "loss": 4.6521, + "step": 24474 + }, + { + "epoch": 0.14555975830240747, + "grad_norm": 2.3657360076904297, + "learning_rate": 4.743151973469773e-05, + "loss": 4.5406, + "step": 24475 + }, + { + "epoch": 0.14556570558568846, + "grad_norm": 2.233304977416992, + "learning_rate": 4.743131350634167e-05, + "loss": 4.6725, + "step": 24476 + }, + { + "epoch": 0.14557165286896945, + "grad_norm": 2.314302921295166, + "learning_rate": 4.743110727015506e-05, + "loss": 4.2326, + "step": 24477 + }, + { + "epoch": 0.14557760015225046, + "grad_norm": 2.272599220275879, + "learning_rate": 4.7430901026137996e-05, + "loss": 4.2031, + "step": 24478 + }, + { + "epoch": 0.14558354743553145, + "grad_norm": 1.7667213678359985, + "learning_rate": 4.743069477429053e-05, + "loss": 5.0108, + "step": 24479 + }, + { + "epoch": 0.14558949471881244, + "grad_norm": 2.192775011062622, + "learning_rate": 4.7430488514612746e-05, + "loss": 4.0625, + "step": 24480 + }, + { + "epoch": 0.14559544200209346, + "grad_norm": 2.4205431938171387, + "learning_rate": 4.743028224710471e-05, + "loss": 4.1039, + "step": 24481 + }, + { + "epoch": 0.14560138928537444, + "grad_norm": 2.1844823360443115, + "learning_rate": 4.743007597176649e-05, + "loss": 3.9408, + "step": 24482 + }, + { + "epoch": 0.14560733656865543, + "grad_norm": 2.3235034942626953, + "learning_rate": 4.742986968859816e-05, + "loss": 4.0957, + "step": 24483 + }, + { + "epoch": 0.14561328385193645, + "grad_norm": 2.3802473545074463, + "learning_rate": 4.742966339759979e-05, + "loss": 4.2864, + "step": 24484 + }, + { + "epoch": 0.14561923113521744, + "grad_norm": 2.2253031730651855, + "learning_rate": 4.742945709877147e-05, + "loss": 4.1559, + "step": 24485 + }, + { + "epoch": 0.14562517841849842, + "grad_norm": 2.559008836746216, + "learning_rate": 4.742925079211324e-05, + "loss": 4.0356, + "step": 24486 + }, + { + "epoch": 0.14563112570177944, + "grad_norm": 2.222951889038086, + "learning_rate": 4.7429044477625206e-05, + "loss": 4.0193, + "step": 24487 + }, + { + "epoch": 0.14563707298506043, + "grad_norm": 1.9578197002410889, + "learning_rate": 4.742883815530742e-05, + "loss": 4.8917, + "step": 24488 + }, + { + "epoch": 0.14564302026834142, + "grad_norm": 1.8768174648284912, + "learning_rate": 4.742863182515996e-05, + "loss": 4.8987, + "step": 24489 + }, + { + "epoch": 0.14564896755162243, + "grad_norm": 2.0520718097686768, + "learning_rate": 4.7428425487182895e-05, + "loss": 5.2806, + "step": 24490 + }, + { + "epoch": 0.14565491483490342, + "grad_norm": 1.7171385288238525, + "learning_rate": 4.74282191413763e-05, + "loss": 4.801, + "step": 24491 + }, + { + "epoch": 0.1456608621181844, + "grad_norm": 1.5739022493362427, + "learning_rate": 4.742801278774024e-05, + "loss": 5.5888, + "step": 24492 + }, + { + "epoch": 0.1456668094014654, + "grad_norm": 1.6728390455245972, + "learning_rate": 4.742780642627479e-05, + "loss": 5.0339, + "step": 24493 + }, + { + "epoch": 0.1456727566847464, + "grad_norm": 1.5647993087768555, + "learning_rate": 4.7427600056980035e-05, + "loss": 4.859, + "step": 24494 + }, + { + "epoch": 0.1456787039680274, + "grad_norm": 1.8099721670150757, + "learning_rate": 4.7427393679856026e-05, + "loss": 5.4872, + "step": 24495 + }, + { + "epoch": 0.1456846512513084, + "grad_norm": 1.7053685188293457, + "learning_rate": 4.742718729490285e-05, + "loss": 5.0992, + "step": 24496 + }, + { + "epoch": 0.1456905985345894, + "grad_norm": 1.57960045337677, + "learning_rate": 4.742698090212058e-05, + "loss": 5.3847, + "step": 24497 + }, + { + "epoch": 0.1456965458178704, + "grad_norm": 1.6272963285446167, + "learning_rate": 4.7426774501509275e-05, + "loss": 5.2833, + "step": 24498 + }, + { + "epoch": 0.14570249310115138, + "grad_norm": 1.8782978057861328, + "learning_rate": 4.742656809306902e-05, + "loss": 5.2527, + "step": 24499 + }, + { + "epoch": 0.1457084403844324, + "grad_norm": 1.6581416130065918, + "learning_rate": 4.742636167679988e-05, + "loss": 5.4469, + "step": 24500 + }, + { + "epoch": 0.14571438766771339, + "grad_norm": 1.4809743165969849, + "learning_rate": 4.742615525270193e-05, + "loss": 5.5264, + "step": 24501 + }, + { + "epoch": 0.14572033495099437, + "grad_norm": 1.7145473957061768, + "learning_rate": 4.742594882077523e-05, + "loss": 5.3418, + "step": 24502 + }, + { + "epoch": 0.1457262822342754, + "grad_norm": 1.5335949659347534, + "learning_rate": 4.742574238101988e-05, + "loss": 5.3467, + "step": 24503 + }, + { + "epoch": 0.14573222951755638, + "grad_norm": 1.4682936668395996, + "learning_rate": 4.742553593343593e-05, + "loss": 5.3817, + "step": 24504 + }, + { + "epoch": 0.14573817680083737, + "grad_norm": 1.3231433629989624, + "learning_rate": 4.742532947802345e-05, + "loss": 5.4963, + "step": 24505 + }, + { + "epoch": 0.14574412408411838, + "grad_norm": 1.4141665697097778, + "learning_rate": 4.7425123014782525e-05, + "loss": 5.6261, + "step": 24506 + }, + { + "epoch": 0.14575007136739937, + "grad_norm": 1.5164703130722046, + "learning_rate": 4.742491654371322e-05, + "loss": 5.8411, + "step": 24507 + }, + { + "epoch": 0.14575601865068036, + "grad_norm": 1.309892177581787, + "learning_rate": 4.7424710064815606e-05, + "loss": 5.497, + "step": 24508 + }, + { + "epoch": 0.14576196593396137, + "grad_norm": 1.9315495491027832, + "learning_rate": 4.742450357808976e-05, + "loss": 5.5718, + "step": 24509 + }, + { + "epoch": 0.14576791321724236, + "grad_norm": 1.3881922960281372, + "learning_rate": 4.742429708353575e-05, + "loss": 5.6583, + "step": 24510 + }, + { + "epoch": 0.14577386050052335, + "grad_norm": 1.186221957206726, + "learning_rate": 4.7424090581153654e-05, + "loss": 5.5111, + "step": 24511 + }, + { + "epoch": 0.14577980778380437, + "grad_norm": 1.5839451551437378, + "learning_rate": 4.742388407094354e-05, + "loss": 5.285, + "step": 24512 + }, + { + "epoch": 0.14578575506708535, + "grad_norm": 1.659534215927124, + "learning_rate": 4.7423677552905474e-05, + "loss": 5.2722, + "step": 24513 + }, + { + "epoch": 0.14579170235036634, + "grad_norm": 1.530068278312683, + "learning_rate": 4.742347102703953e-05, + "loss": 5.6943, + "step": 24514 + }, + { + "epoch": 0.14579764963364736, + "grad_norm": 1.966497540473938, + "learning_rate": 4.7423264493345794e-05, + "loss": 5.3509, + "step": 24515 + }, + { + "epoch": 0.14580359691692835, + "grad_norm": 2.2554593086242676, + "learning_rate": 4.7423057951824325e-05, + "loss": 4.8778, + "step": 24516 + }, + { + "epoch": 0.14580954420020933, + "grad_norm": 1.746324062347412, + "learning_rate": 4.7422851402475195e-05, + "loss": 5.2867, + "step": 24517 + }, + { + "epoch": 0.14581549148349035, + "grad_norm": 1.5312012434005737, + "learning_rate": 4.7422644845298484e-05, + "loss": 5.3472, + "step": 24518 + }, + { + "epoch": 0.14582143876677134, + "grad_norm": 1.8742462396621704, + "learning_rate": 4.742243828029426e-05, + "loss": 5.2399, + "step": 24519 + }, + { + "epoch": 0.14582738605005233, + "grad_norm": 1.563302993774414, + "learning_rate": 4.7422231707462585e-05, + "loss": 5.3742, + "step": 24520 + }, + { + "epoch": 0.14583333333333334, + "grad_norm": 1.7737884521484375, + "learning_rate": 4.7422025126803545e-05, + "loss": 5.6674, + "step": 24521 + }, + { + "epoch": 0.14583928061661433, + "grad_norm": 1.9887245893478394, + "learning_rate": 4.742181853831721e-05, + "loss": 5.3851, + "step": 24522 + }, + { + "epoch": 0.14584522789989532, + "grad_norm": 1.773938775062561, + "learning_rate": 4.7421611942003654e-05, + "loss": 5.22, + "step": 24523 + }, + { + "epoch": 0.14585117518317633, + "grad_norm": 1.733723521232605, + "learning_rate": 4.742140533786294e-05, + "loss": 5.0786, + "step": 24524 + }, + { + "epoch": 0.14585712246645732, + "grad_norm": 1.7058782577514648, + "learning_rate": 4.742119872589514e-05, + "loss": 5.214, + "step": 24525 + }, + { + "epoch": 0.1458630697497383, + "grad_norm": 1.7503206729888916, + "learning_rate": 4.742099210610034e-05, + "loss": 5.3132, + "step": 24526 + }, + { + "epoch": 0.14586901703301933, + "grad_norm": 1.9028650522232056, + "learning_rate": 4.7420785478478596e-05, + "loss": 5.3016, + "step": 24527 + }, + { + "epoch": 0.14587496431630032, + "grad_norm": 1.7530872821807861, + "learning_rate": 4.742057884302999e-05, + "loss": 5.199, + "step": 24528 + }, + { + "epoch": 0.1458809115995813, + "grad_norm": 1.8776800632476807, + "learning_rate": 4.7420372199754595e-05, + "loss": 5.0358, + "step": 24529 + }, + { + "epoch": 0.14588685888286232, + "grad_norm": 1.6402316093444824, + "learning_rate": 4.7420165548652474e-05, + "loss": 5.0548, + "step": 24530 + }, + { + "epoch": 0.1458928061661433, + "grad_norm": 1.9277185201644897, + "learning_rate": 4.741995888972371e-05, + "loss": 5.0196, + "step": 24531 + }, + { + "epoch": 0.1458987534494243, + "grad_norm": 1.7798771858215332, + "learning_rate": 4.7419752222968364e-05, + "loss": 5.0015, + "step": 24532 + }, + { + "epoch": 0.1459047007327053, + "grad_norm": 1.6921379566192627, + "learning_rate": 4.741954554838652e-05, + "loss": 5.0044, + "step": 24533 + }, + { + "epoch": 0.1459106480159863, + "grad_norm": 1.5286321640014648, + "learning_rate": 4.741933886597825e-05, + "loss": 5.2836, + "step": 24534 + }, + { + "epoch": 0.1459165952992673, + "grad_norm": 1.5439866781234741, + "learning_rate": 4.741913217574361e-05, + "loss": 5.645, + "step": 24535 + }, + { + "epoch": 0.1459225425825483, + "grad_norm": 1.8537307977676392, + "learning_rate": 4.741892547768269e-05, + "loss": 5.7112, + "step": 24536 + }, + { + "epoch": 0.1459284898658293, + "grad_norm": 1.458747386932373, + "learning_rate": 4.741871877179554e-05, + "loss": 5.3639, + "step": 24537 + }, + { + "epoch": 0.14593443714911028, + "grad_norm": 1.8507471084594727, + "learning_rate": 4.7418512058082255e-05, + "loss": 4.7947, + "step": 24538 + }, + { + "epoch": 0.1459403844323913, + "grad_norm": 1.8104653358459473, + "learning_rate": 4.74183053365429e-05, + "loss": 4.9444, + "step": 24539 + }, + { + "epoch": 0.14594633171567228, + "grad_norm": 1.8392473459243774, + "learning_rate": 4.741809860717755e-05, + "loss": 4.6432, + "step": 24540 + }, + { + "epoch": 0.14595227899895327, + "grad_norm": 1.8322739601135254, + "learning_rate": 4.7417891869986274e-05, + "loss": 4.8165, + "step": 24541 + }, + { + "epoch": 0.1459582262822343, + "grad_norm": 1.7574645280838013, + "learning_rate": 4.741768512496914e-05, + "loss": 4.5592, + "step": 24542 + }, + { + "epoch": 0.14596417356551528, + "grad_norm": 1.6960285902023315, + "learning_rate": 4.7417478372126223e-05, + "loss": 4.8203, + "step": 24543 + }, + { + "epoch": 0.14597012084879626, + "grad_norm": 1.624930739402771, + "learning_rate": 4.741727161145759e-05, + "loss": 4.7056, + "step": 24544 + }, + { + "epoch": 0.14597606813207728, + "grad_norm": 1.6901119947433472, + "learning_rate": 4.741706484296333e-05, + "loss": 4.8837, + "step": 24545 + }, + { + "epoch": 0.14598201541535827, + "grad_norm": 1.6677742004394531, + "learning_rate": 4.74168580666435e-05, + "loss": 5.777, + "step": 24546 + }, + { + "epoch": 0.14598796269863926, + "grad_norm": 1.9622048139572144, + "learning_rate": 4.741665128249818e-05, + "loss": 5.1728, + "step": 24547 + }, + { + "epoch": 0.14599390998192027, + "grad_norm": 2.1024181842803955, + "learning_rate": 4.7416444490527435e-05, + "loss": 5.1417, + "step": 24548 + }, + { + "epoch": 0.14599985726520126, + "grad_norm": 1.9071123600006104, + "learning_rate": 4.7416237690731336e-05, + "loss": 5.1996, + "step": 24549 + }, + { + "epoch": 0.14600580454848225, + "grad_norm": 2.404794931411743, + "learning_rate": 4.741603088310997e-05, + "loss": 5.2283, + "step": 24550 + }, + { + "epoch": 0.14601175183176324, + "grad_norm": 1.6359655857086182, + "learning_rate": 4.74158240676634e-05, + "loss": 5.3233, + "step": 24551 + }, + { + "epoch": 0.14601769911504425, + "grad_norm": 2.5952274799346924, + "learning_rate": 4.7415617244391686e-05, + "loss": 4.9227, + "step": 24552 + }, + { + "epoch": 0.14602364639832524, + "grad_norm": 1.709825038909912, + "learning_rate": 4.7415410413294914e-05, + "loss": 5.2745, + "step": 24553 + }, + { + "epoch": 0.14602959368160623, + "grad_norm": 1.709489345550537, + "learning_rate": 4.741520357437316e-05, + "loss": 5.0694, + "step": 24554 + }, + { + "epoch": 0.14603554096488724, + "grad_norm": 1.6386815309524536, + "learning_rate": 4.7414996727626484e-05, + "loss": 5.1265, + "step": 24555 + }, + { + "epoch": 0.14604148824816823, + "grad_norm": 1.4357349872589111, + "learning_rate": 4.741478987305497e-05, + "loss": 5.149, + "step": 24556 + }, + { + "epoch": 0.14604743553144922, + "grad_norm": 1.951442003250122, + "learning_rate": 4.741458301065868e-05, + "loss": 5.0956, + "step": 24557 + }, + { + "epoch": 0.14605338281473024, + "grad_norm": 2.0688650608062744, + "learning_rate": 4.7414376140437696e-05, + "loss": 4.8894, + "step": 24558 + }, + { + "epoch": 0.14605933009801123, + "grad_norm": 1.6985790729522705, + "learning_rate": 4.741416926239208e-05, + "loss": 4.9548, + "step": 24559 + }, + { + "epoch": 0.1460652773812922, + "grad_norm": 1.5429292917251587, + "learning_rate": 4.7413962376521906e-05, + "loss": 4.9634, + "step": 24560 + }, + { + "epoch": 0.14607122466457323, + "grad_norm": 1.5821011066436768, + "learning_rate": 4.741375548282726e-05, + "loss": 5.3701, + "step": 24561 + }, + { + "epoch": 0.14607717194785422, + "grad_norm": 1.5868496894836426, + "learning_rate": 4.7413548581308196e-05, + "loss": 5.0315, + "step": 24562 + }, + { + "epoch": 0.1460831192311352, + "grad_norm": 1.471294641494751, + "learning_rate": 4.74133416719648e-05, + "loss": 4.9128, + "step": 24563 + }, + { + "epoch": 0.14608906651441622, + "grad_norm": 1.4862011671066284, + "learning_rate": 4.7413134754797126e-05, + "loss": 4.8533, + "step": 24564 + }, + { + "epoch": 0.1460950137976972, + "grad_norm": 1.47359037399292, + "learning_rate": 4.741292782980527e-05, + "loss": 4.8428, + "step": 24565 + }, + { + "epoch": 0.1461009610809782, + "grad_norm": 1.4886908531188965, + "learning_rate": 4.741272089698928e-05, + "loss": 4.8365, + "step": 24566 + }, + { + "epoch": 0.1461069083642592, + "grad_norm": 1.561625599861145, + "learning_rate": 4.741251395634925e-05, + "loss": 4.9553, + "step": 24567 + }, + { + "epoch": 0.1461128556475402, + "grad_norm": 1.5089234113693237, + "learning_rate": 4.741230700788524e-05, + "loss": 4.7997, + "step": 24568 + }, + { + "epoch": 0.1461188029308212, + "grad_norm": 1.5985972881317139, + "learning_rate": 4.741210005159733e-05, + "loss": 4.8006, + "step": 24569 + }, + { + "epoch": 0.1461247502141022, + "grad_norm": 1.5302664041519165, + "learning_rate": 4.741189308748558e-05, + "loss": 4.7809, + "step": 24570 + }, + { + "epoch": 0.1461306974973832, + "grad_norm": 1.5156875848770142, + "learning_rate": 4.7411686115550074e-05, + "loss": 4.6965, + "step": 24571 + }, + { + "epoch": 0.14613664478066418, + "grad_norm": 1.6026439666748047, + "learning_rate": 4.741147913579088e-05, + "loss": 4.9386, + "step": 24572 + }, + { + "epoch": 0.1461425920639452, + "grad_norm": 1.849469542503357, + "learning_rate": 4.7411272148208067e-05, + "loss": 5.7675, + "step": 24573 + }, + { + "epoch": 0.1461485393472262, + "grad_norm": 1.9813694953918457, + "learning_rate": 4.7411065152801716e-05, + "loss": 5.3741, + "step": 24574 + }, + { + "epoch": 0.14615448663050717, + "grad_norm": 2.459035634994507, + "learning_rate": 4.741085814957189e-05, + "loss": 4.6126, + "step": 24575 + }, + { + "epoch": 0.1461604339137882, + "grad_norm": 2.858220100402832, + "learning_rate": 4.741065113851867e-05, + "loss": 4.1891, + "step": 24576 + }, + { + "epoch": 0.14616638119706918, + "grad_norm": 2.2826805114746094, + "learning_rate": 4.741044411964212e-05, + "loss": 4.4009, + "step": 24577 + }, + { + "epoch": 0.14617232848035017, + "grad_norm": 2.0174343585968018, + "learning_rate": 4.741023709294231e-05, + "loss": 4.946, + "step": 24578 + }, + { + "epoch": 0.14617827576363118, + "grad_norm": 2.0307867527008057, + "learning_rate": 4.741003005841932e-05, + "loss": 5.0872, + "step": 24579 + }, + { + "epoch": 0.14618422304691217, + "grad_norm": 2.147662878036499, + "learning_rate": 4.740982301607323e-05, + "loss": 4.648, + "step": 24580 + }, + { + "epoch": 0.14619017033019316, + "grad_norm": 2.7005789279937744, + "learning_rate": 4.740961596590409e-05, + "loss": 5.0555, + "step": 24581 + }, + { + "epoch": 0.14619611761347417, + "grad_norm": 2.3652596473693848, + "learning_rate": 4.740940890791199e-05, + "loss": 4.7969, + "step": 24582 + }, + { + "epoch": 0.14620206489675516, + "grad_norm": 2.5925567150115967, + "learning_rate": 4.7409201842097e-05, + "loss": 4.7544, + "step": 24583 + }, + { + "epoch": 0.14620801218003615, + "grad_norm": 1.9309169054031372, + "learning_rate": 4.740899476845918e-05, + "loss": 5.0901, + "step": 24584 + }, + { + "epoch": 0.14621395946331717, + "grad_norm": 2.6501107215881348, + "learning_rate": 4.740878768699861e-05, + "loss": 5.1449, + "step": 24585 + }, + { + "epoch": 0.14621990674659816, + "grad_norm": 2.3010451793670654, + "learning_rate": 4.7408580597715376e-05, + "loss": 5.276, + "step": 24586 + }, + { + "epoch": 0.14622585402987914, + "grad_norm": 1.8606983423233032, + "learning_rate": 4.740837350060953e-05, + "loss": 5.1453, + "step": 24587 + }, + { + "epoch": 0.14623180131316016, + "grad_norm": 2.0047266483306885, + "learning_rate": 4.740816639568115e-05, + "loss": 4.8976, + "step": 24588 + }, + { + "epoch": 0.14623774859644115, + "grad_norm": 2.4806363582611084, + "learning_rate": 4.740795928293032e-05, + "loss": 4.1182, + "step": 24589 + }, + { + "epoch": 0.14624369587972214, + "grad_norm": 2.560715675354004, + "learning_rate": 4.74077521623571e-05, + "loss": 4.4461, + "step": 24590 + }, + { + "epoch": 0.14624964316300315, + "grad_norm": 2.3709921836853027, + "learning_rate": 4.740754503396156e-05, + "loss": 4.5193, + "step": 24591 + }, + { + "epoch": 0.14625559044628414, + "grad_norm": 2.1095876693725586, + "learning_rate": 4.7407337897743784e-05, + "loss": 4.881, + "step": 24592 + }, + { + "epoch": 0.14626153772956513, + "grad_norm": 1.6448874473571777, + "learning_rate": 4.740713075370383e-05, + "loss": 5.0707, + "step": 24593 + }, + { + "epoch": 0.14626748501284614, + "grad_norm": 1.9237885475158691, + "learning_rate": 4.740692360184178e-05, + "loss": 5.0708, + "step": 24594 + }, + { + "epoch": 0.14627343229612713, + "grad_norm": 1.7685006856918335, + "learning_rate": 4.740671644215771e-05, + "loss": 5.0034, + "step": 24595 + }, + { + "epoch": 0.14627937957940812, + "grad_norm": 1.999850869178772, + "learning_rate": 4.740650927465169e-05, + "loss": 5.1153, + "step": 24596 + }, + { + "epoch": 0.14628532686268914, + "grad_norm": 2.0358314514160156, + "learning_rate": 4.740630209932378e-05, + "loss": 5.0567, + "step": 24597 + }, + { + "epoch": 0.14629127414597012, + "grad_norm": 1.883933424949646, + "learning_rate": 4.740609491617407e-05, + "loss": 5.0562, + "step": 24598 + }, + { + "epoch": 0.1462972214292511, + "grad_norm": 2.0172266960144043, + "learning_rate": 4.740588772520261e-05, + "loss": 5.0597, + "step": 24599 + }, + { + "epoch": 0.14630316871253213, + "grad_norm": 1.798579216003418, + "learning_rate": 4.74056805264095e-05, + "loss": 4.9391, + "step": 24600 + }, + { + "epoch": 0.14630911599581312, + "grad_norm": 1.8433833122253418, + "learning_rate": 4.7405473319794794e-05, + "loss": 5.0088, + "step": 24601 + }, + { + "epoch": 0.1463150632790941, + "grad_norm": 1.7729485034942627, + "learning_rate": 4.7405266105358564e-05, + "loss": 4.8909, + "step": 24602 + }, + { + "epoch": 0.14632101056237512, + "grad_norm": 1.9823477268218994, + "learning_rate": 4.740505888310089e-05, + "loss": 5.0547, + "step": 24603 + }, + { + "epoch": 0.1463269578456561, + "grad_norm": 2.0508856773376465, + "learning_rate": 4.740485165302184e-05, + "loss": 5.0857, + "step": 24604 + }, + { + "epoch": 0.1463329051289371, + "grad_norm": 2.0253899097442627, + "learning_rate": 4.740464441512149e-05, + "loss": 4.9882, + "step": 24605 + }, + { + "epoch": 0.1463388524122181, + "grad_norm": 1.977512001991272, + "learning_rate": 4.740443716939991e-05, + "loss": 4.8881, + "step": 24606 + }, + { + "epoch": 0.1463447996954991, + "grad_norm": 1.8985627889633179, + "learning_rate": 4.7404229915857175e-05, + "loss": 5.0182, + "step": 24607 + }, + { + "epoch": 0.1463507469787801, + "grad_norm": 2.009416103363037, + "learning_rate": 4.7404022654493355e-05, + "loss": 4.7361, + "step": 24608 + }, + { + "epoch": 0.14635669426206108, + "grad_norm": 2.3150322437286377, + "learning_rate": 4.7403815385308514e-05, + "loss": 4.2706, + "step": 24609 + }, + { + "epoch": 0.1463626415453421, + "grad_norm": 2.10493540763855, + "learning_rate": 4.740360810830275e-05, + "loss": 4.2009, + "step": 24610 + }, + { + "epoch": 0.14636858882862308, + "grad_norm": 2.019585132598877, + "learning_rate": 4.7403400823476094e-05, + "loss": 4.2991, + "step": 24611 + }, + { + "epoch": 0.14637453611190407, + "grad_norm": 1.966424584388733, + "learning_rate": 4.740319353082866e-05, + "loss": 5.0383, + "step": 24612 + }, + { + "epoch": 0.14638048339518508, + "grad_norm": 2.048212766647339, + "learning_rate": 4.740298623036049e-05, + "loss": 5.0623, + "step": 24613 + }, + { + "epoch": 0.14638643067846607, + "grad_norm": 2.318051338195801, + "learning_rate": 4.740277892207168e-05, + "loss": 5.7096, + "step": 24614 + }, + { + "epoch": 0.14639237796174706, + "grad_norm": 1.6807061433792114, + "learning_rate": 4.740257160596229e-05, + "loss": 4.9725, + "step": 24615 + }, + { + "epoch": 0.14639832524502808, + "grad_norm": 1.968828558921814, + "learning_rate": 4.7402364282032386e-05, + "loss": 4.9904, + "step": 24616 + }, + { + "epoch": 0.14640427252830907, + "grad_norm": 1.8591229915618896, + "learning_rate": 4.740215695028205e-05, + "loss": 4.9013, + "step": 24617 + }, + { + "epoch": 0.14641021981159005, + "grad_norm": 1.8735779523849487, + "learning_rate": 4.740194961071136e-05, + "loss": 5.0174, + "step": 24618 + }, + { + "epoch": 0.14641616709487107, + "grad_norm": 1.9068244695663452, + "learning_rate": 4.740174226332037e-05, + "loss": 4.9578, + "step": 24619 + }, + { + "epoch": 0.14642211437815206, + "grad_norm": 2.136747360229492, + "learning_rate": 4.740153490810917e-05, + "loss": 4.953, + "step": 24620 + }, + { + "epoch": 0.14642806166143305, + "grad_norm": 2.1197381019592285, + "learning_rate": 4.740132754507782e-05, + "loss": 5.1238, + "step": 24621 + }, + { + "epoch": 0.14643400894471406, + "grad_norm": 1.8754642009735107, + "learning_rate": 4.740112017422641e-05, + "loss": 4.9628, + "step": 24622 + }, + { + "epoch": 0.14643995622799505, + "grad_norm": 1.8816076517105103, + "learning_rate": 4.740091279555499e-05, + "loss": 4.8295, + "step": 24623 + }, + { + "epoch": 0.14644590351127604, + "grad_norm": 1.7956056594848633, + "learning_rate": 4.740070540906365e-05, + "loss": 4.7985, + "step": 24624 + }, + { + "epoch": 0.14645185079455705, + "grad_norm": 2.021692991256714, + "learning_rate": 4.740049801475245e-05, + "loss": 4.9583, + "step": 24625 + }, + { + "epoch": 0.14645779807783804, + "grad_norm": 1.69369637966156, + "learning_rate": 4.7400290612621465e-05, + "loss": 4.9205, + "step": 24626 + }, + { + "epoch": 0.14646374536111903, + "grad_norm": 1.7640669345855713, + "learning_rate": 4.740008320267077e-05, + "loss": 5.0191, + "step": 24627 + }, + { + "epoch": 0.14646969264440005, + "grad_norm": 2.0161068439483643, + "learning_rate": 4.739987578490045e-05, + "loss": 5.1847, + "step": 24628 + }, + { + "epoch": 0.14647563992768103, + "grad_norm": 1.8745818138122559, + "learning_rate": 4.7399668359310555e-05, + "loss": 5.0221, + "step": 24629 + }, + { + "epoch": 0.14648158721096202, + "grad_norm": 1.8857629299163818, + "learning_rate": 4.7399460925901164e-05, + "loss": 5.0957, + "step": 24630 + }, + { + "epoch": 0.14648753449424304, + "grad_norm": 1.7315385341644287, + "learning_rate": 4.739925348467236e-05, + "loss": 5.1935, + "step": 24631 + }, + { + "epoch": 0.14649348177752403, + "grad_norm": 1.968795657157898, + "learning_rate": 4.7399046035624204e-05, + "loss": 5.2074, + "step": 24632 + }, + { + "epoch": 0.14649942906080501, + "grad_norm": 1.889760971069336, + "learning_rate": 4.739883857875677e-05, + "loss": 4.7733, + "step": 24633 + }, + { + "epoch": 0.14650537634408603, + "grad_norm": 1.9310023784637451, + "learning_rate": 4.739863111407013e-05, + "loss": 5.0259, + "step": 24634 + }, + { + "epoch": 0.14651132362736702, + "grad_norm": 1.807829737663269, + "learning_rate": 4.739842364156437e-05, + "loss": 4.8263, + "step": 24635 + }, + { + "epoch": 0.146517270910648, + "grad_norm": 1.8053529262542725, + "learning_rate": 4.739821616123955e-05, + "loss": 4.8213, + "step": 24636 + }, + { + "epoch": 0.14652321819392902, + "grad_norm": 1.9432908296585083, + "learning_rate": 4.739800867309574e-05, + "loss": 4.8625, + "step": 24637 + }, + { + "epoch": 0.14652916547721, + "grad_norm": 1.5960321426391602, + "learning_rate": 4.739780117713302e-05, + "loss": 4.6592, + "step": 24638 + }, + { + "epoch": 0.146535112760491, + "grad_norm": 1.9232900142669678, + "learning_rate": 4.739759367335145e-05, + "loss": 4.8859, + "step": 24639 + }, + { + "epoch": 0.14654106004377201, + "grad_norm": 1.8403369188308716, + "learning_rate": 4.739738616175112e-05, + "loss": 4.7934, + "step": 24640 + }, + { + "epoch": 0.146547007327053, + "grad_norm": 1.6142429113388062, + "learning_rate": 4.7397178642332095e-05, + "loss": 4.7553, + "step": 24641 + }, + { + "epoch": 0.146552954610334, + "grad_norm": 1.7207775115966797, + "learning_rate": 4.7396971115094445e-05, + "loss": 4.5229, + "step": 24642 + }, + { + "epoch": 0.146558901893615, + "grad_norm": 1.651342511177063, + "learning_rate": 4.739676358003824e-05, + "loss": 4.7882, + "step": 24643 + }, + { + "epoch": 0.146564849176896, + "grad_norm": 1.5380842685699463, + "learning_rate": 4.7396556037163556e-05, + "loss": 5.1114, + "step": 24644 + }, + { + "epoch": 0.14657079646017698, + "grad_norm": 1.7868518829345703, + "learning_rate": 4.739634848647047e-05, + "loss": 6.0014, + "step": 24645 + }, + { + "epoch": 0.146576743743458, + "grad_norm": 1.7771759033203125, + "learning_rate": 4.7396140927959045e-05, + "loss": 6.0391, + "step": 24646 + }, + { + "epoch": 0.146582691026739, + "grad_norm": 1.7818456888198853, + "learning_rate": 4.739593336162936e-05, + "loss": 5.431, + "step": 24647 + }, + { + "epoch": 0.14658863831001998, + "grad_norm": 1.6585869789123535, + "learning_rate": 4.7395725787481496e-05, + "loss": 5.4888, + "step": 24648 + }, + { + "epoch": 0.146594585593301, + "grad_norm": 1.448287010192871, + "learning_rate": 4.73955182055155e-05, + "loss": 5.5616, + "step": 24649 + }, + { + "epoch": 0.14660053287658198, + "grad_norm": 1.600519061088562, + "learning_rate": 4.739531061573147e-05, + "loss": 5.4446, + "step": 24650 + }, + { + "epoch": 0.14660648015986297, + "grad_norm": 1.5828067064285278, + "learning_rate": 4.7395103018129464e-05, + "loss": 5.7003, + "step": 24651 + }, + { + "epoch": 0.14661242744314398, + "grad_norm": 2.0968759059906006, + "learning_rate": 4.739489541270956e-05, + "loss": 5.4655, + "step": 24652 + }, + { + "epoch": 0.14661837472642497, + "grad_norm": 2.287879467010498, + "learning_rate": 4.739468779947183e-05, + "loss": 5.182, + "step": 24653 + }, + { + "epoch": 0.14662432200970596, + "grad_norm": 1.9258517026901245, + "learning_rate": 4.7394480178416344e-05, + "loss": 5.6223, + "step": 24654 + }, + { + "epoch": 0.14663026929298698, + "grad_norm": 1.9016472101211548, + "learning_rate": 4.7394272549543183e-05, + "loss": 5.304, + "step": 24655 + }, + { + "epoch": 0.14663621657626796, + "grad_norm": 1.4872523546218872, + "learning_rate": 4.739406491285241e-05, + "loss": 5.4679, + "step": 24656 + }, + { + "epoch": 0.14664216385954895, + "grad_norm": 1.6542940139770508, + "learning_rate": 4.73938572683441e-05, + "loss": 5.4644, + "step": 24657 + }, + { + "epoch": 0.14664811114282997, + "grad_norm": 2.210514545440674, + "learning_rate": 4.739364961601832e-05, + "loss": 4.6455, + "step": 24658 + }, + { + "epoch": 0.14665405842611096, + "grad_norm": 2.3305461406707764, + "learning_rate": 4.739344195587515e-05, + "loss": 4.571, + "step": 24659 + }, + { + "epoch": 0.14666000570939194, + "grad_norm": 2.243680238723755, + "learning_rate": 4.739323428791467e-05, + "loss": 4.5274, + "step": 24660 + }, + { + "epoch": 0.14666595299267296, + "grad_norm": 2.1816461086273193, + "learning_rate": 4.739302661213693e-05, + "loss": 4.4871, + "step": 24661 + }, + { + "epoch": 0.14667190027595395, + "grad_norm": 2.0428659915924072, + "learning_rate": 4.739281892854203e-05, + "loss": 4.3641, + "step": 24662 + }, + { + "epoch": 0.14667784755923494, + "grad_norm": 1.902016043663025, + "learning_rate": 4.739261123713001e-05, + "loss": 4.42, + "step": 24663 + }, + { + "epoch": 0.14668379484251595, + "grad_norm": 2.382110118865967, + "learning_rate": 4.7392403537900974e-05, + "loss": 4.3784, + "step": 24664 + }, + { + "epoch": 0.14668974212579694, + "grad_norm": 2.014251470565796, + "learning_rate": 4.739219583085498e-05, + "loss": 4.583, + "step": 24665 + }, + { + "epoch": 0.14669568940907793, + "grad_norm": 2.268214464187622, + "learning_rate": 4.7391988115992106e-05, + "loss": 4.4803, + "step": 24666 + }, + { + "epoch": 0.14670163669235892, + "grad_norm": 2.19326114654541, + "learning_rate": 4.7391780393312405e-05, + "loss": 4.5751, + "step": 24667 + }, + { + "epoch": 0.14670758397563993, + "grad_norm": 2.1453635692596436, + "learning_rate": 4.739157266281597e-05, + "loss": 4.8723, + "step": 24668 + }, + { + "epoch": 0.14671353125892092, + "grad_norm": 1.788976788520813, + "learning_rate": 4.739136492450288e-05, + "loss": 5.3339, + "step": 24669 + }, + { + "epoch": 0.1467194785422019, + "grad_norm": 2.523129940032959, + "learning_rate": 4.739115717837319e-05, + "loss": 4.314, + "step": 24670 + }, + { + "epoch": 0.14672542582548292, + "grad_norm": 2.2541866302490234, + "learning_rate": 4.739094942442698e-05, + "loss": 4.5228, + "step": 24671 + }, + { + "epoch": 0.1467313731087639, + "grad_norm": 2.5569868087768555, + "learning_rate": 4.739074166266431e-05, + "loss": 4.6268, + "step": 24672 + }, + { + "epoch": 0.1467373203920449, + "grad_norm": 1.9912770986557007, + "learning_rate": 4.739053389308528e-05, + "loss": 4.642, + "step": 24673 + }, + { + "epoch": 0.14674326767532592, + "grad_norm": 1.8588427305221558, + "learning_rate": 4.739032611568993e-05, + "loss": 5.2527, + "step": 24674 + }, + { + "epoch": 0.1467492149586069, + "grad_norm": 1.9020613431930542, + "learning_rate": 4.7390118330478356e-05, + "loss": 5.4926, + "step": 24675 + }, + { + "epoch": 0.1467551622418879, + "grad_norm": 2.319058895111084, + "learning_rate": 4.7389910537450624e-05, + "loss": 5.1275, + "step": 24676 + }, + { + "epoch": 0.1467611095251689, + "grad_norm": 1.7051849365234375, + "learning_rate": 4.7389702736606804e-05, + "loss": 5.599, + "step": 24677 + }, + { + "epoch": 0.1467670568084499, + "grad_norm": 1.7340635061264038, + "learning_rate": 4.738949492794696e-05, + "loss": 5.3359, + "step": 24678 + }, + { + "epoch": 0.14677300409173089, + "grad_norm": 1.5634024143218994, + "learning_rate": 4.738928711147119e-05, + "loss": 5.2585, + "step": 24679 + }, + { + "epoch": 0.1467789513750119, + "grad_norm": 1.559401035308838, + "learning_rate": 4.738907928717955e-05, + "loss": 5.297, + "step": 24680 + }, + { + "epoch": 0.1467848986582929, + "grad_norm": 1.5967936515808105, + "learning_rate": 4.738887145507211e-05, + "loss": 5.2068, + "step": 24681 + }, + { + "epoch": 0.14679084594157388, + "grad_norm": 1.6294320821762085, + "learning_rate": 4.7388663615148945e-05, + "loss": 5.1878, + "step": 24682 + }, + { + "epoch": 0.1467967932248549, + "grad_norm": 1.4520001411437988, + "learning_rate": 4.7388455767410135e-05, + "loss": 5.0777, + "step": 24683 + }, + { + "epoch": 0.14680274050813588, + "grad_norm": 1.3392236232757568, + "learning_rate": 4.738824791185573e-05, + "loss": 5.2396, + "step": 24684 + }, + { + "epoch": 0.14680868779141687, + "grad_norm": 1.467822551727295, + "learning_rate": 4.738804004848584e-05, + "loss": 5.253, + "step": 24685 + }, + { + "epoch": 0.14681463507469789, + "grad_norm": 1.5025224685668945, + "learning_rate": 4.7387832177300504e-05, + "loss": 5.386, + "step": 24686 + }, + { + "epoch": 0.14682058235797887, + "grad_norm": 1.6178737878799438, + "learning_rate": 4.73876242982998e-05, + "loss": 5.2601, + "step": 24687 + }, + { + "epoch": 0.14682652964125986, + "grad_norm": 1.4832427501678467, + "learning_rate": 4.7387416411483825e-05, + "loss": 5.0987, + "step": 24688 + }, + { + "epoch": 0.14683247692454088, + "grad_norm": 1.4726454019546509, + "learning_rate": 4.738720851685263e-05, + "loss": 5.3468, + "step": 24689 + }, + { + "epoch": 0.14683842420782187, + "grad_norm": 1.5659757852554321, + "learning_rate": 4.7387000614406284e-05, + "loss": 5.1591, + "step": 24690 + }, + { + "epoch": 0.14684437149110285, + "grad_norm": 1.7832130193710327, + "learning_rate": 4.7386792704144875e-05, + "loss": 5.126, + "step": 24691 + }, + { + "epoch": 0.14685031877438387, + "grad_norm": 1.6943825483322144, + "learning_rate": 4.738658478606846e-05, + "loss": 5.4705, + "step": 24692 + }, + { + "epoch": 0.14685626605766486, + "grad_norm": 1.4877350330352783, + "learning_rate": 4.738637686017713e-05, + "loss": 5.3479, + "step": 24693 + }, + { + "epoch": 0.14686221334094585, + "grad_norm": 2.306101083755493, + "learning_rate": 4.738616892647094e-05, + "loss": 4.4746, + "step": 24694 + }, + { + "epoch": 0.14686816062422686, + "grad_norm": 2.2277164459228516, + "learning_rate": 4.7385960984949976e-05, + "loss": 4.4995, + "step": 24695 + }, + { + "epoch": 0.14687410790750785, + "grad_norm": 1.535406231880188, + "learning_rate": 4.738575303561429e-05, + "loss": 5.3042, + "step": 24696 + }, + { + "epoch": 0.14688005519078884, + "grad_norm": 1.7974361181259155, + "learning_rate": 4.738554507846398e-05, + "loss": 5.3804, + "step": 24697 + }, + { + "epoch": 0.14688600247406985, + "grad_norm": 1.9455167055130005, + "learning_rate": 4.7385337113499104e-05, + "loss": 4.9782, + "step": 24698 + }, + { + "epoch": 0.14689194975735084, + "grad_norm": 2.486859083175659, + "learning_rate": 4.738512914071974e-05, + "loss": 4.5543, + "step": 24699 + }, + { + "epoch": 0.14689789704063183, + "grad_norm": 2.1134984493255615, + "learning_rate": 4.738492116012596e-05, + "loss": 4.3281, + "step": 24700 + }, + { + "epoch": 0.14690384432391285, + "grad_norm": 2.081852674484253, + "learning_rate": 4.7384713171717833e-05, + "loss": 4.3307, + "step": 24701 + }, + { + "epoch": 0.14690979160719383, + "grad_norm": 2.3121731281280518, + "learning_rate": 4.7384505175495435e-05, + "loss": 4.4791, + "step": 24702 + }, + { + "epoch": 0.14691573889047482, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.738429717145883e-05, + "loss": 4.5165, + "step": 24703 + }, + { + "epoch": 0.14692168617375584, + "grad_norm": 1.7863034009933472, + "learning_rate": 4.7384089159608115e-05, + "loss": 4.8086, + "step": 24704 + }, + { + "epoch": 0.14692763345703683, + "grad_norm": 2.0969200134277344, + "learning_rate": 4.7383881139943335e-05, + "loss": 4.7512, + "step": 24705 + }, + { + "epoch": 0.14693358074031782, + "grad_norm": 1.9164679050445557, + "learning_rate": 4.738367311246458e-05, + "loss": 4.5249, + "step": 24706 + }, + { + "epoch": 0.14693952802359883, + "grad_norm": 1.8215450048446655, + "learning_rate": 4.738346507717191e-05, + "loss": 4.7016, + "step": 24707 + }, + { + "epoch": 0.14694547530687982, + "grad_norm": 1.7830946445465088, + "learning_rate": 4.7383257034065395e-05, + "loss": 4.6173, + "step": 24708 + }, + { + "epoch": 0.1469514225901608, + "grad_norm": 1.7251957654953003, + "learning_rate": 4.7383048983145126e-05, + "loss": 4.9539, + "step": 24709 + }, + { + "epoch": 0.14695736987344182, + "grad_norm": 1.6763554811477661, + "learning_rate": 4.738284092441117e-05, + "loss": 4.8123, + "step": 24710 + }, + { + "epoch": 0.1469633171567228, + "grad_norm": 1.5693418979644775, + "learning_rate": 4.738263285786358e-05, + "loss": 4.6586, + "step": 24711 + }, + { + "epoch": 0.1469692644400038, + "grad_norm": 2.5585360527038574, + "learning_rate": 4.738242478350247e-05, + "loss": 4.2875, + "step": 24712 + }, + { + "epoch": 0.14697521172328482, + "grad_norm": 2.41618275642395, + "learning_rate": 4.738221670132786e-05, + "loss": 4.3448, + "step": 24713 + }, + { + "epoch": 0.1469811590065658, + "grad_norm": 2.233074903488159, + "learning_rate": 4.7382008611339867e-05, + "loss": 5.2453, + "step": 24714 + }, + { + "epoch": 0.1469871062898468, + "grad_norm": 1.7833389043807983, + "learning_rate": 4.738180051353854e-05, + "loss": 4.9964, + "step": 24715 + }, + { + "epoch": 0.1469930535731278, + "grad_norm": 1.7970653772354126, + "learning_rate": 4.738159240792396e-05, + "loss": 4.5124, + "step": 24716 + }, + { + "epoch": 0.1469990008564088, + "grad_norm": 2.1043243408203125, + "learning_rate": 4.738138429449619e-05, + "loss": 4.3681, + "step": 24717 + }, + { + "epoch": 0.14700494813968978, + "grad_norm": 1.5849015712738037, + "learning_rate": 4.738117617325532e-05, + "loss": 4.7756, + "step": 24718 + }, + { + "epoch": 0.1470108954229708, + "grad_norm": 1.5067150592803955, + "learning_rate": 4.73809680442014e-05, + "loss": 4.6255, + "step": 24719 + }, + { + "epoch": 0.1470168427062518, + "grad_norm": 1.5583860874176025, + "learning_rate": 4.7380759907334524e-05, + "loss": 4.7671, + "step": 24720 + }, + { + "epoch": 0.14702278998953278, + "grad_norm": 1.9732975959777832, + "learning_rate": 4.7380551762654755e-05, + "loss": 4.514, + "step": 24721 + }, + { + "epoch": 0.1470287372728138, + "grad_norm": 2.2196953296661377, + "learning_rate": 4.738034361016217e-05, + "loss": 4.2897, + "step": 24722 + }, + { + "epoch": 0.14703468455609478, + "grad_norm": 2.3124115467071533, + "learning_rate": 4.738013544985683e-05, + "loss": 4.2081, + "step": 24723 + }, + { + "epoch": 0.14704063183937577, + "grad_norm": 2.4807839393615723, + "learning_rate": 4.737992728173882e-05, + "loss": 4.5975, + "step": 24724 + }, + { + "epoch": 0.14704657912265678, + "grad_norm": 1.6757773160934448, + "learning_rate": 4.737971910580821e-05, + "loss": 5.6665, + "step": 24725 + }, + { + "epoch": 0.14705252640593777, + "grad_norm": 1.9433516263961792, + "learning_rate": 4.7379510922065074e-05, + "loss": 5.0243, + "step": 24726 + }, + { + "epoch": 0.14705847368921876, + "grad_norm": 2.392778158187866, + "learning_rate": 4.737930273050948e-05, + "loss": 4.7769, + "step": 24727 + }, + { + "epoch": 0.14706442097249975, + "grad_norm": 2.730144739151001, + "learning_rate": 4.73790945311415e-05, + "loss": 4.8214, + "step": 24728 + }, + { + "epoch": 0.14707036825578076, + "grad_norm": 1.9504640102386475, + "learning_rate": 4.7378886323961205e-05, + "loss": 4.8057, + "step": 24729 + }, + { + "epoch": 0.14707631553906175, + "grad_norm": 1.7174079418182373, + "learning_rate": 4.7378678108968675e-05, + "loss": 5.1865, + "step": 24730 + }, + { + "epoch": 0.14708226282234274, + "grad_norm": 2.109645128250122, + "learning_rate": 4.737846988616399e-05, + "loss": 5.1682, + "step": 24731 + }, + { + "epoch": 0.14708821010562376, + "grad_norm": 1.9357048273086548, + "learning_rate": 4.7378261655547204e-05, + "loss": 5.0972, + "step": 24732 + }, + { + "epoch": 0.14709415738890474, + "grad_norm": 1.4660345315933228, + "learning_rate": 4.73780534171184e-05, + "loss": 5.7247, + "step": 24733 + }, + { + "epoch": 0.14710010467218573, + "grad_norm": 1.8927645683288574, + "learning_rate": 4.7377845170877644e-05, + "loss": 5.241, + "step": 24734 + }, + { + "epoch": 0.14710605195546675, + "grad_norm": 1.1164909601211548, + "learning_rate": 4.737763691682502e-05, + "loss": 5.4844, + "step": 24735 + }, + { + "epoch": 0.14711199923874774, + "grad_norm": 1.5676599740982056, + "learning_rate": 4.7377428654960584e-05, + "loss": 5.0659, + "step": 24736 + }, + { + "epoch": 0.14711794652202873, + "grad_norm": 2.404731273651123, + "learning_rate": 4.737722038528443e-05, + "loss": 4.5183, + "step": 24737 + }, + { + "epoch": 0.14712389380530974, + "grad_norm": 1.9689422845840454, + "learning_rate": 4.7377012107796615e-05, + "loss": 4.9564, + "step": 24738 + }, + { + "epoch": 0.14712984108859073, + "grad_norm": 2.320307970046997, + "learning_rate": 4.737680382249721e-05, + "loss": 4.4609, + "step": 24739 + }, + { + "epoch": 0.14713578837187172, + "grad_norm": 1.8649024963378906, + "learning_rate": 4.7376595529386305e-05, + "loss": 4.7436, + "step": 24740 + }, + { + "epoch": 0.14714173565515273, + "grad_norm": 2.112926721572876, + "learning_rate": 4.7376387228463956e-05, + "loss": 4.6949, + "step": 24741 + }, + { + "epoch": 0.14714768293843372, + "grad_norm": 2.237760543823242, + "learning_rate": 4.737617891973024e-05, + "loss": 4.5927, + "step": 24742 + }, + { + "epoch": 0.1471536302217147, + "grad_norm": 2.115577220916748, + "learning_rate": 4.737597060318524e-05, + "loss": 4.4007, + "step": 24743 + }, + { + "epoch": 0.14715957750499573, + "grad_norm": 2.0081801414489746, + "learning_rate": 4.737576227882901e-05, + "loss": 4.3844, + "step": 24744 + }, + { + "epoch": 0.1471655247882767, + "grad_norm": 2.1995346546173096, + "learning_rate": 4.737555394666163e-05, + "loss": 4.4581, + "step": 24745 + }, + { + "epoch": 0.1471714720715577, + "grad_norm": 2.2637784481048584, + "learning_rate": 4.7375345606683184e-05, + "loss": 4.4969, + "step": 24746 + }, + { + "epoch": 0.14717741935483872, + "grad_norm": 2.4739608764648438, + "learning_rate": 4.737513725889373e-05, + "loss": 4.521, + "step": 24747 + }, + { + "epoch": 0.1471833666381197, + "grad_norm": 1.6418421268463135, + "learning_rate": 4.737492890329335e-05, + "loss": 5.1064, + "step": 24748 + }, + { + "epoch": 0.1471893139214007, + "grad_norm": 1.9451549053192139, + "learning_rate": 4.737472053988212e-05, + "loss": 4.6824, + "step": 24749 + }, + { + "epoch": 0.1471952612046817, + "grad_norm": 1.9891009330749512, + "learning_rate": 4.7374512168660094e-05, + "loss": 5.2228, + "step": 24750 + }, + { + "epoch": 0.1472012084879627, + "grad_norm": 2.1582279205322266, + "learning_rate": 4.737430378962736e-05, + "loss": 5.7231, + "step": 24751 + }, + { + "epoch": 0.1472071557712437, + "grad_norm": 1.8569883108139038, + "learning_rate": 4.737409540278399e-05, + "loss": 5.3307, + "step": 24752 + }, + { + "epoch": 0.1472131030545247, + "grad_norm": 1.4937759637832642, + "learning_rate": 4.737388700813006e-05, + "loss": 5.3213, + "step": 24753 + }, + { + "epoch": 0.1472190503378057, + "grad_norm": 1.6692577600479126, + "learning_rate": 4.737367860566563e-05, + "loss": 5.2426, + "step": 24754 + }, + { + "epoch": 0.14722499762108668, + "grad_norm": 2.3550398349761963, + "learning_rate": 4.737347019539078e-05, + "loss": 4.7053, + "step": 24755 + }, + { + "epoch": 0.1472309449043677, + "grad_norm": 2.122601270675659, + "learning_rate": 4.737326177730559e-05, + "loss": 4.9372, + "step": 24756 + }, + { + "epoch": 0.14723689218764868, + "grad_norm": 1.429738163948059, + "learning_rate": 4.737305335141012e-05, + "loss": 4.7637, + "step": 24757 + }, + { + "epoch": 0.14724283947092967, + "grad_norm": 1.6185976266860962, + "learning_rate": 4.7372844917704445e-05, + "loss": 4.6184, + "step": 24758 + }, + { + "epoch": 0.1472487867542107, + "grad_norm": 1.495154619216919, + "learning_rate": 4.737263647618865e-05, + "loss": 4.4256, + "step": 24759 + }, + { + "epoch": 0.14725473403749167, + "grad_norm": 1.366437554359436, + "learning_rate": 4.737242802686279e-05, + "loss": 4.5822, + "step": 24760 + }, + { + "epoch": 0.14726068132077266, + "grad_norm": 2.3462178707122803, + "learning_rate": 4.737221956972695e-05, + "loss": 4.9419, + "step": 24761 + }, + { + "epoch": 0.14726662860405368, + "grad_norm": 2.846083402633667, + "learning_rate": 4.73720111047812e-05, + "loss": 4.6403, + "step": 24762 + }, + { + "epoch": 0.14727257588733467, + "grad_norm": 2.388052463531494, + "learning_rate": 4.7371802632025605e-05, + "loss": 4.5375, + "step": 24763 + }, + { + "epoch": 0.14727852317061566, + "grad_norm": 1.4230948686599731, + "learning_rate": 4.7371594151460254e-05, + "loss": 4.6451, + "step": 24764 + }, + { + "epoch": 0.14728447045389667, + "grad_norm": 1.2602354288101196, + "learning_rate": 4.737138566308521e-05, + "loss": 4.4927, + "step": 24765 + }, + { + "epoch": 0.14729041773717766, + "grad_norm": 1.9645811319351196, + "learning_rate": 4.737117716690054e-05, + "loss": 4.875, + "step": 24766 + }, + { + "epoch": 0.14729636502045865, + "grad_norm": 2.729315757751465, + "learning_rate": 4.7370968662906325e-05, + "loss": 4.0048, + "step": 24767 + }, + { + "epoch": 0.14730231230373966, + "grad_norm": 2.797999382019043, + "learning_rate": 4.7370760151102635e-05, + "loss": 4.3436, + "step": 24768 + }, + { + "epoch": 0.14730825958702065, + "grad_norm": 2.058621406555176, + "learning_rate": 4.737055163148955e-05, + "loss": 4.4137, + "step": 24769 + }, + { + "epoch": 0.14731420687030164, + "grad_norm": 1.9290826320648193, + "learning_rate": 4.737034310406713e-05, + "loss": 4.4751, + "step": 24770 + }, + { + "epoch": 0.14732015415358266, + "grad_norm": 2.316140651702881, + "learning_rate": 4.737013456883546e-05, + "loss": 4.4009, + "step": 24771 + }, + { + "epoch": 0.14732610143686364, + "grad_norm": 2.326529026031494, + "learning_rate": 4.7369926025794606e-05, + "loss": 4.4272, + "step": 24772 + }, + { + "epoch": 0.14733204872014463, + "grad_norm": 2.089818239212036, + "learning_rate": 4.736971747494464e-05, + "loss": 4.4192, + "step": 24773 + }, + { + "epoch": 0.14733799600342565, + "grad_norm": 1.714152455329895, + "learning_rate": 4.736950891628564e-05, + "loss": 5.1404, + "step": 24774 + }, + { + "epoch": 0.14734394328670664, + "grad_norm": 2.01911997795105, + "learning_rate": 4.736930034981767e-05, + "loss": 4.7116, + "step": 24775 + }, + { + "epoch": 0.14734989056998762, + "grad_norm": 2.0275747776031494, + "learning_rate": 4.736909177554081e-05, + "loss": 4.4249, + "step": 24776 + }, + { + "epoch": 0.14735583785326864, + "grad_norm": 1.9515576362609863, + "learning_rate": 4.7368883193455135e-05, + "loss": 4.3968, + "step": 24777 + }, + { + "epoch": 0.14736178513654963, + "grad_norm": 1.6079367399215698, + "learning_rate": 4.736867460356071e-05, + "loss": 4.3927, + "step": 24778 + }, + { + "epoch": 0.14736773241983062, + "grad_norm": 1.856449842453003, + "learning_rate": 4.736846600585761e-05, + "loss": 4.4231, + "step": 24779 + }, + { + "epoch": 0.14737367970311163, + "grad_norm": 1.7405143976211548, + "learning_rate": 4.7368257400345915e-05, + "loss": 5.4894, + "step": 24780 + }, + { + "epoch": 0.14737962698639262, + "grad_norm": 1.6344300508499146, + "learning_rate": 4.736804878702569e-05, + "loss": 5.5489, + "step": 24781 + }, + { + "epoch": 0.1473855742696736, + "grad_norm": 1.693015694618225, + "learning_rate": 4.7367840165897014e-05, + "loss": 5.6432, + "step": 24782 + }, + { + "epoch": 0.14739152155295462, + "grad_norm": 1.5487139225006104, + "learning_rate": 4.736763153695995e-05, + "loss": 4.6316, + "step": 24783 + }, + { + "epoch": 0.1473974688362356, + "grad_norm": 1.5867420434951782, + "learning_rate": 4.736742290021458e-05, + "loss": 4.3782, + "step": 24784 + }, + { + "epoch": 0.1474034161195166, + "grad_norm": 1.7892907857894897, + "learning_rate": 4.736721425566097e-05, + "loss": 4.413, + "step": 24785 + }, + { + "epoch": 0.1474093634027976, + "grad_norm": 1.7791600227355957, + "learning_rate": 4.7367005603299206e-05, + "loss": 4.9471, + "step": 24786 + }, + { + "epoch": 0.1474153106860786, + "grad_norm": 1.5871254205703735, + "learning_rate": 4.736679694312934e-05, + "loss": 5.6475, + "step": 24787 + }, + { + "epoch": 0.1474212579693596, + "grad_norm": 1.5154014825820923, + "learning_rate": 4.7366588275151465e-05, + "loss": 5.6038, + "step": 24788 + }, + { + "epoch": 0.14742720525264058, + "grad_norm": 1.4058479070663452, + "learning_rate": 4.736637959936564e-05, + "loss": 5.4371, + "step": 24789 + }, + { + "epoch": 0.1474331525359216, + "grad_norm": 1.5023268461227417, + "learning_rate": 4.7366170915771946e-05, + "loss": 5.6043, + "step": 24790 + }, + { + "epoch": 0.14743909981920258, + "grad_norm": 1.573081135749817, + "learning_rate": 4.7365962224370445e-05, + "loss": 4.6014, + "step": 24791 + }, + { + "epoch": 0.14744504710248357, + "grad_norm": 1.413909673690796, + "learning_rate": 4.7365753525161225e-05, + "loss": 5.1478, + "step": 24792 + }, + { + "epoch": 0.1474509943857646, + "grad_norm": 1.6636765003204346, + "learning_rate": 4.736554481814435e-05, + "loss": 5.3099, + "step": 24793 + }, + { + "epoch": 0.14745694166904558, + "grad_norm": 1.4575749635696411, + "learning_rate": 4.7365336103319904e-05, + "loss": 4.7067, + "step": 24794 + }, + { + "epoch": 0.14746288895232657, + "grad_norm": 1.4840314388275146, + "learning_rate": 4.736512738068793e-05, + "loss": 5.3591, + "step": 24795 + }, + { + "epoch": 0.14746883623560758, + "grad_norm": 1.8716658353805542, + "learning_rate": 4.736491865024853e-05, + "loss": 4.9905, + "step": 24796 + }, + { + "epoch": 0.14747478351888857, + "grad_norm": 1.5661007165908813, + "learning_rate": 4.736470991200178e-05, + "loss": 5.5725, + "step": 24797 + }, + { + "epoch": 0.14748073080216956, + "grad_norm": 1.7020787000656128, + "learning_rate": 4.736450116594773e-05, + "loss": 4.97, + "step": 24798 + }, + { + "epoch": 0.14748667808545057, + "grad_norm": 1.7010732889175415, + "learning_rate": 4.736429241208646e-05, + "loss": 5.0832, + "step": 24799 + }, + { + "epoch": 0.14749262536873156, + "grad_norm": 2.984389305114746, + "learning_rate": 4.7364083650418057e-05, + "loss": 4.5466, + "step": 24800 + }, + { + "epoch": 0.14749857265201255, + "grad_norm": 1.8300197124481201, + "learning_rate": 4.7363874880942574e-05, + "loss": 4.9772, + "step": 24801 + }, + { + "epoch": 0.14750451993529357, + "grad_norm": 1.685394048690796, + "learning_rate": 4.73636661036601e-05, + "loss": 5.0689, + "step": 24802 + }, + { + "epoch": 0.14751046721857455, + "grad_norm": 1.559996485710144, + "learning_rate": 4.7363457318570695e-05, + "loss": 5.1496, + "step": 24803 + }, + { + "epoch": 0.14751641450185554, + "grad_norm": 1.5654375553131104, + "learning_rate": 4.736324852567444e-05, + "loss": 5.1427, + "step": 24804 + }, + { + "epoch": 0.14752236178513656, + "grad_norm": 2.0388715267181396, + "learning_rate": 4.736303972497141e-05, + "loss": 4.6176, + "step": 24805 + }, + { + "epoch": 0.14752830906841755, + "grad_norm": 2.139695882797241, + "learning_rate": 4.736283091646167e-05, + "loss": 4.7746, + "step": 24806 + }, + { + "epoch": 0.14753425635169853, + "grad_norm": 1.6551018953323364, + "learning_rate": 4.73626221001453e-05, + "loss": 5.3522, + "step": 24807 + }, + { + "epoch": 0.14754020363497955, + "grad_norm": 1.6643954515457153, + "learning_rate": 4.7362413276022364e-05, + "loss": 5.5479, + "step": 24808 + }, + { + "epoch": 0.14754615091826054, + "grad_norm": 1.6942282915115356, + "learning_rate": 4.7362204444092947e-05, + "loss": 5.2971, + "step": 24809 + }, + { + "epoch": 0.14755209820154153, + "grad_norm": 2.1273419857025146, + "learning_rate": 4.736199560435711e-05, + "loss": 5.1465, + "step": 24810 + }, + { + "epoch": 0.14755804548482254, + "grad_norm": 2.1430892944335938, + "learning_rate": 4.736178675681493e-05, + "loss": 4.9944, + "step": 24811 + }, + { + "epoch": 0.14756399276810353, + "grad_norm": 2.1971189975738525, + "learning_rate": 4.736157790146649e-05, + "loss": 5.2348, + "step": 24812 + }, + { + "epoch": 0.14756994005138452, + "grad_norm": 1.7993513345718384, + "learning_rate": 4.7361369038311855e-05, + "loss": 5.0186, + "step": 24813 + }, + { + "epoch": 0.14757588733466553, + "grad_norm": 1.8296352624893188, + "learning_rate": 4.7361160167351085e-05, + "loss": 4.9939, + "step": 24814 + }, + { + "epoch": 0.14758183461794652, + "grad_norm": 1.6994922161102295, + "learning_rate": 4.7360951288584276e-05, + "loss": 5.0838, + "step": 24815 + }, + { + "epoch": 0.1475877819012275, + "grad_norm": 1.8526664972305298, + "learning_rate": 4.736074240201148e-05, + "loss": 4.9977, + "step": 24816 + }, + { + "epoch": 0.14759372918450853, + "grad_norm": 1.6255830526351929, + "learning_rate": 4.736053350763279e-05, + "loss": 5.111, + "step": 24817 + }, + { + "epoch": 0.14759967646778951, + "grad_norm": 1.6871737241744995, + "learning_rate": 4.736032460544826e-05, + "loss": 4.8522, + "step": 24818 + }, + { + "epoch": 0.1476056237510705, + "grad_norm": 1.8430577516555786, + "learning_rate": 4.7360115695457975e-05, + "loss": 4.9312, + "step": 24819 + }, + { + "epoch": 0.14761157103435152, + "grad_norm": 1.6737143993377686, + "learning_rate": 4.735990677766201e-05, + "loss": 4.7894, + "step": 24820 + }, + { + "epoch": 0.1476175183176325, + "grad_norm": 1.648138403892517, + "learning_rate": 4.7359697852060425e-05, + "loss": 4.8173, + "step": 24821 + }, + { + "epoch": 0.1476234656009135, + "grad_norm": 1.8230416774749756, + "learning_rate": 4.73594889186533e-05, + "loss": 5.0618, + "step": 24822 + }, + { + "epoch": 0.1476294128841945, + "grad_norm": 1.928932547569275, + "learning_rate": 4.735927997744072e-05, + "loss": 4.8846, + "step": 24823 + }, + { + "epoch": 0.1476353601674755, + "grad_norm": 1.8593389987945557, + "learning_rate": 4.735907102842273e-05, + "loss": 5.0283, + "step": 24824 + }, + { + "epoch": 0.1476413074507565, + "grad_norm": 1.988168478012085, + "learning_rate": 4.735886207159943e-05, + "loss": 5.0253, + "step": 24825 + }, + { + "epoch": 0.1476472547340375, + "grad_norm": 1.6367772817611694, + "learning_rate": 4.7358653106970885e-05, + "loss": 4.9296, + "step": 24826 + }, + { + "epoch": 0.1476532020173185, + "grad_norm": 1.7799687385559082, + "learning_rate": 4.7358444134537154e-05, + "loss": 4.5257, + "step": 24827 + }, + { + "epoch": 0.14765914930059948, + "grad_norm": 1.8706213235855103, + "learning_rate": 4.735823515429833e-05, + "loss": 4.9739, + "step": 24828 + }, + { + "epoch": 0.1476650965838805, + "grad_norm": 1.7662311792373657, + "learning_rate": 4.7358026166254476e-05, + "loss": 4.9545, + "step": 24829 + }, + { + "epoch": 0.14767104386716148, + "grad_norm": 1.6466079950332642, + "learning_rate": 4.7357817170405664e-05, + "loss": 4.8203, + "step": 24830 + }, + { + "epoch": 0.14767699115044247, + "grad_norm": 1.7296116352081299, + "learning_rate": 4.7357608166751965e-05, + "loss": 4.7575, + "step": 24831 + }, + { + "epoch": 0.1476829384337235, + "grad_norm": 1.6118981838226318, + "learning_rate": 4.735739915529346e-05, + "loss": 4.6546, + "step": 24832 + }, + { + "epoch": 0.14768888571700448, + "grad_norm": 1.7108652591705322, + "learning_rate": 4.735719013603022e-05, + "loss": 5.5278, + "step": 24833 + }, + { + "epoch": 0.14769483300028546, + "grad_norm": 1.583243727684021, + "learning_rate": 4.735698110896232e-05, + "loss": 5.5526, + "step": 24834 + }, + { + "epoch": 0.14770078028356648, + "grad_norm": 1.9354965686798096, + "learning_rate": 4.735677207408982e-05, + "loss": 4.9137, + "step": 24835 + }, + { + "epoch": 0.14770672756684747, + "grad_norm": 2.2551913261413574, + "learning_rate": 4.7356563031412805e-05, + "loss": 5.105, + "step": 24836 + }, + { + "epoch": 0.14771267485012846, + "grad_norm": 1.8324413299560547, + "learning_rate": 4.7356353980931344e-05, + "loss": 5.1002, + "step": 24837 + }, + { + "epoch": 0.14771862213340947, + "grad_norm": 1.7993746995925903, + "learning_rate": 4.7356144922645504e-05, + "loss": 5.0061, + "step": 24838 + }, + { + "epoch": 0.14772456941669046, + "grad_norm": 1.6633015871047974, + "learning_rate": 4.735593585655538e-05, + "loss": 5.6399, + "step": 24839 + }, + { + "epoch": 0.14773051669997145, + "grad_norm": 1.6153156757354736, + "learning_rate": 4.735572678266102e-05, + "loss": 5.845, + "step": 24840 + }, + { + "epoch": 0.14773646398325246, + "grad_norm": 1.5680739879608154, + "learning_rate": 4.7355517700962506e-05, + "loss": 4.9451, + "step": 24841 + }, + { + "epoch": 0.14774241126653345, + "grad_norm": 1.7775828838348389, + "learning_rate": 4.735530861145992e-05, + "loss": 5.3363, + "step": 24842 + }, + { + "epoch": 0.14774835854981444, + "grad_norm": 1.5199836492538452, + "learning_rate": 4.7355099514153316e-05, + "loss": 5.2147, + "step": 24843 + }, + { + "epoch": 0.14775430583309543, + "grad_norm": 1.5332800149917603, + "learning_rate": 4.7354890409042783e-05, + "loss": 5.2439, + "step": 24844 + }, + { + "epoch": 0.14776025311637644, + "grad_norm": 2.0724799633026123, + "learning_rate": 4.735468129612839e-05, + "loss": 5.0292, + "step": 24845 + }, + { + "epoch": 0.14776620039965743, + "grad_norm": 2.5946760177612305, + "learning_rate": 4.73544721754102e-05, + "loss": 4.973, + "step": 24846 + }, + { + "epoch": 0.14777214768293842, + "grad_norm": 1.9194954633712769, + "learning_rate": 4.735426304688831e-05, + "loss": 4.7452, + "step": 24847 + }, + { + "epoch": 0.14777809496621944, + "grad_norm": 1.38433039188385, + "learning_rate": 4.735405391056277e-05, + "loss": 5.5551, + "step": 24848 + }, + { + "epoch": 0.14778404224950042, + "grad_norm": 1.8728227615356445, + "learning_rate": 4.735384476643366e-05, + "loss": 5.3088, + "step": 24849 + }, + { + "epoch": 0.1477899895327814, + "grad_norm": 1.6192907094955444, + "learning_rate": 4.7353635614501054e-05, + "loss": 5.3365, + "step": 24850 + }, + { + "epoch": 0.14779593681606243, + "grad_norm": 1.4671828746795654, + "learning_rate": 4.735342645476503e-05, + "loss": 5.5339, + "step": 24851 + }, + { + "epoch": 0.14780188409934342, + "grad_norm": 1.924024224281311, + "learning_rate": 4.7353217287225646e-05, + "loss": 5.2287, + "step": 24852 + }, + { + "epoch": 0.1478078313826244, + "grad_norm": 1.6585190296173096, + "learning_rate": 4.735300811188299e-05, + "loss": 5.124, + "step": 24853 + }, + { + "epoch": 0.14781377866590542, + "grad_norm": 1.6820423603057861, + "learning_rate": 4.735279892873713e-05, + "loss": 5.4088, + "step": 24854 + }, + { + "epoch": 0.1478197259491864, + "grad_norm": 1.5978790521621704, + "learning_rate": 4.7352589737788134e-05, + "loss": 5.8087, + "step": 24855 + }, + { + "epoch": 0.1478256732324674, + "grad_norm": 1.6521705389022827, + "learning_rate": 4.735238053903609e-05, + "loss": 5.2014, + "step": 24856 + }, + { + "epoch": 0.1478316205157484, + "grad_norm": 1.6667120456695557, + "learning_rate": 4.7352171332481056e-05, + "loss": 5.1015, + "step": 24857 + }, + { + "epoch": 0.1478375677990294, + "grad_norm": 1.7318087816238403, + "learning_rate": 4.735196211812311e-05, + "loss": 5.4063, + "step": 24858 + }, + { + "epoch": 0.1478435150823104, + "grad_norm": 1.7706724405288696, + "learning_rate": 4.735175289596232e-05, + "loss": 5.0941, + "step": 24859 + }, + { + "epoch": 0.1478494623655914, + "grad_norm": 1.5582432746887207, + "learning_rate": 4.7351543665998764e-05, + "loss": 5.2643, + "step": 24860 + }, + { + "epoch": 0.1478554096488724, + "grad_norm": 1.5588469505310059, + "learning_rate": 4.735133442823252e-05, + "loss": 5.5234, + "step": 24861 + }, + { + "epoch": 0.14786135693215338, + "grad_norm": 2.5532615184783936, + "learning_rate": 4.735112518266366e-05, + "loss": 4.5405, + "step": 24862 + }, + { + "epoch": 0.1478673042154344, + "grad_norm": 1.5495831966400146, + "learning_rate": 4.735091592929224e-05, + "loss": 5.5153, + "step": 24863 + }, + { + "epoch": 0.14787325149871539, + "grad_norm": 1.4878839254379272, + "learning_rate": 4.7350706668118356e-05, + "loss": 5.2186, + "step": 24864 + }, + { + "epoch": 0.14787919878199637, + "grad_norm": 1.4914618730545044, + "learning_rate": 4.735049739914207e-05, + "loss": 5.3108, + "step": 24865 + }, + { + "epoch": 0.1478851460652774, + "grad_norm": 1.6413542032241821, + "learning_rate": 4.735028812236345e-05, + "loss": 5.2726, + "step": 24866 + }, + { + "epoch": 0.14789109334855838, + "grad_norm": 1.6650172472000122, + "learning_rate": 4.735007883778259e-05, + "loss": 5.3186, + "step": 24867 + }, + { + "epoch": 0.14789704063183937, + "grad_norm": 1.5289151668548584, + "learning_rate": 4.734986954539954e-05, + "loss": 5.1124, + "step": 24868 + }, + { + "epoch": 0.14790298791512038, + "grad_norm": 1.5151697397232056, + "learning_rate": 4.734966024521438e-05, + "loss": 5.495, + "step": 24869 + }, + { + "epoch": 0.14790893519840137, + "grad_norm": 1.3832122087478638, + "learning_rate": 4.734945093722718e-05, + "loss": 5.426, + "step": 24870 + }, + { + "epoch": 0.14791488248168236, + "grad_norm": 1.6117453575134277, + "learning_rate": 4.7349241621438023e-05, + "loss": 5.2548, + "step": 24871 + }, + { + "epoch": 0.14792082976496337, + "grad_norm": 1.5391991138458252, + "learning_rate": 4.734903229784698e-05, + "loss": 4.7025, + "step": 24872 + }, + { + "epoch": 0.14792677704824436, + "grad_norm": 1.649274468421936, + "learning_rate": 4.734882296645411e-05, + "loss": 5.4152, + "step": 24873 + }, + { + "epoch": 0.14793272433152535, + "grad_norm": 1.7147942781448364, + "learning_rate": 4.734861362725951e-05, + "loss": 5.4865, + "step": 24874 + }, + { + "epoch": 0.14793867161480637, + "grad_norm": 1.4434807300567627, + "learning_rate": 4.734840428026324e-05, + "loss": 5.5211, + "step": 24875 + }, + { + "epoch": 0.14794461889808735, + "grad_norm": 1.4886515140533447, + "learning_rate": 4.7348194925465364e-05, + "loss": 5.197, + "step": 24876 + }, + { + "epoch": 0.14795056618136834, + "grad_norm": 1.3683615922927856, + "learning_rate": 4.734798556286596e-05, + "loss": 4.9886, + "step": 24877 + }, + { + "epoch": 0.14795651346464936, + "grad_norm": 1.4986892938613892, + "learning_rate": 4.734777619246512e-05, + "loss": 5.0067, + "step": 24878 + }, + { + "epoch": 0.14796246074793035, + "grad_norm": 1.8438472747802734, + "learning_rate": 4.734756681426289e-05, + "loss": 5.2865, + "step": 24879 + }, + { + "epoch": 0.14796840803121133, + "grad_norm": 1.710975170135498, + "learning_rate": 4.734735742825935e-05, + "loss": 5.1215, + "step": 24880 + }, + { + "epoch": 0.14797435531449235, + "grad_norm": 2.074619770050049, + "learning_rate": 4.7347148034454594e-05, + "loss": 4.5968, + "step": 24881 + }, + { + "epoch": 0.14798030259777334, + "grad_norm": 2.5662643909454346, + "learning_rate": 4.7346938632848676e-05, + "loss": 4.3404, + "step": 24882 + }, + { + "epoch": 0.14798624988105433, + "grad_norm": 1.6698600053787231, + "learning_rate": 4.7346729223441665e-05, + "loss": 5.2027, + "step": 24883 + }, + { + "epoch": 0.14799219716433534, + "grad_norm": 2.1604435443878174, + "learning_rate": 4.7346519806233644e-05, + "loss": 4.4595, + "step": 24884 + }, + { + "epoch": 0.14799814444761633, + "grad_norm": 2.7507572174072266, + "learning_rate": 4.734631038122469e-05, + "loss": 3.1764, + "step": 24885 + }, + { + "epoch": 0.14800409173089732, + "grad_norm": 2.8016562461853027, + "learning_rate": 4.734610094841487e-05, + "loss": 3.8763, + "step": 24886 + }, + { + "epoch": 0.14801003901417834, + "grad_norm": 2.9202160835266113, + "learning_rate": 4.7345891507804253e-05, + "loss": 3.6681, + "step": 24887 + }, + { + "epoch": 0.14801598629745932, + "grad_norm": 3.071167230606079, + "learning_rate": 4.7345682059392914e-05, + "loss": 3.027, + "step": 24888 + }, + { + "epoch": 0.1480219335807403, + "grad_norm": 2.7173242568969727, + "learning_rate": 4.734547260318093e-05, + "loss": 3.3615, + "step": 24889 + }, + { + "epoch": 0.14802788086402133, + "grad_norm": 2.1972641944885254, + "learning_rate": 4.7345263139168375e-05, + "loss": 4.8097, + "step": 24890 + }, + { + "epoch": 0.14803382814730232, + "grad_norm": 2.031700849533081, + "learning_rate": 4.7345053667355324e-05, + "loss": 5.1153, + "step": 24891 + }, + { + "epoch": 0.1480397754305833, + "grad_norm": 2.627568483352661, + "learning_rate": 4.734484418774183e-05, + "loss": 4.3777, + "step": 24892 + }, + { + "epoch": 0.14804572271386432, + "grad_norm": 2.2821667194366455, + "learning_rate": 4.734463470032799e-05, + "loss": 4.4845, + "step": 24893 + }, + { + "epoch": 0.1480516699971453, + "grad_norm": 1.8525490760803223, + "learning_rate": 4.7344425205113875e-05, + "loss": 5.4187, + "step": 24894 + }, + { + "epoch": 0.1480576172804263, + "grad_norm": 2.0583372116088867, + "learning_rate": 4.7344215702099546e-05, + "loss": 4.4807, + "step": 24895 + }, + { + "epoch": 0.1480635645637073, + "grad_norm": 1.7403303384780884, + "learning_rate": 4.734400619128509e-05, + "loss": 5.5355, + "step": 24896 + }, + { + "epoch": 0.1480695118469883, + "grad_norm": 2.953425645828247, + "learning_rate": 4.734379667267056e-05, + "loss": 4.0136, + "step": 24897 + }, + { + "epoch": 0.1480754591302693, + "grad_norm": 2.8318042755126953, + "learning_rate": 4.7343587146256044e-05, + "loss": 3.5818, + "step": 24898 + }, + { + "epoch": 0.1480814064135503, + "grad_norm": 1.6144517660140991, + "learning_rate": 4.7343377612041615e-05, + "loss": 4.789, + "step": 24899 + }, + { + "epoch": 0.1480873536968313, + "grad_norm": 1.639545202255249, + "learning_rate": 4.734316807002734e-05, + "loss": 5.1812, + "step": 24900 + }, + { + "epoch": 0.14809330098011228, + "grad_norm": 1.7593424320220947, + "learning_rate": 4.734295852021331e-05, + "loss": 5.0547, + "step": 24901 + }, + { + "epoch": 0.14809924826339327, + "grad_norm": 1.6794737577438354, + "learning_rate": 4.734274896259957e-05, + "loss": 5.125, + "step": 24902 + }, + { + "epoch": 0.14810519554667428, + "grad_norm": 1.5941787958145142, + "learning_rate": 4.734253939718621e-05, + "loss": 5.0559, + "step": 24903 + }, + { + "epoch": 0.14811114282995527, + "grad_norm": 1.9701952934265137, + "learning_rate": 4.7342329823973304e-05, + "loss": 4.7468, + "step": 24904 + }, + { + "epoch": 0.14811709011323626, + "grad_norm": 1.8744746446609497, + "learning_rate": 4.734212024296092e-05, + "loss": 5.2544, + "step": 24905 + }, + { + "epoch": 0.14812303739651728, + "grad_norm": 1.5343592166900635, + "learning_rate": 4.734191065414913e-05, + "loss": 5.1794, + "step": 24906 + }, + { + "epoch": 0.14812898467979826, + "grad_norm": 1.509623408317566, + "learning_rate": 4.734170105753801e-05, + "loss": 5.4512, + "step": 24907 + }, + { + "epoch": 0.14813493196307925, + "grad_norm": 1.4235179424285889, + "learning_rate": 4.734149145312764e-05, + "loss": 5.4535, + "step": 24908 + }, + { + "epoch": 0.14814087924636027, + "grad_norm": 1.4011653661727905, + "learning_rate": 4.7341281840918076e-05, + "loss": 5.4248, + "step": 24909 + }, + { + "epoch": 0.14814682652964126, + "grad_norm": 1.3742294311523438, + "learning_rate": 4.734107222090941e-05, + "loss": 5.3076, + "step": 24910 + }, + { + "epoch": 0.14815277381292224, + "grad_norm": 1.4808472394943237, + "learning_rate": 4.73408625931017e-05, + "loss": 5.4432, + "step": 24911 + }, + { + "epoch": 0.14815872109620326, + "grad_norm": 1.3847295045852661, + "learning_rate": 4.734065295749502e-05, + "loss": 5.4678, + "step": 24912 + }, + { + "epoch": 0.14816466837948425, + "grad_norm": 1.4962565898895264, + "learning_rate": 4.734044331408947e-05, + "loss": 5.6803, + "step": 24913 + }, + { + "epoch": 0.14817061566276524, + "grad_norm": 1.7258118391036987, + "learning_rate": 4.734023366288508e-05, + "loss": 4.933, + "step": 24914 + }, + { + "epoch": 0.14817656294604625, + "grad_norm": 1.7875369787216187, + "learning_rate": 4.7340024003881955e-05, + "loss": 4.9978, + "step": 24915 + }, + { + "epoch": 0.14818251022932724, + "grad_norm": 1.5841879844665527, + "learning_rate": 4.733981433708016e-05, + "loss": 5.1718, + "step": 24916 + }, + { + "epoch": 0.14818845751260823, + "grad_norm": 1.4346718788146973, + "learning_rate": 4.733960466247976e-05, + "loss": 4.6579, + "step": 24917 + }, + { + "epoch": 0.14819440479588925, + "grad_norm": 1.4387844800949097, + "learning_rate": 4.7339394980080844e-05, + "loss": 5.012, + "step": 24918 + }, + { + "epoch": 0.14820035207917023, + "grad_norm": 1.7081257104873657, + "learning_rate": 4.733918528988347e-05, + "loss": 5.4316, + "step": 24919 + }, + { + "epoch": 0.14820629936245122, + "grad_norm": 1.7600195407867432, + "learning_rate": 4.733897559188771e-05, + "loss": 5.309, + "step": 24920 + }, + { + "epoch": 0.14821224664573224, + "grad_norm": 1.7399616241455078, + "learning_rate": 4.733876588609366e-05, + "loss": 5.1796, + "step": 24921 + }, + { + "epoch": 0.14821819392901323, + "grad_norm": 1.7843348979949951, + "learning_rate": 4.733855617250137e-05, + "loss": 5.0371, + "step": 24922 + }, + { + "epoch": 0.1482241412122942, + "grad_norm": 1.6706308126449585, + "learning_rate": 4.733834645111092e-05, + "loss": 5.1058, + "step": 24923 + }, + { + "epoch": 0.14823008849557523, + "grad_norm": 2.6056525707244873, + "learning_rate": 4.733813672192239e-05, + "loss": 4.5804, + "step": 24924 + }, + { + "epoch": 0.14823603577885622, + "grad_norm": 1.836887001991272, + "learning_rate": 4.733792698493584e-05, + "loss": 5.0871, + "step": 24925 + }, + { + "epoch": 0.1482419830621372, + "grad_norm": 1.8913605213165283, + "learning_rate": 4.733771724015135e-05, + "loss": 5.4228, + "step": 24926 + }, + { + "epoch": 0.14824793034541822, + "grad_norm": 1.7032699584960938, + "learning_rate": 4.7337507487569e-05, + "loss": 5.5599, + "step": 24927 + }, + { + "epoch": 0.1482538776286992, + "grad_norm": 1.6115164756774902, + "learning_rate": 4.733729772718885e-05, + "loss": 5.5348, + "step": 24928 + }, + { + "epoch": 0.1482598249119802, + "grad_norm": 1.563080906867981, + "learning_rate": 4.733708795901098e-05, + "loss": 5.4334, + "step": 24929 + }, + { + "epoch": 0.14826577219526121, + "grad_norm": 1.6452966928482056, + "learning_rate": 4.733687818303547e-05, + "loss": 5.7378, + "step": 24930 + }, + { + "epoch": 0.1482717194785422, + "grad_norm": 1.602687120437622, + "learning_rate": 4.7336668399262386e-05, + "loss": 5.7311, + "step": 24931 + }, + { + "epoch": 0.1482776667618232, + "grad_norm": 1.6656992435455322, + "learning_rate": 4.73364586076918e-05, + "loss": 5.3285, + "step": 24932 + }, + { + "epoch": 0.1482836140451042, + "grad_norm": 2.0401406288146973, + "learning_rate": 4.7336248808323786e-05, + "loss": 4.9655, + "step": 24933 + }, + { + "epoch": 0.1482895613283852, + "grad_norm": 2.536595582962036, + "learning_rate": 4.733603900115842e-05, + "loss": 4.6622, + "step": 24934 + }, + { + "epoch": 0.14829550861166618, + "grad_norm": 1.5609594583511353, + "learning_rate": 4.7335829186195766e-05, + "loss": 5.2326, + "step": 24935 + }, + { + "epoch": 0.1483014558949472, + "grad_norm": 1.6761829853057861, + "learning_rate": 4.733561936343591e-05, + "loss": 5.4059, + "step": 24936 + }, + { + "epoch": 0.1483074031782282, + "grad_norm": 1.1501821279525757, + "learning_rate": 4.733540953287893e-05, + "loss": 4.8906, + "step": 24937 + }, + { + "epoch": 0.14831335046150917, + "grad_norm": 1.6217314004898071, + "learning_rate": 4.733519969452488e-05, + "loss": 4.8381, + "step": 24938 + }, + { + "epoch": 0.1483192977447902, + "grad_norm": 1.8240901231765747, + "learning_rate": 4.733498984837384e-05, + "loss": 5.4137, + "step": 24939 + }, + { + "epoch": 0.14832524502807118, + "grad_norm": 1.7012525796890259, + "learning_rate": 4.733477999442589e-05, + "loss": 5.4581, + "step": 24940 + }, + { + "epoch": 0.14833119231135217, + "grad_norm": 1.3260048627853394, + "learning_rate": 4.73345701326811e-05, + "loss": 5.6434, + "step": 24941 + }, + { + "epoch": 0.14833713959463318, + "grad_norm": 1.6175122261047363, + "learning_rate": 4.7334360263139536e-05, + "loss": 5.5073, + "step": 24942 + }, + { + "epoch": 0.14834308687791417, + "grad_norm": 1.890405535697937, + "learning_rate": 4.7334150385801276e-05, + "loss": 5.059, + "step": 24943 + }, + { + "epoch": 0.14834903416119516, + "grad_norm": 2.121887683868408, + "learning_rate": 4.733394050066641e-05, + "loss": 4.7292, + "step": 24944 + }, + { + "epoch": 0.14835498144447617, + "grad_norm": 2.054938316345215, + "learning_rate": 4.7333730607734985e-05, + "loss": 4.7551, + "step": 24945 + }, + { + "epoch": 0.14836092872775716, + "grad_norm": 1.853046178817749, + "learning_rate": 4.733352070700708e-05, + "loss": 4.7807, + "step": 24946 + }, + { + "epoch": 0.14836687601103815, + "grad_norm": 1.926611304283142, + "learning_rate": 4.733331079848279e-05, + "loss": 5.026, + "step": 24947 + }, + { + "epoch": 0.14837282329431917, + "grad_norm": 1.9281972646713257, + "learning_rate": 4.7333100882162164e-05, + "loss": 5.0131, + "step": 24948 + }, + { + "epoch": 0.14837877057760016, + "grad_norm": 2.158128499984741, + "learning_rate": 4.733289095804527e-05, + "loss": 4.8987, + "step": 24949 + }, + { + "epoch": 0.14838471786088114, + "grad_norm": 1.9640719890594482, + "learning_rate": 4.7332681026132216e-05, + "loss": 4.868, + "step": 24950 + }, + { + "epoch": 0.14839066514416216, + "grad_norm": 2.0871901512145996, + "learning_rate": 4.7332471086423045e-05, + "loss": 4.8542, + "step": 24951 + }, + { + "epoch": 0.14839661242744315, + "grad_norm": 2.2361068725585938, + "learning_rate": 4.7332261138917836e-05, + "loss": 4.9536, + "step": 24952 + }, + { + "epoch": 0.14840255971072414, + "grad_norm": 2.3177475929260254, + "learning_rate": 4.7332051183616665e-05, + "loss": 4.9228, + "step": 24953 + }, + { + "epoch": 0.14840850699400515, + "grad_norm": 2.0412709712982178, + "learning_rate": 4.733184122051961e-05, + "loss": 4.888, + "step": 24954 + }, + { + "epoch": 0.14841445427728614, + "grad_norm": 1.904599666595459, + "learning_rate": 4.733163124962674e-05, + "loss": 4.842, + "step": 24955 + }, + { + "epoch": 0.14842040156056713, + "grad_norm": 2.3957440853118896, + "learning_rate": 4.733142127093813e-05, + "loss": 4.7589, + "step": 24956 + }, + { + "epoch": 0.14842634884384814, + "grad_norm": 1.966145634651184, + "learning_rate": 4.733121128445384e-05, + "loss": 4.5783, + "step": 24957 + }, + { + "epoch": 0.14843229612712913, + "grad_norm": 2.230134963989258, + "learning_rate": 4.7331001290173966e-05, + "loss": 4.6108, + "step": 24958 + }, + { + "epoch": 0.14843824341041012, + "grad_norm": 1.9063829183578491, + "learning_rate": 4.7330791288098565e-05, + "loss": 4.765, + "step": 24959 + }, + { + "epoch": 0.1484441906936911, + "grad_norm": 2.0853664875030518, + "learning_rate": 4.7330581278227716e-05, + "loss": 6.0523, + "step": 24960 + }, + { + "epoch": 0.14845013797697212, + "grad_norm": 2.0823090076446533, + "learning_rate": 4.7330371260561494e-05, + "loss": 6.1014, + "step": 24961 + }, + { + "epoch": 0.1484560852602531, + "grad_norm": 1.7553062438964844, + "learning_rate": 4.733016123509997e-05, + "loss": 5.5322, + "step": 24962 + }, + { + "epoch": 0.1484620325435341, + "grad_norm": 1.7482306957244873, + "learning_rate": 4.7329951201843217e-05, + "loss": 5.5981, + "step": 24963 + }, + { + "epoch": 0.14846797982681512, + "grad_norm": 1.7615885734558105, + "learning_rate": 4.732974116079131e-05, + "loss": 5.447, + "step": 24964 + }, + { + "epoch": 0.1484739271100961, + "grad_norm": 1.645790696144104, + "learning_rate": 4.732953111194432e-05, + "loss": 5.4439, + "step": 24965 + }, + { + "epoch": 0.1484798743933771, + "grad_norm": 1.8099596500396729, + "learning_rate": 4.7329321055302326e-05, + "loss": 5.1291, + "step": 24966 + }, + { + "epoch": 0.1484858216766581, + "grad_norm": 1.8523690700531006, + "learning_rate": 4.732911099086539e-05, + "loss": 4.9296, + "step": 24967 + }, + { + "epoch": 0.1484917689599391, + "grad_norm": 1.7897992134094238, + "learning_rate": 4.732890091863359e-05, + "loss": 5.1764, + "step": 24968 + }, + { + "epoch": 0.14849771624322008, + "grad_norm": 1.8922818899154663, + "learning_rate": 4.7328690838607e-05, + "loss": 5.1548, + "step": 24969 + }, + { + "epoch": 0.1485036635265011, + "grad_norm": 1.9169872999191284, + "learning_rate": 4.73284807507857e-05, + "loss": 5.0837, + "step": 24970 + }, + { + "epoch": 0.1485096108097821, + "grad_norm": 1.649895429611206, + "learning_rate": 4.732827065516976e-05, + "loss": 5.2689, + "step": 24971 + }, + { + "epoch": 0.14851555809306308, + "grad_norm": 1.638153076171875, + "learning_rate": 4.732806055175925e-05, + "loss": 5.5579, + "step": 24972 + }, + { + "epoch": 0.1485215053763441, + "grad_norm": 1.6101715564727783, + "learning_rate": 4.7327850440554244e-05, + "loss": 5.5632, + "step": 24973 + }, + { + "epoch": 0.14852745265962508, + "grad_norm": 1.5299588441848755, + "learning_rate": 4.7327640321554815e-05, + "loss": 5.6415, + "step": 24974 + }, + { + "epoch": 0.14853339994290607, + "grad_norm": 1.508520245552063, + "learning_rate": 4.732743019476104e-05, + "loss": 5.1519, + "step": 24975 + }, + { + "epoch": 0.14853934722618709, + "grad_norm": 1.760366439819336, + "learning_rate": 4.732722006017299e-05, + "loss": 4.2604, + "step": 24976 + }, + { + "epoch": 0.14854529450946807, + "grad_norm": 1.6827213764190674, + "learning_rate": 4.732700991779073e-05, + "loss": 4.2258, + "step": 24977 + }, + { + "epoch": 0.14855124179274906, + "grad_norm": 1.576389193534851, + "learning_rate": 4.732679976761435e-05, + "loss": 4.2854, + "step": 24978 + }, + { + "epoch": 0.14855718907603008, + "grad_norm": 1.592392921447754, + "learning_rate": 4.732658960964391e-05, + "loss": 4.2775, + "step": 24979 + }, + { + "epoch": 0.14856313635931107, + "grad_norm": 1.6771488189697266, + "learning_rate": 4.7326379443879495e-05, + "loss": 4.3001, + "step": 24980 + }, + { + "epoch": 0.14856908364259205, + "grad_norm": 1.584578037261963, + "learning_rate": 4.732616927032117e-05, + "loss": 4.1592, + "step": 24981 + }, + { + "epoch": 0.14857503092587307, + "grad_norm": 1.7568552494049072, + "learning_rate": 4.732595908896901e-05, + "loss": 4.1514, + "step": 24982 + }, + { + "epoch": 0.14858097820915406, + "grad_norm": 1.6334513425827026, + "learning_rate": 4.732574889982309e-05, + "loss": 4.1319, + "step": 24983 + }, + { + "epoch": 0.14858692549243505, + "grad_norm": 1.7330750226974487, + "learning_rate": 4.732553870288347e-05, + "loss": 4.1036, + "step": 24984 + }, + { + "epoch": 0.14859287277571606, + "grad_norm": 1.7719300985336304, + "learning_rate": 4.732532849815024e-05, + "loss": 5.2043, + "step": 24985 + }, + { + "epoch": 0.14859882005899705, + "grad_norm": 2.9879441261291504, + "learning_rate": 4.732511828562347e-05, + "loss": 3.8784, + "step": 24986 + }, + { + "epoch": 0.14860476734227804, + "grad_norm": 1.9443185329437256, + "learning_rate": 4.732490806530324e-05, + "loss": 5.5898, + "step": 24987 + }, + { + "epoch": 0.14861071462555905, + "grad_norm": 1.800279140472412, + "learning_rate": 4.73246978371896e-05, + "loss": 5.465, + "step": 24988 + }, + { + "epoch": 0.14861666190884004, + "grad_norm": 1.9028568267822266, + "learning_rate": 4.732448760128265e-05, + "loss": 4.8782, + "step": 24989 + }, + { + "epoch": 0.14862260919212103, + "grad_norm": 2.79314923286438, + "learning_rate": 4.732427735758245e-05, + "loss": 4.5421, + "step": 24990 + }, + { + "epoch": 0.14862855647540205, + "grad_norm": 2.4686412811279297, + "learning_rate": 4.7324067106089074e-05, + "loss": 4.4616, + "step": 24991 + }, + { + "epoch": 0.14863450375868303, + "grad_norm": 1.8359897136688232, + "learning_rate": 4.73238568468026e-05, + "loss": 4.8081, + "step": 24992 + }, + { + "epoch": 0.14864045104196402, + "grad_norm": 2.3388144969940186, + "learning_rate": 4.732364657972309e-05, + "loss": 4.527, + "step": 24993 + }, + { + "epoch": 0.14864639832524504, + "grad_norm": 2.888598680496216, + "learning_rate": 4.7323436304850634e-05, + "loss": 4.1855, + "step": 24994 + }, + { + "epoch": 0.14865234560852603, + "grad_norm": 3.1639111042022705, + "learning_rate": 4.7323226022185296e-05, + "loss": 4.0865, + "step": 24995 + }, + { + "epoch": 0.14865829289180701, + "grad_norm": 2.8708033561706543, + "learning_rate": 4.732301573172715e-05, + "loss": 3.8629, + "step": 24996 + }, + { + "epoch": 0.14866424017508803, + "grad_norm": 2.667426347732544, + "learning_rate": 4.732280543347627e-05, + "loss": 4.0511, + "step": 24997 + }, + { + "epoch": 0.14867018745836902, + "grad_norm": 2.5031850337982178, + "learning_rate": 4.7322595127432725e-05, + "loss": 4.2035, + "step": 24998 + }, + { + "epoch": 0.14867613474165, + "grad_norm": 2.4356188774108887, + "learning_rate": 4.7322384813596595e-05, + "loss": 3.8996, + "step": 24999 + }, + { + "epoch": 0.14868208202493102, + "grad_norm": 2.334566354751587, + "learning_rate": 4.732217449196795e-05, + "loss": 4.2353, + "step": 25000 + }, + { + "epoch": 0.148688029308212, + "grad_norm": 2.357844591140747, + "learning_rate": 4.732196416254686e-05, + "loss": 4.3695, + "step": 25001 + }, + { + "epoch": 0.148693976591493, + "grad_norm": 2.4662234783172607, + "learning_rate": 4.7321753825333416e-05, + "loss": 3.9325, + "step": 25002 + }, + { + "epoch": 0.14869992387477401, + "grad_norm": 1.840820074081421, + "learning_rate": 4.7321543480327666e-05, + "loss": 5.1156, + "step": 25003 + }, + { + "epoch": 0.148705871158055, + "grad_norm": 1.9830942153930664, + "learning_rate": 4.73213331275297e-05, + "loss": 4.6774, + "step": 25004 + }, + { + "epoch": 0.148711818441336, + "grad_norm": 1.6185516119003296, + "learning_rate": 4.732112276693959e-05, + "loss": 4.6241, + "step": 25005 + }, + { + "epoch": 0.148717765724617, + "grad_norm": 1.8661324977874756, + "learning_rate": 4.7320912398557403e-05, + "loss": 4.6107, + "step": 25006 + }, + { + "epoch": 0.148723713007898, + "grad_norm": 1.750866174697876, + "learning_rate": 4.7320702022383226e-05, + "loss": 4.7134, + "step": 25007 + }, + { + "epoch": 0.14872966029117898, + "grad_norm": 1.7875406742095947, + "learning_rate": 4.7320491638417105e-05, + "loss": 4.6935, + "step": 25008 + }, + { + "epoch": 0.14873560757446, + "grad_norm": 1.6559946537017822, + "learning_rate": 4.732028124665915e-05, + "loss": 4.7556, + "step": 25009 + }, + { + "epoch": 0.148741554857741, + "grad_norm": 2.075535535812378, + "learning_rate": 4.7320070847109396e-05, + "loss": 4.6646, + "step": 25010 + }, + { + "epoch": 0.14874750214102198, + "grad_norm": 2.1029436588287354, + "learning_rate": 4.731986043976795e-05, + "loss": 5.0169, + "step": 25011 + }, + { + "epoch": 0.148753449424303, + "grad_norm": 1.9193171262741089, + "learning_rate": 4.7319650024634866e-05, + "loss": 5.236, + "step": 25012 + }, + { + "epoch": 0.14875939670758398, + "grad_norm": 1.6295948028564453, + "learning_rate": 4.731943960171022e-05, + "loss": 5.3538, + "step": 25013 + }, + { + "epoch": 0.14876534399086497, + "grad_norm": 1.5699677467346191, + "learning_rate": 4.73192291709941e-05, + "loss": 5.5413, + "step": 25014 + }, + { + "epoch": 0.14877129127414598, + "grad_norm": 2.8893580436706543, + "learning_rate": 4.7319018732486555e-05, + "loss": 4.5995, + "step": 25015 + }, + { + "epoch": 0.14877723855742697, + "grad_norm": 2.366352081298828, + "learning_rate": 4.731880828618768e-05, + "loss": 4.5993, + "step": 25016 + }, + { + "epoch": 0.14878318584070796, + "grad_norm": 2.1206884384155273, + "learning_rate": 4.731859783209753e-05, + "loss": 4.2081, + "step": 25017 + }, + { + "epoch": 0.14878913312398895, + "grad_norm": 2.4171648025512695, + "learning_rate": 4.73183873702162e-05, + "loss": 4.287, + "step": 25018 + }, + { + "epoch": 0.14879508040726996, + "grad_norm": 1.9675270318984985, + "learning_rate": 4.7318176900543744e-05, + "loss": 4.5648, + "step": 25019 + }, + { + "epoch": 0.14880102769055095, + "grad_norm": 1.750753402709961, + "learning_rate": 4.731796642308024e-05, + "loss": 5.6165, + "step": 25020 + }, + { + "epoch": 0.14880697497383194, + "grad_norm": 1.7137641906738281, + "learning_rate": 4.731775593782577e-05, + "loss": 5.1204, + "step": 25021 + }, + { + "epoch": 0.14881292225711296, + "grad_norm": 1.4377870559692383, + "learning_rate": 4.73175454447804e-05, + "loss": 5.4076, + "step": 25022 + }, + { + "epoch": 0.14881886954039394, + "grad_norm": 1.3382959365844727, + "learning_rate": 4.7317334943944204e-05, + "loss": 5.444, + "step": 25023 + }, + { + "epoch": 0.14882481682367493, + "grad_norm": 1.0098121166229248, + "learning_rate": 4.731712443531726e-05, + "loss": 5.2913, + "step": 25024 + }, + { + "epoch": 0.14883076410695595, + "grad_norm": 0.897736132144928, + "learning_rate": 4.7316913918899644e-05, + "loss": 5.2909, + "step": 25025 + }, + { + "epoch": 0.14883671139023694, + "grad_norm": 1.1516233682632446, + "learning_rate": 4.731670339469141e-05, + "loss": 5.3357, + "step": 25026 + }, + { + "epoch": 0.14884265867351792, + "grad_norm": 1.7736589908599854, + "learning_rate": 4.731649286269265e-05, + "loss": 5.1258, + "step": 25027 + }, + { + "epoch": 0.14884860595679894, + "grad_norm": 1.8994569778442383, + "learning_rate": 4.731628232290344e-05, + "loss": 5.5661, + "step": 25028 + }, + { + "epoch": 0.14885455324007993, + "grad_norm": 1.7552026510238647, + "learning_rate": 4.731607177532384e-05, + "loss": 5.3648, + "step": 25029 + }, + { + "epoch": 0.14886050052336092, + "grad_norm": 2.8771791458129883, + "learning_rate": 4.731586121995393e-05, + "loss": 4.6516, + "step": 25030 + }, + { + "epoch": 0.14886644780664193, + "grad_norm": 2.073287010192871, + "learning_rate": 4.731565065679379e-05, + "loss": 4.8374, + "step": 25031 + }, + { + "epoch": 0.14887239508992292, + "grad_norm": 1.6661057472229004, + "learning_rate": 4.7315440085843476e-05, + "loss": 5.0031, + "step": 25032 + }, + { + "epoch": 0.1488783423732039, + "grad_norm": 2.286806106567383, + "learning_rate": 4.7315229507103084e-05, + "loss": 4.3394, + "step": 25033 + }, + { + "epoch": 0.14888428965648492, + "grad_norm": 2.3657538890838623, + "learning_rate": 4.7315018920572666e-05, + "loss": 4.4455, + "step": 25034 + }, + { + "epoch": 0.1488902369397659, + "grad_norm": 2.1653788089752197, + "learning_rate": 4.7314808326252316e-05, + "loss": 4.5676, + "step": 25035 + }, + { + "epoch": 0.1488961842230469, + "grad_norm": 1.853837251663208, + "learning_rate": 4.731459772414208e-05, + "loss": 4.4169, + "step": 25036 + }, + { + "epoch": 0.14890213150632792, + "grad_norm": 2.1202454566955566, + "learning_rate": 4.7314387114242064e-05, + "loss": 4.4917, + "step": 25037 + }, + { + "epoch": 0.1489080787896089, + "grad_norm": 2.1203508377075195, + "learning_rate": 4.731417649655232e-05, + "loss": 4.2212, + "step": 25038 + }, + { + "epoch": 0.1489140260728899, + "grad_norm": 2.220571994781494, + "learning_rate": 4.731396587107293e-05, + "loss": 4.3678, + "step": 25039 + }, + { + "epoch": 0.1489199733561709, + "grad_norm": 1.9346973896026611, + "learning_rate": 4.731375523780397e-05, + "loss": 3.9189, + "step": 25040 + }, + { + "epoch": 0.1489259206394519, + "grad_norm": 2.1453700065612793, + "learning_rate": 4.731354459674549e-05, + "loss": 5.4543, + "step": 25041 + }, + { + "epoch": 0.14893186792273289, + "grad_norm": 2.7248880863189697, + "learning_rate": 4.73133339478976e-05, + "loss": 4.419, + "step": 25042 + }, + { + "epoch": 0.1489378152060139, + "grad_norm": 2.675060510635376, + "learning_rate": 4.731312329126035e-05, + "loss": 4.2858, + "step": 25043 + }, + { + "epoch": 0.1489437624892949, + "grad_norm": 2.5627496242523193, + "learning_rate": 4.731291262683382e-05, + "loss": 4.3065, + "step": 25044 + }, + { + "epoch": 0.14894970977257588, + "grad_norm": 2.238367795944214, + "learning_rate": 4.7312701954618086e-05, + "loss": 4.1853, + "step": 25045 + }, + { + "epoch": 0.1489556570558569, + "grad_norm": 2.144697427749634, + "learning_rate": 4.731249127461321e-05, + "loss": 4.5655, + "step": 25046 + }, + { + "epoch": 0.14896160433913788, + "grad_norm": 1.676389455795288, + "learning_rate": 4.731228058681928e-05, + "loss": 4.8332, + "step": 25047 + }, + { + "epoch": 0.14896755162241887, + "grad_norm": 2.7558321952819824, + "learning_rate": 4.7312069891236364e-05, + "loss": 3.5354, + "step": 25048 + }, + { + "epoch": 0.14897349890569989, + "grad_norm": 1.8224084377288818, + "learning_rate": 4.731185918786453e-05, + "loss": 4.8105, + "step": 25049 + }, + { + "epoch": 0.14897944618898087, + "grad_norm": 1.8380038738250732, + "learning_rate": 4.731164847670386e-05, + "loss": 4.8584, + "step": 25050 + }, + { + "epoch": 0.14898539347226186, + "grad_norm": 1.6260594129562378, + "learning_rate": 4.7311437757754425e-05, + "loss": 4.5548, + "step": 25051 + }, + { + "epoch": 0.14899134075554288, + "grad_norm": 1.490978717803955, + "learning_rate": 4.731122703101629e-05, + "loss": 4.7144, + "step": 25052 + }, + { + "epoch": 0.14899728803882387, + "grad_norm": 2.054363489151001, + "learning_rate": 4.731101629648954e-05, + "loss": 4.9561, + "step": 25053 + }, + { + "epoch": 0.14900323532210485, + "grad_norm": 2.431696891784668, + "learning_rate": 4.7310805554174255e-05, + "loss": 4.6347, + "step": 25054 + }, + { + "epoch": 0.14900918260538587, + "grad_norm": 2.9854423999786377, + "learning_rate": 4.7310594804070485e-05, + "loss": 4.3526, + "step": 25055 + }, + { + "epoch": 0.14901512988866686, + "grad_norm": 2.859827756881714, + "learning_rate": 4.731038404617832e-05, + "loss": 4.3427, + "step": 25056 + }, + { + "epoch": 0.14902107717194785, + "grad_norm": 2.866624593734741, + "learning_rate": 4.731017328049784e-05, + "loss": 4.4747, + "step": 25057 + }, + { + "epoch": 0.14902702445522886, + "grad_norm": 2.0833802223205566, + "learning_rate": 4.730996250702909e-05, + "loss": 4.1979, + "step": 25058 + }, + { + "epoch": 0.14903297173850985, + "grad_norm": 2.095679521560669, + "learning_rate": 4.7309751725772176e-05, + "loss": 4.2466, + "step": 25059 + }, + { + "epoch": 0.14903891902179084, + "grad_norm": 2.3466885089874268, + "learning_rate": 4.730954093672716e-05, + "loss": 4.3074, + "step": 25060 + }, + { + "epoch": 0.14904486630507185, + "grad_norm": 2.1188759803771973, + "learning_rate": 4.730933013989411e-05, + "loss": 4.2482, + "step": 25061 + }, + { + "epoch": 0.14905081358835284, + "grad_norm": 2.1638059616088867, + "learning_rate": 4.73091193352731e-05, + "loss": 4.1506, + "step": 25062 + }, + { + "epoch": 0.14905676087163383, + "grad_norm": 2.035240650177002, + "learning_rate": 4.7308908522864215e-05, + "loss": 4.4322, + "step": 25063 + }, + { + "epoch": 0.14906270815491485, + "grad_norm": 2.375912666320801, + "learning_rate": 4.730869770266751e-05, + "loss": 5.3206, + "step": 25064 + }, + { + "epoch": 0.14906865543819584, + "grad_norm": 1.8899742364883423, + "learning_rate": 4.7308486874683075e-05, + "loss": 5.1336, + "step": 25065 + }, + { + "epoch": 0.14907460272147682, + "grad_norm": 1.7068132162094116, + "learning_rate": 4.730827603891098e-05, + "loss": 5.0085, + "step": 25066 + }, + { + "epoch": 0.14908055000475784, + "grad_norm": 1.737470269203186, + "learning_rate": 4.730806519535129e-05, + "loss": 5.9056, + "step": 25067 + }, + { + "epoch": 0.14908649728803883, + "grad_norm": 1.251652717590332, + "learning_rate": 4.730785434400409e-05, + "loss": 5.3772, + "step": 25068 + }, + { + "epoch": 0.14909244457131982, + "grad_norm": 1.2134002447128296, + "learning_rate": 4.730764348486945e-05, + "loss": 5.4202, + "step": 25069 + }, + { + "epoch": 0.14909839185460083, + "grad_norm": 1.028356671333313, + "learning_rate": 4.730743261794743e-05, + "loss": 5.4883, + "step": 25070 + }, + { + "epoch": 0.14910433913788182, + "grad_norm": 1.3931416273117065, + "learning_rate": 4.730722174323813e-05, + "loss": 5.3274, + "step": 25071 + }, + { + "epoch": 0.1491102864211628, + "grad_norm": 1.2539725303649902, + "learning_rate": 4.7307010860741607e-05, + "loss": 5.2628, + "step": 25072 + }, + { + "epoch": 0.14911623370444382, + "grad_norm": 1.2422703504562378, + "learning_rate": 4.730679997045793e-05, + "loss": 5.1639, + "step": 25073 + }, + { + "epoch": 0.1491221809877248, + "grad_norm": 1.4616423845291138, + "learning_rate": 4.730658907238719e-05, + "loss": 5.0979, + "step": 25074 + }, + { + "epoch": 0.1491281282710058, + "grad_norm": 1.2968275547027588, + "learning_rate": 4.730637816652944e-05, + "loss": 5.0785, + "step": 25075 + }, + { + "epoch": 0.1491340755542868, + "grad_norm": 1.304254412651062, + "learning_rate": 4.730616725288477e-05, + "loss": 5.4885, + "step": 25076 + }, + { + "epoch": 0.1491400228375678, + "grad_norm": 2.3498852252960205, + "learning_rate": 4.730595633145324e-05, + "loss": 5.3064, + "step": 25077 + }, + { + "epoch": 0.1491459701208488, + "grad_norm": 1.7321240901947021, + "learning_rate": 4.730574540223493e-05, + "loss": 5.1844, + "step": 25078 + }, + { + "epoch": 0.14915191740412978, + "grad_norm": 1.903198480606079, + "learning_rate": 4.730553446522993e-05, + "loss": 5.5481, + "step": 25079 + }, + { + "epoch": 0.1491578646874108, + "grad_norm": 1.659658670425415, + "learning_rate": 4.7305323520438285e-05, + "loss": 5.3265, + "step": 25080 + }, + { + "epoch": 0.14916381197069178, + "grad_norm": 1.4510316848754883, + "learning_rate": 4.7305112567860085e-05, + "loss": 5.2607, + "step": 25081 + }, + { + "epoch": 0.14916975925397277, + "grad_norm": 1.5634890794754028, + "learning_rate": 4.73049016074954e-05, + "loss": 5.1961, + "step": 25082 + }, + { + "epoch": 0.1491757065372538, + "grad_norm": 1.5400700569152832, + "learning_rate": 4.730469063934431e-05, + "loss": 5.6441, + "step": 25083 + }, + { + "epoch": 0.14918165382053478, + "grad_norm": 1.814353108406067, + "learning_rate": 4.730447966340688e-05, + "loss": 5.1855, + "step": 25084 + }, + { + "epoch": 0.14918760110381576, + "grad_norm": 2.3644423484802246, + "learning_rate": 4.7304268679683184e-05, + "loss": 4.5312, + "step": 25085 + }, + { + "epoch": 0.14919354838709678, + "grad_norm": 2.6960058212280273, + "learning_rate": 4.73040576881733e-05, + "loss": 4.3128, + "step": 25086 + }, + { + "epoch": 0.14919949567037777, + "grad_norm": 2.50162410736084, + "learning_rate": 4.73038466888773e-05, + "loss": 4.3356, + "step": 25087 + }, + { + "epoch": 0.14920544295365876, + "grad_norm": 1.938988208770752, + "learning_rate": 4.730363568179526e-05, + "loss": 4.6391, + "step": 25088 + }, + { + "epoch": 0.14921139023693977, + "grad_norm": 2.0165152549743652, + "learning_rate": 4.730342466692725e-05, + "loss": 5.3267, + "step": 25089 + }, + { + "epoch": 0.14921733752022076, + "grad_norm": 2.3626153469085693, + "learning_rate": 4.7303213644273345e-05, + "loss": 5.2551, + "step": 25090 + }, + { + "epoch": 0.14922328480350175, + "grad_norm": 2.1070075035095215, + "learning_rate": 4.730300261383361e-05, + "loss": 5.2231, + "step": 25091 + }, + { + "epoch": 0.14922923208678276, + "grad_norm": 1.6806228160858154, + "learning_rate": 4.7302791575608144e-05, + "loss": 5.5844, + "step": 25092 + }, + { + "epoch": 0.14923517937006375, + "grad_norm": 2.149728298187256, + "learning_rate": 4.7302580529596985e-05, + "loss": 4.7185, + "step": 25093 + }, + { + "epoch": 0.14924112665334474, + "grad_norm": 1.93796968460083, + "learning_rate": 4.730236947580024e-05, + "loss": 4.7622, + "step": 25094 + }, + { + "epoch": 0.14924707393662576, + "grad_norm": 1.7360033988952637, + "learning_rate": 4.7302158414217964e-05, + "loss": 4.7068, + "step": 25095 + }, + { + "epoch": 0.14925302121990675, + "grad_norm": 1.712073564529419, + "learning_rate": 4.730194734485023e-05, + "loss": 4.8146, + "step": 25096 + }, + { + "epoch": 0.14925896850318773, + "grad_norm": 1.789083480834961, + "learning_rate": 4.730173626769712e-05, + "loss": 4.774, + "step": 25097 + }, + { + "epoch": 0.14926491578646875, + "grad_norm": 1.9072470664978027, + "learning_rate": 4.730152518275871e-05, + "loss": 4.9099, + "step": 25098 + }, + { + "epoch": 0.14927086306974974, + "grad_norm": 1.7209197282791138, + "learning_rate": 4.730131409003506e-05, + "loss": 4.7141, + "step": 25099 + }, + { + "epoch": 0.14927681035303073, + "grad_norm": 1.8528800010681152, + "learning_rate": 4.730110298952625e-05, + "loss": 4.9741, + "step": 25100 + }, + { + "epoch": 0.14928275763631174, + "grad_norm": 1.9865680932998657, + "learning_rate": 4.7300891881232365e-05, + "loss": 4.9079, + "step": 25101 + }, + { + "epoch": 0.14928870491959273, + "grad_norm": 2.1327319145202637, + "learning_rate": 4.730068076515346e-05, + "loss": 4.9929, + "step": 25102 + }, + { + "epoch": 0.14929465220287372, + "grad_norm": 1.856972336769104, + "learning_rate": 4.730046964128962e-05, + "loss": 4.935, + "step": 25103 + }, + { + "epoch": 0.14930059948615473, + "grad_norm": 1.9982047080993652, + "learning_rate": 4.7300258509640924e-05, + "loss": 5.1254, + "step": 25104 + }, + { + "epoch": 0.14930654676943572, + "grad_norm": 1.866350531578064, + "learning_rate": 4.730004737020743e-05, + "loss": 5.0198, + "step": 25105 + }, + { + "epoch": 0.1493124940527167, + "grad_norm": 1.8669421672821045, + "learning_rate": 4.729983622298922e-05, + "loss": 4.817, + "step": 25106 + }, + { + "epoch": 0.14931844133599773, + "grad_norm": 2.3156704902648926, + "learning_rate": 4.7299625067986366e-05, + "loss": 4.9341, + "step": 25107 + }, + { + "epoch": 0.14932438861927871, + "grad_norm": 2.304932117462158, + "learning_rate": 4.7299413905198956e-05, + "loss": 4.908, + "step": 25108 + }, + { + "epoch": 0.1493303359025597, + "grad_norm": 2.0287182331085205, + "learning_rate": 4.7299202734627035e-05, + "loss": 4.9244, + "step": 25109 + }, + { + "epoch": 0.14933628318584072, + "grad_norm": 2.554980754852295, + "learning_rate": 4.72989915562707e-05, + "loss": 4.7163, + "step": 25110 + }, + { + "epoch": 0.1493422304691217, + "grad_norm": 2.76092791557312, + "learning_rate": 4.7298780370130014e-05, + "loss": 5.293, + "step": 25111 + }, + { + "epoch": 0.1493481777524027, + "grad_norm": 2.203293561935425, + "learning_rate": 4.729856917620506e-05, + "loss": 4.8891, + "step": 25112 + }, + { + "epoch": 0.1493541250356837, + "grad_norm": 2.2550253868103027, + "learning_rate": 4.7298357974495905e-05, + "loss": 5.1578, + "step": 25113 + }, + { + "epoch": 0.1493600723189647, + "grad_norm": 2.41914963722229, + "learning_rate": 4.7298146765002624e-05, + "loss": 5.0363, + "step": 25114 + }, + { + "epoch": 0.1493660196022457, + "grad_norm": 2.058586359024048, + "learning_rate": 4.729793554772528e-05, + "loss": 4.9537, + "step": 25115 + }, + { + "epoch": 0.1493719668855267, + "grad_norm": 2.3880207538604736, + "learning_rate": 4.729772432266397e-05, + "loss": 4.9701, + "step": 25116 + }, + { + "epoch": 0.1493779141688077, + "grad_norm": 2.012542247772217, + "learning_rate": 4.7297513089818745e-05, + "loss": 5.0596, + "step": 25117 + }, + { + "epoch": 0.14938386145208868, + "grad_norm": 1.9091664552688599, + "learning_rate": 4.72973018491897e-05, + "loss": 5.0199, + "step": 25118 + }, + { + "epoch": 0.1493898087353697, + "grad_norm": 1.9325292110443115, + "learning_rate": 4.7297090600776886e-05, + "loss": 4.9442, + "step": 25119 + }, + { + "epoch": 0.14939575601865068, + "grad_norm": 2.106926918029785, + "learning_rate": 4.729687934458039e-05, + "loss": 4.8628, + "step": 25120 + }, + { + "epoch": 0.14940170330193167, + "grad_norm": 1.7365446090698242, + "learning_rate": 4.729666808060029e-05, + "loss": 4.8492, + "step": 25121 + }, + { + "epoch": 0.1494076505852127, + "grad_norm": 1.9125512838363647, + "learning_rate": 4.729645680883665e-05, + "loss": 4.9389, + "step": 25122 + }, + { + "epoch": 0.14941359786849367, + "grad_norm": 2.0423247814178467, + "learning_rate": 4.729624552928954e-05, + "loss": 4.8626, + "step": 25123 + }, + { + "epoch": 0.14941954515177466, + "grad_norm": 1.9502712488174438, + "learning_rate": 4.729603424195905e-05, + "loss": 5.0237, + "step": 25124 + }, + { + "epoch": 0.14942549243505568, + "grad_norm": 2.0014281272888184, + "learning_rate": 4.7295822946845245e-05, + "loss": 4.9913, + "step": 25125 + }, + { + "epoch": 0.14943143971833667, + "grad_norm": 1.9854202270507812, + "learning_rate": 4.7295611643948204e-05, + "loss": 4.9394, + "step": 25126 + }, + { + "epoch": 0.14943738700161766, + "grad_norm": 1.7897859811782837, + "learning_rate": 4.729540033326798e-05, + "loss": 4.9434, + "step": 25127 + }, + { + "epoch": 0.14944333428489867, + "grad_norm": 2.092635154724121, + "learning_rate": 4.7295189014804676e-05, + "loss": 4.9032, + "step": 25128 + }, + { + "epoch": 0.14944928156817966, + "grad_norm": 1.9637115001678467, + "learning_rate": 4.729497768855834e-05, + "loss": 4.7775, + "step": 25129 + }, + { + "epoch": 0.14945522885146065, + "grad_norm": 1.8016657829284668, + "learning_rate": 4.729476635452906e-05, + "loss": 4.791, + "step": 25130 + }, + { + "epoch": 0.14946117613474166, + "grad_norm": 2.326096534729004, + "learning_rate": 4.7294555012716915e-05, + "loss": 5.3299, + "step": 25131 + }, + { + "epoch": 0.14946712341802265, + "grad_norm": 2.1310572624206543, + "learning_rate": 4.7294343663121965e-05, + "loss": 5.1919, + "step": 25132 + }, + { + "epoch": 0.14947307070130364, + "grad_norm": 2.3155853748321533, + "learning_rate": 4.72941323057443e-05, + "loss": 5.0858, + "step": 25133 + }, + { + "epoch": 0.14947901798458466, + "grad_norm": 2.049995183944702, + "learning_rate": 4.729392094058397e-05, + "loss": 5.065, + "step": 25134 + }, + { + "epoch": 0.14948496526786564, + "grad_norm": 1.8955172300338745, + "learning_rate": 4.729370956764107e-05, + "loss": 5.1361, + "step": 25135 + }, + { + "epoch": 0.14949091255114663, + "grad_norm": 3.226020336151123, + "learning_rate": 4.729349818691567e-05, + "loss": 4.7323, + "step": 25136 + }, + { + "epoch": 0.14949685983442762, + "grad_norm": 3.1648058891296387, + "learning_rate": 4.7293286798407833e-05, + "loss": 4.6663, + "step": 25137 + }, + { + "epoch": 0.14950280711770864, + "grad_norm": 2.2341058254241943, + "learning_rate": 4.729307540211764e-05, + "loss": 4.584, + "step": 25138 + }, + { + "epoch": 0.14950875440098962, + "grad_norm": 2.088019609451294, + "learning_rate": 4.729286399804517e-05, + "loss": 4.5618, + "step": 25139 + }, + { + "epoch": 0.1495147016842706, + "grad_norm": 1.8777929544448853, + "learning_rate": 4.729265258619048e-05, + "loss": 5.0011, + "step": 25140 + }, + { + "epoch": 0.14952064896755163, + "grad_norm": 2.080986261367798, + "learning_rate": 4.729244116655366e-05, + "loss": 5.6192, + "step": 25141 + }, + { + "epoch": 0.14952659625083262, + "grad_norm": 1.9895329475402832, + "learning_rate": 4.729222973913479e-05, + "loss": 5.8569, + "step": 25142 + }, + { + "epoch": 0.1495325435341136, + "grad_norm": 2.0990312099456787, + "learning_rate": 4.7292018303933924e-05, + "loss": 5.772, + "step": 25143 + }, + { + "epoch": 0.14953849081739462, + "grad_norm": 1.8530125617980957, + "learning_rate": 4.7291806860951145e-05, + "loss": 5.9042, + "step": 25144 + }, + { + "epoch": 0.1495444381006756, + "grad_norm": 1.7631386518478394, + "learning_rate": 4.7291595410186526e-05, + "loss": 5.7611, + "step": 25145 + }, + { + "epoch": 0.1495503853839566, + "grad_norm": 1.4668217897415161, + "learning_rate": 4.729138395164015e-05, + "loss": 5.4997, + "step": 25146 + }, + { + "epoch": 0.1495563326672376, + "grad_norm": 1.2580885887145996, + "learning_rate": 4.729117248531206e-05, + "loss": 5.5554, + "step": 25147 + }, + { + "epoch": 0.1495622799505186, + "grad_norm": 1.612502932548523, + "learning_rate": 4.7290961011202375e-05, + "loss": 5.0982, + "step": 25148 + }, + { + "epoch": 0.1495682272337996, + "grad_norm": 1.6753286123275757, + "learning_rate": 4.729074952931114e-05, + "loss": 4.9553, + "step": 25149 + }, + { + "epoch": 0.1495741745170806, + "grad_norm": 1.530179738998413, + "learning_rate": 4.729053803963843e-05, + "loss": 4.9314, + "step": 25150 + }, + { + "epoch": 0.1495801218003616, + "grad_norm": 1.5077494382858276, + "learning_rate": 4.729032654218433e-05, + "loss": 5.4957, + "step": 25151 + }, + { + "epoch": 0.14958606908364258, + "grad_norm": 1.6995402574539185, + "learning_rate": 4.72901150369489e-05, + "loss": 5.7406, + "step": 25152 + }, + { + "epoch": 0.1495920163669236, + "grad_norm": 1.4611583948135376, + "learning_rate": 4.728990352393222e-05, + "loss": 5.5632, + "step": 25153 + }, + { + "epoch": 0.14959796365020459, + "grad_norm": 1.775568962097168, + "learning_rate": 4.728969200313437e-05, + "loss": 5.1666, + "step": 25154 + }, + { + "epoch": 0.14960391093348557, + "grad_norm": 1.6890829801559448, + "learning_rate": 4.728948047455541e-05, + "loss": 5.1776, + "step": 25155 + }, + { + "epoch": 0.1496098582167666, + "grad_norm": 1.7455476522445679, + "learning_rate": 4.728926893819544e-05, + "loss": 5.0308, + "step": 25156 + }, + { + "epoch": 0.14961580550004758, + "grad_norm": 2.0798380374908447, + "learning_rate": 4.72890573940545e-05, + "loss": 4.8164, + "step": 25157 + }, + { + "epoch": 0.14962175278332857, + "grad_norm": 2.0280489921569824, + "learning_rate": 4.728884584213269e-05, + "loss": 4.7693, + "step": 25158 + }, + { + "epoch": 0.14962770006660958, + "grad_norm": 1.9629135131835938, + "learning_rate": 4.728863428243008e-05, + "loss": 4.9072, + "step": 25159 + }, + { + "epoch": 0.14963364734989057, + "grad_norm": 2.1143929958343506, + "learning_rate": 4.7288422714946724e-05, + "loss": 4.6828, + "step": 25160 + }, + { + "epoch": 0.14963959463317156, + "grad_norm": 1.9618384838104248, + "learning_rate": 4.7288211139682715e-05, + "loss": 5.0383, + "step": 25161 + }, + { + "epoch": 0.14964554191645257, + "grad_norm": 1.8829975128173828, + "learning_rate": 4.728799955663812e-05, + "loss": 5.5072, + "step": 25162 + }, + { + "epoch": 0.14965148919973356, + "grad_norm": 1.5670249462127686, + "learning_rate": 4.728778796581302e-05, + "loss": 5.1815, + "step": 25163 + }, + { + "epoch": 0.14965743648301455, + "grad_norm": 2.0932981967926025, + "learning_rate": 4.728757636720748e-05, + "loss": 5.0871, + "step": 25164 + }, + { + "epoch": 0.14966338376629557, + "grad_norm": 1.5827875137329102, + "learning_rate": 4.728736476082158e-05, + "loss": 5.0983, + "step": 25165 + }, + { + "epoch": 0.14966933104957655, + "grad_norm": 1.7353198528289795, + "learning_rate": 4.728715314665538e-05, + "loss": 4.8113, + "step": 25166 + }, + { + "epoch": 0.14967527833285754, + "grad_norm": 1.6395387649536133, + "learning_rate": 4.728694152470898e-05, + "loss": 4.9403, + "step": 25167 + }, + { + "epoch": 0.14968122561613856, + "grad_norm": 1.8546936511993408, + "learning_rate": 4.7286729894982434e-05, + "loss": 4.9092, + "step": 25168 + }, + { + "epoch": 0.14968717289941955, + "grad_norm": 1.5432714223861694, + "learning_rate": 4.728651825747582e-05, + "loss": 4.8257, + "step": 25169 + }, + { + "epoch": 0.14969312018270053, + "grad_norm": 1.6309102773666382, + "learning_rate": 4.728630661218921e-05, + "loss": 5.5829, + "step": 25170 + }, + { + "epoch": 0.14969906746598155, + "grad_norm": 1.8060203790664673, + "learning_rate": 4.7286094959122685e-05, + "loss": 5.3099, + "step": 25171 + }, + { + "epoch": 0.14970501474926254, + "grad_norm": 1.8817297220230103, + "learning_rate": 4.728588329827631e-05, + "loss": 5.5393, + "step": 25172 + }, + { + "epoch": 0.14971096203254353, + "grad_norm": 1.806970477104187, + "learning_rate": 4.728567162965017e-05, + "loss": 5.8567, + "step": 25173 + }, + { + "epoch": 0.14971690931582454, + "grad_norm": 1.6101081371307373, + "learning_rate": 4.728545995324433e-05, + "loss": 5.5389, + "step": 25174 + }, + { + "epoch": 0.14972285659910553, + "grad_norm": 1.5525349378585815, + "learning_rate": 4.7285248269058854e-05, + "loss": 5.6075, + "step": 25175 + }, + { + "epoch": 0.14972880388238652, + "grad_norm": 1.543853998184204, + "learning_rate": 4.7285036577093844e-05, + "loss": 5.5287, + "step": 25176 + }, + { + "epoch": 0.14973475116566753, + "grad_norm": 1.5811434984207153, + "learning_rate": 4.728482487734935e-05, + "loss": 5.5584, + "step": 25177 + }, + { + "epoch": 0.14974069844894852, + "grad_norm": 1.2957634925842285, + "learning_rate": 4.728461316982546e-05, + "loss": 5.5264, + "step": 25178 + }, + { + "epoch": 0.1497466457322295, + "grad_norm": 1.3600691556930542, + "learning_rate": 4.728440145452224e-05, + "loss": 5.5781, + "step": 25179 + }, + { + "epoch": 0.14975259301551053, + "grad_norm": 1.3423492908477783, + "learning_rate": 4.7284189731439764e-05, + "loss": 5.535, + "step": 25180 + }, + { + "epoch": 0.14975854029879151, + "grad_norm": 1.5586212873458862, + "learning_rate": 4.7283978000578107e-05, + "loss": 5.0746, + "step": 25181 + }, + { + "epoch": 0.1497644875820725, + "grad_norm": 1.8833614587783813, + "learning_rate": 4.7283766261937346e-05, + "loss": 4.6121, + "step": 25182 + }, + { + "epoch": 0.14977043486535352, + "grad_norm": 1.8890469074249268, + "learning_rate": 4.728355451551755e-05, + "loss": 4.5572, + "step": 25183 + }, + { + "epoch": 0.1497763821486345, + "grad_norm": 1.7143722772598267, + "learning_rate": 4.728334276131879e-05, + "loss": 4.5289, + "step": 25184 + }, + { + "epoch": 0.1497823294319155, + "grad_norm": 1.766708493232727, + "learning_rate": 4.728313099934115e-05, + "loss": 4.6957, + "step": 25185 + }, + { + "epoch": 0.1497882767151965, + "grad_norm": 1.8504046201705933, + "learning_rate": 4.72829192295847e-05, + "loss": 4.8764, + "step": 25186 + }, + { + "epoch": 0.1497942239984775, + "grad_norm": 2.0711238384246826, + "learning_rate": 4.728270745204951e-05, + "loss": 4.9157, + "step": 25187 + }, + { + "epoch": 0.1498001712817585, + "grad_norm": 2.0366387367248535, + "learning_rate": 4.728249566673567e-05, + "loss": 4.9295, + "step": 25188 + }, + { + "epoch": 0.1498061185650395, + "grad_norm": 1.7883682250976562, + "learning_rate": 4.728228387364323e-05, + "loss": 5.1173, + "step": 25189 + }, + { + "epoch": 0.1498120658483205, + "grad_norm": 1.8308504819869995, + "learning_rate": 4.7282072072772276e-05, + "loss": 5.0593, + "step": 25190 + }, + { + "epoch": 0.14981801313160148, + "grad_norm": 1.5662436485290527, + "learning_rate": 4.728186026412288e-05, + "loss": 5.1499, + "step": 25191 + }, + { + "epoch": 0.1498239604148825, + "grad_norm": 1.8079571723937988, + "learning_rate": 4.728164844769511e-05, + "loss": 4.948, + "step": 25192 + }, + { + "epoch": 0.14982990769816348, + "grad_norm": 1.681217908859253, + "learning_rate": 4.728143662348906e-05, + "loss": 5.3433, + "step": 25193 + }, + { + "epoch": 0.14983585498144447, + "grad_norm": 1.5585112571716309, + "learning_rate": 4.7281224791504784e-05, + "loss": 5.6366, + "step": 25194 + }, + { + "epoch": 0.14984180226472546, + "grad_norm": 1.8676329851150513, + "learning_rate": 4.7281012951742364e-05, + "loss": 5.1824, + "step": 25195 + }, + { + "epoch": 0.14984774954800648, + "grad_norm": 2.227149248123169, + "learning_rate": 4.728080110420188e-05, + "loss": 5.0203, + "step": 25196 + }, + { + "epoch": 0.14985369683128746, + "grad_norm": 1.6362202167510986, + "learning_rate": 4.728058924888339e-05, + "loss": 5.1942, + "step": 25197 + }, + { + "epoch": 0.14985964411456845, + "grad_norm": 1.9886643886566162, + "learning_rate": 4.7280377385786976e-05, + "loss": 5.4607, + "step": 25198 + }, + { + "epoch": 0.14986559139784947, + "grad_norm": 1.8965426683425903, + "learning_rate": 4.728016551491271e-05, + "loss": 5.4426, + "step": 25199 + }, + { + "epoch": 0.14987153868113046, + "grad_norm": 1.7106379270553589, + "learning_rate": 4.7279953636260677e-05, + "loss": 5.2894, + "step": 25200 + }, + { + "epoch": 0.14987748596441144, + "grad_norm": 1.5771503448486328, + "learning_rate": 4.727974174983093e-05, + "loss": 5.7972, + "step": 25201 + }, + { + "epoch": 0.14988343324769246, + "grad_norm": 1.4394875764846802, + "learning_rate": 4.727952985562357e-05, + "loss": 5.4622, + "step": 25202 + }, + { + "epoch": 0.14988938053097345, + "grad_norm": 1.421237826347351, + "learning_rate": 4.727931795363864e-05, + "loss": 5.5927, + "step": 25203 + }, + { + "epoch": 0.14989532781425444, + "grad_norm": 1.4579883813858032, + "learning_rate": 4.727910604387624e-05, + "loss": 5.6534, + "step": 25204 + }, + { + "epoch": 0.14990127509753545, + "grad_norm": 1.5861623287200928, + "learning_rate": 4.727889412633644e-05, + "loss": 5.423, + "step": 25205 + }, + { + "epoch": 0.14990722238081644, + "grad_norm": 1.1634724140167236, + "learning_rate": 4.72786822010193e-05, + "loss": 5.5339, + "step": 25206 + }, + { + "epoch": 0.14991316966409743, + "grad_norm": 1.3486993312835693, + "learning_rate": 4.72784702679249e-05, + "loss": 5.572, + "step": 25207 + }, + { + "epoch": 0.14991911694737844, + "grad_norm": 1.1783596277236938, + "learning_rate": 4.727825832705333e-05, + "loss": 5.4949, + "step": 25208 + }, + { + "epoch": 0.14992506423065943, + "grad_norm": 1.405774712562561, + "learning_rate": 4.727804637840464e-05, + "loss": 5.4044, + "step": 25209 + }, + { + "epoch": 0.14993101151394042, + "grad_norm": 1.4211558103561401, + "learning_rate": 4.727783442197891e-05, + "loss": 5.3778, + "step": 25210 + }, + { + "epoch": 0.14993695879722144, + "grad_norm": 1.572511076927185, + "learning_rate": 4.727762245777623e-05, + "loss": 5.4308, + "step": 25211 + }, + { + "epoch": 0.14994290608050242, + "grad_norm": 1.4699571132659912, + "learning_rate": 4.727741048579665e-05, + "loss": 5.3195, + "step": 25212 + }, + { + "epoch": 0.1499488533637834, + "grad_norm": 1.231878399848938, + "learning_rate": 4.727719850604026e-05, + "loss": 5.2663, + "step": 25213 + }, + { + "epoch": 0.14995480064706443, + "grad_norm": 1.3779250383377075, + "learning_rate": 4.7276986518507136e-05, + "loss": 5.1489, + "step": 25214 + }, + { + "epoch": 0.14996074793034542, + "grad_norm": 2.058643341064453, + "learning_rate": 4.7276774523197334e-05, + "loss": 5.4943, + "step": 25215 + }, + { + "epoch": 0.1499666952136264, + "grad_norm": 2.3679542541503906, + "learning_rate": 4.727656252011095e-05, + "loss": 4.688, + "step": 25216 + }, + { + "epoch": 0.14997264249690742, + "grad_norm": 2.2339799404144287, + "learning_rate": 4.727635050924805e-05, + "loss": 5.1016, + "step": 25217 + }, + { + "epoch": 0.1499785897801884, + "grad_norm": 1.536407709121704, + "learning_rate": 4.72761384906087e-05, + "loss": 5.2741, + "step": 25218 + }, + { + "epoch": 0.1499845370634694, + "grad_norm": 1.6192244291305542, + "learning_rate": 4.7275926464192985e-05, + "loss": 5.0808, + "step": 25219 + }, + { + "epoch": 0.1499904843467504, + "grad_norm": 1.6183874607086182, + "learning_rate": 4.727571443000097e-05, + "loss": 5.4735, + "step": 25220 + }, + { + "epoch": 0.1499964316300314, + "grad_norm": 1.5945466756820679, + "learning_rate": 4.7275502388032736e-05, + "loss": 5.7213, + "step": 25221 + }, + { + "epoch": 0.1500023789133124, + "grad_norm": 1.455883264541626, + "learning_rate": 4.727529033828835e-05, + "loss": 5.588, + "step": 25222 + }, + { + "epoch": 0.1500083261965934, + "grad_norm": 1.6111440658569336, + "learning_rate": 4.727507828076789e-05, + "loss": 5.0907, + "step": 25223 + }, + { + "epoch": 0.1500142734798744, + "grad_norm": 1.6382368803024292, + "learning_rate": 4.727486621547144e-05, + "loss": 5.2271, + "step": 25224 + }, + { + "epoch": 0.15002022076315538, + "grad_norm": 1.637136697769165, + "learning_rate": 4.7274654142399056e-05, + "loss": 4.9102, + "step": 25225 + }, + { + "epoch": 0.1500261680464364, + "grad_norm": 1.8395768404006958, + "learning_rate": 4.727444206155082e-05, + "loss": 5.0519, + "step": 25226 + }, + { + "epoch": 0.15003211532971739, + "grad_norm": 1.7471513748168945, + "learning_rate": 4.727422997292681e-05, + "loss": 5.2439, + "step": 25227 + }, + { + "epoch": 0.15003806261299837, + "grad_norm": 2.3117516040802, + "learning_rate": 4.72740178765271e-05, + "loss": 5.1935, + "step": 25228 + }, + { + "epoch": 0.1500440098962794, + "grad_norm": 2.0054478645324707, + "learning_rate": 4.727380577235175e-05, + "loss": 5.2919, + "step": 25229 + }, + { + "epoch": 0.15004995717956038, + "grad_norm": 1.9058947563171387, + "learning_rate": 4.727359366040085e-05, + "loss": 4.8624, + "step": 25230 + }, + { + "epoch": 0.15005590446284137, + "grad_norm": 1.746030569076538, + "learning_rate": 4.727338154067447e-05, + "loss": 4.9731, + "step": 25231 + }, + { + "epoch": 0.15006185174612238, + "grad_norm": 1.693912386894226, + "learning_rate": 4.727316941317268e-05, + "loss": 4.948, + "step": 25232 + }, + { + "epoch": 0.15006779902940337, + "grad_norm": 1.742431640625, + "learning_rate": 4.727295727789556e-05, + "loss": 4.9891, + "step": 25233 + }, + { + "epoch": 0.15007374631268436, + "grad_norm": 2.8610570430755615, + "learning_rate": 4.7272745134843175e-05, + "loss": 3.9769, + "step": 25234 + }, + { + "epoch": 0.15007969359596537, + "grad_norm": 1.6757450103759766, + "learning_rate": 4.72725329840156e-05, + "loss": 5.4376, + "step": 25235 + }, + { + "epoch": 0.15008564087924636, + "grad_norm": 1.6358832120895386, + "learning_rate": 4.727232082541293e-05, + "loss": 5.6665, + "step": 25236 + }, + { + "epoch": 0.15009158816252735, + "grad_norm": 1.8907593488693237, + "learning_rate": 4.727210865903522e-05, + "loss": 5.4225, + "step": 25237 + }, + { + "epoch": 0.15009753544580837, + "grad_norm": 1.5822373628616333, + "learning_rate": 4.727189648488254e-05, + "loss": 5.5356, + "step": 25238 + }, + { + "epoch": 0.15010348272908935, + "grad_norm": 1.626504898071289, + "learning_rate": 4.7271684302954974e-05, + "loss": 5.2066, + "step": 25239 + }, + { + "epoch": 0.15010943001237034, + "grad_norm": 1.7297816276550293, + "learning_rate": 4.727147211325259e-05, + "loss": 5.109, + "step": 25240 + }, + { + "epoch": 0.15011537729565136, + "grad_norm": 1.6709920167922974, + "learning_rate": 4.727125991577547e-05, + "loss": 5.2468, + "step": 25241 + }, + { + "epoch": 0.15012132457893235, + "grad_norm": 1.5390464067459106, + "learning_rate": 4.727104771052368e-05, + "loss": 5.237, + "step": 25242 + }, + { + "epoch": 0.15012727186221334, + "grad_norm": 1.4673635959625244, + "learning_rate": 4.72708354974973e-05, + "loss": 5.2971, + "step": 25243 + }, + { + "epoch": 0.15013321914549435, + "grad_norm": 1.6094917058944702, + "learning_rate": 4.7270623276696394e-05, + "loss": 5.3539, + "step": 25244 + }, + { + "epoch": 0.15013916642877534, + "grad_norm": 1.697434902191162, + "learning_rate": 4.727041104812105e-05, + "loss": 4.9796, + "step": 25245 + }, + { + "epoch": 0.15014511371205633, + "grad_norm": 1.7680538892745972, + "learning_rate": 4.727019881177134e-05, + "loss": 5.0622, + "step": 25246 + }, + { + "epoch": 0.15015106099533734, + "grad_norm": 1.6313658952713013, + "learning_rate": 4.7269986567647324e-05, + "loss": 5.0507, + "step": 25247 + }, + { + "epoch": 0.15015700827861833, + "grad_norm": 1.6400883197784424, + "learning_rate": 4.72697743157491e-05, + "loss": 4.9752, + "step": 25248 + }, + { + "epoch": 0.15016295556189932, + "grad_norm": 1.6866703033447266, + "learning_rate": 4.726956205607671e-05, + "loss": 5.2475, + "step": 25249 + }, + { + "epoch": 0.15016890284518034, + "grad_norm": 1.5988578796386719, + "learning_rate": 4.7269349788630255e-05, + "loss": 4.9963, + "step": 25250 + }, + { + "epoch": 0.15017485012846132, + "grad_norm": 1.8661000728607178, + "learning_rate": 4.7269137513409796e-05, + "loss": 4.7149, + "step": 25251 + }, + { + "epoch": 0.1501807974117423, + "grad_norm": 1.5544322729110718, + "learning_rate": 4.726892523041541e-05, + "loss": 5.0037, + "step": 25252 + }, + { + "epoch": 0.1501867446950233, + "grad_norm": 1.6971745491027832, + "learning_rate": 4.726871293964718e-05, + "loss": 5.1207, + "step": 25253 + }, + { + "epoch": 0.15019269197830432, + "grad_norm": 1.508044958114624, + "learning_rate": 4.726850064110517e-05, + "loss": 5.3578, + "step": 25254 + }, + { + "epoch": 0.1501986392615853, + "grad_norm": 1.7235703468322754, + "learning_rate": 4.726828833478946e-05, + "loss": 5.3506, + "step": 25255 + }, + { + "epoch": 0.1502045865448663, + "grad_norm": 1.7117946147918701, + "learning_rate": 4.726807602070011e-05, + "loss": 5.0023, + "step": 25256 + }, + { + "epoch": 0.1502105338281473, + "grad_norm": 1.6594294309616089, + "learning_rate": 4.726786369883721e-05, + "loss": 4.8674, + "step": 25257 + }, + { + "epoch": 0.1502164811114283, + "grad_norm": 1.7046406269073486, + "learning_rate": 4.7267651369200825e-05, + "loss": 4.9614, + "step": 25258 + }, + { + "epoch": 0.15022242839470928, + "grad_norm": 1.6488447189331055, + "learning_rate": 4.726743903179104e-05, + "loss": 5.0612, + "step": 25259 + }, + { + "epoch": 0.1502283756779903, + "grad_norm": 1.5859414339065552, + "learning_rate": 4.726722668660792e-05, + "loss": 4.9399, + "step": 25260 + }, + { + "epoch": 0.1502343229612713, + "grad_norm": 2.1271414756774902, + "learning_rate": 4.726701433365154e-05, + "loss": 5.0729, + "step": 25261 + }, + { + "epoch": 0.15024027024455228, + "grad_norm": 1.9313926696777344, + "learning_rate": 4.726680197292198e-05, + "loss": 5.271, + "step": 25262 + }, + { + "epoch": 0.1502462175278333, + "grad_norm": 1.933329463005066, + "learning_rate": 4.72665896044193e-05, + "loss": 5.0125, + "step": 25263 + }, + { + "epoch": 0.15025216481111428, + "grad_norm": 1.7074263095855713, + "learning_rate": 4.726637722814359e-05, + "loss": 4.8612, + "step": 25264 + }, + { + "epoch": 0.15025811209439527, + "grad_norm": 2.2242465019226074, + "learning_rate": 4.7266164844094915e-05, + "loss": 4.5163, + "step": 25265 + }, + { + "epoch": 0.15026405937767628, + "grad_norm": 1.5982950925827026, + "learning_rate": 4.726595245227336e-05, + "loss": 5.2747, + "step": 25266 + }, + { + "epoch": 0.15027000666095727, + "grad_norm": 2.0305862426757812, + "learning_rate": 4.726574005267898e-05, + "loss": 4.6378, + "step": 25267 + }, + { + "epoch": 0.15027595394423826, + "grad_norm": 1.7604337930679321, + "learning_rate": 4.726552764531187e-05, + "loss": 5.0755, + "step": 25268 + }, + { + "epoch": 0.15028190122751928, + "grad_norm": 1.9310117959976196, + "learning_rate": 4.7265315230172087e-05, + "loss": 4.5722, + "step": 25269 + }, + { + "epoch": 0.15028784851080026, + "grad_norm": 1.7772380113601685, + "learning_rate": 4.726510280725972e-05, + "loss": 4.8739, + "step": 25270 + }, + { + "epoch": 0.15029379579408125, + "grad_norm": 1.635905385017395, + "learning_rate": 4.7264890376574824e-05, + "loss": 4.8656, + "step": 25271 + }, + { + "epoch": 0.15029974307736227, + "grad_norm": 1.7308213710784912, + "learning_rate": 4.7264677938117496e-05, + "loss": 4.8062, + "step": 25272 + }, + { + "epoch": 0.15030569036064326, + "grad_norm": 1.751625895500183, + "learning_rate": 4.7264465491887786e-05, + "loss": 4.9999, + "step": 25273 + }, + { + "epoch": 0.15031163764392425, + "grad_norm": 1.9022659063339233, + "learning_rate": 4.726425303788579e-05, + "loss": 4.3717, + "step": 25274 + }, + { + "epoch": 0.15031758492720526, + "grad_norm": 1.6903055906295776, + "learning_rate": 4.7264040576111576e-05, + "loss": 4.6601, + "step": 25275 + }, + { + "epoch": 0.15032353221048625, + "grad_norm": 1.7622424364089966, + "learning_rate": 4.726382810656521e-05, + "loss": 4.711, + "step": 25276 + }, + { + "epoch": 0.15032947949376724, + "grad_norm": 1.6687418222427368, + "learning_rate": 4.726361562924678e-05, + "loss": 4.8469, + "step": 25277 + }, + { + "epoch": 0.15033542677704825, + "grad_norm": 1.6430240869522095, + "learning_rate": 4.7263403144156334e-05, + "loss": 4.7209, + "step": 25278 + }, + { + "epoch": 0.15034137406032924, + "grad_norm": 1.8600574731826782, + "learning_rate": 4.726319065129398e-05, + "loss": 4.465, + "step": 25279 + }, + { + "epoch": 0.15034732134361023, + "grad_norm": 1.4847289323806763, + "learning_rate": 4.7262978150659776e-05, + "loss": 5.3048, + "step": 25280 + }, + { + "epoch": 0.15035326862689125, + "grad_norm": 1.5062929391860962, + "learning_rate": 4.726276564225379e-05, + "loss": 5.0202, + "step": 25281 + }, + { + "epoch": 0.15035921591017223, + "grad_norm": 1.999292254447937, + "learning_rate": 4.7262553126076106e-05, + "loss": 4.2882, + "step": 25282 + }, + { + "epoch": 0.15036516319345322, + "grad_norm": 1.7813308238983154, + "learning_rate": 4.7262340602126794e-05, + "loss": 4.7198, + "step": 25283 + }, + { + "epoch": 0.15037111047673424, + "grad_norm": 1.8029576539993286, + "learning_rate": 4.726212807040593e-05, + "loss": 4.9741, + "step": 25284 + }, + { + "epoch": 0.15037705776001523, + "grad_norm": 1.629035472869873, + "learning_rate": 4.726191553091358e-05, + "loss": 5.1917, + "step": 25285 + }, + { + "epoch": 0.15038300504329621, + "grad_norm": 1.54799222946167, + "learning_rate": 4.726170298364983e-05, + "loss": 4.9093, + "step": 25286 + }, + { + "epoch": 0.15038895232657723, + "grad_norm": 1.8892208337783813, + "learning_rate": 4.726149042861475e-05, + "loss": 4.2702, + "step": 25287 + }, + { + "epoch": 0.15039489960985822, + "grad_norm": 1.7078487873077393, + "learning_rate": 4.726127786580842e-05, + "loss": 4.2082, + "step": 25288 + }, + { + "epoch": 0.1504008468931392, + "grad_norm": 1.818529725074768, + "learning_rate": 4.72610652952309e-05, + "loss": 4.5002, + "step": 25289 + }, + { + "epoch": 0.15040679417642022, + "grad_norm": 1.600824236869812, + "learning_rate": 4.726085271688227e-05, + "loss": 4.8372, + "step": 25290 + }, + { + "epoch": 0.1504127414597012, + "grad_norm": 1.6711620092391968, + "learning_rate": 4.726064013076261e-05, + "loss": 4.8079, + "step": 25291 + }, + { + "epoch": 0.1504186887429822, + "grad_norm": 1.7478057146072388, + "learning_rate": 4.7260427536871985e-05, + "loss": 4.7123, + "step": 25292 + }, + { + "epoch": 0.15042463602626321, + "grad_norm": 1.6385493278503418, + "learning_rate": 4.726021493521048e-05, + "loss": 4.8043, + "step": 25293 + }, + { + "epoch": 0.1504305833095442, + "grad_norm": 1.6353743076324463, + "learning_rate": 4.7260002325778165e-05, + "loss": 4.7891, + "step": 25294 + }, + { + "epoch": 0.1504365305928252, + "grad_norm": 1.8076624870300293, + "learning_rate": 4.725978970857511e-05, + "loss": 4.502, + "step": 25295 + }, + { + "epoch": 0.1504424778761062, + "grad_norm": 2.979780673980713, + "learning_rate": 4.72595770836014e-05, + "loss": 3.7136, + "step": 25296 + }, + { + "epoch": 0.1504484251593872, + "grad_norm": 1.698283314704895, + "learning_rate": 4.7259364450857096e-05, + "loss": 4.9292, + "step": 25297 + }, + { + "epoch": 0.15045437244266818, + "grad_norm": 1.577962040901184, + "learning_rate": 4.725915181034228e-05, + "loss": 5.177, + "step": 25298 + }, + { + "epoch": 0.1504603197259492, + "grad_norm": 1.7820360660552979, + "learning_rate": 4.725893916205702e-05, + "loss": 4.6215, + "step": 25299 + }, + { + "epoch": 0.1504662670092302, + "grad_norm": 1.8856147527694702, + "learning_rate": 4.7258726506001396e-05, + "loss": 4.49, + "step": 25300 + }, + { + "epoch": 0.15047221429251117, + "grad_norm": 1.6485686302185059, + "learning_rate": 4.7258513842175475e-05, + "loss": 5.7732, + "step": 25301 + }, + { + "epoch": 0.1504781615757922, + "grad_norm": 2.143477439880371, + "learning_rate": 4.725830117057935e-05, + "loss": 4.8915, + "step": 25302 + }, + { + "epoch": 0.15048410885907318, + "grad_norm": 1.6669731140136719, + "learning_rate": 4.725808849121307e-05, + "loss": 5.1107, + "step": 25303 + }, + { + "epoch": 0.15049005614235417, + "grad_norm": 1.6642520427703857, + "learning_rate": 4.725787580407673e-05, + "loss": 4.5454, + "step": 25304 + }, + { + "epoch": 0.15049600342563518, + "grad_norm": 1.7125663757324219, + "learning_rate": 4.725766310917039e-05, + "loss": 4.7463, + "step": 25305 + }, + { + "epoch": 0.15050195070891617, + "grad_norm": 1.7411010265350342, + "learning_rate": 4.725745040649413e-05, + "loss": 4.643, + "step": 25306 + }, + { + "epoch": 0.15050789799219716, + "grad_norm": 1.8865814208984375, + "learning_rate": 4.725723769604803e-05, + "loss": 4.5555, + "step": 25307 + }, + { + "epoch": 0.15051384527547818, + "grad_norm": 1.6867681741714478, + "learning_rate": 4.725702497783215e-05, + "loss": 4.7334, + "step": 25308 + }, + { + "epoch": 0.15051979255875916, + "grad_norm": 1.5820156335830688, + "learning_rate": 4.7256812251846576e-05, + "loss": 5.5799, + "step": 25309 + }, + { + "epoch": 0.15052573984204015, + "grad_norm": 1.772575855255127, + "learning_rate": 4.725659951809138e-05, + "loss": 5.0303, + "step": 25310 + }, + { + "epoch": 0.15053168712532114, + "grad_norm": 1.7370164394378662, + "learning_rate": 4.725638677656663e-05, + "loss": 4.6378, + "step": 25311 + }, + { + "epoch": 0.15053763440860216, + "grad_norm": 1.6597602367401123, + "learning_rate": 4.725617402727241e-05, + "loss": 4.6918, + "step": 25312 + }, + { + "epoch": 0.15054358169188314, + "grad_norm": 1.6710939407348633, + "learning_rate": 4.725596127020879e-05, + "loss": 4.5664, + "step": 25313 + }, + { + "epoch": 0.15054952897516413, + "grad_norm": 1.7546216249465942, + "learning_rate": 4.725574850537584e-05, + "loss": 4.8903, + "step": 25314 + }, + { + "epoch": 0.15055547625844515, + "grad_norm": 1.8587819337844849, + "learning_rate": 4.725553573277365e-05, + "loss": 4.9894, + "step": 25315 + }, + { + "epoch": 0.15056142354172614, + "grad_norm": 1.3700711727142334, + "learning_rate": 4.725532295240227e-05, + "loss": 5.2452, + "step": 25316 + }, + { + "epoch": 0.15056737082500712, + "grad_norm": 1.7877662181854248, + "learning_rate": 4.725511016426179e-05, + "loss": 4.214, + "step": 25317 + }, + { + "epoch": 0.15057331810828814, + "grad_norm": 1.8162602186203003, + "learning_rate": 4.725489736835228e-05, + "loss": 4.9041, + "step": 25318 + }, + { + "epoch": 0.15057926539156913, + "grad_norm": 1.6758408546447754, + "learning_rate": 4.725468456467381e-05, + "loss": 4.3246, + "step": 25319 + }, + { + "epoch": 0.15058521267485012, + "grad_norm": 1.5553221702575684, + "learning_rate": 4.725447175322647e-05, + "loss": 5.1303, + "step": 25320 + }, + { + "epoch": 0.15059115995813113, + "grad_norm": 1.5233205556869507, + "learning_rate": 4.725425893401032e-05, + "loss": 5.4629, + "step": 25321 + }, + { + "epoch": 0.15059710724141212, + "grad_norm": 1.5840942859649658, + "learning_rate": 4.725404610702544e-05, + "loss": 5.12, + "step": 25322 + }, + { + "epoch": 0.1506030545246931, + "grad_norm": 1.787832260131836, + "learning_rate": 4.72538332722719e-05, + "loss": 5.2794, + "step": 25323 + }, + { + "epoch": 0.15060900180797412, + "grad_norm": 1.725203275680542, + "learning_rate": 4.725362042974978e-05, + "loss": 5.1121, + "step": 25324 + }, + { + "epoch": 0.1506149490912551, + "grad_norm": 1.5242986679077148, + "learning_rate": 4.725340757945914e-05, + "loss": 5.2826, + "step": 25325 + }, + { + "epoch": 0.1506208963745361, + "grad_norm": 1.9072916507720947, + "learning_rate": 4.725319472140007e-05, + "loss": 5.134, + "step": 25326 + }, + { + "epoch": 0.15062684365781712, + "grad_norm": 1.5604580640792847, + "learning_rate": 4.725298185557265e-05, + "loss": 5.1551, + "step": 25327 + }, + { + "epoch": 0.1506327909410981, + "grad_norm": 1.7541977167129517, + "learning_rate": 4.725276898197694e-05, + "loss": 4.6415, + "step": 25328 + }, + { + "epoch": 0.1506387382243791, + "grad_norm": 1.6959171295166016, + "learning_rate": 4.725255610061301e-05, + "loss": 4.9428, + "step": 25329 + }, + { + "epoch": 0.1506446855076601, + "grad_norm": 1.8614954948425293, + "learning_rate": 4.725234321148095e-05, + "loss": 5.2815, + "step": 25330 + }, + { + "epoch": 0.1506506327909411, + "grad_norm": 2.654698610305786, + "learning_rate": 4.725213031458082e-05, + "loss": 4.4367, + "step": 25331 + }, + { + "epoch": 0.15065658007422209, + "grad_norm": 2.4033470153808594, + "learning_rate": 4.7251917409912705e-05, + "loss": 4.6682, + "step": 25332 + }, + { + "epoch": 0.1506625273575031, + "grad_norm": 2.164626121520996, + "learning_rate": 4.725170449747668e-05, + "loss": 4.8865, + "step": 25333 + }, + { + "epoch": 0.1506684746407841, + "grad_norm": 2.046325445175171, + "learning_rate": 4.725149157727281e-05, + "loss": 4.9494, + "step": 25334 + }, + { + "epoch": 0.15067442192406508, + "grad_norm": 1.8939987421035767, + "learning_rate": 4.7251278649301175e-05, + "loss": 4.7641, + "step": 25335 + }, + { + "epoch": 0.1506803692073461, + "grad_norm": 1.6845778226852417, + "learning_rate": 4.725106571356185e-05, + "loss": 4.8831, + "step": 25336 + }, + { + "epoch": 0.15068631649062708, + "grad_norm": 1.7191179990768433, + "learning_rate": 4.7250852770054905e-05, + "loss": 4.9732, + "step": 25337 + }, + { + "epoch": 0.15069226377390807, + "grad_norm": 2.061174154281616, + "learning_rate": 4.725063981878042e-05, + "loss": 4.2263, + "step": 25338 + }, + { + "epoch": 0.15069821105718909, + "grad_norm": 2.3144235610961914, + "learning_rate": 4.7250426859738464e-05, + "loss": 4.2848, + "step": 25339 + }, + { + "epoch": 0.15070415834047007, + "grad_norm": 2.0103487968444824, + "learning_rate": 4.7250213892929115e-05, + "loss": 4.178, + "step": 25340 + }, + { + "epoch": 0.15071010562375106, + "grad_norm": 2.093339443206787, + "learning_rate": 4.725000091835245e-05, + "loss": 4.3689, + "step": 25341 + }, + { + "epoch": 0.15071605290703208, + "grad_norm": 2.085618495941162, + "learning_rate": 4.724978793600853e-05, + "loss": 4.1158, + "step": 25342 + }, + { + "epoch": 0.15072200019031307, + "grad_norm": 2.2095706462860107, + "learning_rate": 4.7249574945897445e-05, + "loss": 4.3338, + "step": 25343 + }, + { + "epoch": 0.15072794747359405, + "grad_norm": 2.169772148132324, + "learning_rate": 4.7249361948019267e-05, + "loss": 4.63, + "step": 25344 + }, + { + "epoch": 0.15073389475687507, + "grad_norm": 2.5633938312530518, + "learning_rate": 4.7249148942374054e-05, + "loss": 4.954, + "step": 25345 + }, + { + "epoch": 0.15073984204015606, + "grad_norm": 2.181420087814331, + "learning_rate": 4.72489359289619e-05, + "loss": 4.5234, + "step": 25346 + }, + { + "epoch": 0.15074578932343705, + "grad_norm": 2.265392541885376, + "learning_rate": 4.724872290778288e-05, + "loss": 4.1063, + "step": 25347 + }, + { + "epoch": 0.15075173660671806, + "grad_norm": 1.8531908988952637, + "learning_rate": 4.7248509878837054e-05, + "loss": 4.7115, + "step": 25348 + }, + { + "epoch": 0.15075768388999905, + "grad_norm": 2.096639633178711, + "learning_rate": 4.724829684212451e-05, + "loss": 4.2179, + "step": 25349 + }, + { + "epoch": 0.15076363117328004, + "grad_norm": 1.99870765209198, + "learning_rate": 4.72480837976453e-05, + "loss": 4.3259, + "step": 25350 + }, + { + "epoch": 0.15076957845656105, + "grad_norm": 2.024890422821045, + "learning_rate": 4.724787074539953e-05, + "loss": 4.1168, + "step": 25351 + }, + { + "epoch": 0.15077552573984204, + "grad_norm": 2.2805378437042236, + "learning_rate": 4.724765768538725e-05, + "loss": 4.3184, + "step": 25352 + }, + { + "epoch": 0.15078147302312303, + "grad_norm": 2.2098236083984375, + "learning_rate": 4.7247444617608535e-05, + "loss": 4.3815, + "step": 25353 + }, + { + "epoch": 0.15078742030640405, + "grad_norm": 2.6324753761291504, + "learning_rate": 4.724723154206348e-05, + "loss": 4.3017, + "step": 25354 + }, + { + "epoch": 0.15079336758968503, + "grad_norm": 3.0926623344421387, + "learning_rate": 4.724701845875215e-05, + "loss": 4.6768, + "step": 25355 + }, + { + "epoch": 0.15079931487296602, + "grad_norm": 2.8633837699890137, + "learning_rate": 4.7246805367674603e-05, + "loss": 4.3765, + "step": 25356 + }, + { + "epoch": 0.15080526215624704, + "grad_norm": 2.4857215881347656, + "learning_rate": 4.7246592268830924e-05, + "loss": 4.3245, + "step": 25357 + }, + { + "epoch": 0.15081120943952803, + "grad_norm": 3.3124706745147705, + "learning_rate": 4.72463791622212e-05, + "loss": 4.1451, + "step": 25358 + }, + { + "epoch": 0.15081715672280901, + "grad_norm": 2.3086657524108887, + "learning_rate": 4.724616604784549e-05, + "loss": 4.5879, + "step": 25359 + }, + { + "epoch": 0.15082310400609003, + "grad_norm": 2.082601308822632, + "learning_rate": 4.724595292570387e-05, + "loss": 5.1047, + "step": 25360 + }, + { + "epoch": 0.15082905128937102, + "grad_norm": 1.6798832416534424, + "learning_rate": 4.7245739795796426e-05, + "loss": 4.7877, + "step": 25361 + }, + { + "epoch": 0.150834998572652, + "grad_norm": 2.76798152923584, + "learning_rate": 4.724552665812322e-05, + "loss": 4.1044, + "step": 25362 + }, + { + "epoch": 0.15084094585593302, + "grad_norm": 2.7487802505493164, + "learning_rate": 4.724531351268433e-05, + "loss": 4.4089, + "step": 25363 + }, + { + "epoch": 0.150846893139214, + "grad_norm": 2.2958571910858154, + "learning_rate": 4.7245100359479833e-05, + "loss": 4.1923, + "step": 25364 + }, + { + "epoch": 0.150852840422495, + "grad_norm": 2.200896978378296, + "learning_rate": 4.7244887198509805e-05, + "loss": 4.3105, + "step": 25365 + }, + { + "epoch": 0.15085878770577602, + "grad_norm": 2.0711123943328857, + "learning_rate": 4.7244674029774307e-05, + "loss": 4.3327, + "step": 25366 + }, + { + "epoch": 0.150864734989057, + "grad_norm": 1.8481465578079224, + "learning_rate": 4.724446085327342e-05, + "loss": 4.7603, + "step": 25367 + }, + { + "epoch": 0.150870682272338, + "grad_norm": 1.5740338563919067, + "learning_rate": 4.7244247669007234e-05, + "loss": 4.7191, + "step": 25368 + }, + { + "epoch": 0.15087662955561898, + "grad_norm": 1.4988723993301392, + "learning_rate": 4.724403447697581e-05, + "loss": 4.6288, + "step": 25369 + }, + { + "epoch": 0.1508825768389, + "grad_norm": 1.8862982988357544, + "learning_rate": 4.7243821277179213e-05, + "loss": 4.6308, + "step": 25370 + }, + { + "epoch": 0.15088852412218098, + "grad_norm": 1.6412887573242188, + "learning_rate": 4.7243608069617534e-05, + "loss": 5.1476, + "step": 25371 + }, + { + "epoch": 0.15089447140546197, + "grad_norm": 1.58519446849823, + "learning_rate": 4.7243394854290847e-05, + "loss": 5.6586, + "step": 25372 + }, + { + "epoch": 0.150900418688743, + "grad_norm": 1.5548374652862549, + "learning_rate": 4.724318163119921e-05, + "loss": 5.4283, + "step": 25373 + }, + { + "epoch": 0.15090636597202398, + "grad_norm": 1.456405758857727, + "learning_rate": 4.724296840034271e-05, + "loss": 5.3778, + "step": 25374 + }, + { + "epoch": 0.15091231325530496, + "grad_norm": 1.2034344673156738, + "learning_rate": 4.7242755161721424e-05, + "loss": 5.1189, + "step": 25375 + }, + { + "epoch": 0.15091826053858598, + "grad_norm": 2.2144997119903564, + "learning_rate": 4.724254191533543e-05, + "loss": 4.7091, + "step": 25376 + }, + { + "epoch": 0.15092420782186697, + "grad_norm": 2.322824239730835, + "learning_rate": 4.7242328661184774e-05, + "loss": 4.3568, + "step": 25377 + }, + { + "epoch": 0.15093015510514796, + "grad_norm": 2.832406997680664, + "learning_rate": 4.7242115399269567e-05, + "loss": 4.156, + "step": 25378 + }, + { + "epoch": 0.15093610238842897, + "grad_norm": 2.5387492179870605, + "learning_rate": 4.724190212958986e-05, + "loss": 4.2464, + "step": 25379 + }, + { + "epoch": 0.15094204967170996, + "grad_norm": 2.3497941493988037, + "learning_rate": 4.724168885214574e-05, + "loss": 4.2937, + "step": 25380 + }, + { + "epoch": 0.15094799695499095, + "grad_norm": 1.9066410064697266, + "learning_rate": 4.724147556693727e-05, + "loss": 4.3862, + "step": 25381 + }, + { + "epoch": 0.15095394423827196, + "grad_norm": 1.981546401977539, + "learning_rate": 4.724126227396454e-05, + "loss": 4.2936, + "step": 25382 + }, + { + "epoch": 0.15095989152155295, + "grad_norm": 1.7924445867538452, + "learning_rate": 4.7241048973227604e-05, + "loss": 5.173, + "step": 25383 + }, + { + "epoch": 0.15096583880483394, + "grad_norm": 1.985730528831482, + "learning_rate": 4.724083566472655e-05, + "loss": 4.6256, + "step": 25384 + }, + { + "epoch": 0.15097178608811496, + "grad_norm": 1.7368820905685425, + "learning_rate": 4.7240622348461457e-05, + "loss": 5.2259, + "step": 25385 + }, + { + "epoch": 0.15097773337139594, + "grad_norm": 1.761334776878357, + "learning_rate": 4.724040902443239e-05, + "loss": 4.8674, + "step": 25386 + }, + { + "epoch": 0.15098368065467693, + "grad_norm": 2.460028886795044, + "learning_rate": 4.724019569263942e-05, + "loss": 4.6597, + "step": 25387 + }, + { + "epoch": 0.15098962793795795, + "grad_norm": 2.524463176727295, + "learning_rate": 4.723998235308263e-05, + "loss": 4.2823, + "step": 25388 + }, + { + "epoch": 0.15099557522123894, + "grad_norm": 2.211486577987671, + "learning_rate": 4.723976900576209e-05, + "loss": 4.2802, + "step": 25389 + }, + { + "epoch": 0.15100152250451992, + "grad_norm": 2.323294162750244, + "learning_rate": 4.723955565067788e-05, + "loss": 4.2044, + "step": 25390 + }, + { + "epoch": 0.15100746978780094, + "grad_norm": 2.0671331882476807, + "learning_rate": 4.723934228783007e-05, + "loss": 4.2368, + "step": 25391 + }, + { + "epoch": 0.15101341707108193, + "grad_norm": 2.4726204872131348, + "learning_rate": 4.723912891721874e-05, + "loss": 3.9728, + "step": 25392 + }, + { + "epoch": 0.15101936435436292, + "grad_norm": 2.278228998184204, + "learning_rate": 4.7238915538843954e-05, + "loss": 4.0742, + "step": 25393 + }, + { + "epoch": 0.15102531163764393, + "grad_norm": 2.3213517665863037, + "learning_rate": 4.7238702152705794e-05, + "loss": 4.2124, + "step": 25394 + }, + { + "epoch": 0.15103125892092492, + "grad_norm": 1.7494871616363525, + "learning_rate": 4.7238488758804334e-05, + "loss": 5.1252, + "step": 25395 + }, + { + "epoch": 0.1510372062042059, + "grad_norm": 1.8289192914962769, + "learning_rate": 4.723827535713965e-05, + "loss": 4.9194, + "step": 25396 + }, + { + "epoch": 0.15104315348748693, + "grad_norm": 1.7058460712432861, + "learning_rate": 4.723806194771181e-05, + "loss": 5.2878, + "step": 25397 + }, + { + "epoch": 0.1510491007707679, + "grad_norm": 2.0224595069885254, + "learning_rate": 4.723784853052089e-05, + "loss": 4.4899, + "step": 25398 + }, + { + "epoch": 0.1510550480540489, + "grad_norm": 2.4246976375579834, + "learning_rate": 4.723763510556697e-05, + "loss": 3.9646, + "step": 25399 + }, + { + "epoch": 0.15106099533732992, + "grad_norm": 2.473158597946167, + "learning_rate": 4.723742167285012e-05, + "loss": 4.1942, + "step": 25400 + }, + { + "epoch": 0.1510669426206109, + "grad_norm": 3.9526100158691406, + "learning_rate": 4.723720823237041e-05, + "loss": 3.6103, + "step": 25401 + }, + { + "epoch": 0.1510728899038919, + "grad_norm": 3.6537516117095947, + "learning_rate": 4.723699478412793e-05, + "loss": 4.2312, + "step": 25402 + }, + { + "epoch": 0.1510788371871729, + "grad_norm": 1.5094470977783203, + "learning_rate": 4.7236781328122745e-05, + "loss": 5.577, + "step": 25403 + }, + { + "epoch": 0.1510847844704539, + "grad_norm": 1.7783223390579224, + "learning_rate": 4.7236567864354924e-05, + "loss": 5.6923, + "step": 25404 + }, + { + "epoch": 0.15109073175373489, + "grad_norm": 1.8453465700149536, + "learning_rate": 4.723635439282455e-05, + "loss": 5.3975, + "step": 25405 + }, + { + "epoch": 0.1510966790370159, + "grad_norm": 1.7783082723617554, + "learning_rate": 4.723614091353169e-05, + "loss": 5.2236, + "step": 25406 + }, + { + "epoch": 0.1511026263202969, + "grad_norm": 1.6507834196090698, + "learning_rate": 4.723592742647643e-05, + "loss": 5.3565, + "step": 25407 + }, + { + "epoch": 0.15110857360357788, + "grad_norm": 1.4875059127807617, + "learning_rate": 4.723571393165883e-05, + "loss": 5.5752, + "step": 25408 + }, + { + "epoch": 0.1511145208868589, + "grad_norm": 1.6694411039352417, + "learning_rate": 4.7235500429078985e-05, + "loss": 5.4707, + "step": 25409 + }, + { + "epoch": 0.15112046817013988, + "grad_norm": 1.7157987356185913, + "learning_rate": 4.723528691873694e-05, + "loss": 5.3777, + "step": 25410 + }, + { + "epoch": 0.15112641545342087, + "grad_norm": 2.611750602722168, + "learning_rate": 4.72350734006328e-05, + "loss": 3.1969, + "step": 25411 + }, + { + "epoch": 0.1511323627367019, + "grad_norm": 2.0207319259643555, + "learning_rate": 4.7234859874766614e-05, + "loss": 4.8871, + "step": 25412 + }, + { + "epoch": 0.15113831001998287, + "grad_norm": 2.598403215408325, + "learning_rate": 4.723464634113847e-05, + "loss": 4.9404, + "step": 25413 + }, + { + "epoch": 0.15114425730326386, + "grad_norm": 1.764269232749939, + "learning_rate": 4.723443279974845e-05, + "loss": 5.2649, + "step": 25414 + }, + { + "epoch": 0.15115020458654488, + "grad_norm": 1.8783745765686035, + "learning_rate": 4.723421925059661e-05, + "loss": 4.8755, + "step": 25415 + }, + { + "epoch": 0.15115615186982587, + "grad_norm": 1.497833251953125, + "learning_rate": 4.7234005693683035e-05, + "loss": 5.0806, + "step": 25416 + }, + { + "epoch": 0.15116209915310685, + "grad_norm": 1.6030247211456299, + "learning_rate": 4.72337921290078e-05, + "loss": 5.0388, + "step": 25417 + }, + { + "epoch": 0.15116804643638787, + "grad_norm": 1.7181298732757568, + "learning_rate": 4.723357855657098e-05, + "loss": 4.8316, + "step": 25418 + }, + { + "epoch": 0.15117399371966886, + "grad_norm": 1.4665559530258179, + "learning_rate": 4.7233364976372644e-05, + "loss": 5.5005, + "step": 25419 + }, + { + "epoch": 0.15117994100294985, + "grad_norm": 3.3794503211975098, + "learning_rate": 4.723315138841287e-05, + "loss": 3.9864, + "step": 25420 + }, + { + "epoch": 0.15118588828623086, + "grad_norm": 1.7290079593658447, + "learning_rate": 4.723293779269173e-05, + "loss": 5.3736, + "step": 25421 + }, + { + "epoch": 0.15119183556951185, + "grad_norm": 1.995943307876587, + "learning_rate": 4.723272418920931e-05, + "loss": 4.8142, + "step": 25422 + }, + { + "epoch": 0.15119778285279284, + "grad_norm": 1.8627694845199585, + "learning_rate": 4.7232510577965674e-05, + "loss": 5.2348, + "step": 25423 + }, + { + "epoch": 0.15120373013607386, + "grad_norm": 1.5469872951507568, + "learning_rate": 4.72322969589609e-05, + "loss": 5.1102, + "step": 25424 + }, + { + "epoch": 0.15120967741935484, + "grad_norm": 1.503350853919983, + "learning_rate": 4.723208333219505e-05, + "loss": 5.2009, + "step": 25425 + }, + { + "epoch": 0.15121562470263583, + "grad_norm": 1.5141102075576782, + "learning_rate": 4.7231869697668214e-05, + "loss": 5.4231, + "step": 25426 + }, + { + "epoch": 0.15122157198591682, + "grad_norm": 1.5022274255752563, + "learning_rate": 4.723165605538046e-05, + "loss": 5.1454, + "step": 25427 + }, + { + "epoch": 0.15122751926919784, + "grad_norm": 1.2774550914764404, + "learning_rate": 4.7231442405331874e-05, + "loss": 5.4048, + "step": 25428 + }, + { + "epoch": 0.15123346655247882, + "grad_norm": 1.4588242769241333, + "learning_rate": 4.723122874752251e-05, + "loss": 5.1466, + "step": 25429 + }, + { + "epoch": 0.1512394138357598, + "grad_norm": 1.6666613817214966, + "learning_rate": 4.7231015081952454e-05, + "loss": 5.6505, + "step": 25430 + }, + { + "epoch": 0.15124536111904083, + "grad_norm": 3.1419155597686768, + "learning_rate": 4.72308014086218e-05, + "loss": 5.1714, + "step": 25431 + }, + { + "epoch": 0.15125130840232182, + "grad_norm": 1.8372479677200317, + "learning_rate": 4.723058772753058e-05, + "loss": 5.3135, + "step": 25432 + }, + { + "epoch": 0.1512572556856028, + "grad_norm": 1.4300392866134644, + "learning_rate": 4.7230374038678895e-05, + "loss": 5.4404, + "step": 25433 + }, + { + "epoch": 0.15126320296888382, + "grad_norm": 1.4411662817001343, + "learning_rate": 4.723016034206682e-05, + "loss": 5.4341, + "step": 25434 + }, + { + "epoch": 0.1512691502521648, + "grad_norm": 1.4989326000213623, + "learning_rate": 4.7229946637694425e-05, + "loss": 5.3632, + "step": 25435 + }, + { + "epoch": 0.1512750975354458, + "grad_norm": 1.2930675745010376, + "learning_rate": 4.7229732925561785e-05, + "loss": 5.1667, + "step": 25436 + }, + { + "epoch": 0.1512810448187268, + "grad_norm": 1.6399480104446411, + "learning_rate": 4.722951920566898e-05, + "loss": 5.0464, + "step": 25437 + }, + { + "epoch": 0.1512869921020078, + "grad_norm": 1.6308560371398926, + "learning_rate": 4.722930547801608e-05, + "loss": 5.416, + "step": 25438 + }, + { + "epoch": 0.1512929393852888, + "grad_norm": 1.8431388139724731, + "learning_rate": 4.722909174260316e-05, + "loss": 5.6069, + "step": 25439 + }, + { + "epoch": 0.1512988866685698, + "grad_norm": 1.964154601097107, + "learning_rate": 4.722887799943028e-05, + "loss": 5.845, + "step": 25440 + }, + { + "epoch": 0.1513048339518508, + "grad_norm": 1.731370210647583, + "learning_rate": 4.722866424849753e-05, + "loss": 5.3155, + "step": 25441 + }, + { + "epoch": 0.15131078123513178, + "grad_norm": 1.9794760942459106, + "learning_rate": 4.7228450489805e-05, + "loss": 4.8395, + "step": 25442 + }, + { + "epoch": 0.1513167285184128, + "grad_norm": 2.016857862472534, + "learning_rate": 4.7228236723352735e-05, + "loss": 4.5546, + "step": 25443 + }, + { + "epoch": 0.15132267580169378, + "grad_norm": 1.9085549116134644, + "learning_rate": 4.722802294914083e-05, + "loss": 4.7848, + "step": 25444 + }, + { + "epoch": 0.15132862308497477, + "grad_norm": 1.5769025087356567, + "learning_rate": 4.7227809167169345e-05, + "loss": 5.1207, + "step": 25445 + }, + { + "epoch": 0.1513345703682558, + "grad_norm": 1.4327126741409302, + "learning_rate": 4.7227595377438364e-05, + "loss": 5.323, + "step": 25446 + }, + { + "epoch": 0.15134051765153678, + "grad_norm": 1.536750316619873, + "learning_rate": 4.722738157994796e-05, + "loss": 4.812, + "step": 25447 + }, + { + "epoch": 0.15134646493481776, + "grad_norm": 1.6312404870986938, + "learning_rate": 4.72271677746982e-05, + "loss": 4.8753, + "step": 25448 + }, + { + "epoch": 0.15135241221809878, + "grad_norm": 1.3323699235916138, + "learning_rate": 4.722695396168917e-05, + "loss": 5.6005, + "step": 25449 + }, + { + "epoch": 0.15135835950137977, + "grad_norm": 1.5522531270980835, + "learning_rate": 4.722674014092094e-05, + "loss": 5.3848, + "step": 25450 + }, + { + "epoch": 0.15136430678466076, + "grad_norm": 1.5421935319900513, + "learning_rate": 4.722652631239358e-05, + "loss": 5.4136, + "step": 25451 + }, + { + "epoch": 0.15137025406794177, + "grad_norm": 1.564570665359497, + "learning_rate": 4.722631247610718e-05, + "loss": 5.3169, + "step": 25452 + }, + { + "epoch": 0.15137620135122276, + "grad_norm": 1.7175198793411255, + "learning_rate": 4.72260986320618e-05, + "loss": 4.5904, + "step": 25453 + }, + { + "epoch": 0.15138214863450375, + "grad_norm": 1.5852707624435425, + "learning_rate": 4.722588478025751e-05, + "loss": 4.8459, + "step": 25454 + }, + { + "epoch": 0.15138809591778477, + "grad_norm": 1.4209281206130981, + "learning_rate": 4.7225670920694404e-05, + "loss": 5.4134, + "step": 25455 + }, + { + "epoch": 0.15139404320106575, + "grad_norm": 1.4841557741165161, + "learning_rate": 4.722545705337254e-05, + "loss": 5.0996, + "step": 25456 + }, + { + "epoch": 0.15139999048434674, + "grad_norm": 1.4958367347717285, + "learning_rate": 4.7225243178292e-05, + "loss": 4.5363, + "step": 25457 + }, + { + "epoch": 0.15140593776762776, + "grad_norm": 1.6424293518066406, + "learning_rate": 4.722502929545286e-05, + "loss": 5.0227, + "step": 25458 + }, + { + "epoch": 0.15141188505090875, + "grad_norm": 1.687121868133545, + "learning_rate": 4.722481540485519e-05, + "loss": 4.9662, + "step": 25459 + }, + { + "epoch": 0.15141783233418973, + "grad_norm": 1.6748243570327759, + "learning_rate": 4.722460150649907e-05, + "loss": 4.4443, + "step": 25460 + }, + { + "epoch": 0.15142377961747075, + "grad_norm": 2.2483417987823486, + "learning_rate": 4.722438760038456e-05, + "loss": 4.8411, + "step": 25461 + }, + { + "epoch": 0.15142972690075174, + "grad_norm": 1.6556822061538696, + "learning_rate": 4.7224173686511754e-05, + "loss": 5.1596, + "step": 25462 + }, + { + "epoch": 0.15143567418403273, + "grad_norm": 1.6137731075286865, + "learning_rate": 4.722395976488072e-05, + "loss": 4.6538, + "step": 25463 + }, + { + "epoch": 0.15144162146731374, + "grad_norm": 1.7086783647537231, + "learning_rate": 4.722374583549153e-05, + "loss": 5.2168, + "step": 25464 + }, + { + "epoch": 0.15144756875059473, + "grad_norm": 1.668527603149414, + "learning_rate": 4.7223531898344256e-05, + "loss": 5.138, + "step": 25465 + }, + { + "epoch": 0.15145351603387572, + "grad_norm": 2.2906320095062256, + "learning_rate": 4.722331795343899e-05, + "loss": 4.6954, + "step": 25466 + }, + { + "epoch": 0.15145946331715673, + "grad_norm": 2.410048246383667, + "learning_rate": 4.722310400077578e-05, + "loss": 4.5377, + "step": 25467 + }, + { + "epoch": 0.15146541060043772, + "grad_norm": 1.7885384559631348, + "learning_rate": 4.722289004035471e-05, + "loss": 4.8978, + "step": 25468 + }, + { + "epoch": 0.1514713578837187, + "grad_norm": 1.5193252563476562, + "learning_rate": 4.7222676072175866e-05, + "loss": 5.2818, + "step": 25469 + }, + { + "epoch": 0.15147730516699973, + "grad_norm": 2.0139195919036865, + "learning_rate": 4.7222462096239314e-05, + "loss": 4.1632, + "step": 25470 + }, + { + "epoch": 0.15148325245028071, + "grad_norm": 2.007025718688965, + "learning_rate": 4.7222248112545133e-05, + "loss": 4.0832, + "step": 25471 + }, + { + "epoch": 0.1514891997335617, + "grad_norm": 2.2270402908325195, + "learning_rate": 4.722203412109339e-05, + "loss": 4.2317, + "step": 25472 + }, + { + "epoch": 0.15149514701684272, + "grad_norm": 2.0418808460235596, + "learning_rate": 4.722182012188417e-05, + "loss": 4.1849, + "step": 25473 + }, + { + "epoch": 0.1515010943001237, + "grad_norm": 2.087785243988037, + "learning_rate": 4.722160611491754e-05, + "loss": 4.1218, + "step": 25474 + }, + { + "epoch": 0.1515070415834047, + "grad_norm": 2.303571939468384, + "learning_rate": 4.7221392100193575e-05, + "loss": 3.9614, + "step": 25475 + }, + { + "epoch": 0.1515129888666857, + "grad_norm": 1.9516772031784058, + "learning_rate": 4.722117807771235e-05, + "loss": 3.9619, + "step": 25476 + }, + { + "epoch": 0.1515189361499667, + "grad_norm": 1.9611634016036987, + "learning_rate": 4.722096404747395e-05, + "loss": 3.9133, + "step": 25477 + }, + { + "epoch": 0.1515248834332477, + "grad_norm": 1.9254827499389648, + "learning_rate": 4.722075000947843e-05, + "loss": 3.877, + "step": 25478 + }, + { + "epoch": 0.1515308307165287, + "grad_norm": 1.803846001625061, + "learning_rate": 4.722053596372588e-05, + "loss": 3.8338, + "step": 25479 + }, + { + "epoch": 0.1515367779998097, + "grad_norm": 1.829439401626587, + "learning_rate": 4.722032191021637e-05, + "loss": 3.8183, + "step": 25480 + }, + { + "epoch": 0.15154272528309068, + "grad_norm": 1.7955585718154907, + "learning_rate": 4.722010784894998e-05, + "loss": 4.6821, + "step": 25481 + }, + { + "epoch": 0.1515486725663717, + "grad_norm": 2.9624781608581543, + "learning_rate": 4.7219893779926775e-05, + "loss": 3.9385, + "step": 25482 + }, + { + "epoch": 0.15155461984965268, + "grad_norm": 1.8687463998794556, + "learning_rate": 4.721967970314684e-05, + "loss": 4.0364, + "step": 25483 + }, + { + "epoch": 0.15156056713293367, + "grad_norm": 1.9090644121170044, + "learning_rate": 4.721946561861024e-05, + "loss": 3.8046, + "step": 25484 + }, + { + "epoch": 0.15156651441621466, + "grad_norm": 1.9757955074310303, + "learning_rate": 4.721925152631706e-05, + "loss": 3.943, + "step": 25485 + }, + { + "epoch": 0.15157246169949568, + "grad_norm": 1.9161666631698608, + "learning_rate": 4.7219037426267356e-05, + "loss": 3.8818, + "step": 25486 + }, + { + "epoch": 0.15157840898277666, + "grad_norm": 1.8484982252120972, + "learning_rate": 4.7218823318461226e-05, + "loss": 4.0713, + "step": 25487 + }, + { + "epoch": 0.15158435626605765, + "grad_norm": 1.6787267923355103, + "learning_rate": 4.7218609202898726e-05, + "loss": 5.7814, + "step": 25488 + }, + { + "epoch": 0.15159030354933867, + "grad_norm": 1.6946018934249878, + "learning_rate": 4.7218395079579946e-05, + "loss": 5.9241, + "step": 25489 + }, + { + "epoch": 0.15159625083261966, + "grad_norm": 1.5210212469100952, + "learning_rate": 4.721818094850495e-05, + "loss": 6.0828, + "step": 25490 + }, + { + "epoch": 0.15160219811590064, + "grad_norm": 1.7792625427246094, + "learning_rate": 4.721796680967382e-05, + "loss": 6.241, + "step": 25491 + }, + { + "epoch": 0.15160814539918166, + "grad_norm": 1.5366078615188599, + "learning_rate": 4.7217752663086626e-05, + "loss": 5.7111, + "step": 25492 + }, + { + "epoch": 0.15161409268246265, + "grad_norm": 1.5193569660186768, + "learning_rate": 4.721753850874344e-05, + "loss": 5.3155, + "step": 25493 + }, + { + "epoch": 0.15162003996574364, + "grad_norm": 1.9060078859329224, + "learning_rate": 4.7217324346644356e-05, + "loss": 5.368, + "step": 25494 + }, + { + "epoch": 0.15162598724902465, + "grad_norm": 1.4217309951782227, + "learning_rate": 4.7217110176789416e-05, + "loss": 5.4781, + "step": 25495 + }, + { + "epoch": 0.15163193453230564, + "grad_norm": 1.561132550239563, + "learning_rate": 4.7216895999178725e-05, + "loss": 5.3316, + "step": 25496 + }, + { + "epoch": 0.15163788181558663, + "grad_norm": 1.397314429283142, + "learning_rate": 4.7216681813812335e-05, + "loss": 5.4047, + "step": 25497 + }, + { + "epoch": 0.15164382909886764, + "grad_norm": 1.3138307332992554, + "learning_rate": 4.7216467620690335e-05, + "loss": 5.3706, + "step": 25498 + }, + { + "epoch": 0.15164977638214863, + "grad_norm": 1.4298443794250488, + "learning_rate": 4.7216253419812794e-05, + "loss": 5.3704, + "step": 25499 + }, + { + "epoch": 0.15165572366542962, + "grad_norm": 1.703792929649353, + "learning_rate": 4.72160392111798e-05, + "loss": 5.2468, + "step": 25500 + }, + { + "epoch": 0.15166167094871064, + "grad_norm": 1.566309928894043, + "learning_rate": 4.72158249947914e-05, + "loss": 5.5153, + "step": 25501 + }, + { + "epoch": 0.15166761823199162, + "grad_norm": 1.3141274452209473, + "learning_rate": 4.721561077064769e-05, + "loss": 5.6254, + "step": 25502 + }, + { + "epoch": 0.1516735655152726, + "grad_norm": 1.4979000091552734, + "learning_rate": 4.721539653874874e-05, + "loss": 5.4936, + "step": 25503 + }, + { + "epoch": 0.15167951279855363, + "grad_norm": 1.694068193435669, + "learning_rate": 4.721518229909463e-05, + "loss": 5.6601, + "step": 25504 + }, + { + "epoch": 0.15168546008183462, + "grad_norm": 1.8887871503829956, + "learning_rate": 4.721496805168543e-05, + "loss": 4.8596, + "step": 25505 + }, + { + "epoch": 0.1516914073651156, + "grad_norm": 2.5169517993927, + "learning_rate": 4.721475379652121e-05, + "loss": 4.0797, + "step": 25506 + }, + { + "epoch": 0.15169735464839662, + "grad_norm": 2.4206509590148926, + "learning_rate": 4.7214539533602046e-05, + "loss": 3.9878, + "step": 25507 + }, + { + "epoch": 0.1517033019316776, + "grad_norm": 2.054685354232788, + "learning_rate": 4.7214325262928013e-05, + "loss": 3.948, + "step": 25508 + }, + { + "epoch": 0.1517092492149586, + "grad_norm": 1.4626624584197998, + "learning_rate": 4.721411098449919e-05, + "loss": 5.4617, + "step": 25509 + }, + { + "epoch": 0.1517151964982396, + "grad_norm": 1.7592542171478271, + "learning_rate": 4.721389669831566e-05, + "loss": 5.4125, + "step": 25510 + }, + { + "epoch": 0.1517211437815206, + "grad_norm": 1.669419288635254, + "learning_rate": 4.721368240437748e-05, + "loss": 5.4718, + "step": 25511 + }, + { + "epoch": 0.1517270910648016, + "grad_norm": 1.0741300582885742, + "learning_rate": 4.721346810268473e-05, + "loss": 5.5668, + "step": 25512 + }, + { + "epoch": 0.1517330383480826, + "grad_norm": 1.41902494430542, + "learning_rate": 4.72132537932375e-05, + "loss": 5.5451, + "step": 25513 + }, + { + "epoch": 0.1517389856313636, + "grad_norm": 1.7693331241607666, + "learning_rate": 4.721303947603584e-05, + "loss": 5.7588, + "step": 25514 + }, + { + "epoch": 0.15174493291464458, + "grad_norm": 1.7695659399032593, + "learning_rate": 4.7212825151079844e-05, + "loss": 5.6659, + "step": 25515 + }, + { + "epoch": 0.1517508801979256, + "grad_norm": 1.5901025533676147, + "learning_rate": 4.7212610818369586e-05, + "loss": 5.3805, + "step": 25516 + }, + { + "epoch": 0.15175682748120659, + "grad_norm": 1.8363381624221802, + "learning_rate": 4.721239647790512e-05, + "loss": 5.808, + "step": 25517 + }, + { + "epoch": 0.15176277476448757, + "grad_norm": 1.7976000308990479, + "learning_rate": 4.721218212968655e-05, + "loss": 5.7034, + "step": 25518 + }, + { + "epoch": 0.1517687220477686, + "grad_norm": 1.7203330993652344, + "learning_rate": 4.721196777371393e-05, + "loss": 5.4174, + "step": 25519 + }, + { + "epoch": 0.15177466933104958, + "grad_norm": 1.6678218841552734, + "learning_rate": 4.7211753409987344e-05, + "loss": 5.4002, + "step": 25520 + }, + { + "epoch": 0.15178061661433057, + "grad_norm": 1.3932818174362183, + "learning_rate": 4.721153903850686e-05, + "loss": 5.7598, + "step": 25521 + }, + { + "epoch": 0.15178656389761158, + "grad_norm": 1.4975392818450928, + "learning_rate": 4.721132465927256e-05, + "loss": 5.2991, + "step": 25522 + }, + { + "epoch": 0.15179251118089257, + "grad_norm": 1.5375689268112183, + "learning_rate": 4.721111027228452e-05, + "loss": 5.7456, + "step": 25523 + }, + { + "epoch": 0.15179845846417356, + "grad_norm": 1.6894830465316772, + "learning_rate": 4.72108958775428e-05, + "loss": 5.1867, + "step": 25524 + }, + { + "epoch": 0.15180440574745457, + "grad_norm": 1.569059133529663, + "learning_rate": 4.72106814750475e-05, + "loss": 5.4544, + "step": 25525 + }, + { + "epoch": 0.15181035303073556, + "grad_norm": 1.5884952545166016, + "learning_rate": 4.721046706479867e-05, + "loss": 5.1496, + "step": 25526 + }, + { + "epoch": 0.15181630031401655, + "grad_norm": 1.552410364151001, + "learning_rate": 4.721025264679639e-05, + "loss": 5.0916, + "step": 25527 + }, + { + "epoch": 0.15182224759729757, + "grad_norm": 1.5972039699554443, + "learning_rate": 4.721003822104076e-05, + "loss": 5.2073, + "step": 25528 + }, + { + "epoch": 0.15182819488057855, + "grad_norm": 1.6742616891860962, + "learning_rate": 4.720982378753182e-05, + "loss": 5.4851, + "step": 25529 + }, + { + "epoch": 0.15183414216385954, + "grad_norm": 1.4974780082702637, + "learning_rate": 4.7209609346269665e-05, + "loss": 5.4444, + "step": 25530 + }, + { + "epoch": 0.15184008944714056, + "grad_norm": 1.5599150657653809, + "learning_rate": 4.7209394897254363e-05, + "loss": 4.8842, + "step": 25531 + }, + { + "epoch": 0.15184603673042155, + "grad_norm": 1.3979945182800293, + "learning_rate": 4.7209180440485986e-05, + "loss": 5.2836, + "step": 25532 + }, + { + "epoch": 0.15185198401370253, + "grad_norm": 1.3515275716781616, + "learning_rate": 4.720896597596462e-05, + "loss": 5.3011, + "step": 25533 + }, + { + "epoch": 0.15185793129698355, + "grad_norm": 1.7592774629592896, + "learning_rate": 4.720875150369034e-05, + "loss": 5.0874, + "step": 25534 + }, + { + "epoch": 0.15186387858026454, + "grad_norm": 1.5977163314819336, + "learning_rate": 4.72085370236632e-05, + "loss": 4.7678, + "step": 25535 + }, + { + "epoch": 0.15186982586354553, + "grad_norm": 1.3309252262115479, + "learning_rate": 4.7208322535883295e-05, + "loss": 4.9821, + "step": 25536 + }, + { + "epoch": 0.15187577314682654, + "grad_norm": 1.5985299348831177, + "learning_rate": 4.720810804035069e-05, + "loss": 5.1845, + "step": 25537 + }, + { + "epoch": 0.15188172043010753, + "grad_norm": 1.6021031141281128, + "learning_rate": 4.7207893537065475e-05, + "loss": 5.1628, + "step": 25538 + }, + { + "epoch": 0.15188766771338852, + "grad_norm": 1.6445283889770508, + "learning_rate": 4.7207679026027704e-05, + "loss": 4.7933, + "step": 25539 + }, + { + "epoch": 0.15189361499666953, + "grad_norm": 1.6480634212493896, + "learning_rate": 4.7207464507237474e-05, + "loss": 4.7912, + "step": 25540 + }, + { + "epoch": 0.15189956227995052, + "grad_norm": 1.7439652681350708, + "learning_rate": 4.720724998069483e-05, + "loss": 4.5412, + "step": 25541 + }, + { + "epoch": 0.1519055095632315, + "grad_norm": 1.5786992311477661, + "learning_rate": 4.720703544639988e-05, + "loss": 4.8873, + "step": 25542 + }, + { + "epoch": 0.1519114568465125, + "grad_norm": 1.3782871961593628, + "learning_rate": 4.7206820904352675e-05, + "loss": 4.5825, + "step": 25543 + }, + { + "epoch": 0.15191740412979352, + "grad_norm": 1.8048298358917236, + "learning_rate": 4.72066063545533e-05, + "loss": 4.746, + "step": 25544 + }, + { + "epoch": 0.1519233514130745, + "grad_norm": 1.4801894426345825, + "learning_rate": 4.7206391797001826e-05, + "loss": 4.8802, + "step": 25545 + }, + { + "epoch": 0.1519292986963555, + "grad_norm": 1.7984564304351807, + "learning_rate": 4.7206177231698333e-05, + "loss": 4.7674, + "step": 25546 + }, + { + "epoch": 0.1519352459796365, + "grad_norm": 1.7244421243667603, + "learning_rate": 4.72059626586429e-05, + "loss": 5.2729, + "step": 25547 + }, + { + "epoch": 0.1519411932629175, + "grad_norm": 1.2454429864883423, + "learning_rate": 4.7205748077835584e-05, + "loss": 4.9657, + "step": 25548 + }, + { + "epoch": 0.15194714054619848, + "grad_norm": 1.5179264545440674, + "learning_rate": 4.720553348927647e-05, + "loss": 5.2248, + "step": 25549 + }, + { + "epoch": 0.1519530878294795, + "grad_norm": 1.6204310655593872, + "learning_rate": 4.7205318892965636e-05, + "loss": 4.7349, + "step": 25550 + }, + { + "epoch": 0.1519590351127605, + "grad_norm": 1.6427180767059326, + "learning_rate": 4.7205104288903156e-05, + "loss": 4.9733, + "step": 25551 + }, + { + "epoch": 0.15196498239604148, + "grad_norm": 1.7110134363174438, + "learning_rate": 4.7204889677089104e-05, + "loss": 5.1714, + "step": 25552 + }, + { + "epoch": 0.1519709296793225, + "grad_norm": 1.6110901832580566, + "learning_rate": 4.7204675057523556e-05, + "loss": 5.409, + "step": 25553 + }, + { + "epoch": 0.15197687696260348, + "grad_norm": 1.7748627662658691, + "learning_rate": 4.720446043020658e-05, + "loss": 5.443, + "step": 25554 + }, + { + "epoch": 0.15198282424588447, + "grad_norm": 1.574576497077942, + "learning_rate": 4.720424579513826e-05, + "loss": 4.9988, + "step": 25555 + }, + { + "epoch": 0.15198877152916548, + "grad_norm": 1.4916949272155762, + "learning_rate": 4.720403115231867e-05, + "loss": 4.9242, + "step": 25556 + }, + { + "epoch": 0.15199471881244647, + "grad_norm": 1.4862215518951416, + "learning_rate": 4.7203816501747875e-05, + "loss": 5.2778, + "step": 25557 + }, + { + "epoch": 0.15200066609572746, + "grad_norm": 1.445859670639038, + "learning_rate": 4.720360184342597e-05, + "loss": 5.6821, + "step": 25558 + }, + { + "epoch": 0.15200661337900848, + "grad_norm": 1.5154931545257568, + "learning_rate": 4.7203387177353006e-05, + "loss": 5.1821, + "step": 25559 + }, + { + "epoch": 0.15201256066228946, + "grad_norm": 1.1950480937957764, + "learning_rate": 4.720317250352907e-05, + "loss": 5.55, + "step": 25560 + }, + { + "epoch": 0.15201850794557045, + "grad_norm": 1.4134416580200195, + "learning_rate": 4.720295782195423e-05, + "loss": 5.7252, + "step": 25561 + }, + { + "epoch": 0.15202445522885147, + "grad_norm": 1.5440611839294434, + "learning_rate": 4.720274313262858e-05, + "loss": 5.5527, + "step": 25562 + }, + { + "epoch": 0.15203040251213246, + "grad_norm": 1.3670108318328857, + "learning_rate": 4.720252843555217e-05, + "loss": 5.459, + "step": 25563 + }, + { + "epoch": 0.15203634979541344, + "grad_norm": 1.4591896533966064, + "learning_rate": 4.7202313730725094e-05, + "loss": 5.4654, + "step": 25564 + }, + { + "epoch": 0.15204229707869446, + "grad_norm": 1.675755500793457, + "learning_rate": 4.7202099018147414e-05, + "loss": 5.4915, + "step": 25565 + }, + { + "epoch": 0.15204824436197545, + "grad_norm": 1.9771230220794678, + "learning_rate": 4.720188429781922e-05, + "loss": 4.8577, + "step": 25566 + }, + { + "epoch": 0.15205419164525644, + "grad_norm": 1.3904792070388794, + "learning_rate": 4.720166956974057e-05, + "loss": 5.4445, + "step": 25567 + }, + { + "epoch": 0.15206013892853745, + "grad_norm": 1.4478521347045898, + "learning_rate": 4.720145483391155e-05, + "loss": 5.1729, + "step": 25568 + }, + { + "epoch": 0.15206608621181844, + "grad_norm": 2.138211250305176, + "learning_rate": 4.720124009033223e-05, + "loss": 4.0202, + "step": 25569 + }, + { + "epoch": 0.15207203349509943, + "grad_norm": 2.1613049507141113, + "learning_rate": 4.720102533900268e-05, + "loss": 4.0708, + "step": 25570 + }, + { + "epoch": 0.15207798077838044, + "grad_norm": 2.3467164039611816, + "learning_rate": 4.7200810579922996e-05, + "loss": 4.0428, + "step": 25571 + }, + { + "epoch": 0.15208392806166143, + "grad_norm": 2.0889739990234375, + "learning_rate": 4.720059581309323e-05, + "loss": 4.1653, + "step": 25572 + }, + { + "epoch": 0.15208987534494242, + "grad_norm": 1.611956238746643, + "learning_rate": 4.720038103851346e-05, + "loss": 5.3328, + "step": 25573 + }, + { + "epoch": 0.15209582262822344, + "grad_norm": 1.3318549394607544, + "learning_rate": 4.7200166256183776e-05, + "loss": 5.4102, + "step": 25574 + }, + { + "epoch": 0.15210176991150443, + "grad_norm": 1.674455165863037, + "learning_rate": 4.7199951466104234e-05, + "loss": 5.21, + "step": 25575 + }, + { + "epoch": 0.1521077171947854, + "grad_norm": 1.4780274629592896, + "learning_rate": 4.7199736668274924e-05, + "loss": 5.3385, + "step": 25576 + }, + { + "epoch": 0.15211366447806643, + "grad_norm": 1.7735114097595215, + "learning_rate": 4.719952186269592e-05, + "loss": 4.8768, + "step": 25577 + }, + { + "epoch": 0.15211961176134742, + "grad_norm": 1.6420248746871948, + "learning_rate": 4.719930704936728e-05, + "loss": 5.2584, + "step": 25578 + }, + { + "epoch": 0.1521255590446284, + "grad_norm": 1.970648169517517, + "learning_rate": 4.71990922282891e-05, + "loss": 4.4764, + "step": 25579 + }, + { + "epoch": 0.15213150632790942, + "grad_norm": 1.4318586587905884, + "learning_rate": 4.719887739946145e-05, + "loss": 5.5169, + "step": 25580 + }, + { + "epoch": 0.1521374536111904, + "grad_norm": 1.7637288570404053, + "learning_rate": 4.719866256288439e-05, + "loss": 5.1493, + "step": 25581 + }, + { + "epoch": 0.1521434008944714, + "grad_norm": 1.7159098386764526, + "learning_rate": 4.719844771855801e-05, + "loss": 5.3964, + "step": 25582 + }, + { + "epoch": 0.1521493481777524, + "grad_norm": 1.6556905508041382, + "learning_rate": 4.719823286648238e-05, + "loss": 5.3116, + "step": 25583 + }, + { + "epoch": 0.1521552954610334, + "grad_norm": 1.5177308320999146, + "learning_rate": 4.7198018006657584e-05, + "loss": 5.8963, + "step": 25584 + }, + { + "epoch": 0.1521612427443144, + "grad_norm": 1.960729718208313, + "learning_rate": 4.719780313908368e-05, + "loss": 5.266, + "step": 25585 + }, + { + "epoch": 0.1521671900275954, + "grad_norm": 1.6893891096115112, + "learning_rate": 4.719758826376076e-05, + "loss": 5.3618, + "step": 25586 + }, + { + "epoch": 0.1521731373108764, + "grad_norm": 1.5606249570846558, + "learning_rate": 4.719737338068889e-05, + "loss": 5.8684, + "step": 25587 + }, + { + "epoch": 0.15217908459415738, + "grad_norm": 1.6435186862945557, + "learning_rate": 4.7197158489868143e-05, + "loss": 4.9082, + "step": 25588 + }, + { + "epoch": 0.1521850318774384, + "grad_norm": 1.9077845811843872, + "learning_rate": 4.71969435912986e-05, + "loss": 4.0132, + "step": 25589 + }, + { + "epoch": 0.1521909791607194, + "grad_norm": 1.4427006244659424, + "learning_rate": 4.719672868498034e-05, + "loss": 5.5848, + "step": 25590 + }, + { + "epoch": 0.15219692644400037, + "grad_norm": 1.671826958656311, + "learning_rate": 4.719651377091342e-05, + "loss": 5.0797, + "step": 25591 + }, + { + "epoch": 0.1522028737272814, + "grad_norm": 1.8073980808258057, + "learning_rate": 4.719629884909793e-05, + "loss": 3.8879, + "step": 25592 + }, + { + "epoch": 0.15220882101056238, + "grad_norm": 1.8267574310302734, + "learning_rate": 4.719608391953394e-05, + "loss": 3.8104, + "step": 25593 + }, + { + "epoch": 0.15221476829384337, + "grad_norm": 1.8598294258117676, + "learning_rate": 4.7195868982221526e-05, + "loss": 3.6587, + "step": 25594 + }, + { + "epoch": 0.15222071557712438, + "grad_norm": 1.705465316772461, + "learning_rate": 4.7195654037160765e-05, + "loss": 3.9886, + "step": 25595 + }, + { + "epoch": 0.15222666286040537, + "grad_norm": 1.8253175020217896, + "learning_rate": 4.7195439084351734e-05, + "loss": 3.9031, + "step": 25596 + }, + { + "epoch": 0.15223261014368636, + "grad_norm": 1.718245506286621, + "learning_rate": 4.71952241237945e-05, + "loss": 4.2814, + "step": 25597 + }, + { + "epoch": 0.15223855742696737, + "grad_norm": 1.7115817070007324, + "learning_rate": 4.719500915548914e-05, + "loss": 4.748, + "step": 25598 + }, + { + "epoch": 0.15224450471024836, + "grad_norm": 1.53532874584198, + "learning_rate": 4.719479417943574e-05, + "loss": 5.499, + "step": 25599 + }, + { + "epoch": 0.15225045199352935, + "grad_norm": 1.854274868965149, + "learning_rate": 4.719457919563436e-05, + "loss": 4.1188, + "step": 25600 + }, + { + "epoch": 0.15225639927681037, + "grad_norm": 2.001619338989258, + "learning_rate": 4.7194364204085085e-05, + "loss": 3.89, + "step": 25601 + }, + { + "epoch": 0.15226234656009136, + "grad_norm": 1.9772802591323853, + "learning_rate": 4.7194149204787986e-05, + "loss": 3.8764, + "step": 25602 + }, + { + "epoch": 0.15226829384337234, + "grad_norm": 1.9361356496810913, + "learning_rate": 4.719393419774314e-05, + "loss": 5.0285, + "step": 25603 + }, + { + "epoch": 0.15227424112665333, + "grad_norm": 1.6824191808700562, + "learning_rate": 4.719371918295061e-05, + "loss": 5.2847, + "step": 25604 + }, + { + "epoch": 0.15228018840993435, + "grad_norm": 2.423736095428467, + "learning_rate": 4.7193504160410495e-05, + "loss": 4.087, + "step": 25605 + }, + { + "epoch": 0.15228613569321534, + "grad_norm": 1.711818814277649, + "learning_rate": 4.719328913012285e-05, + "loss": 5.0702, + "step": 25606 + }, + { + "epoch": 0.15229208297649632, + "grad_norm": 2.406665325164795, + "learning_rate": 4.7193074092087765e-05, + "loss": 4.1674, + "step": 25607 + }, + { + "epoch": 0.15229803025977734, + "grad_norm": 2.0252084732055664, + "learning_rate": 4.71928590463053e-05, + "loss": 3.9202, + "step": 25608 + }, + { + "epoch": 0.15230397754305833, + "grad_norm": 1.6908705234527588, + "learning_rate": 4.7192643992775534e-05, + "loss": 4.5446, + "step": 25609 + }, + { + "epoch": 0.15230992482633932, + "grad_norm": 1.2706576585769653, + "learning_rate": 4.719242893149855e-05, + "loss": 5.6578, + "step": 25610 + }, + { + "epoch": 0.15231587210962033, + "grad_norm": 1.380682349205017, + "learning_rate": 4.719221386247442e-05, + "loss": 5.6256, + "step": 25611 + }, + { + "epoch": 0.15232181939290132, + "grad_norm": 1.6104844808578491, + "learning_rate": 4.7191998785703214e-05, + "loss": 5.5271, + "step": 25612 + }, + { + "epoch": 0.1523277666761823, + "grad_norm": 1.5654959678649902, + "learning_rate": 4.719178370118502e-05, + "loss": 5.0767, + "step": 25613 + }, + { + "epoch": 0.15233371395946332, + "grad_norm": 1.7980438470840454, + "learning_rate": 4.719156860891989e-05, + "loss": 4.6667, + "step": 25614 + }, + { + "epoch": 0.1523396612427443, + "grad_norm": 1.6443228721618652, + "learning_rate": 4.719135350890792e-05, + "loss": 4.2763, + "step": 25615 + }, + { + "epoch": 0.1523456085260253, + "grad_norm": 1.442205548286438, + "learning_rate": 4.719113840114918e-05, + "loss": 5.0442, + "step": 25616 + }, + { + "epoch": 0.15235155580930632, + "grad_norm": 1.5215251445770264, + "learning_rate": 4.719092328564374e-05, + "loss": 5.2175, + "step": 25617 + }, + { + "epoch": 0.1523575030925873, + "grad_norm": 1.4463436603546143, + "learning_rate": 4.7190708162391677e-05, + "loss": 5.6153, + "step": 25618 + }, + { + "epoch": 0.1523634503758683, + "grad_norm": 1.624923825263977, + "learning_rate": 4.719049303139307e-05, + "loss": 5.4211, + "step": 25619 + }, + { + "epoch": 0.1523693976591493, + "grad_norm": 1.5821541547775269, + "learning_rate": 4.719027789264799e-05, + "loss": 5.7905, + "step": 25620 + }, + { + "epoch": 0.1523753449424303, + "grad_norm": 1.6683502197265625, + "learning_rate": 4.719006274615651e-05, + "loss": 5.112, + "step": 25621 + }, + { + "epoch": 0.15238129222571128, + "grad_norm": 1.3617998361587524, + "learning_rate": 4.7189847591918714e-05, + "loss": 5.3799, + "step": 25622 + }, + { + "epoch": 0.1523872395089923, + "grad_norm": 1.5106703042984009, + "learning_rate": 4.718963242993466e-05, + "loss": 4.9833, + "step": 25623 + }, + { + "epoch": 0.1523931867922733, + "grad_norm": 1.7020819187164307, + "learning_rate": 4.718941726020445e-05, + "loss": 4.2403, + "step": 25624 + }, + { + "epoch": 0.15239913407555428, + "grad_norm": 1.5678812265396118, + "learning_rate": 4.7189202082728133e-05, + "loss": 5.0985, + "step": 25625 + }, + { + "epoch": 0.1524050813588353, + "grad_norm": 1.4727619886398315, + "learning_rate": 4.71889868975058e-05, + "loss": 4.9088, + "step": 25626 + }, + { + "epoch": 0.15241102864211628, + "grad_norm": 1.5460275411605835, + "learning_rate": 4.7188771704537515e-05, + "loss": 5.2766, + "step": 25627 + }, + { + "epoch": 0.15241697592539727, + "grad_norm": 1.5763301849365234, + "learning_rate": 4.7188556503823366e-05, + "loss": 4.9134, + "step": 25628 + }, + { + "epoch": 0.15242292320867828, + "grad_norm": 1.8980252742767334, + "learning_rate": 4.718834129536341e-05, + "loss": 4.9331, + "step": 25629 + }, + { + "epoch": 0.15242887049195927, + "grad_norm": 2.768523693084717, + "learning_rate": 4.7188126079157744e-05, + "loss": 4.3952, + "step": 25630 + }, + { + "epoch": 0.15243481777524026, + "grad_norm": 2.6490437984466553, + "learning_rate": 4.718791085520643e-05, + "loss": 4.1387, + "step": 25631 + }, + { + "epoch": 0.15244076505852128, + "grad_norm": 1.806143879890442, + "learning_rate": 4.718769562350955e-05, + "loss": 4.7686, + "step": 25632 + }, + { + "epoch": 0.15244671234180227, + "grad_norm": 1.6871095895767212, + "learning_rate": 4.718748038406717e-05, + "loss": 5.3937, + "step": 25633 + }, + { + "epoch": 0.15245265962508325, + "grad_norm": 2.2100014686584473, + "learning_rate": 4.7187265136879364e-05, + "loss": 4.7869, + "step": 25634 + }, + { + "epoch": 0.15245860690836427, + "grad_norm": 1.978220820426941, + "learning_rate": 4.7187049881946224e-05, + "loss": 4.4701, + "step": 25635 + }, + { + "epoch": 0.15246455419164526, + "grad_norm": 1.8031092882156372, + "learning_rate": 4.718683461926781e-05, + "loss": 4.5107, + "step": 25636 + }, + { + "epoch": 0.15247050147492625, + "grad_norm": 1.795417308807373, + "learning_rate": 4.7186619348844196e-05, + "loss": 5.2659, + "step": 25637 + }, + { + "epoch": 0.15247644875820726, + "grad_norm": 2.3051810264587402, + "learning_rate": 4.718640407067547e-05, + "loss": 4.5413, + "step": 25638 + }, + { + "epoch": 0.15248239604148825, + "grad_norm": 1.983340859413147, + "learning_rate": 4.71861887847617e-05, + "loss": 4.5167, + "step": 25639 + }, + { + "epoch": 0.15248834332476924, + "grad_norm": 1.7354977130889893, + "learning_rate": 4.718597349110295e-05, + "loss": 4.5704, + "step": 25640 + }, + { + "epoch": 0.15249429060805025, + "grad_norm": 1.9091737270355225, + "learning_rate": 4.7185758189699313e-05, + "loss": 4.4381, + "step": 25641 + }, + { + "epoch": 0.15250023789133124, + "grad_norm": 1.8753962516784668, + "learning_rate": 4.718554288055086e-05, + "loss": 4.445, + "step": 25642 + }, + { + "epoch": 0.15250618517461223, + "grad_norm": 1.7315021753311157, + "learning_rate": 4.718532756365765e-05, + "loss": 4.7802, + "step": 25643 + }, + { + "epoch": 0.15251213245789325, + "grad_norm": 1.4017493724822998, + "learning_rate": 4.718511223901979e-05, + "loss": 5.3923, + "step": 25644 + }, + { + "epoch": 0.15251807974117423, + "grad_norm": 1.8367207050323486, + "learning_rate": 4.7184896906637326e-05, + "loss": 4.6229, + "step": 25645 + }, + { + "epoch": 0.15252402702445522, + "grad_norm": 2.3250296115875244, + "learning_rate": 4.718468156651035e-05, + "loss": 4.6332, + "step": 25646 + }, + { + "epoch": 0.15252997430773624, + "grad_norm": 2.047855854034424, + "learning_rate": 4.7184466218638925e-05, + "loss": 4.5316, + "step": 25647 + }, + { + "epoch": 0.15253592159101723, + "grad_norm": 1.9817044734954834, + "learning_rate": 4.7184250863023125e-05, + "loss": 4.3888, + "step": 25648 + }, + { + "epoch": 0.15254186887429821, + "grad_norm": 1.889957308769226, + "learning_rate": 4.718403549966305e-05, + "loss": 4.6436, + "step": 25649 + }, + { + "epoch": 0.15254781615757923, + "grad_norm": 1.4799065589904785, + "learning_rate": 4.718382012855874e-05, + "loss": 4.7965, + "step": 25650 + }, + { + "epoch": 0.15255376344086022, + "grad_norm": 2.046947717666626, + "learning_rate": 4.7183604749710296e-05, + "loss": 4.3206, + "step": 25651 + }, + { + "epoch": 0.1525597107241412, + "grad_norm": 1.970746636390686, + "learning_rate": 4.718338936311778e-05, + "loss": 4.3668, + "step": 25652 + }, + { + "epoch": 0.15256565800742222, + "grad_norm": 1.889931321144104, + "learning_rate": 4.718317396878128e-05, + "loss": 4.3436, + "step": 25653 + }, + { + "epoch": 0.1525716052907032, + "grad_norm": 2.0069503784179688, + "learning_rate": 4.7182958566700865e-05, + "loss": 4.5258, + "step": 25654 + }, + { + "epoch": 0.1525775525739842, + "grad_norm": 2.222224712371826, + "learning_rate": 4.7182743156876596e-05, + "loss": 4.362, + "step": 25655 + }, + { + "epoch": 0.15258349985726521, + "grad_norm": 2.2478747367858887, + "learning_rate": 4.718252773930857e-05, + "loss": 4.7401, + "step": 25656 + }, + { + "epoch": 0.1525894471405462, + "grad_norm": 2.224696636199951, + "learning_rate": 4.718231231399685e-05, + "loss": 4.5413, + "step": 25657 + }, + { + "epoch": 0.1525953944238272, + "grad_norm": 1.9385725259780884, + "learning_rate": 4.718209688094152e-05, + "loss": 4.7279, + "step": 25658 + }, + { + "epoch": 0.1526013417071082, + "grad_norm": 2.030127763748169, + "learning_rate": 4.718188144014264e-05, + "loss": 4.4943, + "step": 25659 + }, + { + "epoch": 0.1526072889903892, + "grad_norm": 2.115994453430176, + "learning_rate": 4.7181665991600296e-05, + "loss": 4.5709, + "step": 25660 + }, + { + "epoch": 0.15261323627367018, + "grad_norm": 1.6957606077194214, + "learning_rate": 4.718145053531456e-05, + "loss": 4.8779, + "step": 25661 + }, + { + "epoch": 0.15261918355695117, + "grad_norm": 1.9567986726760864, + "learning_rate": 4.718123507128551e-05, + "loss": 4.5541, + "step": 25662 + }, + { + "epoch": 0.1526251308402322, + "grad_norm": 2.147771120071411, + "learning_rate": 4.718101959951323e-05, + "loss": 4.5141, + "step": 25663 + }, + { + "epoch": 0.15263107812351318, + "grad_norm": 2.1374590396881104, + "learning_rate": 4.7180804119997774e-05, + "loss": 4.3474, + "step": 25664 + }, + { + "epoch": 0.15263702540679416, + "grad_norm": 2.060826539993286, + "learning_rate": 4.718058863273923e-05, + "loss": 4.4178, + "step": 25665 + }, + { + "epoch": 0.15264297269007518, + "grad_norm": 1.9931002855300903, + "learning_rate": 4.7180373137737673e-05, + "loss": 4.3213, + "step": 25666 + }, + { + "epoch": 0.15264891997335617, + "grad_norm": 1.3702372312545776, + "learning_rate": 4.718015763499318e-05, + "loss": 5.0551, + "step": 25667 + }, + { + "epoch": 0.15265486725663716, + "grad_norm": 1.8524867296218872, + "learning_rate": 4.7179942124505814e-05, + "loss": 5.0618, + "step": 25668 + }, + { + "epoch": 0.15266081453991817, + "grad_norm": 1.876756191253662, + "learning_rate": 4.717972660627567e-05, + "loss": 4.2719, + "step": 25669 + }, + { + "epoch": 0.15266676182319916, + "grad_norm": 2.0334908962249756, + "learning_rate": 4.7179511080302804e-05, + "loss": 4.5764, + "step": 25670 + }, + { + "epoch": 0.15267270910648015, + "grad_norm": 2.554891347885132, + "learning_rate": 4.717929554658731e-05, + "loss": 4.6706, + "step": 25671 + }, + { + "epoch": 0.15267865638976116, + "grad_norm": 2.032592296600342, + "learning_rate": 4.717908000512925e-05, + "loss": 4.9648, + "step": 25672 + }, + { + "epoch": 0.15268460367304215, + "grad_norm": 1.6153349876403809, + "learning_rate": 4.7178864455928696e-05, + "loss": 5.2224, + "step": 25673 + }, + { + "epoch": 0.15269055095632314, + "grad_norm": 2.0942156314849854, + "learning_rate": 4.7178648898985734e-05, + "loss": 4.6427, + "step": 25674 + }, + { + "epoch": 0.15269649823960416, + "grad_norm": 1.9911080598831177, + "learning_rate": 4.717843333430043e-05, + "loss": 4.3348, + "step": 25675 + }, + { + "epoch": 0.15270244552288514, + "grad_norm": 2.017202377319336, + "learning_rate": 4.7178217761872866e-05, + "loss": 4.5306, + "step": 25676 + }, + { + "epoch": 0.15270839280616613, + "grad_norm": 1.9934179782867432, + "learning_rate": 4.7178002181703116e-05, + "loss": 4.7443, + "step": 25677 + }, + { + "epoch": 0.15271434008944715, + "grad_norm": 1.9597182273864746, + "learning_rate": 4.717778659379126e-05, + "loss": 4.5526, + "step": 25678 + }, + { + "epoch": 0.15272028737272814, + "grad_norm": 1.3593907356262207, + "learning_rate": 4.717757099813737e-05, + "loss": 5.5802, + "step": 25679 + }, + { + "epoch": 0.15272623465600912, + "grad_norm": 2.0012102127075195, + "learning_rate": 4.717735539474151e-05, + "loss": 5.0289, + "step": 25680 + }, + { + "epoch": 0.15273218193929014, + "grad_norm": 1.5621830224990845, + "learning_rate": 4.7177139783603765e-05, + "loss": 4.9388, + "step": 25681 + }, + { + "epoch": 0.15273812922257113, + "grad_norm": 1.502643346786499, + "learning_rate": 4.717692416472421e-05, + "loss": 5.3317, + "step": 25682 + }, + { + "epoch": 0.15274407650585212, + "grad_norm": 1.6496142148971558, + "learning_rate": 4.717670853810292e-05, + "loss": 5.9642, + "step": 25683 + }, + { + "epoch": 0.15275002378913313, + "grad_norm": 1.7263692617416382, + "learning_rate": 4.717649290373997e-05, + "loss": 4.9383, + "step": 25684 + }, + { + "epoch": 0.15275597107241412, + "grad_norm": 1.4914296865463257, + "learning_rate": 4.7176277261635434e-05, + "loss": 5.2599, + "step": 25685 + }, + { + "epoch": 0.1527619183556951, + "grad_norm": 1.3947960138320923, + "learning_rate": 4.71760616117894e-05, + "loss": 5.3177, + "step": 25686 + }, + { + "epoch": 0.15276786563897612, + "grad_norm": 1.6703267097473145, + "learning_rate": 4.717584595420192e-05, + "loss": 5.0309, + "step": 25687 + }, + { + "epoch": 0.1527738129222571, + "grad_norm": 1.622600793838501, + "learning_rate": 4.7175630288873083e-05, + "loss": 5.2554, + "step": 25688 + }, + { + "epoch": 0.1527797602055381, + "grad_norm": 1.678843379020691, + "learning_rate": 4.717541461580297e-05, + "loss": 5.012, + "step": 25689 + }, + { + "epoch": 0.15278570748881912, + "grad_norm": 2.2063186168670654, + "learning_rate": 4.717519893499164e-05, + "loss": 4.4479, + "step": 25690 + }, + { + "epoch": 0.1527916547721001, + "grad_norm": 2.0667500495910645, + "learning_rate": 4.717498324643918e-05, + "loss": 4.7081, + "step": 25691 + }, + { + "epoch": 0.1527976020553811, + "grad_norm": 2.192436695098877, + "learning_rate": 4.717476755014566e-05, + "loss": 4.7662, + "step": 25692 + }, + { + "epoch": 0.1528035493386621, + "grad_norm": 1.4742953777313232, + "learning_rate": 4.7174551846111165e-05, + "loss": 5.5788, + "step": 25693 + }, + { + "epoch": 0.1528094966219431, + "grad_norm": 1.7715102434158325, + "learning_rate": 4.7174336134335765e-05, + "loss": 5.203, + "step": 25694 + }, + { + "epoch": 0.15281544390522409, + "grad_norm": 2.406721353530884, + "learning_rate": 4.717412041481952e-05, + "loss": 4.7807, + "step": 25695 + }, + { + "epoch": 0.1528213911885051, + "grad_norm": 1.765756607055664, + "learning_rate": 4.7173904687562525e-05, + "loss": 5.2479, + "step": 25696 + }, + { + "epoch": 0.1528273384717861, + "grad_norm": 1.6135215759277344, + "learning_rate": 4.7173688952564856e-05, + "loss": 5.4787, + "step": 25697 + }, + { + "epoch": 0.15283328575506708, + "grad_norm": 1.5617319345474243, + "learning_rate": 4.7173473209826566e-05, + "loss": 5.02, + "step": 25698 + }, + { + "epoch": 0.1528392330383481, + "grad_norm": 1.4704324007034302, + "learning_rate": 4.7173257459347756e-05, + "loss": 5.1675, + "step": 25699 + }, + { + "epoch": 0.15284518032162908, + "grad_norm": 1.8787862062454224, + "learning_rate": 4.7173041701128496e-05, + "loss": 4.7247, + "step": 25700 + }, + { + "epoch": 0.15285112760491007, + "grad_norm": 3.8647372722625732, + "learning_rate": 4.7172825935168845e-05, + "loss": 3.5335, + "step": 25701 + }, + { + "epoch": 0.15285707488819109, + "grad_norm": 3.6721291542053223, + "learning_rate": 4.717261016146889e-05, + "loss": 2.8843, + "step": 25702 + }, + { + "epoch": 0.15286302217147207, + "grad_norm": 2.0848543643951416, + "learning_rate": 4.717239438002872e-05, + "loss": 4.4863, + "step": 25703 + }, + { + "epoch": 0.15286896945475306, + "grad_norm": 1.7783108949661255, + "learning_rate": 4.717217859084838e-05, + "loss": 5.2903, + "step": 25704 + }, + { + "epoch": 0.15287491673803408, + "grad_norm": 2.006303548812866, + "learning_rate": 4.717196279392797e-05, + "loss": 4.3923, + "step": 25705 + }, + { + "epoch": 0.15288086402131507, + "grad_norm": 2.4214632511138916, + "learning_rate": 4.7171746989267553e-05, + "loss": 3.3506, + "step": 25706 + }, + { + "epoch": 0.15288681130459605, + "grad_norm": 2.8976924419403076, + "learning_rate": 4.7171531176867214e-05, + "loss": 3.2211, + "step": 25707 + }, + { + "epoch": 0.15289275858787707, + "grad_norm": 3.2015345096588135, + "learning_rate": 4.717131535672702e-05, + "loss": 2.8205, + "step": 25708 + }, + { + "epoch": 0.15289870587115806, + "grad_norm": 3.559465169906616, + "learning_rate": 4.7171099528847044e-05, + "loss": 2.8882, + "step": 25709 + }, + { + "epoch": 0.15290465315443905, + "grad_norm": 3.3753960132598877, + "learning_rate": 4.717088369322737e-05, + "loss": 2.6752, + "step": 25710 + }, + { + "epoch": 0.15291060043772006, + "grad_norm": 2.129783868789673, + "learning_rate": 4.717066784986806e-05, + "loss": 3.9983, + "step": 25711 + }, + { + "epoch": 0.15291654772100105, + "grad_norm": 1.797956943511963, + "learning_rate": 4.7170451998769214e-05, + "loss": 4.8075, + "step": 25712 + }, + { + "epoch": 0.15292249500428204, + "grad_norm": 3.3450467586517334, + "learning_rate": 4.717023613993089e-05, + "loss": 4.177, + "step": 25713 + }, + { + "epoch": 0.15292844228756305, + "grad_norm": 2.303511381149292, + "learning_rate": 4.7170020273353164e-05, + "loss": 4.471, + "step": 25714 + }, + { + "epoch": 0.15293438957084404, + "grad_norm": 1.4113452434539795, + "learning_rate": 4.7169804399036105e-05, + "loss": 5.4846, + "step": 25715 + }, + { + "epoch": 0.15294033685412503, + "grad_norm": 1.7091588973999023, + "learning_rate": 4.71695885169798e-05, + "loss": 4.8856, + "step": 25716 + }, + { + "epoch": 0.15294628413740605, + "grad_norm": 2.783010244369507, + "learning_rate": 4.7169372627184326e-05, + "loss": 4.3426, + "step": 25717 + }, + { + "epoch": 0.15295223142068703, + "grad_norm": 1.4658305644989014, + "learning_rate": 4.716915672964975e-05, + "loss": 5.3191, + "step": 25718 + }, + { + "epoch": 0.15295817870396802, + "grad_norm": 1.2862242460250854, + "learning_rate": 4.716894082437615e-05, + "loss": 5.3939, + "step": 25719 + }, + { + "epoch": 0.152964125987249, + "grad_norm": 1.4098438024520874, + "learning_rate": 4.71687249113636e-05, + "loss": 5.4493, + "step": 25720 + }, + { + "epoch": 0.15297007327053003, + "grad_norm": 1.4778176546096802, + "learning_rate": 4.7168508990612183e-05, + "loss": 5.2679, + "step": 25721 + }, + { + "epoch": 0.15297602055381102, + "grad_norm": 1.5448487997055054, + "learning_rate": 4.716829306212196e-05, + "loss": 5.1446, + "step": 25722 + }, + { + "epoch": 0.152981967837092, + "grad_norm": 1.3638159036636353, + "learning_rate": 4.716807712589302e-05, + "loss": 5.1152, + "step": 25723 + }, + { + "epoch": 0.15298791512037302, + "grad_norm": 1.7068208456039429, + "learning_rate": 4.716786118192543e-05, + "loss": 5.1389, + "step": 25724 + }, + { + "epoch": 0.152993862403654, + "grad_norm": 1.8191746473312378, + "learning_rate": 4.716764523021928e-05, + "loss": 5.2305, + "step": 25725 + }, + { + "epoch": 0.152999809686935, + "grad_norm": 1.6970409154891968, + "learning_rate": 4.716742927077462e-05, + "loss": 5.1097, + "step": 25726 + }, + { + "epoch": 0.153005756970216, + "grad_norm": 1.5453951358795166, + "learning_rate": 4.716721330359155e-05, + "loss": 5.2614, + "step": 25727 + }, + { + "epoch": 0.153011704253497, + "grad_norm": 1.5335613489151, + "learning_rate": 4.7166997328670125e-05, + "loss": 4.8482, + "step": 25728 + }, + { + "epoch": 0.153017651536778, + "grad_norm": 1.6566481590270996, + "learning_rate": 4.716678134601044e-05, + "loss": 4.9346, + "step": 25729 + }, + { + "epoch": 0.153023598820059, + "grad_norm": 1.7899013757705688, + "learning_rate": 4.716656535561256e-05, + "loss": 5.0877, + "step": 25730 + }, + { + "epoch": 0.15302954610334, + "grad_norm": 2.1659116744995117, + "learning_rate": 4.716634935747655e-05, + "loss": 4.6431, + "step": 25731 + }, + { + "epoch": 0.15303549338662098, + "grad_norm": 1.914923071861267, + "learning_rate": 4.71661333516025e-05, + "loss": 4.9001, + "step": 25732 + }, + { + "epoch": 0.153041440669902, + "grad_norm": 1.9240248203277588, + "learning_rate": 4.7165917337990495e-05, + "loss": 4.7709, + "step": 25733 + }, + { + "epoch": 0.15304738795318298, + "grad_norm": 1.6446973085403442, + "learning_rate": 4.7165701316640585e-05, + "loss": 4.9816, + "step": 25734 + }, + { + "epoch": 0.15305333523646397, + "grad_norm": 1.7971409559249878, + "learning_rate": 4.716548528755286e-05, + "loss": 5.0082, + "step": 25735 + }, + { + "epoch": 0.153059282519745, + "grad_norm": 1.3862462043762207, + "learning_rate": 4.716526925072739e-05, + "loss": 5.0245, + "step": 25736 + }, + { + "epoch": 0.15306522980302598, + "grad_norm": 2.157005548477173, + "learning_rate": 4.716505320616425e-05, + "loss": 4.573, + "step": 25737 + }, + { + "epoch": 0.15307117708630696, + "grad_norm": 2.4460175037384033, + "learning_rate": 4.716483715386354e-05, + "loss": 4.0872, + "step": 25738 + }, + { + "epoch": 0.15307712436958798, + "grad_norm": 1.7140263319015503, + "learning_rate": 4.7164621093825294e-05, + "loss": 4.5421, + "step": 25739 + }, + { + "epoch": 0.15308307165286897, + "grad_norm": 1.684173583984375, + "learning_rate": 4.7164405026049616e-05, + "loss": 4.5274, + "step": 25740 + }, + { + "epoch": 0.15308901893614996, + "grad_norm": 1.9424148797988892, + "learning_rate": 4.716418895053657e-05, + "loss": 4.2669, + "step": 25741 + }, + { + "epoch": 0.15309496621943097, + "grad_norm": 1.576071858406067, + "learning_rate": 4.716397286728623e-05, + "loss": 4.9536, + "step": 25742 + }, + { + "epoch": 0.15310091350271196, + "grad_norm": 1.8285739421844482, + "learning_rate": 4.7163756776298686e-05, + "loss": 4.9322, + "step": 25743 + }, + { + "epoch": 0.15310686078599295, + "grad_norm": 2.058610200881958, + "learning_rate": 4.7163540677574004e-05, + "loss": 4.4565, + "step": 25744 + }, + { + "epoch": 0.15311280806927396, + "grad_norm": 2.106513261795044, + "learning_rate": 4.716332457111226e-05, + "loss": 4.0534, + "step": 25745 + }, + { + "epoch": 0.15311875535255495, + "grad_norm": 1.821857213973999, + "learning_rate": 4.716310845691351e-05, + "loss": 4.5302, + "step": 25746 + }, + { + "epoch": 0.15312470263583594, + "grad_norm": 1.5679446458816528, + "learning_rate": 4.716289233497787e-05, + "loss": 4.9452, + "step": 25747 + }, + { + "epoch": 0.15313064991911696, + "grad_norm": 1.612362027168274, + "learning_rate": 4.716267620530538e-05, + "loss": 5.0074, + "step": 25748 + }, + { + "epoch": 0.15313659720239794, + "grad_norm": 1.6841483116149902, + "learning_rate": 4.716246006789613e-05, + "loss": 5.0202, + "step": 25749 + }, + { + "epoch": 0.15314254448567893, + "grad_norm": 1.7533215284347534, + "learning_rate": 4.7162243922750196e-05, + "loss": 4.6901, + "step": 25750 + }, + { + "epoch": 0.15314849176895995, + "grad_norm": 2.2937755584716797, + "learning_rate": 4.716202776986766e-05, + "loss": 4.0934, + "step": 25751 + }, + { + "epoch": 0.15315443905224094, + "grad_norm": 2.413012742996216, + "learning_rate": 4.7161811609248576e-05, + "loss": 4.0128, + "step": 25752 + }, + { + "epoch": 0.15316038633552193, + "grad_norm": 2.481255054473877, + "learning_rate": 4.7161595440893035e-05, + "loss": 4.4044, + "step": 25753 + }, + { + "epoch": 0.15316633361880294, + "grad_norm": 1.8999838829040527, + "learning_rate": 4.7161379264801115e-05, + "loss": 4.2328, + "step": 25754 + }, + { + "epoch": 0.15317228090208393, + "grad_norm": 2.3453261852264404, + "learning_rate": 4.7161163080972884e-05, + "loss": 4.283, + "step": 25755 + }, + { + "epoch": 0.15317822818536492, + "grad_norm": 1.6733421087265015, + "learning_rate": 4.716094688940842e-05, + "loss": 4.7254, + "step": 25756 + }, + { + "epoch": 0.15318417546864593, + "grad_norm": 1.5302658081054688, + "learning_rate": 4.7160730690107794e-05, + "loss": 4.9403, + "step": 25757 + }, + { + "epoch": 0.15319012275192692, + "grad_norm": 1.6725687980651855, + "learning_rate": 4.716051448307109e-05, + "loss": 4.699, + "step": 25758 + }, + { + "epoch": 0.1531960700352079, + "grad_norm": 2.067267894744873, + "learning_rate": 4.716029826829839e-05, + "loss": 4.0136, + "step": 25759 + }, + { + "epoch": 0.15320201731848893, + "grad_norm": 2.2834413051605225, + "learning_rate": 4.716008204578975e-05, + "loss": 4.1914, + "step": 25760 + }, + { + "epoch": 0.1532079646017699, + "grad_norm": 1.9917986392974854, + "learning_rate": 4.715986581554524e-05, + "loss": 4.2899, + "step": 25761 + }, + { + "epoch": 0.1532139118850509, + "grad_norm": 1.6681551933288574, + "learning_rate": 4.715964957756497e-05, + "loss": 4.7627, + "step": 25762 + }, + { + "epoch": 0.15321985916833192, + "grad_norm": 2.005560874938965, + "learning_rate": 4.715943333184899e-05, + "loss": 4.1686, + "step": 25763 + }, + { + "epoch": 0.1532258064516129, + "grad_norm": 1.7380902767181396, + "learning_rate": 4.715921707839738e-05, + "loss": 4.4208, + "step": 25764 + }, + { + "epoch": 0.1532317537348939, + "grad_norm": 2.6380422115325928, + "learning_rate": 4.7159000817210205e-05, + "loss": 4.9835, + "step": 25765 + }, + { + "epoch": 0.1532377010181749, + "grad_norm": 2.4079694747924805, + "learning_rate": 4.715878454828757e-05, + "loss": 4.5758, + "step": 25766 + }, + { + "epoch": 0.1532436483014559, + "grad_norm": 1.7469686269760132, + "learning_rate": 4.715856827162952e-05, + "loss": 4.8894, + "step": 25767 + }, + { + "epoch": 0.1532495955847369, + "grad_norm": 1.7569485902786255, + "learning_rate": 4.715835198723615e-05, + "loss": 5.0324, + "step": 25768 + }, + { + "epoch": 0.1532555428680179, + "grad_norm": 1.9182626008987427, + "learning_rate": 4.715813569510752e-05, + "loss": 4.2196, + "step": 25769 + }, + { + "epoch": 0.1532614901512989, + "grad_norm": 1.8836737871170044, + "learning_rate": 4.715791939524372e-05, + "loss": 4.3797, + "step": 25770 + }, + { + "epoch": 0.15326743743457988, + "grad_norm": 1.5073226690292358, + "learning_rate": 4.7157703087644816e-05, + "loss": 4.7137, + "step": 25771 + }, + { + "epoch": 0.1532733847178609, + "grad_norm": 1.764160394668579, + "learning_rate": 4.715748677231089e-05, + "loss": 4.784, + "step": 25772 + }, + { + "epoch": 0.15327933200114188, + "grad_norm": 1.5940345525741577, + "learning_rate": 4.715727044924201e-05, + "loss": 4.7749, + "step": 25773 + }, + { + "epoch": 0.15328527928442287, + "grad_norm": 1.9873480796813965, + "learning_rate": 4.715705411843826e-05, + "loss": 4.7084, + "step": 25774 + }, + { + "epoch": 0.1532912265677039, + "grad_norm": 2.712846279144287, + "learning_rate": 4.715683777989971e-05, + "loss": 4.8726, + "step": 25775 + }, + { + "epoch": 0.15329717385098487, + "grad_norm": 1.9030331373214722, + "learning_rate": 4.7156621433626434e-05, + "loss": 4.6475, + "step": 25776 + }, + { + "epoch": 0.15330312113426586, + "grad_norm": 1.9939697980880737, + "learning_rate": 4.715640507961852e-05, + "loss": 4.8202, + "step": 25777 + }, + { + "epoch": 0.15330906841754685, + "grad_norm": 1.5398924350738525, + "learning_rate": 4.715618871787602e-05, + "loss": 4.9801, + "step": 25778 + }, + { + "epoch": 0.15331501570082787, + "grad_norm": 1.5413012504577637, + "learning_rate": 4.7155972348399034e-05, + "loss": 4.7795, + "step": 25779 + }, + { + "epoch": 0.15332096298410886, + "grad_norm": 1.6835294961929321, + "learning_rate": 4.7155755971187625e-05, + "loss": 4.5937, + "step": 25780 + }, + { + "epoch": 0.15332691026738984, + "grad_norm": 1.4007564783096313, + "learning_rate": 4.715553958624187e-05, + "loss": 5.5904, + "step": 25781 + }, + { + "epoch": 0.15333285755067086, + "grad_norm": 1.6113498210906982, + "learning_rate": 4.715532319356184e-05, + "loss": 5.1083, + "step": 25782 + }, + { + "epoch": 0.15333880483395185, + "grad_norm": 1.9218871593475342, + "learning_rate": 4.715510679314762e-05, + "loss": 4.6371, + "step": 25783 + }, + { + "epoch": 0.15334475211723284, + "grad_norm": 1.4686646461486816, + "learning_rate": 4.715489038499928e-05, + "loss": 5.0536, + "step": 25784 + }, + { + "epoch": 0.15335069940051385, + "grad_norm": 1.5875191688537598, + "learning_rate": 4.71546739691169e-05, + "loss": 4.5976, + "step": 25785 + }, + { + "epoch": 0.15335664668379484, + "grad_norm": 1.5260745286941528, + "learning_rate": 4.7154457545500554e-05, + "loss": 4.6875, + "step": 25786 + }, + { + "epoch": 0.15336259396707583, + "grad_norm": 1.8652924299240112, + "learning_rate": 4.715424111415031e-05, + "loss": 4.846, + "step": 25787 + }, + { + "epoch": 0.15336854125035684, + "grad_norm": 1.3980404138565063, + "learning_rate": 4.715402467506625e-05, + "loss": 5.2552, + "step": 25788 + }, + { + "epoch": 0.15337448853363783, + "grad_norm": 1.6307755708694458, + "learning_rate": 4.715380822824845e-05, + "loss": 5.1316, + "step": 25789 + }, + { + "epoch": 0.15338043581691882, + "grad_norm": 1.9057358503341675, + "learning_rate": 4.715359177369698e-05, + "loss": 4.6232, + "step": 25790 + }, + { + "epoch": 0.15338638310019984, + "grad_norm": 1.260809302330017, + "learning_rate": 4.715337531141193e-05, + "loss": 5.1614, + "step": 25791 + }, + { + "epoch": 0.15339233038348082, + "grad_norm": 2.7115111351013184, + "learning_rate": 4.7153158841393354e-05, + "loss": 3.6292, + "step": 25792 + }, + { + "epoch": 0.1533982776667618, + "grad_norm": 1.296697974205017, + "learning_rate": 4.715294236364135e-05, + "loss": 5.5909, + "step": 25793 + }, + { + "epoch": 0.15340422495004283, + "grad_norm": 1.466179370880127, + "learning_rate": 4.7152725878155975e-05, + "loss": 5.3005, + "step": 25794 + }, + { + "epoch": 0.15341017223332382, + "grad_norm": 1.5478910207748413, + "learning_rate": 4.715250938493732e-05, + "loss": 4.9116, + "step": 25795 + }, + { + "epoch": 0.1534161195166048, + "grad_norm": 1.371853232383728, + "learning_rate": 4.715229288398544e-05, + "loss": 5.2196, + "step": 25796 + }, + { + "epoch": 0.15342206679988582, + "grad_norm": 1.4444376230239868, + "learning_rate": 4.715207637530043e-05, + "loss": 4.9255, + "step": 25797 + }, + { + "epoch": 0.1534280140831668, + "grad_norm": 1.3257986307144165, + "learning_rate": 4.715185985888236e-05, + "loss": 4.9662, + "step": 25798 + }, + { + "epoch": 0.1534339613664478, + "grad_norm": 1.4831913709640503, + "learning_rate": 4.71516433347313e-05, + "loss": 4.9466, + "step": 25799 + }, + { + "epoch": 0.1534399086497288, + "grad_norm": 1.8146830797195435, + "learning_rate": 4.715142680284734e-05, + "loss": 4.711, + "step": 25800 + }, + { + "epoch": 0.1534458559330098, + "grad_norm": 1.73066246509552, + "learning_rate": 4.7151210263230536e-05, + "loss": 4.4107, + "step": 25801 + }, + { + "epoch": 0.1534518032162908, + "grad_norm": 2.014646291732788, + "learning_rate": 4.715099371588098e-05, + "loss": 4.6119, + "step": 25802 + }, + { + "epoch": 0.1534577504995718, + "grad_norm": 2.1739413738250732, + "learning_rate": 4.715077716079874e-05, + "loss": 4.4887, + "step": 25803 + }, + { + "epoch": 0.1534636977828528, + "grad_norm": 1.4722633361816406, + "learning_rate": 4.7150560597983895e-05, + "loss": 5.0312, + "step": 25804 + }, + { + "epoch": 0.15346964506613378, + "grad_norm": 1.654250144958496, + "learning_rate": 4.715034402743651e-05, + "loss": 4.8815, + "step": 25805 + }, + { + "epoch": 0.1534755923494148, + "grad_norm": 1.6598440408706665, + "learning_rate": 4.715012744915668e-05, + "loss": 4.3904, + "step": 25806 + }, + { + "epoch": 0.15348153963269578, + "grad_norm": 1.5754339694976807, + "learning_rate": 4.714991086314445e-05, + "loss": 4.4223, + "step": 25807 + }, + { + "epoch": 0.15348748691597677, + "grad_norm": 1.800657033920288, + "learning_rate": 4.714969426939994e-05, + "loss": 4.5314, + "step": 25808 + }, + { + "epoch": 0.1534934341992578, + "grad_norm": 1.8917250633239746, + "learning_rate": 4.714947766792318e-05, + "loss": 4.4049, + "step": 25809 + }, + { + "epoch": 0.15349938148253878, + "grad_norm": 1.9953207969665527, + "learning_rate": 4.714926105871428e-05, + "loss": 4.3155, + "step": 25810 + }, + { + "epoch": 0.15350532876581977, + "grad_norm": 1.7314120531082153, + "learning_rate": 4.714904444177329e-05, + "loss": 4.3324, + "step": 25811 + }, + { + "epoch": 0.15351127604910078, + "grad_norm": 1.577124834060669, + "learning_rate": 4.7148827817100306e-05, + "loss": 4.6899, + "step": 25812 + }, + { + "epoch": 0.15351722333238177, + "grad_norm": 1.6661646366119385, + "learning_rate": 4.714861118469539e-05, + "loss": 4.9735, + "step": 25813 + }, + { + "epoch": 0.15352317061566276, + "grad_norm": 1.8606276512145996, + "learning_rate": 4.714839454455863e-05, + "loss": 5.2351, + "step": 25814 + }, + { + "epoch": 0.15352911789894377, + "grad_norm": 2.0107643604278564, + "learning_rate": 4.7148177896690085e-05, + "loss": 4.4152, + "step": 25815 + }, + { + "epoch": 0.15353506518222476, + "grad_norm": 1.6447992324829102, + "learning_rate": 4.7147961241089846e-05, + "loss": 4.5391, + "step": 25816 + }, + { + "epoch": 0.15354101246550575, + "grad_norm": 1.6666457653045654, + "learning_rate": 4.714774457775798e-05, + "loss": 4.5104, + "step": 25817 + }, + { + "epoch": 0.15354695974878677, + "grad_norm": 1.7214492559432983, + "learning_rate": 4.714752790669457e-05, + "loss": 5.0634, + "step": 25818 + }, + { + "epoch": 0.15355290703206775, + "grad_norm": 1.5697379112243652, + "learning_rate": 4.714731122789968e-05, + "loss": 4.8279, + "step": 25819 + }, + { + "epoch": 0.15355885431534874, + "grad_norm": 2.531752109527588, + "learning_rate": 4.7147094541373395e-05, + "loss": 3.9172, + "step": 25820 + }, + { + "epoch": 0.15356480159862976, + "grad_norm": 1.5037142038345337, + "learning_rate": 4.714687784711579e-05, + "loss": 4.7534, + "step": 25821 + }, + { + "epoch": 0.15357074888191075, + "grad_norm": 1.5798907279968262, + "learning_rate": 4.714666114512693e-05, + "loss": 4.6779, + "step": 25822 + }, + { + "epoch": 0.15357669616519173, + "grad_norm": 1.5223065614700317, + "learning_rate": 4.714644443540691e-05, + "loss": 4.8612, + "step": 25823 + }, + { + "epoch": 0.15358264344847275, + "grad_norm": 1.7736209630966187, + "learning_rate": 4.714622771795579e-05, + "loss": 4.9765, + "step": 25824 + }, + { + "epoch": 0.15358859073175374, + "grad_norm": 1.5920718908309937, + "learning_rate": 4.714601099277365e-05, + "loss": 5.2479, + "step": 25825 + }, + { + "epoch": 0.15359453801503473, + "grad_norm": 1.7325233221054077, + "learning_rate": 4.7145794259860576e-05, + "loss": 4.9202, + "step": 25826 + }, + { + "epoch": 0.15360048529831574, + "grad_norm": 1.6514594554901123, + "learning_rate": 4.714557751921662e-05, + "loss": 4.9212, + "step": 25827 + }, + { + "epoch": 0.15360643258159673, + "grad_norm": 1.731692910194397, + "learning_rate": 4.714536077084188e-05, + "loss": 4.8916, + "step": 25828 + }, + { + "epoch": 0.15361237986487772, + "grad_norm": 1.7444603443145752, + "learning_rate": 4.714514401473642e-05, + "loss": 4.4659, + "step": 25829 + }, + { + "epoch": 0.15361832714815873, + "grad_norm": 1.7847130298614502, + "learning_rate": 4.714492725090033e-05, + "loss": 4.3516, + "step": 25830 + }, + { + "epoch": 0.15362427443143972, + "grad_norm": 1.6140960454940796, + "learning_rate": 4.714471047933366e-05, + "loss": 4.3894, + "step": 25831 + }, + { + "epoch": 0.1536302217147207, + "grad_norm": 1.5573277473449707, + "learning_rate": 4.714449370003651e-05, + "loss": 5.0749, + "step": 25832 + }, + { + "epoch": 0.15363616899800173, + "grad_norm": 1.7352724075317383, + "learning_rate": 4.7144276913008936e-05, + "loss": 4.6311, + "step": 25833 + }, + { + "epoch": 0.15364211628128271, + "grad_norm": 2.1136815547943115, + "learning_rate": 4.714406011825103e-05, + "loss": 3.9239, + "step": 25834 + }, + { + "epoch": 0.1536480635645637, + "grad_norm": 1.5329402685165405, + "learning_rate": 4.7143843315762856e-05, + "loss": 5.0124, + "step": 25835 + }, + { + "epoch": 0.1536540108478447, + "grad_norm": 1.6305334568023682, + "learning_rate": 4.7143626505544504e-05, + "loss": 5.3047, + "step": 25836 + }, + { + "epoch": 0.1536599581311257, + "grad_norm": 1.6582584381103516, + "learning_rate": 4.714340968759604e-05, + "loss": 4.909, + "step": 25837 + }, + { + "epoch": 0.1536659054144067, + "grad_norm": 1.581274151802063, + "learning_rate": 4.7143192861917536e-05, + "loss": 4.8241, + "step": 25838 + }, + { + "epoch": 0.15367185269768768, + "grad_norm": 1.6180393695831299, + "learning_rate": 4.7142976028509076e-05, + "loss": 4.6608, + "step": 25839 + }, + { + "epoch": 0.1536777999809687, + "grad_norm": 1.8333182334899902, + "learning_rate": 4.714275918737073e-05, + "loss": 5.3005, + "step": 25840 + }, + { + "epoch": 0.1536837472642497, + "grad_norm": 1.6652151346206665, + "learning_rate": 4.714254233850257e-05, + "loss": 4.5989, + "step": 25841 + }, + { + "epoch": 0.15368969454753068, + "grad_norm": 1.7609338760375977, + "learning_rate": 4.714232548190468e-05, + "loss": 5.2105, + "step": 25842 + }, + { + "epoch": 0.1536956418308117, + "grad_norm": 1.6076292991638184, + "learning_rate": 4.714210861757714e-05, + "loss": 5.32, + "step": 25843 + }, + { + "epoch": 0.15370158911409268, + "grad_norm": 1.6114000082015991, + "learning_rate": 4.7141891745520005e-05, + "loss": 5.1365, + "step": 25844 + }, + { + "epoch": 0.15370753639737367, + "grad_norm": 1.9237120151519775, + "learning_rate": 4.714167486573337e-05, + "loss": 4.8821, + "step": 25845 + }, + { + "epoch": 0.15371348368065468, + "grad_norm": 1.7089736461639404, + "learning_rate": 4.7141457978217315e-05, + "loss": 4.8468, + "step": 25846 + }, + { + "epoch": 0.15371943096393567, + "grad_norm": 1.6240943670272827, + "learning_rate": 4.71412410829719e-05, + "loss": 4.9153, + "step": 25847 + }, + { + "epoch": 0.15372537824721666, + "grad_norm": 1.4397730827331543, + "learning_rate": 4.7141024179997205e-05, + "loss": 5.0853, + "step": 25848 + }, + { + "epoch": 0.15373132553049768, + "grad_norm": 1.6480834484100342, + "learning_rate": 4.714080726929331e-05, + "loss": 4.6492, + "step": 25849 + }, + { + "epoch": 0.15373727281377866, + "grad_norm": 1.702221155166626, + "learning_rate": 4.714059035086028e-05, + "loss": 4.5677, + "step": 25850 + }, + { + "epoch": 0.15374322009705965, + "grad_norm": 1.5285601615905762, + "learning_rate": 4.7140373424698206e-05, + "loss": 4.621, + "step": 25851 + }, + { + "epoch": 0.15374916738034067, + "grad_norm": 2.0238354206085205, + "learning_rate": 4.7140156490807156e-05, + "loss": 4.6883, + "step": 25852 + }, + { + "epoch": 0.15375511466362166, + "grad_norm": 2.392547845840454, + "learning_rate": 4.713993954918721e-05, + "loss": 4.7537, + "step": 25853 + }, + { + "epoch": 0.15376106194690264, + "grad_norm": 2.639981746673584, + "learning_rate": 4.713972259983843e-05, + "loss": 3.958, + "step": 25854 + }, + { + "epoch": 0.15376700923018366, + "grad_norm": 2.11757755279541, + "learning_rate": 4.713950564276091e-05, + "loss": 5.0082, + "step": 25855 + }, + { + "epoch": 0.15377295651346465, + "grad_norm": 2.032003879547119, + "learning_rate": 4.713928867795471e-05, + "loss": 4.9212, + "step": 25856 + }, + { + "epoch": 0.15377890379674564, + "grad_norm": 1.7791013717651367, + "learning_rate": 4.713907170541991e-05, + "loss": 4.925, + "step": 25857 + }, + { + "epoch": 0.15378485108002665, + "grad_norm": 1.8376729488372803, + "learning_rate": 4.71388547251566e-05, + "loss": 5.1545, + "step": 25858 + }, + { + "epoch": 0.15379079836330764, + "grad_norm": 1.7532944679260254, + "learning_rate": 4.7138637737164836e-05, + "loss": 5.1329, + "step": 25859 + }, + { + "epoch": 0.15379674564658863, + "grad_norm": 2.4505176544189453, + "learning_rate": 4.7138420741444704e-05, + "loss": 4.8803, + "step": 25860 + }, + { + "epoch": 0.15380269292986964, + "grad_norm": 2.4481520652770996, + "learning_rate": 4.7138203737996283e-05, + "loss": 4.9071, + "step": 25861 + }, + { + "epoch": 0.15380864021315063, + "grad_norm": 1.805619716644287, + "learning_rate": 4.7137986726819636e-05, + "loss": 4.9145, + "step": 25862 + }, + { + "epoch": 0.15381458749643162, + "grad_norm": 1.353178858757019, + "learning_rate": 4.7137769707914856e-05, + "loss": 4.8159, + "step": 25863 + }, + { + "epoch": 0.15382053477971264, + "grad_norm": 2.1220030784606934, + "learning_rate": 4.7137552681282006e-05, + "loss": 4.7573, + "step": 25864 + }, + { + "epoch": 0.15382648206299362, + "grad_norm": 1.7052141427993774, + "learning_rate": 4.713733564692116e-05, + "loss": 5.0372, + "step": 25865 + }, + { + "epoch": 0.1538324293462746, + "grad_norm": 1.5306216478347778, + "learning_rate": 4.71371186048324e-05, + "loss": 5.0694, + "step": 25866 + }, + { + "epoch": 0.15383837662955563, + "grad_norm": 1.5422348976135254, + "learning_rate": 4.713690155501581e-05, + "loss": 5.1864, + "step": 25867 + }, + { + "epoch": 0.15384432391283662, + "grad_norm": 1.5703792572021484, + "learning_rate": 4.7136684497471444e-05, + "loss": 5.1686, + "step": 25868 + }, + { + "epoch": 0.1538502711961176, + "grad_norm": 1.6716407537460327, + "learning_rate": 4.7136467432199396e-05, + "loss": 5.2515, + "step": 25869 + }, + { + "epoch": 0.15385621847939862, + "grad_norm": 1.5796306133270264, + "learning_rate": 4.713625035919974e-05, + "loss": 5.0068, + "step": 25870 + }, + { + "epoch": 0.1538621657626796, + "grad_norm": 1.6445972919464111, + "learning_rate": 4.713603327847254e-05, + "loss": 4.9683, + "step": 25871 + }, + { + "epoch": 0.1538681130459606, + "grad_norm": 1.588665246963501, + "learning_rate": 4.713581619001788e-05, + "loss": 4.9913, + "step": 25872 + }, + { + "epoch": 0.1538740603292416, + "grad_norm": 1.5067355632781982, + "learning_rate": 4.713559909383584e-05, + "loss": 5.1648, + "step": 25873 + }, + { + "epoch": 0.1538800076125226, + "grad_norm": 1.6328977346420288, + "learning_rate": 4.713538198992649e-05, + "loss": 4.9316, + "step": 25874 + }, + { + "epoch": 0.1538859548958036, + "grad_norm": 1.6389905214309692, + "learning_rate": 4.7135164878289903e-05, + "loss": 5.1095, + "step": 25875 + }, + { + "epoch": 0.1538919021790846, + "grad_norm": 1.5004593133926392, + "learning_rate": 4.713494775892616e-05, + "loss": 4.8718, + "step": 25876 + }, + { + "epoch": 0.1538978494623656, + "grad_norm": 1.7928706407546997, + "learning_rate": 4.713473063183534e-05, + "loss": 5.1074, + "step": 25877 + }, + { + "epoch": 0.15390379674564658, + "grad_norm": 1.4132859706878662, + "learning_rate": 4.713451349701751e-05, + "loss": 5.2395, + "step": 25878 + }, + { + "epoch": 0.1539097440289276, + "grad_norm": 1.7291496992111206, + "learning_rate": 4.7134296354472754e-05, + "loss": 5.2648, + "step": 25879 + }, + { + "epoch": 0.15391569131220859, + "grad_norm": 1.6724679470062256, + "learning_rate": 4.713407920420114e-05, + "loss": 5.2074, + "step": 25880 + }, + { + "epoch": 0.15392163859548957, + "grad_norm": 1.5899326801300049, + "learning_rate": 4.713386204620275e-05, + "loss": 5.0018, + "step": 25881 + }, + { + "epoch": 0.1539275858787706, + "grad_norm": 1.5092980861663818, + "learning_rate": 4.7133644880477656e-05, + "loss": 5.2861, + "step": 25882 + }, + { + "epoch": 0.15393353316205158, + "grad_norm": 1.5518758296966553, + "learning_rate": 4.7133427707025935e-05, + "loss": 5.2302, + "step": 25883 + }, + { + "epoch": 0.15393948044533257, + "grad_norm": 1.8629082441329956, + "learning_rate": 4.713321052584766e-05, + "loss": 4.8252, + "step": 25884 + }, + { + "epoch": 0.15394542772861358, + "grad_norm": 1.618132472038269, + "learning_rate": 4.713299333694291e-05, + "loss": 5.0853, + "step": 25885 + }, + { + "epoch": 0.15395137501189457, + "grad_norm": 1.494831919670105, + "learning_rate": 4.713277614031177e-05, + "loss": 5.1517, + "step": 25886 + }, + { + "epoch": 0.15395732229517556, + "grad_norm": 1.6972736120224, + "learning_rate": 4.71325589359543e-05, + "loss": 5.3104, + "step": 25887 + }, + { + "epoch": 0.15396326957845657, + "grad_norm": 1.8251672983169556, + "learning_rate": 4.713234172387058e-05, + "loss": 5.0705, + "step": 25888 + }, + { + "epoch": 0.15396921686173756, + "grad_norm": 1.4835257530212402, + "learning_rate": 4.7132124504060696e-05, + "loss": 4.5481, + "step": 25889 + }, + { + "epoch": 0.15397516414501855, + "grad_norm": 1.447768211364746, + "learning_rate": 4.713190727652471e-05, + "loss": 4.7023, + "step": 25890 + }, + { + "epoch": 0.15398111142829957, + "grad_norm": 1.581663727760315, + "learning_rate": 4.71316900412627e-05, + "loss": 4.5446, + "step": 25891 + }, + { + "epoch": 0.15398705871158055, + "grad_norm": 1.5457055568695068, + "learning_rate": 4.7131472798274754e-05, + "loss": 4.8265, + "step": 25892 + }, + { + "epoch": 0.15399300599486154, + "grad_norm": 1.5043967962265015, + "learning_rate": 4.713125554756093e-05, + "loss": 5.2398, + "step": 25893 + }, + { + "epoch": 0.15399895327814253, + "grad_norm": 1.3700400590896606, + "learning_rate": 4.7131038289121324e-05, + "loss": 4.9516, + "step": 25894 + }, + { + "epoch": 0.15400490056142355, + "grad_norm": 1.4897541999816895, + "learning_rate": 4.713082102295599e-05, + "loss": 4.9884, + "step": 25895 + }, + { + "epoch": 0.15401084784470453, + "grad_norm": 1.560887098312378, + "learning_rate": 4.713060374906503e-05, + "loss": 4.8639, + "step": 25896 + }, + { + "epoch": 0.15401679512798552, + "grad_norm": 1.542069911956787, + "learning_rate": 4.7130386467448495e-05, + "loss": 4.7692, + "step": 25897 + }, + { + "epoch": 0.15402274241126654, + "grad_norm": 1.7924245595932007, + "learning_rate": 4.7130169178106465e-05, + "loss": 4.6172, + "step": 25898 + }, + { + "epoch": 0.15402868969454753, + "grad_norm": 1.4520066976547241, + "learning_rate": 4.7129951881039033e-05, + "loss": 4.9518, + "step": 25899 + }, + { + "epoch": 0.15403463697782852, + "grad_norm": 1.4653339385986328, + "learning_rate": 4.7129734576246255e-05, + "loss": 5.0738, + "step": 25900 + }, + { + "epoch": 0.15404058426110953, + "grad_norm": 1.2604494094848633, + "learning_rate": 4.7129517263728224e-05, + "loss": 5.0677, + "step": 25901 + }, + { + "epoch": 0.15404653154439052, + "grad_norm": 1.4956402778625488, + "learning_rate": 4.7129299943485e-05, + "loss": 5.0547, + "step": 25902 + }, + { + "epoch": 0.1540524788276715, + "grad_norm": 1.3395041227340698, + "learning_rate": 4.712908261551667e-05, + "loss": 4.9042, + "step": 25903 + }, + { + "epoch": 0.15405842611095252, + "grad_norm": 1.4592647552490234, + "learning_rate": 4.7128865279823304e-05, + "loss": 4.8363, + "step": 25904 + }, + { + "epoch": 0.1540643733942335, + "grad_norm": 1.339340329170227, + "learning_rate": 4.712864793640498e-05, + "loss": 4.8916, + "step": 25905 + }, + { + "epoch": 0.1540703206775145, + "grad_norm": 1.5001643896102905, + "learning_rate": 4.7128430585261775e-05, + "loss": 5.1015, + "step": 25906 + }, + { + "epoch": 0.15407626796079552, + "grad_norm": 1.3876299858093262, + "learning_rate": 4.7128213226393756e-05, + "loss": 5.0368, + "step": 25907 + }, + { + "epoch": 0.1540822152440765, + "grad_norm": 1.4904955625534058, + "learning_rate": 4.712799585980101e-05, + "loss": 5.0785, + "step": 25908 + }, + { + "epoch": 0.1540881625273575, + "grad_norm": 1.4284460544586182, + "learning_rate": 4.712777848548362e-05, + "loss": 5.0015, + "step": 25909 + }, + { + "epoch": 0.1540941098106385, + "grad_norm": 1.4823048114776611, + "learning_rate": 4.712756110344164e-05, + "loss": 4.9969, + "step": 25910 + }, + { + "epoch": 0.1541000570939195, + "grad_norm": 1.5989056825637817, + "learning_rate": 4.712734371367516e-05, + "loss": 5.4401, + "step": 25911 + }, + { + "epoch": 0.15410600437720048, + "grad_norm": 1.475415587425232, + "learning_rate": 4.7127126316184256e-05, + "loss": 5.3553, + "step": 25912 + }, + { + "epoch": 0.1541119516604815, + "grad_norm": 1.3556677103042603, + "learning_rate": 4.712690891096899e-05, + "loss": 5.4228, + "step": 25913 + }, + { + "epoch": 0.1541178989437625, + "grad_norm": 1.4386837482452393, + "learning_rate": 4.712669149802946e-05, + "loss": 5.387, + "step": 25914 + }, + { + "epoch": 0.15412384622704348, + "grad_norm": 1.4365500211715698, + "learning_rate": 4.712647407736573e-05, + "loss": 4.8597, + "step": 25915 + }, + { + "epoch": 0.1541297935103245, + "grad_norm": 1.5703059434890747, + "learning_rate": 4.712625664897788e-05, + "loss": 5.2659, + "step": 25916 + }, + { + "epoch": 0.15413574079360548, + "grad_norm": 1.5057390928268433, + "learning_rate": 4.712603921286597e-05, + "loss": 4.9931, + "step": 25917 + }, + { + "epoch": 0.15414168807688647, + "grad_norm": 1.2982683181762695, + "learning_rate": 4.712582176903009e-05, + "loss": 5.5226, + "step": 25918 + }, + { + "epoch": 0.15414763536016748, + "grad_norm": 1.4120944738388062, + "learning_rate": 4.712560431747032e-05, + "loss": 5.4037, + "step": 25919 + }, + { + "epoch": 0.15415358264344847, + "grad_norm": 1.3634661436080933, + "learning_rate": 4.712538685818673e-05, + "loss": 5.521, + "step": 25920 + }, + { + "epoch": 0.15415952992672946, + "grad_norm": 1.3352160453796387, + "learning_rate": 4.7125169391179394e-05, + "loss": 5.2938, + "step": 25921 + }, + { + "epoch": 0.15416547721001048, + "grad_norm": 1.3874114751815796, + "learning_rate": 4.712495191644839e-05, + "loss": 5.272, + "step": 25922 + }, + { + "epoch": 0.15417142449329146, + "grad_norm": 1.5225552320480347, + "learning_rate": 4.712473443399379e-05, + "loss": 5.3211, + "step": 25923 + }, + { + "epoch": 0.15417737177657245, + "grad_norm": 1.4493452310562134, + "learning_rate": 4.712451694381568e-05, + "loss": 5.2799, + "step": 25924 + }, + { + "epoch": 0.15418331905985347, + "grad_norm": 1.3240947723388672, + "learning_rate": 4.712429944591413e-05, + "loss": 5.441, + "step": 25925 + }, + { + "epoch": 0.15418926634313446, + "grad_norm": 1.2881836891174316, + "learning_rate": 4.712408194028921e-05, + "loss": 5.4478, + "step": 25926 + }, + { + "epoch": 0.15419521362641544, + "grad_norm": 1.4163159132003784, + "learning_rate": 4.712386442694101e-05, + "loss": 5.252, + "step": 25927 + }, + { + "epoch": 0.15420116090969646, + "grad_norm": 1.4597609043121338, + "learning_rate": 4.712364690586959e-05, + "loss": 5.4359, + "step": 25928 + }, + { + "epoch": 0.15420710819297745, + "grad_norm": 1.31305992603302, + "learning_rate": 4.7123429377075036e-05, + "loss": 5.3141, + "step": 25929 + }, + { + "epoch": 0.15421305547625844, + "grad_norm": 1.1765657663345337, + "learning_rate": 4.712321184055742e-05, + "loss": 5.1828, + "step": 25930 + }, + { + "epoch": 0.15421900275953945, + "grad_norm": 1.3116487264633179, + "learning_rate": 4.7122994296316824e-05, + "loss": 5.4107, + "step": 25931 + }, + { + "epoch": 0.15422495004282044, + "grad_norm": 1.3636351823806763, + "learning_rate": 4.712277674435331e-05, + "loss": 5.3273, + "step": 25932 + }, + { + "epoch": 0.15423089732610143, + "grad_norm": 1.4326391220092773, + "learning_rate": 4.712255918466697e-05, + "loss": 5.4123, + "step": 25933 + }, + { + "epoch": 0.15423684460938245, + "grad_norm": 1.3996350765228271, + "learning_rate": 4.712234161725788e-05, + "loss": 5.3111, + "step": 25934 + }, + { + "epoch": 0.15424279189266343, + "grad_norm": 1.5358290672302246, + "learning_rate": 4.712212404212609e-05, + "loss": 5.4522, + "step": 25935 + }, + { + "epoch": 0.15424873917594442, + "grad_norm": 1.3900970220565796, + "learning_rate": 4.7121906459271716e-05, + "loss": 5.6671, + "step": 25936 + }, + { + "epoch": 0.15425468645922544, + "grad_norm": 1.5113252401351929, + "learning_rate": 4.71216888686948e-05, + "loss": 5.0736, + "step": 25937 + }, + { + "epoch": 0.15426063374250643, + "grad_norm": 1.434477686882019, + "learning_rate": 4.7121471270395434e-05, + "loss": 5.259, + "step": 25938 + }, + { + "epoch": 0.1542665810257874, + "grad_norm": 1.4467335939407349, + "learning_rate": 4.712125366437369e-05, + "loss": 5.3382, + "step": 25939 + }, + { + "epoch": 0.15427252830906843, + "grad_norm": 1.6080671548843384, + "learning_rate": 4.712103605062965e-05, + "loss": 5.1767, + "step": 25940 + }, + { + "epoch": 0.15427847559234942, + "grad_norm": 1.497689962387085, + "learning_rate": 4.712081842916338e-05, + "loss": 4.884, + "step": 25941 + }, + { + "epoch": 0.1542844228756304, + "grad_norm": 1.691441535949707, + "learning_rate": 4.712060079997496e-05, + "loss": 5.2065, + "step": 25942 + }, + { + "epoch": 0.15429037015891142, + "grad_norm": 1.4759876728057861, + "learning_rate": 4.712038316306447e-05, + "loss": 5.17, + "step": 25943 + }, + { + "epoch": 0.1542963174421924, + "grad_norm": 1.4109833240509033, + "learning_rate": 4.712016551843198e-05, + "loss": 5.1986, + "step": 25944 + }, + { + "epoch": 0.1543022647254734, + "grad_norm": 1.4481924772262573, + "learning_rate": 4.7119947866077566e-05, + "loss": 4.9301, + "step": 25945 + }, + { + "epoch": 0.15430821200875441, + "grad_norm": 1.4721769094467163, + "learning_rate": 4.711973020600131e-05, + "loss": 5.123, + "step": 25946 + }, + { + "epoch": 0.1543141592920354, + "grad_norm": 1.6822638511657715, + "learning_rate": 4.711951253820329e-05, + "loss": 5.122, + "step": 25947 + }, + { + "epoch": 0.1543201065753164, + "grad_norm": 1.6047651767730713, + "learning_rate": 4.711929486268357e-05, + "loss": 5.1417, + "step": 25948 + }, + { + "epoch": 0.1543260538585974, + "grad_norm": 1.4773536920547485, + "learning_rate": 4.711907717944224e-05, + "loss": 4.9562, + "step": 25949 + }, + { + "epoch": 0.1543320011418784, + "grad_norm": 1.4373167753219604, + "learning_rate": 4.711885948847936e-05, + "loss": 5.3515, + "step": 25950 + }, + { + "epoch": 0.15433794842515938, + "grad_norm": 1.4517033100128174, + "learning_rate": 4.711864178979501e-05, + "loss": 5.0668, + "step": 25951 + }, + { + "epoch": 0.15434389570844037, + "grad_norm": 1.7582489252090454, + "learning_rate": 4.711842408338929e-05, + "loss": 4.7104, + "step": 25952 + }, + { + "epoch": 0.1543498429917214, + "grad_norm": 1.6162217855453491, + "learning_rate": 4.711820636926224e-05, + "loss": 4.7747, + "step": 25953 + }, + { + "epoch": 0.15435579027500237, + "grad_norm": 1.7326339483261108, + "learning_rate": 4.711798864741396e-05, + "loss": 4.818, + "step": 25954 + }, + { + "epoch": 0.15436173755828336, + "grad_norm": 1.642146110534668, + "learning_rate": 4.711777091784452e-05, + "loss": 4.7517, + "step": 25955 + }, + { + "epoch": 0.15436768484156438, + "grad_norm": 1.5122802257537842, + "learning_rate": 4.711755318055399e-05, + "loss": 5.0139, + "step": 25956 + }, + { + "epoch": 0.15437363212484537, + "grad_norm": 1.7299772500991821, + "learning_rate": 4.711733543554245e-05, + "loss": 4.9988, + "step": 25957 + }, + { + "epoch": 0.15437957940812636, + "grad_norm": 1.5812711715698242, + "learning_rate": 4.711711768280998e-05, + "loss": 4.7134, + "step": 25958 + }, + { + "epoch": 0.15438552669140737, + "grad_norm": 1.5953545570373535, + "learning_rate": 4.711689992235665e-05, + "loss": 4.9644, + "step": 25959 + }, + { + "epoch": 0.15439147397468836, + "grad_norm": 1.7964719533920288, + "learning_rate": 4.711668215418255e-05, + "loss": 4.8476, + "step": 25960 + }, + { + "epoch": 0.15439742125796935, + "grad_norm": 1.6458512544631958, + "learning_rate": 4.711646437828773e-05, + "loss": 4.8117, + "step": 25961 + }, + { + "epoch": 0.15440336854125036, + "grad_norm": 1.4821311235427856, + "learning_rate": 4.711624659467229e-05, + "loss": 4.8647, + "step": 25962 + }, + { + "epoch": 0.15440931582453135, + "grad_norm": 1.4640769958496094, + "learning_rate": 4.711602880333629e-05, + "loss": 5.0038, + "step": 25963 + }, + { + "epoch": 0.15441526310781234, + "grad_norm": 1.7705153226852417, + "learning_rate": 4.711581100427981e-05, + "loss": 5.12, + "step": 25964 + }, + { + "epoch": 0.15442121039109336, + "grad_norm": 1.7333801984786987, + "learning_rate": 4.711559319750294e-05, + "loss": 4.9785, + "step": 25965 + }, + { + "epoch": 0.15442715767437434, + "grad_norm": 1.6170109510421753, + "learning_rate": 4.711537538300574e-05, + "loss": 4.9764, + "step": 25966 + }, + { + "epoch": 0.15443310495765533, + "grad_norm": 1.4895650148391724, + "learning_rate": 4.7115157560788295e-05, + "loss": 4.5585, + "step": 25967 + }, + { + "epoch": 0.15443905224093635, + "grad_norm": 1.6678147315979004, + "learning_rate": 4.711493973085067e-05, + "loss": 4.7897, + "step": 25968 + }, + { + "epoch": 0.15444499952421734, + "grad_norm": 1.537511944770813, + "learning_rate": 4.7114721893192945e-05, + "loss": 4.8845, + "step": 25969 + }, + { + "epoch": 0.15445094680749832, + "grad_norm": 1.7167041301727295, + "learning_rate": 4.711450404781521e-05, + "loss": 4.9126, + "step": 25970 + }, + { + "epoch": 0.15445689409077934, + "grad_norm": 1.763170599937439, + "learning_rate": 4.711428619471752e-05, + "loss": 4.6864, + "step": 25971 + }, + { + "epoch": 0.15446284137406033, + "grad_norm": 1.4620569944381714, + "learning_rate": 4.7114068333899964e-05, + "loss": 4.744, + "step": 25972 + }, + { + "epoch": 0.15446878865734132, + "grad_norm": 1.6106908321380615, + "learning_rate": 4.711385046536262e-05, + "loss": 5.2037, + "step": 25973 + }, + { + "epoch": 0.15447473594062233, + "grad_norm": 2.173444986343384, + "learning_rate": 4.711363258910556e-05, + "loss": 4.8086, + "step": 25974 + }, + { + "epoch": 0.15448068322390332, + "grad_norm": 2.0350496768951416, + "learning_rate": 4.711341470512885e-05, + "loss": 4.7291, + "step": 25975 + }, + { + "epoch": 0.1544866305071843, + "grad_norm": 1.9148650169372559, + "learning_rate": 4.7113196813432584e-05, + "loss": 4.7627, + "step": 25976 + }, + { + "epoch": 0.15449257779046532, + "grad_norm": 1.9944121837615967, + "learning_rate": 4.711297891401683e-05, + "loss": 4.8124, + "step": 25977 + }, + { + "epoch": 0.1544985250737463, + "grad_norm": 1.515162706375122, + "learning_rate": 4.7112761006881655e-05, + "loss": 4.8781, + "step": 25978 + }, + { + "epoch": 0.1545044723570273, + "grad_norm": 1.7549412250518799, + "learning_rate": 4.711254309202715e-05, + "loss": 4.9173, + "step": 25979 + }, + { + "epoch": 0.15451041964030832, + "grad_norm": 1.5914033651351929, + "learning_rate": 4.711232516945338e-05, + "loss": 5.012, + "step": 25980 + }, + { + "epoch": 0.1545163669235893, + "grad_norm": 1.7436847686767578, + "learning_rate": 4.711210723916043e-05, + "loss": 4.4552, + "step": 25981 + }, + { + "epoch": 0.1545223142068703, + "grad_norm": 1.5679067373275757, + "learning_rate": 4.711188930114837e-05, + "loss": 4.9158, + "step": 25982 + }, + { + "epoch": 0.1545282614901513, + "grad_norm": 1.5164258480072021, + "learning_rate": 4.711167135541727e-05, + "loss": 4.2524, + "step": 25983 + }, + { + "epoch": 0.1545342087734323, + "grad_norm": 1.7215555906295776, + "learning_rate": 4.711145340196723e-05, + "loss": 4.4035, + "step": 25984 + }, + { + "epoch": 0.15454015605671328, + "grad_norm": 1.8671064376831055, + "learning_rate": 4.7111235440798303e-05, + "loss": 4.6875, + "step": 25985 + }, + { + "epoch": 0.1545461033399943, + "grad_norm": 1.760772705078125, + "learning_rate": 4.7111017471910566e-05, + "loss": 4.7645, + "step": 25986 + }, + { + "epoch": 0.1545520506232753, + "grad_norm": 1.8126411437988281, + "learning_rate": 4.7110799495304115e-05, + "loss": 5.1524, + "step": 25987 + }, + { + "epoch": 0.15455799790655628, + "grad_norm": 1.6593974828720093, + "learning_rate": 4.7110581510979e-05, + "loss": 5.1902, + "step": 25988 + }, + { + "epoch": 0.1545639451898373, + "grad_norm": 1.721921443939209, + "learning_rate": 4.711036351893532e-05, + "loss": 5.0316, + "step": 25989 + }, + { + "epoch": 0.15456989247311828, + "grad_norm": 2.030829668045044, + "learning_rate": 4.7110145519173135e-05, + "loss": 4.9087, + "step": 25990 + }, + { + "epoch": 0.15457583975639927, + "grad_norm": 1.6568117141723633, + "learning_rate": 4.710992751169252e-05, + "loss": 4.3814, + "step": 25991 + }, + { + "epoch": 0.15458178703968029, + "grad_norm": 1.667718768119812, + "learning_rate": 4.7109709496493565e-05, + "loss": 4.8191, + "step": 25992 + }, + { + "epoch": 0.15458773432296127, + "grad_norm": 1.6483817100524902, + "learning_rate": 4.710949147357634e-05, + "loss": 5.055, + "step": 25993 + }, + { + "epoch": 0.15459368160624226, + "grad_norm": 1.703580617904663, + "learning_rate": 4.710927344294092e-05, + "loss": 5.0259, + "step": 25994 + }, + { + "epoch": 0.15459962888952328, + "grad_norm": 1.512531042098999, + "learning_rate": 4.710905540458737e-05, + "loss": 5.1221, + "step": 25995 + }, + { + "epoch": 0.15460557617280427, + "grad_norm": 1.4010028839111328, + "learning_rate": 4.710883735851579e-05, + "loss": 5.2263, + "step": 25996 + }, + { + "epoch": 0.15461152345608525, + "grad_norm": 1.694629192352295, + "learning_rate": 4.710861930472624e-05, + "loss": 4.9348, + "step": 25997 + }, + { + "epoch": 0.15461747073936627, + "grad_norm": 1.5974243879318237, + "learning_rate": 4.710840124321879e-05, + "loss": 5.1262, + "step": 25998 + }, + { + "epoch": 0.15462341802264726, + "grad_norm": 1.6333894729614258, + "learning_rate": 4.7108183173993535e-05, + "loss": 4.6557, + "step": 25999 + }, + { + "epoch": 0.15462936530592825, + "grad_norm": 1.660767674446106, + "learning_rate": 4.710796509705054e-05, + "loss": 4.9764, + "step": 26000 + }, + { + "epoch": 0.15463531258920926, + "grad_norm": 1.5514689683914185, + "learning_rate": 4.710774701238989e-05, + "loss": 4.8895, + "step": 26001 + }, + { + "epoch": 0.15464125987249025, + "grad_norm": 1.7753626108169556, + "learning_rate": 4.7107528920011645e-05, + "loss": 5.1251, + "step": 26002 + }, + { + "epoch": 0.15464720715577124, + "grad_norm": 1.5963994264602661, + "learning_rate": 4.7107310819915895e-05, + "loss": 4.9678, + "step": 26003 + }, + { + "epoch": 0.15465315443905225, + "grad_norm": 1.7098819017410278, + "learning_rate": 4.7107092712102706e-05, + "loss": 4.7313, + "step": 26004 + }, + { + "epoch": 0.15465910172233324, + "grad_norm": 1.7636046409606934, + "learning_rate": 4.710687459657216e-05, + "loss": 4.7752, + "step": 26005 + }, + { + "epoch": 0.15466504900561423, + "grad_norm": 1.5514246225357056, + "learning_rate": 4.7106656473324336e-05, + "loss": 4.6835, + "step": 26006 + }, + { + "epoch": 0.15467099628889525, + "grad_norm": 1.6040410995483398, + "learning_rate": 4.7106438342359303e-05, + "loss": 4.8096, + "step": 26007 + }, + { + "epoch": 0.15467694357217623, + "grad_norm": 1.622213363647461, + "learning_rate": 4.7106220203677144e-05, + "loss": 5.0896, + "step": 26008 + }, + { + "epoch": 0.15468289085545722, + "grad_norm": 1.6227675676345825, + "learning_rate": 4.710600205727793e-05, + "loss": 5.0895, + "step": 26009 + }, + { + "epoch": 0.1546888381387382, + "grad_norm": 1.6498078107833862, + "learning_rate": 4.710578390316174e-05, + "loss": 4.8625, + "step": 26010 + }, + { + "epoch": 0.15469478542201923, + "grad_norm": 1.6175272464752197, + "learning_rate": 4.710556574132865e-05, + "loss": 4.9729, + "step": 26011 + }, + { + "epoch": 0.15470073270530021, + "grad_norm": 1.5892902612686157, + "learning_rate": 4.7105347571778735e-05, + "loss": 4.755, + "step": 26012 + }, + { + "epoch": 0.1547066799885812, + "grad_norm": 1.4750880002975464, + "learning_rate": 4.710512939451207e-05, + "loss": 4.7497, + "step": 26013 + }, + { + "epoch": 0.15471262727186222, + "grad_norm": 1.5363775491714478, + "learning_rate": 4.710491120952874e-05, + "loss": 5.1039, + "step": 26014 + }, + { + "epoch": 0.1547185745551432, + "grad_norm": 1.5225108861923218, + "learning_rate": 4.71046930168288e-05, + "loss": 4.782, + "step": 26015 + }, + { + "epoch": 0.1547245218384242, + "grad_norm": 1.6348788738250732, + "learning_rate": 4.7104474816412345e-05, + "loss": 4.9252, + "step": 26016 + }, + { + "epoch": 0.1547304691217052, + "grad_norm": 1.6000639200210571, + "learning_rate": 4.7104256608279454e-05, + "loss": 4.9286, + "step": 26017 + }, + { + "epoch": 0.1547364164049862, + "grad_norm": 1.4785354137420654, + "learning_rate": 4.710403839243018e-05, + "loss": 4.7383, + "step": 26018 + }, + { + "epoch": 0.1547423636882672, + "grad_norm": 1.548176884651184, + "learning_rate": 4.710382016886463e-05, + "loss": 4.7526, + "step": 26019 + }, + { + "epoch": 0.1547483109715482, + "grad_norm": 1.537049651145935, + "learning_rate": 4.710360193758287e-05, + "loss": 4.6532, + "step": 26020 + }, + { + "epoch": 0.1547542582548292, + "grad_norm": 1.4506211280822754, + "learning_rate": 4.710338369858495e-05, + "loss": 5.1028, + "step": 26021 + }, + { + "epoch": 0.15476020553811018, + "grad_norm": 1.4539066553115845, + "learning_rate": 4.710316545187098e-05, + "loss": 5.0396, + "step": 26022 + }, + { + "epoch": 0.1547661528213912, + "grad_norm": 1.408674716949463, + "learning_rate": 4.7102947197441016e-05, + "loss": 5.2779, + "step": 26023 + }, + { + "epoch": 0.15477210010467218, + "grad_norm": 1.5732898712158203, + "learning_rate": 4.710272893529515e-05, + "loss": 5.1519, + "step": 26024 + }, + { + "epoch": 0.15477804738795317, + "grad_norm": 1.5260519981384277, + "learning_rate": 4.710251066543344e-05, + "loss": 5.056, + "step": 26025 + }, + { + "epoch": 0.1547839946712342, + "grad_norm": 1.4518004655838013, + "learning_rate": 4.710229238785598e-05, + "loss": 4.9322, + "step": 26026 + }, + { + "epoch": 0.15478994195451518, + "grad_norm": 1.6032034158706665, + "learning_rate": 4.7102074102562835e-05, + "loss": 5.0368, + "step": 26027 + }, + { + "epoch": 0.15479588923779616, + "grad_norm": 1.6396820545196533, + "learning_rate": 4.7101855809554085e-05, + "loss": 4.4808, + "step": 26028 + }, + { + "epoch": 0.15480183652107718, + "grad_norm": 1.6207085847854614, + "learning_rate": 4.710163750882981e-05, + "loss": 4.5206, + "step": 26029 + }, + { + "epoch": 0.15480778380435817, + "grad_norm": 1.5769189596176147, + "learning_rate": 4.7101419200390073e-05, + "loss": 4.4192, + "step": 26030 + }, + { + "epoch": 0.15481373108763916, + "grad_norm": 1.4689233303070068, + "learning_rate": 4.710120088423496e-05, + "loss": 4.8726, + "step": 26031 + }, + { + "epoch": 0.15481967837092017, + "grad_norm": 1.3557206392288208, + "learning_rate": 4.710098256036455e-05, + "loss": 5.1076, + "step": 26032 + }, + { + "epoch": 0.15482562565420116, + "grad_norm": 1.561497688293457, + "learning_rate": 4.710076422877891e-05, + "loss": 4.6845, + "step": 26033 + }, + { + "epoch": 0.15483157293748215, + "grad_norm": 1.6871447563171387, + "learning_rate": 4.710054588947813e-05, + "loss": 4.8231, + "step": 26034 + }, + { + "epoch": 0.15483752022076316, + "grad_norm": 1.7153793573379517, + "learning_rate": 4.710032754246228e-05, + "loss": 4.767, + "step": 26035 + }, + { + "epoch": 0.15484346750404415, + "grad_norm": 1.6859761476516724, + "learning_rate": 4.710010918773142e-05, + "loss": 4.6774, + "step": 26036 + }, + { + "epoch": 0.15484941478732514, + "grad_norm": 1.4598466157913208, + "learning_rate": 4.709989082528565e-05, + "loss": 4.8141, + "step": 26037 + }, + { + "epoch": 0.15485536207060616, + "grad_norm": 1.572952389717102, + "learning_rate": 4.709967245512504e-05, + "loss": 5.0215, + "step": 26038 + }, + { + "epoch": 0.15486130935388714, + "grad_norm": 1.6656177043914795, + "learning_rate": 4.7099454077249655e-05, + "loss": 4.5755, + "step": 26039 + }, + { + "epoch": 0.15486725663716813, + "grad_norm": 1.4872766733169556, + "learning_rate": 4.709923569165958e-05, + "loss": 4.9086, + "step": 26040 + }, + { + "epoch": 0.15487320392044915, + "grad_norm": 1.603215217590332, + "learning_rate": 4.70990172983549e-05, + "loss": 4.8528, + "step": 26041 + }, + { + "epoch": 0.15487915120373014, + "grad_norm": 1.5077006816864014, + "learning_rate": 4.7098798897335664e-05, + "loss": 4.8544, + "step": 26042 + }, + { + "epoch": 0.15488509848701112, + "grad_norm": 1.515825629234314, + "learning_rate": 4.709858048860197e-05, + "loss": 4.7793, + "step": 26043 + }, + { + "epoch": 0.15489104577029214, + "grad_norm": 1.472776174545288, + "learning_rate": 4.7098362072153904e-05, + "loss": 4.8047, + "step": 26044 + }, + { + "epoch": 0.15489699305357313, + "grad_norm": 1.5982736349105835, + "learning_rate": 4.709814364799151e-05, + "loss": 4.9911, + "step": 26045 + }, + { + "epoch": 0.15490294033685412, + "grad_norm": 1.3136348724365234, + "learning_rate": 4.709792521611489e-05, + "loss": 5.3009, + "step": 26046 + }, + { + "epoch": 0.15490888762013513, + "grad_norm": 1.6178503036499023, + "learning_rate": 4.709770677652412e-05, + "loss": 4.7873, + "step": 26047 + }, + { + "epoch": 0.15491483490341612, + "grad_norm": 1.544202446937561, + "learning_rate": 4.709748832921926e-05, + "loss": 4.645, + "step": 26048 + }, + { + "epoch": 0.1549207821866971, + "grad_norm": 1.359904408454895, + "learning_rate": 4.70972698742004e-05, + "loss": 5.0246, + "step": 26049 + }, + { + "epoch": 0.15492672946997812, + "grad_norm": 1.4320893287658691, + "learning_rate": 4.7097051411467606e-05, + "loss": 5.0227, + "step": 26050 + }, + { + "epoch": 0.1549326767532591, + "grad_norm": 1.7229030132293701, + "learning_rate": 4.7096832941020963e-05, + "loss": 5.2792, + "step": 26051 + }, + { + "epoch": 0.1549386240365401, + "grad_norm": 1.672554850578308, + "learning_rate": 4.709661446286054e-05, + "loss": 4.9227, + "step": 26052 + }, + { + "epoch": 0.15494457131982112, + "grad_norm": 1.5159001350402832, + "learning_rate": 4.709639597698642e-05, + "loss": 4.7464, + "step": 26053 + }, + { + "epoch": 0.1549505186031021, + "grad_norm": 1.5735573768615723, + "learning_rate": 4.7096177483398676e-05, + "loss": 5.2281, + "step": 26054 + }, + { + "epoch": 0.1549564658863831, + "grad_norm": 1.4174078702926636, + "learning_rate": 4.709595898209739e-05, + "loss": 5.138, + "step": 26055 + }, + { + "epoch": 0.1549624131696641, + "grad_norm": 1.3748446702957153, + "learning_rate": 4.7095740473082626e-05, + "loss": 5.2084, + "step": 26056 + }, + { + "epoch": 0.1549683604529451, + "grad_norm": 1.5169907808303833, + "learning_rate": 4.709552195635447e-05, + "loss": 5.3272, + "step": 26057 + }, + { + "epoch": 0.15497430773622609, + "grad_norm": 1.6235400438308716, + "learning_rate": 4.7095303431912994e-05, + "loss": 5.2201, + "step": 26058 + }, + { + "epoch": 0.1549802550195071, + "grad_norm": 1.571418046951294, + "learning_rate": 4.709508489975828e-05, + "loss": 5.3584, + "step": 26059 + }, + { + "epoch": 0.1549862023027881, + "grad_norm": 1.690524697303772, + "learning_rate": 4.70948663598904e-05, + "loss": 5.3091, + "step": 26060 + }, + { + "epoch": 0.15499214958606908, + "grad_norm": 1.6778768301010132, + "learning_rate": 4.7094647812309424e-05, + "loss": 4.8765, + "step": 26061 + }, + { + "epoch": 0.1549980968693501, + "grad_norm": 1.6365214586257935, + "learning_rate": 4.709442925701544e-05, + "loss": 5.4826, + "step": 26062 + }, + { + "epoch": 0.15500404415263108, + "grad_norm": 1.4799535274505615, + "learning_rate": 4.709421069400851e-05, + "loss": 5.5668, + "step": 26063 + }, + { + "epoch": 0.15500999143591207, + "grad_norm": 1.5750006437301636, + "learning_rate": 4.7093992123288734e-05, + "loss": 5.235, + "step": 26064 + }, + { + "epoch": 0.15501593871919309, + "grad_norm": 1.8067607879638672, + "learning_rate": 4.7093773544856165e-05, + "loss": 5.2708, + "step": 26065 + }, + { + "epoch": 0.15502188600247407, + "grad_norm": 1.4780645370483398, + "learning_rate": 4.709355495871088e-05, + "loss": 5.1626, + "step": 26066 + }, + { + "epoch": 0.15502783328575506, + "grad_norm": 1.5702919960021973, + "learning_rate": 4.709333636485298e-05, + "loss": 5.2306, + "step": 26067 + }, + { + "epoch": 0.15503378056903608, + "grad_norm": 1.7658028602600098, + "learning_rate": 4.7093117763282515e-05, + "loss": 4.9352, + "step": 26068 + }, + { + "epoch": 0.15503972785231707, + "grad_norm": 1.69098961353302, + "learning_rate": 4.709289915399957e-05, + "loss": 4.7679, + "step": 26069 + }, + { + "epoch": 0.15504567513559805, + "grad_norm": 1.704026460647583, + "learning_rate": 4.709268053700423e-05, + "loss": 4.6209, + "step": 26070 + }, + { + "epoch": 0.15505162241887904, + "grad_norm": 1.4715653657913208, + "learning_rate": 4.709246191229656e-05, + "loss": 5.1664, + "step": 26071 + }, + { + "epoch": 0.15505756970216006, + "grad_norm": 1.5663673877716064, + "learning_rate": 4.7092243279876634e-05, + "loss": 5.3833, + "step": 26072 + }, + { + "epoch": 0.15506351698544105, + "grad_norm": 1.4647293090820312, + "learning_rate": 4.709202463974454e-05, + "loss": 5.2766, + "step": 26073 + }, + { + "epoch": 0.15506946426872203, + "grad_norm": 1.5950292348861694, + "learning_rate": 4.7091805991900344e-05, + "loss": 5.2686, + "step": 26074 + }, + { + "epoch": 0.15507541155200305, + "grad_norm": 1.593206524848938, + "learning_rate": 4.709158733634413e-05, + "loss": 4.9969, + "step": 26075 + }, + { + "epoch": 0.15508135883528404, + "grad_norm": 1.5884050130844116, + "learning_rate": 4.7091368673075975e-05, + "loss": 4.9804, + "step": 26076 + }, + { + "epoch": 0.15508730611856503, + "grad_norm": 1.5333365201950073, + "learning_rate": 4.709115000209594e-05, + "loss": 4.6808, + "step": 26077 + }, + { + "epoch": 0.15509325340184604, + "grad_norm": 1.4642858505249023, + "learning_rate": 4.7090931323404116e-05, + "loss": 4.6828, + "step": 26078 + }, + { + "epoch": 0.15509920068512703, + "grad_norm": 2.0302491188049316, + "learning_rate": 4.709071263700059e-05, + "loss": 4.5523, + "step": 26079 + }, + { + "epoch": 0.15510514796840802, + "grad_norm": 1.6798481941223145, + "learning_rate": 4.709049394288541e-05, + "loss": 5.1286, + "step": 26080 + }, + { + "epoch": 0.15511109525168904, + "grad_norm": 1.5074591636657715, + "learning_rate": 4.7090275241058676e-05, + "loss": 5.3037, + "step": 26081 + }, + { + "epoch": 0.15511704253497002, + "grad_norm": 1.7001566886901855, + "learning_rate": 4.709005653152044e-05, + "loss": 5.0217, + "step": 26082 + }, + { + "epoch": 0.155122989818251, + "grad_norm": 1.84412682056427, + "learning_rate": 4.708983781427081e-05, + "loss": 4.5579, + "step": 26083 + }, + { + "epoch": 0.15512893710153203, + "grad_norm": 1.770264744758606, + "learning_rate": 4.708961908930984e-05, + "loss": 4.7394, + "step": 26084 + }, + { + "epoch": 0.15513488438481302, + "grad_norm": 1.7658874988555908, + "learning_rate": 4.7089400356637615e-05, + "loss": 4.9278, + "step": 26085 + }, + { + "epoch": 0.155140831668094, + "grad_norm": 1.5701930522918701, + "learning_rate": 4.7089181616254204e-05, + "loss": 4.7227, + "step": 26086 + }, + { + "epoch": 0.15514677895137502, + "grad_norm": 1.5790002346038818, + "learning_rate": 4.708896286815969e-05, + "loss": 4.9207, + "step": 26087 + }, + { + "epoch": 0.155152726234656, + "grad_norm": 1.8411163091659546, + "learning_rate": 4.7088744112354146e-05, + "loss": 3.8647, + "step": 26088 + }, + { + "epoch": 0.155158673517937, + "grad_norm": 1.813536524772644, + "learning_rate": 4.708852534883765e-05, + "loss": 4.1148, + "step": 26089 + }, + { + "epoch": 0.155164620801218, + "grad_norm": 1.6122519969940186, + "learning_rate": 4.708830657761028e-05, + "loss": 4.9749, + "step": 26090 + }, + { + "epoch": 0.155170568084499, + "grad_norm": 1.9105713367462158, + "learning_rate": 4.70880877986721e-05, + "loss": 4.9895, + "step": 26091 + }, + { + "epoch": 0.15517651536778, + "grad_norm": 1.849824070930481, + "learning_rate": 4.7087869012023215e-05, + "loss": 5.5382, + "step": 26092 + }, + { + "epoch": 0.155182462651061, + "grad_norm": 2.346090793609619, + "learning_rate": 4.708765021766367e-05, + "loss": 5.6398, + "step": 26093 + }, + { + "epoch": 0.155188409934342, + "grad_norm": 1.8905435800552368, + "learning_rate": 4.7087431415593555e-05, + "loss": 5.6089, + "step": 26094 + }, + { + "epoch": 0.15519435721762298, + "grad_norm": 1.6987192630767822, + "learning_rate": 4.7087212605812944e-05, + "loss": 5.4127, + "step": 26095 + }, + { + "epoch": 0.155200304500904, + "grad_norm": 1.7915600538253784, + "learning_rate": 4.708699378832193e-05, + "loss": 4.9027, + "step": 26096 + }, + { + "epoch": 0.15520625178418498, + "grad_norm": 1.5736148357391357, + "learning_rate": 4.708677496312056e-05, + "loss": 5.1403, + "step": 26097 + }, + { + "epoch": 0.15521219906746597, + "grad_norm": 1.6473568677902222, + "learning_rate": 4.708655613020893e-05, + "loss": 5.0299, + "step": 26098 + }, + { + "epoch": 0.155218146350747, + "grad_norm": 1.733720064163208, + "learning_rate": 4.708633728958711e-05, + "loss": 5.0153, + "step": 26099 + }, + { + "epoch": 0.15522409363402798, + "grad_norm": 1.842244267463684, + "learning_rate": 4.708611844125518e-05, + "loss": 4.7, + "step": 26100 + }, + { + "epoch": 0.15523004091730896, + "grad_norm": 1.8227342367172241, + "learning_rate": 4.708589958521321e-05, + "loss": 4.4889, + "step": 26101 + }, + { + "epoch": 0.15523598820058998, + "grad_norm": 1.7300339937210083, + "learning_rate": 4.708568072146129e-05, + "loss": 5.0326, + "step": 26102 + }, + { + "epoch": 0.15524193548387097, + "grad_norm": 2.0854434967041016, + "learning_rate": 4.708546184999948e-05, + "loss": 5.6966, + "step": 26103 + }, + { + "epoch": 0.15524788276715196, + "grad_norm": 1.5393275022506714, + "learning_rate": 4.708524297082786e-05, + "loss": 5.5777, + "step": 26104 + }, + { + "epoch": 0.15525383005043297, + "grad_norm": 1.7765403985977173, + "learning_rate": 4.7085024083946514e-05, + "loss": 5.7488, + "step": 26105 + }, + { + "epoch": 0.15525977733371396, + "grad_norm": 1.668286919593811, + "learning_rate": 4.708480518935552e-05, + "loss": 5.3823, + "step": 26106 + }, + { + "epoch": 0.15526572461699495, + "grad_norm": 1.7656164169311523, + "learning_rate": 4.708458628705494e-05, + "loss": 5.1098, + "step": 26107 + }, + { + "epoch": 0.15527167190027596, + "grad_norm": 1.6078004837036133, + "learning_rate": 4.708436737704486e-05, + "loss": 4.8957, + "step": 26108 + }, + { + "epoch": 0.15527761918355695, + "grad_norm": 1.5649595260620117, + "learning_rate": 4.7084148459325364e-05, + "loss": 5.4546, + "step": 26109 + }, + { + "epoch": 0.15528356646683794, + "grad_norm": 1.7555382251739502, + "learning_rate": 4.7083929533896506e-05, + "loss": 5.6428, + "step": 26110 + }, + { + "epoch": 0.15528951375011896, + "grad_norm": 1.7282280921936035, + "learning_rate": 4.708371060075839e-05, + "loss": 5.4197, + "step": 26111 + }, + { + "epoch": 0.15529546103339995, + "grad_norm": 1.8044626712799072, + "learning_rate": 4.708349165991107e-05, + "loss": 5.4676, + "step": 26112 + }, + { + "epoch": 0.15530140831668093, + "grad_norm": 1.6488827466964722, + "learning_rate": 4.7083272711354634e-05, + "loss": 5.2725, + "step": 26113 + }, + { + "epoch": 0.15530735559996195, + "grad_norm": 1.9291478395462036, + "learning_rate": 4.7083053755089155e-05, + "loss": 5.2565, + "step": 26114 + }, + { + "epoch": 0.15531330288324294, + "grad_norm": 1.9248192310333252, + "learning_rate": 4.708283479111471e-05, + "loss": 5.2514, + "step": 26115 + }, + { + "epoch": 0.15531925016652393, + "grad_norm": 1.9327218532562256, + "learning_rate": 4.708261581943137e-05, + "loss": 5.0833, + "step": 26116 + }, + { + "epoch": 0.15532519744980494, + "grad_norm": 1.952842354774475, + "learning_rate": 4.708239684003923e-05, + "loss": 5.0989, + "step": 26117 + }, + { + "epoch": 0.15533114473308593, + "grad_norm": 1.7923991680145264, + "learning_rate": 4.7082177852938344e-05, + "loss": 4.8204, + "step": 26118 + }, + { + "epoch": 0.15533709201636692, + "grad_norm": 1.761819839477539, + "learning_rate": 4.708195885812881e-05, + "loss": 5.1966, + "step": 26119 + }, + { + "epoch": 0.15534303929964793, + "grad_norm": 2.061192035675049, + "learning_rate": 4.7081739855610674e-05, + "loss": 4.7254, + "step": 26120 + }, + { + "epoch": 0.15534898658292892, + "grad_norm": 1.7219372987747192, + "learning_rate": 4.708152084538404e-05, + "loss": 5.008, + "step": 26121 + }, + { + "epoch": 0.1553549338662099, + "grad_norm": 1.836690068244934, + "learning_rate": 4.708130182744898e-05, + "loss": 4.8645, + "step": 26122 + }, + { + "epoch": 0.15536088114949093, + "grad_norm": 1.6488652229309082, + "learning_rate": 4.708108280180556e-05, + "loss": 5.1588, + "step": 26123 + }, + { + "epoch": 0.15536682843277191, + "grad_norm": 1.7643523216247559, + "learning_rate": 4.708086376845386e-05, + "loss": 4.9774, + "step": 26124 + }, + { + "epoch": 0.1553727757160529, + "grad_norm": 1.7396107912063599, + "learning_rate": 4.7080644727393967e-05, + "loss": 5.1542, + "step": 26125 + }, + { + "epoch": 0.15537872299933392, + "grad_norm": 1.723271131515503, + "learning_rate": 4.708042567862594e-05, + "loss": 4.5029, + "step": 26126 + }, + { + "epoch": 0.1553846702826149, + "grad_norm": 1.7824338674545288, + "learning_rate": 4.708020662214987e-05, + "loss": 4.8107, + "step": 26127 + }, + { + "epoch": 0.1553906175658959, + "grad_norm": 1.6587624549865723, + "learning_rate": 4.707998755796582e-05, + "loss": 5.0076, + "step": 26128 + }, + { + "epoch": 0.15539656484917688, + "grad_norm": 1.6058495044708252, + "learning_rate": 4.7079768486073884e-05, + "loss": 4.8512, + "step": 26129 + }, + { + "epoch": 0.1554025121324579, + "grad_norm": 1.6286768913269043, + "learning_rate": 4.707954940647412e-05, + "loss": 5.0587, + "step": 26130 + }, + { + "epoch": 0.1554084594157389, + "grad_norm": 1.5808156728744507, + "learning_rate": 4.707933031916662e-05, + "loss": 5.0254, + "step": 26131 + }, + { + "epoch": 0.15541440669901987, + "grad_norm": 1.7283897399902344, + "learning_rate": 4.707911122415145e-05, + "loss": 5.1255, + "step": 26132 + }, + { + "epoch": 0.1554203539823009, + "grad_norm": 1.9916651248931885, + "learning_rate": 4.70788921214287e-05, + "loss": 4.9384, + "step": 26133 + }, + { + "epoch": 0.15542630126558188, + "grad_norm": 1.5505808591842651, + "learning_rate": 4.7078673010998425e-05, + "loss": 5.0284, + "step": 26134 + }, + { + "epoch": 0.15543224854886287, + "grad_norm": 1.8529605865478516, + "learning_rate": 4.707845389286072e-05, + "loss": 5.1745, + "step": 26135 + }, + { + "epoch": 0.15543819583214388, + "grad_norm": 1.5921772718429565, + "learning_rate": 4.707823476701565e-05, + "loss": 5.1941, + "step": 26136 + }, + { + "epoch": 0.15544414311542487, + "grad_norm": 1.676703691482544, + "learning_rate": 4.70780156334633e-05, + "loss": 4.9678, + "step": 26137 + }, + { + "epoch": 0.15545009039870586, + "grad_norm": 1.5701407194137573, + "learning_rate": 4.707779649220374e-05, + "loss": 4.8332, + "step": 26138 + }, + { + "epoch": 0.15545603768198687, + "grad_norm": 1.4418753385543823, + "learning_rate": 4.707757734323706e-05, + "loss": 4.9294, + "step": 26139 + }, + { + "epoch": 0.15546198496526786, + "grad_norm": 1.4596991539001465, + "learning_rate": 4.707735818656331e-05, + "loss": 4.874, + "step": 26140 + }, + { + "epoch": 0.15546793224854885, + "grad_norm": 1.475049376487732, + "learning_rate": 4.707713902218259e-05, + "loss": 5.0269, + "step": 26141 + }, + { + "epoch": 0.15547387953182987, + "grad_norm": 1.4616882801055908, + "learning_rate": 4.7076919850094966e-05, + "loss": 5.0152, + "step": 26142 + }, + { + "epoch": 0.15547982681511086, + "grad_norm": 1.5477145910263062, + "learning_rate": 4.707670067030052e-05, + "loss": 4.9596, + "step": 26143 + }, + { + "epoch": 0.15548577409839184, + "grad_norm": 1.6296616792678833, + "learning_rate": 4.707648148279933e-05, + "loss": 4.7555, + "step": 26144 + }, + { + "epoch": 0.15549172138167286, + "grad_norm": 2.044677257537842, + "learning_rate": 4.707626228759147e-05, + "loss": 4.2117, + "step": 26145 + }, + { + "epoch": 0.15549766866495385, + "grad_norm": 1.8100709915161133, + "learning_rate": 4.7076043084677e-05, + "loss": 4.5057, + "step": 26146 + }, + { + "epoch": 0.15550361594823484, + "grad_norm": 1.698901653289795, + "learning_rate": 4.7075823874056026e-05, + "loss": 4.6707, + "step": 26147 + }, + { + "epoch": 0.15550956323151585, + "grad_norm": 1.5637656450271606, + "learning_rate": 4.70756046557286e-05, + "loss": 4.871, + "step": 26148 + }, + { + "epoch": 0.15551551051479684, + "grad_norm": 1.5465519428253174, + "learning_rate": 4.707538542969481e-05, + "loss": 4.6844, + "step": 26149 + }, + { + "epoch": 0.15552145779807783, + "grad_norm": 1.6268285512924194, + "learning_rate": 4.7075166195954736e-05, + "loss": 5.046, + "step": 26150 + }, + { + "epoch": 0.15552740508135884, + "grad_norm": 1.6071034669876099, + "learning_rate": 4.707494695450845e-05, + "loss": 4.9576, + "step": 26151 + }, + { + "epoch": 0.15553335236463983, + "grad_norm": 1.4627524614334106, + "learning_rate": 4.707472770535603e-05, + "loss": 5.0786, + "step": 26152 + }, + { + "epoch": 0.15553929964792082, + "grad_norm": 1.7464107275009155, + "learning_rate": 4.707450844849754e-05, + "loss": 5.0383, + "step": 26153 + }, + { + "epoch": 0.15554524693120184, + "grad_norm": 1.7528932094573975, + "learning_rate": 4.7074289183933077e-05, + "loss": 4.7332, + "step": 26154 + }, + { + "epoch": 0.15555119421448282, + "grad_norm": 1.9061720371246338, + "learning_rate": 4.70740699116627e-05, + "loss": 4.5108, + "step": 26155 + }, + { + "epoch": 0.1555571414977638, + "grad_norm": 1.6121511459350586, + "learning_rate": 4.70738506316865e-05, + "loss": 4.9586, + "step": 26156 + }, + { + "epoch": 0.15556308878104483, + "grad_norm": 1.622747778892517, + "learning_rate": 4.707363134400454e-05, + "loss": 5.0985, + "step": 26157 + }, + { + "epoch": 0.15556903606432582, + "grad_norm": 1.4669454097747803, + "learning_rate": 4.707341204861691e-05, + "loss": 4.9397, + "step": 26158 + }, + { + "epoch": 0.1555749833476068, + "grad_norm": 1.4583669900894165, + "learning_rate": 4.707319274552368e-05, + "loss": 5.0822, + "step": 26159 + }, + { + "epoch": 0.15558093063088782, + "grad_norm": 1.9358830451965332, + "learning_rate": 4.707297343472492e-05, + "loss": 4.9557, + "step": 26160 + }, + { + "epoch": 0.1555868779141688, + "grad_norm": 1.7523856163024902, + "learning_rate": 4.707275411622072e-05, + "loss": 4.5959, + "step": 26161 + }, + { + "epoch": 0.1555928251974498, + "grad_norm": 1.7858316898345947, + "learning_rate": 4.707253479001114e-05, + "loss": 5.1765, + "step": 26162 + }, + { + "epoch": 0.1555987724807308, + "grad_norm": 1.7400814294815063, + "learning_rate": 4.707231545609627e-05, + "loss": 5.4312, + "step": 26163 + }, + { + "epoch": 0.1556047197640118, + "grad_norm": 1.6235188245773315, + "learning_rate": 4.7072096114476186e-05, + "loss": 5.1745, + "step": 26164 + }, + { + "epoch": 0.1556106670472928, + "grad_norm": 1.6003834009170532, + "learning_rate": 4.7071876765150963e-05, + "loss": 4.9194, + "step": 26165 + }, + { + "epoch": 0.1556166143305738, + "grad_norm": 1.7427910566329956, + "learning_rate": 4.7071657408120675e-05, + "loss": 5.1942, + "step": 26166 + }, + { + "epoch": 0.1556225616138548, + "grad_norm": 1.5763969421386719, + "learning_rate": 4.7071438043385395e-05, + "loss": 4.9424, + "step": 26167 + }, + { + "epoch": 0.15562850889713578, + "grad_norm": 1.6284310817718506, + "learning_rate": 4.7071218670945206e-05, + "loss": 5.4415, + "step": 26168 + }, + { + "epoch": 0.1556344561804168, + "grad_norm": 1.3858957290649414, + "learning_rate": 4.707099929080019e-05, + "loss": 5.6362, + "step": 26169 + }, + { + "epoch": 0.15564040346369779, + "grad_norm": 1.4326859712600708, + "learning_rate": 4.70707799029504e-05, + "loss": 5.2872, + "step": 26170 + }, + { + "epoch": 0.15564635074697877, + "grad_norm": 1.6624369621276855, + "learning_rate": 4.7070560507395944e-05, + "loss": 5.1741, + "step": 26171 + }, + { + "epoch": 0.1556522980302598, + "grad_norm": 2.4475722312927246, + "learning_rate": 4.707034110413688e-05, + "loss": 4.8206, + "step": 26172 + }, + { + "epoch": 0.15565824531354078, + "grad_norm": 2.2583391666412354, + "learning_rate": 4.707012169317329e-05, + "loss": 4.6716, + "step": 26173 + }, + { + "epoch": 0.15566419259682177, + "grad_norm": 2.161346197128296, + "learning_rate": 4.706990227450524e-05, + "loss": 4.5228, + "step": 26174 + }, + { + "epoch": 0.15567013988010278, + "grad_norm": 1.550593614578247, + "learning_rate": 4.7069682848132815e-05, + "loss": 5.1581, + "step": 26175 + }, + { + "epoch": 0.15567608716338377, + "grad_norm": 1.524939775466919, + "learning_rate": 4.70694634140561e-05, + "loss": 5.6605, + "step": 26176 + }, + { + "epoch": 0.15568203444666476, + "grad_norm": 2.134462833404541, + "learning_rate": 4.7069243972275155e-05, + "loss": 4.9063, + "step": 26177 + }, + { + "epoch": 0.15568798172994577, + "grad_norm": 2.2610831260681152, + "learning_rate": 4.7069024522790075e-05, + "loss": 4.4764, + "step": 26178 + }, + { + "epoch": 0.15569392901322676, + "grad_norm": 2.4277896881103516, + "learning_rate": 4.706880506560092e-05, + "loss": 4.7747, + "step": 26179 + }, + { + "epoch": 0.15569987629650775, + "grad_norm": 2.5465261936187744, + "learning_rate": 4.706858560070777e-05, + "loss": 4.7831, + "step": 26180 + }, + { + "epoch": 0.15570582357978877, + "grad_norm": 2.4795758724212646, + "learning_rate": 4.706836612811071e-05, + "loss": 4.6256, + "step": 26181 + }, + { + "epoch": 0.15571177086306975, + "grad_norm": 2.624998092651367, + "learning_rate": 4.7068146647809805e-05, + "loss": 4.5916, + "step": 26182 + }, + { + "epoch": 0.15571771814635074, + "grad_norm": 2.1440951824188232, + "learning_rate": 4.706792715980515e-05, + "loss": 4.5955, + "step": 26183 + }, + { + "epoch": 0.15572366542963176, + "grad_norm": 2.386084794998169, + "learning_rate": 4.70677076640968e-05, + "loss": 3.9781, + "step": 26184 + }, + { + "epoch": 0.15572961271291275, + "grad_norm": 2.271477699279785, + "learning_rate": 4.7067488160684844e-05, + "loss": 4.3557, + "step": 26185 + }, + { + "epoch": 0.15573555999619373, + "grad_norm": 2.227630853652954, + "learning_rate": 4.706726864956935e-05, + "loss": 4.117, + "step": 26186 + }, + { + "epoch": 0.15574150727947472, + "grad_norm": 2.1777312755584717, + "learning_rate": 4.7067049130750414e-05, + "loss": 4.4695, + "step": 26187 + }, + { + "epoch": 0.15574745456275574, + "grad_norm": 2.131826162338257, + "learning_rate": 4.7066829604228094e-05, + "loss": 4.185, + "step": 26188 + }, + { + "epoch": 0.15575340184603673, + "grad_norm": 1.9766490459442139, + "learning_rate": 4.706661007000246e-05, + "loss": 5.6452, + "step": 26189 + }, + { + "epoch": 0.15575934912931771, + "grad_norm": 2.088787078857422, + "learning_rate": 4.706639052807361e-05, + "loss": 4.6965, + "step": 26190 + }, + { + "epoch": 0.15576529641259873, + "grad_norm": 2.012974262237549, + "learning_rate": 4.7066170978441616e-05, + "loss": 4.4508, + "step": 26191 + }, + { + "epoch": 0.15577124369587972, + "grad_norm": 2.473616123199463, + "learning_rate": 4.706595142110654e-05, + "loss": 4.4842, + "step": 26192 + }, + { + "epoch": 0.1557771909791607, + "grad_norm": 2.5314011573791504, + "learning_rate": 4.7065731856068475e-05, + "loss": 4.5175, + "step": 26193 + }, + { + "epoch": 0.15578313826244172, + "grad_norm": 2.0637693405151367, + "learning_rate": 4.7065512283327484e-05, + "loss": 4.8803, + "step": 26194 + }, + { + "epoch": 0.1557890855457227, + "grad_norm": 2.659450054168701, + "learning_rate": 4.706529270288366e-05, + "loss": 4.7659, + "step": 26195 + }, + { + "epoch": 0.1557950328290037, + "grad_norm": 1.741438865661621, + "learning_rate": 4.706507311473707e-05, + "loss": 5.5987, + "step": 26196 + }, + { + "epoch": 0.15580098011228471, + "grad_norm": 1.621771216392517, + "learning_rate": 4.706485351888778e-05, + "loss": 5.477, + "step": 26197 + }, + { + "epoch": 0.1558069273955657, + "grad_norm": 1.8086066246032715, + "learning_rate": 4.706463391533589e-05, + "loss": 5.4196, + "step": 26198 + }, + { + "epoch": 0.1558128746788467, + "grad_norm": 1.4268287420272827, + "learning_rate": 4.706441430408145e-05, + "loss": 5.8321, + "step": 26199 + }, + { + "epoch": 0.1558188219621277, + "grad_norm": 1.5565332174301147, + "learning_rate": 4.7064194685124564e-05, + "loss": 5.5548, + "step": 26200 + }, + { + "epoch": 0.1558247692454087, + "grad_norm": 1.7371162176132202, + "learning_rate": 4.706397505846529e-05, + "loss": 5.4536, + "step": 26201 + }, + { + "epoch": 0.15583071652868968, + "grad_norm": 1.6265679597854614, + "learning_rate": 4.706375542410371e-05, + "loss": 4.7589, + "step": 26202 + }, + { + "epoch": 0.1558366638119707, + "grad_norm": 1.5395931005477905, + "learning_rate": 4.70635357820399e-05, + "loss": 5.2809, + "step": 26203 + }, + { + "epoch": 0.1558426110952517, + "grad_norm": 1.5577752590179443, + "learning_rate": 4.7063316132273937e-05, + "loss": 5.2526, + "step": 26204 + }, + { + "epoch": 0.15584855837853268, + "grad_norm": 1.3954623937606812, + "learning_rate": 4.706309647480591e-05, + "loss": 5.3674, + "step": 26205 + }, + { + "epoch": 0.1558545056618137, + "grad_norm": 1.7251001596450806, + "learning_rate": 4.706287680963587e-05, + "loss": 5.2069, + "step": 26206 + }, + { + "epoch": 0.15586045294509468, + "grad_norm": 1.8611587285995483, + "learning_rate": 4.706265713676391e-05, + "loss": 5.2805, + "step": 26207 + }, + { + "epoch": 0.15586640022837567, + "grad_norm": 1.5871427059173584, + "learning_rate": 4.706243745619011e-05, + "loss": 5.2921, + "step": 26208 + }, + { + "epoch": 0.15587234751165668, + "grad_norm": 1.6353893280029297, + "learning_rate": 4.706221776791454e-05, + "loss": 5.3425, + "step": 26209 + }, + { + "epoch": 0.15587829479493767, + "grad_norm": 1.6304540634155273, + "learning_rate": 4.7061998071937274e-05, + "loss": 5.3577, + "step": 26210 + }, + { + "epoch": 0.15588424207821866, + "grad_norm": 1.6434270143508911, + "learning_rate": 4.706177836825839e-05, + "loss": 5.4573, + "step": 26211 + }, + { + "epoch": 0.15589018936149968, + "grad_norm": 1.6281068325042725, + "learning_rate": 4.7061558656877976e-05, + "loss": 4.8948, + "step": 26212 + }, + { + "epoch": 0.15589613664478066, + "grad_norm": 1.7287936210632324, + "learning_rate": 4.70613389377961e-05, + "loss": 5.2005, + "step": 26213 + }, + { + "epoch": 0.15590208392806165, + "grad_norm": 1.8355118036270142, + "learning_rate": 4.706111921101283e-05, + "loss": 5.456, + "step": 26214 + }, + { + "epoch": 0.15590803121134267, + "grad_norm": 1.5891990661621094, + "learning_rate": 4.7060899476528253e-05, + "loss": 5.1405, + "step": 26215 + }, + { + "epoch": 0.15591397849462366, + "grad_norm": 1.5852643251419067, + "learning_rate": 4.706067973434244e-05, + "loss": 5.5963, + "step": 26216 + }, + { + "epoch": 0.15591992577790464, + "grad_norm": 2.340528726577759, + "learning_rate": 4.706045998445548e-05, + "loss": 4.6047, + "step": 26217 + }, + { + "epoch": 0.15592587306118566, + "grad_norm": 1.872802495956421, + "learning_rate": 4.706024022686744e-05, + "loss": 4.7129, + "step": 26218 + }, + { + "epoch": 0.15593182034446665, + "grad_norm": 1.6725971698760986, + "learning_rate": 4.706002046157839e-05, + "loss": 5.2416, + "step": 26219 + }, + { + "epoch": 0.15593776762774764, + "grad_norm": 1.6346997022628784, + "learning_rate": 4.705980068858843e-05, + "loss": 5.0625, + "step": 26220 + }, + { + "epoch": 0.15594371491102865, + "grad_norm": 1.8969260454177856, + "learning_rate": 4.705958090789761e-05, + "loss": 4.6915, + "step": 26221 + }, + { + "epoch": 0.15594966219430964, + "grad_norm": 1.6025121212005615, + "learning_rate": 4.705936111950602e-05, + "loss": 4.9978, + "step": 26222 + }, + { + "epoch": 0.15595560947759063, + "grad_norm": 1.406001329421997, + "learning_rate": 4.705914132341374e-05, + "loss": 5.7913, + "step": 26223 + }, + { + "epoch": 0.15596155676087164, + "grad_norm": 2.1708552837371826, + "learning_rate": 4.7058921519620834e-05, + "loss": 5.1468, + "step": 26224 + }, + { + "epoch": 0.15596750404415263, + "grad_norm": 2.216993808746338, + "learning_rate": 4.705870170812739e-05, + "loss": 5.1279, + "step": 26225 + }, + { + "epoch": 0.15597345132743362, + "grad_norm": 1.7173157930374146, + "learning_rate": 4.705848188893348e-05, + "loss": 5.1289, + "step": 26226 + }, + { + "epoch": 0.15597939861071464, + "grad_norm": 1.6096726655960083, + "learning_rate": 4.705826206203918e-05, + "loss": 5.5078, + "step": 26227 + }, + { + "epoch": 0.15598534589399562, + "grad_norm": 1.8224303722381592, + "learning_rate": 4.705804222744458e-05, + "loss": 5.4791, + "step": 26228 + }, + { + "epoch": 0.1559912931772766, + "grad_norm": 1.722948431968689, + "learning_rate": 4.705782238514973e-05, + "loss": 5.1473, + "step": 26229 + }, + { + "epoch": 0.15599724046055763, + "grad_norm": 1.7583675384521484, + "learning_rate": 4.705760253515473e-05, + "loss": 5.5127, + "step": 26230 + }, + { + "epoch": 0.15600318774383862, + "grad_norm": 1.5635607242584229, + "learning_rate": 4.705738267745965e-05, + "loss": 5.417, + "step": 26231 + }, + { + "epoch": 0.1560091350271196, + "grad_norm": 1.570145606994629, + "learning_rate": 4.705716281206456e-05, + "loss": 5.266, + "step": 26232 + }, + { + "epoch": 0.15601508231040062, + "grad_norm": 1.6425197124481201, + "learning_rate": 4.705694293896955e-05, + "loss": 4.7162, + "step": 26233 + }, + { + "epoch": 0.1560210295936816, + "grad_norm": 1.6312974691390991, + "learning_rate": 4.705672305817468e-05, + "loss": 4.8861, + "step": 26234 + }, + { + "epoch": 0.1560269768769626, + "grad_norm": 1.6320679187774658, + "learning_rate": 4.7056503169680046e-05, + "loss": 5.2133, + "step": 26235 + }, + { + "epoch": 0.1560329241602436, + "grad_norm": 1.6294546127319336, + "learning_rate": 4.705628327348571e-05, + "loss": 5.7012, + "step": 26236 + }, + { + "epoch": 0.1560388714435246, + "grad_norm": 1.472088098526001, + "learning_rate": 4.705606336959175e-05, + "loss": 5.404, + "step": 26237 + }, + { + "epoch": 0.1560448187268056, + "grad_norm": 1.5214602947235107, + "learning_rate": 4.705584345799825e-05, + "loss": 5.3916, + "step": 26238 + }, + { + "epoch": 0.1560507660100866, + "grad_norm": 1.45046067237854, + "learning_rate": 4.705562353870528e-05, + "loss": 5.2275, + "step": 26239 + }, + { + "epoch": 0.1560567132933676, + "grad_norm": 1.5730977058410645, + "learning_rate": 4.705540361171292e-05, + "loss": 5.4597, + "step": 26240 + }, + { + "epoch": 0.15606266057664858, + "grad_norm": 1.6403652429580688, + "learning_rate": 4.7055183677021254e-05, + "loss": 4.7476, + "step": 26241 + }, + { + "epoch": 0.1560686078599296, + "grad_norm": 2.0256097316741943, + "learning_rate": 4.705496373463034e-05, + "loss": 4.7275, + "step": 26242 + }, + { + "epoch": 0.15607455514321059, + "grad_norm": 2.1107068061828613, + "learning_rate": 4.7054743784540265e-05, + "loss": 4.7459, + "step": 26243 + }, + { + "epoch": 0.15608050242649157, + "grad_norm": 1.4644510746002197, + "learning_rate": 4.705452382675112e-05, + "loss": 5.3951, + "step": 26244 + }, + { + "epoch": 0.15608644970977256, + "grad_norm": 1.4154125452041626, + "learning_rate": 4.705430386126296e-05, + "loss": 5.5351, + "step": 26245 + }, + { + "epoch": 0.15609239699305358, + "grad_norm": 1.4124795198440552, + "learning_rate": 4.7054083888075875e-05, + "loss": 5.3797, + "step": 26246 + }, + { + "epoch": 0.15609834427633457, + "grad_norm": 1.6197364330291748, + "learning_rate": 4.705386390718993e-05, + "loss": 5.3903, + "step": 26247 + }, + { + "epoch": 0.15610429155961555, + "grad_norm": 1.5693352222442627, + "learning_rate": 4.7053643918605216e-05, + "loss": 5.4997, + "step": 26248 + }, + { + "epoch": 0.15611023884289657, + "grad_norm": 1.4047479629516602, + "learning_rate": 4.70534239223218e-05, + "loss": 5.0258, + "step": 26249 + }, + { + "epoch": 0.15611618612617756, + "grad_norm": 1.7006193399429321, + "learning_rate": 4.705320391833976e-05, + "loss": 4.9798, + "step": 26250 + }, + { + "epoch": 0.15612213340945855, + "grad_norm": 1.7294094562530518, + "learning_rate": 4.705298390665917e-05, + "loss": 5.5811, + "step": 26251 + }, + { + "epoch": 0.15612808069273956, + "grad_norm": 1.4665381908416748, + "learning_rate": 4.705276388728013e-05, + "loss": 5.5117, + "step": 26252 + }, + { + "epoch": 0.15613402797602055, + "grad_norm": 1.4549496173858643, + "learning_rate": 4.705254386020268e-05, + "loss": 5.6141, + "step": 26253 + }, + { + "epoch": 0.15613997525930154, + "grad_norm": 1.4019516706466675, + "learning_rate": 4.705232382542691e-05, + "loss": 5.6525, + "step": 26254 + }, + { + "epoch": 0.15614592254258255, + "grad_norm": 1.3660154342651367, + "learning_rate": 4.705210378295292e-05, + "loss": 5.4377, + "step": 26255 + }, + { + "epoch": 0.15615186982586354, + "grad_norm": 1.5590531826019287, + "learning_rate": 4.7051883732780755e-05, + "loss": 5.5679, + "step": 26256 + }, + { + "epoch": 0.15615781710914453, + "grad_norm": 2.126138687133789, + "learning_rate": 4.7051663674910514e-05, + "loss": 4.8662, + "step": 26257 + }, + { + "epoch": 0.15616376439242555, + "grad_norm": 1.5536115169525146, + "learning_rate": 4.705144360934226e-05, + "loss": 4.97, + "step": 26258 + }, + { + "epoch": 0.15616971167570654, + "grad_norm": 2.0653862953186035, + "learning_rate": 4.705122353607607e-05, + "loss": 4.8683, + "step": 26259 + }, + { + "epoch": 0.15617565895898752, + "grad_norm": 1.872904658317566, + "learning_rate": 4.705100345511204e-05, + "loss": 4.8923, + "step": 26260 + }, + { + "epoch": 0.15618160624226854, + "grad_norm": 2.112368583679199, + "learning_rate": 4.7050783366450224e-05, + "loss": 4.7857, + "step": 26261 + }, + { + "epoch": 0.15618755352554953, + "grad_norm": 1.4000160694122314, + "learning_rate": 4.7050563270090704e-05, + "loss": 5.2055, + "step": 26262 + }, + { + "epoch": 0.15619350080883052, + "grad_norm": 1.4316319227218628, + "learning_rate": 4.705034316603356e-05, + "loss": 5.5257, + "step": 26263 + }, + { + "epoch": 0.15619944809211153, + "grad_norm": 1.4394290447235107, + "learning_rate": 4.705012305427887e-05, + "loss": 5.2702, + "step": 26264 + }, + { + "epoch": 0.15620539537539252, + "grad_norm": 2.0612921714782715, + "learning_rate": 4.704990293482672e-05, + "loss": 4.964, + "step": 26265 + }, + { + "epoch": 0.1562113426586735, + "grad_norm": 1.7573301792144775, + "learning_rate": 4.704968280767716e-05, + "loss": 5.1509, + "step": 26266 + }, + { + "epoch": 0.15621728994195452, + "grad_norm": 1.546891450881958, + "learning_rate": 4.70494626728303e-05, + "loss": 5.3226, + "step": 26267 + }, + { + "epoch": 0.1562232372252355, + "grad_norm": 1.672478437423706, + "learning_rate": 4.7049242530286195e-05, + "loss": 4.998, + "step": 26268 + }, + { + "epoch": 0.1562291845085165, + "grad_norm": 1.943877100944519, + "learning_rate": 4.704902238004492e-05, + "loss": 4.6489, + "step": 26269 + }, + { + "epoch": 0.15623513179179752, + "grad_norm": 2.779040813446045, + "learning_rate": 4.704880222210657e-05, + "loss": 3.8466, + "step": 26270 + }, + { + "epoch": 0.1562410790750785, + "grad_norm": 2.8241045475006104, + "learning_rate": 4.7048582056471205e-05, + "loss": 4.026, + "step": 26271 + }, + { + "epoch": 0.1562470263583595, + "grad_norm": 1.6769524812698364, + "learning_rate": 4.70483618831389e-05, + "loss": 4.6255, + "step": 26272 + }, + { + "epoch": 0.1562529736416405, + "grad_norm": 1.4940049648284912, + "learning_rate": 4.704814170210975e-05, + "loss": 4.7496, + "step": 26273 + }, + { + "epoch": 0.1562589209249215, + "grad_norm": 1.6519593000411987, + "learning_rate": 4.704792151338382e-05, + "loss": 4.7485, + "step": 26274 + }, + { + "epoch": 0.15626486820820248, + "grad_norm": 2.30234956741333, + "learning_rate": 4.704770131696119e-05, + "loss": 4.6089, + "step": 26275 + }, + { + "epoch": 0.1562708154914835, + "grad_norm": 1.6795179843902588, + "learning_rate": 4.704748111284193e-05, + "loss": 5.2412, + "step": 26276 + }, + { + "epoch": 0.1562767627747645, + "grad_norm": 2.194812536239624, + "learning_rate": 4.7047260901026124e-05, + "loss": 5.156, + "step": 26277 + }, + { + "epoch": 0.15628271005804548, + "grad_norm": 2.5557010173797607, + "learning_rate": 4.704704068151385e-05, + "loss": 4.5438, + "step": 26278 + }, + { + "epoch": 0.1562886573413265, + "grad_norm": 1.95830237865448, + "learning_rate": 4.704682045430518e-05, + "loss": 4.6183, + "step": 26279 + }, + { + "epoch": 0.15629460462460748, + "grad_norm": 2.1255557537078857, + "learning_rate": 4.704660021940019e-05, + "loss": 4.5619, + "step": 26280 + }, + { + "epoch": 0.15630055190788847, + "grad_norm": 1.6092948913574219, + "learning_rate": 4.704637997679896e-05, + "loss": 5.64, + "step": 26281 + }, + { + "epoch": 0.15630649919116948, + "grad_norm": 2.1546456813812256, + "learning_rate": 4.704615972650157e-05, + "loss": 4.9573, + "step": 26282 + }, + { + "epoch": 0.15631244647445047, + "grad_norm": 2.154639959335327, + "learning_rate": 4.7045939468508095e-05, + "loss": 4.4704, + "step": 26283 + }, + { + "epoch": 0.15631839375773146, + "grad_norm": 1.819509744644165, + "learning_rate": 4.7045719202818605e-05, + "loss": 4.6245, + "step": 26284 + }, + { + "epoch": 0.15632434104101248, + "grad_norm": 2.337667942047119, + "learning_rate": 4.704549892943318e-05, + "loss": 4.4268, + "step": 26285 + }, + { + "epoch": 0.15633028832429346, + "grad_norm": 2.308842658996582, + "learning_rate": 4.704527864835191e-05, + "loss": 4.7084, + "step": 26286 + }, + { + "epoch": 0.15633623560757445, + "grad_norm": 1.664182424545288, + "learning_rate": 4.704505835957486e-05, + "loss": 5.2576, + "step": 26287 + }, + { + "epoch": 0.15634218289085547, + "grad_norm": 1.7331715822219849, + "learning_rate": 4.7044838063102096e-05, + "loss": 5.3069, + "step": 26288 + }, + { + "epoch": 0.15634813017413646, + "grad_norm": 1.4833427667617798, + "learning_rate": 4.7044617758933714e-05, + "loss": 4.8484, + "step": 26289 + }, + { + "epoch": 0.15635407745741745, + "grad_norm": 2.975609064102173, + "learning_rate": 4.704439744706978e-05, + "loss": 5.5747, + "step": 26290 + }, + { + "epoch": 0.15636002474069846, + "grad_norm": 1.8256950378417969, + "learning_rate": 4.704417712751038e-05, + "loss": 5.2464, + "step": 26291 + }, + { + "epoch": 0.15636597202397945, + "grad_norm": 1.5019065141677856, + "learning_rate": 4.7043956800255585e-05, + "loss": 5.5261, + "step": 26292 + }, + { + "epoch": 0.15637191930726044, + "grad_norm": 1.4906537532806396, + "learning_rate": 4.7043736465305464e-05, + "loss": 5.38, + "step": 26293 + }, + { + "epoch": 0.15637786659054145, + "grad_norm": 1.601969599723816, + "learning_rate": 4.704351612266012e-05, + "loss": 5.2111, + "step": 26294 + }, + { + "epoch": 0.15638381387382244, + "grad_norm": 1.5806862115859985, + "learning_rate": 4.70432957723196e-05, + "loss": 5.5473, + "step": 26295 + }, + { + "epoch": 0.15638976115710343, + "grad_norm": 1.5971914529800415, + "learning_rate": 4.7043075414283986e-05, + "loss": 5.4841, + "step": 26296 + }, + { + "epoch": 0.15639570844038445, + "grad_norm": 1.6458126306533813, + "learning_rate": 4.704285504855337e-05, + "loss": 5.3215, + "step": 26297 + }, + { + "epoch": 0.15640165572366543, + "grad_norm": 1.5553637742996216, + "learning_rate": 4.704263467512782e-05, + "loss": 5.4461, + "step": 26298 + }, + { + "epoch": 0.15640760300694642, + "grad_norm": 1.447519063949585, + "learning_rate": 4.704241429400742e-05, + "loss": 5.3617, + "step": 26299 + }, + { + "epoch": 0.15641355029022744, + "grad_norm": 1.5533196926116943, + "learning_rate": 4.704219390519223e-05, + "loss": 4.8446, + "step": 26300 + }, + { + "epoch": 0.15641949757350843, + "grad_norm": 1.5320333242416382, + "learning_rate": 4.7041973508682344e-05, + "loss": 5.3333, + "step": 26301 + }, + { + "epoch": 0.15642544485678941, + "grad_norm": 1.6192045211791992, + "learning_rate": 4.704175310447784e-05, + "loss": 5.221, + "step": 26302 + }, + { + "epoch": 0.1564313921400704, + "grad_norm": 1.4964373111724854, + "learning_rate": 4.704153269257878e-05, + "loss": 5.3061, + "step": 26303 + }, + { + "epoch": 0.15643733942335142, + "grad_norm": 1.6173138618469238, + "learning_rate": 4.704131227298525e-05, + "loss": 5.3485, + "step": 26304 + }, + { + "epoch": 0.1564432867066324, + "grad_norm": 1.511825680732727, + "learning_rate": 4.704109184569733e-05, + "loss": 5.2024, + "step": 26305 + }, + { + "epoch": 0.1564492339899134, + "grad_norm": 1.5368350744247437, + "learning_rate": 4.704087141071508e-05, + "loss": 5.3867, + "step": 26306 + }, + { + "epoch": 0.1564551812731944, + "grad_norm": 1.612384557723999, + "learning_rate": 4.7040650968038605e-05, + "loss": 5.1923, + "step": 26307 + }, + { + "epoch": 0.1564611285564754, + "grad_norm": 1.5889664888381958, + "learning_rate": 4.704043051766795e-05, + "loss": 5.0457, + "step": 26308 + }, + { + "epoch": 0.1564670758397564, + "grad_norm": 1.5363719463348389, + "learning_rate": 4.704021005960322e-05, + "loss": 5.3852, + "step": 26309 + }, + { + "epoch": 0.1564730231230374, + "grad_norm": 1.5099613666534424, + "learning_rate": 4.703998959384447e-05, + "loss": 5.8659, + "step": 26310 + }, + { + "epoch": 0.1564789704063184, + "grad_norm": 1.5517312288284302, + "learning_rate": 4.70397691203918e-05, + "loss": 6.0298, + "step": 26311 + }, + { + "epoch": 0.15648491768959938, + "grad_norm": 1.616828441619873, + "learning_rate": 4.703954863924527e-05, + "loss": 4.8686, + "step": 26312 + }, + { + "epoch": 0.1564908649728804, + "grad_norm": 1.4939557313919067, + "learning_rate": 4.703932815040496e-05, + "loss": 5.3872, + "step": 26313 + }, + { + "epoch": 0.15649681225616138, + "grad_norm": 1.444994568824768, + "learning_rate": 4.7039107653870954e-05, + "loss": 5.38, + "step": 26314 + }, + { + "epoch": 0.15650275953944237, + "grad_norm": 1.7697070837020874, + "learning_rate": 4.7038887149643304e-05, + "loss": 5.6994, + "step": 26315 + }, + { + "epoch": 0.1565087068227234, + "grad_norm": 1.628763198852539, + "learning_rate": 4.703866663772213e-05, + "loss": 5.5986, + "step": 26316 + }, + { + "epoch": 0.15651465410600438, + "grad_norm": 1.5433357954025269, + "learning_rate": 4.703844611810747e-05, + "loss": 5.5968, + "step": 26317 + }, + { + "epoch": 0.15652060138928536, + "grad_norm": 1.452527403831482, + "learning_rate": 4.7038225590799424e-05, + "loss": 5.5669, + "step": 26318 + }, + { + "epoch": 0.15652654867256638, + "grad_norm": 1.6079583168029785, + "learning_rate": 4.703800505579806e-05, + "loss": 5.2624, + "step": 26319 + }, + { + "epoch": 0.15653249595584737, + "grad_norm": 1.4639090299606323, + "learning_rate": 4.703778451310345e-05, + "loss": 5.4219, + "step": 26320 + }, + { + "epoch": 0.15653844323912836, + "grad_norm": 1.7064789533615112, + "learning_rate": 4.703756396271568e-05, + "loss": 5.055, + "step": 26321 + }, + { + "epoch": 0.15654439052240937, + "grad_norm": 1.596901297569275, + "learning_rate": 4.7037343404634824e-05, + "loss": 6.4061, + "step": 26322 + }, + { + "epoch": 0.15655033780569036, + "grad_norm": 1.4072599411010742, + "learning_rate": 4.703712283886097e-05, + "loss": 5.4348, + "step": 26323 + }, + { + "epoch": 0.15655628508897135, + "grad_norm": 1.4027669429779053, + "learning_rate": 4.703690226539417e-05, + "loss": 5.285, + "step": 26324 + }, + { + "epoch": 0.15656223237225236, + "grad_norm": 1.3492887020111084, + "learning_rate": 4.703668168423452e-05, + "loss": 5.2334, + "step": 26325 + }, + { + "epoch": 0.15656817965553335, + "grad_norm": 1.5650583505630493, + "learning_rate": 4.703646109538209e-05, + "loss": 5.3706, + "step": 26326 + }, + { + "epoch": 0.15657412693881434, + "grad_norm": 1.549395203590393, + "learning_rate": 4.703624049883696e-05, + "loss": 5.3483, + "step": 26327 + }, + { + "epoch": 0.15658007422209536, + "grad_norm": 1.5657979249954224, + "learning_rate": 4.70360198945992e-05, + "loss": 5.2897, + "step": 26328 + }, + { + "epoch": 0.15658602150537634, + "grad_norm": 1.3859858512878418, + "learning_rate": 4.7035799282668906e-05, + "loss": 5.3292, + "step": 26329 + }, + { + "epoch": 0.15659196878865733, + "grad_norm": 1.8330230712890625, + "learning_rate": 4.7035578663046136e-05, + "loss": 5.6592, + "step": 26330 + }, + { + "epoch": 0.15659791607193835, + "grad_norm": 1.6347804069519043, + "learning_rate": 4.703535803573097e-05, + "loss": 5.5734, + "step": 26331 + }, + { + "epoch": 0.15660386335521934, + "grad_norm": 1.615646481513977, + "learning_rate": 4.7035137400723496e-05, + "loss": 5.8483, + "step": 26332 + }, + { + "epoch": 0.15660981063850032, + "grad_norm": 1.7376673221588135, + "learning_rate": 4.703491675802378e-05, + "loss": 5.327, + "step": 26333 + }, + { + "epoch": 0.15661575792178134, + "grad_norm": 2.2167186737060547, + "learning_rate": 4.70346961076319e-05, + "loss": 4.6295, + "step": 26334 + }, + { + "epoch": 0.15662170520506233, + "grad_norm": 1.8190215826034546, + "learning_rate": 4.703447544954794e-05, + "loss": 4.6977, + "step": 26335 + }, + { + "epoch": 0.15662765248834332, + "grad_norm": 1.8056445121765137, + "learning_rate": 4.703425478377197e-05, + "loss": 4.7828, + "step": 26336 + }, + { + "epoch": 0.15663359977162433, + "grad_norm": 1.3003071546554565, + "learning_rate": 4.7034034110304056e-05, + "loss": 5.3244, + "step": 26337 + }, + { + "epoch": 0.15663954705490532, + "grad_norm": 1.5494154691696167, + "learning_rate": 4.703381342914431e-05, + "loss": 5.2614, + "step": 26338 + }, + { + "epoch": 0.1566454943381863, + "grad_norm": 1.4443477392196655, + "learning_rate": 4.703359274029278e-05, + "loss": 5.6987, + "step": 26339 + }, + { + "epoch": 0.15665144162146732, + "grad_norm": 1.6877416372299194, + "learning_rate": 4.703337204374955e-05, + "loss": 5.0908, + "step": 26340 + }, + { + "epoch": 0.1566573889047483, + "grad_norm": 1.7778805494308472, + "learning_rate": 4.703315133951469e-05, + "loss": 5.067, + "step": 26341 + }, + { + "epoch": 0.1566633361880293, + "grad_norm": 1.8032246828079224, + "learning_rate": 4.703293062758829e-05, + "loss": 5.2325, + "step": 26342 + }, + { + "epoch": 0.15666928347131032, + "grad_norm": 1.6244032382965088, + "learning_rate": 4.703270990797042e-05, + "loss": 4.7988, + "step": 26343 + }, + { + "epoch": 0.1566752307545913, + "grad_norm": 2.212272882461548, + "learning_rate": 4.7032489180661154e-05, + "loss": 4.6136, + "step": 26344 + }, + { + "epoch": 0.1566811780378723, + "grad_norm": 1.4413294792175293, + "learning_rate": 4.703226844566059e-05, + "loss": 5.1378, + "step": 26345 + }, + { + "epoch": 0.1566871253211533, + "grad_norm": 1.7251073122024536, + "learning_rate": 4.703204770296877e-05, + "loss": 4.8629, + "step": 26346 + }, + { + "epoch": 0.1566930726044343, + "grad_norm": 1.8171210289001465, + "learning_rate": 4.70318269525858e-05, + "loss": 4.8487, + "step": 26347 + }, + { + "epoch": 0.15669901988771529, + "grad_norm": 1.7784240245819092, + "learning_rate": 4.703160619451175e-05, + "loss": 5.3187, + "step": 26348 + }, + { + "epoch": 0.1567049671709963, + "grad_norm": 1.7092580795288086, + "learning_rate": 4.703138542874669e-05, + "loss": 5.0771, + "step": 26349 + }, + { + "epoch": 0.1567109144542773, + "grad_norm": 1.4181660413742065, + "learning_rate": 4.7031164655290695e-05, + "loss": 5.3487, + "step": 26350 + }, + { + "epoch": 0.15671686173755828, + "grad_norm": 1.6292651891708374, + "learning_rate": 4.703094387414385e-05, + "loss": 5.2079, + "step": 26351 + }, + { + "epoch": 0.1567228090208393, + "grad_norm": 1.5617179870605469, + "learning_rate": 4.703072308530624e-05, + "loss": 5.3438, + "step": 26352 + }, + { + "epoch": 0.15672875630412028, + "grad_norm": 1.8505250215530396, + "learning_rate": 4.703050228877792e-05, + "loss": 5.223, + "step": 26353 + }, + { + "epoch": 0.15673470358740127, + "grad_norm": 1.2503677606582642, + "learning_rate": 4.7030281484558984e-05, + "loss": 4.7168, + "step": 26354 + }, + { + "epoch": 0.15674065087068229, + "grad_norm": 1.4453564882278442, + "learning_rate": 4.70300606726495e-05, + "loss": 5.3493, + "step": 26355 + }, + { + "epoch": 0.15674659815396327, + "grad_norm": 1.305949091911316, + "learning_rate": 4.702983985304956e-05, + "loss": 5.0599, + "step": 26356 + }, + { + "epoch": 0.15675254543724426, + "grad_norm": 2.160369634628296, + "learning_rate": 4.702961902575923e-05, + "loss": 4.2452, + "step": 26357 + }, + { + "epoch": 0.15675849272052528, + "grad_norm": 4.334263324737549, + "learning_rate": 4.7029398190778574e-05, + "loss": 2.7403, + "step": 26358 + }, + { + "epoch": 0.15676444000380627, + "grad_norm": 2.7898688316345215, + "learning_rate": 4.702917734810769e-05, + "loss": 2.7024, + "step": 26359 + }, + { + "epoch": 0.15677038728708725, + "grad_norm": 2.939950466156006, + "learning_rate": 4.702895649774665e-05, + "loss": 2.5659, + "step": 26360 + }, + { + "epoch": 0.15677633457036824, + "grad_norm": 2.2159571647644043, + "learning_rate": 4.702873563969553e-05, + "loss": 4.2729, + "step": 26361 + }, + { + "epoch": 0.15678228185364926, + "grad_norm": 1.4781655073165894, + "learning_rate": 4.7028514773954404e-05, + "loss": 4.7654, + "step": 26362 + }, + { + "epoch": 0.15678822913693025, + "grad_norm": 3.3153202533721924, + "learning_rate": 4.702829390052335e-05, + "loss": 4.055, + "step": 26363 + }, + { + "epoch": 0.15679417642021123, + "grad_norm": 4.366955757141113, + "learning_rate": 4.7028073019402446e-05, + "loss": 2.463, + "step": 26364 + }, + { + "epoch": 0.15680012370349225, + "grad_norm": 3.7748520374298096, + "learning_rate": 4.702785213059177e-05, + "loss": 2.8617, + "step": 26365 + }, + { + "epoch": 0.15680607098677324, + "grad_norm": 3.252652645111084, + "learning_rate": 4.7027631234091394e-05, + "loss": 2.8654, + "step": 26366 + }, + { + "epoch": 0.15681201827005423, + "grad_norm": 3.4591829776763916, + "learning_rate": 4.7027410329901414e-05, + "loss": 3.3268, + "step": 26367 + }, + { + "epoch": 0.15681796555333524, + "grad_norm": 2.971773624420166, + "learning_rate": 4.702718941802188e-05, + "loss": 2.835, + "step": 26368 + }, + { + "epoch": 0.15682391283661623, + "grad_norm": 2.8094983100891113, + "learning_rate": 4.7026968498452884e-05, + "loss": 3.5431, + "step": 26369 + }, + { + "epoch": 0.15682986011989722, + "grad_norm": 3.014570474624634, + "learning_rate": 4.7026747571194496e-05, + "loss": 3.2034, + "step": 26370 + }, + { + "epoch": 0.15683580740317823, + "grad_norm": 3.1913933753967285, + "learning_rate": 4.7026526636246805e-05, + "loss": 2.944, + "step": 26371 + }, + { + "epoch": 0.15684175468645922, + "grad_norm": 3.0981903076171875, + "learning_rate": 4.7026305693609884e-05, + "loss": 3.1399, + "step": 26372 + }, + { + "epoch": 0.1568477019697402, + "grad_norm": 2.7449357509613037, + "learning_rate": 4.70260847432838e-05, + "loss": 2.9713, + "step": 26373 + }, + { + "epoch": 0.15685364925302123, + "grad_norm": 2.5030126571655273, + "learning_rate": 4.7025863785268645e-05, + "loss": 4.1367, + "step": 26374 + }, + { + "epoch": 0.15685959653630221, + "grad_norm": 1.7585763931274414, + "learning_rate": 4.7025642819564476e-05, + "loss": 5.4266, + "step": 26375 + }, + { + "epoch": 0.1568655438195832, + "grad_norm": 1.6513370275497437, + "learning_rate": 4.702542184617139e-05, + "loss": 5.4329, + "step": 26376 + }, + { + "epoch": 0.15687149110286422, + "grad_norm": 1.381144404411316, + "learning_rate": 4.702520086508946e-05, + "loss": 5.2046, + "step": 26377 + }, + { + "epoch": 0.1568774383861452, + "grad_norm": 1.9510244131088257, + "learning_rate": 4.702497987631875e-05, + "loss": 5.365, + "step": 26378 + }, + { + "epoch": 0.1568833856694262, + "grad_norm": 2.6427478790283203, + "learning_rate": 4.702475887985936e-05, + "loss": 4.8551, + "step": 26379 + }, + { + "epoch": 0.1568893329527072, + "grad_norm": 1.9253584146499634, + "learning_rate": 4.702453787571135e-05, + "loss": 4.7738, + "step": 26380 + }, + { + "epoch": 0.1568952802359882, + "grad_norm": 1.9647809267044067, + "learning_rate": 4.7024316863874795e-05, + "loss": 5.0153, + "step": 26381 + }, + { + "epoch": 0.1569012275192692, + "grad_norm": 1.7858566045761108, + "learning_rate": 4.7024095844349786e-05, + "loss": 5.4806, + "step": 26382 + }, + { + "epoch": 0.1569071748025502, + "grad_norm": 1.5491056442260742, + "learning_rate": 4.7023874817136395e-05, + "loss": 5.1898, + "step": 26383 + }, + { + "epoch": 0.1569131220858312, + "grad_norm": 1.4932126998901367, + "learning_rate": 4.702365378223469e-05, + "loss": 5.3636, + "step": 26384 + }, + { + "epoch": 0.15691906936911218, + "grad_norm": 1.5436698198318481, + "learning_rate": 4.702343273964475e-05, + "loss": 5.2469, + "step": 26385 + }, + { + "epoch": 0.1569250166523932, + "grad_norm": 1.9735430479049683, + "learning_rate": 4.7023211689366666e-05, + "loss": 5.111, + "step": 26386 + }, + { + "epoch": 0.15693096393567418, + "grad_norm": 1.4643042087554932, + "learning_rate": 4.70229906314005e-05, + "loss": 4.9215, + "step": 26387 + }, + { + "epoch": 0.15693691121895517, + "grad_norm": 2.3229660987854004, + "learning_rate": 4.7022769565746345e-05, + "loss": 4.7726, + "step": 26388 + }, + { + "epoch": 0.1569428585022362, + "grad_norm": 4.978843688964844, + "learning_rate": 4.7022548492404264e-05, + "loss": 4.1208, + "step": 26389 + }, + { + "epoch": 0.15694880578551718, + "grad_norm": 4.040123462677002, + "learning_rate": 4.702232741137434e-05, + "loss": 4.6272, + "step": 26390 + }, + { + "epoch": 0.15695475306879816, + "grad_norm": 1.6977242231369019, + "learning_rate": 4.7022106322656643e-05, + "loss": 5.0605, + "step": 26391 + }, + { + "epoch": 0.15696070035207918, + "grad_norm": 2.055257558822632, + "learning_rate": 4.702188522625126e-05, + "loss": 4.9685, + "step": 26392 + }, + { + "epoch": 0.15696664763536017, + "grad_norm": 1.5921961069107056, + "learning_rate": 4.7021664122158264e-05, + "loss": 5.1433, + "step": 26393 + }, + { + "epoch": 0.15697259491864116, + "grad_norm": 1.5311743021011353, + "learning_rate": 4.7021443010377734e-05, + "loss": 5.2865, + "step": 26394 + }, + { + "epoch": 0.15697854220192217, + "grad_norm": 1.4683947563171387, + "learning_rate": 4.702122189090975e-05, + "loss": 5.2697, + "step": 26395 + }, + { + "epoch": 0.15698448948520316, + "grad_norm": 1.5425411462783813, + "learning_rate": 4.702100076375438e-05, + "loss": 5.5033, + "step": 26396 + }, + { + "epoch": 0.15699043676848415, + "grad_norm": 1.8671424388885498, + "learning_rate": 4.70207796289117e-05, + "loss": 4.544, + "step": 26397 + }, + { + "epoch": 0.15699638405176516, + "grad_norm": 2.107107400894165, + "learning_rate": 4.70205584863818e-05, + "loss": 4.2386, + "step": 26398 + }, + { + "epoch": 0.15700233133504615, + "grad_norm": 1.6025463342666626, + "learning_rate": 4.7020337336164746e-05, + "loss": 5.742, + "step": 26399 + }, + { + "epoch": 0.15700827861832714, + "grad_norm": 1.4157508611679077, + "learning_rate": 4.702011617826063e-05, + "loss": 6.2568, + "step": 26400 + }, + { + "epoch": 0.15701422590160816, + "grad_norm": 1.4367010593414307, + "learning_rate": 4.701989501266951e-05, + "loss": 6.0992, + "step": 26401 + }, + { + "epoch": 0.15702017318488914, + "grad_norm": 1.7271238565444946, + "learning_rate": 4.7019673839391476e-05, + "loss": 4.9925, + "step": 26402 + }, + { + "epoch": 0.15702612046817013, + "grad_norm": 1.4689936637878418, + "learning_rate": 4.70194526584266e-05, + "loss": 5.1224, + "step": 26403 + }, + { + "epoch": 0.15703206775145115, + "grad_norm": 1.816994071006775, + "learning_rate": 4.701923146977496e-05, + "loss": 4.5333, + "step": 26404 + }, + { + "epoch": 0.15703801503473214, + "grad_norm": 1.6789166927337646, + "learning_rate": 4.7019010273436634e-05, + "loss": 4.9303, + "step": 26405 + }, + { + "epoch": 0.15704396231801313, + "grad_norm": 1.8921838998794556, + "learning_rate": 4.70187890694117e-05, + "loss": 4.3924, + "step": 26406 + }, + { + "epoch": 0.15704990960129414, + "grad_norm": 2.397531270980835, + "learning_rate": 4.701856785770024e-05, + "loss": 3.317, + "step": 26407 + }, + { + "epoch": 0.15705585688457513, + "grad_norm": 2.1896491050720215, + "learning_rate": 4.7018346638302314e-05, + "loss": 4.2621, + "step": 26408 + }, + { + "epoch": 0.15706180416785612, + "grad_norm": 1.5073274374008179, + "learning_rate": 4.7018125411218014e-05, + "loss": 5.238, + "step": 26409 + }, + { + "epoch": 0.15706775145113713, + "grad_norm": 1.672512173652649, + "learning_rate": 4.701790417644741e-05, + "loss": 5.0822, + "step": 26410 + }, + { + "epoch": 0.15707369873441812, + "grad_norm": 1.6251648664474487, + "learning_rate": 4.701768293399059e-05, + "loss": 5.3444, + "step": 26411 + }, + { + "epoch": 0.1570796460176991, + "grad_norm": 1.8805150985717773, + "learning_rate": 4.701746168384763e-05, + "loss": 4.8765, + "step": 26412 + }, + { + "epoch": 0.15708559330098013, + "grad_norm": 1.7325724363327026, + "learning_rate": 4.701724042601859e-05, + "loss": 5.3281, + "step": 26413 + }, + { + "epoch": 0.1570915405842611, + "grad_norm": 1.5105476379394531, + "learning_rate": 4.701701916050357e-05, + "loss": 5.2577, + "step": 26414 + }, + { + "epoch": 0.1570974878675421, + "grad_norm": 1.766034722328186, + "learning_rate": 4.701679788730263e-05, + "loss": 4.8186, + "step": 26415 + }, + { + "epoch": 0.15710343515082312, + "grad_norm": 1.5909993648529053, + "learning_rate": 4.701657660641585e-05, + "loss": 4.9077, + "step": 26416 + }, + { + "epoch": 0.1571093824341041, + "grad_norm": 1.663878083229065, + "learning_rate": 4.7016355317843316e-05, + "loss": 5.3196, + "step": 26417 + }, + { + "epoch": 0.1571153297173851, + "grad_norm": 1.8101507425308228, + "learning_rate": 4.7016134021585095e-05, + "loss": 4.7219, + "step": 26418 + }, + { + "epoch": 0.15712127700066608, + "grad_norm": 1.3929054737091064, + "learning_rate": 4.7015912717641276e-05, + "loss": 5.169, + "step": 26419 + }, + { + "epoch": 0.1571272242839471, + "grad_norm": 1.6896204948425293, + "learning_rate": 4.701569140601192e-05, + "loss": 4.9141, + "step": 26420 + }, + { + "epoch": 0.15713317156722809, + "grad_norm": 2.3035976886749268, + "learning_rate": 4.7015470086697124e-05, + "loss": 4.4289, + "step": 26421 + }, + { + "epoch": 0.15713911885050907, + "grad_norm": 1.8286256790161133, + "learning_rate": 4.701524875969695e-05, + "loss": 4.7177, + "step": 26422 + }, + { + "epoch": 0.1571450661337901, + "grad_norm": 1.7254390716552734, + "learning_rate": 4.701502742501147e-05, + "loss": 3.99, + "step": 26423 + }, + { + "epoch": 0.15715101341707108, + "grad_norm": 1.6733616590499878, + "learning_rate": 4.701480608264078e-05, + "loss": 5.4146, + "step": 26424 + }, + { + "epoch": 0.15715696070035207, + "grad_norm": 2.167525291442871, + "learning_rate": 4.701458473258496e-05, + "loss": 5.751, + "step": 26425 + }, + { + "epoch": 0.15716290798363308, + "grad_norm": 1.5784038305282593, + "learning_rate": 4.7014363374844064e-05, + "loss": 5.2341, + "step": 26426 + }, + { + "epoch": 0.15716885526691407, + "grad_norm": 1.6087944507598877, + "learning_rate": 4.7014142009418176e-05, + "loss": 4.6644, + "step": 26427 + }, + { + "epoch": 0.15717480255019506, + "grad_norm": 2.1396427154541016, + "learning_rate": 4.701392063630739e-05, + "loss": 4.7034, + "step": 26428 + }, + { + "epoch": 0.15718074983347607, + "grad_norm": 2.069359540939331, + "learning_rate": 4.701369925551177e-05, + "loss": 4.1612, + "step": 26429 + }, + { + "epoch": 0.15718669711675706, + "grad_norm": 2.0008041858673096, + "learning_rate": 4.7013477867031385e-05, + "loss": 4.3536, + "step": 26430 + }, + { + "epoch": 0.15719264440003805, + "grad_norm": 1.9997189044952393, + "learning_rate": 4.701325647086633e-05, + "loss": 4.4613, + "step": 26431 + }, + { + "epoch": 0.15719859168331907, + "grad_norm": 1.625603437423706, + "learning_rate": 4.701303506701667e-05, + "loss": 4.63, + "step": 26432 + }, + { + "epoch": 0.15720453896660005, + "grad_norm": 1.5895150899887085, + "learning_rate": 4.701281365548249e-05, + "loss": 4.884, + "step": 26433 + }, + { + "epoch": 0.15721048624988104, + "grad_norm": 1.6569048166275024, + "learning_rate": 4.7012592236263865e-05, + "loss": 4.5834, + "step": 26434 + }, + { + "epoch": 0.15721643353316206, + "grad_norm": 1.9942916631698608, + "learning_rate": 4.7012370809360874e-05, + "loss": 4.8536, + "step": 26435 + }, + { + "epoch": 0.15722238081644305, + "grad_norm": 1.7535972595214844, + "learning_rate": 4.701214937477359e-05, + "loss": 4.9008, + "step": 26436 + }, + { + "epoch": 0.15722832809972404, + "grad_norm": 1.9767074584960938, + "learning_rate": 4.7011927932502085e-05, + "loss": 5.4972, + "step": 26437 + }, + { + "epoch": 0.15723427538300505, + "grad_norm": 1.6117023229599, + "learning_rate": 4.701170648254645e-05, + "loss": 5.2583, + "step": 26438 + }, + { + "epoch": 0.15724022266628604, + "grad_norm": 1.6277034282684326, + "learning_rate": 4.7011485024906754e-05, + "loss": 5.0635, + "step": 26439 + }, + { + "epoch": 0.15724616994956703, + "grad_norm": 1.5075265169143677, + "learning_rate": 4.701126355958308e-05, + "loss": 5.2974, + "step": 26440 + }, + { + "epoch": 0.15725211723284804, + "grad_norm": 1.377233862876892, + "learning_rate": 4.70110420865755e-05, + "loss": 5.0643, + "step": 26441 + }, + { + "epoch": 0.15725806451612903, + "grad_norm": 1.5468838214874268, + "learning_rate": 4.7010820605884085e-05, + "loss": 5.0746, + "step": 26442 + }, + { + "epoch": 0.15726401179941002, + "grad_norm": 1.864901065826416, + "learning_rate": 4.701059911750893e-05, + "loss": 5.0492, + "step": 26443 + }, + { + "epoch": 0.15726995908269104, + "grad_norm": 2.086214542388916, + "learning_rate": 4.70103776214501e-05, + "loss": 4.8566, + "step": 26444 + }, + { + "epoch": 0.15727590636597202, + "grad_norm": 1.571226716041565, + "learning_rate": 4.701015611770767e-05, + "loss": 4.7567, + "step": 26445 + }, + { + "epoch": 0.157281853649253, + "grad_norm": 2.299607753753662, + "learning_rate": 4.7009934606281726e-05, + "loss": 4.8576, + "step": 26446 + }, + { + "epoch": 0.15728780093253403, + "grad_norm": 2.019814968109131, + "learning_rate": 4.7009713087172335e-05, + "loss": 4.6524, + "step": 26447 + }, + { + "epoch": 0.15729374821581502, + "grad_norm": 1.8718371391296387, + "learning_rate": 4.700949156037959e-05, + "loss": 4.6629, + "step": 26448 + }, + { + "epoch": 0.157299695499096, + "grad_norm": 1.9023678302764893, + "learning_rate": 4.700927002590355e-05, + "loss": 4.8558, + "step": 26449 + }, + { + "epoch": 0.15730564278237702, + "grad_norm": 1.8519774675369263, + "learning_rate": 4.700904848374431e-05, + "loss": 4.8498, + "step": 26450 + }, + { + "epoch": 0.157311590065658, + "grad_norm": 2.1003715991973877, + "learning_rate": 4.7008826933901937e-05, + "loss": 4.9443, + "step": 26451 + }, + { + "epoch": 0.157317537348939, + "grad_norm": 1.8350003957748413, + "learning_rate": 4.7008605376376504e-05, + "loss": 4.9194, + "step": 26452 + }, + { + "epoch": 0.15732348463222, + "grad_norm": 1.9740381240844727, + "learning_rate": 4.70083838111681e-05, + "loss": 5.035, + "step": 26453 + }, + { + "epoch": 0.157329431915501, + "grad_norm": 1.8660650253295898, + "learning_rate": 4.700816223827679e-05, + "loss": 4.7712, + "step": 26454 + }, + { + "epoch": 0.157335379198782, + "grad_norm": 2.6117658615112305, + "learning_rate": 4.700794065770266e-05, + "loss": 4.0286, + "step": 26455 + }, + { + "epoch": 0.157341326482063, + "grad_norm": 2.0968191623687744, + "learning_rate": 4.700771906944579e-05, + "loss": 4.505, + "step": 26456 + }, + { + "epoch": 0.157347273765344, + "grad_norm": 2.0062074661254883, + "learning_rate": 4.700749747350624e-05, + "loss": 4.806, + "step": 26457 + }, + { + "epoch": 0.15735322104862498, + "grad_norm": 1.8398696184158325, + "learning_rate": 4.700727586988412e-05, + "loss": 4.799, + "step": 26458 + }, + { + "epoch": 0.157359168331906, + "grad_norm": 1.8096837997436523, + "learning_rate": 4.7007054258579474e-05, + "loss": 5.0503, + "step": 26459 + }, + { + "epoch": 0.15736511561518698, + "grad_norm": 1.735893726348877, + "learning_rate": 4.7006832639592396e-05, + "loss": 5.037, + "step": 26460 + }, + { + "epoch": 0.15737106289846797, + "grad_norm": 1.9189250469207764, + "learning_rate": 4.7006611012922966e-05, + "loss": 5.3352, + "step": 26461 + }, + { + "epoch": 0.157377010181749, + "grad_norm": 2.387317657470703, + "learning_rate": 4.7006389378571246e-05, + "loss": 4.055, + "step": 26462 + }, + { + "epoch": 0.15738295746502998, + "grad_norm": 2.414651870727539, + "learning_rate": 4.7006167736537323e-05, + "loss": 3.7756, + "step": 26463 + }, + { + "epoch": 0.15738890474831096, + "grad_norm": 2.497237205505371, + "learning_rate": 4.700594608682127e-05, + "loss": 3.7823, + "step": 26464 + }, + { + "epoch": 0.15739485203159198, + "grad_norm": 2.2141029834747314, + "learning_rate": 4.700572442942318e-05, + "loss": 4.1131, + "step": 26465 + }, + { + "epoch": 0.15740079931487297, + "grad_norm": 1.8615038394927979, + "learning_rate": 4.700550276434312e-05, + "loss": 4.8686, + "step": 26466 + }, + { + "epoch": 0.15740674659815396, + "grad_norm": 1.7082819938659668, + "learning_rate": 4.700528109158115e-05, + "loss": 5.2237, + "step": 26467 + }, + { + "epoch": 0.15741269388143497, + "grad_norm": 1.8039544820785522, + "learning_rate": 4.700505941113739e-05, + "loss": 4.5243, + "step": 26468 + }, + { + "epoch": 0.15741864116471596, + "grad_norm": 1.874585509300232, + "learning_rate": 4.700483772301187e-05, + "loss": 4.7674, + "step": 26469 + }, + { + "epoch": 0.15742458844799695, + "grad_norm": 2.083904266357422, + "learning_rate": 4.70046160272047e-05, + "loss": 4.8949, + "step": 26470 + }, + { + "epoch": 0.15743053573127797, + "grad_norm": 1.3937793970108032, + "learning_rate": 4.700439432371593e-05, + "loss": 5.6113, + "step": 26471 + }, + { + "epoch": 0.15743648301455895, + "grad_norm": 1.924481987953186, + "learning_rate": 4.700417261254567e-05, + "loss": 5.1439, + "step": 26472 + }, + { + "epoch": 0.15744243029783994, + "grad_norm": 1.6527281999588013, + "learning_rate": 4.700395089369397e-05, + "loss": 5.6962, + "step": 26473 + }, + { + "epoch": 0.15744837758112096, + "grad_norm": 1.5053030252456665, + "learning_rate": 4.700372916716093e-05, + "loss": 4.7299, + "step": 26474 + }, + { + "epoch": 0.15745432486440195, + "grad_norm": 1.2048367261886597, + "learning_rate": 4.7003507432946604e-05, + "loss": 5.5429, + "step": 26475 + }, + { + "epoch": 0.15746027214768293, + "grad_norm": 1.3451159000396729, + "learning_rate": 4.700328569105108e-05, + "loss": 5.5326, + "step": 26476 + }, + { + "epoch": 0.15746621943096392, + "grad_norm": 1.4441956281661987, + "learning_rate": 4.700306394147445e-05, + "loss": 5.5795, + "step": 26477 + }, + { + "epoch": 0.15747216671424494, + "grad_norm": 1.5551849603652954, + "learning_rate": 4.700284218421676e-05, + "loss": 5.2977, + "step": 26478 + }, + { + "epoch": 0.15747811399752593, + "grad_norm": 1.713437795639038, + "learning_rate": 4.7002620419278115e-05, + "loss": 5.242, + "step": 26479 + }, + { + "epoch": 0.15748406128080691, + "grad_norm": 1.4137530326843262, + "learning_rate": 4.7002398646658586e-05, + "loss": 5.2396, + "step": 26480 + }, + { + "epoch": 0.15749000856408793, + "grad_norm": 1.846640706062317, + "learning_rate": 4.700217686635824e-05, + "loss": 4.926, + "step": 26481 + }, + { + "epoch": 0.15749595584736892, + "grad_norm": 2.2699780464172363, + "learning_rate": 4.7001955078377156e-05, + "loss": 3.8352, + "step": 26482 + }, + { + "epoch": 0.1575019031306499, + "grad_norm": 1.959821105003357, + "learning_rate": 4.700173328271543e-05, + "loss": 4.7261, + "step": 26483 + }, + { + "epoch": 0.15750785041393092, + "grad_norm": 1.5478743314743042, + "learning_rate": 4.700151147937312e-05, + "loss": 5.463, + "step": 26484 + }, + { + "epoch": 0.1575137976972119, + "grad_norm": 1.835830807685852, + "learning_rate": 4.7001289668350314e-05, + "loss": 4.9938, + "step": 26485 + }, + { + "epoch": 0.1575197449804929, + "grad_norm": 2.1762354373931885, + "learning_rate": 4.700106784964708e-05, + "loss": 4.0548, + "step": 26486 + }, + { + "epoch": 0.15752569226377391, + "grad_norm": 1.8922265768051147, + "learning_rate": 4.70008460232635e-05, + "loss": 4.1947, + "step": 26487 + }, + { + "epoch": 0.1575316395470549, + "grad_norm": 1.6450932025909424, + "learning_rate": 4.7000624189199646e-05, + "loss": 5.014, + "step": 26488 + }, + { + "epoch": 0.1575375868303359, + "grad_norm": 1.5196298360824585, + "learning_rate": 4.7000402347455616e-05, + "loss": 5.332, + "step": 26489 + }, + { + "epoch": 0.1575435341136169, + "grad_norm": 1.665044903755188, + "learning_rate": 4.700018049803146e-05, + "loss": 4.992, + "step": 26490 + }, + { + "epoch": 0.1575494813968979, + "grad_norm": 1.4281147718429565, + "learning_rate": 4.6999958640927275e-05, + "loss": 4.9014, + "step": 26491 + }, + { + "epoch": 0.15755542868017888, + "grad_norm": 1.4559162855148315, + "learning_rate": 4.6999736776143135e-05, + "loss": 4.9361, + "step": 26492 + }, + { + "epoch": 0.1575613759634599, + "grad_norm": 1.7235175371170044, + "learning_rate": 4.699951490367911e-05, + "loss": 5.2429, + "step": 26493 + }, + { + "epoch": 0.1575673232467409, + "grad_norm": 1.5422228574752808, + "learning_rate": 4.699929302353528e-05, + "loss": 5.5294, + "step": 26494 + }, + { + "epoch": 0.15757327053002188, + "grad_norm": 1.6905406713485718, + "learning_rate": 4.699907113571173e-05, + "loss": 5.0958, + "step": 26495 + }, + { + "epoch": 0.1575792178133029, + "grad_norm": 1.8692830801010132, + "learning_rate": 4.699884924020853e-05, + "loss": 4.7711, + "step": 26496 + }, + { + "epoch": 0.15758516509658388, + "grad_norm": 1.7128182649612427, + "learning_rate": 4.699862733702575e-05, + "loss": 5.344, + "step": 26497 + }, + { + "epoch": 0.15759111237986487, + "grad_norm": 1.7795850038528442, + "learning_rate": 4.6998405426163486e-05, + "loss": 5.044, + "step": 26498 + }, + { + "epoch": 0.15759705966314588, + "grad_norm": 1.8591927289962769, + "learning_rate": 4.6998183507621804e-05, + "loss": 5.7269, + "step": 26499 + }, + { + "epoch": 0.15760300694642687, + "grad_norm": 1.7289692163467407, + "learning_rate": 4.6997961581400785e-05, + "loss": 5.295, + "step": 26500 + }, + { + "epoch": 0.15760895422970786, + "grad_norm": 2.03056001663208, + "learning_rate": 4.699773964750049e-05, + "loss": 4.9402, + "step": 26501 + }, + { + "epoch": 0.15761490151298888, + "grad_norm": 1.7518073320388794, + "learning_rate": 4.699751770592104e-05, + "loss": 4.8934, + "step": 26502 + }, + { + "epoch": 0.15762084879626986, + "grad_norm": 1.7724835872650146, + "learning_rate": 4.6997295756662465e-05, + "loss": 4.6237, + "step": 26503 + }, + { + "epoch": 0.15762679607955085, + "grad_norm": 1.475229263305664, + "learning_rate": 4.699707379972485e-05, + "loss": 5.2655, + "step": 26504 + }, + { + "epoch": 0.15763274336283187, + "grad_norm": 1.4267539978027344, + "learning_rate": 4.69968518351083e-05, + "loss": 5.2016, + "step": 26505 + }, + { + "epoch": 0.15763869064611286, + "grad_norm": 2.1211252212524414, + "learning_rate": 4.699662986281288e-05, + "loss": 4.1632, + "step": 26506 + }, + { + "epoch": 0.15764463792939384, + "grad_norm": 2.0549299716949463, + "learning_rate": 4.699640788283866e-05, + "loss": 4.0886, + "step": 26507 + }, + { + "epoch": 0.15765058521267486, + "grad_norm": 2.210500717163086, + "learning_rate": 4.699618589518572e-05, + "loss": 4.3042, + "step": 26508 + }, + { + "epoch": 0.15765653249595585, + "grad_norm": 2.2884981632232666, + "learning_rate": 4.699596389985413e-05, + "loss": 4.178, + "step": 26509 + }, + { + "epoch": 0.15766247977923684, + "grad_norm": 2.24526047706604, + "learning_rate": 4.699574189684399e-05, + "loss": 4.2319, + "step": 26510 + }, + { + "epoch": 0.15766842706251785, + "grad_norm": 2.401103973388672, + "learning_rate": 4.699551988615535e-05, + "loss": 4.1215, + "step": 26511 + }, + { + "epoch": 0.15767437434579884, + "grad_norm": 2.3012118339538574, + "learning_rate": 4.699529786778831e-05, + "loss": 4.3254, + "step": 26512 + }, + { + "epoch": 0.15768032162907983, + "grad_norm": 1.963396668434143, + "learning_rate": 4.699507584174294e-05, + "loss": 4.4707, + "step": 26513 + }, + { + "epoch": 0.15768626891236084, + "grad_norm": 2.3375425338745117, + "learning_rate": 4.699485380801931e-05, + "loss": 4.2861, + "step": 26514 + }, + { + "epoch": 0.15769221619564183, + "grad_norm": 2.189077377319336, + "learning_rate": 4.699463176661751e-05, + "loss": 4.3273, + "step": 26515 + }, + { + "epoch": 0.15769816347892282, + "grad_norm": 1.8198938369750977, + "learning_rate": 4.699440971753761e-05, + "loss": 4.6847, + "step": 26516 + }, + { + "epoch": 0.15770411076220384, + "grad_norm": 1.646579623222351, + "learning_rate": 4.699418766077969e-05, + "loss": 5.126, + "step": 26517 + }, + { + "epoch": 0.15771005804548482, + "grad_norm": 2.0718090534210205, + "learning_rate": 4.6993965596343825e-05, + "loss": 4.5059, + "step": 26518 + }, + { + "epoch": 0.1577160053287658, + "grad_norm": 1.6022831201553345, + "learning_rate": 4.699374352423009e-05, + "loss": 5.5119, + "step": 26519 + }, + { + "epoch": 0.15772195261204683, + "grad_norm": 1.3838839530944824, + "learning_rate": 4.699352144443857e-05, + "loss": 5.0512, + "step": 26520 + }, + { + "epoch": 0.15772789989532782, + "grad_norm": 1.3122941255569458, + "learning_rate": 4.699329935696934e-05, + "loss": 5.1832, + "step": 26521 + }, + { + "epoch": 0.1577338471786088, + "grad_norm": 1.6332945823669434, + "learning_rate": 4.699307726182247e-05, + "loss": 5.081, + "step": 26522 + }, + { + "epoch": 0.15773979446188982, + "grad_norm": 1.5045149326324463, + "learning_rate": 4.699285515899805e-05, + "loss": 5.2076, + "step": 26523 + }, + { + "epoch": 0.1577457417451708, + "grad_norm": 1.4530036449432373, + "learning_rate": 4.699263304849615e-05, + "loss": 5.3623, + "step": 26524 + }, + { + "epoch": 0.1577516890284518, + "grad_norm": 1.6600695848464966, + "learning_rate": 4.699241093031685e-05, + "loss": 5.5862, + "step": 26525 + }, + { + "epoch": 0.1577576363117328, + "grad_norm": 1.6276617050170898, + "learning_rate": 4.6992188804460225e-05, + "loss": 5.282, + "step": 26526 + }, + { + "epoch": 0.1577635835950138, + "grad_norm": 1.7213892936706543, + "learning_rate": 4.6991966670926355e-05, + "loss": 5.4613, + "step": 26527 + }, + { + "epoch": 0.1577695308782948, + "grad_norm": 1.63749361038208, + "learning_rate": 4.6991744529715316e-05, + "loss": 5.4498, + "step": 26528 + }, + { + "epoch": 0.1577754781615758, + "grad_norm": 1.5182081460952759, + "learning_rate": 4.6991522380827184e-05, + "loss": 5.3962, + "step": 26529 + }, + { + "epoch": 0.1577814254448568, + "grad_norm": 1.6695536375045776, + "learning_rate": 4.699130022426204e-05, + "loss": 5.1221, + "step": 26530 + }, + { + "epoch": 0.15778737272813778, + "grad_norm": 1.4350519180297852, + "learning_rate": 4.6991078060019966e-05, + "loss": 5.319, + "step": 26531 + }, + { + "epoch": 0.1577933200114188, + "grad_norm": 1.2092465162277222, + "learning_rate": 4.699085588810103e-05, + "loss": 5.4316, + "step": 26532 + }, + { + "epoch": 0.15779926729469979, + "grad_norm": 1.474252700805664, + "learning_rate": 4.6990633708505304e-05, + "loss": 5.6559, + "step": 26533 + }, + { + "epoch": 0.15780521457798077, + "grad_norm": 1.6271101236343384, + "learning_rate": 4.699041152123289e-05, + "loss": 5.7491, + "step": 26534 + }, + { + "epoch": 0.1578111618612618, + "grad_norm": 1.6184288263320923, + "learning_rate": 4.699018932628384e-05, + "loss": 5.3195, + "step": 26535 + }, + { + "epoch": 0.15781710914454278, + "grad_norm": 1.3626726865768433, + "learning_rate": 4.698996712365825e-05, + "loss": 5.2913, + "step": 26536 + }, + { + "epoch": 0.15782305642782377, + "grad_norm": 2.3408188819885254, + "learning_rate": 4.6989744913356185e-05, + "loss": 4.774, + "step": 26537 + }, + { + "epoch": 0.15782900371110475, + "grad_norm": 1.500992774963379, + "learning_rate": 4.698952269537773e-05, + "loss": 5.5717, + "step": 26538 + }, + { + "epoch": 0.15783495099438577, + "grad_norm": 1.393517017364502, + "learning_rate": 4.6989300469722955e-05, + "loss": 6.1478, + "step": 26539 + }, + { + "epoch": 0.15784089827766676, + "grad_norm": 1.6048024892807007, + "learning_rate": 4.698907823639195e-05, + "loss": 5.5076, + "step": 26540 + }, + { + "epoch": 0.15784684556094775, + "grad_norm": 1.7231130599975586, + "learning_rate": 4.698885599538478e-05, + "loss": 5.1799, + "step": 26541 + }, + { + "epoch": 0.15785279284422876, + "grad_norm": 1.4809112548828125, + "learning_rate": 4.6988633746701525e-05, + "loss": 5.146, + "step": 26542 + }, + { + "epoch": 0.15785874012750975, + "grad_norm": 1.6530802249908447, + "learning_rate": 4.6988411490342266e-05, + "loss": 5.3245, + "step": 26543 + }, + { + "epoch": 0.15786468741079074, + "grad_norm": 1.5264098644256592, + "learning_rate": 4.6988189226307087e-05, + "loss": 5.3715, + "step": 26544 + }, + { + "epoch": 0.15787063469407175, + "grad_norm": 1.3241318464279175, + "learning_rate": 4.6987966954596054e-05, + "loss": 5.387, + "step": 26545 + }, + { + "epoch": 0.15787658197735274, + "grad_norm": 1.6130857467651367, + "learning_rate": 4.698774467520924e-05, + "loss": 5.2902, + "step": 26546 + }, + { + "epoch": 0.15788252926063373, + "grad_norm": 1.4999042749404907, + "learning_rate": 4.698752238814674e-05, + "loss": 5.2129, + "step": 26547 + }, + { + "epoch": 0.15788847654391475, + "grad_norm": 1.4773963689804077, + "learning_rate": 4.698730009340863e-05, + "loss": 5.7722, + "step": 26548 + }, + { + "epoch": 0.15789442382719573, + "grad_norm": 1.666413426399231, + "learning_rate": 4.698707779099497e-05, + "loss": 5.7418, + "step": 26549 + }, + { + "epoch": 0.15790037111047672, + "grad_norm": 1.4869890213012695, + "learning_rate": 4.698685548090585e-05, + "loss": 4.8418, + "step": 26550 + }, + { + "epoch": 0.15790631839375774, + "grad_norm": 1.6295100450515747, + "learning_rate": 4.698663316314135e-05, + "loss": 4.7722, + "step": 26551 + }, + { + "epoch": 0.15791226567703873, + "grad_norm": 1.5449434518814087, + "learning_rate": 4.698641083770154e-05, + "loss": 5.0621, + "step": 26552 + }, + { + "epoch": 0.15791821296031971, + "grad_norm": 1.6735725402832031, + "learning_rate": 4.6986188504586507e-05, + "loss": 5.5605, + "step": 26553 + }, + { + "epoch": 0.15792416024360073, + "grad_norm": 1.6270878314971924, + "learning_rate": 4.698596616379631e-05, + "loss": 5.279, + "step": 26554 + }, + { + "epoch": 0.15793010752688172, + "grad_norm": 1.6335285902023315, + "learning_rate": 4.698574381533105e-05, + "loss": 5.398, + "step": 26555 + }, + { + "epoch": 0.1579360548101627, + "grad_norm": 2.2176520824432373, + "learning_rate": 4.698552145919079e-05, + "loss": 4.9806, + "step": 26556 + }, + { + "epoch": 0.15794200209344372, + "grad_norm": 1.8645645380020142, + "learning_rate": 4.6985299095375615e-05, + "loss": 5.2633, + "step": 26557 + }, + { + "epoch": 0.1579479493767247, + "grad_norm": 1.708526372909546, + "learning_rate": 4.698507672388559e-05, + "loss": 5.0308, + "step": 26558 + }, + { + "epoch": 0.1579538966600057, + "grad_norm": 2.148980140686035, + "learning_rate": 4.698485434472081e-05, + "loss": 4.5213, + "step": 26559 + }, + { + "epoch": 0.15795984394328672, + "grad_norm": 2.402442693710327, + "learning_rate": 4.6984631957881346e-05, + "loss": 4.4377, + "step": 26560 + }, + { + "epoch": 0.1579657912265677, + "grad_norm": 2.298003911972046, + "learning_rate": 4.698440956336727e-05, + "loss": 4.5809, + "step": 26561 + }, + { + "epoch": 0.1579717385098487, + "grad_norm": 2.53639554977417, + "learning_rate": 4.698418716117867e-05, + "loss": 4.1869, + "step": 26562 + }, + { + "epoch": 0.1579776857931297, + "grad_norm": 2.0686380863189697, + "learning_rate": 4.698396475131561e-05, + "loss": 4.413, + "step": 26563 + }, + { + "epoch": 0.1579836330764107, + "grad_norm": 1.8968595266342163, + "learning_rate": 4.698374233377818e-05, + "loss": 4.9939, + "step": 26564 + }, + { + "epoch": 0.15798958035969168, + "grad_norm": 1.8896044492721558, + "learning_rate": 4.698351990856645e-05, + "loss": 4.6383, + "step": 26565 + }, + { + "epoch": 0.1579955276429727, + "grad_norm": 1.7179672718048096, + "learning_rate": 4.6983297475680496e-05, + "loss": 5.5635, + "step": 26566 + }, + { + "epoch": 0.1580014749262537, + "grad_norm": 1.6506478786468506, + "learning_rate": 4.6983075035120404e-05, + "loss": 5.1821, + "step": 26567 + }, + { + "epoch": 0.15800742220953468, + "grad_norm": 2.180238723754883, + "learning_rate": 4.698285258688625e-05, + "loss": 4.1298, + "step": 26568 + }, + { + "epoch": 0.1580133694928157, + "grad_norm": 2.208676338195801, + "learning_rate": 4.698263013097811e-05, + "loss": 4.3238, + "step": 26569 + }, + { + "epoch": 0.15801931677609668, + "grad_norm": 1.694823145866394, + "learning_rate": 4.6982407667396055e-05, + "loss": 5.3418, + "step": 26570 + }, + { + "epoch": 0.15802526405937767, + "grad_norm": 1.7310692071914673, + "learning_rate": 4.6982185196140174e-05, + "loss": 5.4066, + "step": 26571 + }, + { + "epoch": 0.15803121134265868, + "grad_norm": 2.302055597305298, + "learning_rate": 4.698196271721054e-05, + "loss": 4.1817, + "step": 26572 + }, + { + "epoch": 0.15803715862593967, + "grad_norm": 1.872363567352295, + "learning_rate": 4.698174023060722e-05, + "loss": 4.6733, + "step": 26573 + }, + { + "epoch": 0.15804310590922066, + "grad_norm": 2.134537696838379, + "learning_rate": 4.698151773633032e-05, + "loss": 4.3211, + "step": 26574 + }, + { + "epoch": 0.15804905319250168, + "grad_norm": 2.4381020069122314, + "learning_rate": 4.698129523437989e-05, + "loss": 4.2212, + "step": 26575 + }, + { + "epoch": 0.15805500047578266, + "grad_norm": 1.6739851236343384, + "learning_rate": 4.6981072724756e-05, + "loss": 5.3057, + "step": 26576 + }, + { + "epoch": 0.15806094775906365, + "grad_norm": 1.8092267513275146, + "learning_rate": 4.6980850207458765e-05, + "loss": 4.7359, + "step": 26577 + }, + { + "epoch": 0.15806689504234467, + "grad_norm": 1.6420230865478516, + "learning_rate": 4.6980627682488235e-05, + "loss": 5.086, + "step": 26578 + }, + { + "epoch": 0.15807284232562566, + "grad_norm": 1.8741960525512695, + "learning_rate": 4.6980405149844494e-05, + "loss": 4.7842, + "step": 26579 + }, + { + "epoch": 0.15807878960890664, + "grad_norm": 2.6539900302886963, + "learning_rate": 4.698018260952763e-05, + "loss": 3.809, + "step": 26580 + }, + { + "epoch": 0.15808473689218766, + "grad_norm": 1.8262064456939697, + "learning_rate": 4.69799600615377e-05, + "loss": 4.8959, + "step": 26581 + }, + { + "epoch": 0.15809068417546865, + "grad_norm": 1.7090948820114136, + "learning_rate": 4.6979737505874796e-05, + "loss": 4.7723, + "step": 26582 + }, + { + "epoch": 0.15809663145874964, + "grad_norm": 1.5634857416152954, + "learning_rate": 4.6979514942539e-05, + "loss": 4.7533, + "step": 26583 + }, + { + "epoch": 0.15810257874203065, + "grad_norm": 1.6470197439193726, + "learning_rate": 4.697929237153037e-05, + "loss": 5.2194, + "step": 26584 + }, + { + "epoch": 0.15810852602531164, + "grad_norm": 2.060804605484009, + "learning_rate": 4.697906979284901e-05, + "loss": 4.3637, + "step": 26585 + }, + { + "epoch": 0.15811447330859263, + "grad_norm": 2.065943717956543, + "learning_rate": 4.697884720649498e-05, + "loss": 4.8908, + "step": 26586 + }, + { + "epoch": 0.15812042059187364, + "grad_norm": 1.5104914903640747, + "learning_rate": 4.697862461246836e-05, + "loss": 5.7029, + "step": 26587 + }, + { + "epoch": 0.15812636787515463, + "grad_norm": 1.593296766281128, + "learning_rate": 4.697840201076922e-05, + "loss": 5.7005, + "step": 26588 + }, + { + "epoch": 0.15813231515843562, + "grad_norm": 1.6516765356063843, + "learning_rate": 4.697817940139766e-05, + "loss": 5.3843, + "step": 26589 + }, + { + "epoch": 0.15813826244171664, + "grad_norm": 1.3671473264694214, + "learning_rate": 4.697795678435374e-05, + "loss": 5.4862, + "step": 26590 + }, + { + "epoch": 0.15814420972499763, + "grad_norm": 1.4163672924041748, + "learning_rate": 4.697773415963754e-05, + "loss": 5.4793, + "step": 26591 + }, + { + "epoch": 0.1581501570082786, + "grad_norm": 1.5477086305618286, + "learning_rate": 4.697751152724914e-05, + "loss": 5.2835, + "step": 26592 + }, + { + "epoch": 0.15815610429155963, + "grad_norm": 1.6029425859451294, + "learning_rate": 4.697728888718862e-05, + "loss": 5.3689, + "step": 26593 + }, + { + "epoch": 0.15816205157484062, + "grad_norm": 1.5130633115768433, + "learning_rate": 4.697706623945605e-05, + "loss": 6.1627, + "step": 26594 + }, + { + "epoch": 0.1581679988581216, + "grad_norm": 1.5171791315078735, + "learning_rate": 4.697684358405152e-05, + "loss": 4.9849, + "step": 26595 + }, + { + "epoch": 0.1581739461414026, + "grad_norm": 1.449781894683838, + "learning_rate": 4.69766209209751e-05, + "loss": 5.5273, + "step": 26596 + }, + { + "epoch": 0.1581798934246836, + "grad_norm": 1.430094838142395, + "learning_rate": 4.697639825022687e-05, + "loss": 5.6825, + "step": 26597 + }, + { + "epoch": 0.1581858407079646, + "grad_norm": 1.2635716199874878, + "learning_rate": 4.69761755718069e-05, + "loss": 5.2177, + "step": 26598 + }, + { + "epoch": 0.15819178799124559, + "grad_norm": 2.20355224609375, + "learning_rate": 4.697595288571528e-05, + "loss": 4.6664, + "step": 26599 + }, + { + "epoch": 0.1581977352745266, + "grad_norm": 1.586509108543396, + "learning_rate": 4.6975730191952086e-05, + "loss": 5.056, + "step": 26600 + }, + { + "epoch": 0.1582036825578076, + "grad_norm": 1.4773000478744507, + "learning_rate": 4.697550749051738e-05, + "loss": 5.2931, + "step": 26601 + }, + { + "epoch": 0.15820962984108858, + "grad_norm": 1.4557143449783325, + "learning_rate": 4.697528478141125e-05, + "loss": 4.9378, + "step": 26602 + }, + { + "epoch": 0.1582155771243696, + "grad_norm": 1.5859819650650024, + "learning_rate": 4.697506206463379e-05, + "loss": 5.1998, + "step": 26603 + }, + { + "epoch": 0.15822152440765058, + "grad_norm": 1.5068250894546509, + "learning_rate": 4.697483934018505e-05, + "loss": 5.2748, + "step": 26604 + }, + { + "epoch": 0.15822747169093157, + "grad_norm": 1.5842232704162598, + "learning_rate": 4.697461660806513e-05, + "loss": 5.326, + "step": 26605 + }, + { + "epoch": 0.1582334189742126, + "grad_norm": 1.5164762735366821, + "learning_rate": 4.697439386827409e-05, + "loss": 5.2282, + "step": 26606 + }, + { + "epoch": 0.15823936625749357, + "grad_norm": 1.5359309911727905, + "learning_rate": 4.697417112081203e-05, + "loss": 5.3723, + "step": 26607 + }, + { + "epoch": 0.15824531354077456, + "grad_norm": 1.560502529144287, + "learning_rate": 4.6973948365678996e-05, + "loss": 5.0822, + "step": 26608 + }, + { + "epoch": 0.15825126082405558, + "grad_norm": 1.5915874242782593, + "learning_rate": 4.69737256028751e-05, + "loss": 5.2849, + "step": 26609 + }, + { + "epoch": 0.15825720810733657, + "grad_norm": 1.613585352897644, + "learning_rate": 4.697350283240039e-05, + "loss": 5.1898, + "step": 26610 + }, + { + "epoch": 0.15826315539061755, + "grad_norm": 1.5696673393249512, + "learning_rate": 4.6973280054254966e-05, + "loss": 5.2518, + "step": 26611 + }, + { + "epoch": 0.15826910267389857, + "grad_norm": 1.2109240293502808, + "learning_rate": 4.697305726843889e-05, + "loss": 5.4032, + "step": 26612 + }, + { + "epoch": 0.15827504995717956, + "grad_norm": 1.47042715549469, + "learning_rate": 4.697283447495225e-05, + "loss": 5.1456, + "step": 26613 + }, + { + "epoch": 0.15828099724046055, + "grad_norm": 1.3937478065490723, + "learning_rate": 4.697261167379512e-05, + "loss": 5.3592, + "step": 26614 + }, + { + "epoch": 0.15828694452374156, + "grad_norm": 1.6204369068145752, + "learning_rate": 4.6972388864967574e-05, + "loss": 5.2882, + "step": 26615 + }, + { + "epoch": 0.15829289180702255, + "grad_norm": 1.654252290725708, + "learning_rate": 4.69721660484697e-05, + "loss": 5.2655, + "step": 26616 + }, + { + "epoch": 0.15829883909030354, + "grad_norm": 1.583075761795044, + "learning_rate": 4.6971943224301576e-05, + "loss": 5.097, + "step": 26617 + }, + { + "epoch": 0.15830478637358456, + "grad_norm": 1.3745534420013428, + "learning_rate": 4.697172039246326e-05, + "loss": 5.1911, + "step": 26618 + }, + { + "epoch": 0.15831073365686554, + "grad_norm": 1.662632703781128, + "learning_rate": 4.697149755295485e-05, + "loss": 4.9032, + "step": 26619 + }, + { + "epoch": 0.15831668094014653, + "grad_norm": 1.3548792600631714, + "learning_rate": 4.697127470577642e-05, + "loss": 5.3656, + "step": 26620 + }, + { + "epoch": 0.15832262822342755, + "grad_norm": 1.2697865962982178, + "learning_rate": 4.697105185092804e-05, + "loss": 5.2743, + "step": 26621 + }, + { + "epoch": 0.15832857550670854, + "grad_norm": 1.424477458000183, + "learning_rate": 4.69708289884098e-05, + "loss": 5.1278, + "step": 26622 + }, + { + "epoch": 0.15833452278998952, + "grad_norm": 1.5525426864624023, + "learning_rate": 4.697060611822176e-05, + "loss": 5.2804, + "step": 26623 + }, + { + "epoch": 0.15834047007327054, + "grad_norm": 1.5966732501983643, + "learning_rate": 4.697038324036401e-05, + "loss": 5.3546, + "step": 26624 + }, + { + "epoch": 0.15834641735655153, + "grad_norm": 1.4296703338623047, + "learning_rate": 4.6970160354836634e-05, + "loss": 5.1681, + "step": 26625 + }, + { + "epoch": 0.15835236463983252, + "grad_norm": 1.5928189754486084, + "learning_rate": 4.69699374616397e-05, + "loss": 5.2565, + "step": 26626 + }, + { + "epoch": 0.15835831192311353, + "grad_norm": 1.437814712524414, + "learning_rate": 4.696971456077328e-05, + "loss": 5.1813, + "step": 26627 + }, + { + "epoch": 0.15836425920639452, + "grad_norm": 1.4782744646072388, + "learning_rate": 4.696949165223747e-05, + "loss": 5.365, + "step": 26628 + }, + { + "epoch": 0.1583702064896755, + "grad_norm": 1.5123037099838257, + "learning_rate": 4.696926873603233e-05, + "loss": 5.255, + "step": 26629 + }, + { + "epoch": 0.15837615377295652, + "grad_norm": 1.4208122491836548, + "learning_rate": 4.696904581215795e-05, + "loss": 5.0531, + "step": 26630 + }, + { + "epoch": 0.1583821010562375, + "grad_norm": 1.4333672523498535, + "learning_rate": 4.69688228806144e-05, + "loss": 5.1035, + "step": 26631 + }, + { + "epoch": 0.1583880483395185, + "grad_norm": 1.3645392656326294, + "learning_rate": 4.696859994140176e-05, + "loss": 5.0107, + "step": 26632 + }, + { + "epoch": 0.15839399562279952, + "grad_norm": 1.6100040674209595, + "learning_rate": 4.6968376994520116e-05, + "loss": 5.054, + "step": 26633 + }, + { + "epoch": 0.1583999429060805, + "grad_norm": 1.431036353111267, + "learning_rate": 4.696815403996953e-05, + "loss": 5.4406, + "step": 26634 + }, + { + "epoch": 0.1584058901893615, + "grad_norm": 1.6785353422164917, + "learning_rate": 4.6967931077750096e-05, + "loss": 5.7861, + "step": 26635 + }, + { + "epoch": 0.1584118374726425, + "grad_norm": 1.549333095550537, + "learning_rate": 4.6967708107861876e-05, + "loss": 5.6662, + "step": 26636 + }, + { + "epoch": 0.1584177847559235, + "grad_norm": 1.5669690370559692, + "learning_rate": 4.696748513030496e-05, + "loss": 5.3213, + "step": 26637 + }, + { + "epoch": 0.15842373203920448, + "grad_norm": 1.6420881748199463, + "learning_rate": 4.696726214507942e-05, + "loss": 5.2381, + "step": 26638 + }, + { + "epoch": 0.1584296793224855, + "grad_norm": 1.811171293258667, + "learning_rate": 4.6967039152185345e-05, + "loss": 5.3656, + "step": 26639 + }, + { + "epoch": 0.1584356266057665, + "grad_norm": 1.7578849792480469, + "learning_rate": 4.696681615162279e-05, + "loss": 4.8774, + "step": 26640 + }, + { + "epoch": 0.15844157388904748, + "grad_norm": 2.0880799293518066, + "learning_rate": 4.696659314339185e-05, + "loss": 4.945, + "step": 26641 + }, + { + "epoch": 0.1584475211723285, + "grad_norm": 1.4735814332962036, + "learning_rate": 4.6966370127492603e-05, + "loss": 5.5415, + "step": 26642 + }, + { + "epoch": 0.15845346845560948, + "grad_norm": 1.7141392230987549, + "learning_rate": 4.696614710392512e-05, + "loss": 4.8197, + "step": 26643 + }, + { + "epoch": 0.15845941573889047, + "grad_norm": 1.9631140232086182, + "learning_rate": 4.696592407268949e-05, + "loss": 5.0147, + "step": 26644 + }, + { + "epoch": 0.15846536302217148, + "grad_norm": 2.1569128036499023, + "learning_rate": 4.696570103378577e-05, + "loss": 4.8175, + "step": 26645 + }, + { + "epoch": 0.15847131030545247, + "grad_norm": 2.07602596282959, + "learning_rate": 4.696547798721406e-05, + "loss": 5.0289, + "step": 26646 + }, + { + "epoch": 0.15847725758873346, + "grad_norm": 1.5406705141067505, + "learning_rate": 4.696525493297443e-05, + "loss": 5.1569, + "step": 26647 + }, + { + "epoch": 0.15848320487201448, + "grad_norm": 1.630928635597229, + "learning_rate": 4.696503187106695e-05, + "loss": 5.4698, + "step": 26648 + }, + { + "epoch": 0.15848915215529547, + "grad_norm": 1.5992403030395508, + "learning_rate": 4.696480880149171e-05, + "loss": 5.296, + "step": 26649 + }, + { + "epoch": 0.15849509943857645, + "grad_norm": 1.8908748626708984, + "learning_rate": 4.696458572424878e-05, + "loss": 5.0706, + "step": 26650 + }, + { + "epoch": 0.15850104672185747, + "grad_norm": 2.149810552597046, + "learning_rate": 4.6964362639338236e-05, + "loss": 4.8138, + "step": 26651 + }, + { + "epoch": 0.15850699400513846, + "grad_norm": 2.049520254135132, + "learning_rate": 4.696413954676016e-05, + "loss": 4.9173, + "step": 26652 + }, + { + "epoch": 0.15851294128841945, + "grad_norm": 2.03076434135437, + "learning_rate": 4.6963916446514634e-05, + "loss": 4.889, + "step": 26653 + }, + { + "epoch": 0.15851888857170043, + "grad_norm": 1.8261823654174805, + "learning_rate": 4.696369333860173e-05, + "loss": 4.7856, + "step": 26654 + }, + { + "epoch": 0.15852483585498145, + "grad_norm": 1.864707112312317, + "learning_rate": 4.6963470223021535e-05, + "loss": 4.8419, + "step": 26655 + }, + { + "epoch": 0.15853078313826244, + "grad_norm": 1.9796535968780518, + "learning_rate": 4.696324709977411e-05, + "loss": 4.7506, + "step": 26656 + }, + { + "epoch": 0.15853673042154343, + "grad_norm": 1.9936281442642212, + "learning_rate": 4.696302396885954e-05, + "loss": 4.8773, + "step": 26657 + }, + { + "epoch": 0.15854267770482444, + "grad_norm": 1.790238618850708, + "learning_rate": 4.696280083027791e-05, + "loss": 4.7998, + "step": 26658 + }, + { + "epoch": 0.15854862498810543, + "grad_norm": 1.9320149421691895, + "learning_rate": 4.69625776840293e-05, + "loss": 4.7113, + "step": 26659 + }, + { + "epoch": 0.15855457227138642, + "grad_norm": 1.6032037734985352, + "learning_rate": 4.696235453011377e-05, + "loss": 5.2848, + "step": 26660 + }, + { + "epoch": 0.15856051955466743, + "grad_norm": 1.8947795629501343, + "learning_rate": 4.696213136853141e-05, + "loss": 4.7212, + "step": 26661 + }, + { + "epoch": 0.15856646683794842, + "grad_norm": 2.017988681793213, + "learning_rate": 4.69619081992823e-05, + "loss": 4.8043, + "step": 26662 + }, + { + "epoch": 0.1585724141212294, + "grad_norm": 2.114877223968506, + "learning_rate": 4.696168502236652e-05, + "loss": 4.6626, + "step": 26663 + }, + { + "epoch": 0.15857836140451043, + "grad_norm": 2.029026985168457, + "learning_rate": 4.6961461837784134e-05, + "loss": 4.5315, + "step": 26664 + }, + { + "epoch": 0.15858430868779141, + "grad_norm": 2.052255630493164, + "learning_rate": 4.696123864553523e-05, + "loss": 4.6957, + "step": 26665 + }, + { + "epoch": 0.1585902559710724, + "grad_norm": 1.9599274396896362, + "learning_rate": 4.696101544561989e-05, + "loss": 4.792, + "step": 26666 + }, + { + "epoch": 0.15859620325435342, + "grad_norm": 2.1609420776367188, + "learning_rate": 4.6960792238038184e-05, + "loss": 4.8083, + "step": 26667 + }, + { + "epoch": 0.1586021505376344, + "grad_norm": 2.0834262371063232, + "learning_rate": 4.696056902279019e-05, + "loss": 4.7683, + "step": 26668 + }, + { + "epoch": 0.1586080978209154, + "grad_norm": 2.0544068813323975, + "learning_rate": 4.6960345799875995e-05, + "loss": 4.7, + "step": 26669 + }, + { + "epoch": 0.1586140451041964, + "grad_norm": 2.036548137664795, + "learning_rate": 4.696012256929566e-05, + "loss": 4.5653, + "step": 26670 + }, + { + "epoch": 0.1586199923874774, + "grad_norm": 1.7801802158355713, + "learning_rate": 4.6959899331049276e-05, + "loss": 4.7215, + "step": 26671 + }, + { + "epoch": 0.1586259396707584, + "grad_norm": 2.0025057792663574, + "learning_rate": 4.695967608513692e-05, + "loss": 4.6259, + "step": 26672 + }, + { + "epoch": 0.1586318869540394, + "grad_norm": 2.0719566345214844, + "learning_rate": 4.695945283155867e-05, + "loss": 4.7383, + "step": 26673 + }, + { + "epoch": 0.1586378342373204, + "grad_norm": 2.0565052032470703, + "learning_rate": 4.69592295703146e-05, + "loss": 5.2066, + "step": 26674 + }, + { + "epoch": 0.15864378152060138, + "grad_norm": 1.7758921384811401, + "learning_rate": 4.695900630140479e-05, + "loss": 5.5619, + "step": 26675 + }, + { + "epoch": 0.1586497288038824, + "grad_norm": 1.799654483795166, + "learning_rate": 4.695878302482931e-05, + "loss": 5.0901, + "step": 26676 + }, + { + "epoch": 0.15865567608716338, + "grad_norm": 1.785900592803955, + "learning_rate": 4.695855974058826e-05, + "loss": 4.9323, + "step": 26677 + }, + { + "epoch": 0.15866162337044437, + "grad_norm": 1.9525444507598877, + "learning_rate": 4.695833644868169e-05, + "loss": 4.7603, + "step": 26678 + }, + { + "epoch": 0.1586675706537254, + "grad_norm": 1.9197458028793335, + "learning_rate": 4.69581131491097e-05, + "loss": 4.636, + "step": 26679 + }, + { + "epoch": 0.15867351793700638, + "grad_norm": 2.3043594360351562, + "learning_rate": 4.695788984187236e-05, + "loss": 4.4529, + "step": 26680 + }, + { + "epoch": 0.15867946522028736, + "grad_norm": 1.687930703163147, + "learning_rate": 4.6957666526969744e-05, + "loss": 4.9549, + "step": 26681 + }, + { + "epoch": 0.15868541250356838, + "grad_norm": 1.5754574537277222, + "learning_rate": 4.6957443204401935e-05, + "loss": 5.4364, + "step": 26682 + }, + { + "epoch": 0.15869135978684937, + "grad_norm": 1.5300992727279663, + "learning_rate": 4.6957219874169013e-05, + "loss": 5.3151, + "step": 26683 + }, + { + "epoch": 0.15869730707013036, + "grad_norm": 1.7758506536483765, + "learning_rate": 4.695699653627105e-05, + "loss": 5.2053, + "step": 26684 + }, + { + "epoch": 0.15870325435341137, + "grad_norm": 1.5882158279418945, + "learning_rate": 4.6956773190708116e-05, + "loss": 4.8202, + "step": 26685 + }, + { + "epoch": 0.15870920163669236, + "grad_norm": 1.5649267435073853, + "learning_rate": 4.695654983748031e-05, + "loss": 4.3946, + "step": 26686 + }, + { + "epoch": 0.15871514891997335, + "grad_norm": 1.5999925136566162, + "learning_rate": 4.6956326476587696e-05, + "loss": 4.3512, + "step": 26687 + }, + { + "epoch": 0.15872109620325436, + "grad_norm": 1.699987530708313, + "learning_rate": 4.6956103108030356e-05, + "loss": 4.7479, + "step": 26688 + }, + { + "epoch": 0.15872704348653535, + "grad_norm": 1.4755208492279053, + "learning_rate": 4.695587973180837e-05, + "loss": 5.1206, + "step": 26689 + }, + { + "epoch": 0.15873299076981634, + "grad_norm": 1.7642509937286377, + "learning_rate": 4.6955656347921813e-05, + "loss": 5.3179, + "step": 26690 + }, + { + "epoch": 0.15873893805309736, + "grad_norm": 1.5511635541915894, + "learning_rate": 4.695543295637076e-05, + "loss": 4.4365, + "step": 26691 + }, + { + "epoch": 0.15874488533637834, + "grad_norm": 1.5347273349761963, + "learning_rate": 4.6955209557155286e-05, + "loss": 4.368, + "step": 26692 + }, + { + "epoch": 0.15875083261965933, + "grad_norm": 1.5347685813903809, + "learning_rate": 4.695498615027549e-05, + "loss": 4.2812, + "step": 26693 + }, + { + "epoch": 0.15875677990294035, + "grad_norm": 1.5469902753829956, + "learning_rate": 4.6954762735731425e-05, + "loss": 4.4445, + "step": 26694 + }, + { + "epoch": 0.15876272718622134, + "grad_norm": 1.4887003898620605, + "learning_rate": 4.695453931352318e-05, + "loss": 4.3584, + "step": 26695 + }, + { + "epoch": 0.15876867446950232, + "grad_norm": 1.5207375288009644, + "learning_rate": 4.695431588365084e-05, + "loss": 4.7219, + "step": 26696 + }, + { + "epoch": 0.15877462175278334, + "grad_norm": 1.7801141738891602, + "learning_rate": 4.695409244611447e-05, + "loss": 5.0328, + "step": 26697 + }, + { + "epoch": 0.15878056903606433, + "grad_norm": 1.8171552419662476, + "learning_rate": 4.695386900091415e-05, + "loss": 5.4286, + "step": 26698 + }, + { + "epoch": 0.15878651631934532, + "grad_norm": 1.698379397392273, + "learning_rate": 4.695364554804996e-05, + "loss": 4.7824, + "step": 26699 + }, + { + "epoch": 0.15879246360262633, + "grad_norm": 1.5824103355407715, + "learning_rate": 4.695342208752199e-05, + "loss": 4.2949, + "step": 26700 + }, + { + "epoch": 0.15879841088590732, + "grad_norm": 1.5890088081359863, + "learning_rate": 4.6953198619330295e-05, + "loss": 4.3689, + "step": 26701 + }, + { + "epoch": 0.1588043581691883, + "grad_norm": 1.6158654689788818, + "learning_rate": 4.6952975143474975e-05, + "loss": 4.7294, + "step": 26702 + }, + { + "epoch": 0.15881030545246932, + "grad_norm": 1.5613304376602173, + "learning_rate": 4.695275165995609e-05, + "loss": 5.4067, + "step": 26703 + }, + { + "epoch": 0.1588162527357503, + "grad_norm": 1.5085046291351318, + "learning_rate": 4.695252816877373e-05, + "loss": 5.4355, + "step": 26704 + }, + { + "epoch": 0.1588222000190313, + "grad_norm": 1.6180028915405273, + "learning_rate": 4.695230466992797e-05, + "loss": 5.0541, + "step": 26705 + }, + { + "epoch": 0.15882814730231232, + "grad_norm": 1.8564766645431519, + "learning_rate": 4.695208116341888e-05, + "loss": 5.3307, + "step": 26706 + }, + { + "epoch": 0.1588340945855933, + "grad_norm": 1.762041449546814, + "learning_rate": 4.6951857649246555e-05, + "loss": 5.2526, + "step": 26707 + }, + { + "epoch": 0.1588400418688743, + "grad_norm": 1.5610746145248413, + "learning_rate": 4.695163412741106e-05, + "loss": 5.0561, + "step": 26708 + }, + { + "epoch": 0.1588459891521553, + "grad_norm": 1.6463086605072021, + "learning_rate": 4.695141059791247e-05, + "loss": 4.9357, + "step": 26709 + }, + { + "epoch": 0.1588519364354363, + "grad_norm": 1.794967532157898, + "learning_rate": 4.695118706075088e-05, + "loss": 5.4786, + "step": 26710 + }, + { + "epoch": 0.15885788371871729, + "grad_norm": 1.6720161437988281, + "learning_rate": 4.695096351592635e-05, + "loss": 5.4702, + "step": 26711 + }, + { + "epoch": 0.15886383100199827, + "grad_norm": 1.6844518184661865, + "learning_rate": 4.6950739963438975e-05, + "loss": 5.2407, + "step": 26712 + }, + { + "epoch": 0.1588697782852793, + "grad_norm": 1.7027579545974731, + "learning_rate": 4.695051640328881e-05, + "loss": 5.1027, + "step": 26713 + }, + { + "epoch": 0.15887572556856028, + "grad_norm": 2.385157823562622, + "learning_rate": 4.695029283547595e-05, + "loss": 4.6013, + "step": 26714 + }, + { + "epoch": 0.15888167285184127, + "grad_norm": 1.7393914461135864, + "learning_rate": 4.6950069260000475e-05, + "loss": 4.9536, + "step": 26715 + }, + { + "epoch": 0.15888762013512228, + "grad_norm": 1.5079336166381836, + "learning_rate": 4.694984567686246e-05, + "loss": 4.9043, + "step": 26716 + }, + { + "epoch": 0.15889356741840327, + "grad_norm": 1.3347656726837158, + "learning_rate": 4.694962208606197e-05, + "loss": 5.6922, + "step": 26717 + }, + { + "epoch": 0.15889951470168426, + "grad_norm": 1.8166699409484863, + "learning_rate": 4.6949398487599106e-05, + "loss": 5.3646, + "step": 26718 + }, + { + "epoch": 0.15890546198496527, + "grad_norm": 1.7105693817138672, + "learning_rate": 4.694917488147392e-05, + "loss": 5.3915, + "step": 26719 + }, + { + "epoch": 0.15891140926824626, + "grad_norm": 2.5474836826324463, + "learning_rate": 4.6948951267686514e-05, + "loss": 4.7226, + "step": 26720 + }, + { + "epoch": 0.15891735655152725, + "grad_norm": 2.544551372528076, + "learning_rate": 4.694872764623696e-05, + "loss": 4.5184, + "step": 26721 + }, + { + "epoch": 0.15892330383480827, + "grad_norm": 1.6088052988052368, + "learning_rate": 4.6948504017125316e-05, + "loss": 4.913, + "step": 26722 + }, + { + "epoch": 0.15892925111808925, + "grad_norm": 2.0992431640625, + "learning_rate": 4.6948280380351686e-05, + "loss": 4.7943, + "step": 26723 + }, + { + "epoch": 0.15893519840137024, + "grad_norm": 2.4200751781463623, + "learning_rate": 4.6948056735916135e-05, + "loss": 4.6055, + "step": 26724 + }, + { + "epoch": 0.15894114568465126, + "grad_norm": 1.802924633026123, + "learning_rate": 4.694783308381875e-05, + "loss": 4.2752, + "step": 26725 + }, + { + "epoch": 0.15894709296793225, + "grad_norm": 1.8056386709213257, + "learning_rate": 4.694760942405959e-05, + "loss": 5.0256, + "step": 26726 + }, + { + "epoch": 0.15895304025121323, + "grad_norm": 1.5216751098632812, + "learning_rate": 4.694738575663876e-05, + "loss": 4.9218, + "step": 26727 + }, + { + "epoch": 0.15895898753449425, + "grad_norm": 2.3782224655151367, + "learning_rate": 4.694716208155632e-05, + "loss": 4.7504, + "step": 26728 + }, + { + "epoch": 0.15896493481777524, + "grad_norm": 2.0227694511413574, + "learning_rate": 4.694693839881236e-05, + "loss": 4.5376, + "step": 26729 + }, + { + "epoch": 0.15897088210105623, + "grad_norm": 2.289433240890503, + "learning_rate": 4.694671470840693e-05, + "loss": 4.4428, + "step": 26730 + }, + { + "epoch": 0.15897682938433724, + "grad_norm": 2.2303051948547363, + "learning_rate": 4.694649101034015e-05, + "loss": 4.326, + "step": 26731 + }, + { + "epoch": 0.15898277666761823, + "grad_norm": 1.6835930347442627, + "learning_rate": 4.6946267304612067e-05, + "loss": 4.7231, + "step": 26732 + }, + { + "epoch": 0.15898872395089922, + "grad_norm": 1.6131420135498047, + "learning_rate": 4.694604359122277e-05, + "loss": 5.5532, + "step": 26733 + }, + { + "epoch": 0.15899467123418023, + "grad_norm": 1.4710865020751953, + "learning_rate": 4.6945819870172335e-05, + "loss": 5.3341, + "step": 26734 + }, + { + "epoch": 0.15900061851746122, + "grad_norm": 1.5708924531936646, + "learning_rate": 4.694559614146085e-05, + "loss": 4.9195, + "step": 26735 + }, + { + "epoch": 0.1590065658007422, + "grad_norm": 1.5540367364883423, + "learning_rate": 4.6945372405088374e-05, + "loss": 5.2529, + "step": 26736 + }, + { + "epoch": 0.15901251308402323, + "grad_norm": 1.8328397274017334, + "learning_rate": 4.6945148661054995e-05, + "loss": 5.0446, + "step": 26737 + }, + { + "epoch": 0.15901846036730422, + "grad_norm": 1.9213111400604248, + "learning_rate": 4.694492490936079e-05, + "loss": 4.505, + "step": 26738 + }, + { + "epoch": 0.1590244076505852, + "grad_norm": 1.6417537927627563, + "learning_rate": 4.694470115000584e-05, + "loss": 5.064, + "step": 26739 + }, + { + "epoch": 0.15903035493386622, + "grad_norm": 1.4690046310424805, + "learning_rate": 4.6944477382990224e-05, + "loss": 5.1935, + "step": 26740 + }, + { + "epoch": 0.1590363022171472, + "grad_norm": 1.6286424398422241, + "learning_rate": 4.694425360831402e-05, + "loss": 4.8251, + "step": 26741 + }, + { + "epoch": 0.1590422495004282, + "grad_norm": 1.6581510305404663, + "learning_rate": 4.6944029825977296e-05, + "loss": 4.9166, + "step": 26742 + }, + { + "epoch": 0.1590481967837092, + "grad_norm": 1.4425448179244995, + "learning_rate": 4.694380603598015e-05, + "loss": 4.9857, + "step": 26743 + }, + { + "epoch": 0.1590541440669902, + "grad_norm": 1.6443709135055542, + "learning_rate": 4.694358223832263e-05, + "loss": 4.3642, + "step": 26744 + }, + { + "epoch": 0.1590600913502712, + "grad_norm": 1.8886995315551758, + "learning_rate": 4.6943358433004856e-05, + "loss": 4.2237, + "step": 26745 + }, + { + "epoch": 0.1590660386335522, + "grad_norm": 1.779401421546936, + "learning_rate": 4.6943134620026865e-05, + "loss": 3.8314, + "step": 26746 + }, + { + "epoch": 0.1590719859168332, + "grad_norm": 1.9053362607955933, + "learning_rate": 4.6942910799388755e-05, + "loss": 4.0761, + "step": 26747 + }, + { + "epoch": 0.15907793320011418, + "grad_norm": 1.7256511449813843, + "learning_rate": 4.694268697109061e-05, + "loss": 5.4427, + "step": 26748 + }, + { + "epoch": 0.1590838804833952, + "grad_norm": 1.7450202703475952, + "learning_rate": 4.6942463135132484e-05, + "loss": 4.535, + "step": 26749 + }, + { + "epoch": 0.15908982776667618, + "grad_norm": 1.4825485944747925, + "learning_rate": 4.6942239291514486e-05, + "loss": 4.4373, + "step": 26750 + }, + { + "epoch": 0.15909577504995717, + "grad_norm": 1.5326381921768188, + "learning_rate": 4.6942015440236673e-05, + "loss": 4.3876, + "step": 26751 + }, + { + "epoch": 0.1591017223332382, + "grad_norm": 1.7042746543884277, + "learning_rate": 4.694179158129913e-05, + "loss": 4.6091, + "step": 26752 + }, + { + "epoch": 0.15910766961651918, + "grad_norm": 1.584315299987793, + "learning_rate": 4.6941567714701926e-05, + "loss": 4.5937, + "step": 26753 + }, + { + "epoch": 0.15911361689980016, + "grad_norm": 1.5627310276031494, + "learning_rate": 4.694134384044516e-05, + "loss": 4.719, + "step": 26754 + }, + { + "epoch": 0.15911956418308118, + "grad_norm": 1.726309061050415, + "learning_rate": 4.694111995852889e-05, + "loss": 4.8064, + "step": 26755 + }, + { + "epoch": 0.15912551146636217, + "grad_norm": 1.6186972856521606, + "learning_rate": 4.6940896068953204e-05, + "loss": 5.215, + "step": 26756 + }, + { + "epoch": 0.15913145874964316, + "grad_norm": 1.7018485069274902, + "learning_rate": 4.694067217171818e-05, + "loss": 5.1681, + "step": 26757 + }, + { + "epoch": 0.15913740603292417, + "grad_norm": 1.8986917734146118, + "learning_rate": 4.694044826682389e-05, + "loss": 5.1551, + "step": 26758 + }, + { + "epoch": 0.15914335331620516, + "grad_norm": 1.6398760080337524, + "learning_rate": 4.694022435427042e-05, + "loss": 4.8223, + "step": 26759 + }, + { + "epoch": 0.15914930059948615, + "grad_norm": 1.5714781284332275, + "learning_rate": 4.694000043405784e-05, + "loss": 4.6631, + "step": 26760 + }, + { + "epoch": 0.15915524788276716, + "grad_norm": 1.9300872087478638, + "learning_rate": 4.6939776506186234e-05, + "loss": 4.8107, + "step": 26761 + }, + { + "epoch": 0.15916119516604815, + "grad_norm": 1.8684272766113281, + "learning_rate": 4.6939552570655674e-05, + "loss": 4.9762, + "step": 26762 + }, + { + "epoch": 0.15916714244932914, + "grad_norm": 1.6835062503814697, + "learning_rate": 4.693932862746625e-05, + "loss": 4.8015, + "step": 26763 + }, + { + "epoch": 0.15917308973261016, + "grad_norm": 1.5635250806808472, + "learning_rate": 4.693910467661803e-05, + "loss": 4.7763, + "step": 26764 + }, + { + "epoch": 0.15917903701589114, + "grad_norm": 1.584123134613037, + "learning_rate": 4.69388807181111e-05, + "loss": 4.7093, + "step": 26765 + }, + { + "epoch": 0.15918498429917213, + "grad_norm": 1.597011685371399, + "learning_rate": 4.693865675194553e-05, + "loss": 4.7376, + "step": 26766 + }, + { + "epoch": 0.15919093158245315, + "grad_norm": 1.5018924474716187, + "learning_rate": 4.693843277812141e-05, + "loss": 4.5752, + "step": 26767 + }, + { + "epoch": 0.15919687886573414, + "grad_norm": 1.5398659706115723, + "learning_rate": 4.6938208796638796e-05, + "loss": 4.3835, + "step": 26768 + }, + { + "epoch": 0.15920282614901513, + "grad_norm": 1.753659963607788, + "learning_rate": 4.693798480749778e-05, + "loss": 4.5366, + "step": 26769 + }, + { + "epoch": 0.1592087734322961, + "grad_norm": 1.6807688474655151, + "learning_rate": 4.693776081069845e-05, + "loss": 4.5043, + "step": 26770 + }, + { + "epoch": 0.15921472071557713, + "grad_norm": 1.547088384628296, + "learning_rate": 4.6937536806240865e-05, + "loss": 4.4129, + "step": 26771 + }, + { + "epoch": 0.15922066799885812, + "grad_norm": 1.6225403547286987, + "learning_rate": 4.693731279412512e-05, + "loss": 4.3027, + "step": 26772 + }, + { + "epoch": 0.1592266152821391, + "grad_norm": 1.521183967590332, + "learning_rate": 4.693708877435128e-05, + "loss": 4.3267, + "step": 26773 + }, + { + "epoch": 0.15923256256542012, + "grad_norm": 1.503652572631836, + "learning_rate": 4.693686474691944e-05, + "loss": 4.5069, + "step": 26774 + }, + { + "epoch": 0.1592385098487011, + "grad_norm": 1.3765262365341187, + "learning_rate": 4.693664071182965e-05, + "loss": 4.8385, + "step": 26775 + }, + { + "epoch": 0.1592444571319821, + "grad_norm": 1.552372932434082, + "learning_rate": 4.6936416669082015e-05, + "loss": 4.7109, + "step": 26776 + }, + { + "epoch": 0.1592504044152631, + "grad_norm": 1.5098180770874023, + "learning_rate": 4.693619261867661e-05, + "loss": 4.6682, + "step": 26777 + }, + { + "epoch": 0.1592563516985441, + "grad_norm": 1.7043485641479492, + "learning_rate": 4.69359685606135e-05, + "loss": 4.7291, + "step": 26778 + }, + { + "epoch": 0.1592622989818251, + "grad_norm": 1.342060923576355, + "learning_rate": 4.693574449489277e-05, + "loss": 4.4172, + "step": 26779 + }, + { + "epoch": 0.1592682462651061, + "grad_norm": 1.5385740995407104, + "learning_rate": 4.6935520421514494e-05, + "loss": 4.1767, + "step": 26780 + }, + { + "epoch": 0.1592741935483871, + "grad_norm": 1.3378406763076782, + "learning_rate": 4.6935296340478764e-05, + "loss": 4.419, + "step": 26781 + }, + { + "epoch": 0.15928014083166808, + "grad_norm": 1.5734392404556274, + "learning_rate": 4.693507225178564e-05, + "loss": 4.3342, + "step": 26782 + }, + { + "epoch": 0.1592860881149491, + "grad_norm": 1.9071681499481201, + "learning_rate": 4.6934848155435216e-05, + "loss": 4.4808, + "step": 26783 + }, + { + "epoch": 0.1592920353982301, + "grad_norm": 1.4852991104125977, + "learning_rate": 4.693462405142755e-05, + "loss": 5.2923, + "step": 26784 + }, + { + "epoch": 0.15929798268151107, + "grad_norm": 1.7078371047973633, + "learning_rate": 4.6934399939762746e-05, + "loss": 4.5363, + "step": 26785 + }, + { + "epoch": 0.1593039299647921, + "grad_norm": 1.731362223625183, + "learning_rate": 4.693417582044087e-05, + "loss": 4.3905, + "step": 26786 + }, + { + "epoch": 0.15930987724807308, + "grad_norm": 1.7854750156402588, + "learning_rate": 4.6933951693462e-05, + "loss": 4.6509, + "step": 26787 + }, + { + "epoch": 0.15931582453135407, + "grad_norm": 1.804178237915039, + "learning_rate": 4.69337275588262e-05, + "loss": 4.5157, + "step": 26788 + }, + { + "epoch": 0.15932177181463508, + "grad_norm": 1.9014322757720947, + "learning_rate": 4.693350341653358e-05, + "loss": 4.5673, + "step": 26789 + }, + { + "epoch": 0.15932771909791607, + "grad_norm": 2.1549782752990723, + "learning_rate": 4.693327926658418e-05, + "loss": 4.6754, + "step": 26790 + }, + { + "epoch": 0.15933366638119706, + "grad_norm": 1.9609428644180298, + "learning_rate": 4.693305510897812e-05, + "loss": 4.6832, + "step": 26791 + }, + { + "epoch": 0.15933961366447807, + "grad_norm": 2.0541574954986572, + "learning_rate": 4.693283094371545e-05, + "loss": 4.3928, + "step": 26792 + }, + { + "epoch": 0.15934556094775906, + "grad_norm": 2.151719331741333, + "learning_rate": 4.693260677079625e-05, + "loss": 4.2179, + "step": 26793 + }, + { + "epoch": 0.15935150823104005, + "grad_norm": 1.6300101280212402, + "learning_rate": 4.693238259022062e-05, + "loss": 5.202, + "step": 26794 + }, + { + "epoch": 0.15935745551432107, + "grad_norm": 1.860836148262024, + "learning_rate": 4.69321584019886e-05, + "loss": 4.7327, + "step": 26795 + }, + { + "epoch": 0.15936340279760206, + "grad_norm": 1.7627391815185547, + "learning_rate": 4.6931934206100304e-05, + "loss": 5.0884, + "step": 26796 + }, + { + "epoch": 0.15936935008088304, + "grad_norm": 1.6358652114868164, + "learning_rate": 4.693171000255579e-05, + "loss": 5.1218, + "step": 26797 + }, + { + "epoch": 0.15937529736416406, + "grad_norm": 1.938833475112915, + "learning_rate": 4.693148579135514e-05, + "loss": 5.0097, + "step": 26798 + }, + { + "epoch": 0.15938124464744505, + "grad_norm": 1.6986185312271118, + "learning_rate": 4.6931261572498445e-05, + "loss": 5.0552, + "step": 26799 + }, + { + "epoch": 0.15938719193072604, + "grad_norm": 1.9049108028411865, + "learning_rate": 4.693103734598576e-05, + "loss": 4.5521, + "step": 26800 + }, + { + "epoch": 0.15939313921400705, + "grad_norm": 1.723593831062317, + "learning_rate": 4.693081311181719e-05, + "loss": 4.624, + "step": 26801 + }, + { + "epoch": 0.15939908649728804, + "grad_norm": 1.8977972269058228, + "learning_rate": 4.693058886999279e-05, + "loss": 4.508, + "step": 26802 + }, + { + "epoch": 0.15940503378056903, + "grad_norm": 1.8587881326675415, + "learning_rate": 4.6930364620512656e-05, + "loss": 4.5824, + "step": 26803 + }, + { + "epoch": 0.15941098106385004, + "grad_norm": 2.033412456512451, + "learning_rate": 4.693014036337685e-05, + "loss": 4.2831, + "step": 26804 + }, + { + "epoch": 0.15941692834713103, + "grad_norm": 1.7461220026016235, + "learning_rate": 4.692991609858547e-05, + "loss": 4.3987, + "step": 26805 + }, + { + "epoch": 0.15942287563041202, + "grad_norm": 1.5717246532440186, + "learning_rate": 4.692969182613857e-05, + "loss": 4.4173, + "step": 26806 + }, + { + "epoch": 0.15942882291369304, + "grad_norm": 1.825589656829834, + "learning_rate": 4.692946754603625e-05, + "loss": 4.5616, + "step": 26807 + }, + { + "epoch": 0.15943477019697402, + "grad_norm": 1.5404088497161865, + "learning_rate": 4.6929243258278576e-05, + "loss": 5.393, + "step": 26808 + }, + { + "epoch": 0.159440717480255, + "grad_norm": 2.0158777236938477, + "learning_rate": 4.692901896286563e-05, + "loss": 4.7878, + "step": 26809 + }, + { + "epoch": 0.15944666476353603, + "grad_norm": 2.152909755706787, + "learning_rate": 4.6928794659797494e-05, + "loss": 4.1923, + "step": 26810 + }, + { + "epoch": 0.15945261204681702, + "grad_norm": 2.1839582920074463, + "learning_rate": 4.692857034907423e-05, + "loss": 4.4213, + "step": 26811 + }, + { + "epoch": 0.159458559330098, + "grad_norm": 1.7359018325805664, + "learning_rate": 4.6928346030695934e-05, + "loss": 4.4409, + "step": 26812 + }, + { + "epoch": 0.15946450661337902, + "grad_norm": 1.6525425910949707, + "learning_rate": 4.692812170466269e-05, + "loss": 5.0243, + "step": 26813 + }, + { + "epoch": 0.15947045389666, + "grad_norm": 1.471819519996643, + "learning_rate": 4.692789737097455e-05, + "loss": 5.5855, + "step": 26814 + }, + { + "epoch": 0.159476401179941, + "grad_norm": 1.4903481006622314, + "learning_rate": 4.692767302963162e-05, + "loss": 5.4807, + "step": 26815 + }, + { + "epoch": 0.159482348463222, + "grad_norm": 1.6658556461334229, + "learning_rate": 4.6927448680633954e-05, + "loss": 5.2928, + "step": 26816 + }, + { + "epoch": 0.159488295746503, + "grad_norm": 1.8180750608444214, + "learning_rate": 4.692722432398166e-05, + "loss": 5.0372, + "step": 26817 + }, + { + "epoch": 0.159494243029784, + "grad_norm": 1.4245752096176147, + "learning_rate": 4.692699995967478e-05, + "loss": 4.9285, + "step": 26818 + }, + { + "epoch": 0.159500190313065, + "grad_norm": 1.5879698991775513, + "learning_rate": 4.692677558771342e-05, + "loss": 4.7327, + "step": 26819 + }, + { + "epoch": 0.159506137596346, + "grad_norm": 2.3847367763519287, + "learning_rate": 4.692655120809764e-05, + "loss": 4.0357, + "step": 26820 + }, + { + "epoch": 0.15951208487962698, + "grad_norm": 2.5753002166748047, + "learning_rate": 4.692632682082754e-05, + "loss": 3.9462, + "step": 26821 + }, + { + "epoch": 0.159518032162908, + "grad_norm": 2.6524651050567627, + "learning_rate": 4.6926102425903185e-05, + "loss": 4.1065, + "step": 26822 + }, + { + "epoch": 0.15952397944618898, + "grad_norm": 2.808206558227539, + "learning_rate": 4.692587802332464e-05, + "loss": 4.112, + "step": 26823 + }, + { + "epoch": 0.15952992672946997, + "grad_norm": 1.5214722156524658, + "learning_rate": 4.692565361309201e-05, + "loss": 5.4128, + "step": 26824 + }, + { + "epoch": 0.159535874012751, + "grad_norm": 2.1168901920318604, + "learning_rate": 4.692542919520536e-05, + "loss": 4.1342, + "step": 26825 + }, + { + "epoch": 0.15954182129603198, + "grad_norm": 2.5575170516967773, + "learning_rate": 4.692520476966477e-05, + "loss": 4.0117, + "step": 26826 + }, + { + "epoch": 0.15954776857931297, + "grad_norm": 2.9047164916992188, + "learning_rate": 4.6924980336470314e-05, + "loss": 4.1555, + "step": 26827 + }, + { + "epoch": 0.15955371586259395, + "grad_norm": 2.678936719894409, + "learning_rate": 4.6924755895622076e-05, + "loss": 4.0008, + "step": 26828 + }, + { + "epoch": 0.15955966314587497, + "grad_norm": 2.4771978855133057, + "learning_rate": 4.692453144712014e-05, + "loss": 4.1707, + "step": 26829 + }, + { + "epoch": 0.15956561042915596, + "grad_norm": 2.1536855697631836, + "learning_rate": 4.6924306990964564e-05, + "loss": 4.1883, + "step": 26830 + }, + { + "epoch": 0.15957155771243695, + "grad_norm": 1.8136900663375854, + "learning_rate": 4.692408252715544e-05, + "loss": 4.8374, + "step": 26831 + }, + { + "epoch": 0.15957750499571796, + "grad_norm": 2.4778616428375244, + "learning_rate": 4.692385805569285e-05, + "loss": 3.9603, + "step": 26832 + }, + { + "epoch": 0.15958345227899895, + "grad_norm": 1.9646393060684204, + "learning_rate": 4.692363357657686e-05, + "loss": 4.2872, + "step": 26833 + }, + { + "epoch": 0.15958939956227994, + "grad_norm": 2.0261855125427246, + "learning_rate": 4.6923409089807566e-05, + "loss": 4.2673, + "step": 26834 + }, + { + "epoch": 0.15959534684556095, + "grad_norm": 2.361943244934082, + "learning_rate": 4.692318459538503e-05, + "loss": 3.9284, + "step": 26835 + }, + { + "epoch": 0.15960129412884194, + "grad_norm": 1.9567387104034424, + "learning_rate": 4.6922960093309334e-05, + "loss": 4.366, + "step": 26836 + }, + { + "epoch": 0.15960724141212293, + "grad_norm": 2.046351432800293, + "learning_rate": 4.692273558358057e-05, + "loss": 4.1074, + "step": 26837 + }, + { + "epoch": 0.15961318869540395, + "grad_norm": 1.9861648082733154, + "learning_rate": 4.6922511066198796e-05, + "loss": 4.1299, + "step": 26838 + }, + { + "epoch": 0.15961913597868493, + "grad_norm": 2.061688184738159, + "learning_rate": 4.692228654116411e-05, + "loss": 4.056, + "step": 26839 + }, + { + "epoch": 0.15962508326196592, + "grad_norm": 2.4299874305725098, + "learning_rate": 4.692206200847656e-05, + "loss": 3.8725, + "step": 26840 + }, + { + "epoch": 0.15963103054524694, + "grad_norm": 2.0996625423431396, + "learning_rate": 4.692183746813626e-05, + "loss": 3.9208, + "step": 26841 + }, + { + "epoch": 0.15963697782852793, + "grad_norm": 1.4910566806793213, + "learning_rate": 4.6921612920143276e-05, + "loss": 5.4869, + "step": 26842 + }, + { + "epoch": 0.15964292511180891, + "grad_norm": 2.304666042327881, + "learning_rate": 4.692138836449768e-05, + "loss": 4.3594, + "step": 26843 + }, + { + "epoch": 0.15964887239508993, + "grad_norm": 2.0998356342315674, + "learning_rate": 4.6921163801199553e-05, + "loss": 4.184, + "step": 26844 + }, + { + "epoch": 0.15965481967837092, + "grad_norm": 2.05517315864563, + "learning_rate": 4.692093923024897e-05, + "loss": 4.0709, + "step": 26845 + }, + { + "epoch": 0.1596607669616519, + "grad_norm": 1.7358988523483276, + "learning_rate": 4.692071465164601e-05, + "loss": 4.8628, + "step": 26846 + }, + { + "epoch": 0.15966671424493292, + "grad_norm": 2.173988103866577, + "learning_rate": 4.6920490065390766e-05, + "loss": 4.2944, + "step": 26847 + }, + { + "epoch": 0.1596726615282139, + "grad_norm": 1.41978919506073, + "learning_rate": 4.69202654714833e-05, + "loss": 4.9699, + "step": 26848 + }, + { + "epoch": 0.1596786088114949, + "grad_norm": 1.748255968093872, + "learning_rate": 4.6920040869923695e-05, + "loss": 3.9938, + "step": 26849 + }, + { + "epoch": 0.15968455609477591, + "grad_norm": 1.7858299016952515, + "learning_rate": 4.691981626071204e-05, + "loss": 4.7106, + "step": 26850 + }, + { + "epoch": 0.1596905033780569, + "grad_norm": 1.575324296951294, + "learning_rate": 4.691959164384839e-05, + "loss": 5.4768, + "step": 26851 + }, + { + "epoch": 0.1596964506613379, + "grad_norm": 1.383719801902771, + "learning_rate": 4.691936701933285e-05, + "loss": 5.154, + "step": 26852 + }, + { + "epoch": 0.1597023979446189, + "grad_norm": 1.559497356414795, + "learning_rate": 4.6919142387165476e-05, + "loss": 5.4081, + "step": 26853 + }, + { + "epoch": 0.1597083452278999, + "grad_norm": 2.3833580017089844, + "learning_rate": 4.691891774734636e-05, + "loss": 4.3001, + "step": 26854 + }, + { + "epoch": 0.15971429251118088, + "grad_norm": 1.5790619850158691, + "learning_rate": 4.6918693099875575e-05, + "loss": 5.1468, + "step": 26855 + }, + { + "epoch": 0.1597202397944619, + "grad_norm": 2.088935613632202, + "learning_rate": 4.69184684447532e-05, + "loss": 4.6097, + "step": 26856 + }, + { + "epoch": 0.1597261870777429, + "grad_norm": 1.7923367023468018, + "learning_rate": 4.691824378197931e-05, + "loss": 4.2733, + "step": 26857 + }, + { + "epoch": 0.15973213436102388, + "grad_norm": 1.583054780960083, + "learning_rate": 4.691801911155399e-05, + "loss": 4.7933, + "step": 26858 + }, + { + "epoch": 0.1597380816443049, + "grad_norm": 1.6564888954162598, + "learning_rate": 4.691779443347733e-05, + "loss": 4.6326, + "step": 26859 + }, + { + "epoch": 0.15974402892758588, + "grad_norm": 1.4905378818511963, + "learning_rate": 4.691756974774938e-05, + "loss": 4.8904, + "step": 26860 + }, + { + "epoch": 0.15974997621086687, + "grad_norm": 1.6564618349075317, + "learning_rate": 4.6917345054370234e-05, + "loss": 4.6245, + "step": 26861 + }, + { + "epoch": 0.15975592349414788, + "grad_norm": 1.262850284576416, + "learning_rate": 4.691712035333996e-05, + "loss": 5.584, + "step": 26862 + }, + { + "epoch": 0.15976187077742887, + "grad_norm": 1.54867684841156, + "learning_rate": 4.691689564465867e-05, + "loss": 5.543, + "step": 26863 + }, + { + "epoch": 0.15976781806070986, + "grad_norm": 1.470517635345459, + "learning_rate": 4.69166709283264e-05, + "loss": 5.5524, + "step": 26864 + }, + { + "epoch": 0.15977376534399088, + "grad_norm": 1.5773262977600098, + "learning_rate": 4.6916446204343245e-05, + "loss": 4.9904, + "step": 26865 + }, + { + "epoch": 0.15977971262727186, + "grad_norm": 1.5984915494918823, + "learning_rate": 4.6916221472709295e-05, + "loss": 4.7114, + "step": 26866 + }, + { + "epoch": 0.15978565991055285, + "grad_norm": 1.4829813241958618, + "learning_rate": 4.691599673342462e-05, + "loss": 4.9843, + "step": 26867 + }, + { + "epoch": 0.15979160719383387, + "grad_norm": 1.7312453985214233, + "learning_rate": 4.691577198648929e-05, + "loss": 4.2701, + "step": 26868 + }, + { + "epoch": 0.15979755447711486, + "grad_norm": 1.4807355403900146, + "learning_rate": 4.691554723190339e-05, + "loss": 4.7952, + "step": 26869 + }, + { + "epoch": 0.15980350176039584, + "grad_norm": 1.3604083061218262, + "learning_rate": 4.6915322469667e-05, + "loss": 5.1496, + "step": 26870 + }, + { + "epoch": 0.15980944904367686, + "grad_norm": 1.5444153547286987, + "learning_rate": 4.69150976997802e-05, + "loss": 5.791, + "step": 26871 + }, + { + "epoch": 0.15981539632695785, + "grad_norm": 1.617533564567566, + "learning_rate": 4.691487292224306e-05, + "loss": 5.5533, + "step": 26872 + }, + { + "epoch": 0.15982134361023884, + "grad_norm": 1.5946470499038696, + "learning_rate": 4.691464813705567e-05, + "loss": 5.5958, + "step": 26873 + }, + { + "epoch": 0.15982729089351985, + "grad_norm": 1.862707495689392, + "learning_rate": 4.691442334421809e-05, + "loss": 4.8171, + "step": 26874 + }, + { + "epoch": 0.15983323817680084, + "grad_norm": 1.355368971824646, + "learning_rate": 4.6914198543730425e-05, + "loss": 5.5431, + "step": 26875 + }, + { + "epoch": 0.15983918546008183, + "grad_norm": 1.4658385515213013, + "learning_rate": 4.6913973735592744e-05, + "loss": 5.3588, + "step": 26876 + }, + { + "epoch": 0.15984513274336284, + "grad_norm": 1.4573192596435547, + "learning_rate": 4.6913748919805115e-05, + "loss": 5.5454, + "step": 26877 + }, + { + "epoch": 0.15985108002664383, + "grad_norm": 1.495696783065796, + "learning_rate": 4.691352409636762e-05, + "loss": 5.5131, + "step": 26878 + }, + { + "epoch": 0.15985702730992482, + "grad_norm": 1.474161148071289, + "learning_rate": 4.691329926528034e-05, + "loss": 5.6235, + "step": 26879 + }, + { + "epoch": 0.15986297459320584, + "grad_norm": 1.5069948434829712, + "learning_rate": 4.6913074426543355e-05, + "loss": 5.3926, + "step": 26880 + }, + { + "epoch": 0.15986892187648682, + "grad_norm": 1.4088873863220215, + "learning_rate": 4.691284958015674e-05, + "loss": 5.2991, + "step": 26881 + }, + { + "epoch": 0.1598748691597678, + "grad_norm": 1.483222484588623, + "learning_rate": 4.691262472612058e-05, + "loss": 5.205, + "step": 26882 + }, + { + "epoch": 0.15988081644304883, + "grad_norm": 1.5325754880905151, + "learning_rate": 4.6912399864434953e-05, + "loss": 5.261, + "step": 26883 + }, + { + "epoch": 0.15988676372632982, + "grad_norm": 1.4159071445465088, + "learning_rate": 4.691217499509992e-05, + "loss": 5.2486, + "step": 26884 + }, + { + "epoch": 0.1598927110096108, + "grad_norm": 1.514702320098877, + "learning_rate": 4.6911950118115584e-05, + "loss": 5.332, + "step": 26885 + }, + { + "epoch": 0.1598986582928918, + "grad_norm": 1.757711410522461, + "learning_rate": 4.6911725233482005e-05, + "loss": 4.5752, + "step": 26886 + }, + { + "epoch": 0.1599046055761728, + "grad_norm": 1.6628808975219727, + "learning_rate": 4.691150034119928e-05, + "loss": 4.8776, + "step": 26887 + }, + { + "epoch": 0.1599105528594538, + "grad_norm": 1.6468075513839722, + "learning_rate": 4.691127544126746e-05, + "loss": 4.7613, + "step": 26888 + }, + { + "epoch": 0.15991650014273479, + "grad_norm": 1.603371262550354, + "learning_rate": 4.6911050533686656e-05, + "loss": 4.8145, + "step": 26889 + }, + { + "epoch": 0.1599224474260158, + "grad_norm": 1.4971832036972046, + "learning_rate": 4.6910825618456925e-05, + "loss": 5.5747, + "step": 26890 + }, + { + "epoch": 0.1599283947092968, + "grad_norm": 1.6911252737045288, + "learning_rate": 4.691060069557836e-05, + "loss": 5.5936, + "step": 26891 + }, + { + "epoch": 0.15993434199257778, + "grad_norm": 1.4903403520584106, + "learning_rate": 4.6910375765051016e-05, + "loss": 5.6195, + "step": 26892 + }, + { + "epoch": 0.1599402892758588, + "grad_norm": 1.8719216585159302, + "learning_rate": 4.6910150826874986e-05, + "loss": 4.818, + "step": 26893 + }, + { + "epoch": 0.15994623655913978, + "grad_norm": 1.7679294347763062, + "learning_rate": 4.690992588105036e-05, + "loss": 4.9175, + "step": 26894 + }, + { + "epoch": 0.15995218384242077, + "grad_norm": 1.8319326639175415, + "learning_rate": 4.69097009275772e-05, + "loss": 5.7222, + "step": 26895 + }, + { + "epoch": 0.15995813112570179, + "grad_norm": 1.6714746952056885, + "learning_rate": 4.690947596645559e-05, + "loss": 5.2146, + "step": 26896 + }, + { + "epoch": 0.15996407840898277, + "grad_norm": 1.6124671697616577, + "learning_rate": 4.690925099768561e-05, + "loss": 5.3234, + "step": 26897 + }, + { + "epoch": 0.15997002569226376, + "grad_norm": 1.546627402305603, + "learning_rate": 4.6909026021267336e-05, + "loss": 5.4278, + "step": 26898 + }, + { + "epoch": 0.15997597297554478, + "grad_norm": 1.492988109588623, + "learning_rate": 4.690880103720084e-05, + "loss": 5.5902, + "step": 26899 + }, + { + "epoch": 0.15998192025882577, + "grad_norm": 1.4887235164642334, + "learning_rate": 4.690857604548622e-05, + "loss": 5.5054, + "step": 26900 + }, + { + "epoch": 0.15998786754210675, + "grad_norm": 1.6349844932556152, + "learning_rate": 4.690835104612353e-05, + "loss": 5.4657, + "step": 26901 + }, + { + "epoch": 0.15999381482538777, + "grad_norm": 1.5228698253631592, + "learning_rate": 4.690812603911287e-05, + "loss": 5.3062, + "step": 26902 + }, + { + "epoch": 0.15999976210866876, + "grad_norm": 2.3719773292541504, + "learning_rate": 4.69079010244543e-05, + "loss": 4.3533, + "step": 26903 + }, + { + "epoch": 0.16000570939194975, + "grad_norm": 1.7740064859390259, + "learning_rate": 4.690767600214792e-05, + "loss": 4.8227, + "step": 26904 + }, + { + "epoch": 0.16001165667523076, + "grad_norm": 1.5493906736373901, + "learning_rate": 4.690745097219379e-05, + "loss": 5.2635, + "step": 26905 + }, + { + "epoch": 0.16001760395851175, + "grad_norm": 1.5318504571914673, + "learning_rate": 4.6907225934592e-05, + "loss": 5.1352, + "step": 26906 + }, + { + "epoch": 0.16002355124179274, + "grad_norm": 1.6286877393722534, + "learning_rate": 4.6907000889342626e-05, + "loss": 5.122, + "step": 26907 + }, + { + "epoch": 0.16002949852507375, + "grad_norm": 1.7091056108474731, + "learning_rate": 4.6906775836445735e-05, + "loss": 4.8629, + "step": 26908 + }, + { + "epoch": 0.16003544580835474, + "grad_norm": 1.8141852617263794, + "learning_rate": 4.6906550775901417e-05, + "loss": 5.0909, + "step": 26909 + }, + { + "epoch": 0.16004139309163573, + "grad_norm": 1.5500266551971436, + "learning_rate": 4.690632570770975e-05, + "loss": 5.3479, + "step": 26910 + }, + { + "epoch": 0.16004734037491675, + "grad_norm": 1.6703251600265503, + "learning_rate": 4.690610063187081e-05, + "loss": 5.264, + "step": 26911 + }, + { + "epoch": 0.16005328765819773, + "grad_norm": 1.2872283458709717, + "learning_rate": 4.690587554838468e-05, + "loss": 5.3643, + "step": 26912 + }, + { + "epoch": 0.16005923494147872, + "grad_norm": 1.456085443496704, + "learning_rate": 4.6905650457251435e-05, + "loss": 5.4866, + "step": 26913 + }, + { + "epoch": 0.16006518222475974, + "grad_norm": 1.560021996498108, + "learning_rate": 4.690542535847115e-05, + "loss": 5.3858, + "step": 26914 + }, + { + "epoch": 0.16007112950804073, + "grad_norm": 1.4462066888809204, + "learning_rate": 4.690520025204391e-05, + "loss": 5.2111, + "step": 26915 + }, + { + "epoch": 0.16007707679132172, + "grad_norm": 1.5655597448349, + "learning_rate": 4.6904975137969786e-05, + "loss": 5.2547, + "step": 26916 + }, + { + "epoch": 0.16008302407460273, + "grad_norm": 1.3707412481307983, + "learning_rate": 4.6904750016248865e-05, + "loss": 5.3997, + "step": 26917 + }, + { + "epoch": 0.16008897135788372, + "grad_norm": 1.7030435800552368, + "learning_rate": 4.690452488688123e-05, + "loss": 5.4115, + "step": 26918 + }, + { + "epoch": 0.1600949186411647, + "grad_norm": 1.4965012073516846, + "learning_rate": 4.690429974986694e-05, + "loss": 4.9977, + "step": 26919 + }, + { + "epoch": 0.16010086592444572, + "grad_norm": 1.3461761474609375, + "learning_rate": 4.69040746052061e-05, + "loss": 5.3629, + "step": 26920 + }, + { + "epoch": 0.1601068132077267, + "grad_norm": 1.3323198556900024, + "learning_rate": 4.690384945289875e-05, + "loss": 5.3162, + "step": 26921 + }, + { + "epoch": 0.1601127604910077, + "grad_norm": 1.6808300018310547, + "learning_rate": 4.690362429294501e-05, + "loss": 5.0513, + "step": 26922 + }, + { + "epoch": 0.16011870777428872, + "grad_norm": 1.659193515777588, + "learning_rate": 4.690339912534494e-05, + "loss": 5.2587, + "step": 26923 + }, + { + "epoch": 0.1601246550575697, + "grad_norm": 1.7092478275299072, + "learning_rate": 4.690317395009861e-05, + "loss": 5.1897, + "step": 26924 + }, + { + "epoch": 0.1601306023408507, + "grad_norm": 1.5868886709213257, + "learning_rate": 4.6902948767206115e-05, + "loss": 4.7132, + "step": 26925 + }, + { + "epoch": 0.1601365496241317, + "grad_norm": 1.584676742553711, + "learning_rate": 4.690272357666753e-05, + "loss": 4.8759, + "step": 26926 + }, + { + "epoch": 0.1601424969074127, + "grad_norm": 1.6470085382461548, + "learning_rate": 4.690249837848293e-05, + "loss": 4.9947, + "step": 26927 + }, + { + "epoch": 0.16014844419069368, + "grad_norm": 1.4562335014343262, + "learning_rate": 4.690227317265239e-05, + "loss": 5.1101, + "step": 26928 + }, + { + "epoch": 0.1601543914739747, + "grad_norm": 1.4088939428329468, + "learning_rate": 4.690204795917599e-05, + "loss": 5.3212, + "step": 26929 + }, + { + "epoch": 0.1601603387572557, + "grad_norm": 1.4988348484039307, + "learning_rate": 4.6901822738053816e-05, + "loss": 4.9456, + "step": 26930 + }, + { + "epoch": 0.16016628604053668, + "grad_norm": 1.608365535736084, + "learning_rate": 4.690159750928594e-05, + "loss": 5.082, + "step": 26931 + }, + { + "epoch": 0.1601722333238177, + "grad_norm": 1.5603444576263428, + "learning_rate": 4.6901372272872445e-05, + "loss": 5.4297, + "step": 26932 + }, + { + "epoch": 0.16017818060709868, + "grad_norm": 1.6907488107681274, + "learning_rate": 4.690114702881341e-05, + "loss": 4.9653, + "step": 26933 + }, + { + "epoch": 0.16018412789037967, + "grad_norm": 1.566992998123169, + "learning_rate": 4.69009217771089e-05, + "loss": 5.2261, + "step": 26934 + }, + { + "epoch": 0.16019007517366068, + "grad_norm": 1.4666292667388916, + "learning_rate": 4.690069651775901e-05, + "loss": 5.0251, + "step": 26935 + }, + { + "epoch": 0.16019602245694167, + "grad_norm": 1.5898406505584717, + "learning_rate": 4.690047125076382e-05, + "loss": 5.1041, + "step": 26936 + }, + { + "epoch": 0.16020196974022266, + "grad_norm": 1.3918042182922363, + "learning_rate": 4.6900245976123396e-05, + "loss": 5.3757, + "step": 26937 + }, + { + "epoch": 0.16020791702350368, + "grad_norm": 1.390620231628418, + "learning_rate": 4.690002069383782e-05, + "loss": 5.2667, + "step": 26938 + }, + { + "epoch": 0.16021386430678466, + "grad_norm": 1.4058221578598022, + "learning_rate": 4.6899795403907174e-05, + "loss": 5.8193, + "step": 26939 + }, + { + "epoch": 0.16021981159006565, + "grad_norm": 1.7895981073379517, + "learning_rate": 4.689957010633154e-05, + "loss": 4.9949, + "step": 26940 + }, + { + "epoch": 0.16022575887334667, + "grad_norm": 1.6591132879257202, + "learning_rate": 4.689934480111099e-05, + "loss": 5.0723, + "step": 26941 + }, + { + "epoch": 0.16023170615662766, + "grad_norm": 1.6578445434570312, + "learning_rate": 4.6899119488245605e-05, + "loss": 5.0636, + "step": 26942 + }, + { + "epoch": 0.16023765343990864, + "grad_norm": 1.7342137098312378, + "learning_rate": 4.6898894167735464e-05, + "loss": 4.9476, + "step": 26943 + }, + { + "epoch": 0.16024360072318963, + "grad_norm": 1.7774765491485596, + "learning_rate": 4.689866883958065e-05, + "loss": 5.04, + "step": 26944 + }, + { + "epoch": 0.16024954800647065, + "grad_norm": 1.519485354423523, + "learning_rate": 4.689844350378122e-05, + "loss": 5.353, + "step": 26945 + }, + { + "epoch": 0.16025549528975164, + "grad_norm": 1.7019078731536865, + "learning_rate": 4.6898218160337286e-05, + "loss": 5.2927, + "step": 26946 + }, + { + "epoch": 0.16026144257303263, + "grad_norm": 1.6364177465438843, + "learning_rate": 4.6897992809248903e-05, + "loss": 5.3286, + "step": 26947 + }, + { + "epoch": 0.16026738985631364, + "grad_norm": 1.5034300088882446, + "learning_rate": 4.6897767450516164e-05, + "loss": 5.1647, + "step": 26948 + }, + { + "epoch": 0.16027333713959463, + "grad_norm": 1.4327138662338257, + "learning_rate": 4.6897542084139135e-05, + "loss": 5.1381, + "step": 26949 + }, + { + "epoch": 0.16027928442287562, + "grad_norm": 1.666137456893921, + "learning_rate": 4.68973167101179e-05, + "loss": 4.7333, + "step": 26950 + }, + { + "epoch": 0.16028523170615663, + "grad_norm": 1.6748521327972412, + "learning_rate": 4.689709132845254e-05, + "loss": 4.8698, + "step": 26951 + }, + { + "epoch": 0.16029117898943762, + "grad_norm": 1.7348641157150269, + "learning_rate": 4.689686593914313e-05, + "loss": 5.0501, + "step": 26952 + }, + { + "epoch": 0.1602971262727186, + "grad_norm": 1.6517002582550049, + "learning_rate": 4.689664054218975e-05, + "loss": 4.9992, + "step": 26953 + }, + { + "epoch": 0.16030307355599963, + "grad_norm": 1.9717700481414795, + "learning_rate": 4.689641513759249e-05, + "loss": 4.6581, + "step": 26954 + }, + { + "epoch": 0.1603090208392806, + "grad_norm": 1.9283233880996704, + "learning_rate": 4.68961897253514e-05, + "loss": 4.1993, + "step": 26955 + }, + { + "epoch": 0.1603149681225616, + "grad_norm": 2.814549446105957, + "learning_rate": 4.689596430546659e-05, + "loss": 4.2436, + "step": 26956 + }, + { + "epoch": 0.16032091540584262, + "grad_norm": 1.8716390132904053, + "learning_rate": 4.689573887793811e-05, + "loss": 4.7558, + "step": 26957 + }, + { + "epoch": 0.1603268626891236, + "grad_norm": 1.5305246114730835, + "learning_rate": 4.689551344276607e-05, + "loss": 5.0986, + "step": 26958 + }, + { + "epoch": 0.1603328099724046, + "grad_norm": 1.7304683923721313, + "learning_rate": 4.689528799995052e-05, + "loss": 4.8627, + "step": 26959 + }, + { + "epoch": 0.1603387572556856, + "grad_norm": 1.6693211793899536, + "learning_rate": 4.6895062549491564e-05, + "loss": 4.6759, + "step": 26960 + }, + { + "epoch": 0.1603447045389666, + "grad_norm": 1.6889755725860596, + "learning_rate": 4.6894837091389256e-05, + "loss": 4.6676, + "step": 26961 + }, + { + "epoch": 0.1603506518222476, + "grad_norm": 1.7085540294647217, + "learning_rate": 4.6894611625643695e-05, + "loss": 5.2494, + "step": 26962 + }, + { + "epoch": 0.1603565991055286, + "grad_norm": 1.7167129516601562, + "learning_rate": 4.689438615225494e-05, + "loss": 4.7013, + "step": 26963 + }, + { + "epoch": 0.1603625463888096, + "grad_norm": 1.6896833181381226, + "learning_rate": 4.689416067122309e-05, + "loss": 5.0363, + "step": 26964 + }, + { + "epoch": 0.16036849367209058, + "grad_norm": 1.4529087543487549, + "learning_rate": 4.6893935182548215e-05, + "loss": 5.2665, + "step": 26965 + }, + { + "epoch": 0.1603744409553716, + "grad_norm": 1.630214810371399, + "learning_rate": 4.689370968623039e-05, + "loss": 5.3018, + "step": 26966 + }, + { + "epoch": 0.16038038823865258, + "grad_norm": 1.4638413190841675, + "learning_rate": 4.6893484182269697e-05, + "loss": 5.4105, + "step": 26967 + }, + { + "epoch": 0.16038633552193357, + "grad_norm": 1.7969051599502563, + "learning_rate": 4.689325867066622e-05, + "loss": 5.3511, + "step": 26968 + }, + { + "epoch": 0.1603922828052146, + "grad_norm": 1.65691339969635, + "learning_rate": 4.689303315142003e-05, + "loss": 5.158, + "step": 26969 + }, + { + "epoch": 0.16039823008849557, + "grad_norm": 1.391390085220337, + "learning_rate": 4.689280762453121e-05, + "loss": 5.2721, + "step": 26970 + }, + { + "epoch": 0.16040417737177656, + "grad_norm": 1.699019193649292, + "learning_rate": 4.689258208999983e-05, + "loss": 5.0995, + "step": 26971 + }, + { + "epoch": 0.16041012465505758, + "grad_norm": 1.7829947471618652, + "learning_rate": 4.6892356547825984e-05, + "loss": 4.9086, + "step": 26972 + }, + { + "epoch": 0.16041607193833857, + "grad_norm": 1.7381236553192139, + "learning_rate": 4.689213099800974e-05, + "loss": 4.9298, + "step": 26973 + }, + { + "epoch": 0.16042201922161956, + "grad_norm": 1.273488998413086, + "learning_rate": 4.689190544055118e-05, + "loss": 5.1877, + "step": 26974 + }, + { + "epoch": 0.16042796650490057, + "grad_norm": 1.5737167596817017, + "learning_rate": 4.689167987545038e-05, + "loss": 5.229, + "step": 26975 + }, + { + "epoch": 0.16043391378818156, + "grad_norm": 1.4660385847091675, + "learning_rate": 4.6891454302707414e-05, + "loss": 5.3256, + "step": 26976 + }, + { + "epoch": 0.16043986107146255, + "grad_norm": 1.7380048036575317, + "learning_rate": 4.6891228722322375e-05, + "loss": 4.3369, + "step": 26977 + }, + { + "epoch": 0.16044580835474356, + "grad_norm": 1.686514139175415, + "learning_rate": 4.6891003134295336e-05, + "loss": 4.9901, + "step": 26978 + }, + { + "epoch": 0.16045175563802455, + "grad_norm": 1.8255095481872559, + "learning_rate": 4.689077753862637e-05, + "loss": 4.7844, + "step": 26979 + }, + { + "epoch": 0.16045770292130554, + "grad_norm": 1.7652206420898438, + "learning_rate": 4.689055193531556e-05, + "loss": 5.2592, + "step": 26980 + }, + { + "epoch": 0.16046365020458656, + "grad_norm": 2.122629165649414, + "learning_rate": 4.6890326324362985e-05, + "loss": 4.9435, + "step": 26981 + }, + { + "epoch": 0.16046959748786754, + "grad_norm": 2.0414109230041504, + "learning_rate": 4.689010070576872e-05, + "loss": 4.8267, + "step": 26982 + }, + { + "epoch": 0.16047554477114853, + "grad_norm": 1.8635056018829346, + "learning_rate": 4.6889875079532855e-05, + "loss": 5.0768, + "step": 26983 + }, + { + "epoch": 0.16048149205442955, + "grad_norm": 1.649129033088684, + "learning_rate": 4.688964944565546e-05, + "loss": 5.1536, + "step": 26984 + }, + { + "epoch": 0.16048743933771054, + "grad_norm": 1.6211038827896118, + "learning_rate": 4.688942380413661e-05, + "loss": 5.1866, + "step": 26985 + }, + { + "epoch": 0.16049338662099152, + "grad_norm": 1.862961769104004, + "learning_rate": 4.6889198154976387e-05, + "loss": 4.9439, + "step": 26986 + }, + { + "epoch": 0.16049933390427254, + "grad_norm": 2.02945613861084, + "learning_rate": 4.6888972498174874e-05, + "loss": 4.8791, + "step": 26987 + }, + { + "epoch": 0.16050528118755353, + "grad_norm": 2.434349536895752, + "learning_rate": 4.688874683373215e-05, + "loss": 4.9336, + "step": 26988 + }, + { + "epoch": 0.16051122847083452, + "grad_norm": 1.6819970607757568, + "learning_rate": 4.6888521161648284e-05, + "loss": 4.9917, + "step": 26989 + }, + { + "epoch": 0.16051717575411553, + "grad_norm": 1.7764739990234375, + "learning_rate": 4.688829548192337e-05, + "loss": 5.274, + "step": 26990 + }, + { + "epoch": 0.16052312303739652, + "grad_norm": 1.4962623119354248, + "learning_rate": 4.6888069794557465e-05, + "loss": 5.0699, + "step": 26991 + }, + { + "epoch": 0.1605290703206775, + "grad_norm": 1.7750627994537354, + "learning_rate": 4.688784409955067e-05, + "loss": 4.9197, + "step": 26992 + }, + { + "epoch": 0.16053501760395852, + "grad_norm": 1.7030991315841675, + "learning_rate": 4.6887618396903055e-05, + "loss": 5.1113, + "step": 26993 + }, + { + "epoch": 0.1605409648872395, + "grad_norm": 1.7158962488174438, + "learning_rate": 4.68873926866147e-05, + "loss": 5.2175, + "step": 26994 + }, + { + "epoch": 0.1605469121705205, + "grad_norm": 1.5792635679244995, + "learning_rate": 4.6887166968685684e-05, + "loss": 5.2031, + "step": 26995 + }, + { + "epoch": 0.16055285945380152, + "grad_norm": 1.6441086530685425, + "learning_rate": 4.688694124311607e-05, + "loss": 4.669, + "step": 26996 + }, + { + "epoch": 0.1605588067370825, + "grad_norm": 1.4879902601242065, + "learning_rate": 4.688671550990597e-05, + "loss": 5.2163, + "step": 26997 + }, + { + "epoch": 0.1605647540203635, + "grad_norm": 1.7525761127471924, + "learning_rate": 4.688648976905543e-05, + "loss": 4.6094, + "step": 26998 + }, + { + "epoch": 0.1605707013036445, + "grad_norm": 1.500331163406372, + "learning_rate": 4.6886264020564544e-05, + "loss": 5.0793, + "step": 26999 + }, + { + "epoch": 0.1605766485869255, + "grad_norm": 1.505900502204895, + "learning_rate": 4.688603826443339e-05, + "loss": 4.9562, + "step": 27000 + }, + { + "epoch": 0.16058259587020648, + "grad_norm": 1.558977723121643, + "learning_rate": 4.688581250066205e-05, + "loss": 4.8143, + "step": 27001 + }, + { + "epoch": 0.1605885431534875, + "grad_norm": 1.4914512634277344, + "learning_rate": 4.6885586729250596e-05, + "loss": 4.624, + "step": 27002 + }, + { + "epoch": 0.1605944904367685, + "grad_norm": 1.482251763343811, + "learning_rate": 4.688536095019911e-05, + "loss": 4.87, + "step": 27003 + }, + { + "epoch": 0.16060043772004948, + "grad_norm": 1.4962702989578247, + "learning_rate": 4.688513516350767e-05, + "loss": 5.1775, + "step": 27004 + }, + { + "epoch": 0.16060638500333047, + "grad_norm": 1.71797513961792, + "learning_rate": 4.688490936917636e-05, + "loss": 5.3413, + "step": 27005 + }, + { + "epoch": 0.16061233228661148, + "grad_norm": 1.5410555601119995, + "learning_rate": 4.688468356720525e-05, + "loss": 5.399, + "step": 27006 + }, + { + "epoch": 0.16061827956989247, + "grad_norm": 1.597773551940918, + "learning_rate": 4.6884457757594424e-05, + "loss": 5.4056, + "step": 27007 + }, + { + "epoch": 0.16062422685317346, + "grad_norm": 1.3013349771499634, + "learning_rate": 4.688423194034396e-05, + "loss": 5.6953, + "step": 27008 + }, + { + "epoch": 0.16063017413645447, + "grad_norm": 1.557054877281189, + "learning_rate": 4.6884006115453935e-05, + "loss": 5.078, + "step": 27009 + }, + { + "epoch": 0.16063612141973546, + "grad_norm": 1.5944912433624268, + "learning_rate": 4.688378028292443e-05, + "loss": 5.0212, + "step": 27010 + }, + { + "epoch": 0.16064206870301645, + "grad_norm": 1.45020592212677, + "learning_rate": 4.6883554442755526e-05, + "loss": 4.9653, + "step": 27011 + }, + { + "epoch": 0.16064801598629747, + "grad_norm": 1.7178733348846436, + "learning_rate": 4.68833285949473e-05, + "loss": 5.2027, + "step": 27012 + }, + { + "epoch": 0.16065396326957845, + "grad_norm": 1.574744462966919, + "learning_rate": 4.688310273949983e-05, + "loss": 5.3929, + "step": 27013 + }, + { + "epoch": 0.16065991055285944, + "grad_norm": 1.511526107788086, + "learning_rate": 4.688287687641319e-05, + "loss": 4.9275, + "step": 27014 + }, + { + "epoch": 0.16066585783614046, + "grad_norm": 1.5261460542678833, + "learning_rate": 4.688265100568747e-05, + "loss": 5.193, + "step": 27015 + }, + { + "epoch": 0.16067180511942145, + "grad_norm": 1.3765456676483154, + "learning_rate": 4.688242512732274e-05, + "loss": 5.006, + "step": 27016 + }, + { + "epoch": 0.16067775240270243, + "grad_norm": 1.4258984327316284, + "learning_rate": 4.688219924131908e-05, + "loss": 5.0301, + "step": 27017 + }, + { + "epoch": 0.16068369968598345, + "grad_norm": 1.6083779335021973, + "learning_rate": 4.688197334767657e-05, + "loss": 5.0202, + "step": 27018 + }, + { + "epoch": 0.16068964696926444, + "grad_norm": 1.3578145503997803, + "learning_rate": 4.6881747446395285e-05, + "loss": 5.0357, + "step": 27019 + }, + { + "epoch": 0.16069559425254543, + "grad_norm": 1.5515062808990479, + "learning_rate": 4.6881521537475316e-05, + "loss": 4.7463, + "step": 27020 + }, + { + "epoch": 0.16070154153582644, + "grad_norm": 1.5254274606704712, + "learning_rate": 4.688129562091673e-05, + "loss": 5.0846, + "step": 27021 + }, + { + "epoch": 0.16070748881910743, + "grad_norm": 1.6628260612487793, + "learning_rate": 4.6881069696719615e-05, + "loss": 4.7732, + "step": 27022 + }, + { + "epoch": 0.16071343610238842, + "grad_norm": 1.5955768823623657, + "learning_rate": 4.6880843764884044e-05, + "loss": 4.7582, + "step": 27023 + }, + { + "epoch": 0.16071938338566943, + "grad_norm": 1.4915939569473267, + "learning_rate": 4.6880617825410086e-05, + "loss": 4.7503, + "step": 27024 + }, + { + "epoch": 0.16072533066895042, + "grad_norm": 1.6703109741210938, + "learning_rate": 4.6880391878297836e-05, + "loss": 4.393, + "step": 27025 + }, + { + "epoch": 0.1607312779522314, + "grad_norm": 1.6725270748138428, + "learning_rate": 4.688016592354737e-05, + "loss": 5.2538, + "step": 27026 + }, + { + "epoch": 0.16073722523551243, + "grad_norm": 1.820046305656433, + "learning_rate": 4.687993996115876e-05, + "loss": 4.7337, + "step": 27027 + }, + { + "epoch": 0.16074317251879341, + "grad_norm": 1.7842957973480225, + "learning_rate": 4.6879713991132096e-05, + "loss": 4.8615, + "step": 27028 + }, + { + "epoch": 0.1607491198020744, + "grad_norm": 1.9226150512695312, + "learning_rate": 4.687948801346745e-05, + "loss": 4.3828, + "step": 27029 + }, + { + "epoch": 0.16075506708535542, + "grad_norm": 1.3625149726867676, + "learning_rate": 4.6879262028164895e-05, + "loss": 4.962, + "step": 27030 + }, + { + "epoch": 0.1607610143686364, + "grad_norm": 1.6589162349700928, + "learning_rate": 4.687903603522452e-05, + "loss": 4.373, + "step": 27031 + }, + { + "epoch": 0.1607669616519174, + "grad_norm": 1.5190513134002686, + "learning_rate": 4.6878810034646395e-05, + "loss": 5.3889, + "step": 27032 + }, + { + "epoch": 0.1607729089351984, + "grad_norm": 1.4899837970733643, + "learning_rate": 4.6878584026430604e-05, + "loss": 4.6972, + "step": 27033 + }, + { + "epoch": 0.1607788562184794, + "grad_norm": 1.7779310941696167, + "learning_rate": 4.6878358010577226e-05, + "loss": 5.0265, + "step": 27034 + }, + { + "epoch": 0.1607848035017604, + "grad_norm": 1.7755082845687866, + "learning_rate": 4.687813198708634e-05, + "loss": 4.7129, + "step": 27035 + }, + { + "epoch": 0.1607907507850414, + "grad_norm": 1.986676573753357, + "learning_rate": 4.6877905955958024e-05, + "loss": 4.5315, + "step": 27036 + }, + { + "epoch": 0.1607966980683224, + "grad_norm": 1.727644443511963, + "learning_rate": 4.687767991719235e-05, + "loss": 4.5498, + "step": 27037 + }, + { + "epoch": 0.16080264535160338, + "grad_norm": 1.936285138130188, + "learning_rate": 4.687745387078942e-05, + "loss": 4.2741, + "step": 27038 + }, + { + "epoch": 0.1608085926348844, + "grad_norm": 1.7781955003738403, + "learning_rate": 4.687722781674928e-05, + "loss": 5.0867, + "step": 27039 + }, + { + "epoch": 0.16081453991816538, + "grad_norm": 1.7659040689468384, + "learning_rate": 4.687700175507204e-05, + "loss": 5.2197, + "step": 27040 + }, + { + "epoch": 0.16082048720144637, + "grad_norm": 1.8074475526809692, + "learning_rate": 4.6876775685757755e-05, + "loss": 4.8669, + "step": 27041 + }, + { + "epoch": 0.1608264344847274, + "grad_norm": 1.8640440702438354, + "learning_rate": 4.687654960880652e-05, + "loss": 4.2379, + "step": 27042 + }, + { + "epoch": 0.16083238176800838, + "grad_norm": 2.278597831726074, + "learning_rate": 4.6876323524218405e-05, + "loss": 4.4334, + "step": 27043 + }, + { + "epoch": 0.16083832905128936, + "grad_norm": 1.7002289295196533, + "learning_rate": 4.6876097431993486e-05, + "loss": 4.9251, + "step": 27044 + }, + { + "epoch": 0.16084427633457038, + "grad_norm": 1.626347303390503, + "learning_rate": 4.687587133213186e-05, + "loss": 5.3526, + "step": 27045 + }, + { + "epoch": 0.16085022361785137, + "grad_norm": 1.6184710264205933, + "learning_rate": 4.687564522463358e-05, + "loss": 4.9963, + "step": 27046 + }, + { + "epoch": 0.16085617090113236, + "grad_norm": 1.9560445547103882, + "learning_rate": 4.687541910949874e-05, + "loss": 4.3859, + "step": 27047 + }, + { + "epoch": 0.16086211818441337, + "grad_norm": 1.8181273937225342, + "learning_rate": 4.687519298672743e-05, + "loss": 4.7349, + "step": 27048 + }, + { + "epoch": 0.16086806546769436, + "grad_norm": 1.76878023147583, + "learning_rate": 4.68749668563197e-05, + "loss": 4.6734, + "step": 27049 + }, + { + "epoch": 0.16087401275097535, + "grad_norm": 1.6105148792266846, + "learning_rate": 4.6874740718275655e-05, + "loss": 4.7374, + "step": 27050 + }, + { + "epoch": 0.16087996003425636, + "grad_norm": 1.7216439247131348, + "learning_rate": 4.687451457259536e-05, + "loss": 4.7108, + "step": 27051 + }, + { + "epoch": 0.16088590731753735, + "grad_norm": 1.591200828552246, + "learning_rate": 4.68742884192789e-05, + "loss": 4.8113, + "step": 27052 + }, + { + "epoch": 0.16089185460081834, + "grad_norm": 1.8275965452194214, + "learning_rate": 4.687406225832635e-05, + "loss": 4.765, + "step": 27053 + }, + { + "epoch": 0.16089780188409936, + "grad_norm": 1.796170949935913, + "learning_rate": 4.68738360897378e-05, + "loss": 4.5656, + "step": 27054 + }, + { + "epoch": 0.16090374916738034, + "grad_norm": 1.6721670627593994, + "learning_rate": 4.6873609913513307e-05, + "loss": 4.7761, + "step": 27055 + }, + { + "epoch": 0.16090969645066133, + "grad_norm": 1.577500820159912, + "learning_rate": 4.687338372965296e-05, + "loss": 4.6552, + "step": 27056 + }, + { + "epoch": 0.16091564373394235, + "grad_norm": 1.4649289846420288, + "learning_rate": 4.687315753815685e-05, + "loss": 4.7041, + "step": 27057 + }, + { + "epoch": 0.16092159101722334, + "grad_norm": 1.5088578462600708, + "learning_rate": 4.687293133902505e-05, + "loss": 4.9058, + "step": 27058 + }, + { + "epoch": 0.16092753830050432, + "grad_norm": 1.5987037420272827, + "learning_rate": 4.687270513225763e-05, + "loss": 4.6935, + "step": 27059 + }, + { + "epoch": 0.16093348558378534, + "grad_norm": 1.6780216693878174, + "learning_rate": 4.687247891785468e-05, + "loss": 4.6561, + "step": 27060 + }, + { + "epoch": 0.16093943286706633, + "grad_norm": 1.678200125694275, + "learning_rate": 4.6872252695816265e-05, + "loss": 4.7769, + "step": 27061 + }, + { + "epoch": 0.16094538015034732, + "grad_norm": 1.7499932050704956, + "learning_rate": 4.687202646614248e-05, + "loss": 4.8831, + "step": 27062 + }, + { + "epoch": 0.1609513274336283, + "grad_norm": 1.5174812078475952, + "learning_rate": 4.687180022883339e-05, + "loss": 5.3915, + "step": 27063 + }, + { + "epoch": 0.16095727471690932, + "grad_norm": 1.6853543519973755, + "learning_rate": 4.6871573983889084e-05, + "loss": 5.0194, + "step": 27064 + }, + { + "epoch": 0.1609632220001903, + "grad_norm": 1.590044379234314, + "learning_rate": 4.6871347731309634e-05, + "loss": 4.8239, + "step": 27065 + }, + { + "epoch": 0.1609691692834713, + "grad_norm": 1.6128438711166382, + "learning_rate": 4.6871121471095124e-05, + "loss": 4.418, + "step": 27066 + }, + { + "epoch": 0.1609751165667523, + "grad_norm": 1.5933514833450317, + "learning_rate": 4.6870895203245635e-05, + "loss": 4.5319, + "step": 27067 + }, + { + "epoch": 0.1609810638500333, + "grad_norm": 2.0290753841400146, + "learning_rate": 4.687066892776124e-05, + "loss": 4.2566, + "step": 27068 + }, + { + "epoch": 0.1609870111333143, + "grad_norm": 1.7339308261871338, + "learning_rate": 4.687044264464202e-05, + "loss": 4.7884, + "step": 27069 + }, + { + "epoch": 0.1609929584165953, + "grad_norm": 1.3594622611999512, + "learning_rate": 4.6870216353888056e-05, + "loss": 5.2241, + "step": 27070 + }, + { + "epoch": 0.1609989056998763, + "grad_norm": 1.599043607711792, + "learning_rate": 4.6869990055499424e-05, + "loss": 4.7043, + "step": 27071 + }, + { + "epoch": 0.16100485298315728, + "grad_norm": 1.6405742168426514, + "learning_rate": 4.686976374947621e-05, + "loss": 4.7731, + "step": 27072 + }, + { + "epoch": 0.1610108002664383, + "grad_norm": 1.544199824333191, + "learning_rate": 4.686953743581848e-05, + "loss": 4.3322, + "step": 27073 + }, + { + "epoch": 0.16101674754971929, + "grad_norm": 1.5622215270996094, + "learning_rate": 4.686931111452633e-05, + "loss": 4.4059, + "step": 27074 + }, + { + "epoch": 0.16102269483300027, + "grad_norm": 1.472733497619629, + "learning_rate": 4.6869084785599814e-05, + "loss": 4.5119, + "step": 27075 + }, + { + "epoch": 0.1610286421162813, + "grad_norm": 1.6917856931686401, + "learning_rate": 4.686885844903904e-05, + "loss": 4.4056, + "step": 27076 + }, + { + "epoch": 0.16103458939956228, + "grad_norm": 1.67365300655365, + "learning_rate": 4.6868632104844066e-05, + "loss": 4.6975, + "step": 27077 + }, + { + "epoch": 0.16104053668284327, + "grad_norm": 1.7588708400726318, + "learning_rate": 4.6868405753014974e-05, + "loss": 4.5234, + "step": 27078 + }, + { + "epoch": 0.16104648396612428, + "grad_norm": 1.703722596168518, + "learning_rate": 4.686817939355186e-05, + "loss": 4.8189, + "step": 27079 + }, + { + "epoch": 0.16105243124940527, + "grad_norm": 1.9225337505340576, + "learning_rate": 4.686795302645478e-05, + "loss": 4.6807, + "step": 27080 + }, + { + "epoch": 0.16105837853268626, + "grad_norm": 1.9755665063858032, + "learning_rate": 4.686772665172383e-05, + "loss": 4.6981, + "step": 27081 + }, + { + "epoch": 0.16106432581596727, + "grad_norm": 1.8112698793411255, + "learning_rate": 4.6867500269359084e-05, + "loss": 4.6576, + "step": 27082 + }, + { + "epoch": 0.16107027309924826, + "grad_norm": 1.5739562511444092, + "learning_rate": 4.686727387936062e-05, + "loss": 4.8203, + "step": 27083 + }, + { + "epoch": 0.16107622038252925, + "grad_norm": 1.6816823482513428, + "learning_rate": 4.686704748172851e-05, + "loss": 4.9051, + "step": 27084 + }, + { + "epoch": 0.16108216766581027, + "grad_norm": 1.9315879344940186, + "learning_rate": 4.6866821076462844e-05, + "loss": 4.9205, + "step": 27085 + }, + { + "epoch": 0.16108811494909125, + "grad_norm": 1.9262312650680542, + "learning_rate": 4.686659466356369e-05, + "loss": 4.8491, + "step": 27086 + }, + { + "epoch": 0.16109406223237224, + "grad_norm": 2.244142532348633, + "learning_rate": 4.686636824303114e-05, + "loss": 4.1662, + "step": 27087 + }, + { + "epoch": 0.16110000951565326, + "grad_norm": 1.8732181787490845, + "learning_rate": 4.6866141814865266e-05, + "loss": 4.6906, + "step": 27088 + }, + { + "epoch": 0.16110595679893425, + "grad_norm": 1.7964503765106201, + "learning_rate": 4.686591537906615e-05, + "loss": 4.8282, + "step": 27089 + }, + { + "epoch": 0.16111190408221523, + "grad_norm": 1.828946828842163, + "learning_rate": 4.686568893563387e-05, + "loss": 4.6226, + "step": 27090 + }, + { + "epoch": 0.16111785136549625, + "grad_norm": 1.6230894327163696, + "learning_rate": 4.68654624845685e-05, + "loss": 4.9008, + "step": 27091 + }, + { + "epoch": 0.16112379864877724, + "grad_norm": 1.7094733715057373, + "learning_rate": 4.686523602587012e-05, + "loss": 4.4854, + "step": 27092 + }, + { + "epoch": 0.16112974593205823, + "grad_norm": 1.5419751405715942, + "learning_rate": 4.6865009559538815e-05, + "loss": 4.7452, + "step": 27093 + }, + { + "epoch": 0.16113569321533924, + "grad_norm": 1.7994260787963867, + "learning_rate": 4.686478308557466e-05, + "loss": 4.798, + "step": 27094 + }, + { + "epoch": 0.16114164049862023, + "grad_norm": 1.5732755661010742, + "learning_rate": 4.6864556603977736e-05, + "loss": 5.0714, + "step": 27095 + }, + { + "epoch": 0.16114758778190122, + "grad_norm": 1.7569549083709717, + "learning_rate": 4.686433011474812e-05, + "loss": 5.1888, + "step": 27096 + }, + { + "epoch": 0.16115353506518224, + "grad_norm": 1.5478622913360596, + "learning_rate": 4.6864103617885895e-05, + "loss": 5.1684, + "step": 27097 + }, + { + "epoch": 0.16115948234846322, + "grad_norm": 1.80837082862854, + "learning_rate": 4.6863877113391136e-05, + "loss": 5.0916, + "step": 27098 + }, + { + "epoch": 0.1611654296317442, + "grad_norm": 1.6820951700210571, + "learning_rate": 4.686365060126392e-05, + "loss": 5.0685, + "step": 27099 + }, + { + "epoch": 0.16117137691502523, + "grad_norm": 1.6210129261016846, + "learning_rate": 4.686342408150434e-05, + "loss": 4.591, + "step": 27100 + }, + { + "epoch": 0.16117732419830622, + "grad_norm": 1.7377861738204956, + "learning_rate": 4.6863197554112455e-05, + "loss": 4.7656, + "step": 27101 + }, + { + "epoch": 0.1611832714815872, + "grad_norm": 1.5875985622406006, + "learning_rate": 4.686297101908835e-05, + "loss": 5.003, + "step": 27102 + }, + { + "epoch": 0.16118921876486822, + "grad_norm": 1.6775810718536377, + "learning_rate": 4.686274447643212e-05, + "loss": 5.269, + "step": 27103 + }, + { + "epoch": 0.1611951660481492, + "grad_norm": 1.7519687414169312, + "learning_rate": 4.6862517926143826e-05, + "loss": 5.3185, + "step": 27104 + }, + { + "epoch": 0.1612011133314302, + "grad_norm": 1.6947530508041382, + "learning_rate": 4.6862291368223554e-05, + "loss": 5.0105, + "step": 27105 + }, + { + "epoch": 0.1612070606147112, + "grad_norm": 1.6445891857147217, + "learning_rate": 4.686206480267138e-05, + "loss": 4.6697, + "step": 27106 + }, + { + "epoch": 0.1612130078979922, + "grad_norm": 1.7407753467559814, + "learning_rate": 4.6861838229487385e-05, + "loss": 4.6508, + "step": 27107 + }, + { + "epoch": 0.1612189551812732, + "grad_norm": 1.7013847827911377, + "learning_rate": 4.686161164867164e-05, + "loss": 4.6613, + "step": 27108 + }, + { + "epoch": 0.1612249024645542, + "grad_norm": 1.5500074625015259, + "learning_rate": 4.686138506022425e-05, + "loss": 4.5501, + "step": 27109 + }, + { + "epoch": 0.1612308497478352, + "grad_norm": 1.7138715982437134, + "learning_rate": 4.686115846414526e-05, + "loss": 5.1747, + "step": 27110 + }, + { + "epoch": 0.16123679703111618, + "grad_norm": 1.6952149868011475, + "learning_rate": 4.686093186043478e-05, + "loss": 5.6011, + "step": 27111 + }, + { + "epoch": 0.1612427443143972, + "grad_norm": 1.4229787588119507, + "learning_rate": 4.6860705249092864e-05, + "loss": 5.2581, + "step": 27112 + }, + { + "epoch": 0.16124869159767818, + "grad_norm": 1.5605623722076416, + "learning_rate": 4.68604786301196e-05, + "loss": 4.8483, + "step": 27113 + }, + { + "epoch": 0.16125463888095917, + "grad_norm": 1.7442682981491089, + "learning_rate": 4.686025200351508e-05, + "loss": 5.1217, + "step": 27114 + }, + { + "epoch": 0.1612605861642402, + "grad_norm": 1.8555563688278198, + "learning_rate": 4.6860025369279365e-05, + "loss": 4.8616, + "step": 27115 + }, + { + "epoch": 0.16126653344752118, + "grad_norm": 1.525015115737915, + "learning_rate": 4.685979872741254e-05, + "loss": 5.5315, + "step": 27116 + }, + { + "epoch": 0.16127248073080216, + "grad_norm": 1.656496524810791, + "learning_rate": 4.685957207791468e-05, + "loss": 5.081, + "step": 27117 + }, + { + "epoch": 0.16127842801408318, + "grad_norm": 1.717789649963379, + "learning_rate": 4.685934542078588e-05, + "loss": 5.0375, + "step": 27118 + }, + { + "epoch": 0.16128437529736417, + "grad_norm": 1.4504932165145874, + "learning_rate": 4.6859118756026205e-05, + "loss": 5.5946, + "step": 27119 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 1.7576172351837158, + "learning_rate": 4.685889208363573e-05, + "loss": 5.2869, + "step": 27120 + }, + { + "epoch": 0.16129626986392614, + "grad_norm": 1.7422624826431274, + "learning_rate": 4.685866540361456e-05, + "loss": 5.5119, + "step": 27121 + }, + { + "epoch": 0.16130221714720716, + "grad_norm": 1.8503597974777222, + "learning_rate": 4.685843871596274e-05, + "loss": 5.2748, + "step": 27122 + }, + { + "epoch": 0.16130816443048815, + "grad_norm": 1.4682457447052002, + "learning_rate": 4.685821202068037e-05, + "loss": 5.1808, + "step": 27123 + }, + { + "epoch": 0.16131411171376914, + "grad_norm": 1.6852977275848389, + "learning_rate": 4.685798531776752e-05, + "loss": 5.0024, + "step": 27124 + }, + { + "epoch": 0.16132005899705015, + "grad_norm": 1.3914788961410522, + "learning_rate": 4.6857758607224275e-05, + "loss": 5.6072, + "step": 27125 + }, + { + "epoch": 0.16132600628033114, + "grad_norm": 1.3304249048233032, + "learning_rate": 4.6857531889050716e-05, + "loss": 5.6519, + "step": 27126 + }, + { + "epoch": 0.16133195356361213, + "grad_norm": 1.4981189966201782, + "learning_rate": 4.6857305163246915e-05, + "loss": 5.377, + "step": 27127 + }, + { + "epoch": 0.16133790084689315, + "grad_norm": 1.6323606967926025, + "learning_rate": 4.685707842981295e-05, + "loss": 5.3525, + "step": 27128 + }, + { + "epoch": 0.16134384813017413, + "grad_norm": 1.7571280002593994, + "learning_rate": 4.685685168874892e-05, + "loss": 5.7243, + "step": 27129 + }, + { + "epoch": 0.16134979541345512, + "grad_norm": 1.4080052375793457, + "learning_rate": 4.685662494005487e-05, + "loss": 5.368, + "step": 27130 + }, + { + "epoch": 0.16135574269673614, + "grad_norm": 1.3173414468765259, + "learning_rate": 4.685639818373091e-05, + "loss": 5.6447, + "step": 27131 + }, + { + "epoch": 0.16136168998001713, + "grad_norm": 1.6236382722854614, + "learning_rate": 4.685617141977711e-05, + "loss": 5.4868, + "step": 27132 + }, + { + "epoch": 0.1613676372632981, + "grad_norm": 1.4955110549926758, + "learning_rate": 4.6855944648193535e-05, + "loss": 5.6484, + "step": 27133 + }, + { + "epoch": 0.16137358454657913, + "grad_norm": 1.408130407333374, + "learning_rate": 4.685571786898028e-05, + "loss": 5.4925, + "step": 27134 + }, + { + "epoch": 0.16137953182986012, + "grad_norm": 1.2188119888305664, + "learning_rate": 4.685549108213742e-05, + "loss": 5.459, + "step": 27135 + }, + { + "epoch": 0.1613854791131411, + "grad_norm": 1.5991405248641968, + "learning_rate": 4.685526428766503e-05, + "loss": 5.3962, + "step": 27136 + }, + { + "epoch": 0.16139142639642212, + "grad_norm": 1.3470097780227661, + "learning_rate": 4.68550374855632e-05, + "loss": 5.4446, + "step": 27137 + }, + { + "epoch": 0.1613973736797031, + "grad_norm": 1.439078450202942, + "learning_rate": 4.685481067583201e-05, + "loss": 5.5934, + "step": 27138 + }, + { + "epoch": 0.1614033209629841, + "grad_norm": 1.5195162296295166, + "learning_rate": 4.6854583858471514e-05, + "loss": 5.5948, + "step": 27139 + }, + { + "epoch": 0.16140926824626511, + "grad_norm": 1.3565785884857178, + "learning_rate": 4.6854357033481815e-05, + "loss": 5.4467, + "step": 27140 + }, + { + "epoch": 0.1614152155295461, + "grad_norm": 1.3213258981704712, + "learning_rate": 4.685413020086299e-05, + "loss": 4.7896, + "step": 27141 + }, + { + "epoch": 0.1614211628128271, + "grad_norm": 1.6580665111541748, + "learning_rate": 4.6853903360615106e-05, + "loss": 4.9581, + "step": 27142 + }, + { + "epoch": 0.1614271100961081, + "grad_norm": 1.5277694463729858, + "learning_rate": 4.685367651273825e-05, + "loss": 5.3508, + "step": 27143 + }, + { + "epoch": 0.1614330573793891, + "grad_norm": 1.6369842290878296, + "learning_rate": 4.685344965723251e-05, + "loss": 5.2761, + "step": 27144 + }, + { + "epoch": 0.16143900466267008, + "grad_norm": 1.6954752206802368, + "learning_rate": 4.685322279409795e-05, + "loss": 5.1258, + "step": 27145 + }, + { + "epoch": 0.1614449519459511, + "grad_norm": 1.5073758363723755, + "learning_rate": 4.6852995923334664e-05, + "loss": 5.2927, + "step": 27146 + }, + { + "epoch": 0.1614508992292321, + "grad_norm": 1.5817281007766724, + "learning_rate": 4.685276904494271e-05, + "loss": 5.208, + "step": 27147 + }, + { + "epoch": 0.16145684651251307, + "grad_norm": 1.4444465637207031, + "learning_rate": 4.685254215892219e-05, + "loss": 5.0568, + "step": 27148 + }, + { + "epoch": 0.1614627937957941, + "grad_norm": 1.6507529020309448, + "learning_rate": 4.6852315265273175e-05, + "loss": 4.4881, + "step": 27149 + }, + { + "epoch": 0.16146874107907508, + "grad_norm": 1.3630253076553345, + "learning_rate": 4.685208836399573e-05, + "loss": 4.4938, + "step": 27150 + }, + { + "epoch": 0.16147468836235607, + "grad_norm": 1.5907013416290283, + "learning_rate": 4.685186145508996e-05, + "loss": 4.6613, + "step": 27151 + }, + { + "epoch": 0.16148063564563708, + "grad_norm": 1.4582465887069702, + "learning_rate": 4.6851634538555925e-05, + "loss": 4.8144, + "step": 27152 + }, + { + "epoch": 0.16148658292891807, + "grad_norm": 1.5481383800506592, + "learning_rate": 4.685140761439371e-05, + "loss": 5.2502, + "step": 27153 + }, + { + "epoch": 0.16149253021219906, + "grad_norm": 1.523826003074646, + "learning_rate": 4.685118068260339e-05, + "loss": 5.6317, + "step": 27154 + }, + { + "epoch": 0.16149847749548008, + "grad_norm": 1.502137541770935, + "learning_rate": 4.6850953743185055e-05, + "loss": 5.12, + "step": 27155 + }, + { + "epoch": 0.16150442477876106, + "grad_norm": 1.5802767276763916, + "learning_rate": 4.6850726796138776e-05, + "loss": 4.8374, + "step": 27156 + }, + { + "epoch": 0.16151037206204205, + "grad_norm": 1.6513301134109497, + "learning_rate": 4.685049984146463e-05, + "loss": 5.0668, + "step": 27157 + }, + { + "epoch": 0.16151631934532307, + "grad_norm": 1.5628081560134888, + "learning_rate": 4.6850272879162714e-05, + "loss": 4.7497, + "step": 27158 + }, + { + "epoch": 0.16152226662860406, + "grad_norm": 1.4100914001464844, + "learning_rate": 4.685004590923308e-05, + "loss": 5.606, + "step": 27159 + }, + { + "epoch": 0.16152821391188504, + "grad_norm": 1.3457648754119873, + "learning_rate": 4.684981893167583e-05, + "loss": 5.5325, + "step": 27160 + }, + { + "epoch": 0.16153416119516606, + "grad_norm": 1.6010215282440186, + "learning_rate": 4.684959194649102e-05, + "loss": 5.5653, + "step": 27161 + }, + { + "epoch": 0.16154010847844705, + "grad_norm": 1.8687788248062134, + "learning_rate": 4.684936495367875e-05, + "loss": 5.2795, + "step": 27162 + }, + { + "epoch": 0.16154605576172804, + "grad_norm": 2.1888749599456787, + "learning_rate": 4.68491379532391e-05, + "loss": 5.0313, + "step": 27163 + }, + { + "epoch": 0.16155200304500905, + "grad_norm": 1.466637372970581, + "learning_rate": 4.684891094517214e-05, + "loss": 5.221, + "step": 27164 + }, + { + "epoch": 0.16155795032829004, + "grad_norm": 1.518754482269287, + "learning_rate": 4.684868392947794e-05, + "loss": 5.037, + "step": 27165 + }, + { + "epoch": 0.16156389761157103, + "grad_norm": 1.550714373588562, + "learning_rate": 4.68484569061566e-05, + "loss": 4.8986, + "step": 27166 + }, + { + "epoch": 0.16156984489485204, + "grad_norm": 1.5226268768310547, + "learning_rate": 4.6848229875208186e-05, + "loss": 5.2425, + "step": 27167 + }, + { + "epoch": 0.16157579217813303, + "grad_norm": 1.4854047298431396, + "learning_rate": 4.684800283663279e-05, + "loss": 5.0766, + "step": 27168 + }, + { + "epoch": 0.16158173946141402, + "grad_norm": 1.6625477075576782, + "learning_rate": 4.684777579043047e-05, + "loss": 5.1967, + "step": 27169 + }, + { + "epoch": 0.16158768674469504, + "grad_norm": 1.409916877746582, + "learning_rate": 4.684754873660132e-05, + "loss": 5.0735, + "step": 27170 + }, + { + "epoch": 0.16159363402797602, + "grad_norm": 1.4444838762283325, + "learning_rate": 4.684732167514542e-05, + "loss": 5.013, + "step": 27171 + }, + { + "epoch": 0.161599581311257, + "grad_norm": 1.5226528644561768, + "learning_rate": 4.684709460606284e-05, + "loss": 4.9328, + "step": 27172 + }, + { + "epoch": 0.16160552859453803, + "grad_norm": 1.7353004217147827, + "learning_rate": 4.6846867529353664e-05, + "loss": 4.9422, + "step": 27173 + }, + { + "epoch": 0.16161147587781902, + "grad_norm": 1.613166093826294, + "learning_rate": 4.6846640445017974e-05, + "loss": 5.0545, + "step": 27174 + }, + { + "epoch": 0.1616174231611, + "grad_norm": 1.7949568033218384, + "learning_rate": 4.684641335305585e-05, + "loss": 4.944, + "step": 27175 + }, + { + "epoch": 0.16162337044438102, + "grad_norm": 1.508563756942749, + "learning_rate": 4.684618625346737e-05, + "loss": 5.2551, + "step": 27176 + }, + { + "epoch": 0.161629317727662, + "grad_norm": 1.6090425252914429, + "learning_rate": 4.6845959146252605e-05, + "loss": 5.0839, + "step": 27177 + }, + { + "epoch": 0.161635265010943, + "grad_norm": 1.6595830917358398, + "learning_rate": 4.6845732031411646e-05, + "loss": 5.2307, + "step": 27178 + }, + { + "epoch": 0.16164121229422398, + "grad_norm": 1.787662386894226, + "learning_rate": 4.684550490894457e-05, + "loss": 5.3956, + "step": 27179 + }, + { + "epoch": 0.161647159577505, + "grad_norm": 1.5315039157867432, + "learning_rate": 4.684527777885145e-05, + "loss": 5.8196, + "step": 27180 + }, + { + "epoch": 0.161653106860786, + "grad_norm": 2.004093647003174, + "learning_rate": 4.684505064113238e-05, + "loss": 4.9922, + "step": 27181 + }, + { + "epoch": 0.16165905414406698, + "grad_norm": 1.655718445777893, + "learning_rate": 4.684482349578742e-05, + "loss": 5.0178, + "step": 27182 + }, + { + "epoch": 0.161665001427348, + "grad_norm": 1.721838116645813, + "learning_rate": 4.6844596342816654e-05, + "loss": 4.8412, + "step": 27183 + }, + { + "epoch": 0.16167094871062898, + "grad_norm": 1.6883397102355957, + "learning_rate": 4.684436918222017e-05, + "loss": 4.602, + "step": 27184 + }, + { + "epoch": 0.16167689599390997, + "grad_norm": 1.4376475811004639, + "learning_rate": 4.6844142013998035e-05, + "loss": 4.7408, + "step": 27185 + }, + { + "epoch": 0.16168284327719099, + "grad_norm": 1.5542229413986206, + "learning_rate": 4.684391483815035e-05, + "loss": 5.384, + "step": 27186 + }, + { + "epoch": 0.16168879056047197, + "grad_norm": 1.4321660995483398, + "learning_rate": 4.6843687654677163e-05, + "loss": 5.3393, + "step": 27187 + }, + { + "epoch": 0.16169473784375296, + "grad_norm": 1.7398759126663208, + "learning_rate": 4.684346046357858e-05, + "loss": 5.2492, + "step": 27188 + }, + { + "epoch": 0.16170068512703398, + "grad_norm": 1.802462100982666, + "learning_rate": 4.684323326485467e-05, + "loss": 5.8437, + "step": 27189 + }, + { + "epoch": 0.16170663241031497, + "grad_norm": 1.5931847095489502, + "learning_rate": 4.684300605850551e-05, + "loss": 5.6417, + "step": 27190 + }, + { + "epoch": 0.16171257969359595, + "grad_norm": 1.6900547742843628, + "learning_rate": 4.684277884453119e-05, + "loss": 4.4741, + "step": 27191 + }, + { + "epoch": 0.16171852697687697, + "grad_norm": 1.5422314405441284, + "learning_rate": 4.684255162293178e-05, + "loss": 4.5219, + "step": 27192 + }, + { + "epoch": 0.16172447426015796, + "grad_norm": 1.816253662109375, + "learning_rate": 4.6842324393707354e-05, + "loss": 4.5676, + "step": 27193 + }, + { + "epoch": 0.16173042154343895, + "grad_norm": 1.3935781717300415, + "learning_rate": 4.6842097156858e-05, + "loss": 5.5091, + "step": 27194 + }, + { + "epoch": 0.16173636882671996, + "grad_norm": 1.7103323936462402, + "learning_rate": 4.6841869912383794e-05, + "loss": 5.3831, + "step": 27195 + }, + { + "epoch": 0.16174231611000095, + "grad_norm": 1.4029678106307983, + "learning_rate": 4.6841642660284826e-05, + "loss": 5.2129, + "step": 27196 + }, + { + "epoch": 0.16174826339328194, + "grad_norm": 1.7814414501190186, + "learning_rate": 4.684141540056116e-05, + "loss": 5.3053, + "step": 27197 + }, + { + "epoch": 0.16175421067656295, + "grad_norm": 1.5314795970916748, + "learning_rate": 4.684118813321288e-05, + "loss": 5.3863, + "step": 27198 + }, + { + "epoch": 0.16176015795984394, + "grad_norm": 1.5359210968017578, + "learning_rate": 4.6840960858240065e-05, + "loss": 5.1326, + "step": 27199 + }, + { + "epoch": 0.16176610524312493, + "grad_norm": 1.5624679327011108, + "learning_rate": 4.68407335756428e-05, + "loss": 4.8275, + "step": 27200 + }, + { + "epoch": 0.16177205252640595, + "grad_norm": 1.4580225944519043, + "learning_rate": 4.6840506285421165e-05, + "loss": 4.8576, + "step": 27201 + }, + { + "epoch": 0.16177799980968693, + "grad_norm": 1.687788724899292, + "learning_rate": 4.684027898757523e-05, + "loss": 4.8731, + "step": 27202 + }, + { + "epoch": 0.16178394709296792, + "grad_norm": 1.882171869277954, + "learning_rate": 4.684005168210508e-05, + "loss": 4.8912, + "step": 27203 + }, + { + "epoch": 0.16178989437624894, + "grad_norm": 1.513374924659729, + "learning_rate": 4.6839824369010795e-05, + "loss": 5.2447, + "step": 27204 + }, + { + "epoch": 0.16179584165952993, + "grad_norm": 1.7321797609329224, + "learning_rate": 4.683959704829245e-05, + "loss": 5.0003, + "step": 27205 + }, + { + "epoch": 0.16180178894281091, + "grad_norm": 1.677239179611206, + "learning_rate": 4.683936971995013e-05, + "loss": 5.4732, + "step": 27206 + }, + { + "epoch": 0.16180773622609193, + "grad_norm": 1.615957498550415, + "learning_rate": 4.6839142383983906e-05, + "loss": 5.4448, + "step": 27207 + }, + { + "epoch": 0.16181368350937292, + "grad_norm": 1.4981861114501953, + "learning_rate": 4.6838915040393874e-05, + "loss": 5.4369, + "step": 27208 + }, + { + "epoch": 0.1618196307926539, + "grad_norm": 1.5658632516860962, + "learning_rate": 4.683868768918009e-05, + "loss": 5.474, + "step": 27209 + }, + { + "epoch": 0.16182557807593492, + "grad_norm": 1.469122052192688, + "learning_rate": 4.6838460330342646e-05, + "loss": 5.3001, + "step": 27210 + }, + { + "epoch": 0.1618315253592159, + "grad_norm": 1.5378628969192505, + "learning_rate": 4.683823296388163e-05, + "loss": 4.8535, + "step": 27211 + }, + { + "epoch": 0.1618374726424969, + "grad_norm": 1.6330792903900146, + "learning_rate": 4.6838005589797106e-05, + "loss": 4.812, + "step": 27212 + }, + { + "epoch": 0.16184341992577791, + "grad_norm": 1.89853036403656, + "learning_rate": 4.683777820808917e-05, + "loss": 5.1666, + "step": 27213 + }, + { + "epoch": 0.1618493672090589, + "grad_norm": 1.5161629915237427, + "learning_rate": 4.683755081875788e-05, + "loss": 5.1444, + "step": 27214 + }, + { + "epoch": 0.1618553144923399, + "grad_norm": 1.6083909273147583, + "learning_rate": 4.683732342180333e-05, + "loss": 5.1403, + "step": 27215 + }, + { + "epoch": 0.1618612617756209, + "grad_norm": 1.5731655359268188, + "learning_rate": 4.68370960172256e-05, + "loss": 5.1038, + "step": 27216 + }, + { + "epoch": 0.1618672090589019, + "grad_norm": 1.8221924304962158, + "learning_rate": 4.6836868605024756e-05, + "loss": 4.8889, + "step": 27217 + }, + { + "epoch": 0.16187315634218288, + "grad_norm": 1.7264484167099, + "learning_rate": 4.683664118520089e-05, + "loss": 5.2846, + "step": 27218 + }, + { + "epoch": 0.1618791036254639, + "grad_norm": 1.6429424285888672, + "learning_rate": 4.683641375775409e-05, + "loss": 5.1433, + "step": 27219 + }, + { + "epoch": 0.1618850509087449, + "grad_norm": 1.6444041728973389, + "learning_rate": 4.683618632268441e-05, + "loss": 5.7116, + "step": 27220 + }, + { + "epoch": 0.16189099819202588, + "grad_norm": 1.595996379852295, + "learning_rate": 4.683595887999195e-05, + "loss": 5.4419, + "step": 27221 + }, + { + "epoch": 0.1618969454753069, + "grad_norm": 1.489001989364624, + "learning_rate": 4.6835731429676776e-05, + "loss": 5.2004, + "step": 27222 + }, + { + "epoch": 0.16190289275858788, + "grad_norm": 1.6208230257034302, + "learning_rate": 4.683550397173898e-05, + "loss": 5.2405, + "step": 27223 + }, + { + "epoch": 0.16190884004186887, + "grad_norm": 1.7584507465362549, + "learning_rate": 4.683527650617863e-05, + "loss": 4.5921, + "step": 27224 + }, + { + "epoch": 0.16191478732514988, + "grad_norm": 1.8459594249725342, + "learning_rate": 4.683504903299581e-05, + "loss": 4.6269, + "step": 27225 + }, + { + "epoch": 0.16192073460843087, + "grad_norm": 2.055671453475952, + "learning_rate": 4.683482155219061e-05, + "loss": 4.8219, + "step": 27226 + }, + { + "epoch": 0.16192668189171186, + "grad_norm": 1.8772468566894531, + "learning_rate": 4.683459406376309e-05, + "loss": 4.9343, + "step": 27227 + }, + { + "epoch": 0.16193262917499288, + "grad_norm": 1.8033567667007446, + "learning_rate": 4.683436656771334e-05, + "loss": 4.5181, + "step": 27228 + }, + { + "epoch": 0.16193857645827386, + "grad_norm": 1.8112131357192993, + "learning_rate": 4.6834139064041436e-05, + "loss": 4.6479, + "step": 27229 + }, + { + "epoch": 0.16194452374155485, + "grad_norm": 1.958721399307251, + "learning_rate": 4.6833911552747466e-05, + "loss": 4.3747, + "step": 27230 + }, + { + "epoch": 0.16195047102483587, + "grad_norm": 1.9740078449249268, + "learning_rate": 4.683368403383151e-05, + "loss": 4.5357, + "step": 27231 + }, + { + "epoch": 0.16195641830811686, + "grad_norm": 1.8071064949035645, + "learning_rate": 4.683345650729362e-05, + "loss": 4.2025, + "step": 27232 + }, + { + "epoch": 0.16196236559139784, + "grad_norm": 2.067153215408325, + "learning_rate": 4.6833228973133914e-05, + "loss": 4.7599, + "step": 27233 + }, + { + "epoch": 0.16196831287467886, + "grad_norm": 2.219170570373535, + "learning_rate": 4.683300143135244e-05, + "loss": 4.8643, + "step": 27234 + }, + { + "epoch": 0.16197426015795985, + "grad_norm": 1.8077818155288696, + "learning_rate": 4.68327738819493e-05, + "loss": 4.9781, + "step": 27235 + }, + { + "epoch": 0.16198020744124084, + "grad_norm": 2.1170096397399902, + "learning_rate": 4.683254632492456e-05, + "loss": 4.5507, + "step": 27236 + }, + { + "epoch": 0.16198615472452182, + "grad_norm": 1.9441372156143188, + "learning_rate": 4.6832318760278306e-05, + "loss": 4.2419, + "step": 27237 + }, + { + "epoch": 0.16199210200780284, + "grad_norm": 2.261038064956665, + "learning_rate": 4.6832091188010615e-05, + "loss": 4.8287, + "step": 27238 + }, + { + "epoch": 0.16199804929108383, + "grad_norm": 1.906253457069397, + "learning_rate": 4.6831863608121565e-05, + "loss": 4.7154, + "step": 27239 + }, + { + "epoch": 0.16200399657436482, + "grad_norm": 1.7181471586227417, + "learning_rate": 4.683163602061124e-05, + "loss": 4.8286, + "step": 27240 + }, + { + "epoch": 0.16200994385764583, + "grad_norm": 1.6163973808288574, + "learning_rate": 4.683140842547971e-05, + "loss": 5.1988, + "step": 27241 + }, + { + "epoch": 0.16201589114092682, + "grad_norm": 1.8723608255386353, + "learning_rate": 4.6831180822727064e-05, + "loss": 4.6135, + "step": 27242 + }, + { + "epoch": 0.1620218384242078, + "grad_norm": 1.557589054107666, + "learning_rate": 4.683095321235338e-05, + "loss": 4.7632, + "step": 27243 + }, + { + "epoch": 0.16202778570748883, + "grad_norm": 1.3284127712249756, + "learning_rate": 4.683072559435873e-05, + "loss": 4.0683, + "step": 27244 + }, + { + "epoch": 0.1620337329907698, + "grad_norm": 1.5295307636260986, + "learning_rate": 4.68304979687432e-05, + "loss": 4.2219, + "step": 27245 + }, + { + "epoch": 0.1620396802740508, + "grad_norm": 2.0153698921203613, + "learning_rate": 4.683027033550687e-05, + "loss": 4.8334, + "step": 27246 + }, + { + "epoch": 0.16204562755733182, + "grad_norm": 1.3090236186981201, + "learning_rate": 4.683004269464983e-05, + "loss": 5.1588, + "step": 27247 + }, + { + "epoch": 0.1620515748406128, + "grad_norm": 1.4936387538909912, + "learning_rate": 4.6829815046172136e-05, + "loss": 5.2226, + "step": 27248 + }, + { + "epoch": 0.1620575221238938, + "grad_norm": 1.6028317213058472, + "learning_rate": 4.682958739007388e-05, + "loss": 5.0174, + "step": 27249 + }, + { + "epoch": 0.1620634694071748, + "grad_norm": 1.221101999282837, + "learning_rate": 4.6829359726355144e-05, + "loss": 5.3307, + "step": 27250 + }, + { + "epoch": 0.1620694166904558, + "grad_norm": 1.348512887954712, + "learning_rate": 4.6829132055016e-05, + "loss": 5.4754, + "step": 27251 + }, + { + "epoch": 0.16207536397373679, + "grad_norm": 1.506373643875122, + "learning_rate": 4.682890437605654e-05, + "loss": 5.0422, + "step": 27252 + }, + { + "epoch": 0.1620813112570178, + "grad_norm": 1.7753325700759888, + "learning_rate": 4.6828676689476825e-05, + "loss": 5.0218, + "step": 27253 + }, + { + "epoch": 0.1620872585402988, + "grad_norm": 1.5221372842788696, + "learning_rate": 4.6828448995276945e-05, + "loss": 5.1423, + "step": 27254 + }, + { + "epoch": 0.16209320582357978, + "grad_norm": 1.7772079706192017, + "learning_rate": 4.682822129345699e-05, + "loss": 4.8782, + "step": 27255 + }, + { + "epoch": 0.1620991531068608, + "grad_norm": 1.495651125907898, + "learning_rate": 4.6827993584017014e-05, + "loss": 5.2042, + "step": 27256 + }, + { + "epoch": 0.16210510039014178, + "grad_norm": 1.5901660919189453, + "learning_rate": 4.682776586695712e-05, + "loss": 5.5121, + "step": 27257 + }, + { + "epoch": 0.16211104767342277, + "grad_norm": 1.7442855834960938, + "learning_rate": 4.6827538142277373e-05, + "loss": 4.9278, + "step": 27258 + }, + { + "epoch": 0.16211699495670379, + "grad_norm": 2.777273416519165, + "learning_rate": 4.682731040997786e-05, + "loss": 4.6258, + "step": 27259 + }, + { + "epoch": 0.16212294223998477, + "grad_norm": 1.8470478057861328, + "learning_rate": 4.6827082670058655e-05, + "loss": 4.87, + "step": 27260 + }, + { + "epoch": 0.16212888952326576, + "grad_norm": 1.545902132987976, + "learning_rate": 4.6826854922519844e-05, + "loss": 4.8776, + "step": 27261 + }, + { + "epoch": 0.16213483680654678, + "grad_norm": 1.5720170736312866, + "learning_rate": 4.682662716736151e-05, + "loss": 4.9046, + "step": 27262 + }, + { + "epoch": 0.16214078408982777, + "grad_norm": 1.6243836879730225, + "learning_rate": 4.682639940458372e-05, + "loss": 5.0243, + "step": 27263 + }, + { + "epoch": 0.16214673137310875, + "grad_norm": 2.738065719604492, + "learning_rate": 4.682617163418656e-05, + "loss": 4.1899, + "step": 27264 + }, + { + "epoch": 0.16215267865638977, + "grad_norm": 4.745233058929443, + "learning_rate": 4.682594385617011e-05, + "loss": 3.0819, + "step": 27265 + }, + { + "epoch": 0.16215862593967076, + "grad_norm": 4.1978936195373535, + "learning_rate": 4.6825716070534444e-05, + "loss": 3.1755, + "step": 27266 + }, + { + "epoch": 0.16216457322295175, + "grad_norm": 2.8367183208465576, + "learning_rate": 4.682548827727965e-05, + "loss": 3.53, + "step": 27267 + }, + { + "epoch": 0.16217052050623276, + "grad_norm": 1.7866027355194092, + "learning_rate": 4.6825260476405805e-05, + "loss": 4.6173, + "step": 27268 + }, + { + "epoch": 0.16217646778951375, + "grad_norm": 1.7661093473434448, + "learning_rate": 4.6825032667912984e-05, + "loss": 5.0541, + "step": 27269 + }, + { + "epoch": 0.16218241507279474, + "grad_norm": 1.9146814346313477, + "learning_rate": 4.682480485180127e-05, + "loss": 4.9121, + "step": 27270 + }, + { + "epoch": 0.16218836235607575, + "grad_norm": 2.8185949325561523, + "learning_rate": 4.682457702807075e-05, + "loss": 2.9822, + "step": 27271 + }, + { + "epoch": 0.16219430963935674, + "grad_norm": 3.360478162765503, + "learning_rate": 4.682434919672148e-05, + "loss": 2.2526, + "step": 27272 + }, + { + "epoch": 0.16220025692263773, + "grad_norm": 3.5563254356384277, + "learning_rate": 4.682412135775357e-05, + "loss": 3.3203, + "step": 27273 + }, + { + "epoch": 0.16220620420591875, + "grad_norm": 2.84264874458313, + "learning_rate": 4.682389351116707e-05, + "loss": 3.1093, + "step": 27274 + }, + { + "epoch": 0.16221215148919974, + "grad_norm": 2.6400508880615234, + "learning_rate": 4.682366565696208e-05, + "loss": 4.1745, + "step": 27275 + }, + { + "epoch": 0.16221809877248072, + "grad_norm": 2.5986385345458984, + "learning_rate": 4.682343779513868e-05, + "loss": 5.5863, + "step": 27276 + }, + { + "epoch": 0.16222404605576174, + "grad_norm": 2.3456249237060547, + "learning_rate": 4.6823209925696945e-05, + "loss": 4.3825, + "step": 27277 + }, + { + "epoch": 0.16222999333904273, + "grad_norm": 1.909117341041565, + "learning_rate": 4.682298204863694e-05, + "loss": 4.9451, + "step": 27278 + }, + { + "epoch": 0.16223594062232372, + "grad_norm": 1.6204262971878052, + "learning_rate": 4.682275416395877e-05, + "loss": 5.0483, + "step": 27279 + }, + { + "epoch": 0.16224188790560473, + "grad_norm": 1.5689494609832764, + "learning_rate": 4.68225262716625e-05, + "loss": 5.0821, + "step": 27280 + }, + { + "epoch": 0.16224783518888572, + "grad_norm": 1.553642749786377, + "learning_rate": 4.682229837174821e-05, + "loss": 5.3247, + "step": 27281 + }, + { + "epoch": 0.1622537824721667, + "grad_norm": 2.1375479698181152, + "learning_rate": 4.682207046421597e-05, + "loss": 4.4596, + "step": 27282 + }, + { + "epoch": 0.16225972975544772, + "grad_norm": 2.6894989013671875, + "learning_rate": 4.682184254906589e-05, + "loss": 4.1466, + "step": 27283 + }, + { + "epoch": 0.1622656770387287, + "grad_norm": 2.0883328914642334, + "learning_rate": 4.6821614626298015e-05, + "loss": 4.1182, + "step": 27284 + }, + { + "epoch": 0.1622716243220097, + "grad_norm": 2.263207197189331, + "learning_rate": 4.6821386695912444e-05, + "loss": 4.1029, + "step": 27285 + }, + { + "epoch": 0.16227757160529072, + "grad_norm": 2.2623839378356934, + "learning_rate": 4.6821158757909255e-05, + "loss": 4.0745, + "step": 27286 + }, + { + "epoch": 0.1622835188885717, + "grad_norm": 1.7428866624832153, + "learning_rate": 4.682093081228852e-05, + "loss": 4.7707, + "step": 27287 + }, + { + "epoch": 0.1622894661718527, + "grad_norm": 2.0418710708618164, + "learning_rate": 4.682070285905033e-05, + "loss": 4.5464, + "step": 27288 + }, + { + "epoch": 0.1622954134551337, + "grad_norm": 2.421755313873291, + "learning_rate": 4.682047489819475e-05, + "loss": 3.9835, + "step": 27289 + }, + { + "epoch": 0.1623013607384147, + "grad_norm": 2.3179736137390137, + "learning_rate": 4.682024692972188e-05, + "loss": 3.8936, + "step": 27290 + }, + { + "epoch": 0.16230730802169568, + "grad_norm": 2.144463300704956, + "learning_rate": 4.682001895363177e-05, + "loss": 4.123, + "step": 27291 + }, + { + "epoch": 0.1623132553049767, + "grad_norm": 1.8054444789886475, + "learning_rate": 4.681979096992454e-05, + "loss": 4.5947, + "step": 27292 + }, + { + "epoch": 0.1623192025882577, + "grad_norm": 1.9559820890426636, + "learning_rate": 4.681956297860023e-05, + "loss": 4.1805, + "step": 27293 + }, + { + "epoch": 0.16232514987153868, + "grad_norm": 2.253756284713745, + "learning_rate": 4.6819334979658934e-05, + "loss": 3.7279, + "step": 27294 + }, + { + "epoch": 0.16233109715481966, + "grad_norm": 2.1193337440490723, + "learning_rate": 4.681910697310074e-05, + "loss": 3.646, + "step": 27295 + }, + { + "epoch": 0.16233704443810068, + "grad_norm": 2.2527666091918945, + "learning_rate": 4.681887895892572e-05, + "loss": 4.0891, + "step": 27296 + }, + { + "epoch": 0.16234299172138167, + "grad_norm": 2.255565643310547, + "learning_rate": 4.681865093713396e-05, + "loss": 3.8497, + "step": 27297 + }, + { + "epoch": 0.16234893900466266, + "grad_norm": 2.3153398036956787, + "learning_rate": 4.681842290772552e-05, + "loss": 3.787, + "step": 27298 + }, + { + "epoch": 0.16235488628794367, + "grad_norm": 2.7600228786468506, + "learning_rate": 4.681819487070051e-05, + "loss": 4.0376, + "step": 27299 + }, + { + "epoch": 0.16236083357122466, + "grad_norm": 1.8102682828903198, + "learning_rate": 4.681796682605898e-05, + "loss": 4.1901, + "step": 27300 + }, + { + "epoch": 0.16236678085450565, + "grad_norm": 2.125884771347046, + "learning_rate": 4.6817738773801035e-05, + "loss": 4.4809, + "step": 27301 + }, + { + "epoch": 0.16237272813778666, + "grad_norm": 2.308034658432007, + "learning_rate": 4.681751071392674e-05, + "loss": 3.7836, + "step": 27302 + }, + { + "epoch": 0.16237867542106765, + "grad_norm": 2.2197370529174805, + "learning_rate": 4.6817282646436166e-05, + "loss": 3.7033, + "step": 27303 + }, + { + "epoch": 0.16238462270434864, + "grad_norm": 1.7763569355010986, + "learning_rate": 4.681705457132942e-05, + "loss": 4.7483, + "step": 27304 + }, + { + "epoch": 0.16239056998762966, + "grad_norm": 2.2781457901000977, + "learning_rate": 4.681682648860656e-05, + "loss": 3.5617, + "step": 27305 + }, + { + "epoch": 0.16239651727091065, + "grad_norm": 2.257497787475586, + "learning_rate": 4.6816598398267664e-05, + "loss": 3.7756, + "step": 27306 + }, + { + "epoch": 0.16240246455419163, + "grad_norm": 2.277405261993408, + "learning_rate": 4.681637030031283e-05, + "loss": 3.6759, + "step": 27307 + }, + { + "epoch": 0.16240841183747265, + "grad_norm": 2.160238265991211, + "learning_rate": 4.681614219474212e-05, + "loss": 3.568, + "step": 27308 + }, + { + "epoch": 0.16241435912075364, + "grad_norm": 2.0068106651306152, + "learning_rate": 4.6815914081555624e-05, + "loss": 3.7039, + "step": 27309 + }, + { + "epoch": 0.16242030640403463, + "grad_norm": 3.0893945693969727, + "learning_rate": 4.681568596075341e-05, + "loss": 3.8708, + "step": 27310 + }, + { + "epoch": 0.16242625368731564, + "grad_norm": 2.5544440746307373, + "learning_rate": 4.681545783233557e-05, + "loss": 3.9529, + "step": 27311 + }, + { + "epoch": 0.16243220097059663, + "grad_norm": 1.7706321477890015, + "learning_rate": 4.681522969630218e-05, + "loss": 6.004, + "step": 27312 + }, + { + "epoch": 0.16243814825387762, + "grad_norm": 2.0155160427093506, + "learning_rate": 4.681500155265332e-05, + "loss": 4.1088, + "step": 27313 + }, + { + "epoch": 0.16244409553715863, + "grad_norm": 2.436854124069214, + "learning_rate": 4.681477340138907e-05, + "loss": 3.7281, + "step": 27314 + }, + { + "epoch": 0.16245004282043962, + "grad_norm": 1.7717199325561523, + "learning_rate": 4.68145452425095e-05, + "loss": 4.7058, + "step": 27315 + }, + { + "epoch": 0.1624559901037206, + "grad_norm": 1.8537521362304688, + "learning_rate": 4.6814317076014705e-05, + "loss": 5.5633, + "step": 27316 + }, + { + "epoch": 0.16246193738700163, + "grad_norm": 1.4485749006271362, + "learning_rate": 4.681408890190475e-05, + "loss": 6.1646, + "step": 27317 + }, + { + "epoch": 0.16246788467028261, + "grad_norm": 1.7619411945343018, + "learning_rate": 4.681386072017973e-05, + "loss": 4.9872, + "step": 27318 + }, + { + "epoch": 0.1624738319535636, + "grad_norm": 1.3868266344070435, + "learning_rate": 4.681363253083971e-05, + "loss": 5.337, + "step": 27319 + }, + { + "epoch": 0.16247977923684462, + "grad_norm": 2.339705467224121, + "learning_rate": 4.681340433388478e-05, + "loss": 4.1131, + "step": 27320 + }, + { + "epoch": 0.1624857265201256, + "grad_norm": 2.4623711109161377, + "learning_rate": 4.681317612931502e-05, + "loss": 4.0167, + "step": 27321 + }, + { + "epoch": 0.1624916738034066, + "grad_norm": 2.06557559967041, + "learning_rate": 4.68129479171305e-05, + "loss": 4.4482, + "step": 27322 + }, + { + "epoch": 0.1624976210866876, + "grad_norm": 1.9864208698272705, + "learning_rate": 4.681271969733131e-05, + "loss": 4.5421, + "step": 27323 + }, + { + "epoch": 0.1625035683699686, + "grad_norm": 2.29591703414917, + "learning_rate": 4.6812491469917516e-05, + "loss": 4.4407, + "step": 27324 + }, + { + "epoch": 0.1625095156532496, + "grad_norm": 1.9640796184539795, + "learning_rate": 4.681226323488921e-05, + "loss": 4.3113, + "step": 27325 + }, + { + "epoch": 0.1625154629365306, + "grad_norm": 1.6823822259902954, + "learning_rate": 4.6812034992246464e-05, + "loss": 5.3048, + "step": 27326 + }, + { + "epoch": 0.1625214102198116, + "grad_norm": 1.7765403985977173, + "learning_rate": 4.681180674198937e-05, + "loss": 4.7484, + "step": 27327 + }, + { + "epoch": 0.16252735750309258, + "grad_norm": 2.8496274948120117, + "learning_rate": 4.6811578484118e-05, + "loss": 2.9788, + "step": 27328 + }, + { + "epoch": 0.1625333047863736, + "grad_norm": 2.600203514099121, + "learning_rate": 4.681135021863243e-05, + "loss": 3.6706, + "step": 27329 + }, + { + "epoch": 0.16253925206965458, + "grad_norm": 2.3449292182922363, + "learning_rate": 4.681112194553274e-05, + "loss": 3.1501, + "step": 27330 + }, + { + "epoch": 0.16254519935293557, + "grad_norm": 2.6009342670440674, + "learning_rate": 4.681089366481902e-05, + "loss": 3.3097, + "step": 27331 + }, + { + "epoch": 0.1625511466362166, + "grad_norm": 2.4977009296417236, + "learning_rate": 4.681066537649134e-05, + "loss": 3.2114, + "step": 27332 + }, + { + "epoch": 0.16255709391949758, + "grad_norm": 1.9522204399108887, + "learning_rate": 4.681043708054978e-05, + "loss": 4.9502, + "step": 27333 + }, + { + "epoch": 0.16256304120277856, + "grad_norm": 1.8254719972610474, + "learning_rate": 4.6810208776994425e-05, + "loss": 5.1497, + "step": 27334 + }, + { + "epoch": 0.16256898848605958, + "grad_norm": 2.9470701217651367, + "learning_rate": 4.680998046582535e-05, + "loss": 3.1034, + "step": 27335 + }, + { + "epoch": 0.16257493576934057, + "grad_norm": 3.033200979232788, + "learning_rate": 4.680975214704263e-05, + "loss": 3.1627, + "step": 27336 + }, + { + "epoch": 0.16258088305262156, + "grad_norm": 2.9590744972229004, + "learning_rate": 4.680952382064636e-05, + "loss": 3.6219, + "step": 27337 + }, + { + "epoch": 0.16258683033590257, + "grad_norm": 1.759320616722107, + "learning_rate": 4.680929548663661e-05, + "loss": 5.0067, + "step": 27338 + }, + { + "epoch": 0.16259277761918356, + "grad_norm": 1.7571178674697876, + "learning_rate": 4.680906714501345e-05, + "loss": 4.9829, + "step": 27339 + }, + { + "epoch": 0.16259872490246455, + "grad_norm": 1.7212225198745728, + "learning_rate": 4.680883879577698e-05, + "loss": 4.854, + "step": 27340 + }, + { + "epoch": 0.16260467218574556, + "grad_norm": 1.732384204864502, + "learning_rate": 4.680861043892727e-05, + "loss": 4.7023, + "step": 27341 + }, + { + "epoch": 0.16261061946902655, + "grad_norm": 1.8678463697433472, + "learning_rate": 4.680838207446439e-05, + "loss": 5.3755, + "step": 27342 + }, + { + "epoch": 0.16261656675230754, + "grad_norm": 1.6973927021026611, + "learning_rate": 4.680815370238843e-05, + "loss": 4.678, + "step": 27343 + }, + { + "epoch": 0.16262251403558856, + "grad_norm": 1.6274856328964233, + "learning_rate": 4.680792532269948e-05, + "loss": 4.8053, + "step": 27344 + }, + { + "epoch": 0.16262846131886954, + "grad_norm": 1.6367913484573364, + "learning_rate": 4.6807696935397604e-05, + "loss": 4.8855, + "step": 27345 + }, + { + "epoch": 0.16263440860215053, + "grad_norm": 1.5021651983261108, + "learning_rate": 4.680746854048288e-05, + "loss": 5.318, + "step": 27346 + }, + { + "epoch": 0.16264035588543155, + "grad_norm": 1.329917073249817, + "learning_rate": 4.68072401379554e-05, + "loss": 6.0137, + "step": 27347 + }, + { + "epoch": 0.16264630316871254, + "grad_norm": 1.6316022872924805, + "learning_rate": 4.680701172781524e-05, + "loss": 5.8787, + "step": 27348 + }, + { + "epoch": 0.16265225045199352, + "grad_norm": 1.640479564666748, + "learning_rate": 4.6806783310062476e-05, + "loss": 5.568, + "step": 27349 + }, + { + "epoch": 0.16265819773527454, + "grad_norm": 1.6600250005722046, + "learning_rate": 4.680655488469718e-05, + "loss": 5.3461, + "step": 27350 + }, + { + "epoch": 0.16266414501855553, + "grad_norm": 1.7950623035430908, + "learning_rate": 4.680632645171945e-05, + "loss": 4.8529, + "step": 27351 + }, + { + "epoch": 0.16267009230183652, + "grad_norm": 1.732972502708435, + "learning_rate": 4.6806098011129356e-05, + "loss": 4.8085, + "step": 27352 + }, + { + "epoch": 0.1626760395851175, + "grad_norm": 1.7508574724197388, + "learning_rate": 4.680586956292698e-05, + "loss": 4.9188, + "step": 27353 + }, + { + "epoch": 0.16268198686839852, + "grad_norm": 1.521814227104187, + "learning_rate": 4.6805641107112395e-05, + "loss": 4.6616, + "step": 27354 + }, + { + "epoch": 0.1626879341516795, + "grad_norm": 1.7594850063323975, + "learning_rate": 4.6805412643685684e-05, + "loss": 4.6634, + "step": 27355 + }, + { + "epoch": 0.1626938814349605, + "grad_norm": 1.5281226634979248, + "learning_rate": 4.6805184172646944e-05, + "loss": 5.0508, + "step": 27356 + }, + { + "epoch": 0.1626998287182415, + "grad_norm": 1.3342808485031128, + "learning_rate": 4.6804955693996225e-05, + "loss": 5.605, + "step": 27357 + }, + { + "epoch": 0.1627057760015225, + "grad_norm": 1.5639429092407227, + "learning_rate": 4.680472720773362e-05, + "loss": 5.0959, + "step": 27358 + }, + { + "epoch": 0.1627117232848035, + "grad_norm": 1.661442756652832, + "learning_rate": 4.680449871385922e-05, + "loss": 4.8981, + "step": 27359 + }, + { + "epoch": 0.1627176705680845, + "grad_norm": 1.601442813873291, + "learning_rate": 4.6804270212373094e-05, + "loss": 4.8313, + "step": 27360 + }, + { + "epoch": 0.1627236178513655, + "grad_norm": 1.5367902517318726, + "learning_rate": 4.6804041703275315e-05, + "loss": 4.8772, + "step": 27361 + }, + { + "epoch": 0.16272956513464648, + "grad_norm": 1.5161237716674805, + "learning_rate": 4.680381318656597e-05, + "loss": 4.7877, + "step": 27362 + }, + { + "epoch": 0.1627355124179275, + "grad_norm": 1.790384292602539, + "learning_rate": 4.680358466224515e-05, + "loss": 5.2596, + "step": 27363 + }, + { + "epoch": 0.16274145970120849, + "grad_norm": 1.6441622972488403, + "learning_rate": 4.6803356130312915e-05, + "loss": 5.3774, + "step": 27364 + }, + { + "epoch": 0.16274740698448947, + "grad_norm": 1.4816210269927979, + "learning_rate": 4.680312759076935e-05, + "loss": 5.4754, + "step": 27365 + }, + { + "epoch": 0.1627533542677705, + "grad_norm": 1.5345895290374756, + "learning_rate": 4.680289904361454e-05, + "loss": 5.2805, + "step": 27366 + }, + { + "epoch": 0.16275930155105148, + "grad_norm": 1.3760472536087036, + "learning_rate": 4.680267048884857e-05, + "loss": 5.327, + "step": 27367 + }, + { + "epoch": 0.16276524883433247, + "grad_norm": 2.4343063831329346, + "learning_rate": 4.680244192647151e-05, + "loss": 4.8059, + "step": 27368 + }, + { + "epoch": 0.16277119611761348, + "grad_norm": 2.8197708129882812, + "learning_rate": 4.6802213356483444e-05, + "loss": 4.1087, + "step": 27369 + }, + { + "epoch": 0.16277714340089447, + "grad_norm": 3.0709099769592285, + "learning_rate": 4.680198477888445e-05, + "loss": 4.1441, + "step": 27370 + }, + { + "epoch": 0.16278309068417546, + "grad_norm": 2.8608505725860596, + "learning_rate": 4.680175619367461e-05, + "loss": 4.3136, + "step": 27371 + }, + { + "epoch": 0.16278903796745647, + "grad_norm": 2.9403672218322754, + "learning_rate": 4.6801527600854e-05, + "loss": 3.903, + "step": 27372 + }, + { + "epoch": 0.16279498525073746, + "grad_norm": 1.7551895380020142, + "learning_rate": 4.6801299000422696e-05, + "loss": 5.0392, + "step": 27373 + }, + { + "epoch": 0.16280093253401845, + "grad_norm": 1.862855076789856, + "learning_rate": 4.680107039238079e-05, + "loss": 4.712, + "step": 27374 + }, + { + "epoch": 0.16280687981729947, + "grad_norm": 1.6673380136489868, + "learning_rate": 4.680084177672835e-05, + "loss": 5.1954, + "step": 27375 + }, + { + "epoch": 0.16281282710058045, + "grad_norm": 1.3807284832000732, + "learning_rate": 4.680061315346547e-05, + "loss": 5.7525, + "step": 27376 + }, + { + "epoch": 0.16281877438386144, + "grad_norm": 1.6106042861938477, + "learning_rate": 4.680038452259222e-05, + "loss": 6.1879, + "step": 27377 + }, + { + "epoch": 0.16282472166714246, + "grad_norm": 1.3592698574066162, + "learning_rate": 4.6800155884108674e-05, + "loss": 5.725, + "step": 27378 + }, + { + "epoch": 0.16283066895042345, + "grad_norm": 1.7938450574874878, + "learning_rate": 4.679992723801493e-05, + "loss": 4.8694, + "step": 27379 + }, + { + "epoch": 0.16283661623370443, + "grad_norm": 2.0678904056549072, + "learning_rate": 4.679969858431105e-05, + "loss": 5.0753, + "step": 27380 + }, + { + "epoch": 0.16284256351698545, + "grad_norm": 2.147873640060425, + "learning_rate": 4.679946992299712e-05, + "loss": 5.2131, + "step": 27381 + }, + { + "epoch": 0.16284851080026644, + "grad_norm": 1.7163617610931396, + "learning_rate": 4.679924125407322e-05, + "loss": 5.2478, + "step": 27382 + }, + { + "epoch": 0.16285445808354743, + "grad_norm": 2.040842056274414, + "learning_rate": 4.679901257753943e-05, + "loss": 5.2402, + "step": 27383 + }, + { + "epoch": 0.16286040536682844, + "grad_norm": 1.8307139873504639, + "learning_rate": 4.6798783893395834e-05, + "loss": 4.5761, + "step": 27384 + }, + { + "epoch": 0.16286635265010943, + "grad_norm": 1.4522336721420288, + "learning_rate": 4.67985552016425e-05, + "loss": 4.7127, + "step": 27385 + }, + { + "epoch": 0.16287229993339042, + "grad_norm": 1.8996527194976807, + "learning_rate": 4.679832650227952e-05, + "loss": 4.7754, + "step": 27386 + }, + { + "epoch": 0.16287824721667143, + "grad_norm": 2.1785221099853516, + "learning_rate": 4.679809779530697e-05, + "loss": 4.9305, + "step": 27387 + }, + { + "epoch": 0.16288419449995242, + "grad_norm": 2.266005754470825, + "learning_rate": 4.679786908072493e-05, + "loss": 5.1013, + "step": 27388 + }, + { + "epoch": 0.1628901417832334, + "grad_norm": 2.08335542678833, + "learning_rate": 4.679764035853348e-05, + "loss": 5.0172, + "step": 27389 + }, + { + "epoch": 0.16289608906651443, + "grad_norm": 2.1042888164520264, + "learning_rate": 4.679741162873269e-05, + "loss": 5.0088, + "step": 27390 + }, + { + "epoch": 0.16290203634979541, + "grad_norm": 2.0641071796417236, + "learning_rate": 4.679718289132266e-05, + "loss": 4.9374, + "step": 27391 + }, + { + "epoch": 0.1629079836330764, + "grad_norm": 1.855651617050171, + "learning_rate": 4.6796954146303454e-05, + "loss": 5.0419, + "step": 27392 + }, + { + "epoch": 0.16291393091635742, + "grad_norm": 1.8837964534759521, + "learning_rate": 4.679672539367516e-05, + "loss": 5.0203, + "step": 27393 + }, + { + "epoch": 0.1629198781996384, + "grad_norm": 1.9748656749725342, + "learning_rate": 4.679649663343785e-05, + "loss": 5.0305, + "step": 27394 + }, + { + "epoch": 0.1629258254829194, + "grad_norm": 2.2613768577575684, + "learning_rate": 4.67962678655916e-05, + "loss": 4.9047, + "step": 27395 + }, + { + "epoch": 0.1629317727662004, + "grad_norm": 1.583208441734314, + "learning_rate": 4.6796039090136514e-05, + "loss": 4.6715, + "step": 27396 + }, + { + "epoch": 0.1629377200494814, + "grad_norm": 1.6698166131973267, + "learning_rate": 4.679581030707265e-05, + "loss": 5.3792, + "step": 27397 + }, + { + "epoch": 0.1629436673327624, + "grad_norm": 1.778937816619873, + "learning_rate": 4.679558151640009e-05, + "loss": 5.682, + "step": 27398 + }, + { + "epoch": 0.1629496146160434, + "grad_norm": 1.7441314458847046, + "learning_rate": 4.679535271811892e-05, + "loss": 5.2928, + "step": 27399 + }, + { + "epoch": 0.1629555618993244, + "grad_norm": 2.2535476684570312, + "learning_rate": 4.679512391222922e-05, + "loss": 4.9041, + "step": 27400 + }, + { + "epoch": 0.16296150918260538, + "grad_norm": 2.237154483795166, + "learning_rate": 4.679489509873106e-05, + "loss": 4.8852, + "step": 27401 + }, + { + "epoch": 0.1629674564658864, + "grad_norm": 1.7429604530334473, + "learning_rate": 4.679466627762454e-05, + "loss": 4.7548, + "step": 27402 + }, + { + "epoch": 0.16297340374916738, + "grad_norm": 2.02030086517334, + "learning_rate": 4.6794437448909723e-05, + "loss": 4.8708, + "step": 27403 + }, + { + "epoch": 0.16297935103244837, + "grad_norm": 1.5148401260375977, + "learning_rate": 4.6794208612586684e-05, + "loss": 4.8774, + "step": 27404 + }, + { + "epoch": 0.1629852983157294, + "grad_norm": 1.9291085004806519, + "learning_rate": 4.679397976865552e-05, + "loss": 4.7936, + "step": 27405 + }, + { + "epoch": 0.16299124559901038, + "grad_norm": 2.0261623859405518, + "learning_rate": 4.67937509171163e-05, + "loss": 4.5639, + "step": 27406 + }, + { + "epoch": 0.16299719288229136, + "grad_norm": 2.1595592498779297, + "learning_rate": 4.679352205796911e-05, + "loss": 4.7767, + "step": 27407 + }, + { + "epoch": 0.16300314016557238, + "grad_norm": 1.7030655145645142, + "learning_rate": 4.679329319121403e-05, + "loss": 4.9251, + "step": 27408 + }, + { + "epoch": 0.16300908744885337, + "grad_norm": 1.5864980220794678, + "learning_rate": 4.679306431685112e-05, + "loss": 5.0048, + "step": 27409 + }, + { + "epoch": 0.16301503473213436, + "grad_norm": 1.695307970046997, + "learning_rate": 4.679283543488049e-05, + "loss": 5.1882, + "step": 27410 + }, + { + "epoch": 0.16302098201541534, + "grad_norm": 1.4839437007904053, + "learning_rate": 4.6792606545302206e-05, + "loss": 5.3838, + "step": 27411 + }, + { + "epoch": 0.16302692929869636, + "grad_norm": 1.883641242980957, + "learning_rate": 4.6792377648116346e-05, + "loss": 4.9213, + "step": 27412 + }, + { + "epoch": 0.16303287658197735, + "grad_norm": 2.2560174465179443, + "learning_rate": 4.6792148743322985e-05, + "loss": 4.2573, + "step": 27413 + }, + { + "epoch": 0.16303882386525834, + "grad_norm": 2.452279567718506, + "learning_rate": 4.6791919830922225e-05, + "loss": 4.526, + "step": 27414 + }, + { + "epoch": 0.16304477114853935, + "grad_norm": 2.429499387741089, + "learning_rate": 4.679169091091412e-05, + "loss": 4.1269, + "step": 27415 + }, + { + "epoch": 0.16305071843182034, + "grad_norm": 1.7020376920700073, + "learning_rate": 4.6791461983298764e-05, + "loss": 5.367, + "step": 27416 + }, + { + "epoch": 0.16305666571510133, + "grad_norm": 1.6802117824554443, + "learning_rate": 4.679123304807623e-05, + "loss": 5.628, + "step": 27417 + }, + { + "epoch": 0.16306261299838234, + "grad_norm": 1.5536737442016602, + "learning_rate": 4.6791004105246606e-05, + "loss": 4.4013, + "step": 27418 + }, + { + "epoch": 0.16306856028166333, + "grad_norm": 1.6626231670379639, + "learning_rate": 4.6790775154809966e-05, + "loss": 5.1377, + "step": 27419 + }, + { + "epoch": 0.16307450756494432, + "grad_norm": 1.4954432249069214, + "learning_rate": 4.6790546196766395e-05, + "loss": 4.8278, + "step": 27420 + }, + { + "epoch": 0.16308045484822534, + "grad_norm": 2.2759921550750732, + "learning_rate": 4.679031723111597e-05, + "loss": 4.0856, + "step": 27421 + }, + { + "epoch": 0.16308640213150633, + "grad_norm": 2.298222541809082, + "learning_rate": 4.679008825785877e-05, + "loss": 4.169, + "step": 27422 + }, + { + "epoch": 0.1630923494147873, + "grad_norm": 2.435786247253418, + "learning_rate": 4.678985927699486e-05, + "loss": 3.9992, + "step": 27423 + }, + { + "epoch": 0.16309829669806833, + "grad_norm": 2.273677110671997, + "learning_rate": 4.678963028852436e-05, + "loss": 3.689, + "step": 27424 + }, + { + "epoch": 0.16310424398134932, + "grad_norm": 2.1706488132476807, + "learning_rate": 4.6789401292447306e-05, + "loss": 3.7752, + "step": 27425 + }, + { + "epoch": 0.1631101912646303, + "grad_norm": 1.7838464975357056, + "learning_rate": 4.6789172288763804e-05, + "loss": 4.863, + "step": 27426 + }, + { + "epoch": 0.16311613854791132, + "grad_norm": 2.0465335845947266, + "learning_rate": 4.678894327747393e-05, + "loss": 4.8415, + "step": 27427 + }, + { + "epoch": 0.1631220858311923, + "grad_norm": 2.5023603439331055, + "learning_rate": 4.678871425857775e-05, + "loss": 3.8268, + "step": 27428 + }, + { + "epoch": 0.1631280331144733, + "grad_norm": 3.1593286991119385, + "learning_rate": 4.6788485232075366e-05, + "loss": 3.8232, + "step": 27429 + }, + { + "epoch": 0.1631339803977543, + "grad_norm": 2.5644307136535645, + "learning_rate": 4.6788256197966847e-05, + "loss": 3.4984, + "step": 27430 + }, + { + "epoch": 0.1631399276810353, + "grad_norm": 2.0135555267333984, + "learning_rate": 4.678802715625227e-05, + "loss": 4.1888, + "step": 27431 + }, + { + "epoch": 0.1631458749643163, + "grad_norm": 2.4584031105041504, + "learning_rate": 4.678779810693171e-05, + "loss": 4.2168, + "step": 27432 + }, + { + "epoch": 0.1631518222475973, + "grad_norm": 3.071559429168701, + "learning_rate": 4.678756905000526e-05, + "loss": 4.191, + "step": 27433 + }, + { + "epoch": 0.1631577695308783, + "grad_norm": 2.8028981685638428, + "learning_rate": 4.6787339985473e-05, + "loss": 3.9579, + "step": 27434 + }, + { + "epoch": 0.16316371681415928, + "grad_norm": 1.8563295602798462, + "learning_rate": 4.6787110913335006e-05, + "loss": 4.7058, + "step": 27435 + }, + { + "epoch": 0.1631696640974403, + "grad_norm": 1.576141357421875, + "learning_rate": 4.678688183359135e-05, + "loss": 5.2126, + "step": 27436 + }, + { + "epoch": 0.16317561138072129, + "grad_norm": 1.715032935142517, + "learning_rate": 4.6786652746242124e-05, + "loss": 5.1945, + "step": 27437 + }, + { + "epoch": 0.16318155866400227, + "grad_norm": 1.5476752519607544, + "learning_rate": 4.67864236512874e-05, + "loss": 5.523, + "step": 27438 + }, + { + "epoch": 0.1631875059472833, + "grad_norm": 1.4861894845962524, + "learning_rate": 4.6786194548727255e-05, + "loss": 5.4119, + "step": 27439 + }, + { + "epoch": 0.16319345323056428, + "grad_norm": 1.3097593784332275, + "learning_rate": 4.6785965438561784e-05, + "loss": 5.4008, + "step": 27440 + }, + { + "epoch": 0.16319940051384527, + "grad_norm": 1.733404517173767, + "learning_rate": 4.678573632079105e-05, + "loss": 4.4261, + "step": 27441 + }, + { + "epoch": 0.16320534779712628, + "grad_norm": 1.4431440830230713, + "learning_rate": 4.678550719541514e-05, + "loss": 3.8523, + "step": 27442 + }, + { + "epoch": 0.16321129508040727, + "grad_norm": 1.5869112014770508, + "learning_rate": 4.678527806243415e-05, + "loss": 5.0346, + "step": 27443 + }, + { + "epoch": 0.16321724236368826, + "grad_norm": 1.7510712146759033, + "learning_rate": 4.6785048921848127e-05, + "loss": 5.2022, + "step": 27444 + }, + { + "epoch": 0.16322318964696927, + "grad_norm": 2.5091726779937744, + "learning_rate": 4.678481977365717e-05, + "loss": 4.3526, + "step": 27445 + }, + { + "epoch": 0.16322913693025026, + "grad_norm": 2.355930805206299, + "learning_rate": 4.6784590617861365e-05, + "loss": 3.9097, + "step": 27446 + }, + { + "epoch": 0.16323508421353125, + "grad_norm": 2.104262113571167, + "learning_rate": 4.678436145446078e-05, + "loss": 3.9491, + "step": 27447 + }, + { + "epoch": 0.16324103149681227, + "grad_norm": 2.6814212799072266, + "learning_rate": 4.678413228345551e-05, + "loss": 3.9986, + "step": 27448 + }, + { + "epoch": 0.16324697878009325, + "grad_norm": 2.017530679702759, + "learning_rate": 4.678390310484561e-05, + "loss": 4.0997, + "step": 27449 + }, + { + "epoch": 0.16325292606337424, + "grad_norm": 2.437260389328003, + "learning_rate": 4.6783673918631175e-05, + "loss": 4.2466, + "step": 27450 + }, + { + "epoch": 0.16325887334665526, + "grad_norm": 2.4225821495056152, + "learning_rate": 4.67834447248123e-05, + "loss": 4.0411, + "step": 27451 + }, + { + "epoch": 0.16326482062993625, + "grad_norm": 1.833397388458252, + "learning_rate": 4.6783215523389035e-05, + "loss": 4.5873, + "step": 27452 + }, + { + "epoch": 0.16327076791321724, + "grad_norm": 1.7432091236114502, + "learning_rate": 4.6782986314361477e-05, + "loss": 5.3351, + "step": 27453 + }, + { + "epoch": 0.16327671519649825, + "grad_norm": 1.8234552145004272, + "learning_rate": 4.6782757097729704e-05, + "loss": 5.3769, + "step": 27454 + }, + { + "epoch": 0.16328266247977924, + "grad_norm": 1.7435389757156372, + "learning_rate": 4.67825278734938e-05, + "loss": 4.6875, + "step": 27455 + }, + { + "epoch": 0.16328860976306023, + "grad_norm": 2.265040874481201, + "learning_rate": 4.678229864165383e-05, + "loss": 4.6138, + "step": 27456 + }, + { + "epoch": 0.16329455704634124, + "grad_norm": 2.105421304702759, + "learning_rate": 4.678206940220989e-05, + "loss": 4.7799, + "step": 27457 + }, + { + "epoch": 0.16330050432962223, + "grad_norm": 1.9669932126998901, + "learning_rate": 4.678184015516206e-05, + "loss": 4.3826, + "step": 27458 + }, + { + "epoch": 0.16330645161290322, + "grad_norm": 2.2020108699798584, + "learning_rate": 4.6781610900510406e-05, + "loss": 4.7784, + "step": 27459 + }, + { + "epoch": 0.16331239889618424, + "grad_norm": 2.0246944427490234, + "learning_rate": 4.678138163825503e-05, + "loss": 4.5324, + "step": 27460 + }, + { + "epoch": 0.16331834617946522, + "grad_norm": 2.0522918701171875, + "learning_rate": 4.678115236839599e-05, + "loss": 4.1903, + "step": 27461 + }, + { + "epoch": 0.1633242934627462, + "grad_norm": 2.0524399280548096, + "learning_rate": 4.678092309093337e-05, + "loss": 4.5542, + "step": 27462 + }, + { + "epoch": 0.16333024074602723, + "grad_norm": 2.0562379360198975, + "learning_rate": 4.678069380586726e-05, + "loss": 4.6572, + "step": 27463 + }, + { + "epoch": 0.16333618802930822, + "grad_norm": 1.931517481803894, + "learning_rate": 4.678046451319774e-05, + "loss": 4.3204, + "step": 27464 + }, + { + "epoch": 0.1633421353125892, + "grad_norm": 1.852124810218811, + "learning_rate": 4.678023521292487e-05, + "loss": 4.5307, + "step": 27465 + }, + { + "epoch": 0.16334808259587022, + "grad_norm": 1.690384030342102, + "learning_rate": 4.6780005905048764e-05, + "loss": 5.1771, + "step": 27466 + }, + { + "epoch": 0.1633540298791512, + "grad_norm": 1.7573405504226685, + "learning_rate": 4.6779776589569466e-05, + "loss": 4.894, + "step": 27467 + }, + { + "epoch": 0.1633599771624322, + "grad_norm": 2.139704942703247, + "learning_rate": 4.677954726648708e-05, + "loss": 4.7212, + "step": 27468 + }, + { + "epoch": 0.1633659244457132, + "grad_norm": 1.9621661901474, + "learning_rate": 4.677931793580168e-05, + "loss": 4.6083, + "step": 27469 + }, + { + "epoch": 0.1633718717289942, + "grad_norm": 1.9202685356140137, + "learning_rate": 4.6779088597513346e-05, + "loss": 5.3296, + "step": 27470 + }, + { + "epoch": 0.1633778190122752, + "grad_norm": 1.6269041299819946, + "learning_rate": 4.677885925162216e-05, + "loss": 5.4541, + "step": 27471 + }, + { + "epoch": 0.16338376629555618, + "grad_norm": 1.928564190864563, + "learning_rate": 4.677862989812819e-05, + "loss": 4.8419, + "step": 27472 + }, + { + "epoch": 0.1633897135788372, + "grad_norm": 2.1393957138061523, + "learning_rate": 4.677840053703153e-05, + "loss": 4.5768, + "step": 27473 + }, + { + "epoch": 0.16339566086211818, + "grad_norm": 2.2332470417022705, + "learning_rate": 4.677817116833225e-05, + "loss": 4.7571, + "step": 27474 + }, + { + "epoch": 0.16340160814539917, + "grad_norm": 1.7523399591445923, + "learning_rate": 4.6777941792030446e-05, + "loss": 5.0372, + "step": 27475 + }, + { + "epoch": 0.16340755542868018, + "grad_norm": 1.5460946559906006, + "learning_rate": 4.677771240812619e-05, + "loss": 5.1194, + "step": 27476 + }, + { + "epoch": 0.16341350271196117, + "grad_norm": 1.6920409202575684, + "learning_rate": 4.677748301661954e-05, + "loss": 5.0852, + "step": 27477 + }, + { + "epoch": 0.16341944999524216, + "grad_norm": 1.5086921453475952, + "learning_rate": 4.677725361751061e-05, + "loss": 5.2414, + "step": 27478 + }, + { + "epoch": 0.16342539727852318, + "grad_norm": 1.4637200832366943, + "learning_rate": 4.6777024210799465e-05, + "loss": 4.9873, + "step": 27479 + }, + { + "epoch": 0.16343134456180416, + "grad_norm": 1.6477910280227661, + "learning_rate": 4.677679479648618e-05, + "loss": 5.2834, + "step": 27480 + }, + { + "epoch": 0.16343729184508515, + "grad_norm": 1.7025471925735474, + "learning_rate": 4.6776565374570844e-05, + "loss": 5.3655, + "step": 27481 + }, + { + "epoch": 0.16344323912836617, + "grad_norm": 1.8360841274261475, + "learning_rate": 4.677633594505354e-05, + "loss": 4.4539, + "step": 27482 + }, + { + "epoch": 0.16344918641164716, + "grad_norm": 2.10629940032959, + "learning_rate": 4.6776106507934336e-05, + "loss": 4.2894, + "step": 27483 + }, + { + "epoch": 0.16345513369492815, + "grad_norm": 1.706100583076477, + "learning_rate": 4.677587706321333e-05, + "loss": 4.7572, + "step": 27484 + }, + { + "epoch": 0.16346108097820916, + "grad_norm": 1.518978238105774, + "learning_rate": 4.677564761089057e-05, + "loss": 5.8137, + "step": 27485 + }, + { + "epoch": 0.16346702826149015, + "grad_norm": 1.903784155845642, + "learning_rate": 4.677541815096617e-05, + "loss": 4.7093, + "step": 27486 + }, + { + "epoch": 0.16347297554477114, + "grad_norm": 1.9231067895889282, + "learning_rate": 4.677518868344019e-05, + "loss": 4.6492, + "step": 27487 + }, + { + "epoch": 0.16347892282805215, + "grad_norm": 1.5489968061447144, + "learning_rate": 4.6774959208312717e-05, + "loss": 5.1375, + "step": 27488 + }, + { + "epoch": 0.16348487011133314, + "grad_norm": 1.6851353645324707, + "learning_rate": 4.677472972558383e-05, + "loss": 5.3354, + "step": 27489 + }, + { + "epoch": 0.16349081739461413, + "grad_norm": 1.6556458473205566, + "learning_rate": 4.6774500235253614e-05, + "loss": 4.4959, + "step": 27490 + }, + { + "epoch": 0.16349676467789515, + "grad_norm": 1.8800296783447266, + "learning_rate": 4.6774270737322145e-05, + "loss": 4.0961, + "step": 27491 + }, + { + "epoch": 0.16350271196117613, + "grad_norm": 1.847226858139038, + "learning_rate": 4.67740412317895e-05, + "loss": 4.0567, + "step": 27492 + }, + { + "epoch": 0.16350865924445712, + "grad_norm": 1.8994855880737305, + "learning_rate": 4.6773811718655766e-05, + "loss": 4.8829, + "step": 27493 + }, + { + "epoch": 0.16351460652773814, + "grad_norm": 1.6551505327224731, + "learning_rate": 4.677358219792102e-05, + "loss": 5.0247, + "step": 27494 + }, + { + "epoch": 0.16352055381101913, + "grad_norm": 1.6510465145111084, + "learning_rate": 4.6773352669585336e-05, + "loss": 5.2324, + "step": 27495 + }, + { + "epoch": 0.16352650109430011, + "grad_norm": 1.851661205291748, + "learning_rate": 4.67731231336488e-05, + "loss": 4.1622, + "step": 27496 + }, + { + "epoch": 0.16353244837758113, + "grad_norm": 1.9479695558547974, + "learning_rate": 4.67728935901115e-05, + "loss": 3.9269, + "step": 27497 + }, + { + "epoch": 0.16353839566086212, + "grad_norm": 1.8207287788391113, + "learning_rate": 4.67726640389735e-05, + "loss": 3.8434, + "step": 27498 + }, + { + "epoch": 0.1635443429441431, + "grad_norm": 1.8698455095291138, + "learning_rate": 4.677243448023489e-05, + "loss": 3.9786, + "step": 27499 + }, + { + "epoch": 0.16355029022742412, + "grad_norm": 1.8257921934127808, + "learning_rate": 4.6772204913895746e-05, + "loss": 3.947, + "step": 27500 + }, + { + "epoch": 0.1635562375107051, + "grad_norm": 1.6152242422103882, + "learning_rate": 4.6771975339956155e-05, + "loss": 4.4898, + "step": 27501 + }, + { + "epoch": 0.1635621847939861, + "grad_norm": 1.956666350364685, + "learning_rate": 4.6771745758416185e-05, + "loss": 4.8584, + "step": 27502 + }, + { + "epoch": 0.16356813207726711, + "grad_norm": 1.8477699756622314, + "learning_rate": 4.677151616927593e-05, + "loss": 5.0331, + "step": 27503 + }, + { + "epoch": 0.1635740793605481, + "grad_norm": 1.705209732055664, + "learning_rate": 4.677128657253545e-05, + "loss": 4.193, + "step": 27504 + }, + { + "epoch": 0.1635800266438291, + "grad_norm": 1.8259029388427734, + "learning_rate": 4.677105696819486e-05, + "loss": 3.8187, + "step": 27505 + }, + { + "epoch": 0.1635859739271101, + "grad_norm": 1.633556604385376, + "learning_rate": 4.677082735625421e-05, + "loss": 3.8045, + "step": 27506 + }, + { + "epoch": 0.1635919212103911, + "grad_norm": 1.7349916696548462, + "learning_rate": 4.677059773671358e-05, + "loss": 4.1425, + "step": 27507 + }, + { + "epoch": 0.16359786849367208, + "grad_norm": 1.8932249546051025, + "learning_rate": 4.677036810957307e-05, + "loss": 4.838, + "step": 27508 + }, + { + "epoch": 0.1636038157769531, + "grad_norm": 1.6211893558502197, + "learning_rate": 4.677013847483275e-05, + "loss": 5.2038, + "step": 27509 + }, + { + "epoch": 0.1636097630602341, + "grad_norm": 1.7109664678573608, + "learning_rate": 4.6769908832492694e-05, + "loss": 4.8308, + "step": 27510 + }, + { + "epoch": 0.16361571034351508, + "grad_norm": 1.603644847869873, + "learning_rate": 4.6769679182553e-05, + "loss": 4.8959, + "step": 27511 + }, + { + "epoch": 0.1636216576267961, + "grad_norm": 1.6871256828308105, + "learning_rate": 4.676944952501372e-05, + "loss": 4.7762, + "step": 27512 + }, + { + "epoch": 0.16362760491007708, + "grad_norm": 1.5820897817611694, + "learning_rate": 4.676921985987496e-05, + "loss": 4.4533, + "step": 27513 + }, + { + "epoch": 0.16363355219335807, + "grad_norm": 1.6850042343139648, + "learning_rate": 4.676899018713678e-05, + "loss": 4.7149, + "step": 27514 + }, + { + "epoch": 0.16363949947663908, + "grad_norm": 1.6211190223693848, + "learning_rate": 4.676876050679928e-05, + "loss": 5.1372, + "step": 27515 + }, + { + "epoch": 0.16364544675992007, + "grad_norm": 1.7970921993255615, + "learning_rate": 4.676853081886252e-05, + "loss": 4.9738, + "step": 27516 + }, + { + "epoch": 0.16365139404320106, + "grad_norm": 1.9819167852401733, + "learning_rate": 4.67683011233266e-05, + "loss": 4.9069, + "step": 27517 + }, + { + "epoch": 0.16365734132648208, + "grad_norm": 1.9208866357803345, + "learning_rate": 4.6768071420191596e-05, + "loss": 4.6224, + "step": 27518 + }, + { + "epoch": 0.16366328860976306, + "grad_norm": 1.4924341440200806, + "learning_rate": 4.676784170945757e-05, + "loss": 4.4268, + "step": 27519 + }, + { + "epoch": 0.16366923589304405, + "grad_norm": 1.5947877168655396, + "learning_rate": 4.676761199112462e-05, + "loss": 4.231, + "step": 27520 + }, + { + "epoch": 0.16367518317632507, + "grad_norm": 1.4336072206497192, + "learning_rate": 4.676738226519283e-05, + "loss": 4.7233, + "step": 27521 + }, + { + "epoch": 0.16368113045960606, + "grad_norm": 1.496932864189148, + "learning_rate": 4.676715253166226e-05, + "loss": 4.2295, + "step": 27522 + }, + { + "epoch": 0.16368707774288704, + "grad_norm": 1.3215701580047607, + "learning_rate": 4.6766922790533005e-05, + "loss": 4.2627, + "step": 27523 + }, + { + "epoch": 0.16369302502616806, + "grad_norm": 1.524957299232483, + "learning_rate": 4.676669304180514e-05, + "loss": 4.5299, + "step": 27524 + }, + { + "epoch": 0.16369897230944905, + "grad_norm": 2.0174505710601807, + "learning_rate": 4.676646328547876e-05, + "loss": 4.8986, + "step": 27525 + }, + { + "epoch": 0.16370491959273004, + "grad_norm": 1.6895251274108887, + "learning_rate": 4.676623352155392e-05, + "loss": 4.6933, + "step": 27526 + }, + { + "epoch": 0.16371086687601105, + "grad_norm": 1.3915743827819824, + "learning_rate": 4.676600375003072e-05, + "loss": 4.3735, + "step": 27527 + }, + { + "epoch": 0.16371681415929204, + "grad_norm": 2.5097527503967285, + "learning_rate": 4.6765773970909224e-05, + "loss": 4.7227, + "step": 27528 + }, + { + "epoch": 0.16372276144257303, + "grad_norm": 1.4059836864471436, + "learning_rate": 4.676554418418953e-05, + "loss": 4.3861, + "step": 27529 + }, + { + "epoch": 0.16372870872585402, + "grad_norm": 1.5270711183547974, + "learning_rate": 4.6765314389871704e-05, + "loss": 4.4302, + "step": 27530 + }, + { + "epoch": 0.16373465600913503, + "grad_norm": 1.8292162418365479, + "learning_rate": 4.676508458795583e-05, + "loss": 4.697, + "step": 27531 + }, + { + "epoch": 0.16374060329241602, + "grad_norm": 1.8712737560272217, + "learning_rate": 4.6764854778442e-05, + "loss": 4.6228, + "step": 27532 + }, + { + "epoch": 0.163746550575697, + "grad_norm": 1.551424503326416, + "learning_rate": 4.6764624961330274e-05, + "loss": 5.1146, + "step": 27533 + }, + { + "epoch": 0.16375249785897802, + "grad_norm": 1.522362232208252, + "learning_rate": 4.6764395136620745e-05, + "loss": 4.8196, + "step": 27534 + }, + { + "epoch": 0.163758445142259, + "grad_norm": 2.196622371673584, + "learning_rate": 4.676416530431349e-05, + "loss": 4.6695, + "step": 27535 + }, + { + "epoch": 0.16376439242554, + "grad_norm": 1.7196024656295776, + "learning_rate": 4.676393546440859e-05, + "loss": 4.3153, + "step": 27536 + }, + { + "epoch": 0.16377033970882102, + "grad_norm": 1.841454267501831, + "learning_rate": 4.676370561690613e-05, + "loss": 3.9704, + "step": 27537 + }, + { + "epoch": 0.163776286992102, + "grad_norm": 1.8239476680755615, + "learning_rate": 4.6763475761806185e-05, + "loss": 3.9419, + "step": 27538 + }, + { + "epoch": 0.163782234275383, + "grad_norm": 1.8012974262237549, + "learning_rate": 4.6763245899108834e-05, + "loss": 3.9246, + "step": 27539 + }, + { + "epoch": 0.163788181558664, + "grad_norm": 1.7155267000198364, + "learning_rate": 4.676301602881415e-05, + "loss": 4.7766, + "step": 27540 + }, + { + "epoch": 0.163794128841945, + "grad_norm": 1.986662745475769, + "learning_rate": 4.676278615092223e-05, + "loss": 4.5932, + "step": 27541 + }, + { + "epoch": 0.16380007612522599, + "grad_norm": 1.7661755084991455, + "learning_rate": 4.676255626543314e-05, + "loss": 4.2295, + "step": 27542 + }, + { + "epoch": 0.163806023408507, + "grad_norm": 1.7953100204467773, + "learning_rate": 4.676232637234698e-05, + "loss": 3.7245, + "step": 27543 + }, + { + "epoch": 0.163811970691788, + "grad_norm": 1.8963271379470825, + "learning_rate": 4.6762096471663805e-05, + "loss": 3.7599, + "step": 27544 + }, + { + "epoch": 0.16381791797506898, + "grad_norm": 1.8365765810012817, + "learning_rate": 4.676186656338371e-05, + "loss": 3.8955, + "step": 27545 + }, + { + "epoch": 0.16382386525835, + "grad_norm": 1.7611230611801147, + "learning_rate": 4.676163664750677e-05, + "loss": 3.7164, + "step": 27546 + }, + { + "epoch": 0.16382981254163098, + "grad_norm": 1.6881484985351562, + "learning_rate": 4.676140672403307e-05, + "loss": 3.905, + "step": 27547 + }, + { + "epoch": 0.16383575982491197, + "grad_norm": 1.655831217765808, + "learning_rate": 4.676117679296269e-05, + "loss": 4.9185, + "step": 27548 + }, + { + "epoch": 0.16384170710819299, + "grad_norm": 1.602988839149475, + "learning_rate": 4.6760946854295707e-05, + "loss": 5.228, + "step": 27549 + }, + { + "epoch": 0.16384765439147397, + "grad_norm": 1.6523774862289429, + "learning_rate": 4.67607169080322e-05, + "loss": 5.2095, + "step": 27550 + }, + { + "epoch": 0.16385360167475496, + "grad_norm": 2.0141515731811523, + "learning_rate": 4.676048695417224e-05, + "loss": 5.2764, + "step": 27551 + }, + { + "epoch": 0.16385954895803598, + "grad_norm": 1.824358344078064, + "learning_rate": 4.676025699271594e-05, + "loss": 4.4083, + "step": 27552 + }, + { + "epoch": 0.16386549624131697, + "grad_norm": 1.90078604221344, + "learning_rate": 4.676002702366334e-05, + "loss": 4.3142, + "step": 27553 + }, + { + "epoch": 0.16387144352459795, + "grad_norm": 2.1593260765075684, + "learning_rate": 4.6759797047014554e-05, + "loss": 4.8884, + "step": 27554 + }, + { + "epoch": 0.16387739080787897, + "grad_norm": 1.6608953475952148, + "learning_rate": 4.675956706276965e-05, + "loss": 5.0272, + "step": 27555 + }, + { + "epoch": 0.16388333809115996, + "grad_norm": 1.6689786911010742, + "learning_rate": 4.67593370709287e-05, + "loss": 4.8278, + "step": 27556 + }, + { + "epoch": 0.16388928537444095, + "grad_norm": 1.5720055103302002, + "learning_rate": 4.675910707149178e-05, + "loss": 4.9288, + "step": 27557 + }, + { + "epoch": 0.16389523265772196, + "grad_norm": 1.6609811782836914, + "learning_rate": 4.675887706445899e-05, + "loss": 4.9233, + "step": 27558 + }, + { + "epoch": 0.16390117994100295, + "grad_norm": 1.7448883056640625, + "learning_rate": 4.6758647049830405e-05, + "loss": 4.8793, + "step": 27559 + }, + { + "epoch": 0.16390712722428394, + "grad_norm": 1.728389859199524, + "learning_rate": 4.6758417027606094e-05, + "loss": 5.2122, + "step": 27560 + }, + { + "epoch": 0.16391307450756495, + "grad_norm": 1.4038145542144775, + "learning_rate": 4.675818699778615e-05, + "loss": 5.1715, + "step": 27561 + }, + { + "epoch": 0.16391902179084594, + "grad_norm": 1.7425341606140137, + "learning_rate": 4.675795696037064e-05, + "loss": 5.3856, + "step": 27562 + }, + { + "epoch": 0.16392496907412693, + "grad_norm": 1.6463298797607422, + "learning_rate": 4.675772691535966e-05, + "loss": 4.8584, + "step": 27563 + }, + { + "epoch": 0.16393091635740795, + "grad_norm": 1.8424142599105835, + "learning_rate": 4.675749686275328e-05, + "loss": 4.7667, + "step": 27564 + }, + { + "epoch": 0.16393686364068893, + "grad_norm": 2.32179594039917, + "learning_rate": 4.675726680255158e-05, + "loss": 4.2014, + "step": 27565 + }, + { + "epoch": 0.16394281092396992, + "grad_norm": 2.380255699157715, + "learning_rate": 4.675703673475464e-05, + "loss": 4.5618, + "step": 27566 + }, + { + "epoch": 0.16394875820725094, + "grad_norm": 1.846535563468933, + "learning_rate": 4.675680665936255e-05, + "loss": 4.9291, + "step": 27567 + }, + { + "epoch": 0.16395470549053193, + "grad_norm": 1.9701546430587769, + "learning_rate": 4.675657657637538e-05, + "loss": 4.4594, + "step": 27568 + }, + { + "epoch": 0.16396065277381291, + "grad_norm": 2.15051007270813, + "learning_rate": 4.675634648579322e-05, + "loss": 4.0397, + "step": 27569 + }, + { + "epoch": 0.16396660005709393, + "grad_norm": 1.7181464433670044, + "learning_rate": 4.6756116387616136e-05, + "loss": 5.0483, + "step": 27570 + }, + { + "epoch": 0.16397254734037492, + "grad_norm": 1.3659751415252686, + "learning_rate": 4.675588628184422e-05, + "loss": 5.0627, + "step": 27571 + }, + { + "epoch": 0.1639784946236559, + "grad_norm": 1.7381535768508911, + "learning_rate": 4.6755656168477553e-05, + "loss": 4.8013, + "step": 27572 + }, + { + "epoch": 0.16398444190693692, + "grad_norm": 1.9152921438217163, + "learning_rate": 4.6755426047516205e-05, + "loss": 4.5437, + "step": 27573 + }, + { + "epoch": 0.1639903891902179, + "grad_norm": 1.449018955230713, + "learning_rate": 4.675519591896026e-05, + "loss": 5.046, + "step": 27574 + }, + { + "epoch": 0.1639963364734989, + "grad_norm": 2.2243831157684326, + "learning_rate": 4.675496578280981e-05, + "loss": 4.0585, + "step": 27575 + }, + { + "epoch": 0.16400228375677992, + "grad_norm": 1.9781684875488281, + "learning_rate": 4.675473563906492e-05, + "loss": 4.6334, + "step": 27576 + }, + { + "epoch": 0.1640082310400609, + "grad_norm": 1.9873735904693604, + "learning_rate": 4.675450548772568e-05, + "loss": 4.6854, + "step": 27577 + }, + { + "epoch": 0.1640141783233419, + "grad_norm": 1.914959192276001, + "learning_rate": 4.675427532879216e-05, + "loss": 4.7866, + "step": 27578 + }, + { + "epoch": 0.1640201256066229, + "grad_norm": 1.8510034084320068, + "learning_rate": 4.675404516226446e-05, + "loss": 4.4274, + "step": 27579 + }, + { + "epoch": 0.1640260728899039, + "grad_norm": 1.726172924041748, + "learning_rate": 4.6753814988142644e-05, + "loss": 4.4166, + "step": 27580 + }, + { + "epoch": 0.16403202017318488, + "grad_norm": 1.7206041812896729, + "learning_rate": 4.6753584806426786e-05, + "loss": 4.3724, + "step": 27581 + }, + { + "epoch": 0.1640379674564659, + "grad_norm": 1.9253183603286743, + "learning_rate": 4.6753354617116987e-05, + "loss": 3.8641, + "step": 27582 + }, + { + "epoch": 0.1640439147397469, + "grad_norm": 1.9023802280426025, + "learning_rate": 4.6753124420213306e-05, + "loss": 4.231, + "step": 27583 + }, + { + "epoch": 0.16404986202302788, + "grad_norm": 2.092531442642212, + "learning_rate": 4.675289421571584e-05, + "loss": 4.4025, + "step": 27584 + }, + { + "epoch": 0.1640558093063089, + "grad_norm": 2.0559768676757812, + "learning_rate": 4.675266400362466e-05, + "loss": 4.4643, + "step": 27585 + }, + { + "epoch": 0.16406175658958988, + "grad_norm": 2.1016385555267334, + "learning_rate": 4.6752433783939855e-05, + "loss": 4.4391, + "step": 27586 + }, + { + "epoch": 0.16406770387287087, + "grad_norm": 2.07698130607605, + "learning_rate": 4.67522035566615e-05, + "loss": 4.483, + "step": 27587 + }, + { + "epoch": 0.16407365115615186, + "grad_norm": 2.172579288482666, + "learning_rate": 4.6751973321789675e-05, + "loss": 4.2118, + "step": 27588 + }, + { + "epoch": 0.16407959843943287, + "grad_norm": 2.1808786392211914, + "learning_rate": 4.675174307932446e-05, + "loss": 4.4722, + "step": 27589 + }, + { + "epoch": 0.16408554572271386, + "grad_norm": 2.163482427597046, + "learning_rate": 4.675151282926593e-05, + "loss": 4.747, + "step": 27590 + }, + { + "epoch": 0.16409149300599485, + "grad_norm": 2.431328773498535, + "learning_rate": 4.675128257161418e-05, + "loss": 4.0239, + "step": 27591 + }, + { + "epoch": 0.16409744028927586, + "grad_norm": 2.2003822326660156, + "learning_rate": 4.675105230636928e-05, + "loss": 4.2945, + "step": 27592 + }, + { + "epoch": 0.16410338757255685, + "grad_norm": 1.8259824514389038, + "learning_rate": 4.675082203353131e-05, + "loss": 4.3246, + "step": 27593 + }, + { + "epoch": 0.16410933485583784, + "grad_norm": 2.02915358543396, + "learning_rate": 4.6750591753100356e-05, + "loss": 5.5056, + "step": 27594 + }, + { + "epoch": 0.16411528213911886, + "grad_norm": 2.2010276317596436, + "learning_rate": 4.675036146507649e-05, + "loss": 5.3688, + "step": 27595 + }, + { + "epoch": 0.16412122942239984, + "grad_norm": 1.8411953449249268, + "learning_rate": 4.6750131169459806e-05, + "loss": 5.6156, + "step": 27596 + }, + { + "epoch": 0.16412717670568083, + "grad_norm": 1.8446851968765259, + "learning_rate": 4.674990086625037e-05, + "loss": 5.1344, + "step": 27597 + }, + { + "epoch": 0.16413312398896185, + "grad_norm": 1.7121134996414185, + "learning_rate": 4.674967055544827e-05, + "loss": 5.164, + "step": 27598 + }, + { + "epoch": 0.16413907127224284, + "grad_norm": 1.68525230884552, + "learning_rate": 4.6749440237053574e-05, + "loss": 4.9757, + "step": 27599 + }, + { + "epoch": 0.16414501855552383, + "grad_norm": 1.9436984062194824, + "learning_rate": 4.6749209911066396e-05, + "loss": 4.4168, + "step": 27600 + }, + { + "epoch": 0.16415096583880484, + "grad_norm": 1.9261338710784912, + "learning_rate": 4.6748979577486774e-05, + "loss": 4.5949, + "step": 27601 + }, + { + "epoch": 0.16415691312208583, + "grad_norm": 1.4877192974090576, + "learning_rate": 4.6748749236314816e-05, + "loss": 5.0274, + "step": 27602 + }, + { + "epoch": 0.16416286040536682, + "grad_norm": 2.030029296875, + "learning_rate": 4.674851888755059e-05, + "loss": 5.3301, + "step": 27603 + }, + { + "epoch": 0.16416880768864783, + "grad_norm": 1.4313018321990967, + "learning_rate": 4.674828853119418e-05, + "loss": 4.9408, + "step": 27604 + }, + { + "epoch": 0.16417475497192882, + "grad_norm": 1.4011638164520264, + "learning_rate": 4.674805816724568e-05, + "loss": 5.2628, + "step": 27605 + }, + { + "epoch": 0.1641807022552098, + "grad_norm": 1.6607071161270142, + "learning_rate": 4.674782779570514e-05, + "loss": 5.0739, + "step": 27606 + }, + { + "epoch": 0.16418664953849083, + "grad_norm": 2.07830810546875, + "learning_rate": 4.674759741657267e-05, + "loss": 4.7624, + "step": 27607 + }, + { + "epoch": 0.1641925968217718, + "grad_norm": 2.402186870574951, + "learning_rate": 4.674736702984833e-05, + "loss": 4.2407, + "step": 27608 + }, + { + "epoch": 0.1641985441050528, + "grad_norm": 2.498345136642456, + "learning_rate": 4.674713663553222e-05, + "loss": 4.2357, + "step": 27609 + }, + { + "epoch": 0.16420449138833382, + "grad_norm": 2.4307384490966797, + "learning_rate": 4.67469062336244e-05, + "loss": 4.4379, + "step": 27610 + }, + { + "epoch": 0.1642104386716148, + "grad_norm": 1.721940279006958, + "learning_rate": 4.6746675824124964e-05, + "loss": 4.4393, + "step": 27611 + }, + { + "epoch": 0.1642163859548958, + "grad_norm": 1.9504097700119019, + "learning_rate": 4.674644540703399e-05, + "loss": 4.753, + "step": 27612 + }, + { + "epoch": 0.1642223332381768, + "grad_norm": 2.2953338623046875, + "learning_rate": 4.674621498235155e-05, + "loss": 4.7865, + "step": 27613 + }, + { + "epoch": 0.1642282805214578, + "grad_norm": 2.291163921356201, + "learning_rate": 4.674598455007773e-05, + "loss": 4.7659, + "step": 27614 + }, + { + "epoch": 0.16423422780473879, + "grad_norm": 2.1821818351745605, + "learning_rate": 4.674575411021262e-05, + "loss": 4.0771, + "step": 27615 + }, + { + "epoch": 0.1642401750880198, + "grad_norm": 2.2602016925811768, + "learning_rate": 4.6745523662756286e-05, + "loss": 4.2426, + "step": 27616 + }, + { + "epoch": 0.1642461223713008, + "grad_norm": 1.443772792816162, + "learning_rate": 4.674529320770882e-05, + "loss": 5.2936, + "step": 27617 + }, + { + "epoch": 0.16425206965458178, + "grad_norm": 2.0360827445983887, + "learning_rate": 4.674506274507029e-05, + "loss": 5.3444, + "step": 27618 + }, + { + "epoch": 0.1642580169378628, + "grad_norm": 1.7705327272415161, + "learning_rate": 4.6744832274840786e-05, + "loss": 5.0619, + "step": 27619 + }, + { + "epoch": 0.16426396422114378, + "grad_norm": 2.3405168056488037, + "learning_rate": 4.6744601797020384e-05, + "loss": 4.0113, + "step": 27620 + }, + { + "epoch": 0.16426991150442477, + "grad_norm": 1.6145120859146118, + "learning_rate": 4.674437131160917e-05, + "loss": 4.87, + "step": 27621 + }, + { + "epoch": 0.1642758587877058, + "grad_norm": 1.7102009057998657, + "learning_rate": 4.674414081860722e-05, + "loss": 5.2878, + "step": 27622 + }, + { + "epoch": 0.16428180607098677, + "grad_norm": 1.5974667072296143, + "learning_rate": 4.674391031801461e-05, + "loss": 5.1225, + "step": 27623 + }, + { + "epoch": 0.16428775335426776, + "grad_norm": 1.7934401035308838, + "learning_rate": 4.674367980983143e-05, + "loss": 5.1496, + "step": 27624 + }, + { + "epoch": 0.16429370063754878, + "grad_norm": 1.625554084777832, + "learning_rate": 4.674344929405775e-05, + "loss": 4.9198, + "step": 27625 + }, + { + "epoch": 0.16429964792082977, + "grad_norm": 1.5650711059570312, + "learning_rate": 4.674321877069366e-05, + "loss": 5.6505, + "step": 27626 + }, + { + "epoch": 0.16430559520411075, + "grad_norm": 1.8613455295562744, + "learning_rate": 4.674298823973924e-05, + "loss": 5.6026, + "step": 27627 + }, + { + "epoch": 0.16431154248739177, + "grad_norm": 1.617720603942871, + "learning_rate": 4.674275770119457e-05, + "loss": 5.4009, + "step": 27628 + }, + { + "epoch": 0.16431748977067276, + "grad_norm": 1.937449336051941, + "learning_rate": 4.6742527155059724e-05, + "loss": 4.8275, + "step": 27629 + }, + { + "epoch": 0.16432343705395375, + "grad_norm": 2.541095733642578, + "learning_rate": 4.674229660133479e-05, + "loss": 4.0442, + "step": 27630 + }, + { + "epoch": 0.16432938433723476, + "grad_norm": 2.760444402694702, + "learning_rate": 4.674206604001984e-05, + "loss": 3.19, + "step": 27631 + }, + { + "epoch": 0.16433533162051575, + "grad_norm": 2.561680316925049, + "learning_rate": 4.674183547111496e-05, + "loss": 3.9053, + "step": 27632 + }, + { + "epoch": 0.16434127890379674, + "grad_norm": 2.6636784076690674, + "learning_rate": 4.6741604894620225e-05, + "loss": 4.1, + "step": 27633 + }, + { + "epoch": 0.16434722618707776, + "grad_norm": 2.010796070098877, + "learning_rate": 4.674137431053573e-05, + "loss": 4.5599, + "step": 27634 + }, + { + "epoch": 0.16435317347035874, + "grad_norm": 2.131115198135376, + "learning_rate": 4.674114371886154e-05, + "loss": 4.2314, + "step": 27635 + }, + { + "epoch": 0.16435912075363973, + "grad_norm": 2.2468631267547607, + "learning_rate": 4.674091311959774e-05, + "loss": 4.3132, + "step": 27636 + }, + { + "epoch": 0.16436506803692075, + "grad_norm": 2.325503349304199, + "learning_rate": 4.674068251274442e-05, + "loss": 3.7301, + "step": 27637 + }, + { + "epoch": 0.16437101532020174, + "grad_norm": 2.631612777709961, + "learning_rate": 4.6740451898301646e-05, + "loss": 3.6578, + "step": 27638 + }, + { + "epoch": 0.16437696260348272, + "grad_norm": 2.2272074222564697, + "learning_rate": 4.67402212762695e-05, + "loss": 4.1707, + "step": 27639 + }, + { + "epoch": 0.16438290988676374, + "grad_norm": 1.6620466709136963, + "learning_rate": 4.673999064664808e-05, + "loss": 5.1998, + "step": 27640 + }, + { + "epoch": 0.16438885717004473, + "grad_norm": 2.39687442779541, + "learning_rate": 4.673976000943745e-05, + "loss": 4.99, + "step": 27641 + }, + { + "epoch": 0.16439480445332572, + "grad_norm": 2.3301122188568115, + "learning_rate": 4.673952936463769e-05, + "loss": 4.7562, + "step": 27642 + }, + { + "epoch": 0.16440075173660673, + "grad_norm": 2.335031509399414, + "learning_rate": 4.6739298712248887e-05, + "loss": 4.6406, + "step": 27643 + }, + { + "epoch": 0.16440669901988772, + "grad_norm": 2.3373608589172363, + "learning_rate": 4.6739068052271115e-05, + "loss": 4.555, + "step": 27644 + }, + { + "epoch": 0.1644126463031687, + "grad_norm": 1.887984037399292, + "learning_rate": 4.6738837384704463e-05, + "loss": 5.0687, + "step": 27645 + }, + { + "epoch": 0.1644185935864497, + "grad_norm": 2.8348052501678467, + "learning_rate": 4.673860670954901e-05, + "loss": 5.0324, + "step": 27646 + }, + { + "epoch": 0.1644245408697307, + "grad_norm": 2.3812403678894043, + "learning_rate": 4.673837602680483e-05, + "loss": 5.1471, + "step": 27647 + }, + { + "epoch": 0.1644304881530117, + "grad_norm": 2.797342300415039, + "learning_rate": 4.673814533647201e-05, + "loss": 4.9506, + "step": 27648 + }, + { + "epoch": 0.1644364354362927, + "grad_norm": 2.2026922702789307, + "learning_rate": 4.673791463855063e-05, + "loss": 4.8893, + "step": 27649 + }, + { + "epoch": 0.1644423827195737, + "grad_norm": 1.6675883531570435, + "learning_rate": 4.6737683933040766e-05, + "loss": 5.247, + "step": 27650 + }, + { + "epoch": 0.1644483300028547, + "grad_norm": 1.771507978439331, + "learning_rate": 4.6737453219942495e-05, + "loss": 5.0371, + "step": 27651 + }, + { + "epoch": 0.16445427728613568, + "grad_norm": 1.753718614578247, + "learning_rate": 4.6737222499255914e-05, + "loss": 4.9673, + "step": 27652 + }, + { + "epoch": 0.1644602245694167, + "grad_norm": 2.460538387298584, + "learning_rate": 4.673699177098109e-05, + "loss": 5.0578, + "step": 27653 + }, + { + "epoch": 0.16446617185269768, + "grad_norm": 2.2908952236175537, + "learning_rate": 4.6736761035118104e-05, + "loss": 4.9473, + "step": 27654 + }, + { + "epoch": 0.16447211913597867, + "grad_norm": 2.1169328689575195, + "learning_rate": 4.673653029166704e-05, + "loss": 4.8466, + "step": 27655 + }, + { + "epoch": 0.1644780664192597, + "grad_norm": 1.8647359609603882, + "learning_rate": 4.673629954062797e-05, + "loss": 4.9256, + "step": 27656 + }, + { + "epoch": 0.16448401370254068, + "grad_norm": 2.2176151275634766, + "learning_rate": 4.6736068782001e-05, + "loss": 5.1344, + "step": 27657 + }, + { + "epoch": 0.16448996098582166, + "grad_norm": 2.300567626953125, + "learning_rate": 4.6735838015786185e-05, + "loss": 4.9018, + "step": 27658 + }, + { + "epoch": 0.16449590826910268, + "grad_norm": 2.458017110824585, + "learning_rate": 4.673560724198361e-05, + "loss": 5.2333, + "step": 27659 + }, + { + "epoch": 0.16450185555238367, + "grad_norm": 2.418851852416992, + "learning_rate": 4.673537646059336e-05, + "loss": 5.0428, + "step": 27660 + }, + { + "epoch": 0.16450780283566466, + "grad_norm": 2.163425922393799, + "learning_rate": 4.673514567161551e-05, + "loss": 5.2115, + "step": 27661 + }, + { + "epoch": 0.16451375011894567, + "grad_norm": 2.171957492828369, + "learning_rate": 4.673491487505015e-05, + "loss": 5.1336, + "step": 27662 + }, + { + "epoch": 0.16451969740222666, + "grad_norm": 1.6024816036224365, + "learning_rate": 4.6734684070897364e-05, + "loss": 5.2832, + "step": 27663 + }, + { + "epoch": 0.16452564468550765, + "grad_norm": 2.581366777420044, + "learning_rate": 4.673445325915722e-05, + "loss": 4.2245, + "step": 27664 + }, + { + "epoch": 0.16453159196878867, + "grad_norm": 2.65466570854187, + "learning_rate": 4.67342224398298e-05, + "loss": 3.8786, + "step": 27665 + }, + { + "epoch": 0.16453753925206965, + "grad_norm": 1.909327745437622, + "learning_rate": 4.673399161291519e-05, + "loss": 5.2398, + "step": 27666 + }, + { + "epoch": 0.16454348653535064, + "grad_norm": 2.0884993076324463, + "learning_rate": 4.673376077841346e-05, + "loss": 4.8081, + "step": 27667 + }, + { + "epoch": 0.16454943381863166, + "grad_norm": 2.1802215576171875, + "learning_rate": 4.67335299363247e-05, + "loss": 4.9251, + "step": 27668 + }, + { + "epoch": 0.16455538110191265, + "grad_norm": 2.281020402908325, + "learning_rate": 4.6733299086648996e-05, + "loss": 4.2682, + "step": 27669 + }, + { + "epoch": 0.16456132838519363, + "grad_norm": 2.34698224067688, + "learning_rate": 4.673306822938642e-05, + "loss": 3.8815, + "step": 27670 + }, + { + "epoch": 0.16456727566847465, + "grad_norm": 2.84965181350708, + "learning_rate": 4.673283736453705e-05, + "loss": 3.8124, + "step": 27671 + }, + { + "epoch": 0.16457322295175564, + "grad_norm": 2.604818344116211, + "learning_rate": 4.673260649210098e-05, + "loss": 3.8991, + "step": 27672 + }, + { + "epoch": 0.16457917023503663, + "grad_norm": 2.5472776889801025, + "learning_rate": 4.673237561207827e-05, + "loss": 3.8003, + "step": 27673 + }, + { + "epoch": 0.16458511751831764, + "grad_norm": 1.9040625095367432, + "learning_rate": 4.673214472446902e-05, + "loss": 4.1075, + "step": 27674 + }, + { + "epoch": 0.16459106480159863, + "grad_norm": 1.5493569374084473, + "learning_rate": 4.6731913829273303e-05, + "loss": 5.5934, + "step": 27675 + }, + { + "epoch": 0.16459701208487962, + "grad_norm": 1.683307409286499, + "learning_rate": 4.67316829264912e-05, + "loss": 5.3139, + "step": 27676 + }, + { + "epoch": 0.16460295936816063, + "grad_norm": 1.5558831691741943, + "learning_rate": 4.673145201612279e-05, + "loss": 5.331, + "step": 27677 + }, + { + "epoch": 0.16460890665144162, + "grad_norm": 1.7119014263153076, + "learning_rate": 4.673122109816815e-05, + "loss": 5.4438, + "step": 27678 + }, + { + "epoch": 0.1646148539347226, + "grad_norm": 2.4084794521331787, + "learning_rate": 4.673099017262737e-05, + "loss": 4.2357, + "step": 27679 + }, + { + "epoch": 0.16462080121800363, + "grad_norm": 1.8065168857574463, + "learning_rate": 4.673075923950053e-05, + "loss": 4.4894, + "step": 27680 + }, + { + "epoch": 0.16462674850128461, + "grad_norm": 1.5240797996520996, + "learning_rate": 4.673052829878769e-05, + "loss": 4.6992, + "step": 27681 + }, + { + "epoch": 0.1646326957845656, + "grad_norm": 1.9197040796279907, + "learning_rate": 4.673029735048896e-05, + "loss": 5.0591, + "step": 27682 + }, + { + "epoch": 0.16463864306784662, + "grad_norm": 1.5522626638412476, + "learning_rate": 4.673006639460441e-05, + "loss": 5.2923, + "step": 27683 + }, + { + "epoch": 0.1646445903511276, + "grad_norm": 1.663277506828308, + "learning_rate": 4.6729835431134115e-05, + "loss": 5.0555, + "step": 27684 + }, + { + "epoch": 0.1646505376344086, + "grad_norm": 1.5276461839675903, + "learning_rate": 4.672960446007816e-05, + "loss": 5.1765, + "step": 27685 + }, + { + "epoch": 0.1646564849176896, + "grad_norm": 1.5308914184570312, + "learning_rate": 4.672937348143662e-05, + "loss": 4.371, + "step": 27686 + }, + { + "epoch": 0.1646624322009706, + "grad_norm": 1.6172471046447754, + "learning_rate": 4.672914249520958e-05, + "loss": 4.8801, + "step": 27687 + }, + { + "epoch": 0.1646683794842516, + "grad_norm": 1.523914098739624, + "learning_rate": 4.6728911501397124e-05, + "loss": 4.3999, + "step": 27688 + }, + { + "epoch": 0.1646743267675326, + "grad_norm": 1.2214871644973755, + "learning_rate": 4.672868049999933e-05, + "loss": 3.7981, + "step": 27689 + }, + { + "epoch": 0.1646802740508136, + "grad_norm": 1.340168833732605, + "learning_rate": 4.672844949101628e-05, + "loss": 4.5471, + "step": 27690 + }, + { + "epoch": 0.16468622133409458, + "grad_norm": 1.8667452335357666, + "learning_rate": 4.672821847444805e-05, + "loss": 4.3881, + "step": 27691 + }, + { + "epoch": 0.1646921686173756, + "grad_norm": 3.047363042831421, + "learning_rate": 4.672798745029472e-05, + "loss": 3.7606, + "step": 27692 + }, + { + "epoch": 0.16469811590065658, + "grad_norm": 1.8616588115692139, + "learning_rate": 4.672775641855638e-05, + "loss": 5.0264, + "step": 27693 + }, + { + "epoch": 0.16470406318393757, + "grad_norm": 1.9045435190200806, + "learning_rate": 4.67275253792331e-05, + "loss": 4.5934, + "step": 27694 + }, + { + "epoch": 0.1647100104672186, + "grad_norm": 1.9803951978683472, + "learning_rate": 4.672729433232497e-05, + "loss": 4.0846, + "step": 27695 + }, + { + "epoch": 0.16471595775049958, + "grad_norm": 1.797312617301941, + "learning_rate": 4.672706327783206e-05, + "loss": 4.5876, + "step": 27696 + }, + { + "epoch": 0.16472190503378056, + "grad_norm": 1.954188585281372, + "learning_rate": 4.672683221575446e-05, + "loss": 4.3985, + "step": 27697 + }, + { + "epoch": 0.16472785231706158, + "grad_norm": 2.246690273284912, + "learning_rate": 4.6726601146092255e-05, + "loss": 4.1058, + "step": 27698 + }, + { + "epoch": 0.16473379960034257, + "grad_norm": 2.160576343536377, + "learning_rate": 4.67263700688455e-05, + "loss": 4.0139, + "step": 27699 + }, + { + "epoch": 0.16473974688362356, + "grad_norm": 2.5650711059570312, + "learning_rate": 4.672613898401431e-05, + "loss": 3.5785, + "step": 27700 + }, + { + "epoch": 0.16474569416690457, + "grad_norm": 2.6694283485412598, + "learning_rate": 4.6725907891598744e-05, + "loss": 3.4553, + "step": 27701 + }, + { + "epoch": 0.16475164145018556, + "grad_norm": 1.8965697288513184, + "learning_rate": 4.672567679159888e-05, + "loss": 3.8723, + "step": 27702 + }, + { + "epoch": 0.16475758873346655, + "grad_norm": 2.0568554401397705, + "learning_rate": 4.6725445684014824e-05, + "loss": 4.6828, + "step": 27703 + }, + { + "epoch": 0.16476353601674754, + "grad_norm": 1.7810505628585815, + "learning_rate": 4.672521456884663e-05, + "loss": 5.5463, + "step": 27704 + }, + { + "epoch": 0.16476948330002855, + "grad_norm": 1.8636524677276611, + "learning_rate": 4.6724983446094385e-05, + "loss": 4.9334, + "step": 27705 + }, + { + "epoch": 0.16477543058330954, + "grad_norm": 2.172565460205078, + "learning_rate": 4.6724752315758174e-05, + "loss": 5.4723, + "step": 27706 + }, + { + "epoch": 0.16478137786659053, + "grad_norm": 2.461881637573242, + "learning_rate": 4.672452117783808e-05, + "loss": 4.6138, + "step": 27707 + }, + { + "epoch": 0.16478732514987154, + "grad_norm": 2.3633780479431152, + "learning_rate": 4.672429003233418e-05, + "loss": 4.2636, + "step": 27708 + }, + { + "epoch": 0.16479327243315253, + "grad_norm": 2.4033286571502686, + "learning_rate": 4.6724058879246546e-05, + "loss": 3.716, + "step": 27709 + }, + { + "epoch": 0.16479921971643352, + "grad_norm": 2.201249599456787, + "learning_rate": 4.672382771857527e-05, + "loss": 4.9046, + "step": 27710 + }, + { + "epoch": 0.16480516699971454, + "grad_norm": 2.0308284759521484, + "learning_rate": 4.672359655032044e-05, + "loss": 4.255, + "step": 27711 + }, + { + "epoch": 0.16481111428299552, + "grad_norm": 2.46120023727417, + "learning_rate": 4.672336537448212e-05, + "loss": 3.7853, + "step": 27712 + }, + { + "epoch": 0.1648170615662765, + "grad_norm": 2.130208969116211, + "learning_rate": 4.6723134191060404e-05, + "loss": 3.8114, + "step": 27713 + }, + { + "epoch": 0.16482300884955753, + "grad_norm": 2.138585329055786, + "learning_rate": 4.672290300005536e-05, + "loss": 4.6266, + "step": 27714 + }, + { + "epoch": 0.16482895613283852, + "grad_norm": 1.8015727996826172, + "learning_rate": 4.6722671801467074e-05, + "loss": 4.2178, + "step": 27715 + }, + { + "epoch": 0.1648349034161195, + "grad_norm": 2.3047871589660645, + "learning_rate": 4.672244059529564e-05, + "loss": 4.258, + "step": 27716 + }, + { + "epoch": 0.16484085069940052, + "grad_norm": 2.022953987121582, + "learning_rate": 4.672220938154111e-05, + "loss": 3.7605, + "step": 27717 + }, + { + "epoch": 0.1648467979826815, + "grad_norm": 2.3721368312835693, + "learning_rate": 4.672197816020358e-05, + "loss": 3.6132, + "step": 27718 + }, + { + "epoch": 0.1648527452659625, + "grad_norm": 1.9578886032104492, + "learning_rate": 4.672174693128314e-05, + "loss": 3.9983, + "step": 27719 + }, + { + "epoch": 0.1648586925492435, + "grad_norm": 2.0287981033325195, + "learning_rate": 4.672151569477987e-05, + "loss": 3.8297, + "step": 27720 + }, + { + "epoch": 0.1648646398325245, + "grad_norm": 2.1453230381011963, + "learning_rate": 4.672128445069383e-05, + "loss": 3.5676, + "step": 27721 + }, + { + "epoch": 0.1648705871158055, + "grad_norm": 2.209982395172119, + "learning_rate": 4.672105319902512e-05, + "loss": 3.6304, + "step": 27722 + }, + { + "epoch": 0.1648765343990865, + "grad_norm": 2.1707348823547363, + "learning_rate": 4.672082193977382e-05, + "loss": 3.679, + "step": 27723 + }, + { + "epoch": 0.1648824816823675, + "grad_norm": 1.9688754081726074, + "learning_rate": 4.672059067293999e-05, + "loss": 4.235, + "step": 27724 + }, + { + "epoch": 0.16488842896564848, + "grad_norm": 1.988599419593811, + "learning_rate": 4.672035939852374e-05, + "loss": 3.8704, + "step": 27725 + }, + { + "epoch": 0.1648943762489295, + "grad_norm": 1.7759329080581665, + "learning_rate": 4.672012811652513e-05, + "loss": 4.5621, + "step": 27726 + }, + { + "epoch": 0.16490032353221049, + "grad_norm": 1.9790258407592773, + "learning_rate": 4.6719896826944255e-05, + "loss": 4.2214, + "step": 27727 + }, + { + "epoch": 0.16490627081549147, + "grad_norm": 1.6736228466033936, + "learning_rate": 4.671966552978118e-05, + "loss": 4.555, + "step": 27728 + }, + { + "epoch": 0.1649122180987725, + "grad_norm": 2.4587225914001465, + "learning_rate": 4.6719434225036e-05, + "loss": 4.4134, + "step": 27729 + }, + { + "epoch": 0.16491816538205348, + "grad_norm": 1.5891488790512085, + "learning_rate": 4.671920291270879e-05, + "loss": 5.4629, + "step": 27730 + }, + { + "epoch": 0.16492411266533447, + "grad_norm": 1.5606093406677246, + "learning_rate": 4.671897159279962e-05, + "loss": 4.5045, + "step": 27731 + }, + { + "epoch": 0.16493005994861548, + "grad_norm": 2.5481436252593994, + "learning_rate": 4.6718740265308595e-05, + "loss": 3.2812, + "step": 27732 + }, + { + "epoch": 0.16493600723189647, + "grad_norm": 2.602802276611328, + "learning_rate": 4.671850893023577e-05, + "loss": 3.082, + "step": 27733 + }, + { + "epoch": 0.16494195451517746, + "grad_norm": 2.3786399364471436, + "learning_rate": 4.6718277587581246e-05, + "loss": 3.5956, + "step": 27734 + }, + { + "epoch": 0.16494790179845847, + "grad_norm": 1.5555487871170044, + "learning_rate": 4.67180462373451e-05, + "loss": 5.2082, + "step": 27735 + }, + { + "epoch": 0.16495384908173946, + "grad_norm": 1.6801286935806274, + "learning_rate": 4.67178148795274e-05, + "loss": 5.3879, + "step": 27736 + }, + { + "epoch": 0.16495979636502045, + "grad_norm": 1.3999351263046265, + "learning_rate": 4.671758351412824e-05, + "loss": 4.9347, + "step": 27737 + }, + { + "epoch": 0.16496574364830147, + "grad_norm": 2.48246693611145, + "learning_rate": 4.6717352141147696e-05, + "loss": 3.5764, + "step": 27738 + }, + { + "epoch": 0.16497169093158245, + "grad_norm": 2.5625791549682617, + "learning_rate": 4.6717120760585844e-05, + "loss": 2.841, + "step": 27739 + }, + { + "epoch": 0.16497763821486344, + "grad_norm": 2.025188684463501, + "learning_rate": 4.6716889372442775e-05, + "loss": 3.9643, + "step": 27740 + }, + { + "epoch": 0.16498358549814446, + "grad_norm": 2.5314674377441406, + "learning_rate": 4.671665797671856e-05, + "loss": 1.9703, + "step": 27741 + }, + { + "epoch": 0.16498953278142545, + "grad_norm": 2.7406599521636963, + "learning_rate": 4.671642657341329e-05, + "loss": 3.0071, + "step": 27742 + }, + { + "epoch": 0.16499548006470643, + "grad_norm": 2.0033769607543945, + "learning_rate": 4.671619516252703e-05, + "loss": 4.5621, + "step": 27743 + }, + { + "epoch": 0.16500142734798745, + "grad_norm": 1.587997555732727, + "learning_rate": 4.6715963744059874e-05, + "loss": 4.9265, + "step": 27744 + }, + { + "epoch": 0.16500737463126844, + "grad_norm": 1.6401697397232056, + "learning_rate": 4.6715732318011905e-05, + "loss": 4.6801, + "step": 27745 + }, + { + "epoch": 0.16501332191454943, + "grad_norm": 2.994272232055664, + "learning_rate": 4.671550088438319e-05, + "loss": 2.0322, + "step": 27746 + }, + { + "epoch": 0.16501926919783044, + "grad_norm": 3.038865089416504, + "learning_rate": 4.671526944317382e-05, + "loss": 2.0138, + "step": 27747 + }, + { + "epoch": 0.16502521648111143, + "grad_norm": 2.9179296493530273, + "learning_rate": 4.671503799438388e-05, + "loss": 3.2955, + "step": 27748 + }, + { + "epoch": 0.16503116376439242, + "grad_norm": 1.7475281953811646, + "learning_rate": 4.6714806538013446e-05, + "loss": 5.4316, + "step": 27749 + }, + { + "epoch": 0.16503711104767343, + "grad_norm": 1.4781032800674438, + "learning_rate": 4.6714575074062596e-05, + "loss": 5.4519, + "step": 27750 + }, + { + "epoch": 0.16504305833095442, + "grad_norm": 1.3684823513031006, + "learning_rate": 4.6714343602531404e-05, + "loss": 5.3335, + "step": 27751 + }, + { + "epoch": 0.1650490056142354, + "grad_norm": 1.6875170469284058, + "learning_rate": 4.6714112123419965e-05, + "loss": 5.0396, + "step": 27752 + }, + { + "epoch": 0.16505495289751643, + "grad_norm": 1.6213173866271973, + "learning_rate": 4.6713880636728346e-05, + "loss": 4.763, + "step": 27753 + }, + { + "epoch": 0.16506090018079742, + "grad_norm": 1.5345633029937744, + "learning_rate": 4.6713649142456644e-05, + "loss": 4.9192, + "step": 27754 + }, + { + "epoch": 0.1650668474640784, + "grad_norm": 1.9773199558258057, + "learning_rate": 4.671341764060493e-05, + "loss": 4.7158, + "step": 27755 + }, + { + "epoch": 0.16507279474735942, + "grad_norm": 1.786027193069458, + "learning_rate": 4.6713186131173284e-05, + "loss": 5.6319, + "step": 27756 + }, + { + "epoch": 0.1650787420306404, + "grad_norm": 1.5743745565414429, + "learning_rate": 4.6712954614161797e-05, + "loss": 5.5466, + "step": 27757 + }, + { + "epoch": 0.1650846893139214, + "grad_norm": 1.6003429889678955, + "learning_rate": 4.671272308957053e-05, + "loss": 5.5166, + "step": 27758 + }, + { + "epoch": 0.1650906365972024, + "grad_norm": 1.567775011062622, + "learning_rate": 4.6712491557399585e-05, + "loss": 5.1731, + "step": 27759 + }, + { + "epoch": 0.1650965838804834, + "grad_norm": 1.7042558193206787, + "learning_rate": 4.671226001764903e-05, + "loss": 4.7854, + "step": 27760 + }, + { + "epoch": 0.1651025311637644, + "grad_norm": 2.414813280105591, + "learning_rate": 4.6712028470318946e-05, + "loss": 3.969, + "step": 27761 + }, + { + "epoch": 0.16510847844704538, + "grad_norm": 2.2361044883728027, + "learning_rate": 4.671179691540942e-05, + "loss": 4.0416, + "step": 27762 + }, + { + "epoch": 0.1651144257303264, + "grad_norm": 1.4998681545257568, + "learning_rate": 4.6711565352920526e-05, + "loss": 4.0769, + "step": 27763 + }, + { + "epoch": 0.16512037301360738, + "grad_norm": 1.8944214582443237, + "learning_rate": 4.6711333782852364e-05, + "loss": 3.9101, + "step": 27764 + }, + { + "epoch": 0.16512632029688837, + "grad_norm": 2.432981252670288, + "learning_rate": 4.671110220520498e-05, + "loss": 3.7838, + "step": 27765 + }, + { + "epoch": 0.16513226758016938, + "grad_norm": 2.3724024295806885, + "learning_rate": 4.6710870619978486e-05, + "loss": 4.0045, + "step": 27766 + }, + { + "epoch": 0.16513821486345037, + "grad_norm": 2.136061429977417, + "learning_rate": 4.671063902717295e-05, + "loss": 4.3335, + "step": 27767 + }, + { + "epoch": 0.16514416214673136, + "grad_norm": 2.263643264770508, + "learning_rate": 4.671040742678845e-05, + "loss": 4.417, + "step": 27768 + }, + { + "epoch": 0.16515010943001238, + "grad_norm": 2.2661242485046387, + "learning_rate": 4.671017581882507e-05, + "loss": 3.9163, + "step": 27769 + }, + { + "epoch": 0.16515605671329336, + "grad_norm": 1.9908580780029297, + "learning_rate": 4.6709944203282905e-05, + "loss": 4.5396, + "step": 27770 + }, + { + "epoch": 0.16516200399657435, + "grad_norm": 1.7676030397415161, + "learning_rate": 4.6709712580162014e-05, + "loss": 4.3445, + "step": 27771 + }, + { + "epoch": 0.16516795127985537, + "grad_norm": 2.308959722518921, + "learning_rate": 4.670948094946248e-05, + "loss": 3.3659, + "step": 27772 + }, + { + "epoch": 0.16517389856313636, + "grad_norm": 2.0675418376922607, + "learning_rate": 4.67092493111844e-05, + "loss": 3.5967, + "step": 27773 + }, + { + "epoch": 0.16517984584641734, + "grad_norm": 2.192368268966675, + "learning_rate": 4.670901766532784e-05, + "loss": 3.7969, + "step": 27774 + }, + { + "epoch": 0.16518579312969836, + "grad_norm": 2.0077974796295166, + "learning_rate": 4.670878601189289e-05, + "loss": 3.43, + "step": 27775 + }, + { + "epoch": 0.16519174041297935, + "grad_norm": 2.169884443283081, + "learning_rate": 4.670855435087963e-05, + "loss": 4.8072, + "step": 27776 + }, + { + "epoch": 0.16519768769626034, + "grad_norm": 2.4910061359405518, + "learning_rate": 4.670832268228813e-05, + "loss": 3.5874, + "step": 27777 + }, + { + "epoch": 0.16520363497954135, + "grad_norm": 2.0694758892059326, + "learning_rate": 4.670809100611848e-05, + "loss": 4.3965, + "step": 27778 + }, + { + "epoch": 0.16520958226282234, + "grad_norm": 1.5337821245193481, + "learning_rate": 4.670785932237076e-05, + "loss": 4.8369, + "step": 27779 + }, + { + "epoch": 0.16521552954610333, + "grad_norm": 1.8797821998596191, + "learning_rate": 4.670762763104506e-05, + "loss": 5.2661, + "step": 27780 + }, + { + "epoch": 0.16522147682938434, + "grad_norm": 1.6902687549591064, + "learning_rate": 4.670739593214144e-05, + "loss": 5.4648, + "step": 27781 + }, + { + "epoch": 0.16522742411266533, + "grad_norm": 1.485190987586975, + "learning_rate": 4.670716422565999e-05, + "loss": 4.9547, + "step": 27782 + }, + { + "epoch": 0.16523337139594632, + "grad_norm": 1.7863634824752808, + "learning_rate": 4.670693251160081e-05, + "loss": 4.9542, + "step": 27783 + }, + { + "epoch": 0.16523931867922734, + "grad_norm": 1.7533354759216309, + "learning_rate": 4.670670078996395e-05, + "loss": 4.7394, + "step": 27784 + }, + { + "epoch": 0.16524526596250833, + "grad_norm": 1.7423986196517944, + "learning_rate": 4.670646906074951e-05, + "loss": 4.8273, + "step": 27785 + }, + { + "epoch": 0.1652512132457893, + "grad_norm": 1.3752869367599487, + "learning_rate": 4.670623732395756e-05, + "loss": 5.0926, + "step": 27786 + }, + { + "epoch": 0.16525716052907033, + "grad_norm": 1.5826581716537476, + "learning_rate": 4.67060055795882e-05, + "loss": 5.167, + "step": 27787 + }, + { + "epoch": 0.16526310781235132, + "grad_norm": 1.5029367208480835, + "learning_rate": 4.6705773827641485e-05, + "loss": 5.145, + "step": 27788 + }, + { + "epoch": 0.1652690550956323, + "grad_norm": 1.720220923423767, + "learning_rate": 4.670554206811751e-05, + "loss": 5.2389, + "step": 27789 + }, + { + "epoch": 0.16527500237891332, + "grad_norm": 1.8043471574783325, + "learning_rate": 4.6705310301016355e-05, + "loss": 5.0942, + "step": 27790 + }, + { + "epoch": 0.1652809496621943, + "grad_norm": 1.7888808250427246, + "learning_rate": 4.670507852633811e-05, + "loss": 5.2764, + "step": 27791 + }, + { + "epoch": 0.1652868969454753, + "grad_norm": 1.6223100423812866, + "learning_rate": 4.6704846744082835e-05, + "loss": 5.2812, + "step": 27792 + }, + { + "epoch": 0.1652928442287563, + "grad_norm": 1.5120409727096558, + "learning_rate": 4.670461495425063e-05, + "loss": 5.2022, + "step": 27793 + }, + { + "epoch": 0.1652987915120373, + "grad_norm": 1.5947920083999634, + "learning_rate": 4.670438315684156e-05, + "loss": 5.2711, + "step": 27794 + }, + { + "epoch": 0.1653047387953183, + "grad_norm": 1.6690993309020996, + "learning_rate": 4.6704151351855716e-05, + "loss": 4.8284, + "step": 27795 + }, + { + "epoch": 0.1653106860785993, + "grad_norm": 1.4904134273529053, + "learning_rate": 4.670391953929318e-05, + "loss": 5.2171, + "step": 27796 + }, + { + "epoch": 0.1653166333618803, + "grad_norm": 1.556333065032959, + "learning_rate": 4.6703687719154034e-05, + "loss": 5.6598, + "step": 27797 + }, + { + "epoch": 0.16532258064516128, + "grad_norm": 1.55083167552948, + "learning_rate": 4.670345589143835e-05, + "loss": 5.5919, + "step": 27798 + }, + { + "epoch": 0.1653285279284423, + "grad_norm": 1.9281244277954102, + "learning_rate": 4.670322405614621e-05, + "loss": 5.3313, + "step": 27799 + }, + { + "epoch": 0.1653344752117233, + "grad_norm": 1.4666374921798706, + "learning_rate": 4.670299221327771e-05, + "loss": 5.0905, + "step": 27800 + }, + { + "epoch": 0.16534042249500427, + "grad_norm": 1.8032478094100952, + "learning_rate": 4.670276036283291e-05, + "loss": 4.9322, + "step": 27801 + }, + { + "epoch": 0.1653463697782853, + "grad_norm": 1.7652195692062378, + "learning_rate": 4.67025285048119e-05, + "loss": 4.6763, + "step": 27802 + }, + { + "epoch": 0.16535231706156628, + "grad_norm": 1.7903348207473755, + "learning_rate": 4.6702296639214766e-05, + "loss": 4.491, + "step": 27803 + }, + { + "epoch": 0.16535826434484727, + "grad_norm": 1.6135162115097046, + "learning_rate": 4.6702064766041584e-05, + "loss": 4.3593, + "step": 27804 + }, + { + "epoch": 0.16536421162812828, + "grad_norm": 1.5779284238815308, + "learning_rate": 4.670183288529243e-05, + "loss": 4.3606, + "step": 27805 + }, + { + "epoch": 0.16537015891140927, + "grad_norm": 1.6469144821166992, + "learning_rate": 4.67016009969674e-05, + "loss": 4.3772, + "step": 27806 + }, + { + "epoch": 0.16537610619469026, + "grad_norm": 2.209540367126465, + "learning_rate": 4.670136910106656e-05, + "loss": 5.1859, + "step": 27807 + }, + { + "epoch": 0.16538205347797127, + "grad_norm": 2.5719592571258545, + "learning_rate": 4.670113719758999e-05, + "loss": 5.1312, + "step": 27808 + }, + { + "epoch": 0.16538800076125226, + "grad_norm": 2.1322646141052246, + "learning_rate": 4.670090528653779e-05, + "loss": 5.5602, + "step": 27809 + }, + { + "epoch": 0.16539394804453325, + "grad_norm": 1.8350342512130737, + "learning_rate": 4.670067336791002e-05, + "loss": 5.6963, + "step": 27810 + }, + { + "epoch": 0.16539989532781427, + "grad_norm": 1.6520220041275024, + "learning_rate": 4.670044144170677e-05, + "loss": 5.8053, + "step": 27811 + }, + { + "epoch": 0.16540584261109526, + "grad_norm": 1.559950590133667, + "learning_rate": 4.670020950792812e-05, + "loss": 5.5382, + "step": 27812 + }, + { + "epoch": 0.16541178989437624, + "grad_norm": 1.7970432043075562, + "learning_rate": 4.669997756657415e-05, + "loss": 4.7823, + "step": 27813 + }, + { + "epoch": 0.16541773717765726, + "grad_norm": 1.8613402843475342, + "learning_rate": 4.6699745617644945e-05, + "loss": 5.3559, + "step": 27814 + }, + { + "epoch": 0.16542368446093825, + "grad_norm": 2.660762310028076, + "learning_rate": 4.669951366114058e-05, + "loss": 4.7255, + "step": 27815 + }, + { + "epoch": 0.16542963174421924, + "grad_norm": 2.8636231422424316, + "learning_rate": 4.669928169706114e-05, + "loss": 4.8591, + "step": 27816 + }, + { + "epoch": 0.16543557902750025, + "grad_norm": 1.6894406080245972, + "learning_rate": 4.669904972540671e-05, + "loss": 5.0576, + "step": 27817 + }, + { + "epoch": 0.16544152631078124, + "grad_norm": 2.539830207824707, + "learning_rate": 4.669881774617736e-05, + "loss": 4.9346, + "step": 27818 + }, + { + "epoch": 0.16544747359406223, + "grad_norm": 2.0870940685272217, + "learning_rate": 4.669858575937318e-05, + "loss": 5.0034, + "step": 27819 + }, + { + "epoch": 0.16545342087734322, + "grad_norm": 1.6307538747787476, + "learning_rate": 4.669835376499425e-05, + "loss": 5.4536, + "step": 27820 + }, + { + "epoch": 0.16545936816062423, + "grad_norm": 1.1906611919403076, + "learning_rate": 4.669812176304064e-05, + "loss": 5.5965, + "step": 27821 + }, + { + "epoch": 0.16546531544390522, + "grad_norm": 1.5987422466278076, + "learning_rate": 4.669788975351245e-05, + "loss": 5.4403, + "step": 27822 + }, + { + "epoch": 0.1654712627271862, + "grad_norm": 2.267430543899536, + "learning_rate": 4.669765773640974e-05, + "loss": 5.0344, + "step": 27823 + }, + { + "epoch": 0.16547721001046722, + "grad_norm": 2.2842605113983154, + "learning_rate": 4.669742571173261e-05, + "loss": 4.481, + "step": 27824 + }, + { + "epoch": 0.1654831572937482, + "grad_norm": 1.5940486192703247, + "learning_rate": 4.6697193679481135e-05, + "loss": 5.1313, + "step": 27825 + }, + { + "epoch": 0.1654891045770292, + "grad_norm": 1.9549680948257446, + "learning_rate": 4.6696961639655386e-05, + "loss": 5.1298, + "step": 27826 + }, + { + "epoch": 0.16549505186031022, + "grad_norm": 2.387866497039795, + "learning_rate": 4.6696729592255454e-05, + "loss": 4.9029, + "step": 27827 + }, + { + "epoch": 0.1655009991435912, + "grad_norm": 1.6883118152618408, + "learning_rate": 4.669649753728142e-05, + "loss": 5.1273, + "step": 27828 + }, + { + "epoch": 0.1655069464268722, + "grad_norm": 1.6538794040679932, + "learning_rate": 4.669626547473336e-05, + "loss": 5.2022, + "step": 27829 + }, + { + "epoch": 0.1655128937101532, + "grad_norm": 1.7652950286865234, + "learning_rate": 4.669603340461136e-05, + "loss": 5.5397, + "step": 27830 + }, + { + "epoch": 0.1655188409934342, + "grad_norm": 1.6421597003936768, + "learning_rate": 4.66958013269155e-05, + "loss": 4.9982, + "step": 27831 + }, + { + "epoch": 0.16552478827671518, + "grad_norm": 1.5024685859680176, + "learning_rate": 4.669556924164586e-05, + "loss": 5.6933, + "step": 27832 + }, + { + "epoch": 0.1655307355599962, + "grad_norm": 1.4680891036987305, + "learning_rate": 4.669533714880252e-05, + "loss": 5.3262, + "step": 27833 + }, + { + "epoch": 0.1655366828432772, + "grad_norm": 1.375623345375061, + "learning_rate": 4.669510504838556e-05, + "loss": 5.2673, + "step": 27834 + }, + { + "epoch": 0.16554263012655818, + "grad_norm": 2.1354503631591797, + "learning_rate": 4.669487294039506e-05, + "loss": 4.2156, + "step": 27835 + }, + { + "epoch": 0.1655485774098392, + "grad_norm": 1.5564913749694824, + "learning_rate": 4.669464082483112e-05, + "loss": 4.7238, + "step": 27836 + }, + { + "epoch": 0.16555452469312018, + "grad_norm": 1.6255192756652832, + "learning_rate": 4.669440870169379e-05, + "loss": 5.6043, + "step": 27837 + }, + { + "epoch": 0.16556047197640117, + "grad_norm": 1.6268353462219238, + "learning_rate": 4.6694176570983174e-05, + "loss": 5.3919, + "step": 27838 + }, + { + "epoch": 0.16556641925968218, + "grad_norm": 1.5626128911972046, + "learning_rate": 4.669394443269933e-05, + "loss": 5.5142, + "step": 27839 + }, + { + "epoch": 0.16557236654296317, + "grad_norm": 1.5001987218856812, + "learning_rate": 4.669371228684237e-05, + "loss": 4.7294, + "step": 27840 + }, + { + "epoch": 0.16557831382624416, + "grad_norm": 1.5922046899795532, + "learning_rate": 4.669348013341235e-05, + "loss": 4.9363, + "step": 27841 + }, + { + "epoch": 0.16558426110952518, + "grad_norm": 1.555086374282837, + "learning_rate": 4.669324797240937e-05, + "loss": 4.6704, + "step": 27842 + }, + { + "epoch": 0.16559020839280617, + "grad_norm": 1.711538553237915, + "learning_rate": 4.66930158038335e-05, + "loss": 4.673, + "step": 27843 + }, + { + "epoch": 0.16559615567608715, + "grad_norm": 1.7905937433242798, + "learning_rate": 4.669278362768481e-05, + "loss": 4.5295, + "step": 27844 + }, + { + "epoch": 0.16560210295936817, + "grad_norm": 1.8714954853057861, + "learning_rate": 4.669255144396341e-05, + "loss": 4.699, + "step": 27845 + }, + { + "epoch": 0.16560805024264916, + "grad_norm": 1.6783734560012817, + "learning_rate": 4.669231925266935e-05, + "loss": 5.5447, + "step": 27846 + }, + { + "epoch": 0.16561399752593015, + "grad_norm": 1.3632158041000366, + "learning_rate": 4.669208705380273e-05, + "loss": 5.5541, + "step": 27847 + }, + { + "epoch": 0.16561994480921116, + "grad_norm": 1.6476699113845825, + "learning_rate": 4.669185484736362e-05, + "loss": 4.5751, + "step": 27848 + }, + { + "epoch": 0.16562589209249215, + "grad_norm": 1.630963921546936, + "learning_rate": 4.669162263335212e-05, + "loss": 5.3621, + "step": 27849 + }, + { + "epoch": 0.16563183937577314, + "grad_norm": 1.4858328104019165, + "learning_rate": 4.66913904117683e-05, + "loss": 5.3973, + "step": 27850 + }, + { + "epoch": 0.16563778665905415, + "grad_norm": 1.7069036960601807, + "learning_rate": 4.669115818261223e-05, + "loss": 5.0102, + "step": 27851 + }, + { + "epoch": 0.16564373394233514, + "grad_norm": 1.4385701417922974, + "learning_rate": 4.6690925945884e-05, + "loss": 5.4805, + "step": 27852 + }, + { + "epoch": 0.16564968122561613, + "grad_norm": 1.6895365715026855, + "learning_rate": 4.66906937015837e-05, + "loss": 4.9834, + "step": 27853 + }, + { + "epoch": 0.16565562850889715, + "grad_norm": 2.1618361473083496, + "learning_rate": 4.66904614497114e-05, + "loss": 4.6309, + "step": 27854 + }, + { + "epoch": 0.16566157579217813, + "grad_norm": 2.331005811691284, + "learning_rate": 4.669022919026718e-05, + "loss": 4.1853, + "step": 27855 + }, + { + "epoch": 0.16566752307545912, + "grad_norm": 1.7161813974380493, + "learning_rate": 4.668999692325113e-05, + "loss": 4.5842, + "step": 27856 + }, + { + "epoch": 0.16567347035874014, + "grad_norm": 2.117947816848755, + "learning_rate": 4.668976464866332e-05, + "loss": 4.6009, + "step": 27857 + }, + { + "epoch": 0.16567941764202113, + "grad_norm": 1.6272234916687012, + "learning_rate": 4.6689532366503846e-05, + "loss": 4.8592, + "step": 27858 + }, + { + "epoch": 0.16568536492530211, + "grad_norm": 1.9852404594421387, + "learning_rate": 4.6689300076772776e-05, + "loss": 4.363, + "step": 27859 + }, + { + "epoch": 0.16569131220858313, + "grad_norm": 1.6235220432281494, + "learning_rate": 4.6689067779470194e-05, + "loss": 4.6625, + "step": 27860 + }, + { + "epoch": 0.16569725949186412, + "grad_norm": 1.7212275266647339, + "learning_rate": 4.668883547459618e-05, + "loss": 4.7013, + "step": 27861 + }, + { + "epoch": 0.1657032067751451, + "grad_norm": 2.5496368408203125, + "learning_rate": 4.6688603162150824e-05, + "loss": 4.0435, + "step": 27862 + }, + { + "epoch": 0.16570915405842612, + "grad_norm": 2.681366443634033, + "learning_rate": 4.66883708421342e-05, + "loss": 4.4567, + "step": 27863 + }, + { + "epoch": 0.1657151013417071, + "grad_norm": 2.2227134704589844, + "learning_rate": 4.668813851454639e-05, + "loss": 4.5467, + "step": 27864 + }, + { + "epoch": 0.1657210486249881, + "grad_norm": 2.413037061691284, + "learning_rate": 4.668790617938748e-05, + "loss": 4.1955, + "step": 27865 + }, + { + "epoch": 0.16572699590826911, + "grad_norm": 2.749058723449707, + "learning_rate": 4.668767383665753e-05, + "loss": 4.1209, + "step": 27866 + }, + { + "epoch": 0.1657329431915501, + "grad_norm": 2.075108528137207, + "learning_rate": 4.668744148635665e-05, + "loss": 4.2322, + "step": 27867 + }, + { + "epoch": 0.1657388904748311, + "grad_norm": 1.7476239204406738, + "learning_rate": 4.66872091284849e-05, + "loss": 4.7075, + "step": 27868 + }, + { + "epoch": 0.1657448377581121, + "grad_norm": 1.7722108364105225, + "learning_rate": 4.6686976763042376e-05, + "loss": 4.7211, + "step": 27869 + }, + { + "epoch": 0.1657507850413931, + "grad_norm": 1.57614266872406, + "learning_rate": 4.668674439002915e-05, + "loss": 4.8495, + "step": 27870 + }, + { + "epoch": 0.16575673232467408, + "grad_norm": 1.5763459205627441, + "learning_rate": 4.6686512009445306e-05, + "loss": 5.1311, + "step": 27871 + }, + { + "epoch": 0.1657626796079551, + "grad_norm": 1.5253850221633911, + "learning_rate": 4.6686279621290925e-05, + "loss": 5.3513, + "step": 27872 + }, + { + "epoch": 0.1657686268912361, + "grad_norm": 1.8837103843688965, + "learning_rate": 4.668604722556609e-05, + "loss": 4.9349, + "step": 27873 + }, + { + "epoch": 0.16577457417451708, + "grad_norm": 1.7190310955047607, + "learning_rate": 4.668581482227087e-05, + "loss": 5.4962, + "step": 27874 + }, + { + "epoch": 0.1657805214577981, + "grad_norm": 1.6501142978668213, + "learning_rate": 4.668558241140537e-05, + "loss": 5.0092, + "step": 27875 + }, + { + "epoch": 0.16578646874107908, + "grad_norm": 2.03367018699646, + "learning_rate": 4.668534999296965e-05, + "loss": 5.2323, + "step": 27876 + }, + { + "epoch": 0.16579241602436007, + "grad_norm": 2.455427885055542, + "learning_rate": 4.66851175669638e-05, + "loss": 4.2927, + "step": 27877 + }, + { + "epoch": 0.16579836330764108, + "grad_norm": 2.443146228790283, + "learning_rate": 4.668488513338789e-05, + "loss": 4.3029, + "step": 27878 + }, + { + "epoch": 0.16580431059092207, + "grad_norm": 2.656646251678467, + "learning_rate": 4.6684652692242026e-05, + "loss": 4.2249, + "step": 27879 + }, + { + "epoch": 0.16581025787420306, + "grad_norm": 2.4562222957611084, + "learning_rate": 4.668442024352626e-05, + "loss": 4.5162, + "step": 27880 + }, + { + "epoch": 0.16581620515748405, + "grad_norm": 2.8980703353881836, + "learning_rate": 4.6684187787240695e-05, + "loss": 4.1083, + "step": 27881 + }, + { + "epoch": 0.16582215244076506, + "grad_norm": 2.5985610485076904, + "learning_rate": 4.668395532338541e-05, + "loss": 4.1557, + "step": 27882 + }, + { + "epoch": 0.16582809972404605, + "grad_norm": 2.4054651260375977, + "learning_rate": 4.6683722851960465e-05, + "loss": 4.2334, + "step": 27883 + }, + { + "epoch": 0.16583404700732704, + "grad_norm": 2.0977237224578857, + "learning_rate": 4.668349037296597e-05, + "loss": 4.5715, + "step": 27884 + }, + { + "epoch": 0.16583999429060806, + "grad_norm": 2.0701677799224854, + "learning_rate": 4.6683257886401985e-05, + "loss": 4.7195, + "step": 27885 + }, + { + "epoch": 0.16584594157388904, + "grad_norm": 1.9294004440307617, + "learning_rate": 4.6683025392268597e-05, + "loss": 4.6521, + "step": 27886 + }, + { + "epoch": 0.16585188885717003, + "grad_norm": 2.1713595390319824, + "learning_rate": 4.66827928905659e-05, + "loss": 4.7052, + "step": 27887 + }, + { + "epoch": 0.16585783614045105, + "grad_norm": 2.835434913635254, + "learning_rate": 4.668256038129395e-05, + "loss": 4.4006, + "step": 27888 + }, + { + "epoch": 0.16586378342373204, + "grad_norm": 2.466986894607544, + "learning_rate": 4.668232786445285e-05, + "loss": 4.3107, + "step": 27889 + }, + { + "epoch": 0.16586973070701302, + "grad_norm": 1.7013013362884521, + "learning_rate": 4.6682095340042675e-05, + "loss": 4.4813, + "step": 27890 + }, + { + "epoch": 0.16587567799029404, + "grad_norm": 1.7486096620559692, + "learning_rate": 4.66818628080635e-05, + "loss": 4.6227, + "step": 27891 + }, + { + "epoch": 0.16588162527357503, + "grad_norm": 1.6579736471176147, + "learning_rate": 4.6681630268515407e-05, + "loss": 4.7124, + "step": 27892 + }, + { + "epoch": 0.16588757255685602, + "grad_norm": 1.5885511636734009, + "learning_rate": 4.668139772139849e-05, + "loss": 4.6244, + "step": 27893 + }, + { + "epoch": 0.16589351984013703, + "grad_norm": 1.5703203678131104, + "learning_rate": 4.668116516671282e-05, + "loss": 4.7233, + "step": 27894 + }, + { + "epoch": 0.16589946712341802, + "grad_norm": 1.6852905750274658, + "learning_rate": 4.668093260445847e-05, + "loss": 5.0091, + "step": 27895 + }, + { + "epoch": 0.165905414406699, + "grad_norm": 1.7425652742385864, + "learning_rate": 4.668070003463554e-05, + "loss": 5.0271, + "step": 27896 + }, + { + "epoch": 0.16591136168998002, + "grad_norm": 1.7271431684494019, + "learning_rate": 4.6680467457244104e-05, + "loss": 4.525, + "step": 27897 + }, + { + "epoch": 0.165917308973261, + "grad_norm": 1.8759088516235352, + "learning_rate": 4.668023487228423e-05, + "loss": 4.4729, + "step": 27898 + }, + { + "epoch": 0.165923256256542, + "grad_norm": 1.5073447227478027, + "learning_rate": 4.668000227975602e-05, + "loss": 4.8768, + "step": 27899 + }, + { + "epoch": 0.16592920353982302, + "grad_norm": 1.3689100742340088, + "learning_rate": 4.667976967965954e-05, + "loss": 5.1424, + "step": 27900 + }, + { + "epoch": 0.165935150823104, + "grad_norm": 1.7475918531417847, + "learning_rate": 4.6679537071994874e-05, + "loss": 4.7103, + "step": 27901 + }, + { + "epoch": 0.165941098106385, + "grad_norm": 1.5559403896331787, + "learning_rate": 4.6679304456762107e-05, + "loss": 5.0524, + "step": 27902 + }, + { + "epoch": 0.165947045389666, + "grad_norm": 1.7627094984054565, + "learning_rate": 4.667907183396132e-05, + "loss": 4.9901, + "step": 27903 + }, + { + "epoch": 0.165952992672947, + "grad_norm": 1.8173136711120605, + "learning_rate": 4.667883920359259e-05, + "loss": 4.6419, + "step": 27904 + }, + { + "epoch": 0.16595893995622799, + "grad_norm": 2.0207037925720215, + "learning_rate": 4.667860656565601e-05, + "loss": 5.2537, + "step": 27905 + }, + { + "epoch": 0.165964887239509, + "grad_norm": 1.6715987920761108, + "learning_rate": 4.6678373920151646e-05, + "loss": 5.0337, + "step": 27906 + }, + { + "epoch": 0.16597083452279, + "grad_norm": 1.6425293684005737, + "learning_rate": 4.667814126707959e-05, + "loss": 5.0065, + "step": 27907 + }, + { + "epoch": 0.16597678180607098, + "grad_norm": 1.8118547201156616, + "learning_rate": 4.667790860643991e-05, + "loss": 4.9293, + "step": 27908 + }, + { + "epoch": 0.165982729089352, + "grad_norm": 1.5994832515716553, + "learning_rate": 4.66776759382327e-05, + "loss": 5.2379, + "step": 27909 + }, + { + "epoch": 0.16598867637263298, + "grad_norm": 1.6475836038589478, + "learning_rate": 4.667744326245804e-05, + "loss": 5.4609, + "step": 27910 + }, + { + "epoch": 0.16599462365591397, + "grad_norm": 1.4168953895568848, + "learning_rate": 4.6677210579116e-05, + "loss": 5.5907, + "step": 27911 + }, + { + "epoch": 0.16600057093919499, + "grad_norm": 1.46638822555542, + "learning_rate": 4.667697788820669e-05, + "loss": 5.4639, + "step": 27912 + }, + { + "epoch": 0.16600651822247597, + "grad_norm": 1.6889718770980835, + "learning_rate": 4.667674518973015e-05, + "loss": 5.4013, + "step": 27913 + }, + { + "epoch": 0.16601246550575696, + "grad_norm": 1.8182064294815063, + "learning_rate": 4.6676512483686495e-05, + "loss": 4.7796, + "step": 27914 + }, + { + "epoch": 0.16601841278903798, + "grad_norm": 1.6663529872894287, + "learning_rate": 4.6676279770075784e-05, + "loss": 4.8987, + "step": 27915 + }, + { + "epoch": 0.16602436007231897, + "grad_norm": 1.762170672416687, + "learning_rate": 4.6676047048898115e-05, + "loss": 4.8513, + "step": 27916 + }, + { + "epoch": 0.16603030735559995, + "grad_norm": 1.6480133533477783, + "learning_rate": 4.6675814320153554e-05, + "loss": 4.7579, + "step": 27917 + }, + { + "epoch": 0.16603625463888097, + "grad_norm": 1.698567271232605, + "learning_rate": 4.66755815838422e-05, + "loss": 4.8489, + "step": 27918 + }, + { + "epoch": 0.16604220192216196, + "grad_norm": 1.5158538818359375, + "learning_rate": 4.667534883996412e-05, + "loss": 4.878, + "step": 27919 + }, + { + "epoch": 0.16604814920544295, + "grad_norm": 2.1120738983154297, + "learning_rate": 4.66751160885194e-05, + "loss": 4.8203, + "step": 27920 + }, + { + "epoch": 0.16605409648872396, + "grad_norm": 1.8523337841033936, + "learning_rate": 4.667488332950812e-05, + "loss": 4.79, + "step": 27921 + }, + { + "epoch": 0.16606004377200495, + "grad_norm": 1.9057866334915161, + "learning_rate": 4.6674650562930364e-05, + "loss": 4.55, + "step": 27922 + }, + { + "epoch": 0.16606599105528594, + "grad_norm": 1.690329670906067, + "learning_rate": 4.6674417788786206e-05, + "loss": 4.8434, + "step": 27923 + }, + { + "epoch": 0.16607193833856695, + "grad_norm": 1.796695590019226, + "learning_rate": 4.667418500707574e-05, + "loss": 4.8883, + "step": 27924 + }, + { + "epoch": 0.16607788562184794, + "grad_norm": 1.9424879550933838, + "learning_rate": 4.6673952217799035e-05, + "loss": 4.2556, + "step": 27925 + }, + { + "epoch": 0.16608383290512893, + "grad_norm": 1.886226773262024, + "learning_rate": 4.6673719420956176e-05, + "loss": 4.5911, + "step": 27926 + }, + { + "epoch": 0.16608978018840995, + "grad_norm": 2.1246280670166016, + "learning_rate": 4.6673486616547254e-05, + "loss": 4.5398, + "step": 27927 + }, + { + "epoch": 0.16609572747169093, + "grad_norm": 2.219155788421631, + "learning_rate": 4.667325380457233e-05, + "loss": 4.6747, + "step": 27928 + }, + { + "epoch": 0.16610167475497192, + "grad_norm": 2.0169975757598877, + "learning_rate": 4.66730209850315e-05, + "loss": 4.7622, + "step": 27929 + }, + { + "epoch": 0.16610762203825294, + "grad_norm": 1.884619116783142, + "learning_rate": 4.667278815792485e-05, + "loss": 5.0192, + "step": 27930 + }, + { + "epoch": 0.16611356932153393, + "grad_norm": 1.8539994955062866, + "learning_rate": 4.6672555323252446e-05, + "loss": 4.2732, + "step": 27931 + }, + { + "epoch": 0.16611951660481492, + "grad_norm": 2.045879364013672, + "learning_rate": 4.667232248101439e-05, + "loss": 3.8245, + "step": 27932 + }, + { + "epoch": 0.16612546388809593, + "grad_norm": 2.005019426345825, + "learning_rate": 4.667208963121073e-05, + "loss": 3.9687, + "step": 27933 + }, + { + "epoch": 0.16613141117137692, + "grad_norm": 1.7998180389404297, + "learning_rate": 4.667185677384158e-05, + "loss": 3.84, + "step": 27934 + }, + { + "epoch": 0.1661373584546579, + "grad_norm": 1.9813350439071655, + "learning_rate": 4.6671623908907e-05, + "loss": 3.7387, + "step": 27935 + }, + { + "epoch": 0.16614330573793892, + "grad_norm": 1.9212778806686401, + "learning_rate": 4.6671391036407086e-05, + "loss": 3.48, + "step": 27936 + }, + { + "epoch": 0.1661492530212199, + "grad_norm": 1.9081000089645386, + "learning_rate": 4.667115815634192e-05, + "loss": 3.4218, + "step": 27937 + }, + { + "epoch": 0.1661552003045009, + "grad_norm": 2.209960699081421, + "learning_rate": 4.667092526871156e-05, + "loss": 3.7272, + "step": 27938 + }, + { + "epoch": 0.1661611475877819, + "grad_norm": 2.3802664279937744, + "learning_rate": 4.6670692373516124e-05, + "loss": 3.6476, + "step": 27939 + }, + { + "epoch": 0.1661670948710629, + "grad_norm": 2.359929323196411, + "learning_rate": 4.667045947075566e-05, + "loss": 3.7406, + "step": 27940 + }, + { + "epoch": 0.1661730421543439, + "grad_norm": 2.242333173751831, + "learning_rate": 4.6670226560430266e-05, + "loss": 3.8315, + "step": 27941 + }, + { + "epoch": 0.16617898943762488, + "grad_norm": 1.7727068662643433, + "learning_rate": 4.6669993642540017e-05, + "loss": 4.6083, + "step": 27942 + }, + { + "epoch": 0.1661849367209059, + "grad_norm": 2.2704246044158936, + "learning_rate": 4.6669760717085e-05, + "loss": 3.947, + "step": 27943 + }, + { + "epoch": 0.16619088400418688, + "grad_norm": 2.550279140472412, + "learning_rate": 4.6669527784065295e-05, + "loss": 3.5335, + "step": 27944 + }, + { + "epoch": 0.16619683128746787, + "grad_norm": 2.455237627029419, + "learning_rate": 4.666929484348097e-05, + "loss": 3.5817, + "step": 27945 + }, + { + "epoch": 0.1662027785707489, + "grad_norm": 1.9026764631271362, + "learning_rate": 4.666906189533213e-05, + "loss": 3.4742, + "step": 27946 + }, + { + "epoch": 0.16620872585402988, + "grad_norm": 1.9334417581558228, + "learning_rate": 4.6668828939618845e-05, + "loss": 3.3938, + "step": 27947 + }, + { + "epoch": 0.16621467313731086, + "grad_norm": 1.9052705764770508, + "learning_rate": 4.666859597634119e-05, + "loss": 4.0506, + "step": 27948 + }, + { + "epoch": 0.16622062042059188, + "grad_norm": 1.702767252922058, + "learning_rate": 4.666836300549926e-05, + "loss": 5.1613, + "step": 27949 + }, + { + "epoch": 0.16622656770387287, + "grad_norm": 2.1399359703063965, + "learning_rate": 4.666813002709312e-05, + "loss": 4.9766, + "step": 27950 + }, + { + "epoch": 0.16623251498715386, + "grad_norm": 2.493435859680176, + "learning_rate": 4.666789704112286e-05, + "loss": 4.2058, + "step": 27951 + }, + { + "epoch": 0.16623846227043487, + "grad_norm": 2.689168930053711, + "learning_rate": 4.666766404758857e-05, + "loss": 3.7151, + "step": 27952 + }, + { + "epoch": 0.16624440955371586, + "grad_norm": 2.172666311264038, + "learning_rate": 4.666743104649031e-05, + "loss": 3.6916, + "step": 27953 + }, + { + "epoch": 0.16625035683699685, + "grad_norm": 1.551274299621582, + "learning_rate": 4.6667198037828173e-05, + "loss": 4.9331, + "step": 27954 + }, + { + "epoch": 0.16625630412027786, + "grad_norm": 1.7849092483520508, + "learning_rate": 4.666696502160226e-05, + "loss": 5.288, + "step": 27955 + }, + { + "epoch": 0.16626225140355885, + "grad_norm": 1.8850775957107544, + "learning_rate": 4.6666731997812614e-05, + "loss": 5.1946, + "step": 27956 + }, + { + "epoch": 0.16626819868683984, + "grad_norm": 1.4710248708724976, + "learning_rate": 4.666649896645934e-05, + "loss": 5.2753, + "step": 27957 + }, + { + "epoch": 0.16627414597012086, + "grad_norm": 1.4987525939941406, + "learning_rate": 4.6666265927542516e-05, + "loss": 5.3751, + "step": 27958 + }, + { + "epoch": 0.16628009325340184, + "grad_norm": 1.5894343852996826, + "learning_rate": 4.666603288106223e-05, + "loss": 5.1087, + "step": 27959 + }, + { + "epoch": 0.16628604053668283, + "grad_norm": 1.491098165512085, + "learning_rate": 4.666579982701855e-05, + "loss": 5.1876, + "step": 27960 + }, + { + "epoch": 0.16629198781996385, + "grad_norm": 1.6574211120605469, + "learning_rate": 4.666556676541156e-05, + "loss": 5.1677, + "step": 27961 + }, + { + "epoch": 0.16629793510324484, + "grad_norm": 1.3962849378585815, + "learning_rate": 4.666533369624135e-05, + "loss": 4.6312, + "step": 27962 + }, + { + "epoch": 0.16630388238652583, + "grad_norm": 1.3819752931594849, + "learning_rate": 4.6665100619507986e-05, + "loss": 5.1794, + "step": 27963 + }, + { + "epoch": 0.16630982966980684, + "grad_norm": 1.392821192741394, + "learning_rate": 4.666486753521157e-05, + "loss": 5.192, + "step": 27964 + }, + { + "epoch": 0.16631577695308783, + "grad_norm": 1.3655375242233276, + "learning_rate": 4.6664634443352176e-05, + "loss": 5.0533, + "step": 27965 + }, + { + "epoch": 0.16632172423636882, + "grad_norm": 1.7046358585357666, + "learning_rate": 4.6664401343929864e-05, + "loss": 4.7244, + "step": 27966 + }, + { + "epoch": 0.16632767151964983, + "grad_norm": 1.8924365043640137, + "learning_rate": 4.6664168236944747e-05, + "loss": 4.8182, + "step": 27967 + }, + { + "epoch": 0.16633361880293082, + "grad_norm": 1.7032650709152222, + "learning_rate": 4.666393512239689e-05, + "loss": 4.6594, + "step": 27968 + }, + { + "epoch": 0.1663395660862118, + "grad_norm": 2.0425281524658203, + "learning_rate": 4.666370200028638e-05, + "loss": 4.0096, + "step": 27969 + }, + { + "epoch": 0.16634551336949283, + "grad_norm": 2.4013113975524902, + "learning_rate": 4.666346887061329e-05, + "loss": 3.7662, + "step": 27970 + }, + { + "epoch": 0.1663514606527738, + "grad_norm": 1.8698662519454956, + "learning_rate": 4.666323573337771e-05, + "loss": 4.2575, + "step": 27971 + }, + { + "epoch": 0.1663574079360548, + "grad_norm": 1.5415421724319458, + "learning_rate": 4.666300258857972e-05, + "loss": 4.739, + "step": 27972 + }, + { + "epoch": 0.16636335521933582, + "grad_norm": 1.79619562625885, + "learning_rate": 4.666276943621939e-05, + "loss": 4.7542, + "step": 27973 + }, + { + "epoch": 0.1663693025026168, + "grad_norm": 1.5592199563980103, + "learning_rate": 4.666253627629682e-05, + "loss": 4.5968, + "step": 27974 + }, + { + "epoch": 0.1663752497858978, + "grad_norm": 1.7237550020217896, + "learning_rate": 4.666230310881208e-05, + "loss": 4.6581, + "step": 27975 + }, + { + "epoch": 0.1663811970691788, + "grad_norm": 1.6247119903564453, + "learning_rate": 4.6662069933765255e-05, + "loss": 4.6803, + "step": 27976 + }, + { + "epoch": 0.1663871443524598, + "grad_norm": 1.6257696151733398, + "learning_rate": 4.666183675115643e-05, + "loss": 4.7591, + "step": 27977 + }, + { + "epoch": 0.1663930916357408, + "grad_norm": 1.6353588104248047, + "learning_rate": 4.666160356098567e-05, + "loss": 4.3821, + "step": 27978 + }, + { + "epoch": 0.1663990389190218, + "grad_norm": 1.7179335355758667, + "learning_rate": 4.666137036325308e-05, + "loss": 4.6386, + "step": 27979 + }, + { + "epoch": 0.1664049862023028, + "grad_norm": 1.6724573373794556, + "learning_rate": 4.6661137157958716e-05, + "loss": 4.596, + "step": 27980 + }, + { + "epoch": 0.16641093348558378, + "grad_norm": 1.8331623077392578, + "learning_rate": 4.666090394510269e-05, + "loss": 4.6706, + "step": 27981 + }, + { + "epoch": 0.1664168807688648, + "grad_norm": 1.5815516710281372, + "learning_rate": 4.666067072468505e-05, + "loss": 4.5764, + "step": 27982 + }, + { + "epoch": 0.16642282805214578, + "grad_norm": 1.6047725677490234, + "learning_rate": 4.66604374967059e-05, + "loss": 4.4228, + "step": 27983 + }, + { + "epoch": 0.16642877533542677, + "grad_norm": 2.057325601577759, + "learning_rate": 4.666020426116531e-05, + "loss": 3.886, + "step": 27984 + }, + { + "epoch": 0.1664347226187078, + "grad_norm": 2.2633588314056396, + "learning_rate": 4.6659971018063375e-05, + "loss": 4.2796, + "step": 27985 + }, + { + "epoch": 0.16644066990198877, + "grad_norm": 1.9848732948303223, + "learning_rate": 4.6659737767400166e-05, + "loss": 4.4349, + "step": 27986 + }, + { + "epoch": 0.16644661718526976, + "grad_norm": 1.8116247653961182, + "learning_rate": 4.6659504509175764e-05, + "loss": 4.5313, + "step": 27987 + }, + { + "epoch": 0.16645256446855078, + "grad_norm": 1.8909553289413452, + "learning_rate": 4.665927124339026e-05, + "loss": 4.5166, + "step": 27988 + }, + { + "epoch": 0.16645851175183177, + "grad_norm": 1.6827013492584229, + "learning_rate": 4.665903797004371e-05, + "loss": 4.7353, + "step": 27989 + }, + { + "epoch": 0.16646445903511276, + "grad_norm": 1.8081045150756836, + "learning_rate": 4.6658804689136227e-05, + "loss": 4.743, + "step": 27990 + }, + { + "epoch": 0.16647040631839377, + "grad_norm": 1.7859995365142822, + "learning_rate": 4.665857140066788e-05, + "loss": 4.6476, + "step": 27991 + }, + { + "epoch": 0.16647635360167476, + "grad_norm": 2.158141613006592, + "learning_rate": 4.665833810463874e-05, + "loss": 4.1541, + "step": 27992 + }, + { + "epoch": 0.16648230088495575, + "grad_norm": 2.059534788131714, + "learning_rate": 4.665810480104891e-05, + "loss": 4.2993, + "step": 27993 + }, + { + "epoch": 0.16648824816823676, + "grad_norm": 2.0945677757263184, + "learning_rate": 4.665787148989845e-05, + "loss": 4.5941, + "step": 27994 + }, + { + "epoch": 0.16649419545151775, + "grad_norm": 1.8792952299118042, + "learning_rate": 4.6657638171187455e-05, + "loss": 4.5735, + "step": 27995 + }, + { + "epoch": 0.16650014273479874, + "grad_norm": 1.7018059492111206, + "learning_rate": 4.665740484491601e-05, + "loss": 4.6591, + "step": 27996 + }, + { + "epoch": 0.16650609001807973, + "grad_norm": 1.6992706060409546, + "learning_rate": 4.6657171511084175e-05, + "loss": 4.512, + "step": 27997 + }, + { + "epoch": 0.16651203730136074, + "grad_norm": 1.7492562532424927, + "learning_rate": 4.6656938169692054e-05, + "loss": 4.6722, + "step": 27998 + }, + { + "epoch": 0.16651798458464173, + "grad_norm": 1.6457120180130005, + "learning_rate": 4.665670482073972e-05, + "loss": 4.5632, + "step": 27999 + }, + { + "epoch": 0.16652393186792272, + "grad_norm": 1.9052523374557495, + "learning_rate": 4.6656471464227246e-05, + "loss": 4.5678, + "step": 28000 + }, + { + "epoch": 0.16652987915120374, + "grad_norm": 1.7932218313217163, + "learning_rate": 4.665623810015473e-05, + "loss": 4.5433, + "step": 28001 + }, + { + "epoch": 0.16653582643448472, + "grad_norm": 1.7252825498580933, + "learning_rate": 4.665600472852224e-05, + "loss": 4.3902, + "step": 28002 + }, + { + "epoch": 0.1665417737177657, + "grad_norm": 1.810628890991211, + "learning_rate": 4.665577134932986e-05, + "loss": 4.242, + "step": 28003 + }, + { + "epoch": 0.16654772100104673, + "grad_norm": 1.7332589626312256, + "learning_rate": 4.6655537962577676e-05, + "loss": 4.2713, + "step": 28004 + }, + { + "epoch": 0.16655366828432772, + "grad_norm": 1.720533847808838, + "learning_rate": 4.6655304568265776e-05, + "loss": 4.3828, + "step": 28005 + }, + { + "epoch": 0.1665596155676087, + "grad_norm": 1.680240511894226, + "learning_rate": 4.665507116639423e-05, + "loss": 4.4578, + "step": 28006 + }, + { + "epoch": 0.16656556285088972, + "grad_norm": 1.6451648473739624, + "learning_rate": 4.665483775696311e-05, + "loss": 4.4493, + "step": 28007 + }, + { + "epoch": 0.1665715101341707, + "grad_norm": 1.8150712251663208, + "learning_rate": 4.665460433997252e-05, + "loss": 4.353, + "step": 28008 + }, + { + "epoch": 0.1665774574174517, + "grad_norm": 1.6858443021774292, + "learning_rate": 4.665437091542253e-05, + "loss": 4.2929, + "step": 28009 + }, + { + "epoch": 0.1665834047007327, + "grad_norm": 1.7269021272659302, + "learning_rate": 4.665413748331322e-05, + "loss": 4.2856, + "step": 28010 + }, + { + "epoch": 0.1665893519840137, + "grad_norm": 1.6517678499221802, + "learning_rate": 4.665390404364468e-05, + "loss": 4.977, + "step": 28011 + }, + { + "epoch": 0.1665952992672947, + "grad_norm": 1.8300232887268066, + "learning_rate": 4.665367059641698e-05, + "loss": 4.3829, + "step": 28012 + }, + { + "epoch": 0.1666012465505757, + "grad_norm": 1.7685927152633667, + "learning_rate": 4.6653437141630215e-05, + "loss": 4.3178, + "step": 28013 + }, + { + "epoch": 0.1666071938338567, + "grad_norm": 1.944615125656128, + "learning_rate": 4.665320367928445e-05, + "loss": 4.2248, + "step": 28014 + }, + { + "epoch": 0.16661314111713768, + "grad_norm": 2.097490072250366, + "learning_rate": 4.6652970209379775e-05, + "loss": 4.2814, + "step": 28015 + }, + { + "epoch": 0.1666190884004187, + "grad_norm": 1.5824095010757446, + "learning_rate": 4.665273673191628e-05, + "loss": 4.2074, + "step": 28016 + }, + { + "epoch": 0.16662503568369968, + "grad_norm": 1.6979020833969116, + "learning_rate": 4.665250324689403e-05, + "loss": 4.3534, + "step": 28017 + }, + { + "epoch": 0.16663098296698067, + "grad_norm": 1.7754404544830322, + "learning_rate": 4.6652269754313125e-05, + "loss": 4.3066, + "step": 28018 + }, + { + "epoch": 0.1666369302502617, + "grad_norm": 1.8645826578140259, + "learning_rate": 4.665203625417363e-05, + "loss": 4.1896, + "step": 28019 + }, + { + "epoch": 0.16664287753354268, + "grad_norm": 1.8967339992523193, + "learning_rate": 4.6651802746475633e-05, + "loss": 4.4092, + "step": 28020 + }, + { + "epoch": 0.16664882481682367, + "grad_norm": 1.76931893825531, + "learning_rate": 4.665156923121922e-05, + "loss": 4.5632, + "step": 28021 + }, + { + "epoch": 0.16665477210010468, + "grad_norm": 2.338927745819092, + "learning_rate": 4.665133570840446e-05, + "loss": 4.2858, + "step": 28022 + }, + { + "epoch": 0.16666071938338567, + "grad_norm": 1.747149109840393, + "learning_rate": 4.665110217803145e-05, + "loss": 4.6505, + "step": 28023 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 2.8555750846862793, + "learning_rate": 4.6650868640100254e-05, + "loss": 4.4681, + "step": 28024 + }, + { + "epoch": 0.16667261394994767, + "grad_norm": 2.284841299057007, + "learning_rate": 4.665063509461097e-05, + "loss": 3.9607, + "step": 28025 + }, + { + "epoch": 0.16667856123322866, + "grad_norm": 2.51346492767334, + "learning_rate": 4.6650401541563673e-05, + "loss": 3.8373, + "step": 28026 + }, + { + "epoch": 0.16668450851650965, + "grad_norm": 2.33309006690979, + "learning_rate": 4.6650167980958444e-05, + "loss": 3.8783, + "step": 28027 + }, + { + "epoch": 0.16669045579979067, + "grad_norm": 1.886756420135498, + "learning_rate": 4.664993441279536e-05, + "loss": 4.4607, + "step": 28028 + }, + { + "epoch": 0.16669640308307165, + "grad_norm": 1.6356357336044312, + "learning_rate": 4.664970083707452e-05, + "loss": 4.2901, + "step": 28029 + }, + { + "epoch": 0.16670235036635264, + "grad_norm": 2.366969108581543, + "learning_rate": 4.664946725379598e-05, + "loss": 3.7561, + "step": 28030 + }, + { + "epoch": 0.16670829764963366, + "grad_norm": 2.387471914291382, + "learning_rate": 4.664923366295984e-05, + "loss": 3.745, + "step": 28031 + }, + { + "epoch": 0.16671424493291465, + "grad_norm": 2.0741076469421387, + "learning_rate": 4.664900006456617e-05, + "loss": 3.6031, + "step": 28032 + }, + { + "epoch": 0.16672019221619563, + "grad_norm": 2.5359690189361572, + "learning_rate": 4.6648766458615066e-05, + "loss": 3.8495, + "step": 28033 + }, + { + "epoch": 0.16672613949947665, + "grad_norm": 2.423198938369751, + "learning_rate": 4.6648532845106604e-05, + "loss": 3.5664, + "step": 28034 + }, + { + "epoch": 0.16673208678275764, + "grad_norm": 1.7040067911148071, + "learning_rate": 4.664829922404086e-05, + "loss": 4.4474, + "step": 28035 + }, + { + "epoch": 0.16673803406603863, + "grad_norm": 1.8603174686431885, + "learning_rate": 4.664806559541791e-05, + "loss": 4.7263, + "step": 28036 + }, + { + "epoch": 0.16674398134931964, + "grad_norm": 1.7510238885879517, + "learning_rate": 4.664783195923785e-05, + "loss": 4.5566, + "step": 28037 + }, + { + "epoch": 0.16674992863260063, + "grad_norm": 1.6786305904388428, + "learning_rate": 4.6647598315500764e-05, + "loss": 4.5139, + "step": 28038 + }, + { + "epoch": 0.16675587591588162, + "grad_norm": 1.7382848262786865, + "learning_rate": 4.664736466420671e-05, + "loss": 4.5474, + "step": 28039 + }, + { + "epoch": 0.16676182319916263, + "grad_norm": 2.1977128982543945, + "learning_rate": 4.664713100535579e-05, + "loss": 3.8657, + "step": 28040 + }, + { + "epoch": 0.16676777048244362, + "grad_norm": 2.147538185119629, + "learning_rate": 4.664689733894808e-05, + "loss": 4.3258, + "step": 28041 + }, + { + "epoch": 0.1667737177657246, + "grad_norm": 1.6165980100631714, + "learning_rate": 4.6646663664983667e-05, + "loss": 5.6764, + "step": 28042 + }, + { + "epoch": 0.16677966504900563, + "grad_norm": 1.5513676404953003, + "learning_rate": 4.664642998346263e-05, + "loss": 5.286, + "step": 28043 + }, + { + "epoch": 0.16678561233228661, + "grad_norm": 2.4869754314422607, + "learning_rate": 4.664619629438504e-05, + "loss": 3.9925, + "step": 28044 + }, + { + "epoch": 0.1667915596155676, + "grad_norm": 1.9613736867904663, + "learning_rate": 4.6645962597750985e-05, + "loss": 4.3125, + "step": 28045 + }, + { + "epoch": 0.16679750689884862, + "grad_norm": 2.345310688018799, + "learning_rate": 4.664572889356055e-05, + "loss": 4.5785, + "step": 28046 + }, + { + "epoch": 0.1668034541821296, + "grad_norm": 1.6253316402435303, + "learning_rate": 4.664549518181382e-05, + "loss": 4.9924, + "step": 28047 + }, + { + "epoch": 0.1668094014654106, + "grad_norm": 1.7358524799346924, + "learning_rate": 4.664526146251087e-05, + "loss": 4.7523, + "step": 28048 + }, + { + "epoch": 0.1668153487486916, + "grad_norm": 1.6783114671707153, + "learning_rate": 4.664502773565178e-05, + "loss": 5.1525, + "step": 28049 + }, + { + "epoch": 0.1668212960319726, + "grad_norm": 1.102388858795166, + "learning_rate": 4.664479400123663e-05, + "loss": 5.5695, + "step": 28050 + }, + { + "epoch": 0.1668272433152536, + "grad_norm": 1.6548655033111572, + "learning_rate": 4.664456025926551e-05, + "loss": 4.7331, + "step": 28051 + }, + { + "epoch": 0.1668331905985346, + "grad_norm": 1.6468528509140015, + "learning_rate": 4.66443265097385e-05, + "loss": 4.7818, + "step": 28052 + }, + { + "epoch": 0.1668391378818156, + "grad_norm": 1.623849630355835, + "learning_rate": 4.664409275265568e-05, + "loss": 4.9336, + "step": 28053 + }, + { + "epoch": 0.16684508516509658, + "grad_norm": 1.4946188926696777, + "learning_rate": 4.664385898801713e-05, + "loss": 4.9361, + "step": 28054 + }, + { + "epoch": 0.16685103244837757, + "grad_norm": 1.6323179006576538, + "learning_rate": 4.664362521582293e-05, + "loss": 4.9529, + "step": 28055 + }, + { + "epoch": 0.16685697973165858, + "grad_norm": 1.579441785812378, + "learning_rate": 4.6643391436073165e-05, + "loss": 4.5593, + "step": 28056 + }, + { + "epoch": 0.16686292701493957, + "grad_norm": 1.578658103942871, + "learning_rate": 4.664315764876791e-05, + "loss": 4.5736, + "step": 28057 + }, + { + "epoch": 0.16686887429822056, + "grad_norm": 1.661720633506775, + "learning_rate": 4.664292385390726e-05, + "loss": 4.9137, + "step": 28058 + }, + { + "epoch": 0.16687482158150158, + "grad_norm": 1.9020450115203857, + "learning_rate": 4.664269005149129e-05, + "loss": 5.3975, + "step": 28059 + }, + { + "epoch": 0.16688076886478256, + "grad_norm": 2.0544557571411133, + "learning_rate": 4.664245624152007e-05, + "loss": 5.3485, + "step": 28060 + }, + { + "epoch": 0.16688671614806355, + "grad_norm": 1.8861839771270752, + "learning_rate": 4.664222242399371e-05, + "loss": 5.3917, + "step": 28061 + }, + { + "epoch": 0.16689266343134457, + "grad_norm": 1.7115676403045654, + "learning_rate": 4.6641988598912256e-05, + "loss": 5.3101, + "step": 28062 + }, + { + "epoch": 0.16689861071462556, + "grad_norm": 2.6457252502441406, + "learning_rate": 4.6641754766275815e-05, + "loss": 4.8323, + "step": 28063 + }, + { + "epoch": 0.16690455799790654, + "grad_norm": 1.9158306121826172, + "learning_rate": 4.664152092608446e-05, + "loss": 4.902, + "step": 28064 + }, + { + "epoch": 0.16691050528118756, + "grad_norm": 1.5592490434646606, + "learning_rate": 4.664128707833828e-05, + "loss": 4.85, + "step": 28065 + }, + { + "epoch": 0.16691645256446855, + "grad_norm": 1.8784046173095703, + "learning_rate": 4.664105322303734e-05, + "loss": 4.9118, + "step": 28066 + }, + { + "epoch": 0.16692239984774954, + "grad_norm": 1.8043493032455444, + "learning_rate": 4.6640819360181734e-05, + "loss": 4.8248, + "step": 28067 + }, + { + "epoch": 0.16692834713103055, + "grad_norm": 1.925399661064148, + "learning_rate": 4.664058548977155e-05, + "loss": 4.8808, + "step": 28068 + }, + { + "epoch": 0.16693429441431154, + "grad_norm": 2.1420938968658447, + "learning_rate": 4.664035161180686e-05, + "loss": 4.5251, + "step": 28069 + }, + { + "epoch": 0.16694024169759253, + "grad_norm": 1.3386578559875488, + "learning_rate": 4.664011772628774e-05, + "loss": 5.0788, + "step": 28070 + }, + { + "epoch": 0.16694618898087354, + "grad_norm": 1.7500650882720947, + "learning_rate": 4.663988383321427e-05, + "loss": 4.6332, + "step": 28071 + }, + { + "epoch": 0.16695213626415453, + "grad_norm": 1.6339102983474731, + "learning_rate": 4.6639649932586555e-05, + "loss": 4.9342, + "step": 28072 + }, + { + "epoch": 0.16695808354743552, + "grad_norm": 1.634045124053955, + "learning_rate": 4.6639416024404655e-05, + "loss": 4.8166, + "step": 28073 + }, + { + "epoch": 0.16696403083071654, + "grad_norm": 1.6168557405471802, + "learning_rate": 4.663918210866866e-05, + "loss": 4.9086, + "step": 28074 + }, + { + "epoch": 0.16696997811399752, + "grad_norm": 1.7027981281280518, + "learning_rate": 4.663894818537865e-05, + "loss": 4.7404, + "step": 28075 + }, + { + "epoch": 0.1669759253972785, + "grad_norm": 1.621127724647522, + "learning_rate": 4.663871425453471e-05, + "loss": 4.8458, + "step": 28076 + }, + { + "epoch": 0.16698187268055953, + "grad_norm": 1.524674415588379, + "learning_rate": 4.663848031613691e-05, + "loss": 4.8977, + "step": 28077 + }, + { + "epoch": 0.16698781996384052, + "grad_norm": 1.3619705438613892, + "learning_rate": 4.663824637018535e-05, + "loss": 4.6809, + "step": 28078 + }, + { + "epoch": 0.1669937672471215, + "grad_norm": 1.6202057600021362, + "learning_rate": 4.66380124166801e-05, + "loss": 4.2993, + "step": 28079 + }, + { + "epoch": 0.16699971453040252, + "grad_norm": 1.6400598287582397, + "learning_rate": 4.663777845562124e-05, + "loss": 4.498, + "step": 28080 + }, + { + "epoch": 0.1670056618136835, + "grad_norm": 1.6038521528244019, + "learning_rate": 4.663754448700885e-05, + "loss": 4.5864, + "step": 28081 + }, + { + "epoch": 0.1670116090969645, + "grad_norm": 1.6111528873443604, + "learning_rate": 4.663731051084303e-05, + "loss": 4.77, + "step": 28082 + }, + { + "epoch": 0.1670175563802455, + "grad_norm": 1.7978882789611816, + "learning_rate": 4.663707652712384e-05, + "loss": 4.2634, + "step": 28083 + }, + { + "epoch": 0.1670235036635265, + "grad_norm": 1.8533109426498413, + "learning_rate": 4.6636842535851374e-05, + "loss": 4.6601, + "step": 28084 + }, + { + "epoch": 0.1670294509468075, + "grad_norm": 1.776292324066162, + "learning_rate": 4.663660853702571e-05, + "loss": 4.4957, + "step": 28085 + }, + { + "epoch": 0.1670353982300885, + "grad_norm": 2.15081524848938, + "learning_rate": 4.663637453064692e-05, + "loss": 4.2726, + "step": 28086 + }, + { + "epoch": 0.1670413455133695, + "grad_norm": 1.4586591720581055, + "learning_rate": 4.6636140516715104e-05, + "loss": 5.4757, + "step": 28087 + }, + { + "epoch": 0.16704729279665048, + "grad_norm": 1.4819058179855347, + "learning_rate": 4.663590649523033e-05, + "loss": 5.6895, + "step": 28088 + }, + { + "epoch": 0.1670532400799315, + "grad_norm": 1.8194465637207031, + "learning_rate": 4.663567246619269e-05, + "loss": 4.5697, + "step": 28089 + }, + { + "epoch": 0.16705918736321249, + "grad_norm": 1.8187286853790283, + "learning_rate": 4.663543842960226e-05, + "loss": 4.4745, + "step": 28090 + }, + { + "epoch": 0.16706513464649347, + "grad_norm": 1.7815576791763306, + "learning_rate": 4.663520438545912e-05, + "loss": 4.7309, + "step": 28091 + }, + { + "epoch": 0.1670710819297745, + "grad_norm": 1.9799631834030151, + "learning_rate": 4.663497033376335e-05, + "loss": 4.5429, + "step": 28092 + }, + { + "epoch": 0.16707702921305548, + "grad_norm": 1.7019764184951782, + "learning_rate": 4.663473627451504e-05, + "loss": 4.402, + "step": 28093 + }, + { + "epoch": 0.16708297649633647, + "grad_norm": 1.9056285619735718, + "learning_rate": 4.663450220771427e-05, + "loss": 4.3428, + "step": 28094 + }, + { + "epoch": 0.16708892377961748, + "grad_norm": 1.877556562423706, + "learning_rate": 4.663426813336112e-05, + "loss": 4.4579, + "step": 28095 + }, + { + "epoch": 0.16709487106289847, + "grad_norm": 1.6415005922317505, + "learning_rate": 4.663403405145565e-05, + "loss": 5.1392, + "step": 28096 + }, + { + "epoch": 0.16710081834617946, + "grad_norm": 2.0315005779266357, + "learning_rate": 4.663379996199798e-05, + "loss": 4.5666, + "step": 28097 + }, + { + "epoch": 0.16710676562946047, + "grad_norm": 1.744367241859436, + "learning_rate": 4.663356586498817e-05, + "loss": 4.6629, + "step": 28098 + }, + { + "epoch": 0.16711271291274146, + "grad_norm": 1.8645330667495728, + "learning_rate": 4.663333176042631e-05, + "loss": 4.2716, + "step": 28099 + }, + { + "epoch": 0.16711866019602245, + "grad_norm": 1.6384168863296509, + "learning_rate": 4.6633097648312476e-05, + "loss": 4.1565, + "step": 28100 + }, + { + "epoch": 0.16712460747930347, + "grad_norm": 2.0455496311187744, + "learning_rate": 4.663286352864675e-05, + "loss": 4.3342, + "step": 28101 + }, + { + "epoch": 0.16713055476258445, + "grad_norm": 1.689454197883606, + "learning_rate": 4.663262940142921e-05, + "loss": 5.1503, + "step": 28102 + }, + { + "epoch": 0.16713650204586544, + "grad_norm": 1.7138323783874512, + "learning_rate": 4.663239526665995e-05, + "loss": 4.3616, + "step": 28103 + }, + { + "epoch": 0.16714244932914646, + "grad_norm": 2.171147584915161, + "learning_rate": 4.663216112433904e-05, + "loss": 4.3054, + "step": 28104 + }, + { + "epoch": 0.16714839661242745, + "grad_norm": 2.5418312549591064, + "learning_rate": 4.663192697446657e-05, + "loss": 3.387, + "step": 28105 + }, + { + "epoch": 0.16715434389570843, + "grad_norm": 1.5790460109710693, + "learning_rate": 4.6631692817042615e-05, + "loss": 5.2555, + "step": 28106 + }, + { + "epoch": 0.16716029117898945, + "grad_norm": 1.4285277128219604, + "learning_rate": 4.663145865206726e-05, + "loss": 5.2408, + "step": 28107 + }, + { + "epoch": 0.16716623846227044, + "grad_norm": 1.3292522430419922, + "learning_rate": 4.663122447954058e-05, + "loss": 5.1494, + "step": 28108 + }, + { + "epoch": 0.16717218574555143, + "grad_norm": 1.7032718658447266, + "learning_rate": 4.663099029946267e-05, + "loss": 4.7939, + "step": 28109 + }, + { + "epoch": 0.16717813302883244, + "grad_norm": 1.6049028635025024, + "learning_rate": 4.6630756111833605e-05, + "loss": 4.9406, + "step": 28110 + }, + { + "epoch": 0.16718408031211343, + "grad_norm": 1.4805787801742554, + "learning_rate": 4.663052191665347e-05, + "loss": 4.9251, + "step": 28111 + }, + { + "epoch": 0.16719002759539442, + "grad_norm": 1.585306167602539, + "learning_rate": 4.663028771392234e-05, + "loss": 5.3119, + "step": 28112 + }, + { + "epoch": 0.1671959748786754, + "grad_norm": 1.5918222665786743, + "learning_rate": 4.663005350364029e-05, + "loss": 5.1405, + "step": 28113 + }, + { + "epoch": 0.16720192216195642, + "grad_norm": 1.5273454189300537, + "learning_rate": 4.6629819285807426e-05, + "loss": 4.9654, + "step": 28114 + }, + { + "epoch": 0.1672078694452374, + "grad_norm": 2.3424551486968994, + "learning_rate": 4.662958506042381e-05, + "loss": 4.6364, + "step": 28115 + }, + { + "epoch": 0.1672138167285184, + "grad_norm": 1.5244309902191162, + "learning_rate": 4.6629350827489527e-05, + "loss": 5.1469, + "step": 28116 + }, + { + "epoch": 0.16721976401179942, + "grad_norm": 1.6393519639968872, + "learning_rate": 4.662911658700466e-05, + "loss": 5.3803, + "step": 28117 + }, + { + "epoch": 0.1672257112950804, + "grad_norm": 1.6506540775299072, + "learning_rate": 4.662888233896929e-05, + "loss": 5.2188, + "step": 28118 + }, + { + "epoch": 0.1672316585783614, + "grad_norm": 1.481735110282898, + "learning_rate": 4.6628648083383516e-05, + "loss": 5.4692, + "step": 28119 + }, + { + "epoch": 0.1672376058616424, + "grad_norm": 1.5239784717559814, + "learning_rate": 4.662841382024739e-05, + "loss": 5.5937, + "step": 28120 + }, + { + "epoch": 0.1672435531449234, + "grad_norm": 1.7525306940078735, + "learning_rate": 4.662817954956101e-05, + "loss": 5.2913, + "step": 28121 + }, + { + "epoch": 0.16724950042820438, + "grad_norm": 1.5808900594711304, + "learning_rate": 4.662794527132446e-05, + "loss": 4.8341, + "step": 28122 + }, + { + "epoch": 0.1672554477114854, + "grad_norm": 1.7503292560577393, + "learning_rate": 4.662771098553782e-05, + "loss": 4.6066, + "step": 28123 + }, + { + "epoch": 0.1672613949947664, + "grad_norm": 2.0583229064941406, + "learning_rate": 4.662747669220116e-05, + "loss": 4.747, + "step": 28124 + }, + { + "epoch": 0.16726734227804738, + "grad_norm": 1.8209635019302368, + "learning_rate": 4.662724239131458e-05, + "loss": 4.6837, + "step": 28125 + }, + { + "epoch": 0.1672732895613284, + "grad_norm": 1.3161481618881226, + "learning_rate": 4.662700808287815e-05, + "loss": 5.1877, + "step": 28126 + }, + { + "epoch": 0.16727923684460938, + "grad_norm": 1.492100715637207, + "learning_rate": 4.662677376689195e-05, + "loss": 5.0719, + "step": 28127 + }, + { + "epoch": 0.16728518412789037, + "grad_norm": 1.5123339891433716, + "learning_rate": 4.662653944335608e-05, + "loss": 5.2237, + "step": 28128 + }, + { + "epoch": 0.16729113141117138, + "grad_norm": 1.3963336944580078, + "learning_rate": 4.66263051122706e-05, + "loss": 5.5465, + "step": 28129 + }, + { + "epoch": 0.16729707869445237, + "grad_norm": 1.4128196239471436, + "learning_rate": 4.662607077363559e-05, + "loss": 5.4236, + "step": 28130 + }, + { + "epoch": 0.16730302597773336, + "grad_norm": 1.5107556581497192, + "learning_rate": 4.662583642745116e-05, + "loss": 5.411, + "step": 28131 + }, + { + "epoch": 0.16730897326101438, + "grad_norm": 1.4282488822937012, + "learning_rate": 4.662560207371737e-05, + "loss": 5.4301, + "step": 28132 + }, + { + "epoch": 0.16731492054429536, + "grad_norm": 1.7082507610321045, + "learning_rate": 4.6625367712434295e-05, + "loss": 5.2167, + "step": 28133 + }, + { + "epoch": 0.16732086782757635, + "grad_norm": 1.4769392013549805, + "learning_rate": 4.662513334360204e-05, + "loss": 4.8894, + "step": 28134 + }, + { + "epoch": 0.16732681511085737, + "grad_norm": 1.6305506229400635, + "learning_rate": 4.6624898967220664e-05, + "loss": 5.2891, + "step": 28135 + }, + { + "epoch": 0.16733276239413836, + "grad_norm": 1.4358271360397339, + "learning_rate": 4.662466458329027e-05, + "loss": 5.4362, + "step": 28136 + }, + { + "epoch": 0.16733870967741934, + "grad_norm": 1.3945128917694092, + "learning_rate": 4.662443019181092e-05, + "loss": 5.4208, + "step": 28137 + }, + { + "epoch": 0.16734465696070036, + "grad_norm": 1.3432549238204956, + "learning_rate": 4.662419579278271e-05, + "loss": 5.4326, + "step": 28138 + }, + { + "epoch": 0.16735060424398135, + "grad_norm": 1.3106540441513062, + "learning_rate": 4.662396138620571e-05, + "loss": 5.554, + "step": 28139 + }, + { + "epoch": 0.16735655152726234, + "grad_norm": 1.449013590812683, + "learning_rate": 4.662372697208002e-05, + "loss": 5.3896, + "step": 28140 + }, + { + "epoch": 0.16736249881054335, + "grad_norm": 1.2621738910675049, + "learning_rate": 4.66234925504057e-05, + "loss": 5.5235, + "step": 28141 + }, + { + "epoch": 0.16736844609382434, + "grad_norm": 1.5813289880752563, + "learning_rate": 4.6623258121182845e-05, + "loss": 5.4607, + "step": 28142 + }, + { + "epoch": 0.16737439337710533, + "grad_norm": 1.4719443321228027, + "learning_rate": 4.662302368441154e-05, + "loss": 5.2416, + "step": 28143 + }, + { + "epoch": 0.16738034066038635, + "grad_norm": 1.3261717557907104, + "learning_rate": 4.662278924009185e-05, + "loss": 5.2426, + "step": 28144 + }, + { + "epoch": 0.16738628794366733, + "grad_norm": 1.409119725227356, + "learning_rate": 4.6622554788223874e-05, + "loss": 4.8306, + "step": 28145 + }, + { + "epoch": 0.16739223522694832, + "grad_norm": 1.3746771812438965, + "learning_rate": 4.662232032880769e-05, + "loss": 5.3939, + "step": 28146 + }, + { + "epoch": 0.16739818251022934, + "grad_norm": 1.5453044176101685, + "learning_rate": 4.662208586184337e-05, + "loss": 5.2989, + "step": 28147 + }, + { + "epoch": 0.16740412979351033, + "grad_norm": 2.140986919403076, + "learning_rate": 4.6621851387331003e-05, + "loss": 4.7526, + "step": 28148 + }, + { + "epoch": 0.1674100770767913, + "grad_norm": 1.305344820022583, + "learning_rate": 4.662161690527068e-05, + "loss": 5.3339, + "step": 28149 + }, + { + "epoch": 0.16741602436007233, + "grad_norm": 1.200656533241272, + "learning_rate": 4.662138241566247e-05, + "loss": 5.2464, + "step": 28150 + }, + { + "epoch": 0.16742197164335332, + "grad_norm": 1.2441010475158691, + "learning_rate": 4.6621147918506457e-05, + "loss": 5.4545, + "step": 28151 + }, + { + "epoch": 0.1674279189266343, + "grad_norm": 1.6146814823150635, + "learning_rate": 4.662091341380272e-05, + "loss": 4.9968, + "step": 28152 + }, + { + "epoch": 0.16743386620991532, + "grad_norm": 1.2502530813217163, + "learning_rate": 4.6620678901551354e-05, + "loss": 5.3297, + "step": 28153 + }, + { + "epoch": 0.1674398134931963, + "grad_norm": 1.5260026454925537, + "learning_rate": 4.662044438175243e-05, + "loss": 5.2643, + "step": 28154 + }, + { + "epoch": 0.1674457607764773, + "grad_norm": 1.2725012302398682, + "learning_rate": 4.662020985440603e-05, + "loss": 5.4469, + "step": 28155 + }, + { + "epoch": 0.16745170805975831, + "grad_norm": 1.717331051826477, + "learning_rate": 4.661997531951224e-05, + "loss": 5.2711, + "step": 28156 + }, + { + "epoch": 0.1674576553430393, + "grad_norm": 1.6104686260223389, + "learning_rate": 4.661974077707114e-05, + "loss": 5.0773, + "step": 28157 + }, + { + "epoch": 0.1674636026263203, + "grad_norm": 1.568558692932129, + "learning_rate": 4.661950622708281e-05, + "loss": 4.4339, + "step": 28158 + }, + { + "epoch": 0.1674695499096013, + "grad_norm": 1.5101975202560425, + "learning_rate": 4.661927166954734e-05, + "loss": 3.9035, + "step": 28159 + }, + { + "epoch": 0.1674754971928823, + "grad_norm": 1.6529417037963867, + "learning_rate": 4.66190371044648e-05, + "loss": 3.8917, + "step": 28160 + }, + { + "epoch": 0.16748144447616328, + "grad_norm": 1.2637635469436646, + "learning_rate": 4.6618802531835285e-05, + "loss": 5.2091, + "step": 28161 + }, + { + "epoch": 0.1674873917594443, + "grad_norm": 1.4303425550460815, + "learning_rate": 4.661856795165886e-05, + "loss": 5.368, + "step": 28162 + }, + { + "epoch": 0.1674933390427253, + "grad_norm": 1.8119208812713623, + "learning_rate": 4.661833336393562e-05, + "loss": 4.257, + "step": 28163 + }, + { + "epoch": 0.16749928632600627, + "grad_norm": 2.0059077739715576, + "learning_rate": 4.661809876866564e-05, + "loss": 4.225, + "step": 28164 + }, + { + "epoch": 0.1675052336092873, + "grad_norm": 1.87846839427948, + "learning_rate": 4.6617864165849005e-05, + "loss": 4.182, + "step": 28165 + }, + { + "epoch": 0.16751118089256828, + "grad_norm": 1.5655750036239624, + "learning_rate": 4.66176295554858e-05, + "loss": 5.441, + "step": 28166 + }, + { + "epoch": 0.16751712817584927, + "grad_norm": 1.735921025276184, + "learning_rate": 4.661739493757611e-05, + "loss": 5.1415, + "step": 28167 + }, + { + "epoch": 0.16752307545913028, + "grad_norm": 1.6819477081298828, + "learning_rate": 4.661716031212e-05, + "loss": 5.2213, + "step": 28168 + }, + { + "epoch": 0.16752902274241127, + "grad_norm": 1.5038045644760132, + "learning_rate": 4.661692567911756e-05, + "loss": 4.3357, + "step": 28169 + }, + { + "epoch": 0.16753497002569226, + "grad_norm": 1.8683745861053467, + "learning_rate": 4.6616691038568885e-05, + "loss": 4.5498, + "step": 28170 + }, + { + "epoch": 0.16754091730897325, + "grad_norm": 1.6156747341156006, + "learning_rate": 4.661645639047405e-05, + "loss": 4.7422, + "step": 28171 + }, + { + "epoch": 0.16754686459225426, + "grad_norm": 1.8638094663619995, + "learning_rate": 4.661622173483312e-05, + "loss": 4.4363, + "step": 28172 + }, + { + "epoch": 0.16755281187553525, + "grad_norm": 1.800417184829712, + "learning_rate": 4.6615987071646194e-05, + "loss": 4.355, + "step": 28173 + }, + { + "epoch": 0.16755875915881624, + "grad_norm": 1.765234351158142, + "learning_rate": 4.661575240091336e-05, + "loss": 4.3521, + "step": 28174 + }, + { + "epoch": 0.16756470644209726, + "grad_norm": 1.7296849489212036, + "learning_rate": 4.661551772263468e-05, + "loss": 4.8884, + "step": 28175 + }, + { + "epoch": 0.16757065372537824, + "grad_norm": 1.609222650527954, + "learning_rate": 4.661528303681025e-05, + "loss": 4.6088, + "step": 28176 + }, + { + "epoch": 0.16757660100865923, + "grad_norm": 1.910651445388794, + "learning_rate": 4.6615048343440145e-05, + "loss": 4.3531, + "step": 28177 + }, + { + "epoch": 0.16758254829194025, + "grad_norm": 1.6934939622879028, + "learning_rate": 4.6614813642524454e-05, + "loss": 4.1895, + "step": 28178 + }, + { + "epoch": 0.16758849557522124, + "grad_norm": 1.630308985710144, + "learning_rate": 4.6614578934063244e-05, + "loss": 4.5883, + "step": 28179 + }, + { + "epoch": 0.16759444285850222, + "grad_norm": 1.4629896879196167, + "learning_rate": 4.6614344218056624e-05, + "loss": 4.4655, + "step": 28180 + }, + { + "epoch": 0.16760039014178324, + "grad_norm": 1.522980809211731, + "learning_rate": 4.6614109494504654e-05, + "loss": 5.3745, + "step": 28181 + }, + { + "epoch": 0.16760633742506423, + "grad_norm": 1.3758256435394287, + "learning_rate": 4.661387476340742e-05, + "loss": 5.4374, + "step": 28182 + }, + { + "epoch": 0.16761228470834522, + "grad_norm": 1.4767520427703857, + "learning_rate": 4.661364002476501e-05, + "loss": 5.4039, + "step": 28183 + }, + { + "epoch": 0.16761823199162623, + "grad_norm": 1.3167197704315186, + "learning_rate": 4.661340527857749e-05, + "loss": 5.3886, + "step": 28184 + }, + { + "epoch": 0.16762417927490722, + "grad_norm": 1.8137489557266235, + "learning_rate": 4.661317052484496e-05, + "loss": 4.6928, + "step": 28185 + }, + { + "epoch": 0.1676301265581882, + "grad_norm": 1.7553741931915283, + "learning_rate": 4.66129357635675e-05, + "loss": 5.0159, + "step": 28186 + }, + { + "epoch": 0.16763607384146922, + "grad_norm": 1.341352939605713, + "learning_rate": 4.661270099474518e-05, + "loss": 5.4529, + "step": 28187 + }, + { + "epoch": 0.1676420211247502, + "grad_norm": 1.553514003753662, + "learning_rate": 4.661246621837809e-05, + "loss": 5.1907, + "step": 28188 + }, + { + "epoch": 0.1676479684080312, + "grad_norm": 1.4974607229232788, + "learning_rate": 4.661223143446631e-05, + "loss": 5.2143, + "step": 28189 + }, + { + "epoch": 0.16765391569131222, + "grad_norm": 1.5769060850143433, + "learning_rate": 4.661199664300993e-05, + "loss": 5.1265, + "step": 28190 + }, + { + "epoch": 0.1676598629745932, + "grad_norm": 1.4753527641296387, + "learning_rate": 4.6611761844009026e-05, + "loss": 4.974, + "step": 28191 + }, + { + "epoch": 0.1676658102578742, + "grad_norm": 1.5406947135925293, + "learning_rate": 4.661152703746368e-05, + "loss": 4.8269, + "step": 28192 + }, + { + "epoch": 0.1676717575411552, + "grad_norm": 1.864577054977417, + "learning_rate": 4.661129222337397e-05, + "loss": 4.505, + "step": 28193 + }, + { + "epoch": 0.1676777048244362, + "grad_norm": 1.561606526374817, + "learning_rate": 4.6611057401739976e-05, + "loss": 4.6992, + "step": 28194 + }, + { + "epoch": 0.16768365210771718, + "grad_norm": 1.6339094638824463, + "learning_rate": 4.661082257256179e-05, + "loss": 4.8973, + "step": 28195 + }, + { + "epoch": 0.1676895993909982, + "grad_norm": 1.8106483221054077, + "learning_rate": 4.661058773583949e-05, + "loss": 4.5909, + "step": 28196 + }, + { + "epoch": 0.1676955466742792, + "grad_norm": 1.6181379556655884, + "learning_rate": 4.661035289157316e-05, + "loss": 5.225, + "step": 28197 + }, + { + "epoch": 0.16770149395756018, + "grad_norm": 1.8745672702789307, + "learning_rate": 4.6610118039762876e-05, + "loss": 4.6381, + "step": 28198 + }, + { + "epoch": 0.1677074412408412, + "grad_norm": 1.6809148788452148, + "learning_rate": 4.6609883180408717e-05, + "loss": 4.879, + "step": 28199 + }, + { + "epoch": 0.16771338852412218, + "grad_norm": 1.6960088014602661, + "learning_rate": 4.660964831351078e-05, + "loss": 4.8171, + "step": 28200 + }, + { + "epoch": 0.16771933580740317, + "grad_norm": 1.8078324794769287, + "learning_rate": 4.660941343906913e-05, + "loss": 4.4722, + "step": 28201 + }, + { + "epoch": 0.16772528309068419, + "grad_norm": 1.6765756607055664, + "learning_rate": 4.660917855708386e-05, + "loss": 4.3086, + "step": 28202 + }, + { + "epoch": 0.16773123037396517, + "grad_norm": 1.608927845954895, + "learning_rate": 4.660894366755505e-05, + "loss": 4.5967, + "step": 28203 + }, + { + "epoch": 0.16773717765724616, + "grad_norm": 2.0235023498535156, + "learning_rate": 4.660870877048278e-05, + "loss": 4.4936, + "step": 28204 + }, + { + "epoch": 0.16774312494052718, + "grad_norm": 1.6895809173583984, + "learning_rate": 4.660847386586713e-05, + "loss": 4.9949, + "step": 28205 + }, + { + "epoch": 0.16774907222380817, + "grad_norm": 1.6481704711914062, + "learning_rate": 4.660823895370819e-05, + "loss": 5.2061, + "step": 28206 + }, + { + "epoch": 0.16775501950708915, + "grad_norm": 1.5078449249267578, + "learning_rate": 4.660800403400604e-05, + "loss": 5.0231, + "step": 28207 + }, + { + "epoch": 0.16776096679037017, + "grad_norm": 1.6977524757385254, + "learning_rate": 4.660776910676076e-05, + "loss": 4.9922, + "step": 28208 + }, + { + "epoch": 0.16776691407365116, + "grad_norm": 1.826011300086975, + "learning_rate": 4.6607534171972425e-05, + "loss": 4.2673, + "step": 28209 + }, + { + "epoch": 0.16777286135693215, + "grad_norm": 2.544302463531494, + "learning_rate": 4.660729922964112e-05, + "loss": 4.3124, + "step": 28210 + }, + { + "epoch": 0.16777880864021316, + "grad_norm": 1.7719815969467163, + "learning_rate": 4.660706427976693e-05, + "loss": 4.249, + "step": 28211 + }, + { + "epoch": 0.16778475592349415, + "grad_norm": 1.6741911172866821, + "learning_rate": 4.660682932234994e-05, + "loss": 4.3522, + "step": 28212 + }, + { + "epoch": 0.16779070320677514, + "grad_norm": 1.6827515363693237, + "learning_rate": 4.660659435739023e-05, + "loss": 4.3316, + "step": 28213 + }, + { + "epoch": 0.16779665049005615, + "grad_norm": 1.722598671913147, + "learning_rate": 4.6606359384887884e-05, + "loss": 4.3367, + "step": 28214 + }, + { + "epoch": 0.16780259777333714, + "grad_norm": 1.7667568922042847, + "learning_rate": 4.660612440484298e-05, + "loss": 4.2754, + "step": 28215 + }, + { + "epoch": 0.16780854505661813, + "grad_norm": 1.7074247598648071, + "learning_rate": 4.6605889417255596e-05, + "loss": 4.2489, + "step": 28216 + }, + { + "epoch": 0.16781449233989915, + "grad_norm": 1.8784146308898926, + "learning_rate": 4.6605654422125836e-05, + "loss": 4.4672, + "step": 28217 + }, + { + "epoch": 0.16782043962318013, + "grad_norm": 1.909641981124878, + "learning_rate": 4.660541941945374e-05, + "loss": 4.5413, + "step": 28218 + }, + { + "epoch": 0.16782638690646112, + "grad_norm": 1.4848551750183105, + "learning_rate": 4.660518440923943e-05, + "loss": 4.6922, + "step": 28219 + }, + { + "epoch": 0.16783233418974214, + "grad_norm": 1.5976632833480835, + "learning_rate": 4.6604949391482974e-05, + "loss": 4.7525, + "step": 28220 + }, + { + "epoch": 0.16783828147302313, + "grad_norm": 1.609236478805542, + "learning_rate": 4.6604714366184455e-05, + "loss": 5.1537, + "step": 28221 + }, + { + "epoch": 0.16784422875630411, + "grad_norm": 1.4178111553192139, + "learning_rate": 4.660447933334394e-05, + "loss": 5.0935, + "step": 28222 + }, + { + "epoch": 0.16785017603958513, + "grad_norm": 1.7521015405654907, + "learning_rate": 4.660424429296154e-05, + "loss": 4.6712, + "step": 28223 + }, + { + "epoch": 0.16785612332286612, + "grad_norm": 1.8282933235168457, + "learning_rate": 4.660400924503731e-05, + "loss": 5.8207, + "step": 28224 + }, + { + "epoch": 0.1678620706061471, + "grad_norm": 1.5437854528427124, + "learning_rate": 4.6603774189571345e-05, + "loss": 5.751, + "step": 28225 + }, + { + "epoch": 0.16786801788942812, + "grad_norm": 1.723281979560852, + "learning_rate": 4.660353912656373e-05, + "loss": 4.6481, + "step": 28226 + }, + { + "epoch": 0.1678739651727091, + "grad_norm": 1.718805193901062, + "learning_rate": 4.6603304056014545e-05, + "loss": 5.4971, + "step": 28227 + }, + { + "epoch": 0.1678799124559901, + "grad_norm": 1.6174219846725464, + "learning_rate": 4.660306897792387e-05, + "loss": 4.475, + "step": 28228 + }, + { + "epoch": 0.1678858597392711, + "grad_norm": 1.8539583683013916, + "learning_rate": 4.660283389229178e-05, + "loss": 4.3182, + "step": 28229 + }, + { + "epoch": 0.1678918070225521, + "grad_norm": 1.6682637929916382, + "learning_rate": 4.660259879911837e-05, + "loss": 4.5625, + "step": 28230 + }, + { + "epoch": 0.1678977543058331, + "grad_norm": 1.825737714767456, + "learning_rate": 4.660236369840371e-05, + "loss": 4.1975, + "step": 28231 + }, + { + "epoch": 0.16790370158911408, + "grad_norm": 1.6130248308181763, + "learning_rate": 4.6602128590147894e-05, + "loss": 5.6634, + "step": 28232 + }, + { + "epoch": 0.1679096488723951, + "grad_norm": 1.6243139505386353, + "learning_rate": 4.660189347435099e-05, + "loss": 4.972, + "step": 28233 + }, + { + "epoch": 0.16791559615567608, + "grad_norm": 1.5760700702667236, + "learning_rate": 4.66016583510131e-05, + "loss": 4.7272, + "step": 28234 + }, + { + "epoch": 0.16792154343895707, + "grad_norm": 1.2500736713409424, + "learning_rate": 4.660142322013429e-05, + "loss": 4.469, + "step": 28235 + }, + { + "epoch": 0.1679274907222381, + "grad_norm": 1.3888235092163086, + "learning_rate": 4.660118808171464e-05, + "loss": 5.3952, + "step": 28236 + }, + { + "epoch": 0.16793343800551908, + "grad_norm": 1.3789753913879395, + "learning_rate": 4.660095293575424e-05, + "loss": 5.8424, + "step": 28237 + }, + { + "epoch": 0.16793938528880006, + "grad_norm": 1.1890273094177246, + "learning_rate": 4.660071778225317e-05, + "loss": 5.9341, + "step": 28238 + }, + { + "epoch": 0.16794533257208108, + "grad_norm": 1.3315849304199219, + "learning_rate": 4.660048262121152e-05, + "loss": 5.9202, + "step": 28239 + }, + { + "epoch": 0.16795127985536207, + "grad_norm": 1.5866754055023193, + "learning_rate": 4.6600247452629365e-05, + "loss": 5.1867, + "step": 28240 + }, + { + "epoch": 0.16795722713864306, + "grad_norm": 1.842445969581604, + "learning_rate": 4.660001227650678e-05, + "loss": 4.4602, + "step": 28241 + }, + { + "epoch": 0.16796317442192407, + "grad_norm": 1.7466117143630981, + "learning_rate": 4.6599777092843855e-05, + "loss": 4.696, + "step": 28242 + }, + { + "epoch": 0.16796912170520506, + "grad_norm": 1.5599199533462524, + "learning_rate": 4.6599541901640665e-05, + "loss": 4.5027, + "step": 28243 + }, + { + "epoch": 0.16797506898848605, + "grad_norm": 1.3156886100769043, + "learning_rate": 4.6599306702897304e-05, + "loss": 4.2991, + "step": 28244 + }, + { + "epoch": 0.16798101627176706, + "grad_norm": 1.372679352760315, + "learning_rate": 4.659907149661386e-05, + "loss": 4.6257, + "step": 28245 + }, + { + "epoch": 0.16798696355504805, + "grad_norm": 1.599493384361267, + "learning_rate": 4.659883628279039e-05, + "loss": 4.4781, + "step": 28246 + }, + { + "epoch": 0.16799291083832904, + "grad_norm": 1.516619324684143, + "learning_rate": 4.6598601061426986e-05, + "loss": 4.4817, + "step": 28247 + }, + { + "epoch": 0.16799885812161006, + "grad_norm": 1.6319454908370972, + "learning_rate": 4.6598365832523736e-05, + "loss": 4.4314, + "step": 28248 + }, + { + "epoch": 0.16800480540489104, + "grad_norm": 1.5013442039489746, + "learning_rate": 4.6598130596080726e-05, + "loss": 4.3608, + "step": 28249 + }, + { + "epoch": 0.16801075268817203, + "grad_norm": 1.5573625564575195, + "learning_rate": 4.659789535209803e-05, + "loss": 4.38, + "step": 28250 + }, + { + "epoch": 0.16801669997145305, + "grad_norm": 1.5244330167770386, + "learning_rate": 4.659766010057574e-05, + "loss": 4.4152, + "step": 28251 + }, + { + "epoch": 0.16802264725473404, + "grad_norm": 2.792175054550171, + "learning_rate": 4.659742484151391e-05, + "loss": 3.7226, + "step": 28252 + }, + { + "epoch": 0.16802859453801502, + "grad_norm": 2.0370240211486816, + "learning_rate": 4.6597189574912654e-05, + "loss": 4.1552, + "step": 28253 + }, + { + "epoch": 0.16803454182129604, + "grad_norm": 1.6263444423675537, + "learning_rate": 4.6596954300772044e-05, + "loss": 4.7215, + "step": 28254 + }, + { + "epoch": 0.16804048910457703, + "grad_norm": 1.6130170822143555, + "learning_rate": 4.659671901909215e-05, + "loss": 4.5078, + "step": 28255 + }, + { + "epoch": 0.16804643638785802, + "grad_norm": 1.3925176858901978, + "learning_rate": 4.659648372987308e-05, + "loss": 4.6085, + "step": 28256 + }, + { + "epoch": 0.16805238367113903, + "grad_norm": 1.4680298566818237, + "learning_rate": 4.6596248433114886e-05, + "loss": 4.4605, + "step": 28257 + }, + { + "epoch": 0.16805833095442002, + "grad_norm": 1.9639580249786377, + "learning_rate": 4.659601312881767e-05, + "loss": 4.1688, + "step": 28258 + }, + { + "epoch": 0.168064278237701, + "grad_norm": 1.7880107164382935, + "learning_rate": 4.6595777816981515e-05, + "loss": 4.3835, + "step": 28259 + }, + { + "epoch": 0.16807022552098203, + "grad_norm": 1.8420106172561646, + "learning_rate": 4.659554249760649e-05, + "loss": 4.4068, + "step": 28260 + }, + { + "epoch": 0.168076172804263, + "grad_norm": 1.7331891059875488, + "learning_rate": 4.659530717069269e-05, + "loss": 4.2069, + "step": 28261 + }, + { + "epoch": 0.168082120087544, + "grad_norm": 1.6757560968399048, + "learning_rate": 4.659507183624019e-05, + "loss": 4.7915, + "step": 28262 + }, + { + "epoch": 0.16808806737082502, + "grad_norm": 1.6277943849563599, + "learning_rate": 4.6594836494249066e-05, + "loss": 4.431, + "step": 28263 + }, + { + "epoch": 0.168094014654106, + "grad_norm": 1.9865028858184814, + "learning_rate": 4.6594601144719406e-05, + "loss": 4.8244, + "step": 28264 + }, + { + "epoch": 0.168099961937387, + "grad_norm": 1.818390130996704, + "learning_rate": 4.659436578765131e-05, + "loss": 4.7089, + "step": 28265 + }, + { + "epoch": 0.168105909220668, + "grad_norm": 1.3201099634170532, + "learning_rate": 4.6594130423044836e-05, + "loss": 4.8117, + "step": 28266 + }, + { + "epoch": 0.168111856503949, + "grad_norm": 1.7755099534988403, + "learning_rate": 4.6593895050900074e-05, + "loss": 4.4389, + "step": 28267 + }, + { + "epoch": 0.16811780378722999, + "grad_norm": 1.6653193235397339, + "learning_rate": 4.65936596712171e-05, + "loss": 4.3489, + "step": 28268 + }, + { + "epoch": 0.168123751070511, + "grad_norm": 1.4699918031692505, + "learning_rate": 4.6593424283996004e-05, + "loss": 4.935, + "step": 28269 + }, + { + "epoch": 0.168129698353792, + "grad_norm": 1.8290356397628784, + "learning_rate": 4.659318888923687e-05, + "loss": 5.1348, + "step": 28270 + }, + { + "epoch": 0.16813564563707298, + "grad_norm": 1.7782410383224487, + "learning_rate": 4.6592953486939784e-05, + "loss": 5.1601, + "step": 28271 + }, + { + "epoch": 0.168141592920354, + "grad_norm": 1.8384326696395874, + "learning_rate": 4.6592718077104814e-05, + "loss": 4.7923, + "step": 28272 + }, + { + "epoch": 0.16814754020363498, + "grad_norm": 1.6723445653915405, + "learning_rate": 4.659248265973205e-05, + "loss": 4.9497, + "step": 28273 + }, + { + "epoch": 0.16815348748691597, + "grad_norm": 1.4820493459701538, + "learning_rate": 4.6592247234821575e-05, + "loss": 4.3104, + "step": 28274 + }, + { + "epoch": 0.16815943477019699, + "grad_norm": 1.4215086698532104, + "learning_rate": 4.659201180237346e-05, + "loss": 4.5723, + "step": 28275 + }, + { + "epoch": 0.16816538205347797, + "grad_norm": 1.6446219682693481, + "learning_rate": 4.6591776362387804e-05, + "loss": 4.6208, + "step": 28276 + }, + { + "epoch": 0.16817132933675896, + "grad_norm": 1.6352293491363525, + "learning_rate": 4.6591540914864686e-05, + "loss": 5.03, + "step": 28277 + }, + { + "epoch": 0.16817727662003998, + "grad_norm": 1.59463369846344, + "learning_rate": 4.659130545980418e-05, + "loss": 4.5116, + "step": 28278 + }, + { + "epoch": 0.16818322390332097, + "grad_norm": 1.8565449714660645, + "learning_rate": 4.659106999720637e-05, + "loss": 4.4572, + "step": 28279 + }, + { + "epoch": 0.16818917118660195, + "grad_norm": 1.7354021072387695, + "learning_rate": 4.659083452707135e-05, + "loss": 4.9343, + "step": 28280 + }, + { + "epoch": 0.16819511846988297, + "grad_norm": 1.8169907331466675, + "learning_rate": 4.659059904939918e-05, + "loss": 4.6285, + "step": 28281 + }, + { + "epoch": 0.16820106575316396, + "grad_norm": 1.6343300342559814, + "learning_rate": 4.659036356418996e-05, + "loss": 4.6125, + "step": 28282 + }, + { + "epoch": 0.16820701303644495, + "grad_norm": 1.5487629175186157, + "learning_rate": 4.659012807144377e-05, + "loss": 4.5907, + "step": 28283 + }, + { + "epoch": 0.16821296031972596, + "grad_norm": 1.4640655517578125, + "learning_rate": 4.658989257116069e-05, + "loss": 4.4199, + "step": 28284 + }, + { + "epoch": 0.16821890760300695, + "grad_norm": 1.4370266199111938, + "learning_rate": 4.65896570633408e-05, + "loss": 4.5677, + "step": 28285 + }, + { + "epoch": 0.16822485488628794, + "grad_norm": 1.6564301252365112, + "learning_rate": 4.658942154798418e-05, + "loss": 4.5189, + "step": 28286 + }, + { + "epoch": 0.16823080216956893, + "grad_norm": 1.6301320791244507, + "learning_rate": 4.658918602509091e-05, + "loss": 4.9653, + "step": 28287 + }, + { + "epoch": 0.16823674945284994, + "grad_norm": 1.5462539196014404, + "learning_rate": 4.6588950494661096e-05, + "loss": 5.011, + "step": 28288 + }, + { + "epoch": 0.16824269673613093, + "grad_norm": 1.7004579305648804, + "learning_rate": 4.658871495669479e-05, + "loss": 4.7863, + "step": 28289 + }, + { + "epoch": 0.16824864401941192, + "grad_norm": 1.47449791431427, + "learning_rate": 4.658847941119209e-05, + "loss": 4.8344, + "step": 28290 + }, + { + "epoch": 0.16825459130269294, + "grad_norm": 1.7310223579406738, + "learning_rate": 4.658824385815308e-05, + "loss": 4.5996, + "step": 28291 + }, + { + "epoch": 0.16826053858597392, + "grad_norm": 1.5716323852539062, + "learning_rate": 4.658800829757782e-05, + "loss": 4.6623, + "step": 28292 + }, + { + "epoch": 0.1682664858692549, + "grad_norm": 1.8458023071289062, + "learning_rate": 4.6587772729466426e-05, + "loss": 4.8966, + "step": 28293 + }, + { + "epoch": 0.16827243315253593, + "grad_norm": 1.4939119815826416, + "learning_rate": 4.658753715381896e-05, + "loss": 4.9607, + "step": 28294 + }, + { + "epoch": 0.16827838043581692, + "grad_norm": 1.6060224771499634, + "learning_rate": 4.658730157063551e-05, + "loss": 4.9144, + "step": 28295 + }, + { + "epoch": 0.1682843277190979, + "grad_norm": 1.6743205785751343, + "learning_rate": 4.658706597991615e-05, + "loss": 5.1634, + "step": 28296 + }, + { + "epoch": 0.16829027500237892, + "grad_norm": 1.6277934312820435, + "learning_rate": 4.658683038166097e-05, + "loss": 4.5367, + "step": 28297 + }, + { + "epoch": 0.1682962222856599, + "grad_norm": 2.8272674083709717, + "learning_rate": 4.658659477587005e-05, + "loss": 4.5467, + "step": 28298 + }, + { + "epoch": 0.1683021695689409, + "grad_norm": 2.199181318283081, + "learning_rate": 4.658635916254348e-05, + "loss": 4.595, + "step": 28299 + }, + { + "epoch": 0.1683081168522219, + "grad_norm": 1.860811710357666, + "learning_rate": 4.6586123541681324e-05, + "loss": 4.6934, + "step": 28300 + }, + { + "epoch": 0.1683140641355029, + "grad_norm": 1.5959035158157349, + "learning_rate": 4.6585887913283685e-05, + "loss": 4.5346, + "step": 28301 + }, + { + "epoch": 0.1683200114187839, + "grad_norm": 1.503235936164856, + "learning_rate": 4.658565227735063e-05, + "loss": 4.7135, + "step": 28302 + }, + { + "epoch": 0.1683259587020649, + "grad_norm": 1.5272914171218872, + "learning_rate": 4.658541663388225e-05, + "loss": 4.507, + "step": 28303 + }, + { + "epoch": 0.1683319059853459, + "grad_norm": 1.7282012701034546, + "learning_rate": 4.6585180982878615e-05, + "loss": 4.4787, + "step": 28304 + }, + { + "epoch": 0.16833785326862688, + "grad_norm": 1.6522059440612793, + "learning_rate": 4.6584945324339823e-05, + "loss": 4.5825, + "step": 28305 + }, + { + "epoch": 0.1683438005519079, + "grad_norm": 1.3752492666244507, + "learning_rate": 4.6584709658265955e-05, + "loss": 4.7064, + "step": 28306 + }, + { + "epoch": 0.16834974783518888, + "grad_norm": 2.415187358856201, + "learning_rate": 4.6584473984657086e-05, + "loss": 4.1959, + "step": 28307 + }, + { + "epoch": 0.16835569511846987, + "grad_norm": 1.545029640197754, + "learning_rate": 4.6584238303513295e-05, + "loss": 4.426, + "step": 28308 + }, + { + "epoch": 0.1683616424017509, + "grad_norm": 1.6749895811080933, + "learning_rate": 4.6584002614834666e-05, + "loss": 5.19, + "step": 28309 + }, + { + "epoch": 0.16836758968503188, + "grad_norm": 1.5567103624343872, + "learning_rate": 4.65837669186213e-05, + "loss": 4.854, + "step": 28310 + }, + { + "epoch": 0.16837353696831286, + "grad_norm": 1.2138694524765015, + "learning_rate": 4.658353121487324e-05, + "loss": 4.6035, + "step": 28311 + }, + { + "epoch": 0.16837948425159388, + "grad_norm": 1.4592459201812744, + "learning_rate": 4.658329550359061e-05, + "loss": 4.6315, + "step": 28312 + }, + { + "epoch": 0.16838543153487487, + "grad_norm": 1.5305829048156738, + "learning_rate": 4.658305978477348e-05, + "loss": 4.9041, + "step": 28313 + }, + { + "epoch": 0.16839137881815586, + "grad_norm": 2.0584359169006348, + "learning_rate": 4.658282405842191e-05, + "loss": 3.7849, + "step": 28314 + }, + { + "epoch": 0.16839732610143687, + "grad_norm": 3.1896352767944336, + "learning_rate": 4.658258832453601e-05, + "loss": 3.9083, + "step": 28315 + }, + { + "epoch": 0.16840327338471786, + "grad_norm": 2.942909002304077, + "learning_rate": 4.658235258311584e-05, + "loss": 3.6764, + "step": 28316 + }, + { + "epoch": 0.16840922066799885, + "grad_norm": 3.2764618396759033, + "learning_rate": 4.65821168341615e-05, + "loss": 3.8794, + "step": 28317 + }, + { + "epoch": 0.16841516795127986, + "grad_norm": 2.8366522789001465, + "learning_rate": 4.6581881077673074e-05, + "loss": 4.8133, + "step": 28318 + }, + { + "epoch": 0.16842111523456085, + "grad_norm": 1.551155686378479, + "learning_rate": 4.658164531365063e-05, + "loss": 4.7024, + "step": 28319 + }, + { + "epoch": 0.16842706251784184, + "grad_norm": 2.4063937664031982, + "learning_rate": 4.6581409542094255e-05, + "loss": 3.2516, + "step": 28320 + }, + { + "epoch": 0.16843300980112286, + "grad_norm": 2.5758605003356934, + "learning_rate": 4.658117376300404e-05, + "loss": 3.5301, + "step": 28321 + }, + { + "epoch": 0.16843895708440385, + "grad_norm": 2.643880605697632, + "learning_rate": 4.658093797638005e-05, + "loss": 3.2137, + "step": 28322 + }, + { + "epoch": 0.16844490436768483, + "grad_norm": 2.6048755645751953, + "learning_rate": 4.658070218222238e-05, + "loss": 3.3595, + "step": 28323 + }, + { + "epoch": 0.16845085165096585, + "grad_norm": 2.677281141281128, + "learning_rate": 4.6580466380531116e-05, + "loss": 4.0526, + "step": 28324 + }, + { + "epoch": 0.16845679893424684, + "grad_norm": 2.1559438705444336, + "learning_rate": 4.658023057130633e-05, + "loss": 3.6773, + "step": 28325 + }, + { + "epoch": 0.16846274621752783, + "grad_norm": 2.271451711654663, + "learning_rate": 4.6579994754548105e-05, + "loss": 3.3233, + "step": 28326 + }, + { + "epoch": 0.16846869350080884, + "grad_norm": 2.6819088459014893, + "learning_rate": 4.657975893025653e-05, + "loss": 3.0184, + "step": 28327 + }, + { + "epoch": 0.16847464078408983, + "grad_norm": 2.7791247367858887, + "learning_rate": 4.6579523098431686e-05, + "loss": 3.4093, + "step": 28328 + }, + { + "epoch": 0.16848058806737082, + "grad_norm": 2.7528347969055176, + "learning_rate": 4.6579287259073654e-05, + "loss": 3.0479, + "step": 28329 + }, + { + "epoch": 0.16848653535065183, + "grad_norm": 2.3715124130249023, + "learning_rate": 4.657905141218252e-05, + "loss": 3.7365, + "step": 28330 + }, + { + "epoch": 0.16849248263393282, + "grad_norm": 1.9896430969238281, + "learning_rate": 4.657881555775835e-05, + "loss": 4.6336, + "step": 28331 + }, + { + "epoch": 0.1684984299172138, + "grad_norm": 1.6838959455490112, + "learning_rate": 4.657857969580124e-05, + "loss": 4.8033, + "step": 28332 + }, + { + "epoch": 0.16850437720049483, + "grad_norm": 1.7189829349517822, + "learning_rate": 4.6578343826311274e-05, + "loss": 4.721, + "step": 28333 + }, + { + "epoch": 0.16851032448377581, + "grad_norm": 2.3129501342773438, + "learning_rate": 4.657810794928854e-05, + "loss": 3.626, + "step": 28334 + }, + { + "epoch": 0.1685162717670568, + "grad_norm": 3.216485023498535, + "learning_rate": 4.6577872064733094e-05, + "loss": 3.2259, + "step": 28335 + }, + { + "epoch": 0.16852221905033782, + "grad_norm": 2.995213031768799, + "learning_rate": 4.657763617264506e-05, + "loss": 3.2364, + "step": 28336 + }, + { + "epoch": 0.1685281663336188, + "grad_norm": 2.6219449043273926, + "learning_rate": 4.6577400273024474e-05, + "loss": 4.2354, + "step": 28337 + }, + { + "epoch": 0.1685341136168998, + "grad_norm": 1.6310757398605347, + "learning_rate": 4.657716436587145e-05, + "loss": 5.3334, + "step": 28338 + }, + { + "epoch": 0.1685400609001808, + "grad_norm": 2.375399589538574, + "learning_rate": 4.657692845118605e-05, + "loss": 4.4366, + "step": 28339 + }, + { + "epoch": 0.1685460081834618, + "grad_norm": 1.874076247215271, + "learning_rate": 4.657669252896838e-05, + "loss": 5.2293, + "step": 28340 + }, + { + "epoch": 0.1685519554667428, + "grad_norm": 1.8757516145706177, + "learning_rate": 4.657645659921851e-05, + "loss": 4.6433, + "step": 28341 + }, + { + "epoch": 0.1685579027500238, + "grad_norm": 1.6679904460906982, + "learning_rate": 4.6576220661936514e-05, + "loss": 4.591, + "step": 28342 + }, + { + "epoch": 0.1685638500333048, + "grad_norm": 1.5081669092178345, + "learning_rate": 4.6575984717122487e-05, + "loss": 4.9147, + "step": 28343 + }, + { + "epoch": 0.16856979731658578, + "grad_norm": 1.4801992177963257, + "learning_rate": 4.657574876477651e-05, + "loss": 5.3181, + "step": 28344 + }, + { + "epoch": 0.1685757445998668, + "grad_norm": 1.5100293159484863, + "learning_rate": 4.657551280489865e-05, + "loss": 4.6282, + "step": 28345 + }, + { + "epoch": 0.16858169188314778, + "grad_norm": 1.5850365161895752, + "learning_rate": 4.6575276837489016e-05, + "loss": 4.566, + "step": 28346 + }, + { + "epoch": 0.16858763916642877, + "grad_norm": 1.9910119771957397, + "learning_rate": 4.657504086254766e-05, + "loss": 5.1222, + "step": 28347 + }, + { + "epoch": 0.16859358644970976, + "grad_norm": 1.8456346988677979, + "learning_rate": 4.65748048800747e-05, + "loss": 4.7977, + "step": 28348 + }, + { + "epoch": 0.16859953373299078, + "grad_norm": 2.4570720195770264, + "learning_rate": 4.657456889007018e-05, + "loss": 4.6518, + "step": 28349 + }, + { + "epoch": 0.16860548101627176, + "grad_norm": 2.76509952545166, + "learning_rate": 4.657433289253421e-05, + "loss": 4.2894, + "step": 28350 + }, + { + "epoch": 0.16861142829955275, + "grad_norm": 2.61690616607666, + "learning_rate": 4.657409688746686e-05, + "loss": 4.1016, + "step": 28351 + }, + { + "epoch": 0.16861737558283377, + "grad_norm": 2.678689479827881, + "learning_rate": 4.6573860874868214e-05, + "loss": 4.4325, + "step": 28352 + }, + { + "epoch": 0.16862332286611476, + "grad_norm": 2.1475918292999268, + "learning_rate": 4.657362485473836e-05, + "loss": 4.8043, + "step": 28353 + }, + { + "epoch": 0.16862927014939574, + "grad_norm": 1.7649880647659302, + "learning_rate": 4.657338882707738e-05, + "loss": 5.5315, + "step": 28354 + }, + { + "epoch": 0.16863521743267676, + "grad_norm": 2.451415538787842, + "learning_rate": 4.657315279188534e-05, + "loss": 4.4149, + "step": 28355 + }, + { + "epoch": 0.16864116471595775, + "grad_norm": 2.628056764602661, + "learning_rate": 4.657291674916234e-05, + "loss": 3.9996, + "step": 28356 + }, + { + "epoch": 0.16864711199923874, + "grad_norm": 2.5917954444885254, + "learning_rate": 4.657268069890847e-05, + "loss": 4.1523, + "step": 28357 + }, + { + "epoch": 0.16865305928251975, + "grad_norm": 2.5339810848236084, + "learning_rate": 4.657244464112379e-05, + "loss": 4.1835, + "step": 28358 + }, + { + "epoch": 0.16865900656580074, + "grad_norm": 2.5512847900390625, + "learning_rate": 4.657220857580839e-05, + "loss": 4.2205, + "step": 28359 + }, + { + "epoch": 0.16866495384908173, + "grad_norm": 1.9828633069992065, + "learning_rate": 4.657197250296236e-05, + "loss": 4.5812, + "step": 28360 + }, + { + "epoch": 0.16867090113236274, + "grad_norm": 1.9058914184570312, + "learning_rate": 4.657173642258578e-05, + "loss": 4.9579, + "step": 28361 + }, + { + "epoch": 0.16867684841564373, + "grad_norm": 2.473252534866333, + "learning_rate": 4.657150033467872e-05, + "loss": 4.2123, + "step": 28362 + }, + { + "epoch": 0.16868279569892472, + "grad_norm": 2.2516047954559326, + "learning_rate": 4.657126423924128e-05, + "loss": 4.2096, + "step": 28363 + }, + { + "epoch": 0.16868874298220574, + "grad_norm": 2.4706156253814697, + "learning_rate": 4.657102813627353e-05, + "loss": 4.0615, + "step": 28364 + }, + { + "epoch": 0.16869469026548672, + "grad_norm": 2.5827410221099854, + "learning_rate": 4.657079202577556e-05, + "loss": 4.4003, + "step": 28365 + }, + { + "epoch": 0.1687006375487677, + "grad_norm": 1.812254548072815, + "learning_rate": 4.657055590774745e-05, + "loss": 4.7705, + "step": 28366 + }, + { + "epoch": 0.16870658483204873, + "grad_norm": 1.5623784065246582, + "learning_rate": 4.6570319782189284e-05, + "loss": 5.3618, + "step": 28367 + }, + { + "epoch": 0.16871253211532972, + "grad_norm": 1.9756156206130981, + "learning_rate": 4.657008364910114e-05, + "loss": 5.0061, + "step": 28368 + }, + { + "epoch": 0.1687184793986107, + "grad_norm": 2.592015027999878, + "learning_rate": 4.65698475084831e-05, + "loss": 4.771, + "step": 28369 + }, + { + "epoch": 0.16872442668189172, + "grad_norm": 1.7394741773605347, + "learning_rate": 4.656961136033525e-05, + "loss": 5.4057, + "step": 28370 + }, + { + "epoch": 0.1687303739651727, + "grad_norm": 1.712748646736145, + "learning_rate": 4.656937520465767e-05, + "loss": 5.242, + "step": 28371 + }, + { + "epoch": 0.1687363212484537, + "grad_norm": 1.794945240020752, + "learning_rate": 4.6569139041450446e-05, + "loss": 5.1821, + "step": 28372 + }, + { + "epoch": 0.1687422685317347, + "grad_norm": 1.6122878789901733, + "learning_rate": 4.656890287071366e-05, + "loss": 5.3729, + "step": 28373 + }, + { + "epoch": 0.1687482158150157, + "grad_norm": 1.6189091205596924, + "learning_rate": 4.656866669244739e-05, + "loss": 5.5319, + "step": 28374 + }, + { + "epoch": 0.1687541630982967, + "grad_norm": 1.4604097604751587, + "learning_rate": 4.6568430506651715e-05, + "loss": 5.7885, + "step": 28375 + }, + { + "epoch": 0.1687601103815777, + "grad_norm": 1.4060790538787842, + "learning_rate": 4.656819431332673e-05, + "loss": 5.8022, + "step": 28376 + }, + { + "epoch": 0.1687660576648587, + "grad_norm": 1.4350751638412476, + "learning_rate": 4.6567958112472515e-05, + "loss": 5.8437, + "step": 28377 + }, + { + "epoch": 0.16877200494813968, + "grad_norm": 1.572094202041626, + "learning_rate": 4.656772190408914e-05, + "loss": 5.2559, + "step": 28378 + }, + { + "epoch": 0.1687779522314207, + "grad_norm": 1.5529630184173584, + "learning_rate": 4.656748568817671e-05, + "loss": 5.325, + "step": 28379 + }, + { + "epoch": 0.16878389951470169, + "grad_norm": 1.5496705770492554, + "learning_rate": 4.656724946473528e-05, + "loss": 5.2824, + "step": 28380 + }, + { + "epoch": 0.16878984679798267, + "grad_norm": 1.4349329471588135, + "learning_rate": 4.656701323376496e-05, + "loss": 5.3192, + "step": 28381 + }, + { + "epoch": 0.1687957940812637, + "grad_norm": 1.391747236251831, + "learning_rate": 4.6566776995265804e-05, + "loss": 5.2476, + "step": 28382 + }, + { + "epoch": 0.16880174136454468, + "grad_norm": 1.3532518148422241, + "learning_rate": 4.6566540749237916e-05, + "loss": 5.1795, + "step": 28383 + }, + { + "epoch": 0.16880768864782567, + "grad_norm": 1.4906384944915771, + "learning_rate": 4.656630449568137e-05, + "loss": 5.3211, + "step": 28384 + }, + { + "epoch": 0.16881363593110668, + "grad_norm": 1.560478687286377, + "learning_rate": 4.656606823459625e-05, + "loss": 5.2823, + "step": 28385 + }, + { + "epoch": 0.16881958321438767, + "grad_norm": 1.6834107637405396, + "learning_rate": 4.656583196598264e-05, + "loss": 5.206, + "step": 28386 + }, + { + "epoch": 0.16882553049766866, + "grad_norm": 1.4601906538009644, + "learning_rate": 4.656559568984062e-05, + "loss": 5.2269, + "step": 28387 + }, + { + "epoch": 0.16883147778094967, + "grad_norm": 1.7208976745605469, + "learning_rate": 4.656535940617027e-05, + "loss": 5.3731, + "step": 28388 + }, + { + "epoch": 0.16883742506423066, + "grad_norm": 1.6507620811462402, + "learning_rate": 4.656512311497168e-05, + "loss": 5.544, + "step": 28389 + }, + { + "epoch": 0.16884337234751165, + "grad_norm": 1.7269225120544434, + "learning_rate": 4.6564886816244926e-05, + "loss": 5.5757, + "step": 28390 + }, + { + "epoch": 0.16884931963079267, + "grad_norm": 1.8436660766601562, + "learning_rate": 4.6564650509990096e-05, + "loss": 5.2549, + "step": 28391 + }, + { + "epoch": 0.16885526691407365, + "grad_norm": 2.2432281970977783, + "learning_rate": 4.656441419620727e-05, + "loss": 4.788, + "step": 28392 + }, + { + "epoch": 0.16886121419735464, + "grad_norm": 1.6931114196777344, + "learning_rate": 4.656417787489652e-05, + "loss": 4.9039, + "step": 28393 + }, + { + "epoch": 0.16886716148063566, + "grad_norm": 1.6208950281143188, + "learning_rate": 4.656394154605795e-05, + "loss": 5.2821, + "step": 28394 + }, + { + "epoch": 0.16887310876391665, + "grad_norm": 2.725078821182251, + "learning_rate": 4.656370520969162e-05, + "loss": 4.3892, + "step": 28395 + }, + { + "epoch": 0.16887905604719763, + "grad_norm": 3.6109495162963867, + "learning_rate": 4.6563468865797636e-05, + "loss": 4.1935, + "step": 28396 + }, + { + "epoch": 0.16888500333047865, + "grad_norm": 1.9827744960784912, + "learning_rate": 4.656323251437606e-05, + "loss": 5.1187, + "step": 28397 + }, + { + "epoch": 0.16889095061375964, + "grad_norm": 1.8615485429763794, + "learning_rate": 4.6562996155426985e-05, + "loss": 5.6777, + "step": 28398 + }, + { + "epoch": 0.16889689789704063, + "grad_norm": 1.7114287614822388, + "learning_rate": 4.6562759788950484e-05, + "loss": 5.5126, + "step": 28399 + }, + { + "epoch": 0.16890284518032164, + "grad_norm": 1.672108769416809, + "learning_rate": 4.656252341494666e-05, + "loss": 5.2453, + "step": 28400 + }, + { + "epoch": 0.16890879246360263, + "grad_norm": 1.7363505363464355, + "learning_rate": 4.656228703341556e-05, + "loss": 5.1452, + "step": 28401 + }, + { + "epoch": 0.16891473974688362, + "grad_norm": 1.6358929872512817, + "learning_rate": 4.656205064435731e-05, + "loss": 4.7812, + "step": 28402 + }, + { + "epoch": 0.16892068703016463, + "grad_norm": 1.5269345045089722, + "learning_rate": 4.656181424777196e-05, + "loss": 4.9725, + "step": 28403 + }, + { + "epoch": 0.16892663431344562, + "grad_norm": 1.8694361448287964, + "learning_rate": 4.656157784365961e-05, + "loss": 4.8145, + "step": 28404 + }, + { + "epoch": 0.1689325815967266, + "grad_norm": 1.6409978866577148, + "learning_rate": 4.6561341432020335e-05, + "loss": 4.8409, + "step": 28405 + }, + { + "epoch": 0.1689385288800076, + "grad_norm": 1.586323618888855, + "learning_rate": 4.656110501285421e-05, + "loss": 4.9883, + "step": 28406 + }, + { + "epoch": 0.16894447616328861, + "grad_norm": 1.936805009841919, + "learning_rate": 4.656086858616133e-05, + "loss": 4.8728, + "step": 28407 + }, + { + "epoch": 0.1689504234465696, + "grad_norm": 2.4873859882354736, + "learning_rate": 4.656063215194178e-05, + "loss": 4.3402, + "step": 28408 + }, + { + "epoch": 0.1689563707298506, + "grad_norm": 2.295729637145996, + "learning_rate": 4.6560395710195624e-05, + "loss": 4.2334, + "step": 28409 + }, + { + "epoch": 0.1689623180131316, + "grad_norm": 2.2564427852630615, + "learning_rate": 4.6560159260922966e-05, + "loss": 4.6056, + "step": 28410 + }, + { + "epoch": 0.1689682652964126, + "grad_norm": 1.5321199893951416, + "learning_rate": 4.655992280412388e-05, + "loss": 5.7092, + "step": 28411 + }, + { + "epoch": 0.16897421257969358, + "grad_norm": 1.4915989637374878, + "learning_rate": 4.655968633979844e-05, + "loss": 5.5028, + "step": 28412 + }, + { + "epoch": 0.1689801598629746, + "grad_norm": 1.6282528638839722, + "learning_rate": 4.655944986794675e-05, + "loss": 5.405, + "step": 28413 + }, + { + "epoch": 0.1689861071462556, + "grad_norm": 1.5174504518508911, + "learning_rate": 4.6559213388568865e-05, + "loss": 5.2818, + "step": 28414 + }, + { + "epoch": 0.16899205442953658, + "grad_norm": 1.6792948246002197, + "learning_rate": 4.6558976901664885e-05, + "loss": 5.4466, + "step": 28415 + }, + { + "epoch": 0.1689980017128176, + "grad_norm": 1.5633111000061035, + "learning_rate": 4.655874040723489e-05, + "loss": 5.3313, + "step": 28416 + }, + { + "epoch": 0.16900394899609858, + "grad_norm": 1.6550037860870361, + "learning_rate": 4.655850390527896e-05, + "loss": 5.3279, + "step": 28417 + }, + { + "epoch": 0.16900989627937957, + "grad_norm": 1.6670206785202026, + "learning_rate": 4.6558267395797186e-05, + "loss": 5.0354, + "step": 28418 + }, + { + "epoch": 0.16901584356266058, + "grad_norm": 1.577187180519104, + "learning_rate": 4.6558030878789635e-05, + "loss": 4.9382, + "step": 28419 + }, + { + "epoch": 0.16902179084594157, + "grad_norm": 1.5832712650299072, + "learning_rate": 4.65577943542564e-05, + "loss": 5.3036, + "step": 28420 + }, + { + "epoch": 0.16902773812922256, + "grad_norm": 1.4962387084960938, + "learning_rate": 4.655755782219756e-05, + "loss": 5.3586, + "step": 28421 + }, + { + "epoch": 0.16903368541250358, + "grad_norm": 1.2843531370162964, + "learning_rate": 4.655732128261321e-05, + "loss": 5.3972, + "step": 28422 + }, + { + "epoch": 0.16903963269578456, + "grad_norm": 1.1370457410812378, + "learning_rate": 4.6557084735503406e-05, + "loss": 5.2004, + "step": 28423 + }, + { + "epoch": 0.16904557997906555, + "grad_norm": 2.759056329727173, + "learning_rate": 4.655684818086825e-05, + "loss": 4.5741, + "step": 28424 + }, + { + "epoch": 0.16905152726234657, + "grad_norm": 2.7487027645111084, + "learning_rate": 4.655661161870783e-05, + "loss": 4.1308, + "step": 28425 + }, + { + "epoch": 0.16905747454562756, + "grad_norm": 2.479084014892578, + "learning_rate": 4.655637504902221e-05, + "loss": 4.2166, + "step": 28426 + }, + { + "epoch": 0.16906342182890854, + "grad_norm": 2.667968511581421, + "learning_rate": 4.65561384718115e-05, + "loss": 4.1276, + "step": 28427 + }, + { + "epoch": 0.16906936911218956, + "grad_norm": 2.6374669075012207, + "learning_rate": 4.655590188707575e-05, + "loss": 3.7747, + "step": 28428 + }, + { + "epoch": 0.16907531639547055, + "grad_norm": 2.0448408126831055, + "learning_rate": 4.655566529481505e-05, + "loss": 4.7242, + "step": 28429 + }, + { + "epoch": 0.16908126367875154, + "grad_norm": 2.416416645050049, + "learning_rate": 4.65554286950295e-05, + "loss": 4.3241, + "step": 28430 + }, + { + "epoch": 0.16908721096203255, + "grad_norm": 2.018310308456421, + "learning_rate": 4.6555192087719175e-05, + "loss": 4.2137, + "step": 28431 + }, + { + "epoch": 0.16909315824531354, + "grad_norm": 2.2149248123168945, + "learning_rate": 4.655495547288415e-05, + "loss": 4.2518, + "step": 28432 + }, + { + "epoch": 0.16909910552859453, + "grad_norm": 2.190190553665161, + "learning_rate": 4.655471885052452e-05, + "loss": 4.0488, + "step": 28433 + }, + { + "epoch": 0.16910505281187554, + "grad_norm": 2.146759033203125, + "learning_rate": 4.6554482220640347e-05, + "loss": 4.005, + "step": 28434 + }, + { + "epoch": 0.16911100009515653, + "grad_norm": 1.7445921897888184, + "learning_rate": 4.655424558323174e-05, + "loss": 4.5846, + "step": 28435 + }, + { + "epoch": 0.16911694737843752, + "grad_norm": 1.924498200416565, + "learning_rate": 4.655400893829876e-05, + "loss": 4.4729, + "step": 28436 + }, + { + "epoch": 0.16912289466171854, + "grad_norm": 2.297170877456665, + "learning_rate": 4.65537722858415e-05, + "loss": 4.0639, + "step": 28437 + }, + { + "epoch": 0.16912884194499953, + "grad_norm": 2.254561424255371, + "learning_rate": 4.6553535625860044e-05, + "loss": 3.6444, + "step": 28438 + }, + { + "epoch": 0.1691347892282805, + "grad_norm": 2.3372230529785156, + "learning_rate": 4.655329895835447e-05, + "loss": 3.9905, + "step": 28439 + }, + { + "epoch": 0.16914073651156153, + "grad_norm": 2.376207113265991, + "learning_rate": 4.655306228332486e-05, + "loss": 3.9777, + "step": 28440 + }, + { + "epoch": 0.16914668379484252, + "grad_norm": 1.6520785093307495, + "learning_rate": 4.65528256007713e-05, + "loss": 4.9314, + "step": 28441 + }, + { + "epoch": 0.1691526310781235, + "grad_norm": 1.93073308467865, + "learning_rate": 4.6552588910693876e-05, + "loss": 5.1317, + "step": 28442 + }, + { + "epoch": 0.16915857836140452, + "grad_norm": 1.5278276205062866, + "learning_rate": 4.655235221309266e-05, + "loss": 5.2949, + "step": 28443 + }, + { + "epoch": 0.1691645256446855, + "grad_norm": 1.5671179294586182, + "learning_rate": 4.6552115507967744e-05, + "loss": 4.8824, + "step": 28444 + }, + { + "epoch": 0.1691704729279665, + "grad_norm": 1.6631091833114624, + "learning_rate": 4.6551878795319204e-05, + "loss": 4.6696, + "step": 28445 + }, + { + "epoch": 0.1691764202112475, + "grad_norm": 1.9113469123840332, + "learning_rate": 4.655164207514713e-05, + "loss": 4.2842, + "step": 28446 + }, + { + "epoch": 0.1691823674945285, + "grad_norm": 1.8953512907028198, + "learning_rate": 4.655140534745159e-05, + "loss": 5.3818, + "step": 28447 + }, + { + "epoch": 0.1691883147778095, + "grad_norm": 1.7372487783432007, + "learning_rate": 4.6551168612232685e-05, + "loss": 5.2441, + "step": 28448 + }, + { + "epoch": 0.1691942620610905, + "grad_norm": 1.8049054145812988, + "learning_rate": 4.655093186949049e-05, + "loss": 5.2056, + "step": 28449 + }, + { + "epoch": 0.1692002093443715, + "grad_norm": 2.019453763961792, + "learning_rate": 4.6550695119225086e-05, + "loss": 5.4237, + "step": 28450 + }, + { + "epoch": 0.16920615662765248, + "grad_norm": 1.3187928199768066, + "learning_rate": 4.6550458361436554e-05, + "loss": 5.2069, + "step": 28451 + }, + { + "epoch": 0.1692121039109335, + "grad_norm": 2.054603099822998, + "learning_rate": 4.655022159612499e-05, + "loss": 4.4155, + "step": 28452 + }, + { + "epoch": 0.16921805119421449, + "grad_norm": 2.41377854347229, + "learning_rate": 4.6549984823290454e-05, + "loss": 3.613, + "step": 28453 + }, + { + "epoch": 0.16922399847749547, + "grad_norm": 1.9458948373794556, + "learning_rate": 4.654974804293305e-05, + "loss": 3.6051, + "step": 28454 + }, + { + "epoch": 0.1692299457607765, + "grad_norm": 1.7371017932891846, + "learning_rate": 4.6549511255052844e-05, + "loss": 5.1229, + "step": 28455 + }, + { + "epoch": 0.16923589304405748, + "grad_norm": 1.3374329805374146, + "learning_rate": 4.654927445964993e-05, + "loss": 5.7105, + "step": 28456 + }, + { + "epoch": 0.16924184032733847, + "grad_norm": 1.453912377357483, + "learning_rate": 4.654903765672439e-05, + "loss": 5.7225, + "step": 28457 + }, + { + "epoch": 0.16924778761061948, + "grad_norm": 1.984152913093567, + "learning_rate": 4.65488008462763e-05, + "loss": 4.874, + "step": 28458 + }, + { + "epoch": 0.16925373489390047, + "grad_norm": 1.618017554283142, + "learning_rate": 4.6548564028305746e-05, + "loss": 4.6159, + "step": 28459 + }, + { + "epoch": 0.16925968217718146, + "grad_norm": 2.104875087738037, + "learning_rate": 4.654832720281281e-05, + "loss": 3.9827, + "step": 28460 + }, + { + "epoch": 0.16926562946046247, + "grad_norm": 1.9092068672180176, + "learning_rate": 4.654809036979758e-05, + "loss": 3.8551, + "step": 28461 + }, + { + "epoch": 0.16927157674374346, + "grad_norm": 1.6868946552276611, + "learning_rate": 4.6547853529260135e-05, + "loss": 5.6583, + "step": 28462 + }, + { + "epoch": 0.16927752402702445, + "grad_norm": 2.0791547298431396, + "learning_rate": 4.6547616681200544e-05, + "loss": 4.7682, + "step": 28463 + }, + { + "epoch": 0.16928347131030544, + "grad_norm": 2.254826307296753, + "learning_rate": 4.654737982561892e-05, + "loss": 3.7339, + "step": 28464 + }, + { + "epoch": 0.16928941859358645, + "grad_norm": 1.6225947141647339, + "learning_rate": 4.6547142962515314e-05, + "loss": 4.8278, + "step": 28465 + }, + { + "epoch": 0.16929536587686744, + "grad_norm": 1.8425785303115845, + "learning_rate": 4.654690609188983e-05, + "loss": 4.0161, + "step": 28466 + }, + { + "epoch": 0.16930131316014843, + "grad_norm": 1.9367843866348267, + "learning_rate": 4.6546669213742545e-05, + "loss": 3.794, + "step": 28467 + }, + { + "epoch": 0.16930726044342945, + "grad_norm": 1.988096833229065, + "learning_rate": 4.654643232807354e-05, + "loss": 3.7874, + "step": 28468 + }, + { + "epoch": 0.16931320772671044, + "grad_norm": 1.84897780418396, + "learning_rate": 4.6546195434882895e-05, + "loss": 3.8368, + "step": 28469 + }, + { + "epoch": 0.16931915500999142, + "grad_norm": 1.7867851257324219, + "learning_rate": 4.65459585341707e-05, + "loss": 3.7485, + "step": 28470 + }, + { + "epoch": 0.16932510229327244, + "grad_norm": 1.8112739324569702, + "learning_rate": 4.654572162593703e-05, + "loss": 3.7541, + "step": 28471 + }, + { + "epoch": 0.16933104957655343, + "grad_norm": 1.7835328578948975, + "learning_rate": 4.6545484710181974e-05, + "loss": 3.8461, + "step": 28472 + }, + { + "epoch": 0.16933699685983442, + "grad_norm": 1.7823615074157715, + "learning_rate": 4.6545247786905614e-05, + "loss": 3.7878, + "step": 28473 + }, + { + "epoch": 0.16934294414311543, + "grad_norm": 1.8897929191589355, + "learning_rate": 4.654501085610802e-05, + "loss": 3.8613, + "step": 28474 + }, + { + "epoch": 0.16934889142639642, + "grad_norm": 1.9433989524841309, + "learning_rate": 4.654477391778929e-05, + "loss": 3.7189, + "step": 28475 + }, + { + "epoch": 0.1693548387096774, + "grad_norm": 1.688061237335205, + "learning_rate": 4.6544536971949504e-05, + "loss": 4.1471, + "step": 28476 + }, + { + "epoch": 0.16936078599295842, + "grad_norm": 1.9753577709197998, + "learning_rate": 4.654430001858874e-05, + "loss": 4.1729, + "step": 28477 + }, + { + "epoch": 0.1693667332762394, + "grad_norm": 1.6471655368804932, + "learning_rate": 4.654406305770709e-05, + "loss": 5.4232, + "step": 28478 + }, + { + "epoch": 0.1693726805595204, + "grad_norm": 1.5919240713119507, + "learning_rate": 4.6543826089304626e-05, + "loss": 5.6299, + "step": 28479 + }, + { + "epoch": 0.16937862784280142, + "grad_norm": 1.505886435508728, + "learning_rate": 4.6543589113381434e-05, + "loss": 5.472, + "step": 28480 + }, + { + "epoch": 0.1693845751260824, + "grad_norm": 1.3407920598983765, + "learning_rate": 4.65433521299376e-05, + "loss": 5.4519, + "step": 28481 + }, + { + "epoch": 0.1693905224093634, + "grad_norm": 1.785452127456665, + "learning_rate": 4.65431151389732e-05, + "loss": 5.0539, + "step": 28482 + }, + { + "epoch": 0.1693964696926444, + "grad_norm": 1.6076501607894897, + "learning_rate": 4.654287814048833e-05, + "loss": 5.5523, + "step": 28483 + }, + { + "epoch": 0.1694024169759254, + "grad_norm": 1.7751826047897339, + "learning_rate": 4.654264113448306e-05, + "loss": 5.3904, + "step": 28484 + }, + { + "epoch": 0.16940836425920638, + "grad_norm": 2.516270160675049, + "learning_rate": 4.6542404120957465e-05, + "loss": 3.6737, + "step": 28485 + }, + { + "epoch": 0.1694143115424874, + "grad_norm": 2.094210386276245, + "learning_rate": 4.654216709991165e-05, + "loss": 3.3822, + "step": 28486 + }, + { + "epoch": 0.1694202588257684, + "grad_norm": 1.9401110410690308, + "learning_rate": 4.6541930071345685e-05, + "loss": 3.3866, + "step": 28487 + }, + { + "epoch": 0.16942620610904938, + "grad_norm": 1.6965755224227905, + "learning_rate": 4.654169303525966e-05, + "loss": 4.8492, + "step": 28488 + }, + { + "epoch": 0.1694321533923304, + "grad_norm": 2.676941156387329, + "learning_rate": 4.654145599165365e-05, + "loss": 4.4578, + "step": 28489 + }, + { + "epoch": 0.16943810067561138, + "grad_norm": 2.53593111038208, + "learning_rate": 4.654121894052773e-05, + "loss": 3.9574, + "step": 28490 + }, + { + "epoch": 0.16944404795889237, + "grad_norm": 2.355025053024292, + "learning_rate": 4.6540981881882006e-05, + "loss": 4.0911, + "step": 28491 + }, + { + "epoch": 0.16944999524217338, + "grad_norm": 2.2941341400146484, + "learning_rate": 4.654074481571654e-05, + "loss": 4.3186, + "step": 28492 + }, + { + "epoch": 0.16945594252545437, + "grad_norm": 2.2436282634735107, + "learning_rate": 4.654050774203143e-05, + "loss": 4.0785, + "step": 28493 + }, + { + "epoch": 0.16946188980873536, + "grad_norm": 2.8532540798187256, + "learning_rate": 4.6540270660826744e-05, + "loss": 3.2517, + "step": 28494 + }, + { + "epoch": 0.16946783709201638, + "grad_norm": 2.7810893058776855, + "learning_rate": 4.6540033572102575e-05, + "loss": 3.462, + "step": 28495 + }, + { + "epoch": 0.16947378437529736, + "grad_norm": 2.5841453075408936, + "learning_rate": 4.6539796475859004e-05, + "loss": 4.4611, + "step": 28496 + }, + { + "epoch": 0.16947973165857835, + "grad_norm": 2.433039903640747, + "learning_rate": 4.653955937209611e-05, + "loss": 3.7666, + "step": 28497 + }, + { + "epoch": 0.16948567894185937, + "grad_norm": 1.7830419540405273, + "learning_rate": 4.6539322260813984e-05, + "loss": 4.9613, + "step": 28498 + }, + { + "epoch": 0.16949162622514036, + "grad_norm": 1.8452028036117554, + "learning_rate": 4.653908514201269e-05, + "loss": 5.0721, + "step": 28499 + }, + { + "epoch": 0.16949757350842135, + "grad_norm": 1.9641203880310059, + "learning_rate": 4.6538848015692336e-05, + "loss": 4.2726, + "step": 28500 + }, + { + "epoch": 0.16950352079170236, + "grad_norm": 2.1620960235595703, + "learning_rate": 4.6538610881853e-05, + "loss": 3.9638, + "step": 28501 + }, + { + "epoch": 0.16950946807498335, + "grad_norm": 1.977523922920227, + "learning_rate": 4.6538373740494737e-05, + "loss": 4.0448, + "step": 28502 + }, + { + "epoch": 0.16951541535826434, + "grad_norm": 1.7069354057312012, + "learning_rate": 4.653813659161766e-05, + "loss": 4.053, + "step": 28503 + }, + { + "epoch": 0.16952136264154535, + "grad_norm": 1.8894158601760864, + "learning_rate": 4.653789943522184e-05, + "loss": 4.1357, + "step": 28504 + }, + { + "epoch": 0.16952730992482634, + "grad_norm": 1.8103679418563843, + "learning_rate": 4.6537662271307366e-05, + "loss": 3.8426, + "step": 28505 + }, + { + "epoch": 0.16953325720810733, + "grad_norm": 1.6966679096221924, + "learning_rate": 4.653742509987431e-05, + "loss": 3.9686, + "step": 28506 + }, + { + "epoch": 0.16953920449138835, + "grad_norm": 1.8758342266082764, + "learning_rate": 4.653718792092278e-05, + "loss": 3.7168, + "step": 28507 + }, + { + "epoch": 0.16954515177466933, + "grad_norm": 1.738481879234314, + "learning_rate": 4.6536950734452824e-05, + "loss": 4.0376, + "step": 28508 + }, + { + "epoch": 0.16955109905795032, + "grad_norm": 1.8814899921417236, + "learning_rate": 4.653671354046454e-05, + "loss": 3.7981, + "step": 28509 + }, + { + "epoch": 0.16955704634123134, + "grad_norm": 1.7275527715682983, + "learning_rate": 4.653647633895801e-05, + "loss": 3.7576, + "step": 28510 + }, + { + "epoch": 0.16956299362451233, + "grad_norm": 1.5637880563735962, + "learning_rate": 4.6536239129933326e-05, + "loss": 5.5343, + "step": 28511 + }, + { + "epoch": 0.16956894090779331, + "grad_norm": 1.6974562406539917, + "learning_rate": 4.653600191339056e-05, + "loss": 5.9386, + "step": 28512 + }, + { + "epoch": 0.16957488819107433, + "grad_norm": 2.0787951946258545, + "learning_rate": 4.65357646893298e-05, + "loss": 5.6018, + "step": 28513 + }, + { + "epoch": 0.16958083547435532, + "grad_norm": 2.0893337726593018, + "learning_rate": 4.653552745775113e-05, + "loss": 5.5357, + "step": 28514 + }, + { + "epoch": 0.1695867827576363, + "grad_norm": 2.1055009365081787, + "learning_rate": 4.6535290218654624e-05, + "loss": 5.6448, + "step": 28515 + }, + { + "epoch": 0.16959273004091732, + "grad_norm": 2.247347116470337, + "learning_rate": 4.653505297204037e-05, + "loss": 4.0233, + "step": 28516 + }, + { + "epoch": 0.1695986773241983, + "grad_norm": 1.5102436542510986, + "learning_rate": 4.653481571790846e-05, + "loss": 5.1274, + "step": 28517 + }, + { + "epoch": 0.1696046246074793, + "grad_norm": 1.5515743494033813, + "learning_rate": 4.653457845625896e-05, + "loss": 6.1905, + "step": 28518 + }, + { + "epoch": 0.16961057189076031, + "grad_norm": 1.5858293771743774, + "learning_rate": 4.6534341187091965e-05, + "loss": 5.2316, + "step": 28519 + }, + { + "epoch": 0.1696165191740413, + "grad_norm": 3.305469274520874, + "learning_rate": 4.653410391040755e-05, + "loss": 4.022, + "step": 28520 + }, + { + "epoch": 0.1696224664573223, + "grad_norm": 1.6751025915145874, + "learning_rate": 4.6533866626205805e-05, + "loss": 5.2442, + "step": 28521 + }, + { + "epoch": 0.16962841374060328, + "grad_norm": 1.777486801147461, + "learning_rate": 4.653362933448681e-05, + "loss": 5.0407, + "step": 28522 + }, + { + "epoch": 0.1696343610238843, + "grad_norm": 1.5896446704864502, + "learning_rate": 4.653339203525065e-05, + "loss": 4.807, + "step": 28523 + }, + { + "epoch": 0.16964030830716528, + "grad_norm": 1.9087060689926147, + "learning_rate": 4.65331547284974e-05, + "loss": 5.0863, + "step": 28524 + }, + { + "epoch": 0.16964625559044627, + "grad_norm": 1.7064319849014282, + "learning_rate": 4.653291741422715e-05, + "loss": 5.2761, + "step": 28525 + }, + { + "epoch": 0.1696522028737273, + "grad_norm": 1.5838422775268555, + "learning_rate": 4.6532680092439986e-05, + "loss": 5.316, + "step": 28526 + }, + { + "epoch": 0.16965815015700828, + "grad_norm": 1.702512264251709, + "learning_rate": 4.653244276313598e-05, + "loss": 5.2548, + "step": 28527 + }, + { + "epoch": 0.16966409744028926, + "grad_norm": 1.4088670015335083, + "learning_rate": 4.6532205426315215e-05, + "loss": 5.1767, + "step": 28528 + }, + { + "epoch": 0.16967004472357028, + "grad_norm": 2.7728757858276367, + "learning_rate": 4.653196808197779e-05, + "loss": 4.5771, + "step": 28529 + }, + { + "epoch": 0.16967599200685127, + "grad_norm": 2.977949857711792, + "learning_rate": 4.653173073012377e-05, + "loss": 4.2778, + "step": 28530 + }, + { + "epoch": 0.16968193929013226, + "grad_norm": 2.986652374267578, + "learning_rate": 4.6531493370753254e-05, + "loss": 4.1076, + "step": 28531 + }, + { + "epoch": 0.16968788657341327, + "grad_norm": 2.596334934234619, + "learning_rate": 4.6531256003866305e-05, + "loss": 3.6769, + "step": 28532 + }, + { + "epoch": 0.16969383385669426, + "grad_norm": 2.381591796875, + "learning_rate": 4.653101862946303e-05, + "loss": 3.9261, + "step": 28533 + }, + { + "epoch": 0.16969978113997525, + "grad_norm": 2.287313938140869, + "learning_rate": 4.653078124754349e-05, + "loss": 4.4583, + "step": 28534 + }, + { + "epoch": 0.16970572842325626, + "grad_norm": 1.716257929801941, + "learning_rate": 4.6530543858107776e-05, + "loss": 5.1735, + "step": 28535 + }, + { + "epoch": 0.16971167570653725, + "grad_norm": 1.5777500867843628, + "learning_rate": 4.6530306461155976e-05, + "loss": 4.958, + "step": 28536 + }, + { + "epoch": 0.16971762298981824, + "grad_norm": 1.6747970581054688, + "learning_rate": 4.653006905668817e-05, + "loss": 4.6559, + "step": 28537 + }, + { + "epoch": 0.16972357027309926, + "grad_norm": 1.8283017873764038, + "learning_rate": 4.652983164470444e-05, + "loss": 4.4711, + "step": 28538 + }, + { + "epoch": 0.16972951755638024, + "grad_norm": 2.753277063369751, + "learning_rate": 4.652959422520485e-05, + "loss": 3.9467, + "step": 28539 + }, + { + "epoch": 0.16973546483966123, + "grad_norm": 1.993268370628357, + "learning_rate": 4.652935679818952e-05, + "loss": 4.8315, + "step": 28540 + }, + { + "epoch": 0.16974141212294225, + "grad_norm": 1.7056300640106201, + "learning_rate": 4.652911936365851e-05, + "loss": 5.6509, + "step": 28541 + }, + { + "epoch": 0.16974735940622324, + "grad_norm": 1.6653499603271484, + "learning_rate": 4.6528881921611904e-05, + "loss": 5.5002, + "step": 28542 + }, + { + "epoch": 0.16975330668950422, + "grad_norm": 1.5368744134902954, + "learning_rate": 4.6528644472049795e-05, + "loss": 5.0847, + "step": 28543 + }, + { + "epoch": 0.16975925397278524, + "grad_norm": 1.597609043121338, + "learning_rate": 4.6528407014972255e-05, + "loss": 5.4779, + "step": 28544 + }, + { + "epoch": 0.16976520125606623, + "grad_norm": 1.5362802743911743, + "learning_rate": 4.6528169550379364e-05, + "loss": 4.931, + "step": 28545 + }, + { + "epoch": 0.16977114853934722, + "grad_norm": 1.4700133800506592, + "learning_rate": 4.652793207827122e-05, + "loss": 5.6209, + "step": 28546 + }, + { + "epoch": 0.16977709582262823, + "grad_norm": 2.0117483139038086, + "learning_rate": 4.652769459864788e-05, + "loss": 4.7425, + "step": 28547 + }, + { + "epoch": 0.16978304310590922, + "grad_norm": 1.4520665407180786, + "learning_rate": 4.652745711150946e-05, + "loss": 5.135, + "step": 28548 + }, + { + "epoch": 0.1697889903891902, + "grad_norm": 1.5992931127548218, + "learning_rate": 4.6527219616856036e-05, + "loss": 5.2732, + "step": 28549 + }, + { + "epoch": 0.16979493767247122, + "grad_norm": 1.689389944076538, + "learning_rate": 4.6526982114687666e-05, + "loss": 5.1537, + "step": 28550 + }, + { + "epoch": 0.1698008849557522, + "grad_norm": 1.5059309005737305, + "learning_rate": 4.652674460500446e-05, + "loss": 4.9021, + "step": 28551 + }, + { + "epoch": 0.1698068322390332, + "grad_norm": 2.6482186317443848, + "learning_rate": 4.652650708780648e-05, + "loss": 4.9221, + "step": 28552 + }, + { + "epoch": 0.16981277952231422, + "grad_norm": 1.7961699962615967, + "learning_rate": 4.652626956309382e-05, + "loss": 5.3804, + "step": 28553 + }, + { + "epoch": 0.1698187268055952, + "grad_norm": 1.704698085784912, + "learning_rate": 4.652603203086656e-05, + "loss": 5.775, + "step": 28554 + }, + { + "epoch": 0.1698246740888762, + "grad_norm": 1.7374398708343506, + "learning_rate": 4.65257944911248e-05, + "loss": 5.6455, + "step": 28555 + }, + { + "epoch": 0.1698306213721572, + "grad_norm": 1.5410466194152832, + "learning_rate": 4.652555694386859e-05, + "loss": 5.7316, + "step": 28556 + }, + { + "epoch": 0.1698365686554382, + "grad_norm": 1.5294291973114014, + "learning_rate": 4.652531938909804e-05, + "loss": 5.0427, + "step": 28557 + }, + { + "epoch": 0.16984251593871919, + "grad_norm": 2.2420549392700195, + "learning_rate": 4.652508182681322e-05, + "loss": 3.8954, + "step": 28558 + }, + { + "epoch": 0.1698484632220002, + "grad_norm": 1.640631079673767, + "learning_rate": 4.652484425701422e-05, + "loss": 5.2021, + "step": 28559 + }, + { + "epoch": 0.1698544105052812, + "grad_norm": 1.3961762189865112, + "learning_rate": 4.652460667970111e-05, + "loss": 4.6562, + "step": 28560 + }, + { + "epoch": 0.16986035778856218, + "grad_norm": 1.408497929573059, + "learning_rate": 4.6524369094873985e-05, + "loss": 5.2449, + "step": 28561 + }, + { + "epoch": 0.1698663050718432, + "grad_norm": 1.544072151184082, + "learning_rate": 4.6524131502532934e-05, + "loss": 5.1623, + "step": 28562 + }, + { + "epoch": 0.16987225235512418, + "grad_norm": 1.4092038869857788, + "learning_rate": 4.652389390267802e-05, + "loss": 5.1672, + "step": 28563 + }, + { + "epoch": 0.16987819963840517, + "grad_norm": 1.533828616142273, + "learning_rate": 4.6523656295309346e-05, + "loss": 5.1873, + "step": 28564 + }, + { + "epoch": 0.16988414692168619, + "grad_norm": 1.690058946609497, + "learning_rate": 4.6523418680426986e-05, + "loss": 5.1518, + "step": 28565 + }, + { + "epoch": 0.16989009420496717, + "grad_norm": 1.192253828048706, + "learning_rate": 4.652318105803102e-05, + "loss": 5.1708, + "step": 28566 + }, + { + "epoch": 0.16989604148824816, + "grad_norm": 1.6222058534622192, + "learning_rate": 4.6522943428121526e-05, + "loss": 5.2261, + "step": 28567 + }, + { + "epoch": 0.16990198877152918, + "grad_norm": 1.9990545511245728, + "learning_rate": 4.65227057906986e-05, + "loss": 5.1013, + "step": 28568 + }, + { + "epoch": 0.16990793605481017, + "grad_norm": 1.929602861404419, + "learning_rate": 4.652246814576233e-05, + "loss": 4.8618, + "step": 28569 + }, + { + "epoch": 0.16991388333809115, + "grad_norm": 1.3916577100753784, + "learning_rate": 4.6522230493312777e-05, + "loss": 4.929, + "step": 28570 + }, + { + "epoch": 0.16991983062137217, + "grad_norm": 1.7045917510986328, + "learning_rate": 4.6521992833350036e-05, + "loss": 4.925, + "step": 28571 + }, + { + "epoch": 0.16992577790465316, + "grad_norm": 1.68044114112854, + "learning_rate": 4.6521755165874194e-05, + "loss": 5.3032, + "step": 28572 + }, + { + "epoch": 0.16993172518793415, + "grad_norm": 1.747460126876831, + "learning_rate": 4.652151749088533e-05, + "loss": 5.1043, + "step": 28573 + }, + { + "epoch": 0.16993767247121516, + "grad_norm": 1.7225557565689087, + "learning_rate": 4.6521279808383526e-05, + "loss": 4.7359, + "step": 28574 + }, + { + "epoch": 0.16994361975449615, + "grad_norm": 1.9875255823135376, + "learning_rate": 4.652104211836886e-05, + "loss": 3.912, + "step": 28575 + }, + { + "epoch": 0.16994956703777714, + "grad_norm": 1.898094654083252, + "learning_rate": 4.652080442084142e-05, + "loss": 4.012, + "step": 28576 + }, + { + "epoch": 0.16995551432105815, + "grad_norm": 1.8791594505310059, + "learning_rate": 4.65205667158013e-05, + "loss": 3.8007, + "step": 28577 + }, + { + "epoch": 0.16996146160433914, + "grad_norm": 1.85286545753479, + "learning_rate": 4.652032900324857e-05, + "loss": 3.8686, + "step": 28578 + }, + { + "epoch": 0.16996740888762013, + "grad_norm": 1.8084555864334106, + "learning_rate": 4.652009128318331e-05, + "loss": 3.8287, + "step": 28579 + }, + { + "epoch": 0.16997335617090112, + "grad_norm": 1.8365230560302734, + "learning_rate": 4.651985355560562e-05, + "loss": 3.8072, + "step": 28580 + }, + { + "epoch": 0.16997930345418213, + "grad_norm": 1.8318002223968506, + "learning_rate": 4.651961582051555e-05, + "loss": 3.5751, + "step": 28581 + }, + { + "epoch": 0.16998525073746312, + "grad_norm": 2.9217238426208496, + "learning_rate": 4.651937807791322e-05, + "loss": 4.3074, + "step": 28582 + }, + { + "epoch": 0.1699911980207441, + "grad_norm": 1.8495897054672241, + "learning_rate": 4.651914032779869e-05, + "loss": 3.5268, + "step": 28583 + }, + { + "epoch": 0.16999714530402513, + "grad_norm": 1.7885898351669312, + "learning_rate": 4.651890257017206e-05, + "loss": 3.2383, + "step": 28584 + }, + { + "epoch": 0.17000309258730611, + "grad_norm": 1.9159060716629028, + "learning_rate": 4.6518664805033395e-05, + "loss": 3.7259, + "step": 28585 + }, + { + "epoch": 0.1700090398705871, + "grad_norm": 1.733549952507019, + "learning_rate": 4.6518427032382793e-05, + "loss": 5.1259, + "step": 28586 + }, + { + "epoch": 0.17001498715386812, + "grad_norm": 2.508037805557251, + "learning_rate": 4.651818925222033e-05, + "loss": 3.8367, + "step": 28587 + }, + { + "epoch": 0.1700209344371491, + "grad_norm": 2.5397400856018066, + "learning_rate": 4.651795146454608e-05, + "loss": 3.4588, + "step": 28588 + }, + { + "epoch": 0.1700268817204301, + "grad_norm": 2.3859269618988037, + "learning_rate": 4.651771366936015e-05, + "loss": 3.3977, + "step": 28589 + }, + { + "epoch": 0.1700328290037111, + "grad_norm": 1.8520206212997437, + "learning_rate": 4.65174758666626e-05, + "loss": 4.0797, + "step": 28590 + }, + { + "epoch": 0.1700387762869921, + "grad_norm": 2.0465288162231445, + "learning_rate": 4.651723805645352e-05, + "loss": 3.2528, + "step": 28591 + }, + { + "epoch": 0.1700447235702731, + "grad_norm": 2.100496530532837, + "learning_rate": 4.651700023873299e-05, + "loss": 2.9472, + "step": 28592 + }, + { + "epoch": 0.1700506708535541, + "grad_norm": 2.4353413581848145, + "learning_rate": 4.6516762413501106e-05, + "loss": 3.161, + "step": 28593 + }, + { + "epoch": 0.1700566181368351, + "grad_norm": 2.609565019607544, + "learning_rate": 4.651652458075794e-05, + "loss": 3.5234, + "step": 28594 + }, + { + "epoch": 0.17006256542011608, + "grad_norm": 2.2567410469055176, + "learning_rate": 4.651628674050358e-05, + "loss": 3.5863, + "step": 28595 + }, + { + "epoch": 0.1700685127033971, + "grad_norm": 2.6345736980438232, + "learning_rate": 4.6516048892738104e-05, + "loss": 3.5194, + "step": 28596 + }, + { + "epoch": 0.17007445998667808, + "grad_norm": 1.9039238691329956, + "learning_rate": 4.65158110374616e-05, + "loss": 4.0329, + "step": 28597 + }, + { + "epoch": 0.17008040726995907, + "grad_norm": 1.6507738828659058, + "learning_rate": 4.6515573174674143e-05, + "loss": 4.9022, + "step": 28598 + }, + { + "epoch": 0.1700863545532401, + "grad_norm": 1.6945186853408813, + "learning_rate": 4.651533530437583e-05, + "loss": 4.9487, + "step": 28599 + }, + { + "epoch": 0.17009230183652108, + "grad_norm": 1.8337676525115967, + "learning_rate": 4.651509742656673e-05, + "loss": 5.1238, + "step": 28600 + }, + { + "epoch": 0.17009824911980206, + "grad_norm": 1.4968239068984985, + "learning_rate": 4.651485954124694e-05, + "loss": 4.782, + "step": 28601 + }, + { + "epoch": 0.17010419640308308, + "grad_norm": 1.8200058937072754, + "learning_rate": 4.651462164841652e-05, + "loss": 5.3675, + "step": 28602 + }, + { + "epoch": 0.17011014368636407, + "grad_norm": 1.788134217262268, + "learning_rate": 4.6514383748075575e-05, + "loss": 4.6486, + "step": 28603 + }, + { + "epoch": 0.17011609096964506, + "grad_norm": 1.6064730882644653, + "learning_rate": 4.6514145840224184e-05, + "loss": 4.4153, + "step": 28604 + }, + { + "epoch": 0.17012203825292607, + "grad_norm": 1.4705356359481812, + "learning_rate": 4.651390792486242e-05, + "loss": 4.7254, + "step": 28605 + }, + { + "epoch": 0.17012798553620706, + "grad_norm": 1.5670931339263916, + "learning_rate": 4.6513670001990385e-05, + "loss": 5.0288, + "step": 28606 + }, + { + "epoch": 0.17013393281948805, + "grad_norm": 1.9141185283660889, + "learning_rate": 4.651343207160814e-05, + "loss": 5.0111, + "step": 28607 + }, + { + "epoch": 0.17013988010276906, + "grad_norm": 1.485753059387207, + "learning_rate": 4.6513194133715776e-05, + "loss": 5.0013, + "step": 28608 + }, + { + "epoch": 0.17014582738605005, + "grad_norm": 1.6797868013381958, + "learning_rate": 4.651295618831338e-05, + "loss": 5.0576, + "step": 28609 + }, + { + "epoch": 0.17015177466933104, + "grad_norm": 2.6057140827178955, + "learning_rate": 4.651271823540104e-05, + "loss": 3.9116, + "step": 28610 + }, + { + "epoch": 0.17015772195261206, + "grad_norm": 2.83886456489563, + "learning_rate": 4.651248027497883e-05, + "loss": 4.3674, + "step": 28611 + }, + { + "epoch": 0.17016366923589304, + "grad_norm": 2.470137596130371, + "learning_rate": 4.6512242307046834e-05, + "loss": 4.5506, + "step": 28612 + }, + { + "epoch": 0.17016961651917403, + "grad_norm": 2.0518956184387207, + "learning_rate": 4.6512004331605134e-05, + "loss": 4.9991, + "step": 28613 + }, + { + "epoch": 0.17017556380245505, + "grad_norm": 2.012444257736206, + "learning_rate": 4.6511766348653816e-05, + "loss": 4.6678, + "step": 28614 + }, + { + "epoch": 0.17018151108573604, + "grad_norm": 2.152315616607666, + "learning_rate": 4.651152835819297e-05, + "loss": 3.7695, + "step": 28615 + }, + { + "epoch": 0.17018745836901703, + "grad_norm": 2.255277156829834, + "learning_rate": 4.6511290360222664e-05, + "loss": 3.861, + "step": 28616 + }, + { + "epoch": 0.17019340565229804, + "grad_norm": 2.317800998687744, + "learning_rate": 4.651105235474299e-05, + "loss": 3.813, + "step": 28617 + }, + { + "epoch": 0.17019935293557903, + "grad_norm": 2.330914258956909, + "learning_rate": 4.651081434175403e-05, + "loss": 3.6723, + "step": 28618 + }, + { + "epoch": 0.17020530021886002, + "grad_norm": 2.112302541732788, + "learning_rate": 4.651057632125587e-05, + "loss": 3.6212, + "step": 28619 + }, + { + "epoch": 0.17021124750214103, + "grad_norm": 1.9216437339782715, + "learning_rate": 4.651033829324859e-05, + "loss": 4.3208, + "step": 28620 + }, + { + "epoch": 0.17021719478542202, + "grad_norm": 1.9902441501617432, + "learning_rate": 4.651010025773227e-05, + "loss": 4.7577, + "step": 28621 + }, + { + "epoch": 0.170223142068703, + "grad_norm": 1.7886050939559937, + "learning_rate": 4.6509862214707e-05, + "loss": 4.494, + "step": 28622 + }, + { + "epoch": 0.17022908935198403, + "grad_norm": 1.8544505834579468, + "learning_rate": 4.650962416417285e-05, + "loss": 5.4149, + "step": 28623 + }, + { + "epoch": 0.170235036635265, + "grad_norm": 1.682219386100769, + "learning_rate": 4.650938610612992e-05, + "loss": 5.434, + "step": 28624 + }, + { + "epoch": 0.170240983918546, + "grad_norm": 2.096231698989868, + "learning_rate": 4.650914804057829e-05, + "loss": 4.3005, + "step": 28625 + }, + { + "epoch": 0.17024693120182702, + "grad_norm": 2.311213970184326, + "learning_rate": 4.650890996751803e-05, + "loss": 3.7311, + "step": 28626 + }, + { + "epoch": 0.170252878485108, + "grad_norm": 1.9578297138214111, + "learning_rate": 4.650867188694924e-05, + "loss": 4.6696, + "step": 28627 + }, + { + "epoch": 0.170258825768389, + "grad_norm": 2.9123547077178955, + "learning_rate": 4.650843379887199e-05, + "loss": 3.8884, + "step": 28628 + }, + { + "epoch": 0.17026477305167, + "grad_norm": 2.6703314781188965, + "learning_rate": 4.650819570328636e-05, + "loss": 3.9453, + "step": 28629 + }, + { + "epoch": 0.170270720334951, + "grad_norm": 1.7576513290405273, + "learning_rate": 4.6507957600192454e-05, + "loss": 4.8754, + "step": 28630 + }, + { + "epoch": 0.17027666761823199, + "grad_norm": 1.6122910976409912, + "learning_rate": 4.650771948959033e-05, + "loss": 5.0507, + "step": 28631 + }, + { + "epoch": 0.170282614901513, + "grad_norm": 1.5017814636230469, + "learning_rate": 4.650748137148009e-05, + "loss": 4.9571, + "step": 28632 + }, + { + "epoch": 0.170288562184794, + "grad_norm": 1.4443883895874023, + "learning_rate": 4.6507243245861815e-05, + "loss": 4.524, + "step": 28633 + }, + { + "epoch": 0.17029450946807498, + "grad_norm": 1.8001708984375, + "learning_rate": 4.650700511273558e-05, + "loss": 4.8942, + "step": 28634 + }, + { + "epoch": 0.170300456751356, + "grad_norm": 2.039597749710083, + "learning_rate": 4.650676697210147e-05, + "loss": 5.0357, + "step": 28635 + }, + { + "epoch": 0.17030640403463698, + "grad_norm": 1.7828583717346191, + "learning_rate": 4.650652882395957e-05, + "loss": 4.8489, + "step": 28636 + }, + { + "epoch": 0.17031235131791797, + "grad_norm": 2.0128636360168457, + "learning_rate": 4.650629066830996e-05, + "loss": 4.3581, + "step": 28637 + }, + { + "epoch": 0.17031829860119896, + "grad_norm": 1.6843047142028809, + "learning_rate": 4.650605250515273e-05, + "loss": 5.2302, + "step": 28638 + }, + { + "epoch": 0.17032424588447997, + "grad_norm": 1.6175137758255005, + "learning_rate": 4.650581433448796e-05, + "loss": 5.2985, + "step": 28639 + }, + { + "epoch": 0.17033019316776096, + "grad_norm": 1.982064962387085, + "learning_rate": 4.6505576156315734e-05, + "loss": 4.8775, + "step": 28640 + }, + { + "epoch": 0.17033614045104195, + "grad_norm": 1.9722973108291626, + "learning_rate": 4.650533797063613e-05, + "loss": 4.6054, + "step": 28641 + }, + { + "epoch": 0.17034208773432297, + "grad_norm": 2.2383551597595215, + "learning_rate": 4.650509977744923e-05, + "loss": 4.2201, + "step": 28642 + }, + { + "epoch": 0.17034803501760395, + "grad_norm": 1.647186040878296, + "learning_rate": 4.650486157675513e-05, + "loss": 4.8552, + "step": 28643 + }, + { + "epoch": 0.17035398230088494, + "grad_norm": 2.658078193664551, + "learning_rate": 4.650462336855391e-05, + "loss": 4.0346, + "step": 28644 + }, + { + "epoch": 0.17035992958416596, + "grad_norm": 1.9004065990447998, + "learning_rate": 4.650438515284564e-05, + "loss": 4.7588, + "step": 28645 + }, + { + "epoch": 0.17036587686744695, + "grad_norm": 1.6584961414337158, + "learning_rate": 4.650414692963041e-05, + "loss": 5.0345, + "step": 28646 + }, + { + "epoch": 0.17037182415072794, + "grad_norm": 1.6760051250457764, + "learning_rate": 4.650390869890831e-05, + "loss": 5.2614, + "step": 28647 + }, + { + "epoch": 0.17037777143400895, + "grad_norm": 1.538028597831726, + "learning_rate": 4.650367046067942e-05, + "loss": 5.3746, + "step": 28648 + }, + { + "epoch": 0.17038371871728994, + "grad_norm": 1.592532992362976, + "learning_rate": 4.650343221494381e-05, + "loss": 5.2738, + "step": 28649 + }, + { + "epoch": 0.17038966600057093, + "grad_norm": 1.472048044204712, + "learning_rate": 4.650319396170158e-05, + "loss": 5.1399, + "step": 28650 + }, + { + "epoch": 0.17039561328385194, + "grad_norm": 1.570019245147705, + "learning_rate": 4.650295570095281e-05, + "loss": 5.199, + "step": 28651 + }, + { + "epoch": 0.17040156056713293, + "grad_norm": 1.82230806350708, + "learning_rate": 4.6502717432697577e-05, + "loss": 5.1108, + "step": 28652 + }, + { + "epoch": 0.17040750785041392, + "grad_norm": 1.9128144979476929, + "learning_rate": 4.650247915693596e-05, + "loss": 5.1805, + "step": 28653 + }, + { + "epoch": 0.17041345513369494, + "grad_norm": 1.683923363685608, + "learning_rate": 4.650224087366806e-05, + "loss": 5.203, + "step": 28654 + }, + { + "epoch": 0.17041940241697592, + "grad_norm": 1.5329160690307617, + "learning_rate": 4.6502002582893944e-05, + "loss": 4.8658, + "step": 28655 + }, + { + "epoch": 0.1704253497002569, + "grad_norm": 2.3513686656951904, + "learning_rate": 4.65017642846137e-05, + "loss": 4.9593, + "step": 28656 + }, + { + "epoch": 0.17043129698353793, + "grad_norm": 1.7208911180496216, + "learning_rate": 4.650152597882742e-05, + "loss": 5.2315, + "step": 28657 + }, + { + "epoch": 0.17043724426681892, + "grad_norm": 1.7835557460784912, + "learning_rate": 4.650128766553518e-05, + "loss": 5.2212, + "step": 28658 + }, + { + "epoch": 0.1704431915500999, + "grad_norm": 2.004202365875244, + "learning_rate": 4.650104934473705e-05, + "loss": 4.8766, + "step": 28659 + }, + { + "epoch": 0.17044913883338092, + "grad_norm": 1.7374918460845947, + "learning_rate": 4.650081101643314e-05, + "loss": 5.3659, + "step": 28660 + }, + { + "epoch": 0.1704550861166619, + "grad_norm": 1.5580469369888306, + "learning_rate": 4.650057268062351e-05, + "loss": 5.012, + "step": 28661 + }, + { + "epoch": 0.1704610333999429, + "grad_norm": 1.7098673582077026, + "learning_rate": 4.650033433730826e-05, + "loss": 5.0506, + "step": 28662 + }, + { + "epoch": 0.1704669806832239, + "grad_norm": 1.7775324583053589, + "learning_rate": 4.6500095986487454e-05, + "loss": 5.3536, + "step": 28663 + }, + { + "epoch": 0.1704729279665049, + "grad_norm": 1.7413294315338135, + "learning_rate": 4.649985762816119e-05, + "loss": 5.2773, + "step": 28664 + }, + { + "epoch": 0.1704788752497859, + "grad_norm": 1.791043996810913, + "learning_rate": 4.649961926232955e-05, + "loss": 5.1409, + "step": 28665 + }, + { + "epoch": 0.1704848225330669, + "grad_norm": 1.8042404651641846, + "learning_rate": 4.649938088899262e-05, + "loss": 5.3099, + "step": 28666 + }, + { + "epoch": 0.1704907698163479, + "grad_norm": 2.329183340072632, + "learning_rate": 4.649914250815047e-05, + "loss": 4.631, + "step": 28667 + }, + { + "epoch": 0.17049671709962888, + "grad_norm": 2.9833004474639893, + "learning_rate": 4.64989041198032e-05, + "loss": 5.1604, + "step": 28668 + }, + { + "epoch": 0.1705026643829099, + "grad_norm": 3.150871992111206, + "learning_rate": 4.649866572395088e-05, + "loss": 5.0831, + "step": 28669 + }, + { + "epoch": 0.17050861166619088, + "grad_norm": 1.6283338069915771, + "learning_rate": 4.64984273205936e-05, + "loss": 5.1733, + "step": 28670 + }, + { + "epoch": 0.17051455894947187, + "grad_norm": 1.6267815828323364, + "learning_rate": 4.649818890973143e-05, + "loss": 5.3692, + "step": 28671 + }, + { + "epoch": 0.1705205062327529, + "grad_norm": 1.638006567955017, + "learning_rate": 4.649795049136448e-05, + "loss": 5.5058, + "step": 28672 + }, + { + "epoch": 0.17052645351603388, + "grad_norm": 1.605161428451538, + "learning_rate": 4.649771206549281e-05, + "loss": 4.9665, + "step": 28673 + }, + { + "epoch": 0.17053240079931486, + "grad_norm": 1.762798547744751, + "learning_rate": 4.649747363211652e-05, + "loss": 4.6831, + "step": 28674 + }, + { + "epoch": 0.17053834808259588, + "grad_norm": 2.23942494392395, + "learning_rate": 4.649723519123567e-05, + "loss": 4.6154, + "step": 28675 + }, + { + "epoch": 0.17054429536587687, + "grad_norm": 1.6567063331604004, + "learning_rate": 4.649699674285036e-05, + "loss": 5.0949, + "step": 28676 + }, + { + "epoch": 0.17055024264915786, + "grad_norm": 1.4644149541854858, + "learning_rate": 4.649675828696067e-05, + "loss": 5.5432, + "step": 28677 + }, + { + "epoch": 0.17055618993243887, + "grad_norm": 1.7737239599227905, + "learning_rate": 4.6496519823566695e-05, + "loss": 5.0056, + "step": 28678 + }, + { + "epoch": 0.17056213721571986, + "grad_norm": 2.3689754009246826, + "learning_rate": 4.64962813526685e-05, + "loss": 3.7473, + "step": 28679 + }, + { + "epoch": 0.17056808449900085, + "grad_norm": 2.3994569778442383, + "learning_rate": 4.649604287426618e-05, + "loss": 3.7447, + "step": 28680 + }, + { + "epoch": 0.17057403178228187, + "grad_norm": 2.2940452098846436, + "learning_rate": 4.64958043883598e-05, + "loss": 3.623, + "step": 28681 + }, + { + "epoch": 0.17057997906556285, + "grad_norm": 2.1584625244140625, + "learning_rate": 4.6495565894949466e-05, + "loss": 3.5711, + "step": 28682 + }, + { + "epoch": 0.17058592634884384, + "grad_norm": 1.7486004829406738, + "learning_rate": 4.649532739403526e-05, + "loss": 4.4838, + "step": 28683 + }, + { + "epoch": 0.17059187363212486, + "grad_norm": 1.8745564222335815, + "learning_rate": 4.6495088885617245e-05, + "loss": 4.6985, + "step": 28684 + }, + { + "epoch": 0.17059782091540585, + "grad_norm": 1.6774717569351196, + "learning_rate": 4.6494850369695517e-05, + "loss": 4.9845, + "step": 28685 + }, + { + "epoch": 0.17060376819868683, + "grad_norm": 1.6051801443099976, + "learning_rate": 4.649461184627017e-05, + "loss": 5.085, + "step": 28686 + }, + { + "epoch": 0.17060971548196785, + "grad_norm": 1.9558120965957642, + "learning_rate": 4.649437331534126e-05, + "loss": 5.7887, + "step": 28687 + }, + { + "epoch": 0.17061566276524884, + "grad_norm": 2.1222105026245117, + "learning_rate": 4.649413477690889e-05, + "loss": 3.9971, + "step": 28688 + }, + { + "epoch": 0.17062161004852983, + "grad_norm": 2.5469319820404053, + "learning_rate": 4.6493896230973147e-05, + "loss": 3.3402, + "step": 28689 + }, + { + "epoch": 0.17062755733181084, + "grad_norm": 1.747454285621643, + "learning_rate": 4.6493657677534107e-05, + "loss": 4.5433, + "step": 28690 + }, + { + "epoch": 0.17063350461509183, + "grad_norm": 2.327911138534546, + "learning_rate": 4.6493419116591845e-05, + "loss": 5.1279, + "step": 28691 + }, + { + "epoch": 0.17063945189837282, + "grad_norm": 1.96173894405365, + "learning_rate": 4.649318054814646e-05, + "loss": 4.6642, + "step": 28692 + }, + { + "epoch": 0.17064539918165383, + "grad_norm": 2.74940824508667, + "learning_rate": 4.6492941972198026e-05, + "loss": 4.9272, + "step": 28693 + }, + { + "epoch": 0.17065134646493482, + "grad_norm": 2.1249771118164062, + "learning_rate": 4.649270338874663e-05, + "loss": 4.8603, + "step": 28694 + }, + { + "epoch": 0.1706572937482158, + "grad_norm": 1.5566577911376953, + "learning_rate": 4.6492464797792344e-05, + "loss": 5.0004, + "step": 28695 + }, + { + "epoch": 0.1706632410314968, + "grad_norm": 1.5969873666763306, + "learning_rate": 4.649222619933527e-05, + "loss": 5.1347, + "step": 28696 + }, + { + "epoch": 0.17066918831477781, + "grad_norm": 1.894946813583374, + "learning_rate": 4.649198759337548e-05, + "loss": 5.1455, + "step": 28697 + }, + { + "epoch": 0.1706751355980588, + "grad_norm": 1.7214184999465942, + "learning_rate": 4.6491748979913056e-05, + "loss": 5.2916, + "step": 28698 + }, + { + "epoch": 0.1706810828813398, + "grad_norm": 1.8061472177505493, + "learning_rate": 4.649151035894809e-05, + "loss": 4.8581, + "step": 28699 + }, + { + "epoch": 0.1706870301646208, + "grad_norm": 2.3920493125915527, + "learning_rate": 4.649127173048066e-05, + "loss": 4.8851, + "step": 28700 + }, + { + "epoch": 0.1706929774479018, + "grad_norm": 1.7309520244598389, + "learning_rate": 4.649103309451084e-05, + "loss": 4.5377, + "step": 28701 + }, + { + "epoch": 0.17069892473118278, + "grad_norm": 1.757692813873291, + "learning_rate": 4.6490794451038725e-05, + "loss": 4.9765, + "step": 28702 + }, + { + "epoch": 0.1707048720144638, + "grad_norm": 2.2090845108032227, + "learning_rate": 4.64905558000644e-05, + "loss": 4.741, + "step": 28703 + }, + { + "epoch": 0.1707108192977448, + "grad_norm": 1.7464302778244019, + "learning_rate": 4.649031714158794e-05, + "loss": 4.9167, + "step": 28704 + }, + { + "epoch": 0.17071676658102578, + "grad_norm": 1.4639854431152344, + "learning_rate": 4.649007847560944e-05, + "loss": 5.1732, + "step": 28705 + }, + { + "epoch": 0.1707227138643068, + "grad_norm": 1.8633160591125488, + "learning_rate": 4.648983980212896e-05, + "loss": 4.3169, + "step": 28706 + }, + { + "epoch": 0.17072866114758778, + "grad_norm": 1.645669937133789, + "learning_rate": 4.648960112114662e-05, + "loss": 5.3615, + "step": 28707 + }, + { + "epoch": 0.17073460843086877, + "grad_norm": 1.802817702293396, + "learning_rate": 4.648936243266246e-05, + "loss": 4.6081, + "step": 28708 + }, + { + "epoch": 0.17074055571414978, + "grad_norm": 1.6780096292495728, + "learning_rate": 4.648912373667661e-05, + "loss": 4.8164, + "step": 28709 + }, + { + "epoch": 0.17074650299743077, + "grad_norm": 1.6830222606658936, + "learning_rate": 4.648888503318911e-05, + "loss": 5.1217, + "step": 28710 + }, + { + "epoch": 0.17075245028071176, + "grad_norm": 1.9091911315917969, + "learning_rate": 4.648864632220007e-05, + "loss": 4.6718, + "step": 28711 + }, + { + "epoch": 0.17075839756399278, + "grad_norm": 1.7040106058120728, + "learning_rate": 4.6488407603709566e-05, + "loss": 5.3872, + "step": 28712 + }, + { + "epoch": 0.17076434484727376, + "grad_norm": 1.5387471914291382, + "learning_rate": 4.648816887771768e-05, + "loss": 4.999, + "step": 28713 + }, + { + "epoch": 0.17077029213055475, + "grad_norm": 1.6032272577285767, + "learning_rate": 4.648793014422449e-05, + "loss": 5.3291, + "step": 28714 + }, + { + "epoch": 0.17077623941383577, + "grad_norm": 2.1550817489624023, + "learning_rate": 4.6487691403230096e-05, + "loss": 4.4169, + "step": 28715 + }, + { + "epoch": 0.17078218669711676, + "grad_norm": 1.632123589515686, + "learning_rate": 4.648745265473457e-05, + "loss": 4.8016, + "step": 28716 + }, + { + "epoch": 0.17078813398039774, + "grad_norm": 1.9822715520858765, + "learning_rate": 4.6487213898737986e-05, + "loss": 4.8404, + "step": 28717 + }, + { + "epoch": 0.17079408126367876, + "grad_norm": 1.4587271213531494, + "learning_rate": 4.648697513524044e-05, + "loss": 5.195, + "step": 28718 + }, + { + "epoch": 0.17080002854695975, + "grad_norm": 1.4583262205123901, + "learning_rate": 4.648673636424202e-05, + "loss": 5.331, + "step": 28719 + }, + { + "epoch": 0.17080597583024074, + "grad_norm": 1.508599877357483, + "learning_rate": 4.648649758574279e-05, + "loss": 5.3316, + "step": 28720 + }, + { + "epoch": 0.17081192311352175, + "grad_norm": 1.5801657438278198, + "learning_rate": 4.648625879974287e-05, + "loss": 4.9691, + "step": 28721 + }, + { + "epoch": 0.17081787039680274, + "grad_norm": 1.383544921875, + "learning_rate": 4.648602000624229e-05, + "loss": 4.8747, + "step": 28722 + }, + { + "epoch": 0.17082381768008373, + "grad_norm": 1.6122874021530151, + "learning_rate": 4.648578120524118e-05, + "loss": 4.8057, + "step": 28723 + }, + { + "epoch": 0.17082976496336474, + "grad_norm": 1.7532804012298584, + "learning_rate": 4.64855423967396e-05, + "loss": 4.7074, + "step": 28724 + }, + { + "epoch": 0.17083571224664573, + "grad_norm": 1.440300703048706, + "learning_rate": 4.648530358073764e-05, + "loss": 4.6827, + "step": 28725 + }, + { + "epoch": 0.17084165952992672, + "grad_norm": 1.4043488502502441, + "learning_rate": 4.648506475723539e-05, + "loss": 5.1083, + "step": 28726 + }, + { + "epoch": 0.17084760681320774, + "grad_norm": 2.273939609527588, + "learning_rate": 4.6484825926232914e-05, + "loss": 4.3264, + "step": 28727 + }, + { + "epoch": 0.17085355409648872, + "grad_norm": 2.029352903366089, + "learning_rate": 4.6484587087730316e-05, + "loss": 4.2814, + "step": 28728 + }, + { + "epoch": 0.1708595013797697, + "grad_norm": 1.6527879238128662, + "learning_rate": 4.648434824172767e-05, + "loss": 4.6651, + "step": 28729 + }, + { + "epoch": 0.17086544866305073, + "grad_norm": 1.6313071250915527, + "learning_rate": 4.648410938822505e-05, + "loss": 5.202, + "step": 28730 + }, + { + "epoch": 0.17087139594633172, + "grad_norm": 1.706916332244873, + "learning_rate": 4.648387052722256e-05, + "loss": 5.1041, + "step": 28731 + }, + { + "epoch": 0.1708773432296127, + "grad_norm": 1.8511303663253784, + "learning_rate": 4.6483631658720265e-05, + "loss": 4.7474, + "step": 28732 + }, + { + "epoch": 0.17088329051289372, + "grad_norm": 2.102651357650757, + "learning_rate": 4.648339278271826e-05, + "loss": 4.7116, + "step": 28733 + }, + { + "epoch": 0.1708892377961747, + "grad_norm": 1.5868231058120728, + "learning_rate": 4.648315389921662e-05, + "loss": 4.8723, + "step": 28734 + }, + { + "epoch": 0.1708951850794557, + "grad_norm": 1.5616002082824707, + "learning_rate": 4.648291500821544e-05, + "loss": 4.7078, + "step": 28735 + }, + { + "epoch": 0.1709011323627367, + "grad_norm": 1.8076444864273071, + "learning_rate": 4.6482676109714804e-05, + "loss": 4.0856, + "step": 28736 + }, + { + "epoch": 0.1709070796460177, + "grad_norm": 2.5661611557006836, + "learning_rate": 4.6482437203714766e-05, + "loss": 4.0065, + "step": 28737 + }, + { + "epoch": 0.1709130269292987, + "grad_norm": 1.9630448818206787, + "learning_rate": 4.648219829021545e-05, + "loss": 4.3436, + "step": 28738 + }, + { + "epoch": 0.1709189742125797, + "grad_norm": 1.588693618774414, + "learning_rate": 4.648195936921691e-05, + "loss": 4.8528, + "step": 28739 + }, + { + "epoch": 0.1709249214958607, + "grad_norm": 1.6260273456573486, + "learning_rate": 4.6481720440719246e-05, + "loss": 4.9007, + "step": 28740 + }, + { + "epoch": 0.17093086877914168, + "grad_norm": 1.4332720041275024, + "learning_rate": 4.648148150472253e-05, + "loss": 4.6039, + "step": 28741 + }, + { + "epoch": 0.1709368160624227, + "grad_norm": 1.5845040082931519, + "learning_rate": 4.648124256122686e-05, + "loss": 4.6129, + "step": 28742 + }, + { + "epoch": 0.17094276334570369, + "grad_norm": 1.9368457794189453, + "learning_rate": 4.6481003610232296e-05, + "loss": 4.4027, + "step": 28743 + }, + { + "epoch": 0.17094871062898467, + "grad_norm": 2.4336676597595215, + "learning_rate": 4.648076465173894e-05, + "loss": 3.9717, + "step": 28744 + }, + { + "epoch": 0.1709546579122657, + "grad_norm": 2.120758056640625, + "learning_rate": 4.648052568574688e-05, + "loss": 3.4959, + "step": 28745 + }, + { + "epoch": 0.17096060519554668, + "grad_norm": 2.1304919719696045, + "learning_rate": 4.648028671225618e-05, + "loss": 3.6002, + "step": 28746 + }, + { + "epoch": 0.17096655247882767, + "grad_norm": 2.2495477199554443, + "learning_rate": 4.648004773126694e-05, + "loss": 3.8202, + "step": 28747 + }, + { + "epoch": 0.17097249976210868, + "grad_norm": 2.0952799320220947, + "learning_rate": 4.647980874277924e-05, + "loss": 4.0671, + "step": 28748 + }, + { + "epoch": 0.17097844704538967, + "grad_norm": 2.260267972946167, + "learning_rate": 4.6479569746793154e-05, + "loss": 4.004, + "step": 28749 + }, + { + "epoch": 0.17098439432867066, + "grad_norm": 1.6694860458374023, + "learning_rate": 4.647933074330878e-05, + "loss": 4.6784, + "step": 28750 + }, + { + "epoch": 0.17099034161195167, + "grad_norm": 1.8118653297424316, + "learning_rate": 4.647909173232618e-05, + "loss": 4.4819, + "step": 28751 + }, + { + "epoch": 0.17099628889523266, + "grad_norm": 1.6766449213027954, + "learning_rate": 4.647885271384546e-05, + "loss": 4.5391, + "step": 28752 + }, + { + "epoch": 0.17100223617851365, + "grad_norm": 2.1435959339141846, + "learning_rate": 4.6478613687866696e-05, + "loss": 3.5559, + "step": 28753 + }, + { + "epoch": 0.17100818346179464, + "grad_norm": 2.2521913051605225, + "learning_rate": 4.647837465438997e-05, + "loss": 3.434, + "step": 28754 + }, + { + "epoch": 0.17101413074507565, + "grad_norm": 2.012451171875, + "learning_rate": 4.6478135613415366e-05, + "loss": 3.7475, + "step": 28755 + }, + { + "epoch": 0.17102007802835664, + "grad_norm": 2.383465528488159, + "learning_rate": 4.6477896564942956e-05, + "loss": 4.2333, + "step": 28756 + }, + { + "epoch": 0.17102602531163763, + "grad_norm": 2.0753815174102783, + "learning_rate": 4.647765750897284e-05, + "loss": 3.9532, + "step": 28757 + }, + { + "epoch": 0.17103197259491865, + "grad_norm": 2.0559349060058594, + "learning_rate": 4.64774184455051e-05, + "loss": 3.8132, + "step": 28758 + }, + { + "epoch": 0.17103791987819963, + "grad_norm": 2.2562434673309326, + "learning_rate": 4.6477179374539814e-05, + "loss": 3.9445, + "step": 28759 + }, + { + "epoch": 0.17104386716148062, + "grad_norm": 1.9799115657806396, + "learning_rate": 4.6476940296077065e-05, + "loss": 4.0676, + "step": 28760 + }, + { + "epoch": 0.17104981444476164, + "grad_norm": 2.034501552581787, + "learning_rate": 4.6476701210116935e-05, + "loss": 3.5055, + "step": 28761 + }, + { + "epoch": 0.17105576172804263, + "grad_norm": 2.2014403343200684, + "learning_rate": 4.6476462116659514e-05, + "loss": 3.7419, + "step": 28762 + }, + { + "epoch": 0.17106170901132361, + "grad_norm": 2.271733522415161, + "learning_rate": 4.6476223015704875e-05, + "loss": 3.5206, + "step": 28763 + }, + { + "epoch": 0.17106765629460463, + "grad_norm": 2.144587278366089, + "learning_rate": 4.647598390725312e-05, + "loss": 3.4963, + "step": 28764 + }, + { + "epoch": 0.17107360357788562, + "grad_norm": 1.8896453380584717, + "learning_rate": 4.647574479130432e-05, + "loss": 3.6917, + "step": 28765 + }, + { + "epoch": 0.1710795508611666, + "grad_norm": 2.5320651531219482, + "learning_rate": 4.6475505667858556e-05, + "loss": 3.4057, + "step": 28766 + }, + { + "epoch": 0.17108549814444762, + "grad_norm": 2.5660650730133057, + "learning_rate": 4.647526653691591e-05, + "loss": 3.5343, + "step": 28767 + }, + { + "epoch": 0.1710914454277286, + "grad_norm": 2.016521453857422, + "learning_rate": 4.647502739847647e-05, + "loss": 5.0209, + "step": 28768 + }, + { + "epoch": 0.1710973927110096, + "grad_norm": 2.098594903945923, + "learning_rate": 4.6474788252540323e-05, + "loss": 3.4916, + "step": 28769 + }, + { + "epoch": 0.17110333999429062, + "grad_norm": 2.502556562423706, + "learning_rate": 4.6474549099107555e-05, + "loss": 3.6106, + "step": 28770 + }, + { + "epoch": 0.1711092872775716, + "grad_norm": 2.3364086151123047, + "learning_rate": 4.647430993817824e-05, + "loss": 3.6718, + "step": 28771 + }, + { + "epoch": 0.1711152345608526, + "grad_norm": 2.453624963760376, + "learning_rate": 4.647407076975247e-05, + "loss": 4.0256, + "step": 28772 + }, + { + "epoch": 0.1711211818441336, + "grad_norm": 2.250152826309204, + "learning_rate": 4.647383159383031e-05, + "loss": 3.8149, + "step": 28773 + }, + { + "epoch": 0.1711271291274146, + "grad_norm": 2.2971277236938477, + "learning_rate": 4.6473592410411864e-05, + "loss": 4.0557, + "step": 28774 + }, + { + "epoch": 0.17113307641069558, + "grad_norm": 2.2991559505462646, + "learning_rate": 4.647335321949721e-05, + "loss": 3.9136, + "step": 28775 + }, + { + "epoch": 0.1711390236939766, + "grad_norm": 2.220536708831787, + "learning_rate": 4.647311402108643e-05, + "loss": 4.0714, + "step": 28776 + }, + { + "epoch": 0.1711449709772576, + "grad_norm": 2.1241915225982666, + "learning_rate": 4.647287481517961e-05, + "loss": 3.5843, + "step": 28777 + }, + { + "epoch": 0.17115091826053858, + "grad_norm": 2.195129632949829, + "learning_rate": 4.647263560177683e-05, + "loss": 3.5294, + "step": 28778 + }, + { + "epoch": 0.1711568655438196, + "grad_norm": 2.3440191745758057, + "learning_rate": 4.647239638087817e-05, + "loss": 3.6608, + "step": 28779 + }, + { + "epoch": 0.17116281282710058, + "grad_norm": 2.478482246398926, + "learning_rate": 4.6472157152483726e-05, + "loss": 3.8389, + "step": 28780 + }, + { + "epoch": 0.17116876011038157, + "grad_norm": 2.488262414932251, + "learning_rate": 4.647191791659357e-05, + "loss": 3.3664, + "step": 28781 + }, + { + "epoch": 0.17117470739366258, + "grad_norm": 1.9902031421661377, + "learning_rate": 4.6471678673207784e-05, + "loss": 3.4656, + "step": 28782 + }, + { + "epoch": 0.17118065467694357, + "grad_norm": 1.7979692220687866, + "learning_rate": 4.647143942232647e-05, + "loss": 4.1077, + "step": 28783 + }, + { + "epoch": 0.17118660196022456, + "grad_norm": 2.0550832748413086, + "learning_rate": 4.647120016394969e-05, + "loss": 5.0827, + "step": 28784 + }, + { + "epoch": 0.17119254924350558, + "grad_norm": 2.58035945892334, + "learning_rate": 4.647096089807753e-05, + "loss": 3.3431, + "step": 28785 + }, + { + "epoch": 0.17119849652678656, + "grad_norm": 2.9299840927124023, + "learning_rate": 4.647072162471009e-05, + "loss": 4.3467, + "step": 28786 + }, + { + "epoch": 0.17120444381006755, + "grad_norm": 2.9246139526367188, + "learning_rate": 4.6470482343847434e-05, + "loss": 4.5002, + "step": 28787 + }, + { + "epoch": 0.17121039109334857, + "grad_norm": 2.434800148010254, + "learning_rate": 4.647024305548966e-05, + "loss": 4.39, + "step": 28788 + }, + { + "epoch": 0.17121633837662956, + "grad_norm": 2.0700294971466064, + "learning_rate": 4.647000375963685e-05, + "loss": 3.6275, + "step": 28789 + }, + { + "epoch": 0.17122228565991054, + "grad_norm": 2.0739026069641113, + "learning_rate": 4.6469764456289075e-05, + "loss": 3.294, + "step": 28790 + }, + { + "epoch": 0.17122823294319156, + "grad_norm": 2.158195972442627, + "learning_rate": 4.646952514544643e-05, + "loss": 3.0345, + "step": 28791 + }, + { + "epoch": 0.17123418022647255, + "grad_norm": 2.25756573677063, + "learning_rate": 4.6469285827109e-05, + "loss": 3.4395, + "step": 28792 + }, + { + "epoch": 0.17124012750975354, + "grad_norm": 1.756030559539795, + "learning_rate": 4.646904650127686e-05, + "loss": 4.57, + "step": 28793 + }, + { + "epoch": 0.17124607479303455, + "grad_norm": 1.7527079582214355, + "learning_rate": 4.6468807167950096e-05, + "loss": 4.8592, + "step": 28794 + }, + { + "epoch": 0.17125202207631554, + "grad_norm": 2.0758533477783203, + "learning_rate": 4.646856782712879e-05, + "loss": 3.6941, + "step": 28795 + }, + { + "epoch": 0.17125796935959653, + "grad_norm": 1.977253794670105, + "learning_rate": 4.646832847881304e-05, + "loss": 3.3686, + "step": 28796 + }, + { + "epoch": 0.17126391664287755, + "grad_norm": 2.0132908821105957, + "learning_rate": 4.646808912300291e-05, + "loss": 3.3937, + "step": 28797 + }, + { + "epoch": 0.17126986392615853, + "grad_norm": 1.8328338861465454, + "learning_rate": 4.646784975969849e-05, + "loss": 3.4359, + "step": 28798 + }, + { + "epoch": 0.17127581120943952, + "grad_norm": 1.7316343784332275, + "learning_rate": 4.646761038889987e-05, + "loss": 4.062, + "step": 28799 + }, + { + "epoch": 0.17128175849272054, + "grad_norm": 1.98564875125885, + "learning_rate": 4.646737101060713e-05, + "loss": 3.9671, + "step": 28800 + }, + { + "epoch": 0.17128770577600153, + "grad_norm": 1.4254114627838135, + "learning_rate": 4.646713162482035e-05, + "loss": 5.6623, + "step": 28801 + }, + { + "epoch": 0.1712936530592825, + "grad_norm": 1.7182563543319702, + "learning_rate": 4.646689223153962e-05, + "loss": 3.7951, + "step": 28802 + }, + { + "epoch": 0.17129960034256353, + "grad_norm": 1.9816060066223145, + "learning_rate": 4.646665283076502e-05, + "loss": 3.1926, + "step": 28803 + }, + { + "epoch": 0.17130554762584452, + "grad_norm": 1.9026448726654053, + "learning_rate": 4.646641342249663e-05, + "loss": 3.4481, + "step": 28804 + }, + { + "epoch": 0.1713114949091255, + "grad_norm": 1.9280551671981812, + "learning_rate": 4.646617400673453e-05, + "loss": 3.7474, + "step": 28805 + }, + { + "epoch": 0.17131744219240652, + "grad_norm": 1.9468990564346313, + "learning_rate": 4.646593458347882e-05, + "loss": 3.6522, + "step": 28806 + }, + { + "epoch": 0.1713233894756875, + "grad_norm": 1.8785784244537354, + "learning_rate": 4.646569515272957e-05, + "loss": 4.4277, + "step": 28807 + }, + { + "epoch": 0.1713293367589685, + "grad_norm": 2.5380280017852783, + "learning_rate": 4.6465455714486875e-05, + "loss": 4.7558, + "step": 28808 + }, + { + "epoch": 0.1713352840422495, + "grad_norm": 2.311422824859619, + "learning_rate": 4.64652162687508e-05, + "loss": 4.5887, + "step": 28809 + }, + { + "epoch": 0.1713412313255305, + "grad_norm": 2.215386390686035, + "learning_rate": 4.646497681552144e-05, + "loss": 4.6318, + "step": 28810 + }, + { + "epoch": 0.1713471786088115, + "grad_norm": 2.1793322563171387, + "learning_rate": 4.646473735479889e-05, + "loss": 4.8652, + "step": 28811 + }, + { + "epoch": 0.1713531258920925, + "grad_norm": 1.6395008563995361, + "learning_rate": 4.646449788658321e-05, + "loss": 5.1602, + "step": 28812 + }, + { + "epoch": 0.1713590731753735, + "grad_norm": 1.781542181968689, + "learning_rate": 4.646425841087451e-05, + "loss": 5.5992, + "step": 28813 + }, + { + "epoch": 0.17136502045865448, + "grad_norm": 1.7979416847229004, + "learning_rate": 4.6464018927672846e-05, + "loss": 5.4619, + "step": 28814 + }, + { + "epoch": 0.17137096774193547, + "grad_norm": 1.5196144580841064, + "learning_rate": 4.646377943697832e-05, + "loss": 5.5668, + "step": 28815 + }, + { + "epoch": 0.1713769150252165, + "grad_norm": 1.849569320678711, + "learning_rate": 4.6463539938791e-05, + "loss": 5.2762, + "step": 28816 + }, + { + "epoch": 0.17138286230849747, + "grad_norm": 2.4651362895965576, + "learning_rate": 4.6463300433111e-05, + "loss": 4.2121, + "step": 28817 + }, + { + "epoch": 0.17138880959177846, + "grad_norm": 2.2481956481933594, + "learning_rate": 4.646306091993837e-05, + "loss": 4.2369, + "step": 28818 + }, + { + "epoch": 0.17139475687505948, + "grad_norm": 1.5985668897628784, + "learning_rate": 4.646282139927321e-05, + "loss": 5.0238, + "step": 28819 + }, + { + "epoch": 0.17140070415834047, + "grad_norm": 1.5861318111419678, + "learning_rate": 4.64625818711156e-05, + "loss": 4.6181, + "step": 28820 + }, + { + "epoch": 0.17140665144162145, + "grad_norm": 1.5382401943206787, + "learning_rate": 4.646234233546562e-05, + "loss": 4.9682, + "step": 28821 + }, + { + "epoch": 0.17141259872490247, + "grad_norm": 1.604730248451233, + "learning_rate": 4.646210279232337e-05, + "loss": 5.2491, + "step": 28822 + }, + { + "epoch": 0.17141854600818346, + "grad_norm": 1.83149254322052, + "learning_rate": 4.6461863241688914e-05, + "loss": 5.514, + "step": 28823 + }, + { + "epoch": 0.17142449329146445, + "grad_norm": 2.151071786880493, + "learning_rate": 4.6461623683562336e-05, + "loss": 4.6684, + "step": 28824 + }, + { + "epoch": 0.17143044057474546, + "grad_norm": 1.934921145439148, + "learning_rate": 4.646138411794374e-05, + "loss": 4.5529, + "step": 28825 + }, + { + "epoch": 0.17143638785802645, + "grad_norm": 3.118504047393799, + "learning_rate": 4.646114454483319e-05, + "loss": 3.8805, + "step": 28826 + }, + { + "epoch": 0.17144233514130744, + "grad_norm": 2.784353733062744, + "learning_rate": 4.6460904964230776e-05, + "loss": 3.7983, + "step": 28827 + }, + { + "epoch": 0.17144828242458846, + "grad_norm": 2.2608816623687744, + "learning_rate": 4.6460665376136586e-05, + "loss": 4.0043, + "step": 28828 + }, + { + "epoch": 0.17145422970786944, + "grad_norm": 2.0400445461273193, + "learning_rate": 4.6460425780550695e-05, + "loss": 4.3601, + "step": 28829 + }, + { + "epoch": 0.17146017699115043, + "grad_norm": 1.7697999477386475, + "learning_rate": 4.64601861774732e-05, + "loss": 5.0038, + "step": 28830 + }, + { + "epoch": 0.17146612427443145, + "grad_norm": 1.916419267654419, + "learning_rate": 4.645994656690417e-05, + "loss": 3.8579, + "step": 28831 + }, + { + "epoch": 0.17147207155771244, + "grad_norm": 1.8474862575531006, + "learning_rate": 4.6459706948843687e-05, + "loss": 4.528, + "step": 28832 + }, + { + "epoch": 0.17147801884099342, + "grad_norm": 1.532090425491333, + "learning_rate": 4.645946732329185e-05, + "loss": 5.7598, + "step": 28833 + }, + { + "epoch": 0.17148396612427444, + "grad_norm": 1.4666064977645874, + "learning_rate": 4.645922769024873e-05, + "loss": 5.3868, + "step": 28834 + }, + { + "epoch": 0.17148991340755543, + "grad_norm": 1.5077399015426636, + "learning_rate": 4.645898804971442e-05, + "loss": 5.1645, + "step": 28835 + }, + { + "epoch": 0.17149586069083642, + "grad_norm": 1.5031183958053589, + "learning_rate": 4.6458748401689e-05, + "loss": 4.6318, + "step": 28836 + }, + { + "epoch": 0.17150180797411743, + "grad_norm": 1.9876207113265991, + "learning_rate": 4.6458508746172544e-05, + "loss": 3.7609, + "step": 28837 + }, + { + "epoch": 0.17150775525739842, + "grad_norm": 1.9552377462387085, + "learning_rate": 4.6458269083165155e-05, + "loss": 3.7297, + "step": 28838 + }, + { + "epoch": 0.1715137025406794, + "grad_norm": 1.7688027620315552, + "learning_rate": 4.64580294126669e-05, + "loss": 4.2031, + "step": 28839 + }, + { + "epoch": 0.17151964982396042, + "grad_norm": 1.7358896732330322, + "learning_rate": 4.645778973467787e-05, + "loss": 5.3203, + "step": 28840 + }, + { + "epoch": 0.1715255971072414, + "grad_norm": 1.6685024499893188, + "learning_rate": 4.645755004919814e-05, + "loss": 4.1383, + "step": 28841 + }, + { + "epoch": 0.1715315443905224, + "grad_norm": 1.7474262714385986, + "learning_rate": 4.645731035622781e-05, + "loss": 4.3956, + "step": 28842 + }, + { + "epoch": 0.17153749167380342, + "grad_norm": 2.3153438568115234, + "learning_rate": 4.6457070655766956e-05, + "loss": 3.6617, + "step": 28843 + }, + { + "epoch": 0.1715434389570844, + "grad_norm": 1.6651357412338257, + "learning_rate": 4.645683094781565e-05, + "loss": 3.7946, + "step": 28844 + }, + { + "epoch": 0.1715493862403654, + "grad_norm": 1.8230834007263184, + "learning_rate": 4.645659123237399e-05, + "loss": 3.6286, + "step": 28845 + }, + { + "epoch": 0.1715553335236464, + "grad_norm": 1.724862813949585, + "learning_rate": 4.645635150944206e-05, + "loss": 3.8681, + "step": 28846 + }, + { + "epoch": 0.1715612808069274, + "grad_norm": 1.7765378952026367, + "learning_rate": 4.645611177901994e-05, + "loss": 3.9172, + "step": 28847 + }, + { + "epoch": 0.17156722809020838, + "grad_norm": 1.7206759452819824, + "learning_rate": 4.645587204110771e-05, + "loss": 3.8603, + "step": 28848 + }, + { + "epoch": 0.1715731753734894, + "grad_norm": 1.9421840906143188, + "learning_rate": 4.645563229570546e-05, + "loss": 3.5207, + "step": 28849 + }, + { + "epoch": 0.1715791226567704, + "grad_norm": 1.9873075485229492, + "learning_rate": 4.645539254281327e-05, + "loss": 4.0805, + "step": 28850 + }, + { + "epoch": 0.17158506994005138, + "grad_norm": 1.7919063568115234, + "learning_rate": 4.645515278243122e-05, + "loss": 4.1832, + "step": 28851 + }, + { + "epoch": 0.1715910172233324, + "grad_norm": 1.6959470510482788, + "learning_rate": 4.6454913014559395e-05, + "loss": 4.135, + "step": 28852 + }, + { + "epoch": 0.17159696450661338, + "grad_norm": 2.2556352615356445, + "learning_rate": 4.645467323919789e-05, + "loss": 3.9897, + "step": 28853 + }, + { + "epoch": 0.17160291178989437, + "grad_norm": 2.394732713699341, + "learning_rate": 4.645443345634678e-05, + "loss": 4.0581, + "step": 28854 + }, + { + "epoch": 0.17160885907317538, + "grad_norm": 1.7620495557785034, + "learning_rate": 4.6454193666006144e-05, + "loss": 3.6301, + "step": 28855 + }, + { + "epoch": 0.17161480635645637, + "grad_norm": 2.046990394592285, + "learning_rate": 4.645395386817607e-05, + "loss": 3.6809, + "step": 28856 + }, + { + "epoch": 0.17162075363973736, + "grad_norm": 1.8854444026947021, + "learning_rate": 4.6453714062856645e-05, + "loss": 3.8665, + "step": 28857 + }, + { + "epoch": 0.17162670092301838, + "grad_norm": 1.952010989189148, + "learning_rate": 4.645347425004795e-05, + "loss": 3.9584, + "step": 28858 + }, + { + "epoch": 0.17163264820629937, + "grad_norm": 2.7259037494659424, + "learning_rate": 4.645323442975007e-05, + "loss": 4.1483, + "step": 28859 + }, + { + "epoch": 0.17163859548958035, + "grad_norm": 2.6531686782836914, + "learning_rate": 4.645299460196309e-05, + "loss": 4.2874, + "step": 28860 + }, + { + "epoch": 0.17164454277286137, + "grad_norm": 2.204883337020874, + "learning_rate": 4.645275476668708e-05, + "loss": 4.6409, + "step": 28861 + }, + { + "epoch": 0.17165049005614236, + "grad_norm": 1.8465254306793213, + "learning_rate": 4.645251492392214e-05, + "loss": 3.6078, + "step": 28862 + }, + { + "epoch": 0.17165643733942335, + "grad_norm": 1.6021015644073486, + "learning_rate": 4.645227507366835e-05, + "loss": 3.9142, + "step": 28863 + }, + { + "epoch": 0.17166238462270436, + "grad_norm": 1.9014915227890015, + "learning_rate": 4.645203521592579e-05, + "loss": 4.5439, + "step": 28864 + }, + { + "epoch": 0.17166833190598535, + "grad_norm": 2.176541805267334, + "learning_rate": 4.645179535069455e-05, + "loss": 4.0324, + "step": 28865 + }, + { + "epoch": 0.17167427918926634, + "grad_norm": 1.6138490438461304, + "learning_rate": 4.645155547797472e-05, + "loss": 5.2606, + "step": 28866 + }, + { + "epoch": 0.17168022647254735, + "grad_norm": 1.5091575384140015, + "learning_rate": 4.645131559776635e-05, + "loss": 4.8829, + "step": 28867 + }, + { + "epoch": 0.17168617375582834, + "grad_norm": 2.131401777267456, + "learning_rate": 4.645107571006957e-05, + "loss": 5.1779, + "step": 28868 + }, + { + "epoch": 0.17169212103910933, + "grad_norm": 1.871749758720398, + "learning_rate": 4.645083581488443e-05, + "loss": 4.8126, + "step": 28869 + }, + { + "epoch": 0.17169806832239035, + "grad_norm": 1.825909972190857, + "learning_rate": 4.6450595912211026e-05, + "loss": 4.4965, + "step": 28870 + }, + { + "epoch": 0.17170401560567133, + "grad_norm": 1.546570897102356, + "learning_rate": 4.645035600204944e-05, + "loss": 4.8261, + "step": 28871 + }, + { + "epoch": 0.17170996288895232, + "grad_norm": 1.6035295724868774, + "learning_rate": 4.6450116084399753e-05, + "loss": 4.8019, + "step": 28872 + }, + { + "epoch": 0.1717159101722333, + "grad_norm": 1.6257683038711548, + "learning_rate": 4.644987615926206e-05, + "loss": 4.6993, + "step": 28873 + }, + { + "epoch": 0.17172185745551433, + "grad_norm": 1.6006081104278564, + "learning_rate": 4.6449636226636427e-05, + "loss": 4.7575, + "step": 28874 + }, + { + "epoch": 0.17172780473879531, + "grad_norm": 1.9441580772399902, + "learning_rate": 4.6449396286522954e-05, + "loss": 4.4509, + "step": 28875 + }, + { + "epoch": 0.1717337520220763, + "grad_norm": 2.2355899810791016, + "learning_rate": 4.6449156338921716e-05, + "loss": 3.3666, + "step": 28876 + }, + { + "epoch": 0.17173969930535732, + "grad_norm": 1.863898754119873, + "learning_rate": 4.644891638383281e-05, + "loss": 3.4932, + "step": 28877 + }, + { + "epoch": 0.1717456465886383, + "grad_norm": 1.505720615386963, + "learning_rate": 4.64486764212563e-05, + "loss": 4.3892, + "step": 28878 + }, + { + "epoch": 0.1717515938719193, + "grad_norm": 2.197970151901245, + "learning_rate": 4.644843645119228e-05, + "loss": 4.5169, + "step": 28879 + }, + { + "epoch": 0.1717575411552003, + "grad_norm": 2.1132233142852783, + "learning_rate": 4.644819647364082e-05, + "loss": 3.9246, + "step": 28880 + }, + { + "epoch": 0.1717634884384813, + "grad_norm": 2.273036479949951, + "learning_rate": 4.644795648860203e-05, + "loss": 4.0134, + "step": 28881 + }, + { + "epoch": 0.1717694357217623, + "grad_norm": 2.3725993633270264, + "learning_rate": 4.6447716496075975e-05, + "loss": 3.9562, + "step": 28882 + }, + { + "epoch": 0.1717753830050433, + "grad_norm": 1.6925543546676636, + "learning_rate": 4.6447476496062745e-05, + "loss": 5.22, + "step": 28883 + }, + { + "epoch": 0.1717813302883243, + "grad_norm": 1.7216755151748657, + "learning_rate": 4.644723648856243e-05, + "loss": 4.2907, + "step": 28884 + }, + { + "epoch": 0.17178727757160528, + "grad_norm": 1.9896382093429565, + "learning_rate": 4.64469964735751e-05, + "loss": 3.4634, + "step": 28885 + }, + { + "epoch": 0.1717932248548863, + "grad_norm": 1.924800992012024, + "learning_rate": 4.6446756451100844e-05, + "loss": 3.627, + "step": 28886 + }, + { + "epoch": 0.17179917213816728, + "grad_norm": 2.1140928268432617, + "learning_rate": 4.644651642113975e-05, + "loss": 3.8234, + "step": 28887 + }, + { + "epoch": 0.17180511942144827, + "grad_norm": 1.9103795289993286, + "learning_rate": 4.644627638369189e-05, + "loss": 3.7129, + "step": 28888 + }, + { + "epoch": 0.1718110667047293, + "grad_norm": 2.002732038497925, + "learning_rate": 4.6446036338757363e-05, + "loss": 3.741, + "step": 28889 + }, + { + "epoch": 0.17181701398801028, + "grad_norm": 1.6863858699798584, + "learning_rate": 4.644579628633625e-05, + "loss": 4.3454, + "step": 28890 + }, + { + "epoch": 0.17182296127129126, + "grad_norm": 1.5118045806884766, + "learning_rate": 4.6445556226428625e-05, + "loss": 5.1573, + "step": 28891 + }, + { + "epoch": 0.17182890855457228, + "grad_norm": 2.336212158203125, + "learning_rate": 4.644531615903458e-05, + "loss": 3.7499, + "step": 28892 + }, + { + "epoch": 0.17183485583785327, + "grad_norm": 1.5706313848495483, + "learning_rate": 4.6445076084154195e-05, + "loss": 4.5392, + "step": 28893 + }, + { + "epoch": 0.17184080312113426, + "grad_norm": 1.9531837701797485, + "learning_rate": 4.644483600178756e-05, + "loss": 3.72, + "step": 28894 + }, + { + "epoch": 0.17184675040441527, + "grad_norm": 1.652535080909729, + "learning_rate": 4.644459591193475e-05, + "loss": 4.6445, + "step": 28895 + }, + { + "epoch": 0.17185269768769626, + "grad_norm": 1.856799840927124, + "learning_rate": 4.644435581459585e-05, + "loss": 3.6899, + "step": 28896 + }, + { + "epoch": 0.17185864497097725, + "grad_norm": 1.8917557001113892, + "learning_rate": 4.644411570977096e-05, + "loss": 3.7475, + "step": 28897 + }, + { + "epoch": 0.17186459225425826, + "grad_norm": 1.7784960269927979, + "learning_rate": 4.644387559746014e-05, + "loss": 3.6315, + "step": 28898 + }, + { + "epoch": 0.17187053953753925, + "grad_norm": 1.8464044332504272, + "learning_rate": 4.644363547766348e-05, + "loss": 4.0489, + "step": 28899 + }, + { + "epoch": 0.17187648682082024, + "grad_norm": 1.8629194498062134, + "learning_rate": 4.6443395350381084e-05, + "loss": 3.755, + "step": 28900 + }, + { + "epoch": 0.17188243410410126, + "grad_norm": 1.774107813835144, + "learning_rate": 4.644315521561301e-05, + "loss": 3.6051, + "step": 28901 + }, + { + "epoch": 0.17188838138738224, + "grad_norm": 1.6542714834213257, + "learning_rate": 4.644291507335935e-05, + "loss": 3.622, + "step": 28902 + }, + { + "epoch": 0.17189432867066323, + "grad_norm": 1.7980518341064453, + "learning_rate": 4.64426749236202e-05, + "loss": 3.7703, + "step": 28903 + }, + { + "epoch": 0.17190027595394425, + "grad_norm": 1.771996021270752, + "learning_rate": 4.644243476639563e-05, + "loss": 3.8511, + "step": 28904 + }, + { + "epoch": 0.17190622323722524, + "grad_norm": 1.9656630754470825, + "learning_rate": 4.644219460168572e-05, + "loss": 5.0433, + "step": 28905 + }, + { + "epoch": 0.17191217052050622, + "grad_norm": 1.7453303337097168, + "learning_rate": 4.6441954429490564e-05, + "loss": 4.3733, + "step": 28906 + }, + { + "epoch": 0.17191811780378724, + "grad_norm": 1.8528467416763306, + "learning_rate": 4.644171424981025e-05, + "loss": 3.7542, + "step": 28907 + }, + { + "epoch": 0.17192406508706823, + "grad_norm": 1.8916527032852173, + "learning_rate": 4.6441474062644844e-05, + "loss": 3.726, + "step": 28908 + }, + { + "epoch": 0.17193001237034922, + "grad_norm": 1.8707592487335205, + "learning_rate": 4.644123386799445e-05, + "loss": 3.77, + "step": 28909 + }, + { + "epoch": 0.17193595965363023, + "grad_norm": 1.7839124202728271, + "learning_rate": 4.644099366585914e-05, + "loss": 3.8036, + "step": 28910 + }, + { + "epoch": 0.17194190693691122, + "grad_norm": 2.1418814659118652, + "learning_rate": 4.6440753456239e-05, + "loss": 3.83, + "step": 28911 + }, + { + "epoch": 0.1719478542201922, + "grad_norm": 1.7159006595611572, + "learning_rate": 4.644051323913412e-05, + "loss": 3.6423, + "step": 28912 + }, + { + "epoch": 0.17195380150347322, + "grad_norm": 2.0046510696411133, + "learning_rate": 4.644027301454457e-05, + "loss": 3.6761, + "step": 28913 + }, + { + "epoch": 0.1719597487867542, + "grad_norm": 1.8171806335449219, + "learning_rate": 4.6440032782470446e-05, + "loss": 3.6621, + "step": 28914 + }, + { + "epoch": 0.1719656960700352, + "grad_norm": 1.813620924949646, + "learning_rate": 4.6439792542911826e-05, + "loss": 3.6249, + "step": 28915 + }, + { + "epoch": 0.17197164335331622, + "grad_norm": 1.8341031074523926, + "learning_rate": 4.64395522958688e-05, + "loss": 4.1758, + "step": 28916 + }, + { + "epoch": 0.1719775906365972, + "grad_norm": 2.3422980308532715, + "learning_rate": 4.643931204134144e-05, + "loss": 4.0642, + "step": 28917 + }, + { + "epoch": 0.1719835379198782, + "grad_norm": 2.2799339294433594, + "learning_rate": 4.643907177932985e-05, + "loss": 3.5248, + "step": 28918 + }, + { + "epoch": 0.1719894852031592, + "grad_norm": 2.3583829402923584, + "learning_rate": 4.643883150983409e-05, + "loss": 3.4972, + "step": 28919 + }, + { + "epoch": 0.1719954324864402, + "grad_norm": 2.667558431625366, + "learning_rate": 4.6438591232854265e-05, + "loss": 3.3926, + "step": 28920 + }, + { + "epoch": 0.17200137976972119, + "grad_norm": 2.2808713912963867, + "learning_rate": 4.6438350948390444e-05, + "loss": 3.2806, + "step": 28921 + }, + { + "epoch": 0.1720073270530022, + "grad_norm": 2.0563879013061523, + "learning_rate": 4.6438110656442713e-05, + "loss": 4.4691, + "step": 28922 + }, + { + "epoch": 0.1720132743362832, + "grad_norm": 1.8717663288116455, + "learning_rate": 4.643787035701116e-05, + "loss": 4.8282, + "step": 28923 + }, + { + "epoch": 0.17201922161956418, + "grad_norm": 2.2592520713806152, + "learning_rate": 4.643763005009588e-05, + "loss": 3.6768, + "step": 28924 + }, + { + "epoch": 0.1720251689028452, + "grad_norm": 2.2937116622924805, + "learning_rate": 4.643738973569693e-05, + "loss": 3.5727, + "step": 28925 + }, + { + "epoch": 0.17203111618612618, + "grad_norm": 2.3913755416870117, + "learning_rate": 4.643714941381441e-05, + "loss": 3.6011, + "step": 28926 + }, + { + "epoch": 0.17203706346940717, + "grad_norm": 2.3368663787841797, + "learning_rate": 4.643690908444841e-05, + "loss": 3.6664, + "step": 28927 + }, + { + "epoch": 0.17204301075268819, + "grad_norm": 1.4821833372116089, + "learning_rate": 4.6436668747599005e-05, + "loss": 5.495, + "step": 28928 + }, + { + "epoch": 0.17204895803596917, + "grad_norm": 1.8062217235565186, + "learning_rate": 4.643642840326627e-05, + "loss": 4.632, + "step": 28929 + }, + { + "epoch": 0.17205490531925016, + "grad_norm": 2.0992000102996826, + "learning_rate": 4.6436188051450314e-05, + "loss": 4.1965, + "step": 28930 + }, + { + "epoch": 0.17206085260253115, + "grad_norm": 1.6724803447723389, + "learning_rate": 4.6435947692151207e-05, + "loss": 5.1407, + "step": 28931 + }, + { + "epoch": 0.17206679988581217, + "grad_norm": 2.1039113998413086, + "learning_rate": 4.6435707325369024e-05, + "loss": 4.9189, + "step": 28932 + }, + { + "epoch": 0.17207274716909315, + "grad_norm": 1.7378982305526733, + "learning_rate": 4.6435466951103853e-05, + "loss": 5.0936, + "step": 28933 + }, + { + "epoch": 0.17207869445237414, + "grad_norm": 1.7237809896469116, + "learning_rate": 4.643522656935579e-05, + "loss": 5.175, + "step": 28934 + }, + { + "epoch": 0.17208464173565516, + "grad_norm": 1.5770435333251953, + "learning_rate": 4.6434986180124904e-05, + "loss": 5.0878, + "step": 28935 + }, + { + "epoch": 0.17209058901893615, + "grad_norm": 1.5708106756210327, + "learning_rate": 4.6434745783411294e-05, + "loss": 5.185, + "step": 28936 + }, + { + "epoch": 0.17209653630221713, + "grad_norm": 1.840494990348816, + "learning_rate": 4.643450537921503e-05, + "loss": 5.0293, + "step": 28937 + }, + { + "epoch": 0.17210248358549815, + "grad_norm": 1.9380584955215454, + "learning_rate": 4.64342649675362e-05, + "loss": 4.6983, + "step": 28938 + }, + { + "epoch": 0.17210843086877914, + "grad_norm": 1.6215778589248657, + "learning_rate": 4.64340245483749e-05, + "loss": 5.1622, + "step": 28939 + }, + { + "epoch": 0.17211437815206013, + "grad_norm": 2.1743335723876953, + "learning_rate": 4.6433784121731196e-05, + "loss": 4.2748, + "step": 28940 + }, + { + "epoch": 0.17212032543534114, + "grad_norm": 2.269792318344116, + "learning_rate": 4.643354368760517e-05, + "loss": 3.682, + "step": 28941 + }, + { + "epoch": 0.17212627271862213, + "grad_norm": 1.956141471862793, + "learning_rate": 4.643330324599693e-05, + "loss": 4.4543, + "step": 28942 + }, + { + "epoch": 0.17213222000190312, + "grad_norm": 1.5037137269973755, + "learning_rate": 4.6433062796906544e-05, + "loss": 5.4757, + "step": 28943 + }, + { + "epoch": 0.17213816728518413, + "grad_norm": 2.0092952251434326, + "learning_rate": 4.643282234033409e-05, + "loss": 3.9942, + "step": 28944 + }, + { + "epoch": 0.17214411456846512, + "grad_norm": 2.0670738220214844, + "learning_rate": 4.643258187627967e-05, + "loss": 3.2918, + "step": 28945 + }, + { + "epoch": 0.1721500618517461, + "grad_norm": 2.011192560195923, + "learning_rate": 4.643234140474334e-05, + "loss": 3.6096, + "step": 28946 + }, + { + "epoch": 0.17215600913502713, + "grad_norm": 2.221064805984497, + "learning_rate": 4.643210092572522e-05, + "loss": 4.0979, + "step": 28947 + }, + { + "epoch": 0.17216195641830812, + "grad_norm": 2.543839931488037, + "learning_rate": 4.643186043922536e-05, + "loss": 3.8645, + "step": 28948 + }, + { + "epoch": 0.1721679037015891, + "grad_norm": 1.8699936866760254, + "learning_rate": 4.6431619945243866e-05, + "loss": 3.8908, + "step": 28949 + }, + { + "epoch": 0.17217385098487012, + "grad_norm": 1.6603435277938843, + "learning_rate": 4.6431379443780815e-05, + "loss": 4.9394, + "step": 28950 + }, + { + "epoch": 0.1721797982681511, + "grad_norm": 2.0914523601531982, + "learning_rate": 4.643113893483629e-05, + "loss": 3.1328, + "step": 28951 + }, + { + "epoch": 0.1721857455514321, + "grad_norm": 2.469694137573242, + "learning_rate": 4.6430898418410373e-05, + "loss": 3.5583, + "step": 28952 + }, + { + "epoch": 0.1721916928347131, + "grad_norm": 2.5100619792938232, + "learning_rate": 4.643065789450315e-05, + "loss": 3.7234, + "step": 28953 + }, + { + "epoch": 0.1721976401179941, + "grad_norm": 2.565922737121582, + "learning_rate": 4.643041736311471e-05, + "loss": 3.3566, + "step": 28954 + }, + { + "epoch": 0.1722035874012751, + "grad_norm": 2.454882860183716, + "learning_rate": 4.643017682424513e-05, + "loss": 3.6576, + "step": 28955 + }, + { + "epoch": 0.1722095346845561, + "grad_norm": 1.6239404678344727, + "learning_rate": 4.64299362778945e-05, + "loss": 4.7344, + "step": 28956 + }, + { + "epoch": 0.1722154819678371, + "grad_norm": 1.6332730054855347, + "learning_rate": 4.6429695724062906e-05, + "loss": 4.9091, + "step": 28957 + }, + { + "epoch": 0.17222142925111808, + "grad_norm": 1.495293378829956, + "learning_rate": 4.642945516275041e-05, + "loss": 4.7336, + "step": 28958 + }, + { + "epoch": 0.1722273765343991, + "grad_norm": 1.531150460243225, + "learning_rate": 4.6429214593957125e-05, + "loss": 4.7503, + "step": 28959 + }, + { + "epoch": 0.17223332381768008, + "grad_norm": 1.2761198282241821, + "learning_rate": 4.642897401768312e-05, + "loss": 4.6507, + "step": 28960 + }, + { + "epoch": 0.17223927110096107, + "grad_norm": 1.366808295249939, + "learning_rate": 4.642873343392848e-05, + "loss": 4.7195, + "step": 28961 + }, + { + "epoch": 0.1722452183842421, + "grad_norm": 2.072298765182495, + "learning_rate": 4.6428492842693295e-05, + "loss": 4.3342, + "step": 28962 + }, + { + "epoch": 0.17225116566752308, + "grad_norm": 2.4667413234710693, + "learning_rate": 4.642825224397764e-05, + "loss": 3.3579, + "step": 28963 + }, + { + "epoch": 0.17225711295080406, + "grad_norm": 2.5743234157562256, + "learning_rate": 4.64280116377816e-05, + "loss": 3.559, + "step": 28964 + }, + { + "epoch": 0.17226306023408508, + "grad_norm": 2.4581592082977295, + "learning_rate": 4.6427771024105274e-05, + "loss": 3.6332, + "step": 28965 + }, + { + "epoch": 0.17226900751736607, + "grad_norm": 2.156362533569336, + "learning_rate": 4.642753040294873e-05, + "loss": 4.5459, + "step": 28966 + }, + { + "epoch": 0.17227495480064706, + "grad_norm": 2.2250757217407227, + "learning_rate": 4.642728977431205e-05, + "loss": 3.7909, + "step": 28967 + }, + { + "epoch": 0.17228090208392807, + "grad_norm": 2.06371808052063, + "learning_rate": 4.642704913819533e-05, + "loss": 5.3105, + "step": 28968 + }, + { + "epoch": 0.17228684936720906, + "grad_norm": 2.0080556869506836, + "learning_rate": 4.642680849459865e-05, + "loss": 5.2019, + "step": 28969 + }, + { + "epoch": 0.17229279665049005, + "grad_norm": 1.4533225297927856, + "learning_rate": 4.642656784352209e-05, + "loss": 5.3035, + "step": 28970 + }, + { + "epoch": 0.17229874393377106, + "grad_norm": 1.8252445459365845, + "learning_rate": 4.642632718496573e-05, + "loss": 4.5186, + "step": 28971 + }, + { + "epoch": 0.17230469121705205, + "grad_norm": 2.125659465789795, + "learning_rate": 4.642608651892967e-05, + "loss": 4.5968, + "step": 28972 + }, + { + "epoch": 0.17231063850033304, + "grad_norm": 1.7049205303192139, + "learning_rate": 4.6425845845413984e-05, + "loss": 5.2613, + "step": 28973 + }, + { + "epoch": 0.17231658578361406, + "grad_norm": 1.818495512008667, + "learning_rate": 4.642560516441875e-05, + "loss": 4.5706, + "step": 28974 + }, + { + "epoch": 0.17232253306689505, + "grad_norm": 1.4389350414276123, + "learning_rate": 4.6425364475944065e-05, + "loss": 5.3398, + "step": 28975 + }, + { + "epoch": 0.17232848035017603, + "grad_norm": 1.3256508111953735, + "learning_rate": 4.6425123779990005e-05, + "loss": 5.0498, + "step": 28976 + }, + { + "epoch": 0.17233442763345705, + "grad_norm": 1.3190927505493164, + "learning_rate": 4.642488307655666e-05, + "loss": 5.1833, + "step": 28977 + }, + { + "epoch": 0.17234037491673804, + "grad_norm": 1.6174373626708984, + "learning_rate": 4.64246423656441e-05, + "loss": 4.7737, + "step": 28978 + }, + { + "epoch": 0.17234632220001903, + "grad_norm": 1.3956570625305176, + "learning_rate": 4.6424401647252425e-05, + "loss": 5.0439, + "step": 28979 + }, + { + "epoch": 0.17235226948330004, + "grad_norm": 1.3336056470870972, + "learning_rate": 4.642416092138171e-05, + "loss": 5.526, + "step": 28980 + }, + { + "epoch": 0.17235821676658103, + "grad_norm": 1.9870527982711792, + "learning_rate": 4.642392018803204e-05, + "loss": 4.6277, + "step": 28981 + }, + { + "epoch": 0.17236416404986202, + "grad_norm": 1.9504579305648804, + "learning_rate": 4.64236794472035e-05, + "loss": 4.6113, + "step": 28982 + }, + { + "epoch": 0.17237011133314303, + "grad_norm": 1.7667953968048096, + "learning_rate": 4.642343869889618e-05, + "loss": 5.0653, + "step": 28983 + }, + { + "epoch": 0.17237605861642402, + "grad_norm": 1.6792775392532349, + "learning_rate": 4.642319794311016e-05, + "loss": 5.1556, + "step": 28984 + }, + { + "epoch": 0.172382005899705, + "grad_norm": 1.7935463190078735, + "learning_rate": 4.642295717984551e-05, + "loss": 4.4604, + "step": 28985 + }, + { + "epoch": 0.17238795318298603, + "grad_norm": 1.8608596324920654, + "learning_rate": 4.642271640910235e-05, + "loss": 5.1865, + "step": 28986 + }, + { + "epoch": 0.172393900466267, + "grad_norm": 1.7945232391357422, + "learning_rate": 4.642247563088073e-05, + "loss": 4.8413, + "step": 28987 + }, + { + "epoch": 0.172399847749548, + "grad_norm": 1.6362812519073486, + "learning_rate": 4.6422234845180734e-05, + "loss": 5.4072, + "step": 28988 + }, + { + "epoch": 0.172405795032829, + "grad_norm": 1.7283893823623657, + "learning_rate": 4.642199405200247e-05, + "loss": 5.2463, + "step": 28989 + }, + { + "epoch": 0.17241174231611, + "grad_norm": 2.589603900909424, + "learning_rate": 4.6421753251346004e-05, + "loss": 4.0614, + "step": 28990 + }, + { + "epoch": 0.172417689599391, + "grad_norm": 1.785037875175476, + "learning_rate": 4.642151244321143e-05, + "loss": 4.7127, + "step": 28991 + }, + { + "epoch": 0.17242363688267198, + "grad_norm": 1.5093384981155396, + "learning_rate": 4.6421271627598826e-05, + "loss": 5.2746, + "step": 28992 + }, + { + "epoch": 0.172429584165953, + "grad_norm": 1.4697469472885132, + "learning_rate": 4.642103080450828e-05, + "loss": 5.2444, + "step": 28993 + }, + { + "epoch": 0.172435531449234, + "grad_norm": 1.5588436126708984, + "learning_rate": 4.642078997393986e-05, + "loss": 5.3832, + "step": 28994 + }, + { + "epoch": 0.17244147873251497, + "grad_norm": 1.4939788579940796, + "learning_rate": 4.642054913589368e-05, + "loss": 5.5868, + "step": 28995 + }, + { + "epoch": 0.172447426015796, + "grad_norm": 1.8973298072814941, + "learning_rate": 4.6420308290369795e-05, + "loss": 5.3981, + "step": 28996 + }, + { + "epoch": 0.17245337329907698, + "grad_norm": 1.7295379638671875, + "learning_rate": 4.642006743736831e-05, + "loss": 4.8308, + "step": 28997 + }, + { + "epoch": 0.17245932058235797, + "grad_norm": 1.519732117652893, + "learning_rate": 4.641982657688929e-05, + "loss": 5.423, + "step": 28998 + }, + { + "epoch": 0.17246526786563898, + "grad_norm": 1.6511726379394531, + "learning_rate": 4.641958570893284e-05, + "loss": 5.2029, + "step": 28999 + }, + { + "epoch": 0.17247121514891997, + "grad_norm": 1.5355091094970703, + "learning_rate": 4.641934483349903e-05, + "loss": 5.3556, + "step": 29000 + }, + { + "epoch": 0.17247716243220096, + "grad_norm": 1.562451720237732, + "learning_rate": 4.641910395058795e-05, + "loss": 5.3171, + "step": 29001 + }, + { + "epoch": 0.17248310971548197, + "grad_norm": 1.4412742853164673, + "learning_rate": 4.6418863060199684e-05, + "loss": 5.1771, + "step": 29002 + }, + { + "epoch": 0.17248905699876296, + "grad_norm": 1.5048646926879883, + "learning_rate": 4.6418622162334315e-05, + "loss": 5.3242, + "step": 29003 + }, + { + "epoch": 0.17249500428204395, + "grad_norm": 1.4204987287521362, + "learning_rate": 4.641838125699192e-05, + "loss": 5.3281, + "step": 29004 + }, + { + "epoch": 0.17250095156532497, + "grad_norm": 1.5606169700622559, + "learning_rate": 4.641814034417259e-05, + "loss": 5.0594, + "step": 29005 + }, + { + "epoch": 0.17250689884860596, + "grad_norm": 1.5690323114395142, + "learning_rate": 4.641789942387641e-05, + "loss": 5.2602, + "step": 29006 + }, + { + "epoch": 0.17251284613188694, + "grad_norm": 1.4904906749725342, + "learning_rate": 4.641765849610347e-05, + "loss": 5.2554, + "step": 29007 + }, + { + "epoch": 0.17251879341516796, + "grad_norm": 1.8319326639175415, + "learning_rate": 4.641741756085384e-05, + "loss": 4.5856, + "step": 29008 + }, + { + "epoch": 0.17252474069844895, + "grad_norm": 1.984311819076538, + "learning_rate": 4.6417176618127614e-05, + "loss": 5.2343, + "step": 29009 + }, + { + "epoch": 0.17253068798172994, + "grad_norm": 1.8066591024398804, + "learning_rate": 4.6416935667924864e-05, + "loss": 5.6382, + "step": 29010 + }, + { + "epoch": 0.17253663526501095, + "grad_norm": 1.3843746185302734, + "learning_rate": 4.641669471024569e-05, + "loss": 5.4115, + "step": 29011 + }, + { + "epoch": 0.17254258254829194, + "grad_norm": 1.6255708932876587, + "learning_rate": 4.6416453745090164e-05, + "loss": 5.1379, + "step": 29012 + }, + { + "epoch": 0.17254852983157293, + "grad_norm": 1.4723587036132812, + "learning_rate": 4.641621277245838e-05, + "loss": 5.1829, + "step": 29013 + }, + { + "epoch": 0.17255447711485394, + "grad_norm": 1.7830013036727905, + "learning_rate": 4.641597179235042e-05, + "loss": 4.8646, + "step": 29014 + }, + { + "epoch": 0.17256042439813493, + "grad_norm": 1.6139211654663086, + "learning_rate": 4.641573080476636e-05, + "loss": 5.3989, + "step": 29015 + }, + { + "epoch": 0.17256637168141592, + "grad_norm": 2.9187774658203125, + "learning_rate": 4.641548980970629e-05, + "loss": 3.3579, + "step": 29016 + }, + { + "epoch": 0.17257231896469694, + "grad_norm": 1.4265162944793701, + "learning_rate": 4.6415248807170296e-05, + "loss": 5.1783, + "step": 29017 + }, + { + "epoch": 0.17257826624797792, + "grad_norm": 1.3095968961715698, + "learning_rate": 4.641500779715846e-05, + "loss": 5.6357, + "step": 29018 + }, + { + "epoch": 0.1725842135312589, + "grad_norm": 1.3929443359375, + "learning_rate": 4.641476677967087e-05, + "loss": 5.3234, + "step": 29019 + }, + { + "epoch": 0.17259016081453993, + "grad_norm": 1.6466419696807861, + "learning_rate": 4.64145257547076e-05, + "loss": 5.5066, + "step": 29020 + }, + { + "epoch": 0.17259610809782092, + "grad_norm": 1.4895389080047607, + "learning_rate": 4.6414284722268745e-05, + "loss": 5.0983, + "step": 29021 + }, + { + "epoch": 0.1726020553811019, + "grad_norm": 1.6978981494903564, + "learning_rate": 4.641404368235438e-05, + "loss": 5.3724, + "step": 29022 + }, + { + "epoch": 0.17260800266438292, + "grad_norm": 1.7038211822509766, + "learning_rate": 4.641380263496459e-05, + "loss": 5.2525, + "step": 29023 + }, + { + "epoch": 0.1726139499476639, + "grad_norm": 1.4917408227920532, + "learning_rate": 4.641356158009947e-05, + "loss": 4.9793, + "step": 29024 + }, + { + "epoch": 0.1726198972309449, + "grad_norm": 1.6916602849960327, + "learning_rate": 4.6413320517759094e-05, + "loss": 5.0735, + "step": 29025 + }, + { + "epoch": 0.1726258445142259, + "grad_norm": 1.4852558374404907, + "learning_rate": 4.6413079447943556e-05, + "loss": 5.27, + "step": 29026 + }, + { + "epoch": 0.1726317917975069, + "grad_norm": 1.6030479669570923, + "learning_rate": 4.6412838370652925e-05, + "loss": 5.2712, + "step": 29027 + }, + { + "epoch": 0.1726377390807879, + "grad_norm": 1.5208861827850342, + "learning_rate": 4.6412597285887296e-05, + "loss": 5.4238, + "step": 29028 + }, + { + "epoch": 0.1726436863640689, + "grad_norm": 1.8001056909561157, + "learning_rate": 4.6412356193646744e-05, + "loss": 5.433, + "step": 29029 + }, + { + "epoch": 0.1726496336473499, + "grad_norm": 1.570449948310852, + "learning_rate": 4.641211509393136e-05, + "loss": 5.3843, + "step": 29030 + }, + { + "epoch": 0.17265558093063088, + "grad_norm": 1.4007776975631714, + "learning_rate": 4.641187398674124e-05, + "loss": 5.213, + "step": 29031 + }, + { + "epoch": 0.1726615282139119, + "grad_norm": 1.7244693040847778, + "learning_rate": 4.641163287207645e-05, + "loss": 4.342, + "step": 29032 + }, + { + "epoch": 0.17266747549719288, + "grad_norm": 1.752119779586792, + "learning_rate": 4.6411391749937076e-05, + "loss": 5.2256, + "step": 29033 + }, + { + "epoch": 0.17267342278047387, + "grad_norm": 1.7031835317611694, + "learning_rate": 4.6411150620323214e-05, + "loss": 5.3993, + "step": 29034 + }, + { + "epoch": 0.1726793700637549, + "grad_norm": 1.6741119623184204, + "learning_rate": 4.641090948323493e-05, + "loss": 5.3929, + "step": 29035 + }, + { + "epoch": 0.17268531734703588, + "grad_norm": 1.5801132917404175, + "learning_rate": 4.6410668338672326e-05, + "loss": 5.5049, + "step": 29036 + }, + { + "epoch": 0.17269126463031687, + "grad_norm": 1.6885874271392822, + "learning_rate": 4.641042718663548e-05, + "loss": 5.4284, + "step": 29037 + }, + { + "epoch": 0.17269721191359788, + "grad_norm": 2.0031561851501465, + "learning_rate": 4.6410186027124475e-05, + "loss": 5.064, + "step": 29038 + }, + { + "epoch": 0.17270315919687887, + "grad_norm": 1.9345756769180298, + "learning_rate": 4.640994486013939e-05, + "loss": 4.902, + "step": 29039 + }, + { + "epoch": 0.17270910648015986, + "grad_norm": 1.7898815870285034, + "learning_rate": 4.640970368568032e-05, + "loss": 4.576, + "step": 29040 + }, + { + "epoch": 0.17271505376344087, + "grad_norm": 1.7370834350585938, + "learning_rate": 4.640946250374734e-05, + "loss": 4.2676, + "step": 29041 + }, + { + "epoch": 0.17272100104672186, + "grad_norm": 1.3820379972457886, + "learning_rate": 4.640922131434054e-05, + "loss": 4.1509, + "step": 29042 + }, + { + "epoch": 0.17272694833000285, + "grad_norm": 1.507027506828308, + "learning_rate": 4.640898011746e-05, + "loss": 4.8934, + "step": 29043 + }, + { + "epoch": 0.17273289561328387, + "grad_norm": 1.7124078273773193, + "learning_rate": 4.640873891310581e-05, + "loss": 5.0756, + "step": 29044 + }, + { + "epoch": 0.17273884289656485, + "grad_norm": 1.5267462730407715, + "learning_rate": 4.6408497701278045e-05, + "loss": 5.2387, + "step": 29045 + }, + { + "epoch": 0.17274479017984584, + "grad_norm": 1.560703158378601, + "learning_rate": 4.64082564819768e-05, + "loss": 4.8667, + "step": 29046 + }, + { + "epoch": 0.17275073746312683, + "grad_norm": 1.5322329998016357, + "learning_rate": 4.6408015255202145e-05, + "loss": 5.013, + "step": 29047 + }, + { + "epoch": 0.17275668474640785, + "grad_norm": 1.675746202468872, + "learning_rate": 4.640777402095419e-05, + "loss": 4.8509, + "step": 29048 + }, + { + "epoch": 0.17276263202968883, + "grad_norm": 1.6513665914535522, + "learning_rate": 4.640753277923299e-05, + "loss": 4.9737, + "step": 29049 + }, + { + "epoch": 0.17276857931296982, + "grad_norm": 1.7950671911239624, + "learning_rate": 4.640729153003864e-05, + "loss": 4.3243, + "step": 29050 + }, + { + "epoch": 0.17277452659625084, + "grad_norm": 1.7763174772262573, + "learning_rate": 4.6407050273371225e-05, + "loss": 4.3468, + "step": 29051 + }, + { + "epoch": 0.17278047387953183, + "grad_norm": 1.7274105548858643, + "learning_rate": 4.640680900923083e-05, + "loss": 4.3678, + "step": 29052 + }, + { + "epoch": 0.17278642116281281, + "grad_norm": 1.8083571195602417, + "learning_rate": 4.640656773761755e-05, + "loss": 4.0583, + "step": 29053 + }, + { + "epoch": 0.17279236844609383, + "grad_norm": 1.5555697679519653, + "learning_rate": 4.640632645853145e-05, + "loss": 4.9759, + "step": 29054 + }, + { + "epoch": 0.17279831572937482, + "grad_norm": 1.5617389678955078, + "learning_rate": 4.640608517197263e-05, + "loss": 4.9137, + "step": 29055 + }, + { + "epoch": 0.1728042630126558, + "grad_norm": 1.549464225769043, + "learning_rate": 4.640584387794115e-05, + "loss": 5.158, + "step": 29056 + }, + { + "epoch": 0.17281021029593682, + "grad_norm": 1.7087653875350952, + "learning_rate": 4.6405602576437126e-05, + "loss": 5.136, + "step": 29057 + }, + { + "epoch": 0.1728161575792178, + "grad_norm": 1.5118201971054077, + "learning_rate": 4.640536126746062e-05, + "loss": 5.1956, + "step": 29058 + }, + { + "epoch": 0.1728221048624988, + "grad_norm": 1.6387808322906494, + "learning_rate": 4.640511995101173e-05, + "loss": 5.0441, + "step": 29059 + }, + { + "epoch": 0.17282805214577981, + "grad_norm": 1.652024745941162, + "learning_rate": 4.640487862709053e-05, + "loss": 4.9147, + "step": 29060 + }, + { + "epoch": 0.1728339994290608, + "grad_norm": 1.6259782314300537, + "learning_rate": 4.640463729569711e-05, + "loss": 4.2755, + "step": 29061 + }, + { + "epoch": 0.1728399467123418, + "grad_norm": 1.6286218166351318, + "learning_rate": 4.640439595683155e-05, + "loss": 4.6328, + "step": 29062 + }, + { + "epoch": 0.1728458939956228, + "grad_norm": 1.7396693229675293, + "learning_rate": 4.6404154610493934e-05, + "loss": 4.5711, + "step": 29063 + }, + { + "epoch": 0.1728518412789038, + "grad_norm": 1.4926822185516357, + "learning_rate": 4.640391325668435e-05, + "loss": 5.118, + "step": 29064 + }, + { + "epoch": 0.17285778856218478, + "grad_norm": 2.454763650894165, + "learning_rate": 4.6403671895402884e-05, + "loss": 4.817, + "step": 29065 + }, + { + "epoch": 0.1728637358454658, + "grad_norm": 1.6225837469100952, + "learning_rate": 4.640343052664962e-05, + "loss": 4.9953, + "step": 29066 + }, + { + "epoch": 0.1728696831287468, + "grad_norm": 1.8164595365524292, + "learning_rate": 4.640318915042463e-05, + "loss": 4.9384, + "step": 29067 + }, + { + "epoch": 0.17287563041202778, + "grad_norm": 1.4794782400131226, + "learning_rate": 4.640294776672801e-05, + "loss": 5.2635, + "step": 29068 + }, + { + "epoch": 0.1728815776953088, + "grad_norm": 1.6981302499771118, + "learning_rate": 4.640270637555985e-05, + "loss": 5.283, + "step": 29069 + }, + { + "epoch": 0.17288752497858978, + "grad_norm": 1.8669052124023438, + "learning_rate": 4.640246497692022e-05, + "loss": 4.303, + "step": 29070 + }, + { + "epoch": 0.17289347226187077, + "grad_norm": 1.8505442142486572, + "learning_rate": 4.640222357080921e-05, + "loss": 4.6573, + "step": 29071 + }, + { + "epoch": 0.17289941954515178, + "grad_norm": 1.6368263959884644, + "learning_rate": 4.640198215722691e-05, + "loss": 4.5301, + "step": 29072 + }, + { + "epoch": 0.17290536682843277, + "grad_norm": 1.665531039237976, + "learning_rate": 4.640174073617339e-05, + "loss": 5.2184, + "step": 29073 + }, + { + "epoch": 0.17291131411171376, + "grad_norm": 1.663392186164856, + "learning_rate": 4.640149930764875e-05, + "loss": 4.1373, + "step": 29074 + }, + { + "epoch": 0.17291726139499478, + "grad_norm": 1.8580307960510254, + "learning_rate": 4.640125787165307e-05, + "loss": 4.4035, + "step": 29075 + }, + { + "epoch": 0.17292320867827576, + "grad_norm": 1.5936819314956665, + "learning_rate": 4.640101642818643e-05, + "loss": 5.145, + "step": 29076 + }, + { + "epoch": 0.17292915596155675, + "grad_norm": 1.7124170064926147, + "learning_rate": 4.6400774977248915e-05, + "loss": 4.1569, + "step": 29077 + }, + { + "epoch": 0.17293510324483777, + "grad_norm": 2.51955509185791, + "learning_rate": 4.6400533518840614e-05, + "loss": 3.8795, + "step": 29078 + }, + { + "epoch": 0.17294105052811876, + "grad_norm": 1.6238064765930176, + "learning_rate": 4.6400292052961604e-05, + "loss": 5.0575, + "step": 29079 + }, + { + "epoch": 0.17294699781139974, + "grad_norm": 1.7471083402633667, + "learning_rate": 4.6400050579611974e-05, + "loss": 4.1607, + "step": 29080 + }, + { + "epoch": 0.17295294509468076, + "grad_norm": 1.7179365158081055, + "learning_rate": 4.639980909879181e-05, + "loss": 4.2253, + "step": 29081 + }, + { + "epoch": 0.17295889237796175, + "grad_norm": 1.6772149801254272, + "learning_rate": 4.639956761050119e-05, + "loss": 4.0833, + "step": 29082 + }, + { + "epoch": 0.17296483966124274, + "grad_norm": 1.6395635604858398, + "learning_rate": 4.639932611474021e-05, + "loss": 4.3961, + "step": 29083 + }, + { + "epoch": 0.17297078694452375, + "grad_norm": 1.5897985696792603, + "learning_rate": 4.6399084611508935e-05, + "loss": 4.5272, + "step": 29084 + }, + { + "epoch": 0.17297673422780474, + "grad_norm": 1.5276799201965332, + "learning_rate": 4.639884310080746e-05, + "loss": 5.037, + "step": 29085 + }, + { + "epoch": 0.17298268151108573, + "grad_norm": 1.5612523555755615, + "learning_rate": 4.639860158263588e-05, + "loss": 5.2272, + "step": 29086 + }, + { + "epoch": 0.17298862879436674, + "grad_norm": 1.7078372240066528, + "learning_rate": 4.639836005699426e-05, + "loss": 4.2294, + "step": 29087 + }, + { + "epoch": 0.17299457607764773, + "grad_norm": 1.643798828125, + "learning_rate": 4.63981185238827e-05, + "loss": 4.1974, + "step": 29088 + }, + { + "epoch": 0.17300052336092872, + "grad_norm": 1.7256457805633545, + "learning_rate": 4.639787698330128e-05, + "loss": 4.3683, + "step": 29089 + }, + { + "epoch": 0.17300647064420974, + "grad_norm": 1.9199156761169434, + "learning_rate": 4.6397635435250076e-05, + "loss": 4.3005, + "step": 29090 + }, + { + "epoch": 0.17301241792749072, + "grad_norm": 1.927114486694336, + "learning_rate": 4.6397393879729176e-05, + "loss": 3.53, + "step": 29091 + }, + { + "epoch": 0.1730183652107717, + "grad_norm": 1.5402168035507202, + "learning_rate": 4.639715231673868e-05, + "loss": 5.048, + "step": 29092 + }, + { + "epoch": 0.17302431249405273, + "grad_norm": 1.4014962911605835, + "learning_rate": 4.6396910746278646e-05, + "loss": 4.9029, + "step": 29093 + }, + { + "epoch": 0.17303025977733372, + "grad_norm": 1.3504273891448975, + "learning_rate": 4.639666916834918e-05, + "loss": 4.9728, + "step": 29094 + }, + { + "epoch": 0.1730362070606147, + "grad_norm": 1.4277746677398682, + "learning_rate": 4.639642758295035e-05, + "loss": 4.9853, + "step": 29095 + }, + { + "epoch": 0.17304215434389572, + "grad_norm": 1.664764165878296, + "learning_rate": 4.639618599008225e-05, + "loss": 4.9195, + "step": 29096 + }, + { + "epoch": 0.1730481016271767, + "grad_norm": 1.7788653373718262, + "learning_rate": 4.639594438974497e-05, + "loss": 4.6073, + "step": 29097 + }, + { + "epoch": 0.1730540489104577, + "grad_norm": 1.543224573135376, + "learning_rate": 4.639570278193858e-05, + "loss": 4.5988, + "step": 29098 + }, + { + "epoch": 0.1730599961937387, + "grad_norm": 1.8790651559829712, + "learning_rate": 4.639546116666317e-05, + "loss": 4.3982, + "step": 29099 + }, + { + "epoch": 0.1730659434770197, + "grad_norm": 1.6308414936065674, + "learning_rate": 4.639521954391883e-05, + "loss": 4.8477, + "step": 29100 + }, + { + "epoch": 0.1730718907603007, + "grad_norm": 1.7135157585144043, + "learning_rate": 4.639497791370564e-05, + "loss": 5.0111, + "step": 29101 + }, + { + "epoch": 0.1730778380435817, + "grad_norm": 1.9777605533599854, + "learning_rate": 4.639473627602369e-05, + "loss": 5.2615, + "step": 29102 + }, + { + "epoch": 0.1730837853268627, + "grad_norm": 1.8689080476760864, + "learning_rate": 4.639449463087304e-05, + "loss": 5.4032, + "step": 29103 + }, + { + "epoch": 0.17308973261014368, + "grad_norm": 1.8719011545181274, + "learning_rate": 4.6394252978253814e-05, + "loss": 4.7377, + "step": 29104 + }, + { + "epoch": 0.17309567989342467, + "grad_norm": 2.0242390632629395, + "learning_rate": 4.6394011318166066e-05, + "loss": 4.3017, + "step": 29105 + }, + { + "epoch": 0.17310162717670569, + "grad_norm": 1.6117249727249146, + "learning_rate": 4.639376965060989e-05, + "loss": 4.5215, + "step": 29106 + }, + { + "epoch": 0.17310757445998667, + "grad_norm": 1.9272388219833374, + "learning_rate": 4.639352797558536e-05, + "loss": 4.4802, + "step": 29107 + }, + { + "epoch": 0.17311352174326766, + "grad_norm": 1.7987074851989746, + "learning_rate": 4.639328629309259e-05, + "loss": 4.4009, + "step": 29108 + }, + { + "epoch": 0.17311946902654868, + "grad_norm": 1.8932039737701416, + "learning_rate": 4.639304460313163e-05, + "loss": 4.3668, + "step": 29109 + }, + { + "epoch": 0.17312541630982967, + "grad_norm": 2.2508416175842285, + "learning_rate": 4.639280290570258e-05, + "loss": 4.9557, + "step": 29110 + }, + { + "epoch": 0.17313136359311065, + "grad_norm": 2.086383104324341, + "learning_rate": 4.639256120080553e-05, + "loss": 5.0933, + "step": 29111 + }, + { + "epoch": 0.17313731087639167, + "grad_norm": 1.7917490005493164, + "learning_rate": 4.639231948844056e-05, + "loss": 5.2057, + "step": 29112 + }, + { + "epoch": 0.17314325815967266, + "grad_norm": 1.8576172590255737, + "learning_rate": 4.639207776860774e-05, + "loss": 4.4434, + "step": 29113 + }, + { + "epoch": 0.17314920544295365, + "grad_norm": 1.746186375617981, + "learning_rate": 4.639183604130717e-05, + "loss": 4.2003, + "step": 29114 + }, + { + "epoch": 0.17315515272623466, + "grad_norm": 2.03523588180542, + "learning_rate": 4.639159430653894e-05, + "loss": 4.2907, + "step": 29115 + }, + { + "epoch": 0.17316110000951565, + "grad_norm": 2.0713725090026855, + "learning_rate": 4.639135256430312e-05, + "loss": 4.3741, + "step": 29116 + }, + { + "epoch": 0.17316704729279664, + "grad_norm": 2.745671510696411, + "learning_rate": 4.63911108145998e-05, + "loss": 4.6313, + "step": 29117 + }, + { + "epoch": 0.17317299457607765, + "grad_norm": 1.9662394523620605, + "learning_rate": 4.639086905742906e-05, + "loss": 4.2027, + "step": 29118 + }, + { + "epoch": 0.17317894185935864, + "grad_norm": 1.7448909282684326, + "learning_rate": 4.6390627292791e-05, + "loss": 4.9481, + "step": 29119 + }, + { + "epoch": 0.17318488914263963, + "grad_norm": 1.684590458869934, + "learning_rate": 4.639038552068569e-05, + "loss": 4.8794, + "step": 29120 + }, + { + "epoch": 0.17319083642592065, + "grad_norm": 1.8462331295013428, + "learning_rate": 4.639014374111321e-05, + "loss": 3.9728, + "step": 29121 + }, + { + "epoch": 0.17319678370920163, + "grad_norm": 1.9657787084579468, + "learning_rate": 4.638990195407366e-05, + "loss": 4.0798, + "step": 29122 + }, + { + "epoch": 0.17320273099248262, + "grad_norm": 1.7591108083724976, + "learning_rate": 4.638966015956711e-05, + "loss": 3.9714, + "step": 29123 + }, + { + "epoch": 0.17320867827576364, + "grad_norm": 1.6764097213745117, + "learning_rate": 4.638941835759365e-05, + "loss": 4.7804, + "step": 29124 + }, + { + "epoch": 0.17321462555904463, + "grad_norm": 1.7766660451889038, + "learning_rate": 4.638917654815336e-05, + "loss": 4.8408, + "step": 29125 + }, + { + "epoch": 0.17322057284232562, + "grad_norm": 1.7548637390136719, + "learning_rate": 4.638893473124634e-05, + "loss": 4.9905, + "step": 29126 + }, + { + "epoch": 0.17322652012560663, + "grad_norm": 1.933996319770813, + "learning_rate": 4.6388692906872664e-05, + "loss": 4.757, + "step": 29127 + }, + { + "epoch": 0.17323246740888762, + "grad_norm": 1.6957604885101318, + "learning_rate": 4.638845107503241e-05, + "loss": 5.1555, + "step": 29128 + }, + { + "epoch": 0.1732384146921686, + "grad_norm": 1.7500252723693848, + "learning_rate": 4.638820923572567e-05, + "loss": 4.9637, + "step": 29129 + }, + { + "epoch": 0.17324436197544962, + "grad_norm": 1.8749233484268188, + "learning_rate": 4.638796738895253e-05, + "loss": 4.0375, + "step": 29130 + }, + { + "epoch": 0.1732503092587306, + "grad_norm": 2.124462842941284, + "learning_rate": 4.6387725534713066e-05, + "loss": 4.6226, + "step": 29131 + }, + { + "epoch": 0.1732562565420116, + "grad_norm": 1.877875804901123, + "learning_rate": 4.6387483673007375e-05, + "loss": 4.572, + "step": 29132 + }, + { + "epoch": 0.17326220382529262, + "grad_norm": 1.7845820188522339, + "learning_rate": 4.6387241803835535e-05, + "loss": 4.4978, + "step": 29133 + }, + { + "epoch": 0.1732681511085736, + "grad_norm": 1.5177055597305298, + "learning_rate": 4.638699992719762e-05, + "loss": 4.6488, + "step": 29134 + }, + { + "epoch": 0.1732740983918546, + "grad_norm": 1.6078678369522095, + "learning_rate": 4.6386758043093736e-05, + "loss": 4.5668, + "step": 29135 + }, + { + "epoch": 0.1732800456751356, + "grad_norm": 1.640980839729309, + "learning_rate": 4.638651615152395e-05, + "loss": 4.8367, + "step": 29136 + }, + { + "epoch": 0.1732859929584166, + "grad_norm": 1.4911829233169556, + "learning_rate": 4.638627425248835e-05, + "loss": 4.6056, + "step": 29137 + }, + { + "epoch": 0.17329194024169758, + "grad_norm": 1.7402757406234741, + "learning_rate": 4.6386032345987026e-05, + "loss": 4.6695, + "step": 29138 + }, + { + "epoch": 0.1732978875249786, + "grad_norm": 1.7571971416473389, + "learning_rate": 4.638579043202006e-05, + "loss": 4.3587, + "step": 29139 + }, + { + "epoch": 0.1733038348082596, + "grad_norm": 1.9201890230178833, + "learning_rate": 4.6385548510587527e-05, + "loss": 4.6875, + "step": 29140 + }, + { + "epoch": 0.17330978209154058, + "grad_norm": 1.61739182472229, + "learning_rate": 4.638530658168954e-05, + "loss": 4.354, + "step": 29141 + }, + { + "epoch": 0.1733157293748216, + "grad_norm": 1.530254602432251, + "learning_rate": 4.6385064645326144e-05, + "loss": 5.1195, + "step": 29142 + }, + { + "epoch": 0.17332167665810258, + "grad_norm": 1.604181170463562, + "learning_rate": 4.638482270149745e-05, + "loss": 4.5733, + "step": 29143 + }, + { + "epoch": 0.17332762394138357, + "grad_norm": 1.5250577926635742, + "learning_rate": 4.638458075020353e-05, + "loss": 5.0787, + "step": 29144 + }, + { + "epoch": 0.17333357122466458, + "grad_norm": 1.539345383644104, + "learning_rate": 4.638433879144448e-05, + "loss": 4.5644, + "step": 29145 + }, + { + "epoch": 0.17333951850794557, + "grad_norm": 1.4076765775680542, + "learning_rate": 4.6384096825220376e-05, + "loss": 4.8226, + "step": 29146 + }, + { + "epoch": 0.17334546579122656, + "grad_norm": 1.5576672554016113, + "learning_rate": 4.6383854851531304e-05, + "loss": 4.8671, + "step": 29147 + }, + { + "epoch": 0.17335141307450758, + "grad_norm": 1.4902443885803223, + "learning_rate": 4.638361287037735e-05, + "loss": 5.0003, + "step": 29148 + }, + { + "epoch": 0.17335736035778856, + "grad_norm": 1.3985077142715454, + "learning_rate": 4.63833708817586e-05, + "loss": 5.297, + "step": 29149 + }, + { + "epoch": 0.17336330764106955, + "grad_norm": 1.798403263092041, + "learning_rate": 4.638312888567513e-05, + "loss": 4.8625, + "step": 29150 + }, + { + "epoch": 0.17336925492435057, + "grad_norm": 1.5843340158462524, + "learning_rate": 4.638288688212704e-05, + "loss": 4.7577, + "step": 29151 + }, + { + "epoch": 0.17337520220763156, + "grad_norm": 1.5716784000396729, + "learning_rate": 4.63826448711144e-05, + "loss": 5.1091, + "step": 29152 + }, + { + "epoch": 0.17338114949091255, + "grad_norm": 1.7493597269058228, + "learning_rate": 4.6382402852637294e-05, + "loss": 4.8566, + "step": 29153 + }, + { + "epoch": 0.17338709677419356, + "grad_norm": 1.6974579095840454, + "learning_rate": 4.638216082669582e-05, + "loss": 4.8687, + "step": 29154 + }, + { + "epoch": 0.17339304405747455, + "grad_norm": 1.6314281225204468, + "learning_rate": 4.6381918793290055e-05, + "loss": 4.8077, + "step": 29155 + }, + { + "epoch": 0.17339899134075554, + "grad_norm": 1.6575573682785034, + "learning_rate": 4.6381676752420076e-05, + "loss": 4.9225, + "step": 29156 + }, + { + "epoch": 0.17340493862403655, + "grad_norm": 1.4562337398529053, + "learning_rate": 4.638143470408598e-05, + "loss": 5.056, + "step": 29157 + }, + { + "epoch": 0.17341088590731754, + "grad_norm": 1.6989314556121826, + "learning_rate": 4.638119264828784e-05, + "loss": 5.0006, + "step": 29158 + }, + { + "epoch": 0.17341683319059853, + "grad_norm": 1.6114591360092163, + "learning_rate": 4.638095058502575e-05, + "loss": 4.7174, + "step": 29159 + }, + { + "epoch": 0.17342278047387955, + "grad_norm": 1.8833446502685547, + "learning_rate": 4.6380708514299794e-05, + "loss": 4.6826, + "step": 29160 + }, + { + "epoch": 0.17342872775716053, + "grad_norm": 1.8556679487228394, + "learning_rate": 4.638046643611006e-05, + "loss": 4.6246, + "step": 29161 + }, + { + "epoch": 0.17343467504044152, + "grad_norm": 1.8661102056503296, + "learning_rate": 4.6380224350456615e-05, + "loss": 4.4789, + "step": 29162 + }, + { + "epoch": 0.1734406223237225, + "grad_norm": 1.7095074653625488, + "learning_rate": 4.637998225733956e-05, + "loss": 4.923, + "step": 29163 + }, + { + "epoch": 0.17344656960700353, + "grad_norm": 1.34967041015625, + "learning_rate": 4.6379740156758966e-05, + "loss": 4.797, + "step": 29164 + }, + { + "epoch": 0.1734525168902845, + "grad_norm": 1.7319891452789307, + "learning_rate": 4.637949804871493e-05, + "loss": 4.5764, + "step": 29165 + }, + { + "epoch": 0.1734584641735655, + "grad_norm": 1.7644058465957642, + "learning_rate": 4.637925593320754e-05, + "loss": 4.5592, + "step": 29166 + }, + { + "epoch": 0.17346441145684652, + "grad_norm": 1.773938775062561, + "learning_rate": 4.637901381023686e-05, + "loss": 5.0608, + "step": 29167 + }, + { + "epoch": 0.1734703587401275, + "grad_norm": 1.7514781951904297, + "learning_rate": 4.637877167980299e-05, + "loss": 4.6467, + "step": 29168 + }, + { + "epoch": 0.1734763060234085, + "grad_norm": 1.6960844993591309, + "learning_rate": 4.637852954190602e-05, + "loss": 4.4893, + "step": 29169 + }, + { + "epoch": 0.1734822533066895, + "grad_norm": 1.687488317489624, + "learning_rate": 4.6378287396546024e-05, + "loss": 4.5032, + "step": 29170 + }, + { + "epoch": 0.1734882005899705, + "grad_norm": 1.71660315990448, + "learning_rate": 4.6378045243723084e-05, + "loss": 4.9538, + "step": 29171 + }, + { + "epoch": 0.1734941478732515, + "grad_norm": 1.8937394618988037, + "learning_rate": 4.637780308343729e-05, + "loss": 4.6157, + "step": 29172 + }, + { + "epoch": 0.1735000951565325, + "grad_norm": 1.8577438592910767, + "learning_rate": 4.637756091568873e-05, + "loss": 4.5289, + "step": 29173 + }, + { + "epoch": 0.1735060424398135, + "grad_norm": 1.6964426040649414, + "learning_rate": 4.637731874047748e-05, + "loss": 4.6735, + "step": 29174 + }, + { + "epoch": 0.17351198972309448, + "grad_norm": 1.708333134651184, + "learning_rate": 4.637707655780363e-05, + "loss": 4.7042, + "step": 29175 + }, + { + "epoch": 0.1735179370063755, + "grad_norm": 2.0699708461761475, + "learning_rate": 4.637683436766726e-05, + "loss": 4.259, + "step": 29176 + }, + { + "epoch": 0.17352388428965648, + "grad_norm": 1.9782260656356812, + "learning_rate": 4.637659217006846e-05, + "loss": 4.2724, + "step": 29177 + }, + { + "epoch": 0.17352983157293747, + "grad_norm": 1.8892062902450562, + "learning_rate": 4.6376349965007316e-05, + "loss": 4.0619, + "step": 29178 + }, + { + "epoch": 0.1735357788562185, + "grad_norm": 3.4207348823547363, + "learning_rate": 4.637610775248391e-05, + "loss": 4.0752, + "step": 29179 + }, + { + "epoch": 0.17354172613949947, + "grad_norm": 2.4128661155700684, + "learning_rate": 4.6375865532498316e-05, + "loss": 3.7859, + "step": 29180 + }, + { + "epoch": 0.17354767342278046, + "grad_norm": 1.7334697246551514, + "learning_rate": 4.6375623305050635e-05, + "loss": 4.586, + "step": 29181 + }, + { + "epoch": 0.17355362070606148, + "grad_norm": 2.0362465381622314, + "learning_rate": 4.6375381070140946e-05, + "loss": 4.2091, + "step": 29182 + }, + { + "epoch": 0.17355956798934247, + "grad_norm": 1.7851359844207764, + "learning_rate": 4.637513882776933e-05, + "loss": 4.1567, + "step": 29183 + }, + { + "epoch": 0.17356551527262346, + "grad_norm": 1.9078037738800049, + "learning_rate": 4.637489657793588e-05, + "loss": 4.0716, + "step": 29184 + }, + { + "epoch": 0.17357146255590447, + "grad_norm": 1.7366207838058472, + "learning_rate": 4.6374654320640666e-05, + "loss": 4.3262, + "step": 29185 + }, + { + "epoch": 0.17357740983918546, + "grad_norm": 1.8948423862457275, + "learning_rate": 4.6374412055883785e-05, + "loss": 4.1564, + "step": 29186 + }, + { + "epoch": 0.17358335712246645, + "grad_norm": 1.9613217115402222, + "learning_rate": 4.637416978366532e-05, + "loss": 4.1586, + "step": 29187 + }, + { + "epoch": 0.17358930440574746, + "grad_norm": 2.4783365726470947, + "learning_rate": 4.637392750398535e-05, + "loss": 3.6734, + "step": 29188 + }, + { + "epoch": 0.17359525168902845, + "grad_norm": 2.1660149097442627, + "learning_rate": 4.637368521684396e-05, + "loss": 3.7469, + "step": 29189 + }, + { + "epoch": 0.17360119897230944, + "grad_norm": 2.462066650390625, + "learning_rate": 4.637344292224124e-05, + "loss": 3.6566, + "step": 29190 + }, + { + "epoch": 0.17360714625559046, + "grad_norm": 1.8963021039962769, + "learning_rate": 4.637320062017727e-05, + "loss": 4.0244, + "step": 29191 + }, + { + "epoch": 0.17361309353887144, + "grad_norm": 1.9739018678665161, + "learning_rate": 4.6372958310652135e-05, + "loss": 4.1696, + "step": 29192 + }, + { + "epoch": 0.17361904082215243, + "grad_norm": 1.9879587888717651, + "learning_rate": 4.637271599366593e-05, + "loss": 4.7111, + "step": 29193 + }, + { + "epoch": 0.17362498810543345, + "grad_norm": 1.8292521238327026, + "learning_rate": 4.637247366921872e-05, + "loss": 4.6283, + "step": 29194 + }, + { + "epoch": 0.17363093538871444, + "grad_norm": 1.5309460163116455, + "learning_rate": 4.6372231337310605e-05, + "loss": 4.6252, + "step": 29195 + }, + { + "epoch": 0.17363688267199542, + "grad_norm": 1.8792744874954224, + "learning_rate": 4.637198899794167e-05, + "loss": 4.2226, + "step": 29196 + }, + { + "epoch": 0.17364282995527644, + "grad_norm": 2.1824088096618652, + "learning_rate": 4.6371746651111985e-05, + "loss": 4.2028, + "step": 29197 + }, + { + "epoch": 0.17364877723855743, + "grad_norm": 2.0413753986358643, + "learning_rate": 4.637150429682165e-05, + "loss": 4.1982, + "step": 29198 + }, + { + "epoch": 0.17365472452183842, + "grad_norm": 1.6897474527359009, + "learning_rate": 4.637126193507074e-05, + "loss": 4.5085, + "step": 29199 + }, + { + "epoch": 0.17366067180511943, + "grad_norm": 1.6577891111373901, + "learning_rate": 4.637101956585935e-05, + "loss": 4.6212, + "step": 29200 + }, + { + "epoch": 0.17366661908840042, + "grad_norm": 1.6855782270431519, + "learning_rate": 4.637077718918755e-05, + "loss": 4.7156, + "step": 29201 + }, + { + "epoch": 0.1736725663716814, + "grad_norm": 2.017664909362793, + "learning_rate": 4.637053480505543e-05, + "loss": 4.5439, + "step": 29202 + }, + { + "epoch": 0.17367851365496242, + "grad_norm": 1.7421058416366577, + "learning_rate": 4.637029241346309e-05, + "loss": 4.3292, + "step": 29203 + }, + { + "epoch": 0.1736844609382434, + "grad_norm": 1.6741775274276733, + "learning_rate": 4.6370050014410594e-05, + "loss": 4.3136, + "step": 29204 + }, + { + "epoch": 0.1736904082215244, + "grad_norm": 1.9777534008026123, + "learning_rate": 4.636980760789803e-05, + "loss": 4.1499, + "step": 29205 + }, + { + "epoch": 0.17369635550480542, + "grad_norm": 2.133716583251953, + "learning_rate": 4.6369565193925505e-05, + "loss": 4.2251, + "step": 29206 + }, + { + "epoch": 0.1737023027880864, + "grad_norm": 2.047595739364624, + "learning_rate": 4.636932277249306e-05, + "loss": 4.0876, + "step": 29207 + }, + { + "epoch": 0.1737082500713674, + "grad_norm": 1.9693220853805542, + "learning_rate": 4.636908034360082e-05, + "loss": 3.8007, + "step": 29208 + }, + { + "epoch": 0.1737141973546484, + "grad_norm": 1.7148840427398682, + "learning_rate": 4.6368837907248855e-05, + "loss": 4.3048, + "step": 29209 + }, + { + "epoch": 0.1737201446379294, + "grad_norm": 1.4605804681777954, + "learning_rate": 4.6368595463437246e-05, + "loss": 4.402, + "step": 29210 + }, + { + "epoch": 0.17372609192121038, + "grad_norm": 1.8033897876739502, + "learning_rate": 4.636835301216608e-05, + "loss": 4.491, + "step": 29211 + }, + { + "epoch": 0.1737320392044914, + "grad_norm": 1.6581388711929321, + "learning_rate": 4.636811055343545e-05, + "loss": 4.5847, + "step": 29212 + }, + { + "epoch": 0.1737379864877724, + "grad_norm": 1.7046984434127808, + "learning_rate": 4.636786808724542e-05, + "loss": 4.7485, + "step": 29213 + }, + { + "epoch": 0.17374393377105338, + "grad_norm": 1.735479474067688, + "learning_rate": 4.6367625613596096e-05, + "loss": 4.8771, + "step": 29214 + }, + { + "epoch": 0.1737498810543344, + "grad_norm": 1.781473994255066, + "learning_rate": 4.636738313248756e-05, + "loss": 4.4308, + "step": 29215 + }, + { + "epoch": 0.17375582833761538, + "grad_norm": 1.945377230644226, + "learning_rate": 4.636714064391988e-05, + "loss": 3.9839, + "step": 29216 + }, + { + "epoch": 0.17376177562089637, + "grad_norm": 1.9880878925323486, + "learning_rate": 4.6366898147893165e-05, + "loss": 4.1544, + "step": 29217 + }, + { + "epoch": 0.17376772290417739, + "grad_norm": 1.9976726770401, + "learning_rate": 4.6366655644407475e-05, + "loss": 4.4061, + "step": 29218 + }, + { + "epoch": 0.17377367018745837, + "grad_norm": 2.0192174911499023, + "learning_rate": 4.6366413133462915e-05, + "loss": 4.3094, + "step": 29219 + }, + { + "epoch": 0.17377961747073936, + "grad_norm": 1.9302101135253906, + "learning_rate": 4.636617061505956e-05, + "loss": 4.7673, + "step": 29220 + }, + { + "epoch": 0.17378556475402035, + "grad_norm": 1.6863242387771606, + "learning_rate": 4.636592808919749e-05, + "loss": 4.7641, + "step": 29221 + }, + { + "epoch": 0.17379151203730137, + "grad_norm": 1.8345664739608765, + "learning_rate": 4.63656855558768e-05, + "loss": 4.6849, + "step": 29222 + }, + { + "epoch": 0.17379745932058235, + "grad_norm": 1.5179288387298584, + "learning_rate": 4.636544301509756e-05, + "loss": 4.7481, + "step": 29223 + }, + { + "epoch": 0.17380340660386334, + "grad_norm": 1.82593834400177, + "learning_rate": 4.6365200466859876e-05, + "loss": 4.7234, + "step": 29224 + }, + { + "epoch": 0.17380935388714436, + "grad_norm": 1.7959182262420654, + "learning_rate": 4.636495791116382e-05, + "loss": 5.0005, + "step": 29225 + }, + { + "epoch": 0.17381530117042535, + "grad_norm": 2.36141037940979, + "learning_rate": 4.636471534800947e-05, + "loss": 4.1279, + "step": 29226 + }, + { + "epoch": 0.17382124845370633, + "grad_norm": 1.8446800708770752, + "learning_rate": 4.636447277739693e-05, + "loss": 4.3379, + "step": 29227 + }, + { + "epoch": 0.17382719573698735, + "grad_norm": 1.9190828800201416, + "learning_rate": 4.636423019932626e-05, + "loss": 4.3296, + "step": 29228 + }, + { + "epoch": 0.17383314302026834, + "grad_norm": 1.863991379737854, + "learning_rate": 4.636398761379756e-05, + "loss": 4.3733, + "step": 29229 + }, + { + "epoch": 0.17383909030354933, + "grad_norm": 1.7630629539489746, + "learning_rate": 4.636374502081092e-05, + "loss": 4.3829, + "step": 29230 + }, + { + "epoch": 0.17384503758683034, + "grad_norm": 1.554083228111267, + "learning_rate": 4.636350242036642e-05, + "loss": 4.6883, + "step": 29231 + }, + { + "epoch": 0.17385098487011133, + "grad_norm": 1.6765477657318115, + "learning_rate": 4.6363259812464135e-05, + "loss": 4.5129, + "step": 29232 + }, + { + "epoch": 0.17385693215339232, + "grad_norm": 1.6007416248321533, + "learning_rate": 4.636301719710416e-05, + "loss": 4.561, + "step": 29233 + }, + { + "epoch": 0.17386287943667333, + "grad_norm": 1.6795105934143066, + "learning_rate": 4.6362774574286575e-05, + "loss": 4.6389, + "step": 29234 + }, + { + "epoch": 0.17386882671995432, + "grad_norm": 1.6491032838821411, + "learning_rate": 4.6362531944011464e-05, + "loss": 4.3857, + "step": 29235 + }, + { + "epoch": 0.1738747740032353, + "grad_norm": 2.123032569885254, + "learning_rate": 4.636228930627892e-05, + "loss": 3.8423, + "step": 29236 + }, + { + "epoch": 0.17388072128651633, + "grad_norm": 2.0041513442993164, + "learning_rate": 4.636204666108902e-05, + "loss": 3.1621, + "step": 29237 + }, + { + "epoch": 0.17388666856979731, + "grad_norm": 1.6654435396194458, + "learning_rate": 4.636180400844185e-05, + "loss": 4.3272, + "step": 29238 + }, + { + "epoch": 0.1738926158530783, + "grad_norm": 1.553393006324768, + "learning_rate": 4.636156134833749e-05, + "loss": 4.9542, + "step": 29239 + }, + { + "epoch": 0.17389856313635932, + "grad_norm": 1.6511328220367432, + "learning_rate": 4.6361318680776035e-05, + "loss": 5.0055, + "step": 29240 + }, + { + "epoch": 0.1739045104196403, + "grad_norm": 1.8133567571640015, + "learning_rate": 4.6361076005757554e-05, + "loss": 4.4575, + "step": 29241 + }, + { + "epoch": 0.1739104577029213, + "grad_norm": 2.6649341583251953, + "learning_rate": 4.636083332328215e-05, + "loss": 4.1054, + "step": 29242 + }, + { + "epoch": 0.1739164049862023, + "grad_norm": 2.676636219024658, + "learning_rate": 4.63605906333499e-05, + "loss": 3.5847, + "step": 29243 + }, + { + "epoch": 0.1739223522694833, + "grad_norm": 2.376490592956543, + "learning_rate": 4.636034793596089e-05, + "loss": 3.9051, + "step": 29244 + }, + { + "epoch": 0.1739282995527643, + "grad_norm": 1.6567094326019287, + "learning_rate": 4.63601052311152e-05, + "loss": 5.1711, + "step": 29245 + }, + { + "epoch": 0.1739342468360453, + "grad_norm": 1.981115698814392, + "learning_rate": 4.6359862518812924e-05, + "loss": 3.8426, + "step": 29246 + }, + { + "epoch": 0.1739401941193263, + "grad_norm": 1.640690565109253, + "learning_rate": 4.6359619799054136e-05, + "loss": 4.3196, + "step": 29247 + }, + { + "epoch": 0.17394614140260728, + "grad_norm": 1.6027098894119263, + "learning_rate": 4.635937707183892e-05, + "loss": 5.2091, + "step": 29248 + }, + { + "epoch": 0.1739520886858883, + "grad_norm": 1.732526183128357, + "learning_rate": 4.6359134337167375e-05, + "loss": 5.0799, + "step": 29249 + }, + { + "epoch": 0.17395803596916928, + "grad_norm": 1.7720987796783447, + "learning_rate": 4.635889159503957e-05, + "loss": 4.9359, + "step": 29250 + }, + { + "epoch": 0.17396398325245027, + "grad_norm": 1.60392427444458, + "learning_rate": 4.63586488454556e-05, + "loss": 4.8213, + "step": 29251 + }, + { + "epoch": 0.1739699305357313, + "grad_norm": 1.4416741132736206, + "learning_rate": 4.635840608841555e-05, + "loss": 5.1283, + "step": 29252 + }, + { + "epoch": 0.17397587781901228, + "grad_norm": 1.9322450160980225, + "learning_rate": 4.63581633239195e-05, + "loss": 4.4477, + "step": 29253 + }, + { + "epoch": 0.17398182510229326, + "grad_norm": 1.661475658416748, + "learning_rate": 4.635792055196753e-05, + "loss": 4.6993, + "step": 29254 + }, + { + "epoch": 0.17398777238557428, + "grad_norm": 1.7771600484848022, + "learning_rate": 4.635767777255973e-05, + "loss": 4.4883, + "step": 29255 + }, + { + "epoch": 0.17399371966885527, + "grad_norm": 1.8131498098373413, + "learning_rate": 4.635743498569619e-05, + "loss": 3.9214, + "step": 29256 + }, + { + "epoch": 0.17399966695213626, + "grad_norm": 1.6624927520751953, + "learning_rate": 4.635719219137699e-05, + "loss": 4.9492, + "step": 29257 + }, + { + "epoch": 0.17400561423541727, + "grad_norm": 2.7123286724090576, + "learning_rate": 4.6356949389602214e-05, + "loss": 4.7048, + "step": 29258 + }, + { + "epoch": 0.17401156151869826, + "grad_norm": 2.078057050704956, + "learning_rate": 4.6356706580371945e-05, + "loss": 4.5294, + "step": 29259 + }, + { + "epoch": 0.17401750880197925, + "grad_norm": 1.738935947418213, + "learning_rate": 4.6356463763686275e-05, + "loss": 4.7332, + "step": 29260 + }, + { + "epoch": 0.17402345608526026, + "grad_norm": 1.8803629875183105, + "learning_rate": 4.635622093954528e-05, + "loss": 4.9347, + "step": 29261 + }, + { + "epoch": 0.17402940336854125, + "grad_norm": 1.3738025426864624, + "learning_rate": 4.635597810794905e-05, + "loss": 5.4709, + "step": 29262 + }, + { + "epoch": 0.17403535065182224, + "grad_norm": 1.6917965412139893, + "learning_rate": 4.635573526889767e-05, + "loss": 4.5494, + "step": 29263 + }, + { + "epoch": 0.17404129793510326, + "grad_norm": 1.9916536808013916, + "learning_rate": 4.6355492422391226e-05, + "loss": 4.4302, + "step": 29264 + }, + { + "epoch": 0.17404724521838424, + "grad_norm": 1.8959016799926758, + "learning_rate": 4.63552495684298e-05, + "loss": 4.1595, + "step": 29265 + }, + { + "epoch": 0.17405319250166523, + "grad_norm": 1.7730271816253662, + "learning_rate": 4.635500670701347e-05, + "loss": 4.6212, + "step": 29266 + }, + { + "epoch": 0.17405913978494625, + "grad_norm": 1.9785410165786743, + "learning_rate": 4.635476383814233e-05, + "loss": 4.6885, + "step": 29267 + }, + { + "epoch": 0.17406508706822724, + "grad_norm": 1.915924310684204, + "learning_rate": 4.6354520961816475e-05, + "loss": 4.4186, + "step": 29268 + }, + { + "epoch": 0.17407103435150822, + "grad_norm": 1.6227480173110962, + "learning_rate": 4.6354278078035964e-05, + "loss": 4.6483, + "step": 29269 + }, + { + "epoch": 0.17407698163478924, + "grad_norm": 1.6679190397262573, + "learning_rate": 4.635403518680089e-05, + "loss": 4.9393, + "step": 29270 + }, + { + "epoch": 0.17408292891807023, + "grad_norm": 1.3380484580993652, + "learning_rate": 4.6353792288111353e-05, + "loss": 5.1539, + "step": 29271 + }, + { + "epoch": 0.17408887620135122, + "grad_norm": 1.3670740127563477, + "learning_rate": 4.635354938196743e-05, + "loss": 5.1949, + "step": 29272 + }, + { + "epoch": 0.17409482348463223, + "grad_norm": 1.288189172744751, + "learning_rate": 4.63533064683692e-05, + "loss": 5.4657, + "step": 29273 + }, + { + "epoch": 0.17410077076791322, + "grad_norm": 1.4686154127120972, + "learning_rate": 4.635306354731675e-05, + "loss": 5.5222, + "step": 29274 + }, + { + "epoch": 0.1741067180511942, + "grad_norm": 1.4154938459396362, + "learning_rate": 4.635282061881017e-05, + "loss": 5.261, + "step": 29275 + }, + { + "epoch": 0.17411266533447523, + "grad_norm": 1.7723246812820435, + "learning_rate": 4.635257768284953e-05, + "loss": 5.1817, + "step": 29276 + }, + { + "epoch": 0.1741186126177562, + "grad_norm": 1.7621451616287231, + "learning_rate": 4.635233473943494e-05, + "loss": 5.2426, + "step": 29277 + }, + { + "epoch": 0.1741245599010372, + "grad_norm": 1.6899840831756592, + "learning_rate": 4.6352091788566466e-05, + "loss": 4.5392, + "step": 29278 + }, + { + "epoch": 0.17413050718431822, + "grad_norm": 1.7704182863235474, + "learning_rate": 4.6351848830244195e-05, + "loss": 4.4345, + "step": 29279 + }, + { + "epoch": 0.1741364544675992, + "grad_norm": 1.9371867179870605, + "learning_rate": 4.635160586446821e-05, + "loss": 4.1621, + "step": 29280 + }, + { + "epoch": 0.1741424017508802, + "grad_norm": 1.771759271621704, + "learning_rate": 4.63513628912386e-05, + "loss": 4.7001, + "step": 29281 + }, + { + "epoch": 0.17414834903416118, + "grad_norm": 2.212144136428833, + "learning_rate": 4.635111991055546e-05, + "loss": 3.7101, + "step": 29282 + }, + { + "epoch": 0.1741542963174422, + "grad_norm": 2.0476841926574707, + "learning_rate": 4.6350876922418864e-05, + "loss": 3.4412, + "step": 29283 + }, + { + "epoch": 0.17416024360072319, + "grad_norm": 1.849636197090149, + "learning_rate": 4.635063392682889e-05, + "loss": 4.553, + "step": 29284 + }, + { + "epoch": 0.17416619088400417, + "grad_norm": 1.9307612180709839, + "learning_rate": 4.6350390923785634e-05, + "loss": 3.7483, + "step": 29285 + }, + { + "epoch": 0.1741721381672852, + "grad_norm": 1.9862045049667358, + "learning_rate": 4.6350147913289176e-05, + "loss": 4.3754, + "step": 29286 + }, + { + "epoch": 0.17417808545056618, + "grad_norm": 1.7079651355743408, + "learning_rate": 4.63499048953396e-05, + "loss": 5.3671, + "step": 29287 + }, + { + "epoch": 0.17418403273384717, + "grad_norm": 1.8182214498519897, + "learning_rate": 4.6349661869937e-05, + "loss": 5.148, + "step": 29288 + }, + { + "epoch": 0.17418998001712818, + "grad_norm": 1.7571437358856201, + "learning_rate": 4.6349418837081445e-05, + "loss": 4.4641, + "step": 29289 + }, + { + "epoch": 0.17419592730040917, + "grad_norm": 1.6432558298110962, + "learning_rate": 4.6349175796773026e-05, + "loss": 4.6966, + "step": 29290 + }, + { + "epoch": 0.17420187458369016, + "grad_norm": 1.729112148284912, + "learning_rate": 4.634893274901184e-05, + "loss": 4.6125, + "step": 29291 + }, + { + "epoch": 0.17420782186697117, + "grad_norm": 1.5376940965652466, + "learning_rate": 4.6348689693797954e-05, + "loss": 4.4921, + "step": 29292 + }, + { + "epoch": 0.17421376915025216, + "grad_norm": 1.8997972011566162, + "learning_rate": 4.634844663113147e-05, + "loss": 4.4163, + "step": 29293 + }, + { + "epoch": 0.17421971643353315, + "grad_norm": 1.6643134355545044, + "learning_rate": 4.634820356101246e-05, + "loss": 4.7624, + "step": 29294 + }, + { + "epoch": 0.17422566371681417, + "grad_norm": 1.4758678674697876, + "learning_rate": 4.6347960483441013e-05, + "loss": 5.3261, + "step": 29295 + }, + { + "epoch": 0.17423161100009515, + "grad_norm": 1.7518540620803833, + "learning_rate": 4.6347717398417203e-05, + "loss": 4.4916, + "step": 29296 + }, + { + "epoch": 0.17423755828337614, + "grad_norm": 1.6143438816070557, + "learning_rate": 4.634747430594114e-05, + "loss": 4.336, + "step": 29297 + }, + { + "epoch": 0.17424350556665716, + "grad_norm": 1.6077839136123657, + "learning_rate": 4.634723120601289e-05, + "loss": 4.5486, + "step": 29298 + }, + { + "epoch": 0.17424945284993815, + "grad_norm": 1.9146685600280762, + "learning_rate": 4.634698809863254e-05, + "loss": 5.1115, + "step": 29299 + }, + { + "epoch": 0.17425540013321913, + "grad_norm": 1.6625542640686035, + "learning_rate": 4.634674498380018e-05, + "loss": 4.653, + "step": 29300 + }, + { + "epoch": 0.17426134741650015, + "grad_norm": 1.7577245235443115, + "learning_rate": 4.634650186151589e-05, + "loss": 4.8305, + "step": 29301 + }, + { + "epoch": 0.17426729469978114, + "grad_norm": 1.5614792108535767, + "learning_rate": 4.6346258731779755e-05, + "loss": 4.8553, + "step": 29302 + }, + { + "epoch": 0.17427324198306213, + "grad_norm": 1.5734407901763916, + "learning_rate": 4.634601559459186e-05, + "loss": 4.9925, + "step": 29303 + }, + { + "epoch": 0.17427918926634314, + "grad_norm": 1.855974555015564, + "learning_rate": 4.6345772449952293e-05, + "loss": 4.7862, + "step": 29304 + }, + { + "epoch": 0.17428513654962413, + "grad_norm": 2.7702269554138184, + "learning_rate": 4.6345529297861146e-05, + "loss": 3.766, + "step": 29305 + }, + { + "epoch": 0.17429108383290512, + "grad_norm": 2.7763569355010986, + "learning_rate": 4.634528613831848e-05, + "loss": 3.343, + "step": 29306 + }, + { + "epoch": 0.17429703111618614, + "grad_norm": 3.1644763946533203, + "learning_rate": 4.6345042971324406e-05, + "loss": 3.4067, + "step": 29307 + }, + { + "epoch": 0.17430297839946712, + "grad_norm": 2.487724781036377, + "learning_rate": 4.6344799796878996e-05, + "loss": 3.226, + "step": 29308 + }, + { + "epoch": 0.1743089256827481, + "grad_norm": 2.340416431427002, + "learning_rate": 4.634455661498234e-05, + "loss": 3.2899, + "step": 29309 + }, + { + "epoch": 0.17431487296602913, + "grad_norm": 1.6526988744735718, + "learning_rate": 4.634431342563451e-05, + "loss": 4.9105, + "step": 29310 + }, + { + "epoch": 0.17432082024931012, + "grad_norm": 2.876229763031006, + "learning_rate": 4.6344070228835614e-05, + "loss": 3.4319, + "step": 29311 + }, + { + "epoch": 0.1743267675325911, + "grad_norm": 2.176748037338257, + "learning_rate": 4.6343827024585716e-05, + "loss": 3.2444, + "step": 29312 + }, + { + "epoch": 0.17433271481587212, + "grad_norm": 2.6688148975372314, + "learning_rate": 4.6343583812884904e-05, + "loss": 3.3417, + "step": 29313 + }, + { + "epoch": 0.1743386620991531, + "grad_norm": 2.5572376251220703, + "learning_rate": 4.634334059373328e-05, + "loss": 4.0048, + "step": 29314 + }, + { + "epoch": 0.1743446093824341, + "grad_norm": 2.3012688159942627, + "learning_rate": 4.6343097367130905e-05, + "loss": 3.5363, + "step": 29315 + }, + { + "epoch": 0.1743505566657151, + "grad_norm": 1.9870244264602661, + "learning_rate": 4.6342854133077875e-05, + "loss": 4.0843, + "step": 29316 + }, + { + "epoch": 0.1743565039489961, + "grad_norm": 2.538632392883301, + "learning_rate": 4.6342610891574276e-05, + "loss": 3.3337, + "step": 29317 + }, + { + "epoch": 0.1743624512322771, + "grad_norm": 2.8932511806488037, + "learning_rate": 4.63423676426202e-05, + "loss": 3.2887, + "step": 29318 + }, + { + "epoch": 0.1743683985155581, + "grad_norm": 2.55438494682312, + "learning_rate": 4.634212438621572e-05, + "loss": 3.6218, + "step": 29319 + }, + { + "epoch": 0.1743743457988391, + "grad_norm": 2.505047082901001, + "learning_rate": 4.634188112236092e-05, + "loss": 3.182, + "step": 29320 + }, + { + "epoch": 0.17438029308212008, + "grad_norm": 2.8068132400512695, + "learning_rate": 4.63416378510559e-05, + "loss": 3.5654, + "step": 29321 + }, + { + "epoch": 0.1743862403654011, + "grad_norm": 1.9296205043792725, + "learning_rate": 4.6341394572300725e-05, + "loss": 4.492, + "step": 29322 + }, + { + "epoch": 0.17439218764868208, + "grad_norm": 1.6537705659866333, + "learning_rate": 4.63411512860955e-05, + "loss": 5.0017, + "step": 29323 + }, + { + "epoch": 0.17439813493196307, + "grad_norm": 1.8064972162246704, + "learning_rate": 4.634090799244028e-05, + "loss": 4.5991, + "step": 29324 + }, + { + "epoch": 0.1744040822152441, + "grad_norm": 1.7944536209106445, + "learning_rate": 4.634066469133519e-05, + "loss": 4.6755, + "step": 29325 + }, + { + "epoch": 0.17441002949852508, + "grad_norm": 2.222592830657959, + "learning_rate": 4.6340421382780286e-05, + "loss": 4.3161, + "step": 29326 + }, + { + "epoch": 0.17441597678180606, + "grad_norm": 2.1058638095855713, + "learning_rate": 4.634017806677567e-05, + "loss": 4.077, + "step": 29327 + }, + { + "epoch": 0.17442192406508708, + "grad_norm": 1.8931814432144165, + "learning_rate": 4.63399347433214e-05, + "loss": 4.2838, + "step": 29328 + }, + { + "epoch": 0.17442787134836807, + "grad_norm": 1.7035942077636719, + "learning_rate": 4.6339691412417586e-05, + "loss": 4.8623, + "step": 29329 + }, + { + "epoch": 0.17443381863164906, + "grad_norm": 1.7701468467712402, + "learning_rate": 4.6339448074064314e-05, + "loss": 4.9063, + "step": 29330 + }, + { + "epoch": 0.17443976591493007, + "grad_norm": 1.7608574628829956, + "learning_rate": 4.633920472826165e-05, + "loss": 4.42, + "step": 29331 + }, + { + "epoch": 0.17444571319821106, + "grad_norm": 2.5129191875457764, + "learning_rate": 4.633896137500971e-05, + "loss": 3.2521, + "step": 29332 + }, + { + "epoch": 0.17445166048149205, + "grad_norm": 1.477378010749817, + "learning_rate": 4.6338718014308534e-05, + "loss": 5.6654, + "step": 29333 + }, + { + "epoch": 0.17445760776477306, + "grad_norm": 1.6242940425872803, + "learning_rate": 4.633847464615825e-05, + "loss": 5.392, + "step": 29334 + }, + { + "epoch": 0.17446355504805405, + "grad_norm": 1.827919602394104, + "learning_rate": 4.633823127055892e-05, + "loss": 4.8818, + "step": 29335 + }, + { + "epoch": 0.17446950233133504, + "grad_norm": 1.6197007894515991, + "learning_rate": 4.633798788751063e-05, + "loss": 4.983, + "step": 29336 + }, + { + "epoch": 0.17447544961461606, + "grad_norm": 1.703899621963501, + "learning_rate": 4.633774449701347e-05, + "loss": 4.9122, + "step": 29337 + }, + { + "epoch": 0.17448139689789705, + "grad_norm": 1.7812259197235107, + "learning_rate": 4.633750109906753e-05, + "loss": 4.6429, + "step": 29338 + }, + { + "epoch": 0.17448734418117803, + "grad_norm": 1.6351381540298462, + "learning_rate": 4.633725769367288e-05, + "loss": 4.9078, + "step": 29339 + }, + { + "epoch": 0.17449329146445902, + "grad_norm": 1.7403061389923096, + "learning_rate": 4.633701428082962e-05, + "loss": 4.6946, + "step": 29340 + }, + { + "epoch": 0.17449923874774004, + "grad_norm": 1.8006681203842163, + "learning_rate": 4.633677086053783e-05, + "loss": 4.2984, + "step": 29341 + }, + { + "epoch": 0.17450518603102103, + "grad_norm": 1.7105704545974731, + "learning_rate": 4.633652743279759e-05, + "loss": 4.4426, + "step": 29342 + }, + { + "epoch": 0.174511133314302, + "grad_norm": 1.7440415620803833, + "learning_rate": 4.6336283997608984e-05, + "loss": 4.4029, + "step": 29343 + }, + { + "epoch": 0.17451708059758303, + "grad_norm": 1.7197996377944946, + "learning_rate": 4.633604055497211e-05, + "loss": 4.263, + "step": 29344 + }, + { + "epoch": 0.17452302788086402, + "grad_norm": 1.7282319068908691, + "learning_rate": 4.633579710488704e-05, + "loss": 4.546, + "step": 29345 + }, + { + "epoch": 0.174528975164145, + "grad_norm": 1.7449449300765991, + "learning_rate": 4.633555364735387e-05, + "loss": 5.1083, + "step": 29346 + }, + { + "epoch": 0.17453492244742602, + "grad_norm": 1.645507574081421, + "learning_rate": 4.633531018237267e-05, + "loss": 4.1636, + "step": 29347 + }, + { + "epoch": 0.174540869730707, + "grad_norm": 1.671286702156067, + "learning_rate": 4.6335066709943534e-05, + "loss": 4.5991, + "step": 29348 + }, + { + "epoch": 0.174546817013988, + "grad_norm": 1.5074694156646729, + "learning_rate": 4.6334823230066554e-05, + "loss": 4.5064, + "step": 29349 + }, + { + "epoch": 0.17455276429726901, + "grad_norm": 1.7285078763961792, + "learning_rate": 4.63345797427418e-05, + "loss": 4.561, + "step": 29350 + }, + { + "epoch": 0.17455871158055, + "grad_norm": 1.9212089776992798, + "learning_rate": 4.6334336247969376e-05, + "loss": 4.2444, + "step": 29351 + }, + { + "epoch": 0.174564658863831, + "grad_norm": 1.6223878860473633, + "learning_rate": 4.633409274574935e-05, + "loss": 4.8405, + "step": 29352 + }, + { + "epoch": 0.174570606147112, + "grad_norm": 1.7474267482757568, + "learning_rate": 4.6333849236081805e-05, + "loss": 4.5651, + "step": 29353 + }, + { + "epoch": 0.174576553430393, + "grad_norm": 1.6735780239105225, + "learning_rate": 4.6333605718966844e-05, + "loss": 4.1536, + "step": 29354 + }, + { + "epoch": 0.17458250071367398, + "grad_norm": 1.7096998691558838, + "learning_rate": 4.633336219440453e-05, + "loss": 4.4034, + "step": 29355 + }, + { + "epoch": 0.174588447996955, + "grad_norm": 1.7881802320480347, + "learning_rate": 4.633311866239497e-05, + "loss": 4.308, + "step": 29356 + }, + { + "epoch": 0.174594395280236, + "grad_norm": 1.4776397943496704, + "learning_rate": 4.6332875122938236e-05, + "loss": 5.1879, + "step": 29357 + }, + { + "epoch": 0.17460034256351697, + "grad_norm": 1.499626636505127, + "learning_rate": 4.6332631576034414e-05, + "loss": 5.1217, + "step": 29358 + }, + { + "epoch": 0.174606289846798, + "grad_norm": 1.5779564380645752, + "learning_rate": 4.6332388021683594e-05, + "loss": 5.1155, + "step": 29359 + }, + { + "epoch": 0.17461223713007898, + "grad_norm": 1.5778738260269165, + "learning_rate": 4.633214445988585e-05, + "loss": 5.0889, + "step": 29360 + }, + { + "epoch": 0.17461818441335997, + "grad_norm": 1.4342097043991089, + "learning_rate": 4.633190089064128e-05, + "loss": 5.1313, + "step": 29361 + }, + { + "epoch": 0.17462413169664098, + "grad_norm": 1.977306604385376, + "learning_rate": 4.6331657313949975e-05, + "loss": 4.3384, + "step": 29362 + }, + { + "epoch": 0.17463007897992197, + "grad_norm": 1.7359813451766968, + "learning_rate": 4.633141372981199e-05, + "loss": 4.9874, + "step": 29363 + }, + { + "epoch": 0.17463602626320296, + "grad_norm": 1.5922671556472778, + "learning_rate": 4.6331170138227435e-05, + "loss": 5.1194, + "step": 29364 + }, + { + "epoch": 0.17464197354648398, + "grad_norm": 1.8139041662216187, + "learning_rate": 4.63309265391964e-05, + "loss": 5.0046, + "step": 29365 + }, + { + "epoch": 0.17464792082976496, + "grad_norm": 1.6782366037368774, + "learning_rate": 4.633068293271895e-05, + "loss": 5.056, + "step": 29366 + }, + { + "epoch": 0.17465386811304595, + "grad_norm": 1.6051324605941772, + "learning_rate": 4.6330439318795174e-05, + "loss": 5.1002, + "step": 29367 + }, + { + "epoch": 0.17465981539632697, + "grad_norm": 1.6109590530395508, + "learning_rate": 4.633019569742517e-05, + "loss": 4.9802, + "step": 29368 + }, + { + "epoch": 0.17466576267960796, + "grad_norm": 1.5063222646713257, + "learning_rate": 4.6329952068609005e-05, + "loss": 5.5857, + "step": 29369 + }, + { + "epoch": 0.17467170996288894, + "grad_norm": 1.6874276399612427, + "learning_rate": 4.632970843234678e-05, + "loss": 5.161, + "step": 29370 + }, + { + "epoch": 0.17467765724616996, + "grad_norm": 1.8858634233474731, + "learning_rate": 4.6329464788638576e-05, + "loss": 4.6397, + "step": 29371 + }, + { + "epoch": 0.17468360452945095, + "grad_norm": 2.004140853881836, + "learning_rate": 4.632922113748447e-05, + "loss": 4.5306, + "step": 29372 + }, + { + "epoch": 0.17468955181273194, + "grad_norm": 1.278494954109192, + "learning_rate": 4.632897747888456e-05, + "loss": 5.032, + "step": 29373 + }, + { + "epoch": 0.17469549909601295, + "grad_norm": 1.7012786865234375, + "learning_rate": 4.6328733812838925e-05, + "loss": 5.1362, + "step": 29374 + }, + { + "epoch": 0.17470144637929394, + "grad_norm": 1.6155195236206055, + "learning_rate": 4.632849013934765e-05, + "loss": 5.4839, + "step": 29375 + }, + { + "epoch": 0.17470739366257493, + "grad_norm": 1.3312060832977295, + "learning_rate": 4.6328246458410816e-05, + "loss": 5.521, + "step": 29376 + }, + { + "epoch": 0.17471334094585594, + "grad_norm": 1.6347986459732056, + "learning_rate": 4.632800277002851e-05, + "loss": 5.1883, + "step": 29377 + }, + { + "epoch": 0.17471928822913693, + "grad_norm": 1.6213163137435913, + "learning_rate": 4.632775907420082e-05, + "loss": 5.1724, + "step": 29378 + }, + { + "epoch": 0.17472523551241792, + "grad_norm": 2.0514700412750244, + "learning_rate": 4.632751537092783e-05, + "loss": 3.6934, + "step": 29379 + }, + { + "epoch": 0.17473118279569894, + "grad_norm": 1.4713187217712402, + "learning_rate": 4.6327271660209626e-05, + "loss": 4.7456, + "step": 29380 + }, + { + "epoch": 0.17473713007897992, + "grad_norm": 1.5584750175476074, + "learning_rate": 4.6327027942046286e-05, + "loss": 5.0259, + "step": 29381 + }, + { + "epoch": 0.1747430773622609, + "grad_norm": 1.7405140399932861, + "learning_rate": 4.632678421643791e-05, + "loss": 5.1115, + "step": 29382 + }, + { + "epoch": 0.17474902464554193, + "grad_norm": 1.7233058214187622, + "learning_rate": 4.632654048338457e-05, + "loss": 5.3849, + "step": 29383 + }, + { + "epoch": 0.17475497192882292, + "grad_norm": 1.7387725114822388, + "learning_rate": 4.6326296742886356e-05, + "loss": 5.4367, + "step": 29384 + }, + { + "epoch": 0.1747609192121039, + "grad_norm": 1.7022291421890259, + "learning_rate": 4.632605299494335e-05, + "loss": 5.1317, + "step": 29385 + }, + { + "epoch": 0.17476686649538492, + "grad_norm": 1.7683387994766235, + "learning_rate": 4.632580923955564e-05, + "loss": 4.4575, + "step": 29386 + }, + { + "epoch": 0.1747728137786659, + "grad_norm": 1.4611074924468994, + "learning_rate": 4.632556547672331e-05, + "loss": 4.7676, + "step": 29387 + }, + { + "epoch": 0.1747787610619469, + "grad_norm": 1.9123033285140991, + "learning_rate": 4.632532170644644e-05, + "loss": 4.966, + "step": 29388 + }, + { + "epoch": 0.1747847083452279, + "grad_norm": 1.857445478439331, + "learning_rate": 4.632507792872513e-05, + "loss": 4.0338, + "step": 29389 + }, + { + "epoch": 0.1747906556285089, + "grad_norm": 2.620339870452881, + "learning_rate": 4.632483414355945e-05, + "loss": 3.4506, + "step": 29390 + }, + { + "epoch": 0.1747966029117899, + "grad_norm": 2.141939401626587, + "learning_rate": 4.6324590350949494e-05, + "loss": 4.516, + "step": 29391 + }, + { + "epoch": 0.1748025501950709, + "grad_norm": 1.5560227632522583, + "learning_rate": 4.632434655089535e-05, + "loss": 4.8785, + "step": 29392 + }, + { + "epoch": 0.1748084974783519, + "grad_norm": 1.640221357345581, + "learning_rate": 4.632410274339708e-05, + "loss": 4.9614, + "step": 29393 + }, + { + "epoch": 0.17481444476163288, + "grad_norm": 1.6104960441589355, + "learning_rate": 4.63238589284548e-05, + "loss": 4.7536, + "step": 29394 + }, + { + "epoch": 0.1748203920449139, + "grad_norm": 1.599259853363037, + "learning_rate": 4.6323615106068575e-05, + "loss": 5.0939, + "step": 29395 + }, + { + "epoch": 0.17482633932819489, + "grad_norm": 1.630430817604065, + "learning_rate": 4.6323371276238496e-05, + "loss": 4.8851, + "step": 29396 + }, + { + "epoch": 0.17483228661147587, + "grad_norm": 1.6281993389129639, + "learning_rate": 4.632312743896465e-05, + "loss": 4.8152, + "step": 29397 + }, + { + "epoch": 0.17483823389475686, + "grad_norm": 1.7055253982543945, + "learning_rate": 4.632288359424712e-05, + "loss": 4.2515, + "step": 29398 + }, + { + "epoch": 0.17484418117803788, + "grad_norm": 1.739365577697754, + "learning_rate": 4.6322639742085995e-05, + "loss": 4.5137, + "step": 29399 + }, + { + "epoch": 0.17485012846131887, + "grad_norm": 1.7686853408813477, + "learning_rate": 4.632239588248135e-05, + "loss": 5.307, + "step": 29400 + }, + { + "epoch": 0.17485607574459985, + "grad_norm": 1.369730830192566, + "learning_rate": 4.632215201543328e-05, + "loss": 5.3096, + "step": 29401 + }, + { + "epoch": 0.17486202302788087, + "grad_norm": 1.6965676546096802, + "learning_rate": 4.6321908140941874e-05, + "loss": 4.9252, + "step": 29402 + }, + { + "epoch": 0.17486797031116186, + "grad_norm": 1.797540307044983, + "learning_rate": 4.63216642590072e-05, + "loss": 4.4397, + "step": 29403 + }, + { + "epoch": 0.17487391759444285, + "grad_norm": 1.7250994443893433, + "learning_rate": 4.632142036962936e-05, + "loss": 4.4416, + "step": 29404 + }, + { + "epoch": 0.17487986487772386, + "grad_norm": 1.649828314781189, + "learning_rate": 4.632117647280843e-05, + "loss": 4.4497, + "step": 29405 + }, + { + "epoch": 0.17488581216100485, + "grad_norm": 1.7073628902435303, + "learning_rate": 4.632093256854449e-05, + "loss": 4.3074, + "step": 29406 + }, + { + "epoch": 0.17489175944428584, + "grad_norm": 1.6241555213928223, + "learning_rate": 4.632068865683765e-05, + "loss": 4.1219, + "step": 29407 + }, + { + "epoch": 0.17489770672756685, + "grad_norm": 1.356092929840088, + "learning_rate": 4.6320444737687965e-05, + "loss": 4.5548, + "step": 29408 + }, + { + "epoch": 0.17490365401084784, + "grad_norm": 1.5094983577728271, + "learning_rate": 4.632020081109554e-05, + "loss": 5.0598, + "step": 29409 + }, + { + "epoch": 0.17490960129412883, + "grad_norm": 1.596183180809021, + "learning_rate": 4.6319956877060445e-05, + "loss": 5.0795, + "step": 29410 + }, + { + "epoch": 0.17491554857740985, + "grad_norm": 1.7887545824050903, + "learning_rate": 4.6319712935582784e-05, + "loss": 4.9287, + "step": 29411 + }, + { + "epoch": 0.17492149586069083, + "grad_norm": 1.4806302785873413, + "learning_rate": 4.631946898666262e-05, + "loss": 5.0627, + "step": 29412 + }, + { + "epoch": 0.17492744314397182, + "grad_norm": 1.5581897497177124, + "learning_rate": 4.631922503030005e-05, + "loss": 5.2001, + "step": 29413 + }, + { + "epoch": 0.17493339042725284, + "grad_norm": 1.614473819732666, + "learning_rate": 4.631898106649517e-05, + "loss": 4.396, + "step": 29414 + }, + { + "epoch": 0.17493933771053383, + "grad_norm": 1.9394686222076416, + "learning_rate": 4.6318737095248044e-05, + "loss": 3.9614, + "step": 29415 + }, + { + "epoch": 0.17494528499381481, + "grad_norm": 1.6874741315841675, + "learning_rate": 4.631849311655877e-05, + "loss": 4.4714, + "step": 29416 + }, + { + "epoch": 0.17495123227709583, + "grad_norm": 1.8840105533599854, + "learning_rate": 4.6318249130427435e-05, + "loss": 4.51, + "step": 29417 + }, + { + "epoch": 0.17495717956037682, + "grad_norm": 1.7205270528793335, + "learning_rate": 4.631800513685412e-05, + "loss": 4.554, + "step": 29418 + }, + { + "epoch": 0.1749631268436578, + "grad_norm": 1.449798583984375, + "learning_rate": 4.6317761135838896e-05, + "loss": 5.0114, + "step": 29419 + }, + { + "epoch": 0.17496907412693882, + "grad_norm": 1.6449236869812012, + "learning_rate": 4.631751712738187e-05, + "loss": 5.7704, + "step": 29420 + }, + { + "epoch": 0.1749750214102198, + "grad_norm": 1.5362746715545654, + "learning_rate": 4.631727311148312e-05, + "loss": 5.6398, + "step": 29421 + }, + { + "epoch": 0.1749809686935008, + "grad_norm": 1.6383920907974243, + "learning_rate": 4.6317029088142726e-05, + "loss": 5.2901, + "step": 29422 + }, + { + "epoch": 0.17498691597678181, + "grad_norm": 1.8682830333709717, + "learning_rate": 4.631678505736079e-05, + "loss": 4.2822, + "step": 29423 + }, + { + "epoch": 0.1749928632600628, + "grad_norm": 1.9640558958053589, + "learning_rate": 4.631654101913737e-05, + "loss": 4.121, + "step": 29424 + }, + { + "epoch": 0.1749988105433438, + "grad_norm": 1.569744348526001, + "learning_rate": 4.6316296973472576e-05, + "loss": 4.3937, + "step": 29425 + }, + { + "epoch": 0.1750047578266248, + "grad_norm": 1.524356484413147, + "learning_rate": 4.6316052920366475e-05, + "loss": 4.8107, + "step": 29426 + }, + { + "epoch": 0.1750107051099058, + "grad_norm": 1.7055494785308838, + "learning_rate": 4.6315808859819164e-05, + "loss": 4.8751, + "step": 29427 + }, + { + "epoch": 0.17501665239318678, + "grad_norm": 1.683262586593628, + "learning_rate": 4.631556479183072e-05, + "loss": 5.4053, + "step": 29428 + }, + { + "epoch": 0.1750225996764678, + "grad_norm": 1.7124066352844238, + "learning_rate": 4.6315320716401244e-05, + "loss": 5.0109, + "step": 29429 + }, + { + "epoch": 0.1750285469597488, + "grad_norm": 1.6951466798782349, + "learning_rate": 4.63150766335308e-05, + "loss": 5.4747, + "step": 29430 + }, + { + "epoch": 0.17503449424302978, + "grad_norm": 1.5457607507705688, + "learning_rate": 4.631483254321949e-05, + "loss": 4.8729, + "step": 29431 + }, + { + "epoch": 0.1750404415263108, + "grad_norm": 1.5366050004959106, + "learning_rate": 4.6314588445467386e-05, + "loss": 5.0268, + "step": 29432 + }, + { + "epoch": 0.17504638880959178, + "grad_norm": 1.6533615589141846, + "learning_rate": 4.6314344340274573e-05, + "loss": 4.7626, + "step": 29433 + }, + { + "epoch": 0.17505233609287277, + "grad_norm": 1.559486746788025, + "learning_rate": 4.631410022764115e-05, + "loss": 5.0673, + "step": 29434 + }, + { + "epoch": 0.17505828337615378, + "grad_norm": 1.534456729888916, + "learning_rate": 4.63138561075672e-05, + "loss": 5.5142, + "step": 29435 + }, + { + "epoch": 0.17506423065943477, + "grad_norm": 1.641667366027832, + "learning_rate": 4.63136119800528e-05, + "loss": 4.7032, + "step": 29436 + }, + { + "epoch": 0.17507017794271576, + "grad_norm": 1.4128551483154297, + "learning_rate": 4.631336784509803e-05, + "loss": 4.8777, + "step": 29437 + }, + { + "epoch": 0.17507612522599678, + "grad_norm": 1.4912710189819336, + "learning_rate": 4.6313123702703e-05, + "loss": 4.866, + "step": 29438 + }, + { + "epoch": 0.17508207250927776, + "grad_norm": 1.381341576576233, + "learning_rate": 4.631287955286776e-05, + "loss": 4.6116, + "step": 29439 + }, + { + "epoch": 0.17508801979255875, + "grad_norm": 1.4270753860473633, + "learning_rate": 4.631263539559243e-05, + "loss": 5.0519, + "step": 29440 + }, + { + "epoch": 0.17509396707583977, + "grad_norm": 1.4962128400802612, + "learning_rate": 4.6312391230877074e-05, + "loss": 4.6934, + "step": 29441 + }, + { + "epoch": 0.17509991435912076, + "grad_norm": 1.3959366083145142, + "learning_rate": 4.631214705872178e-05, + "loss": 4.9172, + "step": 29442 + }, + { + "epoch": 0.17510586164240174, + "grad_norm": 1.5014355182647705, + "learning_rate": 4.631190287912663e-05, + "loss": 4.8429, + "step": 29443 + }, + { + "epoch": 0.17511180892568276, + "grad_norm": 1.584879994392395, + "learning_rate": 4.631165869209172e-05, + "loss": 5.1186, + "step": 29444 + }, + { + "epoch": 0.17511775620896375, + "grad_norm": 1.6547553539276123, + "learning_rate": 4.6311414497617135e-05, + "loss": 4.9739, + "step": 29445 + }, + { + "epoch": 0.17512370349224474, + "grad_norm": 1.4584704637527466, + "learning_rate": 4.631117029570295e-05, + "loss": 4.927, + "step": 29446 + }, + { + "epoch": 0.17512965077552575, + "grad_norm": 1.5092477798461914, + "learning_rate": 4.631092608634926e-05, + "loss": 4.9163, + "step": 29447 + }, + { + "epoch": 0.17513559805880674, + "grad_norm": 1.466023564338684, + "learning_rate": 4.631068186955614e-05, + "loss": 4.9867, + "step": 29448 + }, + { + "epoch": 0.17514154534208773, + "grad_norm": 1.8561779260635376, + "learning_rate": 4.6310437645323676e-05, + "loss": 4.6118, + "step": 29449 + }, + { + "epoch": 0.17514749262536874, + "grad_norm": 2.27844500541687, + "learning_rate": 4.631019341365197e-05, + "loss": 4.4978, + "step": 29450 + }, + { + "epoch": 0.17515343990864973, + "grad_norm": 1.7874199151992798, + "learning_rate": 4.6309949174541096e-05, + "loss": 3.7357, + "step": 29451 + }, + { + "epoch": 0.17515938719193072, + "grad_norm": 1.6950316429138184, + "learning_rate": 4.6309704927991136e-05, + "loss": 4.1866, + "step": 29452 + }, + { + "epoch": 0.17516533447521174, + "grad_norm": 1.6692928075790405, + "learning_rate": 4.630946067400217e-05, + "loss": 3.9566, + "step": 29453 + }, + { + "epoch": 0.17517128175849273, + "grad_norm": 1.680684208869934, + "learning_rate": 4.63092164125743e-05, + "loss": 4.0473, + "step": 29454 + }, + { + "epoch": 0.1751772290417737, + "grad_norm": 1.7636792659759521, + "learning_rate": 4.6308972143707606e-05, + "loss": 4.161, + "step": 29455 + }, + { + "epoch": 0.1751831763250547, + "grad_norm": 1.7277029752731323, + "learning_rate": 4.6308727867402165e-05, + "loss": 4.6943, + "step": 29456 + }, + { + "epoch": 0.17518912360833572, + "grad_norm": 1.7087599039077759, + "learning_rate": 4.630848358365807e-05, + "loss": 4.9239, + "step": 29457 + }, + { + "epoch": 0.1751950708916167, + "grad_norm": 1.8207015991210938, + "learning_rate": 4.63082392924754e-05, + "loss": 4.8358, + "step": 29458 + }, + { + "epoch": 0.1752010181748977, + "grad_norm": 1.9595861434936523, + "learning_rate": 4.6307994993854245e-05, + "loss": 4.3975, + "step": 29459 + }, + { + "epoch": 0.1752069654581787, + "grad_norm": 2.330233335494995, + "learning_rate": 4.630775068779469e-05, + "loss": 3.9516, + "step": 29460 + }, + { + "epoch": 0.1752129127414597, + "grad_norm": 1.801896572113037, + "learning_rate": 4.630750637429682e-05, + "loss": 4.3272, + "step": 29461 + }, + { + "epoch": 0.17521886002474069, + "grad_norm": 1.8079783916473389, + "learning_rate": 4.630726205336071e-05, + "loss": 4.4698, + "step": 29462 + }, + { + "epoch": 0.1752248073080217, + "grad_norm": 1.7742640972137451, + "learning_rate": 4.6307017724986466e-05, + "loss": 4.5466, + "step": 29463 + }, + { + "epoch": 0.1752307545913027, + "grad_norm": 1.5979267358779907, + "learning_rate": 4.6306773389174154e-05, + "loss": 4.497, + "step": 29464 + }, + { + "epoch": 0.17523670187458368, + "grad_norm": 1.6667109727859497, + "learning_rate": 4.630652904592388e-05, + "loss": 5.338, + "step": 29465 + }, + { + "epoch": 0.1752426491578647, + "grad_norm": 1.5170248746871948, + "learning_rate": 4.63062846952357e-05, + "loss": 4.6994, + "step": 29466 + }, + { + "epoch": 0.17524859644114568, + "grad_norm": 1.597468376159668, + "learning_rate": 4.630604033710974e-05, + "loss": 4.1865, + "step": 29467 + }, + { + "epoch": 0.17525454372442667, + "grad_norm": 1.638096809387207, + "learning_rate": 4.630579597154604e-05, + "loss": 4.2936, + "step": 29468 + }, + { + "epoch": 0.17526049100770769, + "grad_norm": 1.5512175559997559, + "learning_rate": 4.630555159854472e-05, + "loss": 4.6191, + "step": 29469 + }, + { + "epoch": 0.17526643829098867, + "grad_norm": 1.57890784740448, + "learning_rate": 4.630530721810584e-05, + "loss": 4.9381, + "step": 29470 + }, + { + "epoch": 0.17527238557426966, + "grad_norm": 1.7156378030776978, + "learning_rate": 4.63050628302295e-05, + "loss": 5.022, + "step": 29471 + }, + { + "epoch": 0.17527833285755068, + "grad_norm": 1.6688953638076782, + "learning_rate": 4.630481843491579e-05, + "loss": 4.5509, + "step": 29472 + }, + { + "epoch": 0.17528428014083167, + "grad_norm": 1.835450530052185, + "learning_rate": 4.630457403216478e-05, + "loss": 4.6413, + "step": 29473 + }, + { + "epoch": 0.17529022742411265, + "grad_norm": 1.2935006618499756, + "learning_rate": 4.6304329621976574e-05, + "loss": 4.9823, + "step": 29474 + }, + { + "epoch": 0.17529617470739367, + "grad_norm": 2.152981758117676, + "learning_rate": 4.6304085204351234e-05, + "loss": 4.6183, + "step": 29475 + }, + { + "epoch": 0.17530212199067466, + "grad_norm": 1.6258760690689087, + "learning_rate": 4.630384077928886e-05, + "loss": 4.9874, + "step": 29476 + }, + { + "epoch": 0.17530806927395565, + "grad_norm": 1.6755950450897217, + "learning_rate": 4.630359634678954e-05, + "loss": 5.089, + "step": 29477 + }, + { + "epoch": 0.17531401655723666, + "grad_norm": 1.7208611965179443, + "learning_rate": 4.6303351906853355e-05, + "loss": 5.3393, + "step": 29478 + }, + { + "epoch": 0.17531996384051765, + "grad_norm": 1.5461162328720093, + "learning_rate": 4.630310745948039e-05, + "loss": 5.2263, + "step": 29479 + }, + { + "epoch": 0.17532591112379864, + "grad_norm": 1.9592080116271973, + "learning_rate": 4.630286300467073e-05, + "loss": 4.1235, + "step": 29480 + }, + { + "epoch": 0.17533185840707965, + "grad_norm": 1.8409465551376343, + "learning_rate": 4.630261854242446e-05, + "loss": 4.8235, + "step": 29481 + }, + { + "epoch": 0.17533780569036064, + "grad_norm": 1.6198770999908447, + "learning_rate": 4.630237407274166e-05, + "loss": 5.5198, + "step": 29482 + }, + { + "epoch": 0.17534375297364163, + "grad_norm": 1.692572832107544, + "learning_rate": 4.630212959562243e-05, + "loss": 4.8526, + "step": 29483 + }, + { + "epoch": 0.17534970025692265, + "grad_norm": 1.7479051351547241, + "learning_rate": 4.6301885111066847e-05, + "loss": 4.8774, + "step": 29484 + }, + { + "epoch": 0.17535564754020364, + "grad_norm": 2.0946943759918213, + "learning_rate": 4.630164061907499e-05, + "loss": 4.4918, + "step": 29485 + }, + { + "epoch": 0.17536159482348462, + "grad_norm": 1.702415943145752, + "learning_rate": 4.6301396119646954e-05, + "loss": 4.424, + "step": 29486 + }, + { + "epoch": 0.17536754210676564, + "grad_norm": 1.4786335229873657, + "learning_rate": 4.630115161278282e-05, + "loss": 5.5655, + "step": 29487 + }, + { + "epoch": 0.17537348939004663, + "grad_norm": 1.5471251010894775, + "learning_rate": 4.630090709848267e-05, + "loss": 5.2839, + "step": 29488 + }, + { + "epoch": 0.17537943667332762, + "grad_norm": 1.8128043413162231, + "learning_rate": 4.6300662576746595e-05, + "loss": 4.7968, + "step": 29489 + }, + { + "epoch": 0.17538538395660863, + "grad_norm": 1.6280453205108643, + "learning_rate": 4.630041804757469e-05, + "loss": 4.7266, + "step": 29490 + }, + { + "epoch": 0.17539133123988962, + "grad_norm": 1.6138848066329956, + "learning_rate": 4.6300173510967015e-05, + "loss": 4.3718, + "step": 29491 + }, + { + "epoch": 0.1753972785231706, + "grad_norm": 1.6392838954925537, + "learning_rate": 4.6299928966923675e-05, + "loss": 4.7491, + "step": 29492 + }, + { + "epoch": 0.17540322580645162, + "grad_norm": 1.722277283668518, + "learning_rate": 4.629968441544475e-05, + "loss": 4.4053, + "step": 29493 + }, + { + "epoch": 0.1754091730897326, + "grad_norm": 1.4803645610809326, + "learning_rate": 4.629943985653032e-05, + "loss": 4.5624, + "step": 29494 + }, + { + "epoch": 0.1754151203730136, + "grad_norm": 1.696871042251587, + "learning_rate": 4.629919529018048e-05, + "loss": 4.2274, + "step": 29495 + }, + { + "epoch": 0.17542106765629462, + "grad_norm": 2.0104081630706787, + "learning_rate": 4.629895071639531e-05, + "loss": 4.954, + "step": 29496 + }, + { + "epoch": 0.1754270149395756, + "grad_norm": 1.91762113571167, + "learning_rate": 4.62987061351749e-05, + "loss": 4.5869, + "step": 29497 + }, + { + "epoch": 0.1754329622228566, + "grad_norm": 2.0672197341918945, + "learning_rate": 4.629846154651932e-05, + "loss": 4.3838, + "step": 29498 + }, + { + "epoch": 0.1754389095061376, + "grad_norm": 1.9841183423995972, + "learning_rate": 4.629821695042869e-05, + "loss": 5.2067, + "step": 29499 + }, + { + "epoch": 0.1754448567894186, + "grad_norm": 1.850253701210022, + "learning_rate": 4.6297972346903055e-05, + "loss": 4.7302, + "step": 29500 + }, + { + "epoch": 0.17545080407269958, + "grad_norm": 1.4990947246551514, + "learning_rate": 4.629772773594252e-05, + "loss": 4.9005, + "step": 29501 + }, + { + "epoch": 0.1754567513559806, + "grad_norm": 1.5953363180160522, + "learning_rate": 4.629748311754717e-05, + "loss": 4.9025, + "step": 29502 + }, + { + "epoch": 0.1754626986392616, + "grad_norm": 1.5136396884918213, + "learning_rate": 4.6297238491717085e-05, + "loss": 4.835, + "step": 29503 + }, + { + "epoch": 0.17546864592254258, + "grad_norm": 1.7335329055786133, + "learning_rate": 4.6296993858452356e-05, + "loss": 4.7231, + "step": 29504 + }, + { + "epoch": 0.1754745932058236, + "grad_norm": 1.5969070196151733, + "learning_rate": 4.629674921775307e-05, + "loss": 4.7903, + "step": 29505 + }, + { + "epoch": 0.17548054048910458, + "grad_norm": 1.7393018007278442, + "learning_rate": 4.62965045696193e-05, + "loss": 5.2468, + "step": 29506 + }, + { + "epoch": 0.17548648777238557, + "grad_norm": 1.4993494749069214, + "learning_rate": 4.629625991405116e-05, + "loss": 5.0639, + "step": 29507 + }, + { + "epoch": 0.17549243505566658, + "grad_norm": 1.559507966041565, + "learning_rate": 4.62960152510487e-05, + "loss": 5.2718, + "step": 29508 + }, + { + "epoch": 0.17549838233894757, + "grad_norm": 1.6528722047805786, + "learning_rate": 4.629577058061202e-05, + "loss": 5.0881, + "step": 29509 + }, + { + "epoch": 0.17550432962222856, + "grad_norm": 1.5357880592346191, + "learning_rate": 4.629552590274121e-05, + "loss": 4.5841, + "step": 29510 + }, + { + "epoch": 0.17551027690550958, + "grad_norm": 1.7293065786361694, + "learning_rate": 4.629528121743635e-05, + "loss": 4.6718, + "step": 29511 + }, + { + "epoch": 0.17551622418879056, + "grad_norm": 2.699164390563965, + "learning_rate": 4.6295036524697536e-05, + "loss": 4.1491, + "step": 29512 + }, + { + "epoch": 0.17552217147207155, + "grad_norm": 1.5221933126449585, + "learning_rate": 4.629479182452483e-05, + "loss": 4.8606, + "step": 29513 + }, + { + "epoch": 0.17552811875535254, + "grad_norm": 1.5474234819412231, + "learning_rate": 4.629454711691835e-05, + "loss": 4.7198, + "step": 29514 + }, + { + "epoch": 0.17553406603863356, + "grad_norm": 1.5748153924942017, + "learning_rate": 4.629430240187816e-05, + "loss": 4.9429, + "step": 29515 + }, + { + "epoch": 0.17554001332191455, + "grad_norm": 1.5812437534332275, + "learning_rate": 4.629405767940434e-05, + "loss": 4.7219, + "step": 29516 + }, + { + "epoch": 0.17554596060519553, + "grad_norm": 1.572482943534851, + "learning_rate": 4.629381294949698e-05, + "loss": 4.9071, + "step": 29517 + }, + { + "epoch": 0.17555190788847655, + "grad_norm": 1.8683935403823853, + "learning_rate": 4.629356821215618e-05, + "loss": 4.539, + "step": 29518 + }, + { + "epoch": 0.17555785517175754, + "grad_norm": 3.200904607772827, + "learning_rate": 4.629332346738201e-05, + "loss": 4.2734, + "step": 29519 + }, + { + "epoch": 0.17556380245503853, + "grad_norm": 2.051896572113037, + "learning_rate": 4.629307871517457e-05, + "loss": 5.0986, + "step": 29520 + }, + { + "epoch": 0.17556974973831954, + "grad_norm": 1.7927826642990112, + "learning_rate": 4.6292833955533926e-05, + "loss": 4.6581, + "step": 29521 + }, + { + "epoch": 0.17557569702160053, + "grad_norm": 1.6184303760528564, + "learning_rate": 4.629258918846018e-05, + "loss": 4.8106, + "step": 29522 + }, + { + "epoch": 0.17558164430488152, + "grad_norm": 1.4969747066497803, + "learning_rate": 4.62923444139534e-05, + "loss": 5.2787, + "step": 29523 + }, + { + "epoch": 0.17558759158816253, + "grad_norm": 1.471805214881897, + "learning_rate": 4.6292099632013695e-05, + "loss": 5.3599, + "step": 29524 + }, + { + "epoch": 0.17559353887144352, + "grad_norm": 1.3968273401260376, + "learning_rate": 4.629185484264113e-05, + "loss": 4.9754, + "step": 29525 + }, + { + "epoch": 0.1755994861547245, + "grad_norm": 1.627172589302063, + "learning_rate": 4.629161004583581e-05, + "loss": 4.3703, + "step": 29526 + }, + { + "epoch": 0.17560543343800553, + "grad_norm": 1.5334340333938599, + "learning_rate": 4.62913652415978e-05, + "loss": 4.8447, + "step": 29527 + }, + { + "epoch": 0.17561138072128651, + "grad_norm": 1.552454948425293, + "learning_rate": 4.6291120429927194e-05, + "loss": 4.823, + "step": 29528 + }, + { + "epoch": 0.1756173280045675, + "grad_norm": 1.4378019571304321, + "learning_rate": 4.629087561082408e-05, + "loss": 5.019, + "step": 29529 + }, + { + "epoch": 0.17562327528784852, + "grad_norm": 1.513752818107605, + "learning_rate": 4.6290630784288544e-05, + "loss": 4.7146, + "step": 29530 + }, + { + "epoch": 0.1756292225711295, + "grad_norm": 1.5130308866500854, + "learning_rate": 4.629038595032066e-05, + "loss": 4.5687, + "step": 29531 + }, + { + "epoch": 0.1756351698544105, + "grad_norm": 1.6177191734313965, + "learning_rate": 4.6290141108920534e-05, + "loss": 4.49, + "step": 29532 + }, + { + "epoch": 0.1756411171376915, + "grad_norm": 1.6133641004562378, + "learning_rate": 4.628989626008823e-05, + "loss": 4.6966, + "step": 29533 + }, + { + "epoch": 0.1756470644209725, + "grad_norm": 1.5740238428115845, + "learning_rate": 4.628965140382385e-05, + "loss": 4.8149, + "step": 29534 + }, + { + "epoch": 0.1756530117042535, + "grad_norm": 1.4787334203720093, + "learning_rate": 4.6289406540127466e-05, + "loss": 4.7759, + "step": 29535 + }, + { + "epoch": 0.1756589589875345, + "grad_norm": 1.5558816194534302, + "learning_rate": 4.628916166899917e-05, + "loss": 5.0831, + "step": 29536 + }, + { + "epoch": 0.1756649062708155, + "grad_norm": 1.3332229852676392, + "learning_rate": 4.628891679043905e-05, + "loss": 4.9866, + "step": 29537 + }, + { + "epoch": 0.17567085355409648, + "grad_norm": 1.5539603233337402, + "learning_rate": 4.6288671904447195e-05, + "loss": 4.96, + "step": 29538 + }, + { + "epoch": 0.1756768008373775, + "grad_norm": 1.4858051538467407, + "learning_rate": 4.628842701102368e-05, + "loss": 4.9161, + "step": 29539 + }, + { + "epoch": 0.17568274812065848, + "grad_norm": 1.6222684383392334, + "learning_rate": 4.62881821101686e-05, + "loss": 4.9328, + "step": 29540 + }, + { + "epoch": 0.17568869540393947, + "grad_norm": 1.6516577005386353, + "learning_rate": 4.6287937201882025e-05, + "loss": 4.7577, + "step": 29541 + }, + { + "epoch": 0.1756946426872205, + "grad_norm": 1.7349826097488403, + "learning_rate": 4.6287692286164056e-05, + "loss": 4.5927, + "step": 29542 + }, + { + "epoch": 0.17570058997050148, + "grad_norm": 1.4014586210250854, + "learning_rate": 4.6287447363014776e-05, + "loss": 4.8835, + "step": 29543 + }, + { + "epoch": 0.17570653725378246, + "grad_norm": 1.5037766695022583, + "learning_rate": 4.6287202432434265e-05, + "loss": 4.9221, + "step": 29544 + }, + { + "epoch": 0.17571248453706348, + "grad_norm": 1.5138404369354248, + "learning_rate": 4.628695749442261e-05, + "loss": 4.5962, + "step": 29545 + }, + { + "epoch": 0.17571843182034447, + "grad_norm": 1.5634385347366333, + "learning_rate": 4.6286712548979907e-05, + "loss": 5.2178, + "step": 29546 + }, + { + "epoch": 0.17572437910362546, + "grad_norm": 1.6049305200576782, + "learning_rate": 4.628646759610622e-05, + "loss": 5.1726, + "step": 29547 + }, + { + "epoch": 0.17573032638690647, + "grad_norm": 1.6202237606048584, + "learning_rate": 4.628622263580166e-05, + "loss": 4.8598, + "step": 29548 + }, + { + "epoch": 0.17573627367018746, + "grad_norm": 1.4801881313323975, + "learning_rate": 4.628597766806629e-05, + "loss": 4.9164, + "step": 29549 + }, + { + "epoch": 0.17574222095346845, + "grad_norm": 1.5014153718948364, + "learning_rate": 4.628573269290021e-05, + "loss": 4.3787, + "step": 29550 + }, + { + "epoch": 0.17574816823674946, + "grad_norm": 1.5468509197235107, + "learning_rate": 4.62854877103035e-05, + "loss": 4.9178, + "step": 29551 + }, + { + "epoch": 0.17575411552003045, + "grad_norm": 1.4622128009796143, + "learning_rate": 4.628524272027624e-05, + "loss": 4.8219, + "step": 29552 + }, + { + "epoch": 0.17576006280331144, + "grad_norm": 1.6060843467712402, + "learning_rate": 4.628499772281853e-05, + "loss": 4.869, + "step": 29553 + }, + { + "epoch": 0.17576601008659246, + "grad_norm": 1.7407468557357788, + "learning_rate": 4.628475271793044e-05, + "loss": 4.7171, + "step": 29554 + }, + { + "epoch": 0.17577195736987344, + "grad_norm": 1.5435397624969482, + "learning_rate": 4.628450770561207e-05, + "loss": 4.6929, + "step": 29555 + }, + { + "epoch": 0.17577790465315443, + "grad_norm": 1.5211220979690552, + "learning_rate": 4.628426268586349e-05, + "loss": 4.6811, + "step": 29556 + }, + { + "epoch": 0.17578385193643545, + "grad_norm": 1.3432724475860596, + "learning_rate": 4.6284017658684796e-05, + "loss": 4.8499, + "step": 29557 + }, + { + "epoch": 0.17578979921971644, + "grad_norm": 1.6592440605163574, + "learning_rate": 4.628377262407608e-05, + "loss": 4.4278, + "step": 29558 + }, + { + "epoch": 0.17579574650299742, + "grad_norm": 1.5314370393753052, + "learning_rate": 4.6283527582037415e-05, + "loss": 5.0514, + "step": 29559 + }, + { + "epoch": 0.17580169378627844, + "grad_norm": 1.8792412281036377, + "learning_rate": 4.6283282532568884e-05, + "loss": 4.3201, + "step": 29560 + }, + { + "epoch": 0.17580764106955943, + "grad_norm": 1.726537823677063, + "learning_rate": 4.628303747567058e-05, + "loss": 4.4524, + "step": 29561 + }, + { + "epoch": 0.17581358835284042, + "grad_norm": 1.5222519636154175, + "learning_rate": 4.628279241134259e-05, + "loss": 4.7075, + "step": 29562 + }, + { + "epoch": 0.17581953563612143, + "grad_norm": 1.6036890745162964, + "learning_rate": 4.6282547339585e-05, + "loss": 4.6974, + "step": 29563 + }, + { + "epoch": 0.17582548291940242, + "grad_norm": 1.6295074224472046, + "learning_rate": 4.628230226039789e-05, + "loss": 4.4021, + "step": 29564 + }, + { + "epoch": 0.1758314302026834, + "grad_norm": 2.6549839973449707, + "learning_rate": 4.628205717378135e-05, + "loss": 3.8639, + "step": 29565 + }, + { + "epoch": 0.17583737748596442, + "grad_norm": 2.752455234527588, + "learning_rate": 4.628181207973547e-05, + "loss": 3.745, + "step": 29566 + }, + { + "epoch": 0.1758433247692454, + "grad_norm": 2.4327378273010254, + "learning_rate": 4.6281566978260314e-05, + "loss": 3.4675, + "step": 29567 + }, + { + "epoch": 0.1758492720525264, + "grad_norm": 2.2893288135528564, + "learning_rate": 4.628132186935599e-05, + "loss": 3.4223, + "step": 29568 + }, + { + "epoch": 0.17585521933580742, + "grad_norm": 2.6514787673950195, + "learning_rate": 4.628107675302258e-05, + "loss": 3.6378, + "step": 29569 + }, + { + "epoch": 0.1758611666190884, + "grad_norm": 1.501243233680725, + "learning_rate": 4.628083162926016e-05, + "loss": 4.9402, + "step": 29570 + }, + { + "epoch": 0.1758671139023694, + "grad_norm": 2.5400307178497314, + "learning_rate": 4.6280586498068824e-05, + "loss": 3.9097, + "step": 29571 + }, + { + "epoch": 0.17587306118565038, + "grad_norm": 3.0715131759643555, + "learning_rate": 4.628034135944865e-05, + "loss": 3.8084, + "step": 29572 + }, + { + "epoch": 0.1758790084689314, + "grad_norm": 2.320291042327881, + "learning_rate": 4.628009621339974e-05, + "loss": 3.743, + "step": 29573 + }, + { + "epoch": 0.17588495575221239, + "grad_norm": 2.653029441833496, + "learning_rate": 4.627985105992216e-05, + "loss": 3.5106, + "step": 29574 + }, + { + "epoch": 0.17589090303549337, + "grad_norm": 2.5279390811920166, + "learning_rate": 4.6279605899016007e-05, + "loss": 3.6074, + "step": 29575 + }, + { + "epoch": 0.1758968503187744, + "grad_norm": 2.6520915031433105, + "learning_rate": 4.6279360730681364e-05, + "loss": 3.5559, + "step": 29576 + }, + { + "epoch": 0.17590279760205538, + "grad_norm": 1.5509624481201172, + "learning_rate": 4.627911555491831e-05, + "loss": 4.8954, + "step": 29577 + }, + { + "epoch": 0.17590874488533637, + "grad_norm": 2.044759750366211, + "learning_rate": 4.627887037172695e-05, + "loss": 3.7401, + "step": 29578 + }, + { + "epoch": 0.17591469216861738, + "grad_norm": 2.512817144393921, + "learning_rate": 4.6278625181107336e-05, + "loss": 3.3898, + "step": 29579 + }, + { + "epoch": 0.17592063945189837, + "grad_norm": 2.3796133995056152, + "learning_rate": 4.627837998305959e-05, + "loss": 3.5277, + "step": 29580 + }, + { + "epoch": 0.17592658673517936, + "grad_norm": 2.6435763835906982, + "learning_rate": 4.6278134777583774e-05, + "loss": 3.6078, + "step": 29581 + }, + { + "epoch": 0.17593253401846037, + "grad_norm": 1.9326622486114502, + "learning_rate": 4.6277889564679986e-05, + "loss": 4.3017, + "step": 29582 + }, + { + "epoch": 0.17593848130174136, + "grad_norm": 2.0501444339752197, + "learning_rate": 4.62776443443483e-05, + "loss": 4.2909, + "step": 29583 + }, + { + "epoch": 0.17594442858502235, + "grad_norm": 2.1053049564361572, + "learning_rate": 4.6277399116588816e-05, + "loss": 3.4639, + "step": 29584 + }, + { + "epoch": 0.17595037586830337, + "grad_norm": 2.2305474281311035, + "learning_rate": 4.627715388140161e-05, + "loss": 3.6551, + "step": 29585 + }, + { + "epoch": 0.17595632315158435, + "grad_norm": 2.328937292098999, + "learning_rate": 4.6276908638786766e-05, + "loss": 3.2528, + "step": 29586 + }, + { + "epoch": 0.17596227043486534, + "grad_norm": 3.2846357822418213, + "learning_rate": 4.627666338874437e-05, + "loss": 3.7581, + "step": 29587 + }, + { + "epoch": 0.17596821771814636, + "grad_norm": 2.145848512649536, + "learning_rate": 4.627641813127452e-05, + "loss": 3.6736, + "step": 29588 + }, + { + "epoch": 0.17597416500142735, + "grad_norm": 2.367215871810913, + "learning_rate": 4.627617286637729e-05, + "loss": 3.3043, + "step": 29589 + }, + { + "epoch": 0.17598011228470833, + "grad_norm": 2.314913272857666, + "learning_rate": 4.627592759405276e-05, + "loss": 3.3871, + "step": 29590 + }, + { + "epoch": 0.17598605956798935, + "grad_norm": 2.3208961486816406, + "learning_rate": 4.627568231430103e-05, + "loss": 3.3427, + "step": 29591 + }, + { + "epoch": 0.17599200685127034, + "grad_norm": 2.2277936935424805, + "learning_rate": 4.627543702712218e-05, + "loss": 3.4393, + "step": 29592 + }, + { + "epoch": 0.17599795413455133, + "grad_norm": 2.6522443294525146, + "learning_rate": 4.627519173251629e-05, + "loss": 3.4554, + "step": 29593 + }, + { + "epoch": 0.17600390141783234, + "grad_norm": 1.6064810752868652, + "learning_rate": 4.6274946430483454e-05, + "loss": 5.2487, + "step": 29594 + }, + { + "epoch": 0.17600984870111333, + "grad_norm": 2.488597869873047, + "learning_rate": 4.627470112102375e-05, + "loss": 3.8507, + "step": 29595 + }, + { + "epoch": 0.17601579598439432, + "grad_norm": 2.4922280311584473, + "learning_rate": 4.627445580413727e-05, + "loss": 3.901, + "step": 29596 + }, + { + "epoch": 0.17602174326767533, + "grad_norm": 2.5545835494995117, + "learning_rate": 4.62742104798241e-05, + "loss": 3.7327, + "step": 29597 + }, + { + "epoch": 0.17602769055095632, + "grad_norm": 2.674534559249878, + "learning_rate": 4.627396514808432e-05, + "loss": 3.6846, + "step": 29598 + }, + { + "epoch": 0.1760336378342373, + "grad_norm": 2.51946759223938, + "learning_rate": 4.627371980891801e-05, + "loss": 3.504, + "step": 29599 + }, + { + "epoch": 0.17603958511751833, + "grad_norm": 1.584033489227295, + "learning_rate": 4.6273474462325286e-05, + "loss": 4.9813, + "step": 29600 + }, + { + "epoch": 0.17604553240079931, + "grad_norm": 1.5800496339797974, + "learning_rate": 4.6273229108306195e-05, + "loss": 5.6641, + "step": 29601 + }, + { + "epoch": 0.1760514796840803, + "grad_norm": 1.5663219690322876, + "learning_rate": 4.627298374686084e-05, + "loss": 5.6077, + "step": 29602 + }, + { + "epoch": 0.17605742696736132, + "grad_norm": 1.5315394401550293, + "learning_rate": 4.627273837798932e-05, + "loss": 5.3647, + "step": 29603 + }, + { + "epoch": 0.1760633742506423, + "grad_norm": 1.6742242574691772, + "learning_rate": 4.627249300169169e-05, + "loss": 5.2066, + "step": 29604 + }, + { + "epoch": 0.1760693215339233, + "grad_norm": 1.6399402618408203, + "learning_rate": 4.627224761796806e-05, + "loss": 5.0195, + "step": 29605 + }, + { + "epoch": 0.1760752688172043, + "grad_norm": 1.7168047428131104, + "learning_rate": 4.627200222681851e-05, + "loss": 5.3056, + "step": 29606 + }, + { + "epoch": 0.1760812161004853, + "grad_norm": 1.6890738010406494, + "learning_rate": 4.627175682824312e-05, + "loss": 5.1811, + "step": 29607 + }, + { + "epoch": 0.1760871633837663, + "grad_norm": 1.7669142484664917, + "learning_rate": 4.627151142224198e-05, + "loss": 5.2459, + "step": 29608 + }, + { + "epoch": 0.1760931106670473, + "grad_norm": 1.4989925622940063, + "learning_rate": 4.627126600881517e-05, + "loss": 5.092, + "step": 29609 + }, + { + "epoch": 0.1760990579503283, + "grad_norm": 1.4541029930114746, + "learning_rate": 4.627102058796279e-05, + "loss": 5.0705, + "step": 29610 + }, + { + "epoch": 0.17610500523360928, + "grad_norm": 2.039470911026001, + "learning_rate": 4.627077515968492e-05, + "loss": 4.1636, + "step": 29611 + }, + { + "epoch": 0.1761109525168903, + "grad_norm": 3.1738526821136475, + "learning_rate": 4.6270529723981635e-05, + "loss": 2.1184, + "step": 29612 + }, + { + "epoch": 0.17611689980017128, + "grad_norm": 1.7128700017929077, + "learning_rate": 4.6270284280853024e-05, + "loss": 5.7775, + "step": 29613 + }, + { + "epoch": 0.17612284708345227, + "grad_norm": 1.7605071067810059, + "learning_rate": 4.627003883029918e-05, + "loss": 5.6578, + "step": 29614 + }, + { + "epoch": 0.1761287943667333, + "grad_norm": 1.6726125478744507, + "learning_rate": 4.6269793372320186e-05, + "loss": 5.3621, + "step": 29615 + }, + { + "epoch": 0.17613474165001428, + "grad_norm": 1.6924387216567993, + "learning_rate": 4.626954790691612e-05, + "loss": 5.2866, + "step": 29616 + }, + { + "epoch": 0.17614068893329526, + "grad_norm": 1.705000400543213, + "learning_rate": 4.6269302434087085e-05, + "loss": 5.009, + "step": 29617 + }, + { + "epoch": 0.17614663621657628, + "grad_norm": 1.6577481031417847, + "learning_rate": 4.6269056953833157e-05, + "loss": 5.4761, + "step": 29618 + }, + { + "epoch": 0.17615258349985727, + "grad_norm": 1.635854721069336, + "learning_rate": 4.6268811466154415e-05, + "loss": 5.3624, + "step": 29619 + }, + { + "epoch": 0.17615853078313826, + "grad_norm": 1.6608973741531372, + "learning_rate": 4.626856597105095e-05, + "loss": 5.4398, + "step": 29620 + }, + { + "epoch": 0.17616447806641927, + "grad_norm": 1.5028787851333618, + "learning_rate": 4.626832046852285e-05, + "loss": 5.3025, + "step": 29621 + }, + { + "epoch": 0.17617042534970026, + "grad_norm": 2.694622278213501, + "learning_rate": 4.62680749585702e-05, + "loss": 2.389, + "step": 29622 + }, + { + "epoch": 0.17617637263298125, + "grad_norm": 1.6484723091125488, + "learning_rate": 4.6267829441193086e-05, + "loss": 4.871, + "step": 29623 + }, + { + "epoch": 0.17618231991626226, + "grad_norm": 1.6752315759658813, + "learning_rate": 4.626758391639159e-05, + "loss": 5.1089, + "step": 29624 + }, + { + "epoch": 0.17618826719954325, + "grad_norm": 1.8165408372879028, + "learning_rate": 4.62673383841658e-05, + "loss": 5.1408, + "step": 29625 + }, + { + "epoch": 0.17619421448282424, + "grad_norm": 1.7555296421051025, + "learning_rate": 4.6267092844515804e-05, + "loss": 5.2196, + "step": 29626 + }, + { + "epoch": 0.17620016176610526, + "grad_norm": 1.6462376117706299, + "learning_rate": 4.626684729744168e-05, + "loss": 5.2127, + "step": 29627 + }, + { + "epoch": 0.17620610904938624, + "grad_norm": 1.7403783798217773, + "learning_rate": 4.6266601742943526e-05, + "loss": 5.1372, + "step": 29628 + }, + { + "epoch": 0.17621205633266723, + "grad_norm": 2.6064391136169434, + "learning_rate": 4.626635618102142e-05, + "loss": 5.3963, + "step": 29629 + }, + { + "epoch": 0.17621800361594822, + "grad_norm": 1.4826772212982178, + "learning_rate": 4.6266110611675446e-05, + "loss": 5.7049, + "step": 29630 + }, + { + "epoch": 0.17622395089922924, + "grad_norm": 1.685837984085083, + "learning_rate": 4.62658650349057e-05, + "loss": 5.117, + "step": 29631 + }, + { + "epoch": 0.17622989818251023, + "grad_norm": 1.5930708646774292, + "learning_rate": 4.626561945071225e-05, + "loss": 5.1709, + "step": 29632 + }, + { + "epoch": 0.1762358454657912, + "grad_norm": 1.7052996158599854, + "learning_rate": 4.6265373859095197e-05, + "loss": 5.3743, + "step": 29633 + }, + { + "epoch": 0.17624179274907223, + "grad_norm": 1.9218865633010864, + "learning_rate": 4.626512826005462e-05, + "loss": 5.0207, + "step": 29634 + }, + { + "epoch": 0.17624774003235322, + "grad_norm": 2.1410880088806152, + "learning_rate": 4.62648826535906e-05, + "loss": 4.7898, + "step": 29635 + }, + { + "epoch": 0.1762536873156342, + "grad_norm": 3.278724431991577, + "learning_rate": 4.626463703970324e-05, + "loss": 3.7456, + "step": 29636 + }, + { + "epoch": 0.17625963459891522, + "grad_norm": 1.6557966470718384, + "learning_rate": 4.6264391418392615e-05, + "loss": 5.1905, + "step": 29637 + }, + { + "epoch": 0.1762655818821962, + "grad_norm": 1.3662563562393188, + "learning_rate": 4.6264145789658804e-05, + "loss": 5.2232, + "step": 29638 + }, + { + "epoch": 0.1762715291654772, + "grad_norm": 1.5638326406478882, + "learning_rate": 4.62639001535019e-05, + "loss": 5.0933, + "step": 29639 + }, + { + "epoch": 0.1762774764487582, + "grad_norm": 1.81962251663208, + "learning_rate": 4.6263654509921996e-05, + "loss": 4.6625, + "step": 29640 + }, + { + "epoch": 0.1762834237320392, + "grad_norm": 1.5421823263168335, + "learning_rate": 4.626340885891916e-05, + "loss": 5.0372, + "step": 29641 + }, + { + "epoch": 0.1762893710153202, + "grad_norm": 1.8756135702133179, + "learning_rate": 4.626316320049349e-05, + "loss": 5.224, + "step": 29642 + }, + { + "epoch": 0.1762953182986012, + "grad_norm": 1.617411494255066, + "learning_rate": 4.6262917534645076e-05, + "loss": 5.3449, + "step": 29643 + }, + { + "epoch": 0.1763012655818822, + "grad_norm": 1.3965401649475098, + "learning_rate": 4.626267186137399e-05, + "loss": 5.4929, + "step": 29644 + }, + { + "epoch": 0.17630721286516318, + "grad_norm": 1.4743956327438354, + "learning_rate": 4.626242618068033e-05, + "loss": 5.3105, + "step": 29645 + }, + { + "epoch": 0.1763131601484442, + "grad_norm": 1.5603059530258179, + "learning_rate": 4.626218049256417e-05, + "loss": 5.2059, + "step": 29646 + }, + { + "epoch": 0.17631910743172519, + "grad_norm": 1.5562357902526855, + "learning_rate": 4.626193479702561e-05, + "loss": 5.0752, + "step": 29647 + }, + { + "epoch": 0.17632505471500617, + "grad_norm": 1.4330555200576782, + "learning_rate": 4.6261689094064724e-05, + "loss": 5.0991, + "step": 29648 + }, + { + "epoch": 0.1763310019982872, + "grad_norm": 1.636109709739685, + "learning_rate": 4.62614433836816e-05, + "loss": 5.2, + "step": 29649 + }, + { + "epoch": 0.17633694928156818, + "grad_norm": 1.4994865655899048, + "learning_rate": 4.626119766587633e-05, + "loss": 5.4368, + "step": 29650 + }, + { + "epoch": 0.17634289656484917, + "grad_norm": 1.5928007364273071, + "learning_rate": 4.6260951940648996e-05, + "loss": 5.3432, + "step": 29651 + }, + { + "epoch": 0.17634884384813018, + "grad_norm": 2.4773452281951904, + "learning_rate": 4.626070620799968e-05, + "loss": 4.6023, + "step": 29652 + }, + { + "epoch": 0.17635479113141117, + "grad_norm": 1.4862966537475586, + "learning_rate": 4.626046046792847e-05, + "loss": 5.2271, + "step": 29653 + }, + { + "epoch": 0.17636073841469216, + "grad_norm": 1.659691333770752, + "learning_rate": 4.626021472043546e-05, + "loss": 5.1621, + "step": 29654 + }, + { + "epoch": 0.17636668569797317, + "grad_norm": 1.708454966545105, + "learning_rate": 4.625996896552073e-05, + "loss": 4.9272, + "step": 29655 + }, + { + "epoch": 0.17637263298125416, + "grad_norm": 1.7151225805282593, + "learning_rate": 4.625972320318435e-05, + "loss": 5.0272, + "step": 29656 + }, + { + "epoch": 0.17637858026453515, + "grad_norm": 1.635591983795166, + "learning_rate": 4.625947743342644e-05, + "loss": 5.1541, + "step": 29657 + }, + { + "epoch": 0.17638452754781617, + "grad_norm": 1.6878983974456787, + "learning_rate": 4.625923165624705e-05, + "loss": 5.1822, + "step": 29658 + }, + { + "epoch": 0.17639047483109715, + "grad_norm": 1.5905377864837646, + "learning_rate": 4.625898587164628e-05, + "loss": 4.9331, + "step": 29659 + }, + { + "epoch": 0.17639642211437814, + "grad_norm": 1.5988421440124512, + "learning_rate": 4.625874007962423e-05, + "loss": 4.811, + "step": 29660 + }, + { + "epoch": 0.17640236939765916, + "grad_norm": 1.725674033164978, + "learning_rate": 4.625849428018096e-05, + "loss": 4.95, + "step": 29661 + }, + { + "epoch": 0.17640831668094015, + "grad_norm": 1.6319259405136108, + "learning_rate": 4.625824847331658e-05, + "loss": 4.8133, + "step": 29662 + }, + { + "epoch": 0.17641426396422114, + "grad_norm": 1.6534069776535034, + "learning_rate": 4.625800265903116e-05, + "loss": 4.8914, + "step": 29663 + }, + { + "epoch": 0.17642021124750215, + "grad_norm": 1.6242649555206299, + "learning_rate": 4.6257756837324793e-05, + "loss": 5.1348, + "step": 29664 + }, + { + "epoch": 0.17642615853078314, + "grad_norm": 1.59992253780365, + "learning_rate": 4.625751100819757e-05, + "loss": 5.5775, + "step": 29665 + }, + { + "epoch": 0.17643210581406413, + "grad_norm": 1.8516936302185059, + "learning_rate": 4.625726517164956e-05, + "loss": 4.8874, + "step": 29666 + }, + { + "epoch": 0.17643805309734514, + "grad_norm": 2.0659658908843994, + "learning_rate": 4.625701932768086e-05, + "loss": 4.8295, + "step": 29667 + }, + { + "epoch": 0.17644400038062613, + "grad_norm": 1.914340615272522, + "learning_rate": 4.625677347629156e-05, + "loss": 4.8001, + "step": 29668 + }, + { + "epoch": 0.17644994766390712, + "grad_norm": 1.76264226436615, + "learning_rate": 4.6256527617481734e-05, + "loss": 5.0296, + "step": 29669 + }, + { + "epoch": 0.17645589494718814, + "grad_norm": 2.414245367050171, + "learning_rate": 4.625628175125147e-05, + "loss": 4.4596, + "step": 29670 + }, + { + "epoch": 0.17646184223046912, + "grad_norm": 2.4253740310668945, + "learning_rate": 4.625603587760087e-05, + "loss": 4.8557, + "step": 29671 + }, + { + "epoch": 0.1764677895137501, + "grad_norm": 1.5761579275131226, + "learning_rate": 4.6255789996529995e-05, + "loss": 5.3967, + "step": 29672 + }, + { + "epoch": 0.17647373679703113, + "grad_norm": 1.6232905387878418, + "learning_rate": 4.625554410803895e-05, + "loss": 5.2305, + "step": 29673 + }, + { + "epoch": 0.17647968408031212, + "grad_norm": 1.5074714422225952, + "learning_rate": 4.6255298212127806e-05, + "loss": 5.0091, + "step": 29674 + }, + { + "epoch": 0.1764856313635931, + "grad_norm": 1.4851216077804565, + "learning_rate": 4.625505230879667e-05, + "loss": 5.3812, + "step": 29675 + }, + { + "epoch": 0.17649157864687412, + "grad_norm": 1.5750563144683838, + "learning_rate": 4.62548063980456e-05, + "loss": 5.1194, + "step": 29676 + }, + { + "epoch": 0.1764975259301551, + "grad_norm": 1.6650339365005493, + "learning_rate": 4.625456047987471e-05, + "loss": 5.7083, + "step": 29677 + }, + { + "epoch": 0.1765034732134361, + "grad_norm": 1.6024653911590576, + "learning_rate": 4.625431455428407e-05, + "loss": 5.435, + "step": 29678 + }, + { + "epoch": 0.1765094204967171, + "grad_norm": 2.434255361557007, + "learning_rate": 4.625406862127376e-05, + "loss": 4.4856, + "step": 29679 + }, + { + "epoch": 0.1765153677799981, + "grad_norm": 2.248991012573242, + "learning_rate": 4.6253822680843885e-05, + "loss": 4.5724, + "step": 29680 + }, + { + "epoch": 0.1765213150632791, + "grad_norm": 2.187962293624878, + "learning_rate": 4.625357673299451e-05, + "loss": 4.7556, + "step": 29681 + }, + { + "epoch": 0.1765272623465601, + "grad_norm": 1.6530205011367798, + "learning_rate": 4.625333077772574e-05, + "loss": 5.1289, + "step": 29682 + }, + { + "epoch": 0.1765332096298411, + "grad_norm": 1.3826985359191895, + "learning_rate": 4.625308481503765e-05, + "loss": 5.2029, + "step": 29683 + }, + { + "epoch": 0.17653915691312208, + "grad_norm": 1.4573781490325928, + "learning_rate": 4.625283884493032e-05, + "loss": 5.1572, + "step": 29684 + }, + { + "epoch": 0.1765451041964031, + "grad_norm": 1.4935249090194702, + "learning_rate": 4.6252592867403856e-05, + "loss": 5.0828, + "step": 29685 + }, + { + "epoch": 0.17655105147968408, + "grad_norm": 1.6328359842300415, + "learning_rate": 4.625234688245832e-05, + "loss": 5.1604, + "step": 29686 + }, + { + "epoch": 0.17655699876296507, + "grad_norm": 1.4190014600753784, + "learning_rate": 4.6252100890093816e-05, + "loss": 4.9567, + "step": 29687 + }, + { + "epoch": 0.17656294604624606, + "grad_norm": 1.7209579944610596, + "learning_rate": 4.625185489031042e-05, + "loss": 4.412, + "step": 29688 + }, + { + "epoch": 0.17656889332952708, + "grad_norm": 1.5644607543945312, + "learning_rate": 4.625160888310822e-05, + "loss": 4.9651, + "step": 29689 + }, + { + "epoch": 0.17657484061280806, + "grad_norm": 1.498563289642334, + "learning_rate": 4.62513628684873e-05, + "loss": 5.4318, + "step": 29690 + }, + { + "epoch": 0.17658078789608905, + "grad_norm": 1.4302527904510498, + "learning_rate": 4.625111684644776e-05, + "loss": 4.9763, + "step": 29691 + }, + { + "epoch": 0.17658673517937007, + "grad_norm": 1.5234086513519287, + "learning_rate": 4.6250870816989664e-05, + "loss": 4.9747, + "step": 29692 + }, + { + "epoch": 0.17659268246265106, + "grad_norm": 1.611867904663086, + "learning_rate": 4.6250624780113116e-05, + "loss": 4.8275, + "step": 29693 + }, + { + "epoch": 0.17659862974593205, + "grad_norm": 2.0380537509918213, + "learning_rate": 4.625037873581819e-05, + "loss": 5.1795, + "step": 29694 + }, + { + "epoch": 0.17660457702921306, + "grad_norm": 1.433166742324829, + "learning_rate": 4.625013268410498e-05, + "loss": 5.3237, + "step": 29695 + }, + { + "epoch": 0.17661052431249405, + "grad_norm": 1.8627065420150757, + "learning_rate": 4.6249886624973564e-05, + "loss": 5.28, + "step": 29696 + }, + { + "epoch": 0.17661647159577504, + "grad_norm": 1.572050929069519, + "learning_rate": 4.6249640558424036e-05, + "loss": 5.3744, + "step": 29697 + }, + { + "epoch": 0.17662241887905605, + "grad_norm": 3.271996021270752, + "learning_rate": 4.624939448445648e-05, + "loss": 3.856, + "step": 29698 + }, + { + "epoch": 0.17662836616233704, + "grad_norm": 1.7473957538604736, + "learning_rate": 4.624914840307098e-05, + "loss": 4.7745, + "step": 29699 + }, + { + "epoch": 0.17663431344561803, + "grad_norm": 1.5957887172698975, + "learning_rate": 4.62489023142676e-05, + "loss": 5.3401, + "step": 29700 + }, + { + "epoch": 0.17664026072889905, + "grad_norm": 1.519698977470398, + "learning_rate": 4.624865621804647e-05, + "loss": 5.2996, + "step": 29701 + }, + { + "epoch": 0.17664620801218003, + "grad_norm": 1.4777617454528809, + "learning_rate": 4.624841011440765e-05, + "loss": 5.2181, + "step": 29702 + }, + { + "epoch": 0.17665215529546102, + "grad_norm": 1.5206866264343262, + "learning_rate": 4.624816400335123e-05, + "loss": 5.3529, + "step": 29703 + }, + { + "epoch": 0.17665810257874204, + "grad_norm": 1.6352920532226562, + "learning_rate": 4.6247917884877296e-05, + "loss": 5.3274, + "step": 29704 + }, + { + "epoch": 0.17666404986202303, + "grad_norm": 1.572554111480713, + "learning_rate": 4.6247671758985934e-05, + "loss": 5.3941, + "step": 29705 + }, + { + "epoch": 0.17666999714530401, + "grad_norm": 2.0956475734710693, + "learning_rate": 4.624742562567722e-05, + "loss": 4.0032, + "step": 29706 + }, + { + "epoch": 0.17667594442858503, + "grad_norm": 1.382948398590088, + "learning_rate": 4.624717948495126e-05, + "loss": 5.539, + "step": 29707 + }, + { + "epoch": 0.17668189171186602, + "grad_norm": 1.406977653503418, + "learning_rate": 4.6246933336808126e-05, + "loss": 5.5437, + "step": 29708 + }, + { + "epoch": 0.176687838995147, + "grad_norm": 1.6577895879745483, + "learning_rate": 4.62466871812479e-05, + "loss": 5.1155, + "step": 29709 + }, + { + "epoch": 0.17669378627842802, + "grad_norm": 1.9551897048950195, + "learning_rate": 4.624644101827069e-05, + "loss": 4.6531, + "step": 29710 + }, + { + "epoch": 0.176699733561709, + "grad_norm": 2.409532308578491, + "learning_rate": 4.624619484787655e-05, + "loss": 4.5918, + "step": 29711 + }, + { + "epoch": 0.17670568084499, + "grad_norm": 1.8758010864257812, + "learning_rate": 4.6245948670065594e-05, + "loss": 4.9051, + "step": 29712 + }, + { + "epoch": 0.17671162812827101, + "grad_norm": 1.777886152267456, + "learning_rate": 4.6245702484837894e-05, + "loss": 5.1955, + "step": 29713 + }, + { + "epoch": 0.176717575411552, + "grad_norm": 1.6413220167160034, + "learning_rate": 4.624545629219354e-05, + "loss": 4.9031, + "step": 29714 + }, + { + "epoch": 0.176723522694833, + "grad_norm": 1.7025271654129028, + "learning_rate": 4.624521009213262e-05, + "loss": 5.2195, + "step": 29715 + }, + { + "epoch": 0.176729469978114, + "grad_norm": 1.4530411958694458, + "learning_rate": 4.6244963884655204e-05, + "loss": 5.3771, + "step": 29716 + }, + { + "epoch": 0.176735417261395, + "grad_norm": 1.4960378408432007, + "learning_rate": 4.62447176697614e-05, + "loss": 5.1019, + "step": 29717 + }, + { + "epoch": 0.17674136454467598, + "grad_norm": 1.863013505935669, + "learning_rate": 4.624447144745129e-05, + "loss": 4.7721, + "step": 29718 + }, + { + "epoch": 0.176747311827957, + "grad_norm": 1.7837802171707153, + "learning_rate": 4.624422521772495e-05, + "loss": 5.2047, + "step": 29719 + }, + { + "epoch": 0.176753259111238, + "grad_norm": 2.3820879459381104, + "learning_rate": 4.6243978980582456e-05, + "loss": 4.7627, + "step": 29720 + }, + { + "epoch": 0.17675920639451898, + "grad_norm": 2.2981441020965576, + "learning_rate": 4.6243732736023926e-05, + "loss": 4.7149, + "step": 29721 + }, + { + "epoch": 0.1767651536778, + "grad_norm": 1.916215181350708, + "learning_rate": 4.6243486484049426e-05, + "loss": 4.6663, + "step": 29722 + }, + { + "epoch": 0.17677110096108098, + "grad_norm": 1.7512091398239136, + "learning_rate": 4.624324022465904e-05, + "loss": 5.0612, + "step": 29723 + }, + { + "epoch": 0.17677704824436197, + "grad_norm": 1.513918161392212, + "learning_rate": 4.6242993957852855e-05, + "loss": 5.131, + "step": 29724 + }, + { + "epoch": 0.17678299552764298, + "grad_norm": 1.5861341953277588, + "learning_rate": 4.6242747683630966e-05, + "loss": 5.1035, + "step": 29725 + }, + { + "epoch": 0.17678894281092397, + "grad_norm": 1.5094410181045532, + "learning_rate": 4.6242501401993454e-05, + "loss": 5.0484, + "step": 29726 + }, + { + "epoch": 0.17679489009420496, + "grad_norm": 1.5102661848068237, + "learning_rate": 4.6242255112940405e-05, + "loss": 5.0001, + "step": 29727 + }, + { + "epoch": 0.17680083737748598, + "grad_norm": 1.8255689144134521, + "learning_rate": 4.62420088164719e-05, + "loss": 5.4749, + "step": 29728 + }, + { + "epoch": 0.17680678466076696, + "grad_norm": 1.9394241571426392, + "learning_rate": 4.624176251258803e-05, + "loss": 5.3997, + "step": 29729 + }, + { + "epoch": 0.17681273194404795, + "grad_norm": 1.6546714305877686, + "learning_rate": 4.624151620128888e-05, + "loss": 5.396, + "step": 29730 + }, + { + "epoch": 0.17681867922732897, + "grad_norm": 1.55864679813385, + "learning_rate": 4.6241269882574534e-05, + "loss": 5.145, + "step": 29731 + }, + { + "epoch": 0.17682462651060996, + "grad_norm": 1.5503425598144531, + "learning_rate": 4.6241023556445084e-05, + "loss": 5.0982, + "step": 29732 + }, + { + "epoch": 0.17683057379389094, + "grad_norm": 1.6777262687683105, + "learning_rate": 4.624077722290061e-05, + "loss": 4.8005, + "step": 29733 + }, + { + "epoch": 0.17683652107717196, + "grad_norm": 1.4268922805786133, + "learning_rate": 4.62405308819412e-05, + "loss": 5.0045, + "step": 29734 + }, + { + "epoch": 0.17684246836045295, + "grad_norm": 1.7886883020401, + "learning_rate": 4.6240284533566946e-05, + "loss": 4.8464, + "step": 29735 + }, + { + "epoch": 0.17684841564373394, + "grad_norm": 1.5553979873657227, + "learning_rate": 4.624003817777792e-05, + "loss": 5.3561, + "step": 29736 + }, + { + "epoch": 0.17685436292701495, + "grad_norm": 1.508204698562622, + "learning_rate": 4.6239791814574224e-05, + "loss": 5.3903, + "step": 29737 + }, + { + "epoch": 0.17686031021029594, + "grad_norm": 1.3388547897338867, + "learning_rate": 4.623954544395593e-05, + "loss": 5.488, + "step": 29738 + }, + { + "epoch": 0.17686625749357693, + "grad_norm": 1.518465280532837, + "learning_rate": 4.623929906592313e-05, + "loss": 5.4595, + "step": 29739 + }, + { + "epoch": 0.17687220477685794, + "grad_norm": 1.5171095132827759, + "learning_rate": 4.623905268047592e-05, + "loss": 5.6942, + "step": 29740 + }, + { + "epoch": 0.17687815206013893, + "grad_norm": 1.4345729351043701, + "learning_rate": 4.623880628761436e-05, + "loss": 5.598, + "step": 29741 + }, + { + "epoch": 0.17688409934341992, + "grad_norm": 1.3692567348480225, + "learning_rate": 4.623855988733856e-05, + "loss": 5.8299, + "step": 29742 + }, + { + "epoch": 0.17689004662670094, + "grad_norm": 1.6717381477355957, + "learning_rate": 4.62383134796486e-05, + "loss": 4.9299, + "step": 29743 + }, + { + "epoch": 0.17689599390998192, + "grad_norm": 1.6725213527679443, + "learning_rate": 4.6238067064544565e-05, + "loss": 4.8448, + "step": 29744 + }, + { + "epoch": 0.1769019411932629, + "grad_norm": 1.885776400566101, + "learning_rate": 4.623782064202653e-05, + "loss": 4.8159, + "step": 29745 + }, + { + "epoch": 0.17690788847654393, + "grad_norm": 1.7408405542373657, + "learning_rate": 4.6237574212094605e-05, + "loss": 5.3162, + "step": 29746 + }, + { + "epoch": 0.17691383575982492, + "grad_norm": 1.4585955142974854, + "learning_rate": 4.6237327774748856e-05, + "loss": 5.933, + "step": 29747 + }, + { + "epoch": 0.1769197830431059, + "grad_norm": 1.6204352378845215, + "learning_rate": 4.623708132998937e-05, + "loss": 5.4457, + "step": 29748 + }, + { + "epoch": 0.1769257303263869, + "grad_norm": 1.4227222204208374, + "learning_rate": 4.623683487781625e-05, + "loss": 5.387, + "step": 29749 + }, + { + "epoch": 0.1769316776096679, + "grad_norm": 1.4104609489440918, + "learning_rate": 4.623658841822956e-05, + "loss": 5.5075, + "step": 29750 + }, + { + "epoch": 0.1769376248929489, + "grad_norm": 2.1077404022216797, + "learning_rate": 4.6236341951229406e-05, + "loss": 4.343, + "step": 29751 + }, + { + "epoch": 0.17694357217622989, + "grad_norm": 1.820806622505188, + "learning_rate": 4.6236095476815855e-05, + "loss": 4.8388, + "step": 29752 + }, + { + "epoch": 0.1769495194595109, + "grad_norm": 1.6640592813491821, + "learning_rate": 4.623584899498901e-05, + "loss": 5.129, + "step": 29753 + }, + { + "epoch": 0.1769554667427919, + "grad_norm": 1.6439399719238281, + "learning_rate": 4.623560250574894e-05, + "loss": 5.1712, + "step": 29754 + }, + { + "epoch": 0.17696141402607288, + "grad_norm": 1.6510851383209229, + "learning_rate": 4.623535600909575e-05, + "loss": 5.1796, + "step": 29755 + }, + { + "epoch": 0.1769673613093539, + "grad_norm": 1.8089758157730103, + "learning_rate": 4.6235109505029515e-05, + "loss": 4.5897, + "step": 29756 + }, + { + "epoch": 0.17697330859263488, + "grad_norm": 1.734377384185791, + "learning_rate": 4.6234862993550324e-05, + "loss": 5.1078, + "step": 29757 + }, + { + "epoch": 0.17697925587591587, + "grad_norm": 1.7873172760009766, + "learning_rate": 4.623461647465825e-05, + "loss": 5.3811, + "step": 29758 + }, + { + "epoch": 0.17698520315919689, + "grad_norm": 2.1304049491882324, + "learning_rate": 4.623436994835341e-05, + "loss": 4.6419, + "step": 29759 + }, + { + "epoch": 0.17699115044247787, + "grad_norm": 2.734135150909424, + "learning_rate": 4.6234123414635856e-05, + "loss": 4.4103, + "step": 29760 + }, + { + "epoch": 0.17699709772575886, + "grad_norm": 1.9526289701461792, + "learning_rate": 4.6233876873505694e-05, + "loss": 4.4495, + "step": 29761 + }, + { + "epoch": 0.17700304500903988, + "grad_norm": 1.7902294397354126, + "learning_rate": 4.6233630324963004e-05, + "loss": 4.9202, + "step": 29762 + }, + { + "epoch": 0.17700899229232087, + "grad_norm": 2.161142587661743, + "learning_rate": 4.6233383769007874e-05, + "loss": 4.1941, + "step": 29763 + }, + { + "epoch": 0.17701493957560185, + "grad_norm": 2.3652687072753906, + "learning_rate": 4.6233137205640386e-05, + "loss": 4.085, + "step": 29764 + }, + { + "epoch": 0.17702088685888287, + "grad_norm": 2.204157829284668, + "learning_rate": 4.6232890634860635e-05, + "loss": 3.9856, + "step": 29765 + }, + { + "epoch": 0.17702683414216386, + "grad_norm": 2.5543384552001953, + "learning_rate": 4.6232644056668695e-05, + "loss": 4.1421, + "step": 29766 + }, + { + "epoch": 0.17703278142544485, + "grad_norm": 2.0842933654785156, + "learning_rate": 4.623239747106466e-05, + "loss": 3.8326, + "step": 29767 + }, + { + "epoch": 0.17703872870872586, + "grad_norm": 1.953341007232666, + "learning_rate": 4.623215087804862e-05, + "loss": 4.0444, + "step": 29768 + }, + { + "epoch": 0.17704467599200685, + "grad_norm": 2.1980764865875244, + "learning_rate": 4.6231904277620644e-05, + "loss": 4.2192, + "step": 29769 + }, + { + "epoch": 0.17705062327528784, + "grad_norm": 2.225207567214966, + "learning_rate": 4.6231657669780836e-05, + "loss": 4.7365, + "step": 29770 + }, + { + "epoch": 0.17705657055856885, + "grad_norm": 2.128333330154419, + "learning_rate": 4.623141105452928e-05, + "loss": 5.6755, + "step": 29771 + }, + { + "epoch": 0.17706251784184984, + "grad_norm": 1.8886544704437256, + "learning_rate": 4.623116443186605e-05, + "loss": 4.9885, + "step": 29772 + }, + { + "epoch": 0.17706846512513083, + "grad_norm": 3.213632345199585, + "learning_rate": 4.623091780179125e-05, + "loss": 3.1388, + "step": 29773 + }, + { + "epoch": 0.17707441240841185, + "grad_norm": 2.6279642581939697, + "learning_rate": 4.623067116430495e-05, + "loss": 4.1536, + "step": 29774 + }, + { + "epoch": 0.17708035969169283, + "grad_norm": 1.6456087827682495, + "learning_rate": 4.623042451940724e-05, + "loss": 5.1824, + "step": 29775 + }, + { + "epoch": 0.17708630697497382, + "grad_norm": 1.8505003452301025, + "learning_rate": 4.623017786709821e-05, + "loss": 5.1548, + "step": 29776 + }, + { + "epoch": 0.17709225425825484, + "grad_norm": 1.5285630226135254, + "learning_rate": 4.622993120737794e-05, + "loss": 5.1444, + "step": 29777 + }, + { + "epoch": 0.17709820154153583, + "grad_norm": 1.6634210348129272, + "learning_rate": 4.622968454024652e-05, + "loss": 5.3108, + "step": 29778 + }, + { + "epoch": 0.17710414882481681, + "grad_norm": 1.6948342323303223, + "learning_rate": 4.622943786570405e-05, + "loss": 5.0025, + "step": 29779 + }, + { + "epoch": 0.17711009610809783, + "grad_norm": 2.1120948791503906, + "learning_rate": 4.6229191183750594e-05, + "loss": 4.6668, + "step": 29780 + }, + { + "epoch": 0.17711604339137882, + "grad_norm": 5.567571640014648, + "learning_rate": 4.622894449438624e-05, + "loss": 4.7644, + "step": 29781 + }, + { + "epoch": 0.1771219906746598, + "grad_norm": 4.830391883850098, + "learning_rate": 4.622869779761109e-05, + "loss": 4.5086, + "step": 29782 + }, + { + "epoch": 0.17712793795794082, + "grad_norm": 3.956571578979492, + "learning_rate": 4.622845109342522e-05, + "loss": 4.311, + "step": 29783 + }, + { + "epoch": 0.1771338852412218, + "grad_norm": 3.274723529815674, + "learning_rate": 4.622820438182871e-05, + "loss": 4.3097, + "step": 29784 + }, + { + "epoch": 0.1771398325245028, + "grad_norm": 2.478320360183716, + "learning_rate": 4.6227957662821666e-05, + "loss": 4.4818, + "step": 29785 + }, + { + "epoch": 0.17714577980778382, + "grad_norm": 1.271023154258728, + "learning_rate": 4.6227710936404144e-05, + "loss": 5.4578, + "step": 29786 + }, + { + "epoch": 0.1771517270910648, + "grad_norm": 1.687338948249817, + "learning_rate": 4.622746420257626e-05, + "loss": 5.0832, + "step": 29787 + }, + { + "epoch": 0.1771576743743458, + "grad_norm": 1.6693392992019653, + "learning_rate": 4.6227217461338084e-05, + "loss": 5.23, + "step": 29788 + }, + { + "epoch": 0.1771636216576268, + "grad_norm": 1.884928822517395, + "learning_rate": 4.622697071268971e-05, + "loss": 4.4254, + "step": 29789 + }, + { + "epoch": 0.1771695689409078, + "grad_norm": 1.8463094234466553, + "learning_rate": 4.622672395663121e-05, + "loss": 4.3649, + "step": 29790 + }, + { + "epoch": 0.17717551622418878, + "grad_norm": 1.5451326370239258, + "learning_rate": 4.6226477193162685e-05, + "loss": 4.7212, + "step": 29791 + }, + { + "epoch": 0.1771814635074698, + "grad_norm": 1.6390217542648315, + "learning_rate": 4.622623042228422e-05, + "loss": 5.5775, + "step": 29792 + }, + { + "epoch": 0.1771874107907508, + "grad_norm": 1.553244709968567, + "learning_rate": 4.62259836439959e-05, + "loss": 5.5905, + "step": 29793 + }, + { + "epoch": 0.17719335807403178, + "grad_norm": 1.398796558380127, + "learning_rate": 4.62257368582978e-05, + "loss": 5.5597, + "step": 29794 + }, + { + "epoch": 0.1771993053573128, + "grad_norm": 1.6612623929977417, + "learning_rate": 4.622549006519001e-05, + "loss": 4.9175, + "step": 29795 + }, + { + "epoch": 0.17720525264059378, + "grad_norm": 1.7774828672409058, + "learning_rate": 4.622524326467263e-05, + "loss": 5.2457, + "step": 29796 + }, + { + "epoch": 0.17721119992387477, + "grad_norm": 1.447310447692871, + "learning_rate": 4.622499645674574e-05, + "loss": 4.6974, + "step": 29797 + }, + { + "epoch": 0.17721714720715578, + "grad_norm": 1.8368786573410034, + "learning_rate": 4.6224749641409417e-05, + "loss": 4.7698, + "step": 29798 + }, + { + "epoch": 0.17722309449043677, + "grad_norm": 1.7796480655670166, + "learning_rate": 4.622450281866375e-05, + "loss": 5.0171, + "step": 29799 + }, + { + "epoch": 0.17722904177371776, + "grad_norm": 1.584720492362976, + "learning_rate": 4.6224255988508836e-05, + "loss": 5.5296, + "step": 29800 + }, + { + "epoch": 0.17723498905699878, + "grad_norm": 1.7539535760879517, + "learning_rate": 4.622400915094475e-05, + "loss": 5.5441, + "step": 29801 + }, + { + "epoch": 0.17724093634027976, + "grad_norm": 1.608579397201538, + "learning_rate": 4.6223762305971576e-05, + "loss": 5.3746, + "step": 29802 + }, + { + "epoch": 0.17724688362356075, + "grad_norm": 1.7146000862121582, + "learning_rate": 4.622351545358942e-05, + "loss": 5.2776, + "step": 29803 + }, + { + "epoch": 0.17725283090684177, + "grad_norm": 1.741254448890686, + "learning_rate": 4.622326859379834e-05, + "loss": 5.14, + "step": 29804 + }, + { + "epoch": 0.17725877819012276, + "grad_norm": 1.669607162475586, + "learning_rate": 4.6223021726598434e-05, + "loss": 4.9702, + "step": 29805 + }, + { + "epoch": 0.17726472547340374, + "grad_norm": 1.817954659461975, + "learning_rate": 4.62227748519898e-05, + "loss": 5.1888, + "step": 29806 + }, + { + "epoch": 0.17727067275668473, + "grad_norm": 1.7606234550476074, + "learning_rate": 4.6222527969972516e-05, + "loss": 5.1171, + "step": 29807 + }, + { + "epoch": 0.17727662003996575, + "grad_norm": 1.6854933500289917, + "learning_rate": 4.622228108054666e-05, + "loss": 4.9143, + "step": 29808 + }, + { + "epoch": 0.17728256732324674, + "grad_norm": 1.801241159439087, + "learning_rate": 4.622203418371233e-05, + "loss": 4.5452, + "step": 29809 + }, + { + "epoch": 0.17728851460652773, + "grad_norm": 1.7132951021194458, + "learning_rate": 4.6221787279469606e-05, + "loss": 5.6643, + "step": 29810 + }, + { + "epoch": 0.17729446188980874, + "grad_norm": 1.5202804803848267, + "learning_rate": 4.6221540367818576e-05, + "loss": 5.7674, + "step": 29811 + }, + { + "epoch": 0.17730040917308973, + "grad_norm": 1.3772656917572021, + "learning_rate": 4.622129344875932e-05, + "loss": 5.4231, + "step": 29812 + }, + { + "epoch": 0.17730635645637072, + "grad_norm": 1.7075127363204956, + "learning_rate": 4.6221046522291936e-05, + "loss": 5.1009, + "step": 29813 + }, + { + "epoch": 0.17731230373965173, + "grad_norm": 1.6497002840042114, + "learning_rate": 4.622079958841651e-05, + "loss": 5.2202, + "step": 29814 + }, + { + "epoch": 0.17731825102293272, + "grad_norm": 1.796449065208435, + "learning_rate": 4.622055264713311e-05, + "loss": 4.9304, + "step": 29815 + }, + { + "epoch": 0.1773241983062137, + "grad_norm": 1.6709007024765015, + "learning_rate": 4.6220305698441836e-05, + "loss": 4.9885, + "step": 29816 + }, + { + "epoch": 0.17733014558949473, + "grad_norm": 1.4689090251922607, + "learning_rate": 4.622005874234278e-05, + "loss": 4.9051, + "step": 29817 + }, + { + "epoch": 0.1773360928727757, + "grad_norm": 1.7701568603515625, + "learning_rate": 4.621981177883601e-05, + "loss": 4.8309, + "step": 29818 + }, + { + "epoch": 0.1773420401560567, + "grad_norm": 1.6992321014404297, + "learning_rate": 4.621956480792163e-05, + "loss": 4.7161, + "step": 29819 + }, + { + "epoch": 0.17734798743933772, + "grad_norm": 1.7641901969909668, + "learning_rate": 4.6219317829599715e-05, + "loss": 4.5102, + "step": 29820 + }, + { + "epoch": 0.1773539347226187, + "grad_norm": 1.9778741598129272, + "learning_rate": 4.621907084387036e-05, + "loss": 5.0063, + "step": 29821 + }, + { + "epoch": 0.1773598820058997, + "grad_norm": 2.4267444610595703, + "learning_rate": 4.6218823850733636e-05, + "loss": 4.6155, + "step": 29822 + }, + { + "epoch": 0.1773658292891807, + "grad_norm": 1.8586831092834473, + "learning_rate": 4.6218576850189655e-05, + "loss": 5.1348, + "step": 29823 + }, + { + "epoch": 0.1773717765724617, + "grad_norm": 2.0853071212768555, + "learning_rate": 4.621832984223849e-05, + "loss": 4.9064, + "step": 29824 + }, + { + "epoch": 0.17737772385574269, + "grad_norm": 1.9400508403778076, + "learning_rate": 4.6218082826880205e-05, + "loss": 5.0123, + "step": 29825 + }, + { + "epoch": 0.1773836711390237, + "grad_norm": 1.6919422149658203, + "learning_rate": 4.621783580411492e-05, + "loss": 4.8755, + "step": 29826 + }, + { + "epoch": 0.1773896184223047, + "grad_norm": 2.295384407043457, + "learning_rate": 4.621758877394271e-05, + "loss": 3.9202, + "step": 29827 + }, + { + "epoch": 0.17739556570558568, + "grad_norm": 2.417031764984131, + "learning_rate": 4.621734173636365e-05, + "loss": 4.441, + "step": 29828 + }, + { + "epoch": 0.1774015129888667, + "grad_norm": 3.097060203552246, + "learning_rate": 4.6217094691377835e-05, + "loss": 4.6754, + "step": 29829 + }, + { + "epoch": 0.17740746027214768, + "grad_norm": 2.9717020988464355, + "learning_rate": 4.621684763898536e-05, + "loss": 4.7217, + "step": 29830 + }, + { + "epoch": 0.17741340755542867, + "grad_norm": 1.9695039987564087, + "learning_rate": 4.62166005791863e-05, + "loss": 5.0213, + "step": 29831 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 1.6653083562850952, + "learning_rate": 4.621635351198074e-05, + "loss": 4.7739, + "step": 29832 + }, + { + "epoch": 0.17742530212199067, + "grad_norm": 1.9583450555801392, + "learning_rate": 4.621610643736878e-05, + "loss": 5.0863, + "step": 29833 + }, + { + "epoch": 0.17743124940527166, + "grad_norm": 2.460378646850586, + "learning_rate": 4.621585935535049e-05, + "loss": 4.5889, + "step": 29834 + }, + { + "epoch": 0.17743719668855268, + "grad_norm": 2.478996992111206, + "learning_rate": 4.621561226592596e-05, + "loss": 3.7157, + "step": 29835 + }, + { + "epoch": 0.17744314397183367, + "grad_norm": 2.4852869510650635, + "learning_rate": 4.6215365169095283e-05, + "loss": 3.674, + "step": 29836 + }, + { + "epoch": 0.17744909125511465, + "grad_norm": 3.0013065338134766, + "learning_rate": 4.621511806485853e-05, + "loss": 3.6854, + "step": 29837 + }, + { + "epoch": 0.17745503853839567, + "grad_norm": 2.071744918823242, + "learning_rate": 4.621487095321581e-05, + "loss": 4.2681, + "step": 29838 + }, + { + "epoch": 0.17746098582167666, + "grad_norm": 1.7033419609069824, + "learning_rate": 4.62146238341672e-05, + "loss": 5.4001, + "step": 29839 + }, + { + "epoch": 0.17746693310495765, + "grad_norm": 1.9189993143081665, + "learning_rate": 4.621437670771278e-05, + "loss": 4.9708, + "step": 29840 + }, + { + "epoch": 0.17747288038823866, + "grad_norm": 1.924280047416687, + "learning_rate": 4.621412957385264e-05, + "loss": 4.5928, + "step": 29841 + }, + { + "epoch": 0.17747882767151965, + "grad_norm": 1.6338030099868774, + "learning_rate": 4.621388243258686e-05, + "loss": 4.6546, + "step": 29842 + }, + { + "epoch": 0.17748477495480064, + "grad_norm": 1.6776630878448486, + "learning_rate": 4.621363528391555e-05, + "loss": 5.0897, + "step": 29843 + }, + { + "epoch": 0.17749072223808166, + "grad_norm": 2.56796932220459, + "learning_rate": 4.621338812783877e-05, + "loss": 4.1294, + "step": 29844 + }, + { + "epoch": 0.17749666952136264, + "grad_norm": 2.6277754306793213, + "learning_rate": 4.621314096435661e-05, + "loss": 4.2364, + "step": 29845 + }, + { + "epoch": 0.17750261680464363, + "grad_norm": 2.836585760116577, + "learning_rate": 4.621289379346916e-05, + "loss": 4.4706, + "step": 29846 + }, + { + "epoch": 0.17750856408792465, + "grad_norm": 2.3705074787139893, + "learning_rate": 4.6212646615176514e-05, + "loss": 4.245, + "step": 29847 + }, + { + "epoch": 0.17751451137120564, + "grad_norm": 1.7258014678955078, + "learning_rate": 4.621239942947875e-05, + "loss": 5.338, + "step": 29848 + }, + { + "epoch": 0.17752045865448662, + "grad_norm": 1.5844351053237915, + "learning_rate": 4.621215223637596e-05, + "loss": 4.85, + "step": 29849 + }, + { + "epoch": 0.17752640593776764, + "grad_norm": 1.7583924531936646, + "learning_rate": 4.6211905035868224e-05, + "loss": 4.1059, + "step": 29850 + }, + { + "epoch": 0.17753235322104863, + "grad_norm": 1.7784796953201294, + "learning_rate": 4.621165782795564e-05, + "loss": 4.1206, + "step": 29851 + }, + { + "epoch": 0.17753830050432962, + "grad_norm": 2.0315020084381104, + "learning_rate": 4.6211410612638273e-05, + "loss": 4.8268, + "step": 29852 + }, + { + "epoch": 0.17754424778761063, + "grad_norm": 2.137162923812866, + "learning_rate": 4.621116338991622e-05, + "loss": 4.6874, + "step": 29853 + }, + { + "epoch": 0.17755019507089162, + "grad_norm": 2.5275580883026123, + "learning_rate": 4.621091615978957e-05, + "loss": 4.4036, + "step": 29854 + }, + { + "epoch": 0.1775561423541726, + "grad_norm": 2.170762062072754, + "learning_rate": 4.621066892225842e-05, + "loss": 4.8377, + "step": 29855 + }, + { + "epoch": 0.17756208963745362, + "grad_norm": 1.592443823814392, + "learning_rate": 4.6210421677322833e-05, + "loss": 4.4257, + "step": 29856 + }, + { + "epoch": 0.1775680369207346, + "grad_norm": 1.479036569595337, + "learning_rate": 4.6210174424982914e-05, + "loss": 4.0881, + "step": 29857 + }, + { + "epoch": 0.1775739842040156, + "grad_norm": 1.5338127613067627, + "learning_rate": 4.620992716523874e-05, + "loss": 4.1253, + "step": 29858 + }, + { + "epoch": 0.17757993148729662, + "grad_norm": 1.6100810766220093, + "learning_rate": 4.62096798980904e-05, + "loss": 4.224, + "step": 29859 + }, + { + "epoch": 0.1775858787705776, + "grad_norm": 1.6029894351959229, + "learning_rate": 4.6209432623537984e-05, + "loss": 4.3054, + "step": 29860 + }, + { + "epoch": 0.1775918260538586, + "grad_norm": 1.5900243520736694, + "learning_rate": 4.620918534158157e-05, + "loss": 5.0011, + "step": 29861 + }, + { + "epoch": 0.1775977733371396, + "grad_norm": 1.4453150033950806, + "learning_rate": 4.620893805222124e-05, + "loss": 5.035, + "step": 29862 + }, + { + "epoch": 0.1776037206204206, + "grad_norm": 1.2561450004577637, + "learning_rate": 4.62086907554571e-05, + "loss": 5.0042, + "step": 29863 + }, + { + "epoch": 0.17760966790370158, + "grad_norm": 1.6127535104751587, + "learning_rate": 4.620844345128923e-05, + "loss": 5.1504, + "step": 29864 + }, + { + "epoch": 0.17761561518698257, + "grad_norm": 1.505549430847168, + "learning_rate": 4.6208196139717697e-05, + "loss": 5.2917, + "step": 29865 + }, + { + "epoch": 0.1776215624702636, + "grad_norm": 1.652327537536621, + "learning_rate": 4.620794882074261e-05, + "loss": 4.7241, + "step": 29866 + }, + { + "epoch": 0.17762750975354458, + "grad_norm": 1.750353217124939, + "learning_rate": 4.620770149436405e-05, + "loss": 4.1058, + "step": 29867 + }, + { + "epoch": 0.17763345703682556, + "grad_norm": 1.6184377670288086, + "learning_rate": 4.6207454160582094e-05, + "loss": 4.0415, + "step": 29868 + }, + { + "epoch": 0.17763940432010658, + "grad_norm": 1.493651270866394, + "learning_rate": 4.6207206819396834e-05, + "loss": 4.3537, + "step": 29869 + }, + { + "epoch": 0.17764535160338757, + "grad_norm": 1.4839292764663696, + "learning_rate": 4.6206959470808364e-05, + "loss": 4.7692, + "step": 29870 + }, + { + "epoch": 0.17765129888666856, + "grad_norm": 1.726027488708496, + "learning_rate": 4.620671211481676e-05, + "loss": 4.971, + "step": 29871 + }, + { + "epoch": 0.17765724616994957, + "grad_norm": 1.6284557580947876, + "learning_rate": 4.6206464751422105e-05, + "loss": 4.4246, + "step": 29872 + }, + { + "epoch": 0.17766319345323056, + "grad_norm": 2.2713751792907715, + "learning_rate": 4.6206217380624505e-05, + "loss": 4.3045, + "step": 29873 + }, + { + "epoch": 0.17766914073651155, + "grad_norm": 1.8392630815505981, + "learning_rate": 4.620597000242403e-05, + "loss": 4.1344, + "step": 29874 + }, + { + "epoch": 0.17767508801979257, + "grad_norm": 1.5239953994750977, + "learning_rate": 4.620572261682077e-05, + "loss": 3.9802, + "step": 29875 + }, + { + "epoch": 0.17768103530307355, + "grad_norm": 1.6723328828811646, + "learning_rate": 4.6205475223814804e-05, + "loss": 4.0901, + "step": 29876 + }, + { + "epoch": 0.17768698258635454, + "grad_norm": 1.555239200592041, + "learning_rate": 4.620522782340623e-05, + "loss": 3.9096, + "step": 29877 + }, + { + "epoch": 0.17769292986963556, + "grad_norm": 1.8839585781097412, + "learning_rate": 4.620498041559513e-05, + "loss": 4.9657, + "step": 29878 + }, + { + "epoch": 0.17769887715291655, + "grad_norm": 1.9911398887634277, + "learning_rate": 4.620473300038159e-05, + "loss": 4.5497, + "step": 29879 + }, + { + "epoch": 0.17770482443619753, + "grad_norm": 2.2058022022247314, + "learning_rate": 4.62044855777657e-05, + "loss": 3.7231, + "step": 29880 + }, + { + "epoch": 0.17771077171947855, + "grad_norm": 2.0669283866882324, + "learning_rate": 4.6204238147747535e-05, + "loss": 3.8466, + "step": 29881 + }, + { + "epoch": 0.17771671900275954, + "grad_norm": 2.122668981552124, + "learning_rate": 4.62039907103272e-05, + "loss": 3.5758, + "step": 29882 + }, + { + "epoch": 0.17772266628604053, + "grad_norm": 2.091607093811035, + "learning_rate": 4.6203743265504765e-05, + "loss": 3.3965, + "step": 29883 + }, + { + "epoch": 0.17772861356932154, + "grad_norm": 2.204787492752075, + "learning_rate": 4.620349581328033e-05, + "loss": 4.3546, + "step": 29884 + }, + { + "epoch": 0.17773456085260253, + "grad_norm": 1.5886098146438599, + "learning_rate": 4.620324835365396e-05, + "loss": 5.0842, + "step": 29885 + }, + { + "epoch": 0.17774050813588352, + "grad_norm": 1.6993340253829956, + "learning_rate": 4.6203000886625766e-05, + "loss": 4.8315, + "step": 29886 + }, + { + "epoch": 0.17774645541916453, + "grad_norm": 1.6817113161087036, + "learning_rate": 4.620275341219582e-05, + "loss": 4.9972, + "step": 29887 + }, + { + "epoch": 0.17775240270244552, + "grad_norm": 1.7113308906555176, + "learning_rate": 4.620250593036421e-05, + "loss": 4.8823, + "step": 29888 + }, + { + "epoch": 0.1777583499857265, + "grad_norm": 1.7548478841781616, + "learning_rate": 4.620225844113103e-05, + "loss": 5.121, + "step": 29889 + }, + { + "epoch": 0.17776429726900753, + "grad_norm": 1.8111287355422974, + "learning_rate": 4.6202010944496356e-05, + "loss": 4.8074, + "step": 29890 + }, + { + "epoch": 0.17777024455228851, + "grad_norm": 1.279390573501587, + "learning_rate": 4.620176344046028e-05, + "loss": 4.6303, + "step": 29891 + }, + { + "epoch": 0.1777761918355695, + "grad_norm": 1.2164942026138306, + "learning_rate": 4.620151592902288e-05, + "loss": 4.6222, + "step": 29892 + }, + { + "epoch": 0.17778213911885052, + "grad_norm": 1.5320428609848022, + "learning_rate": 4.620126841018426e-05, + "loss": 4.9938, + "step": 29893 + }, + { + "epoch": 0.1777880864021315, + "grad_norm": 1.5564218759536743, + "learning_rate": 4.620102088394449e-05, + "loss": 4.961, + "step": 29894 + }, + { + "epoch": 0.1777940336854125, + "grad_norm": 1.5532233715057373, + "learning_rate": 4.6200773350303675e-05, + "loss": 4.8086, + "step": 29895 + }, + { + "epoch": 0.1777999809686935, + "grad_norm": 1.9697725772857666, + "learning_rate": 4.620052580926187e-05, + "loss": 4.6753, + "step": 29896 + }, + { + "epoch": 0.1778059282519745, + "grad_norm": 2.0587549209594727, + "learning_rate": 4.62002782608192e-05, + "loss": 5.3824, + "step": 29897 + }, + { + "epoch": 0.1778118755352555, + "grad_norm": 1.5464704036712646, + "learning_rate": 4.620003070497572e-05, + "loss": 5.2827, + "step": 29898 + }, + { + "epoch": 0.1778178228185365, + "grad_norm": 2.052751064300537, + "learning_rate": 4.619978314173152e-05, + "loss": 4.8924, + "step": 29899 + }, + { + "epoch": 0.1778237701018175, + "grad_norm": 1.857614517211914, + "learning_rate": 4.619953557108671e-05, + "loss": 4.9826, + "step": 29900 + }, + { + "epoch": 0.17782971738509848, + "grad_norm": 1.5344221591949463, + "learning_rate": 4.619928799304136e-05, + "loss": 5.0715, + "step": 29901 + }, + { + "epoch": 0.1778356646683795, + "grad_norm": 1.6682283878326416, + "learning_rate": 4.619904040759555e-05, + "loss": 5.5025, + "step": 29902 + }, + { + "epoch": 0.17784161195166048, + "grad_norm": 1.8382456302642822, + "learning_rate": 4.619879281474938e-05, + "loss": 5.0428, + "step": 29903 + }, + { + "epoch": 0.17784755923494147, + "grad_norm": 1.5137388706207275, + "learning_rate": 4.619854521450293e-05, + "loss": 5.1731, + "step": 29904 + }, + { + "epoch": 0.1778535065182225, + "grad_norm": 1.5241427421569824, + "learning_rate": 4.619829760685628e-05, + "loss": 5.11, + "step": 29905 + }, + { + "epoch": 0.17785945380150348, + "grad_norm": 1.6426124572753906, + "learning_rate": 4.6198049991809534e-05, + "loss": 5.0386, + "step": 29906 + }, + { + "epoch": 0.17786540108478446, + "grad_norm": 1.240784764289856, + "learning_rate": 4.6197802369362756e-05, + "loss": 4.9999, + "step": 29907 + }, + { + "epoch": 0.17787134836806548, + "grad_norm": 1.7629567384719849, + "learning_rate": 4.6197554739516054e-05, + "loss": 5.1035, + "step": 29908 + }, + { + "epoch": 0.17787729565134647, + "grad_norm": 1.7833048105239868, + "learning_rate": 4.61973071022695e-05, + "loss": 5.2879, + "step": 29909 + }, + { + "epoch": 0.17788324293462746, + "grad_norm": 1.6848218441009521, + "learning_rate": 4.619705945762318e-05, + "loss": 5.1269, + "step": 29910 + }, + { + "epoch": 0.17788919021790847, + "grad_norm": 1.917606234550476, + "learning_rate": 4.61968118055772e-05, + "loss": 4.422, + "step": 29911 + }, + { + "epoch": 0.17789513750118946, + "grad_norm": 2.092909336090088, + "learning_rate": 4.619656414613162e-05, + "loss": 4.4046, + "step": 29912 + }, + { + "epoch": 0.17790108478447045, + "grad_norm": 1.580072283744812, + "learning_rate": 4.6196316479286547e-05, + "loss": 5.117, + "step": 29913 + }, + { + "epoch": 0.17790703206775146, + "grad_norm": 1.5650675296783447, + "learning_rate": 4.619606880504205e-05, + "loss": 5.0848, + "step": 29914 + }, + { + "epoch": 0.17791297935103245, + "grad_norm": 1.5918974876403809, + "learning_rate": 4.619582112339823e-05, + "loss": 5.108, + "step": 29915 + }, + { + "epoch": 0.17791892663431344, + "grad_norm": 1.6393519639968872, + "learning_rate": 4.619557343435516e-05, + "loss": 5.1883, + "step": 29916 + }, + { + "epoch": 0.17792487391759446, + "grad_norm": 1.6605910062789917, + "learning_rate": 4.619532573791294e-05, + "loss": 5.3422, + "step": 29917 + }, + { + "epoch": 0.17793082120087544, + "grad_norm": 1.618237853050232, + "learning_rate": 4.619507803407166e-05, + "loss": 5.3366, + "step": 29918 + }, + { + "epoch": 0.17793676848415643, + "grad_norm": 1.7383369207382202, + "learning_rate": 4.6194830322831384e-05, + "loss": 5.2423, + "step": 29919 + }, + { + "epoch": 0.17794271576743745, + "grad_norm": 1.7745330333709717, + "learning_rate": 4.619458260419222e-05, + "loss": 5.5013, + "step": 29920 + }, + { + "epoch": 0.17794866305071844, + "grad_norm": 1.64639151096344, + "learning_rate": 4.6194334878154244e-05, + "loss": 5.6739, + "step": 29921 + }, + { + "epoch": 0.17795461033399942, + "grad_norm": 1.6652768850326538, + "learning_rate": 4.619408714471754e-05, + "loss": 5.5507, + "step": 29922 + }, + { + "epoch": 0.1779605576172804, + "grad_norm": 1.8969260454177856, + "learning_rate": 4.61938394038822e-05, + "loss": 4.7228, + "step": 29923 + }, + { + "epoch": 0.17796650490056143, + "grad_norm": 2.7471752166748047, + "learning_rate": 4.619359165564832e-05, + "loss": 3.7551, + "step": 29924 + }, + { + "epoch": 0.17797245218384242, + "grad_norm": 1.68784499168396, + "learning_rate": 4.6193343900015964e-05, + "loss": 4.6853, + "step": 29925 + }, + { + "epoch": 0.1779783994671234, + "grad_norm": 1.6362453699111938, + "learning_rate": 4.619309613698523e-05, + "loss": 4.665, + "step": 29926 + }, + { + "epoch": 0.17798434675040442, + "grad_norm": 1.737727165222168, + "learning_rate": 4.619284836655621e-05, + "loss": 4.9511, + "step": 29927 + }, + { + "epoch": 0.1779902940336854, + "grad_norm": 1.4916706085205078, + "learning_rate": 4.6192600588728985e-05, + "loss": 4.9043, + "step": 29928 + }, + { + "epoch": 0.1779962413169664, + "grad_norm": 1.6925257444381714, + "learning_rate": 4.619235280350365e-05, + "loss": 4.764, + "step": 29929 + }, + { + "epoch": 0.1780021886002474, + "grad_norm": 1.525317668914795, + "learning_rate": 4.619210501088027e-05, + "loss": 4.5491, + "step": 29930 + }, + { + "epoch": 0.1780081358835284, + "grad_norm": 1.771481990814209, + "learning_rate": 4.619185721085895e-05, + "loss": 4.7972, + "step": 29931 + }, + { + "epoch": 0.1780140831668094, + "grad_norm": 2.018819808959961, + "learning_rate": 4.619160940343977e-05, + "loss": 3.8428, + "step": 29932 + }, + { + "epoch": 0.1780200304500904, + "grad_norm": 1.7792484760284424, + "learning_rate": 4.6191361588622825e-05, + "loss": 4.9156, + "step": 29933 + }, + { + "epoch": 0.1780259777333714, + "grad_norm": 1.8811469078063965, + "learning_rate": 4.619111376640819e-05, + "loss": 4.0915, + "step": 29934 + }, + { + "epoch": 0.17803192501665238, + "grad_norm": 1.7818450927734375, + "learning_rate": 4.619086593679596e-05, + "loss": 5.1882, + "step": 29935 + }, + { + "epoch": 0.1780378722999334, + "grad_norm": 1.587109088897705, + "learning_rate": 4.619061809978621e-05, + "loss": 4.8753, + "step": 29936 + }, + { + "epoch": 0.17804381958321439, + "grad_norm": 1.6229913234710693, + "learning_rate": 4.619037025537904e-05, + "loss": 4.5926, + "step": 29937 + }, + { + "epoch": 0.17804976686649537, + "grad_norm": 2.0784964561462402, + "learning_rate": 4.619012240357452e-05, + "loss": 3.6958, + "step": 29938 + }, + { + "epoch": 0.1780557141497764, + "grad_norm": 1.829585313796997, + "learning_rate": 4.6189874544372766e-05, + "loss": 3.5768, + "step": 29939 + }, + { + "epoch": 0.17806166143305738, + "grad_norm": 2.243161201477051, + "learning_rate": 4.6189626677773837e-05, + "loss": 3.6418, + "step": 29940 + }, + { + "epoch": 0.17806760871633837, + "grad_norm": 1.8179738521575928, + "learning_rate": 4.618937880377782e-05, + "loss": 3.6718, + "step": 29941 + }, + { + "epoch": 0.17807355599961938, + "grad_norm": 1.7654396295547485, + "learning_rate": 4.618913092238482e-05, + "loss": 4.4997, + "step": 29942 + }, + { + "epoch": 0.17807950328290037, + "grad_norm": 1.615114688873291, + "learning_rate": 4.6188883033594907e-05, + "loss": 4.7439, + "step": 29943 + }, + { + "epoch": 0.17808545056618136, + "grad_norm": 1.2790718078613281, + "learning_rate": 4.6188635137408174e-05, + "loss": 4.6724, + "step": 29944 + }, + { + "epoch": 0.17809139784946237, + "grad_norm": 1.6814706325531006, + "learning_rate": 4.6188387233824717e-05, + "loss": 4.9715, + "step": 29945 + }, + { + "epoch": 0.17809734513274336, + "grad_norm": 2.3926637172698975, + "learning_rate": 4.61881393228446e-05, + "loss": 3.682, + "step": 29946 + }, + { + "epoch": 0.17810329241602435, + "grad_norm": 1.4340671300888062, + "learning_rate": 4.618789140446793e-05, + "loss": 4.586, + "step": 29947 + }, + { + "epoch": 0.17810923969930537, + "grad_norm": 1.6323633193969727, + "learning_rate": 4.6187643478694784e-05, + "loss": 4.7435, + "step": 29948 + }, + { + "epoch": 0.17811518698258635, + "grad_norm": 1.6034373044967651, + "learning_rate": 4.618739554552526e-05, + "loss": 4.9142, + "step": 29949 + }, + { + "epoch": 0.17812113426586734, + "grad_norm": 1.599575161933899, + "learning_rate": 4.618714760495943e-05, + "loss": 4.7991, + "step": 29950 + }, + { + "epoch": 0.17812708154914836, + "grad_norm": 1.7768034934997559, + "learning_rate": 4.618689965699737e-05, + "loss": 4.9267, + "step": 29951 + }, + { + "epoch": 0.17813302883242935, + "grad_norm": 1.8471229076385498, + "learning_rate": 4.6186651701639195e-05, + "loss": 4.4194, + "step": 29952 + }, + { + "epoch": 0.17813897611571033, + "grad_norm": 2.222182512283325, + "learning_rate": 4.6186403738884984e-05, + "loss": 4.1248, + "step": 29953 + }, + { + "epoch": 0.17814492339899135, + "grad_norm": 2.373452663421631, + "learning_rate": 4.6186155768734806e-05, + "loss": 4.3799, + "step": 29954 + }, + { + "epoch": 0.17815087068227234, + "grad_norm": 2.6431610584259033, + "learning_rate": 4.618590779118877e-05, + "loss": 4.4425, + "step": 29955 + }, + { + "epoch": 0.17815681796555333, + "grad_norm": 2.160435676574707, + "learning_rate": 4.618565980624695e-05, + "loss": 4.3708, + "step": 29956 + }, + { + "epoch": 0.17816276524883434, + "grad_norm": 2.0715856552124023, + "learning_rate": 4.618541181390943e-05, + "loss": 4.7181, + "step": 29957 + }, + { + "epoch": 0.17816871253211533, + "grad_norm": 2.107534408569336, + "learning_rate": 4.618516381417631e-05, + "loss": 3.9446, + "step": 29958 + }, + { + "epoch": 0.17817465981539632, + "grad_norm": 2.215634822845459, + "learning_rate": 4.618491580704766e-05, + "loss": 4.3066, + "step": 29959 + }, + { + "epoch": 0.17818060709867733, + "grad_norm": 1.760855793952942, + "learning_rate": 4.618466779252359e-05, + "loss": 4.757, + "step": 29960 + }, + { + "epoch": 0.17818655438195832, + "grad_norm": 1.6130295991897583, + "learning_rate": 4.618441977060415e-05, + "loss": 5.0813, + "step": 29961 + }, + { + "epoch": 0.1781925016652393, + "grad_norm": 1.4686352014541626, + "learning_rate": 4.6184171741289454e-05, + "loss": 4.5848, + "step": 29962 + }, + { + "epoch": 0.17819844894852033, + "grad_norm": 1.5685728788375854, + "learning_rate": 4.618392370457959e-05, + "loss": 4.5756, + "step": 29963 + }, + { + "epoch": 0.17820439623180132, + "grad_norm": 1.7625272274017334, + "learning_rate": 4.618367566047463e-05, + "loss": 4.4729, + "step": 29964 + }, + { + "epoch": 0.1782103435150823, + "grad_norm": 2.350189685821533, + "learning_rate": 4.618342760897467e-05, + "loss": 4.2178, + "step": 29965 + }, + { + "epoch": 0.17821629079836332, + "grad_norm": 2.462435007095337, + "learning_rate": 4.6183179550079796e-05, + "loss": 4.5618, + "step": 29966 + }, + { + "epoch": 0.1782222380816443, + "grad_norm": 2.354248523712158, + "learning_rate": 4.618293148379009e-05, + "loss": 4.4869, + "step": 29967 + }, + { + "epoch": 0.1782281853649253, + "grad_norm": 2.1047489643096924, + "learning_rate": 4.6182683410105646e-05, + "loss": 4.3849, + "step": 29968 + }, + { + "epoch": 0.1782341326482063, + "grad_norm": 1.859437108039856, + "learning_rate": 4.618243532902655e-05, + "loss": 4.3603, + "step": 29969 + }, + { + "epoch": 0.1782400799314873, + "grad_norm": 2.014723539352417, + "learning_rate": 4.6182187240552875e-05, + "loss": 5.363, + "step": 29970 + }, + { + "epoch": 0.1782460272147683, + "grad_norm": 1.637157917022705, + "learning_rate": 4.618193914468472e-05, + "loss": 5.0457, + "step": 29971 + }, + { + "epoch": 0.1782519744980493, + "grad_norm": 2.200927734375, + "learning_rate": 4.618169104142217e-05, + "loss": 4.9131, + "step": 29972 + }, + { + "epoch": 0.1782579217813303, + "grad_norm": 2.0116817951202393, + "learning_rate": 4.6181442930765305e-05, + "loss": 4.8401, + "step": 29973 + }, + { + "epoch": 0.17826386906461128, + "grad_norm": 1.9755736589431763, + "learning_rate": 4.618119481271422e-05, + "loss": 4.8402, + "step": 29974 + }, + { + "epoch": 0.1782698163478923, + "grad_norm": 1.954923152923584, + "learning_rate": 4.618094668726901e-05, + "loss": 4.7746, + "step": 29975 + }, + { + "epoch": 0.17827576363117328, + "grad_norm": 2.0195765495300293, + "learning_rate": 4.6180698554429737e-05, + "loss": 4.4359, + "step": 29976 + }, + { + "epoch": 0.17828171091445427, + "grad_norm": 1.9346232414245605, + "learning_rate": 4.618045041419651e-05, + "loss": 5.132, + "step": 29977 + }, + { + "epoch": 0.1782876581977353, + "grad_norm": 1.880932331085205, + "learning_rate": 4.6180202266569394e-05, + "loss": 5.26, + "step": 29978 + }, + { + "epoch": 0.17829360548101628, + "grad_norm": 1.8841670751571655, + "learning_rate": 4.6179954111548495e-05, + "loss": 4.7878, + "step": 29979 + }, + { + "epoch": 0.17829955276429726, + "grad_norm": 1.9039348363876343, + "learning_rate": 4.61797059491339e-05, + "loss": 4.8547, + "step": 29980 + }, + { + "epoch": 0.17830550004757825, + "grad_norm": 2.0296382904052734, + "learning_rate": 4.617945777932568e-05, + "loss": 5.0599, + "step": 29981 + }, + { + "epoch": 0.17831144733085927, + "grad_norm": 1.8153882026672363, + "learning_rate": 4.617920960212393e-05, + "loss": 5.0123, + "step": 29982 + }, + { + "epoch": 0.17831739461414026, + "grad_norm": 1.5454435348510742, + "learning_rate": 4.617896141752874e-05, + "loss": 4.4975, + "step": 29983 + }, + { + "epoch": 0.17832334189742124, + "grad_norm": 1.5883069038391113, + "learning_rate": 4.6178713225540196e-05, + "loss": 4.8825, + "step": 29984 + }, + { + "epoch": 0.17832928918070226, + "grad_norm": 1.58603036403656, + "learning_rate": 4.617846502615837e-05, + "loss": 5.3068, + "step": 29985 + }, + { + "epoch": 0.17833523646398325, + "grad_norm": 1.6731973886489868, + "learning_rate": 4.6178216819383374e-05, + "loss": 5.5331, + "step": 29986 + }, + { + "epoch": 0.17834118374726424, + "grad_norm": 1.6074113845825195, + "learning_rate": 4.6177968605215276e-05, + "loss": 5.5162, + "step": 29987 + }, + { + "epoch": 0.17834713103054525, + "grad_norm": 1.4040982723236084, + "learning_rate": 4.6177720383654166e-05, + "loss": 5.3135, + "step": 29988 + }, + { + "epoch": 0.17835307831382624, + "grad_norm": 1.6419864892959595, + "learning_rate": 4.617747215470014e-05, + "loss": 4.9229, + "step": 29989 + }, + { + "epoch": 0.17835902559710723, + "grad_norm": 1.7256529331207275, + "learning_rate": 4.617722391835327e-05, + "loss": 5.0782, + "step": 29990 + }, + { + "epoch": 0.17836497288038825, + "grad_norm": 1.7224550247192383, + "learning_rate": 4.617697567461365e-05, + "loss": 4.8078, + "step": 29991 + }, + { + "epoch": 0.17837092016366923, + "grad_norm": 1.63644278049469, + "learning_rate": 4.617672742348137e-05, + "loss": 5.2103, + "step": 29992 + }, + { + "epoch": 0.17837686744695022, + "grad_norm": 1.9455114603042603, + "learning_rate": 4.617647916495651e-05, + "loss": 5.3372, + "step": 29993 + }, + { + "epoch": 0.17838281473023124, + "grad_norm": 1.6073265075683594, + "learning_rate": 4.6176230899039166e-05, + "loss": 4.4093, + "step": 29994 + }, + { + "epoch": 0.17838876201351223, + "grad_norm": 2.0087218284606934, + "learning_rate": 4.6175982625729405e-05, + "loss": 5.0169, + "step": 29995 + }, + { + "epoch": 0.1783947092967932, + "grad_norm": 2.3341264724731445, + "learning_rate": 4.617573434502734e-05, + "loss": 4.271, + "step": 29996 + }, + { + "epoch": 0.17840065658007423, + "grad_norm": 1.6453101634979248, + "learning_rate": 4.617548605693305e-05, + "loss": 5.0354, + "step": 29997 + }, + { + "epoch": 0.17840660386335522, + "grad_norm": 1.6747314929962158, + "learning_rate": 4.61752377614466e-05, + "loss": 5.0349, + "step": 29998 + }, + { + "epoch": 0.1784125511466362, + "grad_norm": 1.7050796747207642, + "learning_rate": 4.617498945856811e-05, + "loss": 4.753, + "step": 29999 + }, + { + "epoch": 0.17841849842991722, + "grad_norm": 1.7062735557556152, + "learning_rate": 4.617474114829764e-05, + "loss": 5.1345, + "step": 30000 + } + ], + "logging_steps": 1, + "max_steps": 168144, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.2299865191507558e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-40000/config.json b/checkpoint-40000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-40000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-40000/generation_config.json b/checkpoint-40000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-40000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-40000/model.safetensors.index.json b/checkpoint-40000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-40000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-40000/rng_state_0.pth b/checkpoint-40000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-40000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-40000/rng_state_1.pth b/checkpoint-40000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-40000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-40000/rng_state_2.pth b/checkpoint-40000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-40000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-40000/rng_state_3.pth b/checkpoint-40000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-40000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-40000/rng_state_4.pth b/checkpoint-40000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-40000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-40000/rng_state_6.pth b/checkpoint-40000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-40000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-40000/rng_state_7.pth b/checkpoint-40000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-40000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-40000/scheduler.pt b/checkpoint-40000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ef2a95dafe909e3867856c8a577c72b359b0724 --- /dev/null +++ b/checkpoint-40000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c22a0765a6ac9c0b685e51a0b16d12e4fff82983239c3bf9ed214c64e56b3ea +size 1064 diff --git a/checkpoint-40000/trainer_state.json b/checkpoint-40000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..81da4e33abea1579dc4ef658bb53645ab9b3628a --- /dev/null +++ b/checkpoint-40000/trainer_state.json @@ -0,0 +1,280034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.23789133123988962, + "eval_steps": 500, + "global_step": 40000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.947283280997241e-06, + "grad_norm": 179.1047821044922, + "learning_rate": 5e-05, + "loss": 14.5158, + "step": 1 + }, + { + "epoch": 1.1894566561994482e-05, + "grad_norm": 40.39401626586914, + "learning_rate": 4.999999999563638e-05, + "loss": 14.152, + "step": 2 + }, + { + "epoch": 1.7841849842991722e-05, + "grad_norm": 137.05079650878906, + "learning_rate": 4.999999998254552e-05, + "loss": 14.6334, + "step": 3 + }, + { + "epoch": 2.3789133123988963e-05, + "grad_norm": 23.315088272094727, + "learning_rate": 4.9999999960727415e-05, + "loss": 12.6458, + "step": 4 + }, + { + "epoch": 2.97364164049862e-05, + "grad_norm": 7.943603992462158, + "learning_rate": 4.9999999930182065e-05, + "loss": 11.8435, + "step": 5 + }, + { + "epoch": 3.5683699685983445e-05, + "grad_norm": 6.374181270599365, + "learning_rate": 4.999999989090948e-05, + "loss": 11.4544, + "step": 6 + }, + { + "epoch": 4.1630982966980686e-05, + "grad_norm": 8.948652267456055, + "learning_rate": 4.999999984290965e-05, + "loss": 11.3516, + "step": 7 + }, + { + "epoch": 4.7578266247977927e-05, + "grad_norm": 3.2318713665008545, + "learning_rate": 4.999999978618258e-05, + "loss": 11.1021, + "step": 8 + }, + { + "epoch": 5.352554952897517e-05, + "grad_norm": 5.6542534828186035, + "learning_rate": 4.9999999720728266e-05, + "loss": 11.0132, + "step": 9 + }, + { + "epoch": 5.94728328099724e-05, + "grad_norm": 3.623577356338501, + "learning_rate": 4.999999964654671e-05, + "loss": 10.8896, + "step": 10 + }, + { + "epoch": 6.542011609096965e-05, + "grad_norm": 3.3209445476531982, + "learning_rate": 4.9999999563637915e-05, + "loss": 10.7339, + "step": 11 + }, + { + "epoch": 7.136739937196689e-05, + "grad_norm": 3.4527082443237305, + "learning_rate": 4.999999947200188e-05, + "loss": 10.5472, + "step": 12 + }, + { + "epoch": 7.731468265296413e-05, + "grad_norm": 3.784444570541382, + "learning_rate": 4.99999993716386e-05, + "loss": 10.4353, + "step": 13 + }, + { + "epoch": 8.326196593396137e-05, + "grad_norm": 4.304569244384766, + "learning_rate": 4.999999926254808e-05, + "loss": 10.4652, + "step": 14 + }, + { + "epoch": 8.920924921495861e-05, + "grad_norm": 3.5867838859558105, + "learning_rate": 4.999999914473032e-05, + "loss": 10.5746, + "step": 15 + }, + { + "epoch": 9.515653249595585e-05, + "grad_norm": 6.1308207511901855, + "learning_rate": 4.9999999018185316e-05, + "loss": 10.4129, + "step": 16 + }, + { + "epoch": 0.0001011038157769531, + "grad_norm": 3.4687230587005615, + "learning_rate": 4.999999888291307e-05, + "loss": 10.2246, + "step": 17 + }, + { + "epoch": 0.00010705109905795033, + "grad_norm": 4.041895866394043, + "learning_rate": 4.9999998738913586e-05, + "loss": 10.0852, + "step": 18 + }, + { + "epoch": 0.00011299838233894758, + "grad_norm": 4.437602519989014, + "learning_rate": 4.999999858618686e-05, + "loss": 9.8841, + "step": 19 + }, + { + "epoch": 0.0001189456656199448, + "grad_norm": 3.9608142375946045, + "learning_rate": 4.9999998424732884e-05, + "loss": 10.0537, + "step": 20 + }, + { + "epoch": 0.00012489294890094204, + "grad_norm": 3.799363613128662, + "learning_rate": 4.999999825455168e-05, + "loss": 9.8487, + "step": 21 + }, + { + "epoch": 0.0001308402321819393, + "grad_norm": 3.626058340072632, + "learning_rate": 4.999999807564323e-05, + "loss": 9.8048, + "step": 22 + }, + { + "epoch": 0.00013678751546293653, + "grad_norm": 4.21406364440918, + "learning_rate": 4.999999788800754e-05, + "loss": 9.6091, + "step": 23 + }, + { + "epoch": 0.00014273479874393378, + "grad_norm": 5.26548957824707, + "learning_rate": 4.9999997691644605e-05, + "loss": 9.3935, + "step": 24 + }, + { + "epoch": 0.000148682082024931, + "grad_norm": 6.5113396644592285, + "learning_rate": 4.999999748655443e-05, + "loss": 9.2602, + "step": 25 + }, + { + "epoch": 0.00015462936530592826, + "grad_norm": 4.6141133308410645, + "learning_rate": 4.9999997272737014e-05, + "loss": 9.1492, + "step": 26 + }, + { + "epoch": 0.0001605766485869255, + "grad_norm": 4.645262241363525, + "learning_rate": 4.999999705019236e-05, + "loss": 9.2238, + "step": 27 + }, + { + "epoch": 0.00016652393186792274, + "grad_norm": 4.599213123321533, + "learning_rate": 4.9999996818920464e-05, + "loss": 9.1673, + "step": 28 + }, + { + "epoch": 0.00017247121514891997, + "grad_norm": 4.820634365081787, + "learning_rate": 4.999999657892133e-05, + "loss": 9.0044, + "step": 29 + }, + { + "epoch": 0.00017841849842991722, + "grad_norm": 4.57854700088501, + "learning_rate": 4.9999996330194956e-05, + "loss": 8.8746, + "step": 30 + }, + { + "epoch": 0.00018436578171091445, + "grad_norm": 4.567880153656006, + "learning_rate": 4.999999607274133e-05, + "loss": 8.7224, + "step": 31 + }, + { + "epoch": 0.0001903130649919117, + "grad_norm": 4.545701503753662, + "learning_rate": 4.9999995806560475e-05, + "loss": 8.6979, + "step": 32 + }, + { + "epoch": 0.00019626034827290893, + "grad_norm": 4.098274230957031, + "learning_rate": 4.9999995531652374e-05, + "loss": 8.5787, + "step": 33 + }, + { + "epoch": 0.0002022076315539062, + "grad_norm": 4.341195106506348, + "learning_rate": 4.999999524801704e-05, + "loss": 8.4452, + "step": 34 + }, + { + "epoch": 0.00020815491483490341, + "grad_norm": 4.651747703552246, + "learning_rate": 4.999999495565446e-05, + "loss": 8.4383, + "step": 35 + }, + { + "epoch": 0.00021410219811590067, + "grad_norm": 4.187220573425293, + "learning_rate": 4.999999465456464e-05, + "loss": 8.2441, + "step": 36 + }, + { + "epoch": 0.0002200494813968979, + "grad_norm": 4.094058990478516, + "learning_rate": 4.999999434474758e-05, + "loss": 8.2784, + "step": 37 + }, + { + "epoch": 0.00022599676467789515, + "grad_norm": 4.6094794273376465, + "learning_rate": 4.999999402620329e-05, + "loss": 8.3893, + "step": 38 + }, + { + "epoch": 0.00023194404795889238, + "grad_norm": 5.391327381134033, + "learning_rate": 4.999999369893175e-05, + "loss": 8.6491, + "step": 39 + }, + { + "epoch": 0.0002378913312398896, + "grad_norm": 5.03748893737793, + "learning_rate": 4.9999993362932974e-05, + "loss": 8.5279, + "step": 40 + }, + { + "epoch": 0.00024383861452088686, + "grad_norm": 5.306002616882324, + "learning_rate": 4.9999993018206956e-05, + "loss": 9.9965, + "step": 41 + }, + { + "epoch": 0.0002497858978018841, + "grad_norm": 5.5374274253845215, + "learning_rate": 4.99999926647537e-05, + "loss": 10.5594, + "step": 42 + }, + { + "epoch": 0.00025573318108288134, + "grad_norm": 3.8107693195343018, + "learning_rate": 4.999999230257321e-05, + "loss": 10.5451, + "step": 43 + }, + { + "epoch": 0.0002616804643638786, + "grad_norm": 3.922286033630371, + "learning_rate": 4.999999193166547e-05, + "loss": 10.4123, + "step": 44 + }, + { + "epoch": 0.0002676277476448758, + "grad_norm": 3.2090535163879395, + "learning_rate": 4.99999915520305e-05, + "loss": 10.0646, + "step": 45 + }, + { + "epoch": 0.00027357503092587305, + "grad_norm": 3.153404474258423, + "learning_rate": 4.9999991163668285e-05, + "loss": 10.237, + "step": 46 + }, + { + "epoch": 0.0002795223142068703, + "grad_norm": 4.83523416519165, + "learning_rate": 4.999999076657884e-05, + "loss": 8.9392, + "step": 47 + }, + { + "epoch": 0.00028546959748786756, + "grad_norm": 3.954632043838501, + "learning_rate": 4.999999036076215e-05, + "loss": 8.8562, + "step": 48 + }, + { + "epoch": 0.00029141688076886476, + "grad_norm": 4.452631950378418, + "learning_rate": 4.999998994621822e-05, + "loss": 9.8819, + "step": 49 + }, + { + "epoch": 0.000297364164049862, + "grad_norm": 4.71603536605835, + "learning_rate": 4.9999989522947055e-05, + "loss": 9.8503, + "step": 50 + }, + { + "epoch": 0.00030331144733085927, + "grad_norm": 3.8008105754852295, + "learning_rate": 4.999998909094865e-05, + "loss": 9.8072, + "step": 51 + }, + { + "epoch": 0.0003092587306118565, + "grad_norm": 3.9906716346740723, + "learning_rate": 4.999998865022301e-05, + "loss": 9.168, + "step": 52 + }, + { + "epoch": 0.0003152060138928537, + "grad_norm": 3.9425785541534424, + "learning_rate": 4.999998820077013e-05, + "loss": 9.8441, + "step": 53 + }, + { + "epoch": 0.000321153297173851, + "grad_norm": 3.6698031425476074, + "learning_rate": 4.999998774259002e-05, + "loss": 10.036, + "step": 54 + }, + { + "epoch": 0.00032710058045484823, + "grad_norm": 3.3027005195617676, + "learning_rate": 4.999998727568266e-05, + "loss": 9.8701, + "step": 55 + }, + { + "epoch": 0.0003330478637358455, + "grad_norm": 3.312570333480835, + "learning_rate": 4.999998680004807e-05, + "loss": 9.3354, + "step": 56 + }, + { + "epoch": 0.0003389951470168427, + "grad_norm": 3.323969602584839, + "learning_rate": 4.999998631568624e-05, + "loss": 9.2899, + "step": 57 + }, + { + "epoch": 0.00034494243029783994, + "grad_norm": 3.1319313049316406, + "learning_rate": 4.999998582259717e-05, + "loss": 9.1033, + "step": 58 + }, + { + "epoch": 0.0003508897135788372, + "grad_norm": 3.655060291290283, + "learning_rate": 4.999998532078087e-05, + "loss": 9.1574, + "step": 59 + }, + { + "epoch": 0.00035683699685983445, + "grad_norm": 3.2051918506622314, + "learning_rate": 4.999998481023733e-05, + "loss": 9.564, + "step": 60 + }, + { + "epoch": 0.00036278428014083165, + "grad_norm": 3.223015308380127, + "learning_rate": 4.999998429096656e-05, + "loss": 9.46, + "step": 61 + }, + { + "epoch": 0.0003687315634218289, + "grad_norm": 4.121186256408691, + "learning_rate": 4.999998376296855e-05, + "loss": 8.4136, + "step": 62 + }, + { + "epoch": 0.00037467884670282616, + "grad_norm": 3.5580086708068848, + "learning_rate": 4.9999983226243296e-05, + "loss": 9.3504, + "step": 63 + }, + { + "epoch": 0.0003806261299838234, + "grad_norm": 3.664219379425049, + "learning_rate": 4.999998268079081e-05, + "loss": 9.2889, + "step": 64 + }, + { + "epoch": 0.0003865734132648206, + "grad_norm": 2.955582618713379, + "learning_rate": 4.99999821266111e-05, + "loss": 8.9193, + "step": 65 + }, + { + "epoch": 0.00039252069654581787, + "grad_norm": 3.0592539310455322, + "learning_rate": 4.9999981563704144e-05, + "loss": 9.6739, + "step": 66 + }, + { + "epoch": 0.0003984679798268151, + "grad_norm": 3.32024884223938, + "learning_rate": 4.999998099206995e-05, + "loss": 9.3648, + "step": 67 + }, + { + "epoch": 0.0004044152631078124, + "grad_norm": 3.2716033458709717, + "learning_rate": 4.9999980411708524e-05, + "loss": 9.3652, + "step": 68 + }, + { + "epoch": 0.0004103625463888096, + "grad_norm": 3.1926631927490234, + "learning_rate": 4.999997982261987e-05, + "loss": 9.2924, + "step": 69 + }, + { + "epoch": 0.00041630982966980683, + "grad_norm": 3.589841604232788, + "learning_rate": 4.999997922480397e-05, + "loss": 9.2185, + "step": 70 + }, + { + "epoch": 0.0004222571129508041, + "grad_norm": 2.902132034301758, + "learning_rate": 4.999997861826084e-05, + "loss": 9.1047, + "step": 71 + }, + { + "epoch": 0.00042820439623180134, + "grad_norm": 3.2352359294891357, + "learning_rate": 4.999997800299048e-05, + "loss": 9.0309, + "step": 72 + }, + { + "epoch": 0.00043415167951279854, + "grad_norm": 2.683664560317993, + "learning_rate": 4.9999977378992884e-05, + "loss": 8.9977, + "step": 73 + }, + { + "epoch": 0.0004400989627937958, + "grad_norm": 3.0073423385620117, + "learning_rate": 4.9999976746268055e-05, + "loss": 9.0967, + "step": 74 + }, + { + "epoch": 0.00044604624607479305, + "grad_norm": 3.364819288253784, + "learning_rate": 4.9999976104815994e-05, + "loss": 8.9401, + "step": 75 + }, + { + "epoch": 0.0004519935293557903, + "grad_norm": 3.478936195373535, + "learning_rate": 4.9999975454636695e-05, + "loss": 8.8173, + "step": 76 + }, + { + "epoch": 0.0004579408126367875, + "grad_norm": 3.059669017791748, + "learning_rate": 4.9999974795730165e-05, + "loss": 9.2588, + "step": 77 + }, + { + "epoch": 0.00046388809591778476, + "grad_norm": 3.1980936527252197, + "learning_rate": 4.999997412809639e-05, + "loss": 9.3374, + "step": 78 + }, + { + "epoch": 0.000469835379198782, + "grad_norm": 2.859935998916626, + "learning_rate": 4.9999973451735405e-05, + "loss": 8.8996, + "step": 79 + }, + { + "epoch": 0.0004757826624797792, + "grad_norm": 3.6268489360809326, + "learning_rate": 4.9999972766647175e-05, + "loss": 8.7878, + "step": 80 + }, + { + "epoch": 0.00048172994576077647, + "grad_norm": 3.0187010765075684, + "learning_rate": 4.9999972072831714e-05, + "loss": 8.9177, + "step": 81 + }, + { + "epoch": 0.0004876772290417737, + "grad_norm": 3.304633378982544, + "learning_rate": 4.9999971370289014e-05, + "loss": 8.8098, + "step": 82 + }, + { + "epoch": 0.0004936245123227709, + "grad_norm": 3.678696870803833, + "learning_rate": 4.999997065901909e-05, + "loss": 8.9408, + "step": 83 + }, + { + "epoch": 0.0004995717956037682, + "grad_norm": 3.485488176345825, + "learning_rate": 4.9999969939021936e-05, + "loss": 8.7374, + "step": 84 + }, + { + "epoch": 0.0005055190788847654, + "grad_norm": 3.276916265487671, + "learning_rate": 4.999996921029755e-05, + "loss": 8.7177, + "step": 85 + }, + { + "epoch": 0.0005114663621657627, + "grad_norm": 3.060227632522583, + "learning_rate": 4.9999968472845926e-05, + "loss": 8.9673, + "step": 86 + }, + { + "epoch": 0.0005174136454467599, + "grad_norm": 3.359055995941162, + "learning_rate": 4.999996772666708e-05, + "loss": 8.8029, + "step": 87 + }, + { + "epoch": 0.0005233609287277572, + "grad_norm": 3.8916943073272705, + "learning_rate": 4.9999966971761004e-05, + "loss": 8.8363, + "step": 88 + }, + { + "epoch": 0.0005293082120087544, + "grad_norm": 3.825075387954712, + "learning_rate": 4.9999966208127694e-05, + "loss": 8.5683, + "step": 89 + }, + { + "epoch": 0.0005352554952897516, + "grad_norm": 3.475759267807007, + "learning_rate": 4.999996543576715e-05, + "loss": 8.5723, + "step": 90 + }, + { + "epoch": 0.0005412027785707488, + "grad_norm": 3.609776020050049, + "learning_rate": 4.9999964654679385e-05, + "loss": 8.6123, + "step": 91 + }, + { + "epoch": 0.0005471500618517461, + "grad_norm": 3.3749685287475586, + "learning_rate": 4.999996386486439e-05, + "loss": 8.4887, + "step": 92 + }, + { + "epoch": 0.0005530973451327434, + "grad_norm": 3.3853306770324707, + "learning_rate": 4.999996306632215e-05, + "loss": 8.56, + "step": 93 + }, + { + "epoch": 0.0005590446284137406, + "grad_norm": 3.9347422122955322, + "learning_rate": 4.99999622590527e-05, + "loss": 8.5053, + "step": 94 + }, + { + "epoch": 0.0005649919116947379, + "grad_norm": 3.6037611961364746, + "learning_rate": 4.999996144305601e-05, + "loss": 8.3367, + "step": 95 + }, + { + "epoch": 0.0005709391949757351, + "grad_norm": 3.4608941078186035, + "learning_rate": 4.99999606183321e-05, + "loss": 8.0674, + "step": 96 + }, + { + "epoch": 0.0005768864782567324, + "grad_norm": 3.4882898330688477, + "learning_rate": 4.999995978488096e-05, + "loss": 8.1728, + "step": 97 + }, + { + "epoch": 0.0005828337615377295, + "grad_norm": 3.6789562702178955, + "learning_rate": 4.999995894270258e-05, + "loss": 7.9535, + "step": 98 + }, + { + "epoch": 0.0005887810448187268, + "grad_norm": 3.57328200340271, + "learning_rate": 4.9999958091796986e-05, + "loss": 8.2048, + "step": 99 + }, + { + "epoch": 0.000594728328099724, + "grad_norm": 3.803468942642212, + "learning_rate": 4.999995723216416e-05, + "loss": 7.8073, + "step": 100 + }, + { + "epoch": 0.0006006756113807213, + "grad_norm": 3.8187785148620605, + "learning_rate": 4.9999956363804116e-05, + "loss": 7.6325, + "step": 101 + }, + { + "epoch": 0.0006066228946617185, + "grad_norm": 3.8681981563568115, + "learning_rate": 4.999995548671684e-05, + "loss": 7.7104, + "step": 102 + }, + { + "epoch": 0.0006125701779427158, + "grad_norm": 3.869074583053589, + "learning_rate": 4.9999954600902334e-05, + "loss": 7.8445, + "step": 103 + }, + { + "epoch": 0.000618517461223713, + "grad_norm": 3.852057695388794, + "learning_rate": 4.99999537063606e-05, + "loss": 7.872, + "step": 104 + }, + { + "epoch": 0.0006244647445047103, + "grad_norm": 4.784586429595947, + "learning_rate": 4.9999952803091654e-05, + "loss": 9.2218, + "step": 105 + }, + { + "epoch": 0.0006304120277857074, + "grad_norm": 4.296675682067871, + "learning_rate": 4.9999951891095474e-05, + "loss": 9.0957, + "step": 106 + }, + { + "epoch": 0.0006363593110667047, + "grad_norm": 3.9155995845794678, + "learning_rate": 4.999995097037207e-05, + "loss": 8.9829, + "step": 107 + }, + { + "epoch": 0.000642306594347702, + "grad_norm": 3.8967478275299072, + "learning_rate": 4.999995004092144e-05, + "loss": 8.2017, + "step": 108 + }, + { + "epoch": 0.0006482538776286992, + "grad_norm": 5.238500595092773, + "learning_rate": 4.999994910274358e-05, + "loss": 7.7976, + "step": 109 + }, + { + "epoch": 0.0006542011609096965, + "grad_norm": 3.7043144702911377, + "learning_rate": 4.9999948155838504e-05, + "loss": 8.3116, + "step": 110 + }, + { + "epoch": 0.0006601484441906937, + "grad_norm": 2.9745211601257324, + "learning_rate": 4.99999472002062e-05, + "loss": 8.69, + "step": 111 + }, + { + "epoch": 0.000666095727471691, + "grad_norm": 3.172652006149292, + "learning_rate": 4.999994623584668e-05, + "loss": 8.6244, + "step": 112 + }, + { + "epoch": 0.0006720430107526882, + "grad_norm": 3.224888801574707, + "learning_rate": 4.999994526275993e-05, + "loss": 8.6823, + "step": 113 + }, + { + "epoch": 0.0006779902940336854, + "grad_norm": 3.53104305267334, + "learning_rate": 4.9999944280945964e-05, + "loss": 8.495, + "step": 114 + }, + { + "epoch": 0.0006839375773146826, + "grad_norm": 3.013505697250366, + "learning_rate": 4.999994329040477e-05, + "loss": 8.4807, + "step": 115 + }, + { + "epoch": 0.0006898848605956799, + "grad_norm": 4.4741339683532715, + "learning_rate": 4.999994229113636e-05, + "loss": 8.94, + "step": 116 + }, + { + "epoch": 0.0006958321438766771, + "grad_norm": 4.78712272644043, + "learning_rate": 4.999994128314072e-05, + "loss": 8.9367, + "step": 117 + }, + { + "epoch": 0.0007017794271576744, + "grad_norm": 3.6983933448791504, + "learning_rate": 4.999994026641787e-05, + "loss": 8.7524, + "step": 118 + }, + { + "epoch": 0.0007077267104386716, + "grad_norm": 3.74997615814209, + "learning_rate": 4.9999939240967784e-05, + "loss": 8.3417, + "step": 119 + }, + { + "epoch": 0.0007136739937196689, + "grad_norm": 3.614593982696533, + "learning_rate": 4.999993820679049e-05, + "loss": 8.4848, + "step": 120 + }, + { + "epoch": 0.000719621277000666, + "grad_norm": 2.903045654296875, + "learning_rate": 4.999993716388597e-05, + "loss": 8.5519, + "step": 121 + }, + { + "epoch": 0.0007255685602816633, + "grad_norm": 3.402444839477539, + "learning_rate": 4.999993611225423e-05, + "loss": 8.2905, + "step": 122 + }, + { + "epoch": 0.0007315158435626606, + "grad_norm": 3.663893938064575, + "learning_rate": 4.9999935051895274e-05, + "loss": 8.4842, + "step": 123 + }, + { + "epoch": 0.0007374631268436578, + "grad_norm": 3.7535622119903564, + "learning_rate": 4.99999339828091e-05, + "loss": 8.4766, + "step": 124 + }, + { + "epoch": 0.0007434104101246551, + "grad_norm": 3.1285574436187744, + "learning_rate": 4.99999329049957e-05, + "loss": 8.3716, + "step": 125 + }, + { + "epoch": 0.0007493576934056523, + "grad_norm": 3.648869752883911, + "learning_rate": 4.9999931818455086e-05, + "loss": 8.3413, + "step": 126 + }, + { + "epoch": 0.0007553049766866496, + "grad_norm": 3.253399133682251, + "learning_rate": 4.9999930723187255e-05, + "loss": 8.0412, + "step": 127 + }, + { + "epoch": 0.0007612522599676468, + "grad_norm": 3.5694124698638916, + "learning_rate": 4.999992961919221e-05, + "loss": 8.0895, + "step": 128 + }, + { + "epoch": 0.000767199543248644, + "grad_norm": 4.106658458709717, + "learning_rate": 4.999992850646994e-05, + "loss": 8.3654, + "step": 129 + }, + { + "epoch": 0.0007731468265296412, + "grad_norm": 4.082829475402832, + "learning_rate": 4.9999927385020455e-05, + "loss": 8.2663, + "step": 130 + }, + { + "epoch": 0.0007790941098106385, + "grad_norm": 4.349386215209961, + "learning_rate": 4.9999926254843753e-05, + "loss": 8.2435, + "step": 131 + }, + { + "epoch": 0.0007850413930916357, + "grad_norm": 3.375697135925293, + "learning_rate": 4.999992511593984e-05, + "loss": 8.0827, + "step": 132 + }, + { + "epoch": 0.000790988676372633, + "grad_norm": 3.2566957473754883, + "learning_rate": 4.999992396830871e-05, + "loss": 8.4891, + "step": 133 + }, + { + "epoch": 0.0007969359596536302, + "grad_norm": 3.791579008102417, + "learning_rate": 4.999992281195036e-05, + "loss": 8.1567, + "step": 134 + }, + { + "epoch": 0.0008028832429346275, + "grad_norm": 3.8741838932037354, + "learning_rate": 4.99999216468648e-05, + "loss": 8.4033, + "step": 135 + }, + { + "epoch": 0.0008088305262156248, + "grad_norm": 4.229452133178711, + "learning_rate": 4.999992047305203e-05, + "loss": 8.3897, + "step": 136 + }, + { + "epoch": 0.0008147778094966219, + "grad_norm": 3.2732088565826416, + "learning_rate": 4.9999919290512034e-05, + "loss": 8.1758, + "step": 137 + }, + { + "epoch": 0.0008207250927776192, + "grad_norm": 3.2048966884613037, + "learning_rate": 4.9999918099244836e-05, + "loss": 8.1459, + "step": 138 + }, + { + "epoch": 0.0008266723760586164, + "grad_norm": 3.8639938831329346, + "learning_rate": 4.999991689925042e-05, + "loss": 7.9437, + "step": 139 + }, + { + "epoch": 0.0008326196593396137, + "grad_norm": 3.297252655029297, + "learning_rate": 4.9999915690528794e-05, + "loss": 8.1751, + "step": 140 + }, + { + "epoch": 0.0008385669426206109, + "grad_norm": 3.878218173980713, + "learning_rate": 4.999991447307995e-05, + "loss": 8.0572, + "step": 141 + }, + { + "epoch": 0.0008445142259016082, + "grad_norm": 3.6870739459991455, + "learning_rate": 4.9999913246903895e-05, + "loss": 8.0958, + "step": 142 + }, + { + "epoch": 0.0008504615091826054, + "grad_norm": 3.1817922592163086, + "learning_rate": 4.9999912012000636e-05, + "loss": 8.2683, + "step": 143 + }, + { + "epoch": 0.0008564087924636027, + "grad_norm": 3.4008772373199463, + "learning_rate": 4.999991076837016e-05, + "loss": 8.4171, + "step": 144 + }, + { + "epoch": 0.0008623560757445998, + "grad_norm": 3.002333641052246, + "learning_rate": 4.999990951601247e-05, + "loss": 8.1149, + "step": 145 + }, + { + "epoch": 0.0008683033590255971, + "grad_norm": 3.51910662651062, + "learning_rate": 4.999990825492757e-05, + "loss": 8.5284, + "step": 146 + }, + { + "epoch": 0.0008742506423065943, + "grad_norm": 2.978875160217285, + "learning_rate": 4.999990698511548e-05, + "loss": 8.4855, + "step": 147 + }, + { + "epoch": 0.0008801979255875916, + "grad_norm": 3.4708774089813232, + "learning_rate": 4.999990570657616e-05, + "loss": 8.333, + "step": 148 + }, + { + "epoch": 0.0008861452088685888, + "grad_norm": 2.994084596633911, + "learning_rate": 4.999990441930963e-05, + "loss": 8.3456, + "step": 149 + }, + { + "epoch": 0.0008920924921495861, + "grad_norm": 3.1295697689056396, + "learning_rate": 4.99999031233159e-05, + "loss": 8.2204, + "step": 150 + }, + { + "epoch": 0.0008980397754305833, + "grad_norm": 3.349720001220703, + "learning_rate": 4.9999901818594966e-05, + "loss": 8.2739, + "step": 151 + }, + { + "epoch": 0.0009039870587115806, + "grad_norm": 3.852964401245117, + "learning_rate": 4.999990050514681e-05, + "loss": 8.4225, + "step": 152 + }, + { + "epoch": 0.0009099343419925777, + "grad_norm": 3.92203950881958, + "learning_rate": 4.9999899182971456e-05, + "loss": 8.2882, + "step": 153 + }, + { + "epoch": 0.000915881625273575, + "grad_norm": 3.9960269927978516, + "learning_rate": 4.99998978520689e-05, + "loss": 8.2091, + "step": 154 + }, + { + "epoch": 0.0009218289085545723, + "grad_norm": 3.952327251434326, + "learning_rate": 4.999989651243913e-05, + "loss": 8.1726, + "step": 155 + }, + { + "epoch": 0.0009277761918355695, + "grad_norm": 3.9594647884368896, + "learning_rate": 4.9999895164082156e-05, + "loss": 8.0241, + "step": 156 + }, + { + "epoch": 0.0009337234751165668, + "grad_norm": 3.1129961013793945, + "learning_rate": 4.999989380699798e-05, + "loss": 8.14, + "step": 157 + }, + { + "epoch": 0.000939670758397564, + "grad_norm": 4.7737860679626465, + "learning_rate": 4.9999892441186604e-05, + "loss": 7.869, + "step": 158 + }, + { + "epoch": 0.0009456180416785613, + "grad_norm": 3.351327657699585, + "learning_rate": 4.9999891066648006e-05, + "loss": 8.1831, + "step": 159 + }, + { + "epoch": 0.0009515653249595584, + "grad_norm": 3.0245375633239746, + "learning_rate": 4.999988968338222e-05, + "loss": 8.3871, + "step": 160 + }, + { + "epoch": 0.0009575126082405557, + "grad_norm": 4.766855716705322, + "learning_rate": 4.999988829138923e-05, + "loss": 8.0078, + "step": 161 + }, + { + "epoch": 0.0009634598915215529, + "grad_norm": 3.975804090499878, + "learning_rate": 4.999988689066903e-05, + "loss": 7.6923, + "step": 162 + }, + { + "epoch": 0.0009694071748025502, + "grad_norm": 4.024605751037598, + "learning_rate": 4.999988548122163e-05, + "loss": 8.2986, + "step": 163 + }, + { + "epoch": 0.0009753544580835474, + "grad_norm": 4.230019569396973, + "learning_rate": 4.999988406304703e-05, + "loss": 8.2903, + "step": 164 + }, + { + "epoch": 0.0009813017413645446, + "grad_norm": 3.972825050354004, + "learning_rate": 4.9999882636145236e-05, + "loss": 8.3589, + "step": 165 + }, + { + "epoch": 0.0009872490246455418, + "grad_norm": 3.6381688117980957, + "learning_rate": 4.999988120051623e-05, + "loss": 8.2648, + "step": 166 + }, + { + "epoch": 0.000993196307926539, + "grad_norm": 4.203462600708008, + "learning_rate": 4.9999879756160025e-05, + "loss": 8.363, + "step": 167 + }, + { + "epoch": 0.0009991435912075363, + "grad_norm": 2.944103479385376, + "learning_rate": 4.9999878303076624e-05, + "loss": 7.9752, + "step": 168 + }, + { + "epoch": 0.0010050908744885336, + "grad_norm": 3.4115283489227295, + "learning_rate": 4.9999876841266025e-05, + "loss": 8.1044, + "step": 169 + }, + { + "epoch": 0.0010110381577695309, + "grad_norm": 4.185582160949707, + "learning_rate": 4.999987537072822e-05, + "loss": 8.0347, + "step": 170 + }, + { + "epoch": 0.0010169854410505281, + "grad_norm": 3.333649158477783, + "learning_rate": 4.999987389146323e-05, + "loss": 8.0545, + "step": 171 + }, + { + "epoch": 0.0010229327243315254, + "grad_norm": 3.7702765464782715, + "learning_rate": 4.999987240347103e-05, + "loss": 7.8936, + "step": 172 + }, + { + "epoch": 0.0010288800076125226, + "grad_norm": 4.113167762756348, + "learning_rate": 4.9999870906751636e-05, + "loss": 7.9447, + "step": 173 + }, + { + "epoch": 0.0010348272908935199, + "grad_norm": 3.370821714401245, + "learning_rate": 4.999986940130505e-05, + "loss": 7.9745, + "step": 174 + }, + { + "epoch": 0.0010407745741745171, + "grad_norm": 3.552391767501831, + "learning_rate": 4.999986788713126e-05, + "loss": 7.8882, + "step": 175 + }, + { + "epoch": 0.0010467218574555144, + "grad_norm": 3.3497536182403564, + "learning_rate": 4.999986636423028e-05, + "loss": 7.8601, + "step": 176 + }, + { + "epoch": 0.0010526691407365116, + "grad_norm": 3.256685733795166, + "learning_rate": 4.9999864832602105e-05, + "loss": 7.8341, + "step": 177 + }, + { + "epoch": 0.001058616424017509, + "grad_norm": 3.028108835220337, + "learning_rate": 4.999986329224674e-05, + "loss": 7.884, + "step": 178 + }, + { + "epoch": 0.0010645637072985061, + "grad_norm": 2.9583778381347656, + "learning_rate": 4.9999861743164165e-05, + "loss": 7.7875, + "step": 179 + }, + { + "epoch": 0.0010705109905795032, + "grad_norm": 3.109215497970581, + "learning_rate": 4.999986018535441e-05, + "loss": 8.4081, + "step": 180 + }, + { + "epoch": 0.0010764582738605004, + "grad_norm": 3.8907759189605713, + "learning_rate": 4.999985861881746e-05, + "loss": 8.0971, + "step": 181 + }, + { + "epoch": 0.0010824055571414977, + "grad_norm": 4.20400857925415, + "learning_rate": 4.9999857043553314e-05, + "loss": 7.9077, + "step": 182 + }, + { + "epoch": 0.001088352840422495, + "grad_norm": 3.580486297607422, + "learning_rate": 4.999985545956198e-05, + "loss": 7.8935, + "step": 183 + }, + { + "epoch": 0.0010943001237034922, + "grad_norm": 3.3833847045898438, + "learning_rate": 4.999985386684345e-05, + "loss": 7.9956, + "step": 184 + }, + { + "epoch": 0.0011002474069844895, + "grad_norm": 2.8848624229431152, + "learning_rate": 4.9999852265397734e-05, + "loss": 8.0718, + "step": 185 + }, + { + "epoch": 0.0011061946902654867, + "grad_norm": 3.8933818340301514, + "learning_rate": 4.999985065522483e-05, + "loss": 8.0517, + "step": 186 + }, + { + "epoch": 0.001112141973546484, + "grad_norm": 3.6559605598449707, + "learning_rate": 4.999984903632473e-05, + "loss": 8.3664, + "step": 187 + }, + { + "epoch": 0.0011180892568274812, + "grad_norm": 3.4633536338806152, + "learning_rate": 4.999984740869744e-05, + "loss": 8.3481, + "step": 188 + }, + { + "epoch": 0.0011240365401084785, + "grad_norm": 3.483020305633545, + "learning_rate": 4.999984577234297e-05, + "loss": 8.3407, + "step": 189 + }, + { + "epoch": 0.0011299838233894757, + "grad_norm": 2.772434711456299, + "learning_rate": 4.999984412726131e-05, + "loss": 8.4524, + "step": 190 + }, + { + "epoch": 0.001135931106670473, + "grad_norm": 3.3341007232666016, + "learning_rate": 4.999984247345246e-05, + "loss": 8.1063, + "step": 191 + }, + { + "epoch": 0.0011418783899514702, + "grad_norm": 3.0063467025756836, + "learning_rate": 4.999984081091642e-05, + "loss": 8.0077, + "step": 192 + }, + { + "epoch": 0.0011478256732324675, + "grad_norm": 2.9670779705047607, + "learning_rate": 4.99998391396532e-05, + "loss": 8.2338, + "step": 193 + }, + { + "epoch": 0.0011537729565134647, + "grad_norm": 3.024505138397217, + "learning_rate": 4.999983745966279e-05, + "loss": 8.1794, + "step": 194 + }, + { + "epoch": 0.0011597202397944618, + "grad_norm": 2.834131956100464, + "learning_rate": 4.9999835770945195e-05, + "loss": 8.2078, + "step": 195 + }, + { + "epoch": 0.001165667523075459, + "grad_norm": 3.555525064468384, + "learning_rate": 4.999983407350042e-05, + "loss": 8.0838, + "step": 196 + }, + { + "epoch": 0.0011716148063564563, + "grad_norm": 3.5013587474823, + "learning_rate": 4.999983236732846e-05, + "loss": 8.092, + "step": 197 + }, + { + "epoch": 0.0011775620896374535, + "grad_norm": 3.3721518516540527, + "learning_rate": 4.9999830652429314e-05, + "loss": 8.1137, + "step": 198 + }, + { + "epoch": 0.0011835093729184508, + "grad_norm": 3.364952564239502, + "learning_rate": 4.9999828928802986e-05, + "loss": 8.1197, + "step": 199 + }, + { + "epoch": 0.001189456656199448, + "grad_norm": 3.691249132156372, + "learning_rate": 4.999982719644948e-05, + "loss": 8.0922, + "step": 200 + }, + { + "epoch": 0.0011954039394804453, + "grad_norm": 6.919185161590576, + "learning_rate": 4.9999825455368785e-05, + "loss": 7.9215, + "step": 201 + }, + { + "epoch": 0.0012013512227614426, + "grad_norm": 3.3332598209381104, + "learning_rate": 4.999982370556091e-05, + "loss": 7.7605, + "step": 202 + }, + { + "epoch": 0.0012072985060424398, + "grad_norm": 2.842517375946045, + "learning_rate": 4.999982194702586e-05, + "loss": 8.0527, + "step": 203 + }, + { + "epoch": 0.001213245789323437, + "grad_norm": 3.086371660232544, + "learning_rate": 4.999982017976364e-05, + "loss": 8.2637, + "step": 204 + }, + { + "epoch": 0.0012191930726044343, + "grad_norm": 3.0870208740234375, + "learning_rate": 4.999981840377422e-05, + "loss": 8.3538, + "step": 205 + }, + { + "epoch": 0.0012251403558854316, + "grad_norm": 3.1244094371795654, + "learning_rate": 4.9999816619057633e-05, + "loss": 8.4604, + "step": 206 + }, + { + "epoch": 0.0012310876391664288, + "grad_norm": 2.7808034420013428, + "learning_rate": 4.999981482561387e-05, + "loss": 8.3227, + "step": 207 + }, + { + "epoch": 0.001237034922447426, + "grad_norm": 2.791182518005371, + "learning_rate": 4.999981302344292e-05, + "loss": 8.1481, + "step": 208 + }, + { + "epoch": 0.0012429822057284233, + "grad_norm": 3.045971632003784, + "learning_rate": 4.99998112125448e-05, + "loss": 7.7842, + "step": 209 + }, + { + "epoch": 0.0012489294890094206, + "grad_norm": 3.2548067569732666, + "learning_rate": 4.99998093929195e-05, + "loss": 7.9935, + "step": 210 + }, + { + "epoch": 0.0012548767722904176, + "grad_norm": 3.5448713302612305, + "learning_rate": 4.999980756456704e-05, + "loss": 8.0323, + "step": 211 + }, + { + "epoch": 0.0012608240555714149, + "grad_norm": 3.717900514602661, + "learning_rate": 4.9999805727487395e-05, + "loss": 8.0532, + "step": 212 + }, + { + "epoch": 0.0012667713388524121, + "grad_norm": 3.2943921089172363, + "learning_rate": 4.9999803881680576e-05, + "loss": 8.0326, + "step": 213 + }, + { + "epoch": 0.0012727186221334094, + "grad_norm": 3.4586269855499268, + "learning_rate": 4.999980202714658e-05, + "loss": 7.8765, + "step": 214 + }, + { + "epoch": 0.0012786659054144067, + "grad_norm": 3.1898810863494873, + "learning_rate": 4.9999800163885414e-05, + "loss": 7.8859, + "step": 215 + }, + { + "epoch": 0.001284613188695404, + "grad_norm": 2.977229595184326, + "learning_rate": 4.9999798291897084e-05, + "loss": 7.8841, + "step": 216 + }, + { + "epoch": 0.0012905604719764012, + "grad_norm": 3.368680000305176, + "learning_rate": 4.999979641118157e-05, + "loss": 7.8055, + "step": 217 + }, + { + "epoch": 0.0012965077552573984, + "grad_norm": 4.295344352722168, + "learning_rate": 4.9999794521738894e-05, + "loss": 7.6456, + "step": 218 + }, + { + "epoch": 0.0013024550385383957, + "grad_norm": 3.985480546951294, + "learning_rate": 4.999979262356904e-05, + "loss": 7.6987, + "step": 219 + }, + { + "epoch": 0.001308402321819393, + "grad_norm": 3.8719842433929443, + "learning_rate": 4.999979071667202e-05, + "loss": 7.6994, + "step": 220 + }, + { + "epoch": 0.0013143496051003902, + "grad_norm": 4.699835300445557, + "learning_rate": 4.999978880104784e-05, + "loss": 8.1815, + "step": 221 + }, + { + "epoch": 0.0013202968883813874, + "grad_norm": 3.9221127033233643, + "learning_rate": 4.9999786876696485e-05, + "loss": 7.8765, + "step": 222 + }, + { + "epoch": 0.0013262441716623847, + "grad_norm": 4.4223504066467285, + "learning_rate": 4.9999784943617964e-05, + "loss": 7.7244, + "step": 223 + }, + { + "epoch": 0.001332191454943382, + "grad_norm": 3.4598348140716553, + "learning_rate": 4.999978300181227e-05, + "loss": 7.7072, + "step": 224 + }, + { + "epoch": 0.0013381387382243792, + "grad_norm": 3.536752223968506, + "learning_rate": 4.999978105127941e-05, + "loss": 7.6337, + "step": 225 + }, + { + "epoch": 0.0013440860215053765, + "grad_norm": 3.6432204246520996, + "learning_rate": 4.99997790920194e-05, + "loss": 7.8078, + "step": 226 + }, + { + "epoch": 0.0013500333047863735, + "grad_norm": 4.8305768966674805, + "learning_rate": 4.999977712403221e-05, + "loss": 7.9003, + "step": 227 + }, + { + "epoch": 0.0013559805880673707, + "grad_norm": 3.773876428604126, + "learning_rate": 4.999977514731786e-05, + "loss": 8.0513, + "step": 228 + }, + { + "epoch": 0.001361927871348368, + "grad_norm": 4.465645790100098, + "learning_rate": 4.999977316187635e-05, + "loss": 7.9847, + "step": 229 + }, + { + "epoch": 0.0013678751546293653, + "grad_norm": 3.9466493129730225, + "learning_rate": 4.9999771167707674e-05, + "loss": 7.9902, + "step": 230 + }, + { + "epoch": 0.0013738224379103625, + "grad_norm": 4.432138919830322, + "learning_rate": 4.9999769164811846e-05, + "loss": 7.8929, + "step": 231 + }, + { + "epoch": 0.0013797697211913598, + "grad_norm": 3.5211949348449707, + "learning_rate": 4.999976715318885e-05, + "loss": 8.1838, + "step": 232 + }, + { + "epoch": 0.001385717004472357, + "grad_norm": 3.0819287300109863, + "learning_rate": 4.9999765132838686e-05, + "loss": 8.2823, + "step": 233 + }, + { + "epoch": 0.0013916642877533543, + "grad_norm": 3.436112880706787, + "learning_rate": 4.9999763103761374e-05, + "loss": 7.7796, + "step": 234 + }, + { + "epoch": 0.0013976115710343515, + "grad_norm": 3.6699061393737793, + "learning_rate": 4.99997610659569e-05, + "loss": 7.5792, + "step": 235 + }, + { + "epoch": 0.0014035588543153488, + "grad_norm": 3.814182758331299, + "learning_rate": 4.999975901942526e-05, + "loss": 7.5631, + "step": 236 + }, + { + "epoch": 0.001409506137596346, + "grad_norm": 3.84110164642334, + "learning_rate": 4.9999756964166465e-05, + "loss": 7.4244, + "step": 237 + }, + { + "epoch": 0.0014154534208773433, + "grad_norm": 3.278045415878296, + "learning_rate": 4.999975490018052e-05, + "loss": 7.9049, + "step": 238 + }, + { + "epoch": 0.0014214007041583405, + "grad_norm": 3.5502712726593018, + "learning_rate": 4.999975282746742e-05, + "loss": 8.0021, + "step": 239 + }, + { + "epoch": 0.0014273479874393378, + "grad_norm": 2.7919108867645264, + "learning_rate": 4.9999750746027153e-05, + "loss": 8.2854, + "step": 240 + }, + { + "epoch": 0.001433295270720335, + "grad_norm": 3.1689581871032715, + "learning_rate": 4.999974865585973e-05, + "loss": 8.3177, + "step": 241 + }, + { + "epoch": 0.001439242554001332, + "grad_norm": 2.728679656982422, + "learning_rate": 4.999974655696517e-05, + "loss": 8.3181, + "step": 242 + }, + { + "epoch": 0.0014451898372823293, + "grad_norm": 3.5175108909606934, + "learning_rate": 4.9999744449343445e-05, + "loss": 8.03, + "step": 243 + }, + { + "epoch": 0.0014511371205633266, + "grad_norm": 3.714219808578491, + "learning_rate": 4.999974233299457e-05, + "loss": 8.0824, + "step": 244 + }, + { + "epoch": 0.0014570844038443239, + "grad_norm": 3.42090106010437, + "learning_rate": 4.9999740207918546e-05, + "loss": 8.0455, + "step": 245 + }, + { + "epoch": 0.001463031687125321, + "grad_norm": 3.035047769546509, + "learning_rate": 4.999973807411537e-05, + "loss": 8.0117, + "step": 246 + }, + { + "epoch": 0.0014689789704063184, + "grad_norm": 3.4878122806549072, + "learning_rate": 4.9999735931585034e-05, + "loss": 8.1368, + "step": 247 + }, + { + "epoch": 0.0014749262536873156, + "grad_norm": 3.648115873336792, + "learning_rate": 4.999973378032756e-05, + "loss": 7.9987, + "step": 248 + }, + { + "epoch": 0.0014808735369683129, + "grad_norm": 3.171255588531494, + "learning_rate": 4.9999731620342936e-05, + "loss": 7.9733, + "step": 249 + }, + { + "epoch": 0.0014868208202493101, + "grad_norm": 3.157804250717163, + "learning_rate": 4.999972945163116e-05, + "loss": 7.8511, + "step": 250 + }, + { + "epoch": 0.0014927681035303074, + "grad_norm": 3.4346978664398193, + "learning_rate": 4.999972727419224e-05, + "loss": 7.9075, + "step": 251 + }, + { + "epoch": 0.0014987153868113046, + "grad_norm": 3.281135082244873, + "learning_rate": 4.9999725088026175e-05, + "loss": 7.876, + "step": 252 + }, + { + "epoch": 0.0015046626700923019, + "grad_norm": 3.1481714248657227, + "learning_rate": 4.9999722893132954e-05, + "loss": 8.1458, + "step": 253 + }, + { + "epoch": 0.0015106099533732991, + "grad_norm": 2.821460247039795, + "learning_rate": 4.99997206895126e-05, + "loss": 7.9141, + "step": 254 + }, + { + "epoch": 0.0015165572366542964, + "grad_norm": 2.887997627258301, + "learning_rate": 4.999971847716509e-05, + "loss": 8.2246, + "step": 255 + }, + { + "epoch": 0.0015225045199352936, + "grad_norm": 2.8097078800201416, + "learning_rate": 4.999971625609044e-05, + "loss": 7.8576, + "step": 256 + }, + { + "epoch": 0.001528451803216291, + "grad_norm": 2.9272890090942383, + "learning_rate": 4.999971402628866e-05, + "loss": 7.6856, + "step": 257 + }, + { + "epoch": 0.001534399086497288, + "grad_norm": 3.487027168273926, + "learning_rate": 4.999971178775973e-05, + "loss": 7.8179, + "step": 258 + }, + { + "epoch": 0.0015403463697782852, + "grad_norm": 3.575681209564209, + "learning_rate": 4.9999709540503656e-05, + "loss": 7.8115, + "step": 259 + }, + { + "epoch": 0.0015462936530592824, + "grad_norm": 3.457756757736206, + "learning_rate": 4.9999707284520435e-05, + "loss": 7.7985, + "step": 260 + }, + { + "epoch": 0.0015522409363402797, + "grad_norm": 3.732728958129883, + "learning_rate": 4.999970501981009e-05, + "loss": 7.8369, + "step": 261 + }, + { + "epoch": 0.001558188219621277, + "grad_norm": 4.1466898918151855, + "learning_rate": 4.99997027463726e-05, + "loss": 8.2435, + "step": 262 + }, + { + "epoch": 0.0015641355029022742, + "grad_norm": 4.028534889221191, + "learning_rate": 4.9999700464207965e-05, + "loss": 8.2338, + "step": 263 + }, + { + "epoch": 0.0015700827861832715, + "grad_norm": 3.7445273399353027, + "learning_rate": 4.99996981733162e-05, + "loss": 8.1182, + "step": 264 + }, + { + "epoch": 0.0015760300694642687, + "grad_norm": 3.455228567123413, + "learning_rate": 4.99996958736973e-05, + "loss": 8.1932, + "step": 265 + }, + { + "epoch": 0.001581977352745266, + "grad_norm": 3.1530332565307617, + "learning_rate": 4.9999693565351256e-05, + "loss": 7.8304, + "step": 266 + }, + { + "epoch": 0.0015879246360262632, + "grad_norm": 3.113161325454712, + "learning_rate": 4.999969124827809e-05, + "loss": 7.6625, + "step": 267 + }, + { + "epoch": 0.0015938719193072605, + "grad_norm": 3.621076822280884, + "learning_rate": 4.999968892247778e-05, + "loss": 8.0983, + "step": 268 + }, + { + "epoch": 0.0015998192025882577, + "grad_norm": 3.533395767211914, + "learning_rate": 4.9999686587950346e-05, + "loss": 7.9564, + "step": 269 + }, + { + "epoch": 0.001605766485869255, + "grad_norm": 3.6486849784851074, + "learning_rate": 4.999968424469577e-05, + "loss": 7.9864, + "step": 270 + }, + { + "epoch": 0.0016117137691502522, + "grad_norm": 3.223167657852173, + "learning_rate": 4.999968189271407e-05, + "loss": 7.8516, + "step": 271 + }, + { + "epoch": 0.0016176610524312495, + "grad_norm": 3.282062530517578, + "learning_rate": 4.999967953200523e-05, + "loss": 7.9247, + "step": 272 + }, + { + "epoch": 0.0016236083357122465, + "grad_norm": 2.8589930534362793, + "learning_rate": 4.999967716256927e-05, + "loss": 7.8871, + "step": 273 + }, + { + "epoch": 0.0016295556189932438, + "grad_norm": 3.136882781982422, + "learning_rate": 4.9999674784406174e-05, + "loss": 7.8793, + "step": 274 + }, + { + "epoch": 0.001635502902274241, + "grad_norm": 3.9103915691375732, + "learning_rate": 4.999967239751595e-05, + "loss": 7.9005, + "step": 275 + }, + { + "epoch": 0.0016414501855552383, + "grad_norm": 4.40267276763916, + "learning_rate": 4.99996700018986e-05, + "loss": 7.9247, + "step": 276 + }, + { + "epoch": 0.0016473974688362356, + "grad_norm": 3.6620242595672607, + "learning_rate": 4.9999667597554136e-05, + "loss": 8.0719, + "step": 277 + }, + { + "epoch": 0.0016533447521172328, + "grad_norm": 3.1278858184814453, + "learning_rate": 4.999966518448253e-05, + "loss": 8.0822, + "step": 278 + }, + { + "epoch": 0.00165929203539823, + "grad_norm": 3.321831464767456, + "learning_rate": 4.9999662762683805e-05, + "loss": 8.1266, + "step": 279 + }, + { + "epoch": 0.0016652393186792273, + "grad_norm": 3.4116811752319336, + "learning_rate": 4.999966033215795e-05, + "loss": 8.2159, + "step": 280 + }, + { + "epoch": 0.0016711866019602246, + "grad_norm": 3.58381724357605, + "learning_rate": 4.999965789290498e-05, + "loss": 8.0275, + "step": 281 + }, + { + "epoch": 0.0016771338852412218, + "grad_norm": 3.0357518196105957, + "learning_rate": 4.9999655444924884e-05, + "loss": 8.1171, + "step": 282 + }, + { + "epoch": 0.001683081168522219, + "grad_norm": 3.237764596939087, + "learning_rate": 4.999965298821767e-05, + "loss": 7.822, + "step": 283 + }, + { + "epoch": 0.0016890284518032163, + "grad_norm": 3.0861873626708984, + "learning_rate": 4.999965052278334e-05, + "loss": 7.7991, + "step": 284 + }, + { + "epoch": 0.0016949757350842136, + "grad_norm": 2.8045542240142822, + "learning_rate": 4.999964804862187e-05, + "loss": 7.9659, + "step": 285 + }, + { + "epoch": 0.0017009230183652108, + "grad_norm": 3.1282641887664795, + "learning_rate": 4.9999645565733297e-05, + "loss": 7.8354, + "step": 286 + }, + { + "epoch": 0.001706870301646208, + "grad_norm": 2.980001211166382, + "learning_rate": 4.999964307411761e-05, + "loss": 7.806, + "step": 287 + }, + { + "epoch": 0.0017128175849272054, + "grad_norm": 3.114238977432251, + "learning_rate": 4.99996405737748e-05, + "loss": 7.6173, + "step": 288 + }, + { + "epoch": 0.0017187648682082024, + "grad_norm": 2.6732640266418457, + "learning_rate": 4.9999638064704866e-05, + "loss": 7.5944, + "step": 289 + }, + { + "epoch": 0.0017247121514891996, + "grad_norm": 3.2139906883239746, + "learning_rate": 4.999963554690783e-05, + "loss": 7.5738, + "step": 290 + }, + { + "epoch": 0.001730659434770197, + "grad_norm": 3.0964555740356445, + "learning_rate": 4.999963302038368e-05, + "loss": 7.4431, + "step": 291 + }, + { + "epoch": 0.0017366067180511942, + "grad_norm": 3.0611374378204346, + "learning_rate": 4.99996304851324e-05, + "loss": 7.3748, + "step": 292 + }, + { + "epoch": 0.0017425540013321914, + "grad_norm": 2.88114333152771, + "learning_rate": 4.999962794115402e-05, + "loss": 7.3554, + "step": 293 + }, + { + "epoch": 0.0017485012846131887, + "grad_norm": 2.895141363143921, + "learning_rate": 4.999962538844852e-05, + "loss": 7.2801, + "step": 294 + }, + { + "epoch": 0.001754448567894186, + "grad_norm": 3.0645008087158203, + "learning_rate": 4.9999622827015914e-05, + "loss": 7.1753, + "step": 295 + }, + { + "epoch": 0.0017603958511751832, + "grad_norm": 3.0750465393066406, + "learning_rate": 4.99996202568562e-05, + "loss": 7.1905, + "step": 296 + }, + { + "epoch": 0.0017663431344561804, + "grad_norm": 3.1322436332702637, + "learning_rate": 4.9999617677969374e-05, + "loss": 7.0851, + "step": 297 + }, + { + "epoch": 0.0017722904177371777, + "grad_norm": 3.8287153244018555, + "learning_rate": 4.999961509035544e-05, + "loss": 7.0842, + "step": 298 + }, + { + "epoch": 0.001778237701018175, + "grad_norm": 2.874312162399292, + "learning_rate": 4.9999612494014403e-05, + "loss": 6.9588, + "step": 299 + }, + { + "epoch": 0.0017841849842991722, + "grad_norm": 2.916250705718994, + "learning_rate": 4.999960988894625e-05, + "loss": 7.1342, + "step": 300 + }, + { + "epoch": 0.0017901322675801694, + "grad_norm": 2.71624755859375, + "learning_rate": 4.9999607275151e-05, + "loss": 7.0418, + "step": 301 + }, + { + "epoch": 0.0017960795508611667, + "grad_norm": 2.655630350112915, + "learning_rate": 4.999960465262864e-05, + "loss": 6.937, + "step": 302 + }, + { + "epoch": 0.001802026834142164, + "grad_norm": 2.8819122314453125, + "learning_rate": 4.999960202137918e-05, + "loss": 7.0116, + "step": 303 + }, + { + "epoch": 0.0018079741174231612, + "grad_norm": 2.909701108932495, + "learning_rate": 4.999959938140262e-05, + "loss": 6.9588, + "step": 304 + }, + { + "epoch": 0.0018139214007041582, + "grad_norm": 3.276395797729492, + "learning_rate": 4.999959673269895e-05, + "loss": 6.9066, + "step": 305 + }, + { + "epoch": 0.0018198686839851555, + "grad_norm": 2.8774867057800293, + "learning_rate": 4.9999594075268186e-05, + "loss": 7.0112, + "step": 306 + }, + { + "epoch": 0.0018258159672661528, + "grad_norm": 2.9667818546295166, + "learning_rate": 4.999959140911032e-05, + "loss": 7.1467, + "step": 307 + }, + { + "epoch": 0.00183176325054715, + "grad_norm": 6.6612958908081055, + "learning_rate": 4.999958873422536e-05, + "loss": 8.4457, + "step": 308 + }, + { + "epoch": 0.0018377105338281473, + "grad_norm": 4.234557628631592, + "learning_rate": 4.999958605061329e-05, + "loss": 8.904, + "step": 309 + }, + { + "epoch": 0.0018436578171091445, + "grad_norm": 4.049502372741699, + "learning_rate": 4.999958335827413e-05, + "loss": 7.5174, + "step": 310 + }, + { + "epoch": 0.0018496051003901418, + "grad_norm": 3.574474334716797, + "learning_rate": 4.999958065720787e-05, + "loss": 8.6537, + "step": 311 + }, + { + "epoch": 0.001855552383671139, + "grad_norm": 3.6154026985168457, + "learning_rate": 4.9999577947414515e-05, + "loss": 8.5833, + "step": 312 + }, + { + "epoch": 0.0018614996669521363, + "grad_norm": 2.9204158782958984, + "learning_rate": 4.999957522889407e-05, + "loss": 8.5486, + "step": 313 + }, + { + "epoch": 0.0018674469502331335, + "grad_norm": 3.095310688018799, + "learning_rate": 4.999957250164653e-05, + "loss": 8.3855, + "step": 314 + }, + { + "epoch": 0.0018733942335141308, + "grad_norm": 3.872267723083496, + "learning_rate": 4.999956976567189e-05, + "loss": 8.2715, + "step": 315 + }, + { + "epoch": 0.001879341516795128, + "grad_norm": 3.5560686588287354, + "learning_rate": 4.9999567020970175e-05, + "loss": 8.1571, + "step": 316 + }, + { + "epoch": 0.0018852888000761253, + "grad_norm": 2.6759164333343506, + "learning_rate": 4.9999564267541356e-05, + "loss": 8.4072, + "step": 317 + }, + { + "epoch": 0.0018912360833571226, + "grad_norm": 4.034712791442871, + "learning_rate": 4.999956150538545e-05, + "loss": 7.7622, + "step": 318 + }, + { + "epoch": 0.0018971833666381198, + "grad_norm": 3.8927831649780273, + "learning_rate": 4.999955873450246e-05, + "loss": 7.5012, + "step": 319 + }, + { + "epoch": 0.0019031306499191168, + "grad_norm": 3.4422812461853027, + "learning_rate": 4.999955595489237e-05, + "loss": 7.6894, + "step": 320 + }, + { + "epoch": 0.001909077933200114, + "grad_norm": 3.0367283821105957, + "learning_rate": 4.999955316655521e-05, + "loss": 7.8151, + "step": 321 + }, + { + "epoch": 0.0019150252164811114, + "grad_norm": 3.7553489208221436, + "learning_rate": 4.9999550369490955e-05, + "loss": 8.0462, + "step": 322 + }, + { + "epoch": 0.0019209724997621086, + "grad_norm": 3.432591438293457, + "learning_rate": 4.999954756369962e-05, + "loss": 7.8782, + "step": 323 + }, + { + "epoch": 0.0019269197830431059, + "grad_norm": 2.7325966358184814, + "learning_rate": 4.9999544749181196e-05, + "loss": 7.9045, + "step": 324 + }, + { + "epoch": 0.0019328670663241031, + "grad_norm": 4.31963586807251, + "learning_rate": 4.9999541925935686e-05, + "loss": 7.7791, + "step": 325 + }, + { + "epoch": 0.0019388143496051004, + "grad_norm": 2.840189218521118, + "learning_rate": 4.999953909396311e-05, + "loss": 7.8334, + "step": 326 + }, + { + "epoch": 0.0019447616328860976, + "grad_norm": 3.2388041019439697, + "learning_rate": 4.9999536253263434e-05, + "loss": 7.6756, + "step": 327 + }, + { + "epoch": 0.0019507089161670949, + "grad_norm": 3.6291563510894775, + "learning_rate": 4.999953340383669e-05, + "loss": 7.6511, + "step": 328 + }, + { + "epoch": 0.001956656199448092, + "grad_norm": 3.35703706741333, + "learning_rate": 4.999953054568287e-05, + "loss": 7.6382, + "step": 329 + }, + { + "epoch": 0.001962603482729089, + "grad_norm": 3.117281198501587, + "learning_rate": 4.999952767880196e-05, + "loss": 7.6233, + "step": 330 + }, + { + "epoch": 0.0019685507660100864, + "grad_norm": 2.8385257720947266, + "learning_rate": 4.999952480319398e-05, + "loss": 7.6594, + "step": 331 + }, + { + "epoch": 0.0019744980492910837, + "grad_norm": 2.5914418697357178, + "learning_rate": 4.999952191885893e-05, + "loss": 8.2647, + "step": 332 + }, + { + "epoch": 0.001980445332572081, + "grad_norm": 2.5847742557525635, + "learning_rate": 4.9999519025796795e-05, + "loss": 8.339, + "step": 333 + }, + { + "epoch": 0.001986392615853078, + "grad_norm": 2.7022132873535156, + "learning_rate": 4.999951612400759e-05, + "loss": 7.9114, + "step": 334 + }, + { + "epoch": 0.0019923398991340754, + "grad_norm": 3.0290884971618652, + "learning_rate": 4.999951321349131e-05, + "loss": 7.4531, + "step": 335 + }, + { + "epoch": 0.0019982871824150727, + "grad_norm": 2.8910324573516846, + "learning_rate": 4.999951029424796e-05, + "loss": 7.398, + "step": 336 + }, + { + "epoch": 0.00200423446569607, + "grad_norm": 2.8917605876922607, + "learning_rate": 4.9999507366277545e-05, + "loss": 7.48, + "step": 337 + }, + { + "epoch": 0.002010181748977067, + "grad_norm": 2.8957982063293457, + "learning_rate": 4.999950442958005e-05, + "loss": 7.8662, + "step": 338 + }, + { + "epoch": 0.0020161290322580645, + "grad_norm": 3.562232255935669, + "learning_rate": 4.9999501484155485e-05, + "loss": 7.8388, + "step": 339 + }, + { + "epoch": 0.0020220763155390617, + "grad_norm": 2.51676607131958, + "learning_rate": 4.9999498530003866e-05, + "loss": 8.2834, + "step": 340 + }, + { + "epoch": 0.002028023598820059, + "grad_norm": 2.326110363006592, + "learning_rate": 4.999949556712517e-05, + "loss": 8.2528, + "step": 341 + }, + { + "epoch": 0.0020339708821010562, + "grad_norm": 2.7621335983276367, + "learning_rate": 4.999949259551941e-05, + "loss": 7.9791, + "step": 342 + }, + { + "epoch": 0.0020399181653820535, + "grad_norm": 3.045431137084961, + "learning_rate": 4.999948961518659e-05, + "loss": 7.8575, + "step": 343 + }, + { + "epoch": 0.0020458654486630507, + "grad_norm": 3.1940131187438965, + "learning_rate": 4.9999486626126703e-05, + "loss": 7.8581, + "step": 344 + }, + { + "epoch": 0.002051812731944048, + "grad_norm": 2.964136838912964, + "learning_rate": 4.999948362833975e-05, + "loss": 7.9656, + "step": 345 + }, + { + "epoch": 0.0020577600152250452, + "grad_norm": 3.167573928833008, + "learning_rate": 4.999948062182574e-05, + "loss": 7.7448, + "step": 346 + }, + { + "epoch": 0.0020637072985060425, + "grad_norm": 3.062666177749634, + "learning_rate": 4.9999477606584666e-05, + "loss": 7.7655, + "step": 347 + }, + { + "epoch": 0.0020696545817870397, + "grad_norm": 3.1097402572631836, + "learning_rate": 4.999947458261653e-05, + "loss": 7.643, + "step": 348 + }, + { + "epoch": 0.002075601865068037, + "grad_norm": 3.1663928031921387, + "learning_rate": 4.999947154992135e-05, + "loss": 7.8348, + "step": 349 + }, + { + "epoch": 0.0020815491483490343, + "grad_norm": 2.8295886516571045, + "learning_rate": 4.99994685084991e-05, + "loss": 7.7752, + "step": 350 + }, + { + "epoch": 0.0020874964316300315, + "grad_norm": 2.7384233474731445, + "learning_rate": 4.99994654583498e-05, + "loss": 7.7644, + "step": 351 + }, + { + "epoch": 0.0020934437149110288, + "grad_norm": 2.6654486656188965, + "learning_rate": 4.999946239947344e-05, + "loss": 7.7489, + "step": 352 + }, + { + "epoch": 0.002099390998192026, + "grad_norm": 2.8949942588806152, + "learning_rate": 4.999945933187003e-05, + "loss": 7.7105, + "step": 353 + }, + { + "epoch": 0.0021053382814730233, + "grad_norm": 2.590036630630493, + "learning_rate": 4.999945625553957e-05, + "loss": 7.6821, + "step": 354 + }, + { + "epoch": 0.0021112855647540205, + "grad_norm": 3.4601457118988037, + "learning_rate": 4.999945317048205e-05, + "loss": 7.3552, + "step": 355 + }, + { + "epoch": 0.002117232848035018, + "grad_norm": 4.022705078125, + "learning_rate": 4.999945007669748e-05, + "loss": 7.0281, + "step": 356 + }, + { + "epoch": 0.002123180131316015, + "grad_norm": 3.249699592590332, + "learning_rate": 4.999944697418587e-05, + "loss": 7.9279, + "step": 357 + }, + { + "epoch": 0.0021291274145970123, + "grad_norm": 2.8424601554870605, + "learning_rate": 4.99994438629472e-05, + "loss": 8.1485, + "step": 358 + }, + { + "epoch": 0.002135074697878009, + "grad_norm": 3.0473172664642334, + "learning_rate": 4.9999440742981486e-05, + "loss": 8.0877, + "step": 359 + }, + { + "epoch": 0.0021410219811590064, + "grad_norm": 3.0614171028137207, + "learning_rate": 4.9999437614288726e-05, + "loss": 7.7817, + "step": 360 + }, + { + "epoch": 0.0021469692644400036, + "grad_norm": 3.309464931488037, + "learning_rate": 4.9999434476868925e-05, + "loss": 7.857, + "step": 361 + }, + { + "epoch": 0.002152916547721001, + "grad_norm": 3.031921148300171, + "learning_rate": 4.999943133072207e-05, + "loss": 7.6393, + "step": 362 + }, + { + "epoch": 0.002158863831001998, + "grad_norm": 3.3756978511810303, + "learning_rate": 4.999942817584818e-05, + "loss": 7.7422, + "step": 363 + }, + { + "epoch": 0.0021648111142829954, + "grad_norm": 3.53362774848938, + "learning_rate": 4.999942501224724e-05, + "loss": 7.9388, + "step": 364 + }, + { + "epoch": 0.0021707583975639926, + "grad_norm": 3.4082882404327393, + "learning_rate": 4.999942183991927e-05, + "loss": 7.3578, + "step": 365 + }, + { + "epoch": 0.00217670568084499, + "grad_norm": 4.035211086273193, + "learning_rate": 4.999941865886425e-05, + "loss": 7.7833, + "step": 366 + }, + { + "epoch": 0.002182652964125987, + "grad_norm": 3.0394630432128906, + "learning_rate": 4.99994154690822e-05, + "loss": 7.9392, + "step": 367 + }, + { + "epoch": 0.0021886002474069844, + "grad_norm": 3.088926076889038, + "learning_rate": 4.99994122705731e-05, + "loss": 7.8149, + "step": 368 + }, + { + "epoch": 0.0021945475306879817, + "grad_norm": 2.3173277378082275, + "learning_rate": 4.9999409063336976e-05, + "loss": 8.2211, + "step": 369 + }, + { + "epoch": 0.002200494813968979, + "grad_norm": 2.9960854053497314, + "learning_rate": 4.9999405847373815e-05, + "loss": 7.6764, + "step": 370 + }, + { + "epoch": 0.002206442097249976, + "grad_norm": 2.841848134994507, + "learning_rate": 4.999940262268361e-05, + "loss": 7.9418, + "step": 371 + }, + { + "epoch": 0.0022123893805309734, + "grad_norm": 3.748779058456421, + "learning_rate": 4.999939938926638e-05, + "loss": 7.7843, + "step": 372 + }, + { + "epoch": 0.0022183366638119707, + "grad_norm": 2.8345019817352295, + "learning_rate": 4.999939614712212e-05, + "loss": 7.592, + "step": 373 + }, + { + "epoch": 0.002224283947092968, + "grad_norm": 3.12503719329834, + "learning_rate": 4.9999392896250826e-05, + "loss": 7.9543, + "step": 374 + }, + { + "epoch": 0.002230231230373965, + "grad_norm": 2.7812912464141846, + "learning_rate": 4.99993896366525e-05, + "loss": 7.8738, + "step": 375 + }, + { + "epoch": 0.0022361785136549624, + "grad_norm": 2.9477410316467285, + "learning_rate": 4.9999386368327144e-05, + "loss": 7.7738, + "step": 376 + }, + { + "epoch": 0.0022421257969359597, + "grad_norm": 2.305204391479492, + "learning_rate": 4.999938309127477e-05, + "loss": 7.9123, + "step": 377 + }, + { + "epoch": 0.002248073080216957, + "grad_norm": 3.3839781284332275, + "learning_rate": 4.999937980549536e-05, + "loss": 7.8542, + "step": 378 + }, + { + "epoch": 0.002254020363497954, + "grad_norm": 3.6973462104797363, + "learning_rate": 4.9999376510988924e-05, + "loss": 7.6953, + "step": 379 + }, + { + "epoch": 0.0022599676467789515, + "grad_norm": 3.8176333904266357, + "learning_rate": 4.999937320775547e-05, + "loss": 7.6548, + "step": 380 + }, + { + "epoch": 0.0022659149300599487, + "grad_norm": 3.0237386226654053, + "learning_rate": 4.999936989579499e-05, + "loss": 7.7843, + "step": 381 + }, + { + "epoch": 0.002271862213340946, + "grad_norm": 2.699695348739624, + "learning_rate": 4.999936657510749e-05, + "loss": 7.8841, + "step": 382 + }, + { + "epoch": 0.0022778094966219432, + "grad_norm": 3.7468206882476807, + "learning_rate": 4.9999363245692965e-05, + "loss": 7.8069, + "step": 383 + }, + { + "epoch": 0.0022837567799029405, + "grad_norm": 3.1074821949005127, + "learning_rate": 4.999935990755142e-05, + "loss": 7.8392, + "step": 384 + }, + { + "epoch": 0.0022897040631839377, + "grad_norm": 2.420884609222412, + "learning_rate": 4.999935656068287e-05, + "loss": 7.9238, + "step": 385 + }, + { + "epoch": 0.002295651346464935, + "grad_norm": 3.1354825496673584, + "learning_rate": 4.9999353205087296e-05, + "loss": 7.9766, + "step": 386 + }, + { + "epoch": 0.0023015986297459322, + "grad_norm": 2.7911901473999023, + "learning_rate": 4.9999349840764695e-05, + "loss": 7.9118, + "step": 387 + }, + { + "epoch": 0.0023075459130269295, + "grad_norm": 2.59529447555542, + "learning_rate": 4.999934646771509e-05, + "loss": 7.8839, + "step": 388 + }, + { + "epoch": 0.0023134931963079267, + "grad_norm": 4.121276378631592, + "learning_rate": 4.999934308593848e-05, + "loss": 7.8406, + "step": 389 + }, + { + "epoch": 0.0023194404795889236, + "grad_norm": 2.9091265201568604, + "learning_rate": 4.999933969543485e-05, + "loss": 7.86, + "step": 390 + }, + { + "epoch": 0.002325387762869921, + "grad_norm": 3.0700483322143555, + "learning_rate": 4.9999336296204195e-05, + "loss": 7.8214, + "step": 391 + }, + { + "epoch": 0.002331335046150918, + "grad_norm": 3.3008790016174316, + "learning_rate": 4.999933288824654e-05, + "loss": 7.5863, + "step": 392 + }, + { + "epoch": 0.0023372823294319153, + "grad_norm": 3.1414108276367188, + "learning_rate": 4.999932947156188e-05, + "loss": 7.5815, + "step": 393 + }, + { + "epoch": 0.0023432296127129126, + "grad_norm": 2.6881701946258545, + "learning_rate": 4.999932604615021e-05, + "loss": 7.959, + "step": 394 + }, + { + "epoch": 0.00234917689599391, + "grad_norm": 2.45609712600708, + "learning_rate": 4.9999322612011534e-05, + "loss": 7.9668, + "step": 395 + }, + { + "epoch": 0.002355124179274907, + "grad_norm": 3.1126747131347656, + "learning_rate": 4.999931916914585e-05, + "loss": 7.774, + "step": 396 + }, + { + "epoch": 0.0023610714625559043, + "grad_norm": 2.806708574295044, + "learning_rate": 4.999931571755316e-05, + "loss": 7.6297, + "step": 397 + }, + { + "epoch": 0.0023670187458369016, + "grad_norm": 3.220013380050659, + "learning_rate": 4.999931225723348e-05, + "loss": 7.3856, + "step": 398 + }, + { + "epoch": 0.002372966029117899, + "grad_norm": 3.0159943103790283, + "learning_rate": 4.9999308788186786e-05, + "loss": 7.3822, + "step": 399 + }, + { + "epoch": 0.002378913312398896, + "grad_norm": 3.1066205501556396, + "learning_rate": 4.9999305310413094e-05, + "loss": 7.3905, + "step": 400 + }, + { + "epoch": 0.0023848605956798934, + "grad_norm": 2.8004367351531982, + "learning_rate": 4.99993018239124e-05, + "loss": 7.8548, + "step": 401 + }, + { + "epoch": 0.0023908078789608906, + "grad_norm": 3.004378318786621, + "learning_rate": 4.999929832868471e-05, + "loss": 7.7846, + "step": 402 + }, + { + "epoch": 0.002396755162241888, + "grad_norm": 3.42901349067688, + "learning_rate": 4.9999294824730025e-05, + "loss": 7.9188, + "step": 403 + }, + { + "epoch": 0.002402702445522885, + "grad_norm": 3.7258527278900146, + "learning_rate": 4.9999291312048343e-05, + "loss": 7.7302, + "step": 404 + }, + { + "epoch": 0.0024086497288038824, + "grad_norm": 4.215145111083984, + "learning_rate": 4.999928779063967e-05, + "loss": 7.6597, + "step": 405 + }, + { + "epoch": 0.0024145970120848796, + "grad_norm": 3.157273769378662, + "learning_rate": 4.9999284260504004e-05, + "loss": 7.7262, + "step": 406 + }, + { + "epoch": 0.002420544295365877, + "grad_norm": 2.9977381229400635, + "learning_rate": 4.999928072164135e-05, + "loss": 7.72, + "step": 407 + }, + { + "epoch": 0.002426491578646874, + "grad_norm": 2.791682720184326, + "learning_rate": 4.9999277174051696e-05, + "loss": 7.8022, + "step": 408 + }, + { + "epoch": 0.0024324388619278714, + "grad_norm": 3.4143035411834717, + "learning_rate": 4.999927361773506e-05, + "loss": 7.5116, + "step": 409 + }, + { + "epoch": 0.0024383861452088687, + "grad_norm": 3.3458821773529053, + "learning_rate": 4.9999270052691425e-05, + "loss": 7.4337, + "step": 410 + }, + { + "epoch": 0.002444333428489866, + "grad_norm": 3.3339595794677734, + "learning_rate": 4.999926647892081e-05, + "loss": 7.7345, + "step": 411 + }, + { + "epoch": 0.002450280711770863, + "grad_norm": 4.285780429840088, + "learning_rate": 4.999926289642321e-05, + "loss": 7.9388, + "step": 412 + }, + { + "epoch": 0.0024562279950518604, + "grad_norm": 3.9473414421081543, + "learning_rate": 4.9999259305198624e-05, + "loss": 7.6038, + "step": 413 + }, + { + "epoch": 0.0024621752783328577, + "grad_norm": 3.504227638244629, + "learning_rate": 4.999925570524706e-05, + "loss": 7.4818, + "step": 414 + }, + { + "epoch": 0.002468122561613855, + "grad_norm": 3.2182157039642334, + "learning_rate": 4.999925209656851e-05, + "loss": 7.3493, + "step": 415 + }, + { + "epoch": 0.002474069844894852, + "grad_norm": 3.1944262981414795, + "learning_rate": 4.999924847916297e-05, + "loss": 7.3646, + "step": 416 + }, + { + "epoch": 0.0024800171281758494, + "grad_norm": 2.957244634628296, + "learning_rate": 4.999924485303047e-05, + "loss": 7.4403, + "step": 417 + }, + { + "epoch": 0.0024859644114568467, + "grad_norm": 2.971285343170166, + "learning_rate": 4.999924121817098e-05, + "loss": 7.7266, + "step": 418 + }, + { + "epoch": 0.002491911694737844, + "grad_norm": 4.029009819030762, + "learning_rate": 4.999923757458451e-05, + "loss": 7.3919, + "step": 419 + }, + { + "epoch": 0.002497858978018841, + "grad_norm": 3.9034767150878906, + "learning_rate": 4.999923392227107e-05, + "loss": 7.2349, + "step": 420 + }, + { + "epoch": 0.002503806261299838, + "grad_norm": 3.23218035697937, + "learning_rate": 4.9999230261230656e-05, + "loss": 7.5146, + "step": 421 + }, + { + "epoch": 0.0025097535445808353, + "grad_norm": 3.193225622177124, + "learning_rate": 4.9999226591463265e-05, + "loss": 7.1699, + "step": 422 + }, + { + "epoch": 0.0025157008278618325, + "grad_norm": 2.9796435832977295, + "learning_rate": 4.999922291296891e-05, + "loss": 7.5719, + "step": 423 + }, + { + "epoch": 0.0025216481111428298, + "grad_norm": 2.6746885776519775, + "learning_rate": 4.999921922574758e-05, + "loss": 7.8086, + "step": 424 + }, + { + "epoch": 0.002527595394423827, + "grad_norm": 3.0622920989990234, + "learning_rate": 4.999921552979928e-05, + "loss": 7.3233, + "step": 425 + }, + { + "epoch": 0.0025335426777048243, + "grad_norm": 3.0908501148223877, + "learning_rate": 4.999921182512402e-05, + "loss": 7.2582, + "step": 426 + }, + { + "epoch": 0.0025394899609858215, + "grad_norm": 2.6913537979125977, + "learning_rate": 4.999920811172178e-05, + "loss": 7.6643, + "step": 427 + }, + { + "epoch": 0.002545437244266819, + "grad_norm": 2.7793848514556885, + "learning_rate": 4.999920438959258e-05, + "loss": 7.9445, + "step": 428 + }, + { + "epoch": 0.002551384527547816, + "grad_norm": 2.741617202758789, + "learning_rate": 4.999920065873642e-05, + "loss": 8.0755, + "step": 429 + }, + { + "epoch": 0.0025573318108288133, + "grad_norm": 2.7102227210998535, + "learning_rate": 4.999919691915329e-05, + "loss": 7.8908, + "step": 430 + }, + { + "epoch": 0.0025632790941098106, + "grad_norm": 2.687788248062134, + "learning_rate": 4.9999193170843206e-05, + "loss": 7.9025, + "step": 431 + }, + { + "epoch": 0.002569226377390808, + "grad_norm": 2.923664093017578, + "learning_rate": 4.999918941380616e-05, + "loss": 7.9331, + "step": 432 + }, + { + "epoch": 0.002575173660671805, + "grad_norm": 2.934735059738159, + "learning_rate": 4.999918564804215e-05, + "loss": 7.722, + "step": 433 + }, + { + "epoch": 0.0025811209439528023, + "grad_norm": 3.8156228065490723, + "learning_rate": 4.999918187355119e-05, + "loss": 7.9392, + "step": 434 + }, + { + "epoch": 0.0025870682272337996, + "grad_norm": 2.333798408508301, + "learning_rate": 4.999917809033327e-05, + "loss": 7.9093, + "step": 435 + }, + { + "epoch": 0.002593015510514797, + "grad_norm": 2.078932046890259, + "learning_rate": 4.99991742983884e-05, + "loss": 7.8484, + "step": 436 + }, + { + "epoch": 0.002598962793795794, + "grad_norm": 2.433375835418701, + "learning_rate": 4.999917049771657e-05, + "loss": 7.9124, + "step": 437 + }, + { + "epoch": 0.0026049100770767913, + "grad_norm": 3.1881024837493896, + "learning_rate": 4.999916668831779e-05, + "loss": 7.3966, + "step": 438 + }, + { + "epoch": 0.0026108573603577886, + "grad_norm": 2.4724855422973633, + "learning_rate": 4.9999162870192065e-05, + "loss": 7.535, + "step": 439 + }, + { + "epoch": 0.002616804643638786, + "grad_norm": 2.8757777214050293, + "learning_rate": 4.999915904333938e-05, + "loss": 7.6728, + "step": 440 + }, + { + "epoch": 0.002622751926919783, + "grad_norm": 3.5439565181732178, + "learning_rate": 4.999915520775975e-05, + "loss": 7.5308, + "step": 441 + }, + { + "epoch": 0.0026286992102007804, + "grad_norm": 2.8345577716827393, + "learning_rate": 4.999915136345318e-05, + "loss": 7.7083, + "step": 442 + }, + { + "epoch": 0.0026346464934817776, + "grad_norm": 3.0842509269714355, + "learning_rate": 4.999914751041965e-05, + "loss": 7.9281, + "step": 443 + }, + { + "epoch": 0.002640593776762775, + "grad_norm": 3.0017757415771484, + "learning_rate": 4.999914364865919e-05, + "loss": 7.4727, + "step": 444 + }, + { + "epoch": 0.002646541060043772, + "grad_norm": 2.637838125228882, + "learning_rate": 4.9999139778171785e-05, + "loss": 7.5284, + "step": 445 + }, + { + "epoch": 0.0026524883433247694, + "grad_norm": 2.7749550342559814, + "learning_rate": 4.999913589895743e-05, + "loss": 7.7006, + "step": 446 + }, + { + "epoch": 0.0026584356266057666, + "grad_norm": 3.1636059284210205, + "learning_rate": 4.9999132011016146e-05, + "loss": 7.6441, + "step": 447 + }, + { + "epoch": 0.002664382909886764, + "grad_norm": 2.623776435852051, + "learning_rate": 4.9999128114347913e-05, + "loss": 7.8027, + "step": 448 + }, + { + "epoch": 0.002670330193167761, + "grad_norm": 2.803612232208252, + "learning_rate": 4.9999124208952755e-05, + "loss": 7.553, + "step": 449 + }, + { + "epoch": 0.0026762774764487584, + "grad_norm": 3.3169047832489014, + "learning_rate": 4.9999120294830656e-05, + "loss": 8.0965, + "step": 450 + }, + { + "epoch": 0.0026822247597297556, + "grad_norm": 3.9928581714630127, + "learning_rate": 4.999911637198161e-05, + "loss": 7.8152, + "step": 451 + }, + { + "epoch": 0.002688172043010753, + "grad_norm": 2.8126320838928223, + "learning_rate": 4.9999112440405646e-05, + "loss": 7.4843, + "step": 452 + }, + { + "epoch": 0.0026941193262917497, + "grad_norm": 2.773427963256836, + "learning_rate": 4.999910850010275e-05, + "loss": 7.7074, + "step": 453 + }, + { + "epoch": 0.002700066609572747, + "grad_norm": 2.8877642154693604, + "learning_rate": 4.999910455107292e-05, + "loss": 7.7764, + "step": 454 + }, + { + "epoch": 0.0027060138928537442, + "grad_norm": 2.6323535442352295, + "learning_rate": 4.9999100593316155e-05, + "loss": 7.7336, + "step": 455 + }, + { + "epoch": 0.0027119611761347415, + "grad_norm": 2.939509153366089, + "learning_rate": 4.9999096626832465e-05, + "loss": 7.8184, + "step": 456 + }, + { + "epoch": 0.0027179084594157387, + "grad_norm": 2.6926229000091553, + "learning_rate": 4.9999092651621855e-05, + "loss": 7.5027, + "step": 457 + }, + { + "epoch": 0.002723855742696736, + "grad_norm": 2.889389991760254, + "learning_rate": 4.999908866768431e-05, + "loss": 7.1138, + "step": 458 + }, + { + "epoch": 0.0027298030259777332, + "grad_norm": 2.951796531677246, + "learning_rate": 4.999908467501985e-05, + "loss": 7.7549, + "step": 459 + }, + { + "epoch": 0.0027357503092587305, + "grad_norm": 2.9076783657073975, + "learning_rate": 4.999908067362847e-05, + "loss": 7.6577, + "step": 460 + }, + { + "epoch": 0.0027416975925397278, + "grad_norm": 3.010636806488037, + "learning_rate": 4.9999076663510155e-05, + "loss": 7.6467, + "step": 461 + }, + { + "epoch": 0.002747644875820725, + "grad_norm": 2.7591371536254883, + "learning_rate": 4.9999072644664935e-05, + "loss": 7.5825, + "step": 462 + }, + { + "epoch": 0.0027535921591017223, + "grad_norm": 2.503632068634033, + "learning_rate": 4.9999068617092795e-05, + "loss": 7.711, + "step": 463 + }, + { + "epoch": 0.0027595394423827195, + "grad_norm": 2.6518661975860596, + "learning_rate": 4.999906458079373e-05, + "loss": 7.557, + "step": 464 + }, + { + "epoch": 0.0027654867256637168, + "grad_norm": 2.6865615844726562, + "learning_rate": 4.9999060535767764e-05, + "loss": 7.5788, + "step": 465 + }, + { + "epoch": 0.002771434008944714, + "grad_norm": 2.715190887451172, + "learning_rate": 4.999905648201487e-05, + "loss": 7.517, + "step": 466 + }, + { + "epoch": 0.0027773812922257113, + "grad_norm": 3.1603381633758545, + "learning_rate": 4.999905241953506e-05, + "loss": 7.6176, + "step": 467 + }, + { + "epoch": 0.0027833285755067085, + "grad_norm": 3.1451528072357178, + "learning_rate": 4.999904834832836e-05, + "loss": 7.6051, + "step": 468 + }, + { + "epoch": 0.002789275858787706, + "grad_norm": 2.5310862064361572, + "learning_rate": 4.9999044268394736e-05, + "loss": 7.6075, + "step": 469 + }, + { + "epoch": 0.002795223142068703, + "grad_norm": 2.9285359382629395, + "learning_rate": 4.99990401797342e-05, + "loss": 7.5399, + "step": 470 + }, + { + "epoch": 0.0028011704253497003, + "grad_norm": 3.2180614471435547, + "learning_rate": 4.9999036082346766e-05, + "loss": 7.6952, + "step": 471 + }, + { + "epoch": 0.0028071177086306976, + "grad_norm": 4.041499614715576, + "learning_rate": 4.9999031976232426e-05, + "loss": 7.841, + "step": 472 + }, + { + "epoch": 0.002813064991911695, + "grad_norm": 3.233492612838745, + "learning_rate": 4.999902786139118e-05, + "loss": 7.5267, + "step": 473 + }, + { + "epoch": 0.002819012275192692, + "grad_norm": 2.7749760150909424, + "learning_rate": 4.9999023737823034e-05, + "loss": 7.3703, + "step": 474 + }, + { + "epoch": 0.0028249595584736893, + "grad_norm": 2.9886162281036377, + "learning_rate": 4.999901960552798e-05, + "loss": 7.4684, + "step": 475 + }, + { + "epoch": 0.0028309068417546866, + "grad_norm": 2.934190511703491, + "learning_rate": 4.999901546450604e-05, + "loss": 7.4432, + "step": 476 + }, + { + "epoch": 0.002836854125035684, + "grad_norm": 3.696247100830078, + "learning_rate": 4.9999011314757196e-05, + "loss": 7.4944, + "step": 477 + }, + { + "epoch": 0.002842801408316681, + "grad_norm": 3.6706700325012207, + "learning_rate": 4.9999007156281454e-05, + "loss": 7.3726, + "step": 478 + }, + { + "epoch": 0.0028487486915976783, + "grad_norm": 3.8638553619384766, + "learning_rate": 4.999900298907881e-05, + "loss": 7.072, + "step": 479 + }, + { + "epoch": 0.0028546959748786756, + "grad_norm": 4.307566165924072, + "learning_rate": 4.999899881314928e-05, + "loss": 6.9371, + "step": 480 + }, + { + "epoch": 0.002860643258159673, + "grad_norm": 3.337372064590454, + "learning_rate": 4.9998994628492854e-05, + "loss": 7.7299, + "step": 481 + }, + { + "epoch": 0.00286659054144067, + "grad_norm": 3.1284921169281006, + "learning_rate": 4.9998990435109535e-05, + "loss": 7.5629, + "step": 482 + }, + { + "epoch": 0.0028725378247216674, + "grad_norm": 3.06904935836792, + "learning_rate": 4.999898623299933e-05, + "loss": 7.5332, + "step": 483 + }, + { + "epoch": 0.002878485108002664, + "grad_norm": 2.985121011734009, + "learning_rate": 4.999898202216224e-05, + "loss": 7.5972, + "step": 484 + }, + { + "epoch": 0.0028844323912836614, + "grad_norm": 2.9188039302825928, + "learning_rate": 4.999897780259827e-05, + "loss": 7.6242, + "step": 485 + }, + { + "epoch": 0.0028903796745646587, + "grad_norm": 3.2263259887695312, + "learning_rate": 4.9998973574307406e-05, + "loss": 7.5746, + "step": 486 + }, + { + "epoch": 0.002896326957845656, + "grad_norm": 2.645188331604004, + "learning_rate": 4.999896933728966e-05, + "loss": 7.6122, + "step": 487 + }, + { + "epoch": 0.002902274241126653, + "grad_norm": 2.89583158493042, + "learning_rate": 4.9998965091545035e-05, + "loss": 7.6157, + "step": 488 + }, + { + "epoch": 0.0029082215244076504, + "grad_norm": 3.6182286739349365, + "learning_rate": 4.9998960837073524e-05, + "loss": 7.4056, + "step": 489 + }, + { + "epoch": 0.0029141688076886477, + "grad_norm": 3.377560615539551, + "learning_rate": 4.9998956573875135e-05, + "loss": 7.4408, + "step": 490 + }, + { + "epoch": 0.002920116090969645, + "grad_norm": 3.0581517219543457, + "learning_rate": 4.9998952301949874e-05, + "loss": 7.5776, + "step": 491 + }, + { + "epoch": 0.002926063374250642, + "grad_norm": 3.5199148654937744, + "learning_rate": 4.999894802129773e-05, + "loss": 7.4747, + "step": 492 + }, + { + "epoch": 0.0029320106575316395, + "grad_norm": 3.866055727005005, + "learning_rate": 4.9998943731918714e-05, + "loss": 7.5985, + "step": 493 + }, + { + "epoch": 0.0029379579408126367, + "grad_norm": 2.856255054473877, + "learning_rate": 4.999893943381283e-05, + "loss": 7.9698, + "step": 494 + }, + { + "epoch": 0.002943905224093634, + "grad_norm": 3.0758626461029053, + "learning_rate": 4.999893512698007e-05, + "loss": 7.6311, + "step": 495 + }, + { + "epoch": 0.0029498525073746312, + "grad_norm": 3.739844560623169, + "learning_rate": 4.999893081142044e-05, + "loss": 7.6829, + "step": 496 + }, + { + "epoch": 0.0029557997906556285, + "grad_norm": 4.025709629058838, + "learning_rate": 4.999892648713394e-05, + "loss": 7.2717, + "step": 497 + }, + { + "epoch": 0.0029617470739366257, + "grad_norm": 3.6604738235473633, + "learning_rate": 4.999892215412057e-05, + "loss": 7.2985, + "step": 498 + }, + { + "epoch": 0.002967694357217623, + "grad_norm": 3.230109930038452, + "learning_rate": 4.999891781238034e-05, + "loss": 8.1041, + "step": 499 + }, + { + "epoch": 0.0029736416404986202, + "grad_norm": 2.5046725273132324, + "learning_rate": 4.999891346191325e-05, + "loss": 8.0888, + "step": 500 + }, + { + "epoch": 0.0029795889237796175, + "grad_norm": 2.916459798812866, + "learning_rate": 4.999890910271929e-05, + "loss": 7.8675, + "step": 501 + }, + { + "epoch": 0.0029855362070606148, + "grad_norm": 2.7806055545806885, + "learning_rate": 4.999890473479848e-05, + "loss": 7.8903, + "step": 502 + }, + { + "epoch": 0.002991483490341612, + "grad_norm": 2.9877662658691406, + "learning_rate": 4.99989003581508e-05, + "loss": 7.473, + "step": 503 + }, + { + "epoch": 0.0029974307736226093, + "grad_norm": 3.1581692695617676, + "learning_rate": 4.999889597277626e-05, + "loss": 7.5654, + "step": 504 + }, + { + "epoch": 0.0030033780569036065, + "grad_norm": 3.102539539337158, + "learning_rate": 4.9998891578674866e-05, + "loss": 7.8865, + "step": 505 + }, + { + "epoch": 0.0030093253401846038, + "grad_norm": 3.0357863903045654, + "learning_rate": 4.999888717584662e-05, + "loss": 7.291, + "step": 506 + }, + { + "epoch": 0.003015272623465601, + "grad_norm": 2.604048252105713, + "learning_rate": 4.999888276429152e-05, + "loss": 7.4892, + "step": 507 + }, + { + "epoch": 0.0030212199067465983, + "grad_norm": 2.734354257583618, + "learning_rate": 4.999887834400957e-05, + "loss": 7.1182, + "step": 508 + }, + { + "epoch": 0.0030271671900275955, + "grad_norm": 2.5255348682403564, + "learning_rate": 4.9998873915000775e-05, + "loss": 7.449, + "step": 509 + }, + { + "epoch": 0.003033114473308593, + "grad_norm": 2.864072322845459, + "learning_rate": 4.999886947726512e-05, + "loss": 7.3213, + "step": 510 + }, + { + "epoch": 0.00303906175658959, + "grad_norm": 2.764187812805176, + "learning_rate": 4.999886503080262e-05, + "loss": 7.337, + "step": 511 + }, + { + "epoch": 0.0030450090398705873, + "grad_norm": 3.5725066661834717, + "learning_rate": 4.9998860575613285e-05, + "loss": 7.8398, + "step": 512 + }, + { + "epoch": 0.0030509563231515846, + "grad_norm": 3.8559648990631104, + "learning_rate": 4.9998856111697096e-05, + "loss": 7.395, + "step": 513 + }, + { + "epoch": 0.003056903606432582, + "grad_norm": 2.9047908782958984, + "learning_rate": 4.999885163905407e-05, + "loss": 7.7016, + "step": 514 + }, + { + "epoch": 0.0030628508897135786, + "grad_norm": 3.1485037803649902, + "learning_rate": 4.99988471576842e-05, + "loss": 6.9411, + "step": 515 + }, + { + "epoch": 0.003068798172994576, + "grad_norm": 3.2763617038726807, + "learning_rate": 4.999884266758749e-05, + "loss": 6.4778, + "step": 516 + }, + { + "epoch": 0.003074745456275573, + "grad_norm": 2.7609500885009766, + "learning_rate": 4.999883816876394e-05, + "loss": 7.0576, + "step": 517 + }, + { + "epoch": 0.0030806927395565704, + "grad_norm": 3.7407751083374023, + "learning_rate": 4.999883366121356e-05, + "loss": 7.7389, + "step": 518 + }, + { + "epoch": 0.0030866400228375676, + "grad_norm": 3.3356568813323975, + "learning_rate": 4.999882914493634e-05, + "loss": 7.7, + "step": 519 + }, + { + "epoch": 0.003092587306118565, + "grad_norm": 2.635594129562378, + "learning_rate": 4.999882461993229e-05, + "loss": 7.6103, + "step": 520 + }, + { + "epoch": 0.003098534589399562, + "grad_norm": 3.7604281902313232, + "learning_rate": 4.9998820086201406e-05, + "loss": 7.6814, + "step": 521 + }, + { + "epoch": 0.0031044818726805594, + "grad_norm": 3.6567211151123047, + "learning_rate": 4.99988155437437e-05, + "loss": 7.6729, + "step": 522 + }, + { + "epoch": 0.0031104291559615567, + "grad_norm": 3.605442523956299, + "learning_rate": 4.999881099255916e-05, + "loss": 7.7464, + "step": 523 + }, + { + "epoch": 0.003116376439242554, + "grad_norm": 3.015500783920288, + "learning_rate": 4.99988064326478e-05, + "loss": 7.5168, + "step": 524 + }, + { + "epoch": 0.003122323722523551, + "grad_norm": 2.9037563800811768, + "learning_rate": 4.9998801864009604e-05, + "loss": 7.7059, + "step": 525 + }, + { + "epoch": 0.0031282710058045484, + "grad_norm": 2.812509059906006, + "learning_rate": 4.999879728664458e-05, + "loss": 7.4178, + "step": 526 + }, + { + "epoch": 0.0031342182890855457, + "grad_norm": 3.340226888656616, + "learning_rate": 4.9998792700552746e-05, + "loss": 7.7872, + "step": 527 + }, + { + "epoch": 0.003140165572366543, + "grad_norm": 3.0951550006866455, + "learning_rate": 4.999878810573409e-05, + "loss": 8.0153, + "step": 528 + }, + { + "epoch": 0.00314611285564754, + "grad_norm": 3.1077651977539062, + "learning_rate": 4.9998783502188616e-05, + "loss": 7.7053, + "step": 529 + }, + { + "epoch": 0.0031520601389285374, + "grad_norm": 3.442451000213623, + "learning_rate": 4.999877888991632e-05, + "loss": 7.5149, + "step": 530 + }, + { + "epoch": 0.0031580074222095347, + "grad_norm": 3.7479207515716553, + "learning_rate": 4.9998774268917215e-05, + "loss": 7.3448, + "step": 531 + }, + { + "epoch": 0.003163954705490532, + "grad_norm": 2.660789966583252, + "learning_rate": 4.999876963919129e-05, + "loss": 7.8348, + "step": 532 + }, + { + "epoch": 0.003169901988771529, + "grad_norm": 2.6255943775177, + "learning_rate": 4.9998765000738556e-05, + "loss": 7.542, + "step": 533 + }, + { + "epoch": 0.0031758492720525265, + "grad_norm": 3.121521472930908, + "learning_rate": 4.9998760353559017e-05, + "loss": 7.46, + "step": 534 + }, + { + "epoch": 0.0031817965553335237, + "grad_norm": 2.958880662918091, + "learning_rate": 4.999875569765266e-05, + "loss": 7.5385, + "step": 535 + }, + { + "epoch": 0.003187743838614521, + "grad_norm": 3.4153661727905273, + "learning_rate": 4.99987510330195e-05, + "loss": 7.4989, + "step": 536 + }, + { + "epoch": 0.0031936911218955182, + "grad_norm": 3.0877597332000732, + "learning_rate": 4.999874635965953e-05, + "loss": 7.5512, + "step": 537 + }, + { + "epoch": 0.0031996384051765155, + "grad_norm": 3.109522581100464, + "learning_rate": 4.9998741677572756e-05, + "loss": 7.4679, + "step": 538 + }, + { + "epoch": 0.0032055856884575127, + "grad_norm": 3.4434239864349365, + "learning_rate": 4.999873698675919e-05, + "loss": 7.0599, + "step": 539 + }, + { + "epoch": 0.00321153297173851, + "grad_norm": 3.83335018157959, + "learning_rate": 4.999873228721882e-05, + "loss": 7.5355, + "step": 540 + }, + { + "epoch": 0.0032174802550195072, + "grad_norm": 3.0679752826690674, + "learning_rate": 4.999872757895164e-05, + "loss": 7.7231, + "step": 541 + }, + { + "epoch": 0.0032234275383005045, + "grad_norm": 3.272196054458618, + "learning_rate": 4.999872286195767e-05, + "loss": 7.6674, + "step": 542 + }, + { + "epoch": 0.0032293748215815017, + "grad_norm": 2.8453965187072754, + "learning_rate": 4.9998718136236897e-05, + "loss": 7.4451, + "step": 543 + }, + { + "epoch": 0.003235322104862499, + "grad_norm": 3.074399709701538, + "learning_rate": 4.999871340178934e-05, + "loss": 7.6011, + "step": 544 + }, + { + "epoch": 0.0032412693881434963, + "grad_norm": 3.173004150390625, + "learning_rate": 4.999870865861499e-05, + "loss": 7.5268, + "step": 545 + }, + { + "epoch": 0.003247216671424493, + "grad_norm": 2.820848226547241, + "learning_rate": 4.999870390671384e-05, + "loss": 7.9872, + "step": 546 + }, + { + "epoch": 0.0032531639547054903, + "grad_norm": 2.692702293395996, + "learning_rate": 4.9998699146085906e-05, + "loss": 7.4676, + "step": 547 + }, + { + "epoch": 0.0032591112379864876, + "grad_norm": 2.2766902446746826, + "learning_rate": 4.999869437673119e-05, + "loss": 7.3826, + "step": 548 + }, + { + "epoch": 0.003265058521267485, + "grad_norm": 2.1190011501312256, + "learning_rate": 4.9998689598649686e-05, + "loss": 7.4767, + "step": 549 + }, + { + "epoch": 0.003271005804548482, + "grad_norm": 2.687633514404297, + "learning_rate": 4.999868481184139e-05, + "loss": 7.9922, + "step": 550 + }, + { + "epoch": 0.0032769530878294794, + "grad_norm": 3.403298854827881, + "learning_rate": 4.999868001630632e-05, + "loss": 7.8035, + "step": 551 + }, + { + "epoch": 0.0032829003711104766, + "grad_norm": 3.074881076812744, + "learning_rate": 4.999867521204446e-05, + "loss": 7.7106, + "step": 552 + }, + { + "epoch": 0.003288847654391474, + "grad_norm": 3.28725004196167, + "learning_rate": 4.9998670399055827e-05, + "loss": 7.4661, + "step": 553 + }, + { + "epoch": 0.003294794937672471, + "grad_norm": 3.8624775409698486, + "learning_rate": 4.999866557734041e-05, + "loss": 7.7156, + "step": 554 + }, + { + "epoch": 0.0033007422209534684, + "grad_norm": 2.53586745262146, + "learning_rate": 4.999866074689823e-05, + "loss": 7.945, + "step": 555 + }, + { + "epoch": 0.0033066895042344656, + "grad_norm": 3.8261072635650635, + "learning_rate": 4.9998655907729265e-05, + "loss": 8.0446, + "step": 556 + }, + { + "epoch": 0.003312636787515463, + "grad_norm": 2.7173407077789307, + "learning_rate": 4.999865105983353e-05, + "loss": 7.8363, + "step": 557 + }, + { + "epoch": 0.00331858407079646, + "grad_norm": 4.68424654006958, + "learning_rate": 4.999864620321102e-05, + "loss": 7.667, + "step": 558 + }, + { + "epoch": 0.0033245313540774574, + "grad_norm": 2.8763632774353027, + "learning_rate": 4.999864133786175e-05, + "loss": 7.6133, + "step": 559 + }, + { + "epoch": 0.0033304786373584546, + "grad_norm": 3.0986382961273193, + "learning_rate": 4.9998636463785705e-05, + "loss": 7.6257, + "step": 560 + }, + { + "epoch": 0.003336425920639452, + "grad_norm": 2.6826348304748535, + "learning_rate": 4.9998631580982905e-05, + "loss": 7.5187, + "step": 561 + }, + { + "epoch": 0.003342373203920449, + "grad_norm": 2.2172515392303467, + "learning_rate": 4.9998626689453334e-05, + "loss": 7.961, + "step": 562 + }, + { + "epoch": 0.0033483204872014464, + "grad_norm": 2.6083858013153076, + "learning_rate": 4.9998621789197e-05, + "loss": 7.7887, + "step": 563 + }, + { + "epoch": 0.0033542677704824437, + "grad_norm": 3.6838009357452393, + "learning_rate": 4.99986168802139e-05, + "loss": 7.4945, + "step": 564 + }, + { + "epoch": 0.003360215053763441, + "grad_norm": 3.2091991901397705, + "learning_rate": 4.999861196250405e-05, + "loss": 7.4243, + "step": 565 + }, + { + "epoch": 0.003366162337044438, + "grad_norm": 3.142982244491577, + "learning_rate": 4.9998607036067434e-05, + "loss": 7.4684, + "step": 566 + }, + { + "epoch": 0.0033721096203254354, + "grad_norm": 3.7751007080078125, + "learning_rate": 4.9998602100904065e-05, + "loss": 7.3722, + "step": 567 + }, + { + "epoch": 0.0033780569036064327, + "grad_norm": 3.276843547821045, + "learning_rate": 4.9998597157013946e-05, + "loss": 7.4012, + "step": 568 + }, + { + "epoch": 0.00338400418688743, + "grad_norm": 2.840106725692749, + "learning_rate": 4.999859220439708e-05, + "loss": 7.4013, + "step": 569 + }, + { + "epoch": 0.003389951470168427, + "grad_norm": 2.7816810607910156, + "learning_rate": 4.999858724305346e-05, + "loss": 7.3136, + "step": 570 + }, + { + "epoch": 0.0033958987534494244, + "grad_norm": 4.523340225219727, + "learning_rate": 4.999858227298308e-05, + "loss": 7.0553, + "step": 571 + }, + { + "epoch": 0.0034018460367304217, + "grad_norm": 3.9653191566467285, + "learning_rate": 4.9998577294185964e-05, + "loss": 7.1907, + "step": 572 + }, + { + "epoch": 0.003407793320011419, + "grad_norm": 3.243089199066162, + "learning_rate": 4.999857230666211e-05, + "loss": 7.0749, + "step": 573 + }, + { + "epoch": 0.003413740603292416, + "grad_norm": 3.3622777462005615, + "learning_rate": 4.99985673104115e-05, + "loss": 7.0005, + "step": 574 + }, + { + "epoch": 0.0034196878865734135, + "grad_norm": 2.561732292175293, + "learning_rate": 4.9998562305434154e-05, + "loss": 7.271, + "step": 575 + }, + { + "epoch": 0.0034256351698544107, + "grad_norm": 3.1846745014190674, + "learning_rate": 4.999855729173006e-05, + "loss": 7.7333, + "step": 576 + }, + { + "epoch": 0.0034315824531354075, + "grad_norm": 3.0318918228149414, + "learning_rate": 4.999855226929924e-05, + "loss": 7.5535, + "step": 577 + }, + { + "epoch": 0.003437529736416405, + "grad_norm": 2.993086099624634, + "learning_rate": 4.999854723814168e-05, + "loss": 7.6272, + "step": 578 + }, + { + "epoch": 0.003443477019697402, + "grad_norm": 2.8511712551116943, + "learning_rate": 4.999854219825738e-05, + "loss": 7.6619, + "step": 579 + }, + { + "epoch": 0.0034494243029783993, + "grad_norm": 2.6181185245513916, + "learning_rate": 4.9998537149646355e-05, + "loss": 7.7452, + "step": 580 + }, + { + "epoch": 0.0034553715862593965, + "grad_norm": 2.9932363033294678, + "learning_rate": 4.9998532092308593e-05, + "loss": 7.7475, + "step": 581 + }, + { + "epoch": 0.003461318869540394, + "grad_norm": 3.541944742202759, + "learning_rate": 4.99985270262441e-05, + "loss": 7.5808, + "step": 582 + }, + { + "epoch": 0.003467266152821391, + "grad_norm": 2.780372381210327, + "learning_rate": 4.9998521951452895e-05, + "loss": 7.8167, + "step": 583 + }, + { + "epoch": 0.0034732134361023883, + "grad_norm": 2.9156363010406494, + "learning_rate": 4.9998516867934945e-05, + "loss": 7.74, + "step": 584 + }, + { + "epoch": 0.0034791607193833856, + "grad_norm": 3.9492485523223877, + "learning_rate": 4.9998511775690285e-05, + "loss": 7.1128, + "step": 585 + }, + { + "epoch": 0.003485108002664383, + "grad_norm": 2.8288252353668213, + "learning_rate": 4.9998506674718896e-05, + "loss": 7.4884, + "step": 586 + }, + { + "epoch": 0.00349105528594538, + "grad_norm": 2.8906798362731934, + "learning_rate": 4.999850156502078e-05, + "loss": 7.6378, + "step": 587 + }, + { + "epoch": 0.0034970025692263773, + "grad_norm": 2.8806405067443848, + "learning_rate": 4.9998496446595955e-05, + "loss": 7.4641, + "step": 588 + }, + { + "epoch": 0.0035029498525073746, + "grad_norm": 3.1794772148132324, + "learning_rate": 4.999849131944441e-05, + "loss": 7.1633, + "step": 589 + }, + { + "epoch": 0.003508897135788372, + "grad_norm": 2.886009454727173, + "learning_rate": 4.999848618356615e-05, + "loss": 7.1793, + "step": 590 + }, + { + "epoch": 0.003514844419069369, + "grad_norm": 2.76184344291687, + "learning_rate": 4.999848103896118e-05, + "loss": 7.1377, + "step": 591 + }, + { + "epoch": 0.0035207917023503663, + "grad_norm": 3.127793788909912, + "learning_rate": 4.999847588562949e-05, + "loss": 7.2793, + "step": 592 + }, + { + "epoch": 0.0035267389856313636, + "grad_norm": 3.7768073081970215, + "learning_rate": 4.99984707235711e-05, + "loss": 7.8203, + "step": 593 + }, + { + "epoch": 0.003532686268912361, + "grad_norm": 3.1750540733337402, + "learning_rate": 4.9998465552786e-05, + "loss": 7.7078, + "step": 594 + }, + { + "epoch": 0.003538633552193358, + "grad_norm": 2.8884522914886475, + "learning_rate": 4.999846037327419e-05, + "loss": 7.6864, + "step": 595 + }, + { + "epoch": 0.0035445808354743554, + "grad_norm": 2.783928394317627, + "learning_rate": 4.999845518503568e-05, + "loss": 7.7329, + "step": 596 + }, + { + "epoch": 0.0035505281187553526, + "grad_norm": 2.8093652725219727, + "learning_rate": 4.9998449988070465e-05, + "loss": 7.7157, + "step": 597 + }, + { + "epoch": 0.00355647540203635, + "grad_norm": 2.54380464553833, + "learning_rate": 4.999844478237855e-05, + "loss": 7.6353, + "step": 598 + }, + { + "epoch": 0.003562422685317347, + "grad_norm": 3.478878974914551, + "learning_rate": 4.999843956795993e-05, + "loss": 7.4221, + "step": 599 + }, + { + "epoch": 0.0035683699685983444, + "grad_norm": 3.882807493209839, + "learning_rate": 4.999843434481463e-05, + "loss": 7.4857, + "step": 600 + }, + { + "epoch": 0.0035743172518793416, + "grad_norm": 3.0975584983825684, + "learning_rate": 4.999842911294261e-05, + "loss": 7.5121, + "step": 601 + }, + { + "epoch": 0.003580264535160339, + "grad_norm": 3.1857712268829346, + "learning_rate": 4.999842387234391e-05, + "loss": 7.4469, + "step": 602 + }, + { + "epoch": 0.003586211818441336, + "grad_norm": 2.892927885055542, + "learning_rate": 4.999841862301853e-05, + "loss": 7.4047, + "step": 603 + }, + { + "epoch": 0.0035921591017223334, + "grad_norm": 4.186185359954834, + "learning_rate": 4.999841336496645e-05, + "loss": 7.5146, + "step": 604 + }, + { + "epoch": 0.0035981063850033307, + "grad_norm": 3.27422833442688, + "learning_rate": 4.9998408098187674e-05, + "loss": 7.3347, + "step": 605 + }, + { + "epoch": 0.003604053668284328, + "grad_norm": 4.817208290100098, + "learning_rate": 4.9998402822682225e-05, + "loss": 7.9883, + "step": 606 + }, + { + "epoch": 0.003610000951565325, + "grad_norm": 5.903015613555908, + "learning_rate": 4.999839753845008e-05, + "loss": 7.9043, + "step": 607 + }, + { + "epoch": 0.0036159482348463224, + "grad_norm": 4.720086574554443, + "learning_rate": 4.999839224549127e-05, + "loss": 7.8456, + "step": 608 + }, + { + "epoch": 0.0036218955181273192, + "grad_norm": 4.518443584442139, + "learning_rate": 4.9998386943805764e-05, + "loss": 7.3659, + "step": 609 + }, + { + "epoch": 0.0036278428014083165, + "grad_norm": 2.621833086013794, + "learning_rate": 4.999838163339358e-05, + "loss": 8.0512, + "step": 610 + }, + { + "epoch": 0.0036337900846893137, + "grad_norm": 4.015076160430908, + "learning_rate": 4.9998376314254726e-05, + "loss": 7.8581, + "step": 611 + }, + { + "epoch": 0.003639737367970311, + "grad_norm": 3.8145275115966797, + "learning_rate": 4.999837098638919e-05, + "loss": 7.4288, + "step": 612 + }, + { + "epoch": 0.0036456846512513083, + "grad_norm": 3.396488904953003, + "learning_rate": 4.9998365649796985e-05, + "loss": 7.7812, + "step": 613 + }, + { + "epoch": 0.0036516319345323055, + "grad_norm": 2.931187391281128, + "learning_rate": 4.999836030447811e-05, + "loss": 7.5898, + "step": 614 + }, + { + "epoch": 0.0036575792178133028, + "grad_norm": 2.6349267959594727, + "learning_rate": 4.999835495043257e-05, + "loss": 7.5345, + "step": 615 + }, + { + "epoch": 0.0036635265010943, + "grad_norm": 3.014085531234741, + "learning_rate": 4.999834958766035e-05, + "loss": 7.5985, + "step": 616 + }, + { + "epoch": 0.0036694737843752973, + "grad_norm": 2.971475124359131, + "learning_rate": 4.999834421616147e-05, + "loss": 7.589, + "step": 617 + }, + { + "epoch": 0.0036754210676562945, + "grad_norm": 3.867366075515747, + "learning_rate": 4.999833883593593e-05, + "loss": 7.4026, + "step": 618 + }, + { + "epoch": 0.0036813683509372918, + "grad_norm": 2.3917908668518066, + "learning_rate": 4.9998333446983734e-05, + "loss": 7.4361, + "step": 619 + }, + { + "epoch": 0.003687315634218289, + "grad_norm": 4.583080768585205, + "learning_rate": 4.999832804930487e-05, + "loss": 7.5525, + "step": 620 + }, + { + "epoch": 0.0036932629174992863, + "grad_norm": 2.6039721965789795, + "learning_rate": 4.999832264289934e-05, + "loss": 7.636, + "step": 621 + }, + { + "epoch": 0.0036992102007802835, + "grad_norm": 4.123409748077393, + "learning_rate": 4.9998317227767165e-05, + "loss": 7.7803, + "step": 622 + }, + { + "epoch": 0.003705157484061281, + "grad_norm": 4.220766544342041, + "learning_rate": 4.999831180390834e-05, + "loss": 7.8086, + "step": 623 + }, + { + "epoch": 0.003711104767342278, + "grad_norm": 3.0759594440460205, + "learning_rate": 4.999830637132285e-05, + "loss": 7.4815, + "step": 624 + }, + { + "epoch": 0.0037170520506232753, + "grad_norm": 2.7870442867279053, + "learning_rate": 4.999830093001071e-05, + "loss": 7.3925, + "step": 625 + }, + { + "epoch": 0.0037229993339042726, + "grad_norm": 2.5292582511901855, + "learning_rate": 4.999829547997193e-05, + "loss": 7.2049, + "step": 626 + }, + { + "epoch": 0.00372894661718527, + "grad_norm": 2.5836963653564453, + "learning_rate": 4.99982900212065e-05, + "loss": 7.2858, + "step": 627 + }, + { + "epoch": 0.003734893900466267, + "grad_norm": 2.6433279514312744, + "learning_rate": 4.9998284553714425e-05, + "loss": 7.5894, + "step": 628 + }, + { + "epoch": 0.0037408411837472643, + "grad_norm": 3.1093215942382812, + "learning_rate": 4.999827907749571e-05, + "loss": 7.2859, + "step": 629 + }, + { + "epoch": 0.0037467884670282616, + "grad_norm": 2.313305616378784, + "learning_rate": 4.9998273592550346e-05, + "loss": 7.6275, + "step": 630 + }, + { + "epoch": 0.003752735750309259, + "grad_norm": 3.7002785205841064, + "learning_rate": 4.9998268098878355e-05, + "loss": 7.7068, + "step": 631 + }, + { + "epoch": 0.003758683033590256, + "grad_norm": 3.090707778930664, + "learning_rate": 4.9998262596479715e-05, + "loss": 7.7304, + "step": 632 + }, + { + "epoch": 0.0037646303168712533, + "grad_norm": 2.425614833831787, + "learning_rate": 4.999825708535445e-05, + "loss": 7.927, + "step": 633 + }, + { + "epoch": 0.0037705776001522506, + "grad_norm": 2.1477420330047607, + "learning_rate": 4.999825156550254e-05, + "loss": 8.1082, + "step": 634 + }, + { + "epoch": 0.003776524883433248, + "grad_norm": 2.434638738632202, + "learning_rate": 4.999824603692401e-05, + "loss": 7.8808, + "step": 635 + }, + { + "epoch": 0.003782472166714245, + "grad_norm": 2.563283681869507, + "learning_rate": 4.999824049961884e-05, + "loss": 7.8515, + "step": 636 + }, + { + "epoch": 0.0037884194499952424, + "grad_norm": 2.6878623962402344, + "learning_rate": 4.9998234953587054e-05, + "loss": 7.6393, + "step": 637 + }, + { + "epoch": 0.0037943667332762396, + "grad_norm": 2.6270666122436523, + "learning_rate": 4.999822939882863e-05, + "loss": 7.8246, + "step": 638 + }, + { + "epoch": 0.003800314016557237, + "grad_norm": 3.300494909286499, + "learning_rate": 4.9998223835343596e-05, + "loss": 7.4991, + "step": 639 + }, + { + "epoch": 0.0038062612998382337, + "grad_norm": 2.726902723312378, + "learning_rate": 4.9998218263131925e-05, + "loss": 7.6663, + "step": 640 + }, + { + "epoch": 0.003812208583119231, + "grad_norm": 2.8147871494293213, + "learning_rate": 4.9998212682193645e-05, + "loss": 7.5272, + "step": 641 + }, + { + "epoch": 0.003818155866400228, + "grad_norm": 2.324422597885132, + "learning_rate": 4.9998207092528745e-05, + "loss": 7.6577, + "step": 642 + }, + { + "epoch": 0.0038241031496812255, + "grad_norm": 2.4525058269500732, + "learning_rate": 4.999820149413723e-05, + "loss": 7.6793, + "step": 643 + }, + { + "epoch": 0.0038300504329622227, + "grad_norm": 2.4011337757110596, + "learning_rate": 4.9998195887019094e-05, + "loss": 7.4869, + "step": 644 + }, + { + "epoch": 0.00383599771624322, + "grad_norm": 2.3403005599975586, + "learning_rate": 4.9998190271174364e-05, + "loss": 7.9552, + "step": 645 + }, + { + "epoch": 0.003841944999524217, + "grad_norm": 2.1421074867248535, + "learning_rate": 4.9998184646603005e-05, + "loss": 7.4021, + "step": 646 + }, + { + "epoch": 0.0038478922828052145, + "grad_norm": 2.4157450199127197, + "learning_rate": 4.9998179013305046e-05, + "loss": 7.6666, + "step": 647 + }, + { + "epoch": 0.0038538395660862117, + "grad_norm": 2.737692356109619, + "learning_rate": 4.999817337128048e-05, + "loss": 7.7441, + "step": 648 + }, + { + "epoch": 0.003859786849367209, + "grad_norm": 3.2240428924560547, + "learning_rate": 4.999816772052931e-05, + "loss": 7.5691, + "step": 649 + }, + { + "epoch": 0.0038657341326482062, + "grad_norm": 2.8538997173309326, + "learning_rate": 4.9998162061051534e-05, + "loss": 7.4994, + "step": 650 + }, + { + "epoch": 0.0038716814159292035, + "grad_norm": 2.6562373638153076, + "learning_rate": 4.9998156392847164e-05, + "loss": 7.5156, + "step": 651 + }, + { + "epoch": 0.0038776286992102007, + "grad_norm": 2.5513811111450195, + "learning_rate": 4.999815071591619e-05, + "loss": 7.6503, + "step": 652 + }, + { + "epoch": 0.003883575982491198, + "grad_norm": 2.4196572303771973, + "learning_rate": 4.999814503025863e-05, + "loss": 7.9868, + "step": 653 + }, + { + "epoch": 0.0038895232657721952, + "grad_norm": 3.0201921463012695, + "learning_rate": 4.999813933587447e-05, + "loss": 7.5405, + "step": 654 + }, + { + "epoch": 0.0038954705490531925, + "grad_norm": 2.352625846862793, + "learning_rate": 4.9998133632763714e-05, + "loss": 7.5461, + "step": 655 + }, + { + "epoch": 0.0039014178323341898, + "grad_norm": 2.5318710803985596, + "learning_rate": 4.999812792092637e-05, + "loss": 7.5596, + "step": 656 + }, + { + "epoch": 0.003907365115615187, + "grad_norm": 2.710785388946533, + "learning_rate": 4.9998122200362444e-05, + "loss": 7.4828, + "step": 657 + }, + { + "epoch": 0.003913312398896184, + "grad_norm": 2.7441353797912598, + "learning_rate": 4.999811647107192e-05, + "loss": 7.2496, + "step": 658 + }, + { + "epoch": 0.0039192596821771815, + "grad_norm": 2.4602885246276855, + "learning_rate": 4.9998110733054824e-05, + "loss": 7.6134, + "step": 659 + }, + { + "epoch": 0.003925206965458178, + "grad_norm": 2.6842973232269287, + "learning_rate": 4.999810498631114e-05, + "loss": 7.3544, + "step": 660 + }, + { + "epoch": 0.003931154248739176, + "grad_norm": 2.8062961101531982, + "learning_rate": 4.9998099230840875e-05, + "loss": 7.5162, + "step": 661 + }, + { + "epoch": 0.003937101532020173, + "grad_norm": 4.0753679275512695, + "learning_rate": 4.9998093466644036e-05, + "loss": 7.5241, + "step": 662 + }, + { + "epoch": 0.0039430488153011705, + "grad_norm": 3.0165748596191406, + "learning_rate": 4.999808769372061e-05, + "loss": 7.5313, + "step": 663 + }, + { + "epoch": 0.003948996098582167, + "grad_norm": 2.73825740814209, + "learning_rate": 4.9998081912070623e-05, + "loss": 7.4433, + "step": 664 + }, + { + "epoch": 0.003954943381863165, + "grad_norm": 2.6649749279022217, + "learning_rate": 4.9998076121694056e-05, + "loss": 7.4852, + "step": 665 + }, + { + "epoch": 0.003960890665144162, + "grad_norm": 2.609389066696167, + "learning_rate": 4.999807032259092e-05, + "loss": 7.4127, + "step": 666 + }, + { + "epoch": 0.0039668379484251596, + "grad_norm": 2.50502610206604, + "learning_rate": 4.999806451476122e-05, + "loss": 7.3113, + "step": 667 + }, + { + "epoch": 0.003972785231706156, + "grad_norm": 2.565142869949341, + "learning_rate": 4.999805869820495e-05, + "loss": 7.1875, + "step": 668 + }, + { + "epoch": 0.003978732514987154, + "grad_norm": 2.582742214202881, + "learning_rate": 4.9998052872922117e-05, + "loss": 7.3251, + "step": 669 + }, + { + "epoch": 0.003984679798268151, + "grad_norm": 2.718780279159546, + "learning_rate": 4.999804703891272e-05, + "loss": 7.3599, + "step": 670 + }, + { + "epoch": 0.003990627081549149, + "grad_norm": 2.5971410274505615, + "learning_rate": 4.999804119617677e-05, + "loss": 7.2304, + "step": 671 + }, + { + "epoch": 0.003996574364830145, + "grad_norm": 2.5905725955963135, + "learning_rate": 4.9998035344714255e-05, + "loss": 7.3664, + "step": 672 + }, + { + "epoch": 0.004002521648111143, + "grad_norm": 2.659102439880371, + "learning_rate": 4.999802948452519e-05, + "loss": 7.4296, + "step": 673 + }, + { + "epoch": 0.00400846893139214, + "grad_norm": 2.5933544635772705, + "learning_rate": 4.999802361560957e-05, + "loss": 7.4605, + "step": 674 + }, + { + "epoch": 0.004014416214673138, + "grad_norm": 3.3860044479370117, + "learning_rate": 4.999801773796739e-05, + "loss": 7.5159, + "step": 675 + }, + { + "epoch": 0.004020363497954134, + "grad_norm": 3.742635726928711, + "learning_rate": 4.9998011851598666e-05, + "loss": 7.4988, + "step": 676 + }, + { + "epoch": 0.004026310781235132, + "grad_norm": 3.5960240364074707, + "learning_rate": 4.999800595650339e-05, + "loss": 7.4607, + "step": 677 + }, + { + "epoch": 0.004032258064516129, + "grad_norm": 2.654444694519043, + "learning_rate": 4.9998000052681585e-05, + "loss": 7.2166, + "step": 678 + }, + { + "epoch": 0.004038205347797127, + "grad_norm": 2.4538326263427734, + "learning_rate": 4.999799414013322e-05, + "loss": 7.2334, + "step": 679 + }, + { + "epoch": 0.004044152631078123, + "grad_norm": 2.5899672508239746, + "learning_rate": 4.9997988218858316e-05, + "loss": 7.2754, + "step": 680 + }, + { + "epoch": 0.004050099914359121, + "grad_norm": 2.721224069595337, + "learning_rate": 4.999798228885687e-05, + "loss": 7.188, + "step": 681 + }, + { + "epoch": 0.004056047197640118, + "grad_norm": 6.5863189697265625, + "learning_rate": 4.9997976350128894e-05, + "loss": 7.369, + "step": 682 + }, + { + "epoch": 0.004061994480921116, + "grad_norm": 2.6562674045562744, + "learning_rate": 4.999797040267438e-05, + "loss": 7.176, + "step": 683 + }, + { + "epoch": 0.0040679417642021124, + "grad_norm": 2.503666877746582, + "learning_rate": 4.9997964446493326e-05, + "loss": 7.2765, + "step": 684 + }, + { + "epoch": 0.00407388904748311, + "grad_norm": 9.070426940917969, + "learning_rate": 4.9997958481585756e-05, + "loss": 7.5187, + "step": 685 + }, + { + "epoch": 0.004079836330764107, + "grad_norm": 2.7480480670928955, + "learning_rate": 4.9997952507951645e-05, + "loss": 7.5244, + "step": 686 + }, + { + "epoch": 0.004085783614045104, + "grad_norm": 3.8338348865509033, + "learning_rate": 4.999794652559101e-05, + "loss": 7.6672, + "step": 687 + }, + { + "epoch": 0.0040917308973261015, + "grad_norm": 3.1132454872131348, + "learning_rate": 4.999794053450385e-05, + "loss": 7.9594, + "step": 688 + }, + { + "epoch": 0.004097678180607098, + "grad_norm": 2.6279757022857666, + "learning_rate": 4.999793453469017e-05, + "loss": 7.4737, + "step": 689 + }, + { + "epoch": 0.004103625463888096, + "grad_norm": 3.440145492553711, + "learning_rate": 4.9997928526149966e-05, + "loss": 7.2968, + "step": 690 + }, + { + "epoch": 0.004109572747169093, + "grad_norm": 2.3300867080688477, + "learning_rate": 4.9997922508883244e-05, + "loss": 7.3693, + "step": 691 + }, + { + "epoch": 0.0041155200304500905, + "grad_norm": 2.9034078121185303, + "learning_rate": 4.999791648289001e-05, + "loss": 7.7227, + "step": 692 + }, + { + "epoch": 0.004121467313731087, + "grad_norm": 2.5685503482818604, + "learning_rate": 4.9997910448170254e-05, + "loss": 7.9706, + "step": 693 + }, + { + "epoch": 0.004127414597012085, + "grad_norm": 3.260779619216919, + "learning_rate": 4.9997904404723986e-05, + "loss": 7.7231, + "step": 694 + }, + { + "epoch": 0.004133361880293082, + "grad_norm": 2.668193817138672, + "learning_rate": 4.999789835255121e-05, + "loss": 7.7677, + "step": 695 + }, + { + "epoch": 0.0041393091635740795, + "grad_norm": 2.545276641845703, + "learning_rate": 4.999789229165193e-05, + "loss": 7.9297, + "step": 696 + }, + { + "epoch": 0.004145256446855076, + "grad_norm": 3.2137503623962402, + "learning_rate": 4.9997886222026146e-05, + "loss": 7.697, + "step": 697 + }, + { + "epoch": 0.004151203730136074, + "grad_norm": 2.7501730918884277, + "learning_rate": 4.999788014367385e-05, + "loss": 7.3686, + "step": 698 + }, + { + "epoch": 0.004157151013417071, + "grad_norm": 2.2456486225128174, + "learning_rate": 4.9997874056595055e-05, + "loss": 7.7238, + "step": 699 + }, + { + "epoch": 0.0041630982966980685, + "grad_norm": 2.3958070278167725, + "learning_rate": 4.9997867960789764e-05, + "loss": 7.8349, + "step": 700 + }, + { + "epoch": 0.004169045579979065, + "grad_norm": 2.509744644165039, + "learning_rate": 4.9997861856257974e-05, + "loss": 7.5884, + "step": 701 + }, + { + "epoch": 0.004174992863260063, + "grad_norm": 3.6095783710479736, + "learning_rate": 4.9997855742999684e-05, + "loss": 7.4726, + "step": 702 + }, + { + "epoch": 0.00418094014654106, + "grad_norm": 3.3515326976776123, + "learning_rate": 4.99978496210149e-05, + "loss": 7.5214, + "step": 703 + }, + { + "epoch": 0.0041868874298220575, + "grad_norm": 4.7553791999816895, + "learning_rate": 4.999784349030363e-05, + "loss": 7.4577, + "step": 704 + }, + { + "epoch": 0.004192834713103054, + "grad_norm": 5.959117412567139, + "learning_rate": 4.9997837350865874e-05, + "loss": 7.2559, + "step": 705 + }, + { + "epoch": 0.004198781996384052, + "grad_norm": 2.9650065898895264, + "learning_rate": 4.999783120270163e-05, + "loss": 7.3712, + "step": 706 + }, + { + "epoch": 0.004204729279665049, + "grad_norm": 3.4171416759490967, + "learning_rate": 4.9997825045810895e-05, + "loss": 7.5014, + "step": 707 + }, + { + "epoch": 0.0042106765629460466, + "grad_norm": 3.297393798828125, + "learning_rate": 4.9997818880193684e-05, + "loss": 7.4553, + "step": 708 + }, + { + "epoch": 0.004216623846227043, + "grad_norm": 3.193859338760376, + "learning_rate": 4.999781270584999e-05, + "loss": 7.3414, + "step": 709 + }, + { + "epoch": 0.004222571129508041, + "grad_norm": 2.5028324127197266, + "learning_rate": 4.999780652277982e-05, + "loss": 7.4615, + "step": 710 + }, + { + "epoch": 0.004228518412789038, + "grad_norm": 3.43390154838562, + "learning_rate": 4.999780033098317e-05, + "loss": 7.3801, + "step": 711 + }, + { + "epoch": 0.004234465696070036, + "grad_norm": 3.3093984127044678, + "learning_rate": 4.999779413046004e-05, + "loss": 7.2938, + "step": 712 + }, + { + "epoch": 0.004240412979351032, + "grad_norm": 2.6643831729888916, + "learning_rate": 4.999778792121046e-05, + "loss": 7.3916, + "step": 713 + }, + { + "epoch": 0.00424636026263203, + "grad_norm": 2.779407501220703, + "learning_rate": 4.999778170323439e-05, + "loss": 7.5783, + "step": 714 + }, + { + "epoch": 0.004252307545913027, + "grad_norm": 2.959345817565918, + "learning_rate": 4.999777547653186e-05, + "loss": 7.9854, + "step": 715 + }, + { + "epoch": 0.004258254829194025, + "grad_norm": 2.9909780025482178, + "learning_rate": 4.9997769241102866e-05, + "loss": 7.997, + "step": 716 + }, + { + "epoch": 0.004264202112475021, + "grad_norm": 3.081831932067871, + "learning_rate": 4.9997762996947405e-05, + "loss": 7.9393, + "step": 717 + }, + { + "epoch": 0.004270149395756018, + "grad_norm": 2.8901429176330566, + "learning_rate": 4.9997756744065485e-05, + "loss": 7.8152, + "step": 718 + }, + { + "epoch": 0.004276096679037016, + "grad_norm": 3.3065547943115234, + "learning_rate": 4.9997750482457106e-05, + "loss": 7.1176, + "step": 719 + }, + { + "epoch": 0.004282043962318013, + "grad_norm": 3.1083710193634033, + "learning_rate": 4.9997744212122276e-05, + "loss": 7.6215, + "step": 720 + }, + { + "epoch": 0.00428799124559901, + "grad_norm": 4.010551452636719, + "learning_rate": 4.9997737933060987e-05, + "loss": 7.7665, + "step": 721 + }, + { + "epoch": 0.004293938528880007, + "grad_norm": 3.9287984371185303, + "learning_rate": 4.9997731645273245e-05, + "loss": 7.7185, + "step": 722 + }, + { + "epoch": 0.004299885812161005, + "grad_norm": 2.7739338874816895, + "learning_rate": 4.999772534875905e-05, + "loss": 7.7226, + "step": 723 + }, + { + "epoch": 0.004305833095442002, + "grad_norm": 2.675567865371704, + "learning_rate": 4.9997719043518414e-05, + "loss": 7.686, + "step": 724 + }, + { + "epoch": 0.0043117803787229994, + "grad_norm": 3.8513898849487305, + "learning_rate": 4.999771272955133e-05, + "loss": 7.6584, + "step": 725 + }, + { + "epoch": 0.004317727662003996, + "grad_norm": 10.309504508972168, + "learning_rate": 4.99977064068578e-05, + "loss": 7.4006, + "step": 726 + }, + { + "epoch": 0.004323674945284994, + "grad_norm": 2.712939977645874, + "learning_rate": 4.9997700075437836e-05, + "loss": 7.6275, + "step": 727 + }, + { + "epoch": 0.004329622228565991, + "grad_norm": 2.7880115509033203, + "learning_rate": 4.999769373529143e-05, + "loss": 7.4154, + "step": 728 + }, + { + "epoch": 0.0043355695118469885, + "grad_norm": 3.2352819442749023, + "learning_rate": 4.999768738641859e-05, + "loss": 7.4827, + "step": 729 + }, + { + "epoch": 0.004341516795127985, + "grad_norm": 3.5176644325256348, + "learning_rate": 4.999768102881931e-05, + "loss": 7.4748, + "step": 730 + }, + { + "epoch": 0.004347464078408983, + "grad_norm": 2.996829032897949, + "learning_rate": 4.99976746624936e-05, + "loss": 7.445, + "step": 731 + }, + { + "epoch": 0.00435341136168998, + "grad_norm": 4.5892534255981445, + "learning_rate": 4.9997668287441454e-05, + "loss": 7.6464, + "step": 732 + }, + { + "epoch": 0.0043593586449709775, + "grad_norm": 3.689419984817505, + "learning_rate": 4.999766190366289e-05, + "loss": 7.4215, + "step": 733 + }, + { + "epoch": 0.004365305928251974, + "grad_norm": 2.9146885871887207, + "learning_rate": 4.9997655511157896e-05, + "loss": 7.4852, + "step": 734 + }, + { + "epoch": 0.004371253211532972, + "grad_norm": 3.8503024578094482, + "learning_rate": 4.9997649109926484e-05, + "loss": 7.4779, + "step": 735 + }, + { + "epoch": 0.004377200494813969, + "grad_norm": 3.929422616958618, + "learning_rate": 4.9997642699968646e-05, + "loss": 7.3526, + "step": 736 + }, + { + "epoch": 0.0043831477780949665, + "grad_norm": 3.3365838527679443, + "learning_rate": 4.999763628128439e-05, + "loss": 7.3895, + "step": 737 + }, + { + "epoch": 0.004389095061375963, + "grad_norm": 3.147660970687866, + "learning_rate": 4.999762985387372e-05, + "loss": 7.1885, + "step": 738 + }, + { + "epoch": 0.004395042344656961, + "grad_norm": 3.3230104446411133, + "learning_rate": 4.9997623417736626e-05, + "loss": 7.5839, + "step": 739 + }, + { + "epoch": 0.004400989627937958, + "grad_norm": 3.285144090652466, + "learning_rate": 4.999761697287313e-05, + "loss": 7.4859, + "step": 740 + }, + { + "epoch": 0.0044069369112189555, + "grad_norm": 3.3811442852020264, + "learning_rate": 4.9997610519283216e-05, + "loss": 7.4871, + "step": 741 + }, + { + "epoch": 0.004412884194499952, + "grad_norm": 2.9662907123565674, + "learning_rate": 4.9997604056966904e-05, + "loss": 7.2546, + "step": 742 + }, + { + "epoch": 0.00441883147778095, + "grad_norm": 3.1432855129241943, + "learning_rate": 4.999759758592418e-05, + "loss": 7.5273, + "step": 743 + }, + { + "epoch": 0.004424778761061947, + "grad_norm": 3.0559749603271484, + "learning_rate": 4.9997591106155054e-05, + "loss": 7.0754, + "step": 744 + }, + { + "epoch": 0.0044307260443429445, + "grad_norm": 2.6778409481048584, + "learning_rate": 4.999758461765953e-05, + "loss": 7.1723, + "step": 745 + }, + { + "epoch": 0.004436673327623941, + "grad_norm": 2.592228412628174, + "learning_rate": 4.9997578120437606e-05, + "loss": 7.2671, + "step": 746 + }, + { + "epoch": 0.004442620610904939, + "grad_norm": 2.5546112060546875, + "learning_rate": 4.999757161448928e-05, + "loss": 7.2571, + "step": 747 + }, + { + "epoch": 0.004448567894185936, + "grad_norm": 2.745755672454834, + "learning_rate": 4.999756509981457e-05, + "loss": 7.3895, + "step": 748 + }, + { + "epoch": 0.004454515177466933, + "grad_norm": 2.9785144329071045, + "learning_rate": 4.999755857641346e-05, + "loss": 7.2431, + "step": 749 + }, + { + "epoch": 0.00446046246074793, + "grad_norm": 2.918891191482544, + "learning_rate": 4.9997552044285965e-05, + "loss": 7.3805, + "step": 750 + }, + { + "epoch": 0.004466409744028927, + "grad_norm": 2.7858519554138184, + "learning_rate": 4.999754550343209e-05, + "loss": 7.5942, + "step": 751 + }, + { + "epoch": 0.004472357027309925, + "grad_norm": 2.7758638858795166, + "learning_rate": 4.999753895385181e-05, + "loss": 7.5896, + "step": 752 + }, + { + "epoch": 0.004478304310590922, + "grad_norm": 2.7125916481018066, + "learning_rate": 4.999753239554517e-05, + "loss": 7.4341, + "step": 753 + }, + { + "epoch": 0.004484251593871919, + "grad_norm": 4.241726875305176, + "learning_rate": 4.999752582851214e-05, + "loss": 7.0517, + "step": 754 + }, + { + "epoch": 0.004490198877152916, + "grad_norm": 2.9547781944274902, + "learning_rate": 4.999751925275272e-05, + "loss": 7.2616, + "step": 755 + }, + { + "epoch": 0.004496146160433914, + "grad_norm": 4.2594122886657715, + "learning_rate": 4.9997512668266945e-05, + "loss": 7.3069, + "step": 756 + }, + { + "epoch": 0.004502093443714911, + "grad_norm": 4.1758246421813965, + "learning_rate": 4.9997506075054776e-05, + "loss": 7.3417, + "step": 757 + }, + { + "epoch": 0.004508040726995908, + "grad_norm": 2.8398962020874023, + "learning_rate": 4.999749947311625e-05, + "loss": 7.107, + "step": 758 + }, + { + "epoch": 0.004513988010276905, + "grad_norm": 3.487478017807007, + "learning_rate": 4.9997492862451354e-05, + "loss": 7.0014, + "step": 759 + }, + { + "epoch": 0.004519935293557903, + "grad_norm": 2.883409261703491, + "learning_rate": 4.999748624306009e-05, + "loss": 7.4691, + "step": 760 + }, + { + "epoch": 0.0045258825768389, + "grad_norm": 3.0092155933380127, + "learning_rate": 4.999747961494246e-05, + "loss": 7.3771, + "step": 761 + }, + { + "epoch": 0.004531829860119897, + "grad_norm": 2.9571943283081055, + "learning_rate": 4.999747297809847e-05, + "loss": 7.4664, + "step": 762 + }, + { + "epoch": 0.004537777143400894, + "grad_norm": 2.7476816177368164, + "learning_rate": 4.999746633252812e-05, + "loss": 7.2943, + "step": 763 + }, + { + "epoch": 0.004543724426681892, + "grad_norm": 4.903059959411621, + "learning_rate": 4.9997459678231415e-05, + "loss": 7.3467, + "step": 764 + }, + { + "epoch": 0.004549671709962889, + "grad_norm": 3.8205373287200928, + "learning_rate": 4.999745301520835e-05, + "loss": 7.2807, + "step": 765 + }, + { + "epoch": 0.0045556189932438864, + "grad_norm": 2.6003127098083496, + "learning_rate": 4.9997446343458934e-05, + "loss": 7.2736, + "step": 766 + }, + { + "epoch": 0.004561566276524883, + "grad_norm": 3.288313627243042, + "learning_rate": 4.999743966298317e-05, + "loss": 7.3832, + "step": 767 + }, + { + "epoch": 0.004567513559805881, + "grad_norm": 3.4839234352111816, + "learning_rate": 4.999743297378106e-05, + "loss": 7.2932, + "step": 768 + }, + { + "epoch": 0.004573460843086878, + "grad_norm": 3.2667462825775146, + "learning_rate": 4.99974262758526e-05, + "loss": 7.4855, + "step": 769 + }, + { + "epoch": 0.0045794081263678755, + "grad_norm": 3.3637850284576416, + "learning_rate": 4.99974195691978e-05, + "loss": 7.4864, + "step": 770 + }, + { + "epoch": 0.004585355409648872, + "grad_norm": 4.691596508026123, + "learning_rate": 4.999741285381666e-05, + "loss": 7.4751, + "step": 771 + }, + { + "epoch": 0.00459130269292987, + "grad_norm": 3.8831942081451416, + "learning_rate": 4.999740612970918e-05, + "loss": 7.4554, + "step": 772 + }, + { + "epoch": 0.004597249976210867, + "grad_norm": 2.9129562377929688, + "learning_rate": 4.999739939687536e-05, + "loss": 7.7096, + "step": 773 + }, + { + "epoch": 0.0046031972594918645, + "grad_norm": 3.928882598876953, + "learning_rate": 4.9997392655315207e-05, + "loss": 7.6453, + "step": 774 + }, + { + "epoch": 0.004609144542772861, + "grad_norm": 4.19191312789917, + "learning_rate": 4.9997385905028726e-05, + "loss": 7.6038, + "step": 775 + }, + { + "epoch": 0.004615091826053859, + "grad_norm": 2.4585883617401123, + "learning_rate": 4.999737914601591e-05, + "loss": 7.5734, + "step": 776 + }, + { + "epoch": 0.004621039109334856, + "grad_norm": 3.500932455062866, + "learning_rate": 4.9997372378276776e-05, + "loss": 7.6535, + "step": 777 + }, + { + "epoch": 0.0046269863926158535, + "grad_norm": 3.1256210803985596, + "learning_rate": 4.9997365601811306e-05, + "loss": 7.4844, + "step": 778 + }, + { + "epoch": 0.00463293367589685, + "grad_norm": 2.083902597427368, + "learning_rate": 4.999735881661952e-05, + "loss": 7.646, + "step": 779 + }, + { + "epoch": 0.004638880959177847, + "grad_norm": 2.2990450859069824, + "learning_rate": 4.999735202270142e-05, + "loss": 7.5756, + "step": 780 + }, + { + "epoch": 0.004644828242458845, + "grad_norm": 2.782463550567627, + "learning_rate": 4.9997345220057004e-05, + "loss": 7.6191, + "step": 781 + }, + { + "epoch": 0.004650775525739842, + "grad_norm": 4.157378673553467, + "learning_rate": 4.9997338408686255e-05, + "loss": 7.5265, + "step": 782 + }, + { + "epoch": 0.004656722809020839, + "grad_norm": 2.850106716156006, + "learning_rate": 4.999733158858921e-05, + "loss": 7.4562, + "step": 783 + }, + { + "epoch": 0.004662670092301836, + "grad_norm": 2.8073840141296387, + "learning_rate": 4.999732475976585e-05, + "loss": 7.3913, + "step": 784 + }, + { + "epoch": 0.004668617375582834, + "grad_norm": 2.85048770904541, + "learning_rate": 4.999731792221618e-05, + "loss": 7.3945, + "step": 785 + }, + { + "epoch": 0.004674564658863831, + "grad_norm": 2.760990619659424, + "learning_rate": 4.999731107594021e-05, + "loss": 7.6088, + "step": 786 + }, + { + "epoch": 0.004680511942144828, + "grad_norm": 2.4395666122436523, + "learning_rate": 4.9997304220937933e-05, + "loss": 7.6996, + "step": 787 + }, + { + "epoch": 0.004686459225425825, + "grad_norm": 2.5826008319854736, + "learning_rate": 4.9997297357209354e-05, + "loss": 7.5888, + "step": 788 + }, + { + "epoch": 0.004692406508706823, + "grad_norm": 3.434957981109619, + "learning_rate": 4.999729048475448e-05, + "loss": 7.4659, + "step": 789 + }, + { + "epoch": 0.00469835379198782, + "grad_norm": 4.103111743927002, + "learning_rate": 4.9997283603573306e-05, + "loss": 7.6704, + "step": 790 + }, + { + "epoch": 0.004704301075268817, + "grad_norm": 3.7879343032836914, + "learning_rate": 4.999727671366584e-05, + "loss": 7.5387, + "step": 791 + }, + { + "epoch": 0.004710248358549814, + "grad_norm": 3.706599235534668, + "learning_rate": 4.999726981503209e-05, + "loss": 7.3413, + "step": 792 + }, + { + "epoch": 0.004716195641830812, + "grad_norm": 2.1999869346618652, + "learning_rate": 4.999726290767204e-05, + "loss": 7.1809, + "step": 793 + }, + { + "epoch": 0.004722142925111809, + "grad_norm": 2.8561251163482666, + "learning_rate": 4.999725599158571e-05, + "loss": 7.3496, + "step": 794 + }, + { + "epoch": 0.004728090208392806, + "grad_norm": 3.0696613788604736, + "learning_rate": 4.99972490667731e-05, + "loss": 7.542, + "step": 795 + }, + { + "epoch": 0.004734037491673803, + "grad_norm": 2.706404685974121, + "learning_rate": 4.99972421332342e-05, + "loss": 7.4233, + "step": 796 + }, + { + "epoch": 0.004739984774954801, + "grad_norm": 2.388360023498535, + "learning_rate": 4.9997235190969025e-05, + "loss": 7.5754, + "step": 797 + }, + { + "epoch": 0.004745932058235798, + "grad_norm": 2.3414177894592285, + "learning_rate": 4.999722823997758e-05, + "loss": 7.438, + "step": 798 + }, + { + "epoch": 0.004751879341516795, + "grad_norm": 2.46012544631958, + "learning_rate": 4.999722128025985e-05, + "loss": 6.9522, + "step": 799 + }, + { + "epoch": 0.004757826624797792, + "grad_norm": 2.5721335411071777, + "learning_rate": 4.9997214311815855e-05, + "loss": 6.9632, + "step": 800 + }, + { + "epoch": 0.00476377390807879, + "grad_norm": 2.4028279781341553, + "learning_rate": 4.999720733464559e-05, + "loss": 7.3834, + "step": 801 + }, + { + "epoch": 0.004769721191359787, + "grad_norm": 2.378971576690674, + "learning_rate": 4.9997200348749055e-05, + "loss": 7.7919, + "step": 802 + }, + { + "epoch": 0.004775668474640784, + "grad_norm": 2.1871516704559326, + "learning_rate": 4.999719335412626e-05, + "loss": 7.6832, + "step": 803 + }, + { + "epoch": 0.004781615757921781, + "grad_norm": 2.4183239936828613, + "learning_rate": 4.9997186350777206e-05, + "loss": 7.5013, + "step": 804 + }, + { + "epoch": 0.004787563041202779, + "grad_norm": 2.2322120666503906, + "learning_rate": 4.9997179338701884e-05, + "loss": 7.4224, + "step": 805 + }, + { + "epoch": 0.004793510324483776, + "grad_norm": 3.2633447647094727, + "learning_rate": 4.99971723179003e-05, + "loss": 7.1966, + "step": 806 + }, + { + "epoch": 0.004799457607764773, + "grad_norm": 3.1195995807647705, + "learning_rate": 4.999716528837247e-05, + "loss": 7.4057, + "step": 807 + }, + { + "epoch": 0.00480540489104577, + "grad_norm": 2.6904098987579346, + "learning_rate": 4.9997158250118395e-05, + "loss": 7.4585, + "step": 808 + }, + { + "epoch": 0.004811352174326768, + "grad_norm": 2.6955599784851074, + "learning_rate": 4.999715120313806e-05, + "loss": 7.6053, + "step": 809 + }, + { + "epoch": 0.004817299457607765, + "grad_norm": 3.569037675857544, + "learning_rate": 4.999714414743148e-05, + "loss": 7.5085, + "step": 810 + }, + { + "epoch": 0.004823246740888762, + "grad_norm": 3.5231528282165527, + "learning_rate": 4.9997137082998655e-05, + "loss": 7.4554, + "step": 811 + }, + { + "epoch": 0.004829194024169759, + "grad_norm": 2.7118120193481445, + "learning_rate": 4.999713000983959e-05, + "loss": 7.4323, + "step": 812 + }, + { + "epoch": 0.004835141307450756, + "grad_norm": 3.229548931121826, + "learning_rate": 4.9997122927954284e-05, + "loss": 7.3098, + "step": 813 + }, + { + "epoch": 0.004841088590731754, + "grad_norm": 2.4224696159362793, + "learning_rate": 4.999711583734273e-05, + "loss": 7.3488, + "step": 814 + }, + { + "epoch": 0.004847035874012751, + "grad_norm": 2.627565383911133, + "learning_rate": 4.999710873800496e-05, + "loss": 7.457, + "step": 815 + }, + { + "epoch": 0.004852983157293748, + "grad_norm": 2.5339515209198, + "learning_rate": 4.999710162994094e-05, + "loss": 7.6602, + "step": 816 + }, + { + "epoch": 0.004858930440574745, + "grad_norm": 2.663694143295288, + "learning_rate": 4.9997094513150706e-05, + "loss": 7.1064, + "step": 817 + }, + { + "epoch": 0.004864877723855743, + "grad_norm": 2.372504472732544, + "learning_rate": 4.9997087387634234e-05, + "loss": 7.341, + "step": 818 + }, + { + "epoch": 0.00487082500713674, + "grad_norm": 2.145191192626953, + "learning_rate": 4.999708025339154e-05, + "loss": 7.3216, + "step": 819 + }, + { + "epoch": 0.004876772290417737, + "grad_norm": 2.39685320854187, + "learning_rate": 4.9997073110422626e-05, + "loss": 7.3463, + "step": 820 + }, + { + "epoch": 0.004882719573698734, + "grad_norm": 2.2227275371551514, + "learning_rate": 4.999706595872749e-05, + "loss": 7.2517, + "step": 821 + }, + { + "epoch": 0.004888666856979732, + "grad_norm": 2.7770352363586426, + "learning_rate": 4.999705879830614e-05, + "loss": 7.3117, + "step": 822 + }, + { + "epoch": 0.004894614140260729, + "grad_norm": 2.448026180267334, + "learning_rate": 4.999705162915857e-05, + "loss": 6.9883, + "step": 823 + }, + { + "epoch": 0.004900561423541726, + "grad_norm": 2.2304437160491943, + "learning_rate": 4.999704445128479e-05, + "loss": 7.2644, + "step": 824 + }, + { + "epoch": 0.004906508706822723, + "grad_norm": 2.351707696914673, + "learning_rate": 4.9997037264684796e-05, + "loss": 7.1984, + "step": 825 + }, + { + "epoch": 0.004912455990103721, + "grad_norm": 2.7631921768188477, + "learning_rate": 4.99970300693586e-05, + "loss": 7.3774, + "step": 826 + }, + { + "epoch": 0.004918403273384718, + "grad_norm": 2.4636785984039307, + "learning_rate": 4.9997022865306195e-05, + "loss": 7.3778, + "step": 827 + }, + { + "epoch": 0.004924350556665715, + "grad_norm": 3.5510878562927246, + "learning_rate": 4.999701565252759e-05, + "loss": 7.166, + "step": 828 + }, + { + "epoch": 0.004930297839946712, + "grad_norm": 3.2581429481506348, + "learning_rate": 4.999700843102278e-05, + "loss": 7.286, + "step": 829 + }, + { + "epoch": 0.00493624512322771, + "grad_norm": 2.4304182529449463, + "learning_rate": 4.999700120079178e-05, + "loss": 7.5076, + "step": 830 + }, + { + "epoch": 0.004942192406508707, + "grad_norm": 2.428854465484619, + "learning_rate": 4.999699396183458e-05, + "loss": 7.405, + "step": 831 + }, + { + "epoch": 0.004948139689789704, + "grad_norm": 2.7680416107177734, + "learning_rate": 4.9996986714151195e-05, + "loss": 7.4944, + "step": 832 + }, + { + "epoch": 0.004954086973070701, + "grad_norm": 2.6787109375, + "learning_rate": 4.999697945774161e-05, + "loss": 7.5946, + "step": 833 + }, + { + "epoch": 0.004960034256351699, + "grad_norm": 2.6396615505218506, + "learning_rate": 4.9996972192605845e-05, + "loss": 7.5405, + "step": 834 + }, + { + "epoch": 0.004965981539632696, + "grad_norm": 2.89387583732605, + "learning_rate": 4.999696491874389e-05, + "loss": 7.3809, + "step": 835 + }, + { + "epoch": 0.004971928822913693, + "grad_norm": 2.332838535308838, + "learning_rate": 4.999695763615576e-05, + "loss": 7.3638, + "step": 836 + }, + { + "epoch": 0.00497787610619469, + "grad_norm": 2.2880585193634033, + "learning_rate": 4.9996950344841444e-05, + "loss": 7.3557, + "step": 837 + }, + { + "epoch": 0.004983823389475688, + "grad_norm": 2.7478256225585938, + "learning_rate": 4.999694304480096e-05, + "loss": 7.4, + "step": 838 + }, + { + "epoch": 0.004989770672756685, + "grad_norm": 3.4789531230926514, + "learning_rate": 4.999693573603429e-05, + "loss": 7.4438, + "step": 839 + }, + { + "epoch": 0.004995717956037682, + "grad_norm": 2.7377078533172607, + "learning_rate": 4.9996928418541455e-05, + "loss": 7.4074, + "step": 840 + }, + { + "epoch": 0.005001665239318679, + "grad_norm": 3.04420804977417, + "learning_rate": 4.9996921092322444e-05, + "loss": 7.3834, + "step": 841 + }, + { + "epoch": 0.005007612522599676, + "grad_norm": 2.759244203567505, + "learning_rate": 4.999691375737727e-05, + "loss": 7.4492, + "step": 842 + }, + { + "epoch": 0.005013559805880674, + "grad_norm": 2.5327556133270264, + "learning_rate": 4.9996906413705933e-05, + "loss": 7.4403, + "step": 843 + }, + { + "epoch": 0.0050195070891616705, + "grad_norm": 2.8170409202575684, + "learning_rate": 4.9996899061308434e-05, + "loss": 7.623, + "step": 844 + }, + { + "epoch": 0.005025454372442668, + "grad_norm": 3.8642547130584717, + "learning_rate": 4.9996891700184774e-05, + "loss": 7.6099, + "step": 845 + }, + { + "epoch": 0.005031401655723665, + "grad_norm": 4.704552173614502, + "learning_rate": 4.999688433033496e-05, + "loss": 7.6755, + "step": 846 + }, + { + "epoch": 0.005037348939004663, + "grad_norm": 4.128530979156494, + "learning_rate": 4.9996876951758986e-05, + "loss": 7.5246, + "step": 847 + }, + { + "epoch": 0.0050432962222856596, + "grad_norm": 2.233447551727295, + "learning_rate": 4.9996869564456865e-05, + "loss": 7.1139, + "step": 848 + }, + { + "epoch": 0.005049243505566657, + "grad_norm": 5.96085262298584, + "learning_rate": 4.999686216842859e-05, + "loss": 7.4114, + "step": 849 + }, + { + "epoch": 0.005055190788847654, + "grad_norm": 4.828244686126709, + "learning_rate": 4.9996854763674175e-05, + "loss": 7.6743, + "step": 850 + }, + { + "epoch": 0.005061138072128652, + "grad_norm": 3.0259342193603516, + "learning_rate": 4.999684735019362e-05, + "loss": 7.7537, + "step": 851 + }, + { + "epoch": 0.005067085355409649, + "grad_norm": 2.807244062423706, + "learning_rate": 4.999683992798692e-05, + "loss": 7.7744, + "step": 852 + }, + { + "epoch": 0.005073032638690646, + "grad_norm": 2.81384015083313, + "learning_rate": 4.999683249705408e-05, + "loss": 7.2922, + "step": 853 + }, + { + "epoch": 0.005078979921971643, + "grad_norm": 2.582836627960205, + "learning_rate": 4.9996825057395105e-05, + "loss": 7.3421, + "step": 854 + }, + { + "epoch": 0.005084927205252641, + "grad_norm": 2.190634250640869, + "learning_rate": 4.9996817609009996e-05, + "loss": 7.6249, + "step": 855 + }, + { + "epoch": 0.005090874488533638, + "grad_norm": 2.3322219848632812, + "learning_rate": 4.999681015189875e-05, + "loss": 7.4695, + "step": 856 + }, + { + "epoch": 0.005096821771814635, + "grad_norm": 2.5582947731018066, + "learning_rate": 4.9996802686061384e-05, + "loss": 7.2747, + "step": 857 + }, + { + "epoch": 0.005102769055095632, + "grad_norm": 3.192093849182129, + "learning_rate": 4.999679521149789e-05, + "loss": 7.504, + "step": 858 + }, + { + "epoch": 0.00510871633837663, + "grad_norm": 4.1585588455200195, + "learning_rate": 4.999678772820827e-05, + "loss": 7.5966, + "step": 859 + }, + { + "epoch": 0.005114663621657627, + "grad_norm": 5.052750587463379, + "learning_rate": 4.999678023619253e-05, + "loss": 7.3243, + "step": 860 + }, + { + "epoch": 0.005120610904938624, + "grad_norm": 2.395909070968628, + "learning_rate": 4.999677273545068e-05, + "loss": 7.4477, + "step": 861 + }, + { + "epoch": 0.005126558188219621, + "grad_norm": 2.487334966659546, + "learning_rate": 4.999676522598271e-05, + "loss": 7.591, + "step": 862 + }, + { + "epoch": 0.005132505471500619, + "grad_norm": 3.7094171047210693, + "learning_rate": 4.999675770778863e-05, + "loss": 7.5387, + "step": 863 + }, + { + "epoch": 0.005138452754781616, + "grad_norm": 4.468298435211182, + "learning_rate": 4.9996750180868435e-05, + "loss": 7.5754, + "step": 864 + }, + { + "epoch": 0.005144400038062613, + "grad_norm": 3.2769386768341064, + "learning_rate": 4.999674264522213e-05, + "loss": 7.459, + "step": 865 + }, + { + "epoch": 0.00515034732134361, + "grad_norm": 2.7162864208221436, + "learning_rate": 4.9996735100849726e-05, + "loss": 7.3473, + "step": 866 + }, + { + "epoch": 0.005156294604624608, + "grad_norm": 3.646401882171631, + "learning_rate": 4.999672754775122e-05, + "loss": 7.4446, + "step": 867 + }, + { + "epoch": 0.005162241887905605, + "grad_norm": 8.917684555053711, + "learning_rate": 4.999671998592662e-05, + "loss": 7.2016, + "step": 868 + }, + { + "epoch": 0.005168189171186602, + "grad_norm": 2.949993133544922, + "learning_rate": 4.999671241537591e-05, + "loss": 7.3081, + "step": 869 + }, + { + "epoch": 0.005174136454467599, + "grad_norm": 2.4531025886535645, + "learning_rate": 4.999670483609912e-05, + "loss": 7.402, + "step": 870 + }, + { + "epoch": 0.005180083737748597, + "grad_norm": 3.1903798580169678, + "learning_rate": 4.999669724809623e-05, + "loss": 7.2514, + "step": 871 + }, + { + "epoch": 0.005186031021029594, + "grad_norm": 3.461353302001953, + "learning_rate": 4.999668965136726e-05, + "loss": 7.1637, + "step": 872 + }, + { + "epoch": 0.005191978304310591, + "grad_norm": 2.623075246810913, + "learning_rate": 4.9996682045912194e-05, + "loss": 7.5482, + "step": 873 + }, + { + "epoch": 0.005197925587591588, + "grad_norm": 2.9072840213775635, + "learning_rate": 4.9996674431731044e-05, + "loss": 7.484, + "step": 874 + }, + { + "epoch": 0.005203872870872585, + "grad_norm": 3.0219666957855225, + "learning_rate": 4.999666680882382e-05, + "loss": 7.5223, + "step": 875 + }, + { + "epoch": 0.005209820154153583, + "grad_norm": 2.9892475605010986, + "learning_rate": 4.9996659177190514e-05, + "loss": 7.3843, + "step": 876 + }, + { + "epoch": 0.0052157674374345795, + "grad_norm": 2.6199591159820557, + "learning_rate": 4.9996651536831126e-05, + "loss": 7.2728, + "step": 877 + }, + { + "epoch": 0.005221714720715577, + "grad_norm": 2.6897647380828857, + "learning_rate": 4.999664388774567e-05, + "loss": 7.5323, + "step": 878 + }, + { + "epoch": 0.005227662003996574, + "grad_norm": 3.5945560932159424, + "learning_rate": 4.9996636229934155e-05, + "loss": 7.5001, + "step": 879 + }, + { + "epoch": 0.005233609287277572, + "grad_norm": 2.9064812660217285, + "learning_rate": 4.9996628563396563e-05, + "loss": 7.5463, + "step": 880 + }, + { + "epoch": 0.0052395565705585685, + "grad_norm": 3.6150660514831543, + "learning_rate": 4.999662088813291e-05, + "loss": 7.6596, + "step": 881 + }, + { + "epoch": 0.005245503853839566, + "grad_norm": 2.729684591293335, + "learning_rate": 4.99966132041432e-05, + "loss": 7.5342, + "step": 882 + }, + { + "epoch": 0.005251451137120563, + "grad_norm": 2.6782853603363037, + "learning_rate": 4.9996605511427416e-05, + "loss": 7.5837, + "step": 883 + }, + { + "epoch": 0.005257398420401561, + "grad_norm": 4.171568393707275, + "learning_rate": 4.9996597809985576e-05, + "loss": 7.3626, + "step": 884 + }, + { + "epoch": 0.0052633457036825575, + "grad_norm": 2.189725637435913, + "learning_rate": 4.999659009981769e-05, + "loss": 7.5431, + "step": 885 + }, + { + "epoch": 0.005269292986963555, + "grad_norm": 2.2473320960998535, + "learning_rate": 4.999658238092375e-05, + "loss": 7.4731, + "step": 886 + }, + { + "epoch": 0.005275240270244552, + "grad_norm": 3.4393012523651123, + "learning_rate": 4.999657465330376e-05, + "loss": 7.6839, + "step": 887 + }, + { + "epoch": 0.00528118755352555, + "grad_norm": 2.717742919921875, + "learning_rate": 4.9996566916957735e-05, + "loss": 7.6812, + "step": 888 + }, + { + "epoch": 0.0052871348368065466, + "grad_norm": 3.829698085784912, + "learning_rate": 4.9996559171885655e-05, + "loss": 7.4525, + "step": 889 + }, + { + "epoch": 0.005293082120087544, + "grad_norm": 2.764598846435547, + "learning_rate": 4.9996551418087536e-05, + "loss": 7.5379, + "step": 890 + }, + { + "epoch": 0.005299029403368541, + "grad_norm": 2.4230268001556396, + "learning_rate": 4.999654365556338e-05, + "loss": 7.454, + "step": 891 + }, + { + "epoch": 0.005304976686649539, + "grad_norm": 2.31870436668396, + "learning_rate": 4.999653588431319e-05, + "loss": 7.5306, + "step": 892 + }, + { + "epoch": 0.005310923969930536, + "grad_norm": 2.332259178161621, + "learning_rate": 4.999652810433697e-05, + "loss": 7.4008, + "step": 893 + }, + { + "epoch": 0.005316871253211533, + "grad_norm": 2.630568504333496, + "learning_rate": 4.999652031563471e-05, + "loss": 7.4046, + "step": 894 + }, + { + "epoch": 0.00532281853649253, + "grad_norm": 3.327211856842041, + "learning_rate": 4.999651251820643e-05, + "loss": 7.2901, + "step": 895 + }, + { + "epoch": 0.005328765819773528, + "grad_norm": 2.2383713722229004, + "learning_rate": 4.999650471205213e-05, + "loss": 7.5116, + "step": 896 + }, + { + "epoch": 0.005334713103054525, + "grad_norm": 2.972820997238159, + "learning_rate": 4.99964968971718e-05, + "loss": 7.4013, + "step": 897 + }, + { + "epoch": 0.005340660386335522, + "grad_norm": 2.7254672050476074, + "learning_rate": 4.999648907356545e-05, + "loss": 7.3174, + "step": 898 + }, + { + "epoch": 0.005346607669616519, + "grad_norm": 2.6943607330322266, + "learning_rate": 4.9996481241233096e-05, + "loss": 7.386, + "step": 899 + }, + { + "epoch": 0.005352554952897517, + "grad_norm": 2.9217519760131836, + "learning_rate": 4.999647340017473e-05, + "loss": 7.5398, + "step": 900 + }, + { + "epoch": 0.005358502236178514, + "grad_norm": 2.7950780391693115, + "learning_rate": 4.999646555039034e-05, + "loss": 7.6336, + "step": 901 + }, + { + "epoch": 0.005364449519459511, + "grad_norm": 2.763364553451538, + "learning_rate": 4.999645769187995e-05, + "loss": 7.5161, + "step": 902 + }, + { + "epoch": 0.005370396802740508, + "grad_norm": 2.3095102310180664, + "learning_rate": 4.999644982464355e-05, + "loss": 7.5859, + "step": 903 + }, + { + "epoch": 0.005376344086021506, + "grad_norm": 2.7287917137145996, + "learning_rate": 4.999644194868115e-05, + "loss": 7.3983, + "step": 904 + }, + { + "epoch": 0.005382291369302503, + "grad_norm": 2.6175942420959473, + "learning_rate": 4.999643406399275e-05, + "loss": 7.4278, + "step": 905 + }, + { + "epoch": 0.0053882386525834994, + "grad_norm": 2.3898375034332275, + "learning_rate": 4.999642617057835e-05, + "loss": 7.4537, + "step": 906 + }, + { + "epoch": 0.005394185935864497, + "grad_norm": 2.964381694793701, + "learning_rate": 4.999641826843796e-05, + "loss": 7.3258, + "step": 907 + }, + { + "epoch": 0.005400133219145494, + "grad_norm": 3.1146717071533203, + "learning_rate": 4.999641035757158e-05, + "loss": 7.5412, + "step": 908 + }, + { + "epoch": 0.005406080502426492, + "grad_norm": 3.4733238220214844, + "learning_rate": 4.999640243797921e-05, + "loss": 7.423, + "step": 909 + }, + { + "epoch": 0.0054120277857074885, + "grad_norm": 3.621044158935547, + "learning_rate": 4.999639450966085e-05, + "loss": 7.5885, + "step": 910 + }, + { + "epoch": 0.005417975068988486, + "grad_norm": 2.4800662994384766, + "learning_rate": 4.999638657261651e-05, + "loss": 7.5231, + "step": 911 + }, + { + "epoch": 0.005423922352269483, + "grad_norm": 3.3247363567352295, + "learning_rate": 4.999637862684619e-05, + "loss": 7.2367, + "step": 912 + }, + { + "epoch": 0.005429869635550481, + "grad_norm": 4.293686866760254, + "learning_rate": 4.999637067234989e-05, + "loss": 6.8423, + "step": 913 + }, + { + "epoch": 0.0054358169188314775, + "grad_norm": 2.6713979244232178, + "learning_rate": 4.999636270912762e-05, + "loss": 6.7962, + "step": 914 + }, + { + "epoch": 0.005441764202112475, + "grad_norm": 2.9386653900146484, + "learning_rate": 4.9996354737179376e-05, + "loss": 6.7582, + "step": 915 + }, + { + "epoch": 0.005447711485393472, + "grad_norm": 2.8030481338500977, + "learning_rate": 4.999634675650516e-05, + "loss": 6.6516, + "step": 916 + }, + { + "epoch": 0.00545365876867447, + "grad_norm": 2.7315666675567627, + "learning_rate": 4.9996338767104985e-05, + "loss": 6.6159, + "step": 917 + }, + { + "epoch": 0.0054596060519554665, + "grad_norm": 3.116098403930664, + "learning_rate": 4.999633076897884e-05, + "loss": 7.2121, + "step": 918 + }, + { + "epoch": 0.005465553335236464, + "grad_norm": 2.867687940597534, + "learning_rate": 4.999632276212673e-05, + "loss": 7.5124, + "step": 919 + }, + { + "epoch": 0.005471500618517461, + "grad_norm": 2.9864203929901123, + "learning_rate": 4.9996314746548676e-05, + "loss": 7.5168, + "step": 920 + }, + { + "epoch": 0.005477447901798459, + "grad_norm": 2.9083375930786133, + "learning_rate": 4.9996306722244656e-05, + "loss": 7.5027, + "step": 921 + }, + { + "epoch": 0.0054833951850794555, + "grad_norm": 2.5569801330566406, + "learning_rate": 4.9996298689214686e-05, + "loss": 7.2988, + "step": 922 + }, + { + "epoch": 0.005489342468360453, + "grad_norm": 3.7101242542266846, + "learning_rate": 4.9996290647458765e-05, + "loss": 7.33, + "step": 923 + }, + { + "epoch": 0.00549528975164145, + "grad_norm": 2.848881244659424, + "learning_rate": 4.99962825969769e-05, + "loss": 7.4534, + "step": 924 + }, + { + "epoch": 0.005501237034922448, + "grad_norm": 3.072282075881958, + "learning_rate": 4.999627453776909e-05, + "loss": 7.4398, + "step": 925 + }, + { + "epoch": 0.0055071843182034445, + "grad_norm": 2.8132996559143066, + "learning_rate": 4.999626646983534e-05, + "loss": 7.5617, + "step": 926 + }, + { + "epoch": 0.005513131601484442, + "grad_norm": 2.2710142135620117, + "learning_rate": 4.999625839317565e-05, + "loss": 7.5975, + "step": 927 + }, + { + "epoch": 0.005519078884765439, + "grad_norm": 2.745007276535034, + "learning_rate": 4.9996250307790026e-05, + "loss": 7.4599, + "step": 928 + }, + { + "epoch": 0.005525026168046437, + "grad_norm": 3.2031302452087402, + "learning_rate": 4.999624221367847e-05, + "loss": 7.3528, + "step": 929 + }, + { + "epoch": 0.0055309734513274336, + "grad_norm": 6.417830467224121, + "learning_rate": 4.999623411084098e-05, + "loss": 7.5118, + "step": 930 + }, + { + "epoch": 0.005536920734608431, + "grad_norm": 2.7960314750671387, + "learning_rate": 4.999622599927756e-05, + "loss": 6.5016, + "step": 931 + }, + { + "epoch": 0.005542868017889428, + "grad_norm": 2.959507703781128, + "learning_rate": 4.999621787898822e-05, + "loss": 7.6521, + "step": 932 + }, + { + "epoch": 0.005548815301170426, + "grad_norm": 3.328834056854248, + "learning_rate": 4.999620974997296e-05, + "loss": 7.6267, + "step": 933 + }, + { + "epoch": 0.005554762584451423, + "grad_norm": 2.5232200622558594, + "learning_rate": 4.9996201612231786e-05, + "loss": 7.471, + "step": 934 + }, + { + "epoch": 0.00556070986773242, + "grad_norm": 2.2766942977905273, + "learning_rate": 4.999619346576468e-05, + "loss": 7.4204, + "step": 935 + }, + { + "epoch": 0.005566657151013417, + "grad_norm": 2.584068536758423, + "learning_rate": 4.999618531057168e-05, + "loss": 7.4384, + "step": 936 + }, + { + "epoch": 0.005572604434294414, + "grad_norm": 3.004523277282715, + "learning_rate": 4.999617714665276e-05, + "loss": 7.5681, + "step": 937 + }, + { + "epoch": 0.005578551717575412, + "grad_norm": 4.102936267852783, + "learning_rate": 4.999616897400794e-05, + "loss": 7.4571, + "step": 938 + }, + { + "epoch": 0.005584499000856408, + "grad_norm": 2.745293378829956, + "learning_rate": 4.99961607926372e-05, + "loss": 7.588, + "step": 939 + }, + { + "epoch": 0.005590446284137406, + "grad_norm": 2.9720282554626465, + "learning_rate": 4.9996152602540576e-05, + "loss": 7.4761, + "step": 940 + }, + { + "epoch": 0.005596393567418403, + "grad_norm": 3.150047540664673, + "learning_rate": 4.999614440371805e-05, + "loss": 7.4525, + "step": 941 + }, + { + "epoch": 0.005602340850699401, + "grad_norm": 2.6735856533050537, + "learning_rate": 4.999613619616962e-05, + "loss": 7.2754, + "step": 942 + }, + { + "epoch": 0.005608288133980397, + "grad_norm": 2.6451661586761475, + "learning_rate": 4.9996127979895304e-05, + "loss": 7.5742, + "step": 943 + }, + { + "epoch": 0.005614235417261395, + "grad_norm": 2.7551536560058594, + "learning_rate": 4.9996119754895095e-05, + "loss": 7.4981, + "step": 944 + }, + { + "epoch": 0.005620182700542392, + "grad_norm": 2.7445640563964844, + "learning_rate": 4.9996111521168995e-05, + "loss": 7.4761, + "step": 945 + }, + { + "epoch": 0.00562612998382339, + "grad_norm": 2.537924289703369, + "learning_rate": 4.9996103278717013e-05, + "loss": 7.5483, + "step": 946 + }, + { + "epoch": 0.0056320772671043864, + "grad_norm": 3.503661632537842, + "learning_rate": 4.9996095027539156e-05, + "loss": 7.3074, + "step": 947 + }, + { + "epoch": 0.005638024550385384, + "grad_norm": 2.8088479042053223, + "learning_rate": 4.999608676763542e-05, + "loss": 7.5675, + "step": 948 + }, + { + "epoch": 0.005643971833666381, + "grad_norm": 2.6219863891601562, + "learning_rate": 4.99960784990058e-05, + "loss": 7.6037, + "step": 949 + }, + { + "epoch": 0.005649919116947379, + "grad_norm": 2.88737416267395, + "learning_rate": 4.999607022165031e-05, + "loss": 7.4815, + "step": 950 + }, + { + "epoch": 0.0056558664002283755, + "grad_norm": 2.455707550048828, + "learning_rate": 4.999606193556895e-05, + "loss": 7.553, + "step": 951 + }, + { + "epoch": 0.005661813683509373, + "grad_norm": 2.2502405643463135, + "learning_rate": 4.999605364076173e-05, + "loss": 7.387, + "step": 952 + }, + { + "epoch": 0.00566776096679037, + "grad_norm": 2.754972457885742, + "learning_rate": 4.9996045337228635e-05, + "loss": 7.3088, + "step": 953 + }, + { + "epoch": 0.005673708250071368, + "grad_norm": 3.111553192138672, + "learning_rate": 4.9996037024969686e-05, + "loss": 7.5063, + "step": 954 + }, + { + "epoch": 0.0056796555333523645, + "grad_norm": 2.4000720977783203, + "learning_rate": 4.9996028703984875e-05, + "loss": 7.5705, + "step": 955 + }, + { + "epoch": 0.005685602816633362, + "grad_norm": 2.495659351348877, + "learning_rate": 4.9996020374274215e-05, + "loss": 7.5421, + "step": 956 + }, + { + "epoch": 0.005691550099914359, + "grad_norm": 3.025509834289551, + "learning_rate": 4.99960120358377e-05, + "loss": 7.5406, + "step": 957 + }, + { + "epoch": 0.005697497383195357, + "grad_norm": 2.224342107772827, + "learning_rate": 4.999600368867533e-05, + "loss": 7.4323, + "step": 958 + }, + { + "epoch": 0.0057034446664763535, + "grad_norm": 2.661423683166504, + "learning_rate": 4.999599533278712e-05, + "loss": 7.565, + "step": 959 + }, + { + "epoch": 0.005709391949757351, + "grad_norm": 2.503293037414551, + "learning_rate": 4.999598696817307e-05, + "loss": 7.3552, + "step": 960 + }, + { + "epoch": 0.005715339233038348, + "grad_norm": 2.2878923416137695, + "learning_rate": 4.999597859483316e-05, + "loss": 7.4542, + "step": 961 + }, + { + "epoch": 0.005721286516319346, + "grad_norm": 2.759594678878784, + "learning_rate": 4.999597021276743e-05, + "loss": 7.2349, + "step": 962 + }, + { + "epoch": 0.0057272337996003425, + "grad_norm": 4.5453314781188965, + "learning_rate": 4.999596182197586e-05, + "loss": 7.4728, + "step": 963 + }, + { + "epoch": 0.00573318108288134, + "grad_norm": 2.4369568824768066, + "learning_rate": 4.999595342245846e-05, + "loss": 7.4396, + "step": 964 + }, + { + "epoch": 0.005739128366162337, + "grad_norm": 2.4081692695617676, + "learning_rate": 4.999594501421523e-05, + "loss": 7.536, + "step": 965 + }, + { + "epoch": 0.005745075649443335, + "grad_norm": 3.0494678020477295, + "learning_rate": 4.9995936597246176e-05, + "loss": 7.4061, + "step": 966 + }, + { + "epoch": 0.0057510229327243315, + "grad_norm": 3.3492188453674316, + "learning_rate": 4.999592817155129e-05, + "loss": 7.5419, + "step": 967 + }, + { + "epoch": 0.005756970216005328, + "grad_norm": 2.254714012145996, + "learning_rate": 4.999591973713059e-05, + "loss": 7.4568, + "step": 968 + }, + { + "epoch": 0.005762917499286326, + "grad_norm": 2.3336634635925293, + "learning_rate": 4.999591129398407e-05, + "loss": 7.4386, + "step": 969 + }, + { + "epoch": 0.005768864782567323, + "grad_norm": 2.545154094696045, + "learning_rate": 4.999590284211174e-05, + "loss": 7.226, + "step": 970 + }, + { + "epoch": 0.0057748120658483205, + "grad_norm": 2.891068458557129, + "learning_rate": 4.99958943815136e-05, + "loss": 7.4235, + "step": 971 + }, + { + "epoch": 0.005780759349129317, + "grad_norm": 3.0321712493896484, + "learning_rate": 4.999588591218964e-05, + "loss": 7.2918, + "step": 972 + }, + { + "epoch": 0.005786706632410315, + "grad_norm": 2.935490846633911, + "learning_rate": 4.9995877434139884e-05, + "loss": 7.4172, + "step": 973 + }, + { + "epoch": 0.005792653915691312, + "grad_norm": 3.0021424293518066, + "learning_rate": 4.9995868947364324e-05, + "loss": 7.521, + "step": 974 + }, + { + "epoch": 0.0057986011989723096, + "grad_norm": 2.2784783840179443, + "learning_rate": 4.9995860451862964e-05, + "loss": 7.5716, + "step": 975 + }, + { + "epoch": 0.005804548482253306, + "grad_norm": 2.9321484565734863, + "learning_rate": 4.999585194763581e-05, + "loss": 7.0965, + "step": 976 + }, + { + "epoch": 0.005810495765534304, + "grad_norm": 2.284874439239502, + "learning_rate": 4.999584343468285e-05, + "loss": 7.4376, + "step": 977 + }, + { + "epoch": 0.005816443048815301, + "grad_norm": 2.2066683769226074, + "learning_rate": 4.9995834913004115e-05, + "loss": 7.4478, + "step": 978 + }, + { + "epoch": 0.005822390332096299, + "grad_norm": 2.286323070526123, + "learning_rate": 4.999582638259959e-05, + "loss": 7.4139, + "step": 979 + }, + { + "epoch": 0.005828337615377295, + "grad_norm": 2.5052928924560547, + "learning_rate": 4.999581784346927e-05, + "loss": 7.4278, + "step": 980 + }, + { + "epoch": 0.005834284898658293, + "grad_norm": 2.273698091506958, + "learning_rate": 4.9995809295613175e-05, + "loss": 7.4019, + "step": 981 + }, + { + "epoch": 0.00584023218193929, + "grad_norm": 2.729466676712036, + "learning_rate": 4.999580073903129e-05, + "loss": 7.4716, + "step": 982 + }, + { + "epoch": 0.005846179465220288, + "grad_norm": 2.5776185989379883, + "learning_rate": 4.999579217372365e-05, + "loss": 7.4708, + "step": 983 + }, + { + "epoch": 0.005852126748501284, + "grad_norm": 2.4125893115997314, + "learning_rate": 4.9995783599690226e-05, + "loss": 7.4505, + "step": 984 + }, + { + "epoch": 0.005858074031782282, + "grad_norm": 2.975911855697632, + "learning_rate": 4.9995775016931035e-05, + "loss": 7.4095, + "step": 985 + }, + { + "epoch": 0.005864021315063279, + "grad_norm": 2.4155962467193604, + "learning_rate": 4.9995766425446076e-05, + "loss": 7.3084, + "step": 986 + }, + { + "epoch": 0.005869968598344277, + "grad_norm": 2.436950922012329, + "learning_rate": 4.999575782523535e-05, + "loss": 7.2782, + "step": 987 + }, + { + "epoch": 0.0058759158816252734, + "grad_norm": 2.2371575832366943, + "learning_rate": 4.999574921629887e-05, + "loss": 7.3879, + "step": 988 + }, + { + "epoch": 0.005881863164906271, + "grad_norm": 2.3079733848571777, + "learning_rate": 4.999574059863663e-05, + "loss": 7.5117, + "step": 989 + }, + { + "epoch": 0.005887810448187268, + "grad_norm": 2.4018514156341553, + "learning_rate": 4.9995731972248626e-05, + "loss": 7.4486, + "step": 990 + }, + { + "epoch": 0.005893757731468266, + "grad_norm": 2.3437294960021973, + "learning_rate": 4.9995723337134884e-05, + "loss": 7.461, + "step": 991 + }, + { + "epoch": 0.0058997050147492625, + "grad_norm": 3.15254545211792, + "learning_rate": 4.999571469329538e-05, + "loss": 7.014, + "step": 992 + }, + { + "epoch": 0.00590565229803026, + "grad_norm": 2.4809768199920654, + "learning_rate": 4.999570604073014e-05, + "loss": 7.4339, + "step": 993 + }, + { + "epoch": 0.005911599581311257, + "grad_norm": 3.4286630153656006, + "learning_rate": 4.9995697379439154e-05, + "loss": 7.3086, + "step": 994 + }, + { + "epoch": 0.005917546864592255, + "grad_norm": 3.9362127780914307, + "learning_rate": 4.999568870942243e-05, + "loss": 7.2635, + "step": 995 + }, + { + "epoch": 0.0059234941478732515, + "grad_norm": 2.6632091999053955, + "learning_rate": 4.9995680030679965e-05, + "loss": 7.2779, + "step": 996 + }, + { + "epoch": 0.005929441431154249, + "grad_norm": 5.218096733093262, + "learning_rate": 4.999567134321177e-05, + "loss": 7.4285, + "step": 997 + }, + { + "epoch": 0.005935388714435246, + "grad_norm": 3.441894769668579, + "learning_rate": 4.9995662647017835e-05, + "loss": 7.5576, + "step": 998 + }, + { + "epoch": 0.005941335997716243, + "grad_norm": 2.560178279876709, + "learning_rate": 4.9995653942098184e-05, + "loss": 7.5692, + "step": 999 + }, + { + "epoch": 0.0059472832809972405, + "grad_norm": 2.458313226699829, + "learning_rate": 4.999564522845281e-05, + "loss": 7.0495, + "step": 1000 + }, + { + "epoch": 0.005953230564278237, + "grad_norm": 2.539314031600952, + "learning_rate": 4.999563650608171e-05, + "loss": 7.1919, + "step": 1001 + }, + { + "epoch": 0.005959177847559235, + "grad_norm": 3.6134390830993652, + "learning_rate": 4.999562777498489e-05, + "loss": 7.0725, + "step": 1002 + }, + { + "epoch": 0.005965125130840232, + "grad_norm": 2.6582295894622803, + "learning_rate": 4.9995619035162355e-05, + "loss": 7.3008, + "step": 1003 + }, + { + "epoch": 0.0059710724141212295, + "grad_norm": 2.4968035221099854, + "learning_rate": 4.999561028661411e-05, + "loss": 7.2862, + "step": 1004 + }, + { + "epoch": 0.005977019697402226, + "grad_norm": 3.002840042114258, + "learning_rate": 4.999560152934015e-05, + "loss": 7.1721, + "step": 1005 + }, + { + "epoch": 0.005982966980683224, + "grad_norm": 3.4327914714813232, + "learning_rate": 4.999559276334049e-05, + "loss": 7.242, + "step": 1006 + }, + { + "epoch": 0.005988914263964221, + "grad_norm": 2.4082493782043457, + "learning_rate": 4.999558398861513e-05, + "loss": 7.1588, + "step": 1007 + }, + { + "epoch": 0.0059948615472452185, + "grad_norm": 2.39475417137146, + "learning_rate": 4.9995575205164056e-05, + "loss": 7.1713, + "step": 1008 + }, + { + "epoch": 0.006000808830526215, + "grad_norm": 2.946331024169922, + "learning_rate": 4.99955664129873e-05, + "loss": 7.1553, + "step": 1009 + }, + { + "epoch": 0.006006756113807213, + "grad_norm": 2.4334871768951416, + "learning_rate": 4.999555761208484e-05, + "loss": 7.1898, + "step": 1010 + }, + { + "epoch": 0.00601270339708821, + "grad_norm": 2.3159971237182617, + "learning_rate": 4.999554880245669e-05, + "loss": 7.0642, + "step": 1011 + }, + { + "epoch": 0.0060186506803692075, + "grad_norm": 2.9773905277252197, + "learning_rate": 4.9995539984102854e-05, + "loss": 7.3285, + "step": 1012 + }, + { + "epoch": 0.006024597963650204, + "grad_norm": 3.444267749786377, + "learning_rate": 4.999553115702334e-05, + "loss": 7.1263, + "step": 1013 + }, + { + "epoch": 0.006030545246931202, + "grad_norm": 2.6518173217773438, + "learning_rate": 4.9995522321218136e-05, + "loss": 7.3915, + "step": 1014 + }, + { + "epoch": 0.006036492530212199, + "grad_norm": 2.46230149269104, + "learning_rate": 4.9995513476687254e-05, + "loss": 7.1808, + "step": 1015 + }, + { + "epoch": 0.0060424398134931966, + "grad_norm": 2.2243192195892334, + "learning_rate": 4.99955046234307e-05, + "loss": 7.4262, + "step": 1016 + }, + { + "epoch": 0.006048387096774193, + "grad_norm": 3.0834670066833496, + "learning_rate": 4.999549576144847e-05, + "loss": 7.4028, + "step": 1017 + }, + { + "epoch": 0.006054334380055191, + "grad_norm": 3.2453930377960205, + "learning_rate": 4.9995486890740573e-05, + "loss": 7.5537, + "step": 1018 + }, + { + "epoch": 0.006060281663336188, + "grad_norm": 2.7142229080200195, + "learning_rate": 4.9995478011307015e-05, + "loss": 7.4131, + "step": 1019 + }, + { + "epoch": 0.006066228946617186, + "grad_norm": 2.9567463397979736, + "learning_rate": 4.9995469123147784e-05, + "loss": 7.5969, + "step": 1020 + }, + { + "epoch": 0.006072176229898182, + "grad_norm": 2.5698695182800293, + "learning_rate": 4.99954602262629e-05, + "loss": 7.2721, + "step": 1021 + }, + { + "epoch": 0.00607812351317918, + "grad_norm": 2.3958864212036133, + "learning_rate": 4.999545132065235e-05, + "loss": 7.3414, + "step": 1022 + }, + { + "epoch": 0.006084070796460177, + "grad_norm": 2.528024911880493, + "learning_rate": 4.9995442406316156e-05, + "loss": 7.2821, + "step": 1023 + }, + { + "epoch": 0.006090018079741175, + "grad_norm": 2.6904075145721436, + "learning_rate": 4.999543348325431e-05, + "loss": 7.3726, + "step": 1024 + }, + { + "epoch": 0.006095965363022171, + "grad_norm": 2.8618202209472656, + "learning_rate": 4.999542455146681e-05, + "loss": 7.4232, + "step": 1025 + }, + { + "epoch": 0.006101912646303169, + "grad_norm": 1.978455662727356, + "learning_rate": 4.999541561095367e-05, + "loss": 7.5949, + "step": 1026 + }, + { + "epoch": 0.006107859929584166, + "grad_norm": 2.882568836212158, + "learning_rate": 4.999540666171489e-05, + "loss": 7.4868, + "step": 1027 + }, + { + "epoch": 0.006113807212865164, + "grad_norm": 2.9586474895477295, + "learning_rate": 4.999539770375047e-05, + "loss": 7.1556, + "step": 1028 + }, + { + "epoch": 0.00611975449614616, + "grad_norm": 2.5675363540649414, + "learning_rate": 4.999538873706041e-05, + "loss": 7.3306, + "step": 1029 + }, + { + "epoch": 0.006125701779427157, + "grad_norm": 3.440857410430908, + "learning_rate": 4.999537976164472e-05, + "loss": 7.3654, + "step": 1030 + }, + { + "epoch": 0.006131649062708155, + "grad_norm": 3.7741217613220215, + "learning_rate": 4.999537077750341e-05, + "loss": 6.8088, + "step": 1031 + }, + { + "epoch": 0.006137596345989152, + "grad_norm": 3.801609754562378, + "learning_rate": 4.999536178463647e-05, + "loss": 6.989, + "step": 1032 + }, + { + "epoch": 0.0061435436292701495, + "grad_norm": 2.627225875854492, + "learning_rate": 4.9995352783043905e-05, + "loss": 7.4066, + "step": 1033 + }, + { + "epoch": 0.006149490912551146, + "grad_norm": 3.3529040813446045, + "learning_rate": 4.9995343772725725e-05, + "loss": 7.0403, + "step": 1034 + }, + { + "epoch": 0.006155438195832144, + "grad_norm": 3.248558521270752, + "learning_rate": 4.999533475368192e-05, + "loss": 7.2664, + "step": 1035 + }, + { + "epoch": 0.006161385479113141, + "grad_norm": 3.1260814666748047, + "learning_rate": 4.9995325725912515e-05, + "loss": 7.3257, + "step": 1036 + }, + { + "epoch": 0.0061673327623941385, + "grad_norm": 2.379659414291382, + "learning_rate": 4.999531668941748e-05, + "loss": 7.4448, + "step": 1037 + }, + { + "epoch": 0.006173280045675135, + "grad_norm": 2.8478498458862305, + "learning_rate": 4.999530764419685e-05, + "loss": 7.3892, + "step": 1038 + }, + { + "epoch": 0.006179227328956133, + "grad_norm": 4.104954719543457, + "learning_rate": 4.999529859025062e-05, + "loss": 7.5172, + "step": 1039 + }, + { + "epoch": 0.00618517461223713, + "grad_norm": 2.50160813331604, + "learning_rate": 4.999528952757879e-05, + "loss": 7.1894, + "step": 1040 + }, + { + "epoch": 0.0061911218955181275, + "grad_norm": 2.5545871257781982, + "learning_rate": 4.999528045618136e-05, + "loss": 7.3892, + "step": 1041 + }, + { + "epoch": 0.006197069178799124, + "grad_norm": 2.9980626106262207, + "learning_rate": 4.999527137605833e-05, + "loss": 7.3517, + "step": 1042 + }, + { + "epoch": 0.006203016462080122, + "grad_norm": 2.5920562744140625, + "learning_rate": 4.999526228720971e-05, + "loss": 7.1716, + "step": 1043 + }, + { + "epoch": 0.006208963745361119, + "grad_norm": 2.5224244594573975, + "learning_rate": 4.999525318963551e-05, + "loss": 7.1892, + "step": 1044 + }, + { + "epoch": 0.0062149110286421165, + "grad_norm": 2.7092106342315674, + "learning_rate": 4.999524408333572e-05, + "loss": 7.178, + "step": 1045 + }, + { + "epoch": 0.006220858311923113, + "grad_norm": 2.523320198059082, + "learning_rate": 4.999523496831035e-05, + "loss": 7.1486, + "step": 1046 + }, + { + "epoch": 0.006226805595204111, + "grad_norm": 2.4491217136383057, + "learning_rate": 4.99952258445594e-05, + "loss": 7.121, + "step": 1047 + }, + { + "epoch": 0.006232752878485108, + "grad_norm": 2.29109263420105, + "learning_rate": 4.9995216712082875e-05, + "loss": 7.4323, + "step": 1048 + }, + { + "epoch": 0.0062387001617661055, + "grad_norm": 2.5234057903289795, + "learning_rate": 4.9995207570880783e-05, + "loss": 7.1552, + "step": 1049 + }, + { + "epoch": 0.006244647445047102, + "grad_norm": 2.301316499710083, + "learning_rate": 4.9995198420953115e-05, + "loss": 7.3625, + "step": 1050 + }, + { + "epoch": 0.0062505947283281, + "grad_norm": 2.4358527660369873, + "learning_rate": 4.999518926229989e-05, + "loss": 7.2462, + "step": 1051 + }, + { + "epoch": 0.006256542011609097, + "grad_norm": 2.3915181159973145, + "learning_rate": 4.999518009492109e-05, + "loss": 7.173, + "step": 1052 + }, + { + "epoch": 0.0062624892948900945, + "grad_norm": 2.5529091358184814, + "learning_rate": 4.999517091881674e-05, + "loss": 7.2463, + "step": 1053 + }, + { + "epoch": 0.006268436578171091, + "grad_norm": 3.235435724258423, + "learning_rate": 4.999516173398683e-05, + "loss": 7.1149, + "step": 1054 + }, + { + "epoch": 0.006274383861452089, + "grad_norm": 2.692140817642212, + "learning_rate": 4.9995152540431375e-05, + "loss": 7.3554, + "step": 1055 + }, + { + "epoch": 0.006280331144733086, + "grad_norm": 2.910116195678711, + "learning_rate": 4.999514333815036e-05, + "loss": 7.4424, + "step": 1056 + }, + { + "epoch": 0.0062862784280140836, + "grad_norm": 2.897463798522949, + "learning_rate": 4.9995134127143804e-05, + "loss": 7.2345, + "step": 1057 + }, + { + "epoch": 0.00629222571129508, + "grad_norm": 2.5925514698028564, + "learning_rate": 4.999512490741171e-05, + "loss": 7.1539, + "step": 1058 + }, + { + "epoch": 0.006298172994576078, + "grad_norm": 2.693816900253296, + "learning_rate": 4.999511567895407e-05, + "loss": 7.0905, + "step": 1059 + }, + { + "epoch": 0.006304120277857075, + "grad_norm": 3.3717474937438965, + "learning_rate": 4.9995106441770896e-05, + "loss": 7.1407, + "step": 1060 + }, + { + "epoch": 0.006310067561138072, + "grad_norm": 2.6128973960876465, + "learning_rate": 4.999509719586218e-05, + "loss": 7.2748, + "step": 1061 + }, + { + "epoch": 0.006316014844419069, + "grad_norm": 2.24324369430542, + "learning_rate": 4.999508794122795e-05, + "loss": 7.2553, + "step": 1062 + }, + { + "epoch": 0.006321962127700066, + "grad_norm": 2.7593698501586914, + "learning_rate": 4.999507867786818e-05, + "loss": 7.1039, + "step": 1063 + }, + { + "epoch": 0.006327909410981064, + "grad_norm": 2.6210618019104004, + "learning_rate": 4.999506940578289e-05, + "loss": 7.0247, + "step": 1064 + }, + { + "epoch": 0.006333856694262061, + "grad_norm": 2.410187244415283, + "learning_rate": 4.9995060124972084e-05, + "loss": 7.3931, + "step": 1065 + }, + { + "epoch": 0.006339803977543058, + "grad_norm": 2.795302391052246, + "learning_rate": 4.999505083543575e-05, + "loss": 7.3168, + "step": 1066 + }, + { + "epoch": 0.006345751260824055, + "grad_norm": 2.3720662593841553, + "learning_rate": 4.999504153717391e-05, + "loss": 7.3719, + "step": 1067 + }, + { + "epoch": 0.006351698544105053, + "grad_norm": 2.721585988998413, + "learning_rate": 4.9995032230186556e-05, + "loss": 7.3847, + "step": 1068 + }, + { + "epoch": 0.00635764582738605, + "grad_norm": 2.967153549194336, + "learning_rate": 4.99950229144737e-05, + "loss": 7.3224, + "step": 1069 + }, + { + "epoch": 0.006363593110667047, + "grad_norm": 3.8144783973693848, + "learning_rate": 4.999501359003533e-05, + "loss": 7.0767, + "step": 1070 + }, + { + "epoch": 0.006369540393948044, + "grad_norm": 3.7694199085235596, + "learning_rate": 4.999500425687147e-05, + "loss": 7.4486, + "step": 1071 + }, + { + "epoch": 0.006375487677229042, + "grad_norm": 2.9668312072753906, + "learning_rate": 4.999499491498211e-05, + "loss": 7.3415, + "step": 1072 + }, + { + "epoch": 0.006381434960510039, + "grad_norm": 4.196050643920898, + "learning_rate": 4.999498556436725e-05, + "loss": 7.3784, + "step": 1073 + }, + { + "epoch": 0.0063873822437910364, + "grad_norm": 4.676602363586426, + "learning_rate": 4.99949762050269e-05, + "loss": 7.3773, + "step": 1074 + }, + { + "epoch": 0.006393329527072033, + "grad_norm": 2.8828656673431396, + "learning_rate": 4.999496683696107e-05, + "loss": 7.2359, + "step": 1075 + }, + { + "epoch": 0.006399276810353031, + "grad_norm": 2.7532308101654053, + "learning_rate": 4.9994957460169745e-05, + "loss": 7.356, + "step": 1076 + }, + { + "epoch": 0.006405224093634028, + "grad_norm": 5.535451412200928, + "learning_rate": 4.999494807465293e-05, + "loss": 7.261, + "step": 1077 + }, + { + "epoch": 0.0064111713769150255, + "grad_norm": 3.6439530849456787, + "learning_rate": 4.999493868041066e-05, + "loss": 7.4664, + "step": 1078 + }, + { + "epoch": 0.006417118660196022, + "grad_norm": 3.563948154449463, + "learning_rate": 4.99949292774429e-05, + "loss": 7.0427, + "step": 1079 + }, + { + "epoch": 0.00642306594347702, + "grad_norm": 3.6243784427642822, + "learning_rate": 4.9994919865749675e-05, + "loss": 7.3292, + "step": 1080 + }, + { + "epoch": 0.006429013226758017, + "grad_norm": 5.1197590827941895, + "learning_rate": 4.999491044533098e-05, + "loss": 7.3717, + "step": 1081 + }, + { + "epoch": 0.0064349605100390145, + "grad_norm": 4.3969902992248535, + "learning_rate": 4.999490101618682e-05, + "loss": 7.2875, + "step": 1082 + }, + { + "epoch": 0.006440907793320011, + "grad_norm": 2.6302945613861084, + "learning_rate": 4.999489157831719e-05, + "loss": 7.1958, + "step": 1083 + }, + { + "epoch": 0.006446855076601009, + "grad_norm": 3.782078504562378, + "learning_rate": 4.9994882131722116e-05, + "loss": 7.2951, + "step": 1084 + }, + { + "epoch": 0.006452802359882006, + "grad_norm": 3.432082414627075, + "learning_rate": 4.999487267640158e-05, + "loss": 7.0974, + "step": 1085 + }, + { + "epoch": 0.0064587496431630035, + "grad_norm": 3.364793300628662, + "learning_rate": 4.999486321235559e-05, + "loss": 7.0847, + "step": 1086 + }, + { + "epoch": 0.006464696926444, + "grad_norm": 2.7063019275665283, + "learning_rate": 4.999485373958416e-05, + "loss": 7.1421, + "step": 1087 + }, + { + "epoch": 0.006470644209724998, + "grad_norm": 3.0648648738861084, + "learning_rate": 4.999484425808727e-05, + "loss": 7.2723, + "step": 1088 + }, + { + "epoch": 0.006476591493005995, + "grad_norm": 3.3968300819396973, + "learning_rate": 4.999483476786495e-05, + "loss": 7.1438, + "step": 1089 + }, + { + "epoch": 0.0064825387762869925, + "grad_norm": 2.864647150039673, + "learning_rate": 4.999482526891719e-05, + "loss": 7.1512, + "step": 1090 + }, + { + "epoch": 0.006488486059567989, + "grad_norm": 2.577043056488037, + "learning_rate": 4.999481576124399e-05, + "loss": 6.8914, + "step": 1091 + }, + { + "epoch": 0.006494433342848986, + "grad_norm": 2.83754563331604, + "learning_rate": 4.999480624484536e-05, + "loss": 6.9999, + "step": 1092 + }, + { + "epoch": 0.006500380626129984, + "grad_norm": 3.5623857975006104, + "learning_rate": 4.999479671972131e-05, + "loss": 7.0567, + "step": 1093 + }, + { + "epoch": 0.006506327909410981, + "grad_norm": 2.35555362701416, + "learning_rate": 4.9994787185871814e-05, + "loss": 7.3075, + "step": 1094 + }, + { + "epoch": 0.006512275192691978, + "grad_norm": 3.8677117824554443, + "learning_rate": 4.9994777643296914e-05, + "loss": 7.3608, + "step": 1095 + }, + { + "epoch": 0.006518222475972975, + "grad_norm": 3.8163843154907227, + "learning_rate": 4.999476809199659e-05, + "loss": 7.4368, + "step": 1096 + }, + { + "epoch": 0.006524169759253973, + "grad_norm": 2.5424652099609375, + "learning_rate": 4.999475853197085e-05, + "loss": 7.4968, + "step": 1097 + }, + { + "epoch": 0.00653011704253497, + "grad_norm": 2.876898765563965, + "learning_rate": 4.99947489632197e-05, + "loss": 6.9948, + "step": 1098 + }, + { + "epoch": 0.006536064325815967, + "grad_norm": 3.3934860229492188, + "learning_rate": 4.999473938574314e-05, + "loss": 6.9588, + "step": 1099 + }, + { + "epoch": 0.006542011609096964, + "grad_norm": 2.1184024810791016, + "learning_rate": 4.9994729799541176e-05, + "loss": 7.1933, + "step": 1100 + }, + { + "epoch": 0.006547958892377962, + "grad_norm": 2.2882895469665527, + "learning_rate": 4.999472020461381e-05, + "loss": 7.0796, + "step": 1101 + }, + { + "epoch": 0.006553906175658959, + "grad_norm": 3.239429235458374, + "learning_rate": 4.9994710600961045e-05, + "loss": 6.9535, + "step": 1102 + }, + { + "epoch": 0.006559853458939956, + "grad_norm": 2.4653263092041016, + "learning_rate": 4.9994700988582884e-05, + "loss": 6.9316, + "step": 1103 + }, + { + "epoch": 0.006565800742220953, + "grad_norm": 2.511516571044922, + "learning_rate": 4.999469136747933e-05, + "loss": 6.9844, + "step": 1104 + }, + { + "epoch": 0.006571748025501951, + "grad_norm": 2.9725844860076904, + "learning_rate": 4.9994681737650384e-05, + "loss": 7.1955, + "step": 1105 + }, + { + "epoch": 0.006577695308782948, + "grad_norm": 3.04697585105896, + "learning_rate": 4.9994672099096066e-05, + "loss": 7.1044, + "step": 1106 + }, + { + "epoch": 0.006583642592063945, + "grad_norm": 3.395076274871826, + "learning_rate": 4.999466245181635e-05, + "loss": 7.1968, + "step": 1107 + }, + { + "epoch": 0.006589589875344942, + "grad_norm": 2.362884044647217, + "learning_rate": 4.999465279581127e-05, + "loss": 7.3114, + "step": 1108 + }, + { + "epoch": 0.00659553715862594, + "grad_norm": 2.730980396270752, + "learning_rate": 4.99946431310808e-05, + "loss": 7.1978, + "step": 1109 + }, + { + "epoch": 0.006601484441906937, + "grad_norm": 3.288687229156494, + "learning_rate": 4.9994633457624974e-05, + "loss": 7.4397, + "step": 1110 + }, + { + "epoch": 0.006607431725187934, + "grad_norm": 3.3060662746429443, + "learning_rate": 4.999462377544377e-05, + "loss": 7.1638, + "step": 1111 + }, + { + "epoch": 0.006613379008468931, + "grad_norm": 2.2697036266326904, + "learning_rate": 4.9994614084537204e-05, + "loss": 7.2654, + "step": 1112 + }, + { + "epoch": 0.006619326291749929, + "grad_norm": 2.330495595932007, + "learning_rate": 4.999460438490528e-05, + "loss": 7.2132, + "step": 1113 + }, + { + "epoch": 0.006625273575030926, + "grad_norm": 2.8239340782165527, + "learning_rate": 4.999459467654799e-05, + "loss": 7.3477, + "step": 1114 + }, + { + "epoch": 0.0066312208583119234, + "grad_norm": 2.591614246368408, + "learning_rate": 4.999458495946535e-05, + "loss": 7.0377, + "step": 1115 + }, + { + "epoch": 0.00663716814159292, + "grad_norm": 4.554818630218506, + "learning_rate": 4.999457523365736e-05, + "loss": 7.1266, + "step": 1116 + }, + { + "epoch": 0.006643115424873918, + "grad_norm": 2.21018123626709, + "learning_rate": 4.999456549912401e-05, + "loss": 7.1433, + "step": 1117 + }, + { + "epoch": 0.006649062708154915, + "grad_norm": 2.0298593044281006, + "learning_rate": 4.999455575586533e-05, + "loss": 7.257, + "step": 1118 + }, + { + "epoch": 0.0066550099914359125, + "grad_norm": 2.4532642364501953, + "learning_rate": 4.9994546003881305e-05, + "loss": 7.0618, + "step": 1119 + }, + { + "epoch": 0.006660957274716909, + "grad_norm": 2.428380012512207, + "learning_rate": 4.999453624317194e-05, + "loss": 7.2039, + "step": 1120 + }, + { + "epoch": 0.006666904557997907, + "grad_norm": 2.5572609901428223, + "learning_rate": 4.999452647373724e-05, + "loss": 7.0991, + "step": 1121 + }, + { + "epoch": 0.006672851841278904, + "grad_norm": 2.379640817642212, + "learning_rate": 4.999451669557721e-05, + "loss": 7.1424, + "step": 1122 + }, + { + "epoch": 0.006678799124559901, + "grad_norm": 2.5764007568359375, + "learning_rate": 4.999450690869185e-05, + "loss": 7.1218, + "step": 1123 + }, + { + "epoch": 0.006684746407840898, + "grad_norm": 2.6560606956481934, + "learning_rate": 4.999449711308117e-05, + "loss": 7.2994, + "step": 1124 + }, + { + "epoch": 0.006690693691121895, + "grad_norm": 2.4687581062316895, + "learning_rate": 4.999448730874518e-05, + "loss": 7.4169, + "step": 1125 + }, + { + "epoch": 0.006696640974402893, + "grad_norm": 2.8232173919677734, + "learning_rate": 4.999447749568386e-05, + "loss": 7.291, + "step": 1126 + }, + { + "epoch": 0.00670258825768389, + "grad_norm": 2.6960325241088867, + "learning_rate": 4.9994467673897224e-05, + "loss": 7.3162, + "step": 1127 + }, + { + "epoch": 0.006708535540964887, + "grad_norm": 2.222391366958618, + "learning_rate": 4.999445784338528e-05, + "loss": 7.221, + "step": 1128 + }, + { + "epoch": 0.006714482824245884, + "grad_norm": 2.334995985031128, + "learning_rate": 4.9994448004148024e-05, + "loss": 7.4813, + "step": 1129 + }, + { + "epoch": 0.006720430107526882, + "grad_norm": 2.653491497039795, + "learning_rate": 4.999443815618548e-05, + "loss": 7.3515, + "step": 1130 + }, + { + "epoch": 0.006726377390807879, + "grad_norm": 2.6943631172180176, + "learning_rate": 4.999442829949762e-05, + "loss": 7.2674, + "step": 1131 + }, + { + "epoch": 0.006732324674088876, + "grad_norm": 2.395573377609253, + "learning_rate": 4.999441843408447e-05, + "loss": 7.483, + "step": 1132 + }, + { + "epoch": 0.006738271957369873, + "grad_norm": 2.3801541328430176, + "learning_rate": 4.999440855994603e-05, + "loss": 7.3355, + "step": 1133 + }, + { + "epoch": 0.006744219240650871, + "grad_norm": 2.8566555976867676, + "learning_rate": 4.999439867708229e-05, + "loss": 6.8323, + "step": 1134 + }, + { + "epoch": 0.006750166523931868, + "grad_norm": 2.5987985134124756, + "learning_rate": 4.999438878549327e-05, + "loss": 6.957, + "step": 1135 + }, + { + "epoch": 0.006756113807212865, + "grad_norm": 2.4411563873291016, + "learning_rate": 4.9994378885178964e-05, + "loss": 6.9935, + "step": 1136 + }, + { + "epoch": 0.006762061090493862, + "grad_norm": 2.4227802753448486, + "learning_rate": 4.9994368976139386e-05, + "loss": 7.2856, + "step": 1137 + }, + { + "epoch": 0.00676800837377486, + "grad_norm": 2.55317759513855, + "learning_rate": 4.999435905837453e-05, + "loss": 7.1741, + "step": 1138 + }, + { + "epoch": 0.006773955657055857, + "grad_norm": 2.3329968452453613, + "learning_rate": 4.9994349131884396e-05, + "loss": 7.2007, + "step": 1139 + }, + { + "epoch": 0.006779902940336854, + "grad_norm": 2.538499593734741, + "learning_rate": 4.999433919666899e-05, + "loss": 7.1755, + "step": 1140 + }, + { + "epoch": 0.006785850223617851, + "grad_norm": 2.3580374717712402, + "learning_rate": 4.999432925272833e-05, + "loss": 7.2249, + "step": 1141 + }, + { + "epoch": 0.006791797506898849, + "grad_norm": 2.2783255577087402, + "learning_rate": 4.99943193000624e-05, + "loss": 7.3627, + "step": 1142 + }, + { + "epoch": 0.006797744790179846, + "grad_norm": 3.0798208713531494, + "learning_rate": 4.999430933867122e-05, + "loss": 7.2718, + "step": 1143 + }, + { + "epoch": 0.006803692073460843, + "grad_norm": 2.703232526779175, + "learning_rate": 4.9994299368554776e-05, + "loss": 7.116, + "step": 1144 + }, + { + "epoch": 0.00680963935674184, + "grad_norm": 2.480327606201172, + "learning_rate": 4.9994289389713076e-05, + "loss": 6.9743, + "step": 1145 + }, + { + "epoch": 0.006815586640022838, + "grad_norm": 2.2707130908966064, + "learning_rate": 4.9994279402146137e-05, + "loss": 6.9919, + "step": 1146 + }, + { + "epoch": 0.006821533923303835, + "grad_norm": 2.0424580574035645, + "learning_rate": 4.999426940585396e-05, + "loss": 7.0366, + "step": 1147 + }, + { + "epoch": 0.006827481206584832, + "grad_norm": 1.9720054864883423, + "learning_rate": 4.999425940083653e-05, + "loss": 6.8622, + "step": 1148 + }, + { + "epoch": 0.006833428489865829, + "grad_norm": 2.7109742164611816, + "learning_rate": 4.9994249387093864e-05, + "loss": 7.5375, + "step": 1149 + }, + { + "epoch": 0.006839375773146827, + "grad_norm": 2.267328977584839, + "learning_rate": 4.999423936462596e-05, + "loss": 7.5606, + "step": 1150 + }, + { + "epoch": 0.006845323056427824, + "grad_norm": 2.958360433578491, + "learning_rate": 4.999422933343283e-05, + "loss": 7.3503, + "step": 1151 + }, + { + "epoch": 0.006851270339708821, + "grad_norm": 2.2681283950805664, + "learning_rate": 4.9994219293514475e-05, + "loss": 6.9278, + "step": 1152 + }, + { + "epoch": 0.006857217622989818, + "grad_norm": 2.4755337238311768, + "learning_rate": 4.999420924487089e-05, + "loss": 7.1385, + "step": 1153 + }, + { + "epoch": 0.006863164906270815, + "grad_norm": 2.283277988433838, + "learning_rate": 4.999419918750209e-05, + "loss": 6.9287, + "step": 1154 + }, + { + "epoch": 0.006869112189551813, + "grad_norm": 2.3692893981933594, + "learning_rate": 4.999418912140808e-05, + "loss": 7.0648, + "step": 1155 + }, + { + "epoch": 0.00687505947283281, + "grad_norm": 2.2676453590393066, + "learning_rate": 4.999417904658884e-05, + "loss": 6.9754, + "step": 1156 + }, + { + "epoch": 0.006881006756113807, + "grad_norm": 2.4106669425964355, + "learning_rate": 4.9994168963044405e-05, + "loss": 7.033, + "step": 1157 + }, + { + "epoch": 0.006886954039394804, + "grad_norm": 2.947758913040161, + "learning_rate": 4.9994158870774754e-05, + "loss": 7.0821, + "step": 1158 + }, + { + "epoch": 0.006892901322675802, + "grad_norm": 2.5338058471679688, + "learning_rate": 4.9994148769779905e-05, + "loss": 6.9426, + "step": 1159 + }, + { + "epoch": 0.006898848605956799, + "grad_norm": 2.4848148822784424, + "learning_rate": 4.999413866005985e-05, + "loss": 7.2488, + "step": 1160 + }, + { + "epoch": 0.006904795889237796, + "grad_norm": 2.444077730178833, + "learning_rate": 4.999412854161461e-05, + "loss": 6.871, + "step": 1161 + }, + { + "epoch": 0.006910743172518793, + "grad_norm": 2.376962661743164, + "learning_rate": 4.9994118414444174e-05, + "loss": 7.0258, + "step": 1162 + }, + { + "epoch": 0.006916690455799791, + "grad_norm": 3.502023458480835, + "learning_rate": 4.9994108278548545e-05, + "loss": 7.4869, + "step": 1163 + }, + { + "epoch": 0.006922637739080788, + "grad_norm": 3.117741584777832, + "learning_rate": 4.999409813392774e-05, + "loss": 7.4437, + "step": 1164 + }, + { + "epoch": 0.006928585022361785, + "grad_norm": 3.805560827255249, + "learning_rate": 4.999408798058175e-05, + "loss": 7.3796, + "step": 1165 + }, + { + "epoch": 0.006934532305642782, + "grad_norm": 3.67065167427063, + "learning_rate": 4.9994077818510576e-05, + "loss": 7.2304, + "step": 1166 + }, + { + "epoch": 0.00694047958892378, + "grad_norm": 2.5749545097351074, + "learning_rate": 4.9994067647714236e-05, + "loss": 7.0943, + "step": 1167 + }, + { + "epoch": 0.006946426872204777, + "grad_norm": 2.561405897140503, + "learning_rate": 4.9994057468192724e-05, + "loss": 6.9496, + "step": 1168 + }, + { + "epoch": 0.006952374155485774, + "grad_norm": 2.477344512939453, + "learning_rate": 4.999404727994604e-05, + "loss": 7.3494, + "step": 1169 + }, + { + "epoch": 0.006958321438766771, + "grad_norm": 2.897580146789551, + "learning_rate": 4.999403708297419e-05, + "loss": 7.6081, + "step": 1170 + }, + { + "epoch": 0.006964268722047769, + "grad_norm": 3.899249792098999, + "learning_rate": 4.999402687727719e-05, + "loss": 7.4448, + "step": 1171 + }, + { + "epoch": 0.006970216005328766, + "grad_norm": 3.0791561603546143, + "learning_rate": 4.9994016662855025e-05, + "loss": 7.1616, + "step": 1172 + }, + { + "epoch": 0.006976163288609763, + "grad_norm": 2.8212931156158447, + "learning_rate": 4.999400643970771e-05, + "loss": 7.1824, + "step": 1173 + }, + { + "epoch": 0.00698211057189076, + "grad_norm": 4.33271598815918, + "learning_rate": 4.9993996207835246e-05, + "loss": 7.2432, + "step": 1174 + }, + { + "epoch": 0.006988057855171758, + "grad_norm": 2.985125780105591, + "learning_rate": 4.999398596723764e-05, + "loss": 7.6521, + "step": 1175 + }, + { + "epoch": 0.006994005138452755, + "grad_norm": 3.1069905757904053, + "learning_rate": 4.9993975717914885e-05, + "loss": 7.0071, + "step": 1176 + }, + { + "epoch": 0.006999952421733752, + "grad_norm": 2.915214776992798, + "learning_rate": 4.9993965459866995e-05, + "loss": 7.6192, + "step": 1177 + }, + { + "epoch": 0.007005899705014749, + "grad_norm": 5.314033031463623, + "learning_rate": 4.999395519309397e-05, + "loss": 6.9447, + "step": 1178 + }, + { + "epoch": 0.007011846988295747, + "grad_norm": 2.2723114490509033, + "learning_rate": 4.999394491759581e-05, + "loss": 7.1228, + "step": 1179 + }, + { + "epoch": 0.007017794271576744, + "grad_norm": 2.936365842819214, + "learning_rate": 4.999393463337253e-05, + "loss": 7.136, + "step": 1180 + }, + { + "epoch": 0.007023741554857741, + "grad_norm": 2.864250898361206, + "learning_rate": 4.9993924340424115e-05, + "loss": 7.026, + "step": 1181 + }, + { + "epoch": 0.007029688838138738, + "grad_norm": 3.299370050430298, + "learning_rate": 4.9993914038750586e-05, + "loss": 7.1114, + "step": 1182 + }, + { + "epoch": 0.007035636121419736, + "grad_norm": 3.0609943866729736, + "learning_rate": 4.999390372835193e-05, + "loss": 7.3052, + "step": 1183 + }, + { + "epoch": 0.007041583404700733, + "grad_norm": 3.54488468170166, + "learning_rate": 4.9993893409228176e-05, + "loss": 7.4845, + "step": 1184 + }, + { + "epoch": 0.0070475306879817295, + "grad_norm": 2.5196385383605957, + "learning_rate": 4.99938830813793e-05, + "loss": 7.312, + "step": 1185 + }, + { + "epoch": 0.007053477971262727, + "grad_norm": 3.570802927017212, + "learning_rate": 4.9993872744805326e-05, + "loss": 7.0038, + "step": 1186 + }, + { + "epoch": 0.007059425254543724, + "grad_norm": 2.631058931350708, + "learning_rate": 4.999386239950624e-05, + "loss": 7.5574, + "step": 1187 + }, + { + "epoch": 0.007065372537824722, + "grad_norm": 3.027251958847046, + "learning_rate": 4.999385204548206e-05, + "loss": 6.9837, + "step": 1188 + }, + { + "epoch": 0.0070713198211057185, + "grad_norm": 3.00128173828125, + "learning_rate": 4.999384168273279e-05, + "loss": 7.4479, + "step": 1189 + }, + { + "epoch": 0.007077267104386716, + "grad_norm": 2.127028226852417, + "learning_rate": 4.999383131125842e-05, + "loss": 7.3609, + "step": 1190 + }, + { + "epoch": 0.007083214387667713, + "grad_norm": 2.375511646270752, + "learning_rate": 4.9993820931058965e-05, + "loss": 7.3695, + "step": 1191 + }, + { + "epoch": 0.007089161670948711, + "grad_norm": 2.527743101119995, + "learning_rate": 4.999381054213442e-05, + "loss": 7.1478, + "step": 1192 + }, + { + "epoch": 0.0070951089542297075, + "grad_norm": 2.1600632667541504, + "learning_rate": 4.99938001444848e-05, + "loss": 7.7111, + "step": 1193 + }, + { + "epoch": 0.007101056237510705, + "grad_norm": 2.3242850303649902, + "learning_rate": 4.99937897381101e-05, + "loss": 7.6751, + "step": 1194 + }, + { + "epoch": 0.007107003520791702, + "grad_norm": 3.4553158283233643, + "learning_rate": 4.9993779323010334e-05, + "loss": 7.775, + "step": 1195 + }, + { + "epoch": 0.0071129508040727, + "grad_norm": 2.4339516162872314, + "learning_rate": 4.999376889918549e-05, + "loss": 7.099, + "step": 1196 + }, + { + "epoch": 0.0071188980873536966, + "grad_norm": 2.531851291656494, + "learning_rate": 4.9993758466635574e-05, + "loss": 7.5222, + "step": 1197 + }, + { + "epoch": 0.007124845370634694, + "grad_norm": 2.6549220085144043, + "learning_rate": 4.999374802536061e-05, + "loss": 7.4917, + "step": 1198 + }, + { + "epoch": 0.007130792653915691, + "grad_norm": 2.9149320125579834, + "learning_rate": 4.999373757536058e-05, + "loss": 7.0438, + "step": 1199 + }, + { + "epoch": 0.007136739937196689, + "grad_norm": 3.0234971046447754, + "learning_rate": 4.999372711663549e-05, + "loss": 7.6838, + "step": 1200 + }, + { + "epoch": 0.007142687220477686, + "grad_norm": 2.4006800651550293, + "learning_rate": 4.999371664918535e-05, + "loss": 7.6607, + "step": 1201 + }, + { + "epoch": 0.007148634503758683, + "grad_norm": 2.6191699504852295, + "learning_rate": 4.9993706173010164e-05, + "loss": 7.4727, + "step": 1202 + }, + { + "epoch": 0.00715458178703968, + "grad_norm": 3.040844440460205, + "learning_rate": 4.999369568810993e-05, + "loss": 7.1459, + "step": 1203 + }, + { + "epoch": 0.007160529070320678, + "grad_norm": 2.8474466800689697, + "learning_rate": 4.9993685194484654e-05, + "loss": 7.4615, + "step": 1204 + }, + { + "epoch": 0.007166476353601675, + "grad_norm": 1.928662657737732, + "learning_rate": 4.999367469213435e-05, + "loss": 7.4259, + "step": 1205 + }, + { + "epoch": 0.007172423636882672, + "grad_norm": 2.369540214538574, + "learning_rate": 4.999366418105901e-05, + "loss": 6.9342, + "step": 1206 + }, + { + "epoch": 0.007178370920163669, + "grad_norm": 4.003239154815674, + "learning_rate": 4.999365366125863e-05, + "loss": 7.3289, + "step": 1207 + }, + { + "epoch": 0.007184318203444667, + "grad_norm": 4.491976261138916, + "learning_rate": 4.9993643132733234e-05, + "loss": 7.3479, + "step": 1208 + }, + { + "epoch": 0.007190265486725664, + "grad_norm": 2.3678557872772217, + "learning_rate": 4.9993632595482806e-05, + "loss": 7.3091, + "step": 1209 + }, + { + "epoch": 0.007196212770006661, + "grad_norm": 2.9310050010681152, + "learning_rate": 4.999362204950737e-05, + "loss": 7.1996, + "step": 1210 + }, + { + "epoch": 0.007202160053287658, + "grad_norm": 3.6861345767974854, + "learning_rate": 4.999361149480691e-05, + "loss": 7.43, + "step": 1211 + }, + { + "epoch": 0.007208107336568656, + "grad_norm": 2.657515287399292, + "learning_rate": 4.9993600931381446e-05, + "loss": 6.9888, + "step": 1212 + }, + { + "epoch": 0.007214054619849653, + "grad_norm": 2.8346996307373047, + "learning_rate": 4.999359035923097e-05, + "loss": 7.0366, + "step": 1213 + }, + { + "epoch": 0.00722000190313065, + "grad_norm": 3.494162082672119, + "learning_rate": 4.9993579778355487e-05, + "loss": 7.499, + "step": 1214 + }, + { + "epoch": 0.007225949186411647, + "grad_norm": 2.9848556518554688, + "learning_rate": 4.999356918875501e-05, + "loss": 7.2064, + "step": 1215 + }, + { + "epoch": 0.007231896469692645, + "grad_norm": 2.391390562057495, + "learning_rate": 4.999355859042953e-05, + "loss": 7.2752, + "step": 1216 + }, + { + "epoch": 0.007237843752973642, + "grad_norm": 2.872891902923584, + "learning_rate": 4.9993547983379065e-05, + "loss": 6.9865, + "step": 1217 + }, + { + "epoch": 0.0072437910362546385, + "grad_norm": 2.760213613510132, + "learning_rate": 4.99935373676036e-05, + "loss": 7.0211, + "step": 1218 + }, + { + "epoch": 0.007249738319535636, + "grad_norm": 2.8857531547546387, + "learning_rate": 4.9993526743103156e-05, + "loss": 6.9162, + "step": 1219 + }, + { + "epoch": 0.007255685602816633, + "grad_norm": 3.150836229324341, + "learning_rate": 4.999351610987772e-05, + "loss": 7.2929, + "step": 1220 + }, + { + "epoch": 0.007261632886097631, + "grad_norm": 2.2004289627075195, + "learning_rate": 4.999350546792732e-05, + "loss": 7.4729, + "step": 1221 + }, + { + "epoch": 0.0072675801693786275, + "grad_norm": 2.5004026889801025, + "learning_rate": 4.999349481725194e-05, + "loss": 7.5235, + "step": 1222 + }, + { + "epoch": 0.007273527452659625, + "grad_norm": 2.8355395793914795, + "learning_rate": 4.999348415785159e-05, + "loss": 7.3535, + "step": 1223 + }, + { + "epoch": 0.007279474735940622, + "grad_norm": 2.559330701828003, + "learning_rate": 4.9993473489726276e-05, + "loss": 6.9634, + "step": 1224 + }, + { + "epoch": 0.00728542201922162, + "grad_norm": 2.3559181690216064, + "learning_rate": 4.999346281287599e-05, + "loss": 6.9246, + "step": 1225 + }, + { + "epoch": 0.0072913693025026165, + "grad_norm": 2.3852717876434326, + "learning_rate": 4.999345212730075e-05, + "loss": 6.6417, + "step": 1226 + }, + { + "epoch": 0.007297316585783614, + "grad_norm": 2.2604117393493652, + "learning_rate": 4.999344143300055e-05, + "loss": 7.4182, + "step": 1227 + }, + { + "epoch": 0.007303263869064611, + "grad_norm": 2.57983660697937, + "learning_rate": 4.9993430729975396e-05, + "loss": 7.4841, + "step": 1228 + }, + { + "epoch": 0.007309211152345609, + "grad_norm": 2.653935670852661, + "learning_rate": 4.99934200182253e-05, + "loss": 7.5477, + "step": 1229 + }, + { + "epoch": 0.0073151584356266055, + "grad_norm": 2.0740158557891846, + "learning_rate": 4.999340929775026e-05, + "loss": 7.4359, + "step": 1230 + }, + { + "epoch": 0.007321105718907603, + "grad_norm": 2.62064528465271, + "learning_rate": 4.9993398568550275e-05, + "loss": 7.1817, + "step": 1231 + }, + { + "epoch": 0.0073270530021886, + "grad_norm": 2.318244457244873, + "learning_rate": 4.999338783062536e-05, + "loss": 7.1663, + "step": 1232 + }, + { + "epoch": 0.007333000285469598, + "grad_norm": 3.0533225536346436, + "learning_rate": 4.99933770839755e-05, + "loss": 7.3051, + "step": 1233 + }, + { + "epoch": 0.0073389475687505945, + "grad_norm": 4.821422100067139, + "learning_rate": 4.999336632860072e-05, + "loss": 7.3435, + "step": 1234 + }, + { + "epoch": 0.007344894852031592, + "grad_norm": 2.680873155593872, + "learning_rate": 4.999335556450101e-05, + "loss": 7.3447, + "step": 1235 + }, + { + "epoch": 0.007350842135312589, + "grad_norm": 3.287454605102539, + "learning_rate": 4.999334479167638e-05, + "loss": 7.1957, + "step": 1236 + }, + { + "epoch": 0.007356789418593587, + "grad_norm": 3.7452759742736816, + "learning_rate": 4.999333401012682e-05, + "loss": 7.2093, + "step": 1237 + }, + { + "epoch": 0.0073627367018745836, + "grad_norm": 3.363443374633789, + "learning_rate": 4.999332321985236e-05, + "loss": 7.297, + "step": 1238 + }, + { + "epoch": 0.007368683985155581, + "grad_norm": 3.070962905883789, + "learning_rate": 4.999331242085299e-05, + "loss": 7.0831, + "step": 1239 + }, + { + "epoch": 0.007374631268436578, + "grad_norm": 3.635183095932007, + "learning_rate": 4.9993301613128706e-05, + "loss": 7.3116, + "step": 1240 + }, + { + "epoch": 0.007380578551717576, + "grad_norm": 2.532179594039917, + "learning_rate": 4.9993290796679516e-05, + "loss": 7.5238, + "step": 1241 + }, + { + "epoch": 0.007386525834998573, + "grad_norm": 2.1147687435150146, + "learning_rate": 4.999327997150543e-05, + "loss": 7.2279, + "step": 1242 + }, + { + "epoch": 0.00739247311827957, + "grad_norm": 2.1221182346343994, + "learning_rate": 4.999326913760645e-05, + "loss": 7.6575, + "step": 1243 + }, + { + "epoch": 0.007398420401560567, + "grad_norm": 2.2920000553131104, + "learning_rate": 4.999325829498257e-05, + "loss": 7.5652, + "step": 1244 + }, + { + "epoch": 0.007404367684841565, + "grad_norm": 2.3444230556488037, + "learning_rate": 4.9993247443633814e-05, + "loss": 7.3992, + "step": 1245 + }, + { + "epoch": 0.007410314968122562, + "grad_norm": 2.2778663635253906, + "learning_rate": 4.9993236583560164e-05, + "loss": 7.1212, + "step": 1246 + }, + { + "epoch": 0.007416262251403559, + "grad_norm": 2.38369083404541, + "learning_rate": 4.999322571476164e-05, + "loss": 7.4605, + "step": 1247 + }, + { + "epoch": 0.007422209534684556, + "grad_norm": 3.578537702560425, + "learning_rate": 4.999321483723823e-05, + "loss": 7.1446, + "step": 1248 + }, + { + "epoch": 0.007428156817965553, + "grad_norm": 5.227176666259766, + "learning_rate": 4.9993203950989954e-05, + "loss": 7.2308, + "step": 1249 + }, + { + "epoch": 0.007434104101246551, + "grad_norm": 2.665844440460205, + "learning_rate": 4.9993193056016805e-05, + "loss": 7.102, + "step": 1250 + }, + { + "epoch": 0.007440051384527547, + "grad_norm": 4.462922096252441, + "learning_rate": 4.9993182152318796e-05, + "loss": 7.003, + "step": 1251 + }, + { + "epoch": 0.007445998667808545, + "grad_norm": 4.9459099769592285, + "learning_rate": 4.999317123989592e-05, + "loss": 7.1338, + "step": 1252 + }, + { + "epoch": 0.007451945951089542, + "grad_norm": 3.127427339553833, + "learning_rate": 4.9993160318748186e-05, + "loss": 7.045, + "step": 1253 + }, + { + "epoch": 0.00745789323437054, + "grad_norm": 3.03910231590271, + "learning_rate": 4.9993149388875606e-05, + "loss": 6.8523, + "step": 1254 + }, + { + "epoch": 0.0074638405176515365, + "grad_norm": 2.931033134460449, + "learning_rate": 4.9993138450278166e-05, + "loss": 7.3065, + "step": 1255 + }, + { + "epoch": 0.007469787800932534, + "grad_norm": 4.60735559463501, + "learning_rate": 4.999312750295588e-05, + "loss": 7.5384, + "step": 1256 + }, + { + "epoch": 0.007475735084213531, + "grad_norm": 3.0745065212249756, + "learning_rate": 4.9993116546908755e-05, + "loss": 7.6279, + "step": 1257 + }, + { + "epoch": 0.007481682367494529, + "grad_norm": 2.7158751487731934, + "learning_rate": 4.9993105582136804e-05, + "loss": 7.1885, + "step": 1258 + }, + { + "epoch": 0.0074876296507755255, + "grad_norm": 3.5049819946289062, + "learning_rate": 4.999309460864e-05, + "loss": 6.6833, + "step": 1259 + }, + { + "epoch": 0.007493576934056523, + "grad_norm": 3.229778289794922, + "learning_rate": 4.999308362641837e-05, + "loss": 6.784, + "step": 1260 + }, + { + "epoch": 0.00749952421733752, + "grad_norm": 2.7032854557037354, + "learning_rate": 4.999307263547191e-05, + "loss": 6.8003, + "step": 1261 + }, + { + "epoch": 0.007505471500618518, + "grad_norm": 5.892059326171875, + "learning_rate": 4.999306163580063e-05, + "loss": 7.2365, + "step": 1262 + }, + { + "epoch": 0.0075114187838995145, + "grad_norm": 5.8021135330200195, + "learning_rate": 4.999305062740453e-05, + "loss": 7.3822, + "step": 1263 + }, + { + "epoch": 0.007517366067180512, + "grad_norm": 5.1242899894714355, + "learning_rate": 4.9993039610283614e-05, + "loss": 7.2192, + "step": 1264 + }, + { + "epoch": 0.007523313350461509, + "grad_norm": 3.102980375289917, + "learning_rate": 4.9993028584437884e-05, + "loss": 7.4895, + "step": 1265 + }, + { + "epoch": 0.007529260633742507, + "grad_norm": 4.993838310241699, + "learning_rate": 4.999301754986735e-05, + "loss": 7.4771, + "step": 1266 + }, + { + "epoch": 0.0075352079170235035, + "grad_norm": 4.003589630126953, + "learning_rate": 4.999300650657201e-05, + "loss": 7.3591, + "step": 1267 + }, + { + "epoch": 0.007541155200304501, + "grad_norm": 3.6125710010528564, + "learning_rate": 4.999299545455187e-05, + "loss": 7.262, + "step": 1268 + }, + { + "epoch": 0.007547102483585498, + "grad_norm": 3.182196617126465, + "learning_rate": 4.999298439380693e-05, + "loss": 7.2689, + "step": 1269 + }, + { + "epoch": 0.007553049766866496, + "grad_norm": 2.428313732147217, + "learning_rate": 4.99929733243372e-05, + "loss": 7.2364, + "step": 1270 + }, + { + "epoch": 0.0075589970501474925, + "grad_norm": 2.673356771469116, + "learning_rate": 4.999296224614268e-05, + "loss": 7.2356, + "step": 1271 + }, + { + "epoch": 0.00756494433342849, + "grad_norm": 2.508026361465454, + "learning_rate": 4.9992951159223376e-05, + "loss": 7.1052, + "step": 1272 + }, + { + "epoch": 0.007570891616709487, + "grad_norm": 2.7501845359802246, + "learning_rate": 4.99929400635793e-05, + "loss": 7.5041, + "step": 1273 + }, + { + "epoch": 0.007576838899990485, + "grad_norm": 2.4604434967041016, + "learning_rate": 4.999292895921044e-05, + "loss": 7.5042, + "step": 1274 + }, + { + "epoch": 0.0075827861832714815, + "grad_norm": 2.4926865100860596, + "learning_rate": 4.99929178461168e-05, + "loss": 7.2104, + "step": 1275 + }, + { + "epoch": 0.007588733466552479, + "grad_norm": 2.631985664367676, + "learning_rate": 4.999290672429839e-05, + "loss": 6.8608, + "step": 1276 + }, + { + "epoch": 0.007594680749833476, + "grad_norm": 2.5684268474578857, + "learning_rate": 4.999289559375523e-05, + "loss": 7.1199, + "step": 1277 + }, + { + "epoch": 0.007600628033114474, + "grad_norm": 2.4312644004821777, + "learning_rate": 4.99928844544873e-05, + "loss": 7.1814, + "step": 1278 + }, + { + "epoch": 0.0076065753163954706, + "grad_norm": 2.794407367706299, + "learning_rate": 4.99928733064946e-05, + "loss": 7.2909, + "step": 1279 + }, + { + "epoch": 0.007612522599676467, + "grad_norm": 2.5903992652893066, + "learning_rate": 4.9992862149777166e-05, + "loss": 7.354, + "step": 1280 + }, + { + "epoch": 0.007618469882957465, + "grad_norm": 2.266364336013794, + "learning_rate": 4.999285098433497e-05, + "loss": 7.5697, + "step": 1281 + }, + { + "epoch": 0.007624417166238462, + "grad_norm": 3.1871070861816406, + "learning_rate": 4.999283981016803e-05, + "loss": 7.4393, + "step": 1282 + }, + { + "epoch": 0.00763036444951946, + "grad_norm": 2.137981653213501, + "learning_rate": 4.999282862727635e-05, + "loss": 7.3591, + "step": 1283 + }, + { + "epoch": 0.007636311732800456, + "grad_norm": 2.3166019916534424, + "learning_rate": 4.999281743565993e-05, + "loss": 7.4307, + "step": 1284 + }, + { + "epoch": 0.007642259016081454, + "grad_norm": 2.331110954284668, + "learning_rate": 4.999280623531878e-05, + "loss": 7.3214, + "step": 1285 + }, + { + "epoch": 0.007648206299362451, + "grad_norm": 2.7417728900909424, + "learning_rate": 4.999279502625289e-05, + "loss": 7.3593, + "step": 1286 + }, + { + "epoch": 0.007654153582643449, + "grad_norm": 3.089448928833008, + "learning_rate": 4.999278380846228e-05, + "loss": 7.3347, + "step": 1287 + }, + { + "epoch": 0.007660100865924445, + "grad_norm": 2.9446022510528564, + "learning_rate": 4.999277258194694e-05, + "loss": 7.3109, + "step": 1288 + }, + { + "epoch": 0.007666048149205443, + "grad_norm": 2.713355302810669, + "learning_rate": 4.9992761346706896e-05, + "loss": 7.2962, + "step": 1289 + }, + { + "epoch": 0.00767199543248644, + "grad_norm": 2.9480702877044678, + "learning_rate": 4.9992750102742125e-05, + "loss": 7.2081, + "step": 1290 + }, + { + "epoch": 0.007677942715767438, + "grad_norm": 2.737271785736084, + "learning_rate": 4.999273885005265e-05, + "loss": 7.2251, + "step": 1291 + }, + { + "epoch": 0.007683889999048434, + "grad_norm": 2.6954190731048584, + "learning_rate": 4.9992727588638466e-05, + "loss": 7.3437, + "step": 1292 + }, + { + "epoch": 0.007689837282329432, + "grad_norm": 3.0270752906799316, + "learning_rate": 4.999271631849958e-05, + "loss": 7.2516, + "step": 1293 + }, + { + "epoch": 0.007695784565610429, + "grad_norm": 2.824052333831787, + "learning_rate": 4.999270503963599e-05, + "loss": 7.2706, + "step": 1294 + }, + { + "epoch": 0.007701731848891427, + "grad_norm": 2.800713300704956, + "learning_rate": 4.999269375204771e-05, + "loss": 7.2497, + "step": 1295 + }, + { + "epoch": 0.0077076791321724234, + "grad_norm": 3.2510271072387695, + "learning_rate": 4.999268245573474e-05, + "loss": 7.025, + "step": 1296 + }, + { + "epoch": 0.007713626415453421, + "grad_norm": 3.095862627029419, + "learning_rate": 4.999267115069708e-05, + "loss": 7.1815, + "step": 1297 + }, + { + "epoch": 0.007719573698734418, + "grad_norm": 3.2238826751708984, + "learning_rate": 4.999265983693473e-05, + "loss": 7.2268, + "step": 1298 + }, + { + "epoch": 0.007725520982015416, + "grad_norm": 3.18687105178833, + "learning_rate": 4.999264851444771e-05, + "loss": 7.2076, + "step": 1299 + }, + { + "epoch": 0.0077314682652964125, + "grad_norm": 3.1385931968688965, + "learning_rate": 4.9992637183236016e-05, + "loss": 7.2323, + "step": 1300 + }, + { + "epoch": 0.00773741554857741, + "grad_norm": 2.3172361850738525, + "learning_rate": 4.999262584329964e-05, + "loss": 7.1225, + "step": 1301 + }, + { + "epoch": 0.007743362831858407, + "grad_norm": 3.3223013877868652, + "learning_rate": 4.99926144946386e-05, + "loss": 7.2108, + "step": 1302 + }, + { + "epoch": 0.007749310115139405, + "grad_norm": 3.197218894958496, + "learning_rate": 4.99926031372529e-05, + "loss": 7.5123, + "step": 1303 + }, + { + "epoch": 0.0077552573984204015, + "grad_norm": 2.8411800861358643, + "learning_rate": 4.999259177114254e-05, + "loss": 7.3047, + "step": 1304 + }, + { + "epoch": 0.007761204681701399, + "grad_norm": 2.7549736499786377, + "learning_rate": 4.9992580396307524e-05, + "loss": 7.3478, + "step": 1305 + }, + { + "epoch": 0.007767151964982396, + "grad_norm": 2.8829352855682373, + "learning_rate": 4.999256901274786e-05, + "loss": 7.1871, + "step": 1306 + }, + { + "epoch": 0.007773099248263394, + "grad_norm": 2.710076332092285, + "learning_rate": 4.999255762046354e-05, + "loss": 7.0891, + "step": 1307 + }, + { + "epoch": 0.0077790465315443905, + "grad_norm": 2.6598877906799316, + "learning_rate": 4.999254621945458e-05, + "loss": 7.6178, + "step": 1308 + }, + { + "epoch": 0.007784993814825388, + "grad_norm": 2.4012649059295654, + "learning_rate": 4.999253480972099e-05, + "loss": 7.5925, + "step": 1309 + }, + { + "epoch": 0.007790941098106385, + "grad_norm": 2.1501622200012207, + "learning_rate": 4.999252339126275e-05, + "loss": 7.6471, + "step": 1310 + }, + { + "epoch": 0.007796888381387382, + "grad_norm": 3.2150895595550537, + "learning_rate": 4.9992511964079886e-05, + "loss": 7.3995, + "step": 1311 + }, + { + "epoch": 0.0078028356646683795, + "grad_norm": 2.450465440750122, + "learning_rate": 4.9992500528172395e-05, + "loss": 7.219, + "step": 1312 + }, + { + "epoch": 0.007808782947949376, + "grad_norm": 2.714510679244995, + "learning_rate": 4.9992489083540274e-05, + "loss": 7.2023, + "step": 1313 + }, + { + "epoch": 0.007814730231230374, + "grad_norm": 2.660019636154175, + "learning_rate": 4.999247763018354e-05, + "loss": 6.8686, + "step": 1314 + }, + { + "epoch": 0.00782067751451137, + "grad_norm": 2.1031477451324463, + "learning_rate": 4.999246616810218e-05, + "loss": 7.305, + "step": 1315 + }, + { + "epoch": 0.007826624797792368, + "grad_norm": 3.0037856101989746, + "learning_rate": 4.999245469729622e-05, + "loss": 6.9788, + "step": 1316 + }, + { + "epoch": 0.007832572081073366, + "grad_norm": 3.1931207180023193, + "learning_rate": 4.999244321776565e-05, + "loss": 6.9312, + "step": 1317 + }, + { + "epoch": 0.007838519364354363, + "grad_norm": 2.7419891357421875, + "learning_rate": 4.999243172951047e-05, + "loss": 6.7732, + "step": 1318 + }, + { + "epoch": 0.00784446664763536, + "grad_norm": 2.772061824798584, + "learning_rate": 4.99924202325307e-05, + "loss": 6.9576, + "step": 1319 + }, + { + "epoch": 0.007850413930916357, + "grad_norm": 2.9300522804260254, + "learning_rate": 4.999240872682632e-05, + "loss": 6.8366, + "step": 1320 + }, + { + "epoch": 0.007856361214197355, + "grad_norm": 3.4697458744049072, + "learning_rate": 4.9992397212397365e-05, + "loss": 6.9234, + "step": 1321 + }, + { + "epoch": 0.007862308497478352, + "grad_norm": 3.044647693634033, + "learning_rate": 4.999238568924381e-05, + "loss": 6.8406, + "step": 1322 + }, + { + "epoch": 0.007868255780759349, + "grad_norm": 2.4429051876068115, + "learning_rate": 4.999237415736567e-05, + "loss": 6.9815, + "step": 1323 + }, + { + "epoch": 0.007874203064040346, + "grad_norm": 2.6193530559539795, + "learning_rate": 4.999236261676296e-05, + "loss": 7.3867, + "step": 1324 + }, + { + "epoch": 0.007880150347321344, + "grad_norm": 3.9543204307556152, + "learning_rate": 4.999235106743567e-05, + "loss": 7.2391, + "step": 1325 + }, + { + "epoch": 0.007886097630602341, + "grad_norm": 3.12777042388916, + "learning_rate": 4.9992339509383814e-05, + "loss": 7.0976, + "step": 1326 + }, + { + "epoch": 0.007892044913883338, + "grad_norm": 2.4543895721435547, + "learning_rate": 4.999232794260739e-05, + "loss": 7.1865, + "step": 1327 + }, + { + "epoch": 0.007897992197164335, + "grad_norm": 4.254832744598389, + "learning_rate": 4.999231636710639e-05, + "loss": 6.777, + "step": 1328 + }, + { + "epoch": 0.007903939480445333, + "grad_norm": 2.7835497856140137, + "learning_rate": 4.999230478288084e-05, + "loss": 6.8508, + "step": 1329 + }, + { + "epoch": 0.00790988676372633, + "grad_norm": 3.2724666595458984, + "learning_rate": 4.999229318993073e-05, + "loss": 6.7636, + "step": 1330 + }, + { + "epoch": 0.007915834047007327, + "grad_norm": 4.657248020172119, + "learning_rate": 4.9992281588256075e-05, + "loss": 7.3677, + "step": 1331 + }, + { + "epoch": 0.007921781330288324, + "grad_norm": 6.201416492462158, + "learning_rate": 4.999226997785686e-05, + "loss": 7.5804, + "step": 1332 + }, + { + "epoch": 0.007927728613569322, + "grad_norm": 4.955161094665527, + "learning_rate": 4.999225835873312e-05, + "loss": 7.1867, + "step": 1333 + }, + { + "epoch": 0.007933675896850319, + "grad_norm": 3.4105887413024902, + "learning_rate": 4.9992246730884826e-05, + "loss": 7.0948, + "step": 1334 + }, + { + "epoch": 0.007939623180131316, + "grad_norm": 2.514570951461792, + "learning_rate": 4.999223509431201e-05, + "loss": 6.9367, + "step": 1335 + }, + { + "epoch": 0.007945570463412313, + "grad_norm": 3.7689249515533447, + "learning_rate": 4.9992223449014654e-05, + "loss": 7.2209, + "step": 1336 + }, + { + "epoch": 0.007951517746693311, + "grad_norm": 4.997833728790283, + "learning_rate": 4.999221179499277e-05, + "loss": 7.3336, + "step": 1337 + }, + { + "epoch": 0.007957465029974308, + "grad_norm": 5.1314287185668945, + "learning_rate": 4.999220013224637e-05, + "loss": 6.933, + "step": 1338 + }, + { + "epoch": 0.007963412313255305, + "grad_norm": 3.708528518676758, + "learning_rate": 4.9992188460775447e-05, + "loss": 6.9598, + "step": 1339 + }, + { + "epoch": 0.007969359596536302, + "grad_norm": 3.029602289199829, + "learning_rate": 4.999217678058001e-05, + "loss": 7.3674, + "step": 1340 + }, + { + "epoch": 0.007975306879817299, + "grad_norm": 3.000312089920044, + "learning_rate": 4.999216509166006e-05, + "loss": 7.2705, + "step": 1341 + }, + { + "epoch": 0.007981254163098297, + "grad_norm": 4.852355480194092, + "learning_rate": 4.999215339401561e-05, + "loss": 7.1842, + "step": 1342 + }, + { + "epoch": 0.007987201446379294, + "grad_norm": 3.0430521965026855, + "learning_rate": 4.999214168764664e-05, + "loss": 7.5616, + "step": 1343 + }, + { + "epoch": 0.00799314872966029, + "grad_norm": 2.793760061264038, + "learning_rate": 4.999212997255319e-05, + "loss": 7.4867, + "step": 1344 + }, + { + "epoch": 0.007999096012941288, + "grad_norm": 3.516545295715332, + "learning_rate": 4.9992118248735245e-05, + "loss": 7.5857, + "step": 1345 + }, + { + "epoch": 0.008005043296222286, + "grad_norm": 4.272013187408447, + "learning_rate": 4.9992106516192796e-05, + "loss": 7.5686, + "step": 1346 + }, + { + "epoch": 0.008010990579503283, + "grad_norm": 3.176974058151245, + "learning_rate": 4.999209477492587e-05, + "loss": 7.1826, + "step": 1347 + }, + { + "epoch": 0.00801693786278428, + "grad_norm": 3.2615413665771484, + "learning_rate": 4.999208302493447e-05, + "loss": 7.3933, + "step": 1348 + }, + { + "epoch": 0.008022885146065277, + "grad_norm": 2.9548113346099854, + "learning_rate": 4.999207126621858e-05, + "loss": 7.339, + "step": 1349 + }, + { + "epoch": 0.008028832429346275, + "grad_norm": 3.445829153060913, + "learning_rate": 4.999205949877822e-05, + "loss": 7.4223, + "step": 1350 + }, + { + "epoch": 0.008034779712627272, + "grad_norm": 3.471991777420044, + "learning_rate": 4.999204772261338e-05, + "loss": 7.4192, + "step": 1351 + }, + { + "epoch": 0.008040726995908269, + "grad_norm": 3.1682589054107666, + "learning_rate": 4.999203593772409e-05, + "loss": 7.3433, + "step": 1352 + }, + { + "epoch": 0.008046674279189266, + "grad_norm": 4.693798065185547, + "learning_rate": 4.999202414411033e-05, + "loss": 7.1479, + "step": 1353 + }, + { + "epoch": 0.008052621562470264, + "grad_norm": 3.0599937438964844, + "learning_rate": 4.9992012341772114e-05, + "loss": 7.3137, + "step": 1354 + }, + { + "epoch": 0.008058568845751261, + "grad_norm": 2.9557557106018066, + "learning_rate": 4.999200053070945e-05, + "loss": 7.4466, + "step": 1355 + }, + { + "epoch": 0.008064516129032258, + "grad_norm": 2.5595791339874268, + "learning_rate": 4.999198871092233e-05, + "loss": 7.4716, + "step": 1356 + }, + { + "epoch": 0.008070463412313255, + "grad_norm": 2.919729709625244, + "learning_rate": 4.999197688241076e-05, + "loss": 7.0754, + "step": 1357 + }, + { + "epoch": 0.008076410695594253, + "grad_norm": 2.5880625247955322, + "learning_rate": 4.9991965045174763e-05, + "loss": 7.2794, + "step": 1358 + }, + { + "epoch": 0.00808235797887525, + "grad_norm": 2.9933066368103027, + "learning_rate": 4.999195319921432e-05, + "loss": 7.3547, + "step": 1359 + }, + { + "epoch": 0.008088305262156247, + "grad_norm": 5.097862243652344, + "learning_rate": 4.999194134452945e-05, + "loss": 7.1922, + "step": 1360 + }, + { + "epoch": 0.008094252545437244, + "grad_norm": 4.1795830726623535, + "learning_rate": 4.9991929481120146e-05, + "loss": 7.0437, + "step": 1361 + }, + { + "epoch": 0.008100199828718242, + "grad_norm": 3.292961835861206, + "learning_rate": 4.999191760898642e-05, + "loss": 6.8637, + "step": 1362 + }, + { + "epoch": 0.008106147111999239, + "grad_norm": 3.052610397338867, + "learning_rate": 4.999190572812828e-05, + "loss": 7.1675, + "step": 1363 + }, + { + "epoch": 0.008112094395280236, + "grad_norm": 2.975646734237671, + "learning_rate": 4.999189383854571e-05, + "loss": 7.1309, + "step": 1364 + }, + { + "epoch": 0.008118041678561233, + "grad_norm": 2.71195912361145, + "learning_rate": 4.999188194023874e-05, + "loss": 7.2247, + "step": 1365 + }, + { + "epoch": 0.008123988961842231, + "grad_norm": 2.751002311706543, + "learning_rate": 4.9991870033207354e-05, + "loss": 6.8553, + "step": 1366 + }, + { + "epoch": 0.008129936245123228, + "grad_norm": 3.4521234035491943, + "learning_rate": 4.999185811745157e-05, + "loss": 6.8373, + "step": 1367 + }, + { + "epoch": 0.008135883528404225, + "grad_norm": 3.054330348968506, + "learning_rate": 4.999184619297138e-05, + "loss": 6.6982, + "step": 1368 + }, + { + "epoch": 0.008141830811685222, + "grad_norm": 3.513794183731079, + "learning_rate": 4.99918342597668e-05, + "loss": 6.5567, + "step": 1369 + }, + { + "epoch": 0.00814777809496622, + "grad_norm": 3.681838274002075, + "learning_rate": 4.9991822317837836e-05, + "loss": 6.6335, + "step": 1370 + }, + { + "epoch": 0.008153725378247217, + "grad_norm": 4.144393444061279, + "learning_rate": 4.999181036718447e-05, + "loss": 6.5361, + "step": 1371 + }, + { + "epoch": 0.008159672661528214, + "grad_norm": 2.9771196842193604, + "learning_rate": 4.9991798407806736e-05, + "loss": 7.0085, + "step": 1372 + }, + { + "epoch": 0.00816561994480921, + "grad_norm": 3.114884376525879, + "learning_rate": 4.9991786439704615e-05, + "loss": 7.1498, + "step": 1373 + }, + { + "epoch": 0.008171567228090208, + "grad_norm": 2.76042103767395, + "learning_rate": 4.9991774462878115e-05, + "loss": 6.8462, + "step": 1374 + }, + { + "epoch": 0.008177514511371206, + "grad_norm": 3.257528066635132, + "learning_rate": 4.999176247732725e-05, + "loss": 6.4595, + "step": 1375 + }, + { + "epoch": 0.008183461794652203, + "grad_norm": 3.377774238586426, + "learning_rate": 4.999175048305202e-05, + "loss": 6.3131, + "step": 1376 + }, + { + "epoch": 0.0081894090779332, + "grad_norm": 3.029477834701538, + "learning_rate": 4.999173848005243e-05, + "loss": 6.7182, + "step": 1377 + }, + { + "epoch": 0.008195356361214197, + "grad_norm": 3.0353076457977295, + "learning_rate": 4.9991726468328476e-05, + "loss": 7.009, + "step": 1378 + }, + { + "epoch": 0.008201303644495195, + "grad_norm": 2.465014934539795, + "learning_rate": 4.999171444788017e-05, + "loss": 7.6277, + "step": 1379 + }, + { + "epoch": 0.008207250927776192, + "grad_norm": 3.025954484939575, + "learning_rate": 4.999170241870752e-05, + "loss": 7.2815, + "step": 1380 + }, + { + "epoch": 0.008213198211057189, + "grad_norm": 3.8414018154144287, + "learning_rate": 4.999169038081052e-05, + "loss": 7.2238, + "step": 1381 + }, + { + "epoch": 0.008219145494338186, + "grad_norm": 3.2927470207214355, + "learning_rate": 4.999167833418918e-05, + "loss": 7.1505, + "step": 1382 + }, + { + "epoch": 0.008225092777619184, + "grad_norm": 2.6132330894470215, + "learning_rate": 4.999166627884351e-05, + "loss": 7.2499, + "step": 1383 + }, + { + "epoch": 0.008231040060900181, + "grad_norm": 2.523366689682007, + "learning_rate": 4.9991654214773497e-05, + "loss": 6.9812, + "step": 1384 + }, + { + "epoch": 0.008236987344181178, + "grad_norm": 3.977471351623535, + "learning_rate": 4.9991642141979154e-05, + "loss": 7.3196, + "step": 1385 + }, + { + "epoch": 0.008242934627462175, + "grad_norm": 2.731952428817749, + "learning_rate": 4.99916300604605e-05, + "loss": 7.1014, + "step": 1386 + }, + { + "epoch": 0.008248881910743173, + "grad_norm": 2.6128756999969482, + "learning_rate": 4.999161797021752e-05, + "loss": 7.0235, + "step": 1387 + }, + { + "epoch": 0.00825482919402417, + "grad_norm": 2.263430595397949, + "learning_rate": 4.999160587125023e-05, + "loss": 7.0183, + "step": 1388 + }, + { + "epoch": 0.008260776477305167, + "grad_norm": 2.799994707107544, + "learning_rate": 4.9991593763558614e-05, + "loss": 6.9553, + "step": 1389 + }, + { + "epoch": 0.008266723760586164, + "grad_norm": 2.5443058013916016, + "learning_rate": 4.99915816471427e-05, + "loss": 7.2302, + "step": 1390 + }, + { + "epoch": 0.008272671043867162, + "grad_norm": 2.304185152053833, + "learning_rate": 4.999156952200248e-05, + "loss": 7.2589, + "step": 1391 + }, + { + "epoch": 0.008278618327148159, + "grad_norm": 2.1639649868011475, + "learning_rate": 4.999155738813797e-05, + "loss": 7.0067, + "step": 1392 + }, + { + "epoch": 0.008284565610429156, + "grad_norm": 2.276514768600464, + "learning_rate": 4.999154524554915e-05, + "loss": 7.2721, + "step": 1393 + }, + { + "epoch": 0.008290512893710153, + "grad_norm": 2.212200880050659, + "learning_rate": 4.9991533094236055e-05, + "loss": 7.1183, + "step": 1394 + }, + { + "epoch": 0.008296460176991151, + "grad_norm": 2.5289459228515625, + "learning_rate": 4.999152093419867e-05, + "loss": 7.0289, + "step": 1395 + }, + { + "epoch": 0.008302407460272148, + "grad_norm": 2.5915603637695312, + "learning_rate": 4.999150876543699e-05, + "loss": 6.7497, + "step": 1396 + }, + { + "epoch": 0.008308354743553145, + "grad_norm": 2.680513858795166, + "learning_rate": 4.999149658795105e-05, + "loss": 6.7139, + "step": 1397 + }, + { + "epoch": 0.008314302026834142, + "grad_norm": 2.65744948387146, + "learning_rate": 4.999148440174083e-05, + "loss": 6.6151, + "step": 1398 + }, + { + "epoch": 0.00832024931011514, + "grad_norm": 3.8028745651245117, + "learning_rate": 4.9991472206806334e-05, + "loss": 7.1992, + "step": 1399 + }, + { + "epoch": 0.008326196593396137, + "grad_norm": 2.8436119556427, + "learning_rate": 4.999146000314758e-05, + "loss": 7.165, + "step": 1400 + }, + { + "epoch": 0.008332143876677134, + "grad_norm": 2.6658496856689453, + "learning_rate": 4.999144779076457e-05, + "loss": 7.5945, + "step": 1401 + }, + { + "epoch": 0.00833809115995813, + "grad_norm": 2.909703016281128, + "learning_rate": 4.99914355696573e-05, + "loss": 7.6378, + "step": 1402 + }, + { + "epoch": 0.00834403844323913, + "grad_norm": 2.5827598571777344, + "learning_rate": 4.9991423339825776e-05, + "loss": 7.5441, + "step": 1403 + }, + { + "epoch": 0.008349985726520126, + "grad_norm": 3.0283706188201904, + "learning_rate": 4.999141110127e-05, + "loss": 7.1162, + "step": 1404 + }, + { + "epoch": 0.008355933009801123, + "grad_norm": 3.11690354347229, + "learning_rate": 4.999139885398999e-05, + "loss": 6.5123, + "step": 1405 + }, + { + "epoch": 0.00836188029308212, + "grad_norm": 2.6188690662384033, + "learning_rate": 4.999138659798574e-05, + "loss": 7.6384, + "step": 1406 + }, + { + "epoch": 0.008367827576363117, + "grad_norm": 3.4412481784820557, + "learning_rate": 4.999137433325725e-05, + "loss": 7.4067, + "step": 1407 + }, + { + "epoch": 0.008373774859644115, + "grad_norm": 3.1690893173217773, + "learning_rate": 4.999136205980454e-05, + "loss": 7.3937, + "step": 1408 + }, + { + "epoch": 0.008379722142925112, + "grad_norm": 2.1589877605438232, + "learning_rate": 4.999134977762759e-05, + "loss": 7.454, + "step": 1409 + }, + { + "epoch": 0.008385669426206109, + "grad_norm": 2.485901117324829, + "learning_rate": 4.999133748672642e-05, + "loss": 7.3421, + "step": 1410 + }, + { + "epoch": 0.008391616709487106, + "grad_norm": 2.543128252029419, + "learning_rate": 4.999132518710104e-05, + "loss": 7.3162, + "step": 1411 + }, + { + "epoch": 0.008397563992768104, + "grad_norm": 2.8048489093780518, + "learning_rate": 4.999131287875144e-05, + "loss": 7.297, + "step": 1412 + }, + { + "epoch": 0.008403511276049101, + "grad_norm": 3.0391035079956055, + "learning_rate": 4.9991300561677634e-05, + "loss": 7.2409, + "step": 1413 + }, + { + "epoch": 0.008409458559330098, + "grad_norm": 2.3196053504943848, + "learning_rate": 4.999128823587962e-05, + "loss": 7.1358, + "step": 1414 + }, + { + "epoch": 0.008415405842611095, + "grad_norm": 3.1876983642578125, + "learning_rate": 4.999127590135741e-05, + "loss": 7.1501, + "step": 1415 + }, + { + "epoch": 0.008421353125892093, + "grad_norm": 3.6832327842712402, + "learning_rate": 4.9991263558111e-05, + "loss": 7.181, + "step": 1416 + }, + { + "epoch": 0.00842730040917309, + "grad_norm": 3.7491936683654785, + "learning_rate": 4.99912512061404e-05, + "loss": 6.9669, + "step": 1417 + }, + { + "epoch": 0.008433247692454087, + "grad_norm": 3.1583478450775146, + "learning_rate": 4.9991238845445615e-05, + "loss": 7.2155, + "step": 1418 + }, + { + "epoch": 0.008439194975735084, + "grad_norm": 3.11611008644104, + "learning_rate": 4.999122647602664e-05, + "loss": 7.164, + "step": 1419 + }, + { + "epoch": 0.008445142259016082, + "grad_norm": 6.127118110656738, + "learning_rate": 4.9991214097883495e-05, + "loss": 7.232, + "step": 1420 + }, + { + "epoch": 0.008451089542297079, + "grad_norm": 4.736495494842529, + "learning_rate": 4.9991201711016166e-05, + "loss": 7.3685, + "step": 1421 + }, + { + "epoch": 0.008457036825578076, + "grad_norm": 2.9656684398651123, + "learning_rate": 4.999118931542467e-05, + "loss": 7.2658, + "step": 1422 + }, + { + "epoch": 0.008462984108859073, + "grad_norm": 2.5959243774414062, + "learning_rate": 4.999117691110901e-05, + "loss": 7.0908, + "step": 1423 + }, + { + "epoch": 0.008468931392140071, + "grad_norm": 4.546379089355469, + "learning_rate": 4.999116449806919e-05, + "loss": 7.1343, + "step": 1424 + }, + { + "epoch": 0.008474878675421068, + "grad_norm": 3.6856796741485596, + "learning_rate": 4.9991152076305206e-05, + "loss": 6.9205, + "step": 1425 + }, + { + "epoch": 0.008480825958702065, + "grad_norm": 3.293973922729492, + "learning_rate": 4.9991139645817075e-05, + "loss": 6.9954, + "step": 1426 + }, + { + "epoch": 0.008486773241983062, + "grad_norm": 3.2511162757873535, + "learning_rate": 4.999112720660479e-05, + "loss": 6.7661, + "step": 1427 + }, + { + "epoch": 0.00849272052526406, + "grad_norm": 3.990840196609497, + "learning_rate": 4.9991114758668364e-05, + "loss": 6.7402, + "step": 1428 + }, + { + "epoch": 0.008498667808545057, + "grad_norm": 3.306809186935425, + "learning_rate": 4.9991102302007804e-05, + "loss": 6.6801, + "step": 1429 + }, + { + "epoch": 0.008504615091826054, + "grad_norm": 5.208675384521484, + "learning_rate": 4.99910898366231e-05, + "loss": 7.0128, + "step": 1430 + }, + { + "epoch": 0.00851056237510705, + "grad_norm": 4.131346225738525, + "learning_rate": 4.9991077362514266e-05, + "loss": 7.0992, + "step": 1431 + }, + { + "epoch": 0.00851650965838805, + "grad_norm": 2.60927152633667, + "learning_rate": 4.99910648796813e-05, + "loss": 7.2731, + "step": 1432 + }, + { + "epoch": 0.008522456941669046, + "grad_norm": 5.654631614685059, + "learning_rate": 4.9991052388124224e-05, + "loss": 6.6105, + "step": 1433 + }, + { + "epoch": 0.008528404224950043, + "grad_norm": 6.108455657958984, + "learning_rate": 4.9991039887843025e-05, + "loss": 6.3548, + "step": 1434 + }, + { + "epoch": 0.00853435150823104, + "grad_norm": 3.758371591567993, + "learning_rate": 4.9991027378837705e-05, + "loss": 6.6171, + "step": 1435 + }, + { + "epoch": 0.008540298791512036, + "grad_norm": 2.1995320320129395, + "learning_rate": 4.9991014861108285e-05, + "loss": 6.5987, + "step": 1436 + }, + { + "epoch": 0.008546246074793035, + "grad_norm": 2.3778254985809326, + "learning_rate": 4.999100233465476e-05, + "loss": 6.8067, + "step": 1437 + }, + { + "epoch": 0.008552193358074032, + "grad_norm": 2.521928310394287, + "learning_rate": 4.999098979947713e-05, + "loss": 6.7756, + "step": 1438 + }, + { + "epoch": 0.008558140641355029, + "grad_norm": 2.109605073928833, + "learning_rate": 4.99909772555754e-05, + "loss": 6.7091, + "step": 1439 + }, + { + "epoch": 0.008564087924636025, + "grad_norm": 2.55838680267334, + "learning_rate": 4.9990964702949585e-05, + "loss": 6.8989, + "step": 1440 + }, + { + "epoch": 0.008570035207917024, + "grad_norm": 2.4499685764312744, + "learning_rate": 4.9990952141599675e-05, + "loss": 6.6241, + "step": 1441 + }, + { + "epoch": 0.00857598249119802, + "grad_norm": 2.265371322631836, + "learning_rate": 4.9990939571525685e-05, + "loss": 7.6681, + "step": 1442 + }, + { + "epoch": 0.008581929774479018, + "grad_norm": 2.4496965408325195, + "learning_rate": 4.999092699272762e-05, + "loss": 6.8177, + "step": 1443 + }, + { + "epoch": 0.008587877057760014, + "grad_norm": 2.5555005073547363, + "learning_rate": 4.999091440520548e-05, + "loss": 6.6402, + "step": 1444 + }, + { + "epoch": 0.008593824341041013, + "grad_norm": 2.042592763900757, + "learning_rate": 4.999090180895927e-05, + "loss": 6.6114, + "step": 1445 + }, + { + "epoch": 0.00859977162432201, + "grad_norm": 2.3100671768188477, + "learning_rate": 4.9990889203988986e-05, + "loss": 6.712, + "step": 1446 + }, + { + "epoch": 0.008605718907603007, + "grad_norm": 2.7600841522216797, + "learning_rate": 4.999087659029465e-05, + "loss": 6.6531, + "step": 1447 + }, + { + "epoch": 0.008611666190884004, + "grad_norm": 3.292684316635132, + "learning_rate": 4.999086396787625e-05, + "loss": 6.9896, + "step": 1448 + }, + { + "epoch": 0.008617613474165002, + "grad_norm": 2.7579386234283447, + "learning_rate": 4.999085133673381e-05, + "loss": 7.1559, + "step": 1449 + }, + { + "epoch": 0.008623560757445999, + "grad_norm": 2.7898707389831543, + "learning_rate": 4.999083869686731e-05, + "loss": 6.9861, + "step": 1450 + }, + { + "epoch": 0.008629508040726996, + "grad_norm": 3.439809799194336, + "learning_rate": 4.999082604827677e-05, + "loss": 6.759, + "step": 1451 + }, + { + "epoch": 0.008635455324007993, + "grad_norm": 2.924859046936035, + "learning_rate": 4.999081339096219e-05, + "loss": 6.5438, + "step": 1452 + }, + { + "epoch": 0.008641402607288991, + "grad_norm": 3.363886594772339, + "learning_rate": 4.999080072492358e-05, + "loss": 7.0477, + "step": 1453 + }, + { + "epoch": 0.008647349890569988, + "grad_norm": 2.924988031387329, + "learning_rate": 4.999078805016093e-05, + "loss": 6.9228, + "step": 1454 + }, + { + "epoch": 0.008653297173850985, + "grad_norm": 3.2283847332000732, + "learning_rate": 4.999077536667426e-05, + "loss": 6.8763, + "step": 1455 + }, + { + "epoch": 0.008659244457131982, + "grad_norm": 2.635744094848633, + "learning_rate": 4.999076267446357e-05, + "loss": 6.6438, + "step": 1456 + }, + { + "epoch": 0.00866519174041298, + "grad_norm": 2.829801559448242, + "learning_rate": 4.9990749973528864e-05, + "loss": 6.9466, + "step": 1457 + }, + { + "epoch": 0.008671139023693977, + "grad_norm": 3.3631057739257812, + "learning_rate": 4.999073726387014e-05, + "loss": 7.2652, + "step": 1458 + }, + { + "epoch": 0.008677086306974974, + "grad_norm": 3.9970719814300537, + "learning_rate": 4.999072454548741e-05, + "loss": 7.053, + "step": 1459 + }, + { + "epoch": 0.00868303359025597, + "grad_norm": 3.322787046432495, + "learning_rate": 4.9990711818380674e-05, + "loss": 7.0272, + "step": 1460 + }, + { + "epoch": 0.008688980873536969, + "grad_norm": 2.7370798587799072, + "learning_rate": 4.999069908254995e-05, + "loss": 6.8545, + "step": 1461 + }, + { + "epoch": 0.008694928156817966, + "grad_norm": 2.845191240310669, + "learning_rate": 4.999068633799522e-05, + "loss": 6.9393, + "step": 1462 + }, + { + "epoch": 0.008700875440098963, + "grad_norm": 3.064960241317749, + "learning_rate": 4.99906735847165e-05, + "loss": 6.7734, + "step": 1463 + }, + { + "epoch": 0.00870682272337996, + "grad_norm": 7.113090515136719, + "learning_rate": 4.99906608227138e-05, + "loss": 7.0532, + "step": 1464 + }, + { + "epoch": 0.008712770006660958, + "grad_norm": 5.90821647644043, + "learning_rate": 4.999064805198711e-05, + "loss": 7.1494, + "step": 1465 + }, + { + "epoch": 0.008718717289941955, + "grad_norm": 3.9366238117218018, + "learning_rate": 4.9990635272536454e-05, + "loss": 7.623, + "step": 1466 + }, + { + "epoch": 0.008724664573222952, + "grad_norm": 3.1239330768585205, + "learning_rate": 4.9990622484361814e-05, + "loss": 7.4938, + "step": 1467 + }, + { + "epoch": 0.008730611856503949, + "grad_norm": 2.6688928604125977, + "learning_rate": 4.9990609687463216e-05, + "loss": 7.3445, + "step": 1468 + }, + { + "epoch": 0.008736559139784945, + "grad_norm": 3.047154664993286, + "learning_rate": 4.9990596881840646e-05, + "loss": 7.158, + "step": 1469 + }, + { + "epoch": 0.008742506423065944, + "grad_norm": 2.5230467319488525, + "learning_rate": 4.999058406749412e-05, + "loss": 7.1368, + "step": 1470 + }, + { + "epoch": 0.00874845370634694, + "grad_norm": 2.729705333709717, + "learning_rate": 4.999057124442364e-05, + "loss": 7.0144, + "step": 1471 + }, + { + "epoch": 0.008754400989627938, + "grad_norm": 2.5796756744384766, + "learning_rate": 4.999055841262921e-05, + "loss": 7.2157, + "step": 1472 + }, + { + "epoch": 0.008760348272908934, + "grad_norm": 3.458691358566284, + "learning_rate": 4.999054557211084e-05, + "loss": 6.7631, + "step": 1473 + }, + { + "epoch": 0.008766295556189933, + "grad_norm": 2.7262747287750244, + "learning_rate": 4.999053272286851e-05, + "loss": 6.9784, + "step": 1474 + }, + { + "epoch": 0.00877224283947093, + "grad_norm": 2.6003808975219727, + "learning_rate": 4.9990519864902267e-05, + "loss": 7.1369, + "step": 1475 + }, + { + "epoch": 0.008778190122751927, + "grad_norm": 3.4032137393951416, + "learning_rate": 4.999050699821207e-05, + "loss": 6.9569, + "step": 1476 + }, + { + "epoch": 0.008784137406032923, + "grad_norm": 4.099828243255615, + "learning_rate": 4.9990494122797957e-05, + "loss": 6.9977, + "step": 1477 + }, + { + "epoch": 0.008790084689313922, + "grad_norm": 3.1837944984436035, + "learning_rate": 4.999048123865992e-05, + "loss": 7.1331, + "step": 1478 + }, + { + "epoch": 0.008796031972594919, + "grad_norm": 2.618847131729126, + "learning_rate": 4.999046834579796e-05, + "loss": 7.0043, + "step": 1479 + }, + { + "epoch": 0.008801979255875916, + "grad_norm": 3.0132501125335693, + "learning_rate": 4.999045544421209e-05, + "loss": 6.7836, + "step": 1480 + }, + { + "epoch": 0.008807926539156912, + "grad_norm": 2.4608371257781982, + "learning_rate": 4.999044253390231e-05, + "loss": 7.0721, + "step": 1481 + }, + { + "epoch": 0.008813873822437911, + "grad_norm": 3.280649423599243, + "learning_rate": 4.999042961486863e-05, + "loss": 7.959, + "step": 1482 + }, + { + "epoch": 0.008819821105718908, + "grad_norm": 2.7038395404815674, + "learning_rate": 4.999041668711104e-05, + "loss": 7.1256, + "step": 1483 + }, + { + "epoch": 0.008825768388999905, + "grad_norm": 2.1451892852783203, + "learning_rate": 4.9990403750629556e-05, + "loss": 7.2219, + "step": 1484 + }, + { + "epoch": 0.008831715672280901, + "grad_norm": 2.3731601238250732, + "learning_rate": 4.999039080542418e-05, + "loss": 7.2023, + "step": 1485 + }, + { + "epoch": 0.0088376629555619, + "grad_norm": 2.444089651107788, + "learning_rate": 4.999037785149492e-05, + "loss": 7.0988, + "step": 1486 + }, + { + "epoch": 0.008843610238842897, + "grad_norm": 2.644712448120117, + "learning_rate": 4.999036488884177e-05, + "loss": 7.1916, + "step": 1487 + }, + { + "epoch": 0.008849557522123894, + "grad_norm": 5.477145671844482, + "learning_rate": 4.999035191746475e-05, + "loss": 6.7256, + "step": 1488 + }, + { + "epoch": 0.00885550480540489, + "grad_norm": 2.2691709995269775, + "learning_rate": 4.999033893736386e-05, + "loss": 7.2505, + "step": 1489 + }, + { + "epoch": 0.008861452088685889, + "grad_norm": 2.5880343914031982, + "learning_rate": 4.999032594853909e-05, + "loss": 6.9549, + "step": 1490 + }, + { + "epoch": 0.008867399371966886, + "grad_norm": 2.2748520374298096, + "learning_rate": 4.999031295099046e-05, + "loss": 6.8269, + "step": 1491 + }, + { + "epoch": 0.008873346655247883, + "grad_norm": 2.262706995010376, + "learning_rate": 4.999029994471797e-05, + "loss": 6.8876, + "step": 1492 + }, + { + "epoch": 0.00887929393852888, + "grad_norm": 2.264256238937378, + "learning_rate": 4.999028692972162e-05, + "loss": 7.1545, + "step": 1493 + }, + { + "epoch": 0.008885241221809878, + "grad_norm": 2.489259719848633, + "learning_rate": 4.9990273906001424e-05, + "loss": 7.194, + "step": 1494 + }, + { + "epoch": 0.008891188505090875, + "grad_norm": 2.7545981407165527, + "learning_rate": 4.999026087355738e-05, + "loss": 7.0148, + "step": 1495 + }, + { + "epoch": 0.008897135788371872, + "grad_norm": 2.6869328022003174, + "learning_rate": 4.999024783238949e-05, + "loss": 7.2535, + "step": 1496 + }, + { + "epoch": 0.008903083071652869, + "grad_norm": 2.5216503143310547, + "learning_rate": 4.999023478249777e-05, + "loss": 6.4351, + "step": 1497 + }, + { + "epoch": 0.008909030354933865, + "grad_norm": 2.5090575218200684, + "learning_rate": 4.9990221723882216e-05, + "loss": 7.3068, + "step": 1498 + }, + { + "epoch": 0.008914977638214864, + "grad_norm": 2.5026490688323975, + "learning_rate": 4.999020865654283e-05, + "loss": 7.1274, + "step": 1499 + }, + { + "epoch": 0.00892092492149586, + "grad_norm": 2.8030898571014404, + "learning_rate": 4.999019558047963e-05, + "loss": 7.0016, + "step": 1500 + }, + { + "epoch": 0.008926872204776858, + "grad_norm": 2.533383846282959, + "learning_rate": 4.99901824956926e-05, + "loss": 6.8991, + "step": 1501 + }, + { + "epoch": 0.008932819488057854, + "grad_norm": 2.5584118366241455, + "learning_rate": 4.999016940218175e-05, + "loss": 6.9237, + "step": 1502 + }, + { + "epoch": 0.008938766771338853, + "grad_norm": 2.778592586517334, + "learning_rate": 4.99901562999471e-05, + "loss": 7.0941, + "step": 1503 + }, + { + "epoch": 0.00894471405461985, + "grad_norm": 4.023860931396484, + "learning_rate": 4.999014318898865e-05, + "loss": 6.5188, + "step": 1504 + }, + { + "epoch": 0.008950661337900847, + "grad_norm": 3.018118143081665, + "learning_rate": 4.999013006930639e-05, + "loss": 7.0557, + "step": 1505 + }, + { + "epoch": 0.008956608621181843, + "grad_norm": 2.802061080932617, + "learning_rate": 4.999011694090033e-05, + "loss": 7.2645, + "step": 1506 + }, + { + "epoch": 0.008962555904462842, + "grad_norm": 2.3782076835632324, + "learning_rate": 4.999010380377049e-05, + "loss": 7.3707, + "step": 1507 + }, + { + "epoch": 0.008968503187743839, + "grad_norm": 2.451878309249878, + "learning_rate": 4.999009065791686e-05, + "loss": 7.2783, + "step": 1508 + }, + { + "epoch": 0.008974450471024836, + "grad_norm": 3.85514235496521, + "learning_rate": 4.999007750333945e-05, + "loss": 6.3543, + "step": 1509 + }, + { + "epoch": 0.008980397754305832, + "grad_norm": 2.617177963256836, + "learning_rate": 4.999006434003825e-05, + "loss": 7.0175, + "step": 1510 + }, + { + "epoch": 0.008986345037586831, + "grad_norm": 2.6909587383270264, + "learning_rate": 4.999005116801329e-05, + "loss": 7.3282, + "step": 1511 + }, + { + "epoch": 0.008992292320867828, + "grad_norm": 2.332165241241455, + "learning_rate": 4.9990037987264546e-05, + "loss": 7.0993, + "step": 1512 + }, + { + "epoch": 0.008998239604148825, + "grad_norm": 2.5398497581481934, + "learning_rate": 4.9990024797792055e-05, + "loss": 7.2867, + "step": 1513 + }, + { + "epoch": 0.009004186887429821, + "grad_norm": 2.432264566421509, + "learning_rate": 4.9990011599595796e-05, + "loss": 7.1619, + "step": 1514 + }, + { + "epoch": 0.00901013417071082, + "grad_norm": 2.2937278747558594, + "learning_rate": 4.998999839267578e-05, + "loss": 7.1138, + "step": 1515 + }, + { + "epoch": 0.009016081453991817, + "grad_norm": 2.3305680751800537, + "learning_rate": 4.998998517703202e-05, + "loss": 7.0569, + "step": 1516 + }, + { + "epoch": 0.009022028737272814, + "grad_norm": 3.0785884857177734, + "learning_rate": 4.998997195266451e-05, + "loss": 7.0922, + "step": 1517 + }, + { + "epoch": 0.00902797602055381, + "grad_norm": 2.354283571243286, + "learning_rate": 4.998995871957326e-05, + "loss": 7.0024, + "step": 1518 + }, + { + "epoch": 0.009033923303834809, + "grad_norm": 2.488194465637207, + "learning_rate": 4.998994547775827e-05, + "loss": 7.0045, + "step": 1519 + }, + { + "epoch": 0.009039870587115806, + "grad_norm": 2.6196579933166504, + "learning_rate": 4.998993222721956e-05, + "loss": 6.9416, + "step": 1520 + }, + { + "epoch": 0.009045817870396803, + "grad_norm": 2.6524155139923096, + "learning_rate": 4.998991896795711e-05, + "loss": 6.9562, + "step": 1521 + }, + { + "epoch": 0.0090517651536778, + "grad_norm": 3.308661460876465, + "learning_rate": 4.998990569997094e-05, + "loss": 6.8602, + "step": 1522 + }, + { + "epoch": 0.009057712436958798, + "grad_norm": 2.7995994091033936, + "learning_rate": 4.9989892423261055e-05, + "loss": 7.7049, + "step": 1523 + }, + { + "epoch": 0.009063659720239795, + "grad_norm": 2.547189235687256, + "learning_rate": 4.9989879137827456e-05, + "loss": 7.0254, + "step": 1524 + }, + { + "epoch": 0.009069607003520792, + "grad_norm": 2.796393871307373, + "learning_rate": 4.998986584367015e-05, + "loss": 7.0124, + "step": 1525 + }, + { + "epoch": 0.009075554286801788, + "grad_norm": 2.9441823959350586, + "learning_rate": 4.9989852540789136e-05, + "loss": 7.0174, + "step": 1526 + }, + { + "epoch": 0.009081501570082787, + "grad_norm": 2.509150743484497, + "learning_rate": 4.998983922918443e-05, + "loss": 6.9405, + "step": 1527 + }, + { + "epoch": 0.009087448853363784, + "grad_norm": 2.3686184883117676, + "learning_rate": 4.998982590885603e-05, + "loss": 6.794, + "step": 1528 + }, + { + "epoch": 0.00909339613664478, + "grad_norm": 2.937530755996704, + "learning_rate": 4.998981257980393e-05, + "loss": 6.9716, + "step": 1529 + }, + { + "epoch": 0.009099343419925777, + "grad_norm": 2.493178606033325, + "learning_rate": 4.998979924202814e-05, + "loss": 6.5986, + "step": 1530 + }, + { + "epoch": 0.009105290703206774, + "grad_norm": 2.071356773376465, + "learning_rate": 4.9989785895528686e-05, + "loss": 6.536, + "step": 1531 + }, + { + "epoch": 0.009111237986487773, + "grad_norm": 1.9372920989990234, + "learning_rate": 4.998977254030554e-05, + "loss": 6.4036, + "step": 1532 + }, + { + "epoch": 0.00911718526976877, + "grad_norm": 2.3329098224639893, + "learning_rate": 4.998975917635873e-05, + "loss": 6.4861, + "step": 1533 + }, + { + "epoch": 0.009123132553049767, + "grad_norm": 2.9681191444396973, + "learning_rate": 4.998974580368826e-05, + "loss": 6.939, + "step": 1534 + }, + { + "epoch": 0.009129079836330763, + "grad_norm": 2.5993690490722656, + "learning_rate": 4.9989732422294125e-05, + "loss": 7.0809, + "step": 1535 + }, + { + "epoch": 0.009135027119611762, + "grad_norm": 2.827244997024536, + "learning_rate": 4.998971903217633e-05, + "loss": 7.597, + "step": 1536 + }, + { + "epoch": 0.009140974402892759, + "grad_norm": 2.712247848510742, + "learning_rate": 4.9989705633334884e-05, + "loss": 7.3695, + "step": 1537 + }, + { + "epoch": 0.009146921686173756, + "grad_norm": 1.7997468709945679, + "learning_rate": 4.998969222576978e-05, + "loss": 7.6497, + "step": 1538 + }, + { + "epoch": 0.009152868969454752, + "grad_norm": 2.234931230545044, + "learning_rate": 4.998967880948104e-05, + "loss": 7.1636, + "step": 1539 + }, + { + "epoch": 0.009158816252735751, + "grad_norm": 2.150766611099243, + "learning_rate": 4.9989665384468666e-05, + "loss": 6.8621, + "step": 1540 + }, + { + "epoch": 0.009164763536016748, + "grad_norm": 2.9628021717071533, + "learning_rate": 4.998965195073265e-05, + "loss": 6.5059, + "step": 1541 + }, + { + "epoch": 0.009170710819297745, + "grad_norm": 2.720155715942383, + "learning_rate": 4.998963850827301e-05, + "loss": 7.0129, + "step": 1542 + }, + { + "epoch": 0.009176658102578741, + "grad_norm": 2.994684934616089, + "learning_rate": 4.9989625057089744e-05, + "loss": 7.3621, + "step": 1543 + }, + { + "epoch": 0.00918260538585974, + "grad_norm": 2.5991618633270264, + "learning_rate": 4.998961159718286e-05, + "loss": 6.7278, + "step": 1544 + }, + { + "epoch": 0.009188552669140737, + "grad_norm": 2.406353712081909, + "learning_rate": 4.9989598128552355e-05, + "loss": 7.5987, + "step": 1545 + }, + { + "epoch": 0.009194499952421734, + "grad_norm": 3.1308467388153076, + "learning_rate": 4.998958465119824e-05, + "loss": 7.1947, + "step": 1546 + }, + { + "epoch": 0.00920044723570273, + "grad_norm": 2.5381908416748047, + "learning_rate": 4.998957116512053e-05, + "loss": 6.8415, + "step": 1547 + }, + { + "epoch": 0.009206394518983729, + "grad_norm": 2.666410446166992, + "learning_rate": 4.998955767031921e-05, + "loss": 6.9052, + "step": 1548 + }, + { + "epoch": 0.009212341802264726, + "grad_norm": 2.156036138534546, + "learning_rate": 4.9989544166794286e-05, + "loss": 7.6604, + "step": 1549 + }, + { + "epoch": 0.009218289085545723, + "grad_norm": 2.620114803314209, + "learning_rate": 4.998953065454578e-05, + "loss": 6.5475, + "step": 1550 + }, + { + "epoch": 0.00922423636882672, + "grad_norm": 3.2780802249908447, + "learning_rate": 4.9989517133573694e-05, + "loss": 7.0572, + "step": 1551 + }, + { + "epoch": 0.009230183652107718, + "grad_norm": 3.6108100414276123, + "learning_rate": 4.998950360387802e-05, + "loss": 7.0149, + "step": 1552 + }, + { + "epoch": 0.009236130935388715, + "grad_norm": 3.4336259365081787, + "learning_rate": 4.998949006545876e-05, + "loss": 7.2436, + "step": 1553 + }, + { + "epoch": 0.009242078218669712, + "grad_norm": 3.271630048751831, + "learning_rate": 4.9989476518315934e-05, + "loss": 7.3807, + "step": 1554 + }, + { + "epoch": 0.009248025501950708, + "grad_norm": 3.0718438625335693, + "learning_rate": 4.998946296244954e-05, + "loss": 7.2313, + "step": 1555 + }, + { + "epoch": 0.009253972785231707, + "grad_norm": 2.2010579109191895, + "learning_rate": 4.9989449397859575e-05, + "loss": 7.4269, + "step": 1556 + }, + { + "epoch": 0.009259920068512704, + "grad_norm": 2.9805495738983154, + "learning_rate": 4.998943582454607e-05, + "loss": 7.2107, + "step": 1557 + }, + { + "epoch": 0.0092658673517937, + "grad_norm": 2.8313159942626953, + "learning_rate": 4.9989422242508995e-05, + "loss": 7.0453, + "step": 1558 + }, + { + "epoch": 0.009271814635074697, + "grad_norm": 2.7660701274871826, + "learning_rate": 4.998940865174837e-05, + "loss": 7.2205, + "step": 1559 + }, + { + "epoch": 0.009277761918355694, + "grad_norm": 3.808122396469116, + "learning_rate": 4.998939505226421e-05, + "loss": 6.9966, + "step": 1560 + }, + { + "epoch": 0.009283709201636693, + "grad_norm": 3.188976526260376, + "learning_rate": 4.99893814440565e-05, + "loss": 7.0049, + "step": 1561 + }, + { + "epoch": 0.00928965648491769, + "grad_norm": 2.5491533279418945, + "learning_rate": 4.998936782712526e-05, + "loss": 7.0451, + "step": 1562 + }, + { + "epoch": 0.009295603768198686, + "grad_norm": 3.4607698917388916, + "learning_rate": 4.99893542014705e-05, + "loss": 7.0304, + "step": 1563 + }, + { + "epoch": 0.009301551051479683, + "grad_norm": 3.4761910438537598, + "learning_rate": 4.99893405670922e-05, + "loss": 6.9787, + "step": 1564 + }, + { + "epoch": 0.009307498334760682, + "grad_norm": 3.15938138961792, + "learning_rate": 4.998932692399039e-05, + "loss": 7.0203, + "step": 1565 + }, + { + "epoch": 0.009313445618041679, + "grad_norm": 2.600304126739502, + "learning_rate": 4.9989313272165064e-05, + "loss": 7.0782, + "step": 1566 + }, + { + "epoch": 0.009319392901322675, + "grad_norm": 2.54158616065979, + "learning_rate": 4.9989299611616216e-05, + "loss": 6.8354, + "step": 1567 + }, + { + "epoch": 0.009325340184603672, + "grad_norm": 3.4649429321289062, + "learning_rate": 4.9989285942343864e-05, + "loss": 6.8238, + "step": 1568 + }, + { + "epoch": 0.00933128746788467, + "grad_norm": 2.522388458251953, + "learning_rate": 4.998927226434802e-05, + "loss": 6.9544, + "step": 1569 + }, + { + "epoch": 0.009337234751165668, + "grad_norm": 4.074129581451416, + "learning_rate": 4.9989258577628675e-05, + "loss": 6.7229, + "step": 1570 + }, + { + "epoch": 0.009343182034446664, + "grad_norm": 3.395894765853882, + "learning_rate": 4.998924488218584e-05, + "loss": 7.1372, + "step": 1571 + }, + { + "epoch": 0.009349129317727661, + "grad_norm": 2.9850378036499023, + "learning_rate": 4.9989231178019516e-05, + "loss": 6.8966, + "step": 1572 + }, + { + "epoch": 0.00935507660100866, + "grad_norm": 3.1391544342041016, + "learning_rate": 4.9989217465129704e-05, + "loss": 6.6744, + "step": 1573 + }, + { + "epoch": 0.009361023884289657, + "grad_norm": 3.8727803230285645, + "learning_rate": 4.9989203743516414e-05, + "loss": 6.9359, + "step": 1574 + }, + { + "epoch": 0.009366971167570654, + "grad_norm": 3.466169595718384, + "learning_rate": 4.998919001317966e-05, + "loss": 6.979, + "step": 1575 + }, + { + "epoch": 0.00937291845085165, + "grad_norm": 3.3481826782226562, + "learning_rate": 4.998917627411943e-05, + "loss": 6.7749, + "step": 1576 + }, + { + "epoch": 0.009378865734132649, + "grad_norm": 2.425971031188965, + "learning_rate": 4.9989162526335745e-05, + "loss": 7.0127, + "step": 1577 + }, + { + "epoch": 0.009384813017413646, + "grad_norm": 2.8379313945770264, + "learning_rate": 4.9989148769828595e-05, + "loss": 6.5782, + "step": 1578 + }, + { + "epoch": 0.009390760300694643, + "grad_norm": 3.0456466674804688, + "learning_rate": 4.9989135004597994e-05, + "loss": 6.9832, + "step": 1579 + }, + { + "epoch": 0.00939670758397564, + "grad_norm": 2.690138101577759, + "learning_rate": 4.9989121230643944e-05, + "loss": 7.0079, + "step": 1580 + }, + { + "epoch": 0.009402654867256638, + "grad_norm": 3.683105945587158, + "learning_rate": 4.9989107447966444e-05, + "loss": 7.2734, + "step": 1581 + }, + { + "epoch": 0.009408602150537635, + "grad_norm": 2.3310985565185547, + "learning_rate": 4.9989093656565513e-05, + "loss": 7.2388, + "step": 1582 + }, + { + "epoch": 0.009414549433818632, + "grad_norm": 2.353322982788086, + "learning_rate": 4.998907985644115e-05, + "loss": 7.0612, + "step": 1583 + }, + { + "epoch": 0.009420496717099628, + "grad_norm": 2.8458571434020996, + "learning_rate": 4.9989066047593344e-05, + "loss": 7.3093, + "step": 1584 + }, + { + "epoch": 0.009426444000380627, + "grad_norm": 2.3322811126708984, + "learning_rate": 4.9989052230022125e-05, + "loss": 6.983, + "step": 1585 + }, + { + "epoch": 0.009432391283661624, + "grad_norm": 2.7431764602661133, + "learning_rate": 4.998903840372748e-05, + "loss": 6.9694, + "step": 1586 + }, + { + "epoch": 0.00943833856694262, + "grad_norm": 2.7704508304595947, + "learning_rate": 4.998902456870942e-05, + "loss": 6.7727, + "step": 1587 + }, + { + "epoch": 0.009444285850223617, + "grad_norm": 2.4920814037323, + "learning_rate": 4.998901072496796e-05, + "loss": 7.0612, + "step": 1588 + }, + { + "epoch": 0.009450233133504616, + "grad_norm": 2.5911498069763184, + "learning_rate": 4.998899687250308e-05, + "loss": 6.8774, + "step": 1589 + }, + { + "epoch": 0.009456180416785613, + "grad_norm": 2.7269680500030518, + "learning_rate": 4.998898301131481e-05, + "loss": 7.0782, + "step": 1590 + }, + { + "epoch": 0.00946212770006661, + "grad_norm": 2.9707436561584473, + "learning_rate": 4.998896914140314e-05, + "loss": 7.307, + "step": 1591 + }, + { + "epoch": 0.009468074983347606, + "grad_norm": 3.064683675765991, + "learning_rate": 4.998895526276808e-05, + "loss": 7.3708, + "step": 1592 + }, + { + "epoch": 0.009474022266628603, + "grad_norm": 2.4465317726135254, + "learning_rate": 4.998894137540963e-05, + "loss": 7.0085, + "step": 1593 + }, + { + "epoch": 0.009479969549909602, + "grad_norm": 3.3061211109161377, + "learning_rate": 4.99889274793278e-05, + "loss": 6.8353, + "step": 1594 + }, + { + "epoch": 0.009485916833190599, + "grad_norm": 3.283397912979126, + "learning_rate": 4.9988913574522594e-05, + "loss": 6.6848, + "step": 1595 + }, + { + "epoch": 0.009491864116471595, + "grad_norm": 2.770745277404785, + "learning_rate": 4.9988899660994014e-05, + "loss": 7.1742, + "step": 1596 + }, + { + "epoch": 0.009497811399752592, + "grad_norm": 2.7975432872772217, + "learning_rate": 4.998888573874207e-05, + "loss": 6.7329, + "step": 1597 + }, + { + "epoch": 0.00950375868303359, + "grad_norm": 2.545919418334961, + "learning_rate": 4.998887180776677e-05, + "loss": 6.7203, + "step": 1598 + }, + { + "epoch": 0.009509705966314588, + "grad_norm": 2.7961528301239014, + "learning_rate": 4.99888578680681e-05, + "loss": 7.384, + "step": 1599 + }, + { + "epoch": 0.009515653249595584, + "grad_norm": 2.570570230484009, + "learning_rate": 4.9988843919646096e-05, + "loss": 7.0246, + "step": 1600 + }, + { + "epoch": 0.009521600532876581, + "grad_norm": 2.5365843772888184, + "learning_rate": 4.9988829962500734e-05, + "loss": 6.8801, + "step": 1601 + }, + { + "epoch": 0.00952754781615758, + "grad_norm": 2.4713737964630127, + "learning_rate": 4.998881599663203e-05, + "loss": 7.1974, + "step": 1602 + }, + { + "epoch": 0.009533495099438577, + "grad_norm": 2.5286331176757812, + "learning_rate": 4.998880202203999e-05, + "loss": 7.26, + "step": 1603 + }, + { + "epoch": 0.009539442382719573, + "grad_norm": 2.2333719730377197, + "learning_rate": 4.998878803872461e-05, + "loss": 7.3254, + "step": 1604 + }, + { + "epoch": 0.00954538966600057, + "grad_norm": 2.544095277786255, + "learning_rate": 4.9988774046685915e-05, + "loss": 7.407, + "step": 1605 + }, + { + "epoch": 0.009551336949281569, + "grad_norm": 3.057140588760376, + "learning_rate": 4.9988760045923886e-05, + "loss": 6.5303, + "step": 1606 + }, + { + "epoch": 0.009557284232562566, + "grad_norm": 3.0190670490264893, + "learning_rate": 4.998874603643854e-05, + "loss": 6.3276, + "step": 1607 + }, + { + "epoch": 0.009563231515843562, + "grad_norm": 2.208249568939209, + "learning_rate": 4.998873201822989e-05, + "loss": 6.856, + "step": 1608 + }, + { + "epoch": 0.00956917879912456, + "grad_norm": 2.3519229888916016, + "learning_rate": 4.998871799129793e-05, + "loss": 6.9854, + "step": 1609 + }, + { + "epoch": 0.009575126082405558, + "grad_norm": 2.604816198348999, + "learning_rate": 4.9988703955642655e-05, + "loss": 7.3127, + "step": 1610 + }, + { + "epoch": 0.009581073365686555, + "grad_norm": 2.320030927658081, + "learning_rate": 4.9988689911264094e-05, + "loss": 7.216, + "step": 1611 + }, + { + "epoch": 0.009587020648967551, + "grad_norm": 2.8475282192230225, + "learning_rate": 4.998867585816224e-05, + "loss": 6.6743, + "step": 1612 + }, + { + "epoch": 0.009592967932248548, + "grad_norm": 2.518707036972046, + "learning_rate": 4.998866179633709e-05, + "loss": 7.0257, + "step": 1613 + }, + { + "epoch": 0.009598915215529547, + "grad_norm": 2.7348618507385254, + "learning_rate": 4.998864772578866e-05, + "loss": 7.1933, + "step": 1614 + }, + { + "epoch": 0.009604862498810544, + "grad_norm": 2.5701184272766113, + "learning_rate": 4.9988633646516946e-05, + "loss": 7.1071, + "step": 1615 + }, + { + "epoch": 0.00961080978209154, + "grad_norm": 2.916544198989868, + "learning_rate": 4.998861955852197e-05, + "loss": 7.1331, + "step": 1616 + }, + { + "epoch": 0.009616757065372537, + "grad_norm": 2.390934944152832, + "learning_rate": 4.998860546180371e-05, + "loss": 7.3252, + "step": 1617 + }, + { + "epoch": 0.009622704348653536, + "grad_norm": 2.6720097064971924, + "learning_rate": 4.998859135636219e-05, + "loss": 7.0105, + "step": 1618 + }, + { + "epoch": 0.009628651631934533, + "grad_norm": 2.3859329223632812, + "learning_rate": 4.998857724219742e-05, + "loss": 7.023, + "step": 1619 + }, + { + "epoch": 0.00963459891521553, + "grad_norm": 2.9713187217712402, + "learning_rate": 4.998856311930939e-05, + "loss": 7.0338, + "step": 1620 + }, + { + "epoch": 0.009640546198496526, + "grad_norm": 2.33858060836792, + "learning_rate": 4.998854898769811e-05, + "loss": 7.0103, + "step": 1621 + }, + { + "epoch": 0.009646493481777523, + "grad_norm": 2.8897042274475098, + "learning_rate": 4.9988534847363585e-05, + "loss": 7.1225, + "step": 1622 + }, + { + "epoch": 0.009652440765058522, + "grad_norm": 2.354513645172119, + "learning_rate": 4.9988520698305826e-05, + "loss": 6.9272, + "step": 1623 + }, + { + "epoch": 0.009658388048339519, + "grad_norm": 2.5571863651275635, + "learning_rate": 4.9988506540524826e-05, + "loss": 6.3418, + "step": 1624 + }, + { + "epoch": 0.009664335331620515, + "grad_norm": 2.342381238937378, + "learning_rate": 4.99884923740206e-05, + "loss": 6.4265, + "step": 1625 + }, + { + "epoch": 0.009670282614901512, + "grad_norm": 2.5594370365142822, + "learning_rate": 4.998847819879315e-05, + "loss": 6.9801, + "step": 1626 + }, + { + "epoch": 0.00967622989818251, + "grad_norm": 3.6932148933410645, + "learning_rate": 4.9988464014842476e-05, + "loss": 7.0231, + "step": 1627 + }, + { + "epoch": 0.009682177181463508, + "grad_norm": 2.713508367538452, + "learning_rate": 4.998844982216859e-05, + "loss": 6.9041, + "step": 1628 + }, + { + "epoch": 0.009688124464744504, + "grad_norm": 2.703103542327881, + "learning_rate": 4.99884356207715e-05, + "loss": 6.9272, + "step": 1629 + }, + { + "epoch": 0.009694071748025501, + "grad_norm": 3.228708267211914, + "learning_rate": 4.9988421410651197e-05, + "loss": 6.9242, + "step": 1630 + }, + { + "epoch": 0.0097000190313065, + "grad_norm": 3.3407063484191895, + "learning_rate": 4.9988407191807694e-05, + "loss": 6.8871, + "step": 1631 + }, + { + "epoch": 0.009705966314587497, + "grad_norm": 2.3833165168762207, + "learning_rate": 4.9988392964241005e-05, + "loss": 6.9667, + "step": 1632 + }, + { + "epoch": 0.009711913597868493, + "grad_norm": 3.607023239135742, + "learning_rate": 4.9988378727951123e-05, + "loss": 6.93, + "step": 1633 + }, + { + "epoch": 0.00971786088114949, + "grad_norm": 3.797107219696045, + "learning_rate": 4.9988364482938056e-05, + "loss": 6.8115, + "step": 1634 + }, + { + "epoch": 0.009723808164430489, + "grad_norm": 2.5586941242218018, + "learning_rate": 4.998835022920181e-05, + "loss": 6.7322, + "step": 1635 + }, + { + "epoch": 0.009729755447711486, + "grad_norm": 2.377680540084839, + "learning_rate": 4.9988335966742385e-05, + "loss": 6.7127, + "step": 1636 + }, + { + "epoch": 0.009735702730992482, + "grad_norm": 2.510584592819214, + "learning_rate": 4.998832169555979e-05, + "loss": 6.836, + "step": 1637 + }, + { + "epoch": 0.00974165001427348, + "grad_norm": 2.8817014694213867, + "learning_rate": 4.9988307415654025e-05, + "loss": 6.7812, + "step": 1638 + }, + { + "epoch": 0.009747597297554478, + "grad_norm": 2.878535509109497, + "learning_rate": 4.998829312702511e-05, + "loss": 6.7852, + "step": 1639 + }, + { + "epoch": 0.009753544580835475, + "grad_norm": 2.5870323181152344, + "learning_rate": 4.998827882967304e-05, + "loss": 6.8569, + "step": 1640 + }, + { + "epoch": 0.009759491864116471, + "grad_norm": 2.7275760173797607, + "learning_rate": 4.998826452359782e-05, + "loss": 6.8304, + "step": 1641 + }, + { + "epoch": 0.009765439147397468, + "grad_norm": 2.24550461769104, + "learning_rate": 4.998825020879945e-05, + "loss": 6.7609, + "step": 1642 + }, + { + "epoch": 0.009771386430678467, + "grad_norm": 2.2101621627807617, + "learning_rate": 4.9988235885277934e-05, + "loss": 6.7548, + "step": 1643 + }, + { + "epoch": 0.009777333713959464, + "grad_norm": 2.289870023727417, + "learning_rate": 4.9988221553033294e-05, + "loss": 6.8899, + "step": 1644 + }, + { + "epoch": 0.00978328099724046, + "grad_norm": 2.6337740421295166, + "learning_rate": 4.9988207212065516e-05, + "loss": 6.7605, + "step": 1645 + }, + { + "epoch": 0.009789228280521457, + "grad_norm": 2.442605972290039, + "learning_rate": 4.998819286237462e-05, + "loss": 6.6299, + "step": 1646 + }, + { + "epoch": 0.009795175563802456, + "grad_norm": 2.6570451259613037, + "learning_rate": 4.9988178503960606e-05, + "loss": 6.6933, + "step": 1647 + }, + { + "epoch": 0.009801122847083453, + "grad_norm": 2.597043752670288, + "learning_rate": 4.9988164136823467e-05, + "loss": 6.7667, + "step": 1648 + }, + { + "epoch": 0.00980707013036445, + "grad_norm": 3.2576608657836914, + "learning_rate": 4.998814976096323e-05, + "loss": 7.1774, + "step": 1649 + }, + { + "epoch": 0.009813017413645446, + "grad_norm": 3.110119342803955, + "learning_rate": 4.998813537637988e-05, + "loss": 7.2139, + "step": 1650 + }, + { + "epoch": 0.009818964696926445, + "grad_norm": 3.038086414337158, + "learning_rate": 4.998812098307343e-05, + "loss": 7.2752, + "step": 1651 + }, + { + "epoch": 0.009824911980207442, + "grad_norm": 2.965916872024536, + "learning_rate": 4.998810658104389e-05, + "loss": 7.1151, + "step": 1652 + }, + { + "epoch": 0.009830859263488438, + "grad_norm": 3.011476755142212, + "learning_rate": 4.998809217029126e-05, + "loss": 7.1335, + "step": 1653 + }, + { + "epoch": 0.009836806546769435, + "grad_norm": 3.8196349143981934, + "learning_rate": 4.9988077750815534e-05, + "loss": 7.0865, + "step": 1654 + }, + { + "epoch": 0.009842753830050432, + "grad_norm": 3.2577872276306152, + "learning_rate": 4.998806332261674e-05, + "loss": 7.4285, + "step": 1655 + }, + { + "epoch": 0.00984870111333143, + "grad_norm": 2.847039222717285, + "learning_rate": 4.998804888569487e-05, + "loss": 7.3251, + "step": 1656 + }, + { + "epoch": 0.009854648396612428, + "grad_norm": 3.4066355228424072, + "learning_rate": 4.998803444004992e-05, + "loss": 7.3137, + "step": 1657 + }, + { + "epoch": 0.009860595679893424, + "grad_norm": 3.6774044036865234, + "learning_rate": 4.998801998568192e-05, + "loss": 7.0772, + "step": 1658 + }, + { + "epoch": 0.009866542963174421, + "grad_norm": 3.1404600143432617, + "learning_rate": 4.998800552259085e-05, + "loss": 7.1143, + "step": 1659 + }, + { + "epoch": 0.00987249024645542, + "grad_norm": 3.6337625980377197, + "learning_rate": 4.998799105077674e-05, + "loss": 7.1296, + "step": 1660 + }, + { + "epoch": 0.009878437529736417, + "grad_norm": 4.551114082336426, + "learning_rate": 4.9987976570239566e-05, + "loss": 7.1343, + "step": 1661 + }, + { + "epoch": 0.009884384813017413, + "grad_norm": 3.2305374145507812, + "learning_rate": 4.998796208097935e-05, + "loss": 7.0852, + "step": 1662 + }, + { + "epoch": 0.00989033209629841, + "grad_norm": 2.5174615383148193, + "learning_rate": 4.99879475829961e-05, + "loss": 7.2315, + "step": 1663 + }, + { + "epoch": 0.009896279379579409, + "grad_norm": 3.623525381088257, + "learning_rate": 4.9987933076289804e-05, + "loss": 7.4222, + "step": 1664 + }, + { + "epoch": 0.009902226662860406, + "grad_norm": 4.217465877532959, + "learning_rate": 4.998791856086049e-05, + "loss": 7.4003, + "step": 1665 + }, + { + "epoch": 0.009908173946141402, + "grad_norm": 2.42301344871521, + "learning_rate": 4.998790403670815e-05, + "loss": 7.3295, + "step": 1666 + }, + { + "epoch": 0.0099141212294224, + "grad_norm": 2.3003029823303223, + "learning_rate": 4.998788950383279e-05, + "loss": 7.2072, + "step": 1667 + }, + { + "epoch": 0.009920068512703398, + "grad_norm": 3.3792307376861572, + "learning_rate": 4.9987874962234414e-05, + "loss": 7.2882, + "step": 1668 + }, + { + "epoch": 0.009926015795984395, + "grad_norm": 3.42130184173584, + "learning_rate": 4.998786041191303e-05, + "loss": 7.1231, + "step": 1669 + }, + { + "epoch": 0.009931963079265391, + "grad_norm": 3.496676445007324, + "learning_rate": 4.9987845852868644e-05, + "loss": 7.2535, + "step": 1670 + }, + { + "epoch": 0.009937910362546388, + "grad_norm": 2.695780038833618, + "learning_rate": 4.9987831285101255e-05, + "loss": 7.3784, + "step": 1671 + }, + { + "epoch": 0.009943857645827387, + "grad_norm": 2.2745561599731445, + "learning_rate": 4.998781670861088e-05, + "loss": 7.1184, + "step": 1672 + }, + { + "epoch": 0.009949804929108384, + "grad_norm": 3.8487844467163086, + "learning_rate": 4.99878021233975e-05, + "loss": 7.277, + "step": 1673 + }, + { + "epoch": 0.00995575221238938, + "grad_norm": 2.6628305912017822, + "learning_rate": 4.998778752946115e-05, + "loss": 6.8204, + "step": 1674 + }, + { + "epoch": 0.009961699495670377, + "grad_norm": 3.6330301761627197, + "learning_rate": 4.998777292680182e-05, + "loss": 7.3003, + "step": 1675 + }, + { + "epoch": 0.009967646778951376, + "grad_norm": 2.644237995147705, + "learning_rate": 4.998775831541952e-05, + "loss": 7.1492, + "step": 1676 + }, + { + "epoch": 0.009973594062232373, + "grad_norm": 2.895193099975586, + "learning_rate": 4.998774369531424e-05, + "loss": 7.3986, + "step": 1677 + }, + { + "epoch": 0.00997954134551337, + "grad_norm": 3.2180328369140625, + "learning_rate": 4.998772906648601e-05, + "loss": 7.1085, + "step": 1678 + }, + { + "epoch": 0.009985488628794366, + "grad_norm": 3.5874838829040527, + "learning_rate": 4.9987714428934815e-05, + "loss": 6.9554, + "step": 1679 + }, + { + "epoch": 0.009991435912075365, + "grad_norm": 2.419516086578369, + "learning_rate": 4.9987699782660666e-05, + "loss": 6.6222, + "step": 1680 + }, + { + "epoch": 0.009997383195356362, + "grad_norm": 2.715808153152466, + "learning_rate": 4.9987685127663574e-05, + "loss": 6.8417, + "step": 1681 + }, + { + "epoch": 0.010003330478637358, + "grad_norm": 2.2847111225128174, + "learning_rate": 4.9987670463943534e-05, + "loss": 7.1649, + "step": 1682 + }, + { + "epoch": 0.010009277761918355, + "grad_norm": 2.402684450149536, + "learning_rate": 4.998765579150056e-05, + "loss": 7.6113, + "step": 1683 + }, + { + "epoch": 0.010015225045199352, + "grad_norm": 2.54388689994812, + "learning_rate": 4.998764111033465e-05, + "loss": 7.1261, + "step": 1684 + }, + { + "epoch": 0.01002117232848035, + "grad_norm": 2.8077542781829834, + "learning_rate": 4.9987626420445823e-05, + "loss": 7.1349, + "step": 1685 + }, + { + "epoch": 0.010027119611761347, + "grad_norm": 2.228707790374756, + "learning_rate": 4.9987611721834063e-05, + "loss": 7.1123, + "step": 1686 + }, + { + "epoch": 0.010033066895042344, + "grad_norm": 2.648607015609741, + "learning_rate": 4.998759701449939e-05, + "loss": 7.0263, + "step": 1687 + }, + { + "epoch": 0.010039014178323341, + "grad_norm": 3.0278162956237793, + "learning_rate": 4.99875822984418e-05, + "loss": 6.6463, + "step": 1688 + }, + { + "epoch": 0.01004496146160434, + "grad_norm": 3.1550052165985107, + "learning_rate": 4.998756757366131e-05, + "loss": 6.8773, + "step": 1689 + }, + { + "epoch": 0.010050908744885336, + "grad_norm": 3.3911843299865723, + "learning_rate": 4.998755284015792e-05, + "loss": 7.5045, + "step": 1690 + }, + { + "epoch": 0.010056856028166333, + "grad_norm": 2.668861150741577, + "learning_rate": 4.998753809793162e-05, + "loss": 7.5545, + "step": 1691 + }, + { + "epoch": 0.01006280331144733, + "grad_norm": 2.182792901992798, + "learning_rate": 4.998752334698244e-05, + "loss": 7.2315, + "step": 1692 + }, + { + "epoch": 0.010068750594728329, + "grad_norm": 2.981476068496704, + "learning_rate": 4.998750858731037e-05, + "loss": 7.3455, + "step": 1693 + }, + { + "epoch": 0.010074697878009325, + "grad_norm": 3.1855525970458984, + "learning_rate": 4.998749381891542e-05, + "loss": 7.3408, + "step": 1694 + }, + { + "epoch": 0.010080645161290322, + "grad_norm": 2.5677361488342285, + "learning_rate": 4.998747904179759e-05, + "loss": 6.7591, + "step": 1695 + }, + { + "epoch": 0.010086592444571319, + "grad_norm": 2.7397539615631104, + "learning_rate": 4.9987464255956894e-05, + "loss": 7.3976, + "step": 1696 + }, + { + "epoch": 0.010092539727852318, + "grad_norm": 2.1141586303710938, + "learning_rate": 4.998744946139333e-05, + "loss": 7.4287, + "step": 1697 + }, + { + "epoch": 0.010098487011133314, + "grad_norm": 2.1999096870422363, + "learning_rate": 4.998743465810691e-05, + "loss": 7.4804, + "step": 1698 + }, + { + "epoch": 0.010104434294414311, + "grad_norm": 2.4150960445404053, + "learning_rate": 4.9987419846097634e-05, + "loss": 7.1743, + "step": 1699 + }, + { + "epoch": 0.010110381577695308, + "grad_norm": 2.564270496368408, + "learning_rate": 4.998740502536551e-05, + "loss": 7.262, + "step": 1700 + }, + { + "epoch": 0.010116328860976307, + "grad_norm": 3.045964241027832, + "learning_rate": 4.9987390195910536e-05, + "loss": 7.0778, + "step": 1701 + }, + { + "epoch": 0.010122276144257304, + "grad_norm": 3.2720210552215576, + "learning_rate": 4.998737535773272e-05, + "loss": 7.2188, + "step": 1702 + }, + { + "epoch": 0.0101282234275383, + "grad_norm": 2.54496693611145, + "learning_rate": 4.998736051083207e-05, + "loss": 6.9985, + "step": 1703 + }, + { + "epoch": 0.010134170710819297, + "grad_norm": 3.6252541542053223, + "learning_rate": 4.998734565520859e-05, + "loss": 7.3502, + "step": 1704 + }, + { + "epoch": 0.010140117994100296, + "grad_norm": 3.468963146209717, + "learning_rate": 4.99873307908623e-05, + "loss": 6.9642, + "step": 1705 + }, + { + "epoch": 0.010146065277381293, + "grad_norm": 2.8778045177459717, + "learning_rate": 4.9987315917793174e-05, + "loss": 6.8675, + "step": 1706 + }, + { + "epoch": 0.01015201256066229, + "grad_norm": 2.4492053985595703, + "learning_rate": 4.9987301036001236e-05, + "loss": 7.3484, + "step": 1707 + }, + { + "epoch": 0.010157959843943286, + "grad_norm": 2.5170838832855225, + "learning_rate": 4.99872861454865e-05, + "loss": 7.6004, + "step": 1708 + }, + { + "epoch": 0.010163907127224285, + "grad_norm": 2.3539648056030273, + "learning_rate": 4.998727124624895e-05, + "loss": 7.3304, + "step": 1709 + }, + { + "epoch": 0.010169854410505282, + "grad_norm": 2.6097705364227295, + "learning_rate": 4.998725633828861e-05, + "loss": 7.3227, + "step": 1710 + }, + { + "epoch": 0.010175801693786278, + "grad_norm": 2.5909392833709717, + "learning_rate": 4.9987241421605466e-05, + "loss": 7.3797, + "step": 1711 + }, + { + "epoch": 0.010181748977067275, + "grad_norm": 3.143157958984375, + "learning_rate": 4.998722649619954e-05, + "loss": 7.1236, + "step": 1712 + }, + { + "epoch": 0.010187696260348274, + "grad_norm": 2.0621843338012695, + "learning_rate": 4.9987211562070835e-05, + "loss": 7.5322, + "step": 1713 + }, + { + "epoch": 0.01019364354362927, + "grad_norm": 1.7781084775924683, + "learning_rate": 4.9987196619219354e-05, + "loss": 7.428, + "step": 1714 + }, + { + "epoch": 0.010199590826910267, + "grad_norm": 2.3108980655670166, + "learning_rate": 4.9987181667645094e-05, + "loss": 7.3814, + "step": 1715 + }, + { + "epoch": 0.010205538110191264, + "grad_norm": 2.5184621810913086, + "learning_rate": 4.998716670734807e-05, + "loss": 7.374, + "step": 1716 + }, + { + "epoch": 0.010211485393472261, + "grad_norm": 1.9185826778411865, + "learning_rate": 4.9987151738328284e-05, + "loss": 7.3352, + "step": 1717 + }, + { + "epoch": 0.01021743267675326, + "grad_norm": 2.794224262237549, + "learning_rate": 4.998713676058574e-05, + "loss": 7.0293, + "step": 1718 + }, + { + "epoch": 0.010223379960034256, + "grad_norm": 3.601804733276367, + "learning_rate": 4.998712177412045e-05, + "loss": 7.0277, + "step": 1719 + }, + { + "epoch": 0.010229327243315253, + "grad_norm": 3.3258707523345947, + "learning_rate": 4.998710677893241e-05, + "loss": 6.9478, + "step": 1720 + }, + { + "epoch": 0.01023527452659625, + "grad_norm": 3.147439956665039, + "learning_rate": 4.9987091775021625e-05, + "loss": 6.7295, + "step": 1721 + }, + { + "epoch": 0.010241221809877249, + "grad_norm": 2.7821006774902344, + "learning_rate": 4.998707676238811e-05, + "loss": 6.7587, + "step": 1722 + }, + { + "epoch": 0.010247169093158245, + "grad_norm": 2.580597400665283, + "learning_rate": 4.998706174103186e-05, + "loss": 6.9091, + "step": 1723 + }, + { + "epoch": 0.010253116376439242, + "grad_norm": 2.5501208305358887, + "learning_rate": 4.998704671095289e-05, + "loss": 7.3262, + "step": 1724 + }, + { + "epoch": 0.010259063659720239, + "grad_norm": 2.5460124015808105, + "learning_rate": 4.99870316721512e-05, + "loss": 7.278, + "step": 1725 + }, + { + "epoch": 0.010265010943001238, + "grad_norm": 2.0253796577453613, + "learning_rate": 4.998701662462679e-05, + "loss": 7.1757, + "step": 1726 + }, + { + "epoch": 0.010270958226282234, + "grad_norm": 2.3127388954162598, + "learning_rate": 4.998700156837968e-05, + "loss": 7.1057, + "step": 1727 + }, + { + "epoch": 0.010276905509563231, + "grad_norm": 2.931878089904785, + "learning_rate": 4.998698650340986e-05, + "loss": 6.9993, + "step": 1728 + }, + { + "epoch": 0.010282852792844228, + "grad_norm": 3.239272356033325, + "learning_rate": 4.998697142971734e-05, + "loss": 6.7754, + "step": 1729 + }, + { + "epoch": 0.010288800076125227, + "grad_norm": 2.388212203979492, + "learning_rate": 4.998695634730213e-05, + "loss": 7.2794, + "step": 1730 + }, + { + "epoch": 0.010294747359406223, + "grad_norm": 2.7766799926757812, + "learning_rate": 4.998694125616423e-05, + "loss": 7.4636, + "step": 1731 + }, + { + "epoch": 0.01030069464268722, + "grad_norm": 2.543757915496826, + "learning_rate": 4.9986926156303646e-05, + "loss": 6.8801, + "step": 1732 + }, + { + "epoch": 0.010306641925968217, + "grad_norm": 1.8907097578048706, + "learning_rate": 4.9986911047720384e-05, + "loss": 7.0353, + "step": 1733 + }, + { + "epoch": 0.010312589209249216, + "grad_norm": 1.9585598707199097, + "learning_rate": 4.9986895930414444e-05, + "loss": 7.0469, + "step": 1734 + }, + { + "epoch": 0.010318536492530212, + "grad_norm": 2.5191497802734375, + "learning_rate": 4.998688080438585e-05, + "loss": 7.1469, + "step": 1735 + }, + { + "epoch": 0.01032448377581121, + "grad_norm": 3.5709545612335205, + "learning_rate": 4.998686566963459e-05, + "loss": 7.0499, + "step": 1736 + }, + { + "epoch": 0.010330431059092206, + "grad_norm": 2.3778624534606934, + "learning_rate": 4.998685052616067e-05, + "loss": 7.5897, + "step": 1737 + }, + { + "epoch": 0.010336378342373205, + "grad_norm": 2.0795674324035645, + "learning_rate": 4.9986835373964094e-05, + "loss": 6.8778, + "step": 1738 + }, + { + "epoch": 0.010342325625654201, + "grad_norm": 2.7674901485443115, + "learning_rate": 4.9986820213044875e-05, + "loss": 6.4428, + "step": 1739 + }, + { + "epoch": 0.010348272908935198, + "grad_norm": 2.7203595638275146, + "learning_rate": 4.998680504340302e-05, + "loss": 7.4668, + "step": 1740 + }, + { + "epoch": 0.010354220192216195, + "grad_norm": 2.840240955352783, + "learning_rate": 4.998678986503853e-05, + "loss": 7.2219, + "step": 1741 + }, + { + "epoch": 0.010360167475497194, + "grad_norm": 2.7803452014923096, + "learning_rate": 4.9986774677951404e-05, + "loss": 6.5674, + "step": 1742 + }, + { + "epoch": 0.01036611475877819, + "grad_norm": 2.467574119567871, + "learning_rate": 4.998675948214165e-05, + "loss": 6.9621, + "step": 1743 + }, + { + "epoch": 0.010372062042059187, + "grad_norm": 2.1437904834747314, + "learning_rate": 4.998674427760929e-05, + "loss": 7.1564, + "step": 1744 + }, + { + "epoch": 0.010378009325340184, + "grad_norm": 2.504685163497925, + "learning_rate": 4.9986729064354304e-05, + "loss": 6.8836, + "step": 1745 + }, + { + "epoch": 0.010383956608621183, + "grad_norm": 2.401296615600586, + "learning_rate": 4.998671384237671e-05, + "loss": 7.2906, + "step": 1746 + }, + { + "epoch": 0.01038990389190218, + "grad_norm": 2.233701705932617, + "learning_rate": 4.9986698611676516e-05, + "loss": 6.6854, + "step": 1747 + }, + { + "epoch": 0.010395851175183176, + "grad_norm": 2.9597983360290527, + "learning_rate": 4.998668337225373e-05, + "loss": 6.8859, + "step": 1748 + }, + { + "epoch": 0.010401798458464173, + "grad_norm": 3.2164804935455322, + "learning_rate": 4.998666812410834e-05, + "loss": 6.8255, + "step": 1749 + }, + { + "epoch": 0.01040774574174517, + "grad_norm": 3.010002374649048, + "learning_rate": 4.9986652867240364e-05, + "loss": 6.7092, + "step": 1750 + }, + { + "epoch": 0.010413693025026169, + "grad_norm": 2.8442068099975586, + "learning_rate": 4.998663760164981e-05, + "loss": 6.7231, + "step": 1751 + }, + { + "epoch": 0.010419640308307165, + "grad_norm": 3.127922773361206, + "learning_rate": 4.9986622327336676e-05, + "loss": 6.6072, + "step": 1752 + }, + { + "epoch": 0.010425587591588162, + "grad_norm": 2.7306833267211914, + "learning_rate": 4.998660704430097e-05, + "loss": 6.696, + "step": 1753 + }, + { + "epoch": 0.010431534874869159, + "grad_norm": 2.9005799293518066, + "learning_rate": 4.99865917525427e-05, + "loss": 6.6598, + "step": 1754 + }, + { + "epoch": 0.010437482158150158, + "grad_norm": 3.17934513092041, + "learning_rate": 4.9986576452061865e-05, + "loss": 6.5887, + "step": 1755 + }, + { + "epoch": 0.010443429441431154, + "grad_norm": 2.9390244483947754, + "learning_rate": 4.9986561142858476e-05, + "loss": 6.5375, + "step": 1756 + }, + { + "epoch": 0.010449376724712151, + "grad_norm": 2.5547196865081787, + "learning_rate": 4.998654582493254e-05, + "loss": 6.7484, + "step": 1757 + }, + { + "epoch": 0.010455324007993148, + "grad_norm": 2.9969568252563477, + "learning_rate": 4.9986530498284054e-05, + "loss": 6.6496, + "step": 1758 + }, + { + "epoch": 0.010461271291274147, + "grad_norm": 2.843932867050171, + "learning_rate": 4.998651516291303e-05, + "loss": 6.5713, + "step": 1759 + }, + { + "epoch": 0.010467218574555143, + "grad_norm": 2.9114811420440674, + "learning_rate": 4.9986499818819476e-05, + "loss": 7.5248, + "step": 1760 + }, + { + "epoch": 0.01047316585783614, + "grad_norm": 3.0292229652404785, + "learning_rate": 4.998648446600339e-05, + "loss": 7.2346, + "step": 1761 + }, + { + "epoch": 0.010479113141117137, + "grad_norm": 2.553088426589966, + "learning_rate": 4.998646910446478e-05, + "loss": 7.1531, + "step": 1762 + }, + { + "epoch": 0.010485060424398136, + "grad_norm": 2.9838356971740723, + "learning_rate": 4.998645373420365e-05, + "loss": 6.6561, + "step": 1763 + }, + { + "epoch": 0.010491007707679132, + "grad_norm": 2.8948864936828613, + "learning_rate": 4.9986438355220014e-05, + "loss": 6.463, + "step": 1764 + }, + { + "epoch": 0.01049695499096013, + "grad_norm": 2.805084228515625, + "learning_rate": 4.9986422967513856e-05, + "loss": 6.701, + "step": 1765 + }, + { + "epoch": 0.010502902274241126, + "grad_norm": 2.748077869415283, + "learning_rate": 4.998640757108522e-05, + "loss": 7.3223, + "step": 1766 + }, + { + "epoch": 0.010508849557522125, + "grad_norm": 3.0048258304595947, + "learning_rate": 4.998639216593406e-05, + "loss": 7.2582, + "step": 1767 + }, + { + "epoch": 0.010514796840803121, + "grad_norm": 2.538522958755493, + "learning_rate": 4.998637675206043e-05, + "loss": 7.1208, + "step": 1768 + }, + { + "epoch": 0.010520744124084118, + "grad_norm": 2.2091188430786133, + "learning_rate": 4.99863613294643e-05, + "loss": 7.0577, + "step": 1769 + }, + { + "epoch": 0.010526691407365115, + "grad_norm": 2.8454909324645996, + "learning_rate": 4.998634589814569e-05, + "loss": 7.1296, + "step": 1770 + }, + { + "epoch": 0.010532638690646114, + "grad_norm": 3.4139351844787598, + "learning_rate": 4.998633045810461e-05, + "loss": 6.9565, + "step": 1771 + }, + { + "epoch": 0.01053858597392711, + "grad_norm": 2.3192107677459717, + "learning_rate": 4.9986315009341066e-05, + "loss": 6.6027, + "step": 1772 + }, + { + "epoch": 0.010544533257208107, + "grad_norm": 2.309290647506714, + "learning_rate": 4.998629955185505e-05, + "loss": 7.0417, + "step": 1773 + }, + { + "epoch": 0.010550480540489104, + "grad_norm": 3.2046520709991455, + "learning_rate": 4.998628408564657e-05, + "loss": 7.0368, + "step": 1774 + }, + { + "epoch": 0.010556427823770103, + "grad_norm": 2.459064483642578, + "learning_rate": 4.9986268610715646e-05, + "loss": 7.2726, + "step": 1775 + }, + { + "epoch": 0.0105623751070511, + "grad_norm": 2.602522134780884, + "learning_rate": 4.998625312706227e-05, + "loss": 7.3377, + "step": 1776 + }, + { + "epoch": 0.010568322390332096, + "grad_norm": 3.9599175453186035, + "learning_rate": 4.998623763468645e-05, + "loss": 6.9146, + "step": 1777 + }, + { + "epoch": 0.010574269673613093, + "grad_norm": 3.312527894973755, + "learning_rate": 4.99862221335882e-05, + "loss": 6.7457, + "step": 1778 + }, + { + "epoch": 0.01058021695689409, + "grad_norm": 2.5287606716156006, + "learning_rate": 4.9986206623767506e-05, + "loss": 7.2651, + "step": 1779 + }, + { + "epoch": 0.010586164240175088, + "grad_norm": 2.4065616130828857, + "learning_rate": 4.99861911052244e-05, + "loss": 7.1135, + "step": 1780 + }, + { + "epoch": 0.010592111523456085, + "grad_norm": 2.321385383605957, + "learning_rate": 4.998617557795886e-05, + "loss": 7.1985, + "step": 1781 + }, + { + "epoch": 0.010598058806737082, + "grad_norm": 2.118995189666748, + "learning_rate": 4.9986160041970906e-05, + "loss": 7.2832, + "step": 1782 + }, + { + "epoch": 0.010604006090018079, + "grad_norm": 2.2536606788635254, + "learning_rate": 4.9986144497260544e-05, + "loss": 7.191, + "step": 1783 + }, + { + "epoch": 0.010609953373299078, + "grad_norm": 2.2956738471984863, + "learning_rate": 4.998612894382778e-05, + "loss": 7.0496, + "step": 1784 + }, + { + "epoch": 0.010615900656580074, + "grad_norm": 2.4258289337158203, + "learning_rate": 4.9986113381672614e-05, + "loss": 7.2767, + "step": 1785 + }, + { + "epoch": 0.010621847939861071, + "grad_norm": 2.4731507301330566, + "learning_rate": 4.998609781079505e-05, + "loss": 6.8805, + "step": 1786 + }, + { + "epoch": 0.010627795223142068, + "grad_norm": 2.3245391845703125, + "learning_rate": 4.9986082231195105e-05, + "loss": 6.8921, + "step": 1787 + }, + { + "epoch": 0.010633742506423067, + "grad_norm": 2.6239898204803467, + "learning_rate": 4.998606664287278e-05, + "loss": 6.9353, + "step": 1788 + }, + { + "epoch": 0.010639689789704063, + "grad_norm": 2.186162233352661, + "learning_rate": 4.9986051045828065e-05, + "loss": 6.8466, + "step": 1789 + }, + { + "epoch": 0.01064563707298506, + "grad_norm": 2.2362232208251953, + "learning_rate": 4.998603544006098e-05, + "loss": 6.82, + "step": 1790 + }, + { + "epoch": 0.010651584356266057, + "grad_norm": 2.2302427291870117, + "learning_rate": 4.998601982557153e-05, + "loss": 6.7034, + "step": 1791 + }, + { + "epoch": 0.010657531639547056, + "grad_norm": 2.0393195152282715, + "learning_rate": 4.998600420235972e-05, + "loss": 6.6646, + "step": 1792 + }, + { + "epoch": 0.010663478922828052, + "grad_norm": 1.976536512374878, + "learning_rate": 4.9985988570425556e-05, + "loss": 6.4994, + "step": 1793 + }, + { + "epoch": 0.01066942620610905, + "grad_norm": 2.4167046546936035, + "learning_rate": 4.998597292976904e-05, + "loss": 6.7849, + "step": 1794 + }, + { + "epoch": 0.010675373489390046, + "grad_norm": 2.3077776432037354, + "learning_rate": 4.998595728039018e-05, + "loss": 6.8356, + "step": 1795 + }, + { + "epoch": 0.010681320772671045, + "grad_norm": 2.5263309478759766, + "learning_rate": 4.998594162228898e-05, + "loss": 6.6351, + "step": 1796 + }, + { + "epoch": 0.010687268055952041, + "grad_norm": 2.153365135192871, + "learning_rate": 4.9985925955465443e-05, + "loss": 6.7911, + "step": 1797 + }, + { + "epoch": 0.010693215339233038, + "grad_norm": 3.3034393787384033, + "learning_rate": 4.998591027991958e-05, + "loss": 6.7589, + "step": 1798 + }, + { + "epoch": 0.010699162622514035, + "grad_norm": 2.2177388668060303, + "learning_rate": 4.998589459565139e-05, + "loss": 6.571, + "step": 1799 + }, + { + "epoch": 0.010705109905795034, + "grad_norm": 2.3165230751037598, + "learning_rate": 4.9985878902660886e-05, + "loss": 6.9124, + "step": 1800 + }, + { + "epoch": 0.01071105718907603, + "grad_norm": 2.270045757293701, + "learning_rate": 4.998586320094807e-05, + "loss": 6.4442, + "step": 1801 + }, + { + "epoch": 0.010717004472357027, + "grad_norm": 2.1198744773864746, + "learning_rate": 4.9985847490512945e-05, + "loss": 6.555, + "step": 1802 + }, + { + "epoch": 0.010722951755638024, + "grad_norm": 2.5428359508514404, + "learning_rate": 4.998583177135552e-05, + "loss": 6.8991, + "step": 1803 + }, + { + "epoch": 0.010728899038919023, + "grad_norm": 1.983817219734192, + "learning_rate": 4.99858160434758e-05, + "loss": 6.6428, + "step": 1804 + }, + { + "epoch": 0.01073484632220002, + "grad_norm": 2.2749712467193604, + "learning_rate": 4.998580030687379e-05, + "loss": 6.7294, + "step": 1805 + }, + { + "epoch": 0.010740793605481016, + "grad_norm": 1.914762258529663, + "learning_rate": 4.998578456154949e-05, + "loss": 7.0395, + "step": 1806 + }, + { + "epoch": 0.010746740888762013, + "grad_norm": 1.6850765943527222, + "learning_rate": 4.998576880750292e-05, + "loss": 6.862, + "step": 1807 + }, + { + "epoch": 0.010752688172043012, + "grad_norm": 2.2930233478546143, + "learning_rate": 4.9985753044734076e-05, + "loss": 6.8213, + "step": 1808 + }, + { + "epoch": 0.010758635455324008, + "grad_norm": 2.193464756011963, + "learning_rate": 4.998573727324295e-05, + "loss": 6.9303, + "step": 1809 + }, + { + "epoch": 0.010764582738605005, + "grad_norm": 2.2451658248901367, + "learning_rate": 4.9985721493029576e-05, + "loss": 6.8061, + "step": 1810 + }, + { + "epoch": 0.010770530021886002, + "grad_norm": 2.164214849472046, + "learning_rate": 4.998570570409394e-05, + "loss": 6.6485, + "step": 1811 + }, + { + "epoch": 0.010776477305166999, + "grad_norm": 2.3530375957489014, + "learning_rate": 4.9985689906436054e-05, + "loss": 6.6826, + "step": 1812 + }, + { + "epoch": 0.010782424588447997, + "grad_norm": 3.007641553878784, + "learning_rate": 4.998567410005591e-05, + "loss": 6.0781, + "step": 1813 + }, + { + "epoch": 0.010788371871728994, + "grad_norm": 2.500411033630371, + "learning_rate": 4.998565828495354e-05, + "loss": 7.0544, + "step": 1814 + }, + { + "epoch": 0.010794319155009991, + "grad_norm": 2.329221725463867, + "learning_rate": 4.998564246112893e-05, + "loss": 7.2505, + "step": 1815 + }, + { + "epoch": 0.010800266438290988, + "grad_norm": 2.05120849609375, + "learning_rate": 4.998562662858209e-05, + "loss": 7.3094, + "step": 1816 + }, + { + "epoch": 0.010806213721571986, + "grad_norm": 1.83049738407135, + "learning_rate": 4.9985610787313023e-05, + "loss": 6.7752, + "step": 1817 + }, + { + "epoch": 0.010812161004852983, + "grad_norm": 2.2754576206207275, + "learning_rate": 4.998559493732174e-05, + "loss": 6.9396, + "step": 1818 + }, + { + "epoch": 0.01081810828813398, + "grad_norm": 2.104849338531494, + "learning_rate": 4.998557907860825e-05, + "loss": 7.2624, + "step": 1819 + }, + { + "epoch": 0.010824055571414977, + "grad_norm": 3.152069568634033, + "learning_rate": 4.998556321117254e-05, + "loss": 6.6763, + "step": 1820 + }, + { + "epoch": 0.010830002854695975, + "grad_norm": 3.4046475887298584, + "learning_rate": 4.9985547335014636e-05, + "loss": 6.7145, + "step": 1821 + }, + { + "epoch": 0.010835950137976972, + "grad_norm": 1.9208084344863892, + "learning_rate": 4.9985531450134534e-05, + "loss": 6.8985, + "step": 1822 + }, + { + "epoch": 0.010841897421257969, + "grad_norm": 2.4949824810028076, + "learning_rate": 4.998551555653224e-05, + "loss": 6.8196, + "step": 1823 + }, + { + "epoch": 0.010847844704538966, + "grad_norm": 2.613175392150879, + "learning_rate": 4.998549965420776e-05, + "loss": 6.7918, + "step": 1824 + }, + { + "epoch": 0.010853791987819965, + "grad_norm": 2.3322529792785645, + "learning_rate": 4.9985483743161105e-05, + "loss": 6.6133, + "step": 1825 + }, + { + "epoch": 0.010859739271100961, + "grad_norm": 3.116680860519409, + "learning_rate": 4.998546782339227e-05, + "loss": 7.4026, + "step": 1826 + }, + { + "epoch": 0.010865686554381958, + "grad_norm": 2.673938274383545, + "learning_rate": 4.998545189490127e-05, + "loss": 6.9181, + "step": 1827 + }, + { + "epoch": 0.010871633837662955, + "grad_norm": 2.135727643966675, + "learning_rate": 4.998543595768811e-05, + "loss": 6.9514, + "step": 1828 + }, + { + "epoch": 0.010877581120943954, + "grad_norm": 2.241696357727051, + "learning_rate": 4.9985420011752784e-05, + "loss": 7.126, + "step": 1829 + }, + { + "epoch": 0.01088352840422495, + "grad_norm": 2.316342830657959, + "learning_rate": 4.9985404057095315e-05, + "loss": 6.9752, + "step": 1830 + }, + { + "epoch": 0.010889475687505947, + "grad_norm": 2.591611623764038, + "learning_rate": 4.998538809371569e-05, + "loss": 6.8721, + "step": 1831 + }, + { + "epoch": 0.010895422970786944, + "grad_norm": 2.2846317291259766, + "learning_rate": 4.9985372121613935e-05, + "loss": 6.9468, + "step": 1832 + }, + { + "epoch": 0.010901370254067943, + "grad_norm": 2.0799343585968018, + "learning_rate": 4.998535614079004e-05, + "loss": 7.0839, + "step": 1833 + }, + { + "epoch": 0.01090731753734894, + "grad_norm": 2.1908833980560303, + "learning_rate": 4.998534015124401e-05, + "loss": 6.7228, + "step": 1834 + }, + { + "epoch": 0.010913264820629936, + "grad_norm": 2.329401969909668, + "learning_rate": 4.998532415297587e-05, + "loss": 6.715, + "step": 1835 + }, + { + "epoch": 0.010919212103910933, + "grad_norm": 1.9492794275283813, + "learning_rate": 4.998530814598559e-05, + "loss": 6.6762, + "step": 1836 + }, + { + "epoch": 0.010925159387191932, + "grad_norm": 1.9564979076385498, + "learning_rate": 4.998529213027321e-05, + "loss": 6.8545, + "step": 1837 + }, + { + "epoch": 0.010931106670472928, + "grad_norm": 1.8424931764602661, + "learning_rate": 4.998527610583872e-05, + "loss": 6.8505, + "step": 1838 + }, + { + "epoch": 0.010937053953753925, + "grad_norm": 1.9743967056274414, + "learning_rate": 4.998526007268213e-05, + "loss": 6.8413, + "step": 1839 + }, + { + "epoch": 0.010943001237034922, + "grad_norm": 2.31296968460083, + "learning_rate": 4.998524403080345e-05, + "loss": 6.7327, + "step": 1840 + }, + { + "epoch": 0.010948948520315919, + "grad_norm": 2.049689292907715, + "learning_rate": 4.9985227980202665e-05, + "loss": 7.0029, + "step": 1841 + }, + { + "epoch": 0.010954895803596917, + "grad_norm": 2.1640658378601074, + "learning_rate": 4.99852119208798e-05, + "loss": 7.0749, + "step": 1842 + }, + { + "epoch": 0.010960843086877914, + "grad_norm": 1.8896230459213257, + "learning_rate": 4.998519585283486e-05, + "loss": 6.7249, + "step": 1843 + }, + { + "epoch": 0.010966790370158911, + "grad_norm": 2.4835314750671387, + "learning_rate": 4.998517977606785e-05, + "loss": 6.5605, + "step": 1844 + }, + { + "epoch": 0.010972737653439908, + "grad_norm": 2.2472622394561768, + "learning_rate": 4.998516369057876e-05, + "loss": 6.8291, + "step": 1845 + }, + { + "epoch": 0.010978684936720906, + "grad_norm": 2.499096155166626, + "learning_rate": 4.998514759636762e-05, + "loss": 6.6921, + "step": 1846 + }, + { + "epoch": 0.010984632220001903, + "grad_norm": 2.296786308288574, + "learning_rate": 4.998513149343442e-05, + "loss": 7.0475, + "step": 1847 + }, + { + "epoch": 0.0109905795032829, + "grad_norm": 2.2896368503570557, + "learning_rate": 4.998511538177916e-05, + "loss": 6.775, + "step": 1848 + }, + { + "epoch": 0.010996526786563897, + "grad_norm": 2.025575637817383, + "learning_rate": 4.998509926140186e-05, + "loss": 6.9538, + "step": 1849 + }, + { + "epoch": 0.011002474069844895, + "grad_norm": 2.23502779006958, + "learning_rate": 4.9985083132302525e-05, + "loss": 7.0595, + "step": 1850 + }, + { + "epoch": 0.011008421353125892, + "grad_norm": 2.7158777713775635, + "learning_rate": 4.998506699448115e-05, + "loss": 7.0086, + "step": 1851 + }, + { + "epoch": 0.011014368636406889, + "grad_norm": 2.2707183361053467, + "learning_rate": 4.998505084793775e-05, + "loss": 6.6396, + "step": 1852 + }, + { + "epoch": 0.011020315919687886, + "grad_norm": 3.196085214614868, + "learning_rate": 4.998503469267232e-05, + "loss": 6.6026, + "step": 1853 + }, + { + "epoch": 0.011026263202968884, + "grad_norm": 2.4472603797912598, + "learning_rate": 4.9985018528684876e-05, + "loss": 7.1332, + "step": 1854 + }, + { + "epoch": 0.011032210486249881, + "grad_norm": 2.7070915699005127, + "learning_rate": 4.998500235597542e-05, + "loss": 6.9669, + "step": 1855 + }, + { + "epoch": 0.011038157769530878, + "grad_norm": 2.127729654312134, + "learning_rate": 4.998498617454396e-05, + "loss": 6.9589, + "step": 1856 + }, + { + "epoch": 0.011044105052811875, + "grad_norm": 2.2897160053253174, + "learning_rate": 4.99849699843905e-05, + "loss": 7.0402, + "step": 1857 + }, + { + "epoch": 0.011050052336092873, + "grad_norm": 1.888961672782898, + "learning_rate": 4.998495378551504e-05, + "loss": 6.9406, + "step": 1858 + }, + { + "epoch": 0.01105599961937387, + "grad_norm": 1.9889254570007324, + "learning_rate": 4.9984937577917594e-05, + "loss": 6.8392, + "step": 1859 + }, + { + "epoch": 0.011061946902654867, + "grad_norm": 3.042891025543213, + "learning_rate": 4.998492136159817e-05, + "loss": 6.7743, + "step": 1860 + }, + { + "epoch": 0.011067894185935864, + "grad_norm": 2.423988103866577, + "learning_rate": 4.998490513655676e-05, + "loss": 6.9802, + "step": 1861 + }, + { + "epoch": 0.011073841469216862, + "grad_norm": 2.6415674686431885, + "learning_rate": 4.998488890279338e-05, + "loss": 6.7104, + "step": 1862 + }, + { + "epoch": 0.01107978875249786, + "grad_norm": 2.686969518661499, + "learning_rate": 4.998487266030804e-05, + "loss": 7.0539, + "step": 1863 + }, + { + "epoch": 0.011085736035778856, + "grad_norm": 2.6695480346679688, + "learning_rate": 4.998485640910072e-05, + "loss": 6.9812, + "step": 1864 + }, + { + "epoch": 0.011091683319059853, + "grad_norm": 2.6251392364501953, + "learning_rate": 4.9984840149171466e-05, + "loss": 6.9954, + "step": 1865 + }, + { + "epoch": 0.011097630602340851, + "grad_norm": 2.487593650817871, + "learning_rate": 4.998482388052025e-05, + "loss": 7.0847, + "step": 1866 + }, + { + "epoch": 0.011103577885621848, + "grad_norm": 2.3249282836914062, + "learning_rate": 4.998480760314709e-05, + "loss": 6.9936, + "step": 1867 + }, + { + "epoch": 0.011109525168902845, + "grad_norm": 2.170452833175659, + "learning_rate": 4.9984791317052e-05, + "loss": 6.9155, + "step": 1868 + }, + { + "epoch": 0.011115472452183842, + "grad_norm": 3.331779718399048, + "learning_rate": 4.9984775022234975e-05, + "loss": 6.9128, + "step": 1869 + }, + { + "epoch": 0.01112141973546484, + "grad_norm": 2.7665064334869385, + "learning_rate": 4.9984758718696026e-05, + "loss": 6.9002, + "step": 1870 + }, + { + "epoch": 0.011127367018745837, + "grad_norm": 2.2872116565704346, + "learning_rate": 4.998474240643515e-05, + "loss": 6.9058, + "step": 1871 + }, + { + "epoch": 0.011133314302026834, + "grad_norm": 2.2125210762023926, + "learning_rate": 4.998472608545236e-05, + "loss": 6.932, + "step": 1872 + }, + { + "epoch": 0.011139261585307831, + "grad_norm": 2.1135666370391846, + "learning_rate": 4.998470975574766e-05, + "loss": 7.0018, + "step": 1873 + }, + { + "epoch": 0.011145208868588828, + "grad_norm": 2.0649492740631104, + "learning_rate": 4.998469341732105e-05, + "loss": 7.0132, + "step": 1874 + }, + { + "epoch": 0.011151156151869826, + "grad_norm": 4.0558576583862305, + "learning_rate": 4.9984677070172546e-05, + "loss": 6.8826, + "step": 1875 + }, + { + "epoch": 0.011157103435150823, + "grad_norm": 2.5675904750823975, + "learning_rate": 4.998466071430216e-05, + "loss": 7.0314, + "step": 1876 + }, + { + "epoch": 0.01116305071843182, + "grad_norm": 2.9773342609405518, + "learning_rate": 4.998464434970987e-05, + "loss": 6.8608, + "step": 1877 + }, + { + "epoch": 0.011168998001712817, + "grad_norm": 2.804995059967041, + "learning_rate": 4.9984627976395705e-05, + "loss": 6.6857, + "step": 1878 + }, + { + "epoch": 0.011174945284993815, + "grad_norm": 3.758509874343872, + "learning_rate": 4.9984611594359664e-05, + "loss": 6.9995, + "step": 1879 + }, + { + "epoch": 0.011180892568274812, + "grad_norm": 2.583061933517456, + "learning_rate": 4.998459520360176e-05, + "loss": 6.5844, + "step": 1880 + }, + { + "epoch": 0.011186839851555809, + "grad_norm": 2.357642889022827, + "learning_rate": 4.998457880412198e-05, + "loss": 6.6435, + "step": 1881 + }, + { + "epoch": 0.011192787134836806, + "grad_norm": 2.181558609008789, + "learning_rate": 4.9984562395920356e-05, + "loss": 7.045, + "step": 1882 + }, + { + "epoch": 0.011198734418117804, + "grad_norm": 2.4768264293670654, + "learning_rate": 4.998454597899688e-05, + "loss": 7.2053, + "step": 1883 + }, + { + "epoch": 0.011204681701398801, + "grad_norm": 2.4422380924224854, + "learning_rate": 4.998452955335154e-05, + "loss": 6.8038, + "step": 1884 + }, + { + "epoch": 0.011210628984679798, + "grad_norm": 3.3173701763153076, + "learning_rate": 4.998451311898437e-05, + "loss": 6.8619, + "step": 1885 + }, + { + "epoch": 0.011216576267960795, + "grad_norm": 2.4492833614349365, + "learning_rate": 4.9984496675895366e-05, + "loss": 6.6681, + "step": 1886 + }, + { + "epoch": 0.011222523551241793, + "grad_norm": 3.065016031265259, + "learning_rate": 4.998448022408453e-05, + "loss": 6.7439, + "step": 1887 + }, + { + "epoch": 0.01122847083452279, + "grad_norm": 3.327730655670166, + "learning_rate": 4.998446376355187e-05, + "loss": 6.735, + "step": 1888 + }, + { + "epoch": 0.011234418117803787, + "grad_norm": 3.428292751312256, + "learning_rate": 4.998444729429739e-05, + "loss": 6.5277, + "step": 1889 + }, + { + "epoch": 0.011240365401084784, + "grad_norm": 2.4982972145080566, + "learning_rate": 4.9984430816321095e-05, + "loss": 6.8228, + "step": 1890 + }, + { + "epoch": 0.011246312684365782, + "grad_norm": 2.568232297897339, + "learning_rate": 4.9984414329623e-05, + "loss": 7.0772, + "step": 1891 + }, + { + "epoch": 0.01125225996764678, + "grad_norm": 2.534109115600586, + "learning_rate": 4.99843978342031e-05, + "loss": 7.0259, + "step": 1892 + }, + { + "epoch": 0.011258207250927776, + "grad_norm": 2.6394994258880615, + "learning_rate": 4.998438133006141e-05, + "loss": 6.8692, + "step": 1893 + }, + { + "epoch": 0.011264154534208773, + "grad_norm": 2.4049339294433594, + "learning_rate": 4.998436481719792e-05, + "loss": 6.8653, + "step": 1894 + }, + { + "epoch": 0.011270101817489771, + "grad_norm": 2.661191701889038, + "learning_rate": 4.998434829561266e-05, + "loss": 6.628, + "step": 1895 + }, + { + "epoch": 0.011276049100770768, + "grad_norm": 2.395829916000366, + "learning_rate": 4.998433176530561e-05, + "loss": 6.9876, + "step": 1896 + }, + { + "epoch": 0.011281996384051765, + "grad_norm": 2.547858715057373, + "learning_rate": 4.99843152262768e-05, + "loss": 7.3832, + "step": 1897 + }, + { + "epoch": 0.011287943667332762, + "grad_norm": 2.364246368408203, + "learning_rate": 4.998429867852621e-05, + "loss": 7.3771, + "step": 1898 + }, + { + "epoch": 0.01129389095061376, + "grad_norm": 2.3385260105133057, + "learning_rate": 4.998428212205387e-05, + "loss": 6.971, + "step": 1899 + }, + { + "epoch": 0.011299838233894757, + "grad_norm": 2.253760576248169, + "learning_rate": 4.998426555685977e-05, + "loss": 7.0588, + "step": 1900 + }, + { + "epoch": 0.011305785517175754, + "grad_norm": 2.4103500843048096, + "learning_rate": 4.998424898294392e-05, + "loss": 6.8731, + "step": 1901 + }, + { + "epoch": 0.011311732800456751, + "grad_norm": 2.4819014072418213, + "learning_rate": 4.998423240030633e-05, + "loss": 6.9502, + "step": 1902 + }, + { + "epoch": 0.011317680083737748, + "grad_norm": 2.503901243209839, + "learning_rate": 4.998421580894701e-05, + "loss": 7.017, + "step": 1903 + }, + { + "epoch": 0.011323627367018746, + "grad_norm": 2.2224137783050537, + "learning_rate": 4.9984199208865943e-05, + "loss": 7.1938, + "step": 1904 + }, + { + "epoch": 0.011329574650299743, + "grad_norm": 2.1291286945343018, + "learning_rate": 4.998418260006316e-05, + "loss": 7.1152, + "step": 1905 + }, + { + "epoch": 0.01133552193358074, + "grad_norm": 2.4611241817474365, + "learning_rate": 4.9984165982538655e-05, + "loss": 7.0316, + "step": 1906 + }, + { + "epoch": 0.011341469216861737, + "grad_norm": 2.329432487487793, + "learning_rate": 4.998414935629243e-05, + "loss": 7.0032, + "step": 1907 + }, + { + "epoch": 0.011347416500142735, + "grad_norm": 2.0618371963500977, + "learning_rate": 4.9984132721324505e-05, + "loss": 7.2566, + "step": 1908 + }, + { + "epoch": 0.011353363783423732, + "grad_norm": 2.063511371612549, + "learning_rate": 4.998411607763487e-05, + "loss": 7.0144, + "step": 1909 + }, + { + "epoch": 0.011359311066704729, + "grad_norm": 2.188871145248413, + "learning_rate": 4.998409942522355e-05, + "loss": 6.9652, + "step": 1910 + }, + { + "epoch": 0.011365258349985726, + "grad_norm": 2.499746322631836, + "learning_rate": 4.998408276409053e-05, + "loss": 6.9173, + "step": 1911 + }, + { + "epoch": 0.011371205633266724, + "grad_norm": 2.2809276580810547, + "learning_rate": 4.9984066094235826e-05, + "loss": 6.9202, + "step": 1912 + }, + { + "epoch": 0.011377152916547721, + "grad_norm": 1.7967042922973633, + "learning_rate": 4.998404941565944e-05, + "loss": 7.0652, + "step": 1913 + }, + { + "epoch": 0.011383100199828718, + "grad_norm": 2.339747667312622, + "learning_rate": 4.9984032728361384e-05, + "loss": 6.943, + "step": 1914 + }, + { + "epoch": 0.011389047483109715, + "grad_norm": 2.65795636177063, + "learning_rate": 4.998401603234166e-05, + "loss": 6.7197, + "step": 1915 + }, + { + "epoch": 0.011394994766390713, + "grad_norm": 2.181105852127075, + "learning_rate": 4.998399932760027e-05, + "loss": 6.7358, + "step": 1916 + }, + { + "epoch": 0.01140094204967171, + "grad_norm": 2.4130990505218506, + "learning_rate": 4.998398261413723e-05, + "loss": 6.8653, + "step": 1917 + }, + { + "epoch": 0.011406889332952707, + "grad_norm": 2.23822021484375, + "learning_rate": 4.998396589195254e-05, + "loss": 7.2125, + "step": 1918 + }, + { + "epoch": 0.011412836616233704, + "grad_norm": 2.176309823989868, + "learning_rate": 4.9983949161046207e-05, + "loss": 7.1077, + "step": 1919 + }, + { + "epoch": 0.011418783899514702, + "grad_norm": 2.2468202114105225, + "learning_rate": 4.9983932421418226e-05, + "loss": 7.1411, + "step": 1920 + }, + { + "epoch": 0.0114247311827957, + "grad_norm": 2.0748138427734375, + "learning_rate": 4.998391567306862e-05, + "loss": 7.0605, + "step": 1921 + }, + { + "epoch": 0.011430678466076696, + "grad_norm": 2.93007230758667, + "learning_rate": 4.998389891599738e-05, + "loss": 6.5832, + "step": 1922 + }, + { + "epoch": 0.011436625749357693, + "grad_norm": 2.125582218170166, + "learning_rate": 4.9983882150204534e-05, + "loss": 7.0761, + "step": 1923 + }, + { + "epoch": 0.011442573032638691, + "grad_norm": 2.3291571140289307, + "learning_rate": 4.998386537569005e-05, + "loss": 6.8781, + "step": 1924 + }, + { + "epoch": 0.011448520315919688, + "grad_norm": 2.8930649757385254, + "learning_rate": 4.9983848592453975e-05, + "loss": 7.1694, + "step": 1925 + }, + { + "epoch": 0.011454467599200685, + "grad_norm": 2.8450441360473633, + "learning_rate": 4.998383180049629e-05, + "loss": 7.1474, + "step": 1926 + }, + { + "epoch": 0.011460414882481682, + "grad_norm": 2.5900778770446777, + "learning_rate": 4.9983814999817016e-05, + "loss": 7.0423, + "step": 1927 + }, + { + "epoch": 0.01146636216576268, + "grad_norm": 2.289428949356079, + "learning_rate": 4.998379819041614e-05, + "loss": 6.9777, + "step": 1928 + }, + { + "epoch": 0.011472309449043677, + "grad_norm": 2.609384059906006, + "learning_rate": 4.998378137229368e-05, + "loss": 7.0488, + "step": 1929 + }, + { + "epoch": 0.011478256732324674, + "grad_norm": 2.1039459705352783, + "learning_rate": 4.998376454544964e-05, + "loss": 6.9308, + "step": 1930 + }, + { + "epoch": 0.01148420401560567, + "grad_norm": 2.1776134967803955, + "learning_rate": 4.9983747709884024e-05, + "loss": 6.9951, + "step": 1931 + }, + { + "epoch": 0.01149015129888667, + "grad_norm": 2.3150827884674072, + "learning_rate": 4.998373086559684e-05, + "loss": 6.9165, + "step": 1932 + }, + { + "epoch": 0.011496098582167666, + "grad_norm": 2.308370590209961, + "learning_rate": 4.99837140125881e-05, + "loss": 7.0155, + "step": 1933 + }, + { + "epoch": 0.011502045865448663, + "grad_norm": 2.234208106994629, + "learning_rate": 4.99836971508578e-05, + "loss": 6.9901, + "step": 1934 + }, + { + "epoch": 0.01150799314872966, + "grad_norm": 2.2340307235717773, + "learning_rate": 4.9983680280405953e-05, + "loss": 7.004, + "step": 1935 + }, + { + "epoch": 0.011513940432010657, + "grad_norm": 2.9458208084106445, + "learning_rate": 4.998366340123256e-05, + "loss": 7.3797, + "step": 1936 + }, + { + "epoch": 0.011519887715291655, + "grad_norm": 2.8516271114349365, + "learning_rate": 4.998364651333762e-05, + "loss": 7.3503, + "step": 1937 + }, + { + "epoch": 0.011525834998572652, + "grad_norm": 1.974025845527649, + "learning_rate": 4.998362961672116e-05, + "loss": 7.21, + "step": 1938 + }, + { + "epoch": 0.011531782281853649, + "grad_norm": 2.110117197036743, + "learning_rate": 4.998361271138317e-05, + "loss": 6.9494, + "step": 1939 + }, + { + "epoch": 0.011537729565134646, + "grad_norm": 2.2003207206726074, + "learning_rate": 4.9983595797323646e-05, + "loss": 6.8858, + "step": 1940 + }, + { + "epoch": 0.011543676848415644, + "grad_norm": 2.200982093811035, + "learning_rate": 4.998357887454262e-05, + "loss": 6.9512, + "step": 1941 + }, + { + "epoch": 0.011549624131696641, + "grad_norm": 2.303903102874756, + "learning_rate": 4.998356194304008e-05, + "loss": 7.2823, + "step": 1942 + }, + { + "epoch": 0.011555571414977638, + "grad_norm": 2.1376724243164062, + "learning_rate": 4.9983545002816035e-05, + "loss": 7.0321, + "step": 1943 + }, + { + "epoch": 0.011561518698258635, + "grad_norm": 2.3128151893615723, + "learning_rate": 4.99835280538705e-05, + "loss": 6.9714, + "step": 1944 + }, + { + "epoch": 0.011567465981539633, + "grad_norm": 2.359212636947632, + "learning_rate": 4.9983511096203465e-05, + "loss": 7.0496, + "step": 1945 + }, + { + "epoch": 0.01157341326482063, + "grad_norm": 2.346946954727173, + "learning_rate": 4.9983494129814945e-05, + "loss": 6.9865, + "step": 1946 + }, + { + "epoch": 0.011579360548101627, + "grad_norm": 2.447598934173584, + "learning_rate": 4.998347715470495e-05, + "loss": 6.9609, + "step": 1947 + }, + { + "epoch": 0.011585307831382624, + "grad_norm": 2.355300188064575, + "learning_rate": 4.998346017087348e-05, + "loss": 7.03, + "step": 1948 + }, + { + "epoch": 0.011591255114663622, + "grad_norm": 2.3207437992095947, + "learning_rate": 4.9983443178320545e-05, + "loss": 6.8181, + "step": 1949 + }, + { + "epoch": 0.011597202397944619, + "grad_norm": 2.359839677810669, + "learning_rate": 4.998342617704615e-05, + "loss": 6.8828, + "step": 1950 + }, + { + "epoch": 0.011603149681225616, + "grad_norm": 2.264890432357788, + "learning_rate": 4.9983409167050284e-05, + "loss": 7.3467, + "step": 1951 + }, + { + "epoch": 0.011609096964506613, + "grad_norm": 2.2720789909362793, + "learning_rate": 4.998339214833298e-05, + "loss": 7.3912, + "step": 1952 + }, + { + "epoch": 0.011615044247787611, + "grad_norm": 2.414433240890503, + "learning_rate": 4.9983375120894226e-05, + "loss": 7.1505, + "step": 1953 + }, + { + "epoch": 0.011620991531068608, + "grad_norm": 2.095290422439575, + "learning_rate": 4.998335808473404e-05, + "loss": 7.1642, + "step": 1954 + }, + { + "epoch": 0.011626938814349605, + "grad_norm": 2.118901252746582, + "learning_rate": 4.998334103985242e-05, + "loss": 7.0528, + "step": 1955 + }, + { + "epoch": 0.011632886097630602, + "grad_norm": 2.4361472129821777, + "learning_rate": 4.998332398624937e-05, + "loss": 7.3064, + "step": 1956 + }, + { + "epoch": 0.0116388333809116, + "grad_norm": 2.0978667736053467, + "learning_rate": 4.99833069239249e-05, + "loss": 7.0041, + "step": 1957 + }, + { + "epoch": 0.011644780664192597, + "grad_norm": 3.156329393386841, + "learning_rate": 4.998328985287902e-05, + "loss": 6.9169, + "step": 1958 + }, + { + "epoch": 0.011650727947473594, + "grad_norm": 2.311004400253296, + "learning_rate": 4.9983272773111735e-05, + "loss": 7.1128, + "step": 1959 + }, + { + "epoch": 0.01165667523075459, + "grad_norm": 2.406993865966797, + "learning_rate": 4.9983255684623036e-05, + "loss": 7.1403, + "step": 1960 + }, + { + "epoch": 0.01166262251403559, + "grad_norm": 2.0262861251831055, + "learning_rate": 4.998323858741295e-05, + "loss": 7.1014, + "step": 1961 + }, + { + "epoch": 0.011668569797316586, + "grad_norm": 2.369420051574707, + "learning_rate": 4.998322148148147e-05, + "loss": 7.1422, + "step": 1962 + }, + { + "epoch": 0.011674517080597583, + "grad_norm": 2.156019687652588, + "learning_rate": 4.998320436682861e-05, + "loss": 6.8405, + "step": 1963 + }, + { + "epoch": 0.01168046436387858, + "grad_norm": 2.35737681388855, + "learning_rate": 4.998318724345436e-05, + "loss": 6.8004, + "step": 1964 + }, + { + "epoch": 0.011686411647159577, + "grad_norm": 2.443676233291626, + "learning_rate": 4.998317011135875e-05, + "loss": 7.1959, + "step": 1965 + }, + { + "epoch": 0.011692358930440575, + "grad_norm": 2.1023004055023193, + "learning_rate": 4.998315297054177e-05, + "loss": 7.0684, + "step": 1966 + }, + { + "epoch": 0.011698306213721572, + "grad_norm": 2.5166187286376953, + "learning_rate": 4.998313582100342e-05, + "loss": 6.5876, + "step": 1967 + }, + { + "epoch": 0.011704253497002569, + "grad_norm": 2.1868557929992676, + "learning_rate": 4.9983118662743726e-05, + "loss": 6.6097, + "step": 1968 + }, + { + "epoch": 0.011710200780283566, + "grad_norm": 2.196786880493164, + "learning_rate": 4.998310149576269e-05, + "loss": 6.9798, + "step": 1969 + }, + { + "epoch": 0.011716148063564564, + "grad_norm": 2.361915111541748, + "learning_rate": 4.998308432006029e-05, + "loss": 6.8441, + "step": 1970 + }, + { + "epoch": 0.011722095346845561, + "grad_norm": 2.3234047889709473, + "learning_rate": 4.998306713563657e-05, + "loss": 6.9481, + "step": 1971 + }, + { + "epoch": 0.011728042630126558, + "grad_norm": 2.4995763301849365, + "learning_rate": 4.9983049942491514e-05, + "loss": 6.9903, + "step": 1972 + }, + { + "epoch": 0.011733989913407555, + "grad_norm": 2.21274995803833, + "learning_rate": 4.998303274062514e-05, + "loss": 7.1484, + "step": 1973 + }, + { + "epoch": 0.011739937196688553, + "grad_norm": 2.4777519702911377, + "learning_rate": 4.998301553003743e-05, + "loss": 7.144, + "step": 1974 + }, + { + "epoch": 0.01174588447996955, + "grad_norm": 2.089796304702759, + "learning_rate": 4.9982998310728426e-05, + "loss": 6.6765, + "step": 1975 + }, + { + "epoch": 0.011751831763250547, + "grad_norm": 3.012753963470459, + "learning_rate": 4.998298108269811e-05, + "loss": 6.8501, + "step": 1976 + }, + { + "epoch": 0.011757779046531544, + "grad_norm": 2.5427911281585693, + "learning_rate": 4.9982963845946486e-05, + "loss": 7.0171, + "step": 1977 + }, + { + "epoch": 0.011763726329812542, + "grad_norm": 2.8591670989990234, + "learning_rate": 4.998294660047358e-05, + "loss": 6.9881, + "step": 1978 + }, + { + "epoch": 0.011769673613093539, + "grad_norm": 2.952085256576538, + "learning_rate": 4.998292934627937e-05, + "loss": 6.9459, + "step": 1979 + }, + { + "epoch": 0.011775620896374536, + "grad_norm": 2.451958656311035, + "learning_rate": 4.998291208336388e-05, + "loss": 6.9515, + "step": 1980 + }, + { + "epoch": 0.011781568179655533, + "grad_norm": 2.448319435119629, + "learning_rate": 4.998289481172713e-05, + "loss": 6.8618, + "step": 1981 + }, + { + "epoch": 0.011787515462936531, + "grad_norm": 3.1797080039978027, + "learning_rate": 4.99828775313691e-05, + "loss": 6.7528, + "step": 1982 + }, + { + "epoch": 0.011793462746217528, + "grad_norm": 2.841120719909668, + "learning_rate": 4.99828602422898e-05, + "loss": 6.8, + "step": 1983 + }, + { + "epoch": 0.011799410029498525, + "grad_norm": 3.128098726272583, + "learning_rate": 4.998284294448925e-05, + "loss": 6.7574, + "step": 1984 + }, + { + "epoch": 0.011805357312779522, + "grad_norm": 2.7724568843841553, + "learning_rate": 4.998282563796744e-05, + "loss": 6.6119, + "step": 1985 + }, + { + "epoch": 0.01181130459606052, + "grad_norm": 2.8025269508361816, + "learning_rate": 4.998280832272439e-05, + "loss": 6.4676, + "step": 1986 + }, + { + "epoch": 0.011817251879341517, + "grad_norm": 2.5756618976593018, + "learning_rate": 4.99827909987601e-05, + "loss": 6.5421, + "step": 1987 + }, + { + "epoch": 0.011823199162622514, + "grad_norm": 2.9116249084472656, + "learning_rate": 4.998277366607457e-05, + "loss": 6.5446, + "step": 1988 + }, + { + "epoch": 0.01182914644590351, + "grad_norm": 2.571019411087036, + "learning_rate": 4.9982756324667815e-05, + "loss": 6.7898, + "step": 1989 + }, + { + "epoch": 0.01183509372918451, + "grad_norm": 2.818885326385498, + "learning_rate": 4.998273897453984e-05, + "loss": 6.6604, + "step": 1990 + }, + { + "epoch": 0.011841041012465506, + "grad_norm": 2.8561007976531982, + "learning_rate": 4.998272161569064e-05, + "loss": 6.5473, + "step": 1991 + }, + { + "epoch": 0.011846988295746503, + "grad_norm": 2.5539605617523193, + "learning_rate": 4.998270424812024e-05, + "loss": 6.5492, + "step": 1992 + }, + { + "epoch": 0.0118529355790275, + "grad_norm": 2.3242900371551514, + "learning_rate": 4.998268687182863e-05, + "loss": 6.4577, + "step": 1993 + }, + { + "epoch": 0.011858882862308498, + "grad_norm": 2.874807596206665, + "learning_rate": 4.998266948681582e-05, + "loss": 6.6071, + "step": 1994 + }, + { + "epoch": 0.011864830145589495, + "grad_norm": 2.9014296531677246, + "learning_rate": 4.9982652093081827e-05, + "loss": 7.2221, + "step": 1995 + }, + { + "epoch": 0.011870777428870492, + "grad_norm": 2.5874252319335938, + "learning_rate": 4.998263469062665e-05, + "loss": 6.593, + "step": 1996 + }, + { + "epoch": 0.011876724712151489, + "grad_norm": 2.4252052307128906, + "learning_rate": 4.998261727945028e-05, + "loss": 7.0138, + "step": 1997 + }, + { + "epoch": 0.011882671995432486, + "grad_norm": 2.3569211959838867, + "learning_rate": 4.998259985955275e-05, + "loss": 6.8743, + "step": 1998 + }, + { + "epoch": 0.011888619278713484, + "grad_norm": 2.560659408569336, + "learning_rate": 4.9982582430934045e-05, + "loss": 6.8926, + "step": 1999 + }, + { + "epoch": 0.011894566561994481, + "grad_norm": 2.0855636596679688, + "learning_rate": 4.9982564993594184e-05, + "loss": 7.1691, + "step": 2000 + }, + { + "epoch": 0.011900513845275478, + "grad_norm": 2.024829387664795, + "learning_rate": 4.998254754753316e-05, + "loss": 7.1797, + "step": 2001 + }, + { + "epoch": 0.011906461128556475, + "grad_norm": 2.093733549118042, + "learning_rate": 4.998253009275099e-05, + "loss": 6.9706, + "step": 2002 + }, + { + "epoch": 0.011912408411837473, + "grad_norm": 1.9211688041687012, + "learning_rate": 4.998251262924768e-05, + "loss": 7.018, + "step": 2003 + }, + { + "epoch": 0.01191835569511847, + "grad_norm": 2.3146321773529053, + "learning_rate": 4.998249515702323e-05, + "loss": 6.9384, + "step": 2004 + }, + { + "epoch": 0.011924302978399467, + "grad_norm": 2.346309185028076, + "learning_rate": 4.998247767607765e-05, + "loss": 6.5674, + "step": 2005 + }, + { + "epoch": 0.011930250261680464, + "grad_norm": 2.39471697807312, + "learning_rate": 4.998246018641094e-05, + "loss": 6.769, + "step": 2006 + }, + { + "epoch": 0.011936197544961462, + "grad_norm": 2.1689298152923584, + "learning_rate": 4.998244268802312e-05, + "loss": 7.0945, + "step": 2007 + }, + { + "epoch": 0.011942144828242459, + "grad_norm": 2.4209859371185303, + "learning_rate": 4.998242518091418e-05, + "loss": 6.98, + "step": 2008 + }, + { + "epoch": 0.011948092111523456, + "grad_norm": 2.6378684043884277, + "learning_rate": 4.998240766508414e-05, + "loss": 6.6833, + "step": 2009 + }, + { + "epoch": 0.011954039394804453, + "grad_norm": 2.2804839611053467, + "learning_rate": 4.9982390140532995e-05, + "loss": 6.7129, + "step": 2010 + }, + { + "epoch": 0.011959986678085451, + "grad_norm": 2.1788251399993896, + "learning_rate": 4.998237260726075e-05, + "loss": 7.0175, + "step": 2011 + }, + { + "epoch": 0.011965933961366448, + "grad_norm": 1.8988546133041382, + "learning_rate": 4.998235506526743e-05, + "loss": 7.0857, + "step": 2012 + }, + { + "epoch": 0.011971881244647445, + "grad_norm": 2.560107469558716, + "learning_rate": 4.9982337514553026e-05, + "loss": 7.0771, + "step": 2013 + }, + { + "epoch": 0.011977828527928442, + "grad_norm": 2.1771798133850098, + "learning_rate": 4.998231995511754e-05, + "loss": 7.071, + "step": 2014 + }, + { + "epoch": 0.01198377581120944, + "grad_norm": 1.9619860649108887, + "learning_rate": 4.998230238696098e-05, + "loss": 6.9109, + "step": 2015 + }, + { + "epoch": 0.011989723094490437, + "grad_norm": 2.16719126701355, + "learning_rate": 4.998228481008337e-05, + "loss": 6.903, + "step": 2016 + }, + { + "epoch": 0.011995670377771434, + "grad_norm": 2.4643077850341797, + "learning_rate": 4.998226722448469e-05, + "loss": 6.5301, + "step": 2017 + }, + { + "epoch": 0.01200161766105243, + "grad_norm": 2.5153393745422363, + "learning_rate": 4.9982249630164965e-05, + "loss": 7.107, + "step": 2018 + }, + { + "epoch": 0.01200756494433343, + "grad_norm": 2.6180920600891113, + "learning_rate": 4.998223202712419e-05, + "loss": 6.9905, + "step": 2019 + }, + { + "epoch": 0.012013512227614426, + "grad_norm": 2.333186149597168, + "learning_rate": 4.998221441536238e-05, + "loss": 7.074, + "step": 2020 + }, + { + "epoch": 0.012019459510895423, + "grad_norm": 2.138176918029785, + "learning_rate": 4.998219679487953e-05, + "loss": 7.0211, + "step": 2021 + }, + { + "epoch": 0.01202540679417642, + "grad_norm": 2.9845499992370605, + "learning_rate": 4.998217916567567e-05, + "loss": 6.7341, + "step": 2022 + }, + { + "epoch": 0.012031354077457418, + "grad_norm": 3.1216208934783936, + "learning_rate": 4.998216152775077e-05, + "loss": 7.1569, + "step": 2023 + }, + { + "epoch": 0.012037301360738415, + "grad_norm": 2.4693727493286133, + "learning_rate": 4.998214388110487e-05, + "loss": 6.6427, + "step": 2024 + }, + { + "epoch": 0.012043248644019412, + "grad_norm": 2.784562349319458, + "learning_rate": 4.9982126225737955e-05, + "loss": 6.6898, + "step": 2025 + }, + { + "epoch": 0.012049195927300409, + "grad_norm": 3.0549166202545166, + "learning_rate": 4.9982108561650036e-05, + "loss": 6.6004, + "step": 2026 + }, + { + "epoch": 0.012055143210581406, + "grad_norm": 2.565505266189575, + "learning_rate": 4.998209088884113e-05, + "loss": 6.5981, + "step": 2027 + }, + { + "epoch": 0.012061090493862404, + "grad_norm": 2.862548828125, + "learning_rate": 4.998207320731122e-05, + "loss": 6.4329, + "step": 2028 + }, + { + "epoch": 0.012067037777143401, + "grad_norm": 2.835280179977417, + "learning_rate": 4.998205551706033e-05, + "loss": 6.6854, + "step": 2029 + }, + { + "epoch": 0.012072985060424398, + "grad_norm": 2.4550364017486572, + "learning_rate": 4.9982037818088474e-05, + "loss": 6.7115, + "step": 2030 + }, + { + "epoch": 0.012078932343705395, + "grad_norm": 2.9977426528930664, + "learning_rate": 4.998202011039564e-05, + "loss": 6.341, + "step": 2031 + }, + { + "epoch": 0.012084879626986393, + "grad_norm": 2.258370876312256, + "learning_rate": 4.998200239398184e-05, + "loss": 6.7094, + "step": 2032 + }, + { + "epoch": 0.01209082691026739, + "grad_norm": 2.4484050273895264, + "learning_rate": 4.9981984668847085e-05, + "loss": 7.1115, + "step": 2033 + }, + { + "epoch": 0.012096774193548387, + "grad_norm": 2.4668514728546143, + "learning_rate": 4.9981966934991366e-05, + "loss": 6.9411, + "step": 2034 + }, + { + "epoch": 0.012102721476829384, + "grad_norm": 2.218479871749878, + "learning_rate": 4.998194919241471e-05, + "loss": 6.7175, + "step": 2035 + }, + { + "epoch": 0.012108668760110382, + "grad_norm": 2.201815366744995, + "learning_rate": 4.9981931441117115e-05, + "loss": 6.8684, + "step": 2036 + }, + { + "epoch": 0.012114616043391379, + "grad_norm": 2.4610331058502197, + "learning_rate": 4.998191368109858e-05, + "loss": 6.7214, + "step": 2037 + }, + { + "epoch": 0.012120563326672376, + "grad_norm": 2.7274906635284424, + "learning_rate": 4.998189591235912e-05, + "loss": 6.7611, + "step": 2038 + }, + { + "epoch": 0.012126510609953373, + "grad_norm": 2.7716658115386963, + "learning_rate": 4.9981878134898735e-05, + "loss": 6.7679, + "step": 2039 + }, + { + "epoch": 0.012132457893234371, + "grad_norm": 3.3206236362457275, + "learning_rate": 4.9981860348717434e-05, + "loss": 6.6283, + "step": 2040 + }, + { + "epoch": 0.012138405176515368, + "grad_norm": 2.511906862258911, + "learning_rate": 4.9981842553815225e-05, + "loss": 6.9537, + "step": 2041 + }, + { + "epoch": 0.012144352459796365, + "grad_norm": 2.7797024250030518, + "learning_rate": 4.998182475019212e-05, + "loss": 7.0488, + "step": 2042 + }, + { + "epoch": 0.012150299743077362, + "grad_norm": 3.523092031478882, + "learning_rate": 4.998180693784811e-05, + "loss": 6.9249, + "step": 2043 + }, + { + "epoch": 0.01215624702635836, + "grad_norm": 3.1001851558685303, + "learning_rate": 4.998178911678322e-05, + "loss": 7.0998, + "step": 2044 + }, + { + "epoch": 0.012162194309639357, + "grad_norm": 2.5291028022766113, + "learning_rate": 4.998177128699743e-05, + "loss": 6.8381, + "step": 2045 + }, + { + "epoch": 0.012168141592920354, + "grad_norm": 3.308398723602295, + "learning_rate": 4.998175344849077e-05, + "loss": 6.6849, + "step": 2046 + }, + { + "epoch": 0.01217408887620135, + "grad_norm": 3.4255475997924805, + "learning_rate": 4.998173560126323e-05, + "loss": 6.7816, + "step": 2047 + }, + { + "epoch": 0.01218003615948235, + "grad_norm": 3.4510817527770996, + "learning_rate": 4.998171774531484e-05, + "loss": 6.7961, + "step": 2048 + }, + { + "epoch": 0.012185983442763346, + "grad_norm": 3.15468168258667, + "learning_rate": 4.998169988064558e-05, + "loss": 6.9409, + "step": 2049 + }, + { + "epoch": 0.012191930726044343, + "grad_norm": 2.5568132400512695, + "learning_rate": 4.998168200725547e-05, + "loss": 6.8573, + "step": 2050 + }, + { + "epoch": 0.01219787800932534, + "grad_norm": 1.9745045900344849, + "learning_rate": 4.9981664125144515e-05, + "loss": 6.7948, + "step": 2051 + }, + { + "epoch": 0.012203825292606338, + "grad_norm": 2.2304463386535645, + "learning_rate": 4.9981646234312714e-05, + "loss": 6.6896, + "step": 2052 + }, + { + "epoch": 0.012209772575887335, + "grad_norm": 2.4391567707061768, + "learning_rate": 4.998162833476008e-05, + "loss": 6.7129, + "step": 2053 + }, + { + "epoch": 0.012215719859168332, + "grad_norm": 3.243905544281006, + "learning_rate": 4.9981610426486615e-05, + "loss": 7.0744, + "step": 2054 + }, + { + "epoch": 0.012221667142449329, + "grad_norm": 3.2596933841705322, + "learning_rate": 4.998159250949233e-05, + "loss": 6.9361, + "step": 2055 + }, + { + "epoch": 0.012227614425730327, + "grad_norm": 2.554436445236206, + "learning_rate": 4.998157458377723e-05, + "loss": 6.9354, + "step": 2056 + }, + { + "epoch": 0.012233561709011324, + "grad_norm": 2.3636975288391113, + "learning_rate": 4.998155664934132e-05, + "loss": 6.849, + "step": 2057 + }, + { + "epoch": 0.01223950899229232, + "grad_norm": 2.224684953689575, + "learning_rate": 4.99815387061846e-05, + "loss": 6.7011, + "step": 2058 + }, + { + "epoch": 0.012245456275573318, + "grad_norm": 2.6892964839935303, + "learning_rate": 4.9981520754307096e-05, + "loss": 6.753, + "step": 2059 + }, + { + "epoch": 0.012251403558854315, + "grad_norm": 2.7645084857940674, + "learning_rate": 4.9981502793708796e-05, + "loss": 6.5437, + "step": 2060 + }, + { + "epoch": 0.012257350842135313, + "grad_norm": 2.1315746307373047, + "learning_rate": 4.9981484824389716e-05, + "loss": 6.8843, + "step": 2061 + }, + { + "epoch": 0.01226329812541631, + "grad_norm": 2.6275408267974854, + "learning_rate": 4.998146684634984e-05, + "loss": 6.7275, + "step": 2062 + }, + { + "epoch": 0.012269245408697307, + "grad_norm": 2.530688762664795, + "learning_rate": 4.998144885958921e-05, + "loss": 6.6089, + "step": 2063 + }, + { + "epoch": 0.012275192691978304, + "grad_norm": 2.0959835052490234, + "learning_rate": 4.998143086410781e-05, + "loss": 6.7425, + "step": 2064 + }, + { + "epoch": 0.012281139975259302, + "grad_norm": 2.887242078781128, + "learning_rate": 4.998141285990565e-05, + "loss": 6.6867, + "step": 2065 + }, + { + "epoch": 0.012287087258540299, + "grad_norm": 2.430122137069702, + "learning_rate": 4.9981394846982734e-05, + "loss": 6.6636, + "step": 2066 + }, + { + "epoch": 0.012293034541821296, + "grad_norm": 2.269162654876709, + "learning_rate": 4.998137682533907e-05, + "loss": 7.1165, + "step": 2067 + }, + { + "epoch": 0.012298981825102293, + "grad_norm": 2.6741089820861816, + "learning_rate": 4.998135879497467e-05, + "loss": 6.6678, + "step": 2068 + }, + { + "epoch": 0.012304929108383291, + "grad_norm": 2.3362507820129395, + "learning_rate": 4.998134075588953e-05, + "loss": 7.0103, + "step": 2069 + }, + { + "epoch": 0.012310876391664288, + "grad_norm": 2.310638189315796, + "learning_rate": 4.9981322708083666e-05, + "loss": 6.9235, + "step": 2070 + }, + { + "epoch": 0.012316823674945285, + "grad_norm": 2.161853790283203, + "learning_rate": 4.998130465155708e-05, + "loss": 6.9392, + "step": 2071 + }, + { + "epoch": 0.012322770958226282, + "grad_norm": 2.2609059810638428, + "learning_rate": 4.9981286586309786e-05, + "loss": 6.888, + "step": 2072 + }, + { + "epoch": 0.01232871824150728, + "grad_norm": 2.6072967052459717, + "learning_rate": 4.998126851234177e-05, + "loss": 6.7739, + "step": 2073 + }, + { + "epoch": 0.012334665524788277, + "grad_norm": 3.092834711074829, + "learning_rate": 4.9981250429653056e-05, + "loss": 6.5529, + "step": 2074 + }, + { + "epoch": 0.012340612808069274, + "grad_norm": 2.303149461746216, + "learning_rate": 4.998123233824366e-05, + "loss": 6.618, + "step": 2075 + }, + { + "epoch": 0.01234656009135027, + "grad_norm": 2.888063907623291, + "learning_rate": 4.998121423811355e-05, + "loss": 6.9224, + "step": 2076 + }, + { + "epoch": 0.012352507374631269, + "grad_norm": 2.990727424621582, + "learning_rate": 4.998119612926277e-05, + "loss": 6.94, + "step": 2077 + }, + { + "epoch": 0.012358454657912266, + "grad_norm": 3.016002893447876, + "learning_rate": 4.998117801169131e-05, + "loss": 6.6231, + "step": 2078 + }, + { + "epoch": 0.012364401941193263, + "grad_norm": 2.057124614715576, + "learning_rate": 4.998115988539918e-05, + "loss": 6.803, + "step": 2079 + }, + { + "epoch": 0.01237034922447426, + "grad_norm": 2.371136426925659, + "learning_rate": 4.998114175038639e-05, + "loss": 6.8244, + "step": 2080 + }, + { + "epoch": 0.012376296507755258, + "grad_norm": 2.804365873336792, + "learning_rate": 4.998112360665292e-05, + "loss": 6.8787, + "step": 2081 + }, + { + "epoch": 0.012382243791036255, + "grad_norm": 3.4987633228302, + "learning_rate": 4.998110545419882e-05, + "loss": 6.6946, + "step": 2082 + }, + { + "epoch": 0.012388191074317252, + "grad_norm": 2.950608968734741, + "learning_rate": 4.998108729302407e-05, + "loss": 6.7915, + "step": 2083 + }, + { + "epoch": 0.012394138357598249, + "grad_norm": 2.4327776432037354, + "learning_rate": 4.998106912312868e-05, + "loss": 6.727, + "step": 2084 + }, + { + "epoch": 0.012400085640879247, + "grad_norm": 2.46014142036438, + "learning_rate": 4.998105094451265e-05, + "loss": 6.6797, + "step": 2085 + }, + { + "epoch": 0.012406032924160244, + "grad_norm": 2.947566270828247, + "learning_rate": 4.9981032757175995e-05, + "loss": 6.6401, + "step": 2086 + }, + { + "epoch": 0.01241198020744124, + "grad_norm": 2.5999064445495605, + "learning_rate": 4.9981014561118724e-05, + "loss": 6.58, + "step": 2087 + }, + { + "epoch": 0.012417927490722238, + "grad_norm": 2.9761807918548584, + "learning_rate": 4.9980996356340836e-05, + "loss": 6.8538, + "step": 2088 + }, + { + "epoch": 0.012423874774003236, + "grad_norm": 2.690925121307373, + "learning_rate": 4.9980978142842336e-05, + "loss": 6.9087, + "step": 2089 + }, + { + "epoch": 0.012429822057284233, + "grad_norm": 2.218524217605591, + "learning_rate": 4.998095992062325e-05, + "loss": 6.7221, + "step": 2090 + }, + { + "epoch": 0.01243576934056523, + "grad_norm": 2.630094051361084, + "learning_rate": 4.998094168968355e-05, + "loss": 6.7346, + "step": 2091 + }, + { + "epoch": 0.012441716623846227, + "grad_norm": 2.7839179039001465, + "learning_rate": 4.9980923450023276e-05, + "loss": 6.8668, + "step": 2092 + }, + { + "epoch": 0.012447663907127223, + "grad_norm": 2.422914743423462, + "learning_rate": 4.9980905201642415e-05, + "loss": 6.7953, + "step": 2093 + }, + { + "epoch": 0.012453611190408222, + "grad_norm": 2.525883674621582, + "learning_rate": 4.998088694454097e-05, + "loss": 6.6322, + "step": 2094 + }, + { + "epoch": 0.012459558473689219, + "grad_norm": 2.515536308288574, + "learning_rate": 4.998086867871896e-05, + "loss": 7.4297, + "step": 2095 + }, + { + "epoch": 0.012465505756970216, + "grad_norm": 2.689542055130005, + "learning_rate": 4.998085040417639e-05, + "loss": 7.4316, + "step": 2096 + }, + { + "epoch": 0.012471453040251212, + "grad_norm": 2.4374492168426514, + "learning_rate": 4.998083212091327e-05, + "loss": 6.8035, + "step": 2097 + }, + { + "epoch": 0.012477400323532211, + "grad_norm": 2.284153699874878, + "learning_rate": 4.998081382892959e-05, + "loss": 6.6644, + "step": 2098 + }, + { + "epoch": 0.012483347606813208, + "grad_norm": 2.113539218902588, + "learning_rate": 4.9980795528225366e-05, + "loss": 6.5201, + "step": 2099 + }, + { + "epoch": 0.012489294890094205, + "grad_norm": 2.2590157985687256, + "learning_rate": 4.998077721880061e-05, + "loss": 6.8074, + "step": 2100 + }, + { + "epoch": 0.012495242173375202, + "grad_norm": 2.077986717224121, + "learning_rate": 4.9980758900655316e-05, + "loss": 6.6986, + "step": 2101 + }, + { + "epoch": 0.0125011894566562, + "grad_norm": 2.495882987976074, + "learning_rate": 4.99807405737895e-05, + "loss": 6.6949, + "step": 2102 + }, + { + "epoch": 0.012507136739937197, + "grad_norm": 2.224621295928955, + "learning_rate": 4.998072223820317e-05, + "loss": 6.5723, + "step": 2103 + }, + { + "epoch": 0.012513084023218194, + "grad_norm": 2.515867233276367, + "learning_rate": 4.998070389389632e-05, + "loss": 6.4327, + "step": 2104 + }, + { + "epoch": 0.01251903130649919, + "grad_norm": 2.3134326934814453, + "learning_rate": 4.998068554086897e-05, + "loss": 6.2818, + "step": 2105 + }, + { + "epoch": 0.012524978589780189, + "grad_norm": 2.7688093185424805, + "learning_rate": 4.998066717912112e-05, + "loss": 6.4585, + "step": 2106 + }, + { + "epoch": 0.012530925873061186, + "grad_norm": 3.211790084838867, + "learning_rate": 4.998064880865277e-05, + "loss": 6.5227, + "step": 2107 + }, + { + "epoch": 0.012536873156342183, + "grad_norm": 2.9701578617095947, + "learning_rate": 4.998063042946395e-05, + "loss": 6.5674, + "step": 2108 + }, + { + "epoch": 0.01254282043962318, + "grad_norm": 2.1295664310455322, + "learning_rate": 4.998061204155463e-05, + "loss": 6.5697, + "step": 2109 + }, + { + "epoch": 0.012548767722904178, + "grad_norm": 2.841683864593506, + "learning_rate": 4.998059364492485e-05, + "loss": 6.453, + "step": 2110 + }, + { + "epoch": 0.012554715006185175, + "grad_norm": 2.481001615524292, + "learning_rate": 4.99805752395746e-05, + "loss": 6.555, + "step": 2111 + }, + { + "epoch": 0.012560662289466172, + "grad_norm": 2.357745885848999, + "learning_rate": 4.998055682550389e-05, + "loss": 6.7916, + "step": 2112 + }, + { + "epoch": 0.012566609572747169, + "grad_norm": 2.349417209625244, + "learning_rate": 4.9980538402712725e-05, + "loss": 6.7257, + "step": 2113 + }, + { + "epoch": 0.012572556856028167, + "grad_norm": 2.846930742263794, + "learning_rate": 4.998051997120111e-05, + "loss": 6.7095, + "step": 2114 + }, + { + "epoch": 0.012578504139309164, + "grad_norm": 2.362506628036499, + "learning_rate": 4.998050153096906e-05, + "loss": 6.675, + "step": 2115 + }, + { + "epoch": 0.01258445142259016, + "grad_norm": 2.3275344371795654, + "learning_rate": 4.998048308201656e-05, + "loss": 6.9031, + "step": 2116 + }, + { + "epoch": 0.012590398705871158, + "grad_norm": 2.194359540939331, + "learning_rate": 4.9980464624343644e-05, + "loss": 6.8258, + "step": 2117 + }, + { + "epoch": 0.012596345989152156, + "grad_norm": 2.3926312923431396, + "learning_rate": 4.99804461579503e-05, + "loss": 6.7136, + "step": 2118 + }, + { + "epoch": 0.012602293272433153, + "grad_norm": 2.7430222034454346, + "learning_rate": 4.9980427682836546e-05, + "loss": 6.5475, + "step": 2119 + }, + { + "epoch": 0.01260824055571415, + "grad_norm": 2.1563844680786133, + "learning_rate": 4.998040919900237e-05, + "loss": 6.7105, + "step": 2120 + }, + { + "epoch": 0.012614187838995147, + "grad_norm": 2.1061437129974365, + "learning_rate": 4.998039070644781e-05, + "loss": 6.6411, + "step": 2121 + }, + { + "epoch": 0.012620135122276143, + "grad_norm": 2.6192378997802734, + "learning_rate": 4.9980372205172844e-05, + "loss": 6.6831, + "step": 2122 + }, + { + "epoch": 0.012626082405557142, + "grad_norm": 2.794616222381592, + "learning_rate": 4.9980353695177495e-05, + "loss": 6.8128, + "step": 2123 + }, + { + "epoch": 0.012632029688838139, + "grad_norm": 2.3656489849090576, + "learning_rate": 4.998033517646176e-05, + "loss": 6.8109, + "step": 2124 + }, + { + "epoch": 0.012637976972119136, + "grad_norm": 2.658433437347412, + "learning_rate": 4.998031664902564e-05, + "loss": 6.7979, + "step": 2125 + }, + { + "epoch": 0.012643924255400132, + "grad_norm": 2.889954090118408, + "learning_rate": 4.9980298112869154e-05, + "loss": 6.6745, + "step": 2126 + }, + { + "epoch": 0.012649871538681131, + "grad_norm": 2.469790458679199, + "learning_rate": 4.9980279567992304e-05, + "loss": 6.7056, + "step": 2127 + }, + { + "epoch": 0.012655818821962128, + "grad_norm": 2.4310262203216553, + "learning_rate": 4.9980261014395094e-05, + "loss": 6.8809, + "step": 2128 + }, + { + "epoch": 0.012661766105243125, + "grad_norm": 2.772359609603882, + "learning_rate": 4.998024245207754e-05, + "loss": 7.0383, + "step": 2129 + }, + { + "epoch": 0.012667713388524121, + "grad_norm": 2.292144775390625, + "learning_rate": 4.9980223881039635e-05, + "loss": 6.9062, + "step": 2130 + }, + { + "epoch": 0.01267366067180512, + "grad_norm": 2.590363025665283, + "learning_rate": 4.998020530128139e-05, + "loss": 6.5803, + "step": 2131 + }, + { + "epoch": 0.012679607955086117, + "grad_norm": 2.78432035446167, + "learning_rate": 4.9980186712802824e-05, + "loss": 6.788, + "step": 2132 + }, + { + "epoch": 0.012685555238367114, + "grad_norm": 2.6188290119171143, + "learning_rate": 4.998016811560392e-05, + "loss": 6.5827, + "step": 2133 + }, + { + "epoch": 0.01269150252164811, + "grad_norm": 2.868215560913086, + "learning_rate": 4.99801495096847e-05, + "loss": 6.5845, + "step": 2134 + }, + { + "epoch": 0.012697449804929109, + "grad_norm": 2.4738945960998535, + "learning_rate": 4.998013089504518e-05, + "loss": 6.5019, + "step": 2135 + }, + { + "epoch": 0.012703397088210106, + "grad_norm": 2.5315287113189697, + "learning_rate": 4.998011227168534e-05, + "loss": 6.6765, + "step": 2136 + }, + { + "epoch": 0.012709344371491103, + "grad_norm": 2.7871086597442627, + "learning_rate": 4.998009363960521e-05, + "loss": 6.64, + "step": 2137 + }, + { + "epoch": 0.0127152916547721, + "grad_norm": 2.267502784729004, + "learning_rate": 4.998007499880479e-05, + "loss": 6.8665, + "step": 2138 + }, + { + "epoch": 0.012721238938053098, + "grad_norm": 2.5014212131500244, + "learning_rate": 4.998005634928408e-05, + "loss": 6.6757, + "step": 2139 + }, + { + "epoch": 0.012727186221334095, + "grad_norm": 2.3600070476531982, + "learning_rate": 4.998003769104308e-05, + "loss": 6.5425, + "step": 2140 + }, + { + "epoch": 0.012733133504615092, + "grad_norm": 2.32123064994812, + "learning_rate": 4.998001902408182e-05, + "loss": 6.5192, + "step": 2141 + }, + { + "epoch": 0.012739080787896088, + "grad_norm": 2.5059258937835693, + "learning_rate": 4.998000034840029e-05, + "loss": 6.6315, + "step": 2142 + }, + { + "epoch": 0.012745028071177087, + "grad_norm": 2.2143092155456543, + "learning_rate": 4.99799816639985e-05, + "loss": 6.6058, + "step": 2143 + }, + { + "epoch": 0.012750975354458084, + "grad_norm": 2.3660342693328857, + "learning_rate": 4.997996297087645e-05, + "loss": 6.554, + "step": 2144 + }, + { + "epoch": 0.01275692263773908, + "grad_norm": 2.4286036491394043, + "learning_rate": 4.9979944269034164e-05, + "loss": 6.4857, + "step": 2145 + }, + { + "epoch": 0.012762869921020078, + "grad_norm": 2.4002180099487305, + "learning_rate": 4.997992555847163e-05, + "loss": 6.5083, + "step": 2146 + }, + { + "epoch": 0.012768817204301076, + "grad_norm": 2.418942451477051, + "learning_rate": 4.997990683918886e-05, + "loss": 6.5471, + "step": 2147 + }, + { + "epoch": 0.012774764487582073, + "grad_norm": 2.535654067993164, + "learning_rate": 4.997988811118587e-05, + "loss": 6.5999, + "step": 2148 + }, + { + "epoch": 0.01278071177086307, + "grad_norm": 2.581505298614502, + "learning_rate": 4.9979869374462655e-05, + "loss": 6.2525, + "step": 2149 + }, + { + "epoch": 0.012786659054144067, + "grad_norm": 2.681297779083252, + "learning_rate": 4.997985062901923e-05, + "loss": 6.1463, + "step": 2150 + }, + { + "epoch": 0.012792606337425065, + "grad_norm": 2.3542990684509277, + "learning_rate": 4.997983187485559e-05, + "loss": 6.433, + "step": 2151 + }, + { + "epoch": 0.012798553620706062, + "grad_norm": 2.2994048595428467, + "learning_rate": 4.997981311197175e-05, + "loss": 6.5952, + "step": 2152 + }, + { + "epoch": 0.012804500903987059, + "grad_norm": 2.4703454971313477, + "learning_rate": 4.9979794340367724e-05, + "loss": 6.5581, + "step": 2153 + }, + { + "epoch": 0.012810448187268056, + "grad_norm": 2.511383533477783, + "learning_rate": 4.9979775560043504e-05, + "loss": 6.577, + "step": 2154 + }, + { + "epoch": 0.012816395470549052, + "grad_norm": 2.3300156593322754, + "learning_rate": 4.99797567709991e-05, + "loss": 6.4349, + "step": 2155 + }, + { + "epoch": 0.012822342753830051, + "grad_norm": 2.523878574371338, + "learning_rate": 4.997973797323452e-05, + "loss": 6.5044, + "step": 2156 + }, + { + "epoch": 0.012828290037111048, + "grad_norm": 2.4185073375701904, + "learning_rate": 4.9979719166749776e-05, + "loss": 6.537, + "step": 2157 + }, + { + "epoch": 0.012834237320392045, + "grad_norm": 2.324090003967285, + "learning_rate": 4.997970035154487e-05, + "loss": 6.803, + "step": 2158 + }, + { + "epoch": 0.012840184603673041, + "grad_norm": 2.468872547149658, + "learning_rate": 4.9979681527619804e-05, + "loss": 7.0837, + "step": 2159 + }, + { + "epoch": 0.01284613188695404, + "grad_norm": 2.1467936038970947, + "learning_rate": 4.99796626949746e-05, + "loss": 6.7373, + "step": 2160 + }, + { + "epoch": 0.012852079170235037, + "grad_norm": 2.3208062648773193, + "learning_rate": 4.9979643853609246e-05, + "loss": 6.5483, + "step": 2161 + }, + { + "epoch": 0.012858026453516034, + "grad_norm": 2.2797584533691406, + "learning_rate": 4.997962500352376e-05, + "loss": 6.5857, + "step": 2162 + }, + { + "epoch": 0.01286397373679703, + "grad_norm": 2.3447721004486084, + "learning_rate": 4.9979606144718135e-05, + "loss": 6.8511, + "step": 2163 + }, + { + "epoch": 0.012869921020078029, + "grad_norm": 2.6456334590911865, + "learning_rate": 4.9979587277192395e-05, + "loss": 6.9457, + "step": 2164 + }, + { + "epoch": 0.012875868303359026, + "grad_norm": 3.2567737102508545, + "learning_rate": 4.997956840094654e-05, + "loss": 6.6405, + "step": 2165 + }, + { + "epoch": 0.012881815586640023, + "grad_norm": 2.847371816635132, + "learning_rate": 4.9979549515980574e-05, + "loss": 6.751, + "step": 2166 + }, + { + "epoch": 0.01288776286992102, + "grad_norm": 2.999779462814331, + "learning_rate": 4.99795306222945e-05, + "loss": 6.7437, + "step": 2167 + }, + { + "epoch": 0.012893710153202018, + "grad_norm": 2.3793458938598633, + "learning_rate": 4.9979511719888336e-05, + "loss": 6.6864, + "step": 2168 + }, + { + "epoch": 0.012899657436483015, + "grad_norm": 2.284724473953247, + "learning_rate": 4.9979492808762084e-05, + "loss": 6.4237, + "step": 2169 + }, + { + "epoch": 0.012905604719764012, + "grad_norm": 2.560758352279663, + "learning_rate": 4.997947388891575e-05, + "loss": 6.5964, + "step": 2170 + }, + { + "epoch": 0.012911552003045008, + "grad_norm": 2.7461421489715576, + "learning_rate": 4.997945496034934e-05, + "loss": 6.5354, + "step": 2171 + }, + { + "epoch": 0.012917499286326007, + "grad_norm": 3.0868208408355713, + "learning_rate": 4.9979436023062854e-05, + "loss": 6.6445, + "step": 2172 + }, + { + "epoch": 0.012923446569607004, + "grad_norm": 2.565009593963623, + "learning_rate": 4.997941707705631e-05, + "loss": 6.6015, + "step": 2173 + }, + { + "epoch": 0.012929393852888, + "grad_norm": 2.9424686431884766, + "learning_rate": 4.997939812232971e-05, + "loss": 6.4887, + "step": 2174 + }, + { + "epoch": 0.012935341136168997, + "grad_norm": 3.0674476623535156, + "learning_rate": 4.997937915888305e-05, + "loss": 6.4728, + "step": 2175 + }, + { + "epoch": 0.012941288419449996, + "grad_norm": 3.040189266204834, + "learning_rate": 4.997936018671636e-05, + "loss": 6.3788, + "step": 2176 + }, + { + "epoch": 0.012947235702730993, + "grad_norm": 2.756211042404175, + "learning_rate": 4.9979341205829626e-05, + "loss": 6.4167, + "step": 2177 + }, + { + "epoch": 0.01295318298601199, + "grad_norm": 2.6333322525024414, + "learning_rate": 4.997932221622287e-05, + "loss": 6.6392, + "step": 2178 + }, + { + "epoch": 0.012959130269292986, + "grad_norm": 2.6951076984405518, + "learning_rate": 4.997930321789608e-05, + "loss": 6.3299, + "step": 2179 + }, + { + "epoch": 0.012965077552573985, + "grad_norm": 2.5388028621673584, + "learning_rate": 4.997928421084928e-05, + "loss": 6.2646, + "step": 2180 + }, + { + "epoch": 0.012971024835854982, + "grad_norm": 3.312171459197998, + "learning_rate": 4.997926519508247e-05, + "loss": 6.6331, + "step": 2181 + }, + { + "epoch": 0.012976972119135979, + "grad_norm": 3.437025547027588, + "learning_rate": 4.997924617059565e-05, + "loss": 5.5981, + "step": 2182 + }, + { + "epoch": 0.012982919402416975, + "grad_norm": 2.74035906791687, + "learning_rate": 4.997922713738884e-05, + "loss": 5.1641, + "step": 2183 + }, + { + "epoch": 0.012988866685697972, + "grad_norm": 2.618525505065918, + "learning_rate": 4.9979208095462036e-05, + "loss": 5.9978, + "step": 2184 + }, + { + "epoch": 0.012994813968978971, + "grad_norm": 2.633692502975464, + "learning_rate": 4.9979189044815254e-05, + "loss": 6.2812, + "step": 2185 + }, + { + "epoch": 0.013000761252259968, + "grad_norm": 2.087557792663574, + "learning_rate": 4.997916998544849e-05, + "loss": 6.2864, + "step": 2186 + }, + { + "epoch": 0.013006708535540965, + "grad_norm": 3.365112066268921, + "learning_rate": 4.997915091736176e-05, + "loss": 5.3517, + "step": 2187 + }, + { + "epoch": 0.013012655818821961, + "grad_norm": 2.7561593055725098, + "learning_rate": 4.997913184055506e-05, + "loss": 6.3667, + "step": 2188 + }, + { + "epoch": 0.01301860310210296, + "grad_norm": 2.630976676940918, + "learning_rate": 4.9979112755028415e-05, + "loss": 6.5858, + "step": 2189 + }, + { + "epoch": 0.013024550385383957, + "grad_norm": 2.56007981300354, + "learning_rate": 4.9979093660781805e-05, + "loss": 6.6862, + "step": 2190 + }, + { + "epoch": 0.013030497668664954, + "grad_norm": 2.509631633758545, + "learning_rate": 4.997907455781526e-05, + "loss": 6.4699, + "step": 2191 + }, + { + "epoch": 0.01303644495194595, + "grad_norm": 2.442028522491455, + "learning_rate": 4.997905544612878e-05, + "loss": 6.5755, + "step": 2192 + }, + { + "epoch": 0.013042392235226949, + "grad_norm": 2.561016321182251, + "learning_rate": 4.997903632572236e-05, + "loss": 6.4529, + "step": 2193 + }, + { + "epoch": 0.013048339518507946, + "grad_norm": 2.585753917694092, + "learning_rate": 4.9979017196596025e-05, + "loss": 6.188, + "step": 2194 + }, + { + "epoch": 0.013054286801788943, + "grad_norm": 2.3657655715942383, + "learning_rate": 4.997899805874977e-05, + "loss": 6.1414, + "step": 2195 + }, + { + "epoch": 0.01306023408506994, + "grad_norm": 2.818251609802246, + "learning_rate": 4.997897891218361e-05, + "loss": 6.5276, + "step": 2196 + }, + { + "epoch": 0.013066181368350938, + "grad_norm": 2.9687695503234863, + "learning_rate": 4.997895975689754e-05, + "loss": 6.131, + "step": 2197 + }, + { + "epoch": 0.013072128651631935, + "grad_norm": 2.8505353927612305, + "learning_rate": 4.997894059289157e-05, + "loss": 6.5269, + "step": 2198 + }, + { + "epoch": 0.013078075934912932, + "grad_norm": 2.331573486328125, + "learning_rate": 4.997892142016573e-05, + "loss": 6.1101, + "step": 2199 + }, + { + "epoch": 0.013084023218193928, + "grad_norm": 2.3241569995880127, + "learning_rate": 4.997890223871998e-05, + "loss": 6.5081, + "step": 2200 + }, + { + "epoch": 0.013089970501474927, + "grad_norm": 2.658834218978882, + "learning_rate": 4.997888304855437e-05, + "loss": 6.554, + "step": 2201 + }, + { + "epoch": 0.013095917784755924, + "grad_norm": 2.703911304473877, + "learning_rate": 4.997886384966889e-05, + "loss": 6.337, + "step": 2202 + }, + { + "epoch": 0.01310186506803692, + "grad_norm": 3.020775318145752, + "learning_rate": 4.997884464206354e-05, + "loss": 6.4375, + "step": 2203 + }, + { + "epoch": 0.013107812351317917, + "grad_norm": 3.324218273162842, + "learning_rate": 4.9978825425738334e-05, + "loss": 6.4871, + "step": 2204 + }, + { + "epoch": 0.013113759634598916, + "grad_norm": 3.822019577026367, + "learning_rate": 4.9978806200693276e-05, + "loss": 6.6372, + "step": 2205 + }, + { + "epoch": 0.013119706917879913, + "grad_norm": 3.3639512062072754, + "learning_rate": 4.997878696692838e-05, + "loss": 6.1826, + "step": 2206 + }, + { + "epoch": 0.01312565420116091, + "grad_norm": 3.580603837966919, + "learning_rate": 4.997876772444365e-05, + "loss": 6.793, + "step": 2207 + }, + { + "epoch": 0.013131601484441906, + "grad_norm": 2.472733497619629, + "learning_rate": 4.9978748473239084e-05, + "loss": 6.9054, + "step": 2208 + }, + { + "epoch": 0.013137548767722905, + "grad_norm": 3.327461004257202, + "learning_rate": 4.99787292133147e-05, + "loss": 6.6735, + "step": 2209 + }, + { + "epoch": 0.013143496051003902, + "grad_norm": 3.493234157562256, + "learning_rate": 4.99787099446705e-05, + "loss": 6.9702, + "step": 2210 + }, + { + "epoch": 0.013149443334284899, + "grad_norm": 2.2516424655914307, + "learning_rate": 4.9978690667306483e-05, + "loss": 7.196, + "step": 2211 + }, + { + "epoch": 0.013155390617565895, + "grad_norm": 1.8846355676651, + "learning_rate": 4.9978671381222665e-05, + "loss": 7.0373, + "step": 2212 + }, + { + "epoch": 0.013161337900846894, + "grad_norm": 2.9334232807159424, + "learning_rate": 4.997865208641906e-05, + "loss": 6.2065, + "step": 2213 + }, + { + "epoch": 0.01316728518412789, + "grad_norm": 2.713006019592285, + "learning_rate": 4.997863278289565e-05, + "loss": 6.788, + "step": 2214 + }, + { + "epoch": 0.013173232467408888, + "grad_norm": 2.6246018409729004, + "learning_rate": 4.9978613470652466e-05, + "loss": 6.7979, + "step": 2215 + }, + { + "epoch": 0.013179179750689884, + "grad_norm": 2.2770373821258545, + "learning_rate": 4.997859414968951e-05, + "loss": 6.8307, + "step": 2216 + }, + { + "epoch": 0.013185127033970881, + "grad_norm": 2.6244993209838867, + "learning_rate": 4.997857482000679e-05, + "loss": 6.3176, + "step": 2217 + }, + { + "epoch": 0.01319107431725188, + "grad_norm": 3.4668054580688477, + "learning_rate": 4.997855548160429e-05, + "loss": 6.8962, + "step": 2218 + }, + { + "epoch": 0.013197021600532877, + "grad_norm": 2.711785078048706, + "learning_rate": 4.9978536134482047e-05, + "loss": 6.7111, + "step": 2219 + }, + { + "epoch": 0.013202968883813873, + "grad_norm": 2.6757078170776367, + "learning_rate": 4.997851677864005e-05, + "loss": 6.5501, + "step": 2220 + }, + { + "epoch": 0.01320891616709487, + "grad_norm": 2.150338888168335, + "learning_rate": 4.997849741407831e-05, + "loss": 6.43, + "step": 2221 + }, + { + "epoch": 0.013214863450375869, + "grad_norm": 3.115309953689575, + "learning_rate": 4.9978478040796836e-05, + "loss": 6.4074, + "step": 2222 + }, + { + "epoch": 0.013220810733656866, + "grad_norm": 2.8754189014434814, + "learning_rate": 4.997845865879564e-05, + "loss": 6.2663, + "step": 2223 + }, + { + "epoch": 0.013226758016937862, + "grad_norm": 2.6169707775115967, + "learning_rate": 4.9978439268074716e-05, + "loss": 6.5987, + "step": 2224 + }, + { + "epoch": 0.01323270530021886, + "grad_norm": 2.3814637660980225, + "learning_rate": 4.997841986863408e-05, + "loss": 6.8124, + "step": 2225 + }, + { + "epoch": 0.013238652583499858, + "grad_norm": 2.0276811122894287, + "learning_rate": 4.997840046047373e-05, + "loss": 6.6632, + "step": 2226 + }, + { + "epoch": 0.013244599866780855, + "grad_norm": 2.7943263053894043, + "learning_rate": 4.997838104359368e-05, + "loss": 6.5452, + "step": 2227 + }, + { + "epoch": 0.013250547150061852, + "grad_norm": 2.4058234691619873, + "learning_rate": 4.997836161799393e-05, + "loss": 6.4697, + "step": 2228 + }, + { + "epoch": 0.013256494433342848, + "grad_norm": 2.2487008571624756, + "learning_rate": 4.9978342183674504e-05, + "loss": 6.3361, + "step": 2229 + }, + { + "epoch": 0.013262441716623847, + "grad_norm": 2.3470170497894287, + "learning_rate": 4.997832274063539e-05, + "loss": 6.4024, + "step": 2230 + }, + { + "epoch": 0.013268388999904844, + "grad_norm": 2.589695692062378, + "learning_rate": 4.9978303288876606e-05, + "loss": 6.4184, + "step": 2231 + }, + { + "epoch": 0.01327433628318584, + "grad_norm": 2.691371440887451, + "learning_rate": 4.997828382839815e-05, + "loss": 6.4225, + "step": 2232 + }, + { + "epoch": 0.013280283566466837, + "grad_norm": 3.110410213470459, + "learning_rate": 4.997826435920003e-05, + "loss": 6.5307, + "step": 2233 + }, + { + "epoch": 0.013286230849747836, + "grad_norm": 2.688519239425659, + "learning_rate": 4.9978244881282266e-05, + "loss": 6.568, + "step": 2234 + }, + { + "epoch": 0.013292178133028833, + "grad_norm": 2.3346059322357178, + "learning_rate": 4.997822539464485e-05, + "loss": 6.8837, + "step": 2235 + }, + { + "epoch": 0.01329812541630983, + "grad_norm": 2.679826021194458, + "learning_rate": 4.997820589928779e-05, + "loss": 6.3961, + "step": 2236 + }, + { + "epoch": 0.013304072699590826, + "grad_norm": 2.388120412826538, + "learning_rate": 4.99781863952111e-05, + "loss": 6.4363, + "step": 2237 + }, + { + "epoch": 0.013310019982871825, + "grad_norm": 2.834341049194336, + "learning_rate": 4.997816688241478e-05, + "loss": 6.4855, + "step": 2238 + }, + { + "epoch": 0.013315967266152822, + "grad_norm": 2.8623831272125244, + "learning_rate": 4.997814736089885e-05, + "loss": 6.8607, + "step": 2239 + }, + { + "epoch": 0.013321914549433819, + "grad_norm": 3.001241683959961, + "learning_rate": 4.99781278306633e-05, + "loss": 6.9777, + "step": 2240 + }, + { + "epoch": 0.013327861832714815, + "grad_norm": 2.9721016883850098, + "learning_rate": 4.9978108291708135e-05, + "loss": 6.9821, + "step": 2241 + }, + { + "epoch": 0.013333809115995814, + "grad_norm": 2.798360824584961, + "learning_rate": 4.997808874403338e-05, + "loss": 7.0096, + "step": 2242 + }, + { + "epoch": 0.01333975639927681, + "grad_norm": 3.2242093086242676, + "learning_rate": 4.997806918763903e-05, + "loss": 6.9091, + "step": 2243 + }, + { + "epoch": 0.013345703682557808, + "grad_norm": 2.681920289993286, + "learning_rate": 4.99780496225251e-05, + "loss": 6.7769, + "step": 2244 + }, + { + "epoch": 0.013351650965838804, + "grad_norm": 3.199514865875244, + "learning_rate": 4.9978030048691584e-05, + "loss": 6.6202, + "step": 2245 + }, + { + "epoch": 0.013357598249119801, + "grad_norm": 2.89886474609375, + "learning_rate": 4.9978010466138496e-05, + "loss": 6.7075, + "step": 2246 + }, + { + "epoch": 0.0133635455324008, + "grad_norm": 2.7091262340545654, + "learning_rate": 4.997799087486584e-05, + "loss": 6.9129, + "step": 2247 + }, + { + "epoch": 0.013369492815681797, + "grad_norm": 2.2538888454437256, + "learning_rate": 4.997797127487364e-05, + "loss": 6.6412, + "step": 2248 + }, + { + "epoch": 0.013375440098962793, + "grad_norm": 2.668286085128784, + "learning_rate": 4.997795166616187e-05, + "loss": 6.8506, + "step": 2249 + }, + { + "epoch": 0.01338138738224379, + "grad_norm": 3.915975570678711, + "learning_rate": 4.997793204873057e-05, + "loss": 6.567, + "step": 2250 + }, + { + "epoch": 0.013387334665524789, + "grad_norm": 2.5549614429473877, + "learning_rate": 4.997791242257972e-05, + "loss": 6.7971, + "step": 2251 + }, + { + "epoch": 0.013393281948805786, + "grad_norm": 2.511810064315796, + "learning_rate": 4.997789278770935e-05, + "loss": 7.1949, + "step": 2252 + }, + { + "epoch": 0.013399229232086782, + "grad_norm": 2.026937484741211, + "learning_rate": 4.9977873144119445e-05, + "loss": 7.2067, + "step": 2253 + }, + { + "epoch": 0.01340517651536778, + "grad_norm": 3.6016058921813965, + "learning_rate": 4.997785349181002e-05, + "loss": 6.549, + "step": 2254 + }, + { + "epoch": 0.013411123798648778, + "grad_norm": 2.867418050765991, + "learning_rate": 4.9977833830781094e-05, + "loss": 6.5562, + "step": 2255 + }, + { + "epoch": 0.013417071081929775, + "grad_norm": 2.2168800830841064, + "learning_rate": 4.9977814161032665e-05, + "loss": 7.1798, + "step": 2256 + }, + { + "epoch": 0.013423018365210771, + "grad_norm": 2.728299856185913, + "learning_rate": 4.997779448256473e-05, + "loss": 6.9314, + "step": 2257 + }, + { + "epoch": 0.013428965648491768, + "grad_norm": 2.7336437702178955, + "learning_rate": 4.997777479537732e-05, + "loss": 7.0643, + "step": 2258 + }, + { + "epoch": 0.013434912931772767, + "grad_norm": 3.1546053886413574, + "learning_rate": 4.997775509947041e-05, + "loss": 6.8853, + "step": 2259 + }, + { + "epoch": 0.013440860215053764, + "grad_norm": 3.037036180496216, + "learning_rate": 4.997773539484404e-05, + "loss": 6.6892, + "step": 2260 + }, + { + "epoch": 0.01344680749833476, + "grad_norm": 2.8779382705688477, + "learning_rate": 4.997771568149818e-05, + "loss": 6.4991, + "step": 2261 + }, + { + "epoch": 0.013452754781615757, + "grad_norm": 3.1105282306671143, + "learning_rate": 4.997769595943288e-05, + "loss": 6.4253, + "step": 2262 + }, + { + "epoch": 0.013458702064896756, + "grad_norm": 4.604808330535889, + "learning_rate": 4.997767622864811e-05, + "loss": 6.504, + "step": 2263 + }, + { + "epoch": 0.013464649348177753, + "grad_norm": 4.345273017883301, + "learning_rate": 4.9977656489143896e-05, + "loss": 6.2, + "step": 2264 + }, + { + "epoch": 0.01347059663145875, + "grad_norm": 2.9744133949279785, + "learning_rate": 4.9977636740920243e-05, + "loss": 6.5458, + "step": 2265 + }, + { + "epoch": 0.013476543914739746, + "grad_norm": 3.3981447219848633, + "learning_rate": 4.9977616983977146e-05, + "loss": 6.9791, + "step": 2266 + }, + { + "epoch": 0.013482491198020745, + "grad_norm": 2.5855109691619873, + "learning_rate": 4.997759721831463e-05, + "loss": 6.7425, + "step": 2267 + }, + { + "epoch": 0.013488438481301742, + "grad_norm": 3.961195707321167, + "learning_rate": 4.997757744393269e-05, + "loss": 6.4042, + "step": 2268 + }, + { + "epoch": 0.013494385764582739, + "grad_norm": 3.8216230869293213, + "learning_rate": 4.997755766083133e-05, + "loss": 6.4962, + "step": 2269 + }, + { + "epoch": 0.013500333047863735, + "grad_norm": 3.077279567718506, + "learning_rate": 4.9977537869010574e-05, + "loss": 6.4298, + "step": 2270 + }, + { + "epoch": 0.013506280331144734, + "grad_norm": 2.56152081489563, + "learning_rate": 4.9977518068470406e-05, + "loss": 6.35, + "step": 2271 + }, + { + "epoch": 0.01351222761442573, + "grad_norm": 2.4069855213165283, + "learning_rate": 4.9977498259210854e-05, + "loss": 6.2923, + "step": 2272 + }, + { + "epoch": 0.013518174897706728, + "grad_norm": 2.9591124057769775, + "learning_rate": 4.9977478441231904e-05, + "loss": 6.2477, + "step": 2273 + }, + { + "epoch": 0.013524122180987724, + "grad_norm": 2.627110481262207, + "learning_rate": 4.997745861453359e-05, + "loss": 6.1012, + "step": 2274 + }, + { + "epoch": 0.013530069464268723, + "grad_norm": 2.3042867183685303, + "learning_rate": 4.997743877911589e-05, + "loss": 6.1155, + "step": 2275 + }, + { + "epoch": 0.01353601674754972, + "grad_norm": 2.709324359893799, + "learning_rate": 4.997741893497882e-05, + "loss": 6.0103, + "step": 2276 + }, + { + "epoch": 0.013541964030830717, + "grad_norm": 2.7087934017181396, + "learning_rate": 4.997739908212241e-05, + "loss": 6.0709, + "step": 2277 + }, + { + "epoch": 0.013547911314111713, + "grad_norm": 3.560149669647217, + "learning_rate": 4.997737922054664e-05, + "loss": 6.1775, + "step": 2278 + }, + { + "epoch": 0.01355385859739271, + "grad_norm": 4.623898506164551, + "learning_rate": 4.997735935025152e-05, + "loss": 6.1993, + "step": 2279 + }, + { + "epoch": 0.013559805880673709, + "grad_norm": 2.9960882663726807, + "learning_rate": 4.997733947123707e-05, + "loss": 6.4211, + "step": 2280 + }, + { + "epoch": 0.013565753163954706, + "grad_norm": 3.8918421268463135, + "learning_rate": 4.9977319583503276e-05, + "loss": 6.0194, + "step": 2281 + }, + { + "epoch": 0.013571700447235702, + "grad_norm": 3.4164741039276123, + "learning_rate": 4.997729968705017e-05, + "loss": 5.9824, + "step": 2282 + }, + { + "epoch": 0.0135776477305167, + "grad_norm": 2.4005794525146484, + "learning_rate": 4.997727978187774e-05, + "loss": 5.9727, + "step": 2283 + }, + { + "epoch": 0.013583595013797698, + "grad_norm": 2.4654550552368164, + "learning_rate": 4.9977259867986e-05, + "loss": 6.2681, + "step": 2284 + }, + { + "epoch": 0.013589542297078695, + "grad_norm": 3.193905830383301, + "learning_rate": 4.997723994537496e-05, + "loss": 6.4996, + "step": 2285 + }, + { + "epoch": 0.013595489580359691, + "grad_norm": 2.4845757484436035, + "learning_rate": 4.997722001404462e-05, + "loss": 7.0464, + "step": 2286 + }, + { + "epoch": 0.013601436863640688, + "grad_norm": 3.170182466506958, + "learning_rate": 4.9977200073995e-05, + "loss": 6.1071, + "step": 2287 + }, + { + "epoch": 0.013607384146921687, + "grad_norm": 2.2331149578094482, + "learning_rate": 4.997718012522609e-05, + "loss": 6.6823, + "step": 2288 + }, + { + "epoch": 0.013613331430202684, + "grad_norm": 2.4146671295166016, + "learning_rate": 4.9977160167737904e-05, + "loss": 6.4398, + "step": 2289 + }, + { + "epoch": 0.01361927871348368, + "grad_norm": 3.23956561088562, + "learning_rate": 4.9977140201530445e-05, + "loss": 6.9295, + "step": 2290 + }, + { + "epoch": 0.013625225996764677, + "grad_norm": 3.402979850769043, + "learning_rate": 4.997712022660374e-05, + "loss": 6.7116, + "step": 2291 + }, + { + "epoch": 0.013631173280045676, + "grad_norm": 3.241320848464966, + "learning_rate": 4.997710024295777e-05, + "loss": 6.8871, + "step": 2292 + }, + { + "epoch": 0.013637120563326673, + "grad_norm": 2.5378634929656982, + "learning_rate": 4.997708025059255e-05, + "loss": 6.9548, + "step": 2293 + }, + { + "epoch": 0.01364306784660767, + "grad_norm": 3.1968839168548584, + "learning_rate": 4.9977060249508087e-05, + "loss": 6.6388, + "step": 2294 + }, + { + "epoch": 0.013649015129888666, + "grad_norm": 2.6951656341552734, + "learning_rate": 4.99770402397044e-05, + "loss": 6.9654, + "step": 2295 + }, + { + "epoch": 0.013654962413169665, + "grad_norm": 2.4168484210968018, + "learning_rate": 4.997702022118147e-05, + "loss": 6.6666, + "step": 2296 + }, + { + "epoch": 0.013660909696450662, + "grad_norm": 3.1395177841186523, + "learning_rate": 4.997700019393934e-05, + "loss": 6.4957, + "step": 2297 + }, + { + "epoch": 0.013666856979731658, + "grad_norm": 3.1591687202453613, + "learning_rate": 4.9976980157977985e-05, + "loss": 6.4392, + "step": 2298 + }, + { + "epoch": 0.013672804263012655, + "grad_norm": 2.2415151596069336, + "learning_rate": 4.9976960113297436e-05, + "loss": 6.4543, + "step": 2299 + }, + { + "epoch": 0.013678751546293654, + "grad_norm": 3.9113616943359375, + "learning_rate": 4.997694005989767e-05, + "loss": 6.7088, + "step": 2300 + }, + { + "epoch": 0.01368469882957465, + "grad_norm": 4.218390941619873, + "learning_rate": 4.997691999777873e-05, + "loss": 6.7199, + "step": 2301 + }, + { + "epoch": 0.013690646112855647, + "grad_norm": 4.200760841369629, + "learning_rate": 4.997689992694059e-05, + "loss": 6.6343, + "step": 2302 + }, + { + "epoch": 0.013696593396136644, + "grad_norm": 3.7164547443389893, + "learning_rate": 4.997687984738328e-05, + "loss": 6.772, + "step": 2303 + }, + { + "epoch": 0.013702540679417643, + "grad_norm": 2.1898231506347656, + "learning_rate": 4.99768597591068e-05, + "loss": 6.6165, + "step": 2304 + }, + { + "epoch": 0.01370848796269864, + "grad_norm": 2.72632098197937, + "learning_rate": 4.9976839662111166e-05, + "loss": 6.6474, + "step": 2305 + }, + { + "epoch": 0.013714435245979636, + "grad_norm": 3.64900279045105, + "learning_rate": 4.997681955639636e-05, + "loss": 6.4322, + "step": 2306 + }, + { + "epoch": 0.013720382529260633, + "grad_norm": 3.978445053100586, + "learning_rate": 4.997679944196241e-05, + "loss": 6.5434, + "step": 2307 + }, + { + "epoch": 0.01372632981254163, + "grad_norm": 5.709702491760254, + "learning_rate": 4.997677931880931e-05, + "loss": 6.5234, + "step": 2308 + }, + { + "epoch": 0.013732277095822629, + "grad_norm": 3.0389838218688965, + "learning_rate": 4.997675918693708e-05, + "loss": 6.4163, + "step": 2309 + }, + { + "epoch": 0.013738224379103625, + "grad_norm": 2.695113182067871, + "learning_rate": 4.9976739046345725e-05, + "loss": 6.6956, + "step": 2310 + }, + { + "epoch": 0.013744171662384622, + "grad_norm": 2.9768142700195312, + "learning_rate": 4.997671889703525e-05, + "loss": 6.5315, + "step": 2311 + }, + { + "epoch": 0.01375011894566562, + "grad_norm": 3.750454902648926, + "learning_rate": 4.997669873900566e-05, + "loss": 6.5568, + "step": 2312 + }, + { + "epoch": 0.013756066228946618, + "grad_norm": 3.390232801437378, + "learning_rate": 4.9976678572256955e-05, + "loss": 6.4916, + "step": 2313 + }, + { + "epoch": 0.013762013512227615, + "grad_norm": 3.1487748622894287, + "learning_rate": 4.997665839678915e-05, + "loss": 6.6378, + "step": 2314 + }, + { + "epoch": 0.013767960795508611, + "grad_norm": 2.5654940605163574, + "learning_rate": 4.997663821260226e-05, + "loss": 6.5817, + "step": 2315 + }, + { + "epoch": 0.013773908078789608, + "grad_norm": 2.7092552185058594, + "learning_rate": 4.9976618019696275e-05, + "loss": 6.982, + "step": 2316 + }, + { + "epoch": 0.013779855362070607, + "grad_norm": 3.642826557159424, + "learning_rate": 4.9976597818071214e-05, + "loss": 6.7951, + "step": 2317 + }, + { + "epoch": 0.013785802645351604, + "grad_norm": 3.4288947582244873, + "learning_rate": 4.997657760772708e-05, + "loss": 6.4366, + "step": 2318 + }, + { + "epoch": 0.0137917499286326, + "grad_norm": 2.7620253562927246, + "learning_rate": 4.997655738866389e-05, + "loss": 6.6588, + "step": 2319 + }, + { + "epoch": 0.013797697211913597, + "grad_norm": 2.4266698360443115, + "learning_rate": 4.997653716088163e-05, + "loss": 6.697, + "step": 2320 + }, + { + "epoch": 0.013803644495194596, + "grad_norm": 2.289365768432617, + "learning_rate": 4.9976516924380325e-05, + "loss": 6.7583, + "step": 2321 + }, + { + "epoch": 0.013809591778475593, + "grad_norm": 2.4238948822021484, + "learning_rate": 4.9976496679159976e-05, + "loss": 6.7949, + "step": 2322 + }, + { + "epoch": 0.01381553906175659, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.997647642522059e-05, + "loss": 6.5914, + "step": 2323 + }, + { + "epoch": 0.013821486345037586, + "grad_norm": 2.961089849472046, + "learning_rate": 4.997645616256217e-05, + "loss": 6.3513, + "step": 2324 + }, + { + "epoch": 0.013827433628318585, + "grad_norm": 2.437685251235962, + "learning_rate": 4.997643589118472e-05, + "loss": 6.4626, + "step": 2325 + }, + { + "epoch": 0.013833380911599582, + "grad_norm": 2.769731044769287, + "learning_rate": 4.9976415611088267e-05, + "loss": 6.2801, + "step": 2326 + }, + { + "epoch": 0.013839328194880578, + "grad_norm": 2.700697183609009, + "learning_rate": 4.9976395322272805e-05, + "loss": 6.1969, + "step": 2327 + }, + { + "epoch": 0.013845275478161575, + "grad_norm": 3.8049886226654053, + "learning_rate": 4.997637502473834e-05, + "loss": 6.769, + "step": 2328 + }, + { + "epoch": 0.013851222761442574, + "grad_norm": 3.748903512954712, + "learning_rate": 4.9976354718484875e-05, + "loss": 6.6486, + "step": 2329 + }, + { + "epoch": 0.01385717004472357, + "grad_norm": 3.7807834148406982, + "learning_rate": 4.9976334403512426e-05, + "loss": 6.6251, + "step": 2330 + }, + { + "epoch": 0.013863117328004567, + "grad_norm": 2.5358874797821045, + "learning_rate": 4.997631407982099e-05, + "loss": 6.4425, + "step": 2331 + }, + { + "epoch": 0.013869064611285564, + "grad_norm": 2.4619522094726562, + "learning_rate": 4.9976293747410596e-05, + "loss": 7.2166, + "step": 2332 + }, + { + "epoch": 0.013875011894566563, + "grad_norm": 2.740412473678589, + "learning_rate": 4.997627340628123e-05, + "loss": 6.8907, + "step": 2333 + }, + { + "epoch": 0.01388095917784756, + "grad_norm": 2.872852087020874, + "learning_rate": 4.9976253056432895e-05, + "loss": 6.6142, + "step": 2334 + }, + { + "epoch": 0.013886906461128556, + "grad_norm": 2.01629900932312, + "learning_rate": 4.997623269786562e-05, + "loss": 6.398, + "step": 2335 + }, + { + "epoch": 0.013892853744409553, + "grad_norm": 2.4405698776245117, + "learning_rate": 4.99762123305794e-05, + "loss": 6.9282, + "step": 2336 + }, + { + "epoch": 0.013898801027690552, + "grad_norm": 2.2520413398742676, + "learning_rate": 4.9976191954574235e-05, + "loss": 6.5565, + "step": 2337 + }, + { + "epoch": 0.013904748310971549, + "grad_norm": 2.314852476119995, + "learning_rate": 4.997617156985014e-05, + "loss": 6.3055, + "step": 2338 + }, + { + "epoch": 0.013910695594252545, + "grad_norm": 2.9049081802368164, + "learning_rate": 4.9976151176407124e-05, + "loss": 7.1806, + "step": 2339 + }, + { + "epoch": 0.013916642877533542, + "grad_norm": 2.7533769607543945, + "learning_rate": 4.9976130774245197e-05, + "loss": 7.0047, + "step": 2340 + }, + { + "epoch": 0.013922590160814539, + "grad_norm": 2.124826431274414, + "learning_rate": 4.997611036336435e-05, + "loss": 7.1897, + "step": 2341 + }, + { + "epoch": 0.013928537444095538, + "grad_norm": 2.5205366611480713, + "learning_rate": 4.997608994376461e-05, + "loss": 6.8592, + "step": 2342 + }, + { + "epoch": 0.013934484727376534, + "grad_norm": 2.8026719093322754, + "learning_rate": 4.9976069515445975e-05, + "loss": 6.6622, + "step": 2343 + }, + { + "epoch": 0.013940432010657531, + "grad_norm": 3.045438051223755, + "learning_rate": 4.997604907840845e-05, + "loss": 6.6176, + "step": 2344 + }, + { + "epoch": 0.013946379293938528, + "grad_norm": 2.820199489593506, + "learning_rate": 4.997602863265204e-05, + "loss": 6.4489, + "step": 2345 + }, + { + "epoch": 0.013952326577219527, + "grad_norm": 2.997990369796753, + "learning_rate": 4.997600817817676e-05, + "loss": 7.0989, + "step": 2346 + }, + { + "epoch": 0.013958273860500523, + "grad_norm": 3.316575050354004, + "learning_rate": 4.9975987714982606e-05, + "loss": 6.9042, + "step": 2347 + }, + { + "epoch": 0.01396422114378152, + "grad_norm": 2.3339803218841553, + "learning_rate": 4.99759672430696e-05, + "loss": 6.8831, + "step": 2348 + }, + { + "epoch": 0.013970168427062517, + "grad_norm": 2.510274648666382, + "learning_rate": 4.997594676243775e-05, + "loss": 7.1093, + "step": 2349 + }, + { + "epoch": 0.013976115710343516, + "grad_norm": 2.893909215927124, + "learning_rate": 4.997592627308705e-05, + "loss": 6.5477, + "step": 2350 + }, + { + "epoch": 0.013982062993624512, + "grad_norm": 3.6036674976348877, + "learning_rate": 4.9975905775017505e-05, + "loss": 6.3278, + "step": 2351 + }, + { + "epoch": 0.01398801027690551, + "grad_norm": 2.1260125637054443, + "learning_rate": 4.9975885268229127e-05, + "loss": 6.7883, + "step": 2352 + }, + { + "epoch": 0.013993957560186506, + "grad_norm": 2.328247308731079, + "learning_rate": 4.997586475272193e-05, + "loss": 6.4832, + "step": 2353 + }, + { + "epoch": 0.013999904843467505, + "grad_norm": 2.8075780868530273, + "learning_rate": 4.997584422849593e-05, + "loss": 6.9333, + "step": 2354 + }, + { + "epoch": 0.014005852126748502, + "grad_norm": 1.9339990615844727, + "learning_rate": 4.9975823695551106e-05, + "loss": 6.6856, + "step": 2355 + }, + { + "epoch": 0.014011799410029498, + "grad_norm": 2.842968225479126, + "learning_rate": 4.997580315388748e-05, + "loss": 6.48, + "step": 2356 + }, + { + "epoch": 0.014017746693310495, + "grad_norm": 1.8715558052062988, + "learning_rate": 4.997578260350506e-05, + "loss": 6.8702, + "step": 2357 + }, + { + "epoch": 0.014023693976591494, + "grad_norm": 2.4310202598571777, + "learning_rate": 4.9975762044403865e-05, + "loss": 7.0112, + "step": 2358 + }, + { + "epoch": 0.01402964125987249, + "grad_norm": 2.292121648788452, + "learning_rate": 4.997574147658387e-05, + "loss": 6.6505, + "step": 2359 + }, + { + "epoch": 0.014035588543153487, + "grad_norm": 2.374007225036621, + "learning_rate": 4.997572090004511e-05, + "loss": 6.7332, + "step": 2360 + }, + { + "epoch": 0.014041535826434484, + "grad_norm": 2.198131561279297, + "learning_rate": 4.997570031478759e-05, + "loss": 6.6358, + "step": 2361 + }, + { + "epoch": 0.014047483109715483, + "grad_norm": 2.3109302520751953, + "learning_rate": 4.997567972081131e-05, + "loss": 6.6194, + "step": 2362 + }, + { + "epoch": 0.01405343039299648, + "grad_norm": 2.49338698387146, + "learning_rate": 4.997565911811627e-05, + "loss": 6.5036, + "step": 2363 + }, + { + "epoch": 0.014059377676277476, + "grad_norm": 2.6462419033050537, + "learning_rate": 4.997563850670249e-05, + "loss": 6.4294, + "step": 2364 + }, + { + "epoch": 0.014065324959558473, + "grad_norm": 3.0072524547576904, + "learning_rate": 4.997561788656997e-05, + "loss": 6.8814, + "step": 2365 + }, + { + "epoch": 0.014071272242839472, + "grad_norm": 2.435209035873413, + "learning_rate": 4.997559725771872e-05, + "loss": 6.4684, + "step": 2366 + }, + { + "epoch": 0.014077219526120469, + "grad_norm": 2.8023672103881836, + "learning_rate": 4.997557662014875e-05, + "loss": 6.7922, + "step": 2367 + }, + { + "epoch": 0.014083166809401465, + "grad_norm": 2.6129658222198486, + "learning_rate": 4.9975555973860065e-05, + "loss": 6.4539, + "step": 2368 + }, + { + "epoch": 0.014089114092682462, + "grad_norm": 2.559117317199707, + "learning_rate": 4.997553531885267e-05, + "loss": 6.4713, + "step": 2369 + }, + { + "epoch": 0.014095061375963459, + "grad_norm": 2.4535956382751465, + "learning_rate": 4.9975514655126575e-05, + "loss": 6.963, + "step": 2370 + }, + { + "epoch": 0.014101008659244458, + "grad_norm": 2.3025150299072266, + "learning_rate": 4.997549398268178e-05, + "loss": 6.9299, + "step": 2371 + }, + { + "epoch": 0.014106955942525454, + "grad_norm": 2.834411382675171, + "learning_rate": 4.997547330151831e-05, + "loss": 6.299, + "step": 2372 + }, + { + "epoch": 0.014112903225806451, + "grad_norm": 2.8046083450317383, + "learning_rate": 4.997545261163615e-05, + "loss": 5.7691, + "step": 2373 + }, + { + "epoch": 0.014118850509087448, + "grad_norm": 2.663776159286499, + "learning_rate": 4.997543191303532e-05, + "loss": 5.969, + "step": 2374 + }, + { + "epoch": 0.014124797792368447, + "grad_norm": 2.725154161453247, + "learning_rate": 4.997541120571582e-05, + "loss": 5.7473, + "step": 2375 + }, + { + "epoch": 0.014130745075649443, + "grad_norm": 2.9021074771881104, + "learning_rate": 4.9975390489677663e-05, + "loss": 6.3177, + "step": 2376 + }, + { + "epoch": 0.01413669235893044, + "grad_norm": 2.4043307304382324, + "learning_rate": 4.9975369764920866e-05, + "loss": 6.358, + "step": 2377 + }, + { + "epoch": 0.014142639642211437, + "grad_norm": 2.4163010120391846, + "learning_rate": 4.997534903144542e-05, + "loss": 6.6807, + "step": 2378 + }, + { + "epoch": 0.014148586925492436, + "grad_norm": 3.0710666179656982, + "learning_rate": 4.9975328289251335e-05, + "loss": 6.2416, + "step": 2379 + }, + { + "epoch": 0.014154534208773432, + "grad_norm": 2.159627676010132, + "learning_rate": 4.997530753833862e-05, + "loss": 7.1434, + "step": 2380 + }, + { + "epoch": 0.01416048149205443, + "grad_norm": 2.308382034301758, + "learning_rate": 4.997528677870729e-05, + "loss": 7.1243, + "step": 2381 + }, + { + "epoch": 0.014166428775335426, + "grad_norm": 2.7461323738098145, + "learning_rate": 4.997526601035734e-05, + "loss": 6.3066, + "step": 2382 + }, + { + "epoch": 0.014172376058616425, + "grad_norm": 2.8835322856903076, + "learning_rate": 4.997524523328878e-05, + "loss": 6.28, + "step": 2383 + }, + { + "epoch": 0.014178323341897421, + "grad_norm": 2.5195534229278564, + "learning_rate": 4.997522444750162e-05, + "loss": 6.9561, + "step": 2384 + }, + { + "epoch": 0.014184270625178418, + "grad_norm": 3.1697885990142822, + "learning_rate": 4.997520365299587e-05, + "loss": 6.7432, + "step": 2385 + }, + { + "epoch": 0.014190217908459415, + "grad_norm": 3.6300339698791504, + "learning_rate": 4.997518284977154e-05, + "loss": 6.3676, + "step": 2386 + }, + { + "epoch": 0.014196165191740414, + "grad_norm": 3.261981964111328, + "learning_rate": 4.9975162037828625e-05, + "loss": 6.0991, + "step": 2387 + }, + { + "epoch": 0.01420211247502141, + "grad_norm": 3.6291120052337646, + "learning_rate": 4.9975141217167146e-05, + "loss": 6.1239, + "step": 2388 + }, + { + "epoch": 0.014208059758302407, + "grad_norm": 3.192958116531372, + "learning_rate": 4.997512038778709e-05, + "loss": 6.4455, + "step": 2389 + }, + { + "epoch": 0.014214007041583404, + "grad_norm": 2.8887948989868164, + "learning_rate": 4.997509954968849e-05, + "loss": 6.9441, + "step": 2390 + }, + { + "epoch": 0.014219954324864403, + "grad_norm": 2.3568248748779297, + "learning_rate": 4.9975078702871336e-05, + "loss": 7.0207, + "step": 2391 + }, + { + "epoch": 0.0142259016081454, + "grad_norm": 2.2629294395446777, + "learning_rate": 4.997505784733564e-05, + "loss": 6.9575, + "step": 2392 + }, + { + "epoch": 0.014231848891426396, + "grad_norm": 2.5458898544311523, + "learning_rate": 4.99750369830814e-05, + "loss": 6.8533, + "step": 2393 + }, + { + "epoch": 0.014237796174707393, + "grad_norm": 2.5125060081481934, + "learning_rate": 4.997501611010865e-05, + "loss": 6.8615, + "step": 2394 + }, + { + "epoch": 0.014243743457988392, + "grad_norm": 2.9903738498687744, + "learning_rate": 4.997499522841737e-05, + "loss": 6.6927, + "step": 2395 + }, + { + "epoch": 0.014249690741269389, + "grad_norm": 2.7536470890045166, + "learning_rate": 4.997497433800758e-05, + "loss": 6.6454, + "step": 2396 + }, + { + "epoch": 0.014255638024550385, + "grad_norm": 3.5041043758392334, + "learning_rate": 4.997495343887928e-05, + "loss": 6.485, + "step": 2397 + }, + { + "epoch": 0.014261585307831382, + "grad_norm": 3.8025100231170654, + "learning_rate": 4.997493253103249e-05, + "loss": 6.3731, + "step": 2398 + }, + { + "epoch": 0.01426753259111238, + "grad_norm": 3.2657718658447266, + "learning_rate": 4.99749116144672e-05, + "loss": 6.23, + "step": 2399 + }, + { + "epoch": 0.014273479874393378, + "grad_norm": 2.721632719039917, + "learning_rate": 4.997489068918343e-05, + "loss": 6.7292, + "step": 2400 + }, + { + "epoch": 0.014279427157674374, + "grad_norm": 2.3483569622039795, + "learning_rate": 4.9974869755181186e-05, + "loss": 6.4842, + "step": 2401 + }, + { + "epoch": 0.014285374440955371, + "grad_norm": 2.4931676387786865, + "learning_rate": 4.997484881246047e-05, + "loss": 7.0529, + "step": 2402 + }, + { + "epoch": 0.014291321724236368, + "grad_norm": 2.4944825172424316, + "learning_rate": 4.99748278610213e-05, + "loss": 7.0185, + "step": 2403 + }, + { + "epoch": 0.014297269007517367, + "grad_norm": 2.9124202728271484, + "learning_rate": 4.997480690086367e-05, + "loss": 6.9847, + "step": 2404 + }, + { + "epoch": 0.014303216290798363, + "grad_norm": 2.5802674293518066, + "learning_rate": 4.997478593198759e-05, + "loss": 7.0389, + "step": 2405 + }, + { + "epoch": 0.01430916357407936, + "grad_norm": 2.636709451675415, + "learning_rate": 4.9974764954393075e-05, + "loss": 6.7281, + "step": 2406 + }, + { + "epoch": 0.014315110857360357, + "grad_norm": 3.801760196685791, + "learning_rate": 4.997474396808012e-05, + "loss": 5.9962, + "step": 2407 + }, + { + "epoch": 0.014321058140641356, + "grad_norm": 3.7983996868133545, + "learning_rate": 4.997472297304875e-05, + "loss": 6.3821, + "step": 2408 + }, + { + "epoch": 0.014327005423922352, + "grad_norm": 2.863408088684082, + "learning_rate": 4.997470196929895e-05, + "loss": 6.2206, + "step": 2409 + }, + { + "epoch": 0.01433295270720335, + "grad_norm": 2.6187095642089844, + "learning_rate": 4.997468095683076e-05, + "loss": 6.2205, + "step": 2410 + }, + { + "epoch": 0.014338899990484346, + "grad_norm": 3.202986240386963, + "learning_rate": 4.997465993564414e-05, + "loss": 6.259, + "step": 2411 + }, + { + "epoch": 0.014344847273765345, + "grad_norm": 2.9131264686584473, + "learning_rate": 4.9974638905739146e-05, + "loss": 6.4159, + "step": 2412 + }, + { + "epoch": 0.014350794557046341, + "grad_norm": 2.384477376937866, + "learning_rate": 4.9974617867115754e-05, + "loss": 6.6669, + "step": 2413 + }, + { + "epoch": 0.014356741840327338, + "grad_norm": 2.448495626449585, + "learning_rate": 4.997459681977398e-05, + "loss": 6.5679, + "step": 2414 + }, + { + "epoch": 0.014362689123608335, + "grad_norm": 2.1945343017578125, + "learning_rate": 4.997457576371384e-05, + "loss": 6.3856, + "step": 2415 + }, + { + "epoch": 0.014368636406889334, + "grad_norm": 1.867848515510559, + "learning_rate": 4.997455469893533e-05, + "loss": 6.3127, + "step": 2416 + }, + { + "epoch": 0.01437458369017033, + "grad_norm": 2.560976266860962, + "learning_rate": 4.997453362543846e-05, + "loss": 6.4619, + "step": 2417 + }, + { + "epoch": 0.014380530973451327, + "grad_norm": 3.2440431118011475, + "learning_rate": 4.997451254322323e-05, + "loss": 6.399, + "step": 2418 + }, + { + "epoch": 0.014386478256732324, + "grad_norm": 3.0021307468414307, + "learning_rate": 4.9974491452289664e-05, + "loss": 6.174, + "step": 2419 + }, + { + "epoch": 0.014392425540013323, + "grad_norm": 2.6046524047851562, + "learning_rate": 4.997447035263776e-05, + "loss": 6.8284, + "step": 2420 + }, + { + "epoch": 0.01439837282329432, + "grad_norm": 3.1395344734191895, + "learning_rate": 4.997444924426753e-05, + "loss": 6.3395, + "step": 2421 + }, + { + "epoch": 0.014404320106575316, + "grad_norm": 3.056152582168579, + "learning_rate": 4.997442812717897e-05, + "loss": 6.3468, + "step": 2422 + }, + { + "epoch": 0.014410267389856313, + "grad_norm": 2.2532267570495605, + "learning_rate": 4.9974407001372105e-05, + "loss": 6.5187, + "step": 2423 + }, + { + "epoch": 0.014416214673137312, + "grad_norm": 2.0228383541107178, + "learning_rate": 4.997438586684693e-05, + "loss": 6.4452, + "step": 2424 + }, + { + "epoch": 0.014422161956418308, + "grad_norm": 3.2889909744262695, + "learning_rate": 4.997436472360345e-05, + "loss": 6.6466, + "step": 2425 + }, + { + "epoch": 0.014428109239699305, + "grad_norm": 2.957916498184204, + "learning_rate": 4.9974343571641677e-05, + "loss": 6.9617, + "step": 2426 + }, + { + "epoch": 0.014434056522980302, + "grad_norm": 2.7629241943359375, + "learning_rate": 4.997432241096162e-05, + "loss": 6.1687, + "step": 2427 + }, + { + "epoch": 0.0144400038062613, + "grad_norm": 2.849297285079956, + "learning_rate": 4.997430124156329e-05, + "loss": 6.4647, + "step": 2428 + }, + { + "epoch": 0.014445951089542297, + "grad_norm": 2.2432122230529785, + "learning_rate": 4.997428006344669e-05, + "loss": 7.1739, + "step": 2429 + }, + { + "epoch": 0.014451898372823294, + "grad_norm": 2.814807891845703, + "learning_rate": 4.997425887661181e-05, + "loss": 5.945, + "step": 2430 + }, + { + "epoch": 0.014457845656104291, + "grad_norm": 3.140153646469116, + "learning_rate": 4.997423768105869e-05, + "loss": 6.5948, + "step": 2431 + }, + { + "epoch": 0.01446379293938529, + "grad_norm": 2.5276620388031006, + "learning_rate": 4.997421647678732e-05, + "loss": 6.9813, + "step": 2432 + }, + { + "epoch": 0.014469740222666286, + "grad_norm": 2.462204694747925, + "learning_rate": 4.9974195263797705e-05, + "loss": 6.8987, + "step": 2433 + }, + { + "epoch": 0.014475687505947283, + "grad_norm": 3.117255210876465, + "learning_rate": 4.997417404208986e-05, + "loss": 5.883, + "step": 2434 + }, + { + "epoch": 0.01448163478922828, + "grad_norm": 2.6207518577575684, + "learning_rate": 4.997415281166379e-05, + "loss": 6.8065, + "step": 2435 + }, + { + "epoch": 0.014487582072509277, + "grad_norm": 2.996624231338501, + "learning_rate": 4.99741315725195e-05, + "loss": 6.5162, + "step": 2436 + }, + { + "epoch": 0.014493529355790276, + "grad_norm": 2.1946496963500977, + "learning_rate": 4.9974110324656996e-05, + "loss": 6.9521, + "step": 2437 + }, + { + "epoch": 0.014499476639071272, + "grad_norm": 2.273017406463623, + "learning_rate": 4.997408906807629e-05, + "loss": 7.0144, + "step": 2438 + }, + { + "epoch": 0.01450542392235227, + "grad_norm": 2.516509771347046, + "learning_rate": 4.997406780277739e-05, + "loss": 7.013, + "step": 2439 + }, + { + "epoch": 0.014511371205633266, + "grad_norm": 3.0296435356140137, + "learning_rate": 4.9974046528760296e-05, + "loss": 6.934, + "step": 2440 + }, + { + "epoch": 0.014517318488914265, + "grad_norm": 2.6135010719299316, + "learning_rate": 4.9974025246025024e-05, + "loss": 6.7151, + "step": 2441 + }, + { + "epoch": 0.014523265772195261, + "grad_norm": 2.6850788593292236, + "learning_rate": 4.997400395457158e-05, + "loss": 6.5223, + "step": 2442 + }, + { + "epoch": 0.014529213055476258, + "grad_norm": 3.0401692390441895, + "learning_rate": 4.9973982654399966e-05, + "loss": 7.2006, + "step": 2443 + }, + { + "epoch": 0.014535160338757255, + "grad_norm": 3.016805410385132, + "learning_rate": 4.997396134551019e-05, + "loss": 7.0633, + "step": 2444 + }, + { + "epoch": 0.014541107622038254, + "grad_norm": 3.107154130935669, + "learning_rate": 4.9973940027902264e-05, + "loss": 6.9096, + "step": 2445 + }, + { + "epoch": 0.01454705490531925, + "grad_norm": 2.720054864883423, + "learning_rate": 4.9973918701576196e-05, + "loss": 6.7061, + "step": 2446 + }, + { + "epoch": 0.014553002188600247, + "grad_norm": 2.386401414871216, + "learning_rate": 4.9973897366531984e-05, + "loss": 6.5877, + "step": 2447 + }, + { + "epoch": 0.014558949471881244, + "grad_norm": 2.488243579864502, + "learning_rate": 4.997387602276965e-05, + "loss": 6.7792, + "step": 2448 + }, + { + "epoch": 0.014564896755162243, + "grad_norm": 2.7504360675811768, + "learning_rate": 4.9973854670289196e-05, + "loss": 6.6164, + "step": 2449 + }, + { + "epoch": 0.01457084403844324, + "grad_norm": 3.001441240310669, + "learning_rate": 4.9973833309090626e-05, + "loss": 6.5933, + "step": 2450 + }, + { + "epoch": 0.014576791321724236, + "grad_norm": 2.6449999809265137, + "learning_rate": 4.997381193917394e-05, + "loss": 6.5323, + "step": 2451 + }, + { + "epoch": 0.014582738605005233, + "grad_norm": 2.81846022605896, + "learning_rate": 4.9973790560539156e-05, + "loss": 6.5146, + "step": 2452 + }, + { + "epoch": 0.014588685888286232, + "grad_norm": 2.662916421890259, + "learning_rate": 4.997376917318629e-05, + "loss": 6.161, + "step": 2453 + }, + { + "epoch": 0.014594633171567228, + "grad_norm": 2.689601421356201, + "learning_rate": 4.997374777711533e-05, + "loss": 6.2008, + "step": 2454 + }, + { + "epoch": 0.014600580454848225, + "grad_norm": 2.6690561771392822, + "learning_rate": 4.99737263723263e-05, + "loss": 6.4418, + "step": 2455 + }, + { + "epoch": 0.014606527738129222, + "grad_norm": 2.897270917892456, + "learning_rate": 4.997370495881919e-05, + "loss": 6.3968, + "step": 2456 + }, + { + "epoch": 0.01461247502141022, + "grad_norm": 2.9327831268310547, + "learning_rate": 4.997368353659402e-05, + "loss": 6.4665, + "step": 2457 + }, + { + "epoch": 0.014618422304691217, + "grad_norm": 2.658013343811035, + "learning_rate": 4.99736621056508e-05, + "loss": 6.399, + "step": 2458 + }, + { + "epoch": 0.014624369587972214, + "grad_norm": 2.6055238246917725, + "learning_rate": 4.997364066598953e-05, + "loss": 6.4679, + "step": 2459 + }, + { + "epoch": 0.014630316871253211, + "grad_norm": 3.0595951080322266, + "learning_rate": 4.997361921761022e-05, + "loss": 5.8797, + "step": 2460 + }, + { + "epoch": 0.01463626415453421, + "grad_norm": 2.994694471359253, + "learning_rate": 4.997359776051288e-05, + "loss": 5.704, + "step": 2461 + }, + { + "epoch": 0.014642211437815206, + "grad_norm": 2.78153657913208, + "learning_rate": 4.9973576294697514e-05, + "loss": 5.7289, + "step": 2462 + }, + { + "epoch": 0.014648158721096203, + "grad_norm": 2.5119385719299316, + "learning_rate": 4.997355482016414e-05, + "loss": 5.5494, + "step": 2463 + }, + { + "epoch": 0.0146541060043772, + "grad_norm": 2.7880990505218506, + "learning_rate": 4.997353333691274e-05, + "loss": 5.5905, + "step": 2464 + }, + { + "epoch": 0.014660053287658197, + "grad_norm": 2.827352523803711, + "learning_rate": 4.9973511844943346e-05, + "loss": 6.4429, + "step": 2465 + }, + { + "epoch": 0.014666000570939195, + "grad_norm": 2.4297358989715576, + "learning_rate": 4.997349034425595e-05, + "loss": 6.8647, + "step": 2466 + }, + { + "epoch": 0.014671947854220192, + "grad_norm": 2.649064064025879, + "learning_rate": 4.997346883485057e-05, + "loss": 6.5568, + "step": 2467 + }, + { + "epoch": 0.014677895137501189, + "grad_norm": 3.2215452194213867, + "learning_rate": 4.9973447316727215e-05, + "loss": 5.5684, + "step": 2468 + }, + { + "epoch": 0.014683842420782186, + "grad_norm": 2.8760056495666504, + "learning_rate": 4.9973425789885884e-05, + "loss": 5.6395, + "step": 2469 + }, + { + "epoch": 0.014689789704063184, + "grad_norm": 2.4002890586853027, + "learning_rate": 4.9973404254326585e-05, + "loss": 5.9525, + "step": 2470 + }, + { + "epoch": 0.014695736987344181, + "grad_norm": 2.32314395904541, + "learning_rate": 4.997338271004933e-05, + "loss": 6.9675, + "step": 2471 + }, + { + "epoch": 0.014701684270625178, + "grad_norm": 2.262680768966675, + "learning_rate": 4.997336115705413e-05, + "loss": 7.1361, + "step": 2472 + }, + { + "epoch": 0.014707631553906175, + "grad_norm": 2.2855215072631836, + "learning_rate": 4.997333959534098e-05, + "loss": 7.1141, + "step": 2473 + }, + { + "epoch": 0.014713578837187173, + "grad_norm": 2.5461738109588623, + "learning_rate": 4.99733180249099e-05, + "loss": 7.0492, + "step": 2474 + }, + { + "epoch": 0.01471952612046817, + "grad_norm": 2.455561399459839, + "learning_rate": 4.99732964457609e-05, + "loss": 6.9303, + "step": 2475 + }, + { + "epoch": 0.014725473403749167, + "grad_norm": 3.3767740726470947, + "learning_rate": 4.997327485789397e-05, + "loss": 6.8531, + "step": 2476 + }, + { + "epoch": 0.014731420687030164, + "grad_norm": 2.9320104122161865, + "learning_rate": 4.9973253261309125e-05, + "loss": 6.9258, + "step": 2477 + }, + { + "epoch": 0.014737367970311162, + "grad_norm": 2.380960464477539, + "learning_rate": 4.997323165600638e-05, + "loss": 6.8581, + "step": 2478 + }, + { + "epoch": 0.01474331525359216, + "grad_norm": 2.727154016494751, + "learning_rate": 4.997321004198574e-05, + "loss": 7.3814, + "step": 2479 + }, + { + "epoch": 0.014749262536873156, + "grad_norm": 2.8693020343780518, + "learning_rate": 4.997318841924721e-05, + "loss": 6.3793, + "step": 2480 + }, + { + "epoch": 0.014755209820154153, + "grad_norm": 2.941622734069824, + "learning_rate": 4.997316678779079e-05, + "loss": 7.3567, + "step": 2481 + }, + { + "epoch": 0.014761157103435152, + "grad_norm": 3.0310213565826416, + "learning_rate": 4.9973145147616505e-05, + "loss": 6.8832, + "step": 2482 + }, + { + "epoch": 0.014767104386716148, + "grad_norm": 1.9184696674346924, + "learning_rate": 4.9973123498724353e-05, + "loss": 6.7369, + "step": 2483 + }, + { + "epoch": 0.014773051669997145, + "grad_norm": 2.3090195655822754, + "learning_rate": 4.9973101841114335e-05, + "loss": 6.8927, + "step": 2484 + }, + { + "epoch": 0.014778998953278142, + "grad_norm": 2.2947685718536377, + "learning_rate": 4.997308017478647e-05, + "loss": 6.9441, + "step": 2485 + }, + { + "epoch": 0.01478494623655914, + "grad_norm": 2.363690137863159, + "learning_rate": 4.997305849974076e-05, + "loss": 6.9397, + "step": 2486 + }, + { + "epoch": 0.014790893519840137, + "grad_norm": 1.7546948194503784, + "learning_rate": 4.997303681597721e-05, + "loss": 6.7888, + "step": 2487 + }, + { + "epoch": 0.014796840803121134, + "grad_norm": 1.8824211359024048, + "learning_rate": 4.997301512349584e-05, + "loss": 6.6486, + "step": 2488 + }, + { + "epoch": 0.014802788086402131, + "grad_norm": 3.68865704536438, + "learning_rate": 4.9972993422296636e-05, + "loss": 7.0318, + "step": 2489 + }, + { + "epoch": 0.01480873536968313, + "grad_norm": 3.0788486003875732, + "learning_rate": 4.997297171237962e-05, + "loss": 6.814, + "step": 2490 + }, + { + "epoch": 0.014814682652964126, + "grad_norm": 2.6903607845306396, + "learning_rate": 4.997294999374481e-05, + "loss": 6.9752, + "step": 2491 + }, + { + "epoch": 0.014820629936245123, + "grad_norm": 2.6673712730407715, + "learning_rate": 4.9972928266392194e-05, + "loss": 6.9083, + "step": 2492 + }, + { + "epoch": 0.01482657721952612, + "grad_norm": 2.335632801055908, + "learning_rate": 4.9972906530321786e-05, + "loss": 7.027, + "step": 2493 + }, + { + "epoch": 0.014832524502807119, + "grad_norm": 3.2885966300964355, + "learning_rate": 4.997288478553359e-05, + "loss": 6.6551, + "step": 2494 + }, + { + "epoch": 0.014838471786088115, + "grad_norm": 2.7297918796539307, + "learning_rate": 4.997286303202762e-05, + "loss": 6.7345, + "step": 2495 + }, + { + "epoch": 0.014844419069369112, + "grad_norm": 2.640814781188965, + "learning_rate": 4.997284126980388e-05, + "loss": 6.743, + "step": 2496 + }, + { + "epoch": 0.014850366352650109, + "grad_norm": 2.699632167816162, + "learning_rate": 4.997281949886239e-05, + "loss": 6.4633, + "step": 2497 + }, + { + "epoch": 0.014856313635931106, + "grad_norm": 2.5185790061950684, + "learning_rate": 4.9972797719203135e-05, + "loss": 6.5496, + "step": 2498 + }, + { + "epoch": 0.014862260919212104, + "grad_norm": 2.659393548965454, + "learning_rate": 4.9972775930826144e-05, + "loss": 6.5066, + "step": 2499 + }, + { + "epoch": 0.014868208202493101, + "grad_norm": 2.160808563232422, + "learning_rate": 4.99727541337314e-05, + "loss": 6.9851, + "step": 2500 + }, + { + "epoch": 0.014874155485774098, + "grad_norm": 2.656506299972534, + "learning_rate": 4.997273232791894e-05, + "loss": 7.5696, + "step": 2501 + }, + { + "epoch": 0.014880102769055095, + "grad_norm": 2.490612506866455, + "learning_rate": 4.9972710513388754e-05, + "loss": 7.2623, + "step": 2502 + }, + { + "epoch": 0.014886050052336093, + "grad_norm": 2.1744866371154785, + "learning_rate": 4.997268869014085e-05, + "loss": 6.5208, + "step": 2503 + }, + { + "epoch": 0.01489199733561709, + "grad_norm": 2.8058252334594727, + "learning_rate": 4.9972666858175236e-05, + "loss": 6.1527, + "step": 2504 + }, + { + "epoch": 0.014897944618898087, + "grad_norm": 2.418827533721924, + "learning_rate": 4.997264501749193e-05, + "loss": 6.2244, + "step": 2505 + }, + { + "epoch": 0.014903891902179084, + "grad_norm": 2.499648332595825, + "learning_rate": 4.997262316809092e-05, + "loss": 6.8904, + "step": 2506 + }, + { + "epoch": 0.014909839185460082, + "grad_norm": 2.3598594665527344, + "learning_rate": 4.9972601309972235e-05, + "loss": 7.0794, + "step": 2507 + }, + { + "epoch": 0.01491578646874108, + "grad_norm": 2.2443082332611084, + "learning_rate": 4.997257944313587e-05, + "loss": 7.3078, + "step": 2508 + }, + { + "epoch": 0.014921733752022076, + "grad_norm": 2.407501459121704, + "learning_rate": 4.9972557567581835e-05, + "loss": 7.0677, + "step": 2509 + }, + { + "epoch": 0.014927681035303073, + "grad_norm": 2.060865640640259, + "learning_rate": 4.997253568331014e-05, + "loss": 6.7128, + "step": 2510 + }, + { + "epoch": 0.014933628318584071, + "grad_norm": 2.3876516819000244, + "learning_rate": 4.997251379032078e-05, + "loss": 6.7562, + "step": 2511 + }, + { + "epoch": 0.014939575601865068, + "grad_norm": 2.387176990509033, + "learning_rate": 4.997249188861379e-05, + "loss": 6.8237, + "step": 2512 + }, + { + "epoch": 0.014945522885146065, + "grad_norm": 2.7324886322021484, + "learning_rate": 4.997246997818915e-05, + "loss": 6.8963, + "step": 2513 + }, + { + "epoch": 0.014951470168427062, + "grad_norm": 2.3832128047943115, + "learning_rate": 4.997244805904689e-05, + "loss": 6.9467, + "step": 2514 + }, + { + "epoch": 0.01495741745170806, + "grad_norm": 1.8594162464141846, + "learning_rate": 4.9972426131187e-05, + "loss": 7.0712, + "step": 2515 + }, + { + "epoch": 0.014963364734989057, + "grad_norm": 2.322068691253662, + "learning_rate": 4.997240419460949e-05, + "loss": 6.8898, + "step": 2516 + }, + { + "epoch": 0.014969312018270054, + "grad_norm": 2.4850032329559326, + "learning_rate": 4.997238224931438e-05, + "loss": 6.5439, + "step": 2517 + }, + { + "epoch": 0.014975259301551051, + "grad_norm": 2.919579029083252, + "learning_rate": 4.997236029530166e-05, + "loss": 6.3987, + "step": 2518 + }, + { + "epoch": 0.01498120658483205, + "grad_norm": 2.651900053024292, + "learning_rate": 4.997233833257135e-05, + "loss": 6.2735, + "step": 2519 + }, + { + "epoch": 0.014987153868113046, + "grad_norm": 2.7912142276763916, + "learning_rate": 4.997231636112346e-05, + "loss": 6.9835, + "step": 2520 + }, + { + "epoch": 0.014993101151394043, + "grad_norm": 2.5735433101654053, + "learning_rate": 4.997229438095799e-05, + "loss": 7.1218, + "step": 2521 + }, + { + "epoch": 0.01499904843467504, + "grad_norm": 2.483186721801758, + "learning_rate": 4.997227239207494e-05, + "loss": 7.0343, + "step": 2522 + }, + { + "epoch": 0.015004995717956039, + "grad_norm": 2.9296681880950928, + "learning_rate": 4.997225039447434e-05, + "loss": 6.5455, + "step": 2523 + }, + { + "epoch": 0.015010943001237035, + "grad_norm": 2.5536422729492188, + "learning_rate": 4.997222838815618e-05, + "loss": 6.7173, + "step": 2524 + }, + { + "epoch": 0.015016890284518032, + "grad_norm": 6.365324020385742, + "learning_rate": 4.997220637312047e-05, + "loss": 6.0909, + "step": 2525 + }, + { + "epoch": 0.015022837567799029, + "grad_norm": 3.7258150577545166, + "learning_rate": 4.997218434936723e-05, + "loss": 5.9019, + "step": 2526 + }, + { + "epoch": 0.015028784851080026, + "grad_norm": 2.9021997451782227, + "learning_rate": 4.997216231689645e-05, + "loss": 5.8601, + "step": 2527 + }, + { + "epoch": 0.015034732134361024, + "grad_norm": 2.570988416671753, + "learning_rate": 4.997214027570815e-05, + "loss": 6.1513, + "step": 2528 + }, + { + "epoch": 0.015040679417642021, + "grad_norm": 3.013540029525757, + "learning_rate": 4.997211822580233e-05, + "loss": 6.6471, + "step": 2529 + }, + { + "epoch": 0.015046626700923018, + "grad_norm": 2.612210750579834, + "learning_rate": 4.997209616717901e-05, + "loss": 6.5523, + "step": 2530 + }, + { + "epoch": 0.015052573984204015, + "grad_norm": 2.93513822555542, + "learning_rate": 4.9972074099838186e-05, + "loss": 6.1845, + "step": 2531 + }, + { + "epoch": 0.015058521267485013, + "grad_norm": 3.569002389907837, + "learning_rate": 4.9972052023779865e-05, + "loss": 6.7383, + "step": 2532 + }, + { + "epoch": 0.01506446855076601, + "grad_norm": 2.560023784637451, + "learning_rate": 4.9972029939004064e-05, + "loss": 6.4978, + "step": 2533 + }, + { + "epoch": 0.015070415834047007, + "grad_norm": 2.304612398147583, + "learning_rate": 4.997200784551078e-05, + "loss": 6.3316, + "step": 2534 + }, + { + "epoch": 0.015076363117328004, + "grad_norm": 2.4442996978759766, + "learning_rate": 4.997198574330003e-05, + "loss": 6.4245, + "step": 2535 + }, + { + "epoch": 0.015082310400609002, + "grad_norm": 2.764831304550171, + "learning_rate": 4.997196363237181e-05, + "loss": 6.2251, + "step": 2536 + }, + { + "epoch": 0.01508825768389, + "grad_norm": 2.6534347534179688, + "learning_rate": 4.997194151272615e-05, + "loss": 6.6674, + "step": 2537 + }, + { + "epoch": 0.015094204967170996, + "grad_norm": 2.5901331901550293, + "learning_rate": 4.997191938436303e-05, + "loss": 6.5724, + "step": 2538 + }, + { + "epoch": 0.015100152250451993, + "grad_norm": 2.6827733516693115, + "learning_rate": 4.9971897247282474e-05, + "loss": 6.4774, + "step": 2539 + }, + { + "epoch": 0.015106099533732991, + "grad_norm": 2.087397813796997, + "learning_rate": 4.997187510148449e-05, + "loss": 6.5011, + "step": 2540 + }, + { + "epoch": 0.015112046817013988, + "grad_norm": 2.157935619354248, + "learning_rate": 4.9971852946969076e-05, + "loss": 6.3258, + "step": 2541 + }, + { + "epoch": 0.015117994100294985, + "grad_norm": 2.680481195449829, + "learning_rate": 4.997183078373625e-05, + "loss": 6.5631, + "step": 2542 + }, + { + "epoch": 0.015123941383575982, + "grad_norm": 2.897608995437622, + "learning_rate": 4.997180861178602e-05, + "loss": 6.7913, + "step": 2543 + }, + { + "epoch": 0.01512988866685698, + "grad_norm": 2.5714452266693115, + "learning_rate": 4.997178643111838e-05, + "loss": 6.767, + "step": 2544 + }, + { + "epoch": 0.015135835950137977, + "grad_norm": 2.096376419067383, + "learning_rate": 4.997176424173336e-05, + "loss": 6.7365, + "step": 2545 + }, + { + "epoch": 0.015141783233418974, + "grad_norm": 2.083101987838745, + "learning_rate": 4.9971742043630955e-05, + "loss": 6.4693, + "step": 2546 + }, + { + "epoch": 0.015147730516699971, + "grad_norm": 3.509512186050415, + "learning_rate": 4.997171983681116e-05, + "loss": 6.4068, + "step": 2547 + }, + { + "epoch": 0.01515367779998097, + "grad_norm": 3.055772304534912, + "learning_rate": 4.997169762127401e-05, + "loss": 6.3411, + "step": 2548 + }, + { + "epoch": 0.015159625083261966, + "grad_norm": 2.627429485321045, + "learning_rate": 4.997167539701949e-05, + "loss": 6.3788, + "step": 2549 + }, + { + "epoch": 0.015165572366542963, + "grad_norm": 2.408599853515625, + "learning_rate": 4.997165316404761e-05, + "loss": 6.2822, + "step": 2550 + }, + { + "epoch": 0.01517151964982396, + "grad_norm": 2.906006336212158, + "learning_rate": 4.997163092235839e-05, + "loss": 6.2615, + "step": 2551 + }, + { + "epoch": 0.015177466933104958, + "grad_norm": 2.4585347175598145, + "learning_rate": 4.997160867195183e-05, + "loss": 6.4076, + "step": 2552 + }, + { + "epoch": 0.015183414216385955, + "grad_norm": 2.495539665222168, + "learning_rate": 4.9971586412827944e-05, + "loss": 6.4893, + "step": 2553 + }, + { + "epoch": 0.015189361499666952, + "grad_norm": 2.719583034515381, + "learning_rate": 4.9971564144986734e-05, + "loss": 6.276, + "step": 2554 + }, + { + "epoch": 0.015195308782947949, + "grad_norm": 2.464207887649536, + "learning_rate": 4.9971541868428206e-05, + "loss": 6.2713, + "step": 2555 + }, + { + "epoch": 0.015201256066228947, + "grad_norm": 2.3604822158813477, + "learning_rate": 4.997151958315237e-05, + "loss": 6.2648, + "step": 2556 + }, + { + "epoch": 0.015207203349509944, + "grad_norm": 2.729820966720581, + "learning_rate": 4.997149728915924e-05, + "loss": 6.2985, + "step": 2557 + }, + { + "epoch": 0.015213150632790941, + "grad_norm": 2.565760612487793, + "learning_rate": 4.997147498644882e-05, + "loss": 6.401, + "step": 2558 + }, + { + "epoch": 0.015219097916071938, + "grad_norm": 3.091628074645996, + "learning_rate": 4.9971452675021104e-05, + "loss": 6.1774, + "step": 2559 + }, + { + "epoch": 0.015225045199352935, + "grad_norm": 2.452453851699829, + "learning_rate": 4.9971430354876125e-05, + "loss": 6.4669, + "step": 2560 + }, + { + "epoch": 0.015230992482633933, + "grad_norm": 2.4285218715667725, + "learning_rate": 4.997140802601387e-05, + "loss": 6.4086, + "step": 2561 + }, + { + "epoch": 0.01523693976591493, + "grad_norm": 2.094043254852295, + "learning_rate": 4.9971385688434356e-05, + "loss": 6.2502, + "step": 2562 + }, + { + "epoch": 0.015242887049195927, + "grad_norm": 2.5989573001861572, + "learning_rate": 4.9971363342137586e-05, + "loss": 6.2948, + "step": 2563 + }, + { + "epoch": 0.015248834332476924, + "grad_norm": 2.5372314453125, + "learning_rate": 4.9971340987123574e-05, + "loss": 6.5643, + "step": 2564 + }, + { + "epoch": 0.015254781615757922, + "grad_norm": 2.3666064739227295, + "learning_rate": 4.9971318623392325e-05, + "loss": 6.4807, + "step": 2565 + }, + { + "epoch": 0.01526072889903892, + "grad_norm": 2.3216497898101807, + "learning_rate": 4.997129625094385e-05, + "loss": 6.448, + "step": 2566 + }, + { + "epoch": 0.015266676182319916, + "grad_norm": 2.202665090560913, + "learning_rate": 4.9971273869778153e-05, + "loss": 6.3766, + "step": 2567 + }, + { + "epoch": 0.015272623465600913, + "grad_norm": 2.5678982734680176, + "learning_rate": 4.997125147989524e-05, + "loss": 6.0799, + "step": 2568 + }, + { + "epoch": 0.015278570748881911, + "grad_norm": 2.7904717922210693, + "learning_rate": 4.997122908129512e-05, + "loss": 6.3446, + "step": 2569 + }, + { + "epoch": 0.015284518032162908, + "grad_norm": 2.383120059967041, + "learning_rate": 4.99712066739778e-05, + "loss": 6.2398, + "step": 2570 + }, + { + "epoch": 0.015290465315443905, + "grad_norm": 2.4302077293395996, + "learning_rate": 4.9971184257943294e-05, + "loss": 6.2678, + "step": 2571 + }, + { + "epoch": 0.015296412598724902, + "grad_norm": 2.2923178672790527, + "learning_rate": 4.99711618331916e-05, + "loss": 6.4742, + "step": 2572 + }, + { + "epoch": 0.0153023598820059, + "grad_norm": 2.582810878753662, + "learning_rate": 4.9971139399722735e-05, + "loss": 6.4679, + "step": 2573 + }, + { + "epoch": 0.015308307165286897, + "grad_norm": 2.718228578567505, + "learning_rate": 4.997111695753671e-05, + "loss": 6.2475, + "step": 2574 + }, + { + "epoch": 0.015314254448567894, + "grad_norm": 2.4639811515808105, + "learning_rate": 4.997109450663352e-05, + "loss": 6.463, + "step": 2575 + }, + { + "epoch": 0.01532020173184889, + "grad_norm": 2.6998252868652344, + "learning_rate": 4.997107204701318e-05, + "loss": 6.2885, + "step": 2576 + }, + { + "epoch": 0.01532614901512989, + "grad_norm": 2.831291437149048, + "learning_rate": 4.997104957867569e-05, + "loss": 6.2056, + "step": 2577 + }, + { + "epoch": 0.015332096298410886, + "grad_norm": 2.9070980548858643, + "learning_rate": 4.997102710162107e-05, + "loss": 6.3247, + "step": 2578 + }, + { + "epoch": 0.015338043581691883, + "grad_norm": 2.2583134174346924, + "learning_rate": 4.997100461584933e-05, + "loss": 6.3241, + "step": 2579 + }, + { + "epoch": 0.01534399086497288, + "grad_norm": 2.1661887168884277, + "learning_rate": 4.997098212136045e-05, + "loss": 6.173, + "step": 2580 + }, + { + "epoch": 0.015349938148253878, + "grad_norm": 2.146256446838379, + "learning_rate": 4.997095961815448e-05, + "loss": 6.2267, + "step": 2581 + }, + { + "epoch": 0.015355885431534875, + "grad_norm": 2.5691211223602295, + "learning_rate": 4.997093710623139e-05, + "loss": 6.3302, + "step": 2582 + }, + { + "epoch": 0.015361832714815872, + "grad_norm": 2.5439505577087402, + "learning_rate": 4.997091458559121e-05, + "loss": 6.2111, + "step": 2583 + }, + { + "epoch": 0.015367779998096869, + "grad_norm": 2.451582670211792, + "learning_rate": 4.997089205623394e-05, + "loss": 6.2369, + "step": 2584 + }, + { + "epoch": 0.015373727281377867, + "grad_norm": 2.6275687217712402, + "learning_rate": 4.99708695181596e-05, + "loss": 6.1104, + "step": 2585 + }, + { + "epoch": 0.015379674564658864, + "grad_norm": 2.7068562507629395, + "learning_rate": 4.997084697136818e-05, + "loss": 6.1646, + "step": 2586 + }, + { + "epoch": 0.015385621847939861, + "grad_norm": 2.7819957733154297, + "learning_rate": 4.9970824415859694e-05, + "loss": 6.4203, + "step": 2587 + }, + { + "epoch": 0.015391569131220858, + "grad_norm": 2.7021708488464355, + "learning_rate": 4.9970801851634154e-05, + "loss": 6.1535, + "step": 2588 + }, + { + "epoch": 0.015397516414501855, + "grad_norm": 2.50740909576416, + "learning_rate": 4.997077927869156e-05, + "loss": 6.0139, + "step": 2589 + }, + { + "epoch": 0.015403463697782853, + "grad_norm": 2.5769078731536865, + "learning_rate": 4.997075669703193e-05, + "loss": 6.129, + "step": 2590 + }, + { + "epoch": 0.01540941098106385, + "grad_norm": 2.7379090785980225, + "learning_rate": 4.997073410665526e-05, + "loss": 6.4168, + "step": 2591 + }, + { + "epoch": 0.015415358264344847, + "grad_norm": 2.3530659675598145, + "learning_rate": 4.9970711507561565e-05, + "loss": 6.3114, + "step": 2592 + }, + { + "epoch": 0.015421305547625844, + "grad_norm": 2.6025893688201904, + "learning_rate": 4.997068889975086e-05, + "loss": 6.2506, + "step": 2593 + }, + { + "epoch": 0.015427252830906842, + "grad_norm": 2.311833143234253, + "learning_rate": 4.9970666283223145e-05, + "loss": 6.3372, + "step": 2594 + }, + { + "epoch": 0.015433200114187839, + "grad_norm": 2.339947462081909, + "learning_rate": 4.997064365797842e-05, + "loss": 6.2987, + "step": 2595 + }, + { + "epoch": 0.015439147397468836, + "grad_norm": 2.2132725715637207, + "learning_rate": 4.9970621024016714e-05, + "loss": 6.2473, + "step": 2596 + }, + { + "epoch": 0.015445094680749833, + "grad_norm": 2.7063987255096436, + "learning_rate": 4.9970598381338014e-05, + "loss": 6.1702, + "step": 2597 + }, + { + "epoch": 0.015451041964030831, + "grad_norm": 2.4952430725097656, + "learning_rate": 4.9970575729942335e-05, + "loss": 6.3301, + "step": 2598 + }, + { + "epoch": 0.015456989247311828, + "grad_norm": 2.7442502975463867, + "learning_rate": 4.997055306982969e-05, + "loss": 6.1922, + "step": 2599 + }, + { + "epoch": 0.015462936530592825, + "grad_norm": 2.860058069229126, + "learning_rate": 4.997053040100008e-05, + "loss": 6.0674, + "step": 2600 + }, + { + "epoch": 0.015468883813873822, + "grad_norm": 2.821620464324951, + "learning_rate": 4.997050772345352e-05, + "loss": 6.0445, + "step": 2601 + }, + { + "epoch": 0.01547483109715482, + "grad_norm": 2.369174003601074, + "learning_rate": 4.997048503719001e-05, + "loss": 5.8641, + "step": 2602 + }, + { + "epoch": 0.015480778380435817, + "grad_norm": 2.2836029529571533, + "learning_rate": 4.997046234220956e-05, + "loss": 5.7629, + "step": 2603 + }, + { + "epoch": 0.015486725663716814, + "grad_norm": 3.13094162940979, + "learning_rate": 4.997043963851218e-05, + "loss": 6.7871, + "step": 2604 + }, + { + "epoch": 0.01549267294699781, + "grad_norm": 2.884119749069214, + "learning_rate": 4.9970416926097885e-05, + "loss": 6.1079, + "step": 2605 + }, + { + "epoch": 0.01549862023027881, + "grad_norm": 3.0921716690063477, + "learning_rate": 4.997039420496666e-05, + "loss": 5.9221, + "step": 2606 + }, + { + "epoch": 0.015504567513559806, + "grad_norm": 2.6903741359710693, + "learning_rate": 4.997037147511855e-05, + "loss": 5.7377, + "step": 2607 + }, + { + "epoch": 0.015510514796840803, + "grad_norm": 2.177030086517334, + "learning_rate": 4.997034873655352e-05, + "loss": 5.7272, + "step": 2608 + }, + { + "epoch": 0.0155164620801218, + "grad_norm": 2.41406512260437, + "learning_rate": 4.997032598927162e-05, + "loss": 5.6456, + "step": 2609 + }, + { + "epoch": 0.015522409363402798, + "grad_norm": 2.6853182315826416, + "learning_rate": 4.997030323327282e-05, + "loss": 6.1634, + "step": 2610 + }, + { + "epoch": 0.015528356646683795, + "grad_norm": 2.734081983566284, + "learning_rate": 4.997028046855715e-05, + "loss": 6.1366, + "step": 2611 + }, + { + "epoch": 0.015534303929964792, + "grad_norm": 2.234046459197998, + "learning_rate": 4.997025769512461e-05, + "loss": 5.6773, + "step": 2612 + }, + { + "epoch": 0.015540251213245789, + "grad_norm": 2.467381715774536, + "learning_rate": 4.9970234912975226e-05, + "loss": 5.6409, + "step": 2613 + }, + { + "epoch": 0.015546198496526787, + "grad_norm": 2.4890551567077637, + "learning_rate": 4.997021212210897e-05, + "loss": 5.5961, + "step": 2614 + }, + { + "epoch": 0.015552145779807784, + "grad_norm": 2.254138708114624, + "learning_rate": 4.997018932252588e-05, + "loss": 5.6039, + "step": 2615 + }, + { + "epoch": 0.015558093063088781, + "grad_norm": 2.5773816108703613, + "learning_rate": 4.9970166514225955e-05, + "loss": 5.9935, + "step": 2616 + }, + { + "epoch": 0.015564040346369778, + "grad_norm": 2.308300733566284, + "learning_rate": 4.997014369720921e-05, + "loss": 5.8307, + "step": 2617 + }, + { + "epoch": 0.015569987629650776, + "grad_norm": 2.3276724815368652, + "learning_rate": 4.9970120871475634e-05, + "loss": 5.5819, + "step": 2618 + }, + { + "epoch": 0.015575934912931773, + "grad_norm": 2.7989203929901123, + "learning_rate": 4.997009803702526e-05, + "loss": 6.0816, + "step": 2619 + }, + { + "epoch": 0.01558188219621277, + "grad_norm": 2.5614469051361084, + "learning_rate": 4.997007519385807e-05, + "loss": 5.6677, + "step": 2620 + }, + { + "epoch": 0.015587829479493767, + "grad_norm": 2.4494402408599854, + "learning_rate": 4.9970052341974096e-05, + "loss": 5.7754, + "step": 2621 + }, + { + "epoch": 0.015593776762774764, + "grad_norm": 2.214578151702881, + "learning_rate": 4.997002948137333e-05, + "loss": 6.4244, + "step": 2622 + }, + { + "epoch": 0.015599724046055762, + "grad_norm": 2.8115196228027344, + "learning_rate": 4.9970006612055776e-05, + "loss": 5.9822, + "step": 2623 + }, + { + "epoch": 0.015605671329336759, + "grad_norm": 2.4020626544952393, + "learning_rate": 4.996998373402146e-05, + "loss": 6.0481, + "step": 2624 + }, + { + "epoch": 0.015611618612617756, + "grad_norm": 2.3936421871185303, + "learning_rate": 4.996996084727038e-05, + "loss": 6.0663, + "step": 2625 + }, + { + "epoch": 0.015617565895898753, + "grad_norm": 2.2710554599761963, + "learning_rate": 4.996993795180254e-05, + "loss": 6.0668, + "step": 2626 + }, + { + "epoch": 0.015623513179179751, + "grad_norm": 2.141789436340332, + "learning_rate": 4.9969915047617955e-05, + "loss": 6.2159, + "step": 2627 + }, + { + "epoch": 0.015629460462460748, + "grad_norm": 2.557889461517334, + "learning_rate": 4.9969892134716635e-05, + "loss": 6.262, + "step": 2628 + }, + { + "epoch": 0.015635407745741747, + "grad_norm": 2.3966641426086426, + "learning_rate": 4.9969869213098574e-05, + "loss": 6.0412, + "step": 2629 + }, + { + "epoch": 0.01564135502902274, + "grad_norm": 2.301426410675049, + "learning_rate": 4.99698462827638e-05, + "loss": 6.0798, + "step": 2630 + }, + { + "epoch": 0.01564730231230374, + "grad_norm": 2.4315614700317383, + "learning_rate": 4.996982334371231e-05, + "loss": 5.8736, + "step": 2631 + }, + { + "epoch": 0.015653249595584735, + "grad_norm": 2.5549440383911133, + "learning_rate": 4.9969800395944105e-05, + "loss": 5.7858, + "step": 2632 + }, + { + "epoch": 0.015659196878865734, + "grad_norm": 2.480375289916992, + "learning_rate": 4.99697774394592e-05, + "loss": 6.3261, + "step": 2633 + }, + { + "epoch": 0.015665144162146732, + "grad_norm": 2.42866849899292, + "learning_rate": 4.9969754474257614e-05, + "loss": 6.1729, + "step": 2634 + }, + { + "epoch": 0.015671091445427728, + "grad_norm": 2.32722544670105, + "learning_rate": 4.9969731500339335e-05, + "loss": 5.7746, + "step": 2635 + }, + { + "epoch": 0.015677038728708726, + "grad_norm": 2.6797266006469727, + "learning_rate": 4.996970851770438e-05, + "loss": 6.1657, + "step": 2636 + }, + { + "epoch": 0.015682986011989725, + "grad_norm": 2.87758731842041, + "learning_rate": 4.9969685526352775e-05, + "loss": 6.1475, + "step": 2637 + }, + { + "epoch": 0.01568893329527072, + "grad_norm": 2.898663282394409, + "learning_rate": 4.996966252628449e-05, + "loss": 6.2942, + "step": 2638 + }, + { + "epoch": 0.01569488057855172, + "grad_norm": 3.3087987899780273, + "learning_rate": 4.996963951749957e-05, + "loss": 5.9962, + "step": 2639 + }, + { + "epoch": 0.015700827861832713, + "grad_norm": 2.4418020248413086, + "learning_rate": 4.996961649999799e-05, + "loss": 6.1065, + "step": 2640 + }, + { + "epoch": 0.015706775145113712, + "grad_norm": 2.5839014053344727, + "learning_rate": 4.9969593473779786e-05, + "loss": 6.2303, + "step": 2641 + }, + { + "epoch": 0.01571272242839471, + "grad_norm": 2.683163642883301, + "learning_rate": 4.996957043884495e-05, + "loss": 5.7194, + "step": 2642 + }, + { + "epoch": 0.015718669711675706, + "grad_norm": 2.628574848175049, + "learning_rate": 4.99695473951935e-05, + "loss": 5.6239, + "step": 2643 + }, + { + "epoch": 0.015724616994956704, + "grad_norm": 3.0716800689697266, + "learning_rate": 4.9969524342825434e-05, + "loss": 6.1957, + "step": 2644 + }, + { + "epoch": 0.015730564278237703, + "grad_norm": 2.415626287460327, + "learning_rate": 4.996950128174077e-05, + "loss": 6.2953, + "step": 2645 + }, + { + "epoch": 0.015736511561518698, + "grad_norm": 2.6836612224578857, + "learning_rate": 4.996947821193951e-05, + "loss": 6.103, + "step": 2646 + }, + { + "epoch": 0.015742458844799696, + "grad_norm": 2.2673206329345703, + "learning_rate": 4.996945513342166e-05, + "loss": 6.2628, + "step": 2647 + }, + { + "epoch": 0.01574840612808069, + "grad_norm": 2.629955530166626, + "learning_rate": 4.996943204618724e-05, + "loss": 6.2444, + "step": 2648 + }, + { + "epoch": 0.01575435341136169, + "grad_norm": 2.6730127334594727, + "learning_rate": 4.996940895023623e-05, + "loss": 6.0595, + "step": 2649 + }, + { + "epoch": 0.01576030069464269, + "grad_norm": 2.607389450073242, + "learning_rate": 4.996938584556867e-05, + "loss": 6.0253, + "step": 2650 + }, + { + "epoch": 0.015766247977923684, + "grad_norm": 2.264345407485962, + "learning_rate": 4.996936273218456e-05, + "loss": 6.1011, + "step": 2651 + }, + { + "epoch": 0.015772195261204682, + "grad_norm": 2.218766450881958, + "learning_rate": 4.99693396100839e-05, + "loss": 6.0545, + "step": 2652 + }, + { + "epoch": 0.015778142544485677, + "grad_norm": 2.435213088989258, + "learning_rate": 4.99693164792667e-05, + "loss": 6.0679, + "step": 2653 + }, + { + "epoch": 0.015784089827766676, + "grad_norm": 2.2278120517730713, + "learning_rate": 4.996929333973297e-05, + "loss": 6.0864, + "step": 2654 + }, + { + "epoch": 0.015790037111047674, + "grad_norm": 1.983554482460022, + "learning_rate": 4.9969270191482715e-05, + "loss": 6.124, + "step": 2655 + }, + { + "epoch": 0.01579598439432867, + "grad_norm": 1.9382312297821045, + "learning_rate": 4.996924703451594e-05, + "loss": 6.392, + "step": 2656 + }, + { + "epoch": 0.015801931677609668, + "grad_norm": 2.8142831325531006, + "learning_rate": 4.9969223868832674e-05, + "loss": 6.017, + "step": 2657 + }, + { + "epoch": 0.015807878960890667, + "grad_norm": 2.3466787338256836, + "learning_rate": 4.9969200694432904e-05, + "loss": 5.9588, + "step": 2658 + }, + { + "epoch": 0.01581382624417166, + "grad_norm": 2.0172243118286133, + "learning_rate": 4.996917751131664e-05, + "loss": 5.9513, + "step": 2659 + }, + { + "epoch": 0.01581977352745266, + "grad_norm": 2.3778223991394043, + "learning_rate": 4.99691543194839e-05, + "loss": 6.2205, + "step": 2660 + }, + { + "epoch": 0.015825720810733655, + "grad_norm": 2.4351084232330322, + "learning_rate": 4.9969131118934675e-05, + "loss": 6.0916, + "step": 2661 + }, + { + "epoch": 0.015831668094014654, + "grad_norm": 2.22328519821167, + "learning_rate": 4.9969107909669e-05, + "loss": 6.5546, + "step": 2662 + }, + { + "epoch": 0.015837615377295652, + "grad_norm": 2.4626407623291016, + "learning_rate": 4.996908469168685e-05, + "loss": 6.522, + "step": 2663 + }, + { + "epoch": 0.015843562660576647, + "grad_norm": 2.1032283306121826, + "learning_rate": 4.9969061464988266e-05, + "loss": 6.3372, + "step": 2664 + }, + { + "epoch": 0.015849509943857646, + "grad_norm": 2.1436524391174316, + "learning_rate": 4.9969038229573236e-05, + "loss": 6.3792, + "step": 2665 + }, + { + "epoch": 0.015855457227138645, + "grad_norm": 2.42084002494812, + "learning_rate": 4.996901498544176e-05, + "loss": 6.701, + "step": 2666 + }, + { + "epoch": 0.01586140451041964, + "grad_norm": 2.854630947113037, + "learning_rate": 4.996899173259388e-05, + "loss": 6.3273, + "step": 2667 + }, + { + "epoch": 0.015867351793700638, + "grad_norm": 2.2480521202087402, + "learning_rate": 4.996896847102957e-05, + "loss": 6.4314, + "step": 2668 + }, + { + "epoch": 0.015873299076981633, + "grad_norm": 3.7074203491210938, + "learning_rate": 4.996894520074886e-05, + "loss": 5.9438, + "step": 2669 + }, + { + "epoch": 0.015879246360262632, + "grad_norm": 3.1037209033966064, + "learning_rate": 4.9968921921751735e-05, + "loss": 5.7915, + "step": 2670 + }, + { + "epoch": 0.01588519364354363, + "grad_norm": 2.8338170051574707, + "learning_rate": 4.996889863403823e-05, + "loss": 6.7765, + "step": 2671 + }, + { + "epoch": 0.015891140926824626, + "grad_norm": 2.6366934776306152, + "learning_rate": 4.996887533760833e-05, + "loss": 6.8019, + "step": 2672 + }, + { + "epoch": 0.015897088210105624, + "grad_norm": 2.3954126834869385, + "learning_rate": 4.996885203246207e-05, + "loss": 6.3946, + "step": 2673 + }, + { + "epoch": 0.015903035493386623, + "grad_norm": 2.5771238803863525, + "learning_rate": 4.996882871859943e-05, + "loss": 6.3767, + "step": 2674 + }, + { + "epoch": 0.015908982776667618, + "grad_norm": 3.8544304370880127, + "learning_rate": 4.9968805396020424e-05, + "loss": 7.0813, + "step": 2675 + }, + { + "epoch": 0.015914930059948616, + "grad_norm": 3.4221606254577637, + "learning_rate": 4.996878206472507e-05, + "loss": 6.4782, + "step": 2676 + }, + { + "epoch": 0.01592087734322961, + "grad_norm": 3.6425843238830566, + "learning_rate": 4.996875872471338e-05, + "loss": 5.8685, + "step": 2677 + }, + { + "epoch": 0.01592682462651061, + "grad_norm": 3.255345344543457, + "learning_rate": 4.996873537598535e-05, + "loss": 5.7099, + "step": 2678 + }, + { + "epoch": 0.01593277190979161, + "grad_norm": 2.5217175483703613, + "learning_rate": 4.9968712018540997e-05, + "loss": 5.8978, + "step": 2679 + }, + { + "epoch": 0.015938719193072604, + "grad_norm": 2.2415871620178223, + "learning_rate": 4.996868865238031e-05, + "loss": 6.8186, + "step": 2680 + }, + { + "epoch": 0.015944666476353602, + "grad_norm": 2.1412270069122314, + "learning_rate": 4.996866527750332e-05, + "loss": 6.8056, + "step": 2681 + }, + { + "epoch": 0.015950613759634597, + "grad_norm": 2.423093557357788, + "learning_rate": 4.996864189391004e-05, + "loss": 7.0769, + "step": 2682 + }, + { + "epoch": 0.015956561042915596, + "grad_norm": 2.2334039211273193, + "learning_rate": 4.9968618501600454e-05, + "loss": 6.9954, + "step": 2683 + }, + { + "epoch": 0.015962508326196594, + "grad_norm": 2.4311838150024414, + "learning_rate": 4.996859510057458e-05, + "loss": 6.8375, + "step": 2684 + }, + { + "epoch": 0.01596845560947759, + "grad_norm": 4.861137866973877, + "learning_rate": 4.996857169083242e-05, + "loss": 6.2628, + "step": 2685 + }, + { + "epoch": 0.015974402892758588, + "grad_norm": 3.064213991165161, + "learning_rate": 4.996854827237401e-05, + "loss": 6.4316, + "step": 2686 + }, + { + "epoch": 0.015980350176039586, + "grad_norm": 2.307011365890503, + "learning_rate": 4.996852484519932e-05, + "loss": 6.6212, + "step": 2687 + }, + { + "epoch": 0.01598629745932058, + "grad_norm": 2.5157034397125244, + "learning_rate": 4.9968501409308374e-05, + "loss": 7.153, + "step": 2688 + }, + { + "epoch": 0.01599224474260158, + "grad_norm": 2.4122424125671387, + "learning_rate": 4.996847796470119e-05, + "loss": 7.2244, + "step": 2689 + }, + { + "epoch": 0.015998192025882575, + "grad_norm": 2.305055618286133, + "learning_rate": 4.9968454511377773e-05, + "loss": 7.4751, + "step": 2690 + }, + { + "epoch": 0.016004139309163574, + "grad_norm": 3.068027973175049, + "learning_rate": 4.9968431049338116e-05, + "loss": 6.5709, + "step": 2691 + }, + { + "epoch": 0.016010086592444572, + "grad_norm": 2.09893798828125, + "learning_rate": 4.9968407578582246e-05, + "loss": 6.7212, + "step": 2692 + }, + { + "epoch": 0.016016033875725567, + "grad_norm": 2.3161933422088623, + "learning_rate": 4.9968384099110163e-05, + "loss": 6.6243, + "step": 2693 + }, + { + "epoch": 0.016021981159006566, + "grad_norm": 2.913304090499878, + "learning_rate": 4.9968360610921874e-05, + "loss": 6.1946, + "step": 2694 + }, + { + "epoch": 0.016027928442287565, + "grad_norm": 2.746368408203125, + "learning_rate": 4.9968337114017386e-05, + "loss": 6.3783, + "step": 2695 + }, + { + "epoch": 0.01603387572556856, + "grad_norm": 2.40331768989563, + "learning_rate": 4.9968313608396705e-05, + "loss": 6.9898, + "step": 2696 + }, + { + "epoch": 0.016039823008849558, + "grad_norm": 2.214869976043701, + "learning_rate": 4.9968290094059844e-05, + "loss": 6.4497, + "step": 2697 + }, + { + "epoch": 0.016045770292130553, + "grad_norm": 2.050436019897461, + "learning_rate": 4.996826657100682e-05, + "loss": 6.8897, + "step": 2698 + }, + { + "epoch": 0.016051717575411552, + "grad_norm": 2.294149398803711, + "learning_rate": 4.996824303923763e-05, + "loss": 6.5583, + "step": 2699 + }, + { + "epoch": 0.01605766485869255, + "grad_norm": 2.26918625831604, + "learning_rate": 4.996821949875228e-05, + "loss": 6.7411, + "step": 2700 + }, + { + "epoch": 0.016063612141973545, + "grad_norm": 2.1330158710479736, + "learning_rate": 4.9968195949550775e-05, + "loss": 6.8068, + "step": 2701 + }, + { + "epoch": 0.016069559425254544, + "grad_norm": 1.8605769872665405, + "learning_rate": 4.996817239163315e-05, + "loss": 6.4833, + "step": 2702 + }, + { + "epoch": 0.016075506708535543, + "grad_norm": 3.132803440093994, + "learning_rate": 4.996814882499938e-05, + "loss": 5.8281, + "step": 2703 + }, + { + "epoch": 0.016081453991816538, + "grad_norm": 3.1079390048980713, + "learning_rate": 4.996812524964949e-05, + "loss": 5.6894, + "step": 2704 + }, + { + "epoch": 0.016087401275097536, + "grad_norm": 2.2877023220062256, + "learning_rate": 4.996810166558349e-05, + "loss": 7.0128, + "step": 2705 + }, + { + "epoch": 0.01609334855837853, + "grad_norm": 2.415696859359741, + "learning_rate": 4.996807807280138e-05, + "loss": 6.8098, + "step": 2706 + }, + { + "epoch": 0.01609929584165953, + "grad_norm": 2.342111110687256, + "learning_rate": 4.996805447130317e-05, + "loss": 7.2452, + "step": 2707 + }, + { + "epoch": 0.01610524312494053, + "grad_norm": 2.6504852771759033, + "learning_rate": 4.996803086108887e-05, + "loss": 6.6731, + "step": 2708 + }, + { + "epoch": 0.016111190408221523, + "grad_norm": 2.6157166957855225, + "learning_rate": 4.996800724215849e-05, + "loss": 6.9377, + "step": 2709 + }, + { + "epoch": 0.016117137691502522, + "grad_norm": 2.6289443969726562, + "learning_rate": 4.9967983614512036e-05, + "loss": 6.639, + "step": 2710 + }, + { + "epoch": 0.01612308497478352, + "grad_norm": 2.966489791870117, + "learning_rate": 4.996795997814952e-05, + "loss": 6.3681, + "step": 2711 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 3.7333364486694336, + "learning_rate": 4.9967936333070944e-05, + "loss": 5.6015, + "step": 2712 + }, + { + "epoch": 0.016134979541345514, + "grad_norm": 2.942728281021118, + "learning_rate": 4.9967912679276316e-05, + "loss": 5.6548, + "step": 2713 + }, + { + "epoch": 0.01614092682462651, + "grad_norm": 2.394622802734375, + "learning_rate": 4.996788901676566e-05, + "loss": 6.5119, + "step": 2714 + }, + { + "epoch": 0.016146874107907508, + "grad_norm": 2.8388447761535645, + "learning_rate": 4.9967865345538963e-05, + "loss": 6.4424, + "step": 2715 + }, + { + "epoch": 0.016152821391188506, + "grad_norm": 2.7682905197143555, + "learning_rate": 4.9967841665596245e-05, + "loss": 6.4688, + "step": 2716 + }, + { + "epoch": 0.0161587686744695, + "grad_norm": 3.0281460285186768, + "learning_rate": 4.996781797693751e-05, + "loss": 6.52, + "step": 2717 + }, + { + "epoch": 0.0161647159577505, + "grad_norm": 2.9734318256378174, + "learning_rate": 4.996779427956276e-05, + "loss": 6.4307, + "step": 2718 + }, + { + "epoch": 0.016170663241031495, + "grad_norm": 2.7653586864471436, + "learning_rate": 4.996777057347202e-05, + "loss": 6.1783, + "step": 2719 + }, + { + "epoch": 0.016176610524312494, + "grad_norm": 2.9418516159057617, + "learning_rate": 4.996774685866529e-05, + "loss": 6.5466, + "step": 2720 + }, + { + "epoch": 0.016182557807593492, + "grad_norm": 2.789217233657837, + "learning_rate": 4.996772313514258e-05, + "loss": 6.9296, + "step": 2721 + }, + { + "epoch": 0.016188505090874487, + "grad_norm": 2.8092539310455322, + "learning_rate": 4.996769940290389e-05, + "loss": 6.6186, + "step": 2722 + }, + { + "epoch": 0.016194452374155486, + "grad_norm": 2.696572780609131, + "learning_rate": 4.996767566194923e-05, + "loss": 6.5361, + "step": 2723 + }, + { + "epoch": 0.016200399657436484, + "grad_norm": 2.5987300872802734, + "learning_rate": 4.996765191227862e-05, + "loss": 6.4029, + "step": 2724 + }, + { + "epoch": 0.01620634694071748, + "grad_norm": 2.083057165145874, + "learning_rate": 4.996762815389205e-05, + "loss": 6.4747, + "step": 2725 + }, + { + "epoch": 0.016212294223998478, + "grad_norm": 2.912338972091675, + "learning_rate": 4.9967604386789555e-05, + "loss": 6.8869, + "step": 2726 + }, + { + "epoch": 0.016218241507279473, + "grad_norm": 2.642224073410034, + "learning_rate": 4.9967580610971124e-05, + "loss": 6.6701, + "step": 2727 + }, + { + "epoch": 0.016224188790560472, + "grad_norm": 2.673652410507202, + "learning_rate": 4.996755682643676e-05, + "loss": 6.8624, + "step": 2728 + }, + { + "epoch": 0.01623013607384147, + "grad_norm": 2.5223872661590576, + "learning_rate": 4.996753303318648e-05, + "loss": 6.8247, + "step": 2729 + }, + { + "epoch": 0.016236083357122465, + "grad_norm": 2.252037525177002, + "learning_rate": 4.99675092312203e-05, + "loss": 6.7924, + "step": 2730 + }, + { + "epoch": 0.016242030640403464, + "grad_norm": 2.2854461669921875, + "learning_rate": 4.9967485420538216e-05, + "loss": 6.4761, + "step": 2731 + }, + { + "epoch": 0.016247977923684463, + "grad_norm": 2.426912546157837, + "learning_rate": 4.9967461601140244e-05, + "loss": 6.6028, + "step": 2732 + }, + { + "epoch": 0.016253925206965458, + "grad_norm": 2.7375681400299072, + "learning_rate": 4.9967437773026384e-05, + "loss": 6.5283, + "step": 2733 + }, + { + "epoch": 0.016259872490246456, + "grad_norm": 2.7669689655303955, + "learning_rate": 4.996741393619665e-05, + "loss": 6.4382, + "step": 2734 + }, + { + "epoch": 0.01626581977352745, + "grad_norm": 2.294597864151001, + "learning_rate": 4.996739009065105e-05, + "loss": 6.7479, + "step": 2735 + }, + { + "epoch": 0.01627176705680845, + "grad_norm": 2.4791014194488525, + "learning_rate": 4.996736623638959e-05, + "loss": 6.7043, + "step": 2736 + }, + { + "epoch": 0.01627771434008945, + "grad_norm": 2.4080021381378174, + "learning_rate": 4.9967342373412286e-05, + "loss": 6.6046, + "step": 2737 + }, + { + "epoch": 0.016283661623370443, + "grad_norm": 2.463109254837036, + "learning_rate": 4.996731850171914e-05, + "loss": 6.3895, + "step": 2738 + }, + { + "epoch": 0.016289608906651442, + "grad_norm": 2.665908098220825, + "learning_rate": 4.9967294621310155e-05, + "loss": 6.6482, + "step": 2739 + }, + { + "epoch": 0.01629555618993244, + "grad_norm": 2.399526357650757, + "learning_rate": 4.996727073218536e-05, + "loss": 6.7098, + "step": 2740 + }, + { + "epoch": 0.016301503473213436, + "grad_norm": 2.678091287612915, + "learning_rate": 4.996724683434473e-05, + "loss": 6.419, + "step": 2741 + }, + { + "epoch": 0.016307450756494434, + "grad_norm": 2.5573642253875732, + "learning_rate": 4.99672229277883e-05, + "loss": 6.4703, + "step": 2742 + }, + { + "epoch": 0.01631339803977543, + "grad_norm": 2.644097089767456, + "learning_rate": 4.996719901251607e-05, + "loss": 5.9854, + "step": 2743 + }, + { + "epoch": 0.016319345323056428, + "grad_norm": 2.6165592670440674, + "learning_rate": 4.996717508852805e-05, + "loss": 6.1776, + "step": 2744 + }, + { + "epoch": 0.016325292606337426, + "grad_norm": 2.175647020339966, + "learning_rate": 4.996715115582426e-05, + "loss": 6.5533, + "step": 2745 + }, + { + "epoch": 0.01633123988961842, + "grad_norm": 2.112217664718628, + "learning_rate": 4.996712721440467e-05, + "loss": 6.5572, + "step": 2746 + }, + { + "epoch": 0.01633718717289942, + "grad_norm": 2.165111541748047, + "learning_rate": 4.996710326426933e-05, + "loss": 6.2798, + "step": 2747 + }, + { + "epoch": 0.016343134456180415, + "grad_norm": 2.5812315940856934, + "learning_rate": 4.996707930541823e-05, + "loss": 6.0831, + "step": 2748 + }, + { + "epoch": 0.016349081739461414, + "grad_norm": 2.2306227684020996, + "learning_rate": 4.996705533785138e-05, + "loss": 6.5833, + "step": 2749 + }, + { + "epoch": 0.016355029022742412, + "grad_norm": 1.999974250793457, + "learning_rate": 4.996703136156878e-05, + "loss": 6.2461, + "step": 2750 + }, + { + "epoch": 0.016360976306023407, + "grad_norm": 2.0521416664123535, + "learning_rate": 4.996700737657046e-05, + "loss": 6.4606, + "step": 2751 + }, + { + "epoch": 0.016366923589304406, + "grad_norm": 1.8630053997039795, + "learning_rate": 4.996698338285642e-05, + "loss": 6.1375, + "step": 2752 + }, + { + "epoch": 0.016372870872585404, + "grad_norm": 1.7525913715362549, + "learning_rate": 4.9966959380426646e-05, + "loss": 6.1769, + "step": 2753 + }, + { + "epoch": 0.0163788181558664, + "grad_norm": 2.8151230812072754, + "learning_rate": 4.996693536928118e-05, + "loss": 5.9066, + "step": 2754 + }, + { + "epoch": 0.016384765439147398, + "grad_norm": 2.503230571746826, + "learning_rate": 4.9966911349420004e-05, + "loss": 6.3725, + "step": 2755 + }, + { + "epoch": 0.016390712722428393, + "grad_norm": 2.676284074783325, + "learning_rate": 4.996688732084314e-05, + "loss": 6.9086, + "step": 2756 + }, + { + "epoch": 0.01639666000570939, + "grad_norm": 2.3367252349853516, + "learning_rate": 4.99668632835506e-05, + "loss": 6.1323, + "step": 2757 + }, + { + "epoch": 0.01640260728899039, + "grad_norm": 3.3071084022521973, + "learning_rate": 4.996683923754237e-05, + "loss": 6.162, + "step": 2758 + }, + { + "epoch": 0.016408554572271385, + "grad_norm": 2.64388370513916, + "learning_rate": 4.9966815182818494e-05, + "loss": 6.171, + "step": 2759 + }, + { + "epoch": 0.016414501855552384, + "grad_norm": 2.2378199100494385, + "learning_rate": 4.996679111937895e-05, + "loss": 6.4466, + "step": 2760 + }, + { + "epoch": 0.016420449138833382, + "grad_norm": 2.5944395065307617, + "learning_rate": 4.996676704722376e-05, + "loss": 6.7034, + "step": 2761 + }, + { + "epoch": 0.016426396422114378, + "grad_norm": 2.768211841583252, + "learning_rate": 4.996674296635293e-05, + "loss": 6.7551, + "step": 2762 + }, + { + "epoch": 0.016432343705395376, + "grad_norm": 2.80188250541687, + "learning_rate": 4.9966718876766467e-05, + "loss": 6.8437, + "step": 2763 + }, + { + "epoch": 0.01643829098867637, + "grad_norm": 2.2422847747802734, + "learning_rate": 4.996669477846438e-05, + "loss": 6.5365, + "step": 2764 + }, + { + "epoch": 0.01644423827195737, + "grad_norm": 2.526724100112915, + "learning_rate": 4.996667067144668e-05, + "loss": 6.3735, + "step": 2765 + }, + { + "epoch": 0.01645018555523837, + "grad_norm": 3.2267372608184814, + "learning_rate": 4.996664655571337e-05, + "loss": 6.0508, + "step": 2766 + }, + { + "epoch": 0.016456132838519363, + "grad_norm": 3.393270969390869, + "learning_rate": 4.996662243126446e-05, + "loss": 6.5543, + "step": 2767 + }, + { + "epoch": 0.016462080121800362, + "grad_norm": 2.7712342739105225, + "learning_rate": 4.996659829809996e-05, + "loss": 6.5891, + "step": 2768 + }, + { + "epoch": 0.01646802740508136, + "grad_norm": 2.5687179565429688, + "learning_rate": 4.996657415621988e-05, + "loss": 6.464, + "step": 2769 + }, + { + "epoch": 0.016473974688362356, + "grad_norm": 3.059953451156616, + "learning_rate": 4.996655000562424e-05, + "loss": 6.4286, + "step": 2770 + }, + { + "epoch": 0.016479921971643354, + "grad_norm": 3.3729803562164307, + "learning_rate": 4.9966525846313015e-05, + "loss": 6.5937, + "step": 2771 + }, + { + "epoch": 0.01648586925492435, + "grad_norm": 2.907397985458374, + "learning_rate": 4.996650167828624e-05, + "loss": 6.2559, + "step": 2772 + }, + { + "epoch": 0.016491816538205348, + "grad_norm": 3.5011706352233887, + "learning_rate": 4.996647750154392e-05, + "loss": 5.7897, + "step": 2773 + }, + { + "epoch": 0.016497763821486346, + "grad_norm": 2.5495986938476562, + "learning_rate": 4.996645331608607e-05, + "loss": 6.688, + "step": 2774 + }, + { + "epoch": 0.01650371110476734, + "grad_norm": 2.486416816711426, + "learning_rate": 4.9966429121912675e-05, + "loss": 6.8169, + "step": 2775 + }, + { + "epoch": 0.01650965838804834, + "grad_norm": 2.272162437438965, + "learning_rate": 4.9966404919023755e-05, + "loss": 6.696, + "step": 2776 + }, + { + "epoch": 0.016515605671329335, + "grad_norm": 2.9408323764801025, + "learning_rate": 4.9966380707419334e-05, + "loss": 6.1711, + "step": 2777 + }, + { + "epoch": 0.016521552954610334, + "grad_norm": 3.361907958984375, + "learning_rate": 4.99663564870994e-05, + "loss": 5.6029, + "step": 2778 + }, + { + "epoch": 0.016527500237891332, + "grad_norm": 3.06835675239563, + "learning_rate": 4.996633225806397e-05, + "loss": 5.332, + "step": 2779 + }, + { + "epoch": 0.016533447521172327, + "grad_norm": 3.058638572692871, + "learning_rate": 4.9966308020313054e-05, + "loss": 6.3345, + "step": 2780 + }, + { + "epoch": 0.016539394804453326, + "grad_norm": 2.8265507221221924, + "learning_rate": 4.9966283773846654e-05, + "loss": 5.4231, + "step": 2781 + }, + { + "epoch": 0.016545342087734324, + "grad_norm": 3.128094434738159, + "learning_rate": 4.996625951866478e-05, + "loss": 5.4144, + "step": 2782 + }, + { + "epoch": 0.01655128937101532, + "grad_norm": 2.6830554008483887, + "learning_rate": 4.9966235254767445e-05, + "loss": 6.0084, + "step": 2783 + }, + { + "epoch": 0.016557236654296318, + "grad_norm": 2.7146122455596924, + "learning_rate": 4.996621098215466e-05, + "loss": 6.7104, + "step": 2784 + }, + { + "epoch": 0.016563183937577313, + "grad_norm": 3.518169403076172, + "learning_rate": 4.9966186700826425e-05, + "loss": 5.4509, + "step": 2785 + }, + { + "epoch": 0.01656913122085831, + "grad_norm": 2.7607035636901855, + "learning_rate": 4.9966162410782755e-05, + "loss": 6.2149, + "step": 2786 + }, + { + "epoch": 0.01657507850413931, + "grad_norm": 2.897862195968628, + "learning_rate": 4.996613811202365e-05, + "loss": 6.4713, + "step": 2787 + }, + { + "epoch": 0.016581025787420305, + "grad_norm": 2.6984574794769287, + "learning_rate": 4.9966113804549134e-05, + "loss": 6.2298, + "step": 2788 + }, + { + "epoch": 0.016586973070701304, + "grad_norm": 2.7281908988952637, + "learning_rate": 4.996608948835919e-05, + "loss": 6.0244, + "step": 2789 + }, + { + "epoch": 0.016592920353982302, + "grad_norm": 2.314769983291626, + "learning_rate": 4.996606516345386e-05, + "loss": 6.8523, + "step": 2790 + }, + { + "epoch": 0.016598867637263297, + "grad_norm": 2.887943744659424, + "learning_rate": 4.9966040829833115e-05, + "loss": 6.8407, + "step": 2791 + }, + { + "epoch": 0.016604814920544296, + "grad_norm": 3.4924309253692627, + "learning_rate": 4.9966016487497e-05, + "loss": 6.3646, + "step": 2792 + }, + { + "epoch": 0.01661076220382529, + "grad_norm": 2.3095340728759766, + "learning_rate": 4.9965992136445495e-05, + "loss": 6.407, + "step": 2793 + }, + { + "epoch": 0.01661670948710629, + "grad_norm": 3.771980047225952, + "learning_rate": 4.9965967776678627e-05, + "loss": 6.0596, + "step": 2794 + }, + { + "epoch": 0.016622656770387288, + "grad_norm": 3.452252149581909, + "learning_rate": 4.99659434081964e-05, + "loss": 6.1351, + "step": 2795 + }, + { + "epoch": 0.016628604053668283, + "grad_norm": 2.4391021728515625, + "learning_rate": 4.996591903099881e-05, + "loss": 6.3304, + "step": 2796 + }, + { + "epoch": 0.016634551336949282, + "grad_norm": 2.7057220935821533, + "learning_rate": 4.9965894645085885e-05, + "loss": 6.8328, + "step": 2797 + }, + { + "epoch": 0.01664049862023028, + "grad_norm": 2.392627716064453, + "learning_rate": 4.996587025045762e-05, + "loss": 6.8491, + "step": 2798 + }, + { + "epoch": 0.016646445903511276, + "grad_norm": 2.47928786277771, + "learning_rate": 4.9965845847114024e-05, + "loss": 6.6323, + "step": 2799 + }, + { + "epoch": 0.016652393186792274, + "grad_norm": 2.438870668411255, + "learning_rate": 4.9965821435055115e-05, + "loss": 6.3832, + "step": 2800 + }, + { + "epoch": 0.01665834047007327, + "grad_norm": 2.6875247955322266, + "learning_rate": 4.9965797014280895e-05, + "loss": 6.6994, + "step": 2801 + }, + { + "epoch": 0.016664287753354268, + "grad_norm": 2.71785044670105, + "learning_rate": 4.996577258479137e-05, + "loss": 6.2505, + "step": 2802 + }, + { + "epoch": 0.016670235036635266, + "grad_norm": 2.32853102684021, + "learning_rate": 4.996574814658655e-05, + "loss": 6.4409, + "step": 2803 + }, + { + "epoch": 0.01667618231991626, + "grad_norm": 2.271027088165283, + "learning_rate": 4.996572369966646e-05, + "loss": 6.4928, + "step": 2804 + }, + { + "epoch": 0.01668212960319726, + "grad_norm": 2.621448278427124, + "learning_rate": 4.996569924403108e-05, + "loss": 6.7248, + "step": 2805 + }, + { + "epoch": 0.01668807688647826, + "grad_norm": 3.621654748916626, + "learning_rate": 4.9965674779680435e-05, + "loss": 6.7268, + "step": 2806 + }, + { + "epoch": 0.016694024169759254, + "grad_norm": 2.2045094966888428, + "learning_rate": 4.9965650306614534e-05, + "loss": 6.6406, + "step": 2807 + }, + { + "epoch": 0.016699971453040252, + "grad_norm": 2.4885873794555664, + "learning_rate": 4.9965625824833376e-05, + "loss": 6.611, + "step": 2808 + }, + { + "epoch": 0.016705918736321247, + "grad_norm": 2.796971082687378, + "learning_rate": 4.996560133433697e-05, + "loss": 6.455, + "step": 2809 + }, + { + "epoch": 0.016711866019602246, + "grad_norm": 2.539395570755005, + "learning_rate": 4.996557683512535e-05, + "loss": 6.8169, + "step": 2810 + }, + { + "epoch": 0.016717813302883244, + "grad_norm": 2.322824239730835, + "learning_rate": 4.99655523271985e-05, + "loss": 6.3217, + "step": 2811 + }, + { + "epoch": 0.01672376058616424, + "grad_norm": 2.4404520988464355, + "learning_rate": 4.9965527810556424e-05, + "loss": 6.5026, + "step": 2812 + }, + { + "epoch": 0.016729707869445238, + "grad_norm": 2.287362575531006, + "learning_rate": 4.996550328519915e-05, + "loss": 6.9183, + "step": 2813 + }, + { + "epoch": 0.016735655152726233, + "grad_norm": 2.369877815246582, + "learning_rate": 4.996547875112667e-05, + "loss": 6.7488, + "step": 2814 + }, + { + "epoch": 0.01674160243600723, + "grad_norm": 2.323082685470581, + "learning_rate": 4.996545420833899e-05, + "loss": 6.6177, + "step": 2815 + }, + { + "epoch": 0.01674754971928823, + "grad_norm": 2.221214532852173, + "learning_rate": 4.9965429656836145e-05, + "loss": 6.6844, + "step": 2816 + }, + { + "epoch": 0.016753497002569225, + "grad_norm": 2.246819496154785, + "learning_rate": 4.9965405096618116e-05, + "loss": 6.5631, + "step": 2817 + }, + { + "epoch": 0.016759444285850224, + "grad_norm": 2.411806583404541, + "learning_rate": 4.996538052768493e-05, + "loss": 6.4037, + "step": 2818 + }, + { + "epoch": 0.016765391569131222, + "grad_norm": 1.941197395324707, + "learning_rate": 4.996535595003658e-05, + "loss": 6.5232, + "step": 2819 + }, + { + "epoch": 0.016771338852412217, + "grad_norm": 2.149991750717163, + "learning_rate": 4.996533136367309e-05, + "loss": 6.4166, + "step": 2820 + }, + { + "epoch": 0.016777286135693216, + "grad_norm": 2.5388433933258057, + "learning_rate": 4.9965306768594454e-05, + "loss": 6.5733, + "step": 2821 + }, + { + "epoch": 0.01678323341897421, + "grad_norm": 2.1857333183288574, + "learning_rate": 4.9965282164800694e-05, + "loss": 6.5558, + "step": 2822 + }, + { + "epoch": 0.01678918070225521, + "grad_norm": 2.1090164184570312, + "learning_rate": 4.9965257552291804e-05, + "loss": 6.6916, + "step": 2823 + }, + { + "epoch": 0.016795127985536208, + "grad_norm": 2.1102349758148193, + "learning_rate": 4.9965232931067806e-05, + "loss": 6.5852, + "step": 2824 + }, + { + "epoch": 0.016801075268817203, + "grad_norm": 2.384660005569458, + "learning_rate": 4.99652083011287e-05, + "loss": 6.5033, + "step": 2825 + }, + { + "epoch": 0.016807022552098202, + "grad_norm": 2.314896821975708, + "learning_rate": 4.9965183662474504e-05, + "loss": 6.4108, + "step": 2826 + }, + { + "epoch": 0.0168129698353792, + "grad_norm": 2.4358227252960205, + "learning_rate": 4.9965159015105215e-05, + "loss": 6.5309, + "step": 2827 + }, + { + "epoch": 0.016818917118660195, + "grad_norm": 2.179905652999878, + "learning_rate": 4.9965134359020844e-05, + "loss": 6.4593, + "step": 2828 + }, + { + "epoch": 0.016824864401941194, + "grad_norm": 2.2742464542388916, + "learning_rate": 4.99651096942214e-05, + "loss": 6.6654, + "step": 2829 + }, + { + "epoch": 0.01683081168522219, + "grad_norm": 2.211026668548584, + "learning_rate": 4.9965085020706906e-05, + "loss": 6.4527, + "step": 2830 + }, + { + "epoch": 0.016836758968503188, + "grad_norm": 2.552072763442993, + "learning_rate": 4.996506033847735e-05, + "loss": 6.5338, + "step": 2831 + }, + { + "epoch": 0.016842706251784186, + "grad_norm": 2.3208038806915283, + "learning_rate": 4.996503564753276e-05, + "loss": 6.473, + "step": 2832 + }, + { + "epoch": 0.01684865353506518, + "grad_norm": 2.3756048679351807, + "learning_rate": 4.996501094787312e-05, + "loss": 6.4223, + "step": 2833 + }, + { + "epoch": 0.01685460081834618, + "grad_norm": 2.386152982711792, + "learning_rate": 4.996498623949846e-05, + "loss": 6.317, + "step": 2834 + }, + { + "epoch": 0.01686054810162718, + "grad_norm": 2.144510507583618, + "learning_rate": 4.996496152240878e-05, + "loss": 6.4039, + "step": 2835 + }, + { + "epoch": 0.016866495384908173, + "grad_norm": 2.3362607955932617, + "learning_rate": 4.996493679660409e-05, + "loss": 6.5411, + "step": 2836 + }, + { + "epoch": 0.016872442668189172, + "grad_norm": 2.156428337097168, + "learning_rate": 4.9964912062084404e-05, + "loss": 6.3399, + "step": 2837 + }, + { + "epoch": 0.016878389951470167, + "grad_norm": 2.3429903984069824, + "learning_rate": 4.9964887318849715e-05, + "loss": 6.5159, + "step": 2838 + }, + { + "epoch": 0.016884337234751166, + "grad_norm": 2.1888442039489746, + "learning_rate": 4.9964862566900045e-05, + "loss": 6.3906, + "step": 2839 + }, + { + "epoch": 0.016890284518032164, + "grad_norm": 2.3973047733306885, + "learning_rate": 4.9964837806235396e-05, + "loss": 6.3452, + "step": 2840 + }, + { + "epoch": 0.01689623180131316, + "grad_norm": 2.232057809829712, + "learning_rate": 4.996481303685578e-05, + "loss": 6.5203, + "step": 2841 + }, + { + "epoch": 0.016902179084594158, + "grad_norm": 2.672342300415039, + "learning_rate": 4.996478825876122e-05, + "loss": 6.8615, + "step": 2842 + }, + { + "epoch": 0.016908126367875153, + "grad_norm": 2.603943347930908, + "learning_rate": 4.996476347195171e-05, + "loss": 7.1632, + "step": 2843 + }, + { + "epoch": 0.01691407365115615, + "grad_norm": 2.684616804122925, + "learning_rate": 4.9964738676427234e-05, + "loss": 6.5546, + "step": 2844 + }, + { + "epoch": 0.01692002093443715, + "grad_norm": 2.1103904247283936, + "learning_rate": 4.996471387218785e-05, + "loss": 6.4666, + "step": 2845 + }, + { + "epoch": 0.016925968217718145, + "grad_norm": 2.8278937339782715, + "learning_rate": 4.9964689059233525e-05, + "loss": 6.3685, + "step": 2846 + }, + { + "epoch": 0.016931915500999144, + "grad_norm": 3.2611489295959473, + "learning_rate": 4.9964664237564296e-05, + "loss": 6.5537, + "step": 2847 + }, + { + "epoch": 0.016937862784280142, + "grad_norm": 3.029353141784668, + "learning_rate": 4.9964639407180155e-05, + "loss": 6.6097, + "step": 2848 + }, + { + "epoch": 0.016943810067561137, + "grad_norm": 2.6735312938690186, + "learning_rate": 4.996461456808112e-05, + "loss": 6.5854, + "step": 2849 + }, + { + "epoch": 0.016949757350842136, + "grad_norm": 2.7619409561157227, + "learning_rate": 4.99645897202672e-05, + "loss": 6.5944, + "step": 2850 + }, + { + "epoch": 0.01695570463412313, + "grad_norm": 3.0398738384246826, + "learning_rate": 4.9964564863738396e-05, + "loss": 6.3804, + "step": 2851 + }, + { + "epoch": 0.01696165191740413, + "grad_norm": 3.5388784408569336, + "learning_rate": 4.996453999849472e-05, + "loss": 7.0993, + "step": 2852 + }, + { + "epoch": 0.016967599200685128, + "grad_norm": 2.3602113723754883, + "learning_rate": 4.9964515124536185e-05, + "loss": 6.4981, + "step": 2853 + }, + { + "epoch": 0.016973546483966123, + "grad_norm": 2.346632957458496, + "learning_rate": 4.996449024186278e-05, + "loss": 6.4892, + "step": 2854 + }, + { + "epoch": 0.016979493767247122, + "grad_norm": 2.9653544425964355, + "learning_rate": 4.996446535047454e-05, + "loss": 6.2772, + "step": 2855 + }, + { + "epoch": 0.01698544105052812, + "grad_norm": 3.1064538955688477, + "learning_rate": 4.996444045037147e-05, + "loss": 6.238, + "step": 2856 + }, + { + "epoch": 0.016991388333809115, + "grad_norm": 2.9617815017700195, + "learning_rate": 4.9964415541553564e-05, + "loss": 6.2991, + "step": 2857 + }, + { + "epoch": 0.016997335617090114, + "grad_norm": 2.5993905067443848, + "learning_rate": 4.996439062402084e-05, + "loss": 6.5482, + "step": 2858 + }, + { + "epoch": 0.01700328290037111, + "grad_norm": 2.5469226837158203, + "learning_rate": 4.996436569777331e-05, + "loss": 6.437, + "step": 2859 + }, + { + "epoch": 0.017009230183652108, + "grad_norm": 2.709184408187866, + "learning_rate": 4.9964340762810965e-05, + "loss": 6.1362, + "step": 2860 + }, + { + "epoch": 0.017015177466933106, + "grad_norm": 2.843942880630493, + "learning_rate": 4.9964315819133837e-05, + "loss": 6.2443, + "step": 2861 + }, + { + "epoch": 0.0170211247502141, + "grad_norm": 3.022735357284546, + "learning_rate": 4.9964290866741925e-05, + "loss": 6.3161, + "step": 2862 + }, + { + "epoch": 0.0170270720334951, + "grad_norm": 2.487271308898926, + "learning_rate": 4.996426590563523e-05, + "loss": 6.3352, + "step": 2863 + }, + { + "epoch": 0.0170330193167761, + "grad_norm": 2.624000072479248, + "learning_rate": 4.996424093581377e-05, + "loss": 6.3575, + "step": 2864 + }, + { + "epoch": 0.017038966600057093, + "grad_norm": 2.378368854522705, + "learning_rate": 4.996421595727756e-05, + "loss": 6.3284, + "step": 2865 + }, + { + "epoch": 0.017044913883338092, + "grad_norm": 2.6903984546661377, + "learning_rate": 4.996419097002659e-05, + "loss": 6.271, + "step": 2866 + }, + { + "epoch": 0.017050861166619087, + "grad_norm": 2.536391019821167, + "learning_rate": 4.9964165974060875e-05, + "loss": 6.1276, + "step": 2867 + }, + { + "epoch": 0.017056808449900086, + "grad_norm": 2.470395803451538, + "learning_rate": 4.9964140969380434e-05, + "loss": 6.1032, + "step": 2868 + }, + { + "epoch": 0.017062755733181084, + "grad_norm": 2.929818630218506, + "learning_rate": 4.996411595598528e-05, + "loss": 6.0994, + "step": 2869 + }, + { + "epoch": 0.01706870301646208, + "grad_norm": 2.548701763153076, + "learning_rate": 4.99640909338754e-05, + "loss": 6.2227, + "step": 2870 + }, + { + "epoch": 0.017074650299743078, + "grad_norm": 2.6044397354125977, + "learning_rate": 4.99640659030508e-05, + "loss": 6.0778, + "step": 2871 + }, + { + "epoch": 0.017080597583024073, + "grad_norm": 2.687392473220825, + "learning_rate": 4.996404086351153e-05, + "loss": 6.2975, + "step": 2872 + }, + { + "epoch": 0.01708654486630507, + "grad_norm": 2.740201711654663, + "learning_rate": 4.9964015815257556e-05, + "loss": 6.5955, + "step": 2873 + }, + { + "epoch": 0.01709249214958607, + "grad_norm": 2.605958938598633, + "learning_rate": 4.99639907582889e-05, + "loss": 6.2112, + "step": 2874 + }, + { + "epoch": 0.017098439432867065, + "grad_norm": 2.9691529273986816, + "learning_rate": 4.996396569260558e-05, + "loss": 6.1435, + "step": 2875 + }, + { + "epoch": 0.017104386716148064, + "grad_norm": 2.822201728820801, + "learning_rate": 4.9963940618207593e-05, + "loss": 6.1949, + "step": 2876 + }, + { + "epoch": 0.017110333999429062, + "grad_norm": 2.6231529712677, + "learning_rate": 4.996391553509495e-05, + "loss": 6.5082, + "step": 2877 + }, + { + "epoch": 0.017116281282710057, + "grad_norm": 2.6511785984039307, + "learning_rate": 4.9963890443267666e-05, + "loss": 6.4461, + "step": 2878 + }, + { + "epoch": 0.017122228565991056, + "grad_norm": 2.4790167808532715, + "learning_rate": 4.996386534272575e-05, + "loss": 6.4642, + "step": 2879 + }, + { + "epoch": 0.01712817584927205, + "grad_norm": 3.6982533931732178, + "learning_rate": 4.99638402334692e-05, + "loss": 6.2957, + "step": 2880 + }, + { + "epoch": 0.01713412313255305, + "grad_norm": 2.380385160446167, + "learning_rate": 4.996381511549804e-05, + "loss": 6.3174, + "step": 2881 + }, + { + "epoch": 0.017140070415834048, + "grad_norm": 2.425537347793579, + "learning_rate": 4.996378998881226e-05, + "loss": 6.2055, + "step": 2882 + }, + { + "epoch": 0.017146017699115043, + "grad_norm": 2.4667842388153076, + "learning_rate": 4.996376485341188e-05, + "loss": 6.245, + "step": 2883 + }, + { + "epoch": 0.01715196498239604, + "grad_norm": 2.6306424140930176, + "learning_rate": 4.996373970929691e-05, + "loss": 6.1162, + "step": 2884 + }, + { + "epoch": 0.01715791226567704, + "grad_norm": 4.439255714416504, + "learning_rate": 4.996371455646736e-05, + "loss": 5.9868, + "step": 2885 + }, + { + "epoch": 0.017163859548958035, + "grad_norm": 3.3248472213745117, + "learning_rate": 4.9963689394923224e-05, + "loss": 5.861, + "step": 2886 + }, + { + "epoch": 0.017169806832239034, + "grad_norm": 2.45271897315979, + "learning_rate": 4.996366422466453e-05, + "loss": 6.1588, + "step": 2887 + }, + { + "epoch": 0.01717575411552003, + "grad_norm": 3.1748130321502686, + "learning_rate": 4.996363904569128e-05, + "loss": 6.3607, + "step": 2888 + }, + { + "epoch": 0.017181701398801028, + "grad_norm": 3.300736427307129, + "learning_rate": 4.996361385800348e-05, + "loss": 6.0709, + "step": 2889 + }, + { + "epoch": 0.017187648682082026, + "grad_norm": 2.720550060272217, + "learning_rate": 4.9963588661601136e-05, + "loss": 6.0496, + "step": 2890 + }, + { + "epoch": 0.01719359596536302, + "grad_norm": 2.251845121383667, + "learning_rate": 4.9963563456484266e-05, + "loss": 6.0088, + "step": 2891 + }, + { + "epoch": 0.01719954324864402, + "grad_norm": 2.7863035202026367, + "learning_rate": 4.996353824265288e-05, + "loss": 5.9478, + "step": 2892 + }, + { + "epoch": 0.01720549053192502, + "grad_norm": 2.831744432449341, + "learning_rate": 4.996351302010697e-05, + "loss": 6.1629, + "step": 2893 + }, + { + "epoch": 0.017211437815206013, + "grad_norm": 4.583891868591309, + "learning_rate": 4.9963487788846556e-05, + "loss": 6.7936, + "step": 2894 + }, + { + "epoch": 0.017217385098487012, + "grad_norm": 2.4525468349456787, + "learning_rate": 4.996346254887165e-05, + "loss": 6.3188, + "step": 2895 + }, + { + "epoch": 0.017223332381768007, + "grad_norm": 3.0866281986236572, + "learning_rate": 4.9963437300182254e-05, + "loss": 6.0207, + "step": 2896 + }, + { + "epoch": 0.017229279665049006, + "grad_norm": 3.1188113689422607, + "learning_rate": 4.996341204277838e-05, + "loss": 5.9873, + "step": 2897 + }, + { + "epoch": 0.017235226948330004, + "grad_norm": 2.4119350910186768, + "learning_rate": 4.996338677666004e-05, + "loss": 5.8104, + "step": 2898 + }, + { + "epoch": 0.017241174231611, + "grad_norm": 1.9601647853851318, + "learning_rate": 4.996336150182724e-05, + "loss": 6.2166, + "step": 2899 + }, + { + "epoch": 0.017247121514891998, + "grad_norm": 3.428379535675049, + "learning_rate": 4.9963336218279986e-05, + "loss": 6.4284, + "step": 2900 + }, + { + "epoch": 0.017253068798172993, + "grad_norm": 2.629446506500244, + "learning_rate": 4.996331092601829e-05, + "loss": 6.4916, + "step": 2901 + }, + { + "epoch": 0.01725901608145399, + "grad_norm": 2.3860316276550293, + "learning_rate": 4.996328562504216e-05, + "loss": 6.5035, + "step": 2902 + }, + { + "epoch": 0.01726496336473499, + "grad_norm": 2.6754682064056396, + "learning_rate": 4.996326031535161e-05, + "loss": 6.6374, + "step": 2903 + }, + { + "epoch": 0.017270910648015985, + "grad_norm": 2.737901210784912, + "learning_rate": 4.9963234996946635e-05, + "loss": 6.5023, + "step": 2904 + }, + { + "epoch": 0.017276857931296984, + "grad_norm": 2.481691837310791, + "learning_rate": 4.996320966982726e-05, + "loss": 6.5211, + "step": 2905 + }, + { + "epoch": 0.017282805214577982, + "grad_norm": 3.3993568420410156, + "learning_rate": 4.996318433399348e-05, + "loss": 6.4239, + "step": 2906 + }, + { + "epoch": 0.017288752497858977, + "grad_norm": 3.9149057865142822, + "learning_rate": 4.9963158989445316e-05, + "loss": 6.3874, + "step": 2907 + }, + { + "epoch": 0.017294699781139976, + "grad_norm": 2.3808562755584717, + "learning_rate": 4.996313363618276e-05, + "loss": 6.2887, + "step": 2908 + }, + { + "epoch": 0.01730064706442097, + "grad_norm": 2.6186649799346924, + "learning_rate": 4.996310827420585e-05, + "loss": 6.2944, + "step": 2909 + }, + { + "epoch": 0.01730659434770197, + "grad_norm": 2.5251142978668213, + "learning_rate": 4.9963082903514554e-05, + "loss": 6.0944, + "step": 2910 + }, + { + "epoch": 0.017312541630982968, + "grad_norm": 2.8212270736694336, + "learning_rate": 4.9963057524108926e-05, + "loss": 6.6621, + "step": 2911 + }, + { + "epoch": 0.017318488914263963, + "grad_norm": 2.477485418319702, + "learning_rate": 4.996303213598894e-05, + "loss": 6.3941, + "step": 2912 + }, + { + "epoch": 0.01732443619754496, + "grad_norm": 3.6508305072784424, + "learning_rate": 4.996300673915462e-05, + "loss": 6.3234, + "step": 2913 + }, + { + "epoch": 0.01733038348082596, + "grad_norm": 2.1635468006134033, + "learning_rate": 4.996298133360598e-05, + "loss": 6.2877, + "step": 2914 + }, + { + "epoch": 0.017336330764106955, + "grad_norm": 3.431082010269165, + "learning_rate": 4.9962955919343004e-05, + "loss": 6.2627, + "step": 2915 + }, + { + "epoch": 0.017342278047387954, + "grad_norm": 3.272376775741577, + "learning_rate": 4.9962930496365736e-05, + "loss": 6.1458, + "step": 2916 + }, + { + "epoch": 0.01734822533066895, + "grad_norm": 3.5927000045776367, + "learning_rate": 4.996290506467415e-05, + "loss": 5.9828, + "step": 2917 + }, + { + "epoch": 0.017354172613949947, + "grad_norm": 3.569641351699829, + "learning_rate": 4.996287962426829e-05, + "loss": 6.5957, + "step": 2918 + }, + { + "epoch": 0.017360119897230946, + "grad_norm": 3.281855344772339, + "learning_rate": 4.9962854175148134e-05, + "loss": 6.3393, + "step": 2919 + }, + { + "epoch": 0.01736606718051194, + "grad_norm": 2.6009061336517334, + "learning_rate": 4.9962828717313706e-05, + "loss": 6.3537, + "step": 2920 + }, + { + "epoch": 0.01737201446379294, + "grad_norm": 3.964467763900757, + "learning_rate": 4.996280325076501e-05, + "loss": 6.0281, + "step": 2921 + }, + { + "epoch": 0.017377961747073938, + "grad_norm": 3.9164865016937256, + "learning_rate": 4.9962777775502064e-05, + "loss": 6.5255, + "step": 2922 + }, + { + "epoch": 0.017383909030354933, + "grad_norm": 2.349709987640381, + "learning_rate": 4.996275229152486e-05, + "loss": 6.2459, + "step": 2923 + }, + { + "epoch": 0.017389856313635932, + "grad_norm": 2.5735161304473877, + "learning_rate": 4.9962726798833425e-05, + "loss": 6.0463, + "step": 2924 + }, + { + "epoch": 0.017395803596916927, + "grad_norm": 2.228271961212158, + "learning_rate": 4.9962701297427764e-05, + "loss": 6.1147, + "step": 2925 + }, + { + "epoch": 0.017401750880197926, + "grad_norm": 2.4587175846099854, + "learning_rate": 4.9962675787307875e-05, + "loss": 7.0868, + "step": 2926 + }, + { + "epoch": 0.017407698163478924, + "grad_norm": 2.2712674140930176, + "learning_rate": 4.996265026847378e-05, + "loss": 6.175, + "step": 2927 + }, + { + "epoch": 0.01741364544675992, + "grad_norm": 3.0724384784698486, + "learning_rate": 4.996262474092547e-05, + "loss": 6.5354, + "step": 2928 + }, + { + "epoch": 0.017419592730040918, + "grad_norm": 4.872220039367676, + "learning_rate": 4.996259920466297e-05, + "loss": 6.1938, + "step": 2929 + }, + { + "epoch": 0.017425540013321916, + "grad_norm": 4.508706569671631, + "learning_rate": 4.996257365968629e-05, + "loss": 6.1813, + "step": 2930 + }, + { + "epoch": 0.01743148729660291, + "grad_norm": 3.0419485569000244, + "learning_rate": 4.996254810599543e-05, + "loss": 5.9529, + "step": 2931 + }, + { + "epoch": 0.01743743457988391, + "grad_norm": 2.8372066020965576, + "learning_rate": 4.996252254359041e-05, + "loss": 5.9422, + "step": 2932 + }, + { + "epoch": 0.017443381863164905, + "grad_norm": 4.554285526275635, + "learning_rate": 4.996249697247122e-05, + "loss": 6.9073, + "step": 2933 + }, + { + "epoch": 0.017449329146445904, + "grad_norm": 3.121094226837158, + "learning_rate": 4.996247139263788e-05, + "loss": 6.2827, + "step": 2934 + }, + { + "epoch": 0.017455276429726902, + "grad_norm": 3.936596632003784, + "learning_rate": 4.996244580409041e-05, + "loss": 6.7863, + "step": 2935 + }, + { + "epoch": 0.017461223713007897, + "grad_norm": 3.5771539211273193, + "learning_rate": 4.99624202068288e-05, + "loss": 7.0691, + "step": 2936 + }, + { + "epoch": 0.017467170996288896, + "grad_norm": 2.0674471855163574, + "learning_rate": 4.996239460085307e-05, + "loss": 6.9768, + "step": 2937 + }, + { + "epoch": 0.01747311827956989, + "grad_norm": 2.600167989730835, + "learning_rate": 4.996236898616322e-05, + "loss": 6.4235, + "step": 2938 + }, + { + "epoch": 0.01747906556285089, + "grad_norm": 2.9444847106933594, + "learning_rate": 4.9962343362759267e-05, + "loss": 6.7305, + "step": 2939 + }, + { + "epoch": 0.017485012846131888, + "grad_norm": 3.721101999282837, + "learning_rate": 4.996231773064122e-05, + "loss": 6.5147, + "step": 2940 + }, + { + "epoch": 0.017490960129412883, + "grad_norm": 5.715269565582275, + "learning_rate": 4.9962292089809086e-05, + "loss": 6.1433, + "step": 2941 + }, + { + "epoch": 0.01749690741269388, + "grad_norm": 4.245530128479004, + "learning_rate": 4.996226644026287e-05, + "loss": 6.2163, + "step": 2942 + }, + { + "epoch": 0.01750285469597488, + "grad_norm": 2.7717039585113525, + "learning_rate": 4.996224078200259e-05, + "loss": 5.877, + "step": 2943 + }, + { + "epoch": 0.017508801979255875, + "grad_norm": 3.4189441204071045, + "learning_rate": 4.9962215115028255e-05, + "loss": 5.9575, + "step": 2944 + }, + { + "epoch": 0.017514749262536874, + "grad_norm": 3.754513740539551, + "learning_rate": 4.996218943933986e-05, + "loss": 5.7512, + "step": 2945 + }, + { + "epoch": 0.01752069654581787, + "grad_norm": 3.4231228828430176, + "learning_rate": 4.9962163754937426e-05, + "loss": 6.4566, + "step": 2946 + }, + { + "epoch": 0.017526643829098867, + "grad_norm": 2.7481472492218018, + "learning_rate": 4.996213806182095e-05, + "loss": 6.1385, + "step": 2947 + }, + { + "epoch": 0.017532591112379866, + "grad_norm": 2.802342414855957, + "learning_rate": 4.996211235999046e-05, + "loss": 5.6656, + "step": 2948 + }, + { + "epoch": 0.01753853839566086, + "grad_norm": 2.60530686378479, + "learning_rate": 4.996208664944595e-05, + "loss": 5.7339, + "step": 2949 + }, + { + "epoch": 0.01754448567894186, + "grad_norm": 2.476100206375122, + "learning_rate": 4.996206093018744e-05, + "loss": 6.0447, + "step": 2950 + }, + { + "epoch": 0.017550432962222858, + "grad_norm": 2.3516924381256104, + "learning_rate": 4.9962035202214916e-05, + "loss": 6.2046, + "step": 2951 + }, + { + "epoch": 0.017556380245503853, + "grad_norm": 2.447519302368164, + "learning_rate": 4.996200946552842e-05, + "loss": 6.0279, + "step": 2952 + }, + { + "epoch": 0.017562327528784852, + "grad_norm": 2.679766893386841, + "learning_rate": 4.996198372012794e-05, + "loss": 5.9072, + "step": 2953 + }, + { + "epoch": 0.017568274812065847, + "grad_norm": 2.3413944244384766, + "learning_rate": 4.9961957966013486e-05, + "loss": 5.9214, + "step": 2954 + }, + { + "epoch": 0.017574222095346845, + "grad_norm": 2.273725986480713, + "learning_rate": 4.996193220318507e-05, + "loss": 6.2107, + "step": 2955 + }, + { + "epoch": 0.017580169378627844, + "grad_norm": 2.9424052238464355, + "learning_rate": 4.99619064316427e-05, + "loss": 5.8618, + "step": 2956 + }, + { + "epoch": 0.01758611666190884, + "grad_norm": 2.40987229347229, + "learning_rate": 4.9961880651386394e-05, + "loss": 6.1306, + "step": 2957 + }, + { + "epoch": 0.017592063945189838, + "grad_norm": 2.542084217071533, + "learning_rate": 4.9961854862416144e-05, + "loss": 6.2225, + "step": 2958 + }, + { + "epoch": 0.017598011228470836, + "grad_norm": 2.06935977935791, + "learning_rate": 4.996182906473198e-05, + "loss": 5.9899, + "step": 2959 + }, + { + "epoch": 0.01760395851175183, + "grad_norm": 2.1998584270477295, + "learning_rate": 4.99618032583339e-05, + "loss": 6.2268, + "step": 2960 + }, + { + "epoch": 0.01760990579503283, + "grad_norm": 2.5595617294311523, + "learning_rate": 4.99617774432219e-05, + "loss": 6.2856, + "step": 2961 + }, + { + "epoch": 0.017615853078313825, + "grad_norm": 2.9262382984161377, + "learning_rate": 4.9961751619396e-05, + "loss": 6.2747, + "step": 2962 + }, + { + "epoch": 0.017621800361594823, + "grad_norm": 2.3705809116363525, + "learning_rate": 4.996172578685622e-05, + "loss": 6.1376, + "step": 2963 + }, + { + "epoch": 0.017627747644875822, + "grad_norm": 2.20991849899292, + "learning_rate": 4.996169994560256e-05, + "loss": 6.0118, + "step": 2964 + }, + { + "epoch": 0.017633694928156817, + "grad_norm": 2.2801706790924072, + "learning_rate": 4.996167409563502e-05, + "loss": 6.0924, + "step": 2965 + }, + { + "epoch": 0.017639642211437816, + "grad_norm": 2.5618062019348145, + "learning_rate": 4.996164823695362e-05, + "loss": 6.0931, + "step": 2966 + }, + { + "epoch": 0.01764558949471881, + "grad_norm": 2.2933573722839355, + "learning_rate": 4.996162236955837e-05, + "loss": 6.1584, + "step": 2967 + }, + { + "epoch": 0.01765153677799981, + "grad_norm": 2.2387471199035645, + "learning_rate": 4.996159649344928e-05, + "loss": 6.1224, + "step": 2968 + }, + { + "epoch": 0.017657484061280808, + "grad_norm": 2.425929069519043, + "learning_rate": 4.9961570608626347e-05, + "loss": 6.2419, + "step": 2969 + }, + { + "epoch": 0.017663431344561803, + "grad_norm": 3.0279812812805176, + "learning_rate": 4.996154471508959e-05, + "loss": 6.0478, + "step": 2970 + }, + { + "epoch": 0.0176693786278428, + "grad_norm": 2.8950276374816895, + "learning_rate": 4.9961518812839015e-05, + "loss": 5.9663, + "step": 2971 + }, + { + "epoch": 0.0176753259111238, + "grad_norm": 2.9908859729766846, + "learning_rate": 4.996149290187463e-05, + "loss": 5.8101, + "step": 2972 + }, + { + "epoch": 0.017681273194404795, + "grad_norm": 2.900987148284912, + "learning_rate": 4.996146698219645e-05, + "loss": 6.133, + "step": 2973 + }, + { + "epoch": 0.017687220477685794, + "grad_norm": 3.3194754123687744, + "learning_rate": 4.996144105380447e-05, + "loss": 5.9763, + "step": 2974 + }, + { + "epoch": 0.01769316776096679, + "grad_norm": 2.4997923374176025, + "learning_rate": 4.996141511669872e-05, + "loss": 6.1062, + "step": 2975 + }, + { + "epoch": 0.017699115044247787, + "grad_norm": 2.3048369884490967, + "learning_rate": 4.996138917087919e-05, + "loss": 6.138, + "step": 2976 + }, + { + "epoch": 0.017705062327528786, + "grad_norm": 2.3391027450561523, + "learning_rate": 4.99613632163459e-05, + "loss": 6.0612, + "step": 2977 + }, + { + "epoch": 0.01771100961080978, + "grad_norm": 2.6164605617523193, + "learning_rate": 4.996133725309886e-05, + "loss": 6.0402, + "step": 2978 + }, + { + "epoch": 0.01771695689409078, + "grad_norm": 2.6534295082092285, + "learning_rate": 4.996131128113807e-05, + "loss": 5.9027, + "step": 2979 + }, + { + "epoch": 0.017722904177371778, + "grad_norm": 2.1807172298431396, + "learning_rate": 4.996128530046354e-05, + "loss": 5.7083, + "step": 2980 + }, + { + "epoch": 0.017728851460652773, + "grad_norm": 2.433762550354004, + "learning_rate": 4.9961259311075296e-05, + "loss": 6.1587, + "step": 2981 + }, + { + "epoch": 0.017734798743933772, + "grad_norm": 2.4656107425689697, + "learning_rate": 4.996123331297333e-05, + "loss": 5.9831, + "step": 2982 + }, + { + "epoch": 0.017740746027214767, + "grad_norm": 2.536060333251953, + "learning_rate": 4.996120730615765e-05, + "loss": 5.9083, + "step": 2983 + }, + { + "epoch": 0.017746693310495765, + "grad_norm": 2.2993409633636475, + "learning_rate": 4.996118129062828e-05, + "loss": 6.0156, + "step": 2984 + }, + { + "epoch": 0.017752640593776764, + "grad_norm": 2.0221481323242188, + "learning_rate": 4.996115526638521e-05, + "loss": 5.9836, + "step": 2985 + }, + { + "epoch": 0.01775858787705776, + "grad_norm": 2.401350498199463, + "learning_rate": 4.996112923342846e-05, + "loss": 5.8071, + "step": 2986 + }, + { + "epoch": 0.017764535160338758, + "grad_norm": 2.469214677810669, + "learning_rate": 4.996110319175804e-05, + "loss": 5.8784, + "step": 2987 + }, + { + "epoch": 0.017770482443619756, + "grad_norm": 2.454481601715088, + "learning_rate": 4.9961077141373955e-05, + "loss": 5.9168, + "step": 2988 + }, + { + "epoch": 0.01777642972690075, + "grad_norm": 2.3173487186431885, + "learning_rate": 4.996105108227621e-05, + "loss": 5.8797, + "step": 2989 + }, + { + "epoch": 0.01778237701018175, + "grad_norm": 2.1967554092407227, + "learning_rate": 4.996102501446483e-05, + "loss": 5.972, + "step": 2990 + }, + { + "epoch": 0.017788324293462745, + "grad_norm": 2.1263201236724854, + "learning_rate": 4.996099893793981e-05, + "loss": 5.9301, + "step": 2991 + }, + { + "epoch": 0.017794271576743743, + "grad_norm": 2.1959195137023926, + "learning_rate": 4.9960972852701165e-05, + "loss": 6.0422, + "step": 2992 + }, + { + "epoch": 0.017800218860024742, + "grad_norm": 2.3290374279022217, + "learning_rate": 4.99609467587489e-05, + "loss": 6.1926, + "step": 2993 + }, + { + "epoch": 0.017806166143305737, + "grad_norm": 2.3518059253692627, + "learning_rate": 4.996092065608303e-05, + "loss": 5.8583, + "step": 2994 + }, + { + "epoch": 0.017812113426586736, + "grad_norm": 2.4263339042663574, + "learning_rate": 4.996089454470355e-05, + "loss": 5.8149, + "step": 2995 + }, + { + "epoch": 0.01781806070986773, + "grad_norm": 2.0764389038085938, + "learning_rate": 4.99608684246105e-05, + "loss": 5.8782, + "step": 2996 + }, + { + "epoch": 0.01782400799314873, + "grad_norm": 2.086904764175415, + "learning_rate": 4.996084229580385e-05, + "loss": 5.7885, + "step": 2997 + }, + { + "epoch": 0.017829955276429728, + "grad_norm": 2.1907291412353516, + "learning_rate": 4.996081615828363e-05, + "loss": 5.9246, + "step": 2998 + }, + { + "epoch": 0.017835902559710723, + "grad_norm": 2.4596495628356934, + "learning_rate": 4.9960790012049854e-05, + "loss": 5.7786, + "step": 2999 + }, + { + "epoch": 0.01784184984299172, + "grad_norm": 2.0762453079223633, + "learning_rate": 4.996076385710252e-05, + "loss": 5.9901, + "step": 3000 + }, + { + "epoch": 0.01784779712627272, + "grad_norm": 2.068714141845703, + "learning_rate": 4.996073769344164e-05, + "loss": 5.9437, + "step": 3001 + }, + { + "epoch": 0.017853744409553715, + "grad_norm": 2.4760496616363525, + "learning_rate": 4.9960711521067226e-05, + "loss": 5.8633, + "step": 3002 + }, + { + "epoch": 0.017859691692834714, + "grad_norm": 2.395643949508667, + "learning_rate": 4.996068533997928e-05, + "loss": 5.8024, + "step": 3003 + }, + { + "epoch": 0.01786563897611571, + "grad_norm": 2.120586633682251, + "learning_rate": 4.996065915017783e-05, + "loss": 6.0712, + "step": 3004 + }, + { + "epoch": 0.017871586259396707, + "grad_norm": 2.384794235229492, + "learning_rate": 4.9960632951662866e-05, + "loss": 5.9089, + "step": 3005 + }, + { + "epoch": 0.017877533542677706, + "grad_norm": 2.24297833442688, + "learning_rate": 4.99606067444344e-05, + "loss": 6.0263, + "step": 3006 + }, + { + "epoch": 0.0178834808259587, + "grad_norm": 1.983299732208252, + "learning_rate": 4.996058052849245e-05, + "loss": 5.8706, + "step": 3007 + }, + { + "epoch": 0.0178894281092397, + "grad_norm": 2.2866950035095215, + "learning_rate": 4.996055430383701e-05, + "loss": 5.9031, + "step": 3008 + }, + { + "epoch": 0.017895375392520698, + "grad_norm": 2.3343560695648193, + "learning_rate": 4.996052807046811e-05, + "loss": 5.9155, + "step": 3009 + }, + { + "epoch": 0.017901322675801693, + "grad_norm": 2.079763650894165, + "learning_rate": 4.9960501828385734e-05, + "loss": 5.8102, + "step": 3010 + }, + { + "epoch": 0.01790726995908269, + "grad_norm": 2.0398895740509033, + "learning_rate": 4.996047557758991e-05, + "loss": 5.773, + "step": 3011 + }, + { + "epoch": 0.017913217242363687, + "grad_norm": 2.2478318214416504, + "learning_rate": 4.996044931808064e-05, + "loss": 5.8584, + "step": 3012 + }, + { + "epoch": 0.017919164525644685, + "grad_norm": 2.301398992538452, + "learning_rate": 4.996042304985794e-05, + "loss": 5.9053, + "step": 3013 + }, + { + "epoch": 0.017925111808925684, + "grad_norm": 2.0428216457366943, + "learning_rate": 4.996039677292181e-05, + "loss": 5.9571, + "step": 3014 + }, + { + "epoch": 0.01793105909220668, + "grad_norm": 2.049572467803955, + "learning_rate": 4.9960370487272266e-05, + "loss": 5.9464, + "step": 3015 + }, + { + "epoch": 0.017937006375487678, + "grad_norm": 2.1681618690490723, + "learning_rate": 4.996034419290931e-05, + "loss": 5.9969, + "step": 3016 + }, + { + "epoch": 0.017942953658768676, + "grad_norm": 2.3879425525665283, + "learning_rate": 4.996031788983296e-05, + "loss": 5.7962, + "step": 3017 + }, + { + "epoch": 0.01794890094204967, + "grad_norm": 2.232508420944214, + "learning_rate": 4.996029157804323e-05, + "loss": 5.8479, + "step": 3018 + }, + { + "epoch": 0.01795484822533067, + "grad_norm": 2.222257137298584, + "learning_rate": 4.9960265257540104e-05, + "loss": 5.952, + "step": 3019 + }, + { + "epoch": 0.017960795508611665, + "grad_norm": 2.213777542114258, + "learning_rate": 4.996023892832362e-05, + "loss": 5.9891, + "step": 3020 + }, + { + "epoch": 0.017966742791892663, + "grad_norm": 2.286097764968872, + "learning_rate": 4.996021259039377e-05, + "loss": 5.8995, + "step": 3021 + }, + { + "epoch": 0.017972690075173662, + "grad_norm": 2.1588432788848877, + "learning_rate": 4.996018624375056e-05, + "loss": 5.988, + "step": 3022 + }, + { + "epoch": 0.017978637358454657, + "grad_norm": 2.2468602657318115, + "learning_rate": 4.996015988839402e-05, + "loss": 5.9303, + "step": 3023 + }, + { + "epoch": 0.017984584641735656, + "grad_norm": 2.1732120513916016, + "learning_rate": 4.9960133524324135e-05, + "loss": 5.8696, + "step": 3024 + }, + { + "epoch": 0.01799053192501665, + "grad_norm": 2.2985105514526367, + "learning_rate": 4.996010715154093e-05, + "loss": 5.9251, + "step": 3025 + }, + { + "epoch": 0.01799647920829765, + "grad_norm": 2.1920788288116455, + "learning_rate": 4.996008077004441e-05, + "loss": 5.8023, + "step": 3026 + }, + { + "epoch": 0.018002426491578648, + "grad_norm": 1.9393725395202637, + "learning_rate": 4.996005437983458e-05, + "loss": 5.9576, + "step": 3027 + }, + { + "epoch": 0.018008373774859643, + "grad_norm": 2.115035057067871, + "learning_rate": 4.9960027980911455e-05, + "loss": 5.9105, + "step": 3028 + }, + { + "epoch": 0.01801432105814064, + "grad_norm": 2.143432855606079, + "learning_rate": 4.996000157327504e-05, + "loss": 5.9951, + "step": 3029 + }, + { + "epoch": 0.01802026834142164, + "grad_norm": 2.4353296756744385, + "learning_rate": 4.995997515692536e-05, + "loss": 5.9761, + "step": 3030 + }, + { + "epoch": 0.018026215624702635, + "grad_norm": 1.999054193496704, + "learning_rate": 4.995994873186239e-05, + "loss": 6.028, + "step": 3031 + }, + { + "epoch": 0.018032162907983634, + "grad_norm": 2.05645751953125, + "learning_rate": 4.995992229808617e-05, + "loss": 5.9778, + "step": 3032 + }, + { + "epoch": 0.01803811019126463, + "grad_norm": 1.948923110961914, + "learning_rate": 4.99598958555967e-05, + "loss": 5.8735, + "step": 3033 + }, + { + "epoch": 0.018044057474545627, + "grad_norm": 2.1208486557006836, + "learning_rate": 4.995986940439399e-05, + "loss": 5.7913, + "step": 3034 + }, + { + "epoch": 0.018050004757826626, + "grad_norm": 2.051079750061035, + "learning_rate": 4.995984294447804e-05, + "loss": 5.8097, + "step": 3035 + }, + { + "epoch": 0.01805595204110762, + "grad_norm": 2.021207571029663, + "learning_rate": 4.995981647584887e-05, + "loss": 5.8425, + "step": 3036 + }, + { + "epoch": 0.01806189932438862, + "grad_norm": 2.471315622329712, + "learning_rate": 4.995978999850649e-05, + "loss": 5.7735, + "step": 3037 + }, + { + "epoch": 0.018067846607669618, + "grad_norm": 2.604836940765381, + "learning_rate": 4.9959763512450896e-05, + "loss": 6.4525, + "step": 3038 + }, + { + "epoch": 0.018073793890950613, + "grad_norm": 2.375361919403076, + "learning_rate": 4.995973701768212e-05, + "loss": 5.8072, + "step": 3039 + }, + { + "epoch": 0.01807974117423161, + "grad_norm": 2.354280471801758, + "learning_rate": 4.995971051420014e-05, + "loss": 5.9434, + "step": 3040 + }, + { + "epoch": 0.018085688457512607, + "grad_norm": 2.7335755825042725, + "learning_rate": 4.9959684002005e-05, + "loss": 5.5899, + "step": 3041 + }, + { + "epoch": 0.018091635740793605, + "grad_norm": 2.244917869567871, + "learning_rate": 4.995965748109668e-05, + "loss": 5.799, + "step": 3042 + }, + { + "epoch": 0.018097583024074604, + "grad_norm": 2.2413697242736816, + "learning_rate": 4.995963095147521e-05, + "loss": 5.8635, + "step": 3043 + }, + { + "epoch": 0.0181035303073556, + "grad_norm": 2.122586488723755, + "learning_rate": 4.9959604413140584e-05, + "loss": 5.8098, + "step": 3044 + }, + { + "epoch": 0.018109477590636597, + "grad_norm": 2.407517910003662, + "learning_rate": 4.995957786609282e-05, + "loss": 6.0319, + "step": 3045 + }, + { + "epoch": 0.018115424873917596, + "grad_norm": 2.5628743171691895, + "learning_rate": 4.9959551310331934e-05, + "loss": 5.9561, + "step": 3046 + }, + { + "epoch": 0.01812137215719859, + "grad_norm": 2.335650682449341, + "learning_rate": 4.995952474585791e-05, + "loss": 6.1168, + "step": 3047 + }, + { + "epoch": 0.01812731944047959, + "grad_norm": 2.169771432876587, + "learning_rate": 4.995949817267078e-05, + "loss": 6.0555, + "step": 3048 + }, + { + "epoch": 0.018133266723760585, + "grad_norm": 2.2245211601257324, + "learning_rate": 4.995947159077056e-05, + "loss": 5.9084, + "step": 3049 + }, + { + "epoch": 0.018139214007041583, + "grad_norm": 2.2296931743621826, + "learning_rate": 4.995944500015723e-05, + "loss": 5.8878, + "step": 3050 + }, + { + "epoch": 0.018145161290322582, + "grad_norm": 2.2372493743896484, + "learning_rate": 4.995941840083082e-05, + "loss": 5.9521, + "step": 3051 + }, + { + "epoch": 0.018151108573603577, + "grad_norm": 2.1773006916046143, + "learning_rate": 4.995939179279134e-05, + "loss": 5.899, + "step": 3052 + }, + { + "epoch": 0.018157055856884576, + "grad_norm": 2.218245267868042, + "learning_rate": 4.995936517603879e-05, + "loss": 6.0311, + "step": 3053 + }, + { + "epoch": 0.018163003140165574, + "grad_norm": 2.2877273559570312, + "learning_rate": 4.995933855057318e-05, + "loss": 6.0052, + "step": 3054 + }, + { + "epoch": 0.01816895042344657, + "grad_norm": 2.225764751434326, + "learning_rate": 4.995931191639453e-05, + "loss": 6.0373, + "step": 3055 + }, + { + "epoch": 0.018174897706727568, + "grad_norm": 2.5069313049316406, + "learning_rate": 4.995928527350284e-05, + "loss": 5.8729, + "step": 3056 + }, + { + "epoch": 0.018180844990008563, + "grad_norm": 2.089759588241577, + "learning_rate": 4.995925862189812e-05, + "loss": 5.9462, + "step": 3057 + }, + { + "epoch": 0.01818679227328956, + "grad_norm": 2.0159049034118652, + "learning_rate": 4.9959231961580376e-05, + "loss": 5.9276, + "step": 3058 + }, + { + "epoch": 0.01819273955657056, + "grad_norm": 2.207636594772339, + "learning_rate": 4.995920529254963e-05, + "loss": 5.9921, + "step": 3059 + }, + { + "epoch": 0.018198686839851555, + "grad_norm": 2.380232810974121, + "learning_rate": 4.995917861480588e-05, + "loss": 5.9092, + "step": 3060 + }, + { + "epoch": 0.018204634123132554, + "grad_norm": 2.073237895965576, + "learning_rate": 4.9959151928349134e-05, + "loss": 5.8472, + "step": 3061 + }, + { + "epoch": 0.01821058140641355, + "grad_norm": 1.824062705039978, + "learning_rate": 4.995912523317942e-05, + "loss": 5.7958, + "step": 3062 + }, + { + "epoch": 0.018216528689694547, + "grad_norm": 2.3961215019226074, + "learning_rate": 4.995909852929672e-05, + "loss": 6.1388, + "step": 3063 + }, + { + "epoch": 0.018222475972975546, + "grad_norm": 2.8391239643096924, + "learning_rate": 4.9959071816701065e-05, + "loss": 5.7564, + "step": 3064 + }, + { + "epoch": 0.01822842325625654, + "grad_norm": 2.4684112071990967, + "learning_rate": 4.995904509539244e-05, + "loss": 5.8372, + "step": 3065 + }, + { + "epoch": 0.01823437053953754, + "grad_norm": 2.419983386993408, + "learning_rate": 4.995901836537089e-05, + "loss": 5.9332, + "step": 3066 + }, + { + "epoch": 0.018240317822818538, + "grad_norm": 2.500227928161621, + "learning_rate": 4.99589916266364e-05, + "loss": 6.0848, + "step": 3067 + }, + { + "epoch": 0.018246265106099533, + "grad_norm": 2.1683971881866455, + "learning_rate": 4.9958964879188976e-05, + "loss": 6.0911, + "step": 3068 + }, + { + "epoch": 0.01825221238938053, + "grad_norm": 2.2345223426818848, + "learning_rate": 4.995893812302864e-05, + "loss": 6.016, + "step": 3069 + }, + { + "epoch": 0.018258159672661527, + "grad_norm": 2.318321466445923, + "learning_rate": 4.995891135815539e-05, + "loss": 5.9622, + "step": 3070 + }, + { + "epoch": 0.018264106955942525, + "grad_norm": 2.294602155685425, + "learning_rate": 4.9958884584569255e-05, + "loss": 5.8908, + "step": 3071 + }, + { + "epoch": 0.018270054239223524, + "grad_norm": 2.5472419261932373, + "learning_rate": 4.995885780227022e-05, + "loss": 5.7906, + "step": 3072 + }, + { + "epoch": 0.01827600152250452, + "grad_norm": 2.319101095199585, + "learning_rate": 4.995883101125831e-05, + "loss": 6.3366, + "step": 3073 + }, + { + "epoch": 0.018281948805785517, + "grad_norm": 2.3564186096191406, + "learning_rate": 4.995880421153353e-05, + "loss": 5.9863, + "step": 3074 + }, + { + "epoch": 0.018287896089066516, + "grad_norm": 2.434756278991699, + "learning_rate": 4.995877740309589e-05, + "loss": 5.885, + "step": 3075 + }, + { + "epoch": 0.01829384337234751, + "grad_norm": 2.062861442565918, + "learning_rate": 4.99587505859454e-05, + "loss": 6.0813, + "step": 3076 + }, + { + "epoch": 0.01829979065562851, + "grad_norm": 2.127049684524536, + "learning_rate": 4.995872376008206e-05, + "loss": 6.1226, + "step": 3077 + }, + { + "epoch": 0.018305737938909505, + "grad_norm": 2.288405656814575, + "learning_rate": 4.995869692550589e-05, + "loss": 5.9625, + "step": 3078 + }, + { + "epoch": 0.018311685222190503, + "grad_norm": 2.2387006282806396, + "learning_rate": 4.9958670082216905e-05, + "loss": 5.9479, + "step": 3079 + }, + { + "epoch": 0.018317632505471502, + "grad_norm": 2.18864107131958, + "learning_rate": 4.9958643230215096e-05, + "loss": 5.9223, + "step": 3080 + }, + { + "epoch": 0.018323579788752497, + "grad_norm": 2.3457415103912354, + "learning_rate": 4.995861636950049e-05, + "loss": 5.7857, + "step": 3081 + }, + { + "epoch": 0.018329527072033495, + "grad_norm": 2.6946494579315186, + "learning_rate": 4.995858950007309e-05, + "loss": 5.5546, + "step": 3082 + }, + { + "epoch": 0.018335474355314494, + "grad_norm": 2.5135412216186523, + "learning_rate": 4.99585626219329e-05, + "loss": 5.5624, + "step": 3083 + }, + { + "epoch": 0.01834142163859549, + "grad_norm": 2.6617767810821533, + "learning_rate": 4.9958535735079934e-05, + "loss": 5.8789, + "step": 3084 + }, + { + "epoch": 0.018347368921876488, + "grad_norm": 2.099261522293091, + "learning_rate": 4.9958508839514196e-05, + "loss": 5.9365, + "step": 3085 + }, + { + "epoch": 0.018353316205157483, + "grad_norm": 2.5267064571380615, + "learning_rate": 4.9958481935235715e-05, + "loss": 6.0935, + "step": 3086 + }, + { + "epoch": 0.01835926348843848, + "grad_norm": 2.3353283405303955, + "learning_rate": 4.995845502224447e-05, + "loss": 5.909, + "step": 3087 + }, + { + "epoch": 0.01836521077171948, + "grad_norm": 2.396430492401123, + "learning_rate": 4.9958428100540496e-05, + "loss": 6.0272, + "step": 3088 + }, + { + "epoch": 0.018371158055000475, + "grad_norm": 2.095308303833008, + "learning_rate": 4.9958401170123784e-05, + "loss": 5.9791, + "step": 3089 + }, + { + "epoch": 0.018377105338281473, + "grad_norm": 2.7606077194213867, + "learning_rate": 4.9958374230994357e-05, + "loss": 5.9716, + "step": 3090 + }, + { + "epoch": 0.01838305262156247, + "grad_norm": 2.4490914344787598, + "learning_rate": 4.995834728315222e-05, + "loss": 5.8763, + "step": 3091 + }, + { + "epoch": 0.018388999904843467, + "grad_norm": 2.709092855453491, + "learning_rate": 4.9958320326597385e-05, + "loss": 5.74, + "step": 3092 + }, + { + "epoch": 0.018394947188124466, + "grad_norm": 2.8829305171966553, + "learning_rate": 4.9958293361329856e-05, + "loss": 5.8469, + "step": 3093 + }, + { + "epoch": 0.01840089447140546, + "grad_norm": 2.6500396728515625, + "learning_rate": 4.995826638734964e-05, + "loss": 5.8578, + "step": 3094 + }, + { + "epoch": 0.01840684175468646, + "grad_norm": 2.0665056705474854, + "learning_rate": 4.9958239404656755e-05, + "loss": 5.9662, + "step": 3095 + }, + { + "epoch": 0.018412789037967458, + "grad_norm": 2.3198931217193604, + "learning_rate": 4.9958212413251205e-05, + "loss": 6.0663, + "step": 3096 + }, + { + "epoch": 0.018418736321248453, + "grad_norm": 2.9056031703948975, + "learning_rate": 4.9958185413133e-05, + "loss": 5.8015, + "step": 3097 + }, + { + "epoch": 0.01842468360452945, + "grad_norm": 2.446164131164551, + "learning_rate": 4.995815840430216e-05, + "loss": 5.6878, + "step": 3098 + }, + { + "epoch": 0.018430630887810447, + "grad_norm": 2.797506093978882, + "learning_rate": 4.995813138675867e-05, + "loss": 5.7675, + "step": 3099 + }, + { + "epoch": 0.018436578171091445, + "grad_norm": 3.2914962768554688, + "learning_rate": 4.995810436050256e-05, + "loss": 6.3661, + "step": 3100 + }, + { + "epoch": 0.018442525454372444, + "grad_norm": 2.444363594055176, + "learning_rate": 4.995807732553384e-05, + "loss": 5.9251, + "step": 3101 + }, + { + "epoch": 0.01844847273765344, + "grad_norm": 2.526951551437378, + "learning_rate": 4.9958050281852505e-05, + "loss": 5.8202, + "step": 3102 + }, + { + "epoch": 0.018454420020934437, + "grad_norm": 2.2046117782592773, + "learning_rate": 4.995802322945857e-05, + "loss": 6.0572, + "step": 3103 + }, + { + "epoch": 0.018460367304215436, + "grad_norm": 2.5484018325805664, + "learning_rate": 4.9957996168352055e-05, + "loss": 6.1215, + "step": 3104 + }, + { + "epoch": 0.01846631458749643, + "grad_norm": 2.4785003662109375, + "learning_rate": 4.9957969098532965e-05, + "loss": 5.9524, + "step": 3105 + }, + { + "epoch": 0.01847226187077743, + "grad_norm": 2.9028711318969727, + "learning_rate": 4.9957942020001294e-05, + "loss": 6.1175, + "step": 3106 + }, + { + "epoch": 0.018478209154058425, + "grad_norm": 2.1766602993011475, + "learning_rate": 4.995791493275707e-05, + "loss": 5.9746, + "step": 3107 + }, + { + "epoch": 0.018484156437339423, + "grad_norm": 2.079423189163208, + "learning_rate": 4.995788783680029e-05, + "loss": 5.9463, + "step": 3108 + }, + { + "epoch": 0.018490103720620422, + "grad_norm": 2.285184144973755, + "learning_rate": 4.995786073213098e-05, + "loss": 5.5174, + "step": 3109 + }, + { + "epoch": 0.018496051003901417, + "grad_norm": 2.170018196105957, + "learning_rate": 4.9957833618749126e-05, + "loss": 5.7948, + "step": 3110 + }, + { + "epoch": 0.018501998287182415, + "grad_norm": 2.284517526626587, + "learning_rate": 4.9957806496654754e-05, + "loss": 5.9455, + "step": 3111 + }, + { + "epoch": 0.018507945570463414, + "grad_norm": 2.5539982318878174, + "learning_rate": 4.9957779365847876e-05, + "loss": 5.9791, + "step": 3112 + }, + { + "epoch": 0.01851389285374441, + "grad_norm": 2.1735522747039795, + "learning_rate": 4.995775222632849e-05, + "loss": 5.9549, + "step": 3113 + }, + { + "epoch": 0.018519840137025408, + "grad_norm": 2.2272653579711914, + "learning_rate": 4.995772507809662e-05, + "loss": 5.8618, + "step": 3114 + }, + { + "epoch": 0.018525787420306403, + "grad_norm": 1.9390417337417603, + "learning_rate": 4.995769792115225e-05, + "loss": 5.9617, + "step": 3115 + }, + { + "epoch": 0.0185317347035874, + "grad_norm": 2.6526312828063965, + "learning_rate": 4.9957670755495414e-05, + "loss": 5.9296, + "step": 3116 + }, + { + "epoch": 0.0185376819868684, + "grad_norm": 2.533996105194092, + "learning_rate": 4.995764358112611e-05, + "loss": 6.0045, + "step": 3117 + }, + { + "epoch": 0.018543629270149395, + "grad_norm": 2.183347225189209, + "learning_rate": 4.995761639804436e-05, + "loss": 5.9254, + "step": 3118 + }, + { + "epoch": 0.018549576553430393, + "grad_norm": 1.9411321878433228, + "learning_rate": 4.995758920625015e-05, + "loss": 5.9404, + "step": 3119 + }, + { + "epoch": 0.01855552383671139, + "grad_norm": 4.914453029632568, + "learning_rate": 4.9957562005743514e-05, + "loss": 5.8139, + "step": 3120 + }, + { + "epoch": 0.018561471119992387, + "grad_norm": 2.3052754402160645, + "learning_rate": 4.9957534796524444e-05, + "loss": 5.6525, + "step": 3121 + }, + { + "epoch": 0.018567418403273386, + "grad_norm": 2.424464464187622, + "learning_rate": 4.995750757859296e-05, + "loss": 5.9599, + "step": 3122 + }, + { + "epoch": 0.01857336568655438, + "grad_norm": 2.1392033100128174, + "learning_rate": 4.995748035194907e-05, + "loss": 5.9558, + "step": 3123 + }, + { + "epoch": 0.01857931296983538, + "grad_norm": 4.67656135559082, + "learning_rate": 4.995745311659278e-05, + "loss": 5.7606, + "step": 3124 + }, + { + "epoch": 0.018585260253116378, + "grad_norm": 2.0772082805633545, + "learning_rate": 4.99574258725241e-05, + "loss": 5.9328, + "step": 3125 + }, + { + "epoch": 0.018591207536397373, + "grad_norm": 2.0255486965179443, + "learning_rate": 4.995739861974303e-05, + "loss": 5.9395, + "step": 3126 + }, + { + "epoch": 0.01859715481967837, + "grad_norm": 2.3629064559936523, + "learning_rate": 4.995737135824961e-05, + "loss": 5.9663, + "step": 3127 + }, + { + "epoch": 0.018603102102959367, + "grad_norm": 1.9924237728118896, + "learning_rate": 4.9957344088043814e-05, + "loss": 5.8998, + "step": 3128 + }, + { + "epoch": 0.018609049386240365, + "grad_norm": 2.096774101257324, + "learning_rate": 4.9957316809125676e-05, + "loss": 5.7178, + "step": 3129 + }, + { + "epoch": 0.018614996669521364, + "grad_norm": 2.2288100719451904, + "learning_rate": 4.9957289521495194e-05, + "loss": 5.9096, + "step": 3130 + }, + { + "epoch": 0.01862094395280236, + "grad_norm": 2.456099033355713, + "learning_rate": 4.995726222515238e-05, + "loss": 5.7738, + "step": 3131 + }, + { + "epoch": 0.018626891236083357, + "grad_norm": 2.238218069076538, + "learning_rate": 4.995723492009724e-05, + "loss": 5.6929, + "step": 3132 + }, + { + "epoch": 0.018632838519364356, + "grad_norm": 1.8309845924377441, + "learning_rate": 4.9957207606329795e-05, + "loss": 5.9339, + "step": 3133 + }, + { + "epoch": 0.01863878580264535, + "grad_norm": 1.9269503355026245, + "learning_rate": 4.995718028385003e-05, + "loss": 5.9704, + "step": 3134 + }, + { + "epoch": 0.01864473308592635, + "grad_norm": 2.0929813385009766, + "learning_rate": 4.9957152952657995e-05, + "loss": 5.7598, + "step": 3135 + }, + { + "epoch": 0.018650680369207345, + "grad_norm": 2.2813265323638916, + "learning_rate": 4.995712561275366e-05, + "loss": 5.7986, + "step": 3136 + }, + { + "epoch": 0.018656627652488343, + "grad_norm": 2.1189653873443604, + "learning_rate": 4.995709826413705e-05, + "loss": 5.6603, + "step": 3137 + }, + { + "epoch": 0.01866257493576934, + "grad_norm": 2.1439480781555176, + "learning_rate": 4.9957070906808185e-05, + "loss": 5.6952, + "step": 3138 + }, + { + "epoch": 0.018668522219050337, + "grad_norm": 2.4345993995666504, + "learning_rate": 4.995704354076706e-05, + "loss": 5.7531, + "step": 3139 + }, + { + "epoch": 0.018674469502331335, + "grad_norm": 2.5551047325134277, + "learning_rate": 4.995701616601368e-05, + "loss": 5.544, + "step": 3140 + }, + { + "epoch": 0.018680416785612334, + "grad_norm": 2.333603620529175, + "learning_rate": 4.9956988782548075e-05, + "loss": 5.5732, + "step": 3141 + }, + { + "epoch": 0.01868636406889333, + "grad_norm": 2.2983827590942383, + "learning_rate": 4.995696139037024e-05, + "loss": 5.8779, + "step": 3142 + }, + { + "epoch": 0.018692311352174328, + "grad_norm": 2.7525672912597656, + "learning_rate": 4.995693398948018e-05, + "loss": 5.5998, + "step": 3143 + }, + { + "epoch": 0.018698258635455323, + "grad_norm": 2.3622052669525146, + "learning_rate": 4.995690657987793e-05, + "loss": 5.8851, + "step": 3144 + }, + { + "epoch": 0.01870420591873632, + "grad_norm": 2.4975669384002686, + "learning_rate": 4.995687916156346e-05, + "loss": 5.6388, + "step": 3145 + }, + { + "epoch": 0.01871015320201732, + "grad_norm": 2.5763049125671387, + "learning_rate": 4.9956851734536816e-05, + "loss": 5.4931, + "step": 3146 + }, + { + "epoch": 0.018716100485298315, + "grad_norm": 2.7156779766082764, + "learning_rate": 4.995682429879799e-05, + "loss": 5.8035, + "step": 3147 + }, + { + "epoch": 0.018722047768579313, + "grad_norm": 2.259134292602539, + "learning_rate": 4.995679685434699e-05, + "loss": 5.9519, + "step": 3148 + }, + { + "epoch": 0.018727995051860312, + "grad_norm": 2.544829845428467, + "learning_rate": 4.995676940118383e-05, + "loss": 5.7373, + "step": 3149 + }, + { + "epoch": 0.018733942335141307, + "grad_norm": 2.326660633087158, + "learning_rate": 4.995674193930853e-05, + "loss": 5.7719, + "step": 3150 + }, + { + "epoch": 0.018739889618422306, + "grad_norm": 2.25370192527771, + "learning_rate": 4.995671446872108e-05, + "loss": 5.813, + "step": 3151 + }, + { + "epoch": 0.0187458369017033, + "grad_norm": 2.1467692852020264, + "learning_rate": 4.99566869894215e-05, + "loss": 5.5836, + "step": 3152 + }, + { + "epoch": 0.0187517841849843, + "grad_norm": 2.30096697807312, + "learning_rate": 4.9956659501409796e-05, + "loss": 5.8249, + "step": 3153 + }, + { + "epoch": 0.018757731468265298, + "grad_norm": 2.3050386905670166, + "learning_rate": 4.9956632004685986e-05, + "loss": 5.6806, + "step": 3154 + }, + { + "epoch": 0.018763678751546293, + "grad_norm": 2.473008632659912, + "learning_rate": 4.995660449925007e-05, + "loss": 5.4512, + "step": 3155 + }, + { + "epoch": 0.01876962603482729, + "grad_norm": 2.0691702365875244, + "learning_rate": 4.995657698510206e-05, + "loss": 5.6582, + "step": 3156 + }, + { + "epoch": 0.018775573318108287, + "grad_norm": 2.332423686981201, + "learning_rate": 4.995654946224197e-05, + "loss": 5.6017, + "step": 3157 + }, + { + "epoch": 0.018781520601389285, + "grad_norm": 2.6423730850219727, + "learning_rate": 4.9956521930669806e-05, + "loss": 5.619, + "step": 3158 + }, + { + "epoch": 0.018787467884670284, + "grad_norm": 3.0884950160980225, + "learning_rate": 4.995649439038558e-05, + "loss": 5.7813, + "step": 3159 + }, + { + "epoch": 0.01879341516795128, + "grad_norm": 2.4923598766326904, + "learning_rate": 4.995646684138929e-05, + "loss": 5.8089, + "step": 3160 + }, + { + "epoch": 0.018799362451232277, + "grad_norm": 2.5505683422088623, + "learning_rate": 4.9956439283680965e-05, + "loss": 5.8171, + "step": 3161 + }, + { + "epoch": 0.018805309734513276, + "grad_norm": 2.7343056201934814, + "learning_rate": 4.99564117172606e-05, + "loss": 6.3472, + "step": 3162 + }, + { + "epoch": 0.01881125701779427, + "grad_norm": 2.9170796871185303, + "learning_rate": 4.995638414212821e-05, + "loss": 5.7478, + "step": 3163 + }, + { + "epoch": 0.01881720430107527, + "grad_norm": 2.392648696899414, + "learning_rate": 4.9956356558283815e-05, + "loss": 5.8105, + "step": 3164 + }, + { + "epoch": 0.018823151584356265, + "grad_norm": 2.532207727432251, + "learning_rate": 4.9956328965727394e-05, + "loss": 5.9285, + "step": 3165 + }, + { + "epoch": 0.018829098867637263, + "grad_norm": 2.6717050075531006, + "learning_rate": 4.995630136445899e-05, + "loss": 6.0344, + "step": 3166 + }, + { + "epoch": 0.01883504615091826, + "grad_norm": 2.1829564571380615, + "learning_rate": 4.99562737544786e-05, + "loss": 6.0078, + "step": 3167 + }, + { + "epoch": 0.018840993434199257, + "grad_norm": 2.2728323936462402, + "learning_rate": 4.995624613578622e-05, + "loss": 5.8211, + "step": 3168 + }, + { + "epoch": 0.018846940717480255, + "grad_norm": 2.046717882156372, + "learning_rate": 4.995621850838189e-05, + "loss": 5.9685, + "step": 3169 + }, + { + "epoch": 0.018852888000761254, + "grad_norm": 2.737494945526123, + "learning_rate": 4.995619087226559e-05, + "loss": 5.649, + "step": 3170 + }, + { + "epoch": 0.01885883528404225, + "grad_norm": 2.276503801345825, + "learning_rate": 4.9956163227437345e-05, + "loss": 5.8137, + "step": 3171 + }, + { + "epoch": 0.018864782567323247, + "grad_norm": 2.2799227237701416, + "learning_rate": 4.9956135573897155e-05, + "loss": 5.8277, + "step": 3172 + }, + { + "epoch": 0.018870729850604243, + "grad_norm": 2.131425619125366, + "learning_rate": 4.995610791164505e-05, + "loss": 5.8909, + "step": 3173 + }, + { + "epoch": 0.01887667713388524, + "grad_norm": 2.2295737266540527, + "learning_rate": 4.995608024068102e-05, + "loss": 5.8236, + "step": 3174 + }, + { + "epoch": 0.01888262441716624, + "grad_norm": 2.30082631111145, + "learning_rate": 4.9956052561005076e-05, + "loss": 5.7331, + "step": 3175 + }, + { + "epoch": 0.018888571700447235, + "grad_norm": 2.751847505569458, + "learning_rate": 4.9956024872617225e-05, + "loss": 5.8673, + "step": 3176 + }, + { + "epoch": 0.018894518983728233, + "grad_norm": 2.4597535133361816, + "learning_rate": 4.995599717551749e-05, + "loss": 5.7561, + "step": 3177 + }, + { + "epoch": 0.018900466267009232, + "grad_norm": 2.1418228149414062, + "learning_rate": 4.9955969469705874e-05, + "loss": 5.7112, + "step": 3178 + }, + { + "epoch": 0.018906413550290227, + "grad_norm": 2.0560619831085205, + "learning_rate": 4.9955941755182395e-05, + "loss": 5.7764, + "step": 3179 + }, + { + "epoch": 0.018912360833571226, + "grad_norm": 2.268781900405884, + "learning_rate": 4.9955914031947046e-05, + "loss": 5.7319, + "step": 3180 + }, + { + "epoch": 0.01891830811685222, + "grad_norm": 2.6272811889648438, + "learning_rate": 4.995588629999985e-05, + "loss": 6.0601, + "step": 3181 + }, + { + "epoch": 0.01892425540013322, + "grad_norm": 2.1991870403289795, + "learning_rate": 4.995585855934081e-05, + "loss": 5.602, + "step": 3182 + }, + { + "epoch": 0.018930202683414218, + "grad_norm": 2.0521514415740967, + "learning_rate": 4.995583080996994e-05, + "loss": 5.8075, + "step": 3183 + }, + { + "epoch": 0.018936149966695213, + "grad_norm": 2.153473138809204, + "learning_rate": 4.995580305188724e-05, + "loss": 5.8219, + "step": 3184 + }, + { + "epoch": 0.01894209724997621, + "grad_norm": 2.0663251876831055, + "learning_rate": 4.9955775285092735e-05, + "loss": 5.836, + "step": 3185 + }, + { + "epoch": 0.018948044533257206, + "grad_norm": 1.8808318376541138, + "learning_rate": 4.995574750958642e-05, + "loss": 5.7938, + "step": 3186 + }, + { + "epoch": 0.018953991816538205, + "grad_norm": 2.256012201309204, + "learning_rate": 4.995571972536831e-05, + "loss": 5.6404, + "step": 3187 + }, + { + "epoch": 0.018959939099819204, + "grad_norm": 2.29636287689209, + "learning_rate": 4.995569193243843e-05, + "loss": 5.7161, + "step": 3188 + }, + { + "epoch": 0.0189658863831002, + "grad_norm": 2.728804588317871, + "learning_rate": 4.995566413079676e-05, + "loss": 5.8165, + "step": 3189 + }, + { + "epoch": 0.018971833666381197, + "grad_norm": 2.3115599155426025, + "learning_rate": 4.995563632044333e-05, + "loss": 5.7004, + "step": 3190 + }, + { + "epoch": 0.018977780949662196, + "grad_norm": 2.1607725620269775, + "learning_rate": 4.995560850137815e-05, + "loss": 5.7788, + "step": 3191 + }, + { + "epoch": 0.01898372823294319, + "grad_norm": 2.322132110595703, + "learning_rate": 4.995558067360122e-05, + "loss": 5.5677, + "step": 3192 + }, + { + "epoch": 0.01898967551622419, + "grad_norm": 2.148022174835205, + "learning_rate": 4.995555283711256e-05, + "loss": 5.7708, + "step": 3193 + }, + { + "epoch": 0.018995622799505184, + "grad_norm": 2.339812994003296, + "learning_rate": 4.9955524991912165e-05, + "loss": 5.7945, + "step": 3194 + }, + { + "epoch": 0.019001570082786183, + "grad_norm": 1.9469980001449585, + "learning_rate": 4.995549713800006e-05, + "loss": 5.695, + "step": 3195 + }, + { + "epoch": 0.01900751736606718, + "grad_norm": 2.1744890213012695, + "learning_rate": 4.9955469275376254e-05, + "loss": 5.7544, + "step": 3196 + }, + { + "epoch": 0.019013464649348177, + "grad_norm": 2.175123691558838, + "learning_rate": 4.9955441404040745e-05, + "loss": 5.598, + "step": 3197 + }, + { + "epoch": 0.019019411932629175, + "grad_norm": 2.3011369705200195, + "learning_rate": 4.995541352399355e-05, + "loss": 5.7069, + "step": 3198 + }, + { + "epoch": 0.019025359215910174, + "grad_norm": 2.2227025032043457, + "learning_rate": 4.9955385635234675e-05, + "loss": 5.6854, + "step": 3199 + }, + { + "epoch": 0.01903130649919117, + "grad_norm": 2.5465073585510254, + "learning_rate": 4.995535773776414e-05, + "loss": 5.9085, + "step": 3200 + }, + { + "epoch": 0.019037253782472167, + "grad_norm": 2.936612844467163, + "learning_rate": 4.995532983158194e-05, + "loss": 6.0519, + "step": 3201 + }, + { + "epoch": 0.019043201065753163, + "grad_norm": 2.8298418521881104, + "learning_rate": 4.9955301916688094e-05, + "loss": 5.9473, + "step": 3202 + }, + { + "epoch": 0.01904914834903416, + "grad_norm": 2.2295944690704346, + "learning_rate": 4.9955273993082615e-05, + "loss": 5.9652, + "step": 3203 + }, + { + "epoch": 0.01905509563231516, + "grad_norm": 2.7771801948547363, + "learning_rate": 4.9955246060765505e-05, + "loss": 5.9291, + "step": 3204 + }, + { + "epoch": 0.019061042915596155, + "grad_norm": 3.0721678733825684, + "learning_rate": 4.9955218119736776e-05, + "loss": 6.2319, + "step": 3205 + }, + { + "epoch": 0.019066990198877153, + "grad_norm": 2.7866547107696533, + "learning_rate": 4.9955190169996434e-05, + "loss": 6.0412, + "step": 3206 + }, + { + "epoch": 0.019072937482158152, + "grad_norm": 2.287216901779175, + "learning_rate": 4.99551622115445e-05, + "loss": 5.6435, + "step": 3207 + }, + { + "epoch": 0.019078884765439147, + "grad_norm": 2.3618898391723633, + "learning_rate": 4.995513424438098e-05, + "loss": 5.7711, + "step": 3208 + }, + { + "epoch": 0.019084832048720145, + "grad_norm": 2.192997932434082, + "learning_rate": 4.995510626850587e-05, + "loss": 5.8351, + "step": 3209 + }, + { + "epoch": 0.01909077933200114, + "grad_norm": 2.252722978591919, + "learning_rate": 4.995507828391919e-05, + "loss": 5.5989, + "step": 3210 + }, + { + "epoch": 0.01909672661528214, + "grad_norm": 2.451167106628418, + "learning_rate": 4.995505029062095e-05, + "loss": 5.8533, + "step": 3211 + }, + { + "epoch": 0.019102673898563138, + "grad_norm": 2.1897904872894287, + "learning_rate": 4.995502228861116e-05, + "loss": 6.2807, + "step": 3212 + }, + { + "epoch": 0.019108621181844133, + "grad_norm": 2.196805715560913, + "learning_rate": 4.995499427788984e-05, + "loss": 5.9418, + "step": 3213 + }, + { + "epoch": 0.01911456846512513, + "grad_norm": 1.9791160821914673, + "learning_rate": 4.995496625845698e-05, + "loss": 5.9909, + "step": 3214 + }, + { + "epoch": 0.019120515748406126, + "grad_norm": 2.3592171669006348, + "learning_rate": 4.995493823031261e-05, + "loss": 5.807, + "step": 3215 + }, + { + "epoch": 0.019126463031687125, + "grad_norm": 2.8238747119903564, + "learning_rate": 4.9954910193456713e-05, + "loss": 5.7587, + "step": 3216 + }, + { + "epoch": 0.019132410314968123, + "grad_norm": 2.4695584774017334, + "learning_rate": 4.9954882147889326e-05, + "loss": 5.746, + "step": 3217 + }, + { + "epoch": 0.01913835759824912, + "grad_norm": 2.3983800411224365, + "learning_rate": 4.995485409361044e-05, + "loss": 5.9364, + "step": 3218 + }, + { + "epoch": 0.019144304881530117, + "grad_norm": 2.1279618740081787, + "learning_rate": 4.995482603062008e-05, + "loss": 5.9383, + "step": 3219 + }, + { + "epoch": 0.019150252164811116, + "grad_norm": 18.583581924438477, + "learning_rate": 4.9954797958918244e-05, + "loss": 5.8596, + "step": 3220 + }, + { + "epoch": 0.01915619944809211, + "grad_norm": 2.1420741081237793, + "learning_rate": 4.995476987850495e-05, + "loss": 5.9311, + "step": 3221 + }, + { + "epoch": 0.01916214673137311, + "grad_norm": 2.314380645751953, + "learning_rate": 4.99547417893802e-05, + "loss": 5.8229, + "step": 3222 + }, + { + "epoch": 0.019168094014654104, + "grad_norm": 2.3818936347961426, + "learning_rate": 4.9954713691544004e-05, + "loss": 6.1124, + "step": 3223 + }, + { + "epoch": 0.019174041297935103, + "grad_norm": 2.521789789199829, + "learning_rate": 4.9954685584996377e-05, + "loss": 5.8939, + "step": 3224 + }, + { + "epoch": 0.0191799885812161, + "grad_norm": 1.9583165645599365, + "learning_rate": 4.9954657469737334e-05, + "loss": 6.0005, + "step": 3225 + }, + { + "epoch": 0.019185935864497097, + "grad_norm": 2.349581241607666, + "learning_rate": 4.995462934576687e-05, + "loss": 5.8467, + "step": 3226 + }, + { + "epoch": 0.019191883147778095, + "grad_norm": 2.081836223602295, + "learning_rate": 4.9954601213085e-05, + "loss": 6.1001, + "step": 3227 + }, + { + "epoch": 0.019197830431059094, + "grad_norm": 2.3207972049713135, + "learning_rate": 4.995457307169175e-05, + "loss": 5.794, + "step": 3228 + }, + { + "epoch": 0.01920377771434009, + "grad_norm": 1.8516380786895752, + "learning_rate": 4.99545449215871e-05, + "loss": 5.785, + "step": 3229 + }, + { + "epoch": 0.019209724997621087, + "grad_norm": 2.3822309970855713, + "learning_rate": 4.995451676277109e-05, + "loss": 5.7861, + "step": 3230 + }, + { + "epoch": 0.019215672280902082, + "grad_norm": 2.857161283493042, + "learning_rate": 4.995448859524371e-05, + "loss": 5.8333, + "step": 3231 + }, + { + "epoch": 0.01922161956418308, + "grad_norm": 2.201551914215088, + "learning_rate": 4.9954460419004974e-05, + "loss": 5.8653, + "step": 3232 + }, + { + "epoch": 0.01922756684746408, + "grad_norm": 2.1707022190093994, + "learning_rate": 4.995443223405489e-05, + "loss": 5.772, + "step": 3233 + }, + { + "epoch": 0.019233514130745075, + "grad_norm": 2.1242458820343018, + "learning_rate": 4.995440404039348e-05, + "loss": 5.8806, + "step": 3234 + }, + { + "epoch": 0.019239461414026073, + "grad_norm": 2.106945514678955, + "learning_rate": 4.995437583802074e-05, + "loss": 5.6746, + "step": 3235 + }, + { + "epoch": 0.019245408697307072, + "grad_norm": 2.083181858062744, + "learning_rate": 4.995434762693669e-05, + "loss": 5.9332, + "step": 3236 + }, + { + "epoch": 0.019251355980588067, + "grad_norm": 2.1857783794403076, + "learning_rate": 4.995431940714134e-05, + "loss": 5.6663, + "step": 3237 + }, + { + "epoch": 0.019257303263869065, + "grad_norm": 2.031041145324707, + "learning_rate": 4.995429117863468e-05, + "loss": 5.6734, + "step": 3238 + }, + { + "epoch": 0.01926325054715006, + "grad_norm": 2.31980037689209, + "learning_rate": 4.995426294141674e-05, + "loss": 5.8851, + "step": 3239 + }, + { + "epoch": 0.01926919783043106, + "grad_norm": 2.102965831756592, + "learning_rate": 4.9954234695487535e-05, + "loss": 5.7092, + "step": 3240 + }, + { + "epoch": 0.019275145113712058, + "grad_norm": 2.031169891357422, + "learning_rate": 4.995420644084705e-05, + "loss": 5.9755, + "step": 3241 + }, + { + "epoch": 0.019281092396993053, + "grad_norm": 2.2460241317749023, + "learning_rate": 4.995417817749532e-05, + "loss": 5.8895, + "step": 3242 + }, + { + "epoch": 0.01928703968027405, + "grad_norm": 2.618539571762085, + "learning_rate": 4.9954149905432336e-05, + "loss": 5.6964, + "step": 3243 + }, + { + "epoch": 0.019292986963555046, + "grad_norm": 2.1615748405456543, + "learning_rate": 4.995412162465812e-05, + "loss": 5.7162, + "step": 3244 + }, + { + "epoch": 0.019298934246836045, + "grad_norm": 2.363663673400879, + "learning_rate": 4.995409333517268e-05, + "loss": 5.7957, + "step": 3245 + }, + { + "epoch": 0.019304881530117043, + "grad_norm": 2.131084680557251, + "learning_rate": 4.9954065036976025e-05, + "loss": 5.7925, + "step": 3246 + }, + { + "epoch": 0.01931082881339804, + "grad_norm": 2.4043118953704834, + "learning_rate": 4.9954036730068155e-05, + "loss": 5.7895, + "step": 3247 + }, + { + "epoch": 0.019316776096679037, + "grad_norm": 2.521756887435913, + "learning_rate": 4.995400841444909e-05, + "loss": 5.6279, + "step": 3248 + }, + { + "epoch": 0.019322723379960036, + "grad_norm": 2.1791021823883057, + "learning_rate": 4.9953980090118846e-05, + "loss": 5.717, + "step": 3249 + }, + { + "epoch": 0.01932867066324103, + "grad_norm": 2.6562376022338867, + "learning_rate": 4.995395175707742e-05, + "loss": 5.7407, + "step": 3250 + }, + { + "epoch": 0.01933461794652203, + "grad_norm": 2.4377942085266113, + "learning_rate": 4.995392341532483e-05, + "loss": 5.539, + "step": 3251 + }, + { + "epoch": 0.019340565229803024, + "grad_norm": 2.3716847896575928, + "learning_rate": 4.995389506486109e-05, + "loss": 5.7251, + "step": 3252 + }, + { + "epoch": 0.019346512513084023, + "grad_norm": 2.2509348392486572, + "learning_rate": 4.995386670568619e-05, + "loss": 5.8749, + "step": 3253 + }, + { + "epoch": 0.01935245979636502, + "grad_norm": 2.265608072280884, + "learning_rate": 4.995383833780016e-05, + "loss": 5.8236, + "step": 3254 + }, + { + "epoch": 0.019358407079646017, + "grad_norm": 1.972179651260376, + "learning_rate": 4.9953809961203e-05, + "loss": 5.9235, + "step": 3255 + }, + { + "epoch": 0.019364354362927015, + "grad_norm": 2.314030170440674, + "learning_rate": 4.9953781575894723e-05, + "loss": 5.7355, + "step": 3256 + }, + { + "epoch": 0.019370301646208014, + "grad_norm": 2.3061349391937256, + "learning_rate": 4.995375318187534e-05, + "loss": 5.7337, + "step": 3257 + }, + { + "epoch": 0.01937624892948901, + "grad_norm": 1.9106477499008179, + "learning_rate": 4.9953724779144864e-05, + "loss": 5.8342, + "step": 3258 + }, + { + "epoch": 0.019382196212770007, + "grad_norm": 2.313750982284546, + "learning_rate": 4.9953696367703296e-05, + "loss": 5.7981, + "step": 3259 + }, + { + "epoch": 0.019388143496051002, + "grad_norm": 2.4477834701538086, + "learning_rate": 4.9953667947550644e-05, + "loss": 5.8212, + "step": 3260 + }, + { + "epoch": 0.019394090779332, + "grad_norm": 2.072659730911255, + "learning_rate": 4.9953639518686936e-05, + "loss": 5.7335, + "step": 3261 + }, + { + "epoch": 0.019400038062613, + "grad_norm": 2.0848984718322754, + "learning_rate": 4.995361108111216e-05, + "loss": 5.7427, + "step": 3262 + }, + { + "epoch": 0.019405985345893995, + "grad_norm": 1.938265323638916, + "learning_rate": 4.9953582634826345e-05, + "loss": 5.7946, + "step": 3263 + }, + { + "epoch": 0.019411932629174993, + "grad_norm": 2.227194309234619, + "learning_rate": 4.995355417982949e-05, + "loss": 5.9095, + "step": 3264 + }, + { + "epoch": 0.01941787991245599, + "grad_norm": 2.3245849609375, + "learning_rate": 4.9953525716121604e-05, + "loss": 5.802, + "step": 3265 + }, + { + "epoch": 0.019423827195736987, + "grad_norm": 2.08950138092041, + "learning_rate": 4.9953497243702696e-05, + "loss": 5.9001, + "step": 3266 + }, + { + "epoch": 0.019429774479017985, + "grad_norm": 1.93153715133667, + "learning_rate": 4.9953468762572786e-05, + "loss": 5.9042, + "step": 3267 + }, + { + "epoch": 0.01943572176229898, + "grad_norm": 2.4099066257476807, + "learning_rate": 4.9953440272731874e-05, + "loss": 5.8181, + "step": 3268 + }, + { + "epoch": 0.01944166904557998, + "grad_norm": 2.078752279281616, + "learning_rate": 4.995341177417998e-05, + "loss": 5.8771, + "step": 3269 + }, + { + "epoch": 0.019447616328860978, + "grad_norm": 2.012592077255249, + "learning_rate": 4.9953383266917106e-05, + "loss": 5.8135, + "step": 3270 + }, + { + "epoch": 0.019453563612141973, + "grad_norm": 2.0364151000976562, + "learning_rate": 4.995335475094326e-05, + "loss": 5.8767, + "step": 3271 + }, + { + "epoch": 0.01945951089542297, + "grad_norm": 2.0447049140930176, + "learning_rate": 4.995332622625846e-05, + "loss": 5.8236, + "step": 3272 + }, + { + "epoch": 0.01946545817870397, + "grad_norm": 2.2354300022125244, + "learning_rate": 4.995329769286271e-05, + "loss": 5.7794, + "step": 3273 + }, + { + "epoch": 0.019471405461984965, + "grad_norm": 2.031331777572632, + "learning_rate": 4.995326915075602e-05, + "loss": 5.87, + "step": 3274 + }, + { + "epoch": 0.019477352745265963, + "grad_norm": 2.2116496562957764, + "learning_rate": 4.99532405999384e-05, + "loss": 5.885, + "step": 3275 + }, + { + "epoch": 0.01948330002854696, + "grad_norm": 1.9008034467697144, + "learning_rate": 4.995321204040987e-05, + "loss": 5.8646, + "step": 3276 + }, + { + "epoch": 0.019489247311827957, + "grad_norm": 2.1743087768554688, + "learning_rate": 4.995318347217042e-05, + "loss": 5.9742, + "step": 3277 + }, + { + "epoch": 0.019495194595108956, + "grad_norm": 2.09171724319458, + "learning_rate": 4.995315489522008e-05, + "loss": 5.882, + "step": 3278 + }, + { + "epoch": 0.01950114187838995, + "grad_norm": 1.816938042640686, + "learning_rate": 4.995312630955885e-05, + "loss": 5.9164, + "step": 3279 + }, + { + "epoch": 0.01950708916167095, + "grad_norm": 2.065207004547119, + "learning_rate": 4.995309771518674e-05, + "loss": 5.9273, + "step": 3280 + }, + { + "epoch": 0.019513036444951944, + "grad_norm": 2.1037240028381348, + "learning_rate": 4.9953069112103757e-05, + "loss": 5.863, + "step": 3281 + }, + { + "epoch": 0.019518983728232943, + "grad_norm": 2.011705160140991, + "learning_rate": 4.995304050030992e-05, + "loss": 5.712, + "step": 3282 + }, + { + "epoch": 0.01952493101151394, + "grad_norm": 2.2053868770599365, + "learning_rate": 4.995301187980523e-05, + "loss": 5.6988, + "step": 3283 + }, + { + "epoch": 0.019530878294794937, + "grad_norm": 2.0522396564483643, + "learning_rate": 4.995298325058971e-05, + "loss": 5.6831, + "step": 3284 + }, + { + "epoch": 0.019536825578075935, + "grad_norm": 1.9751875400543213, + "learning_rate": 4.995295461266336e-05, + "loss": 6.0187, + "step": 3285 + }, + { + "epoch": 0.019542772861356934, + "grad_norm": 2.79711651802063, + "learning_rate": 4.9952925966026185e-05, + "loss": 6.4995, + "step": 3286 + }, + { + "epoch": 0.01954872014463793, + "grad_norm": 2.1059019565582275, + "learning_rate": 4.9952897310678206e-05, + "loss": 5.9603, + "step": 3287 + }, + { + "epoch": 0.019554667427918927, + "grad_norm": 2.169428825378418, + "learning_rate": 4.995286864661942e-05, + "loss": 5.7973, + "step": 3288 + }, + { + "epoch": 0.019560614711199922, + "grad_norm": 2.165508985519409, + "learning_rate": 4.995283997384985e-05, + "loss": 5.9132, + "step": 3289 + }, + { + "epoch": 0.01956656199448092, + "grad_norm": 2.248450994491577, + "learning_rate": 4.9952811292369506e-05, + "loss": 5.8202, + "step": 3290 + }, + { + "epoch": 0.01957250927776192, + "grad_norm": 2.3068084716796875, + "learning_rate": 4.9952782602178394e-05, + "loss": 5.8223, + "step": 3291 + }, + { + "epoch": 0.019578456561042915, + "grad_norm": 2.0434954166412354, + "learning_rate": 4.9952753903276516e-05, + "loss": 5.6231, + "step": 3292 + }, + { + "epoch": 0.019584403844323913, + "grad_norm": 2.136564254760742, + "learning_rate": 4.9952725195663895e-05, + "loss": 5.9859, + "step": 3293 + }, + { + "epoch": 0.01959035112760491, + "grad_norm": 2.6265337467193604, + "learning_rate": 4.9952696479340535e-05, + "loss": 5.9126, + "step": 3294 + }, + { + "epoch": 0.019596298410885907, + "grad_norm": 2.442678928375244, + "learning_rate": 4.9952667754306445e-05, + "loss": 5.9361, + "step": 3295 + }, + { + "epoch": 0.019602245694166905, + "grad_norm": 2.0740134716033936, + "learning_rate": 4.9952639020561644e-05, + "loss": 5.913, + "step": 3296 + }, + { + "epoch": 0.0196081929774479, + "grad_norm": 2.4088518619537354, + "learning_rate": 4.995261027810612e-05, + "loss": 5.8297, + "step": 3297 + }, + { + "epoch": 0.0196141402607289, + "grad_norm": 2.1514804363250732, + "learning_rate": 4.995258152693991e-05, + "loss": 5.8256, + "step": 3298 + }, + { + "epoch": 0.019620087544009897, + "grad_norm": 2.921570062637329, + "learning_rate": 4.9952552767063e-05, + "loss": 6.0243, + "step": 3299 + }, + { + "epoch": 0.019626034827290893, + "grad_norm": 2.398749828338623, + "learning_rate": 4.995252399847542e-05, + "loss": 6.004, + "step": 3300 + }, + { + "epoch": 0.01963198211057189, + "grad_norm": 2.2024805545806885, + "learning_rate": 4.995249522117717e-05, + "loss": 5.9201, + "step": 3301 + }, + { + "epoch": 0.01963792939385289, + "grad_norm": 2.112269401550293, + "learning_rate": 4.9952466435168266e-05, + "loss": 5.8488, + "step": 3302 + }, + { + "epoch": 0.019643876677133885, + "grad_norm": 2.04632568359375, + "learning_rate": 4.99524376404487e-05, + "loss": 5.8054, + "step": 3303 + }, + { + "epoch": 0.019649823960414883, + "grad_norm": 2.6293606758117676, + "learning_rate": 4.995240883701851e-05, + "loss": 5.6799, + "step": 3304 + }, + { + "epoch": 0.01965577124369588, + "grad_norm": 2.5172793865203857, + "learning_rate": 4.995238002487769e-05, + "loss": 5.712, + "step": 3305 + }, + { + "epoch": 0.019661718526976877, + "grad_norm": 2.549194097518921, + "learning_rate": 4.995235120402625e-05, + "loss": 5.7208, + "step": 3306 + }, + { + "epoch": 0.019667665810257876, + "grad_norm": 2.2993295192718506, + "learning_rate": 4.99523223744642e-05, + "loss": 5.7952, + "step": 3307 + }, + { + "epoch": 0.01967361309353887, + "grad_norm": 2.1270902156829834, + "learning_rate": 4.9952293536191555e-05, + "loss": 5.6988, + "step": 3308 + }, + { + "epoch": 0.01967956037681987, + "grad_norm": 2.349858283996582, + "learning_rate": 4.9952264689208315e-05, + "loss": 5.623, + "step": 3309 + }, + { + "epoch": 0.019685507660100864, + "grad_norm": 2.1501529216766357, + "learning_rate": 4.9952235833514506e-05, + "loss": 5.6498, + "step": 3310 + }, + { + "epoch": 0.019691454943381863, + "grad_norm": 2.0577821731567383, + "learning_rate": 4.995220696911012e-05, + "loss": 5.6863, + "step": 3311 + }, + { + "epoch": 0.01969740222666286, + "grad_norm": 2.0787386894226074, + "learning_rate": 4.9952178095995185e-05, + "loss": 5.6314, + "step": 3312 + }, + { + "epoch": 0.019703349509943856, + "grad_norm": 2.4042680263519287, + "learning_rate": 4.99521492141697e-05, + "loss": 5.6152, + "step": 3313 + }, + { + "epoch": 0.019709296793224855, + "grad_norm": 2.444410800933838, + "learning_rate": 4.995212032363368e-05, + "loss": 5.5375, + "step": 3314 + }, + { + "epoch": 0.019715244076505854, + "grad_norm": 2.1678028106689453, + "learning_rate": 4.995209142438712e-05, + "loss": 5.6239, + "step": 3315 + }, + { + "epoch": 0.01972119135978685, + "grad_norm": 2.5436410903930664, + "learning_rate": 4.9952062516430054e-05, + "loss": 5.4234, + "step": 3316 + }, + { + "epoch": 0.019727138643067847, + "grad_norm": 2.454561471939087, + "learning_rate": 4.9952033599762484e-05, + "loss": 5.4198, + "step": 3317 + }, + { + "epoch": 0.019733085926348842, + "grad_norm": 2.388125419616699, + "learning_rate": 4.9952004674384413e-05, + "loss": 5.5073, + "step": 3318 + }, + { + "epoch": 0.01973903320962984, + "grad_norm": 2.1900579929351807, + "learning_rate": 4.995197574029585e-05, + "loss": 5.3463, + "step": 3319 + }, + { + "epoch": 0.01974498049291084, + "grad_norm": 2.5625739097595215, + "learning_rate": 4.995194679749681e-05, + "loss": 5.4291, + "step": 3320 + }, + { + "epoch": 0.019750927776191834, + "grad_norm": 2.52402400970459, + "learning_rate": 4.995191784598731e-05, + "loss": 5.3826, + "step": 3321 + }, + { + "epoch": 0.019756875059472833, + "grad_norm": 2.5888168811798096, + "learning_rate": 4.995188888576735e-05, + "loss": 5.381, + "step": 3322 + }, + { + "epoch": 0.01976282234275383, + "grad_norm": 2.637080669403076, + "learning_rate": 4.995185991683694e-05, + "loss": 5.3321, + "step": 3323 + }, + { + "epoch": 0.019768769626034827, + "grad_norm": 2.46553111076355, + "learning_rate": 4.9951830939196095e-05, + "loss": 5.3663, + "step": 3324 + }, + { + "epoch": 0.019774716909315825, + "grad_norm": 2.2397992610931396, + "learning_rate": 4.9951801952844826e-05, + "loss": 5.3237, + "step": 3325 + }, + { + "epoch": 0.01978066419259682, + "grad_norm": 2.3519208431243896, + "learning_rate": 4.9951772957783144e-05, + "loss": 5.4166, + "step": 3326 + }, + { + "epoch": 0.01978661147587782, + "grad_norm": 2.6235291957855225, + "learning_rate": 4.9951743954011056e-05, + "loss": 5.8094, + "step": 3327 + }, + { + "epoch": 0.019792558759158817, + "grad_norm": 2.162285327911377, + "learning_rate": 4.995171494152856e-05, + "loss": 5.6491, + "step": 3328 + }, + { + "epoch": 0.019798506042439813, + "grad_norm": 2.231853485107422, + "learning_rate": 4.995168592033569e-05, + "loss": 5.69, + "step": 3329 + }, + { + "epoch": 0.01980445332572081, + "grad_norm": 2.7305827140808105, + "learning_rate": 4.995165689043244e-05, + "loss": 5.5028, + "step": 3330 + }, + { + "epoch": 0.01981040060900181, + "grad_norm": 2.9917726516723633, + "learning_rate": 4.9951627851818824e-05, + "loss": 5.3227, + "step": 3331 + }, + { + "epoch": 0.019816347892282805, + "grad_norm": 3.0039985179901123, + "learning_rate": 4.995159880449486e-05, + "loss": 5.5965, + "step": 3332 + }, + { + "epoch": 0.019822295175563803, + "grad_norm": 3.081099510192871, + "learning_rate": 4.995156974846054e-05, + "loss": 5.6945, + "step": 3333 + }, + { + "epoch": 0.0198282424588448, + "grad_norm": 2.042445182800293, + "learning_rate": 4.995154068371589e-05, + "loss": 5.693, + "step": 3334 + }, + { + "epoch": 0.019834189742125797, + "grad_norm": 2.8875865936279297, + "learning_rate": 4.995151161026091e-05, + "loss": 5.5981, + "step": 3335 + }, + { + "epoch": 0.019840137025406795, + "grad_norm": 2.4203453063964844, + "learning_rate": 4.9951482528095615e-05, + "loss": 5.6269, + "step": 3336 + }, + { + "epoch": 0.01984608430868779, + "grad_norm": 2.332151174545288, + "learning_rate": 4.995145343722002e-05, + "loss": 5.6002, + "step": 3337 + }, + { + "epoch": 0.01985203159196879, + "grad_norm": 2.556549310684204, + "learning_rate": 4.995142433763413e-05, + "loss": 5.7715, + "step": 3338 + }, + { + "epoch": 0.019857978875249784, + "grad_norm": 2.453113079071045, + "learning_rate": 4.995139522933796e-05, + "loss": 5.8958, + "step": 3339 + }, + { + "epoch": 0.019863926158530783, + "grad_norm": 1.9842414855957031, + "learning_rate": 4.995136611233151e-05, + "loss": 5.9781, + "step": 3340 + }, + { + "epoch": 0.01986987344181178, + "grad_norm": 2.3725521564483643, + "learning_rate": 4.995133698661479e-05, + "loss": 5.9902, + "step": 3341 + }, + { + "epoch": 0.019875820725092776, + "grad_norm": 2.679001808166504, + "learning_rate": 4.9951307852187824e-05, + "loss": 5.9526, + "step": 3342 + }, + { + "epoch": 0.019881768008373775, + "grad_norm": 2.272595167160034, + "learning_rate": 4.995127870905061e-05, + "loss": 5.9685, + "step": 3343 + }, + { + "epoch": 0.019887715291654774, + "grad_norm": 2.0300357341766357, + "learning_rate": 4.995124955720317e-05, + "loss": 5.7702, + "step": 3344 + }, + { + "epoch": 0.01989366257493577, + "grad_norm": 2.5023481845855713, + "learning_rate": 4.9951220396645504e-05, + "loss": 5.6612, + "step": 3345 + }, + { + "epoch": 0.019899609858216767, + "grad_norm": 2.426457166671753, + "learning_rate": 4.995119122737762e-05, + "loss": 5.767, + "step": 3346 + }, + { + "epoch": 0.019905557141497762, + "grad_norm": 2.4919028282165527, + "learning_rate": 4.995116204939954e-05, + "loss": 6.0578, + "step": 3347 + }, + { + "epoch": 0.01991150442477876, + "grad_norm": 3.099792957305908, + "learning_rate": 4.995113286271126e-05, + "loss": 7.053, + "step": 3348 + }, + { + "epoch": 0.01991745170805976, + "grad_norm": 2.597169876098633, + "learning_rate": 4.9951103667312795e-05, + "loss": 5.8467, + "step": 3349 + }, + { + "epoch": 0.019923398991340754, + "grad_norm": 2.1132469177246094, + "learning_rate": 4.995107446320416e-05, + "loss": 5.7296, + "step": 3350 + }, + { + "epoch": 0.019929346274621753, + "grad_norm": 2.4141721725463867, + "learning_rate": 4.995104525038537e-05, + "loss": 5.8705, + "step": 3351 + }, + { + "epoch": 0.01993529355790275, + "grad_norm": 1.9012199640274048, + "learning_rate": 4.995101602885642e-05, + "loss": 5.8759, + "step": 3352 + }, + { + "epoch": 0.019941240841183747, + "grad_norm": 2.168673038482666, + "learning_rate": 4.9950986798617335e-05, + "loss": 5.8161, + "step": 3353 + }, + { + "epoch": 0.019947188124464745, + "grad_norm": 2.1579155921936035, + "learning_rate": 4.995095755966811e-05, + "loss": 5.8699, + "step": 3354 + }, + { + "epoch": 0.01995313540774574, + "grad_norm": 2.1460800170898438, + "learning_rate": 4.9950928312008774e-05, + "loss": 5.9144, + "step": 3355 + }, + { + "epoch": 0.01995908269102674, + "grad_norm": 2.402167558670044, + "learning_rate": 4.995089905563932e-05, + "loss": 5.8857, + "step": 3356 + }, + { + "epoch": 0.019965029974307737, + "grad_norm": 2.6381726264953613, + "learning_rate": 4.995086979055976e-05, + "loss": 6.0021, + "step": 3357 + }, + { + "epoch": 0.019970977257588732, + "grad_norm": 2.5577943325042725, + "learning_rate": 4.995084051677012e-05, + "loss": 5.9425, + "step": 3358 + }, + { + "epoch": 0.01997692454086973, + "grad_norm": 2.188215494155884, + "learning_rate": 4.995081123427039e-05, + "loss": 6.0656, + "step": 3359 + }, + { + "epoch": 0.01998287182415073, + "grad_norm": 1.8278366327285767, + "learning_rate": 4.9950781943060596e-05, + "loss": 5.8229, + "step": 3360 + }, + { + "epoch": 0.019988819107431725, + "grad_norm": 1.9054077863693237, + "learning_rate": 4.995075264314074e-05, + "loss": 5.8158, + "step": 3361 + }, + { + "epoch": 0.019994766390712723, + "grad_norm": 2.1255416870117188, + "learning_rate": 4.9950723334510826e-05, + "loss": 5.8816, + "step": 3362 + }, + { + "epoch": 0.02000071367399372, + "grad_norm": 2.026923656463623, + "learning_rate": 4.995069401717088e-05, + "loss": 5.7463, + "step": 3363 + }, + { + "epoch": 0.020006660957274717, + "grad_norm": 2.015178680419922, + "learning_rate": 4.9950664691120905e-05, + "loss": 5.6689, + "step": 3364 + }, + { + "epoch": 0.020012608240555715, + "grad_norm": 1.7729417085647583, + "learning_rate": 4.995063535636091e-05, + "loss": 5.701, + "step": 3365 + }, + { + "epoch": 0.02001855552383671, + "grad_norm": 1.9893600940704346, + "learning_rate": 4.9950606012890905e-05, + "loss": 5.7502, + "step": 3366 + }, + { + "epoch": 0.02002450280711771, + "grad_norm": 1.8950870037078857, + "learning_rate": 4.99505766607109e-05, + "loss": 5.6094, + "step": 3367 + }, + { + "epoch": 0.020030450090398704, + "grad_norm": 2.4140830039978027, + "learning_rate": 4.995054729982091e-05, + "loss": 5.8387, + "step": 3368 + }, + { + "epoch": 0.020036397373679703, + "grad_norm": 2.1887669563293457, + "learning_rate": 4.995051793022094e-05, + "loss": 5.7348, + "step": 3369 + }, + { + "epoch": 0.0200423446569607, + "grad_norm": 1.9632731676101685, + "learning_rate": 4.9950488551911e-05, + "loss": 5.5568, + "step": 3370 + }, + { + "epoch": 0.020048291940241696, + "grad_norm": 2.116834878921509, + "learning_rate": 4.995045916489111e-05, + "loss": 5.461, + "step": 3371 + }, + { + "epoch": 0.020054239223522695, + "grad_norm": 2.021256923675537, + "learning_rate": 4.9950429769161266e-05, + "loss": 5.6601, + "step": 3372 + }, + { + "epoch": 0.020060186506803693, + "grad_norm": 2.1648659706115723, + "learning_rate": 4.9950400364721486e-05, + "loss": 5.5364, + "step": 3373 + }, + { + "epoch": 0.02006613379008469, + "grad_norm": 2.043499231338501, + "learning_rate": 4.9950370951571775e-05, + "loss": 5.7273, + "step": 3374 + }, + { + "epoch": 0.020072081073365687, + "grad_norm": 2.296121597290039, + "learning_rate": 4.995034152971215e-05, + "loss": 5.8494, + "step": 3375 + }, + { + "epoch": 0.020078028356646682, + "grad_norm": 2.401031494140625, + "learning_rate": 4.995031209914261e-05, + "loss": 5.719, + "step": 3376 + }, + { + "epoch": 0.02008397563992768, + "grad_norm": 2.3130364418029785, + "learning_rate": 4.995028265986319e-05, + "loss": 5.7998, + "step": 3377 + }, + { + "epoch": 0.02008992292320868, + "grad_norm": 2.3820009231567383, + "learning_rate": 4.9950253211873874e-05, + "loss": 6.0632, + "step": 3378 + }, + { + "epoch": 0.020095870206489674, + "grad_norm": 2.1970956325531006, + "learning_rate": 4.995022375517469e-05, + "loss": 5.9776, + "step": 3379 + }, + { + "epoch": 0.020101817489770673, + "grad_norm": 1.912102460861206, + "learning_rate": 4.995019428976564e-05, + "loss": 5.7194, + "step": 3380 + }, + { + "epoch": 0.02010776477305167, + "grad_norm": 2.3187389373779297, + "learning_rate": 4.995016481564673e-05, + "loss": 6.0225, + "step": 3381 + }, + { + "epoch": 0.020113712056332667, + "grad_norm": 1.959000587463379, + "learning_rate": 4.995013533281797e-05, + "loss": 5.8453, + "step": 3382 + }, + { + "epoch": 0.020119659339613665, + "grad_norm": 2.0283286571502686, + "learning_rate": 4.995010584127938e-05, + "loss": 5.6837, + "step": 3383 + }, + { + "epoch": 0.02012560662289466, + "grad_norm": 2.410351037979126, + "learning_rate": 4.995007634103097e-05, + "loss": 5.8172, + "step": 3384 + }, + { + "epoch": 0.02013155390617566, + "grad_norm": 2.2864298820495605, + "learning_rate": 4.995004683207275e-05, + "loss": 5.8995, + "step": 3385 + }, + { + "epoch": 0.020137501189456657, + "grad_norm": 2.830883026123047, + "learning_rate": 4.995001731440472e-05, + "loss": 5.7273, + "step": 3386 + }, + { + "epoch": 0.020143448472737652, + "grad_norm": 2.486783981323242, + "learning_rate": 4.9949987788026896e-05, + "loss": 5.88, + "step": 3387 + }, + { + "epoch": 0.02014939575601865, + "grad_norm": 2.109975576400757, + "learning_rate": 4.994995825293929e-05, + "loss": 5.8618, + "step": 3388 + }, + { + "epoch": 0.02015534303929965, + "grad_norm": 2.249293327331543, + "learning_rate": 4.994992870914191e-05, + "loss": 5.8511, + "step": 3389 + }, + { + "epoch": 0.020161290322580645, + "grad_norm": 2.5433366298675537, + "learning_rate": 4.9949899156634774e-05, + "loss": 5.7375, + "step": 3390 + }, + { + "epoch": 0.020167237605861643, + "grad_norm": 2.7013652324676514, + "learning_rate": 4.9949869595417876e-05, + "loss": 5.8886, + "step": 3391 + }, + { + "epoch": 0.020173184889142638, + "grad_norm": 2.536972761154175, + "learning_rate": 4.994984002549124e-05, + "loss": 5.4203, + "step": 3392 + }, + { + "epoch": 0.020179132172423637, + "grad_norm": 2.596230983734131, + "learning_rate": 4.9949810446854876e-05, + "loss": 5.7882, + "step": 3393 + }, + { + "epoch": 0.020185079455704635, + "grad_norm": 2.6889936923980713, + "learning_rate": 4.9949780859508786e-05, + "loss": 5.6822, + "step": 3394 + }, + { + "epoch": 0.02019102673898563, + "grad_norm": 2.541027069091797, + "learning_rate": 4.994975126345299e-05, + "loss": 5.7394, + "step": 3395 + }, + { + "epoch": 0.02019697402226663, + "grad_norm": 2.2267251014709473, + "learning_rate": 4.9949721658687485e-05, + "loss": 5.7847, + "step": 3396 + }, + { + "epoch": 0.020202921305547628, + "grad_norm": 2.439689874649048, + "learning_rate": 4.994969204521231e-05, + "loss": 5.6222, + "step": 3397 + }, + { + "epoch": 0.020208868588828623, + "grad_norm": 2.9407742023468018, + "learning_rate": 4.9949662423027434e-05, + "loss": 5.6629, + "step": 3398 + }, + { + "epoch": 0.02021481587210962, + "grad_norm": 2.42802357673645, + "learning_rate": 4.9949632792132894e-05, + "loss": 5.3369, + "step": 3399 + }, + { + "epoch": 0.020220763155390616, + "grad_norm": 2.465508222579956, + "learning_rate": 4.99496031525287e-05, + "loss": 5.3365, + "step": 3400 + }, + { + "epoch": 0.020226710438671615, + "grad_norm": 2.408794403076172, + "learning_rate": 4.9949573504214854e-05, + "loss": 5.3156, + "step": 3401 + }, + { + "epoch": 0.020232657721952613, + "grad_norm": 2.229372978210449, + "learning_rate": 4.9949543847191374e-05, + "loss": 5.9194, + "step": 3402 + }, + { + "epoch": 0.02023860500523361, + "grad_norm": 4.567020416259766, + "learning_rate": 4.9949514181458254e-05, + "loss": 6.3379, + "step": 3403 + }, + { + "epoch": 0.020244552288514607, + "grad_norm": 3.9927520751953125, + "learning_rate": 4.9949484507015534e-05, + "loss": 6.3351, + "step": 3404 + }, + { + "epoch": 0.020250499571795602, + "grad_norm": 2.4830081462860107, + "learning_rate": 4.9949454823863195e-05, + "loss": 6.4046, + "step": 3405 + }, + { + "epoch": 0.0202564468550766, + "grad_norm": 2.282722234725952, + "learning_rate": 4.994942513200126e-05, + "loss": 6.5473, + "step": 3406 + }, + { + "epoch": 0.0202623941383576, + "grad_norm": 2.411367416381836, + "learning_rate": 4.994939543142973e-05, + "loss": 5.7898, + "step": 3407 + }, + { + "epoch": 0.020268341421638594, + "grad_norm": 3.2052342891693115, + "learning_rate": 4.994936572214864e-05, + "loss": 5.6695, + "step": 3408 + }, + { + "epoch": 0.020274288704919593, + "grad_norm": 4.142974853515625, + "learning_rate": 4.994933600415798e-05, + "loss": 6.2037, + "step": 3409 + }, + { + "epoch": 0.02028023598820059, + "grad_norm": 2.839066982269287, + "learning_rate": 4.994930627745776e-05, + "loss": 6.7308, + "step": 3410 + }, + { + "epoch": 0.020286183271481587, + "grad_norm": 3.3138885498046875, + "learning_rate": 4.9949276542048e-05, + "loss": 5.8873, + "step": 3411 + }, + { + "epoch": 0.020292130554762585, + "grad_norm": 2.6651928424835205, + "learning_rate": 4.9949246797928704e-05, + "loss": 6.6325, + "step": 3412 + }, + { + "epoch": 0.02029807783804358, + "grad_norm": 2.919436454772949, + "learning_rate": 4.994921704509988e-05, + "loss": 6.3239, + "step": 3413 + }, + { + "epoch": 0.02030402512132458, + "grad_norm": 2.6901097297668457, + "learning_rate": 4.994918728356155e-05, + "loss": 6.1712, + "step": 3414 + }, + { + "epoch": 0.020309972404605577, + "grad_norm": 2.573249340057373, + "learning_rate": 4.9949157513313704e-05, + "loss": 5.8194, + "step": 3415 + }, + { + "epoch": 0.020315919687886572, + "grad_norm": 3.0603950023651123, + "learning_rate": 4.994912773435637e-05, + "loss": 6.3881, + "step": 3416 + }, + { + "epoch": 0.02032186697116757, + "grad_norm": 3.1800057888031006, + "learning_rate": 4.994909794668956e-05, + "loss": 5.9486, + "step": 3417 + }, + { + "epoch": 0.02032781425444857, + "grad_norm": 2.537182092666626, + "learning_rate": 4.994906815031327e-05, + "loss": 6.5454, + "step": 3418 + }, + { + "epoch": 0.020333761537729565, + "grad_norm": 2.474705457687378, + "learning_rate": 4.9949038345227525e-05, + "loss": 6.5356, + "step": 3419 + }, + { + "epoch": 0.020339708821010563, + "grad_norm": 3.054689645767212, + "learning_rate": 4.994900853143232e-05, + "loss": 6.4526, + "step": 3420 + }, + { + "epoch": 0.020345656104291558, + "grad_norm": 2.587644100189209, + "learning_rate": 4.994897870892769e-05, + "loss": 6.2811, + "step": 3421 + }, + { + "epoch": 0.020351603387572557, + "grad_norm": 2.110041618347168, + "learning_rate": 4.994894887771361e-05, + "loss": 6.0428, + "step": 3422 + }, + { + "epoch": 0.020357550670853555, + "grad_norm": 2.4931492805480957, + "learning_rate": 4.9948919037790115e-05, + "loss": 6.3683, + "step": 3423 + }, + { + "epoch": 0.02036349795413455, + "grad_norm": 2.7169463634490967, + "learning_rate": 4.994888918915721e-05, + "loss": 6.5335, + "step": 3424 + }, + { + "epoch": 0.02036944523741555, + "grad_norm": 2.164363145828247, + "learning_rate": 4.994885933181491e-05, + "loss": 6.0409, + "step": 3425 + }, + { + "epoch": 0.020375392520696547, + "grad_norm": 2.480468273162842, + "learning_rate": 4.994882946576322e-05, + "loss": 5.8816, + "step": 3426 + }, + { + "epoch": 0.020381339803977543, + "grad_norm": 2.928361415863037, + "learning_rate": 4.994879959100215e-05, + "loss": 6.1706, + "step": 3427 + }, + { + "epoch": 0.02038728708725854, + "grad_norm": 2.1536660194396973, + "learning_rate": 4.994876970753171e-05, + "loss": 6.0559, + "step": 3428 + }, + { + "epoch": 0.020393234370539536, + "grad_norm": 2.6913530826568604, + "learning_rate": 4.994873981535192e-05, + "loss": 6.7411, + "step": 3429 + }, + { + "epoch": 0.020399181653820535, + "grad_norm": 2.647124767303467, + "learning_rate": 4.994870991446278e-05, + "loss": 6.5251, + "step": 3430 + }, + { + "epoch": 0.020405128937101533, + "grad_norm": 2.621612310409546, + "learning_rate": 4.994868000486429e-05, + "loss": 6.7029, + "step": 3431 + }, + { + "epoch": 0.02041107622038253, + "grad_norm": 2.1986844539642334, + "learning_rate": 4.994865008655649e-05, + "loss": 6.4561, + "step": 3432 + }, + { + "epoch": 0.020417023503663527, + "grad_norm": 2.706897735595703, + "learning_rate": 4.994862015953936e-05, + "loss": 6.3125, + "step": 3433 + }, + { + "epoch": 0.020422970786944522, + "grad_norm": 2.403346300125122, + "learning_rate": 4.994859022381294e-05, + "loss": 6.0808, + "step": 3434 + }, + { + "epoch": 0.02042891807022552, + "grad_norm": 2.367835521697998, + "learning_rate": 4.994856027937722e-05, + "loss": 6.2634, + "step": 3435 + }, + { + "epoch": 0.02043486535350652, + "grad_norm": 2.8564250469207764, + "learning_rate": 4.9948530326232205e-05, + "loss": 6.579, + "step": 3436 + }, + { + "epoch": 0.020440812636787514, + "grad_norm": 2.9472100734710693, + "learning_rate": 4.9948500364377925e-05, + "loss": 6.3873, + "step": 3437 + }, + { + "epoch": 0.020446759920068513, + "grad_norm": 2.3005917072296143, + "learning_rate": 4.994847039381438e-05, + "loss": 6.2316, + "step": 3438 + }, + { + "epoch": 0.02045270720334951, + "grad_norm": 2.0548787117004395, + "learning_rate": 4.9948440414541584e-05, + "loss": 6.5022, + "step": 3439 + }, + { + "epoch": 0.020458654486630506, + "grad_norm": 2.1332197189331055, + "learning_rate": 4.9948410426559536e-05, + "loss": 6.1486, + "step": 3440 + }, + { + "epoch": 0.020464601769911505, + "grad_norm": 2.112738847732544, + "learning_rate": 4.994838042986827e-05, + "loss": 5.9125, + "step": 3441 + }, + { + "epoch": 0.0204705490531925, + "grad_norm": 2.714627981185913, + "learning_rate": 4.9948350424467774e-05, + "loss": 6.1164, + "step": 3442 + }, + { + "epoch": 0.0204764963364735, + "grad_norm": 2.337571382522583, + "learning_rate": 4.994832041035806e-05, + "loss": 6.0567, + "step": 3443 + }, + { + "epoch": 0.020482443619754497, + "grad_norm": 2.354389190673828, + "learning_rate": 4.994829038753915e-05, + "loss": 5.5922, + "step": 3444 + }, + { + "epoch": 0.020488390903035492, + "grad_norm": 2.3885531425476074, + "learning_rate": 4.994826035601106e-05, + "loss": 6.4178, + "step": 3445 + }, + { + "epoch": 0.02049433818631649, + "grad_norm": 2.931328058242798, + "learning_rate": 4.994823031577378e-05, + "loss": 6.356, + "step": 3446 + }, + { + "epoch": 0.02050028546959749, + "grad_norm": 2.4858877658843994, + "learning_rate": 4.994820026682733e-05, + "loss": 6.0601, + "step": 3447 + }, + { + "epoch": 0.020506232752878484, + "grad_norm": 2.626811981201172, + "learning_rate": 4.9948170209171725e-05, + "loss": 6.4372, + "step": 3448 + }, + { + "epoch": 0.020512180036159483, + "grad_norm": 2.2917356491088867, + "learning_rate": 4.994814014280696e-05, + "loss": 5.9828, + "step": 3449 + }, + { + "epoch": 0.020518127319440478, + "grad_norm": 2.174531936645508, + "learning_rate": 4.9948110067733075e-05, + "loss": 6.3382, + "step": 3450 + }, + { + "epoch": 0.020524074602721477, + "grad_norm": 2.9880006313323975, + "learning_rate": 4.994807998395005e-05, + "loss": 6.7493, + "step": 3451 + }, + { + "epoch": 0.020530021886002475, + "grad_norm": 2.6577212810516357, + "learning_rate": 4.994804989145792e-05, + "loss": 6.853, + "step": 3452 + }, + { + "epoch": 0.02053596916928347, + "grad_norm": 2.8832437992095947, + "learning_rate": 4.994801979025667e-05, + "loss": 6.5829, + "step": 3453 + }, + { + "epoch": 0.02054191645256447, + "grad_norm": 2.473177194595337, + "learning_rate": 4.994798968034633e-05, + "loss": 6.2879, + "step": 3454 + }, + { + "epoch": 0.020547863735845467, + "grad_norm": 2.7484633922576904, + "learning_rate": 4.994795956172691e-05, + "loss": 6.2037, + "step": 3455 + }, + { + "epoch": 0.020553811019126463, + "grad_norm": 1.6647555828094482, + "learning_rate": 4.9947929434398403e-05, + "loss": 6.5639, + "step": 3456 + }, + { + "epoch": 0.02055975830240746, + "grad_norm": 3.71087908744812, + "learning_rate": 4.994789929836084e-05, + "loss": 6.8464, + "step": 3457 + }, + { + "epoch": 0.020565705585688456, + "grad_norm": 2.705892324447632, + "learning_rate": 4.994786915361422e-05, + "loss": 6.8316, + "step": 3458 + }, + { + "epoch": 0.020571652868969455, + "grad_norm": 2.3619437217712402, + "learning_rate": 4.994783900015856e-05, + "loss": 6.3441, + "step": 3459 + }, + { + "epoch": 0.020577600152250453, + "grad_norm": 2.490499258041382, + "learning_rate": 4.9947808837993864e-05, + "loss": 6.1467, + "step": 3460 + }, + { + "epoch": 0.02058354743553145, + "grad_norm": 2.546614170074463, + "learning_rate": 4.994777866712015e-05, + "loss": 5.6677, + "step": 3461 + }, + { + "epoch": 0.020589494718812447, + "grad_norm": 2.473695755004883, + "learning_rate": 4.994774848753741e-05, + "loss": 5.7815, + "step": 3462 + }, + { + "epoch": 0.020595442002093442, + "grad_norm": 2.0494625568389893, + "learning_rate": 4.994771829924569e-05, + "loss": 5.674, + "step": 3463 + }, + { + "epoch": 0.02060138928537444, + "grad_norm": 2.1504273414611816, + "learning_rate": 4.9947688102244964e-05, + "loss": 5.5299, + "step": 3464 + }, + { + "epoch": 0.02060733656865544, + "grad_norm": 2.908170700073242, + "learning_rate": 4.994765789653526e-05, + "loss": 5.8448, + "step": 3465 + }, + { + "epoch": 0.020613283851936434, + "grad_norm": 3.1434714794158936, + "learning_rate": 4.994762768211659e-05, + "loss": 5.8413, + "step": 3466 + }, + { + "epoch": 0.020619231135217433, + "grad_norm": 2.4688189029693604, + "learning_rate": 4.994759745898896e-05, + "loss": 5.6458, + "step": 3467 + }, + { + "epoch": 0.02062517841849843, + "grad_norm": 2.172083854675293, + "learning_rate": 4.994756722715238e-05, + "loss": 5.723, + "step": 3468 + }, + { + "epoch": 0.020631125701779426, + "grad_norm": 2.0702707767486572, + "learning_rate": 4.994753698660687e-05, + "loss": 5.6199, + "step": 3469 + }, + { + "epoch": 0.020637072985060425, + "grad_norm": 2.2142136096954346, + "learning_rate": 4.9947506737352425e-05, + "loss": 5.5476, + "step": 3470 + }, + { + "epoch": 0.02064302026834142, + "grad_norm": 2.156874179840088, + "learning_rate": 4.994747647938907e-05, + "loss": 5.4773, + "step": 3471 + }, + { + "epoch": 0.02064896755162242, + "grad_norm": 3.3683371543884277, + "learning_rate": 4.9947446212716795e-05, + "loss": 6.4804, + "step": 3472 + }, + { + "epoch": 0.020654914834903417, + "grad_norm": 2.2435977458953857, + "learning_rate": 4.9947415937335635e-05, + "loss": 6.0622, + "step": 3473 + }, + { + "epoch": 0.020660862118184412, + "grad_norm": 3.0824263095855713, + "learning_rate": 4.994738565324558e-05, + "loss": 6.8809, + "step": 3474 + }, + { + "epoch": 0.02066680940146541, + "grad_norm": 2.6978909969329834, + "learning_rate": 4.9947355360446664e-05, + "loss": 6.823, + "step": 3475 + }, + { + "epoch": 0.02067275668474641, + "grad_norm": 3.041680097579956, + "learning_rate": 4.9947325058938874e-05, + "loss": 6.4268, + "step": 3476 + }, + { + "epoch": 0.020678703968027404, + "grad_norm": 3.5326781272888184, + "learning_rate": 4.9947294748722237e-05, + "loss": 6.3516, + "step": 3477 + }, + { + "epoch": 0.020684651251308403, + "grad_norm": 2.7611732482910156, + "learning_rate": 4.994726442979675e-05, + "loss": 6.2206, + "step": 3478 + }, + { + "epoch": 0.020690598534589398, + "grad_norm": 3.8533458709716797, + "learning_rate": 4.994723410216244e-05, + "loss": 6.7907, + "step": 3479 + }, + { + "epoch": 0.020696545817870397, + "grad_norm": 2.8091351985931396, + "learning_rate": 4.99472037658193e-05, + "loss": 6.7468, + "step": 3480 + }, + { + "epoch": 0.020702493101151395, + "grad_norm": 2.4317073822021484, + "learning_rate": 4.994717342076736e-05, + "loss": 6.4682, + "step": 3481 + }, + { + "epoch": 0.02070844038443239, + "grad_norm": 2.5132029056549072, + "learning_rate": 4.994714306700661e-05, + "loss": 6.1966, + "step": 3482 + }, + { + "epoch": 0.02071438766771339, + "grad_norm": 2.8161535263061523, + "learning_rate": 4.994711270453707e-05, + "loss": 5.6045, + "step": 3483 + }, + { + "epoch": 0.020720334950994387, + "grad_norm": 2.654115915298462, + "learning_rate": 4.994708233335875e-05, + "loss": 5.8983, + "step": 3484 + }, + { + "epoch": 0.020726282234275382, + "grad_norm": 2.5971553325653076, + "learning_rate": 4.9947051953471664e-05, + "loss": 5.4422, + "step": 3485 + }, + { + "epoch": 0.02073222951755638, + "grad_norm": 2.5758557319641113, + "learning_rate": 4.9947021564875816e-05, + "loss": 5.5921, + "step": 3486 + }, + { + "epoch": 0.020738176800837376, + "grad_norm": 2.635345458984375, + "learning_rate": 4.994699116757122e-05, + "loss": 6.2316, + "step": 3487 + }, + { + "epoch": 0.020744124084118375, + "grad_norm": 2.573514938354492, + "learning_rate": 4.9946960761557896e-05, + "loss": 6.5069, + "step": 3488 + }, + { + "epoch": 0.020750071367399373, + "grad_norm": 2.587735176086426, + "learning_rate": 4.994693034683584e-05, + "loss": 5.9114, + "step": 3489 + }, + { + "epoch": 0.02075601865068037, + "grad_norm": 2.4980244636535645, + "learning_rate": 4.9946899923405075e-05, + "loss": 6.1805, + "step": 3490 + }, + { + "epoch": 0.020761965933961367, + "grad_norm": 2.614003896713257, + "learning_rate": 4.9946869491265594e-05, + "loss": 6.2294, + "step": 3491 + }, + { + "epoch": 0.020767913217242365, + "grad_norm": 3.3819997310638428, + "learning_rate": 4.994683905041743e-05, + "loss": 5.4716, + "step": 3492 + }, + { + "epoch": 0.02077386050052336, + "grad_norm": 3.168170213699341, + "learning_rate": 4.994680860086057e-05, + "loss": 5.4041, + "step": 3493 + }, + { + "epoch": 0.02077980778380436, + "grad_norm": 3.05253267288208, + "learning_rate": 4.994677814259504e-05, + "loss": 5.4958, + "step": 3494 + }, + { + "epoch": 0.020785755067085354, + "grad_norm": 2.8560431003570557, + "learning_rate": 4.994674767562085e-05, + "loss": 5.4153, + "step": 3495 + }, + { + "epoch": 0.020791702350366353, + "grad_norm": 2.790382146835327, + "learning_rate": 4.994671719993801e-05, + "loss": 6.3581, + "step": 3496 + }, + { + "epoch": 0.02079764963364735, + "grad_norm": 2.9860496520996094, + "learning_rate": 4.9946686715546535e-05, + "loss": 6.5779, + "step": 3497 + }, + { + "epoch": 0.020803596916928346, + "grad_norm": 2.744859457015991, + "learning_rate": 4.994665622244642e-05, + "loss": 6.5748, + "step": 3498 + }, + { + "epoch": 0.020809544200209345, + "grad_norm": 2.7951292991638184, + "learning_rate": 4.9946625720637683e-05, + "loss": 6.1954, + "step": 3499 + }, + { + "epoch": 0.02081549148349034, + "grad_norm": 3.2961854934692383, + "learning_rate": 4.994659521012034e-05, + "loss": 6.243, + "step": 3500 + }, + { + "epoch": 0.02082143876677134, + "grad_norm": 2.934246301651001, + "learning_rate": 4.99465646908944e-05, + "loss": 6.1307, + "step": 3501 + }, + { + "epoch": 0.020827386050052337, + "grad_norm": 3.9152729511260986, + "learning_rate": 4.994653416295987e-05, + "loss": 6.0167, + "step": 3502 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 4.510169506072998, + "learning_rate": 4.994650362631676e-05, + "loss": 6.533, + "step": 3503 + }, + { + "epoch": 0.02083928061661433, + "grad_norm": 3.415665864944458, + "learning_rate": 4.994647308096509e-05, + "loss": 6.4978, + "step": 3504 + }, + { + "epoch": 0.02084522789989533, + "grad_norm": 2.6515185832977295, + "learning_rate": 4.9946442526904856e-05, + "loss": 6.3859, + "step": 3505 + }, + { + "epoch": 0.020851175183176324, + "grad_norm": 2.8215248584747314, + "learning_rate": 4.994641196413609e-05, + "loss": 6.243, + "step": 3506 + }, + { + "epoch": 0.020857122466457323, + "grad_norm": 2.644529104232788, + "learning_rate": 4.9946381392658773e-05, + "loss": 6.2954, + "step": 3507 + }, + { + "epoch": 0.020863069749738318, + "grad_norm": 3.349699020385742, + "learning_rate": 4.994635081247294e-05, + "loss": 6.5617, + "step": 3508 + }, + { + "epoch": 0.020869017033019317, + "grad_norm": 3.3669090270996094, + "learning_rate": 4.9946320223578596e-05, + "loss": 6.6458, + "step": 3509 + }, + { + "epoch": 0.020874964316300315, + "grad_norm": 2.5562078952789307, + "learning_rate": 4.994628962597575e-05, + "loss": 5.5041, + "step": 3510 + }, + { + "epoch": 0.02088091159958131, + "grad_norm": 2.851809501647949, + "learning_rate": 4.994625901966441e-05, + "loss": 5.4607, + "step": 3511 + }, + { + "epoch": 0.02088685888286231, + "grad_norm": 3.2769458293914795, + "learning_rate": 4.994622840464458e-05, + "loss": 5.3115, + "step": 3512 + }, + { + "epoch": 0.020892806166143307, + "grad_norm": 2.5495102405548096, + "learning_rate": 4.994619778091629e-05, + "loss": 5.9997, + "step": 3513 + }, + { + "epoch": 0.020898753449424302, + "grad_norm": 2.609463930130005, + "learning_rate": 4.994616714847954e-05, + "loss": 6.562, + "step": 3514 + }, + { + "epoch": 0.0209047007327053, + "grad_norm": 2.5731685161590576, + "learning_rate": 4.994613650733433e-05, + "loss": 6.5341, + "step": 3515 + }, + { + "epoch": 0.020910648015986296, + "grad_norm": 2.481297254562378, + "learning_rate": 4.99461058574807e-05, + "loss": 6.5878, + "step": 3516 + }, + { + "epoch": 0.020916595299267295, + "grad_norm": 2.4096593856811523, + "learning_rate": 4.9946075198918624e-05, + "loss": 6.5054, + "step": 3517 + }, + { + "epoch": 0.020922542582548293, + "grad_norm": 2.4417459964752197, + "learning_rate": 4.994604453164814e-05, + "loss": 6.3292, + "step": 3518 + }, + { + "epoch": 0.020928489865829288, + "grad_norm": 2.7062435150146484, + "learning_rate": 4.994601385566925e-05, + "loss": 5.564, + "step": 3519 + }, + { + "epoch": 0.020934437149110287, + "grad_norm": 2.613614559173584, + "learning_rate": 4.9945983170981955e-05, + "loss": 5.3929, + "step": 3520 + }, + { + "epoch": 0.020940384432391285, + "grad_norm": 2.4933719635009766, + "learning_rate": 4.994595247758629e-05, + "loss": 6.1841, + "step": 3521 + }, + { + "epoch": 0.02094633171567228, + "grad_norm": 2.251507043838501, + "learning_rate": 4.994592177548224e-05, + "loss": 6.3109, + "step": 3522 + }, + { + "epoch": 0.02095227899895328, + "grad_norm": 2.3830223083496094, + "learning_rate": 4.994589106466983e-05, + "loss": 5.9421, + "step": 3523 + }, + { + "epoch": 0.020958226282234274, + "grad_norm": 2.2940196990966797, + "learning_rate": 4.994586034514906e-05, + "loss": 6.0858, + "step": 3524 + }, + { + "epoch": 0.020964173565515273, + "grad_norm": 2.916836977005005, + "learning_rate": 4.994582961691996e-05, + "loss": 5.166, + "step": 3525 + }, + { + "epoch": 0.02097012084879627, + "grad_norm": 2.7183029651641846, + "learning_rate": 4.994579887998252e-05, + "loss": 6.9732, + "step": 3526 + }, + { + "epoch": 0.020976068132077266, + "grad_norm": 2.70143985748291, + "learning_rate": 4.994576813433676e-05, + "loss": 5.917, + "step": 3527 + }, + { + "epoch": 0.020982015415358265, + "grad_norm": 2.7375986576080322, + "learning_rate": 4.994573737998269e-05, + "loss": 5.3025, + "step": 3528 + }, + { + "epoch": 0.02098796269863926, + "grad_norm": 2.656982183456421, + "learning_rate": 4.994570661692033e-05, + "loss": 5.2383, + "step": 3529 + }, + { + "epoch": 0.02099390998192026, + "grad_norm": 2.2119734287261963, + "learning_rate": 4.994567584514968e-05, + "loss": 6.0456, + "step": 3530 + }, + { + "epoch": 0.020999857265201257, + "grad_norm": 2.9191582202911377, + "learning_rate": 4.9945645064670737e-05, + "loss": 6.3808, + "step": 3531 + }, + { + "epoch": 0.021005804548482252, + "grad_norm": 3.124101400375366, + "learning_rate": 4.994561427548354e-05, + "loss": 5.3631, + "step": 3532 + }, + { + "epoch": 0.02101175183176325, + "grad_norm": 2.803938150405884, + "learning_rate": 4.994558347758808e-05, + "loss": 5.3172, + "step": 3533 + }, + { + "epoch": 0.02101769911504425, + "grad_norm": 2.6231577396392822, + "learning_rate": 4.994555267098438e-05, + "loss": 6.4466, + "step": 3534 + }, + { + "epoch": 0.021023646398325244, + "grad_norm": 2.735590696334839, + "learning_rate": 4.994552185567244e-05, + "loss": 5.3115, + "step": 3535 + }, + { + "epoch": 0.021029593681606243, + "grad_norm": 2.730459690093994, + "learning_rate": 4.994549103165228e-05, + "loss": 5.2311, + "step": 3536 + }, + { + "epoch": 0.021035540964887238, + "grad_norm": 2.1241424083709717, + "learning_rate": 4.994546019892391e-05, + "loss": 5.6599, + "step": 3537 + }, + { + "epoch": 0.021041488248168237, + "grad_norm": 2.607807159423828, + "learning_rate": 4.994542935748733e-05, + "loss": 6.1182, + "step": 3538 + }, + { + "epoch": 0.021047435531449235, + "grad_norm": 2.6896564960479736, + "learning_rate": 4.9945398507342567e-05, + "loss": 6.2827, + "step": 3539 + }, + { + "epoch": 0.02105338281473023, + "grad_norm": 2.9237961769104004, + "learning_rate": 4.994536764848962e-05, + "loss": 5.9629, + "step": 3540 + }, + { + "epoch": 0.02105933009801123, + "grad_norm": 2.7576143741607666, + "learning_rate": 4.99453367809285e-05, + "loss": 5.7612, + "step": 3541 + }, + { + "epoch": 0.021065277381292227, + "grad_norm": 3.1622097492218018, + "learning_rate": 4.9945305904659226e-05, + "loss": 6.0415, + "step": 3542 + }, + { + "epoch": 0.021071224664573222, + "grad_norm": 2.471127510070801, + "learning_rate": 4.994527501968179e-05, + "loss": 6.1264, + "step": 3543 + }, + { + "epoch": 0.02107717194785422, + "grad_norm": 2.797504425048828, + "learning_rate": 4.994524412599623e-05, + "loss": 6.3515, + "step": 3544 + }, + { + "epoch": 0.021083119231135216, + "grad_norm": 2.4932103157043457, + "learning_rate": 4.9945213223602535e-05, + "loss": 6.4327, + "step": 3545 + }, + { + "epoch": 0.021089066514416215, + "grad_norm": 2.5194599628448486, + "learning_rate": 4.9945182312500725e-05, + "loss": 6.4003, + "step": 3546 + }, + { + "epoch": 0.021095013797697213, + "grad_norm": 2.287858247756958, + "learning_rate": 4.9945151392690814e-05, + "loss": 6.3287, + "step": 3547 + }, + { + "epoch": 0.021100961080978208, + "grad_norm": 2.941619873046875, + "learning_rate": 4.994512046417281e-05, + "loss": 6.1364, + "step": 3548 + }, + { + "epoch": 0.021106908364259207, + "grad_norm": 3.1448967456817627, + "learning_rate": 4.994508952694672e-05, + "loss": 5.8638, + "step": 3549 + }, + { + "epoch": 0.021112855647540205, + "grad_norm": 2.869966983795166, + "learning_rate": 4.994505858101255e-05, + "loss": 6.0122, + "step": 3550 + }, + { + "epoch": 0.0211188029308212, + "grad_norm": 2.421264886856079, + "learning_rate": 4.9945027626370325e-05, + "loss": 6.1243, + "step": 3551 + }, + { + "epoch": 0.0211247502141022, + "grad_norm": 2.599456310272217, + "learning_rate": 4.9944996663020047e-05, + "loss": 5.9484, + "step": 3552 + }, + { + "epoch": 0.021130697497383194, + "grad_norm": 3.1029574871063232, + "learning_rate": 4.994496569096173e-05, + "loss": 5.9347, + "step": 3553 + }, + { + "epoch": 0.021136644780664193, + "grad_norm": 3.02494478225708, + "learning_rate": 4.994493471019538e-05, + "loss": 5.814, + "step": 3554 + }, + { + "epoch": 0.02114259206394519, + "grad_norm": 2.359682559967041, + "learning_rate": 4.994490372072101e-05, + "loss": 5.8533, + "step": 3555 + }, + { + "epoch": 0.021148539347226186, + "grad_norm": 2.7072582244873047, + "learning_rate": 4.994487272253864e-05, + "loss": 5.855, + "step": 3556 + }, + { + "epoch": 0.021154486630507185, + "grad_norm": 2.3102664947509766, + "learning_rate": 4.994484171564826e-05, + "loss": 5.6701, + "step": 3557 + }, + { + "epoch": 0.02116043391378818, + "grad_norm": 2.3804259300231934, + "learning_rate": 4.9944810700049906e-05, + "loss": 5.5096, + "step": 3558 + }, + { + "epoch": 0.02116638119706918, + "grad_norm": 2.463280439376831, + "learning_rate": 4.994477967574357e-05, + "loss": 5.5178, + "step": 3559 + }, + { + "epoch": 0.021172328480350177, + "grad_norm": 2.884152412414551, + "learning_rate": 4.9944748642729265e-05, + "loss": 6.1013, + "step": 3560 + }, + { + "epoch": 0.021178275763631172, + "grad_norm": 3.009460210800171, + "learning_rate": 4.9944717601007006e-05, + "loss": 6.2725, + "step": 3561 + }, + { + "epoch": 0.02118422304691217, + "grad_norm": 2.5930371284484863, + "learning_rate": 4.9944686550576814e-05, + "loss": 6.1138, + "step": 3562 + }, + { + "epoch": 0.02119017033019317, + "grad_norm": 2.8212878704071045, + "learning_rate": 4.9944655491438684e-05, + "loss": 5.6209, + "step": 3563 + }, + { + "epoch": 0.021196117613474164, + "grad_norm": 2.9814743995666504, + "learning_rate": 4.9944624423592634e-05, + "loss": 5.8912, + "step": 3564 + }, + { + "epoch": 0.021202064896755163, + "grad_norm": 3.1456093788146973, + "learning_rate": 4.994459334703867e-05, + "loss": 5.961, + "step": 3565 + }, + { + "epoch": 0.021208012180036158, + "grad_norm": 2.9300050735473633, + "learning_rate": 4.9944562261776805e-05, + "loss": 6.773, + "step": 3566 + }, + { + "epoch": 0.021213959463317156, + "grad_norm": 2.570685625076294, + "learning_rate": 4.994453116780705e-05, + "loss": 6.3575, + "step": 3567 + }, + { + "epoch": 0.021219906746598155, + "grad_norm": 2.7060914039611816, + "learning_rate": 4.994450006512943e-05, + "loss": 6.249, + "step": 3568 + }, + { + "epoch": 0.02122585402987915, + "grad_norm": 3.0027518272399902, + "learning_rate": 4.994446895374393e-05, + "loss": 5.8243, + "step": 3569 + }, + { + "epoch": 0.02123180131316015, + "grad_norm": 2.785888195037842, + "learning_rate": 4.994443783365058e-05, + "loss": 5.9836, + "step": 3570 + }, + { + "epoch": 0.021237748596441147, + "grad_norm": 2.5480010509490967, + "learning_rate": 4.994440670484938e-05, + "loss": 6.4237, + "step": 3571 + }, + { + "epoch": 0.021243695879722142, + "grad_norm": 2.687121629714966, + "learning_rate": 4.9944375567340345e-05, + "loss": 6.4497, + "step": 3572 + }, + { + "epoch": 0.02124964316300314, + "grad_norm": 2.6066362857818604, + "learning_rate": 4.994434442112349e-05, + "loss": 6.3853, + "step": 3573 + }, + { + "epoch": 0.021255590446284136, + "grad_norm": 2.880352020263672, + "learning_rate": 4.994431326619882e-05, + "loss": 6.382, + "step": 3574 + }, + { + "epoch": 0.021261537729565134, + "grad_norm": 3.0415213108062744, + "learning_rate": 4.9944282102566345e-05, + "loss": 6.4472, + "step": 3575 + }, + { + "epoch": 0.021267485012846133, + "grad_norm": 2.4917140007019043, + "learning_rate": 4.994425093022609e-05, + "loss": 6.2546, + "step": 3576 + }, + { + "epoch": 0.021273432296127128, + "grad_norm": 2.53648042678833, + "learning_rate": 4.9944219749178044e-05, + "loss": 6.37, + "step": 3577 + }, + { + "epoch": 0.021279379579408127, + "grad_norm": 2.796342134475708, + "learning_rate": 4.994418855942223e-05, + "loss": 6.1691, + "step": 3578 + }, + { + "epoch": 0.021285326862689125, + "grad_norm": 2.9148125648498535, + "learning_rate": 4.9944157360958656e-05, + "loss": 6.2552, + "step": 3579 + }, + { + "epoch": 0.02129127414597012, + "grad_norm": 3.0777838230133057, + "learning_rate": 4.994412615378734e-05, + "loss": 6.2359, + "step": 3580 + }, + { + "epoch": 0.02129722142925112, + "grad_norm": 2.5878093242645264, + "learning_rate": 4.994409493790828e-05, + "loss": 6.0746, + "step": 3581 + }, + { + "epoch": 0.021303168712532114, + "grad_norm": 3.2084906101226807, + "learning_rate": 4.99440637133215e-05, + "loss": 6.1357, + "step": 3582 + }, + { + "epoch": 0.021309115995813113, + "grad_norm": 3.7210965156555176, + "learning_rate": 4.9944032480027004e-05, + "loss": 6.5117, + "step": 3583 + }, + { + "epoch": 0.02131506327909411, + "grad_norm": 2.8332109451293945, + "learning_rate": 4.994400123802481e-05, + "loss": 6.0908, + "step": 3584 + }, + { + "epoch": 0.021321010562375106, + "grad_norm": 2.83854341506958, + "learning_rate": 4.994396998731491e-05, + "loss": 6.1522, + "step": 3585 + }, + { + "epoch": 0.021326957845656105, + "grad_norm": 2.5171611309051514, + "learning_rate": 4.9943938727897335e-05, + "loss": 6.2253, + "step": 3586 + }, + { + "epoch": 0.0213329051289371, + "grad_norm": 2.2111763954162598, + "learning_rate": 4.9943907459772086e-05, + "loss": 5.7673, + "step": 3587 + }, + { + "epoch": 0.0213388524122181, + "grad_norm": 2.5147926807403564, + "learning_rate": 4.994387618293918e-05, + "loss": 6.8327, + "step": 3588 + }, + { + "epoch": 0.021344799695499097, + "grad_norm": 2.969285488128662, + "learning_rate": 4.9943844897398626e-05, + "loss": 6.9995, + "step": 3589 + }, + { + "epoch": 0.021350746978780092, + "grad_norm": 4.00917911529541, + "learning_rate": 4.994381360315043e-05, + "loss": 6.6377, + "step": 3590 + }, + { + "epoch": 0.02135669426206109, + "grad_norm": 3.899319887161255, + "learning_rate": 4.994378230019461e-05, + "loss": 6.162, + "step": 3591 + }, + { + "epoch": 0.02136264154534209, + "grad_norm": 2.9522764682769775, + "learning_rate": 4.994375098853117e-05, + "loss": 6.4405, + "step": 3592 + }, + { + "epoch": 0.021368588828623084, + "grad_norm": 3.0569825172424316, + "learning_rate": 4.994371966816012e-05, + "loss": 6.2631, + "step": 3593 + }, + { + "epoch": 0.021374536111904083, + "grad_norm": 2.9470009803771973, + "learning_rate": 4.994368833908148e-05, + "loss": 6.4785, + "step": 3594 + }, + { + "epoch": 0.021380483395185078, + "grad_norm": 2.913940668106079, + "learning_rate": 4.994365700129525e-05, + "loss": 6.6566, + "step": 3595 + }, + { + "epoch": 0.021386430678466076, + "grad_norm": 2.6037404537200928, + "learning_rate": 4.9943625654801465e-05, + "loss": 6.2535, + "step": 3596 + }, + { + "epoch": 0.021392377961747075, + "grad_norm": 2.998276948928833, + "learning_rate": 4.99435942996001e-05, + "loss": 6.8851, + "step": 3597 + }, + { + "epoch": 0.02139832524502807, + "grad_norm": 2.2189996242523193, + "learning_rate": 4.994356293569119e-05, + "loss": 6.8707, + "step": 3598 + }, + { + "epoch": 0.02140427252830907, + "grad_norm": 2.4528486728668213, + "learning_rate": 4.994353156307474e-05, + "loss": 6.9166, + "step": 3599 + }, + { + "epoch": 0.021410219811590067, + "grad_norm": 3.0538241863250732, + "learning_rate": 4.994350018175076e-05, + "loss": 6.3258, + "step": 3600 + }, + { + "epoch": 0.021416167094871062, + "grad_norm": 3.789745569229126, + "learning_rate": 4.994346879171926e-05, + "loss": 6.1962, + "step": 3601 + }, + { + "epoch": 0.02142211437815206, + "grad_norm": 3.2789254188537598, + "learning_rate": 4.994343739298025e-05, + "loss": 6.2126, + "step": 3602 + }, + { + "epoch": 0.021428061661433056, + "grad_norm": 3.0887696743011475, + "learning_rate": 4.994340598553375e-05, + "loss": 6.2395, + "step": 3603 + }, + { + "epoch": 0.021434008944714054, + "grad_norm": 2.9189252853393555, + "learning_rate": 4.994337456937977e-05, + "loss": 6.193, + "step": 3604 + }, + { + "epoch": 0.021439956227995053, + "grad_norm": 2.8582170009613037, + "learning_rate": 4.9943343144518306e-05, + "loss": 6.1077, + "step": 3605 + }, + { + "epoch": 0.021445903511276048, + "grad_norm": 3.076979160308838, + "learning_rate": 4.994331171094938e-05, + "loss": 6.0474, + "step": 3606 + }, + { + "epoch": 0.021451850794557047, + "grad_norm": 3.482161045074463, + "learning_rate": 4.994328026867301e-05, + "loss": 6.0551, + "step": 3607 + }, + { + "epoch": 0.021457798077838045, + "grad_norm": 3.001046895980835, + "learning_rate": 4.994324881768919e-05, + "loss": 6.0393, + "step": 3608 + }, + { + "epoch": 0.02146374536111904, + "grad_norm": 2.8006365299224854, + "learning_rate": 4.994321735799794e-05, + "loss": 6.0042, + "step": 3609 + }, + { + "epoch": 0.02146969264440004, + "grad_norm": 3.10727858543396, + "learning_rate": 4.994318588959927e-05, + "loss": 5.8981, + "step": 3610 + }, + { + "epoch": 0.021475639927681034, + "grad_norm": 2.660557985305786, + "learning_rate": 4.9943154412493194e-05, + "loss": 6.0426, + "step": 3611 + }, + { + "epoch": 0.021481587210962032, + "grad_norm": 2.8504562377929688, + "learning_rate": 4.994312292667972e-05, + "loss": 6.9774, + "step": 3612 + }, + { + "epoch": 0.02148753449424303, + "grad_norm": 3.0076539516448975, + "learning_rate": 4.994309143215886e-05, + "loss": 6.3238, + "step": 3613 + }, + { + "epoch": 0.021493481777524026, + "grad_norm": 2.2966883182525635, + "learning_rate": 4.9943059928930626e-05, + "loss": 7.0015, + "step": 3614 + }, + { + "epoch": 0.021499429060805025, + "grad_norm": 2.5054080486297607, + "learning_rate": 4.994302841699502e-05, + "loss": 6.9226, + "step": 3615 + }, + { + "epoch": 0.021505376344086023, + "grad_norm": 2.856278657913208, + "learning_rate": 4.9942996896352066e-05, + "loss": 6.7836, + "step": 3616 + }, + { + "epoch": 0.02151132362736702, + "grad_norm": 2.4902377128601074, + "learning_rate": 4.994296536700177e-05, + "loss": 6.7077, + "step": 3617 + }, + { + "epoch": 0.021517270910648017, + "grad_norm": 2.477932929992676, + "learning_rate": 4.994293382894414e-05, + "loss": 6.8284, + "step": 3618 + }, + { + "epoch": 0.021523218193929012, + "grad_norm": 2.3034260272979736, + "learning_rate": 4.994290228217919e-05, + "loss": 6.8012, + "step": 3619 + }, + { + "epoch": 0.02152916547721001, + "grad_norm": 2.3850560188293457, + "learning_rate": 4.9942870726706934e-05, + "loss": 6.6208, + "step": 3620 + }, + { + "epoch": 0.02153511276049101, + "grad_norm": 2.4397644996643066, + "learning_rate": 4.994283916252738e-05, + "loss": 6.7522, + "step": 3621 + }, + { + "epoch": 0.021541060043772004, + "grad_norm": 2.400846242904663, + "learning_rate": 4.994280758964053e-05, + "loss": 6.7529, + "step": 3622 + }, + { + "epoch": 0.021547007327053003, + "grad_norm": 2.358290195465088, + "learning_rate": 4.994277600804641e-05, + "loss": 6.6812, + "step": 3623 + }, + { + "epoch": 0.021552954610333998, + "grad_norm": 2.7409300804138184, + "learning_rate": 4.994274441774503e-05, + "loss": 6.668, + "step": 3624 + }, + { + "epoch": 0.021558901893614996, + "grad_norm": 2.6890954971313477, + "learning_rate": 4.994271281873639e-05, + "loss": 6.5537, + "step": 3625 + }, + { + "epoch": 0.021564849176895995, + "grad_norm": 2.8959596157073975, + "learning_rate": 4.9942681211020505e-05, + "loss": 6.4492, + "step": 3626 + }, + { + "epoch": 0.02157079646017699, + "grad_norm": 2.4325244426727295, + "learning_rate": 4.994264959459738e-05, + "loss": 6.9819, + "step": 3627 + }, + { + "epoch": 0.02157674374345799, + "grad_norm": 2.92891263961792, + "learning_rate": 4.9942617969467045e-05, + "loss": 6.9266, + "step": 3628 + }, + { + "epoch": 0.021582691026738987, + "grad_norm": 2.4398467540740967, + "learning_rate": 4.994258633562951e-05, + "loss": 6.514, + "step": 3629 + }, + { + "epoch": 0.021588638310019982, + "grad_norm": 2.577467203140259, + "learning_rate": 4.9942554693084756e-05, + "loss": 6.7248, + "step": 3630 + }, + { + "epoch": 0.02159458559330098, + "grad_norm": 2.3682591915130615, + "learning_rate": 4.9942523041832824e-05, + "loss": 6.7798, + "step": 3631 + }, + { + "epoch": 0.021600532876581976, + "grad_norm": 2.1863434314727783, + "learning_rate": 4.9942491381873705e-05, + "loss": 6.6636, + "step": 3632 + }, + { + "epoch": 0.021606480159862974, + "grad_norm": 2.0172441005706787, + "learning_rate": 4.9942459713207426e-05, + "loss": 6.6772, + "step": 3633 + }, + { + "epoch": 0.021612427443143973, + "grad_norm": 1.8671952486038208, + "learning_rate": 4.9942428035834e-05, + "loss": 6.3648, + "step": 3634 + }, + { + "epoch": 0.021618374726424968, + "grad_norm": 3.226900815963745, + "learning_rate": 4.9942396349753416e-05, + "loss": 6.4127, + "step": 3635 + }, + { + "epoch": 0.021624322009705967, + "grad_norm": 2.7766973972320557, + "learning_rate": 4.994236465496571e-05, + "loss": 6.4476, + "step": 3636 + }, + { + "epoch": 0.021630269292986965, + "grad_norm": 2.157118082046509, + "learning_rate": 4.9942332951470875e-05, + "loss": 6.5876, + "step": 3637 + }, + { + "epoch": 0.02163621657626796, + "grad_norm": 2.3870396614074707, + "learning_rate": 4.994230123926893e-05, + "loss": 6.5861, + "step": 3638 + }, + { + "epoch": 0.02164216385954896, + "grad_norm": 2.8139939308166504, + "learning_rate": 4.994226951835989e-05, + "loss": 6.4845, + "step": 3639 + }, + { + "epoch": 0.021648111142829954, + "grad_norm": 2.856207847595215, + "learning_rate": 4.9942237788743764e-05, + "loss": 6.1514, + "step": 3640 + }, + { + "epoch": 0.021654058426110952, + "grad_norm": 3.523162603378296, + "learning_rate": 4.9942206050420545e-05, + "loss": 5.8114, + "step": 3641 + }, + { + "epoch": 0.02166000570939195, + "grad_norm": 2.746587038040161, + "learning_rate": 4.9942174303390274e-05, + "loss": 5.7397, + "step": 3642 + }, + { + "epoch": 0.021665952992672946, + "grad_norm": 2.902067184448242, + "learning_rate": 4.9942142547652946e-05, + "loss": 6.4353, + "step": 3643 + }, + { + "epoch": 0.021671900275953945, + "grad_norm": 2.981391191482544, + "learning_rate": 4.994211078320857e-05, + "loss": 6.2153, + "step": 3644 + }, + { + "epoch": 0.021677847559234943, + "grad_norm": 2.6004254817962646, + "learning_rate": 4.994207901005716e-05, + "loss": 6.2365, + "step": 3645 + }, + { + "epoch": 0.021683794842515938, + "grad_norm": 2.748678684234619, + "learning_rate": 4.994204722819873e-05, + "loss": 5.8126, + "step": 3646 + }, + { + "epoch": 0.021689742125796937, + "grad_norm": 2.675466299057007, + "learning_rate": 4.994201543763329e-05, + "loss": 6.3032, + "step": 3647 + }, + { + "epoch": 0.021695689409077932, + "grad_norm": 2.681823253631592, + "learning_rate": 4.9941983638360855e-05, + "loss": 6.2706, + "step": 3648 + }, + { + "epoch": 0.02170163669235893, + "grad_norm": 2.481586217880249, + "learning_rate": 4.994195183038142e-05, + "loss": 6.1792, + "step": 3649 + }, + { + "epoch": 0.02170758397563993, + "grad_norm": 2.3379831314086914, + "learning_rate": 4.9941920013695024e-05, + "loss": 6.2689, + "step": 3650 + }, + { + "epoch": 0.021713531258920924, + "grad_norm": 2.5885238647460938, + "learning_rate": 4.994188818830164e-05, + "loss": 6.3018, + "step": 3651 + }, + { + "epoch": 0.021719478542201923, + "grad_norm": 2.341939687728882, + "learning_rate": 4.994185635420131e-05, + "loss": 5.6178, + "step": 3652 + }, + { + "epoch": 0.021725425825482918, + "grad_norm": 2.4126031398773193, + "learning_rate": 4.9941824511394044e-05, + "loss": 5.4044, + "step": 3653 + }, + { + "epoch": 0.021731373108763916, + "grad_norm": 2.2289719581604004, + "learning_rate": 4.994179265987983e-05, + "loss": 5.4134, + "step": 3654 + }, + { + "epoch": 0.021737320392044915, + "grad_norm": 2.5151331424713135, + "learning_rate": 4.994176079965871e-05, + "loss": 5.3321, + "step": 3655 + }, + { + "epoch": 0.02174326767532591, + "grad_norm": 2.0761523246765137, + "learning_rate": 4.9941728930730665e-05, + "loss": 5.3363, + "step": 3656 + }, + { + "epoch": 0.02174921495860691, + "grad_norm": 2.272510051727295, + "learning_rate": 4.994169705309573e-05, + "loss": 6.0208, + "step": 3657 + }, + { + "epoch": 0.021755162241887907, + "grad_norm": 2.6145198345184326, + "learning_rate": 4.994166516675389e-05, + "loss": 6.299, + "step": 3658 + }, + { + "epoch": 0.021761109525168902, + "grad_norm": 2.978618621826172, + "learning_rate": 4.994163327170519e-05, + "loss": 5.1248, + "step": 3659 + }, + { + "epoch": 0.0217670568084499, + "grad_norm": 2.398813247680664, + "learning_rate": 4.994160136794962e-05, + "loss": 5.1217, + "step": 3660 + }, + { + "epoch": 0.021773004091730896, + "grad_norm": 2.1145291328430176, + "learning_rate": 4.994156945548719e-05, + "loss": 5.2676, + "step": 3661 + }, + { + "epoch": 0.021778951375011894, + "grad_norm": 2.045334577560425, + "learning_rate": 4.9941537534317915e-05, + "loss": 5.2088, + "step": 3662 + }, + { + "epoch": 0.021784898658292893, + "grad_norm": 2.0598506927490234, + "learning_rate": 4.9941505604441806e-05, + "loss": 5.363, + "step": 3663 + }, + { + "epoch": 0.021790845941573888, + "grad_norm": 2.189143657684326, + "learning_rate": 4.9941473665858884e-05, + "loss": 6.0592, + "step": 3664 + }, + { + "epoch": 0.021796793224854887, + "grad_norm": 6.8580780029296875, + "learning_rate": 4.994144171856915e-05, + "loss": 6.0323, + "step": 3665 + }, + { + "epoch": 0.021802740508135885, + "grad_norm": 2.0607001781463623, + "learning_rate": 4.994140976257261e-05, + "loss": 6.0883, + "step": 3666 + }, + { + "epoch": 0.02180868779141688, + "grad_norm": 2.1669631004333496, + "learning_rate": 4.9941377797869284e-05, + "loss": 6.0546, + "step": 3667 + }, + { + "epoch": 0.02181463507469788, + "grad_norm": 2.912822961807251, + "learning_rate": 4.994134582445917e-05, + "loss": 6.0285, + "step": 3668 + }, + { + "epoch": 0.021820582357978874, + "grad_norm": 2.3223111629486084, + "learning_rate": 4.994131384234231e-05, + "loss": 6.0948, + "step": 3669 + }, + { + "epoch": 0.021826529641259872, + "grad_norm": 2.067002296447754, + "learning_rate": 4.994128185151868e-05, + "loss": 6.2908, + "step": 3670 + }, + { + "epoch": 0.02183247692454087, + "grad_norm": 2.593642473220825, + "learning_rate": 4.9941249851988317e-05, + "loss": 6.2878, + "step": 3671 + }, + { + "epoch": 0.021838424207821866, + "grad_norm": 2.6345975399017334, + "learning_rate": 4.994121784375121e-05, + "loss": 6.0796, + "step": 3672 + }, + { + "epoch": 0.021844371491102865, + "grad_norm": 2.398861885070801, + "learning_rate": 4.994118582680739e-05, + "loss": 6.096, + "step": 3673 + }, + { + "epoch": 0.021850318774383863, + "grad_norm": 2.102933883666992, + "learning_rate": 4.994115380115686e-05, + "loss": 6.1347, + "step": 3674 + }, + { + "epoch": 0.021856266057664858, + "grad_norm": 2.43632435798645, + "learning_rate": 4.994112176679963e-05, + "loss": 6.074, + "step": 3675 + }, + { + "epoch": 0.021862213340945857, + "grad_norm": 2.304213523864746, + "learning_rate": 4.9941089723735706e-05, + "loss": 5.8897, + "step": 3676 + }, + { + "epoch": 0.021868160624226852, + "grad_norm": 2.6283092498779297, + "learning_rate": 4.9941057671965106e-05, + "loss": 5.9605, + "step": 3677 + }, + { + "epoch": 0.02187410790750785, + "grad_norm": 2.0781428813934326, + "learning_rate": 4.994102561148785e-05, + "loss": 6.0645, + "step": 3678 + }, + { + "epoch": 0.02188005519078885, + "grad_norm": 2.229210376739502, + "learning_rate": 4.994099354230393e-05, + "loss": 6.223, + "step": 3679 + }, + { + "epoch": 0.021886002474069844, + "grad_norm": 2.4410789012908936, + "learning_rate": 4.9940961464413374e-05, + "loss": 6.1115, + "step": 3680 + }, + { + "epoch": 0.021891949757350843, + "grad_norm": 2.99076771736145, + "learning_rate": 4.994092937781618e-05, + "loss": 5.9028, + "step": 3681 + }, + { + "epoch": 0.021897897040631838, + "grad_norm": 2.8403074741363525, + "learning_rate": 4.994089728251237e-05, + "loss": 5.7286, + "step": 3682 + }, + { + "epoch": 0.021903844323912836, + "grad_norm": 2.0928149223327637, + "learning_rate": 4.994086517850195e-05, + "loss": 5.849, + "step": 3683 + }, + { + "epoch": 0.021909791607193835, + "grad_norm": 2.320279836654663, + "learning_rate": 4.994083306578492e-05, + "loss": 5.6767, + "step": 3684 + }, + { + "epoch": 0.02191573889047483, + "grad_norm": 3.0701658725738525, + "learning_rate": 4.994080094436132e-05, + "loss": 5.9555, + "step": 3685 + }, + { + "epoch": 0.02192168617375583, + "grad_norm": 2.1042048931121826, + "learning_rate": 4.994076881423113e-05, + "loss": 5.7651, + "step": 3686 + }, + { + "epoch": 0.021927633457036827, + "grad_norm": 2.35819673538208, + "learning_rate": 4.9940736675394385e-05, + "loss": 6.0203, + "step": 3687 + }, + { + "epoch": 0.021933580740317822, + "grad_norm": 2.659224510192871, + "learning_rate": 4.994070452785108e-05, + "loss": 5.9935, + "step": 3688 + }, + { + "epoch": 0.02193952802359882, + "grad_norm": 2.4628207683563232, + "learning_rate": 4.994067237160124e-05, + "loss": 5.9135, + "step": 3689 + }, + { + "epoch": 0.021945475306879816, + "grad_norm": 3.7227911949157715, + "learning_rate": 4.9940640206644865e-05, + "loss": 5.8365, + "step": 3690 + }, + { + "epoch": 0.021951422590160814, + "grad_norm": 3.5226151943206787, + "learning_rate": 4.994060803298197e-05, + "loss": 5.7807, + "step": 3691 + }, + { + "epoch": 0.021957369873441813, + "grad_norm": 2.3665735721588135, + "learning_rate": 4.994057585061256e-05, + "loss": 5.9632, + "step": 3692 + }, + { + "epoch": 0.021963317156722808, + "grad_norm": 2.877263069152832, + "learning_rate": 4.9940543659536666e-05, + "loss": 5.6425, + "step": 3693 + }, + { + "epoch": 0.021969264440003806, + "grad_norm": 2.5431532859802246, + "learning_rate": 4.994051145975428e-05, + "loss": 5.6531, + "step": 3694 + }, + { + "epoch": 0.021975211723284805, + "grad_norm": 2.7033538818359375, + "learning_rate": 4.9940479251265415e-05, + "loss": 5.6907, + "step": 3695 + }, + { + "epoch": 0.0219811590065658, + "grad_norm": 3.6627206802368164, + "learning_rate": 4.9940447034070093e-05, + "loss": 5.9118, + "step": 3696 + }, + { + "epoch": 0.0219871062898468, + "grad_norm": 3.896959066390991, + "learning_rate": 4.994041480816831e-05, + "loss": 5.9926, + "step": 3697 + }, + { + "epoch": 0.021993053573127794, + "grad_norm": 3.37575626373291, + "learning_rate": 4.994038257356009e-05, + "loss": 5.9768, + "step": 3698 + }, + { + "epoch": 0.021999000856408792, + "grad_norm": 2.7694313526153564, + "learning_rate": 4.9940350330245444e-05, + "loss": 5.8486, + "step": 3699 + }, + { + "epoch": 0.02200494813968979, + "grad_norm": 2.3815293312072754, + "learning_rate": 4.9940318078224376e-05, + "loss": 6.0663, + "step": 3700 + }, + { + "epoch": 0.022010895422970786, + "grad_norm": 2.3171627521514893, + "learning_rate": 4.99402858174969e-05, + "loss": 5.8543, + "step": 3701 + }, + { + "epoch": 0.022016842706251784, + "grad_norm": 2.5090551376342773, + "learning_rate": 4.994025354806303e-05, + "loss": 5.7005, + "step": 3702 + }, + { + "epoch": 0.022022789989532783, + "grad_norm": 2.7024855613708496, + "learning_rate": 4.9940221269922774e-05, + "loss": 5.7375, + "step": 3703 + }, + { + "epoch": 0.022028737272813778, + "grad_norm": 2.7900679111480713, + "learning_rate": 4.994018898307614e-05, + "loss": 6.0094, + "step": 3704 + }, + { + "epoch": 0.022034684556094777, + "grad_norm": 2.3678438663482666, + "learning_rate": 4.994015668752315e-05, + "loss": 5.822, + "step": 3705 + }, + { + "epoch": 0.022040631839375772, + "grad_norm": 2.5406653881073, + "learning_rate": 4.9940124383263807e-05, + "loss": 5.8984, + "step": 3706 + }, + { + "epoch": 0.02204657912265677, + "grad_norm": 2.371800422668457, + "learning_rate": 4.994009207029813e-05, + "loss": 5.9821, + "step": 3707 + }, + { + "epoch": 0.02205252640593777, + "grad_norm": 2.004669666290283, + "learning_rate": 4.994005974862612e-05, + "loss": 5.8801, + "step": 3708 + }, + { + "epoch": 0.022058473689218764, + "grad_norm": 2.777472972869873, + "learning_rate": 4.9940027418247787e-05, + "loss": 5.8821, + "step": 3709 + }, + { + "epoch": 0.022064420972499763, + "grad_norm": 2.599883556365967, + "learning_rate": 4.9939995079163156e-05, + "loss": 5.8716, + "step": 3710 + }, + { + "epoch": 0.022070368255780758, + "grad_norm": 2.5891127586364746, + "learning_rate": 4.993996273137223e-05, + "loss": 5.7607, + "step": 3711 + }, + { + "epoch": 0.022076315539061756, + "grad_norm": 2.3737518787384033, + "learning_rate": 4.993993037487501e-05, + "loss": 5.7825, + "step": 3712 + }, + { + "epoch": 0.022082262822342755, + "grad_norm": 2.421785831451416, + "learning_rate": 4.9939898009671524e-05, + "loss": 5.7143, + "step": 3713 + }, + { + "epoch": 0.02208821010562375, + "grad_norm": 2.4267804622650146, + "learning_rate": 4.9939865635761785e-05, + "loss": 5.8031, + "step": 3714 + }, + { + "epoch": 0.02209415738890475, + "grad_norm": 2.390333414077759, + "learning_rate": 4.993983325314579e-05, + "loss": 5.7985, + "step": 3715 + }, + { + "epoch": 0.022100104672185747, + "grad_norm": 2.2265970706939697, + "learning_rate": 4.993980086182356e-05, + "loss": 5.6261, + "step": 3716 + }, + { + "epoch": 0.022106051955466742, + "grad_norm": 2.3872458934783936, + "learning_rate": 4.99397684617951e-05, + "loss": 5.8185, + "step": 3717 + }, + { + "epoch": 0.02211199923874774, + "grad_norm": 2.077075958251953, + "learning_rate": 4.9939736053060425e-05, + "loss": 5.6252, + "step": 3718 + }, + { + "epoch": 0.022117946522028736, + "grad_norm": 2.0642287731170654, + "learning_rate": 4.993970363561954e-05, + "loss": 5.8034, + "step": 3719 + }, + { + "epoch": 0.022123893805309734, + "grad_norm": 3.5353951454162598, + "learning_rate": 4.9939671209472474e-05, + "loss": 6.7808, + "step": 3720 + }, + { + "epoch": 0.022129841088590733, + "grad_norm": 2.910531520843506, + "learning_rate": 4.9939638774619216e-05, + "loss": 5.9323, + "step": 3721 + }, + { + "epoch": 0.022135788371871728, + "grad_norm": 2.7450106143951416, + "learning_rate": 4.9939606331059794e-05, + "loss": 5.9926, + "step": 3722 + }, + { + "epoch": 0.022141735655152726, + "grad_norm": 2.7628188133239746, + "learning_rate": 4.993957387879421e-05, + "loss": 5.9129, + "step": 3723 + }, + { + "epoch": 0.022147682938433725, + "grad_norm": 2.6644890308380127, + "learning_rate": 4.9939541417822485e-05, + "loss": 5.7038, + "step": 3724 + }, + { + "epoch": 0.02215363022171472, + "grad_norm": 2.143744707107544, + "learning_rate": 4.993950894814461e-05, + "loss": 5.5821, + "step": 3725 + }, + { + "epoch": 0.02215957750499572, + "grad_norm": 2.1691160202026367, + "learning_rate": 4.993947646976063e-05, + "loss": 5.5929, + "step": 3726 + }, + { + "epoch": 0.022165524788276714, + "grad_norm": 2.1479709148406982, + "learning_rate": 4.993944398267052e-05, + "loss": 5.6653, + "step": 3727 + }, + { + "epoch": 0.022171472071557712, + "grad_norm": 2.7749600410461426, + "learning_rate": 4.993941148687431e-05, + "loss": 5.5682, + "step": 3728 + }, + { + "epoch": 0.02217741935483871, + "grad_norm": 2.668672561645508, + "learning_rate": 4.993937898237201e-05, + "loss": 5.5968, + "step": 3729 + }, + { + "epoch": 0.022183366638119706, + "grad_norm": 2.3903374671936035, + "learning_rate": 4.993934646916364e-05, + "loss": 5.7541, + "step": 3730 + }, + { + "epoch": 0.022189313921400704, + "grad_norm": 1.8555344343185425, + "learning_rate": 4.993931394724919e-05, + "loss": 5.5449, + "step": 3731 + }, + { + "epoch": 0.022195261204681703, + "grad_norm": 2.1140637397766113, + "learning_rate": 4.993928141662869e-05, + "loss": 5.8201, + "step": 3732 + }, + { + "epoch": 0.022201208487962698, + "grad_norm": 2.221573829650879, + "learning_rate": 4.993924887730213e-05, + "loss": 5.7583, + "step": 3733 + }, + { + "epoch": 0.022207155771243697, + "grad_norm": 2.0801634788513184, + "learning_rate": 4.993921632926956e-05, + "loss": 5.7083, + "step": 3734 + }, + { + "epoch": 0.02221310305452469, + "grad_norm": 2.0167016983032227, + "learning_rate": 4.993918377253095e-05, + "loss": 5.7798, + "step": 3735 + }, + { + "epoch": 0.02221905033780569, + "grad_norm": 2.104529619216919, + "learning_rate": 4.993915120708634e-05, + "loss": 5.7346, + "step": 3736 + }, + { + "epoch": 0.02222499762108669, + "grad_norm": 2.0807201862335205, + "learning_rate": 4.993911863293572e-05, + "loss": 5.7663, + "step": 3737 + }, + { + "epoch": 0.022230944904367684, + "grad_norm": 1.9223891496658325, + "learning_rate": 4.9939086050079115e-05, + "loss": 5.648, + "step": 3738 + }, + { + "epoch": 0.022236892187648682, + "grad_norm": 2.3831584453582764, + "learning_rate": 4.9939053458516535e-05, + "loss": 5.7988, + "step": 3739 + }, + { + "epoch": 0.02224283947092968, + "grad_norm": 2.433318853378296, + "learning_rate": 4.993902085824799e-05, + "loss": 5.7794, + "step": 3740 + }, + { + "epoch": 0.022248786754210676, + "grad_norm": 2.2488365173339844, + "learning_rate": 4.993898824927348e-05, + "loss": 5.7332, + "step": 3741 + }, + { + "epoch": 0.022254734037491675, + "grad_norm": 2.2924392223358154, + "learning_rate": 4.993895563159303e-05, + "loss": 5.8977, + "step": 3742 + }, + { + "epoch": 0.02226068132077267, + "grad_norm": 2.1601176261901855, + "learning_rate": 4.9938923005206664e-05, + "loss": 5.8588, + "step": 3743 + }, + { + "epoch": 0.02226662860405367, + "grad_norm": 2.256439447402954, + "learning_rate": 4.993889037011436e-05, + "loss": 5.6111, + "step": 3744 + }, + { + "epoch": 0.022272575887334667, + "grad_norm": 2.184950828552246, + "learning_rate": 4.993885772631615e-05, + "loss": 5.7544, + "step": 3745 + }, + { + "epoch": 0.022278523170615662, + "grad_norm": 2.250422716140747, + "learning_rate": 4.993882507381205e-05, + "loss": 5.6534, + "step": 3746 + }, + { + "epoch": 0.02228447045389666, + "grad_norm": 2.473811626434326, + "learning_rate": 4.9938792412602056e-05, + "loss": 5.5699, + "step": 3747 + }, + { + "epoch": 0.022290417737177656, + "grad_norm": 2.2859978675842285, + "learning_rate": 4.993875974268619e-05, + "loss": 5.8712, + "step": 3748 + }, + { + "epoch": 0.022296365020458654, + "grad_norm": 2.4002318382263184, + "learning_rate": 4.993872706406446e-05, + "loss": 5.8121, + "step": 3749 + }, + { + "epoch": 0.022302312303739653, + "grad_norm": 2.2692153453826904, + "learning_rate": 4.9938694376736884e-05, + "loss": 5.5516, + "step": 3750 + }, + { + "epoch": 0.022308259587020648, + "grad_norm": 2.1874892711639404, + "learning_rate": 4.9938661680703456e-05, + "loss": 5.8264, + "step": 3751 + }, + { + "epoch": 0.022314206870301646, + "grad_norm": 2.3802871704101562, + "learning_rate": 4.993862897596421e-05, + "loss": 5.6523, + "step": 3752 + }, + { + "epoch": 0.022320154153582645, + "grad_norm": 2.514646530151367, + "learning_rate": 4.9938596262519145e-05, + "loss": 5.5193, + "step": 3753 + }, + { + "epoch": 0.02232610143686364, + "grad_norm": 2.3175413608551025, + "learning_rate": 4.993856354036827e-05, + "loss": 5.5372, + "step": 3754 + }, + { + "epoch": 0.02233204872014464, + "grad_norm": 2.2071855068206787, + "learning_rate": 4.9938530809511595e-05, + "loss": 5.5002, + "step": 3755 + }, + { + "epoch": 0.022337996003425634, + "grad_norm": 2.046440839767456, + "learning_rate": 4.9938498069949144e-05, + "loss": 5.585, + "step": 3756 + }, + { + "epoch": 0.022343943286706632, + "grad_norm": 2.3971145153045654, + "learning_rate": 4.9938465321680915e-05, + "loss": 5.7858, + "step": 3757 + }, + { + "epoch": 0.02234989056998763, + "grad_norm": 2.462597131729126, + "learning_rate": 4.9938432564706936e-05, + "loss": 5.5606, + "step": 3758 + }, + { + "epoch": 0.022355837853268626, + "grad_norm": 2.3134138584136963, + "learning_rate": 4.99383997990272e-05, + "loss": 5.4587, + "step": 3759 + }, + { + "epoch": 0.022361785136549624, + "grad_norm": 2.137929916381836, + "learning_rate": 4.993836702464173e-05, + "loss": 5.4768, + "step": 3760 + }, + { + "epoch": 0.022367732419830623, + "grad_norm": 2.647691011428833, + "learning_rate": 4.993833424155053e-05, + "loss": 5.7902, + "step": 3761 + }, + { + "epoch": 0.022373679703111618, + "grad_norm": 2.535640239715576, + "learning_rate": 4.993830144975361e-05, + "loss": 5.8263, + "step": 3762 + }, + { + "epoch": 0.022379626986392617, + "grad_norm": 2.422997236251831, + "learning_rate": 4.9938268649251e-05, + "loss": 5.7751, + "step": 3763 + }, + { + "epoch": 0.02238557426967361, + "grad_norm": 2.6906728744506836, + "learning_rate": 4.9938235840042694e-05, + "loss": 5.5974, + "step": 3764 + }, + { + "epoch": 0.02239152155295461, + "grad_norm": 2.0284483432769775, + "learning_rate": 4.99382030221287e-05, + "loss": 5.6816, + "step": 3765 + }, + { + "epoch": 0.02239746883623561, + "grad_norm": 2.6392064094543457, + "learning_rate": 4.9938170195509035e-05, + "loss": 5.9052, + "step": 3766 + }, + { + "epoch": 0.022403416119516604, + "grad_norm": 2.6770617961883545, + "learning_rate": 4.993813736018372e-05, + "loss": 5.9041, + "step": 3767 + }, + { + "epoch": 0.022409363402797602, + "grad_norm": 2.5972392559051514, + "learning_rate": 4.993810451615276e-05, + "loss": 5.7834, + "step": 3768 + }, + { + "epoch": 0.0224153106860786, + "grad_norm": 2.0095736980438232, + "learning_rate": 4.993807166341616e-05, + "loss": 5.6074, + "step": 3769 + }, + { + "epoch": 0.022421257969359596, + "grad_norm": 2.412578582763672, + "learning_rate": 4.9938038801973945e-05, + "loss": 5.742, + "step": 3770 + }, + { + "epoch": 0.022427205252640595, + "grad_norm": 2.1285388469696045, + "learning_rate": 4.993800593182612e-05, + "loss": 5.7665, + "step": 3771 + }, + { + "epoch": 0.02243315253592159, + "grad_norm": 2.091252326965332, + "learning_rate": 4.993797305297268e-05, + "loss": 5.7165, + "step": 3772 + }, + { + "epoch": 0.022439099819202588, + "grad_norm": 2.5366342067718506, + "learning_rate": 4.993794016541367e-05, + "loss": 6.259, + "step": 3773 + }, + { + "epoch": 0.022445047102483587, + "grad_norm": 2.2637953758239746, + "learning_rate": 4.9937907269149063e-05, + "loss": 6.2132, + "step": 3774 + }, + { + "epoch": 0.022450994385764582, + "grad_norm": 2.570979595184326, + "learning_rate": 4.99378743641789e-05, + "loss": 5.9656, + "step": 3775 + }, + { + "epoch": 0.02245694166904558, + "grad_norm": 2.0587873458862305, + "learning_rate": 4.993784145050319e-05, + "loss": 5.7096, + "step": 3776 + }, + { + "epoch": 0.022462888952326576, + "grad_norm": 2.396812677383423, + "learning_rate": 4.993780852812192e-05, + "loss": 5.7258, + "step": 3777 + }, + { + "epoch": 0.022468836235607574, + "grad_norm": 2.081541061401367, + "learning_rate": 4.993777559703513e-05, + "loss": 5.6777, + "step": 3778 + }, + { + "epoch": 0.022474783518888573, + "grad_norm": 2.5242559909820557, + "learning_rate": 4.993774265724281e-05, + "loss": 5.961, + "step": 3779 + }, + { + "epoch": 0.022480730802169568, + "grad_norm": 2.4249329566955566, + "learning_rate": 4.993770970874499e-05, + "loss": 6.0494, + "step": 3780 + }, + { + "epoch": 0.022486678085450566, + "grad_norm": 2.7482552528381348, + "learning_rate": 4.993767675154169e-05, + "loss": 5.7579, + "step": 3781 + }, + { + "epoch": 0.022492625368731565, + "grad_norm": 4.115204811096191, + "learning_rate": 4.993764378563288e-05, + "loss": 6.3891, + "step": 3782 + }, + { + "epoch": 0.02249857265201256, + "grad_norm": 2.51346755027771, + "learning_rate": 4.99376108110186e-05, + "loss": 5.7982, + "step": 3783 + }, + { + "epoch": 0.02250451993529356, + "grad_norm": 2.2737278938293457, + "learning_rate": 4.993757782769887e-05, + "loss": 5.7576, + "step": 3784 + }, + { + "epoch": 0.022510467218574554, + "grad_norm": 2.2068402767181396, + "learning_rate": 4.9937544835673674e-05, + "loss": 5.9801, + "step": 3785 + }, + { + "epoch": 0.022516414501855552, + "grad_norm": 1.8548356294631958, + "learning_rate": 4.993751183494305e-05, + "loss": 6.2054, + "step": 3786 + }, + { + "epoch": 0.02252236178513655, + "grad_norm": 2.3499045372009277, + "learning_rate": 4.993747882550699e-05, + "loss": 6.0694, + "step": 3787 + }, + { + "epoch": 0.022528309068417546, + "grad_norm": 2.2253386974334717, + "learning_rate": 4.993744580736552e-05, + "loss": 5.709, + "step": 3788 + }, + { + "epoch": 0.022534256351698544, + "grad_norm": 2.1136696338653564, + "learning_rate": 4.993741278051864e-05, + "loss": 5.9546, + "step": 3789 + }, + { + "epoch": 0.022540203634979543, + "grad_norm": 1.8777605295181274, + "learning_rate": 4.9937379744966375e-05, + "loss": 5.7587, + "step": 3790 + }, + { + "epoch": 0.022546150918260538, + "grad_norm": 2.527571201324463, + "learning_rate": 4.9937346700708723e-05, + "loss": 5.0992, + "step": 3791 + }, + { + "epoch": 0.022552098201541537, + "grad_norm": 2.515805244445801, + "learning_rate": 4.99373136477457e-05, + "loss": 4.9766, + "step": 3792 + }, + { + "epoch": 0.02255804548482253, + "grad_norm": 2.442979574203491, + "learning_rate": 4.9937280586077315e-05, + "loss": 5.0981, + "step": 3793 + }, + { + "epoch": 0.02256399276810353, + "grad_norm": 2.575383424758911, + "learning_rate": 4.993724751570359e-05, + "loss": 5.0809, + "step": 3794 + }, + { + "epoch": 0.02256994005138453, + "grad_norm": 2.0855023860931396, + "learning_rate": 4.9937214436624524e-05, + "loss": 5.5744, + "step": 3795 + }, + { + "epoch": 0.022575887334665524, + "grad_norm": 2.237565040588379, + "learning_rate": 4.993718134884013e-05, + "loss": 5.6796, + "step": 3796 + }, + { + "epoch": 0.022581834617946522, + "grad_norm": 2.5895159244537354, + "learning_rate": 4.993714825235044e-05, + "loss": 5.2068, + "step": 3797 + }, + { + "epoch": 0.02258778190122752, + "grad_norm": 2.1277096271514893, + "learning_rate": 4.993711514715544e-05, + "loss": 5.5588, + "step": 3798 + }, + { + "epoch": 0.022593729184508516, + "grad_norm": 2.7074246406555176, + "learning_rate": 4.993708203325515e-05, + "loss": 5.0104, + "step": 3799 + }, + { + "epoch": 0.022599676467789515, + "grad_norm": 2.114569664001465, + "learning_rate": 4.993704891064958e-05, + "loss": 5.0453, + "step": 3800 + }, + { + "epoch": 0.02260562375107051, + "grad_norm": 2.4222404956817627, + "learning_rate": 4.9937015779338746e-05, + "loss": 5.3799, + "step": 3801 + }, + { + "epoch": 0.022611571034351508, + "grad_norm": 2.238755941390991, + "learning_rate": 4.993698263932266e-05, + "loss": 5.0075, + "step": 3802 + }, + { + "epoch": 0.022617518317632507, + "grad_norm": 2.0748255252838135, + "learning_rate": 4.993694949060133e-05, + "loss": 5.0007, + "step": 3803 + }, + { + "epoch": 0.022623465600913502, + "grad_norm": 2.1528635025024414, + "learning_rate": 4.993691633317477e-05, + "loss": 5.1048, + "step": 3804 + }, + { + "epoch": 0.0226294128841945, + "grad_norm": 2.0237200260162354, + "learning_rate": 4.993688316704298e-05, + "loss": 5.1465, + "step": 3805 + }, + { + "epoch": 0.022635360167475495, + "grad_norm": 2.2698304653167725, + "learning_rate": 4.993684999220599e-05, + "loss": 4.9642, + "step": 3806 + }, + { + "epoch": 0.022641307450756494, + "grad_norm": 2.7863757610321045, + "learning_rate": 4.993681680866381e-05, + "loss": 5.6277, + "step": 3807 + }, + { + "epoch": 0.022647254734037493, + "grad_norm": 2.394087553024292, + "learning_rate": 4.9936783616416436e-05, + "loss": 6.0895, + "step": 3808 + }, + { + "epoch": 0.022653202017318488, + "grad_norm": 2.8036317825317383, + "learning_rate": 4.993675041546389e-05, + "loss": 6.2002, + "step": 3809 + }, + { + "epoch": 0.022659149300599486, + "grad_norm": 2.4970054626464844, + "learning_rate": 4.993671720580618e-05, + "loss": 5.5114, + "step": 3810 + }, + { + "epoch": 0.022665096583880485, + "grad_norm": 3.2434241771698, + "learning_rate": 4.993668398744332e-05, + "loss": 5.0366, + "step": 3811 + }, + { + "epoch": 0.02267104386716148, + "grad_norm": 2.707104206085205, + "learning_rate": 4.9936650760375326e-05, + "loss": 5.5132, + "step": 3812 + }, + { + "epoch": 0.02267699115044248, + "grad_norm": 2.540231466293335, + "learning_rate": 4.9936617524602204e-05, + "loss": 5.8026, + "step": 3813 + }, + { + "epoch": 0.022682938433723474, + "grad_norm": 2.8549184799194336, + "learning_rate": 4.993658428012397e-05, + "loss": 6.0854, + "step": 3814 + }, + { + "epoch": 0.022688885717004472, + "grad_norm": 2.5972952842712402, + "learning_rate": 4.993655102694062e-05, + "loss": 5.8055, + "step": 3815 + }, + { + "epoch": 0.02269483300028547, + "grad_norm": 3.1625113487243652, + "learning_rate": 4.9936517765052184e-05, + "loss": 5.9683, + "step": 3816 + }, + { + "epoch": 0.022700780283566466, + "grad_norm": 3.239820718765259, + "learning_rate": 4.993648449445867e-05, + "loss": 5.9725, + "step": 3817 + }, + { + "epoch": 0.022706727566847464, + "grad_norm": 2.9632809162139893, + "learning_rate": 4.993645121516008e-05, + "loss": 5.9767, + "step": 3818 + }, + { + "epoch": 0.022712674850128463, + "grad_norm": 2.7486021518707275, + "learning_rate": 4.9936417927156435e-05, + "loss": 6.3471, + "step": 3819 + }, + { + "epoch": 0.022718622133409458, + "grad_norm": 3.8044490814208984, + "learning_rate": 4.993638463044775e-05, + "loss": 6.1275, + "step": 3820 + }, + { + "epoch": 0.022724569416690456, + "grad_norm": 4.851193428039551, + "learning_rate": 4.9936351325034024e-05, + "loss": 5.6658, + "step": 3821 + }, + { + "epoch": 0.02273051669997145, + "grad_norm": 3.1302716732025146, + "learning_rate": 4.993631801091528e-05, + "loss": 5.5256, + "step": 3822 + }, + { + "epoch": 0.02273646398325245, + "grad_norm": 5.310885906219482, + "learning_rate": 4.9936284688091526e-05, + "loss": 5.4771, + "step": 3823 + }, + { + "epoch": 0.02274241126653345, + "grad_norm": 5.493198394775391, + "learning_rate": 4.9936251356562765e-05, + "loss": 6.0993, + "step": 3824 + }, + { + "epoch": 0.022748358549814444, + "grad_norm": 3.5346286296844482, + "learning_rate": 4.993621801632902e-05, + "loss": 6.6862, + "step": 3825 + }, + { + "epoch": 0.022754305833095442, + "grad_norm": 4.550736904144287, + "learning_rate": 4.9936184667390304e-05, + "loss": 6.5658, + "step": 3826 + }, + { + "epoch": 0.02276025311637644, + "grad_norm": 3.3957576751708984, + "learning_rate": 4.993615130974662e-05, + "loss": 6.0596, + "step": 3827 + }, + { + "epoch": 0.022766200399657436, + "grad_norm": 2.614089012145996, + "learning_rate": 4.993611794339798e-05, + "loss": 6.77, + "step": 3828 + }, + { + "epoch": 0.022772147682938434, + "grad_norm": 3.712106704711914, + "learning_rate": 4.99360845683444e-05, + "loss": 6.4084, + "step": 3829 + }, + { + "epoch": 0.02277809496621943, + "grad_norm": 3.7331995964050293, + "learning_rate": 4.99360511845859e-05, + "loss": 6.2627, + "step": 3830 + }, + { + "epoch": 0.022784042249500428, + "grad_norm": 3.8898067474365234, + "learning_rate": 4.993601779212247e-05, + "loss": 6.6476, + "step": 3831 + }, + { + "epoch": 0.022789989532781427, + "grad_norm": 2.829078435897827, + "learning_rate": 4.9935984390954136e-05, + "loss": 6.2307, + "step": 3832 + }, + { + "epoch": 0.022795936816062422, + "grad_norm": 3.467954635620117, + "learning_rate": 4.9935950981080906e-05, + "loss": 6.5283, + "step": 3833 + }, + { + "epoch": 0.02280188409934342, + "grad_norm": 2.317840099334717, + "learning_rate": 4.99359175625028e-05, + "loss": 6.4549, + "step": 3834 + }, + { + "epoch": 0.02280783138262442, + "grad_norm": 2.7261998653411865, + "learning_rate": 4.9935884135219825e-05, + "loss": 6.2049, + "step": 3835 + }, + { + "epoch": 0.022813778665905414, + "grad_norm": 2.623098373413086, + "learning_rate": 4.993585069923198e-05, + "loss": 6.3847, + "step": 3836 + }, + { + "epoch": 0.022819725949186413, + "grad_norm": 2.4825377464294434, + "learning_rate": 4.993581725453929e-05, + "loss": 6.3532, + "step": 3837 + }, + { + "epoch": 0.022825673232467408, + "grad_norm": 2.278151750564575, + "learning_rate": 4.993578380114176e-05, + "loss": 5.8885, + "step": 3838 + }, + { + "epoch": 0.022831620515748406, + "grad_norm": 2.045839548110962, + "learning_rate": 4.9935750339039425e-05, + "loss": 6.6852, + "step": 3839 + }, + { + "epoch": 0.022837567799029405, + "grad_norm": 2.4009597301483154, + "learning_rate": 4.993571686823226e-05, + "loss": 6.1676, + "step": 3840 + }, + { + "epoch": 0.0228435150823104, + "grad_norm": 2.759819507598877, + "learning_rate": 4.9935683388720296e-05, + "loss": 6.3913, + "step": 3841 + }, + { + "epoch": 0.0228494623655914, + "grad_norm": 2.798785924911499, + "learning_rate": 4.9935649900503546e-05, + "loss": 6.8169, + "step": 3842 + }, + { + "epoch": 0.022855409648872393, + "grad_norm": 2.389890432357788, + "learning_rate": 4.9935616403582015e-05, + "loss": 6.7506, + "step": 3843 + }, + { + "epoch": 0.022861356932153392, + "grad_norm": 2.882474184036255, + "learning_rate": 4.9935582897955715e-05, + "loss": 6.2458, + "step": 3844 + }, + { + "epoch": 0.02286730421543439, + "grad_norm": 2.2487478256225586, + "learning_rate": 4.993554938362467e-05, + "loss": 6.7296, + "step": 3845 + }, + { + "epoch": 0.022873251498715386, + "grad_norm": 1.9563521146774292, + "learning_rate": 4.993551586058888e-05, + "loss": 6.6878, + "step": 3846 + }, + { + "epoch": 0.022879198781996384, + "grad_norm": 7.555780410766602, + "learning_rate": 4.993548232884835e-05, + "loss": 6.3309, + "step": 3847 + }, + { + "epoch": 0.022885146065277383, + "grad_norm": 2.2573931217193604, + "learning_rate": 4.99354487884031e-05, + "loss": 6.3384, + "step": 3848 + }, + { + "epoch": 0.022891093348558378, + "grad_norm": 2.063267946243286, + "learning_rate": 4.993541523925316e-05, + "loss": 6.2342, + "step": 3849 + }, + { + "epoch": 0.022897040631839376, + "grad_norm": 2.1032445430755615, + "learning_rate": 4.9935381681398505e-05, + "loss": 6.5458, + "step": 3850 + }, + { + "epoch": 0.02290298791512037, + "grad_norm": 2.233400583267212, + "learning_rate": 4.9935348114839176e-05, + "loss": 6.46, + "step": 3851 + }, + { + "epoch": 0.02290893519840137, + "grad_norm": 2.069182872772217, + "learning_rate": 4.9935314539575174e-05, + "loss": 6.4829, + "step": 3852 + }, + { + "epoch": 0.02291488248168237, + "grad_norm": 1.9986059665679932, + "learning_rate": 4.993528095560651e-05, + "loss": 6.4651, + "step": 3853 + }, + { + "epoch": 0.022920829764963364, + "grad_norm": 2.0529284477233887, + "learning_rate": 4.99352473629332e-05, + "loss": 6.1151, + "step": 3854 + }, + { + "epoch": 0.022926777048244362, + "grad_norm": 1.9643630981445312, + "learning_rate": 4.993521376155525e-05, + "loss": 5.991, + "step": 3855 + }, + { + "epoch": 0.02293272433152536, + "grad_norm": 2.2183501720428467, + "learning_rate": 4.9935180151472674e-05, + "loss": 6.8568, + "step": 3856 + }, + { + "epoch": 0.022938671614806356, + "grad_norm": 2.2095682621002197, + "learning_rate": 4.993514653268548e-05, + "loss": 6.8145, + "step": 3857 + }, + { + "epoch": 0.022944618898087354, + "grad_norm": 2.194451332092285, + "learning_rate": 4.9935112905193694e-05, + "loss": 6.4781, + "step": 3858 + }, + { + "epoch": 0.02295056618136835, + "grad_norm": 2.2242066860198975, + "learning_rate": 4.9935079268997306e-05, + "loss": 6.0535, + "step": 3859 + }, + { + "epoch": 0.022956513464649348, + "grad_norm": 2.336190938949585, + "learning_rate": 4.9935045624096354e-05, + "loss": 6.2453, + "step": 3860 + }, + { + "epoch": 0.022962460747930347, + "grad_norm": 1.9997279644012451, + "learning_rate": 4.9935011970490824e-05, + "loss": 6.3852, + "step": 3861 + }, + { + "epoch": 0.02296840803121134, + "grad_norm": 2.9107778072357178, + "learning_rate": 4.993497830818074e-05, + "loss": 6.0891, + "step": 3862 + }, + { + "epoch": 0.02297435531449234, + "grad_norm": 2.1357171535491943, + "learning_rate": 4.993494463716612e-05, + "loss": 6.5111, + "step": 3863 + }, + { + "epoch": 0.02298030259777334, + "grad_norm": 2.0228497982025146, + "learning_rate": 4.9934910957446954e-05, + "loss": 6.6009, + "step": 3864 + }, + { + "epoch": 0.022986249881054334, + "grad_norm": 2.8057942390441895, + "learning_rate": 4.993487726902328e-05, + "loss": 6.414, + "step": 3865 + }, + { + "epoch": 0.022992197164335332, + "grad_norm": 3.0660998821258545, + "learning_rate": 4.99348435718951e-05, + "loss": 6.3673, + "step": 3866 + }, + { + "epoch": 0.022998144447616328, + "grad_norm": 2.2440497875213623, + "learning_rate": 4.9934809866062416e-05, + "loss": 6.1793, + "step": 3867 + }, + { + "epoch": 0.023004091730897326, + "grad_norm": 2.342358350753784, + "learning_rate": 4.993477615152525e-05, + "loss": 6.5279, + "step": 3868 + }, + { + "epoch": 0.023010039014178325, + "grad_norm": 1.9231956005096436, + "learning_rate": 4.993474242828361e-05, + "loss": 6.4975, + "step": 3869 + }, + { + "epoch": 0.02301598629745932, + "grad_norm": 2.503028631210327, + "learning_rate": 4.9934708696337516e-05, + "loss": 6.5261, + "step": 3870 + }, + { + "epoch": 0.02302193358074032, + "grad_norm": 2.2343928813934326, + "learning_rate": 4.993467495568697e-05, + "loss": 6.0525, + "step": 3871 + }, + { + "epoch": 0.023027880864021313, + "grad_norm": 2.851964235305786, + "learning_rate": 4.993464120633198e-05, + "loss": 6.1271, + "step": 3872 + }, + { + "epoch": 0.023033828147302312, + "grad_norm": 2.580017328262329, + "learning_rate": 4.993460744827257e-05, + "loss": 6.2018, + "step": 3873 + }, + { + "epoch": 0.02303977543058331, + "grad_norm": 2.227879047393799, + "learning_rate": 4.9934573681508744e-05, + "loss": 6.0177, + "step": 3874 + }, + { + "epoch": 0.023045722713864306, + "grad_norm": 2.696531295776367, + "learning_rate": 4.993453990604051e-05, + "loss": 6.627, + "step": 3875 + }, + { + "epoch": 0.023051669997145304, + "grad_norm": 2.3439393043518066, + "learning_rate": 4.99345061218679e-05, + "loss": 6.5388, + "step": 3876 + }, + { + "epoch": 0.023057617280426303, + "grad_norm": 2.5400748252868652, + "learning_rate": 4.99344723289909e-05, + "loss": 5.9162, + "step": 3877 + }, + { + "epoch": 0.023063564563707298, + "grad_norm": 2.658193588256836, + "learning_rate": 4.9934438527409535e-05, + "loss": 5.6645, + "step": 3878 + }, + { + "epoch": 0.023069511846988296, + "grad_norm": 2.3102848529815674, + "learning_rate": 4.9934404717123814e-05, + "loss": 5.9969, + "step": 3879 + }, + { + "epoch": 0.02307545913026929, + "grad_norm": 2.6107916831970215, + "learning_rate": 4.993437089813376e-05, + "loss": 6.1776, + "step": 3880 + }, + { + "epoch": 0.02308140641355029, + "grad_norm": 2.6275434494018555, + "learning_rate": 4.993433707043937e-05, + "loss": 6.2563, + "step": 3881 + }, + { + "epoch": 0.02308735369683129, + "grad_norm": 2.8595218658447266, + "learning_rate": 4.993430323404066e-05, + "loss": 5.9371, + "step": 3882 + }, + { + "epoch": 0.023093300980112284, + "grad_norm": 2.2947659492492676, + "learning_rate": 4.993426938893764e-05, + "loss": 5.7263, + "step": 3883 + }, + { + "epoch": 0.023099248263393282, + "grad_norm": 3.3769729137420654, + "learning_rate": 4.9934235535130326e-05, + "loss": 6.2706, + "step": 3884 + }, + { + "epoch": 0.02310519554667428, + "grad_norm": 2.792043447494507, + "learning_rate": 4.9934201672618716e-05, + "loss": 5.9264, + "step": 3885 + }, + { + "epoch": 0.023111142829955276, + "grad_norm": 2.592167615890503, + "learning_rate": 4.993416780140285e-05, + "loss": 6.4031, + "step": 3886 + }, + { + "epoch": 0.023117090113236274, + "grad_norm": 2.429898977279663, + "learning_rate": 4.9934133921482716e-05, + "loss": 6.4609, + "step": 3887 + }, + { + "epoch": 0.02312303739651727, + "grad_norm": 2.1771554946899414, + "learning_rate": 4.993410003285834e-05, + "loss": 6.2873, + "step": 3888 + }, + { + "epoch": 0.023128984679798268, + "grad_norm": 2.7799339294433594, + "learning_rate": 4.9934066135529724e-05, + "loss": 5.7405, + "step": 3889 + }, + { + "epoch": 0.023134931963079267, + "grad_norm": 2.626492977142334, + "learning_rate": 4.993403222949688e-05, + "loss": 5.783, + "step": 3890 + }, + { + "epoch": 0.02314087924636026, + "grad_norm": 2.837663412094116, + "learning_rate": 4.993399831475982e-05, + "loss": 5.8039, + "step": 3891 + }, + { + "epoch": 0.02314682652964126, + "grad_norm": 2.68230938911438, + "learning_rate": 4.9933964391318564e-05, + "loss": 5.6587, + "step": 3892 + }, + { + "epoch": 0.02315277381292226, + "grad_norm": 3.2064061164855957, + "learning_rate": 4.993393045917312e-05, + "loss": 5.9516, + "step": 3893 + }, + { + "epoch": 0.023158721096203254, + "grad_norm": 3.5179402828216553, + "learning_rate": 4.99338965183235e-05, + "loss": 5.7925, + "step": 3894 + }, + { + "epoch": 0.023164668379484252, + "grad_norm": 2.9261434078216553, + "learning_rate": 4.993386256876971e-05, + "loss": 5.8677, + "step": 3895 + }, + { + "epoch": 0.023170615662765248, + "grad_norm": 3.092033624649048, + "learning_rate": 4.9933828610511766e-05, + "loss": 5.6248, + "step": 3896 + }, + { + "epoch": 0.023176562946046246, + "grad_norm": 2.7650182247161865, + "learning_rate": 4.9933794643549683e-05, + "loss": 5.7371, + "step": 3897 + }, + { + "epoch": 0.023182510229327245, + "grad_norm": 2.402839422225952, + "learning_rate": 4.993376066788347e-05, + "loss": 5.4802, + "step": 3898 + }, + { + "epoch": 0.02318845751260824, + "grad_norm": 2.606062889099121, + "learning_rate": 4.993372668351314e-05, + "loss": 5.5766, + "step": 3899 + }, + { + "epoch": 0.023194404795889238, + "grad_norm": 2.2177329063415527, + "learning_rate": 4.99336926904387e-05, + "loss": 5.5744, + "step": 3900 + }, + { + "epoch": 0.023200352079170233, + "grad_norm": 2.6953063011169434, + "learning_rate": 4.9933658688660166e-05, + "loss": 5.6414, + "step": 3901 + }, + { + "epoch": 0.023206299362451232, + "grad_norm": 2.90512752532959, + "learning_rate": 4.993362467817755e-05, + "loss": 5.5445, + "step": 3902 + }, + { + "epoch": 0.02321224664573223, + "grad_norm": 3.724168062210083, + "learning_rate": 4.993359065899086e-05, + "loss": 5.7733, + "step": 3903 + }, + { + "epoch": 0.023218193929013226, + "grad_norm": 2.9355592727661133, + "learning_rate": 4.993355663110012e-05, + "loss": 5.579, + "step": 3904 + }, + { + "epoch": 0.023224141212294224, + "grad_norm": 2.7822163105010986, + "learning_rate": 4.993352259450532e-05, + "loss": 5.5105, + "step": 3905 + }, + { + "epoch": 0.023230088495575223, + "grad_norm": 3.672539710998535, + "learning_rate": 4.99334885492065e-05, + "loss": 6.3865, + "step": 3906 + }, + { + "epoch": 0.023236035778856218, + "grad_norm": 2.26755952835083, + "learning_rate": 4.993345449520364e-05, + "loss": 5.5472, + "step": 3907 + }, + { + "epoch": 0.023241983062137216, + "grad_norm": 2.8935770988464355, + "learning_rate": 4.993342043249678e-05, + "loss": 5.5948, + "step": 3908 + }, + { + "epoch": 0.02324793034541821, + "grad_norm": 3.077798366546631, + "learning_rate": 4.9933386361085924e-05, + "loss": 5.288, + "step": 3909 + }, + { + "epoch": 0.02325387762869921, + "grad_norm": 2.479198694229126, + "learning_rate": 4.993335228097107e-05, + "loss": 5.3743, + "step": 3910 + }, + { + "epoch": 0.02325982491198021, + "grad_norm": 2.429049015045166, + "learning_rate": 4.9933318192152244e-05, + "loss": 5.6709, + "step": 3911 + }, + { + "epoch": 0.023265772195261204, + "grad_norm": 2.4515016078948975, + "learning_rate": 4.993328409462945e-05, + "loss": 5.4946, + "step": 3912 + }, + { + "epoch": 0.023271719478542202, + "grad_norm": 2.3859386444091797, + "learning_rate": 4.993324998840271e-05, + "loss": 5.5947, + "step": 3913 + }, + { + "epoch": 0.0232776667618232, + "grad_norm": 2.746438503265381, + "learning_rate": 4.993321587347203e-05, + "loss": 5.6743, + "step": 3914 + }, + { + "epoch": 0.023283614045104196, + "grad_norm": 2.416118621826172, + "learning_rate": 4.993318174983742e-05, + "loss": 5.7073, + "step": 3915 + }, + { + "epoch": 0.023289561328385194, + "grad_norm": 2.3427727222442627, + "learning_rate": 4.99331476174989e-05, + "loss": 5.5933, + "step": 3916 + }, + { + "epoch": 0.02329550861166619, + "grad_norm": 2.2179009914398193, + "learning_rate": 4.993311347645647e-05, + "loss": 5.7726, + "step": 3917 + }, + { + "epoch": 0.023301455894947188, + "grad_norm": 2.732923984527588, + "learning_rate": 4.993307932671014e-05, + "loss": 5.5783, + "step": 3918 + }, + { + "epoch": 0.023307403178228187, + "grad_norm": 2.5090553760528564, + "learning_rate": 4.993304516825994e-05, + "loss": 5.6598, + "step": 3919 + }, + { + "epoch": 0.02331335046150918, + "grad_norm": 2.690276622772217, + "learning_rate": 4.993301100110587e-05, + "loss": 5.9688, + "step": 3920 + }, + { + "epoch": 0.02331929774479018, + "grad_norm": 2.559215784072876, + "learning_rate": 4.993297682524794e-05, + "loss": 6.3315, + "step": 3921 + }, + { + "epoch": 0.02332524502807118, + "grad_norm": 2.2800240516662598, + "learning_rate": 4.993294264068617e-05, + "loss": 6.2787, + "step": 3922 + }, + { + "epoch": 0.023331192311352174, + "grad_norm": 2.478898525238037, + "learning_rate": 4.993290844742057e-05, + "loss": 6.1145, + "step": 3923 + }, + { + "epoch": 0.023337139594633172, + "grad_norm": 2.4902184009552, + "learning_rate": 4.993287424545115e-05, + "loss": 6.0665, + "step": 3924 + }, + { + "epoch": 0.023343086877914167, + "grad_norm": 2.4157116413116455, + "learning_rate": 4.9932840034777906e-05, + "loss": 6.1697, + "step": 3925 + }, + { + "epoch": 0.023349034161195166, + "grad_norm": 2.340575933456421, + "learning_rate": 4.993280581540087e-05, + "loss": 6.1121, + "step": 3926 + }, + { + "epoch": 0.023354981444476165, + "grad_norm": 2.586881160736084, + "learning_rate": 4.993277158732006e-05, + "loss": 6.1792, + "step": 3927 + }, + { + "epoch": 0.02336092872775716, + "grad_norm": 2.448880910873413, + "learning_rate": 4.9932737350535476e-05, + "loss": 6.084, + "step": 3928 + }, + { + "epoch": 0.023366876011038158, + "grad_norm": 2.525082588195801, + "learning_rate": 4.993270310504712e-05, + "loss": 5.6726, + "step": 3929 + }, + { + "epoch": 0.023372823294319153, + "grad_norm": 2.310445547103882, + "learning_rate": 4.993266885085503e-05, + "loss": 5.9496, + "step": 3930 + }, + { + "epoch": 0.023378770577600152, + "grad_norm": 2.275416612625122, + "learning_rate": 4.993263458795918e-05, + "loss": 6.0042, + "step": 3931 + }, + { + "epoch": 0.02338471786088115, + "grad_norm": 2.481973648071289, + "learning_rate": 4.993260031635963e-05, + "loss": 5.6177, + "step": 3932 + }, + { + "epoch": 0.023390665144162145, + "grad_norm": 2.439544677734375, + "learning_rate": 4.993256603605635e-05, + "loss": 5.9745, + "step": 3933 + }, + { + "epoch": 0.023396612427443144, + "grad_norm": 2.1909360885620117, + "learning_rate": 4.993253174704937e-05, + "loss": 5.9966, + "step": 3934 + }, + { + "epoch": 0.023402559710724143, + "grad_norm": 2.1893911361694336, + "learning_rate": 4.993249744933871e-05, + "loss": 6.0643, + "step": 3935 + }, + { + "epoch": 0.023408506994005138, + "grad_norm": 3.2023842334747314, + "learning_rate": 4.993246314292437e-05, + "loss": 6.2284, + "step": 3936 + }, + { + "epoch": 0.023414454277286136, + "grad_norm": 2.980842113494873, + "learning_rate": 4.9932428827806356e-05, + "loss": 6.2359, + "step": 3937 + }, + { + "epoch": 0.02342040156056713, + "grad_norm": 2.6659433841705322, + "learning_rate": 4.99323945039847e-05, + "loss": 6.2901, + "step": 3938 + }, + { + "epoch": 0.02342634884384813, + "grad_norm": 2.2173492908477783, + "learning_rate": 4.993236017145939e-05, + "loss": 5.8157, + "step": 3939 + }, + { + "epoch": 0.02343229612712913, + "grad_norm": 2.592771530151367, + "learning_rate": 4.993232583023046e-05, + "loss": 5.7747, + "step": 3940 + }, + { + "epoch": 0.023438243410410124, + "grad_norm": 2.328951835632324, + "learning_rate": 4.9932291480297915e-05, + "loss": 5.7367, + "step": 3941 + }, + { + "epoch": 0.023444190693691122, + "grad_norm": 2.3135616779327393, + "learning_rate": 4.993225712166176e-05, + "loss": 6.0592, + "step": 3942 + }, + { + "epoch": 0.02345013797697212, + "grad_norm": 2.49661922454834, + "learning_rate": 4.993222275432201e-05, + "loss": 5.9737, + "step": 3943 + }, + { + "epoch": 0.023456085260253116, + "grad_norm": 2.6462106704711914, + "learning_rate": 4.9932188378278683e-05, + "loss": 5.7053, + "step": 3944 + }, + { + "epoch": 0.023462032543534114, + "grad_norm": 2.102663516998291, + "learning_rate": 4.993215399353178e-05, + "loss": 5.9006, + "step": 3945 + }, + { + "epoch": 0.02346797982681511, + "grad_norm": 2.474500894546509, + "learning_rate": 4.9932119600081326e-05, + "loss": 6.092, + "step": 3946 + }, + { + "epoch": 0.023473927110096108, + "grad_norm": 2.6023428440093994, + "learning_rate": 4.993208519792732e-05, + "loss": 5.9045, + "step": 3947 + }, + { + "epoch": 0.023479874393377106, + "grad_norm": 2.76432466506958, + "learning_rate": 4.99320507870698e-05, + "loss": 5.8178, + "step": 3948 + }, + { + "epoch": 0.0234858216766581, + "grad_norm": 2.250816822052002, + "learning_rate": 4.993201636750874e-05, + "loss": 5.9091, + "step": 3949 + }, + { + "epoch": 0.0234917689599391, + "grad_norm": 2.1984071731567383, + "learning_rate": 4.993198193924417e-05, + "loss": 5.8804, + "step": 3950 + }, + { + "epoch": 0.0234977162432201, + "grad_norm": 2.5217959880828857, + "learning_rate": 4.993194750227611e-05, + "loss": 5.9879, + "step": 3951 + }, + { + "epoch": 0.023503663526501094, + "grad_norm": 2.080110788345337, + "learning_rate": 4.993191305660456e-05, + "loss": 5.6352, + "step": 3952 + }, + { + "epoch": 0.023509610809782092, + "grad_norm": 2.637500286102295, + "learning_rate": 4.9931878602229545e-05, + "loss": 5.7924, + "step": 3953 + }, + { + "epoch": 0.023515558093063087, + "grad_norm": 2.660531759262085, + "learning_rate": 4.9931844139151056e-05, + "loss": 6.1936, + "step": 3954 + }, + { + "epoch": 0.023521505376344086, + "grad_norm": 2.423699378967285, + "learning_rate": 4.993180966736913e-05, + "loss": 5.8974, + "step": 3955 + }, + { + "epoch": 0.023527452659625085, + "grad_norm": 2.581876277923584, + "learning_rate": 4.993177518688375e-05, + "loss": 5.833, + "step": 3956 + }, + { + "epoch": 0.02353339994290608, + "grad_norm": 2.586538076400757, + "learning_rate": 4.9931740697694965e-05, + "loss": 5.9649, + "step": 3957 + }, + { + "epoch": 0.023539347226187078, + "grad_norm": 2.5123441219329834, + "learning_rate": 4.993170619980276e-05, + "loss": 6.1251, + "step": 3958 + }, + { + "epoch": 0.023545294509468077, + "grad_norm": 3.076904535293579, + "learning_rate": 4.993167169320715e-05, + "loss": 5.9559, + "step": 3959 + }, + { + "epoch": 0.023551241792749072, + "grad_norm": 2.572312593460083, + "learning_rate": 4.9931637177908153e-05, + "loss": 6.0291, + "step": 3960 + }, + { + "epoch": 0.02355718907603007, + "grad_norm": 1.9910492897033691, + "learning_rate": 4.9931602653905776e-05, + "loss": 5.8413, + "step": 3961 + }, + { + "epoch": 0.023563136359311065, + "grad_norm": 2.530710458755493, + "learning_rate": 4.993156812120004e-05, + "loss": 6.1217, + "step": 3962 + }, + { + "epoch": 0.023569083642592064, + "grad_norm": 2.3089046478271484, + "learning_rate": 4.993153357979095e-05, + "loss": 5.822, + "step": 3963 + }, + { + "epoch": 0.023575030925873063, + "grad_norm": 2.8980624675750732, + "learning_rate": 4.993149902967852e-05, + "loss": 6.3906, + "step": 3964 + }, + { + "epoch": 0.023580978209154058, + "grad_norm": 2.2176012992858887, + "learning_rate": 4.993146447086275e-05, + "loss": 5.9259, + "step": 3965 + }, + { + "epoch": 0.023586925492435056, + "grad_norm": 2.01096773147583, + "learning_rate": 4.993142990334367e-05, + "loss": 6.3141, + "step": 3966 + }, + { + "epoch": 0.02359287277571605, + "grad_norm": 3.4096288681030273, + "learning_rate": 4.993139532712129e-05, + "loss": 6.3165, + "step": 3967 + }, + { + "epoch": 0.02359882005899705, + "grad_norm": 2.20595645904541, + "learning_rate": 4.9931360742195623e-05, + "loss": 6.016, + "step": 3968 + }, + { + "epoch": 0.02360476734227805, + "grad_norm": 3.543301820755005, + "learning_rate": 4.993132614856666e-05, + "loss": 5.722, + "step": 3969 + }, + { + "epoch": 0.023610714625559043, + "grad_norm": 2.82092547416687, + "learning_rate": 4.993129154623444e-05, + "loss": 5.8217, + "step": 3970 + }, + { + "epoch": 0.023616661908840042, + "grad_norm": 2.4585440158843994, + "learning_rate": 4.9931256935198954e-05, + "loss": 6.3298, + "step": 3971 + }, + { + "epoch": 0.02362260919212104, + "grad_norm": 2.104340076446533, + "learning_rate": 4.993122231546024e-05, + "loss": 5.9174, + "step": 3972 + }, + { + "epoch": 0.023628556475402036, + "grad_norm": 2.5130183696746826, + "learning_rate": 4.993118768701828e-05, + "loss": 6.3075, + "step": 3973 + }, + { + "epoch": 0.023634503758683034, + "grad_norm": 2.4567196369171143, + "learning_rate": 4.99311530498731e-05, + "loss": 6.0088, + "step": 3974 + }, + { + "epoch": 0.02364045104196403, + "grad_norm": 2.5174858570098877, + "learning_rate": 4.993111840402471e-05, + "loss": 6.6739, + "step": 3975 + }, + { + "epoch": 0.023646398325245028, + "grad_norm": 2.0032241344451904, + "learning_rate": 4.9931083749473136e-05, + "loss": 5.7052, + "step": 3976 + }, + { + "epoch": 0.023652345608526026, + "grad_norm": 2.9536757469177246, + "learning_rate": 4.993104908621837e-05, + "loss": 5.415, + "step": 3977 + }, + { + "epoch": 0.02365829289180702, + "grad_norm": 2.6650888919830322, + "learning_rate": 4.9931014414260435e-05, + "loss": 5.4333, + "step": 3978 + }, + { + "epoch": 0.02366424017508802, + "grad_norm": 2.3574490547180176, + "learning_rate": 4.9930979733599334e-05, + "loss": 5.5802, + "step": 3979 + }, + { + "epoch": 0.02367018745836902, + "grad_norm": 2.855534791946411, + "learning_rate": 4.99309450442351e-05, + "loss": 5.5131, + "step": 3980 + }, + { + "epoch": 0.023676134741650014, + "grad_norm": 2.430943727493286, + "learning_rate": 4.993091034616772e-05, + "loss": 6.2497, + "step": 3981 + }, + { + "epoch": 0.023682082024931012, + "grad_norm": 2.1671106815338135, + "learning_rate": 4.993087563939722e-05, + "loss": 5.9994, + "step": 3982 + }, + { + "epoch": 0.023688029308212007, + "grad_norm": 2.3268723487854004, + "learning_rate": 4.9930840923923606e-05, + "loss": 5.4779, + "step": 3983 + }, + { + "epoch": 0.023693976591493006, + "grad_norm": 2.3953616619110107, + "learning_rate": 4.993080619974689e-05, + "loss": 5.4044, + "step": 3984 + }, + { + "epoch": 0.023699923874774004, + "grad_norm": 2.043724775314331, + "learning_rate": 4.993077146686709e-05, + "loss": 5.6252, + "step": 3985 + }, + { + "epoch": 0.023705871158055, + "grad_norm": 2.5629520416259766, + "learning_rate": 4.9930736725284224e-05, + "loss": 5.1765, + "step": 3986 + }, + { + "epoch": 0.023711818441335998, + "grad_norm": 2.2148349285125732, + "learning_rate": 4.993070197499828e-05, + "loss": 5.5452, + "step": 3987 + }, + { + "epoch": 0.023717765724616997, + "grad_norm": 2.3913650512695312, + "learning_rate": 4.9930667216009295e-05, + "loss": 6.0882, + "step": 3988 + }, + { + "epoch": 0.02372371300789799, + "grad_norm": 2.619607925415039, + "learning_rate": 4.993063244831727e-05, + "loss": 6.4482, + "step": 3989 + }, + { + "epoch": 0.02372966029117899, + "grad_norm": 2.0585055351257324, + "learning_rate": 4.993059767192222e-05, + "loss": 6.0467, + "step": 3990 + }, + { + "epoch": 0.023735607574459985, + "grad_norm": 2.3380227088928223, + "learning_rate": 4.993056288682416e-05, + "loss": 5.9382, + "step": 3991 + }, + { + "epoch": 0.023741554857740984, + "grad_norm": 2.7252683639526367, + "learning_rate": 4.9930528093023085e-05, + "loss": 6.0444, + "step": 3992 + }, + { + "epoch": 0.023747502141021982, + "grad_norm": 2.333296060562134, + "learning_rate": 4.993049329051903e-05, + "loss": 5.6614, + "step": 3993 + }, + { + "epoch": 0.023753449424302978, + "grad_norm": 2.3571507930755615, + "learning_rate": 4.9930458479312e-05, + "loss": 6.328, + "step": 3994 + }, + { + "epoch": 0.023759396707583976, + "grad_norm": 2.7106499671936035, + "learning_rate": 4.9930423659402005e-05, + "loss": 6.0347, + "step": 3995 + }, + { + "epoch": 0.02376534399086497, + "grad_norm": 3.000009298324585, + "learning_rate": 4.9930388830789043e-05, + "loss": 5.5511, + "step": 3996 + }, + { + "epoch": 0.02377129127414597, + "grad_norm": 2.787912130355835, + "learning_rate": 4.993035399347316e-05, + "loss": 5.2059, + "step": 3997 + }, + { + "epoch": 0.02377723855742697, + "grad_norm": 2.7351326942443848, + "learning_rate": 4.993031914745433e-05, + "loss": 5.2997, + "step": 3998 + }, + { + "epoch": 0.023783185840707963, + "grad_norm": 2.770566701889038, + "learning_rate": 4.993028429273259e-05, + "loss": 5.8871, + "step": 3999 + }, + { + "epoch": 0.023789133123988962, + "grad_norm": 2.9528706073760986, + "learning_rate": 4.993024942930794e-05, + "loss": 5.8177, + "step": 4000 + }, + { + "epoch": 0.02379508040726996, + "grad_norm": 2.543329954147339, + "learning_rate": 4.993021455718041e-05, + "loss": 5.6446, + "step": 4001 + }, + { + "epoch": 0.023801027690550956, + "grad_norm": 2.7284936904907227, + "learning_rate": 4.993017967634999e-05, + "loss": 5.8404, + "step": 4002 + }, + { + "epoch": 0.023806974973831954, + "grad_norm": 2.752187728881836, + "learning_rate": 4.99301447868167e-05, + "loss": 5.6959, + "step": 4003 + }, + { + "epoch": 0.02381292225711295, + "grad_norm": 2.86651611328125, + "learning_rate": 4.993010988858056e-05, + "loss": 5.6329, + "step": 4004 + }, + { + "epoch": 0.023818869540393948, + "grad_norm": 3.9363176822662354, + "learning_rate": 4.9930074981641574e-05, + "loss": 5.31, + "step": 4005 + }, + { + "epoch": 0.023824816823674946, + "grad_norm": 3.41188907623291, + "learning_rate": 4.9930040065999764e-05, + "loss": 5.9905, + "step": 4006 + }, + { + "epoch": 0.02383076410695594, + "grad_norm": 3.4761459827423096, + "learning_rate": 4.9930005141655125e-05, + "loss": 6.0575, + "step": 4007 + }, + { + "epoch": 0.02383671139023694, + "grad_norm": 3.1562440395355225, + "learning_rate": 4.992997020860768e-05, + "loss": 5.9915, + "step": 4008 + }, + { + "epoch": 0.02384265867351794, + "grad_norm": 2.884049415588379, + "learning_rate": 4.992993526685744e-05, + "loss": 5.8051, + "step": 4009 + }, + { + "epoch": 0.023848605956798934, + "grad_norm": 3.3188138008117676, + "learning_rate": 4.992990031640442e-05, + "loss": 5.9637, + "step": 4010 + }, + { + "epoch": 0.023854553240079932, + "grad_norm": 3.2048282623291016, + "learning_rate": 4.992986535724862e-05, + "loss": 6.631, + "step": 4011 + }, + { + "epoch": 0.023860500523360927, + "grad_norm": 2.80204701423645, + "learning_rate": 4.992983038939008e-05, + "loss": 6.0063, + "step": 4012 + }, + { + "epoch": 0.023866447806641926, + "grad_norm": 2.993398427963257, + "learning_rate": 4.992979541282877e-05, + "loss": 5.9778, + "step": 4013 + }, + { + "epoch": 0.023872395089922924, + "grad_norm": 2.7519168853759766, + "learning_rate": 4.9929760427564744e-05, + "loss": 6.4272, + "step": 4014 + }, + { + "epoch": 0.02387834237320392, + "grad_norm": 2.9606168270111084, + "learning_rate": 4.992972543359799e-05, + "loss": 5.5372, + "step": 4015 + }, + { + "epoch": 0.023884289656484918, + "grad_norm": 2.1724514961242676, + "learning_rate": 4.992969043092853e-05, + "loss": 6.3115, + "step": 4016 + }, + { + "epoch": 0.023890236939765917, + "grad_norm": 2.1742191314697266, + "learning_rate": 4.9929655419556365e-05, + "loss": 6.5097, + "step": 4017 + }, + { + "epoch": 0.02389618422304691, + "grad_norm": 1.9729878902435303, + "learning_rate": 4.9929620399481526e-05, + "loss": 6.7061, + "step": 4018 + }, + { + "epoch": 0.02390213150632791, + "grad_norm": 2.6273725032806396, + "learning_rate": 4.9929585370704e-05, + "loss": 6.2838, + "step": 4019 + }, + { + "epoch": 0.023908078789608905, + "grad_norm": 2.5495283603668213, + "learning_rate": 4.9929550333223826e-05, + "loss": 6.1175, + "step": 4020 + }, + { + "epoch": 0.023914026072889904, + "grad_norm": 2.50193452835083, + "learning_rate": 4.9929515287041e-05, + "loss": 5.7689, + "step": 4021 + }, + { + "epoch": 0.023919973356170902, + "grad_norm": 2.402991771697998, + "learning_rate": 4.992948023215553e-05, + "loss": 6.4222, + "step": 4022 + }, + { + "epoch": 0.023925920639451898, + "grad_norm": 2.1722981929779053, + "learning_rate": 4.9929445168567444e-05, + "loss": 6.2335, + "step": 4023 + }, + { + "epoch": 0.023931867922732896, + "grad_norm": 1.6895688772201538, + "learning_rate": 4.992941009627675e-05, + "loss": 6.163, + "step": 4024 + }, + { + "epoch": 0.02393781520601389, + "grad_norm": 1.9944639205932617, + "learning_rate": 4.992937501528345e-05, + "loss": 6.2622, + "step": 4025 + }, + { + "epoch": 0.02394376248929489, + "grad_norm": 2.6157150268554688, + "learning_rate": 4.9929339925587565e-05, + "loss": 6.4582, + "step": 4026 + }, + { + "epoch": 0.023949709772575888, + "grad_norm": 2.021772623062134, + "learning_rate": 4.992930482718911e-05, + "loss": 6.2921, + "step": 4027 + }, + { + "epoch": 0.023955657055856883, + "grad_norm": 2.465402603149414, + "learning_rate": 4.992926972008808e-05, + "loss": 6.6426, + "step": 4028 + }, + { + "epoch": 0.023961604339137882, + "grad_norm": 2.337763547897339, + "learning_rate": 4.99292346042845e-05, + "loss": 6.4988, + "step": 4029 + }, + { + "epoch": 0.02396755162241888, + "grad_norm": 2.400064706802368, + "learning_rate": 4.9929199479778394e-05, + "loss": 6.6666, + "step": 4030 + }, + { + "epoch": 0.023973498905699876, + "grad_norm": 2.4205784797668457, + "learning_rate": 4.9929164346569756e-05, + "loss": 5.8805, + "step": 4031 + }, + { + "epoch": 0.023979446188980874, + "grad_norm": 2.312434673309326, + "learning_rate": 4.9929129204658605e-05, + "loss": 6.5161, + "step": 4032 + }, + { + "epoch": 0.02398539347226187, + "grad_norm": 2.02748966217041, + "learning_rate": 4.9929094054044944e-05, + "loss": 6.1272, + "step": 4033 + }, + { + "epoch": 0.023991340755542868, + "grad_norm": 2.280242443084717, + "learning_rate": 4.992905889472881e-05, + "loss": 5.7217, + "step": 4034 + }, + { + "epoch": 0.023997288038823866, + "grad_norm": 2.3911778926849365, + "learning_rate": 4.992902372671019e-05, + "loss": 5.7441, + "step": 4035 + }, + { + "epoch": 0.02400323532210486, + "grad_norm": 2.1767921447753906, + "learning_rate": 4.99289885499891e-05, + "loss": 5.7212, + "step": 4036 + }, + { + "epoch": 0.02400918260538586, + "grad_norm": 2.3067142963409424, + "learning_rate": 4.992895336456557e-05, + "loss": 5.6689, + "step": 4037 + }, + { + "epoch": 0.02401512988866686, + "grad_norm": 2.1564273834228516, + "learning_rate": 4.992891817043959e-05, + "loss": 6.1445, + "step": 4038 + }, + { + "epoch": 0.024021077171947854, + "grad_norm": 2.4852945804595947, + "learning_rate": 4.9928882967611184e-05, + "loss": 6.1883, + "step": 4039 + }, + { + "epoch": 0.024027024455228852, + "grad_norm": 2.9280812740325928, + "learning_rate": 4.992884775608036e-05, + "loss": 6.097, + "step": 4040 + }, + { + "epoch": 0.024032971738509847, + "grad_norm": 2.3219356536865234, + "learning_rate": 4.992881253584714e-05, + "loss": 6.3163, + "step": 4041 + }, + { + "epoch": 0.024038919021790846, + "grad_norm": 2.672386884689331, + "learning_rate": 4.9928777306911525e-05, + "loss": 5.9615, + "step": 4042 + }, + { + "epoch": 0.024044866305071844, + "grad_norm": 2.5886473655700684, + "learning_rate": 4.992874206927353e-05, + "loss": 6.0114, + "step": 4043 + }, + { + "epoch": 0.02405081358835284, + "grad_norm": 2.991230010986328, + "learning_rate": 4.992870682293318e-05, + "loss": 5.6805, + "step": 4044 + }, + { + "epoch": 0.024056760871633838, + "grad_norm": 2.3270034790039062, + "learning_rate": 4.9928671567890464e-05, + "loss": 5.7503, + "step": 4045 + }, + { + "epoch": 0.024062708154914837, + "grad_norm": 2.591627359390259, + "learning_rate": 4.99286363041454e-05, + "loss": 5.5707, + "step": 4046 + }, + { + "epoch": 0.02406865543819583, + "grad_norm": 2.1936891078948975, + "learning_rate": 4.992860103169802e-05, + "loss": 5.6503, + "step": 4047 + }, + { + "epoch": 0.02407460272147683, + "grad_norm": 2.2928214073181152, + "learning_rate": 4.992856575054832e-05, + "loss": 5.6067, + "step": 4048 + }, + { + "epoch": 0.024080550004757825, + "grad_norm": 2.4503591060638428, + "learning_rate": 4.992853046069632e-05, + "loss": 6.0067, + "step": 4049 + }, + { + "epoch": 0.024086497288038824, + "grad_norm": 2.84260630607605, + "learning_rate": 4.992849516214202e-05, + "loss": 6.4533, + "step": 4050 + }, + { + "epoch": 0.024092444571319822, + "grad_norm": 2.7172651290893555, + "learning_rate": 4.992845985488543e-05, + "loss": 6.4901, + "step": 4051 + }, + { + "epoch": 0.024098391854600817, + "grad_norm": 2.2101316452026367, + "learning_rate": 4.992842453892659e-05, + "loss": 6.3481, + "step": 4052 + }, + { + "epoch": 0.024104339137881816, + "grad_norm": 2.488199234008789, + "learning_rate": 4.992838921426549e-05, + "loss": 6.4893, + "step": 4053 + }, + { + "epoch": 0.02411028642116281, + "grad_norm": 2.3767058849334717, + "learning_rate": 4.992835388090215e-05, + "loss": 5.9828, + "step": 4054 + }, + { + "epoch": 0.02411623370444381, + "grad_norm": 2.3979814052581787, + "learning_rate": 4.992831853883657e-05, + "loss": 5.7607, + "step": 4055 + }, + { + "epoch": 0.024122180987724808, + "grad_norm": 2.766644239425659, + "learning_rate": 4.992828318806877e-05, + "loss": 5.523, + "step": 4056 + }, + { + "epoch": 0.024128128271005803, + "grad_norm": 3.3954427242279053, + "learning_rate": 4.9928247828598775e-05, + "loss": 6.1247, + "step": 4057 + }, + { + "epoch": 0.024134075554286802, + "grad_norm": 3.5597097873687744, + "learning_rate": 4.9928212460426585e-05, + "loss": 6.0877, + "step": 4058 + }, + { + "epoch": 0.0241400228375678, + "grad_norm": 2.8089418411254883, + "learning_rate": 4.992817708355221e-05, + "loss": 5.324, + "step": 4059 + }, + { + "epoch": 0.024145970120848795, + "grad_norm": 2.6756842136383057, + "learning_rate": 4.992814169797566e-05, + "loss": 5.5516, + "step": 4060 + }, + { + "epoch": 0.024151917404129794, + "grad_norm": 2.1218929290771484, + "learning_rate": 4.992810630369696e-05, + "loss": 6.102, + "step": 4061 + }, + { + "epoch": 0.02415786468741079, + "grad_norm": 2.7189652919769287, + "learning_rate": 4.992807090071611e-05, + "loss": 6.4258, + "step": 4062 + }, + { + "epoch": 0.024163811970691788, + "grad_norm": 2.4340744018554688, + "learning_rate": 4.992803548903313e-05, + "loss": 5.8059, + "step": 4063 + }, + { + "epoch": 0.024169759253972786, + "grad_norm": 2.46604323387146, + "learning_rate": 4.992800006864804e-05, + "loss": 5.8963, + "step": 4064 + }, + { + "epoch": 0.02417570653725378, + "grad_norm": 2.1969218254089355, + "learning_rate": 4.9927964639560835e-05, + "loss": 5.7835, + "step": 4065 + }, + { + "epoch": 0.02418165382053478, + "grad_norm": 2.4529223442077637, + "learning_rate": 4.9927929201771535e-05, + "loss": 6.3405, + "step": 4066 + }, + { + "epoch": 0.02418760110381578, + "grad_norm": 2.145331859588623, + "learning_rate": 4.992789375528015e-05, + "loss": 6.14, + "step": 4067 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 2.212646961212158, + "learning_rate": 4.99278583000867e-05, + "loss": 5.8793, + "step": 4068 + }, + { + "epoch": 0.024199495670377772, + "grad_norm": 2.3249876499176025, + "learning_rate": 4.992782283619118e-05, + "loss": 5.8702, + "step": 4069 + }, + { + "epoch": 0.024205442953658767, + "grad_norm": 2.180964946746826, + "learning_rate": 4.9927787363593634e-05, + "loss": 6.216, + "step": 4070 + }, + { + "epoch": 0.024211390236939766, + "grad_norm": 2.5633153915405273, + "learning_rate": 4.992775188229405e-05, + "loss": 6.031, + "step": 4071 + }, + { + "epoch": 0.024217337520220764, + "grad_norm": 2.867342233657837, + "learning_rate": 4.992771639229244e-05, + "loss": 5.9853, + "step": 4072 + }, + { + "epoch": 0.02422328480350176, + "grad_norm": 2.111253023147583, + "learning_rate": 4.992768089358882e-05, + "loss": 5.8404, + "step": 4073 + }, + { + "epoch": 0.024229232086782758, + "grad_norm": 1.9325549602508545, + "learning_rate": 4.992764538618321e-05, + "loss": 6.0175, + "step": 4074 + }, + { + "epoch": 0.024235179370063756, + "grad_norm": 2.721740484237671, + "learning_rate": 4.992760987007561e-05, + "loss": 5.9274, + "step": 4075 + }, + { + "epoch": 0.02424112665334475, + "grad_norm": 3.5240588188171387, + "learning_rate": 4.992757434526604e-05, + "loss": 5.3593, + "step": 4076 + }, + { + "epoch": 0.02424707393662575, + "grad_norm": 2.744248867034912, + "learning_rate": 4.9927538811754516e-05, + "loss": 5.8938, + "step": 4077 + }, + { + "epoch": 0.024253021219906745, + "grad_norm": 2.545384645462036, + "learning_rate": 4.992750326954104e-05, + "loss": 6.2127, + "step": 4078 + }, + { + "epoch": 0.024258968503187744, + "grad_norm": 2.7550806999206543, + "learning_rate": 4.992746771862563e-05, + "loss": 6.0784, + "step": 4079 + }, + { + "epoch": 0.024264915786468742, + "grad_norm": 2.408040761947632, + "learning_rate": 4.9927432159008305e-05, + "loss": 5.5908, + "step": 4080 + }, + { + "epoch": 0.024270863069749737, + "grad_norm": 2.581378698348999, + "learning_rate": 4.9927396590689066e-05, + "loss": 5.4438, + "step": 4081 + }, + { + "epoch": 0.024276810353030736, + "grad_norm": 2.4320218563079834, + "learning_rate": 4.992736101366794e-05, + "loss": 5.6239, + "step": 4082 + }, + { + "epoch": 0.024282757636311735, + "grad_norm": 2.4725472927093506, + "learning_rate": 4.992732542794492e-05, + "loss": 6.237, + "step": 4083 + }, + { + "epoch": 0.02428870491959273, + "grad_norm": 2.3081839084625244, + "learning_rate": 4.992728983352003e-05, + "loss": 5.9917, + "step": 4084 + }, + { + "epoch": 0.024294652202873728, + "grad_norm": 1.9090701341629028, + "learning_rate": 4.9927254230393287e-05, + "loss": 5.9125, + "step": 4085 + }, + { + "epoch": 0.024300599486154723, + "grad_norm": 2.3943240642547607, + "learning_rate": 4.992721861856468e-05, + "loss": 5.3431, + "step": 4086 + }, + { + "epoch": 0.024306546769435722, + "grad_norm": 2.226968765258789, + "learning_rate": 4.992718299803425e-05, + "loss": 5.4328, + "step": 4087 + }, + { + "epoch": 0.02431249405271672, + "grad_norm": 2.238218307495117, + "learning_rate": 4.9927147368801994e-05, + "loss": 5.4877, + "step": 4088 + }, + { + "epoch": 0.024318441335997715, + "grad_norm": 2.216540575027466, + "learning_rate": 4.992711173086794e-05, + "loss": 5.4037, + "step": 4089 + }, + { + "epoch": 0.024324388619278714, + "grad_norm": 2.3136301040649414, + "learning_rate": 4.992707608423208e-05, + "loss": 5.4576, + "step": 4090 + }, + { + "epoch": 0.02433033590255971, + "grad_norm": 2.0434980392456055, + "learning_rate": 4.9927040428894436e-05, + "loss": 5.8044, + "step": 4091 + }, + { + "epoch": 0.024336283185840708, + "grad_norm": 2.7837064266204834, + "learning_rate": 4.992700476485502e-05, + "loss": 6.4183, + "step": 4092 + }, + { + "epoch": 0.024342230469121706, + "grad_norm": 2.580411195755005, + "learning_rate": 4.992696909211384e-05, + "loss": 5.4545, + "step": 4093 + }, + { + "epoch": 0.0243481777524027, + "grad_norm": 2.1215696334838867, + "learning_rate": 4.9926933410670916e-05, + "loss": 5.5629, + "step": 4094 + }, + { + "epoch": 0.0243541250356837, + "grad_norm": 1.9621074199676514, + "learning_rate": 4.992689772052626e-05, + "loss": 5.5248, + "step": 4095 + }, + { + "epoch": 0.0243600723189647, + "grad_norm": 2.1773006916046143, + "learning_rate": 4.992686202167988e-05, + "loss": 5.3285, + "step": 4096 + }, + { + "epoch": 0.024366019602245693, + "grad_norm": 1.9506359100341797, + "learning_rate": 4.992682631413179e-05, + "loss": 5.7989, + "step": 4097 + }, + { + "epoch": 0.024371966885526692, + "grad_norm": 1.9154741764068604, + "learning_rate": 4.9926790597882e-05, + "loss": 5.6029, + "step": 4098 + }, + { + "epoch": 0.024377914168807687, + "grad_norm": 2.2147481441497803, + "learning_rate": 4.9926754872930524e-05, + "loss": 5.5406, + "step": 4099 + }, + { + "epoch": 0.024383861452088686, + "grad_norm": 2.1268460750579834, + "learning_rate": 4.992671913927738e-05, + "loss": 5.6434, + "step": 4100 + }, + { + "epoch": 0.024389808735369684, + "grad_norm": 2.1212456226348877, + "learning_rate": 4.992668339692258e-05, + "loss": 5.6888, + "step": 4101 + }, + { + "epoch": 0.02439575601865068, + "grad_norm": 2.2292001247406006, + "learning_rate": 4.992664764586612e-05, + "loss": 5.3982, + "step": 4102 + }, + { + "epoch": 0.024401703301931678, + "grad_norm": 2.2713210582733154, + "learning_rate": 4.9926611886108035e-05, + "loss": 5.3521, + "step": 4103 + }, + { + "epoch": 0.024407650585212676, + "grad_norm": 2.273437738418579, + "learning_rate": 4.9926576117648314e-05, + "loss": 5.474, + "step": 4104 + }, + { + "epoch": 0.02441359786849367, + "grad_norm": 2.2879083156585693, + "learning_rate": 4.9926540340487e-05, + "loss": 5.4474, + "step": 4105 + }, + { + "epoch": 0.02441954515177467, + "grad_norm": 2.2517430782318115, + "learning_rate": 4.992650455462408e-05, + "loss": 5.5013, + "step": 4106 + }, + { + "epoch": 0.024425492435055665, + "grad_norm": 2.1391677856445312, + "learning_rate": 4.992646876005957e-05, + "loss": 5.3899, + "step": 4107 + }, + { + "epoch": 0.024431439718336664, + "grad_norm": 2.2989962100982666, + "learning_rate": 4.9926432956793494e-05, + "loss": 5.7995, + "step": 4108 + }, + { + "epoch": 0.024437387001617662, + "grad_norm": 2.550706386566162, + "learning_rate": 4.992639714482586e-05, + "loss": 5.6599, + "step": 4109 + }, + { + "epoch": 0.024443334284898657, + "grad_norm": 2.321398973464966, + "learning_rate": 4.992636132415667e-05, + "loss": 5.6852, + "step": 4110 + }, + { + "epoch": 0.024449281568179656, + "grad_norm": 2.300795555114746, + "learning_rate": 4.992632549478595e-05, + "loss": 5.7318, + "step": 4111 + }, + { + "epoch": 0.024455228851460654, + "grad_norm": 2.229156970977783, + "learning_rate": 4.992628965671371e-05, + "loss": 5.6617, + "step": 4112 + }, + { + "epoch": 0.02446117613474165, + "grad_norm": 2.253934144973755, + "learning_rate": 4.992625380993995e-05, + "loss": 5.5762, + "step": 4113 + }, + { + "epoch": 0.024467123418022648, + "grad_norm": 2.0932998657226562, + "learning_rate": 4.992621795446471e-05, + "loss": 5.568, + "step": 4114 + }, + { + "epoch": 0.024473070701303643, + "grad_norm": 2.5969886779785156, + "learning_rate": 4.9926182090287966e-05, + "loss": 5.6626, + "step": 4115 + }, + { + "epoch": 0.02447901798458464, + "grad_norm": 2.5260698795318604, + "learning_rate": 4.992614621740976e-05, + "loss": 5.6333, + "step": 4116 + }, + { + "epoch": 0.02448496526786564, + "grad_norm": 2.0017902851104736, + "learning_rate": 4.992611033583009e-05, + "loss": 5.793, + "step": 4117 + }, + { + "epoch": 0.024490912551146635, + "grad_norm": 2.1847705841064453, + "learning_rate": 4.992607444554898e-05, + "loss": 5.8348, + "step": 4118 + }, + { + "epoch": 0.024496859834427634, + "grad_norm": 2.141007900238037, + "learning_rate": 4.992603854656642e-05, + "loss": 5.7835, + "step": 4119 + }, + { + "epoch": 0.02450280711770863, + "grad_norm": 2.294605255126953, + "learning_rate": 4.992600263888245e-05, + "loss": 5.6615, + "step": 4120 + }, + { + "epoch": 0.024508754400989628, + "grad_norm": 2.433936357498169, + "learning_rate": 4.9925966722497064e-05, + "loss": 5.6479, + "step": 4121 + }, + { + "epoch": 0.024514701684270626, + "grad_norm": 2.1522979736328125, + "learning_rate": 4.992593079741028e-05, + "loss": 5.5761, + "step": 4122 + }, + { + "epoch": 0.02452064896755162, + "grad_norm": 2.141065835952759, + "learning_rate": 4.9925894863622114e-05, + "loss": 5.602, + "step": 4123 + }, + { + "epoch": 0.02452659625083262, + "grad_norm": 2.187838554382324, + "learning_rate": 4.9925858921132576e-05, + "loss": 5.6337, + "step": 4124 + }, + { + "epoch": 0.02453254353411362, + "grad_norm": 2.303027629852295, + "learning_rate": 4.992582296994167e-05, + "loss": 5.6126, + "step": 4125 + }, + { + "epoch": 0.024538490817394613, + "grad_norm": 1.9233589172363281, + "learning_rate": 4.992578701004943e-05, + "loss": 5.5852, + "step": 4126 + }, + { + "epoch": 0.024544438100675612, + "grad_norm": 2.0383386611938477, + "learning_rate": 4.992575104145585e-05, + "loss": 5.6477, + "step": 4127 + }, + { + "epoch": 0.024550385383956607, + "grad_norm": 2.2752933502197266, + "learning_rate": 4.9925715064160946e-05, + "loss": 5.6263, + "step": 4128 + }, + { + "epoch": 0.024556332667237606, + "grad_norm": 2.400083541870117, + "learning_rate": 4.9925679078164734e-05, + "loss": 5.5249, + "step": 4129 + }, + { + "epoch": 0.024562279950518604, + "grad_norm": 2.167365312576294, + "learning_rate": 4.992564308346722e-05, + "loss": 5.7299, + "step": 4130 + }, + { + "epoch": 0.0245682272337996, + "grad_norm": 1.9696096181869507, + "learning_rate": 4.9925607080068426e-05, + "loss": 5.7961, + "step": 4131 + }, + { + "epoch": 0.024574174517080598, + "grad_norm": 2.1817007064819336, + "learning_rate": 4.992557106796836e-05, + "loss": 5.7973, + "step": 4132 + }, + { + "epoch": 0.024580121800361596, + "grad_norm": 2.4329075813293457, + "learning_rate": 4.992553504716704e-05, + "loss": 6.2428, + "step": 4133 + }, + { + "epoch": 0.02458606908364259, + "grad_norm": 2.159193754196167, + "learning_rate": 4.9925499017664464e-05, + "loss": 5.5784, + "step": 4134 + }, + { + "epoch": 0.02459201636692359, + "grad_norm": 2.2614853382110596, + "learning_rate": 4.992546297946066e-05, + "loss": 5.7572, + "step": 4135 + }, + { + "epoch": 0.024597963650204585, + "grad_norm": 2.2874412536621094, + "learning_rate": 4.992542693255563e-05, + "loss": 5.5726, + "step": 4136 + }, + { + "epoch": 0.024603910933485584, + "grad_norm": 2.1634466648101807, + "learning_rate": 4.992539087694939e-05, + "loss": 5.5112, + "step": 4137 + }, + { + "epoch": 0.024609858216766582, + "grad_norm": 2.195528507232666, + "learning_rate": 4.9925354812641955e-05, + "loss": 5.6073, + "step": 4138 + }, + { + "epoch": 0.024615805500047577, + "grad_norm": 2.0328054428100586, + "learning_rate": 4.992531873963334e-05, + "loss": 5.5686, + "step": 4139 + }, + { + "epoch": 0.024621752783328576, + "grad_norm": 2.244218349456787, + "learning_rate": 4.992528265792355e-05, + "loss": 5.6871, + "step": 4140 + }, + { + "epoch": 0.024627700066609574, + "grad_norm": 2.081721544265747, + "learning_rate": 4.992524656751261e-05, + "loss": 5.5327, + "step": 4141 + }, + { + "epoch": 0.02463364734989057, + "grad_norm": 1.9305940866470337, + "learning_rate": 4.992521046840051e-05, + "loss": 5.5265, + "step": 4142 + }, + { + "epoch": 0.024639594633171568, + "grad_norm": 2.624286651611328, + "learning_rate": 4.992517436058728e-05, + "loss": 5.3881, + "step": 4143 + }, + { + "epoch": 0.024645541916452563, + "grad_norm": 2.204803705215454, + "learning_rate": 4.9925138244072935e-05, + "loss": 5.6686, + "step": 4144 + }, + { + "epoch": 0.02465148919973356, + "grad_norm": 2.4664852619171143, + "learning_rate": 4.992510211885748e-05, + "loss": 5.3152, + "step": 4145 + }, + { + "epoch": 0.02465743648301456, + "grad_norm": 2.3428542613983154, + "learning_rate": 4.992506598494093e-05, + "loss": 5.5875, + "step": 4146 + }, + { + "epoch": 0.024663383766295555, + "grad_norm": 2.1902847290039062, + "learning_rate": 4.992502984232329e-05, + "loss": 5.4826, + "step": 4147 + }, + { + "epoch": 0.024669331049576554, + "grad_norm": 2.0401039123535156, + "learning_rate": 4.992499369100459e-05, + "loss": 5.518, + "step": 4148 + }, + { + "epoch": 0.02467527833285755, + "grad_norm": 2.5250306129455566, + "learning_rate": 4.9924957530984825e-05, + "loss": 5.5744, + "step": 4149 + }, + { + "epoch": 0.024681225616138548, + "grad_norm": 1.9975959062576294, + "learning_rate": 4.9924921362264016e-05, + "loss": 5.6834, + "step": 4150 + }, + { + "epoch": 0.024687172899419546, + "grad_norm": 2.047011375427246, + "learning_rate": 4.992488518484217e-05, + "loss": 5.6703, + "step": 4151 + }, + { + "epoch": 0.02469312018270054, + "grad_norm": 2.142411470413208, + "learning_rate": 4.9924848998719314e-05, + "loss": 5.781, + "step": 4152 + }, + { + "epoch": 0.02469906746598154, + "grad_norm": 2.1012768745422363, + "learning_rate": 4.992481280389545e-05, + "loss": 5.618, + "step": 4153 + }, + { + "epoch": 0.024705014749262538, + "grad_norm": 2.4698173999786377, + "learning_rate": 4.9924776600370584e-05, + "loss": 6.4773, + "step": 4154 + }, + { + "epoch": 0.024710962032543533, + "grad_norm": 2.4975368976593018, + "learning_rate": 4.992474038814474e-05, + "loss": 5.2568, + "step": 4155 + }, + { + "epoch": 0.024716909315824532, + "grad_norm": 1.8329259157180786, + "learning_rate": 4.992470416721793e-05, + "loss": 5.775, + "step": 4156 + }, + { + "epoch": 0.024722856599105527, + "grad_norm": 1.9757754802703857, + "learning_rate": 4.992466793759015e-05, + "loss": 5.5408, + "step": 4157 + }, + { + "epoch": 0.024728803882386526, + "grad_norm": 1.8300005197525024, + "learning_rate": 4.9924631699261434e-05, + "loss": 5.5356, + "step": 4158 + }, + { + "epoch": 0.024734751165667524, + "grad_norm": 2.099102735519409, + "learning_rate": 4.992459545223179e-05, + "loss": 5.6811, + "step": 4159 + }, + { + "epoch": 0.02474069844894852, + "grad_norm": 2.000169277191162, + "learning_rate": 4.992455919650123e-05, + "loss": 5.511, + "step": 4160 + }, + { + "epoch": 0.024746645732229518, + "grad_norm": 2.0555150508880615, + "learning_rate": 4.992452293206976e-05, + "loss": 5.7553, + "step": 4161 + }, + { + "epoch": 0.024752593015510516, + "grad_norm": 2.0416486263275146, + "learning_rate": 4.99244866589374e-05, + "loss": 5.6965, + "step": 4162 + }, + { + "epoch": 0.02475854029879151, + "grad_norm": 2.0028059482574463, + "learning_rate": 4.9924450377104146e-05, + "loss": 5.7211, + "step": 4163 + }, + { + "epoch": 0.02476448758207251, + "grad_norm": 2.22377872467041, + "learning_rate": 4.992441408657004e-05, + "loss": 5.6384, + "step": 4164 + }, + { + "epoch": 0.024770434865353505, + "grad_norm": 2.038804531097412, + "learning_rate": 4.9924377787335064e-05, + "loss": 5.6351, + "step": 4165 + }, + { + "epoch": 0.024776382148634504, + "grad_norm": 2.357773542404175, + "learning_rate": 4.992434147939925e-05, + "loss": 5.2791, + "step": 4166 + }, + { + "epoch": 0.024782329431915502, + "grad_norm": 2.1949357986450195, + "learning_rate": 4.992430516276261e-05, + "loss": 5.7389, + "step": 4167 + }, + { + "epoch": 0.024788276715196497, + "grad_norm": 2.1015608310699463, + "learning_rate": 4.992426883742516e-05, + "loss": 5.632, + "step": 4168 + }, + { + "epoch": 0.024794223998477496, + "grad_norm": 2.166201591491699, + "learning_rate": 4.992423250338689e-05, + "loss": 5.5701, + "step": 4169 + }, + { + "epoch": 0.024800171281758494, + "grad_norm": 2.0805492401123047, + "learning_rate": 4.9924196160647836e-05, + "loss": 5.5955, + "step": 4170 + }, + { + "epoch": 0.02480611856503949, + "grad_norm": 1.803229570388794, + "learning_rate": 4.9924159809208e-05, + "loss": 5.6267, + "step": 4171 + }, + { + "epoch": 0.024812065848320488, + "grad_norm": 2.008639335632324, + "learning_rate": 4.9924123449067393e-05, + "loss": 5.6667, + "step": 4172 + }, + { + "epoch": 0.024818013131601483, + "grad_norm": 1.9843655824661255, + "learning_rate": 4.9924087080226044e-05, + "loss": 5.5981, + "step": 4173 + }, + { + "epoch": 0.02482396041488248, + "grad_norm": 2.10270357131958, + "learning_rate": 4.9924050702683946e-05, + "loss": 5.5293, + "step": 4174 + }, + { + "epoch": 0.02482990769816348, + "grad_norm": 2.315976142883301, + "learning_rate": 4.992401431644112e-05, + "loss": 5.6046, + "step": 4175 + }, + { + "epoch": 0.024835854981444475, + "grad_norm": 2.168473482131958, + "learning_rate": 4.992397792149758e-05, + "loss": 5.4271, + "step": 4176 + }, + { + "epoch": 0.024841802264725474, + "grad_norm": 2.1870200634002686, + "learning_rate": 4.9923941517853335e-05, + "loss": 5.6399, + "step": 4177 + }, + { + "epoch": 0.024847749548006472, + "grad_norm": 2.2944717407226562, + "learning_rate": 4.9923905105508394e-05, + "loss": 5.4483, + "step": 4178 + }, + { + "epoch": 0.024853696831287467, + "grad_norm": 2.1662731170654297, + "learning_rate": 4.9923868684462785e-05, + "loss": 5.6773, + "step": 4179 + }, + { + "epoch": 0.024859644114568466, + "grad_norm": 1.7448937892913818, + "learning_rate": 4.992383225471651e-05, + "loss": 5.6097, + "step": 4180 + }, + { + "epoch": 0.02486559139784946, + "grad_norm": 2.3577585220336914, + "learning_rate": 4.9923795816269576e-05, + "loss": 5.5003, + "step": 4181 + }, + { + "epoch": 0.02487153868113046, + "grad_norm": 2.4175360202789307, + "learning_rate": 4.9923759369122e-05, + "loss": 5.4925, + "step": 4182 + }, + { + "epoch": 0.024877485964411458, + "grad_norm": 2.199329137802124, + "learning_rate": 4.992372291327381e-05, + "loss": 5.6239, + "step": 4183 + }, + { + "epoch": 0.024883433247692453, + "grad_norm": 2.054450511932373, + "learning_rate": 4.9923686448724994e-05, + "loss": 5.59, + "step": 4184 + }, + { + "epoch": 0.024889380530973452, + "grad_norm": 2.0354533195495605, + "learning_rate": 4.9923649975475585e-05, + "loss": 5.6092, + "step": 4185 + }, + { + "epoch": 0.024895327814254447, + "grad_norm": 2.0409371852874756, + "learning_rate": 4.9923613493525576e-05, + "loss": 5.5009, + "step": 4186 + }, + { + "epoch": 0.024901275097535445, + "grad_norm": 2.3314719200134277, + "learning_rate": 4.992357700287501e-05, + "loss": 5.5077, + "step": 4187 + }, + { + "epoch": 0.024907222380816444, + "grad_norm": 2.050706386566162, + "learning_rate": 4.9923540503523865e-05, + "loss": 5.5857, + "step": 4188 + }, + { + "epoch": 0.02491316966409744, + "grad_norm": 2.3477721214294434, + "learning_rate": 4.992350399547218e-05, + "loss": 5.5119, + "step": 4189 + }, + { + "epoch": 0.024919116947378438, + "grad_norm": 2.365171194076538, + "learning_rate": 4.992346747871994e-05, + "loss": 5.583, + "step": 4190 + }, + { + "epoch": 0.024925064230659436, + "grad_norm": 1.9642738103866577, + "learning_rate": 4.992343095326719e-05, + "loss": 5.3527, + "step": 4191 + }, + { + "epoch": 0.02493101151394043, + "grad_norm": 2.25437593460083, + "learning_rate": 4.992339441911392e-05, + "loss": 5.4751, + "step": 4192 + }, + { + "epoch": 0.02493695879722143, + "grad_norm": 2.0476715564727783, + "learning_rate": 4.992335787626016e-05, + "loss": 5.5808, + "step": 4193 + }, + { + "epoch": 0.024942906080502425, + "grad_norm": 2.248382329940796, + "learning_rate": 4.992332132470591e-05, + "loss": 5.5771, + "step": 4194 + }, + { + "epoch": 0.024948853363783424, + "grad_norm": 2.279232978820801, + "learning_rate": 4.992328476445118e-05, + "loss": 5.3803, + "step": 4195 + }, + { + "epoch": 0.024954800647064422, + "grad_norm": 2.0171918869018555, + "learning_rate": 4.992324819549599e-05, + "loss": 5.662, + "step": 4196 + }, + { + "epoch": 0.024960747930345417, + "grad_norm": 2.14736008644104, + "learning_rate": 4.992321161784036e-05, + "loss": 5.6422, + "step": 4197 + }, + { + "epoch": 0.024966695213626416, + "grad_norm": 2.1694438457489014, + "learning_rate": 4.9923175031484284e-05, + "loss": 5.4377, + "step": 4198 + }, + { + "epoch": 0.024972642496907414, + "grad_norm": 1.9280356168746948, + "learning_rate": 4.9923138436427784e-05, + "loss": 5.5499, + "step": 4199 + }, + { + "epoch": 0.02497858978018841, + "grad_norm": 2.185974359512329, + "learning_rate": 4.992310183267088e-05, + "loss": 5.6404, + "step": 4200 + }, + { + "epoch": 0.024984537063469408, + "grad_norm": 2.102681875228882, + "learning_rate": 4.9923065220213585e-05, + "loss": 5.5888, + "step": 4201 + }, + { + "epoch": 0.024990484346750403, + "grad_norm": 2.07100772857666, + "learning_rate": 4.99230285990559e-05, + "loss": 5.6473, + "step": 4202 + }, + { + "epoch": 0.0249964316300314, + "grad_norm": 2.088634967803955, + "learning_rate": 4.992299196919784e-05, + "loss": 5.4993, + "step": 4203 + }, + { + "epoch": 0.0250023789133124, + "grad_norm": 2.2086873054504395, + "learning_rate": 4.992295533063942e-05, + "loss": 5.5797, + "step": 4204 + }, + { + "epoch": 0.025008326196593395, + "grad_norm": 2.250753164291382, + "learning_rate": 4.992291868338066e-05, + "loss": 5.5666, + "step": 4205 + }, + { + "epoch": 0.025014273479874394, + "grad_norm": 2.132636785507202, + "learning_rate": 4.992288202742156e-05, + "loss": 5.6715, + "step": 4206 + }, + { + "epoch": 0.025020220763155392, + "grad_norm": 2.8332200050354004, + "learning_rate": 4.992284536276214e-05, + "loss": 4.9687, + "step": 4207 + }, + { + "epoch": 0.025026168046436387, + "grad_norm": 2.345991849899292, + "learning_rate": 4.992280868940241e-05, + "loss": 5.2181, + "step": 4208 + }, + { + "epoch": 0.025032115329717386, + "grad_norm": 2.149568557739258, + "learning_rate": 4.992277200734239e-05, + "loss": 5.5336, + "step": 4209 + }, + { + "epoch": 0.02503806261299838, + "grad_norm": 2.031353235244751, + "learning_rate": 4.992273531658209e-05, + "loss": 5.5779, + "step": 4210 + }, + { + "epoch": 0.02504400989627938, + "grad_norm": 2.217374086380005, + "learning_rate": 4.9922698617121524e-05, + "loss": 5.782, + "step": 4211 + }, + { + "epoch": 0.025049957179560378, + "grad_norm": 2.3629000186920166, + "learning_rate": 4.992266190896069e-05, + "loss": 5.7916, + "step": 4212 + }, + { + "epoch": 0.025055904462841373, + "grad_norm": 2.2439091205596924, + "learning_rate": 4.9922625192099616e-05, + "loss": 5.8002, + "step": 4213 + }, + { + "epoch": 0.025061851746122372, + "grad_norm": 2.1707634925842285, + "learning_rate": 4.992258846653831e-05, + "loss": 6.5789, + "step": 4214 + }, + { + "epoch": 0.025067799029403367, + "grad_norm": 3.1655468940734863, + "learning_rate": 4.992255173227679e-05, + "loss": 6.3867, + "step": 4215 + }, + { + "epoch": 0.025073746312684365, + "grad_norm": 3.1309874057769775, + "learning_rate": 4.992251498931506e-05, + "loss": 6.2682, + "step": 4216 + }, + { + "epoch": 0.025079693595965364, + "grad_norm": 3.2077460289001465, + "learning_rate": 4.992247823765315e-05, + "loss": 5.8593, + "step": 4217 + }, + { + "epoch": 0.02508564087924636, + "grad_norm": 2.2944962978363037, + "learning_rate": 4.992244147729105e-05, + "loss": 5.7994, + "step": 4218 + }, + { + "epoch": 0.025091588162527358, + "grad_norm": 2.2380926609039307, + "learning_rate": 4.9922404708228776e-05, + "loss": 5.7606, + "step": 4219 + }, + { + "epoch": 0.025097535445808356, + "grad_norm": 2.601795196533203, + "learning_rate": 4.992236793046636e-05, + "loss": 5.7585, + "step": 4220 + }, + { + "epoch": 0.02510348272908935, + "grad_norm": 2.494765520095825, + "learning_rate": 4.99223311440038e-05, + "loss": 5.8102, + "step": 4221 + }, + { + "epoch": 0.02510943001237035, + "grad_norm": 2.4690544605255127, + "learning_rate": 4.992229434884111e-05, + "loss": 5.8682, + "step": 4222 + }, + { + "epoch": 0.025115377295651345, + "grad_norm": 2.1011085510253906, + "learning_rate": 4.99222575449783e-05, + "loss": 5.6982, + "step": 4223 + }, + { + "epoch": 0.025121324578932343, + "grad_norm": 2.2298128604888916, + "learning_rate": 4.992222073241539e-05, + "loss": 5.7606, + "step": 4224 + }, + { + "epoch": 0.025127271862213342, + "grad_norm": 1.93464994430542, + "learning_rate": 4.99221839111524e-05, + "loss": 5.7097, + "step": 4225 + }, + { + "epoch": 0.025133219145494337, + "grad_norm": 2.15191650390625, + "learning_rate": 4.9922147081189324e-05, + "loss": 5.5852, + "step": 4226 + }, + { + "epoch": 0.025139166428775336, + "grad_norm": 2.086954355239868, + "learning_rate": 4.992211024252619e-05, + "loss": 5.5871, + "step": 4227 + }, + { + "epoch": 0.025145113712056334, + "grad_norm": 2.212296724319458, + "learning_rate": 4.9922073395162995e-05, + "loss": 5.562, + "step": 4228 + }, + { + "epoch": 0.02515106099533733, + "grad_norm": 2.0786778926849365, + "learning_rate": 4.992203653909977e-05, + "loss": 5.6599, + "step": 4229 + }, + { + "epoch": 0.025157008278618328, + "grad_norm": 2.3243489265441895, + "learning_rate": 4.9921999674336514e-05, + "loss": 5.9791, + "step": 4230 + }, + { + "epoch": 0.025162955561899323, + "grad_norm": 2.1922898292541504, + "learning_rate": 4.9921962800873247e-05, + "loss": 5.7352, + "step": 4231 + }, + { + "epoch": 0.02516890284518032, + "grad_norm": 2.1154398918151855, + "learning_rate": 4.992192591870998e-05, + "loss": 5.6408, + "step": 4232 + }, + { + "epoch": 0.02517485012846132, + "grad_norm": 2.3520143032073975, + "learning_rate": 4.992188902784673e-05, + "loss": 5.6318, + "step": 4233 + }, + { + "epoch": 0.025180797411742315, + "grad_norm": 2.16597580909729, + "learning_rate": 4.99218521282835e-05, + "loss": 5.4978, + "step": 4234 + }, + { + "epoch": 0.025186744695023314, + "grad_norm": 2.2510032653808594, + "learning_rate": 4.992181522002032e-05, + "loss": 5.4863, + "step": 4235 + }, + { + "epoch": 0.025192691978304312, + "grad_norm": 1.9984945058822632, + "learning_rate": 4.9921778303057174e-05, + "loss": 5.7514, + "step": 4236 + }, + { + "epoch": 0.025198639261585307, + "grad_norm": 2.019435167312622, + "learning_rate": 4.9921741377394106e-05, + "loss": 5.6481, + "step": 4237 + }, + { + "epoch": 0.025204586544866306, + "grad_norm": 1.8546136617660522, + "learning_rate": 4.9921704443031114e-05, + "loss": 5.5907, + "step": 4238 + }, + { + "epoch": 0.0252105338281473, + "grad_norm": 2.012821912765503, + "learning_rate": 4.9921667499968214e-05, + "loss": 5.6942, + "step": 4239 + }, + { + "epoch": 0.0252164811114283, + "grad_norm": 2.215322971343994, + "learning_rate": 4.992163054820541e-05, + "loss": 5.6248, + "step": 4240 + }, + { + "epoch": 0.025222428394709298, + "grad_norm": 2.1009631156921387, + "learning_rate": 4.9921593587742726e-05, + "loss": 5.7769, + "step": 4241 + }, + { + "epoch": 0.025228375677990293, + "grad_norm": 2.280970335006714, + "learning_rate": 4.992155661858017e-05, + "loss": 5.4233, + "step": 4242 + }, + { + "epoch": 0.025234322961271292, + "grad_norm": 2.324589729309082, + "learning_rate": 4.992151964071776e-05, + "loss": 5.7138, + "step": 4243 + }, + { + "epoch": 0.025240270244552287, + "grad_norm": 2.01705002784729, + "learning_rate": 4.9921482654155506e-05, + "loss": 5.6946, + "step": 4244 + }, + { + "epoch": 0.025246217527833285, + "grad_norm": 2.0912036895751953, + "learning_rate": 4.9921445658893414e-05, + "loss": 5.8085, + "step": 4245 + }, + { + "epoch": 0.025252164811114284, + "grad_norm": 2.03450870513916, + "learning_rate": 4.99214086549315e-05, + "loss": 5.9129, + "step": 4246 + }, + { + "epoch": 0.02525811209439528, + "grad_norm": 2.1532092094421387, + "learning_rate": 4.9921371642269786e-05, + "loss": 5.708, + "step": 4247 + }, + { + "epoch": 0.025264059377676278, + "grad_norm": 2.2842540740966797, + "learning_rate": 4.992133462090828e-05, + "loss": 5.6693, + "step": 4248 + }, + { + "epoch": 0.025270006660957276, + "grad_norm": 2.0693325996398926, + "learning_rate": 4.9921297590846997e-05, + "loss": 5.7278, + "step": 4249 + }, + { + "epoch": 0.02527595394423827, + "grad_norm": 2.0139124393463135, + "learning_rate": 4.9921260552085934e-05, + "loss": 5.5897, + "step": 4250 + }, + { + "epoch": 0.02528190122751927, + "grad_norm": 2.4587321281433105, + "learning_rate": 4.9921223504625125e-05, + "loss": 5.6884, + "step": 4251 + }, + { + "epoch": 0.025287848510800265, + "grad_norm": 2.062640428543091, + "learning_rate": 4.992118644846457e-05, + "loss": 5.6189, + "step": 4252 + }, + { + "epoch": 0.025293795794081263, + "grad_norm": 1.9889299869537354, + "learning_rate": 4.992114938360429e-05, + "loss": 5.7326, + "step": 4253 + }, + { + "epoch": 0.025299743077362262, + "grad_norm": 2.001913547515869, + "learning_rate": 4.992111231004429e-05, + "loss": 5.6765, + "step": 4254 + }, + { + "epoch": 0.025305690360643257, + "grad_norm": 2.0345358848571777, + "learning_rate": 4.992107522778459e-05, + "loss": 5.5783, + "step": 4255 + }, + { + "epoch": 0.025311637643924256, + "grad_norm": 2.277817487716675, + "learning_rate": 4.9921038136825205e-05, + "loss": 5.6672, + "step": 4256 + }, + { + "epoch": 0.025317584927205254, + "grad_norm": 1.8992491960525513, + "learning_rate": 4.992100103716614e-05, + "loss": 5.532, + "step": 4257 + }, + { + "epoch": 0.02532353221048625, + "grad_norm": 2.202746629714966, + "learning_rate": 4.992096392880741e-05, + "loss": 5.697, + "step": 4258 + }, + { + "epoch": 0.025329479493767248, + "grad_norm": 2.020514488220215, + "learning_rate": 4.992092681174903e-05, + "loss": 5.9102, + "step": 4259 + }, + { + "epoch": 0.025335426777048243, + "grad_norm": 2.0697989463806152, + "learning_rate": 4.9920889685991e-05, + "loss": 5.5165, + "step": 4260 + }, + { + "epoch": 0.02534137406032924, + "grad_norm": 2.619258165359497, + "learning_rate": 4.992085255153336e-05, + "loss": 5.6577, + "step": 4261 + }, + { + "epoch": 0.02534732134361024, + "grad_norm": 2.1612637042999268, + "learning_rate": 4.99208154083761e-05, + "loss": 5.8193, + "step": 4262 + }, + { + "epoch": 0.025353268626891235, + "grad_norm": 1.9237465858459473, + "learning_rate": 4.9920778256519244e-05, + "loss": 5.6533, + "step": 4263 + }, + { + "epoch": 0.025359215910172234, + "grad_norm": 2.164339065551758, + "learning_rate": 4.99207410959628e-05, + "loss": 5.5566, + "step": 4264 + }, + { + "epoch": 0.025365163193453232, + "grad_norm": 2.0753626823425293, + "learning_rate": 4.992070392670678e-05, + "loss": 5.8444, + "step": 4265 + }, + { + "epoch": 0.025371110476734227, + "grad_norm": 1.977522850036621, + "learning_rate": 4.992066674875121e-05, + "loss": 5.6615, + "step": 4266 + }, + { + "epoch": 0.025377057760015226, + "grad_norm": 1.9911431074142456, + "learning_rate": 4.992062956209608e-05, + "loss": 5.6366, + "step": 4267 + }, + { + "epoch": 0.02538300504329622, + "grad_norm": 2.0334808826446533, + "learning_rate": 4.992059236674142e-05, + "loss": 5.8399, + "step": 4268 + }, + { + "epoch": 0.02538895232657722, + "grad_norm": 2.2869162559509277, + "learning_rate": 4.992055516268724e-05, + "loss": 5.7302, + "step": 4269 + }, + { + "epoch": 0.025394899609858218, + "grad_norm": 2.0845389366149902, + "learning_rate": 4.9920517949933556e-05, + "loss": 5.619, + "step": 4270 + }, + { + "epoch": 0.025400846893139213, + "grad_norm": 2.290881633758545, + "learning_rate": 4.9920480728480376e-05, + "loss": 5.5629, + "step": 4271 + }, + { + "epoch": 0.02540679417642021, + "grad_norm": 2.0897767543792725, + "learning_rate": 4.9920443498327706e-05, + "loss": 5.7009, + "step": 4272 + }, + { + "epoch": 0.025412741459701207, + "grad_norm": 1.8389668464660645, + "learning_rate": 4.9920406259475574e-05, + "loss": 5.6359, + "step": 4273 + }, + { + "epoch": 0.025418688742982205, + "grad_norm": 2.0262937545776367, + "learning_rate": 4.992036901192399e-05, + "loss": 5.6707, + "step": 4274 + }, + { + "epoch": 0.025424636026263204, + "grad_norm": 2.04280686378479, + "learning_rate": 4.992033175567295e-05, + "loss": 5.7917, + "step": 4275 + }, + { + "epoch": 0.0254305833095442, + "grad_norm": 2.0945205688476562, + "learning_rate": 4.992029449072249e-05, + "loss": 5.7208, + "step": 4276 + }, + { + "epoch": 0.025436530592825198, + "grad_norm": 1.9662036895751953, + "learning_rate": 4.992025721707261e-05, + "loss": 5.7141, + "step": 4277 + }, + { + "epoch": 0.025442477876106196, + "grad_norm": 2.582284450531006, + "learning_rate": 4.9920219934723316e-05, + "loss": 5.9514, + "step": 4278 + }, + { + "epoch": 0.02544842515938719, + "grad_norm": 1.9792051315307617, + "learning_rate": 4.992018264367464e-05, + "loss": 5.3867, + "step": 4279 + }, + { + "epoch": 0.02545437244266819, + "grad_norm": 2.0107717514038086, + "learning_rate": 4.992014534392658e-05, + "loss": 5.5985, + "step": 4280 + }, + { + "epoch": 0.025460319725949185, + "grad_norm": 2.2035727500915527, + "learning_rate": 4.9920108035479166e-05, + "loss": 5.6356, + "step": 4281 + }, + { + "epoch": 0.025466267009230183, + "grad_norm": 2.1973958015441895, + "learning_rate": 4.992007071833239e-05, + "loss": 5.3557, + "step": 4282 + }, + { + "epoch": 0.025472214292511182, + "grad_norm": 2.031371831893921, + "learning_rate": 4.9920033392486275e-05, + "loss": 5.484, + "step": 4283 + }, + { + "epoch": 0.025478161575792177, + "grad_norm": 1.9966185092926025, + "learning_rate": 4.991999605794084e-05, + "loss": 5.4137, + "step": 4284 + }, + { + "epoch": 0.025484108859073176, + "grad_norm": 1.699460506439209, + "learning_rate": 4.9919958714696085e-05, + "loss": 5.7099, + "step": 4285 + }, + { + "epoch": 0.025490056142354174, + "grad_norm": 2.270535945892334, + "learning_rate": 4.991992136275203e-05, + "loss": 5.6654, + "step": 4286 + }, + { + "epoch": 0.02549600342563517, + "grad_norm": 2.0636515617370605, + "learning_rate": 4.99198840021087e-05, + "loss": 5.6996, + "step": 4287 + }, + { + "epoch": 0.025501950708916168, + "grad_norm": 2.217365026473999, + "learning_rate": 4.991984663276608e-05, + "loss": 5.6148, + "step": 4288 + }, + { + "epoch": 0.025507897992197163, + "grad_norm": 2.182109832763672, + "learning_rate": 4.99198092547242e-05, + "loss": 5.6469, + "step": 4289 + }, + { + "epoch": 0.02551384527547816, + "grad_norm": 1.995924472808838, + "learning_rate": 4.9919771867983084e-05, + "loss": 5.7607, + "step": 4290 + }, + { + "epoch": 0.02551979255875916, + "grad_norm": 1.9308382272720337, + "learning_rate": 4.991973447254272e-05, + "loss": 5.7219, + "step": 4291 + }, + { + "epoch": 0.025525739842040155, + "grad_norm": 2.2675700187683105, + "learning_rate": 4.991969706840315e-05, + "loss": 5.7348, + "step": 4292 + }, + { + "epoch": 0.025531687125321154, + "grad_norm": 2.0441880226135254, + "learning_rate": 4.991965965556435e-05, + "loss": 5.5827, + "step": 4293 + }, + { + "epoch": 0.025537634408602152, + "grad_norm": 2.0111331939697266, + "learning_rate": 4.9919622234026376e-05, + "loss": 5.5355, + "step": 4294 + }, + { + "epoch": 0.025543581691883147, + "grad_norm": 2.214946985244751, + "learning_rate": 4.991958480378921e-05, + "loss": 5.5327, + "step": 4295 + }, + { + "epoch": 0.025549528975164146, + "grad_norm": 1.9673919677734375, + "learning_rate": 4.991954736485287e-05, + "loss": 5.5744, + "step": 4296 + }, + { + "epoch": 0.02555547625844514, + "grad_norm": 2.0662097930908203, + "learning_rate": 4.991950991721738e-05, + "loss": 5.5301, + "step": 4297 + }, + { + "epoch": 0.02556142354172614, + "grad_norm": 2.1912949085235596, + "learning_rate": 4.991947246088274e-05, + "loss": 5.6505, + "step": 4298 + }, + { + "epoch": 0.025567370825007138, + "grad_norm": 2.1073548793792725, + "learning_rate": 4.991943499584898e-05, + "loss": 5.7429, + "step": 4299 + }, + { + "epoch": 0.025573318108288133, + "grad_norm": 2.4015331268310547, + "learning_rate": 4.9919397522116096e-05, + "loss": 5.9959, + "step": 4300 + }, + { + "epoch": 0.02557926539156913, + "grad_norm": 2.5571470260620117, + "learning_rate": 4.99193600396841e-05, + "loss": 5.9058, + "step": 4301 + }, + { + "epoch": 0.02558521267485013, + "grad_norm": 2.148449182510376, + "learning_rate": 4.9919322548553026e-05, + "loss": 5.6298, + "step": 4302 + }, + { + "epoch": 0.025591159958131125, + "grad_norm": 2.3006222248077393, + "learning_rate": 4.991928504872287e-05, + "loss": 5.4854, + "step": 4303 + }, + { + "epoch": 0.025597107241412124, + "grad_norm": 2.2384679317474365, + "learning_rate": 4.9919247540193646e-05, + "loss": 5.7089, + "step": 4304 + }, + { + "epoch": 0.02560305452469312, + "grad_norm": 2.195736885070801, + "learning_rate": 4.9919210022965376e-05, + "loss": 5.986, + "step": 4305 + }, + { + "epoch": 0.025609001807974117, + "grad_norm": 2.3446342945098877, + "learning_rate": 4.991917249703806e-05, + "loss": 5.88, + "step": 4306 + }, + { + "epoch": 0.025614949091255116, + "grad_norm": 2.3800623416900635, + "learning_rate": 4.9919134962411724e-05, + "loss": 5.6897, + "step": 4307 + }, + { + "epoch": 0.02562089637453611, + "grad_norm": 1.8407396078109741, + "learning_rate": 4.991909741908637e-05, + "loss": 5.7359, + "step": 4308 + }, + { + "epoch": 0.02562684365781711, + "grad_norm": 2.3566956520080566, + "learning_rate": 4.9919059867062026e-05, + "loss": 5.5606, + "step": 4309 + }, + { + "epoch": 0.025632790941098105, + "grad_norm": 2.149317741394043, + "learning_rate": 4.991902230633869e-05, + "loss": 5.6966, + "step": 4310 + }, + { + "epoch": 0.025638738224379103, + "grad_norm": 2.3567728996276855, + "learning_rate": 4.991898473691638e-05, + "loss": 5.4694, + "step": 4311 + }, + { + "epoch": 0.025644685507660102, + "grad_norm": 1.9388068914413452, + "learning_rate": 4.9918947158795106e-05, + "loss": 5.5947, + "step": 4312 + }, + { + "epoch": 0.025650632790941097, + "grad_norm": 1.844419002532959, + "learning_rate": 4.9918909571974893e-05, + "loss": 5.6159, + "step": 4313 + }, + { + "epoch": 0.025656580074222095, + "grad_norm": 1.8664250373840332, + "learning_rate": 4.991887197645574e-05, + "loss": 5.7211, + "step": 4314 + }, + { + "epoch": 0.025662527357503094, + "grad_norm": 2.073004961013794, + "learning_rate": 4.991883437223767e-05, + "loss": 5.8873, + "step": 4315 + }, + { + "epoch": 0.02566847464078409, + "grad_norm": 2.316938877105713, + "learning_rate": 4.991879675932068e-05, + "loss": 5.4372, + "step": 4316 + }, + { + "epoch": 0.025674421924065088, + "grad_norm": 2.2646546363830566, + "learning_rate": 4.991875913770481e-05, + "loss": 5.5486, + "step": 4317 + }, + { + "epoch": 0.025680369207346083, + "grad_norm": 2.2417361736297607, + "learning_rate": 4.991872150739005e-05, + "loss": 5.2264, + "step": 4318 + }, + { + "epoch": 0.02568631649062708, + "grad_norm": 2.271566867828369, + "learning_rate": 4.9918683868376437e-05, + "loss": 5.1546, + "step": 4319 + }, + { + "epoch": 0.02569226377390808, + "grad_norm": 2.211650848388672, + "learning_rate": 4.9918646220663954e-05, + "loss": 5.382, + "step": 4320 + }, + { + "epoch": 0.025698211057189075, + "grad_norm": 2.3627288341522217, + "learning_rate": 4.991860856425263e-05, + "loss": 5.6099, + "step": 4321 + }, + { + "epoch": 0.025704158340470074, + "grad_norm": 2.3968141078948975, + "learning_rate": 4.991857089914249e-05, + "loss": 5.3689, + "step": 4322 + }, + { + "epoch": 0.025710105623751072, + "grad_norm": 2.3576786518096924, + "learning_rate": 4.991853322533352e-05, + "loss": 5.4441, + "step": 4323 + }, + { + "epoch": 0.025716052907032067, + "grad_norm": 2.0814530849456787, + "learning_rate": 4.991849554282575e-05, + "loss": 5.6137, + "step": 4324 + }, + { + "epoch": 0.025722000190313066, + "grad_norm": 2.103505849838257, + "learning_rate": 4.991845785161919e-05, + "loss": 5.5518, + "step": 4325 + }, + { + "epoch": 0.02572794747359406, + "grad_norm": 2.188350200653076, + "learning_rate": 4.991842015171386e-05, + "loss": 5.5958, + "step": 4326 + }, + { + "epoch": 0.02573389475687506, + "grad_norm": 2.124088764190674, + "learning_rate": 4.9918382443109766e-05, + "loss": 5.3851, + "step": 4327 + }, + { + "epoch": 0.025739842040156058, + "grad_norm": 2.181466579437256, + "learning_rate": 4.991834472580692e-05, + "loss": 5.4629, + "step": 4328 + }, + { + "epoch": 0.025745789323437053, + "grad_norm": 1.9634013175964355, + "learning_rate": 4.9918306999805344e-05, + "loss": 5.4768, + "step": 4329 + }, + { + "epoch": 0.02575173660671805, + "grad_norm": 2.2046115398406982, + "learning_rate": 4.991826926510503e-05, + "loss": 5.3977, + "step": 4330 + }, + { + "epoch": 0.02575768388999905, + "grad_norm": 1.8660465478897095, + "learning_rate": 4.9918231521706014e-05, + "loss": 5.4837, + "step": 4331 + }, + { + "epoch": 0.025763631173280045, + "grad_norm": 1.9825572967529297, + "learning_rate": 4.99181937696083e-05, + "loss": 5.5158, + "step": 4332 + }, + { + "epoch": 0.025769578456561044, + "grad_norm": 1.9114030599594116, + "learning_rate": 4.9918156008811906e-05, + "loss": 5.3291, + "step": 4333 + }, + { + "epoch": 0.02577552573984204, + "grad_norm": 2.008059024810791, + "learning_rate": 4.9918118239316835e-05, + "loss": 5.2993, + "step": 4334 + }, + { + "epoch": 0.025781473023123037, + "grad_norm": 2.0090153217315674, + "learning_rate": 4.991808046112311e-05, + "loss": 5.2951, + "step": 4335 + }, + { + "epoch": 0.025787420306404036, + "grad_norm": 2.013878345489502, + "learning_rate": 4.991804267423074e-05, + "loss": 5.3491, + "step": 4336 + }, + { + "epoch": 0.02579336758968503, + "grad_norm": 2.1889898777008057, + "learning_rate": 4.9918004878639734e-05, + "loss": 5.2744, + "step": 4337 + }, + { + "epoch": 0.02579931487296603, + "grad_norm": 1.9945006370544434, + "learning_rate": 4.991796707435012e-05, + "loss": 5.5176, + "step": 4338 + }, + { + "epoch": 0.025805262156247025, + "grad_norm": 2.1205811500549316, + "learning_rate": 4.9917929261361894e-05, + "loss": 5.6534, + "step": 4339 + }, + { + "epoch": 0.025811209439528023, + "grad_norm": 2.6607353687286377, + "learning_rate": 4.991789143967508e-05, + "loss": 6.343, + "step": 4340 + }, + { + "epoch": 0.025817156722809022, + "grad_norm": 2.241818904876709, + "learning_rate": 4.991785360928968e-05, + "loss": 5.6774, + "step": 4341 + }, + { + "epoch": 0.025823104006090017, + "grad_norm": 1.9817326068878174, + "learning_rate": 4.9917815770205723e-05, + "loss": 5.7686, + "step": 4342 + }, + { + "epoch": 0.025829051289371015, + "grad_norm": 2.323802947998047, + "learning_rate": 4.991777792242321e-05, + "loss": 5.9564, + "step": 4343 + }, + { + "epoch": 0.025834998572652014, + "grad_norm": 2.3318228721618652, + "learning_rate": 4.991774006594216e-05, + "loss": 5.9057, + "step": 4344 + }, + { + "epoch": 0.02584094585593301, + "grad_norm": 2.032776355743408, + "learning_rate": 4.991770220076258e-05, + "loss": 5.9753, + "step": 4345 + }, + { + "epoch": 0.025846893139214008, + "grad_norm": 2.116837739944458, + "learning_rate": 4.9917664326884495e-05, + "loss": 5.8458, + "step": 4346 + }, + { + "epoch": 0.025852840422495003, + "grad_norm": 2.312878370285034, + "learning_rate": 4.991762644430791e-05, + "loss": 5.5128, + "step": 4347 + }, + { + "epoch": 0.025858787705776, + "grad_norm": 2.3003859519958496, + "learning_rate": 4.991758855303283e-05, + "loss": 5.7192, + "step": 4348 + }, + { + "epoch": 0.025864734989057, + "grad_norm": 1.898258924484253, + "learning_rate": 4.9917550653059286e-05, + "loss": 5.6422, + "step": 4349 + }, + { + "epoch": 0.025870682272337995, + "grad_norm": 1.9477754831314087, + "learning_rate": 4.9917512744387276e-05, + "loss": 5.7885, + "step": 4350 + }, + { + "epoch": 0.025876629555618993, + "grad_norm": 2.479979991912842, + "learning_rate": 4.991747482701683e-05, + "loss": 5.4692, + "step": 4351 + }, + { + "epoch": 0.025882576838899992, + "grad_norm": 2.324336290359497, + "learning_rate": 4.991743690094794e-05, + "loss": 5.4186, + "step": 4352 + }, + { + "epoch": 0.025888524122180987, + "grad_norm": 2.076723337173462, + "learning_rate": 4.9917398966180625e-05, + "loss": 5.4363, + "step": 4353 + }, + { + "epoch": 0.025894471405461986, + "grad_norm": 1.9004534482955933, + "learning_rate": 4.991736102271492e-05, + "loss": 5.6451, + "step": 4354 + }, + { + "epoch": 0.02590041868874298, + "grad_norm": 1.8098558187484741, + "learning_rate": 4.991732307055082e-05, + "loss": 5.8666, + "step": 4355 + }, + { + "epoch": 0.02590636597202398, + "grad_norm": 2.1158571243286133, + "learning_rate": 4.991728510968833e-05, + "loss": 5.5421, + "step": 4356 + }, + { + "epoch": 0.025912313255304978, + "grad_norm": 2.1235690116882324, + "learning_rate": 4.991724714012748e-05, + "loss": 5.9947, + "step": 4357 + }, + { + "epoch": 0.025918260538585973, + "grad_norm": 2.1306662559509277, + "learning_rate": 4.9917209161868276e-05, + "loss": 5.4648, + "step": 4358 + }, + { + "epoch": 0.02592420782186697, + "grad_norm": 1.7927355766296387, + "learning_rate": 4.991717117491073e-05, + "loss": 5.4339, + "step": 4359 + }, + { + "epoch": 0.02593015510514797, + "grad_norm": 2.314069986343384, + "learning_rate": 4.991713317925485e-05, + "loss": 5.5534, + "step": 4360 + }, + { + "epoch": 0.025936102388428965, + "grad_norm": 2.2628493309020996, + "learning_rate": 4.9917095174900665e-05, + "loss": 5.5996, + "step": 4361 + }, + { + "epoch": 0.025942049671709964, + "grad_norm": 2.1669869422912598, + "learning_rate": 4.991705716184818e-05, + "loss": 5.704, + "step": 4362 + }, + { + "epoch": 0.02594799695499096, + "grad_norm": 2.2048137187957764, + "learning_rate": 4.99170191400974e-05, + "loss": 5.6576, + "step": 4363 + }, + { + "epoch": 0.025953944238271957, + "grad_norm": 2.172398328781128, + "learning_rate": 4.991698110964835e-05, + "loss": 5.7254, + "step": 4364 + }, + { + "epoch": 0.025959891521552956, + "grad_norm": 1.9689068794250488, + "learning_rate": 4.9916943070501047e-05, + "loss": 5.7303, + "step": 4365 + }, + { + "epoch": 0.02596583880483395, + "grad_norm": 1.7037044763565063, + "learning_rate": 4.991690502265549e-05, + "loss": 5.6542, + "step": 4366 + }, + { + "epoch": 0.02597178608811495, + "grad_norm": 1.7666655778884888, + "learning_rate": 4.9916866966111695e-05, + "loss": 5.7833, + "step": 4367 + }, + { + "epoch": 0.025977733371395945, + "grad_norm": 2.0178141593933105, + "learning_rate": 4.991682890086968e-05, + "loss": 5.7759, + "step": 4368 + }, + { + "epoch": 0.025983680654676943, + "grad_norm": 1.7989983558654785, + "learning_rate": 4.991679082692946e-05, + "loss": 5.8772, + "step": 4369 + }, + { + "epoch": 0.025989627937957942, + "grad_norm": 1.8004199266433716, + "learning_rate": 4.9916752744291054e-05, + "loss": 5.6145, + "step": 4370 + }, + { + "epoch": 0.025995575221238937, + "grad_norm": 1.837074637413025, + "learning_rate": 4.991671465295446e-05, + "loss": 5.4874, + "step": 4371 + }, + { + "epoch": 0.026001522504519935, + "grad_norm": 1.7436491250991821, + "learning_rate": 4.991667655291969e-05, + "loss": 5.7212, + "step": 4372 + }, + { + "epoch": 0.026007469787800934, + "grad_norm": 1.7802095413208008, + "learning_rate": 4.991663844418678e-05, + "loss": 5.7004, + "step": 4373 + }, + { + "epoch": 0.02601341707108193, + "grad_norm": 2.112487316131592, + "learning_rate": 4.991660032675572e-05, + "loss": 5.5579, + "step": 4374 + }, + { + "epoch": 0.026019364354362928, + "grad_norm": 2.0917413234710693, + "learning_rate": 4.9916562200626535e-05, + "loss": 5.7825, + "step": 4375 + }, + { + "epoch": 0.026025311637643923, + "grad_norm": 1.8323053121566772, + "learning_rate": 4.991652406579924e-05, + "loss": 5.7699, + "step": 4376 + }, + { + "epoch": 0.02603125892092492, + "grad_norm": 1.9480723142623901, + "learning_rate": 4.9916485922273835e-05, + "loss": 5.6591, + "step": 4377 + }, + { + "epoch": 0.02603720620420592, + "grad_norm": 2.000739812850952, + "learning_rate": 4.991644777005035e-05, + "loss": 5.8919, + "step": 4378 + }, + { + "epoch": 0.026043153487486915, + "grad_norm": 2.093573808670044, + "learning_rate": 4.991640960912879e-05, + "loss": 5.7357, + "step": 4379 + }, + { + "epoch": 0.026049100770767913, + "grad_norm": 1.932019591331482, + "learning_rate": 4.991637143950916e-05, + "loss": 5.7268, + "step": 4380 + }, + { + "epoch": 0.026055048054048912, + "grad_norm": 1.820102572441101, + "learning_rate": 4.991633326119149e-05, + "loss": 5.8733, + "step": 4381 + }, + { + "epoch": 0.026060995337329907, + "grad_norm": 1.9091769456863403, + "learning_rate": 4.991629507417578e-05, + "loss": 5.5532, + "step": 4382 + }, + { + "epoch": 0.026066942620610906, + "grad_norm": 2.0037779808044434, + "learning_rate": 4.991625687846205e-05, + "loss": 5.7841, + "step": 4383 + }, + { + "epoch": 0.0260728899038919, + "grad_norm": 1.7106568813323975, + "learning_rate": 4.991621867405032e-05, + "loss": 5.4486, + "step": 4384 + }, + { + "epoch": 0.0260788371871729, + "grad_norm": 1.7802643775939941, + "learning_rate": 4.9916180460940585e-05, + "loss": 5.7494, + "step": 4385 + }, + { + "epoch": 0.026084784470453898, + "grad_norm": 2.089503288269043, + "learning_rate": 4.991614223913288e-05, + "loss": 5.6044, + "step": 4386 + }, + { + "epoch": 0.026090731753734893, + "grad_norm": 2.3315577507019043, + "learning_rate": 4.99161040086272e-05, + "loss": 5.9552, + "step": 4387 + }, + { + "epoch": 0.02609667903701589, + "grad_norm": 2.1202025413513184, + "learning_rate": 4.9916065769423566e-05, + "loss": 5.778, + "step": 4388 + }, + { + "epoch": 0.02610262632029689, + "grad_norm": 2.3448777198791504, + "learning_rate": 4.991602752152199e-05, + "loss": 5.8014, + "step": 4389 + }, + { + "epoch": 0.026108573603577885, + "grad_norm": 2.1613330841064453, + "learning_rate": 4.9915989264922495e-05, + "loss": 5.731, + "step": 4390 + }, + { + "epoch": 0.026114520886858884, + "grad_norm": 2.0314743518829346, + "learning_rate": 4.991595099962507e-05, + "loss": 5.8181, + "step": 4391 + }, + { + "epoch": 0.02612046817013988, + "grad_norm": 2.053994655609131, + "learning_rate": 4.9915912725629755e-05, + "loss": 5.7264, + "step": 4392 + }, + { + "epoch": 0.026126415453420877, + "grad_norm": 1.8720483779907227, + "learning_rate": 4.991587444293655e-05, + "loss": 5.5229, + "step": 4393 + }, + { + "epoch": 0.026132362736701876, + "grad_norm": 1.8745067119598389, + "learning_rate": 4.991583615154547e-05, + "loss": 5.612, + "step": 4394 + }, + { + "epoch": 0.02613831001998287, + "grad_norm": 2.124157428741455, + "learning_rate": 4.9915797851456525e-05, + "loss": 5.7276, + "step": 4395 + }, + { + "epoch": 0.02614425730326387, + "grad_norm": 2.2587873935699463, + "learning_rate": 4.991575954266974e-05, + "loss": 5.7994, + "step": 4396 + }, + { + "epoch": 0.026150204586544865, + "grad_norm": 1.9030078649520874, + "learning_rate": 4.9915721225185116e-05, + "loss": 5.7491, + "step": 4397 + }, + { + "epoch": 0.026156151869825863, + "grad_norm": 2.2278738021850586, + "learning_rate": 4.991568289900267e-05, + "loss": 5.4701, + "step": 4398 + }, + { + "epoch": 0.02616209915310686, + "grad_norm": 2.190974473953247, + "learning_rate": 4.991564456412242e-05, + "loss": 5.6731, + "step": 4399 + }, + { + "epoch": 0.026168046436387857, + "grad_norm": 2.3491454124450684, + "learning_rate": 4.991560622054438e-05, + "loss": 5.4041, + "step": 4400 + }, + { + "epoch": 0.026173993719668855, + "grad_norm": 2.2767796516418457, + "learning_rate": 4.991556786826854e-05, + "loss": 5.9005, + "step": 4401 + }, + { + "epoch": 0.026179941002949854, + "grad_norm": 2.3645145893096924, + "learning_rate": 4.991552950729496e-05, + "loss": 6.3108, + "step": 4402 + }, + { + "epoch": 0.02618588828623085, + "grad_norm": 2.1715476512908936, + "learning_rate": 4.9915491137623605e-05, + "loss": 5.8186, + "step": 4403 + }, + { + "epoch": 0.026191835569511848, + "grad_norm": 2.195758581161499, + "learning_rate": 4.991545275925452e-05, + "loss": 5.692, + "step": 4404 + }, + { + "epoch": 0.026197782852792843, + "grad_norm": 2.1124489307403564, + "learning_rate": 4.9915414372187705e-05, + "loss": 5.6582, + "step": 4405 + }, + { + "epoch": 0.02620373013607384, + "grad_norm": 1.9873831272125244, + "learning_rate": 4.991537597642317e-05, + "loss": 5.6309, + "step": 4406 + }, + { + "epoch": 0.02620967741935484, + "grad_norm": 1.9675770998001099, + "learning_rate": 4.991533757196094e-05, + "loss": 5.7095, + "step": 4407 + }, + { + "epoch": 0.026215624702635835, + "grad_norm": 1.9072648286819458, + "learning_rate": 4.991529915880103e-05, + "loss": 5.6449, + "step": 4408 + }, + { + "epoch": 0.026221571985916833, + "grad_norm": 2.3060495853424072, + "learning_rate": 4.9915260736943435e-05, + "loss": 5.6712, + "step": 4409 + }, + { + "epoch": 0.026227519269197832, + "grad_norm": 2.4438107013702393, + "learning_rate": 4.991522230638819e-05, + "loss": 5.2384, + "step": 4410 + }, + { + "epoch": 0.026233466552478827, + "grad_norm": 1.8102613687515259, + "learning_rate": 4.991518386713529e-05, + "loss": 5.5508, + "step": 4411 + }, + { + "epoch": 0.026239413835759826, + "grad_norm": 2.0226693153381348, + "learning_rate": 4.991514541918476e-05, + "loss": 5.4049, + "step": 4412 + }, + { + "epoch": 0.02624536111904082, + "grad_norm": 2.261418104171753, + "learning_rate": 4.991510696253661e-05, + "loss": 5.3324, + "step": 4413 + }, + { + "epoch": 0.02625130840232182, + "grad_norm": 2.232844352722168, + "learning_rate": 4.9915068497190856e-05, + "loss": 5.2601, + "step": 4414 + }, + { + "epoch": 0.026257255685602818, + "grad_norm": 2.2306487560272217, + "learning_rate": 4.99150300231475e-05, + "loss": 5.3329, + "step": 4415 + }, + { + "epoch": 0.026263202968883813, + "grad_norm": 2.1368730068206787, + "learning_rate": 4.9914991540406574e-05, + "loss": 5.573, + "step": 4416 + }, + { + "epoch": 0.02626915025216481, + "grad_norm": 1.984078288078308, + "learning_rate": 4.991495304896808e-05, + "loss": 5.6518, + "step": 4417 + }, + { + "epoch": 0.02627509753544581, + "grad_norm": 2.0585875511169434, + "learning_rate": 4.9914914548832034e-05, + "loss": 5.7076, + "step": 4418 + }, + { + "epoch": 0.026281044818726805, + "grad_norm": 1.9880858659744263, + "learning_rate": 4.991487603999845e-05, + "loss": 5.6533, + "step": 4419 + }, + { + "epoch": 0.026286992102007804, + "grad_norm": 2.0475687980651855, + "learning_rate": 4.991483752246734e-05, + "loss": 5.6311, + "step": 4420 + }, + { + "epoch": 0.0262929393852888, + "grad_norm": 2.2796714305877686, + "learning_rate": 4.991479899623871e-05, + "loss": 5.364, + "step": 4421 + }, + { + "epoch": 0.026298886668569797, + "grad_norm": 1.8535730838775635, + "learning_rate": 4.991476046131259e-05, + "loss": 5.6153, + "step": 4422 + }, + { + "epoch": 0.026304833951850796, + "grad_norm": 1.97511887550354, + "learning_rate": 4.9914721917688976e-05, + "loss": 5.5682, + "step": 4423 + }, + { + "epoch": 0.02631078123513179, + "grad_norm": 1.9052705764770508, + "learning_rate": 4.99146833653679e-05, + "loss": 5.5609, + "step": 4424 + }, + { + "epoch": 0.02631672851841279, + "grad_norm": 1.9997434616088867, + "learning_rate": 4.9914644804349356e-05, + "loss": 5.6196, + "step": 4425 + }, + { + "epoch": 0.026322675801693788, + "grad_norm": 1.6116957664489746, + "learning_rate": 4.991460623463337e-05, + "loss": 5.5003, + "step": 4426 + }, + { + "epoch": 0.026328623084974783, + "grad_norm": 1.8156583309173584, + "learning_rate": 4.991456765621996e-05, + "loss": 5.5875, + "step": 4427 + }, + { + "epoch": 0.02633457036825578, + "grad_norm": 2.0364272594451904, + "learning_rate": 4.991452906910912e-05, + "loss": 5.6541, + "step": 4428 + }, + { + "epoch": 0.026340517651536777, + "grad_norm": 1.8430767059326172, + "learning_rate": 4.991449047330088e-05, + "loss": 5.5408, + "step": 4429 + }, + { + "epoch": 0.026346464934817775, + "grad_norm": 2.049476385116577, + "learning_rate": 4.991445186879525e-05, + "loss": 5.5644, + "step": 4430 + }, + { + "epoch": 0.026352412218098774, + "grad_norm": 1.9186240434646606, + "learning_rate": 4.991441325559224e-05, + "loss": 5.5977, + "step": 4431 + }, + { + "epoch": 0.02635835950137977, + "grad_norm": 1.80244779586792, + "learning_rate": 4.991437463369186e-05, + "loss": 5.5114, + "step": 4432 + }, + { + "epoch": 0.026364306784660767, + "grad_norm": 2.2580177783966064, + "learning_rate": 4.991433600309414e-05, + "loss": 5.4132, + "step": 4433 + }, + { + "epoch": 0.026370254067941763, + "grad_norm": 2.0970637798309326, + "learning_rate": 4.991429736379908e-05, + "loss": 5.6211, + "step": 4434 + }, + { + "epoch": 0.02637620135122276, + "grad_norm": 2.0690932273864746, + "learning_rate": 4.9914258715806696e-05, + "loss": 5.6511, + "step": 4435 + }, + { + "epoch": 0.02638214863450376, + "grad_norm": 2.063052177429199, + "learning_rate": 4.9914220059117e-05, + "loss": 5.5169, + "step": 4436 + }, + { + "epoch": 0.026388095917784755, + "grad_norm": 1.990708827972412, + "learning_rate": 4.991418139373001e-05, + "loss": 5.5018, + "step": 4437 + }, + { + "epoch": 0.026394043201065753, + "grad_norm": 2.1311633586883545, + "learning_rate": 4.9914142719645736e-05, + "loss": 5.4714, + "step": 4438 + }, + { + "epoch": 0.026399990484346752, + "grad_norm": 1.7688508033752441, + "learning_rate": 4.991410403686419e-05, + "loss": 5.5208, + "step": 4439 + }, + { + "epoch": 0.026405937767627747, + "grad_norm": 2.3486130237579346, + "learning_rate": 4.9914065345385383e-05, + "loss": 5.4524, + "step": 4440 + }, + { + "epoch": 0.026411885050908745, + "grad_norm": 2.0333707332611084, + "learning_rate": 4.9914026645209344e-05, + "loss": 5.6747, + "step": 4441 + }, + { + "epoch": 0.02641783233418974, + "grad_norm": 1.8731845617294312, + "learning_rate": 4.991398793633607e-05, + "loss": 5.6436, + "step": 4442 + }, + { + "epoch": 0.02642377961747074, + "grad_norm": 2.003361225128174, + "learning_rate": 4.991394921876558e-05, + "loss": 5.4628, + "step": 4443 + }, + { + "epoch": 0.026429726900751738, + "grad_norm": 2.1195411682128906, + "learning_rate": 4.991391049249789e-05, + "loss": 5.4096, + "step": 4444 + }, + { + "epoch": 0.026435674184032733, + "grad_norm": 1.857364535331726, + "learning_rate": 4.991387175753301e-05, + "loss": 5.3928, + "step": 4445 + }, + { + "epoch": 0.02644162146731373, + "grad_norm": 1.8932915925979614, + "learning_rate": 4.991383301387095e-05, + "loss": 5.4917, + "step": 4446 + }, + { + "epoch": 0.02644756875059473, + "grad_norm": 1.8743010759353638, + "learning_rate": 4.991379426151174e-05, + "loss": 5.6766, + "step": 4447 + }, + { + "epoch": 0.026453516033875725, + "grad_norm": 1.910796046257019, + "learning_rate": 4.991375550045537e-05, + "loss": 5.4347, + "step": 4448 + }, + { + "epoch": 0.026459463317156724, + "grad_norm": 1.7901744842529297, + "learning_rate": 4.991371673070187e-05, + "loss": 5.5339, + "step": 4449 + }, + { + "epoch": 0.02646541060043772, + "grad_norm": 1.86943519115448, + "learning_rate": 4.9913677952251244e-05, + "loss": 5.4867, + "step": 4450 + }, + { + "epoch": 0.026471357883718717, + "grad_norm": 1.8662208318710327, + "learning_rate": 4.991363916510352e-05, + "loss": 5.4992, + "step": 4451 + }, + { + "epoch": 0.026477305166999716, + "grad_norm": 1.7465355396270752, + "learning_rate": 4.99136003692587e-05, + "loss": 5.5243, + "step": 4452 + }, + { + "epoch": 0.02648325245028071, + "grad_norm": 1.9097687005996704, + "learning_rate": 4.9913561564716794e-05, + "loss": 5.5096, + "step": 4453 + }, + { + "epoch": 0.02648919973356171, + "grad_norm": 2.1472127437591553, + "learning_rate": 4.991352275147783e-05, + "loss": 5.4462, + "step": 4454 + }, + { + "epoch": 0.026495147016842708, + "grad_norm": 2.3966939449310303, + "learning_rate": 4.9913483929541806e-05, + "loss": 5.2938, + "step": 4455 + }, + { + "epoch": 0.026501094300123703, + "grad_norm": 2.1738977432250977, + "learning_rate": 4.991344509890874e-05, + "loss": 5.317, + "step": 4456 + }, + { + "epoch": 0.0265070415834047, + "grad_norm": 1.963944435119629, + "learning_rate": 4.9913406259578646e-05, + "loss": 5.3827, + "step": 4457 + }, + { + "epoch": 0.026512988866685697, + "grad_norm": 2.1755871772766113, + "learning_rate": 4.991336741155155e-05, + "loss": 5.2941, + "step": 4458 + }, + { + "epoch": 0.026518936149966695, + "grad_norm": 2.2461934089660645, + "learning_rate": 4.991332855482744e-05, + "loss": 5.3503, + "step": 4459 + }, + { + "epoch": 0.026524883433247694, + "grad_norm": 2.2270491123199463, + "learning_rate": 4.9913289689406355e-05, + "loss": 5.417, + "step": 4460 + }, + { + "epoch": 0.02653083071652869, + "grad_norm": 2.437074661254883, + "learning_rate": 4.991325081528829e-05, + "loss": 5.1938, + "step": 4461 + }, + { + "epoch": 0.026536777999809687, + "grad_norm": 2.159170150756836, + "learning_rate": 4.991321193247328e-05, + "loss": 5.2088, + "step": 4462 + }, + { + "epoch": 0.026542725283090682, + "grad_norm": 2.08797287940979, + "learning_rate": 4.9913173040961315e-05, + "loss": 5.1829, + "step": 4463 + }, + { + "epoch": 0.02654867256637168, + "grad_norm": 2.805191993713379, + "learning_rate": 4.991313414075242e-05, + "loss": 6.3049, + "step": 4464 + }, + { + "epoch": 0.02655461984965268, + "grad_norm": 2.3204843997955322, + "learning_rate": 4.991309523184661e-05, + "loss": 5.3831, + "step": 4465 + }, + { + "epoch": 0.026560567132933675, + "grad_norm": 2.217212200164795, + "learning_rate": 4.991305631424389e-05, + "loss": 5.4647, + "step": 4466 + }, + { + "epoch": 0.026566514416214673, + "grad_norm": 2.1094207763671875, + "learning_rate": 4.991301738794429e-05, + "loss": 5.5837, + "step": 4467 + }, + { + "epoch": 0.026572461699495672, + "grad_norm": 2.225660562515259, + "learning_rate": 4.99129784529478e-05, + "loss": 5.8316, + "step": 4468 + }, + { + "epoch": 0.026578408982776667, + "grad_norm": 2.361238956451416, + "learning_rate": 4.991293950925446e-05, + "loss": 5.8358, + "step": 4469 + }, + { + "epoch": 0.026584356266057665, + "grad_norm": 2.3268609046936035, + "learning_rate": 4.991290055686426e-05, + "loss": 5.732, + "step": 4470 + }, + { + "epoch": 0.02659030354933866, + "grad_norm": 2.1456172466278076, + "learning_rate": 4.9912861595777226e-05, + "loss": 5.9, + "step": 4471 + }, + { + "epoch": 0.02659625083261966, + "grad_norm": 2.114696979522705, + "learning_rate": 4.991282262599337e-05, + "loss": 5.4464, + "step": 4472 + }, + { + "epoch": 0.026602198115900658, + "grad_norm": 1.7981528043746948, + "learning_rate": 4.9912783647512705e-05, + "loss": 5.5053, + "step": 4473 + }, + { + "epoch": 0.026608145399181653, + "grad_norm": 1.9743404388427734, + "learning_rate": 4.9912744660335245e-05, + "loss": 5.5877, + "step": 4474 + }, + { + "epoch": 0.02661409268246265, + "grad_norm": 2.052358865737915, + "learning_rate": 4.991270566446101e-05, + "loss": 5.5891, + "step": 4475 + }, + { + "epoch": 0.02662003996574365, + "grad_norm": 2.1602041721343994, + "learning_rate": 4.991266665989e-05, + "loss": 5.581, + "step": 4476 + }, + { + "epoch": 0.026625987249024645, + "grad_norm": 2.241586685180664, + "learning_rate": 4.9912627646622236e-05, + "loss": 5.5375, + "step": 4477 + }, + { + "epoch": 0.026631934532305643, + "grad_norm": 1.7952601909637451, + "learning_rate": 4.991258862465773e-05, + "loss": 5.5273, + "step": 4478 + }, + { + "epoch": 0.02663788181558664, + "grad_norm": 1.9767752885818481, + "learning_rate": 4.991254959399649e-05, + "loss": 5.4476, + "step": 4479 + }, + { + "epoch": 0.026643829098867637, + "grad_norm": 1.7997682094573975, + "learning_rate": 4.991251055463855e-05, + "loss": 5.5666, + "step": 4480 + }, + { + "epoch": 0.026649776382148636, + "grad_norm": 2.3247575759887695, + "learning_rate": 4.9912471506583905e-05, + "loss": 5.5247, + "step": 4481 + }, + { + "epoch": 0.02665572366542963, + "grad_norm": 2.165900230407715, + "learning_rate": 4.991243244983257e-05, + "loss": 5.6807, + "step": 4482 + }, + { + "epoch": 0.02666167094871063, + "grad_norm": 2.598257303237915, + "learning_rate": 4.991239338438456e-05, + "loss": 5.6609, + "step": 4483 + }, + { + "epoch": 0.026667618231991628, + "grad_norm": 2.2752041816711426, + "learning_rate": 4.991235431023989e-05, + "loss": 5.5199, + "step": 4484 + }, + { + "epoch": 0.026673565515272623, + "grad_norm": 2.3482842445373535, + "learning_rate": 4.9912315227398586e-05, + "loss": 5.6438, + "step": 4485 + }, + { + "epoch": 0.02667951279855362, + "grad_norm": 2.034403085708618, + "learning_rate": 4.991227613586065e-05, + "loss": 5.6191, + "step": 4486 + }, + { + "epoch": 0.026685460081834617, + "grad_norm": 1.9002971649169922, + "learning_rate": 4.9912237035626085e-05, + "loss": 5.6627, + "step": 4487 + }, + { + "epoch": 0.026691407365115615, + "grad_norm": 2.0305564403533936, + "learning_rate": 4.9912197926694924e-05, + "loss": 5.7009, + "step": 4488 + }, + { + "epoch": 0.026697354648396614, + "grad_norm": 2.029777765274048, + "learning_rate": 4.991215880906717e-05, + "loss": 5.5201, + "step": 4489 + }, + { + "epoch": 0.02670330193167761, + "grad_norm": 1.8889492750167847, + "learning_rate": 4.991211968274283e-05, + "loss": 5.602, + "step": 4490 + }, + { + "epoch": 0.026709249214958607, + "grad_norm": 1.9616930484771729, + "learning_rate": 4.9912080547721934e-05, + "loss": 5.5352, + "step": 4491 + }, + { + "epoch": 0.026715196498239602, + "grad_norm": 2.449345827102661, + "learning_rate": 4.9912041404004485e-05, + "loss": 5.7103, + "step": 4492 + }, + { + "epoch": 0.0267211437815206, + "grad_norm": 2.5550389289855957, + "learning_rate": 4.991200225159051e-05, + "loss": 5.5593, + "step": 4493 + }, + { + "epoch": 0.0267270910648016, + "grad_norm": 2.2512362003326416, + "learning_rate": 4.9911963090479996e-05, + "loss": 5.6329, + "step": 4494 + }, + { + "epoch": 0.026733038348082595, + "grad_norm": 2.0346968173980713, + "learning_rate": 4.9911923920672984e-05, + "loss": 5.5966, + "step": 4495 + }, + { + "epoch": 0.026738985631363593, + "grad_norm": 2.013648271560669, + "learning_rate": 4.991188474216947e-05, + "loss": 5.6532, + "step": 4496 + }, + { + "epoch": 0.026744932914644592, + "grad_norm": 1.8361715078353882, + "learning_rate": 4.9911845554969484e-05, + "loss": 5.519, + "step": 4497 + }, + { + "epoch": 0.026750880197925587, + "grad_norm": 2.1487016677856445, + "learning_rate": 4.991180635907302e-05, + "loss": 5.436, + "step": 4498 + }, + { + "epoch": 0.026756827481206585, + "grad_norm": 2.277714967727661, + "learning_rate": 4.991176715448011e-05, + "loss": 5.3574, + "step": 4499 + }, + { + "epoch": 0.02676277476448758, + "grad_norm": 2.3313565254211426, + "learning_rate": 4.9911727941190755e-05, + "loss": 5.5408, + "step": 4500 + }, + { + "epoch": 0.02676872204776858, + "grad_norm": 2.105825662612915, + "learning_rate": 4.9911688719204975e-05, + "loss": 5.4801, + "step": 4501 + }, + { + "epoch": 0.026774669331049578, + "grad_norm": 2.122138261795044, + "learning_rate": 4.991164948852278e-05, + "loss": 5.4645, + "step": 4502 + }, + { + "epoch": 0.026780616614330573, + "grad_norm": 1.8742777109146118, + "learning_rate": 4.991161024914419e-05, + "loss": 5.5646, + "step": 4503 + }, + { + "epoch": 0.02678656389761157, + "grad_norm": 1.762276291847229, + "learning_rate": 4.991157100106921e-05, + "loss": 5.5672, + "step": 4504 + }, + { + "epoch": 0.02679251118089257, + "grad_norm": 1.9174740314483643, + "learning_rate": 4.9911531744297855e-05, + "loss": 5.4296, + "step": 4505 + }, + { + "epoch": 0.026798458464173565, + "grad_norm": 2.0585875511169434, + "learning_rate": 4.991149247883015e-05, + "loss": 5.5685, + "step": 4506 + }, + { + "epoch": 0.026804405747454563, + "grad_norm": 1.8675988912582397, + "learning_rate": 4.9911453204666094e-05, + "loss": 5.4757, + "step": 4507 + }, + { + "epoch": 0.02681035303073556, + "grad_norm": 2.3117783069610596, + "learning_rate": 4.99114139218057e-05, + "loss": 5.7057, + "step": 4508 + }, + { + "epoch": 0.026816300314016557, + "grad_norm": 2.5439465045928955, + "learning_rate": 4.9911374630249007e-05, + "loss": 5.7393, + "step": 4509 + }, + { + "epoch": 0.026822247597297556, + "grad_norm": 2.4611666202545166, + "learning_rate": 4.9911335329996e-05, + "loss": 5.7215, + "step": 4510 + }, + { + "epoch": 0.02682819488057855, + "grad_norm": 2.1540768146514893, + "learning_rate": 4.99112960210467e-05, + "loss": 5.7059, + "step": 4511 + }, + { + "epoch": 0.02683414216385955, + "grad_norm": 2.1183645725250244, + "learning_rate": 4.9911256703401134e-05, + "loss": 5.4454, + "step": 4512 + }, + { + "epoch": 0.026840089447140548, + "grad_norm": 2.1757540702819824, + "learning_rate": 4.9911217377059295e-05, + "loss": 5.6851, + "step": 4513 + }, + { + "epoch": 0.026846036730421543, + "grad_norm": 2.2770378589630127, + "learning_rate": 4.9911178042021214e-05, + "loss": 5.5957, + "step": 4514 + }, + { + "epoch": 0.02685198401370254, + "grad_norm": 2.320993185043335, + "learning_rate": 4.9911138698286895e-05, + "loss": 5.4674, + "step": 4515 + }, + { + "epoch": 0.026857931296983537, + "grad_norm": 2.2340428829193115, + "learning_rate": 4.991109934585636e-05, + "loss": 5.4514, + "step": 4516 + }, + { + "epoch": 0.026863878580264535, + "grad_norm": 2.1531431674957275, + "learning_rate": 4.991105998472962e-05, + "loss": 5.4386, + "step": 4517 + }, + { + "epoch": 0.026869825863545534, + "grad_norm": 2.1567044258117676, + "learning_rate": 4.991102061490667e-05, + "loss": 5.422, + "step": 4518 + }, + { + "epoch": 0.02687577314682653, + "grad_norm": 2.1181681156158447, + "learning_rate": 4.9910981236387554e-05, + "loss": 5.7214, + "step": 4519 + }, + { + "epoch": 0.026881720430107527, + "grad_norm": 2.3410873413085938, + "learning_rate": 4.9910941849172263e-05, + "loss": 5.8603, + "step": 4520 + }, + { + "epoch": 0.026887667713388526, + "grad_norm": 2.4943840503692627, + "learning_rate": 4.9910902453260824e-05, + "loss": 5.7084, + "step": 4521 + }, + { + "epoch": 0.02689361499666952, + "grad_norm": 2.1420044898986816, + "learning_rate": 4.991086304865325e-05, + "loss": 5.528, + "step": 4522 + }, + { + "epoch": 0.02689956227995052, + "grad_norm": 2.3257980346679688, + "learning_rate": 4.991082363534955e-05, + "loss": 5.6791, + "step": 4523 + }, + { + "epoch": 0.026905509563231515, + "grad_norm": 2.335049867630005, + "learning_rate": 4.991078421334974e-05, + "loss": 5.6184, + "step": 4524 + }, + { + "epoch": 0.026911456846512513, + "grad_norm": 3.7381551265716553, + "learning_rate": 4.9910744782653825e-05, + "loss": 5.954, + "step": 4525 + }, + { + "epoch": 0.02691740412979351, + "grad_norm": 3.1807587146759033, + "learning_rate": 4.991070534326183e-05, + "loss": 6.5662, + "step": 4526 + }, + { + "epoch": 0.026923351413074507, + "grad_norm": 2.378366708755493, + "learning_rate": 4.991066589517376e-05, + "loss": 6.2312, + "step": 4527 + }, + { + "epoch": 0.026929298696355505, + "grad_norm": 2.5797109603881836, + "learning_rate": 4.991062643838964e-05, + "loss": 5.9969, + "step": 4528 + }, + { + "epoch": 0.0269352459796365, + "grad_norm": 2.522815704345703, + "learning_rate": 4.991058697290948e-05, + "loss": 5.919, + "step": 4529 + }, + { + "epoch": 0.0269411932629175, + "grad_norm": 2.5215437412261963, + "learning_rate": 4.991054749873329e-05, + "loss": 5.8812, + "step": 4530 + }, + { + "epoch": 0.026947140546198498, + "grad_norm": 2.1608335971832275, + "learning_rate": 4.991050801586108e-05, + "loss": 5.8381, + "step": 4531 + }, + { + "epoch": 0.026953087829479493, + "grad_norm": 2.37752366065979, + "learning_rate": 4.991046852429288e-05, + "loss": 5.7612, + "step": 4532 + }, + { + "epoch": 0.02695903511276049, + "grad_norm": 2.117534875869751, + "learning_rate": 4.991042902402868e-05, + "loss": 5.6762, + "step": 4533 + }, + { + "epoch": 0.02696498239604149, + "grad_norm": 2.595797061920166, + "learning_rate": 4.991038951506851e-05, + "loss": 6.19, + "step": 4534 + }, + { + "epoch": 0.026970929679322485, + "grad_norm": 2.2216086387634277, + "learning_rate": 4.991034999741239e-05, + "loss": 6.1612, + "step": 4535 + }, + { + "epoch": 0.026976876962603483, + "grad_norm": 2.829735279083252, + "learning_rate": 4.991031047106032e-05, + "loss": 5.6955, + "step": 4536 + }, + { + "epoch": 0.02698282424588448, + "grad_norm": 2.5018115043640137, + "learning_rate": 4.991027093601231e-05, + "loss": 5.4966, + "step": 4537 + }, + { + "epoch": 0.026988771529165477, + "grad_norm": 2.334052085876465, + "learning_rate": 4.9910231392268385e-05, + "loss": 6.1603, + "step": 4538 + }, + { + "epoch": 0.026994718812446476, + "grad_norm": 2.497351884841919, + "learning_rate": 4.991019183982856e-05, + "loss": 6.0128, + "step": 4539 + }, + { + "epoch": 0.02700066609572747, + "grad_norm": 2.2976267337799072, + "learning_rate": 4.991015227869284e-05, + "loss": 5.6696, + "step": 4540 + }, + { + "epoch": 0.02700661337900847, + "grad_norm": 2.6851742267608643, + "learning_rate": 4.991011270886125e-05, + "loss": 5.7996, + "step": 4541 + }, + { + "epoch": 0.027012560662289468, + "grad_norm": 2.531029224395752, + "learning_rate": 4.991007313033379e-05, + "loss": 5.6671, + "step": 4542 + }, + { + "epoch": 0.027018507945570463, + "grad_norm": 2.195552110671997, + "learning_rate": 4.991003354311048e-05, + "loss": 6.3213, + "step": 4543 + }, + { + "epoch": 0.02702445522885146, + "grad_norm": 2.2973361015319824, + "learning_rate": 4.9909993947191336e-05, + "loss": 6.1523, + "step": 4544 + }, + { + "epoch": 0.027030402512132456, + "grad_norm": 2.4766385555267334, + "learning_rate": 4.990995434257637e-05, + "loss": 5.7894, + "step": 4545 + }, + { + "epoch": 0.027036349795413455, + "grad_norm": 2.486384630203247, + "learning_rate": 4.9909914729265606e-05, + "loss": 6.2814, + "step": 4546 + }, + { + "epoch": 0.027042297078694454, + "grad_norm": 2.5054233074188232, + "learning_rate": 4.9909875107259036e-05, + "loss": 6.2859, + "step": 4547 + }, + { + "epoch": 0.02704824436197545, + "grad_norm": 2.70576548576355, + "learning_rate": 4.990983547655669e-05, + "loss": 6.2424, + "step": 4548 + }, + { + "epoch": 0.027054191645256447, + "grad_norm": 3.0937716960906982, + "learning_rate": 4.990979583715858e-05, + "loss": 6.4392, + "step": 4549 + }, + { + "epoch": 0.027060138928537446, + "grad_norm": 2.6290581226348877, + "learning_rate": 4.9909756189064714e-05, + "loss": 6.3565, + "step": 4550 + }, + { + "epoch": 0.02706608621181844, + "grad_norm": 2.5180583000183105, + "learning_rate": 4.990971653227511e-05, + "loss": 6.1482, + "step": 4551 + }, + { + "epoch": 0.02707203349509944, + "grad_norm": 2.6096208095550537, + "learning_rate": 4.990967686678978e-05, + "loss": 5.7724, + "step": 4552 + }, + { + "epoch": 0.027077980778380435, + "grad_norm": 3.187276840209961, + "learning_rate": 4.990963719260874e-05, + "loss": 5.682, + "step": 4553 + }, + { + "epoch": 0.027083928061661433, + "grad_norm": 2.3522419929504395, + "learning_rate": 4.9909597509732006e-05, + "loss": 6.7045, + "step": 4554 + }, + { + "epoch": 0.02708987534494243, + "grad_norm": 2.6016366481781006, + "learning_rate": 4.990955781815959e-05, + "loss": 6.0653, + "step": 4555 + }, + { + "epoch": 0.027095822628223427, + "grad_norm": 2.5409183502197266, + "learning_rate": 4.99095181178915e-05, + "loss": 5.861, + "step": 4556 + }, + { + "epoch": 0.027101769911504425, + "grad_norm": 2.5297863483428955, + "learning_rate": 4.9909478408927754e-05, + "loss": 5.5301, + "step": 4557 + }, + { + "epoch": 0.02710771719478542, + "grad_norm": 2.4822275638580322, + "learning_rate": 4.990943869126837e-05, + "loss": 5.6919, + "step": 4558 + }, + { + "epoch": 0.02711366447806642, + "grad_norm": 2.3832650184631348, + "learning_rate": 4.9909398964913365e-05, + "loss": 5.9589, + "step": 4559 + }, + { + "epoch": 0.027119611761347417, + "grad_norm": 2.0038483142852783, + "learning_rate": 4.9909359229862734e-05, + "loss": 6.1847, + "step": 4560 + }, + { + "epoch": 0.027125559044628413, + "grad_norm": 2.3678700923919678, + "learning_rate": 4.990931948611651e-05, + "loss": 6.4794, + "step": 4561 + }, + { + "epoch": 0.02713150632790941, + "grad_norm": 2.7433204650878906, + "learning_rate": 4.990927973367469e-05, + "loss": 6.6997, + "step": 4562 + }, + { + "epoch": 0.02713745361119041, + "grad_norm": 3.5579798221588135, + "learning_rate": 4.990923997253731e-05, + "loss": 6.1809, + "step": 4563 + }, + { + "epoch": 0.027143400894471405, + "grad_norm": 3.254093647003174, + "learning_rate": 4.990920020270436e-05, + "loss": 6.1446, + "step": 4564 + }, + { + "epoch": 0.027149348177752403, + "grad_norm": 3.0661215782165527, + "learning_rate": 4.990916042417588e-05, + "loss": 6.6702, + "step": 4565 + }, + { + "epoch": 0.0271552954610334, + "grad_norm": 2.641291618347168, + "learning_rate": 4.9909120636951864e-05, + "loss": 6.4951, + "step": 4566 + }, + { + "epoch": 0.027161242744314397, + "grad_norm": 2.050675868988037, + "learning_rate": 4.990908084103233e-05, + "loss": 6.3365, + "step": 4567 + }, + { + "epoch": 0.027167190027595396, + "grad_norm": 2.081108331680298, + "learning_rate": 4.990904103641729e-05, + "loss": 6.1874, + "step": 4568 + }, + { + "epoch": 0.02717313731087639, + "grad_norm": 2.5833899974823, + "learning_rate": 4.9909001223106766e-05, + "loss": 6.0892, + "step": 4569 + }, + { + "epoch": 0.02717908459415739, + "grad_norm": 2.7387397289276123, + "learning_rate": 4.990896140110076e-05, + "loss": 6.1036, + "step": 4570 + }, + { + "epoch": 0.027185031877438388, + "grad_norm": 2.5665578842163086, + "learning_rate": 4.99089215703993e-05, + "loss": 5.9577, + "step": 4571 + }, + { + "epoch": 0.027190979160719383, + "grad_norm": 2.3825178146362305, + "learning_rate": 4.990888173100239e-05, + "loss": 5.9654, + "step": 4572 + }, + { + "epoch": 0.02719692644400038, + "grad_norm": 2.562509059906006, + "learning_rate": 4.990884188291005e-05, + "loss": 6.009, + "step": 4573 + }, + { + "epoch": 0.027202873727281376, + "grad_norm": 2.141941785812378, + "learning_rate": 4.9908802026122284e-05, + "loss": 5.8315, + "step": 4574 + }, + { + "epoch": 0.027208821010562375, + "grad_norm": 2.5348474979400635, + "learning_rate": 4.990876216063912e-05, + "loss": 6.3763, + "step": 4575 + }, + { + "epoch": 0.027214768293843374, + "grad_norm": 2.751520872116089, + "learning_rate": 4.990872228646056e-05, + "loss": 6.5684, + "step": 4576 + }, + { + "epoch": 0.02722071557712437, + "grad_norm": 4.626354694366455, + "learning_rate": 4.990868240358662e-05, + "loss": 6.115, + "step": 4577 + }, + { + "epoch": 0.027226662860405367, + "grad_norm": 2.648479700088501, + "learning_rate": 4.990864251201732e-05, + "loss": 6.0879, + "step": 4578 + }, + { + "epoch": 0.027232610143686366, + "grad_norm": 2.21056866645813, + "learning_rate": 4.990860261175268e-05, + "loss": 6.2923, + "step": 4579 + }, + { + "epoch": 0.02723855742696736, + "grad_norm": 2.3460421562194824, + "learning_rate": 4.9908562702792684e-05, + "loss": 6.4044, + "step": 4580 + }, + { + "epoch": 0.02724450471024836, + "grad_norm": 2.6087262630462646, + "learning_rate": 4.990852278513738e-05, + "loss": 6.5131, + "step": 4581 + }, + { + "epoch": 0.027250451993529354, + "grad_norm": 2.6969377994537354, + "learning_rate": 4.9908482858786765e-05, + "loss": 6.3483, + "step": 4582 + }, + { + "epoch": 0.027256399276810353, + "grad_norm": 2.64043927192688, + "learning_rate": 4.990844292374085e-05, + "loss": 5.8712, + "step": 4583 + }, + { + "epoch": 0.02726234656009135, + "grad_norm": 2.5738205909729004, + "learning_rate": 4.9908402979999654e-05, + "loss": 5.9165, + "step": 4584 + }, + { + "epoch": 0.027268293843372347, + "grad_norm": 2.2725625038146973, + "learning_rate": 4.99083630275632e-05, + "loss": 5.8454, + "step": 4585 + }, + { + "epoch": 0.027274241126653345, + "grad_norm": 2.5911824703216553, + "learning_rate": 4.9908323066431494e-05, + "loss": 5.6729, + "step": 4586 + }, + { + "epoch": 0.02728018840993434, + "grad_norm": 2.6691668033599854, + "learning_rate": 4.9908283096604546e-05, + "loss": 5.7726, + "step": 4587 + }, + { + "epoch": 0.02728613569321534, + "grad_norm": 2.6512796878814697, + "learning_rate": 4.990824311808238e-05, + "loss": 6.1295, + "step": 4588 + }, + { + "epoch": 0.027292082976496337, + "grad_norm": 2.816943645477295, + "learning_rate": 4.9908203130865e-05, + "loss": 5.5172, + "step": 4589 + }, + { + "epoch": 0.027298030259777332, + "grad_norm": 2.6252098083496094, + "learning_rate": 4.990816313495242e-05, + "loss": 5.5955, + "step": 4590 + }, + { + "epoch": 0.02730397754305833, + "grad_norm": 2.3711740970611572, + "learning_rate": 4.990812313034466e-05, + "loss": 5.3348, + "step": 4591 + }, + { + "epoch": 0.02730992482633933, + "grad_norm": 2.355436086654663, + "learning_rate": 4.990808311704173e-05, + "loss": 5.6171, + "step": 4592 + }, + { + "epoch": 0.027315872109620325, + "grad_norm": 2.3344695568084717, + "learning_rate": 4.990804309504365e-05, + "loss": 5.46, + "step": 4593 + }, + { + "epoch": 0.027321819392901323, + "grad_norm": 2.3890786170959473, + "learning_rate": 4.990800306435043e-05, + "loss": 5.5658, + "step": 4594 + }, + { + "epoch": 0.02732776667618232, + "grad_norm": 2.5606987476348877, + "learning_rate": 4.990796302496208e-05, + "loss": 5.4778, + "step": 4595 + }, + { + "epoch": 0.027333713959463317, + "grad_norm": 2.2443172931671143, + "learning_rate": 4.9907922976878616e-05, + "loss": 5.486, + "step": 4596 + }, + { + "epoch": 0.027339661242744315, + "grad_norm": 2.3428351879119873, + "learning_rate": 4.990788292010005e-05, + "loss": 5.3332, + "step": 4597 + }, + { + "epoch": 0.02734560852602531, + "grad_norm": 2.6336300373077393, + "learning_rate": 4.9907842854626406e-05, + "loss": 5.4606, + "step": 4598 + }, + { + "epoch": 0.02735155580930631, + "grad_norm": 2.3052382469177246, + "learning_rate": 4.990780278045769e-05, + "loss": 5.4028, + "step": 4599 + }, + { + "epoch": 0.027357503092587308, + "grad_norm": 2.4661340713500977, + "learning_rate": 4.990776269759392e-05, + "loss": 5.6011, + "step": 4600 + }, + { + "epoch": 0.027363450375868303, + "grad_norm": 2.400527238845825, + "learning_rate": 4.99077226060351e-05, + "loss": 5.5952, + "step": 4601 + }, + { + "epoch": 0.0273693976591493, + "grad_norm": 2.364900827407837, + "learning_rate": 4.9907682505781256e-05, + "loss": 5.2125, + "step": 4602 + }, + { + "epoch": 0.027375344942430296, + "grad_norm": 2.383680820465088, + "learning_rate": 4.99076423968324e-05, + "loss": 5.4253, + "step": 4603 + }, + { + "epoch": 0.027381292225711295, + "grad_norm": 2.681903839111328, + "learning_rate": 4.990760227918854e-05, + "loss": 5.3741, + "step": 4604 + }, + { + "epoch": 0.027387239508992293, + "grad_norm": 2.3454341888427734, + "learning_rate": 4.990756215284969e-05, + "loss": 5.3032, + "step": 4605 + }, + { + "epoch": 0.02739318679227329, + "grad_norm": 2.439807653427124, + "learning_rate": 4.990752201781587e-05, + "loss": 5.3368, + "step": 4606 + }, + { + "epoch": 0.027399134075554287, + "grad_norm": 2.938976764678955, + "learning_rate": 4.990748187408709e-05, + "loss": 6.1251, + "step": 4607 + }, + { + "epoch": 0.027405081358835286, + "grad_norm": 3.353973865509033, + "learning_rate": 4.990744172166337e-05, + "loss": 6.72, + "step": 4608 + }, + { + "epoch": 0.02741102864211628, + "grad_norm": 2.4661834239959717, + "learning_rate": 4.990740156054472e-05, + "loss": 5.7156, + "step": 4609 + }, + { + "epoch": 0.02741697592539728, + "grad_norm": 2.303976058959961, + "learning_rate": 4.990736139073116e-05, + "loss": 5.3493, + "step": 4610 + }, + { + "epoch": 0.027422923208678274, + "grad_norm": 2.4225149154663086, + "learning_rate": 4.990732121222268e-05, + "loss": 5.4831, + "step": 4611 + }, + { + "epoch": 0.027428870491959273, + "grad_norm": 2.5566627979278564, + "learning_rate": 4.990728102501932e-05, + "loss": 5.9159, + "step": 4612 + }, + { + "epoch": 0.02743481777524027, + "grad_norm": 2.64258074760437, + "learning_rate": 4.9907240829121085e-05, + "loss": 6.7137, + "step": 4613 + }, + { + "epoch": 0.027440765058521267, + "grad_norm": 2.967501640319824, + "learning_rate": 4.9907200624527986e-05, + "loss": 6.3333, + "step": 4614 + }, + { + "epoch": 0.027446712341802265, + "grad_norm": 2.6084952354431152, + "learning_rate": 4.990716041124005e-05, + "loss": 6.1201, + "step": 4615 + }, + { + "epoch": 0.02745265962508326, + "grad_norm": 3.0721616744995117, + "learning_rate": 4.990712018925727e-05, + "loss": 6.396, + "step": 4616 + }, + { + "epoch": 0.02745860690836426, + "grad_norm": 2.888263463973999, + "learning_rate": 4.990707995857968e-05, + "loss": 6.0773, + "step": 4617 + }, + { + "epoch": 0.027464554191645257, + "grad_norm": 2.7506093978881836, + "learning_rate": 4.990703971920728e-05, + "loss": 5.9909, + "step": 4618 + }, + { + "epoch": 0.027470501474926252, + "grad_norm": 2.8273298740386963, + "learning_rate": 4.99069994711401e-05, + "loss": 5.9591, + "step": 4619 + }, + { + "epoch": 0.02747644875820725, + "grad_norm": 2.451011896133423, + "learning_rate": 4.990695921437813e-05, + "loss": 6.1596, + "step": 4620 + }, + { + "epoch": 0.02748239604148825, + "grad_norm": 2.762265920639038, + "learning_rate": 4.990691894892141e-05, + "loss": 6.6233, + "step": 4621 + }, + { + "epoch": 0.027488343324769245, + "grad_norm": 2.4570846557617188, + "learning_rate": 4.990687867476994e-05, + "loss": 6.5025, + "step": 4622 + }, + { + "epoch": 0.027494290608050243, + "grad_norm": 3.108992576599121, + "learning_rate": 4.990683839192373e-05, + "loss": 5.921, + "step": 4623 + }, + { + "epoch": 0.02750023789133124, + "grad_norm": 2.887580156326294, + "learning_rate": 4.99067981003828e-05, + "loss": 5.9266, + "step": 4624 + }, + { + "epoch": 0.027506185174612237, + "grad_norm": 3.083556890487671, + "learning_rate": 4.990675780014718e-05, + "loss": 5.765, + "step": 4625 + }, + { + "epoch": 0.027512132457893235, + "grad_norm": 2.710231304168701, + "learning_rate": 4.990671749121685e-05, + "loss": 5.7674, + "step": 4626 + }, + { + "epoch": 0.02751807974117423, + "grad_norm": 2.738926410675049, + "learning_rate": 4.9906677173591845e-05, + "loss": 5.801, + "step": 4627 + }, + { + "epoch": 0.02752402702445523, + "grad_norm": 2.6737735271453857, + "learning_rate": 4.9906636847272176e-05, + "loss": 6.2581, + "step": 4628 + }, + { + "epoch": 0.027529974307736228, + "grad_norm": 2.623969554901123, + "learning_rate": 4.990659651225786e-05, + "loss": 5.5044, + "step": 4629 + }, + { + "epoch": 0.027535921591017223, + "grad_norm": 3.069460153579712, + "learning_rate": 4.990655616854891e-05, + "loss": 5.9639, + "step": 4630 + }, + { + "epoch": 0.02754186887429822, + "grad_norm": 2.6889147758483887, + "learning_rate": 4.990651581614534e-05, + "loss": 6.3032, + "step": 4631 + }, + { + "epoch": 0.027547816157579216, + "grad_norm": 3.5284838676452637, + "learning_rate": 4.990647545504716e-05, + "loss": 6.4104, + "step": 4632 + }, + { + "epoch": 0.027553763440860215, + "grad_norm": 2.326162338256836, + "learning_rate": 4.9906435085254384e-05, + "loss": 6.2593, + "step": 4633 + }, + { + "epoch": 0.027559710724141213, + "grad_norm": 1.946542739868164, + "learning_rate": 4.990639470676703e-05, + "loss": 6.1522, + "step": 4634 + }, + { + "epoch": 0.02756565800742221, + "grad_norm": 2.26143741607666, + "learning_rate": 4.990635431958511e-05, + "loss": 6.0189, + "step": 4635 + }, + { + "epoch": 0.027571605290703207, + "grad_norm": 2.8332626819610596, + "learning_rate": 4.990631392370865e-05, + "loss": 5.6226, + "step": 4636 + }, + { + "epoch": 0.027577552573984206, + "grad_norm": 3.919443130493164, + "learning_rate": 4.9906273519137636e-05, + "loss": 6.2147, + "step": 4637 + }, + { + "epoch": 0.0275834998572652, + "grad_norm": 2.4030275344848633, + "learning_rate": 4.9906233105872115e-05, + "loss": 5.6589, + "step": 4638 + }, + { + "epoch": 0.0275894471405462, + "grad_norm": 2.7806994915008545, + "learning_rate": 4.990619268391207e-05, + "loss": 5.4349, + "step": 4639 + }, + { + "epoch": 0.027595394423827194, + "grad_norm": 2.5759501457214355, + "learning_rate": 4.990615225325754e-05, + "loss": 6.1171, + "step": 4640 + }, + { + "epoch": 0.027601341707108193, + "grad_norm": 2.337517023086548, + "learning_rate": 4.990611181390853e-05, + "loss": 5.5514, + "step": 4641 + }, + { + "epoch": 0.02760728899038919, + "grad_norm": 2.6464250087738037, + "learning_rate": 4.990607136586505e-05, + "loss": 6.1852, + "step": 4642 + }, + { + "epoch": 0.027613236273670187, + "grad_norm": 2.030210256576538, + "learning_rate": 4.9906030909127125e-05, + "loss": 6.0919, + "step": 4643 + }, + { + "epoch": 0.027619183556951185, + "grad_norm": 2.4546520709991455, + "learning_rate": 4.990599044369475e-05, + "loss": 6.3018, + "step": 4644 + }, + { + "epoch": 0.027625130840232184, + "grad_norm": 2.508500337600708, + "learning_rate": 4.990594996956796e-05, + "loss": 5.7933, + "step": 4645 + }, + { + "epoch": 0.02763107812351318, + "grad_norm": 2.3363263607025146, + "learning_rate": 4.990590948674676e-05, + "loss": 6.4252, + "step": 4646 + }, + { + "epoch": 0.027637025406794177, + "grad_norm": 2.794673442840576, + "learning_rate": 4.990586899523116e-05, + "loss": 5.3554, + "step": 4647 + }, + { + "epoch": 0.027642972690075172, + "grad_norm": 2.5396835803985596, + "learning_rate": 4.990582849502118e-05, + "loss": 5.2352, + "step": 4648 + }, + { + "epoch": 0.02764891997335617, + "grad_norm": 2.6878976821899414, + "learning_rate": 4.990578798611684e-05, + "loss": 4.9262, + "step": 4649 + }, + { + "epoch": 0.02765486725663717, + "grad_norm": 2.2143187522888184, + "learning_rate": 4.9905747468518136e-05, + "loss": 6.0785, + "step": 4650 + }, + { + "epoch": 0.027660814539918165, + "grad_norm": 2.6812448501586914, + "learning_rate": 4.9905706942225094e-05, + "loss": 5.1692, + "step": 4651 + }, + { + "epoch": 0.027666761823199163, + "grad_norm": 2.5155227184295654, + "learning_rate": 4.9905666407237726e-05, + "loss": 5.0194, + "step": 4652 + }, + { + "epoch": 0.027672709106480158, + "grad_norm": 2.406834363937378, + "learning_rate": 4.9905625863556047e-05, + "loss": 5.1249, + "step": 4653 + }, + { + "epoch": 0.027678656389761157, + "grad_norm": 3.3666698932647705, + "learning_rate": 4.990558531118008e-05, + "loss": 5.9619, + "step": 4654 + }, + { + "epoch": 0.027684603673042155, + "grad_norm": 2.6557607650756836, + "learning_rate": 4.9905544750109826e-05, + "loss": 5.9118, + "step": 4655 + }, + { + "epoch": 0.02769055095632315, + "grad_norm": 2.60469651222229, + "learning_rate": 4.9905504180345304e-05, + "loss": 6.3746, + "step": 4656 + }, + { + "epoch": 0.02769649823960415, + "grad_norm": 2.5417349338531494, + "learning_rate": 4.9905463601886526e-05, + "loss": 5.6975, + "step": 4657 + }, + { + "epoch": 0.027702445522885148, + "grad_norm": 2.723829984664917, + "learning_rate": 4.990542301473351e-05, + "loss": 5.6189, + "step": 4658 + }, + { + "epoch": 0.027708392806166143, + "grad_norm": 3.0544204711914062, + "learning_rate": 4.990538241888627e-05, + "loss": 5.4999, + "step": 4659 + }, + { + "epoch": 0.02771434008944714, + "grad_norm": 3.0536513328552246, + "learning_rate": 4.990534181434481e-05, + "loss": 6.0636, + "step": 4660 + }, + { + "epoch": 0.027720287372728136, + "grad_norm": 3.0618786811828613, + "learning_rate": 4.990530120110916e-05, + "loss": 6.0856, + "step": 4661 + }, + { + "epoch": 0.027726234656009135, + "grad_norm": 2.6602306365966797, + "learning_rate": 4.9905260579179325e-05, + "loss": 5.8341, + "step": 4662 + }, + { + "epoch": 0.027732181939290133, + "grad_norm": 2.729137420654297, + "learning_rate": 4.990521994855532e-05, + "loss": 6.7052, + "step": 4663 + }, + { + "epoch": 0.02773812922257113, + "grad_norm": 3.0878489017486572, + "learning_rate": 4.990517930923716e-05, + "loss": 6.1308, + "step": 4664 + }, + { + "epoch": 0.027744076505852127, + "grad_norm": 2.524418354034424, + "learning_rate": 4.990513866122486e-05, + "loss": 6.2547, + "step": 4665 + }, + { + "epoch": 0.027750023789133126, + "grad_norm": 2.457075595855713, + "learning_rate": 4.990509800451844e-05, + "loss": 6.6615, + "step": 4666 + }, + { + "epoch": 0.02775597107241412, + "grad_norm": 2.474487543106079, + "learning_rate": 4.9905057339117894e-05, + "loss": 6.63, + "step": 4667 + }, + { + "epoch": 0.02776191835569512, + "grad_norm": 2.611098289489746, + "learning_rate": 4.9905016665023254e-05, + "loss": 5.8232, + "step": 4668 + }, + { + "epoch": 0.027767865638976114, + "grad_norm": 2.8012242317199707, + "learning_rate": 4.990497598223454e-05, + "loss": 5.8478, + "step": 4669 + }, + { + "epoch": 0.027773812922257113, + "grad_norm": 2.706725597381592, + "learning_rate": 4.990493529075174e-05, + "loss": 5.8585, + "step": 4670 + }, + { + "epoch": 0.02777976020553811, + "grad_norm": 2.490032196044922, + "learning_rate": 4.99048945905749e-05, + "loss": 6.2181, + "step": 4671 + }, + { + "epoch": 0.027785707488819106, + "grad_norm": 2.4735357761383057, + "learning_rate": 4.990485388170401e-05, + "loss": 6.2153, + "step": 4672 + }, + { + "epoch": 0.027791654772100105, + "grad_norm": 2.7573068141937256, + "learning_rate": 4.9904813164139094e-05, + "loss": 6.217, + "step": 4673 + }, + { + "epoch": 0.027797602055381104, + "grad_norm": 2.4663283824920654, + "learning_rate": 4.990477243788017e-05, + "loss": 6.4153, + "step": 4674 + }, + { + "epoch": 0.0278035493386621, + "grad_norm": 2.737656831741333, + "learning_rate": 4.9904731702927234e-05, + "loss": 6.5209, + "step": 4675 + }, + { + "epoch": 0.027809496621943097, + "grad_norm": 2.5112721920013428, + "learning_rate": 4.990469095928032e-05, + "loss": 5.979, + "step": 4676 + }, + { + "epoch": 0.027815443905224092, + "grad_norm": 2.6602795124053955, + "learning_rate": 4.990465020693944e-05, + "loss": 5.9206, + "step": 4677 + }, + { + "epoch": 0.02782139118850509, + "grad_norm": 2.460538625717163, + "learning_rate": 4.9904609445904606e-05, + "loss": 5.9855, + "step": 4678 + }, + { + "epoch": 0.02782733847178609, + "grad_norm": 2.750138998031616, + "learning_rate": 4.990456867617582e-05, + "loss": 5.8425, + "step": 4679 + }, + { + "epoch": 0.027833285755067085, + "grad_norm": 2.9843833446502686, + "learning_rate": 4.9904527897753114e-05, + "loss": 6.1385, + "step": 4680 + }, + { + "epoch": 0.027839233038348083, + "grad_norm": 2.586923360824585, + "learning_rate": 4.99044871106365e-05, + "loss": 5.6278, + "step": 4681 + }, + { + "epoch": 0.027845180321629078, + "grad_norm": 3.114211082458496, + "learning_rate": 4.990444631482597e-05, + "loss": 6.1259, + "step": 4682 + }, + { + "epoch": 0.027851127604910077, + "grad_norm": 2.3222453594207764, + "learning_rate": 4.990440551032157e-05, + "loss": 6.3048, + "step": 4683 + }, + { + "epoch": 0.027857074888191075, + "grad_norm": 2.15678334236145, + "learning_rate": 4.99043646971233e-05, + "loss": 5.9082, + "step": 4684 + }, + { + "epoch": 0.02786302217147207, + "grad_norm": 3.946350574493408, + "learning_rate": 4.990432387523116e-05, + "loss": 5.6907, + "step": 4685 + }, + { + "epoch": 0.02786896945475307, + "grad_norm": 2.9612419605255127, + "learning_rate": 4.9904283044645185e-05, + "loss": 5.3894, + "step": 4686 + }, + { + "epoch": 0.027874916738034067, + "grad_norm": 2.3602261543273926, + "learning_rate": 4.990424220536538e-05, + "loss": 6.0716, + "step": 4687 + }, + { + "epoch": 0.027880864021315063, + "grad_norm": 2.822300672531128, + "learning_rate": 4.990420135739177e-05, + "loss": 5.9788, + "step": 4688 + }, + { + "epoch": 0.02788681130459606, + "grad_norm": 2.766280174255371, + "learning_rate": 4.990416050072435e-05, + "loss": 5.9945, + "step": 4689 + }, + { + "epoch": 0.027892758587877056, + "grad_norm": 2.810359239578247, + "learning_rate": 4.990411963536315e-05, + "loss": 6.0598, + "step": 4690 + }, + { + "epoch": 0.027898705871158055, + "grad_norm": 2.510014295578003, + "learning_rate": 4.990407876130818e-05, + "loss": 6.1793, + "step": 4691 + }, + { + "epoch": 0.027904653154439053, + "grad_norm": 2.5394086837768555, + "learning_rate": 4.990403787855945e-05, + "loss": 6.1309, + "step": 4692 + }, + { + "epoch": 0.02791060043772005, + "grad_norm": 2.922084093093872, + "learning_rate": 4.990399698711698e-05, + "loss": 6.1956, + "step": 4693 + }, + { + "epoch": 0.027916547721001047, + "grad_norm": 3.6614181995391846, + "learning_rate": 4.9903956086980785e-05, + "loss": 6.535, + "step": 4694 + }, + { + "epoch": 0.027922495004282046, + "grad_norm": 3.3680684566497803, + "learning_rate": 4.990391517815087e-05, + "loss": 6.5729, + "step": 4695 + }, + { + "epoch": 0.02792844228756304, + "grad_norm": 2.522193431854248, + "learning_rate": 4.990387426062726e-05, + "loss": 5.9406, + "step": 4696 + }, + { + "epoch": 0.02793438957084404, + "grad_norm": 2.9665534496307373, + "learning_rate": 4.990383333440996e-05, + "loss": 6.0281, + "step": 4697 + }, + { + "epoch": 0.027940336854125034, + "grad_norm": 2.643218755722046, + "learning_rate": 4.9903792399498996e-05, + "loss": 5.8965, + "step": 4698 + }, + { + "epoch": 0.027946284137406033, + "grad_norm": 2.498765230178833, + "learning_rate": 4.990375145589436e-05, + "loss": 6.0975, + "step": 4699 + }, + { + "epoch": 0.02795223142068703, + "grad_norm": 4.380255699157715, + "learning_rate": 4.99037105035961e-05, + "loss": 6.6298, + "step": 4700 + }, + { + "epoch": 0.027958178703968026, + "grad_norm": 3.925454616546631, + "learning_rate": 4.990366954260421e-05, + "loss": 6.5742, + "step": 4701 + }, + { + "epoch": 0.027964125987249025, + "grad_norm": 2.5388591289520264, + "learning_rate": 4.99036285729187e-05, + "loss": 6.6102, + "step": 4702 + }, + { + "epoch": 0.027970073270530024, + "grad_norm": 2.6793510913848877, + "learning_rate": 4.9903587594539594e-05, + "loss": 6.4265, + "step": 4703 + }, + { + "epoch": 0.02797602055381102, + "grad_norm": 2.8652729988098145, + "learning_rate": 4.9903546607466903e-05, + "loss": 6.4567, + "step": 4704 + }, + { + "epoch": 0.027981967837092017, + "grad_norm": 2.936021089553833, + "learning_rate": 4.990350561170063e-05, + "loss": 6.404, + "step": 4705 + }, + { + "epoch": 0.027987915120373012, + "grad_norm": 3.256253719329834, + "learning_rate": 4.9903464607240816e-05, + "loss": 6.2291, + "step": 4706 + }, + { + "epoch": 0.02799386240365401, + "grad_norm": 2.8268187046051025, + "learning_rate": 4.990342359408745e-05, + "loss": 6.2582, + "step": 4707 + }, + { + "epoch": 0.02799980968693501, + "grad_norm": 2.5889041423797607, + "learning_rate": 4.9903382572240556e-05, + "loss": 6.3325, + "step": 4708 + }, + { + "epoch": 0.028005756970216004, + "grad_norm": 2.635388135910034, + "learning_rate": 4.9903341541700154e-05, + "loss": 6.1256, + "step": 4709 + }, + { + "epoch": 0.028011704253497003, + "grad_norm": 2.562976360321045, + "learning_rate": 4.990330050246625e-05, + "loss": 5.9333, + "step": 4710 + }, + { + "epoch": 0.028017651536777998, + "grad_norm": 3.488809585571289, + "learning_rate": 4.990325945453887e-05, + "loss": 6.3651, + "step": 4711 + }, + { + "epoch": 0.028023598820058997, + "grad_norm": 2.963324546813965, + "learning_rate": 4.9903218397918e-05, + "loss": 6.718, + "step": 4712 + }, + { + "epoch": 0.028029546103339995, + "grad_norm": 2.4070823192596436, + "learning_rate": 4.990317733260369e-05, + "loss": 6.2502, + "step": 4713 + }, + { + "epoch": 0.02803549338662099, + "grad_norm": 2.711190938949585, + "learning_rate": 4.9903136258595925e-05, + "loss": 6.0397, + "step": 4714 + }, + { + "epoch": 0.02804144066990199, + "grad_norm": 2.466150999069214, + "learning_rate": 4.9903095175894746e-05, + "loss": 5.9344, + "step": 4715 + }, + { + "epoch": 0.028047387953182987, + "grad_norm": 2.4558048248291016, + "learning_rate": 4.990305408450014e-05, + "loss": 6.1121, + "step": 4716 + }, + { + "epoch": 0.028053335236463982, + "grad_norm": 2.4023051261901855, + "learning_rate": 4.990301298441215e-05, + "loss": 6.0202, + "step": 4717 + }, + { + "epoch": 0.02805928251974498, + "grad_norm": 3.118098258972168, + "learning_rate": 4.9902971875630765e-05, + "loss": 6.5365, + "step": 4718 + }, + { + "epoch": 0.028065229803025976, + "grad_norm": 2.3716087341308594, + "learning_rate": 4.990293075815602e-05, + "loss": 6.1382, + "step": 4719 + }, + { + "epoch": 0.028071177086306975, + "grad_norm": 2.4663496017456055, + "learning_rate": 4.990288963198791e-05, + "loss": 5.9804, + "step": 4720 + }, + { + "epoch": 0.028077124369587973, + "grad_norm": 2.2623326778411865, + "learning_rate": 4.9902848497126466e-05, + "loss": 5.9666, + "step": 4721 + }, + { + "epoch": 0.02808307165286897, + "grad_norm": 2.4884161949157715, + "learning_rate": 4.990280735357168e-05, + "loss": 6.0203, + "step": 4722 + }, + { + "epoch": 0.028089018936149967, + "grad_norm": 2.6154520511627197, + "learning_rate": 4.990276620132359e-05, + "loss": 5.9191, + "step": 4723 + }, + { + "epoch": 0.028094966219430965, + "grad_norm": 2.692396879196167, + "learning_rate": 4.990272504038221e-05, + "loss": 6.5314, + "step": 4724 + }, + { + "epoch": 0.02810091350271196, + "grad_norm": 2.483306407928467, + "learning_rate": 4.990268387074754e-05, + "loss": 6.6522, + "step": 4725 + }, + { + "epoch": 0.02810686078599296, + "grad_norm": 3.2098593711853027, + "learning_rate": 4.99026426924196e-05, + "loss": 5.8712, + "step": 4726 + }, + { + "epoch": 0.028112808069273954, + "grad_norm": 2.7335867881774902, + "learning_rate": 4.99026015053984e-05, + "loss": 5.7678, + "step": 4727 + }, + { + "epoch": 0.028118755352554953, + "grad_norm": 2.7587473392486572, + "learning_rate": 4.990256030968396e-05, + "loss": 6.4233, + "step": 4728 + }, + { + "epoch": 0.02812470263583595, + "grad_norm": 2.7686030864715576, + "learning_rate": 4.99025191052763e-05, + "loss": 6.4572, + "step": 4729 + }, + { + "epoch": 0.028130649919116946, + "grad_norm": 2.755916118621826, + "learning_rate": 4.990247789217543e-05, + "loss": 5.9858, + "step": 4730 + }, + { + "epoch": 0.028136597202397945, + "grad_norm": 2.614316463470459, + "learning_rate": 4.990243667038135e-05, + "loss": 6.2315, + "step": 4731 + }, + { + "epoch": 0.028142544485678943, + "grad_norm": 2.0796027183532715, + "learning_rate": 4.990239543989409e-05, + "loss": 6.236, + "step": 4732 + }, + { + "epoch": 0.02814849176895994, + "grad_norm": 2.623412847518921, + "learning_rate": 4.9902354200713665e-05, + "loss": 6.3962, + "step": 4733 + }, + { + "epoch": 0.028154439052240937, + "grad_norm": 2.2746191024780273, + "learning_rate": 4.9902312952840086e-05, + "loss": 5.9101, + "step": 4734 + }, + { + "epoch": 0.028160386335521932, + "grad_norm": 2.102444887161255, + "learning_rate": 4.990227169627336e-05, + "loss": 6.4652, + "step": 4735 + }, + { + "epoch": 0.02816633361880293, + "grad_norm": 2.7720580101013184, + "learning_rate": 4.990223043101352e-05, + "loss": 5.8981, + "step": 4736 + }, + { + "epoch": 0.02817228090208393, + "grad_norm": 2.4479453563690186, + "learning_rate": 4.9902189157060564e-05, + "loss": 6.3554, + "step": 4737 + }, + { + "epoch": 0.028178228185364924, + "grad_norm": 2.7894740104675293, + "learning_rate": 4.990214787441451e-05, + "loss": 6.0017, + "step": 4738 + }, + { + "epoch": 0.028184175468645923, + "grad_norm": 2.869884490966797, + "learning_rate": 4.990210658307537e-05, + "loss": 5.9419, + "step": 4739 + }, + { + "epoch": 0.028190122751926918, + "grad_norm": 2.262723207473755, + "learning_rate": 4.990206528304316e-05, + "loss": 6.172, + "step": 4740 + }, + { + "epoch": 0.028196070035207917, + "grad_norm": 2.179358720779419, + "learning_rate": 4.99020239743179e-05, + "loss": 6.5204, + "step": 4741 + }, + { + "epoch": 0.028202017318488915, + "grad_norm": 2.085179328918457, + "learning_rate": 4.9901982656899606e-05, + "loss": 6.3972, + "step": 4742 + }, + { + "epoch": 0.02820796460176991, + "grad_norm": 1.657567024230957, + "learning_rate": 4.990194133078828e-05, + "loss": 6.4199, + "step": 4743 + }, + { + "epoch": 0.02821391188505091, + "grad_norm": 1.8054349422454834, + "learning_rate": 4.990189999598395e-05, + "loss": 6.3768, + "step": 4744 + }, + { + "epoch": 0.028219859168331907, + "grad_norm": 2.0365710258483887, + "learning_rate": 4.990185865248662e-05, + "loss": 6.3228, + "step": 4745 + }, + { + "epoch": 0.028225806451612902, + "grad_norm": 2.069211006164551, + "learning_rate": 4.9901817300296304e-05, + "loss": 5.9874, + "step": 4746 + }, + { + "epoch": 0.0282317537348939, + "grad_norm": 2.3339149951934814, + "learning_rate": 4.9901775939413026e-05, + "loss": 6.1526, + "step": 4747 + }, + { + "epoch": 0.028237701018174896, + "grad_norm": 2.0425326824188232, + "learning_rate": 4.99017345698368e-05, + "loss": 6.2157, + "step": 4748 + }, + { + "epoch": 0.028243648301455895, + "grad_norm": 2.1598799228668213, + "learning_rate": 4.9901693191567625e-05, + "loss": 6.2653, + "step": 4749 + }, + { + "epoch": 0.028249595584736893, + "grad_norm": 2.066566228866577, + "learning_rate": 4.990165180460553e-05, + "loss": 6.3788, + "step": 4750 + }, + { + "epoch": 0.02825554286801789, + "grad_norm": 2.2870383262634277, + "learning_rate": 4.9901610408950527e-05, + "loss": 6.2608, + "step": 4751 + }, + { + "epoch": 0.028261490151298887, + "grad_norm": 2.3180785179138184, + "learning_rate": 4.990156900460263e-05, + "loss": 6.3545, + "step": 4752 + }, + { + "epoch": 0.028267437434579885, + "grad_norm": 2.55261492729187, + "learning_rate": 4.990152759156185e-05, + "loss": 6.3888, + "step": 4753 + }, + { + "epoch": 0.02827338471786088, + "grad_norm": 2.087925910949707, + "learning_rate": 4.990148616982821e-05, + "loss": 6.3585, + "step": 4754 + }, + { + "epoch": 0.02827933200114188, + "grad_norm": 2.2446579933166504, + "learning_rate": 4.9901444739401714e-05, + "loss": 6.4655, + "step": 4755 + }, + { + "epoch": 0.028285279284422874, + "grad_norm": 2.2980077266693115, + "learning_rate": 4.990140330028238e-05, + "loss": 6.3776, + "step": 4756 + }, + { + "epoch": 0.028291226567703873, + "grad_norm": 2.0658226013183594, + "learning_rate": 4.9901361852470224e-05, + "loss": 6.0412, + "step": 4757 + }, + { + "epoch": 0.02829717385098487, + "grad_norm": 2.8402137756347656, + "learning_rate": 4.990132039596526e-05, + "loss": 6.0017, + "step": 4758 + }, + { + "epoch": 0.028303121134265866, + "grad_norm": 2.4620237350463867, + "learning_rate": 4.99012789307675e-05, + "loss": 5.9235, + "step": 4759 + }, + { + "epoch": 0.028309068417546865, + "grad_norm": 2.3318607807159424, + "learning_rate": 4.990123745687697e-05, + "loss": 6.2464, + "step": 4760 + }, + { + "epoch": 0.028315015700827863, + "grad_norm": 2.4998981952667236, + "learning_rate": 4.9901195974293666e-05, + "loss": 6.2731, + "step": 4761 + }, + { + "epoch": 0.02832096298410886, + "grad_norm": 2.4374287128448486, + "learning_rate": 4.9901154483017614e-05, + "loss": 6.362, + "step": 4762 + }, + { + "epoch": 0.028326910267389857, + "grad_norm": 2.6257424354553223, + "learning_rate": 4.990111298304882e-05, + "loss": 6.1456, + "step": 4763 + }, + { + "epoch": 0.028332857550670852, + "grad_norm": 2.74934458732605, + "learning_rate": 4.990107147438732e-05, + "loss": 6.0121, + "step": 4764 + }, + { + "epoch": 0.02833880483395185, + "grad_norm": 2.33137583732605, + "learning_rate": 4.9901029957033106e-05, + "loss": 6.0207, + "step": 4765 + }, + { + "epoch": 0.02834475211723285, + "grad_norm": 1.9006321430206299, + "learning_rate": 4.9900988430986196e-05, + "loss": 5.8946, + "step": 4766 + }, + { + "epoch": 0.028350699400513844, + "grad_norm": 1.9786534309387207, + "learning_rate": 4.990094689624661e-05, + "loss": 5.7782, + "step": 4767 + }, + { + "epoch": 0.028356646683794843, + "grad_norm": 2.1215951442718506, + "learning_rate": 4.9900905352814365e-05, + "loss": 5.8129, + "step": 4768 + }, + { + "epoch": 0.02836259396707584, + "grad_norm": 2.9569597244262695, + "learning_rate": 4.9900863800689465e-05, + "loss": 5.7882, + "step": 4769 + }, + { + "epoch": 0.028368541250356837, + "grad_norm": 2.720447540283203, + "learning_rate": 4.990082223987193e-05, + "loss": 5.9075, + "step": 4770 + }, + { + "epoch": 0.028374488533637835, + "grad_norm": 2.8727002143859863, + "learning_rate": 4.990078067036178e-05, + "loss": 6.1571, + "step": 4771 + }, + { + "epoch": 0.02838043581691883, + "grad_norm": 2.2992594242095947, + "learning_rate": 4.990073909215902e-05, + "loss": 6.0195, + "step": 4772 + }, + { + "epoch": 0.02838638310019983, + "grad_norm": 2.0323293209075928, + "learning_rate": 4.990069750526368e-05, + "loss": 5.8049, + "step": 4773 + }, + { + "epoch": 0.028392330383480827, + "grad_norm": 2.938795328140259, + "learning_rate": 4.9900655909675755e-05, + "loss": 6.9215, + "step": 4774 + }, + { + "epoch": 0.028398277666761822, + "grad_norm": 2.6333048343658447, + "learning_rate": 4.990061430539527e-05, + "loss": 5.868, + "step": 4775 + }, + { + "epoch": 0.02840422495004282, + "grad_norm": 2.8569674491882324, + "learning_rate": 4.990057269242223e-05, + "loss": 5.8782, + "step": 4776 + }, + { + "epoch": 0.028410172233323816, + "grad_norm": 2.62206768989563, + "learning_rate": 4.9900531070756666e-05, + "loss": 5.7751, + "step": 4777 + }, + { + "epoch": 0.028416119516604815, + "grad_norm": 2.2112414836883545, + "learning_rate": 4.990048944039858e-05, + "loss": 5.7985, + "step": 4778 + }, + { + "epoch": 0.028422066799885813, + "grad_norm": 2.1571342945098877, + "learning_rate": 4.990044780134799e-05, + "loss": 5.9089, + "step": 4779 + }, + { + "epoch": 0.028428014083166808, + "grad_norm": 2.4310410022735596, + "learning_rate": 4.9900406153604916e-05, + "loss": 5.6728, + "step": 4780 + }, + { + "epoch": 0.028433961366447807, + "grad_norm": 2.25822377204895, + "learning_rate": 4.990036449716937e-05, + "loss": 5.5808, + "step": 4781 + }, + { + "epoch": 0.028439908649728805, + "grad_norm": 2.3068299293518066, + "learning_rate": 4.990032283204136e-05, + "loss": 5.729, + "step": 4782 + }, + { + "epoch": 0.0284458559330098, + "grad_norm": 2.0582191944122314, + "learning_rate": 4.9900281158220905e-05, + "loss": 5.6877, + "step": 4783 + }, + { + "epoch": 0.0284518032162908, + "grad_norm": 2.572824239730835, + "learning_rate": 4.9900239475708015e-05, + "loss": 5.9522, + "step": 4784 + }, + { + "epoch": 0.028457750499571794, + "grad_norm": 2.299001693725586, + "learning_rate": 4.990019778450271e-05, + "loss": 5.7579, + "step": 4785 + }, + { + "epoch": 0.028463697782852793, + "grad_norm": 2.231381893157959, + "learning_rate": 4.990015608460501e-05, + "loss": 5.756, + "step": 4786 + }, + { + "epoch": 0.02846964506613379, + "grad_norm": 1.7982486486434937, + "learning_rate": 4.990011437601492e-05, + "loss": 5.8076, + "step": 4787 + }, + { + "epoch": 0.028475592349414786, + "grad_norm": 1.8788951635360718, + "learning_rate": 4.990007265873245e-05, + "loss": 5.8798, + "step": 4788 + }, + { + "epoch": 0.028481539632695785, + "grad_norm": 1.6190022230148315, + "learning_rate": 4.9900030932757623e-05, + "loss": 5.5695, + "step": 4789 + }, + { + "epoch": 0.028487486915976783, + "grad_norm": 1.9226019382476807, + "learning_rate": 4.9899989198090455e-05, + "loss": 5.671, + "step": 4790 + }, + { + "epoch": 0.02849343419925778, + "grad_norm": 1.7437139749526978, + "learning_rate": 4.989994745473097e-05, + "loss": 5.6728, + "step": 4791 + }, + { + "epoch": 0.028499381482538777, + "grad_norm": 1.624126672744751, + "learning_rate": 4.989990570267915e-05, + "loss": 5.6209, + "step": 4792 + }, + { + "epoch": 0.028505328765819772, + "grad_norm": 2.1894004344940186, + "learning_rate": 4.9899863941935046e-05, + "loss": 5.6669, + "step": 4793 + }, + { + "epoch": 0.02851127604910077, + "grad_norm": 2.2243428230285645, + "learning_rate": 4.9899822172498646e-05, + "loss": 5.4557, + "step": 4794 + }, + { + "epoch": 0.02851722333238177, + "grad_norm": 2.032611608505249, + "learning_rate": 4.989978039436998e-05, + "loss": 5.7883, + "step": 4795 + }, + { + "epoch": 0.028523170615662764, + "grad_norm": 1.8496538400650024, + "learning_rate": 4.989973860754906e-05, + "loss": 5.6329, + "step": 4796 + }, + { + "epoch": 0.028529117898943763, + "grad_norm": 1.7072707414627075, + "learning_rate": 4.989969681203589e-05, + "loss": 5.7242, + "step": 4797 + }, + { + "epoch": 0.02853506518222476, + "grad_norm": 1.7351912260055542, + "learning_rate": 4.9899655007830504e-05, + "loss": 5.648, + "step": 4798 + }, + { + "epoch": 0.028541012465505756, + "grad_norm": 2.514162302017212, + "learning_rate": 4.9899613194932904e-05, + "loss": 5.556, + "step": 4799 + }, + { + "epoch": 0.028546959748786755, + "grad_norm": 10.245063781738281, + "learning_rate": 4.98995713733431e-05, + "loss": 5.5922, + "step": 4800 + }, + { + "epoch": 0.02855290703206775, + "grad_norm": 2.012106418609619, + "learning_rate": 4.989952954306112e-05, + "loss": 5.5092, + "step": 4801 + }, + { + "epoch": 0.02855885431534875, + "grad_norm": 1.8654139041900635, + "learning_rate": 4.9899487704086966e-05, + "loss": 5.4164, + "step": 4802 + }, + { + "epoch": 0.028564801598629747, + "grad_norm": 1.778798222541809, + "learning_rate": 4.9899445856420656e-05, + "loss": 5.5537, + "step": 4803 + }, + { + "epoch": 0.028570748881910742, + "grad_norm": 2.205038547515869, + "learning_rate": 4.989940400006221e-05, + "loss": 5.9338, + "step": 4804 + }, + { + "epoch": 0.02857669616519174, + "grad_norm": 2.3908839225769043, + "learning_rate": 4.989936213501164e-05, + "loss": 5.8962, + "step": 4805 + }, + { + "epoch": 0.028582643448472736, + "grad_norm": 2.3438172340393066, + "learning_rate": 4.9899320261268966e-05, + "loss": 5.8133, + "step": 4806 + }, + { + "epoch": 0.028588590731753735, + "grad_norm": 2.4021737575531006, + "learning_rate": 4.989927837883419e-05, + "loss": 5.8366, + "step": 4807 + }, + { + "epoch": 0.028594538015034733, + "grad_norm": 1.9976004362106323, + "learning_rate": 4.989923648770734e-05, + "loss": 5.6976, + "step": 4808 + }, + { + "epoch": 0.028600485298315728, + "grad_norm": 2.2234697341918945, + "learning_rate": 4.989919458788841e-05, + "loss": 5.7871, + "step": 4809 + }, + { + "epoch": 0.028606432581596727, + "grad_norm": 2.203223705291748, + "learning_rate": 4.989915267937744e-05, + "loss": 5.5799, + "step": 4810 + }, + { + "epoch": 0.028612379864877725, + "grad_norm": 2.2155261039733887, + "learning_rate": 4.989911076217442e-05, + "loss": 5.6022, + "step": 4811 + }, + { + "epoch": 0.02861832714815872, + "grad_norm": 1.9379621744155884, + "learning_rate": 4.989906883627939e-05, + "loss": 5.8647, + "step": 4812 + }, + { + "epoch": 0.02862427443143972, + "grad_norm": 2.0589749813079834, + "learning_rate": 4.9899026901692345e-05, + "loss": 5.6048, + "step": 4813 + }, + { + "epoch": 0.028630221714720714, + "grad_norm": 2.3813774585723877, + "learning_rate": 4.9898984958413315e-05, + "loss": 5.6726, + "step": 4814 + }, + { + "epoch": 0.028636168998001713, + "grad_norm": 2.06425142288208, + "learning_rate": 4.98989430064423e-05, + "loss": 5.8505, + "step": 4815 + }, + { + "epoch": 0.02864211628128271, + "grad_norm": 2.199697494506836, + "learning_rate": 4.9898901045779326e-05, + "loss": 5.6114, + "step": 4816 + }, + { + "epoch": 0.028648063564563706, + "grad_norm": 2.136411428451538, + "learning_rate": 4.98988590764244e-05, + "loss": 5.3987, + "step": 4817 + }, + { + "epoch": 0.028654010847844705, + "grad_norm": 1.914929986000061, + "learning_rate": 4.9898817098377534e-05, + "loss": 5.702, + "step": 4818 + }, + { + "epoch": 0.028659958131125703, + "grad_norm": 2.316027879714966, + "learning_rate": 4.989877511163876e-05, + "loss": 5.5886, + "step": 4819 + }, + { + "epoch": 0.0286659054144067, + "grad_norm": 3.2775018215179443, + "learning_rate": 4.9898733116208076e-05, + "loss": 5.5337, + "step": 4820 + }, + { + "epoch": 0.028671852697687697, + "grad_norm": 2.16430926322937, + "learning_rate": 4.989869111208549e-05, + "loss": 5.7189, + "step": 4821 + }, + { + "epoch": 0.028677799980968692, + "grad_norm": 2.1936638355255127, + "learning_rate": 4.9898649099271046e-05, + "loss": 5.2942, + "step": 4822 + }, + { + "epoch": 0.02868374726424969, + "grad_norm": 2.262485980987549, + "learning_rate": 4.9898607077764736e-05, + "loss": 5.4284, + "step": 4823 + }, + { + "epoch": 0.02868969454753069, + "grad_norm": 1.7890170812606812, + "learning_rate": 4.989856504756657e-05, + "loss": 5.6021, + "step": 4824 + }, + { + "epoch": 0.028695641830811684, + "grad_norm": 1.747862696647644, + "learning_rate": 4.9898523008676585e-05, + "loss": 5.72, + "step": 4825 + }, + { + "epoch": 0.028701589114092683, + "grad_norm": 1.9750064611434937, + "learning_rate": 4.989848096109477e-05, + "loss": 5.8923, + "step": 4826 + }, + { + "epoch": 0.02870753639737368, + "grad_norm": 2.0249626636505127, + "learning_rate": 4.989843890482117e-05, + "loss": 5.4866, + "step": 4827 + }, + { + "epoch": 0.028713483680654676, + "grad_norm": 2.2737395763397217, + "learning_rate": 4.9898396839855765e-05, + "loss": 5.5498, + "step": 4828 + }, + { + "epoch": 0.028719430963935675, + "grad_norm": 2.2852187156677246, + "learning_rate": 4.98983547661986e-05, + "loss": 5.672, + "step": 4829 + }, + { + "epoch": 0.02872537824721667, + "grad_norm": 1.9441994428634644, + "learning_rate": 4.989831268384967e-05, + "loss": 5.4933, + "step": 4830 + }, + { + "epoch": 0.02873132553049767, + "grad_norm": 1.9561070203781128, + "learning_rate": 4.989827059280899e-05, + "loss": 5.7465, + "step": 4831 + }, + { + "epoch": 0.028737272813778667, + "grad_norm": 2.482849597930908, + "learning_rate": 4.9898228493076594e-05, + "loss": 5.4338, + "step": 4832 + }, + { + "epoch": 0.028743220097059662, + "grad_norm": 1.8582524061203003, + "learning_rate": 4.989818638465247e-05, + "loss": 5.5378, + "step": 4833 + }, + { + "epoch": 0.02874916738034066, + "grad_norm": 2.119783639907837, + "learning_rate": 4.9898144267536654e-05, + "loss": 5.6012, + "step": 4834 + }, + { + "epoch": 0.028755114663621656, + "grad_norm": 2.333965301513672, + "learning_rate": 4.989810214172915e-05, + "loss": 5.7376, + "step": 4835 + }, + { + "epoch": 0.028761061946902654, + "grad_norm": 2.600861072540283, + "learning_rate": 4.989806000722999e-05, + "loss": 6.2747, + "step": 4836 + }, + { + "epoch": 0.028767009230183653, + "grad_norm": 2.3250534534454346, + "learning_rate": 4.989801786403916e-05, + "loss": 5.5993, + "step": 4837 + }, + { + "epoch": 0.028772956513464648, + "grad_norm": 2.507377862930298, + "learning_rate": 4.9897975712156686e-05, + "loss": 5.3919, + "step": 4838 + }, + { + "epoch": 0.028778903796745647, + "grad_norm": 1.9882018566131592, + "learning_rate": 4.9897933551582596e-05, + "loss": 5.5939, + "step": 4839 + }, + { + "epoch": 0.028784851080026645, + "grad_norm": 2.235269784927368, + "learning_rate": 4.989789138231688e-05, + "loss": 5.4036, + "step": 4840 + }, + { + "epoch": 0.02879079836330764, + "grad_norm": 1.895071029663086, + "learning_rate": 4.989784920435959e-05, + "loss": 5.7259, + "step": 4841 + }, + { + "epoch": 0.02879674564658864, + "grad_norm": 2.0197908878326416, + "learning_rate": 4.989780701771071e-05, + "loss": 5.5114, + "step": 4842 + }, + { + "epoch": 0.028802692929869634, + "grad_norm": 1.9679557085037231, + "learning_rate": 4.989776482237025e-05, + "loss": 5.5798, + "step": 4843 + }, + { + "epoch": 0.028808640213150633, + "grad_norm": 1.980610728263855, + "learning_rate": 4.989772261833825e-05, + "loss": 5.5509, + "step": 4844 + }, + { + "epoch": 0.02881458749643163, + "grad_norm": 2.4565272331237793, + "learning_rate": 4.989768040561471e-05, + "loss": 5.4723, + "step": 4845 + }, + { + "epoch": 0.028820534779712626, + "grad_norm": 2.0567848682403564, + "learning_rate": 4.989763818419964e-05, + "loss": 5.546, + "step": 4846 + }, + { + "epoch": 0.028826482062993625, + "grad_norm": 2.0259108543395996, + "learning_rate": 4.989759595409307e-05, + "loss": 5.4138, + "step": 4847 + }, + { + "epoch": 0.028832429346274623, + "grad_norm": 1.9334442615509033, + "learning_rate": 4.9897553715295003e-05, + "loss": 5.7036, + "step": 4848 + }, + { + "epoch": 0.02883837662955562, + "grad_norm": 1.8335916996002197, + "learning_rate": 4.989751146780546e-05, + "loss": 5.6399, + "step": 4849 + }, + { + "epoch": 0.028844323912836617, + "grad_norm": 2.129821538925171, + "learning_rate": 4.989746921162445e-05, + "loss": 5.7108, + "step": 4850 + }, + { + "epoch": 0.028850271196117612, + "grad_norm": 2.4127001762390137, + "learning_rate": 4.9897426946751994e-05, + "loss": 5.3901, + "step": 4851 + }, + { + "epoch": 0.02885621847939861, + "grad_norm": 1.9506126642227173, + "learning_rate": 4.98973846731881e-05, + "loss": 5.7781, + "step": 4852 + }, + { + "epoch": 0.02886216576267961, + "grad_norm": 1.6746875047683716, + "learning_rate": 4.9897342390932786e-05, + "loss": 5.7408, + "step": 4853 + }, + { + "epoch": 0.028868113045960604, + "grad_norm": 1.95681893825531, + "learning_rate": 4.989730009998607e-05, + "loss": 5.7181, + "step": 4854 + }, + { + "epoch": 0.028874060329241603, + "grad_norm": 1.782030701637268, + "learning_rate": 4.9897257800347964e-05, + "loss": 5.5901, + "step": 4855 + }, + { + "epoch": 0.0288800076125226, + "grad_norm": 1.7590057849884033, + "learning_rate": 4.9897215492018476e-05, + "loss": 5.4566, + "step": 4856 + }, + { + "epoch": 0.028885954895803596, + "grad_norm": 2.4675025939941406, + "learning_rate": 4.989717317499764e-05, + "loss": 5.7738, + "step": 4857 + }, + { + "epoch": 0.028891902179084595, + "grad_norm": 2.221975326538086, + "learning_rate": 4.989713084928545e-05, + "loss": 5.591, + "step": 4858 + }, + { + "epoch": 0.02889784946236559, + "grad_norm": 2.21158504486084, + "learning_rate": 4.989708851488192e-05, + "loss": 5.7755, + "step": 4859 + }, + { + "epoch": 0.02890379674564659, + "grad_norm": 2.2253987789154053, + "learning_rate": 4.989704617178709e-05, + "loss": 5.8653, + "step": 4860 + }, + { + "epoch": 0.028909744028927587, + "grad_norm": 2.3298027515411377, + "learning_rate": 4.989700382000094e-05, + "loss": 5.3371, + "step": 4861 + }, + { + "epoch": 0.028915691312208582, + "grad_norm": 2.1918935775756836, + "learning_rate": 4.989696145952352e-05, + "loss": 5.4893, + "step": 4862 + }, + { + "epoch": 0.02892163859548958, + "grad_norm": 2.422117233276367, + "learning_rate": 4.989691909035482e-05, + "loss": 5.8775, + "step": 4863 + }, + { + "epoch": 0.02892758587877058, + "grad_norm": 2.4346981048583984, + "learning_rate": 4.989687671249487e-05, + "loss": 6.3671, + "step": 4864 + }, + { + "epoch": 0.028933533162051574, + "grad_norm": 2.094780921936035, + "learning_rate": 4.989683432594367e-05, + "loss": 5.7814, + "step": 4865 + }, + { + "epoch": 0.028939480445332573, + "grad_norm": 2.240318775177002, + "learning_rate": 4.9896791930701244e-05, + "loss": 5.6606, + "step": 4866 + }, + { + "epoch": 0.028945427728613568, + "grad_norm": 2.102381706237793, + "learning_rate": 4.989674952676761e-05, + "loss": 5.8477, + "step": 4867 + }, + { + "epoch": 0.028951375011894567, + "grad_norm": 2.2786238193511963, + "learning_rate": 4.989670711414277e-05, + "loss": 5.8786, + "step": 4868 + }, + { + "epoch": 0.028957322295175565, + "grad_norm": 2.079899549484253, + "learning_rate": 4.989666469282675e-05, + "loss": 6.2171, + "step": 4869 + }, + { + "epoch": 0.02896326957845656, + "grad_norm": 2.024061679840088, + "learning_rate": 4.989662226281956e-05, + "loss": 6.2889, + "step": 4870 + }, + { + "epoch": 0.02896921686173756, + "grad_norm": 2.1397578716278076, + "learning_rate": 4.989657982412122e-05, + "loss": 6.2477, + "step": 4871 + }, + { + "epoch": 0.028975164145018554, + "grad_norm": 2.1303393840789795, + "learning_rate": 4.989653737673174e-05, + "loss": 6.3005, + "step": 4872 + }, + { + "epoch": 0.028981111428299552, + "grad_norm": 2.4091451168060303, + "learning_rate": 4.989649492065114e-05, + "loss": 5.997, + "step": 4873 + }, + { + "epoch": 0.02898705871158055, + "grad_norm": 2.2236886024475098, + "learning_rate": 4.989645245587942e-05, + "loss": 5.7886, + "step": 4874 + }, + { + "epoch": 0.028993005994861546, + "grad_norm": 2.6160736083984375, + "learning_rate": 4.989640998241661e-05, + "loss": 6.1542, + "step": 4875 + }, + { + "epoch": 0.028998953278142545, + "grad_norm": 2.4163296222686768, + "learning_rate": 4.989636750026273e-05, + "loss": 6.392, + "step": 4876 + }, + { + "epoch": 0.029004900561423543, + "grad_norm": 2.079172372817993, + "learning_rate": 4.989632500941778e-05, + "loss": 6.2886, + "step": 4877 + }, + { + "epoch": 0.02901084784470454, + "grad_norm": 2.628694772720337, + "learning_rate": 4.989628250988178e-05, + "loss": 6.0359, + "step": 4878 + }, + { + "epoch": 0.029016795127985537, + "grad_norm": 2.2080392837524414, + "learning_rate": 4.989624000165474e-05, + "loss": 5.9916, + "step": 4879 + }, + { + "epoch": 0.029022742411266532, + "grad_norm": 2.4130380153656006, + "learning_rate": 4.9896197484736685e-05, + "loss": 6.3835, + "step": 4880 + }, + { + "epoch": 0.02902868969454753, + "grad_norm": 2.328511953353882, + "learning_rate": 4.989615495912762e-05, + "loss": 5.838, + "step": 4881 + }, + { + "epoch": 0.02903463697782853, + "grad_norm": 2.273345470428467, + "learning_rate": 4.989611242482757e-05, + "loss": 5.8764, + "step": 4882 + }, + { + "epoch": 0.029040584261109524, + "grad_norm": 2.1498537063598633, + "learning_rate": 4.9896069881836535e-05, + "loss": 6.1562, + "step": 4883 + }, + { + "epoch": 0.029046531544390523, + "grad_norm": 2.497267723083496, + "learning_rate": 4.989602733015455e-05, + "loss": 5.6708, + "step": 4884 + }, + { + "epoch": 0.02905247882767152, + "grad_norm": 2.232802152633667, + "learning_rate": 4.989598476978161e-05, + "loss": 5.6854, + "step": 4885 + }, + { + "epoch": 0.029058426110952516, + "grad_norm": 2.0582375526428223, + "learning_rate": 4.989594220071775e-05, + "loss": 6.5288, + "step": 4886 + }, + { + "epoch": 0.029064373394233515, + "grad_norm": 3.2556731700897217, + "learning_rate": 4.989589962296296e-05, + "loss": 5.9985, + "step": 4887 + }, + { + "epoch": 0.02907032067751451, + "grad_norm": 2.2807655334472656, + "learning_rate": 4.989585703651728e-05, + "loss": 6.1802, + "step": 4888 + }, + { + "epoch": 0.02907626796079551, + "grad_norm": 2.379136085510254, + "learning_rate": 4.989581444138071e-05, + "loss": 6.3531, + "step": 4889 + }, + { + "epoch": 0.029082215244076507, + "grad_norm": 2.9518685340881348, + "learning_rate": 4.989577183755327e-05, + "loss": 6.0689, + "step": 4890 + }, + { + "epoch": 0.029088162527357502, + "grad_norm": 2.823340654373169, + "learning_rate": 4.9895729225034973e-05, + "loss": 6.3405, + "step": 4891 + }, + { + "epoch": 0.0290941098106385, + "grad_norm": 2.4327731132507324, + "learning_rate": 4.989568660382583e-05, + "loss": 6.4928, + "step": 4892 + }, + { + "epoch": 0.0291000570939195, + "grad_norm": 2.0744240283966064, + "learning_rate": 4.9895643973925864e-05, + "loss": 6.2664, + "step": 4893 + }, + { + "epoch": 0.029106004377200494, + "grad_norm": 2.373710870742798, + "learning_rate": 4.9895601335335085e-05, + "loss": 5.9738, + "step": 4894 + }, + { + "epoch": 0.029111951660481493, + "grad_norm": 2.2934412956237793, + "learning_rate": 4.9895558688053505e-05, + "loss": 6.1353, + "step": 4895 + }, + { + "epoch": 0.029117898943762488, + "grad_norm": 2.4360926151275635, + "learning_rate": 4.989551603208114e-05, + "loss": 5.4768, + "step": 4896 + }, + { + "epoch": 0.029123846227043487, + "grad_norm": 2.8072469234466553, + "learning_rate": 4.989547336741802e-05, + "loss": 5.977, + "step": 4897 + }, + { + "epoch": 0.029129793510324485, + "grad_norm": 2.7759921550750732, + "learning_rate": 4.9895430694064135e-05, + "loss": 6.3918, + "step": 4898 + }, + { + "epoch": 0.02913574079360548, + "grad_norm": 2.4547574520111084, + "learning_rate": 4.989538801201953e-05, + "loss": 6.0461, + "step": 4899 + }, + { + "epoch": 0.02914168807688648, + "grad_norm": 2.6097168922424316, + "learning_rate": 4.9895345321284184e-05, + "loss": 5.88, + "step": 4900 + }, + { + "epoch": 0.029147635360167474, + "grad_norm": 2.8312575817108154, + "learning_rate": 4.989530262185814e-05, + "loss": 6.0314, + "step": 4901 + }, + { + "epoch": 0.029153582643448472, + "grad_norm": 2.928974151611328, + "learning_rate": 4.98952599137414e-05, + "loss": 6.3698, + "step": 4902 + }, + { + "epoch": 0.02915952992672947, + "grad_norm": 2.527578115463257, + "learning_rate": 4.989521719693398e-05, + "loss": 6.4301, + "step": 4903 + }, + { + "epoch": 0.029165477210010466, + "grad_norm": 2.392106771469116, + "learning_rate": 4.9895174471435904e-05, + "loss": 6.3515, + "step": 4904 + }, + { + "epoch": 0.029171424493291465, + "grad_norm": 1.9899437427520752, + "learning_rate": 4.989513173724717e-05, + "loss": 6.3265, + "step": 4905 + }, + { + "epoch": 0.029177371776572463, + "grad_norm": 2.057600736618042, + "learning_rate": 4.9895088994367806e-05, + "loss": 6.2402, + "step": 4906 + }, + { + "epoch": 0.029183319059853458, + "grad_norm": 2.8310391902923584, + "learning_rate": 4.989504624279783e-05, + "loss": 5.9056, + "step": 4907 + }, + { + "epoch": 0.029189266343134457, + "grad_norm": 2.904785394668579, + "learning_rate": 4.989500348253724e-05, + "loss": 5.8847, + "step": 4908 + }, + { + "epoch": 0.029195213626415452, + "grad_norm": 2.7728030681610107, + "learning_rate": 4.989496071358607e-05, + "loss": 5.8997, + "step": 4909 + }, + { + "epoch": 0.02920116090969645, + "grad_norm": 2.768862009048462, + "learning_rate": 4.989491793594432e-05, + "loss": 6.1267, + "step": 4910 + }, + { + "epoch": 0.02920710819297745, + "grad_norm": 2.4353668689727783, + "learning_rate": 4.989487514961201e-05, + "loss": 5.9087, + "step": 4911 + }, + { + "epoch": 0.029213055476258444, + "grad_norm": 2.5170469284057617, + "learning_rate": 4.9894832354589164e-05, + "loss": 6.0971, + "step": 4912 + }, + { + "epoch": 0.029219002759539443, + "grad_norm": 2.345998764038086, + "learning_rate": 4.9894789550875784e-05, + "loss": 6.2518, + "step": 4913 + }, + { + "epoch": 0.02922495004282044, + "grad_norm": 2.429123878479004, + "learning_rate": 4.98947467384719e-05, + "loss": 6.238, + "step": 4914 + }, + { + "epoch": 0.029230897326101436, + "grad_norm": 2.531514883041382, + "learning_rate": 4.9894703917377506e-05, + "loss": 6.0177, + "step": 4915 + }, + { + "epoch": 0.029236844609382435, + "grad_norm": 2.833874464035034, + "learning_rate": 4.9894661087592634e-05, + "loss": 6.2018, + "step": 4916 + }, + { + "epoch": 0.02924279189266343, + "grad_norm": 2.521381378173828, + "learning_rate": 4.9894618249117287e-05, + "loss": 6.1777, + "step": 4917 + }, + { + "epoch": 0.02924873917594443, + "grad_norm": 2.731703758239746, + "learning_rate": 4.989457540195149e-05, + "loss": 6.0237, + "step": 4918 + }, + { + "epoch": 0.029254686459225427, + "grad_norm": 2.918398141860962, + "learning_rate": 4.989453254609525e-05, + "loss": 6.5688, + "step": 4919 + }, + { + "epoch": 0.029260633742506422, + "grad_norm": 2.407552480697632, + "learning_rate": 4.989448968154859e-05, + "loss": 5.9751, + "step": 4920 + }, + { + "epoch": 0.02926658102578742, + "grad_norm": 2.575258731842041, + "learning_rate": 4.989444680831152e-05, + "loss": 5.7587, + "step": 4921 + }, + { + "epoch": 0.02927252830906842, + "grad_norm": 2.6550750732421875, + "learning_rate": 4.989440392638406e-05, + "loss": 6.6404, + "step": 4922 + }, + { + "epoch": 0.029278475592349414, + "grad_norm": 2.569438934326172, + "learning_rate": 4.989436103576621e-05, + "loss": 5.8615, + "step": 4923 + }, + { + "epoch": 0.029284422875630413, + "grad_norm": 2.4601991176605225, + "learning_rate": 4.989431813645801e-05, + "loss": 5.8969, + "step": 4924 + }, + { + "epoch": 0.029290370158911408, + "grad_norm": 3.579819917678833, + "learning_rate": 4.989427522845945e-05, + "loss": 5.8832, + "step": 4925 + }, + { + "epoch": 0.029296317442192406, + "grad_norm": 2.5762264728546143, + "learning_rate": 4.9894232311770556e-05, + "loss": 5.4841, + "step": 4926 + }, + { + "epoch": 0.029302264725473405, + "grad_norm": 3.352381706237793, + "learning_rate": 4.989418938639134e-05, + "loss": 5.8936, + "step": 4927 + }, + { + "epoch": 0.0293082120087544, + "grad_norm": 2.824322462081909, + "learning_rate": 4.9894146452321835e-05, + "loss": 5.8291, + "step": 4928 + }, + { + "epoch": 0.0293141592920354, + "grad_norm": 2.6431384086608887, + "learning_rate": 4.9894103509562026e-05, + "loss": 6.2519, + "step": 4929 + }, + { + "epoch": 0.029320106575316394, + "grad_norm": 3.0580949783325195, + "learning_rate": 4.989406055811195e-05, + "loss": 6.4141, + "step": 4930 + }, + { + "epoch": 0.029326053858597392, + "grad_norm": 2.757420778274536, + "learning_rate": 4.989401759797161e-05, + "loss": 6.1427, + "step": 4931 + }, + { + "epoch": 0.02933200114187839, + "grad_norm": 2.713111639022827, + "learning_rate": 4.989397462914103e-05, + "loss": 6.4107, + "step": 4932 + }, + { + "epoch": 0.029337948425159386, + "grad_norm": 2.7954351902008057, + "learning_rate": 4.9893931651620215e-05, + "loss": 5.7657, + "step": 4933 + }, + { + "epoch": 0.029343895708440385, + "grad_norm": 2.3637917041778564, + "learning_rate": 4.9893888665409196e-05, + "loss": 5.8209, + "step": 4934 + }, + { + "epoch": 0.029349842991721383, + "grad_norm": 2.938631296157837, + "learning_rate": 4.9893845670507964e-05, + "loss": 6.0502, + "step": 4935 + }, + { + "epoch": 0.029355790275002378, + "grad_norm": 2.8911824226379395, + "learning_rate": 4.989380266691655e-05, + "loss": 5.9736, + "step": 4936 + }, + { + "epoch": 0.029361737558283377, + "grad_norm": 2.9410245418548584, + "learning_rate": 4.989375965463498e-05, + "loss": 5.2824, + "step": 4937 + }, + { + "epoch": 0.029367684841564372, + "grad_norm": 2.4925217628479004, + "learning_rate": 4.9893716633663244e-05, + "loss": 5.5829, + "step": 4938 + }, + { + "epoch": 0.02937363212484537, + "grad_norm": 2.485349178314209, + "learning_rate": 4.9893673604001366e-05, + "loss": 5.8812, + "step": 4939 + }, + { + "epoch": 0.02937957940812637, + "grad_norm": 2.3950133323669434, + "learning_rate": 4.9893630565649376e-05, + "loss": 5.9314, + "step": 4940 + }, + { + "epoch": 0.029385526691407364, + "grad_norm": 2.28104829788208, + "learning_rate": 4.989358751860726e-05, + "loss": 6.1768, + "step": 4941 + }, + { + "epoch": 0.029391473974688363, + "grad_norm": 2.4479010105133057, + "learning_rate": 4.989354446287507e-05, + "loss": 6.1645, + "step": 4942 + }, + { + "epoch": 0.02939742125796936, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.989350139845279e-05, + "loss": 5.7145, + "step": 4943 + }, + { + "epoch": 0.029403368541250356, + "grad_norm": 2.4120032787323, + "learning_rate": 4.989345832534045e-05, + "loss": 5.695, + "step": 4944 + }, + { + "epoch": 0.029409315824531355, + "grad_norm": 2.6345109939575195, + "learning_rate": 4.989341524353805e-05, + "loss": 5.4805, + "step": 4945 + }, + { + "epoch": 0.02941526310781235, + "grad_norm": 2.8750240802764893, + "learning_rate": 4.989337215304563e-05, + "loss": 5.0352, + "step": 4946 + }, + { + "epoch": 0.02942121039109335, + "grad_norm": 2.7220489978790283, + "learning_rate": 4.989332905386318e-05, + "loss": 5.1646, + "step": 4947 + }, + { + "epoch": 0.029427157674374347, + "grad_norm": 2.464871883392334, + "learning_rate": 4.9893285945990734e-05, + "loss": 4.9989, + "step": 4948 + }, + { + "epoch": 0.029433104957655342, + "grad_norm": 2.261049270629883, + "learning_rate": 4.989324282942829e-05, + "loss": 6.2217, + "step": 4949 + }, + { + "epoch": 0.02943905224093634, + "grad_norm": 2.224818468093872, + "learning_rate": 4.9893199704175876e-05, + "loss": 6.3964, + "step": 4950 + }, + { + "epoch": 0.02944499952421734, + "grad_norm": 2.366520643234253, + "learning_rate": 4.989315657023351e-05, + "loss": 6.3572, + "step": 4951 + }, + { + "epoch": 0.029450946807498334, + "grad_norm": 2.4811010360717773, + "learning_rate": 4.989311342760119e-05, + "loss": 5.7867, + "step": 4952 + }, + { + "epoch": 0.029456894090779333, + "grad_norm": 2.246730089187622, + "learning_rate": 4.989307027627895e-05, + "loss": 6.0865, + "step": 4953 + }, + { + "epoch": 0.029462841374060328, + "grad_norm": 2.297379493713379, + "learning_rate": 4.989302711626679e-05, + "loss": 5.9257, + "step": 4954 + }, + { + "epoch": 0.029468788657341326, + "grad_norm": 2.5890488624572754, + "learning_rate": 4.989298394756473e-05, + "loss": 5.7631, + "step": 4955 + }, + { + "epoch": 0.029474735940622325, + "grad_norm": 3.3777449131011963, + "learning_rate": 4.989294077017279e-05, + "loss": 5.4014, + "step": 4956 + }, + { + "epoch": 0.02948068322390332, + "grad_norm": 2.0395402908325195, + "learning_rate": 4.9892897584090986e-05, + "loss": 6.2429, + "step": 4957 + }, + { + "epoch": 0.02948663050718432, + "grad_norm": 2.0414693355560303, + "learning_rate": 4.989285438931932e-05, + "loss": 6.4685, + "step": 4958 + }, + { + "epoch": 0.029492577790465314, + "grad_norm": 2.2383265495300293, + "learning_rate": 4.989281118585783e-05, + "loss": 6.1651, + "step": 4959 + }, + { + "epoch": 0.029498525073746312, + "grad_norm": 2.559720754623413, + "learning_rate": 4.98927679737065e-05, + "loss": 6.3822, + "step": 4960 + }, + { + "epoch": 0.02950447235702731, + "grad_norm": 2.810699939727783, + "learning_rate": 4.989272475286537e-05, + "loss": 6.2076, + "step": 4961 + }, + { + "epoch": 0.029510419640308306, + "grad_norm": 2.9151525497436523, + "learning_rate": 4.989268152333445e-05, + "loss": 5.9892, + "step": 4962 + }, + { + "epoch": 0.029516366923589304, + "grad_norm": 2.295197010040283, + "learning_rate": 4.9892638285113744e-05, + "loss": 6.1392, + "step": 4963 + }, + { + "epoch": 0.029522314206870303, + "grad_norm": 2.271088123321533, + "learning_rate": 4.989259503820328e-05, + "loss": 6.6991, + "step": 4964 + }, + { + "epoch": 0.029528261490151298, + "grad_norm": 2.338074207305908, + "learning_rate": 4.9892551782603064e-05, + "loss": 5.9615, + "step": 4965 + }, + { + "epoch": 0.029534208773432297, + "grad_norm": 2.3510494232177734, + "learning_rate": 4.989250851831312e-05, + "loss": 5.8894, + "step": 4966 + }, + { + "epoch": 0.029540156056713292, + "grad_norm": 2.1170454025268555, + "learning_rate": 4.989246524533345e-05, + "loss": 5.6921, + "step": 4967 + }, + { + "epoch": 0.02954610333999429, + "grad_norm": 3.289508104324341, + "learning_rate": 4.989242196366409e-05, + "loss": 6.1689, + "step": 4968 + }, + { + "epoch": 0.02955205062327529, + "grad_norm": 2.068229913711548, + "learning_rate": 4.989237867330504e-05, + "loss": 6.3342, + "step": 4969 + }, + { + "epoch": 0.029557997906556284, + "grad_norm": 2.198928117752075, + "learning_rate": 4.9892335374256316e-05, + "loss": 6.5125, + "step": 4970 + }, + { + "epoch": 0.029563945189837283, + "grad_norm": 2.3634228706359863, + "learning_rate": 4.989229206651793e-05, + "loss": 5.8328, + "step": 4971 + }, + { + "epoch": 0.02956989247311828, + "grad_norm": 2.1632115840911865, + "learning_rate": 4.989224875008991e-05, + "loss": 6.0702, + "step": 4972 + }, + { + "epoch": 0.029575839756399276, + "grad_norm": 2.461888313293457, + "learning_rate": 4.989220542497226e-05, + "loss": 6.01, + "step": 4973 + }, + { + "epoch": 0.029581787039680275, + "grad_norm": 2.668333053588867, + "learning_rate": 4.9892162091164997e-05, + "loss": 6.0369, + "step": 4974 + }, + { + "epoch": 0.02958773432296127, + "grad_norm": 3.0210723876953125, + "learning_rate": 4.9892118748668135e-05, + "loss": 6.0652, + "step": 4975 + }, + { + "epoch": 0.02959368160624227, + "grad_norm": 2.937350034713745, + "learning_rate": 4.98920753974817e-05, + "loss": 6.0205, + "step": 4976 + }, + { + "epoch": 0.029599628889523267, + "grad_norm": 2.904499053955078, + "learning_rate": 4.9892032037605685e-05, + "loss": 5.9561, + "step": 4977 + }, + { + "epoch": 0.029605576172804262, + "grad_norm": 2.218867778778076, + "learning_rate": 4.989198866904013e-05, + "loss": 5.4173, + "step": 4978 + }, + { + "epoch": 0.02961152345608526, + "grad_norm": 3.009920835494995, + "learning_rate": 4.9891945291785034e-05, + "loss": 5.5577, + "step": 4979 + }, + { + "epoch": 0.02961747073936626, + "grad_norm": 2.731687545776367, + "learning_rate": 4.9891901905840424e-05, + "loss": 5.6591, + "step": 4980 + }, + { + "epoch": 0.029623418022647254, + "grad_norm": 2.244101047515869, + "learning_rate": 4.98918585112063e-05, + "loss": 6.1434, + "step": 4981 + }, + { + "epoch": 0.029629365305928253, + "grad_norm": 2.3366870880126953, + "learning_rate": 4.989181510788269e-05, + "loss": 6.0132, + "step": 4982 + }, + { + "epoch": 0.029635312589209248, + "grad_norm": 3.2757890224456787, + "learning_rate": 4.98917716958696e-05, + "loss": 5.7486, + "step": 4983 + }, + { + "epoch": 0.029641259872490246, + "grad_norm": 2.361041784286499, + "learning_rate": 4.989172827516705e-05, + "loss": 5.8192, + "step": 4984 + }, + { + "epoch": 0.029647207155771245, + "grad_norm": 3.3433775901794434, + "learning_rate": 4.9891684845775054e-05, + "loss": 5.8688, + "step": 4985 + }, + { + "epoch": 0.02965315443905224, + "grad_norm": 2.6427462100982666, + "learning_rate": 4.9891641407693635e-05, + "loss": 5.9459, + "step": 4986 + }, + { + "epoch": 0.02965910172233324, + "grad_norm": 3.0931055545806885, + "learning_rate": 4.9891597960922795e-05, + "loss": 6.4822, + "step": 4987 + }, + { + "epoch": 0.029665049005614237, + "grad_norm": 2.598477840423584, + "learning_rate": 4.989155450546256e-05, + "loss": 6.0362, + "step": 4988 + }, + { + "epoch": 0.029670996288895232, + "grad_norm": 2.460313081741333, + "learning_rate": 4.989151104131294e-05, + "loss": 5.6209, + "step": 4989 + }, + { + "epoch": 0.02967694357217623, + "grad_norm": 2.4712390899658203, + "learning_rate": 4.989146756847395e-05, + "loss": 6.3849, + "step": 4990 + }, + { + "epoch": 0.029682890855457226, + "grad_norm": 2.365860939025879, + "learning_rate": 4.98914240869456e-05, + "loss": 6.2791, + "step": 4991 + }, + { + "epoch": 0.029688838138738224, + "grad_norm": 2.6213366985321045, + "learning_rate": 4.9891380596727915e-05, + "loss": 6.2888, + "step": 4992 + }, + { + "epoch": 0.029694785422019223, + "grad_norm": 2.742213487625122, + "learning_rate": 4.989133709782091e-05, + "loss": 6.3522, + "step": 4993 + }, + { + "epoch": 0.029700732705300218, + "grad_norm": 2.2428665161132812, + "learning_rate": 4.9891293590224594e-05, + "loss": 6.6735, + "step": 4994 + }, + { + "epoch": 0.029706679988581217, + "grad_norm": 2.4242279529571533, + "learning_rate": 4.989125007393898e-05, + "loss": 6.2283, + "step": 4995 + }, + { + "epoch": 0.02971262727186221, + "grad_norm": 2.422177314758301, + "learning_rate": 4.989120654896409e-05, + "loss": 6.0273, + "step": 4996 + }, + { + "epoch": 0.02971857455514321, + "grad_norm": 2.4325926303863525, + "learning_rate": 4.989116301529994e-05, + "loss": 5.9504, + "step": 4997 + }, + { + "epoch": 0.02972452183842421, + "grad_norm": 2.42901873588562, + "learning_rate": 4.9891119472946544e-05, + "loss": 5.8156, + "step": 4998 + }, + { + "epoch": 0.029730469121705204, + "grad_norm": 2.4361307621002197, + "learning_rate": 4.989107592190391e-05, + "loss": 5.9025, + "step": 4999 + }, + { + "epoch": 0.029736416404986202, + "grad_norm": 2.9486470222473145, + "learning_rate": 4.9891032362172065e-05, + "loss": 6.3204, + "step": 5000 + }, + { + "epoch": 0.0297423636882672, + "grad_norm": 2.456681966781616, + "learning_rate": 4.989098879375101e-05, + "loss": 5.8203, + "step": 5001 + }, + { + "epoch": 0.029748310971548196, + "grad_norm": 2.5065391063690186, + "learning_rate": 4.9890945216640775e-05, + "loss": 6.452, + "step": 5002 + }, + { + "epoch": 0.029754258254829195, + "grad_norm": 2.386488199234009, + "learning_rate": 4.989090163084136e-05, + "loss": 5.9195, + "step": 5003 + }, + { + "epoch": 0.02976020553811019, + "grad_norm": 2.1387040615081787, + "learning_rate": 4.9890858036352796e-05, + "loss": 6.2127, + "step": 5004 + }, + { + "epoch": 0.02976615282139119, + "grad_norm": 2.518099784851074, + "learning_rate": 4.989081443317508e-05, + "loss": 6.1099, + "step": 5005 + }, + { + "epoch": 0.029772100104672187, + "grad_norm": 3.2108826637268066, + "learning_rate": 4.989077082130825e-05, + "loss": 5.9808, + "step": 5006 + }, + { + "epoch": 0.029778047387953182, + "grad_norm": 2.176065444946289, + "learning_rate": 4.9890727200752304e-05, + "loss": 6.0825, + "step": 5007 + }, + { + "epoch": 0.02978399467123418, + "grad_norm": 2.2961249351501465, + "learning_rate": 4.9890683571507265e-05, + "loss": 5.968, + "step": 5008 + }, + { + "epoch": 0.02978994195451518, + "grad_norm": 2.1954386234283447, + "learning_rate": 4.9890639933573144e-05, + "loss": 6.0799, + "step": 5009 + }, + { + "epoch": 0.029795889237796174, + "grad_norm": 2.256039619445801, + "learning_rate": 4.989059628694995e-05, + "loss": 5.9503, + "step": 5010 + }, + { + "epoch": 0.029801836521077173, + "grad_norm": 2.4350922107696533, + "learning_rate": 4.9890552631637715e-05, + "loss": 5.6741, + "step": 5011 + }, + { + "epoch": 0.029807783804358168, + "grad_norm": 2.68904447555542, + "learning_rate": 4.989050896763645e-05, + "loss": 5.5872, + "step": 5012 + }, + { + "epoch": 0.029813731087639166, + "grad_norm": 2.2877871990203857, + "learning_rate": 4.989046529494615e-05, + "loss": 6.1273, + "step": 5013 + }, + { + "epoch": 0.029819678370920165, + "grad_norm": 2.350348711013794, + "learning_rate": 4.989042161356686e-05, + "loss": 6.1113, + "step": 5014 + }, + { + "epoch": 0.02982562565420116, + "grad_norm": 2.295382499694824, + "learning_rate": 4.989037792349858e-05, + "loss": 6.036, + "step": 5015 + }, + { + "epoch": 0.02983157293748216, + "grad_norm": 2.317863941192627, + "learning_rate": 4.989033422474131e-05, + "loss": 5.961, + "step": 5016 + }, + { + "epoch": 0.029837520220763157, + "grad_norm": 2.286289930343628, + "learning_rate": 4.9890290517295095e-05, + "loss": 5.8163, + "step": 5017 + }, + { + "epoch": 0.029843467504044152, + "grad_norm": 2.246863842010498, + "learning_rate": 4.989024680115993e-05, + "loss": 5.9689, + "step": 5018 + }, + { + "epoch": 0.02984941478732515, + "grad_norm": 1.8732661008834839, + "learning_rate": 4.989020307633585e-05, + "loss": 5.9046, + "step": 5019 + }, + { + "epoch": 0.029855362070606146, + "grad_norm": 2.0211753845214844, + "learning_rate": 4.989015934282285e-05, + "loss": 5.95, + "step": 5020 + }, + { + "epoch": 0.029861309353887144, + "grad_norm": 2.014890193939209, + "learning_rate": 4.9890115600620946e-05, + "loss": 5.7312, + "step": 5021 + }, + { + "epoch": 0.029867256637168143, + "grad_norm": 2.2749524116516113, + "learning_rate": 4.989007184973017e-05, + "loss": 6.2573, + "step": 5022 + }, + { + "epoch": 0.029873203920449138, + "grad_norm": 2.080747604370117, + "learning_rate": 4.989002809015052e-05, + "loss": 5.7607, + "step": 5023 + }, + { + "epoch": 0.029879151203730137, + "grad_norm": 2.3403279781341553, + "learning_rate": 4.988998432188202e-05, + "loss": 5.7876, + "step": 5024 + }, + { + "epoch": 0.02988509848701113, + "grad_norm": 2.573802947998047, + "learning_rate": 4.988994054492468e-05, + "loss": 5.9036, + "step": 5025 + }, + { + "epoch": 0.02989104577029213, + "grad_norm": 2.267409324645996, + "learning_rate": 4.988989675927853e-05, + "loss": 5.7433, + "step": 5026 + }, + { + "epoch": 0.02989699305357313, + "grad_norm": 2.8241517543792725, + "learning_rate": 4.9889852964943566e-05, + "loss": 6.2338, + "step": 5027 + }, + { + "epoch": 0.029902940336854124, + "grad_norm": 2.338927745819092, + "learning_rate": 4.988980916191982e-05, + "loss": 6.0226, + "step": 5028 + }, + { + "epoch": 0.029908887620135122, + "grad_norm": 2.0798492431640625, + "learning_rate": 4.9889765350207285e-05, + "loss": 5.6919, + "step": 5029 + }, + { + "epoch": 0.02991483490341612, + "grad_norm": 2.3199923038482666, + "learning_rate": 4.9889721529806e-05, + "loss": 5.7533, + "step": 5030 + }, + { + "epoch": 0.029920782186697116, + "grad_norm": 2.1074399948120117, + "learning_rate": 4.988967770071596e-05, + "loss": 5.7486, + "step": 5031 + }, + { + "epoch": 0.029926729469978115, + "grad_norm": 2.2539381980895996, + "learning_rate": 4.9889633862937205e-05, + "loss": 5.6816, + "step": 5032 + }, + { + "epoch": 0.02993267675325911, + "grad_norm": 2.1393015384674072, + "learning_rate": 4.9889590016469726e-05, + "loss": 5.6635, + "step": 5033 + }, + { + "epoch": 0.029938624036540108, + "grad_norm": 2.6661975383758545, + "learning_rate": 4.988954616131355e-05, + "loss": 6.0218, + "step": 5034 + }, + { + "epoch": 0.029944571319821107, + "grad_norm": 2.6529600620269775, + "learning_rate": 4.988950229746869e-05, + "loss": 5.8847, + "step": 5035 + }, + { + "epoch": 0.029950518603102102, + "grad_norm": 2.510859966278076, + "learning_rate": 4.988945842493517e-05, + "loss": 5.7154, + "step": 5036 + }, + { + "epoch": 0.0299564658863831, + "grad_norm": 2.875394105911255, + "learning_rate": 4.9889414543712985e-05, + "loss": 5.6304, + "step": 5037 + }, + { + "epoch": 0.0299624131696641, + "grad_norm": 2.718808650970459, + "learning_rate": 4.988937065380217e-05, + "loss": 5.6562, + "step": 5038 + }, + { + "epoch": 0.029968360452945094, + "grad_norm": 2.702265501022339, + "learning_rate": 4.988932675520273e-05, + "loss": 5.6484, + "step": 5039 + }, + { + "epoch": 0.029974307736226093, + "grad_norm": 2.765209436416626, + "learning_rate": 4.988928284791469e-05, + "loss": 5.793, + "step": 5040 + }, + { + "epoch": 0.029980255019507088, + "grad_norm": 3.386352062225342, + "learning_rate": 4.9889238931938047e-05, + "loss": 5.5392, + "step": 5041 + }, + { + "epoch": 0.029986202302788086, + "grad_norm": 2.1632583141326904, + "learning_rate": 4.988919500727284e-05, + "loss": 5.8032, + "step": 5042 + }, + { + "epoch": 0.029992149586069085, + "grad_norm": 2.4121060371398926, + "learning_rate": 4.9889151073919064e-05, + "loss": 5.9793, + "step": 5043 + }, + { + "epoch": 0.02999809686935008, + "grad_norm": 2.2160584926605225, + "learning_rate": 4.988910713187674e-05, + "loss": 5.8802, + "step": 5044 + }, + { + "epoch": 0.03000404415263108, + "grad_norm": 3.120509386062622, + "learning_rate": 4.988906318114589e-05, + "loss": 5.5691, + "step": 5045 + }, + { + "epoch": 0.030009991435912077, + "grad_norm": 3.0660078525543213, + "learning_rate": 4.988901922172652e-05, + "loss": 5.3687, + "step": 5046 + }, + { + "epoch": 0.030015938719193072, + "grad_norm": 1.939757227897644, + "learning_rate": 4.988897525361867e-05, + "loss": 5.526, + "step": 5047 + }, + { + "epoch": 0.03002188600247407, + "grad_norm": 2.2970168590545654, + "learning_rate": 4.9888931276822315e-05, + "loss": 5.6334, + "step": 5048 + }, + { + "epoch": 0.030027833285755066, + "grad_norm": 2.162632942199707, + "learning_rate": 4.988888729133749e-05, + "loss": 5.8887, + "step": 5049 + }, + { + "epoch": 0.030033780569036064, + "grad_norm": 2.027017831802368, + "learning_rate": 4.9888843297164223e-05, + "loss": 5.9237, + "step": 5050 + }, + { + "epoch": 0.030039727852317063, + "grad_norm": 1.9226456880569458, + "learning_rate": 4.988879929430251e-05, + "loss": 5.6833, + "step": 5051 + }, + { + "epoch": 0.030045675135598058, + "grad_norm": 1.6490615606307983, + "learning_rate": 4.9888755282752384e-05, + "loss": 5.5738, + "step": 5052 + }, + { + "epoch": 0.030051622418879056, + "grad_norm": 2.456385850906372, + "learning_rate": 4.9888711262513846e-05, + "loss": 5.3771, + "step": 5053 + }, + { + "epoch": 0.03005756970216005, + "grad_norm": 2.480044364929199, + "learning_rate": 4.988866723358692e-05, + "loss": 5.2456, + "step": 5054 + }, + { + "epoch": 0.03006351698544105, + "grad_norm": 2.4033162593841553, + "learning_rate": 4.988862319597161e-05, + "loss": 5.1629, + "step": 5055 + }, + { + "epoch": 0.03006946426872205, + "grad_norm": 2.7228541374206543, + "learning_rate": 4.9888579149667935e-05, + "loss": 5.0195, + "step": 5056 + }, + { + "epoch": 0.030075411552003044, + "grad_norm": 2.4641635417938232, + "learning_rate": 4.9888535094675926e-05, + "loss": 5.3259, + "step": 5057 + }, + { + "epoch": 0.030081358835284042, + "grad_norm": 2.443666458129883, + "learning_rate": 4.9888491030995575e-05, + "loss": 5.4212, + "step": 5058 + }, + { + "epoch": 0.03008730611856504, + "grad_norm": 2.3267531394958496, + "learning_rate": 4.988844695862692e-05, + "loss": 5.6517, + "step": 5059 + }, + { + "epoch": 0.030093253401846036, + "grad_norm": 1.9090640544891357, + "learning_rate": 4.988840287756996e-05, + "loss": 5.7946, + "step": 5060 + }, + { + "epoch": 0.030099200685127035, + "grad_norm": 1.6169202327728271, + "learning_rate": 4.988835878782472e-05, + "loss": 5.7332, + "step": 5061 + }, + { + "epoch": 0.03010514796840803, + "grad_norm": 1.9369432926177979, + "learning_rate": 4.9888314689391205e-05, + "loss": 5.5954, + "step": 5062 + }, + { + "epoch": 0.030111095251689028, + "grad_norm": 2.0444133281707764, + "learning_rate": 4.9888270582269434e-05, + "loss": 5.5332, + "step": 5063 + }, + { + "epoch": 0.030117042534970027, + "grad_norm": 1.949061632156372, + "learning_rate": 4.988822646645943e-05, + "loss": 5.6064, + "step": 5064 + }, + { + "epoch": 0.030122989818251022, + "grad_norm": 1.5208648443222046, + "learning_rate": 4.988818234196121e-05, + "loss": 5.6615, + "step": 5065 + }, + { + "epoch": 0.03012893710153202, + "grad_norm": 1.8466709852218628, + "learning_rate": 4.988813820877477e-05, + "loss": 5.79, + "step": 5066 + }, + { + "epoch": 0.03013488438481302, + "grad_norm": 1.7094037532806396, + "learning_rate": 4.988809406690015e-05, + "loss": 5.8194, + "step": 5067 + }, + { + "epoch": 0.030140831668094014, + "grad_norm": 1.5698916912078857, + "learning_rate": 4.988804991633734e-05, + "loss": 5.5981, + "step": 5068 + }, + { + "epoch": 0.030146778951375013, + "grad_norm": 2.032156467437744, + "learning_rate": 4.988800575708638e-05, + "loss": 5.6729, + "step": 5069 + }, + { + "epoch": 0.030152726234656008, + "grad_norm": 1.9716484546661377, + "learning_rate": 4.988796158914727e-05, + "loss": 5.5227, + "step": 5070 + }, + { + "epoch": 0.030158673517937006, + "grad_norm": 1.8809682130813599, + "learning_rate": 4.988791741252002e-05, + "loss": 5.6231, + "step": 5071 + }, + { + "epoch": 0.030164620801218005, + "grad_norm": 1.8293371200561523, + "learning_rate": 4.9887873227204675e-05, + "loss": 5.5067, + "step": 5072 + }, + { + "epoch": 0.030170568084499, + "grad_norm": 2.225281000137329, + "learning_rate": 4.988782903320122e-05, + "loss": 5.3056, + "step": 5073 + }, + { + "epoch": 0.03017651536778, + "grad_norm": 2.0776474475860596, + "learning_rate": 4.988778483050968e-05, + "loss": 5.206, + "step": 5074 + }, + { + "epoch": 0.030182462651060997, + "grad_norm": 2.068323850631714, + "learning_rate": 4.9887740619130076e-05, + "loss": 5.5975, + "step": 5075 + }, + { + "epoch": 0.030188409934341992, + "grad_norm": 2.077782392501831, + "learning_rate": 4.988769639906241e-05, + "loss": 5.6967, + "step": 5076 + }, + { + "epoch": 0.03019435721762299, + "grad_norm": 1.9837195873260498, + "learning_rate": 4.988765217030672e-05, + "loss": 5.7834, + "step": 5077 + }, + { + "epoch": 0.030200304500903986, + "grad_norm": 1.9612236022949219, + "learning_rate": 4.9887607932863e-05, + "loss": 5.5472, + "step": 5078 + }, + { + "epoch": 0.030206251784184984, + "grad_norm": 2.022251605987549, + "learning_rate": 4.988756368673127e-05, + "loss": 5.704, + "step": 5079 + }, + { + "epoch": 0.030212199067465983, + "grad_norm": 2.02227783203125, + "learning_rate": 4.988751943191156e-05, + "loss": 5.4125, + "step": 5080 + }, + { + "epoch": 0.030218146350746978, + "grad_norm": 2.0527732372283936, + "learning_rate": 4.9887475168403856e-05, + "loss": 5.464, + "step": 5081 + }, + { + "epoch": 0.030224093634027976, + "grad_norm": 2.1465423107147217, + "learning_rate": 4.9887430896208205e-05, + "loss": 5.3415, + "step": 5082 + }, + { + "epoch": 0.03023004091730897, + "grad_norm": 1.9170550107955933, + "learning_rate": 4.9887386615324606e-05, + "loss": 5.5762, + "step": 5083 + }, + { + "epoch": 0.03023598820058997, + "grad_norm": 3.367650032043457, + "learning_rate": 4.988734232575307e-05, + "loss": 6.26, + "step": 5084 + }, + { + "epoch": 0.03024193548387097, + "grad_norm": 2.0784621238708496, + "learning_rate": 4.988729802749363e-05, + "loss": 5.5316, + "step": 5085 + }, + { + "epoch": 0.030247882767151964, + "grad_norm": 1.9531089067459106, + "learning_rate": 4.988725372054629e-05, + "loss": 5.5901, + "step": 5086 + }, + { + "epoch": 0.030253830050432962, + "grad_norm": 1.9677239656448364, + "learning_rate": 4.988720940491106e-05, + "loss": 5.4963, + "step": 5087 + }, + { + "epoch": 0.03025977733371396, + "grad_norm": 1.9835426807403564, + "learning_rate": 4.988716508058797e-05, + "loss": 5.6355, + "step": 5088 + }, + { + "epoch": 0.030265724616994956, + "grad_norm": 1.908250331878662, + "learning_rate": 4.988712074757703e-05, + "loss": 5.165, + "step": 5089 + }, + { + "epoch": 0.030271671900275954, + "grad_norm": 1.9852073192596436, + "learning_rate": 4.9887076405878246e-05, + "loss": 5.6623, + "step": 5090 + }, + { + "epoch": 0.03027761918355695, + "grad_norm": 1.9073505401611328, + "learning_rate": 4.988703205549164e-05, + "loss": 5.6685, + "step": 5091 + }, + { + "epoch": 0.030283566466837948, + "grad_norm": 1.744931697845459, + "learning_rate": 4.988698769641724e-05, + "loss": 5.4004, + "step": 5092 + }, + { + "epoch": 0.030289513750118947, + "grad_norm": 2.0623345375061035, + "learning_rate": 4.9886943328655034e-05, + "loss": 5.3846, + "step": 5093 + }, + { + "epoch": 0.030295461033399942, + "grad_norm": 1.647375226020813, + "learning_rate": 4.9886898952205064e-05, + "loss": 5.5823, + "step": 5094 + }, + { + "epoch": 0.03030140831668094, + "grad_norm": 2.2364108562469482, + "learning_rate": 4.9886854567067334e-05, + "loss": 5.5959, + "step": 5095 + }, + { + "epoch": 0.03030735559996194, + "grad_norm": 2.059187650680542, + "learning_rate": 4.988681017324185e-05, + "loss": 5.6043, + "step": 5096 + }, + { + "epoch": 0.030313302883242934, + "grad_norm": 1.8996437788009644, + "learning_rate": 4.988676577072865e-05, + "loss": 5.4366, + "step": 5097 + }, + { + "epoch": 0.030319250166523933, + "grad_norm": 2.0983266830444336, + "learning_rate": 4.988672135952773e-05, + "loss": 5.5568, + "step": 5098 + }, + { + "epoch": 0.030325197449804928, + "grad_norm": 2.065119743347168, + "learning_rate": 4.988667693963911e-05, + "loss": 5.4239, + "step": 5099 + }, + { + "epoch": 0.030331144733085926, + "grad_norm": 1.9394044876098633, + "learning_rate": 4.988663251106282e-05, + "loss": 5.573, + "step": 5100 + }, + { + "epoch": 0.030337092016366925, + "grad_norm": 2.225097417831421, + "learning_rate": 4.9886588073798855e-05, + "loss": 5.5877, + "step": 5101 + }, + { + "epoch": 0.03034303929964792, + "grad_norm": 2.185018539428711, + "learning_rate": 4.9886543627847236e-05, + "loss": 5.6884, + "step": 5102 + }, + { + "epoch": 0.03034898658292892, + "grad_norm": 1.9751871824264526, + "learning_rate": 4.988649917320799e-05, + "loss": 5.4836, + "step": 5103 + }, + { + "epoch": 0.030354933866209917, + "grad_norm": 1.8753101825714111, + "learning_rate": 4.988645470988113e-05, + "loss": 5.4049, + "step": 5104 + }, + { + "epoch": 0.030360881149490912, + "grad_norm": 2.12246036529541, + "learning_rate": 4.988641023786665e-05, + "loss": 5.5365, + "step": 5105 + }, + { + "epoch": 0.03036682843277191, + "grad_norm": 2.1078991889953613, + "learning_rate": 4.988636575716459e-05, + "loss": 5.5269, + "step": 5106 + }, + { + "epoch": 0.030372775716052906, + "grad_norm": 1.9127923250198364, + "learning_rate": 4.9886321267774946e-05, + "loss": 5.48, + "step": 5107 + }, + { + "epoch": 0.030378722999333904, + "grad_norm": 1.8971906900405884, + "learning_rate": 4.988627676969776e-05, + "loss": 5.5202, + "step": 5108 + }, + { + "epoch": 0.030384670282614903, + "grad_norm": 2.162097454071045, + "learning_rate": 4.9886232262933024e-05, + "loss": 5.5229, + "step": 5109 + }, + { + "epoch": 0.030390617565895898, + "grad_norm": 2.21211838722229, + "learning_rate": 4.988618774748076e-05, + "loss": 5.3648, + "step": 5110 + }, + { + "epoch": 0.030396564849176896, + "grad_norm": 1.8907619714736938, + "learning_rate": 4.988614322334099e-05, + "loss": 5.4338, + "step": 5111 + }, + { + "epoch": 0.030402512132457895, + "grad_norm": 2.0131993293762207, + "learning_rate": 4.9886098690513725e-05, + "loss": 5.4005, + "step": 5112 + }, + { + "epoch": 0.03040845941573889, + "grad_norm": 1.9474748373031616, + "learning_rate": 4.9886054148998975e-05, + "loss": 5.5544, + "step": 5113 + }, + { + "epoch": 0.03041440669901989, + "grad_norm": 1.9809894561767578, + "learning_rate": 4.988600959879676e-05, + "loss": 5.6204, + "step": 5114 + }, + { + "epoch": 0.030420353982300884, + "grad_norm": 2.1792514324188232, + "learning_rate": 4.9885965039907104e-05, + "loss": 5.5368, + "step": 5115 + }, + { + "epoch": 0.030426301265581882, + "grad_norm": 2.050903081893921, + "learning_rate": 4.9885920472330004e-05, + "loss": 5.4717, + "step": 5116 + }, + { + "epoch": 0.03043224854886288, + "grad_norm": 1.9938042163848877, + "learning_rate": 4.988587589606549e-05, + "loss": 5.5373, + "step": 5117 + }, + { + "epoch": 0.030438195832143876, + "grad_norm": 1.7375110387802124, + "learning_rate": 4.988583131111358e-05, + "loss": 5.5621, + "step": 5118 + }, + { + "epoch": 0.030444143115424874, + "grad_norm": 2.077605962753296, + "learning_rate": 4.988578671747428e-05, + "loss": 5.5451, + "step": 5119 + }, + { + "epoch": 0.03045009039870587, + "grad_norm": 2.071706771850586, + "learning_rate": 4.988574211514761e-05, + "loss": 5.327, + "step": 5120 + }, + { + "epoch": 0.030456037681986868, + "grad_norm": 1.8317911624908447, + "learning_rate": 4.9885697504133574e-05, + "loss": 5.4123, + "step": 5121 + }, + { + "epoch": 0.030461984965267867, + "grad_norm": 2.1231188774108887, + "learning_rate": 4.988565288443221e-05, + "loss": 5.3789, + "step": 5122 + }, + { + "epoch": 0.03046793224854886, + "grad_norm": 2.1298999786376953, + "learning_rate": 4.988560825604352e-05, + "loss": 5.4382, + "step": 5123 + }, + { + "epoch": 0.03047387953182986, + "grad_norm": 1.791053056716919, + "learning_rate": 4.9885563618967525e-05, + "loss": 5.3918, + "step": 5124 + }, + { + "epoch": 0.03047982681511086, + "grad_norm": 1.9610999822616577, + "learning_rate": 4.988551897320423e-05, + "loss": 5.3232, + "step": 5125 + }, + { + "epoch": 0.030485774098391854, + "grad_norm": 1.9926520586013794, + "learning_rate": 4.9885474318753654e-05, + "loss": 5.4316, + "step": 5126 + }, + { + "epoch": 0.030491721381672852, + "grad_norm": 1.8942431211471558, + "learning_rate": 4.988542965561582e-05, + "loss": 5.4055, + "step": 5127 + }, + { + "epoch": 0.030497668664953848, + "grad_norm": 1.7872856855392456, + "learning_rate": 4.988538498379074e-05, + "loss": 5.5117, + "step": 5128 + }, + { + "epoch": 0.030503615948234846, + "grad_norm": 2.040205478668213, + "learning_rate": 4.988534030327843e-05, + "loss": 5.4068, + "step": 5129 + }, + { + "epoch": 0.030509563231515845, + "grad_norm": 2.0108931064605713, + "learning_rate": 4.988529561407891e-05, + "loss": 5.3636, + "step": 5130 + }, + { + "epoch": 0.03051551051479684, + "grad_norm": 2.0339555740356445, + "learning_rate": 4.988525091619218e-05, + "loss": 5.2811, + "step": 5131 + }, + { + "epoch": 0.03052145779807784, + "grad_norm": 1.7631195783615112, + "learning_rate": 4.988520620961828e-05, + "loss": 5.3407, + "step": 5132 + }, + { + "epoch": 0.030527405081358837, + "grad_norm": 1.6906533241271973, + "learning_rate": 4.988516149435719e-05, + "loss": 5.3121, + "step": 5133 + }, + { + "epoch": 0.030533352364639832, + "grad_norm": 2.0753448009490967, + "learning_rate": 4.988511677040897e-05, + "loss": 5.4532, + "step": 5134 + }, + { + "epoch": 0.03053929964792083, + "grad_norm": 1.9836634397506714, + "learning_rate": 4.9885072037773595e-05, + "loss": 5.4345, + "step": 5135 + }, + { + "epoch": 0.030545246931201826, + "grad_norm": 1.8526780605316162, + "learning_rate": 4.988502729645111e-05, + "loss": 5.446, + "step": 5136 + }, + { + "epoch": 0.030551194214482824, + "grad_norm": 2.126626968383789, + "learning_rate": 4.988498254644152e-05, + "loss": 5.703, + "step": 5137 + }, + { + "epoch": 0.030557141497763823, + "grad_norm": 1.9711220264434814, + "learning_rate": 4.988493778774483e-05, + "loss": 5.5872, + "step": 5138 + }, + { + "epoch": 0.030563088781044818, + "grad_norm": 2.070727586746216, + "learning_rate": 4.988489302036107e-05, + "loss": 5.4407, + "step": 5139 + }, + { + "epoch": 0.030569036064325816, + "grad_norm": 2.1414859294891357, + "learning_rate": 4.988484824429025e-05, + "loss": 5.5291, + "step": 5140 + }, + { + "epoch": 0.030574983347606815, + "grad_norm": 2.01366925239563, + "learning_rate": 4.9884803459532384e-05, + "loss": 5.3561, + "step": 5141 + }, + { + "epoch": 0.03058093063088781, + "grad_norm": 1.851836085319519, + "learning_rate": 4.988475866608749e-05, + "loss": 5.679, + "step": 5142 + }, + { + "epoch": 0.03058687791416881, + "grad_norm": 1.6984909772872925, + "learning_rate": 4.988471386395559e-05, + "loss": 5.6075, + "step": 5143 + }, + { + "epoch": 0.030592825197449804, + "grad_norm": 1.9371756315231323, + "learning_rate": 4.9884669053136696e-05, + "loss": 5.7062, + "step": 5144 + }, + { + "epoch": 0.030598772480730802, + "grad_norm": 1.9286617040634155, + "learning_rate": 4.9884624233630815e-05, + "loss": 5.573, + "step": 5145 + }, + { + "epoch": 0.0306047197640118, + "grad_norm": 2.7633650302886963, + "learning_rate": 4.988457940543797e-05, + "loss": 6.2082, + "step": 5146 + }, + { + "epoch": 0.030610667047292796, + "grad_norm": 2.6948676109313965, + "learning_rate": 4.9884534568558173e-05, + "loss": 5.7475, + "step": 5147 + }, + { + "epoch": 0.030616614330573794, + "grad_norm": 2.1618316173553467, + "learning_rate": 4.988448972299145e-05, + "loss": 5.4049, + "step": 5148 + }, + { + "epoch": 0.03062256161385479, + "grad_norm": 2.417043685913086, + "learning_rate": 4.98844448687378e-05, + "loss": 5.3663, + "step": 5149 + }, + { + "epoch": 0.030628508897135788, + "grad_norm": 1.9748867750167847, + "learning_rate": 4.988440000579725e-05, + "loss": 5.1876, + "step": 5150 + }, + { + "epoch": 0.030634456180416787, + "grad_norm": 2.0534770488739014, + "learning_rate": 4.988435513416981e-05, + "loss": 5.4519, + "step": 5151 + }, + { + "epoch": 0.03064040346369778, + "grad_norm": 1.9772714376449585, + "learning_rate": 4.98843102538555e-05, + "loss": 5.5241, + "step": 5152 + }, + { + "epoch": 0.03064635074697878, + "grad_norm": 2.4160993099212646, + "learning_rate": 4.988426536485434e-05, + "loss": 5.6535, + "step": 5153 + }, + { + "epoch": 0.03065229803025978, + "grad_norm": 1.9931175708770752, + "learning_rate": 4.9884220467166345e-05, + "loss": 5.6693, + "step": 5154 + }, + { + "epoch": 0.030658245313540774, + "grad_norm": 1.9071956872940063, + "learning_rate": 4.9884175560791516e-05, + "loss": 5.5533, + "step": 5155 + }, + { + "epoch": 0.030664192596821772, + "grad_norm": 1.8562983274459839, + "learning_rate": 4.9884130645729876e-05, + "loss": 5.5621, + "step": 5156 + }, + { + "epoch": 0.030670139880102767, + "grad_norm": 2.087606430053711, + "learning_rate": 4.9884085721981446e-05, + "loss": 5.5256, + "step": 5157 + }, + { + "epoch": 0.030676087163383766, + "grad_norm": 2.3242955207824707, + "learning_rate": 4.988404078954624e-05, + "loss": 5.3906, + "step": 5158 + }, + { + "epoch": 0.030682034446664765, + "grad_norm": 2.221330404281616, + "learning_rate": 4.988399584842427e-05, + "loss": 5.5719, + "step": 5159 + }, + { + "epoch": 0.03068798172994576, + "grad_norm": 1.7819960117340088, + "learning_rate": 4.988395089861556e-05, + "loss": 5.5823, + "step": 5160 + }, + { + "epoch": 0.030693929013226758, + "grad_norm": 1.781802773475647, + "learning_rate": 4.988390594012011e-05, + "loss": 5.6087, + "step": 5161 + }, + { + "epoch": 0.030699876296507757, + "grad_norm": 2.0003581047058105, + "learning_rate": 4.988386097293796e-05, + "loss": 5.5695, + "step": 5162 + }, + { + "epoch": 0.030705823579788752, + "grad_norm": 1.9411736726760864, + "learning_rate": 4.98838159970691e-05, + "loss": 5.441, + "step": 5163 + }, + { + "epoch": 0.03071177086306975, + "grad_norm": 2.159541368484497, + "learning_rate": 4.9883771012513556e-05, + "loss": 5.6191, + "step": 5164 + }, + { + "epoch": 0.030717718146350746, + "grad_norm": 2.1045689582824707, + "learning_rate": 4.988372601927135e-05, + "loss": 5.3261, + "step": 5165 + }, + { + "epoch": 0.030723665429631744, + "grad_norm": 2.004770040512085, + "learning_rate": 4.988368101734249e-05, + "loss": 5.3392, + "step": 5166 + }, + { + "epoch": 0.030729612712912743, + "grad_norm": 2.1851232051849365, + "learning_rate": 4.9883636006726996e-05, + "loss": 5.3048, + "step": 5167 + }, + { + "epoch": 0.030735559996193738, + "grad_norm": 2.1333882808685303, + "learning_rate": 4.988359098742488e-05, + "loss": 5.336, + "step": 5168 + }, + { + "epoch": 0.030741507279474736, + "grad_norm": 2.1911604404449463, + "learning_rate": 4.9883545959436165e-05, + "loss": 5.757, + "step": 5169 + }, + { + "epoch": 0.030747454562755735, + "grad_norm": 2.0385994911193848, + "learning_rate": 4.988350092276085e-05, + "loss": 5.7889, + "step": 5170 + }, + { + "epoch": 0.03075340184603673, + "grad_norm": 2.2300381660461426, + "learning_rate": 4.988345587739897e-05, + "loss": 5.3812, + "step": 5171 + }, + { + "epoch": 0.03075934912931773, + "grad_norm": 2.4643938541412354, + "learning_rate": 4.988341082335053e-05, + "loss": 5.2503, + "step": 5172 + }, + { + "epoch": 0.030765296412598724, + "grad_norm": 2.0791194438934326, + "learning_rate": 4.988336576061555e-05, + "loss": 5.2958, + "step": 5173 + }, + { + "epoch": 0.030771243695879722, + "grad_norm": 2.1123111248016357, + "learning_rate": 4.988332068919405e-05, + "loss": 5.3656, + "step": 5174 + }, + { + "epoch": 0.03077719097916072, + "grad_norm": 2.199747323989868, + "learning_rate": 4.9883275609086026e-05, + "loss": 5.7015, + "step": 5175 + }, + { + "epoch": 0.030783138262441716, + "grad_norm": 2.0083510875701904, + "learning_rate": 4.988323052029151e-05, + "loss": 5.7068, + "step": 5176 + }, + { + "epoch": 0.030789085545722714, + "grad_norm": 2.1027777194976807, + "learning_rate": 4.988318542281053e-05, + "loss": 5.6986, + "step": 5177 + }, + { + "epoch": 0.03079503282900371, + "grad_norm": 1.8593190908432007, + "learning_rate": 4.9883140316643074e-05, + "loss": 5.7194, + "step": 5178 + }, + { + "epoch": 0.030800980112284708, + "grad_norm": 1.9712544679641724, + "learning_rate": 4.988309520178918e-05, + "loss": 5.6472, + "step": 5179 + }, + { + "epoch": 0.030806927395565707, + "grad_norm": 2.1114501953125, + "learning_rate": 4.9883050078248836e-05, + "loss": 5.6767, + "step": 5180 + }, + { + "epoch": 0.0308128746788467, + "grad_norm": 3.0505895614624023, + "learning_rate": 4.988300494602209e-05, + "loss": 5.3705, + "step": 5181 + }, + { + "epoch": 0.0308188219621277, + "grad_norm": 2.648364782333374, + "learning_rate": 4.988295980510895e-05, + "loss": 5.3072, + "step": 5182 + }, + { + "epoch": 0.0308247692454087, + "grad_norm": 2.2162837982177734, + "learning_rate": 4.9882914655509414e-05, + "loss": 5.3359, + "step": 5183 + }, + { + "epoch": 0.030830716528689694, + "grad_norm": 2.16666316986084, + "learning_rate": 4.988286949722352e-05, + "loss": 5.3446, + "step": 5184 + }, + { + "epoch": 0.030836663811970692, + "grad_norm": 2.951157569885254, + "learning_rate": 4.988282433025126e-05, + "loss": 5.7776, + "step": 5185 + }, + { + "epoch": 0.030842611095251687, + "grad_norm": 2.9967124462127686, + "learning_rate": 4.988277915459267e-05, + "loss": 5.6004, + "step": 5186 + }, + { + "epoch": 0.030848558378532686, + "grad_norm": 2.3998372554779053, + "learning_rate": 4.988273397024777e-05, + "loss": 5.3562, + "step": 5187 + }, + { + "epoch": 0.030854505661813685, + "grad_norm": 2.290592670440674, + "learning_rate": 4.9882688777216544e-05, + "loss": 5.3211, + "step": 5188 + }, + { + "epoch": 0.03086045294509468, + "grad_norm": 2.0349433422088623, + "learning_rate": 4.988264357549904e-05, + "loss": 5.2917, + "step": 5189 + }, + { + "epoch": 0.030866400228375678, + "grad_norm": 1.922006607055664, + "learning_rate": 4.988259836509526e-05, + "loss": 5.2297, + "step": 5190 + }, + { + "epoch": 0.030872347511656677, + "grad_norm": 1.9518259763717651, + "learning_rate": 4.9882553146005225e-05, + "loss": 5.2232, + "step": 5191 + }, + { + "epoch": 0.030878294794937672, + "grad_norm": 2.1054210662841797, + "learning_rate": 4.988250791822894e-05, + "loss": 5.3705, + "step": 5192 + }, + { + "epoch": 0.03088424207821867, + "grad_norm": 2.0954079627990723, + "learning_rate": 4.988246268176644e-05, + "loss": 5.2522, + "step": 5193 + }, + { + "epoch": 0.030890189361499665, + "grad_norm": 1.8628660440444946, + "learning_rate": 4.9882417436617724e-05, + "loss": 5.3856, + "step": 5194 + }, + { + "epoch": 0.030896136644780664, + "grad_norm": 2.2788021564483643, + "learning_rate": 4.988237218278281e-05, + "loss": 5.4399, + "step": 5195 + }, + { + "epoch": 0.030902083928061663, + "grad_norm": 1.981086015701294, + "learning_rate": 4.9882326920261717e-05, + "loss": 5.2853, + "step": 5196 + }, + { + "epoch": 0.030908031211342658, + "grad_norm": 1.9278241395950317, + "learning_rate": 4.988228164905446e-05, + "loss": 5.3997, + "step": 5197 + }, + { + "epoch": 0.030913978494623656, + "grad_norm": 1.842748999595642, + "learning_rate": 4.988223636916106e-05, + "loss": 5.3215, + "step": 5198 + }, + { + "epoch": 0.030919925777904655, + "grad_norm": 1.9974339008331299, + "learning_rate": 4.988219108058153e-05, + "loss": 5.4851, + "step": 5199 + }, + { + "epoch": 0.03092587306118565, + "grad_norm": 2.015939474105835, + "learning_rate": 4.988214578331588e-05, + "loss": 5.322, + "step": 5200 + }, + { + "epoch": 0.03093182034446665, + "grad_norm": 2.035209894180298, + "learning_rate": 4.9882100477364135e-05, + "loss": 5.3896, + "step": 5201 + }, + { + "epoch": 0.030937767627747643, + "grad_norm": 1.9803009033203125, + "learning_rate": 4.9882055162726296e-05, + "loss": 5.2624, + "step": 5202 + }, + { + "epoch": 0.030943714911028642, + "grad_norm": 1.9504352807998657, + "learning_rate": 4.98820098394024e-05, + "loss": 5.2333, + "step": 5203 + }, + { + "epoch": 0.03094966219430964, + "grad_norm": 1.850542664527893, + "learning_rate": 4.9881964507392443e-05, + "loss": 5.5632, + "step": 5204 + }, + { + "epoch": 0.030955609477590636, + "grad_norm": 1.8594067096710205, + "learning_rate": 4.9881919166696456e-05, + "loss": 5.3775, + "step": 5205 + }, + { + "epoch": 0.030961556760871634, + "grad_norm": 2.019274950027466, + "learning_rate": 4.988187381731444e-05, + "loss": 5.4565, + "step": 5206 + }, + { + "epoch": 0.030967504044152633, + "grad_norm": 1.7151249647140503, + "learning_rate": 4.988182845924643e-05, + "loss": 5.5984, + "step": 5207 + }, + { + "epoch": 0.030973451327433628, + "grad_norm": 2.5127339363098145, + "learning_rate": 4.988178309249242e-05, + "loss": 6.2724, + "step": 5208 + }, + { + "epoch": 0.030979398610714626, + "grad_norm": 1.869344711303711, + "learning_rate": 4.9881737717052436e-05, + "loss": 5.5408, + "step": 5209 + }, + { + "epoch": 0.03098534589399562, + "grad_norm": 2.035419225692749, + "learning_rate": 4.98816923329265e-05, + "loss": 5.4154, + "step": 5210 + }, + { + "epoch": 0.03099129317727662, + "grad_norm": 1.7084250450134277, + "learning_rate": 4.9881646940114624e-05, + "loss": 5.6327, + "step": 5211 + }, + { + "epoch": 0.03099724046055762, + "grad_norm": 2.1035211086273193, + "learning_rate": 4.9881601538616816e-05, + "loss": 5.5041, + "step": 5212 + }, + { + "epoch": 0.031003187743838614, + "grad_norm": 1.920366883277893, + "learning_rate": 4.9881556128433105e-05, + "loss": 5.5919, + "step": 5213 + }, + { + "epoch": 0.031009135027119612, + "grad_norm": 2.000555992126465, + "learning_rate": 4.988151070956349e-05, + "loss": 5.5078, + "step": 5214 + }, + { + "epoch": 0.031015082310400607, + "grad_norm": 1.9930146932601929, + "learning_rate": 4.9881465282008e-05, + "loss": 5.5002, + "step": 5215 + }, + { + "epoch": 0.031021029593681606, + "grad_norm": 2.163329839706421, + "learning_rate": 4.988141984576665e-05, + "loss": 5.3504, + "step": 5216 + }, + { + "epoch": 0.031026976876962604, + "grad_norm": 1.766228437423706, + "learning_rate": 4.988137440083946e-05, + "loss": 5.5304, + "step": 5217 + }, + { + "epoch": 0.0310329241602436, + "grad_norm": 2.1399648189544678, + "learning_rate": 4.988132894722644e-05, + "loss": 5.4757, + "step": 5218 + }, + { + "epoch": 0.031038871443524598, + "grad_norm": 2.2287001609802246, + "learning_rate": 4.988128348492759e-05, + "loss": 5.4902, + "step": 5219 + }, + { + "epoch": 0.031044818726805597, + "grad_norm": 2.095080852508545, + "learning_rate": 4.988123801394295e-05, + "loss": 5.3462, + "step": 5220 + }, + { + "epoch": 0.031050766010086592, + "grad_norm": 2.0873003005981445, + "learning_rate": 4.988119253427253e-05, + "loss": 5.2825, + "step": 5221 + }, + { + "epoch": 0.03105671329336759, + "grad_norm": 2.0918655395507812, + "learning_rate": 4.988114704591633e-05, + "loss": 5.2859, + "step": 5222 + }, + { + "epoch": 0.031062660576648585, + "grad_norm": 1.9637762308120728, + "learning_rate": 4.9881101548874384e-05, + "loss": 5.4687, + "step": 5223 + }, + { + "epoch": 0.031068607859929584, + "grad_norm": 2.046672821044922, + "learning_rate": 4.988105604314671e-05, + "loss": 5.5095, + "step": 5224 + }, + { + "epoch": 0.031074555143210583, + "grad_norm": 2.0264053344726562, + "learning_rate": 4.988101052873332e-05, + "loss": 5.4221, + "step": 5225 + }, + { + "epoch": 0.031080502426491578, + "grad_norm": 1.9367676973342896, + "learning_rate": 4.9880965005634216e-05, + "loss": 5.1881, + "step": 5226 + }, + { + "epoch": 0.031086449709772576, + "grad_norm": 2.0398001670837402, + "learning_rate": 4.9880919473849425e-05, + "loss": 5.4938, + "step": 5227 + }, + { + "epoch": 0.031092396993053575, + "grad_norm": 2.037411689758301, + "learning_rate": 4.988087393337896e-05, + "loss": 5.0893, + "step": 5228 + }, + { + "epoch": 0.03109834427633457, + "grad_norm": 2.1337075233459473, + "learning_rate": 4.988082838422285e-05, + "loss": 4.9822, + "step": 5229 + }, + { + "epoch": 0.03110429155961557, + "grad_norm": 1.9911794662475586, + "learning_rate": 4.988078282638109e-05, + "loss": 5.2472, + "step": 5230 + }, + { + "epoch": 0.031110238842896563, + "grad_norm": 2.1050829887390137, + "learning_rate": 4.98807372598537e-05, + "loss": 5.3478, + "step": 5231 + }, + { + "epoch": 0.031116186126177562, + "grad_norm": 1.9364343881607056, + "learning_rate": 4.988069168464071e-05, + "loss": 5.2551, + "step": 5232 + }, + { + "epoch": 0.03112213340945856, + "grad_norm": 1.9834885597229004, + "learning_rate": 4.988064610074213e-05, + "loss": 5.2147, + "step": 5233 + }, + { + "epoch": 0.031128080692739556, + "grad_norm": 2.0815906524658203, + "learning_rate": 4.9880600508157974e-05, + "loss": 5.1607, + "step": 5234 + }, + { + "epoch": 0.031134027976020554, + "grad_norm": 1.9558357000350952, + "learning_rate": 4.988055490688825e-05, + "loss": 5.4, + "step": 5235 + }, + { + "epoch": 0.031139975259301553, + "grad_norm": 1.9036076068878174, + "learning_rate": 4.9880509296932986e-05, + "loss": 5.4953, + "step": 5236 + }, + { + "epoch": 0.031145922542582548, + "grad_norm": 2.4709548950195312, + "learning_rate": 4.98804636782922e-05, + "loss": 5.2628, + "step": 5237 + }, + { + "epoch": 0.031151869825863546, + "grad_norm": 2.2380030155181885, + "learning_rate": 4.988041805096589e-05, + "loss": 5.2423, + "step": 5238 + }, + { + "epoch": 0.03115781710914454, + "grad_norm": 2.348639726638794, + "learning_rate": 4.988037241495409e-05, + "loss": 5.1966, + "step": 5239 + }, + { + "epoch": 0.03116376439242554, + "grad_norm": 1.9384468793869019, + "learning_rate": 4.9880326770256805e-05, + "loss": 5.47, + "step": 5240 + }, + { + "epoch": 0.03116971167570654, + "grad_norm": 2.2664244174957275, + "learning_rate": 4.988028111687406e-05, + "loss": 5.5511, + "step": 5241 + }, + { + "epoch": 0.031175658958987534, + "grad_norm": 2.1356422901153564, + "learning_rate": 4.988023545480586e-05, + "loss": 5.6462, + "step": 5242 + }, + { + "epoch": 0.031181606242268532, + "grad_norm": 2.240190267562866, + "learning_rate": 4.9880189784052226e-05, + "loss": 5.3494, + "step": 5243 + }, + { + "epoch": 0.031187553525549527, + "grad_norm": 1.8032485246658325, + "learning_rate": 4.988014410461318e-05, + "loss": 5.2305, + "step": 5244 + }, + { + "epoch": 0.031193500808830526, + "grad_norm": 2.177501678466797, + "learning_rate": 4.988009841648873e-05, + "loss": 5.1891, + "step": 5245 + }, + { + "epoch": 0.031199448092111524, + "grad_norm": 2.157317876815796, + "learning_rate": 4.988005271967889e-05, + "loss": 5.1038, + "step": 5246 + }, + { + "epoch": 0.03120539537539252, + "grad_norm": 1.9995821714401245, + "learning_rate": 4.988000701418369e-05, + "loss": 5.1098, + "step": 5247 + }, + { + "epoch": 0.031211342658673518, + "grad_norm": 2.201558828353882, + "learning_rate": 4.987996130000313e-05, + "loss": 5.0702, + "step": 5248 + }, + { + "epoch": 0.031217289941954517, + "grad_norm": 2.065645933151245, + "learning_rate": 4.987991557713724e-05, + "loss": 5.2012, + "step": 5249 + }, + { + "epoch": 0.03122323722523551, + "grad_norm": 1.908347487449646, + "learning_rate": 4.9879869845586024e-05, + "loss": 5.0913, + "step": 5250 + }, + { + "epoch": 0.03122918450851651, + "grad_norm": 1.913979411125183, + "learning_rate": 4.98798241053495e-05, + "loss": 5.0036, + "step": 5251 + }, + { + "epoch": 0.031235131791797505, + "grad_norm": 2.217616558074951, + "learning_rate": 4.9879778356427686e-05, + "loss": 5.0621, + "step": 5252 + }, + { + "epoch": 0.031241079075078504, + "grad_norm": 2.419713258743286, + "learning_rate": 4.9879732598820605e-05, + "loss": 5.1264, + "step": 5253 + }, + { + "epoch": 0.031247026358359502, + "grad_norm": 2.298295497894287, + "learning_rate": 4.987968683252826e-05, + "loss": 5.0576, + "step": 5254 + }, + { + "epoch": 0.0312529736416405, + "grad_norm": 2.120589256286621, + "learning_rate": 4.987964105755067e-05, + "loss": 5.175, + "step": 5255 + }, + { + "epoch": 0.031258920924921496, + "grad_norm": 2.3129806518554688, + "learning_rate": 4.987959527388787e-05, + "loss": 5.1827, + "step": 5256 + }, + { + "epoch": 0.03126486820820249, + "grad_norm": 2.251680612564087, + "learning_rate": 4.9879549481539846e-05, + "loss": 5.0473, + "step": 5257 + }, + { + "epoch": 0.03127081549148349, + "grad_norm": 2.101229429244995, + "learning_rate": 4.987950368050663e-05, + "loss": 5.0453, + "step": 5258 + }, + { + "epoch": 0.03127676277476449, + "grad_norm": 2.189565420150757, + "learning_rate": 4.987945787078824e-05, + "loss": 5.087, + "step": 5259 + }, + { + "epoch": 0.03128271005804548, + "grad_norm": 2.05485463142395, + "learning_rate": 4.9879412052384687e-05, + "loss": 5.0192, + "step": 5260 + }, + { + "epoch": 0.031288657341326485, + "grad_norm": 1.8166489601135254, + "learning_rate": 4.9879366225295994e-05, + "loss": 5.0456, + "step": 5261 + }, + { + "epoch": 0.03129460462460748, + "grad_norm": 2.1403279304504395, + "learning_rate": 4.9879320389522165e-05, + "loss": 4.9455, + "step": 5262 + }, + { + "epoch": 0.031300551907888476, + "grad_norm": 1.8833802938461304, + "learning_rate": 4.9879274545063226e-05, + "loss": 5.0891, + "step": 5263 + }, + { + "epoch": 0.03130649919116947, + "grad_norm": 2.000692367553711, + "learning_rate": 4.987922869191918e-05, + "loss": 5.1125, + "step": 5264 + }, + { + "epoch": 0.03131244647445047, + "grad_norm": 1.947544813156128, + "learning_rate": 4.9879182830090065e-05, + "loss": 4.9139, + "step": 5265 + }, + { + "epoch": 0.03131839375773147, + "grad_norm": 1.8827823400497437, + "learning_rate": 4.987913695957588e-05, + "loss": 5.0154, + "step": 5266 + }, + { + "epoch": 0.03132434104101246, + "grad_norm": 2.268115997314453, + "learning_rate": 4.987909108037664e-05, + "loss": 5.0379, + "step": 5267 + }, + { + "epoch": 0.031330288324293465, + "grad_norm": 1.85139000415802, + "learning_rate": 4.987904519249237e-05, + "loss": 4.9428, + "step": 5268 + }, + { + "epoch": 0.03133623560757446, + "grad_norm": 2.208338737487793, + "learning_rate": 4.987899929592308e-05, + "loss": 4.9366, + "step": 5269 + }, + { + "epoch": 0.031342182890855455, + "grad_norm": 3.5571236610412598, + "learning_rate": 4.987895339066879e-05, + "loss": 6.8471, + "step": 5270 + }, + { + "epoch": 0.03134813017413646, + "grad_norm": 2.000157594680786, + "learning_rate": 4.9878907476729516e-05, + "loss": 5.025, + "step": 5271 + }, + { + "epoch": 0.03135407745741745, + "grad_norm": 2.0588366985321045, + "learning_rate": 4.987886155410527e-05, + "loss": 4.8955, + "step": 5272 + }, + { + "epoch": 0.03136002474069845, + "grad_norm": 2.217839241027832, + "learning_rate": 4.9878815622796074e-05, + "loss": 4.9889, + "step": 5273 + }, + { + "epoch": 0.03136597202397945, + "grad_norm": 2.2453126907348633, + "learning_rate": 4.987876968280194e-05, + "loss": 5.3774, + "step": 5274 + }, + { + "epoch": 0.031371919307260444, + "grad_norm": 1.9839471578598022, + "learning_rate": 4.9878723734122876e-05, + "loss": 4.993, + "step": 5275 + }, + { + "epoch": 0.03137786659054144, + "grad_norm": 1.9534602165222168, + "learning_rate": 4.987867777675892e-05, + "loss": 4.9079, + "step": 5276 + }, + { + "epoch": 0.031383813873822435, + "grad_norm": 1.96163809299469, + "learning_rate": 4.9878631810710066e-05, + "loss": 4.9829, + "step": 5277 + }, + { + "epoch": 0.03138976115710344, + "grad_norm": 2.0814366340637207, + "learning_rate": 4.987858583597634e-05, + "loss": 4.8731, + "step": 5278 + }, + { + "epoch": 0.03139570844038443, + "grad_norm": 1.9846211671829224, + "learning_rate": 4.987853985255776e-05, + "loss": 4.9495, + "step": 5279 + }, + { + "epoch": 0.03140165572366543, + "grad_norm": 2.1237289905548096, + "learning_rate": 4.9878493860454335e-05, + "loss": 5.3887, + "step": 5280 + }, + { + "epoch": 0.03140760300694643, + "grad_norm": 2.1526784896850586, + "learning_rate": 4.9878447859666086e-05, + "loss": 5.3603, + "step": 5281 + }, + { + "epoch": 0.031413550290227424, + "grad_norm": 2.0563082695007324, + "learning_rate": 4.987840185019303e-05, + "loss": 5.4104, + "step": 5282 + }, + { + "epoch": 0.03141949757350842, + "grad_norm": 2.0586647987365723, + "learning_rate": 4.9878355832035175e-05, + "loss": 5.517, + "step": 5283 + }, + { + "epoch": 0.03142544485678942, + "grad_norm": 1.8817695379257202, + "learning_rate": 4.9878309805192546e-05, + "loss": 5.3616, + "step": 5284 + }, + { + "epoch": 0.031431392140070416, + "grad_norm": 2.0987086296081543, + "learning_rate": 4.987826376966516e-05, + "loss": 5.3237, + "step": 5285 + }, + { + "epoch": 0.03143733942335141, + "grad_norm": 2.3505301475524902, + "learning_rate": 4.987821772545302e-05, + "loss": 5.5165, + "step": 5286 + }, + { + "epoch": 0.03144328670663241, + "grad_norm": 2.1199939250946045, + "learning_rate": 4.987817167255616e-05, + "loss": 5.3029, + "step": 5287 + }, + { + "epoch": 0.03144923398991341, + "grad_norm": 1.7463518381118774, + "learning_rate": 4.987812561097458e-05, + "loss": 5.3589, + "step": 5288 + }, + { + "epoch": 0.0314551812731944, + "grad_norm": 1.9957356452941895, + "learning_rate": 4.987807954070831e-05, + "loss": 5.2459, + "step": 5289 + }, + { + "epoch": 0.031461128556475405, + "grad_norm": 1.7865337133407593, + "learning_rate": 4.987803346175736e-05, + "loss": 5.3041, + "step": 5290 + }, + { + "epoch": 0.0314670758397564, + "grad_norm": 1.82949960231781, + "learning_rate": 4.9877987374121744e-05, + "loss": 5.5761, + "step": 5291 + }, + { + "epoch": 0.031473023123037396, + "grad_norm": 1.974692940711975, + "learning_rate": 4.9877941277801475e-05, + "loss": 5.5033, + "step": 5292 + }, + { + "epoch": 0.03147897040631839, + "grad_norm": 2.1808922290802, + "learning_rate": 4.9877895172796577e-05, + "loss": 5.6739, + "step": 5293 + }, + { + "epoch": 0.03148491768959939, + "grad_norm": 2.7555716037750244, + "learning_rate": 4.987784905910706e-05, + "loss": 5.2489, + "step": 5294 + }, + { + "epoch": 0.03149086497288039, + "grad_norm": 2.475541353225708, + "learning_rate": 4.9877802936732955e-05, + "loss": 5.2304, + "step": 5295 + }, + { + "epoch": 0.03149681225616138, + "grad_norm": 1.945482611656189, + "learning_rate": 4.987775680567425e-05, + "loss": 5.4085, + "step": 5296 + }, + { + "epoch": 0.031502759539442385, + "grad_norm": 1.9879848957061768, + "learning_rate": 4.987771066593099e-05, + "loss": 5.5372, + "step": 5297 + }, + { + "epoch": 0.03150870682272338, + "grad_norm": 2.0529556274414062, + "learning_rate": 4.987766451750317e-05, + "loss": 5.578, + "step": 5298 + }, + { + "epoch": 0.031514654106004375, + "grad_norm": 1.7769572734832764, + "learning_rate": 4.9877618360390816e-05, + "loss": 5.5348, + "step": 5299 + }, + { + "epoch": 0.03152060138928538, + "grad_norm": 1.9111005067825317, + "learning_rate": 4.987757219459395e-05, + "loss": 5.4267, + "step": 5300 + }, + { + "epoch": 0.03152654867256637, + "grad_norm": 1.9047571420669556, + "learning_rate": 4.987752602011256e-05, + "loss": 5.433, + "step": 5301 + }, + { + "epoch": 0.03153249595584737, + "grad_norm": 1.9031875133514404, + "learning_rate": 4.98774798369467e-05, + "loss": 5.4929, + "step": 5302 + }, + { + "epoch": 0.03153844323912837, + "grad_norm": 1.858656883239746, + "learning_rate": 4.987743364509637e-05, + "loss": 5.3583, + "step": 5303 + }, + { + "epoch": 0.031544390522409364, + "grad_norm": 1.9254835844039917, + "learning_rate": 4.987738744456158e-05, + "loss": 5.4885, + "step": 5304 + }, + { + "epoch": 0.03155033780569036, + "grad_norm": 1.96173095703125, + "learning_rate": 4.987734123534235e-05, + "loss": 5.4869, + "step": 5305 + }, + { + "epoch": 0.031556285088971354, + "grad_norm": 1.7857433557510376, + "learning_rate": 4.98772950174387e-05, + "loss": 5.3845, + "step": 5306 + }, + { + "epoch": 0.031562232372252357, + "grad_norm": 1.9360556602478027, + "learning_rate": 4.9877248790850636e-05, + "loss": 5.3809, + "step": 5307 + }, + { + "epoch": 0.03156817965553335, + "grad_norm": 2.2044126987457275, + "learning_rate": 4.9877202555578197e-05, + "loss": 5.2413, + "step": 5308 + }, + { + "epoch": 0.03157412693881435, + "grad_norm": 1.8200992345809937, + "learning_rate": 4.9877156311621365e-05, + "loss": 5.6241, + "step": 5309 + }, + { + "epoch": 0.03158007422209535, + "grad_norm": 2.0771358013153076, + "learning_rate": 4.987711005898019e-05, + "loss": 5.6854, + "step": 5310 + }, + { + "epoch": 0.031586021505376344, + "grad_norm": 1.8330012559890747, + "learning_rate": 4.987706379765466e-05, + "loss": 5.712, + "step": 5311 + }, + { + "epoch": 0.03159196878865734, + "grad_norm": 1.941501498222351, + "learning_rate": 4.987701752764481e-05, + "loss": 5.4131, + "step": 5312 + }, + { + "epoch": 0.03159791607193834, + "grad_norm": 1.8688616752624512, + "learning_rate": 4.987697124895065e-05, + "loss": 5.3719, + "step": 5313 + }, + { + "epoch": 0.031603863355219336, + "grad_norm": 1.8723224401474, + "learning_rate": 4.98769249615722e-05, + "loss": 5.665, + "step": 5314 + }, + { + "epoch": 0.03160981063850033, + "grad_norm": 1.9460058212280273, + "learning_rate": 4.9876878665509474e-05, + "loss": 5.7048, + "step": 5315 + }, + { + "epoch": 0.03161575792178133, + "grad_norm": 1.9752602577209473, + "learning_rate": 4.987683236076248e-05, + "loss": 5.7098, + "step": 5316 + }, + { + "epoch": 0.03162170520506233, + "grad_norm": 1.8122695684432983, + "learning_rate": 4.9876786047331244e-05, + "loss": 5.2717, + "step": 5317 + }, + { + "epoch": 0.03162765248834332, + "grad_norm": 1.961983323097229, + "learning_rate": 4.9876739725215775e-05, + "loss": 5.5593, + "step": 5318 + }, + { + "epoch": 0.031633599771624325, + "grad_norm": 1.7362732887268066, + "learning_rate": 4.98766933944161e-05, + "loss": 5.5002, + "step": 5319 + }, + { + "epoch": 0.03163954705490532, + "grad_norm": 2.084033489227295, + "learning_rate": 4.9876647054932226e-05, + "loss": 5.5398, + "step": 5320 + }, + { + "epoch": 0.031645494338186315, + "grad_norm": 1.869452953338623, + "learning_rate": 4.9876600706764165e-05, + "loss": 5.5985, + "step": 5321 + }, + { + "epoch": 0.03165144162146731, + "grad_norm": 3.597667694091797, + "learning_rate": 4.9876554349911943e-05, + "loss": 5.4143, + "step": 5322 + }, + { + "epoch": 0.03165738890474831, + "grad_norm": 2.2364773750305176, + "learning_rate": 4.9876507984375574e-05, + "loss": 5.3756, + "step": 5323 + }, + { + "epoch": 0.03166333618802931, + "grad_norm": 2.0204551219940186, + "learning_rate": 4.987646161015508e-05, + "loss": 5.4964, + "step": 5324 + }, + { + "epoch": 0.0316692834713103, + "grad_norm": 1.7375823259353638, + "learning_rate": 4.987641522725046e-05, + "loss": 5.5249, + "step": 5325 + }, + { + "epoch": 0.031675230754591305, + "grad_norm": 1.661597728729248, + "learning_rate": 4.987636883566175e-05, + "loss": 5.4828, + "step": 5326 + }, + { + "epoch": 0.0316811780378723, + "grad_norm": 1.8612693548202515, + "learning_rate": 4.9876322435388944e-05, + "loss": 5.4711, + "step": 5327 + }, + { + "epoch": 0.031687125321153295, + "grad_norm": 1.8282328844070435, + "learning_rate": 4.987627602643208e-05, + "loss": 5.5234, + "step": 5328 + }, + { + "epoch": 0.0316930726044343, + "grad_norm": 1.951170802116394, + "learning_rate": 4.987622960879116e-05, + "loss": 5.4117, + "step": 5329 + }, + { + "epoch": 0.03169901988771529, + "grad_norm": 1.819174885749817, + "learning_rate": 4.9876183182466207e-05, + "loss": 5.3446, + "step": 5330 + }, + { + "epoch": 0.03170496717099629, + "grad_norm": 1.8710874319076538, + "learning_rate": 4.9876136747457245e-05, + "loss": 5.3755, + "step": 5331 + }, + { + "epoch": 0.03171091445427729, + "grad_norm": 2.1957387924194336, + "learning_rate": 4.9876090303764264e-05, + "loss": 6.3036, + "step": 5332 + }, + { + "epoch": 0.031716861737558284, + "grad_norm": 1.774741530418396, + "learning_rate": 4.987604385138731e-05, + "loss": 5.3822, + "step": 5333 + }, + { + "epoch": 0.03172280902083928, + "grad_norm": 1.793230414390564, + "learning_rate": 4.987599739032638e-05, + "loss": 5.4224, + "step": 5334 + }, + { + "epoch": 0.031728756304120274, + "grad_norm": 1.7986340522766113, + "learning_rate": 4.98759509205815e-05, + "loss": 5.3939, + "step": 5335 + }, + { + "epoch": 0.031734703587401276, + "grad_norm": 1.7775462865829468, + "learning_rate": 4.9875904442152675e-05, + "loss": 5.4356, + "step": 5336 + }, + { + "epoch": 0.03174065087068227, + "grad_norm": 1.882104516029358, + "learning_rate": 4.987585795503994e-05, + "loss": 5.2852, + "step": 5337 + }, + { + "epoch": 0.03174659815396327, + "grad_norm": 1.9842430353164673, + "learning_rate": 4.987581145924329e-05, + "loss": 5.4089, + "step": 5338 + }, + { + "epoch": 0.03175254543724427, + "grad_norm": 1.7098103761672974, + "learning_rate": 4.9875764954762754e-05, + "loss": 5.2442, + "step": 5339 + }, + { + "epoch": 0.031758492720525264, + "grad_norm": 1.8304857015609741, + "learning_rate": 4.9875718441598354e-05, + "loss": 5.5403, + "step": 5340 + }, + { + "epoch": 0.03176444000380626, + "grad_norm": 2.0763137340545654, + "learning_rate": 4.987567191975009e-05, + "loss": 5.8295, + "step": 5341 + }, + { + "epoch": 0.03177038728708726, + "grad_norm": 1.907271385192871, + "learning_rate": 4.9875625389217984e-05, + "loss": 5.6979, + "step": 5342 + }, + { + "epoch": 0.031776334570368256, + "grad_norm": 2.1263620853424072, + "learning_rate": 4.9875578850002056e-05, + "loss": 5.7713, + "step": 5343 + }, + { + "epoch": 0.03178228185364925, + "grad_norm": 2.038358211517334, + "learning_rate": 4.987553230210232e-05, + "loss": 6.0019, + "step": 5344 + }, + { + "epoch": 0.03178822913693025, + "grad_norm": 1.5671371221542358, + "learning_rate": 4.987548574551879e-05, + "loss": 5.9237, + "step": 5345 + }, + { + "epoch": 0.03179417642021125, + "grad_norm": 1.9159321784973145, + "learning_rate": 4.987543918025149e-05, + "loss": 6.0363, + "step": 5346 + }, + { + "epoch": 0.03180012370349224, + "grad_norm": 1.8012747764587402, + "learning_rate": 4.987539260630043e-05, + "loss": 5.901, + "step": 5347 + }, + { + "epoch": 0.031806070986773245, + "grad_norm": 2.154933214187622, + "learning_rate": 4.9875346023665625e-05, + "loss": 5.6379, + "step": 5348 + }, + { + "epoch": 0.03181201827005424, + "grad_norm": 2.191539764404297, + "learning_rate": 4.98752994323471e-05, + "loss": 5.5322, + "step": 5349 + }, + { + "epoch": 0.031817965553335235, + "grad_norm": 2.0007123947143555, + "learning_rate": 4.9875252832344856e-05, + "loss": 5.7398, + "step": 5350 + }, + { + "epoch": 0.03182391283661623, + "grad_norm": 1.7119163274765015, + "learning_rate": 4.9875206223658924e-05, + "loss": 5.8507, + "step": 5351 + }, + { + "epoch": 0.03182986011989723, + "grad_norm": 1.8882098197937012, + "learning_rate": 4.987515960628931e-05, + "loss": 5.8668, + "step": 5352 + }, + { + "epoch": 0.03183580740317823, + "grad_norm": 2.005493402481079, + "learning_rate": 4.987511298023604e-05, + "loss": 5.9672, + "step": 5353 + }, + { + "epoch": 0.03184175468645922, + "grad_norm": 1.858807921409607, + "learning_rate": 4.987506634549912e-05, + "loss": 5.9344, + "step": 5354 + }, + { + "epoch": 0.031847701969740225, + "grad_norm": 2.2698724269866943, + "learning_rate": 4.987501970207858e-05, + "loss": 5.6553, + "step": 5355 + }, + { + "epoch": 0.03185364925302122, + "grad_norm": 1.7690725326538086, + "learning_rate": 4.987497304997442e-05, + "loss": 5.6255, + "step": 5356 + }, + { + "epoch": 0.031859596536302215, + "grad_norm": 2.008002758026123, + "learning_rate": 4.987492638918667e-05, + "loss": 5.5578, + "step": 5357 + }, + { + "epoch": 0.03186554381958322, + "grad_norm": 1.6483304500579834, + "learning_rate": 4.987487971971533e-05, + "loss": 5.4786, + "step": 5358 + }, + { + "epoch": 0.03187149110286421, + "grad_norm": 1.9136204719543457, + "learning_rate": 4.987483304156044e-05, + "loss": 5.6043, + "step": 5359 + }, + { + "epoch": 0.03187743838614521, + "grad_norm": 1.9811625480651855, + "learning_rate": 4.987478635472199e-05, + "loss": 5.6172, + "step": 5360 + }, + { + "epoch": 0.03188338566942621, + "grad_norm": 2.012134075164795, + "learning_rate": 4.987473965920002e-05, + "loss": 5.6715, + "step": 5361 + }, + { + "epoch": 0.031889332952707204, + "grad_norm": 1.930550217628479, + "learning_rate": 4.987469295499453e-05, + "loss": 5.516, + "step": 5362 + }, + { + "epoch": 0.0318952802359882, + "grad_norm": 2.1190578937530518, + "learning_rate": 4.987464624210554e-05, + "loss": 5.5176, + "step": 5363 + }, + { + "epoch": 0.031901227519269194, + "grad_norm": 2.428710699081421, + "learning_rate": 4.987459952053307e-05, + "loss": 5.4088, + "step": 5364 + }, + { + "epoch": 0.031907174802550196, + "grad_norm": 1.8820819854736328, + "learning_rate": 4.987455279027713e-05, + "loss": 5.3753, + "step": 5365 + }, + { + "epoch": 0.03191312208583119, + "grad_norm": 1.6506859064102173, + "learning_rate": 4.987450605133775e-05, + "loss": 5.6018, + "step": 5366 + }, + { + "epoch": 0.03191906936911219, + "grad_norm": 2.060772657394409, + "learning_rate": 4.9874459303714925e-05, + "loss": 5.3587, + "step": 5367 + }, + { + "epoch": 0.03192501665239319, + "grad_norm": 2.3591532707214355, + "learning_rate": 4.9874412547408694e-05, + "loss": 5.7685, + "step": 5368 + }, + { + "epoch": 0.031930963935674184, + "grad_norm": 2.140322685241699, + "learning_rate": 4.987436578241906e-05, + "loss": 5.9015, + "step": 5369 + }, + { + "epoch": 0.03193691121895518, + "grad_norm": 2.2479233741760254, + "learning_rate": 4.987431900874604e-05, + "loss": 5.6079, + "step": 5370 + }, + { + "epoch": 0.03194285850223618, + "grad_norm": 2.0334317684173584, + "learning_rate": 4.987427222638965e-05, + "loss": 5.6364, + "step": 5371 + }, + { + "epoch": 0.031948805785517176, + "grad_norm": 2.0599231719970703, + "learning_rate": 4.987422543534991e-05, + "loss": 5.6578, + "step": 5372 + }, + { + "epoch": 0.03195475306879817, + "grad_norm": 2.237504720687866, + "learning_rate": 4.9874178635626836e-05, + "loss": 5.5784, + "step": 5373 + }, + { + "epoch": 0.03196070035207917, + "grad_norm": 2.013193130493164, + "learning_rate": 4.987413182722044e-05, + "loss": 5.4874, + "step": 5374 + }, + { + "epoch": 0.03196664763536017, + "grad_norm": 1.9806950092315674, + "learning_rate": 4.987408501013075e-05, + "loss": 5.41, + "step": 5375 + }, + { + "epoch": 0.03197259491864116, + "grad_norm": 1.7534204721450806, + "learning_rate": 4.9874038184357766e-05, + "loss": 5.4596, + "step": 5376 + }, + { + "epoch": 0.031978542201922165, + "grad_norm": 1.5722386837005615, + "learning_rate": 4.987399134990152e-05, + "loss": 5.508, + "step": 5377 + }, + { + "epoch": 0.03198448948520316, + "grad_norm": 7.868972301483154, + "learning_rate": 4.987394450676201e-05, + "loss": 5.1734, + "step": 5378 + }, + { + "epoch": 0.031990436768484155, + "grad_norm": 2.2103798389434814, + "learning_rate": 4.9873897654939274e-05, + "loss": 5.6766, + "step": 5379 + }, + { + "epoch": 0.03199638405176515, + "grad_norm": 1.9590017795562744, + "learning_rate": 4.9873850794433306e-05, + "loss": 5.7764, + "step": 5380 + }, + { + "epoch": 0.03200233133504615, + "grad_norm": 1.96006441116333, + "learning_rate": 4.9873803925244146e-05, + "loss": 5.7933, + "step": 5381 + }, + { + "epoch": 0.03200827861832715, + "grad_norm": 1.7377163171768188, + "learning_rate": 4.987375704737178e-05, + "loss": 5.692, + "step": 5382 + }, + { + "epoch": 0.03201422590160814, + "grad_norm": 2.0734782218933105, + "learning_rate": 4.9873710160816256e-05, + "loss": 5.5466, + "step": 5383 + }, + { + "epoch": 0.032020173184889145, + "grad_norm": 2.4700942039489746, + "learning_rate": 4.9873663265577574e-05, + "loss": 5.5837, + "step": 5384 + }, + { + "epoch": 0.03202612046817014, + "grad_norm": 2.067009925842285, + "learning_rate": 4.987361636165576e-05, + "loss": 5.4777, + "step": 5385 + }, + { + "epoch": 0.032032067751451135, + "grad_norm": 1.9585732221603394, + "learning_rate": 4.9873569449050815e-05, + "loss": 5.62, + "step": 5386 + }, + { + "epoch": 0.03203801503473214, + "grad_norm": 2.0210976600646973, + "learning_rate": 4.9873522527762766e-05, + "loss": 5.3554, + "step": 5387 + }, + { + "epoch": 0.03204396231801313, + "grad_norm": 2.0345299243927, + "learning_rate": 4.987347559779163e-05, + "loss": 5.3912, + "step": 5388 + }, + { + "epoch": 0.03204990960129413, + "grad_norm": 2.0960853099823, + "learning_rate": 4.987342865913742e-05, + "loss": 5.3497, + "step": 5389 + }, + { + "epoch": 0.03205585688457513, + "grad_norm": 2.0156044960021973, + "learning_rate": 4.987338171180015e-05, + "loss": 5.2769, + "step": 5390 + }, + { + "epoch": 0.032061804167856124, + "grad_norm": 2.0021722316741943, + "learning_rate": 4.987333475577984e-05, + "loss": 5.2338, + "step": 5391 + }, + { + "epoch": 0.03206775145113712, + "grad_norm": 1.8502025604248047, + "learning_rate": 4.987328779107651e-05, + "loss": 5.4231, + "step": 5392 + }, + { + "epoch": 0.03207369873441812, + "grad_norm": 2.0788064002990723, + "learning_rate": 4.987324081769016e-05, + "loss": 5.3989, + "step": 5393 + }, + { + "epoch": 0.032079646017699116, + "grad_norm": 5.172029495239258, + "learning_rate": 4.987319383562083e-05, + "loss": 6.5943, + "step": 5394 + }, + { + "epoch": 0.03208559330098011, + "grad_norm": 1.8732082843780518, + "learning_rate": 4.987314684486852e-05, + "loss": 5.3085, + "step": 5395 + }, + { + "epoch": 0.032091540584261107, + "grad_norm": 2.0511786937713623, + "learning_rate": 4.987309984543326e-05, + "loss": 5.1598, + "step": 5396 + }, + { + "epoch": 0.03209748786754211, + "grad_norm": 2.1821703910827637, + "learning_rate": 4.987305283731505e-05, + "loss": 5.3575, + "step": 5397 + }, + { + "epoch": 0.032103435150823104, + "grad_norm": 2.1190478801727295, + "learning_rate": 4.9873005820513906e-05, + "loss": 5.2371, + "step": 5398 + }, + { + "epoch": 0.0321093824341041, + "grad_norm": 2.1476964950561523, + "learning_rate": 4.987295879502987e-05, + "loss": 5.1378, + "step": 5399 + }, + { + "epoch": 0.0321153297173851, + "grad_norm": 2.3466129302978516, + "learning_rate": 4.987291176086293e-05, + "loss": 5.0642, + "step": 5400 + }, + { + "epoch": 0.032121277000666096, + "grad_norm": 2.267949104309082, + "learning_rate": 4.9872864718013115e-05, + "loss": 5.6835, + "step": 5401 + }, + { + "epoch": 0.03212722428394709, + "grad_norm": 3.1235604286193848, + "learning_rate": 4.987281766648044e-05, + "loss": 6.2094, + "step": 5402 + }, + { + "epoch": 0.03213317156722809, + "grad_norm": 2.494929790496826, + "learning_rate": 4.987277060626493e-05, + "loss": 6.2387, + "step": 5403 + }, + { + "epoch": 0.03213911885050909, + "grad_norm": 2.554422616958618, + "learning_rate": 4.987272353736658e-05, + "loss": 5.9655, + "step": 5404 + }, + { + "epoch": 0.03214506613379008, + "grad_norm": 3.688295841217041, + "learning_rate": 4.987267645978543e-05, + "loss": 6.3994, + "step": 5405 + }, + { + "epoch": 0.032151013417071085, + "grad_norm": 2.773847818374634, + "learning_rate": 4.987262937352147e-05, + "loss": 5.515, + "step": 5406 + }, + { + "epoch": 0.03215696070035208, + "grad_norm": 3.067812204360962, + "learning_rate": 4.987258227857475e-05, + "loss": 5.7388, + "step": 5407 + }, + { + "epoch": 0.032162907983633075, + "grad_norm": 3.0557258129119873, + "learning_rate": 4.987253517494525e-05, + "loss": 6.0334, + "step": 5408 + }, + { + "epoch": 0.03216885526691407, + "grad_norm": 2.2864489555358887, + "learning_rate": 4.9872488062633026e-05, + "loss": 6.2805, + "step": 5409 + }, + { + "epoch": 0.03217480255019507, + "grad_norm": 3.2848916053771973, + "learning_rate": 4.987244094163807e-05, + "loss": 6.4782, + "step": 5410 + }, + { + "epoch": 0.03218074983347607, + "grad_norm": 3.7147631645202637, + "learning_rate": 4.987239381196039e-05, + "loss": 6.6618, + "step": 5411 + }, + { + "epoch": 0.03218669711675706, + "grad_norm": 2.740705966949463, + "learning_rate": 4.9872346673600017e-05, + "loss": 6.0261, + "step": 5412 + }, + { + "epoch": 0.032192644400038065, + "grad_norm": 2.6408498287200928, + "learning_rate": 4.9872299526556965e-05, + "loss": 5.8645, + "step": 5413 + }, + { + "epoch": 0.03219859168331906, + "grad_norm": 2.8298256397247314, + "learning_rate": 4.987225237083125e-05, + "loss": 5.9263, + "step": 5414 + }, + { + "epoch": 0.032204538966600055, + "grad_norm": 2.9417197704315186, + "learning_rate": 4.987220520642289e-05, + "loss": 5.8018, + "step": 5415 + }, + { + "epoch": 0.03221048624988106, + "grad_norm": 3.2862906455993652, + "learning_rate": 4.9872158033331904e-05, + "loss": 5.8429, + "step": 5416 + }, + { + "epoch": 0.03221643353316205, + "grad_norm": 2.7724359035491943, + "learning_rate": 4.9872110851558306e-05, + "loss": 5.9504, + "step": 5417 + }, + { + "epoch": 0.03222238081644305, + "grad_norm": 2.2753829956054688, + "learning_rate": 4.9872063661102106e-05, + "loss": 5.6443, + "step": 5418 + }, + { + "epoch": 0.03222832809972405, + "grad_norm": 2.597649097442627, + "learning_rate": 4.987201646196332e-05, + "loss": 6.4441, + "step": 5419 + }, + { + "epoch": 0.032234275383005044, + "grad_norm": 2.7298800945281982, + "learning_rate": 4.987196925414198e-05, + "loss": 6.2988, + "step": 5420 + }, + { + "epoch": 0.03224022266628604, + "grad_norm": 3.2329537868499756, + "learning_rate": 4.987192203763809e-05, + "loss": 5.8743, + "step": 5421 + }, + { + "epoch": 0.03224616994956704, + "grad_norm": 3.033226251602173, + "learning_rate": 4.987187481245167e-05, + "loss": 5.4863, + "step": 5422 + }, + { + "epoch": 0.032252117232848036, + "grad_norm": 2.7728521823883057, + "learning_rate": 4.987182757858273e-05, + "loss": 5.5722, + "step": 5423 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 2.6083309650421143, + "learning_rate": 4.98717803360313e-05, + "loss": 6.5257, + "step": 5424 + }, + { + "epoch": 0.032264011799410026, + "grad_norm": 2.5422329902648926, + "learning_rate": 4.987173308479738e-05, + "loss": 6.5582, + "step": 5425 + }, + { + "epoch": 0.03226995908269103, + "grad_norm": 2.7634811401367188, + "learning_rate": 4.9871685824881e-05, + "loss": 6.0987, + "step": 5426 + }, + { + "epoch": 0.032275906365972024, + "grad_norm": 3.631476640701294, + "learning_rate": 4.987163855628217e-05, + "loss": 5.8506, + "step": 5427 + }, + { + "epoch": 0.03228185364925302, + "grad_norm": 2.9783661365509033, + "learning_rate": 4.9871591279000904e-05, + "loss": 5.9387, + "step": 5428 + }, + { + "epoch": 0.03228780093253402, + "grad_norm": 2.369645357131958, + "learning_rate": 4.9871543993037225e-05, + "loss": 5.8097, + "step": 5429 + }, + { + "epoch": 0.032293748215815016, + "grad_norm": 2.782055616378784, + "learning_rate": 4.9871496698391155e-05, + "loss": 5.5301, + "step": 5430 + }, + { + "epoch": 0.03229969549909601, + "grad_norm": 2.408205270767212, + "learning_rate": 4.98714493950627e-05, + "loss": 5.6514, + "step": 5431 + }, + { + "epoch": 0.03230564278237701, + "grad_norm": 2.0641589164733887, + "learning_rate": 4.987140208305187e-05, + "loss": 5.6168, + "step": 5432 + }, + { + "epoch": 0.03231159006565801, + "grad_norm": 2.109773874282837, + "learning_rate": 4.987135476235869e-05, + "loss": 5.6678, + "step": 5433 + }, + { + "epoch": 0.032317537348939, + "grad_norm": 2.9809730052948, + "learning_rate": 4.987130743298318e-05, + "loss": 6.0531, + "step": 5434 + }, + { + "epoch": 0.032323484632220005, + "grad_norm": 2.5728509426116943, + "learning_rate": 4.9871260094925365e-05, + "loss": 6.05, + "step": 5435 + }, + { + "epoch": 0.032329431915501, + "grad_norm": 2.477074146270752, + "learning_rate": 4.9871212748185236e-05, + "loss": 6.351, + "step": 5436 + }, + { + "epoch": 0.032335379198781995, + "grad_norm": 2.3485517501831055, + "learning_rate": 4.987116539276283e-05, + "loss": 6.3033, + "step": 5437 + }, + { + "epoch": 0.03234132648206299, + "grad_norm": 2.4214296340942383, + "learning_rate": 4.987111802865816e-05, + "loss": 6.1152, + "step": 5438 + }, + { + "epoch": 0.03234727376534399, + "grad_norm": 3.5628256797790527, + "learning_rate": 4.9871070655871234e-05, + "loss": 5.6502, + "step": 5439 + }, + { + "epoch": 0.03235322104862499, + "grad_norm": 3.190075159072876, + "learning_rate": 4.987102327440208e-05, + "loss": 5.4164, + "step": 5440 + }, + { + "epoch": 0.03235916833190598, + "grad_norm": 2.402754306793213, + "learning_rate": 4.9870975884250696e-05, + "loss": 5.7116, + "step": 5441 + }, + { + "epoch": 0.032365115615186985, + "grad_norm": 2.846653938293457, + "learning_rate": 4.987092848541712e-05, + "loss": 6.1456, + "step": 5442 + }, + { + "epoch": 0.03237106289846798, + "grad_norm": 2.6700549125671387, + "learning_rate": 4.987088107790136e-05, + "loss": 5.9777, + "step": 5443 + }, + { + "epoch": 0.032377010181748975, + "grad_norm": 2.8929460048675537, + "learning_rate": 4.987083366170343e-05, + "loss": 6.1459, + "step": 5444 + }, + { + "epoch": 0.03238295746502998, + "grad_norm": 2.524376153945923, + "learning_rate": 4.987078623682335e-05, + "loss": 6.4341, + "step": 5445 + }, + { + "epoch": 0.03238890474831097, + "grad_norm": 2.0901076793670654, + "learning_rate": 4.987073880326114e-05, + "loss": 6.3968, + "step": 5446 + }, + { + "epoch": 0.03239485203159197, + "grad_norm": 3.0033867359161377, + "learning_rate": 4.9870691361016805e-05, + "loss": 5.8656, + "step": 5447 + }, + { + "epoch": 0.03240079931487297, + "grad_norm": 2.7715492248535156, + "learning_rate": 4.987064391009038e-05, + "loss": 6.1634, + "step": 5448 + }, + { + "epoch": 0.032406746598153964, + "grad_norm": 2.6102347373962402, + "learning_rate": 4.9870596450481855e-05, + "loss": 6.2521, + "step": 5449 + }, + { + "epoch": 0.03241269388143496, + "grad_norm": 2.326253890991211, + "learning_rate": 4.9870548982191265e-05, + "loss": 6.2517, + "step": 5450 + }, + { + "epoch": 0.03241864116471596, + "grad_norm": 2.3012197017669678, + "learning_rate": 4.987050150521863e-05, + "loss": 6.2261, + "step": 5451 + }, + { + "epoch": 0.032424588447996956, + "grad_norm": 2.100337505340576, + "learning_rate": 4.987045401956396e-05, + "loss": 5.6291, + "step": 5452 + }, + { + "epoch": 0.03243053573127795, + "grad_norm": 3.094754219055176, + "learning_rate": 4.987040652522727e-05, + "loss": 5.897, + "step": 5453 + }, + { + "epoch": 0.032436483014558946, + "grad_norm": 2.7406179904937744, + "learning_rate": 4.987035902220857e-05, + "loss": 6.0083, + "step": 5454 + }, + { + "epoch": 0.03244243029783995, + "grad_norm": 2.4106287956237793, + "learning_rate": 4.9870311510507895e-05, + "loss": 5.8538, + "step": 5455 + }, + { + "epoch": 0.032448377581120944, + "grad_norm": 2.7335946559906006, + "learning_rate": 4.987026399012525e-05, + "loss": 5.9181, + "step": 5456 + }, + { + "epoch": 0.03245432486440194, + "grad_norm": 2.796175003051758, + "learning_rate": 4.987021646106064e-05, + "loss": 5.6461, + "step": 5457 + }, + { + "epoch": 0.03246027214768294, + "grad_norm": 3.086470127105713, + "learning_rate": 4.987016892331411e-05, + "loss": 5.6692, + "step": 5458 + }, + { + "epoch": 0.032466219430963936, + "grad_norm": 2.394465923309326, + "learning_rate": 4.9870121376885656e-05, + "loss": 6.3046, + "step": 5459 + }, + { + "epoch": 0.03247216671424493, + "grad_norm": 2.0745291709899902, + "learning_rate": 4.98700738217753e-05, + "loss": 6.0491, + "step": 5460 + }, + { + "epoch": 0.03247811399752593, + "grad_norm": 2.66359281539917, + "learning_rate": 4.987002625798305e-05, + "loss": 5.6468, + "step": 5461 + }, + { + "epoch": 0.03248406128080693, + "grad_norm": 2.392833948135376, + "learning_rate": 4.9869978685508936e-05, + "loss": 5.8421, + "step": 5462 + }, + { + "epoch": 0.03249000856408792, + "grad_norm": 2.671710252761841, + "learning_rate": 4.9869931104352975e-05, + "loss": 5.6892, + "step": 5463 + }, + { + "epoch": 0.032495955847368925, + "grad_norm": 2.7013144493103027, + "learning_rate": 4.986988351451517e-05, + "loss": 5.7911, + "step": 5464 + }, + { + "epoch": 0.03250190313064992, + "grad_norm": 1.926703929901123, + "learning_rate": 4.9869835915995555e-05, + "loss": 5.5492, + "step": 5465 + }, + { + "epoch": 0.032507850413930915, + "grad_norm": 2.5668530464172363, + "learning_rate": 4.986978830879413e-05, + "loss": 5.8949, + "step": 5466 + }, + { + "epoch": 0.03251379769721191, + "grad_norm": 2.555305004119873, + "learning_rate": 4.986974069291092e-05, + "loss": 5.7408, + "step": 5467 + }, + { + "epoch": 0.03251974498049291, + "grad_norm": 2.551226854324341, + "learning_rate": 4.986969306834594e-05, + "loss": 5.7738, + "step": 5468 + }, + { + "epoch": 0.03252569226377391, + "grad_norm": 2.3194847106933594, + "learning_rate": 4.986964543509921e-05, + "loss": 6.2837, + "step": 5469 + }, + { + "epoch": 0.0325316395470549, + "grad_norm": 1.9618690013885498, + "learning_rate": 4.986959779317074e-05, + "loss": 5.9236, + "step": 5470 + }, + { + "epoch": 0.032537586830335904, + "grad_norm": 2.351971387863159, + "learning_rate": 4.986955014256055e-05, + "loss": 5.591, + "step": 5471 + }, + { + "epoch": 0.0325435341136169, + "grad_norm": 2.3772034645080566, + "learning_rate": 4.986950248326866e-05, + "loss": 5.6785, + "step": 5472 + }, + { + "epoch": 0.032549481396897895, + "grad_norm": 2.5764195919036865, + "learning_rate": 4.9869454815295085e-05, + "loss": 5.525, + "step": 5473 + }, + { + "epoch": 0.0325554286801789, + "grad_norm": 2.231048107147217, + "learning_rate": 4.986940713863984e-05, + "loss": 5.6789, + "step": 5474 + }, + { + "epoch": 0.03256137596345989, + "grad_norm": 2.8053946495056152, + "learning_rate": 4.986935945330294e-05, + "loss": 5.6319, + "step": 5475 + }, + { + "epoch": 0.03256732324674089, + "grad_norm": 3.4610519409179688, + "learning_rate": 4.98693117592844e-05, + "loss": 5.9855, + "step": 5476 + }, + { + "epoch": 0.03257327053002189, + "grad_norm": 2.5019664764404297, + "learning_rate": 4.986926405658425e-05, + "loss": 5.9997, + "step": 5477 + }, + { + "epoch": 0.032579217813302884, + "grad_norm": 2.6583313941955566, + "learning_rate": 4.986921634520249e-05, + "loss": 6.3755, + "step": 5478 + }, + { + "epoch": 0.03258516509658388, + "grad_norm": 2.990699291229248, + "learning_rate": 4.986916862513914e-05, + "loss": 5.8932, + "step": 5479 + }, + { + "epoch": 0.03259111237986488, + "grad_norm": 3.282546043395996, + "learning_rate": 4.986912089639423e-05, + "loss": 5.5508, + "step": 5480 + }, + { + "epoch": 0.032597059663145876, + "grad_norm": 3.1012487411499023, + "learning_rate": 4.9869073158967755e-05, + "loss": 5.5567, + "step": 5481 + }, + { + "epoch": 0.03260300694642687, + "grad_norm": 2.141892433166504, + "learning_rate": 4.986902541285975e-05, + "loss": 5.6195, + "step": 5482 + }, + { + "epoch": 0.032608954229707866, + "grad_norm": 2.173670530319214, + "learning_rate": 4.986897765807023e-05, + "loss": 5.6913, + "step": 5483 + }, + { + "epoch": 0.03261490151298887, + "grad_norm": 2.4076435565948486, + "learning_rate": 4.98689298945992e-05, + "loss": 5.8324, + "step": 5484 + }, + { + "epoch": 0.03262084879626986, + "grad_norm": 2.8968818187713623, + "learning_rate": 4.986888212244668e-05, + "loss": 6.0086, + "step": 5485 + }, + { + "epoch": 0.03262679607955086, + "grad_norm": 2.2434191703796387, + "learning_rate": 4.9868834341612696e-05, + "loss": 5.9645, + "step": 5486 + }, + { + "epoch": 0.03263274336283186, + "grad_norm": 1.9683157205581665, + "learning_rate": 4.9868786552097255e-05, + "loss": 5.9173, + "step": 5487 + }, + { + "epoch": 0.032638690646112856, + "grad_norm": 2.369816303253174, + "learning_rate": 4.9868738753900384e-05, + "loss": 6.2728, + "step": 5488 + }, + { + "epoch": 0.03264463792939385, + "grad_norm": 2.1152775287628174, + "learning_rate": 4.986869094702209e-05, + "loss": 6.0474, + "step": 5489 + }, + { + "epoch": 0.03265058521267485, + "grad_norm": 2.3219857215881348, + "learning_rate": 4.9868643131462397e-05, + "loss": 5.7451, + "step": 5490 + }, + { + "epoch": 0.03265653249595585, + "grad_norm": 2.236046075820923, + "learning_rate": 4.986859530722131e-05, + "loss": 5.7775, + "step": 5491 + }, + { + "epoch": 0.03266247977923684, + "grad_norm": 2.3334364891052246, + "learning_rate": 4.986854747429886e-05, + "loss": 5.7429, + "step": 5492 + }, + { + "epoch": 0.032668427062517845, + "grad_norm": 2.5464704036712646, + "learning_rate": 4.986849963269505e-05, + "loss": 5.5781, + "step": 5493 + }, + { + "epoch": 0.03267437434579884, + "grad_norm": 2.104419469833374, + "learning_rate": 4.986845178240991e-05, + "loss": 5.6378, + "step": 5494 + }, + { + "epoch": 0.032680321629079835, + "grad_norm": 2.3115224838256836, + "learning_rate": 4.9868403923443444e-05, + "loss": 5.7617, + "step": 5495 + }, + { + "epoch": 0.03268626891236083, + "grad_norm": 2.3370540142059326, + "learning_rate": 4.9868356055795685e-05, + "loss": 6.1278, + "step": 5496 + }, + { + "epoch": 0.03269221619564183, + "grad_norm": 2.8618736267089844, + "learning_rate": 4.986830817946663e-05, + "loss": 6.0879, + "step": 5497 + }, + { + "epoch": 0.03269816347892283, + "grad_norm": 2.3229949474334717, + "learning_rate": 4.986826029445631e-05, + "loss": 6.0915, + "step": 5498 + }, + { + "epoch": 0.03270411076220382, + "grad_norm": 2.549914598464966, + "learning_rate": 4.986821240076473e-05, + "loss": 6.2375, + "step": 5499 + }, + { + "epoch": 0.032710058045484824, + "grad_norm": 2.595916271209717, + "learning_rate": 4.986816449839192e-05, + "loss": 6.095, + "step": 5500 + }, + { + "epoch": 0.03271600532876582, + "grad_norm": 2.4409420490264893, + "learning_rate": 4.98681165873379e-05, + "loss": 5.353, + "step": 5501 + }, + { + "epoch": 0.032721952612046815, + "grad_norm": 2.550156593322754, + "learning_rate": 4.986806866760266e-05, + "loss": 5.558, + "step": 5502 + }, + { + "epoch": 0.03272789989532782, + "grad_norm": 2.7811737060546875, + "learning_rate": 4.986802073918625e-05, + "loss": 5.7174, + "step": 5503 + }, + { + "epoch": 0.03273384717860881, + "grad_norm": 2.8430123329162598, + "learning_rate": 4.986797280208866e-05, + "loss": 5.5644, + "step": 5504 + }, + { + "epoch": 0.03273979446188981, + "grad_norm": 3.021040201187134, + "learning_rate": 4.986792485630992e-05, + "loss": 5.9451, + "step": 5505 + }, + { + "epoch": 0.03274574174517081, + "grad_norm": 2.69866681098938, + "learning_rate": 4.986787690185005e-05, + "loss": 5.9934, + "step": 5506 + }, + { + "epoch": 0.032751689028451804, + "grad_norm": 2.7202444076538086, + "learning_rate": 4.986782893870906e-05, + "loss": 6.1298, + "step": 5507 + }, + { + "epoch": 0.0327576363117328, + "grad_norm": 2.223405122756958, + "learning_rate": 4.986778096688696e-05, + "loss": 5.8968, + "step": 5508 + }, + { + "epoch": 0.0327635835950138, + "grad_norm": 2.5733680725097656, + "learning_rate": 4.986773298638378e-05, + "loss": 6.0928, + "step": 5509 + }, + { + "epoch": 0.032769530878294796, + "grad_norm": 2.584397554397583, + "learning_rate": 4.986768499719953e-05, + "loss": 5.7879, + "step": 5510 + }, + { + "epoch": 0.03277547816157579, + "grad_norm": 3.160489797592163, + "learning_rate": 4.986763699933423e-05, + "loss": 5.6413, + "step": 5511 + }, + { + "epoch": 0.032781425444856786, + "grad_norm": 2.8224406242370605, + "learning_rate": 4.9867588992787894e-05, + "loss": 6.1476, + "step": 5512 + }, + { + "epoch": 0.03278737272813779, + "grad_norm": 2.2565996646881104, + "learning_rate": 4.986754097756054e-05, + "loss": 6.208, + "step": 5513 + }, + { + "epoch": 0.03279332001141878, + "grad_norm": 2.5425479412078857, + "learning_rate": 4.9867492953652184e-05, + "loss": 5.934, + "step": 5514 + }, + { + "epoch": 0.03279926729469978, + "grad_norm": 2.6598689556121826, + "learning_rate": 4.986744492106284e-05, + "loss": 5.7433, + "step": 5515 + }, + { + "epoch": 0.03280521457798078, + "grad_norm": 2.419388771057129, + "learning_rate": 4.986739687979253e-05, + "loss": 5.378, + "step": 5516 + }, + { + "epoch": 0.032811161861261776, + "grad_norm": 2.72784161567688, + "learning_rate": 4.986734882984127e-05, + "loss": 5.4089, + "step": 5517 + }, + { + "epoch": 0.03281710914454277, + "grad_norm": 3.0592923164367676, + "learning_rate": 4.9867300771209075e-05, + "loss": 5.9573, + "step": 5518 + }, + { + "epoch": 0.03282305642782377, + "grad_norm": 2.7681832313537598, + "learning_rate": 4.9867252703895965e-05, + "loss": 5.5325, + "step": 5519 + }, + { + "epoch": 0.03282900371110477, + "grad_norm": 2.6752777099609375, + "learning_rate": 4.9867204627901946e-05, + "loss": 5.7543, + "step": 5520 + }, + { + "epoch": 0.03283495099438576, + "grad_norm": 2.481203317642212, + "learning_rate": 4.9867156543227046e-05, + "loss": 5.575, + "step": 5521 + }, + { + "epoch": 0.032840898277666765, + "grad_norm": 2.6403908729553223, + "learning_rate": 4.986710844987128e-05, + "loss": 5.4381, + "step": 5522 + }, + { + "epoch": 0.03284684556094776, + "grad_norm": 2.6146085262298584, + "learning_rate": 4.986706034783466e-05, + "loss": 5.8672, + "step": 5523 + }, + { + "epoch": 0.032852792844228755, + "grad_norm": 3.453666925430298, + "learning_rate": 4.986701223711722e-05, + "loss": 5.8353, + "step": 5524 + }, + { + "epoch": 0.03285874012750975, + "grad_norm": 2.511216640472412, + "learning_rate": 4.986696411771895e-05, + "loss": 5.9567, + "step": 5525 + }, + { + "epoch": 0.03286468741079075, + "grad_norm": 2.57395601272583, + "learning_rate": 4.986691598963988e-05, + "loss": 5.6396, + "step": 5526 + }, + { + "epoch": 0.03287063469407175, + "grad_norm": 2.778801441192627, + "learning_rate": 4.986686785288003e-05, + "loss": 6.0237, + "step": 5527 + }, + { + "epoch": 0.03287658197735274, + "grad_norm": 2.5216047763824463, + "learning_rate": 4.986681970743941e-05, + "loss": 6.1305, + "step": 5528 + }, + { + "epoch": 0.032882529260633744, + "grad_norm": 2.5105085372924805, + "learning_rate": 4.986677155331804e-05, + "loss": 6.4951, + "step": 5529 + }, + { + "epoch": 0.03288847654391474, + "grad_norm": 2.4105372428894043, + "learning_rate": 4.9866723390515946e-05, + "loss": 6.291, + "step": 5530 + }, + { + "epoch": 0.032894423827195735, + "grad_norm": 2.740095853805542, + "learning_rate": 4.9866675219033125e-05, + "loss": 5.762, + "step": 5531 + }, + { + "epoch": 0.03290037111047674, + "grad_norm": 2.327892541885376, + "learning_rate": 4.9866627038869605e-05, + "loss": 6.1023, + "step": 5532 + }, + { + "epoch": 0.03290631839375773, + "grad_norm": 2.71732497215271, + "learning_rate": 4.9866578850025414e-05, + "loss": 6.0739, + "step": 5533 + }, + { + "epoch": 0.03291226567703873, + "grad_norm": 2.1895039081573486, + "learning_rate": 4.9866530652500545e-05, + "loss": 5.801, + "step": 5534 + }, + { + "epoch": 0.03291821296031973, + "grad_norm": 2.39670729637146, + "learning_rate": 4.986648244629503e-05, + "loss": 6.0105, + "step": 5535 + }, + { + "epoch": 0.032924160243600724, + "grad_norm": 2.14630126953125, + "learning_rate": 4.986643423140889e-05, + "loss": 5.8457, + "step": 5536 + }, + { + "epoch": 0.03293010752688172, + "grad_norm": 2.111196994781494, + "learning_rate": 4.9866386007842125e-05, + "loss": 6.0804, + "step": 5537 + }, + { + "epoch": 0.03293605481016272, + "grad_norm": 2.8245434761047363, + "learning_rate": 4.986633777559476e-05, + "loss": 6.3152, + "step": 5538 + }, + { + "epoch": 0.032942002093443716, + "grad_norm": 2.3561060428619385, + "learning_rate": 4.9866289534666824e-05, + "loss": 6.286, + "step": 5539 + }, + { + "epoch": 0.03294794937672471, + "grad_norm": 3.21701979637146, + "learning_rate": 4.986624128505832e-05, + "loss": 5.9775, + "step": 5540 + }, + { + "epoch": 0.032953896660005706, + "grad_norm": 3.9414072036743164, + "learning_rate": 4.9866193026769265e-05, + "loss": 5.9413, + "step": 5541 + }, + { + "epoch": 0.03295984394328671, + "grad_norm": 2.7801051139831543, + "learning_rate": 4.986614475979968e-05, + "loss": 5.8642, + "step": 5542 + }, + { + "epoch": 0.0329657912265677, + "grad_norm": 2.7095935344696045, + "learning_rate": 4.986609648414958e-05, + "loss": 5.6952, + "step": 5543 + }, + { + "epoch": 0.0329717385098487, + "grad_norm": 2.5800812244415283, + "learning_rate": 4.986604819981898e-05, + "loss": 6.0285, + "step": 5544 + }, + { + "epoch": 0.0329776857931297, + "grad_norm": 2.6105730533599854, + "learning_rate": 4.9865999906807904e-05, + "loss": 5.6683, + "step": 5545 + }, + { + "epoch": 0.032983633076410696, + "grad_norm": 2.635570764541626, + "learning_rate": 4.9865951605116366e-05, + "loss": 5.9092, + "step": 5546 + }, + { + "epoch": 0.03298958035969169, + "grad_norm": 2.3708200454711914, + "learning_rate": 4.9865903294744373e-05, + "loss": 6.0034, + "step": 5547 + }, + { + "epoch": 0.03299552764297269, + "grad_norm": 2.437201499938965, + "learning_rate": 4.986585497569196e-05, + "loss": 6.2587, + "step": 5548 + }, + { + "epoch": 0.03300147492625369, + "grad_norm": 2.076016426086426, + "learning_rate": 4.9865806647959126e-05, + "loss": 6.358, + "step": 5549 + }, + { + "epoch": 0.03300742220953468, + "grad_norm": 1.8261257410049438, + "learning_rate": 4.98657583115459e-05, + "loss": 6.0431, + "step": 5550 + }, + { + "epoch": 0.033013369492815685, + "grad_norm": 2.8339858055114746, + "learning_rate": 4.98657099664523e-05, + "loss": 5.7956, + "step": 5551 + }, + { + "epoch": 0.03301931677609668, + "grad_norm": 2.7288596630096436, + "learning_rate": 4.986566161267833e-05, + "loss": 5.7092, + "step": 5552 + }, + { + "epoch": 0.033025264059377675, + "grad_norm": 2.7197329998016357, + "learning_rate": 4.986561325022402e-05, + "loss": 5.649, + "step": 5553 + }, + { + "epoch": 0.03303121134265867, + "grad_norm": 2.6161739826202393, + "learning_rate": 4.986556487908937e-05, + "loss": 5.6935, + "step": 5554 + }, + { + "epoch": 0.03303715862593967, + "grad_norm": 2.695068597793579, + "learning_rate": 4.986551649927441e-05, + "loss": 5.6901, + "step": 5555 + }, + { + "epoch": 0.03304310590922067, + "grad_norm": 3.0315186977386475, + "learning_rate": 4.986546811077917e-05, + "loss": 5.6317, + "step": 5556 + }, + { + "epoch": 0.03304905319250166, + "grad_norm": 2.3597543239593506, + "learning_rate": 4.986541971360364e-05, + "loss": 5.8129, + "step": 5557 + }, + { + "epoch": 0.033055000475782664, + "grad_norm": 2.8090550899505615, + "learning_rate": 4.986537130774785e-05, + "loss": 6.4427, + "step": 5558 + }, + { + "epoch": 0.03306094775906366, + "grad_norm": 3.4232771396636963, + "learning_rate": 4.986532289321182e-05, + "loss": 6.5737, + "step": 5559 + }, + { + "epoch": 0.033066895042344654, + "grad_norm": 2.1425294876098633, + "learning_rate": 4.986527446999556e-05, + "loss": 6.2395, + "step": 5560 + }, + { + "epoch": 0.033072842325625657, + "grad_norm": 2.5348880290985107, + "learning_rate": 4.986522603809909e-05, + "loss": 6.0425, + "step": 5561 + }, + { + "epoch": 0.03307878960890665, + "grad_norm": 3.0824179649353027, + "learning_rate": 4.986517759752242e-05, + "loss": 5.8785, + "step": 5562 + }, + { + "epoch": 0.03308473689218765, + "grad_norm": 2.297706365585327, + "learning_rate": 4.986512914826558e-05, + "loss": 5.8989, + "step": 5563 + }, + { + "epoch": 0.03309068417546865, + "grad_norm": 2.866257667541504, + "learning_rate": 4.986508069032858e-05, + "loss": 5.8905, + "step": 5564 + }, + { + "epoch": 0.033096631458749644, + "grad_norm": 2.2450008392333984, + "learning_rate": 4.9865032223711436e-05, + "loss": 6.3302, + "step": 5565 + }, + { + "epoch": 0.03310257874203064, + "grad_norm": 2.235558271408081, + "learning_rate": 4.9864983748414166e-05, + "loss": 6.4235, + "step": 5566 + }, + { + "epoch": 0.03310852602531164, + "grad_norm": 2.5197713375091553, + "learning_rate": 4.986493526443679e-05, + "loss": 6.3999, + "step": 5567 + }, + { + "epoch": 0.033114473308592636, + "grad_norm": 2.5716195106506348, + "learning_rate": 4.986488677177932e-05, + "loss": 6.0258, + "step": 5568 + }, + { + "epoch": 0.03312042059187363, + "grad_norm": 2.468663454055786, + "learning_rate": 4.986483827044177e-05, + "loss": 6.7553, + "step": 5569 + }, + { + "epoch": 0.033126367875154626, + "grad_norm": 2.4334170818328857, + "learning_rate": 4.986478976042417e-05, + "loss": 6.4722, + "step": 5570 + }, + { + "epoch": 0.03313231515843563, + "grad_norm": 2.234487533569336, + "learning_rate": 4.986474124172652e-05, + "loss": 5.7158, + "step": 5571 + }, + { + "epoch": 0.03313826244171662, + "grad_norm": 2.8017537593841553, + "learning_rate": 4.9864692714348857e-05, + "loss": 5.9552, + "step": 5572 + }, + { + "epoch": 0.03314420972499762, + "grad_norm": 3.171354055404663, + "learning_rate": 4.986464417829118e-05, + "loss": 6.027, + "step": 5573 + }, + { + "epoch": 0.03315015700827862, + "grad_norm": 2.890169620513916, + "learning_rate": 4.9864595633553516e-05, + "loss": 6.2768, + "step": 5574 + }, + { + "epoch": 0.033156104291559615, + "grad_norm": 3.010934829711914, + "learning_rate": 4.986454708013587e-05, + "loss": 6.4054, + "step": 5575 + }, + { + "epoch": 0.03316205157484061, + "grad_norm": 2.143833875656128, + "learning_rate": 4.9864498518038274e-05, + "loss": 6.3771, + "step": 5576 + }, + { + "epoch": 0.03316799885812161, + "grad_norm": 2.2067418098449707, + "learning_rate": 4.986444994726074e-05, + "loss": 6.0158, + "step": 5577 + }, + { + "epoch": 0.03317394614140261, + "grad_norm": 2.3396403789520264, + "learning_rate": 4.986440136780328e-05, + "loss": 6.4286, + "step": 5578 + }, + { + "epoch": 0.0331798934246836, + "grad_norm": 2.8305866718292236, + "learning_rate": 4.9864352779665915e-05, + "loss": 5.7804, + "step": 5579 + }, + { + "epoch": 0.033185840707964605, + "grad_norm": 2.748194456100464, + "learning_rate": 4.9864304182848664e-05, + "loss": 6.1711, + "step": 5580 + }, + { + "epoch": 0.0331917879912456, + "grad_norm": 2.329761505126953, + "learning_rate": 4.9864255577351534e-05, + "loss": 6.2722, + "step": 5581 + }, + { + "epoch": 0.033197735274526595, + "grad_norm": 2.4633524417877197, + "learning_rate": 4.986420696317457e-05, + "loss": 6.1349, + "step": 5582 + }, + { + "epoch": 0.03320368255780759, + "grad_norm": 1.8909802436828613, + "learning_rate": 4.986415834031775e-05, + "loss": 6.2181, + "step": 5583 + }, + { + "epoch": 0.03320962984108859, + "grad_norm": 2.1794517040252686, + "learning_rate": 4.9864109708781104e-05, + "loss": 6.2808, + "step": 5584 + }, + { + "epoch": 0.03321557712436959, + "grad_norm": 2.1766669750213623, + "learning_rate": 4.986406106856466e-05, + "loss": 6.3004, + "step": 5585 + }, + { + "epoch": 0.03322152440765058, + "grad_norm": 2.27526593208313, + "learning_rate": 4.986401241966844e-05, + "loss": 5.9225, + "step": 5586 + }, + { + "epoch": 0.033227471690931584, + "grad_norm": 3.2843096256256104, + "learning_rate": 4.986396376209244e-05, + "loss": 5.8364, + "step": 5587 + }, + { + "epoch": 0.03323341897421258, + "grad_norm": 2.509831666946411, + "learning_rate": 4.9863915095836685e-05, + "loss": 5.6958, + "step": 5588 + }, + { + "epoch": 0.033239366257493574, + "grad_norm": 2.5235815048217773, + "learning_rate": 4.98638664209012e-05, + "loss": 5.4937, + "step": 5589 + }, + { + "epoch": 0.033245313540774576, + "grad_norm": 2.918334484100342, + "learning_rate": 4.986381773728599e-05, + "loss": 5.8284, + "step": 5590 + }, + { + "epoch": 0.03325126082405557, + "grad_norm": 2.8091490268707275, + "learning_rate": 4.986376904499108e-05, + "loss": 5.8126, + "step": 5591 + }, + { + "epoch": 0.03325720810733657, + "grad_norm": 2.555173635482788, + "learning_rate": 4.986372034401649e-05, + "loss": 5.6393, + "step": 5592 + }, + { + "epoch": 0.03326315539061757, + "grad_norm": 2.6366164684295654, + "learning_rate": 4.986367163436223e-05, + "loss": 6.6675, + "step": 5593 + }, + { + "epoch": 0.033269102673898564, + "grad_norm": 2.5691051483154297, + "learning_rate": 4.9863622916028316e-05, + "loss": 6.5808, + "step": 5594 + }, + { + "epoch": 0.03327504995717956, + "grad_norm": 2.239384889602661, + "learning_rate": 4.986357418901477e-05, + "loss": 6.0191, + "step": 5595 + }, + { + "epoch": 0.03328099724046056, + "grad_norm": 2.3877806663513184, + "learning_rate": 4.9863525453321614e-05, + "loss": 5.7429, + "step": 5596 + }, + { + "epoch": 0.033286944523741556, + "grad_norm": 2.559633731842041, + "learning_rate": 4.9863476708948846e-05, + "loss": 5.4866, + "step": 5597 + }, + { + "epoch": 0.03329289180702255, + "grad_norm": 3.7681171894073486, + "learning_rate": 4.98634279558965e-05, + "loss": 5.6139, + "step": 5598 + }, + { + "epoch": 0.033298839090303546, + "grad_norm": 3.999264717102051, + "learning_rate": 4.9863379194164594e-05, + "loss": 5.6031, + "step": 5599 + }, + { + "epoch": 0.03330478637358455, + "grad_norm": 3.1031601428985596, + "learning_rate": 4.986333042375313e-05, + "loss": 5.5397, + "step": 5600 + }, + { + "epoch": 0.03331073365686554, + "grad_norm": 3.104998826980591, + "learning_rate": 4.986328164466214e-05, + "loss": 5.4274, + "step": 5601 + }, + { + "epoch": 0.03331668094014654, + "grad_norm": 2.9426207542419434, + "learning_rate": 4.986323285689163e-05, + "loss": 5.5859, + "step": 5602 + }, + { + "epoch": 0.03332262822342754, + "grad_norm": 2.6912827491760254, + "learning_rate": 4.986318406044163e-05, + "loss": 5.7375, + "step": 5603 + }, + { + "epoch": 0.033328575506708535, + "grad_norm": 4.394237041473389, + "learning_rate": 4.9863135255312145e-05, + "loss": 5.8246, + "step": 5604 + }, + { + "epoch": 0.03333452278998953, + "grad_norm": 2.812197685241699, + "learning_rate": 4.986308644150319e-05, + "loss": 5.6263, + "step": 5605 + }, + { + "epoch": 0.03334047007327053, + "grad_norm": 3.1969878673553467, + "learning_rate": 4.98630376190148e-05, + "loss": 5.4174, + "step": 5606 + }, + { + "epoch": 0.03334641735655153, + "grad_norm": 2.6018595695495605, + "learning_rate": 4.9862988787846975e-05, + "loss": 5.3917, + "step": 5607 + }, + { + "epoch": 0.03335236463983252, + "grad_norm": 2.5274007320404053, + "learning_rate": 4.986293994799974e-05, + "loss": 5.4252, + "step": 5608 + }, + { + "epoch": 0.033358311923113525, + "grad_norm": 2.57043194770813, + "learning_rate": 4.9862891099473105e-05, + "loss": 5.5321, + "step": 5609 + }, + { + "epoch": 0.03336425920639452, + "grad_norm": 3.4353785514831543, + "learning_rate": 4.986284224226709e-05, + "loss": 5.6599, + "step": 5610 + }, + { + "epoch": 0.033370206489675515, + "grad_norm": 3.308945894241333, + "learning_rate": 4.986279337638172e-05, + "loss": 5.8668, + "step": 5611 + }, + { + "epoch": 0.03337615377295652, + "grad_norm": 2.789703607559204, + "learning_rate": 4.9862744501817006e-05, + "loss": 5.8352, + "step": 5612 + }, + { + "epoch": 0.03338210105623751, + "grad_norm": 1.9887118339538574, + "learning_rate": 4.986269561857296e-05, + "loss": 5.7527, + "step": 5613 + }, + { + "epoch": 0.03338804833951851, + "grad_norm": 2.5447990894317627, + "learning_rate": 4.986264672664961e-05, + "loss": 5.5539, + "step": 5614 + }, + { + "epoch": 0.0333939956227995, + "grad_norm": 2.2903668880462646, + "learning_rate": 4.9862597826046965e-05, + "loss": 5.4555, + "step": 5615 + }, + { + "epoch": 0.033399942906080504, + "grad_norm": 3.1669414043426514, + "learning_rate": 4.986254891676504e-05, + "loss": 5.6852, + "step": 5616 + }, + { + "epoch": 0.0334058901893615, + "grad_norm": 3.7491395473480225, + "learning_rate": 4.986249999880386e-05, + "loss": 5.682, + "step": 5617 + }, + { + "epoch": 0.033411837472642494, + "grad_norm": 3.0548582077026367, + "learning_rate": 4.986245107216343e-05, + "loss": 5.7844, + "step": 5618 + }, + { + "epoch": 0.033417784755923496, + "grad_norm": 2.628957509994507, + "learning_rate": 4.986240213684378e-05, + "loss": 5.5646, + "step": 5619 + }, + { + "epoch": 0.03342373203920449, + "grad_norm": 2.050936460494995, + "learning_rate": 4.986235319284492e-05, + "loss": 5.7187, + "step": 5620 + }, + { + "epoch": 0.03342967932248549, + "grad_norm": 2.2839999198913574, + "learning_rate": 4.986230424016688e-05, + "loss": 5.6613, + "step": 5621 + }, + { + "epoch": 0.03343562660576649, + "grad_norm": 2.177778959274292, + "learning_rate": 4.986225527880966e-05, + "loss": 5.7205, + "step": 5622 + }, + { + "epoch": 0.033441573889047484, + "grad_norm": 2.1690266132354736, + "learning_rate": 4.9862206308773286e-05, + "loss": 5.4344, + "step": 5623 + }, + { + "epoch": 0.03344752117232848, + "grad_norm": 2.0134127140045166, + "learning_rate": 4.9862157330057766e-05, + "loss": 5.7872, + "step": 5624 + }, + { + "epoch": 0.03345346845560948, + "grad_norm": 2.0246710777282715, + "learning_rate": 4.986210834266313e-05, + "loss": 5.3291, + "step": 5625 + }, + { + "epoch": 0.033459415738890476, + "grad_norm": 2.020939350128174, + "learning_rate": 4.986205934658939e-05, + "loss": 5.3966, + "step": 5626 + }, + { + "epoch": 0.03346536302217147, + "grad_norm": 2.3261308670043945, + "learning_rate": 4.986201034183655e-05, + "loss": 5.4667, + "step": 5627 + }, + { + "epoch": 0.033471310305452466, + "grad_norm": 2.135641574859619, + "learning_rate": 4.9861961328404646e-05, + "loss": 5.4925, + "step": 5628 + }, + { + "epoch": 0.03347725758873347, + "grad_norm": 2.3122894763946533, + "learning_rate": 4.986191230629369e-05, + "loss": 5.6665, + "step": 5629 + }, + { + "epoch": 0.03348320487201446, + "grad_norm": 2.4461214542388916, + "learning_rate": 4.98618632755037e-05, + "loss": 5.8442, + "step": 5630 + }, + { + "epoch": 0.03348915215529546, + "grad_norm": 2.189009189605713, + "learning_rate": 4.9861814236034685e-05, + "loss": 5.5793, + "step": 5631 + }, + { + "epoch": 0.03349509943857646, + "grad_norm": 2.1961586475372314, + "learning_rate": 4.986176518788667e-05, + "loss": 5.5364, + "step": 5632 + }, + { + "epoch": 0.033501046721857455, + "grad_norm": 2.120177745819092, + "learning_rate": 4.986171613105967e-05, + "loss": 5.4042, + "step": 5633 + }, + { + "epoch": 0.03350699400513845, + "grad_norm": 1.9021252393722534, + "learning_rate": 4.9861667065553696e-05, + "loss": 5.2665, + "step": 5634 + }, + { + "epoch": 0.03351294128841945, + "grad_norm": 1.8944766521453857, + "learning_rate": 4.986161799136878e-05, + "loss": 5.3853, + "step": 5635 + }, + { + "epoch": 0.03351888857170045, + "grad_norm": 2.059847354888916, + "learning_rate": 4.9861568908504916e-05, + "loss": 5.3046, + "step": 5636 + }, + { + "epoch": 0.03352483585498144, + "grad_norm": 2.1350111961364746, + "learning_rate": 4.9861519816962155e-05, + "loss": 5.3684, + "step": 5637 + }, + { + "epoch": 0.033530783138262445, + "grad_norm": 2.0733792781829834, + "learning_rate": 4.986147071674048e-05, + "loss": 5.4581, + "step": 5638 + }, + { + "epoch": 0.03353673042154344, + "grad_norm": 2.0736827850341797, + "learning_rate": 4.986142160783993e-05, + "loss": 5.7019, + "step": 5639 + }, + { + "epoch": 0.033542677704824435, + "grad_norm": 2.1903107166290283, + "learning_rate": 4.986137249026051e-05, + "loss": 5.4353, + "step": 5640 + }, + { + "epoch": 0.03354862498810544, + "grad_norm": 2.2678940296173096, + "learning_rate": 4.9861323364002244e-05, + "loss": 5.4951, + "step": 5641 + }, + { + "epoch": 0.03355457227138643, + "grad_norm": 3.590702772140503, + "learning_rate": 4.9861274229065145e-05, + "loss": 6.1522, + "step": 5642 + }, + { + "epoch": 0.03356051955466743, + "grad_norm": 2.0955893993377686, + "learning_rate": 4.9861225085449224e-05, + "loss": 5.3544, + "step": 5643 + }, + { + "epoch": 0.03356646683794842, + "grad_norm": 1.9370301961898804, + "learning_rate": 4.986117593315452e-05, + "loss": 5.4732, + "step": 5644 + }, + { + "epoch": 0.033572414121229424, + "grad_norm": 2.141752243041992, + "learning_rate": 4.986112677218103e-05, + "loss": 5.5768, + "step": 5645 + }, + { + "epoch": 0.03357836140451042, + "grad_norm": 1.9236360788345337, + "learning_rate": 4.986107760252878e-05, + "loss": 5.7641, + "step": 5646 + }, + { + "epoch": 0.033584308687791414, + "grad_norm": 1.8353725671768188, + "learning_rate": 4.9861028424197785e-05, + "loss": 5.8011, + "step": 5647 + }, + { + "epoch": 0.033590255971072416, + "grad_norm": 2.0918078422546387, + "learning_rate": 4.9860979237188055e-05, + "loss": 5.6862, + "step": 5648 + }, + { + "epoch": 0.03359620325435341, + "grad_norm": 2.2244462966918945, + "learning_rate": 4.986093004149962e-05, + "loss": 5.472, + "step": 5649 + }, + { + "epoch": 0.033602150537634407, + "grad_norm": 2.1517422199249268, + "learning_rate": 4.9860880837132495e-05, + "loss": 5.3655, + "step": 5650 + }, + { + "epoch": 0.03360809782091541, + "grad_norm": 2.241863489151001, + "learning_rate": 4.986083162408669e-05, + "loss": 5.5385, + "step": 5651 + }, + { + "epoch": 0.033614045104196404, + "grad_norm": 2.458171844482422, + "learning_rate": 4.986078240236222e-05, + "loss": 5.5531, + "step": 5652 + }, + { + "epoch": 0.0336199923874774, + "grad_norm": 2.2601864337921143, + "learning_rate": 4.986073317195911e-05, + "loss": 5.9313, + "step": 5653 + }, + { + "epoch": 0.0336259396707584, + "grad_norm": 2.243647575378418, + "learning_rate": 4.986068393287738e-05, + "loss": 5.4064, + "step": 5654 + }, + { + "epoch": 0.033631886954039396, + "grad_norm": 2.283515453338623, + "learning_rate": 4.986063468511704e-05, + "loss": 5.295, + "step": 5655 + }, + { + "epoch": 0.03363783423732039, + "grad_norm": 2.701770305633545, + "learning_rate": 4.986058542867811e-05, + "loss": 5.8548, + "step": 5656 + }, + { + "epoch": 0.033643781520601386, + "grad_norm": 2.8186864852905273, + "learning_rate": 4.98605361635606e-05, + "loss": 5.378, + "step": 5657 + }, + { + "epoch": 0.03364972880388239, + "grad_norm": 2.6508500576019287, + "learning_rate": 4.9860486889764536e-05, + "loss": 5.469, + "step": 5658 + }, + { + "epoch": 0.03365567608716338, + "grad_norm": 2.3984878063201904, + "learning_rate": 4.986043760728994e-05, + "loss": 5.3978, + "step": 5659 + }, + { + "epoch": 0.03366162337044438, + "grad_norm": 3.64663028717041, + "learning_rate": 4.9860388316136814e-05, + "loss": 5.502, + "step": 5660 + }, + { + "epoch": 0.03366757065372538, + "grad_norm": 3.1112046241760254, + "learning_rate": 4.986033901630519e-05, + "loss": 5.7347, + "step": 5661 + }, + { + "epoch": 0.033673517937006375, + "grad_norm": 2.619877338409424, + "learning_rate": 4.9860289707795074e-05, + "loss": 6.2099, + "step": 5662 + }, + { + "epoch": 0.03367946522028737, + "grad_norm": 2.0318470001220703, + "learning_rate": 4.986024039060648e-05, + "loss": 6.246, + "step": 5663 + }, + { + "epoch": 0.03368541250356837, + "grad_norm": 2.1484673023223877, + "learning_rate": 4.986019106473945e-05, + "loss": 6.1689, + "step": 5664 + }, + { + "epoch": 0.03369135978684937, + "grad_norm": 2.6159844398498535, + "learning_rate": 4.9860141730193974e-05, + "loss": 5.8217, + "step": 5665 + }, + { + "epoch": 0.03369730707013036, + "grad_norm": 2.5019965171813965, + "learning_rate": 4.9860092386970084e-05, + "loss": 6.1138, + "step": 5666 + }, + { + "epoch": 0.033703254353411365, + "grad_norm": 2.962315797805786, + "learning_rate": 4.9860043035067785e-05, + "loss": 5.7057, + "step": 5667 + }, + { + "epoch": 0.03370920163669236, + "grad_norm": 2.455721139907837, + "learning_rate": 4.9859993674487106e-05, + "loss": 5.6203, + "step": 5668 + }, + { + "epoch": 0.033715148919973355, + "grad_norm": 2.432368278503418, + "learning_rate": 4.9859944305228066e-05, + "loss": 6.2337, + "step": 5669 + }, + { + "epoch": 0.03372109620325436, + "grad_norm": 2.3222782611846924, + "learning_rate": 4.985989492729067e-05, + "loss": 6.2845, + "step": 5670 + }, + { + "epoch": 0.03372704348653535, + "grad_norm": 2.107440948486328, + "learning_rate": 4.985984554067494e-05, + "loss": 6.2404, + "step": 5671 + }, + { + "epoch": 0.03373299076981635, + "grad_norm": 1.9450268745422363, + "learning_rate": 4.98597961453809e-05, + "loss": 6.1679, + "step": 5672 + }, + { + "epoch": 0.03373893805309734, + "grad_norm": 1.7591795921325684, + "learning_rate": 4.9859746741408554e-05, + "loss": 6.3425, + "step": 5673 + }, + { + "epoch": 0.033744885336378344, + "grad_norm": 2.009420871734619, + "learning_rate": 4.985969732875794e-05, + "loss": 6.3607, + "step": 5674 + }, + { + "epoch": 0.03375083261965934, + "grad_norm": 2.097215175628662, + "learning_rate": 4.9859647907429054e-05, + "loss": 6.2009, + "step": 5675 + }, + { + "epoch": 0.033756779902940334, + "grad_norm": 1.7670379877090454, + "learning_rate": 4.985959847742192e-05, + "loss": 5.935, + "step": 5676 + }, + { + "epoch": 0.033762727186221336, + "grad_norm": 2.052022695541382, + "learning_rate": 4.985954903873656e-05, + "loss": 5.4054, + "step": 5677 + }, + { + "epoch": 0.03376867446950233, + "grad_norm": 1.9225167036056519, + "learning_rate": 4.985949959137298e-05, + "loss": 5.6905, + "step": 5678 + }, + { + "epoch": 0.033774621752783326, + "grad_norm": 2.4080653190612793, + "learning_rate": 4.985945013533122e-05, + "loss": 6.5566, + "step": 5679 + }, + { + "epoch": 0.03378056903606433, + "grad_norm": 2.8340251445770264, + "learning_rate": 4.985940067061128e-05, + "loss": 6.3556, + "step": 5680 + }, + { + "epoch": 0.033786516319345324, + "grad_norm": 2.2872672080993652, + "learning_rate": 4.985935119721317e-05, + "loss": 6.1806, + "step": 5681 + }, + { + "epoch": 0.03379246360262632, + "grad_norm": 3.309203863143921, + "learning_rate": 4.985930171513692e-05, + "loss": 6.1766, + "step": 5682 + }, + { + "epoch": 0.03379841088590732, + "grad_norm": 2.936709403991699, + "learning_rate": 4.985925222438255e-05, + "loss": 5.907, + "step": 5683 + }, + { + "epoch": 0.033804358169188316, + "grad_norm": 2.3226964473724365, + "learning_rate": 4.985920272495007e-05, + "loss": 5.5734, + "step": 5684 + }, + { + "epoch": 0.03381030545246931, + "grad_norm": 2.3053154945373535, + "learning_rate": 4.98591532168395e-05, + "loss": 6.5688, + "step": 5685 + }, + { + "epoch": 0.033816252735750306, + "grad_norm": 2.2494077682495117, + "learning_rate": 4.985910370005086e-05, + "loss": 6.3539, + "step": 5686 + }, + { + "epoch": 0.03382220001903131, + "grad_norm": 1.9559924602508545, + "learning_rate": 4.9859054174584155e-05, + "loss": 6.2015, + "step": 5687 + }, + { + "epoch": 0.0338281473023123, + "grad_norm": 2.7915425300598145, + "learning_rate": 4.985900464043942e-05, + "loss": 5.7426, + "step": 5688 + }, + { + "epoch": 0.0338340945855933, + "grad_norm": 2.448496103286743, + "learning_rate": 4.985895509761665e-05, + "loss": 6.2697, + "step": 5689 + }, + { + "epoch": 0.0338400418688743, + "grad_norm": 1.7736696004867554, + "learning_rate": 4.9858905546115885e-05, + "loss": 6.5513, + "step": 5690 + }, + { + "epoch": 0.033845989152155295, + "grad_norm": 1.668285608291626, + "learning_rate": 4.9858855985937136e-05, + "loss": 6.0179, + "step": 5691 + }, + { + "epoch": 0.03385193643543629, + "grad_norm": 2.157799243927002, + "learning_rate": 4.985880641708042e-05, + "loss": 6.1863, + "step": 5692 + }, + { + "epoch": 0.03385788371871729, + "grad_norm": 2.2437758445739746, + "learning_rate": 4.985875683954574e-05, + "loss": 6.128, + "step": 5693 + }, + { + "epoch": 0.03386383100199829, + "grad_norm": 2.8323628902435303, + "learning_rate": 4.9858707253333124e-05, + "loss": 6.2746, + "step": 5694 + }, + { + "epoch": 0.03386977828527928, + "grad_norm": 2.270587205886841, + "learning_rate": 4.98586576584426e-05, + "loss": 6.1002, + "step": 5695 + }, + { + "epoch": 0.033875725568560285, + "grad_norm": 1.9165533781051636, + "learning_rate": 4.985860805487417e-05, + "loss": 5.7016, + "step": 5696 + }, + { + "epoch": 0.03388167285184128, + "grad_norm": 2.230407953262329, + "learning_rate": 4.985855844262786e-05, + "loss": 5.9649, + "step": 5697 + }, + { + "epoch": 0.033887620135122275, + "grad_norm": 2.5094211101531982, + "learning_rate": 4.985850882170368e-05, + "loss": 6.0184, + "step": 5698 + }, + { + "epoch": 0.03389356741840328, + "grad_norm": 2.6195943355560303, + "learning_rate": 4.9858459192101656e-05, + "loss": 5.8501, + "step": 5699 + }, + { + "epoch": 0.03389951470168427, + "grad_norm": 2.747486114501953, + "learning_rate": 4.9858409553821794e-05, + "loss": 5.7066, + "step": 5700 + }, + { + "epoch": 0.03390546198496527, + "grad_norm": 2.154109001159668, + "learning_rate": 4.985835990686413e-05, + "loss": 6.1072, + "step": 5701 + }, + { + "epoch": 0.03391140926824626, + "grad_norm": 2.4329216480255127, + "learning_rate": 4.9858310251228655e-05, + "loss": 5.9552, + "step": 5702 + }, + { + "epoch": 0.033917356551527264, + "grad_norm": 2.4760935306549072, + "learning_rate": 4.9858260586915405e-05, + "loss": 5.9023, + "step": 5703 + }, + { + "epoch": 0.03392330383480826, + "grad_norm": 2.400474786758423, + "learning_rate": 4.9858210913924397e-05, + "loss": 6.1688, + "step": 5704 + }, + { + "epoch": 0.033929251118089254, + "grad_norm": 2.402930498123169, + "learning_rate": 4.9858161232255644e-05, + "loss": 6.0776, + "step": 5705 + }, + { + "epoch": 0.033935198401370256, + "grad_norm": 2.0408313274383545, + "learning_rate": 4.985811154190916e-05, + "loss": 6.1841, + "step": 5706 + }, + { + "epoch": 0.03394114568465125, + "grad_norm": 1.889190912246704, + "learning_rate": 4.9858061842884976e-05, + "loss": 5.9689, + "step": 5707 + }, + { + "epoch": 0.033947092967932246, + "grad_norm": 2.2231624126434326, + "learning_rate": 4.9858012135183086e-05, + "loss": 6.0009, + "step": 5708 + }, + { + "epoch": 0.03395304025121325, + "grad_norm": 2.0229554176330566, + "learning_rate": 4.985796241880353e-05, + "loss": 6.3237, + "step": 5709 + }, + { + "epoch": 0.033958987534494244, + "grad_norm": 2.0570971965789795, + "learning_rate": 4.985791269374631e-05, + "loss": 6.3104, + "step": 5710 + }, + { + "epoch": 0.03396493481777524, + "grad_norm": 2.584663152694702, + "learning_rate": 4.9857862960011454e-05, + "loss": 5.8493, + "step": 5711 + }, + { + "epoch": 0.03397088210105624, + "grad_norm": 1.7870328426361084, + "learning_rate": 4.985781321759897e-05, + "loss": 6.2321, + "step": 5712 + }, + { + "epoch": 0.033976829384337236, + "grad_norm": 2.201756000518799, + "learning_rate": 4.9857763466508886e-05, + "loss": 6.1936, + "step": 5713 + }, + { + "epoch": 0.03398277666761823, + "grad_norm": 2.4489476680755615, + "learning_rate": 4.9857713706741216e-05, + "loss": 6.11, + "step": 5714 + }, + { + "epoch": 0.033988723950899226, + "grad_norm": 2.007643461227417, + "learning_rate": 4.9857663938295964e-05, + "loss": 6.288, + "step": 5715 + }, + { + "epoch": 0.03399467123418023, + "grad_norm": 1.8299764394760132, + "learning_rate": 4.9857614161173165e-05, + "loss": 6.0719, + "step": 5716 + }, + { + "epoch": 0.03400061851746122, + "grad_norm": 1.7619884014129639, + "learning_rate": 4.985756437537283e-05, + "loss": 6.1418, + "step": 5717 + }, + { + "epoch": 0.03400656580074222, + "grad_norm": 1.9445360898971558, + "learning_rate": 4.985751458089498e-05, + "loss": 6.1223, + "step": 5718 + }, + { + "epoch": 0.03401251308402322, + "grad_norm": 2.2320010662078857, + "learning_rate": 4.985746477773962e-05, + "loss": 5.5239, + "step": 5719 + }, + { + "epoch": 0.034018460367304215, + "grad_norm": 2.631765365600586, + "learning_rate": 4.985741496590678e-05, + "loss": 5.6348, + "step": 5720 + }, + { + "epoch": 0.03402440765058521, + "grad_norm": 2.4715576171875, + "learning_rate": 4.985736514539647e-05, + "loss": 5.9608, + "step": 5721 + }, + { + "epoch": 0.03403035493386621, + "grad_norm": 2.633188009262085, + "learning_rate": 4.985731531620871e-05, + "loss": 5.602, + "step": 5722 + }, + { + "epoch": 0.03403630221714721, + "grad_norm": 2.4303035736083984, + "learning_rate": 4.9857265478343526e-05, + "loss": 5.495, + "step": 5723 + }, + { + "epoch": 0.0340422495004282, + "grad_norm": 2.463447332382202, + "learning_rate": 4.985721563180092e-05, + "loss": 5.4633, + "step": 5724 + }, + { + "epoch": 0.034048196783709204, + "grad_norm": 2.349965810775757, + "learning_rate": 4.985716577658092e-05, + "loss": 6.0067, + "step": 5725 + }, + { + "epoch": 0.0340541440669902, + "grad_norm": 1.8741793632507324, + "learning_rate": 4.985711591268354e-05, + "loss": 5.8658, + "step": 5726 + }, + { + "epoch": 0.034060091350271195, + "grad_norm": 1.957612156867981, + "learning_rate": 4.98570660401088e-05, + "loss": 6.2016, + "step": 5727 + }, + { + "epoch": 0.0340660386335522, + "grad_norm": 2.4883556365966797, + "learning_rate": 4.985701615885671e-05, + "loss": 6.3056, + "step": 5728 + }, + { + "epoch": 0.03407198591683319, + "grad_norm": 2.6959800720214844, + "learning_rate": 4.98569662689273e-05, + "loss": 5.7267, + "step": 5729 + }, + { + "epoch": 0.03407793320011419, + "grad_norm": 2.579802989959717, + "learning_rate": 4.985691637032057e-05, + "loss": 5.2467, + "step": 5730 + }, + { + "epoch": 0.03408388048339518, + "grad_norm": 2.136262893676758, + "learning_rate": 4.985686646303656e-05, + "loss": 5.7071, + "step": 5731 + }, + { + "epoch": 0.034089827766676184, + "grad_norm": 2.1442244052886963, + "learning_rate": 4.985681654707526e-05, + "loss": 6.3961, + "step": 5732 + }, + { + "epoch": 0.03409577504995718, + "grad_norm": 2.164340019226074, + "learning_rate": 4.9856766622436714e-05, + "loss": 6.2455, + "step": 5733 + }, + { + "epoch": 0.034101722333238174, + "grad_norm": 2.199791193008423, + "learning_rate": 4.985671668912092e-05, + "loss": 5.8804, + "step": 5734 + }, + { + "epoch": 0.034107669616519176, + "grad_norm": 2.0359933376312256, + "learning_rate": 4.9856666747127905e-05, + "loss": 6.359, + "step": 5735 + }, + { + "epoch": 0.03411361689980017, + "grad_norm": 2.17069935798645, + "learning_rate": 4.985661679645769e-05, + "loss": 6.6736, + "step": 5736 + }, + { + "epoch": 0.034119564183081166, + "grad_norm": 1.9114634990692139, + "learning_rate": 4.9856566837110275e-05, + "loss": 5.9629, + "step": 5737 + }, + { + "epoch": 0.03412551146636217, + "grad_norm": 2.2872474193573, + "learning_rate": 4.9856516869085704e-05, + "loss": 5.5856, + "step": 5738 + }, + { + "epoch": 0.03413145874964316, + "grad_norm": 2.0800466537475586, + "learning_rate": 4.9856466892383965e-05, + "loss": 5.7732, + "step": 5739 + }, + { + "epoch": 0.03413740603292416, + "grad_norm": 2.37117338180542, + "learning_rate": 4.98564169070051e-05, + "loss": 5.667, + "step": 5740 + }, + { + "epoch": 0.03414335331620516, + "grad_norm": 2.0559768676757812, + "learning_rate": 4.985636691294911e-05, + "loss": 5.4874, + "step": 5741 + }, + { + "epoch": 0.034149300599486156, + "grad_norm": 2.0097250938415527, + "learning_rate": 4.9856316910216024e-05, + "loss": 5.5469, + "step": 5742 + }, + { + "epoch": 0.03415524788276715, + "grad_norm": 2.430954933166504, + "learning_rate": 4.985626689880586e-05, + "loss": 5.7635, + "step": 5743 + }, + { + "epoch": 0.034161195166048146, + "grad_norm": 2.1000874042510986, + "learning_rate": 4.985621687871862e-05, + "loss": 5.7102, + "step": 5744 + }, + { + "epoch": 0.03416714244932915, + "grad_norm": 2.2048611640930176, + "learning_rate": 4.9856166849954336e-05, + "loss": 5.8156, + "step": 5745 + }, + { + "epoch": 0.03417308973261014, + "grad_norm": 2.145538330078125, + "learning_rate": 4.985611681251302e-05, + "loss": 5.9101, + "step": 5746 + }, + { + "epoch": 0.03417903701589114, + "grad_norm": 2.86169695854187, + "learning_rate": 4.9856066766394685e-05, + "loss": 5.7358, + "step": 5747 + }, + { + "epoch": 0.03418498429917214, + "grad_norm": 2.0648229122161865, + "learning_rate": 4.985601671159936e-05, + "loss": 6.0529, + "step": 5748 + }, + { + "epoch": 0.034190931582453135, + "grad_norm": 2.191251039505005, + "learning_rate": 4.985596664812706e-05, + "loss": 6.1999, + "step": 5749 + }, + { + "epoch": 0.03419687886573413, + "grad_norm": 2.556640148162842, + "learning_rate": 4.985591657597779e-05, + "loss": 6.0671, + "step": 5750 + }, + { + "epoch": 0.03420282614901513, + "grad_norm": 2.1796281337738037, + "learning_rate": 4.985586649515158e-05, + "loss": 6.1537, + "step": 5751 + }, + { + "epoch": 0.03420877343229613, + "grad_norm": 2.1884169578552246, + "learning_rate": 4.985581640564845e-05, + "loss": 5.7667, + "step": 5752 + }, + { + "epoch": 0.03421472071557712, + "grad_norm": 2.3836331367492676, + "learning_rate": 4.9855766307468404e-05, + "loss": 5.6608, + "step": 5753 + }, + { + "epoch": 0.034220667998858124, + "grad_norm": 2.0464322566986084, + "learning_rate": 4.985571620061147e-05, + "loss": 5.5317, + "step": 5754 + }, + { + "epoch": 0.03422661528213912, + "grad_norm": 2.3275644779205322, + "learning_rate": 4.9855666085077654e-05, + "loss": 5.8611, + "step": 5755 + }, + { + "epoch": 0.034232562565420115, + "grad_norm": 2.7268338203430176, + "learning_rate": 4.9855615960867e-05, + "loss": 5.6323, + "step": 5756 + }, + { + "epoch": 0.03423850984870112, + "grad_norm": 2.578986406326294, + "learning_rate": 4.985556582797949e-05, + "loss": 5.6108, + "step": 5757 + }, + { + "epoch": 0.03424445713198211, + "grad_norm": 2.4127955436706543, + "learning_rate": 4.985551568641516e-05, + "loss": 5.7054, + "step": 5758 + }, + { + "epoch": 0.03425040441526311, + "grad_norm": 2.1954357624053955, + "learning_rate": 4.985546553617404e-05, + "loss": 6.194, + "step": 5759 + }, + { + "epoch": 0.0342563516985441, + "grad_norm": 2.43851900100708, + "learning_rate": 4.985541537725612e-05, + "loss": 5.9067, + "step": 5760 + }, + { + "epoch": 0.034262298981825104, + "grad_norm": 2.0910801887512207, + "learning_rate": 4.9855365209661445e-05, + "loss": 6.1017, + "step": 5761 + }, + { + "epoch": 0.0342682462651061, + "grad_norm": 1.9936187267303467, + "learning_rate": 4.985531503339e-05, + "loss": 6.1239, + "step": 5762 + }, + { + "epoch": 0.034274193548387094, + "grad_norm": 2.0663299560546875, + "learning_rate": 4.985526484844183e-05, + "loss": 6.0514, + "step": 5763 + }, + { + "epoch": 0.034280140831668096, + "grad_norm": 2.4357266426086426, + "learning_rate": 4.985521465481695e-05, + "loss": 5.3695, + "step": 5764 + }, + { + "epoch": 0.03428608811494909, + "grad_norm": 2.12214994430542, + "learning_rate": 4.985516445251537e-05, + "loss": 5.5531, + "step": 5765 + }, + { + "epoch": 0.034292035398230086, + "grad_norm": 2.731661319732666, + "learning_rate": 4.9855114241537105e-05, + "loss": 6.2403, + "step": 5766 + }, + { + "epoch": 0.03429798268151109, + "grad_norm": 2.0668931007385254, + "learning_rate": 4.985506402188217e-05, + "loss": 6.0873, + "step": 5767 + }, + { + "epoch": 0.03430392996479208, + "grad_norm": 2.3165833950042725, + "learning_rate": 4.98550137935506e-05, + "loss": 5.9365, + "step": 5768 + }, + { + "epoch": 0.03430987724807308, + "grad_norm": 1.8637720346450806, + "learning_rate": 4.98549635565424e-05, + "loss": 6.0837, + "step": 5769 + }, + { + "epoch": 0.03431582453135408, + "grad_norm": 2.1689205169677734, + "learning_rate": 4.985491331085758e-05, + "loss": 5.703, + "step": 5770 + }, + { + "epoch": 0.034321771814635076, + "grad_norm": 2.245283365249634, + "learning_rate": 4.985486305649618e-05, + "loss": 6.0134, + "step": 5771 + }, + { + "epoch": 0.03432771909791607, + "grad_norm": 2.2685303688049316, + "learning_rate": 4.98548127934582e-05, + "loss": 5.279, + "step": 5772 + }, + { + "epoch": 0.034333666381197066, + "grad_norm": 2.376253128051758, + "learning_rate": 4.985476252174365e-05, + "loss": 5.5812, + "step": 5773 + }, + { + "epoch": 0.03433961366447807, + "grad_norm": 2.2636559009552, + "learning_rate": 4.985471224135257e-05, + "loss": 5.6906, + "step": 5774 + }, + { + "epoch": 0.03434556094775906, + "grad_norm": 2.22103214263916, + "learning_rate": 4.9854661952284965e-05, + "loss": 6.2066, + "step": 5775 + }, + { + "epoch": 0.03435150823104006, + "grad_norm": 2.308610439300537, + "learning_rate": 4.985461165454085e-05, + "loss": 6.1582, + "step": 5776 + }, + { + "epoch": 0.03435745551432106, + "grad_norm": 1.9191935062408447, + "learning_rate": 4.985456134812026e-05, + "loss": 5.4587, + "step": 5777 + }, + { + "epoch": 0.034363402797602055, + "grad_norm": 2.3127100467681885, + "learning_rate": 4.9854511033023184e-05, + "loss": 5.3375, + "step": 5778 + }, + { + "epoch": 0.03436935008088305, + "grad_norm": 2.4817371368408203, + "learning_rate": 4.985446070924966e-05, + "loss": 5.4961, + "step": 5779 + }, + { + "epoch": 0.03437529736416405, + "grad_norm": 2.0995922088623047, + "learning_rate": 4.9854410376799695e-05, + "loss": 5.7676, + "step": 5780 + }, + { + "epoch": 0.03438124464744505, + "grad_norm": 2.261229991912842, + "learning_rate": 4.985436003567332e-05, + "loss": 5.4446, + "step": 5781 + }, + { + "epoch": 0.03438719193072604, + "grad_norm": 2.275536060333252, + "learning_rate": 4.985430968587055e-05, + "loss": 5.4297, + "step": 5782 + }, + { + "epoch": 0.034393139214007044, + "grad_norm": 2.3733773231506348, + "learning_rate": 4.985425932739138e-05, + "loss": 5.7658, + "step": 5783 + }, + { + "epoch": 0.03439908649728804, + "grad_norm": 2.201716184616089, + "learning_rate": 4.985420896023586e-05, + "loss": 5.5502, + "step": 5784 + }, + { + "epoch": 0.034405033780569035, + "grad_norm": 2.1012730598449707, + "learning_rate": 4.9854158584403985e-05, + "loss": 5.7199, + "step": 5785 + }, + { + "epoch": 0.03441098106385004, + "grad_norm": 2.065568685531616, + "learning_rate": 4.985410819989579e-05, + "loss": 6.1547, + "step": 5786 + }, + { + "epoch": 0.03441692834713103, + "grad_norm": 1.9217867851257324, + "learning_rate": 4.9854057806711275e-05, + "loss": 6.2556, + "step": 5787 + }, + { + "epoch": 0.03442287563041203, + "grad_norm": 2.028602123260498, + "learning_rate": 4.985400740485047e-05, + "loss": 5.9347, + "step": 5788 + }, + { + "epoch": 0.03442882291369302, + "grad_norm": 2.002855062484741, + "learning_rate": 4.9853956994313376e-05, + "loss": 5.3966, + "step": 5789 + }, + { + "epoch": 0.034434770196974024, + "grad_norm": 2.3740642070770264, + "learning_rate": 4.985390657510003e-05, + "loss": 5.7801, + "step": 5790 + }, + { + "epoch": 0.03444071748025502, + "grad_norm": 2.1149635314941406, + "learning_rate": 4.9853856147210444e-05, + "loss": 5.6504, + "step": 5791 + }, + { + "epoch": 0.034446664763536014, + "grad_norm": 2.3519630432128906, + "learning_rate": 4.985380571064463e-05, + "loss": 5.9172, + "step": 5792 + }, + { + "epoch": 0.034452612046817016, + "grad_norm": 2.38930082321167, + "learning_rate": 4.985375526540261e-05, + "loss": 5.6196, + "step": 5793 + }, + { + "epoch": 0.03445855933009801, + "grad_norm": 2.245596408843994, + "learning_rate": 4.98537048114844e-05, + "loss": 5.5034, + "step": 5794 + }, + { + "epoch": 0.034464506613379006, + "grad_norm": 2.272158622741699, + "learning_rate": 4.985365434889002e-05, + "loss": 5.5867, + "step": 5795 + }, + { + "epoch": 0.03447045389666001, + "grad_norm": 2.2090094089508057, + "learning_rate": 4.9853603877619485e-05, + "loss": 5.68, + "step": 5796 + }, + { + "epoch": 0.034476401179941, + "grad_norm": 2.0545220375061035, + "learning_rate": 4.985355339767281e-05, + "loss": 5.8382, + "step": 5797 + }, + { + "epoch": 0.034482348463222, + "grad_norm": 2.143134593963623, + "learning_rate": 4.985350290905003e-05, + "loss": 5.5753, + "step": 5798 + }, + { + "epoch": 0.034488295746503, + "grad_norm": 2.3938257694244385, + "learning_rate": 4.985345241175114e-05, + "loss": 5.7545, + "step": 5799 + }, + { + "epoch": 0.034494243029783996, + "grad_norm": 2.132998466491699, + "learning_rate": 4.985340190577616e-05, + "loss": 5.5477, + "step": 5800 + }, + { + "epoch": 0.03450019031306499, + "grad_norm": 3.141417980194092, + "learning_rate": 4.9853351391125126e-05, + "loss": 5.3509, + "step": 5801 + }, + { + "epoch": 0.034506137596345986, + "grad_norm": 2.4776933193206787, + "learning_rate": 4.9853300867798034e-05, + "loss": 6.1052, + "step": 5802 + }, + { + "epoch": 0.03451208487962699, + "grad_norm": 2.1782073974609375, + "learning_rate": 4.985325033579492e-05, + "loss": 5.9599, + "step": 5803 + }, + { + "epoch": 0.03451803216290798, + "grad_norm": 2.2631704807281494, + "learning_rate": 4.9853199795115794e-05, + "loss": 5.534, + "step": 5804 + }, + { + "epoch": 0.03452397944618898, + "grad_norm": 2.140612840652466, + "learning_rate": 4.985314924576066e-05, + "loss": 5.7479, + "step": 5805 + }, + { + "epoch": 0.03452992672946998, + "grad_norm": 2.726651668548584, + "learning_rate": 4.9853098687729563e-05, + "loss": 5.4639, + "step": 5806 + }, + { + "epoch": 0.034535874012750975, + "grad_norm": 1.852423071861267, + "learning_rate": 4.985304812102249e-05, + "loss": 5.4209, + "step": 5807 + }, + { + "epoch": 0.03454182129603197, + "grad_norm": 2.5236833095550537, + "learning_rate": 4.9852997545639485e-05, + "loss": 5.9653, + "step": 5808 + }, + { + "epoch": 0.03454776857931297, + "grad_norm": 2.2740652561187744, + "learning_rate": 4.985294696158056e-05, + "loss": 5.9457, + "step": 5809 + }, + { + "epoch": 0.03455371586259397, + "grad_norm": 2.931777000427246, + "learning_rate": 4.9852896368845715e-05, + "loss": 5.6709, + "step": 5810 + }, + { + "epoch": 0.03455966314587496, + "grad_norm": 2.6981759071350098, + "learning_rate": 4.9852845767434986e-05, + "loss": 5.1747, + "step": 5811 + }, + { + "epoch": 0.034565610429155964, + "grad_norm": 2.2675211429595947, + "learning_rate": 4.985279515734839e-05, + "loss": 5.2393, + "step": 5812 + }, + { + "epoch": 0.03457155771243696, + "grad_norm": 2.535473346710205, + "learning_rate": 4.985274453858594e-05, + "loss": 6.2184, + "step": 5813 + }, + { + "epoch": 0.034577504995717954, + "grad_norm": 2.8692495822906494, + "learning_rate": 4.985269391114765e-05, + "loss": 5.2557, + "step": 5814 + }, + { + "epoch": 0.034583452278998957, + "grad_norm": 2.908472776412964, + "learning_rate": 4.985264327503354e-05, + "loss": 5.1559, + "step": 5815 + }, + { + "epoch": 0.03458939956227995, + "grad_norm": 2.3630192279815674, + "learning_rate": 4.985259263024363e-05, + "loss": 5.3159, + "step": 5816 + }, + { + "epoch": 0.03459534684556095, + "grad_norm": 2.1287102699279785, + "learning_rate": 4.9852541976777933e-05, + "loss": 5.2069, + "step": 5817 + }, + { + "epoch": 0.03460129412884194, + "grad_norm": 2.751567840576172, + "learning_rate": 4.985249131463647e-05, + "loss": 5.6561, + "step": 5818 + }, + { + "epoch": 0.034607241412122944, + "grad_norm": 2.505608081817627, + "learning_rate": 4.985244064381927e-05, + "loss": 5.9708, + "step": 5819 + }, + { + "epoch": 0.03461318869540394, + "grad_norm": 2.351593255996704, + "learning_rate": 4.9852389964326337e-05, + "loss": 5.9046, + "step": 5820 + }, + { + "epoch": 0.034619135978684934, + "grad_norm": 2.3037939071655273, + "learning_rate": 4.985233927615769e-05, + "loss": 6.0069, + "step": 5821 + }, + { + "epoch": 0.034625083261965936, + "grad_norm": 2.2482705116271973, + "learning_rate": 4.985228857931334e-05, + "loss": 5.9492, + "step": 5822 + }, + { + "epoch": 0.03463103054524693, + "grad_norm": 2.23640513420105, + "learning_rate": 4.985223787379332e-05, + "loss": 5.6631, + "step": 5823 + }, + { + "epoch": 0.034636977828527926, + "grad_norm": 2.710275411605835, + "learning_rate": 4.985218715959764e-05, + "loss": 5.5961, + "step": 5824 + }, + { + "epoch": 0.03464292511180893, + "grad_norm": 2.7220160961151123, + "learning_rate": 4.9852136436726313e-05, + "loss": 5.6922, + "step": 5825 + }, + { + "epoch": 0.03464887239508992, + "grad_norm": 2.4542758464813232, + "learning_rate": 4.985208570517937e-05, + "loss": 5.4742, + "step": 5826 + }, + { + "epoch": 0.03465481967837092, + "grad_norm": 2.7492685317993164, + "learning_rate": 4.9852034964956816e-05, + "loss": 5.4598, + "step": 5827 + }, + { + "epoch": 0.03466076696165192, + "grad_norm": 2.757937431335449, + "learning_rate": 4.9851984216058677e-05, + "loss": 6.1865, + "step": 5828 + }, + { + "epoch": 0.034666714244932915, + "grad_norm": 2.835890531539917, + "learning_rate": 4.985193345848497e-05, + "loss": 5.3368, + "step": 5829 + }, + { + "epoch": 0.03467266152821391, + "grad_norm": 2.694884777069092, + "learning_rate": 4.98518826922357e-05, + "loss": 5.3654, + "step": 5830 + }, + { + "epoch": 0.03467860881149491, + "grad_norm": 2.443784236907959, + "learning_rate": 4.98518319173109e-05, + "loss": 5.7879, + "step": 5831 + }, + { + "epoch": 0.03468455609477591, + "grad_norm": 2.0198488235473633, + "learning_rate": 4.985178113371058e-05, + "loss": 5.766, + "step": 5832 + }, + { + "epoch": 0.0346905033780569, + "grad_norm": 2.8718788623809814, + "learning_rate": 4.985173034143476e-05, + "loss": 5.5506, + "step": 5833 + }, + { + "epoch": 0.0346964506613379, + "grad_norm": 2.4353652000427246, + "learning_rate": 4.9851679540483455e-05, + "loss": 5.7139, + "step": 5834 + }, + { + "epoch": 0.0347023979446189, + "grad_norm": 1.9376598596572876, + "learning_rate": 4.985162873085669e-05, + "loss": 6.2326, + "step": 5835 + }, + { + "epoch": 0.034708345227899895, + "grad_norm": 2.2225289344787598, + "learning_rate": 4.985157791255448e-05, + "loss": 5.5997, + "step": 5836 + }, + { + "epoch": 0.03471429251118089, + "grad_norm": 2.011493682861328, + "learning_rate": 4.985152708557684e-05, + "loss": 5.6882, + "step": 5837 + }, + { + "epoch": 0.03472023979446189, + "grad_norm": 1.8679020404815674, + "learning_rate": 4.985147624992378e-05, + "loss": 5.5427, + "step": 5838 + }, + { + "epoch": 0.03472618707774289, + "grad_norm": 1.9470884799957275, + "learning_rate": 4.9851425405595334e-05, + "loss": 5.5957, + "step": 5839 + }, + { + "epoch": 0.03473213436102388, + "grad_norm": 2.0765669345855713, + "learning_rate": 4.985137455259151e-05, + "loss": 5.4416, + "step": 5840 + }, + { + "epoch": 0.034738081644304884, + "grad_norm": 2.0521979331970215, + "learning_rate": 4.985132369091233e-05, + "loss": 5.4641, + "step": 5841 + }, + { + "epoch": 0.03474402892758588, + "grad_norm": 1.7439172267913818, + "learning_rate": 4.985127282055781e-05, + "loss": 5.1998, + "step": 5842 + }, + { + "epoch": 0.034749976210866874, + "grad_norm": 1.7347313165664673, + "learning_rate": 4.985122194152797e-05, + "loss": 5.2392, + "step": 5843 + }, + { + "epoch": 0.034755923494147876, + "grad_norm": 1.7362169027328491, + "learning_rate": 4.985117105382282e-05, + "loss": 5.1769, + "step": 5844 + }, + { + "epoch": 0.03476187077742887, + "grad_norm": 1.7468090057373047, + "learning_rate": 4.985112015744239e-05, + "loss": 5.3915, + "step": 5845 + }, + { + "epoch": 0.03476781806070987, + "grad_norm": 1.8685250282287598, + "learning_rate": 4.985106925238668e-05, + "loss": 5.6119, + "step": 5846 + }, + { + "epoch": 0.03477376534399086, + "grad_norm": 1.9595715999603271, + "learning_rate": 4.985101833865572e-05, + "loss": 5.5536, + "step": 5847 + }, + { + "epoch": 0.034779712627271864, + "grad_norm": 1.8454965353012085, + "learning_rate": 4.985096741624953e-05, + "loss": 5.8127, + "step": 5848 + }, + { + "epoch": 0.03478565991055286, + "grad_norm": 1.9182006120681763, + "learning_rate": 4.985091648516813e-05, + "loss": 5.8807, + "step": 5849 + }, + { + "epoch": 0.034791607193833854, + "grad_norm": 2.042923927307129, + "learning_rate": 4.9850865545411526e-05, + "loss": 5.9013, + "step": 5850 + }, + { + "epoch": 0.034797554477114856, + "grad_norm": 2.341055393218994, + "learning_rate": 4.985081459697974e-05, + "loss": 6.214, + "step": 5851 + }, + { + "epoch": 0.03480350176039585, + "grad_norm": 2.026190996170044, + "learning_rate": 4.985076363987279e-05, + "loss": 5.3693, + "step": 5852 + }, + { + "epoch": 0.034809449043676846, + "grad_norm": 2.045264482498169, + "learning_rate": 4.98507126740907e-05, + "loss": 5.6325, + "step": 5853 + }, + { + "epoch": 0.03481539632695785, + "grad_norm": 2.2710580825805664, + "learning_rate": 4.985066169963348e-05, + "loss": 5.8355, + "step": 5854 + }, + { + "epoch": 0.03482134361023884, + "grad_norm": 1.8813494443893433, + "learning_rate": 4.985061071650115e-05, + "loss": 5.5849, + "step": 5855 + }, + { + "epoch": 0.03482729089351984, + "grad_norm": 2.2177746295928955, + "learning_rate": 4.985055972469373e-05, + "loss": 5.5518, + "step": 5856 + }, + { + "epoch": 0.03483323817680084, + "grad_norm": 1.897653341293335, + "learning_rate": 4.9850508724211234e-05, + "loss": 5.6035, + "step": 5857 + }, + { + "epoch": 0.034839185460081835, + "grad_norm": 2.349821090698242, + "learning_rate": 4.985045771505369e-05, + "loss": 5.8181, + "step": 5858 + }, + { + "epoch": 0.03484513274336283, + "grad_norm": 1.900538682937622, + "learning_rate": 4.98504066972211e-05, + "loss": 5.2751, + "step": 5859 + }, + { + "epoch": 0.03485108002664383, + "grad_norm": 2.1902174949645996, + "learning_rate": 4.985035567071349e-05, + "loss": 5.2709, + "step": 5860 + }, + { + "epoch": 0.03485702730992483, + "grad_norm": 1.7833307981491089, + "learning_rate": 4.9850304635530884e-05, + "loss": 5.2104, + "step": 5861 + }, + { + "epoch": 0.03486297459320582, + "grad_norm": 2.017603874206543, + "learning_rate": 4.985025359167329e-05, + "loss": 5.2257, + "step": 5862 + }, + { + "epoch": 0.03486892187648682, + "grad_norm": 1.9828181266784668, + "learning_rate": 4.9850202539140724e-05, + "loss": 5.2303, + "step": 5863 + }, + { + "epoch": 0.03487486915976782, + "grad_norm": 2.0273706912994385, + "learning_rate": 4.9850151477933216e-05, + "loss": 5.1743, + "step": 5864 + }, + { + "epoch": 0.034880816443048815, + "grad_norm": 1.9634721279144287, + "learning_rate": 4.985010040805077e-05, + "loss": 5.1541, + "step": 5865 + }, + { + "epoch": 0.03488676372632981, + "grad_norm": 2.2766621112823486, + "learning_rate": 4.985004932949342e-05, + "loss": 5.1372, + "step": 5866 + }, + { + "epoch": 0.03489271100961081, + "grad_norm": 2.0768795013427734, + "learning_rate": 4.984999824226117e-05, + "loss": 5.2567, + "step": 5867 + }, + { + "epoch": 0.03489865829289181, + "grad_norm": 1.8665590286254883, + "learning_rate": 4.984994714635404e-05, + "loss": 5.1356, + "step": 5868 + }, + { + "epoch": 0.0349046055761728, + "grad_norm": 2.056450843811035, + "learning_rate": 4.984989604177205e-05, + "loss": 5.1667, + "step": 5869 + }, + { + "epoch": 0.034910552859453804, + "grad_norm": 2.1191976070404053, + "learning_rate": 4.984984492851522e-05, + "loss": 5.1898, + "step": 5870 + }, + { + "epoch": 0.0349165001427348, + "grad_norm": 2.049450397491455, + "learning_rate": 4.9849793806583566e-05, + "loss": 5.1568, + "step": 5871 + }, + { + "epoch": 0.034922447426015794, + "grad_norm": 1.79837167263031, + "learning_rate": 4.984974267597711e-05, + "loss": 5.1288, + "step": 5872 + }, + { + "epoch": 0.034928394709296796, + "grad_norm": 1.959088683128357, + "learning_rate": 4.984969153669585e-05, + "loss": 5.1063, + "step": 5873 + }, + { + "epoch": 0.03493434199257779, + "grad_norm": 1.9193873405456543, + "learning_rate": 4.9849640388739836e-05, + "loss": 5.1608, + "step": 5874 + }, + { + "epoch": 0.03494028927585879, + "grad_norm": 1.6684316396713257, + "learning_rate": 4.9849589232109065e-05, + "loss": 5.0926, + "step": 5875 + }, + { + "epoch": 0.03494623655913978, + "grad_norm": 1.8383700847625732, + "learning_rate": 4.984953806680356e-05, + "loss": 5.0474, + "step": 5876 + }, + { + "epoch": 0.034952183842420784, + "grad_norm": 2.233779191970825, + "learning_rate": 4.984948689282333e-05, + "loss": 5.5046, + "step": 5877 + }, + { + "epoch": 0.03495813112570178, + "grad_norm": 2.2267282009124756, + "learning_rate": 4.9849435710168415e-05, + "loss": 5.6235, + "step": 5878 + }, + { + "epoch": 0.034964078408982774, + "grad_norm": 1.7933586835861206, + "learning_rate": 4.9849384518838804e-05, + "loss": 5.0968, + "step": 5879 + }, + { + "epoch": 0.034970025692263776, + "grad_norm": 2.0050230026245117, + "learning_rate": 4.984933331883453e-05, + "loss": 4.9789, + "step": 5880 + }, + { + "epoch": 0.03497597297554477, + "grad_norm": 1.7422970533370972, + "learning_rate": 4.9849282110155627e-05, + "loss": 5.1556, + "step": 5881 + }, + { + "epoch": 0.034981920258825766, + "grad_norm": 2.1242151260375977, + "learning_rate": 4.984923089280209e-05, + "loss": 5.7039, + "step": 5882 + }, + { + "epoch": 0.03498786754210677, + "grad_norm": 1.8656666278839111, + "learning_rate": 4.9849179666773934e-05, + "loss": 5.7185, + "step": 5883 + }, + { + "epoch": 0.03499381482538776, + "grad_norm": 1.6954991817474365, + "learning_rate": 4.984912843207119e-05, + "loss": 5.5686, + "step": 5884 + }, + { + "epoch": 0.03499976210866876, + "grad_norm": 1.7692710161209106, + "learning_rate": 4.984907718869387e-05, + "loss": 5.4058, + "step": 5885 + }, + { + "epoch": 0.03500570939194976, + "grad_norm": 1.8496350049972534, + "learning_rate": 4.9849025936642004e-05, + "loss": 5.5037, + "step": 5886 + }, + { + "epoch": 0.035011656675230755, + "grad_norm": 2.0124640464782715, + "learning_rate": 4.984897467591559e-05, + "loss": 5.6146, + "step": 5887 + }, + { + "epoch": 0.03501760395851175, + "grad_norm": 2.5522549152374268, + "learning_rate": 4.984892340651466e-05, + "loss": 5.6403, + "step": 5888 + }, + { + "epoch": 0.03502355124179275, + "grad_norm": 2.2127344608306885, + "learning_rate": 4.9848872128439224e-05, + "loss": 5.6277, + "step": 5889 + }, + { + "epoch": 0.03502949852507375, + "grad_norm": 2.578322172164917, + "learning_rate": 4.9848820841689305e-05, + "loss": 5.849, + "step": 5890 + }, + { + "epoch": 0.03503544580835474, + "grad_norm": 1.8083957433700562, + "learning_rate": 4.9848769546264915e-05, + "loss": 5.4407, + "step": 5891 + }, + { + "epoch": 0.03504139309163574, + "grad_norm": 1.885387897491455, + "learning_rate": 4.984871824216609e-05, + "loss": 5.4486, + "step": 5892 + }, + { + "epoch": 0.03504734037491674, + "grad_norm": 1.9450737237930298, + "learning_rate": 4.9848666929392817e-05, + "loss": 5.4196, + "step": 5893 + }, + { + "epoch": 0.035053287658197735, + "grad_norm": 1.9072003364562988, + "learning_rate": 4.984861560794514e-05, + "loss": 5.6293, + "step": 5894 + }, + { + "epoch": 0.03505923494147873, + "grad_norm": 2.064192056655884, + "learning_rate": 4.984856427782307e-05, + "loss": 5.7105, + "step": 5895 + }, + { + "epoch": 0.03506518222475973, + "grad_norm": 2.0101802349090576, + "learning_rate": 4.984851293902663e-05, + "loss": 5.5623, + "step": 5896 + }, + { + "epoch": 0.03507112950804073, + "grad_norm": 1.9813642501831055, + "learning_rate": 4.984846159155581e-05, + "loss": 5.653, + "step": 5897 + }, + { + "epoch": 0.03507707679132172, + "grad_norm": 1.9213227033615112, + "learning_rate": 4.9848410235410666e-05, + "loss": 5.5194, + "step": 5898 + }, + { + "epoch": 0.035083024074602724, + "grad_norm": 1.803076982498169, + "learning_rate": 4.984835887059119e-05, + "loss": 5.4101, + "step": 5899 + }, + { + "epoch": 0.03508897135788372, + "grad_norm": 1.8419232368469238, + "learning_rate": 4.9848307497097414e-05, + "loss": 5.7329, + "step": 5900 + }, + { + "epoch": 0.035094918641164714, + "grad_norm": 1.9258531332015991, + "learning_rate": 4.984825611492935e-05, + "loss": 5.559, + "step": 5901 + }, + { + "epoch": 0.035100865924445716, + "grad_norm": 1.869529366493225, + "learning_rate": 4.984820472408701e-05, + "loss": 5.5682, + "step": 5902 + }, + { + "epoch": 0.03510681320772671, + "grad_norm": 1.753365159034729, + "learning_rate": 4.984815332457042e-05, + "loss": 5.6241, + "step": 5903 + }, + { + "epoch": 0.035112760491007707, + "grad_norm": 1.6581326723098755, + "learning_rate": 4.98481019163796e-05, + "loss": 5.4752, + "step": 5904 + }, + { + "epoch": 0.0351187077742887, + "grad_norm": 1.9120882749557495, + "learning_rate": 4.9848050499514565e-05, + "loss": 5.5678, + "step": 5905 + }, + { + "epoch": 0.035124655057569704, + "grad_norm": 1.9840329885482788, + "learning_rate": 4.984799907397533e-05, + "loss": 5.5369, + "step": 5906 + }, + { + "epoch": 0.0351306023408507, + "grad_norm": 1.7970712184906006, + "learning_rate": 4.9847947639761914e-05, + "loss": 5.5857, + "step": 5907 + }, + { + "epoch": 0.035136549624131694, + "grad_norm": 1.7219270467758179, + "learning_rate": 4.984789619687435e-05, + "loss": 5.609, + "step": 5908 + }, + { + "epoch": 0.035142496907412696, + "grad_norm": 1.8945105075836182, + "learning_rate": 4.984784474531262e-05, + "loss": 5.5893, + "step": 5909 + }, + { + "epoch": 0.03514844419069369, + "grad_norm": 1.8570127487182617, + "learning_rate": 4.984779328507678e-05, + "loss": 5.4556, + "step": 5910 + }, + { + "epoch": 0.035154391473974686, + "grad_norm": 1.9291017055511475, + "learning_rate": 4.984774181616683e-05, + "loss": 5.476, + "step": 5911 + }, + { + "epoch": 0.03516033875725569, + "grad_norm": 1.9138598442077637, + "learning_rate": 4.984769033858278e-05, + "loss": 5.6329, + "step": 5912 + }, + { + "epoch": 0.03516628604053668, + "grad_norm": 1.9484977722167969, + "learning_rate": 4.9847638852324665e-05, + "loss": 5.5305, + "step": 5913 + }, + { + "epoch": 0.03517223332381768, + "grad_norm": 1.7338584661483765, + "learning_rate": 4.984758735739249e-05, + "loss": 5.4842, + "step": 5914 + }, + { + "epoch": 0.03517818060709868, + "grad_norm": 1.8625437021255493, + "learning_rate": 4.984753585378629e-05, + "loss": 5.3696, + "step": 5915 + }, + { + "epoch": 0.035184127890379675, + "grad_norm": 1.798782229423523, + "learning_rate": 4.984748434150607e-05, + "loss": 5.5803, + "step": 5916 + }, + { + "epoch": 0.03519007517366067, + "grad_norm": 2.0596888065338135, + "learning_rate": 4.9847432820551845e-05, + "loss": 5.3274, + "step": 5917 + }, + { + "epoch": 0.03519602245694167, + "grad_norm": 2.0848498344421387, + "learning_rate": 4.984738129092364e-05, + "loss": 5.3334, + "step": 5918 + }, + { + "epoch": 0.03520196974022267, + "grad_norm": 2.000460386276245, + "learning_rate": 4.984732975262147e-05, + "loss": 5.4411, + "step": 5919 + }, + { + "epoch": 0.03520791702350366, + "grad_norm": 1.676957607269287, + "learning_rate": 4.9847278205645355e-05, + "loss": 5.47, + "step": 5920 + }, + { + "epoch": 0.03521386430678466, + "grad_norm": 1.911482334136963, + "learning_rate": 4.984722664999531e-05, + "loss": 5.5736, + "step": 5921 + }, + { + "epoch": 0.03521981159006566, + "grad_norm": 1.9573029279708862, + "learning_rate": 4.9847175085671356e-05, + "loss": 5.5509, + "step": 5922 + }, + { + "epoch": 0.035225758873346655, + "grad_norm": 1.8878334760665894, + "learning_rate": 4.984712351267351e-05, + "loss": 5.6437, + "step": 5923 + }, + { + "epoch": 0.03523170615662765, + "grad_norm": 1.9107712507247925, + "learning_rate": 4.984707193100179e-05, + "loss": 5.4471, + "step": 5924 + }, + { + "epoch": 0.03523765343990865, + "grad_norm": 1.7408612966537476, + "learning_rate": 4.9847020340656215e-05, + "loss": 5.3706, + "step": 5925 + }, + { + "epoch": 0.03524360072318965, + "grad_norm": 1.9594995975494385, + "learning_rate": 4.98469687416368e-05, + "loss": 5.4113, + "step": 5926 + }, + { + "epoch": 0.03524954800647064, + "grad_norm": 1.8772166967391968, + "learning_rate": 4.984691713394356e-05, + "loss": 5.368, + "step": 5927 + }, + { + "epoch": 0.035255495289751644, + "grad_norm": 2.1143953800201416, + "learning_rate": 4.9846865517576524e-05, + "loss": 5.3829, + "step": 5928 + }, + { + "epoch": 0.03526144257303264, + "grad_norm": 2.0923383235931396, + "learning_rate": 4.984681389253571e-05, + "loss": 5.9834, + "step": 5929 + }, + { + "epoch": 0.035267389856313634, + "grad_norm": 2.016749620437622, + "learning_rate": 4.984676225882112e-05, + "loss": 5.68, + "step": 5930 + }, + { + "epoch": 0.035273337139594636, + "grad_norm": 1.6040265560150146, + "learning_rate": 4.984671061643279e-05, + "loss": 5.7406, + "step": 5931 + }, + { + "epoch": 0.03527928442287563, + "grad_norm": 2.100774049758911, + "learning_rate": 4.984665896537072e-05, + "loss": 5.5545, + "step": 5932 + }, + { + "epoch": 0.035285231706156626, + "grad_norm": 2.008575439453125, + "learning_rate": 4.984660730563494e-05, + "loss": 5.3769, + "step": 5933 + }, + { + "epoch": 0.03529117898943762, + "grad_norm": 1.9622136354446411, + "learning_rate": 4.984655563722547e-05, + "loss": 5.5792, + "step": 5934 + }, + { + "epoch": 0.035297126272718624, + "grad_norm": 1.764647364616394, + "learning_rate": 4.9846503960142325e-05, + "loss": 5.6543, + "step": 5935 + }, + { + "epoch": 0.03530307355599962, + "grad_norm": 1.6166809797286987, + "learning_rate": 4.984645227438552e-05, + "loss": 5.7948, + "step": 5936 + }, + { + "epoch": 0.035309020839280614, + "grad_norm": 1.7368977069854736, + "learning_rate": 4.9846400579955074e-05, + "loss": 5.6288, + "step": 5937 + }, + { + "epoch": 0.035314968122561616, + "grad_norm": 1.649059772491455, + "learning_rate": 4.984634887685101e-05, + "loss": 5.8538, + "step": 5938 + }, + { + "epoch": 0.03532091540584261, + "grad_norm": 1.6092652082443237, + "learning_rate": 4.984629716507334e-05, + "loss": 5.7077, + "step": 5939 + }, + { + "epoch": 0.035326862689123606, + "grad_norm": 1.76821768283844, + "learning_rate": 4.984624544462209e-05, + "loss": 5.4206, + "step": 5940 + }, + { + "epoch": 0.03533280997240461, + "grad_norm": 1.5885004997253418, + "learning_rate": 4.984619371549727e-05, + "loss": 5.3997, + "step": 5941 + }, + { + "epoch": 0.0353387572556856, + "grad_norm": 1.6730574369430542, + "learning_rate": 4.984614197769889e-05, + "loss": 5.4952, + "step": 5942 + }, + { + "epoch": 0.0353447045389666, + "grad_norm": 1.9951595067977905, + "learning_rate": 4.984609023122699e-05, + "loss": 5.5658, + "step": 5943 + }, + { + "epoch": 0.0353506518222476, + "grad_norm": 1.8277794122695923, + "learning_rate": 4.984603847608157e-05, + "loss": 5.5313, + "step": 5944 + }, + { + "epoch": 0.035356599105528595, + "grad_norm": 1.5988150835037231, + "learning_rate": 4.984598671226266e-05, + "loss": 5.4661, + "step": 5945 + }, + { + "epoch": 0.03536254638880959, + "grad_norm": 1.8313721418380737, + "learning_rate": 4.9845934939770264e-05, + "loss": 5.3005, + "step": 5946 + }, + { + "epoch": 0.03536849367209059, + "grad_norm": 1.8441407680511475, + "learning_rate": 4.984588315860442e-05, + "loss": 5.4564, + "step": 5947 + }, + { + "epoch": 0.03537444095537159, + "grad_norm": 2.8165388107299805, + "learning_rate": 4.9845831368765126e-05, + "loss": 5.4582, + "step": 5948 + }, + { + "epoch": 0.03538038823865258, + "grad_norm": 1.8860023021697998, + "learning_rate": 4.9845779570252415e-05, + "loss": 5.4952, + "step": 5949 + }, + { + "epoch": 0.03538633552193358, + "grad_norm": 1.7752633094787598, + "learning_rate": 4.98457277630663e-05, + "loss": 5.4301, + "step": 5950 + }, + { + "epoch": 0.03539228280521458, + "grad_norm": 1.9038548469543457, + "learning_rate": 4.984567594720679e-05, + "loss": 5.2591, + "step": 5951 + }, + { + "epoch": 0.035398230088495575, + "grad_norm": 2.6449787616729736, + "learning_rate": 4.984562412267392e-05, + "loss": 5.9317, + "step": 5952 + }, + { + "epoch": 0.03540417737177657, + "grad_norm": 1.95949125289917, + "learning_rate": 4.98455722894677e-05, + "loss": 5.4686, + "step": 5953 + }, + { + "epoch": 0.03541012465505757, + "grad_norm": 2.0208640098571777, + "learning_rate": 4.984552044758814e-05, + "loss": 5.6361, + "step": 5954 + }, + { + "epoch": 0.03541607193833857, + "grad_norm": 2.2328197956085205, + "learning_rate": 4.9845468597035274e-05, + "loss": 5.455, + "step": 5955 + }, + { + "epoch": 0.03542201922161956, + "grad_norm": 2.115952968597412, + "learning_rate": 4.9845416737809105e-05, + "loss": 5.3275, + "step": 5956 + }, + { + "epoch": 0.035427966504900564, + "grad_norm": 2.023791790008545, + "learning_rate": 4.984536486990966e-05, + "loss": 5.3135, + "step": 5957 + }, + { + "epoch": 0.03543391378818156, + "grad_norm": 1.9721077680587769, + "learning_rate": 4.9845312993336945e-05, + "loss": 5.3429, + "step": 5958 + }, + { + "epoch": 0.035439861071462554, + "grad_norm": 2.047588586807251, + "learning_rate": 4.9845261108091e-05, + "loss": 5.4027, + "step": 5959 + }, + { + "epoch": 0.035445808354743556, + "grad_norm": 1.9019498825073242, + "learning_rate": 4.9845209214171826e-05, + "loss": 5.3867, + "step": 5960 + }, + { + "epoch": 0.03545175563802455, + "grad_norm": 1.9442843198776245, + "learning_rate": 4.984515731157945e-05, + "loss": 5.3189, + "step": 5961 + }, + { + "epoch": 0.035457702921305546, + "grad_norm": 2.051422357559204, + "learning_rate": 4.9845105400313885e-05, + "loss": 5.5713, + "step": 5962 + }, + { + "epoch": 0.03546365020458654, + "grad_norm": 1.811908483505249, + "learning_rate": 4.9845053480375145e-05, + "loss": 5.6221, + "step": 5963 + }, + { + "epoch": 0.035469597487867544, + "grad_norm": 2.017991542816162, + "learning_rate": 4.984500155176326e-05, + "loss": 5.2774, + "step": 5964 + }, + { + "epoch": 0.03547554477114854, + "grad_norm": 1.972644329071045, + "learning_rate": 4.9844949614478244e-05, + "loss": 5.3208, + "step": 5965 + }, + { + "epoch": 0.035481492054429534, + "grad_norm": 1.9937026500701904, + "learning_rate": 4.984489766852011e-05, + "loss": 5.455, + "step": 5966 + }, + { + "epoch": 0.035487439337710536, + "grad_norm": 1.7297019958496094, + "learning_rate": 4.984484571388887e-05, + "loss": 5.3829, + "step": 5967 + }, + { + "epoch": 0.03549338662099153, + "grad_norm": 1.6428204774856567, + "learning_rate": 4.984479375058456e-05, + "loss": 5.3638, + "step": 5968 + }, + { + "epoch": 0.035499333904272526, + "grad_norm": 1.9522719383239746, + "learning_rate": 4.9844741778607186e-05, + "loss": 5.3379, + "step": 5969 + }, + { + "epoch": 0.03550528118755353, + "grad_norm": 2.0280921459198, + "learning_rate": 4.984468979795677e-05, + "loss": 5.4366, + "step": 5970 + }, + { + "epoch": 0.03551122847083452, + "grad_norm": 2.0396251678466797, + "learning_rate": 4.9844637808633334e-05, + "loss": 5.5681, + "step": 5971 + }, + { + "epoch": 0.03551717575411552, + "grad_norm": 1.5256271362304688, + "learning_rate": 4.984458581063689e-05, + "loss": 5.602, + "step": 5972 + }, + { + "epoch": 0.03552312303739652, + "grad_norm": 1.8829892873764038, + "learning_rate": 4.984453380396745e-05, + "loss": 5.3851, + "step": 5973 + }, + { + "epoch": 0.035529070320677515, + "grad_norm": 2.047106981277466, + "learning_rate": 4.984448178862505e-05, + "loss": 5.3724, + "step": 5974 + }, + { + "epoch": 0.03553501760395851, + "grad_norm": 2.066572904586792, + "learning_rate": 4.984442976460969e-05, + "loss": 5.3352, + "step": 5975 + }, + { + "epoch": 0.03554096488723951, + "grad_norm": 1.9785430431365967, + "learning_rate": 4.98443777319214e-05, + "loss": 5.2641, + "step": 5976 + }, + { + "epoch": 0.03554691217052051, + "grad_norm": 1.8999443054199219, + "learning_rate": 4.98443256905602e-05, + "loss": 5.3402, + "step": 5977 + }, + { + "epoch": 0.0355528594538015, + "grad_norm": 1.8599263429641724, + "learning_rate": 4.98442736405261e-05, + "loss": 5.2612, + "step": 5978 + }, + { + "epoch": 0.0355588067370825, + "grad_norm": 1.7216875553131104, + "learning_rate": 4.984422158181911e-05, + "loss": 5.4041, + "step": 5979 + }, + { + "epoch": 0.0355647540203635, + "grad_norm": 2.0259687900543213, + "learning_rate": 4.984416951443926e-05, + "loss": 5.4895, + "step": 5980 + }, + { + "epoch": 0.035570701303644495, + "grad_norm": 1.705736756324768, + "learning_rate": 4.9844117438386583e-05, + "loss": 5.5845, + "step": 5981 + }, + { + "epoch": 0.03557664858692549, + "grad_norm": 1.9546462297439575, + "learning_rate": 4.9844065353661074e-05, + "loss": 5.6803, + "step": 5982 + }, + { + "epoch": 0.03558259587020649, + "grad_norm": 1.829689383506775, + "learning_rate": 4.984401326026275e-05, + "loss": 5.5816, + "step": 5983 + }, + { + "epoch": 0.03558854315348749, + "grad_norm": 1.6464663743972778, + "learning_rate": 4.984396115819164e-05, + "loss": 5.5738, + "step": 5984 + }, + { + "epoch": 0.03559449043676848, + "grad_norm": 1.7786076068878174, + "learning_rate": 4.984390904744777e-05, + "loss": 5.3667, + "step": 5985 + }, + { + "epoch": 0.035600437720049484, + "grad_norm": 2.210754871368408, + "learning_rate": 4.984385692803114e-05, + "loss": 5.5259, + "step": 5986 + }, + { + "epoch": 0.03560638500333048, + "grad_norm": 1.7361842393875122, + "learning_rate": 4.984380479994179e-05, + "loss": 5.6108, + "step": 5987 + }, + { + "epoch": 0.035612332286611474, + "grad_norm": 1.926477313041687, + "learning_rate": 4.9843752663179703e-05, + "loss": 5.593, + "step": 5988 + }, + { + "epoch": 0.035618279569892476, + "grad_norm": 1.6683733463287354, + "learning_rate": 4.984370051774493e-05, + "loss": 5.6305, + "step": 5989 + }, + { + "epoch": 0.03562422685317347, + "grad_norm": 1.790499210357666, + "learning_rate": 4.9843648363637475e-05, + "loss": 5.596, + "step": 5990 + }, + { + "epoch": 0.035630174136454466, + "grad_norm": 1.8355207443237305, + "learning_rate": 4.984359620085736e-05, + "loss": 5.5818, + "step": 5991 + }, + { + "epoch": 0.03563612141973546, + "grad_norm": 1.9352680444717407, + "learning_rate": 4.98435440294046e-05, + "loss": 5.187, + "step": 5992 + }, + { + "epoch": 0.03564206870301646, + "grad_norm": 2.063159465789795, + "learning_rate": 4.9843491849279225e-05, + "loss": 5.3245, + "step": 5993 + }, + { + "epoch": 0.03564801598629746, + "grad_norm": 1.6848958730697632, + "learning_rate": 4.984343966048123e-05, + "loss": 5.4454, + "step": 5994 + }, + { + "epoch": 0.035653963269578454, + "grad_norm": 2.1244423389434814, + "learning_rate": 4.9843387463010654e-05, + "loss": 5.5018, + "step": 5995 + }, + { + "epoch": 0.035659910552859456, + "grad_norm": 1.9100427627563477, + "learning_rate": 4.9843335256867505e-05, + "loss": 5.5597, + "step": 5996 + }, + { + "epoch": 0.03566585783614045, + "grad_norm": 1.9130252599716187, + "learning_rate": 4.984328304205181e-05, + "loss": 5.4538, + "step": 5997 + }, + { + "epoch": 0.035671805119421446, + "grad_norm": 1.6285213232040405, + "learning_rate": 4.984323081856358e-05, + "loss": 5.7361, + "step": 5998 + }, + { + "epoch": 0.03567775240270245, + "grad_norm": 1.6690980195999146, + "learning_rate": 4.984317858640283e-05, + "loss": 5.7537, + "step": 5999 + }, + { + "epoch": 0.03568369968598344, + "grad_norm": 1.5258572101593018, + "learning_rate": 4.984312634556959e-05, + "loss": 5.7419, + "step": 6000 + }, + { + "epoch": 0.03568964696926444, + "grad_norm": 1.9586881399154663, + "learning_rate": 4.984307409606386e-05, + "loss": 5.4449, + "step": 6001 + }, + { + "epoch": 0.03569559425254544, + "grad_norm": 2.1795685291290283, + "learning_rate": 4.9843021837885684e-05, + "loss": 5.3833, + "step": 6002 + }, + { + "epoch": 0.035701541535826435, + "grad_norm": 2.1241326332092285, + "learning_rate": 4.984296957103506e-05, + "loss": 5.3064, + "step": 6003 + }, + { + "epoch": 0.03570748881910743, + "grad_norm": 1.9621204137802124, + "learning_rate": 4.9842917295512004e-05, + "loss": 5.3002, + "step": 6004 + }, + { + "epoch": 0.03571343610238843, + "grad_norm": 2.041503429412842, + "learning_rate": 4.984286501131655e-05, + "loss": 5.2885, + "step": 6005 + }, + { + "epoch": 0.03571938338566943, + "grad_norm": 2.1099791526794434, + "learning_rate": 4.984281271844871e-05, + "loss": 5.3038, + "step": 6006 + }, + { + "epoch": 0.03572533066895042, + "grad_norm": 2.0209009647369385, + "learning_rate": 4.98427604169085e-05, + "loss": 5.8373, + "step": 6007 + }, + { + "epoch": 0.03573127795223142, + "grad_norm": 1.7534282207489014, + "learning_rate": 4.9842708106695934e-05, + "loss": 5.6522, + "step": 6008 + }, + { + "epoch": 0.03573722523551242, + "grad_norm": 2.3014237880706787, + "learning_rate": 4.984265578781104e-05, + "loss": 5.462, + "step": 6009 + }, + { + "epoch": 0.035743172518793415, + "grad_norm": 2.123767614364624, + "learning_rate": 4.984260346025382e-05, + "loss": 5.3901, + "step": 6010 + }, + { + "epoch": 0.03574911980207441, + "grad_norm": 2.4190175533294678, + "learning_rate": 4.9842551124024315e-05, + "loss": 5.1526, + "step": 6011 + }, + { + "epoch": 0.03575506708535541, + "grad_norm": 1.9972834587097168, + "learning_rate": 4.984249877912254e-05, + "loss": 5.2987, + "step": 6012 + }, + { + "epoch": 0.03576101436863641, + "grad_norm": 2.002969980239868, + "learning_rate": 4.9842446425548494e-05, + "loss": 5.5244, + "step": 6013 + }, + { + "epoch": 0.0357669616519174, + "grad_norm": 2.8208391666412354, + "learning_rate": 4.984239406330221e-05, + "loss": 5.834, + "step": 6014 + }, + { + "epoch": 0.035772908935198404, + "grad_norm": 2.409303665161133, + "learning_rate": 4.98423416923837e-05, + "loss": 5.1709, + "step": 6015 + }, + { + "epoch": 0.0357788562184794, + "grad_norm": 2.215888500213623, + "learning_rate": 4.984228931279298e-05, + "loss": 5.38, + "step": 6016 + }, + { + "epoch": 0.035784803501760394, + "grad_norm": 1.9130421876907349, + "learning_rate": 4.9842236924530086e-05, + "loss": 5.4551, + "step": 6017 + }, + { + "epoch": 0.035790750785041396, + "grad_norm": 1.8963314294815063, + "learning_rate": 4.9842184527595015e-05, + "loss": 5.3512, + "step": 6018 + }, + { + "epoch": 0.03579669806832239, + "grad_norm": 2.0085666179656982, + "learning_rate": 4.98421321219878e-05, + "loss": 5.3013, + "step": 6019 + }, + { + "epoch": 0.035802645351603386, + "grad_norm": 2.1059834957122803, + "learning_rate": 4.9842079707708446e-05, + "loss": 5.4052, + "step": 6020 + }, + { + "epoch": 0.03580859263488438, + "grad_norm": 1.965694785118103, + "learning_rate": 4.984202728475699e-05, + "loss": 5.5392, + "step": 6021 + }, + { + "epoch": 0.03581453991816538, + "grad_norm": 1.9495680332183838, + "learning_rate": 4.9841974853133425e-05, + "loss": 5.309, + "step": 6022 + }, + { + "epoch": 0.03582048720144638, + "grad_norm": 1.9762555360794067, + "learning_rate": 4.9841922412837795e-05, + "loss": 5.3979, + "step": 6023 + }, + { + "epoch": 0.035826434484727374, + "grad_norm": 1.7825839519500732, + "learning_rate": 4.98418699638701e-05, + "loss": 5.3502, + "step": 6024 + }, + { + "epoch": 0.035832381768008376, + "grad_norm": 1.9636192321777344, + "learning_rate": 4.984181750623037e-05, + "loss": 5.6341, + "step": 6025 + }, + { + "epoch": 0.03583832905128937, + "grad_norm": 1.833883285522461, + "learning_rate": 4.984176503991861e-05, + "loss": 5.5861, + "step": 6026 + }, + { + "epoch": 0.035844276334570366, + "grad_norm": 1.91568124294281, + "learning_rate": 4.984171256493485e-05, + "loss": 5.591, + "step": 6027 + }, + { + "epoch": 0.03585022361785137, + "grad_norm": 2.153472423553467, + "learning_rate": 4.9841660081279105e-05, + "loss": 5.3463, + "step": 6028 + }, + { + "epoch": 0.03585617090113236, + "grad_norm": 1.8164830207824707, + "learning_rate": 4.984160758895139e-05, + "loss": 5.4886, + "step": 6029 + }, + { + "epoch": 0.03586211818441336, + "grad_norm": 2.0216922760009766, + "learning_rate": 4.984155508795174e-05, + "loss": 5.5777, + "step": 6030 + }, + { + "epoch": 0.03586806546769436, + "grad_norm": 1.966779351234436, + "learning_rate": 4.984150257828014e-05, + "loss": 5.1867, + "step": 6031 + }, + { + "epoch": 0.035874012750975355, + "grad_norm": 2.091109275817871, + "learning_rate": 4.9841450059936645e-05, + "loss": 5.5302, + "step": 6032 + }, + { + "epoch": 0.03587996003425635, + "grad_norm": 1.8772802352905273, + "learning_rate": 4.984139753292125e-05, + "loss": 5.2904, + "step": 6033 + }, + { + "epoch": 0.03588590731753735, + "grad_norm": 2.049431800842285, + "learning_rate": 4.984134499723397e-05, + "loss": 5.293, + "step": 6034 + }, + { + "epoch": 0.03589185460081835, + "grad_norm": 2.0902609825134277, + "learning_rate": 4.984129245287485e-05, + "loss": 5.2689, + "step": 6035 + }, + { + "epoch": 0.03589780188409934, + "grad_norm": 1.91702139377594, + "learning_rate": 4.9841239899843886e-05, + "loss": 5.255, + "step": 6036 + }, + { + "epoch": 0.03590374916738034, + "grad_norm": 1.7073708772659302, + "learning_rate": 4.984118733814109e-05, + "loss": 5.3272, + "step": 6037 + }, + { + "epoch": 0.03590969645066134, + "grad_norm": 1.625712275505066, + "learning_rate": 4.9841134767766506e-05, + "loss": 5.5366, + "step": 6038 + }, + { + "epoch": 0.035915643733942335, + "grad_norm": 1.8465087413787842, + "learning_rate": 4.984108218872014e-05, + "loss": 5.3373, + "step": 6039 + }, + { + "epoch": 0.03592159101722333, + "grad_norm": 2.2392280101776123, + "learning_rate": 4.9841029601002e-05, + "loss": 5.5898, + "step": 6040 + }, + { + "epoch": 0.03592753830050433, + "grad_norm": 2.6571459770202637, + "learning_rate": 4.984097700461212e-05, + "loss": 5.963, + "step": 6041 + }, + { + "epoch": 0.03593348558378533, + "grad_norm": 2.7220845222473145, + "learning_rate": 4.98409243995505e-05, + "loss": 5.6997, + "step": 6042 + }, + { + "epoch": 0.03593943286706632, + "grad_norm": 2.430968999862671, + "learning_rate": 4.9840871785817185e-05, + "loss": 5.2949, + "step": 6043 + }, + { + "epoch": 0.035945380150347324, + "grad_norm": 2.3006606101989746, + "learning_rate": 4.984081916341217e-05, + "loss": 5.2045, + "step": 6044 + }, + { + "epoch": 0.03595132743362832, + "grad_norm": 2.2382659912109375, + "learning_rate": 4.984076653233548e-05, + "loss": 5.417, + "step": 6045 + }, + { + "epoch": 0.035957274716909314, + "grad_norm": 2.1896233558654785, + "learning_rate": 4.9840713892587146e-05, + "loss": 5.7215, + "step": 6046 + }, + { + "epoch": 0.035963222000190316, + "grad_norm": 1.8175956010818481, + "learning_rate": 4.9840661244167166e-05, + "loss": 5.569, + "step": 6047 + }, + { + "epoch": 0.03596916928347131, + "grad_norm": 2.066828727722168, + "learning_rate": 4.984060858707557e-05, + "loss": 5.6285, + "step": 6048 + }, + { + "epoch": 0.035975116566752306, + "grad_norm": 2.246291160583496, + "learning_rate": 4.984055592131237e-05, + "loss": 5.5583, + "step": 6049 + }, + { + "epoch": 0.0359810638500333, + "grad_norm": 2.2394871711730957, + "learning_rate": 4.984050324687759e-05, + "loss": 5.3917, + "step": 6050 + }, + { + "epoch": 0.0359870111333143, + "grad_norm": 2.5051162242889404, + "learning_rate": 4.984045056377125e-05, + "loss": 5.6955, + "step": 6051 + }, + { + "epoch": 0.0359929584165953, + "grad_norm": 2.1360414028167725, + "learning_rate": 4.984039787199336e-05, + "loss": 5.5451, + "step": 6052 + }, + { + "epoch": 0.035998905699876294, + "grad_norm": 2.0267562866210938, + "learning_rate": 4.984034517154395e-05, + "loss": 5.4559, + "step": 6053 + }, + { + "epoch": 0.036004852983157296, + "grad_norm": 1.7683112621307373, + "learning_rate": 4.984029246242303e-05, + "loss": 5.4663, + "step": 6054 + }, + { + "epoch": 0.03601080026643829, + "grad_norm": 2.0600638389587402, + "learning_rate": 4.9840239744630626e-05, + "loss": 5.5081, + "step": 6055 + }, + { + "epoch": 0.036016747549719286, + "grad_norm": 2.093698740005493, + "learning_rate": 4.984018701816674e-05, + "loss": 5.5435, + "step": 6056 + }, + { + "epoch": 0.03602269483300029, + "grad_norm": 2.217721462249756, + "learning_rate": 4.984013428303141e-05, + "loss": 5.7482, + "step": 6057 + }, + { + "epoch": 0.03602864211628128, + "grad_norm": 1.9680962562561035, + "learning_rate": 4.9840081539224636e-05, + "loss": 5.9722, + "step": 6058 + }, + { + "epoch": 0.03603458939956228, + "grad_norm": 1.8606425523757935, + "learning_rate": 4.9840028786746455e-05, + "loss": 5.8379, + "step": 6059 + }, + { + "epoch": 0.03604053668284328, + "grad_norm": 2.0129475593566895, + "learning_rate": 4.983997602559688e-05, + "loss": 5.7199, + "step": 6060 + }, + { + "epoch": 0.036046483966124275, + "grad_norm": 1.9370187520980835, + "learning_rate": 4.9839923255775917e-05, + "loss": 5.3563, + "step": 6061 + }, + { + "epoch": 0.03605243124940527, + "grad_norm": 1.775894284248352, + "learning_rate": 4.983987047728359e-05, + "loss": 5.5201, + "step": 6062 + }, + { + "epoch": 0.03605837853268627, + "grad_norm": 1.9943023920059204, + "learning_rate": 4.9839817690119934e-05, + "loss": 5.4034, + "step": 6063 + }, + { + "epoch": 0.03606432581596727, + "grad_norm": 1.9605768918991089, + "learning_rate": 4.983976489428494e-05, + "loss": 5.5314, + "step": 6064 + }, + { + "epoch": 0.03607027309924826, + "grad_norm": 1.7820254564285278, + "learning_rate": 4.983971208977866e-05, + "loss": 5.6131, + "step": 6065 + }, + { + "epoch": 0.03607622038252926, + "grad_norm": 2.010796070098877, + "learning_rate": 4.983965927660108e-05, + "loss": 5.5114, + "step": 6066 + }, + { + "epoch": 0.03608216766581026, + "grad_norm": 1.8461687564849854, + "learning_rate": 4.983960645475223e-05, + "loss": 5.4752, + "step": 6067 + }, + { + "epoch": 0.036088114949091255, + "grad_norm": 2.048119068145752, + "learning_rate": 4.983955362423214e-05, + "loss": 5.3325, + "step": 6068 + }, + { + "epoch": 0.03609406223237225, + "grad_norm": 2.021646499633789, + "learning_rate": 4.9839500785040804e-05, + "loss": 5.2238, + "step": 6069 + }, + { + "epoch": 0.03610000951565325, + "grad_norm": 1.9979503154754639, + "learning_rate": 4.9839447937178264e-05, + "loss": 5.4054, + "step": 6070 + }, + { + "epoch": 0.03610595679893425, + "grad_norm": 1.980776071548462, + "learning_rate": 4.983939508064453e-05, + "loss": 5.4094, + "step": 6071 + }, + { + "epoch": 0.03611190408221524, + "grad_norm": 1.8364293575286865, + "learning_rate": 4.9839342215439615e-05, + "loss": 5.4372, + "step": 6072 + }, + { + "epoch": 0.036117851365496244, + "grad_norm": 1.8870443105697632, + "learning_rate": 4.983928934156354e-05, + "loss": 5.4075, + "step": 6073 + }, + { + "epoch": 0.03612379864877724, + "grad_norm": 2.176180124282837, + "learning_rate": 4.9839236459016337e-05, + "loss": 5.4302, + "step": 6074 + }, + { + "epoch": 0.036129745932058234, + "grad_norm": 2.054960012435913, + "learning_rate": 4.983918356779801e-05, + "loss": 5.3796, + "step": 6075 + }, + { + "epoch": 0.036135693215339236, + "grad_norm": 2.2146401405334473, + "learning_rate": 4.9839130667908576e-05, + "loss": 5.651, + "step": 6076 + }, + { + "epoch": 0.03614164049862023, + "grad_norm": 1.908640742301941, + "learning_rate": 4.983907775934806e-05, + "loss": 5.3002, + "step": 6077 + }, + { + "epoch": 0.036147587781901226, + "grad_norm": 1.9364973306655884, + "learning_rate": 4.983902484211648e-05, + "loss": 5.2299, + "step": 6078 + }, + { + "epoch": 0.03615353506518223, + "grad_norm": 1.7405542135238647, + "learning_rate": 4.983897191621385e-05, + "loss": 5.268, + "step": 6079 + }, + { + "epoch": 0.03615948234846322, + "grad_norm": 2.0347912311553955, + "learning_rate": 4.9838918981640195e-05, + "loss": 5.4887, + "step": 6080 + }, + { + "epoch": 0.03616542963174422, + "grad_norm": 2.0755162239074707, + "learning_rate": 4.9838866038395524e-05, + "loss": 5.2208, + "step": 6081 + }, + { + "epoch": 0.03617137691502521, + "grad_norm": 1.9119634628295898, + "learning_rate": 4.9838813086479865e-05, + "loss": 5.2659, + "step": 6082 + }, + { + "epoch": 0.036177324198306215, + "grad_norm": 1.9172658920288086, + "learning_rate": 4.983876012589324e-05, + "loss": 5.4098, + "step": 6083 + }, + { + "epoch": 0.03618327148158721, + "grad_norm": 2.09004545211792, + "learning_rate": 4.983870715663565e-05, + "loss": 5.5866, + "step": 6084 + }, + { + "epoch": 0.036189218764868206, + "grad_norm": 2.0952436923980713, + "learning_rate": 4.983865417870712e-05, + "loss": 5.5288, + "step": 6085 + }, + { + "epoch": 0.03619516604814921, + "grad_norm": 1.8599412441253662, + "learning_rate": 4.9838601192107686e-05, + "loss": 5.7538, + "step": 6086 + }, + { + "epoch": 0.0362011133314302, + "grad_norm": 1.8318936824798584, + "learning_rate": 4.983854819683735e-05, + "loss": 5.9613, + "step": 6087 + }, + { + "epoch": 0.0362070606147112, + "grad_norm": 1.8312503099441528, + "learning_rate": 4.983849519289613e-05, + "loss": 5.2749, + "step": 6088 + }, + { + "epoch": 0.0362130078979922, + "grad_norm": 2.157576560974121, + "learning_rate": 4.983844218028405e-05, + "loss": 5.2826, + "step": 6089 + }, + { + "epoch": 0.036218955181273195, + "grad_norm": 2.1377198696136475, + "learning_rate": 4.983838915900112e-05, + "loss": 5.2843, + "step": 6090 + }, + { + "epoch": 0.03622490246455419, + "grad_norm": 2.0167126655578613, + "learning_rate": 4.983833612904737e-05, + "loss": 5.4713, + "step": 6091 + }, + { + "epoch": 0.03623084974783519, + "grad_norm": 1.748759388923645, + "learning_rate": 4.9838283090422814e-05, + "loss": 5.3685, + "step": 6092 + }, + { + "epoch": 0.03623679703111619, + "grad_norm": 2.0344316959381104, + "learning_rate": 4.983823004312747e-05, + "loss": 5.1093, + "step": 6093 + }, + { + "epoch": 0.03624274431439718, + "grad_norm": 1.9061161279678345, + "learning_rate": 4.9838176987161356e-05, + "loss": 5.2035, + "step": 6094 + }, + { + "epoch": 0.03624869159767818, + "grad_norm": 1.9090344905853271, + "learning_rate": 4.983812392252449e-05, + "loss": 5.3863, + "step": 6095 + }, + { + "epoch": 0.03625463888095918, + "grad_norm": 1.9536118507385254, + "learning_rate": 4.9838070849216894e-05, + "loss": 5.5349, + "step": 6096 + }, + { + "epoch": 0.036260586164240174, + "grad_norm": 1.89446222782135, + "learning_rate": 4.983801776723858e-05, + "loss": 5.7098, + "step": 6097 + }, + { + "epoch": 0.03626653344752117, + "grad_norm": 1.6403870582580566, + "learning_rate": 4.983796467658958e-05, + "loss": 5.6726, + "step": 6098 + }, + { + "epoch": 0.03627248073080217, + "grad_norm": 1.7792481184005737, + "learning_rate": 4.983791157726989e-05, + "loss": 5.6761, + "step": 6099 + }, + { + "epoch": 0.03627842801408317, + "grad_norm": 1.5190175771713257, + "learning_rate": 4.9837858469279554e-05, + "loss": 5.6576, + "step": 6100 + }, + { + "epoch": 0.03628437529736416, + "grad_norm": 1.9885895252227783, + "learning_rate": 4.983780535261857e-05, + "loss": 5.5944, + "step": 6101 + }, + { + "epoch": 0.036290322580645164, + "grad_norm": 1.771620750427246, + "learning_rate": 4.983775222728697e-05, + "loss": 5.7949, + "step": 6102 + }, + { + "epoch": 0.03629626986392616, + "grad_norm": 1.684471845626831, + "learning_rate": 4.9837699093284765e-05, + "loss": 5.5435, + "step": 6103 + }, + { + "epoch": 0.036302217147207154, + "grad_norm": 1.8454065322875977, + "learning_rate": 4.9837645950611966e-05, + "loss": 5.4526, + "step": 6104 + }, + { + "epoch": 0.036308164430488156, + "grad_norm": 1.6522735357284546, + "learning_rate": 4.983759279926862e-05, + "loss": 5.7302, + "step": 6105 + }, + { + "epoch": 0.03631411171376915, + "grad_norm": 1.8691065311431885, + "learning_rate": 4.9837539639254713e-05, + "loss": 5.6494, + "step": 6106 + }, + { + "epoch": 0.036320058997050146, + "grad_norm": 1.9420015811920166, + "learning_rate": 4.9837486470570286e-05, + "loss": 5.77, + "step": 6107 + }, + { + "epoch": 0.03632600628033115, + "grad_norm": 1.8399784564971924, + "learning_rate": 4.9837433293215344e-05, + "loss": 5.6669, + "step": 6108 + }, + { + "epoch": 0.03633195356361214, + "grad_norm": 1.799460530281067, + "learning_rate": 4.983738010718991e-05, + "loss": 5.5557, + "step": 6109 + }, + { + "epoch": 0.03633790084689314, + "grad_norm": 1.8826879262924194, + "learning_rate": 4.9837326912494e-05, + "loss": 5.4865, + "step": 6110 + }, + { + "epoch": 0.03634384813017413, + "grad_norm": 1.9582240581512451, + "learning_rate": 4.983727370912764e-05, + "loss": 5.5882, + "step": 6111 + }, + { + "epoch": 0.036349795413455135, + "grad_norm": 2.011892795562744, + "learning_rate": 4.9837220497090846e-05, + "loss": 5.4932, + "step": 6112 + }, + { + "epoch": 0.03635574269673613, + "grad_norm": 1.7751367092132568, + "learning_rate": 4.983716727638363e-05, + "loss": 5.4981, + "step": 6113 + }, + { + "epoch": 0.036361689980017126, + "grad_norm": 1.984121322631836, + "learning_rate": 4.983711404700603e-05, + "loss": 5.4801, + "step": 6114 + }, + { + "epoch": 0.03636763726329813, + "grad_norm": 1.9601882696151733, + "learning_rate": 4.983706080895804e-05, + "loss": 5.218, + "step": 6115 + }, + { + "epoch": 0.03637358454657912, + "grad_norm": 1.800227165222168, + "learning_rate": 4.9837007562239684e-05, + "loss": 5.5178, + "step": 6116 + }, + { + "epoch": 0.03637953182986012, + "grad_norm": 1.9257889986038208, + "learning_rate": 4.983695430685099e-05, + "loss": 5.6695, + "step": 6117 + }, + { + "epoch": 0.03638547911314112, + "grad_norm": 1.8011913299560547, + "learning_rate": 4.9836901042791976e-05, + "loss": 5.7478, + "step": 6118 + }, + { + "epoch": 0.036391426396422115, + "grad_norm": 1.8668690919876099, + "learning_rate": 4.983684777006264e-05, + "loss": 5.7027, + "step": 6119 + }, + { + "epoch": 0.03639737367970311, + "grad_norm": 1.898126244544983, + "learning_rate": 4.983679448866304e-05, + "loss": 5.5206, + "step": 6120 + }, + { + "epoch": 0.03640332096298411, + "grad_norm": 1.8264409303665161, + "learning_rate": 4.983674119859316e-05, + "loss": 5.4686, + "step": 6121 + }, + { + "epoch": 0.03640926824626511, + "grad_norm": 1.8090230226516724, + "learning_rate": 4.983668789985303e-05, + "loss": 5.4761, + "step": 6122 + }, + { + "epoch": 0.0364152155295461, + "grad_norm": 1.8193403482437134, + "learning_rate": 4.983663459244266e-05, + "loss": 5.3443, + "step": 6123 + }, + { + "epoch": 0.0364211628128271, + "grad_norm": 1.8199255466461182, + "learning_rate": 4.9836581276362095e-05, + "loss": 5.427, + "step": 6124 + }, + { + "epoch": 0.0364271100961081, + "grad_norm": 1.72145414352417, + "learning_rate": 4.9836527951611325e-05, + "loss": 5.4372, + "step": 6125 + }, + { + "epoch": 0.036433057379389094, + "grad_norm": 1.8164423704147339, + "learning_rate": 4.9836474618190386e-05, + "loss": 5.4702, + "step": 6126 + }, + { + "epoch": 0.03643900466267009, + "grad_norm": 1.897775650024414, + "learning_rate": 4.9836421276099287e-05, + "loss": 5.4259, + "step": 6127 + }, + { + "epoch": 0.03644495194595109, + "grad_norm": 1.851101279258728, + "learning_rate": 4.9836367925338046e-05, + "loss": 5.3837, + "step": 6128 + }, + { + "epoch": 0.03645089922923209, + "grad_norm": 1.749374508857727, + "learning_rate": 4.98363145659067e-05, + "loss": 5.3232, + "step": 6129 + }, + { + "epoch": 0.03645684651251308, + "grad_norm": 1.95986008644104, + "learning_rate": 4.9836261197805235e-05, + "loss": 5.2692, + "step": 6130 + }, + { + "epoch": 0.036462793795794084, + "grad_norm": 1.7947750091552734, + "learning_rate": 4.98362078210337e-05, + "loss": 5.409, + "step": 6131 + }, + { + "epoch": 0.03646874107907508, + "grad_norm": 2.119044303894043, + "learning_rate": 4.983615443559209e-05, + "loss": 5.5924, + "step": 6132 + }, + { + "epoch": 0.036474688362356074, + "grad_norm": 1.7285267114639282, + "learning_rate": 4.983610104148044e-05, + "loss": 5.6955, + "step": 6133 + }, + { + "epoch": 0.036480635645637076, + "grad_norm": 2.1711652278900146, + "learning_rate": 4.983604763869877e-05, + "loss": 5.1941, + "step": 6134 + }, + { + "epoch": 0.03648658292891807, + "grad_norm": 2.060039758682251, + "learning_rate": 4.983599422724709e-05, + "loss": 5.5131, + "step": 6135 + }, + { + "epoch": 0.036492530212199066, + "grad_norm": 1.6212393045425415, + "learning_rate": 4.9835940807125415e-05, + "loss": 5.4856, + "step": 6136 + }, + { + "epoch": 0.03649847749548007, + "grad_norm": 1.7602918148040771, + "learning_rate": 4.983588737833378e-05, + "loss": 5.4177, + "step": 6137 + }, + { + "epoch": 0.03650442477876106, + "grad_norm": 2.660930633544922, + "learning_rate": 4.983583394087218e-05, + "loss": 5.5879, + "step": 6138 + }, + { + "epoch": 0.03651037206204206, + "grad_norm": 2.3608336448669434, + "learning_rate": 4.9835780494740655e-05, + "loss": 5.3894, + "step": 6139 + }, + { + "epoch": 0.03651631934532305, + "grad_norm": 2.071632146835327, + "learning_rate": 4.983572703993922e-05, + "loss": 5.6185, + "step": 6140 + }, + { + "epoch": 0.036522266628604055, + "grad_norm": 1.7023842334747314, + "learning_rate": 4.983567357646788e-05, + "loss": 5.5648, + "step": 6141 + }, + { + "epoch": 0.03652821391188505, + "grad_norm": 2.2168798446655273, + "learning_rate": 4.983562010432667e-05, + "loss": 5.4578, + "step": 6142 + }, + { + "epoch": 0.036534161195166046, + "grad_norm": 2.0916104316711426, + "learning_rate": 4.98355666235156e-05, + "loss": 5.4977, + "step": 6143 + }, + { + "epoch": 0.03654010847844705, + "grad_norm": 1.7101606130599976, + "learning_rate": 4.9835513134034686e-05, + "loss": 5.4081, + "step": 6144 + }, + { + "epoch": 0.03654605576172804, + "grad_norm": 1.9058302640914917, + "learning_rate": 4.983545963588395e-05, + "loss": 5.2145, + "step": 6145 + }, + { + "epoch": 0.03655200304500904, + "grad_norm": 2.319023847579956, + "learning_rate": 4.9835406129063424e-05, + "loss": 5.3023, + "step": 6146 + }, + { + "epoch": 0.03655795032829004, + "grad_norm": 2.1135916709899902, + "learning_rate": 4.98353526135731e-05, + "loss": 5.4796, + "step": 6147 + }, + { + "epoch": 0.036563897611571035, + "grad_norm": 2.409088373184204, + "learning_rate": 4.983529908941302e-05, + "loss": 5.3124, + "step": 6148 + }, + { + "epoch": 0.03656984489485203, + "grad_norm": 1.8679871559143066, + "learning_rate": 4.9835245556583185e-05, + "loss": 5.3741, + "step": 6149 + }, + { + "epoch": 0.03657579217813303, + "grad_norm": 1.9335602521896362, + "learning_rate": 4.983519201508363e-05, + "loss": 5.3231, + "step": 6150 + }, + { + "epoch": 0.03658173946141403, + "grad_norm": 2.0352535247802734, + "learning_rate": 4.9835138464914366e-05, + "loss": 5.4643, + "step": 6151 + }, + { + "epoch": 0.03658768674469502, + "grad_norm": 2.4156594276428223, + "learning_rate": 4.983508490607541e-05, + "loss": 5.4092, + "step": 6152 + }, + { + "epoch": 0.03659363402797602, + "grad_norm": 2.1936473846435547, + "learning_rate": 4.983503133856678e-05, + "loss": 5.5093, + "step": 6153 + }, + { + "epoch": 0.03659958131125702, + "grad_norm": 1.6346958875656128, + "learning_rate": 4.98349777623885e-05, + "loss": 5.512, + "step": 6154 + }, + { + "epoch": 0.036605528594538014, + "grad_norm": 1.9810141324996948, + "learning_rate": 4.9834924177540584e-05, + "loss": 5.4981, + "step": 6155 + }, + { + "epoch": 0.03661147587781901, + "grad_norm": 2.1253950595855713, + "learning_rate": 4.9834870584023055e-05, + "loss": 5.4022, + "step": 6156 + }, + { + "epoch": 0.03661742316110001, + "grad_norm": 2.011754274368286, + "learning_rate": 4.9834816981835926e-05, + "loss": 5.6107, + "step": 6157 + }, + { + "epoch": 0.036623370444381007, + "grad_norm": 2.210934638977051, + "learning_rate": 4.983476337097922e-05, + "loss": 5.4348, + "step": 6158 + }, + { + "epoch": 0.036629317727662, + "grad_norm": 2.1351871490478516, + "learning_rate": 4.983470975145296e-05, + "loss": 5.2022, + "step": 6159 + }, + { + "epoch": 0.036635265010943004, + "grad_norm": 2.1564714908599854, + "learning_rate": 4.983465612325715e-05, + "loss": 5.3583, + "step": 6160 + }, + { + "epoch": 0.036641212294224, + "grad_norm": 1.9411755800247192, + "learning_rate": 4.983460248639182e-05, + "loss": 5.4643, + "step": 6161 + }, + { + "epoch": 0.036647159577504994, + "grad_norm": 2.129741907119751, + "learning_rate": 4.983454884085699e-05, + "loss": 5.3834, + "step": 6162 + }, + { + "epoch": 0.036653106860785996, + "grad_norm": 2.12172269821167, + "learning_rate": 4.983449518665268e-05, + "loss": 5.4418, + "step": 6163 + }, + { + "epoch": 0.03665905414406699, + "grad_norm": 2.097452163696289, + "learning_rate": 4.9834441523778893e-05, + "loss": 5.3741, + "step": 6164 + }, + { + "epoch": 0.036665001427347986, + "grad_norm": 2.0458765029907227, + "learning_rate": 4.983438785223567e-05, + "loss": 5.373, + "step": 6165 + }, + { + "epoch": 0.03667094871062899, + "grad_norm": 1.9431376457214355, + "learning_rate": 4.983433417202301e-05, + "loss": 5.4003, + "step": 6166 + }, + { + "epoch": 0.03667689599390998, + "grad_norm": 2.136819362640381, + "learning_rate": 4.983428048314095e-05, + "loss": 5.503, + "step": 6167 + }, + { + "epoch": 0.03668284327719098, + "grad_norm": 1.863153338432312, + "learning_rate": 4.983422678558949e-05, + "loss": 5.4357, + "step": 6168 + }, + { + "epoch": 0.03668879056047197, + "grad_norm": 1.9198437929153442, + "learning_rate": 4.9834173079368665e-05, + "loss": 5.4304, + "step": 6169 + }, + { + "epoch": 0.036694737843752975, + "grad_norm": 1.9080480337142944, + "learning_rate": 4.9834119364478484e-05, + "loss": 5.4329, + "step": 6170 + }, + { + "epoch": 0.03670068512703397, + "grad_norm": 1.9116952419281006, + "learning_rate": 4.983406564091897e-05, + "loss": 5.3248, + "step": 6171 + }, + { + "epoch": 0.036706632410314965, + "grad_norm": 2.007685661315918, + "learning_rate": 4.983401190869014e-05, + "loss": 5.3554, + "step": 6172 + }, + { + "epoch": 0.03671257969359597, + "grad_norm": 1.8134535551071167, + "learning_rate": 4.983395816779201e-05, + "loss": 5.2907, + "step": 6173 + }, + { + "epoch": 0.03671852697687696, + "grad_norm": 2.093061685562134, + "learning_rate": 4.9833904418224606e-05, + "loss": 5.4055, + "step": 6174 + }, + { + "epoch": 0.03672447426015796, + "grad_norm": 2.1263599395751953, + "learning_rate": 4.9833850659987934e-05, + "loss": 5.2758, + "step": 6175 + }, + { + "epoch": 0.03673042154343896, + "grad_norm": 1.9442895650863647, + "learning_rate": 4.983379689308203e-05, + "loss": 5.4183, + "step": 6176 + }, + { + "epoch": 0.036736368826719955, + "grad_norm": 1.9587830305099487, + "learning_rate": 4.98337431175069e-05, + "loss": 5.3624, + "step": 6177 + }, + { + "epoch": 0.03674231611000095, + "grad_norm": 1.9845789670944214, + "learning_rate": 4.9833689333262565e-05, + "loss": 5.3933, + "step": 6178 + }, + { + "epoch": 0.03674826339328195, + "grad_norm": 1.9748643636703491, + "learning_rate": 4.9833635540349055e-05, + "loss": 5.5221, + "step": 6179 + }, + { + "epoch": 0.03675421067656295, + "grad_norm": 1.8139559030532837, + "learning_rate": 4.983358173876638e-05, + "loss": 5.5524, + "step": 6180 + }, + { + "epoch": 0.03676015795984394, + "grad_norm": 1.93784499168396, + "learning_rate": 4.9833527928514546e-05, + "loss": 5.7145, + "step": 6181 + }, + { + "epoch": 0.03676610524312494, + "grad_norm": 1.9064222574234009, + "learning_rate": 4.9833474109593594e-05, + "loss": 5.5283, + "step": 6182 + }, + { + "epoch": 0.03677205252640594, + "grad_norm": 1.7044670581817627, + "learning_rate": 4.9833420282003524e-05, + "loss": 5.2877, + "step": 6183 + }, + { + "epoch": 0.036777999809686934, + "grad_norm": 1.8328427076339722, + "learning_rate": 4.983336644574437e-05, + "loss": 5.5019, + "step": 6184 + }, + { + "epoch": 0.03678394709296793, + "grad_norm": 1.600780725479126, + "learning_rate": 4.983331260081614e-05, + "loss": 5.5347, + "step": 6185 + }, + { + "epoch": 0.03678989437624893, + "grad_norm": 1.8333978652954102, + "learning_rate": 4.983325874721886e-05, + "loss": 5.5127, + "step": 6186 + }, + { + "epoch": 0.036795841659529926, + "grad_norm": 1.8825682401657104, + "learning_rate": 4.9833204884952546e-05, + "loss": 5.5338, + "step": 6187 + }, + { + "epoch": 0.03680178894281092, + "grad_norm": 1.6875951290130615, + "learning_rate": 4.983315101401721e-05, + "loss": 5.2465, + "step": 6188 + }, + { + "epoch": 0.036807736226091924, + "grad_norm": 1.6224017143249512, + "learning_rate": 4.983309713441289e-05, + "loss": 5.4741, + "step": 6189 + }, + { + "epoch": 0.03681368350937292, + "grad_norm": 1.991721272468567, + "learning_rate": 4.983304324613958e-05, + "loss": 5.4547, + "step": 6190 + }, + { + "epoch": 0.036819630792653914, + "grad_norm": 1.843961238861084, + "learning_rate": 4.983298934919732e-05, + "loss": 5.3262, + "step": 6191 + }, + { + "epoch": 0.036825578075934916, + "grad_norm": 1.8342533111572266, + "learning_rate": 4.983293544358612e-05, + "loss": 5.6808, + "step": 6192 + }, + { + "epoch": 0.03683152535921591, + "grad_norm": 1.8796159029006958, + "learning_rate": 4.983288152930599e-05, + "loss": 5.5454, + "step": 6193 + }, + { + "epoch": 0.036837472642496906, + "grad_norm": 1.9033316373825073, + "learning_rate": 4.983282760635696e-05, + "loss": 5.3566, + "step": 6194 + }, + { + "epoch": 0.03684341992577791, + "grad_norm": 1.915873408317566, + "learning_rate": 4.9832773674739054e-05, + "loss": 5.4555, + "step": 6195 + }, + { + "epoch": 0.0368493672090589, + "grad_norm": 1.8510993719100952, + "learning_rate": 4.983271973445228e-05, + "loss": 5.5042, + "step": 6196 + }, + { + "epoch": 0.0368553144923399, + "grad_norm": 1.7180782556533813, + "learning_rate": 4.983266578549666e-05, + "loss": 5.4671, + "step": 6197 + }, + { + "epoch": 0.03686126177562089, + "grad_norm": 1.7828874588012695, + "learning_rate": 4.983261182787221e-05, + "loss": 5.4943, + "step": 6198 + }, + { + "epoch": 0.036867209058901895, + "grad_norm": 1.5032141208648682, + "learning_rate": 4.983255786157895e-05, + "loss": 5.3881, + "step": 6199 + }, + { + "epoch": 0.03687315634218289, + "grad_norm": 2.530954599380493, + "learning_rate": 4.983250388661691e-05, + "loss": 5.4449, + "step": 6200 + }, + { + "epoch": 0.036879103625463885, + "grad_norm": 2.011044979095459, + "learning_rate": 4.983244990298609e-05, + "loss": 5.2722, + "step": 6201 + }, + { + "epoch": 0.03688505090874489, + "grad_norm": 2.2209532260894775, + "learning_rate": 4.9832395910686525e-05, + "loss": 5.0932, + "step": 6202 + }, + { + "epoch": 0.03689099819202588, + "grad_norm": 1.8695623874664307, + "learning_rate": 4.983234190971823e-05, + "loss": 5.2891, + "step": 6203 + }, + { + "epoch": 0.03689694547530688, + "grad_norm": 2.172349691390991, + "learning_rate": 4.983228790008121e-05, + "loss": 5.578, + "step": 6204 + }, + { + "epoch": 0.03690289275858788, + "grad_norm": 2.1099209785461426, + "learning_rate": 4.9832233881775505e-05, + "loss": 5.3708, + "step": 6205 + }, + { + "epoch": 0.036908840041868875, + "grad_norm": 2.16737961769104, + "learning_rate": 4.9832179854801116e-05, + "loss": 5.303, + "step": 6206 + }, + { + "epoch": 0.03691478732514987, + "grad_norm": 2.248220682144165, + "learning_rate": 4.983212581915807e-05, + "loss": 5.362, + "step": 6207 + }, + { + "epoch": 0.03692073460843087, + "grad_norm": 2.0701045989990234, + "learning_rate": 4.983207177484639e-05, + "loss": 5.4528, + "step": 6208 + }, + { + "epoch": 0.03692668189171187, + "grad_norm": 1.9989019632339478, + "learning_rate": 4.983201772186609e-05, + "loss": 5.786, + "step": 6209 + }, + { + "epoch": 0.03693262917499286, + "grad_norm": 1.9126088619232178, + "learning_rate": 4.983196366021719e-05, + "loss": 5.2312, + "step": 6210 + }, + { + "epoch": 0.03693857645827386, + "grad_norm": 2.1317548751831055, + "learning_rate": 4.9831909589899695e-05, + "loss": 5.3028, + "step": 6211 + }, + { + "epoch": 0.03694452374155486, + "grad_norm": 2.164898157119751, + "learning_rate": 4.983185551091365e-05, + "loss": 5.3186, + "step": 6212 + }, + { + "epoch": 0.036950471024835854, + "grad_norm": 2.1085855960845947, + "learning_rate": 4.983180142325906e-05, + "loss": 5.3026, + "step": 6213 + }, + { + "epoch": 0.03695641830811685, + "grad_norm": 1.8321222066879272, + "learning_rate": 4.983174732693594e-05, + "loss": 5.6632, + "step": 6214 + }, + { + "epoch": 0.03696236559139785, + "grad_norm": 2.0537941455841064, + "learning_rate": 4.983169322194432e-05, + "loss": 5.2269, + "step": 6215 + }, + { + "epoch": 0.036968312874678846, + "grad_norm": 1.9598063230514526, + "learning_rate": 4.98316391082842e-05, + "loss": 5.4974, + "step": 6216 + }, + { + "epoch": 0.03697426015795984, + "grad_norm": 2.3764376640319824, + "learning_rate": 4.983158498595563e-05, + "loss": 5.7715, + "step": 6217 + }, + { + "epoch": 0.036980207441240844, + "grad_norm": 1.8938835859298706, + "learning_rate": 4.9831530854958595e-05, + "loss": 5.5577, + "step": 6218 + }, + { + "epoch": 0.03698615472452184, + "grad_norm": 2.2023189067840576, + "learning_rate": 4.9831476715293134e-05, + "loss": 5.2596, + "step": 6219 + }, + { + "epoch": 0.036992102007802834, + "grad_norm": 1.9010800123214722, + "learning_rate": 4.9831422566959266e-05, + "loss": 5.3313, + "step": 6220 + }, + { + "epoch": 0.036998049291083836, + "grad_norm": 1.9679474830627441, + "learning_rate": 4.9831368409957e-05, + "loss": 5.2701, + "step": 6221 + }, + { + "epoch": 0.03700399657436483, + "grad_norm": 1.903558373451233, + "learning_rate": 4.983131424428635e-05, + "loss": 5.2821, + "step": 6222 + }, + { + "epoch": 0.037009943857645826, + "grad_norm": 1.976114273071289, + "learning_rate": 4.983126006994736e-05, + "loss": 5.374, + "step": 6223 + }, + { + "epoch": 0.03701589114092683, + "grad_norm": 2.9803311824798584, + "learning_rate": 4.983120588694003e-05, + "loss": 5.3576, + "step": 6224 + }, + { + "epoch": 0.03702183842420782, + "grad_norm": 1.5921218395233154, + "learning_rate": 4.983115169526438e-05, + "loss": 5.1654, + "step": 6225 + }, + { + "epoch": 0.03702778570748882, + "grad_norm": 1.7458349466323853, + "learning_rate": 4.983109749492043e-05, + "loss": 5.1038, + "step": 6226 + }, + { + "epoch": 0.03703373299076981, + "grad_norm": 1.9425132274627686, + "learning_rate": 4.983104328590821e-05, + "loss": 5.3815, + "step": 6227 + }, + { + "epoch": 0.037039680274050815, + "grad_norm": 1.9506715536117554, + "learning_rate": 4.983098906822772e-05, + "loss": 5.2215, + "step": 6228 + }, + { + "epoch": 0.03704562755733181, + "grad_norm": 1.8596410751342773, + "learning_rate": 4.983093484187899e-05, + "loss": 5.2058, + "step": 6229 + }, + { + "epoch": 0.037051574840612805, + "grad_norm": 1.720473289489746, + "learning_rate": 4.9830880606862043e-05, + "loss": 5.2701, + "step": 6230 + }, + { + "epoch": 0.03705752212389381, + "grad_norm": 1.7786411046981812, + "learning_rate": 4.983082636317688e-05, + "loss": 5.3216, + "step": 6231 + }, + { + "epoch": 0.0370634694071748, + "grad_norm": 3.6291537284851074, + "learning_rate": 4.983077211082354e-05, + "loss": 5.2282, + "step": 6232 + }, + { + "epoch": 0.0370694166904558, + "grad_norm": 1.7453030347824097, + "learning_rate": 4.983071784980203e-05, + "loss": 5.2667, + "step": 6233 + }, + { + "epoch": 0.0370753639737368, + "grad_norm": 1.7036694288253784, + "learning_rate": 4.983066358011238e-05, + "loss": 5.3023, + "step": 6234 + }, + { + "epoch": 0.037081311257017795, + "grad_norm": 1.7196505069732666, + "learning_rate": 4.9830609301754595e-05, + "loss": 5.2211, + "step": 6235 + }, + { + "epoch": 0.03708725854029879, + "grad_norm": 3.4630305767059326, + "learning_rate": 4.983055501472871e-05, + "loss": 5.6159, + "step": 6236 + }, + { + "epoch": 0.03709320582357979, + "grad_norm": 2.9739367961883545, + "learning_rate": 4.9830500719034726e-05, + "loss": 5.4477, + "step": 6237 + }, + { + "epoch": 0.03709915310686079, + "grad_norm": 2.760664463043213, + "learning_rate": 4.983044641467267e-05, + "loss": 5.0879, + "step": 6238 + }, + { + "epoch": 0.03710510039014178, + "grad_norm": 2.166203022003174, + "learning_rate": 4.9830392101642566e-05, + "loss": 5.5635, + "step": 6239 + }, + { + "epoch": 0.03711104767342278, + "grad_norm": 2.3798410892486572, + "learning_rate": 4.9830337779944425e-05, + "loss": 5.0676, + "step": 6240 + }, + { + "epoch": 0.03711699495670378, + "grad_norm": 2.3990557193756104, + "learning_rate": 4.983028344957827e-05, + "loss": 5.2788, + "step": 6241 + }, + { + "epoch": 0.037122942239984774, + "grad_norm": 2.487978458404541, + "learning_rate": 4.9830229110544124e-05, + "loss": 5.852, + "step": 6242 + }, + { + "epoch": 0.03712888952326577, + "grad_norm": 2.304749011993408, + "learning_rate": 4.9830174762842e-05, + "loss": 6.0886, + "step": 6243 + }, + { + "epoch": 0.03713483680654677, + "grad_norm": 2.169614791870117, + "learning_rate": 4.983012040647191e-05, + "loss": 6.1178, + "step": 6244 + }, + { + "epoch": 0.037140784089827766, + "grad_norm": 2.119131326675415, + "learning_rate": 4.98300660414339e-05, + "loss": 6.25, + "step": 6245 + }, + { + "epoch": 0.03714673137310876, + "grad_norm": 2.3797547817230225, + "learning_rate": 4.9830011667727964e-05, + "loss": 5.879, + "step": 6246 + }, + { + "epoch": 0.03715267865638976, + "grad_norm": 2.303718328475952, + "learning_rate": 4.982995728535411e-05, + "loss": 6.0015, + "step": 6247 + }, + { + "epoch": 0.03715862593967076, + "grad_norm": 2.867103099822998, + "learning_rate": 4.9829902894312396e-05, + "loss": 5.8726, + "step": 6248 + }, + { + "epoch": 0.037164573222951754, + "grad_norm": 2.4248557090759277, + "learning_rate": 4.9829848494602806e-05, + "loss": 5.6579, + "step": 6249 + }, + { + "epoch": 0.037170520506232756, + "grad_norm": 2.2622148990631104, + "learning_rate": 4.982979408622538e-05, + "loss": 5.7677, + "step": 6250 + }, + { + "epoch": 0.03717646778951375, + "grad_norm": 2.320502996444702, + "learning_rate": 4.9829739669180126e-05, + "loss": 5.7362, + "step": 6251 + }, + { + "epoch": 0.037182415072794746, + "grad_norm": 2.2096636295318604, + "learning_rate": 4.9829685243467065e-05, + "loss": 5.9069, + "step": 6252 + }, + { + "epoch": 0.03718836235607575, + "grad_norm": 2.620361089706421, + "learning_rate": 4.982963080908623e-05, + "loss": 5.9419, + "step": 6253 + }, + { + "epoch": 0.03719430963935674, + "grad_norm": 2.478158950805664, + "learning_rate": 4.982957636603761e-05, + "loss": 6.4776, + "step": 6254 + }, + { + "epoch": 0.03720025692263774, + "grad_norm": 2.5912528038024902, + "learning_rate": 4.982952191432125e-05, + "loss": 5.7176, + "step": 6255 + }, + { + "epoch": 0.03720620420591873, + "grad_norm": 2.57177734375, + "learning_rate": 4.982946745393716e-05, + "loss": 5.4271, + "step": 6256 + }, + { + "epoch": 0.037212151489199735, + "grad_norm": 2.424567699432373, + "learning_rate": 4.982941298488535e-05, + "loss": 5.82, + "step": 6257 + }, + { + "epoch": 0.03721809877248073, + "grad_norm": 2.477827548980713, + "learning_rate": 4.9829358507165856e-05, + "loss": 5.7961, + "step": 6258 + }, + { + "epoch": 0.037224046055761725, + "grad_norm": 2.0598270893096924, + "learning_rate": 4.982930402077869e-05, + "loss": 5.9264, + "step": 6259 + }, + { + "epoch": 0.03722999333904273, + "grad_norm": 2.0599095821380615, + "learning_rate": 4.9829249525723875e-05, + "loss": 6.0518, + "step": 6260 + }, + { + "epoch": 0.03723594062232372, + "grad_norm": 2.110170841217041, + "learning_rate": 4.982919502200142e-05, + "loss": 5.8631, + "step": 6261 + }, + { + "epoch": 0.03724188790560472, + "grad_norm": 2.333972930908203, + "learning_rate": 4.982914050961135e-05, + "loss": 5.5361, + "step": 6262 + }, + { + "epoch": 0.03724783518888572, + "grad_norm": 2.2322769165039062, + "learning_rate": 4.982908598855369e-05, + "loss": 5.8002, + "step": 6263 + }, + { + "epoch": 0.037253782472166715, + "grad_norm": 1.9915717840194702, + "learning_rate": 4.982903145882845e-05, + "loss": 5.7096, + "step": 6264 + }, + { + "epoch": 0.03725972975544771, + "grad_norm": 2.2031619548797607, + "learning_rate": 4.9828976920435645e-05, + "loss": 5.5716, + "step": 6265 + }, + { + "epoch": 0.03726567703872871, + "grad_norm": 2.9422314167022705, + "learning_rate": 4.9828922373375295e-05, + "loss": 5.929, + "step": 6266 + }, + { + "epoch": 0.03727162432200971, + "grad_norm": 3.264784336090088, + "learning_rate": 4.982886781764744e-05, + "loss": 5.9801, + "step": 6267 + }, + { + "epoch": 0.0372775716052907, + "grad_norm": 2.8314197063446045, + "learning_rate": 4.982881325325208e-05, + "loss": 6.0173, + "step": 6268 + }, + { + "epoch": 0.0372835188885717, + "grad_norm": 2.9550328254699707, + "learning_rate": 4.9828758680189234e-05, + "loss": 5.9838, + "step": 6269 + }, + { + "epoch": 0.0372894661718527, + "grad_norm": 2.6827526092529297, + "learning_rate": 4.9828704098458924e-05, + "loss": 6.0235, + "step": 6270 + }, + { + "epoch": 0.037295413455133694, + "grad_norm": 2.7174222469329834, + "learning_rate": 4.982864950806118e-05, + "loss": 5.8315, + "step": 6271 + }, + { + "epoch": 0.03730136073841469, + "grad_norm": 2.6177315711975098, + "learning_rate": 4.9828594908996e-05, + "loss": 5.8577, + "step": 6272 + }, + { + "epoch": 0.03730730802169569, + "grad_norm": 2.449669361114502, + "learning_rate": 4.982854030126342e-05, + "loss": 5.9591, + "step": 6273 + }, + { + "epoch": 0.037313255304976686, + "grad_norm": 2.5328989028930664, + "learning_rate": 4.9828485684863446e-05, + "loss": 5.7764, + "step": 6274 + }, + { + "epoch": 0.03731920258825768, + "grad_norm": 2.2581989765167236, + "learning_rate": 4.982843105979611e-05, + "loss": 5.9524, + "step": 6275 + }, + { + "epoch": 0.03732514987153868, + "grad_norm": 2.261212110519409, + "learning_rate": 4.982837642606142e-05, + "loss": 5.5814, + "step": 6276 + }, + { + "epoch": 0.03733109715481968, + "grad_norm": 2.2957348823547363, + "learning_rate": 4.98283217836594e-05, + "loss": 5.6967, + "step": 6277 + }, + { + "epoch": 0.037337044438100674, + "grad_norm": 2.814037322998047, + "learning_rate": 4.982826713259008e-05, + "loss": 5.8787, + "step": 6278 + }, + { + "epoch": 0.037342991721381676, + "grad_norm": 2.678133249282837, + "learning_rate": 4.9828212472853464e-05, + "loss": 5.94, + "step": 6279 + }, + { + "epoch": 0.03734893900466267, + "grad_norm": 2.2949652671813965, + "learning_rate": 4.982815780444957e-05, + "loss": 5.7263, + "step": 6280 + }, + { + "epoch": 0.037354886287943666, + "grad_norm": 2.4542131423950195, + "learning_rate": 4.982810312737842e-05, + "loss": 5.8317, + "step": 6281 + }, + { + "epoch": 0.03736083357122467, + "grad_norm": 2.7850544452667236, + "learning_rate": 4.982804844164005e-05, + "loss": 5.5631, + "step": 6282 + }, + { + "epoch": 0.03736678085450566, + "grad_norm": 2.6285061836242676, + "learning_rate": 4.9827993747234454e-05, + "loss": 5.6212, + "step": 6283 + }, + { + "epoch": 0.03737272813778666, + "grad_norm": 2.602590799331665, + "learning_rate": 4.9827939044161666e-05, + "loss": 5.5529, + "step": 6284 + }, + { + "epoch": 0.03737867542106765, + "grad_norm": 2.6196670532226562, + "learning_rate": 4.98278843324217e-05, + "loss": 5.6915, + "step": 6285 + }, + { + "epoch": 0.037384622704348655, + "grad_norm": 2.7072317600250244, + "learning_rate": 4.982782961201457e-05, + "loss": 5.7535, + "step": 6286 + }, + { + "epoch": 0.03739056998762965, + "grad_norm": 2.626033067703247, + "learning_rate": 4.982777488294031e-05, + "loss": 5.6053, + "step": 6287 + }, + { + "epoch": 0.037396517270910645, + "grad_norm": 1.8426648378372192, + "learning_rate": 4.982772014519892e-05, + "loss": 5.6167, + "step": 6288 + }, + { + "epoch": 0.03740246455419165, + "grad_norm": 2.5587830543518066, + "learning_rate": 4.9827665398790445e-05, + "loss": 5.6442, + "step": 6289 + }, + { + "epoch": 0.03740841183747264, + "grad_norm": 2.6163039207458496, + "learning_rate": 4.9827610643714877e-05, + "loss": 5.699, + "step": 6290 + }, + { + "epoch": 0.03741435912075364, + "grad_norm": 2.5752358436584473, + "learning_rate": 4.982755587997225e-05, + "loss": 5.666, + "step": 6291 + }, + { + "epoch": 0.03742030640403464, + "grad_norm": 2.6609575748443604, + "learning_rate": 4.982750110756258e-05, + "loss": 5.5634, + "step": 6292 + }, + { + "epoch": 0.037426253687315635, + "grad_norm": 2.724731683731079, + "learning_rate": 4.9827446326485884e-05, + "loss": 5.6259, + "step": 6293 + }, + { + "epoch": 0.03743220097059663, + "grad_norm": 2.5849807262420654, + "learning_rate": 4.9827391536742185e-05, + "loss": 5.6182, + "step": 6294 + }, + { + "epoch": 0.03743814825387763, + "grad_norm": 2.6737449169158936, + "learning_rate": 4.9827336738331496e-05, + "loss": 5.5426, + "step": 6295 + }, + { + "epoch": 0.03744409553715863, + "grad_norm": 2.5739669799804688, + "learning_rate": 4.9827281931253844e-05, + "loss": 5.6283, + "step": 6296 + }, + { + "epoch": 0.03745004282043962, + "grad_norm": 2.652730703353882, + "learning_rate": 4.982722711550924e-05, + "loss": 5.5241, + "step": 6297 + }, + { + "epoch": 0.037455990103720624, + "grad_norm": 2.7140653133392334, + "learning_rate": 4.982717229109772e-05, + "loss": 5.7052, + "step": 6298 + }, + { + "epoch": 0.03746193738700162, + "grad_norm": 2.1617860794067383, + "learning_rate": 4.982711745801928e-05, + "loss": 5.6224, + "step": 6299 + }, + { + "epoch": 0.037467884670282614, + "grad_norm": 2.1400585174560547, + "learning_rate": 4.982706261627395e-05, + "loss": 5.5753, + "step": 6300 + }, + { + "epoch": 0.03747383195356361, + "grad_norm": 2.4439101219177246, + "learning_rate": 4.9827007765861754e-05, + "loss": 5.6219, + "step": 6301 + }, + { + "epoch": 0.03747977923684461, + "grad_norm": 2.507141351699829, + "learning_rate": 4.9826952906782697e-05, + "loss": 5.6666, + "step": 6302 + }, + { + "epoch": 0.037485726520125606, + "grad_norm": 2.2664029598236084, + "learning_rate": 4.982689803903682e-05, + "loss": 5.7792, + "step": 6303 + }, + { + "epoch": 0.0374916738034066, + "grad_norm": 2.49678635597229, + "learning_rate": 4.982684316262411e-05, + "loss": 5.5899, + "step": 6304 + }, + { + "epoch": 0.0374976210866876, + "grad_norm": 2.244603395462036, + "learning_rate": 4.9826788277544625e-05, + "loss": 5.4624, + "step": 6305 + }, + { + "epoch": 0.0375035683699686, + "grad_norm": 2.144343376159668, + "learning_rate": 4.9826733383798366e-05, + "loss": 5.3428, + "step": 6306 + }, + { + "epoch": 0.037509515653249594, + "grad_norm": 1.7709565162658691, + "learning_rate": 4.982667848138534e-05, + "loss": 5.3596, + "step": 6307 + }, + { + "epoch": 0.037515462936530596, + "grad_norm": 2.0245232582092285, + "learning_rate": 4.9826623570305574e-05, + "loss": 5.4005, + "step": 6308 + }, + { + "epoch": 0.03752141021981159, + "grad_norm": 2.5346829891204834, + "learning_rate": 4.9826568650559095e-05, + "loss": 5.5089, + "step": 6309 + }, + { + "epoch": 0.037527357503092586, + "grad_norm": 2.638684034347534, + "learning_rate": 4.982651372214592e-05, + "loss": 5.6847, + "step": 6310 + }, + { + "epoch": 0.03753330478637359, + "grad_norm": 2.024423122406006, + "learning_rate": 4.982645878506606e-05, + "loss": 5.3633, + "step": 6311 + }, + { + "epoch": 0.03753925206965458, + "grad_norm": 1.983167290687561, + "learning_rate": 4.982640383931955e-05, + "loss": 5.2086, + "step": 6312 + }, + { + "epoch": 0.03754519935293558, + "grad_norm": 1.8388524055480957, + "learning_rate": 4.982634888490639e-05, + "loss": 5.1904, + "step": 6313 + }, + { + "epoch": 0.03755114663621657, + "grad_norm": 1.8280584812164307, + "learning_rate": 4.982629392182661e-05, + "loss": 5.3072, + "step": 6314 + }, + { + "epoch": 0.037557093919497575, + "grad_norm": 1.6278408765792847, + "learning_rate": 4.982623895008023e-05, + "loss": 5.3003, + "step": 6315 + }, + { + "epoch": 0.03756304120277857, + "grad_norm": 2.0519096851348877, + "learning_rate": 4.982618396966726e-05, + "loss": 5.3494, + "step": 6316 + }, + { + "epoch": 0.037568988486059565, + "grad_norm": 1.935744285583496, + "learning_rate": 4.982612898058773e-05, + "loss": 5.6993, + "step": 6317 + }, + { + "epoch": 0.03757493576934057, + "grad_norm": 1.882163166999817, + "learning_rate": 4.9826073982841656e-05, + "loss": 5.758, + "step": 6318 + }, + { + "epoch": 0.03758088305262156, + "grad_norm": 1.7747882604599, + "learning_rate": 4.982601897642906e-05, + "loss": 5.1501, + "step": 6319 + }, + { + "epoch": 0.03758683033590256, + "grad_norm": 2.044093370437622, + "learning_rate": 4.982596396134995e-05, + "loss": 5.2801, + "step": 6320 + }, + { + "epoch": 0.03759277761918356, + "grad_norm": 1.739441990852356, + "learning_rate": 4.9825908937604346e-05, + "loss": 5.1619, + "step": 6321 + }, + { + "epoch": 0.037598724902464555, + "grad_norm": 2.0353312492370605, + "learning_rate": 4.982585390519229e-05, + "loss": 5.6796, + "step": 6322 + }, + { + "epoch": 0.03760467218574555, + "grad_norm": 2.076667308807373, + "learning_rate": 4.9825798864113774e-05, + "loss": 6.2522, + "step": 6323 + }, + { + "epoch": 0.03761061946902655, + "grad_norm": 2.773676633834839, + "learning_rate": 4.982574381436883e-05, + "loss": 5.879, + "step": 6324 + }, + { + "epoch": 0.03761656675230755, + "grad_norm": 2.2013933658599854, + "learning_rate": 4.982568875595748e-05, + "loss": 6.0341, + "step": 6325 + }, + { + "epoch": 0.03762251403558854, + "grad_norm": 2.288806915283203, + "learning_rate": 4.9825633688879736e-05, + "loss": 6.219, + "step": 6326 + }, + { + "epoch": 0.037628461318869544, + "grad_norm": 2.874372720718384, + "learning_rate": 4.982557861313561e-05, + "loss": 5.7616, + "step": 6327 + }, + { + "epoch": 0.03763440860215054, + "grad_norm": 2.7471537590026855, + "learning_rate": 4.982552352872515e-05, + "loss": 5.7214, + "step": 6328 + }, + { + "epoch": 0.037640355885431534, + "grad_norm": 2.475513458251953, + "learning_rate": 4.982546843564834e-05, + "loss": 6.0039, + "step": 6329 + }, + { + "epoch": 0.03764630316871253, + "grad_norm": 2.5376412868499756, + "learning_rate": 4.982541333390523e-05, + "loss": 6.3042, + "step": 6330 + }, + { + "epoch": 0.03765225045199353, + "grad_norm": 2.599989414215088, + "learning_rate": 4.9825358223495814e-05, + "loss": 6.488, + "step": 6331 + }, + { + "epoch": 0.037658197735274526, + "grad_norm": 2.2657089233398438, + "learning_rate": 4.9825303104420115e-05, + "loss": 6.2743, + "step": 6332 + }, + { + "epoch": 0.03766414501855552, + "grad_norm": 2.303926467895508, + "learning_rate": 4.982524797667818e-05, + "loss": 6.3888, + "step": 6333 + }, + { + "epoch": 0.03767009230183652, + "grad_norm": 2.771775007247925, + "learning_rate": 4.982519284026999e-05, + "loss": 6.0911, + "step": 6334 + }, + { + "epoch": 0.03767603958511752, + "grad_norm": 2.492748260498047, + "learning_rate": 4.982513769519559e-05, + "loss": 5.9905, + "step": 6335 + }, + { + "epoch": 0.03768198686839851, + "grad_norm": 2.294985771179199, + "learning_rate": 4.982508254145498e-05, + "loss": 6.4574, + "step": 6336 + }, + { + "epoch": 0.037687934151679515, + "grad_norm": 2.6514554023742676, + "learning_rate": 4.9825027379048205e-05, + "loss": 6.1541, + "step": 6337 + }, + { + "epoch": 0.03769388143496051, + "grad_norm": 2.0114963054656982, + "learning_rate": 4.982497220797526e-05, + "loss": 6.0602, + "step": 6338 + }, + { + "epoch": 0.037699828718241506, + "grad_norm": 2.6345295906066895, + "learning_rate": 4.982491702823618e-05, + "loss": 6.024, + "step": 6339 + }, + { + "epoch": 0.03770577600152251, + "grad_norm": 2.619980573654175, + "learning_rate": 4.982486183983097e-05, + "loss": 6.0642, + "step": 6340 + }, + { + "epoch": 0.0377117232848035, + "grad_norm": 2.491279125213623, + "learning_rate": 4.9824806642759664e-05, + "loss": 5.8517, + "step": 6341 + }, + { + "epoch": 0.0377176705680845, + "grad_norm": 2.5161385536193848, + "learning_rate": 4.982475143702227e-05, + "loss": 5.7467, + "step": 6342 + }, + { + "epoch": 0.03772361785136549, + "grad_norm": 2.3237602710723877, + "learning_rate": 4.982469622261882e-05, + "loss": 5.801, + "step": 6343 + }, + { + "epoch": 0.037729565134646495, + "grad_norm": 2.21382999420166, + "learning_rate": 4.9824640999549314e-05, + "loss": 5.968, + "step": 6344 + }, + { + "epoch": 0.03773551241792749, + "grad_norm": 2.1770498752593994, + "learning_rate": 4.9824585767813794e-05, + "loss": 6.2998, + "step": 6345 + }, + { + "epoch": 0.037741459701208485, + "grad_norm": 2.321563720703125, + "learning_rate": 4.982453052741225e-05, + "loss": 5.631, + "step": 6346 + }, + { + "epoch": 0.03774740698448949, + "grad_norm": 3.2769439220428467, + "learning_rate": 4.982447527834473e-05, + "loss": 5.4845, + "step": 6347 + }, + { + "epoch": 0.03775335426777048, + "grad_norm": 2.954331874847412, + "learning_rate": 4.9824420020611244e-05, + "loss": 5.2, + "step": 6348 + }, + { + "epoch": 0.03775930155105148, + "grad_norm": 2.735182523727417, + "learning_rate": 4.98243647542118e-05, + "loss": 5.1907, + "step": 6349 + }, + { + "epoch": 0.03776524883433248, + "grad_norm": 2.872142791748047, + "learning_rate": 4.982430947914644e-05, + "loss": 5.5159, + "step": 6350 + }, + { + "epoch": 0.037771196117613474, + "grad_norm": 3.14219331741333, + "learning_rate": 4.982425419541517e-05, + "loss": 5.0843, + "step": 6351 + }, + { + "epoch": 0.03777714340089447, + "grad_norm": 2.2689874172210693, + "learning_rate": 4.9824198903018e-05, + "loss": 6.0446, + "step": 6352 + }, + { + "epoch": 0.03778309068417547, + "grad_norm": 2.3468856811523438, + "learning_rate": 4.982414360195496e-05, + "loss": 5.952, + "step": 6353 + }, + { + "epoch": 0.03778903796745647, + "grad_norm": 2.944509983062744, + "learning_rate": 4.9824088292226065e-05, + "loss": 5.4918, + "step": 6354 + }, + { + "epoch": 0.03779498525073746, + "grad_norm": 2.8139286041259766, + "learning_rate": 4.982403297383135e-05, + "loss": 5.3296, + "step": 6355 + }, + { + "epoch": 0.037800932534018464, + "grad_norm": 2.540224552154541, + "learning_rate": 4.982397764677081e-05, + "loss": 5.3464, + "step": 6356 + }, + { + "epoch": 0.03780687981729946, + "grad_norm": 2.56709885597229, + "learning_rate": 4.982392231104448e-05, + "loss": 5.2313, + "step": 6357 + }, + { + "epoch": 0.037812827100580454, + "grad_norm": 2.2051165103912354, + "learning_rate": 4.982386696665238e-05, + "loss": 5.7783, + "step": 6358 + }, + { + "epoch": 0.03781877438386145, + "grad_norm": 2.5773870944976807, + "learning_rate": 4.9823811613594515e-05, + "loss": 5.6691, + "step": 6359 + }, + { + "epoch": 0.03782472166714245, + "grad_norm": 2.5163073539733887, + "learning_rate": 4.982375625187092e-05, + "loss": 5.7936, + "step": 6360 + }, + { + "epoch": 0.037830668950423446, + "grad_norm": 2.4268851280212402, + "learning_rate": 4.98237008814816e-05, + "loss": 5.8116, + "step": 6361 + }, + { + "epoch": 0.03783661623370444, + "grad_norm": 2.397402286529541, + "learning_rate": 4.9823645502426597e-05, + "loss": 5.9895, + "step": 6362 + }, + { + "epoch": 0.03784256351698544, + "grad_norm": 2.590672731399536, + "learning_rate": 4.98235901147059e-05, + "loss": 5.9022, + "step": 6363 + }, + { + "epoch": 0.03784851080026644, + "grad_norm": 2.268540859222412, + "learning_rate": 4.9823534718319557e-05, + "loss": 5.8958, + "step": 6364 + }, + { + "epoch": 0.03785445808354743, + "grad_norm": 2.1419460773468018, + "learning_rate": 4.982347931326757e-05, + "loss": 5.8446, + "step": 6365 + }, + { + "epoch": 0.037860405366828435, + "grad_norm": 2.3988053798675537, + "learning_rate": 4.9823423899549957e-05, + "loss": 6.2267, + "step": 6366 + }, + { + "epoch": 0.03786635265010943, + "grad_norm": 2.120121955871582, + "learning_rate": 4.9823368477166755e-05, + "loss": 6.1352, + "step": 6367 + }, + { + "epoch": 0.037872299933390426, + "grad_norm": 2.274610996246338, + "learning_rate": 4.982331304611796e-05, + "loss": 6.1342, + "step": 6368 + }, + { + "epoch": 0.03787824721667143, + "grad_norm": 1.6934765577316284, + "learning_rate": 4.98232576064036e-05, + "loss": 5.7969, + "step": 6369 + }, + { + "epoch": 0.03788419449995242, + "grad_norm": 2.62416672706604, + "learning_rate": 4.982320215802371e-05, + "loss": 5.9669, + "step": 6370 + }, + { + "epoch": 0.03789014178323342, + "grad_norm": 2.416639804840088, + "learning_rate": 4.98231467009783e-05, + "loss": 5.9628, + "step": 6371 + }, + { + "epoch": 0.03789608906651441, + "grad_norm": 2.049412965774536, + "learning_rate": 4.9823091235267375e-05, + "loss": 5.658, + "step": 6372 + }, + { + "epoch": 0.037902036349795415, + "grad_norm": 2.0502147674560547, + "learning_rate": 4.982303576089097e-05, + "loss": 5.9114, + "step": 6373 + }, + { + "epoch": 0.03790798363307641, + "grad_norm": 2.1566948890686035, + "learning_rate": 4.982298027784909e-05, + "loss": 5.6932, + "step": 6374 + }, + { + "epoch": 0.037913930916357405, + "grad_norm": 2.394083261489868, + "learning_rate": 4.9822924786141774e-05, + "loss": 6.3041, + "step": 6375 + }, + { + "epoch": 0.03791987819963841, + "grad_norm": 2.545910120010376, + "learning_rate": 4.9822869285769024e-05, + "loss": 6.2125, + "step": 6376 + }, + { + "epoch": 0.0379258254829194, + "grad_norm": 2.271461248397827, + "learning_rate": 4.9822813776730875e-05, + "loss": 6.2322, + "step": 6377 + }, + { + "epoch": 0.0379317727662004, + "grad_norm": 2.3840630054473877, + "learning_rate": 4.9822758259027336e-05, + "loss": 6.0167, + "step": 6378 + }, + { + "epoch": 0.0379377200494814, + "grad_norm": 2.600618600845337, + "learning_rate": 4.9822702732658426e-05, + "loss": 5.6722, + "step": 6379 + }, + { + "epoch": 0.037943667332762394, + "grad_norm": 2.0911965370178223, + "learning_rate": 4.982264719762417e-05, + "loss": 5.579, + "step": 6380 + }, + { + "epoch": 0.03794961461604339, + "grad_norm": 2.015505075454712, + "learning_rate": 4.9822591653924575e-05, + "loss": 5.9747, + "step": 6381 + }, + { + "epoch": 0.03795556189932439, + "grad_norm": 2.237262010574341, + "learning_rate": 4.982253610155968e-05, + "loss": 6.3792, + "step": 6382 + }, + { + "epoch": 0.03796150918260539, + "grad_norm": 2.1448137760162354, + "learning_rate": 4.982248054052949e-05, + "loss": 6.1049, + "step": 6383 + }, + { + "epoch": 0.03796745646588638, + "grad_norm": 2.2597758769989014, + "learning_rate": 4.9822424970834034e-05, + "loss": 5.8428, + "step": 6384 + }, + { + "epoch": 0.037973403749167384, + "grad_norm": 1.9935969114303589, + "learning_rate": 4.982236939247332e-05, + "loss": 6.0032, + "step": 6385 + }, + { + "epoch": 0.03797935103244838, + "grad_norm": 2.506916046142578, + "learning_rate": 4.982231380544737e-05, + "loss": 5.9221, + "step": 6386 + }, + { + "epoch": 0.037985298315729374, + "grad_norm": 2.083393096923828, + "learning_rate": 4.9822258209756214e-05, + "loss": 5.8862, + "step": 6387 + }, + { + "epoch": 0.03799124559901037, + "grad_norm": 2.631091594696045, + "learning_rate": 4.982220260539987e-05, + "loss": 5.6593, + "step": 6388 + }, + { + "epoch": 0.03799719288229137, + "grad_norm": 2.5732531547546387, + "learning_rate": 4.982214699237834e-05, + "loss": 5.5084, + "step": 6389 + }, + { + "epoch": 0.038003140165572366, + "grad_norm": 2.7797791957855225, + "learning_rate": 4.982209137069166e-05, + "loss": 5.6792, + "step": 6390 + }, + { + "epoch": 0.03800908744885336, + "grad_norm": 2.2800772190093994, + "learning_rate": 4.982203574033984e-05, + "loss": 5.6299, + "step": 6391 + }, + { + "epoch": 0.03801503473213436, + "grad_norm": 2.4182863235473633, + "learning_rate": 4.9821980101322905e-05, + "loss": 5.71, + "step": 6392 + }, + { + "epoch": 0.03802098201541536, + "grad_norm": 2.2968835830688477, + "learning_rate": 4.982192445364088e-05, + "loss": 5.6112, + "step": 6393 + }, + { + "epoch": 0.03802692929869635, + "grad_norm": 2.3713324069976807, + "learning_rate": 4.982186879729377e-05, + "loss": 5.423, + "step": 6394 + }, + { + "epoch": 0.038032876581977355, + "grad_norm": 2.745352268218994, + "learning_rate": 4.98218131322816e-05, + "loss": 5.5145, + "step": 6395 + }, + { + "epoch": 0.03803882386525835, + "grad_norm": 2.755211353302002, + "learning_rate": 4.98217574586044e-05, + "loss": 5.4399, + "step": 6396 + }, + { + "epoch": 0.038044771148539346, + "grad_norm": 2.5452096462249756, + "learning_rate": 4.982170177626217e-05, + "loss": 5.5691, + "step": 6397 + }, + { + "epoch": 0.03805071843182035, + "grad_norm": 2.6195876598358154, + "learning_rate": 4.9821646085254954e-05, + "loss": 5.4512, + "step": 6398 + }, + { + "epoch": 0.03805666571510134, + "grad_norm": 2.4931671619415283, + "learning_rate": 4.982159038558275e-05, + "loss": 6.0505, + "step": 6399 + }, + { + "epoch": 0.03806261299838234, + "grad_norm": 2.45062255859375, + "learning_rate": 4.982153467724558e-05, + "loss": 6.2367, + "step": 6400 + }, + { + "epoch": 0.03806856028166333, + "grad_norm": 2.688624620437622, + "learning_rate": 4.982147896024348e-05, + "loss": 6.0522, + "step": 6401 + }, + { + "epoch": 0.038074507564944335, + "grad_norm": 2.421660900115967, + "learning_rate": 4.982142323457645e-05, + "loss": 5.8166, + "step": 6402 + }, + { + "epoch": 0.03808045484822533, + "grad_norm": 2.594134569168091, + "learning_rate": 4.982136750024452e-05, + "loss": 5.5476, + "step": 6403 + }, + { + "epoch": 0.038086402131506325, + "grad_norm": 2.4492971897125244, + "learning_rate": 4.982131175724771e-05, + "loss": 5.2302, + "step": 6404 + }, + { + "epoch": 0.03809234941478733, + "grad_norm": 2.4200360774993896, + "learning_rate": 4.9821256005586036e-05, + "loss": 6.1404, + "step": 6405 + }, + { + "epoch": 0.03809829669806832, + "grad_norm": 2.1949775218963623, + "learning_rate": 4.982120024525951e-05, + "loss": 5.9589, + "step": 6406 + }, + { + "epoch": 0.03810424398134932, + "grad_norm": 2.3570375442504883, + "learning_rate": 4.9821144476268164e-05, + "loss": 5.9022, + "step": 6407 + }, + { + "epoch": 0.03811019126463032, + "grad_norm": 2.16460919380188, + "learning_rate": 4.9821088698612016e-05, + "loss": 5.8535, + "step": 6408 + }, + { + "epoch": 0.038116138547911314, + "grad_norm": 1.8189443349838257, + "learning_rate": 4.982103291229108e-05, + "loss": 5.9345, + "step": 6409 + }, + { + "epoch": 0.03812208583119231, + "grad_norm": 2.553919792175293, + "learning_rate": 4.9820977117305376e-05, + "loss": 5.31, + "step": 6410 + }, + { + "epoch": 0.03812803311447331, + "grad_norm": 2.8085403442382812, + "learning_rate": 4.982092131365493e-05, + "loss": 4.9902, + "step": 6411 + }, + { + "epoch": 0.03813398039775431, + "grad_norm": 2.3698999881744385, + "learning_rate": 4.982086550133976e-05, + "loss": 5.4982, + "step": 6412 + }, + { + "epoch": 0.0381399276810353, + "grad_norm": 1.996026873588562, + "learning_rate": 4.9820809680359876e-05, + "loss": 5.6556, + "step": 6413 + }, + { + "epoch": 0.038145874964316304, + "grad_norm": 2.0816900730133057, + "learning_rate": 4.9820753850715305e-05, + "loss": 5.8823, + "step": 6414 + }, + { + "epoch": 0.0381518222475973, + "grad_norm": 2.282745122909546, + "learning_rate": 4.982069801240606e-05, + "loss": 5.1641, + "step": 6415 + }, + { + "epoch": 0.038157769530878294, + "grad_norm": 2.043991804122925, + "learning_rate": 4.982064216543217e-05, + "loss": 5.7569, + "step": 6416 + }, + { + "epoch": 0.03816371681415929, + "grad_norm": 2.086071014404297, + "learning_rate": 4.982058630979365e-05, + "loss": 5.9586, + "step": 6417 + }, + { + "epoch": 0.03816966409744029, + "grad_norm": 2.295060873031616, + "learning_rate": 4.9820530445490525e-05, + "loss": 5.3733, + "step": 6418 + }, + { + "epoch": 0.038175611380721286, + "grad_norm": 2.512267827987671, + "learning_rate": 4.98204745725228e-05, + "loss": 5.0399, + "step": 6419 + }, + { + "epoch": 0.03818155866400228, + "grad_norm": 2.5434467792510986, + "learning_rate": 4.982041869089051e-05, + "loss": 4.7907, + "step": 6420 + }, + { + "epoch": 0.03818750594728328, + "grad_norm": 2.4192142486572266, + "learning_rate": 4.9820362800593666e-05, + "loss": 4.9116, + "step": 6421 + }, + { + "epoch": 0.03819345323056428, + "grad_norm": 2.867542028427124, + "learning_rate": 4.9820306901632296e-05, + "loss": 5.9905, + "step": 6422 + }, + { + "epoch": 0.03819940051384527, + "grad_norm": 2.3099327087402344, + "learning_rate": 4.982025099400641e-05, + "loss": 5.9319, + "step": 6423 + }, + { + "epoch": 0.038205347797126275, + "grad_norm": 2.28169584274292, + "learning_rate": 4.9820195077716026e-05, + "loss": 6.2533, + "step": 6424 + }, + { + "epoch": 0.03821129508040727, + "grad_norm": 2.1065595149993896, + "learning_rate": 4.9820139152761167e-05, + "loss": 5.7123, + "step": 6425 + }, + { + "epoch": 0.038217242363688265, + "grad_norm": 2.0210213661193848, + "learning_rate": 4.9820083219141865e-05, + "loss": 5.7758, + "step": 6426 + }, + { + "epoch": 0.03822318964696927, + "grad_norm": 1.6545369625091553, + "learning_rate": 4.9820027276858114e-05, + "loss": 5.6792, + "step": 6427 + }, + { + "epoch": 0.03822913693025026, + "grad_norm": 2.177621841430664, + "learning_rate": 4.981997132590996e-05, + "loss": 6.0167, + "step": 6428 + }, + { + "epoch": 0.03823508421353126, + "grad_norm": 2.3910553455352783, + "learning_rate": 4.981991536629741e-05, + "loss": 6.1161, + "step": 6429 + }, + { + "epoch": 0.03824103149681225, + "grad_norm": 2.4915859699249268, + "learning_rate": 4.981985939802047e-05, + "loss": 5.6449, + "step": 6430 + }, + { + "epoch": 0.038246978780093255, + "grad_norm": 2.0343215465545654, + "learning_rate": 4.981980342107919e-05, + "loss": 5.967, + "step": 6431 + }, + { + "epoch": 0.03825292606337425, + "grad_norm": 1.8326199054718018, + "learning_rate": 4.9819747435473565e-05, + "loss": 5.9183, + "step": 6432 + }, + { + "epoch": 0.038258873346655245, + "grad_norm": 2.1482350826263428, + "learning_rate": 4.981969144120362e-05, + "loss": 5.794, + "step": 6433 + }, + { + "epoch": 0.03826482062993625, + "grad_norm": 2.346355438232422, + "learning_rate": 4.9819635438269384e-05, + "loss": 5.6775, + "step": 6434 + }, + { + "epoch": 0.03827076791321724, + "grad_norm": 2.252150774002075, + "learning_rate": 4.981957942667087e-05, + "loss": 5.9383, + "step": 6435 + }, + { + "epoch": 0.03827671519649824, + "grad_norm": 2.1851654052734375, + "learning_rate": 4.981952340640809e-05, + "loss": 6.0555, + "step": 6436 + }, + { + "epoch": 0.03828266247977924, + "grad_norm": 2.0609381198883057, + "learning_rate": 4.9819467377481076e-05, + "loss": 6.3209, + "step": 6437 + }, + { + "epoch": 0.038288609763060234, + "grad_norm": 2.4882800579071045, + "learning_rate": 4.981941133988984e-05, + "loss": 6.2411, + "step": 6438 + }, + { + "epoch": 0.03829455704634123, + "grad_norm": 1.8794118165969849, + "learning_rate": 4.981935529363441e-05, + "loss": 5.5696, + "step": 6439 + }, + { + "epoch": 0.03830050432962223, + "grad_norm": 2.542656660079956, + "learning_rate": 4.981929923871479e-05, + "loss": 5.8106, + "step": 6440 + }, + { + "epoch": 0.038306451612903226, + "grad_norm": 2.3871288299560547, + "learning_rate": 4.981924317513101e-05, + "loss": 5.6354, + "step": 6441 + }, + { + "epoch": 0.03831239889618422, + "grad_norm": 2.4628939628601074, + "learning_rate": 4.981918710288309e-05, + "loss": 5.9695, + "step": 6442 + }, + { + "epoch": 0.038318346179465224, + "grad_norm": 2.908543586730957, + "learning_rate": 4.9819131021971056e-05, + "loss": 5.2742, + "step": 6443 + }, + { + "epoch": 0.03832429346274622, + "grad_norm": 3.353813886642456, + "learning_rate": 4.9819074932394916e-05, + "loss": 5.3823, + "step": 6444 + }, + { + "epoch": 0.038330240746027214, + "grad_norm": 2.5253870487213135, + "learning_rate": 4.981901883415469e-05, + "loss": 5.7, + "step": 6445 + }, + { + "epoch": 0.03833618802930821, + "grad_norm": 2.3375632762908936, + "learning_rate": 4.98189627272504e-05, + "loss": 5.2862, + "step": 6446 + }, + { + "epoch": 0.03834213531258921, + "grad_norm": 2.534599542617798, + "learning_rate": 4.981890661168207e-05, + "loss": 5.3961, + "step": 6447 + }, + { + "epoch": 0.038348082595870206, + "grad_norm": 2.383511781692505, + "learning_rate": 4.9818850487449716e-05, + "loss": 6.4658, + "step": 6448 + }, + { + "epoch": 0.0383540298791512, + "grad_norm": 2.2824161052703857, + "learning_rate": 4.981879435455336e-05, + "loss": 5.5221, + "step": 6449 + }, + { + "epoch": 0.0383599771624322, + "grad_norm": 2.355271100997925, + "learning_rate": 4.981873821299301e-05, + "loss": 5.5054, + "step": 6450 + }, + { + "epoch": 0.0383659244457132, + "grad_norm": 2.0071253776550293, + "learning_rate": 4.981868206276871e-05, + "loss": 5.5911, + "step": 6451 + }, + { + "epoch": 0.03837187172899419, + "grad_norm": 2.2770705223083496, + "learning_rate": 4.9818625903880445e-05, + "loss": 5.8978, + "step": 6452 + }, + { + "epoch": 0.038377819012275195, + "grad_norm": 2.2425332069396973, + "learning_rate": 4.981856973632827e-05, + "loss": 6.3189, + "step": 6453 + }, + { + "epoch": 0.03838376629555619, + "grad_norm": 2.300560235977173, + "learning_rate": 4.981851356011218e-05, + "loss": 5.745, + "step": 6454 + }, + { + "epoch": 0.038389713578837185, + "grad_norm": 2.4516983032226562, + "learning_rate": 4.981845737523221e-05, + "loss": 5.8978, + "step": 6455 + }, + { + "epoch": 0.03839566086211819, + "grad_norm": 2.3463354110717773, + "learning_rate": 4.981840118168837e-05, + "loss": 5.668, + "step": 6456 + }, + { + "epoch": 0.03840160814539918, + "grad_norm": 2.623608112335205, + "learning_rate": 4.981834497948068e-05, + "loss": 5.471, + "step": 6457 + }, + { + "epoch": 0.03840755542868018, + "grad_norm": 2.441089391708374, + "learning_rate": 4.9818288768609166e-05, + "loss": 5.0986, + "step": 6458 + }, + { + "epoch": 0.03841350271196117, + "grad_norm": 2.597635507583618, + "learning_rate": 4.981823254907384e-05, + "loss": 5.1046, + "step": 6459 + }, + { + "epoch": 0.038419449995242175, + "grad_norm": 2.344855785369873, + "learning_rate": 4.9818176320874727e-05, + "loss": 5.8878, + "step": 6460 + }, + { + "epoch": 0.03842539727852317, + "grad_norm": 2.2569222450256348, + "learning_rate": 4.981812008401184e-05, + "loss": 5.342, + "step": 6461 + }, + { + "epoch": 0.038431344561804165, + "grad_norm": 2.276780843734741, + "learning_rate": 4.981806383848522e-05, + "loss": 5.566, + "step": 6462 + }, + { + "epoch": 0.03843729184508517, + "grad_norm": 2.1354174613952637, + "learning_rate": 4.9818007584294856e-05, + "loss": 5.8678, + "step": 6463 + }, + { + "epoch": 0.03844323912836616, + "grad_norm": 2.164092779159546, + "learning_rate": 4.981795132144078e-05, + "loss": 5.7937, + "step": 6464 + }, + { + "epoch": 0.03844918641164716, + "grad_norm": 2.3034324645996094, + "learning_rate": 4.981789504992303e-05, + "loss": 5.843, + "step": 6465 + }, + { + "epoch": 0.03845513369492816, + "grad_norm": 1.9616999626159668, + "learning_rate": 4.9817838769741584e-05, + "loss": 6.0563, + "step": 6466 + }, + { + "epoch": 0.038461080978209154, + "grad_norm": 2.2784626483917236, + "learning_rate": 4.9817782480896505e-05, + "loss": 6.4152, + "step": 6467 + }, + { + "epoch": 0.03846702826149015, + "grad_norm": 1.8581526279449463, + "learning_rate": 4.981772618338779e-05, + "loss": 5.9833, + "step": 6468 + }, + { + "epoch": 0.03847297554477115, + "grad_norm": 2.2493395805358887, + "learning_rate": 4.9817669877215466e-05, + "loss": 6.2985, + "step": 6469 + }, + { + "epoch": 0.038478922828052146, + "grad_norm": 2.289125919342041, + "learning_rate": 4.981761356237955e-05, + "loss": 5.8555, + "step": 6470 + }, + { + "epoch": 0.03848487011133314, + "grad_norm": 2.11012601852417, + "learning_rate": 4.981755723888006e-05, + "loss": 6.6137, + "step": 6471 + }, + { + "epoch": 0.038490817394614144, + "grad_norm": 2.1793103218078613, + "learning_rate": 4.981750090671702e-05, + "loss": 6.0117, + "step": 6472 + }, + { + "epoch": 0.03849676467789514, + "grad_norm": 2.1857750415802, + "learning_rate": 4.9817444565890436e-05, + "loss": 5.9877, + "step": 6473 + }, + { + "epoch": 0.038502711961176134, + "grad_norm": 1.7430874109268188, + "learning_rate": 4.981738821640035e-05, + "loss": 5.829, + "step": 6474 + }, + { + "epoch": 0.03850865924445713, + "grad_norm": 1.8017771244049072, + "learning_rate": 4.981733185824676e-05, + "loss": 6.3853, + "step": 6475 + }, + { + "epoch": 0.03851460652773813, + "grad_norm": 2.1420724391937256, + "learning_rate": 4.9817275491429705e-05, + "loss": 5.982, + "step": 6476 + }, + { + "epoch": 0.038520553811019126, + "grad_norm": 2.441521167755127, + "learning_rate": 4.9817219115949195e-05, + "loss": 6.1159, + "step": 6477 + }, + { + "epoch": 0.03852650109430012, + "grad_norm": 2.158682346343994, + "learning_rate": 4.9817162731805246e-05, + "loss": 6.1306, + "step": 6478 + }, + { + "epoch": 0.03853244837758112, + "grad_norm": 2.154538869857788, + "learning_rate": 4.9817106338997884e-05, + "loss": 6.0745, + "step": 6479 + }, + { + "epoch": 0.03853839566086212, + "grad_norm": 2.077674388885498, + "learning_rate": 4.981704993752713e-05, + "loss": 6.2171, + "step": 6480 + }, + { + "epoch": 0.03854434294414311, + "grad_norm": 2.181500196456909, + "learning_rate": 4.981699352739299e-05, + "loss": 6.228, + "step": 6481 + }, + { + "epoch": 0.038550290227424115, + "grad_norm": 2.678189992904663, + "learning_rate": 4.98169371085955e-05, + "loss": 5.965, + "step": 6482 + }, + { + "epoch": 0.03855623751070511, + "grad_norm": 2.713480234146118, + "learning_rate": 4.981688068113467e-05, + "loss": 5.9078, + "step": 6483 + }, + { + "epoch": 0.038562184793986105, + "grad_norm": 2.4872853755950928, + "learning_rate": 4.981682424501053e-05, + "loss": 5.7525, + "step": 6484 + }, + { + "epoch": 0.03856813207726711, + "grad_norm": 2.274711847305298, + "learning_rate": 4.98167678002231e-05, + "loss": 5.9193, + "step": 6485 + }, + { + "epoch": 0.0385740793605481, + "grad_norm": 2.4730162620544434, + "learning_rate": 4.981671134677238e-05, + "loss": 6.2961, + "step": 6486 + }, + { + "epoch": 0.0385800266438291, + "grad_norm": 1.7856062650680542, + "learning_rate": 4.9816654884658396e-05, + "loss": 5.9005, + "step": 6487 + }, + { + "epoch": 0.03858597392711009, + "grad_norm": 1.8812140226364136, + "learning_rate": 4.981659841388119e-05, + "loss": 5.9428, + "step": 6488 + }, + { + "epoch": 0.038591921210391095, + "grad_norm": 1.9963254928588867, + "learning_rate": 4.9816541934440756e-05, + "loss": 6.0136, + "step": 6489 + }, + { + "epoch": 0.03859786849367209, + "grad_norm": 2.741892099380493, + "learning_rate": 4.981648544633713e-05, + "loss": 6.5065, + "step": 6490 + }, + { + "epoch": 0.038603815776953085, + "grad_norm": 2.226672410964966, + "learning_rate": 4.981642894957032e-05, + "loss": 5.9705, + "step": 6491 + }, + { + "epoch": 0.03860976306023409, + "grad_norm": 2.015429973602295, + "learning_rate": 4.981637244414036e-05, + "loss": 6.1418, + "step": 6492 + }, + { + "epoch": 0.03861571034351508, + "grad_norm": 2.032304286956787, + "learning_rate": 4.981631593004725e-05, + "loss": 6.2104, + "step": 6493 + }, + { + "epoch": 0.03862165762679608, + "grad_norm": 2.0174217224121094, + "learning_rate": 4.981625940729102e-05, + "loss": 5.9861, + "step": 6494 + }, + { + "epoch": 0.03862760491007708, + "grad_norm": 1.9466323852539062, + "learning_rate": 4.98162028758717e-05, + "loss": 6.0958, + "step": 6495 + }, + { + "epoch": 0.038633552193358074, + "grad_norm": 1.6796106100082397, + "learning_rate": 4.9816146335789296e-05, + "loss": 6.0708, + "step": 6496 + }, + { + "epoch": 0.03863949947663907, + "grad_norm": 2.0496580600738525, + "learning_rate": 4.9816089787043826e-05, + "loss": 6.0137, + "step": 6497 + }, + { + "epoch": 0.03864544675992007, + "grad_norm": 2.5402488708496094, + "learning_rate": 4.9816033229635324e-05, + "loss": 6.1389, + "step": 6498 + }, + { + "epoch": 0.038651394043201066, + "grad_norm": 2.2701938152313232, + "learning_rate": 4.9815976663563795e-05, + "loss": 6.1277, + "step": 6499 + }, + { + "epoch": 0.03865734132648206, + "grad_norm": 2.328554630279541, + "learning_rate": 4.9815920088829273e-05, + "loss": 6.0402, + "step": 6500 + }, + { + "epoch": 0.038663288609763063, + "grad_norm": 2.1817965507507324, + "learning_rate": 4.981586350543176e-05, + "loss": 6.2732, + "step": 6501 + }, + { + "epoch": 0.03866923589304406, + "grad_norm": 2.4273757934570312, + "learning_rate": 4.981580691337129e-05, + "loss": 6.1842, + "step": 6502 + }, + { + "epoch": 0.038675183176325054, + "grad_norm": 2.1365530490875244, + "learning_rate": 4.981575031264787e-05, + "loss": 6.1527, + "step": 6503 + }, + { + "epoch": 0.03868113045960605, + "grad_norm": 2.2198991775512695, + "learning_rate": 4.981569370326154e-05, + "loss": 6.0841, + "step": 6504 + }, + { + "epoch": 0.03868707774288705, + "grad_norm": 2.0078141689300537, + "learning_rate": 4.98156370852123e-05, + "loss": 6.0401, + "step": 6505 + }, + { + "epoch": 0.038693025026168046, + "grad_norm": 2.0243566036224365, + "learning_rate": 4.9815580458500184e-05, + "loss": 5.9111, + "step": 6506 + }, + { + "epoch": 0.03869897230944904, + "grad_norm": 2.3084707260131836, + "learning_rate": 4.98155238231252e-05, + "loss": 5.9865, + "step": 6507 + }, + { + "epoch": 0.03870491959273004, + "grad_norm": 1.8110517263412476, + "learning_rate": 4.981546717908738e-05, + "loss": 5.9132, + "step": 6508 + }, + { + "epoch": 0.03871086687601104, + "grad_norm": 2.2639706134796143, + "learning_rate": 4.981541052638673e-05, + "loss": 5.8195, + "step": 6509 + }, + { + "epoch": 0.03871681415929203, + "grad_norm": 2.2684152126312256, + "learning_rate": 4.981535386502327e-05, + "loss": 6.4894, + "step": 6510 + }, + { + "epoch": 0.038722761442573035, + "grad_norm": 2.363118886947632, + "learning_rate": 4.981529719499704e-05, + "loss": 6.1888, + "step": 6511 + }, + { + "epoch": 0.03872870872585403, + "grad_norm": 2.2158865928649902, + "learning_rate": 4.9815240516308045e-05, + "loss": 6.3361, + "step": 6512 + }, + { + "epoch": 0.038734656009135025, + "grad_norm": 2.096928834915161, + "learning_rate": 4.98151838289563e-05, + "loss": 5.8554, + "step": 6513 + }, + { + "epoch": 0.03874060329241603, + "grad_norm": 2.2228331565856934, + "learning_rate": 4.981512713294183e-05, + "loss": 5.9961, + "step": 6514 + }, + { + "epoch": 0.03874655057569702, + "grad_norm": 1.8646903038024902, + "learning_rate": 4.981507042826466e-05, + "loss": 6.1471, + "step": 6515 + }, + { + "epoch": 0.03875249785897802, + "grad_norm": 2.227267265319824, + "learning_rate": 4.98150137149248e-05, + "loss": 5.9655, + "step": 6516 + }, + { + "epoch": 0.03875844514225902, + "grad_norm": 2.6884701251983643, + "learning_rate": 4.981495699292228e-05, + "loss": 5.7958, + "step": 6517 + }, + { + "epoch": 0.038764392425540015, + "grad_norm": 2.953523635864258, + "learning_rate": 4.981490026225711e-05, + "loss": 5.8305, + "step": 6518 + }, + { + "epoch": 0.03877033970882101, + "grad_norm": 2.5009984970092773, + "learning_rate": 4.981484352292932e-05, + "loss": 5.7838, + "step": 6519 + }, + { + "epoch": 0.038776286992102005, + "grad_norm": 2.2291715145111084, + "learning_rate": 4.981478677493892e-05, + "loss": 5.7622, + "step": 6520 + }, + { + "epoch": 0.03878223427538301, + "grad_norm": 2.1492466926574707, + "learning_rate": 4.9814730018285935e-05, + "loss": 5.5379, + "step": 6521 + }, + { + "epoch": 0.038788181558664, + "grad_norm": 1.8914062976837158, + "learning_rate": 4.981467325297039e-05, + "loss": 5.8368, + "step": 6522 + }, + { + "epoch": 0.038794128841945, + "grad_norm": 2.301670789718628, + "learning_rate": 4.981461647899229e-05, + "loss": 5.9019, + "step": 6523 + }, + { + "epoch": 0.038800076125226, + "grad_norm": 2.2850520610809326, + "learning_rate": 4.981455969635167e-05, + "loss": 5.6616, + "step": 6524 + }, + { + "epoch": 0.038806023408506994, + "grad_norm": 2.4155313968658447, + "learning_rate": 4.9814502905048546e-05, + "loss": 5.7842, + "step": 6525 + }, + { + "epoch": 0.03881197069178799, + "grad_norm": 2.0731799602508545, + "learning_rate": 4.981444610508293e-05, + "loss": 6.084, + "step": 6526 + }, + { + "epoch": 0.03881791797506899, + "grad_norm": 2.990232229232788, + "learning_rate": 4.981438929645484e-05, + "loss": 5.2556, + "step": 6527 + }, + { + "epoch": 0.038823865258349986, + "grad_norm": 3.0814263820648193, + "learning_rate": 4.981433247916432e-05, + "loss": 5.1895, + "step": 6528 + }, + { + "epoch": 0.03882981254163098, + "grad_norm": 3.197000503540039, + "learning_rate": 4.9814275653211365e-05, + "loss": 4.9539, + "step": 6529 + }, + { + "epoch": 0.03883575982491198, + "grad_norm": 3.062098979949951, + "learning_rate": 4.9814218818596e-05, + "loss": 4.8417, + "step": 6530 + }, + { + "epoch": 0.03884170710819298, + "grad_norm": 3.092667579650879, + "learning_rate": 4.981416197531825e-05, + "loss": 5.0479, + "step": 6531 + }, + { + "epoch": 0.038847654391473974, + "grad_norm": 3.00508713722229, + "learning_rate": 4.981410512337813e-05, + "loss": 5.864, + "step": 6532 + }, + { + "epoch": 0.03885360167475497, + "grad_norm": 3.3760926723480225, + "learning_rate": 4.981404826277567e-05, + "loss": 6.5745, + "step": 6533 + }, + { + "epoch": 0.03885954895803597, + "grad_norm": 2.6170921325683594, + "learning_rate": 4.981399139351087e-05, + "loss": 5.7959, + "step": 6534 + }, + { + "epoch": 0.038865496241316966, + "grad_norm": 2.9855849742889404, + "learning_rate": 4.981393451558377e-05, + "loss": 4.9118, + "step": 6535 + }, + { + "epoch": 0.03887144352459796, + "grad_norm": 2.885373830795288, + "learning_rate": 4.981387762899438e-05, + "loss": 4.8342, + "step": 6536 + }, + { + "epoch": 0.03887739080787896, + "grad_norm": 2.6936960220336914, + "learning_rate": 4.981382073374272e-05, + "loss": 4.7323, + "step": 6537 + }, + { + "epoch": 0.03888333809115996, + "grad_norm": 2.7214853763580322, + "learning_rate": 4.981376382982882e-05, + "loss": 5.5414, + "step": 6538 + }, + { + "epoch": 0.03888928537444095, + "grad_norm": 2.449828863143921, + "learning_rate": 4.981370691725269e-05, + "loss": 5.6385, + "step": 6539 + }, + { + "epoch": 0.038895232657721955, + "grad_norm": 2.551046133041382, + "learning_rate": 4.981364999601434e-05, + "loss": 5.4699, + "step": 6540 + }, + { + "epoch": 0.03890117994100295, + "grad_norm": 2.1208136081695557, + "learning_rate": 4.981359306611381e-05, + "loss": 5.6674, + "step": 6541 + }, + { + "epoch": 0.038907127224283945, + "grad_norm": 2.4039392471313477, + "learning_rate": 4.9813536127551105e-05, + "loss": 6.1872, + "step": 6542 + }, + { + "epoch": 0.03891307450756495, + "grad_norm": 2.0119946002960205, + "learning_rate": 4.9813479180326256e-05, + "loss": 6.0917, + "step": 6543 + }, + { + "epoch": 0.03891902179084594, + "grad_norm": 3.2959303855895996, + "learning_rate": 4.9813422224439275e-05, + "loss": 5.5646, + "step": 6544 + }, + { + "epoch": 0.03892496907412694, + "grad_norm": 2.9011316299438477, + "learning_rate": 4.981336525989019e-05, + "loss": 5.5324, + "step": 6545 + }, + { + "epoch": 0.03893091635740794, + "grad_norm": 2.2984118461608887, + "learning_rate": 4.981330828667901e-05, + "loss": 5.4961, + "step": 6546 + }, + { + "epoch": 0.038936863640688935, + "grad_norm": 2.1745059490203857, + "learning_rate": 4.981325130480576e-05, + "loss": 5.6631, + "step": 6547 + }, + { + "epoch": 0.03894281092396993, + "grad_norm": 2.3001794815063477, + "learning_rate": 4.981319431427046e-05, + "loss": 5.5897, + "step": 6548 + }, + { + "epoch": 0.038948758207250925, + "grad_norm": 2.329446315765381, + "learning_rate": 4.9813137315073136e-05, + "loss": 5.4599, + "step": 6549 + }, + { + "epoch": 0.03895470549053193, + "grad_norm": 2.4700307846069336, + "learning_rate": 4.98130803072138e-05, + "loss": 5.2788, + "step": 6550 + }, + { + "epoch": 0.03896065277381292, + "grad_norm": 2.309767484664917, + "learning_rate": 4.9813023290692467e-05, + "loss": 5.3828, + "step": 6551 + }, + { + "epoch": 0.03896660005709392, + "grad_norm": 2.1923089027404785, + "learning_rate": 4.981296626550917e-05, + "loss": 5.225, + "step": 6552 + }, + { + "epoch": 0.03897254734037492, + "grad_norm": 2.424954652786255, + "learning_rate": 4.981290923166392e-05, + "loss": 5.2007, + "step": 6553 + }, + { + "epoch": 0.038978494623655914, + "grad_norm": 2.53446102142334, + "learning_rate": 4.981285218915674e-05, + "loss": 5.142, + "step": 6554 + }, + { + "epoch": 0.03898444190693691, + "grad_norm": 2.492788791656494, + "learning_rate": 4.9812795137987655e-05, + "loss": 5.5755, + "step": 6555 + }, + { + "epoch": 0.03899038919021791, + "grad_norm": 2.8081278800964355, + "learning_rate": 4.9812738078156674e-05, + "loss": 4.9815, + "step": 6556 + }, + { + "epoch": 0.038996336473498906, + "grad_norm": 2.535109758377075, + "learning_rate": 4.981268100966383e-05, + "loss": 5.3678, + "step": 6557 + }, + { + "epoch": 0.0390022837567799, + "grad_norm": 2.36004900932312, + "learning_rate": 4.981262393250913e-05, + "loss": 5.0422, + "step": 6558 + }, + { + "epoch": 0.0390082310400609, + "grad_norm": 2.2315657138824463, + "learning_rate": 4.98125668466926e-05, + "loss": 5.0345, + "step": 6559 + }, + { + "epoch": 0.0390141783233419, + "grad_norm": 2.293947696685791, + "learning_rate": 4.981250975221425e-05, + "loss": 4.9308, + "step": 6560 + }, + { + "epoch": 0.039020125606622894, + "grad_norm": 2.239915132522583, + "learning_rate": 4.9812452649074124e-05, + "loss": 5.3504, + "step": 6561 + }, + { + "epoch": 0.03902607288990389, + "grad_norm": 1.8740140199661255, + "learning_rate": 4.981239553727222e-05, + "loss": 5.9432, + "step": 6562 + }, + { + "epoch": 0.03903202017318489, + "grad_norm": 1.7221744060516357, + "learning_rate": 4.981233841680857e-05, + "loss": 5.8387, + "step": 6563 + }, + { + "epoch": 0.039037967456465886, + "grad_norm": 1.9648221731185913, + "learning_rate": 4.981228128768318e-05, + "loss": 5.7836, + "step": 6564 + }, + { + "epoch": 0.03904391473974688, + "grad_norm": 1.7790826559066772, + "learning_rate": 4.981222414989608e-05, + "loss": 5.842, + "step": 6565 + }, + { + "epoch": 0.03904986202302788, + "grad_norm": 2.039483070373535, + "learning_rate": 4.9812167003447296e-05, + "loss": 5.6509, + "step": 6566 + }, + { + "epoch": 0.03905580930630888, + "grad_norm": 2.1241865158081055, + "learning_rate": 4.981210984833684e-05, + "loss": 5.5626, + "step": 6567 + }, + { + "epoch": 0.03906175658958987, + "grad_norm": 2.1290524005889893, + "learning_rate": 4.981205268456473e-05, + "loss": 5.5114, + "step": 6568 + }, + { + "epoch": 0.039067703872870875, + "grad_norm": 2.181558132171631, + "learning_rate": 4.981199551213099e-05, + "loss": 5.5356, + "step": 6569 + }, + { + "epoch": 0.03907365115615187, + "grad_norm": 2.1696360111236572, + "learning_rate": 4.9811938331035635e-05, + "loss": 5.5684, + "step": 6570 + }, + { + "epoch": 0.039079598439432865, + "grad_norm": 1.8040674924850464, + "learning_rate": 4.98118811412787e-05, + "loss": 5.605, + "step": 6571 + }, + { + "epoch": 0.03908554572271387, + "grad_norm": 2.4475252628326416, + "learning_rate": 4.981182394286018e-05, + "loss": 6.4733, + "step": 6572 + }, + { + "epoch": 0.03909149300599486, + "grad_norm": 2.0800678730010986, + "learning_rate": 4.981176673578011e-05, + "loss": 5.5613, + "step": 6573 + }, + { + "epoch": 0.03909744028927586, + "grad_norm": 1.7632306814193726, + "learning_rate": 4.981170952003852e-05, + "loss": 5.5971, + "step": 6574 + }, + { + "epoch": 0.03910338757255686, + "grad_norm": 1.6671072244644165, + "learning_rate": 4.981165229563541e-05, + "loss": 5.4462, + "step": 6575 + }, + { + "epoch": 0.039109334855837855, + "grad_norm": 1.8972923755645752, + "learning_rate": 4.981159506257081e-05, + "loss": 5.7747, + "step": 6576 + }, + { + "epoch": 0.03911528213911885, + "grad_norm": 1.8343021869659424, + "learning_rate": 4.981153782084473e-05, + "loss": 5.7542, + "step": 6577 + }, + { + "epoch": 0.039121229422399845, + "grad_norm": 1.669877529144287, + "learning_rate": 4.9811480570457216e-05, + "loss": 5.6736, + "step": 6578 + }, + { + "epoch": 0.03912717670568085, + "grad_norm": 1.9555165767669678, + "learning_rate": 4.981142331140825e-05, + "loss": 5.2997, + "step": 6579 + }, + { + "epoch": 0.03913312398896184, + "grad_norm": 2.5131587982177734, + "learning_rate": 4.981136604369789e-05, + "loss": 5.2093, + "step": 6580 + }, + { + "epoch": 0.03913907127224284, + "grad_norm": 2.0637567043304443, + "learning_rate": 4.9811308767326134e-05, + "loss": 5.1671, + "step": 6581 + }, + { + "epoch": 0.03914501855552384, + "grad_norm": 2.140839099884033, + "learning_rate": 4.9811251482293e-05, + "loss": 5.3237, + "step": 6582 + }, + { + "epoch": 0.039150965838804834, + "grad_norm": 1.968489408493042, + "learning_rate": 4.981119418859852e-05, + "loss": 5.6015, + "step": 6583 + }, + { + "epoch": 0.03915691312208583, + "grad_norm": 1.873827338218689, + "learning_rate": 4.9811136886242705e-05, + "loss": 5.3316, + "step": 6584 + }, + { + "epoch": 0.03916286040536683, + "grad_norm": 1.9897359609603882, + "learning_rate": 4.981107957522558e-05, + "loss": 5.1548, + "step": 6585 + }, + { + "epoch": 0.039168807688647826, + "grad_norm": 2.004457950592041, + "learning_rate": 4.9811022255547165e-05, + "loss": 5.1977, + "step": 6586 + }, + { + "epoch": 0.03917475497192882, + "grad_norm": 2.1058437824249268, + "learning_rate": 4.9810964927207485e-05, + "loss": 5.0217, + "step": 6587 + }, + { + "epoch": 0.03918070225520982, + "grad_norm": 1.9846851825714111, + "learning_rate": 4.981090759020654e-05, + "loss": 5.1123, + "step": 6588 + }, + { + "epoch": 0.03918664953849082, + "grad_norm": 2.018026828765869, + "learning_rate": 4.981085024454437e-05, + "loss": 5.0516, + "step": 6589 + }, + { + "epoch": 0.039192596821771813, + "grad_norm": 1.7792260646820068, + "learning_rate": 4.9810792890220995e-05, + "loss": 5.5266, + "step": 6590 + }, + { + "epoch": 0.03919854410505281, + "grad_norm": 2.0855109691619873, + "learning_rate": 4.981073552723642e-05, + "loss": 5.5504, + "step": 6591 + }, + { + "epoch": 0.03920449138833381, + "grad_norm": 1.9998018741607666, + "learning_rate": 4.9810678155590676e-05, + "loss": 5.3447, + "step": 6592 + }, + { + "epoch": 0.039210438671614806, + "grad_norm": 2.332714557647705, + "learning_rate": 4.981062077528377e-05, + "loss": 5.6166, + "step": 6593 + }, + { + "epoch": 0.0392163859548958, + "grad_norm": 1.9647892713546753, + "learning_rate": 4.981056338631575e-05, + "loss": 5.0113, + "step": 6594 + }, + { + "epoch": 0.0392223332381768, + "grad_norm": 1.9961154460906982, + "learning_rate": 4.9810505988686604e-05, + "loss": 5.0143, + "step": 6595 + }, + { + "epoch": 0.0392282805214578, + "grad_norm": 1.9039133787155151, + "learning_rate": 4.981044858239637e-05, + "loss": 5.3602, + "step": 6596 + }, + { + "epoch": 0.03923422780473879, + "grad_norm": 1.9076604843139648, + "learning_rate": 4.981039116744507e-05, + "loss": 5.4165, + "step": 6597 + }, + { + "epoch": 0.039240175088019795, + "grad_norm": 1.6676216125488281, + "learning_rate": 4.981033374383272e-05, + "loss": 5.4018, + "step": 6598 + }, + { + "epoch": 0.03924612237130079, + "grad_norm": 1.7158783674240112, + "learning_rate": 4.981027631155933e-05, + "loss": 5.3233, + "step": 6599 + }, + { + "epoch": 0.039252069654581785, + "grad_norm": 1.6659481525421143, + "learning_rate": 4.9810218870624945e-05, + "loss": 5.4671, + "step": 6600 + }, + { + "epoch": 0.03925801693786279, + "grad_norm": 2.008171319961548, + "learning_rate": 4.981016142102956e-05, + "loss": 5.6424, + "step": 6601 + }, + { + "epoch": 0.03926396422114378, + "grad_norm": 2.213045835494995, + "learning_rate": 4.9810103962773204e-05, + "loss": 5.419, + "step": 6602 + }, + { + "epoch": 0.03926991150442478, + "grad_norm": 2.0159718990325928, + "learning_rate": 4.981004649585589e-05, + "loss": 5.4301, + "step": 6603 + }, + { + "epoch": 0.03927585878770578, + "grad_norm": 1.982701063156128, + "learning_rate": 4.9809989020277646e-05, + "loss": 5.6001, + "step": 6604 + }, + { + "epoch": 0.039281806070986774, + "grad_norm": 2.1933834552764893, + "learning_rate": 4.98099315360385e-05, + "loss": 5.6756, + "step": 6605 + }, + { + "epoch": 0.03928775335426777, + "grad_norm": 1.858798623085022, + "learning_rate": 4.980987404313846e-05, + "loss": 5.43, + "step": 6606 + }, + { + "epoch": 0.039293700637548765, + "grad_norm": 1.8233433961868286, + "learning_rate": 4.980981654157755e-05, + "loss": 5.4638, + "step": 6607 + }, + { + "epoch": 0.03929964792082977, + "grad_norm": 2.0368216037750244, + "learning_rate": 4.9809759031355784e-05, + "loss": 5.71, + "step": 6608 + }, + { + "epoch": 0.03930559520411076, + "grad_norm": 1.9923310279846191, + "learning_rate": 4.9809701512473196e-05, + "loss": 5.6443, + "step": 6609 + }, + { + "epoch": 0.03931154248739176, + "grad_norm": 2.391463279724121, + "learning_rate": 4.9809643984929785e-05, + "loss": 5.4701, + "step": 6610 + }, + { + "epoch": 0.03931748977067276, + "grad_norm": 1.8456658124923706, + "learning_rate": 4.98095864487256e-05, + "loss": 5.4346, + "step": 6611 + }, + { + "epoch": 0.039323437053953754, + "grad_norm": 1.7941107749938965, + "learning_rate": 4.980952890386063e-05, + "loss": 5.4198, + "step": 6612 + }, + { + "epoch": 0.03932938433723475, + "grad_norm": 1.8455369472503662, + "learning_rate": 4.980947135033492e-05, + "loss": 5.3915, + "step": 6613 + }, + { + "epoch": 0.03933533162051575, + "grad_norm": 1.8710846900939941, + "learning_rate": 4.980941378814847e-05, + "loss": 5.2744, + "step": 6614 + }, + { + "epoch": 0.039341278903796746, + "grad_norm": 2.203129768371582, + "learning_rate": 4.980935621730132e-05, + "loss": 5.4409, + "step": 6615 + }, + { + "epoch": 0.03934722618707774, + "grad_norm": 1.8944141864776611, + "learning_rate": 4.980929863779348e-05, + "loss": 5.4661, + "step": 6616 + }, + { + "epoch": 0.03935317347035874, + "grad_norm": 1.8268091678619385, + "learning_rate": 4.9809241049624966e-05, + "loss": 5.4088, + "step": 6617 + }, + { + "epoch": 0.03935912075363974, + "grad_norm": 1.838927984237671, + "learning_rate": 4.98091834527958e-05, + "loss": 5.5335, + "step": 6618 + }, + { + "epoch": 0.03936506803692073, + "grad_norm": 1.8441804647445679, + "learning_rate": 4.9809125847306e-05, + "loss": 5.4639, + "step": 6619 + }, + { + "epoch": 0.03937101532020173, + "grad_norm": 2.012754440307617, + "learning_rate": 4.980906823315561e-05, + "loss": 5.5606, + "step": 6620 + }, + { + "epoch": 0.03937696260348273, + "grad_norm": 1.8358973264694214, + "learning_rate": 4.980901061034461e-05, + "loss": 5.4217, + "step": 6621 + }, + { + "epoch": 0.039382909886763726, + "grad_norm": 2.0668959617614746, + "learning_rate": 4.980895297887305e-05, + "loss": 5.5164, + "step": 6622 + }, + { + "epoch": 0.03938885717004472, + "grad_norm": 2.032320976257324, + "learning_rate": 4.9808895338740934e-05, + "loss": 5.4914, + "step": 6623 + }, + { + "epoch": 0.03939480445332572, + "grad_norm": 1.8650145530700684, + "learning_rate": 4.980883768994829e-05, + "loss": 5.3718, + "step": 6624 + }, + { + "epoch": 0.03940075173660672, + "grad_norm": 4.494358539581299, + "learning_rate": 4.980878003249515e-05, + "loss": 5.5253, + "step": 6625 + }, + { + "epoch": 0.03940669901988771, + "grad_norm": 1.9295374155044556, + "learning_rate": 4.980872236638151e-05, + "loss": 5.3187, + "step": 6626 + }, + { + "epoch": 0.039412646303168715, + "grad_norm": 2.089717388153076, + "learning_rate": 4.980866469160741e-05, + "loss": 5.5311, + "step": 6627 + }, + { + "epoch": 0.03941859358644971, + "grad_norm": 1.701429843902588, + "learning_rate": 4.980860700817285e-05, + "loss": 5.4529, + "step": 6628 + }, + { + "epoch": 0.039424540869730705, + "grad_norm": 1.8336073160171509, + "learning_rate": 4.980854931607787e-05, + "loss": 5.2987, + "step": 6629 + }, + { + "epoch": 0.03943048815301171, + "grad_norm": 2.7922565937042236, + "learning_rate": 4.9808491615322475e-05, + "loss": 5.3492, + "step": 6630 + }, + { + "epoch": 0.0394364354362927, + "grad_norm": 1.8253742456436157, + "learning_rate": 4.980843390590669e-05, + "loss": 5.3928, + "step": 6631 + }, + { + "epoch": 0.0394423827195737, + "grad_norm": 2.646916151046753, + "learning_rate": 4.980837618783055e-05, + "loss": 5.4329, + "step": 6632 + }, + { + "epoch": 0.0394483300028547, + "grad_norm": 2.1956236362457275, + "learning_rate": 4.980831846109405e-05, + "loss": 5.4794, + "step": 6633 + }, + { + "epoch": 0.039454277286135694, + "grad_norm": 2.7274577617645264, + "learning_rate": 4.980826072569723e-05, + "loss": 5.9666, + "step": 6634 + }, + { + "epoch": 0.03946022456941669, + "grad_norm": 1.9890350103378296, + "learning_rate": 4.98082029816401e-05, + "loss": 5.5518, + "step": 6635 + }, + { + "epoch": 0.039466171852697685, + "grad_norm": 2.7760517597198486, + "learning_rate": 4.980814522892268e-05, + "loss": 5.2777, + "step": 6636 + }, + { + "epoch": 0.03947211913597869, + "grad_norm": 2.035254716873169, + "learning_rate": 4.9808087467544995e-05, + "loss": 5.5872, + "step": 6637 + }, + { + "epoch": 0.03947806641925968, + "grad_norm": 1.9728864431381226, + "learning_rate": 4.980802969750706e-05, + "loss": 5.3357, + "step": 6638 + }, + { + "epoch": 0.03948401370254068, + "grad_norm": 1.795480489730835, + "learning_rate": 4.98079719188089e-05, + "loss": 5.6414, + "step": 6639 + }, + { + "epoch": 0.03948996098582168, + "grad_norm": 1.7882109880447388, + "learning_rate": 4.980791413145054e-05, + "loss": 5.3499, + "step": 6640 + }, + { + "epoch": 0.039495908269102674, + "grad_norm": 1.8416422605514526, + "learning_rate": 4.9807856335431994e-05, + "loss": 5.3292, + "step": 6641 + }, + { + "epoch": 0.03950185555238367, + "grad_norm": 1.9525254964828491, + "learning_rate": 4.9807798530753266e-05, + "loss": 5.2782, + "step": 6642 + }, + { + "epoch": 0.03950780283566467, + "grad_norm": 1.5100830793380737, + "learning_rate": 4.9807740717414406e-05, + "loss": 5.2807, + "step": 6643 + }, + { + "epoch": 0.039513750118945666, + "grad_norm": 2.029430866241455, + "learning_rate": 4.9807682895415406e-05, + "loss": 5.4496, + "step": 6644 + }, + { + "epoch": 0.03951969740222666, + "grad_norm": 1.7976901531219482, + "learning_rate": 4.9807625064756315e-05, + "loss": 5.1021, + "step": 6645 + }, + { + "epoch": 0.03952564468550766, + "grad_norm": 1.5770336389541626, + "learning_rate": 4.980756722543714e-05, + "loss": 5.3946, + "step": 6646 + }, + { + "epoch": 0.03953159196878866, + "grad_norm": 1.8289496898651123, + "learning_rate": 4.980750937745788e-05, + "loss": 5.4821, + "step": 6647 + }, + { + "epoch": 0.03953753925206965, + "grad_norm": 1.7413506507873535, + "learning_rate": 4.980745152081859e-05, + "loss": 5.4827, + "step": 6648 + }, + { + "epoch": 0.03954348653535065, + "grad_norm": 2.048400402069092, + "learning_rate": 4.980739365551927e-05, + "loss": 5.2359, + "step": 6649 + }, + { + "epoch": 0.03954943381863165, + "grad_norm": 2.331897735595703, + "learning_rate": 4.980733578155995e-05, + "loss": 5.2988, + "step": 6650 + }, + { + "epoch": 0.039555381101912646, + "grad_norm": 2.1224608421325684, + "learning_rate": 4.980727789894065e-05, + "loss": 5.1228, + "step": 6651 + }, + { + "epoch": 0.03956132838519364, + "grad_norm": 1.5331578254699707, + "learning_rate": 4.9807220007661374e-05, + "loss": 5.184, + "step": 6652 + }, + { + "epoch": 0.03956727566847464, + "grad_norm": 1.773489236831665, + "learning_rate": 4.980716210772216e-05, + "loss": 5.1883, + "step": 6653 + }, + { + "epoch": 0.03957322295175564, + "grad_norm": 2.119302749633789, + "learning_rate": 4.9807104199123016e-05, + "loss": 5.5437, + "step": 6654 + }, + { + "epoch": 0.03957917023503663, + "grad_norm": 2.0695033073425293, + "learning_rate": 4.9807046281863974e-05, + "loss": 5.5951, + "step": 6655 + }, + { + "epoch": 0.039585117518317635, + "grad_norm": 2.0522243976593018, + "learning_rate": 4.980698835594505e-05, + "loss": 5.2736, + "step": 6656 + }, + { + "epoch": 0.03959106480159863, + "grad_norm": 2.3200113773345947, + "learning_rate": 4.980693042136626e-05, + "loss": 5.5701, + "step": 6657 + }, + { + "epoch": 0.039597012084879625, + "grad_norm": 1.8731193542480469, + "learning_rate": 4.980687247812762e-05, + "loss": 5.3929, + "step": 6658 + }, + { + "epoch": 0.03960295936816063, + "grad_norm": 1.8390223979949951, + "learning_rate": 4.980681452622916e-05, + "loss": 5.1684, + "step": 6659 + }, + { + "epoch": 0.03960890665144162, + "grad_norm": 2.24766206741333, + "learning_rate": 4.980675656567091e-05, + "loss": 5.0232, + "step": 6660 + }, + { + "epoch": 0.03961485393472262, + "grad_norm": 2.2592451572418213, + "learning_rate": 4.980669859645286e-05, + "loss": 4.9878, + "step": 6661 + }, + { + "epoch": 0.03962080121800362, + "grad_norm": 2.14709734916687, + "learning_rate": 4.9806640618575064e-05, + "loss": 5.1036, + "step": 6662 + }, + { + "epoch": 0.039626748501284614, + "grad_norm": 2.133910655975342, + "learning_rate": 4.9806582632037516e-05, + "loss": 5.0356, + "step": 6663 + }, + { + "epoch": 0.03963269578456561, + "grad_norm": 2.2513222694396973, + "learning_rate": 4.980652463684025e-05, + "loss": 5.2357, + "step": 6664 + }, + { + "epoch": 0.039638643067846605, + "grad_norm": 2.078355312347412, + "learning_rate": 4.980646663298328e-05, + "loss": 5.3857, + "step": 6665 + }, + { + "epoch": 0.03964459035112761, + "grad_norm": 2.3798105716705322, + "learning_rate": 4.980640862046663e-05, + "loss": 5.0888, + "step": 6666 + }, + { + "epoch": 0.0396505376344086, + "grad_norm": 2.241868019104004, + "learning_rate": 4.980635059929032e-05, + "loss": 5.1397, + "step": 6667 + }, + { + "epoch": 0.0396564849176896, + "grad_norm": 2.2053534984588623, + "learning_rate": 4.9806292569454365e-05, + "loss": 4.799, + "step": 6668 + }, + { + "epoch": 0.0396624322009706, + "grad_norm": 2.2996716499328613, + "learning_rate": 4.980623453095879e-05, + "loss": 4.9597, + "step": 6669 + }, + { + "epoch": 0.039668379484251594, + "grad_norm": 1.9892657995224, + "learning_rate": 4.9806176483803615e-05, + "loss": 5.0784, + "step": 6670 + }, + { + "epoch": 0.03967432676753259, + "grad_norm": 2.2087242603302, + "learning_rate": 4.980611842798887e-05, + "loss": 5.4099, + "step": 6671 + }, + { + "epoch": 0.03968027405081359, + "grad_norm": 2.215728521347046, + "learning_rate": 4.980606036351455e-05, + "loss": 5.2889, + "step": 6672 + }, + { + "epoch": 0.039686221334094586, + "grad_norm": 2.228073835372925, + "learning_rate": 4.9806002290380705e-05, + "loss": 5.3816, + "step": 6673 + }, + { + "epoch": 0.03969216861737558, + "grad_norm": 2.209808826446533, + "learning_rate": 4.980594420858733e-05, + "loss": 5.6233, + "step": 6674 + }, + { + "epoch": 0.03969811590065658, + "grad_norm": 1.8294177055358887, + "learning_rate": 4.980588611813446e-05, + "loss": 5.5756, + "step": 6675 + }, + { + "epoch": 0.03970406318393758, + "grad_norm": 2.236435890197754, + "learning_rate": 4.980582801902212e-05, + "loss": 5.4807, + "step": 6676 + }, + { + "epoch": 0.03971001046721857, + "grad_norm": 2.528804063796997, + "learning_rate": 4.980576991125031e-05, + "loss": 5.6503, + "step": 6677 + }, + { + "epoch": 0.03971595775049957, + "grad_norm": 2.312063217163086, + "learning_rate": 4.9805711794819065e-05, + "loss": 5.5517, + "step": 6678 + }, + { + "epoch": 0.03972190503378057, + "grad_norm": 2.336134672164917, + "learning_rate": 4.98056536697284e-05, + "loss": 5.5708, + "step": 6679 + }, + { + "epoch": 0.039727852317061566, + "grad_norm": 2.2809929847717285, + "learning_rate": 4.980559553597834e-05, + "loss": 5.453, + "step": 6680 + }, + { + "epoch": 0.03973379960034256, + "grad_norm": 2.0603368282318115, + "learning_rate": 4.98055373935689e-05, + "loss": 5.3482, + "step": 6681 + }, + { + "epoch": 0.03973974688362356, + "grad_norm": 1.9654933214187622, + "learning_rate": 4.980547924250011e-05, + "loss": 5.29, + "step": 6682 + }, + { + "epoch": 0.03974569416690456, + "grad_norm": 2.4211983680725098, + "learning_rate": 4.9805421082771985e-05, + "loss": 5.4261, + "step": 6683 + }, + { + "epoch": 0.03975164145018555, + "grad_norm": 2.129987955093384, + "learning_rate": 4.9805362914384533e-05, + "loss": 5.3551, + "step": 6684 + }, + { + "epoch": 0.039757588733466555, + "grad_norm": 2.127936601638794, + "learning_rate": 4.9805304737337796e-05, + "loss": 5.4647, + "step": 6685 + }, + { + "epoch": 0.03976353601674755, + "grad_norm": 2.303382158279419, + "learning_rate": 4.980524655163178e-05, + "loss": 5.1699, + "step": 6686 + }, + { + "epoch": 0.039769483300028545, + "grad_norm": 2.6889941692352295, + "learning_rate": 4.98051883572665e-05, + "loss": 5.2031, + "step": 6687 + }, + { + "epoch": 0.03977543058330955, + "grad_norm": 3.321950674057007, + "learning_rate": 4.9805130154242e-05, + "loss": 4.9815, + "step": 6688 + }, + { + "epoch": 0.03978137786659054, + "grad_norm": 3.1951568126678467, + "learning_rate": 4.980507194255827e-05, + "loss": 4.8946, + "step": 6689 + }, + { + "epoch": 0.03978732514987154, + "grad_norm": 2.355271816253662, + "learning_rate": 4.9805013722215355e-05, + "loss": 5.9223, + "step": 6690 + }, + { + "epoch": 0.03979327243315254, + "grad_norm": 2.3401644229888916, + "learning_rate": 4.9804955493213264e-05, + "loss": 6.1826, + "step": 6691 + }, + { + "epoch": 0.039799219716433534, + "grad_norm": 2.191997766494751, + "learning_rate": 4.980489725555202e-05, + "loss": 5.5617, + "step": 6692 + }, + { + "epoch": 0.03980516699971453, + "grad_norm": 2.377803087234497, + "learning_rate": 4.9804839009231644e-05, + "loss": 5.684, + "step": 6693 + }, + { + "epoch": 0.039811114282995524, + "grad_norm": 1.9084972143173218, + "learning_rate": 4.980478075425215e-05, + "loss": 6.0291, + "step": 6694 + }, + { + "epoch": 0.039817061566276526, + "grad_norm": 2.185628890991211, + "learning_rate": 4.9804722490613566e-05, + "loss": 5.5808, + "step": 6695 + }, + { + "epoch": 0.03982300884955752, + "grad_norm": 2.3253934383392334, + "learning_rate": 4.980466421831591e-05, + "loss": 5.7076, + "step": 6696 + }, + { + "epoch": 0.03982895613283852, + "grad_norm": 2.1599392890930176, + "learning_rate": 4.98046059373592e-05, + "loss": 5.9607, + "step": 6697 + }, + { + "epoch": 0.03983490341611952, + "grad_norm": 2.093137741088867, + "learning_rate": 4.980454764774346e-05, + "loss": 6.0014, + "step": 6698 + }, + { + "epoch": 0.039840850699400514, + "grad_norm": 2.4242093563079834, + "learning_rate": 4.980448934946871e-05, + "loss": 5.6255, + "step": 6699 + }, + { + "epoch": 0.03984679798268151, + "grad_norm": 2.523277521133423, + "learning_rate": 4.980443104253497e-05, + "loss": 5.5302, + "step": 6700 + }, + { + "epoch": 0.03985274526596251, + "grad_norm": 1.7926498651504517, + "learning_rate": 4.980437272694225e-05, + "loss": 5.6467, + "step": 6701 + }, + { + "epoch": 0.039858692549243506, + "grad_norm": 1.7630435228347778, + "learning_rate": 4.980431440269059e-05, + "loss": 5.9615, + "step": 6702 + }, + { + "epoch": 0.0398646398325245, + "grad_norm": 1.8051058053970337, + "learning_rate": 4.980425606978e-05, + "loss": 6.13, + "step": 6703 + }, + { + "epoch": 0.0398705871158055, + "grad_norm": 2.104901075363159, + "learning_rate": 4.98041977282105e-05, + "loss": 6.142, + "step": 6704 + }, + { + "epoch": 0.0398765343990865, + "grad_norm": 1.7022942304611206, + "learning_rate": 4.98041393779821e-05, + "loss": 5.6764, + "step": 6705 + }, + { + "epoch": 0.03988248168236749, + "grad_norm": 2.140230178833008, + "learning_rate": 4.980408101909485e-05, + "loss": 5.9796, + "step": 6706 + }, + { + "epoch": 0.03988842896564849, + "grad_norm": 1.9564754962921143, + "learning_rate": 4.9804022651548734e-05, + "loss": 6.005, + "step": 6707 + }, + { + "epoch": 0.03989437624892949, + "grad_norm": 1.9460588693618774, + "learning_rate": 4.9803964275343795e-05, + "loss": 5.9784, + "step": 6708 + }, + { + "epoch": 0.039900323532210485, + "grad_norm": 1.7314271926879883, + "learning_rate": 4.980390589048005e-05, + "loss": 5.7766, + "step": 6709 + }, + { + "epoch": 0.03990627081549148, + "grad_norm": 2.0168917179107666, + "learning_rate": 4.9803847496957524e-05, + "loss": 5.7386, + "step": 6710 + }, + { + "epoch": 0.03991221809877248, + "grad_norm": 2.3194711208343506, + "learning_rate": 4.980378909477622e-05, + "loss": 6.1324, + "step": 6711 + }, + { + "epoch": 0.03991816538205348, + "grad_norm": 2.3532958030700684, + "learning_rate": 4.980373068393618e-05, + "loss": 6.027, + "step": 6712 + }, + { + "epoch": 0.03992411266533447, + "grad_norm": 2.5944385528564453, + "learning_rate": 4.980367226443741e-05, + "loss": 6.2892, + "step": 6713 + }, + { + "epoch": 0.039930059948615475, + "grad_norm": 1.5707015991210938, + "learning_rate": 4.9803613836279926e-05, + "loss": 5.6525, + "step": 6714 + }, + { + "epoch": 0.03993600723189647, + "grad_norm": 2.022613286972046, + "learning_rate": 4.980355539946376e-05, + "loss": 5.8943, + "step": 6715 + }, + { + "epoch": 0.039941954515177465, + "grad_norm": 1.7783907651901245, + "learning_rate": 4.980349695398894e-05, + "loss": 5.6451, + "step": 6716 + }, + { + "epoch": 0.03994790179845847, + "grad_norm": 2.098841428756714, + "learning_rate": 4.980343849985547e-05, + "loss": 6.1143, + "step": 6717 + }, + { + "epoch": 0.03995384908173946, + "grad_norm": 2.045955181121826, + "learning_rate": 4.9803380037063374e-05, + "loss": 6.1802, + "step": 6718 + }, + { + "epoch": 0.03995979636502046, + "grad_norm": 1.7324507236480713, + "learning_rate": 4.980332156561267e-05, + "loss": 6.081, + "step": 6719 + }, + { + "epoch": 0.03996574364830146, + "grad_norm": 1.795184850692749, + "learning_rate": 4.9803263085503385e-05, + "loss": 5.6075, + "step": 6720 + }, + { + "epoch": 0.039971690931582454, + "grad_norm": 2.1466586589813232, + "learning_rate": 4.980320459673554e-05, + "loss": 6.045, + "step": 6721 + }, + { + "epoch": 0.03997763821486345, + "grad_norm": 2.1261258125305176, + "learning_rate": 4.980314609930915e-05, + "loss": 6.0589, + "step": 6722 + }, + { + "epoch": 0.039983585498144444, + "grad_norm": 2.559584617614746, + "learning_rate": 4.980308759322424e-05, + "loss": 6.3894, + "step": 6723 + }, + { + "epoch": 0.039989532781425446, + "grad_norm": 2.4580929279327393, + "learning_rate": 4.980302907848083e-05, + "loss": 6.3979, + "step": 6724 + }, + { + "epoch": 0.03999548006470644, + "grad_norm": 1.8877859115600586, + "learning_rate": 4.9802970555078934e-05, + "loss": 5.5076, + "step": 6725 + }, + { + "epoch": 0.04000142734798744, + "grad_norm": 2.145123243331909, + "learning_rate": 4.9802912023018585e-05, + "loss": 6.1913, + "step": 6726 + }, + { + "epoch": 0.04000737463126844, + "grad_norm": 1.9321368932724, + "learning_rate": 4.980285348229979e-05, + "loss": 5.9614, + "step": 6727 + }, + { + "epoch": 0.040013321914549434, + "grad_norm": 1.883589506149292, + "learning_rate": 4.9802794932922577e-05, + "loss": 5.4293, + "step": 6728 + }, + { + "epoch": 0.04001926919783043, + "grad_norm": 1.9066367149353027, + "learning_rate": 4.980273637488696e-05, + "loss": 5.4299, + "step": 6729 + }, + { + "epoch": 0.04002521648111143, + "grad_norm": 1.845290184020996, + "learning_rate": 4.9802677808192963e-05, + "loss": 5.596, + "step": 6730 + }, + { + "epoch": 0.040031163764392426, + "grad_norm": 2.3295016288757324, + "learning_rate": 4.980261923284062e-05, + "loss": 6.1266, + "step": 6731 + }, + { + "epoch": 0.04003711104767342, + "grad_norm": 2.451676368713379, + "learning_rate": 4.980256064882993e-05, + "loss": 6.0578, + "step": 6732 + }, + { + "epoch": 0.04004305833095442, + "grad_norm": 2.1317830085754395, + "learning_rate": 4.9802502056160915e-05, + "loss": 6.2627, + "step": 6733 + }, + { + "epoch": 0.04004900561423542, + "grad_norm": 2.223085641860962, + "learning_rate": 4.980244345483361e-05, + "loss": 5.5751, + "step": 6734 + }, + { + "epoch": 0.04005495289751641, + "grad_norm": 2.508385181427002, + "learning_rate": 4.9802384844848035e-05, + "loss": 5.572, + "step": 6735 + }, + { + "epoch": 0.04006090018079741, + "grad_norm": 2.5150837898254395, + "learning_rate": 4.98023262262042e-05, + "loss": 5.3443, + "step": 6736 + }, + { + "epoch": 0.04006684746407841, + "grad_norm": 2.293503761291504, + "learning_rate": 4.980226759890212e-05, + "loss": 5.37, + "step": 6737 + }, + { + "epoch": 0.040072794747359405, + "grad_norm": 1.8764920234680176, + "learning_rate": 4.9802208962941834e-05, + "loss": 5.3804, + "step": 6738 + }, + { + "epoch": 0.0400787420306404, + "grad_norm": 1.8443305492401123, + "learning_rate": 4.980215031832335e-05, + "loss": 5.7787, + "step": 6739 + }, + { + "epoch": 0.0400846893139214, + "grad_norm": 2.6707816123962402, + "learning_rate": 4.980209166504669e-05, + "loss": 6.2858, + "step": 6740 + }, + { + "epoch": 0.0400906365972024, + "grad_norm": 2.3520665168762207, + "learning_rate": 4.980203300311188e-05, + "loss": 5.8069, + "step": 6741 + }, + { + "epoch": 0.04009658388048339, + "grad_norm": 2.0564348697662354, + "learning_rate": 4.980197433251893e-05, + "loss": 6.1698, + "step": 6742 + }, + { + "epoch": 0.040102531163764395, + "grad_norm": 2.205469846725464, + "learning_rate": 4.9801915653267875e-05, + "loss": 5.8401, + "step": 6743 + }, + { + "epoch": 0.04010847844704539, + "grad_norm": 2.042363405227661, + "learning_rate": 4.980185696535873e-05, + "loss": 5.9673, + "step": 6744 + }, + { + "epoch": 0.040114425730326385, + "grad_norm": 1.7575644254684448, + "learning_rate": 4.98017982687915e-05, + "loss": 5.7852, + "step": 6745 + }, + { + "epoch": 0.04012037301360739, + "grad_norm": 1.968548059463501, + "learning_rate": 4.980173956356623e-05, + "loss": 6.2085, + "step": 6746 + }, + { + "epoch": 0.04012632029688838, + "grad_norm": 2.0365097522735596, + "learning_rate": 4.980168084968292e-05, + "loss": 6.4235, + "step": 6747 + }, + { + "epoch": 0.04013226758016938, + "grad_norm": 2.7265079021453857, + "learning_rate": 4.9801622127141605e-05, + "loss": 6.0804, + "step": 6748 + }, + { + "epoch": 0.04013821486345038, + "grad_norm": 2.1604299545288086, + "learning_rate": 4.98015633959423e-05, + "loss": 5.942, + "step": 6749 + }, + { + "epoch": 0.040144162146731374, + "grad_norm": 2.4122307300567627, + "learning_rate": 4.980150465608502e-05, + "loss": 6.2877, + "step": 6750 + }, + { + "epoch": 0.04015010943001237, + "grad_norm": 2.040780782699585, + "learning_rate": 4.98014459075698e-05, + "loss": 5.645, + "step": 6751 + }, + { + "epoch": 0.040156056713293364, + "grad_norm": 2.3660147190093994, + "learning_rate": 4.980138715039665e-05, + "loss": 5.975, + "step": 6752 + }, + { + "epoch": 0.040162003996574366, + "grad_norm": 2.2332143783569336, + "learning_rate": 4.980132838456558e-05, + "loss": 6.1383, + "step": 6753 + }, + { + "epoch": 0.04016795127985536, + "grad_norm": 2.7028262615203857, + "learning_rate": 4.9801269610076635e-05, + "loss": 6.3817, + "step": 6754 + }, + { + "epoch": 0.04017389856313636, + "grad_norm": 2.4653360843658447, + "learning_rate": 4.980121082692982e-05, + "loss": 6.3079, + "step": 6755 + }, + { + "epoch": 0.04017984584641736, + "grad_norm": 2.1470963954925537, + "learning_rate": 4.980115203512515e-05, + "loss": 6.063, + "step": 6756 + }, + { + "epoch": 0.040185793129698354, + "grad_norm": 2.3440990447998047, + "learning_rate": 4.9801093234662666e-05, + "loss": 5.818, + "step": 6757 + }, + { + "epoch": 0.04019174041297935, + "grad_norm": 2.120245933532715, + "learning_rate": 4.980103442554237e-05, + "loss": 5.5867, + "step": 6758 + }, + { + "epoch": 0.04019768769626035, + "grad_norm": 3.196829080581665, + "learning_rate": 4.980097560776429e-05, + "loss": 6.0369, + "step": 6759 + }, + { + "epoch": 0.040203634979541346, + "grad_norm": 2.247997522354126, + "learning_rate": 4.9800916781328456e-05, + "loss": 5.8383, + "step": 6760 + }, + { + "epoch": 0.04020958226282234, + "grad_norm": 2.26254940032959, + "learning_rate": 4.9800857946234866e-05, + "loss": 5.8477, + "step": 6761 + }, + { + "epoch": 0.04021552954610334, + "grad_norm": 2.200495958328247, + "learning_rate": 4.9800799102483556e-05, + "loss": 5.681, + "step": 6762 + }, + { + "epoch": 0.04022147682938434, + "grad_norm": 2.136009454727173, + "learning_rate": 4.980074025007454e-05, + "loss": 5.6453, + "step": 6763 + }, + { + "epoch": 0.04022742411266533, + "grad_norm": 2.3510351181030273, + "learning_rate": 4.980068138900785e-05, + "loss": 5.5735, + "step": 6764 + }, + { + "epoch": 0.040233371395946335, + "grad_norm": 2.249199628829956, + "learning_rate": 4.980062251928349e-05, + "loss": 5.9883, + "step": 6765 + }, + { + "epoch": 0.04023931867922733, + "grad_norm": 2.426816463470459, + "learning_rate": 4.9800563640901494e-05, + "loss": 6.1658, + "step": 6766 + }, + { + "epoch": 0.040245265962508325, + "grad_norm": 2.1044836044311523, + "learning_rate": 4.9800504753861874e-05, + "loss": 5.8627, + "step": 6767 + }, + { + "epoch": 0.04025121324578932, + "grad_norm": 1.9563783407211304, + "learning_rate": 4.9800445858164656e-05, + "loss": 5.9642, + "step": 6768 + }, + { + "epoch": 0.04025716052907032, + "grad_norm": 2.3810997009277344, + "learning_rate": 4.980038695380986e-05, + "loss": 5.2938, + "step": 6769 + }, + { + "epoch": 0.04026310781235132, + "grad_norm": 2.3180932998657227, + "learning_rate": 4.98003280407975e-05, + "loss": 5.7682, + "step": 6770 + }, + { + "epoch": 0.04026905509563231, + "grad_norm": 2.420954704284668, + "learning_rate": 4.980026911912761e-05, + "loss": 5.5724, + "step": 6771 + }, + { + "epoch": 0.040275002378913315, + "grad_norm": 2.447460651397705, + "learning_rate": 4.9800210188800193e-05, + "loss": 5.4844, + "step": 6772 + }, + { + "epoch": 0.04028094966219431, + "grad_norm": 2.4059863090515137, + "learning_rate": 4.980015124981529e-05, + "loss": 5.604, + "step": 6773 + }, + { + "epoch": 0.040286896945475305, + "grad_norm": 2.251492977142334, + "learning_rate": 4.9800092302172894e-05, + "loss": 5.4565, + "step": 6774 + }, + { + "epoch": 0.04029284422875631, + "grad_norm": 2.478682279586792, + "learning_rate": 4.980003334587305e-05, + "loss": 5.9416, + "step": 6775 + }, + { + "epoch": 0.0402987915120373, + "grad_norm": 2.2685835361480713, + "learning_rate": 4.9799974380915785e-05, + "loss": 5.9659, + "step": 6776 + }, + { + "epoch": 0.0403047387953183, + "grad_norm": 2.833101987838745, + "learning_rate": 4.979991540730108e-05, + "loss": 5.3406, + "step": 6777 + }, + { + "epoch": 0.0403106860785993, + "grad_norm": 3.0967416763305664, + "learning_rate": 4.9799856425029e-05, + "loss": 5.5848, + "step": 6778 + }, + { + "epoch": 0.040316633361880294, + "grad_norm": 2.3081796169281006, + "learning_rate": 4.9799797434099536e-05, + "loss": 5.5964, + "step": 6779 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 2.359531879425049, + "learning_rate": 4.9799738434512724e-05, + "loss": 5.6614, + "step": 6780 + }, + { + "epoch": 0.040328527928442284, + "grad_norm": 2.1566221714019775, + "learning_rate": 4.979967942626858e-05, + "loss": 6.0517, + "step": 6781 + }, + { + "epoch": 0.040334475211723286, + "grad_norm": 2.3964991569519043, + "learning_rate": 4.979962040936712e-05, + "loss": 5.9516, + "step": 6782 + }, + { + "epoch": 0.04034042249500428, + "grad_norm": 1.9913266897201538, + "learning_rate": 4.9799561383808365e-05, + "loss": 5.9144, + "step": 6783 + }, + { + "epoch": 0.040346369778285276, + "grad_norm": 1.7329169511795044, + "learning_rate": 4.979950234959235e-05, + "loss": 6.0393, + "step": 6784 + }, + { + "epoch": 0.04035231706156628, + "grad_norm": 1.8278034925460815, + "learning_rate": 4.979944330671908e-05, + "loss": 5.9318, + "step": 6785 + }, + { + "epoch": 0.040358264344847274, + "grad_norm": 2.089806318283081, + "learning_rate": 4.979938425518858e-05, + "loss": 5.5726, + "step": 6786 + }, + { + "epoch": 0.04036421162812827, + "grad_norm": 2.03664231300354, + "learning_rate": 4.9799325195000874e-05, + "loss": 5.8265, + "step": 6787 + }, + { + "epoch": 0.04037015891140927, + "grad_norm": 1.8801567554473877, + "learning_rate": 4.979926612615597e-05, + "loss": 5.7575, + "step": 6788 + }, + { + "epoch": 0.040376106194690266, + "grad_norm": 1.814959168434143, + "learning_rate": 4.979920704865391e-05, + "loss": 5.8737, + "step": 6789 + }, + { + "epoch": 0.04038205347797126, + "grad_norm": 1.7018035650253296, + "learning_rate": 4.97991479624947e-05, + "loss": 5.6768, + "step": 6790 + }, + { + "epoch": 0.04038800076125226, + "grad_norm": 2.21545147895813, + "learning_rate": 4.979908886767837e-05, + "loss": 5.4206, + "step": 6791 + }, + { + "epoch": 0.04039394804453326, + "grad_norm": 2.6184499263763428, + "learning_rate": 4.979902976420492e-05, + "loss": 5.0255, + "step": 6792 + }, + { + "epoch": 0.04039989532781425, + "grad_norm": 2.3914453983306885, + "learning_rate": 4.9798970652074396e-05, + "loss": 4.884, + "step": 6793 + }, + { + "epoch": 0.040405842611095255, + "grad_norm": 2.4367334842681885, + "learning_rate": 4.97989115312868e-05, + "loss": 4.7445, + "step": 6794 + }, + { + "epoch": 0.04041178989437625, + "grad_norm": 2.794490337371826, + "learning_rate": 4.9798852401842165e-05, + "loss": 4.9686, + "step": 6795 + }, + { + "epoch": 0.040417737177657245, + "grad_norm": 2.665395736694336, + "learning_rate": 4.979879326374051e-05, + "loss": 4.854, + "step": 6796 + }, + { + "epoch": 0.04042368446093824, + "grad_norm": 2.0832581520080566, + "learning_rate": 4.979873411698184e-05, + "loss": 5.0371, + "step": 6797 + }, + { + "epoch": 0.04042963174421924, + "grad_norm": 2.4604554176330566, + "learning_rate": 4.979867496156619e-05, + "loss": 4.7524, + "step": 6798 + }, + { + "epoch": 0.04043557902750024, + "grad_norm": 2.3760480880737305, + "learning_rate": 4.979861579749359e-05, + "loss": 4.7645, + "step": 6799 + }, + { + "epoch": 0.04044152631078123, + "grad_norm": 2.468043088912964, + "learning_rate": 4.979855662476405e-05, + "loss": 4.7791, + "step": 6800 + }, + { + "epoch": 0.040447473594062235, + "grad_norm": 2.516026258468628, + "learning_rate": 4.979849744337758e-05, + "loss": 4.7978, + "step": 6801 + }, + { + "epoch": 0.04045342087734323, + "grad_norm": 2.1882307529449463, + "learning_rate": 4.979843825333421e-05, + "loss": 5.002, + "step": 6802 + }, + { + "epoch": 0.040459368160624225, + "grad_norm": 2.423140525817871, + "learning_rate": 4.979837905463397e-05, + "loss": 5.0161, + "step": 6803 + }, + { + "epoch": 0.04046531544390523, + "grad_norm": 2.485739231109619, + "learning_rate": 4.979831984727687e-05, + "loss": 4.7613, + "step": 6804 + }, + { + "epoch": 0.04047126272718622, + "grad_norm": 2.267744302749634, + "learning_rate": 4.979826063126293e-05, + "loss": 4.7496, + "step": 6805 + }, + { + "epoch": 0.04047721001046722, + "grad_norm": 2.3172249794006348, + "learning_rate": 4.9798201406592176e-05, + "loss": 4.8153, + "step": 6806 + }, + { + "epoch": 0.04048315729374822, + "grad_norm": 2.309471607208252, + "learning_rate": 4.979814217326463e-05, + "loss": 4.9874, + "step": 6807 + }, + { + "epoch": 0.040489104577029214, + "grad_norm": 1.989372968673706, + "learning_rate": 4.97980829312803e-05, + "loss": 5.1254, + "step": 6808 + }, + { + "epoch": 0.04049505186031021, + "grad_norm": 2.4409830570220947, + "learning_rate": 4.9798023680639216e-05, + "loss": 4.6476, + "step": 6809 + }, + { + "epoch": 0.040500999143591204, + "grad_norm": 2.5192453861236572, + "learning_rate": 4.97979644213414e-05, + "loss": 4.6933, + "step": 6810 + }, + { + "epoch": 0.040506946426872206, + "grad_norm": 2.294718027114868, + "learning_rate": 4.979790515338688e-05, + "loss": 4.8266, + "step": 6811 + }, + { + "epoch": 0.0405128937101532, + "grad_norm": 2.294550657272339, + "learning_rate": 4.979784587677565e-05, + "loss": 4.6691, + "step": 6812 + }, + { + "epoch": 0.040518840993434196, + "grad_norm": 2.332326889038086, + "learning_rate": 4.979778659150776e-05, + "loss": 4.8366, + "step": 6813 + }, + { + "epoch": 0.0405247882767152, + "grad_norm": 2.325439929962158, + "learning_rate": 4.979772729758322e-05, + "loss": 4.8149, + "step": 6814 + }, + { + "epoch": 0.040530735559996194, + "grad_norm": 2.165926456451416, + "learning_rate": 4.979766799500204e-05, + "loss": 4.7309, + "step": 6815 + }, + { + "epoch": 0.04053668284327719, + "grad_norm": 2.3184943199157715, + "learning_rate": 4.9797608683764264e-05, + "loss": 4.7163, + "step": 6816 + }, + { + "epoch": 0.04054263012655819, + "grad_norm": 2.2161147594451904, + "learning_rate": 4.979754936386989e-05, + "loss": 4.5549, + "step": 6817 + }, + { + "epoch": 0.040548577409839186, + "grad_norm": 2.415496587753296, + "learning_rate": 4.979749003531895e-05, + "loss": 4.7676, + "step": 6818 + }, + { + "epoch": 0.04055452469312018, + "grad_norm": 2.1700618267059326, + "learning_rate": 4.979743069811146e-05, + "loss": 4.8448, + "step": 6819 + }, + { + "epoch": 0.04056047197640118, + "grad_norm": 2.4978747367858887, + "learning_rate": 4.9797371352247446e-05, + "loss": 6.363, + "step": 6820 + }, + { + "epoch": 0.04056641925968218, + "grad_norm": 1.9293922185897827, + "learning_rate": 4.979731199772693e-05, + "loss": 5.6502, + "step": 6821 + }, + { + "epoch": 0.04057236654296317, + "grad_norm": 2.5583136081695557, + "learning_rate": 4.9797252634549915e-05, + "loss": 4.874, + "step": 6822 + }, + { + "epoch": 0.040578313826244175, + "grad_norm": 2.263460159301758, + "learning_rate": 4.979719326271645e-05, + "loss": 5.8457, + "step": 6823 + }, + { + "epoch": 0.04058426110952517, + "grad_norm": 2.5630266666412354, + "learning_rate": 4.979713388222653e-05, + "loss": 4.8668, + "step": 6824 + }, + { + "epoch": 0.040590208392806165, + "grad_norm": 2.2965216636657715, + "learning_rate": 4.9797074493080186e-05, + "loss": 5.0049, + "step": 6825 + }, + { + "epoch": 0.04059615567608716, + "grad_norm": 2.222405433654785, + "learning_rate": 4.979701509527745e-05, + "loss": 5.0204, + "step": 6826 + }, + { + "epoch": 0.04060210295936816, + "grad_norm": 2.4425504207611084, + "learning_rate": 4.979695568881833e-05, + "loss": 5.687, + "step": 6827 + }, + { + "epoch": 0.04060805024264916, + "grad_norm": 2.329901933670044, + "learning_rate": 4.979689627370284e-05, + "loss": 5.9447, + "step": 6828 + }, + { + "epoch": 0.04061399752593015, + "grad_norm": 2.3041510581970215, + "learning_rate": 4.9796836849931015e-05, + "loss": 5.9277, + "step": 6829 + }, + { + "epoch": 0.040619944809211155, + "grad_norm": 2.3020026683807373, + "learning_rate": 4.979677741750287e-05, + "loss": 5.9675, + "step": 6830 + }, + { + "epoch": 0.04062589209249215, + "grad_norm": 2.1861371994018555, + "learning_rate": 4.9796717976418426e-05, + "loss": 6.1312, + "step": 6831 + }, + { + "epoch": 0.040631839375773145, + "grad_norm": 1.9544565677642822, + "learning_rate": 4.979665852667771e-05, + "loss": 5.9218, + "step": 6832 + }, + { + "epoch": 0.04063778665905415, + "grad_norm": 2.346431016921997, + "learning_rate": 4.979659906828073e-05, + "loss": 6.1668, + "step": 6833 + }, + { + "epoch": 0.04064373394233514, + "grad_norm": 2.0405263900756836, + "learning_rate": 4.979653960122751e-05, + "loss": 6.0501, + "step": 6834 + }, + { + "epoch": 0.04064968122561614, + "grad_norm": 1.7645004987716675, + "learning_rate": 4.979648012551809e-05, + "loss": 6.0299, + "step": 6835 + }, + { + "epoch": 0.04065562850889714, + "grad_norm": 2.284703016281128, + "learning_rate": 4.979642064115246e-05, + "loss": 5.5501, + "step": 6836 + }, + { + "epoch": 0.040661575792178134, + "grad_norm": 1.7246543169021606, + "learning_rate": 4.979636114813066e-05, + "loss": 5.5733, + "step": 6837 + }, + { + "epoch": 0.04066752307545913, + "grad_norm": 2.0958921909332275, + "learning_rate": 4.9796301646452705e-05, + "loss": 5.8998, + "step": 6838 + }, + { + "epoch": 0.040673470358740124, + "grad_norm": 2.2123169898986816, + "learning_rate": 4.979624213611862e-05, + "loss": 6.0322, + "step": 6839 + }, + { + "epoch": 0.040679417642021126, + "grad_norm": 1.9541656970977783, + "learning_rate": 4.9796182617128426e-05, + "loss": 5.9255, + "step": 6840 + }, + { + "epoch": 0.04068536492530212, + "grad_norm": 2.077601909637451, + "learning_rate": 4.979612308948213e-05, + "loss": 5.6975, + "step": 6841 + }, + { + "epoch": 0.040691312208583116, + "grad_norm": 2.0595803260803223, + "learning_rate": 4.979606355317977e-05, + "loss": 6.0696, + "step": 6842 + }, + { + "epoch": 0.04069725949186412, + "grad_norm": 1.9800641536712646, + "learning_rate": 4.979600400822136e-05, + "loss": 5.7357, + "step": 6843 + }, + { + "epoch": 0.040703206775145113, + "grad_norm": 2.26238751411438, + "learning_rate": 4.979594445460692e-05, + "loss": 5.9119, + "step": 6844 + }, + { + "epoch": 0.04070915405842611, + "grad_norm": 2.0941457748413086, + "learning_rate": 4.979588489233648e-05, + "loss": 5.945, + "step": 6845 + }, + { + "epoch": 0.04071510134170711, + "grad_norm": 2.1995291709899902, + "learning_rate": 4.979582532141005e-05, + "loss": 5.8406, + "step": 6846 + }, + { + "epoch": 0.040721048624988106, + "grad_norm": 2.0138349533081055, + "learning_rate": 4.9795765741827646e-05, + "loss": 5.7984, + "step": 6847 + }, + { + "epoch": 0.0407269959082691, + "grad_norm": 1.9314415454864502, + "learning_rate": 4.9795706153589304e-05, + "loss": 5.8686, + "step": 6848 + }, + { + "epoch": 0.0407329431915501, + "grad_norm": 2.1324212551116943, + "learning_rate": 4.979564655669503e-05, + "loss": 5.8477, + "step": 6849 + }, + { + "epoch": 0.0407388904748311, + "grad_norm": 1.9601761102676392, + "learning_rate": 4.979558695114486e-05, + "loss": 5.9078, + "step": 6850 + }, + { + "epoch": 0.04074483775811209, + "grad_norm": 2.004333734512329, + "learning_rate": 4.97955273369388e-05, + "loss": 5.9852, + "step": 6851 + }, + { + "epoch": 0.040750785041393095, + "grad_norm": 1.9015164375305176, + "learning_rate": 4.979546771407688e-05, + "loss": 5.6286, + "step": 6852 + }, + { + "epoch": 0.04075673232467409, + "grad_norm": 1.9674208164215088, + "learning_rate": 4.979540808255911e-05, + "loss": 5.8715, + "step": 6853 + }, + { + "epoch": 0.040762679607955085, + "grad_norm": 2.0473713874816895, + "learning_rate": 4.9795348442385534e-05, + "loss": 5.7488, + "step": 6854 + }, + { + "epoch": 0.04076862689123608, + "grad_norm": 1.9536950588226318, + "learning_rate": 4.979528879355615e-05, + "loss": 5.6755, + "step": 6855 + }, + { + "epoch": 0.04077457417451708, + "grad_norm": 2.189659595489502, + "learning_rate": 4.979522913607099e-05, + "loss": 5.7934, + "step": 6856 + }, + { + "epoch": 0.04078052145779808, + "grad_norm": 1.999742031097412, + "learning_rate": 4.9795169469930067e-05, + "loss": 5.7341, + "step": 6857 + }, + { + "epoch": 0.04078646874107907, + "grad_norm": 2.1212494373321533, + "learning_rate": 4.9795109795133414e-05, + "loss": 5.8465, + "step": 6858 + }, + { + "epoch": 0.040792416024360074, + "grad_norm": 1.966467261314392, + "learning_rate": 4.979505011168104e-05, + "loss": 5.8699, + "step": 6859 + }, + { + "epoch": 0.04079836330764107, + "grad_norm": 2.290205955505371, + "learning_rate": 4.979499041957297e-05, + "loss": 6.387, + "step": 6860 + }, + { + "epoch": 0.040804310590922065, + "grad_norm": 2.41827130317688, + "learning_rate": 4.979493071880923e-05, + "loss": 6.893, + "step": 6861 + }, + { + "epoch": 0.04081025787420307, + "grad_norm": 2.0652520656585693, + "learning_rate": 4.979487100938983e-05, + "loss": 6.6435, + "step": 6862 + }, + { + "epoch": 0.04081620515748406, + "grad_norm": 1.8594858646392822, + "learning_rate": 4.979481129131479e-05, + "loss": 5.7441, + "step": 6863 + }, + { + "epoch": 0.04082215244076506, + "grad_norm": 2.269240617752075, + "learning_rate": 4.979475156458415e-05, + "loss": 5.8468, + "step": 6864 + }, + { + "epoch": 0.04082809972404606, + "grad_norm": 2.2355518341064453, + "learning_rate": 4.979469182919792e-05, + "loss": 5.8717, + "step": 6865 + }, + { + "epoch": 0.040834047007327054, + "grad_norm": 1.9578050374984741, + "learning_rate": 4.9794632085156105e-05, + "loss": 5.6777, + "step": 6866 + }, + { + "epoch": 0.04083999429060805, + "grad_norm": 2.354609727859497, + "learning_rate": 4.979457233245875e-05, + "loss": 5.7993, + "step": 6867 + }, + { + "epoch": 0.040845941573889044, + "grad_norm": 1.978289008140564, + "learning_rate": 4.9794512571105865e-05, + "loss": 5.7429, + "step": 6868 + }, + { + "epoch": 0.040851888857170046, + "grad_norm": 1.9695252180099487, + "learning_rate": 4.979445280109747e-05, + "loss": 6.1322, + "step": 6869 + }, + { + "epoch": 0.04085783614045104, + "grad_norm": 2.172510862350464, + "learning_rate": 4.9794393022433586e-05, + "loss": 5.9443, + "step": 6870 + }, + { + "epoch": 0.040863783423732036, + "grad_norm": 2.1992416381835938, + "learning_rate": 4.9794333235114244e-05, + "loss": 6.4094, + "step": 6871 + }, + { + "epoch": 0.04086973070701304, + "grad_norm": 2.1804773807525635, + "learning_rate": 4.979427343913945e-05, + "loss": 6.3871, + "step": 6872 + }, + { + "epoch": 0.04087567799029403, + "grad_norm": 2.2877554893493652, + "learning_rate": 4.979421363450923e-05, + "loss": 6.2509, + "step": 6873 + }, + { + "epoch": 0.04088162527357503, + "grad_norm": 2.0697927474975586, + "learning_rate": 4.979415382122361e-05, + "loss": 5.9008, + "step": 6874 + }, + { + "epoch": 0.04088757255685603, + "grad_norm": 2.2907917499542236, + "learning_rate": 4.97940939992826e-05, + "loss": 5.6137, + "step": 6875 + }, + { + "epoch": 0.040893519840137026, + "grad_norm": 1.9960983991622925, + "learning_rate": 4.979403416868623e-05, + "loss": 5.7283, + "step": 6876 + }, + { + "epoch": 0.04089946712341802, + "grad_norm": 2.2767558097839355, + "learning_rate": 4.9793974329434525e-05, + "loss": 5.3632, + "step": 6877 + }, + { + "epoch": 0.04090541440669902, + "grad_norm": 2.295635461807251, + "learning_rate": 4.97939144815275e-05, + "loss": 5.4524, + "step": 6878 + }, + { + "epoch": 0.04091136168998002, + "grad_norm": 2.247194766998291, + "learning_rate": 4.9793854624965166e-05, + "loss": 5.7846, + "step": 6879 + }, + { + "epoch": 0.04091730897326101, + "grad_norm": 2.2641420364379883, + "learning_rate": 4.9793794759747565e-05, + "loss": 5.7479, + "step": 6880 + }, + { + "epoch": 0.040923256256542015, + "grad_norm": 2.002126455307007, + "learning_rate": 4.97937348858747e-05, + "loss": 5.2694, + "step": 6881 + }, + { + "epoch": 0.04092920353982301, + "grad_norm": 2.079157590866089, + "learning_rate": 4.9793675003346596e-05, + "loss": 6.2711, + "step": 6882 + }, + { + "epoch": 0.040935150823104005, + "grad_norm": 1.9030524492263794, + "learning_rate": 4.979361511216328e-05, + "loss": 5.7259, + "step": 6883 + }, + { + "epoch": 0.040941098106385, + "grad_norm": 1.9157373905181885, + "learning_rate": 4.9793555212324774e-05, + "loss": 6.086, + "step": 6884 + }, + { + "epoch": 0.040947045389666, + "grad_norm": 1.8622015714645386, + "learning_rate": 4.979349530383108e-05, + "loss": 6.1318, + "step": 6885 + }, + { + "epoch": 0.040952992672947, + "grad_norm": 2.3341257572174072, + "learning_rate": 4.9793435386682256e-05, + "loss": 5.9421, + "step": 6886 + }, + { + "epoch": 0.04095893995622799, + "grad_norm": 2.6894209384918213, + "learning_rate": 4.979337546087828e-05, + "loss": 5.5351, + "step": 6887 + }, + { + "epoch": 0.040964887239508994, + "grad_norm": 2.5316739082336426, + "learning_rate": 4.979331552641919e-05, + "loss": 5.5056, + "step": 6888 + }, + { + "epoch": 0.04097083452278999, + "grad_norm": 2.5129077434539795, + "learning_rate": 4.979325558330502e-05, + "loss": 5.3091, + "step": 6889 + }, + { + "epoch": 0.040976781806070985, + "grad_norm": 2.275536298751831, + "learning_rate": 4.979319563153578e-05, + "loss": 5.494, + "step": 6890 + }, + { + "epoch": 0.04098272908935199, + "grad_norm": 2.749375104904175, + "learning_rate": 4.9793135671111494e-05, + "loss": 6.0139, + "step": 6891 + }, + { + "epoch": 0.04098867637263298, + "grad_norm": 2.419163227081299, + "learning_rate": 4.9793075702032177e-05, + "loss": 6.1102, + "step": 6892 + }, + { + "epoch": 0.04099462365591398, + "grad_norm": 2.311450958251953, + "learning_rate": 4.9793015724297856e-05, + "loss": 5.9798, + "step": 6893 + }, + { + "epoch": 0.04100057093919498, + "grad_norm": 2.0522212982177734, + "learning_rate": 4.979295573790854e-05, + "loss": 5.9247, + "step": 6894 + }, + { + "epoch": 0.041006518222475974, + "grad_norm": 2.1928513050079346, + "learning_rate": 4.979289574286427e-05, + "loss": 5.8001, + "step": 6895 + }, + { + "epoch": 0.04101246550575697, + "grad_norm": 2.1945207118988037, + "learning_rate": 4.979283573916505e-05, + "loss": 5.9975, + "step": 6896 + }, + { + "epoch": 0.041018412789037964, + "grad_norm": 2.274843454360962, + "learning_rate": 4.979277572681091e-05, + "loss": 5.693, + "step": 6897 + }, + { + "epoch": 0.041024360072318966, + "grad_norm": 2.2715282440185547, + "learning_rate": 4.979271570580186e-05, + "loss": 5.9952, + "step": 6898 + }, + { + "epoch": 0.04103030735559996, + "grad_norm": 2.4459903240203857, + "learning_rate": 4.9792655676137943e-05, + "loss": 6.0305, + "step": 6899 + }, + { + "epoch": 0.041036254638880956, + "grad_norm": 2.8737339973449707, + "learning_rate": 4.9792595637819165e-05, + "loss": 6.0982, + "step": 6900 + }, + { + "epoch": 0.04104220192216196, + "grad_norm": 2.382143974304199, + "learning_rate": 4.979253559084553e-05, + "loss": 5.6122, + "step": 6901 + }, + { + "epoch": 0.04104814920544295, + "grad_norm": 2.4127237796783447, + "learning_rate": 4.97924755352171e-05, + "loss": 5.7723, + "step": 6902 + }, + { + "epoch": 0.04105409648872395, + "grad_norm": 2.3108956813812256, + "learning_rate": 4.979241547093386e-05, + "loss": 6.1655, + "step": 6903 + }, + { + "epoch": 0.04106004377200495, + "grad_norm": 2.250555992126465, + "learning_rate": 4.979235539799584e-05, + "loss": 6.0627, + "step": 6904 + }, + { + "epoch": 0.041065991055285946, + "grad_norm": 2.187957525253296, + "learning_rate": 4.979229531640307e-05, + "loss": 6.1438, + "step": 6905 + }, + { + "epoch": 0.04107193833856694, + "grad_norm": 1.9089539051055908, + "learning_rate": 4.979223522615557e-05, + "loss": 6.1431, + "step": 6906 + }, + { + "epoch": 0.04107788562184794, + "grad_norm": 2.343569040298462, + "learning_rate": 4.979217512725336e-05, + "loss": 5.9774, + "step": 6907 + }, + { + "epoch": 0.04108383290512894, + "grad_norm": 2.759631633758545, + "learning_rate": 4.979211501969645e-05, + "loss": 5.7982, + "step": 6908 + }, + { + "epoch": 0.04108978018840993, + "grad_norm": 2.295811414718628, + "learning_rate": 4.979205490348487e-05, + "loss": 6.0843, + "step": 6909 + }, + { + "epoch": 0.041095727471690935, + "grad_norm": 2.6259605884552, + "learning_rate": 4.979199477861864e-05, + "loss": 5.6498, + "step": 6910 + }, + { + "epoch": 0.04110167475497193, + "grad_norm": 2.396895408630371, + "learning_rate": 4.9791934645097785e-05, + "loss": 5.9936, + "step": 6911 + }, + { + "epoch": 0.041107622038252925, + "grad_norm": 2.020845651626587, + "learning_rate": 4.979187450292231e-05, + "loss": 5.4867, + "step": 6912 + }, + { + "epoch": 0.04111356932153392, + "grad_norm": 2.6473753452301025, + "learning_rate": 4.979181435209226e-05, + "loss": 5.3556, + "step": 6913 + }, + { + "epoch": 0.04111951660481492, + "grad_norm": 2.353158712387085, + "learning_rate": 4.9791754192607636e-05, + "loss": 6.3122, + "step": 6914 + }, + { + "epoch": 0.04112546388809592, + "grad_norm": 2.499817132949829, + "learning_rate": 4.9791694024468474e-05, + "loss": 5.816, + "step": 6915 + }, + { + "epoch": 0.04113141117137691, + "grad_norm": 2.009239673614502, + "learning_rate": 4.979163384767478e-05, + "loss": 5.5982, + "step": 6916 + }, + { + "epoch": 0.041137358454657914, + "grad_norm": 2.3885819911956787, + "learning_rate": 4.9791573662226586e-05, + "loss": 5.7403, + "step": 6917 + }, + { + "epoch": 0.04114330573793891, + "grad_norm": 2.3135135173797607, + "learning_rate": 4.979151346812391e-05, + "loss": 5.3151, + "step": 6918 + }, + { + "epoch": 0.041149253021219905, + "grad_norm": 1.9801241159439087, + "learning_rate": 4.979145326536677e-05, + "loss": 5.5148, + "step": 6919 + }, + { + "epoch": 0.04115520030450091, + "grad_norm": 2.0724904537200928, + "learning_rate": 4.979139305395519e-05, + "loss": 5.5355, + "step": 6920 + }, + { + "epoch": 0.0411611475877819, + "grad_norm": 1.8104170560836792, + "learning_rate": 4.97913328338892e-05, + "loss": 5.4861, + "step": 6921 + }, + { + "epoch": 0.0411670948710629, + "grad_norm": 1.81072998046875, + "learning_rate": 4.9791272605168804e-05, + "loss": 5.5075, + "step": 6922 + }, + { + "epoch": 0.0411730421543439, + "grad_norm": 1.709191083908081, + "learning_rate": 4.979121236779403e-05, + "loss": 6.1353, + "step": 6923 + }, + { + "epoch": 0.041178989437624894, + "grad_norm": 2.004974126815796, + "learning_rate": 4.9791152121764903e-05, + "loss": 5.478, + "step": 6924 + }, + { + "epoch": 0.04118493672090589, + "grad_norm": 1.937933325767517, + "learning_rate": 4.979109186708144e-05, + "loss": 5.4022, + "step": 6925 + }, + { + "epoch": 0.041190884004186884, + "grad_norm": 1.9453305006027222, + "learning_rate": 4.979103160374367e-05, + "loss": 5.243, + "step": 6926 + }, + { + "epoch": 0.041196831287467886, + "grad_norm": 1.8552072048187256, + "learning_rate": 4.979097133175159e-05, + "loss": 5.3104, + "step": 6927 + }, + { + "epoch": 0.04120277857074888, + "grad_norm": 1.9148203134536743, + "learning_rate": 4.9790911051105246e-05, + "loss": 5.5538, + "step": 6928 + }, + { + "epoch": 0.041208725854029876, + "grad_norm": 1.9658032655715942, + "learning_rate": 4.979085076180466e-05, + "loss": 5.5285, + "step": 6929 + }, + { + "epoch": 0.04121467313731088, + "grad_norm": 1.7332781553268433, + "learning_rate": 4.9790790463849835e-05, + "loss": 5.1959, + "step": 6930 + }, + { + "epoch": 0.04122062042059187, + "grad_norm": 1.5762557983398438, + "learning_rate": 4.9790730157240804e-05, + "loss": 5.3672, + "step": 6931 + }, + { + "epoch": 0.04122656770387287, + "grad_norm": 1.7899656295776367, + "learning_rate": 4.979066984197759e-05, + "loss": 5.3588, + "step": 6932 + }, + { + "epoch": 0.04123251498715387, + "grad_norm": 1.5992622375488281, + "learning_rate": 4.97906095180602e-05, + "loss": 5.275, + "step": 6933 + }, + { + "epoch": 0.041238462270434866, + "grad_norm": 1.875116229057312, + "learning_rate": 4.9790549185488666e-05, + "loss": 5.3428, + "step": 6934 + }, + { + "epoch": 0.04124440955371586, + "grad_norm": 1.8110510110855103, + "learning_rate": 4.979048884426301e-05, + "loss": 5.2416, + "step": 6935 + }, + { + "epoch": 0.04125035683699686, + "grad_norm": 1.5512267351150513, + "learning_rate": 4.979042849438325e-05, + "loss": 5.3643, + "step": 6936 + }, + { + "epoch": 0.04125630412027786, + "grad_norm": 1.8929630517959595, + "learning_rate": 4.979036813584941e-05, + "loss": 5.4232, + "step": 6937 + }, + { + "epoch": 0.04126225140355885, + "grad_norm": 1.8569291830062866, + "learning_rate": 4.9790307768661504e-05, + "loss": 5.2949, + "step": 6938 + }, + { + "epoch": 0.041268198686839855, + "grad_norm": 1.6058611869812012, + "learning_rate": 4.9790247392819564e-05, + "loss": 5.3736, + "step": 6939 + }, + { + "epoch": 0.04127414597012085, + "grad_norm": 1.8455227613449097, + "learning_rate": 4.97901870083236e-05, + "loss": 5.2768, + "step": 6940 + }, + { + "epoch": 0.041280093253401845, + "grad_norm": 1.9346935749053955, + "learning_rate": 4.979012661517364e-05, + "loss": 5.4316, + "step": 6941 + }, + { + "epoch": 0.04128604053668284, + "grad_norm": 1.8085594177246094, + "learning_rate": 4.97900662133697e-05, + "loss": 5.365, + "step": 6942 + }, + { + "epoch": 0.04129198781996384, + "grad_norm": 1.73456871509552, + "learning_rate": 4.9790005802911804e-05, + "loss": 5.2726, + "step": 6943 + }, + { + "epoch": 0.04129793510324484, + "grad_norm": 2.1071617603302, + "learning_rate": 4.978994538379997e-05, + "loss": 6.2313, + "step": 6944 + }, + { + "epoch": 0.04130388238652583, + "grad_norm": 1.7098963260650635, + "learning_rate": 4.978988495603423e-05, + "loss": 5.3162, + "step": 6945 + }, + { + "epoch": 0.041309829669806834, + "grad_norm": 1.8131905794143677, + "learning_rate": 4.978982451961459e-05, + "loss": 5.2486, + "step": 6946 + }, + { + "epoch": 0.04131577695308783, + "grad_norm": 1.8162381649017334, + "learning_rate": 4.978976407454109e-05, + "loss": 5.2806, + "step": 6947 + }, + { + "epoch": 0.041321724236368824, + "grad_norm": 1.9250297546386719, + "learning_rate": 4.9789703620813734e-05, + "loss": 5.1742, + "step": 6948 + }, + { + "epoch": 0.041327671519649826, + "grad_norm": 1.8263678550720215, + "learning_rate": 4.978964315843254e-05, + "loss": 5.1786, + "step": 6949 + }, + { + "epoch": 0.04133361880293082, + "grad_norm": 1.6751807928085327, + "learning_rate": 4.9789582687397546e-05, + "loss": 5.4798, + "step": 6950 + }, + { + "epoch": 0.04133956608621182, + "grad_norm": 1.7842947244644165, + "learning_rate": 4.9789522207708764e-05, + "loss": 5.201, + "step": 6951 + }, + { + "epoch": 0.04134551336949282, + "grad_norm": 1.6785067319869995, + "learning_rate": 4.978946171936621e-05, + "loss": 5.3852, + "step": 6952 + }, + { + "epoch": 0.041351460652773814, + "grad_norm": 1.5475291013717651, + "learning_rate": 4.978940122236992e-05, + "loss": 5.4083, + "step": 6953 + }, + { + "epoch": 0.04135740793605481, + "grad_norm": 1.7445106506347656, + "learning_rate": 4.97893407167199e-05, + "loss": 5.3125, + "step": 6954 + }, + { + "epoch": 0.041363355219335804, + "grad_norm": 1.7334082126617432, + "learning_rate": 4.9789280202416175e-05, + "loss": 5.5388, + "step": 6955 + }, + { + "epoch": 0.041369302502616806, + "grad_norm": 1.7267119884490967, + "learning_rate": 4.9789219679458774e-05, + "loss": 5.5175, + "step": 6956 + }, + { + "epoch": 0.0413752497858978, + "grad_norm": 1.8033246994018555, + "learning_rate": 4.978915914784771e-05, + "loss": 5.3523, + "step": 6957 + }, + { + "epoch": 0.041381197069178796, + "grad_norm": 1.9836528301239014, + "learning_rate": 4.978909860758301e-05, + "loss": 5.3808, + "step": 6958 + }, + { + "epoch": 0.0413871443524598, + "grad_norm": 1.6260416507720947, + "learning_rate": 4.978903805866469e-05, + "loss": 5.4642, + "step": 6959 + }, + { + "epoch": 0.04139309163574079, + "grad_norm": 1.7260626554489136, + "learning_rate": 4.978897750109277e-05, + "loss": 5.4975, + "step": 6960 + }, + { + "epoch": 0.04139903891902179, + "grad_norm": 1.6948668956756592, + "learning_rate": 4.978891693486728e-05, + "loss": 5.5768, + "step": 6961 + }, + { + "epoch": 0.04140498620230279, + "grad_norm": 1.7885476350784302, + "learning_rate": 4.978885635998824e-05, + "loss": 5.4156, + "step": 6962 + }, + { + "epoch": 0.041410933485583785, + "grad_norm": 1.8626813888549805, + "learning_rate": 4.978879577645565e-05, + "loss": 5.354, + "step": 6963 + }, + { + "epoch": 0.04141688076886478, + "grad_norm": 1.867090106010437, + "learning_rate": 4.9788735184269553e-05, + "loss": 5.2934, + "step": 6964 + }, + { + "epoch": 0.04142282805214578, + "grad_norm": 1.7208340167999268, + "learning_rate": 4.9788674583429974e-05, + "loss": 5.2116, + "step": 6965 + }, + { + "epoch": 0.04142877533542678, + "grad_norm": 1.934480905532837, + "learning_rate": 4.9788613973936916e-05, + "loss": 5.5801, + "step": 6966 + }, + { + "epoch": 0.04143472261870777, + "grad_norm": 1.6263724565505981, + "learning_rate": 4.978855335579041e-05, + "loss": 5.3835, + "step": 6967 + }, + { + "epoch": 0.041440669901988775, + "grad_norm": 1.743996262550354, + "learning_rate": 4.9788492728990474e-05, + "loss": 5.3281, + "step": 6968 + }, + { + "epoch": 0.04144661718526977, + "grad_norm": 1.5556843280792236, + "learning_rate": 4.978843209353714e-05, + "loss": 5.442, + "step": 6969 + }, + { + "epoch": 0.041452564468550765, + "grad_norm": 1.5540435314178467, + "learning_rate": 4.978837144943041e-05, + "loss": 5.3621, + "step": 6970 + }, + { + "epoch": 0.04145851175183176, + "grad_norm": 1.7884414196014404, + "learning_rate": 4.9788310796670326e-05, + "loss": 5.571, + "step": 6971 + }, + { + "epoch": 0.04146445903511276, + "grad_norm": 1.7550957202911377, + "learning_rate": 4.9788250135256886e-05, + "loss": 5.61, + "step": 6972 + }, + { + "epoch": 0.04147040631839376, + "grad_norm": 1.9336804151535034, + "learning_rate": 4.978818946519013e-05, + "loss": 5.6142, + "step": 6973 + }, + { + "epoch": 0.04147635360167475, + "grad_norm": 1.8888505697250366, + "learning_rate": 4.978812878647008e-05, + "loss": 5.4908, + "step": 6974 + }, + { + "epoch": 0.041482300884955754, + "grad_norm": 1.940371036529541, + "learning_rate": 4.978806809909674e-05, + "loss": 5.5407, + "step": 6975 + }, + { + "epoch": 0.04148824816823675, + "grad_norm": 2.0182151794433594, + "learning_rate": 4.9788007403070146e-05, + "loss": 5.3643, + "step": 6976 + }, + { + "epoch": 0.041494195451517744, + "grad_norm": 1.7960541248321533, + "learning_rate": 4.978794669839032e-05, + "loss": 5.4994, + "step": 6977 + }, + { + "epoch": 0.041500142734798746, + "grad_norm": 1.8403207063674927, + "learning_rate": 4.978788598505727e-05, + "loss": 5.4501, + "step": 6978 + }, + { + "epoch": 0.04150609001807974, + "grad_norm": 1.7232698202133179, + "learning_rate": 4.978782526307103e-05, + "loss": 5.5406, + "step": 6979 + }, + { + "epoch": 0.04151203730136074, + "grad_norm": 1.7003169059753418, + "learning_rate": 4.9787764532431615e-05, + "loss": 5.3427, + "step": 6980 + }, + { + "epoch": 0.04151798458464174, + "grad_norm": 2.041384696960449, + "learning_rate": 4.978770379313904e-05, + "loss": 5.5121, + "step": 6981 + }, + { + "epoch": 0.041523931867922734, + "grad_norm": 1.5773900747299194, + "learning_rate": 4.978764304519334e-05, + "loss": 5.4604, + "step": 6982 + }, + { + "epoch": 0.04152987915120373, + "grad_norm": 1.8834172487258911, + "learning_rate": 4.9787582288594535e-05, + "loss": 5.5141, + "step": 6983 + }, + { + "epoch": 0.04153582643448473, + "grad_norm": 1.7956576347351074, + "learning_rate": 4.978752152334264e-05, + "loss": 5.5664, + "step": 6984 + }, + { + "epoch": 0.041541773717765726, + "grad_norm": 1.8676495552062988, + "learning_rate": 4.978746074943767e-05, + "loss": 5.2846, + "step": 6985 + }, + { + "epoch": 0.04154772100104672, + "grad_norm": 1.7709665298461914, + "learning_rate": 4.9787399966879654e-05, + "loss": 5.3375, + "step": 6986 + }, + { + "epoch": 0.041553668284327716, + "grad_norm": 2.012941837310791, + "learning_rate": 4.978733917566862e-05, + "loss": 5.6973, + "step": 6987 + }, + { + "epoch": 0.04155961556760872, + "grad_norm": 1.8220570087432861, + "learning_rate": 4.978727837580458e-05, + "loss": 5.191, + "step": 6988 + }, + { + "epoch": 0.04156556285088971, + "grad_norm": 1.6511586904525757, + "learning_rate": 4.978721756728755e-05, + "loss": 5.2787, + "step": 6989 + }, + { + "epoch": 0.04157151013417071, + "grad_norm": 1.9026141166687012, + "learning_rate": 4.978715675011757e-05, + "loss": 5.4456, + "step": 6990 + }, + { + "epoch": 0.04157745741745171, + "grad_norm": 1.8649898767471313, + "learning_rate": 4.9787095924294633e-05, + "loss": 5.5013, + "step": 6991 + }, + { + "epoch": 0.041583404700732705, + "grad_norm": 1.8720741271972656, + "learning_rate": 4.978703508981879e-05, + "loss": 5.3952, + "step": 6992 + }, + { + "epoch": 0.0415893519840137, + "grad_norm": 1.817356824874878, + "learning_rate": 4.978697424669005e-05, + "loss": 5.4719, + "step": 6993 + }, + { + "epoch": 0.0415952992672947, + "grad_norm": 1.740702509880066, + "learning_rate": 4.978691339490843e-05, + "loss": 5.6484, + "step": 6994 + }, + { + "epoch": 0.0416012465505757, + "grad_norm": 1.8752427101135254, + "learning_rate": 4.978685253447395e-05, + "loss": 5.6394, + "step": 6995 + }, + { + "epoch": 0.04160719383385669, + "grad_norm": 1.8180509805679321, + "learning_rate": 4.978679166538665e-05, + "loss": 5.3401, + "step": 6996 + }, + { + "epoch": 0.041613141117137695, + "grad_norm": 1.9002251625061035, + "learning_rate": 4.9786730787646516e-05, + "loss": 5.3237, + "step": 6997 + }, + { + "epoch": 0.04161908840041869, + "grad_norm": 1.741176724433899, + "learning_rate": 4.978666990125361e-05, + "loss": 5.2311, + "step": 6998 + }, + { + "epoch": 0.041625035683699685, + "grad_norm": 2.0994246006011963, + "learning_rate": 4.9786609006207925e-05, + "loss": 5.3549, + "step": 6999 + }, + { + "epoch": 0.04163098296698068, + "grad_norm": 1.8438987731933594, + "learning_rate": 4.978654810250949e-05, + "loss": 5.4322, + "step": 7000 + }, + { + "epoch": 0.04163693025026168, + "grad_norm": 1.7411181926727295, + "learning_rate": 4.978648719015833e-05, + "loss": 5.455, + "step": 7001 + }, + { + "epoch": 0.04164287753354268, + "grad_norm": 1.6879174709320068, + "learning_rate": 4.978642626915446e-05, + "loss": 5.3676, + "step": 7002 + }, + { + "epoch": 0.04164882481682367, + "grad_norm": 1.8912461996078491, + "learning_rate": 4.9786365339497906e-05, + "loss": 5.6181, + "step": 7003 + }, + { + "epoch": 0.041654772100104674, + "grad_norm": 1.9234617948532104, + "learning_rate": 4.978630440118869e-05, + "loss": 5.5388, + "step": 7004 + }, + { + "epoch": 0.04166071938338567, + "grad_norm": 2.1059048175811768, + "learning_rate": 4.9786243454226824e-05, + "loss": 5.6856, + "step": 7005 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 2.1900687217712402, + "learning_rate": 4.9786182498612347e-05, + "loss": 6.2426, + "step": 7006 + }, + { + "epoch": 0.041672613949947666, + "grad_norm": 1.7580265998840332, + "learning_rate": 4.9786121534345265e-05, + "loss": 5.2342, + "step": 7007 + }, + { + "epoch": 0.04167856123322866, + "grad_norm": 1.4747200012207031, + "learning_rate": 4.97860605614256e-05, + "loss": 5.1977, + "step": 7008 + }, + { + "epoch": 0.04168450851650966, + "grad_norm": 1.8164165019989014, + "learning_rate": 4.978599957985338e-05, + "loss": 5.1362, + "step": 7009 + }, + { + "epoch": 0.04169045579979066, + "grad_norm": 1.468550443649292, + "learning_rate": 4.978593858962863e-05, + "loss": 5.1265, + "step": 7010 + }, + { + "epoch": 0.041696403083071654, + "grad_norm": 1.584343433380127, + "learning_rate": 4.9785877590751356e-05, + "loss": 5.2611, + "step": 7011 + }, + { + "epoch": 0.04170235036635265, + "grad_norm": 1.7864785194396973, + "learning_rate": 4.978581658322159e-05, + "loss": 5.5214, + "step": 7012 + }, + { + "epoch": 0.04170829764963365, + "grad_norm": 1.8359016180038452, + "learning_rate": 4.978575556703936e-05, + "loss": 5.3808, + "step": 7013 + }, + { + "epoch": 0.041714244932914646, + "grad_norm": 1.8298325538635254, + "learning_rate": 4.978569454220467e-05, + "loss": 5.5606, + "step": 7014 + }, + { + "epoch": 0.04172019221619564, + "grad_norm": 2.1555540561676025, + "learning_rate": 4.978563350871755e-05, + "loss": 5.6592, + "step": 7015 + }, + { + "epoch": 0.041726139499476636, + "grad_norm": 2.5251846313476562, + "learning_rate": 4.9785572466578026e-05, + "loss": 5.5771, + "step": 7016 + }, + { + "epoch": 0.04173208678275764, + "grad_norm": 1.7765661478042603, + "learning_rate": 4.9785511415786115e-05, + "loss": 5.5558, + "step": 7017 + }, + { + "epoch": 0.04173803406603863, + "grad_norm": 1.9711554050445557, + "learning_rate": 4.978545035634183e-05, + "loss": 5.5565, + "step": 7018 + }, + { + "epoch": 0.04174398134931963, + "grad_norm": 1.8080202341079712, + "learning_rate": 4.978538928824521e-05, + "loss": 5.5037, + "step": 7019 + }, + { + "epoch": 0.04174992863260063, + "grad_norm": 1.7506872415542603, + "learning_rate": 4.978532821149626e-05, + "loss": 5.3362, + "step": 7020 + }, + { + "epoch": 0.041755875915881625, + "grad_norm": 1.5606149435043335, + "learning_rate": 4.978526712609501e-05, + "loss": 5.3541, + "step": 7021 + }, + { + "epoch": 0.04176182319916262, + "grad_norm": 1.8840737342834473, + "learning_rate": 4.9785206032041476e-05, + "loss": 5.2315, + "step": 7022 + }, + { + "epoch": 0.04176777048244362, + "grad_norm": 2.118178606033325, + "learning_rate": 4.978514492933569e-05, + "loss": 5.6174, + "step": 7023 + }, + { + "epoch": 0.04177371776572462, + "grad_norm": 2.043907403945923, + "learning_rate": 4.978508381797766e-05, + "loss": 5.6272, + "step": 7024 + }, + { + "epoch": 0.04177966504900561, + "grad_norm": 1.764411211013794, + "learning_rate": 4.978502269796742e-05, + "loss": 5.6153, + "step": 7025 + }, + { + "epoch": 0.041785612332286615, + "grad_norm": 1.5760626792907715, + "learning_rate": 4.978496156930498e-05, + "loss": 5.5734, + "step": 7026 + }, + { + "epoch": 0.04179155961556761, + "grad_norm": 1.8857802152633667, + "learning_rate": 4.9784900431990366e-05, + "loss": 5.5295, + "step": 7027 + }, + { + "epoch": 0.041797506898848605, + "grad_norm": 1.7287275791168213, + "learning_rate": 4.97848392860236e-05, + "loss": 5.3175, + "step": 7028 + }, + { + "epoch": 0.0418034541821296, + "grad_norm": 1.915263295173645, + "learning_rate": 4.97847781314047e-05, + "loss": 5.4838, + "step": 7029 + }, + { + "epoch": 0.0418094014654106, + "grad_norm": 2.049435615539551, + "learning_rate": 4.97847169681337e-05, + "loss": 5.5508, + "step": 7030 + }, + { + "epoch": 0.0418153487486916, + "grad_norm": 1.8955415487289429, + "learning_rate": 4.97846557962106e-05, + "loss": 5.4618, + "step": 7031 + }, + { + "epoch": 0.04182129603197259, + "grad_norm": 1.8957183361053467, + "learning_rate": 4.978459461563543e-05, + "loss": 5.5293, + "step": 7032 + }, + { + "epoch": 0.041827243315253594, + "grad_norm": 2.050734043121338, + "learning_rate": 4.978453342640822e-05, + "loss": 5.8002, + "step": 7033 + }, + { + "epoch": 0.04183319059853459, + "grad_norm": 1.9867476224899292, + "learning_rate": 4.978447222852899e-05, + "loss": 5.466, + "step": 7034 + }, + { + "epoch": 0.041839137881815584, + "grad_norm": 1.7928507328033447, + "learning_rate": 4.978441102199775e-05, + "loss": 5.3312, + "step": 7035 + }, + { + "epoch": 0.041845085165096586, + "grad_norm": 1.7984018325805664, + "learning_rate": 4.978434980681453e-05, + "loss": 5.2936, + "step": 7036 + }, + { + "epoch": 0.04185103244837758, + "grad_norm": 1.8011672496795654, + "learning_rate": 4.9784288582979355e-05, + "loss": 5.484, + "step": 7037 + }, + { + "epoch": 0.041856979731658576, + "grad_norm": 1.9439928531646729, + "learning_rate": 4.9784227350492236e-05, + "loss": 5.4563, + "step": 7038 + }, + { + "epoch": 0.04186292701493958, + "grad_norm": 1.71321439743042, + "learning_rate": 4.97841661093532e-05, + "loss": 5.3909, + "step": 7039 + }, + { + "epoch": 0.041868874298220574, + "grad_norm": 1.629333734512329, + "learning_rate": 4.9784104859562266e-05, + "loss": 5.3112, + "step": 7040 + }, + { + "epoch": 0.04187482158150157, + "grad_norm": 1.5248417854309082, + "learning_rate": 4.9784043601119456e-05, + "loss": 5.3724, + "step": 7041 + }, + { + "epoch": 0.04188076886478257, + "grad_norm": 1.8886220455169678, + "learning_rate": 4.97839823340248e-05, + "loss": 5.443, + "step": 7042 + }, + { + "epoch": 0.041886716148063566, + "grad_norm": 1.5902595520019531, + "learning_rate": 4.9783921058278307e-05, + "loss": 5.4249, + "step": 7043 + }, + { + "epoch": 0.04189266343134456, + "grad_norm": 1.837579369544983, + "learning_rate": 4.978385977388e-05, + "loss": 5.3767, + "step": 7044 + }, + { + "epoch": 0.041898610714625556, + "grad_norm": 1.8306061029434204, + "learning_rate": 4.9783798480829905e-05, + "loss": 5.4206, + "step": 7045 + }, + { + "epoch": 0.04190455799790656, + "grad_norm": 1.6887965202331543, + "learning_rate": 4.9783737179128044e-05, + "loss": 5.5327, + "step": 7046 + }, + { + "epoch": 0.04191050528118755, + "grad_norm": 1.8081728219985962, + "learning_rate": 4.978367586877444e-05, + "loss": 5.4547, + "step": 7047 + }, + { + "epoch": 0.04191645256446855, + "grad_norm": 1.8341114521026611, + "learning_rate": 4.97836145497691e-05, + "loss": 5.4175, + "step": 7048 + }, + { + "epoch": 0.04192239984774955, + "grad_norm": 1.965240240097046, + "learning_rate": 4.978355322211207e-05, + "loss": 5.4253, + "step": 7049 + }, + { + "epoch": 0.041928347131030545, + "grad_norm": 1.7060484886169434, + "learning_rate": 4.9783491885803343e-05, + "loss": 5.3493, + "step": 7050 + }, + { + "epoch": 0.04193429441431154, + "grad_norm": 1.8203076124191284, + "learning_rate": 4.978343054084297e-05, + "loss": 5.4601, + "step": 7051 + }, + { + "epoch": 0.04194024169759254, + "grad_norm": 1.919954538345337, + "learning_rate": 4.9783369187230945e-05, + "loss": 5.4921, + "step": 7052 + }, + { + "epoch": 0.04194618898087354, + "grad_norm": 1.4519730806350708, + "learning_rate": 4.9783307824967306e-05, + "loss": 5.4922, + "step": 7053 + }, + { + "epoch": 0.04195213626415453, + "grad_norm": 1.8431898355484009, + "learning_rate": 4.9783246454052066e-05, + "loss": 5.384, + "step": 7054 + }, + { + "epoch": 0.041958083547435535, + "grad_norm": 1.5493370294570923, + "learning_rate": 4.978318507448526e-05, + "loss": 5.5294, + "step": 7055 + }, + { + "epoch": 0.04196403083071653, + "grad_norm": 1.6405844688415527, + "learning_rate": 4.97831236862669e-05, + "loss": 5.492, + "step": 7056 + }, + { + "epoch": 0.041969978113997525, + "grad_norm": 1.7830392122268677, + "learning_rate": 4.9783062289396996e-05, + "loss": 5.2977, + "step": 7057 + }, + { + "epoch": 0.04197592539727852, + "grad_norm": 1.8268102407455444, + "learning_rate": 4.9783000883875595e-05, + "loss": 5.3396, + "step": 7058 + }, + { + "epoch": 0.04198187268055952, + "grad_norm": 1.942901849746704, + "learning_rate": 4.9782939469702694e-05, + "loss": 5.3338, + "step": 7059 + }, + { + "epoch": 0.04198781996384052, + "grad_norm": 1.5793414115905762, + "learning_rate": 4.9782878046878334e-05, + "loss": 5.3286, + "step": 7060 + }, + { + "epoch": 0.04199376724712151, + "grad_norm": 1.5777463912963867, + "learning_rate": 4.9782816615402515e-05, + "loss": 5.2942, + "step": 7061 + }, + { + "epoch": 0.041999714530402514, + "grad_norm": 1.6393412351608276, + "learning_rate": 4.978275517527528e-05, + "loss": 5.2557, + "step": 7062 + }, + { + "epoch": 0.04200566181368351, + "grad_norm": 1.9657515287399292, + "learning_rate": 4.978269372649664e-05, + "loss": 5.3875, + "step": 7063 + }, + { + "epoch": 0.042011609096964504, + "grad_norm": 2.1419737339019775, + "learning_rate": 4.9782632269066623e-05, + "loss": 5.2014, + "step": 7064 + }, + { + "epoch": 0.042017556380245506, + "grad_norm": 2.0425620079040527, + "learning_rate": 4.978257080298523e-05, + "loss": 5.194, + "step": 7065 + }, + { + "epoch": 0.0420235036635265, + "grad_norm": 1.7248409986495972, + "learning_rate": 4.978250932825251e-05, + "loss": 5.1922, + "step": 7066 + }, + { + "epoch": 0.042029450946807496, + "grad_norm": 1.8265177011489868, + "learning_rate": 4.978244784486847e-05, + "loss": 5.4474, + "step": 7067 + }, + { + "epoch": 0.0420353982300885, + "grad_norm": 1.803701400756836, + "learning_rate": 4.9782386352833134e-05, + "loss": 6.2155, + "step": 7068 + }, + { + "epoch": 0.042041345513369494, + "grad_norm": 1.9970064163208008, + "learning_rate": 4.978232485214652e-05, + "loss": 5.3622, + "step": 7069 + }, + { + "epoch": 0.04204729279665049, + "grad_norm": 1.7449073791503906, + "learning_rate": 4.978226334280865e-05, + "loss": 5.3146, + "step": 7070 + }, + { + "epoch": 0.04205324007993149, + "grad_norm": 2.0284547805786133, + "learning_rate": 4.978220182481955e-05, + "loss": 5.0169, + "step": 7071 + }, + { + "epoch": 0.042059187363212486, + "grad_norm": 1.6801714897155762, + "learning_rate": 4.978214029817924e-05, + "loss": 5.1294, + "step": 7072 + }, + { + "epoch": 0.04206513464649348, + "grad_norm": 2.160585641860962, + "learning_rate": 4.978207876288774e-05, + "loss": 5.072, + "step": 7073 + }, + { + "epoch": 0.042071081929774476, + "grad_norm": 2.07739520072937, + "learning_rate": 4.978201721894508e-05, + "loss": 5.2065, + "step": 7074 + }, + { + "epoch": 0.04207702921305548, + "grad_norm": 2.1396286487579346, + "learning_rate": 4.978195566635127e-05, + "loss": 5.1066, + "step": 7075 + }, + { + "epoch": 0.04208297649633647, + "grad_norm": 1.883280634880066, + "learning_rate": 4.978189410510633e-05, + "loss": 5.2842, + "step": 7076 + }, + { + "epoch": 0.04208892377961747, + "grad_norm": 1.9917101860046387, + "learning_rate": 4.978183253521029e-05, + "loss": 5.0799, + "step": 7077 + }, + { + "epoch": 0.04209487106289847, + "grad_norm": 1.9387022256851196, + "learning_rate": 4.9781770956663164e-05, + "loss": 5.1898, + "step": 7078 + }, + { + "epoch": 0.042100818346179465, + "grad_norm": 1.9767060279846191, + "learning_rate": 4.978170936946498e-05, + "loss": 5.0692, + "step": 7079 + }, + { + "epoch": 0.04210676562946046, + "grad_norm": 2.0076138973236084, + "learning_rate": 4.978164777361576e-05, + "loss": 5.0255, + "step": 7080 + }, + { + "epoch": 0.04211271291274146, + "grad_norm": 1.8253445625305176, + "learning_rate": 4.978158616911552e-05, + "loss": 5.0111, + "step": 7081 + }, + { + "epoch": 0.04211866019602246, + "grad_norm": 1.6551930904388428, + "learning_rate": 4.978152455596429e-05, + "loss": 4.9849, + "step": 7082 + }, + { + "epoch": 0.04212460747930345, + "grad_norm": 1.8462406396865845, + "learning_rate": 4.9781462934162084e-05, + "loss": 5.0862, + "step": 7083 + }, + { + "epoch": 0.042130554762584455, + "grad_norm": 2.0828206539154053, + "learning_rate": 4.978140130370892e-05, + "loss": 5.031, + "step": 7084 + }, + { + "epoch": 0.04213650204586545, + "grad_norm": 1.7917357683181763, + "learning_rate": 4.978133966460483e-05, + "loss": 5.0028, + "step": 7085 + }, + { + "epoch": 0.042142449329146445, + "grad_norm": 1.7324126958847046, + "learning_rate": 4.9781278016849834e-05, + "loss": 4.9759, + "step": 7086 + }, + { + "epoch": 0.04214839661242744, + "grad_norm": 1.8673282861709595, + "learning_rate": 4.978121636044394e-05, + "loss": 5.3631, + "step": 7087 + }, + { + "epoch": 0.04215434389570844, + "grad_norm": 1.7723935842514038, + "learning_rate": 4.9781154695387186e-05, + "loss": 5.3427, + "step": 7088 + }, + { + "epoch": 0.04216029117898944, + "grad_norm": 1.4671146869659424, + "learning_rate": 4.978109302167958e-05, + "loss": 5.3003, + "step": 7089 + }, + { + "epoch": 0.04216623846227043, + "grad_norm": 1.9667481184005737, + "learning_rate": 4.9781031339321156e-05, + "loss": 5.0957, + "step": 7090 + }, + { + "epoch": 0.042172185745551434, + "grad_norm": 1.8162986040115356, + "learning_rate": 4.978096964831193e-05, + "loss": 5.1472, + "step": 7091 + }, + { + "epoch": 0.04217813302883243, + "grad_norm": 1.7793545722961426, + "learning_rate": 4.9780907948651926e-05, + "loss": 5.1771, + "step": 7092 + }, + { + "epoch": 0.042184080312113424, + "grad_norm": 1.8093308210372925, + "learning_rate": 4.9780846240341156e-05, + "loss": 5.1611, + "step": 7093 + }, + { + "epoch": 0.042190027595394426, + "grad_norm": 1.7010010480880737, + "learning_rate": 4.978078452337965e-05, + "loss": 5.4478, + "step": 7094 + }, + { + "epoch": 0.04219597487867542, + "grad_norm": 1.7978744506835938, + "learning_rate": 4.9780722797767434e-05, + "loss": 5.4443, + "step": 7095 + }, + { + "epoch": 0.042201922161956416, + "grad_norm": 1.4861794710159302, + "learning_rate": 4.9780661063504516e-05, + "loss": 5.3773, + "step": 7096 + }, + { + "epoch": 0.04220786944523742, + "grad_norm": 1.7805769443511963, + "learning_rate": 4.978059932059093e-05, + "loss": 5.0896, + "step": 7097 + }, + { + "epoch": 0.042213816728518413, + "grad_norm": 1.7392783164978027, + "learning_rate": 4.9780537569026695e-05, + "loss": 5.0602, + "step": 7098 + }, + { + "epoch": 0.04221976401179941, + "grad_norm": 1.8742554187774658, + "learning_rate": 4.978047580881182e-05, + "loss": 5.2595, + "step": 7099 + }, + { + "epoch": 0.04222571129508041, + "grad_norm": 1.6077641248703003, + "learning_rate": 4.978041403994635e-05, + "loss": 5.0925, + "step": 7100 + }, + { + "epoch": 0.042231658578361406, + "grad_norm": 1.7536481618881226, + "learning_rate": 4.9780352262430286e-05, + "loss": 5.2546, + "step": 7101 + }, + { + "epoch": 0.0422376058616424, + "grad_norm": 1.6404869556427002, + "learning_rate": 4.9780290476263656e-05, + "loss": 5.1349, + "step": 7102 + }, + { + "epoch": 0.042243553144923396, + "grad_norm": 1.7223635911941528, + "learning_rate": 4.978022868144649e-05, + "loss": 5.2894, + "step": 7103 + }, + { + "epoch": 0.0422495004282044, + "grad_norm": 1.7856663465499878, + "learning_rate": 4.9780166877978796e-05, + "loss": 5.384, + "step": 7104 + }, + { + "epoch": 0.04225544771148539, + "grad_norm": 1.6434816122055054, + "learning_rate": 4.978010506586061e-05, + "loss": 5.257, + "step": 7105 + }, + { + "epoch": 0.04226139499476639, + "grad_norm": 1.668371558189392, + "learning_rate": 4.9780043245091936e-05, + "loss": 5.2698, + "step": 7106 + }, + { + "epoch": 0.04226734227804739, + "grad_norm": 1.7553619146347046, + "learning_rate": 4.97799814156728e-05, + "loss": 5.1591, + "step": 7107 + }, + { + "epoch": 0.042273289561328385, + "grad_norm": 1.6918652057647705, + "learning_rate": 4.977991957760324e-05, + "loss": 5.2727, + "step": 7108 + }, + { + "epoch": 0.04227923684460938, + "grad_norm": 1.6634269952774048, + "learning_rate": 4.977985773088326e-05, + "loss": 5.3099, + "step": 7109 + }, + { + "epoch": 0.04228518412789038, + "grad_norm": 2.131647825241089, + "learning_rate": 4.977979587551289e-05, + "loss": 5.0885, + "step": 7110 + }, + { + "epoch": 0.04229113141117138, + "grad_norm": 1.6632722616195679, + "learning_rate": 4.977973401149215e-05, + "loss": 5.1546, + "step": 7111 + }, + { + "epoch": 0.04229707869445237, + "grad_norm": 1.762418270111084, + "learning_rate": 4.977967213882107e-05, + "loss": 5.0884, + "step": 7112 + }, + { + "epoch": 0.042303025977733374, + "grad_norm": 1.9325755834579468, + "learning_rate": 4.977961025749964e-05, + "loss": 5.1857, + "step": 7113 + }, + { + "epoch": 0.04230897326101437, + "grad_norm": 1.8359284400939941, + "learning_rate": 4.9779548367527926e-05, + "loss": 5.165, + "step": 7114 + }, + { + "epoch": 0.042314920544295365, + "grad_norm": 1.8305978775024414, + "learning_rate": 4.977948646890591e-05, + "loss": 5.1347, + "step": 7115 + }, + { + "epoch": 0.04232086782757636, + "grad_norm": 1.7374697923660278, + "learning_rate": 4.9779424561633644e-05, + "loss": 5.5219, + "step": 7116 + }, + { + "epoch": 0.04232681511085736, + "grad_norm": 1.9947689771652222, + "learning_rate": 4.9779362645711135e-05, + "loss": 5.4445, + "step": 7117 + }, + { + "epoch": 0.04233276239413836, + "grad_norm": 1.6639795303344727, + "learning_rate": 4.97793007211384e-05, + "loss": 5.3798, + "step": 7118 + }, + { + "epoch": 0.04233870967741935, + "grad_norm": 1.6983096599578857, + "learning_rate": 4.977923878791547e-05, + "loss": 5.2847, + "step": 7119 + }, + { + "epoch": 0.042344656960700354, + "grad_norm": 1.7397092580795288, + "learning_rate": 4.9779176846042366e-05, + "loss": 5.3175, + "step": 7120 + }, + { + "epoch": 0.04235060424398135, + "grad_norm": 1.5255639553070068, + "learning_rate": 4.977911489551911e-05, + "loss": 5.2735, + "step": 7121 + }, + { + "epoch": 0.042356551527262344, + "grad_norm": 1.5646785497665405, + "learning_rate": 4.9779052936345715e-05, + "loss": 5.3892, + "step": 7122 + }, + { + "epoch": 0.042362498810543346, + "grad_norm": 1.7479640245437622, + "learning_rate": 4.977899096852221e-05, + "loss": 5.4341, + "step": 7123 + }, + { + "epoch": 0.04236844609382434, + "grad_norm": 1.6275604963302612, + "learning_rate": 4.9778928992048615e-05, + "loss": 5.5209, + "step": 7124 + }, + { + "epoch": 0.042374393377105336, + "grad_norm": 1.6917749643325806, + "learning_rate": 4.977886700692496e-05, + "loss": 5.5779, + "step": 7125 + }, + { + "epoch": 0.04238034066038634, + "grad_norm": 1.683716058731079, + "learning_rate": 4.977880501315125e-05, + "loss": 5.475, + "step": 7126 + }, + { + "epoch": 0.04238628794366733, + "grad_norm": 1.7665706872940063, + "learning_rate": 4.977874301072751e-05, + "loss": 5.3666, + "step": 7127 + }, + { + "epoch": 0.04239223522694833, + "grad_norm": 1.715329885482788, + "learning_rate": 4.977868099965377e-05, + "loss": 5.407, + "step": 7128 + }, + { + "epoch": 0.04239818251022933, + "grad_norm": 1.8468618392944336, + "learning_rate": 4.977861897993006e-05, + "loss": 5.328, + "step": 7129 + }, + { + "epoch": 0.042404129793510326, + "grad_norm": 1.59178626537323, + "learning_rate": 4.977855695155638e-05, + "loss": 5.7797, + "step": 7130 + }, + { + "epoch": 0.04241007707679132, + "grad_norm": 1.4733757972717285, + "learning_rate": 4.977849491453277e-05, + "loss": 5.3019, + "step": 7131 + }, + { + "epoch": 0.042416024360072316, + "grad_norm": 1.4632091522216797, + "learning_rate": 4.977843286885923e-05, + "loss": 5.1754, + "step": 7132 + }, + { + "epoch": 0.04242197164335332, + "grad_norm": 1.530564308166504, + "learning_rate": 4.97783708145358e-05, + "loss": 5.3613, + "step": 7133 + }, + { + "epoch": 0.04242791892663431, + "grad_norm": 1.954219102859497, + "learning_rate": 4.97783087515625e-05, + "loss": 5.4013, + "step": 7134 + }, + { + "epoch": 0.04243386620991531, + "grad_norm": 1.8276890516281128, + "learning_rate": 4.977824667993935e-05, + "loss": 5.3611, + "step": 7135 + }, + { + "epoch": 0.04243981349319631, + "grad_norm": 2.1430561542510986, + "learning_rate": 4.977818459966637e-05, + "loss": 5.1501, + "step": 7136 + }, + { + "epoch": 0.042445760776477305, + "grad_norm": 1.9150115251541138, + "learning_rate": 4.977812251074357e-05, + "loss": 5.1778, + "step": 7137 + }, + { + "epoch": 0.0424517080597583, + "grad_norm": 1.6958523988723755, + "learning_rate": 4.9778060413171004e-05, + "loss": 5.5029, + "step": 7138 + }, + { + "epoch": 0.0424576553430393, + "grad_norm": 1.7183772325515747, + "learning_rate": 4.977799830694866e-05, + "loss": 5.4323, + "step": 7139 + }, + { + "epoch": 0.0424636026263203, + "grad_norm": 1.717731237411499, + "learning_rate": 4.977793619207657e-05, + "loss": 5.3418, + "step": 7140 + }, + { + "epoch": 0.04246954990960129, + "grad_norm": 1.8155564069747925, + "learning_rate": 4.9777874068554766e-05, + "loss": 5.2865, + "step": 7141 + }, + { + "epoch": 0.042475497192882294, + "grad_norm": 1.9890762567520142, + "learning_rate": 4.9777811936383254e-05, + "loss": 5.4101, + "step": 7142 + }, + { + "epoch": 0.04248144447616329, + "grad_norm": 1.8181748390197754, + "learning_rate": 4.977774979556207e-05, + "loss": 5.2719, + "step": 7143 + }, + { + "epoch": 0.042487391759444285, + "grad_norm": 1.7353019714355469, + "learning_rate": 4.9777687646091234e-05, + "loss": 5.4202, + "step": 7144 + }, + { + "epoch": 0.04249333904272528, + "grad_norm": 1.6121984720230103, + "learning_rate": 4.977762548797076e-05, + "loss": 5.3174, + "step": 7145 + }, + { + "epoch": 0.04249928632600628, + "grad_norm": 1.9579551219940186, + "learning_rate": 4.977756332120067e-05, + "loss": 5.135, + "step": 7146 + }, + { + "epoch": 0.04250523360928728, + "grad_norm": 1.9396319389343262, + "learning_rate": 4.977750114578099e-05, + "loss": 5.7521, + "step": 7147 + }, + { + "epoch": 0.04251118089256827, + "grad_norm": 1.8567198514938354, + "learning_rate": 4.977743896171173e-05, + "loss": 5.7521, + "step": 7148 + }, + { + "epoch": 0.042517128175849274, + "grad_norm": 2.139861583709717, + "learning_rate": 4.977737676899293e-05, + "loss": 5.472, + "step": 7149 + }, + { + "epoch": 0.04252307545913027, + "grad_norm": 1.6526445150375366, + "learning_rate": 4.977731456762461e-05, + "loss": 5.5557, + "step": 7150 + }, + { + "epoch": 0.042529022742411264, + "grad_norm": 1.7761725187301636, + "learning_rate": 4.9777252357606784e-05, + "loss": 5.1922, + "step": 7151 + }, + { + "epoch": 0.042534970025692266, + "grad_norm": 2.0894482135772705, + "learning_rate": 4.977719013893947e-05, + "loss": 5.5067, + "step": 7152 + }, + { + "epoch": 0.04254091730897326, + "grad_norm": 1.746470332145691, + "learning_rate": 4.97771279116227e-05, + "loss": 5.28, + "step": 7153 + }, + { + "epoch": 0.042546864592254256, + "grad_norm": 1.9258379936218262, + "learning_rate": 4.9777065675656484e-05, + "loss": 5.7223, + "step": 7154 + }, + { + "epoch": 0.04255281187553526, + "grad_norm": 1.9928748607635498, + "learning_rate": 4.977700343104086e-05, + "loss": 5.727, + "step": 7155 + }, + { + "epoch": 0.04255875915881625, + "grad_norm": 1.7435163259506226, + "learning_rate": 4.9776941177775824e-05, + "loss": 5.6636, + "step": 7156 + }, + { + "epoch": 0.04256470644209725, + "grad_norm": 1.6818004846572876, + "learning_rate": 4.977687891586143e-05, + "loss": 5.6589, + "step": 7157 + }, + { + "epoch": 0.04257065372537825, + "grad_norm": 1.812779426574707, + "learning_rate": 4.9776816645297676e-05, + "loss": 5.2705, + "step": 7158 + }, + { + "epoch": 0.042576601008659246, + "grad_norm": 1.7637232542037964, + "learning_rate": 4.977675436608459e-05, + "loss": 5.2872, + "step": 7159 + }, + { + "epoch": 0.04258254829194024, + "grad_norm": 1.9504014253616333, + "learning_rate": 4.97766920782222e-05, + "loss": 5.1324, + "step": 7160 + }, + { + "epoch": 0.042588495575221236, + "grad_norm": 1.7741994857788086, + "learning_rate": 4.9776629781710525e-05, + "loss": 5.4164, + "step": 7161 + }, + { + "epoch": 0.04259444285850224, + "grad_norm": 2.0005195140838623, + "learning_rate": 4.9776567476549576e-05, + "loss": 5.4667, + "step": 7162 + }, + { + "epoch": 0.04260039014178323, + "grad_norm": 2.256420612335205, + "learning_rate": 4.977650516273939e-05, + "loss": 5.1116, + "step": 7163 + }, + { + "epoch": 0.04260633742506423, + "grad_norm": 2.0806920528411865, + "learning_rate": 4.977644284027998e-05, + "loss": 5.2333, + "step": 7164 + }, + { + "epoch": 0.04261228470834523, + "grad_norm": 1.898760199546814, + "learning_rate": 4.9776380509171364e-05, + "loss": 5.4761, + "step": 7165 + }, + { + "epoch": 0.042618231991626225, + "grad_norm": 1.7251659631729126, + "learning_rate": 4.977631816941358e-05, + "loss": 5.5584, + "step": 7166 + }, + { + "epoch": 0.04262417927490722, + "grad_norm": 1.741645336151123, + "learning_rate": 4.977625582100664e-05, + "loss": 5.4133, + "step": 7167 + }, + { + "epoch": 0.04263012655818822, + "grad_norm": 1.921617031097412, + "learning_rate": 4.977619346395055e-05, + "loss": 5.1829, + "step": 7168 + }, + { + "epoch": 0.04263607384146922, + "grad_norm": 1.7597262859344482, + "learning_rate": 4.977613109824536e-05, + "loss": 5.1743, + "step": 7169 + }, + { + "epoch": 0.04264202112475021, + "grad_norm": 1.8069764375686646, + "learning_rate": 4.977606872389107e-05, + "loss": 5.4004, + "step": 7170 + }, + { + "epoch": 0.042647968408031214, + "grad_norm": 1.7694367170333862, + "learning_rate": 4.9776006340887714e-05, + "loss": 5.2018, + "step": 7171 + }, + { + "epoch": 0.04265391569131221, + "grad_norm": 1.8260759115219116, + "learning_rate": 4.9775943949235316e-05, + "loss": 5.4115, + "step": 7172 + }, + { + "epoch": 0.042659862974593205, + "grad_norm": 1.71034574508667, + "learning_rate": 4.9775881548933884e-05, + "loss": 5.2781, + "step": 7173 + }, + { + "epoch": 0.0426658102578742, + "grad_norm": 1.7208900451660156, + "learning_rate": 4.977581913998345e-05, + "loss": 5.4686, + "step": 7174 + }, + { + "epoch": 0.0426717575411552, + "grad_norm": 1.8545277118682861, + "learning_rate": 4.977575672238404e-05, + "loss": 5.4545, + "step": 7175 + }, + { + "epoch": 0.0426777048244362, + "grad_norm": 1.7892229557037354, + "learning_rate": 4.9775694296135656e-05, + "loss": 5.6612, + "step": 7176 + }, + { + "epoch": 0.04268365210771719, + "grad_norm": 1.8321889638900757, + "learning_rate": 4.9775631861238343e-05, + "loss": 5.5889, + "step": 7177 + }, + { + "epoch": 0.042689599390998194, + "grad_norm": 1.7925626039505005, + "learning_rate": 4.977556941769211e-05, + "loss": 5.6218, + "step": 7178 + }, + { + "epoch": 0.04269554667427919, + "grad_norm": 1.9650121927261353, + "learning_rate": 4.9775506965496984e-05, + "loss": 5.5228, + "step": 7179 + }, + { + "epoch": 0.042701493957560184, + "grad_norm": 1.9050647020339966, + "learning_rate": 4.977544450465298e-05, + "loss": 5.5547, + "step": 7180 + }, + { + "epoch": 0.042707441240841186, + "grad_norm": 1.8334670066833496, + "learning_rate": 4.977538203516013e-05, + "loss": 5.3895, + "step": 7181 + }, + { + "epoch": 0.04271338852412218, + "grad_norm": 1.803544521331787, + "learning_rate": 4.9775319557018444e-05, + "loss": 5.6288, + "step": 7182 + }, + { + "epoch": 0.042719335807403176, + "grad_norm": 1.823440432548523, + "learning_rate": 4.9775257070227956e-05, + "loss": 5.4996, + "step": 7183 + }, + { + "epoch": 0.04272528309068418, + "grad_norm": 1.9730159044265747, + "learning_rate": 4.977519457478868e-05, + "loss": 5.5004, + "step": 7184 + }, + { + "epoch": 0.04273123037396517, + "grad_norm": 1.9566004276275635, + "learning_rate": 4.977513207070064e-05, + "loss": 5.5496, + "step": 7185 + }, + { + "epoch": 0.04273717765724617, + "grad_norm": 2.0958995819091797, + "learning_rate": 4.977506955796385e-05, + "loss": 5.5256, + "step": 7186 + }, + { + "epoch": 0.04274312494052717, + "grad_norm": 1.8957890272140503, + "learning_rate": 4.977500703657835e-05, + "loss": 5.3337, + "step": 7187 + }, + { + "epoch": 0.042749072223808166, + "grad_norm": 1.8224141597747803, + "learning_rate": 4.977494450654414e-05, + "loss": 5.1362, + "step": 7188 + }, + { + "epoch": 0.04275501950708916, + "grad_norm": 1.648296594619751, + "learning_rate": 4.977488196786126e-05, + "loss": 5.3398, + "step": 7189 + }, + { + "epoch": 0.042760966790370156, + "grad_norm": 1.6238311529159546, + "learning_rate": 4.977481942052972e-05, + "loss": 5.2083, + "step": 7190 + }, + { + "epoch": 0.04276691407365116, + "grad_norm": 1.7399996519088745, + "learning_rate": 4.977475686454956e-05, + "loss": 5.2403, + "step": 7191 + }, + { + "epoch": 0.04277286135693215, + "grad_norm": 1.7260342836380005, + "learning_rate": 4.977469429992077e-05, + "loss": 5.2282, + "step": 7192 + }, + { + "epoch": 0.04277880864021315, + "grad_norm": 4.4954447746276855, + "learning_rate": 4.9774631726643396e-05, + "loss": 5.1044, + "step": 7193 + }, + { + "epoch": 0.04278475592349415, + "grad_norm": 1.879869818687439, + "learning_rate": 4.977456914471746e-05, + "loss": 5.3431, + "step": 7194 + }, + { + "epoch": 0.042790703206775145, + "grad_norm": 1.8826582431793213, + "learning_rate": 4.977450655414297e-05, + "loss": 5.2951, + "step": 7195 + }, + { + "epoch": 0.04279665049005614, + "grad_norm": 1.8973712921142578, + "learning_rate": 4.977444395491996e-05, + "loss": 5.343, + "step": 7196 + }, + { + "epoch": 0.04280259777333714, + "grad_norm": 1.6125551462173462, + "learning_rate": 4.977438134704845e-05, + "loss": 5.2849, + "step": 7197 + }, + { + "epoch": 0.04280854505661814, + "grad_norm": 1.441159963607788, + "learning_rate": 4.9774318730528456e-05, + "loss": 5.2955, + "step": 7198 + }, + { + "epoch": 0.04281449233989913, + "grad_norm": 1.9655884504318237, + "learning_rate": 4.9774256105360004e-05, + "loss": 5.2093, + "step": 7199 + }, + { + "epoch": 0.042820439623180134, + "grad_norm": 1.7824043035507202, + "learning_rate": 4.9774193471543116e-05, + "loss": 5.2105, + "step": 7200 + }, + { + "epoch": 0.04282638690646113, + "grad_norm": 1.8331031799316406, + "learning_rate": 4.977413082907781e-05, + "loss": 5.3359, + "step": 7201 + }, + { + "epoch": 0.042832334189742124, + "grad_norm": 1.8695242404937744, + "learning_rate": 4.977406817796412e-05, + "loss": 5.3686, + "step": 7202 + }, + { + "epoch": 0.042838281473023126, + "grad_norm": 1.70205557346344, + "learning_rate": 4.977400551820205e-05, + "loss": 5.2689, + "step": 7203 + }, + { + "epoch": 0.04284422875630412, + "grad_norm": 1.700307846069336, + "learning_rate": 4.9773942849791635e-05, + "loss": 5.3946, + "step": 7204 + }, + { + "epoch": 0.04285017603958512, + "grad_norm": 1.625637173652649, + "learning_rate": 4.977388017273288e-05, + "loss": 5.095, + "step": 7205 + }, + { + "epoch": 0.04285612332286611, + "grad_norm": 1.7689390182495117, + "learning_rate": 4.977381748702583e-05, + "loss": 5.0097, + "step": 7206 + }, + { + "epoch": 0.042862070606147114, + "grad_norm": 1.856493353843689, + "learning_rate": 4.97737547926705e-05, + "loss": 5.0551, + "step": 7207 + }, + { + "epoch": 0.04286801788942811, + "grad_norm": 1.6497242450714111, + "learning_rate": 4.97736920896669e-05, + "loss": 5.031, + "step": 7208 + }, + { + "epoch": 0.042873965172709104, + "grad_norm": 1.5884608030319214, + "learning_rate": 4.977362937801506e-05, + "loss": 5.0758, + "step": 7209 + }, + { + "epoch": 0.042879912455990106, + "grad_norm": 1.5206499099731445, + "learning_rate": 4.9773566657715006e-05, + "loss": 5.049, + "step": 7210 + }, + { + "epoch": 0.0428858597392711, + "grad_norm": 1.7026933431625366, + "learning_rate": 4.977350392876676e-05, + "loss": 5.001, + "step": 7211 + }, + { + "epoch": 0.042891807022552096, + "grad_norm": 1.4197289943695068, + "learning_rate": 4.977344119117034e-05, + "loss": 5.0446, + "step": 7212 + }, + { + "epoch": 0.0428977543058331, + "grad_norm": 1.498713731765747, + "learning_rate": 4.977337844492576e-05, + "loss": 5.0574, + "step": 7213 + }, + { + "epoch": 0.04290370158911409, + "grad_norm": 1.7583528757095337, + "learning_rate": 4.9773315690033054e-05, + "loss": 4.994, + "step": 7214 + }, + { + "epoch": 0.04290964887239509, + "grad_norm": 1.8511004447937012, + "learning_rate": 4.9773252926492236e-05, + "loss": 4.9888, + "step": 7215 + }, + { + "epoch": 0.04291559615567609, + "grad_norm": 1.5799078941345215, + "learning_rate": 4.9773190154303334e-05, + "loss": 5.0028, + "step": 7216 + }, + { + "epoch": 0.042921543438957085, + "grad_norm": 1.6737205982208252, + "learning_rate": 4.977312737346637e-05, + "loss": 5.0701, + "step": 7217 + }, + { + "epoch": 0.04292749072223808, + "grad_norm": 1.537049412727356, + "learning_rate": 4.977306458398136e-05, + "loss": 5.0747, + "step": 7218 + }, + { + "epoch": 0.042933438005519076, + "grad_norm": 1.7501899003982544, + "learning_rate": 4.977300178584833e-05, + "loss": 5.0172, + "step": 7219 + }, + { + "epoch": 0.04293938528880008, + "grad_norm": 1.5130890607833862, + "learning_rate": 4.9772938979067294e-05, + "loss": 5.0196, + "step": 7220 + }, + { + "epoch": 0.04294533257208107, + "grad_norm": 1.628053903579712, + "learning_rate": 4.977287616363829e-05, + "loss": 5.0526, + "step": 7221 + }, + { + "epoch": 0.04295127985536207, + "grad_norm": 1.6736811399459839, + "learning_rate": 4.977281333956133e-05, + "loss": 5.0093, + "step": 7222 + }, + { + "epoch": 0.04295722713864307, + "grad_norm": 1.6157552003860474, + "learning_rate": 4.977275050683643e-05, + "loss": 4.9562, + "step": 7223 + }, + { + "epoch": 0.042963174421924065, + "grad_norm": 1.6699459552764893, + "learning_rate": 4.9772687665463625e-05, + "loss": 4.9603, + "step": 7224 + }, + { + "epoch": 0.04296912170520506, + "grad_norm": 1.4698256254196167, + "learning_rate": 4.9772624815442925e-05, + "loss": 4.9908, + "step": 7225 + }, + { + "epoch": 0.04297506898848606, + "grad_norm": 1.5310906171798706, + "learning_rate": 4.9772561956774365e-05, + "loss": 5.0081, + "step": 7226 + }, + { + "epoch": 0.04298101627176706, + "grad_norm": 1.6135941743850708, + "learning_rate": 4.977249908945795e-05, + "loss": 5.1394, + "step": 7227 + }, + { + "epoch": 0.04298696355504805, + "grad_norm": 1.7632607221603394, + "learning_rate": 4.977243621349372e-05, + "loss": 4.9992, + "step": 7228 + }, + { + "epoch": 0.042992910838329054, + "grad_norm": 1.574826955795288, + "learning_rate": 4.977237332888168e-05, + "loss": 4.9361, + "step": 7229 + }, + { + "epoch": 0.04299885812161005, + "grad_norm": 1.6633859872817993, + "learning_rate": 4.9772310435621874e-05, + "loss": 4.9085, + "step": 7230 + }, + { + "epoch": 0.043004805404891044, + "grad_norm": 1.6180634498596191, + "learning_rate": 4.97722475337143e-05, + "loss": 4.939, + "step": 7231 + }, + { + "epoch": 0.043010752688172046, + "grad_norm": 1.959694266319275, + "learning_rate": 4.9772184623158996e-05, + "loss": 5.231, + "step": 7232 + }, + { + "epoch": 0.04301669997145304, + "grad_norm": 1.6264785528182983, + "learning_rate": 4.977212170395598e-05, + "loss": 5.3228, + "step": 7233 + }, + { + "epoch": 0.04302264725473404, + "grad_norm": 2.109292507171631, + "learning_rate": 4.9772058776105264e-05, + "loss": 5.4579, + "step": 7234 + }, + { + "epoch": 0.04302859453801503, + "grad_norm": 1.991877555847168, + "learning_rate": 4.977199583960688e-05, + "loss": 5.355, + "step": 7235 + }, + { + "epoch": 0.043034541821296034, + "grad_norm": 2.23330020904541, + "learning_rate": 4.977193289446085e-05, + "loss": 5.3233, + "step": 7236 + }, + { + "epoch": 0.04304048910457703, + "grad_norm": 2.077359914779663, + "learning_rate": 4.9771869940667194e-05, + "loss": 5.2003, + "step": 7237 + }, + { + "epoch": 0.043046436387858024, + "grad_norm": 1.652498722076416, + "learning_rate": 4.977180697822593e-05, + "loss": 5.0232, + "step": 7238 + }, + { + "epoch": 0.043052383671139026, + "grad_norm": 1.9277194738388062, + "learning_rate": 4.977174400713709e-05, + "loss": 5.3826, + "step": 7239 + }, + { + "epoch": 0.04305833095442002, + "grad_norm": 1.9263273477554321, + "learning_rate": 4.9771681027400694e-05, + "loss": 5.5258, + "step": 7240 + }, + { + "epoch": 0.043064278237701016, + "grad_norm": 2.066934108734131, + "learning_rate": 4.9771618039016756e-05, + "loss": 5.6398, + "step": 7241 + }, + { + "epoch": 0.04307022552098202, + "grad_norm": 1.7810741662979126, + "learning_rate": 4.9771555041985295e-05, + "loss": 5.3716, + "step": 7242 + }, + { + "epoch": 0.04307617280426301, + "grad_norm": 1.7068313360214233, + "learning_rate": 4.977149203630635e-05, + "loss": 5.4042, + "step": 7243 + }, + { + "epoch": 0.04308212008754401, + "grad_norm": 1.8587994575500488, + "learning_rate": 4.977142902197992e-05, + "loss": 5.3635, + "step": 7244 + }, + { + "epoch": 0.04308806737082501, + "grad_norm": 2.101649284362793, + "learning_rate": 4.9771365999006054e-05, + "loss": 5.5292, + "step": 7245 + }, + { + "epoch": 0.043094014654106005, + "grad_norm": 1.8571972846984863, + "learning_rate": 4.9771302967384756e-05, + "loss": 5.4577, + "step": 7246 + }, + { + "epoch": 0.043099961937387, + "grad_norm": 1.9837383031845093, + "learning_rate": 4.9771239927116045e-05, + "loss": 5.4976, + "step": 7247 + }, + { + "epoch": 0.043105909220667996, + "grad_norm": 1.7688343524932861, + "learning_rate": 4.977117687819996e-05, + "loss": 5.448, + "step": 7248 + }, + { + "epoch": 0.043111856503949, + "grad_norm": 1.923824429512024, + "learning_rate": 4.9771113820636505e-05, + "loss": 5.3436, + "step": 7249 + }, + { + "epoch": 0.04311780378722999, + "grad_norm": 1.4405949115753174, + "learning_rate": 4.9771050754425715e-05, + "loss": 5.2751, + "step": 7250 + }, + { + "epoch": 0.04312375107051099, + "grad_norm": 1.7337450981140137, + "learning_rate": 4.977098767956761e-05, + "loss": 5.4693, + "step": 7251 + }, + { + "epoch": 0.04312969835379199, + "grad_norm": 2.063887119293213, + "learning_rate": 4.977092459606221e-05, + "loss": 5.4576, + "step": 7252 + }, + { + "epoch": 0.043135645637072985, + "grad_norm": 1.576517105102539, + "learning_rate": 4.9770861503909524e-05, + "loss": 5.4052, + "step": 7253 + }, + { + "epoch": 0.04314159292035398, + "grad_norm": 1.8137834072113037, + "learning_rate": 4.9770798403109596e-05, + "loss": 5.5732, + "step": 7254 + }, + { + "epoch": 0.04314754020363498, + "grad_norm": 1.7954564094543457, + "learning_rate": 4.977073529366244e-05, + "loss": 5.4213, + "step": 7255 + }, + { + "epoch": 0.04315348748691598, + "grad_norm": 1.993961215019226, + "learning_rate": 4.977067217556807e-05, + "loss": 5.2909, + "step": 7256 + }, + { + "epoch": 0.04315943477019697, + "grad_norm": 1.6993632316589355, + "learning_rate": 4.977060904882651e-05, + "loss": 5.4523, + "step": 7257 + }, + { + "epoch": 0.043165382053477974, + "grad_norm": 1.8541932106018066, + "learning_rate": 4.977054591343779e-05, + "loss": 5.3182, + "step": 7258 + }, + { + "epoch": 0.04317132933675897, + "grad_norm": 1.7425625324249268, + "learning_rate": 4.9770482769401935e-05, + "loss": 5.2527, + "step": 7259 + }, + { + "epoch": 0.043177276620039964, + "grad_norm": 1.7028024196624756, + "learning_rate": 4.9770419616718955e-05, + "loss": 5.1305, + "step": 7260 + }, + { + "epoch": 0.043183223903320966, + "grad_norm": 1.745316982269287, + "learning_rate": 4.977035645538888e-05, + "loss": 5.0368, + "step": 7261 + }, + { + "epoch": 0.04318917118660196, + "grad_norm": 1.8373509645462036, + "learning_rate": 4.977029328541173e-05, + "loss": 5.353, + "step": 7262 + }, + { + "epoch": 0.04319511846988296, + "grad_norm": 1.9976449012756348, + "learning_rate": 4.9770230106787526e-05, + "loss": 5.363, + "step": 7263 + }, + { + "epoch": 0.04320106575316395, + "grad_norm": 1.7109822034835815, + "learning_rate": 4.977016691951629e-05, + "loss": 5.3462, + "step": 7264 + }, + { + "epoch": 0.043207013036444954, + "grad_norm": 1.8688478469848633, + "learning_rate": 4.9770103723598036e-05, + "loss": 5.3564, + "step": 7265 + }, + { + "epoch": 0.04321296031972595, + "grad_norm": 1.8680217266082764, + "learning_rate": 4.9770040519032804e-05, + "loss": 5.2713, + "step": 7266 + }, + { + "epoch": 0.043218907603006944, + "grad_norm": 1.8022522926330566, + "learning_rate": 4.976997730582061e-05, + "loss": 5.153, + "step": 7267 + }, + { + "epoch": 0.043224854886287946, + "grad_norm": 1.7128162384033203, + "learning_rate": 4.976991408396147e-05, + "loss": 5.3107, + "step": 7268 + }, + { + "epoch": 0.04323080216956894, + "grad_norm": 1.8222606182098389, + "learning_rate": 4.9769850853455404e-05, + "loss": 5.3599, + "step": 7269 + }, + { + "epoch": 0.043236749452849936, + "grad_norm": 1.829373836517334, + "learning_rate": 4.976978761430244e-05, + "loss": 5.3991, + "step": 7270 + }, + { + "epoch": 0.04324269673613094, + "grad_norm": 1.8270717859268188, + "learning_rate": 4.97697243665026e-05, + "loss": 5.2434, + "step": 7271 + }, + { + "epoch": 0.04324864401941193, + "grad_norm": 1.9759695529937744, + "learning_rate": 4.976966111005591e-05, + "loss": 5.4585, + "step": 7272 + }, + { + "epoch": 0.04325459130269293, + "grad_norm": 2.0235564708709717, + "learning_rate": 4.9769597844962376e-05, + "loss": 5.3996, + "step": 7273 + }, + { + "epoch": 0.04326053858597393, + "grad_norm": 1.9220880270004272, + "learning_rate": 4.976953457122204e-05, + "loss": 5.344, + "step": 7274 + }, + { + "epoch": 0.043266485869254925, + "grad_norm": 1.6257338523864746, + "learning_rate": 4.976947128883492e-05, + "loss": 5.4012, + "step": 7275 + }, + { + "epoch": 0.04327243315253592, + "grad_norm": 1.6390771865844727, + "learning_rate": 4.976940799780103e-05, + "loss": 5.3693, + "step": 7276 + }, + { + "epoch": 0.043278380435816916, + "grad_norm": 1.5769712924957275, + "learning_rate": 4.976934469812039e-05, + "loss": 5.3214, + "step": 7277 + }, + { + "epoch": 0.04328432771909792, + "grad_norm": 1.539920687675476, + "learning_rate": 4.9769281389793035e-05, + "loss": 5.2784, + "step": 7278 + }, + { + "epoch": 0.04329027500237891, + "grad_norm": 1.662835717201233, + "learning_rate": 4.976921807281897e-05, + "loss": 5.2717, + "step": 7279 + }, + { + "epoch": 0.04329622228565991, + "grad_norm": 1.3613345623016357, + "learning_rate": 4.9769154747198234e-05, + "loss": 5.4241, + "step": 7280 + }, + { + "epoch": 0.04330216956894091, + "grad_norm": 1.5267658233642578, + "learning_rate": 4.976909141293084e-05, + "loss": 5.454, + "step": 7281 + }, + { + "epoch": 0.043308116852221905, + "grad_norm": 1.5050435066223145, + "learning_rate": 4.976902807001681e-05, + "loss": 5.4975, + "step": 7282 + }, + { + "epoch": 0.0433140641355029, + "grad_norm": 1.292698621749878, + "learning_rate": 4.976896471845617e-05, + "loss": 5.4071, + "step": 7283 + }, + { + "epoch": 0.0433200114187839, + "grad_norm": 1.6818265914916992, + "learning_rate": 4.9768901358248946e-05, + "loss": 5.3561, + "step": 7284 + }, + { + "epoch": 0.0433259587020649, + "grad_norm": 1.5995383262634277, + "learning_rate": 4.976883798939515e-05, + "loss": 5.2623, + "step": 7285 + }, + { + "epoch": 0.04333190598534589, + "grad_norm": 1.6959342956542969, + "learning_rate": 4.976877461189481e-05, + "loss": 5.3193, + "step": 7286 + }, + { + "epoch": 0.043337853268626894, + "grad_norm": 1.6978071928024292, + "learning_rate": 4.976871122574794e-05, + "loss": 5.5653, + "step": 7287 + }, + { + "epoch": 0.04334380055190789, + "grad_norm": 1.7587183713912964, + "learning_rate": 4.976864783095457e-05, + "loss": 5.545, + "step": 7288 + }, + { + "epoch": 0.043349747835188884, + "grad_norm": 1.6225430965423584, + "learning_rate": 4.976858442751473e-05, + "loss": 5.5804, + "step": 7289 + }, + { + "epoch": 0.043355695118469886, + "grad_norm": 1.5895410776138306, + "learning_rate": 4.976852101542843e-05, + "loss": 5.4798, + "step": 7290 + }, + { + "epoch": 0.04336164240175088, + "grad_norm": 1.759022831916809, + "learning_rate": 4.976845759469569e-05, + "loss": 5.4794, + "step": 7291 + }, + { + "epoch": 0.043367589685031877, + "grad_norm": 1.483383059501648, + "learning_rate": 4.976839416531654e-05, + "loss": 5.2547, + "step": 7292 + }, + { + "epoch": 0.04337353696831287, + "grad_norm": 2.136172294616699, + "learning_rate": 4.9768330727291e-05, + "loss": 5.1655, + "step": 7293 + }, + { + "epoch": 0.043379484251593874, + "grad_norm": 1.9202553033828735, + "learning_rate": 4.9768267280619094e-05, + "loss": 5.1945, + "step": 7294 + }, + { + "epoch": 0.04338543153487487, + "grad_norm": 1.7927708625793457, + "learning_rate": 4.976820382530084e-05, + "loss": 5.4936, + "step": 7295 + }, + { + "epoch": 0.043391378818155864, + "grad_norm": 1.597887396812439, + "learning_rate": 4.976814036133626e-05, + "loss": 5.5516, + "step": 7296 + }, + { + "epoch": 0.043397326101436866, + "grad_norm": 1.493356466293335, + "learning_rate": 4.9768076888725376e-05, + "loss": 5.552, + "step": 7297 + }, + { + "epoch": 0.04340327338471786, + "grad_norm": 1.6748720407485962, + "learning_rate": 4.976801340746822e-05, + "loss": 5.3957, + "step": 7298 + }, + { + "epoch": 0.043409220667998856, + "grad_norm": 1.541945457458496, + "learning_rate": 4.9767949917564794e-05, + "loss": 5.5558, + "step": 7299 + }, + { + "epoch": 0.04341516795127986, + "grad_norm": 1.6436586380004883, + "learning_rate": 4.976788641901514e-05, + "loss": 5.4918, + "step": 7300 + }, + { + "epoch": 0.04342111523456085, + "grad_norm": 1.69910728931427, + "learning_rate": 4.9767822911819274e-05, + "loss": 5.4688, + "step": 7301 + }, + { + "epoch": 0.04342706251784185, + "grad_norm": 1.8294274806976318, + "learning_rate": 4.976775939597721e-05, + "loss": 5.505, + "step": 7302 + }, + { + "epoch": 0.04343300980112285, + "grad_norm": 1.720880389213562, + "learning_rate": 4.976769587148899e-05, + "loss": 5.3509, + "step": 7303 + }, + { + "epoch": 0.043438957084403845, + "grad_norm": 1.5898194313049316, + "learning_rate": 4.976763233835461e-05, + "loss": 5.2955, + "step": 7304 + }, + { + "epoch": 0.04344490436768484, + "grad_norm": 1.569218397140503, + "learning_rate": 4.976756879657412e-05, + "loss": 5.5695, + "step": 7305 + }, + { + "epoch": 0.043450851650965835, + "grad_norm": 1.5551841259002686, + "learning_rate": 4.976750524614752e-05, + "loss": 5.5313, + "step": 7306 + }, + { + "epoch": 0.04345679893424684, + "grad_norm": 1.5870057344436646, + "learning_rate": 4.9767441687074834e-05, + "loss": 5.7525, + "step": 7307 + }, + { + "epoch": 0.04346274621752783, + "grad_norm": 1.5421022176742554, + "learning_rate": 4.97673781193561e-05, + "loss": 5.6176, + "step": 7308 + }, + { + "epoch": 0.04346869350080883, + "grad_norm": 1.9368326663970947, + "learning_rate": 4.976731454299132e-05, + "loss": 5.4239, + "step": 7309 + }, + { + "epoch": 0.04347464078408983, + "grad_norm": 1.719084620475769, + "learning_rate": 4.976725095798053e-05, + "loss": 5.3526, + "step": 7310 + }, + { + "epoch": 0.043480588067370825, + "grad_norm": 1.8004268407821655, + "learning_rate": 4.9767187364323756e-05, + "loss": 5.7112, + "step": 7311 + }, + { + "epoch": 0.04348653535065182, + "grad_norm": 1.9922735691070557, + "learning_rate": 4.9767123762021003e-05, + "loss": 5.4993, + "step": 7312 + }, + { + "epoch": 0.04349248263393282, + "grad_norm": 1.6768959760665894, + "learning_rate": 4.976706015107231e-05, + "loss": 5.4713, + "step": 7313 + }, + { + "epoch": 0.04349842991721382, + "grad_norm": 1.6070122718811035, + "learning_rate": 4.976699653147768e-05, + "loss": 5.4695, + "step": 7314 + }, + { + "epoch": 0.04350437720049481, + "grad_norm": 1.5641200542449951, + "learning_rate": 4.976693290323716e-05, + "loss": 5.3596, + "step": 7315 + }, + { + "epoch": 0.043510324483775814, + "grad_norm": 3.0344419479370117, + "learning_rate": 4.976686926635076e-05, + "loss": 5.7371, + "step": 7316 + }, + { + "epoch": 0.04351627176705681, + "grad_norm": 1.8784242868423462, + "learning_rate": 4.9766805620818494e-05, + "loss": 5.5142, + "step": 7317 + }, + { + "epoch": 0.043522219050337804, + "grad_norm": 2.0644166469573975, + "learning_rate": 4.9766741966640394e-05, + "loss": 5.276, + "step": 7318 + }, + { + "epoch": 0.043528166333618806, + "grad_norm": 1.8128771781921387, + "learning_rate": 4.976667830381649e-05, + "loss": 5.3515, + "step": 7319 + }, + { + "epoch": 0.0435341136168998, + "grad_norm": 1.8899081945419312, + "learning_rate": 4.9766614632346786e-05, + "loss": 5.3981, + "step": 7320 + }, + { + "epoch": 0.043540060900180796, + "grad_norm": 1.89181649684906, + "learning_rate": 4.976655095223131e-05, + "loss": 5.4378, + "step": 7321 + }, + { + "epoch": 0.04354600818346179, + "grad_norm": 1.6332184076309204, + "learning_rate": 4.976648726347009e-05, + "loss": 5.4023, + "step": 7322 + }, + { + "epoch": 0.043551955466742794, + "grad_norm": 2.3459293842315674, + "learning_rate": 4.976642356606315e-05, + "loss": 5.8375, + "step": 7323 + }, + { + "epoch": 0.04355790275002379, + "grad_norm": 2.029244899749756, + "learning_rate": 4.97663598600105e-05, + "loss": 5.5617, + "step": 7324 + }, + { + "epoch": 0.043563850033304784, + "grad_norm": 2.138946056365967, + "learning_rate": 4.9766296145312175e-05, + "loss": 5.5076, + "step": 7325 + }, + { + "epoch": 0.043569797316585786, + "grad_norm": 1.8702884912490845, + "learning_rate": 4.9766232421968184e-05, + "loss": 5.123, + "step": 7326 + }, + { + "epoch": 0.04357574459986678, + "grad_norm": 1.8917137384414673, + "learning_rate": 4.976616868997856e-05, + "loss": 5.4809, + "step": 7327 + }, + { + "epoch": 0.043581691883147776, + "grad_norm": 2.2203474044799805, + "learning_rate": 4.976610494934333e-05, + "loss": 5.6359, + "step": 7328 + }, + { + "epoch": 0.04358763916642878, + "grad_norm": 2.4505302906036377, + "learning_rate": 4.976604120006251e-05, + "loss": 6.1423, + "step": 7329 + }, + { + "epoch": 0.04359358644970977, + "grad_norm": 2.4601128101348877, + "learning_rate": 4.976597744213611e-05, + "loss": 6.0908, + "step": 7330 + }, + { + "epoch": 0.04359953373299077, + "grad_norm": 1.9502687454223633, + "learning_rate": 4.976591367556417e-05, + "loss": 5.918, + "step": 7331 + }, + { + "epoch": 0.04360548101627177, + "grad_norm": 2.180250644683838, + "learning_rate": 4.9765849900346696e-05, + "loss": 5.7203, + "step": 7332 + }, + { + "epoch": 0.043611428299552765, + "grad_norm": 2.125669002532959, + "learning_rate": 4.9765786116483726e-05, + "loss": 5.7875, + "step": 7333 + }, + { + "epoch": 0.04361737558283376, + "grad_norm": 2.0372321605682373, + "learning_rate": 4.9765722323975286e-05, + "loss": 5.6777, + "step": 7334 + }, + { + "epoch": 0.043623322866114755, + "grad_norm": 2.5857362747192383, + "learning_rate": 4.976565852282137e-05, + "loss": 5.2989, + "step": 7335 + }, + { + "epoch": 0.04362927014939576, + "grad_norm": 2.5774800777435303, + "learning_rate": 4.976559471302203e-05, + "loss": 6.0479, + "step": 7336 + }, + { + "epoch": 0.04363521743267675, + "grad_norm": 2.0820937156677246, + "learning_rate": 4.976553089457727e-05, + "loss": 5.7636, + "step": 7337 + }, + { + "epoch": 0.04364116471595775, + "grad_norm": 2.287719964981079, + "learning_rate": 4.9765467067487126e-05, + "loss": 5.7706, + "step": 7338 + }, + { + "epoch": 0.04364711199923875, + "grad_norm": 2.6578378677368164, + "learning_rate": 4.9765403231751614e-05, + "loss": 6.1506, + "step": 7339 + }, + { + "epoch": 0.043653059282519745, + "grad_norm": 2.503955841064453, + "learning_rate": 4.976533938737075e-05, + "loss": 6.0658, + "step": 7340 + }, + { + "epoch": 0.04365900656580074, + "grad_norm": 2.28857684135437, + "learning_rate": 4.976527553434456e-05, + "loss": 5.833, + "step": 7341 + }, + { + "epoch": 0.04366495384908174, + "grad_norm": 2.327331781387329, + "learning_rate": 4.976521167267307e-05, + "loss": 5.934, + "step": 7342 + }, + { + "epoch": 0.04367090113236274, + "grad_norm": 1.7726761102676392, + "learning_rate": 4.976514780235631e-05, + "loss": 6.034, + "step": 7343 + }, + { + "epoch": 0.04367684841564373, + "grad_norm": 2.180790662765503, + "learning_rate": 4.9765083923394285e-05, + "loss": 6.1377, + "step": 7344 + }, + { + "epoch": 0.043682795698924734, + "grad_norm": 2.031378984451294, + "learning_rate": 4.9765020035787024e-05, + "loss": 5.7203, + "step": 7345 + }, + { + "epoch": 0.04368874298220573, + "grad_norm": 2.453611135482788, + "learning_rate": 4.9764956139534545e-05, + "loss": 5.9798, + "step": 7346 + }, + { + "epoch": 0.043694690265486724, + "grad_norm": 2.3802528381347656, + "learning_rate": 4.976489223463688e-05, + "loss": 5.9343, + "step": 7347 + }, + { + "epoch": 0.043700637548767726, + "grad_norm": 2.771704912185669, + "learning_rate": 4.976482832109406e-05, + "loss": 6.5202, + "step": 7348 + }, + { + "epoch": 0.04370658483204872, + "grad_norm": 1.9455180168151855, + "learning_rate": 4.9764764398906084e-05, + "loss": 6.1159, + "step": 7349 + }, + { + "epoch": 0.043712532115329716, + "grad_norm": 1.9527102708816528, + "learning_rate": 4.9764700468072976e-05, + "loss": 5.7773, + "step": 7350 + }, + { + "epoch": 0.04371847939861071, + "grad_norm": 1.9531358480453491, + "learning_rate": 4.976463652859478e-05, + "loss": 5.9918, + "step": 7351 + }, + { + "epoch": 0.043724426681891713, + "grad_norm": 2.375239849090576, + "learning_rate": 4.97645725804715e-05, + "loss": 5.5054, + "step": 7352 + }, + { + "epoch": 0.04373037396517271, + "grad_norm": 2.156553030014038, + "learning_rate": 4.9764508623703166e-05, + "loss": 5.664, + "step": 7353 + }, + { + "epoch": 0.043736321248453704, + "grad_norm": 2.317331075668335, + "learning_rate": 4.9764444658289796e-05, + "loss": 5.4473, + "step": 7354 + }, + { + "epoch": 0.043742268531734706, + "grad_norm": 2.1958348751068115, + "learning_rate": 4.976438068423141e-05, + "loss": 5.3584, + "step": 7355 + }, + { + "epoch": 0.0437482158150157, + "grad_norm": 2.152045249938965, + "learning_rate": 4.976431670152803e-05, + "loss": 5.4388, + "step": 7356 + }, + { + "epoch": 0.043754163098296696, + "grad_norm": 2.0661544799804688, + "learning_rate": 4.976425271017971e-05, + "loss": 5.3866, + "step": 7357 + }, + { + "epoch": 0.0437601103815777, + "grad_norm": 2.106480598449707, + "learning_rate": 4.976418871018642e-05, + "loss": 5.5928, + "step": 7358 + }, + { + "epoch": 0.04376605766485869, + "grad_norm": 2.5921759605407715, + "learning_rate": 4.976412470154821e-05, + "loss": 6.0133, + "step": 7359 + }, + { + "epoch": 0.04377200494813969, + "grad_norm": 2.4117794036865234, + "learning_rate": 4.97640606842651e-05, + "loss": 6.0988, + "step": 7360 + }, + { + "epoch": 0.04377795223142069, + "grad_norm": 1.9839050769805908, + "learning_rate": 4.976399665833712e-05, + "loss": 5.9568, + "step": 7361 + }, + { + "epoch": 0.043783899514701685, + "grad_norm": 2.166215419769287, + "learning_rate": 4.9763932623764285e-05, + "loss": 5.9205, + "step": 7362 + }, + { + "epoch": 0.04378984679798268, + "grad_norm": 2.8216545581817627, + "learning_rate": 4.9763868580546616e-05, + "loss": 5.792, + "step": 7363 + }, + { + "epoch": 0.043795794081263675, + "grad_norm": 2.907707929611206, + "learning_rate": 4.976380452868413e-05, + "loss": 5.5824, + "step": 7364 + }, + { + "epoch": 0.04380174136454468, + "grad_norm": 2.173025369644165, + "learning_rate": 4.976374046817686e-05, + "loss": 6.2752, + "step": 7365 + }, + { + "epoch": 0.04380768864782567, + "grad_norm": 2.1098685264587402, + "learning_rate": 4.9763676399024814e-05, + "loss": 5.8052, + "step": 7366 + }, + { + "epoch": 0.04381363593110667, + "grad_norm": 2.1980762481689453, + "learning_rate": 4.9763612321228035e-05, + "loss": 5.3456, + "step": 7367 + }, + { + "epoch": 0.04381958321438767, + "grad_norm": 2.091327667236328, + "learning_rate": 4.976354823478654e-05, + "loss": 5.211, + "step": 7368 + }, + { + "epoch": 0.043825530497668665, + "grad_norm": 2.37920880317688, + "learning_rate": 4.976348413970033e-05, + "loss": 5.8652, + "step": 7369 + }, + { + "epoch": 0.04383147778094966, + "grad_norm": 2.454202175140381, + "learning_rate": 4.976342003596946e-05, + "loss": 5.9654, + "step": 7370 + }, + { + "epoch": 0.04383742506423066, + "grad_norm": 2.04577898979187, + "learning_rate": 4.9763355923593927e-05, + "loss": 6.3042, + "step": 7371 + }, + { + "epoch": 0.04384337234751166, + "grad_norm": 2.358250141143799, + "learning_rate": 4.976329180257376e-05, + "loss": 6.1403, + "step": 7372 + }, + { + "epoch": 0.04384931963079265, + "grad_norm": 2.177819013595581, + "learning_rate": 4.9763227672909e-05, + "loss": 5.8993, + "step": 7373 + }, + { + "epoch": 0.043855266914073654, + "grad_norm": 2.24910569190979, + "learning_rate": 4.976316353459963e-05, + "loss": 5.9763, + "step": 7374 + }, + { + "epoch": 0.04386121419735465, + "grad_norm": 2.3985965251922607, + "learning_rate": 4.976309938764571e-05, + "loss": 6.2288, + "step": 7375 + }, + { + "epoch": 0.043867161480635644, + "grad_norm": 2.1250808238983154, + "learning_rate": 4.9763035232047244e-05, + "loss": 6.1588, + "step": 7376 + }, + { + "epoch": 0.043873108763916646, + "grad_norm": 1.9815669059753418, + "learning_rate": 4.976297106780426e-05, + "loss": 6.3202, + "step": 7377 + }, + { + "epoch": 0.04387905604719764, + "grad_norm": 2.181999683380127, + "learning_rate": 4.976290689491677e-05, + "loss": 5.9125, + "step": 7378 + }, + { + "epoch": 0.043885003330478636, + "grad_norm": 2.365546703338623, + "learning_rate": 4.9762842713384815e-05, + "loss": 6.0991, + "step": 7379 + }, + { + "epoch": 0.04389095061375963, + "grad_norm": 2.0843441486358643, + "learning_rate": 4.9762778523208406e-05, + "loss": 5.9675, + "step": 7380 + }, + { + "epoch": 0.04389689789704063, + "grad_norm": 2.271576404571533, + "learning_rate": 4.9762714324387566e-05, + "loss": 5.5703, + "step": 7381 + }, + { + "epoch": 0.04390284518032163, + "grad_norm": 2.244211435317993, + "learning_rate": 4.9762650116922314e-05, + "loss": 5.4674, + "step": 7382 + }, + { + "epoch": 0.043908792463602624, + "grad_norm": 1.728034257888794, + "learning_rate": 4.9762585900812684e-05, + "loss": 5.6264, + "step": 7383 + }, + { + "epoch": 0.043914739746883626, + "grad_norm": 2.400587320327759, + "learning_rate": 4.976252167605869e-05, + "loss": 6.052, + "step": 7384 + }, + { + "epoch": 0.04392068703016462, + "grad_norm": 1.9865821599960327, + "learning_rate": 4.9762457442660346e-05, + "loss": 5.8544, + "step": 7385 + }, + { + "epoch": 0.043926634313445616, + "grad_norm": 2.236527681350708, + "learning_rate": 4.97623932006177e-05, + "loss": 5.5033, + "step": 7386 + }, + { + "epoch": 0.04393258159672662, + "grad_norm": 2.0424020290374756, + "learning_rate": 4.9762328949930746e-05, + "loss": 5.4088, + "step": 7387 + }, + { + "epoch": 0.04393852888000761, + "grad_norm": 2.0601999759674072, + "learning_rate": 4.976226469059952e-05, + "loss": 5.8599, + "step": 7388 + }, + { + "epoch": 0.04394447616328861, + "grad_norm": 2.5052783489227295, + "learning_rate": 4.976220042262404e-05, + "loss": 5.8202, + "step": 7389 + }, + { + "epoch": 0.04395042344656961, + "grad_norm": 2.178549289703369, + "learning_rate": 4.9762136146004344e-05, + "loss": 5.4554, + "step": 7390 + }, + { + "epoch": 0.043956370729850605, + "grad_norm": 1.9407802820205688, + "learning_rate": 4.976207186074043e-05, + "loss": 5.4062, + "step": 7391 + }, + { + "epoch": 0.0439623180131316, + "grad_norm": 1.4814093112945557, + "learning_rate": 4.9762007566832336e-05, + "loss": 5.4662, + "step": 7392 + }, + { + "epoch": 0.043968265296412595, + "grad_norm": 1.8808835744857788, + "learning_rate": 4.9761943264280086e-05, + "loss": 6.1617, + "step": 7393 + }, + { + "epoch": 0.0439742125796936, + "grad_norm": 1.9318643808364868, + "learning_rate": 4.97618789530837e-05, + "loss": 6.1357, + "step": 7394 + }, + { + "epoch": 0.04398015986297459, + "grad_norm": 2.2515900135040283, + "learning_rate": 4.976181463324319e-05, + "loss": 6.11, + "step": 7395 + }, + { + "epoch": 0.04398610714625559, + "grad_norm": 2.375298023223877, + "learning_rate": 4.9761750304758584e-05, + "loss": 6.1121, + "step": 7396 + }, + { + "epoch": 0.04399205442953659, + "grad_norm": 2.2254321575164795, + "learning_rate": 4.9761685967629914e-05, + "loss": 6.0136, + "step": 7397 + }, + { + "epoch": 0.043998001712817585, + "grad_norm": 2.146164894104004, + "learning_rate": 4.976162162185719e-05, + "loss": 5.8391, + "step": 7398 + }, + { + "epoch": 0.04400394899609858, + "grad_norm": 2.3237650394439697, + "learning_rate": 4.976155726744044e-05, + "loss": 5.461, + "step": 7399 + }, + { + "epoch": 0.04400989627937958, + "grad_norm": 2.2263002395629883, + "learning_rate": 4.976149290437969e-05, + "loss": 5.5885, + "step": 7400 + }, + { + "epoch": 0.04401584356266058, + "grad_norm": 1.9597729444503784, + "learning_rate": 4.9761428532674956e-05, + "loss": 5.348, + "step": 7401 + }, + { + "epoch": 0.04402179084594157, + "grad_norm": 2.2215018272399902, + "learning_rate": 4.976136415232626e-05, + "loss": 5.933, + "step": 7402 + }, + { + "epoch": 0.044027738129222574, + "grad_norm": 2.258618116378784, + "learning_rate": 4.9761299763333635e-05, + "loss": 6.0685, + "step": 7403 + }, + { + "epoch": 0.04403368541250357, + "grad_norm": 2.3045873641967773, + "learning_rate": 4.976123536569709e-05, + "loss": 5.7277, + "step": 7404 + }, + { + "epoch": 0.044039632695784564, + "grad_norm": 2.546252489089966, + "learning_rate": 4.976117095941666e-05, + "loss": 5.8839, + "step": 7405 + }, + { + "epoch": 0.044045579979065566, + "grad_norm": 1.8963768482208252, + "learning_rate": 4.976110654449235e-05, + "loss": 6.1247, + "step": 7406 + }, + { + "epoch": 0.04405152726234656, + "grad_norm": 2.6287784576416016, + "learning_rate": 4.976104212092421e-05, + "loss": 5.9712, + "step": 7407 + }, + { + "epoch": 0.044057474545627556, + "grad_norm": 2.562612295150757, + "learning_rate": 4.976097768871223e-05, + "loss": 6.1226, + "step": 7408 + }, + { + "epoch": 0.04406342182890855, + "grad_norm": 2.2308688163757324, + "learning_rate": 4.976091324785645e-05, + "loss": 6.3235, + "step": 7409 + }, + { + "epoch": 0.04406936911218955, + "grad_norm": 2.4595553874969482, + "learning_rate": 4.976084879835691e-05, + "loss": 5.8164, + "step": 7410 + }, + { + "epoch": 0.04407531639547055, + "grad_norm": 2.3693978786468506, + "learning_rate": 4.97607843402136e-05, + "loss": 5.7727, + "step": 7411 + }, + { + "epoch": 0.044081263678751544, + "grad_norm": 4.144592761993408, + "learning_rate": 4.9760719873426546e-05, + "loss": 5.6382, + "step": 7412 + }, + { + "epoch": 0.044087210962032546, + "grad_norm": 2.5423779487609863, + "learning_rate": 4.9760655397995794e-05, + "loss": 5.7526, + "step": 7413 + }, + { + "epoch": 0.04409315824531354, + "grad_norm": 2.119281053543091, + "learning_rate": 4.976059091392135e-05, + "loss": 5.7246, + "step": 7414 + }, + { + "epoch": 0.044099105528594536, + "grad_norm": 2.177074432373047, + "learning_rate": 4.976052642120324e-05, + "loss": 5.7296, + "step": 7415 + }, + { + "epoch": 0.04410505281187554, + "grad_norm": 1.8897806406021118, + "learning_rate": 4.9760461919841486e-05, + "loss": 5.6349, + "step": 7416 + }, + { + "epoch": 0.04411100009515653, + "grad_norm": 2.445082187652588, + "learning_rate": 4.97603974098361e-05, + "loss": 5.7414, + "step": 7417 + }, + { + "epoch": 0.04411694737843753, + "grad_norm": 2.2564280033111572, + "learning_rate": 4.976033289118713e-05, + "loss": 5.6709, + "step": 7418 + }, + { + "epoch": 0.04412289466171853, + "grad_norm": 2.1907529830932617, + "learning_rate": 4.976026836389458e-05, + "loss": 5.6067, + "step": 7419 + }, + { + "epoch": 0.044128841944999525, + "grad_norm": 2.1872594356536865, + "learning_rate": 4.976020382795848e-05, + "loss": 5.5166, + "step": 7420 + }, + { + "epoch": 0.04413478922828052, + "grad_norm": 1.7740691900253296, + "learning_rate": 4.9760139283378835e-05, + "loss": 5.5833, + "step": 7421 + }, + { + "epoch": 0.044140736511561515, + "grad_norm": 2.128389358520508, + "learning_rate": 4.976007473015569e-05, + "loss": 5.6403, + "step": 7422 + }, + { + "epoch": 0.04414668379484252, + "grad_norm": 2.6193220615386963, + "learning_rate": 4.9760010168289053e-05, + "loss": 5.8139, + "step": 7423 + }, + { + "epoch": 0.04415263107812351, + "grad_norm": 2.727902412414551, + "learning_rate": 4.9759945597778955e-05, + "loss": 5.3286, + "step": 7424 + }, + { + "epoch": 0.04415857836140451, + "grad_norm": 2.4500436782836914, + "learning_rate": 4.975988101862542e-05, + "loss": 5.2647, + "step": 7425 + }, + { + "epoch": 0.04416452564468551, + "grad_norm": 2.1040356159210205, + "learning_rate": 4.975981643082846e-05, + "loss": 6.0935, + "step": 7426 + }, + { + "epoch": 0.044170472927966505, + "grad_norm": 1.9168792963027954, + "learning_rate": 4.975975183438811e-05, + "loss": 5.5147, + "step": 7427 + }, + { + "epoch": 0.0441764202112475, + "grad_norm": 2.0156469345092773, + "learning_rate": 4.9759687229304384e-05, + "loss": 6.2896, + "step": 7428 + }, + { + "epoch": 0.0441823674945285, + "grad_norm": 2.362933874130249, + "learning_rate": 4.975962261557731e-05, + "loss": 5.9514, + "step": 7429 + }, + { + "epoch": 0.0441883147778095, + "grad_norm": 2.2892727851867676, + "learning_rate": 4.9759557993206906e-05, + "loss": 5.5646, + "step": 7430 + }, + { + "epoch": 0.04419426206109049, + "grad_norm": 2.287722587585449, + "learning_rate": 4.97594933621932e-05, + "loss": 5.364, + "step": 7431 + }, + { + "epoch": 0.044200209344371494, + "grad_norm": 2.0421855449676514, + "learning_rate": 4.9759428722536194e-05, + "loss": 5.6838, + "step": 7432 + }, + { + "epoch": 0.04420615662765249, + "grad_norm": 2.2392499446868896, + "learning_rate": 4.9759364074235944e-05, + "loss": 6.0727, + "step": 7433 + }, + { + "epoch": 0.044212103910933484, + "grad_norm": 2.084768295288086, + "learning_rate": 4.975929941729245e-05, + "loss": 6.1208, + "step": 7434 + }, + { + "epoch": 0.044218051194214486, + "grad_norm": 1.817015528678894, + "learning_rate": 4.975923475170574e-05, + "loss": 6.3405, + "step": 7435 + }, + { + "epoch": 0.04422399847749548, + "grad_norm": 1.974926233291626, + "learning_rate": 4.9759170077475834e-05, + "loss": 5.9607, + "step": 7436 + }, + { + "epoch": 0.044229945760776476, + "grad_norm": 2.1244025230407715, + "learning_rate": 4.975910539460277e-05, + "loss": 6.2579, + "step": 7437 + }, + { + "epoch": 0.04423589304405747, + "grad_norm": 1.9459706544876099, + "learning_rate": 4.975904070308655e-05, + "loss": 5.5877, + "step": 7438 + }, + { + "epoch": 0.04424184032733847, + "grad_norm": 2.1891977787017822, + "learning_rate": 4.97589760029272e-05, + "loss": 5.9913, + "step": 7439 + }, + { + "epoch": 0.04424778761061947, + "grad_norm": 2.0368902683258057, + "learning_rate": 4.9758911294124756e-05, + "loss": 5.9478, + "step": 7440 + }, + { + "epoch": 0.044253734893900463, + "grad_norm": 2.2937796115875244, + "learning_rate": 4.975884657667922e-05, + "loss": 6.1529, + "step": 7441 + }, + { + "epoch": 0.044259682177181466, + "grad_norm": 2.601637125015259, + "learning_rate": 4.975878185059064e-05, + "loss": 5.4446, + "step": 7442 + }, + { + "epoch": 0.04426562946046246, + "grad_norm": 2.2025954723358154, + "learning_rate": 4.975871711585902e-05, + "loss": 5.8911, + "step": 7443 + }, + { + "epoch": 0.044271576743743456, + "grad_norm": 2.0498836040496826, + "learning_rate": 4.975865237248438e-05, + "loss": 6.0604, + "step": 7444 + }, + { + "epoch": 0.04427752402702446, + "grad_norm": 2.308239459991455, + "learning_rate": 4.975858762046676e-05, + "loss": 5.9599, + "step": 7445 + }, + { + "epoch": 0.04428347131030545, + "grad_norm": 2.286747455596924, + "learning_rate": 4.9758522859806165e-05, + "loss": 6.3528, + "step": 7446 + }, + { + "epoch": 0.04428941859358645, + "grad_norm": 2.2376902103424072, + "learning_rate": 4.975845809050264e-05, + "loss": 6.205, + "step": 7447 + }, + { + "epoch": 0.04429536587686745, + "grad_norm": 1.8052057027816772, + "learning_rate": 4.9758393312556176e-05, + "loss": 6.2188, + "step": 7448 + }, + { + "epoch": 0.044301313160148445, + "grad_norm": 1.9839476346969604, + "learning_rate": 4.975832852596682e-05, + "loss": 6.1479, + "step": 7449 + }, + { + "epoch": 0.04430726044342944, + "grad_norm": 1.8890517950057983, + "learning_rate": 4.975826373073459e-05, + "loss": 6.2524, + "step": 7450 + }, + { + "epoch": 0.04431320772671044, + "grad_norm": 2.049192428588867, + "learning_rate": 4.97581989268595e-05, + "loss": 5.5486, + "step": 7451 + }, + { + "epoch": 0.04431915500999144, + "grad_norm": 2.8271291255950928, + "learning_rate": 4.975813411434158e-05, + "loss": 5.1916, + "step": 7452 + }, + { + "epoch": 0.04432510229327243, + "grad_norm": 1.94833505153656, + "learning_rate": 4.975806929318085e-05, + "loss": 5.6747, + "step": 7453 + }, + { + "epoch": 0.04433104957655343, + "grad_norm": 2.14536190032959, + "learning_rate": 4.975800446337734e-05, + "loss": 5.4066, + "step": 7454 + }, + { + "epoch": 0.04433699685983443, + "grad_norm": 2.5557188987731934, + "learning_rate": 4.975793962493106e-05, + "loss": 5.2257, + "step": 7455 + }, + { + "epoch": 0.044342944143115424, + "grad_norm": 2.4718832969665527, + "learning_rate": 4.975787477784205e-05, + "loss": 6.0248, + "step": 7456 + }, + { + "epoch": 0.04434889142639642, + "grad_norm": 2.8627419471740723, + "learning_rate": 4.975780992211031e-05, + "loss": 5.3245, + "step": 7457 + }, + { + "epoch": 0.04435483870967742, + "grad_norm": 2.932990789413452, + "learning_rate": 4.9757745057735876e-05, + "loss": 4.8914, + "step": 7458 + }, + { + "epoch": 0.04436078599295842, + "grad_norm": 2.6231770515441895, + "learning_rate": 4.975768018471877e-05, + "loss": 5.3323, + "step": 7459 + }, + { + "epoch": 0.04436673327623941, + "grad_norm": 2.5591986179351807, + "learning_rate": 4.975761530305901e-05, + "loss": 5.4972, + "step": 7460 + }, + { + "epoch": 0.044372680559520414, + "grad_norm": 2.4060492515563965, + "learning_rate": 4.975755041275664e-05, + "loss": 5.5988, + "step": 7461 + }, + { + "epoch": 0.04437862784280141, + "grad_norm": 2.377260446548462, + "learning_rate": 4.975748551381164e-05, + "loss": 5.2137, + "step": 7462 + }, + { + "epoch": 0.044384575126082404, + "grad_norm": 2.171934127807617, + "learning_rate": 4.9757420606224076e-05, + "loss": 5.6313, + "step": 7463 + }, + { + "epoch": 0.044390522409363406, + "grad_norm": 2.1225788593292236, + "learning_rate": 4.975735568999394e-05, + "loss": 5.839, + "step": 7464 + }, + { + "epoch": 0.0443964696926444, + "grad_norm": 2.271127939224243, + "learning_rate": 4.975729076512128e-05, + "loss": 5.7111, + "step": 7465 + }, + { + "epoch": 0.044402416975925396, + "grad_norm": 2.7138264179229736, + "learning_rate": 4.975722583160609e-05, + "loss": 5.3169, + "step": 7466 + }, + { + "epoch": 0.04440836425920639, + "grad_norm": 2.8181982040405273, + "learning_rate": 4.9757160889448416e-05, + "loss": 5.3323, + "step": 7467 + }, + { + "epoch": 0.04441431154248739, + "grad_norm": 2.680816411972046, + "learning_rate": 4.975709593864828e-05, + "loss": 5.6924, + "step": 7468 + }, + { + "epoch": 0.04442025882576839, + "grad_norm": 2.3682074546813965, + "learning_rate": 4.975703097920569e-05, + "loss": 6.0049, + "step": 7469 + }, + { + "epoch": 0.04442620610904938, + "grad_norm": 2.3080508708953857, + "learning_rate": 4.9756966011120674e-05, + "loss": 6.4438, + "step": 7470 + }, + { + "epoch": 0.044432153392330385, + "grad_norm": 2.2631113529205322, + "learning_rate": 4.9756901034393265e-05, + "loss": 5.9296, + "step": 7471 + }, + { + "epoch": 0.04443810067561138, + "grad_norm": 2.283712148666382, + "learning_rate": 4.975683604902347e-05, + "loss": 5.831, + "step": 7472 + }, + { + "epoch": 0.044444047958892376, + "grad_norm": 2.2130608558654785, + "learning_rate": 4.975677105501132e-05, + "loss": 5.8757, + "step": 7473 + }, + { + "epoch": 0.04444999524217338, + "grad_norm": 1.9392763376235962, + "learning_rate": 4.975670605235684e-05, + "loss": 5.5836, + "step": 7474 + }, + { + "epoch": 0.04445594252545437, + "grad_norm": 2.097076416015625, + "learning_rate": 4.975664104106005e-05, + "loss": 6.0782, + "step": 7475 + }, + { + "epoch": 0.04446188980873537, + "grad_norm": 2.063021183013916, + "learning_rate": 4.975657602112097e-05, + "loss": 6.2171, + "step": 7476 + }, + { + "epoch": 0.04446783709201637, + "grad_norm": 2.4466049671173096, + "learning_rate": 4.9756510992539626e-05, + "loss": 5.8649, + "step": 7477 + }, + { + "epoch": 0.044473784375297365, + "grad_norm": 2.2160751819610596, + "learning_rate": 4.975644595531605e-05, + "loss": 5.9297, + "step": 7478 + }, + { + "epoch": 0.04447973165857836, + "grad_norm": 2.69352650642395, + "learning_rate": 4.975638090945024e-05, + "loss": 6.1062, + "step": 7479 + }, + { + "epoch": 0.04448567894185936, + "grad_norm": 2.2830610275268555, + "learning_rate": 4.975631585494224e-05, + "loss": 6.1663, + "step": 7480 + }, + { + "epoch": 0.04449162622514036, + "grad_norm": 2.936842203140259, + "learning_rate": 4.975625079179206e-05, + "loss": 5.9952, + "step": 7481 + }, + { + "epoch": 0.04449757350842135, + "grad_norm": 2.1398322582244873, + "learning_rate": 4.9756185719999725e-05, + "loss": 6.0005, + "step": 7482 + }, + { + "epoch": 0.04450352079170235, + "grad_norm": 2.2835536003112793, + "learning_rate": 4.9756120639565275e-05, + "loss": 5.7155, + "step": 7483 + }, + { + "epoch": 0.04450946807498335, + "grad_norm": 2.22917103767395, + "learning_rate": 4.975605555048871e-05, + "loss": 5.7134, + "step": 7484 + }, + { + "epoch": 0.044515415358264344, + "grad_norm": 2.0195605754852295, + "learning_rate": 4.975599045277006e-05, + "loss": 5.6369, + "step": 7485 + }, + { + "epoch": 0.04452136264154534, + "grad_norm": 1.8495477437973022, + "learning_rate": 4.975592534640936e-05, + "loss": 5.9035, + "step": 7486 + }, + { + "epoch": 0.04452730992482634, + "grad_norm": 2.4814226627349854, + "learning_rate": 4.9755860231406616e-05, + "loss": 6.1024, + "step": 7487 + }, + { + "epoch": 0.04453325720810734, + "grad_norm": 2.221820831298828, + "learning_rate": 4.975579510776186e-05, + "loss": 6.1193, + "step": 7488 + }, + { + "epoch": 0.04453920449138833, + "grad_norm": 1.935722827911377, + "learning_rate": 4.975572997547511e-05, + "loss": 6.1088, + "step": 7489 + }, + { + "epoch": 0.044545151774669334, + "grad_norm": 2.1287481784820557, + "learning_rate": 4.975566483454638e-05, + "loss": 6.1064, + "step": 7490 + }, + { + "epoch": 0.04455109905795033, + "grad_norm": 2.1914093494415283, + "learning_rate": 4.9755599684975716e-05, + "loss": 6.072, + "step": 7491 + }, + { + "epoch": 0.044557046341231324, + "grad_norm": 2.1979966163635254, + "learning_rate": 4.975553452676312e-05, + "loss": 6.1447, + "step": 7492 + }, + { + "epoch": 0.044562993624512326, + "grad_norm": 2.108259916305542, + "learning_rate": 4.975546935990863e-05, + "loss": 6.0109, + "step": 7493 + }, + { + "epoch": 0.04456894090779332, + "grad_norm": 2.2454450130462646, + "learning_rate": 4.975540418441226e-05, + "loss": 5.8627, + "step": 7494 + }, + { + "epoch": 0.044574888191074316, + "grad_norm": 2.151130437850952, + "learning_rate": 4.9755339000274027e-05, + "loss": 6.0241, + "step": 7495 + }, + { + "epoch": 0.04458083547435531, + "grad_norm": 1.9150489568710327, + "learning_rate": 4.975527380749397e-05, + "loss": 6.0179, + "step": 7496 + }, + { + "epoch": 0.04458678275763631, + "grad_norm": 1.9065133333206177, + "learning_rate": 4.97552086060721e-05, + "loss": 5.9991, + "step": 7497 + }, + { + "epoch": 0.04459273004091731, + "grad_norm": 1.9627622365951538, + "learning_rate": 4.975514339600844e-05, + "loss": 5.9633, + "step": 7498 + }, + { + "epoch": 0.0445986773241983, + "grad_norm": 1.7777502536773682, + "learning_rate": 4.975507817730302e-05, + "loss": 5.9426, + "step": 7499 + }, + { + "epoch": 0.044604624607479305, + "grad_norm": 1.6735023260116577, + "learning_rate": 4.9755012949955846e-05, + "loss": 5.9432, + "step": 7500 + }, + { + "epoch": 0.0446105718907603, + "grad_norm": 2.1570491790771484, + "learning_rate": 4.975494771396697e-05, + "loss": 6.2032, + "step": 7501 + }, + { + "epoch": 0.044616519174041296, + "grad_norm": 2.286522150039673, + "learning_rate": 4.9754882469336387e-05, + "loss": 5.7226, + "step": 7502 + }, + { + "epoch": 0.0446224664573223, + "grad_norm": 2.1940622329711914, + "learning_rate": 4.975481721606413e-05, + "loss": 6.2215, + "step": 7503 + }, + { + "epoch": 0.04462841374060329, + "grad_norm": 2.329263210296631, + "learning_rate": 4.9754751954150224e-05, + "loss": 5.5403, + "step": 7504 + }, + { + "epoch": 0.04463436102388429, + "grad_norm": 2.112712860107422, + "learning_rate": 4.975468668359469e-05, + "loss": 5.7581, + "step": 7505 + }, + { + "epoch": 0.04464030830716529, + "grad_norm": 2.2875239849090576, + "learning_rate": 4.975462140439755e-05, + "loss": 5.9593, + "step": 7506 + }, + { + "epoch": 0.044646255590446285, + "grad_norm": 2.282121419906616, + "learning_rate": 4.975455611655883e-05, + "loss": 5.8684, + "step": 7507 + }, + { + "epoch": 0.04465220287372728, + "grad_norm": 1.8482197523117065, + "learning_rate": 4.975449082007855e-05, + "loss": 5.753, + "step": 7508 + }, + { + "epoch": 0.04465815015700828, + "grad_norm": 2.6635684967041016, + "learning_rate": 4.9754425514956724e-05, + "loss": 5.0732, + "step": 7509 + }, + { + "epoch": 0.04466409744028928, + "grad_norm": 2.6632800102233887, + "learning_rate": 4.9754360201193395e-05, + "loss": 5.1644, + "step": 7510 + }, + { + "epoch": 0.04467004472357027, + "grad_norm": 2.630445718765259, + "learning_rate": 4.9754294878788574e-05, + "loss": 5.0322, + "step": 7511 + }, + { + "epoch": 0.04467599200685127, + "grad_norm": 2.4036223888397217, + "learning_rate": 4.975422954774228e-05, + "loss": 4.8949, + "step": 7512 + }, + { + "epoch": 0.04468193929013227, + "grad_norm": 2.381810426712036, + "learning_rate": 4.9754164208054535e-05, + "loss": 5.7921, + "step": 7513 + }, + { + "epoch": 0.044687886573413264, + "grad_norm": 2.570949077606201, + "learning_rate": 4.9754098859725377e-05, + "loss": 5.9612, + "step": 7514 + }, + { + "epoch": 0.04469383385669426, + "grad_norm": 2.510998010635376, + "learning_rate": 4.9754033502754815e-05, + "loss": 5.7273, + "step": 7515 + }, + { + "epoch": 0.04469978113997526, + "grad_norm": 2.6216115951538086, + "learning_rate": 4.975396813714288e-05, + "loss": 5.7601, + "step": 7516 + }, + { + "epoch": 0.04470572842325626, + "grad_norm": 2.5298542976379395, + "learning_rate": 4.975390276288958e-05, + "loss": 5.8007, + "step": 7517 + }, + { + "epoch": 0.04471167570653725, + "grad_norm": 2.6195290088653564, + "learning_rate": 4.975383737999496e-05, + "loss": 5.6071, + "step": 7518 + }, + { + "epoch": 0.044717622989818254, + "grad_norm": 2.5432629585266113, + "learning_rate": 4.975377198845902e-05, + "loss": 6.0224, + "step": 7519 + }, + { + "epoch": 0.04472357027309925, + "grad_norm": 2.2290337085723877, + "learning_rate": 4.97537065882818e-05, + "loss": 5.7141, + "step": 7520 + }, + { + "epoch": 0.044729517556380244, + "grad_norm": 2.627206802368164, + "learning_rate": 4.975364117946332e-05, + "loss": 6.2518, + "step": 7521 + }, + { + "epoch": 0.044735464839661246, + "grad_norm": 2.386993169784546, + "learning_rate": 4.975357576200359e-05, + "loss": 6.0494, + "step": 7522 + }, + { + "epoch": 0.04474141212294224, + "grad_norm": 2.20511794090271, + "learning_rate": 4.9753510335902656e-05, + "loss": 6.2563, + "step": 7523 + }, + { + "epoch": 0.044747359406223236, + "grad_norm": 2.5564749240875244, + "learning_rate": 4.975344490116052e-05, + "loss": 6.2498, + "step": 7524 + }, + { + "epoch": 0.04475330668950423, + "grad_norm": 2.6001932621002197, + "learning_rate": 4.975337945777721e-05, + "loss": 5.6721, + "step": 7525 + }, + { + "epoch": 0.04475925397278523, + "grad_norm": 2.6677772998809814, + "learning_rate": 4.975331400575275e-05, + "loss": 5.88, + "step": 7526 + }, + { + "epoch": 0.04476520125606623, + "grad_norm": 3.616734027862549, + "learning_rate": 4.975324854508716e-05, + "loss": 5.4835, + "step": 7527 + }, + { + "epoch": 0.04477114853934722, + "grad_norm": 3.0301461219787598, + "learning_rate": 4.975318307578048e-05, + "loss": 5.326, + "step": 7528 + }, + { + "epoch": 0.044777095822628225, + "grad_norm": 2.029836893081665, + "learning_rate": 4.975311759783271e-05, + "loss": 5.3516, + "step": 7529 + }, + { + "epoch": 0.04478304310590922, + "grad_norm": 1.9886969327926636, + "learning_rate": 4.9753052111243885e-05, + "loss": 5.3442, + "step": 7530 + }, + { + "epoch": 0.044788990389190216, + "grad_norm": 2.4227612018585205, + "learning_rate": 4.975298661601403e-05, + "loss": 5.4273, + "step": 7531 + }, + { + "epoch": 0.04479493767247122, + "grad_norm": 2.8426849842071533, + "learning_rate": 4.975292111214316e-05, + "loss": 5.6604, + "step": 7532 + }, + { + "epoch": 0.04480088495575221, + "grad_norm": 2.4818854331970215, + "learning_rate": 4.97528555996313e-05, + "loss": 6.4941, + "step": 7533 + }, + { + "epoch": 0.04480683223903321, + "grad_norm": 2.291642904281616, + "learning_rate": 4.9752790078478465e-05, + "loss": 6.404, + "step": 7534 + }, + { + "epoch": 0.04481277952231421, + "grad_norm": 2.4973669052124023, + "learning_rate": 4.9752724548684695e-05, + "loss": 5.6068, + "step": 7535 + }, + { + "epoch": 0.044818726805595205, + "grad_norm": 2.273130416870117, + "learning_rate": 4.975265901025001e-05, + "loss": 6.1689, + "step": 7536 + }, + { + "epoch": 0.0448246740888762, + "grad_norm": 3.362520456314087, + "learning_rate": 4.9752593463174424e-05, + "loss": 5.5346, + "step": 7537 + }, + { + "epoch": 0.0448306213721572, + "grad_norm": 5.170871257781982, + "learning_rate": 4.9752527907457956e-05, + "loss": 5.3831, + "step": 7538 + }, + { + "epoch": 0.0448365686554382, + "grad_norm": 4.224242687225342, + "learning_rate": 4.975246234310064e-05, + "loss": 5.2511, + "step": 7539 + }, + { + "epoch": 0.04484251593871919, + "grad_norm": 3.1753036975860596, + "learning_rate": 4.97523967701025e-05, + "loss": 5.06, + "step": 7540 + }, + { + "epoch": 0.04484846322200019, + "grad_norm": 2.4226467609405518, + "learning_rate": 4.975233118846355e-05, + "loss": 5.5225, + "step": 7541 + }, + { + "epoch": 0.04485441050528119, + "grad_norm": 2.5356781482696533, + "learning_rate": 4.9752265598183814e-05, + "loss": 5.5865, + "step": 7542 + }, + { + "epoch": 0.044860357788562184, + "grad_norm": 2.1505908966064453, + "learning_rate": 4.9752199999263326e-05, + "loss": 5.7436, + "step": 7543 + }, + { + "epoch": 0.04486630507184318, + "grad_norm": 2.675703763961792, + "learning_rate": 4.97521343917021e-05, + "loss": 5.3693, + "step": 7544 + }, + { + "epoch": 0.04487225235512418, + "grad_norm": 3.5228023529052734, + "learning_rate": 4.975206877550015e-05, + "loss": 4.8527, + "step": 7545 + }, + { + "epoch": 0.044878199638405177, + "grad_norm": 3.1165566444396973, + "learning_rate": 4.975200315065752e-05, + "loss": 4.7971, + "step": 7546 + }, + { + "epoch": 0.04488414692168617, + "grad_norm": 2.6216177940368652, + "learning_rate": 4.975193751717421e-05, + "loss": 4.9328, + "step": 7547 + }, + { + "epoch": 0.044890094204967174, + "grad_norm": 2.352031707763672, + "learning_rate": 4.975187187505026e-05, + "loss": 5.0021, + "step": 7548 + }, + { + "epoch": 0.04489604148824817, + "grad_norm": 1.8147127628326416, + "learning_rate": 4.975180622428569e-05, + "loss": 5.7009, + "step": 7549 + }, + { + "epoch": 0.044901988771529164, + "grad_norm": 2.1674726009368896, + "learning_rate": 4.9751740564880516e-05, + "loss": 5.2545, + "step": 7550 + }, + { + "epoch": 0.044907936054810166, + "grad_norm": 2.2935330867767334, + "learning_rate": 4.975167489683477e-05, + "loss": 5.2351, + "step": 7551 + }, + { + "epoch": 0.04491388333809116, + "grad_norm": 2.2964932918548584, + "learning_rate": 4.975160922014846e-05, + "loss": 5.483, + "step": 7552 + }, + { + "epoch": 0.044919830621372156, + "grad_norm": 1.8180936574935913, + "learning_rate": 4.9751543534821635e-05, + "loss": 5.668, + "step": 7553 + }, + { + "epoch": 0.04492577790465315, + "grad_norm": 1.906435251235962, + "learning_rate": 4.9751477840854286e-05, + "loss": 5.6664, + "step": 7554 + }, + { + "epoch": 0.04493172518793415, + "grad_norm": 2.459702253341675, + "learning_rate": 4.9751412138246455e-05, + "loss": 5.5272, + "step": 7555 + }, + { + "epoch": 0.04493767247121515, + "grad_norm": 2.1219170093536377, + "learning_rate": 4.975134642699817e-05, + "loss": 5.638, + "step": 7556 + }, + { + "epoch": 0.04494361975449614, + "grad_norm": 2.1492953300476074, + "learning_rate": 4.975128070710944e-05, + "loss": 5.9422, + "step": 7557 + }, + { + "epoch": 0.044949567037777145, + "grad_norm": 1.813988208770752, + "learning_rate": 4.97512149785803e-05, + "loss": 5.9875, + "step": 7558 + }, + { + "epoch": 0.04495551432105814, + "grad_norm": 1.6336817741394043, + "learning_rate": 4.975114924141075e-05, + "loss": 5.9245, + "step": 7559 + }, + { + "epoch": 0.044961461604339135, + "grad_norm": 1.9339455366134644, + "learning_rate": 4.9751083495600847e-05, + "loss": 5.3263, + "step": 7560 + }, + { + "epoch": 0.04496740888762014, + "grad_norm": 2.3459293842315674, + "learning_rate": 4.975101774115059e-05, + "loss": 5.4625, + "step": 7561 + }, + { + "epoch": 0.04497335617090113, + "grad_norm": 2.2994346618652344, + "learning_rate": 4.9750951978060004e-05, + "loss": 5.6327, + "step": 7562 + }, + { + "epoch": 0.04497930345418213, + "grad_norm": 2.1627299785614014, + "learning_rate": 4.975088620632912e-05, + "loss": 5.4882, + "step": 7563 + }, + { + "epoch": 0.04498525073746313, + "grad_norm": 2.763397693634033, + "learning_rate": 4.9750820425957954e-05, + "loss": 5.727, + "step": 7564 + }, + { + "epoch": 0.044991198020744125, + "grad_norm": 2.0107216835021973, + "learning_rate": 4.975075463694654e-05, + "loss": 5.3852, + "step": 7565 + }, + { + "epoch": 0.04499714530402512, + "grad_norm": 1.8424763679504395, + "learning_rate": 4.975068883929489e-05, + "loss": 5.3072, + "step": 7566 + }, + { + "epoch": 0.04500309258730612, + "grad_norm": 1.946702003479004, + "learning_rate": 4.975062303300303e-05, + "loss": 5.3184, + "step": 7567 + }, + { + "epoch": 0.04500903987058712, + "grad_norm": 2.1091182231903076, + "learning_rate": 4.9750557218070984e-05, + "loss": 5.0689, + "step": 7568 + }, + { + "epoch": 0.04501498715386811, + "grad_norm": 2.0064187049865723, + "learning_rate": 4.975049139449877e-05, + "loss": 4.8495, + "step": 7569 + }, + { + "epoch": 0.04502093443714911, + "grad_norm": 1.7544279098510742, + "learning_rate": 4.9750425562286416e-05, + "loss": 4.9524, + "step": 7570 + }, + { + "epoch": 0.04502688172043011, + "grad_norm": 2.0814568996429443, + "learning_rate": 4.9750359721433945e-05, + "loss": 4.798, + "step": 7571 + }, + { + "epoch": 0.045032829003711104, + "grad_norm": 2.1185543537139893, + "learning_rate": 4.975029387194139e-05, + "loss": 4.9313, + "step": 7572 + }, + { + "epoch": 0.0450387762869921, + "grad_norm": 2.3774518966674805, + "learning_rate": 4.975022801380875e-05, + "loss": 5.5954, + "step": 7573 + }, + { + "epoch": 0.0450447235702731, + "grad_norm": 2.261306047439575, + "learning_rate": 4.975016214703606e-05, + "loss": 5.5598, + "step": 7574 + }, + { + "epoch": 0.045050670853554096, + "grad_norm": 2.128244161605835, + "learning_rate": 4.975009627162335e-05, + "loss": 5.359, + "step": 7575 + }, + { + "epoch": 0.04505661813683509, + "grad_norm": 2.0767438411712646, + "learning_rate": 4.975003038757064e-05, + "loss": 5.6855, + "step": 7576 + }, + { + "epoch": 0.045062565420116094, + "grad_norm": 1.9789010286331177, + "learning_rate": 4.974996449487794e-05, + "loss": 5.1807, + "step": 7577 + }, + { + "epoch": 0.04506851270339709, + "grad_norm": 1.9136112928390503, + "learning_rate": 4.97498985935453e-05, + "loss": 5.3811, + "step": 7578 + }, + { + "epoch": 0.045074459986678084, + "grad_norm": 2.150641441345215, + "learning_rate": 4.974983268357271e-05, + "loss": 5.3281, + "step": 7579 + }, + { + "epoch": 0.045080407269959086, + "grad_norm": 1.9636656045913696, + "learning_rate": 4.9749766764960215e-05, + "loss": 5.5003, + "step": 7580 + }, + { + "epoch": 0.04508635455324008, + "grad_norm": 1.826335072517395, + "learning_rate": 4.974970083770783e-05, + "loss": 5.4687, + "step": 7581 + }, + { + "epoch": 0.045092301836521076, + "grad_norm": 1.9246041774749756, + "learning_rate": 4.974963490181558e-05, + "loss": 5.5373, + "step": 7582 + }, + { + "epoch": 0.04509824911980207, + "grad_norm": 1.8421686887741089, + "learning_rate": 4.974956895728349e-05, + "loss": 5.386, + "step": 7583 + }, + { + "epoch": 0.04510419640308307, + "grad_norm": 1.8685556650161743, + "learning_rate": 4.974950300411158e-05, + "loss": 5.5857, + "step": 7584 + }, + { + "epoch": 0.04511014368636407, + "grad_norm": 1.7022168636322021, + "learning_rate": 4.974943704229987e-05, + "loss": 5.2562, + "step": 7585 + }, + { + "epoch": 0.04511609096964506, + "grad_norm": 1.876855731010437, + "learning_rate": 4.97493710718484e-05, + "loss": 5.1359, + "step": 7586 + }, + { + "epoch": 0.045122038252926065, + "grad_norm": 1.8728361129760742, + "learning_rate": 4.974930509275717e-05, + "loss": 5.3124, + "step": 7587 + }, + { + "epoch": 0.04512798553620706, + "grad_norm": 1.930086612701416, + "learning_rate": 4.974923910502622e-05, + "loss": 5.3261, + "step": 7588 + }, + { + "epoch": 0.045133932819488055, + "grad_norm": 2.0309081077575684, + "learning_rate": 4.9749173108655564e-05, + "loss": 5.1138, + "step": 7589 + }, + { + "epoch": 0.04513988010276906, + "grad_norm": 2.042174816131592, + "learning_rate": 4.974910710364522e-05, + "loss": 5.3521, + "step": 7590 + }, + { + "epoch": 0.04514582738605005, + "grad_norm": 1.5278770923614502, + "learning_rate": 4.9749041089995224e-05, + "loss": 5.4075, + "step": 7591 + }, + { + "epoch": 0.04515177466933105, + "grad_norm": 1.7624976634979248, + "learning_rate": 4.974897506770559e-05, + "loss": 5.1698, + "step": 7592 + }, + { + "epoch": 0.04515772195261205, + "grad_norm": 1.9077380895614624, + "learning_rate": 4.974890903677635e-05, + "loss": 5.3973, + "step": 7593 + }, + { + "epoch": 0.045163669235893045, + "grad_norm": 1.5724380016326904, + "learning_rate": 4.974884299720752e-05, + "loss": 5.6325, + "step": 7594 + }, + { + "epoch": 0.04516961651917404, + "grad_norm": 1.9702832698822021, + "learning_rate": 4.974877694899913e-05, + "loss": 5.247, + "step": 7595 + }, + { + "epoch": 0.04517556380245504, + "grad_norm": 1.9913853406906128, + "learning_rate": 4.974871089215118e-05, + "loss": 5.6393, + "step": 7596 + }, + { + "epoch": 0.04518151108573604, + "grad_norm": 1.806470274925232, + "learning_rate": 4.974864482666372e-05, + "loss": 5.302, + "step": 7597 + }, + { + "epoch": 0.04518745836901703, + "grad_norm": 1.7056912183761597, + "learning_rate": 4.974857875253678e-05, + "loss": 5.4066, + "step": 7598 + }, + { + "epoch": 0.04519340565229803, + "grad_norm": 1.5990647077560425, + "learning_rate": 4.974851266977035e-05, + "loss": 5.4087, + "step": 7599 + }, + { + "epoch": 0.04519935293557903, + "grad_norm": 1.9233685731887817, + "learning_rate": 4.974844657836447e-05, + "loss": 5.4891, + "step": 7600 + }, + { + "epoch": 0.045205300218860024, + "grad_norm": 1.8654414415359497, + "learning_rate": 4.9748380478319165e-05, + "loss": 5.4955, + "step": 7601 + }, + { + "epoch": 0.04521124750214102, + "grad_norm": 1.7592424154281616, + "learning_rate": 4.974831436963446e-05, + "loss": 5.2298, + "step": 7602 + }, + { + "epoch": 0.04521719478542202, + "grad_norm": 1.8132792711257935, + "learning_rate": 4.974824825231037e-05, + "loss": 5.3487, + "step": 7603 + }, + { + "epoch": 0.045223142068703016, + "grad_norm": 1.8109947443008423, + "learning_rate": 4.974818212634692e-05, + "loss": 5.4511, + "step": 7604 + }, + { + "epoch": 0.04522908935198401, + "grad_norm": 1.96711266040802, + "learning_rate": 4.974811599174414e-05, + "loss": 5.3249, + "step": 7605 + }, + { + "epoch": 0.045235036635265014, + "grad_norm": 1.9123655557632446, + "learning_rate": 4.9748049848502054e-05, + "loss": 5.3681, + "step": 7606 + }, + { + "epoch": 0.04524098391854601, + "grad_norm": 1.7210376262664795, + "learning_rate": 4.974798369662067e-05, + "loss": 5.3441, + "step": 7607 + }, + { + "epoch": 0.045246931201827004, + "grad_norm": 1.590617060661316, + "learning_rate": 4.974791753610002e-05, + "loss": 5.5619, + "step": 7608 + }, + { + "epoch": 0.045252878485108006, + "grad_norm": 1.77785062789917, + "learning_rate": 4.974785136694013e-05, + "loss": 5.4717, + "step": 7609 + }, + { + "epoch": 0.045258825768389, + "grad_norm": 1.66475510597229, + "learning_rate": 4.9747785189141025e-05, + "loss": 5.3501, + "step": 7610 + }, + { + "epoch": 0.045264773051669996, + "grad_norm": 1.9176442623138428, + "learning_rate": 4.974771900270272e-05, + "loss": 5.1197, + "step": 7611 + }, + { + "epoch": 0.04527072033495099, + "grad_norm": 1.8143234252929688, + "learning_rate": 4.974765280762525e-05, + "loss": 5.3103, + "step": 7612 + }, + { + "epoch": 0.04527666761823199, + "grad_norm": 1.8954168558120728, + "learning_rate": 4.974758660390861e-05, + "loss": 5.2009, + "step": 7613 + }, + { + "epoch": 0.04528261490151299, + "grad_norm": 1.7779622077941895, + "learning_rate": 4.974752039155286e-05, + "loss": 5.519, + "step": 7614 + }, + { + "epoch": 0.04528856218479398, + "grad_norm": 1.8181761503219604, + "learning_rate": 4.9747454170558e-05, + "loss": 5.4967, + "step": 7615 + }, + { + "epoch": 0.045294509468074985, + "grad_norm": 1.657665491104126, + "learning_rate": 4.9747387940924064e-05, + "loss": 5.6437, + "step": 7616 + }, + { + "epoch": 0.04530045675135598, + "grad_norm": 1.7993237972259521, + "learning_rate": 4.974732170265107e-05, + "loss": 5.3094, + "step": 7617 + }, + { + "epoch": 0.045306404034636975, + "grad_norm": 1.8798805475234985, + "learning_rate": 4.974725545573904e-05, + "loss": 5.3268, + "step": 7618 + }, + { + "epoch": 0.04531235131791798, + "grad_norm": 1.9271420240402222, + "learning_rate": 4.974718920018799e-05, + "loss": 5.3405, + "step": 7619 + }, + { + "epoch": 0.04531829860119897, + "grad_norm": 1.9256294965744019, + "learning_rate": 4.9747122935997967e-05, + "loss": 5.3118, + "step": 7620 + }, + { + "epoch": 0.04532424588447997, + "grad_norm": 2.3345041275024414, + "learning_rate": 4.9747056663168965e-05, + "loss": 4.9813, + "step": 7621 + }, + { + "epoch": 0.04533019316776097, + "grad_norm": 1.7056258916854858, + "learning_rate": 4.974699038170103e-05, + "loss": 5.4725, + "step": 7622 + }, + { + "epoch": 0.045336140451041965, + "grad_norm": 2.075711250305176, + "learning_rate": 4.9746924091594174e-05, + "loss": 5.2215, + "step": 7623 + }, + { + "epoch": 0.04534208773432296, + "grad_norm": 1.818048357963562, + "learning_rate": 4.974685779284843e-05, + "loss": 5.0463, + "step": 7624 + }, + { + "epoch": 0.04534803501760396, + "grad_norm": 1.6590908765792847, + "learning_rate": 4.9746791485463806e-05, + "loss": 5.2476, + "step": 7625 + }, + { + "epoch": 0.04535398230088496, + "grad_norm": 2.2024991512298584, + "learning_rate": 4.974672516944033e-05, + "loss": 5.6437, + "step": 7626 + }, + { + "epoch": 0.04535992958416595, + "grad_norm": 1.71639883518219, + "learning_rate": 4.974665884477803e-05, + "loss": 5.2418, + "step": 7627 + }, + { + "epoch": 0.04536587686744695, + "grad_norm": 1.75436270236969, + "learning_rate": 4.974659251147693e-05, + "loss": 5.2209, + "step": 7628 + }, + { + "epoch": 0.04537182415072795, + "grad_norm": 2.577916383743286, + "learning_rate": 4.974652616953705e-05, + "loss": 5.2385, + "step": 7629 + }, + { + "epoch": 0.045377771434008944, + "grad_norm": 1.9784717559814453, + "learning_rate": 4.9746459818958416e-05, + "loss": 5.265, + "step": 7630 + }, + { + "epoch": 0.04538371871728994, + "grad_norm": 1.971383810043335, + "learning_rate": 4.974639345974104e-05, + "loss": 5.0548, + "step": 7631 + }, + { + "epoch": 0.04538966600057094, + "grad_norm": 2.096876621246338, + "learning_rate": 4.974632709188496e-05, + "loss": 5.1491, + "step": 7632 + }, + { + "epoch": 0.045395613283851936, + "grad_norm": 1.6079102754592896, + "learning_rate": 4.974626071539019e-05, + "loss": 5.1959, + "step": 7633 + }, + { + "epoch": 0.04540156056713293, + "grad_norm": 1.6881030797958374, + "learning_rate": 4.9746194330256755e-05, + "loss": 5.1772, + "step": 7634 + }, + { + "epoch": 0.04540750785041393, + "grad_norm": 1.7459675073623657, + "learning_rate": 4.974612793648469e-05, + "loss": 5.1885, + "step": 7635 + }, + { + "epoch": 0.04541345513369493, + "grad_norm": 1.739272117614746, + "learning_rate": 4.9746061534073993e-05, + "loss": 5.318, + "step": 7636 + }, + { + "epoch": 0.045419402416975924, + "grad_norm": 1.7761027812957764, + "learning_rate": 4.974599512302471e-05, + "loss": 5.1525, + "step": 7637 + }, + { + "epoch": 0.045425349700256926, + "grad_norm": 1.8695855140686035, + "learning_rate": 4.9745928703336854e-05, + "loss": 5.5754, + "step": 7638 + }, + { + "epoch": 0.04543129698353792, + "grad_norm": 1.8737404346466064, + "learning_rate": 4.9745862275010446e-05, + "loss": 5.2908, + "step": 7639 + }, + { + "epoch": 0.045437244266818916, + "grad_norm": 1.731676459312439, + "learning_rate": 4.9745795838045515e-05, + "loss": 5.2671, + "step": 7640 + }, + { + "epoch": 0.04544319155009991, + "grad_norm": 1.6687474250793457, + "learning_rate": 4.974572939244209e-05, + "loss": 5.1629, + "step": 7641 + }, + { + "epoch": 0.04544913883338091, + "grad_norm": 2.1376633644104004, + "learning_rate": 4.974566293820018e-05, + "loss": 5.2853, + "step": 7642 + }, + { + "epoch": 0.04545508611666191, + "grad_norm": 2.0989861488342285, + "learning_rate": 4.974559647531981e-05, + "loss": 5.1311, + "step": 7643 + }, + { + "epoch": 0.0454610333999429, + "grad_norm": 2.3433620929718018, + "learning_rate": 4.974553000380102e-05, + "loss": 4.9854, + "step": 7644 + }, + { + "epoch": 0.045466980683223905, + "grad_norm": 2.306170701980591, + "learning_rate": 4.974546352364381e-05, + "loss": 5.3152, + "step": 7645 + }, + { + "epoch": 0.0454729279665049, + "grad_norm": 1.9588537216186523, + "learning_rate": 4.974539703484822e-05, + "loss": 5.3903, + "step": 7646 + }, + { + "epoch": 0.045478875249785895, + "grad_norm": 1.7994736433029175, + "learning_rate": 4.9745330537414265e-05, + "loss": 5.2505, + "step": 7647 + }, + { + "epoch": 0.0454848225330669, + "grad_norm": 1.983175277709961, + "learning_rate": 4.974526403134197e-05, + "loss": 5.2607, + "step": 7648 + }, + { + "epoch": 0.04549076981634789, + "grad_norm": 1.8853832483291626, + "learning_rate": 4.974519751663136e-05, + "loss": 5.1475, + "step": 7649 + }, + { + "epoch": 0.04549671709962889, + "grad_norm": 1.9374700784683228, + "learning_rate": 4.9745130993282464e-05, + "loss": 5.2039, + "step": 7650 + }, + { + "epoch": 0.04550266438290989, + "grad_norm": 1.8200404644012451, + "learning_rate": 4.974506446129529e-05, + "loss": 5.2794, + "step": 7651 + }, + { + "epoch": 0.045508611666190885, + "grad_norm": 1.8375320434570312, + "learning_rate": 4.974499792066987e-05, + "loss": 5.1149, + "step": 7652 + }, + { + "epoch": 0.04551455894947188, + "grad_norm": 1.7842520475387573, + "learning_rate": 4.974493137140623e-05, + "loss": 5.0332, + "step": 7653 + }, + { + "epoch": 0.04552050623275288, + "grad_norm": 2.0220818519592285, + "learning_rate": 4.974486481350439e-05, + "loss": 5.0277, + "step": 7654 + }, + { + "epoch": 0.04552645351603388, + "grad_norm": 2.0787746906280518, + "learning_rate": 4.9744798246964375e-05, + "loss": 5.0587, + "step": 7655 + }, + { + "epoch": 0.04553240079931487, + "grad_norm": 1.7024985551834106, + "learning_rate": 4.97447316717862e-05, + "loss": 5.0184, + "step": 7656 + }, + { + "epoch": 0.04553834808259587, + "grad_norm": 1.9057540893554688, + "learning_rate": 4.97446650879699e-05, + "loss": 5.3945, + "step": 7657 + }, + { + "epoch": 0.04554429536587687, + "grad_norm": 1.7963287830352783, + "learning_rate": 4.974459849551549e-05, + "loss": 4.9869, + "step": 7658 + }, + { + "epoch": 0.045550242649157864, + "grad_norm": 2.027353286743164, + "learning_rate": 4.974453189442299e-05, + "loss": 5.1389, + "step": 7659 + }, + { + "epoch": 0.04555618993243886, + "grad_norm": 1.7137126922607422, + "learning_rate": 4.9744465284692445e-05, + "loss": 5.058, + "step": 7660 + }, + { + "epoch": 0.04556213721571986, + "grad_norm": 2.0363876819610596, + "learning_rate": 4.9744398666323854e-05, + "loss": 4.9174, + "step": 7661 + }, + { + "epoch": 0.045568084499000856, + "grad_norm": 2.1440837383270264, + "learning_rate": 4.9744332039317255e-05, + "loss": 4.8894, + "step": 7662 + }, + { + "epoch": 0.04557403178228185, + "grad_norm": 1.9582308530807495, + "learning_rate": 4.9744265403672655e-05, + "loss": 5.0666, + "step": 7663 + }, + { + "epoch": 0.04557997906556285, + "grad_norm": 1.9997116327285767, + "learning_rate": 4.97441987593901e-05, + "loss": 5.0804, + "step": 7664 + }, + { + "epoch": 0.04558592634884385, + "grad_norm": 2.067361831665039, + "learning_rate": 4.9744132106469586e-05, + "loss": 4.8655, + "step": 7665 + }, + { + "epoch": 0.045591873632124844, + "grad_norm": 1.7066930532455444, + "learning_rate": 4.9744065444911165e-05, + "loss": 4.792, + "step": 7666 + }, + { + "epoch": 0.045597820915405846, + "grad_norm": 1.8526182174682617, + "learning_rate": 4.974399877471484e-05, + "loss": 4.755, + "step": 7667 + }, + { + "epoch": 0.04560376819868684, + "grad_norm": 1.8744564056396484, + "learning_rate": 4.9743932095880644e-05, + "loss": 4.7732, + "step": 7668 + }, + { + "epoch": 0.045609715481967836, + "grad_norm": 1.849574327468872, + "learning_rate": 4.97438654084086e-05, + "loss": 4.7743, + "step": 7669 + }, + { + "epoch": 0.04561566276524884, + "grad_norm": 1.87284255027771, + "learning_rate": 4.9743798712298714e-05, + "loss": 5.0582, + "step": 7670 + }, + { + "epoch": 0.04562161004852983, + "grad_norm": 2.206273078918457, + "learning_rate": 4.974373200755104e-05, + "loss": 5.4683, + "step": 7671 + }, + { + "epoch": 0.04562755733181083, + "grad_norm": 1.9849058389663696, + "learning_rate": 4.974366529416557e-05, + "loss": 5.4087, + "step": 7672 + }, + { + "epoch": 0.04563350461509182, + "grad_norm": 1.9440083503723145, + "learning_rate": 4.974359857214235e-05, + "loss": 4.9607, + "step": 7673 + }, + { + "epoch": 0.045639451898372825, + "grad_norm": 1.7112319469451904, + "learning_rate": 4.974353184148139e-05, + "loss": 5.6589, + "step": 7674 + }, + { + "epoch": 0.04564539918165382, + "grad_norm": 1.921215295791626, + "learning_rate": 4.974346510218273e-05, + "loss": 5.4495, + "step": 7675 + }, + { + "epoch": 0.045651346464934815, + "grad_norm": 1.9582061767578125, + "learning_rate": 4.974339835424637e-05, + "loss": 5.2459, + "step": 7676 + }, + { + "epoch": 0.04565729374821582, + "grad_norm": 1.9781824350357056, + "learning_rate": 4.974333159767235e-05, + "loss": 5.3424, + "step": 7677 + }, + { + "epoch": 0.04566324103149681, + "grad_norm": 1.7183479070663452, + "learning_rate": 4.974326483246069e-05, + "loss": 5.3741, + "step": 7678 + }, + { + "epoch": 0.04566918831477781, + "grad_norm": 1.7942447662353516, + "learning_rate": 4.974319805861141e-05, + "loss": 5.4008, + "step": 7679 + }, + { + "epoch": 0.04567513559805881, + "grad_norm": 1.8255115747451782, + "learning_rate": 4.974313127612454e-05, + "loss": 5.1849, + "step": 7680 + }, + { + "epoch": 0.045681082881339805, + "grad_norm": 1.7907564640045166, + "learning_rate": 4.974306448500009e-05, + "loss": 5.1757, + "step": 7681 + }, + { + "epoch": 0.0456870301646208, + "grad_norm": 2.911489486694336, + "learning_rate": 4.97429976852381e-05, + "loss": 4.8909, + "step": 7682 + }, + { + "epoch": 0.0456929774479018, + "grad_norm": 2.849125623703003, + "learning_rate": 4.9742930876838576e-05, + "loss": 4.7733, + "step": 7683 + }, + { + "epoch": 0.0456989247311828, + "grad_norm": 2.4196949005126953, + "learning_rate": 4.9742864059801565e-05, + "loss": 4.8571, + "step": 7684 + }, + { + "epoch": 0.04570487201446379, + "grad_norm": 1.9430558681488037, + "learning_rate": 4.974279723412706e-05, + "loss": 5.1338, + "step": 7685 + }, + { + "epoch": 0.04571081929774479, + "grad_norm": 1.7538554668426514, + "learning_rate": 4.9742730399815105e-05, + "loss": 5.5524, + "step": 7686 + }, + { + "epoch": 0.04571676658102579, + "grad_norm": 2.006115198135376, + "learning_rate": 4.9742663556865724e-05, + "loss": 5.3343, + "step": 7687 + }, + { + "epoch": 0.045722713864306784, + "grad_norm": 2.554234027862549, + "learning_rate": 4.974259670527893e-05, + "loss": 5.8426, + "step": 7688 + }, + { + "epoch": 0.04572866114758778, + "grad_norm": 2.656747579574585, + "learning_rate": 4.974252984505475e-05, + "loss": 5.1578, + "step": 7689 + }, + { + "epoch": 0.04573460843086878, + "grad_norm": 2.800208568572998, + "learning_rate": 4.9742462976193216e-05, + "loss": 4.8019, + "step": 7690 + }, + { + "epoch": 0.045740555714149776, + "grad_norm": 2.674938201904297, + "learning_rate": 4.974239609869433e-05, + "loss": 4.7177, + "step": 7691 + }, + { + "epoch": 0.04574650299743077, + "grad_norm": 2.751533269882202, + "learning_rate": 4.974232921255815e-05, + "loss": 4.7568, + "step": 7692 + }, + { + "epoch": 0.04575245028071177, + "grad_norm": 2.623917818069458, + "learning_rate": 4.974226231778466e-05, + "loss": 4.5908, + "step": 7693 + }, + { + "epoch": 0.04575839756399277, + "grad_norm": 2.2248899936676025, + "learning_rate": 4.9742195414373904e-05, + "loss": 5.4066, + "step": 7694 + }, + { + "epoch": 0.045764344847273764, + "grad_norm": 1.7959388494491577, + "learning_rate": 4.974212850232591e-05, + "loss": 6.1414, + "step": 7695 + }, + { + "epoch": 0.045770292130554766, + "grad_norm": 2.0049352645874023, + "learning_rate": 4.974206158164069e-05, + "loss": 6.0106, + "step": 7696 + }, + { + "epoch": 0.04577623941383576, + "grad_norm": 2.4794270992279053, + "learning_rate": 4.9741994652318276e-05, + "loss": 5.8647, + "step": 7697 + }, + { + "epoch": 0.045782186697116756, + "grad_norm": 3.9380109310150146, + "learning_rate": 4.974192771435868e-05, + "loss": 5.719, + "step": 7698 + }, + { + "epoch": 0.04578813398039776, + "grad_norm": 2.564023017883301, + "learning_rate": 4.974186076776194e-05, + "loss": 4.7294, + "step": 7699 + }, + { + "epoch": 0.04579408126367875, + "grad_norm": 3.7082693576812744, + "learning_rate": 4.974179381252807e-05, + "loss": 5.1975, + "step": 7700 + }, + { + "epoch": 0.04580002854695975, + "grad_norm": 4.0067524909973145, + "learning_rate": 4.97417268486571e-05, + "loss": 5.4047, + "step": 7701 + }, + { + "epoch": 0.04580597583024074, + "grad_norm": 3.978787660598755, + "learning_rate": 4.974165987614904e-05, + "loss": 5.7023, + "step": 7702 + }, + { + "epoch": 0.045811923113521745, + "grad_norm": 4.597605228424072, + "learning_rate": 4.974159289500392e-05, + "loss": 6.5186, + "step": 7703 + }, + { + "epoch": 0.04581787039680274, + "grad_norm": 2.8793985843658447, + "learning_rate": 4.974152590522177e-05, + "loss": 6.1476, + "step": 7704 + }, + { + "epoch": 0.045823817680083735, + "grad_norm": 2.466089963912964, + "learning_rate": 4.974145890680262e-05, + "loss": 5.5154, + "step": 7705 + }, + { + "epoch": 0.04582976496336474, + "grad_norm": 2.937228202819824, + "learning_rate": 4.974139189974647e-05, + "loss": 5.5146, + "step": 7706 + }, + { + "epoch": 0.04583571224664573, + "grad_norm": 2.4580399990081787, + "learning_rate": 4.974132488405336e-05, + "loss": 6.214, + "step": 7707 + }, + { + "epoch": 0.04584165952992673, + "grad_norm": 4.910717010498047, + "learning_rate": 4.97412578597233e-05, + "loss": 5.819, + "step": 7708 + }, + { + "epoch": 0.04584760681320773, + "grad_norm": 5.372139930725098, + "learning_rate": 4.974119082675634e-05, + "loss": 5.3242, + "step": 7709 + }, + { + "epoch": 0.045853554096488724, + "grad_norm": 2.050492525100708, + "learning_rate": 4.9741123785152474e-05, + "loss": 6.0468, + "step": 7710 + }, + { + "epoch": 0.04585950137976972, + "grad_norm": 1.7090541124343872, + "learning_rate": 4.974105673491174e-05, + "loss": 5.7652, + "step": 7711 + }, + { + "epoch": 0.04586544866305072, + "grad_norm": 2.512538194656372, + "learning_rate": 4.974098967603415e-05, + "loss": 5.3184, + "step": 7712 + }, + { + "epoch": 0.04587139594633172, + "grad_norm": 3.311289072036743, + "learning_rate": 4.974092260851975e-05, + "loss": 5.5379, + "step": 7713 + }, + { + "epoch": 0.04587734322961271, + "grad_norm": 3.3318710327148438, + "learning_rate": 4.974085553236854e-05, + "loss": 5.5543, + "step": 7714 + }, + { + "epoch": 0.04588329051289371, + "grad_norm": 2.6384379863739014, + "learning_rate": 4.9740788447580555e-05, + "loss": 6.3475, + "step": 7715 + }, + { + "epoch": 0.04588923779617471, + "grad_norm": 2.0066304206848145, + "learning_rate": 4.974072135415582e-05, + "loss": 6.3685, + "step": 7716 + }, + { + "epoch": 0.045895185079455704, + "grad_norm": 2.4189116954803467, + "learning_rate": 4.9740654252094356e-05, + "loss": 5.4128, + "step": 7717 + }, + { + "epoch": 0.0459011323627367, + "grad_norm": 2.431011438369751, + "learning_rate": 4.974058714139618e-05, + "loss": 5.34, + "step": 7718 + }, + { + "epoch": 0.0459070796460177, + "grad_norm": 2.1997156143188477, + "learning_rate": 4.974052002206132e-05, + "loss": 5.4223, + "step": 7719 + }, + { + "epoch": 0.045913026929298696, + "grad_norm": 2.0700082778930664, + "learning_rate": 4.9740452894089806e-05, + "loss": 5.4255, + "step": 7720 + }, + { + "epoch": 0.04591897421257969, + "grad_norm": 2.3476040363311768, + "learning_rate": 4.974038575748165e-05, + "loss": 5.5055, + "step": 7721 + }, + { + "epoch": 0.04592492149586069, + "grad_norm": 4.2995524406433105, + "learning_rate": 4.974031861223688e-05, + "loss": 5.8869, + "step": 7722 + }, + { + "epoch": 0.04593086877914169, + "grad_norm": 4.690639495849609, + "learning_rate": 4.974025145835552e-05, + "loss": 6.0808, + "step": 7723 + }, + { + "epoch": 0.04593681606242268, + "grad_norm": 3.9823479652404785, + "learning_rate": 4.97401842958376e-05, + "loss": 6.0844, + "step": 7724 + }, + { + "epoch": 0.045942763345703685, + "grad_norm": 3.69808030128479, + "learning_rate": 4.9740117124683136e-05, + "loss": 5.9611, + "step": 7725 + }, + { + "epoch": 0.04594871062898468, + "grad_norm": 2.5912535190582275, + "learning_rate": 4.974004994489215e-05, + "loss": 5.9669, + "step": 7726 + }, + { + "epoch": 0.045954657912265676, + "grad_norm": 2.0894482135772705, + "learning_rate": 4.973998275646467e-05, + "loss": 5.6717, + "step": 7727 + }, + { + "epoch": 0.04596060519554668, + "grad_norm": 2.179302930831909, + "learning_rate": 4.973991555940072e-05, + "loss": 5.4077, + "step": 7728 + }, + { + "epoch": 0.04596655247882767, + "grad_norm": 2.4919214248657227, + "learning_rate": 4.973984835370031e-05, + "loss": 6.118, + "step": 7729 + }, + { + "epoch": 0.04597249976210867, + "grad_norm": 3.5036723613739014, + "learning_rate": 4.9739781139363485e-05, + "loss": 5.436, + "step": 7730 + }, + { + "epoch": 0.04597844704538966, + "grad_norm": 4.129561424255371, + "learning_rate": 4.973971391639026e-05, + "loss": 4.8414, + "step": 7731 + }, + { + "epoch": 0.045984394328670665, + "grad_norm": 2.867039203643799, + "learning_rate": 4.973964668478065e-05, + "loss": 4.7385, + "step": 7732 + }, + { + "epoch": 0.04599034161195166, + "grad_norm": 2.754023313522339, + "learning_rate": 4.973957944453469e-05, + "loss": 4.6063, + "step": 7733 + }, + { + "epoch": 0.045996288895232655, + "grad_norm": 2.1025235652923584, + "learning_rate": 4.973951219565239e-05, + "loss": 5.3233, + "step": 7734 + }, + { + "epoch": 0.04600223617851366, + "grad_norm": 2.352883815765381, + "learning_rate": 4.973944493813379e-05, + "loss": 5.5648, + "step": 7735 + }, + { + "epoch": 0.04600818346179465, + "grad_norm": 2.049377679824829, + "learning_rate": 4.97393776719789e-05, + "loss": 6.1241, + "step": 7736 + }, + { + "epoch": 0.04601413074507565, + "grad_norm": 1.7124110460281372, + "learning_rate": 4.9739310397187756e-05, + "loss": 6.1258, + "step": 7737 + }, + { + "epoch": 0.04602007802835665, + "grad_norm": 2.2592861652374268, + "learning_rate": 4.9739243113760364e-05, + "loss": 6.1972, + "step": 7738 + }, + { + "epoch": 0.046026025311637644, + "grad_norm": 2.3926188945770264, + "learning_rate": 4.973917582169677e-05, + "loss": 6.1681, + "step": 7739 + }, + { + "epoch": 0.04603197259491864, + "grad_norm": 1.9956084489822388, + "learning_rate": 4.973910852099698e-05, + "loss": 6.2068, + "step": 7740 + }, + { + "epoch": 0.04603791987819964, + "grad_norm": 1.924467921257019, + "learning_rate": 4.973904121166102e-05, + "loss": 6.4391, + "step": 7741 + }, + { + "epoch": 0.04604386716148064, + "grad_norm": 1.9410041570663452, + "learning_rate": 4.973897389368891e-05, + "loss": 5.9378, + "step": 7742 + }, + { + "epoch": 0.04604981444476163, + "grad_norm": 2.0418617725372314, + "learning_rate": 4.9738906567080686e-05, + "loss": 5.8823, + "step": 7743 + }, + { + "epoch": 0.04605576172804263, + "grad_norm": 2.696143627166748, + "learning_rate": 4.973883923183637e-05, + "loss": 5.8551, + "step": 7744 + }, + { + "epoch": 0.04606170901132363, + "grad_norm": 2.482703447341919, + "learning_rate": 4.973877188795598e-05, + "loss": 5.5752, + "step": 7745 + }, + { + "epoch": 0.046067656294604624, + "grad_norm": 2.520437240600586, + "learning_rate": 4.973870453543954e-05, + "loss": 5.571, + "step": 7746 + }, + { + "epoch": 0.04607360357788562, + "grad_norm": 2.568150758743286, + "learning_rate": 4.973863717428707e-05, + "loss": 5.9145, + "step": 7747 + }, + { + "epoch": 0.04607955086116662, + "grad_norm": 2.6373183727264404, + "learning_rate": 4.9738569804498605e-05, + "loss": 5.9414, + "step": 7748 + }, + { + "epoch": 0.046085498144447616, + "grad_norm": 2.1663565635681152, + "learning_rate": 4.973850242607415e-05, + "loss": 6.2316, + "step": 7749 + }, + { + "epoch": 0.04609144542772861, + "grad_norm": 2.044316053390503, + "learning_rate": 4.973843503901374e-05, + "loss": 5.7232, + "step": 7750 + }, + { + "epoch": 0.04609739271100961, + "grad_norm": 2.1740782260894775, + "learning_rate": 4.9738367643317405e-05, + "loss": 6.0388, + "step": 7751 + }, + { + "epoch": 0.04610333999429061, + "grad_norm": 2.0643458366394043, + "learning_rate": 4.973830023898516e-05, + "loss": 5.8201, + "step": 7752 + }, + { + "epoch": 0.0461092872775716, + "grad_norm": 1.7433217763900757, + "learning_rate": 4.973823282601703e-05, + "loss": 6.0464, + "step": 7753 + }, + { + "epoch": 0.046115234560852605, + "grad_norm": 2.657677412033081, + "learning_rate": 4.9738165404413037e-05, + "loss": 5.2849, + "step": 7754 + }, + { + "epoch": 0.0461211818441336, + "grad_norm": 1.7317034006118774, + "learning_rate": 4.9738097974173205e-05, + "loss": 6.0619, + "step": 7755 + }, + { + "epoch": 0.046127129127414596, + "grad_norm": 1.6109949350357056, + "learning_rate": 4.973803053529756e-05, + "loss": 5.7832, + "step": 7756 + }, + { + "epoch": 0.0461330764106956, + "grad_norm": 2.2980475425720215, + "learning_rate": 4.9737963087786125e-05, + "loss": 5.4346, + "step": 7757 + }, + { + "epoch": 0.04613902369397659, + "grad_norm": 2.5162737369537354, + "learning_rate": 4.973789563163892e-05, + "loss": 5.3723, + "step": 7758 + }, + { + "epoch": 0.04614497097725759, + "grad_norm": 2.3493261337280273, + "learning_rate": 4.973782816685597e-05, + "loss": 5.7474, + "step": 7759 + }, + { + "epoch": 0.04615091826053858, + "grad_norm": 2.1428544521331787, + "learning_rate": 4.9737760693437306e-05, + "loss": 5.6318, + "step": 7760 + }, + { + "epoch": 0.046156865543819585, + "grad_norm": 2.11627197265625, + "learning_rate": 4.973769321138294e-05, + "loss": 5.38, + "step": 7761 + }, + { + "epoch": 0.04616281282710058, + "grad_norm": 2.411957263946533, + "learning_rate": 4.9737625720692906e-05, + "loss": 5.1822, + "step": 7762 + }, + { + "epoch": 0.046168760110381575, + "grad_norm": 2.3566222190856934, + "learning_rate": 4.973755822136722e-05, + "loss": 5.0405, + "step": 7763 + }, + { + "epoch": 0.04617470739366258, + "grad_norm": 2.2235679626464844, + "learning_rate": 4.973749071340591e-05, + "loss": 5.4746, + "step": 7764 + }, + { + "epoch": 0.04618065467694357, + "grad_norm": 2.4175586700439453, + "learning_rate": 4.973742319680899e-05, + "loss": 5.7519, + "step": 7765 + }, + { + "epoch": 0.04618660196022457, + "grad_norm": 2.3386452198028564, + "learning_rate": 4.9737355671576496e-05, + "loss": 6.1765, + "step": 7766 + }, + { + "epoch": 0.04619254924350557, + "grad_norm": 2.084333658218384, + "learning_rate": 4.973728813770845e-05, + "loss": 6.1439, + "step": 7767 + }, + { + "epoch": 0.046198496526786564, + "grad_norm": 2.0523531436920166, + "learning_rate": 4.973722059520487e-05, + "loss": 6.294, + "step": 7768 + }, + { + "epoch": 0.04620444381006756, + "grad_norm": 2.1187572479248047, + "learning_rate": 4.973715304406578e-05, + "loss": 5.3679, + "step": 7769 + }, + { + "epoch": 0.04621039109334856, + "grad_norm": 2.5249836444854736, + "learning_rate": 4.9737085484291204e-05, + "loss": 5.9086, + "step": 7770 + }, + { + "epoch": 0.04621633837662956, + "grad_norm": 2.35662841796875, + "learning_rate": 4.973701791588117e-05, + "loss": 6.3135, + "step": 7771 + }, + { + "epoch": 0.04622228565991055, + "grad_norm": 2.070955276489258, + "learning_rate": 4.9736950338835695e-05, + "loss": 5.8748, + "step": 7772 + }, + { + "epoch": 0.04622823294319155, + "grad_norm": 2.151587963104248, + "learning_rate": 4.9736882753154814e-05, + "loss": 6.2053, + "step": 7773 + }, + { + "epoch": 0.04623418022647255, + "grad_norm": 2.2187843322753906, + "learning_rate": 4.9736815158838534e-05, + "loss": 5.762, + "step": 7774 + }, + { + "epoch": 0.046240127509753544, + "grad_norm": 1.8676223754882812, + "learning_rate": 4.973674755588689e-05, + "loss": 6.06, + "step": 7775 + }, + { + "epoch": 0.04624607479303454, + "grad_norm": 2.2110252380371094, + "learning_rate": 4.9736679944299906e-05, + "loss": 5.6474, + "step": 7776 + }, + { + "epoch": 0.04625202207631554, + "grad_norm": 2.0635151863098145, + "learning_rate": 4.9736612324077605e-05, + "loss": 5.5579, + "step": 7777 + }, + { + "epoch": 0.046257969359596536, + "grad_norm": 2.1654598712921143, + "learning_rate": 4.973654469522e-05, + "loss": 5.5388, + "step": 7778 + }, + { + "epoch": 0.04626391664287753, + "grad_norm": 2.3735673427581787, + "learning_rate": 4.973647705772713e-05, + "loss": 5.4383, + "step": 7779 + }, + { + "epoch": 0.04626986392615853, + "grad_norm": 2.344160318374634, + "learning_rate": 4.9736409411599e-05, + "loss": 5.6501, + "step": 7780 + }, + { + "epoch": 0.04627581120943953, + "grad_norm": 3.023350477218628, + "learning_rate": 4.973634175683566e-05, + "loss": 5.2688, + "step": 7781 + }, + { + "epoch": 0.04628175849272052, + "grad_norm": 2.8814494609832764, + "learning_rate": 4.973627409343711e-05, + "loss": 5.08, + "step": 7782 + }, + { + "epoch": 0.046287705776001525, + "grad_norm": 2.475191831588745, + "learning_rate": 4.973620642140339e-05, + "loss": 5.0761, + "step": 7783 + }, + { + "epoch": 0.04629365305928252, + "grad_norm": 2.5567755699157715, + "learning_rate": 4.9736138740734504e-05, + "loss": 5.46, + "step": 7784 + }, + { + "epoch": 0.046299600342563516, + "grad_norm": 2.9225175380706787, + "learning_rate": 4.973607105143049e-05, + "loss": 5.5219, + "step": 7785 + }, + { + "epoch": 0.04630554762584452, + "grad_norm": 2.3112781047821045, + "learning_rate": 4.973600335349138e-05, + "loss": 6.4204, + "step": 7786 + }, + { + "epoch": 0.04631149490912551, + "grad_norm": 2.228182554244995, + "learning_rate": 4.973593564691717e-05, + "loss": 6.3299, + "step": 7787 + }, + { + "epoch": 0.04631744219240651, + "grad_norm": 1.8612277507781982, + "learning_rate": 4.973586793170792e-05, + "loss": 5.994, + "step": 7788 + }, + { + "epoch": 0.0463233894756875, + "grad_norm": 1.9788155555725098, + "learning_rate": 4.9735800207863626e-05, + "loss": 6.1676, + "step": 7789 + }, + { + "epoch": 0.046329336758968505, + "grad_norm": 2.2335264682769775, + "learning_rate": 4.973573247538431e-05, + "loss": 6.3112, + "step": 7790 + }, + { + "epoch": 0.0463352840422495, + "grad_norm": 2.168656349182129, + "learning_rate": 4.973566473427001e-05, + "loss": 5.8326, + "step": 7791 + }, + { + "epoch": 0.046341231325530495, + "grad_norm": 1.9187591075897217, + "learning_rate": 4.9735596984520755e-05, + "loss": 5.8734, + "step": 7792 + }, + { + "epoch": 0.0463471786088115, + "grad_norm": 2.195242166519165, + "learning_rate": 4.973552922613655e-05, + "loss": 6.1325, + "step": 7793 + }, + { + "epoch": 0.04635312589209249, + "grad_norm": 1.9698888063430786, + "learning_rate": 4.973546145911743e-05, + "loss": 5.8586, + "step": 7794 + }, + { + "epoch": 0.04635907317537349, + "grad_norm": 2.2149972915649414, + "learning_rate": 4.973539368346342e-05, + "loss": 5.4087, + "step": 7795 + }, + { + "epoch": 0.04636502045865449, + "grad_norm": 1.8587820529937744, + "learning_rate": 4.973532589917453e-05, + "loss": 5.9956, + "step": 7796 + }, + { + "epoch": 0.046370967741935484, + "grad_norm": 2.022866725921631, + "learning_rate": 4.97352581062508e-05, + "loss": 6.0905, + "step": 7797 + }, + { + "epoch": 0.04637691502521648, + "grad_norm": 2.0257678031921387, + "learning_rate": 4.973519030469225e-05, + "loss": 6.02, + "step": 7798 + }, + { + "epoch": 0.04638286230849748, + "grad_norm": 1.6909089088439941, + "learning_rate": 4.973512249449889e-05, + "loss": 5.727, + "step": 7799 + }, + { + "epoch": 0.046388809591778477, + "grad_norm": 1.8882997035980225, + "learning_rate": 4.9735054675670754e-05, + "loss": 5.655, + "step": 7800 + }, + { + "epoch": 0.04639475687505947, + "grad_norm": 2.1775193214416504, + "learning_rate": 4.9734986848207876e-05, + "loss": 5.8067, + "step": 7801 + }, + { + "epoch": 0.04640070415834047, + "grad_norm": 2.136690139770508, + "learning_rate": 4.973491901211027e-05, + "loss": 5.5515, + "step": 7802 + }, + { + "epoch": 0.04640665144162147, + "grad_norm": 1.8036144971847534, + "learning_rate": 4.973485116737795e-05, + "loss": 5.8404, + "step": 7803 + }, + { + "epoch": 0.046412598724902464, + "grad_norm": 2.1350481510162354, + "learning_rate": 4.973478331401096e-05, + "loss": 6.1635, + "step": 7804 + }, + { + "epoch": 0.04641854600818346, + "grad_norm": 2.4152462482452393, + "learning_rate": 4.97347154520093e-05, + "loss": 5.9882, + "step": 7805 + }, + { + "epoch": 0.04642449329146446, + "grad_norm": 2.166402578353882, + "learning_rate": 4.9734647581373015e-05, + "loss": 5.8982, + "step": 7806 + }, + { + "epoch": 0.046430440574745456, + "grad_norm": 1.8684437274932861, + "learning_rate": 4.973457970210211e-05, + "loss": 5.9501, + "step": 7807 + }, + { + "epoch": 0.04643638785802645, + "grad_norm": 1.775829792022705, + "learning_rate": 4.973451181419663e-05, + "loss": 5.83, + "step": 7808 + }, + { + "epoch": 0.04644233514130745, + "grad_norm": 1.7500759363174438, + "learning_rate": 4.973444391765659e-05, + "loss": 6.0084, + "step": 7809 + }, + { + "epoch": 0.04644828242458845, + "grad_norm": 2.3920938968658447, + "learning_rate": 4.9734376012482e-05, + "loss": 5.559, + "step": 7810 + }, + { + "epoch": 0.04645422970786944, + "grad_norm": 2.7680983543395996, + "learning_rate": 4.97343080986729e-05, + "loss": 5.3521, + "step": 7811 + }, + { + "epoch": 0.046460176991150445, + "grad_norm": 2.6618781089782715, + "learning_rate": 4.9734240176229316e-05, + "loss": 5.6917, + "step": 7812 + }, + { + "epoch": 0.04646612427443144, + "grad_norm": 2.086775541305542, + "learning_rate": 4.9734172245151256e-05, + "loss": 5.582, + "step": 7813 + }, + { + "epoch": 0.046472071557712435, + "grad_norm": 2.190012216567993, + "learning_rate": 4.973410430543875e-05, + "loss": 5.9132, + "step": 7814 + }, + { + "epoch": 0.04647801884099344, + "grad_norm": 2.317610740661621, + "learning_rate": 4.973403635709183e-05, + "loss": 5.7055, + "step": 7815 + }, + { + "epoch": 0.04648396612427443, + "grad_norm": 2.1291167736053467, + "learning_rate": 4.973396840011051e-05, + "loss": 5.6711, + "step": 7816 + }, + { + "epoch": 0.04648991340755543, + "grad_norm": 1.5421113967895508, + "learning_rate": 4.9733900434494815e-05, + "loss": 5.6433, + "step": 7817 + }, + { + "epoch": 0.04649586069083642, + "grad_norm": 2.222355604171753, + "learning_rate": 4.973383246024477e-05, + "loss": 5.3685, + "step": 7818 + }, + { + "epoch": 0.046501807974117425, + "grad_norm": 2.097116708755493, + "learning_rate": 4.97337644773604e-05, + "loss": 5.6528, + "step": 7819 + }, + { + "epoch": 0.04650775525739842, + "grad_norm": 2.0224382877349854, + "learning_rate": 4.973369648584174e-05, + "loss": 5.8849, + "step": 7820 + }, + { + "epoch": 0.046513702540679415, + "grad_norm": 2.1581428050994873, + "learning_rate": 4.973362848568879e-05, + "loss": 5.985, + "step": 7821 + }, + { + "epoch": 0.04651964982396042, + "grad_norm": 2.43945574760437, + "learning_rate": 4.9733560476901584e-05, + "loss": 5.5682, + "step": 7822 + }, + { + "epoch": 0.04652559710724141, + "grad_norm": 3.174143075942993, + "learning_rate": 4.9733492459480157e-05, + "loss": 4.832, + "step": 7823 + }, + { + "epoch": 0.04653154439052241, + "grad_norm": 2.269339084625244, + "learning_rate": 4.973342443342452e-05, + "loss": 5.5804, + "step": 7824 + }, + { + "epoch": 0.04653749167380341, + "grad_norm": 2.3775289058685303, + "learning_rate": 4.9733356398734695e-05, + "loss": 5.8299, + "step": 7825 + }, + { + "epoch": 0.046543438957084404, + "grad_norm": 2.065579414367676, + "learning_rate": 4.9733288355410716e-05, + "loss": 5.6985, + "step": 7826 + }, + { + "epoch": 0.0465493862403654, + "grad_norm": 1.9699875116348267, + "learning_rate": 4.9733220303452604e-05, + "loss": 6.0161, + "step": 7827 + }, + { + "epoch": 0.0465553335236464, + "grad_norm": 2.1414806842803955, + "learning_rate": 4.9733152242860374e-05, + "loss": 6.2534, + "step": 7828 + }, + { + "epoch": 0.046561280806927396, + "grad_norm": 2.414738416671753, + "learning_rate": 4.973308417363406e-05, + "loss": 5.8402, + "step": 7829 + }, + { + "epoch": 0.04656722809020839, + "grad_norm": 2.4105031490325928, + "learning_rate": 4.973301609577368e-05, + "loss": 5.8728, + "step": 7830 + }, + { + "epoch": 0.04657317537348939, + "grad_norm": 2.7718660831451416, + "learning_rate": 4.9732948009279264e-05, + "loss": 5.637, + "step": 7831 + }, + { + "epoch": 0.04657912265677039, + "grad_norm": 2.205103874206543, + "learning_rate": 4.9732879914150824e-05, + "loss": 5.4119, + "step": 7832 + }, + { + "epoch": 0.046585069940051384, + "grad_norm": 1.9080390930175781, + "learning_rate": 4.9732811810388394e-05, + "loss": 5.3387, + "step": 7833 + }, + { + "epoch": 0.04659101722333238, + "grad_norm": 1.6600725650787354, + "learning_rate": 4.9732743697992e-05, + "loss": 5.3192, + "step": 7834 + }, + { + "epoch": 0.04659696450661338, + "grad_norm": 1.9428787231445312, + "learning_rate": 4.973267557696165e-05, + "loss": 5.3127, + "step": 7835 + }, + { + "epoch": 0.046602911789894376, + "grad_norm": 2.174811840057373, + "learning_rate": 4.973260744729738e-05, + "loss": 5.7181, + "step": 7836 + }, + { + "epoch": 0.04660885907317537, + "grad_norm": 2.5420422554016113, + "learning_rate": 4.9732539308999224e-05, + "loss": 5.934, + "step": 7837 + }, + { + "epoch": 0.04661480635645637, + "grad_norm": 2.079343795776367, + "learning_rate": 4.973247116206719e-05, + "loss": 5.236, + "step": 7838 + }, + { + "epoch": 0.04662075363973737, + "grad_norm": 1.7748003005981445, + "learning_rate": 4.97324030065013e-05, + "loss": 5.2929, + "step": 7839 + }, + { + "epoch": 0.04662670092301836, + "grad_norm": 2.2746875286102295, + "learning_rate": 4.973233484230159e-05, + "loss": 5.182, + "step": 7840 + }, + { + "epoch": 0.046632648206299365, + "grad_norm": 1.7846394777297974, + "learning_rate": 4.9732266669468074e-05, + "loss": 5.2682, + "step": 7841 + }, + { + "epoch": 0.04663859548958036, + "grad_norm": 2.078132152557373, + "learning_rate": 4.973219848800078e-05, + "loss": 5.3245, + "step": 7842 + }, + { + "epoch": 0.046644542772861355, + "grad_norm": 1.7784876823425293, + "learning_rate": 4.9732130297899726e-05, + "loss": 5.4582, + "step": 7843 + }, + { + "epoch": 0.04665049005614236, + "grad_norm": 1.8421920537948608, + "learning_rate": 4.973206209916495e-05, + "loss": 5.3504, + "step": 7844 + }, + { + "epoch": 0.04665643733942335, + "grad_norm": 1.9958820343017578, + "learning_rate": 4.9731993891796455e-05, + "loss": 5.2914, + "step": 7845 + }, + { + "epoch": 0.04666238462270435, + "grad_norm": 2.0615813732147217, + "learning_rate": 4.9731925675794286e-05, + "loss": 5.3318, + "step": 7846 + }, + { + "epoch": 0.04666833190598534, + "grad_norm": 1.7690422534942627, + "learning_rate": 4.973185745115846e-05, + "loss": 5.3169, + "step": 7847 + }, + { + "epoch": 0.046674279189266345, + "grad_norm": 1.7990578413009644, + "learning_rate": 4.9731789217888994e-05, + "loss": 5.3136, + "step": 7848 + }, + { + "epoch": 0.04668022647254734, + "grad_norm": 2.0028672218322754, + "learning_rate": 4.9731720975985905e-05, + "loss": 5.2115, + "step": 7849 + }, + { + "epoch": 0.046686173755828335, + "grad_norm": 2.0703940391540527, + "learning_rate": 4.973165272544924e-05, + "loss": 5.2439, + "step": 7850 + }, + { + "epoch": 0.04669212103910934, + "grad_norm": 2.1105704307556152, + "learning_rate": 4.973158446627901e-05, + "loss": 5.5812, + "step": 7851 + }, + { + "epoch": 0.04669806832239033, + "grad_norm": 1.7391036748886108, + "learning_rate": 4.9731516198475236e-05, + "loss": 5.229, + "step": 7852 + }, + { + "epoch": 0.04670401560567133, + "grad_norm": 1.6907505989074707, + "learning_rate": 4.973144792203795e-05, + "loss": 5.2674, + "step": 7853 + }, + { + "epoch": 0.04670996288895233, + "grad_norm": 1.608168125152588, + "learning_rate": 4.973137963696717e-05, + "loss": 5.389, + "step": 7854 + }, + { + "epoch": 0.046715910172233324, + "grad_norm": 1.7521610260009766, + "learning_rate": 4.9731311343262913e-05, + "loss": 5.2436, + "step": 7855 + }, + { + "epoch": 0.04672185745551432, + "grad_norm": 2.0182595252990723, + "learning_rate": 4.973124304092522e-05, + "loss": 5.2746, + "step": 7856 + }, + { + "epoch": 0.04672780473879532, + "grad_norm": 1.7990871667861938, + "learning_rate": 4.97311747299541e-05, + "loss": 5.4241, + "step": 7857 + }, + { + "epoch": 0.046733752022076316, + "grad_norm": 2.124717950820923, + "learning_rate": 4.973110641034958e-05, + "loss": 5.5133, + "step": 7858 + }, + { + "epoch": 0.04673969930535731, + "grad_norm": 2.066869020462036, + "learning_rate": 4.973103808211169e-05, + "loss": 5.252, + "step": 7859 + }, + { + "epoch": 0.04674564658863831, + "grad_norm": 1.8004878759384155, + "learning_rate": 4.9730969745240455e-05, + "loss": 5.483, + "step": 7860 + }, + { + "epoch": 0.04675159387191931, + "grad_norm": 1.6822713613510132, + "learning_rate": 4.9730901399735886e-05, + "loss": 5.3916, + "step": 7861 + }, + { + "epoch": 0.046757541155200304, + "grad_norm": 1.7024493217468262, + "learning_rate": 4.973083304559802e-05, + "loss": 5.3504, + "step": 7862 + }, + { + "epoch": 0.0467634884384813, + "grad_norm": 1.5939997434616089, + "learning_rate": 4.973076468282687e-05, + "loss": 5.4151, + "step": 7863 + }, + { + "epoch": 0.0467694357217623, + "grad_norm": 1.7603535652160645, + "learning_rate": 4.9730696311422475e-05, + "loss": 5.351, + "step": 7864 + }, + { + "epoch": 0.046775383005043296, + "grad_norm": 1.737897276878357, + "learning_rate": 4.973062793138484e-05, + "loss": 5.0834, + "step": 7865 + }, + { + "epoch": 0.04678133028832429, + "grad_norm": 2.4130520820617676, + "learning_rate": 4.973055954271401e-05, + "loss": 4.833, + "step": 7866 + }, + { + "epoch": 0.04678727757160529, + "grad_norm": 1.9712201356887817, + "learning_rate": 4.9730491145409987e-05, + "loss": 5.0048, + "step": 7867 + }, + { + "epoch": 0.04679322485488629, + "grad_norm": 1.808608055114746, + "learning_rate": 4.97304227394728e-05, + "loss": 5.3134, + "step": 7868 + }, + { + "epoch": 0.04679917213816728, + "grad_norm": 1.8121775388717651, + "learning_rate": 4.973035432490249e-05, + "loss": 5.2594, + "step": 7869 + }, + { + "epoch": 0.046805119421448285, + "grad_norm": 1.7191296815872192, + "learning_rate": 4.9730285901699064e-05, + "loss": 5.206, + "step": 7870 + }, + { + "epoch": 0.04681106670472928, + "grad_norm": 1.931894063949585, + "learning_rate": 4.973021746986255e-05, + "loss": 5.3349, + "step": 7871 + }, + { + "epoch": 0.046817013988010275, + "grad_norm": 2.5420172214508057, + "learning_rate": 4.973014902939297e-05, + "loss": 5.2894, + "step": 7872 + }, + { + "epoch": 0.04682296127129128, + "grad_norm": 2.5522336959838867, + "learning_rate": 4.973008058029036e-05, + "loss": 5.2144, + "step": 7873 + }, + { + "epoch": 0.04682890855457227, + "grad_norm": 3.1389801502227783, + "learning_rate": 4.973001212255472e-05, + "loss": 5.7229, + "step": 7874 + }, + { + "epoch": 0.04683485583785327, + "grad_norm": 1.8687554597854614, + "learning_rate": 4.97299436561861e-05, + "loss": 5.483, + "step": 7875 + }, + { + "epoch": 0.04684080312113426, + "grad_norm": 2.2526602745056152, + "learning_rate": 4.972987518118451e-05, + "loss": 5.4562, + "step": 7876 + }, + { + "epoch": 0.046846750404415265, + "grad_norm": 2.108677625656128, + "learning_rate": 4.972980669754997e-05, + "loss": 5.2005, + "step": 7877 + }, + { + "epoch": 0.04685269768769626, + "grad_norm": 2.023118019104004, + "learning_rate": 4.972973820528252e-05, + "loss": 5.3674, + "step": 7878 + }, + { + "epoch": 0.046858644970977255, + "grad_norm": 1.6553964614868164, + "learning_rate": 4.9729669704382165e-05, + "loss": 5.3256, + "step": 7879 + }, + { + "epoch": 0.04686459225425826, + "grad_norm": 1.8197314739227295, + "learning_rate": 4.972960119484894e-05, + "loss": 5.1738, + "step": 7880 + }, + { + "epoch": 0.04687053953753925, + "grad_norm": 1.6142289638519287, + "learning_rate": 4.972953267668287e-05, + "loss": 5.245, + "step": 7881 + }, + { + "epoch": 0.04687648682082025, + "grad_norm": 1.4962797164916992, + "learning_rate": 4.972946414988398e-05, + "loss": 5.3121, + "step": 7882 + }, + { + "epoch": 0.04688243410410125, + "grad_norm": 1.487801432609558, + "learning_rate": 4.972939561445228e-05, + "loss": 5.1828, + "step": 7883 + }, + { + "epoch": 0.046888381387382244, + "grad_norm": 1.9139772653579712, + "learning_rate": 4.972932707038781e-05, + "loss": 5.2432, + "step": 7884 + }, + { + "epoch": 0.04689432867066324, + "grad_norm": 1.7533615827560425, + "learning_rate": 4.972925851769058e-05, + "loss": 5.6451, + "step": 7885 + }, + { + "epoch": 0.04690027595394424, + "grad_norm": 1.8561608791351318, + "learning_rate": 4.972918995636062e-05, + "loss": 5.4293, + "step": 7886 + }, + { + "epoch": 0.046906223237225236, + "grad_norm": 1.6891844272613525, + "learning_rate": 4.972912138639797e-05, + "loss": 5.2736, + "step": 7887 + }, + { + "epoch": 0.04691217052050623, + "grad_norm": 1.9279890060424805, + "learning_rate": 4.972905280780262e-05, + "loss": 5.5733, + "step": 7888 + }, + { + "epoch": 0.04691811780378723, + "grad_norm": 1.7810181379318237, + "learning_rate": 4.9728984220574624e-05, + "loss": 5.2036, + "step": 7889 + }, + { + "epoch": 0.04692406508706823, + "grad_norm": 1.6455233097076416, + "learning_rate": 4.9728915624714004e-05, + "loss": 5.3493, + "step": 7890 + }, + { + "epoch": 0.046930012370349224, + "grad_norm": 1.5345048904418945, + "learning_rate": 4.9728847020220756e-05, + "loss": 5.2528, + "step": 7891 + }, + { + "epoch": 0.04693595965363022, + "grad_norm": 1.455165982246399, + "learning_rate": 4.9728778407094935e-05, + "loss": 5.2769, + "step": 7892 + }, + { + "epoch": 0.04694190693691122, + "grad_norm": 1.577910304069519, + "learning_rate": 4.972870978533655e-05, + "loss": 5.2182, + "step": 7893 + }, + { + "epoch": 0.046947854220192216, + "grad_norm": 1.728143334388733, + "learning_rate": 4.972864115494563e-05, + "loss": 5.3446, + "step": 7894 + }, + { + "epoch": 0.04695380150347321, + "grad_norm": 1.6157398223876953, + "learning_rate": 4.972857251592219e-05, + "loss": 5.4866, + "step": 7895 + }, + { + "epoch": 0.04695974878675421, + "grad_norm": 1.5386699438095093, + "learning_rate": 4.9728503868266266e-05, + "loss": 5.4626, + "step": 7896 + }, + { + "epoch": 0.04696569607003521, + "grad_norm": 1.874915599822998, + "learning_rate": 4.972843521197788e-05, + "loss": 5.4152, + "step": 7897 + }, + { + "epoch": 0.0469716433533162, + "grad_norm": 1.7093253135681152, + "learning_rate": 4.9728366547057046e-05, + "loss": 5.2852, + "step": 7898 + }, + { + "epoch": 0.046977590636597205, + "grad_norm": 1.6435173749923706, + "learning_rate": 4.9728297873503806e-05, + "loss": 5.3985, + "step": 7899 + }, + { + "epoch": 0.0469835379198782, + "grad_norm": 1.5776588916778564, + "learning_rate": 4.972822919131816e-05, + "loss": 5.2914, + "step": 7900 + }, + { + "epoch": 0.046989485203159195, + "grad_norm": 2.051072835922241, + "learning_rate": 4.972816050050015e-05, + "loss": 5.343, + "step": 7901 + }, + { + "epoch": 0.0469954324864402, + "grad_norm": 2.003816604614258, + "learning_rate": 4.972809180104979e-05, + "loss": 5.3577, + "step": 7902 + }, + { + "epoch": 0.04700137976972119, + "grad_norm": 1.9092657566070557, + "learning_rate": 4.9728023092967116e-05, + "loss": 5.551, + "step": 7903 + }, + { + "epoch": 0.04700732705300219, + "grad_norm": 1.763007640838623, + "learning_rate": 4.972795437625214e-05, + "loss": 5.5611, + "step": 7904 + }, + { + "epoch": 0.04701327433628318, + "grad_norm": 2.637850046157837, + "learning_rate": 4.9727885650904895e-05, + "loss": 5.937, + "step": 7905 + }, + { + "epoch": 0.047019221619564185, + "grad_norm": 1.6650307178497314, + "learning_rate": 4.9727816916925395e-05, + "loss": 5.6418, + "step": 7906 + }, + { + "epoch": 0.04702516890284518, + "grad_norm": 1.6943029165267944, + "learning_rate": 4.972774817431367e-05, + "loss": 5.4826, + "step": 7907 + }, + { + "epoch": 0.047031116186126175, + "grad_norm": 1.4689685106277466, + "learning_rate": 4.972767942306975e-05, + "loss": 5.4849, + "step": 7908 + }, + { + "epoch": 0.04703706346940718, + "grad_norm": 1.759244441986084, + "learning_rate": 4.9727610663193644e-05, + "loss": 5.3496, + "step": 7909 + }, + { + "epoch": 0.04704301075268817, + "grad_norm": 1.8706889152526855, + "learning_rate": 4.9727541894685395e-05, + "loss": 5.2836, + "step": 7910 + }, + { + "epoch": 0.04704895803596917, + "grad_norm": 1.486164927482605, + "learning_rate": 4.972747311754501e-05, + "loss": 5.4125, + "step": 7911 + }, + { + "epoch": 0.04705490531925017, + "grad_norm": 1.6479889154434204, + "learning_rate": 4.972740433177252e-05, + "loss": 5.1986, + "step": 7912 + }, + { + "epoch": 0.047060852602531164, + "grad_norm": 1.5741796493530273, + "learning_rate": 4.9727335537367944e-05, + "loss": 5.4761, + "step": 7913 + }, + { + "epoch": 0.04706679988581216, + "grad_norm": 1.5001682043075562, + "learning_rate": 4.972726673433131e-05, + "loss": 5.6267, + "step": 7914 + }, + { + "epoch": 0.04707274716909316, + "grad_norm": 1.774282455444336, + "learning_rate": 4.972719792266265e-05, + "loss": 5.5944, + "step": 7915 + }, + { + "epoch": 0.047078694452374156, + "grad_norm": 1.6656653881072998, + "learning_rate": 4.972712910236198e-05, + "loss": 5.4159, + "step": 7916 + }, + { + "epoch": 0.04708464173565515, + "grad_norm": 1.7174065113067627, + "learning_rate": 4.972706027342933e-05, + "loss": 5.4239, + "step": 7917 + }, + { + "epoch": 0.04709058901893615, + "grad_norm": 1.607878565788269, + "learning_rate": 4.9726991435864705e-05, + "loss": 5.4517, + "step": 7918 + }, + { + "epoch": 0.04709653630221715, + "grad_norm": 1.9639167785644531, + "learning_rate": 4.972692258966815e-05, + "loss": 5.5371, + "step": 7919 + }, + { + "epoch": 0.047102483585498144, + "grad_norm": 1.5418875217437744, + "learning_rate": 4.9726853734839684e-05, + "loss": 5.4798, + "step": 7920 + }, + { + "epoch": 0.04710843086877914, + "grad_norm": 1.54796302318573, + "learning_rate": 4.9726784871379326e-05, + "loss": 5.5329, + "step": 7921 + }, + { + "epoch": 0.04711437815206014, + "grad_norm": 1.8075921535491943, + "learning_rate": 4.97267159992871e-05, + "loss": 5.6049, + "step": 7922 + }, + { + "epoch": 0.047120325435341136, + "grad_norm": 1.4973857402801514, + "learning_rate": 4.972664711856304e-05, + "loss": 5.27, + "step": 7923 + }, + { + "epoch": 0.04712627271862213, + "grad_norm": 2.1028542518615723, + "learning_rate": 4.9726578229207155e-05, + "loss": 5.3626, + "step": 7924 + }, + { + "epoch": 0.04713222000190313, + "grad_norm": 2.2057480812072754, + "learning_rate": 4.9726509331219485e-05, + "loss": 5.1767, + "step": 7925 + }, + { + "epoch": 0.04713816728518413, + "grad_norm": 2.0549347400665283, + "learning_rate": 4.972644042460004e-05, + "loss": 5.3362, + "step": 7926 + }, + { + "epoch": 0.04714411456846512, + "grad_norm": 2.0960693359375, + "learning_rate": 4.972637150934885e-05, + "loss": 5.5162, + "step": 7927 + }, + { + "epoch": 0.047150061851746125, + "grad_norm": 2.2022509574890137, + "learning_rate": 4.9726302585465945e-05, + "loss": 5.3263, + "step": 7928 + }, + { + "epoch": 0.04715600913502712, + "grad_norm": 1.7065988779067993, + "learning_rate": 4.9726233652951335e-05, + "loss": 5.4349, + "step": 7929 + }, + { + "epoch": 0.047161956418308115, + "grad_norm": 1.742591142654419, + "learning_rate": 4.972616471180506e-05, + "loss": 5.2396, + "step": 7930 + }, + { + "epoch": 0.04716790370158912, + "grad_norm": 1.888846755027771, + "learning_rate": 4.972609576202713e-05, + "loss": 5.3453, + "step": 7931 + }, + { + "epoch": 0.04717385098487011, + "grad_norm": 1.6499360799789429, + "learning_rate": 4.972602680361758e-05, + "loss": 5.2819, + "step": 7932 + }, + { + "epoch": 0.04717979826815111, + "grad_norm": 1.8801236152648926, + "learning_rate": 4.9725957836576434e-05, + "loss": 5.2456, + "step": 7933 + }, + { + "epoch": 0.0471857455514321, + "grad_norm": 2.050522565841675, + "learning_rate": 4.97258888609037e-05, + "loss": 5.2069, + "step": 7934 + }, + { + "epoch": 0.047191692834713105, + "grad_norm": 2.0722391605377197, + "learning_rate": 4.972581987659942e-05, + "loss": 5.5057, + "step": 7935 + }, + { + "epoch": 0.0471976401179941, + "grad_norm": 2.728468179702759, + "learning_rate": 4.972575088366361e-05, + "loss": 5.5485, + "step": 7936 + }, + { + "epoch": 0.047203587401275095, + "grad_norm": 2.0293211936950684, + "learning_rate": 4.9725681882096295e-05, + "loss": 5.7126, + "step": 7937 + }, + { + "epoch": 0.0472095346845561, + "grad_norm": 2.1351194381713867, + "learning_rate": 4.97256128718975e-05, + "loss": 5.7313, + "step": 7938 + }, + { + "epoch": 0.04721548196783709, + "grad_norm": 1.9040015935897827, + "learning_rate": 4.972554385306726e-05, + "loss": 5.696, + "step": 7939 + }, + { + "epoch": 0.04722142925111809, + "grad_norm": 1.640110731124878, + "learning_rate": 4.9725474825605574e-05, + "loss": 5.2626, + "step": 7940 + }, + { + "epoch": 0.04722737653439909, + "grad_norm": 1.887408971786499, + "learning_rate": 4.972540578951249e-05, + "loss": 5.2734, + "step": 7941 + }, + { + "epoch": 0.047233323817680084, + "grad_norm": 1.8867583274841309, + "learning_rate": 4.972533674478801e-05, + "loss": 5.6811, + "step": 7942 + }, + { + "epoch": 0.04723927110096108, + "grad_norm": 1.811104655265808, + "learning_rate": 4.9725267691432174e-05, + "loss": 5.575, + "step": 7943 + }, + { + "epoch": 0.04724521838424208, + "grad_norm": 1.8644812107086182, + "learning_rate": 4.9725198629445014e-05, + "loss": 5.5718, + "step": 7944 + }, + { + "epoch": 0.047251165667523076, + "grad_norm": 1.693788766860962, + "learning_rate": 4.972512955882653e-05, + "loss": 5.5924, + "step": 7945 + }, + { + "epoch": 0.04725711295080407, + "grad_norm": 1.8305641412734985, + "learning_rate": 4.9725060479576766e-05, + "loss": 5.6529, + "step": 7946 + }, + { + "epoch": 0.04726306023408507, + "grad_norm": 1.7662039995193481, + "learning_rate": 4.9724991391695734e-05, + "loss": 5.6709, + "step": 7947 + }, + { + "epoch": 0.04726900751736607, + "grad_norm": 2.1799724102020264, + "learning_rate": 4.972492229518347e-05, + "loss": 5.6266, + "step": 7948 + }, + { + "epoch": 0.047274954800647064, + "grad_norm": 1.9300130605697632, + "learning_rate": 4.972485319003998e-05, + "loss": 5.6494, + "step": 7949 + }, + { + "epoch": 0.04728090208392806, + "grad_norm": 1.9196375608444214, + "learning_rate": 4.9724784076265307e-05, + "loss": 5.571, + "step": 7950 + }, + { + "epoch": 0.04728684936720906, + "grad_norm": 1.906616449356079, + "learning_rate": 4.972471495385947e-05, + "loss": 5.6537, + "step": 7951 + }, + { + "epoch": 0.047292796650490056, + "grad_norm": 1.826536774635315, + "learning_rate": 4.972464582282249e-05, + "loss": 5.6251, + "step": 7952 + }, + { + "epoch": 0.04729874393377105, + "grad_norm": 1.7790716886520386, + "learning_rate": 4.972457668315438e-05, + "loss": 5.3488, + "step": 7953 + }, + { + "epoch": 0.04730469121705205, + "grad_norm": 1.8892159461975098, + "learning_rate": 4.972450753485519e-05, + "loss": 5.4794, + "step": 7954 + }, + { + "epoch": 0.04731063850033305, + "grad_norm": 1.9409239292144775, + "learning_rate": 4.972443837792492e-05, + "loss": 5.6058, + "step": 7955 + }, + { + "epoch": 0.04731658578361404, + "grad_norm": 1.9935575723648071, + "learning_rate": 4.972436921236361e-05, + "loss": 5.6481, + "step": 7956 + }, + { + "epoch": 0.047322533066895045, + "grad_norm": 1.8507076501846313, + "learning_rate": 4.9724300038171276e-05, + "loss": 5.4723, + "step": 7957 + }, + { + "epoch": 0.04732848035017604, + "grad_norm": 1.9355841875076294, + "learning_rate": 4.972423085534794e-05, + "loss": 5.3843, + "step": 7958 + }, + { + "epoch": 0.047334427633457035, + "grad_norm": 1.9815531969070435, + "learning_rate": 4.972416166389363e-05, + "loss": 5.5635, + "step": 7959 + }, + { + "epoch": 0.04734037491673804, + "grad_norm": 1.7955007553100586, + "learning_rate": 4.972409246380838e-05, + "loss": 5.6002, + "step": 7960 + }, + { + "epoch": 0.04734632220001903, + "grad_norm": 2.0184547901153564, + "learning_rate": 4.97240232550922e-05, + "loss": 5.5458, + "step": 7961 + }, + { + "epoch": 0.04735226948330003, + "grad_norm": 1.7418156862258911, + "learning_rate": 4.972395403774512e-05, + "loss": 5.6443, + "step": 7962 + }, + { + "epoch": 0.04735821676658102, + "grad_norm": 1.9832762479782104, + "learning_rate": 4.972388481176716e-05, + "loss": 5.3799, + "step": 7963 + }, + { + "epoch": 0.047364164049862024, + "grad_norm": 1.8777718544006348, + "learning_rate": 4.972381557715835e-05, + "loss": 5.4349, + "step": 7964 + }, + { + "epoch": 0.04737011133314302, + "grad_norm": 1.519038438796997, + "learning_rate": 4.972374633391871e-05, + "loss": 5.2418, + "step": 7965 + }, + { + "epoch": 0.047376058616424015, + "grad_norm": 1.6425752639770508, + "learning_rate": 4.972367708204826e-05, + "loss": 5.1648, + "step": 7966 + }, + { + "epoch": 0.04738200589970502, + "grad_norm": 1.7461836338043213, + "learning_rate": 4.972360782154704e-05, + "loss": 5.1745, + "step": 7967 + }, + { + "epoch": 0.04738795318298601, + "grad_norm": 1.7991663217544556, + "learning_rate": 4.9723538552415064e-05, + "loss": 5.2268, + "step": 7968 + }, + { + "epoch": 0.04739390046626701, + "grad_norm": 1.9127873182296753, + "learning_rate": 4.9723469274652345e-05, + "loss": 5.5205, + "step": 7969 + }, + { + "epoch": 0.04739984774954801, + "grad_norm": 1.8836725950241089, + "learning_rate": 4.972339998825893e-05, + "loss": 5.3803, + "step": 7970 + }, + { + "epoch": 0.047405795032829004, + "grad_norm": 1.8391705751419067, + "learning_rate": 4.9723330693234825e-05, + "loss": 5.3084, + "step": 7971 + }, + { + "epoch": 0.04741174231611, + "grad_norm": 1.6707972288131714, + "learning_rate": 4.9723261389580063e-05, + "loss": 5.3275, + "step": 7972 + }, + { + "epoch": 0.047417689599391, + "grad_norm": 1.8807258605957031, + "learning_rate": 4.972319207729467e-05, + "loss": 5.0766, + "step": 7973 + }, + { + "epoch": 0.047423636882671996, + "grad_norm": 1.8980032205581665, + "learning_rate": 4.9723122756378655e-05, + "loss": 5.185, + "step": 7974 + }, + { + "epoch": 0.04742958416595299, + "grad_norm": 1.9011166095733643, + "learning_rate": 4.9723053426832055e-05, + "loss": 5.2494, + "step": 7975 + }, + { + "epoch": 0.04743553144923399, + "grad_norm": 1.6457782983779907, + "learning_rate": 4.97229840886549e-05, + "loss": 5.4205, + "step": 7976 + }, + { + "epoch": 0.04744147873251499, + "grad_norm": 1.558515191078186, + "learning_rate": 4.9722914741847206e-05, + "loss": 5.2111, + "step": 7977 + }, + { + "epoch": 0.04744742601579598, + "grad_norm": 1.4780910015106201, + "learning_rate": 4.9722845386409e-05, + "loss": 5.3365, + "step": 7978 + }, + { + "epoch": 0.04745337329907698, + "grad_norm": 1.529249668121338, + "learning_rate": 4.9722776022340296e-05, + "loss": 5.1323, + "step": 7979 + }, + { + "epoch": 0.04745932058235798, + "grad_norm": 1.66848886013031, + "learning_rate": 4.972270664964113e-05, + "loss": 5.2057, + "step": 7980 + }, + { + "epoch": 0.047465267865638976, + "grad_norm": 1.5645034313201904, + "learning_rate": 4.972263726831152e-05, + "loss": 5.1537, + "step": 7981 + }, + { + "epoch": 0.04747121514891997, + "grad_norm": 1.8793894052505493, + "learning_rate": 4.9722567878351496e-05, + "loss": 5.4403, + "step": 7982 + }, + { + "epoch": 0.04747716243220097, + "grad_norm": 1.7316640615463257, + "learning_rate": 4.972249847976108e-05, + "loss": 5.3642, + "step": 7983 + }, + { + "epoch": 0.04748310971548197, + "grad_norm": 1.7195171117782593, + "learning_rate": 4.972242907254029e-05, + "loss": 5.2603, + "step": 7984 + }, + { + "epoch": 0.04748905699876296, + "grad_norm": 1.6860026121139526, + "learning_rate": 4.972235965668916e-05, + "loss": 5.356, + "step": 7985 + }, + { + "epoch": 0.047495004282043965, + "grad_norm": 1.5396910905838013, + "learning_rate": 4.972229023220771e-05, + "loss": 5.2566, + "step": 7986 + }, + { + "epoch": 0.04750095156532496, + "grad_norm": 1.694547176361084, + "learning_rate": 4.9722220799095956e-05, + "loss": 5.0897, + "step": 7987 + }, + { + "epoch": 0.047506898848605955, + "grad_norm": 1.7608548402786255, + "learning_rate": 4.972215135735394e-05, + "loss": 5.4084, + "step": 7988 + }, + { + "epoch": 0.04751284613188696, + "grad_norm": 1.697198748588562, + "learning_rate": 4.9722081906981675e-05, + "loss": 5.4133, + "step": 7989 + }, + { + "epoch": 0.04751879341516795, + "grad_norm": 1.6107436418533325, + "learning_rate": 4.972201244797918e-05, + "loss": 5.2839, + "step": 7990 + }, + { + "epoch": 0.04752474069844895, + "grad_norm": 1.8178008794784546, + "learning_rate": 4.972194298034649e-05, + "loss": 5.3722, + "step": 7991 + }, + { + "epoch": 0.04753068798172994, + "grad_norm": 1.6542725563049316, + "learning_rate": 4.972187350408363e-05, + "loss": 5.3434, + "step": 7992 + }, + { + "epoch": 0.047536635265010944, + "grad_norm": 1.8194152116775513, + "learning_rate": 4.972180401919061e-05, + "loss": 5.3763, + "step": 7993 + }, + { + "epoch": 0.04754258254829194, + "grad_norm": 1.890317678451538, + "learning_rate": 4.9721734525667476e-05, + "loss": 5.529, + "step": 7994 + }, + { + "epoch": 0.047548529831572935, + "grad_norm": 1.813226342201233, + "learning_rate": 4.972166502351423e-05, + "loss": 5.0826, + "step": 7995 + }, + { + "epoch": 0.04755447711485394, + "grad_norm": 1.7679328918457031, + "learning_rate": 4.9721595512730905e-05, + "loss": 5.3589, + "step": 7996 + }, + { + "epoch": 0.04756042439813493, + "grad_norm": 1.8390278816223145, + "learning_rate": 4.972152599331753e-05, + "loss": 5.1568, + "step": 7997 + }, + { + "epoch": 0.04756637168141593, + "grad_norm": 2.9323909282684326, + "learning_rate": 4.972145646527413e-05, + "loss": 5.6457, + "step": 7998 + }, + { + "epoch": 0.04757231896469693, + "grad_norm": 1.8839350938796997, + "learning_rate": 4.972138692860072e-05, + "loss": 5.1204, + "step": 7999 + }, + { + "epoch": 0.047578266247977924, + "grad_norm": 1.9047685861587524, + "learning_rate": 4.972131738329733e-05, + "loss": 5.2741, + "step": 8000 + }, + { + "epoch": 0.04758421353125892, + "grad_norm": 2.39807391166687, + "learning_rate": 4.972124782936398e-05, + "loss": 5.0134, + "step": 8001 + }, + { + "epoch": 0.04759016081453992, + "grad_norm": 2.197404146194458, + "learning_rate": 4.972117826680071e-05, + "loss": 5.3012, + "step": 8002 + }, + { + "epoch": 0.047596108097820916, + "grad_norm": 2.2648651599884033, + "learning_rate": 4.9721108695607515e-05, + "loss": 5.7196, + "step": 8003 + }, + { + "epoch": 0.04760205538110191, + "grad_norm": 1.7686847448349, + "learning_rate": 4.972103911578444e-05, + "loss": 5.4261, + "step": 8004 + }, + { + "epoch": 0.04760800266438291, + "grad_norm": 1.726653814315796, + "learning_rate": 4.972096952733152e-05, + "loss": 5.33, + "step": 8005 + }, + { + "epoch": 0.04761394994766391, + "grad_norm": 1.6855807304382324, + "learning_rate": 4.972089993024875e-05, + "loss": 5.2382, + "step": 8006 + }, + { + "epoch": 0.0476198972309449, + "grad_norm": 1.644954800605774, + "learning_rate": 4.972083032453617e-05, + "loss": 5.3309, + "step": 8007 + }, + { + "epoch": 0.0476258445142259, + "grad_norm": 1.8630400896072388, + "learning_rate": 4.9720760710193816e-05, + "loss": 5.282, + "step": 8008 + }, + { + "epoch": 0.0476317917975069, + "grad_norm": 1.862716555595398, + "learning_rate": 4.972069108722168e-05, + "loss": 5.3307, + "step": 8009 + }, + { + "epoch": 0.047637739080787896, + "grad_norm": 1.8025259971618652, + "learning_rate": 4.972062145561982e-05, + "loss": 5.2236, + "step": 8010 + }, + { + "epoch": 0.04764368636406889, + "grad_norm": 1.7213356494903564, + "learning_rate": 4.972055181538825e-05, + "loss": 5.0635, + "step": 8011 + }, + { + "epoch": 0.04764963364734989, + "grad_norm": 1.5237104892730713, + "learning_rate": 4.9720482166526986e-05, + "loss": 5.3089, + "step": 8012 + }, + { + "epoch": 0.04765558093063089, + "grad_norm": 1.628957748413086, + "learning_rate": 4.972041250903605e-05, + "loss": 5.2299, + "step": 8013 + }, + { + "epoch": 0.04766152821391188, + "grad_norm": 1.9217725992202759, + "learning_rate": 4.972034284291548e-05, + "loss": 5.2504, + "step": 8014 + }, + { + "epoch": 0.047667475497192885, + "grad_norm": 2.114549160003662, + "learning_rate": 4.97202731681653e-05, + "loss": 5.219, + "step": 8015 + }, + { + "epoch": 0.04767342278047388, + "grad_norm": 1.9268896579742432, + "learning_rate": 4.9720203484785525e-05, + "loss": 5.145, + "step": 8016 + }, + { + "epoch": 0.047679370063754875, + "grad_norm": 2.04050874710083, + "learning_rate": 4.9720133792776166e-05, + "loss": 5.354, + "step": 8017 + }, + { + "epoch": 0.04768531734703588, + "grad_norm": 1.8002599477767944, + "learning_rate": 4.972006409213728e-05, + "loss": 5.0547, + "step": 8018 + }, + { + "epoch": 0.04769126463031687, + "grad_norm": 1.9655365943908691, + "learning_rate": 4.9719994382868876e-05, + "loss": 5.2188, + "step": 8019 + }, + { + "epoch": 0.04769721191359787, + "grad_norm": 1.7188535928726196, + "learning_rate": 4.971992466497097e-05, + "loss": 5.1792, + "step": 8020 + }, + { + "epoch": 0.04770315919687886, + "grad_norm": 1.582184910774231, + "learning_rate": 4.97198549384436e-05, + "loss": 5.2295, + "step": 8021 + }, + { + "epoch": 0.047709106480159864, + "grad_norm": 1.4490164518356323, + "learning_rate": 4.971978520328677e-05, + "loss": 5.1677, + "step": 8022 + }, + { + "epoch": 0.04771505376344086, + "grad_norm": 1.472896695137024, + "learning_rate": 4.971971545950054e-05, + "loss": 4.9954, + "step": 8023 + }, + { + "epoch": 0.047721001046721855, + "grad_norm": 1.5845187902450562, + "learning_rate": 4.97196457070849e-05, + "loss": 5.1273, + "step": 8024 + }, + { + "epoch": 0.04772694833000286, + "grad_norm": 1.6418551206588745, + "learning_rate": 4.9719575946039887e-05, + "loss": 5.0835, + "step": 8025 + }, + { + "epoch": 0.04773289561328385, + "grad_norm": 1.379805088043213, + "learning_rate": 4.971950617636553e-05, + "loss": 5.1058, + "step": 8026 + }, + { + "epoch": 0.04773884289656485, + "grad_norm": 1.7939400672912598, + "learning_rate": 4.9719436398061835e-05, + "loss": 5.0105, + "step": 8027 + }, + { + "epoch": 0.04774479017984585, + "grad_norm": 1.5610185861587524, + "learning_rate": 4.971936661112886e-05, + "loss": 5.032, + "step": 8028 + }, + { + "epoch": 0.047750737463126844, + "grad_norm": 1.524402379989624, + "learning_rate": 4.9719296815566594e-05, + "loss": 5.1376, + "step": 8029 + }, + { + "epoch": 0.04775668474640784, + "grad_norm": 1.7448087930679321, + "learning_rate": 4.971922701137509e-05, + "loss": 4.9496, + "step": 8030 + }, + { + "epoch": 0.04776263202968884, + "grad_norm": 1.7382763624191284, + "learning_rate": 4.971915719855435e-05, + "loss": 4.9755, + "step": 8031 + }, + { + "epoch": 0.047768579312969836, + "grad_norm": 1.6728250980377197, + "learning_rate": 4.971908737710441e-05, + "loss": 5.1436, + "step": 8032 + }, + { + "epoch": 0.04777452659625083, + "grad_norm": 1.4256306886672974, + "learning_rate": 4.971901754702529e-05, + "loss": 4.9739, + "step": 8033 + }, + { + "epoch": 0.04778047387953183, + "grad_norm": 1.660714864730835, + "learning_rate": 4.971894770831702e-05, + "loss": 5.1337, + "step": 8034 + }, + { + "epoch": 0.04778642116281283, + "grad_norm": 1.5240182876586914, + "learning_rate": 4.9718877860979615e-05, + "loss": 5.1143, + "step": 8035 + }, + { + "epoch": 0.04779236844609382, + "grad_norm": 1.478852391242981, + "learning_rate": 4.971880800501311e-05, + "loss": 4.968, + "step": 8036 + }, + { + "epoch": 0.04779831572937482, + "grad_norm": 1.5343812704086304, + "learning_rate": 4.971873814041752e-05, + "loss": 4.9393, + "step": 8037 + }, + { + "epoch": 0.04780426301265582, + "grad_norm": 1.6728276014328003, + "learning_rate": 4.971866826719288e-05, + "loss": 5.0535, + "step": 8038 + }, + { + "epoch": 0.047810210295936816, + "grad_norm": 1.4831758737564087, + "learning_rate": 4.971859838533921e-05, + "loss": 5.0705, + "step": 8039 + }, + { + "epoch": 0.04781615757921781, + "grad_norm": 1.7412161827087402, + "learning_rate": 4.971852849485653e-05, + "loss": 4.9338, + "step": 8040 + }, + { + "epoch": 0.04782210486249881, + "grad_norm": 1.4696041345596313, + "learning_rate": 4.971845859574487e-05, + "loss": 5.0643, + "step": 8041 + }, + { + "epoch": 0.04782805214577981, + "grad_norm": 1.4190481901168823, + "learning_rate": 4.9718388688004235e-05, + "loss": 5.0743, + "step": 8042 + }, + { + "epoch": 0.0478339994290608, + "grad_norm": 1.513454556465149, + "learning_rate": 4.9718318771634686e-05, + "loss": 4.8832, + "step": 8043 + }, + { + "epoch": 0.047839946712341805, + "grad_norm": 1.7310774326324463, + "learning_rate": 4.9718248846636216e-05, + "loss": 4.957, + "step": 8044 + }, + { + "epoch": 0.0478458939956228, + "grad_norm": 1.4895838499069214, + "learning_rate": 4.971817891300886e-05, + "loss": 4.9121, + "step": 8045 + }, + { + "epoch": 0.047851841278903795, + "grad_norm": 1.6848632097244263, + "learning_rate": 4.9718108970752656e-05, + "loss": 5.1337, + "step": 8046 + }, + { + "epoch": 0.0478577885621848, + "grad_norm": 1.7145766019821167, + "learning_rate": 4.97180390198676e-05, + "loss": 5.1827, + "step": 8047 + }, + { + "epoch": 0.04786373584546579, + "grad_norm": 1.668140172958374, + "learning_rate": 4.971796906035374e-05, + "loss": 5.4071, + "step": 8048 + }, + { + "epoch": 0.04786968312874679, + "grad_norm": 1.6927748918533325, + "learning_rate": 4.9717899092211094e-05, + "loss": 5.4319, + "step": 8049 + }, + { + "epoch": 0.04787563041202778, + "grad_norm": 1.6696170568466187, + "learning_rate": 4.971782911543968e-05, + "loss": 5.4137, + "step": 8050 + }, + { + "epoch": 0.047881577695308784, + "grad_norm": 1.9299427270889282, + "learning_rate": 4.971775913003953e-05, + "loss": 5.6676, + "step": 8051 + }, + { + "epoch": 0.04788752497858978, + "grad_norm": 1.7163755893707275, + "learning_rate": 4.971768913601066e-05, + "loss": 5.2916, + "step": 8052 + }, + { + "epoch": 0.047893472261870774, + "grad_norm": 1.7822209596633911, + "learning_rate": 4.971761913335311e-05, + "loss": 5.6364, + "step": 8053 + }, + { + "epoch": 0.047899419545151777, + "grad_norm": 1.725375771522522, + "learning_rate": 4.971754912206689e-05, + "loss": 5.045, + "step": 8054 + }, + { + "epoch": 0.04790536682843277, + "grad_norm": 1.5243995189666748, + "learning_rate": 4.9717479102152027e-05, + "loss": 5.4691, + "step": 8055 + }, + { + "epoch": 0.04791131411171377, + "grad_norm": 1.6673872470855713, + "learning_rate": 4.971740907360854e-05, + "loss": 5.4851, + "step": 8056 + }, + { + "epoch": 0.04791726139499477, + "grad_norm": 1.6378693580627441, + "learning_rate": 4.971733903643647e-05, + "loss": 5.2574, + "step": 8057 + }, + { + "epoch": 0.047923208678275764, + "grad_norm": 1.484250545501709, + "learning_rate": 4.9717268990635835e-05, + "loss": 5.2988, + "step": 8058 + }, + { + "epoch": 0.04792915596155676, + "grad_norm": 1.626955270767212, + "learning_rate": 4.971719893620665e-05, + "loss": 5.3502, + "step": 8059 + }, + { + "epoch": 0.04793510324483776, + "grad_norm": 2.1421375274658203, + "learning_rate": 4.9717128873148954e-05, + "loss": 5.3006, + "step": 8060 + }, + { + "epoch": 0.047941050528118756, + "grad_norm": 1.5175740718841553, + "learning_rate": 4.971705880146276e-05, + "loss": 5.4144, + "step": 8061 + }, + { + "epoch": 0.04794699781139975, + "grad_norm": 1.6170361042022705, + "learning_rate": 4.9716988721148095e-05, + "loss": 5.3635, + "step": 8062 + }, + { + "epoch": 0.04795294509468075, + "grad_norm": 1.7269384860992432, + "learning_rate": 4.971691863220499e-05, + "loss": 5.2813, + "step": 8063 + }, + { + "epoch": 0.04795889237796175, + "grad_norm": 1.5144844055175781, + "learning_rate": 4.971684853463345e-05, + "loss": 5.3242, + "step": 8064 + }, + { + "epoch": 0.04796483966124274, + "grad_norm": 1.7125827074050903, + "learning_rate": 4.971677842843353e-05, + "loss": 5.2968, + "step": 8065 + }, + { + "epoch": 0.04797078694452374, + "grad_norm": 1.6067146062850952, + "learning_rate": 4.9716708313605234e-05, + "loss": 5.4446, + "step": 8066 + }, + { + "epoch": 0.04797673422780474, + "grad_norm": 1.8911150693893433, + "learning_rate": 4.9716638190148585e-05, + "loss": 5.1875, + "step": 8067 + }, + { + "epoch": 0.047982681511085735, + "grad_norm": 1.6865830421447754, + "learning_rate": 4.971656805806362e-05, + "loss": 5.1909, + "step": 8068 + }, + { + "epoch": 0.04798862879436673, + "grad_norm": 2.009566068649292, + "learning_rate": 4.9716497917350345e-05, + "loss": 4.9392, + "step": 8069 + }, + { + "epoch": 0.04799457607764773, + "grad_norm": 1.8578897714614868, + "learning_rate": 4.97164277680088e-05, + "loss": 5.3101, + "step": 8070 + }, + { + "epoch": 0.04800052336092873, + "grad_norm": 1.8935741186141968, + "learning_rate": 4.971635761003901e-05, + "loss": 5.3952, + "step": 8071 + }, + { + "epoch": 0.04800647064420972, + "grad_norm": 2.0030407905578613, + "learning_rate": 4.9716287443440994e-05, + "loss": 5.1685, + "step": 8072 + }, + { + "epoch": 0.048012417927490725, + "grad_norm": 2.0079195499420166, + "learning_rate": 4.9716217268214775e-05, + "loss": 5.4942, + "step": 8073 + }, + { + "epoch": 0.04801836521077172, + "grad_norm": 1.7105878591537476, + "learning_rate": 4.971614708436038e-05, + "loss": 5.4124, + "step": 8074 + }, + { + "epoch": 0.048024312494052715, + "grad_norm": 1.7642161846160889, + "learning_rate": 4.971607689187784e-05, + "loss": 5.3187, + "step": 8075 + }, + { + "epoch": 0.04803025977733372, + "grad_norm": 1.7304610013961792, + "learning_rate": 4.9716006690767165e-05, + "loss": 5.308, + "step": 8076 + }, + { + "epoch": 0.04803620706061471, + "grad_norm": 1.6714746952056885, + "learning_rate": 4.971593648102839e-05, + "loss": 5.4581, + "step": 8077 + }, + { + "epoch": 0.04804215434389571, + "grad_norm": 1.8008997440338135, + "learning_rate": 4.971586626266154e-05, + "loss": 5.3266, + "step": 8078 + }, + { + "epoch": 0.0480481016271767, + "grad_norm": 1.8691446781158447, + "learning_rate": 4.971579603566663e-05, + "loss": 5.2847, + "step": 8079 + }, + { + "epoch": 0.048054048910457704, + "grad_norm": 1.7805777788162231, + "learning_rate": 4.97157258000437e-05, + "loss": 5.446, + "step": 8080 + }, + { + "epoch": 0.0480599961937387, + "grad_norm": 1.4973244667053223, + "learning_rate": 4.971565555579275e-05, + "loss": 5.412, + "step": 8081 + }, + { + "epoch": 0.048065943477019694, + "grad_norm": 1.5994775295257568, + "learning_rate": 4.971558530291384e-05, + "loss": 5.3285, + "step": 8082 + }, + { + "epoch": 0.048071890760300696, + "grad_norm": 1.7743935585021973, + "learning_rate": 4.971551504140696e-05, + "loss": 5.326, + "step": 8083 + }, + { + "epoch": 0.04807783804358169, + "grad_norm": 1.5922112464904785, + "learning_rate": 4.9715444771272154e-05, + "loss": 5.3338, + "step": 8084 + }, + { + "epoch": 0.04808378532686269, + "grad_norm": 1.5587191581726074, + "learning_rate": 4.971537449250944e-05, + "loss": 5.2437, + "step": 8085 + }, + { + "epoch": 0.04808973261014369, + "grad_norm": 1.4972636699676514, + "learning_rate": 4.971530420511884e-05, + "loss": 5.2271, + "step": 8086 + }, + { + "epoch": 0.048095679893424684, + "grad_norm": 1.6221843957901, + "learning_rate": 4.971523390910039e-05, + "loss": 5.3225, + "step": 8087 + }, + { + "epoch": 0.04810162717670568, + "grad_norm": 1.5826990604400635, + "learning_rate": 4.971516360445411e-05, + "loss": 5.2955, + "step": 8088 + }, + { + "epoch": 0.04810757445998668, + "grad_norm": 1.729963779449463, + "learning_rate": 4.971509329118001e-05, + "loss": 5.3263, + "step": 8089 + }, + { + "epoch": 0.048113521743267676, + "grad_norm": 1.680851697921753, + "learning_rate": 4.971502296927813e-05, + "loss": 5.3579, + "step": 8090 + }, + { + "epoch": 0.04811946902654867, + "grad_norm": 2.028024673461914, + "learning_rate": 4.9714952638748504e-05, + "loss": 5.3632, + "step": 8091 + }, + { + "epoch": 0.04812541630982967, + "grad_norm": 1.6236159801483154, + "learning_rate": 4.9714882299591127e-05, + "loss": 5.222, + "step": 8092 + }, + { + "epoch": 0.04813136359311067, + "grad_norm": 1.7522811889648438, + "learning_rate": 4.971481195180605e-05, + "loss": 5.3752, + "step": 8093 + }, + { + "epoch": 0.04813731087639166, + "grad_norm": 1.7108362913131714, + "learning_rate": 4.9714741595393274e-05, + "loss": 5.2994, + "step": 8094 + }, + { + "epoch": 0.04814325815967266, + "grad_norm": 1.7863954305648804, + "learning_rate": 4.971467123035285e-05, + "loss": 5.2386, + "step": 8095 + }, + { + "epoch": 0.04814920544295366, + "grad_norm": 2.0054473876953125, + "learning_rate": 4.971460085668479e-05, + "loss": 5.3565, + "step": 8096 + }, + { + "epoch": 0.048155152726234655, + "grad_norm": 1.6878743171691895, + "learning_rate": 4.971453047438911e-05, + "loss": 5.3448, + "step": 8097 + }, + { + "epoch": 0.04816110000951565, + "grad_norm": 1.8534557819366455, + "learning_rate": 4.971446008346585e-05, + "loss": 5.1446, + "step": 8098 + }, + { + "epoch": 0.04816704729279665, + "grad_norm": 1.8549425601959229, + "learning_rate": 4.9714389683915025e-05, + "loss": 5.2433, + "step": 8099 + }, + { + "epoch": 0.04817299457607765, + "grad_norm": 1.5624927282333374, + "learning_rate": 4.9714319275736666e-05, + "loss": 5.0645, + "step": 8100 + }, + { + "epoch": 0.04817894185935864, + "grad_norm": 1.670462965965271, + "learning_rate": 4.971424885893078e-05, + "loss": 5.1213, + "step": 8101 + }, + { + "epoch": 0.048184889142639645, + "grad_norm": 2.039595603942871, + "learning_rate": 4.9714178433497414e-05, + "loss": 5.1797, + "step": 8102 + }, + { + "epoch": 0.04819083642592064, + "grad_norm": 1.9546380043029785, + "learning_rate": 4.971410799943659e-05, + "loss": 5.2432, + "step": 8103 + }, + { + "epoch": 0.048196783709201635, + "grad_norm": 1.892397403717041, + "learning_rate": 4.971403755674832e-05, + "loss": 5.1775, + "step": 8104 + }, + { + "epoch": 0.04820273099248264, + "grad_norm": 1.7021955251693726, + "learning_rate": 4.971396710543263e-05, + "loss": 5.2242, + "step": 8105 + }, + { + "epoch": 0.04820867827576363, + "grad_norm": 1.7652686834335327, + "learning_rate": 4.9713896645489556e-05, + "loss": 5.1419, + "step": 8106 + }, + { + "epoch": 0.04821462555904463, + "grad_norm": 1.8669620752334595, + "learning_rate": 4.971382617691911e-05, + "loss": 5.1392, + "step": 8107 + }, + { + "epoch": 0.04822057284232562, + "grad_norm": 1.8774491548538208, + "learning_rate": 4.971375569972133e-05, + "loss": 5.1853, + "step": 8108 + }, + { + "epoch": 0.048226520125606624, + "grad_norm": 1.6108628511428833, + "learning_rate": 4.971368521389623e-05, + "loss": 5.4858, + "step": 8109 + }, + { + "epoch": 0.04823246740888762, + "grad_norm": 1.6839191913604736, + "learning_rate": 4.9713614719443835e-05, + "loss": 5.4217, + "step": 8110 + }, + { + "epoch": 0.048238414692168614, + "grad_norm": 1.9300925731658936, + "learning_rate": 4.9713544216364176e-05, + "loss": 5.2259, + "step": 8111 + }, + { + "epoch": 0.048244361975449616, + "grad_norm": 1.9142355918884277, + "learning_rate": 4.971347370465728e-05, + "loss": 5.2, + "step": 8112 + }, + { + "epoch": 0.04825030925873061, + "grad_norm": 1.8046603202819824, + "learning_rate": 4.971340318432315e-05, + "loss": 5.0951, + "step": 8113 + }, + { + "epoch": 0.04825625654201161, + "grad_norm": 1.9129396677017212, + "learning_rate": 4.971333265536184e-05, + "loss": 5.0376, + "step": 8114 + }, + { + "epoch": 0.04826220382529261, + "grad_norm": 1.6774524450302124, + "learning_rate": 4.971326211777335e-05, + "loss": 5.4313, + "step": 8115 + }, + { + "epoch": 0.048268151108573604, + "grad_norm": 1.8156472444534302, + "learning_rate": 4.971319157155773e-05, + "loss": 5.4336, + "step": 8116 + }, + { + "epoch": 0.0482740983918546, + "grad_norm": 1.5704171657562256, + "learning_rate": 4.9713121016714976e-05, + "loss": 5.6878, + "step": 8117 + }, + { + "epoch": 0.0482800456751356, + "grad_norm": 1.585528016090393, + "learning_rate": 4.9713050453245135e-05, + "loss": 5.6208, + "step": 8118 + }, + { + "epoch": 0.048285992958416596, + "grad_norm": 1.3975930213928223, + "learning_rate": 4.9712979881148215e-05, + "loss": 5.8001, + "step": 8119 + }, + { + "epoch": 0.04829194024169759, + "grad_norm": 1.8124761581420898, + "learning_rate": 4.971290930042426e-05, + "loss": 5.6006, + "step": 8120 + }, + { + "epoch": 0.04829788752497859, + "grad_norm": 1.8448232412338257, + "learning_rate": 4.971283871107327e-05, + "loss": 5.4324, + "step": 8121 + }, + { + "epoch": 0.04830383480825959, + "grad_norm": 1.772218108177185, + "learning_rate": 4.97127681130953e-05, + "loss": 6.0943, + "step": 8122 + }, + { + "epoch": 0.04830978209154058, + "grad_norm": 2.038703441619873, + "learning_rate": 4.9712697506490345e-05, + "loss": 5.4224, + "step": 8123 + }, + { + "epoch": 0.04831572937482158, + "grad_norm": 1.576430320739746, + "learning_rate": 4.971262689125845e-05, + "loss": 5.351, + "step": 8124 + }, + { + "epoch": 0.04832167665810258, + "grad_norm": 1.857021450996399, + "learning_rate": 4.971255626739963e-05, + "loss": 5.258, + "step": 8125 + }, + { + "epoch": 0.048327623941383575, + "grad_norm": 1.7989404201507568, + "learning_rate": 4.971248563491391e-05, + "loss": 5.3925, + "step": 8126 + }, + { + "epoch": 0.04833357122466457, + "grad_norm": 1.8104023933410645, + "learning_rate": 4.9712414993801314e-05, + "loss": 5.4326, + "step": 8127 + }, + { + "epoch": 0.04833951850794557, + "grad_norm": 1.898054838180542, + "learning_rate": 4.971234434406188e-05, + "loss": 5.2094, + "step": 8128 + }, + { + "epoch": 0.04834546579122657, + "grad_norm": 1.436633586883545, + "learning_rate": 4.971227368569561e-05, + "loss": 5.2994, + "step": 8129 + }, + { + "epoch": 0.04835141307450756, + "grad_norm": 1.4576120376586914, + "learning_rate": 4.971220301870255e-05, + "loss": 5.3504, + "step": 8130 + }, + { + "epoch": 0.048357360357788565, + "grad_norm": 1.7260229587554932, + "learning_rate": 4.971213234308271e-05, + "loss": 5.1083, + "step": 8131 + }, + { + "epoch": 0.04836330764106956, + "grad_norm": 1.8110415935516357, + "learning_rate": 4.971206165883612e-05, + "loss": 5.1298, + "step": 8132 + }, + { + "epoch": 0.048369254924350555, + "grad_norm": 2.1696786880493164, + "learning_rate": 4.9711990965962804e-05, + "loss": 5.8155, + "step": 8133 + }, + { + "epoch": 0.04837520220763156, + "grad_norm": 1.9905856847763062, + "learning_rate": 4.971192026446279e-05, + "loss": 5.5814, + "step": 8134 + }, + { + "epoch": 0.04838114949091255, + "grad_norm": 1.7459521293640137, + "learning_rate": 4.97118495543361e-05, + "loss": 5.4358, + "step": 8135 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 1.8495198488235474, + "learning_rate": 4.9711778835582756e-05, + "loss": 5.3652, + "step": 8136 + }, + { + "epoch": 0.04839304405747455, + "grad_norm": 1.782850742340088, + "learning_rate": 4.971170810820279e-05, + "loss": 5.2361, + "step": 8137 + }, + { + "epoch": 0.048398991340755544, + "grad_norm": 1.7327016592025757, + "learning_rate": 4.971163737219622e-05, + "loss": 5.0802, + "step": 8138 + }, + { + "epoch": 0.04840493862403654, + "grad_norm": 1.663620114326477, + "learning_rate": 4.9711566627563066e-05, + "loss": 5.1566, + "step": 8139 + }, + { + "epoch": 0.048410885907317534, + "grad_norm": 1.5109026432037354, + "learning_rate": 4.971149587430336e-05, + "loss": 5.1499, + "step": 8140 + }, + { + "epoch": 0.048416833190598536, + "grad_norm": 1.3494226932525635, + "learning_rate": 4.971142511241714e-05, + "loss": 5.1684, + "step": 8141 + }, + { + "epoch": 0.04842278047387953, + "grad_norm": 1.721880555152893, + "learning_rate": 4.97113543419044e-05, + "loss": 5.0199, + "step": 8142 + }, + { + "epoch": 0.048428727757160527, + "grad_norm": 1.7465516328811646, + "learning_rate": 4.971128356276519e-05, + "loss": 5.1181, + "step": 8143 + }, + { + "epoch": 0.04843467504044153, + "grad_norm": 1.8127025365829468, + "learning_rate": 4.971121277499953e-05, + "loss": 5.6514, + "step": 8144 + }, + { + "epoch": 0.048440622323722524, + "grad_norm": 1.6027450561523438, + "learning_rate": 4.971114197860743e-05, + "loss": 5.3408, + "step": 8145 + }, + { + "epoch": 0.04844656960700352, + "grad_norm": 1.6985208988189697, + "learning_rate": 4.971107117358894e-05, + "loss": 5.2002, + "step": 8146 + }, + { + "epoch": 0.04845251689028452, + "grad_norm": 1.681305170059204, + "learning_rate": 4.971100035994406e-05, + "loss": 5.1389, + "step": 8147 + }, + { + "epoch": 0.048458464173565516, + "grad_norm": 1.6053674221038818, + "learning_rate": 4.971092953767282e-05, + "loss": 5.0665, + "step": 8148 + }, + { + "epoch": 0.04846441145684651, + "grad_norm": 1.743134617805481, + "learning_rate": 4.9710858706775266e-05, + "loss": 5.1427, + "step": 8149 + }, + { + "epoch": 0.04847035874012751, + "grad_norm": 1.4901342391967773, + "learning_rate": 4.9710787867251396e-05, + "loss": 5.1957, + "step": 8150 + }, + { + "epoch": 0.04847630602340851, + "grad_norm": 1.6003857851028442, + "learning_rate": 4.971071701910125e-05, + "loss": 5.0658, + "step": 8151 + }, + { + "epoch": 0.0484822533066895, + "grad_norm": 1.7036428451538086, + "learning_rate": 4.971064616232484e-05, + "loss": 5.0823, + "step": 8152 + }, + { + "epoch": 0.0484882005899705, + "grad_norm": 1.5894789695739746, + "learning_rate": 4.97105752969222e-05, + "loss": 5.093, + "step": 8153 + }, + { + "epoch": 0.0484941478732515, + "grad_norm": 1.487648367881775, + "learning_rate": 4.9710504422893364e-05, + "loss": 5.0089, + "step": 8154 + }, + { + "epoch": 0.048500095156532495, + "grad_norm": 2.0251479148864746, + "learning_rate": 4.971043354023834e-05, + "loss": 5.0552, + "step": 8155 + }, + { + "epoch": 0.04850604243981349, + "grad_norm": 1.7097325325012207, + "learning_rate": 4.971036264895715e-05, + "loss": 5.2737, + "step": 8156 + }, + { + "epoch": 0.04851198972309449, + "grad_norm": 1.784836769104004, + "learning_rate": 4.971029174904984e-05, + "loss": 5.2863, + "step": 8157 + }, + { + "epoch": 0.04851793700637549, + "grad_norm": 1.4765781164169312, + "learning_rate": 4.9710220840516416e-05, + "loss": 5.4057, + "step": 8158 + }, + { + "epoch": 0.04852388428965648, + "grad_norm": 1.4173041582107544, + "learning_rate": 4.9710149923356915e-05, + "loss": 5.187, + "step": 8159 + }, + { + "epoch": 0.048529831572937485, + "grad_norm": 1.488173007965088, + "learning_rate": 4.971007899757135e-05, + "loss": 4.975, + "step": 8160 + }, + { + "epoch": 0.04853577885621848, + "grad_norm": 1.391435980796814, + "learning_rate": 4.9710008063159756e-05, + "loss": 5.0782, + "step": 8161 + }, + { + "epoch": 0.048541726139499475, + "grad_norm": 1.7100436687469482, + "learning_rate": 4.970993712012215e-05, + "loss": 5.4953, + "step": 8162 + }, + { + "epoch": 0.04854767342278048, + "grad_norm": 1.8748459815979004, + "learning_rate": 4.970986616845856e-05, + "loss": 5.4535, + "step": 8163 + }, + { + "epoch": 0.04855362070606147, + "grad_norm": 1.901802897453308, + "learning_rate": 4.970979520816902e-05, + "loss": 5.3619, + "step": 8164 + }, + { + "epoch": 0.04855956798934247, + "grad_norm": 1.9850586652755737, + "learning_rate": 4.970972423925354e-05, + "loss": 5.039, + "step": 8165 + }, + { + "epoch": 0.04856551527262347, + "grad_norm": 1.5195177793502808, + "learning_rate": 4.970965326171214e-05, + "loss": 5.1721, + "step": 8166 + }, + { + "epoch": 0.048571462555904464, + "grad_norm": 1.4180214405059814, + "learning_rate": 4.9709582275544866e-05, + "loss": 5.2319, + "step": 8167 + }, + { + "epoch": 0.04857740983918546, + "grad_norm": 1.3797354698181152, + "learning_rate": 4.970951128075173e-05, + "loss": 5.1813, + "step": 8168 + }, + { + "epoch": 0.048583357122466454, + "grad_norm": 1.6448336839675903, + "learning_rate": 4.970944027733276e-05, + "loss": 5.1968, + "step": 8169 + }, + { + "epoch": 0.048589304405747456, + "grad_norm": 1.6626337766647339, + "learning_rate": 4.9709369265287986e-05, + "loss": 5.1303, + "step": 8170 + }, + { + "epoch": 0.04859525168902845, + "grad_norm": 1.5715514421463013, + "learning_rate": 4.970929824461742e-05, + "loss": 5.1609, + "step": 8171 + }, + { + "epoch": 0.048601198972309446, + "grad_norm": 1.5971697568893433, + "learning_rate": 4.970922721532108e-05, + "loss": 5.1489, + "step": 8172 + }, + { + "epoch": 0.04860714625559045, + "grad_norm": 1.6784114837646484, + "learning_rate": 4.970915617739903e-05, + "loss": 5.2778, + "step": 8173 + }, + { + "epoch": 0.048613093538871444, + "grad_norm": 1.7507476806640625, + "learning_rate": 4.970908513085125e-05, + "loss": 5.5719, + "step": 8174 + }, + { + "epoch": 0.04861904082215244, + "grad_norm": 1.7017735242843628, + "learning_rate": 4.970901407567779e-05, + "loss": 5.5197, + "step": 8175 + }, + { + "epoch": 0.04862498810543344, + "grad_norm": 1.8569817543029785, + "learning_rate": 4.9708943011878674e-05, + "loss": 5.3823, + "step": 8176 + }, + { + "epoch": 0.048630935388714436, + "grad_norm": 1.5183817148208618, + "learning_rate": 4.970887193945391e-05, + "loss": 5.5518, + "step": 8177 + }, + { + "epoch": 0.04863688267199543, + "grad_norm": 1.4175498485565186, + "learning_rate": 4.970880085840354e-05, + "loss": 5.4526, + "step": 8178 + }, + { + "epoch": 0.04864282995527643, + "grad_norm": 1.7228561639785767, + "learning_rate": 4.970872976872758e-05, + "loss": 5.5162, + "step": 8179 + }, + { + "epoch": 0.04864877723855743, + "grad_norm": 2.043182849884033, + "learning_rate": 4.970865867042606e-05, + "loss": 5.4212, + "step": 8180 + }, + { + "epoch": 0.04865472452183842, + "grad_norm": 1.377565622329712, + "learning_rate": 4.970858756349901e-05, + "loss": 5.2817, + "step": 8181 + }, + { + "epoch": 0.04866067180511942, + "grad_norm": 1.6977208852767944, + "learning_rate": 4.970851644794643e-05, + "loss": 5.4081, + "step": 8182 + }, + { + "epoch": 0.04866661908840042, + "grad_norm": 1.3136184215545654, + "learning_rate": 4.970844532376838e-05, + "loss": 5.4272, + "step": 8183 + }, + { + "epoch": 0.048672566371681415, + "grad_norm": 1.8863121271133423, + "learning_rate": 4.9708374190964854e-05, + "loss": 5.441, + "step": 8184 + }, + { + "epoch": 0.04867851365496241, + "grad_norm": 1.6755374670028687, + "learning_rate": 4.97083030495359e-05, + "loss": 5.5045, + "step": 8185 + }, + { + "epoch": 0.04868446093824341, + "grad_norm": 1.8439961671829224, + "learning_rate": 4.970823189948153e-05, + "loss": 5.5252, + "step": 8186 + }, + { + "epoch": 0.04869040822152441, + "grad_norm": 1.9662889242172241, + "learning_rate": 4.9708160740801765e-05, + "loss": 5.4379, + "step": 8187 + }, + { + "epoch": 0.0486963555048054, + "grad_norm": 1.691857099533081, + "learning_rate": 4.970808957349664e-05, + "loss": 5.3652, + "step": 8188 + }, + { + "epoch": 0.048702302788086405, + "grad_norm": 1.7482357025146484, + "learning_rate": 4.970801839756618e-05, + "loss": 5.1436, + "step": 8189 + }, + { + "epoch": 0.0487082500713674, + "grad_norm": 1.9221199750900269, + "learning_rate": 4.9707947213010396e-05, + "loss": 5.1936, + "step": 8190 + }, + { + "epoch": 0.048714197354648395, + "grad_norm": 1.9124062061309814, + "learning_rate": 4.970787601982933e-05, + "loss": 5.28, + "step": 8191 + }, + { + "epoch": 0.0487201446379294, + "grad_norm": 1.8999123573303223, + "learning_rate": 4.9707804818023e-05, + "loss": 5.3262, + "step": 8192 + }, + { + "epoch": 0.04872609192121039, + "grad_norm": 1.7711995840072632, + "learning_rate": 4.970773360759143e-05, + "loss": 5.1764, + "step": 8193 + }, + { + "epoch": 0.04873203920449139, + "grad_norm": 2.122689962387085, + "learning_rate": 4.970766238853465e-05, + "loss": 5.4345, + "step": 8194 + }, + { + "epoch": 0.04873798648777239, + "grad_norm": 2.1027848720550537, + "learning_rate": 4.9707591160852675e-05, + "loss": 5.4547, + "step": 8195 + }, + { + "epoch": 0.048743933771053384, + "grad_norm": 1.6944631338119507, + "learning_rate": 4.970751992454553e-05, + "loss": 5.3638, + "step": 8196 + }, + { + "epoch": 0.04874988105433438, + "grad_norm": 1.7444918155670166, + "learning_rate": 4.9707448679613256e-05, + "loss": 5.2378, + "step": 8197 + }, + { + "epoch": 0.048755828337615374, + "grad_norm": 1.8864104747772217, + "learning_rate": 4.970737742605586e-05, + "loss": 5.3142, + "step": 8198 + }, + { + "epoch": 0.048761775620896376, + "grad_norm": 1.968748927116394, + "learning_rate": 4.970730616387338e-05, + "loss": 5.0824, + "step": 8199 + }, + { + "epoch": 0.04876772290417737, + "grad_norm": 2.166405439376831, + "learning_rate": 4.9707234893065824e-05, + "loss": 5.0999, + "step": 8200 + }, + { + "epoch": 0.048773670187458366, + "grad_norm": 1.9185746908187866, + "learning_rate": 4.970716361363323e-05, + "loss": 5.1465, + "step": 8201 + }, + { + "epoch": 0.04877961747073937, + "grad_norm": 1.9191651344299316, + "learning_rate": 4.9707092325575635e-05, + "loss": 5.0713, + "step": 8202 + }, + { + "epoch": 0.048785564754020364, + "grad_norm": 1.6470153331756592, + "learning_rate": 4.9707021028893034e-05, + "loss": 5.0816, + "step": 8203 + }, + { + "epoch": 0.04879151203730136, + "grad_norm": 1.6995042562484741, + "learning_rate": 4.9706949723585475e-05, + "loss": 5.0207, + "step": 8204 + }, + { + "epoch": 0.04879745932058236, + "grad_norm": 1.8208703994750977, + "learning_rate": 4.970687840965297e-05, + "loss": 4.9789, + "step": 8205 + }, + { + "epoch": 0.048803406603863356, + "grad_norm": 1.8558207750320435, + "learning_rate": 4.9706807087095555e-05, + "loss": 5.0655, + "step": 8206 + }, + { + "epoch": 0.04880935388714435, + "grad_norm": 1.6349478960037231, + "learning_rate": 4.9706735755913234e-05, + "loss": 5.2657, + "step": 8207 + }, + { + "epoch": 0.04881530117042535, + "grad_norm": 1.587143063545227, + "learning_rate": 4.9706664416106065e-05, + "loss": 5.0765, + "step": 8208 + }, + { + "epoch": 0.04882124845370635, + "grad_norm": 1.8467018604278564, + "learning_rate": 4.9706593067674047e-05, + "loss": 5.1458, + "step": 8209 + }, + { + "epoch": 0.04882719573698734, + "grad_norm": 1.8066186904907227, + "learning_rate": 4.9706521710617214e-05, + "loss": 5.0656, + "step": 8210 + }, + { + "epoch": 0.04883314302026834, + "grad_norm": 1.7981528043746948, + "learning_rate": 4.9706450344935586e-05, + "loss": 5.1448, + "step": 8211 + }, + { + "epoch": 0.04883909030354934, + "grad_norm": 1.8924201726913452, + "learning_rate": 4.97063789706292e-05, + "loss": 4.748, + "step": 8212 + }, + { + "epoch": 0.048845037586830335, + "grad_norm": 2.091324806213379, + "learning_rate": 4.9706307587698064e-05, + "loss": 5.6537, + "step": 8213 + }, + { + "epoch": 0.04885098487011133, + "grad_norm": 3.1737043857574463, + "learning_rate": 4.970623619614221e-05, + "loss": 5.6898, + "step": 8214 + }, + { + "epoch": 0.04885693215339233, + "grad_norm": 2.194577932357788, + "learning_rate": 4.970616479596167e-05, + "loss": 5.4958, + "step": 8215 + }, + { + "epoch": 0.04886287943667333, + "grad_norm": 2.2362759113311768, + "learning_rate": 4.970609338715646e-05, + "loss": 4.9919, + "step": 8216 + }, + { + "epoch": 0.04886882671995432, + "grad_norm": 1.703684687614441, + "learning_rate": 4.970602196972661e-05, + "loss": 4.8733, + "step": 8217 + }, + { + "epoch": 0.048874774003235325, + "grad_norm": 2.0205307006835938, + "learning_rate": 4.970595054367214e-05, + "loss": 5.1177, + "step": 8218 + }, + { + "epoch": 0.04888072128651632, + "grad_norm": 2.1270928382873535, + "learning_rate": 4.970587910899308e-05, + "loss": 5.6208, + "step": 8219 + }, + { + "epoch": 0.048886668569797315, + "grad_norm": 1.8992488384246826, + "learning_rate": 4.9705807665689455e-05, + "loss": 5.7754, + "step": 8220 + }, + { + "epoch": 0.04889261585307832, + "grad_norm": 2.279099225997925, + "learning_rate": 4.9705736213761286e-05, + "loss": 5.5924, + "step": 8221 + }, + { + "epoch": 0.04889856313635931, + "grad_norm": 1.9186346530914307, + "learning_rate": 4.9705664753208594e-05, + "loss": 5.9424, + "step": 8222 + }, + { + "epoch": 0.04890451041964031, + "grad_norm": 2.0286009311676025, + "learning_rate": 4.970559328403141e-05, + "loss": 5.8461, + "step": 8223 + }, + { + "epoch": 0.04891045770292131, + "grad_norm": 1.797555685043335, + "learning_rate": 4.970552180622977e-05, + "loss": 5.4929, + "step": 8224 + }, + { + "epoch": 0.048916404986202304, + "grad_norm": 2.4879684448242188, + "learning_rate": 4.970545031980368e-05, + "loss": 5.5253, + "step": 8225 + }, + { + "epoch": 0.0489223522694833, + "grad_norm": 2.749763011932373, + "learning_rate": 4.970537882475318e-05, + "loss": 5.6001, + "step": 8226 + }, + { + "epoch": 0.048928299552764294, + "grad_norm": 2.2076292037963867, + "learning_rate": 4.970530732107827e-05, + "loss": 5.5876, + "step": 8227 + }, + { + "epoch": 0.048934246836045296, + "grad_norm": 2.6566662788391113, + "learning_rate": 4.970523580877901e-05, + "loss": 5.7151, + "step": 8228 + }, + { + "epoch": 0.04894019411932629, + "grad_norm": 2.4873850345611572, + "learning_rate": 4.97051642878554e-05, + "loss": 5.7124, + "step": 8229 + }, + { + "epoch": 0.048946141402607286, + "grad_norm": 1.8365200757980347, + "learning_rate": 4.970509275830748e-05, + "loss": 5.292, + "step": 8230 + }, + { + "epoch": 0.04895208868588829, + "grad_norm": 2.064730644226074, + "learning_rate": 4.9705021220135254e-05, + "loss": 5.2854, + "step": 8231 + }, + { + "epoch": 0.04895803596916928, + "grad_norm": 1.969298005104065, + "learning_rate": 4.970494967333877e-05, + "loss": 5.2113, + "step": 8232 + }, + { + "epoch": 0.04896398325245028, + "grad_norm": 1.8438071012496948, + "learning_rate": 4.9704878117918044e-05, + "loss": 5.2281, + "step": 8233 + }, + { + "epoch": 0.04896993053573128, + "grad_norm": 1.9163525104522705, + "learning_rate": 4.97048065538731e-05, + "loss": 5.043, + "step": 8234 + }, + { + "epoch": 0.048975877819012276, + "grad_norm": 1.802356243133545, + "learning_rate": 4.970473498120395e-05, + "loss": 5.2079, + "step": 8235 + }, + { + "epoch": 0.04898182510229327, + "grad_norm": 1.7572704553604126, + "learning_rate": 4.9704663399910645e-05, + "loss": 5.1119, + "step": 8236 + }, + { + "epoch": 0.04898777238557427, + "grad_norm": 1.848747730255127, + "learning_rate": 4.970459180999319e-05, + "loss": 5.0233, + "step": 8237 + }, + { + "epoch": 0.04899371966885527, + "grad_norm": 2.023036003112793, + "learning_rate": 4.9704520211451624e-05, + "loss": 5.2793, + "step": 8238 + }, + { + "epoch": 0.04899966695213626, + "grad_norm": 1.6738852262496948, + "learning_rate": 4.9704448604285965e-05, + "loss": 5.5255, + "step": 8239 + }, + { + "epoch": 0.04900561423541726, + "grad_norm": 1.6676057577133179, + "learning_rate": 4.970437698849624e-05, + "loss": 5.4287, + "step": 8240 + }, + { + "epoch": 0.04901156151869826, + "grad_norm": 1.9960590600967407, + "learning_rate": 4.970430536408247e-05, + "loss": 5.2939, + "step": 8241 + }, + { + "epoch": 0.049017508801979255, + "grad_norm": 2.7218708992004395, + "learning_rate": 4.9704233731044675e-05, + "loss": 5.9019, + "step": 8242 + }, + { + "epoch": 0.04902345608526025, + "grad_norm": 2.385664224624634, + "learning_rate": 4.970416208938289e-05, + "loss": 5.9146, + "step": 8243 + }, + { + "epoch": 0.04902940336854125, + "grad_norm": 2.2598092555999756, + "learning_rate": 4.970409043909714e-05, + "loss": 5.7451, + "step": 8244 + }, + { + "epoch": 0.04903535065182225, + "grad_norm": 2.3063299655914307, + "learning_rate": 4.970401878018745e-05, + "loss": 5.8675, + "step": 8245 + }, + { + "epoch": 0.04904129793510324, + "grad_norm": 2.1543853282928467, + "learning_rate": 4.9703947112653836e-05, + "loss": 5.9136, + "step": 8246 + }, + { + "epoch": 0.049047245218384244, + "grad_norm": 2.267531633377075, + "learning_rate": 4.970387543649634e-05, + "loss": 5.6834, + "step": 8247 + }, + { + "epoch": 0.04905319250166524, + "grad_norm": 2.047351121902466, + "learning_rate": 4.970380375171496e-05, + "loss": 5.5754, + "step": 8248 + }, + { + "epoch": 0.049059139784946235, + "grad_norm": 2.2565114498138428, + "learning_rate": 4.9703732058309745e-05, + "loss": 5.7067, + "step": 8249 + }, + { + "epoch": 0.04906508706822724, + "grad_norm": 1.7584022283554077, + "learning_rate": 4.970366035628073e-05, + "loss": 5.3926, + "step": 8250 + }, + { + "epoch": 0.04907103435150823, + "grad_norm": 1.9898183345794678, + "learning_rate": 4.9703588645627896e-05, + "loss": 5.7163, + "step": 8251 + }, + { + "epoch": 0.04907698163478923, + "grad_norm": 2.4134786128997803, + "learning_rate": 4.970351692635131e-05, + "loss": 5.672, + "step": 8252 + }, + { + "epoch": 0.04908292891807023, + "grad_norm": 2.1059436798095703, + "learning_rate": 4.970344519845097e-05, + "loss": 5.7719, + "step": 8253 + }, + { + "epoch": 0.049088876201351224, + "grad_norm": 2.0731539726257324, + "learning_rate": 4.970337346192692e-05, + "loss": 5.7104, + "step": 8254 + }, + { + "epoch": 0.04909482348463222, + "grad_norm": 2.3058536052703857, + "learning_rate": 4.970330171677918e-05, + "loss": 5.7435, + "step": 8255 + }, + { + "epoch": 0.049100770767913214, + "grad_norm": 2.051424980163574, + "learning_rate": 4.970322996300777e-05, + "loss": 5.7371, + "step": 8256 + }, + { + "epoch": 0.049106718051194216, + "grad_norm": 2.1715517044067383, + "learning_rate": 4.970315820061271e-05, + "loss": 5.5805, + "step": 8257 + }, + { + "epoch": 0.04911266533447521, + "grad_norm": 2.136617422103882, + "learning_rate": 4.9703086429594034e-05, + "loss": 5.8689, + "step": 8258 + }, + { + "epoch": 0.049118612617756206, + "grad_norm": 1.7089059352874756, + "learning_rate": 4.970301464995178e-05, + "loss": 6.0614, + "step": 8259 + }, + { + "epoch": 0.04912455990103721, + "grad_norm": 2.410067319869995, + "learning_rate": 4.970294286168595e-05, + "loss": 5.8762, + "step": 8260 + }, + { + "epoch": 0.0491305071843182, + "grad_norm": 2.2186291217803955, + "learning_rate": 4.970287106479657e-05, + "loss": 5.4903, + "step": 8261 + }, + { + "epoch": 0.0491364544675992, + "grad_norm": 2.312793016433716, + "learning_rate": 4.970279925928368e-05, + "loss": 6.2488, + "step": 8262 + }, + { + "epoch": 0.0491424017508802, + "grad_norm": 2.127859354019165, + "learning_rate": 4.9702727445147305e-05, + "loss": 5.9976, + "step": 8263 + }, + { + "epoch": 0.049148349034161196, + "grad_norm": 2.604367733001709, + "learning_rate": 4.9702655622387454e-05, + "loss": 5.4153, + "step": 8264 + }, + { + "epoch": 0.04915429631744219, + "grad_norm": 1.7832142114639282, + "learning_rate": 4.9702583791004165e-05, + "loss": 5.4024, + "step": 8265 + }, + { + "epoch": 0.04916024360072319, + "grad_norm": 2.04298734664917, + "learning_rate": 4.970251195099746e-05, + "loss": 5.7034, + "step": 8266 + }, + { + "epoch": 0.04916619088400419, + "grad_norm": 2.1806769371032715, + "learning_rate": 4.970244010236736e-05, + "loss": 6.1212, + "step": 8267 + }, + { + "epoch": 0.04917213816728518, + "grad_norm": 1.8740427494049072, + "learning_rate": 4.970236824511389e-05, + "loss": 5.7562, + "step": 8268 + }, + { + "epoch": 0.04917808545056618, + "grad_norm": 1.7718658447265625, + "learning_rate": 4.970229637923709e-05, + "loss": 5.5126, + "step": 8269 + }, + { + "epoch": 0.04918403273384718, + "grad_norm": 1.4966565370559692, + "learning_rate": 4.970222450473696e-05, + "loss": 5.5422, + "step": 8270 + }, + { + "epoch": 0.049189980017128175, + "grad_norm": 1.8283390998840332, + "learning_rate": 4.970215262161355e-05, + "loss": 5.9333, + "step": 8271 + }, + { + "epoch": 0.04919592730040917, + "grad_norm": 2.087460517883301, + "learning_rate": 4.970208072986687e-05, + "loss": 5.5413, + "step": 8272 + }, + { + "epoch": 0.04920187458369017, + "grad_norm": 2.2952873706817627, + "learning_rate": 4.970200882949694e-05, + "loss": 5.7848, + "step": 8273 + }, + { + "epoch": 0.04920782186697117, + "grad_norm": 1.9511842727661133, + "learning_rate": 4.9701936920503804e-05, + "loss": 5.6172, + "step": 8274 + }, + { + "epoch": 0.04921376915025216, + "grad_norm": 1.992211937904358, + "learning_rate": 4.970186500288748e-05, + "loss": 5.48, + "step": 8275 + }, + { + "epoch": 0.049219716433533164, + "grad_norm": 1.739013910293579, + "learning_rate": 4.9701793076647984e-05, + "loss": 5.6351, + "step": 8276 + }, + { + "epoch": 0.04922566371681416, + "grad_norm": 2.150797128677368, + "learning_rate": 4.970172114178534e-05, + "loss": 5.5957, + "step": 8277 + }, + { + "epoch": 0.049231611000095155, + "grad_norm": 2.074070930480957, + "learning_rate": 4.9701649198299594e-05, + "loss": 5.4751, + "step": 8278 + }, + { + "epoch": 0.04923755828337616, + "grad_norm": 2.2276322841644287, + "learning_rate": 4.970157724619075e-05, + "loss": 5.4434, + "step": 8279 + }, + { + "epoch": 0.04924350556665715, + "grad_norm": 1.9707896709442139, + "learning_rate": 4.970150528545884e-05, + "loss": 5.6935, + "step": 8280 + }, + { + "epoch": 0.04924945284993815, + "grad_norm": 2.07774019241333, + "learning_rate": 4.9701433316103895e-05, + "loss": 6.0455, + "step": 8281 + }, + { + "epoch": 0.04925540013321915, + "grad_norm": 2.3262722492218018, + "learning_rate": 4.970136133812593e-05, + "loss": 5.6039, + "step": 8282 + }, + { + "epoch": 0.049261347416500144, + "grad_norm": 2.4353108406066895, + "learning_rate": 4.970128935152498e-05, + "loss": 5.3823, + "step": 8283 + }, + { + "epoch": 0.04926729469978114, + "grad_norm": 2.7383084297180176, + "learning_rate": 4.970121735630106e-05, + "loss": 5.4039, + "step": 8284 + }, + { + "epoch": 0.049273241983062134, + "grad_norm": 2.9022698402404785, + "learning_rate": 4.9701145352454205e-05, + "loss": 5.3571, + "step": 8285 + }, + { + "epoch": 0.049279189266343136, + "grad_norm": 2.314373731613159, + "learning_rate": 4.970107333998443e-05, + "loss": 5.4877, + "step": 8286 + }, + { + "epoch": 0.04928513654962413, + "grad_norm": 1.9494023323059082, + "learning_rate": 4.970100131889177e-05, + "loss": 5.5171, + "step": 8287 + }, + { + "epoch": 0.049291083832905126, + "grad_norm": 2.7892074584960938, + "learning_rate": 4.9700929289176245e-05, + "loss": 5.5347, + "step": 8288 + }, + { + "epoch": 0.04929703111618613, + "grad_norm": 2.305204391479492, + "learning_rate": 4.970085725083788e-05, + "loss": 5.8689, + "step": 8289 + }, + { + "epoch": 0.04930297839946712, + "grad_norm": 2.4212634563446045, + "learning_rate": 4.97007852038767e-05, + "loss": 5.8982, + "step": 8290 + }, + { + "epoch": 0.04930892568274812, + "grad_norm": 3.584625482559204, + "learning_rate": 4.9700713148292734e-05, + "loss": 5.2341, + "step": 8291 + }, + { + "epoch": 0.04931487296602912, + "grad_norm": 2.874703884124756, + "learning_rate": 4.9700641084086e-05, + "loss": 5.2312, + "step": 8292 + }, + { + "epoch": 0.049320820249310116, + "grad_norm": 2.113234519958496, + "learning_rate": 4.9700569011256524e-05, + "loss": 5.5779, + "step": 8293 + }, + { + "epoch": 0.04932676753259111, + "grad_norm": 3.027318000793457, + "learning_rate": 4.970049692980434e-05, + "loss": 5.3899, + "step": 8294 + }, + { + "epoch": 0.04933271481587211, + "grad_norm": 2.779520273208618, + "learning_rate": 4.970042483972947e-05, + "loss": 5.4023, + "step": 8295 + }, + { + "epoch": 0.04933866209915311, + "grad_norm": 2.4358251094818115, + "learning_rate": 4.970035274103193e-05, + "loss": 5.4932, + "step": 8296 + }, + { + "epoch": 0.0493446093824341, + "grad_norm": 1.926193118095398, + "learning_rate": 4.970028063371176e-05, + "loss": 5.4058, + "step": 8297 + }, + { + "epoch": 0.0493505566657151, + "grad_norm": 1.7216569185256958, + "learning_rate": 4.970020851776898e-05, + "loss": 5.3265, + "step": 8298 + }, + { + "epoch": 0.0493565039489961, + "grad_norm": 1.9850976467132568, + "learning_rate": 4.97001363932036e-05, + "loss": 5.1626, + "step": 8299 + }, + { + "epoch": 0.049362451232277095, + "grad_norm": 2.1380982398986816, + "learning_rate": 4.9700064260015666e-05, + "loss": 5.3285, + "step": 8300 + }, + { + "epoch": 0.04936839851555809, + "grad_norm": 2.118781566619873, + "learning_rate": 4.969999211820518e-05, + "loss": 5.3544, + "step": 8301 + }, + { + "epoch": 0.04937434579883909, + "grad_norm": 2.0255584716796875, + "learning_rate": 4.96999199677722e-05, + "loss": 5.4256, + "step": 8302 + }, + { + "epoch": 0.04938029308212009, + "grad_norm": 2.0269806385040283, + "learning_rate": 4.9699847808716724e-05, + "loss": 5.9744, + "step": 8303 + }, + { + "epoch": 0.04938624036540108, + "grad_norm": 2.60446834564209, + "learning_rate": 4.969977564103879e-05, + "loss": 5.3926, + "step": 8304 + }, + { + "epoch": 0.049392187648682084, + "grad_norm": 2.1011881828308105, + "learning_rate": 4.9699703464738426e-05, + "loss": 5.4278, + "step": 8305 + }, + { + "epoch": 0.04939813493196308, + "grad_norm": 1.9267319440841675, + "learning_rate": 4.969963127981564e-05, + "loss": 5.6232, + "step": 8306 + }, + { + "epoch": 0.049404082215244075, + "grad_norm": 2.1958322525024414, + "learning_rate": 4.969955908627048e-05, + "loss": 5.8577, + "step": 8307 + }, + { + "epoch": 0.049410029498525077, + "grad_norm": 2.392241954803467, + "learning_rate": 4.969948688410294e-05, + "loss": 5.8013, + "step": 8308 + }, + { + "epoch": 0.04941597678180607, + "grad_norm": 2.8284695148468018, + "learning_rate": 4.969941467331308e-05, + "loss": 6.1246, + "step": 8309 + }, + { + "epoch": 0.04942192406508707, + "grad_norm": 2.8590078353881836, + "learning_rate": 4.96993424539009e-05, + "loss": 6.1068, + "step": 8310 + }, + { + "epoch": 0.04942787134836807, + "grad_norm": 1.876207709312439, + "learning_rate": 4.969927022586644e-05, + "loss": 5.5493, + "step": 8311 + }, + { + "epoch": 0.049433818631649064, + "grad_norm": 1.988061547279358, + "learning_rate": 4.969919798920972e-05, + "loss": 5.7059, + "step": 8312 + }, + { + "epoch": 0.04943976591493006, + "grad_norm": 2.8230605125427246, + "learning_rate": 4.969912574393077e-05, + "loss": 5.9381, + "step": 8313 + }, + { + "epoch": 0.049445713198211054, + "grad_norm": 2.4622697830200195, + "learning_rate": 4.96990534900296e-05, + "loss": 6.0935, + "step": 8314 + }, + { + "epoch": 0.049451660481492056, + "grad_norm": 2.0811798572540283, + "learning_rate": 4.9698981227506254e-05, + "loss": 6.3475, + "step": 8315 + }, + { + "epoch": 0.04945760776477305, + "grad_norm": 2.099489212036133, + "learning_rate": 4.9698908956360745e-05, + "loss": 5.7266, + "step": 8316 + }, + { + "epoch": 0.049463555048054046, + "grad_norm": 2.1711854934692383, + "learning_rate": 4.9698836676593104e-05, + "loss": 5.6067, + "step": 8317 + }, + { + "epoch": 0.04946950233133505, + "grad_norm": 2.195296287536621, + "learning_rate": 4.969876438820335e-05, + "loss": 5.3896, + "step": 8318 + }, + { + "epoch": 0.04947544961461604, + "grad_norm": 2.114830255508423, + "learning_rate": 4.969869209119151e-05, + "loss": 5.6922, + "step": 8319 + }, + { + "epoch": 0.04948139689789704, + "grad_norm": 2.1534018516540527, + "learning_rate": 4.969861978555762e-05, + "loss": 6.1372, + "step": 8320 + }, + { + "epoch": 0.04948734418117804, + "grad_norm": 2.151495933532715, + "learning_rate": 4.9698547471301696e-05, + "loss": 6.0915, + "step": 8321 + }, + { + "epoch": 0.049493291464459035, + "grad_norm": 1.8232096433639526, + "learning_rate": 4.9698475148423764e-05, + "loss": 6.1492, + "step": 8322 + }, + { + "epoch": 0.04949923874774003, + "grad_norm": 2.1538467407226562, + "learning_rate": 4.9698402816923844e-05, + "loss": 5.6253, + "step": 8323 + }, + { + "epoch": 0.04950518603102103, + "grad_norm": 2.278797149658203, + "learning_rate": 4.969833047680197e-05, + "loss": 6.0055, + "step": 8324 + }, + { + "epoch": 0.04951113331430203, + "grad_norm": 2.479342460632324, + "learning_rate": 4.9698258128058164e-05, + "loss": 5.7909, + "step": 8325 + }, + { + "epoch": 0.04951708059758302, + "grad_norm": 2.2959346771240234, + "learning_rate": 4.969818577069245e-05, + "loss": 5.6888, + "step": 8326 + }, + { + "epoch": 0.04952302788086402, + "grad_norm": 1.841544270515442, + "learning_rate": 4.969811340470486e-05, + "loss": 5.5091, + "step": 8327 + }, + { + "epoch": 0.04952897516414502, + "grad_norm": 2.4512903690338135, + "learning_rate": 4.969804103009541e-05, + "loss": 5.7271, + "step": 8328 + }, + { + "epoch": 0.049534922447426015, + "grad_norm": 2.035473585128784, + "learning_rate": 4.969796864686413e-05, + "loss": 5.3056, + "step": 8329 + }, + { + "epoch": 0.04954086973070701, + "grad_norm": 2.030576705932617, + "learning_rate": 4.9697896255011046e-05, + "loss": 5.2765, + "step": 8330 + }, + { + "epoch": 0.04954681701398801, + "grad_norm": 1.680253505706787, + "learning_rate": 4.9697823854536175e-05, + "loss": 5.1968, + "step": 8331 + }, + { + "epoch": 0.04955276429726901, + "grad_norm": 1.962259292602539, + "learning_rate": 4.969775144543955e-05, + "loss": 5.0743, + "step": 8332 + }, + { + "epoch": 0.04955871158055, + "grad_norm": 2.499044895172119, + "learning_rate": 4.96976790277212e-05, + "loss": 5.5204, + "step": 8333 + }, + { + "epoch": 0.049564658863831004, + "grad_norm": 2.004849672317505, + "learning_rate": 4.969760660138114e-05, + "loss": 5.5714, + "step": 8334 + }, + { + "epoch": 0.049570606147112, + "grad_norm": 2.255171775817871, + "learning_rate": 4.9697534166419405e-05, + "loss": 5.0766, + "step": 8335 + }, + { + "epoch": 0.049576553430392994, + "grad_norm": 2.1219112873077393, + "learning_rate": 4.969746172283601e-05, + "loss": 5.0613, + "step": 8336 + }, + { + "epoch": 0.049582500713673996, + "grad_norm": 1.9718400239944458, + "learning_rate": 4.9697389270631004e-05, + "loss": 5.0007, + "step": 8337 + }, + { + "epoch": 0.04958844799695499, + "grad_norm": 1.87917160987854, + "learning_rate": 4.969731680980437e-05, + "loss": 4.9533, + "step": 8338 + }, + { + "epoch": 0.04959439528023599, + "grad_norm": 1.9610000848770142, + "learning_rate": 4.969724434035618e-05, + "loss": 4.9761, + "step": 8339 + }, + { + "epoch": 0.04960034256351699, + "grad_norm": 1.859434723854065, + "learning_rate": 4.969717186228642e-05, + "loss": 5.2373, + "step": 8340 + }, + { + "epoch": 0.049606289846797984, + "grad_norm": 1.9905357360839844, + "learning_rate": 4.9697099375595144e-05, + "loss": 4.8858, + "step": 8341 + }, + { + "epoch": 0.04961223713007898, + "grad_norm": 1.995355486869812, + "learning_rate": 4.969702688028236e-05, + "loss": 4.9468, + "step": 8342 + }, + { + "epoch": 0.049618184413359974, + "grad_norm": 1.9970706701278687, + "learning_rate": 4.96969543763481e-05, + "loss": 4.8891, + "step": 8343 + }, + { + "epoch": 0.049624131696640976, + "grad_norm": 1.9036997556686401, + "learning_rate": 4.9696881863792385e-05, + "loss": 4.7622, + "step": 8344 + }, + { + "epoch": 0.04963007897992197, + "grad_norm": 1.9532603025436401, + "learning_rate": 4.9696809342615245e-05, + "loss": 4.7832, + "step": 8345 + }, + { + "epoch": 0.049636026263202966, + "grad_norm": 1.9032143354415894, + "learning_rate": 4.969673681281671e-05, + "loss": 4.7569, + "step": 8346 + }, + { + "epoch": 0.04964197354648397, + "grad_norm": 3.4294323921203613, + "learning_rate": 4.96966642743968e-05, + "loss": 5.9381, + "step": 8347 + }, + { + "epoch": 0.04964792082976496, + "grad_norm": 4.137698173522949, + "learning_rate": 4.969659172735554e-05, + "loss": 6.4081, + "step": 8348 + }, + { + "epoch": 0.04965386811304596, + "grad_norm": 2.774838447570801, + "learning_rate": 4.969651917169295e-05, + "loss": 5.9888, + "step": 8349 + }, + { + "epoch": 0.04965981539632696, + "grad_norm": 2.4056432247161865, + "learning_rate": 4.9696446607409054e-05, + "loss": 6.1239, + "step": 8350 + }, + { + "epoch": 0.049665762679607955, + "grad_norm": 2.098475456237793, + "learning_rate": 4.969637403450389e-05, + "loss": 6.4226, + "step": 8351 + }, + { + "epoch": 0.04967170996288895, + "grad_norm": 2.1402597427368164, + "learning_rate": 4.9696301452977475e-05, + "loss": 5.8836, + "step": 8352 + }, + { + "epoch": 0.04967765724616995, + "grad_norm": 2.8023130893707275, + "learning_rate": 4.9696228862829844e-05, + "loss": 6.2452, + "step": 8353 + }, + { + "epoch": 0.04968360452945095, + "grad_norm": 2.7669503688812256, + "learning_rate": 4.9696156264061e-05, + "loss": 6.0093, + "step": 8354 + }, + { + "epoch": 0.04968955181273194, + "grad_norm": 2.2357375621795654, + "learning_rate": 4.9696083656671e-05, + "loss": 6.0614, + "step": 8355 + }, + { + "epoch": 0.049695499096012945, + "grad_norm": 2.1435539722442627, + "learning_rate": 4.969601104065984e-05, + "loss": 6.0718, + "step": 8356 + }, + { + "epoch": 0.04970144637929394, + "grad_norm": 2.6372897624969482, + "learning_rate": 4.969593841602757e-05, + "loss": 5.4878, + "step": 8357 + }, + { + "epoch": 0.049707393662574935, + "grad_norm": 1.9730110168457031, + "learning_rate": 4.9695865782774186e-05, + "loss": 5.8913, + "step": 8358 + }, + { + "epoch": 0.04971334094585593, + "grad_norm": 2.262437105178833, + "learning_rate": 4.9695793140899737e-05, + "loss": 5.0382, + "step": 8359 + }, + { + "epoch": 0.04971928822913693, + "grad_norm": 1.794268250465393, + "learning_rate": 4.9695720490404254e-05, + "loss": 5.784, + "step": 8360 + }, + { + "epoch": 0.04972523551241793, + "grad_norm": 1.9568414688110352, + "learning_rate": 4.969564783128773e-05, + "loss": 5.8939, + "step": 8361 + }, + { + "epoch": 0.04973118279569892, + "grad_norm": 2.0560479164123535, + "learning_rate": 4.969557516355022e-05, + "loss": 5.8806, + "step": 8362 + }, + { + "epoch": 0.049737130078979924, + "grad_norm": 1.9009175300598145, + "learning_rate": 4.9695502487191746e-05, + "loss": 5.5568, + "step": 8363 + }, + { + "epoch": 0.04974307736226092, + "grad_norm": 2.1240882873535156, + "learning_rate": 4.9695429802212325e-05, + "loss": 5.4514, + "step": 8364 + }, + { + "epoch": 0.049749024645541914, + "grad_norm": 2.0803675651550293, + "learning_rate": 4.969535710861198e-05, + "loss": 5.7679, + "step": 8365 + }, + { + "epoch": 0.049754971928822916, + "grad_norm": 1.9357428550720215, + "learning_rate": 4.969528440639074e-05, + "loss": 6.1658, + "step": 8366 + }, + { + "epoch": 0.04976091921210391, + "grad_norm": 1.89462411403656, + "learning_rate": 4.9695211695548635e-05, + "loss": 6.0559, + "step": 8367 + }, + { + "epoch": 0.04976686649538491, + "grad_norm": 1.5986123085021973, + "learning_rate": 4.969513897608569e-05, + "loss": 5.7787, + "step": 8368 + }, + { + "epoch": 0.04977281377866591, + "grad_norm": 2.0391738414764404, + "learning_rate": 4.969506624800192e-05, + "loss": 5.5559, + "step": 8369 + }, + { + "epoch": 0.049778761061946904, + "grad_norm": 2.1463794708251953, + "learning_rate": 4.969499351129736e-05, + "loss": 5.5734, + "step": 8370 + }, + { + "epoch": 0.0497847083452279, + "grad_norm": 2.1488826274871826, + "learning_rate": 4.969492076597203e-05, + "loss": 5.7502, + "step": 8371 + }, + { + "epoch": 0.049790655628508894, + "grad_norm": 2.214439868927002, + "learning_rate": 4.9694848012025966e-05, + "loss": 5.8829, + "step": 8372 + }, + { + "epoch": 0.049796602911789896, + "grad_norm": 2.366196632385254, + "learning_rate": 4.969477524945918e-05, + "loss": 5.3428, + "step": 8373 + }, + { + "epoch": 0.04980255019507089, + "grad_norm": 2.239044189453125, + "learning_rate": 4.96947024782717e-05, + "loss": 5.7258, + "step": 8374 + }, + { + "epoch": 0.049808497478351886, + "grad_norm": 2.315492868423462, + "learning_rate": 4.9694629698463554e-05, + "loss": 5.6542, + "step": 8375 + }, + { + "epoch": 0.04981444476163289, + "grad_norm": 2.340740919113159, + "learning_rate": 4.969455691003478e-05, + "loss": 5.0699, + "step": 8376 + }, + { + "epoch": 0.04982039204491388, + "grad_norm": 2.644800901412964, + "learning_rate": 4.9694484112985386e-05, + "loss": 5.3808, + "step": 8377 + }, + { + "epoch": 0.04982633932819488, + "grad_norm": 2.7073781490325928, + "learning_rate": 4.96944113073154e-05, + "loss": 5.5233, + "step": 8378 + }, + { + "epoch": 0.04983228661147588, + "grad_norm": 2.5480713844299316, + "learning_rate": 4.969433849302485e-05, + "loss": 5.3908, + "step": 8379 + }, + { + "epoch": 0.049838233894756875, + "grad_norm": 2.494356155395508, + "learning_rate": 4.969426567011376e-05, + "loss": 5.3528, + "step": 8380 + }, + { + "epoch": 0.04984418117803787, + "grad_norm": 2.4249942302703857, + "learning_rate": 4.9694192838582155e-05, + "loss": 5.2995, + "step": 8381 + }, + { + "epoch": 0.04985012846131887, + "grad_norm": 2.5930840969085693, + "learning_rate": 4.9694119998430066e-05, + "loss": 6.0202, + "step": 8382 + }, + { + "epoch": 0.04985607574459987, + "grad_norm": 2.391972541809082, + "learning_rate": 4.969404714965752e-05, + "loss": 6.0247, + "step": 8383 + }, + { + "epoch": 0.04986202302788086, + "grad_norm": 2.2849159240722656, + "learning_rate": 4.9693974292264535e-05, + "loss": 5.892, + "step": 8384 + }, + { + "epoch": 0.049867970311161865, + "grad_norm": 2.1887097358703613, + "learning_rate": 4.9693901426251134e-05, + "loss": 6.0196, + "step": 8385 + }, + { + "epoch": 0.04987391759444286, + "grad_norm": 2.3988685607910156, + "learning_rate": 4.969382855161735e-05, + "loss": 5.5596, + "step": 8386 + }, + { + "epoch": 0.049879864877723855, + "grad_norm": 2.675144910812378, + "learning_rate": 4.9693755668363204e-05, + "loss": 5.3495, + "step": 8387 + }, + { + "epoch": 0.04988581216100485, + "grad_norm": 2.3753585815429688, + "learning_rate": 4.969368277648873e-05, + "loss": 5.8823, + "step": 8388 + }, + { + "epoch": 0.04989175944428585, + "grad_norm": 2.3168766498565674, + "learning_rate": 4.969360987599394e-05, + "loss": 5.9768, + "step": 8389 + }, + { + "epoch": 0.04989770672756685, + "grad_norm": 2.427138566970825, + "learning_rate": 4.969353696687886e-05, + "loss": 6.1823, + "step": 8390 + }, + { + "epoch": 0.04990365401084784, + "grad_norm": 2.304731845855713, + "learning_rate": 4.9693464049143526e-05, + "loss": 5.8697, + "step": 8391 + }, + { + "epoch": 0.049909601294128844, + "grad_norm": 2.2139687538146973, + "learning_rate": 4.9693391122787966e-05, + "loss": 6.0274, + "step": 8392 + }, + { + "epoch": 0.04991554857740984, + "grad_norm": 2.1165316104888916, + "learning_rate": 4.9693318187812185e-05, + "loss": 5.2499, + "step": 8393 + }, + { + "epoch": 0.049921495860690834, + "grad_norm": 2.5213639736175537, + "learning_rate": 4.969324524421624e-05, + "loss": 4.9105, + "step": 8394 + }, + { + "epoch": 0.049927443143971836, + "grad_norm": 2.2188315391540527, + "learning_rate": 4.9693172292000125e-05, + "loss": 4.8652, + "step": 8395 + }, + { + "epoch": 0.04993339042725283, + "grad_norm": 2.393179416656494, + "learning_rate": 4.9693099331163886e-05, + "loss": 4.924, + "step": 8396 + }, + { + "epoch": 0.04993933771053383, + "grad_norm": 2.150264024734497, + "learning_rate": 4.969302636170753e-05, + "loss": 4.9168, + "step": 8397 + }, + { + "epoch": 0.04994528499381483, + "grad_norm": 2.252499580383301, + "learning_rate": 4.96929533836311e-05, + "loss": 4.7822, + "step": 8398 + }, + { + "epoch": 0.049951232277095824, + "grad_norm": 2.342132806777954, + "learning_rate": 4.969288039693461e-05, + "loss": 5.3691, + "step": 8399 + }, + { + "epoch": 0.04995717956037682, + "grad_norm": 2.3533523082733154, + "learning_rate": 4.96928074016181e-05, + "loss": 5.9989, + "step": 8400 + }, + { + "epoch": 0.049963126843657814, + "grad_norm": 2.185727834701538, + "learning_rate": 4.969273439768158e-05, + "loss": 5.6101, + "step": 8401 + }, + { + "epoch": 0.049969074126938816, + "grad_norm": 2.3396189212799072, + "learning_rate": 4.969266138512509e-05, + "loss": 5.845, + "step": 8402 + }, + { + "epoch": 0.04997502141021981, + "grad_norm": 2.2145371437072754, + "learning_rate": 4.969258836394864e-05, + "loss": 5.6657, + "step": 8403 + }, + { + "epoch": 0.049980968693500806, + "grad_norm": 2.2084364891052246, + "learning_rate": 4.969251533415226e-05, + "loss": 5.8823, + "step": 8404 + }, + { + "epoch": 0.04998691597678181, + "grad_norm": 1.7423903942108154, + "learning_rate": 4.9692442295735984e-05, + "loss": 5.8209, + "step": 8405 + }, + { + "epoch": 0.0499928632600628, + "grad_norm": 2.3057217597961426, + "learning_rate": 4.9692369248699824e-05, + "loss": 5.8352, + "step": 8406 + }, + { + "epoch": 0.0499988105433438, + "grad_norm": 2.1800148487091064, + "learning_rate": 4.969229619304382e-05, + "loss": 5.783, + "step": 8407 + }, + { + "epoch": 0.0500047578266248, + "grad_norm": 1.8594306707382202, + "learning_rate": 4.969222312876799e-05, + "loss": 6.01, + "step": 8408 + }, + { + "epoch": 0.050010705109905795, + "grad_norm": 2.119917392730713, + "learning_rate": 4.9692150055872355e-05, + "loss": 5.7282, + "step": 8409 + }, + { + "epoch": 0.05001665239318679, + "grad_norm": 2.5282747745513916, + "learning_rate": 4.969207697435695e-05, + "loss": 5.0853, + "step": 8410 + }, + { + "epoch": 0.05002259967646779, + "grad_norm": 2.5683388710021973, + "learning_rate": 4.969200388422179e-05, + "loss": 4.9841, + "step": 8411 + }, + { + "epoch": 0.05002854695974879, + "grad_norm": 2.649918794631958, + "learning_rate": 4.969193078546692e-05, + "loss": 5.6365, + "step": 8412 + }, + { + "epoch": 0.05003449424302978, + "grad_norm": 2.3040120601654053, + "learning_rate": 4.969185767809234e-05, + "loss": 5.8272, + "step": 8413 + }, + { + "epoch": 0.050040441526310785, + "grad_norm": 2.033600330352783, + "learning_rate": 4.9691784562098084e-05, + "loss": 5.9779, + "step": 8414 + }, + { + "epoch": 0.05004638880959178, + "grad_norm": 2.1903419494628906, + "learning_rate": 4.96917114374842e-05, + "loss": 5.8651, + "step": 8415 + }, + { + "epoch": 0.050052336092872775, + "grad_norm": 2.4431047439575195, + "learning_rate": 4.969163830425068e-05, + "loss": 4.7787, + "step": 8416 + }, + { + "epoch": 0.05005828337615377, + "grad_norm": 2.6652824878692627, + "learning_rate": 4.969156516239756e-05, + "loss": 4.7133, + "step": 8417 + }, + { + "epoch": 0.05006423065943477, + "grad_norm": 2.4090182781219482, + "learning_rate": 4.969149201192488e-05, + "loss": 4.4506, + "step": 8418 + }, + { + "epoch": 0.05007017794271577, + "grad_norm": 2.5310218334198, + "learning_rate": 4.969141885283265e-05, + "loss": 4.5286, + "step": 8419 + }, + { + "epoch": 0.05007612522599676, + "grad_norm": 2.5333101749420166, + "learning_rate": 4.9691345685120905e-05, + "loss": 4.6012, + "step": 8420 + }, + { + "epoch": 0.050082072509277764, + "grad_norm": 2.172724485397339, + "learning_rate": 4.9691272508789665e-05, + "loss": 4.9161, + "step": 8421 + }, + { + "epoch": 0.05008801979255876, + "grad_norm": 2.034684181213379, + "learning_rate": 4.969119932383896e-05, + "loss": 5.3105, + "step": 8422 + }, + { + "epoch": 0.050093967075839754, + "grad_norm": 1.9046155214309692, + "learning_rate": 4.969112613026881e-05, + "loss": 5.4308, + "step": 8423 + }, + { + "epoch": 0.050099914359120756, + "grad_norm": 1.7256773710250854, + "learning_rate": 4.9691052928079226e-05, + "loss": 5.2232, + "step": 8424 + }, + { + "epoch": 0.05010586164240175, + "grad_norm": 2.0075321197509766, + "learning_rate": 4.969097971727027e-05, + "loss": 6.1764, + "step": 8425 + }, + { + "epoch": 0.050111808925682746, + "grad_norm": 2.1523852348327637, + "learning_rate": 4.9690906497841946e-05, + "loss": 5.8419, + "step": 8426 + }, + { + "epoch": 0.05011775620896375, + "grad_norm": 1.9675406217575073, + "learning_rate": 4.969083326979428e-05, + "loss": 5.7919, + "step": 8427 + }, + { + "epoch": 0.050123703492244744, + "grad_norm": 2.0327789783477783, + "learning_rate": 4.9690760033127295e-05, + "loss": 5.0232, + "step": 8428 + }, + { + "epoch": 0.05012965077552574, + "grad_norm": 1.677471399307251, + "learning_rate": 4.969068678784102e-05, + "loss": 5.1106, + "step": 8429 + }, + { + "epoch": 0.050135598058806734, + "grad_norm": 1.727847933769226, + "learning_rate": 4.9690613533935496e-05, + "loss": 5.1589, + "step": 8430 + }, + { + "epoch": 0.050141545342087736, + "grad_norm": 1.8167927265167236, + "learning_rate": 4.9690540271410726e-05, + "loss": 5.1207, + "step": 8431 + }, + { + "epoch": 0.05014749262536873, + "grad_norm": 2.277425527572632, + "learning_rate": 4.969046700026674e-05, + "loss": 5.6614, + "step": 8432 + }, + { + "epoch": 0.050153439908649726, + "grad_norm": 1.6471065282821655, + "learning_rate": 4.969039372050356e-05, + "loss": 5.2065, + "step": 8433 + }, + { + "epoch": 0.05015938719193073, + "grad_norm": 1.9049899578094482, + "learning_rate": 4.9690320432121226e-05, + "loss": 5.7453, + "step": 8434 + }, + { + "epoch": 0.05016533447521172, + "grad_norm": 1.9145495891571045, + "learning_rate": 4.969024713511976e-05, + "loss": 6.2207, + "step": 8435 + }, + { + "epoch": 0.05017128175849272, + "grad_norm": 1.6634061336517334, + "learning_rate": 4.969017382949918e-05, + "loss": 6.1694, + "step": 8436 + }, + { + "epoch": 0.05017722904177372, + "grad_norm": 1.9804925918579102, + "learning_rate": 4.969010051525952e-05, + "loss": 6.2917, + "step": 8437 + }, + { + "epoch": 0.050183176325054715, + "grad_norm": 1.9674698114395142, + "learning_rate": 4.969002719240079e-05, + "loss": 6.3105, + "step": 8438 + }, + { + "epoch": 0.05018912360833571, + "grad_norm": 2.1540520191192627, + "learning_rate": 4.968995386092303e-05, + "loss": 5.964, + "step": 8439 + }, + { + "epoch": 0.05019507089161671, + "grad_norm": 1.8545453548431396, + "learning_rate": 4.9689880520826274e-05, + "loss": 5.8744, + "step": 8440 + }, + { + "epoch": 0.05020101817489771, + "grad_norm": 1.8022514581680298, + "learning_rate": 4.968980717211053e-05, + "loss": 6.1547, + "step": 8441 + }, + { + "epoch": 0.0502069654581787, + "grad_norm": 1.6297475099563599, + "learning_rate": 4.968973381477582e-05, + "loss": 6.1397, + "step": 8442 + }, + { + "epoch": 0.050212912741459705, + "grad_norm": 1.6256400346755981, + "learning_rate": 4.968966044882219e-05, + "loss": 6.0529, + "step": 8443 + }, + { + "epoch": 0.0502188600247407, + "grad_norm": 1.5988365411758423, + "learning_rate": 4.968958707424965e-05, + "loss": 6.0653, + "step": 8444 + }, + { + "epoch": 0.050224807308021695, + "grad_norm": 1.7062568664550781, + "learning_rate": 4.968951369105823e-05, + "loss": 5.6761, + "step": 8445 + }, + { + "epoch": 0.05023075459130269, + "grad_norm": 2.6108970642089844, + "learning_rate": 4.968944029924796e-05, + "loss": 5.7222, + "step": 8446 + }, + { + "epoch": 0.05023670187458369, + "grad_norm": 2.2341887950897217, + "learning_rate": 4.9689366898818854e-05, + "loss": 6.057, + "step": 8447 + }, + { + "epoch": 0.05024264915786469, + "grad_norm": 2.1819159984588623, + "learning_rate": 4.968929348977095e-05, + "loss": 6.0386, + "step": 8448 + }, + { + "epoch": 0.05024859644114568, + "grad_norm": 1.9941349029541016, + "learning_rate": 4.968922007210427e-05, + "loss": 6.132, + "step": 8449 + }, + { + "epoch": 0.050254543724426684, + "grad_norm": 1.7330418825149536, + "learning_rate": 4.968914664581883e-05, + "loss": 6.0834, + "step": 8450 + }, + { + "epoch": 0.05026049100770768, + "grad_norm": 1.8946608304977417, + "learning_rate": 4.968907321091467e-05, + "loss": 5.9147, + "step": 8451 + }, + { + "epoch": 0.050266438290988674, + "grad_norm": 2.314767599105835, + "learning_rate": 4.9688999767391815e-05, + "loss": 5.7087, + "step": 8452 + }, + { + "epoch": 0.050272385574269676, + "grad_norm": 2.604673147201538, + "learning_rate": 4.968892631525028e-05, + "loss": 5.7348, + "step": 8453 + }, + { + "epoch": 0.05027833285755067, + "grad_norm": 2.3386125564575195, + "learning_rate": 4.9688852854490097e-05, + "loss": 5.7509, + "step": 8454 + }, + { + "epoch": 0.050284280140831666, + "grad_norm": 2.3919529914855957, + "learning_rate": 4.968877938511129e-05, + "loss": 5.5851, + "step": 8455 + }, + { + "epoch": 0.05029022742411267, + "grad_norm": 2.0978026390075684, + "learning_rate": 4.9688705907113886e-05, + "loss": 5.3663, + "step": 8456 + }, + { + "epoch": 0.050296174707393664, + "grad_norm": 2.1700327396392822, + "learning_rate": 4.9688632420497904e-05, + "loss": 6.0197, + "step": 8457 + }, + { + "epoch": 0.05030212199067466, + "grad_norm": 2.1657676696777344, + "learning_rate": 4.968855892526338e-05, + "loss": 6.1721, + "step": 8458 + }, + { + "epoch": 0.050308069273955654, + "grad_norm": 2.434732437133789, + "learning_rate": 4.968848542141033e-05, + "loss": 6.0217, + "step": 8459 + }, + { + "epoch": 0.050314016557236656, + "grad_norm": 1.8453216552734375, + "learning_rate": 4.96884119089388e-05, + "loss": 6.4071, + "step": 8460 + }, + { + "epoch": 0.05031996384051765, + "grad_norm": 1.930168628692627, + "learning_rate": 4.9688338387848784e-05, + "loss": 6.5024, + "step": 8461 + }, + { + "epoch": 0.050325911123798646, + "grad_norm": 2.1785950660705566, + "learning_rate": 4.968826485814033e-05, + "loss": 5.803, + "step": 8462 + }, + { + "epoch": 0.05033185840707965, + "grad_norm": 2.003187894821167, + "learning_rate": 4.968819131981346e-05, + "loss": 6.2269, + "step": 8463 + }, + { + "epoch": 0.05033780569036064, + "grad_norm": 2.9522452354431152, + "learning_rate": 4.9688117772868195e-05, + "loss": 5.5603, + "step": 8464 + }, + { + "epoch": 0.05034375297364164, + "grad_norm": 1.9813052415847778, + "learning_rate": 4.968804421730457e-05, + "loss": 6.0101, + "step": 8465 + }, + { + "epoch": 0.05034970025692264, + "grad_norm": 2.370225667953491, + "learning_rate": 4.9687970653122596e-05, + "loss": 6.3236, + "step": 8466 + }, + { + "epoch": 0.050355647540203635, + "grad_norm": 1.9233943223953247, + "learning_rate": 4.968789708032231e-05, + "loss": 6.2962, + "step": 8467 + }, + { + "epoch": 0.05036159482348463, + "grad_norm": 1.8740222454071045, + "learning_rate": 4.968782349890373e-05, + "loss": 5.5454, + "step": 8468 + }, + { + "epoch": 0.05036754210676563, + "grad_norm": 1.8627724647521973, + "learning_rate": 4.968774990886689e-05, + "loss": 5.9242, + "step": 8469 + }, + { + "epoch": 0.05037348939004663, + "grad_norm": 1.7016552686691284, + "learning_rate": 4.968767631021181e-05, + "loss": 6.3302, + "step": 8470 + }, + { + "epoch": 0.05037943667332762, + "grad_norm": 1.8826018571853638, + "learning_rate": 4.9687602702938515e-05, + "loss": 6.3308, + "step": 8471 + }, + { + "epoch": 0.050385383956608625, + "grad_norm": 1.777480959892273, + "learning_rate": 4.9687529087047036e-05, + "loss": 6.3948, + "step": 8472 + }, + { + "epoch": 0.05039133123988962, + "grad_norm": 2.10075306892395, + "learning_rate": 4.9687455462537396e-05, + "loss": 6.1615, + "step": 8473 + }, + { + "epoch": 0.050397278523170615, + "grad_norm": 2.3484537601470947, + "learning_rate": 4.9687381829409616e-05, + "loss": 5.8286, + "step": 8474 + }, + { + "epoch": 0.05040322580645161, + "grad_norm": 1.8243837356567383, + "learning_rate": 4.968730818766373e-05, + "loss": 6.014, + "step": 8475 + }, + { + "epoch": 0.05040917308973261, + "grad_norm": 1.8149470090866089, + "learning_rate": 4.9687234537299765e-05, + "loss": 5.9723, + "step": 8476 + }, + { + "epoch": 0.05041512037301361, + "grad_norm": 2.400754451751709, + "learning_rate": 4.968716087831773e-05, + "loss": 5.237, + "step": 8477 + }, + { + "epoch": 0.0504210676562946, + "grad_norm": 2.4394338130950928, + "learning_rate": 4.968708721071767e-05, + "loss": 5.1106, + "step": 8478 + }, + { + "epoch": 0.050427014939575604, + "grad_norm": 2.210686445236206, + "learning_rate": 4.96870135344996e-05, + "loss": 5.0002, + "step": 8479 + }, + { + "epoch": 0.0504329622228566, + "grad_norm": 2.302997589111328, + "learning_rate": 4.968693984966355e-05, + "loss": 5.689, + "step": 8480 + }, + { + "epoch": 0.050438909506137594, + "grad_norm": 2.0761525630950928, + "learning_rate": 4.9686866156209546e-05, + "loss": 5.4452, + "step": 8481 + }, + { + "epoch": 0.050444856789418596, + "grad_norm": 2.3239383697509766, + "learning_rate": 4.968679245413761e-05, + "loss": 5.4427, + "step": 8482 + }, + { + "epoch": 0.05045080407269959, + "grad_norm": 3.2064802646636963, + "learning_rate": 4.9686718743447766e-05, + "loss": 5.2947, + "step": 8483 + }, + { + "epoch": 0.050456751355980586, + "grad_norm": 2.680786371231079, + "learning_rate": 4.968664502414004e-05, + "loss": 5.4776, + "step": 8484 + }, + { + "epoch": 0.05046269863926159, + "grad_norm": 2.107583522796631, + "learning_rate": 4.9686571296214476e-05, + "loss": 5.5172, + "step": 8485 + }, + { + "epoch": 0.050468645922542583, + "grad_norm": 1.939788579940796, + "learning_rate": 4.9686497559671075e-05, + "loss": 5.6056, + "step": 8486 + }, + { + "epoch": 0.05047459320582358, + "grad_norm": 1.883991003036499, + "learning_rate": 4.968642381450987e-05, + "loss": 5.6511, + "step": 8487 + }, + { + "epoch": 0.050480540489104574, + "grad_norm": 1.8518444299697876, + "learning_rate": 4.96863500607309e-05, + "loss": 5.5897, + "step": 8488 + }, + { + "epoch": 0.050486487772385576, + "grad_norm": 1.6704350709915161, + "learning_rate": 4.968627629833418e-05, + "loss": 5.5002, + "step": 8489 + }, + { + "epoch": 0.05049243505566657, + "grad_norm": 1.755231261253357, + "learning_rate": 4.968620252731972e-05, + "loss": 5.6012, + "step": 8490 + }, + { + "epoch": 0.050498382338947566, + "grad_norm": 1.8532077074050903, + "learning_rate": 4.968612874768758e-05, + "loss": 5.4443, + "step": 8491 + }, + { + "epoch": 0.05050432962222857, + "grad_norm": 1.787781000137329, + "learning_rate": 4.9686054959437756e-05, + "loss": 5.5623, + "step": 8492 + }, + { + "epoch": 0.05051027690550956, + "grad_norm": 1.6963365077972412, + "learning_rate": 4.9685981162570295e-05, + "loss": 5.5349, + "step": 8493 + }, + { + "epoch": 0.05051622418879056, + "grad_norm": 4.328898906707764, + "learning_rate": 4.96859073570852e-05, + "loss": 5.8026, + "step": 8494 + }, + { + "epoch": 0.05052217147207156, + "grad_norm": 1.6906582117080688, + "learning_rate": 4.968583354298252e-05, + "loss": 5.4804, + "step": 8495 + }, + { + "epoch": 0.050528118755352555, + "grad_norm": 1.5316333770751953, + "learning_rate": 4.968575972026227e-05, + "loss": 5.6005, + "step": 8496 + }, + { + "epoch": 0.05053406603863355, + "grad_norm": 1.6029349565505981, + "learning_rate": 4.968568588892447e-05, + "loss": 5.5991, + "step": 8497 + }, + { + "epoch": 0.05054001332191455, + "grad_norm": 2.246537685394287, + "learning_rate": 4.968561204896916e-05, + "loss": 5.8537, + "step": 8498 + }, + { + "epoch": 0.05054596060519555, + "grad_norm": 2.0347564220428467, + "learning_rate": 4.9685538200396355e-05, + "loss": 5.7968, + "step": 8499 + }, + { + "epoch": 0.05055190788847654, + "grad_norm": 1.7635436058044434, + "learning_rate": 4.968546434320608e-05, + "loss": 5.6324, + "step": 8500 + }, + { + "epoch": 0.050557855171757544, + "grad_norm": 2.415397882461548, + "learning_rate": 4.9685390477398363e-05, + "loss": 5.3795, + "step": 8501 + }, + { + "epoch": 0.05056380245503854, + "grad_norm": 2.1499149799346924, + "learning_rate": 4.9685316602973245e-05, + "loss": 5.5638, + "step": 8502 + }, + { + "epoch": 0.050569749738319535, + "grad_norm": 2.0479557514190674, + "learning_rate": 4.9685242719930725e-05, + "loss": 5.3902, + "step": 8503 + }, + { + "epoch": 0.05057569702160053, + "grad_norm": 1.874993085861206, + "learning_rate": 4.9685168828270845e-05, + "loss": 5.4607, + "step": 8504 + }, + { + "epoch": 0.05058164430488153, + "grad_norm": 1.6361217498779297, + "learning_rate": 4.9685094927993623e-05, + "loss": 5.4378, + "step": 8505 + }, + { + "epoch": 0.05058759158816253, + "grad_norm": 1.598026990890503, + "learning_rate": 4.9685021019099096e-05, + "loss": 5.4336, + "step": 8506 + }, + { + "epoch": 0.05059353887144352, + "grad_norm": 1.7636823654174805, + "learning_rate": 4.968494710158728e-05, + "loss": 5.4757, + "step": 8507 + }, + { + "epoch": 0.050599486154724524, + "grad_norm": 1.7823325395584106, + "learning_rate": 4.968487317545821e-05, + "loss": 5.4872, + "step": 8508 + }, + { + "epoch": 0.05060543343800552, + "grad_norm": 2.39149808883667, + "learning_rate": 4.9684799240711896e-05, + "loss": 5.039, + "step": 8509 + }, + { + "epoch": 0.050611380721286514, + "grad_norm": 2.0295841693878174, + "learning_rate": 4.968472529734838e-05, + "loss": 5.1086, + "step": 8510 + }, + { + "epoch": 0.050617328004567516, + "grad_norm": 2.6830973625183105, + "learning_rate": 4.9684651345367684e-05, + "loss": 4.8889, + "step": 8511 + }, + { + "epoch": 0.05062327528784851, + "grad_norm": 2.3600027561187744, + "learning_rate": 4.9684577384769825e-05, + "loss": 5.5305, + "step": 8512 + }, + { + "epoch": 0.050629222571129506, + "grad_norm": 2.1680233478546143, + "learning_rate": 4.968450341555484e-05, + "loss": 5.8196, + "step": 8513 + }, + { + "epoch": 0.05063516985441051, + "grad_norm": 1.800645351409912, + "learning_rate": 4.968442943772275e-05, + "loss": 5.2689, + "step": 8514 + }, + { + "epoch": 0.0506411171376915, + "grad_norm": 1.983245849609375, + "learning_rate": 4.9684355451273566e-05, + "loss": 4.7782, + "step": 8515 + }, + { + "epoch": 0.0506470644209725, + "grad_norm": 2.12082576751709, + "learning_rate": 4.968428145620735e-05, + "loss": 4.7946, + "step": 8516 + }, + { + "epoch": 0.050653011704253494, + "grad_norm": 1.7249135971069336, + "learning_rate": 4.968420745252409e-05, + "loss": 4.7055, + "step": 8517 + }, + { + "epoch": 0.050658958987534496, + "grad_norm": 1.971240758895874, + "learning_rate": 4.968413344022384e-05, + "loss": 4.7343, + "step": 8518 + }, + { + "epoch": 0.05066490627081549, + "grad_norm": 1.780387282371521, + "learning_rate": 4.968405941930661e-05, + "loss": 4.7502, + "step": 8519 + }, + { + "epoch": 0.050670853554096486, + "grad_norm": 1.772007942199707, + "learning_rate": 4.968398538977242e-05, + "loss": 4.7439, + "step": 8520 + }, + { + "epoch": 0.05067680083737749, + "grad_norm": 1.9167592525482178, + "learning_rate": 4.9683911351621324e-05, + "loss": 4.6393, + "step": 8521 + }, + { + "epoch": 0.05068274812065848, + "grad_norm": 2.0527031421661377, + "learning_rate": 4.968383730485331e-05, + "loss": 4.6379, + "step": 8522 + }, + { + "epoch": 0.05068869540393948, + "grad_norm": 2.0608508586883545, + "learning_rate": 4.968376324946844e-05, + "loss": 4.6128, + "step": 8523 + }, + { + "epoch": 0.05069464268722048, + "grad_norm": 1.984731674194336, + "learning_rate": 4.968368918546672e-05, + "loss": 4.5969, + "step": 8524 + }, + { + "epoch": 0.050700589970501475, + "grad_norm": 1.7904438972473145, + "learning_rate": 4.968361511284817e-05, + "loss": 4.6853, + "step": 8525 + }, + { + "epoch": 0.05070653725378247, + "grad_norm": 1.8095389604568481, + "learning_rate": 4.968354103161283e-05, + "loss": 4.5748, + "step": 8526 + }, + { + "epoch": 0.05071248453706347, + "grad_norm": 1.8565012216567993, + "learning_rate": 4.968346694176073e-05, + "loss": 4.5249, + "step": 8527 + }, + { + "epoch": 0.05071843182034447, + "grad_norm": 1.7721836566925049, + "learning_rate": 4.968339284329188e-05, + "loss": 4.6593, + "step": 8528 + }, + { + "epoch": 0.05072437910362546, + "grad_norm": 1.9470161199569702, + "learning_rate": 4.968331873620631e-05, + "loss": 4.5432, + "step": 8529 + }, + { + "epoch": 0.050730326386906464, + "grad_norm": 1.8639118671417236, + "learning_rate": 4.968324462050404e-05, + "loss": 4.4464, + "step": 8530 + }, + { + "epoch": 0.05073627367018746, + "grad_norm": 1.9226467609405518, + "learning_rate": 4.9683170496185114e-05, + "loss": 4.4364, + "step": 8531 + }, + { + "epoch": 0.050742220953468455, + "grad_norm": 1.988198161125183, + "learning_rate": 4.9683096363249545e-05, + "loss": 4.6614, + "step": 8532 + }, + { + "epoch": 0.05074816823674945, + "grad_norm": 1.903645396232605, + "learning_rate": 4.9683022221697374e-05, + "loss": 4.5168, + "step": 8533 + }, + { + "epoch": 0.05075411552003045, + "grad_norm": 1.903448224067688, + "learning_rate": 4.96829480715286e-05, + "loss": 4.5899, + "step": 8534 + }, + { + "epoch": 0.05076006280331145, + "grad_norm": 1.864522099494934, + "learning_rate": 4.9682873912743274e-05, + "loss": 4.5896, + "step": 8535 + }, + { + "epoch": 0.05076601008659244, + "grad_norm": 1.8760302066802979, + "learning_rate": 4.9682799745341406e-05, + "loss": 4.593, + "step": 8536 + }, + { + "epoch": 0.050771957369873444, + "grad_norm": 1.9024009704589844, + "learning_rate": 4.968272556932303e-05, + "loss": 4.9861, + "step": 8537 + }, + { + "epoch": 0.05077790465315444, + "grad_norm": 2.190634250640869, + "learning_rate": 4.9682651384688176e-05, + "loss": 5.6755, + "step": 8538 + }, + { + "epoch": 0.050783851936435434, + "grad_norm": 1.758934736251831, + "learning_rate": 4.9682577191436854e-05, + "loss": 5.4334, + "step": 8539 + }, + { + "epoch": 0.050789799219716436, + "grad_norm": 2.3531200885772705, + "learning_rate": 4.968250298956909e-05, + "loss": 4.9819, + "step": 8540 + }, + { + "epoch": 0.05079574650299743, + "grad_norm": 1.901681661605835, + "learning_rate": 4.968242877908494e-05, + "loss": 5.1642, + "step": 8541 + }, + { + "epoch": 0.050801693786278426, + "grad_norm": 1.7250633239746094, + "learning_rate": 4.96823545599844e-05, + "loss": 5.4847, + "step": 8542 + }, + { + "epoch": 0.05080764106955943, + "grad_norm": 1.7400966882705688, + "learning_rate": 4.968228033226751e-05, + "loss": 5.5902, + "step": 8543 + }, + { + "epoch": 0.05081358835284042, + "grad_norm": 1.5469578504562378, + "learning_rate": 4.968220609593428e-05, + "loss": 5.6432, + "step": 8544 + }, + { + "epoch": 0.05081953563612142, + "grad_norm": 1.8277182579040527, + "learning_rate": 4.968213185098475e-05, + "loss": 5.3296, + "step": 8545 + }, + { + "epoch": 0.050825482919402414, + "grad_norm": 2.0535261631011963, + "learning_rate": 4.9682057597418943e-05, + "loss": 5.5278, + "step": 8546 + }, + { + "epoch": 0.050831430202683416, + "grad_norm": 1.8631746768951416, + "learning_rate": 4.9681983335236894e-05, + "loss": 5.556, + "step": 8547 + }, + { + "epoch": 0.05083737748596441, + "grad_norm": 1.6663711071014404, + "learning_rate": 4.968190906443861e-05, + "loss": 5.4321, + "step": 8548 + }, + { + "epoch": 0.050843324769245406, + "grad_norm": 1.8302260637283325, + "learning_rate": 4.968183478502413e-05, + "loss": 5.4746, + "step": 8549 + }, + { + "epoch": 0.05084927205252641, + "grad_norm": 1.9203182458877563, + "learning_rate": 4.968176049699347e-05, + "loss": 5.4334, + "step": 8550 + }, + { + "epoch": 0.0508552193358074, + "grad_norm": 2.0406670570373535, + "learning_rate": 4.9681686200346674e-05, + "loss": 5.6509, + "step": 8551 + }, + { + "epoch": 0.0508611666190884, + "grad_norm": 2.3438572883605957, + "learning_rate": 4.968161189508374e-05, + "loss": 5.8662, + "step": 8552 + }, + { + "epoch": 0.0508671139023694, + "grad_norm": 1.9612985849380493, + "learning_rate": 4.968153758120473e-05, + "loss": 5.6813, + "step": 8553 + }, + { + "epoch": 0.050873061185650395, + "grad_norm": 1.4175993204116821, + "learning_rate": 4.968146325870964e-05, + "loss": 5.4593, + "step": 8554 + }, + { + "epoch": 0.05087900846893139, + "grad_norm": 1.3445212841033936, + "learning_rate": 4.96813889275985e-05, + "loss": 5.4195, + "step": 8555 + }, + { + "epoch": 0.05088495575221239, + "grad_norm": 1.9938427209854126, + "learning_rate": 4.968131458787135e-05, + "loss": 5.8791, + "step": 8556 + }, + { + "epoch": 0.05089090303549339, + "grad_norm": 1.7449276447296143, + "learning_rate": 4.9681240239528216e-05, + "loss": 5.3574, + "step": 8557 + }, + { + "epoch": 0.05089685031877438, + "grad_norm": 2.0117087364196777, + "learning_rate": 4.96811658825691e-05, + "loss": 5.3548, + "step": 8558 + }, + { + "epoch": 0.050902797602055384, + "grad_norm": 1.97372567653656, + "learning_rate": 4.968109151699406e-05, + "loss": 5.5281, + "step": 8559 + }, + { + "epoch": 0.05090874488533638, + "grad_norm": 1.8815237283706665, + "learning_rate": 4.9681017142803095e-05, + "loss": 5.4849, + "step": 8560 + }, + { + "epoch": 0.050914692168617375, + "grad_norm": 1.627252221107483, + "learning_rate": 4.968094275999624e-05, + "loss": 5.2125, + "step": 8561 + }, + { + "epoch": 0.05092063945189837, + "grad_norm": 1.4768601655960083, + "learning_rate": 4.968086836857353e-05, + "loss": 5.0817, + "step": 8562 + }, + { + "epoch": 0.05092658673517937, + "grad_norm": 2.0249485969543457, + "learning_rate": 4.968079396853498e-05, + "loss": 5.4025, + "step": 8563 + }, + { + "epoch": 0.05093253401846037, + "grad_norm": 2.0904550552368164, + "learning_rate": 4.968071955988062e-05, + "loss": 5.4404, + "step": 8564 + }, + { + "epoch": 0.05093848130174136, + "grad_norm": 1.935063123703003, + "learning_rate": 4.9680645142610475e-05, + "loss": 5.4961, + "step": 8565 + }, + { + "epoch": 0.050944428585022364, + "grad_norm": 1.9836292266845703, + "learning_rate": 4.968057071672457e-05, + "loss": 5.2469, + "step": 8566 + }, + { + "epoch": 0.05095037586830336, + "grad_norm": 1.8337205648422241, + "learning_rate": 4.9680496282222944e-05, + "loss": 5.4432, + "step": 8567 + }, + { + "epoch": 0.050956323151584354, + "grad_norm": 1.9169154167175293, + "learning_rate": 4.9680421839105604e-05, + "loss": 5.2606, + "step": 8568 + }, + { + "epoch": 0.050962270434865356, + "grad_norm": 1.5869332551956177, + "learning_rate": 4.968034738737258e-05, + "loss": 5.006, + "step": 8569 + }, + { + "epoch": 0.05096821771814635, + "grad_norm": 1.5824979543685913, + "learning_rate": 4.968027292702391e-05, + "loss": 5.2078, + "step": 8570 + }, + { + "epoch": 0.050974165001427346, + "grad_norm": 1.7121458053588867, + "learning_rate": 4.96801984580596e-05, + "loss": 5.3913, + "step": 8571 + }, + { + "epoch": 0.05098011228470835, + "grad_norm": 1.7111082077026367, + "learning_rate": 4.96801239804797e-05, + "loss": 5.3957, + "step": 8572 + }, + { + "epoch": 0.05098605956798934, + "grad_norm": 1.834083080291748, + "learning_rate": 4.968004949428421e-05, + "loss": 5.501, + "step": 8573 + }, + { + "epoch": 0.05099200685127034, + "grad_norm": 1.773421287536621, + "learning_rate": 4.967997499947318e-05, + "loss": 5.429, + "step": 8574 + }, + { + "epoch": 0.05099795413455134, + "grad_norm": 1.7471132278442383, + "learning_rate": 4.967990049604663e-05, + "loss": 5.4853, + "step": 8575 + }, + { + "epoch": 0.051003901417832335, + "grad_norm": 1.7264289855957031, + "learning_rate": 4.967982598400457e-05, + "loss": 5.4415, + "step": 8576 + }, + { + "epoch": 0.05100984870111333, + "grad_norm": 1.750982403755188, + "learning_rate": 4.9679751463347044e-05, + "loss": 5.1731, + "step": 8577 + }, + { + "epoch": 0.051015795984394326, + "grad_norm": 1.6106518507003784, + "learning_rate": 4.967967693407407e-05, + "loss": 5.2692, + "step": 8578 + }, + { + "epoch": 0.05102174326767533, + "grad_norm": 1.8728212118148804, + "learning_rate": 4.967960239618568e-05, + "loss": 5.2416, + "step": 8579 + }, + { + "epoch": 0.05102769055095632, + "grad_norm": 1.6410562992095947, + "learning_rate": 4.967952784968189e-05, + "loss": 5.1824, + "step": 8580 + }, + { + "epoch": 0.05103363783423732, + "grad_norm": 1.7119427919387817, + "learning_rate": 4.967945329456274e-05, + "loss": 5.2316, + "step": 8581 + }, + { + "epoch": 0.05103958511751832, + "grad_norm": 1.667602300643921, + "learning_rate": 4.967937873082824e-05, + "loss": 4.9599, + "step": 8582 + }, + { + "epoch": 0.051045532400799315, + "grad_norm": 1.9595974683761597, + "learning_rate": 4.967930415847842e-05, + "loss": 4.9613, + "step": 8583 + }, + { + "epoch": 0.05105147968408031, + "grad_norm": 1.70210862159729, + "learning_rate": 4.967922957751332e-05, + "loss": 5.3587, + "step": 8584 + }, + { + "epoch": 0.05105742696736131, + "grad_norm": 2.101145029067993, + "learning_rate": 4.967915498793295e-05, + "loss": 5.2782, + "step": 8585 + }, + { + "epoch": 0.05106337425064231, + "grad_norm": 1.8836926221847534, + "learning_rate": 4.9679080389737344e-05, + "loss": 5.3128, + "step": 8586 + }, + { + "epoch": 0.0510693215339233, + "grad_norm": 1.7542184591293335, + "learning_rate": 4.967900578292652e-05, + "loss": 5.2236, + "step": 8587 + }, + { + "epoch": 0.051075268817204304, + "grad_norm": 1.8415964841842651, + "learning_rate": 4.967893116750052e-05, + "loss": 5.1267, + "step": 8588 + }, + { + "epoch": 0.0510812161004853, + "grad_norm": 1.7702316045761108, + "learning_rate": 4.967885654345936e-05, + "loss": 5.6495, + "step": 8589 + }, + { + "epoch": 0.051087163383766294, + "grad_norm": 1.7790406942367554, + "learning_rate": 4.967878191080306e-05, + "loss": 5.2561, + "step": 8590 + }, + { + "epoch": 0.05109311066704729, + "grad_norm": 1.7282217741012573, + "learning_rate": 4.967870726953165e-05, + "loss": 5.2589, + "step": 8591 + }, + { + "epoch": 0.05109905795032829, + "grad_norm": 1.6590560674667358, + "learning_rate": 4.967863261964517e-05, + "loss": 5.1952, + "step": 8592 + }, + { + "epoch": 0.05110500523360929, + "grad_norm": 1.5948386192321777, + "learning_rate": 4.9678557961143625e-05, + "loss": 5.297, + "step": 8593 + }, + { + "epoch": 0.05111095251689028, + "grad_norm": 1.8219022750854492, + "learning_rate": 4.9678483294027046e-05, + "loss": 5.3391, + "step": 8594 + }, + { + "epoch": 0.051116899800171284, + "grad_norm": 1.547616720199585, + "learning_rate": 4.967840861829547e-05, + "loss": 5.4224, + "step": 8595 + }, + { + "epoch": 0.05112284708345228, + "grad_norm": 1.7924590110778809, + "learning_rate": 4.9678333933948914e-05, + "loss": 5.2371, + "step": 8596 + }, + { + "epoch": 0.051128794366733274, + "grad_norm": 1.7630747556686401, + "learning_rate": 4.9678259240987416e-05, + "loss": 5.4849, + "step": 8597 + }, + { + "epoch": 0.051134741650014276, + "grad_norm": 1.7853891849517822, + "learning_rate": 4.967818453941098e-05, + "loss": 5.1753, + "step": 8598 + }, + { + "epoch": 0.05114068893329527, + "grad_norm": 1.6572301387786865, + "learning_rate": 4.9678109829219654e-05, + "loss": 5.3747, + "step": 8599 + }, + { + "epoch": 0.051146636216576266, + "grad_norm": 1.6574329137802124, + "learning_rate": 4.9678035110413445e-05, + "loss": 5.417, + "step": 8600 + }, + { + "epoch": 0.05115258349985727, + "grad_norm": 1.7093894481658936, + "learning_rate": 4.9677960382992396e-05, + "loss": 5.4605, + "step": 8601 + }, + { + "epoch": 0.05115853078313826, + "grad_norm": 1.6304559707641602, + "learning_rate": 4.967788564695652e-05, + "loss": 5.6186, + "step": 8602 + }, + { + "epoch": 0.05116447806641926, + "grad_norm": 1.6134929656982422, + "learning_rate": 4.967781090230586e-05, + "loss": 5.5084, + "step": 8603 + }, + { + "epoch": 0.05117042534970026, + "grad_norm": 1.7007251977920532, + "learning_rate": 4.9677736149040426e-05, + "loss": 5.2542, + "step": 8604 + }, + { + "epoch": 0.051176372632981255, + "grad_norm": 1.6648818254470825, + "learning_rate": 4.967766138716025e-05, + "loss": 5.4136, + "step": 8605 + }, + { + "epoch": 0.05118231991626225, + "grad_norm": 1.5595816373825073, + "learning_rate": 4.967758661666535e-05, + "loss": 5.181, + "step": 8606 + }, + { + "epoch": 0.051188267199543246, + "grad_norm": 1.7358763217926025, + "learning_rate": 4.967751183755577e-05, + "loss": 5.3509, + "step": 8607 + }, + { + "epoch": 0.05119421448282425, + "grad_norm": 1.6836191415786743, + "learning_rate": 4.967743704983152e-05, + "loss": 5.4656, + "step": 8608 + }, + { + "epoch": 0.05120016176610524, + "grad_norm": 1.4641087055206299, + "learning_rate": 4.967736225349263e-05, + "loss": 5.5304, + "step": 8609 + }, + { + "epoch": 0.05120610904938624, + "grad_norm": 1.6273541450500488, + "learning_rate": 4.967728744853913e-05, + "loss": 5.4029, + "step": 8610 + }, + { + "epoch": 0.05121205633266724, + "grad_norm": 1.6471314430236816, + "learning_rate": 4.967721263497105e-05, + "loss": 5.4333, + "step": 8611 + }, + { + "epoch": 0.051218003615948235, + "grad_norm": 1.798155665397644, + "learning_rate": 4.96771378127884e-05, + "loss": 5.5214, + "step": 8612 + }, + { + "epoch": 0.05122395089922923, + "grad_norm": 1.8606700897216797, + "learning_rate": 4.967706298199122e-05, + "loss": 4.8808, + "step": 8613 + }, + { + "epoch": 0.05122989818251023, + "grad_norm": 1.7144849300384521, + "learning_rate": 4.967698814257953e-05, + "loss": 4.9451, + "step": 8614 + }, + { + "epoch": 0.05123584546579123, + "grad_norm": 1.7411640882492065, + "learning_rate": 4.9676913294553364e-05, + "loss": 4.9771, + "step": 8615 + }, + { + "epoch": 0.05124179274907222, + "grad_norm": 1.7012072801589966, + "learning_rate": 4.9676838437912736e-05, + "loss": 4.9028, + "step": 8616 + }, + { + "epoch": 0.051247740032353224, + "grad_norm": 1.8154243230819702, + "learning_rate": 4.967676357265768e-05, + "loss": 5.4115, + "step": 8617 + }, + { + "epoch": 0.05125368731563422, + "grad_norm": 2.7746822834014893, + "learning_rate": 4.967668869878823e-05, + "loss": 5.5487, + "step": 8618 + }, + { + "epoch": 0.051259634598915214, + "grad_norm": 1.8362152576446533, + "learning_rate": 4.9676613816304395e-05, + "loss": 5.486, + "step": 8619 + }, + { + "epoch": 0.05126558188219621, + "grad_norm": 1.975853681564331, + "learning_rate": 4.967653892520621e-05, + "loss": 5.4348, + "step": 8620 + }, + { + "epoch": 0.05127152916547721, + "grad_norm": 1.8126581907272339, + "learning_rate": 4.96764640254937e-05, + "loss": 5.4558, + "step": 8621 + }, + { + "epoch": 0.05127747644875821, + "grad_norm": 1.6068531274795532, + "learning_rate": 4.967638911716689e-05, + "loss": 5.4672, + "step": 8622 + }, + { + "epoch": 0.0512834237320392, + "grad_norm": 1.6384878158569336, + "learning_rate": 4.9676314200225804e-05, + "loss": 5.1591, + "step": 8623 + }, + { + "epoch": 0.051289371015320204, + "grad_norm": 2.0413742065429688, + "learning_rate": 4.9676239274670474e-05, + "loss": 4.8992, + "step": 8624 + }, + { + "epoch": 0.0512953182986012, + "grad_norm": 1.7591389417648315, + "learning_rate": 4.967616434050093e-05, + "loss": 5.3629, + "step": 8625 + }, + { + "epoch": 0.051301265581882194, + "grad_norm": 1.9222301244735718, + "learning_rate": 4.967608939771719e-05, + "loss": 5.5082, + "step": 8626 + }, + { + "epoch": 0.051307212865163196, + "grad_norm": 1.8040579557418823, + "learning_rate": 4.967601444631928e-05, + "loss": 5.4019, + "step": 8627 + }, + { + "epoch": 0.05131316014844419, + "grad_norm": 2.0685603618621826, + "learning_rate": 4.967593948630723e-05, + "loss": 5.1959, + "step": 8628 + }, + { + "epoch": 0.051319107431725186, + "grad_norm": 1.446341872215271, + "learning_rate": 4.967586451768106e-05, + "loss": 5.4233, + "step": 8629 + }, + { + "epoch": 0.05132505471500619, + "grad_norm": 1.4487289190292358, + "learning_rate": 4.9675789540440806e-05, + "loss": 5.4065, + "step": 8630 + }, + { + "epoch": 0.05133100199828718, + "grad_norm": 2.367469310760498, + "learning_rate": 4.967571455458648e-05, + "loss": 5.3512, + "step": 8631 + }, + { + "epoch": 0.05133694928156818, + "grad_norm": 2.7115249633789062, + "learning_rate": 4.967563956011812e-05, + "loss": 5.4494, + "step": 8632 + }, + { + "epoch": 0.05134289656484918, + "grad_norm": 2.6692097187042236, + "learning_rate": 4.967556455703576e-05, + "loss": 5.2747, + "step": 8633 + }, + { + "epoch": 0.051348843848130175, + "grad_norm": 2.516005754470825, + "learning_rate": 4.967548954533941e-05, + "loss": 5.2305, + "step": 8634 + }, + { + "epoch": 0.05135479113141117, + "grad_norm": 1.6234782934188843, + "learning_rate": 4.96754145250291e-05, + "loss": 5.5192, + "step": 8635 + }, + { + "epoch": 0.051360738414692166, + "grad_norm": 1.9273806810379028, + "learning_rate": 4.9675339496104855e-05, + "loss": 5.4479, + "step": 8636 + }, + { + "epoch": 0.05136668569797317, + "grad_norm": 2.510847568511963, + "learning_rate": 4.967526445856671e-05, + "loss": 4.9858, + "step": 8637 + }, + { + "epoch": 0.05137263298125416, + "grad_norm": 2.3722991943359375, + "learning_rate": 4.967518941241468e-05, + "loss": 5.2287, + "step": 8638 + }, + { + "epoch": 0.05137858026453516, + "grad_norm": 2.286569118499756, + "learning_rate": 4.96751143576488e-05, + "loss": 5.2643, + "step": 8639 + }, + { + "epoch": 0.05138452754781616, + "grad_norm": 2.493534803390503, + "learning_rate": 4.9675039294269086e-05, + "loss": 5.1207, + "step": 8640 + }, + { + "epoch": 0.051390474831097155, + "grad_norm": 2.622694969177246, + "learning_rate": 4.967496422227558e-05, + "loss": 4.9735, + "step": 8641 + }, + { + "epoch": 0.05139642211437815, + "grad_norm": 1.7518365383148193, + "learning_rate": 4.967488914166829e-05, + "loss": 5.8818, + "step": 8642 + }, + { + "epoch": 0.05140236939765915, + "grad_norm": 2.0281870365142822, + "learning_rate": 4.9674814052447256e-05, + "loss": 6.3773, + "step": 8643 + }, + { + "epoch": 0.05140831668094015, + "grad_norm": 1.880083441734314, + "learning_rate": 4.96747389546125e-05, + "loss": 5.831, + "step": 8644 + }, + { + "epoch": 0.05141426396422114, + "grad_norm": 2.0792593955993652, + "learning_rate": 4.967466384816404e-05, + "loss": 5.8799, + "step": 8645 + }, + { + "epoch": 0.051420211247502144, + "grad_norm": 2.4550280570983887, + "learning_rate": 4.967458873310192e-05, + "loss": 5.2983, + "step": 8646 + }, + { + "epoch": 0.05142615853078314, + "grad_norm": 2.5590765476226807, + "learning_rate": 4.967451360942615e-05, + "loss": 5.1157, + "step": 8647 + }, + { + "epoch": 0.051432105814064134, + "grad_norm": 2.2328450679779053, + "learning_rate": 4.967443847713677e-05, + "loss": 5.047, + "step": 8648 + }, + { + "epoch": 0.05143805309734513, + "grad_norm": 2.0624022483825684, + "learning_rate": 4.9674363336233786e-05, + "loss": 5.6819, + "step": 8649 + }, + { + "epoch": 0.05144400038062613, + "grad_norm": 2.075239658355713, + "learning_rate": 4.9674288186717246e-05, + "loss": 5.895, + "step": 8650 + }, + { + "epoch": 0.05144994766390713, + "grad_norm": 1.7228562831878662, + "learning_rate": 4.967421302858716e-05, + "loss": 5.9199, + "step": 8651 + }, + { + "epoch": 0.05145589494718812, + "grad_norm": 2.235020637512207, + "learning_rate": 4.967413786184356e-05, + "loss": 5.0644, + "step": 8652 + }, + { + "epoch": 0.051461842230469124, + "grad_norm": 1.8620972633361816, + "learning_rate": 4.967406268648648e-05, + "loss": 5.7956, + "step": 8653 + }, + { + "epoch": 0.05146778951375012, + "grad_norm": 1.7914378643035889, + "learning_rate": 4.967398750251594e-05, + "loss": 5.742, + "step": 8654 + }, + { + "epoch": 0.051473736797031114, + "grad_norm": 2.0010504722595215, + "learning_rate": 4.967391230993196e-05, + "loss": 5.7808, + "step": 8655 + }, + { + "epoch": 0.051479684080312116, + "grad_norm": 2.1851212978363037, + "learning_rate": 4.9673837108734575e-05, + "loss": 5.4217, + "step": 8656 + }, + { + "epoch": 0.05148563136359311, + "grad_norm": 1.6896641254425049, + "learning_rate": 4.967376189892382e-05, + "loss": 6.321, + "step": 8657 + }, + { + "epoch": 0.051491578646874106, + "grad_norm": 1.7083675861358643, + "learning_rate": 4.967368668049969e-05, + "loss": 5.495, + "step": 8658 + }, + { + "epoch": 0.05149752593015511, + "grad_norm": 2.537256956100464, + "learning_rate": 4.967361145346224e-05, + "loss": 5.4096, + "step": 8659 + }, + { + "epoch": 0.0515034732134361, + "grad_norm": 2.3463892936706543, + "learning_rate": 4.967353621781149e-05, + "loss": 6.2461, + "step": 8660 + }, + { + "epoch": 0.0515094204967171, + "grad_norm": 1.6834701299667358, + "learning_rate": 4.967346097354746e-05, + "loss": 6.1007, + "step": 8661 + }, + { + "epoch": 0.0515153677799981, + "grad_norm": 2.140557289123535, + "learning_rate": 4.9673385720670184e-05, + "loss": 5.9908, + "step": 8662 + }, + { + "epoch": 0.051521315063279095, + "grad_norm": 2.211639165878296, + "learning_rate": 4.9673310459179676e-05, + "loss": 6.4192, + "step": 8663 + }, + { + "epoch": 0.05152726234656009, + "grad_norm": 1.8421399593353271, + "learning_rate": 4.9673235189075975e-05, + "loss": 6.099, + "step": 8664 + }, + { + "epoch": 0.051533209629841085, + "grad_norm": 1.7775965929031372, + "learning_rate": 4.96731599103591e-05, + "loss": 5.9572, + "step": 8665 + }, + { + "epoch": 0.05153915691312209, + "grad_norm": 1.7500132322311401, + "learning_rate": 4.967308462302909e-05, + "loss": 6.0987, + "step": 8666 + }, + { + "epoch": 0.05154510419640308, + "grad_norm": 1.7952892780303955, + "learning_rate": 4.967300932708595e-05, + "loss": 6.0235, + "step": 8667 + }, + { + "epoch": 0.05155105147968408, + "grad_norm": 1.7696008682250977, + "learning_rate": 4.967293402252972e-05, + "loss": 5.8253, + "step": 8668 + }, + { + "epoch": 0.05155699876296508, + "grad_norm": 1.848975419998169, + "learning_rate": 4.967285870936042e-05, + "loss": 6.0942, + "step": 8669 + }, + { + "epoch": 0.051562946046246075, + "grad_norm": 2.412909507751465, + "learning_rate": 4.967278338757808e-05, + "loss": 5.5752, + "step": 8670 + }, + { + "epoch": 0.05156889332952707, + "grad_norm": 2.0214738845825195, + "learning_rate": 4.967270805718273e-05, + "loss": 5.5721, + "step": 8671 + }, + { + "epoch": 0.05157484061280807, + "grad_norm": 2.3830201625823975, + "learning_rate": 4.967263271817439e-05, + "loss": 6.034, + "step": 8672 + }, + { + "epoch": 0.05158078789608907, + "grad_norm": 2.213979959487915, + "learning_rate": 4.9672557370553094e-05, + "loss": 6.0169, + "step": 8673 + }, + { + "epoch": 0.05158673517937006, + "grad_norm": 1.9657354354858398, + "learning_rate": 4.967248201431887e-05, + "loss": 6.0159, + "step": 8674 + }, + { + "epoch": 0.051592682462651064, + "grad_norm": 2.0882673263549805, + "learning_rate": 4.967240664947172e-05, + "loss": 6.1088, + "step": 8675 + }, + { + "epoch": 0.05159862974593206, + "grad_norm": 2.291152000427246, + "learning_rate": 4.96723312760117e-05, + "loss": 5.4534, + "step": 8676 + }, + { + "epoch": 0.051604577029213054, + "grad_norm": 2.3495421409606934, + "learning_rate": 4.967225589393881e-05, + "loss": 5.5524, + "step": 8677 + }, + { + "epoch": 0.05161052431249405, + "grad_norm": 2.2665255069732666, + "learning_rate": 4.9672180503253106e-05, + "loss": 5.5208, + "step": 8678 + }, + { + "epoch": 0.05161647159577505, + "grad_norm": 2.1587207317352295, + "learning_rate": 4.9672105103954594e-05, + "loss": 5.7016, + "step": 8679 + }, + { + "epoch": 0.051622418879056046, + "grad_norm": 2.2260420322418213, + "learning_rate": 4.96720296960433e-05, + "loss": 5.6179, + "step": 8680 + }, + { + "epoch": 0.05162836616233704, + "grad_norm": 3.1678147315979004, + "learning_rate": 4.967195427951926e-05, + "loss": 5.4655, + "step": 8681 + }, + { + "epoch": 0.051634313445618044, + "grad_norm": 3.0126166343688965, + "learning_rate": 4.967187885438249e-05, + "loss": 5.5663, + "step": 8682 + }, + { + "epoch": 0.05164026072889904, + "grad_norm": 2.290069341659546, + "learning_rate": 4.9671803420633034e-05, + "loss": 5.7462, + "step": 8683 + }, + { + "epoch": 0.051646208012180034, + "grad_norm": 2.1958532333374023, + "learning_rate": 4.96717279782709e-05, + "loss": 5.8359, + "step": 8684 + }, + { + "epoch": 0.051652155295461036, + "grad_norm": 2.063312530517578, + "learning_rate": 4.967165252729611e-05, + "loss": 5.847, + "step": 8685 + }, + { + "epoch": 0.05165810257874203, + "grad_norm": 1.8041539192199707, + "learning_rate": 4.967157706770872e-05, + "loss": 5.9408, + "step": 8686 + }, + { + "epoch": 0.051664049862023026, + "grad_norm": 1.684831976890564, + "learning_rate": 4.967150159950873e-05, + "loss": 6.019, + "step": 8687 + }, + { + "epoch": 0.05166999714530403, + "grad_norm": 2.4915740489959717, + "learning_rate": 4.967142612269616e-05, + "loss": 5.357, + "step": 8688 + }, + { + "epoch": 0.05167594442858502, + "grad_norm": 2.2621138095855713, + "learning_rate": 4.967135063727106e-05, + "loss": 5.7726, + "step": 8689 + }, + { + "epoch": 0.05168189171186602, + "grad_norm": 1.9304747581481934, + "learning_rate": 4.967127514323345e-05, + "loss": 6.0958, + "step": 8690 + }, + { + "epoch": 0.05168783899514702, + "grad_norm": 1.7657890319824219, + "learning_rate": 4.9671199640583354e-05, + "loss": 6.1036, + "step": 8691 + }, + { + "epoch": 0.051693786278428015, + "grad_norm": 1.7449486255645752, + "learning_rate": 4.9671124129320794e-05, + "loss": 6.0843, + "step": 8692 + }, + { + "epoch": 0.05169973356170901, + "grad_norm": 2.0155117511749268, + "learning_rate": 4.96710486094458e-05, + "loss": 5.9626, + "step": 8693 + }, + { + "epoch": 0.051705680844990005, + "grad_norm": 2.1015188694000244, + "learning_rate": 4.967097308095839e-05, + "loss": 5.6053, + "step": 8694 + }, + { + "epoch": 0.05171162812827101, + "grad_norm": 1.9602909088134766, + "learning_rate": 4.967089754385861e-05, + "loss": 5.1988, + "step": 8695 + }, + { + "epoch": 0.051717575411552, + "grad_norm": 2.141657590866089, + "learning_rate": 4.9670821998146474e-05, + "loss": 5.2994, + "step": 8696 + }, + { + "epoch": 0.051723522694833, + "grad_norm": 2.1301774978637695, + "learning_rate": 4.9670746443822006e-05, + "loss": 5.7935, + "step": 8697 + }, + { + "epoch": 0.051729469978114, + "grad_norm": 1.9465678930282593, + "learning_rate": 4.9670670880885225e-05, + "loss": 5.1861, + "step": 8698 + }, + { + "epoch": 0.051735417261394995, + "grad_norm": 2.177234411239624, + "learning_rate": 4.967059530933618e-05, + "loss": 5.1114, + "step": 8699 + }, + { + "epoch": 0.05174136454467599, + "grad_norm": 2.0886077880859375, + "learning_rate": 4.967051972917488e-05, + "loss": 5.2905, + "step": 8700 + }, + { + "epoch": 0.05174731182795699, + "grad_norm": 1.8517125844955444, + "learning_rate": 4.967044414040136e-05, + "loss": 5.1672, + "step": 8701 + }, + { + "epoch": 0.05175325911123799, + "grad_norm": 1.7342808246612549, + "learning_rate": 4.967036854301564e-05, + "loss": 5.2767, + "step": 8702 + }, + { + "epoch": 0.05175920639451898, + "grad_norm": 1.7315362691879272, + "learning_rate": 4.9670292937017746e-05, + "loss": 5.2897, + "step": 8703 + }, + { + "epoch": 0.051765153677799984, + "grad_norm": 1.8794540166854858, + "learning_rate": 4.967021732240772e-05, + "loss": 5.3808, + "step": 8704 + }, + { + "epoch": 0.05177110096108098, + "grad_norm": 1.8047478199005127, + "learning_rate": 4.9670141699185565e-05, + "loss": 5.1074, + "step": 8705 + }, + { + "epoch": 0.051777048244361974, + "grad_norm": 1.699475884437561, + "learning_rate": 4.967006606735132e-05, + "loss": 5.8162, + "step": 8706 + }, + { + "epoch": 0.05178299552764297, + "grad_norm": 2.008352518081665, + "learning_rate": 4.966999042690501e-05, + "loss": 6.3593, + "step": 8707 + }, + { + "epoch": 0.05178894281092397, + "grad_norm": 1.8776370286941528, + "learning_rate": 4.966991477784667e-05, + "loss": 6.3419, + "step": 8708 + }, + { + "epoch": 0.051794890094204966, + "grad_norm": 2.018157720565796, + "learning_rate": 4.9669839120176306e-05, + "loss": 6.1927, + "step": 8709 + }, + { + "epoch": 0.05180083737748596, + "grad_norm": 1.833764910697937, + "learning_rate": 4.966976345389396e-05, + "loss": 5.0803, + "step": 8710 + }, + { + "epoch": 0.051806784660766964, + "grad_norm": 1.7809339761734009, + "learning_rate": 4.9669687778999655e-05, + "loss": 5.3891, + "step": 8711 + }, + { + "epoch": 0.05181273194404796, + "grad_norm": 1.9905017614364624, + "learning_rate": 4.966961209549341e-05, + "loss": 6.247, + "step": 8712 + }, + { + "epoch": 0.051818679227328954, + "grad_norm": 2.1396658420562744, + "learning_rate": 4.966953640337527e-05, + "loss": 6.2506, + "step": 8713 + }, + { + "epoch": 0.051824626510609956, + "grad_norm": 1.778996467590332, + "learning_rate": 4.9669460702645244e-05, + "loss": 6.1333, + "step": 8714 + }, + { + "epoch": 0.05183057379389095, + "grad_norm": 1.9936842918395996, + "learning_rate": 4.9669384993303366e-05, + "loss": 5.6486, + "step": 8715 + }, + { + "epoch": 0.051836521077171946, + "grad_norm": 1.8064475059509277, + "learning_rate": 4.9669309275349656e-05, + "loss": 6.1217, + "step": 8716 + }, + { + "epoch": 0.05184246836045295, + "grad_norm": 1.9532819986343384, + "learning_rate": 4.966923354878414e-05, + "loss": 5.5402, + "step": 8717 + }, + { + "epoch": 0.05184841564373394, + "grad_norm": 2.4843015670776367, + "learning_rate": 4.966915781360686e-05, + "loss": 4.7674, + "step": 8718 + }, + { + "epoch": 0.05185436292701494, + "grad_norm": 2.7453129291534424, + "learning_rate": 4.9669082069817835e-05, + "loss": 4.4489, + "step": 8719 + }, + { + "epoch": 0.05186031021029594, + "grad_norm": 3.0180628299713135, + "learning_rate": 4.9669006317417084e-05, + "loss": 4.1401, + "step": 8720 + }, + { + "epoch": 0.051866257493576935, + "grad_norm": 2.44638991355896, + "learning_rate": 4.966893055640464e-05, + "loss": 4.7241, + "step": 8721 + }, + { + "epoch": 0.05187220477685793, + "grad_norm": 2.0131804943084717, + "learning_rate": 4.9668854786780514e-05, + "loss": 5.6495, + "step": 8722 + }, + { + "epoch": 0.051878152060138925, + "grad_norm": 2.0331337451934814, + "learning_rate": 4.966877900854476e-05, + "loss": 5.6812, + "step": 8723 + }, + { + "epoch": 0.05188409934341993, + "grad_norm": 2.5784926414489746, + "learning_rate": 4.9668703221697385e-05, + "loss": 5.3617, + "step": 8724 + }, + { + "epoch": 0.05189004662670092, + "grad_norm": 2.599321126937866, + "learning_rate": 4.9668627426238425e-05, + "loss": 5.6273, + "step": 8725 + }, + { + "epoch": 0.05189599390998192, + "grad_norm": 2.53541898727417, + "learning_rate": 4.966855162216789e-05, + "loss": 5.2916, + "step": 8726 + }, + { + "epoch": 0.05190194119326292, + "grad_norm": 2.165160655975342, + "learning_rate": 4.9668475809485825e-05, + "loss": 5.6152, + "step": 8727 + }, + { + "epoch": 0.051907888476543915, + "grad_norm": 2.4488654136657715, + "learning_rate": 4.966839998819225e-05, + "loss": 5.4163, + "step": 8728 + }, + { + "epoch": 0.05191383575982491, + "grad_norm": 2.2756056785583496, + "learning_rate": 4.96683241582872e-05, + "loss": 5.9449, + "step": 8729 + }, + { + "epoch": 0.05191978304310591, + "grad_norm": 2.7889063358306885, + "learning_rate": 4.9668248319770683e-05, + "loss": 5.9502, + "step": 8730 + }, + { + "epoch": 0.05192573032638691, + "grad_norm": 2.620378255844116, + "learning_rate": 4.9668172472642735e-05, + "loss": 4.8344, + "step": 8731 + }, + { + "epoch": 0.0519316776096679, + "grad_norm": 2.2405688762664795, + "learning_rate": 4.9668096616903395e-05, + "loss": 5.598, + "step": 8732 + }, + { + "epoch": 0.051937624892948904, + "grad_norm": 2.3559701442718506, + "learning_rate": 4.9668020752552664e-05, + "loss": 5.7951, + "step": 8733 + }, + { + "epoch": 0.0519435721762299, + "grad_norm": 1.9856364727020264, + "learning_rate": 4.966794487959058e-05, + "loss": 5.3907, + "step": 8734 + }, + { + "epoch": 0.051949519459510894, + "grad_norm": 2.345541000366211, + "learning_rate": 4.966786899801718e-05, + "loss": 5.9875, + "step": 8735 + }, + { + "epoch": 0.05195546674279189, + "grad_norm": 2.4069056510925293, + "learning_rate": 4.9667793107832485e-05, + "loss": 6.0062, + "step": 8736 + }, + { + "epoch": 0.05196141402607289, + "grad_norm": 1.9191378355026245, + "learning_rate": 4.966771720903651e-05, + "loss": 6.1341, + "step": 8737 + }, + { + "epoch": 0.051967361309353886, + "grad_norm": 2.135986089706421, + "learning_rate": 4.9667641301629284e-05, + "loss": 5.6993, + "step": 8738 + }, + { + "epoch": 0.05197330859263488, + "grad_norm": 2.0774824619293213, + "learning_rate": 4.966756538561085e-05, + "loss": 5.9791, + "step": 8739 + }, + { + "epoch": 0.051979255875915883, + "grad_norm": 2.1451659202575684, + "learning_rate": 4.9667489460981224e-05, + "loss": 5.8181, + "step": 8740 + }, + { + "epoch": 0.05198520315919688, + "grad_norm": 2.2769901752471924, + "learning_rate": 4.966741352774043e-05, + "loss": 5.6799, + "step": 8741 + }, + { + "epoch": 0.051991150442477874, + "grad_norm": 2.22038197517395, + "learning_rate": 4.9667337585888494e-05, + "loss": 5.8781, + "step": 8742 + }, + { + "epoch": 0.051997097725758876, + "grad_norm": 2.417508125305176, + "learning_rate": 4.9667261635425446e-05, + "loss": 5.3458, + "step": 8743 + }, + { + "epoch": 0.05200304500903987, + "grad_norm": 2.0334360599517822, + "learning_rate": 4.966718567635131e-05, + "loss": 5.5241, + "step": 8744 + }, + { + "epoch": 0.052008992292320866, + "grad_norm": 2.3476316928863525, + "learning_rate": 4.9667109708666126e-05, + "loss": 5.8786, + "step": 8745 + }, + { + "epoch": 0.05201493957560187, + "grad_norm": 2.160106897354126, + "learning_rate": 4.96670337323699e-05, + "loss": 5.616, + "step": 8746 + }, + { + "epoch": 0.05202088685888286, + "grad_norm": 2.0048086643218994, + "learning_rate": 4.9666957747462665e-05, + "loss": 5.5787, + "step": 8747 + }, + { + "epoch": 0.05202683414216386, + "grad_norm": 2.9226925373077393, + "learning_rate": 4.966688175394446e-05, + "loss": 5.3708, + "step": 8748 + }, + { + "epoch": 0.05203278142544486, + "grad_norm": 1.9020568132400513, + "learning_rate": 4.9666805751815294e-05, + "loss": 5.6037, + "step": 8749 + }, + { + "epoch": 0.052038728708725855, + "grad_norm": 2.218637466430664, + "learning_rate": 4.966672974107519e-05, + "loss": 5.2983, + "step": 8750 + }, + { + "epoch": 0.05204467599200685, + "grad_norm": 2.906625270843506, + "learning_rate": 4.96666537217242e-05, + "loss": 5.1234, + "step": 8751 + }, + { + "epoch": 0.052050623275287845, + "grad_norm": 2.0095551013946533, + "learning_rate": 4.966657769376234e-05, + "loss": 5.2695, + "step": 8752 + }, + { + "epoch": 0.05205657055856885, + "grad_norm": 2.1369643211364746, + "learning_rate": 4.966650165718963e-05, + "loss": 5.5426, + "step": 8753 + }, + { + "epoch": 0.05206251784184984, + "grad_norm": 2.4762122631073, + "learning_rate": 4.966642561200608e-05, + "loss": 5.5595, + "step": 8754 + }, + { + "epoch": 0.05206846512513084, + "grad_norm": 2.199430227279663, + "learning_rate": 4.966634955821176e-05, + "loss": 5.5155, + "step": 8755 + }, + { + "epoch": 0.05207441240841184, + "grad_norm": 2.132460355758667, + "learning_rate": 4.966627349580666e-05, + "loss": 5.5344, + "step": 8756 + }, + { + "epoch": 0.052080359691692835, + "grad_norm": 2.4437100887298584, + "learning_rate": 4.966619742479082e-05, + "loss": 5.0135, + "step": 8757 + }, + { + "epoch": 0.05208630697497383, + "grad_norm": 1.5223499536514282, + "learning_rate": 4.9666121345164265e-05, + "loss": 5.5467, + "step": 8758 + }, + { + "epoch": 0.05209225425825483, + "grad_norm": 2.101797580718994, + "learning_rate": 4.966604525692702e-05, + "loss": 5.9493, + "step": 8759 + }, + { + "epoch": 0.05209820154153583, + "grad_norm": 1.9338927268981934, + "learning_rate": 4.966596916007912e-05, + "loss": 5.6625, + "step": 8760 + }, + { + "epoch": 0.05210414882481682, + "grad_norm": 2.1328654289245605, + "learning_rate": 4.966589305462058e-05, + "loss": 6.3202, + "step": 8761 + }, + { + "epoch": 0.052110096108097824, + "grad_norm": 1.963287115097046, + "learning_rate": 4.9665816940551434e-05, + "loss": 5.8885, + "step": 8762 + }, + { + "epoch": 0.05211604339137882, + "grad_norm": 2.124155282974243, + "learning_rate": 4.96657408178717e-05, + "loss": 5.6015, + "step": 8763 + }, + { + "epoch": 0.052121990674659814, + "grad_norm": 2.1011505126953125, + "learning_rate": 4.966566468658142e-05, + "loss": 5.7786, + "step": 8764 + }, + { + "epoch": 0.05212793795794081, + "grad_norm": 1.769573450088501, + "learning_rate": 4.966558854668061e-05, + "loss": 5.8229, + "step": 8765 + }, + { + "epoch": 0.05213388524122181, + "grad_norm": 1.7712751626968384, + "learning_rate": 4.966551239816929e-05, + "loss": 5.733, + "step": 8766 + }, + { + "epoch": 0.052139832524502806, + "grad_norm": 1.68185555934906, + "learning_rate": 4.9665436241047503e-05, + "loss": 6.015, + "step": 8767 + }, + { + "epoch": 0.0521457798077838, + "grad_norm": 1.8619519472122192, + "learning_rate": 4.966536007531526e-05, + "loss": 5.9545, + "step": 8768 + }, + { + "epoch": 0.0521517270910648, + "grad_norm": 1.6538097858428955, + "learning_rate": 4.96652839009726e-05, + "loss": 5.6138, + "step": 8769 + }, + { + "epoch": 0.0521576743743458, + "grad_norm": 1.721737027168274, + "learning_rate": 4.966520771801955e-05, + "loss": 6.0001, + "step": 8770 + }, + { + "epoch": 0.052163621657626794, + "grad_norm": 1.8449060916900635, + "learning_rate": 4.966513152645612e-05, + "loss": 5.6811, + "step": 8771 + }, + { + "epoch": 0.052169568940907796, + "grad_norm": 2.3810017108917236, + "learning_rate": 4.966505532628235e-05, + "loss": 5.4662, + "step": 8772 + }, + { + "epoch": 0.05217551622418879, + "grad_norm": 2.9262144565582275, + "learning_rate": 4.9664979117498265e-05, + "loss": 5.3555, + "step": 8773 + }, + { + "epoch": 0.052181463507469786, + "grad_norm": 2.1560001373291016, + "learning_rate": 4.966490290010389e-05, + "loss": 5.988, + "step": 8774 + }, + { + "epoch": 0.05218741079075079, + "grad_norm": 1.8220587968826294, + "learning_rate": 4.966482667409925e-05, + "loss": 5.8334, + "step": 8775 + }, + { + "epoch": 0.05219335807403178, + "grad_norm": 2.393651008605957, + "learning_rate": 4.9664750439484375e-05, + "loss": 5.5866, + "step": 8776 + }, + { + "epoch": 0.05219930535731278, + "grad_norm": 2.193864583969116, + "learning_rate": 4.966467419625929e-05, + "loss": 5.6642, + "step": 8777 + }, + { + "epoch": 0.05220525264059378, + "grad_norm": 2.24094820022583, + "learning_rate": 4.966459794442403e-05, + "loss": 5.7149, + "step": 8778 + }, + { + "epoch": 0.052211199923874775, + "grad_norm": 2.447439670562744, + "learning_rate": 4.9664521683978606e-05, + "loss": 5.4759, + "step": 8779 + }, + { + "epoch": 0.05221714720715577, + "grad_norm": 1.9538700580596924, + "learning_rate": 4.9664445414923055e-05, + "loss": 5.7, + "step": 8780 + }, + { + "epoch": 0.052223094490436765, + "grad_norm": 1.8960500955581665, + "learning_rate": 4.966436913725739e-05, + "loss": 5.7852, + "step": 8781 + }, + { + "epoch": 0.05222904177371777, + "grad_norm": 1.9234421253204346, + "learning_rate": 4.966429285098166e-05, + "loss": 5.9842, + "step": 8782 + }, + { + "epoch": 0.05223498905699876, + "grad_norm": 2.2879858016967773, + "learning_rate": 4.966421655609588e-05, + "loss": 5.6572, + "step": 8783 + }, + { + "epoch": 0.05224093634027976, + "grad_norm": 2.287932872772217, + "learning_rate": 4.966414025260008e-05, + "loss": 6.0675, + "step": 8784 + }, + { + "epoch": 0.05224688362356076, + "grad_norm": 1.6395118236541748, + "learning_rate": 4.9664063940494275e-05, + "loss": 5.6846, + "step": 8785 + }, + { + "epoch": 0.052252830906841755, + "grad_norm": 1.7121644020080566, + "learning_rate": 4.966398761977851e-05, + "loss": 5.7014, + "step": 8786 + }, + { + "epoch": 0.05225877819012275, + "grad_norm": 1.6225544214248657, + "learning_rate": 4.966391129045279e-05, + "loss": 5.6152, + "step": 8787 + }, + { + "epoch": 0.05226472547340375, + "grad_norm": 1.8484382629394531, + "learning_rate": 4.966383495251716e-05, + "loss": 5.8109, + "step": 8788 + }, + { + "epoch": 0.05227067275668475, + "grad_norm": 1.8225692510604858, + "learning_rate": 4.966375860597164e-05, + "loss": 6.0587, + "step": 8789 + }, + { + "epoch": 0.05227662003996574, + "grad_norm": 2.0333876609802246, + "learning_rate": 4.9663682250816255e-05, + "loss": 6.1406, + "step": 8790 + }, + { + "epoch": 0.052282567323246744, + "grad_norm": 2.0004124641418457, + "learning_rate": 4.9663605887051036e-05, + "loss": 5.6227, + "step": 8791 + }, + { + "epoch": 0.05228851460652774, + "grad_norm": 1.723655343055725, + "learning_rate": 4.9663529514676005e-05, + "loss": 5.5013, + "step": 8792 + }, + { + "epoch": 0.052294461889808734, + "grad_norm": 1.8351995944976807, + "learning_rate": 4.966345313369119e-05, + "loss": 5.3327, + "step": 8793 + }, + { + "epoch": 0.05230040917308973, + "grad_norm": 1.7514569759368896, + "learning_rate": 4.9663376744096615e-05, + "loss": 5.235, + "step": 8794 + }, + { + "epoch": 0.05230635645637073, + "grad_norm": 1.6678166389465332, + "learning_rate": 4.966330034589232e-05, + "loss": 5.2269, + "step": 8795 + }, + { + "epoch": 0.052312303739651726, + "grad_norm": 1.82132887840271, + "learning_rate": 4.9663223939078315e-05, + "loss": 5.0288, + "step": 8796 + }, + { + "epoch": 0.05231825102293272, + "grad_norm": 1.7815704345703125, + "learning_rate": 4.966314752365463e-05, + "loss": 5.4489, + "step": 8797 + }, + { + "epoch": 0.05232419830621372, + "grad_norm": 2.5268197059631348, + "learning_rate": 4.96630710996213e-05, + "loss": 5.0321, + "step": 8798 + }, + { + "epoch": 0.05233014558949472, + "grad_norm": 2.921208620071411, + "learning_rate": 4.9662994666978346e-05, + "loss": 5.0826, + "step": 8799 + }, + { + "epoch": 0.052336092872775714, + "grad_norm": 2.83243727684021, + "learning_rate": 4.9662918225725794e-05, + "loss": 4.9754, + "step": 8800 + }, + { + "epoch": 0.052342040156056716, + "grad_norm": 2.960346221923828, + "learning_rate": 4.966284177586368e-05, + "loss": 5.5808, + "step": 8801 + }, + { + "epoch": 0.05234798743933771, + "grad_norm": 2.479055643081665, + "learning_rate": 4.966276531739201e-05, + "loss": 5.3779, + "step": 8802 + }, + { + "epoch": 0.052353934722618706, + "grad_norm": 2.8753128051757812, + "learning_rate": 4.966268885031083e-05, + "loss": 5.4023, + "step": 8803 + }, + { + "epoch": 0.05235988200589971, + "grad_norm": 2.1152822971343994, + "learning_rate": 4.966261237462016e-05, + "loss": 6.1181, + "step": 8804 + }, + { + "epoch": 0.0523658292891807, + "grad_norm": 2.7178313732147217, + "learning_rate": 4.966253589032003e-05, + "loss": 5.1597, + "step": 8805 + }, + { + "epoch": 0.0523717765724617, + "grad_norm": 2.6567695140838623, + "learning_rate": 4.966245939741045e-05, + "loss": 5.0582, + "step": 8806 + }, + { + "epoch": 0.0523777238557427, + "grad_norm": 3.0211431980133057, + "learning_rate": 4.966238289589147e-05, + "loss": 4.8331, + "step": 8807 + }, + { + "epoch": 0.052383671139023695, + "grad_norm": 2.9341561794281006, + "learning_rate": 4.9662306385763114e-05, + "loss": 4.8482, + "step": 8808 + }, + { + "epoch": 0.05238961842230469, + "grad_norm": 2.781118631362915, + "learning_rate": 4.966222986702539e-05, + "loss": 4.9199, + "step": 8809 + }, + { + "epoch": 0.052395565705585685, + "grad_norm": 2.459233283996582, + "learning_rate": 4.9662153339678344e-05, + "loss": 5.4156, + "step": 8810 + }, + { + "epoch": 0.05240151298886669, + "grad_norm": 1.9862231016159058, + "learning_rate": 4.966207680372199e-05, + "loss": 5.3937, + "step": 8811 + }, + { + "epoch": 0.05240746027214768, + "grad_norm": 3.3698437213897705, + "learning_rate": 4.966200025915636e-05, + "loss": 4.6231, + "step": 8812 + }, + { + "epoch": 0.05241340755542868, + "grad_norm": 2.9254424571990967, + "learning_rate": 4.9661923705981486e-05, + "loss": 4.5612, + "step": 8813 + }, + { + "epoch": 0.05241935483870968, + "grad_norm": 2.684386968612671, + "learning_rate": 4.966184714419738e-05, + "loss": 4.8646, + "step": 8814 + }, + { + "epoch": 0.052425302121990675, + "grad_norm": 2.812406539916992, + "learning_rate": 4.966177057380409e-05, + "loss": 4.5116, + "step": 8815 + }, + { + "epoch": 0.05243124940527167, + "grad_norm": 2.1739046573638916, + "learning_rate": 4.966169399480162e-05, + "loss": 5.3369, + "step": 8816 + }, + { + "epoch": 0.05243719668855267, + "grad_norm": 2.408341407775879, + "learning_rate": 4.966161740719001e-05, + "loss": 5.0368, + "step": 8817 + }, + { + "epoch": 0.05244314397183367, + "grad_norm": 2.2844927310943604, + "learning_rate": 4.966154081096929e-05, + "loss": 5.0657, + "step": 8818 + }, + { + "epoch": 0.05244909125511466, + "grad_norm": 2.5329723358154297, + "learning_rate": 4.9661464206139475e-05, + "loss": 5.2006, + "step": 8819 + }, + { + "epoch": 0.052455038538395664, + "grad_norm": 2.154224395751953, + "learning_rate": 4.9661387592700595e-05, + "loss": 5.238, + "step": 8820 + }, + { + "epoch": 0.05246098582167666, + "grad_norm": 2.1069657802581787, + "learning_rate": 4.966131097065269e-05, + "loss": 5.0894, + "step": 8821 + }, + { + "epoch": 0.052466933104957654, + "grad_norm": 2.165954351425171, + "learning_rate": 4.9661234339995763e-05, + "loss": 5.1148, + "step": 8822 + }, + { + "epoch": 0.052472880388238656, + "grad_norm": 1.8859459161758423, + "learning_rate": 4.9661157700729866e-05, + "loss": 5.1703, + "step": 8823 + }, + { + "epoch": 0.05247882767151965, + "grad_norm": 1.9739452600479126, + "learning_rate": 4.9661081052855004e-05, + "loss": 5.3978, + "step": 8824 + }, + { + "epoch": 0.052484774954800646, + "grad_norm": 1.95566987991333, + "learning_rate": 4.966100439637122e-05, + "loss": 5.3592, + "step": 8825 + }, + { + "epoch": 0.05249072223808164, + "grad_norm": 1.8613550662994385, + "learning_rate": 4.966092773127853e-05, + "loss": 5.3746, + "step": 8826 + }, + { + "epoch": 0.05249666952136264, + "grad_norm": 2.001701831817627, + "learning_rate": 4.9660851057576966e-05, + "loss": 5.3269, + "step": 8827 + }, + { + "epoch": 0.05250261680464364, + "grad_norm": 1.8846383094787598, + "learning_rate": 4.9660774375266556e-05, + "loss": 5.7906, + "step": 8828 + }, + { + "epoch": 0.052508564087924633, + "grad_norm": 1.982998251914978, + "learning_rate": 4.966069768434732e-05, + "loss": 5.6609, + "step": 8829 + }, + { + "epoch": 0.052514511371205636, + "grad_norm": 2.3036038875579834, + "learning_rate": 4.9660620984819294e-05, + "loss": 5.6172, + "step": 8830 + }, + { + "epoch": 0.05252045865448663, + "grad_norm": 1.9227113723754883, + "learning_rate": 4.9660544276682496e-05, + "loss": 5.4734, + "step": 8831 + }, + { + "epoch": 0.052526405937767626, + "grad_norm": 2.038203716278076, + "learning_rate": 4.9660467559936964e-05, + "loss": 5.6484, + "step": 8832 + }, + { + "epoch": 0.05253235322104863, + "grad_norm": 2.217108964920044, + "learning_rate": 4.9660390834582704e-05, + "loss": 5.4064, + "step": 8833 + }, + { + "epoch": 0.05253830050432962, + "grad_norm": 2.4458765983581543, + "learning_rate": 4.966031410061976e-05, + "loss": 5.605, + "step": 8834 + }, + { + "epoch": 0.05254424778761062, + "grad_norm": 2.2767014503479004, + "learning_rate": 4.966023735804817e-05, + "loss": 5.4258, + "step": 8835 + }, + { + "epoch": 0.05255019507089162, + "grad_norm": 2.3594579696655273, + "learning_rate": 4.9660160606867936e-05, + "loss": 5.5138, + "step": 8836 + }, + { + "epoch": 0.052556142354172615, + "grad_norm": 1.8961461782455444, + "learning_rate": 4.966008384707909e-05, + "loss": 5.9879, + "step": 8837 + }, + { + "epoch": 0.05256208963745361, + "grad_norm": 1.824751615524292, + "learning_rate": 4.966000707868167e-05, + "loss": 5.4558, + "step": 8838 + }, + { + "epoch": 0.052568036920734605, + "grad_norm": 2.005291223526001, + "learning_rate": 4.9659930301675694e-05, + "loss": 5.821, + "step": 8839 + }, + { + "epoch": 0.05257398420401561, + "grad_norm": 2.0951414108276367, + "learning_rate": 4.965985351606119e-05, + "loss": 5.2816, + "step": 8840 + }, + { + "epoch": 0.0525799314872966, + "grad_norm": 2.236849069595337, + "learning_rate": 4.9659776721838194e-05, + "loss": 5.4734, + "step": 8841 + }, + { + "epoch": 0.0525858787705776, + "grad_norm": 1.8877390623092651, + "learning_rate": 4.965969991900671e-05, + "loss": 5.2445, + "step": 8842 + }, + { + "epoch": 0.0525918260538586, + "grad_norm": 2.726071834564209, + "learning_rate": 4.9659623107566785e-05, + "loss": 5.6059, + "step": 8843 + }, + { + "epoch": 0.052597773337139594, + "grad_norm": 2.279759168624878, + "learning_rate": 4.965954628751844e-05, + "loss": 5.6755, + "step": 8844 + }, + { + "epoch": 0.05260372062042059, + "grad_norm": 1.9941623210906982, + "learning_rate": 4.965946945886171e-05, + "loss": 5.5222, + "step": 8845 + }, + { + "epoch": 0.05260966790370159, + "grad_norm": 2.0556750297546387, + "learning_rate": 4.965939262159661e-05, + "loss": 5.6064, + "step": 8846 + }, + { + "epoch": 0.05261561518698259, + "grad_norm": 1.9260958433151245, + "learning_rate": 4.965931577572317e-05, + "loss": 5.6264, + "step": 8847 + }, + { + "epoch": 0.05262156247026358, + "grad_norm": 2.1252758502960205, + "learning_rate": 4.9659238921241413e-05, + "loss": 5.9832, + "step": 8848 + }, + { + "epoch": 0.052627509753544584, + "grad_norm": 1.8081480264663696, + "learning_rate": 4.9659162058151377e-05, + "loss": 5.4391, + "step": 8849 + }, + { + "epoch": 0.05263345703682558, + "grad_norm": 1.8439849615097046, + "learning_rate": 4.965908518645308e-05, + "loss": 5.5351, + "step": 8850 + }, + { + "epoch": 0.052639404320106574, + "grad_norm": 2.1782681941986084, + "learning_rate": 4.9659008306146556e-05, + "loss": 5.9692, + "step": 8851 + }, + { + "epoch": 0.052645351603387576, + "grad_norm": 2.0206944942474365, + "learning_rate": 4.965893141723182e-05, + "loss": 5.4736, + "step": 8852 + }, + { + "epoch": 0.05265129888666857, + "grad_norm": 2.283517360687256, + "learning_rate": 4.965885451970891e-05, + "loss": 5.4504, + "step": 8853 + }, + { + "epoch": 0.052657246169949566, + "grad_norm": 2.701608180999756, + "learning_rate": 4.965877761357784e-05, + "loss": 5.318, + "step": 8854 + }, + { + "epoch": 0.05266319345323056, + "grad_norm": 2.8494722843170166, + "learning_rate": 4.965870069883866e-05, + "loss": 4.9835, + "step": 8855 + }, + { + "epoch": 0.05266914073651156, + "grad_norm": 2.0555408000946045, + "learning_rate": 4.965862377549137e-05, + "loss": 5.7587, + "step": 8856 + }, + { + "epoch": 0.05267508801979256, + "grad_norm": 2.3476004600524902, + "learning_rate": 4.9658546843536014e-05, + "loss": 5.8775, + "step": 8857 + }, + { + "epoch": 0.05268103530307355, + "grad_norm": 1.8152700662612915, + "learning_rate": 4.965846990297262e-05, + "loss": 5.6274, + "step": 8858 + }, + { + "epoch": 0.052686982586354555, + "grad_norm": 2.1541671752929688, + "learning_rate": 4.965839295380119e-05, + "loss": 5.6786, + "step": 8859 + }, + { + "epoch": 0.05269292986963555, + "grad_norm": 2.1708984375, + "learning_rate": 4.965831599602179e-05, + "loss": 5.8817, + "step": 8860 + }, + { + "epoch": 0.052698877152916546, + "grad_norm": 1.6558966636657715, + "learning_rate": 4.9658239029634415e-05, + "loss": 5.5375, + "step": 8861 + }, + { + "epoch": 0.05270482443619755, + "grad_norm": 2.1165130138397217, + "learning_rate": 4.9658162054639115e-05, + "loss": 5.5936, + "step": 8862 + }, + { + "epoch": 0.05271077171947854, + "grad_norm": 2.4143176078796387, + "learning_rate": 4.9658085071035893e-05, + "loss": 5.71, + "step": 8863 + }, + { + "epoch": 0.05271671900275954, + "grad_norm": 1.9471622705459595, + "learning_rate": 4.965800807882479e-05, + "loss": 5.7588, + "step": 8864 + }, + { + "epoch": 0.05272266628604054, + "grad_norm": 2.2014408111572266, + "learning_rate": 4.9657931078005835e-05, + "loss": 5.7699, + "step": 8865 + }, + { + "epoch": 0.052728613569321535, + "grad_norm": 1.7588191032409668, + "learning_rate": 4.965785406857905e-05, + "loss": 5.3921, + "step": 8866 + }, + { + "epoch": 0.05273456085260253, + "grad_norm": 1.835635781288147, + "learning_rate": 4.965777705054446e-05, + "loss": 5.1531, + "step": 8867 + }, + { + "epoch": 0.052740508135883525, + "grad_norm": 2.3071937561035156, + "learning_rate": 4.96577000239021e-05, + "loss": 5.5926, + "step": 8868 + }, + { + "epoch": 0.05274645541916453, + "grad_norm": 2.195712089538574, + "learning_rate": 4.9657622988651995e-05, + "loss": 5.4579, + "step": 8869 + }, + { + "epoch": 0.05275240270244552, + "grad_norm": 2.273738145828247, + "learning_rate": 4.9657545944794156e-05, + "loss": 5.6138, + "step": 8870 + }, + { + "epoch": 0.05275834998572652, + "grad_norm": 2.208343982696533, + "learning_rate": 4.9657468892328626e-05, + "loss": 5.5508, + "step": 8871 + }, + { + "epoch": 0.05276429726900752, + "grad_norm": 2.2111566066741943, + "learning_rate": 4.965739183125544e-05, + "loss": 5.7044, + "step": 8872 + }, + { + "epoch": 0.052770244552288514, + "grad_norm": 1.7516666650772095, + "learning_rate": 4.96573147615746e-05, + "loss": 5.4357, + "step": 8873 + }, + { + "epoch": 0.05277619183556951, + "grad_norm": 2.0703322887420654, + "learning_rate": 4.9657237683286155e-05, + "loss": 5.5383, + "step": 8874 + }, + { + "epoch": 0.05278213911885051, + "grad_norm": 1.796243667602539, + "learning_rate": 4.965716059639012e-05, + "loss": 5.5024, + "step": 8875 + }, + { + "epoch": 0.05278808640213151, + "grad_norm": 2.322397232055664, + "learning_rate": 4.9657083500886526e-05, + "loss": 5.8814, + "step": 8876 + }, + { + "epoch": 0.0527940336854125, + "grad_norm": 2.6743311882019043, + "learning_rate": 4.96570063967754e-05, + "loss": 5.4989, + "step": 8877 + }, + { + "epoch": 0.052799980968693504, + "grad_norm": 2.4381649494171143, + "learning_rate": 4.965692928405676e-05, + "loss": 5.5807, + "step": 8878 + }, + { + "epoch": 0.0528059282519745, + "grad_norm": 2.3703296184539795, + "learning_rate": 4.9656852162730646e-05, + "loss": 5.5586, + "step": 8879 + }, + { + "epoch": 0.052811875535255494, + "grad_norm": 1.7828437089920044, + "learning_rate": 4.9656775032797075e-05, + "loss": 5.2553, + "step": 8880 + }, + { + "epoch": 0.052817822818536496, + "grad_norm": 1.730290412902832, + "learning_rate": 4.9656697894256085e-05, + "loss": 5.3558, + "step": 8881 + }, + { + "epoch": 0.05282377010181749, + "grad_norm": 1.6909739971160889, + "learning_rate": 4.9656620747107694e-05, + "loss": 5.4397, + "step": 8882 + }, + { + "epoch": 0.052829717385098486, + "grad_norm": 1.9772145748138428, + "learning_rate": 4.965654359135193e-05, + "loss": 5.5786, + "step": 8883 + }, + { + "epoch": 0.05283566466837948, + "grad_norm": 1.8624964952468872, + "learning_rate": 4.965646642698883e-05, + "loss": 5.5466, + "step": 8884 + }, + { + "epoch": 0.05284161195166048, + "grad_norm": 1.7061936855316162, + "learning_rate": 4.96563892540184e-05, + "loss": 5.3439, + "step": 8885 + }, + { + "epoch": 0.05284755923494148, + "grad_norm": 1.715483546257019, + "learning_rate": 4.965631207244069e-05, + "loss": 5.2732, + "step": 8886 + }, + { + "epoch": 0.05285350651822247, + "grad_norm": 1.7801883220672607, + "learning_rate": 4.965623488225571e-05, + "loss": 5.2427, + "step": 8887 + }, + { + "epoch": 0.052859453801503475, + "grad_norm": 1.5122452974319458, + "learning_rate": 4.9656157683463495e-05, + "loss": 5.2812, + "step": 8888 + }, + { + "epoch": 0.05286540108478447, + "grad_norm": 1.878077507019043, + "learning_rate": 4.965608047606407e-05, + "loss": 5.6385, + "step": 8889 + }, + { + "epoch": 0.052871348368065466, + "grad_norm": 2.0781304836273193, + "learning_rate": 4.965600326005746e-05, + "loss": 5.3345, + "step": 8890 + }, + { + "epoch": 0.05287729565134647, + "grad_norm": 1.953302264213562, + "learning_rate": 4.965592603544369e-05, + "loss": 5.2694, + "step": 8891 + }, + { + "epoch": 0.05288324293462746, + "grad_norm": 1.9993265867233276, + "learning_rate": 4.96558488022228e-05, + "loss": 5.3323, + "step": 8892 + }, + { + "epoch": 0.05288919021790846, + "grad_norm": 1.7653480768203735, + "learning_rate": 4.96557715603948e-05, + "loss": 5.389, + "step": 8893 + }, + { + "epoch": 0.05289513750118946, + "grad_norm": 1.8843438625335693, + "learning_rate": 4.965569430995973e-05, + "loss": 5.3334, + "step": 8894 + }, + { + "epoch": 0.052901084784470455, + "grad_norm": 1.6673407554626465, + "learning_rate": 4.9655617050917616e-05, + "loss": 5.4469, + "step": 8895 + }, + { + "epoch": 0.05290703206775145, + "grad_norm": 1.8208844661712646, + "learning_rate": 4.9655539783268476e-05, + "loss": 5.6288, + "step": 8896 + }, + { + "epoch": 0.052912979351032445, + "grad_norm": 1.755162000656128, + "learning_rate": 4.965546250701234e-05, + "loss": 5.4388, + "step": 8897 + }, + { + "epoch": 0.05291892663431345, + "grad_norm": 1.9435405731201172, + "learning_rate": 4.965538522214924e-05, + "loss": 5.5877, + "step": 8898 + }, + { + "epoch": 0.05292487391759444, + "grad_norm": 1.8579509258270264, + "learning_rate": 4.9655307928679196e-05, + "loss": 5.4405, + "step": 8899 + }, + { + "epoch": 0.05293082120087544, + "grad_norm": 1.8897236585617065, + "learning_rate": 4.9655230626602246e-05, + "loss": 5.2931, + "step": 8900 + }, + { + "epoch": 0.05293676848415644, + "grad_norm": 1.928133487701416, + "learning_rate": 4.9655153315918403e-05, + "loss": 5.2345, + "step": 8901 + }, + { + "epoch": 0.052942715767437434, + "grad_norm": 1.8830339908599854, + "learning_rate": 4.96550759966277e-05, + "loss": 5.3288, + "step": 8902 + }, + { + "epoch": 0.05294866305071843, + "grad_norm": 1.6774102449417114, + "learning_rate": 4.9654998668730167e-05, + "loss": 5.2939, + "step": 8903 + }, + { + "epoch": 0.05295461033399943, + "grad_norm": 1.7440418004989624, + "learning_rate": 4.9654921332225826e-05, + "loss": 5.4663, + "step": 8904 + }, + { + "epoch": 0.05296055761728043, + "grad_norm": 1.92295241355896, + "learning_rate": 4.965484398711471e-05, + "loss": 5.556, + "step": 8905 + }, + { + "epoch": 0.05296650490056142, + "grad_norm": 1.5319017171859741, + "learning_rate": 4.965476663339684e-05, + "loss": 5.5267, + "step": 8906 + }, + { + "epoch": 0.052972452183842424, + "grad_norm": 1.7626374959945679, + "learning_rate": 4.9654689271072255e-05, + "loss": 5.3774, + "step": 8907 + }, + { + "epoch": 0.05297839946712342, + "grad_norm": 1.745743989944458, + "learning_rate": 4.965461190014096e-05, + "loss": 5.4877, + "step": 8908 + }, + { + "epoch": 0.052984346750404414, + "grad_norm": 1.6091177463531494, + "learning_rate": 4.9654534520603e-05, + "loss": 5.2969, + "step": 8909 + }, + { + "epoch": 0.052990294033685416, + "grad_norm": 1.7392489910125732, + "learning_rate": 4.96544571324584e-05, + "loss": 5.4247, + "step": 8910 + }, + { + "epoch": 0.05299624131696641, + "grad_norm": 1.9275293350219727, + "learning_rate": 4.965437973570718e-05, + "loss": 5.2184, + "step": 8911 + }, + { + "epoch": 0.053002188600247406, + "grad_norm": 1.6901222467422485, + "learning_rate": 4.965430233034937e-05, + "loss": 5.1459, + "step": 8912 + }, + { + "epoch": 0.0530081358835284, + "grad_norm": 1.9212596416473389, + "learning_rate": 4.965422491638499e-05, + "loss": 5.2439, + "step": 8913 + }, + { + "epoch": 0.0530140831668094, + "grad_norm": 1.814706802368164, + "learning_rate": 4.965414749381409e-05, + "loss": 5.5608, + "step": 8914 + }, + { + "epoch": 0.0530200304500904, + "grad_norm": 1.7997081279754639, + "learning_rate": 4.965407006263668e-05, + "loss": 5.6099, + "step": 8915 + }, + { + "epoch": 0.05302597773337139, + "grad_norm": 1.8545546531677246, + "learning_rate": 4.9653992622852777e-05, + "loss": 5.5844, + "step": 8916 + }, + { + "epoch": 0.053031925016652395, + "grad_norm": 1.665958285331726, + "learning_rate": 4.965391517446243e-05, + "loss": 5.4967, + "step": 8917 + }, + { + "epoch": 0.05303787229993339, + "grad_norm": 1.6157240867614746, + "learning_rate": 4.9653837717465655e-05, + "loss": 5.2523, + "step": 8918 + }, + { + "epoch": 0.053043819583214386, + "grad_norm": 1.9782540798187256, + "learning_rate": 4.965376025186248e-05, + "loss": 5.2384, + "step": 8919 + }, + { + "epoch": 0.05304976686649539, + "grad_norm": 2.0229971408843994, + "learning_rate": 4.9653682777652925e-05, + "loss": 5.1703, + "step": 8920 + }, + { + "epoch": 0.05305571414977638, + "grad_norm": 1.8299061059951782, + "learning_rate": 4.965360529483703e-05, + "loss": 5.0257, + "step": 8921 + }, + { + "epoch": 0.05306166143305738, + "grad_norm": 1.9080857038497925, + "learning_rate": 4.965352780341482e-05, + "loss": 5.2516, + "step": 8922 + }, + { + "epoch": 0.05306760871633838, + "grad_norm": 1.9998538494110107, + "learning_rate": 4.965345030338631e-05, + "loss": 5.1991, + "step": 8923 + }, + { + "epoch": 0.053073555999619375, + "grad_norm": 1.7606618404388428, + "learning_rate": 4.965337279475154e-05, + "loss": 5.2194, + "step": 8924 + }, + { + "epoch": 0.05307950328290037, + "grad_norm": 1.9633625745773315, + "learning_rate": 4.9653295277510525e-05, + "loss": 5.2463, + "step": 8925 + }, + { + "epoch": 0.053085450566181365, + "grad_norm": 1.9879587888717651, + "learning_rate": 4.9653217751663306e-05, + "loss": 5.2737, + "step": 8926 + }, + { + "epoch": 0.05309139784946237, + "grad_norm": 1.836289405822754, + "learning_rate": 4.965314021720991e-05, + "loss": 5.1157, + "step": 8927 + }, + { + "epoch": 0.05309734513274336, + "grad_norm": 1.8526496887207031, + "learning_rate": 4.965306267415035e-05, + "loss": 5.6541, + "step": 8928 + }, + { + "epoch": 0.05310329241602436, + "grad_norm": 1.9928539991378784, + "learning_rate": 4.965298512248466e-05, + "loss": 5.194, + "step": 8929 + }, + { + "epoch": 0.05310923969930536, + "grad_norm": 1.601536512374878, + "learning_rate": 4.9652907562212867e-05, + "loss": 5.285, + "step": 8930 + }, + { + "epoch": 0.053115186982586354, + "grad_norm": 1.8940081596374512, + "learning_rate": 4.9652829993335e-05, + "loss": 5.1791, + "step": 8931 + }, + { + "epoch": 0.05312113426586735, + "grad_norm": 1.7984519004821777, + "learning_rate": 4.9652752415851085e-05, + "loss": 5.2225, + "step": 8932 + }, + { + "epoch": 0.05312708154914835, + "grad_norm": 1.7474113702774048, + "learning_rate": 4.965267482976115e-05, + "loss": 5.0099, + "step": 8933 + }, + { + "epoch": 0.053133028832429346, + "grad_norm": 1.7044427394866943, + "learning_rate": 4.9652597235065214e-05, + "loss": 5.1456, + "step": 8934 + }, + { + "epoch": 0.05313897611571034, + "grad_norm": 1.5422965288162231, + "learning_rate": 4.9652519631763316e-05, + "loss": 5.0714, + "step": 8935 + }, + { + "epoch": 0.053144923398991344, + "grad_norm": 1.6831375360488892, + "learning_rate": 4.965244201985548e-05, + "loss": 5.0742, + "step": 8936 + }, + { + "epoch": 0.05315087068227234, + "grad_norm": 1.7648097276687622, + "learning_rate": 4.9652364399341734e-05, + "loss": 5.1108, + "step": 8937 + }, + { + "epoch": 0.053156817965553334, + "grad_norm": 1.669393539428711, + "learning_rate": 4.965228677022209e-05, + "loss": 5.1801, + "step": 8938 + }, + { + "epoch": 0.053162765248834336, + "grad_norm": 2.0252909660339355, + "learning_rate": 4.96522091324966e-05, + "loss": 5.3955, + "step": 8939 + }, + { + "epoch": 0.05316871253211533, + "grad_norm": 1.686355710029602, + "learning_rate": 4.965213148616527e-05, + "loss": 5.2626, + "step": 8940 + }, + { + "epoch": 0.053174659815396326, + "grad_norm": 1.7601011991500854, + "learning_rate": 4.965205383122814e-05, + "loss": 5.1603, + "step": 8941 + }, + { + "epoch": 0.05318060709867732, + "grad_norm": 1.7249791622161865, + "learning_rate": 4.9651976167685235e-05, + "loss": 5.4245, + "step": 8942 + }, + { + "epoch": 0.05318655438195832, + "grad_norm": 1.869367003440857, + "learning_rate": 4.9651898495536574e-05, + "loss": 5.2269, + "step": 8943 + }, + { + "epoch": 0.05319250166523932, + "grad_norm": 1.8296380043029785, + "learning_rate": 4.965182081478219e-05, + "loss": 5.3236, + "step": 8944 + }, + { + "epoch": 0.05319844894852031, + "grad_norm": 1.8211008310317993, + "learning_rate": 4.9651743125422115e-05, + "loss": 5.269, + "step": 8945 + }, + { + "epoch": 0.053204396231801315, + "grad_norm": 1.868295431137085, + "learning_rate": 4.965166542745637e-05, + "loss": 5.2733, + "step": 8946 + }, + { + "epoch": 0.05321034351508231, + "grad_norm": 1.6603426933288574, + "learning_rate": 4.965158772088498e-05, + "loss": 5.2685, + "step": 8947 + }, + { + "epoch": 0.053216290798363305, + "grad_norm": 1.680565357208252, + "learning_rate": 4.965151000570798e-05, + "loss": 5.4452, + "step": 8948 + }, + { + "epoch": 0.05322223808164431, + "grad_norm": 1.6473147869110107, + "learning_rate": 4.9651432281925394e-05, + "loss": 5.4476, + "step": 8949 + }, + { + "epoch": 0.0532281853649253, + "grad_norm": 1.5291423797607422, + "learning_rate": 4.965135454953724e-05, + "loss": 5.4617, + "step": 8950 + }, + { + "epoch": 0.0532341326482063, + "grad_norm": 1.4708455801010132, + "learning_rate": 4.965127680854356e-05, + "loss": 5.5431, + "step": 8951 + }, + { + "epoch": 0.0532400799314873, + "grad_norm": 1.4297362565994263, + "learning_rate": 4.9651199058944366e-05, + "loss": 5.431, + "step": 8952 + }, + { + "epoch": 0.053246027214768295, + "grad_norm": 1.726123571395874, + "learning_rate": 4.96511213007397e-05, + "loss": 5.2801, + "step": 8953 + }, + { + "epoch": 0.05325197449804929, + "grad_norm": 1.7977174520492554, + "learning_rate": 4.9651043533929584e-05, + "loss": 5.3273, + "step": 8954 + }, + { + "epoch": 0.053257921781330285, + "grad_norm": 1.8125461339950562, + "learning_rate": 4.9650965758514034e-05, + "loss": 5.3135, + "step": 8955 + }, + { + "epoch": 0.05326386906461129, + "grad_norm": 1.4925352334976196, + "learning_rate": 4.965088797449309e-05, + "loss": 5.1454, + "step": 8956 + }, + { + "epoch": 0.05326981634789228, + "grad_norm": 1.6977181434631348, + "learning_rate": 4.965081018186678e-05, + "loss": 5.3207, + "step": 8957 + }, + { + "epoch": 0.05327576363117328, + "grad_norm": 1.7767595052719116, + "learning_rate": 4.965073238063512e-05, + "loss": 5.203, + "step": 8958 + }, + { + "epoch": 0.05328171091445428, + "grad_norm": 1.53665292263031, + "learning_rate": 4.965065457079815e-05, + "loss": 5.3088, + "step": 8959 + }, + { + "epoch": 0.053287658197735274, + "grad_norm": 1.724476933479309, + "learning_rate": 4.965057675235589e-05, + "loss": 5.2628, + "step": 8960 + }, + { + "epoch": 0.05329360548101627, + "grad_norm": 1.7339463233947754, + "learning_rate": 4.965049892530837e-05, + "loss": 5.3174, + "step": 8961 + }, + { + "epoch": 0.05329955276429727, + "grad_norm": 1.8414005041122437, + "learning_rate": 4.965042108965561e-05, + "loss": 5.2121, + "step": 8962 + }, + { + "epoch": 0.053305500047578266, + "grad_norm": 1.7969903945922852, + "learning_rate": 4.9650343245397655e-05, + "loss": 5.0947, + "step": 8963 + }, + { + "epoch": 0.05331144733085926, + "grad_norm": 1.573320746421814, + "learning_rate": 4.965026539253451e-05, + "loss": 5.0624, + "step": 8964 + }, + { + "epoch": 0.053317394614140264, + "grad_norm": 1.7296351194381714, + "learning_rate": 4.9650187531066204e-05, + "loss": 5.5497, + "step": 8965 + }, + { + "epoch": 0.05332334189742126, + "grad_norm": 1.931847095489502, + "learning_rate": 4.9650109660992784e-05, + "loss": 5.537, + "step": 8966 + }, + { + "epoch": 0.053329289180702254, + "grad_norm": 1.8911564350128174, + "learning_rate": 4.965003178231427e-05, + "loss": 5.4891, + "step": 8967 + }, + { + "epoch": 0.053335236463983256, + "grad_norm": 1.933401107788086, + "learning_rate": 4.964995389503067e-05, + "loss": 5.3157, + "step": 8968 + }, + { + "epoch": 0.05334118374726425, + "grad_norm": 1.8299031257629395, + "learning_rate": 4.964987599914204e-05, + "loss": 5.2955, + "step": 8969 + }, + { + "epoch": 0.053347131030545246, + "grad_norm": 1.5823233127593994, + "learning_rate": 4.964979809464838e-05, + "loss": 5.2708, + "step": 8970 + }, + { + "epoch": 0.05335307831382624, + "grad_norm": 1.602689504623413, + "learning_rate": 4.9649720181549737e-05, + "loss": 5.3646, + "step": 8971 + }, + { + "epoch": 0.05335902559710724, + "grad_norm": 2.2379884719848633, + "learning_rate": 4.964964225984613e-05, + "loss": 5.5453, + "step": 8972 + }, + { + "epoch": 0.05336497288038824, + "grad_norm": 2.2210440635681152, + "learning_rate": 4.964956432953759e-05, + "loss": 5.2123, + "step": 8973 + }, + { + "epoch": 0.05337092016366923, + "grad_norm": 2.4450249671936035, + "learning_rate": 4.964948639062413e-05, + "loss": 5.172, + "step": 8974 + }, + { + "epoch": 0.053376867446950235, + "grad_norm": 1.7727516889572144, + "learning_rate": 4.9649408443105806e-05, + "loss": 5.3447, + "step": 8975 + }, + { + "epoch": 0.05338281473023123, + "grad_norm": 1.8239831924438477, + "learning_rate": 4.964933048698262e-05, + "loss": 5.3628, + "step": 8976 + }, + { + "epoch": 0.053388762013512225, + "grad_norm": 1.9517360925674438, + "learning_rate": 4.964925252225461e-05, + "loss": 5.6118, + "step": 8977 + }, + { + "epoch": 0.05339470929679323, + "grad_norm": 2.1735262870788574, + "learning_rate": 4.9649174548921796e-05, + "loss": 5.7332, + "step": 8978 + }, + { + "epoch": 0.05340065658007422, + "grad_norm": 1.4132062196731567, + "learning_rate": 4.964909656698421e-05, + "loss": 5.8078, + "step": 8979 + }, + { + "epoch": 0.05340660386335522, + "grad_norm": 1.5568846464157104, + "learning_rate": 4.964901857644188e-05, + "loss": 5.6328, + "step": 8980 + }, + { + "epoch": 0.05341255114663622, + "grad_norm": 1.6015586853027344, + "learning_rate": 4.964894057729484e-05, + "loss": 5.3738, + "step": 8981 + }, + { + "epoch": 0.053418498429917215, + "grad_norm": 1.492748737335205, + "learning_rate": 4.9648862569543105e-05, + "loss": 5.4336, + "step": 8982 + }, + { + "epoch": 0.05342444571319821, + "grad_norm": 1.9008845090866089, + "learning_rate": 4.96487845531867e-05, + "loss": 5.455, + "step": 8983 + }, + { + "epoch": 0.053430392996479205, + "grad_norm": 1.9590948820114136, + "learning_rate": 4.9648706528225664e-05, + "loss": 5.3308, + "step": 8984 + }, + { + "epoch": 0.05343634027976021, + "grad_norm": 1.9980428218841553, + "learning_rate": 4.964862849466002e-05, + "loss": 5.3777, + "step": 8985 + }, + { + "epoch": 0.0534422875630412, + "grad_norm": 1.769711971282959, + "learning_rate": 4.964855045248979e-05, + "loss": 5.4451, + "step": 8986 + }, + { + "epoch": 0.0534482348463222, + "grad_norm": 1.769977331161499, + "learning_rate": 4.964847240171502e-05, + "loss": 5.277, + "step": 8987 + }, + { + "epoch": 0.0534541821296032, + "grad_norm": 1.6647396087646484, + "learning_rate": 4.9648394342335705e-05, + "loss": 5.4655, + "step": 8988 + }, + { + "epoch": 0.053460129412884194, + "grad_norm": 1.861554503440857, + "learning_rate": 4.9648316274351906e-05, + "loss": 5.308, + "step": 8989 + }, + { + "epoch": 0.05346607669616519, + "grad_norm": 1.9457745552062988, + "learning_rate": 4.964823819776362e-05, + "loss": 6.2361, + "step": 8990 + }, + { + "epoch": 0.05347202397944619, + "grad_norm": 1.7702157497406006, + "learning_rate": 4.9648160112570896e-05, + "loss": 5.366, + "step": 8991 + }, + { + "epoch": 0.053477971262727186, + "grad_norm": 2.0074565410614014, + "learning_rate": 4.964808201877375e-05, + "loss": 5.3598, + "step": 8992 + }, + { + "epoch": 0.05348391854600818, + "grad_norm": 1.8686721324920654, + "learning_rate": 4.964800391637222e-05, + "loss": 5.4607, + "step": 8993 + }, + { + "epoch": 0.053489865829289183, + "grad_norm": 1.9749736785888672, + "learning_rate": 4.964792580536632e-05, + "loss": 5.3734, + "step": 8994 + }, + { + "epoch": 0.05349581311257018, + "grad_norm": 1.8435015678405762, + "learning_rate": 4.964784768575609e-05, + "loss": 5.3815, + "step": 8995 + }, + { + "epoch": 0.053501760395851174, + "grad_norm": 2.01983380317688, + "learning_rate": 4.9647769557541546e-05, + "loss": 5.4089, + "step": 8996 + }, + { + "epoch": 0.053507707679132176, + "grad_norm": 2.014798402786255, + "learning_rate": 4.964769142072272e-05, + "loss": 5.3906, + "step": 8997 + }, + { + "epoch": 0.05351365496241317, + "grad_norm": 1.8822753429412842, + "learning_rate": 4.9647613275299644e-05, + "loss": 5.3598, + "step": 8998 + }, + { + "epoch": 0.053519602245694166, + "grad_norm": 1.6534459590911865, + "learning_rate": 4.9647535121272334e-05, + "loss": 5.4577, + "step": 8999 + }, + { + "epoch": 0.05352554952897516, + "grad_norm": 1.6497015953063965, + "learning_rate": 4.964745695864083e-05, + "loss": 5.3915, + "step": 9000 + }, + { + "epoch": 0.05353149681225616, + "grad_norm": 1.5535780191421509, + "learning_rate": 4.964737878740515e-05, + "loss": 5.2444, + "step": 9001 + }, + { + "epoch": 0.05353744409553716, + "grad_norm": 1.6840674877166748, + "learning_rate": 4.964730060756533e-05, + "loss": 5.3439, + "step": 9002 + }, + { + "epoch": 0.05354339137881815, + "grad_norm": 1.7857226133346558, + "learning_rate": 4.9647222419121384e-05, + "loss": 5.3231, + "step": 9003 + }, + { + "epoch": 0.053549338662099155, + "grad_norm": 1.6067994832992554, + "learning_rate": 4.964714422207335e-05, + "loss": 5.4019, + "step": 9004 + }, + { + "epoch": 0.05355528594538015, + "grad_norm": 1.7026724815368652, + "learning_rate": 4.964706601642125e-05, + "loss": 5.2716, + "step": 9005 + }, + { + "epoch": 0.053561233228661145, + "grad_norm": 1.632804036140442, + "learning_rate": 4.964698780216512e-05, + "loss": 5.4132, + "step": 9006 + }, + { + "epoch": 0.05356718051194215, + "grad_norm": 1.6569499969482422, + "learning_rate": 4.964690957930498e-05, + "loss": 5.294, + "step": 9007 + }, + { + "epoch": 0.05357312779522314, + "grad_norm": 1.8141810894012451, + "learning_rate": 4.964683134784086e-05, + "loss": 5.3365, + "step": 9008 + }, + { + "epoch": 0.05357907507850414, + "grad_norm": 1.6555678844451904, + "learning_rate": 4.964675310777278e-05, + "loss": 5.3488, + "step": 9009 + }, + { + "epoch": 0.05358502236178514, + "grad_norm": 1.8363603353500366, + "learning_rate": 4.964667485910078e-05, + "loss": 5.3679, + "step": 9010 + }, + { + "epoch": 0.053590969645066135, + "grad_norm": 1.7839024066925049, + "learning_rate": 4.9646596601824874e-05, + "loss": 5.2514, + "step": 9011 + }, + { + "epoch": 0.05359691692834713, + "grad_norm": 1.8712091445922852, + "learning_rate": 4.96465183359451e-05, + "loss": 5.4313, + "step": 9012 + }, + { + "epoch": 0.053602864211628125, + "grad_norm": 1.9677501916885376, + "learning_rate": 4.964644006146148e-05, + "loss": 5.2442, + "step": 9013 + }, + { + "epoch": 0.05360881149490913, + "grad_norm": 1.8567090034484863, + "learning_rate": 4.964636177837404e-05, + "loss": 5.105, + "step": 9014 + }, + { + "epoch": 0.05361475877819012, + "grad_norm": 1.7319908142089844, + "learning_rate": 4.964628348668281e-05, + "loss": 5.2962, + "step": 9015 + }, + { + "epoch": 0.05362070606147112, + "grad_norm": 1.6412272453308105, + "learning_rate": 4.9646205186387824e-05, + "loss": 5.2302, + "step": 9016 + }, + { + "epoch": 0.05362665334475212, + "grad_norm": 1.9401088953018188, + "learning_rate": 4.96461268774891e-05, + "loss": 5.4425, + "step": 9017 + }, + { + "epoch": 0.053632600628033114, + "grad_norm": 1.7045506238937378, + "learning_rate": 4.964604855998666e-05, + "loss": 5.2325, + "step": 9018 + }, + { + "epoch": 0.05363854791131411, + "grad_norm": 1.8232519626617432, + "learning_rate": 4.9645970233880545e-05, + "loss": 5.5047, + "step": 9019 + }, + { + "epoch": 0.05364449519459511, + "grad_norm": 1.718833327293396, + "learning_rate": 4.964589189917077e-05, + "loss": 5.3323, + "step": 9020 + }, + { + "epoch": 0.053650442477876106, + "grad_norm": 1.608774185180664, + "learning_rate": 4.9645813555857376e-05, + "loss": 5.2374, + "step": 9021 + }, + { + "epoch": 0.0536563897611571, + "grad_norm": 1.6789363622665405, + "learning_rate": 4.964573520394039e-05, + "loss": 5.3291, + "step": 9022 + }, + { + "epoch": 0.0536623370444381, + "grad_norm": 1.6596689224243164, + "learning_rate": 4.964565684341982e-05, + "loss": 5.308, + "step": 9023 + }, + { + "epoch": 0.0536682843277191, + "grad_norm": 1.8141522407531738, + "learning_rate": 4.9645578474295703e-05, + "loss": 5.2033, + "step": 9024 + }, + { + "epoch": 0.053674231611000094, + "grad_norm": 1.428606390953064, + "learning_rate": 4.964550009656808e-05, + "loss": 5.2441, + "step": 9025 + }, + { + "epoch": 0.053680178894281096, + "grad_norm": 1.5033652782440186, + "learning_rate": 4.9645421710236965e-05, + "loss": 5.2132, + "step": 9026 + }, + { + "epoch": 0.05368612617756209, + "grad_norm": 1.7123147249221802, + "learning_rate": 4.9645343315302385e-05, + "loss": 5.3145, + "step": 9027 + }, + { + "epoch": 0.053692073460843086, + "grad_norm": 1.5851943492889404, + "learning_rate": 4.9645264911764376e-05, + "loss": 5.353, + "step": 9028 + }, + { + "epoch": 0.05369802074412408, + "grad_norm": 1.6627084016799927, + "learning_rate": 4.964518649962295e-05, + "loss": 5.1049, + "step": 9029 + }, + { + "epoch": 0.05370396802740508, + "grad_norm": 1.51585853099823, + "learning_rate": 4.964510807887815e-05, + "loss": 4.9433, + "step": 9030 + }, + { + "epoch": 0.05370991531068608, + "grad_norm": 1.7350785732269287, + "learning_rate": 4.964502964952999e-05, + "loss": 5.1761, + "step": 9031 + }, + { + "epoch": 0.05371586259396707, + "grad_norm": 1.925410509109497, + "learning_rate": 4.964495121157852e-05, + "loss": 5.0528, + "step": 9032 + }, + { + "epoch": 0.053721809877248075, + "grad_norm": 1.794162631034851, + "learning_rate": 4.964487276502374e-05, + "loss": 5.2009, + "step": 9033 + }, + { + "epoch": 0.05372775716052907, + "grad_norm": 1.6729109287261963, + "learning_rate": 4.964479430986569e-05, + "loss": 5.16, + "step": 9034 + }, + { + "epoch": 0.053733704443810065, + "grad_norm": 1.8543394804000854, + "learning_rate": 4.9644715846104406e-05, + "loss": 5.3545, + "step": 9035 + }, + { + "epoch": 0.05373965172709107, + "grad_norm": 1.6876883506774902, + "learning_rate": 4.96446373737399e-05, + "loss": 5.2074, + "step": 9036 + }, + { + "epoch": 0.05374559901037206, + "grad_norm": 1.816701054573059, + "learning_rate": 4.9644558892772205e-05, + "loss": 5.154, + "step": 9037 + }, + { + "epoch": 0.05375154629365306, + "grad_norm": 1.471283197402954, + "learning_rate": 4.964448040320135e-05, + "loss": 5.2577, + "step": 9038 + }, + { + "epoch": 0.05375749357693406, + "grad_norm": 1.5764297246932983, + "learning_rate": 4.964440190502736e-05, + "loss": 5.0115, + "step": 9039 + }, + { + "epoch": 0.053763440860215055, + "grad_norm": 1.6854795217514038, + "learning_rate": 4.964432339825027e-05, + "loss": 5.1957, + "step": 9040 + }, + { + "epoch": 0.05376938814349605, + "grad_norm": 1.889570951461792, + "learning_rate": 4.964424488287009e-05, + "loss": 5.1229, + "step": 9041 + }, + { + "epoch": 0.05377533542677705, + "grad_norm": 1.7528218030929565, + "learning_rate": 4.964416635888687e-05, + "loss": 5.0002, + "step": 9042 + }, + { + "epoch": 0.05378128271005805, + "grad_norm": 1.68081796169281, + "learning_rate": 4.964408782630062e-05, + "loss": 5.0567, + "step": 9043 + }, + { + "epoch": 0.05378722999333904, + "grad_norm": 1.6083979606628418, + "learning_rate": 4.9644009285111384e-05, + "loss": 5.0775, + "step": 9044 + }, + { + "epoch": 0.05379317727662004, + "grad_norm": 1.676720380783081, + "learning_rate": 4.9643930735319164e-05, + "loss": 5.0446, + "step": 9045 + }, + { + "epoch": 0.05379912455990104, + "grad_norm": 1.6502453088760376, + "learning_rate": 4.964385217692401e-05, + "loss": 5.3751, + "step": 9046 + }, + { + "epoch": 0.053805071843182034, + "grad_norm": 1.9226343631744385, + "learning_rate": 4.9643773609925935e-05, + "loss": 5.2442, + "step": 9047 + }, + { + "epoch": 0.05381101912646303, + "grad_norm": 1.8054014444351196, + "learning_rate": 4.964369503432498e-05, + "loss": 5.4844, + "step": 9048 + }, + { + "epoch": 0.05381696640974403, + "grad_norm": 1.5151008367538452, + "learning_rate": 4.9643616450121166e-05, + "loss": 5.2834, + "step": 9049 + }, + { + "epoch": 0.053822913693025026, + "grad_norm": 2.0237820148468018, + "learning_rate": 4.964353785731452e-05, + "loss": 5.3166, + "step": 9050 + }, + { + "epoch": 0.05382886097630602, + "grad_norm": 2.145364999771118, + "learning_rate": 4.964345925590507e-05, + "loss": 5.3803, + "step": 9051 + }, + { + "epoch": 0.05383480825958702, + "grad_norm": 1.747369408607483, + "learning_rate": 4.964338064589284e-05, + "loss": 6.1041, + "step": 9052 + }, + { + "epoch": 0.05384075554286802, + "grad_norm": 1.9964301586151123, + "learning_rate": 4.964330202727786e-05, + "loss": 5.1707, + "step": 9053 + }, + { + "epoch": 0.053846702826149014, + "grad_norm": 1.630233645439148, + "learning_rate": 4.9643223400060155e-05, + "loss": 4.9385, + "step": 9054 + }, + { + "epoch": 0.053852650109430016, + "grad_norm": 1.5782960653305054, + "learning_rate": 4.9643144764239765e-05, + "loss": 4.9953, + "step": 9055 + }, + { + "epoch": 0.05385859739271101, + "grad_norm": 2.1511783599853516, + "learning_rate": 4.9643066119816706e-05, + "loss": 5.4329, + "step": 9056 + }, + { + "epoch": 0.053864544675992006, + "grad_norm": 2.2133493423461914, + "learning_rate": 4.9642987466791004e-05, + "loss": 5.7347, + "step": 9057 + }, + { + "epoch": 0.053870491959273, + "grad_norm": 1.7669782638549805, + "learning_rate": 4.9642908805162686e-05, + "loss": 5.4129, + "step": 9058 + }, + { + "epoch": 0.053876439242554, + "grad_norm": 1.8005794286727905, + "learning_rate": 4.9642830134931787e-05, + "loss": 5.2397, + "step": 9059 + }, + { + "epoch": 0.053882386525835, + "grad_norm": 1.697607398033142, + "learning_rate": 4.9642751456098325e-05, + "loss": 5.3388, + "step": 9060 + }, + { + "epoch": 0.05388833380911599, + "grad_norm": 1.4916869401931763, + "learning_rate": 4.9642672768662344e-05, + "loss": 5.2574, + "step": 9061 + }, + { + "epoch": 0.053894281092396995, + "grad_norm": 1.7112784385681152, + "learning_rate": 4.964259407262385e-05, + "loss": 4.9881, + "step": 9062 + }, + { + "epoch": 0.05390022837567799, + "grad_norm": 1.4831846952438354, + "learning_rate": 4.964251536798289e-05, + "loss": 5.3976, + "step": 9063 + }, + { + "epoch": 0.053906175658958985, + "grad_norm": 1.626370906829834, + "learning_rate": 4.9642436654739476e-05, + "loss": 5.2409, + "step": 9064 + }, + { + "epoch": 0.05391212294223999, + "grad_norm": 1.7369413375854492, + "learning_rate": 4.964235793289365e-05, + "loss": 5.2732, + "step": 9065 + }, + { + "epoch": 0.05391807022552098, + "grad_norm": 1.7028629779815674, + "learning_rate": 4.964227920244542e-05, + "loss": 5.3161, + "step": 9066 + }, + { + "epoch": 0.05392401750880198, + "grad_norm": 1.9031678438186646, + "learning_rate": 4.964220046339483e-05, + "loss": 5.2517, + "step": 9067 + }, + { + "epoch": 0.05392996479208298, + "grad_norm": 1.8210735321044922, + "learning_rate": 4.96421217157419e-05, + "loss": 5.2819, + "step": 9068 + }, + { + "epoch": 0.053935912075363975, + "grad_norm": 1.7334645986557007, + "learning_rate": 4.9642042959486666e-05, + "loss": 5.4296, + "step": 9069 + }, + { + "epoch": 0.05394185935864497, + "grad_norm": 1.732790231704712, + "learning_rate": 4.964196419462914e-05, + "loss": 5.3589, + "step": 9070 + }, + { + "epoch": 0.05394780664192597, + "grad_norm": 1.417751669883728, + "learning_rate": 4.964188542116937e-05, + "loss": 5.0958, + "step": 9071 + }, + { + "epoch": 0.05395375392520697, + "grad_norm": 1.8562361001968384, + "learning_rate": 4.964180663910737e-05, + "loss": 5.2622, + "step": 9072 + }, + { + "epoch": 0.05395970120848796, + "grad_norm": 1.7366154193878174, + "learning_rate": 4.9641727848443166e-05, + "loss": 5.2329, + "step": 9073 + }, + { + "epoch": 0.05396564849176896, + "grad_norm": 1.8587182760238647, + "learning_rate": 4.9641649049176785e-05, + "loss": 4.9392, + "step": 9074 + }, + { + "epoch": 0.05397159577504996, + "grad_norm": 1.6152398586273193, + "learning_rate": 4.964157024130827e-05, + "loss": 5.473, + "step": 9075 + }, + { + "epoch": 0.053977543058330954, + "grad_norm": 1.5967273712158203, + "learning_rate": 4.9641491424837626e-05, + "loss": 5.2877, + "step": 9076 + }, + { + "epoch": 0.05398349034161195, + "grad_norm": 1.4986391067504883, + "learning_rate": 4.96414125997649e-05, + "loss": 5.2163, + "step": 9077 + }, + { + "epoch": 0.05398943762489295, + "grad_norm": 1.563905119895935, + "learning_rate": 4.964133376609011e-05, + "loss": 5.2043, + "step": 9078 + }, + { + "epoch": 0.053995384908173946, + "grad_norm": 1.5690317153930664, + "learning_rate": 4.964125492381329e-05, + "loss": 5.2226, + "step": 9079 + }, + { + "epoch": 0.05400133219145494, + "grad_norm": 1.7732517719268799, + "learning_rate": 4.9641176072934446e-05, + "loss": 5.3123, + "step": 9080 + }, + { + "epoch": 0.05400727947473594, + "grad_norm": 1.7045226097106934, + "learning_rate": 4.964109721345364e-05, + "loss": 5.0872, + "step": 9081 + }, + { + "epoch": 0.05401322675801694, + "grad_norm": 1.6405664682388306, + "learning_rate": 4.964101834537087e-05, + "loss": 5.3863, + "step": 9082 + }, + { + "epoch": 0.054019174041297933, + "grad_norm": 1.7410979270935059, + "learning_rate": 4.964093946868618e-05, + "loss": 5.0952, + "step": 9083 + }, + { + "epoch": 0.054025121324578936, + "grad_norm": 2.0102951526641846, + "learning_rate": 4.964086058339959e-05, + "loss": 4.9484, + "step": 9084 + }, + { + "epoch": 0.05403106860785993, + "grad_norm": 1.8228510618209839, + "learning_rate": 4.9640781689511133e-05, + "loss": 5.1141, + "step": 9085 + }, + { + "epoch": 0.054037015891140926, + "grad_norm": 1.7363582849502563, + "learning_rate": 4.964070278702083e-05, + "loss": 5.1164, + "step": 9086 + }, + { + "epoch": 0.05404296317442192, + "grad_norm": 1.6060153245925903, + "learning_rate": 4.9640623875928714e-05, + "loss": 5.1746, + "step": 9087 + }, + { + "epoch": 0.05404891045770292, + "grad_norm": 1.6690374612808228, + "learning_rate": 4.9640544956234814e-05, + "loss": 5.0931, + "step": 9088 + }, + { + "epoch": 0.05405485774098392, + "grad_norm": 1.613527774810791, + "learning_rate": 4.964046602793916e-05, + "loss": 5.2224, + "step": 9089 + }, + { + "epoch": 0.05406080502426491, + "grad_norm": 1.6461642980575562, + "learning_rate": 4.964038709104176e-05, + "loss": 5.3175, + "step": 9090 + }, + { + "epoch": 0.054066752307545915, + "grad_norm": 1.839709758758545, + "learning_rate": 4.9640308145542664e-05, + "loss": 5.3247, + "step": 9091 + }, + { + "epoch": 0.05407269959082691, + "grad_norm": 1.8977348804473877, + "learning_rate": 4.9640229191441886e-05, + "loss": 5.4256, + "step": 9092 + }, + { + "epoch": 0.054078646874107905, + "grad_norm": 1.9805532693862915, + "learning_rate": 4.9640150228739454e-05, + "loss": 4.9413, + "step": 9093 + }, + { + "epoch": 0.05408459415738891, + "grad_norm": 2.0237114429473877, + "learning_rate": 4.964007125743542e-05, + "loss": 4.8808, + "step": 9094 + }, + { + "epoch": 0.0540905414406699, + "grad_norm": 1.9848511219024658, + "learning_rate": 4.963999227752977e-05, + "loss": 5.0295, + "step": 9095 + }, + { + "epoch": 0.0540964887239509, + "grad_norm": 1.925876498222351, + "learning_rate": 4.9639913289022564e-05, + "loss": 5.0129, + "step": 9096 + }, + { + "epoch": 0.0541024360072319, + "grad_norm": 1.4887725114822388, + "learning_rate": 4.963983429191382e-05, + "loss": 4.9706, + "step": 9097 + }, + { + "epoch": 0.054108383290512894, + "grad_norm": 1.615160584449768, + "learning_rate": 4.963975528620356e-05, + "loss": 5.0066, + "step": 9098 + }, + { + "epoch": 0.05411433057379389, + "grad_norm": 1.969086766242981, + "learning_rate": 4.9639676271891816e-05, + "loss": 4.9539, + "step": 9099 + }, + { + "epoch": 0.05412027785707489, + "grad_norm": 1.8290555477142334, + "learning_rate": 4.963959724897862e-05, + "loss": 5.2467, + "step": 9100 + }, + { + "epoch": 0.05412622514035589, + "grad_norm": 2.004157066345215, + "learning_rate": 4.963951821746399e-05, + "loss": 4.8, + "step": 9101 + }, + { + "epoch": 0.05413217242363688, + "grad_norm": 1.9732778072357178, + "learning_rate": 4.9639439177347955e-05, + "loss": 4.8828, + "step": 9102 + }, + { + "epoch": 0.05413811970691788, + "grad_norm": 1.8653557300567627, + "learning_rate": 4.963936012863056e-05, + "loss": 5.0591, + "step": 9103 + }, + { + "epoch": 0.05414406699019888, + "grad_norm": 1.7854375839233398, + "learning_rate": 4.9639281071311804e-05, + "loss": 5.0914, + "step": 9104 + }, + { + "epoch": 0.054150014273479874, + "grad_norm": 1.7956377267837524, + "learning_rate": 4.963920200539174e-05, + "loss": 5.3484, + "step": 9105 + }, + { + "epoch": 0.05415596155676087, + "grad_norm": 1.7851346731185913, + "learning_rate": 4.963912293087039e-05, + "loss": 5.3146, + "step": 9106 + }, + { + "epoch": 0.05416190884004187, + "grad_norm": 1.72859787940979, + "learning_rate": 4.9639043847747756e-05, + "loss": 5.1611, + "step": 9107 + }, + { + "epoch": 0.054167856123322866, + "grad_norm": 1.5961265563964844, + "learning_rate": 4.9638964756023904e-05, + "loss": 5.247, + "step": 9108 + }, + { + "epoch": 0.05417380340660386, + "grad_norm": 1.7507922649383545, + "learning_rate": 4.963888565569884e-05, + "loss": 5.2011, + "step": 9109 + }, + { + "epoch": 0.05417975068988486, + "grad_norm": 1.8338440656661987, + "learning_rate": 4.9638806546772594e-05, + "loss": 5.2413, + "step": 9110 + }, + { + "epoch": 0.05418569797316586, + "grad_norm": 1.8935306072235107, + "learning_rate": 4.963872742924519e-05, + "loss": 5.1042, + "step": 9111 + }, + { + "epoch": 0.05419164525644685, + "grad_norm": 1.6512808799743652, + "learning_rate": 4.963864830311667e-05, + "loss": 5.2437, + "step": 9112 + }, + { + "epoch": 0.054197592539727855, + "grad_norm": 1.6099332571029663, + "learning_rate": 4.963856916838705e-05, + "loss": 5.2828, + "step": 9113 + }, + { + "epoch": 0.05420353982300885, + "grad_norm": 2.114581823348999, + "learning_rate": 4.9638490025056355e-05, + "loss": 6.1534, + "step": 9114 + }, + { + "epoch": 0.054209487106289846, + "grad_norm": 1.762335181236267, + "learning_rate": 4.963841087312462e-05, + "loss": 5.1504, + "step": 9115 + }, + { + "epoch": 0.05421543438957084, + "grad_norm": 1.7669222354888916, + "learning_rate": 4.963833171259187e-05, + "loss": 5.0365, + "step": 9116 + }, + { + "epoch": 0.05422138167285184, + "grad_norm": 1.7319819927215576, + "learning_rate": 4.963825254345814e-05, + "loss": 5.0724, + "step": 9117 + }, + { + "epoch": 0.05422732895613284, + "grad_norm": 1.618116021156311, + "learning_rate": 4.9638173365723444e-05, + "loss": 5.0964, + "step": 9118 + }, + { + "epoch": 0.05423327623941383, + "grad_norm": 1.6506006717681885, + "learning_rate": 4.9638094179387814e-05, + "loss": 5.1189, + "step": 9119 + }, + { + "epoch": 0.054239223522694835, + "grad_norm": 1.7512328624725342, + "learning_rate": 4.963801498445129e-05, + "loss": 5.2732, + "step": 9120 + }, + { + "epoch": 0.05424517080597583, + "grad_norm": 1.5639985799789429, + "learning_rate": 4.963793578091388e-05, + "loss": 5.0718, + "step": 9121 + }, + { + "epoch": 0.054251118089256825, + "grad_norm": 1.7059093713760376, + "learning_rate": 4.963785656877562e-05, + "loss": 5.0744, + "step": 9122 + }, + { + "epoch": 0.05425706537253783, + "grad_norm": 1.574802279472351, + "learning_rate": 4.9637777348036546e-05, + "loss": 5.2663, + "step": 9123 + }, + { + "epoch": 0.05426301265581882, + "grad_norm": 1.7343204021453857, + "learning_rate": 4.9637698118696674e-05, + "loss": 5.0805, + "step": 9124 + }, + { + "epoch": 0.05426895993909982, + "grad_norm": 1.6154165267944336, + "learning_rate": 4.963761888075604e-05, + "loss": 5.1402, + "step": 9125 + }, + { + "epoch": 0.05427490722238082, + "grad_norm": 1.6474148035049438, + "learning_rate": 4.9637539634214666e-05, + "loss": 5.0601, + "step": 9126 + }, + { + "epoch": 0.054280854505661814, + "grad_norm": 1.7573519945144653, + "learning_rate": 4.963746037907258e-05, + "loss": 5.1846, + "step": 9127 + }, + { + "epoch": 0.05428680178894281, + "grad_norm": 1.4558652639389038, + "learning_rate": 4.963738111532981e-05, + "loss": 5.3132, + "step": 9128 + }, + { + "epoch": 0.05429274907222381, + "grad_norm": 1.6261000633239746, + "learning_rate": 4.963730184298639e-05, + "loss": 5.2843, + "step": 9129 + }, + { + "epoch": 0.05429869635550481, + "grad_norm": 1.4502191543579102, + "learning_rate": 4.963722256204234e-05, + "loss": 5.14, + "step": 9130 + }, + { + "epoch": 0.0543046436387858, + "grad_norm": 1.6366747617721558, + "learning_rate": 4.9637143272497686e-05, + "loss": 5.1496, + "step": 9131 + }, + { + "epoch": 0.0543105909220668, + "grad_norm": 1.603745698928833, + "learning_rate": 4.963706397435246e-05, + "loss": 5.0644, + "step": 9132 + }, + { + "epoch": 0.0543165382053478, + "grad_norm": 1.419536828994751, + "learning_rate": 4.963698466760669e-05, + "loss": 5.3182, + "step": 9133 + }, + { + "epoch": 0.054322485488628794, + "grad_norm": 1.511765480041504, + "learning_rate": 4.963690535226041e-05, + "loss": 5.2808, + "step": 9134 + }, + { + "epoch": 0.05432843277190979, + "grad_norm": 1.4999688863754272, + "learning_rate": 4.963682602831364e-05, + "loss": 4.9235, + "step": 9135 + }, + { + "epoch": 0.05433438005519079, + "grad_norm": 1.5918420553207397, + "learning_rate": 4.96367466957664e-05, + "loss": 4.9293, + "step": 9136 + }, + { + "epoch": 0.054340327338471786, + "grad_norm": 1.502748727798462, + "learning_rate": 4.963666735461874e-05, + "loss": 5.2692, + "step": 9137 + }, + { + "epoch": 0.05434627462175278, + "grad_norm": 1.6474169492721558, + "learning_rate": 4.963658800487066e-05, + "loss": 5.1638, + "step": 9138 + }, + { + "epoch": 0.05435222190503378, + "grad_norm": 2.0195884704589844, + "learning_rate": 4.9636508646522204e-05, + "loss": 5.1085, + "step": 9139 + }, + { + "epoch": 0.05435816918831478, + "grad_norm": 1.7266180515289307, + "learning_rate": 4.9636429279573406e-05, + "loss": 5.0747, + "step": 9140 + }, + { + "epoch": 0.05436411647159577, + "grad_norm": 1.6965065002441406, + "learning_rate": 4.963634990402428e-05, + "loss": 5.1246, + "step": 9141 + }, + { + "epoch": 0.054370063754876775, + "grad_norm": 1.7629759311676025, + "learning_rate": 4.9636270519874856e-05, + "loss": 5.274, + "step": 9142 + }, + { + "epoch": 0.05437601103815777, + "grad_norm": 1.6365042924880981, + "learning_rate": 4.9636191127125164e-05, + "loss": 5.2469, + "step": 9143 + }, + { + "epoch": 0.054381958321438766, + "grad_norm": 1.6777831315994263, + "learning_rate": 4.9636111725775235e-05, + "loss": 5.3041, + "step": 9144 + }, + { + "epoch": 0.05438790560471976, + "grad_norm": 1.5354039669036865, + "learning_rate": 4.9636032315825096e-05, + "loss": 5.1799, + "step": 9145 + }, + { + "epoch": 0.05439385288800076, + "grad_norm": 1.508083701133728, + "learning_rate": 4.9635952897274773e-05, + "loss": 5.0822, + "step": 9146 + }, + { + "epoch": 0.05439980017128176, + "grad_norm": 1.5960441827774048, + "learning_rate": 4.963587347012429e-05, + "loss": 5.1618, + "step": 9147 + }, + { + "epoch": 0.05440574745456275, + "grad_norm": 1.4927520751953125, + "learning_rate": 4.9635794034373675e-05, + "loss": 5.1464, + "step": 9148 + }, + { + "epoch": 0.054411694737843755, + "grad_norm": 1.7420401573181152, + "learning_rate": 4.9635714590022966e-05, + "loss": 5.2866, + "step": 9149 + }, + { + "epoch": 0.05441764202112475, + "grad_norm": 1.7907800674438477, + "learning_rate": 4.9635635137072176e-05, + "loss": 5.1042, + "step": 9150 + }, + { + "epoch": 0.054423589304405745, + "grad_norm": 1.7073547840118408, + "learning_rate": 4.963555567552135e-05, + "loss": 5.1986, + "step": 9151 + }, + { + "epoch": 0.05442953658768675, + "grad_norm": 1.894405484199524, + "learning_rate": 4.96354762053705e-05, + "loss": 5.225, + "step": 9152 + }, + { + "epoch": 0.05443548387096774, + "grad_norm": 1.5830878019332886, + "learning_rate": 4.9635396726619656e-05, + "loss": 5.2902, + "step": 9153 + }, + { + "epoch": 0.05444143115424874, + "grad_norm": 1.5435214042663574, + "learning_rate": 4.963531723926885e-05, + "loss": 5.0773, + "step": 9154 + }, + { + "epoch": 0.05444737843752974, + "grad_norm": 1.4262596368789673, + "learning_rate": 4.9635237743318117e-05, + "loss": 5.129, + "step": 9155 + }, + { + "epoch": 0.054453325720810734, + "grad_norm": 1.5793390274047852, + "learning_rate": 4.9635158238767475e-05, + "loss": 5.1693, + "step": 9156 + }, + { + "epoch": 0.05445927300409173, + "grad_norm": 1.767318606376648, + "learning_rate": 4.963507872561695e-05, + "loss": 5.2541, + "step": 9157 + }, + { + "epoch": 0.05446522028737273, + "grad_norm": 1.5084065198898315, + "learning_rate": 4.963499920386658e-05, + "loss": 5.2531, + "step": 9158 + }, + { + "epoch": 0.05447116757065373, + "grad_norm": 1.797877311706543, + "learning_rate": 4.963491967351638e-05, + "loss": 5.2278, + "step": 9159 + }, + { + "epoch": 0.05447711485393472, + "grad_norm": 1.7463361024856567, + "learning_rate": 4.963484013456639e-05, + "loss": 5.1005, + "step": 9160 + }, + { + "epoch": 0.05448306213721572, + "grad_norm": 1.8208277225494385, + "learning_rate": 4.9634760587016626e-05, + "loss": 5.1437, + "step": 9161 + }, + { + "epoch": 0.05448900942049672, + "grad_norm": 1.9020015001296997, + "learning_rate": 4.9634681030867116e-05, + "loss": 5.1554, + "step": 9162 + }, + { + "epoch": 0.054494956703777714, + "grad_norm": 1.8370200395584106, + "learning_rate": 4.9634601466117904e-05, + "loss": 5.2418, + "step": 9163 + }, + { + "epoch": 0.05450090398705871, + "grad_norm": 1.785875678062439, + "learning_rate": 4.9634521892769004e-05, + "loss": 5.1916, + "step": 9164 + }, + { + "epoch": 0.05450685127033971, + "grad_norm": 1.7501643896102905, + "learning_rate": 4.963444231082045e-05, + "loss": 5.0887, + "step": 9165 + }, + { + "epoch": 0.054512798553620706, + "grad_norm": 1.6924220323562622, + "learning_rate": 4.963436272027227e-05, + "loss": 5.2458, + "step": 9166 + }, + { + "epoch": 0.0545187458369017, + "grad_norm": 1.895605206489563, + "learning_rate": 4.963428312112447e-05, + "loss": 5.1286, + "step": 9167 + }, + { + "epoch": 0.0545246931201827, + "grad_norm": 1.842207908630371, + "learning_rate": 4.963420351337711e-05, + "loss": 5.1177, + "step": 9168 + }, + { + "epoch": 0.0545306404034637, + "grad_norm": 1.7467048168182373, + "learning_rate": 4.963412389703021e-05, + "loss": 5.1616, + "step": 9169 + }, + { + "epoch": 0.05453658768674469, + "grad_norm": 1.8047499656677246, + "learning_rate": 4.963404427208378e-05, + "loss": 5.0543, + "step": 9170 + }, + { + "epoch": 0.054542534970025695, + "grad_norm": 1.5830637216567993, + "learning_rate": 4.963396463853786e-05, + "loss": 5.0989, + "step": 9171 + }, + { + "epoch": 0.05454848225330669, + "grad_norm": 1.7481937408447266, + "learning_rate": 4.9633884996392485e-05, + "loss": 5.1686, + "step": 9172 + }, + { + "epoch": 0.054554429536587686, + "grad_norm": 1.7132925987243652, + "learning_rate": 4.9633805345647664e-05, + "loss": 4.9683, + "step": 9173 + }, + { + "epoch": 0.05456037681986868, + "grad_norm": 1.8369117975234985, + "learning_rate": 4.9633725686303445e-05, + "loss": 5.154, + "step": 9174 + }, + { + "epoch": 0.05456632410314968, + "grad_norm": 1.615011215209961, + "learning_rate": 4.963364601835985e-05, + "loss": 5.0982, + "step": 9175 + }, + { + "epoch": 0.05457227138643068, + "grad_norm": 1.853742003440857, + "learning_rate": 4.963356634181689e-05, + "loss": 6.0599, + "step": 9176 + }, + { + "epoch": 0.05457821866971167, + "grad_norm": 1.5529752969741821, + "learning_rate": 4.963348665667462e-05, + "loss": 5.1355, + "step": 9177 + }, + { + "epoch": 0.054584165952992675, + "grad_norm": 1.5113881826400757, + "learning_rate": 4.963340696293305e-05, + "loss": 5.1947, + "step": 9178 + }, + { + "epoch": 0.05459011323627367, + "grad_norm": 1.6840931177139282, + "learning_rate": 4.963332726059221e-05, + "loss": 5.2163, + "step": 9179 + }, + { + "epoch": 0.054596060519554665, + "grad_norm": 1.7720422744750977, + "learning_rate": 4.963324754965214e-05, + "loss": 5.4737, + "step": 9180 + }, + { + "epoch": 0.05460200780283567, + "grad_norm": 1.632574200630188, + "learning_rate": 4.963316783011285e-05, + "loss": 5.2274, + "step": 9181 + }, + { + "epoch": 0.05460795508611666, + "grad_norm": 1.5859557390213013, + "learning_rate": 4.963308810197437e-05, + "loss": 5.3503, + "step": 9182 + }, + { + "epoch": 0.05461390236939766, + "grad_norm": 1.8342604637145996, + "learning_rate": 4.963300836523674e-05, + "loss": 5.1967, + "step": 9183 + }, + { + "epoch": 0.05461984965267866, + "grad_norm": 1.7443957328796387, + "learning_rate": 4.963292861989998e-05, + "loss": 5.0935, + "step": 9184 + }, + { + "epoch": 0.054625796935959654, + "grad_norm": 1.9289584159851074, + "learning_rate": 4.963284886596412e-05, + "loss": 5.1817, + "step": 9185 + }, + { + "epoch": 0.05463174421924065, + "grad_norm": 1.8695822954177856, + "learning_rate": 4.9632769103429186e-05, + "loss": 5.4304, + "step": 9186 + }, + { + "epoch": 0.05463769150252165, + "grad_norm": 1.6274856328964233, + "learning_rate": 4.9632689332295206e-05, + "loss": 5.3924, + "step": 9187 + }, + { + "epoch": 0.054643638785802646, + "grad_norm": 1.6061500310897827, + "learning_rate": 4.963260955256221e-05, + "loss": 5.2309, + "step": 9188 + }, + { + "epoch": 0.05464958606908364, + "grad_norm": 1.5478893518447876, + "learning_rate": 4.963252976423022e-05, + "loss": 5.2615, + "step": 9189 + }, + { + "epoch": 0.05465553335236464, + "grad_norm": 1.4304052591323853, + "learning_rate": 4.9632449967299276e-05, + "loss": 5.2116, + "step": 9190 + }, + { + "epoch": 0.05466148063564564, + "grad_norm": 1.5438693761825562, + "learning_rate": 4.9632370161769395e-05, + "loss": 5.1176, + "step": 9191 + }, + { + "epoch": 0.054667427918926634, + "grad_norm": 1.6602065563201904, + "learning_rate": 4.9632290347640606e-05, + "loss": 5.1521, + "step": 9192 + }, + { + "epoch": 0.05467337520220763, + "grad_norm": 1.530038595199585, + "learning_rate": 4.9632210524912934e-05, + "loss": 5.1437, + "step": 9193 + }, + { + "epoch": 0.05467932248548863, + "grad_norm": 1.617691159248352, + "learning_rate": 4.963213069358643e-05, + "loss": 5.0601, + "step": 9194 + }, + { + "epoch": 0.054685269768769626, + "grad_norm": 1.722401738166809, + "learning_rate": 4.963205085366108e-05, + "loss": 5.2664, + "step": 9195 + }, + { + "epoch": 0.05469121705205062, + "grad_norm": 1.803673267364502, + "learning_rate": 4.963197100513696e-05, + "loss": 5.4164, + "step": 9196 + }, + { + "epoch": 0.05469716433533162, + "grad_norm": 1.8565739393234253, + "learning_rate": 4.963189114801405e-05, + "loss": 5.225, + "step": 9197 + }, + { + "epoch": 0.05470311161861262, + "grad_norm": 1.780698299407959, + "learning_rate": 4.963181128229242e-05, + "loss": 5.1694, + "step": 9198 + }, + { + "epoch": 0.05470905890189361, + "grad_norm": 1.820416808128357, + "learning_rate": 4.963173140797207e-05, + "loss": 5.3305, + "step": 9199 + }, + { + "epoch": 0.054715006185174615, + "grad_norm": 1.471983551979065, + "learning_rate": 4.963165152505304e-05, + "loss": 5.3217, + "step": 9200 + }, + { + "epoch": 0.05472095346845561, + "grad_norm": 1.504616141319275, + "learning_rate": 4.9631571633535354e-05, + "loss": 5.3349, + "step": 9201 + }, + { + "epoch": 0.054726900751736605, + "grad_norm": 1.5888862609863281, + "learning_rate": 4.963149173341903e-05, + "loss": 5.3431, + "step": 9202 + }, + { + "epoch": 0.0547328480350176, + "grad_norm": 1.6633155345916748, + "learning_rate": 4.963141182470412e-05, + "loss": 5.2678, + "step": 9203 + }, + { + "epoch": 0.0547387953182986, + "grad_norm": 1.7259690761566162, + "learning_rate": 4.9631331907390636e-05, + "loss": 5.348, + "step": 9204 + }, + { + "epoch": 0.0547447426015796, + "grad_norm": 1.703925371170044, + "learning_rate": 4.963125198147861e-05, + "loss": 5.4123, + "step": 9205 + }, + { + "epoch": 0.05475068988486059, + "grad_norm": 1.6619760990142822, + "learning_rate": 4.963117204696807e-05, + "loss": 5.1732, + "step": 9206 + }, + { + "epoch": 0.054756637168141595, + "grad_norm": 1.7368190288543701, + "learning_rate": 4.963109210385903e-05, + "loss": 5.0843, + "step": 9207 + }, + { + "epoch": 0.05476258445142259, + "grad_norm": 1.781179666519165, + "learning_rate": 4.9631012152151545e-05, + "loss": 5.1343, + "step": 9208 + }, + { + "epoch": 0.054768531734703585, + "grad_norm": 1.674793004989624, + "learning_rate": 4.9630932191845624e-05, + "loss": 5.4079, + "step": 9209 + }, + { + "epoch": 0.05477447901798459, + "grad_norm": 1.7708344459533691, + "learning_rate": 4.9630852222941296e-05, + "loss": 4.9702, + "step": 9210 + }, + { + "epoch": 0.05478042630126558, + "grad_norm": 1.684725046157837, + "learning_rate": 4.9630772245438594e-05, + "loss": 5.263, + "step": 9211 + }, + { + "epoch": 0.05478637358454658, + "grad_norm": 1.6064784526824951, + "learning_rate": 4.963069225933754e-05, + "loss": 5.3402, + "step": 9212 + }, + { + "epoch": 0.05479232086782758, + "grad_norm": 1.5189318656921387, + "learning_rate": 4.963061226463816e-05, + "loss": 5.1928, + "step": 9213 + }, + { + "epoch": 0.054798268151108574, + "grad_norm": 1.8095827102661133, + "learning_rate": 4.96305322613405e-05, + "loss": 5.262, + "step": 9214 + }, + { + "epoch": 0.05480421543438957, + "grad_norm": 1.8325434923171997, + "learning_rate": 4.963045224944458e-05, + "loss": 5.4975, + "step": 9215 + }, + { + "epoch": 0.05481016271767057, + "grad_norm": 1.6597868204116821, + "learning_rate": 4.963037222895042e-05, + "loss": 5.6232, + "step": 9216 + }, + { + "epoch": 0.054816110000951566, + "grad_norm": 1.6402417421340942, + "learning_rate": 4.9630292199858044e-05, + "loss": 5.5358, + "step": 9217 + }, + { + "epoch": 0.05482205728423256, + "grad_norm": 1.3956371545791626, + "learning_rate": 4.963021216216749e-05, + "loss": 5.2563, + "step": 9218 + }, + { + "epoch": 0.05482800456751356, + "grad_norm": 1.5958374738693237, + "learning_rate": 4.963013211587878e-05, + "loss": 5.1539, + "step": 9219 + }, + { + "epoch": 0.05483395185079456, + "grad_norm": 1.6152080297470093, + "learning_rate": 4.963005206099195e-05, + "loss": 5.4025, + "step": 9220 + }, + { + "epoch": 0.054839899134075554, + "grad_norm": 1.392427921295166, + "learning_rate": 4.962997199750702e-05, + "loss": 5.4149, + "step": 9221 + }, + { + "epoch": 0.05484584641735655, + "grad_norm": 1.5625338554382324, + "learning_rate": 4.962989192542403e-05, + "loss": 5.5837, + "step": 9222 + }, + { + "epoch": 0.05485179370063755, + "grad_norm": 1.6465163230895996, + "learning_rate": 4.962981184474299e-05, + "loss": 5.2934, + "step": 9223 + }, + { + "epoch": 0.054857740983918546, + "grad_norm": 1.5344611406326294, + "learning_rate": 4.962973175546394e-05, + "loss": 5.4734, + "step": 9224 + }, + { + "epoch": 0.05486368826719954, + "grad_norm": 1.2378648519515991, + "learning_rate": 4.962965165758691e-05, + "loss": 5.3368, + "step": 9225 + }, + { + "epoch": 0.05486963555048054, + "grad_norm": 1.396785020828247, + "learning_rate": 4.9629571551111915e-05, + "loss": 5.3163, + "step": 9226 + }, + { + "epoch": 0.05487558283376154, + "grad_norm": 1.639452338218689, + "learning_rate": 4.9629491436038994e-05, + "loss": 5.3933, + "step": 9227 + }, + { + "epoch": 0.05488153011704253, + "grad_norm": 1.5648834705352783, + "learning_rate": 4.9629411312368166e-05, + "loss": 5.3717, + "step": 9228 + }, + { + "epoch": 0.054887477400323535, + "grad_norm": 1.4774922132492065, + "learning_rate": 4.962933118009947e-05, + "loss": 5.1318, + "step": 9229 + }, + { + "epoch": 0.05489342468360453, + "grad_norm": 1.4987083673477173, + "learning_rate": 4.9629251039232935e-05, + "loss": 5.1436, + "step": 9230 + }, + { + "epoch": 0.054899371966885525, + "grad_norm": 1.660605788230896, + "learning_rate": 4.9629170889768586e-05, + "loss": 5.1841, + "step": 9231 + }, + { + "epoch": 0.05490531925016652, + "grad_norm": 1.4441273212432861, + "learning_rate": 4.962909073170643e-05, + "loss": 5.3108, + "step": 9232 + }, + { + "epoch": 0.05491126653344752, + "grad_norm": 1.3297922611236572, + "learning_rate": 4.962901056504653e-05, + "loss": 5.1441, + "step": 9233 + }, + { + "epoch": 0.05491721381672852, + "grad_norm": 1.2989814281463623, + "learning_rate": 4.9628930389788886e-05, + "loss": 5.5146, + "step": 9234 + }, + { + "epoch": 0.05492316110000951, + "grad_norm": 1.350948452949524, + "learning_rate": 4.962885020593354e-05, + "loss": 5.2832, + "step": 9235 + }, + { + "epoch": 0.054929108383290515, + "grad_norm": 1.5801438093185425, + "learning_rate": 4.962877001348052e-05, + "loss": 5.4251, + "step": 9236 + }, + { + "epoch": 0.05493505566657151, + "grad_norm": 1.4355653524398804, + "learning_rate": 4.9628689812429854e-05, + "loss": 5.4092, + "step": 9237 + }, + { + "epoch": 0.054941002949852505, + "grad_norm": 1.692746639251709, + "learning_rate": 4.962860960278156e-05, + "loss": 5.3858, + "step": 9238 + }, + { + "epoch": 0.05494695023313351, + "grad_norm": 1.5125641822814941, + "learning_rate": 4.962852938453567e-05, + "loss": 5.6584, + "step": 9239 + }, + { + "epoch": 0.0549528975164145, + "grad_norm": 1.4158848524093628, + "learning_rate": 4.962844915769221e-05, + "loss": 5.652, + "step": 9240 + }, + { + "epoch": 0.0549588447996955, + "grad_norm": 1.314286231994629, + "learning_rate": 4.9628368922251235e-05, + "loss": 5.501, + "step": 9241 + }, + { + "epoch": 0.0549647920829765, + "grad_norm": 1.4003247022628784, + "learning_rate": 4.962828867821273e-05, + "loss": 5.448, + "step": 9242 + }, + { + "epoch": 0.054970739366257494, + "grad_norm": 1.7670220136642456, + "learning_rate": 4.962820842557675e-05, + "loss": 5.4854, + "step": 9243 + }, + { + "epoch": 0.05497668664953849, + "grad_norm": 1.9435075521469116, + "learning_rate": 4.962812816434332e-05, + "loss": 5.3824, + "step": 9244 + }, + { + "epoch": 0.05498263393281949, + "grad_norm": 2.1733458042144775, + "learning_rate": 4.9628047894512466e-05, + "loss": 5.6771, + "step": 9245 + }, + { + "epoch": 0.054988581216100486, + "grad_norm": 1.5455420017242432, + "learning_rate": 4.962796761608421e-05, + "loss": 5.4634, + "step": 9246 + }, + { + "epoch": 0.05499452849938148, + "grad_norm": 1.623382806777954, + "learning_rate": 4.962788732905859e-05, + "loss": 5.8441, + "step": 9247 + }, + { + "epoch": 0.05500047578266248, + "grad_norm": 1.928788423538208, + "learning_rate": 4.962780703343563e-05, + "loss": 5.6553, + "step": 9248 + }, + { + "epoch": 0.05500642306594348, + "grad_norm": 1.660984992980957, + "learning_rate": 4.962772672921535e-05, + "loss": 5.5953, + "step": 9249 + }, + { + "epoch": 0.055012370349224474, + "grad_norm": 2.081026792526245, + "learning_rate": 4.962764641639779e-05, + "loss": 5.7065, + "step": 9250 + }, + { + "epoch": 0.05501831763250547, + "grad_norm": 1.8750234842300415, + "learning_rate": 4.962756609498297e-05, + "loss": 5.8814, + "step": 9251 + }, + { + "epoch": 0.05502426491578647, + "grad_norm": 1.9573127031326294, + "learning_rate": 4.9627485764970916e-05, + "loss": 5.7415, + "step": 9252 + }, + { + "epoch": 0.055030212199067466, + "grad_norm": 1.7536600828170776, + "learning_rate": 4.962740542636167e-05, + "loss": 5.5638, + "step": 9253 + }, + { + "epoch": 0.05503615948234846, + "grad_norm": 1.692557692527771, + "learning_rate": 4.962732507915525e-05, + "loss": 5.5362, + "step": 9254 + }, + { + "epoch": 0.05504210676562946, + "grad_norm": 1.9066821336746216, + "learning_rate": 4.962724472335168e-05, + "loss": 5.3094, + "step": 9255 + }, + { + "epoch": 0.05504805404891046, + "grad_norm": 2.069007158279419, + "learning_rate": 4.9627164358951e-05, + "loss": 5.766, + "step": 9256 + }, + { + "epoch": 0.05505400133219145, + "grad_norm": 2.0293545722961426, + "learning_rate": 4.9627083985953227e-05, + "loss": 5.7769, + "step": 9257 + }, + { + "epoch": 0.055059948615472455, + "grad_norm": 1.7953507900238037, + "learning_rate": 4.962700360435839e-05, + "loss": 5.8435, + "step": 9258 + }, + { + "epoch": 0.05506589589875345, + "grad_norm": 1.9281821250915527, + "learning_rate": 4.9626923214166535e-05, + "loss": 5.8342, + "step": 9259 + }, + { + "epoch": 0.055071843182034445, + "grad_norm": 1.4612617492675781, + "learning_rate": 4.962684281537766e-05, + "loss": 5.8273, + "step": 9260 + }, + { + "epoch": 0.05507779046531545, + "grad_norm": 1.8589900732040405, + "learning_rate": 4.9626762407991817e-05, + "loss": 5.7607, + "step": 9261 + }, + { + "epoch": 0.05508373774859644, + "grad_norm": 1.9395030736923218, + "learning_rate": 4.9626681992009025e-05, + "loss": 5.7573, + "step": 9262 + }, + { + "epoch": 0.05508968503187744, + "grad_norm": 1.7344708442687988, + "learning_rate": 4.962660156742931e-05, + "loss": 5.7999, + "step": 9263 + }, + { + "epoch": 0.05509563231515843, + "grad_norm": 1.7719827890396118, + "learning_rate": 4.9626521134252704e-05, + "loss": 5.7882, + "step": 9264 + }, + { + "epoch": 0.055101579598439435, + "grad_norm": 1.4955536127090454, + "learning_rate": 4.9626440692479236e-05, + "loss": 5.639, + "step": 9265 + }, + { + "epoch": 0.05510752688172043, + "grad_norm": 2.0087990760803223, + "learning_rate": 4.9626360242108925e-05, + "loss": 5.841, + "step": 9266 + }, + { + "epoch": 0.055113474165001425, + "grad_norm": 1.7334564924240112, + "learning_rate": 4.962627978314181e-05, + "loss": 5.4267, + "step": 9267 + }, + { + "epoch": 0.05511942144828243, + "grad_norm": 2.1204535961151123, + "learning_rate": 4.962619931557792e-05, + "loss": 5.4451, + "step": 9268 + }, + { + "epoch": 0.05512536873156342, + "grad_norm": 2.2374279499053955, + "learning_rate": 4.962611883941727e-05, + "loss": 5.5095, + "step": 9269 + }, + { + "epoch": 0.05513131601484442, + "grad_norm": 1.735070824623108, + "learning_rate": 4.9626038354659904e-05, + "loss": 5.3609, + "step": 9270 + }, + { + "epoch": 0.05513726329812542, + "grad_norm": 1.9748501777648926, + "learning_rate": 4.9625957861305837e-05, + "loss": 5.3366, + "step": 9271 + }, + { + "epoch": 0.055143210581406414, + "grad_norm": 1.8736618757247925, + "learning_rate": 4.96258773593551e-05, + "loss": 5.4706, + "step": 9272 + }, + { + "epoch": 0.05514915786468741, + "grad_norm": 2.571755886077881, + "learning_rate": 4.9625796848807736e-05, + "loss": 5.0393, + "step": 9273 + }, + { + "epoch": 0.05515510514796841, + "grad_norm": 2.1467013359069824, + "learning_rate": 4.962571632966375e-05, + "loss": 5.5798, + "step": 9274 + }, + { + "epoch": 0.055161052431249406, + "grad_norm": 2.4553916454315186, + "learning_rate": 4.962563580192319e-05, + "loss": 5.4323, + "step": 9275 + }, + { + "epoch": 0.0551669997145304, + "grad_norm": 2.4478797912597656, + "learning_rate": 4.962555526558607e-05, + "loss": 5.2591, + "step": 9276 + }, + { + "epoch": 0.055172946997811396, + "grad_norm": 2.2164270877838135, + "learning_rate": 4.9625474720652416e-05, + "loss": 5.3404, + "step": 9277 + }, + { + "epoch": 0.0551788942810924, + "grad_norm": 1.9161698818206787, + "learning_rate": 4.962539416712227e-05, + "loss": 5.2591, + "step": 9278 + }, + { + "epoch": 0.055184841564373394, + "grad_norm": 2.348734140396118, + "learning_rate": 4.962531360499565e-05, + "loss": 5.8132, + "step": 9279 + }, + { + "epoch": 0.05519078884765439, + "grad_norm": 2.400090456008911, + "learning_rate": 4.962523303427259e-05, + "loss": 5.7786, + "step": 9280 + }, + { + "epoch": 0.05519673613093539, + "grad_norm": 2.1626594066619873, + "learning_rate": 4.9625152454953115e-05, + "loss": 5.8488, + "step": 9281 + }, + { + "epoch": 0.055202683414216386, + "grad_norm": 1.7470853328704834, + "learning_rate": 4.962507186703725e-05, + "loss": 5.72, + "step": 9282 + }, + { + "epoch": 0.05520863069749738, + "grad_norm": 1.9191921949386597, + "learning_rate": 4.962499127052503e-05, + "loss": 5.6321, + "step": 9283 + }, + { + "epoch": 0.05521457798077838, + "grad_norm": 2.1550769805908203, + "learning_rate": 4.962491066541649e-05, + "loss": 5.4521, + "step": 9284 + }, + { + "epoch": 0.05522052526405938, + "grad_norm": 2.0529074668884277, + "learning_rate": 4.9624830051711634e-05, + "loss": 5.4108, + "step": 9285 + }, + { + "epoch": 0.05522647254734037, + "grad_norm": 1.7673834562301636, + "learning_rate": 4.962474942941051e-05, + "loss": 5.5955, + "step": 9286 + }, + { + "epoch": 0.055232419830621375, + "grad_norm": 1.9575849771499634, + "learning_rate": 4.9624668798513143e-05, + "loss": 5.6295, + "step": 9287 + }, + { + "epoch": 0.05523836711390237, + "grad_norm": 1.8054029941558838, + "learning_rate": 4.9624588159019546e-05, + "loss": 5.3372, + "step": 9288 + }, + { + "epoch": 0.055244314397183365, + "grad_norm": 1.8002424240112305, + "learning_rate": 4.962450751092978e-05, + "loss": 5.4404, + "step": 9289 + }, + { + "epoch": 0.05525026168046437, + "grad_norm": 2.052530527114868, + "learning_rate": 4.962442685424383e-05, + "loss": 5.4921, + "step": 9290 + }, + { + "epoch": 0.05525620896374536, + "grad_norm": 1.8559443950653076, + "learning_rate": 4.962434618896176e-05, + "loss": 5.5776, + "step": 9291 + }, + { + "epoch": 0.05526215624702636, + "grad_norm": 1.8794355392456055, + "learning_rate": 4.962426551508359e-05, + "loss": 5.5818, + "step": 9292 + }, + { + "epoch": 0.05526810353030735, + "grad_norm": 1.8995412588119507, + "learning_rate": 4.962418483260933e-05, + "loss": 5.6274, + "step": 9293 + }, + { + "epoch": 0.055274050813588355, + "grad_norm": 1.8608371019363403, + "learning_rate": 4.962410414153903e-05, + "loss": 5.4655, + "step": 9294 + }, + { + "epoch": 0.05527999809686935, + "grad_norm": 2.0378072261810303, + "learning_rate": 4.9624023441872715e-05, + "loss": 5.5579, + "step": 9295 + }, + { + "epoch": 0.055285945380150345, + "grad_norm": 2.2037017345428467, + "learning_rate": 4.9623942733610397e-05, + "loss": 5.6663, + "step": 9296 + }, + { + "epoch": 0.05529189266343135, + "grad_norm": 2.4487335681915283, + "learning_rate": 4.962386201675212e-05, + "loss": 5.6792, + "step": 9297 + }, + { + "epoch": 0.05529783994671234, + "grad_norm": 2.0460383892059326, + "learning_rate": 4.96237812912979e-05, + "loss": 5.917, + "step": 9298 + }, + { + "epoch": 0.05530378722999334, + "grad_norm": 2.4838030338287354, + "learning_rate": 4.962370055724778e-05, + "loss": 5.1067, + "step": 9299 + }, + { + "epoch": 0.05530973451327434, + "grad_norm": 1.9340513944625854, + "learning_rate": 4.962361981460178e-05, + "loss": 5.2529, + "step": 9300 + }, + { + "epoch": 0.055315681796555334, + "grad_norm": 2.201068878173828, + "learning_rate": 4.9623539063359925e-05, + "loss": 5.6055, + "step": 9301 + }, + { + "epoch": 0.05532162907983633, + "grad_norm": 2.0552330017089844, + "learning_rate": 4.962345830352225e-05, + "loss": 5.3531, + "step": 9302 + }, + { + "epoch": 0.05532757636311733, + "grad_norm": 2.611407995223999, + "learning_rate": 4.9623377535088785e-05, + "loss": 5.5829, + "step": 9303 + }, + { + "epoch": 0.055333523646398326, + "grad_norm": 2.2239346504211426, + "learning_rate": 4.962329675805955e-05, + "loss": 5.3558, + "step": 9304 + }, + { + "epoch": 0.05533947092967932, + "grad_norm": 2.3899872303009033, + "learning_rate": 4.9623215972434566e-05, + "loss": 5.7277, + "step": 9305 + }, + { + "epoch": 0.055345418212960316, + "grad_norm": 2.8471267223358154, + "learning_rate": 4.962313517821389e-05, + "loss": 6.1046, + "step": 9306 + }, + { + "epoch": 0.05535136549624132, + "grad_norm": 2.426400661468506, + "learning_rate": 4.962305437539752e-05, + "loss": 5.8942, + "step": 9307 + }, + { + "epoch": 0.055357312779522314, + "grad_norm": 2.3548812866210938, + "learning_rate": 4.962297356398549e-05, + "loss": 6.0552, + "step": 9308 + }, + { + "epoch": 0.05536326006280331, + "grad_norm": 1.8423515558242798, + "learning_rate": 4.9622892743977844e-05, + "loss": 5.9377, + "step": 9309 + }, + { + "epoch": 0.05536920734608431, + "grad_norm": 2.1509203910827637, + "learning_rate": 4.96228119153746e-05, + "loss": 5.7195, + "step": 9310 + }, + { + "epoch": 0.055375154629365306, + "grad_norm": 2.3096275329589844, + "learning_rate": 4.962273107817579e-05, + "loss": 5.3461, + "step": 9311 + }, + { + "epoch": 0.0553811019126463, + "grad_norm": 1.980205774307251, + "learning_rate": 4.962265023238143e-05, + "loss": 5.8851, + "step": 9312 + }, + { + "epoch": 0.0553870491959273, + "grad_norm": 1.8162591457366943, + "learning_rate": 4.962256937799156e-05, + "loss": 5.7092, + "step": 9313 + }, + { + "epoch": 0.0553929964792083, + "grad_norm": 1.873853087425232, + "learning_rate": 4.962248851500621e-05, + "loss": 5.8939, + "step": 9314 + }, + { + "epoch": 0.05539894376248929, + "grad_norm": 1.8039345741271973, + "learning_rate": 4.96224076434254e-05, + "loss": 5.9289, + "step": 9315 + }, + { + "epoch": 0.055404891045770295, + "grad_norm": 2.3106470108032227, + "learning_rate": 4.962232676324916e-05, + "loss": 5.9103, + "step": 9316 + }, + { + "epoch": 0.05541083832905129, + "grad_norm": 2.2209455966949463, + "learning_rate": 4.962224587447752e-05, + "loss": 6.0053, + "step": 9317 + }, + { + "epoch": 0.055416785612332285, + "grad_norm": 2.0624780654907227, + "learning_rate": 4.962216497711052e-05, + "loss": 5.9258, + "step": 9318 + }, + { + "epoch": 0.05542273289561329, + "grad_norm": 2.371662139892578, + "learning_rate": 4.962208407114817e-05, + "loss": 6.4127, + "step": 9319 + }, + { + "epoch": 0.05542868017889428, + "grad_norm": 2.7035610675811768, + "learning_rate": 4.96220031565905e-05, + "loss": 5.9742, + "step": 9320 + }, + { + "epoch": 0.05543462746217528, + "grad_norm": 2.060577392578125, + "learning_rate": 4.9621922233437544e-05, + "loss": 5.9729, + "step": 9321 + }, + { + "epoch": 0.05544057474545627, + "grad_norm": 1.7935984134674072, + "learning_rate": 4.962184130168933e-05, + "loss": 5.4077, + "step": 9322 + }, + { + "epoch": 0.055446522028737275, + "grad_norm": 1.8716622591018677, + "learning_rate": 4.9621760361345885e-05, + "loss": 5.4554, + "step": 9323 + }, + { + "epoch": 0.05545246931201827, + "grad_norm": 1.9150923490524292, + "learning_rate": 4.962167941240724e-05, + "loss": 5.8121, + "step": 9324 + }, + { + "epoch": 0.055458416595299265, + "grad_norm": 1.9207059144973755, + "learning_rate": 4.962159845487342e-05, + "loss": 5.8593, + "step": 9325 + }, + { + "epoch": 0.05546436387858027, + "grad_norm": 1.962039589881897, + "learning_rate": 4.9621517488744454e-05, + "loss": 6.0174, + "step": 9326 + }, + { + "epoch": 0.05547031116186126, + "grad_norm": 2.0445704460144043, + "learning_rate": 4.9621436514020376e-05, + "loss": 5.5782, + "step": 9327 + }, + { + "epoch": 0.05547625844514226, + "grad_norm": 2.0861823558807373, + "learning_rate": 4.9621355530701204e-05, + "loss": 5.6102, + "step": 9328 + }, + { + "epoch": 0.05548220572842326, + "grad_norm": 2.0184309482574463, + "learning_rate": 4.962127453878697e-05, + "loss": 5.8072, + "step": 9329 + }, + { + "epoch": 0.055488153011704254, + "grad_norm": 1.899994134902954, + "learning_rate": 4.962119353827771e-05, + "loss": 5.7361, + "step": 9330 + }, + { + "epoch": 0.05549410029498525, + "grad_norm": 1.8874105215072632, + "learning_rate": 4.962111252917344e-05, + "loss": 5.7988, + "step": 9331 + }, + { + "epoch": 0.05550004757826625, + "grad_norm": 2.046682119369507, + "learning_rate": 4.9621031511474194e-05, + "loss": 5.7037, + "step": 9332 + }, + { + "epoch": 0.055505994861547246, + "grad_norm": 2.2552926540374756, + "learning_rate": 4.962095048517999e-05, + "loss": 5.7556, + "step": 9333 + }, + { + "epoch": 0.05551194214482824, + "grad_norm": 2.1904358863830566, + "learning_rate": 4.962086945029089e-05, + "loss": 5.6529, + "step": 9334 + }, + { + "epoch": 0.055517889428109236, + "grad_norm": 2.03745698928833, + "learning_rate": 4.9620788406806883e-05, + "loss": 5.8504, + "step": 9335 + }, + { + "epoch": 0.05552383671139024, + "grad_norm": 1.81668221950531, + "learning_rate": 4.9620707354728017e-05, + "loss": 5.3275, + "step": 9336 + }, + { + "epoch": 0.055529783994671233, + "grad_norm": 2.570976734161377, + "learning_rate": 4.962062629405432e-05, + "loss": 5.666, + "step": 9337 + }, + { + "epoch": 0.05553573127795223, + "grad_norm": 2.6855766773223877, + "learning_rate": 4.962054522478581e-05, + "loss": 5.7798, + "step": 9338 + }, + { + "epoch": 0.05554167856123323, + "grad_norm": 2.329690933227539, + "learning_rate": 4.962046414692252e-05, + "loss": 5.9334, + "step": 9339 + }, + { + "epoch": 0.055547625844514226, + "grad_norm": 1.6809495687484741, + "learning_rate": 4.962038306046449e-05, + "loss": 5.8506, + "step": 9340 + }, + { + "epoch": 0.05555357312779522, + "grad_norm": 1.7170113325119019, + "learning_rate": 4.962030196541173e-05, + "loss": 6.0863, + "step": 9341 + }, + { + "epoch": 0.05555952041107622, + "grad_norm": 2.247680902481079, + "learning_rate": 4.962022086176428e-05, + "loss": 5.2188, + "step": 9342 + }, + { + "epoch": 0.05556546769435722, + "grad_norm": 2.680091381072998, + "learning_rate": 4.9620139749522165e-05, + "loss": 4.8506, + "step": 9343 + }, + { + "epoch": 0.05557141497763821, + "grad_norm": 2.1886465549468994, + "learning_rate": 4.962005862868542e-05, + "loss": 5.5164, + "step": 9344 + }, + { + "epoch": 0.055577362260919215, + "grad_norm": 2.061368227005005, + "learning_rate": 4.961997749925405e-05, + "loss": 5.4491, + "step": 9345 + }, + { + "epoch": 0.05558330954420021, + "grad_norm": 2.368156909942627, + "learning_rate": 4.961989636122812e-05, + "loss": 5.9053, + "step": 9346 + }, + { + "epoch": 0.055589256827481205, + "grad_norm": 2.562565803527832, + "learning_rate": 4.961981521460763e-05, + "loss": 5.7683, + "step": 9347 + }, + { + "epoch": 0.05559520411076221, + "grad_norm": 2.388779640197754, + "learning_rate": 4.961973405939262e-05, + "loss": 5.1235, + "step": 9348 + }, + { + "epoch": 0.0556011513940432, + "grad_norm": 2.546994686126709, + "learning_rate": 4.9619652895583104e-05, + "loss": 4.7793, + "step": 9349 + }, + { + "epoch": 0.0556070986773242, + "grad_norm": 2.379549026489258, + "learning_rate": 4.9619571723179135e-05, + "loss": 4.8949, + "step": 9350 + }, + { + "epoch": 0.05561304596060519, + "grad_norm": 2.1621344089508057, + "learning_rate": 4.961949054218072e-05, + "loss": 4.6824, + "step": 9351 + }, + { + "epoch": 0.055618993243886194, + "grad_norm": 2.136289119720459, + "learning_rate": 4.96194093525879e-05, + "loss": 4.834, + "step": 9352 + }, + { + "epoch": 0.05562494052716719, + "grad_norm": 2.3572680950164795, + "learning_rate": 4.9619328154400694e-05, + "loss": 4.9755, + "step": 9353 + }, + { + "epoch": 0.055630887810448185, + "grad_norm": 2.2439966201782227, + "learning_rate": 4.961924694761913e-05, + "loss": 5.7662, + "step": 9354 + }, + { + "epoch": 0.05563683509372919, + "grad_norm": 2.287597894668579, + "learning_rate": 4.961916573224326e-05, + "loss": 4.6108, + "step": 9355 + }, + { + "epoch": 0.05564278237701018, + "grad_norm": 2.1382369995117188, + "learning_rate": 4.961908450827308e-05, + "loss": 4.5993, + "step": 9356 + }, + { + "epoch": 0.05564872966029118, + "grad_norm": 2.112348794937134, + "learning_rate": 4.961900327570863e-05, + "loss": 4.6798, + "step": 9357 + }, + { + "epoch": 0.05565467694357218, + "grad_norm": 2.0453972816467285, + "learning_rate": 4.9618922034549946e-05, + "loss": 4.5424, + "step": 9358 + }, + { + "epoch": 0.055660624226853174, + "grad_norm": 2.0547754764556885, + "learning_rate": 4.961884078479705e-05, + "loss": 5.0661, + "step": 9359 + }, + { + "epoch": 0.05566657151013417, + "grad_norm": 2.5003650188446045, + "learning_rate": 4.9618759526449965e-05, + "loss": 5.3388, + "step": 9360 + }, + { + "epoch": 0.05567251879341517, + "grad_norm": 2.0582423210144043, + "learning_rate": 4.9618678259508736e-05, + "loss": 5.8437, + "step": 9361 + }, + { + "epoch": 0.055678466076696166, + "grad_norm": 1.7867279052734375, + "learning_rate": 4.9618596983973376e-05, + "loss": 5.369, + "step": 9362 + }, + { + "epoch": 0.05568441335997716, + "grad_norm": 2.03729248046875, + "learning_rate": 4.961851569984392e-05, + "loss": 5.9932, + "step": 9363 + }, + { + "epoch": 0.055690360643258156, + "grad_norm": 2.2527456283569336, + "learning_rate": 4.961843440712038e-05, + "loss": 5.893, + "step": 9364 + }, + { + "epoch": 0.05569630792653916, + "grad_norm": 2.0027201175689697, + "learning_rate": 4.9618353105802815e-05, + "loss": 5.8216, + "step": 9365 + }, + { + "epoch": 0.05570225520982015, + "grad_norm": 2.236548662185669, + "learning_rate": 4.961827179589124e-05, + "loss": 5.5371, + "step": 9366 + }, + { + "epoch": 0.05570820249310115, + "grad_norm": 2.4477334022521973, + "learning_rate": 4.9618190477385666e-05, + "loss": 5.6552, + "step": 9367 + }, + { + "epoch": 0.05571414977638215, + "grad_norm": 2.504549026489258, + "learning_rate": 4.9618109150286145e-05, + "loss": 5.5732, + "step": 9368 + }, + { + "epoch": 0.055720097059663146, + "grad_norm": 2.1413187980651855, + "learning_rate": 4.9618027814592695e-05, + "loss": 5.1792, + "step": 9369 + }, + { + "epoch": 0.05572604434294414, + "grad_norm": 2.1714866161346436, + "learning_rate": 4.9617946470305344e-05, + "loss": 5.3444, + "step": 9370 + }, + { + "epoch": 0.05573199162622514, + "grad_norm": 1.7478383779525757, + "learning_rate": 4.9617865117424126e-05, + "loss": 5.7151, + "step": 9371 + }, + { + "epoch": 0.05573793890950614, + "grad_norm": 2.0415220260620117, + "learning_rate": 4.9617783755949067e-05, + "loss": 5.8765, + "step": 9372 + }, + { + "epoch": 0.05574388619278713, + "grad_norm": 1.917108416557312, + "learning_rate": 4.961770238588019e-05, + "loss": 6.0797, + "step": 9373 + }, + { + "epoch": 0.055749833476068135, + "grad_norm": 1.9404850006103516, + "learning_rate": 4.961762100721753e-05, + "loss": 6.1376, + "step": 9374 + }, + { + "epoch": 0.05575578075934913, + "grad_norm": 1.7101916074752808, + "learning_rate": 4.9617539619961104e-05, + "loss": 5.9375, + "step": 9375 + }, + { + "epoch": 0.055761728042630125, + "grad_norm": 2.591960906982422, + "learning_rate": 4.9617458224110954e-05, + "loss": 5.3716, + "step": 9376 + }, + { + "epoch": 0.05576767532591113, + "grad_norm": 2.070600986480713, + "learning_rate": 4.961737681966711e-05, + "loss": 5.3822, + "step": 9377 + }, + { + "epoch": 0.05577362260919212, + "grad_norm": 2.100820302963257, + "learning_rate": 4.9617295406629594e-05, + "loss": 5.7703, + "step": 9378 + }, + { + "epoch": 0.05577956989247312, + "grad_norm": 2.2413878440856934, + "learning_rate": 4.961721398499843e-05, + "loss": 4.9197, + "step": 9379 + }, + { + "epoch": 0.05578551717575411, + "grad_norm": 1.9762401580810547, + "learning_rate": 4.961713255477365e-05, + "loss": 5.6705, + "step": 9380 + }, + { + "epoch": 0.055791464459035114, + "grad_norm": 2.22676420211792, + "learning_rate": 4.961705111595528e-05, + "loss": 5.0196, + "step": 9381 + }, + { + "epoch": 0.05579741174231611, + "grad_norm": 2.0652241706848145, + "learning_rate": 4.9616969668543364e-05, + "loss": 5.3894, + "step": 9382 + }, + { + "epoch": 0.055803359025597105, + "grad_norm": 2.156890630722046, + "learning_rate": 4.96168882125379e-05, + "loss": 5.3063, + "step": 9383 + }, + { + "epoch": 0.05580930630887811, + "grad_norm": 2.131964683532715, + "learning_rate": 4.961680674793895e-05, + "loss": 5.9304, + "step": 9384 + }, + { + "epoch": 0.0558152535921591, + "grad_norm": 2.2117621898651123, + "learning_rate": 4.9616725274746525e-05, + "loss": 5.9553, + "step": 9385 + }, + { + "epoch": 0.0558212008754401, + "grad_norm": 2.3511440753936768, + "learning_rate": 4.9616643792960654e-05, + "loss": 5.9911, + "step": 9386 + }, + { + "epoch": 0.0558271481587211, + "grad_norm": 1.7709077596664429, + "learning_rate": 4.961656230258136e-05, + "loss": 5.6291, + "step": 9387 + }, + { + "epoch": 0.055833095442002094, + "grad_norm": 1.838767170906067, + "learning_rate": 4.961648080360869e-05, + "loss": 6.0152, + "step": 9388 + }, + { + "epoch": 0.05583904272528309, + "grad_norm": 2.117058515548706, + "learning_rate": 4.9616399296042656e-05, + "loss": 4.8079, + "step": 9389 + }, + { + "epoch": 0.05584499000856409, + "grad_norm": 2.147491693496704, + "learning_rate": 4.9616317779883293e-05, + "loss": 4.6489, + "step": 9390 + }, + { + "epoch": 0.055850937291845086, + "grad_norm": 2.1025705337524414, + "learning_rate": 4.961623625513062e-05, + "loss": 4.4984, + "step": 9391 + }, + { + "epoch": 0.05585688457512608, + "grad_norm": 1.799986720085144, + "learning_rate": 4.961615472178468e-05, + "loss": 5.1008, + "step": 9392 + }, + { + "epoch": 0.055862831858407076, + "grad_norm": 2.2975053787231445, + "learning_rate": 4.961607317984549e-05, + "loss": 5.9754, + "step": 9393 + }, + { + "epoch": 0.05586877914168808, + "grad_norm": 1.9996155500411987, + "learning_rate": 4.961599162931309e-05, + "loss": 5.9255, + "step": 9394 + }, + { + "epoch": 0.05587472642496907, + "grad_norm": 1.7344794273376465, + "learning_rate": 4.9615910070187496e-05, + "loss": 6.0873, + "step": 9395 + }, + { + "epoch": 0.05588067370825007, + "grad_norm": 2.260706901550293, + "learning_rate": 4.961582850246875e-05, + "loss": 5.9454, + "step": 9396 + }, + { + "epoch": 0.05588662099153107, + "grad_norm": 2.1810765266418457, + "learning_rate": 4.961574692615686e-05, + "loss": 5.7548, + "step": 9397 + }, + { + "epoch": 0.055892568274812066, + "grad_norm": 2.0940003395080566, + "learning_rate": 4.961566534125188e-05, + "loss": 5.8184, + "step": 9398 + }, + { + "epoch": 0.05589851555809306, + "grad_norm": 2.066464900970459, + "learning_rate": 4.961558374775382e-05, + "loss": 5.7867, + "step": 9399 + }, + { + "epoch": 0.05590446284137406, + "grad_norm": 1.7197705507278442, + "learning_rate": 4.961550214566271e-05, + "loss": 5.9211, + "step": 9400 + }, + { + "epoch": 0.05591041012465506, + "grad_norm": 2.3055293560028076, + "learning_rate": 4.9615420534978583e-05, + "loss": 5.9531, + "step": 9401 + }, + { + "epoch": 0.05591635740793605, + "grad_norm": 2.0974669456481934, + "learning_rate": 4.961533891570147e-05, + "loss": 5.9347, + "step": 9402 + }, + { + "epoch": 0.055922304691217055, + "grad_norm": 2.5196354389190674, + "learning_rate": 4.96152572878314e-05, + "loss": 5.0729, + "step": 9403 + }, + { + "epoch": 0.05592825197449805, + "grad_norm": 2.157181978225708, + "learning_rate": 4.9615175651368395e-05, + "loss": 5.9513, + "step": 9404 + }, + { + "epoch": 0.055934199257779045, + "grad_norm": 1.94083833694458, + "learning_rate": 4.9615094006312485e-05, + "loss": 5.9239, + "step": 9405 + }, + { + "epoch": 0.05594014654106005, + "grad_norm": 2.2118191719055176, + "learning_rate": 4.9615012352663704e-05, + "loss": 5.6936, + "step": 9406 + }, + { + "epoch": 0.05594609382434104, + "grad_norm": 2.2255051136016846, + "learning_rate": 4.9614930690422065e-05, + "loss": 5.7475, + "step": 9407 + }, + { + "epoch": 0.05595204110762204, + "grad_norm": 2.1640844345092773, + "learning_rate": 4.961484901958762e-05, + "loss": 5.8138, + "step": 9408 + }, + { + "epoch": 0.05595798839090303, + "grad_norm": 2.2722928524017334, + "learning_rate": 4.961476734016038e-05, + "loss": 5.5784, + "step": 9409 + }, + { + "epoch": 0.055963935674184034, + "grad_norm": 2.0541749000549316, + "learning_rate": 4.961468565214039e-05, + "loss": 5.6871, + "step": 9410 + }, + { + "epoch": 0.05596988295746503, + "grad_norm": 2.3496010303497314, + "learning_rate": 4.9614603955527655e-05, + "loss": 5.4195, + "step": 9411 + }, + { + "epoch": 0.055975830240746025, + "grad_norm": 2.333435297012329, + "learning_rate": 4.9614522250322215e-05, + "loss": 5.4257, + "step": 9412 + }, + { + "epoch": 0.05598177752402703, + "grad_norm": 2.339057445526123, + "learning_rate": 4.9614440536524106e-05, + "loss": 5.4158, + "step": 9413 + }, + { + "epoch": 0.05598772480730802, + "grad_norm": 2.4383058547973633, + "learning_rate": 4.961435881413335e-05, + "loss": 5.4569, + "step": 9414 + }, + { + "epoch": 0.05599367209058902, + "grad_norm": 2.1405389308929443, + "learning_rate": 4.961427708314997e-05, + "loss": 5.6178, + "step": 9415 + }, + { + "epoch": 0.05599961937387002, + "grad_norm": 2.2082836627960205, + "learning_rate": 4.961419534357401e-05, + "loss": 5.386, + "step": 9416 + }, + { + "epoch": 0.056005566657151014, + "grad_norm": 2.0305027961730957, + "learning_rate": 4.961411359540548e-05, + "loss": 5.2822, + "step": 9417 + }, + { + "epoch": 0.05601151394043201, + "grad_norm": 2.606452226638794, + "learning_rate": 4.961403183864442e-05, + "loss": 5.2691, + "step": 9418 + }, + { + "epoch": 0.05601746122371301, + "grad_norm": 2.3506669998168945, + "learning_rate": 4.961395007329086e-05, + "loss": 5.3307, + "step": 9419 + }, + { + "epoch": 0.056023408506994006, + "grad_norm": 2.3472225666046143, + "learning_rate": 4.961386829934482e-05, + "loss": 5.2247, + "step": 9420 + }, + { + "epoch": 0.056029355790275, + "grad_norm": 2.1121721267700195, + "learning_rate": 4.961378651680633e-05, + "loss": 5.2857, + "step": 9421 + }, + { + "epoch": 0.056035303073555996, + "grad_norm": 2.4357142448425293, + "learning_rate": 4.9613704725675427e-05, + "loss": 5.3398, + "step": 9422 + }, + { + "epoch": 0.056041250356837, + "grad_norm": 2.639418125152588, + "learning_rate": 4.961362292595213e-05, + "loss": 5.3008, + "step": 9423 + }, + { + "epoch": 0.05604719764011799, + "grad_norm": 3.297189712524414, + "learning_rate": 4.961354111763647e-05, + "loss": 5.5908, + "step": 9424 + }, + { + "epoch": 0.05605314492339899, + "grad_norm": 2.095613718032837, + "learning_rate": 4.961345930072848e-05, + "loss": 5.2389, + "step": 9425 + }, + { + "epoch": 0.05605909220667999, + "grad_norm": 2.2495081424713135, + "learning_rate": 4.9613377475228186e-05, + "loss": 5.474, + "step": 9426 + }, + { + "epoch": 0.056065039489960986, + "grad_norm": 2.282697916030884, + "learning_rate": 4.961329564113562e-05, + "loss": 5.3253, + "step": 9427 + }, + { + "epoch": 0.05607098677324198, + "grad_norm": 2.515075206756592, + "learning_rate": 4.96132137984508e-05, + "loss": 5.238, + "step": 9428 + }, + { + "epoch": 0.05607693405652298, + "grad_norm": 2.072274684906006, + "learning_rate": 4.961313194717376e-05, + "loss": 5.3627, + "step": 9429 + }, + { + "epoch": 0.05608288133980398, + "grad_norm": 2.4552547931671143, + "learning_rate": 4.961305008730454e-05, + "loss": 6.1799, + "step": 9430 + }, + { + "epoch": 0.05608882862308497, + "grad_norm": 2.2289538383483887, + "learning_rate": 4.9612968218843146e-05, + "loss": 5.5477, + "step": 9431 + }, + { + "epoch": 0.056094775906365975, + "grad_norm": 2.6174185276031494, + "learning_rate": 4.9612886341789635e-05, + "loss": 5.1779, + "step": 9432 + }, + { + "epoch": 0.05610072318964697, + "grad_norm": 2.4489150047302246, + "learning_rate": 4.9612804456144005e-05, + "loss": 5.2067, + "step": 9433 + }, + { + "epoch": 0.056106670472927965, + "grad_norm": 2.2651829719543457, + "learning_rate": 4.96127225619063e-05, + "loss": 5.3582, + "step": 9434 + }, + { + "epoch": 0.05611261775620897, + "grad_norm": 2.1985251903533936, + "learning_rate": 4.9612640659076556e-05, + "loss": 5.2034, + "step": 9435 + }, + { + "epoch": 0.05611856503948996, + "grad_norm": 1.9510128498077393, + "learning_rate": 4.961255874765479e-05, + "loss": 5.2263, + "step": 9436 + }, + { + "epoch": 0.05612451232277096, + "grad_norm": 2.338815212249756, + "learning_rate": 4.961247682764104e-05, + "loss": 5.9091, + "step": 9437 + }, + { + "epoch": 0.05613045960605195, + "grad_norm": 2.097111225128174, + "learning_rate": 4.961239489903532e-05, + "loss": 6.3285, + "step": 9438 + }, + { + "epoch": 0.056136406889332954, + "grad_norm": 1.9965720176696777, + "learning_rate": 4.961231296183767e-05, + "loss": 6.3141, + "step": 9439 + }, + { + "epoch": 0.05614235417261395, + "grad_norm": 2.2406206130981445, + "learning_rate": 4.9612231016048114e-05, + "loss": 5.7335, + "step": 9440 + }, + { + "epoch": 0.056148301455894944, + "grad_norm": 2.2798993587493896, + "learning_rate": 4.961214906166668e-05, + "loss": 4.9959, + "step": 9441 + }, + { + "epoch": 0.056154248739175947, + "grad_norm": 2.482706069946289, + "learning_rate": 4.96120670986934e-05, + "loss": 5.295, + "step": 9442 + }, + { + "epoch": 0.05616019602245694, + "grad_norm": 2.398867607116699, + "learning_rate": 4.961198512712831e-05, + "loss": 4.9592, + "step": 9443 + }, + { + "epoch": 0.05616614330573794, + "grad_norm": 2.1979055404663086, + "learning_rate": 4.961190314697143e-05, + "loss": 5.1003, + "step": 9444 + }, + { + "epoch": 0.05617209058901894, + "grad_norm": 2.3249244689941406, + "learning_rate": 4.961182115822278e-05, + "loss": 5.1408, + "step": 9445 + }, + { + "epoch": 0.056178037872299934, + "grad_norm": 2.3679821491241455, + "learning_rate": 4.96117391608824e-05, + "loss": 5.4006, + "step": 9446 + }, + { + "epoch": 0.05618398515558093, + "grad_norm": 1.8706363439559937, + "learning_rate": 4.961165715495032e-05, + "loss": 6.1741, + "step": 9447 + }, + { + "epoch": 0.05618993243886193, + "grad_norm": 2.1825344562530518, + "learning_rate": 4.961157514042656e-05, + "loss": 6.0869, + "step": 9448 + }, + { + "epoch": 0.056195879722142926, + "grad_norm": 1.85076904296875, + "learning_rate": 4.961149311731116e-05, + "loss": 5.9252, + "step": 9449 + }, + { + "epoch": 0.05620182700542392, + "grad_norm": 1.9433631896972656, + "learning_rate": 4.961141108560413e-05, + "loss": 5.968, + "step": 9450 + }, + { + "epoch": 0.056207774288704916, + "grad_norm": 2.5718259811401367, + "learning_rate": 4.961132904530552e-05, + "loss": 5.4274, + "step": 9451 + }, + { + "epoch": 0.05621372157198592, + "grad_norm": 1.919552206993103, + "learning_rate": 4.961124699641535e-05, + "loss": 5.1943, + "step": 9452 + }, + { + "epoch": 0.05621966885526691, + "grad_norm": 2.1371817588806152, + "learning_rate": 4.961116493893364e-05, + "loss": 5.9949, + "step": 9453 + }, + { + "epoch": 0.05622561613854791, + "grad_norm": 2.5715489387512207, + "learning_rate": 4.961108287286044e-05, + "loss": 6.2061, + "step": 9454 + }, + { + "epoch": 0.05623156342182891, + "grad_norm": 2.1871471405029297, + "learning_rate": 4.961100079819575e-05, + "loss": 5.7872, + "step": 9455 + }, + { + "epoch": 0.056237510705109905, + "grad_norm": 2.011925220489502, + "learning_rate": 4.961091871493962e-05, + "loss": 5.7992, + "step": 9456 + }, + { + "epoch": 0.0562434579883909, + "grad_norm": 2.516580820083618, + "learning_rate": 4.9610836623092074e-05, + "loss": 5.9154, + "step": 9457 + }, + { + "epoch": 0.0562494052716719, + "grad_norm": 1.9336326122283936, + "learning_rate": 4.961075452265314e-05, + "loss": 5.7933, + "step": 9458 + }, + { + "epoch": 0.0562553525549529, + "grad_norm": 1.8404059410095215, + "learning_rate": 4.961067241362285e-05, + "loss": 6.1897, + "step": 9459 + }, + { + "epoch": 0.05626129983823389, + "grad_norm": 1.9757578372955322, + "learning_rate": 4.961059029600122e-05, + "loss": 6.0909, + "step": 9460 + }, + { + "epoch": 0.056267247121514895, + "grad_norm": 1.9767241477966309, + "learning_rate": 4.9610508169788294e-05, + "loss": 6.2212, + "step": 9461 + }, + { + "epoch": 0.05627319440479589, + "grad_norm": 1.9890403747558594, + "learning_rate": 4.961042603498409e-05, + "loss": 6.5071, + "step": 9462 + }, + { + "epoch": 0.056279141688076885, + "grad_norm": 1.9011937379837036, + "learning_rate": 4.961034389158864e-05, + "loss": 5.8098, + "step": 9463 + }, + { + "epoch": 0.05628508897135789, + "grad_norm": 2.236356735229492, + "learning_rate": 4.961026173960197e-05, + "loss": 4.8901, + "step": 9464 + }, + { + "epoch": 0.05629103625463888, + "grad_norm": 1.9147372245788574, + "learning_rate": 4.961017957902412e-05, + "loss": 5.1372, + "step": 9465 + }, + { + "epoch": 0.05629698353791988, + "grad_norm": 1.9628163576126099, + "learning_rate": 4.9610097409855106e-05, + "loss": 5.1161, + "step": 9466 + }, + { + "epoch": 0.05630293082120087, + "grad_norm": 2.0323991775512695, + "learning_rate": 4.961001523209496e-05, + "loss": 5.1493, + "step": 9467 + }, + { + "epoch": 0.056308878104481874, + "grad_norm": 1.7026360034942627, + "learning_rate": 4.9609933045743714e-05, + "loss": 5.2349, + "step": 9468 + }, + { + "epoch": 0.05631482538776287, + "grad_norm": 1.7758761644363403, + "learning_rate": 4.9609850850801394e-05, + "loss": 5.231, + "step": 9469 + }, + { + "epoch": 0.056320772671043864, + "grad_norm": 2.3305037021636963, + "learning_rate": 4.9609768647268026e-05, + "loss": 5.9209, + "step": 9470 + }, + { + "epoch": 0.056326719954324866, + "grad_norm": 2.2628681659698486, + "learning_rate": 4.960968643514365e-05, + "loss": 5.4753, + "step": 9471 + }, + { + "epoch": 0.05633266723760586, + "grad_norm": 2.4022347927093506, + "learning_rate": 4.9609604214428286e-05, + "loss": 4.8414, + "step": 9472 + }, + { + "epoch": 0.05633861452088686, + "grad_norm": 2.2767343521118164, + "learning_rate": 4.9609521985121955e-05, + "loss": 4.7178, + "step": 9473 + }, + { + "epoch": 0.05634456180416786, + "grad_norm": 2.547600507736206, + "learning_rate": 4.96094397472247e-05, + "loss": 4.7365, + "step": 9474 + }, + { + "epoch": 0.056350509087448854, + "grad_norm": 2.3546998500823975, + "learning_rate": 4.960935750073654e-05, + "loss": 5.4846, + "step": 9475 + }, + { + "epoch": 0.05635645637072985, + "grad_norm": 2.9641268253326416, + "learning_rate": 4.960927524565751e-05, + "loss": 5.7409, + "step": 9476 + }, + { + "epoch": 0.05636240365401085, + "grad_norm": 3.1727824211120605, + "learning_rate": 4.960919298198764e-05, + "loss": 5.8456, + "step": 9477 + }, + { + "epoch": 0.056368350937291846, + "grad_norm": 2.620507001876831, + "learning_rate": 4.960911070972695e-05, + "loss": 5.6295, + "step": 9478 + }, + { + "epoch": 0.05637429822057284, + "grad_norm": 2.6132571697235107, + "learning_rate": 4.960902842887548e-05, + "loss": 5.697, + "step": 9479 + }, + { + "epoch": 0.056380245503853836, + "grad_norm": 2.2931299209594727, + "learning_rate": 4.960894613943324e-05, + "loss": 5.4723, + "step": 9480 + }, + { + "epoch": 0.05638619278713484, + "grad_norm": 2.176729202270508, + "learning_rate": 4.9608863841400284e-05, + "loss": 5.7403, + "step": 9481 + }, + { + "epoch": 0.05639214007041583, + "grad_norm": 1.932180404663086, + "learning_rate": 4.9608781534776616e-05, + "loss": 5.9256, + "step": 9482 + }, + { + "epoch": 0.05639808735369683, + "grad_norm": 1.7315243482589722, + "learning_rate": 4.9608699219562286e-05, + "loss": 5.9176, + "step": 9483 + }, + { + "epoch": 0.05640403463697783, + "grad_norm": 1.6548408269882202, + "learning_rate": 4.9608616895757306e-05, + "loss": 5.7495, + "step": 9484 + }, + { + "epoch": 0.056409981920258825, + "grad_norm": 1.8549202680587769, + "learning_rate": 4.960853456336172e-05, + "loss": 5.5261, + "step": 9485 + }, + { + "epoch": 0.05641592920353982, + "grad_norm": 2.5990993976593018, + "learning_rate": 4.9608452222375544e-05, + "loss": 5.5934, + "step": 9486 + }, + { + "epoch": 0.05642187648682082, + "grad_norm": 1.705051302909851, + "learning_rate": 4.9608369872798815e-05, + "loss": 5.3613, + "step": 9487 + }, + { + "epoch": 0.05642782377010182, + "grad_norm": 1.6170406341552734, + "learning_rate": 4.960828751463156e-05, + "loss": 5.2743, + "step": 9488 + }, + { + "epoch": 0.05643377105338281, + "grad_norm": 1.6247482299804688, + "learning_rate": 4.9608205147873796e-05, + "loss": 5.2772, + "step": 9489 + }, + { + "epoch": 0.056439718336663815, + "grad_norm": 1.7574137449264526, + "learning_rate": 4.9608122772525575e-05, + "loss": 5.3464, + "step": 9490 + }, + { + "epoch": 0.05644566561994481, + "grad_norm": 1.8814537525177002, + "learning_rate": 4.960804038858691e-05, + "loss": 5.3092, + "step": 9491 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 2.0222842693328857, + "learning_rate": 4.9607957996057816e-05, + "loss": 4.8234, + "step": 9492 + }, + { + "epoch": 0.05645756018650681, + "grad_norm": 1.6224759817123413, + "learning_rate": 4.960787559493836e-05, + "loss": 5.3962, + "step": 9493 + }, + { + "epoch": 0.0564635074697878, + "grad_norm": 1.4097533226013184, + "learning_rate": 4.960779318522853e-05, + "loss": 5.8302, + "step": 9494 + }, + { + "epoch": 0.0564694547530688, + "grad_norm": 1.7296205759048462, + "learning_rate": 4.960771076692839e-05, + "loss": 5.5679, + "step": 9495 + }, + { + "epoch": 0.05647540203634979, + "grad_norm": 1.6300212144851685, + "learning_rate": 4.960762834003794e-05, + "loss": 5.4315, + "step": 9496 + }, + { + "epoch": 0.056481349319630794, + "grad_norm": 1.8587864637374878, + "learning_rate": 4.960754590455723e-05, + "loss": 5.5492, + "step": 9497 + }, + { + "epoch": 0.05648729660291179, + "grad_norm": 1.8136985301971436, + "learning_rate": 4.960746346048628e-05, + "loss": 5.6363, + "step": 9498 + }, + { + "epoch": 0.056493243886192784, + "grad_norm": 2.1277284622192383, + "learning_rate": 4.960738100782511e-05, + "loss": 5.593, + "step": 9499 + }, + { + "epoch": 0.056499191169473786, + "grad_norm": 2.0262863636016846, + "learning_rate": 4.960729854657377e-05, + "loss": 5.6396, + "step": 9500 + }, + { + "epoch": 0.05650513845275478, + "grad_norm": 1.7870309352874756, + "learning_rate": 4.9607216076732266e-05, + "loss": 5.6523, + "step": 9501 + }, + { + "epoch": 0.05651108573603578, + "grad_norm": 1.734782099723816, + "learning_rate": 4.9607133598300636e-05, + "loss": 5.5313, + "step": 9502 + }, + { + "epoch": 0.05651703301931678, + "grad_norm": 2.2485032081604004, + "learning_rate": 4.9607051111278914e-05, + "loss": 5.3814, + "step": 9503 + }, + { + "epoch": 0.056522980302597774, + "grad_norm": 1.5091774463653564, + "learning_rate": 4.9606968615667125e-05, + "loss": 5.5277, + "step": 9504 + }, + { + "epoch": 0.05652892758587877, + "grad_norm": 1.7117774486541748, + "learning_rate": 4.9606886111465303e-05, + "loss": 5.2649, + "step": 9505 + }, + { + "epoch": 0.05653487486915977, + "grad_norm": 1.7309353351593018, + "learning_rate": 4.960680359867346e-05, + "loss": 5.2276, + "step": 9506 + }, + { + "epoch": 0.056540822152440766, + "grad_norm": 1.7058963775634766, + "learning_rate": 4.960672107729164e-05, + "loss": 5.1848, + "step": 9507 + }, + { + "epoch": 0.05654676943572176, + "grad_norm": 1.7862296104431152, + "learning_rate": 4.960663854731987e-05, + "loss": 5.2424, + "step": 9508 + }, + { + "epoch": 0.05655271671900276, + "grad_norm": 1.8900794982910156, + "learning_rate": 4.960655600875818e-05, + "loss": 5.283, + "step": 9509 + }, + { + "epoch": 0.05655866400228376, + "grad_norm": 1.9991587400436401, + "learning_rate": 4.960647346160658e-05, + "loss": 5.3525, + "step": 9510 + }, + { + "epoch": 0.05656461128556475, + "grad_norm": 1.6889851093292236, + "learning_rate": 4.960639090586513e-05, + "loss": 5.0592, + "step": 9511 + }, + { + "epoch": 0.05657055856884575, + "grad_norm": 1.6314234733581543, + "learning_rate": 4.9606308341533844e-05, + "loss": 5.1733, + "step": 9512 + }, + { + "epoch": 0.05657650585212675, + "grad_norm": 1.7801847457885742, + "learning_rate": 4.960622576861275e-05, + "loss": 5.2358, + "step": 9513 + }, + { + "epoch": 0.056582453135407745, + "grad_norm": 1.6572017669677734, + "learning_rate": 4.9606143187101864e-05, + "loss": 5.2429, + "step": 9514 + }, + { + "epoch": 0.05658840041868874, + "grad_norm": 1.7574421167373657, + "learning_rate": 4.960606059700124e-05, + "loss": 5.0717, + "step": 9515 + }, + { + "epoch": 0.05659434770196974, + "grad_norm": 1.8162970542907715, + "learning_rate": 4.960597799831088e-05, + "loss": 5.1513, + "step": 9516 + }, + { + "epoch": 0.05660029498525074, + "grad_norm": 1.9231795072555542, + "learning_rate": 4.960589539103084e-05, + "loss": 5.1539, + "step": 9517 + }, + { + "epoch": 0.05660624226853173, + "grad_norm": 1.624566674232483, + "learning_rate": 4.9605812775161136e-05, + "loss": 5.0999, + "step": 9518 + }, + { + "epoch": 0.056612189551812735, + "grad_norm": 1.4293668270111084, + "learning_rate": 4.960573015070179e-05, + "loss": 5.2365, + "step": 9519 + }, + { + "epoch": 0.05661813683509373, + "grad_norm": 1.789515495300293, + "learning_rate": 4.960564751765284e-05, + "loss": 5.2233, + "step": 9520 + }, + { + "epoch": 0.056624084118374725, + "grad_norm": 1.7212306261062622, + "learning_rate": 4.960556487601432e-05, + "loss": 5.1902, + "step": 9521 + }, + { + "epoch": 0.05663003140165573, + "grad_norm": 1.7691519260406494, + "learning_rate": 4.960548222578625e-05, + "loss": 5.2136, + "step": 9522 + }, + { + "epoch": 0.05663597868493672, + "grad_norm": 1.5925794839859009, + "learning_rate": 4.960539956696866e-05, + "loss": 5.4808, + "step": 9523 + }, + { + "epoch": 0.05664192596821772, + "grad_norm": 1.7014095783233643, + "learning_rate": 4.960531689956157e-05, + "loss": 5.1934, + "step": 9524 + }, + { + "epoch": 0.05664787325149871, + "grad_norm": 1.3620802164077759, + "learning_rate": 4.960523422356502e-05, + "loss": 5.0169, + "step": 9525 + }, + { + "epoch": 0.056653820534779714, + "grad_norm": 1.4778205156326294, + "learning_rate": 4.960515153897904e-05, + "loss": 5.1535, + "step": 9526 + }, + { + "epoch": 0.05665976781806071, + "grad_norm": 1.6393300294876099, + "learning_rate": 4.960506884580366e-05, + "loss": 5.2494, + "step": 9527 + }, + { + "epoch": 0.056665715101341704, + "grad_norm": 1.6070711612701416, + "learning_rate": 4.96049861440389e-05, + "loss": 5.3117, + "step": 9528 + }, + { + "epoch": 0.056671662384622706, + "grad_norm": 1.6023461818695068, + "learning_rate": 4.96049034336848e-05, + "loss": 5.1554, + "step": 9529 + }, + { + "epoch": 0.0566776096679037, + "grad_norm": 1.6061514616012573, + "learning_rate": 4.9604820714741374e-05, + "loss": 5.4123, + "step": 9530 + }, + { + "epoch": 0.056683556951184697, + "grad_norm": 1.8043792247772217, + "learning_rate": 4.960473798720866e-05, + "loss": 5.2582, + "step": 9531 + }, + { + "epoch": 0.0566895042344657, + "grad_norm": 1.6002432107925415, + "learning_rate": 4.960465525108669e-05, + "loss": 5.211, + "step": 9532 + }, + { + "epoch": 0.056695451517746694, + "grad_norm": 1.851266622543335, + "learning_rate": 4.960457250637549e-05, + "loss": 5.0949, + "step": 9533 + }, + { + "epoch": 0.05670139880102769, + "grad_norm": 1.7806520462036133, + "learning_rate": 4.9604489753075085e-05, + "loss": 5.1178, + "step": 9534 + }, + { + "epoch": 0.05670734608430869, + "grad_norm": 1.9938620328903198, + "learning_rate": 4.9604406991185506e-05, + "loss": 5.098, + "step": 9535 + }, + { + "epoch": 0.056713293367589686, + "grad_norm": 1.7983622550964355, + "learning_rate": 4.960432422070679e-05, + "loss": 4.98, + "step": 9536 + }, + { + "epoch": 0.05671924065087068, + "grad_norm": 1.845821499824524, + "learning_rate": 4.960424144163895e-05, + "loss": 4.951, + "step": 9537 + }, + { + "epoch": 0.05672518793415168, + "grad_norm": 1.8922109603881836, + "learning_rate": 4.960415865398202e-05, + "loss": 5.0327, + "step": 9538 + }, + { + "epoch": 0.05673113521743268, + "grad_norm": 2.159832239151001, + "learning_rate": 4.960407585773604e-05, + "loss": 5.5287, + "step": 9539 + }, + { + "epoch": 0.05673708250071367, + "grad_norm": 1.9966739416122437, + "learning_rate": 4.960399305290103e-05, + "loss": 5.7114, + "step": 9540 + }, + { + "epoch": 0.05674302978399467, + "grad_norm": 1.8796072006225586, + "learning_rate": 4.9603910239477026e-05, + "loss": 5.4673, + "step": 9541 + }, + { + "epoch": 0.05674897706727567, + "grad_norm": 1.6589174270629883, + "learning_rate": 4.9603827417464045e-05, + "loss": 5.3755, + "step": 9542 + }, + { + "epoch": 0.056754924350556665, + "grad_norm": 1.975807547569275, + "learning_rate": 4.960374458686212e-05, + "loss": 5.0648, + "step": 9543 + }, + { + "epoch": 0.05676087163383766, + "grad_norm": 1.7437241077423096, + "learning_rate": 4.960366174767128e-05, + "loss": 5.2338, + "step": 9544 + }, + { + "epoch": 0.05676681891711866, + "grad_norm": 1.8508884906768799, + "learning_rate": 4.9603578899891564e-05, + "loss": 5.3432, + "step": 9545 + }, + { + "epoch": 0.05677276620039966, + "grad_norm": 2.2117562294006348, + "learning_rate": 4.960349604352299e-05, + "loss": 5.0623, + "step": 9546 + }, + { + "epoch": 0.05677871348368065, + "grad_norm": 1.7681034803390503, + "learning_rate": 4.9603413178565586e-05, + "loss": 5.1998, + "step": 9547 + }, + { + "epoch": 0.056784660766961655, + "grad_norm": 2.4477179050445557, + "learning_rate": 4.960333030501939e-05, + "loss": 5.3317, + "step": 9548 + }, + { + "epoch": 0.05679060805024265, + "grad_norm": 1.8297652006149292, + "learning_rate": 4.9603247422884426e-05, + "loss": 5.3608, + "step": 9549 + }, + { + "epoch": 0.056796555333523645, + "grad_norm": 1.8361153602600098, + "learning_rate": 4.9603164532160715e-05, + "loss": 5.3914, + "step": 9550 + }, + { + "epoch": 0.05680250261680465, + "grad_norm": 1.748226523399353, + "learning_rate": 4.96030816328483e-05, + "loss": 5.3436, + "step": 9551 + }, + { + "epoch": 0.05680844990008564, + "grad_norm": 1.744964599609375, + "learning_rate": 4.96029987249472e-05, + "loss": 5.4287, + "step": 9552 + }, + { + "epoch": 0.05681439718336664, + "grad_norm": 1.9512866735458374, + "learning_rate": 4.9602915808457454e-05, + "loss": 5.3601, + "step": 9553 + }, + { + "epoch": 0.05682034446664763, + "grad_norm": 1.5863629579544067, + "learning_rate": 4.9602832883379077e-05, + "loss": 5.5491, + "step": 9554 + }, + { + "epoch": 0.056826291749928634, + "grad_norm": 1.967677354812622, + "learning_rate": 4.96027499497121e-05, + "loss": 5.2402, + "step": 9555 + }, + { + "epoch": 0.05683223903320963, + "grad_norm": 2.277714252471924, + "learning_rate": 4.960266700745657e-05, + "loss": 5.5155, + "step": 9556 + }, + { + "epoch": 0.056838186316490624, + "grad_norm": 1.8371034860610962, + "learning_rate": 4.96025840566125e-05, + "loss": 5.2694, + "step": 9557 + }, + { + "epoch": 0.056844133599771626, + "grad_norm": 1.723008155822754, + "learning_rate": 4.9602501097179915e-05, + "loss": 5.4983, + "step": 9558 + }, + { + "epoch": 0.05685008088305262, + "grad_norm": 1.6955413818359375, + "learning_rate": 4.960241812915886e-05, + "loss": 5.6888, + "step": 9559 + }, + { + "epoch": 0.056856028166333616, + "grad_norm": 1.5899012088775635, + "learning_rate": 4.960233515254935e-05, + "loss": 5.4241, + "step": 9560 + }, + { + "epoch": 0.05686197544961462, + "grad_norm": 1.493268370628357, + "learning_rate": 4.9602252167351416e-05, + "loss": 5.1889, + "step": 9561 + }, + { + "epoch": 0.056867922732895614, + "grad_norm": 1.8037081956863403, + "learning_rate": 4.9602169173565094e-05, + "loss": 5.1785, + "step": 9562 + }, + { + "epoch": 0.05687387001617661, + "grad_norm": 1.6377664804458618, + "learning_rate": 4.960208617119041e-05, + "loss": 5.2593, + "step": 9563 + }, + { + "epoch": 0.05687981729945761, + "grad_norm": 2.077209234237671, + "learning_rate": 4.960200316022739e-05, + "loss": 5.1012, + "step": 9564 + }, + { + "epoch": 0.056885764582738606, + "grad_norm": 2.3584885597229004, + "learning_rate": 4.9601920140676064e-05, + "loss": 5.1141, + "step": 9565 + }, + { + "epoch": 0.0568917118660196, + "grad_norm": 1.990319013595581, + "learning_rate": 4.960183711253646e-05, + "loss": 4.9336, + "step": 9566 + }, + { + "epoch": 0.0568976591493006, + "grad_norm": 2.037742853164673, + "learning_rate": 4.960175407580861e-05, + "loss": 4.8494, + "step": 9567 + }, + { + "epoch": 0.0569036064325816, + "grad_norm": 1.8493839502334595, + "learning_rate": 4.9601671030492546e-05, + "loss": 5.337, + "step": 9568 + }, + { + "epoch": 0.05690955371586259, + "grad_norm": 1.9864604473114014, + "learning_rate": 4.960158797658829e-05, + "loss": 5.5684, + "step": 9569 + }, + { + "epoch": 0.05691550099914359, + "grad_norm": 1.9740629196166992, + "learning_rate": 4.960150491409587e-05, + "loss": 5.444, + "step": 9570 + }, + { + "epoch": 0.05692144828242459, + "grad_norm": 1.9429807662963867, + "learning_rate": 4.960142184301533e-05, + "loss": 5.277, + "step": 9571 + }, + { + "epoch": 0.056927395565705585, + "grad_norm": 1.8953512907028198, + "learning_rate": 4.960133876334668e-05, + "loss": 5.1694, + "step": 9572 + }, + { + "epoch": 0.05693334284898658, + "grad_norm": 1.7716888189315796, + "learning_rate": 4.960125567508996e-05, + "loss": 5.1383, + "step": 9573 + }, + { + "epoch": 0.05693929013226758, + "grad_norm": 1.8266246318817139, + "learning_rate": 4.9601172578245194e-05, + "loss": 5.4019, + "step": 9574 + }, + { + "epoch": 0.05694523741554858, + "grad_norm": 1.8929648399353027, + "learning_rate": 4.9601089472812414e-05, + "loss": 5.3948, + "step": 9575 + }, + { + "epoch": 0.05695118469882957, + "grad_norm": 1.9918208122253418, + "learning_rate": 4.960100635879165e-05, + "loss": 5.3195, + "step": 9576 + }, + { + "epoch": 0.056957131982110575, + "grad_norm": 1.4987989664077759, + "learning_rate": 4.960092323618292e-05, + "loss": 5.5292, + "step": 9577 + }, + { + "epoch": 0.05696307926539157, + "grad_norm": 1.683800220489502, + "learning_rate": 4.960084010498627e-05, + "loss": 5.5069, + "step": 9578 + }, + { + "epoch": 0.056969026548672565, + "grad_norm": 1.767561435699463, + "learning_rate": 4.960075696520171e-05, + "loss": 5.4134, + "step": 9579 + }, + { + "epoch": 0.05697497383195357, + "grad_norm": 2.077564239501953, + "learning_rate": 4.960067381682929e-05, + "loss": 5.3362, + "step": 9580 + }, + { + "epoch": 0.05698092111523456, + "grad_norm": 2.0167109966278076, + "learning_rate": 4.960059065986903e-05, + "loss": 5.4235, + "step": 9581 + }, + { + "epoch": 0.05698686839851556, + "grad_norm": 1.647669792175293, + "learning_rate": 4.9600507494320953e-05, + "loss": 5.3273, + "step": 9582 + }, + { + "epoch": 0.05699281568179655, + "grad_norm": 1.6051719188690186, + "learning_rate": 4.960042432018509e-05, + "loss": 5.2486, + "step": 9583 + }, + { + "epoch": 0.056998762965077554, + "grad_norm": 1.9283394813537598, + "learning_rate": 4.960034113746148e-05, + "loss": 5.233, + "step": 9584 + }, + { + "epoch": 0.05700471024835855, + "grad_norm": 1.6215802431106567, + "learning_rate": 4.960025794615014e-05, + "loss": 5.2322, + "step": 9585 + }, + { + "epoch": 0.057010657531639544, + "grad_norm": 1.8902918100357056, + "learning_rate": 4.960017474625111e-05, + "loss": 5.063, + "step": 9586 + }, + { + "epoch": 0.057016604814920546, + "grad_norm": 2.4694666862487793, + "learning_rate": 4.9600091537764415e-05, + "loss": 4.498, + "step": 9587 + }, + { + "epoch": 0.05702255209820154, + "grad_norm": 1.98915433883667, + "learning_rate": 4.960000832069007e-05, + "loss": 4.8781, + "step": 9588 + }, + { + "epoch": 0.057028499381482536, + "grad_norm": 2.0424818992614746, + "learning_rate": 4.9599925095028126e-05, + "loss": 5.5803, + "step": 9589 + }, + { + "epoch": 0.05703444666476354, + "grad_norm": 1.471275806427002, + "learning_rate": 4.95998418607786e-05, + "loss": 5.5604, + "step": 9590 + }, + { + "epoch": 0.057040393948044534, + "grad_norm": 1.6512761116027832, + "learning_rate": 4.959975861794152e-05, + "loss": 5.2147, + "step": 9591 + }, + { + "epoch": 0.05704634123132553, + "grad_norm": 1.6902865171432495, + "learning_rate": 4.959967536651693e-05, + "loss": 5.2654, + "step": 9592 + }, + { + "epoch": 0.05705228851460653, + "grad_norm": 1.5656665563583374, + "learning_rate": 4.9599592106504835e-05, + "loss": 5.1106, + "step": 9593 + }, + { + "epoch": 0.057058235797887526, + "grad_norm": 1.760901927947998, + "learning_rate": 4.959950883790528e-05, + "loss": 5.1833, + "step": 9594 + }, + { + "epoch": 0.05706418308116852, + "grad_norm": 1.5585325956344604, + "learning_rate": 4.9599425560718294e-05, + "loss": 5.202, + "step": 9595 + }, + { + "epoch": 0.05707013036444952, + "grad_norm": 1.5477479696273804, + "learning_rate": 4.959934227494389e-05, + "loss": 5.121, + "step": 9596 + }, + { + "epoch": 0.05707607764773052, + "grad_norm": 1.9299825429916382, + "learning_rate": 4.959925898058213e-05, + "loss": 5.0026, + "step": 9597 + }, + { + "epoch": 0.05708202493101151, + "grad_norm": 1.866237759590149, + "learning_rate": 4.959917567763301e-05, + "loss": 4.999, + "step": 9598 + }, + { + "epoch": 0.05708797221429251, + "grad_norm": 1.6670162677764893, + "learning_rate": 4.959909236609657e-05, + "loss": 5.4047, + "step": 9599 + }, + { + "epoch": 0.05709391949757351, + "grad_norm": 1.4666836261749268, + "learning_rate": 4.9599009045972844e-05, + "loss": 5.3598, + "step": 9600 + }, + { + "epoch": 0.057099866780854505, + "grad_norm": 1.928645372390747, + "learning_rate": 4.959892571726186e-05, + "loss": 5.7015, + "step": 9601 + }, + { + "epoch": 0.0571058140641355, + "grad_norm": 1.9761322736740112, + "learning_rate": 4.959884237996365e-05, + "loss": 4.8682, + "step": 9602 + }, + { + "epoch": 0.0571117613474165, + "grad_norm": 1.9823036193847656, + "learning_rate": 4.959875903407823e-05, + "loss": 4.8752, + "step": 9603 + }, + { + "epoch": 0.0571177086306975, + "grad_norm": 1.9242253303527832, + "learning_rate": 4.959867567960564e-05, + "loss": 4.9314, + "step": 9604 + }, + { + "epoch": 0.05712365591397849, + "grad_norm": 1.740980625152588, + "learning_rate": 4.9598592316545904e-05, + "loss": 4.9843, + "step": 9605 + }, + { + "epoch": 0.057129603197259494, + "grad_norm": 2.0768508911132812, + "learning_rate": 4.959850894489906e-05, + "loss": 4.8528, + "step": 9606 + }, + { + "epoch": 0.05713555048054049, + "grad_norm": 1.7417833805084229, + "learning_rate": 4.959842556466513e-05, + "loss": 5.1374, + "step": 9607 + }, + { + "epoch": 0.057141497763821485, + "grad_norm": 1.933691382408142, + "learning_rate": 4.959834217584414e-05, + "loss": 5.349, + "step": 9608 + }, + { + "epoch": 0.05714744504710249, + "grad_norm": 1.8035194873809814, + "learning_rate": 4.959825877843612e-05, + "loss": 5.0212, + "step": 9609 + }, + { + "epoch": 0.05715339233038348, + "grad_norm": 2.323709487915039, + "learning_rate": 4.9598175372441106e-05, + "loss": 5.5346, + "step": 9610 + }, + { + "epoch": 0.05715933961366448, + "grad_norm": 1.755983591079712, + "learning_rate": 4.959809195785912e-05, + "loss": 4.8425, + "step": 9611 + }, + { + "epoch": 0.05716528689694547, + "grad_norm": 1.6614432334899902, + "learning_rate": 4.95980085346902e-05, + "loss": 4.912, + "step": 9612 + }, + { + "epoch": 0.057171234180226474, + "grad_norm": 1.8319662809371948, + "learning_rate": 4.959792510293436e-05, + "loss": 5.0125, + "step": 9613 + }, + { + "epoch": 0.05717718146350747, + "grad_norm": 1.8528090715408325, + "learning_rate": 4.959784166259165e-05, + "loss": 4.898, + "step": 9614 + }, + { + "epoch": 0.057183128746788464, + "grad_norm": 2.163757562637329, + "learning_rate": 4.959775821366208e-05, + "loss": 5.2041, + "step": 9615 + }, + { + "epoch": 0.057189076030069466, + "grad_norm": 1.939430832862854, + "learning_rate": 4.959767475614569e-05, + "loss": 5.3337, + "step": 9616 + }, + { + "epoch": 0.05719502331335046, + "grad_norm": 1.7198511362075806, + "learning_rate": 4.959759129004251e-05, + "loss": 5.2682, + "step": 9617 + }, + { + "epoch": 0.057200970596631456, + "grad_norm": 1.7674570083618164, + "learning_rate": 4.959750781535255e-05, + "loss": 5.4188, + "step": 9618 + }, + { + "epoch": 0.05720691787991246, + "grad_norm": 1.7197433710098267, + "learning_rate": 4.959742433207587e-05, + "loss": 5.1725, + "step": 9619 + }, + { + "epoch": 0.05721286516319345, + "grad_norm": 1.6682969331741333, + "learning_rate": 4.959734084021248e-05, + "loss": 5.1349, + "step": 9620 + }, + { + "epoch": 0.05721881244647445, + "grad_norm": 1.3784568309783936, + "learning_rate": 4.959725733976241e-05, + "loss": 5.2408, + "step": 9621 + }, + { + "epoch": 0.05722475972975545, + "grad_norm": 1.690483808517456, + "learning_rate": 4.9597173830725686e-05, + "loss": 5.2616, + "step": 9622 + }, + { + "epoch": 0.057230707013036446, + "grad_norm": 1.5313903093338013, + "learning_rate": 4.959709031310235e-05, + "loss": 5.1481, + "step": 9623 + }, + { + "epoch": 0.05723665429631744, + "grad_norm": 1.6266121864318848, + "learning_rate": 4.959700678689242e-05, + "loss": 5.0192, + "step": 9624 + }, + { + "epoch": 0.05724260157959844, + "grad_norm": 2.3125410079956055, + "learning_rate": 4.959692325209593e-05, + "loss": 4.5513, + "step": 9625 + }, + { + "epoch": 0.05724854886287944, + "grad_norm": 1.6884924173355103, + "learning_rate": 4.9596839708712913e-05, + "loss": 5.1917, + "step": 9626 + }, + { + "epoch": 0.05725449614616043, + "grad_norm": 1.5797723531723022, + "learning_rate": 4.9596756156743385e-05, + "loss": 5.5674, + "step": 9627 + }, + { + "epoch": 0.05726044342944143, + "grad_norm": 1.6152269840240479, + "learning_rate": 4.959667259618739e-05, + "loss": 5.4566, + "step": 9628 + }, + { + "epoch": 0.05726639071272243, + "grad_norm": 1.611608624458313, + "learning_rate": 4.959658902704495e-05, + "loss": 5.3678, + "step": 9629 + }, + { + "epoch": 0.057272337996003425, + "grad_norm": 1.774327278137207, + "learning_rate": 4.9596505449316086e-05, + "loss": 5.2438, + "step": 9630 + }, + { + "epoch": 0.05727828527928442, + "grad_norm": 1.7961443662643433, + "learning_rate": 4.9596421863000856e-05, + "loss": 5.3061, + "step": 9631 + }, + { + "epoch": 0.05728423256256542, + "grad_norm": 1.709675669670105, + "learning_rate": 4.959633826809925e-05, + "loss": 5.0095, + "step": 9632 + }, + { + "epoch": 0.05729017984584642, + "grad_norm": 1.7140734195709229, + "learning_rate": 4.959625466461132e-05, + "loss": 5.313, + "step": 9633 + }, + { + "epoch": 0.05729612712912741, + "grad_norm": 1.8302016258239746, + "learning_rate": 4.95961710525371e-05, + "loss": 5.4008, + "step": 9634 + }, + { + "epoch": 0.057302074412408414, + "grad_norm": 1.8570395708084106, + "learning_rate": 4.95960874318766e-05, + "loss": 5.513, + "step": 9635 + }, + { + "epoch": 0.05730802169568941, + "grad_norm": 1.6907027959823608, + "learning_rate": 4.959600380262987e-05, + "loss": 5.1933, + "step": 9636 + }, + { + "epoch": 0.057313968978970405, + "grad_norm": 1.6505299806594849, + "learning_rate": 4.9595920164796926e-05, + "loss": 5.1537, + "step": 9637 + }, + { + "epoch": 0.05731991626225141, + "grad_norm": 1.5248258113861084, + "learning_rate": 4.95958365183778e-05, + "loss": 5.4232, + "step": 9638 + }, + { + "epoch": 0.0573258635455324, + "grad_norm": 1.4630048274993896, + "learning_rate": 4.9595752863372524e-05, + "loss": 5.565, + "step": 9639 + }, + { + "epoch": 0.0573318108288134, + "grad_norm": 1.5858573913574219, + "learning_rate": 4.959566919978112e-05, + "loss": 5.4364, + "step": 9640 + }, + { + "epoch": 0.05733775811209439, + "grad_norm": 1.7803694009780884, + "learning_rate": 4.9595585527603625e-05, + "loss": 5.1727, + "step": 9641 + }, + { + "epoch": 0.057343705395375394, + "grad_norm": 1.639163851737976, + "learning_rate": 4.959550184684007e-05, + "loss": 5.5538, + "step": 9642 + }, + { + "epoch": 0.05734965267865639, + "grad_norm": 1.5917890071868896, + "learning_rate": 4.959541815749046e-05, + "loss": 5.6788, + "step": 9643 + }, + { + "epoch": 0.057355599961937384, + "grad_norm": 1.5524990558624268, + "learning_rate": 4.959533445955487e-05, + "loss": 5.7832, + "step": 9644 + }, + { + "epoch": 0.057361547245218386, + "grad_norm": 1.7229019403457642, + "learning_rate": 4.959525075303328e-05, + "loss": 5.4417, + "step": 9645 + }, + { + "epoch": 0.05736749452849938, + "grad_norm": 1.5434623956680298, + "learning_rate": 4.959516703792575e-05, + "loss": 5.3629, + "step": 9646 + }, + { + "epoch": 0.057373441811780376, + "grad_norm": 1.4929866790771484, + "learning_rate": 4.9595083314232306e-05, + "loss": 5.8586, + "step": 9647 + }, + { + "epoch": 0.05737938909506138, + "grad_norm": 1.209796667098999, + "learning_rate": 4.959499958195297e-05, + "loss": 5.5001, + "step": 9648 + }, + { + "epoch": 0.05738533637834237, + "grad_norm": 2.703871488571167, + "learning_rate": 4.9594915841087775e-05, + "loss": 5.6564, + "step": 9649 + }, + { + "epoch": 0.05739128366162337, + "grad_norm": 1.9408828020095825, + "learning_rate": 4.959483209163674e-05, + "loss": 5.6683, + "step": 9650 + }, + { + "epoch": 0.05739723094490437, + "grad_norm": 1.8055803775787354, + "learning_rate": 4.9594748333599914e-05, + "loss": 5.3046, + "step": 9651 + }, + { + "epoch": 0.057403178228185366, + "grad_norm": 2.3453104496002197, + "learning_rate": 4.959466456697731e-05, + "loss": 6.1944, + "step": 9652 + }, + { + "epoch": 0.05740912551146636, + "grad_norm": 2.3799800872802734, + "learning_rate": 4.959458079176897e-05, + "loss": 5.6706, + "step": 9653 + }, + { + "epoch": 0.05741507279474736, + "grad_norm": 2.111069440841675, + "learning_rate": 4.959449700797491e-05, + "loss": 5.1808, + "step": 9654 + }, + { + "epoch": 0.05742102007802836, + "grad_norm": 2.237873077392578, + "learning_rate": 4.9594413215595164e-05, + "loss": 5.0609, + "step": 9655 + }, + { + "epoch": 0.05742696736130935, + "grad_norm": 1.956520438194275, + "learning_rate": 4.959432941462977e-05, + "loss": 5.1431, + "step": 9656 + }, + { + "epoch": 0.05743291464459035, + "grad_norm": 2.3761603832244873, + "learning_rate": 4.9594245605078735e-05, + "loss": 4.8722, + "step": 9657 + }, + { + "epoch": 0.05743886192787135, + "grad_norm": 1.820745825767517, + "learning_rate": 4.959416178694212e-05, + "loss": 5.0149, + "step": 9658 + }, + { + "epoch": 0.057444809211152345, + "grad_norm": 2.0804755687713623, + "learning_rate": 4.9594077960219924e-05, + "loss": 5.7698, + "step": 9659 + }, + { + "epoch": 0.05745075649443334, + "grad_norm": 1.9319117069244385, + "learning_rate": 4.9593994124912196e-05, + "loss": 5.3054, + "step": 9660 + }, + { + "epoch": 0.05745670377771434, + "grad_norm": 2.386338472366333, + "learning_rate": 4.959391028101896e-05, + "loss": 5.2093, + "step": 9661 + }, + { + "epoch": 0.05746265106099534, + "grad_norm": 1.852386474609375, + "learning_rate": 4.9593826428540244e-05, + "loss": 5.1943, + "step": 9662 + }, + { + "epoch": 0.05746859834427633, + "grad_norm": 1.9619694948196411, + "learning_rate": 4.959374256747607e-05, + "loss": 4.8275, + "step": 9663 + }, + { + "epoch": 0.057474545627557334, + "grad_norm": 2.4797024726867676, + "learning_rate": 4.9593658697826485e-05, + "loss": 5.5257, + "step": 9664 + }, + { + "epoch": 0.05748049291083833, + "grad_norm": 2.1713874340057373, + "learning_rate": 4.959357481959149e-05, + "loss": 5.4486, + "step": 9665 + }, + { + "epoch": 0.057486440194119325, + "grad_norm": 1.9605398178100586, + "learning_rate": 4.9593490932771145e-05, + "loss": 5.1512, + "step": 9666 + }, + { + "epoch": 0.05749238747740033, + "grad_norm": 1.9853549003601074, + "learning_rate": 4.959340703736547e-05, + "loss": 5.665, + "step": 9667 + }, + { + "epoch": 0.05749833476068132, + "grad_norm": 1.984279990196228, + "learning_rate": 4.9593323133374494e-05, + "loss": 5.7797, + "step": 9668 + }, + { + "epoch": 0.05750428204396232, + "grad_norm": 1.8343236446380615, + "learning_rate": 4.9593239220798225e-05, + "loss": 5.0261, + "step": 9669 + }, + { + "epoch": 0.05751022932724331, + "grad_norm": 1.8675687313079834, + "learning_rate": 4.959315529963673e-05, + "loss": 4.8754, + "step": 9670 + }, + { + "epoch": 0.057516176610524314, + "grad_norm": 1.9129834175109863, + "learning_rate": 4.959307136989e-05, + "loss": 5.1056, + "step": 9671 + }, + { + "epoch": 0.05752212389380531, + "grad_norm": 3.142893075942993, + "learning_rate": 4.95929874315581e-05, + "loss": 5.6029, + "step": 9672 + }, + { + "epoch": 0.057528071177086304, + "grad_norm": 1.80843985080719, + "learning_rate": 4.9592903484641026e-05, + "loss": 5.57, + "step": 9673 + }, + { + "epoch": 0.057534018460367306, + "grad_norm": 1.9195841550827026, + "learning_rate": 4.9592819529138835e-05, + "loss": 5.6964, + "step": 9674 + }, + { + "epoch": 0.0575399657436483, + "grad_norm": 2.026477813720703, + "learning_rate": 4.959273556505154e-05, + "loss": 5.8544, + "step": 9675 + }, + { + "epoch": 0.057545913026929296, + "grad_norm": 2.111274003982544, + "learning_rate": 4.959265159237918e-05, + "loss": 5.8014, + "step": 9676 + }, + { + "epoch": 0.0575518603102103, + "grad_norm": 1.9789505004882812, + "learning_rate": 4.9592567611121776e-05, + "loss": 5.7646, + "step": 9677 + }, + { + "epoch": 0.05755780759349129, + "grad_norm": 1.8776015043258667, + "learning_rate": 4.9592483621279365e-05, + "loss": 6.1603, + "step": 9678 + }, + { + "epoch": 0.05756375487677229, + "grad_norm": 2.135849714279175, + "learning_rate": 4.9592399622851956e-05, + "loss": 5.6372, + "step": 9679 + }, + { + "epoch": 0.05756970216005329, + "grad_norm": 2.3335585594177246, + "learning_rate": 4.959231561583961e-05, + "loss": 5.5515, + "step": 9680 + }, + { + "epoch": 0.057575649443334286, + "grad_norm": 1.9315869808197021, + "learning_rate": 4.9592231600242337e-05, + "loss": 5.9287, + "step": 9681 + }, + { + "epoch": 0.05758159672661528, + "grad_norm": 2.4559311866760254, + "learning_rate": 4.959214757606017e-05, + "loss": 5.6079, + "step": 9682 + }, + { + "epoch": 0.05758754400989628, + "grad_norm": 2.6558609008789062, + "learning_rate": 4.959206354329314e-05, + "loss": 5.5728, + "step": 9683 + }, + { + "epoch": 0.05759349129317728, + "grad_norm": 2.2376396656036377, + "learning_rate": 4.9591979501941274e-05, + "loss": 5.5318, + "step": 9684 + }, + { + "epoch": 0.05759943857645827, + "grad_norm": 1.8506240844726562, + "learning_rate": 4.95918954520046e-05, + "loss": 5.7957, + "step": 9685 + }, + { + "epoch": 0.05760538585973927, + "grad_norm": 2.2428138256073, + "learning_rate": 4.9591811393483144e-05, + "loss": 5.7223, + "step": 9686 + }, + { + "epoch": 0.05761133314302027, + "grad_norm": 2.5734875202178955, + "learning_rate": 4.9591727326376955e-05, + "loss": 5.3401, + "step": 9687 + }, + { + "epoch": 0.057617280426301265, + "grad_norm": 2.567263126373291, + "learning_rate": 4.959164325068604e-05, + "loss": 5.4853, + "step": 9688 + }, + { + "epoch": 0.05762322770958226, + "grad_norm": 2.4430556297302246, + "learning_rate": 4.959155916641043e-05, + "loss": 5.9845, + "step": 9689 + }, + { + "epoch": 0.05762917499286326, + "grad_norm": 2.039846181869507, + "learning_rate": 4.959147507355017e-05, + "loss": 6.0689, + "step": 9690 + }, + { + "epoch": 0.05763512227614426, + "grad_norm": 2.207920551300049, + "learning_rate": 4.959139097210528e-05, + "loss": 5.6658, + "step": 9691 + }, + { + "epoch": 0.05764106955942525, + "grad_norm": 1.7421616315841675, + "learning_rate": 4.959130686207578e-05, + "loss": 6.0915, + "step": 9692 + }, + { + "epoch": 0.057647016842706254, + "grad_norm": 1.7738968133926392, + "learning_rate": 4.9591222743461716e-05, + "loss": 6.2092, + "step": 9693 + }, + { + "epoch": 0.05765296412598725, + "grad_norm": 1.8665943145751953, + "learning_rate": 4.959113861626311e-05, + "loss": 6.0922, + "step": 9694 + }, + { + "epoch": 0.057658911409268244, + "grad_norm": 2.0272347927093506, + "learning_rate": 4.959105448047999e-05, + "loss": 5.8291, + "step": 9695 + }, + { + "epoch": 0.057664858692549247, + "grad_norm": 2.8527796268463135, + "learning_rate": 4.9590970336112395e-05, + "loss": 5.428, + "step": 9696 + }, + { + "epoch": 0.05767080597583024, + "grad_norm": 1.8518950939178467, + "learning_rate": 4.959088618316033e-05, + "loss": 5.4199, + "step": 9697 + }, + { + "epoch": 0.05767675325911124, + "grad_norm": 2.38712739944458, + "learning_rate": 4.959080202162386e-05, + "loss": 5.1627, + "step": 9698 + }, + { + "epoch": 0.05768270054239223, + "grad_norm": 1.8407059907913208, + "learning_rate": 4.959071785150298e-05, + "loss": 5.1827, + "step": 9699 + }, + { + "epoch": 0.057688647825673234, + "grad_norm": 2.431151866912842, + "learning_rate": 4.9590633672797744e-05, + "loss": 6.1722, + "step": 9700 + }, + { + "epoch": 0.05769459510895423, + "grad_norm": 2.498046398162842, + "learning_rate": 4.9590549485508165e-05, + "loss": 6.2321, + "step": 9701 + }, + { + "epoch": 0.057700542392235224, + "grad_norm": 1.8793575763702393, + "learning_rate": 4.959046528963428e-05, + "loss": 5.4019, + "step": 9702 + }, + { + "epoch": 0.057706489675516226, + "grad_norm": 2.137622117996216, + "learning_rate": 4.9590381085176115e-05, + "loss": 5.9118, + "step": 9703 + }, + { + "epoch": 0.05771243695879722, + "grad_norm": 1.9514268636703491, + "learning_rate": 4.959029687213371e-05, + "loss": 5.6651, + "step": 9704 + }, + { + "epoch": 0.057718384242078216, + "grad_norm": 2.3678367137908936, + "learning_rate": 4.9590212650507085e-05, + "loss": 5.2054, + "step": 9705 + }, + { + "epoch": 0.05772433152535922, + "grad_norm": 2.8808276653289795, + "learning_rate": 4.9590128420296266e-05, + "loss": 5.3066, + "step": 9706 + }, + { + "epoch": 0.05773027880864021, + "grad_norm": 2.2405474185943604, + "learning_rate": 4.9590044181501297e-05, + "loss": 5.2904, + "step": 9707 + }, + { + "epoch": 0.05773622609192121, + "grad_norm": 2.3762283325195312, + "learning_rate": 4.958995993412219e-05, + "loss": 5.5847, + "step": 9708 + }, + { + "epoch": 0.05774217337520221, + "grad_norm": 2.5258681774139404, + "learning_rate": 4.958987567815898e-05, + "loss": 5.4852, + "step": 9709 + }, + { + "epoch": 0.057748120658483205, + "grad_norm": 2.31478214263916, + "learning_rate": 4.9589791413611704e-05, + "loss": 5.5658, + "step": 9710 + }, + { + "epoch": 0.0577540679417642, + "grad_norm": 1.735771894454956, + "learning_rate": 4.958970714048038e-05, + "loss": 6.0311, + "step": 9711 + }, + { + "epoch": 0.0577600152250452, + "grad_norm": 2.2843849658966064, + "learning_rate": 4.958962285876505e-05, + "loss": 5.9535, + "step": 9712 + }, + { + "epoch": 0.0577659625083262, + "grad_norm": 2.3449392318725586, + "learning_rate": 4.958953856846573e-05, + "loss": 5.9835, + "step": 9713 + }, + { + "epoch": 0.05777190979160719, + "grad_norm": 2.319952964782715, + "learning_rate": 4.9589454269582456e-05, + "loss": 5.5318, + "step": 9714 + }, + { + "epoch": 0.05777785707488819, + "grad_norm": 2.6801493167877197, + "learning_rate": 4.958936996211526e-05, + "loss": 4.8672, + "step": 9715 + }, + { + "epoch": 0.05778380435816919, + "grad_norm": 2.622528553009033, + "learning_rate": 4.958928564606418e-05, + "loss": 6.0755, + "step": 9716 + }, + { + "epoch": 0.057789751641450185, + "grad_norm": 1.973480224609375, + "learning_rate": 4.9589201321429216e-05, + "loss": 5.8197, + "step": 9717 + }, + { + "epoch": 0.05779569892473118, + "grad_norm": 2.060497760772705, + "learning_rate": 4.958911698821043e-05, + "loss": 5.2838, + "step": 9718 + }, + { + "epoch": 0.05780164620801218, + "grad_norm": 2.068103551864624, + "learning_rate": 4.958903264640783e-05, + "loss": 5.4917, + "step": 9719 + }, + { + "epoch": 0.05780759349129318, + "grad_norm": 2.5899293422698975, + "learning_rate": 4.958894829602145e-05, + "loss": 5.1312, + "step": 9720 + }, + { + "epoch": 0.05781354077457417, + "grad_norm": 3.2153897285461426, + "learning_rate": 4.958886393705132e-05, + "loss": 4.7502, + "step": 9721 + }, + { + "epoch": 0.057819488057855174, + "grad_norm": 2.805802345275879, + "learning_rate": 4.9588779569497484e-05, + "loss": 4.6876, + "step": 9722 + }, + { + "epoch": 0.05782543534113617, + "grad_norm": 2.3670101165771484, + "learning_rate": 4.958869519335995e-05, + "loss": 4.6025, + "step": 9723 + }, + { + "epoch": 0.057831382624417164, + "grad_norm": 1.992903709411621, + "learning_rate": 4.9588610808638755e-05, + "loss": 5.3602, + "step": 9724 + }, + { + "epoch": 0.057837329907698166, + "grad_norm": 2.249572277069092, + "learning_rate": 4.958852641533394e-05, + "loss": 4.9574, + "step": 9725 + }, + { + "epoch": 0.05784327719097916, + "grad_norm": 2.500433921813965, + "learning_rate": 4.958844201344552e-05, + "loss": 5.3656, + "step": 9726 + }, + { + "epoch": 0.05784922447426016, + "grad_norm": 2.0277605056762695, + "learning_rate": 4.9588357602973526e-05, + "loss": 5.6467, + "step": 9727 + }, + { + "epoch": 0.05785517175754116, + "grad_norm": 2.1196112632751465, + "learning_rate": 4.958827318391799e-05, + "loss": 5.6257, + "step": 9728 + }, + { + "epoch": 0.057861119040822154, + "grad_norm": 3.160593271255493, + "learning_rate": 4.9588188756278945e-05, + "loss": 4.9618, + "step": 9729 + }, + { + "epoch": 0.05786706632410315, + "grad_norm": 1.90407395362854, + "learning_rate": 4.958810432005642e-05, + "loss": 5.4551, + "step": 9730 + }, + { + "epoch": 0.057873013607384144, + "grad_norm": 2.0096004009246826, + "learning_rate": 4.958801987525043e-05, + "loss": 5.6562, + "step": 9731 + }, + { + "epoch": 0.057878960890665146, + "grad_norm": 2.617847442626953, + "learning_rate": 4.958793542186103e-05, + "loss": 5.747, + "step": 9732 + }, + { + "epoch": 0.05788490817394614, + "grad_norm": 2.3982057571411133, + "learning_rate": 4.9587850959888226e-05, + "loss": 5.6146, + "step": 9733 + }, + { + "epoch": 0.057890855457227136, + "grad_norm": 2.0222113132476807, + "learning_rate": 4.9587766489332065e-05, + "loss": 6.0204, + "step": 9734 + }, + { + "epoch": 0.05789680274050814, + "grad_norm": 2.1110177040100098, + "learning_rate": 4.958768201019257e-05, + "loss": 5.2957, + "step": 9735 + }, + { + "epoch": 0.05790275002378913, + "grad_norm": 1.8278865814208984, + "learning_rate": 4.958759752246977e-05, + "loss": 5.9902, + "step": 9736 + }, + { + "epoch": 0.05790869730707013, + "grad_norm": 2.2461514472961426, + "learning_rate": 4.958751302616368e-05, + "loss": 5.8572, + "step": 9737 + }, + { + "epoch": 0.05791464459035113, + "grad_norm": 1.7453250885009766, + "learning_rate": 4.958742852127435e-05, + "loss": 5.6658, + "step": 9738 + }, + { + "epoch": 0.057920591873632125, + "grad_norm": 2.480726718902588, + "learning_rate": 4.95873440078018e-05, + "loss": 5.4231, + "step": 9739 + }, + { + "epoch": 0.05792653915691312, + "grad_norm": 2.2310776710510254, + "learning_rate": 4.958725948574607e-05, + "loss": 5.4768, + "step": 9740 + }, + { + "epoch": 0.05793248644019412, + "grad_norm": 1.9454891681671143, + "learning_rate": 4.958717495510718e-05, + "loss": 5.4503, + "step": 9741 + }, + { + "epoch": 0.05793843372347512, + "grad_norm": 2.196054458618164, + "learning_rate": 4.958709041588516e-05, + "loss": 5.1987, + "step": 9742 + }, + { + "epoch": 0.05794438100675611, + "grad_norm": 2.385000228881836, + "learning_rate": 4.958700586808004e-05, + "loss": 5.8413, + "step": 9743 + }, + { + "epoch": 0.05795032829003711, + "grad_norm": 2.0967705249786377, + "learning_rate": 4.958692131169185e-05, + "loss": 5.8531, + "step": 9744 + }, + { + "epoch": 0.05795627557331811, + "grad_norm": 2.186253309249878, + "learning_rate": 4.958683674672062e-05, + "loss": 5.8241, + "step": 9745 + }, + { + "epoch": 0.057962222856599105, + "grad_norm": 1.8932995796203613, + "learning_rate": 4.958675217316638e-05, + "loss": 5.8724, + "step": 9746 + }, + { + "epoch": 0.0579681701398801, + "grad_norm": 1.9706943035125732, + "learning_rate": 4.958666759102916e-05, + "loss": 5.6565, + "step": 9747 + }, + { + "epoch": 0.0579741174231611, + "grad_norm": 1.7686703205108643, + "learning_rate": 4.958658300030898e-05, + "loss": 5.6299, + "step": 9748 + }, + { + "epoch": 0.0579800647064421, + "grad_norm": 2.309403419494629, + "learning_rate": 4.958649840100589e-05, + "loss": 4.6907, + "step": 9749 + }, + { + "epoch": 0.05798601198972309, + "grad_norm": 2.139760971069336, + "learning_rate": 4.95864137931199e-05, + "loss": 4.7311, + "step": 9750 + }, + { + "epoch": 0.057991959273004094, + "grad_norm": 1.960402011871338, + "learning_rate": 4.958632917665105e-05, + "loss": 5.598, + "step": 9751 + }, + { + "epoch": 0.05799790655628509, + "grad_norm": 1.721853256225586, + "learning_rate": 4.958624455159936e-05, + "loss": 6.0519, + "step": 9752 + }, + { + "epoch": 0.058003853839566084, + "grad_norm": 1.8527748584747314, + "learning_rate": 4.958615991796487e-05, + "loss": 5.3347, + "step": 9753 + }, + { + "epoch": 0.058009801122847086, + "grad_norm": 2.070084810256958, + "learning_rate": 4.958607527574761e-05, + "loss": 4.6653, + "step": 9754 + }, + { + "epoch": 0.05801574840612808, + "grad_norm": 2.143115997314453, + "learning_rate": 4.9585990624947605e-05, + "loss": 4.6522, + "step": 9755 + }, + { + "epoch": 0.05802169568940908, + "grad_norm": 2.2870991230010986, + "learning_rate": 4.9585905965564884e-05, + "loss": 4.7037, + "step": 9756 + }, + { + "epoch": 0.05802764297269008, + "grad_norm": 2.0633544921875, + "learning_rate": 4.958582129759947e-05, + "loss": 4.689, + "step": 9757 + }, + { + "epoch": 0.058033590255971074, + "grad_norm": 1.8845857381820679, + "learning_rate": 4.95857366210514e-05, + "loss": 4.8077, + "step": 9758 + }, + { + "epoch": 0.05803953753925207, + "grad_norm": 1.7319310903549194, + "learning_rate": 4.9585651935920715e-05, + "loss": 5.3528, + "step": 9759 + }, + { + "epoch": 0.058045484822533064, + "grad_norm": 2.2369909286499023, + "learning_rate": 4.958556724220742e-05, + "loss": 4.6549, + "step": 9760 + }, + { + "epoch": 0.058051432105814066, + "grad_norm": 2.076901912689209, + "learning_rate": 4.9585482539911566e-05, + "loss": 4.4642, + "step": 9761 + }, + { + "epoch": 0.05805737938909506, + "grad_norm": 2.0487091541290283, + "learning_rate": 4.958539782903318e-05, + "loss": 4.6575, + "step": 9762 + }, + { + "epoch": 0.058063326672376056, + "grad_norm": 2.2116169929504395, + "learning_rate": 4.9585313109572274e-05, + "loss": 4.4866, + "step": 9763 + }, + { + "epoch": 0.05806927395565706, + "grad_norm": 1.9818168878555298, + "learning_rate": 4.958522838152889e-05, + "loss": 4.7502, + "step": 9764 + }, + { + "epoch": 0.05807522123893805, + "grad_norm": 2.1484010219573975, + "learning_rate": 4.958514364490306e-05, + "loss": 5.7809, + "step": 9765 + }, + { + "epoch": 0.05808116852221905, + "grad_norm": 2.4087398052215576, + "learning_rate": 4.958505889969481e-05, + "loss": 5.5236, + "step": 9766 + }, + { + "epoch": 0.05808711580550005, + "grad_norm": 2.000459909439087, + "learning_rate": 4.9584974145904165e-05, + "loss": 4.7356, + "step": 9767 + }, + { + "epoch": 0.058093063088781045, + "grad_norm": 2.3958399295806885, + "learning_rate": 4.958488938353116e-05, + "loss": 4.3695, + "step": 9768 + }, + { + "epoch": 0.05809901037206204, + "grad_norm": 2.039053440093994, + "learning_rate": 4.958480461257584e-05, + "loss": 4.6128, + "step": 9769 + }, + { + "epoch": 0.05810495765534304, + "grad_norm": 1.7663822174072266, + "learning_rate": 4.95847198330382e-05, + "loss": 4.8533, + "step": 9770 + }, + { + "epoch": 0.05811090493862404, + "grad_norm": 2.594289779663086, + "learning_rate": 4.9584635044918295e-05, + "loss": 5.3048, + "step": 9771 + }, + { + "epoch": 0.05811685222190503, + "grad_norm": 2.712372303009033, + "learning_rate": 4.958455024821615e-05, + "loss": 5.4435, + "step": 9772 + }, + { + "epoch": 0.05812279950518603, + "grad_norm": 2.4295241832733154, + "learning_rate": 4.9584465442931794e-05, + "loss": 5.2665, + "step": 9773 + }, + { + "epoch": 0.05812874678846703, + "grad_norm": 2.5820906162261963, + "learning_rate": 4.9584380629065245e-05, + "loss": 5.6227, + "step": 9774 + }, + { + "epoch": 0.058134694071748025, + "grad_norm": 2.140291213989258, + "learning_rate": 4.958429580661655e-05, + "loss": 5.1792, + "step": 9775 + }, + { + "epoch": 0.05814064135502902, + "grad_norm": 2.111551523208618, + "learning_rate": 4.9584210975585734e-05, + "loss": 5.7262, + "step": 9776 + }, + { + "epoch": 0.05814658863831002, + "grad_norm": 2.5887086391448975, + "learning_rate": 4.958412613597282e-05, + "loss": 5.1613, + "step": 9777 + }, + { + "epoch": 0.05815253592159102, + "grad_norm": 1.9678863286972046, + "learning_rate": 4.9584041287777835e-05, + "loss": 5.7693, + "step": 9778 + }, + { + "epoch": 0.05815848320487201, + "grad_norm": 2.000265121459961, + "learning_rate": 4.958395643100083e-05, + "loss": 5.654, + "step": 9779 + }, + { + "epoch": 0.058164430488153014, + "grad_norm": 1.8926239013671875, + "learning_rate": 4.958387156564181e-05, + "loss": 5.3004, + "step": 9780 + }, + { + "epoch": 0.05817037777143401, + "grad_norm": 2.3557002544403076, + "learning_rate": 4.958378669170082e-05, + "loss": 5.5437, + "step": 9781 + }, + { + "epoch": 0.058176325054715004, + "grad_norm": 1.9434150457382202, + "learning_rate": 4.958370180917787e-05, + "loss": 5.8442, + "step": 9782 + }, + { + "epoch": 0.058182272337996006, + "grad_norm": 1.875900387763977, + "learning_rate": 4.9583616918073026e-05, + "loss": 5.9312, + "step": 9783 + }, + { + "epoch": 0.058188219621277, + "grad_norm": 1.8945306539535522, + "learning_rate": 4.958353201838628e-05, + "loss": 5.7166, + "step": 9784 + }, + { + "epoch": 0.058194166904557997, + "grad_norm": 1.7081416845321655, + "learning_rate": 4.9583447110117684e-05, + "loss": 6.0803, + "step": 9785 + }, + { + "epoch": 0.058200114187839, + "grad_norm": 1.6520098447799683, + "learning_rate": 4.958336219326725e-05, + "loss": 6.0181, + "step": 9786 + }, + { + "epoch": 0.058206061471119994, + "grad_norm": 1.90665602684021, + "learning_rate": 4.9583277267835024e-05, + "loss": 5.586, + "step": 9787 + }, + { + "epoch": 0.05821200875440099, + "grad_norm": 1.8179740905761719, + "learning_rate": 4.958319233382104e-05, + "loss": 5.8637, + "step": 9788 + }, + { + "epoch": 0.058217956037681984, + "grad_norm": 1.8228380680084229, + "learning_rate": 4.95831073912253e-05, + "loss": 5.7406, + "step": 9789 + }, + { + "epoch": 0.058223903320962986, + "grad_norm": 1.691999912261963, + "learning_rate": 4.958302244004786e-05, + "loss": 5.8021, + "step": 9790 + }, + { + "epoch": 0.05822985060424398, + "grad_norm": 1.8590795993804932, + "learning_rate": 4.958293748028875e-05, + "loss": 5.5897, + "step": 9791 + }, + { + "epoch": 0.058235797887524976, + "grad_norm": 1.5923960208892822, + "learning_rate": 4.958285251194797e-05, + "loss": 5.7424, + "step": 9792 + }, + { + "epoch": 0.05824174517080598, + "grad_norm": 1.6928486824035645, + "learning_rate": 4.958276753502559e-05, + "loss": 5.905, + "step": 9793 + }, + { + "epoch": 0.05824769245408697, + "grad_norm": 2.120725393295288, + "learning_rate": 4.958268254952161e-05, + "loss": 5.9974, + "step": 9794 + }, + { + "epoch": 0.05825363973736797, + "grad_norm": 1.850441813468933, + "learning_rate": 4.9582597555436075e-05, + "loss": 5.7171, + "step": 9795 + }, + { + "epoch": 0.05825958702064897, + "grad_norm": 2.196037530899048, + "learning_rate": 4.9582512552769e-05, + "loss": 6.1243, + "step": 9796 + }, + { + "epoch": 0.058265534303929965, + "grad_norm": 1.9170193672180176, + "learning_rate": 4.9582427541520423e-05, + "loss": 5.8087, + "step": 9797 + }, + { + "epoch": 0.05827148158721096, + "grad_norm": 1.974478006362915, + "learning_rate": 4.958234252169039e-05, + "loss": 5.794, + "step": 9798 + }, + { + "epoch": 0.05827742887049196, + "grad_norm": 1.824965476989746, + "learning_rate": 4.9582257493278904e-05, + "loss": 5.6904, + "step": 9799 + }, + { + "epoch": 0.05828337615377296, + "grad_norm": 1.828037142753601, + "learning_rate": 4.9582172456286e-05, + "loss": 5.6793, + "step": 9800 + }, + { + "epoch": 0.05828932343705395, + "grad_norm": 1.8949617147445679, + "learning_rate": 4.9582087410711726e-05, + "loss": 5.6685, + "step": 9801 + }, + { + "epoch": 0.05829527072033495, + "grad_norm": 1.8183050155639648, + "learning_rate": 4.958200235655609e-05, + "loss": 5.7754, + "step": 9802 + }, + { + "epoch": 0.05830121800361595, + "grad_norm": 1.6816062927246094, + "learning_rate": 4.9581917293819135e-05, + "loss": 5.6931, + "step": 9803 + }, + { + "epoch": 0.058307165286896945, + "grad_norm": 1.875659465789795, + "learning_rate": 4.958183222250089e-05, + "loss": 5.7568, + "step": 9804 + }, + { + "epoch": 0.05831311257017794, + "grad_norm": 2.162404775619507, + "learning_rate": 4.958174714260137e-05, + "loss": 5.7969, + "step": 9805 + }, + { + "epoch": 0.05831905985345894, + "grad_norm": 2.2122790813446045, + "learning_rate": 4.958166205412064e-05, + "loss": 5.7301, + "step": 9806 + }, + { + "epoch": 0.05832500713673994, + "grad_norm": 1.8822424411773682, + "learning_rate": 4.9581576957058686e-05, + "loss": 5.7034, + "step": 9807 + }, + { + "epoch": 0.05833095442002093, + "grad_norm": 1.8780319690704346, + "learning_rate": 4.958149185141556e-05, + "loss": 5.6573, + "step": 9808 + }, + { + "epoch": 0.058336901703301934, + "grad_norm": 1.9177708625793457, + "learning_rate": 4.958140673719129e-05, + "loss": 5.6619, + "step": 9809 + }, + { + "epoch": 0.05834284898658293, + "grad_norm": 1.8662844896316528, + "learning_rate": 4.95813216143859e-05, + "loss": 5.5857, + "step": 9810 + }, + { + "epoch": 0.058348796269863924, + "grad_norm": 2.1798834800720215, + "learning_rate": 4.958123648299944e-05, + "loss": 5.5811, + "step": 9811 + }, + { + "epoch": 0.058354743553144926, + "grad_norm": 2.1575138568878174, + "learning_rate": 4.958115134303191e-05, + "loss": 5.6761, + "step": 9812 + }, + { + "epoch": 0.05836069083642592, + "grad_norm": 2.055314302444458, + "learning_rate": 4.958106619448336e-05, + "loss": 5.721, + "step": 9813 + }, + { + "epoch": 0.058366638119706916, + "grad_norm": 1.8962149620056152, + "learning_rate": 4.958098103735381e-05, + "loss": 5.6132, + "step": 9814 + }, + { + "epoch": 0.05837258540298792, + "grad_norm": 1.7715760469436646, + "learning_rate": 4.95808958716433e-05, + "loss": 5.6461, + "step": 9815 + }, + { + "epoch": 0.058378532686268914, + "grad_norm": 1.9166070222854614, + "learning_rate": 4.958081069735184e-05, + "loss": 5.5628, + "step": 9816 + }, + { + "epoch": 0.05838447996954991, + "grad_norm": 1.8872902393341064, + "learning_rate": 4.9580725514479484e-05, + "loss": 5.6476, + "step": 9817 + }, + { + "epoch": 0.058390427252830904, + "grad_norm": 1.8257521390914917, + "learning_rate": 4.9580640323026254e-05, + "loss": 5.6175, + "step": 9818 + }, + { + "epoch": 0.058396374536111906, + "grad_norm": 1.919291377067566, + "learning_rate": 4.958055512299217e-05, + "loss": 5.5954, + "step": 9819 + }, + { + "epoch": 0.0584023218193929, + "grad_norm": 1.8318076133728027, + "learning_rate": 4.958046991437726e-05, + "loss": 5.6255, + "step": 9820 + }, + { + "epoch": 0.058408269102673896, + "grad_norm": 1.9153858423233032, + "learning_rate": 4.958038469718158e-05, + "loss": 5.6787, + "step": 9821 + }, + { + "epoch": 0.0584142163859549, + "grad_norm": 1.967021107673645, + "learning_rate": 4.958029947140513e-05, + "loss": 5.6714, + "step": 9822 + }, + { + "epoch": 0.05842016366923589, + "grad_norm": 1.654997706413269, + "learning_rate": 4.958021423704795e-05, + "loss": 5.4809, + "step": 9823 + }, + { + "epoch": 0.05842611095251689, + "grad_norm": 1.8183335065841675, + "learning_rate": 4.9580128994110074e-05, + "loss": 5.5223, + "step": 9824 + }, + { + "epoch": 0.05843205823579789, + "grad_norm": 1.7665660381317139, + "learning_rate": 4.958004374259153e-05, + "loss": 5.5639, + "step": 9825 + }, + { + "epoch": 0.058438005519078885, + "grad_norm": 1.8233551979064941, + "learning_rate": 4.957995848249235e-05, + "loss": 5.6358, + "step": 9826 + }, + { + "epoch": 0.05844395280235988, + "grad_norm": 1.721301555633545, + "learning_rate": 4.957987321381256e-05, + "loss": 5.4989, + "step": 9827 + }, + { + "epoch": 0.05844990008564088, + "grad_norm": 1.6921659708023071, + "learning_rate": 4.957978793655218e-05, + "loss": 5.448, + "step": 9828 + }, + { + "epoch": 0.05845584736892188, + "grad_norm": 1.810354232788086, + "learning_rate": 4.957970265071126e-05, + "loss": 5.4501, + "step": 9829 + }, + { + "epoch": 0.05846179465220287, + "grad_norm": 1.7205116748809814, + "learning_rate": 4.957961735628982e-05, + "loss": 5.5222, + "step": 9830 + }, + { + "epoch": 0.05846774193548387, + "grad_norm": 1.9636965990066528, + "learning_rate": 4.957953205328788e-05, + "loss": 5.5894, + "step": 9831 + }, + { + "epoch": 0.05847368921876487, + "grad_norm": 1.9312820434570312, + "learning_rate": 4.9579446741705485e-05, + "loss": 5.6543, + "step": 9832 + }, + { + "epoch": 0.058479636502045865, + "grad_norm": 1.870448112487793, + "learning_rate": 4.9579361421542665e-05, + "loss": 5.6707, + "step": 9833 + }, + { + "epoch": 0.05848558378532686, + "grad_norm": 1.5943735837936401, + "learning_rate": 4.9579276092799435e-05, + "loss": 5.5184, + "step": 9834 + }, + { + "epoch": 0.05849153106860786, + "grad_norm": 1.6929852962493896, + "learning_rate": 4.957919075547584e-05, + "loss": 5.5188, + "step": 9835 + }, + { + "epoch": 0.05849747835188886, + "grad_norm": 2.0268075466156006, + "learning_rate": 4.95791054095719e-05, + "loss": 5.4909, + "step": 9836 + }, + { + "epoch": 0.05850342563516985, + "grad_norm": 2.047982931137085, + "learning_rate": 4.957902005508765e-05, + "loss": 5.6459, + "step": 9837 + }, + { + "epoch": 0.058509372918450854, + "grad_norm": 1.7938467264175415, + "learning_rate": 4.957893469202311e-05, + "loss": 5.4805, + "step": 9838 + }, + { + "epoch": 0.05851532020173185, + "grad_norm": 1.803093433380127, + "learning_rate": 4.957884932037833e-05, + "loss": 5.4092, + "step": 9839 + }, + { + "epoch": 0.058521267485012844, + "grad_norm": 1.8001232147216797, + "learning_rate": 4.957876394015333e-05, + "loss": 5.9168, + "step": 9840 + }, + { + "epoch": 0.058527214768293846, + "grad_norm": 1.9442622661590576, + "learning_rate": 4.9578678551348125e-05, + "loss": 6.0317, + "step": 9841 + }, + { + "epoch": 0.05853316205157484, + "grad_norm": 2.013845205307007, + "learning_rate": 4.957859315396276e-05, + "loss": 5.6855, + "step": 9842 + }, + { + "epoch": 0.058539109334855836, + "grad_norm": 2.7557523250579834, + "learning_rate": 4.9578507747997264e-05, + "loss": 5.3782, + "step": 9843 + }, + { + "epoch": 0.05854505661813684, + "grad_norm": 1.9822032451629639, + "learning_rate": 4.957842233345167e-05, + "loss": 6.22, + "step": 9844 + }, + { + "epoch": 0.058551003901417834, + "grad_norm": 1.7408699989318848, + "learning_rate": 4.9578336910326e-05, + "loss": 5.2347, + "step": 9845 + }, + { + "epoch": 0.05855695118469883, + "grad_norm": 3.2186660766601562, + "learning_rate": 4.957825147862028e-05, + "loss": 5.3282, + "step": 9846 + }, + { + "epoch": 0.058562898467979824, + "grad_norm": 3.3589892387390137, + "learning_rate": 4.957816603833455e-05, + "loss": 5.5689, + "step": 9847 + }, + { + "epoch": 0.058568845751260826, + "grad_norm": 3.4228861331939697, + "learning_rate": 4.957808058946883e-05, + "loss": 5.5797, + "step": 9848 + }, + { + "epoch": 0.05857479303454182, + "grad_norm": 2.420506238937378, + "learning_rate": 4.957799513202317e-05, + "loss": 5.735, + "step": 9849 + }, + { + "epoch": 0.058580740317822816, + "grad_norm": 1.8269212245941162, + "learning_rate": 4.957790966599758e-05, + "loss": 5.7571, + "step": 9850 + }, + { + "epoch": 0.05858668760110382, + "grad_norm": 2.011110305786133, + "learning_rate": 4.957782419139209e-05, + "loss": 5.9786, + "step": 9851 + }, + { + "epoch": 0.05859263488438481, + "grad_norm": 2.3139355182647705, + "learning_rate": 4.957773870820674e-05, + "loss": 5.8356, + "step": 9852 + }, + { + "epoch": 0.05859858216766581, + "grad_norm": 2.3406572341918945, + "learning_rate": 4.957765321644155e-05, + "loss": 5.8426, + "step": 9853 + }, + { + "epoch": 0.05860452945094681, + "grad_norm": 2.1194591522216797, + "learning_rate": 4.957756771609657e-05, + "loss": 5.6152, + "step": 9854 + }, + { + "epoch": 0.058610476734227805, + "grad_norm": 1.9966599941253662, + "learning_rate": 4.95774822071718e-05, + "loss": 5.8189, + "step": 9855 + }, + { + "epoch": 0.0586164240175088, + "grad_norm": 1.8953092098236084, + "learning_rate": 4.95773966896673e-05, + "loss": 5.8185, + "step": 9856 + }, + { + "epoch": 0.0586223713007898, + "grad_norm": 1.9035093784332275, + "learning_rate": 4.957731116358307e-05, + "loss": 5.6554, + "step": 9857 + }, + { + "epoch": 0.0586283185840708, + "grad_norm": 3.507546901702881, + "learning_rate": 4.9577225628919157e-05, + "loss": 5.8906, + "step": 9858 + }, + { + "epoch": 0.05863426586735179, + "grad_norm": 2.1840403079986572, + "learning_rate": 4.9577140085675586e-05, + "loss": 5.6084, + "step": 9859 + }, + { + "epoch": 0.05864021315063279, + "grad_norm": 2.008424758911133, + "learning_rate": 4.95770545338524e-05, + "loss": 5.8435, + "step": 9860 + }, + { + "epoch": 0.05864616043391379, + "grad_norm": 1.9004656076431274, + "learning_rate": 4.957696897344961e-05, + "loss": 5.5906, + "step": 9861 + }, + { + "epoch": 0.058652107717194785, + "grad_norm": 1.8043147325515747, + "learning_rate": 4.9576883404467255e-05, + "loss": 5.6057, + "step": 9862 + }, + { + "epoch": 0.05865805500047578, + "grad_norm": 1.6765285730361938, + "learning_rate": 4.957679782690537e-05, + "loss": 5.7246, + "step": 9863 + }, + { + "epoch": 0.05866400228375678, + "grad_norm": 2.0207018852233887, + "learning_rate": 4.9576712240763974e-05, + "loss": 5.8459, + "step": 9864 + }, + { + "epoch": 0.05866994956703778, + "grad_norm": 1.975874423980713, + "learning_rate": 4.95766266460431e-05, + "loss": 5.7313, + "step": 9865 + }, + { + "epoch": 0.05867589685031877, + "grad_norm": 2.085277557373047, + "learning_rate": 4.957654104274279e-05, + "loss": 5.1359, + "step": 9866 + }, + { + "epoch": 0.058681844133599774, + "grad_norm": 2.039437770843506, + "learning_rate": 4.957645543086305e-05, + "loss": 5.5673, + "step": 9867 + }, + { + "epoch": 0.05868779141688077, + "grad_norm": 2.0692098140716553, + "learning_rate": 4.9576369810403926e-05, + "loss": 5.6326, + "step": 9868 + }, + { + "epoch": 0.058693738700161764, + "grad_norm": 2.3873767852783203, + "learning_rate": 4.957628418136545e-05, + "loss": 5.5133, + "step": 9869 + }, + { + "epoch": 0.058699685983442766, + "grad_norm": 2.9347658157348633, + "learning_rate": 4.957619854374764e-05, + "loss": 5.5444, + "step": 9870 + }, + { + "epoch": 0.05870563326672376, + "grad_norm": 2.955348014831543, + "learning_rate": 4.957611289755054e-05, + "loss": 5.4883, + "step": 9871 + }, + { + "epoch": 0.058711580550004756, + "grad_norm": 2.147033214569092, + "learning_rate": 4.957602724277417e-05, + "loss": 5.4554, + "step": 9872 + }, + { + "epoch": 0.05871752783328576, + "grad_norm": 2.1422510147094727, + "learning_rate": 4.957594157941856e-05, + "loss": 5.56, + "step": 9873 + }, + { + "epoch": 0.05872347511656675, + "grad_norm": 2.018935203552246, + "learning_rate": 4.957585590748375e-05, + "loss": 5.5176, + "step": 9874 + }, + { + "epoch": 0.05872942239984775, + "grad_norm": 3.0146446228027344, + "learning_rate": 4.957577022696976e-05, + "loss": 5.2623, + "step": 9875 + }, + { + "epoch": 0.058735369683128744, + "grad_norm": 2.923011064529419, + "learning_rate": 4.957568453787662e-05, + "loss": 5.1828, + "step": 9876 + }, + { + "epoch": 0.058741316966409746, + "grad_norm": 2.7203526496887207, + "learning_rate": 4.9575598840204366e-05, + "loss": 5.1565, + "step": 9877 + }, + { + "epoch": 0.05874726424969074, + "grad_norm": 2.056260108947754, + "learning_rate": 4.9575513133953025e-05, + "loss": 5.1345, + "step": 9878 + }, + { + "epoch": 0.058753211532971736, + "grad_norm": 2.3120932579040527, + "learning_rate": 4.9575427419122616e-05, + "loss": 5.1792, + "step": 9879 + }, + { + "epoch": 0.05875915881625274, + "grad_norm": 2.1298701763153076, + "learning_rate": 4.9575341695713186e-05, + "loss": 5.1447, + "step": 9880 + }, + { + "epoch": 0.05876510609953373, + "grad_norm": 2.393869638442993, + "learning_rate": 4.9575255963724756e-05, + "loss": 5.2938, + "step": 9881 + }, + { + "epoch": 0.05877105338281473, + "grad_norm": 2.324061155319214, + "learning_rate": 4.9575170223157366e-05, + "loss": 5.1488, + "step": 9882 + }, + { + "epoch": 0.05877700066609573, + "grad_norm": 2.1416141986846924, + "learning_rate": 4.957508447401103e-05, + "loss": 5.0551, + "step": 9883 + }, + { + "epoch": 0.058782947949376725, + "grad_norm": 2.127350091934204, + "learning_rate": 4.9574998716285795e-05, + "loss": 5.03, + "step": 9884 + }, + { + "epoch": 0.05878889523265772, + "grad_norm": 2.317267417907715, + "learning_rate": 4.957491294998167e-05, + "loss": 5.049, + "step": 9885 + }, + { + "epoch": 0.05879484251593872, + "grad_norm": 2.3667004108428955, + "learning_rate": 4.9574827175098704e-05, + "loss": 5.009, + "step": 9886 + }, + { + "epoch": 0.05880078979921972, + "grad_norm": 2.4034934043884277, + "learning_rate": 4.9574741391636915e-05, + "loss": 4.9419, + "step": 9887 + }, + { + "epoch": 0.05880673708250071, + "grad_norm": 2.3792901039123535, + "learning_rate": 4.957465559959634e-05, + "loss": 4.8517, + "step": 9888 + }, + { + "epoch": 0.05881268436578171, + "grad_norm": 2.139249086380005, + "learning_rate": 4.957456979897701e-05, + "loss": 5.0767, + "step": 9889 + }, + { + "epoch": 0.05881863164906271, + "grad_norm": 2.5370614528656006, + "learning_rate": 4.957448398977894e-05, + "loss": 5.0243, + "step": 9890 + }, + { + "epoch": 0.058824578932343705, + "grad_norm": 2.0474746227264404, + "learning_rate": 4.957439817200218e-05, + "loss": 4.988, + "step": 9891 + }, + { + "epoch": 0.0588305262156247, + "grad_norm": 2.1323394775390625, + "learning_rate": 4.957431234564675e-05, + "loss": 5.7499, + "step": 9892 + }, + { + "epoch": 0.0588364734989057, + "grad_norm": 2.135988473892212, + "learning_rate": 4.957422651071269e-05, + "loss": 6.0197, + "step": 9893 + }, + { + "epoch": 0.0588424207821867, + "grad_norm": 2.4457356929779053, + "learning_rate": 4.957414066720001e-05, + "loss": 5.4461, + "step": 9894 + }, + { + "epoch": 0.05884836806546769, + "grad_norm": 2.3973019123077393, + "learning_rate": 4.957405481510876e-05, + "loss": 5.0372, + "step": 9895 + }, + { + "epoch": 0.058854315348748694, + "grad_norm": 2.5532052516937256, + "learning_rate": 4.957396895443896e-05, + "loss": 5.1462, + "step": 9896 + }, + { + "epoch": 0.05886026263202969, + "grad_norm": 2.3662166595458984, + "learning_rate": 4.9573883085190633e-05, + "loss": 5.1894, + "step": 9897 + }, + { + "epoch": 0.058866209915310684, + "grad_norm": 2.153883695602417, + "learning_rate": 4.9573797207363825e-05, + "loss": 5.6859, + "step": 9898 + }, + { + "epoch": 0.058872157198591686, + "grad_norm": 1.9541380405426025, + "learning_rate": 4.957371132095856e-05, + "loss": 5.5487, + "step": 9899 + }, + { + "epoch": 0.05887810448187268, + "grad_norm": 1.7920335531234741, + "learning_rate": 4.957362542597486e-05, + "loss": 5.4021, + "step": 9900 + }, + { + "epoch": 0.058884051765153676, + "grad_norm": 2.351090431213379, + "learning_rate": 4.9573539522412756e-05, + "loss": 4.9377, + "step": 9901 + }, + { + "epoch": 0.05888999904843468, + "grad_norm": 2.4780900478363037, + "learning_rate": 4.95734536102723e-05, + "loss": 5.04, + "step": 9902 + }, + { + "epoch": 0.05889594633171567, + "grad_norm": 1.7211192846298218, + "learning_rate": 4.957336768955349e-05, + "loss": 5.2959, + "step": 9903 + }, + { + "epoch": 0.05890189361499667, + "grad_norm": 1.9051212072372437, + "learning_rate": 4.957328176025638e-05, + "loss": 5.5587, + "step": 9904 + }, + { + "epoch": 0.058907840898277664, + "grad_norm": 2.009725332260132, + "learning_rate": 4.957319582238099e-05, + "loss": 5.5366, + "step": 9905 + }, + { + "epoch": 0.058913788181558666, + "grad_norm": 1.835423231124878, + "learning_rate": 4.957310987592735e-05, + "loss": 5.2522, + "step": 9906 + }, + { + "epoch": 0.05891973546483966, + "grad_norm": 1.6150819063186646, + "learning_rate": 4.957302392089549e-05, + "loss": 5.3935, + "step": 9907 + }, + { + "epoch": 0.058925682748120656, + "grad_norm": 1.825942873954773, + "learning_rate": 4.9572937957285435e-05, + "loss": 5.5435, + "step": 9908 + }, + { + "epoch": 0.05893163003140166, + "grad_norm": 1.5434985160827637, + "learning_rate": 4.957285198509724e-05, + "loss": 5.2508, + "step": 9909 + }, + { + "epoch": 0.05893757731468265, + "grad_norm": 1.7675530910491943, + "learning_rate": 4.9572766004330894e-05, + "loss": 5.2811, + "step": 9910 + }, + { + "epoch": 0.05894352459796365, + "grad_norm": 1.5196996927261353, + "learning_rate": 4.957268001498646e-05, + "loss": 5.1829, + "step": 9911 + }, + { + "epoch": 0.05894947188124465, + "grad_norm": 1.5598126649856567, + "learning_rate": 4.9572594017063964e-05, + "loss": 5.2067, + "step": 9912 + }, + { + "epoch": 0.058955419164525645, + "grad_norm": 1.6600217819213867, + "learning_rate": 4.957250801056342e-05, + "loss": 5.1591, + "step": 9913 + }, + { + "epoch": 0.05896136644780664, + "grad_norm": 2.040682315826416, + "learning_rate": 4.957242199548487e-05, + "loss": 4.8792, + "step": 9914 + }, + { + "epoch": 0.05896731373108764, + "grad_norm": 2.0122241973876953, + "learning_rate": 4.9572335971828346e-05, + "loss": 5.9489, + "step": 9915 + }, + { + "epoch": 0.05897326101436864, + "grad_norm": 2.4522452354431152, + "learning_rate": 4.957224993959386e-05, + "loss": 5.943, + "step": 9916 + }, + { + "epoch": 0.05897920829764963, + "grad_norm": 1.9101065397262573, + "learning_rate": 4.957216389878147e-05, + "loss": 5.858, + "step": 9917 + }, + { + "epoch": 0.05898515558093063, + "grad_norm": 1.6488839387893677, + "learning_rate": 4.957207784939118e-05, + "loss": 5.4935, + "step": 9918 + }, + { + "epoch": 0.05899110286421163, + "grad_norm": 1.7620775699615479, + "learning_rate": 4.957199179142303e-05, + "loss": 5.6067, + "step": 9919 + }, + { + "epoch": 0.058997050147492625, + "grad_norm": 2.6018314361572266, + "learning_rate": 4.957190572487707e-05, + "loss": 5.5249, + "step": 9920 + }, + { + "epoch": 0.05900299743077362, + "grad_norm": 1.810274600982666, + "learning_rate": 4.957181964975329e-05, + "loss": 5.4063, + "step": 9921 + }, + { + "epoch": 0.05900894471405462, + "grad_norm": 1.7467454671859741, + "learning_rate": 4.957173356605176e-05, + "loss": 5.4476, + "step": 9922 + }, + { + "epoch": 0.05901489199733562, + "grad_norm": 1.9074509143829346, + "learning_rate": 4.9571647473772483e-05, + "loss": 5.8014, + "step": 9923 + }, + { + "epoch": 0.05902083928061661, + "grad_norm": 1.6376137733459473, + "learning_rate": 4.9571561372915496e-05, + "loss": 5.6813, + "step": 9924 + }, + { + "epoch": 0.059026786563897614, + "grad_norm": 1.9984129667282104, + "learning_rate": 4.957147526348083e-05, + "loss": 5.9534, + "step": 9925 + }, + { + "epoch": 0.05903273384717861, + "grad_norm": 2.38493013381958, + "learning_rate": 4.957138914546852e-05, + "loss": 5.6903, + "step": 9926 + }, + { + "epoch": 0.059038681130459604, + "grad_norm": 1.86250901222229, + "learning_rate": 4.957130301887859e-05, + "loss": 5.1777, + "step": 9927 + }, + { + "epoch": 0.059044628413740606, + "grad_norm": 1.6241644620895386, + "learning_rate": 4.957121688371107e-05, + "loss": 5.1693, + "step": 9928 + }, + { + "epoch": 0.0590505756970216, + "grad_norm": 1.5627753734588623, + "learning_rate": 4.9571130739965996e-05, + "loss": 5.0313, + "step": 9929 + }, + { + "epoch": 0.059056522980302596, + "grad_norm": 1.6763062477111816, + "learning_rate": 4.957104458764339e-05, + "loss": 4.9973, + "step": 9930 + }, + { + "epoch": 0.0590624702635836, + "grad_norm": 1.6215085983276367, + "learning_rate": 4.957095842674329e-05, + "loss": 5.2216, + "step": 9931 + }, + { + "epoch": 0.05906841754686459, + "grad_norm": 1.5599844455718994, + "learning_rate": 4.957087225726572e-05, + "loss": 5.4525, + "step": 9932 + }, + { + "epoch": 0.05907436483014559, + "grad_norm": 1.3916441202163696, + "learning_rate": 4.957078607921072e-05, + "loss": 5.4434, + "step": 9933 + }, + { + "epoch": 0.059080312113426584, + "grad_norm": 1.524478554725647, + "learning_rate": 4.9570699892578295e-05, + "loss": 5.3979, + "step": 9934 + }, + { + "epoch": 0.059086259396707586, + "grad_norm": 1.264108657836914, + "learning_rate": 4.9570613697368505e-05, + "loss": 5.2892, + "step": 9935 + }, + { + "epoch": 0.05909220667998858, + "grad_norm": 1.7481588125228882, + "learning_rate": 4.957052749358137e-05, + "loss": 4.8539, + "step": 9936 + }, + { + "epoch": 0.059098153963269576, + "grad_norm": 1.675515055656433, + "learning_rate": 4.957044128121692e-05, + "loss": 5.4645, + "step": 9937 + }, + { + "epoch": 0.05910410124655058, + "grad_norm": 1.6560577154159546, + "learning_rate": 4.957035506027517e-05, + "loss": 4.9354, + "step": 9938 + }, + { + "epoch": 0.05911004852983157, + "grad_norm": 1.5030722618103027, + "learning_rate": 4.9570268830756174e-05, + "loss": 5.206, + "step": 9939 + }, + { + "epoch": 0.05911599581311257, + "grad_norm": 1.65435791015625, + "learning_rate": 4.957018259265994e-05, + "loss": 5.2132, + "step": 9940 + }, + { + "epoch": 0.05912194309639357, + "grad_norm": 1.6701000928878784, + "learning_rate": 4.9570096345986515e-05, + "loss": 5.2313, + "step": 9941 + }, + { + "epoch": 0.059127890379674565, + "grad_norm": 1.412954330444336, + "learning_rate": 4.957001009073593e-05, + "loss": 5.2511, + "step": 9942 + }, + { + "epoch": 0.05913383766295556, + "grad_norm": 1.4719784259796143, + "learning_rate": 4.95699238269082e-05, + "loss": 5.3646, + "step": 9943 + }, + { + "epoch": 0.05913978494623656, + "grad_norm": 1.6969150304794312, + "learning_rate": 4.9569837554503365e-05, + "loss": 5.3001, + "step": 9944 + }, + { + "epoch": 0.05914573222951756, + "grad_norm": 1.8579715490341187, + "learning_rate": 4.9569751273521454e-05, + "loss": 5.0944, + "step": 9945 + }, + { + "epoch": 0.05915167951279855, + "grad_norm": 1.6907633543014526, + "learning_rate": 4.956966498396249e-05, + "loss": 5.1447, + "step": 9946 + }, + { + "epoch": 0.059157626796079554, + "grad_norm": 1.7581912279129028, + "learning_rate": 4.9569578685826525e-05, + "loss": 5.2065, + "step": 9947 + }, + { + "epoch": 0.05916357407936055, + "grad_norm": 1.4447051286697388, + "learning_rate": 4.9569492379113555e-05, + "loss": 5.081, + "step": 9948 + }, + { + "epoch": 0.059169521362641544, + "grad_norm": 1.731697916984558, + "learning_rate": 4.9569406063823644e-05, + "loss": 5.241, + "step": 9949 + }, + { + "epoch": 0.05917546864592254, + "grad_norm": 1.6483672857284546, + "learning_rate": 4.956931973995681e-05, + "loss": 5.306, + "step": 9950 + }, + { + "epoch": 0.05918141592920354, + "grad_norm": 2.2123141288757324, + "learning_rate": 4.956923340751306e-05, + "loss": 5.6134, + "step": 9951 + }, + { + "epoch": 0.05918736321248454, + "grad_norm": 1.8569937944412231, + "learning_rate": 4.956914706649246e-05, + "loss": 5.4819, + "step": 9952 + }, + { + "epoch": 0.05919331049576553, + "grad_norm": 1.8417435884475708, + "learning_rate": 4.956906071689502e-05, + "loss": 5.4116, + "step": 9953 + }, + { + "epoch": 0.059199257779046534, + "grad_norm": 1.7050427198410034, + "learning_rate": 4.956897435872078e-05, + "loss": 5.238, + "step": 9954 + }, + { + "epoch": 0.05920520506232753, + "grad_norm": 1.6636401414871216, + "learning_rate": 4.956888799196976e-05, + "loss": 5.0962, + "step": 9955 + }, + { + "epoch": 0.059211152345608524, + "grad_norm": 1.9194599390029907, + "learning_rate": 4.9568801616642e-05, + "loss": 5.2078, + "step": 9956 + }, + { + "epoch": 0.059217099628889526, + "grad_norm": 1.6154237985610962, + "learning_rate": 4.956871523273752e-05, + "loss": 5.3562, + "step": 9957 + }, + { + "epoch": 0.05922304691217052, + "grad_norm": 1.4500404596328735, + "learning_rate": 4.956862884025636e-05, + "loss": 5.2061, + "step": 9958 + }, + { + "epoch": 0.059228994195451516, + "grad_norm": 1.6681636571884155, + "learning_rate": 4.956854243919854e-05, + "loss": 5.3455, + "step": 9959 + }, + { + "epoch": 0.05923494147873252, + "grad_norm": 1.7175511121749878, + "learning_rate": 4.9568456029564104e-05, + "loss": 5.2967, + "step": 9960 + }, + { + "epoch": 0.05924088876201351, + "grad_norm": 1.5013905763626099, + "learning_rate": 4.956836961135306e-05, + "loss": 4.9836, + "step": 9961 + }, + { + "epoch": 0.05924683604529451, + "grad_norm": 1.6521363258361816, + "learning_rate": 4.956828318456546e-05, + "loss": 5.0295, + "step": 9962 + }, + { + "epoch": 0.0592527833285755, + "grad_norm": 1.5945814847946167, + "learning_rate": 4.9568196749201326e-05, + "loss": 4.9511, + "step": 9963 + }, + { + "epoch": 0.059258730611856505, + "grad_norm": 1.508301854133606, + "learning_rate": 4.95681103052607e-05, + "loss": 4.9469, + "step": 9964 + }, + { + "epoch": 0.0592646778951375, + "grad_norm": 1.5902310609817505, + "learning_rate": 4.956802385274358e-05, + "loss": 4.9761, + "step": 9965 + }, + { + "epoch": 0.059270625178418496, + "grad_norm": 1.739424467086792, + "learning_rate": 4.956793739165003e-05, + "loss": 5.2443, + "step": 9966 + }, + { + "epoch": 0.0592765724616995, + "grad_norm": 1.8317997455596924, + "learning_rate": 4.9567850921980056e-05, + "loss": 5.0046, + "step": 9967 + }, + { + "epoch": 0.05928251974498049, + "grad_norm": 1.8073506355285645, + "learning_rate": 4.956776444373371e-05, + "loss": 5.1779, + "step": 9968 + }, + { + "epoch": 0.05928846702826149, + "grad_norm": 1.8806017637252808, + "learning_rate": 4.956767795691101e-05, + "loss": 5.2956, + "step": 9969 + }, + { + "epoch": 0.05929441431154249, + "grad_norm": 1.8397493362426758, + "learning_rate": 4.956759146151198e-05, + "loss": 5.1775, + "step": 9970 + }, + { + "epoch": 0.059300361594823485, + "grad_norm": 2.001387119293213, + "learning_rate": 4.9567504957536656e-05, + "loss": 5.2149, + "step": 9971 + }, + { + "epoch": 0.05930630887810448, + "grad_norm": 2.011504650115967, + "learning_rate": 4.956741844498508e-05, + "loss": 5.2384, + "step": 9972 + }, + { + "epoch": 0.05931225616138548, + "grad_norm": 1.7936465740203857, + "learning_rate": 4.956733192385727e-05, + "loss": 5.2297, + "step": 9973 + }, + { + "epoch": 0.05931820344466648, + "grad_norm": 1.7336666584014893, + "learning_rate": 4.9567245394153255e-05, + "loss": 5.1637, + "step": 9974 + }, + { + "epoch": 0.05932415072794747, + "grad_norm": 1.7429137229919434, + "learning_rate": 4.956715885587307e-05, + "loss": 5.1315, + "step": 9975 + }, + { + "epoch": 0.059330098011228474, + "grad_norm": 1.6609208583831787, + "learning_rate": 4.956707230901674e-05, + "loss": 5.1554, + "step": 9976 + }, + { + "epoch": 0.05933604529450947, + "grad_norm": 1.630026936531067, + "learning_rate": 4.95669857535843e-05, + "loss": 5.1569, + "step": 9977 + }, + { + "epoch": 0.059341992577790464, + "grad_norm": 1.6968966722488403, + "learning_rate": 4.956689918957579e-05, + "loss": 5.06, + "step": 9978 + }, + { + "epoch": 0.05934793986107146, + "grad_norm": 1.6973050832748413, + "learning_rate": 4.9566812616991214e-05, + "loss": 5.2044, + "step": 9979 + }, + { + "epoch": 0.05935388714435246, + "grad_norm": 1.436073899269104, + "learning_rate": 4.9566726035830624e-05, + "loss": 5.2638, + "step": 9980 + }, + { + "epoch": 0.05935983442763346, + "grad_norm": 1.7667059898376465, + "learning_rate": 4.956663944609404e-05, + "loss": 5.0912, + "step": 9981 + }, + { + "epoch": 0.05936578171091445, + "grad_norm": 2.277327060699463, + "learning_rate": 4.9566552847781504e-05, + "loss": 5.6089, + "step": 9982 + }, + { + "epoch": 0.059371728994195454, + "grad_norm": 1.521134376525879, + "learning_rate": 4.956646624089304e-05, + "loss": 5.0213, + "step": 9983 + }, + { + "epoch": 0.05937767627747645, + "grad_norm": 1.556511402130127, + "learning_rate": 4.956637962542867e-05, + "loss": 5.1126, + "step": 9984 + }, + { + "epoch": 0.059383623560757444, + "grad_norm": 1.6691070795059204, + "learning_rate": 4.9566293001388423e-05, + "loss": 5.1351, + "step": 9985 + }, + { + "epoch": 0.059389570844038446, + "grad_norm": 1.5213310718536377, + "learning_rate": 4.956620636877235e-05, + "loss": 5.2402, + "step": 9986 + }, + { + "epoch": 0.05939551812731944, + "grad_norm": 1.5169057846069336, + "learning_rate": 4.956611972758046e-05, + "loss": 5.214, + "step": 9987 + }, + { + "epoch": 0.059401465410600436, + "grad_norm": 1.6076115369796753, + "learning_rate": 4.956603307781279e-05, + "loss": 5.1081, + "step": 9988 + }, + { + "epoch": 0.05940741269388144, + "grad_norm": 1.7340706586837769, + "learning_rate": 4.9565946419469376e-05, + "loss": 5.1582, + "step": 9989 + }, + { + "epoch": 0.05941335997716243, + "grad_norm": 1.5118008852005005, + "learning_rate": 4.956585975255025e-05, + "loss": 5.0515, + "step": 9990 + }, + { + "epoch": 0.05941930726044343, + "grad_norm": 1.8852020502090454, + "learning_rate": 4.956577307705543e-05, + "loss": 5.3811, + "step": 9991 + }, + { + "epoch": 0.05942525454372442, + "grad_norm": 1.7066764831542969, + "learning_rate": 4.9565686392984955e-05, + "loss": 5.4599, + "step": 9992 + }, + { + "epoch": 0.059431201827005425, + "grad_norm": 1.5517010688781738, + "learning_rate": 4.956559970033885e-05, + "loss": 5.0728, + "step": 9993 + }, + { + "epoch": 0.05943714911028642, + "grad_norm": 1.508901596069336, + "learning_rate": 4.956551299911715e-05, + "loss": 5.1857, + "step": 9994 + }, + { + "epoch": 0.059443096393567416, + "grad_norm": 1.8867852687835693, + "learning_rate": 4.9565426289319874e-05, + "loss": 5.2223, + "step": 9995 + }, + { + "epoch": 0.05944904367684842, + "grad_norm": 1.4767159223556519, + "learning_rate": 4.9565339570947076e-05, + "loss": 5.1404, + "step": 9996 + }, + { + "epoch": 0.05945499096012941, + "grad_norm": 1.6351869106292725, + "learning_rate": 4.956525284399876e-05, + "loss": 5.3235, + "step": 9997 + }, + { + "epoch": 0.05946093824341041, + "grad_norm": 1.543565273284912, + "learning_rate": 4.956516610847497e-05, + "loss": 5.3365, + "step": 9998 + }, + { + "epoch": 0.05946688552669141, + "grad_norm": 1.4907768964767456, + "learning_rate": 4.9565079364375746e-05, + "loss": 5.4215, + "step": 9999 + }, + { + "epoch": 0.059472832809972405, + "grad_norm": 1.5810034275054932, + "learning_rate": 4.956499261170109e-05, + "loss": 5.3899, + "step": 10000 + }, + { + "epoch": 0.0594787800932534, + "grad_norm": 1.6342787742614746, + "learning_rate": 4.956490585045106e-05, + "loss": 5.4278, + "step": 10001 + }, + { + "epoch": 0.0594847273765344, + "grad_norm": 1.5474039316177368, + "learning_rate": 4.956481908062567e-05, + "loss": 5.1232, + "step": 10002 + }, + { + "epoch": 0.0594906746598154, + "grad_norm": 1.5679951906204224, + "learning_rate": 4.956473230222496e-05, + "loss": 5.3245, + "step": 10003 + }, + { + "epoch": 0.05949662194309639, + "grad_norm": 1.4851021766662598, + "learning_rate": 4.9564645515248955e-05, + "loss": 5.1806, + "step": 10004 + }, + { + "epoch": 0.059502569226377394, + "grad_norm": 1.8518844842910767, + "learning_rate": 4.956455871969768e-05, + "loss": 5.2543, + "step": 10005 + }, + { + "epoch": 0.05950851650965839, + "grad_norm": 1.7865514755249023, + "learning_rate": 4.956447191557118e-05, + "loss": 5.405, + "step": 10006 + }, + { + "epoch": 0.059514463792939384, + "grad_norm": 1.9051682949066162, + "learning_rate": 4.956438510286946e-05, + "loss": 5.0509, + "step": 10007 + }, + { + "epoch": 0.05952041107622038, + "grad_norm": 1.5150926113128662, + "learning_rate": 4.956429828159258e-05, + "loss": 5.0065, + "step": 10008 + }, + { + "epoch": 0.05952635835950138, + "grad_norm": 1.6085938215255737, + "learning_rate": 4.956421145174056e-05, + "loss": 5.2295, + "step": 10009 + }, + { + "epoch": 0.05953230564278238, + "grad_norm": 1.6337605714797974, + "learning_rate": 4.9564124613313424e-05, + "loss": 5.1666, + "step": 10010 + }, + { + "epoch": 0.05953825292606337, + "grad_norm": 1.5093178749084473, + "learning_rate": 4.9564037766311205e-05, + "loss": 5.2268, + "step": 10011 + }, + { + "epoch": 0.059544200209344374, + "grad_norm": 1.5047305822372437, + "learning_rate": 4.9563950910733936e-05, + "loss": 5.1065, + "step": 10012 + }, + { + "epoch": 0.05955014749262537, + "grad_norm": 1.6275629997253418, + "learning_rate": 4.9563864046581645e-05, + "loss": 5.2366, + "step": 10013 + }, + { + "epoch": 0.059556094775906364, + "grad_norm": 1.535582184791565, + "learning_rate": 4.956377717385436e-05, + "loss": 5.1799, + "step": 10014 + }, + { + "epoch": 0.059562042059187366, + "grad_norm": 1.448477864265442, + "learning_rate": 4.956369029255211e-05, + "loss": 5.2207, + "step": 10015 + }, + { + "epoch": 0.05956798934246836, + "grad_norm": 1.5288492441177368, + "learning_rate": 4.956360340267494e-05, + "loss": 5.3646, + "step": 10016 + }, + { + "epoch": 0.059573936625749356, + "grad_norm": 1.5746785402297974, + "learning_rate": 4.956351650422287e-05, + "loss": 5.1941, + "step": 10017 + }, + { + "epoch": 0.05957988390903036, + "grad_norm": 1.7088212966918945, + "learning_rate": 4.956342959719592e-05, + "loss": 5.1667, + "step": 10018 + }, + { + "epoch": 0.05958583119231135, + "grad_norm": 1.7666717767715454, + "learning_rate": 4.956334268159414e-05, + "loss": 5.1808, + "step": 10019 + }, + { + "epoch": 0.05959177847559235, + "grad_norm": 1.6472598314285278, + "learning_rate": 4.956325575741755e-05, + "loss": 5.3369, + "step": 10020 + }, + { + "epoch": 0.05959772575887334, + "grad_norm": 1.7340562343597412, + "learning_rate": 4.9563168824666174e-05, + "loss": 5.5623, + "step": 10021 + }, + { + "epoch": 0.059603673042154345, + "grad_norm": 1.9677515029907227, + "learning_rate": 4.9563081883340054e-05, + "loss": 4.7612, + "step": 10022 + }, + { + "epoch": 0.05960962032543534, + "grad_norm": 1.4823256731033325, + "learning_rate": 4.9562994933439215e-05, + "loss": 5.4504, + "step": 10023 + }, + { + "epoch": 0.059615567608716336, + "grad_norm": 1.5346739292144775, + "learning_rate": 4.956290797496369e-05, + "loss": 5.5455, + "step": 10024 + }, + { + "epoch": 0.05962151489199734, + "grad_norm": 1.5420036315917969, + "learning_rate": 4.956282100791351e-05, + "loss": 5.1363, + "step": 10025 + }, + { + "epoch": 0.05962746217527833, + "grad_norm": 1.7927091121673584, + "learning_rate": 4.956273403228869e-05, + "loss": 5.0768, + "step": 10026 + }, + { + "epoch": 0.05963340945855933, + "grad_norm": 1.7139612436294556, + "learning_rate": 4.9562647048089287e-05, + "loss": 5.2046, + "step": 10027 + }, + { + "epoch": 0.05963935674184033, + "grad_norm": 1.627684473991394, + "learning_rate": 4.956256005531531e-05, + "loss": 5.3844, + "step": 10028 + }, + { + "epoch": 0.059645304025121325, + "grad_norm": 1.5006085634231567, + "learning_rate": 4.9562473053966805e-05, + "loss": 5.4948, + "step": 10029 + }, + { + "epoch": 0.05965125130840232, + "grad_norm": 1.5670723915100098, + "learning_rate": 4.956238604404378e-05, + "loss": 5.5465, + "step": 10030 + }, + { + "epoch": 0.05965719859168332, + "grad_norm": 1.5671201944351196, + "learning_rate": 4.95622990255463e-05, + "loss": 5.1969, + "step": 10031 + }, + { + "epoch": 0.05966314587496432, + "grad_norm": 2.1628634929656982, + "learning_rate": 4.956221199847436e-05, + "loss": 5.0244, + "step": 10032 + }, + { + "epoch": 0.05966909315824531, + "grad_norm": 1.5766685009002686, + "learning_rate": 4.956212496282801e-05, + "loss": 5.4698, + "step": 10033 + }, + { + "epoch": 0.059675040441526314, + "grad_norm": 1.625812292098999, + "learning_rate": 4.956203791860728e-05, + "loss": 5.3825, + "step": 10034 + }, + { + "epoch": 0.05968098772480731, + "grad_norm": 1.4307054281234741, + "learning_rate": 4.956195086581219e-05, + "loss": 5.3576, + "step": 10035 + }, + { + "epoch": 0.059686935008088304, + "grad_norm": 1.4459644556045532, + "learning_rate": 4.9561863804442785e-05, + "loss": 5.3478, + "step": 10036 + }, + { + "epoch": 0.0596928822913693, + "grad_norm": 1.8038474321365356, + "learning_rate": 4.9561776734499075e-05, + "loss": 5.4967, + "step": 10037 + }, + { + "epoch": 0.0596988295746503, + "grad_norm": 1.41011381149292, + "learning_rate": 4.9561689655981115e-05, + "loss": 5.4224, + "step": 10038 + }, + { + "epoch": 0.059704776857931297, + "grad_norm": 1.6678937673568726, + "learning_rate": 4.956160256888891e-05, + "loss": 5.27, + "step": 10039 + }, + { + "epoch": 0.05971072414121229, + "grad_norm": 1.794647455215454, + "learning_rate": 4.956151547322251e-05, + "loss": 5.2822, + "step": 10040 + }, + { + "epoch": 0.059716671424493294, + "grad_norm": 1.5010912418365479, + "learning_rate": 4.9561428368981944e-05, + "loss": 5.3778, + "step": 10041 + }, + { + "epoch": 0.05972261870777429, + "grad_norm": 1.785395860671997, + "learning_rate": 4.9561341256167234e-05, + "loss": 5.4213, + "step": 10042 + }, + { + "epoch": 0.059728565991055284, + "grad_norm": 1.889667272567749, + "learning_rate": 4.956125413477841e-05, + "loss": 5.2795, + "step": 10043 + }, + { + "epoch": 0.059734513274336286, + "grad_norm": 2.209780216217041, + "learning_rate": 4.95611670048155e-05, + "loss": 5.6823, + "step": 10044 + }, + { + "epoch": 0.05974046055761728, + "grad_norm": 1.979069471359253, + "learning_rate": 4.956107986627855e-05, + "loss": 5.3437, + "step": 10045 + }, + { + "epoch": 0.059746407840898276, + "grad_norm": 1.8391239643096924, + "learning_rate": 4.9560992719167584e-05, + "loss": 5.2246, + "step": 10046 + }, + { + "epoch": 0.05975235512417928, + "grad_norm": 2.0196359157562256, + "learning_rate": 4.956090556348262e-05, + "loss": 5.3549, + "step": 10047 + }, + { + "epoch": 0.05975830240746027, + "grad_norm": 1.7103056907653809, + "learning_rate": 4.95608183992237e-05, + "loss": 5.4016, + "step": 10048 + }, + { + "epoch": 0.05976424969074127, + "grad_norm": 1.543308138847351, + "learning_rate": 4.956073122639085e-05, + "loss": 5.2628, + "step": 10049 + }, + { + "epoch": 0.05977019697402226, + "grad_norm": 2.0719797611236572, + "learning_rate": 4.956064404498411e-05, + "loss": 5.3149, + "step": 10050 + }, + { + "epoch": 0.059776144257303265, + "grad_norm": 1.9024063348770142, + "learning_rate": 4.95605568550035e-05, + "loss": 5.2804, + "step": 10051 + }, + { + "epoch": 0.05978209154058426, + "grad_norm": 1.6171611547470093, + "learning_rate": 4.9560469656449046e-05, + "loss": 5.2558, + "step": 10052 + }, + { + "epoch": 0.059788038823865255, + "grad_norm": 1.5416970252990723, + "learning_rate": 4.9560382449320795e-05, + "loss": 5.3164, + "step": 10053 + }, + { + "epoch": 0.05979398610714626, + "grad_norm": 1.6956002712249756, + "learning_rate": 4.956029523361877e-05, + "loss": 5.2123, + "step": 10054 + }, + { + "epoch": 0.05979993339042725, + "grad_norm": 1.6414602994918823, + "learning_rate": 4.956020800934299e-05, + "loss": 5.3302, + "step": 10055 + }, + { + "epoch": 0.05980588067370825, + "grad_norm": 1.6868051290512085, + "learning_rate": 4.95601207764935e-05, + "loss": 5.2076, + "step": 10056 + }, + { + "epoch": 0.05981182795698925, + "grad_norm": 1.7299697399139404, + "learning_rate": 4.956003353507033e-05, + "loss": 5.3502, + "step": 10057 + }, + { + "epoch": 0.059817775240270245, + "grad_norm": 1.4923878908157349, + "learning_rate": 4.95599462850735e-05, + "loss": 5.3081, + "step": 10058 + }, + { + "epoch": 0.05982372252355124, + "grad_norm": 1.571413516998291, + "learning_rate": 4.9559859026503045e-05, + "loss": 5.1434, + "step": 10059 + }, + { + "epoch": 0.05982966980683224, + "grad_norm": 1.6265422105789185, + "learning_rate": 4.9559771759359e-05, + "loss": 5.2455, + "step": 10060 + }, + { + "epoch": 0.05983561709011324, + "grad_norm": 1.7889208793640137, + "learning_rate": 4.9559684483641395e-05, + "loss": 5.2429, + "step": 10061 + }, + { + "epoch": 0.05984156437339423, + "grad_norm": 1.5957598686218262, + "learning_rate": 4.955959719935025e-05, + "loss": 5.2299, + "step": 10062 + }, + { + "epoch": 0.059847511656675234, + "grad_norm": 1.6366177797317505, + "learning_rate": 4.955950990648561e-05, + "loss": 5.366, + "step": 10063 + }, + { + "epoch": 0.05985345893995623, + "grad_norm": 1.6712719202041626, + "learning_rate": 4.95594226050475e-05, + "loss": 5.3602, + "step": 10064 + }, + { + "epoch": 0.059859406223237224, + "grad_norm": 1.8273069858551025, + "learning_rate": 4.955933529503595e-05, + "loss": 5.3586, + "step": 10065 + }, + { + "epoch": 0.05986535350651822, + "grad_norm": 1.6638576984405518, + "learning_rate": 4.955924797645098e-05, + "loss": 5.2359, + "step": 10066 + }, + { + "epoch": 0.05987130078979922, + "grad_norm": 1.8127614259719849, + "learning_rate": 4.955916064929264e-05, + "loss": 5.3815, + "step": 10067 + }, + { + "epoch": 0.059877248073080216, + "grad_norm": 1.7204198837280273, + "learning_rate": 4.955907331356095e-05, + "loss": 5.5576, + "step": 10068 + }, + { + "epoch": 0.05988319535636121, + "grad_norm": 1.9153103828430176, + "learning_rate": 4.9558985969255936e-05, + "loss": 5.4363, + "step": 10069 + }, + { + "epoch": 0.059889142639642214, + "grad_norm": 1.6427290439605713, + "learning_rate": 4.9558898616377634e-05, + "loss": 5.4497, + "step": 10070 + }, + { + "epoch": 0.05989508992292321, + "grad_norm": 1.660217046737671, + "learning_rate": 4.955881125492608e-05, + "loss": 5.4988, + "step": 10071 + }, + { + "epoch": 0.059901037206204204, + "grad_norm": 1.7776225805282593, + "learning_rate": 4.955872388490129e-05, + "loss": 5.2714, + "step": 10072 + }, + { + "epoch": 0.059906984489485206, + "grad_norm": 1.5099388360977173, + "learning_rate": 4.9558636506303314e-05, + "loss": 5.4714, + "step": 10073 + }, + { + "epoch": 0.0599129317727662, + "grad_norm": 1.523537039756775, + "learning_rate": 4.955854911913217e-05, + "loss": 5.3528, + "step": 10074 + }, + { + "epoch": 0.059918879056047196, + "grad_norm": 1.3424321413040161, + "learning_rate": 4.9558461723387885e-05, + "loss": 5.3385, + "step": 10075 + }, + { + "epoch": 0.0599248263393282, + "grad_norm": 1.3843169212341309, + "learning_rate": 4.955837431907049e-05, + "loss": 5.383, + "step": 10076 + }, + { + "epoch": 0.05993077362260919, + "grad_norm": 1.4927351474761963, + "learning_rate": 4.955828690618003e-05, + "loss": 5.3536, + "step": 10077 + }, + { + "epoch": 0.05993672090589019, + "grad_norm": 1.5207486152648926, + "learning_rate": 4.955819948471653e-05, + "loss": 5.3557, + "step": 10078 + }, + { + "epoch": 0.05994266818917118, + "grad_norm": 1.5589584112167358, + "learning_rate": 4.9558112054680004e-05, + "loss": 5.3747, + "step": 10079 + }, + { + "epoch": 0.059948615472452185, + "grad_norm": 1.436951756477356, + "learning_rate": 4.9558024616070496e-05, + "loss": 5.2807, + "step": 10080 + }, + { + "epoch": 0.05995456275573318, + "grad_norm": 1.4345866441726685, + "learning_rate": 4.955793716888804e-05, + "loss": 5.4, + "step": 10081 + }, + { + "epoch": 0.059960510039014175, + "grad_norm": 1.2811249494552612, + "learning_rate": 4.955784971313267e-05, + "loss": 5.2531, + "step": 10082 + }, + { + "epoch": 0.05996645732229518, + "grad_norm": 1.5558568239212036, + "learning_rate": 4.955776224880439e-05, + "loss": 5.1136, + "step": 10083 + }, + { + "epoch": 0.05997240460557617, + "grad_norm": 1.3918567895889282, + "learning_rate": 4.955767477590326e-05, + "loss": 5.2748, + "step": 10084 + }, + { + "epoch": 0.05997835188885717, + "grad_norm": 1.3277204036712646, + "learning_rate": 4.9557587294429295e-05, + "loss": 5.2346, + "step": 10085 + }, + { + "epoch": 0.05998429917213817, + "grad_norm": 1.2874623537063599, + "learning_rate": 4.955749980438253e-05, + "loss": 5.2616, + "step": 10086 + }, + { + "epoch": 0.059990246455419165, + "grad_norm": 1.7534229755401611, + "learning_rate": 4.9557412305763004e-05, + "loss": 5.2509, + "step": 10087 + }, + { + "epoch": 0.05999619373870016, + "grad_norm": 1.4560372829437256, + "learning_rate": 4.955732479857072e-05, + "loss": 5.2385, + "step": 10088 + }, + { + "epoch": 0.06000214102198116, + "grad_norm": 1.232779860496521, + "learning_rate": 4.955723728280575e-05, + "loss": 5.2726, + "step": 10089 + }, + { + "epoch": 0.06000808830526216, + "grad_norm": 1.6178683042526245, + "learning_rate": 4.955714975846809e-05, + "loss": 5.3816, + "step": 10090 + }, + { + "epoch": 0.06001403558854315, + "grad_norm": 1.5438450574874878, + "learning_rate": 4.955706222555779e-05, + "loss": 5.2706, + "step": 10091 + }, + { + "epoch": 0.060019982871824154, + "grad_norm": 1.5367876291275024, + "learning_rate": 4.955697468407486e-05, + "loss": 5.1955, + "step": 10092 + }, + { + "epoch": 0.06002593015510515, + "grad_norm": 1.2902512550354004, + "learning_rate": 4.955688713401936e-05, + "loss": 5.166, + "step": 10093 + }, + { + "epoch": 0.060031877438386144, + "grad_norm": 1.5516488552093506, + "learning_rate": 4.95567995753913e-05, + "loss": 5.1256, + "step": 10094 + }, + { + "epoch": 0.06003782472166714, + "grad_norm": 1.3104857206344604, + "learning_rate": 4.9556712008190706e-05, + "loss": 5.1604, + "step": 10095 + }, + { + "epoch": 0.06004377200494814, + "grad_norm": 1.6237741708755493, + "learning_rate": 4.955662443241762e-05, + "loss": 5.2686, + "step": 10096 + }, + { + "epoch": 0.060049719288229136, + "grad_norm": 1.6566027402877808, + "learning_rate": 4.955653684807208e-05, + "loss": 5.3376, + "step": 10097 + }, + { + "epoch": 0.06005566657151013, + "grad_norm": 1.4010981321334839, + "learning_rate": 4.9556449255154106e-05, + "loss": 5.4008, + "step": 10098 + }, + { + "epoch": 0.060061613854791134, + "grad_norm": 1.6399116516113281, + "learning_rate": 4.955636165366372e-05, + "loss": 5.2718, + "step": 10099 + }, + { + "epoch": 0.06006756113807213, + "grad_norm": 1.5371499061584473, + "learning_rate": 4.955627404360096e-05, + "loss": 5.2107, + "step": 10100 + }, + { + "epoch": 0.060073508421353124, + "grad_norm": 1.598186731338501, + "learning_rate": 4.955618642496587e-05, + "loss": 5.3482, + "step": 10101 + }, + { + "epoch": 0.060079455704634126, + "grad_norm": 1.526595115661621, + "learning_rate": 4.955609879775846e-05, + "loss": 5.2335, + "step": 10102 + }, + { + "epoch": 0.06008540298791512, + "grad_norm": 1.509990930557251, + "learning_rate": 4.955601116197877e-05, + "loss": 5.168, + "step": 10103 + }, + { + "epoch": 0.060091350271196116, + "grad_norm": 1.368203043937683, + "learning_rate": 4.9555923517626836e-05, + "loss": 5.2183, + "step": 10104 + }, + { + "epoch": 0.06009729755447712, + "grad_norm": 1.5153454542160034, + "learning_rate": 4.955583586470268e-05, + "loss": 5.2558, + "step": 10105 + }, + { + "epoch": 0.06010324483775811, + "grad_norm": 2.9330217838287354, + "learning_rate": 4.955574820320633e-05, + "loss": 5.6863, + "step": 10106 + }, + { + "epoch": 0.06010919212103911, + "grad_norm": 1.6096080541610718, + "learning_rate": 4.9555660533137825e-05, + "loss": 5.2243, + "step": 10107 + }, + { + "epoch": 0.0601151394043201, + "grad_norm": 1.5425163507461548, + "learning_rate": 4.95555728544972e-05, + "loss": 5.4308, + "step": 10108 + }, + { + "epoch": 0.060121086687601105, + "grad_norm": 1.4898573160171509, + "learning_rate": 4.955548516728447e-05, + "loss": 5.389, + "step": 10109 + }, + { + "epoch": 0.0601270339708821, + "grad_norm": 1.5746946334838867, + "learning_rate": 4.955539747149968e-05, + "loss": 5.1414, + "step": 10110 + }, + { + "epoch": 0.060132981254163095, + "grad_norm": 1.7621461153030396, + "learning_rate": 4.955530976714285e-05, + "loss": 5.4572, + "step": 10111 + }, + { + "epoch": 0.0601389285374441, + "grad_norm": 1.4524224996566772, + "learning_rate": 4.9555222054214015e-05, + "loss": 5.4577, + "step": 10112 + }, + { + "epoch": 0.06014487582072509, + "grad_norm": 1.5630146265029907, + "learning_rate": 4.95551343327132e-05, + "loss": 5.277, + "step": 10113 + }, + { + "epoch": 0.06015082310400609, + "grad_norm": 1.9279972314834595, + "learning_rate": 4.955504660264045e-05, + "loss": 5.1485, + "step": 10114 + }, + { + "epoch": 0.06015677038728709, + "grad_norm": 1.618775725364685, + "learning_rate": 4.9554958863995786e-05, + "loss": 5.1262, + "step": 10115 + }, + { + "epoch": 0.060162717670568085, + "grad_norm": 1.8578898906707764, + "learning_rate": 4.955487111677924e-05, + "loss": 5.3451, + "step": 10116 + }, + { + "epoch": 0.06016866495384908, + "grad_norm": 1.5652815103530884, + "learning_rate": 4.955478336099084e-05, + "loss": 5.2326, + "step": 10117 + }, + { + "epoch": 0.06017461223713008, + "grad_norm": 1.4957774877548218, + "learning_rate": 4.9554695596630616e-05, + "loss": 5.3332, + "step": 10118 + }, + { + "epoch": 0.06018055952041108, + "grad_norm": 1.428112506866455, + "learning_rate": 4.9554607823698606e-05, + "loss": 5.2647, + "step": 10119 + }, + { + "epoch": 0.06018650680369207, + "grad_norm": 1.9383279085159302, + "learning_rate": 4.955452004219484e-05, + "loss": 5.5897, + "step": 10120 + }, + { + "epoch": 0.060192454086973074, + "grad_norm": 1.8523132801055908, + "learning_rate": 4.955443225211934e-05, + "loss": 5.6204, + "step": 10121 + }, + { + "epoch": 0.06019840137025407, + "grad_norm": 1.7980049848556519, + "learning_rate": 4.955434445347214e-05, + "loss": 5.4383, + "step": 10122 + }, + { + "epoch": 0.060204348653535064, + "grad_norm": 1.7927988767623901, + "learning_rate": 4.9554256646253274e-05, + "loss": 5.6066, + "step": 10123 + }, + { + "epoch": 0.06021029593681606, + "grad_norm": 1.8549528121948242, + "learning_rate": 4.955416883046277e-05, + "loss": 5.2963, + "step": 10124 + }, + { + "epoch": 0.06021624322009706, + "grad_norm": 1.7140870094299316, + "learning_rate": 4.955408100610066e-05, + "loss": 5.4636, + "step": 10125 + }, + { + "epoch": 0.060222190503378056, + "grad_norm": 1.3744412660598755, + "learning_rate": 4.955399317316697e-05, + "loss": 5.2985, + "step": 10126 + }, + { + "epoch": 0.06022813778665905, + "grad_norm": 1.572782278060913, + "learning_rate": 4.9553905331661734e-05, + "loss": 5.2598, + "step": 10127 + }, + { + "epoch": 0.06023408506994005, + "grad_norm": 1.6485692262649536, + "learning_rate": 4.955381748158499e-05, + "loss": 5.3764, + "step": 10128 + }, + { + "epoch": 0.06024003235322105, + "grad_norm": 1.5442413091659546, + "learning_rate": 4.955372962293676e-05, + "loss": 5.2504, + "step": 10129 + }, + { + "epoch": 0.060245979636502044, + "grad_norm": 1.807518482208252, + "learning_rate": 4.9553641755717075e-05, + "loss": 5.2853, + "step": 10130 + }, + { + "epoch": 0.060251926919783046, + "grad_norm": 1.5858244895935059, + "learning_rate": 4.9553553879925965e-05, + "loss": 5.2645, + "step": 10131 + }, + { + "epoch": 0.06025787420306404, + "grad_norm": 1.596307396888733, + "learning_rate": 4.955346599556347e-05, + "loss": 5.4094, + "step": 10132 + }, + { + "epoch": 0.060263821486345036, + "grad_norm": 1.4624857902526855, + "learning_rate": 4.955337810262961e-05, + "loss": 5.4366, + "step": 10133 + }, + { + "epoch": 0.06026976876962604, + "grad_norm": 1.426866888999939, + "learning_rate": 4.955329020112442e-05, + "loss": 5.324, + "step": 10134 + }, + { + "epoch": 0.06027571605290703, + "grad_norm": 1.6577516794204712, + "learning_rate": 4.955320229104793e-05, + "loss": 5.2937, + "step": 10135 + }, + { + "epoch": 0.06028166333618803, + "grad_norm": 1.3958433866500854, + "learning_rate": 4.9553114372400166e-05, + "loss": 5.421, + "step": 10136 + }, + { + "epoch": 0.06028761061946902, + "grad_norm": 1.3242517709732056, + "learning_rate": 4.9553026445181173e-05, + "loss": 5.2697, + "step": 10137 + }, + { + "epoch": 0.060293557902750025, + "grad_norm": 1.519018530845642, + "learning_rate": 4.955293850939096e-05, + "loss": 5.1432, + "step": 10138 + }, + { + "epoch": 0.06029950518603102, + "grad_norm": 1.528515338897705, + "learning_rate": 4.955285056502958e-05, + "loss": 5.1388, + "step": 10139 + }, + { + "epoch": 0.060305452469312015, + "grad_norm": 1.4830992221832275, + "learning_rate": 4.955276261209705e-05, + "loss": 5.3222, + "step": 10140 + }, + { + "epoch": 0.06031139975259302, + "grad_norm": 1.4149411916732788, + "learning_rate": 4.95526746505934e-05, + "loss": 5.2706, + "step": 10141 + }, + { + "epoch": 0.06031734703587401, + "grad_norm": 1.4466478824615479, + "learning_rate": 4.9552586680518676e-05, + "loss": 5.2309, + "step": 10142 + }, + { + "epoch": 0.06032329431915501, + "grad_norm": 1.4246203899383545, + "learning_rate": 4.9552498701872884e-05, + "loss": 5.1539, + "step": 10143 + }, + { + "epoch": 0.06032924160243601, + "grad_norm": 1.632572889328003, + "learning_rate": 4.955241071465608e-05, + "loss": 5.3788, + "step": 10144 + }, + { + "epoch": 0.060335188885717005, + "grad_norm": 1.5974568128585815, + "learning_rate": 4.955232271886828e-05, + "loss": 5.3558, + "step": 10145 + }, + { + "epoch": 0.060341136168998, + "grad_norm": 1.6396468877792358, + "learning_rate": 4.9552234714509516e-05, + "loss": 5.2162, + "step": 10146 + }, + { + "epoch": 0.060347083452279, + "grad_norm": 1.5349491834640503, + "learning_rate": 4.9552146701579815e-05, + "loss": 5.212, + "step": 10147 + }, + { + "epoch": 0.06035303073556, + "grad_norm": 1.5236495733261108, + "learning_rate": 4.955205868007922e-05, + "loss": 5.2984, + "step": 10148 + }, + { + "epoch": 0.06035897801884099, + "grad_norm": 1.4593411684036255, + "learning_rate": 4.955197065000775e-05, + "loss": 5.268, + "step": 10149 + }, + { + "epoch": 0.060364925302121994, + "grad_norm": 1.4498536586761475, + "learning_rate": 4.955188261136545e-05, + "loss": 5.1437, + "step": 10150 + }, + { + "epoch": 0.06037087258540299, + "grad_norm": 1.5059176683425903, + "learning_rate": 4.9551794564152334e-05, + "loss": 5.3011, + "step": 10151 + }, + { + "epoch": 0.060376819868683984, + "grad_norm": 1.5773544311523438, + "learning_rate": 4.9551706508368445e-05, + "loss": 5.2066, + "step": 10152 + }, + { + "epoch": 0.06038276715196498, + "grad_norm": 1.4858072996139526, + "learning_rate": 4.95516184440138e-05, + "loss": 5.2757, + "step": 10153 + }, + { + "epoch": 0.06038871443524598, + "grad_norm": 1.486055612564087, + "learning_rate": 4.955153037108845e-05, + "loss": 5.1416, + "step": 10154 + }, + { + "epoch": 0.060394661718526976, + "grad_norm": 1.3411048650741577, + "learning_rate": 4.955144228959241e-05, + "loss": 5.1708, + "step": 10155 + }, + { + "epoch": 0.06040060900180797, + "grad_norm": 1.2979127168655396, + "learning_rate": 4.9551354199525714e-05, + "loss": 5.1421, + "step": 10156 + }, + { + "epoch": 0.06040655628508897, + "grad_norm": 1.4928209781646729, + "learning_rate": 4.9551266100888395e-05, + "loss": 5.2185, + "step": 10157 + }, + { + "epoch": 0.06041250356836997, + "grad_norm": 1.58747398853302, + "learning_rate": 4.955117799368048e-05, + "loss": 5.2587, + "step": 10158 + }, + { + "epoch": 0.060418450851650964, + "grad_norm": 1.1862558126449585, + "learning_rate": 4.9551089877902e-05, + "loss": 5.2405, + "step": 10159 + }, + { + "epoch": 0.060424398134931966, + "grad_norm": 1.5547248125076294, + "learning_rate": 4.955100175355299e-05, + "loss": 5.2326, + "step": 10160 + }, + { + "epoch": 0.06043034541821296, + "grad_norm": 1.6986664533615112, + "learning_rate": 4.955091362063349e-05, + "loss": 5.2261, + "step": 10161 + }, + { + "epoch": 0.060436292701493956, + "grad_norm": 1.531891107559204, + "learning_rate": 4.95508254791435e-05, + "loss": 5.4475, + "step": 10162 + }, + { + "epoch": 0.06044223998477496, + "grad_norm": 1.57411789894104, + "learning_rate": 4.955073732908309e-05, + "loss": 5.1346, + "step": 10163 + }, + { + "epoch": 0.06044818726805595, + "grad_norm": 1.548439383506775, + "learning_rate": 4.9550649170452255e-05, + "loss": 5.1953, + "step": 10164 + }, + { + "epoch": 0.06045413455133695, + "grad_norm": 1.645850419998169, + "learning_rate": 4.955056100325105e-05, + "loss": 5.2728, + "step": 10165 + }, + { + "epoch": 0.06046008183461794, + "grad_norm": 1.6308786869049072, + "learning_rate": 4.95504728274795e-05, + "loss": 5.3134, + "step": 10166 + }, + { + "epoch": 0.060466029117898945, + "grad_norm": 1.4754101037979126, + "learning_rate": 4.955038464313763e-05, + "loss": 5.3938, + "step": 10167 + }, + { + "epoch": 0.06047197640117994, + "grad_norm": 2.408869981765747, + "learning_rate": 4.955029645022548e-05, + "loss": 5.4687, + "step": 10168 + }, + { + "epoch": 0.060477923684460935, + "grad_norm": 1.6601638793945312, + "learning_rate": 4.955020824874307e-05, + "loss": 5.165, + "step": 10169 + }, + { + "epoch": 0.06048387096774194, + "grad_norm": 1.5239113569259644, + "learning_rate": 4.955012003869043e-05, + "loss": 5.133, + "step": 10170 + }, + { + "epoch": 0.06048981825102293, + "grad_norm": 1.6661083698272705, + "learning_rate": 4.955003182006761e-05, + "loss": 5.2033, + "step": 10171 + }, + { + "epoch": 0.06049576553430393, + "grad_norm": 1.4320698976516724, + "learning_rate": 4.9549943592874615e-05, + "loss": 5.1842, + "step": 10172 + }, + { + "epoch": 0.06050171281758493, + "grad_norm": 1.789302110671997, + "learning_rate": 4.95498553571115e-05, + "loss": 5.1052, + "step": 10173 + }, + { + "epoch": 0.060507660100865925, + "grad_norm": 1.598085880279541, + "learning_rate": 4.954976711277828e-05, + "loss": 5.3194, + "step": 10174 + }, + { + "epoch": 0.06051360738414692, + "grad_norm": 1.4569145441055298, + "learning_rate": 4.954967885987498e-05, + "loss": 5.2009, + "step": 10175 + }, + { + "epoch": 0.06051955466742792, + "grad_norm": 1.5980345010757446, + "learning_rate": 4.954959059840165e-05, + "loss": 5.1686, + "step": 10176 + }, + { + "epoch": 0.06052550195070892, + "grad_norm": 1.5382320880889893, + "learning_rate": 4.954950232835831e-05, + "loss": 5.303, + "step": 10177 + }, + { + "epoch": 0.06053144923398991, + "grad_norm": 1.5568296909332275, + "learning_rate": 4.954941404974499e-05, + "loss": 5.2044, + "step": 10178 + }, + { + "epoch": 0.060537396517270914, + "grad_norm": 1.6732075214385986, + "learning_rate": 4.954932576256173e-05, + "loss": 5.3133, + "step": 10179 + }, + { + "epoch": 0.06054334380055191, + "grad_norm": 1.6905434131622314, + "learning_rate": 4.954923746680855e-05, + "loss": 5.3868, + "step": 10180 + }, + { + "epoch": 0.060549291083832904, + "grad_norm": 1.4349027872085571, + "learning_rate": 4.954914916248549e-05, + "loss": 5.2215, + "step": 10181 + }, + { + "epoch": 0.0605552383671139, + "grad_norm": 1.5257092714309692, + "learning_rate": 4.9549060849592566e-05, + "loss": 5.2148, + "step": 10182 + }, + { + "epoch": 0.0605611856503949, + "grad_norm": 1.5402655601501465, + "learning_rate": 4.954897252812982e-05, + "loss": 5.3069, + "step": 10183 + }, + { + "epoch": 0.060567132933675896, + "grad_norm": 1.801798701286316, + "learning_rate": 4.954888419809729e-05, + "loss": 5.0786, + "step": 10184 + }, + { + "epoch": 0.06057308021695689, + "grad_norm": 1.4860090017318726, + "learning_rate": 4.954879585949499e-05, + "loss": 4.8878, + "step": 10185 + }, + { + "epoch": 0.06057902750023789, + "grad_norm": 1.7319056987762451, + "learning_rate": 4.954870751232296e-05, + "loss": 4.9013, + "step": 10186 + }, + { + "epoch": 0.06058497478351889, + "grad_norm": 1.4376243352890015, + "learning_rate": 4.954861915658123e-05, + "loss": 5.37, + "step": 10187 + }, + { + "epoch": 0.060590922066799884, + "grad_norm": 1.2903879880905151, + "learning_rate": 4.954853079226983e-05, + "loss": 5.5355, + "step": 10188 + }, + { + "epoch": 0.060596869350080886, + "grad_norm": 1.5223259925842285, + "learning_rate": 4.95484424193888e-05, + "loss": 5.3451, + "step": 10189 + }, + { + "epoch": 0.06060281663336188, + "grad_norm": 1.283892035484314, + "learning_rate": 4.954835403793815e-05, + "loss": 5.2245, + "step": 10190 + }, + { + "epoch": 0.060608763916642876, + "grad_norm": 1.5581207275390625, + "learning_rate": 4.9548265647917936e-05, + "loss": 5.303, + "step": 10191 + }, + { + "epoch": 0.06061471119992388, + "grad_norm": 1.4258673191070557, + "learning_rate": 4.9548177249328164e-05, + "loss": 5.4569, + "step": 10192 + }, + { + "epoch": 0.06062065848320487, + "grad_norm": 1.4326061010360718, + "learning_rate": 4.9548088842168886e-05, + "loss": 5.2761, + "step": 10193 + }, + { + "epoch": 0.06062660576648587, + "grad_norm": 1.9100563526153564, + "learning_rate": 4.9548000426440114e-05, + "loss": 4.9366, + "step": 10194 + }, + { + "epoch": 0.06063255304976687, + "grad_norm": 1.7059932947158813, + "learning_rate": 4.9547912002141895e-05, + "loss": 4.9135, + "step": 10195 + }, + { + "epoch": 0.060638500333047865, + "grad_norm": 1.6715087890625, + "learning_rate": 4.954782356927425e-05, + "loss": 5.0662, + "step": 10196 + }, + { + "epoch": 0.06064444761632886, + "grad_norm": 1.966430902481079, + "learning_rate": 4.9547735127837223e-05, + "loss": 4.7995, + "step": 10197 + }, + { + "epoch": 0.060650394899609855, + "grad_norm": 1.7138090133666992, + "learning_rate": 4.954764667783083e-05, + "loss": 4.9745, + "step": 10198 + }, + { + "epoch": 0.06065634218289086, + "grad_norm": 1.832889199256897, + "learning_rate": 4.95475582192551e-05, + "loss": 4.9795, + "step": 10199 + }, + { + "epoch": 0.06066228946617185, + "grad_norm": 1.883525013923645, + "learning_rate": 4.954746975211008e-05, + "loss": 4.8523, + "step": 10200 + }, + { + "epoch": 0.06066823674945285, + "grad_norm": 1.747101068496704, + "learning_rate": 4.954738127639579e-05, + "loss": 4.9402, + "step": 10201 + }, + { + "epoch": 0.06067418403273385, + "grad_norm": 1.583900809288025, + "learning_rate": 4.9547292792112256e-05, + "loss": 5.176, + "step": 10202 + }, + { + "epoch": 0.060680131316014845, + "grad_norm": 1.6390752792358398, + "learning_rate": 4.954720429925953e-05, + "loss": 5.1014, + "step": 10203 + }, + { + "epoch": 0.06068607859929584, + "grad_norm": 1.4499305486679077, + "learning_rate": 4.954711579783762e-05, + "loss": 5.1473, + "step": 10204 + }, + { + "epoch": 0.06069202588257684, + "grad_norm": 1.2734607458114624, + "learning_rate": 4.954702728784656e-05, + "loss": 5.0919, + "step": 10205 + }, + { + "epoch": 0.06069797316585784, + "grad_norm": 1.4447498321533203, + "learning_rate": 4.954693876928639e-05, + "loss": 5.0145, + "step": 10206 + }, + { + "epoch": 0.06070392044913883, + "grad_norm": 1.7052301168441772, + "learning_rate": 4.954685024215714e-05, + "loss": 5.109, + "step": 10207 + }, + { + "epoch": 0.060709867732419834, + "grad_norm": 1.6922130584716797, + "learning_rate": 4.9546761706458836e-05, + "loss": 5.2519, + "step": 10208 + }, + { + "epoch": 0.06071581501570083, + "grad_norm": 1.7998334169387817, + "learning_rate": 4.954667316219151e-05, + "loss": 5.2272, + "step": 10209 + }, + { + "epoch": 0.060721762298981824, + "grad_norm": 1.6331555843353271, + "learning_rate": 4.95465846093552e-05, + "loss": 5.1382, + "step": 10210 + }, + { + "epoch": 0.06072770958226282, + "grad_norm": 1.4777888059616089, + "learning_rate": 4.954649604794993e-05, + "loss": 5.0601, + "step": 10211 + }, + { + "epoch": 0.06073365686554382, + "grad_norm": 1.6776998043060303, + "learning_rate": 4.954640747797573e-05, + "loss": 5.0229, + "step": 10212 + }, + { + "epoch": 0.060739604148824816, + "grad_norm": 1.9567780494689941, + "learning_rate": 4.9546318899432634e-05, + "loss": 5.483, + "step": 10213 + }, + { + "epoch": 0.06074555143210581, + "grad_norm": 1.7381116151809692, + "learning_rate": 4.9546230312320664e-05, + "loss": 5.4088, + "step": 10214 + }, + { + "epoch": 0.06075149871538681, + "grad_norm": 2.290041446685791, + "learning_rate": 4.954614171663986e-05, + "loss": 5.0879, + "step": 10215 + }, + { + "epoch": 0.06075744599866781, + "grad_norm": 1.680309534072876, + "learning_rate": 4.9546053112390255e-05, + "loss": 5.1931, + "step": 10216 + }, + { + "epoch": 0.0607633932819488, + "grad_norm": 1.997379183769226, + "learning_rate": 4.9545964499571885e-05, + "loss": 5.0834, + "step": 10217 + }, + { + "epoch": 0.060769340565229805, + "grad_norm": 1.9145865440368652, + "learning_rate": 4.954587587818476e-05, + "loss": 5.3478, + "step": 10218 + }, + { + "epoch": 0.0607752878485108, + "grad_norm": 1.565874457359314, + "learning_rate": 4.954578724822893e-05, + "loss": 5.2579, + "step": 10219 + }, + { + "epoch": 0.060781235131791796, + "grad_norm": 1.5997511148452759, + "learning_rate": 4.9545698609704416e-05, + "loss": 5.233, + "step": 10220 + }, + { + "epoch": 0.0607871824150728, + "grad_norm": 2.205021619796753, + "learning_rate": 4.954560996261125e-05, + "loss": 5.227, + "step": 10221 + }, + { + "epoch": 0.06079312969835379, + "grad_norm": 1.5360487699508667, + "learning_rate": 4.954552130694947e-05, + "loss": 5.182, + "step": 10222 + }, + { + "epoch": 0.06079907698163479, + "grad_norm": 1.5571166276931763, + "learning_rate": 4.95454326427191e-05, + "loss": 5.3671, + "step": 10223 + }, + { + "epoch": 0.06080502426491579, + "grad_norm": 1.7289685010910034, + "learning_rate": 4.9545343969920175e-05, + "loss": 5.1256, + "step": 10224 + }, + { + "epoch": 0.060810971548196785, + "grad_norm": 1.7945314645767212, + "learning_rate": 4.954525528855272e-05, + "loss": 5.0339, + "step": 10225 + }, + { + "epoch": 0.06081691883147778, + "grad_norm": 1.7037841081619263, + "learning_rate": 4.954516659861678e-05, + "loss": 4.9308, + "step": 10226 + }, + { + "epoch": 0.060822866114758775, + "grad_norm": 1.8096303939819336, + "learning_rate": 4.954507790011237e-05, + "loss": 5.1173, + "step": 10227 + }, + { + "epoch": 0.06082881339803978, + "grad_norm": 1.7563896179199219, + "learning_rate": 4.954498919303952e-05, + "loss": 5.1713, + "step": 10228 + }, + { + "epoch": 0.06083476068132077, + "grad_norm": 1.8820421695709229, + "learning_rate": 4.954490047739827e-05, + "loss": 5.2372, + "step": 10229 + }, + { + "epoch": 0.06084070796460177, + "grad_norm": 2.7050085067749023, + "learning_rate": 4.954481175318865e-05, + "loss": 5.6108, + "step": 10230 + }, + { + "epoch": 0.06084665524788277, + "grad_norm": 1.6424611806869507, + "learning_rate": 4.954472302041069e-05, + "loss": 5.1423, + "step": 10231 + }, + { + "epoch": 0.060852602531163764, + "grad_norm": 1.7690013647079468, + "learning_rate": 4.954463427906443e-05, + "loss": 5.0232, + "step": 10232 + }, + { + "epoch": 0.06085854981444476, + "grad_norm": 1.8925920724868774, + "learning_rate": 4.9544545529149874e-05, + "loss": 4.8949, + "step": 10233 + }, + { + "epoch": 0.06086449709772576, + "grad_norm": 1.7629793882369995, + "learning_rate": 4.954445677066709e-05, + "loss": 4.8832, + "step": 10234 + }, + { + "epoch": 0.06087044438100676, + "grad_norm": 1.5553311109542847, + "learning_rate": 4.9544368003616084e-05, + "loss": 4.8787, + "step": 10235 + }, + { + "epoch": 0.06087639166428775, + "grad_norm": 1.6236152648925781, + "learning_rate": 4.9544279227996884e-05, + "loss": 4.8583, + "step": 10236 + }, + { + "epoch": 0.060882338947568754, + "grad_norm": 1.7591924667358398, + "learning_rate": 4.954419044380954e-05, + "loss": 5.1468, + "step": 10237 + }, + { + "epoch": 0.06088828623084975, + "grad_norm": 1.8084702491760254, + "learning_rate": 4.954410165105406e-05, + "loss": 5.3178, + "step": 10238 + }, + { + "epoch": 0.060894233514130744, + "grad_norm": 1.6629832983016968, + "learning_rate": 4.9544012849730495e-05, + "loss": 5.2955, + "step": 10239 + }, + { + "epoch": 0.06090018079741174, + "grad_norm": 1.6681956052780151, + "learning_rate": 4.954392403983887e-05, + "loss": 4.9919, + "step": 10240 + }, + { + "epoch": 0.06090612808069274, + "grad_norm": 1.7849150896072388, + "learning_rate": 4.954383522137922e-05, + "loss": 4.9667, + "step": 10241 + }, + { + "epoch": 0.060912075363973736, + "grad_norm": 1.6313222646713257, + "learning_rate": 4.954374639435157e-05, + "loss": 4.9842, + "step": 10242 + }, + { + "epoch": 0.06091802264725473, + "grad_norm": 1.3376604318618774, + "learning_rate": 4.954365755875594e-05, + "loss": 5.2643, + "step": 10243 + }, + { + "epoch": 0.06092396993053573, + "grad_norm": 1.5971726179122925, + "learning_rate": 4.954356871459238e-05, + "loss": 5.2225, + "step": 10244 + }, + { + "epoch": 0.06092991721381673, + "grad_norm": 1.638786792755127, + "learning_rate": 4.954347986186091e-05, + "loss": 5.2855, + "step": 10245 + }, + { + "epoch": 0.06093586449709772, + "grad_norm": 1.6273027658462524, + "learning_rate": 4.954339100056157e-05, + "loss": 5.3825, + "step": 10246 + }, + { + "epoch": 0.060941811780378725, + "grad_norm": 1.4666591882705688, + "learning_rate": 4.954330213069438e-05, + "loss": 5.3148, + "step": 10247 + }, + { + "epoch": 0.06094775906365972, + "grad_norm": 1.447332501411438, + "learning_rate": 4.954321325225938e-05, + "loss": 5.1907, + "step": 10248 + }, + { + "epoch": 0.060953706346940716, + "grad_norm": 1.7162379026412964, + "learning_rate": 4.95431243652566e-05, + "loss": 5.289, + "step": 10249 + }, + { + "epoch": 0.06095965363022172, + "grad_norm": 1.7236372232437134, + "learning_rate": 4.954303546968606e-05, + "loss": 5.1839, + "step": 10250 + }, + { + "epoch": 0.06096560091350271, + "grad_norm": 1.76384437084198, + "learning_rate": 4.954294656554781e-05, + "loss": 5.1665, + "step": 10251 + }, + { + "epoch": 0.06097154819678371, + "grad_norm": 1.595041275024414, + "learning_rate": 4.954285765284187e-05, + "loss": 5.2667, + "step": 10252 + }, + { + "epoch": 0.06097749548006471, + "grad_norm": 1.6735886335372925, + "learning_rate": 4.954276873156827e-05, + "loss": 5.3367, + "step": 10253 + }, + { + "epoch": 0.060983442763345705, + "grad_norm": 1.656801462173462, + "learning_rate": 4.9542679801727044e-05, + "loss": 5.3188, + "step": 10254 + }, + { + "epoch": 0.0609893900466267, + "grad_norm": 1.7149133682250977, + "learning_rate": 4.9542590863318214e-05, + "loss": 5.0618, + "step": 10255 + }, + { + "epoch": 0.060995337329907695, + "grad_norm": 1.715561032295227, + "learning_rate": 4.954250191634183e-05, + "loss": 5.2589, + "step": 10256 + }, + { + "epoch": 0.0610012846131887, + "grad_norm": 1.4005486965179443, + "learning_rate": 4.95424129607979e-05, + "loss": 5.1061, + "step": 10257 + }, + { + "epoch": 0.06100723189646969, + "grad_norm": 1.6608542203903198, + "learning_rate": 4.954232399668648e-05, + "loss": 5.3779, + "step": 10258 + }, + { + "epoch": 0.06101317917975069, + "grad_norm": 1.5471054315567017, + "learning_rate": 4.954223502400758e-05, + "loss": 5.448, + "step": 10259 + }, + { + "epoch": 0.06101912646303169, + "grad_norm": 1.6794294118881226, + "learning_rate": 4.9542146042761246e-05, + "loss": 5.1452, + "step": 10260 + }, + { + "epoch": 0.061025073746312684, + "grad_norm": 1.5416966676712036, + "learning_rate": 4.95420570529475e-05, + "loss": 5.2192, + "step": 10261 + }, + { + "epoch": 0.06103102102959368, + "grad_norm": 1.6667221784591675, + "learning_rate": 4.954196805456637e-05, + "loss": 5.3682, + "step": 10262 + }, + { + "epoch": 0.06103696831287468, + "grad_norm": 1.3199689388275146, + "learning_rate": 4.95418790476179e-05, + "loss": 5.1038, + "step": 10263 + }, + { + "epoch": 0.06104291559615568, + "grad_norm": 1.5326366424560547, + "learning_rate": 4.954179003210211e-05, + "loss": 5.3002, + "step": 10264 + }, + { + "epoch": 0.06104886287943667, + "grad_norm": 1.529453992843628, + "learning_rate": 4.954170100801904e-05, + "loss": 5.4515, + "step": 10265 + }, + { + "epoch": 0.061054810162717674, + "grad_norm": 1.719894528388977, + "learning_rate": 4.954161197536871e-05, + "loss": 5.4161, + "step": 10266 + }, + { + "epoch": 0.06106075744599867, + "grad_norm": 1.4632771015167236, + "learning_rate": 4.954152293415115e-05, + "loss": 5.4669, + "step": 10267 + }, + { + "epoch": 0.061066704729279664, + "grad_norm": 1.7698414325714111, + "learning_rate": 4.954143388436641e-05, + "loss": 5.4045, + "step": 10268 + }, + { + "epoch": 0.06107265201256066, + "grad_norm": 1.6944139003753662, + "learning_rate": 4.95413448260145e-05, + "loss": 5.3637, + "step": 10269 + }, + { + "epoch": 0.06107859929584166, + "grad_norm": 1.6832401752471924, + "learning_rate": 4.954125575909547e-05, + "loss": 5.2123, + "step": 10270 + }, + { + "epoch": 0.061084546579122656, + "grad_norm": 1.6782628297805786, + "learning_rate": 4.954116668360933e-05, + "loss": 5.3007, + "step": 10271 + }, + { + "epoch": 0.06109049386240365, + "grad_norm": 1.598941683769226, + "learning_rate": 4.954107759955613e-05, + "loss": 5.1452, + "step": 10272 + }, + { + "epoch": 0.06109644114568465, + "grad_norm": 1.4137005805969238, + "learning_rate": 4.954098850693589e-05, + "loss": 5.1348, + "step": 10273 + }, + { + "epoch": 0.06110238842896565, + "grad_norm": 1.388108730316162, + "learning_rate": 4.9540899405748646e-05, + "loss": 5.4108, + "step": 10274 + }, + { + "epoch": 0.06110833571224664, + "grad_norm": 1.5997217893600464, + "learning_rate": 4.954081029599443e-05, + "loss": 5.3727, + "step": 10275 + }, + { + "epoch": 0.061114282995527645, + "grad_norm": 1.5805003643035889, + "learning_rate": 4.954072117767327e-05, + "loss": 5.4151, + "step": 10276 + }, + { + "epoch": 0.06112023027880864, + "grad_norm": 1.402063250541687, + "learning_rate": 4.9540632050785194e-05, + "loss": 5.287, + "step": 10277 + }, + { + "epoch": 0.061126177562089636, + "grad_norm": 1.6100205183029175, + "learning_rate": 4.9540542915330236e-05, + "loss": 5.2047, + "step": 10278 + }, + { + "epoch": 0.06113212484537064, + "grad_norm": 1.6199030876159668, + "learning_rate": 4.9540453771308435e-05, + "loss": 5.2141, + "step": 10279 + }, + { + "epoch": 0.06113807212865163, + "grad_norm": 1.485408067703247, + "learning_rate": 4.95403646187198e-05, + "loss": 5.1893, + "step": 10280 + }, + { + "epoch": 0.06114401941193263, + "grad_norm": 1.5842605829238892, + "learning_rate": 4.9540275457564395e-05, + "loss": 5.1383, + "step": 10281 + }, + { + "epoch": 0.06114996669521363, + "grad_norm": 1.5824682712554932, + "learning_rate": 4.9540186287842225e-05, + "loss": 5.1754, + "step": 10282 + }, + { + "epoch": 0.061155913978494625, + "grad_norm": 1.7714753150939941, + "learning_rate": 4.954009710955333e-05, + "loss": 5.2951, + "step": 10283 + }, + { + "epoch": 0.06116186126177562, + "grad_norm": 1.6528159379959106, + "learning_rate": 4.954000792269774e-05, + "loss": 5.2391, + "step": 10284 + }, + { + "epoch": 0.061167808545056615, + "grad_norm": 1.54135262966156, + "learning_rate": 4.953991872727549e-05, + "loss": 5.3849, + "step": 10285 + }, + { + "epoch": 0.06117375582833762, + "grad_norm": 1.4225090742111206, + "learning_rate": 4.953982952328661e-05, + "loss": 5.2211, + "step": 10286 + }, + { + "epoch": 0.06117970311161861, + "grad_norm": 1.7174444198608398, + "learning_rate": 4.953974031073112e-05, + "loss": 5.2873, + "step": 10287 + }, + { + "epoch": 0.06118565039489961, + "grad_norm": 1.4754962921142578, + "learning_rate": 4.953965108960907e-05, + "loss": 5.3137, + "step": 10288 + }, + { + "epoch": 0.06119159767818061, + "grad_norm": 1.6911029815673828, + "learning_rate": 4.9539561859920475e-05, + "loss": 5.1914, + "step": 10289 + }, + { + "epoch": 0.061197544961461604, + "grad_norm": 1.5569958686828613, + "learning_rate": 4.953947262166537e-05, + "loss": 5.2141, + "step": 10290 + }, + { + "epoch": 0.0612034922447426, + "grad_norm": 1.5939570665359497, + "learning_rate": 4.9539383374843794e-05, + "loss": 5.2059, + "step": 10291 + }, + { + "epoch": 0.0612094395280236, + "grad_norm": 1.7220442295074463, + "learning_rate": 4.953929411945577e-05, + "loss": 5.3399, + "step": 10292 + }, + { + "epoch": 0.061215386811304597, + "grad_norm": 1.7158905267715454, + "learning_rate": 4.953920485550134e-05, + "loss": 5.3392, + "step": 10293 + }, + { + "epoch": 0.06122133409458559, + "grad_norm": 1.5761021375656128, + "learning_rate": 4.9539115582980525e-05, + "loss": 5.1523, + "step": 10294 + }, + { + "epoch": 0.061227281377866594, + "grad_norm": 1.7746198177337646, + "learning_rate": 4.953902630189335e-05, + "loss": 5.1577, + "step": 10295 + }, + { + "epoch": 0.06123322866114759, + "grad_norm": 1.9633466005325317, + "learning_rate": 4.953893701223986e-05, + "loss": 5.448, + "step": 10296 + }, + { + "epoch": 0.061239175944428584, + "grad_norm": 1.7086774110794067, + "learning_rate": 4.953884771402007e-05, + "loss": 5.2624, + "step": 10297 + }, + { + "epoch": 0.06124512322770958, + "grad_norm": 1.5247907638549805, + "learning_rate": 4.953875840723403e-05, + "loss": 5.1644, + "step": 10298 + }, + { + "epoch": 0.06125107051099058, + "grad_norm": 1.7014293670654297, + "learning_rate": 4.953866909188177e-05, + "loss": 5.2118, + "step": 10299 + }, + { + "epoch": 0.061257017794271576, + "grad_norm": 1.390368103981018, + "learning_rate": 4.9538579767963305e-05, + "loss": 5.3159, + "step": 10300 + }, + { + "epoch": 0.06126296507755257, + "grad_norm": 1.4748090505599976, + "learning_rate": 4.953849043547868e-05, + "loss": 5.5283, + "step": 10301 + }, + { + "epoch": 0.06126891236083357, + "grad_norm": 1.6433857679367065, + "learning_rate": 4.953840109442792e-05, + "loss": 5.3388, + "step": 10302 + }, + { + "epoch": 0.06127485964411457, + "grad_norm": 1.6636543273925781, + "learning_rate": 4.9538311744811056e-05, + "loss": 5.4523, + "step": 10303 + }, + { + "epoch": 0.06128080692739556, + "grad_norm": 1.6074668169021606, + "learning_rate": 4.953822238662812e-05, + "loss": 5.2963, + "step": 10304 + }, + { + "epoch": 0.061286754210676565, + "grad_norm": 1.8746674060821533, + "learning_rate": 4.9538133019879155e-05, + "loss": 5.359, + "step": 10305 + }, + { + "epoch": 0.06129270149395756, + "grad_norm": 1.5438963174819946, + "learning_rate": 4.953804364456417e-05, + "loss": 5.2039, + "step": 10306 + }, + { + "epoch": 0.061298648777238555, + "grad_norm": 1.5594170093536377, + "learning_rate": 4.9537954260683205e-05, + "loss": 5.3003, + "step": 10307 + }, + { + "epoch": 0.06130459606051956, + "grad_norm": 1.3331657648086548, + "learning_rate": 4.95378648682363e-05, + "loss": 5.3051, + "step": 10308 + }, + { + "epoch": 0.06131054334380055, + "grad_norm": 1.5514707565307617, + "learning_rate": 4.953777546722348e-05, + "loss": 5.3344, + "step": 10309 + }, + { + "epoch": 0.06131649062708155, + "grad_norm": 1.6396936178207397, + "learning_rate": 4.953768605764477e-05, + "loss": 5.1244, + "step": 10310 + }, + { + "epoch": 0.06132243791036255, + "grad_norm": 1.576407551765442, + "learning_rate": 4.953759663950022e-05, + "loss": 5.1908, + "step": 10311 + }, + { + "epoch": 0.061328385193643545, + "grad_norm": 1.5868182182312012, + "learning_rate": 4.953750721278984e-05, + "loss": 5.2538, + "step": 10312 + }, + { + "epoch": 0.06133433247692454, + "grad_norm": 1.7734450101852417, + "learning_rate": 4.9537417777513664e-05, + "loss": 5.3727, + "step": 10313 + }, + { + "epoch": 0.061340279760205535, + "grad_norm": 1.5105754137039185, + "learning_rate": 4.953732833367174e-05, + "loss": 5.3547, + "step": 10314 + }, + { + "epoch": 0.06134622704348654, + "grad_norm": 1.5607833862304688, + "learning_rate": 4.953723888126408e-05, + "loss": 5.2265, + "step": 10315 + }, + { + "epoch": 0.06135217432676753, + "grad_norm": 1.2882065773010254, + "learning_rate": 4.9537149420290726e-05, + "loss": 4.9719, + "step": 10316 + }, + { + "epoch": 0.06135812161004853, + "grad_norm": 1.4349958896636963, + "learning_rate": 4.953705995075171e-05, + "loss": 5.2773, + "step": 10317 + }, + { + "epoch": 0.06136406889332953, + "grad_norm": 2.3595380783081055, + "learning_rate": 4.953697047264706e-05, + "loss": 5.7403, + "step": 10318 + }, + { + "epoch": 0.061370016176610524, + "grad_norm": 1.6126785278320312, + "learning_rate": 4.9536880985976805e-05, + "loss": 5.5316, + "step": 10319 + }, + { + "epoch": 0.06137596345989152, + "grad_norm": 1.7738999128341675, + "learning_rate": 4.953679149074098e-05, + "loss": 5.602, + "step": 10320 + }, + { + "epoch": 0.06138191074317252, + "grad_norm": 1.9263441562652588, + "learning_rate": 4.953670198693961e-05, + "loss": 5.0669, + "step": 10321 + }, + { + "epoch": 0.061387858026453516, + "grad_norm": 1.6290051937103271, + "learning_rate": 4.953661247457273e-05, + "loss": 5.2163, + "step": 10322 + }, + { + "epoch": 0.06139380530973451, + "grad_norm": 1.6354936361312866, + "learning_rate": 4.9536522953640374e-05, + "loss": 5.1678, + "step": 10323 + }, + { + "epoch": 0.061399752593015514, + "grad_norm": 1.7600759267807007, + "learning_rate": 4.953643342414257e-05, + "loss": 5.946, + "step": 10324 + }, + { + "epoch": 0.06140569987629651, + "grad_norm": 2.0515828132629395, + "learning_rate": 4.9536343886079357e-05, + "loss": 5.463, + "step": 10325 + }, + { + "epoch": 0.061411647159577504, + "grad_norm": 1.9990586042404175, + "learning_rate": 4.9536254339450754e-05, + "loss": 5.3084, + "step": 10326 + }, + { + "epoch": 0.0614175944428585, + "grad_norm": 1.7596598863601685, + "learning_rate": 4.95361647842568e-05, + "loss": 5.9268, + "step": 10327 + }, + { + "epoch": 0.0614235417261395, + "grad_norm": 1.8702850341796875, + "learning_rate": 4.953607522049752e-05, + "loss": 5.4303, + "step": 10328 + }, + { + "epoch": 0.061429489009420496, + "grad_norm": 1.9598991870880127, + "learning_rate": 4.953598564817296e-05, + "loss": 5.1813, + "step": 10329 + }, + { + "epoch": 0.06143543629270149, + "grad_norm": 1.5180566310882568, + "learning_rate": 4.953589606728314e-05, + "loss": 5.6051, + "step": 10330 + }, + { + "epoch": 0.06144138357598249, + "grad_norm": 1.4654324054718018, + "learning_rate": 4.953580647782808e-05, + "loss": 5.7188, + "step": 10331 + }, + { + "epoch": 0.06144733085926349, + "grad_norm": 1.351413607597351, + "learning_rate": 4.9535716879807835e-05, + "loss": 5.6928, + "step": 10332 + }, + { + "epoch": 0.06145327814254448, + "grad_norm": 1.4495320320129395, + "learning_rate": 4.953562727322242e-05, + "loss": 5.5576, + "step": 10333 + }, + { + "epoch": 0.061459225425825485, + "grad_norm": 1.4851731061935425, + "learning_rate": 4.953553765807187e-05, + "loss": 5.31, + "step": 10334 + }, + { + "epoch": 0.06146517270910648, + "grad_norm": 1.9790018796920776, + "learning_rate": 4.953544803435622e-05, + "loss": 5.5375, + "step": 10335 + }, + { + "epoch": 0.061471119992387475, + "grad_norm": 1.6931076049804688, + "learning_rate": 4.953535840207549e-05, + "loss": 5.6863, + "step": 10336 + }, + { + "epoch": 0.06147706727566848, + "grad_norm": 1.7479010820388794, + "learning_rate": 4.9535268761229735e-05, + "loss": 5.571, + "step": 10337 + }, + { + "epoch": 0.06148301455894947, + "grad_norm": 2.0722434520721436, + "learning_rate": 4.953517911181896e-05, + "loss": 5.2462, + "step": 10338 + }, + { + "epoch": 0.06148896184223047, + "grad_norm": 2.125288486480713, + "learning_rate": 4.953508945384322e-05, + "loss": 5.6343, + "step": 10339 + }, + { + "epoch": 0.06149490912551147, + "grad_norm": 2.0187058448791504, + "learning_rate": 4.953499978730252e-05, + "loss": 5.8642, + "step": 10340 + }, + { + "epoch": 0.061500856408792465, + "grad_norm": 1.6849068403244019, + "learning_rate": 4.9534910112196906e-05, + "loss": 5.5534, + "step": 10341 + }, + { + "epoch": 0.06150680369207346, + "grad_norm": 2.008009433746338, + "learning_rate": 4.953482042852641e-05, + "loss": 5.464, + "step": 10342 + }, + { + "epoch": 0.061512750975354455, + "grad_norm": 1.7537699937820435, + "learning_rate": 4.953473073629107e-05, + "loss": 5.9052, + "step": 10343 + }, + { + "epoch": 0.06151869825863546, + "grad_norm": 1.5746090412139893, + "learning_rate": 4.95346410354909e-05, + "loss": 5.6898, + "step": 10344 + }, + { + "epoch": 0.06152464554191645, + "grad_norm": 2.027543783187866, + "learning_rate": 4.9534551326125944e-05, + "loss": 6.0481, + "step": 10345 + }, + { + "epoch": 0.06153059282519745, + "grad_norm": 1.6113003492355347, + "learning_rate": 4.9534461608196224e-05, + "loss": 5.4792, + "step": 10346 + }, + { + "epoch": 0.06153654010847845, + "grad_norm": 1.5709928274154663, + "learning_rate": 4.953437188170178e-05, + "loss": 5.7601, + "step": 10347 + }, + { + "epoch": 0.061542487391759444, + "grad_norm": 1.7116700410842896, + "learning_rate": 4.953428214664265e-05, + "loss": 5.7284, + "step": 10348 + }, + { + "epoch": 0.06154843467504044, + "grad_norm": 2.262103796005249, + "learning_rate": 4.953419240301884e-05, + "loss": 5.7247, + "step": 10349 + }, + { + "epoch": 0.06155438195832144, + "grad_norm": 1.8536508083343506, + "learning_rate": 4.9534102650830406e-05, + "loss": 5.7509, + "step": 10350 + }, + { + "epoch": 0.061560329241602436, + "grad_norm": 2.1372785568237305, + "learning_rate": 4.953401289007737e-05, + "loss": 5.8436, + "step": 10351 + }, + { + "epoch": 0.06156627652488343, + "grad_norm": 2.5555527210235596, + "learning_rate": 4.953392312075976e-05, + "loss": 5.6481, + "step": 10352 + }, + { + "epoch": 0.061572223808164434, + "grad_norm": 2.607111692428589, + "learning_rate": 4.953383334287761e-05, + "loss": 5.4822, + "step": 10353 + }, + { + "epoch": 0.06157817109144543, + "grad_norm": 2.728994369506836, + "learning_rate": 4.953374355643095e-05, + "loss": 5.4327, + "step": 10354 + }, + { + "epoch": 0.061584118374726424, + "grad_norm": 2.3375606536865234, + "learning_rate": 4.953365376141983e-05, + "loss": 5.537, + "step": 10355 + }, + { + "epoch": 0.06159006565800742, + "grad_norm": 2.4509146213531494, + "learning_rate": 4.953356395784425e-05, + "loss": 5.5717, + "step": 10356 + }, + { + "epoch": 0.06159601294128842, + "grad_norm": 2.412198781967163, + "learning_rate": 4.953347414570426e-05, + "loss": 5.5216, + "step": 10357 + }, + { + "epoch": 0.061601960224569416, + "grad_norm": 1.7105822563171387, + "learning_rate": 4.9533384324999886e-05, + "loss": 5.6661, + "step": 10358 + }, + { + "epoch": 0.06160790750785041, + "grad_norm": 2.2394793033599854, + "learning_rate": 4.953329449573116e-05, + "loss": 5.2062, + "step": 10359 + }, + { + "epoch": 0.06161385479113141, + "grad_norm": 2.1791203022003174, + "learning_rate": 4.9533204657898127e-05, + "loss": 5.1961, + "step": 10360 + }, + { + "epoch": 0.06161980207441241, + "grad_norm": 2.0430495738983154, + "learning_rate": 4.953311481150079e-05, + "loss": 5.1492, + "step": 10361 + }, + { + "epoch": 0.0616257493576934, + "grad_norm": 2.157975435256958, + "learning_rate": 4.9533024956539204e-05, + "loss": 4.9354, + "step": 10362 + }, + { + "epoch": 0.061631696640974405, + "grad_norm": 2.101484537124634, + "learning_rate": 4.953293509301339e-05, + "loss": 4.9212, + "step": 10363 + }, + { + "epoch": 0.0616376439242554, + "grad_norm": 1.740793228149414, + "learning_rate": 4.953284522092338e-05, + "loss": 5.1234, + "step": 10364 + }, + { + "epoch": 0.061643591207536395, + "grad_norm": 1.9694514274597168, + "learning_rate": 4.953275534026921e-05, + "loss": 5.3688, + "step": 10365 + }, + { + "epoch": 0.0616495384908174, + "grad_norm": 2.0250589847564697, + "learning_rate": 4.953266545105091e-05, + "loss": 4.7194, + "step": 10366 + }, + { + "epoch": 0.06165548577409839, + "grad_norm": 2.016284942626953, + "learning_rate": 4.95325755532685e-05, + "loss": 4.7397, + "step": 10367 + }, + { + "epoch": 0.06166143305737939, + "grad_norm": 2.3073251247406006, + "learning_rate": 4.9532485646922036e-05, + "loss": 4.59, + "step": 10368 + }, + { + "epoch": 0.06166738034066039, + "grad_norm": 2.265873670578003, + "learning_rate": 4.9532395732011524e-05, + "loss": 4.7713, + "step": 10369 + }, + { + "epoch": 0.061673327623941385, + "grad_norm": 1.8176212310791016, + "learning_rate": 4.953230580853701e-05, + "loss": 5.2288, + "step": 10370 + }, + { + "epoch": 0.06167927490722238, + "grad_norm": 2.3636794090270996, + "learning_rate": 4.953221587649852e-05, + "loss": 5.1683, + "step": 10371 + }, + { + "epoch": 0.061685222190503375, + "grad_norm": 1.8074215650558472, + "learning_rate": 4.953212593589609e-05, + "loss": 6.037, + "step": 10372 + }, + { + "epoch": 0.06169116947378438, + "grad_norm": 2.1368768215179443, + "learning_rate": 4.953203598672975e-05, + "loss": 5.8481, + "step": 10373 + }, + { + "epoch": 0.06169711675706537, + "grad_norm": 2.924474000930786, + "learning_rate": 4.953194602899952e-05, + "loss": 4.327, + "step": 10374 + }, + { + "epoch": 0.06170306404034637, + "grad_norm": 2.412336826324463, + "learning_rate": 4.953185606270545e-05, + "loss": 4.3885, + "step": 10375 + }, + { + "epoch": 0.06170901132362737, + "grad_norm": 1.9676904678344727, + "learning_rate": 4.953176608784756e-05, + "loss": 5.4581, + "step": 10376 + }, + { + "epoch": 0.061714958606908364, + "grad_norm": 2.1357827186584473, + "learning_rate": 4.953167610442588e-05, + "loss": 6.1762, + "step": 10377 + }, + { + "epoch": 0.06172090589018936, + "grad_norm": 1.912763237953186, + "learning_rate": 4.953158611244045e-05, + "loss": 6.3403, + "step": 10378 + }, + { + "epoch": 0.06172685317347036, + "grad_norm": 2.0528855323791504, + "learning_rate": 4.95314961118913e-05, + "loss": 6.1921, + "step": 10379 + }, + { + "epoch": 0.061732800456751356, + "grad_norm": 2.1858723163604736, + "learning_rate": 4.953140610277846e-05, + "loss": 5.1944, + "step": 10380 + }, + { + "epoch": 0.06173874774003235, + "grad_norm": 2.04040265083313, + "learning_rate": 4.9531316085101944e-05, + "loss": 5.1866, + "step": 10381 + }, + { + "epoch": 0.06174469502331335, + "grad_norm": 2.216113567352295, + "learning_rate": 4.953122605886181e-05, + "loss": 5.5625, + "step": 10382 + }, + { + "epoch": 0.06175064230659435, + "grad_norm": 1.7107234001159668, + "learning_rate": 4.9531136024058076e-05, + "loss": 5.917, + "step": 10383 + }, + { + "epoch": 0.061756589589875344, + "grad_norm": 1.983104944229126, + "learning_rate": 4.9531045980690776e-05, + "loss": 6.0113, + "step": 10384 + }, + { + "epoch": 0.06176253687315634, + "grad_norm": 2.0186147689819336, + "learning_rate": 4.9530955928759945e-05, + "loss": 6.5227, + "step": 10385 + }, + { + "epoch": 0.06176848415643734, + "grad_norm": 1.8337477445602417, + "learning_rate": 4.9530865868265605e-05, + "loss": 5.9586, + "step": 10386 + }, + { + "epoch": 0.061774431439718336, + "grad_norm": 1.6523345708847046, + "learning_rate": 4.9530775799207795e-05, + "loss": 5.7073, + "step": 10387 + }, + { + "epoch": 0.06178037872299933, + "grad_norm": 1.617838740348816, + "learning_rate": 4.953068572158654e-05, + "loss": 5.3771, + "step": 10388 + }, + { + "epoch": 0.06178632600628033, + "grad_norm": 1.7327697277069092, + "learning_rate": 4.953059563540189e-05, + "loss": 5.3021, + "step": 10389 + }, + { + "epoch": 0.06179227328956133, + "grad_norm": 2.726762294769287, + "learning_rate": 4.9530505540653856e-05, + "loss": 5.2568, + "step": 10390 + }, + { + "epoch": 0.06179822057284232, + "grad_norm": 2.540090560913086, + "learning_rate": 4.953041543734247e-05, + "loss": 5.114, + "step": 10391 + }, + { + "epoch": 0.061804167856123325, + "grad_norm": 2.26487135887146, + "learning_rate": 4.953032532546777e-05, + "loss": 5.2552, + "step": 10392 + }, + { + "epoch": 0.06181011513940432, + "grad_norm": 1.9986075162887573, + "learning_rate": 4.95302352050298e-05, + "loss": 5.3555, + "step": 10393 + }, + { + "epoch": 0.061816062422685315, + "grad_norm": 2.2121987342834473, + "learning_rate": 4.9530145076028564e-05, + "loss": 5.665, + "step": 10394 + }, + { + "epoch": 0.06182200970596632, + "grad_norm": 1.892927646636963, + "learning_rate": 4.953005493846411e-05, + "loss": 5.2536, + "step": 10395 + }, + { + "epoch": 0.06182795698924731, + "grad_norm": 2.1083126068115234, + "learning_rate": 4.952996479233647e-05, + "loss": 6.1748, + "step": 10396 + }, + { + "epoch": 0.06183390427252831, + "grad_norm": 2.2235448360443115, + "learning_rate": 4.9529874637645675e-05, + "loss": 6.0676, + "step": 10397 + }, + { + "epoch": 0.06183985155580931, + "grad_norm": 2.0888702869415283, + "learning_rate": 4.952978447439175e-05, + "loss": 5.2515, + "step": 10398 + }, + { + "epoch": 0.061845798839090305, + "grad_norm": 1.826622724533081, + "learning_rate": 4.9529694302574736e-05, + "loss": 5.6849, + "step": 10399 + }, + { + "epoch": 0.0618517461223713, + "grad_norm": 1.9772933721542358, + "learning_rate": 4.952960412219465e-05, + "loss": 5.7702, + "step": 10400 + }, + { + "epoch": 0.061857693405652295, + "grad_norm": 2.2230029106140137, + "learning_rate": 4.952951393325154e-05, + "loss": 5.5747, + "step": 10401 + }, + { + "epoch": 0.0618636406889333, + "grad_norm": 1.9372552633285522, + "learning_rate": 4.9529423735745425e-05, + "loss": 5.4728, + "step": 10402 + }, + { + "epoch": 0.06186958797221429, + "grad_norm": 2.2238845825195312, + "learning_rate": 4.952933352967635e-05, + "loss": 5.2462, + "step": 10403 + }, + { + "epoch": 0.06187553525549529, + "grad_norm": 1.7716748714447021, + "learning_rate": 4.952924331504433e-05, + "loss": 5.5651, + "step": 10404 + }, + { + "epoch": 0.06188148253877629, + "grad_norm": 2.2933645248413086, + "learning_rate": 4.9529153091849405e-05, + "loss": 5.8684, + "step": 10405 + }, + { + "epoch": 0.061887429822057284, + "grad_norm": 2.222883939743042, + "learning_rate": 4.9529062860091616e-05, + "loss": 5.8427, + "step": 10406 + }, + { + "epoch": 0.06189337710533828, + "grad_norm": 1.645338773727417, + "learning_rate": 4.9528972619770975e-05, + "loss": 5.7001, + "step": 10407 + }, + { + "epoch": 0.06189932438861928, + "grad_norm": 2.1029653549194336, + "learning_rate": 4.952888237088752e-05, + "loss": 5.728, + "step": 10408 + }, + { + "epoch": 0.061905271671900276, + "grad_norm": 2.2689831256866455, + "learning_rate": 4.952879211344129e-05, + "loss": 5.4678, + "step": 10409 + }, + { + "epoch": 0.06191121895518127, + "grad_norm": 1.908469557762146, + "learning_rate": 4.9528701847432315e-05, + "loss": 6.007, + "step": 10410 + }, + { + "epoch": 0.06191716623846227, + "grad_norm": 1.819381833076477, + "learning_rate": 4.952861157286062e-05, + "loss": 6.2041, + "step": 10411 + }, + { + "epoch": 0.06192311352174327, + "grad_norm": 2.16945743560791, + "learning_rate": 4.952852128972624e-05, + "loss": 5.7757, + "step": 10412 + }, + { + "epoch": 0.061929060805024264, + "grad_norm": 2.1671459674835205, + "learning_rate": 4.952843099802921e-05, + "loss": 5.5212, + "step": 10413 + }, + { + "epoch": 0.061935008088305266, + "grad_norm": 1.730073094367981, + "learning_rate": 4.952834069776956e-05, + "loss": 5.809, + "step": 10414 + }, + { + "epoch": 0.06194095537158626, + "grad_norm": 2.1048457622528076, + "learning_rate": 4.952825038894732e-05, + "loss": 5.7219, + "step": 10415 + }, + { + "epoch": 0.061946902654867256, + "grad_norm": 2.7438642978668213, + "learning_rate": 4.9528160071562516e-05, + "loss": 5.6367, + "step": 10416 + }, + { + "epoch": 0.06195284993814825, + "grad_norm": 2.0103960037231445, + "learning_rate": 4.952806974561518e-05, + "loss": 5.1429, + "step": 10417 + }, + { + "epoch": 0.06195879722142925, + "grad_norm": 2.1754884719848633, + "learning_rate": 4.9527979411105354e-05, + "loss": 5.9337, + "step": 10418 + }, + { + "epoch": 0.06196474450471025, + "grad_norm": 2.553421974182129, + "learning_rate": 4.9527889068033063e-05, + "loss": 5.7076, + "step": 10419 + }, + { + "epoch": 0.06197069178799124, + "grad_norm": 2.0601327419281006, + "learning_rate": 4.952779871639834e-05, + "loss": 5.7855, + "step": 10420 + }, + { + "epoch": 0.061976639071272245, + "grad_norm": 2.0958025455474854, + "learning_rate": 4.952770835620122e-05, + "loss": 5.8621, + "step": 10421 + }, + { + "epoch": 0.06198258635455324, + "grad_norm": 2.2658755779266357, + "learning_rate": 4.952761798744172e-05, + "loss": 5.9306, + "step": 10422 + }, + { + "epoch": 0.061988533637834235, + "grad_norm": 1.933090090751648, + "learning_rate": 4.9527527610119896e-05, + "loss": 5.1557, + "step": 10423 + }, + { + "epoch": 0.06199448092111524, + "grad_norm": 2.5761375427246094, + "learning_rate": 4.952743722423575e-05, + "loss": 5.4438, + "step": 10424 + }, + { + "epoch": 0.06200042820439623, + "grad_norm": 2.0499768257141113, + "learning_rate": 4.9527346829789344e-05, + "loss": 5.4153, + "step": 10425 + }, + { + "epoch": 0.06200637548767723, + "grad_norm": 1.970674991607666, + "learning_rate": 4.952725642678069e-05, + "loss": 5.8678, + "step": 10426 + }, + { + "epoch": 0.06201232277095823, + "grad_norm": 2.4563233852386475, + "learning_rate": 4.9527166015209814e-05, + "loss": 4.926, + "step": 10427 + }, + { + "epoch": 0.062018270054239225, + "grad_norm": 1.8380508422851562, + "learning_rate": 4.9527075595076763e-05, + "loss": 4.9619, + "step": 10428 + }, + { + "epoch": 0.06202421733752022, + "grad_norm": 1.8930846452713013, + "learning_rate": 4.9526985166381565e-05, + "loss": 4.8252, + "step": 10429 + }, + { + "epoch": 0.062030164620801215, + "grad_norm": 2.401026725769043, + "learning_rate": 4.952689472912426e-05, + "loss": 4.5023, + "step": 10430 + }, + { + "epoch": 0.06203611190408222, + "grad_norm": 2.2801949977874756, + "learning_rate": 4.952680428330486e-05, + "loss": 4.6461, + "step": 10431 + }, + { + "epoch": 0.06204205918736321, + "grad_norm": 2.2466189861297607, + "learning_rate": 4.95267138289234e-05, + "loss": 4.5946, + "step": 10432 + }, + { + "epoch": 0.06204800647064421, + "grad_norm": 2.1723902225494385, + "learning_rate": 4.952662336597993e-05, + "loss": 5.6417, + "step": 10433 + }, + { + "epoch": 0.06205395375392521, + "grad_norm": 1.9614545106887817, + "learning_rate": 4.952653289447446e-05, + "loss": 5.0758, + "step": 10434 + }, + { + "epoch": 0.062059901037206204, + "grad_norm": 2.465252637863159, + "learning_rate": 4.9526442414407036e-05, + "loss": 4.6159, + "step": 10435 + }, + { + "epoch": 0.0620658483204872, + "grad_norm": 2.2298080921173096, + "learning_rate": 4.9526351925777684e-05, + "loss": 5.24, + "step": 10436 + }, + { + "epoch": 0.0620717956037682, + "grad_norm": 2.1284472942352295, + "learning_rate": 4.952626142858643e-05, + "loss": 4.5255, + "step": 10437 + }, + { + "epoch": 0.062077742887049196, + "grad_norm": 2.1340067386627197, + "learning_rate": 4.9526170922833314e-05, + "loss": 4.5931, + "step": 10438 + }, + { + "epoch": 0.06208369017033019, + "grad_norm": 2.20354962348938, + "learning_rate": 4.952608040851837e-05, + "loss": 4.7688, + "step": 10439 + }, + { + "epoch": 0.06208963745361119, + "grad_norm": 1.5250015258789062, + "learning_rate": 4.952598988564162e-05, + "loss": 5.3292, + "step": 10440 + }, + { + "epoch": 0.06209558473689219, + "grad_norm": 2.1667168140411377, + "learning_rate": 4.95258993542031e-05, + "loss": 5.6216, + "step": 10441 + }, + { + "epoch": 0.062101532020173184, + "grad_norm": 1.8172663450241089, + "learning_rate": 4.9525808814202846e-05, + "loss": 5.5813, + "step": 10442 + }, + { + "epoch": 0.062107479303454186, + "grad_norm": 1.9832731485366821, + "learning_rate": 4.9525718265640884e-05, + "loss": 5.4444, + "step": 10443 + }, + { + "epoch": 0.06211342658673518, + "grad_norm": 2.051358699798584, + "learning_rate": 4.952562770851724e-05, + "loss": 5.3488, + "step": 10444 + }, + { + "epoch": 0.062119373870016176, + "grad_norm": 2.1487104892730713, + "learning_rate": 4.952553714283196e-05, + "loss": 5.3803, + "step": 10445 + }, + { + "epoch": 0.06212532115329717, + "grad_norm": 2.086853504180908, + "learning_rate": 4.952544656858507e-05, + "loss": 5.4585, + "step": 10446 + }, + { + "epoch": 0.06213126843657817, + "grad_norm": 2.1599764823913574, + "learning_rate": 4.95253559857766e-05, + "loss": 5.3728, + "step": 10447 + }, + { + "epoch": 0.06213721571985917, + "grad_norm": 1.877626657485962, + "learning_rate": 4.9525265394406576e-05, + "loss": 5.433, + "step": 10448 + }, + { + "epoch": 0.06214316300314016, + "grad_norm": 2.022185802459717, + "learning_rate": 4.952517479447504e-05, + "loss": 5.6472, + "step": 10449 + }, + { + "epoch": 0.062149110286421165, + "grad_norm": 2.1667773723602295, + "learning_rate": 4.9525084185982015e-05, + "loss": 5.3174, + "step": 10450 + }, + { + "epoch": 0.06215505756970216, + "grad_norm": 1.6227883100509644, + "learning_rate": 4.952499356892753e-05, + "loss": 5.3747, + "step": 10451 + }, + { + "epoch": 0.062161004852983155, + "grad_norm": 1.935307502746582, + "learning_rate": 4.952490294331164e-05, + "loss": 5.7716, + "step": 10452 + }, + { + "epoch": 0.06216695213626416, + "grad_norm": 2.6584694385528564, + "learning_rate": 4.952481230913435e-05, + "loss": 5.3525, + "step": 10453 + }, + { + "epoch": 0.06217289941954515, + "grad_norm": 2.626344919204712, + "learning_rate": 4.9524721666395705e-05, + "loss": 5.2118, + "step": 10454 + }, + { + "epoch": 0.06217884670282615, + "grad_norm": 2.525580644607544, + "learning_rate": 4.9524631015095735e-05, + "loss": 5.1231, + "step": 10455 + }, + { + "epoch": 0.06218479398610715, + "grad_norm": 2.274801015853882, + "learning_rate": 4.9524540355234464e-05, + "loss": 5.0637, + "step": 10456 + }, + { + "epoch": 0.062190741269388145, + "grad_norm": 1.9937769174575806, + "learning_rate": 4.952444968681193e-05, + "loss": 5.8196, + "step": 10457 + }, + { + "epoch": 0.06219668855266914, + "grad_norm": 2.124290943145752, + "learning_rate": 4.952435900982816e-05, + "loss": 5.5221, + "step": 10458 + }, + { + "epoch": 0.062202635835950135, + "grad_norm": 2.2544684410095215, + "learning_rate": 4.95242683242832e-05, + "loss": 5.6656, + "step": 10459 + }, + { + "epoch": 0.06220858311923114, + "grad_norm": 2.2626397609710693, + "learning_rate": 4.952417763017706e-05, + "loss": 5.5836, + "step": 10460 + }, + { + "epoch": 0.06221453040251213, + "grad_norm": 1.9299595355987549, + "learning_rate": 4.9524086927509796e-05, + "loss": 5.6637, + "step": 10461 + }, + { + "epoch": 0.06222047768579313, + "grad_norm": 1.769463062286377, + "learning_rate": 4.952399621628142e-05, + "loss": 5.4836, + "step": 10462 + }, + { + "epoch": 0.06222642496907413, + "grad_norm": 1.6773936748504639, + "learning_rate": 4.952390549649196e-05, + "loss": 5.2894, + "step": 10463 + }, + { + "epoch": 0.062232372252355124, + "grad_norm": 1.7612723112106323, + "learning_rate": 4.952381476814148e-05, + "loss": 5.5438, + "step": 10464 + }, + { + "epoch": 0.06223831953563612, + "grad_norm": 2.5255069732666016, + "learning_rate": 4.952372403122997e-05, + "loss": 5.7864, + "step": 10465 + }, + { + "epoch": 0.06224426681891712, + "grad_norm": 2.1128363609313965, + "learning_rate": 4.9523633285757486e-05, + "loss": 5.6207, + "step": 10466 + }, + { + "epoch": 0.062250214102198116, + "grad_norm": 1.8612544536590576, + "learning_rate": 4.952354253172407e-05, + "loss": 5.9177, + "step": 10467 + }, + { + "epoch": 0.06225616138547911, + "grad_norm": 2.092707633972168, + "learning_rate": 4.9523451769129715e-05, + "loss": 5.6047, + "step": 10468 + }, + { + "epoch": 0.06226210866876011, + "grad_norm": 2.6695668697357178, + "learning_rate": 4.952336099797449e-05, + "loss": 5.4931, + "step": 10469 + }, + { + "epoch": 0.06226805595204111, + "grad_norm": 2.2714614868164062, + "learning_rate": 4.9523270218258414e-05, + "loss": 5.4481, + "step": 10470 + }, + { + "epoch": 0.0622740032353221, + "grad_norm": 2.035304307937622, + "learning_rate": 4.952317942998151e-05, + "loss": 5.3609, + "step": 10471 + }, + { + "epoch": 0.062279950518603105, + "grad_norm": 2.295647144317627, + "learning_rate": 4.952308863314382e-05, + "loss": 5.5687, + "step": 10472 + }, + { + "epoch": 0.0622858978018841, + "grad_norm": 1.8365178108215332, + "learning_rate": 4.9522997827745375e-05, + "loss": 5.4207, + "step": 10473 + }, + { + "epoch": 0.062291845085165096, + "grad_norm": 1.6130415201187134, + "learning_rate": 4.9522907013786206e-05, + "loss": 5.1894, + "step": 10474 + }, + { + "epoch": 0.06229779236844609, + "grad_norm": 2.01560115814209, + "learning_rate": 4.952281619126634e-05, + "loss": 5.4956, + "step": 10475 + }, + { + "epoch": 0.06230373965172709, + "grad_norm": 2.7854549884796143, + "learning_rate": 4.952272536018582e-05, + "loss": 5.2341, + "step": 10476 + }, + { + "epoch": 0.06230968693500809, + "grad_norm": 2.7532944679260254, + "learning_rate": 4.9522634520544666e-05, + "loss": 5.1863, + "step": 10477 + }, + { + "epoch": 0.06231563421828908, + "grad_norm": 2.193084239959717, + "learning_rate": 4.952254367234291e-05, + "loss": 5.5187, + "step": 10478 + }, + { + "epoch": 0.062321581501570085, + "grad_norm": 2.245664119720459, + "learning_rate": 4.952245281558059e-05, + "loss": 5.1275, + "step": 10479 + }, + { + "epoch": 0.06232752878485108, + "grad_norm": 2.0522654056549072, + "learning_rate": 4.9522361950257734e-05, + "loss": 5.2887, + "step": 10480 + }, + { + "epoch": 0.062333476068132075, + "grad_norm": 2.132280111312866, + "learning_rate": 4.952227107637437e-05, + "loss": 5.8767, + "step": 10481 + }, + { + "epoch": 0.06233942335141308, + "grad_norm": 2.155574083328247, + "learning_rate": 4.952218019393055e-05, + "loss": 5.9499, + "step": 10482 + }, + { + "epoch": 0.06234537063469407, + "grad_norm": 2.3979780673980713, + "learning_rate": 4.952208930292627e-05, + "loss": 5.7622, + "step": 10483 + }, + { + "epoch": 0.06235131791797507, + "grad_norm": 2.444812297821045, + "learning_rate": 4.9521998403361595e-05, + "loss": 5.3332, + "step": 10484 + }, + { + "epoch": 0.06235726520125607, + "grad_norm": 2.369248867034912, + "learning_rate": 4.952190749523654e-05, + "loss": 5.109, + "step": 10485 + }, + { + "epoch": 0.062363212484537064, + "grad_norm": 1.9160844087600708, + "learning_rate": 4.952181657855114e-05, + "loss": 5.1783, + "step": 10486 + }, + { + "epoch": 0.06236915976781806, + "grad_norm": 2.1532788276672363, + "learning_rate": 4.952172565330543e-05, + "loss": 5.913, + "step": 10487 + }, + { + "epoch": 0.062375107051099055, + "grad_norm": 2.132382392883301, + "learning_rate": 4.9521634719499435e-05, + "loss": 5.7748, + "step": 10488 + }, + { + "epoch": 0.06238105433438006, + "grad_norm": 2.22267484664917, + "learning_rate": 4.9521543777133194e-05, + "loss": 5.6464, + "step": 10489 + }, + { + "epoch": 0.06238700161766105, + "grad_norm": 2.0619423389434814, + "learning_rate": 4.952145282620674e-05, + "loss": 5.4881, + "step": 10490 + }, + { + "epoch": 0.06239294890094205, + "grad_norm": 2.9574310779571533, + "learning_rate": 4.952136186672009e-05, + "loss": 5.4401, + "step": 10491 + }, + { + "epoch": 0.06239889618422305, + "grad_norm": 1.7362775802612305, + "learning_rate": 4.952127089867329e-05, + "loss": 6.0755, + "step": 10492 + }, + { + "epoch": 0.062404843467504044, + "grad_norm": 1.8244996070861816, + "learning_rate": 4.952117992206637e-05, + "loss": 6.2588, + "step": 10493 + }, + { + "epoch": 0.06241079075078504, + "grad_norm": 1.8556538820266724, + "learning_rate": 4.952108893689936e-05, + "loss": 6.0827, + "step": 10494 + }, + { + "epoch": 0.06241673803406604, + "grad_norm": 2.2471442222595215, + "learning_rate": 4.9520997943172285e-05, + "loss": 5.98, + "step": 10495 + }, + { + "epoch": 0.062422685317347036, + "grad_norm": 3.0217249393463135, + "learning_rate": 4.9520906940885186e-05, + "loss": 5.5116, + "step": 10496 + }, + { + "epoch": 0.06242863260062803, + "grad_norm": 2.02962064743042, + "learning_rate": 4.9520815930038086e-05, + "loss": 5.9341, + "step": 10497 + }, + { + "epoch": 0.06243457988390903, + "grad_norm": 1.6286019086837769, + "learning_rate": 4.9520724910631034e-05, + "loss": 5.1944, + "step": 10498 + }, + { + "epoch": 0.06244052716719003, + "grad_norm": 1.9963330030441284, + "learning_rate": 4.9520633882664044e-05, + "loss": 6.0584, + "step": 10499 + }, + { + "epoch": 0.06244647445047102, + "grad_norm": 1.884988784790039, + "learning_rate": 4.9520542846137155e-05, + "loss": 6.2744, + "step": 10500 + }, + { + "epoch": 0.062452421733752025, + "grad_norm": 1.9402821063995361, + "learning_rate": 4.95204518010504e-05, + "loss": 5.9201, + "step": 10501 + }, + { + "epoch": 0.06245836901703302, + "grad_norm": 1.9304310083389282, + "learning_rate": 4.9520360747403805e-05, + "loss": 5.7227, + "step": 10502 + }, + { + "epoch": 0.062464316300314016, + "grad_norm": 2.8199663162231445, + "learning_rate": 4.9520269685197405e-05, + "loss": 6.4819, + "step": 10503 + }, + { + "epoch": 0.06247026358359501, + "grad_norm": 1.456852912902832, + "learning_rate": 4.9520178614431236e-05, + "loss": 5.3169, + "step": 10504 + }, + { + "epoch": 0.06247621086687601, + "grad_norm": 2.3753762245178223, + "learning_rate": 4.9520087535105324e-05, + "loss": 5.9817, + "step": 10505 + }, + { + "epoch": 0.06248215815015701, + "grad_norm": 2.329932928085327, + "learning_rate": 4.951999644721971e-05, + "loss": 6.0266, + "step": 10506 + }, + { + "epoch": 0.062488105433438, + "grad_norm": 1.772615671157837, + "learning_rate": 4.951990535077441e-05, + "loss": 5.2548, + "step": 10507 + }, + { + "epoch": 0.062494052716719005, + "grad_norm": 2.1240997314453125, + "learning_rate": 4.951981424576946e-05, + "loss": 5.3991, + "step": 10508 + }, + { + "epoch": 0.0625, + "grad_norm": 1.7283856868743896, + "learning_rate": 4.9519723132204905e-05, + "loss": 5.2065, + "step": 10509 + }, + { + "epoch": 0.062505947283281, + "grad_norm": 2.197404384613037, + "learning_rate": 4.951963201008076e-05, + "loss": 5.7282, + "step": 10510 + }, + { + "epoch": 0.06251189456656199, + "grad_norm": 1.8550727367401123, + "learning_rate": 4.9519540879397075e-05, + "loss": 6.0125, + "step": 10511 + }, + { + "epoch": 0.06251784184984299, + "grad_norm": 1.5998154878616333, + "learning_rate": 4.951944974015387e-05, + "loss": 5.9371, + "step": 10512 + }, + { + "epoch": 0.062523789133124, + "grad_norm": 1.644454836845398, + "learning_rate": 4.951935859235117e-05, + "loss": 5.9315, + "step": 10513 + }, + { + "epoch": 0.06252973641640498, + "grad_norm": 1.9119540452957153, + "learning_rate": 4.951926743598902e-05, + "loss": 5.7104, + "step": 10514 + }, + { + "epoch": 0.06253568369968598, + "grad_norm": 1.8863649368286133, + "learning_rate": 4.951917627106745e-05, + "loss": 5.8639, + "step": 10515 + }, + { + "epoch": 0.06254163098296699, + "grad_norm": 2.1626899242401123, + "learning_rate": 4.951908509758648e-05, + "loss": 5.9727, + "step": 10516 + }, + { + "epoch": 0.06254757826624797, + "grad_norm": 1.9397778511047363, + "learning_rate": 4.9518993915546155e-05, + "loss": 5.9771, + "step": 10517 + }, + { + "epoch": 0.06255352554952898, + "grad_norm": 1.7723463773727417, + "learning_rate": 4.951890272494651e-05, + "loss": 5.8684, + "step": 10518 + }, + { + "epoch": 0.06255947283280998, + "grad_norm": 1.9191977977752686, + "learning_rate": 4.9518811525787565e-05, + "loss": 5.7242, + "step": 10519 + }, + { + "epoch": 0.06256542011609097, + "grad_norm": 1.7599314451217651, + "learning_rate": 4.951872031806935e-05, + "loss": 5.5234, + "step": 10520 + }, + { + "epoch": 0.06257136739937197, + "grad_norm": 1.6560989618301392, + "learning_rate": 4.951862910179191e-05, + "loss": 5.5907, + "step": 10521 + }, + { + "epoch": 0.06257731468265297, + "grad_norm": 1.9756556749343872, + "learning_rate": 4.9518537876955265e-05, + "loss": 6.0013, + "step": 10522 + }, + { + "epoch": 0.06258326196593396, + "grad_norm": 1.9012173414230347, + "learning_rate": 4.9518446643559454e-05, + "loss": 5.8073, + "step": 10523 + }, + { + "epoch": 0.06258920924921496, + "grad_norm": 1.8992196321487427, + "learning_rate": 4.951835540160451e-05, + "loss": 5.8571, + "step": 10524 + }, + { + "epoch": 0.06259515653249595, + "grad_norm": 1.8002395629882812, + "learning_rate": 4.9518264151090455e-05, + "loss": 5.7798, + "step": 10525 + }, + { + "epoch": 0.06260110381577695, + "grad_norm": 1.732063889503479, + "learning_rate": 4.9518172892017335e-05, + "loss": 5.8167, + "step": 10526 + }, + { + "epoch": 0.06260705109905795, + "grad_norm": 1.6961164474487305, + "learning_rate": 4.951808162438517e-05, + "loss": 5.8797, + "step": 10527 + }, + { + "epoch": 0.06261299838233894, + "grad_norm": 1.904102087020874, + "learning_rate": 4.9517990348193996e-05, + "loss": 5.7109, + "step": 10528 + }, + { + "epoch": 0.06261894566561994, + "grad_norm": 1.6908652782440186, + "learning_rate": 4.951789906344384e-05, + "loss": 5.8435, + "step": 10529 + }, + { + "epoch": 0.06262489294890095, + "grad_norm": 1.8550028800964355, + "learning_rate": 4.951780777013475e-05, + "loss": 5.6218, + "step": 10530 + }, + { + "epoch": 0.06263084023218193, + "grad_norm": 1.7106919288635254, + "learning_rate": 4.951771646826674e-05, + "loss": 5.6668, + "step": 10531 + }, + { + "epoch": 0.06263678751546294, + "grad_norm": 1.5522899627685547, + "learning_rate": 4.951762515783984e-05, + "loss": 5.418, + "step": 10532 + }, + { + "epoch": 0.06264273479874394, + "grad_norm": 1.7510137557983398, + "learning_rate": 4.9517533838854104e-05, + "loss": 5.6595, + "step": 10533 + }, + { + "epoch": 0.06264868208202493, + "grad_norm": 2.1222739219665527, + "learning_rate": 4.9517442511309544e-05, + "loss": 6.0008, + "step": 10534 + }, + { + "epoch": 0.06265462936530593, + "grad_norm": 1.977807641029358, + "learning_rate": 4.95173511752062e-05, + "loss": 5.8263, + "step": 10535 + }, + { + "epoch": 0.06266057664858693, + "grad_norm": 1.6423957347869873, + "learning_rate": 4.9517259830544105e-05, + "loss": 6.2078, + "step": 10536 + }, + { + "epoch": 0.06266652393186792, + "grad_norm": 1.9365674257278442, + "learning_rate": 4.9517168477323286e-05, + "loss": 6.0972, + "step": 10537 + }, + { + "epoch": 0.06267247121514892, + "grad_norm": 1.6738137006759644, + "learning_rate": 4.951707711554377e-05, + "loss": 5.7439, + "step": 10538 + }, + { + "epoch": 0.06267841849842992, + "grad_norm": 2.4281718730926514, + "learning_rate": 4.95169857452056e-05, + "loss": 5.4822, + "step": 10539 + }, + { + "epoch": 0.06268436578171091, + "grad_norm": 2.53411602973938, + "learning_rate": 4.951689436630881e-05, + "loss": 5.4883, + "step": 10540 + }, + { + "epoch": 0.06269031306499191, + "grad_norm": 2.116520643234253, + "learning_rate": 4.951680297885342e-05, + "loss": 5.6123, + "step": 10541 + }, + { + "epoch": 0.06269626034827291, + "grad_norm": 1.8546512126922607, + "learning_rate": 4.951671158283946e-05, + "loss": 5.443, + "step": 10542 + }, + { + "epoch": 0.0627022076315539, + "grad_norm": 2.0048365592956543, + "learning_rate": 4.9516620178266975e-05, + "loss": 5.7759, + "step": 10543 + }, + { + "epoch": 0.0627081549148349, + "grad_norm": 1.6800916194915771, + "learning_rate": 4.9516528765136e-05, + "loss": 5.6767, + "step": 10544 + }, + { + "epoch": 0.0627141021981159, + "grad_norm": 1.7444523572921753, + "learning_rate": 4.9516437343446544e-05, + "loss": 5.297, + "step": 10545 + }, + { + "epoch": 0.0627200494813969, + "grad_norm": 1.8653407096862793, + "learning_rate": 4.951634591319866e-05, + "loss": 5.6999, + "step": 10546 + }, + { + "epoch": 0.0627259967646779, + "grad_norm": 1.7988131046295166, + "learning_rate": 4.9516254474392376e-05, + "loss": 5.5244, + "step": 10547 + }, + { + "epoch": 0.0627319440479589, + "grad_norm": 1.7915012836456299, + "learning_rate": 4.951616302702772e-05, + "loss": 5.6766, + "step": 10548 + }, + { + "epoch": 0.06273789133123989, + "grad_norm": 1.8351629972457886, + "learning_rate": 4.951607157110471e-05, + "loss": 5.6332, + "step": 10549 + }, + { + "epoch": 0.06274383861452089, + "grad_norm": 1.6819947957992554, + "learning_rate": 4.951598010662341e-05, + "loss": 5.5773, + "step": 10550 + }, + { + "epoch": 0.06274978589780189, + "grad_norm": 2.2969119548797607, + "learning_rate": 4.951588863358383e-05, + "loss": 5.6847, + "step": 10551 + }, + { + "epoch": 0.06275573318108288, + "grad_norm": 2.346092939376831, + "learning_rate": 4.951579715198601e-05, + "loss": 5.404, + "step": 10552 + }, + { + "epoch": 0.06276168046436388, + "grad_norm": 1.8255709409713745, + "learning_rate": 4.951570566182997e-05, + "loss": 5.9009, + "step": 10553 + }, + { + "epoch": 0.06276762774764487, + "grad_norm": 2.4000492095947266, + "learning_rate": 4.951561416311575e-05, + "loss": 5.4395, + "step": 10554 + }, + { + "epoch": 0.06277357503092587, + "grad_norm": 2.1519010066986084, + "learning_rate": 4.951552265584339e-05, + "loss": 5.6447, + "step": 10555 + }, + { + "epoch": 0.06277952231420687, + "grad_norm": 1.7821810245513916, + "learning_rate": 4.9515431140012915e-05, + "loss": 5.3495, + "step": 10556 + }, + { + "epoch": 0.06278546959748786, + "grad_norm": 1.8359061479568481, + "learning_rate": 4.9515339615624356e-05, + "loss": 5.7258, + "step": 10557 + }, + { + "epoch": 0.06279141688076886, + "grad_norm": 1.899970293045044, + "learning_rate": 4.951524808267774e-05, + "loss": 5.9683, + "step": 10558 + }, + { + "epoch": 0.06279736416404987, + "grad_norm": 1.6407743692398071, + "learning_rate": 4.951515654117311e-05, + "loss": 6.001, + "step": 10559 + }, + { + "epoch": 0.06280331144733085, + "grad_norm": 1.5474567413330078, + "learning_rate": 4.9515064991110485e-05, + "loss": 5.673, + "step": 10560 + }, + { + "epoch": 0.06280925873061186, + "grad_norm": 1.7129321098327637, + "learning_rate": 4.951497343248991e-05, + "loss": 5.7232, + "step": 10561 + }, + { + "epoch": 0.06281520601389286, + "grad_norm": 1.948367953300476, + "learning_rate": 4.95148818653114e-05, + "loss": 5.9378, + "step": 10562 + }, + { + "epoch": 0.06282115329717385, + "grad_norm": 1.788724422454834, + "learning_rate": 4.951479028957501e-05, + "loss": 5.9077, + "step": 10563 + }, + { + "epoch": 0.06282710058045485, + "grad_norm": 1.7036423683166504, + "learning_rate": 4.951469870528076e-05, + "loss": 5.7688, + "step": 10564 + }, + { + "epoch": 0.06283304786373585, + "grad_norm": 1.6055458784103394, + "learning_rate": 4.9514607112428676e-05, + "loss": 5.7234, + "step": 10565 + }, + { + "epoch": 0.06283899514701684, + "grad_norm": 1.9353829622268677, + "learning_rate": 4.95145155110188e-05, + "loss": 6.1046, + "step": 10566 + }, + { + "epoch": 0.06284494243029784, + "grad_norm": 1.6070129871368408, + "learning_rate": 4.9514423901051157e-05, + "loss": 5.7379, + "step": 10567 + }, + { + "epoch": 0.06285088971357884, + "grad_norm": 1.447828769683838, + "learning_rate": 4.951433228252579e-05, + "loss": 5.2944, + "step": 10568 + }, + { + "epoch": 0.06285683699685983, + "grad_norm": 2.5256540775299072, + "learning_rate": 4.951424065544271e-05, + "loss": 5.1358, + "step": 10569 + }, + { + "epoch": 0.06286278428014083, + "grad_norm": 2.29848051071167, + "learning_rate": 4.951414901980197e-05, + "loss": 5.1967, + "step": 10570 + }, + { + "epoch": 0.06286873156342183, + "grad_norm": 1.9477180242538452, + "learning_rate": 4.951405737560359e-05, + "loss": 5.7509, + "step": 10571 + }, + { + "epoch": 0.06287467884670282, + "grad_norm": 1.9303146600723267, + "learning_rate": 4.951396572284761e-05, + "loss": 5.7052, + "step": 10572 + }, + { + "epoch": 0.06288062612998382, + "grad_norm": 1.5632199048995972, + "learning_rate": 4.951387406153405e-05, + "loss": 5.5001, + "step": 10573 + }, + { + "epoch": 0.06288657341326483, + "grad_norm": 1.6798962354660034, + "learning_rate": 4.951378239166296e-05, + "loss": 5.5537, + "step": 10574 + }, + { + "epoch": 0.06289252069654581, + "grad_norm": 1.7395051717758179, + "learning_rate": 4.9513690713234355e-05, + "loss": 5.736, + "step": 10575 + }, + { + "epoch": 0.06289846797982682, + "grad_norm": 1.726020097732544, + "learning_rate": 4.951359902624828e-05, + "loss": 5.6802, + "step": 10576 + }, + { + "epoch": 0.06290441526310782, + "grad_norm": 1.8063993453979492, + "learning_rate": 4.9513507330704755e-05, + "loss": 5.6077, + "step": 10577 + }, + { + "epoch": 0.0629103625463888, + "grad_norm": 1.6284246444702148, + "learning_rate": 4.951341562660382e-05, + "loss": 5.8327, + "step": 10578 + }, + { + "epoch": 0.06291630982966981, + "grad_norm": 2.635869026184082, + "learning_rate": 4.95133239139455e-05, + "loss": 5.8252, + "step": 10579 + }, + { + "epoch": 0.06292225711295081, + "grad_norm": 2.5127367973327637, + "learning_rate": 4.9513232192729845e-05, + "loss": 5.7431, + "step": 10580 + }, + { + "epoch": 0.0629282043962318, + "grad_norm": 2.0740721225738525, + "learning_rate": 4.951314046295686e-05, + "loss": 5.4582, + "step": 10581 + }, + { + "epoch": 0.0629341516795128, + "grad_norm": 2.32232666015625, + "learning_rate": 4.95130487246266e-05, + "loss": 5.2523, + "step": 10582 + }, + { + "epoch": 0.06294009896279379, + "grad_norm": 2.164407730102539, + "learning_rate": 4.951295697773908e-05, + "loss": 5.6436, + "step": 10583 + }, + { + "epoch": 0.06294604624607479, + "grad_norm": 1.7207856178283691, + "learning_rate": 4.951286522229435e-05, + "loss": 5.5333, + "step": 10584 + }, + { + "epoch": 0.0629519935293558, + "grad_norm": 2.025470733642578, + "learning_rate": 4.951277345829242e-05, + "loss": 5.5041, + "step": 10585 + }, + { + "epoch": 0.06295794081263678, + "grad_norm": 1.9415414333343506, + "learning_rate": 4.951268168573334e-05, + "loss": 5.2148, + "step": 10586 + }, + { + "epoch": 0.06296388809591778, + "grad_norm": 1.9229072332382202, + "learning_rate": 4.9512589904617135e-05, + "loss": 5.1461, + "step": 10587 + }, + { + "epoch": 0.06296983537919879, + "grad_norm": 2.414041757583618, + "learning_rate": 4.951249811494384e-05, + "loss": 5.5023, + "step": 10588 + }, + { + "epoch": 0.06297578266247977, + "grad_norm": 2.49826979637146, + "learning_rate": 4.9512406316713486e-05, + "loss": 5.3566, + "step": 10589 + }, + { + "epoch": 0.06298172994576078, + "grad_norm": 1.7222081422805786, + "learning_rate": 4.951231450992611e-05, + "loss": 5.3128, + "step": 10590 + }, + { + "epoch": 0.06298767722904178, + "grad_norm": 1.7181445360183716, + "learning_rate": 4.9512222694581725e-05, + "loss": 5.4598, + "step": 10591 + }, + { + "epoch": 0.06299362451232277, + "grad_norm": 1.547813892364502, + "learning_rate": 4.9512130870680385e-05, + "loss": 5.3997, + "step": 10592 + }, + { + "epoch": 0.06299957179560377, + "grad_norm": 1.6273536682128906, + "learning_rate": 4.95120390382221e-05, + "loss": 5.1668, + "step": 10593 + }, + { + "epoch": 0.06300551907888477, + "grad_norm": 1.6771745681762695, + "learning_rate": 4.9511947197206934e-05, + "loss": 5.2368, + "step": 10594 + }, + { + "epoch": 0.06301146636216576, + "grad_norm": 2.439664125442505, + "learning_rate": 4.951185534763489e-05, + "loss": 5.2178, + "step": 10595 + }, + { + "epoch": 0.06301741364544676, + "grad_norm": 2.194408655166626, + "learning_rate": 4.951176348950601e-05, + "loss": 5.3593, + "step": 10596 + }, + { + "epoch": 0.06302336092872776, + "grad_norm": 1.8977370262145996, + "learning_rate": 4.9511671622820334e-05, + "loss": 6.3141, + "step": 10597 + }, + { + "epoch": 0.06302930821200875, + "grad_norm": 1.9550800323486328, + "learning_rate": 4.951157974757789e-05, + "loss": 5.8944, + "step": 10598 + }, + { + "epoch": 0.06303525549528975, + "grad_norm": 1.764724612236023, + "learning_rate": 4.9511487863778693e-05, + "loss": 5.5796, + "step": 10599 + }, + { + "epoch": 0.06304120277857075, + "grad_norm": 1.7987425327301025, + "learning_rate": 4.951139597142279e-05, + "loss": 5.5231, + "step": 10600 + }, + { + "epoch": 0.06304715006185174, + "grad_norm": 1.495875358581543, + "learning_rate": 4.951130407051022e-05, + "loss": 5.5019, + "step": 10601 + }, + { + "epoch": 0.06305309734513274, + "grad_norm": 2.7586476802825928, + "learning_rate": 4.9511212161041e-05, + "loss": 5.7043, + "step": 10602 + }, + { + "epoch": 0.06305904462841375, + "grad_norm": 2.1746270656585693, + "learning_rate": 4.951112024301517e-05, + "loss": 5.351, + "step": 10603 + }, + { + "epoch": 0.06306499191169473, + "grad_norm": 1.8681105375289917, + "learning_rate": 4.951102831643277e-05, + "loss": 5.4847, + "step": 10604 + }, + { + "epoch": 0.06307093919497574, + "grad_norm": 1.772286057472229, + "learning_rate": 4.951093638129382e-05, + "loss": 5.767, + "step": 10605 + }, + { + "epoch": 0.06307688647825674, + "grad_norm": 1.847748875617981, + "learning_rate": 4.951084443759835e-05, + "loss": 5.7737, + "step": 10606 + }, + { + "epoch": 0.06308283376153773, + "grad_norm": 1.9219080209732056, + "learning_rate": 4.95107524853464e-05, + "loss": 5.9414, + "step": 10607 + }, + { + "epoch": 0.06308878104481873, + "grad_norm": 1.6497199535369873, + "learning_rate": 4.9510660524538e-05, + "loss": 5.7124, + "step": 10608 + }, + { + "epoch": 0.06309472832809973, + "grad_norm": 1.8772788047790527, + "learning_rate": 4.951056855517318e-05, + "loss": 5.6784, + "step": 10609 + }, + { + "epoch": 0.06310067561138072, + "grad_norm": 2.035104990005493, + "learning_rate": 4.951047657725197e-05, + "loss": 5.5975, + "step": 10610 + }, + { + "epoch": 0.06310662289466172, + "grad_norm": 2.000922918319702, + "learning_rate": 4.9510384590774414e-05, + "loss": 5.2133, + "step": 10611 + }, + { + "epoch": 0.06311257017794271, + "grad_norm": 2.2581655979156494, + "learning_rate": 4.9510292595740536e-05, + "loss": 5.468, + "step": 10612 + }, + { + "epoch": 0.06311851746122371, + "grad_norm": 2.0332419872283936, + "learning_rate": 4.9510200592150365e-05, + "loss": 5.4923, + "step": 10613 + }, + { + "epoch": 0.06312446474450471, + "grad_norm": 1.9499238729476929, + "learning_rate": 4.9510108580003934e-05, + "loss": 5.5535, + "step": 10614 + }, + { + "epoch": 0.0631304120277857, + "grad_norm": 2.017491579055786, + "learning_rate": 4.951001655930128e-05, + "loss": 5.3771, + "step": 10615 + }, + { + "epoch": 0.0631363593110667, + "grad_norm": 2.355508804321289, + "learning_rate": 4.950992453004243e-05, + "loss": 5.0035, + "step": 10616 + }, + { + "epoch": 0.0631423065943477, + "grad_norm": 2.0470683574676514, + "learning_rate": 4.9509832492227426e-05, + "loss": 5.6073, + "step": 10617 + }, + { + "epoch": 0.0631482538776287, + "grad_norm": 1.7955858707427979, + "learning_rate": 4.9509740445856284e-05, + "loss": 5.8097, + "step": 10618 + }, + { + "epoch": 0.0631542011609097, + "grad_norm": 2.0126395225524902, + "learning_rate": 4.9509648390929045e-05, + "loss": 5.5989, + "step": 10619 + }, + { + "epoch": 0.0631601484441907, + "grad_norm": 1.8632375001907349, + "learning_rate": 4.950955632744575e-05, + "loss": 5.5585, + "step": 10620 + }, + { + "epoch": 0.06316609572747169, + "grad_norm": 2.2190446853637695, + "learning_rate": 4.950946425540641e-05, + "loss": 5.5182, + "step": 10621 + }, + { + "epoch": 0.06317204301075269, + "grad_norm": 2.082871675491333, + "learning_rate": 4.9509372174811074e-05, + "loss": 5.7849, + "step": 10622 + }, + { + "epoch": 0.06317799029403369, + "grad_norm": 2.17744517326355, + "learning_rate": 4.9509280085659774e-05, + "loss": 5.2332, + "step": 10623 + }, + { + "epoch": 0.06318393757731468, + "grad_norm": 1.7662746906280518, + "learning_rate": 4.950918798795253e-05, + "loss": 5.4136, + "step": 10624 + }, + { + "epoch": 0.06318988486059568, + "grad_norm": 1.6879531145095825, + "learning_rate": 4.950909588168939e-05, + "loss": 5.3747, + "step": 10625 + }, + { + "epoch": 0.06319583214387668, + "grad_norm": 2.0174877643585205, + "learning_rate": 4.950900376687038e-05, + "loss": 5.2927, + "step": 10626 + }, + { + "epoch": 0.06320177942715767, + "grad_norm": 1.9052749872207642, + "learning_rate": 4.950891164349552e-05, + "loss": 5.1492, + "step": 10627 + }, + { + "epoch": 0.06320772671043867, + "grad_norm": 1.7647850513458252, + "learning_rate": 4.950881951156485e-05, + "loss": 5.4182, + "step": 10628 + }, + { + "epoch": 0.06321367399371967, + "grad_norm": 1.9794502258300781, + "learning_rate": 4.950872737107841e-05, + "loss": 5.3838, + "step": 10629 + }, + { + "epoch": 0.06321962127700066, + "grad_norm": 2.3403780460357666, + "learning_rate": 4.950863522203623e-05, + "loss": 5.4542, + "step": 10630 + }, + { + "epoch": 0.06322556856028166, + "grad_norm": 1.8747358322143555, + "learning_rate": 4.9508543064438336e-05, + "loss": 5.4949, + "step": 10631 + }, + { + "epoch": 0.06323151584356267, + "grad_norm": 1.9435046911239624, + "learning_rate": 4.950845089828476e-05, + "loss": 5.6136, + "step": 10632 + }, + { + "epoch": 0.06323746312684365, + "grad_norm": 2.095583438873291, + "learning_rate": 4.9508358723575544e-05, + "loss": 5.2864, + "step": 10633 + }, + { + "epoch": 0.06324341041012466, + "grad_norm": 1.8254145383834839, + "learning_rate": 4.9508266540310705e-05, + "loss": 5.4732, + "step": 10634 + }, + { + "epoch": 0.06324935769340566, + "grad_norm": 2.303638458251953, + "learning_rate": 4.950817434849029e-05, + "loss": 5.1501, + "step": 10635 + }, + { + "epoch": 0.06325530497668665, + "grad_norm": 2.5389420986175537, + "learning_rate": 4.950808214811432e-05, + "loss": 5.0723, + "step": 10636 + }, + { + "epoch": 0.06326125225996765, + "grad_norm": 2.1702539920806885, + "learning_rate": 4.950798993918283e-05, + "loss": 4.8838, + "step": 10637 + }, + { + "epoch": 0.06326719954324865, + "grad_norm": 1.921650767326355, + "learning_rate": 4.9507897721695855e-05, + "loss": 5.9958, + "step": 10638 + }, + { + "epoch": 0.06327314682652964, + "grad_norm": 2.2247352600097656, + "learning_rate": 4.950780549565343e-05, + "loss": 4.9319, + "step": 10639 + }, + { + "epoch": 0.06327909410981064, + "grad_norm": 2.3517649173736572, + "learning_rate": 4.950771326105558e-05, + "loss": 4.6033, + "step": 10640 + }, + { + "epoch": 0.06328504139309163, + "grad_norm": 2.053856134414673, + "learning_rate": 4.950762101790234e-05, + "loss": 4.3799, + "step": 10641 + }, + { + "epoch": 0.06329098867637263, + "grad_norm": 1.8055500984191895, + "learning_rate": 4.9507528766193746e-05, + "loss": 5.244, + "step": 10642 + }, + { + "epoch": 0.06329693595965363, + "grad_norm": 2.0694682598114014, + "learning_rate": 4.950743650592983e-05, + "loss": 5.1965, + "step": 10643 + }, + { + "epoch": 0.06330288324293462, + "grad_norm": 2.027399778366089, + "learning_rate": 4.950734423711061e-05, + "loss": 4.5576, + "step": 10644 + }, + { + "epoch": 0.06330883052621562, + "grad_norm": 2.22308087348938, + "learning_rate": 4.950725195973614e-05, + "loss": 4.4679, + "step": 10645 + }, + { + "epoch": 0.06331477780949663, + "grad_norm": 2.1807515621185303, + "learning_rate": 4.9507159673806436e-05, + "loss": 4.6147, + "step": 10646 + }, + { + "epoch": 0.06332072509277761, + "grad_norm": 2.0173258781433105, + "learning_rate": 4.9507067379321536e-05, + "loss": 4.5657, + "step": 10647 + }, + { + "epoch": 0.06332667237605862, + "grad_norm": 1.832610845565796, + "learning_rate": 4.9506975076281474e-05, + "loss": 4.7433, + "step": 10648 + }, + { + "epoch": 0.06333261965933962, + "grad_norm": 2.027352809906006, + "learning_rate": 4.950688276468628e-05, + "loss": 5.0426, + "step": 10649 + }, + { + "epoch": 0.0633385669426206, + "grad_norm": 1.856307864189148, + "learning_rate": 4.950679044453599e-05, + "loss": 5.2838, + "step": 10650 + }, + { + "epoch": 0.06334451422590161, + "grad_norm": 2.0875375270843506, + "learning_rate": 4.950669811583062e-05, + "loss": 4.5728, + "step": 10651 + }, + { + "epoch": 0.06335046150918261, + "grad_norm": 2.1067941188812256, + "learning_rate": 4.950660577857023e-05, + "loss": 4.5313, + "step": 10652 + }, + { + "epoch": 0.0633564087924636, + "grad_norm": 2.1747500896453857, + "learning_rate": 4.9506513432754825e-05, + "loss": 4.432, + "step": 10653 + }, + { + "epoch": 0.0633623560757446, + "grad_norm": 1.769059181213379, + "learning_rate": 4.950642107838446e-05, + "loss": 5.4667, + "step": 10654 + }, + { + "epoch": 0.0633683033590256, + "grad_norm": 2.2065072059631348, + "learning_rate": 4.9506328715459146e-05, + "loss": 5.9873, + "step": 10655 + }, + { + "epoch": 0.06337425064230659, + "grad_norm": 1.679431438446045, + "learning_rate": 4.950623634397893e-05, + "loss": 5.851, + "step": 10656 + }, + { + "epoch": 0.06338019792558759, + "grad_norm": 1.919668197631836, + "learning_rate": 4.950614396394384e-05, + "loss": 5.8613, + "step": 10657 + }, + { + "epoch": 0.0633861452088686, + "grad_norm": 1.5296612977981567, + "learning_rate": 4.9506051575353915e-05, + "loss": 5.7067, + "step": 10658 + }, + { + "epoch": 0.06339209249214958, + "grad_norm": 2.1283507347106934, + "learning_rate": 4.950595917820917e-05, + "loss": 5.1141, + "step": 10659 + }, + { + "epoch": 0.06339803977543058, + "grad_norm": 1.7011604309082031, + "learning_rate": 4.950586677250966e-05, + "loss": 6.0463, + "step": 10660 + }, + { + "epoch": 0.06340398705871159, + "grad_norm": 1.7479497194290161, + "learning_rate": 4.9505774358255396e-05, + "loss": 5.8942, + "step": 10661 + }, + { + "epoch": 0.06340993434199257, + "grad_norm": 1.939471960067749, + "learning_rate": 4.950568193544642e-05, + "loss": 5.562, + "step": 10662 + }, + { + "epoch": 0.06341588162527358, + "grad_norm": 1.871993899345398, + "learning_rate": 4.9505589504082764e-05, + "loss": 5.746, + "step": 10663 + }, + { + "epoch": 0.06342182890855458, + "grad_norm": 2.173109292984009, + "learning_rate": 4.950549706416446e-05, + "loss": 5.5927, + "step": 10664 + }, + { + "epoch": 0.06342777619183557, + "grad_norm": 1.809971809387207, + "learning_rate": 4.950540461569154e-05, + "loss": 5.8983, + "step": 10665 + }, + { + "epoch": 0.06343372347511657, + "grad_norm": 1.6344120502471924, + "learning_rate": 4.950531215866404e-05, + "loss": 5.5301, + "step": 10666 + }, + { + "epoch": 0.06343967075839757, + "grad_norm": 2.080425500869751, + "learning_rate": 4.9505219693081985e-05, + "loss": 6.0214, + "step": 10667 + }, + { + "epoch": 0.06344561804167856, + "grad_norm": 1.9382790327072144, + "learning_rate": 4.9505127218945415e-05, + "loss": 5.676, + "step": 10668 + }, + { + "epoch": 0.06345156532495956, + "grad_norm": 1.6945782899856567, + "learning_rate": 4.9505034736254354e-05, + "loss": 5.9337, + "step": 10669 + }, + { + "epoch": 0.06345751260824055, + "grad_norm": 1.6129313707351685, + "learning_rate": 4.9504942245008836e-05, + "loss": 5.6561, + "step": 10670 + }, + { + "epoch": 0.06346345989152155, + "grad_norm": 2.002903461456299, + "learning_rate": 4.95048497452089e-05, + "loss": 5.6302, + "step": 10671 + }, + { + "epoch": 0.06346940717480255, + "grad_norm": 1.6016403436660767, + "learning_rate": 4.950475723685457e-05, + "loss": 5.8275, + "step": 10672 + }, + { + "epoch": 0.06347535445808354, + "grad_norm": 1.7645297050476074, + "learning_rate": 4.9504664719945895e-05, + "loss": 5.5541, + "step": 10673 + }, + { + "epoch": 0.06348130174136454, + "grad_norm": 1.9627439975738525, + "learning_rate": 4.950457219448288e-05, + "loss": 5.6425, + "step": 10674 + }, + { + "epoch": 0.06348724902464555, + "grad_norm": 1.6297314167022705, + "learning_rate": 4.950447966046558e-05, + "loss": 5.5735, + "step": 10675 + }, + { + "epoch": 0.06349319630792653, + "grad_norm": 1.7911304235458374, + "learning_rate": 4.9504387117894014e-05, + "loss": 5.7736, + "step": 10676 + }, + { + "epoch": 0.06349914359120754, + "grad_norm": 1.627543330192566, + "learning_rate": 4.950429456676823e-05, + "loss": 5.736, + "step": 10677 + }, + { + "epoch": 0.06350509087448854, + "grad_norm": 1.9574320316314697, + "learning_rate": 4.950420200708824e-05, + "loss": 5.365, + "step": 10678 + }, + { + "epoch": 0.06351103815776953, + "grad_norm": 1.7698450088500977, + "learning_rate": 4.950410943885408e-05, + "loss": 5.5742, + "step": 10679 + }, + { + "epoch": 0.06351698544105053, + "grad_norm": 1.7660366296768188, + "learning_rate": 4.9504016862065806e-05, + "loss": 5.9064, + "step": 10680 + }, + { + "epoch": 0.06352293272433153, + "grad_norm": 2.0279083251953125, + "learning_rate": 4.9503924276723425e-05, + "loss": 5.7938, + "step": 10681 + }, + { + "epoch": 0.06352888000761252, + "grad_norm": 2.101827621459961, + "learning_rate": 4.9503831682826974e-05, + "loss": 5.4898, + "step": 10682 + }, + { + "epoch": 0.06353482729089352, + "grad_norm": 2.04978084564209, + "learning_rate": 4.9503739080376486e-05, + "loss": 5.3753, + "step": 10683 + }, + { + "epoch": 0.06354077457417452, + "grad_norm": 1.8539999723434448, + "learning_rate": 4.950364646937201e-05, + "loss": 5.5575, + "step": 10684 + }, + { + "epoch": 0.06354672185745551, + "grad_norm": 2.077073097229004, + "learning_rate": 4.9503553849813556e-05, + "loss": 5.4628, + "step": 10685 + }, + { + "epoch": 0.06355266914073651, + "grad_norm": 1.8130167722702026, + "learning_rate": 4.950346122170116e-05, + "loss": 5.1648, + "step": 10686 + }, + { + "epoch": 0.06355861642401751, + "grad_norm": 1.810944676399231, + "learning_rate": 4.950336858503486e-05, + "loss": 5.8371, + "step": 10687 + }, + { + "epoch": 0.0635645637072985, + "grad_norm": 2.0081756114959717, + "learning_rate": 4.950327593981469e-05, + "loss": 5.6933, + "step": 10688 + }, + { + "epoch": 0.0635705109905795, + "grad_norm": 1.5824620723724365, + "learning_rate": 4.950318328604068e-05, + "loss": 5.4494, + "step": 10689 + }, + { + "epoch": 0.0635764582738605, + "grad_norm": 1.6470626592636108, + "learning_rate": 4.950309062371286e-05, + "loss": 6.2401, + "step": 10690 + }, + { + "epoch": 0.0635824055571415, + "grad_norm": 1.799074649810791, + "learning_rate": 4.950299795283127e-05, + "loss": 6.1075, + "step": 10691 + }, + { + "epoch": 0.0635883528404225, + "grad_norm": 2.0551035404205322, + "learning_rate": 4.950290527339593e-05, + "loss": 5.6646, + "step": 10692 + }, + { + "epoch": 0.0635943001237035, + "grad_norm": 2.3543875217437744, + "learning_rate": 4.9502812585406875e-05, + "loss": 4.9341, + "step": 10693 + }, + { + "epoch": 0.06360024740698449, + "grad_norm": 2.0479071140289307, + "learning_rate": 4.950271988886415e-05, + "loss": 5.3351, + "step": 10694 + }, + { + "epoch": 0.06360619469026549, + "grad_norm": 1.9331302642822266, + "learning_rate": 4.950262718376778e-05, + "loss": 5.6269, + "step": 10695 + }, + { + "epoch": 0.06361214197354649, + "grad_norm": 1.9922640323638916, + "learning_rate": 4.950253447011779e-05, + "loss": 5.5113, + "step": 10696 + }, + { + "epoch": 0.06361808925682748, + "grad_norm": 1.769916296005249, + "learning_rate": 4.950244174791422e-05, + "loss": 5.5902, + "step": 10697 + }, + { + "epoch": 0.06362403654010848, + "grad_norm": 2.8808071613311768, + "learning_rate": 4.95023490171571e-05, + "loss": 4.9506, + "step": 10698 + }, + { + "epoch": 0.06362998382338947, + "grad_norm": 2.0609331130981445, + "learning_rate": 4.9502256277846466e-05, + "loss": 5.4256, + "step": 10699 + }, + { + "epoch": 0.06363593110667047, + "grad_norm": 2.0112223625183105, + "learning_rate": 4.950216352998234e-05, + "loss": 6.1121, + "step": 10700 + }, + { + "epoch": 0.06364187838995147, + "grad_norm": 1.5665667057037354, + "learning_rate": 4.9502070773564765e-05, + "loss": 5.1959, + "step": 10701 + }, + { + "epoch": 0.06364782567323246, + "grad_norm": 1.9731864929199219, + "learning_rate": 4.9501978008593774e-05, + "loss": 5.2887, + "step": 10702 + }, + { + "epoch": 0.06365377295651346, + "grad_norm": 1.7925242185592651, + "learning_rate": 4.9501885235069404e-05, + "loss": 5.7386, + "step": 10703 + }, + { + "epoch": 0.06365972023979447, + "grad_norm": 1.6686629056930542, + "learning_rate": 4.950179245299166e-05, + "loss": 5.7279, + "step": 10704 + }, + { + "epoch": 0.06366566752307545, + "grad_norm": 2.034392833709717, + "learning_rate": 4.95016996623606e-05, + "loss": 5.6148, + "step": 10705 + }, + { + "epoch": 0.06367161480635646, + "grad_norm": 2.1711995601654053, + "learning_rate": 4.9501606863176254e-05, + "loss": 5.7088, + "step": 10706 + }, + { + "epoch": 0.06367756208963746, + "grad_norm": 2.3276829719543457, + "learning_rate": 4.950151405543865e-05, + "loss": 5.3658, + "step": 10707 + }, + { + "epoch": 0.06368350937291845, + "grad_norm": 2.174130916595459, + "learning_rate": 4.9501421239147824e-05, + "loss": 5.3459, + "step": 10708 + }, + { + "epoch": 0.06368945665619945, + "grad_norm": 1.8721747398376465, + "learning_rate": 4.9501328414303794e-05, + "loss": 5.3375, + "step": 10709 + }, + { + "epoch": 0.06369540393948045, + "grad_norm": 1.8677324056625366, + "learning_rate": 4.9501235580906615e-05, + "loss": 5.8192, + "step": 10710 + }, + { + "epoch": 0.06370135122276144, + "grad_norm": 2.0901246070861816, + "learning_rate": 4.9501142738956294e-05, + "loss": 6.1188, + "step": 10711 + }, + { + "epoch": 0.06370729850604244, + "grad_norm": 1.7860997915267944, + "learning_rate": 4.9501049888452885e-05, + "loss": 5.4011, + "step": 10712 + }, + { + "epoch": 0.06371324578932344, + "grad_norm": 2.000946283340454, + "learning_rate": 4.950095702939642e-05, + "loss": 5.16, + "step": 10713 + }, + { + "epoch": 0.06371919307260443, + "grad_norm": 2.47086501121521, + "learning_rate": 4.950086416178691e-05, + "loss": 5.1543, + "step": 10714 + }, + { + "epoch": 0.06372514035588543, + "grad_norm": 1.8694473505020142, + "learning_rate": 4.9500771285624415e-05, + "loss": 5.3576, + "step": 10715 + }, + { + "epoch": 0.06373108763916643, + "grad_norm": 1.8921676874160767, + "learning_rate": 4.9500678400908946e-05, + "loss": 5.0827, + "step": 10716 + }, + { + "epoch": 0.06373703492244742, + "grad_norm": 1.8423974514007568, + "learning_rate": 4.950058550764054e-05, + "loss": 4.9912, + "step": 10717 + }, + { + "epoch": 0.06374298220572842, + "grad_norm": 1.6893757581710815, + "learning_rate": 4.950049260581924e-05, + "loss": 5.2792, + "step": 10718 + }, + { + "epoch": 0.06374892948900943, + "grad_norm": 1.720799446105957, + "learning_rate": 4.950039969544507e-05, + "loss": 5.4355, + "step": 10719 + }, + { + "epoch": 0.06375487677229041, + "grad_norm": 1.717527151107788, + "learning_rate": 4.9500306776518065e-05, + "loss": 5.2802, + "step": 10720 + }, + { + "epoch": 0.06376082405557142, + "grad_norm": 1.876207947731018, + "learning_rate": 4.950021384903825e-05, + "loss": 5.4667, + "step": 10721 + }, + { + "epoch": 0.06376677133885242, + "grad_norm": 1.7892308235168457, + "learning_rate": 4.9500120913005666e-05, + "loss": 5.6635, + "step": 10722 + }, + { + "epoch": 0.0637727186221334, + "grad_norm": 1.828092336654663, + "learning_rate": 4.950002796842034e-05, + "loss": 5.5301, + "step": 10723 + }, + { + "epoch": 0.06377866590541441, + "grad_norm": 1.5860785245895386, + "learning_rate": 4.949993501528232e-05, + "loss": 5.2337, + "step": 10724 + }, + { + "epoch": 0.06378461318869541, + "grad_norm": 1.731295108795166, + "learning_rate": 4.949984205359161e-05, + "loss": 5.4115, + "step": 10725 + }, + { + "epoch": 0.0637905604719764, + "grad_norm": 2.194288969039917, + "learning_rate": 4.949974908334827e-05, + "loss": 5.4736, + "step": 10726 + }, + { + "epoch": 0.0637965077552574, + "grad_norm": 1.6036415100097656, + "learning_rate": 4.949965610455231e-05, + "loss": 5.4563, + "step": 10727 + }, + { + "epoch": 0.06380245503853839, + "grad_norm": 1.6228232383728027, + "learning_rate": 4.949956311720378e-05, + "loss": 5.4695, + "step": 10728 + }, + { + "epoch": 0.06380840232181939, + "grad_norm": 1.3040069341659546, + "learning_rate": 4.94994701213027e-05, + "loss": 5.0126, + "step": 10729 + }, + { + "epoch": 0.06381434960510039, + "grad_norm": 1.5976930856704712, + "learning_rate": 4.9499377116849116e-05, + "loss": 5.0165, + "step": 10730 + }, + { + "epoch": 0.06382029688838138, + "grad_norm": 1.5877797603607178, + "learning_rate": 4.9499284103843046e-05, + "loss": 5.1634, + "step": 10731 + }, + { + "epoch": 0.06382624417166238, + "grad_norm": 1.6466439962387085, + "learning_rate": 4.949919108228453e-05, + "loss": 5.3954, + "step": 10732 + }, + { + "epoch": 0.06383219145494338, + "grad_norm": 1.5188345909118652, + "learning_rate": 4.949909805217361e-05, + "loss": 5.2876, + "step": 10733 + }, + { + "epoch": 0.06383813873822437, + "grad_norm": 1.836227297782898, + "learning_rate": 4.94990050135103e-05, + "loss": 5.4966, + "step": 10734 + }, + { + "epoch": 0.06384408602150538, + "grad_norm": 1.5542840957641602, + "learning_rate": 4.9498911966294635e-05, + "loss": 5.2188, + "step": 10735 + }, + { + "epoch": 0.06385003330478638, + "grad_norm": 1.3053034543991089, + "learning_rate": 4.9498818910526656e-05, + "loss": 5.3834, + "step": 10736 + }, + { + "epoch": 0.06385598058806737, + "grad_norm": 1.4250247478485107, + "learning_rate": 4.9498725846206395e-05, + "loss": 5.1852, + "step": 10737 + }, + { + "epoch": 0.06386192787134837, + "grad_norm": 1.5885393619537354, + "learning_rate": 4.9498632773333886e-05, + "loss": 5.2518, + "step": 10738 + }, + { + "epoch": 0.06386787515462937, + "grad_norm": 1.5664896965026855, + "learning_rate": 4.949853969190915e-05, + "loss": 5.1186, + "step": 10739 + }, + { + "epoch": 0.06387382243791036, + "grad_norm": 1.5156123638153076, + "learning_rate": 4.949844660193223e-05, + "loss": 5.1111, + "step": 10740 + }, + { + "epoch": 0.06387976972119136, + "grad_norm": 1.5308325290679932, + "learning_rate": 4.949835350340316e-05, + "loss": 5.1577, + "step": 10741 + }, + { + "epoch": 0.06388571700447236, + "grad_norm": 1.3338321447372437, + "learning_rate": 4.949826039632196e-05, + "loss": 5.2386, + "step": 10742 + }, + { + "epoch": 0.06389166428775335, + "grad_norm": 1.5307821035385132, + "learning_rate": 4.9498167280688676e-05, + "loss": 5.1173, + "step": 10743 + }, + { + "epoch": 0.06389761157103435, + "grad_norm": 1.607913613319397, + "learning_rate": 4.9498074156503325e-05, + "loss": 5.3077, + "step": 10744 + }, + { + "epoch": 0.06390355885431535, + "grad_norm": 1.6242469549179077, + "learning_rate": 4.949798102376596e-05, + "loss": 5.3319, + "step": 10745 + }, + { + "epoch": 0.06390950613759634, + "grad_norm": 1.62213134765625, + "learning_rate": 4.9497887882476604e-05, + "loss": 5.3494, + "step": 10746 + }, + { + "epoch": 0.06391545342087734, + "grad_norm": 1.4064897298812866, + "learning_rate": 4.949779473263528e-05, + "loss": 5.207, + "step": 10747 + }, + { + "epoch": 0.06392140070415835, + "grad_norm": 1.7431879043579102, + "learning_rate": 4.949770157424203e-05, + "loss": 5.4068, + "step": 10748 + }, + { + "epoch": 0.06392734798743933, + "grad_norm": 1.5815304517745972, + "learning_rate": 4.949760840729689e-05, + "loss": 5.3917, + "step": 10749 + }, + { + "epoch": 0.06393329527072034, + "grad_norm": 1.576541543006897, + "learning_rate": 4.949751523179988e-05, + "loss": 5.4123, + "step": 10750 + }, + { + "epoch": 0.06393924255400134, + "grad_norm": 1.6717814207077026, + "learning_rate": 4.9497422047751054e-05, + "loss": 5.3028, + "step": 10751 + }, + { + "epoch": 0.06394518983728233, + "grad_norm": 1.4091792106628418, + "learning_rate": 4.9497328855150424e-05, + "loss": 5.2231, + "step": 10752 + }, + { + "epoch": 0.06395113712056333, + "grad_norm": 1.4366726875305176, + "learning_rate": 4.949723565399803e-05, + "loss": 5.2908, + "step": 10753 + }, + { + "epoch": 0.06395708440384433, + "grad_norm": 1.6679248809814453, + "learning_rate": 4.9497142444293906e-05, + "loss": 5.1079, + "step": 10754 + }, + { + "epoch": 0.06396303168712532, + "grad_norm": 1.6619216203689575, + "learning_rate": 4.949704922603808e-05, + "loss": 5.1504, + "step": 10755 + }, + { + "epoch": 0.06396897897040632, + "grad_norm": 1.7149940729141235, + "learning_rate": 4.9496955999230586e-05, + "loss": 5.3031, + "step": 10756 + }, + { + "epoch": 0.06397492625368732, + "grad_norm": 1.711256504058838, + "learning_rate": 4.9496862763871456e-05, + "loss": 5.2146, + "step": 10757 + }, + { + "epoch": 0.06398087353696831, + "grad_norm": 1.654680609703064, + "learning_rate": 4.949676951996073e-05, + "loss": 5.2774, + "step": 10758 + }, + { + "epoch": 0.06398682082024931, + "grad_norm": 1.5115636587142944, + "learning_rate": 4.949667626749843e-05, + "loss": 5.2155, + "step": 10759 + }, + { + "epoch": 0.0639927681035303, + "grad_norm": 1.7153947353363037, + "learning_rate": 4.9496583006484596e-05, + "loss": 5.2711, + "step": 10760 + }, + { + "epoch": 0.0639987153868113, + "grad_norm": 1.8497945070266724, + "learning_rate": 4.949648973691926e-05, + "loss": 5.2864, + "step": 10761 + }, + { + "epoch": 0.0640046626700923, + "grad_norm": 1.5251562595367432, + "learning_rate": 4.9496396458802455e-05, + "loss": 5.2532, + "step": 10762 + }, + { + "epoch": 0.0640106099533733, + "grad_norm": 1.5916621685028076, + "learning_rate": 4.94963031721342e-05, + "loss": 5.2136, + "step": 10763 + }, + { + "epoch": 0.0640165572366543, + "grad_norm": 1.5781627893447876, + "learning_rate": 4.949620987691455e-05, + "loss": 5.3188, + "step": 10764 + }, + { + "epoch": 0.0640225045199353, + "grad_norm": 1.7783690690994263, + "learning_rate": 4.9496116573143515e-05, + "loss": 5.4196, + "step": 10765 + }, + { + "epoch": 0.06402845180321629, + "grad_norm": 1.5746928453445435, + "learning_rate": 4.949602326082115e-05, + "loss": 5.3724, + "step": 10766 + }, + { + "epoch": 0.06403439908649729, + "grad_norm": 1.677771806716919, + "learning_rate": 4.9495929939947475e-05, + "loss": 5.2894, + "step": 10767 + }, + { + "epoch": 0.06404034636977829, + "grad_norm": 1.7747725248336792, + "learning_rate": 4.949583661052252e-05, + "loss": 5.0527, + "step": 10768 + }, + { + "epoch": 0.06404629365305928, + "grad_norm": 1.6927893161773682, + "learning_rate": 4.9495743272546314e-05, + "loss": 5.0999, + "step": 10769 + }, + { + "epoch": 0.06405224093634028, + "grad_norm": 1.6289039850234985, + "learning_rate": 4.949564992601891e-05, + "loss": 5.4197, + "step": 10770 + }, + { + "epoch": 0.06405818821962128, + "grad_norm": 1.742658019065857, + "learning_rate": 4.9495556570940316e-05, + "loss": 5.2927, + "step": 10771 + }, + { + "epoch": 0.06406413550290227, + "grad_norm": 1.6643215417861938, + "learning_rate": 4.949546320731059e-05, + "loss": 5.3262, + "step": 10772 + }, + { + "epoch": 0.06407008278618327, + "grad_norm": 1.6400927305221558, + "learning_rate": 4.949536983512974e-05, + "loss": 5.1072, + "step": 10773 + }, + { + "epoch": 0.06407603006946427, + "grad_norm": 1.7093544006347656, + "learning_rate": 4.949527645439781e-05, + "loss": 5.1849, + "step": 10774 + }, + { + "epoch": 0.06408197735274526, + "grad_norm": 1.6980849504470825, + "learning_rate": 4.949518306511484e-05, + "loss": 5.3661, + "step": 10775 + }, + { + "epoch": 0.06408792463602626, + "grad_norm": 1.7241551876068115, + "learning_rate": 4.949508966728085e-05, + "loss": 5.3315, + "step": 10776 + }, + { + "epoch": 0.06409387191930727, + "grad_norm": 1.8421318531036377, + "learning_rate": 4.9494996260895874e-05, + "loss": 5.3506, + "step": 10777 + }, + { + "epoch": 0.06409981920258825, + "grad_norm": 1.835738182067871, + "learning_rate": 4.949490284595995e-05, + "loss": 5.2087, + "step": 10778 + }, + { + "epoch": 0.06410576648586926, + "grad_norm": 1.6622625589370728, + "learning_rate": 4.949480942247311e-05, + "loss": 5.0072, + "step": 10779 + }, + { + "epoch": 0.06411171376915026, + "grad_norm": 1.5437613725662231, + "learning_rate": 4.949471599043539e-05, + "loss": 5.182, + "step": 10780 + }, + { + "epoch": 0.06411766105243125, + "grad_norm": 1.620758295059204, + "learning_rate": 4.949462254984681e-05, + "loss": 5.2771, + "step": 10781 + }, + { + "epoch": 0.06412360833571225, + "grad_norm": 1.6143954992294312, + "learning_rate": 4.949452910070741e-05, + "loss": 5.1175, + "step": 10782 + }, + { + "epoch": 0.06412955561899325, + "grad_norm": 1.8173086643218994, + "learning_rate": 4.949443564301722e-05, + "loss": 5.175, + "step": 10783 + }, + { + "epoch": 0.06413550290227424, + "grad_norm": 1.75434148311615, + "learning_rate": 4.9494342176776284e-05, + "loss": 5.1133, + "step": 10784 + }, + { + "epoch": 0.06414145018555524, + "grad_norm": 1.7278660535812378, + "learning_rate": 4.949424870198462e-05, + "loss": 5.0704, + "step": 10785 + }, + { + "epoch": 0.06414739746883624, + "grad_norm": 1.793285608291626, + "learning_rate": 4.949415521864228e-05, + "loss": 5.1567, + "step": 10786 + }, + { + "epoch": 0.06415334475211723, + "grad_norm": 1.7892498970031738, + "learning_rate": 4.949406172674927e-05, + "loss": 5.201, + "step": 10787 + }, + { + "epoch": 0.06415929203539823, + "grad_norm": 2.276643991470337, + "learning_rate": 4.9493968226305645e-05, + "loss": 5.5555, + "step": 10788 + }, + { + "epoch": 0.06416523931867922, + "grad_norm": 1.5785993337631226, + "learning_rate": 4.9493874717311416e-05, + "loss": 5.2692, + "step": 10789 + }, + { + "epoch": 0.06417118660196022, + "grad_norm": 1.3982635736465454, + "learning_rate": 4.949378119976664e-05, + "loss": 5.24, + "step": 10790 + }, + { + "epoch": 0.06417713388524122, + "grad_norm": 1.4310967922210693, + "learning_rate": 4.949368767367133e-05, + "loss": 5.2032, + "step": 10791 + }, + { + "epoch": 0.06418308116852221, + "grad_norm": 1.5635451078414917, + "learning_rate": 4.949359413902554e-05, + "loss": 5.2589, + "step": 10792 + }, + { + "epoch": 0.06418902845180322, + "grad_norm": 1.5000566244125366, + "learning_rate": 4.949350059582927e-05, + "loss": 5.147, + "step": 10793 + }, + { + "epoch": 0.06419497573508422, + "grad_norm": 1.7782738208770752, + "learning_rate": 4.9493407044082585e-05, + "loss": 5.1987, + "step": 10794 + }, + { + "epoch": 0.0642009230183652, + "grad_norm": 1.5931564569473267, + "learning_rate": 4.94933134837855e-05, + "loss": 5.2591, + "step": 10795 + }, + { + "epoch": 0.06420687030164621, + "grad_norm": 1.619287371635437, + "learning_rate": 4.9493219914938055e-05, + "loss": 5.1041, + "step": 10796 + }, + { + "epoch": 0.06421281758492721, + "grad_norm": 1.5174281597137451, + "learning_rate": 4.949312633754028e-05, + "loss": 5.1798, + "step": 10797 + }, + { + "epoch": 0.0642187648682082, + "grad_norm": 1.6485828161239624, + "learning_rate": 4.9493032751592205e-05, + "loss": 5.1086, + "step": 10798 + }, + { + "epoch": 0.0642247121514892, + "grad_norm": 1.830984354019165, + "learning_rate": 4.949293915709386e-05, + "loss": 5.2241, + "step": 10799 + }, + { + "epoch": 0.0642306594347702, + "grad_norm": 1.9102944135665894, + "learning_rate": 4.94928455540453e-05, + "loss": 4.9652, + "step": 10800 + }, + { + "epoch": 0.06423660671805119, + "grad_norm": 1.6826778650283813, + "learning_rate": 4.949275194244653e-05, + "loss": 5.0479, + "step": 10801 + }, + { + "epoch": 0.06424255400133219, + "grad_norm": 1.7545628547668457, + "learning_rate": 4.9492658322297595e-05, + "loss": 4.9263, + "step": 10802 + }, + { + "epoch": 0.0642485012846132, + "grad_norm": 1.621121883392334, + "learning_rate": 4.949256469359852e-05, + "loss": 4.9095, + "step": 10803 + }, + { + "epoch": 0.06425444856789418, + "grad_norm": 1.727095603942871, + "learning_rate": 4.9492471056349356e-05, + "loss": 5.1913, + "step": 10804 + }, + { + "epoch": 0.06426039585117518, + "grad_norm": 1.749241590499878, + "learning_rate": 4.949237741055011e-05, + "loss": 5.4284, + "step": 10805 + }, + { + "epoch": 0.06426634313445619, + "grad_norm": 1.627784252166748, + "learning_rate": 4.9492283756200834e-05, + "loss": 5.547, + "step": 10806 + }, + { + "epoch": 0.06427229041773717, + "grad_norm": 1.8133957386016846, + "learning_rate": 4.949219009330155e-05, + "loss": 5.5841, + "step": 10807 + }, + { + "epoch": 0.06427823770101818, + "grad_norm": 1.6667630672454834, + "learning_rate": 4.949209642185231e-05, + "loss": 5.4091, + "step": 10808 + }, + { + "epoch": 0.06428418498429918, + "grad_norm": 1.601288914680481, + "learning_rate": 4.949200274185312e-05, + "loss": 4.9647, + "step": 10809 + }, + { + "epoch": 0.06429013226758017, + "grad_norm": 1.4544743299484253, + "learning_rate": 4.9491909053304025e-05, + "loss": 5.477, + "step": 10810 + }, + { + "epoch": 0.06429607955086117, + "grad_norm": 1.65786874294281, + "learning_rate": 4.949181535620506e-05, + "loss": 5.2401, + "step": 10811 + }, + { + "epoch": 0.06430202683414217, + "grad_norm": 1.561251163482666, + "learning_rate": 4.949172165055625e-05, + "loss": 5.7689, + "step": 10812 + }, + { + "epoch": 0.06430797411742316, + "grad_norm": 1.465378999710083, + "learning_rate": 4.949162793635764e-05, + "loss": 5.4109, + "step": 10813 + }, + { + "epoch": 0.06431392140070416, + "grad_norm": 1.3914259672164917, + "learning_rate": 4.949153421360926e-05, + "loss": 5.5144, + "step": 10814 + }, + { + "epoch": 0.06431986868398516, + "grad_norm": 1.6016005277633667, + "learning_rate": 4.949144048231113e-05, + "loss": 5.2708, + "step": 10815 + }, + { + "epoch": 0.06432581596726615, + "grad_norm": 1.4063479900360107, + "learning_rate": 4.94913467424633e-05, + "loss": 5.0303, + "step": 10816 + }, + { + "epoch": 0.06433176325054715, + "grad_norm": 1.5708017349243164, + "learning_rate": 4.9491252994065785e-05, + "loss": 5.3104, + "step": 10817 + }, + { + "epoch": 0.06433771053382814, + "grad_norm": 1.5542651414871216, + "learning_rate": 4.9491159237118626e-05, + "loss": 5.1308, + "step": 10818 + }, + { + "epoch": 0.06434365781710914, + "grad_norm": 1.3946558237075806, + "learning_rate": 4.9491065471621855e-05, + "loss": 5.243, + "step": 10819 + }, + { + "epoch": 0.06434960510039014, + "grad_norm": 1.3560529947280884, + "learning_rate": 4.9490971697575513e-05, + "loss": 4.9319, + "step": 10820 + }, + { + "epoch": 0.06435555238367113, + "grad_norm": 1.6921281814575195, + "learning_rate": 4.949087791497963e-05, + "loss": 5.2203, + "step": 10821 + }, + { + "epoch": 0.06436149966695213, + "grad_norm": 1.5226655006408691, + "learning_rate": 4.9490784123834225e-05, + "loss": 5.1879, + "step": 10822 + }, + { + "epoch": 0.06436744695023314, + "grad_norm": 1.5012669563293457, + "learning_rate": 4.9490690324139346e-05, + "loss": 5.2373, + "step": 10823 + }, + { + "epoch": 0.06437339423351413, + "grad_norm": 1.8050286769866943, + "learning_rate": 4.949059651589502e-05, + "loss": 5.0441, + "step": 10824 + }, + { + "epoch": 0.06437934151679513, + "grad_norm": 1.6800918579101562, + "learning_rate": 4.9490502699101274e-05, + "loss": 5.0871, + "step": 10825 + }, + { + "epoch": 0.06438528880007613, + "grad_norm": 1.4211550951004028, + "learning_rate": 4.949040887375814e-05, + "loss": 5.118, + "step": 10826 + }, + { + "epoch": 0.06439123608335712, + "grad_norm": 1.7064868211746216, + "learning_rate": 4.949031503986568e-05, + "loss": 5.2285, + "step": 10827 + }, + { + "epoch": 0.06439718336663812, + "grad_norm": 1.862491250038147, + "learning_rate": 4.949022119742388e-05, + "loss": 5.0958, + "step": 10828 + }, + { + "epoch": 0.06440313064991912, + "grad_norm": 1.933610200881958, + "learning_rate": 4.949012734643281e-05, + "loss": 5.1282, + "step": 10829 + }, + { + "epoch": 0.06440907793320011, + "grad_norm": 1.6140058040618896, + "learning_rate": 4.949003348689249e-05, + "loss": 4.9913, + "step": 10830 + }, + { + "epoch": 0.06441502521648111, + "grad_norm": 1.6881496906280518, + "learning_rate": 4.948993961880295e-05, + "loss": 5.1017, + "step": 10831 + }, + { + "epoch": 0.06442097249976211, + "grad_norm": 1.7887358665466309, + "learning_rate": 4.948984574216422e-05, + "loss": 5.1503, + "step": 10832 + }, + { + "epoch": 0.0644269197830431, + "grad_norm": 1.635720133781433, + "learning_rate": 4.948975185697634e-05, + "loss": 5.3381, + "step": 10833 + }, + { + "epoch": 0.0644328670663241, + "grad_norm": 1.6106109619140625, + "learning_rate": 4.9489657963239346e-05, + "loss": 5.0498, + "step": 10834 + }, + { + "epoch": 0.0644388143496051, + "grad_norm": 1.740438461303711, + "learning_rate": 4.9489564060953266e-05, + "loss": 5.0302, + "step": 10835 + }, + { + "epoch": 0.0644447616328861, + "grad_norm": 1.663994312286377, + "learning_rate": 4.9489470150118124e-05, + "loss": 5.1976, + "step": 10836 + }, + { + "epoch": 0.0644507089161671, + "grad_norm": 1.6748932600021362, + "learning_rate": 4.9489376230733965e-05, + "loss": 5.0055, + "step": 10837 + }, + { + "epoch": 0.0644566561994481, + "grad_norm": 1.7139437198638916, + "learning_rate": 4.948928230280082e-05, + "loss": 4.9617, + "step": 10838 + }, + { + "epoch": 0.06446260348272909, + "grad_norm": 1.698791742324829, + "learning_rate": 4.948918836631872e-05, + "loss": 4.9725, + "step": 10839 + }, + { + "epoch": 0.06446855076601009, + "grad_norm": 1.6961768865585327, + "learning_rate": 4.94890944212877e-05, + "loss": 4.9126, + "step": 10840 + }, + { + "epoch": 0.06447449804929109, + "grad_norm": 1.6551483869552612, + "learning_rate": 4.948900046770778e-05, + "loss": 5.0775, + "step": 10841 + }, + { + "epoch": 0.06448044533257208, + "grad_norm": 1.5863447189331055, + "learning_rate": 4.948890650557901e-05, + "loss": 5.0467, + "step": 10842 + }, + { + "epoch": 0.06448639261585308, + "grad_norm": 1.5629637241363525, + "learning_rate": 4.9488812534901414e-05, + "loss": 5.0012, + "step": 10843 + }, + { + "epoch": 0.06449233989913408, + "grad_norm": 1.5247453451156616, + "learning_rate": 4.948871855567503e-05, + "loss": 4.9928, + "step": 10844 + }, + { + "epoch": 0.06449828718241507, + "grad_norm": 1.7595921754837036, + "learning_rate": 4.948862456789988e-05, + "loss": 4.9256, + "step": 10845 + }, + { + "epoch": 0.06450423446569607, + "grad_norm": 1.6370458602905273, + "learning_rate": 4.948853057157601e-05, + "loss": 4.9499, + "step": 10846 + }, + { + "epoch": 0.06451018174897706, + "grad_norm": 1.7747406959533691, + "learning_rate": 4.948843656670345e-05, + "loss": 4.9246, + "step": 10847 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 1.6769739389419556, + "learning_rate": 4.948834255328222e-05, + "loss": 4.9561, + "step": 10848 + }, + { + "epoch": 0.06452207631553906, + "grad_norm": 1.60416841506958, + "learning_rate": 4.948824853131236e-05, + "loss": 5.0318, + "step": 10849 + }, + { + "epoch": 0.06452802359882005, + "grad_norm": 2.1050093173980713, + "learning_rate": 4.948815450079392e-05, + "loss": 5.5308, + "step": 10850 + }, + { + "epoch": 0.06453397088210105, + "grad_norm": 1.7474935054779053, + "learning_rate": 4.948806046172691e-05, + "loss": 5.0752, + "step": 10851 + }, + { + "epoch": 0.06453991816538206, + "grad_norm": 1.8992688655853271, + "learning_rate": 4.948796641411138e-05, + "loss": 5.3704, + "step": 10852 + }, + { + "epoch": 0.06454586544866305, + "grad_norm": 1.9632636308670044, + "learning_rate": 4.948787235794734e-05, + "loss": 5.4173, + "step": 10853 + }, + { + "epoch": 0.06455181273194405, + "grad_norm": 1.9034284353256226, + "learning_rate": 4.948777829323484e-05, + "loss": 5.2655, + "step": 10854 + }, + { + "epoch": 0.06455776001522505, + "grad_norm": 1.716711163520813, + "learning_rate": 4.9487684219973914e-05, + "loss": 5.4192, + "step": 10855 + }, + { + "epoch": 0.06456370729850604, + "grad_norm": 1.7886557579040527, + "learning_rate": 4.948759013816459e-05, + "loss": 5.2828, + "step": 10856 + }, + { + "epoch": 0.06456965458178704, + "grad_norm": 2.004117250442505, + "learning_rate": 4.9487496047806905e-05, + "loss": 4.9521, + "step": 10857 + }, + { + "epoch": 0.06457560186506804, + "grad_norm": 1.627955436706543, + "learning_rate": 4.948740194890088e-05, + "loss": 5.4288, + "step": 10858 + }, + { + "epoch": 0.06458154914834903, + "grad_norm": 2.2537145614624023, + "learning_rate": 4.948730784144656e-05, + "loss": 5.8176, + "step": 10859 + }, + { + "epoch": 0.06458749643163003, + "grad_norm": 2.216066837310791, + "learning_rate": 4.948721372544397e-05, + "loss": 5.4569, + "step": 10860 + }, + { + "epoch": 0.06459344371491103, + "grad_norm": 1.7641898393630981, + "learning_rate": 4.948711960089315e-05, + "loss": 5.659, + "step": 10861 + }, + { + "epoch": 0.06459939099819202, + "grad_norm": 1.9137814044952393, + "learning_rate": 4.948702546779413e-05, + "loss": 5.6275, + "step": 10862 + }, + { + "epoch": 0.06460533828147302, + "grad_norm": 2.2355434894561768, + "learning_rate": 4.948693132614694e-05, + "loss": 5.1712, + "step": 10863 + }, + { + "epoch": 0.06461128556475403, + "grad_norm": 1.780849814414978, + "learning_rate": 4.9486837175951616e-05, + "loss": 5.4521, + "step": 10864 + }, + { + "epoch": 0.06461723284803501, + "grad_norm": 1.8078423738479614, + "learning_rate": 4.948674301720819e-05, + "loss": 5.3609, + "step": 10865 + }, + { + "epoch": 0.06462318013131602, + "grad_norm": 1.590707540512085, + "learning_rate": 4.94866488499167e-05, + "loss": 5.4121, + "step": 10866 + }, + { + "epoch": 0.06462912741459702, + "grad_norm": 1.4369510412216187, + "learning_rate": 4.948655467407717e-05, + "loss": 5.418, + "step": 10867 + }, + { + "epoch": 0.064635074697878, + "grad_norm": 1.5800751447677612, + "learning_rate": 4.9486460489689634e-05, + "loss": 5.3492, + "step": 10868 + }, + { + "epoch": 0.06464102198115901, + "grad_norm": 1.5271484851837158, + "learning_rate": 4.948636629675413e-05, + "loss": 5.2758, + "step": 10869 + }, + { + "epoch": 0.06464696926444001, + "grad_norm": 1.7175722122192383, + "learning_rate": 4.948627209527069e-05, + "loss": 5.2939, + "step": 10870 + }, + { + "epoch": 0.064652916547721, + "grad_norm": 1.568851113319397, + "learning_rate": 4.948617788523935e-05, + "loss": 5.2559, + "step": 10871 + }, + { + "epoch": 0.064658863831002, + "grad_norm": 1.4012210369110107, + "learning_rate": 4.9486083666660135e-05, + "loss": 5.3195, + "step": 10872 + }, + { + "epoch": 0.064664811114283, + "grad_norm": 1.5386475324630737, + "learning_rate": 4.948598943953308e-05, + "loss": 5.293, + "step": 10873 + }, + { + "epoch": 0.06467075839756399, + "grad_norm": 1.4143292903900146, + "learning_rate": 4.948589520385821e-05, + "loss": 5.2181, + "step": 10874 + }, + { + "epoch": 0.06467670568084499, + "grad_norm": 1.392470121383667, + "learning_rate": 4.9485800959635576e-05, + "loss": 5.3074, + "step": 10875 + }, + { + "epoch": 0.06468265296412598, + "grad_norm": 1.7176567316055298, + "learning_rate": 4.94857067068652e-05, + "loss": 5.3024, + "step": 10876 + }, + { + "epoch": 0.06468860024740698, + "grad_norm": 1.5002285242080688, + "learning_rate": 4.9485612445547115e-05, + "loss": 5.1543, + "step": 10877 + }, + { + "epoch": 0.06469454753068798, + "grad_norm": 1.5615242719650269, + "learning_rate": 4.9485518175681364e-05, + "loss": 5.371, + "step": 10878 + }, + { + "epoch": 0.06470049481396897, + "grad_norm": 1.4294706583023071, + "learning_rate": 4.9485423897267966e-05, + "loss": 5.4151, + "step": 10879 + }, + { + "epoch": 0.06470644209724997, + "grad_norm": 2.0147571563720703, + "learning_rate": 4.948532961030695e-05, + "loss": 5.3082, + "step": 10880 + }, + { + "epoch": 0.06471238938053098, + "grad_norm": 1.5661358833312988, + "learning_rate": 4.948523531479837e-05, + "loss": 5.8232, + "step": 10881 + }, + { + "epoch": 0.06471833666381197, + "grad_norm": 1.5608779191970825, + "learning_rate": 4.9485141010742245e-05, + "loss": 5.5648, + "step": 10882 + }, + { + "epoch": 0.06472428394709297, + "grad_norm": 2.3148789405822754, + "learning_rate": 4.948504669813861e-05, + "loss": 4.8802, + "step": 10883 + }, + { + "epoch": 0.06473023123037397, + "grad_norm": 1.9495759010314941, + "learning_rate": 4.9484952376987504e-05, + "loss": 5.1985, + "step": 10884 + }, + { + "epoch": 0.06473617851365496, + "grad_norm": 2.031764268875122, + "learning_rate": 4.9484858047288944e-05, + "loss": 5.0772, + "step": 10885 + }, + { + "epoch": 0.06474212579693596, + "grad_norm": 1.6575301885604858, + "learning_rate": 4.948476370904298e-05, + "loss": 5.2157, + "step": 10886 + }, + { + "epoch": 0.06474807308021696, + "grad_norm": 1.6381278038024902, + "learning_rate": 4.948466936224964e-05, + "loss": 5.1168, + "step": 10887 + }, + { + "epoch": 0.06475402036349795, + "grad_norm": 1.672555923461914, + "learning_rate": 4.9484575006908945e-05, + "loss": 5.2839, + "step": 10888 + }, + { + "epoch": 0.06475996764677895, + "grad_norm": 1.8838026523590088, + "learning_rate": 4.9484480643020944e-05, + "loss": 5.301, + "step": 10889 + }, + { + "epoch": 0.06476591493005995, + "grad_norm": 1.935205101966858, + "learning_rate": 4.9484386270585656e-05, + "loss": 5.2898, + "step": 10890 + }, + { + "epoch": 0.06477186221334094, + "grad_norm": 1.630003809928894, + "learning_rate": 4.9484291889603134e-05, + "loss": 5.181, + "step": 10891 + }, + { + "epoch": 0.06477780949662194, + "grad_norm": 1.5095784664154053, + "learning_rate": 4.948419750007339e-05, + "loss": 5.3159, + "step": 10892 + }, + { + "epoch": 0.06478375677990295, + "grad_norm": 1.7217234373092651, + "learning_rate": 4.948410310199647e-05, + "loss": 5.3395, + "step": 10893 + }, + { + "epoch": 0.06478970406318393, + "grad_norm": 1.727953314781189, + "learning_rate": 4.94840086953724e-05, + "loss": 5.1374, + "step": 10894 + }, + { + "epoch": 0.06479565134646494, + "grad_norm": 1.7891777753829956, + "learning_rate": 4.9483914280201224e-05, + "loss": 5.2145, + "step": 10895 + }, + { + "epoch": 0.06480159862974594, + "grad_norm": 1.7402048110961914, + "learning_rate": 4.9483819856482956e-05, + "loss": 5.1723, + "step": 10896 + }, + { + "epoch": 0.06480754591302693, + "grad_norm": 1.6635658740997314, + "learning_rate": 4.9483725424217644e-05, + "loss": 5.0995, + "step": 10897 + }, + { + "epoch": 0.06481349319630793, + "grad_norm": 1.6190650463104248, + "learning_rate": 4.9483630983405317e-05, + "loss": 5.2062, + "step": 10898 + }, + { + "epoch": 0.06481944047958893, + "grad_norm": 1.6335800886154175, + "learning_rate": 4.9483536534046006e-05, + "loss": 5.4298, + "step": 10899 + }, + { + "epoch": 0.06482538776286992, + "grad_norm": 1.7549209594726562, + "learning_rate": 4.948344207613974e-05, + "loss": 5.1833, + "step": 10900 + }, + { + "epoch": 0.06483133504615092, + "grad_norm": 1.6011431217193604, + "learning_rate": 4.948334760968656e-05, + "loss": 5.2329, + "step": 10901 + }, + { + "epoch": 0.06483728232943192, + "grad_norm": 1.627424955368042, + "learning_rate": 4.9483253134686505e-05, + "loss": 5.3059, + "step": 10902 + }, + { + "epoch": 0.06484322961271291, + "grad_norm": 1.593361258506775, + "learning_rate": 4.948315865113959e-05, + "loss": 5.2711, + "step": 10903 + }, + { + "epoch": 0.06484917689599391, + "grad_norm": 1.5899426937103271, + "learning_rate": 4.9483064159045854e-05, + "loss": 5.2449, + "step": 10904 + }, + { + "epoch": 0.0648551241792749, + "grad_norm": 1.6572548151016235, + "learning_rate": 4.948296965840534e-05, + "loss": 5.18, + "step": 10905 + }, + { + "epoch": 0.0648610714625559, + "grad_norm": 1.649928092956543, + "learning_rate": 4.948287514921808e-05, + "loss": 5.2434, + "step": 10906 + }, + { + "epoch": 0.0648670187458369, + "grad_norm": 1.4546284675598145, + "learning_rate": 4.9482780631484094e-05, + "loss": 5.405, + "step": 10907 + }, + { + "epoch": 0.06487296602911789, + "grad_norm": 1.624617338180542, + "learning_rate": 4.9482686105203425e-05, + "loss": 5.3537, + "step": 10908 + }, + { + "epoch": 0.0648789133123989, + "grad_norm": 1.5108991861343384, + "learning_rate": 4.94825915703761e-05, + "loss": 5.1709, + "step": 10909 + }, + { + "epoch": 0.0648848605956799, + "grad_norm": 1.571028470993042, + "learning_rate": 4.948249702700215e-05, + "loss": 5.1374, + "step": 10910 + }, + { + "epoch": 0.06489080787896088, + "grad_norm": 1.3280094861984253, + "learning_rate": 4.948240247508162e-05, + "loss": 5.3469, + "step": 10911 + }, + { + "epoch": 0.06489675516224189, + "grad_norm": 1.8487119674682617, + "learning_rate": 4.948230791461454e-05, + "loss": 5.4673, + "step": 10912 + }, + { + "epoch": 0.06490270244552289, + "grad_norm": 1.6253544092178345, + "learning_rate": 4.9482213345600936e-05, + "loss": 5.2096, + "step": 10913 + }, + { + "epoch": 0.06490864972880388, + "grad_norm": 1.8487451076507568, + "learning_rate": 4.9482118768040844e-05, + "loss": 5.1452, + "step": 10914 + }, + { + "epoch": 0.06491459701208488, + "grad_norm": 1.6638668775558472, + "learning_rate": 4.948202418193429e-05, + "loss": 5.2382, + "step": 10915 + }, + { + "epoch": 0.06492054429536588, + "grad_norm": 1.662256121635437, + "learning_rate": 4.9481929587281326e-05, + "loss": 5.3125, + "step": 10916 + }, + { + "epoch": 0.06492649157864687, + "grad_norm": 1.5133339166641235, + "learning_rate": 4.948183498408197e-05, + "loss": 5.2494, + "step": 10917 + }, + { + "epoch": 0.06493243886192787, + "grad_norm": 1.5063300132751465, + "learning_rate": 4.9481740372336256e-05, + "loss": 5.1778, + "step": 10918 + }, + { + "epoch": 0.06493838614520887, + "grad_norm": 1.5223631858825684, + "learning_rate": 4.948164575204421e-05, + "loss": 5.1773, + "step": 10919 + }, + { + "epoch": 0.06494433342848986, + "grad_norm": 1.6163926124572754, + "learning_rate": 4.948155112320589e-05, + "loss": 5.2669, + "step": 10920 + }, + { + "epoch": 0.06495028071177086, + "grad_norm": 1.4077887535095215, + "learning_rate": 4.948145648582131e-05, + "loss": 5.1711, + "step": 10921 + }, + { + "epoch": 0.06495622799505187, + "grad_norm": 1.5710374116897583, + "learning_rate": 4.9481361839890505e-05, + "loss": 5.1687, + "step": 10922 + }, + { + "epoch": 0.06496217527833285, + "grad_norm": 1.5444159507751465, + "learning_rate": 4.9481267185413506e-05, + "loss": 5.2681, + "step": 10923 + }, + { + "epoch": 0.06496812256161386, + "grad_norm": 1.4816917181015015, + "learning_rate": 4.948117252239035e-05, + "loss": 5.2897, + "step": 10924 + }, + { + "epoch": 0.06497406984489486, + "grad_norm": 1.3373851776123047, + "learning_rate": 4.9481077850821075e-05, + "loss": 5.1607, + "step": 10925 + }, + { + "epoch": 0.06498001712817585, + "grad_norm": 1.7353702783584595, + "learning_rate": 4.948098317070571e-05, + "loss": 5.2546, + "step": 10926 + }, + { + "epoch": 0.06498596441145685, + "grad_norm": 1.4494054317474365, + "learning_rate": 4.948088848204428e-05, + "loss": 5.2244, + "step": 10927 + }, + { + "epoch": 0.06499191169473785, + "grad_norm": 1.6031813621520996, + "learning_rate": 4.9480793784836825e-05, + "loss": 5.2487, + "step": 10928 + }, + { + "epoch": 0.06499785897801884, + "grad_norm": 1.4134970903396606, + "learning_rate": 4.948069907908338e-05, + "loss": 5.2224, + "step": 10929 + }, + { + "epoch": 0.06500380626129984, + "grad_norm": 1.5790150165557861, + "learning_rate": 4.948060436478398e-05, + "loss": 5.3096, + "step": 10930 + }, + { + "epoch": 0.06500975354458084, + "grad_norm": 1.3925936222076416, + "learning_rate": 4.9480509641938644e-05, + "loss": 5.1823, + "step": 10931 + }, + { + "epoch": 0.06501570082786183, + "grad_norm": 1.40078866481781, + "learning_rate": 4.948041491054742e-05, + "loss": 5.1352, + "step": 10932 + }, + { + "epoch": 0.06502164811114283, + "grad_norm": 1.509726881980896, + "learning_rate": 4.948032017061034e-05, + "loss": 5.199, + "step": 10933 + }, + { + "epoch": 0.06502759539442382, + "grad_norm": 1.5671876668930054, + "learning_rate": 4.948022542212743e-05, + "loss": 5.2323, + "step": 10934 + }, + { + "epoch": 0.06503354267770482, + "grad_norm": 1.5019149780273438, + "learning_rate": 4.948013066509872e-05, + "loss": 5.244, + "step": 10935 + }, + { + "epoch": 0.06503948996098582, + "grad_norm": 1.576842188835144, + "learning_rate": 4.948003589952426e-05, + "loss": 5.153, + "step": 10936 + }, + { + "epoch": 0.06504543724426681, + "grad_norm": 1.4069315195083618, + "learning_rate": 4.9479941125404074e-05, + "loss": 5.3396, + "step": 10937 + }, + { + "epoch": 0.06505138452754781, + "grad_norm": 1.6663076877593994, + "learning_rate": 4.947984634273818e-05, + "loss": 5.223, + "step": 10938 + }, + { + "epoch": 0.06505733181082882, + "grad_norm": 1.5132073163986206, + "learning_rate": 4.947975155152663e-05, + "loss": 5.1335, + "step": 10939 + }, + { + "epoch": 0.0650632790941098, + "grad_norm": 1.59386146068573, + "learning_rate": 4.9479656751769455e-05, + "loss": 5.4893, + "step": 10940 + }, + { + "epoch": 0.06506922637739081, + "grad_norm": 1.3486778736114502, + "learning_rate": 4.9479561943466686e-05, + "loss": 5.2164, + "step": 10941 + }, + { + "epoch": 0.06507517366067181, + "grad_norm": 1.4107574224472046, + "learning_rate": 4.947946712661835e-05, + "loss": 5.2337, + "step": 10942 + }, + { + "epoch": 0.0650811209439528, + "grad_norm": 1.6905080080032349, + "learning_rate": 4.947937230122449e-05, + "loss": 5.1749, + "step": 10943 + }, + { + "epoch": 0.0650870682272338, + "grad_norm": 1.5062333345413208, + "learning_rate": 4.947927746728513e-05, + "loss": 5.2227, + "step": 10944 + }, + { + "epoch": 0.0650930155105148, + "grad_norm": 1.4318712949752808, + "learning_rate": 4.947918262480031e-05, + "loss": 5.1565, + "step": 10945 + }, + { + "epoch": 0.06509896279379579, + "grad_norm": 1.5121338367462158, + "learning_rate": 4.9479087773770055e-05, + "loss": 5.3718, + "step": 10946 + }, + { + "epoch": 0.06510491007707679, + "grad_norm": 1.2901450395584106, + "learning_rate": 4.947899291419441e-05, + "loss": 5.291, + "step": 10947 + }, + { + "epoch": 0.0651108573603578, + "grad_norm": 1.5350853204727173, + "learning_rate": 4.9478898046073394e-05, + "loss": 5.411, + "step": 10948 + }, + { + "epoch": 0.06511680464363878, + "grad_norm": 1.5083260536193848, + "learning_rate": 4.947880316940705e-05, + "loss": 4.9143, + "step": 10949 + }, + { + "epoch": 0.06512275192691978, + "grad_norm": 1.462415099143982, + "learning_rate": 4.947870828419541e-05, + "loss": 5.0059, + "step": 10950 + }, + { + "epoch": 0.06512869921020079, + "grad_norm": 1.9356911182403564, + "learning_rate": 4.947861339043851e-05, + "loss": 5.3886, + "step": 10951 + }, + { + "epoch": 0.06513464649348177, + "grad_norm": 1.4918417930603027, + "learning_rate": 4.947851848813637e-05, + "loss": 5.3456, + "step": 10952 + }, + { + "epoch": 0.06514059377676278, + "grad_norm": 1.8015687465667725, + "learning_rate": 4.9478423577289044e-05, + "loss": 5.4599, + "step": 10953 + }, + { + "epoch": 0.06514654106004378, + "grad_norm": 1.663827657699585, + "learning_rate": 4.947832865789654e-05, + "loss": 5.4448, + "step": 10954 + }, + { + "epoch": 0.06515248834332477, + "grad_norm": 1.7196985483169556, + "learning_rate": 4.947823372995891e-05, + "loss": 5.4799, + "step": 10955 + }, + { + "epoch": 0.06515843562660577, + "grad_norm": 1.341449499130249, + "learning_rate": 4.947813879347619e-05, + "loss": 5.0305, + "step": 10956 + }, + { + "epoch": 0.06516438290988677, + "grad_norm": 1.9917103052139282, + "learning_rate": 4.9478043848448394e-05, + "loss": 4.9911, + "step": 10957 + }, + { + "epoch": 0.06517033019316776, + "grad_norm": 1.8540695905685425, + "learning_rate": 4.947794889487557e-05, + "loss": 4.9725, + "step": 10958 + }, + { + "epoch": 0.06517627747644876, + "grad_norm": 1.6755226850509644, + "learning_rate": 4.9477853932757744e-05, + "loss": 5.1452, + "step": 10959 + }, + { + "epoch": 0.06518222475972976, + "grad_norm": 1.613694667816162, + "learning_rate": 4.9477758962094954e-05, + "loss": 5.1241, + "step": 10960 + }, + { + "epoch": 0.06518817204301075, + "grad_norm": 1.4891341924667358, + "learning_rate": 4.9477663982887235e-05, + "loss": 5.2139, + "step": 10961 + }, + { + "epoch": 0.06519411932629175, + "grad_norm": 1.451180100440979, + "learning_rate": 4.947756899513461e-05, + "loss": 5.216, + "step": 10962 + }, + { + "epoch": 0.06520006660957274, + "grad_norm": 1.7225643396377563, + "learning_rate": 4.947747399883712e-05, + "loss": 4.9342, + "step": 10963 + }, + { + "epoch": 0.06520601389285374, + "grad_norm": 1.5917341709136963, + "learning_rate": 4.94773789939948e-05, + "loss": 4.9196, + "step": 10964 + }, + { + "epoch": 0.06521196117613474, + "grad_norm": 1.3010936975479126, + "learning_rate": 4.947728398060768e-05, + "loss": 4.8165, + "step": 10965 + }, + { + "epoch": 0.06521790845941573, + "grad_norm": 1.6672911643981934, + "learning_rate": 4.947718895867579e-05, + "loss": 5.082, + "step": 10966 + }, + { + "epoch": 0.06522385574269673, + "grad_norm": 1.5662728548049927, + "learning_rate": 4.947709392819916e-05, + "loss": 5.1654, + "step": 10967 + }, + { + "epoch": 0.06522980302597774, + "grad_norm": 1.3455015420913696, + "learning_rate": 4.947699888917784e-05, + "loss": 4.6897, + "step": 10968 + }, + { + "epoch": 0.06523575030925872, + "grad_norm": 1.6042569875717163, + "learning_rate": 4.947690384161185e-05, + "loss": 4.6814, + "step": 10969 + }, + { + "epoch": 0.06524169759253973, + "grad_norm": 1.436345100402832, + "learning_rate": 4.947680878550123e-05, + "loss": 4.6052, + "step": 10970 + }, + { + "epoch": 0.06524764487582073, + "grad_norm": 1.3438220024108887, + "learning_rate": 4.9476713720846e-05, + "loss": 4.6385, + "step": 10971 + }, + { + "epoch": 0.06525359215910172, + "grad_norm": 1.378206729888916, + "learning_rate": 4.94766186476462e-05, + "loss": 4.5546, + "step": 10972 + }, + { + "epoch": 0.06525953944238272, + "grad_norm": 1.5776808261871338, + "learning_rate": 4.9476523565901874e-05, + "loss": 4.7728, + "step": 10973 + }, + { + "epoch": 0.06526548672566372, + "grad_norm": 1.8892265558242798, + "learning_rate": 4.947642847561305e-05, + "loss": 5.3423, + "step": 10974 + }, + { + "epoch": 0.06527143400894471, + "grad_norm": 1.279730200767517, + "learning_rate": 4.9476333376779746e-05, + "loss": 4.649, + "step": 10975 + }, + { + "epoch": 0.06527738129222571, + "grad_norm": 1.6268417835235596, + "learning_rate": 4.947623826940201e-05, + "loss": 4.6534, + "step": 10976 + }, + { + "epoch": 0.06528332857550671, + "grad_norm": 1.4456939697265625, + "learning_rate": 4.947614315347987e-05, + "loss": 4.6636, + "step": 10977 + }, + { + "epoch": 0.0652892758587877, + "grad_norm": 1.4848358631134033, + "learning_rate": 4.947604802901337e-05, + "loss": 4.6823, + "step": 10978 + }, + { + "epoch": 0.0652952231420687, + "grad_norm": 1.4143959283828735, + "learning_rate": 4.947595289600253e-05, + "loss": 4.546, + "step": 10979 + }, + { + "epoch": 0.0653011704253497, + "grad_norm": 1.7399781942367554, + "learning_rate": 4.947585775444739e-05, + "loss": 5.1456, + "step": 10980 + }, + { + "epoch": 0.0653071177086307, + "grad_norm": 1.9160579442977905, + "learning_rate": 4.947576260434797e-05, + "loss": 5.4101, + "step": 10981 + }, + { + "epoch": 0.0653130649919117, + "grad_norm": 1.9356415271759033, + "learning_rate": 4.947566744570433e-05, + "loss": 5.6235, + "step": 10982 + }, + { + "epoch": 0.0653190122751927, + "grad_norm": 1.756996512413025, + "learning_rate": 4.947557227851648e-05, + "loss": 5.6458, + "step": 10983 + }, + { + "epoch": 0.06532495955847369, + "grad_norm": 1.790447473526001, + "learning_rate": 4.947547710278446e-05, + "loss": 5.1529, + "step": 10984 + }, + { + "epoch": 0.06533090684175469, + "grad_norm": 1.8125256299972534, + "learning_rate": 4.94753819185083e-05, + "loss": 4.8824, + "step": 10985 + }, + { + "epoch": 0.06533685412503569, + "grad_norm": 1.72708261013031, + "learning_rate": 4.947528672568804e-05, + "loss": 5.1252, + "step": 10986 + }, + { + "epoch": 0.06534280140831668, + "grad_norm": 1.5867630243301392, + "learning_rate": 4.9475191524323714e-05, + "loss": 5.2007, + "step": 10987 + }, + { + "epoch": 0.06534874869159768, + "grad_norm": 1.8278383016586304, + "learning_rate": 4.9475096314415356e-05, + "loss": 5.1268, + "step": 10988 + }, + { + "epoch": 0.06535469597487868, + "grad_norm": 1.6850647926330566, + "learning_rate": 4.947500109596298e-05, + "loss": 5.0058, + "step": 10989 + }, + { + "epoch": 0.06536064325815967, + "grad_norm": 1.4993211030960083, + "learning_rate": 4.9474905868966645e-05, + "loss": 5.1911, + "step": 10990 + }, + { + "epoch": 0.06536659054144067, + "grad_norm": 1.4816709756851196, + "learning_rate": 4.947481063342637e-05, + "loss": 5.073, + "step": 10991 + }, + { + "epoch": 0.06537253782472166, + "grad_norm": 1.5394763946533203, + "learning_rate": 4.9474715389342194e-05, + "loss": 5.3133, + "step": 10992 + }, + { + "epoch": 0.06537848510800266, + "grad_norm": 1.6095061302185059, + "learning_rate": 4.9474620136714144e-05, + "loss": 5.1657, + "step": 10993 + }, + { + "epoch": 0.06538443239128366, + "grad_norm": 1.707533597946167, + "learning_rate": 4.947452487554226e-05, + "loss": 5.2022, + "step": 10994 + }, + { + "epoch": 0.06539037967456465, + "grad_norm": 1.6304863691329956, + "learning_rate": 4.947442960582657e-05, + "loss": 5.1454, + "step": 10995 + }, + { + "epoch": 0.06539632695784565, + "grad_norm": 1.5767943859100342, + "learning_rate": 4.9474334327567103e-05, + "loss": 5.0317, + "step": 10996 + }, + { + "epoch": 0.06540227424112666, + "grad_norm": 1.6779369115829468, + "learning_rate": 4.9474239040763916e-05, + "loss": 5.1932, + "step": 10997 + }, + { + "epoch": 0.06540822152440764, + "grad_norm": 1.6607457399368286, + "learning_rate": 4.947414374541701e-05, + "loss": 5.2488, + "step": 10998 + }, + { + "epoch": 0.06541416880768865, + "grad_norm": 1.5271342992782593, + "learning_rate": 4.947404844152644e-05, + "loss": 5.2225, + "step": 10999 + }, + { + "epoch": 0.06542011609096965, + "grad_norm": 1.3633404970169067, + "learning_rate": 4.947395312909223e-05, + "loss": 5.2228, + "step": 11000 + }, + { + "epoch": 0.06542606337425064, + "grad_norm": 1.4911702871322632, + "learning_rate": 4.9473857808114416e-05, + "loss": 5.3533, + "step": 11001 + }, + { + "epoch": 0.06543201065753164, + "grad_norm": 1.350714087486267, + "learning_rate": 4.947376247859303e-05, + "loss": 5.2553, + "step": 11002 + }, + { + "epoch": 0.06543795794081264, + "grad_norm": 1.531064510345459, + "learning_rate": 4.9473667140528116e-05, + "loss": 5.0982, + "step": 11003 + }, + { + "epoch": 0.06544390522409363, + "grad_norm": 1.4037193059921265, + "learning_rate": 4.947357179391968e-05, + "loss": 5.2129, + "step": 11004 + }, + { + "epoch": 0.06544985250737463, + "grad_norm": 1.5746560096740723, + "learning_rate": 4.9473476438767784e-05, + "loss": 5.2561, + "step": 11005 + }, + { + "epoch": 0.06545579979065563, + "grad_norm": 1.4906586408615112, + "learning_rate": 4.947338107507245e-05, + "loss": 5.2584, + "step": 11006 + }, + { + "epoch": 0.06546174707393662, + "grad_norm": 1.687965989112854, + "learning_rate": 4.947328570283371e-05, + "loss": 5.0578, + "step": 11007 + }, + { + "epoch": 0.06546769435721762, + "grad_norm": 1.6732810735702515, + "learning_rate": 4.94731903220516e-05, + "loss": 5.1301, + "step": 11008 + }, + { + "epoch": 0.06547364164049863, + "grad_norm": 1.465431809425354, + "learning_rate": 4.947309493272615e-05, + "loss": 5.2479, + "step": 11009 + }, + { + "epoch": 0.06547958892377961, + "grad_norm": 1.4699040651321411, + "learning_rate": 4.94729995348574e-05, + "loss": 5.263, + "step": 11010 + }, + { + "epoch": 0.06548553620706062, + "grad_norm": 1.5757801532745361, + "learning_rate": 4.947290412844537e-05, + "loss": 5.2938, + "step": 11011 + }, + { + "epoch": 0.06549148349034162, + "grad_norm": 1.5458070039749146, + "learning_rate": 4.947280871349011e-05, + "loss": 5.2755, + "step": 11012 + }, + { + "epoch": 0.0654974307736226, + "grad_norm": 1.4919404983520508, + "learning_rate": 4.9472713289991644e-05, + "loss": 5.1432, + "step": 11013 + }, + { + "epoch": 0.06550337805690361, + "grad_norm": 1.513539433479309, + "learning_rate": 4.947261785795001e-05, + "loss": 5.3262, + "step": 11014 + }, + { + "epoch": 0.06550932534018461, + "grad_norm": 1.610257863998413, + "learning_rate": 4.947252241736523e-05, + "loss": 5.1444, + "step": 11015 + }, + { + "epoch": 0.0655152726234656, + "grad_norm": 1.5597975254058838, + "learning_rate": 4.947242696823735e-05, + "loss": 5.1581, + "step": 11016 + }, + { + "epoch": 0.0655212199067466, + "grad_norm": 1.686418056488037, + "learning_rate": 4.94723315105664e-05, + "loss": 5.1608, + "step": 11017 + }, + { + "epoch": 0.0655271671900276, + "grad_norm": 1.5329445600509644, + "learning_rate": 4.94722360443524e-05, + "loss": 5.1716, + "step": 11018 + }, + { + "epoch": 0.06553311447330859, + "grad_norm": 1.4718917608261108, + "learning_rate": 4.94721405695954e-05, + "loss": 5.0924, + "step": 11019 + }, + { + "epoch": 0.06553906175658959, + "grad_norm": 1.4442907571792603, + "learning_rate": 4.947204508629544e-05, + "loss": 5.3967, + "step": 11020 + }, + { + "epoch": 0.06554500903987058, + "grad_norm": 1.523834466934204, + "learning_rate": 4.947194959445253e-05, + "loss": 5.2068, + "step": 11021 + }, + { + "epoch": 0.06555095632315158, + "grad_norm": 1.4898262023925781, + "learning_rate": 4.947185409406672e-05, + "loss": 5.1664, + "step": 11022 + }, + { + "epoch": 0.06555690360643258, + "grad_norm": 1.504695177078247, + "learning_rate": 4.947175858513804e-05, + "loss": 5.2349, + "step": 11023 + }, + { + "epoch": 0.06556285088971357, + "grad_norm": 1.3538787364959717, + "learning_rate": 4.9471663067666516e-05, + "loss": 5.1034, + "step": 11024 + }, + { + "epoch": 0.06556879817299457, + "grad_norm": 1.3748440742492676, + "learning_rate": 4.94715675416522e-05, + "loss": 4.9759, + "step": 11025 + }, + { + "epoch": 0.06557474545627558, + "grad_norm": 1.5980280637741089, + "learning_rate": 4.94714720070951e-05, + "loss": 5.3042, + "step": 11026 + }, + { + "epoch": 0.06558069273955656, + "grad_norm": 1.641076683998108, + "learning_rate": 4.9471376463995266e-05, + "loss": 5.3373, + "step": 11027 + }, + { + "epoch": 0.06558664002283757, + "grad_norm": 1.5320390462875366, + "learning_rate": 4.947128091235273e-05, + "loss": 5.2308, + "step": 11028 + }, + { + "epoch": 0.06559258730611857, + "grad_norm": 1.5777555704116821, + "learning_rate": 4.9471185352167514e-05, + "loss": 5.2242, + "step": 11029 + }, + { + "epoch": 0.06559853458939956, + "grad_norm": 1.5055029392242432, + "learning_rate": 4.947108978343967e-05, + "loss": 5.1974, + "step": 11030 + }, + { + "epoch": 0.06560448187268056, + "grad_norm": 1.3923927545547485, + "learning_rate": 4.947099420616922e-05, + "loss": 5.3244, + "step": 11031 + }, + { + "epoch": 0.06561042915596156, + "grad_norm": 1.40999174118042, + "learning_rate": 4.9470898620356186e-05, + "loss": 5.3315, + "step": 11032 + }, + { + "epoch": 0.06561637643924255, + "grad_norm": 1.418296456336975, + "learning_rate": 4.947080302600063e-05, + "loss": 5.3942, + "step": 11033 + }, + { + "epoch": 0.06562232372252355, + "grad_norm": 1.7927478551864624, + "learning_rate": 4.9470707423102566e-05, + "loss": 5.3084, + "step": 11034 + }, + { + "epoch": 0.06562827100580455, + "grad_norm": 1.385011911392212, + "learning_rate": 4.947061181166203e-05, + "loss": 5.2043, + "step": 11035 + }, + { + "epoch": 0.06563421828908554, + "grad_norm": 1.5702954530715942, + "learning_rate": 4.9470516191679054e-05, + "loss": 5.9851, + "step": 11036 + }, + { + "epoch": 0.06564016557236654, + "grad_norm": 1.4196525812149048, + "learning_rate": 4.947042056315367e-05, + "loss": 5.2592, + "step": 11037 + }, + { + "epoch": 0.06564611285564755, + "grad_norm": 1.8318798542022705, + "learning_rate": 4.947032492608592e-05, + "loss": 5.3181, + "step": 11038 + }, + { + "epoch": 0.06565206013892853, + "grad_norm": 1.615460991859436, + "learning_rate": 4.947022928047583e-05, + "loss": 5.4053, + "step": 11039 + }, + { + "epoch": 0.06565800742220954, + "grad_norm": 1.384602427482605, + "learning_rate": 4.947013362632344e-05, + "loss": 5.3955, + "step": 11040 + }, + { + "epoch": 0.06566395470549054, + "grad_norm": 1.5959913730621338, + "learning_rate": 4.947003796362878e-05, + "loss": 5.4737, + "step": 11041 + }, + { + "epoch": 0.06566990198877153, + "grad_norm": 1.483659029006958, + "learning_rate": 4.946994229239188e-05, + "loss": 5.3804, + "step": 11042 + }, + { + "epoch": 0.06567584927205253, + "grad_norm": 1.2752004861831665, + "learning_rate": 4.946984661261277e-05, + "loss": 5.3806, + "step": 11043 + }, + { + "epoch": 0.06568179655533353, + "grad_norm": 2.0671582221984863, + "learning_rate": 4.946975092429149e-05, + "loss": 5.3047, + "step": 11044 + }, + { + "epoch": 0.06568774383861452, + "grad_norm": 1.6126081943511963, + "learning_rate": 4.946965522742808e-05, + "loss": 5.1905, + "step": 11045 + }, + { + "epoch": 0.06569369112189552, + "grad_norm": 1.6867598295211792, + "learning_rate": 4.946955952202257e-05, + "loss": 5.1543, + "step": 11046 + }, + { + "epoch": 0.06569963840517652, + "grad_norm": 1.3493974208831787, + "learning_rate": 4.946946380807498e-05, + "loss": 5.1527, + "step": 11047 + }, + { + "epoch": 0.06570558568845751, + "grad_norm": 1.4694898128509521, + "learning_rate": 4.946936808558536e-05, + "loss": 5.238, + "step": 11048 + }, + { + "epoch": 0.06571153297173851, + "grad_norm": 1.7940189838409424, + "learning_rate": 4.946927235455373e-05, + "loss": 5.0666, + "step": 11049 + }, + { + "epoch": 0.0657174802550195, + "grad_norm": 1.7015198469161987, + "learning_rate": 4.946917661498013e-05, + "loss": 5.5182, + "step": 11050 + }, + { + "epoch": 0.0657234275383005, + "grad_norm": 2.214686632156372, + "learning_rate": 4.946908086686459e-05, + "loss": 5.9424, + "step": 11051 + }, + { + "epoch": 0.0657293748215815, + "grad_norm": 1.7855008840560913, + "learning_rate": 4.9468985110207154e-05, + "loss": 5.8496, + "step": 11052 + }, + { + "epoch": 0.06573532210486249, + "grad_norm": 1.8354082107543945, + "learning_rate": 4.946888934500785e-05, + "loss": 5.8044, + "step": 11053 + }, + { + "epoch": 0.0657412693881435, + "grad_norm": 2.0321154594421387, + "learning_rate": 4.9468793571266705e-05, + "loss": 5.9488, + "step": 11054 + }, + { + "epoch": 0.0657472166714245, + "grad_norm": 2.2285213470458984, + "learning_rate": 4.946869778898376e-05, + "loss": 5.1819, + "step": 11055 + }, + { + "epoch": 0.06575316395470548, + "grad_norm": 1.9831287860870361, + "learning_rate": 4.946860199815904e-05, + "loss": 5.2068, + "step": 11056 + }, + { + "epoch": 0.06575911123798649, + "grad_norm": 2.1150667667388916, + "learning_rate": 4.946850619879259e-05, + "loss": 5.1523, + "step": 11057 + }, + { + "epoch": 0.06576505852126749, + "grad_norm": 1.9136968851089478, + "learning_rate": 4.946841039088444e-05, + "loss": 5.0084, + "step": 11058 + }, + { + "epoch": 0.06577100580454848, + "grad_norm": 1.9802511930465698, + "learning_rate": 4.9468314574434604e-05, + "loss": 4.9223, + "step": 11059 + }, + { + "epoch": 0.06577695308782948, + "grad_norm": 1.940656065940857, + "learning_rate": 4.946821874944315e-05, + "loss": 4.9662, + "step": 11060 + }, + { + "epoch": 0.06578290037111048, + "grad_norm": 1.8476706743240356, + "learning_rate": 4.9468122915910084e-05, + "loss": 4.8863, + "step": 11061 + }, + { + "epoch": 0.06578884765439147, + "grad_norm": 2.0490243434906006, + "learning_rate": 4.946802707383546e-05, + "loss": 4.8459, + "step": 11062 + }, + { + "epoch": 0.06579479493767247, + "grad_norm": 1.8996137380599976, + "learning_rate": 4.946793122321928e-05, + "loss": 4.7574, + "step": 11063 + }, + { + "epoch": 0.06580074222095347, + "grad_norm": 1.8910033702850342, + "learning_rate": 4.946783536406161e-05, + "loss": 4.8808, + "step": 11064 + }, + { + "epoch": 0.06580668950423446, + "grad_norm": 2.123816967010498, + "learning_rate": 4.946773949636247e-05, + "loss": 4.8486, + "step": 11065 + }, + { + "epoch": 0.06581263678751546, + "grad_norm": 1.7508260011672974, + "learning_rate": 4.9467643620121906e-05, + "loss": 4.9856, + "step": 11066 + }, + { + "epoch": 0.06581858407079647, + "grad_norm": 1.728398084640503, + "learning_rate": 4.9467547735339926e-05, + "loss": 4.9634, + "step": 11067 + }, + { + "epoch": 0.06582453135407745, + "grad_norm": 2.1020689010620117, + "learning_rate": 4.946745184201659e-05, + "loss": 4.6133, + "step": 11068 + }, + { + "epoch": 0.06583047863735846, + "grad_norm": 2.106549024581909, + "learning_rate": 4.9467355940151904e-05, + "loss": 4.7124, + "step": 11069 + }, + { + "epoch": 0.06583642592063946, + "grad_norm": 2.078505039215088, + "learning_rate": 4.9467260029745924e-05, + "loss": 4.5828, + "step": 11070 + }, + { + "epoch": 0.06584237320392045, + "grad_norm": 1.987950325012207, + "learning_rate": 4.946716411079868e-05, + "loss": 4.5823, + "step": 11071 + }, + { + "epoch": 0.06584832048720145, + "grad_norm": 1.9027208089828491, + "learning_rate": 4.94670681833102e-05, + "loss": 4.8063, + "step": 11072 + }, + { + "epoch": 0.06585426777048245, + "grad_norm": 2.001823902130127, + "learning_rate": 4.946697224728052e-05, + "loss": 4.5405, + "step": 11073 + }, + { + "epoch": 0.06586021505376344, + "grad_norm": 2.1472394466400146, + "learning_rate": 4.946687630270967e-05, + "loss": 4.6565, + "step": 11074 + }, + { + "epoch": 0.06586616233704444, + "grad_norm": 2.0731146335601807, + "learning_rate": 4.946678034959769e-05, + "loss": 4.5022, + "step": 11075 + }, + { + "epoch": 0.06587210962032544, + "grad_norm": 2.0769810676574707, + "learning_rate": 4.946668438794461e-05, + "loss": 4.5248, + "step": 11076 + }, + { + "epoch": 0.06587805690360643, + "grad_norm": 2.183871269226074, + "learning_rate": 4.946658841775046e-05, + "loss": 4.5723, + "step": 11077 + }, + { + "epoch": 0.06588400418688743, + "grad_norm": 2.0304160118103027, + "learning_rate": 4.9466492439015275e-05, + "loss": 4.5928, + "step": 11078 + }, + { + "epoch": 0.06588995147016842, + "grad_norm": 1.9167170524597168, + "learning_rate": 4.94663964517391e-05, + "loss": 4.4162, + "step": 11079 + }, + { + "epoch": 0.06589589875344942, + "grad_norm": 2.1295299530029297, + "learning_rate": 4.9466300455921946e-05, + "loss": 4.6662, + "step": 11080 + }, + { + "epoch": 0.06590184603673042, + "grad_norm": 2.180253744125366, + "learning_rate": 4.946620445156386e-05, + "loss": 4.5101, + "step": 11081 + }, + { + "epoch": 0.06590779332001141, + "grad_norm": 1.887289047241211, + "learning_rate": 4.9466108438664885e-05, + "loss": 4.3611, + "step": 11082 + }, + { + "epoch": 0.06591374060329241, + "grad_norm": 1.8323948383331299, + "learning_rate": 4.946601241722504e-05, + "loss": 4.8711, + "step": 11083 + }, + { + "epoch": 0.06591968788657342, + "grad_norm": 1.944860577583313, + "learning_rate": 4.946591638724436e-05, + "loss": 4.5288, + "step": 11084 + }, + { + "epoch": 0.0659256351698544, + "grad_norm": 1.9748528003692627, + "learning_rate": 4.946582034872288e-05, + "loss": 4.3819, + "step": 11085 + }, + { + "epoch": 0.0659315824531354, + "grad_norm": 2.017582416534424, + "learning_rate": 4.9465724301660635e-05, + "loss": 4.4508, + "step": 11086 + }, + { + "epoch": 0.06593752973641641, + "grad_norm": 1.8043986558914185, + "learning_rate": 4.946562824605766e-05, + "loss": 4.5948, + "step": 11087 + }, + { + "epoch": 0.0659434770196974, + "grad_norm": 1.8695666790008545, + "learning_rate": 4.946553218191399e-05, + "loss": 4.2691, + "step": 11088 + }, + { + "epoch": 0.0659494243029784, + "grad_norm": 2.027717351913452, + "learning_rate": 4.9465436109229656e-05, + "loss": 4.4152, + "step": 11089 + }, + { + "epoch": 0.0659553715862594, + "grad_norm": 1.989127278327942, + "learning_rate": 4.946534002800469e-05, + "loss": 4.5155, + "step": 11090 + }, + { + "epoch": 0.06596131886954039, + "grad_norm": 1.9889907836914062, + "learning_rate": 4.9465243938239124e-05, + "loss": 4.4047, + "step": 11091 + }, + { + "epoch": 0.06596726615282139, + "grad_norm": 2.077021837234497, + "learning_rate": 4.946514783993299e-05, + "loss": 4.5199, + "step": 11092 + }, + { + "epoch": 0.0659732134361024, + "grad_norm": 1.9180271625518799, + "learning_rate": 4.946505173308633e-05, + "loss": 4.4511, + "step": 11093 + }, + { + "epoch": 0.06597916071938338, + "grad_norm": 2.120338201522827, + "learning_rate": 4.946495561769918e-05, + "loss": 4.3034, + "step": 11094 + }, + { + "epoch": 0.06598510800266438, + "grad_norm": 1.9632322788238525, + "learning_rate": 4.946485949377156e-05, + "loss": 5.2411, + "step": 11095 + }, + { + "epoch": 0.06599105528594539, + "grad_norm": 2.0921249389648438, + "learning_rate": 4.946476336130351e-05, + "loss": 4.5768, + "step": 11096 + }, + { + "epoch": 0.06599700256922637, + "grad_norm": 2.1472532749176025, + "learning_rate": 4.9464667220295066e-05, + "loss": 4.6279, + "step": 11097 + }, + { + "epoch": 0.06600294985250738, + "grad_norm": 2.472062349319458, + "learning_rate": 4.946457107074626e-05, + "loss": 5.703, + "step": 11098 + }, + { + "epoch": 0.06600889713578838, + "grad_norm": 1.8995217084884644, + "learning_rate": 4.946447491265712e-05, + "loss": 4.5265, + "step": 11099 + }, + { + "epoch": 0.06601484441906937, + "grad_norm": 2.173339605331421, + "learning_rate": 4.946437874602769e-05, + "loss": 4.5356, + "step": 11100 + }, + { + "epoch": 0.06602079170235037, + "grad_norm": 1.8179867267608643, + "learning_rate": 4.9464282570858e-05, + "loss": 4.3765, + "step": 11101 + }, + { + "epoch": 0.06602673898563137, + "grad_norm": 2.367713212966919, + "learning_rate": 4.946418638714808e-05, + "loss": 5.6831, + "step": 11102 + }, + { + "epoch": 0.06603268626891236, + "grad_norm": 2.3576571941375732, + "learning_rate": 4.9464090194897964e-05, + "loss": 5.563, + "step": 11103 + }, + { + "epoch": 0.06603863355219336, + "grad_norm": 2.0476090908050537, + "learning_rate": 4.946399399410768e-05, + "loss": 5.7503, + "step": 11104 + }, + { + "epoch": 0.06604458083547436, + "grad_norm": 2.104295253753662, + "learning_rate": 4.946389778477728e-05, + "loss": 5.669, + "step": 11105 + }, + { + "epoch": 0.06605052811875535, + "grad_norm": 2.1458580493927, + "learning_rate": 4.946380156690677e-05, + "loss": 5.5317, + "step": 11106 + }, + { + "epoch": 0.06605647540203635, + "grad_norm": 2.0373425483703613, + "learning_rate": 4.946370534049621e-05, + "loss": 5.5952, + "step": 11107 + }, + { + "epoch": 0.06606242268531734, + "grad_norm": 2.232574701309204, + "learning_rate": 4.946360910554563e-05, + "loss": 5.6076, + "step": 11108 + }, + { + "epoch": 0.06606836996859834, + "grad_norm": 2.1477861404418945, + "learning_rate": 4.946351286205505e-05, + "loss": 5.5862, + "step": 11109 + }, + { + "epoch": 0.06607431725187934, + "grad_norm": 2.105203866958618, + "learning_rate": 4.946341661002451e-05, + "loss": 5.5089, + "step": 11110 + }, + { + "epoch": 0.06608026453516033, + "grad_norm": 2.1524410247802734, + "learning_rate": 4.9463320349454047e-05, + "loss": 5.419, + "step": 11111 + }, + { + "epoch": 0.06608621181844133, + "grad_norm": 2.132504463195801, + "learning_rate": 4.946322408034369e-05, + "loss": 5.3421, + "step": 11112 + }, + { + "epoch": 0.06609215910172234, + "grad_norm": 1.7870386838912964, + "learning_rate": 4.9463127802693474e-05, + "loss": 5.1829, + "step": 11113 + }, + { + "epoch": 0.06609810638500332, + "grad_norm": 1.9586358070373535, + "learning_rate": 4.946303151650343e-05, + "loss": 5.228, + "step": 11114 + }, + { + "epoch": 0.06610405366828433, + "grad_norm": 2.092473030090332, + "learning_rate": 4.9462935221773594e-05, + "loss": 5.4616, + "step": 11115 + }, + { + "epoch": 0.06611000095156533, + "grad_norm": 2.204131603240967, + "learning_rate": 4.946283891850401e-05, + "loss": 5.4552, + "step": 11116 + }, + { + "epoch": 0.06611594823484632, + "grad_norm": 1.998795747756958, + "learning_rate": 4.946274260669469e-05, + "loss": 5.5193, + "step": 11117 + }, + { + "epoch": 0.06612189551812732, + "grad_norm": 1.9446638822555542, + "learning_rate": 4.9462646286345684e-05, + "loss": 5.3923, + "step": 11118 + }, + { + "epoch": 0.06612784280140832, + "grad_norm": 1.828114628791809, + "learning_rate": 4.946254995745702e-05, + "loss": 5.4306, + "step": 11119 + }, + { + "epoch": 0.06613379008468931, + "grad_norm": 2.1322944164276123, + "learning_rate": 4.946245362002873e-05, + "loss": 5.3831, + "step": 11120 + }, + { + "epoch": 0.06613973736797031, + "grad_norm": 2.1194324493408203, + "learning_rate": 4.9462357274060856e-05, + "loss": 5.2805, + "step": 11121 + }, + { + "epoch": 0.06614568465125131, + "grad_norm": 2.011417865753174, + "learning_rate": 4.946226091955342e-05, + "loss": 5.3052, + "step": 11122 + }, + { + "epoch": 0.0661516319345323, + "grad_norm": 2.202887773513794, + "learning_rate": 4.9462164556506464e-05, + "loss": 5.5263, + "step": 11123 + }, + { + "epoch": 0.0661575792178133, + "grad_norm": 2.075645685195923, + "learning_rate": 4.946206818492002e-05, + "loss": 5.1033, + "step": 11124 + }, + { + "epoch": 0.0661635265010943, + "grad_norm": 2.0723443031311035, + "learning_rate": 4.946197180479412e-05, + "loss": 4.8365, + "step": 11125 + }, + { + "epoch": 0.0661694737843753, + "grad_norm": 2.245961904525757, + "learning_rate": 4.94618754161288e-05, + "loss": 5.0123, + "step": 11126 + }, + { + "epoch": 0.0661754210676563, + "grad_norm": 2.0513699054718018, + "learning_rate": 4.9461779018924096e-05, + "loss": 4.9909, + "step": 11127 + }, + { + "epoch": 0.0661813683509373, + "grad_norm": 2.1552181243896484, + "learning_rate": 4.9461682613180024e-05, + "loss": 5.165, + "step": 11128 + }, + { + "epoch": 0.06618731563421829, + "grad_norm": 2.1207263469696045, + "learning_rate": 4.946158619889664e-05, + "loss": 5.3254, + "step": 11129 + }, + { + "epoch": 0.06619326291749929, + "grad_norm": 1.8278319835662842, + "learning_rate": 4.946148977607397e-05, + "loss": 5.2462, + "step": 11130 + }, + { + "epoch": 0.06619921020078029, + "grad_norm": 2.434661865234375, + "learning_rate": 4.9461393344712046e-05, + "loss": 5.28, + "step": 11131 + }, + { + "epoch": 0.06620515748406128, + "grad_norm": 2.3434953689575195, + "learning_rate": 4.9461296904810904e-05, + "loss": 5.112, + "step": 11132 + }, + { + "epoch": 0.06621110476734228, + "grad_norm": 2.010430335998535, + "learning_rate": 4.946120045637057e-05, + "loss": 5.1236, + "step": 11133 + }, + { + "epoch": 0.06621705205062328, + "grad_norm": 2.19608736038208, + "learning_rate": 4.946110399939109e-05, + "loss": 5.122, + "step": 11134 + }, + { + "epoch": 0.06622299933390427, + "grad_norm": 1.9471449851989746, + "learning_rate": 4.946100753387249e-05, + "loss": 5.2849, + "step": 11135 + }, + { + "epoch": 0.06622894661718527, + "grad_norm": 2.0541727542877197, + "learning_rate": 4.94609110598148e-05, + "loss": 5.4196, + "step": 11136 + }, + { + "epoch": 0.06623489390046626, + "grad_norm": 2.268826723098755, + "learning_rate": 4.946081457721806e-05, + "loss": 5.449, + "step": 11137 + }, + { + "epoch": 0.06624084118374726, + "grad_norm": 2.075227975845337, + "learning_rate": 4.9460718086082307e-05, + "loss": 5.5463, + "step": 11138 + }, + { + "epoch": 0.06624678846702826, + "grad_norm": 2.0949649810791016, + "learning_rate": 4.9460621586407567e-05, + "loss": 5.3737, + "step": 11139 + }, + { + "epoch": 0.06625273575030925, + "grad_norm": 2.1247878074645996, + "learning_rate": 4.9460525078193874e-05, + "loss": 5.2766, + "step": 11140 + }, + { + "epoch": 0.06625868303359025, + "grad_norm": 1.8304489850997925, + "learning_rate": 4.9460428561441276e-05, + "loss": 5.181, + "step": 11141 + }, + { + "epoch": 0.06626463031687126, + "grad_norm": 2.160853862762451, + "learning_rate": 4.946033203614978e-05, + "loss": 5.5222, + "step": 11142 + }, + { + "epoch": 0.06627057760015224, + "grad_norm": 1.9857962131500244, + "learning_rate": 4.9460235502319446e-05, + "loss": 5.574, + "step": 11143 + }, + { + "epoch": 0.06627652488343325, + "grad_norm": 2.016709804534912, + "learning_rate": 4.9460138959950294e-05, + "loss": 5.5255, + "step": 11144 + }, + { + "epoch": 0.06628247216671425, + "grad_norm": 1.8675861358642578, + "learning_rate": 4.946004240904235e-05, + "loss": 5.3604, + "step": 11145 + }, + { + "epoch": 0.06628841944999524, + "grad_norm": 1.9159897565841675, + "learning_rate": 4.945994584959567e-05, + "loss": 5.5348, + "step": 11146 + }, + { + "epoch": 0.06629436673327624, + "grad_norm": 2.0460150241851807, + "learning_rate": 4.945984928161027e-05, + "loss": 5.3267, + "step": 11147 + }, + { + "epoch": 0.06630031401655724, + "grad_norm": 1.8361427783966064, + "learning_rate": 4.9459752705086196e-05, + "loss": 5.3309, + "step": 11148 + }, + { + "epoch": 0.06630626129983823, + "grad_norm": 1.5448495149612427, + "learning_rate": 4.945965612002347e-05, + "loss": 5.0789, + "step": 11149 + }, + { + "epoch": 0.06631220858311923, + "grad_norm": 1.4580925703048706, + "learning_rate": 4.9459559526422125e-05, + "loss": 5.2011, + "step": 11150 + }, + { + "epoch": 0.06631815586640023, + "grad_norm": 1.606593370437622, + "learning_rate": 4.945946292428221e-05, + "loss": 5.2061, + "step": 11151 + }, + { + "epoch": 0.06632410314968122, + "grad_norm": 1.4270994663238525, + "learning_rate": 4.945936631360375e-05, + "loss": 5.089, + "step": 11152 + }, + { + "epoch": 0.06633005043296222, + "grad_norm": 1.6082873344421387, + "learning_rate": 4.9459269694386766e-05, + "loss": 5.2502, + "step": 11153 + }, + { + "epoch": 0.06633599771624323, + "grad_norm": 1.5378412008285522, + "learning_rate": 4.945917306663131e-05, + "loss": 5.4431, + "step": 11154 + }, + { + "epoch": 0.06634194499952421, + "grad_norm": 1.2726879119873047, + "learning_rate": 4.9459076430337416e-05, + "loss": 5.4568, + "step": 11155 + }, + { + "epoch": 0.06634789228280522, + "grad_norm": 1.6131432056427002, + "learning_rate": 4.94589797855051e-05, + "loss": 5.2507, + "step": 11156 + }, + { + "epoch": 0.06635383956608622, + "grad_norm": 1.5835362672805786, + "learning_rate": 4.945888313213442e-05, + "loss": 5.1122, + "step": 11157 + }, + { + "epoch": 0.0663597868493672, + "grad_norm": 1.5903444290161133, + "learning_rate": 4.945878647022539e-05, + "loss": 5.3236, + "step": 11158 + }, + { + "epoch": 0.06636573413264821, + "grad_norm": 1.7948551177978516, + "learning_rate": 4.945868979977805e-05, + "loss": 5.5939, + "step": 11159 + }, + { + "epoch": 0.06637168141592921, + "grad_norm": 2.1183457374572754, + "learning_rate": 4.945859312079243e-05, + "loss": 5.3639, + "step": 11160 + }, + { + "epoch": 0.0663776286992102, + "grad_norm": 1.5584137439727783, + "learning_rate": 4.945849643326857e-05, + "loss": 5.4302, + "step": 11161 + }, + { + "epoch": 0.0663835759824912, + "grad_norm": 1.5150829553604126, + "learning_rate": 4.9458399737206504e-05, + "loss": 5.2485, + "step": 11162 + }, + { + "epoch": 0.0663895232657722, + "grad_norm": 1.421235203742981, + "learning_rate": 4.9458303032606264e-05, + "loss": 5.2149, + "step": 11163 + }, + { + "epoch": 0.06639547054905319, + "grad_norm": 1.640207052230835, + "learning_rate": 4.945820631946788e-05, + "loss": 5.2807, + "step": 11164 + }, + { + "epoch": 0.06640141783233419, + "grad_norm": 1.5021215677261353, + "learning_rate": 4.945810959779139e-05, + "loss": 5.3684, + "step": 11165 + }, + { + "epoch": 0.06640736511561518, + "grad_norm": 1.802828073501587, + "learning_rate": 4.945801286757682e-05, + "loss": 5.2153, + "step": 11166 + }, + { + "epoch": 0.06641331239889618, + "grad_norm": 1.556386947631836, + "learning_rate": 4.945791612882422e-05, + "loss": 5.1908, + "step": 11167 + }, + { + "epoch": 0.06641925968217718, + "grad_norm": 1.5906118154525757, + "learning_rate": 4.9457819381533616e-05, + "loss": 5.2183, + "step": 11168 + }, + { + "epoch": 0.06642520696545817, + "grad_norm": 1.5778700113296509, + "learning_rate": 4.945772262570503e-05, + "loss": 5.2465, + "step": 11169 + }, + { + "epoch": 0.06643115424873917, + "grad_norm": 1.4705984592437744, + "learning_rate": 4.945762586133852e-05, + "loss": 5.1496, + "step": 11170 + }, + { + "epoch": 0.06643710153202018, + "grad_norm": 1.5118781328201294, + "learning_rate": 4.9457529088434093e-05, + "loss": 5.1764, + "step": 11171 + }, + { + "epoch": 0.06644304881530116, + "grad_norm": 1.5784192085266113, + "learning_rate": 4.94574323069918e-05, + "loss": 5.165, + "step": 11172 + }, + { + "epoch": 0.06644899609858217, + "grad_norm": 1.517220139503479, + "learning_rate": 4.9457335517011666e-05, + "loss": 5.1718, + "step": 11173 + }, + { + "epoch": 0.06645494338186317, + "grad_norm": 1.3823192119598389, + "learning_rate": 4.9457238718493734e-05, + "loss": 5.1945, + "step": 11174 + }, + { + "epoch": 0.06646089066514416, + "grad_norm": 1.4499212503433228, + "learning_rate": 4.945714191143803e-05, + "loss": 5.1044, + "step": 11175 + }, + { + "epoch": 0.06646683794842516, + "grad_norm": 1.4904807806015015, + "learning_rate": 4.945704509584459e-05, + "loss": 5.1781, + "step": 11176 + }, + { + "epoch": 0.06647278523170616, + "grad_norm": 1.6798325777053833, + "learning_rate": 4.945694827171345e-05, + "loss": 4.8879, + "step": 11177 + }, + { + "epoch": 0.06647873251498715, + "grad_norm": 1.3890799283981323, + "learning_rate": 4.945685143904464e-05, + "loss": 4.9941, + "step": 11178 + }, + { + "epoch": 0.06648467979826815, + "grad_norm": 1.4167201519012451, + "learning_rate": 4.94567545978382e-05, + "loss": 5.016, + "step": 11179 + }, + { + "epoch": 0.06649062708154915, + "grad_norm": 1.5122467279434204, + "learning_rate": 4.9456657748094145e-05, + "loss": 4.9937, + "step": 11180 + }, + { + "epoch": 0.06649657436483014, + "grad_norm": 1.4347165822982788, + "learning_rate": 4.9456560889812543e-05, + "loss": 5.0486, + "step": 11181 + }, + { + "epoch": 0.06650252164811114, + "grad_norm": 1.6328964233398438, + "learning_rate": 4.94564640229934e-05, + "loss": 5.1891, + "step": 11182 + }, + { + "epoch": 0.06650846893139215, + "grad_norm": 1.5832617282867432, + "learning_rate": 4.9456367147636765e-05, + "loss": 5.2947, + "step": 11183 + }, + { + "epoch": 0.06651441621467313, + "grad_norm": 1.6932839155197144, + "learning_rate": 4.9456270263742655e-05, + "loss": 5.0755, + "step": 11184 + }, + { + "epoch": 0.06652036349795414, + "grad_norm": 1.6238216161727905, + "learning_rate": 4.945617337131111e-05, + "loss": 5.1903, + "step": 11185 + }, + { + "epoch": 0.06652631078123514, + "grad_norm": 2.362353801727295, + "learning_rate": 4.945607647034218e-05, + "loss": 5.3641, + "step": 11186 + }, + { + "epoch": 0.06653225806451613, + "grad_norm": 1.6447978019714355, + "learning_rate": 4.9455979560835874e-05, + "loss": 5.0174, + "step": 11187 + }, + { + "epoch": 0.06653820534779713, + "grad_norm": 1.6059958934783936, + "learning_rate": 4.945588264279225e-05, + "loss": 4.884, + "step": 11188 + }, + { + "epoch": 0.06654415263107813, + "grad_norm": 1.6291608810424805, + "learning_rate": 4.9455785716211325e-05, + "loss": 4.9735, + "step": 11189 + }, + { + "epoch": 0.06655009991435912, + "grad_norm": 1.6926389932632446, + "learning_rate": 4.9455688781093135e-05, + "loss": 4.9294, + "step": 11190 + }, + { + "epoch": 0.06655604719764012, + "grad_norm": 1.5816938877105713, + "learning_rate": 4.945559183743772e-05, + "loss": 4.9161, + "step": 11191 + }, + { + "epoch": 0.06656199448092112, + "grad_norm": 1.5514836311340332, + "learning_rate": 4.9455494885245115e-05, + "loss": 4.9102, + "step": 11192 + }, + { + "epoch": 0.06656794176420211, + "grad_norm": 1.6787114143371582, + "learning_rate": 4.9455397924515346e-05, + "loss": 4.9628, + "step": 11193 + }, + { + "epoch": 0.06657388904748311, + "grad_norm": 1.5264941453933716, + "learning_rate": 4.945530095524844e-05, + "loss": 5.1685, + "step": 11194 + }, + { + "epoch": 0.06657983633076411, + "grad_norm": 1.80072820186615, + "learning_rate": 4.945520397744445e-05, + "loss": 4.8308, + "step": 11195 + }, + { + "epoch": 0.0665857836140451, + "grad_norm": 1.7497553825378418, + "learning_rate": 4.945510699110341e-05, + "loss": 4.8846, + "step": 11196 + }, + { + "epoch": 0.0665917308973261, + "grad_norm": 1.8938134908676147, + "learning_rate": 4.945500999622533e-05, + "loss": 4.8303, + "step": 11197 + }, + { + "epoch": 0.06659767818060709, + "grad_norm": 1.7286055088043213, + "learning_rate": 4.9454912992810264e-05, + "loss": 4.7686, + "step": 11198 + }, + { + "epoch": 0.0666036254638881, + "grad_norm": 1.7573840618133545, + "learning_rate": 4.945481598085824e-05, + "loss": 4.7527, + "step": 11199 + }, + { + "epoch": 0.0666095727471691, + "grad_norm": 1.9013001918792725, + "learning_rate": 4.94547189603693e-05, + "loss": 5.0987, + "step": 11200 + }, + { + "epoch": 0.06661552003045008, + "grad_norm": 1.5453308820724487, + "learning_rate": 4.945462193134346e-05, + "loss": 5.3799, + "step": 11201 + }, + { + "epoch": 0.06662146731373109, + "grad_norm": 1.763839602470398, + "learning_rate": 4.945452489378076e-05, + "loss": 5.2904, + "step": 11202 + }, + { + "epoch": 0.06662741459701209, + "grad_norm": 1.650407075881958, + "learning_rate": 4.945442784768125e-05, + "loss": 5.3007, + "step": 11203 + }, + { + "epoch": 0.06663336188029308, + "grad_norm": 1.6620690822601318, + "learning_rate": 4.945433079304495e-05, + "loss": 5.394, + "step": 11204 + }, + { + "epoch": 0.06663930916357408, + "grad_norm": 1.5000416040420532, + "learning_rate": 4.945423372987189e-05, + "loss": 5.0648, + "step": 11205 + }, + { + "epoch": 0.06664525644685508, + "grad_norm": 2.1791460514068604, + "learning_rate": 4.945413665816211e-05, + "loss": 5.5261, + "step": 11206 + }, + { + "epoch": 0.06665120373013607, + "grad_norm": 2.084258556365967, + "learning_rate": 4.945403957791565e-05, + "loss": 5.5796, + "step": 11207 + }, + { + "epoch": 0.06665715101341707, + "grad_norm": 1.9391356706619263, + "learning_rate": 4.945394248913253e-05, + "loss": 5.4855, + "step": 11208 + }, + { + "epoch": 0.06666309829669807, + "grad_norm": 1.8323030471801758, + "learning_rate": 4.9453845391812803e-05, + "loss": 5.5711, + "step": 11209 + }, + { + "epoch": 0.06666904557997906, + "grad_norm": 1.9193792343139648, + "learning_rate": 4.945374828595648e-05, + "loss": 5.2585, + "step": 11210 + }, + { + "epoch": 0.06667499286326006, + "grad_norm": 1.7111014127731323, + "learning_rate": 4.9453651171563606e-05, + "loss": 5.1965, + "step": 11211 + }, + { + "epoch": 0.06668094014654107, + "grad_norm": 1.8574761152267456, + "learning_rate": 4.9453554048634224e-05, + "loss": 5.2538, + "step": 11212 + }, + { + "epoch": 0.06668688742982205, + "grad_norm": 2.18009352684021, + "learning_rate": 4.945345691716835e-05, + "loss": 5.2486, + "step": 11213 + }, + { + "epoch": 0.06669283471310306, + "grad_norm": 2.167819023132324, + "learning_rate": 4.945335977716603e-05, + "loss": 5.1877, + "step": 11214 + }, + { + "epoch": 0.06669878199638406, + "grad_norm": 2.086603879928589, + "learning_rate": 4.9453262628627297e-05, + "loss": 5.32, + "step": 11215 + }, + { + "epoch": 0.06670472927966505, + "grad_norm": 2.239917039871216, + "learning_rate": 4.945316547155218e-05, + "loss": 5.5289, + "step": 11216 + }, + { + "epoch": 0.06671067656294605, + "grad_norm": 1.9402177333831787, + "learning_rate": 4.945306830594072e-05, + "loss": 5.5159, + "step": 11217 + }, + { + "epoch": 0.06671662384622705, + "grad_norm": 2.2730953693389893, + "learning_rate": 4.945297113179294e-05, + "loss": 5.5132, + "step": 11218 + }, + { + "epoch": 0.06672257112950804, + "grad_norm": 2.4021079540252686, + "learning_rate": 4.945287394910888e-05, + "loss": 5.7505, + "step": 11219 + }, + { + "epoch": 0.06672851841278904, + "grad_norm": 1.8272559642791748, + "learning_rate": 4.945277675788859e-05, + "loss": 5.7324, + "step": 11220 + }, + { + "epoch": 0.06673446569607004, + "grad_norm": 1.641192078590393, + "learning_rate": 4.945267955813206e-05, + "loss": 5.7665, + "step": 11221 + }, + { + "epoch": 0.06674041297935103, + "grad_norm": 2.1081202030181885, + "learning_rate": 4.945258234983938e-05, + "loss": 5.3633, + "step": 11222 + }, + { + "epoch": 0.06674636026263203, + "grad_norm": 1.7172397375106812, + "learning_rate": 4.945248513301054e-05, + "loss": 5.775, + "step": 11223 + }, + { + "epoch": 0.06675230754591303, + "grad_norm": 1.9968703985214233, + "learning_rate": 4.9452387907645594e-05, + "loss": 5.4817, + "step": 11224 + }, + { + "epoch": 0.06675825482919402, + "grad_norm": 1.9165494441986084, + "learning_rate": 4.9452290673744575e-05, + "loss": 5.6977, + "step": 11225 + }, + { + "epoch": 0.06676420211247502, + "grad_norm": 1.832783579826355, + "learning_rate": 4.945219343130751e-05, + "loss": 5.2065, + "step": 11226 + }, + { + "epoch": 0.06677014939575601, + "grad_norm": 2.073590040206909, + "learning_rate": 4.945209618033444e-05, + "loss": 5.0158, + "step": 11227 + }, + { + "epoch": 0.06677609667903701, + "grad_norm": 2.0305895805358887, + "learning_rate": 4.9451998920825395e-05, + "loss": 4.8452, + "step": 11228 + }, + { + "epoch": 0.06678204396231802, + "grad_norm": 1.8843696117401123, + "learning_rate": 4.945190165278041e-05, + "loss": 5.5082, + "step": 11229 + }, + { + "epoch": 0.066787991245599, + "grad_norm": 1.66866934299469, + "learning_rate": 4.945180437619951e-05, + "loss": 5.4151, + "step": 11230 + }, + { + "epoch": 0.06679393852888, + "grad_norm": 1.8018205165863037, + "learning_rate": 4.9451707091082746e-05, + "loss": 5.124, + "step": 11231 + }, + { + "epoch": 0.06679988581216101, + "grad_norm": 1.760339379310608, + "learning_rate": 4.9451609797430146e-05, + "loss": 4.9834, + "step": 11232 + }, + { + "epoch": 0.066805833095442, + "grad_norm": 1.609376072883606, + "learning_rate": 4.945151249524174e-05, + "loss": 5.0217, + "step": 11233 + }, + { + "epoch": 0.066811780378723, + "grad_norm": 1.5468369722366333, + "learning_rate": 4.9451415184517556e-05, + "loss": 5.1881, + "step": 11234 + }, + { + "epoch": 0.066817727662004, + "grad_norm": 1.2027482986450195, + "learning_rate": 4.945131786525764e-05, + "loss": 5.1014, + "step": 11235 + }, + { + "epoch": 0.06682367494528499, + "grad_norm": 1.6050941944122314, + "learning_rate": 4.945122053746203e-05, + "loss": 5.0314, + "step": 11236 + }, + { + "epoch": 0.06682962222856599, + "grad_norm": 1.4980865716934204, + "learning_rate": 4.9451123201130746e-05, + "loss": 4.9371, + "step": 11237 + }, + { + "epoch": 0.06683556951184699, + "grad_norm": 1.6754953861236572, + "learning_rate": 4.9451025856263824e-05, + "loss": 4.9733, + "step": 11238 + }, + { + "epoch": 0.06684151679512798, + "grad_norm": 1.5051567554473877, + "learning_rate": 4.9450928502861303e-05, + "loss": 4.8994, + "step": 11239 + }, + { + "epoch": 0.06684746407840898, + "grad_norm": 1.5211920738220215, + "learning_rate": 4.945083114092321e-05, + "loss": 4.8459, + "step": 11240 + }, + { + "epoch": 0.06685341136168998, + "grad_norm": 1.6717231273651123, + "learning_rate": 4.9450733770449596e-05, + "loss": 5.1029, + "step": 11241 + }, + { + "epoch": 0.06685935864497097, + "grad_norm": 1.4853429794311523, + "learning_rate": 4.945063639144048e-05, + "loss": 5.2199, + "step": 11242 + }, + { + "epoch": 0.06686530592825198, + "grad_norm": 1.6102755069732666, + "learning_rate": 4.9450539003895894e-05, + "loss": 5.1191, + "step": 11243 + }, + { + "epoch": 0.06687125321153298, + "grad_norm": 1.6091139316558838, + "learning_rate": 4.9450441607815876e-05, + "loss": 5.2492, + "step": 11244 + }, + { + "epoch": 0.06687720049481397, + "grad_norm": 1.5190162658691406, + "learning_rate": 4.945034420320047e-05, + "loss": 5.1763, + "step": 11245 + }, + { + "epoch": 0.06688314777809497, + "grad_norm": 1.636243462562561, + "learning_rate": 4.94502467900497e-05, + "loss": 5.4906, + "step": 11246 + }, + { + "epoch": 0.06688909506137597, + "grad_norm": 1.5214428901672363, + "learning_rate": 4.9450149368363594e-05, + "loss": 5.3554, + "step": 11247 + }, + { + "epoch": 0.06689504234465696, + "grad_norm": 1.696183681488037, + "learning_rate": 4.9450051938142205e-05, + "loss": 5.3185, + "step": 11248 + }, + { + "epoch": 0.06690098962793796, + "grad_norm": 1.5344911813735962, + "learning_rate": 4.944995449938555e-05, + "loss": 5.345, + "step": 11249 + }, + { + "epoch": 0.06690693691121896, + "grad_norm": 1.598035454750061, + "learning_rate": 4.944985705209366e-05, + "loss": 5.2271, + "step": 11250 + }, + { + "epoch": 0.06691288419449995, + "grad_norm": 1.501841425895691, + "learning_rate": 4.944975959626659e-05, + "loss": 5.1807, + "step": 11251 + }, + { + "epoch": 0.06691883147778095, + "grad_norm": 1.3818657398223877, + "learning_rate": 4.944966213190436e-05, + "loss": 5.2953, + "step": 11252 + }, + { + "epoch": 0.06692477876106195, + "grad_norm": 1.5480642318725586, + "learning_rate": 4.9449564659007e-05, + "loss": 5.3048, + "step": 11253 + }, + { + "epoch": 0.06693072604434294, + "grad_norm": 1.5553090572357178, + "learning_rate": 4.9449467177574546e-05, + "loss": 5.1365, + "step": 11254 + }, + { + "epoch": 0.06693667332762394, + "grad_norm": 1.581534743309021, + "learning_rate": 4.944936968760705e-05, + "loss": 5.1498, + "step": 11255 + }, + { + "epoch": 0.06694262061090493, + "grad_norm": 1.8294548988342285, + "learning_rate": 4.944927218910452e-05, + "loss": 5.1331, + "step": 11256 + }, + { + "epoch": 0.06694856789418593, + "grad_norm": 1.3404508829116821, + "learning_rate": 4.944917468206701e-05, + "loss": 5.5092, + "step": 11257 + }, + { + "epoch": 0.06695451517746694, + "grad_norm": 1.5146483182907104, + "learning_rate": 4.944907716649454e-05, + "loss": 5.2797, + "step": 11258 + }, + { + "epoch": 0.06696046246074792, + "grad_norm": 1.571393609046936, + "learning_rate": 4.944897964238715e-05, + "loss": 5.4528, + "step": 11259 + }, + { + "epoch": 0.06696640974402893, + "grad_norm": 1.640459656715393, + "learning_rate": 4.944888210974487e-05, + "loss": 5.1032, + "step": 11260 + }, + { + "epoch": 0.06697235702730993, + "grad_norm": 1.5397419929504395, + "learning_rate": 4.944878456856774e-05, + "loss": 5.2333, + "step": 11261 + }, + { + "epoch": 0.06697830431059092, + "grad_norm": 1.4423824548721313, + "learning_rate": 4.94486870188558e-05, + "loss": 5.1765, + "step": 11262 + }, + { + "epoch": 0.06698425159387192, + "grad_norm": 1.366347074508667, + "learning_rate": 4.9448589460609066e-05, + "loss": 5.2257, + "step": 11263 + }, + { + "epoch": 0.06699019887715292, + "grad_norm": 1.370089054107666, + "learning_rate": 4.944849189382759e-05, + "loss": 5.4681, + "step": 11264 + }, + { + "epoch": 0.06699614616043391, + "grad_norm": 1.3014042377471924, + "learning_rate": 4.9448394318511394e-05, + "loss": 5.3434, + "step": 11265 + }, + { + "epoch": 0.06700209344371491, + "grad_norm": 1.4719784259796143, + "learning_rate": 4.9448296734660516e-05, + "loss": 5.3064, + "step": 11266 + }, + { + "epoch": 0.06700804072699591, + "grad_norm": 1.6640921831130981, + "learning_rate": 4.944819914227499e-05, + "loss": 5.2896, + "step": 11267 + }, + { + "epoch": 0.0670139880102769, + "grad_norm": 1.4969593286514282, + "learning_rate": 4.9448101541354845e-05, + "loss": 5.1413, + "step": 11268 + }, + { + "epoch": 0.0670199352935579, + "grad_norm": 1.4021313190460205, + "learning_rate": 4.9448003931900126e-05, + "loss": 5.2609, + "step": 11269 + }, + { + "epoch": 0.0670258825768389, + "grad_norm": 1.6506398916244507, + "learning_rate": 4.9447906313910865e-05, + "loss": 5.3365, + "step": 11270 + }, + { + "epoch": 0.0670318298601199, + "grad_norm": 1.6469614505767822, + "learning_rate": 4.9447808687387084e-05, + "loss": 5.0384, + "step": 11271 + }, + { + "epoch": 0.0670377771434009, + "grad_norm": 1.5047974586486816, + "learning_rate": 4.944771105232883e-05, + "loss": 5.3565, + "step": 11272 + }, + { + "epoch": 0.0670437244266819, + "grad_norm": 1.4467194080352783, + "learning_rate": 4.9447613408736135e-05, + "loss": 5.5576, + "step": 11273 + }, + { + "epoch": 0.06704967170996289, + "grad_norm": 1.4636478424072266, + "learning_rate": 4.9447515756609034e-05, + "loss": 5.6407, + "step": 11274 + }, + { + "epoch": 0.06705561899324389, + "grad_norm": 1.373046875, + "learning_rate": 4.944741809594755e-05, + "loss": 5.4286, + "step": 11275 + }, + { + "epoch": 0.06706156627652489, + "grad_norm": 1.5114089250564575, + "learning_rate": 4.944732042675172e-05, + "loss": 5.6425, + "step": 11276 + }, + { + "epoch": 0.06706751355980588, + "grad_norm": 1.8263514041900635, + "learning_rate": 4.9447222749021596e-05, + "loss": 5.2469, + "step": 11277 + }, + { + "epoch": 0.06707346084308688, + "grad_norm": 1.780553936958313, + "learning_rate": 4.944712506275719e-05, + "loss": 5.3306, + "step": 11278 + }, + { + "epoch": 0.06707940812636788, + "grad_norm": 1.6208360195159912, + "learning_rate": 4.9447027367958556e-05, + "loss": 5.5365, + "step": 11279 + }, + { + "epoch": 0.06708535540964887, + "grad_norm": 1.336965560913086, + "learning_rate": 4.9446929664625705e-05, + "loss": 5.2694, + "step": 11280 + }, + { + "epoch": 0.06709130269292987, + "grad_norm": 1.6100155115127563, + "learning_rate": 4.9446831952758685e-05, + "loss": 5.5489, + "step": 11281 + }, + { + "epoch": 0.06709724997621087, + "grad_norm": 1.8020440340042114, + "learning_rate": 4.944673423235753e-05, + "loss": 5.3396, + "step": 11282 + }, + { + "epoch": 0.06710319725949186, + "grad_norm": 1.5315353870391846, + "learning_rate": 4.9446636503422276e-05, + "loss": 5.3687, + "step": 11283 + }, + { + "epoch": 0.06710914454277286, + "grad_norm": 2.2560019493103027, + "learning_rate": 4.9446538765952953e-05, + "loss": 5.4584, + "step": 11284 + }, + { + "epoch": 0.06711509182605385, + "grad_norm": 1.4653301239013672, + "learning_rate": 4.94464410199496e-05, + "loss": 5.3438, + "step": 11285 + }, + { + "epoch": 0.06712103910933485, + "grad_norm": 1.5931557416915894, + "learning_rate": 4.9446343265412243e-05, + "loss": 5.5802, + "step": 11286 + }, + { + "epoch": 0.06712698639261586, + "grad_norm": 1.5282461643218994, + "learning_rate": 4.944624550234092e-05, + "loss": 5.5634, + "step": 11287 + }, + { + "epoch": 0.06713293367589684, + "grad_norm": 1.7275618314743042, + "learning_rate": 4.944614773073566e-05, + "loss": 5.3797, + "step": 11288 + }, + { + "epoch": 0.06713888095917785, + "grad_norm": 1.6453620195388794, + "learning_rate": 4.944604995059651e-05, + "loss": 5.4693, + "step": 11289 + }, + { + "epoch": 0.06714482824245885, + "grad_norm": 1.870483636856079, + "learning_rate": 4.944595216192349e-05, + "loss": 5.4693, + "step": 11290 + }, + { + "epoch": 0.06715077552573984, + "grad_norm": 1.5478577613830566, + "learning_rate": 4.944585436471665e-05, + "loss": 5.694, + "step": 11291 + }, + { + "epoch": 0.06715672280902084, + "grad_norm": 1.9456945657730103, + "learning_rate": 4.944575655897601e-05, + "loss": 5.6687, + "step": 11292 + }, + { + "epoch": 0.06716267009230184, + "grad_norm": 1.808176875114441, + "learning_rate": 4.944565874470161e-05, + "loss": 5.7444, + "step": 11293 + }, + { + "epoch": 0.06716861737558283, + "grad_norm": 1.8066149950027466, + "learning_rate": 4.944556092189347e-05, + "loss": 5.5264, + "step": 11294 + }, + { + "epoch": 0.06717456465886383, + "grad_norm": 2.2896971702575684, + "learning_rate": 4.9445463090551656e-05, + "loss": 4.7624, + "step": 11295 + }, + { + "epoch": 0.06718051194214483, + "grad_norm": 1.7178759574890137, + "learning_rate": 4.9445365250676165e-05, + "loss": 5.79, + "step": 11296 + }, + { + "epoch": 0.06718645922542582, + "grad_norm": 1.8841933012008667, + "learning_rate": 4.944526740226707e-05, + "loss": 5.9792, + "step": 11297 + }, + { + "epoch": 0.06719240650870682, + "grad_norm": 1.8618090152740479, + "learning_rate": 4.944516954532437e-05, + "loss": 5.957, + "step": 11298 + }, + { + "epoch": 0.06719835379198782, + "grad_norm": 1.7545913457870483, + "learning_rate": 4.944507167984812e-05, + "loss": 5.4484, + "step": 11299 + }, + { + "epoch": 0.06720430107526881, + "grad_norm": 2.023158073425293, + "learning_rate": 4.9444973805838345e-05, + "loss": 5.0873, + "step": 11300 + }, + { + "epoch": 0.06721024835854982, + "grad_norm": 1.893340826034546, + "learning_rate": 4.944487592329509e-05, + "loss": 5.042, + "step": 11301 + }, + { + "epoch": 0.06721619564183082, + "grad_norm": 1.981518268585205, + "learning_rate": 4.944477803221837e-05, + "loss": 5.1463, + "step": 11302 + }, + { + "epoch": 0.0672221429251118, + "grad_norm": 2.47416090965271, + "learning_rate": 4.9444680132608236e-05, + "loss": 5.2885, + "step": 11303 + }, + { + "epoch": 0.06722809020839281, + "grad_norm": 2.3973519802093506, + "learning_rate": 4.944458222446472e-05, + "loss": 5.3321, + "step": 11304 + }, + { + "epoch": 0.06723403749167381, + "grad_norm": 1.9117941856384277, + "learning_rate": 4.9444484307787846e-05, + "loss": 5.2159, + "step": 11305 + }, + { + "epoch": 0.0672399847749548, + "grad_norm": 1.8732513189315796, + "learning_rate": 4.9444386382577656e-05, + "loss": 5.222, + "step": 11306 + }, + { + "epoch": 0.0672459320582358, + "grad_norm": 1.9202747344970703, + "learning_rate": 4.9444288448834184e-05, + "loss": 5.5766, + "step": 11307 + }, + { + "epoch": 0.0672518793415168, + "grad_norm": 1.8956191539764404, + "learning_rate": 4.944419050655747e-05, + "loss": 5.7129, + "step": 11308 + }, + { + "epoch": 0.06725782662479779, + "grad_norm": 2.7075235843658447, + "learning_rate": 4.9444092555747534e-05, + "loss": 5.2199, + "step": 11309 + }, + { + "epoch": 0.06726377390807879, + "grad_norm": 2.396125078201294, + "learning_rate": 4.944399459640442e-05, + "loss": 5.3548, + "step": 11310 + }, + { + "epoch": 0.0672697211913598, + "grad_norm": 2.6050171852111816, + "learning_rate": 4.9443896628528166e-05, + "loss": 5.616, + "step": 11311 + }, + { + "epoch": 0.06727566847464078, + "grad_norm": 2.512720823287964, + "learning_rate": 4.94437986521188e-05, + "loss": 5.3699, + "step": 11312 + }, + { + "epoch": 0.06728161575792178, + "grad_norm": 2.509716510772705, + "learning_rate": 4.9443700667176345e-05, + "loss": 5.431, + "step": 11313 + }, + { + "epoch": 0.06728756304120277, + "grad_norm": 2.2237601280212402, + "learning_rate": 4.944360267370085e-05, + "loss": 5.3985, + "step": 11314 + }, + { + "epoch": 0.06729351032448377, + "grad_norm": 1.982344627380371, + "learning_rate": 4.9443504671692356e-05, + "loss": 5.4849, + "step": 11315 + }, + { + "epoch": 0.06729945760776478, + "grad_norm": 2.1006124019622803, + "learning_rate": 4.9443406661150874e-05, + "loss": 5.227, + "step": 11316 + }, + { + "epoch": 0.06730540489104576, + "grad_norm": 2.0929529666900635, + "learning_rate": 4.9443308642076456e-05, + "loss": 5.524, + "step": 11317 + }, + { + "epoch": 0.06731135217432677, + "grad_norm": 1.9268262386322021, + "learning_rate": 4.944321061446914e-05, + "loss": 6.0622, + "step": 11318 + }, + { + "epoch": 0.06731729945760777, + "grad_norm": 2.257065773010254, + "learning_rate": 4.944311257832894e-05, + "loss": 4.9455, + "step": 11319 + }, + { + "epoch": 0.06732324674088876, + "grad_norm": 2.056244373321533, + "learning_rate": 4.944301453365591e-05, + "loss": 5.4157, + "step": 11320 + }, + { + "epoch": 0.06732919402416976, + "grad_norm": 2.1667540073394775, + "learning_rate": 4.944291648045007e-05, + "loss": 5.5767, + "step": 11321 + }, + { + "epoch": 0.06733514130745076, + "grad_norm": 1.9596853256225586, + "learning_rate": 4.944281841871146e-05, + "loss": 5.6532, + "step": 11322 + }, + { + "epoch": 0.06734108859073175, + "grad_norm": 1.7050867080688477, + "learning_rate": 4.9442720348440116e-05, + "loss": 5.8881, + "step": 11323 + }, + { + "epoch": 0.06734703587401275, + "grad_norm": 1.8681753873825073, + "learning_rate": 4.944262226963607e-05, + "loss": 5.9369, + "step": 11324 + }, + { + "epoch": 0.06735298315729375, + "grad_norm": 1.9432111978530884, + "learning_rate": 4.9442524182299365e-05, + "loss": 5.9163, + "step": 11325 + }, + { + "epoch": 0.06735893044057474, + "grad_norm": 1.8099175691604614, + "learning_rate": 4.9442426086430026e-05, + "loss": 5.809, + "step": 11326 + }, + { + "epoch": 0.06736487772385574, + "grad_norm": 1.6179800033569336, + "learning_rate": 4.944232798202808e-05, + "loss": 5.5609, + "step": 11327 + }, + { + "epoch": 0.06737082500713674, + "grad_norm": 2.303189992904663, + "learning_rate": 4.944222986909357e-05, + "loss": 5.9291, + "step": 11328 + }, + { + "epoch": 0.06737677229041773, + "grad_norm": 1.913813829421997, + "learning_rate": 4.944213174762654e-05, + "loss": 5.8672, + "step": 11329 + }, + { + "epoch": 0.06738271957369873, + "grad_norm": 2.1856813430786133, + "learning_rate": 4.944203361762701e-05, + "loss": 5.2632, + "step": 11330 + }, + { + "epoch": 0.06738866685697974, + "grad_norm": 2.019679069519043, + "learning_rate": 4.9441935479095016e-05, + "loss": 5.3707, + "step": 11331 + }, + { + "epoch": 0.06739461414026073, + "grad_norm": 1.8531097173690796, + "learning_rate": 4.944183733203059e-05, + "loss": 5.6689, + "step": 11332 + }, + { + "epoch": 0.06740056142354173, + "grad_norm": 2.068208694458008, + "learning_rate": 4.944173917643378e-05, + "loss": 5.6111, + "step": 11333 + }, + { + "epoch": 0.06740650870682273, + "grad_norm": 1.8021270036697388, + "learning_rate": 4.944164101230461e-05, + "loss": 6.0865, + "step": 11334 + }, + { + "epoch": 0.06741245599010372, + "grad_norm": 1.9051427841186523, + "learning_rate": 4.944154283964312e-05, + "loss": 5.5862, + "step": 11335 + }, + { + "epoch": 0.06741840327338472, + "grad_norm": 1.718483805656433, + "learning_rate": 4.944144465844933e-05, + "loss": 5.2505, + "step": 11336 + }, + { + "epoch": 0.06742435055666572, + "grad_norm": 2.205167531967163, + "learning_rate": 4.944134646872329e-05, + "loss": 5.3181, + "step": 11337 + }, + { + "epoch": 0.06743029783994671, + "grad_norm": 1.550945520401001, + "learning_rate": 4.944124827046502e-05, + "loss": 5.4129, + "step": 11338 + }, + { + "epoch": 0.06743624512322771, + "grad_norm": 2.08793044090271, + "learning_rate": 4.944115006367458e-05, + "loss": 5.9705, + "step": 11339 + }, + { + "epoch": 0.06744219240650871, + "grad_norm": 1.8955761194229126, + "learning_rate": 4.944105184835197e-05, + "loss": 4.9629, + "step": 11340 + }, + { + "epoch": 0.0674481396897897, + "grad_norm": 1.7287909984588623, + "learning_rate": 4.944095362449724e-05, + "loss": 5.1097, + "step": 11341 + }, + { + "epoch": 0.0674540869730707, + "grad_norm": 1.8718771934509277, + "learning_rate": 4.944085539211044e-05, + "loss": 5.6443, + "step": 11342 + }, + { + "epoch": 0.06746003425635169, + "grad_norm": 2.220863103866577, + "learning_rate": 4.9440757151191585e-05, + "loss": 5.5042, + "step": 11343 + }, + { + "epoch": 0.0674659815396327, + "grad_norm": 1.9501415491104126, + "learning_rate": 4.944065890174071e-05, + "loss": 5.6788, + "step": 11344 + }, + { + "epoch": 0.0674719288229137, + "grad_norm": 1.8566590547561646, + "learning_rate": 4.944056064375786e-05, + "loss": 5.6531, + "step": 11345 + }, + { + "epoch": 0.06747787610619468, + "grad_norm": 1.895409345626831, + "learning_rate": 4.9440462377243055e-05, + "loss": 5.6441, + "step": 11346 + }, + { + "epoch": 0.06748382338947569, + "grad_norm": 2.1746973991394043, + "learning_rate": 4.9440364102196345e-05, + "loss": 5.8624, + "step": 11347 + }, + { + "epoch": 0.06748977067275669, + "grad_norm": 1.9661751985549927, + "learning_rate": 4.944026581861775e-05, + "loss": 5.6075, + "step": 11348 + }, + { + "epoch": 0.06749571795603768, + "grad_norm": 1.8591458797454834, + "learning_rate": 4.944016752650731e-05, + "loss": 5.9115, + "step": 11349 + }, + { + "epoch": 0.06750166523931868, + "grad_norm": 1.6491025686264038, + "learning_rate": 4.9440069225865065e-05, + "loss": 6.0548, + "step": 11350 + }, + { + "epoch": 0.06750761252259968, + "grad_norm": 1.857928991317749, + "learning_rate": 4.9439970916691045e-05, + "loss": 5.4326, + "step": 11351 + }, + { + "epoch": 0.06751355980588067, + "grad_norm": 1.8189151287078857, + "learning_rate": 4.943987259898528e-05, + "loss": 5.7744, + "step": 11352 + }, + { + "epoch": 0.06751950708916167, + "grad_norm": 1.7486300468444824, + "learning_rate": 4.943977427274781e-05, + "loss": 5.7128, + "step": 11353 + }, + { + "epoch": 0.06752545437244267, + "grad_norm": 1.7272138595581055, + "learning_rate": 4.943967593797866e-05, + "loss": 5.9922, + "step": 11354 + }, + { + "epoch": 0.06753140165572366, + "grad_norm": 1.740860939025879, + "learning_rate": 4.9439577594677875e-05, + "loss": 5.8486, + "step": 11355 + }, + { + "epoch": 0.06753734893900466, + "grad_norm": 1.9054155349731445, + "learning_rate": 4.9439479242845494e-05, + "loss": 5.4694, + "step": 11356 + }, + { + "epoch": 0.06754329622228566, + "grad_norm": 1.9783501625061035, + "learning_rate": 4.943938088248154e-05, + "loss": 5.5185, + "step": 11357 + }, + { + "epoch": 0.06754924350556665, + "grad_norm": 1.8267238140106201, + "learning_rate": 4.943928251358605e-05, + "loss": 5.7589, + "step": 11358 + }, + { + "epoch": 0.06755519078884765, + "grad_norm": 1.6957738399505615, + "learning_rate": 4.943918413615906e-05, + "loss": 5.5716, + "step": 11359 + }, + { + "epoch": 0.06756113807212866, + "grad_norm": 2.0818982124328613, + "learning_rate": 4.94390857502006e-05, + "loss": 5.8969, + "step": 11360 + }, + { + "epoch": 0.06756708535540965, + "grad_norm": 1.8012073040008545, + "learning_rate": 4.9438987355710703e-05, + "loss": 6.1053, + "step": 11361 + }, + { + "epoch": 0.06757303263869065, + "grad_norm": 2.2209696769714355, + "learning_rate": 4.943888895268942e-05, + "loss": 5.9714, + "step": 11362 + }, + { + "epoch": 0.06757897992197165, + "grad_norm": 1.8006336688995361, + "learning_rate": 4.943879054113676e-05, + "loss": 5.6427, + "step": 11363 + }, + { + "epoch": 0.06758492720525264, + "grad_norm": 1.7628017663955688, + "learning_rate": 4.9438692121052775e-05, + "loss": 5.8639, + "step": 11364 + }, + { + "epoch": 0.06759087448853364, + "grad_norm": 1.8574492931365967, + "learning_rate": 4.94385936924375e-05, + "loss": 5.892, + "step": 11365 + }, + { + "epoch": 0.06759682177181464, + "grad_norm": 1.7926831245422363, + "learning_rate": 4.9438495255290964e-05, + "loss": 5.9024, + "step": 11366 + }, + { + "epoch": 0.06760276905509563, + "grad_norm": 2.503370761871338, + "learning_rate": 4.94383968096132e-05, + "loss": 5.994, + "step": 11367 + }, + { + "epoch": 0.06760871633837663, + "grad_norm": 1.7123390436172485, + "learning_rate": 4.943829835540424e-05, + "loss": 5.8052, + "step": 11368 + }, + { + "epoch": 0.06761466362165763, + "grad_norm": 2.0890092849731445, + "learning_rate": 4.943819989266413e-05, + "loss": 5.067, + "step": 11369 + }, + { + "epoch": 0.06762061090493862, + "grad_norm": 1.8000640869140625, + "learning_rate": 4.9438101421392894e-05, + "loss": 5.3562, + "step": 11370 + }, + { + "epoch": 0.06762655818821962, + "grad_norm": 2.254873514175415, + "learning_rate": 4.9438002941590564e-05, + "loss": 5.0557, + "step": 11371 + }, + { + "epoch": 0.06763250547150061, + "grad_norm": 1.8080449104309082, + "learning_rate": 4.943790445325719e-05, + "loss": 5.6702, + "step": 11372 + }, + { + "epoch": 0.06763845275478161, + "grad_norm": 2.0175933837890625, + "learning_rate": 4.943780595639279e-05, + "loss": 5.6227, + "step": 11373 + }, + { + "epoch": 0.06764440003806262, + "grad_norm": 1.9859650135040283, + "learning_rate": 4.943770745099741e-05, + "loss": 5.4437, + "step": 11374 + }, + { + "epoch": 0.0676503473213436, + "grad_norm": 1.975573182106018, + "learning_rate": 4.943760893707107e-05, + "loss": 5.3101, + "step": 11375 + }, + { + "epoch": 0.0676562946046246, + "grad_norm": 2.2590208053588867, + "learning_rate": 4.943751041461382e-05, + "loss": 5.2544, + "step": 11376 + }, + { + "epoch": 0.06766224188790561, + "grad_norm": 1.8615392446517944, + "learning_rate": 4.943741188362568e-05, + "loss": 5.5266, + "step": 11377 + }, + { + "epoch": 0.0676681891711866, + "grad_norm": 2.056810140609741, + "learning_rate": 4.943731334410669e-05, + "loss": 5.1994, + "step": 11378 + }, + { + "epoch": 0.0676741364544676, + "grad_norm": 2.0275685787200928, + "learning_rate": 4.94372147960569e-05, + "loss": 5.7385, + "step": 11379 + }, + { + "epoch": 0.0676800837377486, + "grad_norm": 2.082963466644287, + "learning_rate": 4.9437116239476325e-05, + "loss": 5.1531, + "step": 11380 + }, + { + "epoch": 0.06768603102102959, + "grad_norm": 2.176421642303467, + "learning_rate": 4.9437017674365004e-05, + "loss": 5.521, + "step": 11381 + }, + { + "epoch": 0.06769197830431059, + "grad_norm": 2.1424365043640137, + "learning_rate": 4.9436919100722964e-05, + "loss": 5.4543, + "step": 11382 + }, + { + "epoch": 0.06769792558759159, + "grad_norm": 2.07836651802063, + "learning_rate": 4.9436820518550266e-05, + "loss": 5.5166, + "step": 11383 + }, + { + "epoch": 0.06770387287087258, + "grad_norm": 1.9776746034622192, + "learning_rate": 4.9436721927846915e-05, + "loss": 5.4621, + "step": 11384 + }, + { + "epoch": 0.06770982015415358, + "grad_norm": 1.9985042810440063, + "learning_rate": 4.943662332861296e-05, + "loss": 5.3835, + "step": 11385 + }, + { + "epoch": 0.06771576743743458, + "grad_norm": 1.6877795457839966, + "learning_rate": 4.943652472084843e-05, + "loss": 5.185, + "step": 11386 + }, + { + "epoch": 0.06772171472071557, + "grad_norm": 1.8307565450668335, + "learning_rate": 4.943642610455336e-05, + "loss": 5.117, + "step": 11387 + }, + { + "epoch": 0.06772766200399657, + "grad_norm": 2.0381922721862793, + "learning_rate": 4.943632747972779e-05, + "loss": 5.6004, + "step": 11388 + }, + { + "epoch": 0.06773360928727758, + "grad_norm": 1.9554756879806519, + "learning_rate": 4.943622884637175e-05, + "loss": 5.9638, + "step": 11389 + }, + { + "epoch": 0.06773955657055857, + "grad_norm": 1.878861665725708, + "learning_rate": 4.9436130204485274e-05, + "loss": 5.7961, + "step": 11390 + }, + { + "epoch": 0.06774550385383957, + "grad_norm": 2.040012836456299, + "learning_rate": 4.94360315540684e-05, + "loss": 5.7175, + "step": 11391 + }, + { + "epoch": 0.06775145113712057, + "grad_norm": 2.262408494949341, + "learning_rate": 4.943593289512115e-05, + "loss": 4.8581, + "step": 11392 + }, + { + "epoch": 0.06775739842040156, + "grad_norm": 2.201751232147217, + "learning_rate": 4.943583422764358e-05, + "loss": 5.0647, + "step": 11393 + }, + { + "epoch": 0.06776334570368256, + "grad_norm": 1.9768764972686768, + "learning_rate": 4.943573555163571e-05, + "loss": 5.8836, + "step": 11394 + }, + { + "epoch": 0.06776929298696356, + "grad_norm": 2.1048574447631836, + "learning_rate": 4.9435636867097575e-05, + "loss": 5.9746, + "step": 11395 + }, + { + "epoch": 0.06777524027024455, + "grad_norm": 1.5297552347183228, + "learning_rate": 4.943553817402921e-05, + "loss": 4.912, + "step": 11396 + }, + { + "epoch": 0.06778118755352555, + "grad_norm": 1.5313429832458496, + "learning_rate": 4.943543947243066e-05, + "loss": 4.975, + "step": 11397 + }, + { + "epoch": 0.06778713483680655, + "grad_norm": 1.8882219791412354, + "learning_rate": 4.943534076230194e-05, + "loss": 5.2183, + "step": 11398 + }, + { + "epoch": 0.06779308212008754, + "grad_norm": 1.698997139930725, + "learning_rate": 4.9435242043643094e-05, + "loss": 5.8019, + "step": 11399 + }, + { + "epoch": 0.06779902940336854, + "grad_norm": 1.775140404701233, + "learning_rate": 4.943514331645417e-05, + "loss": 5.7451, + "step": 11400 + }, + { + "epoch": 0.06780497668664953, + "grad_norm": 2.273650884628296, + "learning_rate": 4.943504458073518e-05, + "loss": 4.7727, + "step": 11401 + }, + { + "epoch": 0.06781092396993053, + "grad_norm": 2.166961908340454, + "learning_rate": 4.943494583648617e-05, + "loss": 5.4537, + "step": 11402 + }, + { + "epoch": 0.06781687125321154, + "grad_norm": 2.147876024246216, + "learning_rate": 4.943484708370717e-05, + "loss": 5.2635, + "step": 11403 + }, + { + "epoch": 0.06782281853649252, + "grad_norm": 1.968397855758667, + "learning_rate": 4.943474832239822e-05, + "loss": 5.6591, + "step": 11404 + }, + { + "epoch": 0.06782876581977353, + "grad_norm": 1.8838316202163696, + "learning_rate": 4.943464955255935e-05, + "loss": 5.5462, + "step": 11405 + }, + { + "epoch": 0.06783471310305453, + "grad_norm": 2.4205315113067627, + "learning_rate": 4.94345507741906e-05, + "loss": 4.859, + "step": 11406 + }, + { + "epoch": 0.06784066038633552, + "grad_norm": 2.1272950172424316, + "learning_rate": 4.9434451987292e-05, + "loss": 5.1791, + "step": 11407 + }, + { + "epoch": 0.06784660766961652, + "grad_norm": 2.345055341720581, + "learning_rate": 4.9434353191863595e-05, + "loss": 5.1616, + "step": 11408 + }, + { + "epoch": 0.06785255495289752, + "grad_norm": 2.3967537879943848, + "learning_rate": 4.9434254387905395e-05, + "loss": 5.1805, + "step": 11409 + }, + { + "epoch": 0.06785850223617851, + "grad_norm": 2.2108283042907715, + "learning_rate": 4.943415557541745e-05, + "loss": 5.381, + "step": 11410 + }, + { + "epoch": 0.06786444951945951, + "grad_norm": 2.178776979446411, + "learning_rate": 4.94340567543998e-05, + "loss": 5.4016, + "step": 11411 + }, + { + "epoch": 0.06787039680274051, + "grad_norm": 2.003169059753418, + "learning_rate": 4.943395792485247e-05, + "loss": 5.5632, + "step": 11412 + }, + { + "epoch": 0.0678763440860215, + "grad_norm": 2.0337789058685303, + "learning_rate": 4.9433859086775506e-05, + "loss": 5.4476, + "step": 11413 + }, + { + "epoch": 0.0678822913693025, + "grad_norm": 1.784868836402893, + "learning_rate": 4.943376024016892e-05, + "loss": 5.3578, + "step": 11414 + }, + { + "epoch": 0.0678882386525835, + "grad_norm": 1.7282286882400513, + "learning_rate": 4.943366138503277e-05, + "loss": 5.6202, + "step": 11415 + }, + { + "epoch": 0.06789418593586449, + "grad_norm": 1.9716618061065674, + "learning_rate": 4.943356252136707e-05, + "loss": 4.9861, + "step": 11416 + }, + { + "epoch": 0.0679001332191455, + "grad_norm": 2.399317502975464, + "learning_rate": 4.943346364917188e-05, + "loss": 4.4494, + "step": 11417 + }, + { + "epoch": 0.0679060805024265, + "grad_norm": 2.142995834350586, + "learning_rate": 4.943336476844722e-05, + "loss": 4.5989, + "step": 11418 + }, + { + "epoch": 0.06791202778570748, + "grad_norm": 1.9394404888153076, + "learning_rate": 4.943326587919311e-05, + "loss": 4.4944, + "step": 11419 + }, + { + "epoch": 0.06791797506898849, + "grad_norm": 2.41937518119812, + "learning_rate": 4.9433166981409615e-05, + "loss": 5.1687, + "step": 11420 + }, + { + "epoch": 0.06792392235226949, + "grad_norm": 2.1686136722564697, + "learning_rate": 4.943306807509675e-05, + "loss": 6.2976, + "step": 11421 + }, + { + "epoch": 0.06792986963555048, + "grad_norm": 1.9649391174316406, + "learning_rate": 4.943296916025455e-05, + "loss": 6.0242, + "step": 11422 + }, + { + "epoch": 0.06793581691883148, + "grad_norm": 1.9251484870910645, + "learning_rate": 4.943287023688305e-05, + "loss": 5.9777, + "step": 11423 + }, + { + "epoch": 0.06794176420211248, + "grad_norm": 1.838348388671875, + "learning_rate": 4.9432771304982296e-05, + "loss": 5.8669, + "step": 11424 + }, + { + "epoch": 0.06794771148539347, + "grad_norm": 2.5417487621307373, + "learning_rate": 4.94326723645523e-05, + "loss": 5.5131, + "step": 11425 + }, + { + "epoch": 0.06795365876867447, + "grad_norm": 2.2175936698913574, + "learning_rate": 4.943257341559312e-05, + "loss": 5.4657, + "step": 11426 + }, + { + "epoch": 0.06795960605195547, + "grad_norm": 2.4474873542785645, + "learning_rate": 4.943247445810478e-05, + "loss": 5.2401, + "step": 11427 + }, + { + "epoch": 0.06796555333523646, + "grad_norm": 2.176483392715454, + "learning_rate": 4.9432375492087324e-05, + "loss": 5.7295, + "step": 11428 + }, + { + "epoch": 0.06797150061851746, + "grad_norm": 1.9311527013778687, + "learning_rate": 4.943227651754077e-05, + "loss": 5.8135, + "step": 11429 + }, + { + "epoch": 0.06797744790179845, + "grad_norm": 2.2462544441223145, + "learning_rate": 4.943217753446516e-05, + "loss": 6.0761, + "step": 11430 + }, + { + "epoch": 0.06798339518507945, + "grad_norm": 2.3158276081085205, + "learning_rate": 4.943207854286053e-05, + "loss": 6.0223, + "step": 11431 + }, + { + "epoch": 0.06798934246836046, + "grad_norm": 1.6222623586654663, + "learning_rate": 4.9431979542726914e-05, + "loss": 5.9417, + "step": 11432 + }, + { + "epoch": 0.06799528975164144, + "grad_norm": 1.9809083938598633, + "learning_rate": 4.9431880534064345e-05, + "loss": 5.7476, + "step": 11433 + }, + { + "epoch": 0.06800123703492245, + "grad_norm": 1.9575468301773071, + "learning_rate": 4.9431781516872865e-05, + "loss": 5.6169, + "step": 11434 + }, + { + "epoch": 0.06800718431820345, + "grad_norm": 2.1103882789611816, + "learning_rate": 4.9431682491152495e-05, + "loss": 5.5119, + "step": 11435 + }, + { + "epoch": 0.06801313160148444, + "grad_norm": 2.280287265777588, + "learning_rate": 4.943158345690328e-05, + "loss": 5.2622, + "step": 11436 + }, + { + "epoch": 0.06801907888476544, + "grad_norm": 2.582737684249878, + "learning_rate": 4.943148441412525e-05, + "loss": 5.2644, + "step": 11437 + }, + { + "epoch": 0.06802502616804644, + "grad_norm": 2.1919124126434326, + "learning_rate": 4.9431385362818446e-05, + "loss": 5.0717, + "step": 11438 + }, + { + "epoch": 0.06803097345132743, + "grad_norm": 2.3036141395568848, + "learning_rate": 4.9431286302982896e-05, + "loss": 5.0049, + "step": 11439 + }, + { + "epoch": 0.06803692073460843, + "grad_norm": 2.3675789833068848, + "learning_rate": 4.943118723461864e-05, + "loss": 5.4686, + "step": 11440 + }, + { + "epoch": 0.06804286801788943, + "grad_norm": 2.8305327892303467, + "learning_rate": 4.94310881577257e-05, + "loss": 5.3409, + "step": 11441 + }, + { + "epoch": 0.06804881530117042, + "grad_norm": 1.562173843383789, + "learning_rate": 4.9430989072304126e-05, + "loss": 5.6801, + "step": 11442 + }, + { + "epoch": 0.06805476258445142, + "grad_norm": 1.9728971719741821, + "learning_rate": 4.9430889978353945e-05, + "loss": 5.4252, + "step": 11443 + }, + { + "epoch": 0.06806070986773242, + "grad_norm": 2.054025173187256, + "learning_rate": 4.9430790875875185e-05, + "loss": 5.1155, + "step": 11444 + }, + { + "epoch": 0.06806665715101341, + "grad_norm": 1.8511056900024414, + "learning_rate": 4.9430691764867895e-05, + "loss": 5.102, + "step": 11445 + }, + { + "epoch": 0.06807260443429441, + "grad_norm": 1.9024226665496826, + "learning_rate": 4.943059264533211e-05, + "loss": 5.0761, + "step": 11446 + }, + { + "epoch": 0.06807855171757542, + "grad_norm": 2.4767966270446777, + "learning_rate": 4.9430493517267843e-05, + "loss": 4.9809, + "step": 11447 + }, + { + "epoch": 0.0680844990008564, + "grad_norm": 2.393517255783081, + "learning_rate": 4.943039438067515e-05, + "loss": 5.1191, + "step": 11448 + }, + { + "epoch": 0.06809044628413741, + "grad_norm": 1.9510548114776611, + "learning_rate": 4.9430295235554055e-05, + "loss": 5.7117, + "step": 11449 + }, + { + "epoch": 0.06809639356741841, + "grad_norm": 2.1002418994903564, + "learning_rate": 4.9430196081904605e-05, + "loss": 5.7003, + "step": 11450 + }, + { + "epoch": 0.0681023408506994, + "grad_norm": 2.5328590869903564, + "learning_rate": 4.943009691972682e-05, + "loss": 6.1835, + "step": 11451 + }, + { + "epoch": 0.0681082881339804, + "grad_norm": 1.9173791408538818, + "learning_rate": 4.9429997749020743e-05, + "loss": 5.9596, + "step": 11452 + }, + { + "epoch": 0.0681142354172614, + "grad_norm": 2.0781052112579346, + "learning_rate": 4.9429898569786406e-05, + "loss": 5.7335, + "step": 11453 + }, + { + "epoch": 0.06812018270054239, + "grad_norm": 2.4210550785064697, + "learning_rate": 4.942979938202384e-05, + "loss": 4.9888, + "step": 11454 + }, + { + "epoch": 0.06812612998382339, + "grad_norm": 1.8438634872436523, + "learning_rate": 4.942970018573309e-05, + "loss": 5.8027, + "step": 11455 + }, + { + "epoch": 0.0681320772671044, + "grad_norm": 2.122882843017578, + "learning_rate": 4.942960098091418e-05, + "loss": 5.8569, + "step": 11456 + }, + { + "epoch": 0.06813802455038538, + "grad_norm": 1.6002168655395508, + "learning_rate": 4.942950176756715e-05, + "loss": 5.7362, + "step": 11457 + }, + { + "epoch": 0.06814397183366638, + "grad_norm": 1.8086539506912231, + "learning_rate": 4.942940254569203e-05, + "loss": 5.7537, + "step": 11458 + }, + { + "epoch": 0.06814991911694737, + "grad_norm": 2.0441513061523438, + "learning_rate": 4.942930331528886e-05, + "loss": 5.8255, + "step": 11459 + }, + { + "epoch": 0.06815586640022837, + "grad_norm": 1.8272675275802612, + "learning_rate": 4.942920407635767e-05, + "loss": 5.6915, + "step": 11460 + }, + { + "epoch": 0.06816181368350938, + "grad_norm": 3.3902077674865723, + "learning_rate": 4.94291048288985e-05, + "loss": 4.719, + "step": 11461 + }, + { + "epoch": 0.06816776096679036, + "grad_norm": 3.1770875453948975, + "learning_rate": 4.9429005572911385e-05, + "loss": 4.401, + "step": 11462 + }, + { + "epoch": 0.06817370825007137, + "grad_norm": 1.9011846780776978, + "learning_rate": 4.9428906308396355e-05, + "loss": 5.4768, + "step": 11463 + }, + { + "epoch": 0.06817965553335237, + "grad_norm": 1.7608321905136108, + "learning_rate": 4.9428807035353443e-05, + "loss": 5.5755, + "step": 11464 + }, + { + "epoch": 0.06818560281663336, + "grad_norm": 1.8250397443771362, + "learning_rate": 4.9428707753782686e-05, + "loss": 5.7804, + "step": 11465 + }, + { + "epoch": 0.06819155009991436, + "grad_norm": 2.566436290740967, + "learning_rate": 4.942860846368412e-05, + "loss": 5.0442, + "step": 11466 + }, + { + "epoch": 0.06819749738319536, + "grad_norm": 3.336547613143921, + "learning_rate": 4.942850916505779e-05, + "loss": 4.5331, + "step": 11467 + }, + { + "epoch": 0.06820344466647635, + "grad_norm": 2.6383185386657715, + "learning_rate": 4.9428409857903714e-05, + "loss": 4.5301, + "step": 11468 + }, + { + "epoch": 0.06820939194975735, + "grad_norm": 2.3853955268859863, + "learning_rate": 4.9428310542221924e-05, + "loss": 4.3398, + "step": 11469 + }, + { + "epoch": 0.06821533923303835, + "grad_norm": 2.3954038619995117, + "learning_rate": 4.942821121801246e-05, + "loss": 5.0841, + "step": 11470 + }, + { + "epoch": 0.06822128651631934, + "grad_norm": 2.922161340713501, + "learning_rate": 4.942811188527537e-05, + "loss": 4.5573, + "step": 11471 + }, + { + "epoch": 0.06822723379960034, + "grad_norm": 2.7202560901641846, + "learning_rate": 4.942801254401068e-05, + "loss": 4.5047, + "step": 11472 + }, + { + "epoch": 0.06823318108288134, + "grad_norm": 2.2289440631866455, + "learning_rate": 4.9427913194218424e-05, + "loss": 5.4686, + "step": 11473 + }, + { + "epoch": 0.06823912836616233, + "grad_norm": 2.2033851146698, + "learning_rate": 4.9427813835898635e-05, + "loss": 5.3554, + "step": 11474 + }, + { + "epoch": 0.06824507564944333, + "grad_norm": 2.171147346496582, + "learning_rate": 4.9427714469051345e-05, + "loss": 5.504, + "step": 11475 + }, + { + "epoch": 0.06825102293272434, + "grad_norm": 2.0110602378845215, + "learning_rate": 4.9427615093676594e-05, + "loss": 5.6126, + "step": 11476 + }, + { + "epoch": 0.06825697021600532, + "grad_norm": 2.08642840385437, + "learning_rate": 4.942751570977441e-05, + "loss": 6.0948, + "step": 11477 + }, + { + "epoch": 0.06826291749928633, + "grad_norm": 2.12245774269104, + "learning_rate": 4.9427416317344835e-05, + "loss": 5.2845, + "step": 11478 + }, + { + "epoch": 0.06826886478256733, + "grad_norm": 1.9155166149139404, + "learning_rate": 4.942731691638791e-05, + "loss": 5.4674, + "step": 11479 + }, + { + "epoch": 0.06827481206584832, + "grad_norm": 2.3452367782592773, + "learning_rate": 4.942721750690365e-05, + "loss": 5.2368, + "step": 11480 + }, + { + "epoch": 0.06828075934912932, + "grad_norm": 2.1282498836517334, + "learning_rate": 4.9427118088892105e-05, + "loss": 5.348, + "step": 11481 + }, + { + "epoch": 0.06828670663241032, + "grad_norm": 1.9251933097839355, + "learning_rate": 4.9427018662353306e-05, + "loss": 5.2588, + "step": 11482 + }, + { + "epoch": 0.06829265391569131, + "grad_norm": 1.9481078386306763, + "learning_rate": 4.942691922728728e-05, + "loss": 5.2775, + "step": 11483 + }, + { + "epoch": 0.06829860119897231, + "grad_norm": 1.9506112337112427, + "learning_rate": 4.942681978369408e-05, + "loss": 5.6865, + "step": 11484 + }, + { + "epoch": 0.06830454848225331, + "grad_norm": 2.0636112689971924, + "learning_rate": 4.942672033157373e-05, + "loss": 6.218, + "step": 11485 + }, + { + "epoch": 0.0683104957655343, + "grad_norm": 1.8479397296905518, + "learning_rate": 4.9426620870926256e-05, + "loss": 6.1283, + "step": 11486 + }, + { + "epoch": 0.0683164430488153, + "grad_norm": 1.9079830646514893, + "learning_rate": 4.94265214017517e-05, + "loss": 6.127, + "step": 11487 + }, + { + "epoch": 0.06832239033209629, + "grad_norm": 2.1076481342315674, + "learning_rate": 4.9426421924050105e-05, + "loss": 5.9978, + "step": 11488 + }, + { + "epoch": 0.0683283376153773, + "grad_norm": 1.885231375694275, + "learning_rate": 4.942632243782149e-05, + "loss": 5.8269, + "step": 11489 + }, + { + "epoch": 0.0683342848986583, + "grad_norm": 1.968980073928833, + "learning_rate": 4.942622294306591e-05, + "loss": 5.899, + "step": 11490 + }, + { + "epoch": 0.06834023218193928, + "grad_norm": 1.9857345819473267, + "learning_rate": 4.9426123439783376e-05, + "loss": 5.9416, + "step": 11491 + }, + { + "epoch": 0.06834617946522029, + "grad_norm": 1.8433799743652344, + "learning_rate": 4.942602392797394e-05, + "loss": 6.0714, + "step": 11492 + }, + { + "epoch": 0.06835212674850129, + "grad_norm": 1.9299565553665161, + "learning_rate": 4.942592440763764e-05, + "loss": 6.14, + "step": 11493 + }, + { + "epoch": 0.06835807403178228, + "grad_norm": 1.5700571537017822, + "learning_rate": 4.9425824878774486e-05, + "loss": 6.0496, + "step": 11494 + }, + { + "epoch": 0.06836402131506328, + "grad_norm": 1.6914032697677612, + "learning_rate": 4.942572534138454e-05, + "loss": 5.8301, + "step": 11495 + }, + { + "epoch": 0.06836996859834428, + "grad_norm": 1.6765984296798706, + "learning_rate": 4.942562579546782e-05, + "loss": 6.0701, + "step": 11496 + }, + { + "epoch": 0.06837591588162527, + "grad_norm": 1.715425729751587, + "learning_rate": 4.9425526241024364e-05, + "loss": 5.9499, + "step": 11497 + }, + { + "epoch": 0.06838186316490627, + "grad_norm": 1.8849130868911743, + "learning_rate": 4.942542667805422e-05, + "loss": 5.7088, + "step": 11498 + }, + { + "epoch": 0.06838781044818727, + "grad_norm": 2.1290276050567627, + "learning_rate": 4.9425327106557405e-05, + "loss": 5.9329, + "step": 11499 + }, + { + "epoch": 0.06839375773146826, + "grad_norm": 1.9105192422866821, + "learning_rate": 4.942522752653396e-05, + "loss": 5.9068, + "step": 11500 + }, + { + "epoch": 0.06839970501474926, + "grad_norm": 1.9120036363601685, + "learning_rate": 4.9425127937983926e-05, + "loss": 5.8411, + "step": 11501 + }, + { + "epoch": 0.06840565229803026, + "grad_norm": 2.1045427322387695, + "learning_rate": 4.942502834090732e-05, + "loss": 6.1575, + "step": 11502 + }, + { + "epoch": 0.06841159958131125, + "grad_norm": 1.8271901607513428, + "learning_rate": 4.94249287353042e-05, + "loss": 6.0732, + "step": 11503 + }, + { + "epoch": 0.06841754686459225, + "grad_norm": 1.4770866632461548, + "learning_rate": 4.942482912117459e-05, + "loss": 6.0823, + "step": 11504 + }, + { + "epoch": 0.06842349414787326, + "grad_norm": 1.7055792808532715, + "learning_rate": 4.942472949851852e-05, + "loss": 6.0738, + "step": 11505 + }, + { + "epoch": 0.06842944143115424, + "grad_norm": 1.588705062866211, + "learning_rate": 4.942462986733602e-05, + "loss": 5.9731, + "step": 11506 + }, + { + "epoch": 0.06843538871443525, + "grad_norm": 2.662527561187744, + "learning_rate": 4.942453022762715e-05, + "loss": 5.7745, + "step": 11507 + }, + { + "epoch": 0.06844133599771625, + "grad_norm": 2.0649495124816895, + "learning_rate": 4.9424430579391925e-05, + "loss": 5.7173, + "step": 11508 + }, + { + "epoch": 0.06844728328099724, + "grad_norm": 1.647801160812378, + "learning_rate": 4.942433092263038e-05, + "loss": 6.1516, + "step": 11509 + }, + { + "epoch": 0.06845323056427824, + "grad_norm": 1.743788480758667, + "learning_rate": 4.942423125734256e-05, + "loss": 6.0211, + "step": 11510 + }, + { + "epoch": 0.06845917784755924, + "grad_norm": 1.898647665977478, + "learning_rate": 4.942413158352849e-05, + "loss": 6.0106, + "step": 11511 + }, + { + "epoch": 0.06846512513084023, + "grad_norm": 1.5159860849380493, + "learning_rate": 4.94240319011882e-05, + "loss": 5.8759, + "step": 11512 + }, + { + "epoch": 0.06847107241412123, + "grad_norm": 3.265730142593384, + "learning_rate": 4.9423932210321744e-05, + "loss": 4.7228, + "step": 11513 + }, + { + "epoch": 0.06847701969740223, + "grad_norm": 2.9290871620178223, + "learning_rate": 4.9423832510929136e-05, + "loss": 4.5315, + "step": 11514 + }, + { + "epoch": 0.06848296698068322, + "grad_norm": 2.4189975261688232, + "learning_rate": 4.942373280301042e-05, + "loss": 4.5803, + "step": 11515 + }, + { + "epoch": 0.06848891426396422, + "grad_norm": 2.4018993377685547, + "learning_rate": 4.9423633086565645e-05, + "loss": 5.1411, + "step": 11516 + }, + { + "epoch": 0.06849486154724521, + "grad_norm": 2.4697556495666504, + "learning_rate": 4.9423533361594824e-05, + "loss": 5.1523, + "step": 11517 + }, + { + "epoch": 0.06850080883052621, + "grad_norm": 2.1573715209960938, + "learning_rate": 4.942343362809799e-05, + "loss": 5.3488, + "step": 11518 + }, + { + "epoch": 0.06850675611380722, + "grad_norm": 1.9723131656646729, + "learning_rate": 4.9423333886075205e-05, + "loss": 5.2315, + "step": 11519 + }, + { + "epoch": 0.0685127033970882, + "grad_norm": 1.6925430297851562, + "learning_rate": 4.9423234135526475e-05, + "loss": 5.3055, + "step": 11520 + }, + { + "epoch": 0.0685186506803692, + "grad_norm": 2.8665122985839844, + "learning_rate": 4.942313437645185e-05, + "loss": 4.4905, + "step": 11521 + }, + { + "epoch": 0.06852459796365021, + "grad_norm": 2.7538015842437744, + "learning_rate": 4.942303460885136e-05, + "loss": 4.3863, + "step": 11522 + }, + { + "epoch": 0.0685305452469312, + "grad_norm": 2.335664987564087, + "learning_rate": 4.942293483272504e-05, + "loss": 4.4571, + "step": 11523 + }, + { + "epoch": 0.0685364925302122, + "grad_norm": 1.7987995147705078, + "learning_rate": 4.942283504807293e-05, + "loss": 5.1802, + "step": 11524 + }, + { + "epoch": 0.0685424398134932, + "grad_norm": 2.3286690711975098, + "learning_rate": 4.9422735254895056e-05, + "loss": 5.2883, + "step": 11525 + }, + { + "epoch": 0.06854838709677419, + "grad_norm": 2.093317747116089, + "learning_rate": 4.9422635453191466e-05, + "loss": 5.2589, + "step": 11526 + }, + { + "epoch": 0.06855433438005519, + "grad_norm": 1.914236307144165, + "learning_rate": 4.942253564296218e-05, + "loss": 5.4347, + "step": 11527 + }, + { + "epoch": 0.06856028166333619, + "grad_norm": 1.602265477180481, + "learning_rate": 4.942243582420724e-05, + "loss": 5.8021, + "step": 11528 + }, + { + "epoch": 0.06856622894661718, + "grad_norm": 1.4433797597885132, + "learning_rate": 4.9422335996926674e-05, + "loss": 5.7432, + "step": 11529 + }, + { + "epoch": 0.06857217622989818, + "grad_norm": 1.3481166362762451, + "learning_rate": 4.942223616112053e-05, + "loss": 5.2946, + "step": 11530 + }, + { + "epoch": 0.06857812351317918, + "grad_norm": 1.879550576210022, + "learning_rate": 4.942213631678883e-05, + "loss": 5.2669, + "step": 11531 + }, + { + "epoch": 0.06858407079646017, + "grad_norm": 2.7241995334625244, + "learning_rate": 4.942203646393162e-05, + "loss": 5.2248, + "step": 11532 + }, + { + "epoch": 0.06859001807974117, + "grad_norm": 1.9870814085006714, + "learning_rate": 4.942193660254892e-05, + "loss": 5.4025, + "step": 11533 + }, + { + "epoch": 0.06859596536302218, + "grad_norm": 1.89231276512146, + "learning_rate": 4.942183673264079e-05, + "loss": 5.6046, + "step": 11534 + }, + { + "epoch": 0.06860191264630316, + "grad_norm": 2.024684429168701, + "learning_rate": 4.9421736854207235e-05, + "loss": 5.4031, + "step": 11535 + }, + { + "epoch": 0.06860785992958417, + "grad_norm": 1.6764521598815918, + "learning_rate": 4.942163696724831e-05, + "loss": 5.702, + "step": 11536 + }, + { + "epoch": 0.06861380721286517, + "grad_norm": 1.7738621234893799, + "learning_rate": 4.942153707176405e-05, + "loss": 5.1491, + "step": 11537 + }, + { + "epoch": 0.06861975449614616, + "grad_norm": 1.416986346244812, + "learning_rate": 4.942143716775447e-05, + "loss": 5.3883, + "step": 11538 + }, + { + "epoch": 0.06862570177942716, + "grad_norm": 1.837067723274231, + "learning_rate": 4.942133725521963e-05, + "loss": 5.2945, + "step": 11539 + }, + { + "epoch": 0.06863164906270816, + "grad_norm": 1.995610237121582, + "learning_rate": 4.942123733415955e-05, + "loss": 5.2589, + "step": 11540 + }, + { + "epoch": 0.06863759634598915, + "grad_norm": 1.9689414501190186, + "learning_rate": 4.9421137404574264e-05, + "loss": 5.3715, + "step": 11541 + }, + { + "epoch": 0.06864354362927015, + "grad_norm": 1.6984235048294067, + "learning_rate": 4.942103746646382e-05, + "loss": 5.3987, + "step": 11542 + }, + { + "epoch": 0.06864949091255115, + "grad_norm": 1.2645832300186157, + "learning_rate": 4.9420937519828234e-05, + "loss": 5.2142, + "step": 11543 + }, + { + "epoch": 0.06865543819583214, + "grad_norm": 1.6830233335494995, + "learning_rate": 4.9420837564667556e-05, + "loss": 5.1172, + "step": 11544 + }, + { + "epoch": 0.06866138547911314, + "grad_norm": 1.5734926462173462, + "learning_rate": 4.9420737600981816e-05, + "loss": 5.3789, + "step": 11545 + }, + { + "epoch": 0.06866733276239413, + "grad_norm": 1.7375764846801758, + "learning_rate": 4.942063762877105e-05, + "loss": 5.5311, + "step": 11546 + }, + { + "epoch": 0.06867328004567513, + "grad_norm": 1.5421762466430664, + "learning_rate": 4.942053764803529e-05, + "loss": 5.1722, + "step": 11547 + }, + { + "epoch": 0.06867922732895614, + "grad_norm": 1.6282575130462646, + "learning_rate": 4.942043765877457e-05, + "loss": 5.4754, + "step": 11548 + }, + { + "epoch": 0.06868517461223712, + "grad_norm": 1.5595266819000244, + "learning_rate": 4.9420337660988936e-05, + "loss": 5.3516, + "step": 11549 + }, + { + "epoch": 0.06869112189551813, + "grad_norm": 1.5642317533493042, + "learning_rate": 4.9420237654678405e-05, + "loss": 5.2364, + "step": 11550 + }, + { + "epoch": 0.06869706917879913, + "grad_norm": 1.5491602420806885, + "learning_rate": 4.942013763984302e-05, + "loss": 5.1566, + "step": 11551 + }, + { + "epoch": 0.06870301646208012, + "grad_norm": 1.4256258010864258, + "learning_rate": 4.942003761648283e-05, + "loss": 5.1592, + "step": 11552 + }, + { + "epoch": 0.06870896374536112, + "grad_norm": 1.756016492843628, + "learning_rate": 4.9419937584597846e-05, + "loss": 5.012, + "step": 11553 + }, + { + "epoch": 0.06871491102864212, + "grad_norm": 2.5290040969848633, + "learning_rate": 4.941983754418812e-05, + "loss": 4.571, + "step": 11554 + }, + { + "epoch": 0.06872085831192311, + "grad_norm": 2.6146528720855713, + "learning_rate": 4.9419737495253685e-05, + "loss": 4.3515, + "step": 11555 + }, + { + "epoch": 0.06872680559520411, + "grad_norm": 2.3333144187927246, + "learning_rate": 4.941963743779456e-05, + "loss": 4.3032, + "step": 11556 + }, + { + "epoch": 0.06873275287848511, + "grad_norm": 2.342433452606201, + "learning_rate": 4.9419537371810795e-05, + "loss": 4.2942, + "step": 11557 + }, + { + "epoch": 0.0687387001617661, + "grad_norm": 2.423696517944336, + "learning_rate": 4.941943729730243e-05, + "loss": 4.4, + "step": 11558 + }, + { + "epoch": 0.0687446474450471, + "grad_norm": 2.3420050144195557, + "learning_rate": 4.941933721426948e-05, + "loss": 5.0466, + "step": 11559 + }, + { + "epoch": 0.0687505947283281, + "grad_norm": 2.7115821838378906, + "learning_rate": 4.9419237122712e-05, + "loss": 5.1197, + "step": 11560 + }, + { + "epoch": 0.06875654201160909, + "grad_norm": 2.7316489219665527, + "learning_rate": 4.9419137022630014e-05, + "loss": 5.2435, + "step": 11561 + }, + { + "epoch": 0.0687624892948901, + "grad_norm": 2.291551113128662, + "learning_rate": 4.941903691402356e-05, + "loss": 5.0345, + "step": 11562 + }, + { + "epoch": 0.0687684365781711, + "grad_norm": 2.4499049186706543, + "learning_rate": 4.941893679689267e-05, + "loss": 4.503, + "step": 11563 + }, + { + "epoch": 0.06877438386145208, + "grad_norm": 2.7120168209075928, + "learning_rate": 4.9418836671237385e-05, + "loss": 4.2954, + "step": 11564 + }, + { + "epoch": 0.06878033114473309, + "grad_norm": 2.8483526706695557, + "learning_rate": 4.941873653705774e-05, + "loss": 6.269, + "step": 11565 + }, + { + "epoch": 0.06878627842801409, + "grad_norm": 2.3191473484039307, + "learning_rate": 4.941863639435376e-05, + "loss": 6.1628, + "step": 11566 + }, + { + "epoch": 0.06879222571129508, + "grad_norm": 3.4622583389282227, + "learning_rate": 4.9418536243125486e-05, + "loss": 5.6115, + "step": 11567 + }, + { + "epoch": 0.06879817299457608, + "grad_norm": 1.7118897438049316, + "learning_rate": 4.941843608337295e-05, + "loss": 5.4801, + "step": 11568 + }, + { + "epoch": 0.06880412027785708, + "grad_norm": 2.876338243484497, + "learning_rate": 4.9418335915096195e-05, + "loss": 5.0806, + "step": 11569 + }, + { + "epoch": 0.06881006756113807, + "grad_norm": 2.2875587940216064, + "learning_rate": 4.941823573829525e-05, + "loss": 5.2833, + "step": 11570 + }, + { + "epoch": 0.06881601484441907, + "grad_norm": 1.797743320465088, + "learning_rate": 4.9418135552970155e-05, + "loss": 6.1407, + "step": 11571 + }, + { + "epoch": 0.06882196212770007, + "grad_norm": 1.957331895828247, + "learning_rate": 4.941803535912094e-05, + "loss": 5.8743, + "step": 11572 + }, + { + "epoch": 0.06882790941098106, + "grad_norm": 1.9552925825119019, + "learning_rate": 4.9417935156747644e-05, + "loss": 5.584, + "step": 11573 + }, + { + "epoch": 0.06883385669426206, + "grad_norm": 2.057610034942627, + "learning_rate": 4.94178349458503e-05, + "loss": 5.8445, + "step": 11574 + }, + { + "epoch": 0.06883980397754305, + "grad_norm": 1.7856727838516235, + "learning_rate": 4.941773472642893e-05, + "loss": 6.0133, + "step": 11575 + }, + { + "epoch": 0.06884575126082405, + "grad_norm": 1.4494417905807495, + "learning_rate": 4.941763449848359e-05, + "loss": 5.888, + "step": 11576 + }, + { + "epoch": 0.06885169854410506, + "grad_norm": 2.1377499103546143, + "learning_rate": 4.9417534262014306e-05, + "loss": 6.0604, + "step": 11577 + }, + { + "epoch": 0.06885764582738604, + "grad_norm": 1.769888162612915, + "learning_rate": 4.9417434017021105e-05, + "loss": 5.8815, + "step": 11578 + }, + { + "epoch": 0.06886359311066705, + "grad_norm": 1.933935523033142, + "learning_rate": 4.9417333763504036e-05, + "loss": 5.6601, + "step": 11579 + }, + { + "epoch": 0.06886954039394805, + "grad_norm": 1.8672062158584595, + "learning_rate": 4.941723350146313e-05, + "loss": 5.8143, + "step": 11580 + }, + { + "epoch": 0.06887548767722904, + "grad_norm": 1.9899057149887085, + "learning_rate": 4.941713323089842e-05, + "loss": 5.8465, + "step": 11581 + }, + { + "epoch": 0.06888143496051004, + "grad_norm": 2.1053643226623535, + "learning_rate": 4.941703295180994e-05, + "loss": 5.4582, + "step": 11582 + }, + { + "epoch": 0.06888738224379104, + "grad_norm": 1.9435245990753174, + "learning_rate": 4.9416932664197726e-05, + "loss": 5.8503, + "step": 11583 + }, + { + "epoch": 0.06889332952707203, + "grad_norm": 1.9407175779342651, + "learning_rate": 4.941683236806181e-05, + "loss": 5.706, + "step": 11584 + }, + { + "epoch": 0.06889927681035303, + "grad_norm": 2.0505893230438232, + "learning_rate": 4.941673206340224e-05, + "loss": 6.01, + "step": 11585 + }, + { + "epoch": 0.06890522409363403, + "grad_norm": 1.6713486909866333, + "learning_rate": 4.941663175021903e-05, + "loss": 5.8347, + "step": 11586 + }, + { + "epoch": 0.06891117137691502, + "grad_norm": 1.5333812236785889, + "learning_rate": 4.941653142851223e-05, + "loss": 5.8493, + "step": 11587 + }, + { + "epoch": 0.06891711866019602, + "grad_norm": 2.10982346534729, + "learning_rate": 4.9416431098281865e-05, + "loss": 5.4037, + "step": 11588 + }, + { + "epoch": 0.06892306594347702, + "grad_norm": 1.766663908958435, + "learning_rate": 4.9416330759527985e-05, + "loss": 5.0335, + "step": 11589 + }, + { + "epoch": 0.06892901322675801, + "grad_norm": 2.0600688457489014, + "learning_rate": 4.9416230412250615e-05, + "loss": 5.4017, + "step": 11590 + }, + { + "epoch": 0.06893496051003901, + "grad_norm": 1.6271671056747437, + "learning_rate": 4.941613005644979e-05, + "loss": 5.903, + "step": 11591 + }, + { + "epoch": 0.06894090779332002, + "grad_norm": 1.9222697019577026, + "learning_rate": 4.9416029692125544e-05, + "loss": 5.1666, + "step": 11592 + }, + { + "epoch": 0.068946855076601, + "grad_norm": 1.7405030727386475, + "learning_rate": 4.941592931927792e-05, + "loss": 5.0799, + "step": 11593 + }, + { + "epoch": 0.068952802359882, + "grad_norm": 1.7639994621276855, + "learning_rate": 4.941582893790694e-05, + "loss": 5.7596, + "step": 11594 + }, + { + "epoch": 0.06895874964316301, + "grad_norm": 1.9628292322158813, + "learning_rate": 4.941572854801265e-05, + "loss": 4.4573, + "step": 11595 + }, + { + "epoch": 0.068964696926444, + "grad_norm": 1.7616615295410156, + "learning_rate": 4.941562814959508e-05, + "loss": 4.6399, + "step": 11596 + }, + { + "epoch": 0.068970644209725, + "grad_norm": 1.8174281120300293, + "learning_rate": 4.9415527742654265e-05, + "loss": 5.6279, + "step": 11597 + }, + { + "epoch": 0.068976591493006, + "grad_norm": 1.563138723373413, + "learning_rate": 4.941542732719025e-05, + "loss": 5.8696, + "step": 11598 + }, + { + "epoch": 0.06898253877628699, + "grad_norm": 1.4704676866531372, + "learning_rate": 4.9415326903203055e-05, + "loss": 5.7129, + "step": 11599 + }, + { + "epoch": 0.06898848605956799, + "grad_norm": 2.484572410583496, + "learning_rate": 4.9415226470692724e-05, + "loss": 5.336, + "step": 11600 + }, + { + "epoch": 0.068994433342849, + "grad_norm": 1.882876992225647, + "learning_rate": 4.9415126029659284e-05, + "loss": 5.4273, + "step": 11601 + }, + { + "epoch": 0.06900038062612998, + "grad_norm": 1.7827874422073364, + "learning_rate": 4.941502558010278e-05, + "loss": 5.6699, + "step": 11602 + }, + { + "epoch": 0.06900632790941098, + "grad_norm": 1.5609276294708252, + "learning_rate": 4.941492512202325e-05, + "loss": 5.648, + "step": 11603 + }, + { + "epoch": 0.06901227519269197, + "grad_norm": 1.6941063404083252, + "learning_rate": 4.941482465542071e-05, + "loss": 5.633, + "step": 11604 + }, + { + "epoch": 0.06901822247597297, + "grad_norm": 1.768922209739685, + "learning_rate": 4.941472418029521e-05, + "loss": 5.6072, + "step": 11605 + }, + { + "epoch": 0.06902416975925398, + "grad_norm": 2.225846767425537, + "learning_rate": 4.941462369664679e-05, + "loss": 4.9314, + "step": 11606 + }, + { + "epoch": 0.06903011704253496, + "grad_norm": 2.4479281902313232, + "learning_rate": 4.941452320447546e-05, + "loss": 5.0563, + "step": 11607 + }, + { + "epoch": 0.06903606432581597, + "grad_norm": 2.358238935470581, + "learning_rate": 4.941442270378129e-05, + "loss": 4.9379, + "step": 11608 + }, + { + "epoch": 0.06904201160909697, + "grad_norm": 2.2679247856140137, + "learning_rate": 4.941432219456429e-05, + "loss": 5.0655, + "step": 11609 + }, + { + "epoch": 0.06904795889237796, + "grad_norm": 2.524176597595215, + "learning_rate": 4.94142216768245e-05, + "loss": 4.8694, + "step": 11610 + }, + { + "epoch": 0.06905390617565896, + "grad_norm": 2.1919515132904053, + "learning_rate": 4.9414121150561966e-05, + "loss": 5.0889, + "step": 11611 + }, + { + "epoch": 0.06905985345893996, + "grad_norm": 2.2838563919067383, + "learning_rate": 4.94140206157767e-05, + "loss": 4.9942, + "step": 11612 + }, + { + "epoch": 0.06906580074222095, + "grad_norm": 2.2270026206970215, + "learning_rate": 4.9413920072468764e-05, + "loss": 4.9885, + "step": 11613 + }, + { + "epoch": 0.06907174802550195, + "grad_norm": 2.175245761871338, + "learning_rate": 4.9413819520638176e-05, + "loss": 4.9829, + "step": 11614 + }, + { + "epoch": 0.06907769530878295, + "grad_norm": 2.128441572189331, + "learning_rate": 4.941371896028498e-05, + "loss": 4.9802, + "step": 11615 + }, + { + "epoch": 0.06908364259206394, + "grad_norm": 2.7656328678131104, + "learning_rate": 4.94136183914092e-05, + "loss": 5.1302, + "step": 11616 + }, + { + "epoch": 0.06908958987534494, + "grad_norm": 2.23917818069458, + "learning_rate": 4.941351781401088e-05, + "loss": 4.8766, + "step": 11617 + }, + { + "epoch": 0.06909553715862594, + "grad_norm": 1.861399531364441, + "learning_rate": 4.941341722809005e-05, + "loss": 5.8151, + "step": 11618 + }, + { + "epoch": 0.06910148444190693, + "grad_norm": 2.13590145111084, + "learning_rate": 4.9413316633646754e-05, + "loss": 5.6892, + "step": 11619 + }, + { + "epoch": 0.06910743172518793, + "grad_norm": 1.8261966705322266, + "learning_rate": 4.9413216030681024e-05, + "loss": 6.1387, + "step": 11620 + }, + { + "epoch": 0.06911337900846894, + "grad_norm": 2.5121877193450928, + "learning_rate": 4.941311541919289e-05, + "loss": 5.3217, + "step": 11621 + }, + { + "epoch": 0.06911932629174992, + "grad_norm": 2.1011979579925537, + "learning_rate": 4.941301479918239e-05, + "loss": 5.048, + "step": 11622 + }, + { + "epoch": 0.06912527357503093, + "grad_norm": 2.214597225189209, + "learning_rate": 4.941291417064956e-05, + "loss": 5.4312, + "step": 11623 + }, + { + "epoch": 0.06913122085831193, + "grad_norm": 2.6525864601135254, + "learning_rate": 4.941281353359443e-05, + "loss": 4.4151, + "step": 11624 + }, + { + "epoch": 0.06913716814159292, + "grad_norm": 1.9638911485671997, + "learning_rate": 4.941271288801704e-05, + "loss": 5.0091, + "step": 11625 + }, + { + "epoch": 0.06914311542487392, + "grad_norm": 2.062688112258911, + "learning_rate": 4.941261223391742e-05, + "loss": 5.503, + "step": 11626 + }, + { + "epoch": 0.06914906270815492, + "grad_norm": 2.219430685043335, + "learning_rate": 4.941251157129561e-05, + "loss": 4.984, + "step": 11627 + }, + { + "epoch": 0.06915500999143591, + "grad_norm": 2.0745718479156494, + "learning_rate": 4.941241090015165e-05, + "loss": 5.3094, + "step": 11628 + }, + { + "epoch": 0.06916095727471691, + "grad_norm": 1.8852496147155762, + "learning_rate": 4.941231022048557e-05, + "loss": 5.2424, + "step": 11629 + }, + { + "epoch": 0.06916690455799791, + "grad_norm": 2.335723400115967, + "learning_rate": 4.9412209532297404e-05, + "loss": 5.6031, + "step": 11630 + }, + { + "epoch": 0.0691728518412789, + "grad_norm": 2.167698621749878, + "learning_rate": 4.941210883558719e-05, + "loss": 5.3132, + "step": 11631 + }, + { + "epoch": 0.0691787991245599, + "grad_norm": 2.213068962097168, + "learning_rate": 4.941200813035495e-05, + "loss": 5.2049, + "step": 11632 + }, + { + "epoch": 0.06918474640784089, + "grad_norm": 1.9697870016098022, + "learning_rate": 4.941190741660075e-05, + "loss": 5.3118, + "step": 11633 + }, + { + "epoch": 0.0691906936911219, + "grad_norm": 1.7360777854919434, + "learning_rate": 4.941180669432458e-05, + "loss": 5.444, + "step": 11634 + }, + { + "epoch": 0.0691966409744029, + "grad_norm": 1.8400771617889404, + "learning_rate": 4.9411705963526514e-05, + "loss": 5.6975, + "step": 11635 + }, + { + "epoch": 0.06920258825768388, + "grad_norm": 1.492242693901062, + "learning_rate": 4.941160522420657e-05, + "loss": 5.5617, + "step": 11636 + }, + { + "epoch": 0.06920853554096489, + "grad_norm": 1.6014543771743774, + "learning_rate": 4.9411504476364794e-05, + "loss": 5.7317, + "step": 11637 + }, + { + "epoch": 0.06921448282424589, + "grad_norm": 1.7973628044128418, + "learning_rate": 4.9411403720001215e-05, + "loss": 5.3105, + "step": 11638 + }, + { + "epoch": 0.06922043010752688, + "grad_norm": 1.8314461708068848, + "learning_rate": 4.9411302955115853e-05, + "loss": 5.624, + "step": 11639 + }, + { + "epoch": 0.06922637739080788, + "grad_norm": 1.621315836906433, + "learning_rate": 4.941120218170877e-05, + "loss": 5.8243, + "step": 11640 + }, + { + "epoch": 0.06923232467408888, + "grad_norm": 2.0378596782684326, + "learning_rate": 4.941110139977998e-05, + "loss": 4.9275, + "step": 11641 + }, + { + "epoch": 0.06923827195736987, + "grad_norm": 1.8713582754135132, + "learning_rate": 4.941100060932954e-05, + "loss": 5.1218, + "step": 11642 + }, + { + "epoch": 0.06924421924065087, + "grad_norm": 1.878404140472412, + "learning_rate": 4.941089981035746e-05, + "loss": 5.4997, + "step": 11643 + }, + { + "epoch": 0.06925016652393187, + "grad_norm": 1.7230712175369263, + "learning_rate": 4.941079900286379e-05, + "loss": 5.5514, + "step": 11644 + }, + { + "epoch": 0.06925611380721286, + "grad_norm": 1.6272276639938354, + "learning_rate": 4.941069818684856e-05, + "loss": 5.7186, + "step": 11645 + }, + { + "epoch": 0.06926206109049386, + "grad_norm": 1.5610454082489014, + "learning_rate": 4.9410597362311814e-05, + "loss": 5.8929, + "step": 11646 + }, + { + "epoch": 0.06926800837377486, + "grad_norm": 1.7373837232589722, + "learning_rate": 4.941049652925358e-05, + "loss": 5.6428, + "step": 11647 + }, + { + "epoch": 0.06927395565705585, + "grad_norm": 1.9722628593444824, + "learning_rate": 4.9410395687673886e-05, + "loss": 5.9562, + "step": 11648 + }, + { + "epoch": 0.06927990294033685, + "grad_norm": 1.5603039264678955, + "learning_rate": 4.941029483757278e-05, + "loss": 6.031, + "step": 11649 + }, + { + "epoch": 0.06928585022361786, + "grad_norm": 1.6971800327301025, + "learning_rate": 4.941019397895029e-05, + "loss": 5.7527, + "step": 11650 + }, + { + "epoch": 0.06929179750689884, + "grad_norm": 1.9559118747711182, + "learning_rate": 4.9410093111806456e-05, + "loss": 5.0904, + "step": 11651 + }, + { + "epoch": 0.06929774479017985, + "grad_norm": 1.561122179031372, + "learning_rate": 4.9409992236141315e-05, + "loss": 5.7438, + "step": 11652 + }, + { + "epoch": 0.06930369207346085, + "grad_norm": 1.6071819067001343, + "learning_rate": 4.940989135195489e-05, + "loss": 5.8852, + "step": 11653 + }, + { + "epoch": 0.06930963935674184, + "grad_norm": 1.6804322004318237, + "learning_rate": 4.940979045924723e-05, + "loss": 5.7174, + "step": 11654 + }, + { + "epoch": 0.06931558664002284, + "grad_norm": 1.5802178382873535, + "learning_rate": 4.940968955801836e-05, + "loss": 5.8755, + "step": 11655 + }, + { + "epoch": 0.06932153392330384, + "grad_norm": 2.1002743244171143, + "learning_rate": 4.940958864826832e-05, + "loss": 5.6323, + "step": 11656 + }, + { + "epoch": 0.06932748120658483, + "grad_norm": 1.8874709606170654, + "learning_rate": 4.9409487729997144e-05, + "loss": 5.6798, + "step": 11657 + }, + { + "epoch": 0.06933342848986583, + "grad_norm": 1.6967203617095947, + "learning_rate": 4.940938680320487e-05, + "loss": 5.8461, + "step": 11658 + }, + { + "epoch": 0.06933937577314683, + "grad_norm": 1.9648679494857788, + "learning_rate": 4.9409285867891534e-05, + "loss": 5.842, + "step": 11659 + }, + { + "epoch": 0.06934532305642782, + "grad_norm": 1.8681408166885376, + "learning_rate": 4.940918492405716e-05, + "loss": 5.8859, + "step": 11660 + }, + { + "epoch": 0.06935127033970882, + "grad_norm": 2.0480551719665527, + "learning_rate": 4.9409083971701805e-05, + "loss": 5.6415, + "step": 11661 + }, + { + "epoch": 0.06935721762298983, + "grad_norm": 2.102832555770874, + "learning_rate": 4.940898301082548e-05, + "loss": 5.6163, + "step": 11662 + }, + { + "epoch": 0.06936316490627081, + "grad_norm": 1.7471407651901245, + "learning_rate": 4.940888204142824e-05, + "loss": 5.7973, + "step": 11663 + }, + { + "epoch": 0.06936911218955182, + "grad_norm": 1.9675641059875488, + "learning_rate": 4.94087810635101e-05, + "loss": 5.1125, + "step": 11664 + }, + { + "epoch": 0.0693750594728328, + "grad_norm": 1.6316107511520386, + "learning_rate": 4.940868007707111e-05, + "loss": 5.5067, + "step": 11665 + }, + { + "epoch": 0.0693810067561138, + "grad_norm": 1.8663619756698608, + "learning_rate": 4.940857908211131e-05, + "loss": 5.5552, + "step": 11666 + }, + { + "epoch": 0.06938695403939481, + "grad_norm": 2.155702590942383, + "learning_rate": 4.940847807863072e-05, + "loss": 6.0919, + "step": 11667 + }, + { + "epoch": 0.0693929013226758, + "grad_norm": 1.968467354774475, + "learning_rate": 4.9408377066629384e-05, + "loss": 5.8105, + "step": 11668 + }, + { + "epoch": 0.0693988486059568, + "grad_norm": 1.5245625972747803, + "learning_rate": 4.940827604610734e-05, + "loss": 5.8901, + "step": 11669 + }, + { + "epoch": 0.0694047958892378, + "grad_norm": 1.7377501726150513, + "learning_rate": 4.940817501706461e-05, + "loss": 5.5917, + "step": 11670 + }, + { + "epoch": 0.06941074317251879, + "grad_norm": 1.9668710231781006, + "learning_rate": 4.940807397950125e-05, + "loss": 5.6857, + "step": 11671 + }, + { + "epoch": 0.06941669045579979, + "grad_norm": 1.8168022632598877, + "learning_rate": 4.9407972933417266e-05, + "loss": 5.7032, + "step": 11672 + }, + { + "epoch": 0.06942263773908079, + "grad_norm": 2.4009077548980713, + "learning_rate": 4.940787187881273e-05, + "loss": 5.6767, + "step": 11673 + }, + { + "epoch": 0.06942858502236178, + "grad_norm": 1.8541746139526367, + "learning_rate": 4.940777081568765e-05, + "loss": 5.6327, + "step": 11674 + }, + { + "epoch": 0.06943453230564278, + "grad_norm": 2.028602361679077, + "learning_rate": 4.940766974404206e-05, + "loss": 5.0819, + "step": 11675 + }, + { + "epoch": 0.06944047958892378, + "grad_norm": 2.0870065689086914, + "learning_rate": 4.940756866387602e-05, + "loss": 5.1645, + "step": 11676 + }, + { + "epoch": 0.06944642687220477, + "grad_norm": 1.8009755611419678, + "learning_rate": 4.940746757518954e-05, + "loss": 4.9832, + "step": 11677 + }, + { + "epoch": 0.06945237415548577, + "grad_norm": 2.20975399017334, + "learning_rate": 4.9407366477982675e-05, + "loss": 4.9683, + "step": 11678 + }, + { + "epoch": 0.06945832143876678, + "grad_norm": 1.89133882522583, + "learning_rate": 4.940726537225544e-05, + "loss": 4.7736, + "step": 11679 + }, + { + "epoch": 0.06946426872204776, + "grad_norm": 1.7583657503128052, + "learning_rate": 4.940716425800789e-05, + "loss": 5.4275, + "step": 11680 + }, + { + "epoch": 0.06947021600532877, + "grad_norm": 2.1929352283477783, + "learning_rate": 4.940706313524004e-05, + "loss": 4.8441, + "step": 11681 + }, + { + "epoch": 0.06947616328860977, + "grad_norm": 2.1098999977111816, + "learning_rate": 4.940696200395194e-05, + "loss": 5.065, + "step": 11682 + }, + { + "epoch": 0.06948211057189076, + "grad_norm": 1.7651045322418213, + "learning_rate": 4.940686086414363e-05, + "loss": 5.7086, + "step": 11683 + }, + { + "epoch": 0.06948805785517176, + "grad_norm": 1.6675828695297241, + "learning_rate": 4.9406759715815134e-05, + "loss": 5.89, + "step": 11684 + }, + { + "epoch": 0.06949400513845276, + "grad_norm": 1.9754993915557861, + "learning_rate": 4.940665855896648e-05, + "loss": 5.7752, + "step": 11685 + }, + { + "epoch": 0.06949995242173375, + "grad_norm": 1.7652478218078613, + "learning_rate": 4.940655739359773e-05, + "loss": 5.6518, + "step": 11686 + }, + { + "epoch": 0.06950589970501475, + "grad_norm": 1.898997187614441, + "learning_rate": 4.940645621970889e-05, + "loss": 5.4579, + "step": 11687 + }, + { + "epoch": 0.06951184698829575, + "grad_norm": 2.1233060359954834, + "learning_rate": 4.940635503730001e-05, + "loss": 4.3979, + "step": 11688 + }, + { + "epoch": 0.06951779427157674, + "grad_norm": 2.0859549045562744, + "learning_rate": 4.940625384637113e-05, + "loss": 4.4309, + "step": 11689 + }, + { + "epoch": 0.06952374155485774, + "grad_norm": 2.051492929458618, + "learning_rate": 4.940615264692228e-05, + "loss": 4.4332, + "step": 11690 + }, + { + "epoch": 0.06952968883813875, + "grad_norm": 2.0359628200531006, + "learning_rate": 4.940605143895348e-05, + "loss": 4.29, + "step": 11691 + }, + { + "epoch": 0.06953563612141973, + "grad_norm": 2.0122604370117188, + "learning_rate": 4.940595022246479e-05, + "loss": 4.4391, + "step": 11692 + }, + { + "epoch": 0.06954158340470074, + "grad_norm": 2.059694290161133, + "learning_rate": 4.940584899745624e-05, + "loss": 4.3993, + "step": 11693 + }, + { + "epoch": 0.06954753068798172, + "grad_norm": 2.0355825424194336, + "learning_rate": 4.940574776392786e-05, + "loss": 4.2829, + "step": 11694 + }, + { + "epoch": 0.06955347797126273, + "grad_norm": 1.933385968208313, + "learning_rate": 4.940564652187967e-05, + "loss": 4.372, + "step": 11695 + }, + { + "epoch": 0.06955942525454373, + "grad_norm": 2.0848586559295654, + "learning_rate": 4.940554527131174e-05, + "loss": 4.3064, + "step": 11696 + }, + { + "epoch": 0.06956537253782472, + "grad_norm": 1.889845848083496, + "learning_rate": 4.940544401222407e-05, + "loss": 4.3811, + "step": 11697 + }, + { + "epoch": 0.06957131982110572, + "grad_norm": 2.0076160430908203, + "learning_rate": 4.9405342744616724e-05, + "loss": 4.3382, + "step": 11698 + }, + { + "epoch": 0.06957726710438672, + "grad_norm": 1.9708037376403809, + "learning_rate": 4.940524146848971e-05, + "loss": 4.4659, + "step": 11699 + }, + { + "epoch": 0.06958321438766771, + "grad_norm": 2.086454153060913, + "learning_rate": 4.940514018384309e-05, + "loss": 4.196, + "step": 11700 + }, + { + "epoch": 0.06958916167094871, + "grad_norm": 2.095062255859375, + "learning_rate": 4.940503889067689e-05, + "loss": 4.2062, + "step": 11701 + }, + { + "epoch": 0.06959510895422971, + "grad_norm": 2.0661754608154297, + "learning_rate": 4.940493758899114e-05, + "loss": 4.3468, + "step": 11702 + }, + { + "epoch": 0.0696010562375107, + "grad_norm": 2.073573350906372, + "learning_rate": 4.9404836278785875e-05, + "loss": 4.248, + "step": 11703 + }, + { + "epoch": 0.0696070035207917, + "grad_norm": 2.104018449783325, + "learning_rate": 4.940473496006114e-05, + "loss": 4.1523, + "step": 11704 + }, + { + "epoch": 0.0696129508040727, + "grad_norm": 2.067532777786255, + "learning_rate": 4.9404633632816954e-05, + "loss": 4.2721, + "step": 11705 + }, + { + "epoch": 0.06961889808735369, + "grad_norm": 2.036736249923706, + "learning_rate": 4.9404532297053376e-05, + "loss": 4.4057, + "step": 11706 + }, + { + "epoch": 0.0696248453706347, + "grad_norm": 1.9911088943481445, + "learning_rate": 4.940443095277042e-05, + "loss": 4.1875, + "step": 11707 + }, + { + "epoch": 0.0696307926539157, + "grad_norm": 2.017457962036133, + "learning_rate": 4.9404329599968124e-05, + "loss": 4.1506, + "step": 11708 + }, + { + "epoch": 0.06963673993719668, + "grad_norm": 1.8043596744537354, + "learning_rate": 4.940422823864654e-05, + "loss": 4.3937, + "step": 11709 + }, + { + "epoch": 0.06964268722047769, + "grad_norm": 2.0362250804901123, + "learning_rate": 4.9404126868805687e-05, + "loss": 3.8076, + "step": 11710 + }, + { + "epoch": 0.06964863450375869, + "grad_norm": 2.10723876953125, + "learning_rate": 4.940402549044561e-05, + "loss": 4.2487, + "step": 11711 + }, + { + "epoch": 0.06965458178703968, + "grad_norm": 2.1901967525482178, + "learning_rate": 4.940392410356632e-05, + "loss": 4.1183, + "step": 11712 + }, + { + "epoch": 0.06966052907032068, + "grad_norm": 2.196518659591675, + "learning_rate": 4.9403822708167896e-05, + "loss": 4.2959, + "step": 11713 + }, + { + "epoch": 0.06966647635360168, + "grad_norm": 2.1917595863342285, + "learning_rate": 4.940372130425034e-05, + "loss": 4.1011, + "step": 11714 + }, + { + "epoch": 0.06967242363688267, + "grad_norm": 2.14424991607666, + "learning_rate": 4.9403619891813696e-05, + "loss": 3.9033, + "step": 11715 + }, + { + "epoch": 0.06967837092016367, + "grad_norm": 1.9970608949661255, + "learning_rate": 4.9403518470858004e-05, + "loss": 3.9243, + "step": 11716 + }, + { + "epoch": 0.06968431820344467, + "grad_norm": 2.215721607208252, + "learning_rate": 4.9403417041383294e-05, + "loss": 4.0036, + "step": 11717 + }, + { + "epoch": 0.06969026548672566, + "grad_norm": 1.9153071641921997, + "learning_rate": 4.94033156033896e-05, + "loss": 5.6849, + "step": 11718 + }, + { + "epoch": 0.06969621277000666, + "grad_norm": 2.287951707839966, + "learning_rate": 4.9403214156876966e-05, + "loss": 4.3569, + "step": 11719 + }, + { + "epoch": 0.06970216005328767, + "grad_norm": 2.1257216930389404, + "learning_rate": 4.940311270184542e-05, + "loss": 4.1051, + "step": 11720 + }, + { + "epoch": 0.06970810733656865, + "grad_norm": 2.164879560470581, + "learning_rate": 4.9403011238295e-05, + "loss": 4.0754, + "step": 11721 + }, + { + "epoch": 0.06971405461984966, + "grad_norm": 2.2430567741394043, + "learning_rate": 4.940290976622574e-05, + "loss": 4.1251, + "step": 11722 + }, + { + "epoch": 0.06972000190313064, + "grad_norm": 2.2621891498565674, + "learning_rate": 4.940280828563768e-05, + "loss": 4.2302, + "step": 11723 + }, + { + "epoch": 0.06972594918641165, + "grad_norm": 2.0096445083618164, + "learning_rate": 4.940270679653085e-05, + "loss": 4.2853, + "step": 11724 + }, + { + "epoch": 0.06973189646969265, + "grad_norm": 2.211843729019165, + "learning_rate": 4.940260529890528e-05, + "loss": 3.6609, + "step": 11725 + }, + { + "epoch": 0.06973784375297364, + "grad_norm": 1.8500425815582275, + "learning_rate": 4.940250379276102e-05, + "loss": 3.8701, + "step": 11726 + }, + { + "epoch": 0.06974379103625464, + "grad_norm": 2.09136962890625, + "learning_rate": 4.94024022780981e-05, + "loss": 4.5569, + "step": 11727 + }, + { + "epoch": 0.06974973831953564, + "grad_norm": 1.9922528266906738, + "learning_rate": 4.940230075491655e-05, + "loss": 4.4055, + "step": 11728 + }, + { + "epoch": 0.06975568560281663, + "grad_norm": 2.253831624984741, + "learning_rate": 4.940219922321641e-05, + "loss": 4.114, + "step": 11729 + }, + { + "epoch": 0.06976163288609763, + "grad_norm": 2.0647006034851074, + "learning_rate": 4.94020976829977e-05, + "loss": 4.9004, + "step": 11730 + }, + { + "epoch": 0.06976758016937863, + "grad_norm": 2.5659384727478027, + "learning_rate": 4.940199613426049e-05, + "loss": 5.0852, + "step": 11731 + }, + { + "epoch": 0.06977352745265962, + "grad_norm": 2.227599859237671, + "learning_rate": 4.9401894577004796e-05, + "loss": 5.1603, + "step": 11732 + }, + { + "epoch": 0.06977947473594062, + "grad_norm": 1.8170785903930664, + "learning_rate": 4.940179301123063e-05, + "loss": 5.8334, + "step": 11733 + }, + { + "epoch": 0.06978542201922162, + "grad_norm": 2.1795544624328613, + "learning_rate": 4.940169143693807e-05, + "loss": 5.668, + "step": 11734 + }, + { + "epoch": 0.06979136930250261, + "grad_norm": 2.1248555183410645, + "learning_rate": 4.940158985412713e-05, + "loss": 5.7604, + "step": 11735 + }, + { + "epoch": 0.06979731658578361, + "grad_norm": 1.9677635431289673, + "learning_rate": 4.9401488262797845e-05, + "loss": 5.6568, + "step": 11736 + }, + { + "epoch": 0.06980326386906462, + "grad_norm": 1.9796242713928223, + "learning_rate": 4.940138666295025e-05, + "loss": 5.4303, + "step": 11737 + }, + { + "epoch": 0.0698092111523456, + "grad_norm": 1.7489395141601562, + "learning_rate": 4.9401285054584385e-05, + "loss": 6.1782, + "step": 11738 + }, + { + "epoch": 0.0698151584356266, + "grad_norm": 1.8067989349365234, + "learning_rate": 4.940118343770028e-05, + "loss": 6.0974, + "step": 11739 + }, + { + "epoch": 0.06982110571890761, + "grad_norm": 1.7377318143844604, + "learning_rate": 4.940108181229798e-05, + "loss": 5.8477, + "step": 11740 + }, + { + "epoch": 0.0698270530021886, + "grad_norm": 2.297499656677246, + "learning_rate": 4.940098017837751e-05, + "loss": 4.8027, + "step": 11741 + }, + { + "epoch": 0.0698330002854696, + "grad_norm": 1.7340888977050781, + "learning_rate": 4.940087853593891e-05, + "loss": 5.5897, + "step": 11742 + }, + { + "epoch": 0.0698389475687506, + "grad_norm": 2.019639730453491, + "learning_rate": 4.9400776884982216e-05, + "loss": 5.4493, + "step": 11743 + }, + { + "epoch": 0.06984489485203159, + "grad_norm": 1.7959356307983398, + "learning_rate": 4.9400675225507466e-05, + "loss": 5.5995, + "step": 11744 + }, + { + "epoch": 0.06985084213531259, + "grad_norm": 2.234757661819458, + "learning_rate": 4.940057355751468e-05, + "loss": 5.9542, + "step": 11745 + }, + { + "epoch": 0.06985678941859359, + "grad_norm": 2.047755241394043, + "learning_rate": 4.9400471881003925e-05, + "loss": 5.9125, + "step": 11746 + }, + { + "epoch": 0.06986273670187458, + "grad_norm": 1.9563192129135132, + "learning_rate": 4.940037019597521e-05, + "loss": 5.7298, + "step": 11747 + }, + { + "epoch": 0.06986868398515558, + "grad_norm": 2.7170934677124023, + "learning_rate": 4.940026850242857e-05, + "loss": 5.5172, + "step": 11748 + }, + { + "epoch": 0.06987463126843659, + "grad_norm": 2.326277494430542, + "learning_rate": 4.9400166800364056e-05, + "loss": 5.685, + "step": 11749 + }, + { + "epoch": 0.06988057855171757, + "grad_norm": 1.708383321762085, + "learning_rate": 4.94000650897817e-05, + "loss": 5.3879, + "step": 11750 + }, + { + "epoch": 0.06988652583499858, + "grad_norm": 1.897631049156189, + "learning_rate": 4.9399963370681527e-05, + "loss": 5.6856, + "step": 11751 + }, + { + "epoch": 0.06989247311827956, + "grad_norm": 2.227720260620117, + "learning_rate": 4.939986164306357e-05, + "loss": 5.4487, + "step": 11752 + }, + { + "epoch": 0.06989842040156057, + "grad_norm": 2.7821953296661377, + "learning_rate": 4.939975990692789e-05, + "loss": 5.7276, + "step": 11753 + }, + { + "epoch": 0.06990436768484157, + "grad_norm": 1.8389033079147339, + "learning_rate": 4.939965816227449e-05, + "loss": 5.6933, + "step": 11754 + }, + { + "epoch": 0.06991031496812256, + "grad_norm": 1.7653162479400635, + "learning_rate": 4.939955640910343e-05, + "loss": 5.6079, + "step": 11755 + }, + { + "epoch": 0.06991626225140356, + "grad_norm": 1.7504348754882812, + "learning_rate": 4.939945464741475e-05, + "loss": 6.0413, + "step": 11756 + }, + { + "epoch": 0.06992220953468456, + "grad_norm": 2.118326187133789, + "learning_rate": 4.939935287720845e-05, + "loss": 5.8937, + "step": 11757 + }, + { + "epoch": 0.06992815681796555, + "grad_norm": 1.9626812934875488, + "learning_rate": 4.93992510984846e-05, + "loss": 5.9564, + "step": 11758 + }, + { + "epoch": 0.06993410410124655, + "grad_norm": 1.9915722608566284, + "learning_rate": 4.939914931124322e-05, + "loss": 5.6851, + "step": 11759 + }, + { + "epoch": 0.06994005138452755, + "grad_norm": 1.7959195375442505, + "learning_rate": 4.939904751548435e-05, + "loss": 4.785, + "step": 11760 + }, + { + "epoch": 0.06994599866780854, + "grad_norm": 1.8472923040390015, + "learning_rate": 4.9398945711208025e-05, + "loss": 5.2683, + "step": 11761 + }, + { + "epoch": 0.06995194595108954, + "grad_norm": 1.4207996129989624, + "learning_rate": 4.9398843898414274e-05, + "loss": 5.5402, + "step": 11762 + }, + { + "epoch": 0.06995789323437054, + "grad_norm": 2.122070550918579, + "learning_rate": 4.9398742077103146e-05, + "loss": 5.5397, + "step": 11763 + }, + { + "epoch": 0.06996384051765153, + "grad_norm": 2.285970687866211, + "learning_rate": 4.939864024727467e-05, + "loss": 5.1401, + "step": 11764 + }, + { + "epoch": 0.06996978780093253, + "grad_norm": 2.1245667934417725, + "learning_rate": 4.9398538408928874e-05, + "loss": 5.2009, + "step": 11765 + }, + { + "epoch": 0.06997573508421354, + "grad_norm": 1.8151131868362427, + "learning_rate": 4.939843656206581e-05, + "loss": 4.8635, + "step": 11766 + }, + { + "epoch": 0.06998168236749452, + "grad_norm": 1.9139370918273926, + "learning_rate": 4.9398334706685494e-05, + "loss": 5.5998, + "step": 11767 + }, + { + "epoch": 0.06998762965077553, + "grad_norm": 1.6889853477478027, + "learning_rate": 4.9398232842787976e-05, + "loss": 5.6183, + "step": 11768 + }, + { + "epoch": 0.06999357693405653, + "grad_norm": 1.773409366607666, + "learning_rate": 4.939813097037329e-05, + "loss": 5.5083, + "step": 11769 + }, + { + "epoch": 0.06999952421733752, + "grad_norm": 2.195955991744995, + "learning_rate": 4.9398029089441465e-05, + "loss": 6.4436, + "step": 11770 + }, + { + "epoch": 0.07000547150061852, + "grad_norm": 2.058687448501587, + "learning_rate": 4.939792719999254e-05, + "loss": 6.2875, + "step": 11771 + }, + { + "epoch": 0.07001141878389952, + "grad_norm": 1.9074562788009644, + "learning_rate": 4.939782530202655e-05, + "loss": 5.8764, + "step": 11772 + }, + { + "epoch": 0.07001736606718051, + "grad_norm": 2.163663864135742, + "learning_rate": 4.9397723395543535e-05, + "loss": 5.4666, + "step": 11773 + }, + { + "epoch": 0.07002331335046151, + "grad_norm": 2.2188286781311035, + "learning_rate": 4.939762148054352e-05, + "loss": 6.0679, + "step": 11774 + }, + { + "epoch": 0.07002926063374251, + "grad_norm": 1.8202224969863892, + "learning_rate": 4.9397519557026553e-05, + "loss": 6.0465, + "step": 11775 + }, + { + "epoch": 0.0700352079170235, + "grad_norm": 1.9515994787216187, + "learning_rate": 4.939741762499266e-05, + "loss": 5.9634, + "step": 11776 + }, + { + "epoch": 0.0700411552003045, + "grad_norm": 1.772741675376892, + "learning_rate": 4.9397315684441886e-05, + "loss": 5.3117, + "step": 11777 + }, + { + "epoch": 0.0700471024835855, + "grad_norm": 1.7377926111221313, + "learning_rate": 4.9397213735374256e-05, + "loss": 5.7082, + "step": 11778 + }, + { + "epoch": 0.0700530497668665, + "grad_norm": 1.881205439567566, + "learning_rate": 4.939711177778982e-05, + "loss": 5.8463, + "step": 11779 + }, + { + "epoch": 0.0700589970501475, + "grad_norm": 1.893402099609375, + "learning_rate": 4.939700981168859e-05, + "loss": 5.8321, + "step": 11780 + }, + { + "epoch": 0.07006494433342848, + "grad_norm": 1.6830201148986816, + "learning_rate": 4.939690783707063e-05, + "loss": 5.8655, + "step": 11781 + }, + { + "epoch": 0.07007089161670949, + "grad_norm": 1.9164643287658691, + "learning_rate": 4.939680585393595e-05, + "loss": 5.7089, + "step": 11782 + }, + { + "epoch": 0.07007683889999049, + "grad_norm": 1.5564945936203003, + "learning_rate": 4.93967038622846e-05, + "loss": 5.8671, + "step": 11783 + }, + { + "epoch": 0.07008278618327148, + "grad_norm": 1.6557695865631104, + "learning_rate": 4.939660186211662e-05, + "loss": 5.7461, + "step": 11784 + }, + { + "epoch": 0.07008873346655248, + "grad_norm": 1.7161173820495605, + "learning_rate": 4.9396499853432035e-05, + "loss": 5.0569, + "step": 11785 + }, + { + "epoch": 0.07009468074983348, + "grad_norm": 1.6760550737380981, + "learning_rate": 4.939639783623088e-05, + "loss": 5.4683, + "step": 11786 + }, + { + "epoch": 0.07010062803311447, + "grad_norm": 1.818652629852295, + "learning_rate": 4.9396295810513196e-05, + "loss": 4.9676, + "step": 11787 + }, + { + "epoch": 0.07010657531639547, + "grad_norm": 2.016510009765625, + "learning_rate": 4.939619377627901e-05, + "loss": 5.255, + "step": 11788 + }, + { + "epoch": 0.07011252259967647, + "grad_norm": 2.1893560886383057, + "learning_rate": 4.939609173352838e-05, + "loss": 5.0798, + "step": 11789 + }, + { + "epoch": 0.07011846988295746, + "grad_norm": 1.8063241243362427, + "learning_rate": 4.939598968226132e-05, + "loss": 5.049, + "step": 11790 + }, + { + "epoch": 0.07012441716623846, + "grad_norm": 1.7766486406326294, + "learning_rate": 4.939588762247786e-05, + "loss": 4.8375, + "step": 11791 + }, + { + "epoch": 0.07013036444951946, + "grad_norm": 1.6848721504211426, + "learning_rate": 4.9395785554178066e-05, + "loss": 4.7944, + "step": 11792 + }, + { + "epoch": 0.07013631173280045, + "grad_norm": 1.5173190832138062, + "learning_rate": 4.939568347736195e-05, + "loss": 4.8558, + "step": 11793 + }, + { + "epoch": 0.07014225901608145, + "grad_norm": 1.9625753164291382, + "learning_rate": 4.939558139202955e-05, + "loss": 5.0129, + "step": 11794 + }, + { + "epoch": 0.07014820629936246, + "grad_norm": 2.1610453128814697, + "learning_rate": 4.93954792981809e-05, + "loss": 5.7208, + "step": 11795 + }, + { + "epoch": 0.07015415358264344, + "grad_norm": 2.272775411605835, + "learning_rate": 4.939537719581605e-05, + "loss": 5.3673, + "step": 11796 + }, + { + "epoch": 0.07016010086592445, + "grad_norm": 1.8652429580688477, + "learning_rate": 4.9395275084935025e-05, + "loss": 5.7692, + "step": 11797 + }, + { + "epoch": 0.07016604814920545, + "grad_norm": 1.6594206094741821, + "learning_rate": 4.939517296553786e-05, + "loss": 5.7201, + "step": 11798 + }, + { + "epoch": 0.07017199543248644, + "grad_norm": 1.7499476671218872, + "learning_rate": 4.939507083762459e-05, + "loss": 5.6471, + "step": 11799 + }, + { + "epoch": 0.07017794271576744, + "grad_norm": 2.050825834274292, + "learning_rate": 4.939496870119525e-05, + "loss": 5.4805, + "step": 11800 + }, + { + "epoch": 0.07018388999904844, + "grad_norm": 2.033815383911133, + "learning_rate": 4.939486655624988e-05, + "loss": 5.7465, + "step": 11801 + }, + { + "epoch": 0.07018983728232943, + "grad_norm": 1.7499231100082397, + "learning_rate": 4.939476440278852e-05, + "loss": 5.0271, + "step": 11802 + }, + { + "epoch": 0.07019578456561043, + "grad_norm": 2.331024646759033, + "learning_rate": 4.939466224081119e-05, + "loss": 5.0491, + "step": 11803 + }, + { + "epoch": 0.07020173184889143, + "grad_norm": 2.089859962463379, + "learning_rate": 4.939456007031794e-05, + "loss": 5.6678, + "step": 11804 + }, + { + "epoch": 0.07020767913217242, + "grad_norm": 2.0704381465911865, + "learning_rate": 4.93944578913088e-05, + "loss": 5.5128, + "step": 11805 + }, + { + "epoch": 0.07021362641545342, + "grad_norm": 2.3215534687042236, + "learning_rate": 4.939435570378381e-05, + "loss": 4.8886, + "step": 11806 + }, + { + "epoch": 0.07021957369873442, + "grad_norm": 2.2506353855133057, + "learning_rate": 4.9394253507743004e-05, + "loss": 4.8606, + "step": 11807 + }, + { + "epoch": 0.07022552098201541, + "grad_norm": 1.9065401554107666, + "learning_rate": 4.939415130318641e-05, + "loss": 5.4306, + "step": 11808 + }, + { + "epoch": 0.07023146826529642, + "grad_norm": 1.9229549169540405, + "learning_rate": 4.9394049090114076e-05, + "loss": 5.5586, + "step": 11809 + }, + { + "epoch": 0.0702374155485774, + "grad_norm": 1.857392430305481, + "learning_rate": 4.939394686852603e-05, + "loss": 5.382, + "step": 11810 + }, + { + "epoch": 0.0702433628318584, + "grad_norm": 2.0430874824523926, + "learning_rate": 4.939384463842231e-05, + "loss": 5.4362, + "step": 11811 + }, + { + "epoch": 0.07024931011513941, + "grad_norm": 1.839227318763733, + "learning_rate": 4.939374239980294e-05, + "loss": 5.0285, + "step": 11812 + }, + { + "epoch": 0.0702552573984204, + "grad_norm": 1.9690957069396973, + "learning_rate": 4.939364015266798e-05, + "loss": 5.5512, + "step": 11813 + }, + { + "epoch": 0.0702612046817014, + "grad_norm": 1.819841980934143, + "learning_rate": 4.939353789701745e-05, + "loss": 5.4886, + "step": 11814 + }, + { + "epoch": 0.0702671519649824, + "grad_norm": 1.7670280933380127, + "learning_rate": 4.939343563285138e-05, + "loss": 5.0925, + "step": 11815 + }, + { + "epoch": 0.07027309924826339, + "grad_norm": 1.478452444076538, + "learning_rate": 4.9393333360169824e-05, + "loss": 5.6562, + "step": 11816 + }, + { + "epoch": 0.07027904653154439, + "grad_norm": 1.7796739339828491, + "learning_rate": 4.93932310789728e-05, + "loss": 5.7462, + "step": 11817 + }, + { + "epoch": 0.07028499381482539, + "grad_norm": 1.425431728363037, + "learning_rate": 4.939312878926036e-05, + "loss": 5.6002, + "step": 11818 + }, + { + "epoch": 0.07029094109810638, + "grad_norm": 1.7066885232925415, + "learning_rate": 4.939302649103252e-05, + "loss": 5.3827, + "step": 11819 + }, + { + "epoch": 0.07029688838138738, + "grad_norm": 1.5144743919372559, + "learning_rate": 4.939292418428933e-05, + "loss": 5.094, + "step": 11820 + }, + { + "epoch": 0.07030283566466838, + "grad_norm": 1.5426355600357056, + "learning_rate": 4.939282186903082e-05, + "loss": 5.4808, + "step": 11821 + }, + { + "epoch": 0.07030878294794937, + "grad_norm": 1.5655393600463867, + "learning_rate": 4.9392719545257034e-05, + "loss": 5.5422, + "step": 11822 + }, + { + "epoch": 0.07031473023123037, + "grad_norm": 1.2810043096542358, + "learning_rate": 4.9392617212967995e-05, + "loss": 5.5069, + "step": 11823 + }, + { + "epoch": 0.07032067751451138, + "grad_norm": 1.534588098526001, + "learning_rate": 4.9392514872163754e-05, + "loss": 5.4887, + "step": 11824 + }, + { + "epoch": 0.07032662479779236, + "grad_norm": 1.6692357063293457, + "learning_rate": 4.9392412522844325e-05, + "loss": 5.4235, + "step": 11825 + }, + { + "epoch": 0.07033257208107337, + "grad_norm": 2.1246654987335205, + "learning_rate": 4.939231016500977e-05, + "loss": 5.4533, + "step": 11826 + }, + { + "epoch": 0.07033851936435437, + "grad_norm": 2.0235774517059326, + "learning_rate": 4.9392207798660106e-05, + "loss": 5.0393, + "step": 11827 + }, + { + "epoch": 0.07034446664763536, + "grad_norm": 1.7843154668807983, + "learning_rate": 4.939210542379537e-05, + "loss": 5.2501, + "step": 11828 + }, + { + "epoch": 0.07035041393091636, + "grad_norm": 2.1056478023529053, + "learning_rate": 4.939200304041561e-05, + "loss": 5.7809, + "step": 11829 + }, + { + "epoch": 0.07035636121419736, + "grad_norm": 2.0902159214019775, + "learning_rate": 4.939190064852085e-05, + "loss": 5.591, + "step": 11830 + }, + { + "epoch": 0.07036230849747835, + "grad_norm": 2.3349802494049072, + "learning_rate": 4.9391798248111134e-05, + "loss": 4.7641, + "step": 11831 + }, + { + "epoch": 0.07036825578075935, + "grad_norm": 1.6848636865615845, + "learning_rate": 4.939169583918648e-05, + "loss": 5.5082, + "step": 11832 + }, + { + "epoch": 0.07037420306404035, + "grad_norm": 1.958947777748108, + "learning_rate": 4.939159342174695e-05, + "loss": 5.433, + "step": 11833 + }, + { + "epoch": 0.07038015034732134, + "grad_norm": 1.7382566928863525, + "learning_rate": 4.939149099579256e-05, + "loss": 5.5014, + "step": 11834 + }, + { + "epoch": 0.07038609763060234, + "grad_norm": 2.469529867172241, + "learning_rate": 4.939138856132336e-05, + "loss": 4.6383, + "step": 11835 + }, + { + "epoch": 0.07039204491388334, + "grad_norm": 2.127711057662964, + "learning_rate": 4.939128611833937e-05, + "loss": 5.6088, + "step": 11836 + }, + { + "epoch": 0.07039799219716433, + "grad_norm": 2.252210855484009, + "learning_rate": 4.9391183666840636e-05, + "loss": 5.027, + "step": 11837 + }, + { + "epoch": 0.07040393948044534, + "grad_norm": 1.990277647972107, + "learning_rate": 4.9391081206827194e-05, + "loss": 5.6389, + "step": 11838 + }, + { + "epoch": 0.07040988676372632, + "grad_norm": 2.170099973678589, + "learning_rate": 4.939097873829908e-05, + "loss": 5.5588, + "step": 11839 + }, + { + "epoch": 0.07041583404700733, + "grad_norm": 2.4616951942443848, + "learning_rate": 4.939087626125632e-05, + "loss": 5.6505, + "step": 11840 + }, + { + "epoch": 0.07042178133028833, + "grad_norm": 1.9600075483322144, + "learning_rate": 4.9390773775698964e-05, + "loss": 5.1086, + "step": 11841 + }, + { + "epoch": 0.07042772861356932, + "grad_norm": 2.173632860183716, + "learning_rate": 4.939067128162703e-05, + "loss": 5.8069, + "step": 11842 + }, + { + "epoch": 0.07043367589685032, + "grad_norm": 1.9921432733535767, + "learning_rate": 4.939056877904058e-05, + "loss": 5.3222, + "step": 11843 + }, + { + "epoch": 0.07043962318013132, + "grad_norm": 2.1605379581451416, + "learning_rate": 4.939046626793962e-05, + "loss": 5.1565, + "step": 11844 + }, + { + "epoch": 0.07044557046341231, + "grad_norm": 2.2240231037139893, + "learning_rate": 4.9390363748324206e-05, + "loss": 5.3633, + "step": 11845 + }, + { + "epoch": 0.07045151774669331, + "grad_norm": 2.1935648918151855, + "learning_rate": 4.9390261220194374e-05, + "loss": 5.3715, + "step": 11846 + }, + { + "epoch": 0.07045746502997431, + "grad_norm": 2.3079628944396973, + "learning_rate": 4.9390158683550146e-05, + "loss": 5.4728, + "step": 11847 + }, + { + "epoch": 0.0704634123132553, + "grad_norm": 2.1652259826660156, + "learning_rate": 4.939005613839157e-05, + "loss": 5.276, + "step": 11848 + }, + { + "epoch": 0.0704693595965363, + "grad_norm": 1.75044846534729, + "learning_rate": 4.938995358471867e-05, + "loss": 5.3, + "step": 11849 + }, + { + "epoch": 0.0704753068798173, + "grad_norm": 2.11893892288208, + "learning_rate": 4.93898510225315e-05, + "loss": 5.3949, + "step": 11850 + }, + { + "epoch": 0.07048125416309829, + "grad_norm": 1.8546398878097534, + "learning_rate": 4.938974845183008e-05, + "loss": 5.3606, + "step": 11851 + }, + { + "epoch": 0.0704872014463793, + "grad_norm": 2.2334201335906982, + "learning_rate": 4.9389645872614456e-05, + "loss": 5.1987, + "step": 11852 + }, + { + "epoch": 0.0704931487296603, + "grad_norm": 2.0545856952667236, + "learning_rate": 4.938954328488465e-05, + "loss": 5.2742, + "step": 11853 + }, + { + "epoch": 0.07049909601294128, + "grad_norm": 2.011322498321533, + "learning_rate": 4.938944068864071e-05, + "loss": 5.3738, + "step": 11854 + }, + { + "epoch": 0.07050504329622229, + "grad_norm": 1.6539164781570435, + "learning_rate": 4.9389338083882664e-05, + "loss": 5.1915, + "step": 11855 + }, + { + "epoch": 0.07051099057950329, + "grad_norm": 1.9423818588256836, + "learning_rate": 4.9389235470610564e-05, + "loss": 5.4432, + "step": 11856 + }, + { + "epoch": 0.07051693786278428, + "grad_norm": 1.9459011554718018, + "learning_rate": 4.938913284882442e-05, + "loss": 5.2929, + "step": 11857 + }, + { + "epoch": 0.07052288514606528, + "grad_norm": 2.0341713428497314, + "learning_rate": 4.938903021852429e-05, + "loss": 5.1413, + "step": 11858 + }, + { + "epoch": 0.07052883242934628, + "grad_norm": 2.1413371562957764, + "learning_rate": 4.93889275797102e-05, + "loss": 5.0283, + "step": 11859 + }, + { + "epoch": 0.07053477971262727, + "grad_norm": 1.9965273141860962, + "learning_rate": 4.9388824932382185e-05, + "loss": 5.0919, + "step": 11860 + }, + { + "epoch": 0.07054072699590827, + "grad_norm": 1.9912536144256592, + "learning_rate": 4.938872227654028e-05, + "loss": 4.72, + "step": 11861 + }, + { + "epoch": 0.07054667427918927, + "grad_norm": 2.267775058746338, + "learning_rate": 4.9388619612184533e-05, + "loss": 5.3942, + "step": 11862 + }, + { + "epoch": 0.07055262156247026, + "grad_norm": 2.0529544353485107, + "learning_rate": 4.9388516939314965e-05, + "loss": 5.504, + "step": 11863 + }, + { + "epoch": 0.07055856884575126, + "grad_norm": 2.124903678894043, + "learning_rate": 4.938841425793162e-05, + "loss": 5.3684, + "step": 11864 + }, + { + "epoch": 0.07056451612903226, + "grad_norm": 2.2070152759552, + "learning_rate": 4.938831156803453e-05, + "loss": 5.1349, + "step": 11865 + }, + { + "epoch": 0.07057046341231325, + "grad_norm": 1.717877745628357, + "learning_rate": 4.9388208869623734e-05, + "loss": 5.2605, + "step": 11866 + }, + { + "epoch": 0.07057641069559425, + "grad_norm": 2.258847951889038, + "learning_rate": 4.9388106162699266e-05, + "loss": 4.9048, + "step": 11867 + }, + { + "epoch": 0.07058235797887524, + "grad_norm": 2.065905809402466, + "learning_rate": 4.938800344726117e-05, + "loss": 5.0523, + "step": 11868 + }, + { + "epoch": 0.07058830526215625, + "grad_norm": 2.13053035736084, + "learning_rate": 4.9387900723309455e-05, + "loss": 5.1551, + "step": 11869 + }, + { + "epoch": 0.07059425254543725, + "grad_norm": 2.0323257446289062, + "learning_rate": 4.938779799084419e-05, + "loss": 5.0807, + "step": 11870 + }, + { + "epoch": 0.07060019982871824, + "grad_norm": 2.0503158569335938, + "learning_rate": 4.9387695249865396e-05, + "loss": 5.1946, + "step": 11871 + }, + { + "epoch": 0.07060614711199924, + "grad_norm": 2.069227933883667, + "learning_rate": 4.9387592500373105e-05, + "loss": 5.0027, + "step": 11872 + }, + { + "epoch": 0.07061209439528024, + "grad_norm": 2.0208382606506348, + "learning_rate": 4.9387489742367354e-05, + "loss": 5.0877, + "step": 11873 + }, + { + "epoch": 0.07061804167856123, + "grad_norm": 2.0159859657287598, + "learning_rate": 4.9387386975848196e-05, + "loss": 4.864, + "step": 11874 + }, + { + "epoch": 0.07062398896184223, + "grad_norm": 1.9365311861038208, + "learning_rate": 4.9387284200815645e-05, + "loss": 4.7373, + "step": 11875 + }, + { + "epoch": 0.07062993624512323, + "grad_norm": 2.1024274826049805, + "learning_rate": 4.9387181417269736e-05, + "loss": 5.0155, + "step": 11876 + }, + { + "epoch": 0.07063588352840422, + "grad_norm": 2.5438032150268555, + "learning_rate": 4.938707862521052e-05, + "loss": 5.3267, + "step": 11877 + }, + { + "epoch": 0.07064183081168522, + "grad_norm": 2.129715919494629, + "learning_rate": 4.938697582463804e-05, + "loss": 5.104, + "step": 11878 + }, + { + "epoch": 0.07064777809496622, + "grad_norm": 2.237442970275879, + "learning_rate": 4.9386873015552303e-05, + "loss": 5.134, + "step": 11879 + }, + { + "epoch": 0.07065372537824721, + "grad_norm": 2.2773404121398926, + "learning_rate": 4.9386770197953366e-05, + "loss": 5.269, + "step": 11880 + }, + { + "epoch": 0.07065967266152821, + "grad_norm": 2.0882620811462402, + "learning_rate": 4.938666737184125e-05, + "loss": 4.8091, + "step": 11881 + }, + { + "epoch": 0.07066561994480922, + "grad_norm": 2.0649476051330566, + "learning_rate": 4.938656453721602e-05, + "loss": 4.9143, + "step": 11882 + }, + { + "epoch": 0.0706715672280902, + "grad_norm": 2.19030499458313, + "learning_rate": 4.938646169407768e-05, + "loss": 4.7439, + "step": 11883 + }, + { + "epoch": 0.0706775145113712, + "grad_norm": 2.8669347763061523, + "learning_rate": 4.938635884242628e-05, + "loss": 4.3684, + "step": 11884 + }, + { + "epoch": 0.07068346179465221, + "grad_norm": 2.3018336296081543, + "learning_rate": 4.9386255982261854e-05, + "loss": 4.8602, + "step": 11885 + }, + { + "epoch": 0.0706894090779332, + "grad_norm": 2.7775471210479736, + "learning_rate": 4.938615311358443e-05, + "loss": 5.2401, + "step": 11886 + }, + { + "epoch": 0.0706953563612142, + "grad_norm": 2.1075756549835205, + "learning_rate": 4.938605023639406e-05, + "loss": 5.1085, + "step": 11887 + }, + { + "epoch": 0.0707013036444952, + "grad_norm": 2.456530809402466, + "learning_rate": 4.9385947350690776e-05, + "loss": 5.0506, + "step": 11888 + }, + { + "epoch": 0.07070725092777619, + "grad_norm": 1.76799738407135, + "learning_rate": 4.9385844456474605e-05, + "loss": 4.8233, + "step": 11889 + }, + { + "epoch": 0.07071319821105719, + "grad_norm": 2.0819127559661865, + "learning_rate": 4.938574155374559e-05, + "loss": 4.4198, + "step": 11890 + }, + { + "epoch": 0.07071914549433819, + "grad_norm": 2.221586227416992, + "learning_rate": 4.9385638642503765e-05, + "loss": 4.2423, + "step": 11891 + }, + { + "epoch": 0.07072509277761918, + "grad_norm": 2.108182668685913, + "learning_rate": 4.938553572274916e-05, + "loss": 4.2564, + "step": 11892 + }, + { + "epoch": 0.07073104006090018, + "grad_norm": 1.9631624221801758, + "learning_rate": 4.938543279448182e-05, + "loss": 4.1641, + "step": 11893 + }, + { + "epoch": 0.07073698734418118, + "grad_norm": 1.9730273485183716, + "learning_rate": 4.938532985770178e-05, + "loss": 4.0728, + "step": 11894 + }, + { + "epoch": 0.07074293462746217, + "grad_norm": 1.9632551670074463, + "learning_rate": 4.9385226912409065e-05, + "loss": 4.2014, + "step": 11895 + }, + { + "epoch": 0.07074888191074317, + "grad_norm": 1.9986671209335327, + "learning_rate": 4.9385123958603726e-05, + "loss": 4.0299, + "step": 11896 + }, + { + "epoch": 0.07075482919402416, + "grad_norm": 2.2256031036376953, + "learning_rate": 4.9385020996285794e-05, + "loss": 4.1397, + "step": 11897 + }, + { + "epoch": 0.07076077647730517, + "grad_norm": 2.231462001800537, + "learning_rate": 4.9384918025455296e-05, + "loss": 4.0977, + "step": 11898 + }, + { + "epoch": 0.07076672376058617, + "grad_norm": 2.0946438312530518, + "learning_rate": 4.938481504611227e-05, + "loss": 3.9446, + "step": 11899 + }, + { + "epoch": 0.07077267104386716, + "grad_norm": 1.6953986883163452, + "learning_rate": 4.938471205825677e-05, + "loss": 4.6809, + "step": 11900 + }, + { + "epoch": 0.07077861832714816, + "grad_norm": 2.1963350772857666, + "learning_rate": 4.938460906188882e-05, + "loss": 4.3626, + "step": 11901 + }, + { + "epoch": 0.07078456561042916, + "grad_norm": 2.2069251537323, + "learning_rate": 4.938450605700845e-05, + "loss": 4.1057, + "step": 11902 + }, + { + "epoch": 0.07079051289371015, + "grad_norm": 2.1809592247009277, + "learning_rate": 4.9384403043615694e-05, + "loss": 3.5619, + "step": 11903 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 2.305171012878418, + "learning_rate": 4.938430002171061e-05, + "loss": 5.8033, + "step": 11904 + }, + { + "epoch": 0.07080240746027215, + "grad_norm": 2.1984407901763916, + "learning_rate": 4.9384196991293205e-05, + "loss": 3.5869, + "step": 11905 + }, + { + "epoch": 0.07080835474355314, + "grad_norm": 1.8870881795883179, + "learning_rate": 4.938409395236353e-05, + "loss": 4.8027, + "step": 11906 + }, + { + "epoch": 0.07081430202683414, + "grad_norm": 2.11314058303833, + "learning_rate": 4.938399090492163e-05, + "loss": 4.1942, + "step": 11907 + }, + { + "epoch": 0.07082024931011514, + "grad_norm": 2.143794298171997, + "learning_rate": 4.938388784896752e-05, + "loss": 3.8526, + "step": 11908 + }, + { + "epoch": 0.07082619659339613, + "grad_norm": 2.4311232566833496, + "learning_rate": 4.938378478450125e-05, + "loss": 3.8572, + "step": 11909 + }, + { + "epoch": 0.07083214387667713, + "grad_norm": 2.0959818363189697, + "learning_rate": 4.9383681711522855e-05, + "loss": 4.3465, + "step": 11910 + }, + { + "epoch": 0.07083809115995814, + "grad_norm": 1.9161559343338013, + "learning_rate": 4.938357863003237e-05, + "loss": 5.5608, + "step": 11911 + }, + { + "epoch": 0.07084403844323912, + "grad_norm": 1.8549482822418213, + "learning_rate": 4.9383475540029824e-05, + "loss": 5.9874, + "step": 11912 + }, + { + "epoch": 0.07084998572652013, + "grad_norm": 1.8600444793701172, + "learning_rate": 4.9383372441515255e-05, + "loss": 6.0579, + "step": 11913 + }, + { + "epoch": 0.07085593300980113, + "grad_norm": 1.6985594034194946, + "learning_rate": 4.938326933448871e-05, + "loss": 5.7963, + "step": 11914 + }, + { + "epoch": 0.07086188029308212, + "grad_norm": 2.06860613822937, + "learning_rate": 4.9383166218950216e-05, + "loss": 5.4789, + "step": 11915 + }, + { + "epoch": 0.07086782757636312, + "grad_norm": 2.8111190795898438, + "learning_rate": 4.938306309489982e-05, + "loss": 5.2546, + "step": 11916 + }, + { + "epoch": 0.07087377485964412, + "grad_norm": 2.700589895248413, + "learning_rate": 4.9382959962337536e-05, + "loss": 5.2021, + "step": 11917 + }, + { + "epoch": 0.07087972214292511, + "grad_norm": 2.364793539047241, + "learning_rate": 4.938285682126341e-05, + "loss": 4.9508, + "step": 11918 + }, + { + "epoch": 0.07088566942620611, + "grad_norm": 2.4212446212768555, + "learning_rate": 4.938275367167749e-05, + "loss": 5.1269, + "step": 11919 + }, + { + "epoch": 0.07089161670948711, + "grad_norm": 1.785733699798584, + "learning_rate": 4.93826505135798e-05, + "loss": 5.7357, + "step": 11920 + }, + { + "epoch": 0.0708975639927681, + "grad_norm": 1.6912823915481567, + "learning_rate": 4.9382547346970376e-05, + "loss": 5.4003, + "step": 11921 + }, + { + "epoch": 0.0709035112760491, + "grad_norm": 1.8408714532852173, + "learning_rate": 4.938244417184926e-05, + "loss": 5.3169, + "step": 11922 + }, + { + "epoch": 0.0709094585593301, + "grad_norm": 2.3245468139648438, + "learning_rate": 4.938234098821648e-05, + "loss": 4.9588, + "step": 11923 + }, + { + "epoch": 0.07091540584261109, + "grad_norm": 1.922179102897644, + "learning_rate": 4.938223779607208e-05, + "loss": 5.431, + "step": 11924 + }, + { + "epoch": 0.0709213531258921, + "grad_norm": 1.8331208229064941, + "learning_rate": 4.9382134595416094e-05, + "loss": 5.9121, + "step": 11925 + }, + { + "epoch": 0.07092730040917308, + "grad_norm": 2.15932297706604, + "learning_rate": 4.9382031386248556e-05, + "loss": 5.058, + "step": 11926 + }, + { + "epoch": 0.07093324769245409, + "grad_norm": 2.2255606651306152, + "learning_rate": 4.93819281685695e-05, + "loss": 4.9215, + "step": 11927 + }, + { + "epoch": 0.07093919497573509, + "grad_norm": 2.3665359020233154, + "learning_rate": 4.938182494237897e-05, + "loss": 4.8405, + "step": 11928 + }, + { + "epoch": 0.07094514225901608, + "grad_norm": 2.1564438343048096, + "learning_rate": 4.938172170767699e-05, + "loss": 4.9598, + "step": 11929 + }, + { + "epoch": 0.07095108954229708, + "grad_norm": 2.2083945274353027, + "learning_rate": 4.938161846446361e-05, + "loss": 4.8603, + "step": 11930 + }, + { + "epoch": 0.07095703682557808, + "grad_norm": 2.3422255516052246, + "learning_rate": 4.938151521273885e-05, + "loss": 4.8926, + "step": 11931 + }, + { + "epoch": 0.07096298410885907, + "grad_norm": 2.5269415378570557, + "learning_rate": 4.9381411952502764e-05, + "loss": 4.876, + "step": 11932 + }, + { + "epoch": 0.07096893139214007, + "grad_norm": 2.1761882305145264, + "learning_rate": 4.9381308683755376e-05, + "loss": 4.7533, + "step": 11933 + }, + { + "epoch": 0.07097487867542107, + "grad_norm": 2.078146457672119, + "learning_rate": 4.938120540649672e-05, + "loss": 4.9606, + "step": 11934 + }, + { + "epoch": 0.07098082595870206, + "grad_norm": 2.3086254596710205, + "learning_rate": 4.9381102120726846e-05, + "loss": 4.7763, + "step": 11935 + }, + { + "epoch": 0.07098677324198306, + "grad_norm": 1.8531124591827393, + "learning_rate": 4.938099882644578e-05, + "loss": 5.0218, + "step": 11936 + }, + { + "epoch": 0.07099272052526406, + "grad_norm": 2.2169790267944336, + "learning_rate": 4.938089552365355e-05, + "loss": 6.0072, + "step": 11937 + }, + { + "epoch": 0.07099866780854505, + "grad_norm": 1.8759880065917969, + "learning_rate": 4.938079221235021e-05, + "loss": 5.8259, + "step": 11938 + }, + { + "epoch": 0.07100461509182605, + "grad_norm": 2.026217222213745, + "learning_rate": 4.938068889253579e-05, + "loss": 5.4426, + "step": 11939 + }, + { + "epoch": 0.07101056237510706, + "grad_norm": 2.5047786235809326, + "learning_rate": 4.938058556421031e-05, + "loss": 4.7276, + "step": 11940 + }, + { + "epoch": 0.07101650965838804, + "grad_norm": 2.243281602859497, + "learning_rate": 4.938048222737383e-05, + "loss": 4.9284, + "step": 11941 + }, + { + "epoch": 0.07102245694166905, + "grad_norm": 1.989563226699829, + "learning_rate": 4.938037888202637e-05, + "loss": 5.7744, + "step": 11942 + }, + { + "epoch": 0.07102840422495005, + "grad_norm": 1.829290509223938, + "learning_rate": 4.9380275528167974e-05, + "loss": 5.6942, + "step": 11943 + }, + { + "epoch": 0.07103435150823104, + "grad_norm": 1.8001593351364136, + "learning_rate": 4.938017216579868e-05, + "loss": 5.6928, + "step": 11944 + }, + { + "epoch": 0.07104029879151204, + "grad_norm": 1.7705434560775757, + "learning_rate": 4.938006879491851e-05, + "loss": 5.6954, + "step": 11945 + }, + { + "epoch": 0.07104624607479304, + "grad_norm": 1.8746812343597412, + "learning_rate": 4.937996541552752e-05, + "loss": 5.7184, + "step": 11946 + }, + { + "epoch": 0.07105219335807403, + "grad_norm": 1.6931661367416382, + "learning_rate": 4.937986202762573e-05, + "loss": 5.398, + "step": 11947 + }, + { + "epoch": 0.07105814064135503, + "grad_norm": 2.0784003734588623, + "learning_rate": 4.937975863121318e-05, + "loss": 5.7164, + "step": 11948 + }, + { + "epoch": 0.07106408792463603, + "grad_norm": 1.8495618104934692, + "learning_rate": 4.937965522628991e-05, + "loss": 5.7093, + "step": 11949 + }, + { + "epoch": 0.07107003520791702, + "grad_norm": 1.7720533609390259, + "learning_rate": 4.9379551812855964e-05, + "loss": 5.7548, + "step": 11950 + }, + { + "epoch": 0.07107598249119802, + "grad_norm": 1.721205472946167, + "learning_rate": 4.937944839091135e-05, + "loss": 5.7496, + "step": 11951 + }, + { + "epoch": 0.07108192977447902, + "grad_norm": 1.896657109260559, + "learning_rate": 4.9379344960456145e-05, + "loss": 5.5989, + "step": 11952 + }, + { + "epoch": 0.07108787705776001, + "grad_norm": 1.4022153615951538, + "learning_rate": 4.9379241521490344e-05, + "loss": 5.5029, + "step": 11953 + }, + { + "epoch": 0.07109382434104101, + "grad_norm": 1.9068467617034912, + "learning_rate": 4.937913807401401e-05, + "loss": 5.6915, + "step": 11954 + }, + { + "epoch": 0.071099771624322, + "grad_norm": 1.6542187929153442, + "learning_rate": 4.9379034618027164e-05, + "loss": 5.6409, + "step": 11955 + }, + { + "epoch": 0.071105718907603, + "grad_norm": 1.5280201435089111, + "learning_rate": 4.937893115352986e-05, + "loss": 5.6264, + "step": 11956 + }, + { + "epoch": 0.07111166619088401, + "grad_norm": 1.767232060432434, + "learning_rate": 4.937882768052211e-05, + "loss": 5.4562, + "step": 11957 + }, + { + "epoch": 0.071117613474165, + "grad_norm": 1.571892261505127, + "learning_rate": 4.9378724199003975e-05, + "loss": 5.7949, + "step": 11958 + }, + { + "epoch": 0.071123560757446, + "grad_norm": 1.9400190114974976, + "learning_rate": 4.937862070897548e-05, + "loss": 5.5872, + "step": 11959 + }, + { + "epoch": 0.071129508040727, + "grad_norm": 1.7246766090393066, + "learning_rate": 4.937851721043665e-05, + "loss": 5.8455, + "step": 11960 + }, + { + "epoch": 0.07113545532400799, + "grad_norm": 1.937168002128601, + "learning_rate": 4.9378413703387534e-05, + "loss": 5.0864, + "step": 11961 + }, + { + "epoch": 0.07114140260728899, + "grad_norm": 2.3808209896087646, + "learning_rate": 4.937831018782817e-05, + "loss": 4.5918, + "step": 11962 + }, + { + "epoch": 0.07114734989056999, + "grad_norm": 2.567026138305664, + "learning_rate": 4.937820666375859e-05, + "loss": 4.7375, + "step": 11963 + }, + { + "epoch": 0.07115329717385098, + "grad_norm": 1.8941316604614258, + "learning_rate": 4.937810313117882e-05, + "loss": 5.811, + "step": 11964 + }, + { + "epoch": 0.07115924445713198, + "grad_norm": 1.9301189184188843, + "learning_rate": 4.9377999590088916e-05, + "loss": 5.7947, + "step": 11965 + }, + { + "epoch": 0.07116519174041298, + "grad_norm": 2.281784772872925, + "learning_rate": 4.93778960404889e-05, + "loss": 5.5993, + "step": 11966 + }, + { + "epoch": 0.07117113902369397, + "grad_norm": 1.7826297283172607, + "learning_rate": 4.937779248237882e-05, + "loss": 6.1836, + "step": 11967 + }, + { + "epoch": 0.07117708630697497, + "grad_norm": 2.8714182376861572, + "learning_rate": 4.9377688915758694e-05, + "loss": 5.3955, + "step": 11968 + }, + { + "epoch": 0.07118303359025598, + "grad_norm": 2.3284013271331787, + "learning_rate": 4.937758534062857e-05, + "loss": 5.3027, + "step": 11969 + }, + { + "epoch": 0.07118898087353696, + "grad_norm": 1.8880923986434937, + "learning_rate": 4.937748175698849e-05, + "loss": 5.8408, + "step": 11970 + }, + { + "epoch": 0.07119492815681797, + "grad_norm": 2.8952460289001465, + "learning_rate": 4.937737816483847e-05, + "loss": 4.7325, + "step": 11971 + }, + { + "epoch": 0.07120087544009897, + "grad_norm": 2.5028738975524902, + "learning_rate": 4.9377274564178574e-05, + "loss": 4.5854, + "step": 11972 + }, + { + "epoch": 0.07120682272337996, + "grad_norm": 1.8834285736083984, + "learning_rate": 4.9377170955008815e-05, + "loss": 5.5415, + "step": 11973 + }, + { + "epoch": 0.07121277000666096, + "grad_norm": 2.162062644958496, + "learning_rate": 4.937706733732924e-05, + "loss": 5.2187, + "step": 11974 + }, + { + "epoch": 0.07121871728994196, + "grad_norm": 2.1506881713867188, + "learning_rate": 4.937696371113988e-05, + "loss": 5.1746, + "step": 11975 + }, + { + "epoch": 0.07122466457322295, + "grad_norm": 2.0309176445007324, + "learning_rate": 4.937686007644078e-05, + "loss": 5.1708, + "step": 11976 + }, + { + "epoch": 0.07123061185650395, + "grad_norm": 2.251579523086548, + "learning_rate": 4.9376756433231966e-05, + "loss": 6.0623, + "step": 11977 + }, + { + "epoch": 0.07123655913978495, + "grad_norm": 2.161918878555298, + "learning_rate": 4.937665278151348e-05, + "loss": 6.2297, + "step": 11978 + }, + { + "epoch": 0.07124250642306594, + "grad_norm": 1.703783631324768, + "learning_rate": 4.937654912128535e-05, + "loss": 5.9388, + "step": 11979 + }, + { + "epoch": 0.07124845370634694, + "grad_norm": 1.7420361042022705, + "learning_rate": 4.937644545254763e-05, + "loss": 5.5426, + "step": 11980 + }, + { + "epoch": 0.07125440098962794, + "grad_norm": 1.8634297847747803, + "learning_rate": 4.937634177530033e-05, + "loss": 5.8412, + "step": 11981 + }, + { + "epoch": 0.07126034827290893, + "grad_norm": 1.8084121942520142, + "learning_rate": 4.937623808954351e-05, + "loss": 6.266, + "step": 11982 + }, + { + "epoch": 0.07126629555618993, + "grad_norm": 1.5925266742706299, + "learning_rate": 4.93761343952772e-05, + "loss": 5.7173, + "step": 11983 + }, + { + "epoch": 0.07127224283947092, + "grad_norm": 1.7778257131576538, + "learning_rate": 4.937603069250143e-05, + "loss": 5.8119, + "step": 11984 + }, + { + "epoch": 0.07127819012275192, + "grad_norm": 1.6839842796325684, + "learning_rate": 4.9375926981216235e-05, + "loss": 5.9446, + "step": 11985 + }, + { + "epoch": 0.07128413740603293, + "grad_norm": 1.7892810106277466, + "learning_rate": 4.937582326142166e-05, + "loss": 5.9564, + "step": 11986 + }, + { + "epoch": 0.07129008468931392, + "grad_norm": 1.7179774045944214, + "learning_rate": 4.9375719533117734e-05, + "loss": 6.1969, + "step": 11987 + }, + { + "epoch": 0.07129603197259492, + "grad_norm": 1.3788355588912964, + "learning_rate": 4.93756157963045e-05, + "loss": 6.0409, + "step": 11988 + }, + { + "epoch": 0.07130197925587592, + "grad_norm": 1.6451042890548706, + "learning_rate": 4.9375512050981986e-05, + "loss": 5.8116, + "step": 11989 + }, + { + "epoch": 0.07130792653915691, + "grad_norm": 1.8904451131820679, + "learning_rate": 4.937540829715024e-05, + "loss": 5.7952, + "step": 11990 + }, + { + "epoch": 0.07131387382243791, + "grad_norm": 1.4976747035980225, + "learning_rate": 4.9375304534809284e-05, + "loss": 5.7092, + "step": 11991 + }, + { + "epoch": 0.07131982110571891, + "grad_norm": 1.5585631132125854, + "learning_rate": 4.937520076395916e-05, + "loss": 6.0693, + "step": 11992 + }, + { + "epoch": 0.0713257683889999, + "grad_norm": 1.8329144716262817, + "learning_rate": 4.937509698459991e-05, + "loss": 5.5883, + "step": 11993 + }, + { + "epoch": 0.0713317156722809, + "grad_norm": 2.6030189990997314, + "learning_rate": 4.937499319673157e-05, + "loss": 5.1776, + "step": 11994 + }, + { + "epoch": 0.0713376629555619, + "grad_norm": 1.744042992591858, + "learning_rate": 4.9374889400354165e-05, + "loss": 5.4105, + "step": 11995 + }, + { + "epoch": 0.07134361023884289, + "grad_norm": 1.819018006324768, + "learning_rate": 4.937478559546774e-05, + "loss": 5.5695, + "step": 11996 + }, + { + "epoch": 0.0713495575221239, + "grad_norm": 1.754894733428955, + "learning_rate": 4.9374681782072325e-05, + "loss": 5.7519, + "step": 11997 + }, + { + "epoch": 0.0713555048054049, + "grad_norm": 2.132507085800171, + "learning_rate": 4.9374577960167964e-05, + "loss": 4.9783, + "step": 11998 + }, + { + "epoch": 0.07136145208868588, + "grad_norm": 2.0926709175109863, + "learning_rate": 4.937447412975469e-05, + "loss": 4.905, + "step": 11999 + }, + { + "epoch": 0.07136739937196689, + "grad_norm": 2.1235594749450684, + "learning_rate": 4.937437029083254e-05, + "loss": 4.7978, + "step": 12000 + }, + { + "epoch": 0.07137334665524789, + "grad_norm": 2.217911720275879, + "learning_rate": 4.937426644340154e-05, + "loss": 4.9506, + "step": 12001 + }, + { + "epoch": 0.07137929393852888, + "grad_norm": 2.0362601280212402, + "learning_rate": 4.937416258746175e-05, + "loss": 5.0299, + "step": 12002 + }, + { + "epoch": 0.07138524122180988, + "grad_norm": 2.2846896648406982, + "learning_rate": 4.937405872301318e-05, + "loss": 5.0606, + "step": 12003 + }, + { + "epoch": 0.07139118850509088, + "grad_norm": 2.2545530796051025, + "learning_rate": 4.937395485005588e-05, + "loss": 4.8651, + "step": 12004 + }, + { + "epoch": 0.07139713578837187, + "grad_norm": 2.32738995552063, + "learning_rate": 4.937385096858989e-05, + "loss": 4.7908, + "step": 12005 + }, + { + "epoch": 0.07140308307165287, + "grad_norm": 2.239215850830078, + "learning_rate": 4.9373747078615235e-05, + "loss": 4.7545, + "step": 12006 + }, + { + "epoch": 0.07140903035493387, + "grad_norm": 2.4766969680786133, + "learning_rate": 4.937364318013196e-05, + "loss": 5.0795, + "step": 12007 + }, + { + "epoch": 0.07141497763821486, + "grad_norm": 2.602111577987671, + "learning_rate": 4.937353927314009e-05, + "loss": 4.6898, + "step": 12008 + }, + { + "epoch": 0.07142092492149586, + "grad_norm": 2.8508496284484863, + "learning_rate": 4.937343535763968e-05, + "loss": 4.3136, + "step": 12009 + }, + { + "epoch": 0.07142687220477686, + "grad_norm": 2.4613311290740967, + "learning_rate": 4.9373331433630754e-05, + "loss": 4.4826, + "step": 12010 + }, + { + "epoch": 0.07143281948805785, + "grad_norm": 2.561643362045288, + "learning_rate": 4.937322750111334e-05, + "loss": 4.251, + "step": 12011 + }, + { + "epoch": 0.07143876677133885, + "grad_norm": 2.397507667541504, + "learning_rate": 4.93731235600875e-05, + "loss": 4.3018, + "step": 12012 + }, + { + "epoch": 0.07144471405461984, + "grad_norm": 2.250120162963867, + "learning_rate": 4.937301961055324e-05, + "loss": 4.1796, + "step": 12013 + }, + { + "epoch": 0.07145066133790084, + "grad_norm": 2.337451934814453, + "learning_rate": 4.9372915652510615e-05, + "loss": 4.2362, + "step": 12014 + }, + { + "epoch": 0.07145660862118185, + "grad_norm": 2.357034921646118, + "learning_rate": 4.937281168595966e-05, + "loss": 4.0961, + "step": 12015 + }, + { + "epoch": 0.07146255590446284, + "grad_norm": 2.0843617916107178, + "learning_rate": 4.93727077109004e-05, + "loss": 4.4584, + "step": 12016 + }, + { + "epoch": 0.07146850318774384, + "grad_norm": 2.149707317352295, + "learning_rate": 4.937260372733289e-05, + "loss": 4.2248, + "step": 12017 + }, + { + "epoch": 0.07147445047102484, + "grad_norm": 2.149765729904175, + "learning_rate": 4.937249973525715e-05, + "loss": 4.154, + "step": 12018 + }, + { + "epoch": 0.07148039775430583, + "grad_norm": 2.1572682857513428, + "learning_rate": 4.937239573467323e-05, + "loss": 4.2345, + "step": 12019 + }, + { + "epoch": 0.07148634503758683, + "grad_norm": 2.246751070022583, + "learning_rate": 4.9372291725581145e-05, + "loss": 3.9739, + "step": 12020 + }, + { + "epoch": 0.07149229232086783, + "grad_norm": 2.2735042572021484, + "learning_rate": 4.9372187707980955e-05, + "loss": 4.0442, + "step": 12021 + }, + { + "epoch": 0.07149823960414882, + "grad_norm": 2.2270023822784424, + "learning_rate": 4.9372083681872684e-05, + "loss": 4.0374, + "step": 12022 + }, + { + "epoch": 0.07150418688742982, + "grad_norm": 2.2228193283081055, + "learning_rate": 4.937197964725637e-05, + "loss": 4.0503, + "step": 12023 + }, + { + "epoch": 0.07151013417071082, + "grad_norm": 2.2630691528320312, + "learning_rate": 4.9371875604132046e-05, + "loss": 4.0431, + "step": 12024 + }, + { + "epoch": 0.07151608145399181, + "grad_norm": 2.2461886405944824, + "learning_rate": 4.937177155249976e-05, + "loss": 4.1164, + "step": 12025 + }, + { + "epoch": 0.07152202873727281, + "grad_norm": 1.9476062059402466, + "learning_rate": 4.937166749235953e-05, + "loss": 4.317, + "step": 12026 + }, + { + "epoch": 0.07152797602055382, + "grad_norm": 2.33138370513916, + "learning_rate": 4.937156342371141e-05, + "loss": 4.1309, + "step": 12027 + }, + { + "epoch": 0.0715339233038348, + "grad_norm": 3.3887436389923096, + "learning_rate": 4.937145934655543e-05, + "loss": 5.1713, + "step": 12028 + }, + { + "epoch": 0.0715398705871158, + "grad_norm": 2.499302625656128, + "learning_rate": 4.937135526089162e-05, + "loss": 4.0553, + "step": 12029 + }, + { + "epoch": 0.07154581787039681, + "grad_norm": 2.4269003868103027, + "learning_rate": 4.937125116672002e-05, + "loss": 4.0425, + "step": 12030 + }, + { + "epoch": 0.0715517651536778, + "grad_norm": 2.1819067001342773, + "learning_rate": 4.937114706404067e-05, + "loss": 4.0591, + "step": 12031 + }, + { + "epoch": 0.0715577124369588, + "grad_norm": 1.8021305799484253, + "learning_rate": 4.937104295285361e-05, + "loss": 4.9171, + "step": 12032 + }, + { + "epoch": 0.0715636597202398, + "grad_norm": 2.1833691596984863, + "learning_rate": 4.937093883315887e-05, + "loss": 4.053, + "step": 12033 + }, + { + "epoch": 0.07156960700352079, + "grad_norm": 2.1684465408325195, + "learning_rate": 4.9370834704956484e-05, + "loss": 4.0692, + "step": 12034 + }, + { + "epoch": 0.07157555428680179, + "grad_norm": 2.1576929092407227, + "learning_rate": 4.937073056824649e-05, + "loss": 3.9958, + "step": 12035 + }, + { + "epoch": 0.07158150157008279, + "grad_norm": 1.5627915859222412, + "learning_rate": 4.9370626423028924e-05, + "loss": 5.3373, + "step": 12036 + }, + { + "epoch": 0.07158744885336378, + "grad_norm": 1.6166819334030151, + "learning_rate": 4.937052226930383e-05, + "loss": 5.801, + "step": 12037 + }, + { + "epoch": 0.07159339613664478, + "grad_norm": 1.4187299013137817, + "learning_rate": 4.937041810707124e-05, + "loss": 5.5937, + "step": 12038 + }, + { + "epoch": 0.07159934341992578, + "grad_norm": 1.5857088565826416, + "learning_rate": 4.937031393633118e-05, + "loss": 5.6268, + "step": 12039 + }, + { + "epoch": 0.07160529070320677, + "grad_norm": 1.5691097974777222, + "learning_rate": 4.93702097570837e-05, + "loss": 5.7414, + "step": 12040 + }, + { + "epoch": 0.07161123798648777, + "grad_norm": 1.4723674058914185, + "learning_rate": 4.9370105569328835e-05, + "loss": 5.4711, + "step": 12041 + }, + { + "epoch": 0.07161718526976876, + "grad_norm": 1.686745047569275, + "learning_rate": 4.937000137306661e-05, + "loss": 5.4302, + "step": 12042 + }, + { + "epoch": 0.07162313255304976, + "grad_norm": 1.7394465208053589, + "learning_rate": 4.936989716829707e-05, + "loss": 5.1609, + "step": 12043 + }, + { + "epoch": 0.07162907983633077, + "grad_norm": 1.4348796606063843, + "learning_rate": 4.9369792955020264e-05, + "loss": 5.2468, + "step": 12044 + }, + { + "epoch": 0.07163502711961175, + "grad_norm": 1.674187421798706, + "learning_rate": 4.93696887332362e-05, + "loss": 5.2451, + "step": 12045 + }, + { + "epoch": 0.07164097440289276, + "grad_norm": 1.6606419086456299, + "learning_rate": 4.9369584502944934e-05, + "loss": 5.2744, + "step": 12046 + }, + { + "epoch": 0.07164692168617376, + "grad_norm": 1.4020198583602905, + "learning_rate": 4.93694802641465e-05, + "loss": 5.2914, + "step": 12047 + }, + { + "epoch": 0.07165286896945475, + "grad_norm": 1.4234102964401245, + "learning_rate": 4.936937601684093e-05, + "loss": 5.2405, + "step": 12048 + }, + { + "epoch": 0.07165881625273575, + "grad_norm": 1.261983036994934, + "learning_rate": 4.936927176102827e-05, + "loss": 5.1532, + "step": 12049 + }, + { + "epoch": 0.07166476353601675, + "grad_norm": 1.3787094354629517, + "learning_rate": 4.9369167496708534e-05, + "loss": 5.2033, + "step": 12050 + }, + { + "epoch": 0.07167071081929774, + "grad_norm": 1.405142068862915, + "learning_rate": 4.9369063223881786e-05, + "loss": 5.0391, + "step": 12051 + }, + { + "epoch": 0.07167665810257874, + "grad_norm": 1.513554573059082, + "learning_rate": 4.936895894254804e-05, + "loss": 5.0236, + "step": 12052 + }, + { + "epoch": 0.07168260538585974, + "grad_norm": 1.4279611110687256, + "learning_rate": 4.9368854652707355e-05, + "loss": 5.1429, + "step": 12053 + }, + { + "epoch": 0.07168855266914073, + "grad_norm": 1.4320182800292969, + "learning_rate": 4.936875035435974e-05, + "loss": 5.0519, + "step": 12054 + }, + { + "epoch": 0.07169449995242173, + "grad_norm": 1.415925145149231, + "learning_rate": 4.936864604750526e-05, + "loss": 4.9904, + "step": 12055 + }, + { + "epoch": 0.07170044723570274, + "grad_norm": 1.403998851776123, + "learning_rate": 4.936854173214393e-05, + "loss": 4.8988, + "step": 12056 + }, + { + "epoch": 0.07170639451898372, + "grad_norm": 1.744532585144043, + "learning_rate": 4.936843740827579e-05, + "loss": 4.9661, + "step": 12057 + }, + { + "epoch": 0.07171234180226473, + "grad_norm": 1.4900517463684082, + "learning_rate": 4.9368333075900884e-05, + "loss": 5.1887, + "step": 12058 + }, + { + "epoch": 0.07171828908554573, + "grad_norm": 1.454063057899475, + "learning_rate": 4.936822873501925e-05, + "loss": 5.2801, + "step": 12059 + }, + { + "epoch": 0.07172423636882672, + "grad_norm": 1.5426071882247925, + "learning_rate": 4.936812438563092e-05, + "loss": 5.1987, + "step": 12060 + }, + { + "epoch": 0.07173018365210772, + "grad_norm": 1.7365894317626953, + "learning_rate": 4.936802002773592e-05, + "loss": 5.1933, + "step": 12061 + }, + { + "epoch": 0.07173613093538872, + "grad_norm": 1.5046216249465942, + "learning_rate": 4.9367915661334295e-05, + "loss": 5.1688, + "step": 12062 + }, + { + "epoch": 0.07174207821866971, + "grad_norm": 1.6715713739395142, + "learning_rate": 4.936781128642609e-05, + "loss": 5.3649, + "step": 12063 + }, + { + "epoch": 0.07174802550195071, + "grad_norm": 1.6386772394180298, + "learning_rate": 4.936770690301134e-05, + "loss": 5.4107, + "step": 12064 + }, + { + "epoch": 0.07175397278523171, + "grad_norm": 1.604153037071228, + "learning_rate": 4.936760251109006e-05, + "loss": 5.2952, + "step": 12065 + }, + { + "epoch": 0.0717599200685127, + "grad_norm": 1.7100228071212769, + "learning_rate": 4.9367498110662306e-05, + "loss": 5.202, + "step": 12066 + }, + { + "epoch": 0.0717658673517937, + "grad_norm": 1.4062007665634155, + "learning_rate": 4.9367393701728116e-05, + "loss": 5.2246, + "step": 12067 + }, + { + "epoch": 0.0717718146350747, + "grad_norm": 1.4552310705184937, + "learning_rate": 4.9367289284287514e-05, + "loss": 5.5919, + "step": 12068 + }, + { + "epoch": 0.07177776191835569, + "grad_norm": 1.5134438276290894, + "learning_rate": 4.9367184858340546e-05, + "loss": 5.3921, + "step": 12069 + }, + { + "epoch": 0.0717837092016367, + "grad_norm": 1.724139928817749, + "learning_rate": 4.9367080423887246e-05, + "loss": 5.6409, + "step": 12070 + }, + { + "epoch": 0.07178965648491768, + "grad_norm": 1.7401317358016968, + "learning_rate": 4.9366975980927655e-05, + "loss": 4.8093, + "step": 12071 + }, + { + "epoch": 0.07179560376819868, + "grad_norm": 2.3226993083953857, + "learning_rate": 4.93668715294618e-05, + "loss": 4.2685, + "step": 12072 + }, + { + "epoch": 0.07180155105147969, + "grad_norm": 2.200608730316162, + "learning_rate": 4.9366767069489715e-05, + "loss": 4.1155, + "step": 12073 + }, + { + "epoch": 0.07180749833476067, + "grad_norm": 2.381131649017334, + "learning_rate": 4.936666260101145e-05, + "loss": 3.9837, + "step": 12074 + }, + { + "epoch": 0.07181344561804168, + "grad_norm": 2.2567548751831055, + "learning_rate": 4.936655812402704e-05, + "loss": 4.0642, + "step": 12075 + }, + { + "epoch": 0.07181939290132268, + "grad_norm": 2.253011703491211, + "learning_rate": 4.9366453638536506e-05, + "loss": 4.0683, + "step": 12076 + }, + { + "epoch": 0.07182534018460367, + "grad_norm": 2.3459978103637695, + "learning_rate": 4.93663491445399e-05, + "loss": 4.0525, + "step": 12077 + }, + { + "epoch": 0.07183128746788467, + "grad_norm": 2.3964619636535645, + "learning_rate": 4.9366244642037254e-05, + "loss": 4.0198, + "step": 12078 + }, + { + "epoch": 0.07183723475116567, + "grad_norm": 2.392293930053711, + "learning_rate": 4.93661401310286e-05, + "loss": 3.7765, + "step": 12079 + }, + { + "epoch": 0.07184318203444666, + "grad_norm": 2.3027987480163574, + "learning_rate": 4.936603561151398e-05, + "loss": 4.0315, + "step": 12080 + }, + { + "epoch": 0.07184912931772766, + "grad_norm": 2.3942925930023193, + "learning_rate": 4.936593108349343e-05, + "loss": 4.1308, + "step": 12081 + }, + { + "epoch": 0.07185507660100866, + "grad_norm": 2.183898687362671, + "learning_rate": 4.9365826546966984e-05, + "loss": 4.0779, + "step": 12082 + }, + { + "epoch": 0.07186102388428965, + "grad_norm": 2.3463728427886963, + "learning_rate": 4.936572200193468e-05, + "loss": 4.0035, + "step": 12083 + }, + { + "epoch": 0.07186697116757065, + "grad_norm": 2.3459651470184326, + "learning_rate": 4.9365617448396556e-05, + "loss": 4.0577, + "step": 12084 + }, + { + "epoch": 0.07187291845085166, + "grad_norm": 2.169189691543579, + "learning_rate": 4.936551288635264e-05, + "loss": 4.2678, + "step": 12085 + }, + { + "epoch": 0.07187886573413264, + "grad_norm": 2.3313188552856445, + "learning_rate": 4.936540831580299e-05, + "loss": 4.9956, + "step": 12086 + }, + { + "epoch": 0.07188481301741365, + "grad_norm": 2.431053400039673, + "learning_rate": 4.936530373674761e-05, + "loss": 5.2317, + "step": 12087 + }, + { + "epoch": 0.07189076030069465, + "grad_norm": 1.8984981775283813, + "learning_rate": 4.936519914918656e-05, + "loss": 5.4541, + "step": 12088 + }, + { + "epoch": 0.07189670758397564, + "grad_norm": 1.8862982988357544, + "learning_rate": 4.9365094553119877e-05, + "loss": 5.6448, + "step": 12089 + }, + { + "epoch": 0.07190265486725664, + "grad_norm": 1.7802925109863281, + "learning_rate": 4.936498994854759e-05, + "loss": 5.3182, + "step": 12090 + }, + { + "epoch": 0.07190860215053764, + "grad_norm": 1.7578701972961426, + "learning_rate": 4.9364885335469734e-05, + "loss": 6.0188, + "step": 12091 + }, + { + "epoch": 0.07191454943381863, + "grad_norm": 1.6750003099441528, + "learning_rate": 4.9364780713886345e-05, + "loss": 6.0822, + "step": 12092 + }, + { + "epoch": 0.07192049671709963, + "grad_norm": 1.4945881366729736, + "learning_rate": 4.936467608379747e-05, + "loss": 6.0554, + "step": 12093 + }, + { + "epoch": 0.07192644400038063, + "grad_norm": 1.5508134365081787, + "learning_rate": 4.936457144520313e-05, + "loss": 5.9712, + "step": 12094 + }, + { + "epoch": 0.07193239128366162, + "grad_norm": 1.4133291244506836, + "learning_rate": 4.936446679810337e-05, + "loss": 5.9137, + "step": 12095 + }, + { + "epoch": 0.07193833856694262, + "grad_norm": 1.415930986404419, + "learning_rate": 4.936436214249823e-05, + "loss": 5.9957, + "step": 12096 + }, + { + "epoch": 0.07194428585022362, + "grad_norm": 1.682356595993042, + "learning_rate": 4.936425747838774e-05, + "loss": 6.2381, + "step": 12097 + }, + { + "epoch": 0.07195023313350461, + "grad_norm": 1.693535566329956, + "learning_rate": 4.9364152805771946e-05, + "loss": 6.0523, + "step": 12098 + }, + { + "epoch": 0.07195618041678561, + "grad_norm": 1.7577873468399048, + "learning_rate": 4.9364048124650875e-05, + "loss": 5.8243, + "step": 12099 + }, + { + "epoch": 0.0719621277000666, + "grad_norm": 1.6486074924468994, + "learning_rate": 4.936394343502457e-05, + "loss": 5.8072, + "step": 12100 + }, + { + "epoch": 0.0719680749833476, + "grad_norm": 1.5245120525360107, + "learning_rate": 4.936383873689306e-05, + "loss": 5.9013, + "step": 12101 + }, + { + "epoch": 0.0719740222666286, + "grad_norm": 1.4771286249160767, + "learning_rate": 4.936373403025638e-05, + "loss": 6.1314, + "step": 12102 + }, + { + "epoch": 0.0719799695499096, + "grad_norm": 1.7547197341918945, + "learning_rate": 4.936362931511458e-05, + "loss": 5.9725, + "step": 12103 + }, + { + "epoch": 0.0719859168331906, + "grad_norm": 1.9942286014556885, + "learning_rate": 4.936352459146769e-05, + "loss": 5.82, + "step": 12104 + }, + { + "epoch": 0.0719918641164716, + "grad_norm": 1.8367860317230225, + "learning_rate": 4.936341985931574e-05, + "loss": 5.8653, + "step": 12105 + }, + { + "epoch": 0.07199781139975259, + "grad_norm": 1.8277100324630737, + "learning_rate": 4.936331511865877e-05, + "loss": 5.6998, + "step": 12106 + }, + { + "epoch": 0.07200375868303359, + "grad_norm": 1.5308998823165894, + "learning_rate": 4.936321036949683e-05, + "loss": 5.822, + "step": 12107 + }, + { + "epoch": 0.07200970596631459, + "grad_norm": 1.7100377082824707, + "learning_rate": 4.936310561182993e-05, + "loss": 5.991, + "step": 12108 + }, + { + "epoch": 0.07201565324959558, + "grad_norm": 1.8563333749771118, + "learning_rate": 4.936300084565813e-05, + "loss": 5.8438, + "step": 12109 + }, + { + "epoch": 0.07202160053287658, + "grad_norm": 1.9967303276062012, + "learning_rate": 4.936289607098146e-05, + "loss": 5.6786, + "step": 12110 + }, + { + "epoch": 0.07202754781615758, + "grad_norm": 2.1997451782226562, + "learning_rate": 4.9362791287799945e-05, + "loss": 5.2983, + "step": 12111 + }, + { + "epoch": 0.07203349509943857, + "grad_norm": 2.144521713256836, + "learning_rate": 4.9362686496113644e-05, + "loss": 5.2942, + "step": 12112 + }, + { + "epoch": 0.07203944238271957, + "grad_norm": 2.0747883319854736, + "learning_rate": 4.936258169592257e-05, + "loss": 5.473, + "step": 12113 + }, + { + "epoch": 0.07204538966600058, + "grad_norm": 2.0386881828308105, + "learning_rate": 4.9362476887226776e-05, + "loss": 5.2557, + "step": 12114 + }, + { + "epoch": 0.07205133694928156, + "grad_norm": 2.190687894821167, + "learning_rate": 4.93623720700263e-05, + "loss": 5.3251, + "step": 12115 + }, + { + "epoch": 0.07205728423256257, + "grad_norm": 1.9349397420883179, + "learning_rate": 4.936226724432116e-05, + "loss": 5.242, + "step": 12116 + }, + { + "epoch": 0.07206323151584357, + "grad_norm": 2.175943613052368, + "learning_rate": 4.93621624101114e-05, + "loss": 5.185, + "step": 12117 + }, + { + "epoch": 0.07206917879912456, + "grad_norm": 2.053994655609131, + "learning_rate": 4.936205756739708e-05, + "loss": 5.0755, + "step": 12118 + }, + { + "epoch": 0.07207512608240556, + "grad_norm": 2.0012362003326416, + "learning_rate": 4.93619527161782e-05, + "loss": 5.1797, + "step": 12119 + }, + { + "epoch": 0.07208107336568656, + "grad_norm": 1.9441219568252563, + "learning_rate": 4.936184785645482e-05, + "loss": 5.5583, + "step": 12120 + }, + { + "epoch": 0.07208702064896755, + "grad_norm": 2.990767002105713, + "learning_rate": 4.936174298822696e-05, + "loss": 4.8348, + "step": 12121 + }, + { + "epoch": 0.07209296793224855, + "grad_norm": 2.8385918140411377, + "learning_rate": 4.936163811149469e-05, + "loss": 4.7299, + "step": 12122 + }, + { + "epoch": 0.07209891521552955, + "grad_norm": 2.5228044986724854, + "learning_rate": 4.9361533226258006e-05, + "loss": 4.622, + "step": 12123 + }, + { + "epoch": 0.07210486249881054, + "grad_norm": 2.317598581314087, + "learning_rate": 4.936142833251697e-05, + "loss": 4.588, + "step": 12124 + }, + { + "epoch": 0.07211080978209154, + "grad_norm": 2.369335889816284, + "learning_rate": 4.936132343027161e-05, + "loss": 4.3843, + "step": 12125 + }, + { + "epoch": 0.07211675706537254, + "grad_norm": 2.4761011600494385, + "learning_rate": 4.936121851952196e-05, + "loss": 4.4101, + "step": 12126 + }, + { + "epoch": 0.07212270434865353, + "grad_norm": 2.3830130100250244, + "learning_rate": 4.9361113600268065e-05, + "loss": 4.5065, + "step": 12127 + }, + { + "epoch": 0.07212865163193453, + "grad_norm": 2.4977028369903564, + "learning_rate": 4.936100867250996e-05, + "loss": 4.4469, + "step": 12128 + }, + { + "epoch": 0.07213459891521554, + "grad_norm": 2.3377795219421387, + "learning_rate": 4.9360903736247663e-05, + "loss": 4.4045, + "step": 12129 + }, + { + "epoch": 0.07214054619849652, + "grad_norm": 2.268906831741333, + "learning_rate": 4.9360798791481245e-05, + "loss": 4.4224, + "step": 12130 + }, + { + "epoch": 0.07214649348177753, + "grad_norm": 2.316899538040161, + "learning_rate": 4.936069383821072e-05, + "loss": 4.3704, + "step": 12131 + }, + { + "epoch": 0.07215244076505851, + "grad_norm": 2.419618606567383, + "learning_rate": 4.936058887643612e-05, + "loss": 5.493, + "step": 12132 + }, + { + "epoch": 0.07215838804833952, + "grad_norm": 2.081756353378296, + "learning_rate": 4.93604839061575e-05, + "loss": 6.2328, + "step": 12133 + }, + { + "epoch": 0.07216433533162052, + "grad_norm": 2.1638660430908203, + "learning_rate": 4.936037892737487e-05, + "loss": 6.3089, + "step": 12134 + }, + { + "epoch": 0.07217028261490151, + "grad_norm": 1.7972848415374756, + "learning_rate": 4.93602739400883e-05, + "loss": 6.4013, + "step": 12135 + }, + { + "epoch": 0.07217622989818251, + "grad_norm": 1.7160871028900146, + "learning_rate": 4.93601689442978e-05, + "loss": 6.1717, + "step": 12136 + }, + { + "epoch": 0.07218217718146351, + "grad_norm": 2.0931475162506104, + "learning_rate": 4.936006394000342e-05, + "loss": 5.3515, + "step": 12137 + }, + { + "epoch": 0.0721881244647445, + "grad_norm": 2.2872977256774902, + "learning_rate": 4.93599589272052e-05, + "loss": 5.8342, + "step": 12138 + }, + { + "epoch": 0.0721940717480255, + "grad_norm": 2.4082720279693604, + "learning_rate": 4.9359853905903166e-05, + "loss": 6.1651, + "step": 12139 + }, + { + "epoch": 0.0722000190313065, + "grad_norm": 2.120962381362915, + "learning_rate": 4.935974887609735e-05, + "loss": 6.1182, + "step": 12140 + }, + { + "epoch": 0.07220596631458749, + "grad_norm": 2.0507090091705322, + "learning_rate": 4.9359643837787805e-05, + "loss": 5.7158, + "step": 12141 + }, + { + "epoch": 0.0722119135978685, + "grad_norm": 2.099963426589966, + "learning_rate": 4.9359538790974556e-05, + "loss": 5.6952, + "step": 12142 + }, + { + "epoch": 0.0722178608811495, + "grad_norm": 1.7631537914276123, + "learning_rate": 4.935943373565765e-05, + "loss": 5.6649, + "step": 12143 + }, + { + "epoch": 0.07222380816443048, + "grad_norm": 1.739601492881775, + "learning_rate": 4.9359328671837115e-05, + "loss": 5.7258, + "step": 12144 + }, + { + "epoch": 0.07222975544771149, + "grad_norm": 1.630116581916809, + "learning_rate": 4.9359223599512996e-05, + "loss": 5.7305, + "step": 12145 + }, + { + "epoch": 0.07223570273099249, + "grad_norm": 1.6106374263763428, + "learning_rate": 4.935911851868531e-05, + "loss": 5.6779, + "step": 12146 + }, + { + "epoch": 0.07224165001427348, + "grad_norm": 1.945662021636963, + "learning_rate": 4.935901342935412e-05, + "loss": 5.716, + "step": 12147 + }, + { + "epoch": 0.07224759729755448, + "grad_norm": 1.8601467609405518, + "learning_rate": 4.935890833151944e-05, + "loss": 5.7539, + "step": 12148 + }, + { + "epoch": 0.07225354458083548, + "grad_norm": 1.8324257135391235, + "learning_rate": 4.9358803225181324e-05, + "loss": 5.7309, + "step": 12149 + }, + { + "epoch": 0.07225949186411647, + "grad_norm": 2.0564095973968506, + "learning_rate": 4.93586981103398e-05, + "loss": 5.7201, + "step": 12150 + }, + { + "epoch": 0.07226543914739747, + "grad_norm": 1.925706386566162, + "learning_rate": 4.93585929869949e-05, + "loss": 5.5736, + "step": 12151 + }, + { + "epoch": 0.07227138643067847, + "grad_norm": 1.5965845584869385, + "learning_rate": 4.935848785514667e-05, + "loss": 5.4351, + "step": 12152 + }, + { + "epoch": 0.07227733371395946, + "grad_norm": 2.2522077560424805, + "learning_rate": 4.935838271479515e-05, + "loss": 5.8261, + "step": 12153 + }, + { + "epoch": 0.07228328099724046, + "grad_norm": 2.242398738861084, + "learning_rate": 4.935827756594036e-05, + "loss": 5.9923, + "step": 12154 + }, + { + "epoch": 0.07228922828052146, + "grad_norm": 2.043266534805298, + "learning_rate": 4.935817240858236e-05, + "loss": 5.6127, + "step": 12155 + }, + { + "epoch": 0.07229517556380245, + "grad_norm": 2.4922964572906494, + "learning_rate": 4.935806724272116e-05, + "loss": 5.3549, + "step": 12156 + }, + { + "epoch": 0.07230112284708345, + "grad_norm": 2.5241329669952393, + "learning_rate": 4.935796206835682e-05, + "loss": 5.2194, + "step": 12157 + }, + { + "epoch": 0.07230707013036446, + "grad_norm": 2.4680237770080566, + "learning_rate": 4.9357856885489365e-05, + "loss": 5.1154, + "step": 12158 + }, + { + "epoch": 0.07231301741364544, + "grad_norm": 2.1012492179870605, + "learning_rate": 4.9357751694118824e-05, + "loss": 4.8526, + "step": 12159 + }, + { + "epoch": 0.07231896469692645, + "grad_norm": 1.9997994899749756, + "learning_rate": 4.935764649424526e-05, + "loss": 4.9778, + "step": 12160 + }, + { + "epoch": 0.07232491198020743, + "grad_norm": 1.770112156867981, + "learning_rate": 4.935754128586868e-05, + "loss": 5.0855, + "step": 12161 + }, + { + "epoch": 0.07233085926348844, + "grad_norm": 2.0865485668182373, + "learning_rate": 4.935743606898914e-05, + "loss": 5.1566, + "step": 12162 + }, + { + "epoch": 0.07233680654676944, + "grad_norm": 2.0801351070404053, + "learning_rate": 4.9357330843606677e-05, + "loss": 5.0611, + "step": 12163 + }, + { + "epoch": 0.07234275383005043, + "grad_norm": 1.8675305843353271, + "learning_rate": 4.935722560972131e-05, + "loss": 4.9216, + "step": 12164 + }, + { + "epoch": 0.07234870111333143, + "grad_norm": 1.9125452041625977, + "learning_rate": 4.935712036733309e-05, + "loss": 4.8363, + "step": 12165 + }, + { + "epoch": 0.07235464839661243, + "grad_norm": 2.4954965114593506, + "learning_rate": 4.935701511644205e-05, + "loss": 4.9816, + "step": 12166 + }, + { + "epoch": 0.07236059567989342, + "grad_norm": 2.412381410598755, + "learning_rate": 4.935690985704823e-05, + "loss": 4.9616, + "step": 12167 + }, + { + "epoch": 0.07236654296317442, + "grad_norm": 2.356994152069092, + "learning_rate": 4.9356804589151665e-05, + "loss": 4.8326, + "step": 12168 + }, + { + "epoch": 0.07237249024645542, + "grad_norm": 2.2399415969848633, + "learning_rate": 4.93566993127524e-05, + "loss": 4.8955, + "step": 12169 + }, + { + "epoch": 0.07237843752973641, + "grad_norm": 2.691772222518921, + "learning_rate": 4.935659402785044e-05, + "loss": 5.6475, + "step": 12170 + }, + { + "epoch": 0.07238438481301741, + "grad_norm": 2.954955816268921, + "learning_rate": 4.9356488734445865e-05, + "loss": 6.2151, + "step": 12171 + }, + { + "epoch": 0.07239033209629842, + "grad_norm": 2.010998010635376, + "learning_rate": 4.935638343253869e-05, + "loss": 5.9124, + "step": 12172 + }, + { + "epoch": 0.0723962793795794, + "grad_norm": 2.2737836837768555, + "learning_rate": 4.935627812212894e-05, + "loss": 5.4068, + "step": 12173 + }, + { + "epoch": 0.0724022266628604, + "grad_norm": 2.2700793743133545, + "learning_rate": 4.9356172803216675e-05, + "loss": 4.8156, + "step": 12174 + }, + { + "epoch": 0.07240817394614141, + "grad_norm": 2.2795162200927734, + "learning_rate": 4.935606747580192e-05, + "loss": 4.7882, + "step": 12175 + }, + { + "epoch": 0.0724141212294224, + "grad_norm": 2.1849277019500732, + "learning_rate": 4.9355962139884715e-05, + "loss": 4.9914, + "step": 12176 + }, + { + "epoch": 0.0724200685127034, + "grad_norm": 2.5336532592773438, + "learning_rate": 4.935585679546509e-05, + "loss": 4.8487, + "step": 12177 + }, + { + "epoch": 0.0724260157959844, + "grad_norm": 2.624995708465576, + "learning_rate": 4.935575144254309e-05, + "loss": 4.9523, + "step": 12178 + }, + { + "epoch": 0.07243196307926539, + "grad_norm": 2.5450191497802734, + "learning_rate": 4.935564608111875e-05, + "loss": 4.9958, + "step": 12179 + }, + { + "epoch": 0.07243791036254639, + "grad_norm": 2.2714452743530273, + "learning_rate": 4.9355540711192107e-05, + "loss": 5.301, + "step": 12180 + }, + { + "epoch": 0.07244385764582739, + "grad_norm": 2.0173168182373047, + "learning_rate": 4.935543533276319e-05, + "loss": 5.7992, + "step": 12181 + }, + { + "epoch": 0.07244980492910838, + "grad_norm": 2.9326014518737793, + "learning_rate": 4.9355329945832054e-05, + "loss": 5.6065, + "step": 12182 + }, + { + "epoch": 0.07245575221238938, + "grad_norm": 2.142066478729248, + "learning_rate": 4.935522455039871e-05, + "loss": 5.5339, + "step": 12183 + }, + { + "epoch": 0.07246169949567038, + "grad_norm": 1.8901113271713257, + "learning_rate": 4.9355119146463214e-05, + "loss": 5.8829, + "step": 12184 + }, + { + "epoch": 0.07246764677895137, + "grad_norm": 1.996052622795105, + "learning_rate": 4.93550137340256e-05, + "loss": 6.2189, + "step": 12185 + }, + { + "epoch": 0.07247359406223237, + "grad_norm": 1.7420963048934937, + "learning_rate": 4.93549083130859e-05, + "loss": 5.9254, + "step": 12186 + }, + { + "epoch": 0.07247954134551338, + "grad_norm": 2.8487229347229004, + "learning_rate": 4.935480288364416e-05, + "loss": 5.8643, + "step": 12187 + }, + { + "epoch": 0.07248548862879436, + "grad_norm": 3.0168306827545166, + "learning_rate": 4.93546974457004e-05, + "loss": 5.811, + "step": 12188 + }, + { + "epoch": 0.07249143591207537, + "grad_norm": 2.841353416442871, + "learning_rate": 4.935459199925467e-05, + "loss": 5.6832, + "step": 12189 + }, + { + "epoch": 0.07249738319535635, + "grad_norm": 2.3517918586730957, + "learning_rate": 4.9354486544307e-05, + "loss": 4.3651, + "step": 12190 + }, + { + "epoch": 0.07250333047863736, + "grad_norm": 2.3511440753936768, + "learning_rate": 4.935438108085744e-05, + "loss": 4.2884, + "step": 12191 + }, + { + "epoch": 0.07250927776191836, + "grad_norm": 2.0812551975250244, + "learning_rate": 4.935427560890601e-05, + "loss": 4.168, + "step": 12192 + }, + { + "epoch": 0.07251522504519935, + "grad_norm": 2.0546631813049316, + "learning_rate": 4.935417012845275e-05, + "loss": 3.862, + "step": 12193 + }, + { + "epoch": 0.07252117232848035, + "grad_norm": 2.130612850189209, + "learning_rate": 4.935406463949771e-05, + "loss": 3.6729, + "step": 12194 + }, + { + "epoch": 0.07252711961176135, + "grad_norm": 2.35225510597229, + "learning_rate": 4.9353959142040917e-05, + "loss": 3.7075, + "step": 12195 + }, + { + "epoch": 0.07253306689504234, + "grad_norm": 2.418698310852051, + "learning_rate": 4.93538536360824e-05, + "loss": 3.679, + "step": 12196 + }, + { + "epoch": 0.07253901417832334, + "grad_norm": 2.4452991485595703, + "learning_rate": 4.9353748121622214e-05, + "loss": 3.7827, + "step": 12197 + }, + { + "epoch": 0.07254496146160434, + "grad_norm": 2.3787992000579834, + "learning_rate": 4.935364259866038e-05, + "loss": 3.7484, + "step": 12198 + }, + { + "epoch": 0.07255090874488533, + "grad_norm": 2.299149751663208, + "learning_rate": 4.935353706719694e-05, + "loss": 3.6186, + "step": 12199 + }, + { + "epoch": 0.07255685602816633, + "grad_norm": 2.666121244430542, + "learning_rate": 4.9353431527231944e-05, + "loss": 3.5323, + "step": 12200 + }, + { + "epoch": 0.07256280331144734, + "grad_norm": 2.4448325634002686, + "learning_rate": 4.9353325978765404e-05, + "loss": 3.8176, + "step": 12201 + }, + { + "epoch": 0.07256875059472832, + "grad_norm": 2.5082852840423584, + "learning_rate": 4.935322042179737e-05, + "loss": 3.7838, + "step": 12202 + }, + { + "epoch": 0.07257469787800933, + "grad_norm": 2.3247005939483643, + "learning_rate": 4.935311485632788e-05, + "loss": 3.8036, + "step": 12203 + }, + { + "epoch": 0.07258064516129033, + "grad_norm": 2.4917871952056885, + "learning_rate": 4.9353009282356974e-05, + "loss": 3.6734, + "step": 12204 + }, + { + "epoch": 0.07258659244457132, + "grad_norm": 2.2535903453826904, + "learning_rate": 4.935290369988468e-05, + "loss": 3.7451, + "step": 12205 + }, + { + "epoch": 0.07259253972785232, + "grad_norm": 2.355896472930908, + "learning_rate": 4.9352798108911036e-05, + "loss": 3.5963, + "step": 12206 + }, + { + "epoch": 0.07259848701113332, + "grad_norm": 2.21923828125, + "learning_rate": 4.935269250943609e-05, + "loss": 3.5492, + "step": 12207 + }, + { + "epoch": 0.07260443429441431, + "grad_norm": 2.3795714378356934, + "learning_rate": 4.935258690145986e-05, + "loss": 3.7146, + "step": 12208 + }, + { + "epoch": 0.07261038157769531, + "grad_norm": 2.3866682052612305, + "learning_rate": 4.93524812849824e-05, + "loss": 3.7359, + "step": 12209 + }, + { + "epoch": 0.07261632886097631, + "grad_norm": 2.411289691925049, + "learning_rate": 4.935237566000374e-05, + "loss": 3.6958, + "step": 12210 + }, + { + "epoch": 0.0726222761442573, + "grad_norm": 2.3831989765167236, + "learning_rate": 4.935227002652392e-05, + "loss": 3.6696, + "step": 12211 + }, + { + "epoch": 0.0726282234275383, + "grad_norm": 2.1831908226013184, + "learning_rate": 4.935216438454297e-05, + "loss": 3.905, + "step": 12212 + }, + { + "epoch": 0.0726341707108193, + "grad_norm": 2.1136345863342285, + "learning_rate": 4.9352058734060934e-05, + "loss": 5.0188, + "step": 12213 + }, + { + "epoch": 0.07264011799410029, + "grad_norm": 2.2617692947387695, + "learning_rate": 4.935195307507784e-05, + "loss": 5.1883, + "step": 12214 + }, + { + "epoch": 0.0726460652773813, + "grad_norm": 2.4442226886749268, + "learning_rate": 4.935184740759374e-05, + "loss": 5.1883, + "step": 12215 + }, + { + "epoch": 0.0726520125606623, + "grad_norm": 2.300234794616699, + "learning_rate": 4.935174173160865e-05, + "loss": 4.9925, + "step": 12216 + }, + { + "epoch": 0.07265795984394328, + "grad_norm": 2.1512858867645264, + "learning_rate": 4.935163604712263e-05, + "loss": 4.883, + "step": 12217 + }, + { + "epoch": 0.07266390712722429, + "grad_norm": 2.210825204849243, + "learning_rate": 4.93515303541357e-05, + "loss": 5.165, + "step": 12218 + }, + { + "epoch": 0.07266985441050527, + "grad_norm": 2.1589086055755615, + "learning_rate": 4.935142465264791e-05, + "loss": 4.931, + "step": 12219 + }, + { + "epoch": 0.07267580169378628, + "grad_norm": 2.0527892112731934, + "learning_rate": 4.935131894265927e-05, + "loss": 5.0566, + "step": 12220 + }, + { + "epoch": 0.07268174897706728, + "grad_norm": 2.202828884124756, + "learning_rate": 4.935121322416985e-05, + "loss": 4.9519, + "step": 12221 + }, + { + "epoch": 0.07268769626034827, + "grad_norm": 2.262834310531616, + "learning_rate": 4.935110749717967e-05, + "loss": 4.9596, + "step": 12222 + }, + { + "epoch": 0.07269364354362927, + "grad_norm": 2.169311761856079, + "learning_rate": 4.935100176168877e-05, + "loss": 4.8968, + "step": 12223 + }, + { + "epoch": 0.07269959082691027, + "grad_norm": 2.137746572494507, + "learning_rate": 4.935089601769719e-05, + "loss": 4.8535, + "step": 12224 + }, + { + "epoch": 0.07270553811019126, + "grad_norm": 2.060861587524414, + "learning_rate": 4.935079026520496e-05, + "loss": 5.0784, + "step": 12225 + }, + { + "epoch": 0.07271148539347226, + "grad_norm": 2.235352039337158, + "learning_rate": 4.935068450421213e-05, + "loss": 4.7351, + "step": 12226 + }, + { + "epoch": 0.07271743267675326, + "grad_norm": 2.3832550048828125, + "learning_rate": 4.935057873471872e-05, + "loss": 4.618, + "step": 12227 + }, + { + "epoch": 0.07272337996003425, + "grad_norm": 2.3591537475585938, + "learning_rate": 4.935047295672477e-05, + "loss": 4.7029, + "step": 12228 + }, + { + "epoch": 0.07272932724331525, + "grad_norm": 2.2797207832336426, + "learning_rate": 4.935036717023033e-05, + "loss": 4.9199, + "step": 12229 + }, + { + "epoch": 0.07273527452659626, + "grad_norm": 2.4931957721710205, + "learning_rate": 4.935026137523542e-05, + "loss": 4.5923, + "step": 12230 + }, + { + "epoch": 0.07274122180987724, + "grad_norm": 2.152064323425293, + "learning_rate": 4.9350155571740095e-05, + "loss": 5.1495, + "step": 12231 + }, + { + "epoch": 0.07274716909315825, + "grad_norm": 2.470526695251465, + "learning_rate": 4.935004975974438e-05, + "loss": 4.8257, + "step": 12232 + }, + { + "epoch": 0.07275311637643925, + "grad_norm": 2.262578248977661, + "learning_rate": 4.9349943939248304e-05, + "loss": 5.7004, + "step": 12233 + }, + { + "epoch": 0.07275906365972024, + "grad_norm": 2.0813188552856445, + "learning_rate": 4.934983811025192e-05, + "loss": 5.6048, + "step": 12234 + }, + { + "epoch": 0.07276501094300124, + "grad_norm": 2.4882686138153076, + "learning_rate": 4.934973227275527e-05, + "loss": 5.8121, + "step": 12235 + }, + { + "epoch": 0.07277095822628224, + "grad_norm": 2.5181429386138916, + "learning_rate": 4.9349626426758364e-05, + "loss": 4.5581, + "step": 12236 + }, + { + "epoch": 0.07277690550956323, + "grad_norm": 2.6369354724884033, + "learning_rate": 4.934952057226127e-05, + "loss": 4.7938, + "step": 12237 + }, + { + "epoch": 0.07278285279284423, + "grad_norm": 1.8615930080413818, + "learning_rate": 4.9349414709264e-05, + "loss": 5.2097, + "step": 12238 + }, + { + "epoch": 0.07278880007612523, + "grad_norm": 1.4905575513839722, + "learning_rate": 4.93493088377666e-05, + "loss": 5.5717, + "step": 12239 + }, + { + "epoch": 0.07279474735940622, + "grad_norm": 1.8339897394180298, + "learning_rate": 4.9349202957769106e-05, + "loss": 5.6908, + "step": 12240 + }, + { + "epoch": 0.07280069464268722, + "grad_norm": 1.5875110626220703, + "learning_rate": 4.934909706927156e-05, + "loss": 5.6246, + "step": 12241 + }, + { + "epoch": 0.07280664192596822, + "grad_norm": 1.8365919589996338, + "learning_rate": 4.934899117227399e-05, + "loss": 5.394, + "step": 12242 + }, + { + "epoch": 0.07281258920924921, + "grad_norm": 1.9548145532608032, + "learning_rate": 4.934888526677645e-05, + "loss": 5.2427, + "step": 12243 + }, + { + "epoch": 0.07281853649253021, + "grad_norm": 1.8174974918365479, + "learning_rate": 4.934877935277896e-05, + "loss": 5.5844, + "step": 12244 + }, + { + "epoch": 0.07282448377581122, + "grad_norm": 1.800117015838623, + "learning_rate": 4.934867343028157e-05, + "loss": 4.9386, + "step": 12245 + }, + { + "epoch": 0.0728304310590922, + "grad_norm": 2.0356900691986084, + "learning_rate": 4.93485674992843e-05, + "loss": 4.6911, + "step": 12246 + }, + { + "epoch": 0.0728363783423732, + "grad_norm": 2.009455442428589, + "learning_rate": 4.93484615597872e-05, + "loss": 4.6121, + "step": 12247 + }, + { + "epoch": 0.0728423256256542, + "grad_norm": 1.9252879619598389, + "learning_rate": 4.934835561179031e-05, + "loss": 4.737, + "step": 12248 + }, + { + "epoch": 0.0728482729089352, + "grad_norm": 2.3497977256774902, + "learning_rate": 4.934824965529365e-05, + "loss": 5.6921, + "step": 12249 + }, + { + "epoch": 0.0728542201922162, + "grad_norm": 2.0821962356567383, + "learning_rate": 4.934814369029727e-05, + "loss": 5.3845, + "step": 12250 + }, + { + "epoch": 0.07286016747549719, + "grad_norm": 1.9725046157836914, + "learning_rate": 4.934803771680121e-05, + "loss": 5.5557, + "step": 12251 + }, + { + "epoch": 0.07286611475877819, + "grad_norm": 2.290238618850708, + "learning_rate": 4.93479317348055e-05, + "loss": 5.4258, + "step": 12252 + }, + { + "epoch": 0.07287206204205919, + "grad_norm": 1.9502376317977905, + "learning_rate": 4.934782574431017e-05, + "loss": 5.0531, + "step": 12253 + }, + { + "epoch": 0.07287800932534018, + "grad_norm": 2.128431797027588, + "learning_rate": 4.9347719745315275e-05, + "loss": 5.0241, + "step": 12254 + }, + { + "epoch": 0.07288395660862118, + "grad_norm": 1.9173803329467773, + "learning_rate": 4.934761373782084e-05, + "loss": 5.7107, + "step": 12255 + }, + { + "epoch": 0.07288990389190218, + "grad_norm": 1.5167652368545532, + "learning_rate": 4.93475077218269e-05, + "loss": 5.2304, + "step": 12256 + }, + { + "epoch": 0.07289585117518317, + "grad_norm": 1.4125497341156006, + "learning_rate": 4.9347401697333505e-05, + "loss": 5.1099, + "step": 12257 + }, + { + "epoch": 0.07290179845846417, + "grad_norm": 2.384801149368286, + "learning_rate": 4.934729566434068e-05, + "loss": 5.0051, + "step": 12258 + }, + { + "epoch": 0.07290774574174518, + "grad_norm": 1.9343961477279663, + "learning_rate": 4.934718962284846e-05, + "loss": 5.3367, + "step": 12259 + }, + { + "epoch": 0.07291369302502616, + "grad_norm": 2.048220157623291, + "learning_rate": 4.93470835728569e-05, + "loss": 5.8502, + "step": 12260 + }, + { + "epoch": 0.07291964030830717, + "grad_norm": 2.037167549133301, + "learning_rate": 4.934697751436601e-05, + "loss": 5.1993, + "step": 12261 + }, + { + "epoch": 0.07292558759158817, + "grad_norm": 1.8141452074050903, + "learning_rate": 4.9346871447375854e-05, + "loss": 5.8308, + "step": 12262 + }, + { + "epoch": 0.07293153487486916, + "grad_norm": 1.7525955438613892, + "learning_rate": 4.934676537188645e-05, + "loss": 5.5946, + "step": 12263 + }, + { + "epoch": 0.07293748215815016, + "grad_norm": 1.9784163236618042, + "learning_rate": 4.9346659287897846e-05, + "loss": 5.7214, + "step": 12264 + }, + { + "epoch": 0.07294342944143116, + "grad_norm": 1.8948242664337158, + "learning_rate": 4.934655319541007e-05, + "loss": 5.7434, + "step": 12265 + }, + { + "epoch": 0.07294937672471215, + "grad_norm": 1.698625087738037, + "learning_rate": 4.934644709442317e-05, + "loss": 5.7828, + "step": 12266 + }, + { + "epoch": 0.07295532400799315, + "grad_norm": 1.6057854890823364, + "learning_rate": 4.934634098493717e-05, + "loss": 5.8815, + "step": 12267 + }, + { + "epoch": 0.07296127129127415, + "grad_norm": 1.4753777980804443, + "learning_rate": 4.9346234866952125e-05, + "loss": 5.8368, + "step": 12268 + }, + { + "epoch": 0.07296721857455514, + "grad_norm": 1.8265280723571777, + "learning_rate": 4.9346128740468046e-05, + "loss": 5.7511, + "step": 12269 + }, + { + "epoch": 0.07297316585783614, + "grad_norm": 1.7212530374526978, + "learning_rate": 4.9346022605485e-05, + "loss": 5.6741, + "step": 12270 + }, + { + "epoch": 0.07297911314111714, + "grad_norm": 1.8423148393630981, + "learning_rate": 4.9345916462002996e-05, + "loss": 5.5199, + "step": 12271 + }, + { + "epoch": 0.07298506042439813, + "grad_norm": 1.7754487991333008, + "learning_rate": 4.934581031002209e-05, + "loss": 5.9655, + "step": 12272 + }, + { + "epoch": 0.07299100770767913, + "grad_norm": 1.794704794883728, + "learning_rate": 4.9345704149542313e-05, + "loss": 5.886, + "step": 12273 + }, + { + "epoch": 0.07299695499096014, + "grad_norm": 1.807165503501892, + "learning_rate": 4.93455979805637e-05, + "loss": 5.5222, + "step": 12274 + }, + { + "epoch": 0.07300290227424112, + "grad_norm": 1.6476585865020752, + "learning_rate": 4.934549180308629e-05, + "loss": 5.6588, + "step": 12275 + }, + { + "epoch": 0.07300884955752213, + "grad_norm": 1.8332840204238892, + "learning_rate": 4.9345385617110125e-05, + "loss": 5.0781, + "step": 12276 + }, + { + "epoch": 0.07301479684080311, + "grad_norm": 1.837471842765808, + "learning_rate": 4.934527942263523e-05, + "loss": 5.8881, + "step": 12277 + }, + { + "epoch": 0.07302074412408412, + "grad_norm": 1.538299798965454, + "learning_rate": 4.934517321966165e-05, + "loss": 6.0547, + "step": 12278 + }, + { + "epoch": 0.07302669140736512, + "grad_norm": 1.9346814155578613, + "learning_rate": 4.934506700818943e-05, + "loss": 5.7853, + "step": 12279 + }, + { + "epoch": 0.0730326386906461, + "grad_norm": 1.9108514785766602, + "learning_rate": 4.93449607882186e-05, + "loss": 5.8034, + "step": 12280 + }, + { + "epoch": 0.07303858597392711, + "grad_norm": 2.0216846466064453, + "learning_rate": 4.934485455974919e-05, + "loss": 5.5127, + "step": 12281 + }, + { + "epoch": 0.07304453325720811, + "grad_norm": 2.2365148067474365, + "learning_rate": 4.9344748322781244e-05, + "loss": 5.5519, + "step": 12282 + }, + { + "epoch": 0.0730504805404891, + "grad_norm": 1.872934103012085, + "learning_rate": 4.934464207731479e-05, + "loss": 5.783, + "step": 12283 + }, + { + "epoch": 0.0730564278237701, + "grad_norm": 1.944606900215149, + "learning_rate": 4.934453582334988e-05, + "loss": 5.9803, + "step": 12284 + }, + { + "epoch": 0.0730623751070511, + "grad_norm": 1.765257477760315, + "learning_rate": 4.934442956088654e-05, + "loss": 5.8434, + "step": 12285 + }, + { + "epoch": 0.07306832239033209, + "grad_norm": 1.9726130962371826, + "learning_rate": 4.934432328992482e-05, + "loss": 5.6173, + "step": 12286 + }, + { + "epoch": 0.0730742696736131, + "grad_norm": 2.0510616302490234, + "learning_rate": 4.934421701046474e-05, + "loss": 5.4661, + "step": 12287 + }, + { + "epoch": 0.0730802169568941, + "grad_norm": 1.6038832664489746, + "learning_rate": 4.934411072250635e-05, + "loss": 5.2786, + "step": 12288 + }, + { + "epoch": 0.07308616424017508, + "grad_norm": 2.0088446140289307, + "learning_rate": 4.934400442604968e-05, + "loss": 4.9999, + "step": 12289 + }, + { + "epoch": 0.07309211152345609, + "grad_norm": 1.4760913848876953, + "learning_rate": 4.934389812109477e-05, + "loss": 4.785, + "step": 12290 + }, + { + "epoch": 0.07309805880673709, + "grad_norm": 2.2036757469177246, + "learning_rate": 4.934379180764166e-05, + "loss": 5.8303, + "step": 12291 + }, + { + "epoch": 0.07310400609001808, + "grad_norm": 2.0261359214782715, + "learning_rate": 4.9343685485690385e-05, + "loss": 5.6823, + "step": 12292 + }, + { + "epoch": 0.07310995337329908, + "grad_norm": 1.7493160963058472, + "learning_rate": 4.934357915524097e-05, + "loss": 5.6144, + "step": 12293 + }, + { + "epoch": 0.07311590065658008, + "grad_norm": 1.887373685836792, + "learning_rate": 4.934347281629347e-05, + "loss": 5.9405, + "step": 12294 + }, + { + "epoch": 0.07312184793986107, + "grad_norm": 1.6655008792877197, + "learning_rate": 4.9343366468847915e-05, + "loss": 5.8376, + "step": 12295 + }, + { + "epoch": 0.07312779522314207, + "grad_norm": 1.9241079092025757, + "learning_rate": 4.9343260112904345e-05, + "loss": 5.6072, + "step": 12296 + }, + { + "epoch": 0.07313374250642307, + "grad_norm": 1.7873997688293457, + "learning_rate": 4.934315374846279e-05, + "loss": 5.539, + "step": 12297 + }, + { + "epoch": 0.07313968978970406, + "grad_norm": 1.9266597032546997, + "learning_rate": 4.9343047375523296e-05, + "loss": 5.3921, + "step": 12298 + }, + { + "epoch": 0.07314563707298506, + "grad_norm": 1.9283325672149658, + "learning_rate": 4.934294099408589e-05, + "loss": 5.2326, + "step": 12299 + }, + { + "epoch": 0.07315158435626606, + "grad_norm": 1.739047884941101, + "learning_rate": 4.934283460415062e-05, + "loss": 5.4831, + "step": 12300 + }, + { + "epoch": 0.07315753163954705, + "grad_norm": 1.6729072332382202, + "learning_rate": 4.934272820571752e-05, + "loss": 5.633, + "step": 12301 + }, + { + "epoch": 0.07316347892282805, + "grad_norm": 1.6901992559432983, + "learning_rate": 4.9342621798786616e-05, + "loss": 5.6121, + "step": 12302 + }, + { + "epoch": 0.07316942620610906, + "grad_norm": 1.8640037775039673, + "learning_rate": 4.9342515383357956e-05, + "loss": 5.6498, + "step": 12303 + }, + { + "epoch": 0.07317537348939004, + "grad_norm": 1.9629018306732178, + "learning_rate": 4.9342408959431576e-05, + "loss": 5.9364, + "step": 12304 + }, + { + "epoch": 0.07318132077267105, + "grad_norm": 1.9370427131652832, + "learning_rate": 4.934230252700752e-05, + "loss": 5.8945, + "step": 12305 + }, + { + "epoch": 0.07318726805595203, + "grad_norm": 1.6541575193405151, + "learning_rate": 4.9342196086085814e-05, + "loss": 5.5826, + "step": 12306 + }, + { + "epoch": 0.07319321533923304, + "grad_norm": 1.6640154123306274, + "learning_rate": 4.934208963666649e-05, + "loss": 5.7065, + "step": 12307 + }, + { + "epoch": 0.07319916262251404, + "grad_norm": 1.596665620803833, + "learning_rate": 4.934198317874961e-05, + "loss": 5.6764, + "step": 12308 + }, + { + "epoch": 0.07320510990579503, + "grad_norm": 1.841260552406311, + "learning_rate": 4.9341876712335176e-05, + "loss": 5.624, + "step": 12309 + }, + { + "epoch": 0.07321105718907603, + "grad_norm": 1.921162724494934, + "learning_rate": 4.9341770237423254e-05, + "loss": 5.3177, + "step": 12310 + }, + { + "epoch": 0.07321700447235703, + "grad_norm": 1.844192624092102, + "learning_rate": 4.934166375401388e-05, + "loss": 5.6236, + "step": 12311 + }, + { + "epoch": 0.07322295175563802, + "grad_norm": 1.9088208675384521, + "learning_rate": 4.934155726210707e-05, + "loss": 5.7487, + "step": 12312 + }, + { + "epoch": 0.07322889903891902, + "grad_norm": 2.1057817935943604, + "learning_rate": 4.934145076170288e-05, + "loss": 5.3372, + "step": 12313 + }, + { + "epoch": 0.07323484632220002, + "grad_norm": 1.9507678747177124, + "learning_rate": 4.9341344252801335e-05, + "loss": 5.9318, + "step": 12314 + }, + { + "epoch": 0.07324079360548101, + "grad_norm": 1.9885265827178955, + "learning_rate": 4.934123773540249e-05, + "loss": 5.7724, + "step": 12315 + }, + { + "epoch": 0.07324674088876201, + "grad_norm": 1.81960129737854, + "learning_rate": 4.934113120950636e-05, + "loss": 5.7624, + "step": 12316 + }, + { + "epoch": 0.07325268817204302, + "grad_norm": 1.7848392724990845, + "learning_rate": 4.9341024675112994e-05, + "loss": 5.8135, + "step": 12317 + }, + { + "epoch": 0.073258635455324, + "grad_norm": 1.8326808214187622, + "learning_rate": 4.9340918132222436e-05, + "loss": 5.9725, + "step": 12318 + }, + { + "epoch": 0.073264582738605, + "grad_norm": 1.731719970703125, + "learning_rate": 4.93408115808347e-05, + "loss": 5.8932, + "step": 12319 + }, + { + "epoch": 0.07327053002188601, + "grad_norm": 1.7635269165039062, + "learning_rate": 4.934070502094985e-05, + "loss": 5.4953, + "step": 12320 + }, + { + "epoch": 0.073276477305167, + "grad_norm": 1.61715829372406, + "learning_rate": 4.934059845256791e-05, + "loss": 5.4043, + "step": 12321 + }, + { + "epoch": 0.073282424588448, + "grad_norm": 1.9188543558120728, + "learning_rate": 4.9340491875688914e-05, + "loss": 5.2762, + "step": 12322 + }, + { + "epoch": 0.073288371871729, + "grad_norm": 2.098680019378662, + "learning_rate": 4.9340385290312904e-05, + "loss": 5.4673, + "step": 12323 + }, + { + "epoch": 0.07329431915500999, + "grad_norm": 2.15560245513916, + "learning_rate": 4.934027869643992e-05, + "loss": 5.9124, + "step": 12324 + }, + { + "epoch": 0.07330026643829099, + "grad_norm": 1.9819902181625366, + "learning_rate": 4.934017209407e-05, + "loss": 5.5686, + "step": 12325 + }, + { + "epoch": 0.07330621372157199, + "grad_norm": 2.517003059387207, + "learning_rate": 4.934006548320317e-05, + "loss": 3.9751, + "step": 12326 + }, + { + "epoch": 0.07331216100485298, + "grad_norm": 2.458714723587036, + "learning_rate": 4.9339958863839474e-05, + "loss": 3.7976, + "step": 12327 + }, + { + "epoch": 0.07331810828813398, + "grad_norm": 2.2642102241516113, + "learning_rate": 4.9339852235978955e-05, + "loss": 3.8853, + "step": 12328 + }, + { + "epoch": 0.07332405557141498, + "grad_norm": 2.3097565174102783, + "learning_rate": 4.9339745599621645e-05, + "loss": 3.5699, + "step": 12329 + }, + { + "epoch": 0.07333000285469597, + "grad_norm": 2.312995195388794, + "learning_rate": 4.933963895476758e-05, + "loss": 3.8338, + "step": 12330 + }, + { + "epoch": 0.07333595013797697, + "grad_norm": 2.69657826423645, + "learning_rate": 4.93395323014168e-05, + "loss": 5.3459, + "step": 12331 + }, + { + "epoch": 0.07334189742125798, + "grad_norm": 2.263038396835327, + "learning_rate": 4.9339425639569336e-05, + "loss": 5.712, + "step": 12332 + }, + { + "epoch": 0.07334784470453896, + "grad_norm": 1.9429599046707153, + "learning_rate": 4.9339318969225235e-05, + "loss": 5.7465, + "step": 12333 + }, + { + "epoch": 0.07335379198781997, + "grad_norm": 2.07045841217041, + "learning_rate": 4.933921229038453e-05, + "loss": 5.6726, + "step": 12334 + }, + { + "epoch": 0.07335973927110095, + "grad_norm": 2.0304102897644043, + "learning_rate": 4.933910560304725e-05, + "loss": 5.8084, + "step": 12335 + }, + { + "epoch": 0.07336568655438196, + "grad_norm": 1.8316701650619507, + "learning_rate": 4.933899890721344e-05, + "loss": 5.3852, + "step": 12336 + }, + { + "epoch": 0.07337163383766296, + "grad_norm": 2.1406614780426025, + "learning_rate": 4.933889220288315e-05, + "loss": 5.1097, + "step": 12337 + }, + { + "epoch": 0.07337758112094395, + "grad_norm": 1.7518030405044556, + "learning_rate": 4.9338785490056395e-05, + "loss": 5.2038, + "step": 12338 + }, + { + "epoch": 0.07338352840422495, + "grad_norm": 1.8387973308563232, + "learning_rate": 4.933867876873322e-05, + "loss": 5.0847, + "step": 12339 + }, + { + "epoch": 0.07338947568750595, + "grad_norm": 1.692947506904602, + "learning_rate": 4.933857203891367e-05, + "loss": 5.6124, + "step": 12340 + }, + { + "epoch": 0.07339542297078694, + "grad_norm": 1.6367069482803345, + "learning_rate": 4.933846530059776e-05, + "loss": 5.7119, + "step": 12341 + }, + { + "epoch": 0.07340137025406794, + "grad_norm": 2.0395610332489014, + "learning_rate": 4.933835855378556e-05, + "loss": 5.4164, + "step": 12342 + }, + { + "epoch": 0.07340731753734894, + "grad_norm": 2.074073314666748, + "learning_rate": 4.933825179847709e-05, + "loss": 5.3952, + "step": 12343 + }, + { + "epoch": 0.07341326482062993, + "grad_norm": 2.2825684547424316, + "learning_rate": 4.9338145034672376e-05, + "loss": 5.4019, + "step": 12344 + }, + { + "epoch": 0.07341921210391093, + "grad_norm": 2.006591796875, + "learning_rate": 4.9338038262371476e-05, + "loss": 5.4422, + "step": 12345 + }, + { + "epoch": 0.07342515938719194, + "grad_norm": 2.10418701171875, + "learning_rate": 4.9337931481574415e-05, + "loss": 5.3801, + "step": 12346 + }, + { + "epoch": 0.07343110667047292, + "grad_norm": 1.9998257160186768, + "learning_rate": 4.9337824692281233e-05, + "loss": 5.1673, + "step": 12347 + }, + { + "epoch": 0.07343705395375393, + "grad_norm": 2.175896644592285, + "learning_rate": 4.933771789449197e-05, + "loss": 5.118, + "step": 12348 + }, + { + "epoch": 0.07344300123703493, + "grad_norm": 2.075164318084717, + "learning_rate": 4.933761108820666e-05, + "loss": 5.1662, + "step": 12349 + }, + { + "epoch": 0.07344894852031592, + "grad_norm": 2.0672569274902344, + "learning_rate": 4.933750427342534e-05, + "loss": 5.0957, + "step": 12350 + }, + { + "epoch": 0.07345489580359692, + "grad_norm": 2.0570287704467773, + "learning_rate": 4.9337397450148055e-05, + "loss": 5.2772, + "step": 12351 + }, + { + "epoch": 0.07346084308687792, + "grad_norm": 2.0653116703033447, + "learning_rate": 4.933729061837483e-05, + "loss": 5.4755, + "step": 12352 + }, + { + "epoch": 0.07346679037015891, + "grad_norm": 2.832578420639038, + "learning_rate": 4.933718377810571e-05, + "loss": 4.8128, + "step": 12353 + }, + { + "epoch": 0.07347273765343991, + "grad_norm": 2.378556251525879, + "learning_rate": 4.933707692934073e-05, + "loss": 5.109, + "step": 12354 + }, + { + "epoch": 0.07347868493672091, + "grad_norm": 2.1819205284118652, + "learning_rate": 4.933697007207993e-05, + "loss": 4.8603, + "step": 12355 + }, + { + "epoch": 0.0734846322200019, + "grad_norm": 2.104738473892212, + "learning_rate": 4.9336863206323345e-05, + "loss": 4.7806, + "step": 12356 + }, + { + "epoch": 0.0734905795032829, + "grad_norm": 1.8287266492843628, + "learning_rate": 4.933675633207101e-05, + "loss": 4.7082, + "step": 12357 + }, + { + "epoch": 0.0734965267865639, + "grad_norm": 2.0478014945983887, + "learning_rate": 4.933664944932297e-05, + "loss": 4.6145, + "step": 12358 + }, + { + "epoch": 0.07350247406984489, + "grad_norm": 2.208263397216797, + "learning_rate": 4.9336542558079244e-05, + "loss": 4.7523, + "step": 12359 + }, + { + "epoch": 0.0735084213531259, + "grad_norm": 2.1506083011627197, + "learning_rate": 4.93364356583399e-05, + "loss": 4.7444, + "step": 12360 + }, + { + "epoch": 0.0735143686364069, + "grad_norm": 2.04584002494812, + "learning_rate": 4.933632875010494e-05, + "loss": 4.6706, + "step": 12361 + }, + { + "epoch": 0.07352031591968788, + "grad_norm": 1.8598030805587769, + "learning_rate": 4.933622183337443e-05, + "loss": 4.6404, + "step": 12362 + }, + { + "epoch": 0.07352626320296889, + "grad_norm": 2.5650441646575928, + "learning_rate": 4.93361149081484e-05, + "loss": 5.382, + "step": 12363 + }, + { + "epoch": 0.07353221048624987, + "grad_norm": 2.1182446479797363, + "learning_rate": 4.933600797442688e-05, + "loss": 5.9041, + "step": 12364 + }, + { + "epoch": 0.07353815776953088, + "grad_norm": 1.8753353357315063, + "learning_rate": 4.933590103220991e-05, + "loss": 5.6615, + "step": 12365 + }, + { + "epoch": 0.07354410505281188, + "grad_norm": 1.9428893327713013, + "learning_rate": 4.933579408149752e-05, + "loss": 5.3549, + "step": 12366 + }, + { + "epoch": 0.07355005233609287, + "grad_norm": 1.809191346168518, + "learning_rate": 4.9335687122289766e-05, + "loss": 5.5603, + "step": 12367 + }, + { + "epoch": 0.07355599961937387, + "grad_norm": 1.7782649993896484, + "learning_rate": 4.933558015458667e-05, + "loss": 5.2848, + "step": 12368 + }, + { + "epoch": 0.07356194690265487, + "grad_norm": 1.71909499168396, + "learning_rate": 4.933547317838828e-05, + "loss": 5.3774, + "step": 12369 + }, + { + "epoch": 0.07356789418593586, + "grad_norm": 1.6399723291397095, + "learning_rate": 4.9335366193694625e-05, + "loss": 5.629, + "step": 12370 + }, + { + "epoch": 0.07357384146921686, + "grad_norm": 1.8646855354309082, + "learning_rate": 4.9335259200505746e-05, + "loss": 5.6297, + "step": 12371 + }, + { + "epoch": 0.07357978875249786, + "grad_norm": 1.5271104574203491, + "learning_rate": 4.9335152198821676e-05, + "loss": 5.6112, + "step": 12372 + }, + { + "epoch": 0.07358573603577885, + "grad_norm": 1.6217905282974243, + "learning_rate": 4.933504518864246e-05, + "loss": 5.2959, + "step": 12373 + }, + { + "epoch": 0.07359168331905985, + "grad_norm": 1.5774266719818115, + "learning_rate": 4.933493816996812e-05, + "loss": 5.4181, + "step": 12374 + }, + { + "epoch": 0.07359763060234085, + "grad_norm": 1.3641432523727417, + "learning_rate": 4.933483114279872e-05, + "loss": 5.3903, + "step": 12375 + }, + { + "epoch": 0.07360357788562184, + "grad_norm": 1.67635178565979, + "learning_rate": 4.933472410713428e-05, + "loss": 5.6771, + "step": 12376 + }, + { + "epoch": 0.07360952516890285, + "grad_norm": 1.6944624185562134, + "learning_rate": 4.933461706297483e-05, + "loss": 5.6008, + "step": 12377 + }, + { + "epoch": 0.07361547245218385, + "grad_norm": 1.3603699207305908, + "learning_rate": 4.933451001032042e-05, + "loss": 5.5396, + "step": 12378 + }, + { + "epoch": 0.07362141973546484, + "grad_norm": 1.6585369110107422, + "learning_rate": 4.9334402949171086e-05, + "loss": 5.5697, + "step": 12379 + }, + { + "epoch": 0.07362736701874584, + "grad_norm": 1.503786563873291, + "learning_rate": 4.9334295879526865e-05, + "loss": 5.4539, + "step": 12380 + }, + { + "epoch": 0.07363331430202684, + "grad_norm": 1.4761176109313965, + "learning_rate": 4.933418880138779e-05, + "loss": 5.4573, + "step": 12381 + }, + { + "epoch": 0.07363926158530783, + "grad_norm": 1.671972393989563, + "learning_rate": 4.93340817147539e-05, + "loss": 5.4143, + "step": 12382 + }, + { + "epoch": 0.07364520886858883, + "grad_norm": 1.5486379861831665, + "learning_rate": 4.9333974619625236e-05, + "loss": 5.4134, + "step": 12383 + }, + { + "epoch": 0.07365115615186983, + "grad_norm": 1.340108036994934, + "learning_rate": 4.933386751600183e-05, + "loss": 5.4587, + "step": 12384 + }, + { + "epoch": 0.07365710343515082, + "grad_norm": 1.3910952806472778, + "learning_rate": 4.933376040388372e-05, + "loss": 5.4129, + "step": 12385 + }, + { + "epoch": 0.07366305071843182, + "grad_norm": 1.5878056287765503, + "learning_rate": 4.9333653283270955e-05, + "loss": 5.3633, + "step": 12386 + }, + { + "epoch": 0.07366899800171282, + "grad_norm": 1.6040968894958496, + "learning_rate": 4.933354615416356e-05, + "loss": 5.2486, + "step": 12387 + }, + { + "epoch": 0.07367494528499381, + "grad_norm": 1.4824137687683105, + "learning_rate": 4.933343901656157e-05, + "loss": 5.2947, + "step": 12388 + }, + { + "epoch": 0.07368089256827481, + "grad_norm": 1.6114120483398438, + "learning_rate": 4.933333187046503e-05, + "loss": 5.2948, + "step": 12389 + }, + { + "epoch": 0.07368683985155582, + "grad_norm": 1.4269661903381348, + "learning_rate": 4.933322471587398e-05, + "loss": 5.1633, + "step": 12390 + }, + { + "epoch": 0.0736927871348368, + "grad_norm": 1.430588960647583, + "learning_rate": 4.933311755278844e-05, + "loss": 5.2846, + "step": 12391 + }, + { + "epoch": 0.0736987344181178, + "grad_norm": 1.3490641117095947, + "learning_rate": 4.9333010381208476e-05, + "loss": 5.2067, + "step": 12392 + }, + { + "epoch": 0.0737046817013988, + "grad_norm": 1.9292722940444946, + "learning_rate": 4.9332903201134104e-05, + "loss": 5.6196, + "step": 12393 + }, + { + "epoch": 0.0737106289846798, + "grad_norm": 1.8885586261749268, + "learning_rate": 4.933279601256536e-05, + "loss": 5.5225, + "step": 12394 + }, + { + "epoch": 0.0737165762679608, + "grad_norm": 1.5985313653945923, + "learning_rate": 4.93326888155023e-05, + "loss": 5.7447, + "step": 12395 + }, + { + "epoch": 0.07372252355124179, + "grad_norm": 2.819392681121826, + "learning_rate": 4.933258160994494e-05, + "loss": 6.002, + "step": 12396 + }, + { + "epoch": 0.07372847083452279, + "grad_norm": 2.006615161895752, + "learning_rate": 4.933247439589333e-05, + "loss": 5.7733, + "step": 12397 + }, + { + "epoch": 0.07373441811780379, + "grad_norm": 1.628408432006836, + "learning_rate": 4.933236717334751e-05, + "loss": 5.3899, + "step": 12398 + }, + { + "epoch": 0.07374036540108478, + "grad_norm": 1.5265247821807861, + "learning_rate": 4.93322599423075e-05, + "loss": 5.3891, + "step": 12399 + }, + { + "epoch": 0.07374631268436578, + "grad_norm": 1.6663800477981567, + "learning_rate": 4.933215270277336e-05, + "loss": 5.6172, + "step": 12400 + }, + { + "epoch": 0.07375225996764678, + "grad_norm": 1.7699551582336426, + "learning_rate": 4.933204545474511e-05, + "loss": 5.7088, + "step": 12401 + }, + { + "epoch": 0.07375820725092777, + "grad_norm": 1.5542314052581787, + "learning_rate": 4.93319381982228e-05, + "loss": 5.5925, + "step": 12402 + }, + { + "epoch": 0.07376415453420877, + "grad_norm": 1.5389710664749146, + "learning_rate": 4.933183093320646e-05, + "loss": 5.572, + "step": 12403 + }, + { + "epoch": 0.07377010181748977, + "grad_norm": 1.381242275238037, + "learning_rate": 4.9331723659696124e-05, + "loss": 5.4964, + "step": 12404 + }, + { + "epoch": 0.07377604910077076, + "grad_norm": 1.5536670684814453, + "learning_rate": 4.933161637769184e-05, + "loss": 5.3748, + "step": 12405 + }, + { + "epoch": 0.07378199638405177, + "grad_norm": 1.6656473875045776, + "learning_rate": 4.933150908719364e-05, + "loss": 5.3267, + "step": 12406 + }, + { + "epoch": 0.07378794366733277, + "grad_norm": 1.9200701713562012, + "learning_rate": 4.933140178820156e-05, + "loss": 5.2928, + "step": 12407 + }, + { + "epoch": 0.07379389095061376, + "grad_norm": 1.6290313005447388, + "learning_rate": 4.933129448071564e-05, + "loss": 5.4969, + "step": 12408 + }, + { + "epoch": 0.07379983823389476, + "grad_norm": 1.7247267961502075, + "learning_rate": 4.933118716473592e-05, + "loss": 5.564, + "step": 12409 + }, + { + "epoch": 0.07380578551717576, + "grad_norm": 1.4726417064666748, + "learning_rate": 4.933107984026243e-05, + "loss": 5.1759, + "step": 12410 + }, + { + "epoch": 0.07381173280045675, + "grad_norm": 1.4726674556732178, + "learning_rate": 4.933097250729522e-05, + "loss": 5.1731, + "step": 12411 + }, + { + "epoch": 0.07381768008373775, + "grad_norm": 1.4694938659667969, + "learning_rate": 4.93308651658343e-05, + "loss": 5.4539, + "step": 12412 + }, + { + "epoch": 0.07382362736701875, + "grad_norm": 1.5212653875350952, + "learning_rate": 4.9330757815879734e-05, + "loss": 5.5035, + "step": 12413 + }, + { + "epoch": 0.07382957465029974, + "grad_norm": 1.3731454610824585, + "learning_rate": 4.933065045743156e-05, + "loss": 5.415, + "step": 12414 + }, + { + "epoch": 0.07383552193358074, + "grad_norm": 1.5576610565185547, + "learning_rate": 4.93305430904898e-05, + "loss": 5.2776, + "step": 12415 + }, + { + "epoch": 0.07384146921686174, + "grad_norm": 1.72965407371521, + "learning_rate": 4.93304357150545e-05, + "loss": 5.3598, + "step": 12416 + }, + { + "epoch": 0.07384741650014273, + "grad_norm": 1.5218521356582642, + "learning_rate": 4.93303283311257e-05, + "loss": 5.295, + "step": 12417 + }, + { + "epoch": 0.07385336378342373, + "grad_norm": 1.5174230337142944, + "learning_rate": 4.933022093870343e-05, + "loss": 5.3506, + "step": 12418 + }, + { + "epoch": 0.07385931106670474, + "grad_norm": 1.3844187259674072, + "learning_rate": 4.933011353778773e-05, + "loss": 5.4345, + "step": 12419 + }, + { + "epoch": 0.07386525834998572, + "grad_norm": 1.5130188465118408, + "learning_rate": 4.9330006128378645e-05, + "loss": 5.4359, + "step": 12420 + }, + { + "epoch": 0.07387120563326673, + "grad_norm": 1.599004864692688, + "learning_rate": 4.93298987104762e-05, + "loss": 5.1631, + "step": 12421 + }, + { + "epoch": 0.07387715291654771, + "grad_norm": 1.6220343112945557, + "learning_rate": 4.932979128408044e-05, + "loss": 5.1244, + "step": 12422 + }, + { + "epoch": 0.07388310019982872, + "grad_norm": 1.5366616249084473, + "learning_rate": 4.93296838491914e-05, + "loss": 5.0368, + "step": 12423 + }, + { + "epoch": 0.07388904748310972, + "grad_norm": 1.5800726413726807, + "learning_rate": 4.932957640580912e-05, + "loss": 4.9906, + "step": 12424 + }, + { + "epoch": 0.0738949947663907, + "grad_norm": 1.6035537719726562, + "learning_rate": 4.9329468953933637e-05, + "loss": 5.0616, + "step": 12425 + }, + { + "epoch": 0.07390094204967171, + "grad_norm": 1.580127239227295, + "learning_rate": 4.932936149356499e-05, + "loss": 5.145, + "step": 12426 + }, + { + "epoch": 0.07390688933295271, + "grad_norm": 1.724788784980774, + "learning_rate": 4.932925402470321e-05, + "loss": 4.9589, + "step": 12427 + }, + { + "epoch": 0.0739128366162337, + "grad_norm": 1.5442367792129517, + "learning_rate": 4.932914654734834e-05, + "loss": 5.077, + "step": 12428 + }, + { + "epoch": 0.0739187838995147, + "grad_norm": 1.3692456483840942, + "learning_rate": 4.932903906150042e-05, + "loss": 5.1778, + "step": 12429 + }, + { + "epoch": 0.0739247311827957, + "grad_norm": 1.8229175806045532, + "learning_rate": 4.932893156715948e-05, + "loss": 5.4053, + "step": 12430 + }, + { + "epoch": 0.07393067846607669, + "grad_norm": 1.7769286632537842, + "learning_rate": 4.9328824064325566e-05, + "loss": 5.2541, + "step": 12431 + }, + { + "epoch": 0.07393662574935769, + "grad_norm": 1.7022631168365479, + "learning_rate": 4.93287165529987e-05, + "loss": 4.8555, + "step": 12432 + }, + { + "epoch": 0.0739425730326387, + "grad_norm": 1.5031015872955322, + "learning_rate": 4.932860903317894e-05, + "loss": 5.019, + "step": 12433 + }, + { + "epoch": 0.07394852031591968, + "grad_norm": 1.352550983428955, + "learning_rate": 4.932850150486631e-05, + "loss": 5.239, + "step": 12434 + }, + { + "epoch": 0.07395446759920069, + "grad_norm": 1.5571177005767822, + "learning_rate": 4.932839396806085e-05, + "loss": 5.2511, + "step": 12435 + }, + { + "epoch": 0.07396041488248169, + "grad_norm": 1.7673511505126953, + "learning_rate": 4.93282864227626e-05, + "loss": 5.1811, + "step": 12436 + }, + { + "epoch": 0.07396636216576268, + "grad_norm": 1.6385267972946167, + "learning_rate": 4.932817886897161e-05, + "loss": 5.1644, + "step": 12437 + }, + { + "epoch": 0.07397230944904368, + "grad_norm": 1.6142395734786987, + "learning_rate": 4.932807130668788e-05, + "loss": 5.173, + "step": 12438 + }, + { + "epoch": 0.07397825673232468, + "grad_norm": 1.6966745853424072, + "learning_rate": 4.932796373591149e-05, + "loss": 5.1495, + "step": 12439 + }, + { + "epoch": 0.07398420401560567, + "grad_norm": 1.6631567478179932, + "learning_rate": 4.932785615664245e-05, + "loss": 5.1787, + "step": 12440 + }, + { + "epoch": 0.07399015129888667, + "grad_norm": 1.7747845649719238, + "learning_rate": 4.9327748568880816e-05, + "loss": 5.1303, + "step": 12441 + }, + { + "epoch": 0.07399609858216767, + "grad_norm": 1.457535982131958, + "learning_rate": 4.932764097262661e-05, + "loss": 5.1573, + "step": 12442 + }, + { + "epoch": 0.07400204586544866, + "grad_norm": 1.602452039718628, + "learning_rate": 4.9327533367879875e-05, + "loss": 5.1039, + "step": 12443 + }, + { + "epoch": 0.07400799314872966, + "grad_norm": 1.644687294960022, + "learning_rate": 4.932742575464065e-05, + "loss": 5.3112, + "step": 12444 + }, + { + "epoch": 0.07401394043201066, + "grad_norm": 1.5873420238494873, + "learning_rate": 4.932731813290897e-05, + "loss": 5.1128, + "step": 12445 + }, + { + "epoch": 0.07401988771529165, + "grad_norm": 1.8046668767929077, + "learning_rate": 4.932721050268489e-05, + "loss": 4.9776, + "step": 12446 + }, + { + "epoch": 0.07402583499857265, + "grad_norm": 1.6964846849441528, + "learning_rate": 4.932710286396841e-05, + "loss": 5.0039, + "step": 12447 + }, + { + "epoch": 0.07403178228185366, + "grad_norm": 1.5332229137420654, + "learning_rate": 4.93269952167596e-05, + "loss": 4.9873, + "step": 12448 + }, + { + "epoch": 0.07403772956513464, + "grad_norm": 1.6128625869750977, + "learning_rate": 4.9326887561058485e-05, + "loss": 5.1139, + "step": 12449 + }, + { + "epoch": 0.07404367684841565, + "grad_norm": 1.5800291299819946, + "learning_rate": 4.932677989686511e-05, + "loss": 4.9687, + "step": 12450 + }, + { + "epoch": 0.07404962413169663, + "grad_norm": 1.6543092727661133, + "learning_rate": 4.932667222417951e-05, + "loss": 4.8345, + "step": 12451 + }, + { + "epoch": 0.07405557141497764, + "grad_norm": 1.4438380002975464, + "learning_rate": 4.932656454300171e-05, + "loss": 4.9677, + "step": 12452 + }, + { + "epoch": 0.07406151869825864, + "grad_norm": 1.6437597274780273, + "learning_rate": 4.932645685333176e-05, + "loss": 4.9016, + "step": 12453 + }, + { + "epoch": 0.07406746598153963, + "grad_norm": 1.5359379053115845, + "learning_rate": 4.932634915516969e-05, + "loss": 4.8357, + "step": 12454 + }, + { + "epoch": 0.07407341326482063, + "grad_norm": 1.6683440208435059, + "learning_rate": 4.9326241448515554e-05, + "loss": 4.8715, + "step": 12455 + }, + { + "epoch": 0.07407936054810163, + "grad_norm": 1.5654494762420654, + "learning_rate": 4.932613373336937e-05, + "loss": 4.8993, + "step": 12456 + }, + { + "epoch": 0.07408530783138262, + "grad_norm": 1.5333384275436401, + "learning_rate": 4.932602600973119e-05, + "loss": 4.9181, + "step": 12457 + }, + { + "epoch": 0.07409125511466362, + "grad_norm": 1.5674177408218384, + "learning_rate": 4.9325918277601046e-05, + "loss": 4.905, + "step": 12458 + }, + { + "epoch": 0.07409720239794462, + "grad_norm": 1.410294771194458, + "learning_rate": 4.9325810536978965e-05, + "loss": 4.8645, + "step": 12459 + }, + { + "epoch": 0.07410314968122561, + "grad_norm": 1.4950916767120361, + "learning_rate": 4.9325702787865006e-05, + "loss": 4.8289, + "step": 12460 + }, + { + "epoch": 0.07410909696450661, + "grad_norm": 1.7529935836791992, + "learning_rate": 4.9325595030259195e-05, + "loss": 4.8917, + "step": 12461 + }, + { + "epoch": 0.07411504424778761, + "grad_norm": 3.5575430393218994, + "learning_rate": 4.932548726416157e-05, + "loss": 5.5795, + "step": 12462 + }, + { + "epoch": 0.0741209915310686, + "grad_norm": 1.5091896057128906, + "learning_rate": 4.9325379489572165e-05, + "loss": 4.9864, + "step": 12463 + }, + { + "epoch": 0.0741269388143496, + "grad_norm": 1.6818382740020752, + "learning_rate": 4.932527170649102e-05, + "loss": 5.3386, + "step": 12464 + }, + { + "epoch": 0.07413288609763061, + "grad_norm": 1.7938569784164429, + "learning_rate": 4.932516391491818e-05, + "loss": 5.2668, + "step": 12465 + }, + { + "epoch": 0.0741388333809116, + "grad_norm": 1.89009428024292, + "learning_rate": 4.932505611485367e-05, + "loss": 5.1755, + "step": 12466 + }, + { + "epoch": 0.0741447806641926, + "grad_norm": 1.5277502536773682, + "learning_rate": 4.932494830629753e-05, + "loss": 5.3271, + "step": 12467 + }, + { + "epoch": 0.0741507279474736, + "grad_norm": 1.7720823287963867, + "learning_rate": 4.932484048924981e-05, + "loss": 5.7089, + "step": 12468 + }, + { + "epoch": 0.07415667523075459, + "grad_norm": 1.6797159910202026, + "learning_rate": 4.932473266371054e-05, + "loss": 5.5563, + "step": 12469 + }, + { + "epoch": 0.07416262251403559, + "grad_norm": 1.6536195278167725, + "learning_rate": 4.932462482967976e-05, + "loss": 5.4271, + "step": 12470 + }, + { + "epoch": 0.07416856979731659, + "grad_norm": 1.5667130947113037, + "learning_rate": 4.93245169871575e-05, + "loss": 5.3703, + "step": 12471 + }, + { + "epoch": 0.07417451708059758, + "grad_norm": 1.3659738302230835, + "learning_rate": 4.93244091361438e-05, + "loss": 5.4114, + "step": 12472 + }, + { + "epoch": 0.07418046436387858, + "grad_norm": 1.5106414556503296, + "learning_rate": 4.9324301276638705e-05, + "loss": 5.386, + "step": 12473 + }, + { + "epoch": 0.07418641164715958, + "grad_norm": 1.5054755210876465, + "learning_rate": 4.932419340864225e-05, + "loss": 5.3067, + "step": 12474 + }, + { + "epoch": 0.07419235893044057, + "grad_norm": 1.4413330554962158, + "learning_rate": 4.932408553215446e-05, + "loss": 5.358, + "step": 12475 + }, + { + "epoch": 0.07419830621372157, + "grad_norm": 1.3034652471542358, + "learning_rate": 4.932397764717539e-05, + "loss": 5.2942, + "step": 12476 + }, + { + "epoch": 0.07420425349700258, + "grad_norm": 1.494664192199707, + "learning_rate": 4.9323869753705074e-05, + "loss": 5.4243, + "step": 12477 + }, + { + "epoch": 0.07421020078028356, + "grad_norm": 1.2644178867340088, + "learning_rate": 4.932376185174354e-05, + "loss": 5.2212, + "step": 12478 + }, + { + "epoch": 0.07421614806356457, + "grad_norm": 1.5576590299606323, + "learning_rate": 4.9323653941290836e-05, + "loss": 5.2077, + "step": 12479 + }, + { + "epoch": 0.07422209534684555, + "grad_norm": 1.5699479579925537, + "learning_rate": 4.932354602234699e-05, + "loss": 5.3849, + "step": 12480 + }, + { + "epoch": 0.07422804263012656, + "grad_norm": 1.6582329273223877, + "learning_rate": 4.932343809491205e-05, + "loss": 5.3961, + "step": 12481 + }, + { + "epoch": 0.07423398991340756, + "grad_norm": 1.6159483194351196, + "learning_rate": 4.932333015898605e-05, + "loss": 5.3711, + "step": 12482 + }, + { + "epoch": 0.07423993719668855, + "grad_norm": 1.453933596611023, + "learning_rate": 4.932322221456902e-05, + "loss": 5.2899, + "step": 12483 + }, + { + "epoch": 0.07424588447996955, + "grad_norm": 1.3830047845840454, + "learning_rate": 4.9323114261661014e-05, + "loss": 5.3839, + "step": 12484 + }, + { + "epoch": 0.07425183176325055, + "grad_norm": 1.5541338920593262, + "learning_rate": 4.932300630026205e-05, + "loss": 5.257, + "step": 12485 + }, + { + "epoch": 0.07425777904653154, + "grad_norm": 1.5887267589569092, + "learning_rate": 4.932289833037219e-05, + "loss": 5.2079, + "step": 12486 + }, + { + "epoch": 0.07426372632981254, + "grad_norm": 1.6341818571090698, + "learning_rate": 4.932279035199144e-05, + "loss": 5.2529, + "step": 12487 + }, + { + "epoch": 0.07426967361309354, + "grad_norm": 1.5520392656326294, + "learning_rate": 4.9322682365119866e-05, + "loss": 5.2416, + "step": 12488 + }, + { + "epoch": 0.07427562089637453, + "grad_norm": 1.610711693763733, + "learning_rate": 4.93225743697575e-05, + "loss": 5.3172, + "step": 12489 + }, + { + "epoch": 0.07428156817965553, + "grad_norm": 1.5997258424758911, + "learning_rate": 4.932246636590436e-05, + "loss": 5.2343, + "step": 12490 + }, + { + "epoch": 0.07428751546293653, + "grad_norm": 1.5319284200668335, + "learning_rate": 4.932235835356051e-05, + "loss": 5.2021, + "step": 12491 + }, + { + "epoch": 0.07429346274621752, + "grad_norm": 1.6516488790512085, + "learning_rate": 4.932225033272597e-05, + "loss": 5.2678, + "step": 12492 + }, + { + "epoch": 0.07429941002949852, + "grad_norm": 1.9008166790008545, + "learning_rate": 4.9322142303400786e-05, + "loss": 5.1424, + "step": 12493 + }, + { + "epoch": 0.07430535731277953, + "grad_norm": 1.8372108936309814, + "learning_rate": 4.932203426558499e-05, + "loss": 5.321, + "step": 12494 + }, + { + "epoch": 0.07431130459606052, + "grad_norm": 1.4764071702957153, + "learning_rate": 4.932192621927863e-05, + "loss": 5.3627, + "step": 12495 + }, + { + "epoch": 0.07431725187934152, + "grad_norm": 1.6356589794158936, + "learning_rate": 4.932181816448173e-05, + "loss": 5.2061, + "step": 12496 + }, + { + "epoch": 0.07432319916262252, + "grad_norm": 1.6335545778274536, + "learning_rate": 4.932171010119434e-05, + "loss": 5.2283, + "step": 12497 + }, + { + "epoch": 0.07432914644590351, + "grad_norm": 1.499968409538269, + "learning_rate": 4.932160202941649e-05, + "loss": 5.4862, + "step": 12498 + }, + { + "epoch": 0.07433509372918451, + "grad_norm": 1.7292691469192505, + "learning_rate": 4.932149394914822e-05, + "loss": 5.4055, + "step": 12499 + }, + { + "epoch": 0.07434104101246551, + "grad_norm": 1.6818633079528809, + "learning_rate": 4.932138586038957e-05, + "loss": 5.5262, + "step": 12500 + }, + { + "epoch": 0.0743469882957465, + "grad_norm": 1.4048001766204834, + "learning_rate": 4.932127776314057e-05, + "loss": 5.1876, + "step": 12501 + }, + { + "epoch": 0.0743529355790275, + "grad_norm": 1.6041479110717773, + "learning_rate": 4.9321169657401264e-05, + "loss": 5.0791, + "step": 12502 + }, + { + "epoch": 0.0743588828623085, + "grad_norm": 1.3542897701263428, + "learning_rate": 4.932106154317169e-05, + "loss": 5.189, + "step": 12503 + }, + { + "epoch": 0.07436483014558949, + "grad_norm": 1.7782005071640015, + "learning_rate": 4.932095342045189e-05, + "loss": 5.2823, + "step": 12504 + }, + { + "epoch": 0.0743707774288705, + "grad_norm": 1.5981978178024292, + "learning_rate": 4.932084528924189e-05, + "loss": 5.3978, + "step": 12505 + }, + { + "epoch": 0.0743767247121515, + "grad_norm": 1.5224134922027588, + "learning_rate": 4.9320737149541734e-05, + "loss": 5.336, + "step": 12506 + }, + { + "epoch": 0.07438267199543248, + "grad_norm": 1.4827311038970947, + "learning_rate": 4.932062900135147e-05, + "loss": 5.2284, + "step": 12507 + }, + { + "epoch": 0.07438861927871349, + "grad_norm": 1.4394789934158325, + "learning_rate": 4.932052084467111e-05, + "loss": 5.1672, + "step": 12508 + }, + { + "epoch": 0.07439456656199447, + "grad_norm": 1.5112950801849365, + "learning_rate": 4.9320412679500715e-05, + "loss": 5.4069, + "step": 12509 + }, + { + "epoch": 0.07440051384527548, + "grad_norm": 1.4547615051269531, + "learning_rate": 4.932030450584032e-05, + "loss": 5.3317, + "step": 12510 + }, + { + "epoch": 0.07440646112855648, + "grad_norm": 1.5839279890060425, + "learning_rate": 4.9320196323689946e-05, + "loss": 5.2042, + "step": 12511 + }, + { + "epoch": 0.07441240841183747, + "grad_norm": 1.6392362117767334, + "learning_rate": 4.9320088133049655e-05, + "loss": 5.2595, + "step": 12512 + }, + { + "epoch": 0.07441835569511847, + "grad_norm": 1.530236840248108, + "learning_rate": 4.931997993391947e-05, + "loss": 5.4417, + "step": 12513 + }, + { + "epoch": 0.07442430297839947, + "grad_norm": 1.7665959596633911, + "learning_rate": 4.931987172629943e-05, + "loss": 5.5164, + "step": 12514 + }, + { + "epoch": 0.07443025026168046, + "grad_norm": 1.5256375074386597, + "learning_rate": 4.931976351018957e-05, + "loss": 5.3645, + "step": 12515 + }, + { + "epoch": 0.07443619754496146, + "grad_norm": 1.5948551893234253, + "learning_rate": 4.9319655285589937e-05, + "loss": 5.1964, + "step": 12516 + }, + { + "epoch": 0.07444214482824246, + "grad_norm": 1.451249361038208, + "learning_rate": 4.931954705250056e-05, + "loss": 5.3043, + "step": 12517 + }, + { + "epoch": 0.07444809211152345, + "grad_norm": 1.5874381065368652, + "learning_rate": 4.931943881092148e-05, + "loss": 5.3769, + "step": 12518 + }, + { + "epoch": 0.07445403939480445, + "grad_norm": 1.597102165222168, + "learning_rate": 4.931933056085274e-05, + "loss": 5.2909, + "step": 12519 + }, + { + "epoch": 0.07445998667808545, + "grad_norm": 1.3787156343460083, + "learning_rate": 4.9319222302294364e-05, + "loss": 5.5499, + "step": 12520 + }, + { + "epoch": 0.07446593396136644, + "grad_norm": 1.5816805362701416, + "learning_rate": 4.931911403524641e-05, + "loss": 5.255, + "step": 12521 + }, + { + "epoch": 0.07447188124464744, + "grad_norm": 1.636619210243225, + "learning_rate": 4.93190057597089e-05, + "loss": 5.3816, + "step": 12522 + }, + { + "epoch": 0.07447782852792845, + "grad_norm": 1.518872857093811, + "learning_rate": 4.931889747568187e-05, + "loss": 5.3376, + "step": 12523 + }, + { + "epoch": 0.07448377581120944, + "grad_norm": 1.9586291313171387, + "learning_rate": 4.931878918316537e-05, + "loss": 5.6678, + "step": 12524 + }, + { + "epoch": 0.07448972309449044, + "grad_norm": 1.5893887281417847, + "learning_rate": 4.9318680882159435e-05, + "loss": 5.266, + "step": 12525 + }, + { + "epoch": 0.07449567037777144, + "grad_norm": 1.5339915752410889, + "learning_rate": 4.93185725726641e-05, + "loss": 5.1891, + "step": 12526 + }, + { + "epoch": 0.07450161766105243, + "grad_norm": 1.730128288269043, + "learning_rate": 4.9318464254679396e-05, + "loss": 5.1534, + "step": 12527 + }, + { + "epoch": 0.07450756494433343, + "grad_norm": 1.691015362739563, + "learning_rate": 4.931835592820537e-05, + "loss": 5.2599, + "step": 12528 + }, + { + "epoch": 0.07451351222761443, + "grad_norm": 1.2936137914657593, + "learning_rate": 4.9318247593242056e-05, + "loss": 5.2432, + "step": 12529 + }, + { + "epoch": 0.07451945951089542, + "grad_norm": 1.4507200717926025, + "learning_rate": 4.93181392497895e-05, + "loss": 5.1539, + "step": 12530 + }, + { + "epoch": 0.07452540679417642, + "grad_norm": 1.6212667226791382, + "learning_rate": 4.931803089784772e-05, + "loss": 5.1212, + "step": 12531 + }, + { + "epoch": 0.07453135407745742, + "grad_norm": 1.48690927028656, + "learning_rate": 4.9317922537416775e-05, + "loss": 5.168, + "step": 12532 + }, + { + "epoch": 0.07453730136073841, + "grad_norm": 1.5102870464324951, + "learning_rate": 4.931781416849669e-05, + "loss": 5.2024, + "step": 12533 + }, + { + "epoch": 0.07454324864401941, + "grad_norm": 1.4186264276504517, + "learning_rate": 4.9317705791087516e-05, + "loss": 5.1154, + "step": 12534 + }, + { + "epoch": 0.07454919592730042, + "grad_norm": 1.623822569847107, + "learning_rate": 4.931759740518928e-05, + "loss": 5.0244, + "step": 12535 + }, + { + "epoch": 0.0745551432105814, + "grad_norm": 1.4694246053695679, + "learning_rate": 4.9317489010802015e-05, + "loss": 5.1737, + "step": 12536 + }, + { + "epoch": 0.0745610904938624, + "grad_norm": 1.553551435470581, + "learning_rate": 4.931738060792577e-05, + "loss": 5.1339, + "step": 12537 + }, + { + "epoch": 0.0745670377771434, + "grad_norm": 1.744367003440857, + "learning_rate": 4.9317272196560575e-05, + "loss": 5.1564, + "step": 12538 + }, + { + "epoch": 0.0745729850604244, + "grad_norm": 1.6584309339523315, + "learning_rate": 4.931716377670648e-05, + "loss": 5.1871, + "step": 12539 + }, + { + "epoch": 0.0745789323437054, + "grad_norm": 1.6894947290420532, + "learning_rate": 4.931705534836351e-05, + "loss": 5.1432, + "step": 12540 + }, + { + "epoch": 0.07458487962698639, + "grad_norm": 1.467315912246704, + "learning_rate": 4.93169469115317e-05, + "loss": 5.2072, + "step": 12541 + }, + { + "epoch": 0.07459082691026739, + "grad_norm": 1.478841781616211, + "learning_rate": 4.93168384662111e-05, + "loss": 5.3644, + "step": 12542 + }, + { + "epoch": 0.07459677419354839, + "grad_norm": 1.6001938581466675, + "learning_rate": 4.9316730012401745e-05, + "loss": 5.2031, + "step": 12543 + }, + { + "epoch": 0.07460272147682938, + "grad_norm": 1.480236530303955, + "learning_rate": 4.931662155010367e-05, + "loss": 5.0113, + "step": 12544 + }, + { + "epoch": 0.07460866876011038, + "grad_norm": 1.490511178970337, + "learning_rate": 4.9316513079316914e-05, + "loss": 5.0416, + "step": 12545 + }, + { + "epoch": 0.07461461604339138, + "grad_norm": 1.7327873706817627, + "learning_rate": 4.931640460004152e-05, + "loss": 5.0578, + "step": 12546 + }, + { + "epoch": 0.07462056332667237, + "grad_norm": 1.6410421133041382, + "learning_rate": 4.9316296112277514e-05, + "loss": 5.0239, + "step": 12547 + }, + { + "epoch": 0.07462651060995337, + "grad_norm": 1.5255141258239746, + "learning_rate": 4.9316187616024936e-05, + "loss": 5.1592, + "step": 12548 + }, + { + "epoch": 0.07463245789323437, + "grad_norm": 1.5555649995803833, + "learning_rate": 4.9316079111283835e-05, + "loss": 5.3981, + "step": 12549 + }, + { + "epoch": 0.07463840517651536, + "grad_norm": 1.4196929931640625, + "learning_rate": 4.931597059805424e-05, + "loss": 5.0682, + "step": 12550 + }, + { + "epoch": 0.07464435245979636, + "grad_norm": 1.562338948249817, + "learning_rate": 4.93158620763362e-05, + "loss": 5.3551, + "step": 12551 + }, + { + "epoch": 0.07465029974307737, + "grad_norm": 1.5955942869186401, + "learning_rate": 4.931575354612973e-05, + "loss": 5.3108, + "step": 12552 + }, + { + "epoch": 0.07465624702635835, + "grad_norm": 1.4173908233642578, + "learning_rate": 4.9315645007434885e-05, + "loss": 5.3793, + "step": 12553 + }, + { + "epoch": 0.07466219430963936, + "grad_norm": 1.4075239896774292, + "learning_rate": 4.93155364602517e-05, + "loss": 5.4409, + "step": 12554 + }, + { + "epoch": 0.07466814159292036, + "grad_norm": 1.3041841983795166, + "learning_rate": 4.9315427904580216e-05, + "loss": 5.5285, + "step": 12555 + }, + { + "epoch": 0.07467408887620135, + "grad_norm": 1.4277441501617432, + "learning_rate": 4.9315319340420465e-05, + "loss": 5.5017, + "step": 12556 + }, + { + "epoch": 0.07468003615948235, + "grad_norm": 1.407895803451538, + "learning_rate": 4.931521076777248e-05, + "loss": 5.3675, + "step": 12557 + }, + { + "epoch": 0.07468598344276335, + "grad_norm": 1.429131031036377, + "learning_rate": 4.931510218663632e-05, + "loss": 5.3712, + "step": 12558 + }, + { + "epoch": 0.07469193072604434, + "grad_norm": 1.7229793071746826, + "learning_rate": 4.9314993597011995e-05, + "loss": 5.4513, + "step": 12559 + }, + { + "epoch": 0.07469787800932534, + "grad_norm": 1.5961774587631226, + "learning_rate": 4.9314884998899565e-05, + "loss": 5.5478, + "step": 12560 + }, + { + "epoch": 0.07470382529260634, + "grad_norm": 1.4570807218551636, + "learning_rate": 4.931477639229906e-05, + "loss": 5.3973, + "step": 12561 + }, + { + "epoch": 0.07470977257588733, + "grad_norm": 1.6308903694152832, + "learning_rate": 4.931466777721052e-05, + "loss": 5.1951, + "step": 12562 + }, + { + "epoch": 0.07471571985916833, + "grad_norm": 1.438491940498352, + "learning_rate": 4.9314559153633974e-05, + "loss": 5.4237, + "step": 12563 + }, + { + "epoch": 0.07472166714244934, + "grad_norm": 1.7219120264053345, + "learning_rate": 4.931445052156947e-05, + "loss": 5.2303, + "step": 12564 + }, + { + "epoch": 0.07472761442573032, + "grad_norm": 1.557895302772522, + "learning_rate": 4.931434188101704e-05, + "loss": 5.2383, + "step": 12565 + }, + { + "epoch": 0.07473356170901133, + "grad_norm": 1.3585479259490967, + "learning_rate": 4.931423323197672e-05, + "loss": 5.2698, + "step": 12566 + }, + { + "epoch": 0.07473950899229233, + "grad_norm": 1.643608808517456, + "learning_rate": 4.931412457444857e-05, + "loss": 5.285, + "step": 12567 + }, + { + "epoch": 0.07474545627557332, + "grad_norm": 1.7847453355789185, + "learning_rate": 4.93140159084326e-05, + "loss": 5.413, + "step": 12568 + }, + { + "epoch": 0.07475140355885432, + "grad_norm": 1.5010985136032104, + "learning_rate": 4.931390723392886e-05, + "loss": 5.3665, + "step": 12569 + }, + { + "epoch": 0.0747573508421353, + "grad_norm": 1.3640403747558594, + "learning_rate": 4.931379855093738e-05, + "loss": 5.2253, + "step": 12570 + }, + { + "epoch": 0.07476329812541631, + "grad_norm": 1.4886012077331543, + "learning_rate": 4.9313689859458214e-05, + "loss": 5.5954, + "step": 12571 + }, + { + "epoch": 0.07476924540869731, + "grad_norm": 1.6626142263412476, + "learning_rate": 4.931358115949138e-05, + "loss": 5.3558, + "step": 12572 + }, + { + "epoch": 0.0747751926919783, + "grad_norm": 1.6350460052490234, + "learning_rate": 4.931347245103693e-05, + "loss": 5.3222, + "step": 12573 + }, + { + "epoch": 0.0747811399752593, + "grad_norm": 1.586182951927185, + "learning_rate": 4.93133637340949e-05, + "loss": 5.2056, + "step": 12574 + }, + { + "epoch": 0.0747870872585403, + "grad_norm": 1.6866692304611206, + "learning_rate": 4.931325500866532e-05, + "loss": 5.2698, + "step": 12575 + }, + { + "epoch": 0.07479303454182129, + "grad_norm": 1.4165509939193726, + "learning_rate": 4.9313146274748235e-05, + "loss": 5.2572, + "step": 12576 + }, + { + "epoch": 0.07479898182510229, + "grad_norm": 1.6259573698043823, + "learning_rate": 4.931303753234369e-05, + "loss": 5.2585, + "step": 12577 + }, + { + "epoch": 0.0748049291083833, + "grad_norm": 1.4159972667694092, + "learning_rate": 4.931292878145171e-05, + "loss": 5.1748, + "step": 12578 + }, + { + "epoch": 0.07481087639166428, + "grad_norm": 1.3880494832992554, + "learning_rate": 4.931282002207234e-05, + "loss": 5.2181, + "step": 12579 + }, + { + "epoch": 0.07481682367494528, + "grad_norm": 1.4466285705566406, + "learning_rate": 4.931271125420561e-05, + "loss": 5.2041, + "step": 12580 + }, + { + "epoch": 0.07482277095822629, + "grad_norm": 1.5111972093582153, + "learning_rate": 4.931260247785157e-05, + "loss": 5.2388, + "step": 12581 + }, + { + "epoch": 0.07482871824150727, + "grad_norm": 1.368296504020691, + "learning_rate": 4.9312493693010245e-05, + "loss": 5.0964, + "step": 12582 + }, + { + "epoch": 0.07483466552478828, + "grad_norm": 1.5604379177093506, + "learning_rate": 4.931238489968168e-05, + "loss": 5.2031, + "step": 12583 + }, + { + "epoch": 0.07484061280806928, + "grad_norm": 1.6104371547698975, + "learning_rate": 4.9312276097865916e-05, + "loss": 5.1122, + "step": 12584 + }, + { + "epoch": 0.07484656009135027, + "grad_norm": 1.5082486867904663, + "learning_rate": 4.931216728756299e-05, + "loss": 5.2092, + "step": 12585 + }, + { + "epoch": 0.07485250737463127, + "grad_norm": 2.1802000999450684, + "learning_rate": 4.931205846877293e-05, + "loss": 5.859, + "step": 12586 + }, + { + "epoch": 0.07485845465791227, + "grad_norm": 1.7069321870803833, + "learning_rate": 4.931194964149579e-05, + "loss": 4.9751, + "step": 12587 + }, + { + "epoch": 0.07486440194119326, + "grad_norm": 1.3614740371704102, + "learning_rate": 4.931184080573159e-05, + "loss": 5.2341, + "step": 12588 + }, + { + "epoch": 0.07487034922447426, + "grad_norm": 1.3952617645263672, + "learning_rate": 4.931173196148039e-05, + "loss": 5.0472, + "step": 12589 + }, + { + "epoch": 0.07487629650775526, + "grad_norm": 1.435829758644104, + "learning_rate": 4.9311623108742205e-05, + "loss": 5.0165, + "step": 12590 + }, + { + "epoch": 0.07488224379103625, + "grad_norm": 1.3875840902328491, + "learning_rate": 4.931151424751709e-05, + "loss": 5.5455, + "step": 12591 + }, + { + "epoch": 0.07488819107431725, + "grad_norm": 1.4364032745361328, + "learning_rate": 4.931140537780508e-05, + "loss": 5.5106, + "step": 12592 + }, + { + "epoch": 0.07489413835759826, + "grad_norm": 1.5878878831863403, + "learning_rate": 4.9311296499606194e-05, + "loss": 5.2372, + "step": 12593 + }, + { + "epoch": 0.07490008564087924, + "grad_norm": 1.5724025964736938, + "learning_rate": 4.9311187612920495e-05, + "loss": 5.3771, + "step": 12594 + }, + { + "epoch": 0.07490603292416025, + "grad_norm": 1.4630738496780396, + "learning_rate": 4.9311078717748014e-05, + "loss": 5.3378, + "step": 12595 + }, + { + "epoch": 0.07491198020744125, + "grad_norm": 1.4438437223434448, + "learning_rate": 4.931096981408878e-05, + "loss": 5.3019, + "step": 12596 + }, + { + "epoch": 0.07491792749072224, + "grad_norm": 1.674564242362976, + "learning_rate": 4.931086090194285e-05, + "loss": 5.2957, + "step": 12597 + }, + { + "epoch": 0.07492387477400324, + "grad_norm": 1.237748384475708, + "learning_rate": 4.9310751981310236e-05, + "loss": 5.1994, + "step": 12598 + }, + { + "epoch": 0.07492982205728423, + "grad_norm": 1.5828932523727417, + "learning_rate": 4.9310643052191e-05, + "loss": 5.2326, + "step": 12599 + }, + { + "epoch": 0.07493576934056523, + "grad_norm": 1.2774053812026978, + "learning_rate": 4.931053411458516e-05, + "loss": 5.2496, + "step": 12600 + }, + { + "epoch": 0.07494171662384623, + "grad_norm": 1.2986499071121216, + "learning_rate": 4.9310425168492766e-05, + "loss": 5.3061, + "step": 12601 + }, + { + "epoch": 0.07494766390712722, + "grad_norm": 1.3973673582077026, + "learning_rate": 4.931031621391386e-05, + "loss": 5.1437, + "step": 12602 + }, + { + "epoch": 0.07495361119040822, + "grad_norm": 1.4217787981033325, + "learning_rate": 4.9310207250848475e-05, + "loss": 5.1636, + "step": 12603 + }, + { + "epoch": 0.07495955847368922, + "grad_norm": 1.5062726736068726, + "learning_rate": 4.9310098279296634e-05, + "loss": 5.2944, + "step": 12604 + }, + { + "epoch": 0.07496550575697021, + "grad_norm": 1.4844671487808228, + "learning_rate": 4.9309989299258404e-05, + "loss": 5.1899, + "step": 12605 + }, + { + "epoch": 0.07497145304025121, + "grad_norm": 1.3542430400848389, + "learning_rate": 4.9309880310733805e-05, + "loss": 5.1636, + "step": 12606 + }, + { + "epoch": 0.07497740032353221, + "grad_norm": 1.58526611328125, + "learning_rate": 4.930977131372287e-05, + "loss": 5.5748, + "step": 12607 + }, + { + "epoch": 0.0749833476068132, + "grad_norm": 1.6003972291946411, + "learning_rate": 4.930966230822564e-05, + "loss": 5.3992, + "step": 12608 + }, + { + "epoch": 0.0749892948900942, + "grad_norm": 1.6475237607955933, + "learning_rate": 4.930955329424218e-05, + "loss": 5.4515, + "step": 12609 + }, + { + "epoch": 0.0749952421733752, + "grad_norm": 1.5395694971084595, + "learning_rate": 4.9309444271772486e-05, + "loss": 5.5117, + "step": 12610 + }, + { + "epoch": 0.0750011894566562, + "grad_norm": 1.3863389492034912, + "learning_rate": 4.930933524081663e-05, + "loss": 5.5771, + "step": 12611 + }, + { + "epoch": 0.0750071367399372, + "grad_norm": 1.431830644607544, + "learning_rate": 4.9309226201374626e-05, + "loss": 5.412, + "step": 12612 + }, + { + "epoch": 0.0750130840232182, + "grad_norm": 1.4647631645202637, + "learning_rate": 4.930911715344653e-05, + "loss": 5.1849, + "step": 12613 + }, + { + "epoch": 0.07501903130649919, + "grad_norm": 2.126068592071533, + "learning_rate": 4.930900809703237e-05, + "loss": 5.1712, + "step": 12614 + }, + { + "epoch": 0.07502497858978019, + "grad_norm": 1.3078912496566772, + "learning_rate": 4.9308899032132183e-05, + "loss": 5.3937, + "step": 12615 + }, + { + "epoch": 0.07503092587306119, + "grad_norm": 1.2535938024520874, + "learning_rate": 4.9308789958746016e-05, + "loss": 5.5708, + "step": 12616 + }, + { + "epoch": 0.07503687315634218, + "grad_norm": 1.3942710161209106, + "learning_rate": 4.9308680876873894e-05, + "loss": 5.5907, + "step": 12617 + }, + { + "epoch": 0.07504282043962318, + "grad_norm": 1.3061814308166504, + "learning_rate": 4.930857178651587e-05, + "loss": 5.2515, + "step": 12618 + }, + { + "epoch": 0.07504876772290418, + "grad_norm": 1.8493753671646118, + "learning_rate": 4.930846268767197e-05, + "loss": 4.9958, + "step": 12619 + }, + { + "epoch": 0.07505471500618517, + "grad_norm": 1.5966380834579468, + "learning_rate": 4.9308353580342234e-05, + "loss": 4.8784, + "step": 12620 + }, + { + "epoch": 0.07506066228946617, + "grad_norm": 1.6849051713943481, + "learning_rate": 4.930824446452671e-05, + "loss": 5.1549, + "step": 12621 + }, + { + "epoch": 0.07506660957274718, + "grad_norm": 1.5844405889511108, + "learning_rate": 4.9308135340225426e-05, + "loss": 4.9807, + "step": 12622 + }, + { + "epoch": 0.07507255685602816, + "grad_norm": 1.520621418952942, + "learning_rate": 4.9308026207438424e-05, + "loss": 5.2237, + "step": 12623 + }, + { + "epoch": 0.07507850413930917, + "grad_norm": 1.5273483991622925, + "learning_rate": 4.9307917066165744e-05, + "loss": 5.4053, + "step": 12624 + }, + { + "epoch": 0.07508445142259017, + "grad_norm": 1.7137775421142578, + "learning_rate": 4.9307807916407414e-05, + "loss": 5.0427, + "step": 12625 + }, + { + "epoch": 0.07509039870587116, + "grad_norm": 1.7140679359436035, + "learning_rate": 4.930769875816348e-05, + "loss": 5.0354, + "step": 12626 + }, + { + "epoch": 0.07509634598915216, + "grad_norm": 1.5592498779296875, + "learning_rate": 4.930758959143399e-05, + "loss": 4.9663, + "step": 12627 + }, + { + "epoch": 0.07510229327243315, + "grad_norm": 1.4611366987228394, + "learning_rate": 4.930748041621896e-05, + "loss": 4.9469, + "step": 12628 + }, + { + "epoch": 0.07510824055571415, + "grad_norm": 1.4682248830795288, + "learning_rate": 4.930737123251844e-05, + "loss": 5.0217, + "step": 12629 + }, + { + "epoch": 0.07511418783899515, + "grad_norm": 1.5643991231918335, + "learning_rate": 4.9307262040332474e-05, + "loss": 5.0488, + "step": 12630 + }, + { + "epoch": 0.07512013512227614, + "grad_norm": 1.680577278137207, + "learning_rate": 4.9307152839661094e-05, + "loss": 5.0813, + "step": 12631 + }, + { + "epoch": 0.07512608240555714, + "grad_norm": 1.9138245582580566, + "learning_rate": 4.9307043630504334e-05, + "loss": 5.0965, + "step": 12632 + }, + { + "epoch": 0.07513202968883814, + "grad_norm": 1.7382584810256958, + "learning_rate": 4.9306934412862236e-05, + "loss": 5.3726, + "step": 12633 + }, + { + "epoch": 0.07513797697211913, + "grad_norm": 1.684213638305664, + "learning_rate": 4.930682518673484e-05, + "loss": 5.2511, + "step": 12634 + }, + { + "epoch": 0.07514392425540013, + "grad_norm": 1.6976017951965332, + "learning_rate": 4.9306715952122185e-05, + "loss": 4.9669, + "step": 12635 + }, + { + "epoch": 0.07514987153868113, + "grad_norm": 1.526212453842163, + "learning_rate": 4.930660670902431e-05, + "loss": 4.9405, + "step": 12636 + }, + { + "epoch": 0.07515581882196212, + "grad_norm": 1.6616593599319458, + "learning_rate": 4.930649745744124e-05, + "loss": 5.0266, + "step": 12637 + }, + { + "epoch": 0.07516176610524312, + "grad_norm": 1.7911401987075806, + "learning_rate": 4.930638819737303e-05, + "loss": 4.8774, + "step": 12638 + }, + { + "epoch": 0.07516771338852413, + "grad_norm": 1.3613603115081787, + "learning_rate": 4.93062789288197e-05, + "loss": 5.4048, + "step": 12639 + }, + { + "epoch": 0.07517366067180511, + "grad_norm": 1.5945172309875488, + "learning_rate": 4.930616965178131e-05, + "loss": 5.1918, + "step": 12640 + }, + { + "epoch": 0.07517960795508612, + "grad_norm": 1.816091775894165, + "learning_rate": 4.930606036625789e-05, + "loss": 5.3138, + "step": 12641 + }, + { + "epoch": 0.07518555523836712, + "grad_norm": 1.642877459526062, + "learning_rate": 4.930595107224947e-05, + "loss": 5.2438, + "step": 12642 + }, + { + "epoch": 0.07519150252164811, + "grad_norm": 1.8904980421066284, + "learning_rate": 4.930584176975609e-05, + "loss": 5.1565, + "step": 12643 + }, + { + "epoch": 0.07519744980492911, + "grad_norm": 1.6247447729110718, + "learning_rate": 4.93057324587778e-05, + "loss": 5.1795, + "step": 12644 + }, + { + "epoch": 0.07520339708821011, + "grad_norm": 1.4699510335922241, + "learning_rate": 4.930562313931461e-05, + "loss": 5.3628, + "step": 12645 + }, + { + "epoch": 0.0752093443714911, + "grad_norm": 1.537920355796814, + "learning_rate": 4.93055138113666e-05, + "loss": 5.492, + "step": 12646 + }, + { + "epoch": 0.0752152916547721, + "grad_norm": 1.3268204927444458, + "learning_rate": 4.930540447493378e-05, + "loss": 5.2169, + "step": 12647 + }, + { + "epoch": 0.0752212389380531, + "grad_norm": 1.627005934715271, + "learning_rate": 4.930529513001619e-05, + "loss": 5.9358, + "step": 12648 + }, + { + "epoch": 0.07522718622133409, + "grad_norm": 1.445926547050476, + "learning_rate": 4.930518577661388e-05, + "loss": 5.0762, + "step": 12649 + }, + { + "epoch": 0.0752331335046151, + "grad_norm": 1.5958713293075562, + "learning_rate": 4.930507641472688e-05, + "loss": 5.2345, + "step": 12650 + }, + { + "epoch": 0.0752390807878961, + "grad_norm": 1.470540165901184, + "learning_rate": 4.9304967044355225e-05, + "loss": 5.1259, + "step": 12651 + }, + { + "epoch": 0.07524502807117708, + "grad_norm": 1.4679489135742188, + "learning_rate": 4.930485766549896e-05, + "loss": 5.1456, + "step": 12652 + }, + { + "epoch": 0.07525097535445809, + "grad_norm": 1.3032207489013672, + "learning_rate": 4.930474827815812e-05, + "loss": 5.1479, + "step": 12653 + }, + { + "epoch": 0.07525692263773909, + "grad_norm": 1.4676958322525024, + "learning_rate": 4.930463888233274e-05, + "loss": 5.173, + "step": 12654 + }, + { + "epoch": 0.07526286992102008, + "grad_norm": 1.5788590908050537, + "learning_rate": 4.930452947802286e-05, + "loss": 5.0608, + "step": 12655 + }, + { + "epoch": 0.07526881720430108, + "grad_norm": 1.4392722845077515, + "learning_rate": 4.9304420065228526e-05, + "loss": 5.1209, + "step": 12656 + }, + { + "epoch": 0.07527476448758207, + "grad_norm": 1.4725446701049805, + "learning_rate": 4.930431064394977e-05, + "loss": 5.1249, + "step": 12657 + }, + { + "epoch": 0.07528071177086307, + "grad_norm": 1.4239790439605713, + "learning_rate": 4.930420121418663e-05, + "loss": 5.0262, + "step": 12658 + }, + { + "epoch": 0.07528665905414407, + "grad_norm": 1.3037468194961548, + "learning_rate": 4.930409177593914e-05, + "loss": 5.1158, + "step": 12659 + }, + { + "epoch": 0.07529260633742506, + "grad_norm": 1.430015206336975, + "learning_rate": 4.930398232920734e-05, + "loss": 5.1362, + "step": 12660 + }, + { + "epoch": 0.07529855362070606, + "grad_norm": 1.2381033897399902, + "learning_rate": 4.930387287399127e-05, + "loss": 5.2351, + "step": 12661 + }, + { + "epoch": 0.07530450090398706, + "grad_norm": 1.4459912776947021, + "learning_rate": 4.930376341029098e-05, + "loss": 5.1413, + "step": 12662 + }, + { + "epoch": 0.07531044818726805, + "grad_norm": 1.4875576496124268, + "learning_rate": 4.93036539381065e-05, + "loss": 5.0556, + "step": 12663 + }, + { + "epoch": 0.07531639547054905, + "grad_norm": 1.1632124185562134, + "learning_rate": 4.930354445743785e-05, + "loss": 5.2317, + "step": 12664 + }, + { + "epoch": 0.07532234275383005, + "grad_norm": 1.324722170829773, + "learning_rate": 4.9303434968285096e-05, + "loss": 5.0562, + "step": 12665 + }, + { + "epoch": 0.07532829003711104, + "grad_norm": 1.4292213916778564, + "learning_rate": 4.9303325470648254e-05, + "loss": 5.0991, + "step": 12666 + }, + { + "epoch": 0.07533423732039204, + "grad_norm": 1.4528483152389526, + "learning_rate": 4.930321596452738e-05, + "loss": 5.0675, + "step": 12667 + }, + { + "epoch": 0.07534018460367305, + "grad_norm": 1.5489269495010376, + "learning_rate": 4.9303106449922504e-05, + "loss": 4.9073, + "step": 12668 + }, + { + "epoch": 0.07534613188695403, + "grad_norm": 1.440854787826538, + "learning_rate": 4.9302996926833664e-05, + "loss": 5.0401, + "step": 12669 + }, + { + "epoch": 0.07535207917023504, + "grad_norm": 1.4586740732192993, + "learning_rate": 4.9302887395260894e-05, + "loss": 5.0483, + "step": 12670 + }, + { + "epoch": 0.07535802645351604, + "grad_norm": 1.390376091003418, + "learning_rate": 4.930277785520424e-05, + "loss": 5.1417, + "step": 12671 + }, + { + "epoch": 0.07536397373679703, + "grad_norm": 1.296410083770752, + "learning_rate": 4.9302668306663736e-05, + "loss": 5.461, + "step": 12672 + }, + { + "epoch": 0.07536992102007803, + "grad_norm": 1.5190175771713257, + "learning_rate": 4.930255874963943e-05, + "loss": 5.4972, + "step": 12673 + }, + { + "epoch": 0.07537586830335903, + "grad_norm": 1.4567232131958008, + "learning_rate": 4.930244918413134e-05, + "loss": 5.1921, + "step": 12674 + }, + { + "epoch": 0.07538181558664002, + "grad_norm": 1.7850147485733032, + "learning_rate": 4.930233961013953e-05, + "loss": 5.0658, + "step": 12675 + }, + { + "epoch": 0.07538776286992102, + "grad_norm": 1.5736637115478516, + "learning_rate": 4.930223002766401e-05, + "loss": 5.6874, + "step": 12676 + }, + { + "epoch": 0.07539371015320202, + "grad_norm": 1.5202080011367798, + "learning_rate": 4.9302120436704836e-05, + "loss": 5.7279, + "step": 12677 + }, + { + "epoch": 0.07539965743648301, + "grad_norm": 1.4259493350982666, + "learning_rate": 4.930201083726205e-05, + "loss": 5.5445, + "step": 12678 + }, + { + "epoch": 0.07540560471976401, + "grad_norm": 1.5141973495483398, + "learning_rate": 4.9301901229335674e-05, + "loss": 5.5086, + "step": 12679 + }, + { + "epoch": 0.07541155200304502, + "grad_norm": 1.5044218301773071, + "learning_rate": 4.930179161292576e-05, + "loss": 5.4279, + "step": 12680 + }, + { + "epoch": 0.075417499286326, + "grad_norm": 1.5342620611190796, + "learning_rate": 4.930168198803234e-05, + "loss": 5.0885, + "step": 12681 + }, + { + "epoch": 0.075423446569607, + "grad_norm": 1.8139567375183105, + "learning_rate": 4.930157235465546e-05, + "loss": 5.5586, + "step": 12682 + }, + { + "epoch": 0.07542939385288801, + "grad_norm": 1.606778621673584, + "learning_rate": 4.9301462712795144e-05, + "loss": 5.4007, + "step": 12683 + }, + { + "epoch": 0.075435341136169, + "grad_norm": 1.6451623439788818, + "learning_rate": 4.930135306245144e-05, + "loss": 5.2882, + "step": 12684 + }, + { + "epoch": 0.07544128841945, + "grad_norm": 1.915991187095642, + "learning_rate": 4.9301243403624385e-05, + "loss": 5.0727, + "step": 12685 + }, + { + "epoch": 0.07544723570273099, + "grad_norm": 1.536456823348999, + "learning_rate": 4.930113373631402e-05, + "loss": 5.2154, + "step": 12686 + }, + { + "epoch": 0.07545318298601199, + "grad_norm": 1.5820670127868652, + "learning_rate": 4.9301024060520375e-05, + "loss": 5.0613, + "step": 12687 + }, + { + "epoch": 0.07545913026929299, + "grad_norm": 1.5905929803848267, + "learning_rate": 4.93009143762435e-05, + "loss": 5.08, + "step": 12688 + }, + { + "epoch": 0.07546507755257398, + "grad_norm": 1.5759062767028809, + "learning_rate": 4.9300804683483426e-05, + "loss": 5.0874, + "step": 12689 + }, + { + "epoch": 0.07547102483585498, + "grad_norm": 1.4619840383529663, + "learning_rate": 4.9300694982240186e-05, + "loss": 5.1803, + "step": 12690 + }, + { + "epoch": 0.07547697211913598, + "grad_norm": 1.2742846012115479, + "learning_rate": 4.930058527251383e-05, + "loss": 5.2721, + "step": 12691 + }, + { + "epoch": 0.07548291940241697, + "grad_norm": 1.4095741510391235, + "learning_rate": 4.930047555430439e-05, + "loss": 5.055, + "step": 12692 + }, + { + "epoch": 0.07548886668569797, + "grad_norm": 1.3399991989135742, + "learning_rate": 4.93003658276119e-05, + "loss": 5.0315, + "step": 12693 + }, + { + "epoch": 0.07549481396897897, + "grad_norm": 1.4075208902359009, + "learning_rate": 4.9300256092436407e-05, + "loss": 5.2634, + "step": 12694 + }, + { + "epoch": 0.07550076125225996, + "grad_norm": 1.681321144104004, + "learning_rate": 4.930014634877795e-05, + "loss": 4.9749, + "step": 12695 + }, + { + "epoch": 0.07550670853554096, + "grad_norm": 1.842136263847351, + "learning_rate": 4.9300036596636555e-05, + "loss": 4.797, + "step": 12696 + }, + { + "epoch": 0.07551265581882197, + "grad_norm": 1.8733257055282593, + "learning_rate": 4.929992683601228e-05, + "loss": 5.4726, + "step": 12697 + }, + { + "epoch": 0.07551860310210295, + "grad_norm": 1.747514009475708, + "learning_rate": 4.929981706690514e-05, + "loss": 5.1081, + "step": 12698 + }, + { + "epoch": 0.07552455038538396, + "grad_norm": 1.8107210397720337, + "learning_rate": 4.9299707289315187e-05, + "loss": 4.983, + "step": 12699 + }, + { + "epoch": 0.07553049766866496, + "grad_norm": 1.6319682598114014, + "learning_rate": 4.929959750324246e-05, + "loss": 4.9968, + "step": 12700 + }, + { + "epoch": 0.07553644495194595, + "grad_norm": 1.4653065204620361, + "learning_rate": 4.9299487708687e-05, + "loss": 5.3013, + "step": 12701 + }, + { + "epoch": 0.07554239223522695, + "grad_norm": 1.4665262699127197, + "learning_rate": 4.929937790564883e-05, + "loss": 5.4431, + "step": 12702 + }, + { + "epoch": 0.07554833951850795, + "grad_norm": 1.4962518215179443, + "learning_rate": 4.9299268094127996e-05, + "loss": 5.3692, + "step": 12703 + }, + { + "epoch": 0.07555428680178894, + "grad_norm": 1.7913219928741455, + "learning_rate": 4.929915827412454e-05, + "loss": 5.0082, + "step": 12704 + }, + { + "epoch": 0.07556023408506994, + "grad_norm": 1.5508856773376465, + "learning_rate": 4.929904844563851e-05, + "loss": 5.1501, + "step": 12705 + }, + { + "epoch": 0.07556618136835094, + "grad_norm": 1.5882935523986816, + "learning_rate": 4.929893860866993e-05, + "loss": 4.9579, + "step": 12706 + }, + { + "epoch": 0.07557212865163193, + "grad_norm": 1.4550399780273438, + "learning_rate": 4.9298828763218833e-05, + "loss": 5.0165, + "step": 12707 + }, + { + "epoch": 0.07557807593491293, + "grad_norm": 1.5075403451919556, + "learning_rate": 4.929871890928527e-05, + "loss": 4.933, + "step": 12708 + }, + { + "epoch": 0.07558402321819394, + "grad_norm": 1.7094134092330933, + "learning_rate": 4.929860904686928e-05, + "loss": 4.8842, + "step": 12709 + }, + { + "epoch": 0.07558997050147492, + "grad_norm": 1.5615170001983643, + "learning_rate": 4.929849917597089e-05, + "loss": 5.5301, + "step": 12710 + }, + { + "epoch": 0.07559591778475593, + "grad_norm": 1.6687208414077759, + "learning_rate": 4.929838929659015e-05, + "loss": 4.9325, + "step": 12711 + }, + { + "epoch": 0.07560186506803693, + "grad_norm": 1.3476423025131226, + "learning_rate": 4.9298279408727086e-05, + "loss": 5.1274, + "step": 12712 + }, + { + "epoch": 0.07560781235131792, + "grad_norm": 1.359786868095398, + "learning_rate": 4.929816951238175e-05, + "loss": 4.7549, + "step": 12713 + }, + { + "epoch": 0.07561375963459892, + "grad_norm": 1.305482029914856, + "learning_rate": 4.9298059607554184e-05, + "loss": 4.7371, + "step": 12714 + }, + { + "epoch": 0.0756197069178799, + "grad_norm": 1.408693790435791, + "learning_rate": 4.92979496942444e-05, + "loss": 5.0733, + "step": 12715 + }, + { + "epoch": 0.07562565420116091, + "grad_norm": 1.3604625463485718, + "learning_rate": 4.9297839772452456e-05, + "loss": 4.7947, + "step": 12716 + }, + { + "epoch": 0.07563160148444191, + "grad_norm": 1.4101814031600952, + "learning_rate": 4.929772984217839e-05, + "loss": 5.2003, + "step": 12717 + }, + { + "epoch": 0.0756375487677229, + "grad_norm": 1.4409375190734863, + "learning_rate": 4.929761990342224e-05, + "loss": 5.167, + "step": 12718 + }, + { + "epoch": 0.0756434960510039, + "grad_norm": 1.4309754371643066, + "learning_rate": 4.9297509956184044e-05, + "loss": 5.1499, + "step": 12719 + }, + { + "epoch": 0.0756494433342849, + "grad_norm": 1.6380341053009033, + "learning_rate": 4.929740000046382e-05, + "loss": 4.8282, + "step": 12720 + }, + { + "epoch": 0.07565539061756589, + "grad_norm": 1.6795456409454346, + "learning_rate": 4.929729003626164e-05, + "loss": 4.708, + "step": 12721 + }, + { + "epoch": 0.07566133790084689, + "grad_norm": 1.7367075681686401, + "learning_rate": 4.929718006357753e-05, + "loss": 5.3364, + "step": 12722 + }, + { + "epoch": 0.0756672851841279, + "grad_norm": 1.5842353105545044, + "learning_rate": 4.929707008241152e-05, + "loss": 5.2025, + "step": 12723 + }, + { + "epoch": 0.07567323246740888, + "grad_norm": 1.5129985809326172, + "learning_rate": 4.9296960092763657e-05, + "loss": 5.1788, + "step": 12724 + }, + { + "epoch": 0.07567917975068988, + "grad_norm": 1.4276295900344849, + "learning_rate": 4.929685009463397e-05, + "loss": 5.2597, + "step": 12725 + }, + { + "epoch": 0.07568512703397089, + "grad_norm": 1.499213457107544, + "learning_rate": 4.9296740088022506e-05, + "loss": 5.1778, + "step": 12726 + }, + { + "epoch": 0.07569107431725187, + "grad_norm": 1.4656083583831787, + "learning_rate": 4.92966300729293e-05, + "loss": 5.2689, + "step": 12727 + }, + { + "epoch": 0.07569702160053288, + "grad_norm": 1.6160268783569336, + "learning_rate": 4.9296520049354393e-05, + "loss": 5.1829, + "step": 12728 + }, + { + "epoch": 0.07570296888381388, + "grad_norm": 1.514891266822815, + "learning_rate": 4.929641001729782e-05, + "loss": 5.2586, + "step": 12729 + }, + { + "epoch": 0.07570891616709487, + "grad_norm": 1.4635345935821533, + "learning_rate": 4.929629997675963e-05, + "loss": 5.2159, + "step": 12730 + }, + { + "epoch": 0.07571486345037587, + "grad_norm": 1.704380750656128, + "learning_rate": 4.9296189927739846e-05, + "loss": 5.1068, + "step": 12731 + }, + { + "epoch": 0.07572081073365687, + "grad_norm": 1.5786374807357788, + "learning_rate": 4.929607987023851e-05, + "loss": 5.2306, + "step": 12732 + }, + { + "epoch": 0.07572675801693786, + "grad_norm": 1.5011721849441528, + "learning_rate": 4.929596980425567e-05, + "loss": 5.1594, + "step": 12733 + }, + { + "epoch": 0.07573270530021886, + "grad_norm": 1.4532456398010254, + "learning_rate": 4.9295859729791354e-05, + "loss": 5.0955, + "step": 12734 + }, + { + "epoch": 0.07573865258349986, + "grad_norm": 1.5734699964523315, + "learning_rate": 4.9295749646845604e-05, + "loss": 5.1523, + "step": 12735 + }, + { + "epoch": 0.07574459986678085, + "grad_norm": 1.578141450881958, + "learning_rate": 4.929563955541846e-05, + "loss": 5.0784, + "step": 12736 + }, + { + "epoch": 0.07575054715006185, + "grad_norm": 1.408524513244629, + "learning_rate": 4.929552945550996e-05, + "loss": 5.1411, + "step": 12737 + }, + { + "epoch": 0.07575649443334286, + "grad_norm": 1.4755773544311523, + "learning_rate": 4.929541934712014e-05, + "loss": 5.0666, + "step": 12738 + }, + { + "epoch": 0.07576244171662384, + "grad_norm": 1.5521161556243896, + "learning_rate": 4.929530923024904e-05, + "loss": 5.0938, + "step": 12739 + }, + { + "epoch": 0.07576838899990485, + "grad_norm": 1.4772706031799316, + "learning_rate": 4.929519910489671e-05, + "loss": 5.1178, + "step": 12740 + }, + { + "epoch": 0.07577433628318585, + "grad_norm": 1.2669662237167358, + "learning_rate": 4.9295088971063164e-05, + "loss": 5.2565, + "step": 12741 + }, + { + "epoch": 0.07578028356646684, + "grad_norm": 1.5846413373947144, + "learning_rate": 4.929497882874845e-05, + "loss": 5.2109, + "step": 12742 + }, + { + "epoch": 0.07578623084974784, + "grad_norm": 1.779228687286377, + "learning_rate": 4.929486867795262e-05, + "loss": 5.0196, + "step": 12743 + }, + { + "epoch": 0.07579217813302883, + "grad_norm": 1.6306418180465698, + "learning_rate": 4.92947585186757e-05, + "loss": 5.1982, + "step": 12744 + }, + { + "epoch": 0.07579812541630983, + "grad_norm": 1.5107831954956055, + "learning_rate": 4.9294648350917726e-05, + "loss": 5.0652, + "step": 12745 + }, + { + "epoch": 0.07580407269959083, + "grad_norm": 1.3846759796142578, + "learning_rate": 4.9294538174678744e-05, + "loss": 5.0322, + "step": 12746 + }, + { + "epoch": 0.07581001998287182, + "grad_norm": 1.4558676481246948, + "learning_rate": 4.9294427989958794e-05, + "loss": 4.9626, + "step": 12747 + }, + { + "epoch": 0.07581596726615282, + "grad_norm": 1.3155016899108887, + "learning_rate": 4.92943177967579e-05, + "loss": 4.9965, + "step": 12748 + }, + { + "epoch": 0.07582191454943382, + "grad_norm": 1.3237980604171753, + "learning_rate": 4.9294207595076125e-05, + "loss": 4.9697, + "step": 12749 + }, + { + "epoch": 0.07582786183271481, + "grad_norm": 1.4439423084259033, + "learning_rate": 4.929409738491349e-05, + "loss": 5.0636, + "step": 12750 + }, + { + "epoch": 0.07583380911599581, + "grad_norm": 1.4793460369110107, + "learning_rate": 4.9293987166270024e-05, + "loss": 5.1122, + "step": 12751 + }, + { + "epoch": 0.07583975639927681, + "grad_norm": 1.5353471040725708, + "learning_rate": 4.929387693914578e-05, + "loss": 5.174, + "step": 12752 + }, + { + "epoch": 0.0758457036825578, + "grad_norm": 1.690537452697754, + "learning_rate": 4.929376670354081e-05, + "loss": 5.1515, + "step": 12753 + }, + { + "epoch": 0.0758516509658388, + "grad_norm": 1.4602952003479004, + "learning_rate": 4.9293656459455124e-05, + "loss": 5.1244, + "step": 12754 + }, + { + "epoch": 0.0758575982491198, + "grad_norm": 1.5871785879135132, + "learning_rate": 4.929354620688878e-05, + "loss": 5.2856, + "step": 12755 + }, + { + "epoch": 0.0758635455324008, + "grad_norm": 1.588065505027771, + "learning_rate": 4.92934359458418e-05, + "loss": 5.3694, + "step": 12756 + }, + { + "epoch": 0.0758694928156818, + "grad_norm": 1.5489270687103271, + "learning_rate": 4.929332567631424e-05, + "loss": 5.3546, + "step": 12757 + }, + { + "epoch": 0.0758754400989628, + "grad_norm": 1.493815541267395, + "learning_rate": 4.9293215398306136e-05, + "loss": 5.0878, + "step": 12758 + }, + { + "epoch": 0.07588138738224379, + "grad_norm": 1.3329546451568604, + "learning_rate": 4.929310511181751e-05, + "loss": 5.2171, + "step": 12759 + }, + { + "epoch": 0.07588733466552479, + "grad_norm": 1.5299288034439087, + "learning_rate": 4.929299481684842e-05, + "loss": 5.1695, + "step": 12760 + }, + { + "epoch": 0.07589328194880579, + "grad_norm": 1.5130664110183716, + "learning_rate": 4.9292884513398894e-05, + "loss": 5.3169, + "step": 12761 + }, + { + "epoch": 0.07589922923208678, + "grad_norm": 1.420339584350586, + "learning_rate": 4.9292774201468974e-05, + "loss": 5.1995, + "step": 12762 + }, + { + "epoch": 0.07590517651536778, + "grad_norm": 1.4740930795669556, + "learning_rate": 4.9292663881058696e-05, + "loss": 5.3321, + "step": 12763 + }, + { + "epoch": 0.07591112379864878, + "grad_norm": 1.448968768119812, + "learning_rate": 4.92925535521681e-05, + "loss": 5.1292, + "step": 12764 + }, + { + "epoch": 0.07591707108192977, + "grad_norm": 1.3219209909439087, + "learning_rate": 4.929244321479722e-05, + "loss": 5.1873, + "step": 12765 + }, + { + "epoch": 0.07592301836521077, + "grad_norm": 1.3336325883865356, + "learning_rate": 4.929233286894611e-05, + "loss": 5.248, + "step": 12766 + }, + { + "epoch": 0.07592896564849178, + "grad_norm": 1.4230278730392456, + "learning_rate": 4.9292222514614795e-05, + "loss": 5.2072, + "step": 12767 + }, + { + "epoch": 0.07593491293177276, + "grad_norm": 1.4522627592086792, + "learning_rate": 4.929211215180331e-05, + "loss": 5.4323, + "step": 12768 + }, + { + "epoch": 0.07594086021505377, + "grad_norm": 1.4863537549972534, + "learning_rate": 4.929200178051171e-05, + "loss": 5.241, + "step": 12769 + }, + { + "epoch": 0.07594680749833477, + "grad_norm": 1.7619402408599854, + "learning_rate": 4.929189140074001e-05, + "loss": 5.4853, + "step": 12770 + }, + { + "epoch": 0.07595275478161576, + "grad_norm": 1.6116011142730713, + "learning_rate": 4.929178101248827e-05, + "loss": 5.4793, + "step": 12771 + }, + { + "epoch": 0.07595870206489676, + "grad_norm": 1.8669662475585938, + "learning_rate": 4.9291670615756516e-05, + "loss": 5.4062, + "step": 12772 + }, + { + "epoch": 0.07596464934817775, + "grad_norm": 1.6439383029937744, + "learning_rate": 4.9291560210544796e-05, + "loss": 5.148, + "step": 12773 + }, + { + "epoch": 0.07597059663145875, + "grad_norm": 1.4800657033920288, + "learning_rate": 4.929144979685314e-05, + "loss": 5.3895, + "step": 12774 + }, + { + "epoch": 0.07597654391473975, + "grad_norm": 1.4091606140136719, + "learning_rate": 4.929133937468159e-05, + "loss": 5.3307, + "step": 12775 + }, + { + "epoch": 0.07598249119802074, + "grad_norm": 1.3786438703536987, + "learning_rate": 4.9291228944030176e-05, + "loss": 5.0786, + "step": 12776 + }, + { + "epoch": 0.07598843848130174, + "grad_norm": 1.6039817333221436, + "learning_rate": 4.929111850489896e-05, + "loss": 5.0606, + "step": 12777 + }, + { + "epoch": 0.07599438576458274, + "grad_norm": 1.5277283191680908, + "learning_rate": 4.929100805728796e-05, + "loss": 5.1949, + "step": 12778 + }, + { + "epoch": 0.07600033304786373, + "grad_norm": 1.6756436824798584, + "learning_rate": 4.929089760119722e-05, + "loss": 5.125, + "step": 12779 + }, + { + "epoch": 0.07600628033114473, + "grad_norm": 1.7082979679107666, + "learning_rate": 4.929078713662677e-05, + "loss": 5.1984, + "step": 12780 + }, + { + "epoch": 0.07601222761442573, + "grad_norm": 1.607293963432312, + "learning_rate": 4.929067666357666e-05, + "loss": 5.1809, + "step": 12781 + }, + { + "epoch": 0.07601817489770672, + "grad_norm": 1.5133613348007202, + "learning_rate": 4.9290566182046936e-05, + "loss": 5.2602, + "step": 12782 + }, + { + "epoch": 0.07602412218098772, + "grad_norm": 1.6572481393814087, + "learning_rate": 4.9290455692037616e-05, + "loss": 5.0959, + "step": 12783 + }, + { + "epoch": 0.07603006946426873, + "grad_norm": 1.6593372821807861, + "learning_rate": 4.929034519354876e-05, + "loss": 5.1672, + "step": 12784 + }, + { + "epoch": 0.07603601674754971, + "grad_norm": 1.4214340448379517, + "learning_rate": 4.929023468658038e-05, + "loss": 5.1064, + "step": 12785 + }, + { + "epoch": 0.07604196403083072, + "grad_norm": 1.4875116348266602, + "learning_rate": 4.929012417113255e-05, + "loss": 5.0657, + "step": 12786 + }, + { + "epoch": 0.07604791131411172, + "grad_norm": 1.7354154586791992, + "learning_rate": 4.929001364720527e-05, + "loss": 5.0415, + "step": 12787 + }, + { + "epoch": 0.0760538585973927, + "grad_norm": 1.5597622394561768, + "learning_rate": 4.928990311479861e-05, + "loss": 5.1404, + "step": 12788 + }, + { + "epoch": 0.07605980588067371, + "grad_norm": 1.6819382905960083, + "learning_rate": 4.928979257391258e-05, + "loss": 4.9487, + "step": 12789 + }, + { + "epoch": 0.07606575316395471, + "grad_norm": 1.4722174406051636, + "learning_rate": 4.928968202454725e-05, + "loss": 5.1677, + "step": 12790 + }, + { + "epoch": 0.0760717004472357, + "grad_norm": 1.5145434141159058, + "learning_rate": 4.9289571466702635e-05, + "loss": 5.2197, + "step": 12791 + }, + { + "epoch": 0.0760776477305167, + "grad_norm": 1.6052699089050293, + "learning_rate": 4.9289460900378784e-05, + "loss": 5.2508, + "step": 12792 + }, + { + "epoch": 0.0760835950137977, + "grad_norm": 1.3738253116607666, + "learning_rate": 4.9289350325575734e-05, + "loss": 5.1253, + "step": 12793 + }, + { + "epoch": 0.07608954229707869, + "grad_norm": 1.2580832242965698, + "learning_rate": 4.9289239742293524e-05, + "loss": 5.2497, + "step": 12794 + }, + { + "epoch": 0.0760954895803597, + "grad_norm": 1.6756019592285156, + "learning_rate": 4.928912915053219e-05, + "loss": 5.2471, + "step": 12795 + }, + { + "epoch": 0.0761014368636407, + "grad_norm": 1.6785964965820312, + "learning_rate": 4.928901855029177e-05, + "loss": 4.9893, + "step": 12796 + }, + { + "epoch": 0.07610738414692168, + "grad_norm": 1.6926941871643066, + "learning_rate": 4.92889079415723e-05, + "loss": 5.1558, + "step": 12797 + }, + { + "epoch": 0.07611333143020269, + "grad_norm": 1.4381680488586426, + "learning_rate": 4.9288797324373835e-05, + "loss": 4.9754, + "step": 12798 + }, + { + "epoch": 0.07611927871348369, + "grad_norm": 1.4430698156356812, + "learning_rate": 4.9288686698696393e-05, + "loss": 5.0197, + "step": 12799 + }, + { + "epoch": 0.07612522599676468, + "grad_norm": 1.4745796918869019, + "learning_rate": 4.928857606454002e-05, + "loss": 4.8857, + "step": 12800 + }, + { + "epoch": 0.07613117328004568, + "grad_norm": 1.5430330038070679, + "learning_rate": 4.928846542190477e-05, + "loss": 5.0407, + "step": 12801 + }, + { + "epoch": 0.07613712056332667, + "grad_norm": 1.6061021089553833, + "learning_rate": 4.928835477079066e-05, + "loss": 5.068, + "step": 12802 + }, + { + "epoch": 0.07614306784660767, + "grad_norm": 1.699568510055542, + "learning_rate": 4.9288244111197734e-05, + "loss": 4.9067, + "step": 12803 + }, + { + "epoch": 0.07614901512988867, + "grad_norm": 1.4770212173461914, + "learning_rate": 4.928813344312603e-05, + "loss": 5.0807, + "step": 12804 + }, + { + "epoch": 0.07615496241316966, + "grad_norm": 1.4657871723175049, + "learning_rate": 4.928802276657559e-05, + "loss": 5.1982, + "step": 12805 + }, + { + "epoch": 0.07616090969645066, + "grad_norm": 1.7897653579711914, + "learning_rate": 4.928791208154646e-05, + "loss": 5.1154, + "step": 12806 + }, + { + "epoch": 0.07616685697973166, + "grad_norm": 1.6905261278152466, + "learning_rate": 4.928780138803866e-05, + "loss": 5.3129, + "step": 12807 + }, + { + "epoch": 0.07617280426301265, + "grad_norm": 1.4763284921646118, + "learning_rate": 4.928769068605225e-05, + "loss": 5.2104, + "step": 12808 + }, + { + "epoch": 0.07617875154629365, + "grad_norm": 1.38632333278656, + "learning_rate": 4.928757997558725e-05, + "loss": 5.0857, + "step": 12809 + }, + { + "epoch": 0.07618469882957465, + "grad_norm": 1.5099103450775146, + "learning_rate": 4.928746925664371e-05, + "loss": 5.1264, + "step": 12810 + }, + { + "epoch": 0.07619064611285564, + "grad_norm": 1.285243272781372, + "learning_rate": 4.928735852922167e-05, + "loss": 5.1177, + "step": 12811 + }, + { + "epoch": 0.07619659339613664, + "grad_norm": 1.2749274969100952, + "learning_rate": 4.928724779332116e-05, + "loss": 5.0831, + "step": 12812 + }, + { + "epoch": 0.07620254067941765, + "grad_norm": 2.413712978363037, + "learning_rate": 4.928713704894222e-05, + "loss": 5.2416, + "step": 12813 + }, + { + "epoch": 0.07620848796269863, + "grad_norm": 1.602721929550171, + "learning_rate": 4.9287026296084895e-05, + "loss": 4.9799, + "step": 12814 + }, + { + "epoch": 0.07621443524597964, + "grad_norm": 1.515821099281311, + "learning_rate": 4.928691553474921e-05, + "loss": 5.034, + "step": 12815 + }, + { + "epoch": 0.07622038252926064, + "grad_norm": 1.3245290517807007, + "learning_rate": 4.928680476493523e-05, + "loss": 4.9559, + "step": 12816 + }, + { + "epoch": 0.07622632981254163, + "grad_norm": 1.5383784770965576, + "learning_rate": 4.928669398664297e-05, + "loss": 4.9085, + "step": 12817 + }, + { + "epoch": 0.07623227709582263, + "grad_norm": 1.4406317472457886, + "learning_rate": 4.928658319987247e-05, + "loss": 5.0073, + "step": 12818 + }, + { + "epoch": 0.07623822437910363, + "grad_norm": 1.6843304634094238, + "learning_rate": 4.928647240462378e-05, + "loss": 5.0262, + "step": 12819 + }, + { + "epoch": 0.07624417166238462, + "grad_norm": 1.655497431755066, + "learning_rate": 4.928636160089693e-05, + "loss": 5.0633, + "step": 12820 + }, + { + "epoch": 0.07625011894566562, + "grad_norm": 1.4143035411834717, + "learning_rate": 4.9286250788691973e-05, + "loss": 5.1131, + "step": 12821 + }, + { + "epoch": 0.07625606622894662, + "grad_norm": 1.5316637754440308, + "learning_rate": 4.9286139968008926e-05, + "loss": 5.2727, + "step": 12822 + }, + { + "epoch": 0.07626201351222761, + "grad_norm": 1.6708348989486694, + "learning_rate": 4.9286029138847844e-05, + "loss": 5.1469, + "step": 12823 + }, + { + "epoch": 0.07626796079550861, + "grad_norm": 1.48544180393219, + "learning_rate": 4.928591830120876e-05, + "loss": 5.0916, + "step": 12824 + }, + { + "epoch": 0.07627390807878962, + "grad_norm": 1.3884835243225098, + "learning_rate": 4.9285807455091715e-05, + "loss": 5.1451, + "step": 12825 + }, + { + "epoch": 0.0762798553620706, + "grad_norm": 1.7265839576721191, + "learning_rate": 4.928569660049674e-05, + "loss": 5.0478, + "step": 12826 + }, + { + "epoch": 0.0762858026453516, + "grad_norm": 1.678852915763855, + "learning_rate": 4.9285585737423875e-05, + "loss": 5.2127, + "step": 12827 + }, + { + "epoch": 0.07629174992863261, + "grad_norm": 1.4907126426696777, + "learning_rate": 4.928547486587317e-05, + "loss": 4.9706, + "step": 12828 + }, + { + "epoch": 0.0762976972119136, + "grad_norm": 1.610822319984436, + "learning_rate": 4.928536398584466e-05, + "loss": 5.2416, + "step": 12829 + }, + { + "epoch": 0.0763036444951946, + "grad_norm": 1.5226528644561768, + "learning_rate": 4.9285253097338375e-05, + "loss": 5.2665, + "step": 12830 + }, + { + "epoch": 0.07630959177847559, + "grad_norm": 1.6021392345428467, + "learning_rate": 4.928514220035436e-05, + "loss": 5.2129, + "step": 12831 + }, + { + "epoch": 0.07631553906175659, + "grad_norm": 1.4113723039627075, + "learning_rate": 4.928503129489265e-05, + "loss": 5.3568, + "step": 12832 + }, + { + "epoch": 0.07632148634503759, + "grad_norm": 1.7851402759552002, + "learning_rate": 4.928492038095329e-05, + "loss": 5.2028, + "step": 12833 + }, + { + "epoch": 0.07632743362831858, + "grad_norm": 2.0881283283233643, + "learning_rate": 4.928480945853631e-05, + "loss": 5.2721, + "step": 12834 + }, + { + "epoch": 0.07633338091159958, + "grad_norm": 1.376695156097412, + "learning_rate": 4.928469852764176e-05, + "loss": 5.0203, + "step": 12835 + }, + { + "epoch": 0.07633932819488058, + "grad_norm": 1.585046648979187, + "learning_rate": 4.928458758826967e-05, + "loss": 5.4281, + "step": 12836 + }, + { + "epoch": 0.07634527547816157, + "grad_norm": 1.7124192714691162, + "learning_rate": 4.928447664042008e-05, + "loss": 5.4921, + "step": 12837 + }, + { + "epoch": 0.07635122276144257, + "grad_norm": 1.5693449974060059, + "learning_rate": 4.928436568409304e-05, + "loss": 5.5729, + "step": 12838 + }, + { + "epoch": 0.07635717004472357, + "grad_norm": 2.072880506515503, + "learning_rate": 4.928425471928857e-05, + "loss": 5.1023, + "step": 12839 + }, + { + "epoch": 0.07636311732800456, + "grad_norm": 1.674325704574585, + "learning_rate": 4.928414374600672e-05, + "loss": 5.5319, + "step": 12840 + }, + { + "epoch": 0.07636906461128556, + "grad_norm": 1.3941127061843872, + "learning_rate": 4.9284032764247523e-05, + "loss": 5.4425, + "step": 12841 + }, + { + "epoch": 0.07637501189456657, + "grad_norm": 1.670743703842163, + "learning_rate": 4.9283921774011025e-05, + "loss": 5.2595, + "step": 12842 + }, + { + "epoch": 0.07638095917784755, + "grad_norm": 2.852534294128418, + "learning_rate": 4.928381077529726e-05, + "loss": 5.321, + "step": 12843 + }, + { + "epoch": 0.07638690646112856, + "grad_norm": 1.930977463722229, + "learning_rate": 4.928369976810626e-05, + "loss": 5.2649, + "step": 12844 + }, + { + "epoch": 0.07639285374440956, + "grad_norm": 1.8886314630508423, + "learning_rate": 4.928358875243808e-05, + "loss": 5.1882, + "step": 12845 + }, + { + "epoch": 0.07639880102769055, + "grad_norm": 1.793514609336853, + "learning_rate": 4.9283477728292745e-05, + "loss": 5.0946, + "step": 12846 + }, + { + "epoch": 0.07640474831097155, + "grad_norm": 1.8616431951522827, + "learning_rate": 4.9283366695670304e-05, + "loss": 5.1097, + "step": 12847 + }, + { + "epoch": 0.07641069559425255, + "grad_norm": 1.9281915426254272, + "learning_rate": 4.9283255654570785e-05, + "loss": 5.0054, + "step": 12848 + }, + { + "epoch": 0.07641664287753354, + "grad_norm": 2.036522150039673, + "learning_rate": 4.9283144604994234e-05, + "loss": 4.9115, + "step": 12849 + }, + { + "epoch": 0.07642259016081454, + "grad_norm": 1.7962864637374878, + "learning_rate": 4.928303354694069e-05, + "loss": 4.8951, + "step": 12850 + }, + { + "epoch": 0.07642853744409554, + "grad_norm": 2.1671249866485596, + "learning_rate": 4.9282922480410195e-05, + "loss": 5.1393, + "step": 12851 + }, + { + "epoch": 0.07643448472737653, + "grad_norm": 1.9870150089263916, + "learning_rate": 4.9282811405402774e-05, + "loss": 5.5572, + "step": 12852 + }, + { + "epoch": 0.07644043201065753, + "grad_norm": 2.1498360633850098, + "learning_rate": 4.928270032191847e-05, + "loss": 5.7031, + "step": 12853 + }, + { + "epoch": 0.07644637929393854, + "grad_norm": 2.06821870803833, + "learning_rate": 4.928258922995734e-05, + "loss": 5.723, + "step": 12854 + }, + { + "epoch": 0.07645232657721952, + "grad_norm": 2.283720016479492, + "learning_rate": 4.92824781295194e-05, + "loss": 5.2129, + "step": 12855 + }, + { + "epoch": 0.07645827386050053, + "grad_norm": 2.1862099170684814, + "learning_rate": 4.9282367020604704e-05, + "loss": 4.7535, + "step": 12856 + }, + { + "epoch": 0.07646422114378153, + "grad_norm": 1.7297099828720093, + "learning_rate": 4.928225590321328e-05, + "loss": 5.1965, + "step": 12857 + }, + { + "epoch": 0.07647016842706252, + "grad_norm": 2.0406720638275146, + "learning_rate": 4.9282144777345176e-05, + "loss": 5.289, + "step": 12858 + }, + { + "epoch": 0.07647611571034352, + "grad_norm": 1.8368127346038818, + "learning_rate": 4.928203364300042e-05, + "loss": 5.5448, + "step": 12859 + }, + { + "epoch": 0.0764820629936245, + "grad_norm": 1.837804913520813, + "learning_rate": 4.9281922500179054e-05, + "loss": 5.5284, + "step": 12860 + }, + { + "epoch": 0.07648801027690551, + "grad_norm": 1.7191063165664673, + "learning_rate": 4.928181134888113e-05, + "loss": 5.8212, + "step": 12861 + }, + { + "epoch": 0.07649395756018651, + "grad_norm": 1.757323980331421, + "learning_rate": 4.928170018910667e-05, + "loss": 5.8421, + "step": 12862 + }, + { + "epoch": 0.0764999048434675, + "grad_norm": 1.9213273525238037, + "learning_rate": 4.928158902085572e-05, + "loss": 5.1923, + "step": 12863 + }, + { + "epoch": 0.0765058521267485, + "grad_norm": 1.888006567955017, + "learning_rate": 4.928147784412832e-05, + "loss": 5.4282, + "step": 12864 + }, + { + "epoch": 0.0765117994100295, + "grad_norm": 1.555870771408081, + "learning_rate": 4.9281366658924506e-05, + "loss": 5.8256, + "step": 12865 + }, + { + "epoch": 0.07651774669331049, + "grad_norm": 1.8194485902786255, + "learning_rate": 4.9281255465244314e-05, + "loss": 5.5886, + "step": 12866 + }, + { + "epoch": 0.07652369397659149, + "grad_norm": 1.7867372035980225, + "learning_rate": 4.9281144263087795e-05, + "loss": 5.4818, + "step": 12867 + }, + { + "epoch": 0.0765296412598725, + "grad_norm": 1.8511155843734741, + "learning_rate": 4.928103305245497e-05, + "loss": 5.519, + "step": 12868 + }, + { + "epoch": 0.07653558854315348, + "grad_norm": 2.728428602218628, + "learning_rate": 4.928092183334589e-05, + "loss": 5.0085, + "step": 12869 + }, + { + "epoch": 0.07654153582643448, + "grad_norm": 2.5393402576446533, + "learning_rate": 4.92808106057606e-05, + "loss": 5.0862, + "step": 12870 + }, + { + "epoch": 0.07654748310971549, + "grad_norm": 2.494248151779175, + "learning_rate": 4.928069936969912e-05, + "loss": 5.5557, + "step": 12871 + }, + { + "epoch": 0.07655343039299647, + "grad_norm": 2.4287991523742676, + "learning_rate": 4.9280588125161496e-05, + "loss": 5.6646, + "step": 12872 + }, + { + "epoch": 0.07655937767627748, + "grad_norm": 2.188556432723999, + "learning_rate": 4.928047687214778e-05, + "loss": 5.6618, + "step": 12873 + }, + { + "epoch": 0.07656532495955848, + "grad_norm": 2.7367382049560547, + "learning_rate": 4.9280365610657996e-05, + "loss": 4.6788, + "step": 12874 + }, + { + "epoch": 0.07657127224283947, + "grad_norm": 2.492922067642212, + "learning_rate": 4.9280254340692187e-05, + "loss": 4.4132, + "step": 12875 + }, + { + "epoch": 0.07657721952612047, + "grad_norm": 2.361133575439453, + "learning_rate": 4.928014306225039e-05, + "loss": 4.3957, + "step": 12876 + }, + { + "epoch": 0.07658316680940147, + "grad_norm": 2.652127742767334, + "learning_rate": 4.9280031775332646e-05, + "loss": 4.4568, + "step": 12877 + }, + { + "epoch": 0.07658911409268246, + "grad_norm": 2.40895938873291, + "learning_rate": 4.9279920479938995e-05, + "loss": 4.6276, + "step": 12878 + }, + { + "epoch": 0.07659506137596346, + "grad_norm": 1.9418548345565796, + "learning_rate": 4.927980917606948e-05, + "loss": 5.6008, + "step": 12879 + }, + { + "epoch": 0.07660100865924446, + "grad_norm": 1.7706143856048584, + "learning_rate": 4.9279697863724125e-05, + "loss": 5.4946, + "step": 12880 + }, + { + "epoch": 0.07660695594252545, + "grad_norm": 2.856342077255249, + "learning_rate": 4.9279586542902986e-05, + "loss": 4.9182, + "step": 12881 + }, + { + "epoch": 0.07661290322580645, + "grad_norm": 2.713515043258667, + "learning_rate": 4.927947521360608e-05, + "loss": 5.2341, + "step": 12882 + }, + { + "epoch": 0.07661885050908745, + "grad_norm": 2.186169147491455, + "learning_rate": 4.927936387583348e-05, + "loss": 5.1348, + "step": 12883 + }, + { + "epoch": 0.07662479779236844, + "grad_norm": 2.3114492893218994, + "learning_rate": 4.9279252529585195e-05, + "loss": 5.0016, + "step": 12884 + }, + { + "epoch": 0.07663074507564945, + "grad_norm": 2.256502866744995, + "learning_rate": 4.927914117486128e-05, + "loss": 5.1759, + "step": 12885 + }, + { + "epoch": 0.07663669235893045, + "grad_norm": 2.281243324279785, + "learning_rate": 4.927902981166176e-05, + "loss": 5.1437, + "step": 12886 + }, + { + "epoch": 0.07664263964221144, + "grad_norm": 2.3553836345672607, + "learning_rate": 4.927891843998668e-05, + "loss": 5.1622, + "step": 12887 + }, + { + "epoch": 0.07664858692549244, + "grad_norm": 2.420192003250122, + "learning_rate": 4.927880705983609e-05, + "loss": 4.994, + "step": 12888 + }, + { + "epoch": 0.07665453420877343, + "grad_norm": 2.3391306400299072, + "learning_rate": 4.927869567121001e-05, + "loss": 4.9445, + "step": 12889 + }, + { + "epoch": 0.07666048149205443, + "grad_norm": 2.2093355655670166, + "learning_rate": 4.9278584274108484e-05, + "loss": 5.05, + "step": 12890 + }, + { + "epoch": 0.07666642877533543, + "grad_norm": 2.3378305435180664, + "learning_rate": 4.927847286853157e-05, + "loss": 4.8694, + "step": 12891 + }, + { + "epoch": 0.07667237605861642, + "grad_norm": 2.2110583782196045, + "learning_rate": 4.927836145447928e-05, + "loss": 4.8622, + "step": 12892 + }, + { + "epoch": 0.07667832334189742, + "grad_norm": 2.2865991592407227, + "learning_rate": 4.927825003195167e-05, + "loss": 4.9485, + "step": 12893 + }, + { + "epoch": 0.07668427062517842, + "grad_norm": 2.343135356903076, + "learning_rate": 4.927813860094878e-05, + "loss": 4.8874, + "step": 12894 + }, + { + "epoch": 0.07669021790845941, + "grad_norm": 2.1939613819122314, + "learning_rate": 4.927802716147063e-05, + "loss": 4.8349, + "step": 12895 + }, + { + "epoch": 0.07669616519174041, + "grad_norm": 2.866560697555542, + "learning_rate": 4.927791571351728e-05, + "loss": 5.1409, + "step": 12896 + }, + { + "epoch": 0.07670211247502141, + "grad_norm": 2.1052801609039307, + "learning_rate": 4.927780425708876e-05, + "loss": 5.3716, + "step": 12897 + }, + { + "epoch": 0.0767080597583024, + "grad_norm": 2.141184091567993, + "learning_rate": 4.9277692792185106e-05, + "loss": 5.2985, + "step": 12898 + }, + { + "epoch": 0.0767140070415834, + "grad_norm": 1.93148934841156, + "learning_rate": 4.927758131880636e-05, + "loss": 5.6222, + "step": 12899 + }, + { + "epoch": 0.0767199543248644, + "grad_norm": 1.8454651832580566, + "learning_rate": 4.927746983695256e-05, + "loss": 5.6966, + "step": 12900 + }, + { + "epoch": 0.0767259016081454, + "grad_norm": 1.764281153678894, + "learning_rate": 4.9277358346623746e-05, + "loss": 5.4979, + "step": 12901 + }, + { + "epoch": 0.0767318488914264, + "grad_norm": 1.6969131231307983, + "learning_rate": 4.9277246847819965e-05, + "loss": 5.5221, + "step": 12902 + }, + { + "epoch": 0.0767377961747074, + "grad_norm": 1.7118967771530151, + "learning_rate": 4.927713534054124e-05, + "loss": 5.6067, + "step": 12903 + }, + { + "epoch": 0.07674374345798839, + "grad_norm": 2.1508536338806152, + "learning_rate": 4.9277023824787625e-05, + "loss": 5.8241, + "step": 12904 + }, + { + "epoch": 0.07674969074126939, + "grad_norm": 1.8613126277923584, + "learning_rate": 4.927691230055914e-05, + "loss": 5.7141, + "step": 12905 + }, + { + "epoch": 0.07675563802455039, + "grad_norm": 1.8942763805389404, + "learning_rate": 4.927680076785585e-05, + "loss": 5.6909, + "step": 12906 + }, + { + "epoch": 0.07676158530783138, + "grad_norm": 1.8824634552001953, + "learning_rate": 4.927668922667777e-05, + "loss": 5.5055, + "step": 12907 + }, + { + "epoch": 0.07676753259111238, + "grad_norm": 1.8920915126800537, + "learning_rate": 4.927657767702495e-05, + "loss": 5.1783, + "step": 12908 + }, + { + "epoch": 0.07677347987439338, + "grad_norm": 1.8226712942123413, + "learning_rate": 4.927646611889743e-05, + "loss": 5.7529, + "step": 12909 + }, + { + "epoch": 0.07677942715767437, + "grad_norm": 1.88478684425354, + "learning_rate": 4.9276354552295245e-05, + "loss": 5.7034, + "step": 12910 + }, + { + "epoch": 0.07678537444095537, + "grad_norm": 1.6312634944915771, + "learning_rate": 4.927624297721844e-05, + "loss": 5.6476, + "step": 12911 + }, + { + "epoch": 0.07679132172423637, + "grad_norm": 1.5183994770050049, + "learning_rate": 4.927613139366704e-05, + "loss": 5.8517, + "step": 12912 + }, + { + "epoch": 0.07679726900751736, + "grad_norm": 1.6718844175338745, + "learning_rate": 4.92760198016411e-05, + "loss": 5.9619, + "step": 12913 + }, + { + "epoch": 0.07680321629079837, + "grad_norm": 2.575932741165161, + "learning_rate": 4.9275908201140654e-05, + "loss": 5.6903, + "step": 12914 + }, + { + "epoch": 0.07680916357407937, + "grad_norm": 2.2863197326660156, + "learning_rate": 4.927579659216574e-05, + "loss": 5.7517, + "step": 12915 + }, + { + "epoch": 0.07681511085736036, + "grad_norm": 2.231417417526245, + "learning_rate": 4.9275684974716384e-05, + "loss": 5.2323, + "step": 12916 + }, + { + "epoch": 0.07682105814064136, + "grad_norm": 1.9159691333770752, + "learning_rate": 4.927557334879265e-05, + "loss": 5.2548, + "step": 12917 + }, + { + "epoch": 0.07682700542392235, + "grad_norm": 1.6682984828948975, + "learning_rate": 4.927546171439455e-05, + "loss": 5.4639, + "step": 12918 + }, + { + "epoch": 0.07683295270720335, + "grad_norm": 2.1923654079437256, + "learning_rate": 4.927535007152215e-05, + "loss": 5.6016, + "step": 12919 + }, + { + "epoch": 0.07683889999048435, + "grad_norm": 2.2393245697021484, + "learning_rate": 4.9275238420175474e-05, + "loss": 5.9433, + "step": 12920 + }, + { + "epoch": 0.07684484727376534, + "grad_norm": 1.8611164093017578, + "learning_rate": 4.9275126760354565e-05, + "loss": 5.3477, + "step": 12921 + }, + { + "epoch": 0.07685079455704634, + "grad_norm": 1.902567982673645, + "learning_rate": 4.927501509205945e-05, + "loss": 5.4417, + "step": 12922 + }, + { + "epoch": 0.07685674184032734, + "grad_norm": 1.7735011577606201, + "learning_rate": 4.9274903415290184e-05, + "loss": 5.652, + "step": 12923 + }, + { + "epoch": 0.07686268912360833, + "grad_norm": 1.886060357093811, + "learning_rate": 4.927479173004681e-05, + "loss": 5.5927, + "step": 12924 + }, + { + "epoch": 0.07686863640688933, + "grad_norm": 1.8315941095352173, + "learning_rate": 4.927468003632935e-05, + "loss": 5.6559, + "step": 12925 + }, + { + "epoch": 0.07687458369017033, + "grad_norm": 1.7790045738220215, + "learning_rate": 4.927456833413784e-05, + "loss": 5.463, + "step": 12926 + }, + { + "epoch": 0.07688053097345132, + "grad_norm": 1.9559917449951172, + "learning_rate": 4.927445662347234e-05, + "loss": 5.6154, + "step": 12927 + }, + { + "epoch": 0.07688647825673232, + "grad_norm": 1.7274752855300903, + "learning_rate": 4.927434490433287e-05, + "loss": 5.5621, + "step": 12928 + }, + { + "epoch": 0.07689242554001333, + "grad_norm": 1.594190001487732, + "learning_rate": 4.9274233176719486e-05, + "loss": 5.4674, + "step": 12929 + }, + { + "epoch": 0.07689837282329431, + "grad_norm": 1.79281485080719, + "learning_rate": 4.927412144063222e-05, + "loss": 5.5166, + "step": 12930 + }, + { + "epoch": 0.07690432010657532, + "grad_norm": 1.6584967374801636, + "learning_rate": 4.92740096960711e-05, + "loss": 5.4249, + "step": 12931 + }, + { + "epoch": 0.07691026738985632, + "grad_norm": 1.8458021879196167, + "learning_rate": 4.927389794303617e-05, + "loss": 5.6073, + "step": 12932 + }, + { + "epoch": 0.0769162146731373, + "grad_norm": 1.5526570081710815, + "learning_rate": 4.927378618152748e-05, + "loss": 5.3992, + "step": 12933 + }, + { + "epoch": 0.07692216195641831, + "grad_norm": 1.6043710708618164, + "learning_rate": 4.927367441154507e-05, + "loss": 5.3786, + "step": 12934 + }, + { + "epoch": 0.07692810923969931, + "grad_norm": 1.6580268144607544, + "learning_rate": 4.927356263308896e-05, + "loss": 5.5177, + "step": 12935 + }, + { + "epoch": 0.0769340565229803, + "grad_norm": 1.7199897766113281, + "learning_rate": 4.9273450846159194e-05, + "loss": 5.4281, + "step": 12936 + }, + { + "epoch": 0.0769400038062613, + "grad_norm": 1.6920559406280518, + "learning_rate": 4.9273339050755835e-05, + "loss": 5.562, + "step": 12937 + }, + { + "epoch": 0.0769459510895423, + "grad_norm": 1.8027700185775757, + "learning_rate": 4.9273227246878894e-05, + "loss": 5.5473, + "step": 12938 + }, + { + "epoch": 0.07695189837282329, + "grad_norm": 1.6055867671966553, + "learning_rate": 4.927311543452842e-05, + "loss": 5.4903, + "step": 12939 + }, + { + "epoch": 0.07695784565610429, + "grad_norm": 1.5789201259613037, + "learning_rate": 4.9273003613704456e-05, + "loss": 5.4514, + "step": 12940 + }, + { + "epoch": 0.0769637929393853, + "grad_norm": 1.6153863668441772, + "learning_rate": 4.9272891784407034e-05, + "loss": 5.4343, + "step": 12941 + }, + { + "epoch": 0.07696974022266628, + "grad_norm": 1.8802043199539185, + "learning_rate": 4.927277994663619e-05, + "loss": 5.4691, + "step": 12942 + }, + { + "epoch": 0.07697568750594729, + "grad_norm": 1.869836688041687, + "learning_rate": 4.9272668100391984e-05, + "loss": 5.5037, + "step": 12943 + }, + { + "epoch": 0.07698163478922829, + "grad_norm": 1.9082410335540771, + "learning_rate": 4.927255624567443e-05, + "loss": 5.4814, + "step": 12944 + }, + { + "epoch": 0.07698758207250928, + "grad_norm": 1.5890675783157349, + "learning_rate": 4.927244438248358e-05, + "loss": 5.4627, + "step": 12945 + }, + { + "epoch": 0.07699352935579028, + "grad_norm": 1.7432551383972168, + "learning_rate": 4.9272332510819475e-05, + "loss": 5.4301, + "step": 12946 + }, + { + "epoch": 0.07699947663907127, + "grad_norm": 1.7112667560577393, + "learning_rate": 4.927222063068214e-05, + "loss": 5.4028, + "step": 12947 + }, + { + "epoch": 0.07700542392235227, + "grad_norm": 1.7046465873718262, + "learning_rate": 4.9272108742071634e-05, + "loss": 5.4688, + "step": 12948 + }, + { + "epoch": 0.07701137120563327, + "grad_norm": 1.6928964853286743, + "learning_rate": 4.927199684498798e-05, + "loss": 5.4553, + "step": 12949 + }, + { + "epoch": 0.07701731848891426, + "grad_norm": 1.8731732368469238, + "learning_rate": 4.927188493943122e-05, + "loss": 5.3542, + "step": 12950 + }, + { + "epoch": 0.07702326577219526, + "grad_norm": 1.6586295366287231, + "learning_rate": 4.92717730254014e-05, + "loss": 5.2852, + "step": 12951 + }, + { + "epoch": 0.07702921305547626, + "grad_norm": 1.724252462387085, + "learning_rate": 4.927166110289855e-05, + "loss": 5.3982, + "step": 12952 + }, + { + "epoch": 0.07703516033875725, + "grad_norm": 1.7133373022079468, + "learning_rate": 4.9271549171922716e-05, + "loss": 5.3642, + "step": 12953 + }, + { + "epoch": 0.07704110762203825, + "grad_norm": 1.779291033744812, + "learning_rate": 4.927143723247394e-05, + "loss": 5.3949, + "step": 12954 + }, + { + "epoch": 0.07704705490531925, + "grad_norm": 1.8439239263534546, + "learning_rate": 4.927132528455225e-05, + "loss": 5.3829, + "step": 12955 + }, + { + "epoch": 0.07705300218860024, + "grad_norm": 1.7440255880355835, + "learning_rate": 4.927121332815769e-05, + "loss": 5.3881, + "step": 12956 + }, + { + "epoch": 0.07705894947188124, + "grad_norm": 1.8459028005599976, + "learning_rate": 4.927110136329031e-05, + "loss": 5.3575, + "step": 12957 + }, + { + "epoch": 0.07706489675516225, + "grad_norm": 2.8051815032958984, + "learning_rate": 4.927098938995013e-05, + "loss": 5.2814, + "step": 12958 + }, + { + "epoch": 0.07707084403844323, + "grad_norm": 1.8814127445220947, + "learning_rate": 4.9270877408137194e-05, + "loss": 5.3614, + "step": 12959 + }, + { + "epoch": 0.07707679132172424, + "grad_norm": 1.570408821105957, + "learning_rate": 4.927076541785156e-05, + "loss": 5.3453, + "step": 12960 + }, + { + "epoch": 0.07708273860500524, + "grad_norm": 1.607393741607666, + "learning_rate": 4.927065341909324e-05, + "loss": 5.4766, + "step": 12961 + }, + { + "epoch": 0.07708868588828623, + "grad_norm": 1.475420594215393, + "learning_rate": 4.927054141186229e-05, + "loss": 5.4511, + "step": 12962 + }, + { + "epoch": 0.07709463317156723, + "grad_norm": 1.7785848379135132, + "learning_rate": 4.927042939615875e-05, + "loss": 5.3839, + "step": 12963 + }, + { + "epoch": 0.07710058045484823, + "grad_norm": 1.7313402891159058, + "learning_rate": 4.9270317371982645e-05, + "loss": 5.3398, + "step": 12964 + }, + { + "epoch": 0.07710652773812922, + "grad_norm": 1.666938066482544, + "learning_rate": 4.927020533933403e-05, + "loss": 5.4462, + "step": 12965 + }, + { + "epoch": 0.07711247502141022, + "grad_norm": 1.5219112634658813, + "learning_rate": 4.9270093298212933e-05, + "loss": 5.7593, + "step": 12966 + }, + { + "epoch": 0.07711842230469122, + "grad_norm": 2.0760631561279297, + "learning_rate": 4.92699812486194e-05, + "loss": 5.5765, + "step": 12967 + }, + { + "epoch": 0.07712436958797221, + "grad_norm": 1.7648851871490479, + "learning_rate": 4.926986919055346e-05, + "loss": 5.8786, + "step": 12968 + }, + { + "epoch": 0.07713031687125321, + "grad_norm": 1.832141399383545, + "learning_rate": 4.926975712401517e-05, + "loss": 5.6695, + "step": 12969 + }, + { + "epoch": 0.07713626415453421, + "grad_norm": 1.9032765626907349, + "learning_rate": 4.926964504900455e-05, + "loss": 5.701, + "step": 12970 + }, + { + "epoch": 0.0771422114378152, + "grad_norm": 1.7294973134994507, + "learning_rate": 4.9269532965521656e-05, + "loss": 5.6569, + "step": 12971 + }, + { + "epoch": 0.0771481587210962, + "grad_norm": 1.927510142326355, + "learning_rate": 4.926942087356651e-05, + "loss": 5.1289, + "step": 12972 + }, + { + "epoch": 0.07715410600437721, + "grad_norm": 1.6945842504501343, + "learning_rate": 4.926930877313917e-05, + "loss": 5.5703, + "step": 12973 + }, + { + "epoch": 0.0771600532876582, + "grad_norm": 1.7665363550186157, + "learning_rate": 4.926919666423966e-05, + "loss": 5.822, + "step": 12974 + }, + { + "epoch": 0.0771660005709392, + "grad_norm": 1.5802277326583862, + "learning_rate": 4.926908454686801e-05, + "loss": 5.5438, + "step": 12975 + }, + { + "epoch": 0.07717194785422019, + "grad_norm": 1.9065684080123901, + "learning_rate": 4.9268972421024295e-05, + "loss": 5.5556, + "step": 12976 + }, + { + "epoch": 0.07717789513750119, + "grad_norm": 1.7630208730697632, + "learning_rate": 4.9268860286708526e-05, + "loss": 5.6079, + "step": 12977 + }, + { + "epoch": 0.07718384242078219, + "grad_norm": 1.6295850276947021, + "learning_rate": 4.9268748143920746e-05, + "loss": 5.6163, + "step": 12978 + }, + { + "epoch": 0.07718978970406318, + "grad_norm": 1.753202199935913, + "learning_rate": 4.926863599266099e-05, + "loss": 5.549, + "step": 12979 + }, + { + "epoch": 0.07719573698734418, + "grad_norm": 1.7823643684387207, + "learning_rate": 4.9268523832929314e-05, + "loss": 5.6917, + "step": 12980 + }, + { + "epoch": 0.07720168427062518, + "grad_norm": 1.7990792989730835, + "learning_rate": 4.926841166472574e-05, + "loss": 5.5897, + "step": 12981 + }, + { + "epoch": 0.07720763155390617, + "grad_norm": 1.7813109159469604, + "learning_rate": 4.926829948805033e-05, + "loss": 5.5953, + "step": 12982 + }, + { + "epoch": 0.07721357883718717, + "grad_norm": 1.7127541303634644, + "learning_rate": 4.926818730290309e-05, + "loss": 5.5476, + "step": 12983 + }, + { + "epoch": 0.07721952612046817, + "grad_norm": 2.0513558387756348, + "learning_rate": 4.9268075109284084e-05, + "loss": 5.5721, + "step": 12984 + }, + { + "epoch": 0.07722547340374916, + "grad_norm": 1.8053756952285767, + "learning_rate": 4.9267962907193346e-05, + "loss": 5.5344, + "step": 12985 + }, + { + "epoch": 0.07723142068703016, + "grad_norm": 1.7184503078460693, + "learning_rate": 4.9267850696630904e-05, + "loss": 5.602, + "step": 12986 + }, + { + "epoch": 0.07723736797031117, + "grad_norm": 1.8753174543380737, + "learning_rate": 4.926773847759682e-05, + "loss": 5.701, + "step": 12987 + }, + { + "epoch": 0.07724331525359215, + "grad_norm": 1.7761272192001343, + "learning_rate": 4.9267626250091106e-05, + "loss": 5.5026, + "step": 12988 + }, + { + "epoch": 0.07724926253687316, + "grad_norm": 1.6833654642105103, + "learning_rate": 4.926751401411381e-05, + "loss": 5.5615, + "step": 12989 + }, + { + "epoch": 0.07725520982015416, + "grad_norm": 1.8640247583389282, + "learning_rate": 4.926740176966499e-05, + "loss": 5.8367, + "step": 12990 + }, + { + "epoch": 0.07726115710343515, + "grad_norm": 2.036540985107422, + "learning_rate": 4.9267289516744665e-05, + "loss": 5.6258, + "step": 12991 + }, + { + "epoch": 0.07726710438671615, + "grad_norm": 2.0168917179107666, + "learning_rate": 4.926717725535288e-05, + "loss": 5.1961, + "step": 12992 + }, + { + "epoch": 0.07727305166999715, + "grad_norm": 2.149548292160034, + "learning_rate": 4.9267064985489674e-05, + "loss": 5.1735, + "step": 12993 + }, + { + "epoch": 0.07727899895327814, + "grad_norm": 1.7929832935333252, + "learning_rate": 4.926695270715508e-05, + "loss": 5.6889, + "step": 12994 + }, + { + "epoch": 0.07728494623655914, + "grad_norm": 1.7964575290679932, + "learning_rate": 4.926684042034916e-05, + "loss": 5.0576, + "step": 12995 + }, + { + "epoch": 0.07729089351984014, + "grad_norm": 1.8207305669784546, + "learning_rate": 4.926672812507192e-05, + "loss": 5.2703, + "step": 12996 + }, + { + "epoch": 0.07729684080312113, + "grad_norm": 1.6263490915298462, + "learning_rate": 4.9266615821323425e-05, + "loss": 5.5999, + "step": 12997 + }, + { + "epoch": 0.07730278808640213, + "grad_norm": 2.0018131732940674, + "learning_rate": 4.92665035091037e-05, + "loss": 4.9439, + "step": 12998 + }, + { + "epoch": 0.07730873536968313, + "grad_norm": 2.32818341255188, + "learning_rate": 4.926639118841279e-05, + "loss": 4.6071, + "step": 12999 + }, + { + "epoch": 0.07731468265296412, + "grad_norm": 2.3354949951171875, + "learning_rate": 4.926627885925074e-05, + "loss": 4.6642, + "step": 13000 + }, + { + "epoch": 0.07732062993624512, + "grad_norm": 1.71230149269104, + "learning_rate": 4.926616652161757e-05, + "loss": 5.161, + "step": 13001 + }, + { + "epoch": 0.07732657721952613, + "grad_norm": 1.4890326261520386, + "learning_rate": 4.9266054175513345e-05, + "loss": 5.1714, + "step": 13002 + }, + { + "epoch": 0.07733252450280712, + "grad_norm": 1.5844224691390991, + "learning_rate": 4.926594182093809e-05, + "loss": 4.869, + "step": 13003 + }, + { + "epoch": 0.07733847178608812, + "grad_norm": 2.328636884689331, + "learning_rate": 4.926582945789185e-05, + "loss": 5.1571, + "step": 13004 + }, + { + "epoch": 0.0773444190693691, + "grad_norm": 2.067760467529297, + "learning_rate": 4.926571708637464e-05, + "loss": 5.4416, + "step": 13005 + }, + { + "epoch": 0.07735036635265011, + "grad_norm": 1.7148468494415283, + "learning_rate": 4.926560470638653e-05, + "loss": 5.464, + "step": 13006 + }, + { + "epoch": 0.07735631363593111, + "grad_norm": 1.6869080066680908, + "learning_rate": 4.926549231792755e-05, + "loss": 5.5537, + "step": 13007 + }, + { + "epoch": 0.0773622609192121, + "grad_norm": 2.239408254623413, + "learning_rate": 4.9265379920997735e-05, + "loss": 5.1551, + "step": 13008 + }, + { + "epoch": 0.0773682082024931, + "grad_norm": 2.4059038162231445, + "learning_rate": 4.926526751559713e-05, + "loss": 5.2639, + "step": 13009 + }, + { + "epoch": 0.0773741554857741, + "grad_norm": 2.0787813663482666, + "learning_rate": 4.926515510172577e-05, + "loss": 5.3485, + "step": 13010 + }, + { + "epoch": 0.07738010276905509, + "grad_norm": 1.912137508392334, + "learning_rate": 4.9265042679383685e-05, + "loss": 5.551, + "step": 13011 + }, + { + "epoch": 0.07738605005233609, + "grad_norm": 2.0865983963012695, + "learning_rate": 4.926493024857094e-05, + "loss": 5.0343, + "step": 13012 + }, + { + "epoch": 0.0773919973356171, + "grad_norm": 1.9341247081756592, + "learning_rate": 4.926481780928754e-05, + "loss": 5.5904, + "step": 13013 + }, + { + "epoch": 0.07739794461889808, + "grad_norm": 1.7777684926986694, + "learning_rate": 4.926470536153356e-05, + "loss": 5.5396, + "step": 13014 + }, + { + "epoch": 0.07740389190217908, + "grad_norm": 1.7952098846435547, + "learning_rate": 4.926459290530902e-05, + "loss": 5.3212, + "step": 13015 + }, + { + "epoch": 0.07740983918546009, + "grad_norm": 1.7674907445907593, + "learning_rate": 4.926448044061396e-05, + "loss": 5.3316, + "step": 13016 + }, + { + "epoch": 0.07741578646874107, + "grad_norm": 1.8327823877334595, + "learning_rate": 4.926436796744841e-05, + "loss": 5.3129, + "step": 13017 + }, + { + "epoch": 0.07742173375202208, + "grad_norm": 1.613867998123169, + "learning_rate": 4.9264255485812425e-05, + "loss": 5.4935, + "step": 13018 + }, + { + "epoch": 0.07742768103530308, + "grad_norm": 1.7167906761169434, + "learning_rate": 4.9264142995706044e-05, + "loss": 5.3054, + "step": 13019 + }, + { + "epoch": 0.07743362831858407, + "grad_norm": 2.272038698196411, + "learning_rate": 4.92640304971293e-05, + "loss": 5.1327, + "step": 13020 + }, + { + "epoch": 0.07743957560186507, + "grad_norm": 1.6358660459518433, + "learning_rate": 4.926391799008223e-05, + "loss": 5.3285, + "step": 13021 + }, + { + "epoch": 0.07744552288514607, + "grad_norm": 2.166813373565674, + "learning_rate": 4.926380547456488e-05, + "loss": 5.2846, + "step": 13022 + }, + { + "epoch": 0.07745147016842706, + "grad_norm": 2.3251235485076904, + "learning_rate": 4.926369295057729e-05, + "loss": 5.2482, + "step": 13023 + }, + { + "epoch": 0.07745741745170806, + "grad_norm": 1.9402974843978882, + "learning_rate": 4.926358041811949e-05, + "loss": 5.3514, + "step": 13024 + }, + { + "epoch": 0.07746336473498906, + "grad_norm": 2.1346986293792725, + "learning_rate": 4.9263467877191525e-05, + "loss": 5.1912, + "step": 13025 + }, + { + "epoch": 0.07746931201827005, + "grad_norm": 2.0809762477874756, + "learning_rate": 4.926335532779344e-05, + "loss": 5.0547, + "step": 13026 + }, + { + "epoch": 0.07747525930155105, + "grad_norm": 2.110558032989502, + "learning_rate": 4.9263242769925256e-05, + "loss": 5.2177, + "step": 13027 + }, + { + "epoch": 0.07748120658483205, + "grad_norm": 2.3498575687408447, + "learning_rate": 4.926313020358704e-05, + "loss": 4.9997, + "step": 13028 + }, + { + "epoch": 0.07748715386811304, + "grad_norm": 2.4052765369415283, + "learning_rate": 4.92630176287788e-05, + "loss": 4.9736, + "step": 13029 + }, + { + "epoch": 0.07749310115139404, + "grad_norm": 2.3132238388061523, + "learning_rate": 4.9262905045500603e-05, + "loss": 4.9149, + "step": 13030 + }, + { + "epoch": 0.07749904843467505, + "grad_norm": 2.315483331680298, + "learning_rate": 4.926279245375247e-05, + "loss": 4.9096, + "step": 13031 + }, + { + "epoch": 0.07750499571795604, + "grad_norm": 2.0887367725372314, + "learning_rate": 4.926267985353445e-05, + "loss": 5.3274, + "step": 13032 + }, + { + "epoch": 0.07751094300123704, + "grad_norm": 2.3138368129730225, + "learning_rate": 4.926256724484658e-05, + "loss": 4.8627, + "step": 13033 + }, + { + "epoch": 0.07751689028451804, + "grad_norm": 2.348411798477173, + "learning_rate": 4.926245462768889e-05, + "loss": 4.9815, + "step": 13034 + }, + { + "epoch": 0.07752283756779903, + "grad_norm": 1.7357233762741089, + "learning_rate": 4.926234200206144e-05, + "loss": 5.2836, + "step": 13035 + }, + { + "epoch": 0.07752878485108003, + "grad_norm": 1.8633183240890503, + "learning_rate": 4.9262229367964255e-05, + "loss": 5.1838, + "step": 13036 + }, + { + "epoch": 0.07753473213436102, + "grad_norm": 1.736359715461731, + "learning_rate": 4.926211672539737e-05, + "loss": 5.6746, + "step": 13037 + }, + { + "epoch": 0.07754067941764202, + "grad_norm": 2.368511915206909, + "learning_rate": 4.9262004074360834e-05, + "loss": 4.5786, + "step": 13038 + }, + { + "epoch": 0.07754662670092302, + "grad_norm": 1.859297752380371, + "learning_rate": 4.926189141485468e-05, + "loss": 5.8459, + "step": 13039 + }, + { + "epoch": 0.07755257398420401, + "grad_norm": 2.2050845623016357, + "learning_rate": 4.9261778746878955e-05, + "loss": 5.8982, + "step": 13040 + }, + { + "epoch": 0.07755852126748501, + "grad_norm": 1.7485835552215576, + "learning_rate": 4.926166607043369e-05, + "loss": 5.789, + "step": 13041 + }, + { + "epoch": 0.07756446855076601, + "grad_norm": 1.7780888080596924, + "learning_rate": 4.9261553385518936e-05, + "loss": 5.48, + "step": 13042 + }, + { + "epoch": 0.077570415834047, + "grad_norm": 1.8764269351959229, + "learning_rate": 4.9261440692134716e-05, + "loss": 5.093, + "step": 13043 + }, + { + "epoch": 0.077576363117328, + "grad_norm": 1.784196376800537, + "learning_rate": 4.926132799028108e-05, + "loss": 5.4335, + "step": 13044 + }, + { + "epoch": 0.077582310400609, + "grad_norm": 2.173844337463379, + "learning_rate": 4.926121527995806e-05, + "loss": 4.5078, + "step": 13045 + }, + { + "epoch": 0.07758825768389, + "grad_norm": 2.410778045654297, + "learning_rate": 4.9261102561165705e-05, + "loss": 5.2113, + "step": 13046 + }, + { + "epoch": 0.077594204967171, + "grad_norm": 2.0470073223114014, + "learning_rate": 4.9260989833904057e-05, + "loss": 5.4695, + "step": 13047 + }, + { + "epoch": 0.077600152250452, + "grad_norm": 1.619314193725586, + "learning_rate": 4.926087709817314e-05, + "loss": 5.8778, + "step": 13048 + }, + { + "epoch": 0.07760609953373299, + "grad_norm": 2.2353031635284424, + "learning_rate": 4.9260764353973e-05, + "loss": 5.2482, + "step": 13049 + }, + { + "epoch": 0.07761204681701399, + "grad_norm": 2.0858941078186035, + "learning_rate": 4.926065160130369e-05, + "loss": 5.2752, + "step": 13050 + }, + { + "epoch": 0.07761799410029499, + "grad_norm": 2.275660514831543, + "learning_rate": 4.926053884016522e-05, + "loss": 5.004, + "step": 13051 + }, + { + "epoch": 0.07762394138357598, + "grad_norm": 1.9338358640670776, + "learning_rate": 4.926042607055765e-05, + "loss": 5.4688, + "step": 13052 + }, + { + "epoch": 0.07762988866685698, + "grad_norm": 1.7377573251724243, + "learning_rate": 4.926031329248103e-05, + "loss": 5.6429, + "step": 13053 + }, + { + "epoch": 0.07763583595013798, + "grad_norm": 1.8915661573410034, + "learning_rate": 4.9260200505935374e-05, + "loss": 5.543, + "step": 13054 + }, + { + "epoch": 0.07764178323341897, + "grad_norm": 1.7961910963058472, + "learning_rate": 4.926008771092073e-05, + "loss": 5.4245, + "step": 13055 + }, + { + "epoch": 0.07764773051669997, + "grad_norm": 1.9412139654159546, + "learning_rate": 4.9259974907437145e-05, + "loss": 5.5858, + "step": 13056 + }, + { + "epoch": 0.07765367779998097, + "grad_norm": 2.458508253097534, + "learning_rate": 4.925986209548466e-05, + "loss": 5.3307, + "step": 13057 + }, + { + "epoch": 0.07765962508326196, + "grad_norm": 2.23331880569458, + "learning_rate": 4.92597492750633e-05, + "loss": 5.6979, + "step": 13058 + }, + { + "epoch": 0.07766557236654296, + "grad_norm": 2.38264536857605, + "learning_rate": 4.9259636446173104e-05, + "loss": 5.5771, + "step": 13059 + }, + { + "epoch": 0.07767151964982397, + "grad_norm": 2.0892632007598877, + "learning_rate": 4.925952360881413e-05, + "loss": 5.8596, + "step": 13060 + }, + { + "epoch": 0.07767746693310495, + "grad_norm": 1.82732355594635, + "learning_rate": 4.92594107629864e-05, + "loss": 5.3724, + "step": 13061 + }, + { + "epoch": 0.07768341421638596, + "grad_norm": 1.821089506149292, + "learning_rate": 4.925929790868997e-05, + "loss": 5.6499, + "step": 13062 + }, + { + "epoch": 0.07768936149966696, + "grad_norm": 1.9662789106369019, + "learning_rate": 4.925918504592487e-05, + "loss": 5.5132, + "step": 13063 + }, + { + "epoch": 0.07769530878294795, + "grad_norm": 1.830101490020752, + "learning_rate": 4.925907217469113e-05, + "loss": 5.4492, + "step": 13064 + }, + { + "epoch": 0.07770125606622895, + "grad_norm": 1.8362375497817993, + "learning_rate": 4.9258959294988804e-05, + "loss": 5.8314, + "step": 13065 + }, + { + "epoch": 0.07770720334950994, + "grad_norm": 2.23861026763916, + "learning_rate": 4.9258846406817926e-05, + "loss": 6.2564, + "step": 13066 + }, + { + "epoch": 0.07771315063279094, + "grad_norm": 2.2672650814056396, + "learning_rate": 4.9258733510178536e-05, + "loss": 6.3396, + "step": 13067 + }, + { + "epoch": 0.07771909791607194, + "grad_norm": 1.8667620420455933, + "learning_rate": 4.9258620605070665e-05, + "loss": 5.8509, + "step": 13068 + }, + { + "epoch": 0.07772504519935293, + "grad_norm": 1.7386364936828613, + "learning_rate": 4.925850769149436e-05, + "loss": 5.567, + "step": 13069 + }, + { + "epoch": 0.07773099248263393, + "grad_norm": 1.3638315200805664, + "learning_rate": 4.9258394769449675e-05, + "loss": 5.6892, + "step": 13070 + }, + { + "epoch": 0.07773693976591493, + "grad_norm": 1.7117588520050049, + "learning_rate": 4.9258281838936624e-05, + "loss": 5.461, + "step": 13071 + }, + { + "epoch": 0.07774288704919592, + "grad_norm": 1.7597805261611938, + "learning_rate": 4.925816889995526e-05, + "loss": 5.6783, + "step": 13072 + }, + { + "epoch": 0.07774883433247692, + "grad_norm": 1.8734283447265625, + "learning_rate": 4.9258055952505624e-05, + "loss": 5.633, + "step": 13073 + }, + { + "epoch": 0.07775478161575793, + "grad_norm": 1.5552877187728882, + "learning_rate": 4.9257942996587744e-05, + "loss": 5.8804, + "step": 13074 + }, + { + "epoch": 0.07776072889903891, + "grad_norm": 1.2786669731140137, + "learning_rate": 4.925783003220167e-05, + "loss": 5.3208, + "step": 13075 + }, + { + "epoch": 0.07776667618231992, + "grad_norm": 1.558182954788208, + "learning_rate": 4.925771705934744e-05, + "loss": 5.4023, + "step": 13076 + }, + { + "epoch": 0.07777262346560092, + "grad_norm": 1.3482223749160767, + "learning_rate": 4.925760407802509e-05, + "loss": 5.3879, + "step": 13077 + }, + { + "epoch": 0.0777785707488819, + "grad_norm": 1.5111918449401855, + "learning_rate": 4.925749108823466e-05, + "loss": 5.329, + "step": 13078 + }, + { + "epoch": 0.07778451803216291, + "grad_norm": 1.7119463682174683, + "learning_rate": 4.925737808997619e-05, + "loss": 5.7282, + "step": 13079 + }, + { + "epoch": 0.07779046531544391, + "grad_norm": 1.7753342390060425, + "learning_rate": 4.925726508324972e-05, + "loss": 5.2677, + "step": 13080 + }, + { + "epoch": 0.0777964125987249, + "grad_norm": 1.8957557678222656, + "learning_rate": 4.925715206805529e-05, + "loss": 4.7193, + "step": 13081 + }, + { + "epoch": 0.0778023598820059, + "grad_norm": 2.503037214279175, + "learning_rate": 4.9257039044392935e-05, + "loss": 5.034, + "step": 13082 + }, + { + "epoch": 0.0778083071652869, + "grad_norm": 2.031312942504883, + "learning_rate": 4.92569260122627e-05, + "loss": 5.1982, + "step": 13083 + }, + { + "epoch": 0.07781425444856789, + "grad_norm": 1.8345115184783936, + "learning_rate": 4.9256812971664635e-05, + "loss": 5.6059, + "step": 13084 + }, + { + "epoch": 0.07782020173184889, + "grad_norm": 2.134131669998169, + "learning_rate": 4.925669992259875e-05, + "loss": 5.8174, + "step": 13085 + }, + { + "epoch": 0.0778261490151299, + "grad_norm": 1.9598990678787231, + "learning_rate": 4.9256586865065114e-05, + "loss": 5.76, + "step": 13086 + }, + { + "epoch": 0.07783209629841088, + "grad_norm": 1.8105463981628418, + "learning_rate": 4.925647379906375e-05, + "loss": 5.5112, + "step": 13087 + }, + { + "epoch": 0.07783804358169188, + "grad_norm": 1.5290614366531372, + "learning_rate": 4.9256360724594696e-05, + "loss": 5.7122, + "step": 13088 + }, + { + "epoch": 0.07784399086497289, + "grad_norm": 1.6188294887542725, + "learning_rate": 4.9256247641658005e-05, + "loss": 5.58, + "step": 13089 + }, + { + "epoch": 0.07784993814825387, + "grad_norm": 1.8662221431732178, + "learning_rate": 4.925613455025371e-05, + "loss": 5.4975, + "step": 13090 + }, + { + "epoch": 0.07785588543153488, + "grad_norm": 1.808813452720642, + "learning_rate": 4.925602145038184e-05, + "loss": 5.6704, + "step": 13091 + }, + { + "epoch": 0.07786183271481588, + "grad_norm": 1.776418924331665, + "learning_rate": 4.925590834204245e-05, + "loss": 5.7558, + "step": 13092 + }, + { + "epoch": 0.07786777999809687, + "grad_norm": 1.704537034034729, + "learning_rate": 4.925579522523557e-05, + "loss": 5.6667, + "step": 13093 + }, + { + "epoch": 0.07787372728137787, + "grad_norm": 2.115651845932007, + "learning_rate": 4.9255682099961246e-05, + "loss": 5.5823, + "step": 13094 + }, + { + "epoch": 0.07787967456465886, + "grad_norm": 1.851914882659912, + "learning_rate": 4.9255568966219504e-05, + "loss": 5.6749, + "step": 13095 + }, + { + "epoch": 0.07788562184793986, + "grad_norm": 1.8792526721954346, + "learning_rate": 4.92554558240104e-05, + "loss": 5.8539, + "step": 13096 + }, + { + "epoch": 0.07789156913122086, + "grad_norm": 1.805280327796936, + "learning_rate": 4.925534267333397e-05, + "loss": 5.8522, + "step": 13097 + }, + { + "epoch": 0.07789751641450185, + "grad_norm": 1.7457916736602783, + "learning_rate": 4.925522951419025e-05, + "loss": 5.9419, + "step": 13098 + }, + { + "epoch": 0.07790346369778285, + "grad_norm": 1.6427416801452637, + "learning_rate": 4.925511634657928e-05, + "loss": 5.8924, + "step": 13099 + }, + { + "epoch": 0.07790941098106385, + "grad_norm": 1.7034873962402344, + "learning_rate": 4.9255003170501095e-05, + "loss": 5.8701, + "step": 13100 + }, + { + "epoch": 0.07791535826434484, + "grad_norm": 1.6852953433990479, + "learning_rate": 4.925488998595574e-05, + "loss": 5.771, + "step": 13101 + }, + { + "epoch": 0.07792130554762584, + "grad_norm": 1.6478735208511353, + "learning_rate": 4.9254776792943255e-05, + "loss": 5.4274, + "step": 13102 + }, + { + "epoch": 0.07792725283090685, + "grad_norm": 1.5896925926208496, + "learning_rate": 4.925466359146368e-05, + "loss": 5.8217, + "step": 13103 + }, + { + "epoch": 0.07793320011418783, + "grad_norm": 1.649539828300476, + "learning_rate": 4.9254550381517054e-05, + "loss": 5.7899, + "step": 13104 + }, + { + "epoch": 0.07793914739746884, + "grad_norm": 1.5224459171295166, + "learning_rate": 4.925443716310341e-05, + "loss": 5.7931, + "step": 13105 + }, + { + "epoch": 0.07794509468074984, + "grad_norm": 2.009038209915161, + "learning_rate": 4.9254323936222796e-05, + "loss": 5.854, + "step": 13106 + }, + { + "epoch": 0.07795104196403083, + "grad_norm": 1.5545878410339355, + "learning_rate": 4.9254210700875245e-05, + "loss": 5.7212, + "step": 13107 + }, + { + "epoch": 0.07795698924731183, + "grad_norm": 2.0804193019866943, + "learning_rate": 4.92540974570608e-05, + "loss": 5.7195, + "step": 13108 + }, + { + "epoch": 0.07796293653059283, + "grad_norm": 1.940432071685791, + "learning_rate": 4.92539842047795e-05, + "loss": 5.4998, + "step": 13109 + }, + { + "epoch": 0.07796888381387382, + "grad_norm": 2.3788061141967773, + "learning_rate": 4.925387094403139e-05, + "loss": 5.5975, + "step": 13110 + }, + { + "epoch": 0.07797483109715482, + "grad_norm": 1.6193798780441284, + "learning_rate": 4.92537576748165e-05, + "loss": 5.4489, + "step": 13111 + }, + { + "epoch": 0.07798077838043582, + "grad_norm": 1.7056760787963867, + "learning_rate": 4.9253644397134866e-05, + "loss": 5.5584, + "step": 13112 + }, + { + "epoch": 0.07798672566371681, + "grad_norm": 1.2604116201400757, + "learning_rate": 4.925353111098655e-05, + "loss": 5.5681, + "step": 13113 + }, + { + "epoch": 0.07799267294699781, + "grad_norm": 1.305413842201233, + "learning_rate": 4.925341781637157e-05, + "loss": 5.6966, + "step": 13114 + }, + { + "epoch": 0.07799862023027881, + "grad_norm": 2.6248581409454346, + "learning_rate": 4.9253304513289975e-05, + "loss": 5.3666, + "step": 13115 + }, + { + "epoch": 0.0780045675135598, + "grad_norm": 1.687741994857788, + "learning_rate": 4.92531912017418e-05, + "loss": 5.5511, + "step": 13116 + }, + { + "epoch": 0.0780105147968408, + "grad_norm": 1.5827749967575073, + "learning_rate": 4.9253077881727086e-05, + "loss": 5.3363, + "step": 13117 + }, + { + "epoch": 0.0780164620801218, + "grad_norm": 1.5989108085632324, + "learning_rate": 4.925296455324587e-05, + "loss": 5.472, + "step": 13118 + }, + { + "epoch": 0.0780224093634028, + "grad_norm": 1.5687717199325562, + "learning_rate": 4.9252851216298194e-05, + "loss": 5.6894, + "step": 13119 + }, + { + "epoch": 0.0780283566466838, + "grad_norm": 1.312949538230896, + "learning_rate": 4.9252737870884106e-05, + "loss": 5.6735, + "step": 13120 + }, + { + "epoch": 0.0780343039299648, + "grad_norm": 1.5779353380203247, + "learning_rate": 4.925262451700363e-05, + "loss": 5.3281, + "step": 13121 + }, + { + "epoch": 0.07804025121324579, + "grad_norm": 1.6127909421920776, + "learning_rate": 4.9252511154656825e-05, + "loss": 5.27, + "step": 13122 + }, + { + "epoch": 0.07804619849652679, + "grad_norm": 1.6496199369430542, + "learning_rate": 4.925239778384371e-05, + "loss": 5.4913, + "step": 13123 + }, + { + "epoch": 0.07805214577980778, + "grad_norm": 2.394230842590332, + "learning_rate": 4.925228440456433e-05, + "loss": 5.1788, + "step": 13124 + }, + { + "epoch": 0.07805809306308878, + "grad_norm": 2.169250249862671, + "learning_rate": 4.925217101681873e-05, + "loss": 5.4087, + "step": 13125 + }, + { + "epoch": 0.07806404034636978, + "grad_norm": 2.150338649749756, + "learning_rate": 4.925205762060695e-05, + "loss": 5.5004, + "step": 13126 + }, + { + "epoch": 0.07806998762965077, + "grad_norm": 2.0131516456604004, + "learning_rate": 4.925194421592903e-05, + "loss": 5.5791, + "step": 13127 + }, + { + "epoch": 0.07807593491293177, + "grad_norm": 1.8154455423355103, + "learning_rate": 4.925183080278501e-05, + "loss": 5.5479, + "step": 13128 + }, + { + "epoch": 0.07808188219621277, + "grad_norm": 1.7489157915115356, + "learning_rate": 4.925171738117492e-05, + "loss": 5.7169, + "step": 13129 + }, + { + "epoch": 0.07808782947949376, + "grad_norm": 1.6712158918380737, + "learning_rate": 4.92516039510988e-05, + "loss": 6.0751, + "step": 13130 + }, + { + "epoch": 0.07809377676277476, + "grad_norm": 1.7542296648025513, + "learning_rate": 4.9251490512556706e-05, + "loss": 5.8998, + "step": 13131 + }, + { + "epoch": 0.07809972404605577, + "grad_norm": 1.5962193012237549, + "learning_rate": 4.9251377065548666e-05, + "loss": 5.7781, + "step": 13132 + }, + { + "epoch": 0.07810567132933675, + "grad_norm": 1.783756136894226, + "learning_rate": 4.9251263610074714e-05, + "loss": 5.8384, + "step": 13133 + }, + { + "epoch": 0.07811161861261776, + "grad_norm": 1.6608144044876099, + "learning_rate": 4.92511501461349e-05, + "loss": 5.7603, + "step": 13134 + }, + { + "epoch": 0.07811756589589876, + "grad_norm": 1.8659160137176514, + "learning_rate": 4.925103667372926e-05, + "loss": 5.5039, + "step": 13135 + }, + { + "epoch": 0.07812351317917975, + "grad_norm": 1.591565489768982, + "learning_rate": 4.925092319285783e-05, + "loss": 5.7034, + "step": 13136 + }, + { + "epoch": 0.07812946046246075, + "grad_norm": 1.5772358179092407, + "learning_rate": 4.925080970352066e-05, + "loss": 5.6347, + "step": 13137 + }, + { + "epoch": 0.07813540774574175, + "grad_norm": 1.7196561098098755, + "learning_rate": 4.925069620571778e-05, + "loss": 5.7086, + "step": 13138 + }, + { + "epoch": 0.07814135502902274, + "grad_norm": 1.9582041501998901, + "learning_rate": 4.9250582699449237e-05, + "loss": 5.9774, + "step": 13139 + }, + { + "epoch": 0.07814730231230374, + "grad_norm": 2.0566928386688232, + "learning_rate": 4.9250469184715064e-05, + "loss": 5.8527, + "step": 13140 + }, + { + "epoch": 0.07815324959558474, + "grad_norm": 1.9961296319961548, + "learning_rate": 4.92503556615153e-05, + "loss": 5.65, + "step": 13141 + }, + { + "epoch": 0.07815919687886573, + "grad_norm": 1.672601342201233, + "learning_rate": 4.925024212984999e-05, + "loss": 5.7242, + "step": 13142 + }, + { + "epoch": 0.07816514416214673, + "grad_norm": 1.6791996955871582, + "learning_rate": 4.9250128589719166e-05, + "loss": 5.7365, + "step": 13143 + }, + { + "epoch": 0.07817109144542773, + "grad_norm": 2.4464364051818848, + "learning_rate": 4.925001504112288e-05, + "loss": 4.9673, + "step": 13144 + }, + { + "epoch": 0.07817703872870872, + "grad_norm": 2.0053181648254395, + "learning_rate": 4.9249901484061156e-05, + "loss": 5.7916, + "step": 13145 + }, + { + "epoch": 0.07818298601198972, + "grad_norm": 2.512120246887207, + "learning_rate": 4.924978791853405e-05, + "loss": 5.914, + "step": 13146 + }, + { + "epoch": 0.07818893329527073, + "grad_norm": 2.2429497241973877, + "learning_rate": 4.924967434454159e-05, + "loss": 5.8806, + "step": 13147 + }, + { + "epoch": 0.07819488057855171, + "grad_norm": 1.9966307878494263, + "learning_rate": 4.924956076208381e-05, + "loss": 5.8883, + "step": 13148 + }, + { + "epoch": 0.07820082786183272, + "grad_norm": 2.492926836013794, + "learning_rate": 4.924944717116077e-05, + "loss": 5.361, + "step": 13149 + }, + { + "epoch": 0.07820677514511372, + "grad_norm": 2.050769090652466, + "learning_rate": 4.92493335717725e-05, + "loss": 5.5682, + "step": 13150 + }, + { + "epoch": 0.07821272242839471, + "grad_norm": 2.2797789573669434, + "learning_rate": 4.9249219963919037e-05, + "loss": 5.8695, + "step": 13151 + }, + { + "epoch": 0.07821866971167571, + "grad_norm": 2.1034891605377197, + "learning_rate": 4.924910634760041e-05, + "loss": 4.987, + "step": 13152 + }, + { + "epoch": 0.0782246169949567, + "grad_norm": 1.7718714475631714, + "learning_rate": 4.924899272281669e-05, + "loss": 5.112, + "step": 13153 + }, + { + "epoch": 0.0782305642782377, + "grad_norm": 1.730656385421753, + "learning_rate": 4.9248879089567884e-05, + "loss": 5.6589, + "step": 13154 + }, + { + "epoch": 0.0782365115615187, + "grad_norm": 1.7784979343414307, + "learning_rate": 4.9248765447854054e-05, + "loss": 5.6812, + "step": 13155 + }, + { + "epoch": 0.07824245884479969, + "grad_norm": 1.5646599531173706, + "learning_rate": 4.9248651797675213e-05, + "loss": 5.7598, + "step": 13156 + }, + { + "epoch": 0.07824840612808069, + "grad_norm": 2.6416964530944824, + "learning_rate": 4.924853813903144e-05, + "loss": 5.9888, + "step": 13157 + }, + { + "epoch": 0.0782543534113617, + "grad_norm": 1.978983998298645, + "learning_rate": 4.924842447192274e-05, + "loss": 5.8919, + "step": 13158 + }, + { + "epoch": 0.07826030069464268, + "grad_norm": 2.3622004985809326, + "learning_rate": 4.924831079634916e-05, + "loss": 5.706, + "step": 13159 + }, + { + "epoch": 0.07826624797792368, + "grad_norm": 2.4118547439575195, + "learning_rate": 4.9248197112310754e-05, + "loss": 5.529, + "step": 13160 + }, + { + "epoch": 0.07827219526120469, + "grad_norm": 1.9290462732315063, + "learning_rate": 4.9248083419807554e-05, + "loss": 5.6403, + "step": 13161 + }, + { + "epoch": 0.07827814254448567, + "grad_norm": 1.9591599702835083, + "learning_rate": 4.92479697188396e-05, + "loss": 5.3365, + "step": 13162 + }, + { + "epoch": 0.07828408982776668, + "grad_norm": 1.7800555229187012, + "learning_rate": 4.9247856009406924e-05, + "loss": 6.4051, + "step": 13163 + }, + { + "epoch": 0.07829003711104768, + "grad_norm": 1.8390953540802002, + "learning_rate": 4.924774229150958e-05, + "loss": 5.775, + "step": 13164 + }, + { + "epoch": 0.07829598439432867, + "grad_norm": 1.8265724182128906, + "learning_rate": 4.924762856514759e-05, + "loss": 6.1238, + "step": 13165 + }, + { + "epoch": 0.07830193167760967, + "grad_norm": 1.5573666095733643, + "learning_rate": 4.9247514830321005e-05, + "loss": 5.9823, + "step": 13166 + }, + { + "epoch": 0.07830787896089067, + "grad_norm": 2.2647573947906494, + "learning_rate": 4.924740108702987e-05, + "loss": 5.0975, + "step": 13167 + }, + { + "epoch": 0.07831382624417166, + "grad_norm": 2.509573459625244, + "learning_rate": 4.924728733527422e-05, + "loss": 5.1327, + "step": 13168 + }, + { + "epoch": 0.07831977352745266, + "grad_norm": 2.2974681854248047, + "learning_rate": 4.924717357505408e-05, + "loss": 5.1493, + "step": 13169 + }, + { + "epoch": 0.07832572081073366, + "grad_norm": 1.958938717842102, + "learning_rate": 4.924705980636951e-05, + "loss": 6.0291, + "step": 13170 + }, + { + "epoch": 0.07833166809401465, + "grad_norm": 1.7714133262634277, + "learning_rate": 4.924694602922054e-05, + "loss": 5.9623, + "step": 13171 + }, + { + "epoch": 0.07833761537729565, + "grad_norm": 1.7545043230056763, + "learning_rate": 4.924683224360721e-05, + "loss": 5.9123, + "step": 13172 + }, + { + "epoch": 0.07834356266057665, + "grad_norm": 1.4791491031646729, + "learning_rate": 4.924671844952957e-05, + "loss": 5.8959, + "step": 13173 + }, + { + "epoch": 0.07834950994385764, + "grad_norm": 1.783353567123413, + "learning_rate": 4.924660464698764e-05, + "loss": 5.732, + "step": 13174 + }, + { + "epoch": 0.07835545722713864, + "grad_norm": 1.9444235563278198, + "learning_rate": 4.9246490835981474e-05, + "loss": 5.5167, + "step": 13175 + }, + { + "epoch": 0.07836140451041965, + "grad_norm": 1.9656537771224976, + "learning_rate": 4.924637701651111e-05, + "loss": 5.4557, + "step": 13176 + }, + { + "epoch": 0.07836735179370063, + "grad_norm": 1.8164803981781006, + "learning_rate": 4.9246263188576594e-05, + "loss": 5.44, + "step": 13177 + }, + { + "epoch": 0.07837329907698164, + "grad_norm": 1.8245429992675781, + "learning_rate": 4.9246149352177946e-05, + "loss": 5.2164, + "step": 13178 + }, + { + "epoch": 0.07837924636026264, + "grad_norm": 1.76225745677948, + "learning_rate": 4.924603550731522e-05, + "loss": 5.2325, + "step": 13179 + }, + { + "epoch": 0.07838519364354363, + "grad_norm": 2.052314519882202, + "learning_rate": 4.924592165398846e-05, + "loss": 5.7905, + "step": 13180 + }, + { + "epoch": 0.07839114092682463, + "grad_norm": 1.63084077835083, + "learning_rate": 4.924580779219769e-05, + "loss": 5.2703, + "step": 13181 + }, + { + "epoch": 0.07839708821010562, + "grad_norm": 1.9269503355026245, + "learning_rate": 4.9245693921942965e-05, + "loss": 5.5974, + "step": 13182 + }, + { + "epoch": 0.07840303549338662, + "grad_norm": 2.201376438140869, + "learning_rate": 4.9245580043224315e-05, + "loss": 5.1298, + "step": 13183 + }, + { + "epoch": 0.07840898277666762, + "grad_norm": 2.3778293132781982, + "learning_rate": 4.924546615604179e-05, + "loss": 5.2289, + "step": 13184 + }, + { + "epoch": 0.07841493005994861, + "grad_norm": 2.5284171104431152, + "learning_rate": 4.9245352260395414e-05, + "loss": 5.0038, + "step": 13185 + }, + { + "epoch": 0.07842087734322961, + "grad_norm": 2.230825424194336, + "learning_rate": 4.9245238356285244e-05, + "loss": 5.0699, + "step": 13186 + }, + { + "epoch": 0.07842682462651061, + "grad_norm": 2.1288161277770996, + "learning_rate": 4.924512444371131e-05, + "loss": 5.1093, + "step": 13187 + }, + { + "epoch": 0.0784327719097916, + "grad_norm": 1.912685751914978, + "learning_rate": 4.924501052267365e-05, + "loss": 5.5926, + "step": 13188 + }, + { + "epoch": 0.0784387191930726, + "grad_norm": 2.394078254699707, + "learning_rate": 4.924489659317231e-05, + "loss": 5.129, + "step": 13189 + }, + { + "epoch": 0.0784446664763536, + "grad_norm": 2.7360801696777344, + "learning_rate": 4.924478265520733e-05, + "loss": 4.9682, + "step": 13190 + }, + { + "epoch": 0.0784506137596346, + "grad_norm": 2.4817416667938232, + "learning_rate": 4.924466870877874e-05, + "loss": 5.0193, + "step": 13191 + }, + { + "epoch": 0.0784565610429156, + "grad_norm": 2.5156679153442383, + "learning_rate": 4.92445547538866e-05, + "loss": 5.0044, + "step": 13192 + }, + { + "epoch": 0.0784625083261966, + "grad_norm": 2.519080638885498, + "learning_rate": 4.924444079053092e-05, + "loss": 5.0109, + "step": 13193 + }, + { + "epoch": 0.07846845560947759, + "grad_norm": 2.3944201469421387, + "learning_rate": 4.924432681871176e-05, + "loss": 5.0032, + "step": 13194 + }, + { + "epoch": 0.07847440289275859, + "grad_norm": 2.4199647903442383, + "learning_rate": 4.924421283842916e-05, + "loss": 4.8158, + "step": 13195 + }, + { + "epoch": 0.07848035017603959, + "grad_norm": 2.4517173767089844, + "learning_rate": 4.924409884968316e-05, + "loss": 4.8194, + "step": 13196 + }, + { + "epoch": 0.07848629745932058, + "grad_norm": 2.231703042984009, + "learning_rate": 4.924398485247379e-05, + "loss": 4.882, + "step": 13197 + }, + { + "epoch": 0.07849224474260158, + "grad_norm": 2.218252182006836, + "learning_rate": 4.924387084680109e-05, + "loss": 4.872, + "step": 13198 + }, + { + "epoch": 0.07849819202588258, + "grad_norm": 2.2126224040985107, + "learning_rate": 4.924375683266511e-05, + "loss": 5.019, + "step": 13199 + }, + { + "epoch": 0.07850413930916357, + "grad_norm": 2.197240114212036, + "learning_rate": 4.924364281006589e-05, + "loss": 4.9801, + "step": 13200 + }, + { + "epoch": 0.07851008659244457, + "grad_norm": 2.11427640914917, + "learning_rate": 4.9243528779003456e-05, + "loss": 4.992, + "step": 13201 + }, + { + "epoch": 0.07851603387572557, + "grad_norm": 1.9424201250076294, + "learning_rate": 4.9243414739477864e-05, + "loss": 4.9275, + "step": 13202 + }, + { + "epoch": 0.07852198115900656, + "grad_norm": 1.897208571434021, + "learning_rate": 4.9243300691489146e-05, + "loss": 5.0482, + "step": 13203 + }, + { + "epoch": 0.07852792844228756, + "grad_norm": 1.7149171829223633, + "learning_rate": 4.924318663503734e-05, + "loss": 5.4713, + "step": 13204 + }, + { + "epoch": 0.07853387572556857, + "grad_norm": 1.770279049873352, + "learning_rate": 4.924307257012248e-05, + "loss": 5.5565, + "step": 13205 + }, + { + "epoch": 0.07853982300884955, + "grad_norm": 2.043506145477295, + "learning_rate": 4.924295849674463e-05, + "loss": 4.9129, + "step": 13206 + }, + { + "epoch": 0.07854577029213056, + "grad_norm": 1.91255521774292, + "learning_rate": 4.92428444149038e-05, + "loss": 5.5405, + "step": 13207 + }, + { + "epoch": 0.07855171757541156, + "grad_norm": 2.371006965637207, + "learning_rate": 4.924273032460005e-05, + "loss": 5.8047, + "step": 13208 + }, + { + "epoch": 0.07855766485869255, + "grad_norm": 2.1126253604888916, + "learning_rate": 4.9242616225833416e-05, + "loss": 5.6397, + "step": 13209 + }, + { + "epoch": 0.07856361214197355, + "grad_norm": 1.9398634433746338, + "learning_rate": 4.9242502118603925e-05, + "loss": 5.7703, + "step": 13210 + }, + { + "epoch": 0.07856955942525454, + "grad_norm": 1.7660777568817139, + "learning_rate": 4.924238800291164e-05, + "loss": 5.6485, + "step": 13211 + }, + { + "epoch": 0.07857550670853554, + "grad_norm": 1.835633397102356, + "learning_rate": 4.924227387875658e-05, + "loss": 5.701, + "step": 13212 + }, + { + "epoch": 0.07858145399181654, + "grad_norm": 1.8192920684814453, + "learning_rate": 4.9242159746138796e-05, + "loss": 5.5682, + "step": 13213 + }, + { + "epoch": 0.07858740127509753, + "grad_norm": 1.8342156410217285, + "learning_rate": 4.924204560505832e-05, + "loss": 5.2546, + "step": 13214 + }, + { + "epoch": 0.07859334855837853, + "grad_norm": 1.855446696281433, + "learning_rate": 4.92419314555152e-05, + "loss": 5.7471, + "step": 13215 + }, + { + "epoch": 0.07859929584165953, + "grad_norm": 1.7786341905593872, + "learning_rate": 4.924181729750946e-05, + "loss": 5.8774, + "step": 13216 + }, + { + "epoch": 0.07860524312494052, + "grad_norm": 1.7919361591339111, + "learning_rate": 4.9241703131041175e-05, + "loss": 5.7796, + "step": 13217 + }, + { + "epoch": 0.07861119040822152, + "grad_norm": 2.1065824031829834, + "learning_rate": 4.924158895611034e-05, + "loss": 5.2471, + "step": 13218 + }, + { + "epoch": 0.07861713769150253, + "grad_norm": 2.18803334236145, + "learning_rate": 4.9241474772717036e-05, + "loss": 4.8654, + "step": 13219 + }, + { + "epoch": 0.07862308497478351, + "grad_norm": 2.156651020050049, + "learning_rate": 4.924136058086127e-05, + "loss": 4.7614, + "step": 13220 + }, + { + "epoch": 0.07862903225806452, + "grad_norm": 2.098242998123169, + "learning_rate": 4.9241246380543095e-05, + "loss": 4.8152, + "step": 13221 + }, + { + "epoch": 0.07863497954134552, + "grad_norm": 1.9857498407363892, + "learning_rate": 4.924113217176256e-05, + "loss": 4.7955, + "step": 13222 + }, + { + "epoch": 0.0786409268246265, + "grad_norm": 2.046926259994507, + "learning_rate": 4.9241017954519685e-05, + "loss": 4.9851, + "step": 13223 + }, + { + "epoch": 0.07864687410790751, + "grad_norm": 1.804005742073059, + "learning_rate": 4.924090372881454e-05, + "loss": 5.5084, + "step": 13224 + }, + { + "epoch": 0.07865282139118851, + "grad_norm": 1.8413509130477905, + "learning_rate": 4.924078949464713e-05, + "loss": 5.462, + "step": 13225 + }, + { + "epoch": 0.0786587686744695, + "grad_norm": 1.7599927186965942, + "learning_rate": 4.924067525201751e-05, + "loss": 5.4255, + "step": 13226 + }, + { + "epoch": 0.0786647159577505, + "grad_norm": 1.7645682096481323, + "learning_rate": 4.924056100092573e-05, + "loss": 5.4837, + "step": 13227 + }, + { + "epoch": 0.0786706632410315, + "grad_norm": 1.7478766441345215, + "learning_rate": 4.924044674137182e-05, + "loss": 5.2957, + "step": 13228 + }, + { + "epoch": 0.07867661052431249, + "grad_norm": 1.7865453958511353, + "learning_rate": 4.924033247335581e-05, + "loss": 5.1909, + "step": 13229 + }, + { + "epoch": 0.07868255780759349, + "grad_norm": 1.8167400360107422, + "learning_rate": 4.924021819687776e-05, + "loss": 5.2732, + "step": 13230 + }, + { + "epoch": 0.0786885050908745, + "grad_norm": 1.8745819330215454, + "learning_rate": 4.92401039119377e-05, + "loss": 5.3222, + "step": 13231 + }, + { + "epoch": 0.07869445237415548, + "grad_norm": 1.7355458736419678, + "learning_rate": 4.9239989618535665e-05, + "loss": 5.4142, + "step": 13232 + }, + { + "epoch": 0.07870039965743648, + "grad_norm": 1.7634247541427612, + "learning_rate": 4.9239875316671705e-05, + "loss": 5.3114, + "step": 13233 + }, + { + "epoch": 0.07870634694071749, + "grad_norm": 1.8516123294830322, + "learning_rate": 4.9239761006345845e-05, + "loss": 5.3014, + "step": 13234 + }, + { + "epoch": 0.07871229422399847, + "grad_norm": 1.8192317485809326, + "learning_rate": 4.9239646687558146e-05, + "loss": 5.407, + "step": 13235 + }, + { + "epoch": 0.07871824150727948, + "grad_norm": 1.6944139003753662, + "learning_rate": 4.923953236030863e-05, + "loss": 5.4235, + "step": 13236 + }, + { + "epoch": 0.07872418879056048, + "grad_norm": 1.681746006011963, + "learning_rate": 4.923941802459735e-05, + "loss": 5.3367, + "step": 13237 + }, + { + "epoch": 0.07873013607384147, + "grad_norm": 1.6417745351791382, + "learning_rate": 4.9239303680424334e-05, + "loss": 5.253, + "step": 13238 + }, + { + "epoch": 0.07873608335712247, + "grad_norm": 1.6522557735443115, + "learning_rate": 4.9239189327789626e-05, + "loss": 5.0855, + "step": 13239 + }, + { + "epoch": 0.07874203064040346, + "grad_norm": 1.7547293901443481, + "learning_rate": 4.9239074966693275e-05, + "loss": 5.9017, + "step": 13240 + }, + { + "epoch": 0.07874797792368446, + "grad_norm": 1.998478889465332, + "learning_rate": 4.923896059713531e-05, + "loss": 5.4774, + "step": 13241 + }, + { + "epoch": 0.07875392520696546, + "grad_norm": 1.869710922241211, + "learning_rate": 4.9238846219115774e-05, + "loss": 5.4591, + "step": 13242 + }, + { + "epoch": 0.07875987249024645, + "grad_norm": 1.8957170248031616, + "learning_rate": 4.923873183263471e-05, + "loss": 5.2823, + "step": 13243 + }, + { + "epoch": 0.07876581977352745, + "grad_norm": 1.9052289724349976, + "learning_rate": 4.9238617437692146e-05, + "loss": 5.4753, + "step": 13244 + }, + { + "epoch": 0.07877176705680845, + "grad_norm": 1.8786853551864624, + "learning_rate": 4.923850303428814e-05, + "loss": 5.2234, + "step": 13245 + }, + { + "epoch": 0.07877771434008944, + "grad_norm": 2.298356533050537, + "learning_rate": 4.923838862242271e-05, + "loss": 4.7138, + "step": 13246 + }, + { + "epoch": 0.07878366162337044, + "grad_norm": 2.1191911697387695, + "learning_rate": 4.923827420209592e-05, + "loss": 4.6354, + "step": 13247 + }, + { + "epoch": 0.07878960890665145, + "grad_norm": 2.1735050678253174, + "learning_rate": 4.923815977330781e-05, + "loss": 4.454, + "step": 13248 + }, + { + "epoch": 0.07879555618993243, + "grad_norm": 2.0126335620880127, + "learning_rate": 4.923804533605839e-05, + "loss": 4.3387, + "step": 13249 + }, + { + "epoch": 0.07880150347321344, + "grad_norm": 2.00081729888916, + "learning_rate": 4.9237930890347726e-05, + "loss": 4.4009, + "step": 13250 + }, + { + "epoch": 0.07880745075649444, + "grad_norm": 2.198625326156616, + "learning_rate": 4.923781643617586e-05, + "loss": 4.4334, + "step": 13251 + }, + { + "epoch": 0.07881339803977543, + "grad_norm": 2.0630993843078613, + "learning_rate": 4.923770197354281e-05, + "loss": 4.6349, + "step": 13252 + }, + { + "epoch": 0.07881934532305643, + "grad_norm": 1.7470935583114624, + "learning_rate": 4.923758750244863e-05, + "loss": 5.1363, + "step": 13253 + }, + { + "epoch": 0.07882529260633743, + "grad_norm": 1.5461190938949585, + "learning_rate": 4.923747302289335e-05, + "loss": 5.7365, + "step": 13254 + }, + { + "epoch": 0.07883123988961842, + "grad_norm": 1.800528645515442, + "learning_rate": 4.9237358534877036e-05, + "loss": 5.949, + "step": 13255 + }, + { + "epoch": 0.07883718717289942, + "grad_norm": 2.096055746078491, + "learning_rate": 4.923724403839971e-05, + "loss": 5.4203, + "step": 13256 + }, + { + "epoch": 0.07884313445618042, + "grad_norm": 2.0838513374328613, + "learning_rate": 4.92371295334614e-05, + "loss": 5.0542, + "step": 13257 + }, + { + "epoch": 0.07884908173946141, + "grad_norm": 1.711534023284912, + "learning_rate": 4.923701502006217e-05, + "loss": 5.7168, + "step": 13258 + }, + { + "epoch": 0.07885502902274241, + "grad_norm": 1.6610822677612305, + "learning_rate": 4.9236900498202035e-05, + "loss": 5.5605, + "step": 13259 + }, + { + "epoch": 0.07886097630602341, + "grad_norm": 1.549854040145874, + "learning_rate": 4.9236785967881064e-05, + "loss": 5.7792, + "step": 13260 + }, + { + "epoch": 0.0788669235893044, + "grad_norm": 1.9194339513778687, + "learning_rate": 4.923667142909927e-05, + "loss": 5.5481, + "step": 13261 + }, + { + "epoch": 0.0788728708725854, + "grad_norm": 1.6644178628921509, + "learning_rate": 4.923655688185671e-05, + "loss": 5.7271, + "step": 13262 + }, + { + "epoch": 0.0788788181558664, + "grad_norm": 1.820898175239563, + "learning_rate": 4.9236442326153414e-05, + "loss": 6.2458, + "step": 13263 + }, + { + "epoch": 0.0788847654391474, + "grad_norm": 1.732539176940918, + "learning_rate": 4.923632776198943e-05, + "loss": 5.5854, + "step": 13264 + }, + { + "epoch": 0.0788907127224284, + "grad_norm": 1.769140601158142, + "learning_rate": 4.923621318936479e-05, + "loss": 5.5511, + "step": 13265 + }, + { + "epoch": 0.0788966600057094, + "grad_norm": 1.728833556175232, + "learning_rate": 4.923609860827955e-05, + "loss": 5.6215, + "step": 13266 + }, + { + "epoch": 0.07890260728899039, + "grad_norm": 1.5940407514572144, + "learning_rate": 4.923598401873373e-05, + "loss": 5.6572, + "step": 13267 + }, + { + "epoch": 0.07890855457227139, + "grad_norm": 2.153200149536133, + "learning_rate": 4.923586942072737e-05, + "loss": 5.0235, + "step": 13268 + }, + { + "epoch": 0.07891450185555238, + "grad_norm": 1.6448415517807007, + "learning_rate": 4.9235754814260526e-05, + "loss": 5.5353, + "step": 13269 + }, + { + "epoch": 0.07892044913883338, + "grad_norm": 1.706984281539917, + "learning_rate": 4.9235640199333235e-05, + "loss": 5.5278, + "step": 13270 + }, + { + "epoch": 0.07892639642211438, + "grad_norm": 1.6129798889160156, + "learning_rate": 4.923552557594553e-05, + "loss": 5.4643, + "step": 13271 + }, + { + "epoch": 0.07893234370539537, + "grad_norm": 1.612748384475708, + "learning_rate": 4.923541094409745e-05, + "loss": 5.4994, + "step": 13272 + }, + { + "epoch": 0.07893829098867637, + "grad_norm": 1.6947647333145142, + "learning_rate": 4.923529630378904e-05, + "loss": 5.5117, + "step": 13273 + }, + { + "epoch": 0.07894423827195737, + "grad_norm": 1.629684567451477, + "learning_rate": 4.9235181655020336e-05, + "loss": 5.4266, + "step": 13274 + }, + { + "epoch": 0.07895018555523836, + "grad_norm": 1.6417474746704102, + "learning_rate": 4.923506699779139e-05, + "loss": 5.4803, + "step": 13275 + }, + { + "epoch": 0.07895613283851936, + "grad_norm": 1.5188243389129639, + "learning_rate": 4.9234952332102226e-05, + "loss": 5.4066, + "step": 13276 + }, + { + "epoch": 0.07896208012180037, + "grad_norm": 1.4906466007232666, + "learning_rate": 4.9234837657952885e-05, + "loss": 5.4622, + "step": 13277 + }, + { + "epoch": 0.07896802740508135, + "grad_norm": 1.745351791381836, + "learning_rate": 4.9234722975343414e-05, + "loss": 5.458, + "step": 13278 + }, + { + "epoch": 0.07897397468836236, + "grad_norm": 1.734399676322937, + "learning_rate": 4.9234608284273866e-05, + "loss": 5.3542, + "step": 13279 + }, + { + "epoch": 0.07897992197164336, + "grad_norm": 2.396031379699707, + "learning_rate": 4.9234493584744254e-05, + "loss": 5.0978, + "step": 13280 + }, + { + "epoch": 0.07898586925492435, + "grad_norm": 2.0151939392089844, + "learning_rate": 4.9234378876754626e-05, + "loss": 5.5051, + "step": 13281 + }, + { + "epoch": 0.07899181653820535, + "grad_norm": 2.1796762943267822, + "learning_rate": 4.9234264160305036e-05, + "loss": 5.2788, + "step": 13282 + }, + { + "epoch": 0.07899776382148635, + "grad_norm": 2.069291830062866, + "learning_rate": 4.923414943539552e-05, + "loss": 5.4454, + "step": 13283 + }, + { + "epoch": 0.07900371110476734, + "grad_norm": 2.034498929977417, + "learning_rate": 4.92340347020261e-05, + "loss": 5.3849, + "step": 13284 + }, + { + "epoch": 0.07900965838804834, + "grad_norm": 1.8353052139282227, + "learning_rate": 4.9233919960196835e-05, + "loss": 5.3975, + "step": 13285 + }, + { + "epoch": 0.07901560567132934, + "grad_norm": 1.9896777868270874, + "learning_rate": 4.923380520990776e-05, + "loss": 5.1199, + "step": 13286 + }, + { + "epoch": 0.07902155295461033, + "grad_norm": 1.9539830684661865, + "learning_rate": 4.923369045115891e-05, + "loss": 5.3908, + "step": 13287 + }, + { + "epoch": 0.07902750023789133, + "grad_norm": 1.682651162147522, + "learning_rate": 4.923357568395033e-05, + "loss": 5.4719, + "step": 13288 + }, + { + "epoch": 0.07903344752117233, + "grad_norm": 2.0095672607421875, + "learning_rate": 4.923346090828206e-05, + "loss": 5.9258, + "step": 13289 + }, + { + "epoch": 0.07903939480445332, + "grad_norm": 1.7949076890945435, + "learning_rate": 4.923334612415413e-05, + "loss": 5.646, + "step": 13290 + }, + { + "epoch": 0.07904534208773432, + "grad_norm": 2.1651079654693604, + "learning_rate": 4.92332313315666e-05, + "loss": 5.2527, + "step": 13291 + }, + { + "epoch": 0.07905128937101533, + "grad_norm": 2.0362184047698975, + "learning_rate": 4.92331165305195e-05, + "loss": 5.2671, + "step": 13292 + }, + { + "epoch": 0.07905723665429631, + "grad_norm": 1.5425541400909424, + "learning_rate": 4.923300172101287e-05, + "loss": 5.5149, + "step": 13293 + }, + { + "epoch": 0.07906318393757732, + "grad_norm": 2.13031005859375, + "learning_rate": 4.923288690304675e-05, + "loss": 5.9304, + "step": 13294 + }, + { + "epoch": 0.07906913122085832, + "grad_norm": 2.165199041366577, + "learning_rate": 4.923277207662117e-05, + "loss": 5.9153, + "step": 13295 + }, + { + "epoch": 0.0790750785041393, + "grad_norm": 2.1479499340057373, + "learning_rate": 4.923265724173619e-05, + "loss": 5.7215, + "step": 13296 + }, + { + "epoch": 0.07908102578742031, + "grad_norm": 1.8908145427703857, + "learning_rate": 4.923254239839183e-05, + "loss": 5.5801, + "step": 13297 + }, + { + "epoch": 0.0790869730707013, + "grad_norm": 1.7739901542663574, + "learning_rate": 4.9232427546588145e-05, + "loss": 5.283, + "step": 13298 + }, + { + "epoch": 0.0790929203539823, + "grad_norm": 1.8153715133666992, + "learning_rate": 4.9232312686325175e-05, + "loss": 5.4626, + "step": 13299 + }, + { + "epoch": 0.0790988676372633, + "grad_norm": 1.7070518732070923, + "learning_rate": 4.923219781760295e-05, + "loss": 5.5246, + "step": 13300 + }, + { + "epoch": 0.07910481492054429, + "grad_norm": 2.161536455154419, + "learning_rate": 4.923208294042152e-05, + "loss": 5.6865, + "step": 13301 + }, + { + "epoch": 0.07911076220382529, + "grad_norm": 2.5373623371124268, + "learning_rate": 4.9231968054780905e-05, + "loss": 5.8634, + "step": 13302 + }, + { + "epoch": 0.0791167094871063, + "grad_norm": 2.4957666397094727, + "learning_rate": 4.923185316068117e-05, + "loss": 4.9065, + "step": 13303 + }, + { + "epoch": 0.07912265677038728, + "grad_norm": 2.260540246963501, + "learning_rate": 4.923173825812235e-05, + "loss": 5.0815, + "step": 13304 + }, + { + "epoch": 0.07912860405366828, + "grad_norm": 2.406765937805176, + "learning_rate": 4.923162334710448e-05, + "loss": 4.8599, + "step": 13305 + }, + { + "epoch": 0.07913455133694929, + "grad_norm": 2.282153606414795, + "learning_rate": 4.923150842762759e-05, + "loss": 5.1024, + "step": 13306 + }, + { + "epoch": 0.07914049862023027, + "grad_norm": 1.8351432085037231, + "learning_rate": 4.9231393499691744e-05, + "loss": 5.3715, + "step": 13307 + }, + { + "epoch": 0.07914644590351128, + "grad_norm": 1.8290963172912598, + "learning_rate": 4.9231278563296965e-05, + "loss": 5.4456, + "step": 13308 + }, + { + "epoch": 0.07915239318679228, + "grad_norm": 1.7157766819000244, + "learning_rate": 4.923116361844329e-05, + "loss": 5.4952, + "step": 13309 + }, + { + "epoch": 0.07915834047007327, + "grad_norm": 2.051391124725342, + "learning_rate": 4.923104866513077e-05, + "loss": 5.7754, + "step": 13310 + }, + { + "epoch": 0.07916428775335427, + "grad_norm": 1.8714796304702759, + "learning_rate": 4.923093370335944e-05, + "loss": 5.4118, + "step": 13311 + }, + { + "epoch": 0.07917023503663527, + "grad_norm": 2.4251246452331543, + "learning_rate": 4.923081873312935e-05, + "loss": 4.9677, + "step": 13312 + }, + { + "epoch": 0.07917618231991626, + "grad_norm": 3.490328550338745, + "learning_rate": 4.923070375444052e-05, + "loss": 4.5336, + "step": 13313 + }, + { + "epoch": 0.07918212960319726, + "grad_norm": 2.820434331893921, + "learning_rate": 4.9230588767293004e-05, + "loss": 4.2865, + "step": 13314 + }, + { + "epoch": 0.07918807688647826, + "grad_norm": 2.3713653087615967, + "learning_rate": 4.923047377168685e-05, + "loss": 4.2558, + "step": 13315 + }, + { + "epoch": 0.07919402416975925, + "grad_norm": 2.484199285507202, + "learning_rate": 4.923035876762208e-05, + "loss": 3.9565, + "step": 13316 + }, + { + "epoch": 0.07919997145304025, + "grad_norm": 2.771982431411743, + "learning_rate": 4.9230243755098735e-05, + "loss": 3.9478, + "step": 13317 + }, + { + "epoch": 0.07920591873632125, + "grad_norm": 2.613006591796875, + "learning_rate": 4.9230128734116874e-05, + "loss": 4.0285, + "step": 13318 + }, + { + "epoch": 0.07921186601960224, + "grad_norm": 2.378276824951172, + "learning_rate": 4.923001370467653e-05, + "loss": 4.129, + "step": 13319 + }, + { + "epoch": 0.07921781330288324, + "grad_norm": 2.6948869228363037, + "learning_rate": 4.922989866677772e-05, + "loss": 5.7581, + "step": 13320 + }, + { + "epoch": 0.07922376058616425, + "grad_norm": 2.058387517929077, + "learning_rate": 4.922978362042051e-05, + "loss": 5.7589, + "step": 13321 + }, + { + "epoch": 0.07922970786944523, + "grad_norm": 2.2277138233184814, + "learning_rate": 4.9229668565604936e-05, + "loss": 5.691, + "step": 13322 + }, + { + "epoch": 0.07923565515272624, + "grad_norm": 1.827525019645691, + "learning_rate": 4.922955350233104e-05, + "loss": 5.6555, + "step": 13323 + }, + { + "epoch": 0.07924160243600724, + "grad_norm": 1.5456974506378174, + "learning_rate": 4.922943843059885e-05, + "loss": 5.445, + "step": 13324 + }, + { + "epoch": 0.07924754971928823, + "grad_norm": 1.859805703163147, + "learning_rate": 4.922932335040842e-05, + "loss": 5.5864, + "step": 13325 + }, + { + "epoch": 0.07925349700256923, + "grad_norm": 2.0083398818969727, + "learning_rate": 4.922920826175977e-05, + "loss": 5.7598, + "step": 13326 + }, + { + "epoch": 0.07925944428585022, + "grad_norm": 1.9759368896484375, + "learning_rate": 4.922909316465296e-05, + "loss": 5.7778, + "step": 13327 + }, + { + "epoch": 0.07926539156913122, + "grad_norm": 1.9937580823898315, + "learning_rate": 4.9228978059088035e-05, + "loss": 5.7291, + "step": 13328 + }, + { + "epoch": 0.07927133885241222, + "grad_norm": 2.6860668659210205, + "learning_rate": 4.922886294506501e-05, + "loss": 5.0277, + "step": 13329 + }, + { + "epoch": 0.07927728613569321, + "grad_norm": 2.03318190574646, + "learning_rate": 4.9228747822583945e-05, + "loss": 5.2387, + "step": 13330 + }, + { + "epoch": 0.07928323341897421, + "grad_norm": 2.250929117202759, + "learning_rate": 4.9228632691644874e-05, + "loss": 5.2348, + "step": 13331 + }, + { + "epoch": 0.07928918070225521, + "grad_norm": 2.0255093574523926, + "learning_rate": 4.922851755224784e-05, + "loss": 5.6585, + "step": 13332 + }, + { + "epoch": 0.0792951279855362, + "grad_norm": 1.9353551864624023, + "learning_rate": 4.922840240439288e-05, + "loss": 5.3989, + "step": 13333 + }, + { + "epoch": 0.0793010752688172, + "grad_norm": 1.9392589330673218, + "learning_rate": 4.922828724808003e-05, + "loss": 5.9127, + "step": 13334 + }, + { + "epoch": 0.0793070225520982, + "grad_norm": 2.312340021133423, + "learning_rate": 4.922817208330934e-05, + "loss": 5.656, + "step": 13335 + }, + { + "epoch": 0.0793129698353792, + "grad_norm": 2.1480720043182373, + "learning_rate": 4.9228056910080845e-05, + "loss": 5.4582, + "step": 13336 + }, + { + "epoch": 0.0793189171186602, + "grad_norm": 2.0460312366485596, + "learning_rate": 4.922794172839458e-05, + "loss": 5.5177, + "step": 13337 + }, + { + "epoch": 0.0793248644019412, + "grad_norm": 1.8319480419158936, + "learning_rate": 4.92278265382506e-05, + "loss": 5.5872, + "step": 13338 + }, + { + "epoch": 0.07933081168522219, + "grad_norm": 1.610379934310913, + "learning_rate": 4.922771133964893e-05, + "loss": 5.5398, + "step": 13339 + }, + { + "epoch": 0.07933675896850319, + "grad_norm": 1.767022728919983, + "learning_rate": 4.9227596132589616e-05, + "loss": 6.0004, + "step": 13340 + }, + { + "epoch": 0.07934270625178419, + "grad_norm": 2.108621835708618, + "learning_rate": 4.92274809170727e-05, + "loss": 5.1513, + "step": 13341 + }, + { + "epoch": 0.07934865353506518, + "grad_norm": 2.2562835216522217, + "learning_rate": 4.922736569309822e-05, + "loss": 4.7642, + "step": 13342 + }, + { + "epoch": 0.07935460081834618, + "grad_norm": 1.7953063249588013, + "learning_rate": 4.922725046066622e-05, + "loss": 5.2453, + "step": 13343 + }, + { + "epoch": 0.07936054810162718, + "grad_norm": 1.8957513570785522, + "learning_rate": 4.922713521977673e-05, + "loss": 5.0673, + "step": 13344 + }, + { + "epoch": 0.07936649538490817, + "grad_norm": 1.8375275135040283, + "learning_rate": 4.922701997042981e-05, + "loss": 5.0301, + "step": 13345 + }, + { + "epoch": 0.07937244266818917, + "grad_norm": 2.306138515472412, + "learning_rate": 4.9226904712625473e-05, + "loss": 4.7415, + "step": 13346 + }, + { + "epoch": 0.07937838995147017, + "grad_norm": 2.058403730392456, + "learning_rate": 4.922678944636379e-05, + "loss": 5.4454, + "step": 13347 + }, + { + "epoch": 0.07938433723475116, + "grad_norm": 1.9230997562408447, + "learning_rate": 4.922667417164477e-05, + "loss": 5.3755, + "step": 13348 + }, + { + "epoch": 0.07939028451803216, + "grad_norm": 1.9053308963775635, + "learning_rate": 4.922655888846848e-05, + "loss": 5.7708, + "step": 13349 + }, + { + "epoch": 0.07939623180131317, + "grad_norm": 1.8009783029556274, + "learning_rate": 4.922644359683494e-05, + "loss": 4.9939, + "step": 13350 + }, + { + "epoch": 0.07940217908459415, + "grad_norm": 1.6748642921447754, + "learning_rate": 4.92263282967442e-05, + "loss": 5.4869, + "step": 13351 + }, + { + "epoch": 0.07940812636787516, + "grad_norm": 1.532475471496582, + "learning_rate": 4.92262129881963e-05, + "loss": 5.755, + "step": 13352 + }, + { + "epoch": 0.07941407365115616, + "grad_norm": 1.513795018196106, + "learning_rate": 4.9226097671191284e-05, + "loss": 5.4083, + "step": 13353 + }, + { + "epoch": 0.07942002093443715, + "grad_norm": 1.66012442111969, + "learning_rate": 4.922598234572918e-05, + "loss": 5.5185, + "step": 13354 + }, + { + "epoch": 0.07942596821771815, + "grad_norm": 1.6519379615783691, + "learning_rate": 4.922586701181005e-05, + "loss": 5.3482, + "step": 13355 + }, + { + "epoch": 0.07943191550099914, + "grad_norm": 1.4444184303283691, + "learning_rate": 4.922575166943391e-05, + "loss": 5.4466, + "step": 13356 + }, + { + "epoch": 0.07943786278428014, + "grad_norm": 1.4603393077850342, + "learning_rate": 4.92256363186008e-05, + "loss": 5.4343, + "step": 13357 + }, + { + "epoch": 0.07944381006756114, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.922552095931078e-05, + "loss": 5.4224, + "step": 13358 + }, + { + "epoch": 0.07944975735084213, + "grad_norm": 1.3054184913635254, + "learning_rate": 4.922540559156389e-05, + "loss": 5.4801, + "step": 13359 + }, + { + "epoch": 0.07945570463412313, + "grad_norm": 1.6295130252838135, + "learning_rate": 4.922529021536015e-05, + "loss": 5.4593, + "step": 13360 + }, + { + "epoch": 0.07946165191740413, + "grad_norm": 1.6684668064117432, + "learning_rate": 4.922517483069962e-05, + "loss": 5.2817, + "step": 13361 + }, + { + "epoch": 0.07946759920068512, + "grad_norm": 1.580409049987793, + "learning_rate": 4.922505943758232e-05, + "loss": 5.4399, + "step": 13362 + }, + { + "epoch": 0.07947354648396612, + "grad_norm": 1.613756775856018, + "learning_rate": 4.922494403600831e-05, + "loss": 5.2646, + "step": 13363 + }, + { + "epoch": 0.07947949376724713, + "grad_norm": 1.4371063709259033, + "learning_rate": 4.9224828625977616e-05, + "loss": 5.2866, + "step": 13364 + }, + { + "epoch": 0.07948544105052811, + "grad_norm": 1.5926525592803955, + "learning_rate": 4.9224713207490294e-05, + "loss": 5.5958, + "step": 13365 + }, + { + "epoch": 0.07949138833380912, + "grad_norm": 1.5216618776321411, + "learning_rate": 4.9224597780546365e-05, + "loss": 5.6094, + "step": 13366 + }, + { + "epoch": 0.07949733561709012, + "grad_norm": 1.7261598110198975, + "learning_rate": 4.922448234514588e-05, + "loss": 5.2781, + "step": 13367 + }, + { + "epoch": 0.0795032829003711, + "grad_norm": 1.6909232139587402, + "learning_rate": 4.922436690128889e-05, + "loss": 5.3299, + "step": 13368 + }, + { + "epoch": 0.07950923018365211, + "grad_norm": 1.6486754417419434, + "learning_rate": 4.922425144897541e-05, + "loss": 5.2478, + "step": 13369 + }, + { + "epoch": 0.07951517746693311, + "grad_norm": 1.4019837379455566, + "learning_rate": 4.922413598820551e-05, + "loss": 5.2383, + "step": 13370 + }, + { + "epoch": 0.0795211247502141, + "grad_norm": 1.7588412761688232, + "learning_rate": 4.92240205189792e-05, + "loss": 5.3224, + "step": 13371 + }, + { + "epoch": 0.0795270720334951, + "grad_norm": 1.5354480743408203, + "learning_rate": 4.922390504129654e-05, + "loss": 5.1617, + "step": 13372 + }, + { + "epoch": 0.0795330193167761, + "grad_norm": 1.5183011293411255, + "learning_rate": 4.922378955515756e-05, + "loss": 5.3082, + "step": 13373 + }, + { + "epoch": 0.07953896660005709, + "grad_norm": 1.436281681060791, + "learning_rate": 4.922367406056232e-05, + "loss": 5.4446, + "step": 13374 + }, + { + "epoch": 0.07954491388333809, + "grad_norm": 1.526934266090393, + "learning_rate": 4.922355855751083e-05, + "loss": 5.3067, + "step": 13375 + }, + { + "epoch": 0.0795508611666191, + "grad_norm": 1.516784906387329, + "learning_rate": 4.922344304600315e-05, + "loss": 5.4982, + "step": 13376 + }, + { + "epoch": 0.07955680844990008, + "grad_norm": 1.5154777765274048, + "learning_rate": 4.922332752603932e-05, + "loss": 5.3459, + "step": 13377 + }, + { + "epoch": 0.07956275573318108, + "grad_norm": 1.542508840560913, + "learning_rate": 4.9223211997619376e-05, + "loss": 5.3677, + "step": 13378 + }, + { + "epoch": 0.07956870301646209, + "grad_norm": 1.3413010835647583, + "learning_rate": 4.922309646074336e-05, + "loss": 5.2684, + "step": 13379 + }, + { + "epoch": 0.07957465029974307, + "grad_norm": 1.6295002698898315, + "learning_rate": 4.9222980915411306e-05, + "loss": 5.2737, + "step": 13380 + }, + { + "epoch": 0.07958059758302408, + "grad_norm": 1.5810730457305908, + "learning_rate": 4.922286536162326e-05, + "loss": 5.2471, + "step": 13381 + }, + { + "epoch": 0.07958654486630508, + "grad_norm": 1.3186451196670532, + "learning_rate": 4.9222749799379266e-05, + "loss": 5.3081, + "step": 13382 + }, + { + "epoch": 0.07959249214958607, + "grad_norm": 1.3897243738174438, + "learning_rate": 4.922263422867936e-05, + "loss": 5.2658, + "step": 13383 + }, + { + "epoch": 0.07959843943286707, + "grad_norm": 1.3873858451843262, + "learning_rate": 4.922251864952358e-05, + "loss": 5.334, + "step": 13384 + }, + { + "epoch": 0.07960438671614806, + "grad_norm": 1.4205409288406372, + "learning_rate": 4.922240306191197e-05, + "loss": 5.3007, + "step": 13385 + }, + { + "epoch": 0.07961033399942906, + "grad_norm": 1.3726485967636108, + "learning_rate": 4.922228746584457e-05, + "loss": 5.1949, + "step": 13386 + }, + { + "epoch": 0.07961628128271006, + "grad_norm": 1.708837628364563, + "learning_rate": 4.922217186132142e-05, + "loss": 5.2061, + "step": 13387 + }, + { + "epoch": 0.07962222856599105, + "grad_norm": 1.7818368673324585, + "learning_rate": 4.9222056248342556e-05, + "loss": 5.1182, + "step": 13388 + }, + { + "epoch": 0.07962817584927205, + "grad_norm": 1.4941715002059937, + "learning_rate": 4.9221940626908024e-05, + "loss": 5.0899, + "step": 13389 + }, + { + "epoch": 0.07963412313255305, + "grad_norm": 1.3581326007843018, + "learning_rate": 4.922182499701787e-05, + "loss": 5.0551, + "step": 13390 + }, + { + "epoch": 0.07964007041583404, + "grad_norm": 1.5772393941879272, + "learning_rate": 4.922170935867212e-05, + "loss": 5.245, + "step": 13391 + }, + { + "epoch": 0.07964601769911504, + "grad_norm": 1.9635555744171143, + "learning_rate": 4.922159371187082e-05, + "loss": 5.2898, + "step": 13392 + }, + { + "epoch": 0.07965196498239605, + "grad_norm": 1.535050392150879, + "learning_rate": 4.922147805661402e-05, + "loss": 5.2043, + "step": 13393 + }, + { + "epoch": 0.07965791226567703, + "grad_norm": 1.4985787868499756, + "learning_rate": 4.922136239290175e-05, + "loss": 5.1682, + "step": 13394 + }, + { + "epoch": 0.07966385954895804, + "grad_norm": 1.5314218997955322, + "learning_rate": 4.922124672073405e-05, + "loss": 5.321, + "step": 13395 + }, + { + "epoch": 0.07966980683223904, + "grad_norm": 1.440621018409729, + "learning_rate": 4.9221131040110954e-05, + "loss": 5.3013, + "step": 13396 + }, + { + "epoch": 0.07967575411552003, + "grad_norm": 1.5103110074996948, + "learning_rate": 4.9221015351032527e-05, + "loss": 5.2825, + "step": 13397 + }, + { + "epoch": 0.07968170139880103, + "grad_norm": 1.3581254482269287, + "learning_rate": 4.9220899653498786e-05, + "loss": 5.2433, + "step": 13398 + }, + { + "epoch": 0.07968764868208203, + "grad_norm": 1.5673763751983643, + "learning_rate": 4.922078394750978e-05, + "loss": 5.2279, + "step": 13399 + }, + { + "epoch": 0.07969359596536302, + "grad_norm": 1.5550049543380737, + "learning_rate": 4.922066823306555e-05, + "loss": 5.0406, + "step": 13400 + }, + { + "epoch": 0.07969954324864402, + "grad_norm": 1.6366932392120361, + "learning_rate": 4.922055251016613e-05, + "loss": 5.1299, + "step": 13401 + }, + { + "epoch": 0.07970549053192502, + "grad_norm": 1.45979642868042, + "learning_rate": 4.922043677881157e-05, + "loss": 4.9527, + "step": 13402 + }, + { + "epoch": 0.07971143781520601, + "grad_norm": 1.594494104385376, + "learning_rate": 4.922032103900191e-05, + "loss": 5.6511, + "step": 13403 + }, + { + "epoch": 0.07971738509848701, + "grad_norm": 1.419045329093933, + "learning_rate": 4.9220205290737175e-05, + "loss": 5.0936, + "step": 13404 + }, + { + "epoch": 0.07972333238176801, + "grad_norm": 1.5998183488845825, + "learning_rate": 4.922008953401742e-05, + "loss": 5.2774, + "step": 13405 + }, + { + "epoch": 0.079729279665049, + "grad_norm": 1.3942409753799438, + "learning_rate": 4.9219973768842685e-05, + "loss": 5.5466, + "step": 13406 + }, + { + "epoch": 0.07973522694833, + "grad_norm": 1.4478344917297363, + "learning_rate": 4.9219857995213015e-05, + "loss": 5.5757, + "step": 13407 + }, + { + "epoch": 0.079741174231611, + "grad_norm": 1.4197556972503662, + "learning_rate": 4.921974221312843e-05, + "loss": 5.3194, + "step": 13408 + }, + { + "epoch": 0.079747121514892, + "grad_norm": 1.7690924406051636, + "learning_rate": 4.9219626422588996e-05, + "loss": 5.3551, + "step": 13409 + }, + { + "epoch": 0.079753068798173, + "grad_norm": 1.8233799934387207, + "learning_rate": 4.921951062359473e-05, + "loss": 5.3143, + "step": 13410 + }, + { + "epoch": 0.079759016081454, + "grad_norm": 1.738848090171814, + "learning_rate": 4.921939481614568e-05, + "loss": 5.0194, + "step": 13411 + }, + { + "epoch": 0.07976496336473499, + "grad_norm": 1.6401729583740234, + "learning_rate": 4.92192790002419e-05, + "loss": 5.3347, + "step": 13412 + }, + { + "epoch": 0.07977091064801599, + "grad_norm": 1.425485372543335, + "learning_rate": 4.921916317588341e-05, + "loss": 5.0384, + "step": 13413 + }, + { + "epoch": 0.07977685793129698, + "grad_norm": 1.6337133646011353, + "learning_rate": 4.921904734307027e-05, + "loss": 5.3213, + "step": 13414 + }, + { + "epoch": 0.07978280521457798, + "grad_norm": 1.561292052268982, + "learning_rate": 4.92189315018025e-05, + "loss": 5.1502, + "step": 13415 + }, + { + "epoch": 0.07978875249785898, + "grad_norm": 1.6225664615631104, + "learning_rate": 4.921881565208016e-05, + "loss": 5.2638, + "step": 13416 + }, + { + "epoch": 0.07979469978113997, + "grad_norm": 1.5074353218078613, + "learning_rate": 4.921869979390328e-05, + "loss": 5.0872, + "step": 13417 + }, + { + "epoch": 0.07980064706442097, + "grad_norm": 1.4769634008407593, + "learning_rate": 4.92185839272719e-05, + "loss": 5.1341, + "step": 13418 + }, + { + "epoch": 0.07980659434770197, + "grad_norm": 1.5929937362670898, + "learning_rate": 4.921846805218607e-05, + "loss": 5.2799, + "step": 13419 + }, + { + "epoch": 0.07981254163098296, + "grad_norm": 1.4583854675292969, + "learning_rate": 4.921835216864581e-05, + "loss": 5.0822, + "step": 13420 + }, + { + "epoch": 0.07981848891426396, + "grad_norm": 1.4904375076293945, + "learning_rate": 4.921823627665119e-05, + "loss": 5.055, + "step": 13421 + }, + { + "epoch": 0.07982443619754497, + "grad_norm": 1.6971831321716309, + "learning_rate": 4.921812037620221e-05, + "loss": 5.1968, + "step": 13422 + }, + { + "epoch": 0.07983038348082595, + "grad_norm": 1.5604689121246338, + "learning_rate": 4.9218004467298956e-05, + "loss": 4.9681, + "step": 13423 + }, + { + "epoch": 0.07983633076410696, + "grad_norm": 1.678427815437317, + "learning_rate": 4.9217888549941436e-05, + "loss": 5.2044, + "step": 13424 + }, + { + "epoch": 0.07984227804738796, + "grad_norm": 1.521996259689331, + "learning_rate": 4.921777262412971e-05, + "loss": 4.9741, + "step": 13425 + }, + { + "epoch": 0.07984822533066895, + "grad_norm": 1.5315868854522705, + "learning_rate": 4.92176566898638e-05, + "loss": 5.0064, + "step": 13426 + }, + { + "epoch": 0.07985417261394995, + "grad_norm": 1.465867280960083, + "learning_rate": 4.9217540747143765e-05, + "loss": 4.942, + "step": 13427 + }, + { + "epoch": 0.07986011989723095, + "grad_norm": 1.4323827028274536, + "learning_rate": 4.9217424795969634e-05, + "loss": 4.8934, + "step": 13428 + }, + { + "epoch": 0.07986606718051194, + "grad_norm": 1.4645717144012451, + "learning_rate": 4.921730883634145e-05, + "loss": 5.0473, + "step": 13429 + }, + { + "epoch": 0.07987201446379294, + "grad_norm": 1.5992658138275146, + "learning_rate": 4.9217192868259246e-05, + "loss": 4.8968, + "step": 13430 + }, + { + "epoch": 0.07987796174707394, + "grad_norm": 1.4294894933700562, + "learning_rate": 4.921707689172308e-05, + "loss": 5.0719, + "step": 13431 + }, + { + "epoch": 0.07988390903035493, + "grad_norm": 1.5885019302368164, + "learning_rate": 4.921696090673298e-05, + "loss": 5.1505, + "step": 13432 + }, + { + "epoch": 0.07988985631363593, + "grad_norm": 1.4929580688476562, + "learning_rate": 4.921684491328898e-05, + "loss": 5.016, + "step": 13433 + }, + { + "epoch": 0.07989580359691693, + "grad_norm": 1.4980381727218628, + "learning_rate": 4.921672891139114e-05, + "loss": 5.0601, + "step": 13434 + }, + { + "epoch": 0.07990175088019792, + "grad_norm": 1.5698089599609375, + "learning_rate": 4.9216612901039495e-05, + "loss": 5.0251, + "step": 13435 + }, + { + "epoch": 0.07990769816347892, + "grad_norm": 1.459037184715271, + "learning_rate": 4.921649688223407e-05, + "loss": 4.8417, + "step": 13436 + }, + { + "epoch": 0.07991364544675993, + "grad_norm": 1.5418161153793335, + "learning_rate": 4.921638085497492e-05, + "loss": 5.1989, + "step": 13437 + }, + { + "epoch": 0.07991959273004091, + "grad_norm": 1.546325922012329, + "learning_rate": 4.9216264819262084e-05, + "loss": 5.3004, + "step": 13438 + }, + { + "epoch": 0.07992554001332192, + "grad_norm": 1.5820508003234863, + "learning_rate": 4.9216148775095594e-05, + "loss": 5.3327, + "step": 13439 + }, + { + "epoch": 0.07993148729660292, + "grad_norm": 1.5077866315841675, + "learning_rate": 4.9216032722475504e-05, + "loss": 5.2423, + "step": 13440 + }, + { + "epoch": 0.0799374345798839, + "grad_norm": 1.3654597997665405, + "learning_rate": 4.921591666140184e-05, + "loss": 5.1563, + "step": 13441 + }, + { + "epoch": 0.07994338186316491, + "grad_norm": 1.6721473932266235, + "learning_rate": 4.921580059187466e-05, + "loss": 5.1848, + "step": 13442 + }, + { + "epoch": 0.0799493291464459, + "grad_norm": 1.5349076986312866, + "learning_rate": 4.921568451389398e-05, + "loss": 5.1836, + "step": 13443 + }, + { + "epoch": 0.0799552764297269, + "grad_norm": 1.6246919631958008, + "learning_rate": 4.921556842745987e-05, + "loss": 4.8715, + "step": 13444 + }, + { + "epoch": 0.0799612237130079, + "grad_norm": 1.5361920595169067, + "learning_rate": 4.921545233257234e-05, + "loss": 4.8203, + "step": 13445 + }, + { + "epoch": 0.07996717099628889, + "grad_norm": 1.6185765266418457, + "learning_rate": 4.921533622923146e-05, + "loss": 4.8039, + "step": 13446 + }, + { + "epoch": 0.07997311827956989, + "grad_norm": 1.402462363243103, + "learning_rate": 4.9215220117437246e-05, + "loss": 4.8524, + "step": 13447 + }, + { + "epoch": 0.07997906556285089, + "grad_norm": 1.5282337665557861, + "learning_rate": 4.921510399718975e-05, + "loss": 4.8081, + "step": 13448 + }, + { + "epoch": 0.07998501284613188, + "grad_norm": 1.336254596710205, + "learning_rate": 4.921498786848902e-05, + "loss": 4.8468, + "step": 13449 + }, + { + "epoch": 0.07999096012941288, + "grad_norm": 1.4701998233795166, + "learning_rate": 4.921487173133508e-05, + "loss": 4.6873, + "step": 13450 + }, + { + "epoch": 0.07999690741269389, + "grad_norm": 1.6340824365615845, + "learning_rate": 4.921475558572798e-05, + "loss": 4.6779, + "step": 13451 + }, + { + "epoch": 0.08000285469597487, + "grad_norm": 1.557027816772461, + "learning_rate": 4.921463943166775e-05, + "loss": 4.6467, + "step": 13452 + }, + { + "epoch": 0.08000880197925588, + "grad_norm": 1.6390316486358643, + "learning_rate": 4.9214523269154454e-05, + "loss": 4.7376, + "step": 13453 + }, + { + "epoch": 0.08001474926253688, + "grad_norm": 2.3929800987243652, + "learning_rate": 4.921440709818811e-05, + "loss": 5.2623, + "step": 13454 + }, + { + "epoch": 0.08002069654581787, + "grad_norm": 1.5896660089492798, + "learning_rate": 4.921429091876877e-05, + "loss": 4.6952, + "step": 13455 + }, + { + "epoch": 0.08002664382909887, + "grad_norm": 1.6705348491668701, + "learning_rate": 4.921417473089647e-05, + "loss": 4.7963, + "step": 13456 + }, + { + "epoch": 0.08003259111237987, + "grad_norm": 1.5925310850143433, + "learning_rate": 4.9214058534571253e-05, + "loss": 4.7398, + "step": 13457 + }, + { + "epoch": 0.08003853839566086, + "grad_norm": 1.5314396619796753, + "learning_rate": 4.921394232979316e-05, + "loss": 4.7578, + "step": 13458 + }, + { + "epoch": 0.08004448567894186, + "grad_norm": 1.6665661334991455, + "learning_rate": 4.921382611656222e-05, + "loss": 4.7767, + "step": 13459 + }, + { + "epoch": 0.08005043296222286, + "grad_norm": 1.5145021677017212, + "learning_rate": 4.9213709894878495e-05, + "loss": 4.7892, + "step": 13460 + }, + { + "epoch": 0.08005638024550385, + "grad_norm": 1.8332866430282593, + "learning_rate": 4.921359366474201e-05, + "loss": 4.6434, + "step": 13461 + }, + { + "epoch": 0.08006232752878485, + "grad_norm": 1.467970371246338, + "learning_rate": 4.921347742615281e-05, + "loss": 4.6611, + "step": 13462 + }, + { + "epoch": 0.08006827481206585, + "grad_norm": 1.5667515993118286, + "learning_rate": 4.9213361179110936e-05, + "loss": 4.5792, + "step": 13463 + }, + { + "epoch": 0.08007422209534684, + "grad_norm": 1.5370365381240845, + "learning_rate": 4.9213244923616434e-05, + "loss": 4.6724, + "step": 13464 + }, + { + "epoch": 0.08008016937862784, + "grad_norm": 1.7298029661178589, + "learning_rate": 4.921312865966933e-05, + "loss": 4.7808, + "step": 13465 + }, + { + "epoch": 0.08008611666190885, + "grad_norm": 1.5497710704803467, + "learning_rate": 4.921301238726966e-05, + "loss": 4.8228, + "step": 13466 + }, + { + "epoch": 0.08009206394518983, + "grad_norm": 1.4589923620224, + "learning_rate": 4.92128961064175e-05, + "loss": 4.757, + "step": 13467 + }, + { + "epoch": 0.08009801122847084, + "grad_norm": 1.6503071784973145, + "learning_rate": 4.921277981711286e-05, + "loss": 4.6074, + "step": 13468 + }, + { + "epoch": 0.08010395851175184, + "grad_norm": 1.621209979057312, + "learning_rate": 4.921266351935578e-05, + "loss": 4.6338, + "step": 13469 + }, + { + "epoch": 0.08010990579503283, + "grad_norm": 1.6513469219207764, + "learning_rate": 4.921254721314632e-05, + "loss": 4.7399, + "step": 13470 + }, + { + "epoch": 0.08011585307831383, + "grad_norm": 1.5691003799438477, + "learning_rate": 4.9212430898484505e-05, + "loss": 4.8002, + "step": 13471 + }, + { + "epoch": 0.08012180036159482, + "grad_norm": 1.6764090061187744, + "learning_rate": 4.921231457537039e-05, + "loss": 4.7913, + "step": 13472 + }, + { + "epoch": 0.08012774764487582, + "grad_norm": 1.5193006992340088, + "learning_rate": 4.9212198243804e-05, + "loss": 4.8346, + "step": 13473 + }, + { + "epoch": 0.08013369492815682, + "grad_norm": 1.722706913948059, + "learning_rate": 4.921208190378538e-05, + "loss": 4.6969, + "step": 13474 + }, + { + "epoch": 0.08013964221143781, + "grad_norm": 1.6551017761230469, + "learning_rate": 4.921196555531457e-05, + "loss": 4.6504, + "step": 13475 + }, + { + "epoch": 0.08014558949471881, + "grad_norm": 1.462902307510376, + "learning_rate": 4.921184919839162e-05, + "loss": 4.7678, + "step": 13476 + }, + { + "epoch": 0.08015153677799981, + "grad_norm": 1.4332460165023804, + "learning_rate": 4.9211732833016554e-05, + "loss": 4.7563, + "step": 13477 + }, + { + "epoch": 0.0801574840612808, + "grad_norm": 1.466042160987854, + "learning_rate": 4.9211616459189434e-05, + "loss": 4.7071, + "step": 13478 + }, + { + "epoch": 0.0801634313445618, + "grad_norm": 1.5814018249511719, + "learning_rate": 4.9211500076910275e-05, + "loss": 4.7497, + "step": 13479 + }, + { + "epoch": 0.0801693786278428, + "grad_norm": 1.5666007995605469, + "learning_rate": 4.921138368617915e-05, + "loss": 4.7757, + "step": 13480 + }, + { + "epoch": 0.0801753259111238, + "grad_norm": 1.6804678440093994, + "learning_rate": 4.9211267286996064e-05, + "loss": 4.6921, + "step": 13481 + }, + { + "epoch": 0.0801812731944048, + "grad_norm": 1.6126580238342285, + "learning_rate": 4.921115087936108e-05, + "loss": 4.746, + "step": 13482 + }, + { + "epoch": 0.0801872204776858, + "grad_norm": 1.5597195625305176, + "learning_rate": 4.9211034463274235e-05, + "loss": 4.8135, + "step": 13483 + }, + { + "epoch": 0.08019316776096679, + "grad_norm": 1.4779510498046875, + "learning_rate": 4.9210918038735565e-05, + "loss": 4.9011, + "step": 13484 + }, + { + "epoch": 0.08019911504424779, + "grad_norm": 1.449723243713379, + "learning_rate": 4.921080160574512e-05, + "loss": 4.648, + "step": 13485 + }, + { + "epoch": 0.08020506232752879, + "grad_norm": 1.609134554862976, + "learning_rate": 4.921068516430293e-05, + "loss": 4.6809, + "step": 13486 + }, + { + "epoch": 0.08021100961080978, + "grad_norm": 1.5483453273773193, + "learning_rate": 4.921056871440905e-05, + "loss": 4.7247, + "step": 13487 + }, + { + "epoch": 0.08021695689409078, + "grad_norm": 1.5850282907485962, + "learning_rate": 4.921045225606349e-05, + "loss": 4.6378, + "step": 13488 + }, + { + "epoch": 0.08022290417737178, + "grad_norm": 1.746030569076538, + "learning_rate": 4.9210335789266325e-05, + "loss": 4.6986, + "step": 13489 + }, + { + "epoch": 0.08022885146065277, + "grad_norm": 1.5930465459823608, + "learning_rate": 4.921021931401758e-05, + "loss": 4.6339, + "step": 13490 + }, + { + "epoch": 0.08023479874393377, + "grad_norm": 1.5435012578964233, + "learning_rate": 4.92101028303173e-05, + "loss": 4.5761, + "step": 13491 + }, + { + "epoch": 0.08024074602721477, + "grad_norm": 1.8166500329971313, + "learning_rate": 4.920998633816552e-05, + "loss": 4.5668, + "step": 13492 + }, + { + "epoch": 0.08024669331049576, + "grad_norm": 1.659976601600647, + "learning_rate": 4.920986983756228e-05, + "loss": 4.7431, + "step": 13493 + }, + { + "epoch": 0.08025264059377676, + "grad_norm": 1.6075677871704102, + "learning_rate": 4.920975332850762e-05, + "loss": 4.7744, + "step": 13494 + }, + { + "epoch": 0.08025858787705777, + "grad_norm": 1.6895835399627686, + "learning_rate": 4.9209636811001605e-05, + "loss": 4.638, + "step": 13495 + }, + { + "epoch": 0.08026453516033875, + "grad_norm": 1.4848902225494385, + "learning_rate": 4.9209520285044244e-05, + "loss": 4.7314, + "step": 13496 + }, + { + "epoch": 0.08027048244361976, + "grad_norm": 1.6041605472564697, + "learning_rate": 4.920940375063559e-05, + "loss": 4.7329, + "step": 13497 + }, + { + "epoch": 0.08027642972690076, + "grad_norm": 1.5055692195892334, + "learning_rate": 4.920928720777568e-05, + "loss": 4.721, + "step": 13498 + }, + { + "epoch": 0.08028237701018175, + "grad_norm": 1.3238314390182495, + "learning_rate": 4.920917065646456e-05, + "loss": 5.3071, + "step": 13499 + }, + { + "epoch": 0.08028832429346275, + "grad_norm": 1.463626742362976, + "learning_rate": 4.9209054096702266e-05, + "loss": 5.1885, + "step": 13500 + }, + { + "epoch": 0.08029427157674375, + "grad_norm": 1.4844539165496826, + "learning_rate": 4.9208937528488844e-05, + "loss": 5.2873, + "step": 13501 + }, + { + "epoch": 0.08030021886002474, + "grad_norm": 1.5207467079162598, + "learning_rate": 4.920882095182434e-05, + "loss": 5.1049, + "step": 13502 + }, + { + "epoch": 0.08030616614330574, + "grad_norm": 1.3113683462142944, + "learning_rate": 4.920870436670878e-05, + "loss": 5.1821, + "step": 13503 + }, + { + "epoch": 0.08031211342658673, + "grad_norm": 1.3822054862976074, + "learning_rate": 4.920858777314221e-05, + "loss": 5.1467, + "step": 13504 + }, + { + "epoch": 0.08031806070986773, + "grad_norm": 1.7611572742462158, + "learning_rate": 4.920847117112467e-05, + "loss": 5.0616, + "step": 13505 + }, + { + "epoch": 0.08032400799314873, + "grad_norm": 1.632802963256836, + "learning_rate": 4.920835456065621e-05, + "loss": 5.1535, + "step": 13506 + }, + { + "epoch": 0.08032995527642972, + "grad_norm": 1.6254185438156128, + "learning_rate": 4.920823794173686e-05, + "loss": 5.211, + "step": 13507 + }, + { + "epoch": 0.08033590255971072, + "grad_norm": 1.4769513607025146, + "learning_rate": 4.920812131436666e-05, + "loss": 5.0879, + "step": 13508 + }, + { + "epoch": 0.08034184984299172, + "grad_norm": 1.531504511833191, + "learning_rate": 4.920800467854566e-05, + "loss": 4.9068, + "step": 13509 + }, + { + "epoch": 0.08034779712627271, + "grad_norm": 1.6325825452804565, + "learning_rate": 4.9207888034273895e-05, + "loss": 5.0463, + "step": 13510 + }, + { + "epoch": 0.08035374440955372, + "grad_norm": 1.3797351121902466, + "learning_rate": 4.9207771381551406e-05, + "loss": 5.0644, + "step": 13511 + }, + { + "epoch": 0.08035969169283472, + "grad_norm": 1.7325141429901123, + "learning_rate": 4.920765472037823e-05, + "loss": 4.9095, + "step": 13512 + }, + { + "epoch": 0.0803656389761157, + "grad_norm": 1.3197063207626343, + "learning_rate": 4.920753805075442e-05, + "loss": 5.1837, + "step": 13513 + }, + { + "epoch": 0.08037158625939671, + "grad_norm": 1.532212734222412, + "learning_rate": 4.9207421372680006e-05, + "loss": 5.1011, + "step": 13514 + }, + { + "epoch": 0.08037753354267771, + "grad_norm": 1.2958672046661377, + "learning_rate": 4.9207304686155034e-05, + "loss": 5.1349, + "step": 13515 + }, + { + "epoch": 0.0803834808259587, + "grad_norm": 2.914010524749756, + "learning_rate": 4.9207187991179533e-05, + "loss": 5.4637, + "step": 13516 + }, + { + "epoch": 0.0803894281092397, + "grad_norm": 1.490577220916748, + "learning_rate": 4.920707128775356e-05, + "loss": 5.2322, + "step": 13517 + }, + { + "epoch": 0.0803953753925207, + "grad_norm": 1.5756994485855103, + "learning_rate": 4.920695457587714e-05, + "loss": 5.1501, + "step": 13518 + }, + { + "epoch": 0.08040132267580169, + "grad_norm": 1.7483723163604736, + "learning_rate": 4.920683785555033e-05, + "loss": 5.131, + "step": 13519 + }, + { + "epoch": 0.08040726995908269, + "grad_norm": 1.426866054534912, + "learning_rate": 4.920672112677316e-05, + "loss": 5.5304, + "step": 13520 + }, + { + "epoch": 0.0804132172423637, + "grad_norm": 1.3744142055511475, + "learning_rate": 4.920660438954568e-05, + "loss": 5.1042, + "step": 13521 + }, + { + "epoch": 0.08041916452564468, + "grad_norm": 1.5924170017242432, + "learning_rate": 4.9206487643867916e-05, + "loss": 5.261, + "step": 13522 + }, + { + "epoch": 0.08042511180892568, + "grad_norm": 1.566296935081482, + "learning_rate": 4.920637088973992e-05, + "loss": 5.0451, + "step": 13523 + }, + { + "epoch": 0.08043105909220669, + "grad_norm": 1.4542006254196167, + "learning_rate": 4.9206254127161734e-05, + "loss": 5.0351, + "step": 13524 + }, + { + "epoch": 0.08043700637548767, + "grad_norm": 1.4084336757659912, + "learning_rate": 4.920613735613339e-05, + "loss": 5.1177, + "step": 13525 + }, + { + "epoch": 0.08044295365876868, + "grad_norm": 1.5498062372207642, + "learning_rate": 4.920602057665493e-05, + "loss": 4.9068, + "step": 13526 + }, + { + "epoch": 0.08044890094204968, + "grad_norm": 1.4482768774032593, + "learning_rate": 4.920590378872641e-05, + "loss": 4.9393, + "step": 13527 + }, + { + "epoch": 0.08045484822533067, + "grad_norm": 1.4438153505325317, + "learning_rate": 4.920578699234785e-05, + "loss": 5.0109, + "step": 13528 + }, + { + "epoch": 0.08046079550861167, + "grad_norm": 1.5769532918930054, + "learning_rate": 4.9205670187519305e-05, + "loss": 4.916, + "step": 13529 + }, + { + "epoch": 0.08046674279189267, + "grad_norm": 1.6127451658248901, + "learning_rate": 4.9205553374240806e-05, + "loss": 5.0038, + "step": 13530 + }, + { + "epoch": 0.08047269007517366, + "grad_norm": 1.5733160972595215, + "learning_rate": 4.92054365525124e-05, + "loss": 5.2705, + "step": 13531 + }, + { + "epoch": 0.08047863735845466, + "grad_norm": 1.956769585609436, + "learning_rate": 4.920531972233413e-05, + "loss": 5.0572, + "step": 13532 + }, + { + "epoch": 0.08048458464173565, + "grad_norm": 1.614670753479004, + "learning_rate": 4.9205202883706025e-05, + "loss": 5.0323, + "step": 13533 + }, + { + "epoch": 0.08049053192501665, + "grad_norm": 1.3706777095794678, + "learning_rate": 4.920508603662814e-05, + "loss": 5.1335, + "step": 13534 + }, + { + "epoch": 0.08049647920829765, + "grad_norm": 1.5787118673324585, + "learning_rate": 4.9204969181100505e-05, + "loss": 4.9626, + "step": 13535 + }, + { + "epoch": 0.08050242649157864, + "grad_norm": 1.6258914470672607, + "learning_rate": 4.9204852317123175e-05, + "loss": 5.1592, + "step": 13536 + }, + { + "epoch": 0.08050837377485964, + "grad_norm": 1.662347435951233, + "learning_rate": 4.920473544469617e-05, + "loss": 5.053, + "step": 13537 + }, + { + "epoch": 0.08051432105814064, + "grad_norm": 1.8060719966888428, + "learning_rate": 4.920461856381955e-05, + "loss": 5.0823, + "step": 13538 + }, + { + "epoch": 0.08052026834142163, + "grad_norm": 1.7381904125213623, + "learning_rate": 4.920450167449334e-05, + "loss": 4.7485, + "step": 13539 + }, + { + "epoch": 0.08052621562470264, + "grad_norm": 1.838526964187622, + "learning_rate": 4.9204384776717594e-05, + "loss": 5.1404, + "step": 13540 + }, + { + "epoch": 0.08053216290798364, + "grad_norm": 1.8131240606307983, + "learning_rate": 4.920426787049234e-05, + "loss": 5.2337, + "step": 13541 + }, + { + "epoch": 0.08053811019126463, + "grad_norm": 1.7523903846740723, + "learning_rate": 4.9204150955817635e-05, + "loss": 5.2375, + "step": 13542 + }, + { + "epoch": 0.08054405747454563, + "grad_norm": 1.5962380170822144, + "learning_rate": 4.9204034032693505e-05, + "loss": 5.1667, + "step": 13543 + }, + { + "epoch": 0.08055000475782663, + "grad_norm": 1.566009283065796, + "learning_rate": 4.920391710112e-05, + "loss": 5.1105, + "step": 13544 + }, + { + "epoch": 0.08055595204110762, + "grad_norm": 1.6253767013549805, + "learning_rate": 4.920380016109716e-05, + "loss": 5.2942, + "step": 13545 + }, + { + "epoch": 0.08056189932438862, + "grad_norm": 1.538004994392395, + "learning_rate": 4.920368321262502e-05, + "loss": 5.1847, + "step": 13546 + }, + { + "epoch": 0.08056784660766962, + "grad_norm": 1.6407667398452759, + "learning_rate": 4.9203566255703625e-05, + "loss": 5.1368, + "step": 13547 + }, + { + "epoch": 0.08057379389095061, + "grad_norm": 1.5777368545532227, + "learning_rate": 4.9203449290333016e-05, + "loss": 5.1507, + "step": 13548 + }, + { + "epoch": 0.08057974117423161, + "grad_norm": 1.5601979494094849, + "learning_rate": 4.920333231651323e-05, + "loss": 5.0926, + "step": 13549 + }, + { + "epoch": 0.08058568845751261, + "grad_norm": 1.4342397451400757, + "learning_rate": 4.9203215334244315e-05, + "loss": 4.9536, + "step": 13550 + }, + { + "epoch": 0.0805916357407936, + "grad_norm": 1.6202988624572754, + "learning_rate": 4.9203098343526305e-05, + "loss": 4.9009, + "step": 13551 + }, + { + "epoch": 0.0805975830240746, + "grad_norm": 1.4504165649414062, + "learning_rate": 4.9202981344359243e-05, + "loss": 5.3843, + "step": 13552 + }, + { + "epoch": 0.0806035303073556, + "grad_norm": 1.6187599897384644, + "learning_rate": 4.920286433674317e-05, + "loss": 5.3396, + "step": 13553 + }, + { + "epoch": 0.0806094775906366, + "grad_norm": 1.6162225008010864, + "learning_rate": 4.920274732067813e-05, + "loss": 5.3163, + "step": 13554 + }, + { + "epoch": 0.0806154248739176, + "grad_norm": 1.6445814371109009, + "learning_rate": 4.920263029616416e-05, + "loss": 5.207, + "step": 13555 + }, + { + "epoch": 0.0806213721571986, + "grad_norm": 1.5133748054504395, + "learning_rate": 4.9202513263201296e-05, + "loss": 5.4284, + "step": 13556 + }, + { + "epoch": 0.08062731944047959, + "grad_norm": 1.5004390478134155, + "learning_rate": 4.920239622178959e-05, + "loss": 5.0013, + "step": 13557 + }, + { + "epoch": 0.08063326672376059, + "grad_norm": 1.6617141962051392, + "learning_rate": 4.920227917192908e-05, + "loss": 5.346, + "step": 13558 + }, + { + "epoch": 0.08063921400704159, + "grad_norm": 1.5505567789077759, + "learning_rate": 4.92021621136198e-05, + "loss": 5.2799, + "step": 13559 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 1.5264419317245483, + "learning_rate": 4.92020450468618e-05, + "loss": 5.1277, + "step": 13560 + }, + { + "epoch": 0.08065110857360358, + "grad_norm": 1.6758075952529907, + "learning_rate": 4.920192797165511e-05, + "loss": 5.2519, + "step": 13561 + }, + { + "epoch": 0.08065705585688457, + "grad_norm": 1.5858482122421265, + "learning_rate": 4.920181088799978e-05, + "loss": 5.3231, + "step": 13562 + }, + { + "epoch": 0.08066300314016557, + "grad_norm": 1.5122928619384766, + "learning_rate": 4.920169379589585e-05, + "loss": 5.1791, + "step": 13563 + }, + { + "epoch": 0.08066895042344657, + "grad_norm": 1.4593915939331055, + "learning_rate": 4.9201576695343354e-05, + "loss": 5.0555, + "step": 13564 + }, + { + "epoch": 0.08067489770672756, + "grad_norm": 1.6524077653884888, + "learning_rate": 4.9201459586342336e-05, + "loss": 5.1981, + "step": 13565 + }, + { + "epoch": 0.08068084499000856, + "grad_norm": 1.5063152313232422, + "learning_rate": 4.920134246889285e-05, + "loss": 5.0406, + "step": 13566 + }, + { + "epoch": 0.08068679227328956, + "grad_norm": 1.3544602394104004, + "learning_rate": 4.9201225342994914e-05, + "loss": 5.0385, + "step": 13567 + }, + { + "epoch": 0.08069273955657055, + "grad_norm": 1.5672118663787842, + "learning_rate": 4.920110820864858e-05, + "loss": 5.2393, + "step": 13568 + }, + { + "epoch": 0.08069868683985155, + "grad_norm": 1.5031840801239014, + "learning_rate": 4.92009910658539e-05, + "loss": 5.1584, + "step": 13569 + }, + { + "epoch": 0.08070463412313256, + "grad_norm": 1.682307243347168, + "learning_rate": 4.920087391461089e-05, + "loss": 4.8473, + "step": 13570 + }, + { + "epoch": 0.08071058140641355, + "grad_norm": 1.5047411918640137, + "learning_rate": 4.9200756754919616e-05, + "loss": 4.8286, + "step": 13571 + }, + { + "epoch": 0.08071652868969455, + "grad_norm": 1.4234607219696045, + "learning_rate": 4.920063958678011e-05, + "loss": 4.8309, + "step": 13572 + }, + { + "epoch": 0.08072247597297555, + "grad_norm": 1.5061196088790894, + "learning_rate": 4.920052241019239e-05, + "loss": 5.0132, + "step": 13573 + }, + { + "epoch": 0.08072842325625654, + "grad_norm": 1.5565897226333618, + "learning_rate": 4.920040522515654e-05, + "loss": 4.9357, + "step": 13574 + }, + { + "epoch": 0.08073437053953754, + "grad_norm": 1.442288875579834, + "learning_rate": 4.920028803167257e-05, + "loss": 4.7943, + "step": 13575 + }, + { + "epoch": 0.08074031782281854, + "grad_norm": 1.6255996227264404, + "learning_rate": 4.9200170829740534e-05, + "loss": 4.824, + "step": 13576 + }, + { + "epoch": 0.08074626510609953, + "grad_norm": 1.7027612924575806, + "learning_rate": 4.920005361936047e-05, + "loss": 5.1223, + "step": 13577 + }, + { + "epoch": 0.08075221238938053, + "grad_norm": 2.5931310653686523, + "learning_rate": 4.919993640053241e-05, + "loss": 5.3487, + "step": 13578 + }, + { + "epoch": 0.08075815967266153, + "grad_norm": 1.5481868982315063, + "learning_rate": 4.91998191732564e-05, + "loss": 5.0844, + "step": 13579 + }, + { + "epoch": 0.08076410695594252, + "grad_norm": 1.3663432598114014, + "learning_rate": 4.919970193753248e-05, + "loss": 5.2151, + "step": 13580 + }, + { + "epoch": 0.08077005423922352, + "grad_norm": 1.4602998495101929, + "learning_rate": 4.919958469336071e-05, + "loss": 5.3133, + "step": 13581 + }, + { + "epoch": 0.08077600152250453, + "grad_norm": 1.6350071430206299, + "learning_rate": 4.919946744074111e-05, + "loss": 5.5026, + "step": 13582 + }, + { + "epoch": 0.08078194880578551, + "grad_norm": 1.4492799043655396, + "learning_rate": 4.919935017967372e-05, + "loss": 5.4211, + "step": 13583 + }, + { + "epoch": 0.08078789608906652, + "grad_norm": 1.398373007774353, + "learning_rate": 4.919923291015859e-05, + "loss": 5.2947, + "step": 13584 + }, + { + "epoch": 0.08079384337234752, + "grad_norm": 1.543583869934082, + "learning_rate": 4.9199115632195755e-05, + "loss": 5.0361, + "step": 13585 + }, + { + "epoch": 0.0807997906556285, + "grad_norm": 1.7753655910491943, + "learning_rate": 4.9198998345785265e-05, + "loss": 5.1897, + "step": 13586 + }, + { + "epoch": 0.08080573793890951, + "grad_norm": 1.668168544769287, + "learning_rate": 4.919888105092715e-05, + "loss": 5.3786, + "step": 13587 + }, + { + "epoch": 0.08081168522219051, + "grad_norm": 1.3956975936889648, + "learning_rate": 4.919876374762145e-05, + "loss": 5.4662, + "step": 13588 + }, + { + "epoch": 0.0808176325054715, + "grad_norm": 1.3362425565719604, + "learning_rate": 4.9198646435868226e-05, + "loss": 5.4723, + "step": 13589 + }, + { + "epoch": 0.0808235797887525, + "grad_norm": 1.3419675827026367, + "learning_rate": 4.919852911566749e-05, + "loss": 5.3888, + "step": 13590 + }, + { + "epoch": 0.08082952707203349, + "grad_norm": 1.5144484043121338, + "learning_rate": 4.9198411787019304e-05, + "loss": 5.292, + "step": 13591 + }, + { + "epoch": 0.08083547435531449, + "grad_norm": 1.4561097621917725, + "learning_rate": 4.91982944499237e-05, + "loss": 5.3688, + "step": 13592 + }, + { + "epoch": 0.08084142163859549, + "grad_norm": 1.4536436796188354, + "learning_rate": 4.919817710438073e-05, + "loss": 5.3606, + "step": 13593 + }, + { + "epoch": 0.08084736892187648, + "grad_norm": 1.3266935348510742, + "learning_rate": 4.919805975039041e-05, + "loss": 5.3999, + "step": 13594 + }, + { + "epoch": 0.08085331620515748, + "grad_norm": 1.4032717943191528, + "learning_rate": 4.919794238795281e-05, + "loss": 5.3494, + "step": 13595 + }, + { + "epoch": 0.08085926348843848, + "grad_norm": 1.6235400438308716, + "learning_rate": 4.919782501706796e-05, + "loss": 5.1499, + "step": 13596 + }, + { + "epoch": 0.08086521077171947, + "grad_norm": 1.349752426147461, + "learning_rate": 4.919770763773589e-05, + "loss": 5.3599, + "step": 13597 + }, + { + "epoch": 0.08087115805500047, + "grad_norm": 1.9415758848190308, + "learning_rate": 4.919759024995666e-05, + "loss": 5.3427, + "step": 13598 + }, + { + "epoch": 0.08087710533828148, + "grad_norm": 1.688825249671936, + "learning_rate": 4.9197472853730296e-05, + "loss": 5.2918, + "step": 13599 + }, + { + "epoch": 0.08088305262156247, + "grad_norm": 1.55258309841156, + "learning_rate": 4.919735544905685e-05, + "loss": 5.3016, + "step": 13600 + }, + { + "epoch": 0.08088899990484347, + "grad_norm": 1.3860005140304565, + "learning_rate": 4.919723803593634e-05, + "loss": 5.3049, + "step": 13601 + }, + { + "epoch": 0.08089494718812447, + "grad_norm": 1.289819359779358, + "learning_rate": 4.919712061436884e-05, + "loss": 5.1657, + "step": 13602 + }, + { + "epoch": 0.08090089447140546, + "grad_norm": 1.5799275636672974, + "learning_rate": 4.9197003184354375e-05, + "loss": 5.2638, + "step": 13603 + }, + { + "epoch": 0.08090684175468646, + "grad_norm": 1.5292985439300537, + "learning_rate": 4.919688574589299e-05, + "loss": 5.2643, + "step": 13604 + }, + { + "epoch": 0.08091278903796746, + "grad_norm": 1.6338304281234741, + "learning_rate": 4.919676829898471e-05, + "loss": 5.2377, + "step": 13605 + }, + { + "epoch": 0.08091873632124845, + "grad_norm": 1.7117339372634888, + "learning_rate": 4.919665084362959e-05, + "loss": 5.262, + "step": 13606 + }, + { + "epoch": 0.08092468360452945, + "grad_norm": 1.606644868850708, + "learning_rate": 4.919653337982767e-05, + "loss": 5.2308, + "step": 13607 + }, + { + "epoch": 0.08093063088781045, + "grad_norm": 1.5751184225082397, + "learning_rate": 4.9196415907578994e-05, + "loss": 5.1455, + "step": 13608 + }, + { + "epoch": 0.08093657817109144, + "grad_norm": 1.7105200290679932, + "learning_rate": 4.9196298426883595e-05, + "loss": 5.2608, + "step": 13609 + }, + { + "epoch": 0.08094252545437244, + "grad_norm": 1.4504178762435913, + "learning_rate": 4.919618093774152e-05, + "loss": 5.3592, + "step": 13610 + }, + { + "epoch": 0.08094847273765345, + "grad_norm": 1.2036757469177246, + "learning_rate": 4.9196063440152804e-05, + "loss": 5.3256, + "step": 13611 + }, + { + "epoch": 0.08095442002093443, + "grad_norm": 1.4795072078704834, + "learning_rate": 4.9195945934117507e-05, + "loss": 5.2968, + "step": 13612 + }, + { + "epoch": 0.08096036730421544, + "grad_norm": 1.2796508073806763, + "learning_rate": 4.9195828419635644e-05, + "loss": 5.1288, + "step": 13613 + }, + { + "epoch": 0.08096631458749644, + "grad_norm": 1.4119127988815308, + "learning_rate": 4.9195710896707264e-05, + "loss": 5.3238, + "step": 13614 + }, + { + "epoch": 0.08097226187077743, + "grad_norm": 1.618862509727478, + "learning_rate": 4.919559336533241e-05, + "loss": 5.301, + "step": 13615 + }, + { + "epoch": 0.08097820915405843, + "grad_norm": 1.5049046277999878, + "learning_rate": 4.919547582551114e-05, + "loss": 5.3395, + "step": 13616 + }, + { + "epoch": 0.08098415643733943, + "grad_norm": 1.3821018934249878, + "learning_rate": 4.9195358277243464e-05, + "loss": 5.4033, + "step": 13617 + }, + { + "epoch": 0.08099010372062042, + "grad_norm": 1.4585113525390625, + "learning_rate": 4.9195240720529446e-05, + "loss": 5.3098, + "step": 13618 + }, + { + "epoch": 0.08099605100390142, + "grad_norm": 1.5766072273254395, + "learning_rate": 4.9195123155369114e-05, + "loss": 5.2672, + "step": 13619 + }, + { + "epoch": 0.08100199828718241, + "grad_norm": 1.5132715702056885, + "learning_rate": 4.919500558176252e-05, + "loss": 5.1707, + "step": 13620 + }, + { + "epoch": 0.08100794557046341, + "grad_norm": 1.594093918800354, + "learning_rate": 4.91948879997097e-05, + "loss": 5.2988, + "step": 13621 + }, + { + "epoch": 0.08101389285374441, + "grad_norm": 1.529877781867981, + "learning_rate": 4.919477040921069e-05, + "loss": 5.4418, + "step": 13622 + }, + { + "epoch": 0.0810198401370254, + "grad_norm": 1.4329211711883545, + "learning_rate": 4.919465281026554e-05, + "loss": 5.308, + "step": 13623 + }, + { + "epoch": 0.0810257874203064, + "grad_norm": 1.4308300018310547, + "learning_rate": 4.919453520287428e-05, + "loss": 5.259, + "step": 13624 + }, + { + "epoch": 0.0810317347035874, + "grad_norm": 1.248282790184021, + "learning_rate": 4.919441758703697e-05, + "loss": 5.2129, + "step": 13625 + }, + { + "epoch": 0.08103768198686839, + "grad_norm": 1.4535733461380005, + "learning_rate": 4.919429996275363e-05, + "loss": 5.1989, + "step": 13626 + }, + { + "epoch": 0.0810436292701494, + "grad_norm": 1.6055153608322144, + "learning_rate": 4.9194182330024306e-05, + "loss": 5.1669, + "step": 13627 + }, + { + "epoch": 0.0810495765534304, + "grad_norm": 1.6016899347305298, + "learning_rate": 4.919406468884905e-05, + "loss": 5.1958, + "step": 13628 + }, + { + "epoch": 0.08105552383671139, + "grad_norm": 1.4217112064361572, + "learning_rate": 4.91939470392279e-05, + "loss": 4.9775, + "step": 13629 + }, + { + "epoch": 0.08106147111999239, + "grad_norm": 1.4405405521392822, + "learning_rate": 4.919382938116088e-05, + "loss": 5.1865, + "step": 13630 + }, + { + "epoch": 0.08106741840327339, + "grad_norm": 1.3826597929000854, + "learning_rate": 4.919371171464805e-05, + "loss": 5.1909, + "step": 13631 + }, + { + "epoch": 0.08107336568655438, + "grad_norm": 1.942305088043213, + "learning_rate": 4.919359403968944e-05, + "loss": 5.227, + "step": 13632 + }, + { + "epoch": 0.08107931296983538, + "grad_norm": 1.8932685852050781, + "learning_rate": 4.919347635628511e-05, + "loss": 5.3257, + "step": 13633 + }, + { + "epoch": 0.08108526025311638, + "grad_norm": 1.8511128425598145, + "learning_rate": 4.9193358664435074e-05, + "loss": 5.4229, + "step": 13634 + }, + { + "epoch": 0.08109120753639737, + "grad_norm": 1.6317822933197021, + "learning_rate": 4.919324096413939e-05, + "loss": 5.3067, + "step": 13635 + }, + { + "epoch": 0.08109715481967837, + "grad_norm": 1.835503101348877, + "learning_rate": 4.91931232553981e-05, + "loss": 5.3246, + "step": 13636 + }, + { + "epoch": 0.08110310210295937, + "grad_norm": 1.8521870374679565, + "learning_rate": 4.919300553821124e-05, + "loss": 5.3367, + "step": 13637 + }, + { + "epoch": 0.08110904938624036, + "grad_norm": 1.7814146280288696, + "learning_rate": 4.9192887812578844e-05, + "loss": 5.2949, + "step": 13638 + }, + { + "epoch": 0.08111499666952136, + "grad_norm": 1.6024845838546753, + "learning_rate": 4.919277007850097e-05, + "loss": 5.3159, + "step": 13639 + }, + { + "epoch": 0.08112094395280237, + "grad_norm": 2.955554246902466, + "learning_rate": 4.919265233597765e-05, + "loss": 4.8802, + "step": 13640 + }, + { + "epoch": 0.08112689123608335, + "grad_norm": 1.7217108011245728, + "learning_rate": 4.919253458500892e-05, + "loss": 5.08, + "step": 13641 + }, + { + "epoch": 0.08113283851936436, + "grad_norm": 1.686672329902649, + "learning_rate": 4.9192416825594825e-05, + "loss": 5.1349, + "step": 13642 + }, + { + "epoch": 0.08113878580264536, + "grad_norm": 1.5377975702285767, + "learning_rate": 4.9192299057735416e-05, + "loss": 5.1327, + "step": 13643 + }, + { + "epoch": 0.08114473308592635, + "grad_norm": 1.7383031845092773, + "learning_rate": 4.9192181281430716e-05, + "loss": 5.0938, + "step": 13644 + }, + { + "epoch": 0.08115068036920735, + "grad_norm": 1.6174112558364868, + "learning_rate": 4.919206349668077e-05, + "loss": 5.0123, + "step": 13645 + }, + { + "epoch": 0.08115662765248835, + "grad_norm": 1.5967239141464233, + "learning_rate": 4.9191945703485646e-05, + "loss": 5.0334, + "step": 13646 + }, + { + "epoch": 0.08116257493576934, + "grad_norm": 1.5330301523208618, + "learning_rate": 4.919182790184534e-05, + "loss": 5.1615, + "step": 13647 + }, + { + "epoch": 0.08116852221905034, + "grad_norm": 1.5532622337341309, + "learning_rate": 4.919171009175993e-05, + "loss": 5.1565, + "step": 13648 + }, + { + "epoch": 0.08117446950233133, + "grad_norm": 1.4814139604568481, + "learning_rate": 4.919159227322945e-05, + "loss": 5.0991, + "step": 13649 + }, + { + "epoch": 0.08118041678561233, + "grad_norm": 1.2586545944213867, + "learning_rate": 4.919147444625392e-05, + "loss": 5.2482, + "step": 13650 + }, + { + "epoch": 0.08118636406889333, + "grad_norm": 1.5292212963104248, + "learning_rate": 4.91913566108334e-05, + "loss": 5.1787, + "step": 13651 + }, + { + "epoch": 0.08119231135217432, + "grad_norm": 1.5354405641555786, + "learning_rate": 4.919123876696793e-05, + "loss": 5.0046, + "step": 13652 + }, + { + "epoch": 0.08119825863545532, + "grad_norm": 1.3921040296554565, + "learning_rate": 4.919112091465755e-05, + "loss": 5.2199, + "step": 13653 + }, + { + "epoch": 0.08120420591873632, + "grad_norm": 1.471068263053894, + "learning_rate": 4.91910030539023e-05, + "loss": 5.0445, + "step": 13654 + }, + { + "epoch": 0.08121015320201731, + "grad_norm": 1.3318332433700562, + "learning_rate": 4.919088518470222e-05, + "loss": 5.1973, + "step": 13655 + }, + { + "epoch": 0.08121610048529831, + "grad_norm": 1.5445464849472046, + "learning_rate": 4.919076730705735e-05, + "loss": 5.4165, + "step": 13656 + }, + { + "epoch": 0.08122204776857932, + "grad_norm": 1.3854666948318481, + "learning_rate": 4.9190649420967735e-05, + "loss": 5.336, + "step": 13657 + }, + { + "epoch": 0.0812279950518603, + "grad_norm": 1.4703121185302734, + "learning_rate": 4.919053152643342e-05, + "loss": 5.4837, + "step": 13658 + }, + { + "epoch": 0.08123394233514131, + "grad_norm": 1.3189783096313477, + "learning_rate": 4.9190413623454425e-05, + "loss": 5.4163, + "step": 13659 + }, + { + "epoch": 0.08123988961842231, + "grad_norm": 1.469601035118103, + "learning_rate": 4.919029571203081e-05, + "loss": 5.2772, + "step": 13660 + }, + { + "epoch": 0.0812458369017033, + "grad_norm": 1.4215590953826904, + "learning_rate": 4.919017779216262e-05, + "loss": 5.5008, + "step": 13661 + }, + { + "epoch": 0.0812517841849843, + "grad_norm": 1.577255129814148, + "learning_rate": 4.919005986384989e-05, + "loss": 5.2565, + "step": 13662 + }, + { + "epoch": 0.0812577314682653, + "grad_norm": 1.5910719633102417, + "learning_rate": 4.918994192709265e-05, + "loss": 5.1143, + "step": 13663 + }, + { + "epoch": 0.08126367875154629, + "grad_norm": 1.5665141344070435, + "learning_rate": 4.9189823981890964e-05, + "loss": 5.1911, + "step": 13664 + }, + { + "epoch": 0.08126962603482729, + "grad_norm": 1.6348809003829956, + "learning_rate": 4.918970602824485e-05, + "loss": 5.2257, + "step": 13665 + }, + { + "epoch": 0.0812755733181083, + "grad_norm": 1.4213917255401611, + "learning_rate": 4.9189588066154365e-05, + "loss": 5.0528, + "step": 13666 + }, + { + "epoch": 0.08128152060138928, + "grad_norm": 1.497758388519287, + "learning_rate": 4.918947009561955e-05, + "loss": 5.2421, + "step": 13667 + }, + { + "epoch": 0.08128746788467028, + "grad_norm": 1.4052904844284058, + "learning_rate": 4.918935211664043e-05, + "loss": 5.5054, + "step": 13668 + }, + { + "epoch": 0.08129341516795129, + "grad_norm": 1.5615813732147217, + "learning_rate": 4.9189234129217064e-05, + "loss": 5.2711, + "step": 13669 + }, + { + "epoch": 0.08129936245123227, + "grad_norm": 1.2366914749145508, + "learning_rate": 4.9189116133349485e-05, + "loss": 5.4035, + "step": 13670 + }, + { + "epoch": 0.08130530973451328, + "grad_norm": 1.5328080654144287, + "learning_rate": 4.918899812903773e-05, + "loss": 5.3269, + "step": 13671 + }, + { + "epoch": 0.08131125701779428, + "grad_norm": 1.6515448093414307, + "learning_rate": 4.918888011628185e-05, + "loss": 5.1734, + "step": 13672 + }, + { + "epoch": 0.08131720430107527, + "grad_norm": 1.385549783706665, + "learning_rate": 4.918876209508188e-05, + "loss": 5.3769, + "step": 13673 + }, + { + "epoch": 0.08132315158435627, + "grad_norm": 1.4133338928222656, + "learning_rate": 4.9188644065437875e-05, + "loss": 5.2607, + "step": 13674 + }, + { + "epoch": 0.08132909886763727, + "grad_norm": 1.6652443408966064, + "learning_rate": 4.918852602734984e-05, + "loss": 5.3939, + "step": 13675 + }, + { + "epoch": 0.08133504615091826, + "grad_norm": 1.455493450164795, + "learning_rate": 4.918840798081786e-05, + "loss": 5.3051, + "step": 13676 + }, + { + "epoch": 0.08134099343419926, + "grad_norm": 1.5490756034851074, + "learning_rate": 4.918828992584196e-05, + "loss": 5.4309, + "step": 13677 + }, + { + "epoch": 0.08134694071748025, + "grad_norm": 1.5857222080230713, + "learning_rate": 4.918817186242216e-05, + "loss": 5.1158, + "step": 13678 + }, + { + "epoch": 0.08135288800076125, + "grad_norm": 1.6051661968231201, + "learning_rate": 4.918805379055853e-05, + "loss": 5.2668, + "step": 13679 + }, + { + "epoch": 0.08135883528404225, + "grad_norm": 1.6476162672042847, + "learning_rate": 4.91879357102511e-05, + "loss": 5.2367, + "step": 13680 + }, + { + "epoch": 0.08136478256732324, + "grad_norm": 1.4255136251449585, + "learning_rate": 4.918781762149991e-05, + "loss": 5.0348, + "step": 13681 + }, + { + "epoch": 0.08137072985060424, + "grad_norm": 1.4585214853286743, + "learning_rate": 4.9187699524305e-05, + "loss": 5.2323, + "step": 13682 + }, + { + "epoch": 0.08137667713388524, + "grad_norm": 1.3733863830566406, + "learning_rate": 4.9187581418666415e-05, + "loss": 5.0898, + "step": 13683 + }, + { + "epoch": 0.08138262441716623, + "grad_norm": 1.5789494514465332, + "learning_rate": 4.91874633045842e-05, + "loss": 5.0886, + "step": 13684 + }, + { + "epoch": 0.08138857170044723, + "grad_norm": 1.4390051364898682, + "learning_rate": 4.918734518205839e-05, + "loss": 5.4305, + "step": 13685 + }, + { + "epoch": 0.08139451898372824, + "grad_norm": 1.8984171152114868, + "learning_rate": 4.9187227051089025e-05, + "loss": 5.0593, + "step": 13686 + }, + { + "epoch": 0.08140046626700922, + "grad_norm": 1.940045714378357, + "learning_rate": 4.918710891167615e-05, + "loss": 5.3115, + "step": 13687 + }, + { + "epoch": 0.08140641355029023, + "grad_norm": 1.6479912996292114, + "learning_rate": 4.918699076381981e-05, + "loss": 5.1585, + "step": 13688 + }, + { + "epoch": 0.08141236083357123, + "grad_norm": 1.554114818572998, + "learning_rate": 4.918687260752003e-05, + "loss": 5.1581, + "step": 13689 + }, + { + "epoch": 0.08141830811685222, + "grad_norm": 1.6920353174209595, + "learning_rate": 4.9186754442776874e-05, + "loss": 5.2263, + "step": 13690 + }, + { + "epoch": 0.08142425540013322, + "grad_norm": 1.572787880897522, + "learning_rate": 4.9186636269590366e-05, + "loss": 5.1019, + "step": 13691 + }, + { + "epoch": 0.08143020268341422, + "grad_norm": 1.646004319190979, + "learning_rate": 4.918651808796055e-05, + "loss": 5.1426, + "step": 13692 + }, + { + "epoch": 0.08143614996669521, + "grad_norm": 1.578749179840088, + "learning_rate": 4.9186399897887475e-05, + "loss": 4.9682, + "step": 13693 + }, + { + "epoch": 0.08144209724997621, + "grad_norm": 1.7725828886032104, + "learning_rate": 4.918628169937118e-05, + "loss": 5.0772, + "step": 13694 + }, + { + "epoch": 0.08144804453325721, + "grad_norm": 1.808596134185791, + "learning_rate": 4.91861634924117e-05, + "loss": 5.077, + "step": 13695 + }, + { + "epoch": 0.0814539918165382, + "grad_norm": 1.8685991764068604, + "learning_rate": 4.9186045277009084e-05, + "loss": 5.1322, + "step": 13696 + }, + { + "epoch": 0.0814599390998192, + "grad_norm": 1.6144567728042603, + "learning_rate": 4.9185927053163366e-05, + "loss": 5.3354, + "step": 13697 + }, + { + "epoch": 0.0814658863831002, + "grad_norm": 1.767673373222351, + "learning_rate": 4.918580882087459e-05, + "loss": 5.0358, + "step": 13698 + }, + { + "epoch": 0.0814718336663812, + "grad_norm": 1.7151973247528076, + "learning_rate": 4.9185690580142805e-05, + "loss": 5.0371, + "step": 13699 + }, + { + "epoch": 0.0814777809496622, + "grad_norm": 1.710990071296692, + "learning_rate": 4.918557233096803e-05, + "loss": 4.9236, + "step": 13700 + }, + { + "epoch": 0.0814837282329432, + "grad_norm": 1.8118677139282227, + "learning_rate": 4.9185454073350335e-05, + "loss": 4.9112, + "step": 13701 + }, + { + "epoch": 0.08148967551622419, + "grad_norm": 2.0120832920074463, + "learning_rate": 4.918533580728974e-05, + "loss": 4.8201, + "step": 13702 + }, + { + "epoch": 0.08149562279950519, + "grad_norm": 1.742125153541565, + "learning_rate": 4.91852175327863e-05, + "loss": 5.0618, + "step": 13703 + }, + { + "epoch": 0.08150157008278619, + "grad_norm": 1.6496554613113403, + "learning_rate": 4.9185099249840054e-05, + "loss": 5.217, + "step": 13704 + }, + { + "epoch": 0.08150751736606718, + "grad_norm": 1.6782381534576416, + "learning_rate": 4.9184980958451034e-05, + "loss": 5.0362, + "step": 13705 + }, + { + "epoch": 0.08151346464934818, + "grad_norm": 1.8002519607543945, + "learning_rate": 4.918486265861929e-05, + "loss": 4.8812, + "step": 13706 + }, + { + "epoch": 0.08151941193262917, + "grad_norm": 1.5939546823501587, + "learning_rate": 4.918474435034486e-05, + "loss": 5.0571, + "step": 13707 + }, + { + "epoch": 0.08152535921591017, + "grad_norm": 1.6342964172363281, + "learning_rate": 4.918462603362778e-05, + "loss": 5.087, + "step": 13708 + }, + { + "epoch": 0.08153130649919117, + "grad_norm": 1.549822449684143, + "learning_rate": 4.91845077084681e-05, + "loss": 5.1654, + "step": 13709 + }, + { + "epoch": 0.08153725378247216, + "grad_norm": 1.5732479095458984, + "learning_rate": 4.9184389374865855e-05, + "loss": 4.9085, + "step": 13710 + }, + { + "epoch": 0.08154320106575316, + "grad_norm": 1.4182745218276978, + "learning_rate": 4.9184271032821094e-05, + "loss": 4.8846, + "step": 13711 + }, + { + "epoch": 0.08154914834903416, + "grad_norm": 1.3679918050765991, + "learning_rate": 4.918415268233385e-05, + "loss": 5.0263, + "step": 13712 + }, + { + "epoch": 0.08155509563231515, + "grad_norm": 1.4714219570159912, + "learning_rate": 4.918403432340418e-05, + "loss": 5.5169, + "step": 13713 + }, + { + "epoch": 0.08156104291559615, + "grad_norm": 1.8351292610168457, + "learning_rate": 4.91839159560321e-05, + "loss": 5.215, + "step": 13714 + }, + { + "epoch": 0.08156699019887716, + "grad_norm": 1.530781865119934, + "learning_rate": 4.918379758021767e-05, + "loss": 5.0882, + "step": 13715 + }, + { + "epoch": 0.08157293748215814, + "grad_norm": 1.799901008605957, + "learning_rate": 4.918367919596093e-05, + "loss": 5.2248, + "step": 13716 + }, + { + "epoch": 0.08157888476543915, + "grad_norm": 1.7563488483428955, + "learning_rate": 4.9183560803261915e-05, + "loss": 5.3192, + "step": 13717 + }, + { + "epoch": 0.08158483204872015, + "grad_norm": 1.7521497011184692, + "learning_rate": 4.918344240212066e-05, + "loss": 5.4841, + "step": 13718 + }, + { + "epoch": 0.08159077933200114, + "grad_norm": 1.7345610857009888, + "learning_rate": 4.918332399253722e-05, + "loss": 5.0716, + "step": 13719 + }, + { + "epoch": 0.08159672661528214, + "grad_norm": 1.4790915250778198, + "learning_rate": 4.918320557451164e-05, + "loss": 5.1833, + "step": 13720 + }, + { + "epoch": 0.08160267389856314, + "grad_norm": 1.4721198081970215, + "learning_rate": 4.918308714804395e-05, + "loss": 5.1355, + "step": 13721 + }, + { + "epoch": 0.08160862118184413, + "grad_norm": 1.4949108362197876, + "learning_rate": 4.918296871313419e-05, + "loss": 4.9666, + "step": 13722 + }, + { + "epoch": 0.08161456846512513, + "grad_norm": 1.3814501762390137, + "learning_rate": 4.91828502697824e-05, + "loss": 5.0575, + "step": 13723 + }, + { + "epoch": 0.08162051574840613, + "grad_norm": 1.4503964185714722, + "learning_rate": 4.918273181798864e-05, + "loss": 5.4112, + "step": 13724 + }, + { + "epoch": 0.08162646303168712, + "grad_norm": 1.5512415170669556, + "learning_rate": 4.9182613357752925e-05, + "loss": 5.1501, + "step": 13725 + }, + { + "epoch": 0.08163241031496812, + "grad_norm": 1.7429851293563843, + "learning_rate": 4.9182494889075315e-05, + "loss": 5.2736, + "step": 13726 + }, + { + "epoch": 0.08163835759824913, + "grad_norm": 1.325498104095459, + "learning_rate": 4.918237641195584e-05, + "loss": 5.3702, + "step": 13727 + }, + { + "epoch": 0.08164430488153011, + "grad_norm": 1.2677874565124512, + "learning_rate": 4.918225792639456e-05, + "loss": 5.2681, + "step": 13728 + }, + { + "epoch": 0.08165025216481112, + "grad_norm": 1.4957364797592163, + "learning_rate": 4.918213943239149e-05, + "loss": 5.4956, + "step": 13729 + }, + { + "epoch": 0.08165619944809212, + "grad_norm": 1.3380833864212036, + "learning_rate": 4.91820209299467e-05, + "loss": 5.3286, + "step": 13730 + }, + { + "epoch": 0.0816621467313731, + "grad_norm": 1.6803557872772217, + "learning_rate": 4.918190241906021e-05, + "loss": 5.3119, + "step": 13731 + }, + { + "epoch": 0.08166809401465411, + "grad_norm": 1.7933920621871948, + "learning_rate": 4.918178389973206e-05, + "loss": 5.139, + "step": 13732 + }, + { + "epoch": 0.08167404129793511, + "grad_norm": 1.5846813917160034, + "learning_rate": 4.91816653719623e-05, + "loss": 5.4431, + "step": 13733 + }, + { + "epoch": 0.0816799885812161, + "grad_norm": 1.9218448400497437, + "learning_rate": 4.918154683575098e-05, + "loss": 5.3245, + "step": 13734 + }, + { + "epoch": 0.0816859358644971, + "grad_norm": 1.4883100986480713, + "learning_rate": 4.918142829109813e-05, + "loss": 5.3007, + "step": 13735 + }, + { + "epoch": 0.08169188314777809, + "grad_norm": 1.4396723508834839, + "learning_rate": 4.918130973800379e-05, + "loss": 5.1956, + "step": 13736 + }, + { + "epoch": 0.08169783043105909, + "grad_norm": 1.4395633935928345, + "learning_rate": 4.918119117646801e-05, + "loss": 5.1637, + "step": 13737 + }, + { + "epoch": 0.08170377771434009, + "grad_norm": 1.540003776550293, + "learning_rate": 4.9181072606490816e-05, + "loss": 5.2278, + "step": 13738 + }, + { + "epoch": 0.08170972499762108, + "grad_norm": 1.446815848350525, + "learning_rate": 4.918095402807227e-05, + "loss": 5.1627, + "step": 13739 + }, + { + "epoch": 0.08171567228090208, + "grad_norm": 1.4501028060913086, + "learning_rate": 4.918083544121239e-05, + "loss": 5.0747, + "step": 13740 + }, + { + "epoch": 0.08172161956418308, + "grad_norm": 1.217608094215393, + "learning_rate": 4.9180716845911244e-05, + "loss": 5.0668, + "step": 13741 + }, + { + "epoch": 0.08172756684746407, + "grad_norm": 1.6321865320205688, + "learning_rate": 4.918059824216885e-05, + "loss": 5.2785, + "step": 13742 + }, + { + "epoch": 0.08173351413074507, + "grad_norm": 1.5838396549224854, + "learning_rate": 4.9180479629985265e-05, + "loss": 5.1675, + "step": 13743 + }, + { + "epoch": 0.08173946141402608, + "grad_norm": 1.7023003101348877, + "learning_rate": 4.918036100936052e-05, + "loss": 5.1664, + "step": 13744 + }, + { + "epoch": 0.08174540869730706, + "grad_norm": 1.767067790031433, + "learning_rate": 4.918024238029466e-05, + "loss": 5.0157, + "step": 13745 + }, + { + "epoch": 0.08175135598058807, + "grad_norm": 1.6058627367019653, + "learning_rate": 4.918012374278773e-05, + "loss": 5.1772, + "step": 13746 + }, + { + "epoch": 0.08175730326386907, + "grad_norm": 1.7853416204452515, + "learning_rate": 4.9180005096839766e-05, + "loss": 5.2678, + "step": 13747 + }, + { + "epoch": 0.08176325054715006, + "grad_norm": 1.4799201488494873, + "learning_rate": 4.917988644245082e-05, + "loss": 5.3153, + "step": 13748 + }, + { + "epoch": 0.08176919783043106, + "grad_norm": 1.4581291675567627, + "learning_rate": 4.917976777962092e-05, + "loss": 5.2755, + "step": 13749 + }, + { + "epoch": 0.08177514511371206, + "grad_norm": 1.7151737213134766, + "learning_rate": 4.917964910835011e-05, + "loss": 5.1761, + "step": 13750 + }, + { + "epoch": 0.08178109239699305, + "grad_norm": 1.5101522207260132, + "learning_rate": 4.917953042863843e-05, + "loss": 5.0003, + "step": 13751 + }, + { + "epoch": 0.08178703968027405, + "grad_norm": 1.4508110284805298, + "learning_rate": 4.9179411740485935e-05, + "loss": 5.1158, + "step": 13752 + }, + { + "epoch": 0.08179298696355505, + "grad_norm": 1.5012980699539185, + "learning_rate": 4.917929304389266e-05, + "loss": 5.2762, + "step": 13753 + }, + { + "epoch": 0.08179893424683604, + "grad_norm": 1.5914186239242554, + "learning_rate": 4.9179174338858635e-05, + "loss": 5.1422, + "step": 13754 + }, + { + "epoch": 0.08180488153011704, + "grad_norm": 1.5001139640808105, + "learning_rate": 4.9179055625383915e-05, + "loss": 5.2158, + "step": 13755 + }, + { + "epoch": 0.08181082881339805, + "grad_norm": 1.382815957069397, + "learning_rate": 4.917893690346853e-05, + "loss": 5.2562, + "step": 13756 + }, + { + "epoch": 0.08181677609667903, + "grad_norm": 1.3576865196228027, + "learning_rate": 4.9178818173112535e-05, + "loss": 5.221, + "step": 13757 + }, + { + "epoch": 0.08182272337996004, + "grad_norm": 1.5542206764221191, + "learning_rate": 4.917869943431596e-05, + "loss": 5.071, + "step": 13758 + }, + { + "epoch": 0.08182867066324104, + "grad_norm": 1.6010403633117676, + "learning_rate": 4.9178580687078855e-05, + "loss": 5.2052, + "step": 13759 + }, + { + "epoch": 0.08183461794652203, + "grad_norm": 1.3808842897415161, + "learning_rate": 4.9178461931401254e-05, + "loss": 5.3007, + "step": 13760 + }, + { + "epoch": 0.08184056522980303, + "grad_norm": 1.3584518432617188, + "learning_rate": 4.91783431672832e-05, + "loss": 5.3137, + "step": 13761 + }, + { + "epoch": 0.08184651251308403, + "grad_norm": 1.4467449188232422, + "learning_rate": 4.917822439472474e-05, + "loss": 5.2208, + "step": 13762 + }, + { + "epoch": 0.08185245979636502, + "grad_norm": 1.298618197441101, + "learning_rate": 4.917810561372591e-05, + "loss": 5.2161, + "step": 13763 + }, + { + "epoch": 0.08185840707964602, + "grad_norm": 2.5304789543151855, + "learning_rate": 4.9177986824286756e-05, + "loss": 4.6644, + "step": 13764 + }, + { + "epoch": 0.08186435436292701, + "grad_norm": 1.607969880104065, + "learning_rate": 4.917786802640732e-05, + "loss": 5.2116, + "step": 13765 + }, + { + "epoch": 0.08187030164620801, + "grad_norm": 1.401207685470581, + "learning_rate": 4.917774922008763e-05, + "loss": 5.2847, + "step": 13766 + }, + { + "epoch": 0.08187624892948901, + "grad_norm": 1.1652514934539795, + "learning_rate": 4.9177630405327746e-05, + "loss": 5.2939, + "step": 13767 + }, + { + "epoch": 0.08188219621277, + "grad_norm": 1.2998749017715454, + "learning_rate": 4.9177511582127694e-05, + "loss": 5.251, + "step": 13768 + }, + { + "epoch": 0.081888143496051, + "grad_norm": 1.33558988571167, + "learning_rate": 4.917739275048753e-05, + "loss": 5.2749, + "step": 13769 + }, + { + "epoch": 0.081894090779332, + "grad_norm": 1.1457966566085815, + "learning_rate": 4.917727391040728e-05, + "loss": 5.3153, + "step": 13770 + }, + { + "epoch": 0.08190003806261299, + "grad_norm": 1.493249773979187, + "learning_rate": 4.917715506188699e-05, + "loss": 5.3702, + "step": 13771 + }, + { + "epoch": 0.081905985345894, + "grad_norm": 1.2591760158538818, + "learning_rate": 4.917703620492672e-05, + "loss": 5.2019, + "step": 13772 + }, + { + "epoch": 0.081911932629175, + "grad_norm": 1.2480885982513428, + "learning_rate": 4.917691733952648e-05, + "loss": 5.1904, + "step": 13773 + }, + { + "epoch": 0.08191787991245598, + "grad_norm": 1.3278160095214844, + "learning_rate": 4.917679846568634e-05, + "loss": 5.0424, + "step": 13774 + }, + { + "epoch": 0.08192382719573699, + "grad_norm": 1.2930511236190796, + "learning_rate": 4.9176679583406325e-05, + "loss": 5.2437, + "step": 13775 + }, + { + "epoch": 0.08192977447901799, + "grad_norm": 1.39852774143219, + "learning_rate": 4.9176560692686485e-05, + "loss": 5.3683, + "step": 13776 + }, + { + "epoch": 0.08193572176229898, + "grad_norm": 1.3392889499664307, + "learning_rate": 4.917644179352685e-05, + "loss": 5.1894, + "step": 13777 + }, + { + "epoch": 0.08194166904557998, + "grad_norm": 1.318595051765442, + "learning_rate": 4.917632288592747e-05, + "loss": 5.382, + "step": 13778 + }, + { + "epoch": 0.08194761632886098, + "grad_norm": 1.0992580652236938, + "learning_rate": 4.9176203969888395e-05, + "loss": 5.1979, + "step": 13779 + }, + { + "epoch": 0.08195356361214197, + "grad_norm": 1.2092480659484863, + "learning_rate": 4.917608504540965e-05, + "loss": 5.2253, + "step": 13780 + }, + { + "epoch": 0.08195951089542297, + "grad_norm": 1.2495516538619995, + "learning_rate": 4.9175966112491286e-05, + "loss": 5.1951, + "step": 13781 + }, + { + "epoch": 0.08196545817870397, + "grad_norm": 1.642177700996399, + "learning_rate": 4.917584717113334e-05, + "loss": 4.9648, + "step": 13782 + }, + { + "epoch": 0.08197140546198496, + "grad_norm": 1.4849772453308105, + "learning_rate": 4.9175728221335856e-05, + "loss": 4.8231, + "step": 13783 + }, + { + "epoch": 0.08197735274526596, + "grad_norm": 1.1743687391281128, + "learning_rate": 4.917560926309888e-05, + "loss": 4.7685, + "step": 13784 + }, + { + "epoch": 0.08198330002854697, + "grad_norm": 1.2688218355178833, + "learning_rate": 4.9175490296422436e-05, + "loss": 5.3023, + "step": 13785 + }, + { + "epoch": 0.08198924731182795, + "grad_norm": 1.2325210571289062, + "learning_rate": 4.9175371321306584e-05, + "loss": 4.8373, + "step": 13786 + }, + { + "epoch": 0.08199519459510896, + "grad_norm": 1.5414066314697266, + "learning_rate": 4.9175252337751364e-05, + "loss": 5.005, + "step": 13787 + }, + { + "epoch": 0.08200114187838996, + "grad_norm": 2.1581833362579346, + "learning_rate": 4.917513334575681e-05, + "loss": 5.5065, + "step": 13788 + }, + { + "epoch": 0.08200708916167095, + "grad_norm": 2.0199508666992188, + "learning_rate": 4.917501434532297e-05, + "loss": 5.8826, + "step": 13789 + }, + { + "epoch": 0.08201303644495195, + "grad_norm": 1.727602481842041, + "learning_rate": 4.917489533644987e-05, + "loss": 5.6967, + "step": 13790 + }, + { + "epoch": 0.08201898372823295, + "grad_norm": 1.5649336576461792, + "learning_rate": 4.917477631913757e-05, + "loss": 5.783, + "step": 13791 + }, + { + "epoch": 0.08202493101151394, + "grad_norm": 1.7326582670211792, + "learning_rate": 4.9174657293386115e-05, + "loss": 5.6705, + "step": 13792 + }, + { + "epoch": 0.08203087829479494, + "grad_norm": 1.8611500263214111, + "learning_rate": 4.917453825919553e-05, + "loss": 5.4881, + "step": 13793 + }, + { + "epoch": 0.08203682557807593, + "grad_norm": 1.9762206077575684, + "learning_rate": 4.917441921656586e-05, + "loss": 5.4826, + "step": 13794 + }, + { + "epoch": 0.08204277286135693, + "grad_norm": 1.6816489696502686, + "learning_rate": 4.9174300165497154e-05, + "loss": 5.466, + "step": 13795 + }, + { + "epoch": 0.08204872014463793, + "grad_norm": 1.8922536373138428, + "learning_rate": 4.9174181105989445e-05, + "loss": 5.3603, + "step": 13796 + }, + { + "epoch": 0.08205466742791892, + "grad_norm": 2.094996213912964, + "learning_rate": 4.917406203804279e-05, + "loss": 5.8687, + "step": 13797 + }, + { + "epoch": 0.08206061471119992, + "grad_norm": 1.8656450510025024, + "learning_rate": 4.9173942961657215e-05, + "loss": 6.2551, + "step": 13798 + }, + { + "epoch": 0.08206656199448092, + "grad_norm": 1.871787428855896, + "learning_rate": 4.917382387683276e-05, + "loss": 5.6612, + "step": 13799 + }, + { + "epoch": 0.08207250927776191, + "grad_norm": 1.8721636533737183, + "learning_rate": 4.9173704783569475e-05, + "loss": 5.8918, + "step": 13800 + }, + { + "epoch": 0.08207845656104291, + "grad_norm": 2.0554919242858887, + "learning_rate": 4.917358568186741e-05, + "loss": 5.6398, + "step": 13801 + }, + { + "epoch": 0.08208440384432392, + "grad_norm": 1.9311691522598267, + "learning_rate": 4.917346657172658e-05, + "loss": 5.6507, + "step": 13802 + }, + { + "epoch": 0.0820903511276049, + "grad_norm": 1.7426981925964355, + "learning_rate": 4.917334745314705e-05, + "loss": 5.3193, + "step": 13803 + }, + { + "epoch": 0.0820962984108859, + "grad_norm": 1.783890724182129, + "learning_rate": 4.9173228326128856e-05, + "loss": 5.1274, + "step": 13804 + }, + { + "epoch": 0.08210224569416691, + "grad_norm": 1.8739385604858398, + "learning_rate": 4.917310919067203e-05, + "loss": 5.378, + "step": 13805 + }, + { + "epoch": 0.0821081929774479, + "grad_norm": 1.6748543977737427, + "learning_rate": 4.917299004677663e-05, + "loss": 5.4772, + "step": 13806 + }, + { + "epoch": 0.0821141402607289, + "grad_norm": 1.498864769935608, + "learning_rate": 4.917287089444269e-05, + "loss": 5.4485, + "step": 13807 + }, + { + "epoch": 0.0821200875440099, + "grad_norm": 1.6129908561706543, + "learning_rate": 4.917275173367024e-05, + "loss": 5.5245, + "step": 13808 + }, + { + "epoch": 0.08212603482729089, + "grad_norm": 1.4655383825302124, + "learning_rate": 4.917263256445934e-05, + "loss": 5.5513, + "step": 13809 + }, + { + "epoch": 0.08213198211057189, + "grad_norm": 1.765244483947754, + "learning_rate": 4.917251338681003e-05, + "loss": 5.5322, + "step": 13810 + }, + { + "epoch": 0.0821379293938529, + "grad_norm": 2.002889633178711, + "learning_rate": 4.917239420072233e-05, + "loss": 5.1273, + "step": 13811 + }, + { + "epoch": 0.08214387667713388, + "grad_norm": 2.4380993843078613, + "learning_rate": 4.917227500619631e-05, + "loss": 4.8983, + "step": 13812 + }, + { + "epoch": 0.08214982396041488, + "grad_norm": 2.0864169597625732, + "learning_rate": 4.917215580323199e-05, + "loss": 5.077, + "step": 13813 + }, + { + "epoch": 0.08215577124369589, + "grad_norm": 2.2942094802856445, + "learning_rate": 4.917203659182942e-05, + "loss": 5.4359, + "step": 13814 + }, + { + "epoch": 0.08216171852697687, + "grad_norm": 2.067659616470337, + "learning_rate": 4.917191737198865e-05, + "loss": 5.7409, + "step": 13815 + }, + { + "epoch": 0.08216766581025788, + "grad_norm": 2.010085344314575, + "learning_rate": 4.917179814370971e-05, + "loss": 5.2279, + "step": 13816 + }, + { + "epoch": 0.08217361309353888, + "grad_norm": 1.8540743589401245, + "learning_rate": 4.917167890699264e-05, + "loss": 5.6146, + "step": 13817 + }, + { + "epoch": 0.08217956037681987, + "grad_norm": 1.9126391410827637, + "learning_rate": 4.917155966183749e-05, + "loss": 5.7007, + "step": 13818 + }, + { + "epoch": 0.08218550766010087, + "grad_norm": 1.6382626295089722, + "learning_rate": 4.91714404082443e-05, + "loss": 5.3641, + "step": 13819 + }, + { + "epoch": 0.08219145494338187, + "grad_norm": 1.8019288778305054, + "learning_rate": 4.9171321146213105e-05, + "loss": 5.1853, + "step": 13820 + }, + { + "epoch": 0.08219740222666286, + "grad_norm": 1.681685447692871, + "learning_rate": 4.917120187574395e-05, + "loss": 5.4141, + "step": 13821 + }, + { + "epoch": 0.08220334950994386, + "grad_norm": 1.9356689453125, + "learning_rate": 4.9171082596836896e-05, + "loss": 5.5379, + "step": 13822 + }, + { + "epoch": 0.08220929679322485, + "grad_norm": 1.9538071155548096, + "learning_rate": 4.917096330949195e-05, + "loss": 5.5723, + "step": 13823 + }, + { + "epoch": 0.08221524407650585, + "grad_norm": 1.7350852489471436, + "learning_rate": 4.9170844013709175e-05, + "loss": 5.5622, + "step": 13824 + }, + { + "epoch": 0.08222119135978685, + "grad_norm": 1.790276050567627, + "learning_rate": 4.9170724709488606e-05, + "loss": 5.5194, + "step": 13825 + }, + { + "epoch": 0.08222713864306784, + "grad_norm": 2.2997219562530518, + "learning_rate": 4.917060539683028e-05, + "loss": 5.0646, + "step": 13826 + }, + { + "epoch": 0.08223308592634884, + "grad_norm": 1.729131817817688, + "learning_rate": 4.9170486075734254e-05, + "loss": 5.5588, + "step": 13827 + }, + { + "epoch": 0.08223903320962984, + "grad_norm": 1.8754487037658691, + "learning_rate": 4.9170366746200566e-05, + "loss": 5.5435, + "step": 13828 + }, + { + "epoch": 0.08224498049291083, + "grad_norm": 1.8330692052841187, + "learning_rate": 4.9170247408229244e-05, + "loss": 5.598, + "step": 13829 + }, + { + "epoch": 0.08225092777619183, + "grad_norm": 1.8318592309951782, + "learning_rate": 4.917012806182034e-05, + "loss": 5.5165, + "step": 13830 + }, + { + "epoch": 0.08225687505947284, + "grad_norm": 1.6818424463272095, + "learning_rate": 4.9170008706973895e-05, + "loss": 5.3377, + "step": 13831 + }, + { + "epoch": 0.08226282234275382, + "grad_norm": 1.7040458917617798, + "learning_rate": 4.916988934368995e-05, + "loss": 5.4644, + "step": 13832 + }, + { + "epoch": 0.08226876962603483, + "grad_norm": 1.8902777433395386, + "learning_rate": 4.916976997196855e-05, + "loss": 5.4526, + "step": 13833 + }, + { + "epoch": 0.08227471690931583, + "grad_norm": 1.7484904527664185, + "learning_rate": 4.9169650591809724e-05, + "loss": 5.3, + "step": 13834 + }, + { + "epoch": 0.08228066419259682, + "grad_norm": 1.726083517074585, + "learning_rate": 4.916953120321353e-05, + "loss": 5.4451, + "step": 13835 + }, + { + "epoch": 0.08228661147587782, + "grad_norm": 1.791942834854126, + "learning_rate": 4.916941180618e-05, + "loss": 5.444, + "step": 13836 + }, + { + "epoch": 0.08229255875915882, + "grad_norm": 1.9032018184661865, + "learning_rate": 4.916929240070918e-05, + "loss": 5.4411, + "step": 13837 + }, + { + "epoch": 0.08229850604243981, + "grad_norm": 1.6170588731765747, + "learning_rate": 4.91691729868011e-05, + "loss": 5.4293, + "step": 13838 + }, + { + "epoch": 0.08230445332572081, + "grad_norm": 1.3972853422164917, + "learning_rate": 4.9169053564455825e-05, + "loss": 5.2889, + "step": 13839 + }, + { + "epoch": 0.08231040060900181, + "grad_norm": 1.782913088798523, + "learning_rate": 4.916893413367338e-05, + "loss": 5.4092, + "step": 13840 + }, + { + "epoch": 0.0823163478922828, + "grad_norm": 1.83617103099823, + "learning_rate": 4.9168814694453807e-05, + "loss": 5.3997, + "step": 13841 + }, + { + "epoch": 0.0823222951755638, + "grad_norm": 1.92609703540802, + "learning_rate": 4.9168695246797146e-05, + "loss": 5.3469, + "step": 13842 + }, + { + "epoch": 0.0823282424588448, + "grad_norm": 2.20027756690979, + "learning_rate": 4.9168575790703454e-05, + "loss": 5.5999, + "step": 13843 + }, + { + "epoch": 0.0823341897421258, + "grad_norm": 3.096323251724243, + "learning_rate": 4.916845632617275e-05, + "loss": 5.3997, + "step": 13844 + }, + { + "epoch": 0.0823401370254068, + "grad_norm": 2.433900833129883, + "learning_rate": 4.91683368532051e-05, + "loss": 5.4937, + "step": 13845 + }, + { + "epoch": 0.0823460843086878, + "grad_norm": 2.371389389038086, + "learning_rate": 4.9168217371800526e-05, + "loss": 5.966, + "step": 13846 + }, + { + "epoch": 0.08235203159196879, + "grad_norm": 1.5628182888031006, + "learning_rate": 4.9168097881959076e-05, + "loss": 5.5971, + "step": 13847 + }, + { + "epoch": 0.08235797887524979, + "grad_norm": 2.733569622039795, + "learning_rate": 4.91679783836808e-05, + "loss": 5.2696, + "step": 13848 + }, + { + "epoch": 0.08236392615853079, + "grad_norm": 2.117197275161743, + "learning_rate": 4.916785887696572e-05, + "loss": 5.3729, + "step": 13849 + }, + { + "epoch": 0.08236987344181178, + "grad_norm": 2.040476083755493, + "learning_rate": 4.9167739361813905e-05, + "loss": 5.6568, + "step": 13850 + }, + { + "epoch": 0.08237582072509278, + "grad_norm": 2.127465009689331, + "learning_rate": 4.916761983822536e-05, + "loss": 5.9168, + "step": 13851 + }, + { + "epoch": 0.08238176800837377, + "grad_norm": 2.00907301902771, + "learning_rate": 4.916750030620017e-05, + "loss": 5.9104, + "step": 13852 + }, + { + "epoch": 0.08238771529165477, + "grad_norm": 1.721428394317627, + "learning_rate": 4.916738076573835e-05, + "loss": 5.8126, + "step": 13853 + }, + { + "epoch": 0.08239366257493577, + "grad_norm": 1.5760809183120728, + "learning_rate": 4.9167261216839946e-05, + "loss": 6.0134, + "step": 13854 + }, + { + "epoch": 0.08239960985821676, + "grad_norm": 1.648639440536499, + "learning_rate": 4.9167141659505e-05, + "loss": 5.3878, + "step": 13855 + }, + { + "epoch": 0.08240555714149776, + "grad_norm": 1.4113967418670654, + "learning_rate": 4.916702209373355e-05, + "loss": 5.8159, + "step": 13856 + }, + { + "epoch": 0.08241150442477876, + "grad_norm": 1.725477933883667, + "learning_rate": 4.916690251952565e-05, + "loss": 5.7185, + "step": 13857 + }, + { + "epoch": 0.08241745170805975, + "grad_norm": 1.8538665771484375, + "learning_rate": 4.9166782936881326e-05, + "loss": 5.1804, + "step": 13858 + }, + { + "epoch": 0.08242339899134075, + "grad_norm": 1.5203232765197754, + "learning_rate": 4.9166663345800635e-05, + "loss": 5.1486, + "step": 13859 + }, + { + "epoch": 0.08242934627462176, + "grad_norm": 1.8738161325454712, + "learning_rate": 4.916654374628361e-05, + "loss": 5.0062, + "step": 13860 + }, + { + "epoch": 0.08243529355790274, + "grad_norm": 1.689563512802124, + "learning_rate": 4.916642413833029e-05, + "loss": 4.9508, + "step": 13861 + }, + { + "epoch": 0.08244124084118375, + "grad_norm": 1.8749178647994995, + "learning_rate": 4.916630452194073e-05, + "loss": 5.4645, + "step": 13862 + }, + { + "epoch": 0.08244718812446475, + "grad_norm": 2.779536247253418, + "learning_rate": 4.9166184897114956e-05, + "loss": 5.9364, + "step": 13863 + }, + { + "epoch": 0.08245313540774574, + "grad_norm": 2.41239333152771, + "learning_rate": 4.9166065263853014e-05, + "loss": 5.9045, + "step": 13864 + }, + { + "epoch": 0.08245908269102674, + "grad_norm": 1.624475359916687, + "learning_rate": 4.916594562215495e-05, + "loss": 5.4222, + "step": 13865 + }, + { + "epoch": 0.08246502997430774, + "grad_norm": 1.6841174364089966, + "learning_rate": 4.916582597202081e-05, + "loss": 5.3455, + "step": 13866 + }, + { + "epoch": 0.08247097725758873, + "grad_norm": 1.6790028810501099, + "learning_rate": 4.916570631345062e-05, + "loss": 5.5397, + "step": 13867 + }, + { + "epoch": 0.08247692454086973, + "grad_norm": 1.87303626537323, + "learning_rate": 4.9165586646444436e-05, + "loss": 5.6022, + "step": 13868 + }, + { + "epoch": 0.08248287182415073, + "grad_norm": 1.7747167348861694, + "learning_rate": 4.91654669710023e-05, + "loss": 5.4631, + "step": 13869 + }, + { + "epoch": 0.08248881910743172, + "grad_norm": 1.694941759109497, + "learning_rate": 4.9165347287124244e-05, + "loss": 5.5634, + "step": 13870 + }, + { + "epoch": 0.08249476639071272, + "grad_norm": 1.8258243799209595, + "learning_rate": 4.9165227594810316e-05, + "loss": 5.526, + "step": 13871 + }, + { + "epoch": 0.08250071367399373, + "grad_norm": 1.708798885345459, + "learning_rate": 4.9165107894060556e-05, + "loss": 5.5127, + "step": 13872 + }, + { + "epoch": 0.08250666095727471, + "grad_norm": 1.7820818424224854, + "learning_rate": 4.916498818487501e-05, + "loss": 5.4169, + "step": 13873 + }, + { + "epoch": 0.08251260824055572, + "grad_norm": 2.38067626953125, + "learning_rate": 4.916486846725372e-05, + "loss": 5.8063, + "step": 13874 + }, + { + "epoch": 0.08251855552383672, + "grad_norm": 1.8507468700408936, + "learning_rate": 4.916474874119671e-05, + "loss": 5.4871, + "step": 13875 + }, + { + "epoch": 0.0825245028071177, + "grad_norm": 1.8866678476333618, + "learning_rate": 4.916462900670404e-05, + "loss": 5.5452, + "step": 13876 + }, + { + "epoch": 0.08253045009039871, + "grad_norm": 1.853668212890625, + "learning_rate": 4.916450926377576e-05, + "loss": 5.8262, + "step": 13877 + }, + { + "epoch": 0.08253639737367971, + "grad_norm": 1.7404545545578003, + "learning_rate": 4.916438951241189e-05, + "loss": 5.5978, + "step": 13878 + }, + { + "epoch": 0.0825423446569607, + "grad_norm": 1.844139814376831, + "learning_rate": 4.916426975261248e-05, + "loss": 5.765, + "step": 13879 + }, + { + "epoch": 0.0825482919402417, + "grad_norm": 1.9454487562179565, + "learning_rate": 4.916414998437758e-05, + "loss": 5.5458, + "step": 13880 + }, + { + "epoch": 0.08255423922352269, + "grad_norm": 1.317144751548767, + "learning_rate": 4.916403020770722e-05, + "loss": 5.7694, + "step": 13881 + }, + { + "epoch": 0.08256018650680369, + "grad_norm": 1.718024730682373, + "learning_rate": 4.916391042260145e-05, + "loss": 5.7369, + "step": 13882 + }, + { + "epoch": 0.08256613379008469, + "grad_norm": 1.4623572826385498, + "learning_rate": 4.9163790629060305e-05, + "loss": 5.72, + "step": 13883 + }, + { + "epoch": 0.08257208107336568, + "grad_norm": 1.908839225769043, + "learning_rate": 4.916367082708383e-05, + "loss": 5.7175, + "step": 13884 + }, + { + "epoch": 0.08257802835664668, + "grad_norm": 1.7910356521606445, + "learning_rate": 4.916355101667206e-05, + "loss": 5.4446, + "step": 13885 + }, + { + "epoch": 0.08258397563992768, + "grad_norm": 2.132512092590332, + "learning_rate": 4.9163431197825055e-05, + "loss": 5.2315, + "step": 13886 + }, + { + "epoch": 0.08258992292320867, + "grad_norm": 2.223329782485962, + "learning_rate": 4.9163311370542844e-05, + "loss": 5.2953, + "step": 13887 + }, + { + "epoch": 0.08259587020648967, + "grad_norm": 2.6441519260406494, + "learning_rate": 4.916319153482547e-05, + "loss": 5.2637, + "step": 13888 + }, + { + "epoch": 0.08260181748977068, + "grad_norm": 2.1528780460357666, + "learning_rate": 4.9163071690672973e-05, + "loss": 5.1602, + "step": 13889 + }, + { + "epoch": 0.08260776477305166, + "grad_norm": 2.6483633518218994, + "learning_rate": 4.91629518380854e-05, + "loss": 5.2487, + "step": 13890 + }, + { + "epoch": 0.08261371205633267, + "grad_norm": 2.276808738708496, + "learning_rate": 4.916283197706279e-05, + "loss": 5.064, + "step": 13891 + }, + { + "epoch": 0.08261965933961367, + "grad_norm": 1.8921101093292236, + "learning_rate": 4.9162712107605184e-05, + "loss": 5.3979, + "step": 13892 + }, + { + "epoch": 0.08262560662289466, + "grad_norm": 2.2009568214416504, + "learning_rate": 4.9162592229712625e-05, + "loss": 5.2434, + "step": 13893 + }, + { + "epoch": 0.08263155390617566, + "grad_norm": 2.199380874633789, + "learning_rate": 4.916247234338516e-05, + "loss": 4.7187, + "step": 13894 + }, + { + "epoch": 0.08263750118945666, + "grad_norm": 2.3620400428771973, + "learning_rate": 4.916235244862282e-05, + "loss": 4.7371, + "step": 13895 + }, + { + "epoch": 0.08264344847273765, + "grad_norm": 2.100086212158203, + "learning_rate": 4.9162232545425646e-05, + "loss": 4.5239, + "step": 13896 + }, + { + "epoch": 0.08264939575601865, + "grad_norm": 2.100106954574585, + "learning_rate": 4.91621126337937e-05, + "loss": 4.5555, + "step": 13897 + }, + { + "epoch": 0.08265534303929965, + "grad_norm": 2.005345344543457, + "learning_rate": 4.9161992713727e-05, + "loss": 4.397, + "step": 13898 + }, + { + "epoch": 0.08266129032258064, + "grad_norm": 1.9393454790115356, + "learning_rate": 4.91618727852256e-05, + "loss": 4.7327, + "step": 13899 + }, + { + "epoch": 0.08266723760586164, + "grad_norm": 2.0109846591949463, + "learning_rate": 4.916175284828955e-05, + "loss": 4.4987, + "step": 13900 + }, + { + "epoch": 0.08267318488914265, + "grad_norm": 2.0040533542633057, + "learning_rate": 4.916163290291886e-05, + "loss": 4.4703, + "step": 13901 + }, + { + "epoch": 0.08267913217242363, + "grad_norm": 2.014885902404785, + "learning_rate": 4.916151294911361e-05, + "loss": 4.374, + "step": 13902 + }, + { + "epoch": 0.08268507945570464, + "grad_norm": 1.9490050077438354, + "learning_rate": 4.916139298687382e-05, + "loss": 4.6281, + "step": 13903 + }, + { + "epoch": 0.08269102673898564, + "grad_norm": 2.0691943168640137, + "learning_rate": 4.916127301619954e-05, + "loss": 4.5008, + "step": 13904 + }, + { + "epoch": 0.08269697402226663, + "grad_norm": 2.1290805339813232, + "learning_rate": 4.916115303709081e-05, + "loss": 5.4876, + "step": 13905 + }, + { + "epoch": 0.08270292130554763, + "grad_norm": 1.981466293334961, + "learning_rate": 4.916103304954767e-05, + "loss": 5.7699, + "step": 13906 + }, + { + "epoch": 0.08270886858882863, + "grad_norm": 1.8898048400878906, + "learning_rate": 4.916091305357016e-05, + "loss": 5.7874, + "step": 13907 + }, + { + "epoch": 0.08271481587210962, + "grad_norm": 1.7809741497039795, + "learning_rate": 4.916079304915833e-05, + "loss": 5.6264, + "step": 13908 + }, + { + "epoch": 0.08272076315539062, + "grad_norm": 1.7516652345657349, + "learning_rate": 4.916067303631221e-05, + "loss": 5.5751, + "step": 13909 + }, + { + "epoch": 0.08272671043867161, + "grad_norm": 1.9051094055175781, + "learning_rate": 4.916055301503185e-05, + "loss": 5.7984, + "step": 13910 + }, + { + "epoch": 0.08273265772195261, + "grad_norm": 1.7115057706832886, + "learning_rate": 4.9160432985317295e-05, + "loss": 5.6187, + "step": 13911 + }, + { + "epoch": 0.08273860500523361, + "grad_norm": 1.790529727935791, + "learning_rate": 4.916031294716858e-05, + "loss": 5.6276, + "step": 13912 + }, + { + "epoch": 0.0827445522885146, + "grad_norm": 1.742039442062378, + "learning_rate": 4.9160192900585754e-05, + "loss": 5.3783, + "step": 13913 + }, + { + "epoch": 0.0827504995717956, + "grad_norm": 1.7544314861297607, + "learning_rate": 4.916007284556885e-05, + "loss": 5.5276, + "step": 13914 + }, + { + "epoch": 0.0827564468550766, + "grad_norm": 2.0135440826416016, + "learning_rate": 4.915995278211791e-05, + "loss": 5.5177, + "step": 13915 + }, + { + "epoch": 0.08276239413835759, + "grad_norm": 1.5759433507919312, + "learning_rate": 4.915983271023299e-05, + "loss": 5.4652, + "step": 13916 + }, + { + "epoch": 0.0827683414216386, + "grad_norm": 1.7974358797073364, + "learning_rate": 4.915971262991411e-05, + "loss": 5.4463, + "step": 13917 + }, + { + "epoch": 0.0827742887049196, + "grad_norm": 1.847692608833313, + "learning_rate": 4.9159592541161335e-05, + "loss": 5.4247, + "step": 13918 + }, + { + "epoch": 0.08278023598820058, + "grad_norm": 1.6701977252960205, + "learning_rate": 4.915947244397469e-05, + "loss": 5.3451, + "step": 13919 + }, + { + "epoch": 0.08278618327148159, + "grad_norm": 1.9226999282836914, + "learning_rate": 4.915935233835423e-05, + "loss": 5.1159, + "step": 13920 + }, + { + "epoch": 0.08279213055476259, + "grad_norm": 2.430760383605957, + "learning_rate": 4.915923222429998e-05, + "loss": 4.9746, + "step": 13921 + }, + { + "epoch": 0.08279807783804358, + "grad_norm": 1.7708054780960083, + "learning_rate": 4.915911210181199e-05, + "loss": 5.4986, + "step": 13922 + }, + { + "epoch": 0.08280402512132458, + "grad_norm": 1.7802354097366333, + "learning_rate": 4.915899197089031e-05, + "loss": 5.4283, + "step": 13923 + }, + { + "epoch": 0.08280997240460558, + "grad_norm": 2.347226142883301, + "learning_rate": 4.9158871831534984e-05, + "loss": 5.2917, + "step": 13924 + }, + { + "epoch": 0.08281591968788657, + "grad_norm": 2.5685782432556152, + "learning_rate": 4.915875168374603e-05, + "loss": 5.243, + "step": 13925 + }, + { + "epoch": 0.08282186697116757, + "grad_norm": 2.460383176803589, + "learning_rate": 4.915863152752351e-05, + "loss": 4.9241, + "step": 13926 + }, + { + "epoch": 0.08282781425444857, + "grad_norm": 2.2505056858062744, + "learning_rate": 4.915851136286747e-05, + "loss": 5.0951, + "step": 13927 + }, + { + "epoch": 0.08283376153772956, + "grad_norm": 2.517544984817505, + "learning_rate": 4.915839118977793e-05, + "loss": 5.151, + "step": 13928 + }, + { + "epoch": 0.08283970882101056, + "grad_norm": 2.445645809173584, + "learning_rate": 4.915827100825495e-05, + "loss": 5.1831, + "step": 13929 + }, + { + "epoch": 0.08284565610429157, + "grad_norm": 2.347383737564087, + "learning_rate": 4.9158150818298564e-05, + "loss": 5.0299, + "step": 13930 + }, + { + "epoch": 0.08285160338757255, + "grad_norm": 2.1791892051696777, + "learning_rate": 4.915803061990882e-05, + "loss": 5.4083, + "step": 13931 + }, + { + "epoch": 0.08285755067085356, + "grad_norm": 1.9959020614624023, + "learning_rate": 4.9157910413085764e-05, + "loss": 5.9036, + "step": 13932 + }, + { + "epoch": 0.08286349795413456, + "grad_norm": 2.3419620990753174, + "learning_rate": 4.915779019782942e-05, + "loss": 4.9082, + "step": 13933 + }, + { + "epoch": 0.08286944523741555, + "grad_norm": 2.452756643295288, + "learning_rate": 4.915766997413985e-05, + "loss": 4.8272, + "step": 13934 + }, + { + "epoch": 0.08287539252069655, + "grad_norm": 2.344353675842285, + "learning_rate": 4.915754974201708e-05, + "loss": 5.0269, + "step": 13935 + }, + { + "epoch": 0.08288133980397755, + "grad_norm": 2.366218090057373, + "learning_rate": 4.9157429501461175e-05, + "loss": 4.8898, + "step": 13936 + }, + { + "epoch": 0.08288728708725854, + "grad_norm": 1.7986581325531006, + "learning_rate": 4.915730925247214e-05, + "loss": 4.9316, + "step": 13937 + }, + { + "epoch": 0.08289323437053954, + "grad_norm": 2.059094190597534, + "learning_rate": 4.915718899505005e-05, + "loss": 5.1297, + "step": 13938 + }, + { + "epoch": 0.08289918165382054, + "grad_norm": 1.9630707502365112, + "learning_rate": 4.915706872919493e-05, + "loss": 5.4844, + "step": 13939 + }, + { + "epoch": 0.08290512893710153, + "grad_norm": 2.0281238555908203, + "learning_rate": 4.9156948454906825e-05, + "loss": 5.9276, + "step": 13940 + }, + { + "epoch": 0.08291107622038253, + "grad_norm": 1.8783270120620728, + "learning_rate": 4.9156828172185786e-05, + "loss": 5.7085, + "step": 13941 + }, + { + "epoch": 0.08291702350366352, + "grad_norm": 2.190317153930664, + "learning_rate": 4.915670788103184e-05, + "loss": 4.9619, + "step": 13942 + }, + { + "epoch": 0.08292297078694452, + "grad_norm": 2.2746498584747314, + "learning_rate": 4.915658758144505e-05, + "loss": 4.8965, + "step": 13943 + }, + { + "epoch": 0.08292891807022552, + "grad_norm": 1.940510630607605, + "learning_rate": 4.915646727342543e-05, + "loss": 5.0367, + "step": 13944 + }, + { + "epoch": 0.08293486535350651, + "grad_norm": 1.9016308784484863, + "learning_rate": 4.915634695697304e-05, + "loss": 5.5002, + "step": 13945 + }, + { + "epoch": 0.08294081263678751, + "grad_norm": 2.0041022300720215, + "learning_rate": 4.915622663208792e-05, + "loss": 5.4193, + "step": 13946 + }, + { + "epoch": 0.08294675992006852, + "grad_norm": 2.0117805004119873, + "learning_rate": 4.9156106298770115e-05, + "loss": 5.2697, + "step": 13947 + }, + { + "epoch": 0.0829527072033495, + "grad_norm": 1.864820957183838, + "learning_rate": 4.9155985957019654e-05, + "loss": 5.1594, + "step": 13948 + }, + { + "epoch": 0.0829586544866305, + "grad_norm": 1.7407771348953247, + "learning_rate": 4.91558656068366e-05, + "loss": 5.1189, + "step": 13949 + }, + { + "epoch": 0.08296460176991151, + "grad_norm": 2.027552366256714, + "learning_rate": 4.9155745248220976e-05, + "loss": 5.6257, + "step": 13950 + }, + { + "epoch": 0.0829705490531925, + "grad_norm": 1.6893701553344727, + "learning_rate": 4.9155624881172834e-05, + "loss": 5.1268, + "step": 13951 + }, + { + "epoch": 0.0829764963364735, + "grad_norm": 1.7216230630874634, + "learning_rate": 4.915550450569221e-05, + "loss": 5.2768, + "step": 13952 + }, + { + "epoch": 0.0829824436197545, + "grad_norm": 1.6723179817199707, + "learning_rate": 4.915538412177915e-05, + "loss": 5.7059, + "step": 13953 + }, + { + "epoch": 0.08298839090303549, + "grad_norm": 1.7645996809005737, + "learning_rate": 4.915526372943369e-05, + "loss": 5.6065, + "step": 13954 + }, + { + "epoch": 0.08299433818631649, + "grad_norm": 1.9206926822662354, + "learning_rate": 4.915514332865588e-05, + "loss": 4.9229, + "step": 13955 + }, + { + "epoch": 0.08300028546959749, + "grad_norm": 1.9269802570343018, + "learning_rate": 4.9155022919445766e-05, + "loss": 5.5678, + "step": 13956 + }, + { + "epoch": 0.08300623275287848, + "grad_norm": 2.378319501876831, + "learning_rate": 4.915490250180338e-05, + "loss": 4.7271, + "step": 13957 + }, + { + "epoch": 0.08301218003615948, + "grad_norm": 1.73631751537323, + "learning_rate": 4.915478207572876e-05, + "loss": 5.1302, + "step": 13958 + }, + { + "epoch": 0.08301812731944049, + "grad_norm": 1.6520816087722778, + "learning_rate": 4.915466164122196e-05, + "loss": 6.0497, + "step": 13959 + }, + { + "epoch": 0.08302407460272147, + "grad_norm": 1.7382736206054688, + "learning_rate": 4.915454119828302e-05, + "loss": 6.0155, + "step": 13960 + }, + { + "epoch": 0.08303002188600248, + "grad_norm": 1.6733272075653076, + "learning_rate": 4.915442074691197e-05, + "loss": 5.2624, + "step": 13961 + }, + { + "epoch": 0.08303596916928348, + "grad_norm": 2.0024397373199463, + "learning_rate": 4.915430028710887e-05, + "loss": 5.4794, + "step": 13962 + }, + { + "epoch": 0.08304191645256447, + "grad_norm": 1.9784339666366577, + "learning_rate": 4.915417981887375e-05, + "loss": 5.1546, + "step": 13963 + }, + { + "epoch": 0.08304786373584547, + "grad_norm": 1.7146525382995605, + "learning_rate": 4.915405934220666e-05, + "loss": 5.6269, + "step": 13964 + }, + { + "epoch": 0.08305381101912647, + "grad_norm": 1.7252057790756226, + "learning_rate": 4.9153938857107626e-05, + "loss": 5.7015, + "step": 13965 + }, + { + "epoch": 0.08305975830240746, + "grad_norm": 1.6623241901397705, + "learning_rate": 4.9153818363576715e-05, + "loss": 5.5249, + "step": 13966 + }, + { + "epoch": 0.08306570558568846, + "grad_norm": 2.0701472759246826, + "learning_rate": 4.9153697861613944e-05, + "loss": 5.3528, + "step": 13967 + }, + { + "epoch": 0.08307165286896946, + "grad_norm": 1.6600522994995117, + "learning_rate": 4.915357735121938e-05, + "loss": 5.3454, + "step": 13968 + }, + { + "epoch": 0.08307760015225045, + "grad_norm": 2.093092918395996, + "learning_rate": 4.915345683239304e-05, + "loss": 5.2417, + "step": 13969 + }, + { + "epoch": 0.08308354743553145, + "grad_norm": 1.9673899412155151, + "learning_rate": 4.915333630513498e-05, + "loss": 5.1908, + "step": 13970 + }, + { + "epoch": 0.08308949471881244, + "grad_norm": 1.8442246913909912, + "learning_rate": 4.915321576944524e-05, + "loss": 5.6287, + "step": 13971 + }, + { + "epoch": 0.08309544200209344, + "grad_norm": 1.5737566947937012, + "learning_rate": 4.9153095225323864e-05, + "loss": 5.7533, + "step": 13972 + }, + { + "epoch": 0.08310138928537444, + "grad_norm": 1.7948611974716187, + "learning_rate": 4.915297467277089e-05, + "loss": 5.5739, + "step": 13973 + }, + { + "epoch": 0.08310733656865543, + "grad_norm": 2.0080626010894775, + "learning_rate": 4.915285411178637e-05, + "loss": 5.5505, + "step": 13974 + }, + { + "epoch": 0.08311328385193643, + "grad_norm": 1.7838460206985474, + "learning_rate": 4.915273354237033e-05, + "loss": 6.0133, + "step": 13975 + }, + { + "epoch": 0.08311923113521744, + "grad_norm": 1.7599917650222778, + "learning_rate": 4.915261296452282e-05, + "loss": 5.6552, + "step": 13976 + }, + { + "epoch": 0.08312517841849842, + "grad_norm": 1.6211295127868652, + "learning_rate": 4.915249237824388e-05, + "loss": 5.6797, + "step": 13977 + }, + { + "epoch": 0.08313112570177943, + "grad_norm": 1.7404415607452393, + "learning_rate": 4.9152371783533565e-05, + "loss": 5.5134, + "step": 13978 + }, + { + "epoch": 0.08313707298506043, + "grad_norm": 1.8577871322631836, + "learning_rate": 4.9152251180391895e-05, + "loss": 5.5823, + "step": 13979 + }, + { + "epoch": 0.08314302026834142, + "grad_norm": 1.6060470342636108, + "learning_rate": 4.915213056881893e-05, + "loss": 5.5875, + "step": 13980 + }, + { + "epoch": 0.08314896755162242, + "grad_norm": 1.915451169013977, + "learning_rate": 4.91520099488147e-05, + "loss": 5.279, + "step": 13981 + }, + { + "epoch": 0.08315491483490342, + "grad_norm": 2.281404972076416, + "learning_rate": 4.9151889320379265e-05, + "loss": 5.0863, + "step": 13982 + }, + { + "epoch": 0.08316086211818441, + "grad_norm": 1.9069279432296753, + "learning_rate": 4.9151768683512646e-05, + "loss": 5.3055, + "step": 13983 + }, + { + "epoch": 0.08316680940146541, + "grad_norm": 1.810571312904358, + "learning_rate": 4.915164803821489e-05, + "loss": 5.4988, + "step": 13984 + }, + { + "epoch": 0.08317275668474641, + "grad_norm": 1.788197636604309, + "learning_rate": 4.915152738448605e-05, + "loss": 5.6627, + "step": 13985 + }, + { + "epoch": 0.0831787039680274, + "grad_norm": 2.294187545776367, + "learning_rate": 4.9151406722326165e-05, + "loss": 5.1977, + "step": 13986 + }, + { + "epoch": 0.0831846512513084, + "grad_norm": 2.584395170211792, + "learning_rate": 4.915128605173527e-05, + "loss": 5.1909, + "step": 13987 + }, + { + "epoch": 0.0831905985345894, + "grad_norm": 2.249406576156616, + "learning_rate": 4.9151165372713405e-05, + "loss": 5.1109, + "step": 13988 + }, + { + "epoch": 0.0831965458178704, + "grad_norm": 1.8678929805755615, + "learning_rate": 4.915104468526062e-05, + "loss": 5.1035, + "step": 13989 + }, + { + "epoch": 0.0832024931011514, + "grad_norm": 2.139711856842041, + "learning_rate": 4.915092398937696e-05, + "loss": 5.0151, + "step": 13990 + }, + { + "epoch": 0.0832084403844324, + "grad_norm": 2.1683461666107178, + "learning_rate": 4.915080328506246e-05, + "loss": 5.1097, + "step": 13991 + }, + { + "epoch": 0.08321438766771339, + "grad_norm": 2.1205332279205322, + "learning_rate": 4.9150682572317165e-05, + "loss": 4.9998, + "step": 13992 + }, + { + "epoch": 0.08322033495099439, + "grad_norm": 1.8642542362213135, + "learning_rate": 4.915056185114111e-05, + "loss": 5.8554, + "step": 13993 + }, + { + "epoch": 0.08322628223427539, + "grad_norm": 2.1150970458984375, + "learning_rate": 4.915044112153435e-05, + "loss": 5.5297, + "step": 13994 + }, + { + "epoch": 0.08323222951755638, + "grad_norm": 2.584157943725586, + "learning_rate": 4.9150320383496915e-05, + "loss": 5.0058, + "step": 13995 + }, + { + "epoch": 0.08323817680083738, + "grad_norm": 2.305853843688965, + "learning_rate": 4.9150199637028854e-05, + "loss": 5.0785, + "step": 13996 + }, + { + "epoch": 0.08324412408411838, + "grad_norm": 2.0386359691619873, + "learning_rate": 4.9150078882130214e-05, + "loss": 5.1104, + "step": 13997 + }, + { + "epoch": 0.08325007136739937, + "grad_norm": 1.6055399179458618, + "learning_rate": 4.914995811880102e-05, + "loss": 5.778, + "step": 13998 + }, + { + "epoch": 0.08325601865068037, + "grad_norm": 1.635704517364502, + "learning_rate": 4.9149837347041334e-05, + "loss": 6.1107, + "step": 13999 + }, + { + "epoch": 0.08326196593396136, + "grad_norm": 1.8098101615905762, + "learning_rate": 4.9149716566851184e-05, + "loss": 6.1197, + "step": 14000 + }, + { + "epoch": 0.08326791321724236, + "grad_norm": 1.5740363597869873, + "learning_rate": 4.914959577823062e-05, + "loss": 5.7821, + "step": 14001 + }, + { + "epoch": 0.08327386050052336, + "grad_norm": 1.4634822607040405, + "learning_rate": 4.914947498117968e-05, + "loss": 5.7062, + "step": 14002 + }, + { + "epoch": 0.08327980778380435, + "grad_norm": 1.7310374975204468, + "learning_rate": 4.914935417569841e-05, + "loss": 5.6689, + "step": 14003 + }, + { + "epoch": 0.08328575506708535, + "grad_norm": 1.5742056369781494, + "learning_rate": 4.914923336178685e-05, + "loss": 5.6529, + "step": 14004 + }, + { + "epoch": 0.08329170235036636, + "grad_norm": 1.6353307962417603, + "learning_rate": 4.914911253944504e-05, + "loss": 5.4564, + "step": 14005 + }, + { + "epoch": 0.08329764963364734, + "grad_norm": 1.8744231462478638, + "learning_rate": 4.9148991708673024e-05, + "loss": 5.305, + "step": 14006 + }, + { + "epoch": 0.08330359691692835, + "grad_norm": 1.9766863584518433, + "learning_rate": 4.914887086947085e-05, + "loss": 5.711, + "step": 14007 + }, + { + "epoch": 0.08330954420020935, + "grad_norm": 2.1832756996154785, + "learning_rate": 4.914875002183855e-05, + "loss": 4.9322, + "step": 14008 + }, + { + "epoch": 0.08331549148349034, + "grad_norm": 2.2370998859405518, + "learning_rate": 4.914862916577617e-05, + "loss": 4.512, + "step": 14009 + }, + { + "epoch": 0.08332143876677134, + "grad_norm": 2.2743804454803467, + "learning_rate": 4.914850830128376e-05, + "loss": 4.5716, + "step": 14010 + }, + { + "epoch": 0.08332738605005234, + "grad_norm": 2.3644347190856934, + "learning_rate": 4.914838742836134e-05, + "loss": 4.1288, + "step": 14011 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 3.1034274101257324, + "learning_rate": 4.9148266547008984e-05, + "loss": 5.2864, + "step": 14012 + }, + { + "epoch": 0.08333928061661433, + "grad_norm": 2.240302801132202, + "learning_rate": 4.914814565722671e-05, + "loss": 5.3452, + "step": 14013 + }, + { + "epoch": 0.08334522789989533, + "grad_norm": 2.0743885040283203, + "learning_rate": 4.9148024759014566e-05, + "loss": 5.4338, + "step": 14014 + }, + { + "epoch": 0.08335117518317632, + "grad_norm": 2.0169663429260254, + "learning_rate": 4.91479038523726e-05, + "loss": 5.5108, + "step": 14015 + }, + { + "epoch": 0.08335712246645732, + "grad_norm": 1.9730015993118286, + "learning_rate": 4.914778293730085e-05, + "loss": 5.6413, + "step": 14016 + }, + { + "epoch": 0.08336306974973832, + "grad_norm": 2.3047432899475098, + "learning_rate": 4.914766201379936e-05, + "loss": 5.4111, + "step": 14017 + }, + { + "epoch": 0.08336901703301931, + "grad_norm": 3.079416275024414, + "learning_rate": 4.914754108186816e-05, + "loss": 5.5591, + "step": 14018 + }, + { + "epoch": 0.08337496431630032, + "grad_norm": 1.9374867677688599, + "learning_rate": 4.9147420141507314e-05, + "loss": 5.9295, + "step": 14019 + }, + { + "epoch": 0.08338091159958132, + "grad_norm": 1.874292016029358, + "learning_rate": 4.9147299192716855e-05, + "loss": 5.6846, + "step": 14020 + }, + { + "epoch": 0.0833868588828623, + "grad_norm": 1.8852506875991821, + "learning_rate": 4.914717823549682e-05, + "loss": 5.621, + "step": 14021 + }, + { + "epoch": 0.08339280616614331, + "grad_norm": 1.9332367181777954, + "learning_rate": 4.914705726984725e-05, + "loss": 5.8584, + "step": 14022 + }, + { + "epoch": 0.08339875344942431, + "grad_norm": 1.6252962350845337, + "learning_rate": 4.91469362957682e-05, + "loss": 5.8173, + "step": 14023 + }, + { + "epoch": 0.0834047007327053, + "grad_norm": 1.6760259866714478, + "learning_rate": 4.9146815313259695e-05, + "loss": 5.5441, + "step": 14024 + }, + { + "epoch": 0.0834106480159863, + "grad_norm": 1.4979921579360962, + "learning_rate": 4.9146694322321785e-05, + "loss": 6.1467, + "step": 14025 + }, + { + "epoch": 0.0834165952992673, + "grad_norm": 1.4720534086227417, + "learning_rate": 4.914657332295453e-05, + "loss": 5.8626, + "step": 14026 + }, + { + "epoch": 0.08342254258254829, + "grad_norm": 1.6709620952606201, + "learning_rate": 4.914645231515794e-05, + "loss": 5.8468, + "step": 14027 + }, + { + "epoch": 0.08342848986582929, + "grad_norm": 1.6389116048812866, + "learning_rate": 4.9146331298932075e-05, + "loss": 5.9222, + "step": 14028 + }, + { + "epoch": 0.08343443714911028, + "grad_norm": 1.4344384670257568, + "learning_rate": 4.9146210274276974e-05, + "loss": 5.5457, + "step": 14029 + }, + { + "epoch": 0.08344038443239128, + "grad_norm": 1.472469449043274, + "learning_rate": 4.914608924119268e-05, + "loss": 5.608, + "step": 14030 + }, + { + "epoch": 0.08344633171567228, + "grad_norm": 1.6688710451126099, + "learning_rate": 4.914596819967925e-05, + "loss": 5.7982, + "step": 14031 + }, + { + "epoch": 0.08345227899895327, + "grad_norm": 1.6417087316513062, + "learning_rate": 4.9145847149736704e-05, + "loss": 5.6498, + "step": 14032 + }, + { + "epoch": 0.08345822628223427, + "grad_norm": 1.5726937055587769, + "learning_rate": 4.9145726091365084e-05, + "loss": 5.8723, + "step": 14033 + }, + { + "epoch": 0.08346417356551528, + "grad_norm": 1.7523616552352905, + "learning_rate": 4.914560502456444e-05, + "loss": 6.1967, + "step": 14034 + }, + { + "epoch": 0.08347012084879626, + "grad_norm": 1.8270281553268433, + "learning_rate": 4.914548394933483e-05, + "loss": 6.0493, + "step": 14035 + }, + { + "epoch": 0.08347606813207727, + "grad_norm": 1.8113981485366821, + "learning_rate": 4.914536286567627e-05, + "loss": 5.2815, + "step": 14036 + }, + { + "epoch": 0.08348201541535827, + "grad_norm": 1.7894388437271118, + "learning_rate": 4.914524177358881e-05, + "loss": 5.2606, + "step": 14037 + }, + { + "epoch": 0.08348796269863926, + "grad_norm": 1.7994349002838135, + "learning_rate": 4.9145120673072505e-05, + "loss": 5.025, + "step": 14038 + }, + { + "epoch": 0.08349390998192026, + "grad_norm": 1.6934137344360352, + "learning_rate": 4.914499956412738e-05, + "loss": 5.0455, + "step": 14039 + }, + { + "epoch": 0.08349985726520126, + "grad_norm": 1.549500823020935, + "learning_rate": 4.914487844675349e-05, + "loss": 5.3836, + "step": 14040 + }, + { + "epoch": 0.08350580454848225, + "grad_norm": 1.7452481985092163, + "learning_rate": 4.9144757320950873e-05, + "loss": 5.0175, + "step": 14041 + }, + { + "epoch": 0.08351175183176325, + "grad_norm": 1.9420257806777954, + "learning_rate": 4.914463618671957e-05, + "loss": 5.0146, + "step": 14042 + }, + { + "epoch": 0.08351769911504425, + "grad_norm": 1.798431158065796, + "learning_rate": 4.914451504405962e-05, + "loss": 4.7656, + "step": 14043 + }, + { + "epoch": 0.08352364639832524, + "grad_norm": 1.7167326211929321, + "learning_rate": 4.914439389297107e-05, + "loss": 4.7518, + "step": 14044 + }, + { + "epoch": 0.08352959368160624, + "grad_norm": 1.7150487899780273, + "learning_rate": 4.914427273345397e-05, + "loss": 4.8298, + "step": 14045 + }, + { + "epoch": 0.08353554096488724, + "grad_norm": 1.7048633098602295, + "learning_rate": 4.914415156550834e-05, + "loss": 5.0039, + "step": 14046 + }, + { + "epoch": 0.08354148824816823, + "grad_norm": 1.364012598991394, + "learning_rate": 4.914403038913425e-05, + "loss": 5.3718, + "step": 14047 + }, + { + "epoch": 0.08354743553144924, + "grad_norm": 2.29878830909729, + "learning_rate": 4.9143909204331716e-05, + "loss": 4.8874, + "step": 14048 + }, + { + "epoch": 0.08355338281473024, + "grad_norm": 2.1153953075408936, + "learning_rate": 4.91437880111008e-05, + "loss": 4.6646, + "step": 14049 + }, + { + "epoch": 0.08355933009801123, + "grad_norm": 2.289346218109131, + "learning_rate": 4.914366680944153e-05, + "loss": 4.7966, + "step": 14050 + }, + { + "epoch": 0.08356527738129223, + "grad_norm": 1.8394019603729248, + "learning_rate": 4.9143545599353965e-05, + "loss": 5.1788, + "step": 14051 + }, + { + "epoch": 0.08357122466457323, + "grad_norm": 2.192802667617798, + "learning_rate": 4.9143424380838136e-05, + "loss": 5.4549, + "step": 14052 + }, + { + "epoch": 0.08357717194785422, + "grad_norm": 2.128356695175171, + "learning_rate": 4.9143303153894085e-05, + "loss": 5.6652, + "step": 14053 + }, + { + "epoch": 0.08358311923113522, + "grad_norm": 2.0716452598571777, + "learning_rate": 4.914318191852186e-05, + "loss": 5.7013, + "step": 14054 + }, + { + "epoch": 0.08358906651441622, + "grad_norm": 2.298940658569336, + "learning_rate": 4.91430606747215e-05, + "loss": 5.565, + "step": 14055 + }, + { + "epoch": 0.08359501379769721, + "grad_norm": 2.250102996826172, + "learning_rate": 4.914293942249304e-05, + "loss": 5.6935, + "step": 14056 + }, + { + "epoch": 0.08360096108097821, + "grad_norm": 2.123037576675415, + "learning_rate": 4.914281816183653e-05, + "loss": 5.624, + "step": 14057 + }, + { + "epoch": 0.0836069083642592, + "grad_norm": 1.833024501800537, + "learning_rate": 4.9142696892752013e-05, + "loss": 5.4329, + "step": 14058 + }, + { + "epoch": 0.0836128556475402, + "grad_norm": 1.8438977003097534, + "learning_rate": 4.9142575615239526e-05, + "loss": 5.294, + "step": 14059 + }, + { + "epoch": 0.0836188029308212, + "grad_norm": 1.805525541305542, + "learning_rate": 4.914245432929913e-05, + "loss": 5.3778, + "step": 14060 + }, + { + "epoch": 0.08362475021410219, + "grad_norm": 1.5750529766082764, + "learning_rate": 4.9142333034930835e-05, + "loss": 5.357, + "step": 14061 + }, + { + "epoch": 0.0836306974973832, + "grad_norm": 1.3928825855255127, + "learning_rate": 4.914221173213471e-05, + "loss": 5.5141, + "step": 14062 + }, + { + "epoch": 0.0836366447806642, + "grad_norm": 1.6307804584503174, + "learning_rate": 4.914209042091079e-05, + "loss": 5.3687, + "step": 14063 + }, + { + "epoch": 0.08364259206394518, + "grad_norm": 1.533963680267334, + "learning_rate": 4.914196910125911e-05, + "loss": 5.7295, + "step": 14064 + }, + { + "epoch": 0.08364853934722619, + "grad_norm": 1.4950587749481201, + "learning_rate": 4.914184777317972e-05, + "loss": 5.816, + "step": 14065 + }, + { + "epoch": 0.08365448663050719, + "grad_norm": 1.3246190547943115, + "learning_rate": 4.914172643667266e-05, + "loss": 5.6925, + "step": 14066 + }, + { + "epoch": 0.08366043391378818, + "grad_norm": 1.4816724061965942, + "learning_rate": 4.9141605091737975e-05, + "loss": 5.6528, + "step": 14067 + }, + { + "epoch": 0.08366638119706918, + "grad_norm": 1.6656372547149658, + "learning_rate": 4.914148373837571e-05, + "loss": 5.4619, + "step": 14068 + }, + { + "epoch": 0.08367232848035018, + "grad_norm": 1.2973356246948242, + "learning_rate": 4.914136237658589e-05, + "loss": 5.5467, + "step": 14069 + }, + { + "epoch": 0.08367827576363117, + "grad_norm": 1.7669901847839355, + "learning_rate": 4.914124100636857e-05, + "loss": 5.2213, + "step": 14070 + }, + { + "epoch": 0.08368422304691217, + "grad_norm": 1.7352882623672485, + "learning_rate": 4.91411196277238e-05, + "loss": 5.2938, + "step": 14071 + }, + { + "epoch": 0.08369017033019317, + "grad_norm": 1.5912410020828247, + "learning_rate": 4.914099824065161e-05, + "loss": 5.4139, + "step": 14072 + }, + { + "epoch": 0.08369611761347416, + "grad_norm": 1.46699059009552, + "learning_rate": 4.914087684515205e-05, + "loss": 5.2317, + "step": 14073 + }, + { + "epoch": 0.08370206489675516, + "grad_norm": 3.0727121829986572, + "learning_rate": 4.914075544122516e-05, + "loss": 5.2324, + "step": 14074 + }, + { + "epoch": 0.08370801218003616, + "grad_norm": 1.4887278079986572, + "learning_rate": 4.914063402887098e-05, + "loss": 5.0331, + "step": 14075 + }, + { + "epoch": 0.08371395946331715, + "grad_norm": 1.4677956104278564, + "learning_rate": 4.9140512608089555e-05, + "loss": 5.0892, + "step": 14076 + }, + { + "epoch": 0.08371990674659816, + "grad_norm": 1.3760831356048584, + "learning_rate": 4.914039117888093e-05, + "loss": 5.3738, + "step": 14077 + }, + { + "epoch": 0.08372585402987916, + "grad_norm": 1.6125822067260742, + "learning_rate": 4.9140269741245135e-05, + "loss": 5.4629, + "step": 14078 + }, + { + "epoch": 0.08373180131316015, + "grad_norm": 1.6336333751678467, + "learning_rate": 4.9140148295182226e-05, + "loss": 5.2533, + "step": 14079 + }, + { + "epoch": 0.08373774859644115, + "grad_norm": 1.6296573877334595, + "learning_rate": 4.9140026840692247e-05, + "loss": 4.8288, + "step": 14080 + }, + { + "epoch": 0.08374369587972215, + "grad_norm": 1.6058591604232788, + "learning_rate": 4.913990537777522e-05, + "loss": 5.0549, + "step": 14081 + }, + { + "epoch": 0.08374964316300314, + "grad_norm": 1.6199642419815063, + "learning_rate": 4.9139783906431214e-05, + "loss": 5.2387, + "step": 14082 + }, + { + "epoch": 0.08375559044628414, + "grad_norm": 1.7537976503372192, + "learning_rate": 4.913966242666025e-05, + "loss": 5.2766, + "step": 14083 + }, + { + "epoch": 0.08376153772956514, + "grad_norm": 1.579128384590149, + "learning_rate": 4.9139540938462384e-05, + "loss": 5.2251, + "step": 14084 + }, + { + "epoch": 0.08376748501284613, + "grad_norm": 1.7070518732070923, + "learning_rate": 4.913941944183765e-05, + "loss": 5.0699, + "step": 14085 + }, + { + "epoch": 0.08377343229612713, + "grad_norm": 1.4739151000976562, + "learning_rate": 4.91392979367861e-05, + "loss": 5.229, + "step": 14086 + }, + { + "epoch": 0.08377937957940812, + "grad_norm": 1.6380045413970947, + "learning_rate": 4.9139176423307764e-05, + "loss": 5.0977, + "step": 14087 + }, + { + "epoch": 0.08378532686268912, + "grad_norm": 1.640865445137024, + "learning_rate": 4.91390549014027e-05, + "loss": 5.1106, + "step": 14088 + }, + { + "epoch": 0.08379127414597012, + "grad_norm": 1.7274518013000488, + "learning_rate": 4.913893337107093e-05, + "loss": 5.2093, + "step": 14089 + }, + { + "epoch": 0.08379722142925111, + "grad_norm": 1.7702603340148926, + "learning_rate": 4.913881183231251e-05, + "loss": 5.1314, + "step": 14090 + }, + { + "epoch": 0.08380316871253211, + "grad_norm": 1.766479253768921, + "learning_rate": 4.913869028512749e-05, + "loss": 5.1266, + "step": 14091 + }, + { + "epoch": 0.08380911599581312, + "grad_norm": 1.5863205194473267, + "learning_rate": 4.91385687295159e-05, + "loss": 5.1487, + "step": 14092 + }, + { + "epoch": 0.0838150632790941, + "grad_norm": 1.6770803928375244, + "learning_rate": 4.913844716547777e-05, + "loss": 5.2479, + "step": 14093 + }, + { + "epoch": 0.0838210105623751, + "grad_norm": 1.8650991916656494, + "learning_rate": 4.913832559301317e-05, + "loss": 5.2748, + "step": 14094 + }, + { + "epoch": 0.08382695784565611, + "grad_norm": 1.7304933071136475, + "learning_rate": 4.913820401212213e-05, + "loss": 5.2572, + "step": 14095 + }, + { + "epoch": 0.0838329051289371, + "grad_norm": 1.7103501558303833, + "learning_rate": 4.9138082422804695e-05, + "loss": 5.1145, + "step": 14096 + }, + { + "epoch": 0.0838388524122181, + "grad_norm": 1.8390073776245117, + "learning_rate": 4.91379608250609e-05, + "loss": 5.1171, + "step": 14097 + }, + { + "epoch": 0.0838447996954991, + "grad_norm": 1.815047264099121, + "learning_rate": 4.913783921889079e-05, + "loss": 5.2329, + "step": 14098 + }, + { + "epoch": 0.08385074697878009, + "grad_norm": 1.4381682872772217, + "learning_rate": 4.9137717604294415e-05, + "loss": 5.1098, + "step": 14099 + }, + { + "epoch": 0.08385669426206109, + "grad_norm": 1.6523853540420532, + "learning_rate": 4.9137595981271815e-05, + "loss": 5.1352, + "step": 14100 + }, + { + "epoch": 0.08386264154534209, + "grad_norm": 1.377199649810791, + "learning_rate": 4.913747434982302e-05, + "loss": 5.1191, + "step": 14101 + }, + { + "epoch": 0.08386858882862308, + "grad_norm": 1.5858699083328247, + "learning_rate": 4.913735270994809e-05, + "loss": 5.0569, + "step": 14102 + }, + { + "epoch": 0.08387453611190408, + "grad_norm": 1.608522891998291, + "learning_rate": 4.913723106164705e-05, + "loss": 4.8834, + "step": 14103 + }, + { + "epoch": 0.08388048339518508, + "grad_norm": 1.7063453197479248, + "learning_rate": 4.913710940491996e-05, + "loss": 4.9019, + "step": 14104 + }, + { + "epoch": 0.08388643067846607, + "grad_norm": 1.5008784532546997, + "learning_rate": 4.913698773976685e-05, + "loss": 4.8423, + "step": 14105 + }, + { + "epoch": 0.08389237796174707, + "grad_norm": 1.8743178844451904, + "learning_rate": 4.913686606618777e-05, + "loss": 4.9256, + "step": 14106 + }, + { + "epoch": 0.08389832524502808, + "grad_norm": 1.813094973564148, + "learning_rate": 4.9136744384182764e-05, + "loss": 4.9245, + "step": 14107 + }, + { + "epoch": 0.08390427252830907, + "grad_norm": 1.9561067819595337, + "learning_rate": 4.913662269375186e-05, + "loss": 4.8459, + "step": 14108 + }, + { + "epoch": 0.08391021981159007, + "grad_norm": 1.6159533262252808, + "learning_rate": 4.913650099489512e-05, + "loss": 4.8092, + "step": 14109 + }, + { + "epoch": 0.08391616709487107, + "grad_norm": 1.5819872617721558, + "learning_rate": 4.913637928761257e-05, + "loss": 4.9047, + "step": 14110 + }, + { + "epoch": 0.08392211437815206, + "grad_norm": 1.6294678449630737, + "learning_rate": 4.913625757190426e-05, + "loss": 4.6908, + "step": 14111 + }, + { + "epoch": 0.08392806166143306, + "grad_norm": 1.5048410892486572, + "learning_rate": 4.913613584777024e-05, + "loss": 5.2021, + "step": 14112 + }, + { + "epoch": 0.08393400894471406, + "grad_norm": 1.626280665397644, + "learning_rate": 4.9136014115210525e-05, + "loss": 5.4592, + "step": 14113 + }, + { + "epoch": 0.08393995622799505, + "grad_norm": 1.662269115447998, + "learning_rate": 4.91358923742252e-05, + "loss": 5.0027, + "step": 14114 + }, + { + "epoch": 0.08394590351127605, + "grad_norm": 1.5630388259887695, + "learning_rate": 4.913577062481427e-05, + "loss": 5.3327, + "step": 14115 + }, + { + "epoch": 0.08395185079455704, + "grad_norm": 1.4223047494888306, + "learning_rate": 4.913564886697779e-05, + "loss": 5.5081, + "step": 14116 + }, + { + "epoch": 0.08395779807783804, + "grad_norm": 1.3298295736312866, + "learning_rate": 4.9135527100715814e-05, + "loss": 5.3783, + "step": 14117 + }, + { + "epoch": 0.08396374536111904, + "grad_norm": 1.335779070854187, + "learning_rate": 4.913540532602837e-05, + "loss": 5.3901, + "step": 14118 + }, + { + "epoch": 0.08396969264440003, + "grad_norm": 1.5331017971038818, + "learning_rate": 4.913528354291551e-05, + "loss": 5.5643, + "step": 14119 + }, + { + "epoch": 0.08397563992768103, + "grad_norm": 1.703400731086731, + "learning_rate": 4.913516175137727e-05, + "loss": 5.4256, + "step": 14120 + }, + { + "epoch": 0.08398158721096204, + "grad_norm": 1.5330191850662231, + "learning_rate": 4.913503995141369e-05, + "loss": 5.2509, + "step": 14121 + }, + { + "epoch": 0.08398753449424302, + "grad_norm": 1.7405961751937866, + "learning_rate": 4.913491814302482e-05, + "loss": 5.4171, + "step": 14122 + }, + { + "epoch": 0.08399348177752403, + "grad_norm": 1.2550197839736938, + "learning_rate": 4.9134796326210696e-05, + "loss": 5.3908, + "step": 14123 + }, + { + "epoch": 0.08399942906080503, + "grad_norm": 1.2029253244400024, + "learning_rate": 4.9134674500971366e-05, + "loss": 5.5355, + "step": 14124 + }, + { + "epoch": 0.08400537634408602, + "grad_norm": 1.2968589067459106, + "learning_rate": 4.913455266730687e-05, + "loss": 5.4007, + "step": 14125 + }, + { + "epoch": 0.08401132362736702, + "grad_norm": 1.2636605501174927, + "learning_rate": 4.913443082521725e-05, + "loss": 5.2402, + "step": 14126 + }, + { + "epoch": 0.08401727091064802, + "grad_norm": 1.2112632989883423, + "learning_rate": 4.9134308974702554e-05, + "loss": 5.2595, + "step": 14127 + }, + { + "epoch": 0.08402321819392901, + "grad_norm": 1.447730302810669, + "learning_rate": 4.913418711576282e-05, + "loss": 5.2688, + "step": 14128 + }, + { + "epoch": 0.08402916547721001, + "grad_norm": 1.4328616857528687, + "learning_rate": 4.913406524839809e-05, + "loss": 5.2368, + "step": 14129 + }, + { + "epoch": 0.08403511276049101, + "grad_norm": 1.4782198667526245, + "learning_rate": 4.91339433726084e-05, + "loss": 5.2019, + "step": 14130 + }, + { + "epoch": 0.084041060043772, + "grad_norm": 1.499373197555542, + "learning_rate": 4.913382148839381e-05, + "loss": 5.3352, + "step": 14131 + }, + { + "epoch": 0.084047007327053, + "grad_norm": 1.37551748752594, + "learning_rate": 4.9133699595754346e-05, + "loss": 5.1566, + "step": 14132 + }, + { + "epoch": 0.084052954610334, + "grad_norm": 1.6400420665740967, + "learning_rate": 4.913357769469006e-05, + "loss": 5.5225, + "step": 14133 + }, + { + "epoch": 0.08405890189361499, + "grad_norm": 1.3855832815170288, + "learning_rate": 4.913345578520099e-05, + "loss": 5.4466, + "step": 14134 + }, + { + "epoch": 0.084064849176896, + "grad_norm": 1.783508062362671, + "learning_rate": 4.913333386728718e-05, + "loss": 5.1713, + "step": 14135 + }, + { + "epoch": 0.084070796460177, + "grad_norm": 2.435201406478882, + "learning_rate": 4.913321194094866e-05, + "loss": 4.9899, + "step": 14136 + }, + { + "epoch": 0.08407674374345799, + "grad_norm": 1.708850622177124, + "learning_rate": 4.91330900061855e-05, + "loss": 5.0808, + "step": 14137 + }, + { + "epoch": 0.08408269102673899, + "grad_norm": 1.583473801612854, + "learning_rate": 4.913296806299773e-05, + "loss": 5.0164, + "step": 14138 + }, + { + "epoch": 0.08408863831001999, + "grad_norm": 1.6990292072296143, + "learning_rate": 4.9132846111385386e-05, + "loss": 4.9476, + "step": 14139 + }, + { + "epoch": 0.08409458559330098, + "grad_norm": 1.6386258602142334, + "learning_rate": 4.913272415134851e-05, + "loss": 4.9357, + "step": 14140 + }, + { + "epoch": 0.08410053287658198, + "grad_norm": 1.258575439453125, + "learning_rate": 4.9132602182887156e-05, + "loss": 4.7666, + "step": 14141 + }, + { + "epoch": 0.08410648015986298, + "grad_norm": 1.3333406448364258, + "learning_rate": 4.913248020600135e-05, + "loss": 4.698, + "step": 14142 + }, + { + "epoch": 0.08411242744314397, + "grad_norm": 1.3663051128387451, + "learning_rate": 4.913235822069116e-05, + "loss": 4.9414, + "step": 14143 + }, + { + "epoch": 0.08411837472642497, + "grad_norm": 1.6906498670578003, + "learning_rate": 4.91322362269566e-05, + "loss": 5.281, + "step": 14144 + }, + { + "epoch": 0.08412432200970596, + "grad_norm": 1.2671558856964111, + "learning_rate": 4.9132114224797735e-05, + "loss": 5.2566, + "step": 14145 + }, + { + "epoch": 0.08413026929298696, + "grad_norm": 1.4022216796875, + "learning_rate": 4.9131992214214586e-05, + "loss": 5.128, + "step": 14146 + }, + { + "epoch": 0.08413621657626796, + "grad_norm": 1.4810549020767212, + "learning_rate": 4.913187019520722e-05, + "loss": 5.0172, + "step": 14147 + }, + { + "epoch": 0.08414216385954895, + "grad_norm": 1.2757905721664429, + "learning_rate": 4.913174816777566e-05, + "loss": 5.3796, + "step": 14148 + }, + { + "epoch": 0.08414811114282995, + "grad_norm": 1.4088176488876343, + "learning_rate": 4.913162613191996e-05, + "loss": 5.4586, + "step": 14149 + }, + { + "epoch": 0.08415405842611096, + "grad_norm": 1.5218896865844727, + "learning_rate": 4.9131504087640154e-05, + "loss": 5.1652, + "step": 14150 + }, + { + "epoch": 0.08416000570939194, + "grad_norm": 1.4234968423843384, + "learning_rate": 4.913138203493629e-05, + "loss": 5.1917, + "step": 14151 + }, + { + "epoch": 0.08416595299267295, + "grad_norm": 1.4841183423995972, + "learning_rate": 4.913125997380842e-05, + "loss": 5.2818, + "step": 14152 + }, + { + "epoch": 0.08417190027595395, + "grad_norm": 1.8631536960601807, + "learning_rate": 4.9131137904256564e-05, + "loss": 5.4848, + "step": 14153 + }, + { + "epoch": 0.08417784755923494, + "grad_norm": 1.5508880615234375, + "learning_rate": 4.913101582628078e-05, + "loss": 5.3698, + "step": 14154 + }, + { + "epoch": 0.08418379484251594, + "grad_norm": 1.2428319454193115, + "learning_rate": 4.913089373988111e-05, + "loss": 5.2071, + "step": 14155 + }, + { + "epoch": 0.08418974212579694, + "grad_norm": 1.405325174331665, + "learning_rate": 4.91307716450576e-05, + "loss": 5.1774, + "step": 14156 + }, + { + "epoch": 0.08419568940907793, + "grad_norm": 1.6800439357757568, + "learning_rate": 4.913064954181028e-05, + "loss": 5.3735, + "step": 14157 + }, + { + "epoch": 0.08420163669235893, + "grad_norm": 1.475174069404602, + "learning_rate": 4.9130527430139194e-05, + "loss": 5.3303, + "step": 14158 + }, + { + "epoch": 0.08420758397563993, + "grad_norm": 1.5441967248916626, + "learning_rate": 4.91304053100444e-05, + "loss": 5.3007, + "step": 14159 + }, + { + "epoch": 0.08421353125892092, + "grad_norm": 1.3798770904541016, + "learning_rate": 4.913028318152593e-05, + "loss": 5.287, + "step": 14160 + }, + { + "epoch": 0.08421947854220192, + "grad_norm": 1.4294620752334595, + "learning_rate": 4.913016104458382e-05, + "loss": 5.3159, + "step": 14161 + }, + { + "epoch": 0.08422542582548292, + "grad_norm": 1.4971884489059448, + "learning_rate": 4.913003889921812e-05, + "loss": 5.4701, + "step": 14162 + }, + { + "epoch": 0.08423137310876391, + "grad_norm": 1.447045922279358, + "learning_rate": 4.912991674542888e-05, + "loss": 5.306, + "step": 14163 + }, + { + "epoch": 0.08423732039204491, + "grad_norm": 1.7867134809494019, + "learning_rate": 4.9129794583216135e-05, + "loss": 4.8653, + "step": 14164 + }, + { + "epoch": 0.08424326767532592, + "grad_norm": 1.6931066513061523, + "learning_rate": 4.912967241257993e-05, + "loss": 4.7628, + "step": 14165 + }, + { + "epoch": 0.0842492149586069, + "grad_norm": 1.6567879915237427, + "learning_rate": 4.91295502335203e-05, + "loss": 4.7857, + "step": 14166 + }, + { + "epoch": 0.08425516224188791, + "grad_norm": 1.6891521215438843, + "learning_rate": 4.91294280460373e-05, + "loss": 4.7873, + "step": 14167 + }, + { + "epoch": 0.08426110952516891, + "grad_norm": 1.6237304210662842, + "learning_rate": 4.912930585013095e-05, + "loss": 4.8596, + "step": 14168 + }, + { + "epoch": 0.0842670568084499, + "grad_norm": 1.585802674293518, + "learning_rate": 4.912918364580132e-05, + "loss": 4.8226, + "step": 14169 + }, + { + "epoch": 0.0842730040917309, + "grad_norm": 1.6892811059951782, + "learning_rate": 4.912906143304844e-05, + "loss": 4.8307, + "step": 14170 + }, + { + "epoch": 0.0842789513750119, + "grad_norm": 1.8254313468933105, + "learning_rate": 4.912893921187236e-05, + "loss": 4.8508, + "step": 14171 + }, + { + "epoch": 0.08428489865829289, + "grad_norm": 1.5577294826507568, + "learning_rate": 4.912881698227311e-05, + "loss": 4.7303, + "step": 14172 + }, + { + "epoch": 0.08429084594157389, + "grad_norm": 1.5635697841644287, + "learning_rate": 4.912869474425074e-05, + "loss": 4.9597, + "step": 14173 + }, + { + "epoch": 0.08429679322485488, + "grad_norm": 1.6620457172393799, + "learning_rate": 4.9128572497805294e-05, + "loss": 5.1012, + "step": 14174 + }, + { + "epoch": 0.08430274050813588, + "grad_norm": 1.4082841873168945, + "learning_rate": 4.912845024293681e-05, + "loss": 5.1785, + "step": 14175 + }, + { + "epoch": 0.08430868779141688, + "grad_norm": 1.5914233922958374, + "learning_rate": 4.9128327979645336e-05, + "loss": 5.2035, + "step": 14176 + }, + { + "epoch": 0.08431463507469787, + "grad_norm": 1.3170946836471558, + "learning_rate": 4.912820570793091e-05, + "loss": 5.35, + "step": 14177 + }, + { + "epoch": 0.08432058235797887, + "grad_norm": 1.3059190511703491, + "learning_rate": 4.912808342779357e-05, + "loss": 5.1428, + "step": 14178 + }, + { + "epoch": 0.08432652964125988, + "grad_norm": 1.438844919204712, + "learning_rate": 4.912796113923337e-05, + "loss": 5.2154, + "step": 14179 + }, + { + "epoch": 0.08433247692454086, + "grad_norm": 1.401469349861145, + "learning_rate": 4.912783884225035e-05, + "loss": 5.0941, + "step": 14180 + }, + { + "epoch": 0.08433842420782187, + "grad_norm": 1.6718204021453857, + "learning_rate": 4.912771653684456e-05, + "loss": 5.3221, + "step": 14181 + }, + { + "epoch": 0.08434437149110287, + "grad_norm": 1.51036536693573, + "learning_rate": 4.912759422301602e-05, + "loss": 5.2619, + "step": 14182 + }, + { + "epoch": 0.08435031877438386, + "grad_norm": 1.6579569578170776, + "learning_rate": 4.9127471900764795e-05, + "loss": 5.1176, + "step": 14183 + }, + { + "epoch": 0.08435626605766486, + "grad_norm": 1.5300757884979248, + "learning_rate": 4.912734957009091e-05, + "loss": 5.1625, + "step": 14184 + }, + { + "epoch": 0.08436221334094586, + "grad_norm": 1.2839969396591187, + "learning_rate": 4.912722723099442e-05, + "loss": 5.0852, + "step": 14185 + }, + { + "epoch": 0.08436816062422685, + "grad_norm": 1.7074840068817139, + "learning_rate": 4.9127104883475364e-05, + "loss": 5.1611, + "step": 14186 + }, + { + "epoch": 0.08437410790750785, + "grad_norm": 1.790992021560669, + "learning_rate": 4.9126982527533797e-05, + "loss": 5.0386, + "step": 14187 + }, + { + "epoch": 0.08438005519078885, + "grad_norm": 1.5269246101379395, + "learning_rate": 4.912686016316973e-05, + "loss": 5.0272, + "step": 14188 + }, + { + "epoch": 0.08438600247406984, + "grad_norm": 1.510847806930542, + "learning_rate": 4.9126737790383234e-05, + "loss": 5.2073, + "step": 14189 + }, + { + "epoch": 0.08439194975735084, + "grad_norm": 1.6551074981689453, + "learning_rate": 4.912661540917435e-05, + "loss": 5.0436, + "step": 14190 + }, + { + "epoch": 0.08439789704063184, + "grad_norm": 1.3152271509170532, + "learning_rate": 4.91264930195431e-05, + "loss": 5.0981, + "step": 14191 + }, + { + "epoch": 0.08440384432391283, + "grad_norm": 1.478190302848816, + "learning_rate": 4.912637062148955e-05, + "loss": 5.1172, + "step": 14192 + }, + { + "epoch": 0.08440979160719383, + "grad_norm": 1.4574978351593018, + "learning_rate": 4.912624821501373e-05, + "loss": 4.9757, + "step": 14193 + }, + { + "epoch": 0.08441573889047484, + "grad_norm": 1.600182056427002, + "learning_rate": 4.912612580011568e-05, + "loss": 5.1763, + "step": 14194 + }, + { + "epoch": 0.08442168617375582, + "grad_norm": 1.5805768966674805, + "learning_rate": 4.912600337679546e-05, + "loss": 5.1949, + "step": 14195 + }, + { + "epoch": 0.08442763345703683, + "grad_norm": 1.465785264968872, + "learning_rate": 4.9125880945053106e-05, + "loss": 5.0695, + "step": 14196 + }, + { + "epoch": 0.08443358074031783, + "grad_norm": 1.6188615560531616, + "learning_rate": 4.912575850488864e-05, + "loss": 5.1263, + "step": 14197 + }, + { + "epoch": 0.08443952802359882, + "grad_norm": 2.4953408241271973, + "learning_rate": 4.9125636056302125e-05, + "loss": 5.6462, + "step": 14198 + }, + { + "epoch": 0.08444547530687982, + "grad_norm": 1.6779934167861938, + "learning_rate": 4.91255135992936e-05, + "loss": 5.1673, + "step": 14199 + }, + { + "epoch": 0.08445142259016082, + "grad_norm": 1.648706316947937, + "learning_rate": 4.912539113386312e-05, + "loss": 5.3792, + "step": 14200 + }, + { + "epoch": 0.08445736987344181, + "grad_norm": 1.4866549968719482, + "learning_rate": 4.91252686600107e-05, + "loss": 5.2828, + "step": 14201 + }, + { + "epoch": 0.08446331715672281, + "grad_norm": 1.6002475023269653, + "learning_rate": 4.912514617773641e-05, + "loss": 5.3255, + "step": 14202 + }, + { + "epoch": 0.0844692644400038, + "grad_norm": 1.4162862300872803, + "learning_rate": 4.912502368704027e-05, + "loss": 5.3363, + "step": 14203 + }, + { + "epoch": 0.0844752117232848, + "grad_norm": 1.4465757608413696, + "learning_rate": 4.912490118792234e-05, + "loss": 5.586, + "step": 14204 + }, + { + "epoch": 0.0844811590065658, + "grad_norm": 1.8178991079330444, + "learning_rate": 4.912477868038266e-05, + "loss": 5.3029, + "step": 14205 + }, + { + "epoch": 0.08448710628984679, + "grad_norm": 1.4270378351211548, + "learning_rate": 4.912465616442126e-05, + "loss": 5.3864, + "step": 14206 + }, + { + "epoch": 0.0844930535731278, + "grad_norm": 1.5574913024902344, + "learning_rate": 4.91245336400382e-05, + "loss": 5.7667, + "step": 14207 + }, + { + "epoch": 0.0844990008564088, + "grad_norm": 1.3866809606552124, + "learning_rate": 4.91244111072335e-05, + "loss": 5.683, + "step": 14208 + }, + { + "epoch": 0.08450494813968978, + "grad_norm": 1.3390960693359375, + "learning_rate": 4.912428856600722e-05, + "loss": 5.7286, + "step": 14209 + }, + { + "epoch": 0.08451089542297079, + "grad_norm": 1.4317498207092285, + "learning_rate": 4.912416601635942e-05, + "loss": 5.6913, + "step": 14210 + }, + { + "epoch": 0.08451684270625179, + "grad_norm": 1.3110778331756592, + "learning_rate": 4.91240434582901e-05, + "loss": 5.6325, + "step": 14211 + }, + { + "epoch": 0.08452278998953278, + "grad_norm": 1.3288872241973877, + "learning_rate": 4.9123920891799344e-05, + "loss": 5.6343, + "step": 14212 + }, + { + "epoch": 0.08452873727281378, + "grad_norm": 1.2967199087142944, + "learning_rate": 4.912379831688716e-05, + "loss": 5.6514, + "step": 14213 + }, + { + "epoch": 0.08453468455609478, + "grad_norm": 1.6022506952285767, + "learning_rate": 4.912367573355362e-05, + "loss": 5.4006, + "step": 14214 + }, + { + "epoch": 0.08454063183937577, + "grad_norm": 1.6698434352874756, + "learning_rate": 4.912355314179875e-05, + "loss": 5.1543, + "step": 14215 + }, + { + "epoch": 0.08454657912265677, + "grad_norm": 1.6759408712387085, + "learning_rate": 4.9123430541622594e-05, + "loss": 4.9744, + "step": 14216 + }, + { + "epoch": 0.08455252640593777, + "grad_norm": 2.470752239227295, + "learning_rate": 4.91233079330252e-05, + "loss": 5.7614, + "step": 14217 + }, + { + "epoch": 0.08455847368921876, + "grad_norm": 2.1985907554626465, + "learning_rate": 4.91231853160066e-05, + "loss": 6.037, + "step": 14218 + }, + { + "epoch": 0.08456442097249976, + "grad_norm": 2.079569101333618, + "learning_rate": 4.912306269056686e-05, + "loss": 5.4943, + "step": 14219 + }, + { + "epoch": 0.08457036825578076, + "grad_norm": 2.2941744327545166, + "learning_rate": 4.9122940056706e-05, + "loss": 5.3733, + "step": 14220 + }, + { + "epoch": 0.08457631553906175, + "grad_norm": 1.9538209438323975, + "learning_rate": 4.912281741442407e-05, + "loss": 5.6362, + "step": 14221 + }, + { + "epoch": 0.08458226282234275, + "grad_norm": 1.7498515844345093, + "learning_rate": 4.9122694763721124e-05, + "loss": 5.7129, + "step": 14222 + }, + { + "epoch": 0.08458821010562376, + "grad_norm": 2.1728787422180176, + "learning_rate": 4.912257210459718e-05, + "loss": 5.4633, + "step": 14223 + }, + { + "epoch": 0.08459415738890474, + "grad_norm": 2.2436587810516357, + "learning_rate": 4.91224494370523e-05, + "loss": 5.3996, + "step": 14224 + }, + { + "epoch": 0.08460010467218575, + "grad_norm": 2.400299549102783, + "learning_rate": 4.912232676108653e-05, + "loss": 5.3994, + "step": 14225 + }, + { + "epoch": 0.08460605195546675, + "grad_norm": 1.9408513307571411, + "learning_rate": 4.91222040766999e-05, + "loss": 5.4537, + "step": 14226 + }, + { + "epoch": 0.08461199923874774, + "grad_norm": 2.4801602363586426, + "learning_rate": 4.912208138389245e-05, + "loss": 4.6625, + "step": 14227 + }, + { + "epoch": 0.08461794652202874, + "grad_norm": 2.021916627883911, + "learning_rate": 4.912195868266424e-05, + "loss": 4.5642, + "step": 14228 + }, + { + "epoch": 0.08462389380530974, + "grad_norm": 1.9586929082870483, + "learning_rate": 4.91218359730153e-05, + "loss": 4.6361, + "step": 14229 + }, + { + "epoch": 0.08462984108859073, + "grad_norm": 1.8478419780731201, + "learning_rate": 4.912171325494568e-05, + "loss": 4.5632, + "step": 14230 + }, + { + "epoch": 0.08463578837187173, + "grad_norm": 1.7078584432601929, + "learning_rate": 4.9121590528455406e-05, + "loss": 4.7259, + "step": 14231 + }, + { + "epoch": 0.08464173565515272, + "grad_norm": 1.7676106691360474, + "learning_rate": 4.912146779354455e-05, + "loss": 5.2565, + "step": 14232 + }, + { + "epoch": 0.08464768293843372, + "grad_norm": 1.8230634927749634, + "learning_rate": 4.912134505021313e-05, + "loss": 5.7668, + "step": 14233 + }, + { + "epoch": 0.08465363022171472, + "grad_norm": 1.8570215702056885, + "learning_rate": 4.91212222984612e-05, + "loss": 6.1849, + "step": 14234 + }, + { + "epoch": 0.08465957750499571, + "grad_norm": 1.7698529958724976, + "learning_rate": 4.9121099538288805e-05, + "loss": 6.0298, + "step": 14235 + }, + { + "epoch": 0.08466552478827671, + "grad_norm": 1.9919711351394653, + "learning_rate": 4.912097676969597e-05, + "loss": 5.7423, + "step": 14236 + }, + { + "epoch": 0.08467147207155772, + "grad_norm": 1.9937268495559692, + "learning_rate": 4.912085399268277e-05, + "loss": 5.8415, + "step": 14237 + }, + { + "epoch": 0.0846774193548387, + "grad_norm": 1.9489192962646484, + "learning_rate": 4.912073120724921e-05, + "loss": 5.812, + "step": 14238 + }, + { + "epoch": 0.0846833666381197, + "grad_norm": 1.6114327907562256, + "learning_rate": 4.9120608413395366e-05, + "loss": 5.9458, + "step": 14239 + }, + { + "epoch": 0.08468931392140071, + "grad_norm": 1.5803523063659668, + "learning_rate": 4.9120485611121265e-05, + "loss": 5.8837, + "step": 14240 + }, + { + "epoch": 0.0846952612046817, + "grad_norm": 1.8166266679763794, + "learning_rate": 4.9120362800426946e-05, + "loss": 5.5997, + "step": 14241 + }, + { + "epoch": 0.0847012084879627, + "grad_norm": 2.2683627605438232, + "learning_rate": 4.912023998131246e-05, + "loss": 5.4089, + "step": 14242 + }, + { + "epoch": 0.0847071557712437, + "grad_norm": 1.959498405456543, + "learning_rate": 4.9120117153777846e-05, + "loss": 5.5651, + "step": 14243 + }, + { + "epoch": 0.08471310305452469, + "grad_norm": 2.2388527393341064, + "learning_rate": 4.9119994317823155e-05, + "loss": 6.1511, + "step": 14244 + }, + { + "epoch": 0.08471905033780569, + "grad_norm": 1.9563941955566406, + "learning_rate": 4.911987147344842e-05, + "loss": 6.0499, + "step": 14245 + }, + { + "epoch": 0.08472499762108669, + "grad_norm": 1.7460871934890747, + "learning_rate": 4.911974862065368e-05, + "loss": 5.8368, + "step": 14246 + }, + { + "epoch": 0.08473094490436768, + "grad_norm": 1.820356845855713, + "learning_rate": 4.911962575943899e-05, + "loss": 5.3679, + "step": 14247 + }, + { + "epoch": 0.08473689218764868, + "grad_norm": 2.2215917110443115, + "learning_rate": 4.911950288980439e-05, + "loss": 5.0686, + "step": 14248 + }, + { + "epoch": 0.08474283947092968, + "grad_norm": 1.7801320552825928, + "learning_rate": 4.9119380011749914e-05, + "loss": 5.7665, + "step": 14249 + }, + { + "epoch": 0.08474878675421067, + "grad_norm": 1.8713878393173218, + "learning_rate": 4.911925712527562e-05, + "loss": 5.7, + "step": 14250 + }, + { + "epoch": 0.08475473403749167, + "grad_norm": 1.9371087551116943, + "learning_rate": 4.911913423038154e-05, + "loss": 5.6707, + "step": 14251 + }, + { + "epoch": 0.08476068132077268, + "grad_norm": 2.2298929691314697, + "learning_rate": 4.9119011327067724e-05, + "loss": 5.7042, + "step": 14252 + }, + { + "epoch": 0.08476662860405366, + "grad_norm": 1.7787251472473145, + "learning_rate": 4.91188884153342e-05, + "loss": 5.9205, + "step": 14253 + }, + { + "epoch": 0.08477257588733467, + "grad_norm": 2.0264973640441895, + "learning_rate": 4.911876549518102e-05, + "loss": 5.2057, + "step": 14254 + }, + { + "epoch": 0.08477852317061567, + "grad_norm": 2.7479963302612305, + "learning_rate": 4.911864256660824e-05, + "loss": 4.3828, + "step": 14255 + }, + { + "epoch": 0.08478447045389666, + "grad_norm": 2.3911163806915283, + "learning_rate": 4.9118519629615886e-05, + "loss": 4.1959, + "step": 14256 + }, + { + "epoch": 0.08479041773717766, + "grad_norm": 2.5100319385528564, + "learning_rate": 4.9118396684204005e-05, + "loss": 4.3845, + "step": 14257 + }, + { + "epoch": 0.08479636502045866, + "grad_norm": 2.575680732727051, + "learning_rate": 4.911827373037264e-05, + "loss": 4.1927, + "step": 14258 + }, + { + "epoch": 0.08480231230373965, + "grad_norm": 2.64941143989563, + "learning_rate": 4.9118150768121837e-05, + "loss": 4.2398, + "step": 14259 + }, + { + "epoch": 0.08480825958702065, + "grad_norm": 3.4619154930114746, + "learning_rate": 4.911802779745163e-05, + "loss": 5.9141, + "step": 14260 + }, + { + "epoch": 0.08481420687030164, + "grad_norm": 2.5471723079681396, + "learning_rate": 4.911790481836208e-05, + "loss": 4.1887, + "step": 14261 + }, + { + "epoch": 0.08482015415358264, + "grad_norm": 2.9113502502441406, + "learning_rate": 4.911778183085321e-05, + "loss": 4.3556, + "step": 14262 + }, + { + "epoch": 0.08482610143686364, + "grad_norm": 2.5952084064483643, + "learning_rate": 4.9117658834925076e-05, + "loss": 5.0408, + "step": 14263 + }, + { + "epoch": 0.08483204872014463, + "grad_norm": 2.60726261138916, + "learning_rate": 4.911753583057771e-05, + "loss": 5.5094, + "step": 14264 + }, + { + "epoch": 0.08483799600342563, + "grad_norm": 1.9005889892578125, + "learning_rate": 4.911741281781117e-05, + "loss": 5.2637, + "step": 14265 + }, + { + "epoch": 0.08484394328670664, + "grad_norm": 1.6408629417419434, + "learning_rate": 4.911728979662549e-05, + "loss": 5.4722, + "step": 14266 + }, + { + "epoch": 0.08484989056998762, + "grad_norm": 1.840955376625061, + "learning_rate": 4.911716676702071e-05, + "loss": 5.5073, + "step": 14267 + }, + { + "epoch": 0.08485583785326863, + "grad_norm": 1.8430123329162598, + "learning_rate": 4.911704372899687e-05, + "loss": 6.0372, + "step": 14268 + }, + { + "epoch": 0.08486178513654963, + "grad_norm": 3.2100231647491455, + "learning_rate": 4.911692068255402e-05, + "loss": 5.0497, + "step": 14269 + }, + { + "epoch": 0.08486773241983062, + "grad_norm": 3.191558837890625, + "learning_rate": 4.911679762769221e-05, + "loss": 5.0467, + "step": 14270 + }, + { + "epoch": 0.08487367970311162, + "grad_norm": 3.04190731048584, + "learning_rate": 4.911667456441148e-05, + "loss": 4.8008, + "step": 14271 + }, + { + "epoch": 0.08487962698639262, + "grad_norm": 2.6688694953918457, + "learning_rate": 4.911655149271186e-05, + "loss": 4.722, + "step": 14272 + }, + { + "epoch": 0.08488557426967361, + "grad_norm": 2.1458704471588135, + "learning_rate": 4.9116428412593394e-05, + "loss": 4.788, + "step": 14273 + }, + { + "epoch": 0.08489152155295461, + "grad_norm": 2.345972776412964, + "learning_rate": 4.911630532405615e-05, + "loss": 4.7955, + "step": 14274 + }, + { + "epoch": 0.08489746883623561, + "grad_norm": 2.2022581100463867, + "learning_rate": 4.911618222710014e-05, + "loss": 4.815, + "step": 14275 + }, + { + "epoch": 0.0849034161195166, + "grad_norm": 2.311004877090454, + "learning_rate": 4.911605912172542e-05, + "loss": 4.8632, + "step": 14276 + }, + { + "epoch": 0.0849093634027976, + "grad_norm": 2.5007429122924805, + "learning_rate": 4.911593600793204e-05, + "loss": 4.7273, + "step": 14277 + }, + { + "epoch": 0.0849153106860786, + "grad_norm": 2.257115364074707, + "learning_rate": 4.9115812885720026e-05, + "loss": 4.9697, + "step": 14278 + }, + { + "epoch": 0.08492125796935959, + "grad_norm": 2.7667057514190674, + "learning_rate": 4.9115689755089436e-05, + "loss": 5.1607, + "step": 14279 + }, + { + "epoch": 0.0849272052526406, + "grad_norm": 2.4240612983703613, + "learning_rate": 4.911556661604031e-05, + "loss": 4.9873, + "step": 14280 + }, + { + "epoch": 0.0849331525359216, + "grad_norm": 1.9951629638671875, + "learning_rate": 4.911544346857269e-05, + "loss": 4.9961, + "step": 14281 + }, + { + "epoch": 0.08493909981920258, + "grad_norm": 1.8532124757766724, + "learning_rate": 4.9115320312686605e-05, + "loss": 4.9467, + "step": 14282 + }, + { + "epoch": 0.08494504710248359, + "grad_norm": 2.41200590133667, + "learning_rate": 4.9115197148382126e-05, + "loss": 4.9865, + "step": 14283 + }, + { + "epoch": 0.08495099438576459, + "grad_norm": 2.2735655307769775, + "learning_rate": 4.911507397565928e-05, + "loss": 4.9223, + "step": 14284 + }, + { + "epoch": 0.08495694166904558, + "grad_norm": 2.29052734375, + "learning_rate": 4.91149507945181e-05, + "loss": 4.9479, + "step": 14285 + }, + { + "epoch": 0.08496288895232658, + "grad_norm": 2.71832275390625, + "learning_rate": 4.911482760495865e-05, + "loss": 4.9537, + "step": 14286 + }, + { + "epoch": 0.08496883623560758, + "grad_norm": 2.1351630687713623, + "learning_rate": 4.911470440698096e-05, + "loss": 5.3776, + "step": 14287 + }, + { + "epoch": 0.08497478351888857, + "grad_norm": 2.514810085296631, + "learning_rate": 4.9114581200585066e-05, + "loss": 5.6067, + "step": 14288 + }, + { + "epoch": 0.08498073080216957, + "grad_norm": 1.787312626838684, + "learning_rate": 4.9114457985771036e-05, + "loss": 5.4929, + "step": 14289 + }, + { + "epoch": 0.08498667808545056, + "grad_norm": 1.7784658670425415, + "learning_rate": 4.911433476253889e-05, + "loss": 5.5471, + "step": 14290 + }, + { + "epoch": 0.08499262536873156, + "grad_norm": 1.6120775938034058, + "learning_rate": 4.9114211530888676e-05, + "loss": 5.5455, + "step": 14291 + }, + { + "epoch": 0.08499857265201256, + "grad_norm": 1.6809823513031006, + "learning_rate": 4.9114088290820446e-05, + "loss": 5.7674, + "step": 14292 + }, + { + "epoch": 0.08500451993529355, + "grad_norm": 1.784569501876831, + "learning_rate": 4.9113965042334234e-05, + "loss": 5.554, + "step": 14293 + }, + { + "epoch": 0.08501046721857455, + "grad_norm": 1.8622018098831177, + "learning_rate": 4.9113841785430094e-05, + "loss": 5.5718, + "step": 14294 + }, + { + "epoch": 0.08501641450185556, + "grad_norm": 1.8970091342926025, + "learning_rate": 4.911371852010805e-05, + "loss": 5.6398, + "step": 14295 + }, + { + "epoch": 0.08502236178513654, + "grad_norm": 1.9560039043426514, + "learning_rate": 4.911359524636816e-05, + "loss": 5.3627, + "step": 14296 + }, + { + "epoch": 0.08502830906841755, + "grad_norm": 1.7574408054351807, + "learning_rate": 4.911347196421046e-05, + "loss": 5.6245, + "step": 14297 + }, + { + "epoch": 0.08503425635169855, + "grad_norm": 2.0868546962738037, + "learning_rate": 4.9113348673635004e-05, + "loss": 5.6092, + "step": 14298 + }, + { + "epoch": 0.08504020363497954, + "grad_norm": 2.1157326698303223, + "learning_rate": 4.9113225374641816e-05, + "loss": 5.0796, + "step": 14299 + }, + { + "epoch": 0.08504615091826054, + "grad_norm": 1.7721058130264282, + "learning_rate": 4.911310206723096e-05, + "loss": 5.148, + "step": 14300 + }, + { + "epoch": 0.08505209820154154, + "grad_norm": 1.586799144744873, + "learning_rate": 4.911297875140246e-05, + "loss": 5.5425, + "step": 14301 + }, + { + "epoch": 0.08505804548482253, + "grad_norm": 1.9669803380966187, + "learning_rate": 4.9112855427156376e-05, + "loss": 5.1675, + "step": 14302 + }, + { + "epoch": 0.08506399276810353, + "grad_norm": 2.279446601867676, + "learning_rate": 4.911273209449274e-05, + "loss": 5.8068, + "step": 14303 + }, + { + "epoch": 0.08506994005138453, + "grad_norm": 2.036482572555542, + "learning_rate": 4.9112608753411605e-05, + "loss": 5.3995, + "step": 14304 + }, + { + "epoch": 0.08507588733466552, + "grad_norm": 1.833946704864502, + "learning_rate": 4.9112485403913e-05, + "loss": 6.069, + "step": 14305 + }, + { + "epoch": 0.08508183461794652, + "grad_norm": 1.6984084844589233, + "learning_rate": 4.9112362045996976e-05, + "loss": 5.7842, + "step": 14306 + }, + { + "epoch": 0.08508778190122752, + "grad_norm": 1.6729326248168945, + "learning_rate": 4.911223867966358e-05, + "loss": 5.5225, + "step": 14307 + }, + { + "epoch": 0.08509372918450851, + "grad_norm": 2.046747922897339, + "learning_rate": 4.911211530491284e-05, + "loss": 4.967, + "step": 14308 + }, + { + "epoch": 0.08509967646778951, + "grad_norm": 1.967058539390564, + "learning_rate": 4.911199192174482e-05, + "loss": 5.8046, + "step": 14309 + }, + { + "epoch": 0.08510562375107052, + "grad_norm": 1.8341583013534546, + "learning_rate": 4.911186853015955e-05, + "loss": 4.8317, + "step": 14310 + }, + { + "epoch": 0.0851115710343515, + "grad_norm": 1.9655890464782715, + "learning_rate": 4.911174513015707e-05, + "loss": 4.6122, + "step": 14311 + }, + { + "epoch": 0.0851175183176325, + "grad_norm": 1.7953969240188599, + "learning_rate": 4.9111621721737445e-05, + "loss": 5.3151, + "step": 14312 + }, + { + "epoch": 0.08512346560091351, + "grad_norm": 1.7074720859527588, + "learning_rate": 4.9111498304900684e-05, + "loss": 5.337, + "step": 14313 + }, + { + "epoch": 0.0851294128841945, + "grad_norm": 1.8258756399154663, + "learning_rate": 4.9111374879646854e-05, + "loss": 5.3245, + "step": 14314 + }, + { + "epoch": 0.0851353601674755, + "grad_norm": 1.731689691543579, + "learning_rate": 4.9111251445976e-05, + "loss": 5.149, + "step": 14315 + }, + { + "epoch": 0.0851413074507565, + "grad_norm": 1.9083631038665771, + "learning_rate": 4.9111128003888154e-05, + "loss": 5.2409, + "step": 14316 + }, + { + "epoch": 0.08514725473403749, + "grad_norm": 1.739311933517456, + "learning_rate": 4.911100455338336e-05, + "loss": 5.0946, + "step": 14317 + }, + { + "epoch": 0.08515320201731849, + "grad_norm": 1.6812219619750977, + "learning_rate": 4.9110881094461655e-05, + "loss": 5.3062, + "step": 14318 + }, + { + "epoch": 0.08515914930059948, + "grad_norm": 1.8215876817703247, + "learning_rate": 4.9110757627123096e-05, + "loss": 5.5774, + "step": 14319 + }, + { + "epoch": 0.08516509658388048, + "grad_norm": 1.9548031091690063, + "learning_rate": 4.9110634151367725e-05, + "loss": 5.7895, + "step": 14320 + }, + { + "epoch": 0.08517104386716148, + "grad_norm": 2.266925096511841, + "learning_rate": 4.911051066719558e-05, + "loss": 4.6526, + "step": 14321 + }, + { + "epoch": 0.08517699115044247, + "grad_norm": 2.304807424545288, + "learning_rate": 4.9110387174606695e-05, + "loss": 5.2573, + "step": 14322 + }, + { + "epoch": 0.08518293843372347, + "grad_norm": 2.019482135772705, + "learning_rate": 4.911026367360114e-05, + "loss": 5.2739, + "step": 14323 + }, + { + "epoch": 0.08518888571700448, + "grad_norm": 2.0559775829315186, + "learning_rate": 4.911014016417893e-05, + "loss": 5.7166, + "step": 14324 + }, + { + "epoch": 0.08519483300028546, + "grad_norm": 2.0565741062164307, + "learning_rate": 4.911001664634012e-05, + "loss": 5.6359, + "step": 14325 + }, + { + "epoch": 0.08520078028356647, + "grad_norm": 1.8766587972640991, + "learning_rate": 4.910989312008475e-05, + "loss": 5.2667, + "step": 14326 + }, + { + "epoch": 0.08520672756684747, + "grad_norm": 1.669317364692688, + "learning_rate": 4.910976958541287e-05, + "loss": 5.7565, + "step": 14327 + }, + { + "epoch": 0.08521267485012846, + "grad_norm": 1.9138641357421875, + "learning_rate": 4.910964604232452e-05, + "loss": 5.9362, + "step": 14328 + }, + { + "epoch": 0.08521862213340946, + "grad_norm": 1.740892767906189, + "learning_rate": 4.9109522490819734e-05, + "loss": 5.6964, + "step": 14329 + }, + { + "epoch": 0.08522456941669046, + "grad_norm": 1.788825511932373, + "learning_rate": 4.9109398930898576e-05, + "loss": 5.4266, + "step": 14330 + }, + { + "epoch": 0.08523051669997145, + "grad_norm": 2.035877227783203, + "learning_rate": 4.910927536256106e-05, + "loss": 5.5609, + "step": 14331 + }, + { + "epoch": 0.08523646398325245, + "grad_norm": 2.078150987625122, + "learning_rate": 4.9109151785807265e-05, + "loss": 5.0074, + "step": 14332 + }, + { + "epoch": 0.08524241126653345, + "grad_norm": 2.601290225982666, + "learning_rate": 4.91090282006372e-05, + "loss": 5.2021, + "step": 14333 + }, + { + "epoch": 0.08524835854981444, + "grad_norm": 1.7069159746170044, + "learning_rate": 4.910890460705092e-05, + "loss": 5.0313, + "step": 14334 + }, + { + "epoch": 0.08525430583309544, + "grad_norm": 1.8937885761260986, + "learning_rate": 4.9108781005048473e-05, + "loss": 4.6001, + "step": 14335 + }, + { + "epoch": 0.08526025311637644, + "grad_norm": 2.3120486736297607, + "learning_rate": 4.91086573946299e-05, + "loss": 4.4027, + "step": 14336 + }, + { + "epoch": 0.08526620039965743, + "grad_norm": 2.064420223236084, + "learning_rate": 4.910853377579524e-05, + "loss": 4.8853, + "step": 14337 + }, + { + "epoch": 0.08527214768293843, + "grad_norm": 1.80779230594635, + "learning_rate": 4.910841014854455e-05, + "loss": 5.5493, + "step": 14338 + }, + { + "epoch": 0.08527809496621944, + "grad_norm": 1.6364500522613525, + "learning_rate": 4.910828651287786e-05, + "loss": 5.6569, + "step": 14339 + }, + { + "epoch": 0.08528404224950042, + "grad_norm": 1.7472214698791504, + "learning_rate": 4.910816286879522e-05, + "loss": 5.4057, + "step": 14340 + }, + { + "epoch": 0.08528998953278143, + "grad_norm": 1.6311333179473877, + "learning_rate": 4.910803921629666e-05, + "loss": 5.8406, + "step": 14341 + }, + { + "epoch": 0.08529593681606243, + "grad_norm": 2.2367610931396484, + "learning_rate": 4.9107915555382236e-05, + "loss": 4.9339, + "step": 14342 + }, + { + "epoch": 0.08530188409934342, + "grad_norm": 2.033160924911499, + "learning_rate": 4.910779188605199e-05, + "loss": 4.8923, + "step": 14343 + }, + { + "epoch": 0.08530783138262442, + "grad_norm": 1.852645993232727, + "learning_rate": 4.910766820830596e-05, + "loss": 5.2208, + "step": 14344 + }, + { + "epoch": 0.08531377866590542, + "grad_norm": 1.9810596704483032, + "learning_rate": 4.910754452214419e-05, + "loss": 5.0119, + "step": 14345 + }, + { + "epoch": 0.08531972594918641, + "grad_norm": 1.92807137966156, + "learning_rate": 4.910742082756673e-05, + "loss": 5.6388, + "step": 14346 + }, + { + "epoch": 0.08532567323246741, + "grad_norm": 1.783923864364624, + "learning_rate": 4.910729712457361e-05, + "loss": 5.2831, + "step": 14347 + }, + { + "epoch": 0.0853316205157484, + "grad_norm": 2.008113145828247, + "learning_rate": 4.91071734131649e-05, + "loss": 5.085, + "step": 14348 + }, + { + "epoch": 0.0853375677990294, + "grad_norm": 2.2313408851623535, + "learning_rate": 4.910704969334061e-05, + "loss": 5.243, + "step": 14349 + }, + { + "epoch": 0.0853435150823104, + "grad_norm": 2.155491590499878, + "learning_rate": 4.9106925965100806e-05, + "loss": 6.0776, + "step": 14350 + }, + { + "epoch": 0.08534946236559139, + "grad_norm": 1.995848536491394, + "learning_rate": 4.910680222844551e-05, + "loss": 5.6763, + "step": 14351 + }, + { + "epoch": 0.0853554096488724, + "grad_norm": 2.033620595932007, + "learning_rate": 4.910667848337479e-05, + "loss": 4.4634, + "step": 14352 + }, + { + "epoch": 0.0853613569321534, + "grad_norm": 2.036668062210083, + "learning_rate": 4.910655472988868e-05, + "loss": 4.6367, + "step": 14353 + }, + { + "epoch": 0.08536730421543438, + "grad_norm": 1.9862895011901855, + "learning_rate": 4.910643096798721e-05, + "loss": 4.4623, + "step": 14354 + }, + { + "epoch": 0.08537325149871539, + "grad_norm": 1.9778163433074951, + "learning_rate": 4.910630719767044e-05, + "loss": 4.3706, + "step": 14355 + }, + { + "epoch": 0.08537919878199639, + "grad_norm": 1.984913945198059, + "learning_rate": 4.9106183418938404e-05, + "loss": 4.4573, + "step": 14356 + }, + { + "epoch": 0.08538514606527738, + "grad_norm": 2.0571017265319824, + "learning_rate": 4.910605963179116e-05, + "loss": 4.2782, + "step": 14357 + }, + { + "epoch": 0.08539109334855838, + "grad_norm": 2.028339147567749, + "learning_rate": 4.910593583622872e-05, + "loss": 4.3874, + "step": 14358 + }, + { + "epoch": 0.08539704063183938, + "grad_norm": 2.03485369682312, + "learning_rate": 4.9105812032251165e-05, + "loss": 4.5877, + "step": 14359 + }, + { + "epoch": 0.08540298791512037, + "grad_norm": 1.950490951538086, + "learning_rate": 4.910568821985851e-05, + "loss": 4.6547, + "step": 14360 + }, + { + "epoch": 0.08540893519840137, + "grad_norm": 2.1270785331726074, + "learning_rate": 4.910556439905081e-05, + "loss": 5.3685, + "step": 14361 + }, + { + "epoch": 0.08541488248168237, + "grad_norm": 2.094545364379883, + "learning_rate": 4.910544056982811e-05, + "loss": 6.1109, + "step": 14362 + }, + { + "epoch": 0.08542082976496336, + "grad_norm": 2.2988197803497314, + "learning_rate": 4.910531673219044e-05, + "loss": 5.4789, + "step": 14363 + }, + { + "epoch": 0.08542677704824436, + "grad_norm": 2.2927358150482178, + "learning_rate": 4.910519288613786e-05, + "loss": 5.3853, + "step": 14364 + }, + { + "epoch": 0.08543272433152536, + "grad_norm": 2.223668098449707, + "learning_rate": 4.910506903167041e-05, + "loss": 5.3572, + "step": 14365 + }, + { + "epoch": 0.08543867161480635, + "grad_norm": 2.0522570610046387, + "learning_rate": 4.910494516878813e-05, + "loss": 5.3581, + "step": 14366 + }, + { + "epoch": 0.08544461889808735, + "grad_norm": 2.4349021911621094, + "learning_rate": 4.910482129749106e-05, + "loss": 5.4082, + "step": 14367 + }, + { + "epoch": 0.08545056618136836, + "grad_norm": 1.976344347000122, + "learning_rate": 4.910469741777924e-05, + "loss": 5.6107, + "step": 14368 + }, + { + "epoch": 0.08545651346464934, + "grad_norm": 1.8476877212524414, + "learning_rate": 4.910457352965272e-05, + "loss": 5.5059, + "step": 14369 + }, + { + "epoch": 0.08546246074793035, + "grad_norm": 1.6204098463058472, + "learning_rate": 4.910444963311155e-05, + "loss": 5.6578, + "step": 14370 + }, + { + "epoch": 0.08546840803121135, + "grad_norm": 1.808021903038025, + "learning_rate": 4.910432572815576e-05, + "loss": 5.8263, + "step": 14371 + }, + { + "epoch": 0.08547435531449234, + "grad_norm": 1.4975682497024536, + "learning_rate": 4.91042018147854e-05, + "loss": 5.582, + "step": 14372 + }, + { + "epoch": 0.08548030259777334, + "grad_norm": 1.644845724105835, + "learning_rate": 4.910407789300051e-05, + "loss": 5.7127, + "step": 14373 + }, + { + "epoch": 0.08548624988105434, + "grad_norm": 1.5433874130249023, + "learning_rate": 4.910395396280114e-05, + "loss": 5.6941, + "step": 14374 + }, + { + "epoch": 0.08549219716433533, + "grad_norm": 1.7267838716506958, + "learning_rate": 4.910383002418732e-05, + "loss": 5.632, + "step": 14375 + }, + { + "epoch": 0.08549814444761633, + "grad_norm": 1.4142215251922607, + "learning_rate": 4.9103706077159116e-05, + "loss": 5.6108, + "step": 14376 + }, + { + "epoch": 0.08550409173089732, + "grad_norm": 1.8514180183410645, + "learning_rate": 4.9103582121716554e-05, + "loss": 5.828, + "step": 14377 + }, + { + "epoch": 0.08551003901417832, + "grad_norm": 1.633837103843689, + "learning_rate": 4.9103458157859674e-05, + "loss": 5.8585, + "step": 14378 + }, + { + "epoch": 0.08551598629745932, + "grad_norm": 1.9934178590774536, + "learning_rate": 4.910333418558853e-05, + "loss": 5.5907, + "step": 14379 + }, + { + "epoch": 0.08552193358074031, + "grad_norm": 1.8934741020202637, + "learning_rate": 4.910321020490316e-05, + "loss": 5.579, + "step": 14380 + }, + { + "epoch": 0.08552788086402131, + "grad_norm": 1.9341318607330322, + "learning_rate": 4.910308621580361e-05, + "loss": 5.8737, + "step": 14381 + }, + { + "epoch": 0.08553382814730232, + "grad_norm": 2.1566226482391357, + "learning_rate": 4.9102962218289915e-05, + "loss": 5.6105, + "step": 14382 + }, + { + "epoch": 0.0855397754305833, + "grad_norm": 1.707112431526184, + "learning_rate": 4.910283821236213e-05, + "loss": 5.6875, + "step": 14383 + }, + { + "epoch": 0.0855457227138643, + "grad_norm": 2.8415439128875732, + "learning_rate": 4.9102714198020296e-05, + "loss": 4.9292, + "step": 14384 + }, + { + "epoch": 0.08555166999714531, + "grad_norm": 2.2043650150299072, + "learning_rate": 4.9102590175264445e-05, + "loss": 5.7264, + "step": 14385 + }, + { + "epoch": 0.0855576172804263, + "grad_norm": 2.2063820362091064, + "learning_rate": 4.9102466144094636e-05, + "loss": 5.1616, + "step": 14386 + }, + { + "epoch": 0.0855635645637073, + "grad_norm": 1.9087328910827637, + "learning_rate": 4.9102342104510903e-05, + "loss": 5.1897, + "step": 14387 + }, + { + "epoch": 0.0855695118469883, + "grad_norm": 1.6418956518173218, + "learning_rate": 4.910221805651329e-05, + "loss": 5.0923, + "step": 14388 + }, + { + "epoch": 0.08557545913026929, + "grad_norm": 1.5215847492218018, + "learning_rate": 4.9102094000101836e-05, + "loss": 4.9602, + "step": 14389 + }, + { + "epoch": 0.08558140641355029, + "grad_norm": 2.249983072280884, + "learning_rate": 4.91019699352766e-05, + "loss": 5.1167, + "step": 14390 + }, + { + "epoch": 0.08558735369683129, + "grad_norm": 1.89960777759552, + "learning_rate": 4.9101845862037615e-05, + "loss": 6.1589, + "step": 14391 + }, + { + "epoch": 0.08559330098011228, + "grad_norm": 1.8243924379348755, + "learning_rate": 4.910172178038492e-05, + "loss": 5.8661, + "step": 14392 + }, + { + "epoch": 0.08559924826339328, + "grad_norm": 1.8313872814178467, + "learning_rate": 4.9101597690318567e-05, + "loss": 5.6129, + "step": 14393 + }, + { + "epoch": 0.08560519554667428, + "grad_norm": 1.8717663288116455, + "learning_rate": 4.9101473591838593e-05, + "loss": 5.6346, + "step": 14394 + }, + { + "epoch": 0.08561114282995527, + "grad_norm": 1.6444953680038452, + "learning_rate": 4.910134948494504e-05, + "loss": 5.7237, + "step": 14395 + }, + { + "epoch": 0.08561709011323627, + "grad_norm": 1.8138811588287354, + "learning_rate": 4.910122536963796e-05, + "loss": 5.7682, + "step": 14396 + }, + { + "epoch": 0.08562303739651728, + "grad_norm": 2.629892110824585, + "learning_rate": 4.9101101245917394e-05, + "loss": 5.89, + "step": 14397 + }, + { + "epoch": 0.08562898467979826, + "grad_norm": 1.8197498321533203, + "learning_rate": 4.910097711378337e-05, + "loss": 5.6768, + "step": 14398 + }, + { + "epoch": 0.08563493196307927, + "grad_norm": 2.1121623516082764, + "learning_rate": 4.9100852973235955e-05, + "loss": 5.672, + "step": 14399 + }, + { + "epoch": 0.08564087924636027, + "grad_norm": 1.8823927640914917, + "learning_rate": 4.910072882427518e-05, + "loss": 5.6717, + "step": 14400 + }, + { + "epoch": 0.08564682652964126, + "grad_norm": 2.602023124694824, + "learning_rate": 4.9100604666901084e-05, + "loss": 5.4193, + "step": 14401 + }, + { + "epoch": 0.08565277381292226, + "grad_norm": 2.420342445373535, + "learning_rate": 4.910048050111372e-05, + "loss": 5.2811, + "step": 14402 + }, + { + "epoch": 0.08565872109620326, + "grad_norm": 2.593797206878662, + "learning_rate": 4.910035632691313e-05, + "loss": 5.2942, + "step": 14403 + }, + { + "epoch": 0.08566466837948425, + "grad_norm": 1.9292038679122925, + "learning_rate": 4.910023214429935e-05, + "loss": 5.0231, + "step": 14404 + }, + { + "epoch": 0.08567061566276525, + "grad_norm": 2.159935712814331, + "learning_rate": 4.9100107953272434e-05, + "loss": 4.8778, + "step": 14405 + }, + { + "epoch": 0.08567656294604625, + "grad_norm": 2.2363314628601074, + "learning_rate": 4.9099983753832416e-05, + "loss": 4.8828, + "step": 14406 + }, + { + "epoch": 0.08568251022932724, + "grad_norm": 2.149986505508423, + "learning_rate": 4.909985954597934e-05, + "loss": 5.4351, + "step": 14407 + }, + { + "epoch": 0.08568845751260824, + "grad_norm": 2.05991268157959, + "learning_rate": 4.909973532971325e-05, + "loss": 5.3759, + "step": 14408 + }, + { + "epoch": 0.08569440479588923, + "grad_norm": 2.0030369758605957, + "learning_rate": 4.9099611105034196e-05, + "loss": 5.5126, + "step": 14409 + }, + { + "epoch": 0.08570035207917023, + "grad_norm": 1.7764592170715332, + "learning_rate": 4.9099486871942216e-05, + "loss": 5.1808, + "step": 14410 + }, + { + "epoch": 0.08570629936245124, + "grad_norm": 1.8827999830245972, + "learning_rate": 4.909936263043735e-05, + "loss": 5.5076, + "step": 14411 + }, + { + "epoch": 0.08571224664573222, + "grad_norm": 2.0153589248657227, + "learning_rate": 4.9099238380519655e-05, + "loss": 5.2955, + "step": 14412 + }, + { + "epoch": 0.08571819392901323, + "grad_norm": 2.0739622116088867, + "learning_rate": 4.909911412218916e-05, + "loss": 5.2463, + "step": 14413 + }, + { + "epoch": 0.08572414121229423, + "grad_norm": 2.4668188095092773, + "learning_rate": 4.909898985544591e-05, + "loss": 5.1859, + "step": 14414 + }, + { + "epoch": 0.08573008849557522, + "grad_norm": 2.245546340942383, + "learning_rate": 4.9098865580289956e-05, + "loss": 5.5472, + "step": 14415 + }, + { + "epoch": 0.08573603577885622, + "grad_norm": 2.244086980819702, + "learning_rate": 4.909874129672133e-05, + "loss": 5.5531, + "step": 14416 + }, + { + "epoch": 0.08574198306213722, + "grad_norm": 2.2983627319335938, + "learning_rate": 4.909861700474009e-05, + "loss": 5.6178, + "step": 14417 + }, + { + "epoch": 0.08574793034541821, + "grad_norm": 1.9792771339416504, + "learning_rate": 4.9098492704346265e-05, + "loss": 5.364, + "step": 14418 + }, + { + "epoch": 0.08575387762869921, + "grad_norm": 1.8312867879867554, + "learning_rate": 4.9098368395539914e-05, + "loss": 5.3105, + "step": 14419 + }, + { + "epoch": 0.08575982491198021, + "grad_norm": 1.8415101766586304, + "learning_rate": 4.909824407832107e-05, + "loss": 5.3182, + "step": 14420 + }, + { + "epoch": 0.0857657721952612, + "grad_norm": 1.965531349182129, + "learning_rate": 4.909811975268977e-05, + "loss": 5.496, + "step": 14421 + }, + { + "epoch": 0.0857717194785422, + "grad_norm": 1.9116218090057373, + "learning_rate": 4.909799541864607e-05, + "loss": 5.2531, + "step": 14422 + }, + { + "epoch": 0.0857776667618232, + "grad_norm": 1.863571286201477, + "learning_rate": 4.909787107619001e-05, + "loss": 5.535, + "step": 14423 + }, + { + "epoch": 0.08578361404510419, + "grad_norm": 1.966637372970581, + "learning_rate": 4.909774672532163e-05, + "loss": 5.5072, + "step": 14424 + }, + { + "epoch": 0.0857895613283852, + "grad_norm": 1.9251974821090698, + "learning_rate": 4.9097622366040974e-05, + "loss": 5.1989, + "step": 14425 + }, + { + "epoch": 0.0857955086116662, + "grad_norm": 1.6277741193771362, + "learning_rate": 4.90974979983481e-05, + "loss": 5.357, + "step": 14426 + }, + { + "epoch": 0.08580145589494718, + "grad_norm": 1.6832202672958374, + "learning_rate": 4.909737362224302e-05, + "loss": 5.3485, + "step": 14427 + }, + { + "epoch": 0.08580740317822819, + "grad_norm": 1.7656053304672241, + "learning_rate": 4.909724923772581e-05, + "loss": 5.3965, + "step": 14428 + }, + { + "epoch": 0.08581335046150919, + "grad_norm": 1.748529076576233, + "learning_rate": 4.909712484479649e-05, + "loss": 5.3895, + "step": 14429 + }, + { + "epoch": 0.08581929774479018, + "grad_norm": 2.1317241191864014, + "learning_rate": 4.909700044345511e-05, + "loss": 5.1703, + "step": 14430 + }, + { + "epoch": 0.08582524502807118, + "grad_norm": 2.6896255016326904, + "learning_rate": 4.909687603370172e-05, + "loss": 5.3942, + "step": 14431 + }, + { + "epoch": 0.08583119231135218, + "grad_norm": 2.1061718463897705, + "learning_rate": 4.909675161553637e-05, + "loss": 5.3545, + "step": 14432 + }, + { + "epoch": 0.08583713959463317, + "grad_norm": 2.7201108932495117, + "learning_rate": 4.9096627188959085e-05, + "loss": 4.9659, + "step": 14433 + }, + { + "epoch": 0.08584308687791417, + "grad_norm": 2.0352578163146973, + "learning_rate": 4.909650275396991e-05, + "loss": 5.2667, + "step": 14434 + }, + { + "epoch": 0.08584903416119517, + "grad_norm": 1.6980863809585571, + "learning_rate": 4.9096378310568905e-05, + "loss": 5.4036, + "step": 14435 + }, + { + "epoch": 0.08585498144447616, + "grad_norm": 1.677700161933899, + "learning_rate": 4.90962538587561e-05, + "loss": 5.3104, + "step": 14436 + }, + { + "epoch": 0.08586092872775716, + "grad_norm": 1.995198369026184, + "learning_rate": 4.9096129398531534e-05, + "loss": 5.4235, + "step": 14437 + }, + { + "epoch": 0.08586687601103815, + "grad_norm": 2.136059284210205, + "learning_rate": 4.909600492989527e-05, + "loss": 5.1867, + "step": 14438 + }, + { + "epoch": 0.08587282329431915, + "grad_norm": 1.9917269945144653, + "learning_rate": 4.909588045284733e-05, + "loss": 5.5507, + "step": 14439 + }, + { + "epoch": 0.08587877057760016, + "grad_norm": 1.7341989278793335, + "learning_rate": 4.909575596738777e-05, + "loss": 5.4782, + "step": 14440 + }, + { + "epoch": 0.08588471786088114, + "grad_norm": 2.058920383453369, + "learning_rate": 4.9095631473516635e-05, + "loss": 5.51, + "step": 14441 + }, + { + "epoch": 0.08589066514416215, + "grad_norm": 1.7856314182281494, + "learning_rate": 4.9095506971233965e-05, + "loss": 5.4189, + "step": 14442 + }, + { + "epoch": 0.08589661242744315, + "grad_norm": 1.5290231704711914, + "learning_rate": 4.90953824605398e-05, + "loss": 5.4398, + "step": 14443 + }, + { + "epoch": 0.08590255971072414, + "grad_norm": 1.6302571296691895, + "learning_rate": 4.909525794143418e-05, + "loss": 5.4468, + "step": 14444 + }, + { + "epoch": 0.08590850699400514, + "grad_norm": 1.9898178577423096, + "learning_rate": 4.909513341391716e-05, + "loss": 5.5514, + "step": 14445 + }, + { + "epoch": 0.08591445427728614, + "grad_norm": 2.539473533630371, + "learning_rate": 4.909500887798878e-05, + "loss": 5.0985, + "step": 14446 + }, + { + "epoch": 0.08592040156056713, + "grad_norm": 2.109477996826172, + "learning_rate": 4.909488433364907e-05, + "loss": 5.1304, + "step": 14447 + }, + { + "epoch": 0.08592634884384813, + "grad_norm": 1.627647042274475, + "learning_rate": 4.9094759780898096e-05, + "loss": 5.7772, + "step": 14448 + }, + { + "epoch": 0.08593229612712913, + "grad_norm": 1.7776944637298584, + "learning_rate": 4.909463521973588e-05, + "loss": 6.3219, + "step": 14449 + }, + { + "epoch": 0.08593824341041012, + "grad_norm": 1.8342489004135132, + "learning_rate": 4.909451065016249e-05, + "loss": 5.7136, + "step": 14450 + }, + { + "epoch": 0.08594419069369112, + "grad_norm": 2.109060764312744, + "learning_rate": 4.9094386072177945e-05, + "loss": 5.449, + "step": 14451 + }, + { + "epoch": 0.08595013797697212, + "grad_norm": 2.5615251064300537, + "learning_rate": 4.909426148578231e-05, + "loss": 4.7441, + "step": 14452 + }, + { + "epoch": 0.08595608526025311, + "grad_norm": 1.7670586109161377, + "learning_rate": 4.909413689097561e-05, + "loss": 5.4488, + "step": 14453 + }, + { + "epoch": 0.08596203254353411, + "grad_norm": 1.9190126657485962, + "learning_rate": 4.909401228775789e-05, + "loss": 5.3128, + "step": 14454 + }, + { + "epoch": 0.08596797982681512, + "grad_norm": 1.679866909980774, + "learning_rate": 4.90938876761292e-05, + "loss": 5.4575, + "step": 14455 + }, + { + "epoch": 0.0859739271100961, + "grad_norm": 1.6199991703033447, + "learning_rate": 4.909376305608959e-05, + "loss": 5.541, + "step": 14456 + }, + { + "epoch": 0.0859798743933771, + "grad_norm": 1.876761794090271, + "learning_rate": 4.9093638427639096e-05, + "loss": 5.7256, + "step": 14457 + }, + { + "epoch": 0.08598582167665811, + "grad_norm": 1.7833212614059448, + "learning_rate": 4.909351379077776e-05, + "loss": 5.6512, + "step": 14458 + }, + { + "epoch": 0.0859917689599391, + "grad_norm": 2.249696731567383, + "learning_rate": 4.909338914550562e-05, + "loss": 5.6517, + "step": 14459 + }, + { + "epoch": 0.0859977162432201, + "grad_norm": 1.8037621974945068, + "learning_rate": 4.909326449182273e-05, + "loss": 5.7564, + "step": 14460 + }, + { + "epoch": 0.0860036635265011, + "grad_norm": 1.4057918787002563, + "learning_rate": 4.909313982972914e-05, + "loss": 5.6259, + "step": 14461 + }, + { + "epoch": 0.08600961080978209, + "grad_norm": 1.5501145124435425, + "learning_rate": 4.9093015159224874e-05, + "loss": 5.6626, + "step": 14462 + }, + { + "epoch": 0.08601555809306309, + "grad_norm": 1.8189458847045898, + "learning_rate": 4.909289048030999e-05, + "loss": 5.4682, + "step": 14463 + }, + { + "epoch": 0.08602150537634409, + "grad_norm": 1.6819778680801392, + "learning_rate": 4.909276579298452e-05, + "loss": 5.3511, + "step": 14464 + }, + { + "epoch": 0.08602745265962508, + "grad_norm": 1.8401011228561401, + "learning_rate": 4.909264109724853e-05, + "loss": 5.531, + "step": 14465 + }, + { + "epoch": 0.08603339994290608, + "grad_norm": 1.6418116092681885, + "learning_rate": 4.909251639310203e-05, + "loss": 5.2885, + "step": 14466 + }, + { + "epoch": 0.08603934722618707, + "grad_norm": 1.4331059455871582, + "learning_rate": 4.909239168054509e-05, + "loss": 5.2792, + "step": 14467 + }, + { + "epoch": 0.08604529450946807, + "grad_norm": 1.4047703742980957, + "learning_rate": 4.9092266959577745e-05, + "loss": 5.2179, + "step": 14468 + }, + { + "epoch": 0.08605124179274908, + "grad_norm": 1.641930103302002, + "learning_rate": 4.909214223020003e-05, + "loss": 5.475, + "step": 14469 + }, + { + "epoch": 0.08605718907603006, + "grad_norm": 1.9879019260406494, + "learning_rate": 4.909201749241201e-05, + "loss": 5.3893, + "step": 14470 + }, + { + "epoch": 0.08606313635931107, + "grad_norm": 1.4790434837341309, + "learning_rate": 4.909189274621371e-05, + "loss": 5.3011, + "step": 14471 + }, + { + "epoch": 0.08606908364259207, + "grad_norm": 1.4283875226974487, + "learning_rate": 4.909176799160518e-05, + "loss": 5.4181, + "step": 14472 + }, + { + "epoch": 0.08607503092587306, + "grad_norm": 1.6676496267318726, + "learning_rate": 4.909164322858646e-05, + "loss": 5.4682, + "step": 14473 + }, + { + "epoch": 0.08608097820915406, + "grad_norm": 1.4858648777008057, + "learning_rate": 4.9091518457157605e-05, + "loss": 5.3073, + "step": 14474 + }, + { + "epoch": 0.08608692549243506, + "grad_norm": 1.5135246515274048, + "learning_rate": 4.909139367731864e-05, + "loss": 5.4039, + "step": 14475 + }, + { + "epoch": 0.08609287277571605, + "grad_norm": 1.353051781654358, + "learning_rate": 4.909126888906962e-05, + "loss": 5.5455, + "step": 14476 + }, + { + "epoch": 0.08609882005899705, + "grad_norm": 1.2824941873550415, + "learning_rate": 4.909114409241059e-05, + "loss": 5.6465, + "step": 14477 + }, + { + "epoch": 0.08610476734227805, + "grad_norm": 1.3398411273956299, + "learning_rate": 4.909101928734159e-05, + "loss": 5.5299, + "step": 14478 + }, + { + "epoch": 0.08611071462555904, + "grad_norm": 1.167169213294983, + "learning_rate": 4.909089447386266e-05, + "loss": 5.4376, + "step": 14479 + }, + { + "epoch": 0.08611666190884004, + "grad_norm": 1.2469842433929443, + "learning_rate": 4.9090769651973846e-05, + "loss": 5.4945, + "step": 14480 + }, + { + "epoch": 0.08612260919212104, + "grad_norm": 1.3025931119918823, + "learning_rate": 4.90906448216752e-05, + "loss": 5.3283, + "step": 14481 + }, + { + "epoch": 0.08612855647540203, + "grad_norm": 1.597223162651062, + "learning_rate": 4.909051998296675e-05, + "loss": 5.0729, + "step": 14482 + }, + { + "epoch": 0.08613450375868303, + "grad_norm": 1.53999662399292, + "learning_rate": 4.909039513584856e-05, + "loss": 5.2956, + "step": 14483 + }, + { + "epoch": 0.08614045104196404, + "grad_norm": 1.462623953819275, + "learning_rate": 4.909027028032066e-05, + "loss": 5.2748, + "step": 14484 + }, + { + "epoch": 0.08614639832524502, + "grad_norm": 1.380196452140808, + "learning_rate": 4.909014541638309e-05, + "loss": 5.4184, + "step": 14485 + }, + { + "epoch": 0.08615234560852603, + "grad_norm": 1.4531115293502808, + "learning_rate": 4.90900205440359e-05, + "loss": 5.2064, + "step": 14486 + }, + { + "epoch": 0.08615829289180703, + "grad_norm": 1.406848430633545, + "learning_rate": 4.9089895663279136e-05, + "loss": 5.2019, + "step": 14487 + }, + { + "epoch": 0.08616424017508802, + "grad_norm": 1.3956660032272339, + "learning_rate": 4.908977077411283e-05, + "loss": 5.128, + "step": 14488 + }, + { + "epoch": 0.08617018745836902, + "grad_norm": 1.4705348014831543, + "learning_rate": 4.9089645876537044e-05, + "loss": 5.3451, + "step": 14489 + }, + { + "epoch": 0.08617613474165002, + "grad_norm": 1.4385737180709839, + "learning_rate": 4.9089520970551804e-05, + "loss": 5.0668, + "step": 14490 + }, + { + "epoch": 0.08618208202493101, + "grad_norm": 1.584478735923767, + "learning_rate": 4.908939605615717e-05, + "loss": 4.9412, + "step": 14491 + }, + { + "epoch": 0.08618802930821201, + "grad_norm": 1.2740134000778198, + "learning_rate": 4.908927113335317e-05, + "loss": 4.8684, + "step": 14492 + }, + { + "epoch": 0.08619397659149301, + "grad_norm": 1.5669810771942139, + "learning_rate": 4.9089146202139856e-05, + "loss": 5.1903, + "step": 14493 + }, + { + "epoch": 0.086199923874774, + "grad_norm": 1.6113348007202148, + "learning_rate": 4.908902126251727e-05, + "loss": 5.1217, + "step": 14494 + }, + { + "epoch": 0.086205871158055, + "grad_norm": 1.6401634216308594, + "learning_rate": 4.908889631448546e-05, + "loss": 5.2241, + "step": 14495 + }, + { + "epoch": 0.08621181844133599, + "grad_norm": 1.522625207901001, + "learning_rate": 4.9088771358044456e-05, + "loss": 5.1858, + "step": 14496 + }, + { + "epoch": 0.086217765724617, + "grad_norm": 1.3802037239074707, + "learning_rate": 4.9088646393194316e-05, + "loss": 5.2349, + "step": 14497 + }, + { + "epoch": 0.086223713007898, + "grad_norm": 1.5226190090179443, + "learning_rate": 4.9088521419935076e-05, + "loss": 5.2612, + "step": 14498 + }, + { + "epoch": 0.08622966029117898, + "grad_norm": 1.3293451070785522, + "learning_rate": 4.9088396438266785e-05, + "loss": 5.169, + "step": 14499 + }, + { + "epoch": 0.08623560757445999, + "grad_norm": 1.334403157234192, + "learning_rate": 4.908827144818948e-05, + "loss": 5.1139, + "step": 14500 + }, + { + "epoch": 0.08624155485774099, + "grad_norm": 1.5195876359939575, + "learning_rate": 4.908814644970321e-05, + "loss": 5.1473, + "step": 14501 + }, + { + "epoch": 0.08624750214102198, + "grad_norm": 1.3367561101913452, + "learning_rate": 4.908802144280802e-05, + "loss": 5.1148, + "step": 14502 + }, + { + "epoch": 0.08625344942430298, + "grad_norm": 1.485002875328064, + "learning_rate": 4.908789642750395e-05, + "loss": 5.0796, + "step": 14503 + }, + { + "epoch": 0.08625939670758398, + "grad_norm": 1.3907506465911865, + "learning_rate": 4.9087771403791037e-05, + "loss": 5.1382, + "step": 14504 + }, + { + "epoch": 0.08626534399086497, + "grad_norm": 1.5129644870758057, + "learning_rate": 4.9087646371669336e-05, + "loss": 5.037, + "step": 14505 + }, + { + "epoch": 0.08627129127414597, + "grad_norm": 1.4666407108306885, + "learning_rate": 4.9087521331138896e-05, + "loss": 5.1877, + "step": 14506 + }, + { + "epoch": 0.08627723855742697, + "grad_norm": 1.5812102556228638, + "learning_rate": 4.9087396282199736e-05, + "loss": 5.2588, + "step": 14507 + }, + { + "epoch": 0.08628318584070796, + "grad_norm": 2.976067066192627, + "learning_rate": 4.908727122485193e-05, + "loss": 4.7477, + "step": 14508 + }, + { + "epoch": 0.08628913312398896, + "grad_norm": 1.5401511192321777, + "learning_rate": 4.90871461590955e-05, + "loss": 5.2242, + "step": 14509 + }, + { + "epoch": 0.08629508040726996, + "grad_norm": 1.3266774415969849, + "learning_rate": 4.9087021084930486e-05, + "loss": 5.2792, + "step": 14510 + }, + { + "epoch": 0.08630102769055095, + "grad_norm": 1.3292385339736938, + "learning_rate": 4.9086896002356956e-05, + "loss": 5.2434, + "step": 14511 + }, + { + "epoch": 0.08630697497383195, + "grad_norm": 1.237931489944458, + "learning_rate": 4.908677091137493e-05, + "loss": 5.2173, + "step": 14512 + }, + { + "epoch": 0.08631292225711296, + "grad_norm": 1.2488665580749512, + "learning_rate": 4.908664581198447e-05, + "loss": 5.1262, + "step": 14513 + }, + { + "epoch": 0.08631886954039394, + "grad_norm": 1.5126835107803345, + "learning_rate": 4.9086520704185604e-05, + "loss": 5.2258, + "step": 14514 + }, + { + "epoch": 0.08632481682367495, + "grad_norm": 1.3975410461425781, + "learning_rate": 4.908639558797839e-05, + "loss": 4.9266, + "step": 14515 + }, + { + "epoch": 0.08633076410695595, + "grad_norm": 1.2499217987060547, + "learning_rate": 4.908627046336285e-05, + "loss": 5.1564, + "step": 14516 + }, + { + "epoch": 0.08633671139023694, + "grad_norm": 1.6880254745483398, + "learning_rate": 4.908614533033905e-05, + "loss": 5.0906, + "step": 14517 + }, + { + "epoch": 0.08634265867351794, + "grad_norm": 1.498849630355835, + "learning_rate": 4.908602018890702e-05, + "loss": 5.0771, + "step": 14518 + }, + { + "epoch": 0.08634860595679894, + "grad_norm": 1.9192509651184082, + "learning_rate": 4.908589503906682e-05, + "loss": 5.2173, + "step": 14519 + }, + { + "epoch": 0.08635455324007993, + "grad_norm": 1.8038657903671265, + "learning_rate": 4.9085769880818475e-05, + "loss": 5.3003, + "step": 14520 + }, + { + "epoch": 0.08636050052336093, + "grad_norm": 1.3908354043960571, + "learning_rate": 4.9085644714162037e-05, + "loss": 5.1943, + "step": 14521 + }, + { + "epoch": 0.08636644780664193, + "grad_norm": 1.336630940437317, + "learning_rate": 4.9085519539097556e-05, + "loss": 5.2693, + "step": 14522 + }, + { + "epoch": 0.08637239508992292, + "grad_norm": 1.6008005142211914, + "learning_rate": 4.908539435562506e-05, + "loss": 5.2779, + "step": 14523 + }, + { + "epoch": 0.08637834237320392, + "grad_norm": 1.4620133638381958, + "learning_rate": 4.9085269163744605e-05, + "loss": 5.0467, + "step": 14524 + }, + { + "epoch": 0.08638428965648491, + "grad_norm": 1.5825145244598389, + "learning_rate": 4.9085143963456236e-05, + "loss": 4.9838, + "step": 14525 + }, + { + "epoch": 0.08639023693976591, + "grad_norm": 1.751550555229187, + "learning_rate": 4.9085018754759995e-05, + "loss": 5.0467, + "step": 14526 + }, + { + "epoch": 0.08639618422304692, + "grad_norm": 1.5967564582824707, + "learning_rate": 4.908489353765591e-05, + "loss": 5.0685, + "step": 14527 + }, + { + "epoch": 0.0864021315063279, + "grad_norm": 1.646323800086975, + "learning_rate": 4.908476831214405e-05, + "loss": 4.9341, + "step": 14528 + }, + { + "epoch": 0.0864080787896089, + "grad_norm": 1.482224464416504, + "learning_rate": 4.908464307822443e-05, + "loss": 4.9893, + "step": 14529 + }, + { + "epoch": 0.08641402607288991, + "grad_norm": 1.5190521478652954, + "learning_rate": 4.908451783589713e-05, + "loss": 5.0747, + "step": 14530 + }, + { + "epoch": 0.0864199733561709, + "grad_norm": 1.41251802444458, + "learning_rate": 4.908439258516215e-05, + "loss": 5.0098, + "step": 14531 + }, + { + "epoch": 0.0864259206394519, + "grad_norm": 1.678646445274353, + "learning_rate": 4.9084267326019576e-05, + "loss": 5.0224, + "step": 14532 + }, + { + "epoch": 0.0864318679227329, + "grad_norm": 1.5203865766525269, + "learning_rate": 4.908414205846943e-05, + "loss": 5.109, + "step": 14533 + }, + { + "epoch": 0.08643781520601389, + "grad_norm": 1.5437216758728027, + "learning_rate": 4.9084016782511754e-05, + "loss": 5.1168, + "step": 14534 + }, + { + "epoch": 0.08644376248929489, + "grad_norm": 1.3460302352905273, + "learning_rate": 4.90838914981466e-05, + "loss": 5.1038, + "step": 14535 + }, + { + "epoch": 0.08644970977257589, + "grad_norm": 1.4768339395523071, + "learning_rate": 4.908376620537401e-05, + "loss": 5.129, + "step": 14536 + }, + { + "epoch": 0.08645565705585688, + "grad_norm": 1.2669035196304321, + "learning_rate": 4.9083640904194025e-05, + "loss": 5.0856, + "step": 14537 + }, + { + "epoch": 0.08646160433913788, + "grad_norm": 1.5692600011825562, + "learning_rate": 4.9083515594606686e-05, + "loss": 5.0897, + "step": 14538 + }, + { + "epoch": 0.08646755162241888, + "grad_norm": 1.4857045412063599, + "learning_rate": 4.9083390276612044e-05, + "loss": 4.9654, + "step": 14539 + }, + { + "epoch": 0.08647349890569987, + "grad_norm": 1.5537325143814087, + "learning_rate": 4.908326495021014e-05, + "loss": 5.0431, + "step": 14540 + }, + { + "epoch": 0.08647944618898087, + "grad_norm": 1.483089566230774, + "learning_rate": 4.908313961540101e-05, + "loss": 5.0737, + "step": 14541 + }, + { + "epoch": 0.08648539347226188, + "grad_norm": 1.5829899311065674, + "learning_rate": 4.9083014272184716e-05, + "loss": 4.9844, + "step": 14542 + }, + { + "epoch": 0.08649134075554286, + "grad_norm": 1.3660348653793335, + "learning_rate": 4.908288892056128e-05, + "loss": 5.0384, + "step": 14543 + }, + { + "epoch": 0.08649728803882387, + "grad_norm": 1.3721328973770142, + "learning_rate": 4.9082763560530764e-05, + "loss": 5.0993, + "step": 14544 + }, + { + "epoch": 0.08650323532210487, + "grad_norm": 1.412381887435913, + "learning_rate": 4.90826381920932e-05, + "loss": 4.9359, + "step": 14545 + }, + { + "epoch": 0.08650918260538586, + "grad_norm": 1.5164285898208618, + "learning_rate": 4.9082512815248635e-05, + "loss": 5.0156, + "step": 14546 + }, + { + "epoch": 0.08651512988866686, + "grad_norm": 1.5244861841201782, + "learning_rate": 4.9082387429997117e-05, + "loss": 5.0719, + "step": 14547 + }, + { + "epoch": 0.08652107717194786, + "grad_norm": 1.304221510887146, + "learning_rate": 4.908226203633869e-05, + "loss": 4.9553, + "step": 14548 + }, + { + "epoch": 0.08652702445522885, + "grad_norm": 1.328220009803772, + "learning_rate": 4.908213663427338e-05, + "loss": 4.9761, + "step": 14549 + }, + { + "epoch": 0.08653297173850985, + "grad_norm": 1.4459906816482544, + "learning_rate": 4.908201122380126e-05, + "loss": 5.0422, + "step": 14550 + }, + { + "epoch": 0.08653891902179085, + "grad_norm": 1.5402530431747437, + "learning_rate": 4.908188580492235e-05, + "loss": 4.8856, + "step": 14551 + }, + { + "epoch": 0.08654486630507184, + "grad_norm": 1.6573606729507446, + "learning_rate": 4.90817603776367e-05, + "loss": 5.0958, + "step": 14552 + }, + { + "epoch": 0.08655081358835284, + "grad_norm": 1.5214189291000366, + "learning_rate": 4.9081634941944365e-05, + "loss": 4.9494, + "step": 14553 + }, + { + "epoch": 0.08655676087163383, + "grad_norm": 1.4977836608886719, + "learning_rate": 4.908150949784538e-05, + "loss": 4.9166, + "step": 14554 + }, + { + "epoch": 0.08656270815491483, + "grad_norm": 1.4952701330184937, + "learning_rate": 4.908138404533979e-05, + "loss": 4.9371, + "step": 14555 + }, + { + "epoch": 0.08656865543819584, + "grad_norm": 1.2652736902236938, + "learning_rate": 4.9081258584427626e-05, + "loss": 4.9424, + "step": 14556 + }, + { + "epoch": 0.08657460272147682, + "grad_norm": 1.4386261701583862, + "learning_rate": 4.908113311510895e-05, + "loss": 4.8909, + "step": 14557 + }, + { + "epoch": 0.08658055000475783, + "grad_norm": 1.4800533056259155, + "learning_rate": 4.90810076373838e-05, + "loss": 4.9226, + "step": 14558 + }, + { + "epoch": 0.08658649728803883, + "grad_norm": 1.4734489917755127, + "learning_rate": 4.908088215125222e-05, + "loss": 4.9774, + "step": 14559 + }, + { + "epoch": 0.08659244457131982, + "grad_norm": 1.47382390499115, + "learning_rate": 4.9080756656714245e-05, + "loss": 4.9001, + "step": 14560 + }, + { + "epoch": 0.08659839185460082, + "grad_norm": 1.4358749389648438, + "learning_rate": 4.908063115376994e-05, + "loss": 4.8537, + "step": 14561 + }, + { + "epoch": 0.08660433913788182, + "grad_norm": 1.3895947933197021, + "learning_rate": 4.908050564241933e-05, + "loss": 4.9445, + "step": 14562 + }, + { + "epoch": 0.08661028642116281, + "grad_norm": 1.6166354417800903, + "learning_rate": 4.908038012266246e-05, + "loss": 4.9447, + "step": 14563 + }, + { + "epoch": 0.08661623370444381, + "grad_norm": 1.4621998071670532, + "learning_rate": 4.908025459449938e-05, + "loss": 5.0405, + "step": 14564 + }, + { + "epoch": 0.08662218098772481, + "grad_norm": 1.4160699844360352, + "learning_rate": 4.908012905793013e-05, + "loss": 5.1246, + "step": 14565 + }, + { + "epoch": 0.0866281282710058, + "grad_norm": 1.3748950958251953, + "learning_rate": 4.9080003512954756e-05, + "loss": 5.0856, + "step": 14566 + }, + { + "epoch": 0.0866340755542868, + "grad_norm": 1.5496206283569336, + "learning_rate": 4.9079877959573303e-05, + "loss": 5.1539, + "step": 14567 + }, + { + "epoch": 0.0866400228375678, + "grad_norm": 1.2577475309371948, + "learning_rate": 4.9079752397785814e-05, + "loss": 5.033, + "step": 14568 + }, + { + "epoch": 0.08664597012084879, + "grad_norm": 1.3565775156021118, + "learning_rate": 4.9079626827592336e-05, + "loss": 4.977, + "step": 14569 + }, + { + "epoch": 0.0866519174041298, + "grad_norm": 1.869673252105713, + "learning_rate": 4.90795012489929e-05, + "loss": 5.0452, + "step": 14570 + }, + { + "epoch": 0.0866578646874108, + "grad_norm": 1.3931822776794434, + "learning_rate": 4.907937566198757e-05, + "loss": 5.0182, + "step": 14571 + }, + { + "epoch": 0.08666381197069178, + "grad_norm": 1.5796258449554443, + "learning_rate": 4.907925006657637e-05, + "loss": 5.0167, + "step": 14572 + }, + { + "epoch": 0.08666975925397279, + "grad_norm": 1.439174771308899, + "learning_rate": 4.9079124462759356e-05, + "loss": 5.0223, + "step": 14573 + }, + { + "epoch": 0.08667570653725379, + "grad_norm": 1.5269712209701538, + "learning_rate": 4.907899885053657e-05, + "loss": 5.0726, + "step": 14574 + }, + { + "epoch": 0.08668165382053478, + "grad_norm": 1.6334160566329956, + "learning_rate": 4.9078873229908054e-05, + "loss": 4.902, + "step": 14575 + }, + { + "epoch": 0.08668760110381578, + "grad_norm": 1.2883020639419556, + "learning_rate": 4.9078747600873846e-05, + "loss": 5.0168, + "step": 14576 + }, + { + "epoch": 0.08669354838709678, + "grad_norm": 1.3399035930633545, + "learning_rate": 4.9078621963434e-05, + "loss": 5.1285, + "step": 14577 + }, + { + "epoch": 0.08669949567037777, + "grad_norm": 1.6066272258758545, + "learning_rate": 4.9078496317588556e-05, + "loss": 5.1761, + "step": 14578 + }, + { + "epoch": 0.08670544295365877, + "grad_norm": 1.5316112041473389, + "learning_rate": 4.907837066333756e-05, + "loss": 4.9691, + "step": 14579 + }, + { + "epoch": 0.08671139023693977, + "grad_norm": 1.2680541276931763, + "learning_rate": 4.907824500068105e-05, + "loss": 4.984, + "step": 14580 + }, + { + "epoch": 0.08671733752022076, + "grad_norm": 1.3451861143112183, + "learning_rate": 4.9078119329619076e-05, + "loss": 5.1079, + "step": 14581 + }, + { + "epoch": 0.08672328480350176, + "grad_norm": 1.4813716411590576, + "learning_rate": 4.907799365015168e-05, + "loss": 5.0822, + "step": 14582 + }, + { + "epoch": 0.08672923208678275, + "grad_norm": 1.2526417970657349, + "learning_rate": 4.90778679622789e-05, + "loss": 5.0981, + "step": 14583 + }, + { + "epoch": 0.08673517937006375, + "grad_norm": 1.320970058441162, + "learning_rate": 4.907774226600079e-05, + "loss": 5.2046, + "step": 14584 + }, + { + "epoch": 0.08674112665334476, + "grad_norm": 1.4376531839370728, + "learning_rate": 4.907761656131739e-05, + "loss": 5.0422, + "step": 14585 + }, + { + "epoch": 0.08674707393662574, + "grad_norm": 1.3290382623672485, + "learning_rate": 4.907749084822873e-05, + "loss": 4.9587, + "step": 14586 + }, + { + "epoch": 0.08675302121990675, + "grad_norm": 1.4613630771636963, + "learning_rate": 4.907736512673489e-05, + "loss": 5.0141, + "step": 14587 + }, + { + "epoch": 0.08675896850318775, + "grad_norm": 1.2996604442596436, + "learning_rate": 4.907723939683587e-05, + "loss": 5.0881, + "step": 14588 + }, + { + "epoch": 0.08676491578646874, + "grad_norm": 1.5718237161636353, + "learning_rate": 4.907711365853174e-05, + "loss": 5.0104, + "step": 14589 + }, + { + "epoch": 0.08677086306974974, + "grad_norm": 1.5009227991104126, + "learning_rate": 4.907698791182255e-05, + "loss": 4.9257, + "step": 14590 + }, + { + "epoch": 0.08677681035303074, + "grad_norm": 1.4179331064224243, + "learning_rate": 4.907686215670831e-05, + "loss": 5.0209, + "step": 14591 + }, + { + "epoch": 0.08678275763631173, + "grad_norm": 1.3447542190551758, + "learning_rate": 4.9076736393189105e-05, + "loss": 5.0633, + "step": 14592 + }, + { + "epoch": 0.08678870491959273, + "grad_norm": 1.4221898317337036, + "learning_rate": 4.907661062126495e-05, + "loss": 4.907, + "step": 14593 + }, + { + "epoch": 0.08679465220287373, + "grad_norm": 1.5112396478652954, + "learning_rate": 4.907648484093591e-05, + "loss": 5.0703, + "step": 14594 + }, + { + "epoch": 0.08680059948615472, + "grad_norm": 1.3118572235107422, + "learning_rate": 4.907635905220201e-05, + "loss": 5.0089, + "step": 14595 + }, + { + "epoch": 0.08680654676943572, + "grad_norm": 1.6776518821716309, + "learning_rate": 4.90762332550633e-05, + "loss": 4.9705, + "step": 14596 + }, + { + "epoch": 0.08681249405271672, + "grad_norm": 1.467530608177185, + "learning_rate": 4.9076107449519824e-05, + "loss": 5.0596, + "step": 14597 + }, + { + "epoch": 0.08681844133599771, + "grad_norm": 1.5924569368362427, + "learning_rate": 4.907598163557163e-05, + "loss": 4.9904, + "step": 14598 + }, + { + "epoch": 0.08682438861927871, + "grad_norm": 1.1862461566925049, + "learning_rate": 4.907585581321877e-05, + "loss": 5.2065, + "step": 14599 + }, + { + "epoch": 0.08683033590255972, + "grad_norm": 1.5537490844726562, + "learning_rate": 4.9075729982461265e-05, + "loss": 4.9604, + "step": 14600 + }, + { + "epoch": 0.0868362831858407, + "grad_norm": 1.5608946084976196, + "learning_rate": 4.9075604143299176e-05, + "loss": 4.9951, + "step": 14601 + }, + { + "epoch": 0.0868422304691217, + "grad_norm": 1.3890982866287231, + "learning_rate": 4.907547829573254e-05, + "loss": 5.1994, + "step": 14602 + }, + { + "epoch": 0.08684817775240271, + "grad_norm": 1.5367194414138794, + "learning_rate": 4.907535243976141e-05, + "loss": 5.008, + "step": 14603 + }, + { + "epoch": 0.0868541250356837, + "grad_norm": 1.5362403392791748, + "learning_rate": 4.9075226575385814e-05, + "loss": 5.0239, + "step": 14604 + }, + { + "epoch": 0.0868600723189647, + "grad_norm": 1.3252228498458862, + "learning_rate": 4.9075100702605814e-05, + "loss": 4.9663, + "step": 14605 + }, + { + "epoch": 0.0868660196022457, + "grad_norm": 1.4381712675094604, + "learning_rate": 4.907497482142144e-05, + "loss": 5.1457, + "step": 14606 + }, + { + "epoch": 0.08687196688552669, + "grad_norm": 1.5137197971343994, + "learning_rate": 4.907484893183274e-05, + "loss": 4.9831, + "step": 14607 + }, + { + "epoch": 0.08687791416880769, + "grad_norm": 1.5544081926345825, + "learning_rate": 4.907472303383976e-05, + "loss": 5.0485, + "step": 14608 + }, + { + "epoch": 0.08688386145208869, + "grad_norm": 1.4613279104232788, + "learning_rate": 4.907459712744254e-05, + "loss": 5.3929, + "step": 14609 + }, + { + "epoch": 0.08688980873536968, + "grad_norm": 1.2830102443695068, + "learning_rate": 4.907447121264113e-05, + "loss": 5.4241, + "step": 14610 + }, + { + "epoch": 0.08689575601865068, + "grad_norm": 1.2168337106704712, + "learning_rate": 4.907434528943558e-05, + "loss": 5.4678, + "step": 14611 + }, + { + "epoch": 0.08690170330193167, + "grad_norm": 1.3995872735977173, + "learning_rate": 4.907421935782591e-05, + "loss": 5.2, + "step": 14612 + }, + { + "epoch": 0.08690765058521267, + "grad_norm": 1.4081990718841553, + "learning_rate": 4.907409341781219e-05, + "loss": 5.4356, + "step": 14613 + }, + { + "epoch": 0.08691359786849367, + "grad_norm": 1.4506621360778809, + "learning_rate": 4.9073967469394436e-05, + "loss": 5.3816, + "step": 14614 + }, + { + "epoch": 0.08691954515177466, + "grad_norm": 1.3564461469650269, + "learning_rate": 4.907384151257272e-05, + "loss": 5.2808, + "step": 14615 + }, + { + "epoch": 0.08692549243505567, + "grad_norm": 1.3663856983184814, + "learning_rate": 4.907371554734708e-05, + "loss": 5.4286, + "step": 14616 + }, + { + "epoch": 0.08693143971833667, + "grad_norm": 1.5905755758285522, + "learning_rate": 4.907358957371755e-05, + "loss": 5.3404, + "step": 14617 + }, + { + "epoch": 0.08693738700161766, + "grad_norm": 1.6172430515289307, + "learning_rate": 4.9073463591684175e-05, + "loss": 5.2511, + "step": 14618 + }, + { + "epoch": 0.08694333428489866, + "grad_norm": 1.362925410270691, + "learning_rate": 4.9073337601247e-05, + "loss": 5.3786, + "step": 14619 + }, + { + "epoch": 0.08694928156817966, + "grad_norm": 1.4276455640792847, + "learning_rate": 4.907321160240608e-05, + "loss": 5.1243, + "step": 14620 + }, + { + "epoch": 0.08695522885146065, + "grad_norm": 1.5211840867996216, + "learning_rate": 4.907308559516145e-05, + "loss": 5.1465, + "step": 14621 + }, + { + "epoch": 0.08696117613474165, + "grad_norm": 1.4728838205337524, + "learning_rate": 4.9072959579513146e-05, + "loss": 4.9585, + "step": 14622 + }, + { + "epoch": 0.08696712341802265, + "grad_norm": 1.5337111949920654, + "learning_rate": 4.907283355546123e-05, + "loss": 5.0553, + "step": 14623 + }, + { + "epoch": 0.08697307070130364, + "grad_norm": 1.3105639219284058, + "learning_rate": 4.907270752300573e-05, + "loss": 5.2724, + "step": 14624 + }, + { + "epoch": 0.08697901798458464, + "grad_norm": 1.4726678133010864, + "learning_rate": 4.90725814821467e-05, + "loss": 5.2771, + "step": 14625 + }, + { + "epoch": 0.08698496526786564, + "grad_norm": 1.5226463079452515, + "learning_rate": 4.907245543288418e-05, + "loss": 5.2294, + "step": 14626 + }, + { + "epoch": 0.08699091255114663, + "grad_norm": 1.4187650680541992, + "learning_rate": 4.9072329375218215e-05, + "loss": 5.0003, + "step": 14627 + }, + { + "epoch": 0.08699685983442763, + "grad_norm": 1.3565301895141602, + "learning_rate": 4.907220330914885e-05, + "loss": 5.0616, + "step": 14628 + }, + { + "epoch": 0.08700280711770864, + "grad_norm": 1.3763781785964966, + "learning_rate": 4.907207723467612e-05, + "loss": 5.1036, + "step": 14629 + }, + { + "epoch": 0.08700875440098962, + "grad_norm": 1.350926160812378, + "learning_rate": 4.907195115180009e-05, + "loss": 5.3433, + "step": 14630 + }, + { + "epoch": 0.08701470168427063, + "grad_norm": 1.4927095174789429, + "learning_rate": 4.907182506052078e-05, + "loss": 5.3726, + "step": 14631 + }, + { + "epoch": 0.08702064896755163, + "grad_norm": 1.9378905296325684, + "learning_rate": 4.907169896083824e-05, + "loss": 4.9942, + "step": 14632 + }, + { + "epoch": 0.08702659625083262, + "grad_norm": 1.2046253681182861, + "learning_rate": 4.907157285275253e-05, + "loss": 5.2877, + "step": 14633 + }, + { + "epoch": 0.08703254353411362, + "grad_norm": 1.352828025817871, + "learning_rate": 4.907144673626368e-05, + "loss": 5.264, + "step": 14634 + }, + { + "epoch": 0.08703849081739462, + "grad_norm": 1.4438698291778564, + "learning_rate": 4.907132061137173e-05, + "loss": 5.1767, + "step": 14635 + }, + { + "epoch": 0.08704443810067561, + "grad_norm": 1.4066534042358398, + "learning_rate": 4.9071194478076734e-05, + "loss": 5.0919, + "step": 14636 + }, + { + "epoch": 0.08705038538395661, + "grad_norm": 1.4313786029815674, + "learning_rate": 4.9071068336378736e-05, + "loss": 5.0307, + "step": 14637 + }, + { + "epoch": 0.08705633266723761, + "grad_norm": 1.3995366096496582, + "learning_rate": 4.907094218627778e-05, + "loss": 4.9508, + "step": 14638 + }, + { + "epoch": 0.0870622799505186, + "grad_norm": 1.395270824432373, + "learning_rate": 4.90708160277739e-05, + "loss": 5.1403, + "step": 14639 + }, + { + "epoch": 0.0870682272337996, + "grad_norm": 1.4280959367752075, + "learning_rate": 4.9070689860867144e-05, + "loss": 5.1675, + "step": 14640 + }, + { + "epoch": 0.08707417451708059, + "grad_norm": 1.5028926134109497, + "learning_rate": 4.907056368555757e-05, + "loss": 5.1178, + "step": 14641 + }, + { + "epoch": 0.08708012180036159, + "grad_norm": 1.480936884880066, + "learning_rate": 4.90704375018452e-05, + "loss": 5.1681, + "step": 14642 + }, + { + "epoch": 0.0870860690836426, + "grad_norm": 1.474708914756775, + "learning_rate": 4.907031130973009e-05, + "loss": 4.998, + "step": 14643 + }, + { + "epoch": 0.08709201636692358, + "grad_norm": 1.719551920890808, + "learning_rate": 4.907018510921229e-05, + "loss": 5.0486, + "step": 14644 + }, + { + "epoch": 0.08709796365020459, + "grad_norm": 1.6314032077789307, + "learning_rate": 4.907005890029184e-05, + "loss": 4.9233, + "step": 14645 + }, + { + "epoch": 0.08710391093348559, + "grad_norm": 1.635712742805481, + "learning_rate": 4.906993268296877e-05, + "loss": 4.7026, + "step": 14646 + }, + { + "epoch": 0.08710985821676658, + "grad_norm": 1.5682891607284546, + "learning_rate": 4.906980645724314e-05, + "loss": 4.7681, + "step": 14647 + }, + { + "epoch": 0.08711580550004758, + "grad_norm": 1.5149590969085693, + "learning_rate": 4.906968022311499e-05, + "loss": 4.6026, + "step": 14648 + }, + { + "epoch": 0.08712175278332858, + "grad_norm": 1.666756510734558, + "learning_rate": 4.906955398058436e-05, + "loss": 4.6652, + "step": 14649 + }, + { + "epoch": 0.08712770006660957, + "grad_norm": 1.563281536102295, + "learning_rate": 4.906942772965129e-05, + "loss": 4.8195, + "step": 14650 + }, + { + "epoch": 0.08713364734989057, + "grad_norm": 1.3730766773223877, + "learning_rate": 4.906930147031585e-05, + "loss": 5.3917, + "step": 14651 + }, + { + "epoch": 0.08713959463317157, + "grad_norm": 1.344741940498352, + "learning_rate": 4.906917520257805e-05, + "loss": 5.4866, + "step": 14652 + }, + { + "epoch": 0.08714554191645256, + "grad_norm": 1.4403667449951172, + "learning_rate": 4.906904892643796e-05, + "loss": 5.3869, + "step": 14653 + }, + { + "epoch": 0.08715148919973356, + "grad_norm": 1.4251221418380737, + "learning_rate": 4.906892264189561e-05, + "loss": 5.5564, + "step": 14654 + }, + { + "epoch": 0.08715743648301456, + "grad_norm": 1.0403032302856445, + "learning_rate": 4.9068796348951055e-05, + "loss": 5.3422, + "step": 14655 + }, + { + "epoch": 0.08716338376629555, + "grad_norm": 1.4933732748031616, + "learning_rate": 4.9068670047604313e-05, + "loss": 4.9035, + "step": 14656 + }, + { + "epoch": 0.08716933104957655, + "grad_norm": 1.820141315460205, + "learning_rate": 4.9068543737855466e-05, + "loss": 4.8447, + "step": 14657 + }, + { + "epoch": 0.08717527833285756, + "grad_norm": 1.5337603092193604, + "learning_rate": 4.9068417419704526e-05, + "loss": 4.7122, + "step": 14658 + }, + { + "epoch": 0.08718122561613854, + "grad_norm": 1.6933845281600952, + "learning_rate": 4.9068291093151555e-05, + "loss": 4.6246, + "step": 14659 + }, + { + "epoch": 0.08718717289941955, + "grad_norm": 1.607749342918396, + "learning_rate": 4.906816475819659e-05, + "loss": 4.5246, + "step": 14660 + }, + { + "epoch": 0.08719312018270055, + "grad_norm": 1.6468732357025146, + "learning_rate": 4.906803841483969e-05, + "loss": 4.5529, + "step": 14661 + }, + { + "epoch": 0.08719906746598154, + "grad_norm": 1.7252613306045532, + "learning_rate": 4.906791206308087e-05, + "loss": 4.5866, + "step": 14662 + }, + { + "epoch": 0.08720501474926254, + "grad_norm": 1.8178141117095947, + "learning_rate": 4.90677857029202e-05, + "loss": 4.6312, + "step": 14663 + }, + { + "epoch": 0.08721096203254354, + "grad_norm": 1.6173008680343628, + "learning_rate": 4.906765933435771e-05, + "loss": 4.5964, + "step": 14664 + }, + { + "epoch": 0.08721690931582453, + "grad_norm": 1.4914458990097046, + "learning_rate": 4.9067532957393444e-05, + "loss": 4.7123, + "step": 14665 + }, + { + "epoch": 0.08722285659910553, + "grad_norm": 1.5310544967651367, + "learning_rate": 4.9067406572027465e-05, + "loss": 4.6907, + "step": 14666 + }, + { + "epoch": 0.08722880388238653, + "grad_norm": 1.4311203956604004, + "learning_rate": 4.9067280178259794e-05, + "loss": 4.7749, + "step": 14667 + }, + { + "epoch": 0.08723475116566752, + "grad_norm": 1.6848034858703613, + "learning_rate": 4.9067153776090484e-05, + "loss": 5.1676, + "step": 14668 + }, + { + "epoch": 0.08724069844894852, + "grad_norm": 1.510909914970398, + "learning_rate": 4.906702736551958e-05, + "loss": 5.1237, + "step": 14669 + }, + { + "epoch": 0.08724664573222951, + "grad_norm": 1.4135887622833252, + "learning_rate": 4.906690094654713e-05, + "loss": 5.131, + "step": 14670 + }, + { + "epoch": 0.08725259301551051, + "grad_norm": 1.5739595890045166, + "learning_rate": 4.906677451917317e-05, + "loss": 5.2374, + "step": 14671 + }, + { + "epoch": 0.08725854029879151, + "grad_norm": 1.592644214630127, + "learning_rate": 4.9066648083397746e-05, + "loss": 5.0424, + "step": 14672 + }, + { + "epoch": 0.0872644875820725, + "grad_norm": 1.3842464685440063, + "learning_rate": 4.906652163922091e-05, + "loss": 5.106, + "step": 14673 + }, + { + "epoch": 0.0872704348653535, + "grad_norm": 1.4318630695343018, + "learning_rate": 4.906639518664269e-05, + "loss": 5.1223, + "step": 14674 + }, + { + "epoch": 0.08727638214863451, + "grad_norm": 1.5598502159118652, + "learning_rate": 4.906626872566314e-05, + "loss": 5.0363, + "step": 14675 + }, + { + "epoch": 0.0872823294319155, + "grad_norm": 1.9367897510528564, + "learning_rate": 4.9066142256282316e-05, + "loss": 4.8822, + "step": 14676 + }, + { + "epoch": 0.0872882767151965, + "grad_norm": 1.8134979009628296, + "learning_rate": 4.906601577850024e-05, + "loss": 4.7218, + "step": 14677 + }, + { + "epoch": 0.0872942239984775, + "grad_norm": 1.5139638185501099, + "learning_rate": 4.9065889292316976e-05, + "loss": 5.0311, + "step": 14678 + }, + { + "epoch": 0.08730017128175849, + "grad_norm": 1.5324028730392456, + "learning_rate": 4.906576279773255e-05, + "loss": 5.2366, + "step": 14679 + }, + { + "epoch": 0.08730611856503949, + "grad_norm": 1.4219286441802979, + "learning_rate": 4.906563629474702e-05, + "loss": 5.1362, + "step": 14680 + }, + { + "epoch": 0.08731206584832049, + "grad_norm": 1.4673584699630737, + "learning_rate": 4.906550978336042e-05, + "loss": 5.1336, + "step": 14681 + }, + { + "epoch": 0.08731801313160148, + "grad_norm": 1.2611639499664307, + "learning_rate": 4.906538326357281e-05, + "loss": 5.1791, + "step": 14682 + }, + { + "epoch": 0.08732396041488248, + "grad_norm": 1.283827543258667, + "learning_rate": 4.9065256735384205e-05, + "loss": 5.0889, + "step": 14683 + }, + { + "epoch": 0.08732990769816348, + "grad_norm": 1.4508111476898193, + "learning_rate": 4.906513019879468e-05, + "loss": 4.9832, + "step": 14684 + }, + { + "epoch": 0.08733585498144447, + "grad_norm": 1.3923978805541992, + "learning_rate": 4.906500365380427e-05, + "loss": 4.8147, + "step": 14685 + }, + { + "epoch": 0.08734180226472547, + "grad_norm": 1.3737010955810547, + "learning_rate": 4.906487710041301e-05, + "loss": 4.8448, + "step": 14686 + }, + { + "epoch": 0.08734774954800648, + "grad_norm": 1.4765465259552002, + "learning_rate": 4.906475053862095e-05, + "loss": 4.8601, + "step": 14687 + }, + { + "epoch": 0.08735369683128746, + "grad_norm": 1.527372121810913, + "learning_rate": 4.906462396842813e-05, + "loss": 4.8898, + "step": 14688 + }, + { + "epoch": 0.08735964411456847, + "grad_norm": 1.2455743551254272, + "learning_rate": 4.9064497389834604e-05, + "loss": 4.9954, + "step": 14689 + }, + { + "epoch": 0.08736559139784947, + "grad_norm": 1.3169753551483154, + "learning_rate": 4.906437080284041e-05, + "loss": 5.1384, + "step": 14690 + }, + { + "epoch": 0.08737153868113046, + "grad_norm": 1.3158196210861206, + "learning_rate": 4.906424420744559e-05, + "loss": 5.032, + "step": 14691 + }, + { + "epoch": 0.08737748596441146, + "grad_norm": 1.5421653985977173, + "learning_rate": 4.9064117603650197e-05, + "loss": 4.6448, + "step": 14692 + }, + { + "epoch": 0.08738343324769246, + "grad_norm": 1.4324442148208618, + "learning_rate": 4.906399099145427e-05, + "loss": 4.819, + "step": 14693 + }, + { + "epoch": 0.08738938053097345, + "grad_norm": 1.299877643585205, + "learning_rate": 4.9063864370857836e-05, + "loss": 5.4793, + "step": 14694 + }, + { + "epoch": 0.08739532781425445, + "grad_norm": 1.8289762735366821, + "learning_rate": 4.906373774186097e-05, + "loss": 5.0972, + "step": 14695 + }, + { + "epoch": 0.08740127509753545, + "grad_norm": 1.5460636615753174, + "learning_rate": 4.9063611104463705e-05, + "loss": 5.0992, + "step": 14696 + }, + { + "epoch": 0.08740722238081644, + "grad_norm": 1.4720163345336914, + "learning_rate": 4.9063484458666076e-05, + "loss": 5.0918, + "step": 14697 + }, + { + "epoch": 0.08741316966409744, + "grad_norm": 1.4653000831604004, + "learning_rate": 4.906335780446813e-05, + "loss": 5.1523, + "step": 14698 + }, + { + "epoch": 0.08741911694737843, + "grad_norm": 1.461012840270996, + "learning_rate": 4.9063231141869914e-05, + "loss": 5.1848, + "step": 14699 + }, + { + "epoch": 0.08742506423065943, + "grad_norm": 1.6757450103759766, + "learning_rate": 4.906310447087148e-05, + "loss": 4.9809, + "step": 14700 + }, + { + "epoch": 0.08743101151394043, + "grad_norm": 1.498402714729309, + "learning_rate": 4.906297779147286e-05, + "loss": 5.1451, + "step": 14701 + }, + { + "epoch": 0.08743695879722142, + "grad_norm": 1.341667652130127, + "learning_rate": 4.906285110367411e-05, + "loss": 5.1973, + "step": 14702 + }, + { + "epoch": 0.08744290608050242, + "grad_norm": 1.5008035898208618, + "learning_rate": 4.9062724407475255e-05, + "loss": 5.0961, + "step": 14703 + }, + { + "epoch": 0.08744885336378343, + "grad_norm": 1.6110866069793701, + "learning_rate": 4.9062597702876354e-05, + "loss": 4.7201, + "step": 14704 + }, + { + "epoch": 0.08745480064706442, + "grad_norm": 1.5154603719711304, + "learning_rate": 4.906247098987746e-05, + "loss": 4.6537, + "step": 14705 + }, + { + "epoch": 0.08746074793034542, + "grad_norm": 1.6169204711914062, + "learning_rate": 4.90623442684786e-05, + "loss": 4.512, + "step": 14706 + }, + { + "epoch": 0.08746669521362642, + "grad_norm": 1.4967073202133179, + "learning_rate": 4.9062217538679824e-05, + "loss": 4.7159, + "step": 14707 + }, + { + "epoch": 0.08747264249690741, + "grad_norm": 1.4621938467025757, + "learning_rate": 4.9062090800481174e-05, + "loss": 4.7553, + "step": 14708 + }, + { + "epoch": 0.08747858978018841, + "grad_norm": 1.694868564605713, + "learning_rate": 4.9061964053882694e-05, + "loss": 4.6801, + "step": 14709 + }, + { + "epoch": 0.08748453706346941, + "grad_norm": 1.6228396892547607, + "learning_rate": 4.906183729888444e-05, + "loss": 4.5402, + "step": 14710 + }, + { + "epoch": 0.0874904843467504, + "grad_norm": 1.388859748840332, + "learning_rate": 4.9061710535486435e-05, + "loss": 4.5645, + "step": 14711 + }, + { + "epoch": 0.0874964316300314, + "grad_norm": 1.546074390411377, + "learning_rate": 4.9061583763688746e-05, + "loss": 4.4146, + "step": 14712 + }, + { + "epoch": 0.0875023789133124, + "grad_norm": 1.5526363849639893, + "learning_rate": 4.90614569834914e-05, + "loss": 4.6027, + "step": 14713 + }, + { + "epoch": 0.08750832619659339, + "grad_norm": 1.6809604167938232, + "learning_rate": 4.9061330194894454e-05, + "loss": 4.4927, + "step": 14714 + }, + { + "epoch": 0.0875142734798744, + "grad_norm": 1.8013920783996582, + "learning_rate": 4.906120339789795e-05, + "loss": 4.6949, + "step": 14715 + }, + { + "epoch": 0.0875202207631554, + "grad_norm": 1.587863564491272, + "learning_rate": 4.906107659250192e-05, + "loss": 4.7255, + "step": 14716 + }, + { + "epoch": 0.08752616804643638, + "grad_norm": 1.4871174097061157, + "learning_rate": 4.9060949778706415e-05, + "loss": 4.6753, + "step": 14717 + }, + { + "epoch": 0.08753211532971739, + "grad_norm": 1.5521314144134521, + "learning_rate": 4.9060822956511485e-05, + "loss": 4.6963, + "step": 14718 + }, + { + "epoch": 0.08753806261299839, + "grad_norm": 1.5176832675933838, + "learning_rate": 4.906069612591717e-05, + "loss": 4.7475, + "step": 14719 + }, + { + "epoch": 0.08754400989627938, + "grad_norm": 1.7381534576416016, + "learning_rate": 4.906056928692352e-05, + "loss": 4.6952, + "step": 14720 + }, + { + "epoch": 0.08754995717956038, + "grad_norm": 1.604637622833252, + "learning_rate": 4.9060442439530564e-05, + "loss": 4.5792, + "step": 14721 + }, + { + "epoch": 0.08755590446284138, + "grad_norm": 1.6367937326431274, + "learning_rate": 4.9060315583738356e-05, + "loss": 4.6422, + "step": 14722 + }, + { + "epoch": 0.08756185174612237, + "grad_norm": 1.5177057981491089, + "learning_rate": 4.906018871954695e-05, + "loss": 4.5682, + "step": 14723 + }, + { + "epoch": 0.08756779902940337, + "grad_norm": 1.5539237260818481, + "learning_rate": 4.906006184695637e-05, + "loss": 4.5194, + "step": 14724 + }, + { + "epoch": 0.08757374631268437, + "grad_norm": 1.7041072845458984, + "learning_rate": 4.905993496596668e-05, + "loss": 4.6526, + "step": 14725 + }, + { + "epoch": 0.08757969359596536, + "grad_norm": 1.7187644243240356, + "learning_rate": 4.9059808076577914e-05, + "loss": 4.6251, + "step": 14726 + }, + { + "epoch": 0.08758564087924636, + "grad_norm": 1.6393675804138184, + "learning_rate": 4.905968117879012e-05, + "loss": 4.7242, + "step": 14727 + }, + { + "epoch": 0.08759158816252735, + "grad_norm": 1.6426397562026978, + "learning_rate": 4.905955427260333e-05, + "loss": 4.6272, + "step": 14728 + }, + { + "epoch": 0.08759753544580835, + "grad_norm": 1.3231829404830933, + "learning_rate": 4.9059427358017605e-05, + "loss": 4.621, + "step": 14729 + }, + { + "epoch": 0.08760348272908935, + "grad_norm": 1.3970234394073486, + "learning_rate": 4.905930043503298e-05, + "loss": 4.6356, + "step": 14730 + }, + { + "epoch": 0.08760943001237034, + "grad_norm": 1.511977195739746, + "learning_rate": 4.90591735036495e-05, + "loss": 4.7408, + "step": 14731 + }, + { + "epoch": 0.08761537729565134, + "grad_norm": 1.284788727760315, + "learning_rate": 4.9059046563867216e-05, + "loss": 5.2573, + "step": 14732 + }, + { + "epoch": 0.08762132457893235, + "grad_norm": 1.5148005485534668, + "learning_rate": 4.905891961568617e-05, + "loss": 5.0465, + "step": 14733 + }, + { + "epoch": 0.08762727186221334, + "grad_norm": 1.3727401494979858, + "learning_rate": 4.905879265910639e-05, + "loss": 5.0424, + "step": 14734 + }, + { + "epoch": 0.08763321914549434, + "grad_norm": 1.4994157552719116, + "learning_rate": 4.9058665694127945e-05, + "loss": 5.1662, + "step": 14735 + }, + { + "epoch": 0.08763916642877534, + "grad_norm": 1.5002670288085938, + "learning_rate": 4.905853872075087e-05, + "loss": 5.0872, + "step": 14736 + }, + { + "epoch": 0.08764511371205633, + "grad_norm": 1.580439567565918, + "learning_rate": 4.90584117389752e-05, + "loss": 5.1315, + "step": 14737 + }, + { + "epoch": 0.08765106099533733, + "grad_norm": 1.416154384613037, + "learning_rate": 4.9058284748801e-05, + "loss": 5.1066, + "step": 14738 + }, + { + "epoch": 0.08765700827861833, + "grad_norm": 1.5391058921813965, + "learning_rate": 4.905815775022828e-05, + "loss": 5.1724, + "step": 14739 + }, + { + "epoch": 0.08766295556189932, + "grad_norm": 1.20875883102417, + "learning_rate": 4.905803074325712e-05, + "loss": 5.152, + "step": 14740 + }, + { + "epoch": 0.08766890284518032, + "grad_norm": 1.27827787399292, + "learning_rate": 4.9057903727887556e-05, + "loss": 5.0271, + "step": 14741 + }, + { + "epoch": 0.08767485012846132, + "grad_norm": 1.1356613636016846, + "learning_rate": 4.9057776704119615e-05, + "loss": 5.0078, + "step": 14742 + }, + { + "epoch": 0.08768079741174231, + "grad_norm": 1.3931230306625366, + "learning_rate": 4.9057649671953355e-05, + "loss": 5.1253, + "step": 14743 + }, + { + "epoch": 0.08768674469502331, + "grad_norm": 1.553105115890503, + "learning_rate": 4.905752263138882e-05, + "loss": 5.1259, + "step": 14744 + }, + { + "epoch": 0.08769269197830432, + "grad_norm": 1.4004448652267456, + "learning_rate": 4.905739558242605e-05, + "loss": 5.1104, + "step": 14745 + }, + { + "epoch": 0.0876986392615853, + "grad_norm": 1.6295247077941895, + "learning_rate": 4.905726852506509e-05, + "loss": 5.0718, + "step": 14746 + }, + { + "epoch": 0.0877045865448663, + "grad_norm": 1.5966804027557373, + "learning_rate": 4.9057141459306e-05, + "loss": 5.1922, + "step": 14747 + }, + { + "epoch": 0.08771053382814731, + "grad_norm": 1.5448883771896362, + "learning_rate": 4.9057014385148795e-05, + "loss": 4.9715, + "step": 14748 + }, + { + "epoch": 0.0877164811114283, + "grad_norm": 1.5252676010131836, + "learning_rate": 4.905688730259354e-05, + "loss": 5.2128, + "step": 14749 + }, + { + "epoch": 0.0877224283947093, + "grad_norm": 1.387237310409546, + "learning_rate": 4.9056760211640274e-05, + "loss": 5.0933, + "step": 14750 + }, + { + "epoch": 0.0877283756779903, + "grad_norm": 1.3318862915039062, + "learning_rate": 4.905663311228904e-05, + "loss": 5.1849, + "step": 14751 + }, + { + "epoch": 0.08773432296127129, + "grad_norm": 1.4328356981277466, + "learning_rate": 4.905650600453989e-05, + "loss": 5.2287, + "step": 14752 + }, + { + "epoch": 0.08774027024455229, + "grad_norm": 1.4316518306732178, + "learning_rate": 4.905637888839285e-05, + "loss": 4.9774, + "step": 14753 + }, + { + "epoch": 0.08774621752783329, + "grad_norm": 1.1666837930679321, + "learning_rate": 4.9056251763847996e-05, + "loss": 5.2098, + "step": 14754 + }, + { + "epoch": 0.08775216481111428, + "grad_norm": 1.4383636713027954, + "learning_rate": 4.9056124630905333e-05, + "loss": 5.2438, + "step": 14755 + }, + { + "epoch": 0.08775811209439528, + "grad_norm": 2.6009883880615234, + "learning_rate": 4.9055997489564936e-05, + "loss": 5.7232, + "step": 14756 + }, + { + "epoch": 0.08776405937767627, + "grad_norm": 1.3072876930236816, + "learning_rate": 4.905587033982684e-05, + "loss": 5.1811, + "step": 14757 + }, + { + "epoch": 0.08777000666095727, + "grad_norm": 1.2538501024246216, + "learning_rate": 4.9055743181691084e-05, + "loss": 5.1557, + "step": 14758 + }, + { + "epoch": 0.08777595394423827, + "grad_norm": 1.2565419673919678, + "learning_rate": 4.905561601515771e-05, + "loss": 5.129, + "step": 14759 + }, + { + "epoch": 0.08778190122751926, + "grad_norm": 1.3041788339614868, + "learning_rate": 4.905548884022678e-05, + "loss": 5.2048, + "step": 14760 + }, + { + "epoch": 0.08778784851080026, + "grad_norm": 1.4548598527908325, + "learning_rate": 4.905536165689832e-05, + "loss": 5.2405, + "step": 14761 + }, + { + "epoch": 0.08779379579408127, + "grad_norm": 1.1748031377792358, + "learning_rate": 4.905523446517239e-05, + "loss": 5.1804, + "step": 14762 + }, + { + "epoch": 0.08779974307736226, + "grad_norm": 1.210534930229187, + "learning_rate": 4.905510726504902e-05, + "loss": 5.1383, + "step": 14763 + }, + { + "epoch": 0.08780569036064326, + "grad_norm": 1.2154903411865234, + "learning_rate": 4.9054980056528264e-05, + "loss": 5.2757, + "step": 14764 + }, + { + "epoch": 0.08781163764392426, + "grad_norm": 1.4123867750167847, + "learning_rate": 4.9054852839610166e-05, + "loss": 5.1268, + "step": 14765 + }, + { + "epoch": 0.08781758492720525, + "grad_norm": 1.3136295080184937, + "learning_rate": 4.905472561429476e-05, + "loss": 5.2186, + "step": 14766 + }, + { + "epoch": 0.08782353221048625, + "grad_norm": 1.2741068601608276, + "learning_rate": 4.905459838058209e-05, + "loss": 4.9737, + "step": 14767 + }, + { + "epoch": 0.08782947949376725, + "grad_norm": 1.2963054180145264, + "learning_rate": 4.9054471138472225e-05, + "loss": 5.1712, + "step": 14768 + }, + { + "epoch": 0.08783542677704824, + "grad_norm": 1.5352611541748047, + "learning_rate": 4.905434388796519e-05, + "loss": 4.9473, + "step": 14769 + }, + { + "epoch": 0.08784137406032924, + "grad_norm": 1.3399711847305298, + "learning_rate": 4.905421662906103e-05, + "loss": 5.2402, + "step": 14770 + }, + { + "epoch": 0.08784732134361024, + "grad_norm": 1.4278292655944824, + "learning_rate": 4.9054089361759794e-05, + "loss": 4.9331, + "step": 14771 + }, + { + "epoch": 0.08785326862689123, + "grad_norm": 1.5057200193405151, + "learning_rate": 4.905396208606151e-05, + "loss": 5.1553, + "step": 14772 + }, + { + "epoch": 0.08785921591017223, + "grad_norm": 1.4660797119140625, + "learning_rate": 4.905383480196625e-05, + "loss": 5.0792, + "step": 14773 + }, + { + "epoch": 0.08786516319345324, + "grad_norm": 1.4386217594146729, + "learning_rate": 4.905370750947405e-05, + "loss": 4.8363, + "step": 14774 + }, + { + "epoch": 0.08787111047673422, + "grad_norm": 1.4555455446243286, + "learning_rate": 4.905358020858493e-05, + "loss": 4.8934, + "step": 14775 + }, + { + "epoch": 0.08787705776001523, + "grad_norm": 1.5161443948745728, + "learning_rate": 4.905345289929897e-05, + "loss": 4.8227, + "step": 14776 + }, + { + "epoch": 0.08788300504329623, + "grad_norm": 1.2704185247421265, + "learning_rate": 4.9053325581616185e-05, + "loss": 4.9612, + "step": 14777 + }, + { + "epoch": 0.08788895232657722, + "grad_norm": 1.6396795511245728, + "learning_rate": 4.905319825553664e-05, + "loss": 4.8947, + "step": 14778 + }, + { + "epoch": 0.08789489960985822, + "grad_norm": 1.49285888671875, + "learning_rate": 4.905307092106037e-05, + "loss": 5.0814, + "step": 14779 + }, + { + "epoch": 0.08790084689313922, + "grad_norm": 1.3829785585403442, + "learning_rate": 4.9052943578187424e-05, + "loss": 5.3864, + "step": 14780 + }, + { + "epoch": 0.08790679417642021, + "grad_norm": 1.517054557800293, + "learning_rate": 4.905281622691784e-05, + "loss": 5.3053, + "step": 14781 + }, + { + "epoch": 0.08791274145970121, + "grad_norm": 1.491402506828308, + "learning_rate": 4.905268886725167e-05, + "loss": 5.3685, + "step": 14782 + }, + { + "epoch": 0.08791868874298221, + "grad_norm": 1.5034211874008179, + "learning_rate": 4.905256149918895e-05, + "loss": 5.2139, + "step": 14783 + }, + { + "epoch": 0.0879246360262632, + "grad_norm": 1.4021977186203003, + "learning_rate": 4.905243412272974e-05, + "loss": 5.301, + "step": 14784 + }, + { + "epoch": 0.0879305833095442, + "grad_norm": 1.44327974319458, + "learning_rate": 4.9052306737874064e-05, + "loss": 5.296, + "step": 14785 + }, + { + "epoch": 0.08793653059282519, + "grad_norm": 1.4733220338821411, + "learning_rate": 4.905217934462198e-05, + "loss": 5.3302, + "step": 14786 + }, + { + "epoch": 0.08794247787610619, + "grad_norm": 1.3308794498443604, + "learning_rate": 4.9052051942973533e-05, + "loss": 5.1835, + "step": 14787 + }, + { + "epoch": 0.0879484251593872, + "grad_norm": 1.2667236328125, + "learning_rate": 4.905192453292876e-05, + "loss": 5.1801, + "step": 14788 + }, + { + "epoch": 0.08795437244266818, + "grad_norm": 1.3284921646118164, + "learning_rate": 4.90517971144877e-05, + "loss": 5.106, + "step": 14789 + }, + { + "epoch": 0.08796031972594918, + "grad_norm": 1.4089261293411255, + "learning_rate": 4.9051669687650415e-05, + "loss": 5.133, + "step": 14790 + }, + { + "epoch": 0.08796626700923019, + "grad_norm": 1.1701233386993408, + "learning_rate": 4.905154225241694e-05, + "loss": 5.1602, + "step": 14791 + }, + { + "epoch": 0.08797221429251117, + "grad_norm": 1.169570803642273, + "learning_rate": 4.9051414808787324e-05, + "loss": 5.1231, + "step": 14792 + }, + { + "epoch": 0.08797816157579218, + "grad_norm": 1.5104409456253052, + "learning_rate": 4.90512873567616e-05, + "loss": 5.0774, + "step": 14793 + }, + { + "epoch": 0.08798410885907318, + "grad_norm": 1.3065992593765259, + "learning_rate": 4.9051159896339816e-05, + "loss": 4.9547, + "step": 14794 + }, + { + "epoch": 0.08799005614235417, + "grad_norm": 1.6417936086654663, + "learning_rate": 4.905103242752203e-05, + "loss": 5.2734, + "step": 14795 + }, + { + "epoch": 0.08799600342563517, + "grad_norm": 2.1529974937438965, + "learning_rate": 4.905090495030827e-05, + "loss": 5.1999, + "step": 14796 + }, + { + "epoch": 0.08800195070891617, + "grad_norm": 1.6746312379837036, + "learning_rate": 4.90507774646986e-05, + "loss": 4.959, + "step": 14797 + }, + { + "epoch": 0.08800789799219716, + "grad_norm": 1.4422825574874878, + "learning_rate": 4.905064997069304e-05, + "loss": 5.0581, + "step": 14798 + }, + { + "epoch": 0.08801384527547816, + "grad_norm": 1.658833622932434, + "learning_rate": 4.9050522468291646e-05, + "loss": 4.9591, + "step": 14799 + }, + { + "epoch": 0.08801979255875916, + "grad_norm": 1.4971596002578735, + "learning_rate": 4.9050394957494464e-05, + "loss": 5.2515, + "step": 14800 + }, + { + "epoch": 0.08802573984204015, + "grad_norm": 1.5866429805755615, + "learning_rate": 4.9050267438301546e-05, + "loss": 5.1084, + "step": 14801 + }, + { + "epoch": 0.08803168712532115, + "grad_norm": 1.5049015283584595, + "learning_rate": 4.9050139910712925e-05, + "loss": 5.1102, + "step": 14802 + }, + { + "epoch": 0.08803763440860216, + "grad_norm": 1.6711664199829102, + "learning_rate": 4.905001237472864e-05, + "loss": 5.0215, + "step": 14803 + }, + { + "epoch": 0.08804358169188314, + "grad_norm": 1.6390610933303833, + "learning_rate": 4.904988483034875e-05, + "loss": 4.978, + "step": 14804 + }, + { + "epoch": 0.08804952897516415, + "grad_norm": 1.5968292951583862, + "learning_rate": 4.9049757277573295e-05, + "loss": 5.0183, + "step": 14805 + }, + { + "epoch": 0.08805547625844515, + "grad_norm": 1.4864193201065063, + "learning_rate": 4.9049629716402325e-05, + "loss": 5.5199, + "step": 14806 + }, + { + "epoch": 0.08806142354172614, + "grad_norm": 1.5658420324325562, + "learning_rate": 4.904950214683587e-05, + "loss": 5.4906, + "step": 14807 + }, + { + "epoch": 0.08806737082500714, + "grad_norm": 1.5811707973480225, + "learning_rate": 4.9049374568873975e-05, + "loss": 5.5795, + "step": 14808 + }, + { + "epoch": 0.08807331810828814, + "grad_norm": 1.418641448020935, + "learning_rate": 4.90492469825167e-05, + "loss": 5.3616, + "step": 14809 + }, + { + "epoch": 0.08807926539156913, + "grad_norm": 1.323500633239746, + "learning_rate": 4.904911938776408e-05, + "loss": 5.2641, + "step": 14810 + }, + { + "epoch": 0.08808521267485013, + "grad_norm": 1.590867280960083, + "learning_rate": 4.904899178461616e-05, + "loss": 5.3782, + "step": 14811 + }, + { + "epoch": 0.08809115995813113, + "grad_norm": 1.243213176727295, + "learning_rate": 4.904886417307299e-05, + "loss": 5.4743, + "step": 14812 + }, + { + "epoch": 0.08809710724141212, + "grad_norm": 1.5051169395446777, + "learning_rate": 4.9048736553134614e-05, + "loss": 5.3046, + "step": 14813 + }, + { + "epoch": 0.08810305452469312, + "grad_norm": 1.334234356880188, + "learning_rate": 4.904860892480106e-05, + "loss": 5.2673, + "step": 14814 + }, + { + "epoch": 0.08810900180797411, + "grad_norm": 1.4352458715438843, + "learning_rate": 4.904848128807239e-05, + "loss": 5.3465, + "step": 14815 + }, + { + "epoch": 0.08811494909125511, + "grad_norm": 1.6878329515457153, + "learning_rate": 4.904835364294864e-05, + "loss": 5.3467, + "step": 14816 + }, + { + "epoch": 0.08812089637453611, + "grad_norm": 1.542100191116333, + "learning_rate": 4.904822598942986e-05, + "loss": 5.4147, + "step": 14817 + }, + { + "epoch": 0.0881268436578171, + "grad_norm": 1.5099046230316162, + "learning_rate": 4.90480983275161e-05, + "loss": 5.7198, + "step": 14818 + }, + { + "epoch": 0.0881327909410981, + "grad_norm": 1.6120097637176514, + "learning_rate": 4.9047970657207395e-05, + "loss": 5.4417, + "step": 14819 + }, + { + "epoch": 0.0881387382243791, + "grad_norm": 1.455407977104187, + "learning_rate": 4.904784297850379e-05, + "loss": 5.3028, + "step": 14820 + }, + { + "epoch": 0.0881446855076601, + "grad_norm": 1.589712381362915, + "learning_rate": 4.904771529140533e-05, + "loss": 5.2493, + "step": 14821 + }, + { + "epoch": 0.0881506327909411, + "grad_norm": 1.5051584243774414, + "learning_rate": 4.904758759591206e-05, + "loss": 5.2225, + "step": 14822 + }, + { + "epoch": 0.0881565800742221, + "grad_norm": 1.3623727560043335, + "learning_rate": 4.9047459892024026e-05, + "loss": 5.1738, + "step": 14823 + }, + { + "epoch": 0.08816252735750309, + "grad_norm": 1.4643206596374512, + "learning_rate": 4.9047332179741274e-05, + "loss": 5.123, + "step": 14824 + }, + { + "epoch": 0.08816847464078409, + "grad_norm": 1.4233453273773193, + "learning_rate": 4.904720445906384e-05, + "loss": 4.9263, + "step": 14825 + }, + { + "epoch": 0.08817442192406509, + "grad_norm": 1.6479318141937256, + "learning_rate": 4.9047076729991786e-05, + "loss": 4.9663, + "step": 14826 + }, + { + "epoch": 0.08818036920734608, + "grad_norm": 1.4759633541107178, + "learning_rate": 4.9046948992525145e-05, + "loss": 5.0326, + "step": 14827 + }, + { + "epoch": 0.08818631649062708, + "grad_norm": 1.435533046722412, + "learning_rate": 4.904682124666395e-05, + "loss": 5.0819, + "step": 14828 + }, + { + "epoch": 0.08819226377390808, + "grad_norm": 1.4540610313415527, + "learning_rate": 4.904669349240827e-05, + "loss": 5.391, + "step": 14829 + }, + { + "epoch": 0.08819821105718907, + "grad_norm": 1.6308038234710693, + "learning_rate": 4.904656572975814e-05, + "loss": 4.9723, + "step": 14830 + }, + { + "epoch": 0.08820415834047007, + "grad_norm": 1.453600287437439, + "learning_rate": 4.90464379587136e-05, + "loss": 5.1689, + "step": 14831 + }, + { + "epoch": 0.08821010562375108, + "grad_norm": 1.4876199960708618, + "learning_rate": 4.904631017927469e-05, + "loss": 5.1163, + "step": 14832 + }, + { + "epoch": 0.08821605290703206, + "grad_norm": 1.4240463972091675, + "learning_rate": 4.9046182391441466e-05, + "loss": 5.1154, + "step": 14833 + }, + { + "epoch": 0.08822200019031307, + "grad_norm": 1.4176205396652222, + "learning_rate": 4.904605459521397e-05, + "loss": 5.1587, + "step": 14834 + }, + { + "epoch": 0.08822794747359407, + "grad_norm": 1.302998423576355, + "learning_rate": 4.9045926790592244e-05, + "loss": 5.1302, + "step": 14835 + }, + { + "epoch": 0.08823389475687506, + "grad_norm": 1.4490020275115967, + "learning_rate": 4.904579897757633e-05, + "loss": 5.0817, + "step": 14836 + }, + { + "epoch": 0.08823984204015606, + "grad_norm": 1.4430203437805176, + "learning_rate": 4.9045671156166276e-05, + "loss": 5.1334, + "step": 14837 + }, + { + "epoch": 0.08824578932343706, + "grad_norm": 1.326277494430542, + "learning_rate": 4.9045543326362134e-05, + "loss": 5.3292, + "step": 14838 + }, + { + "epoch": 0.08825173660671805, + "grad_norm": 1.373415470123291, + "learning_rate": 4.9045415488163936e-05, + "loss": 5.454, + "step": 14839 + }, + { + "epoch": 0.08825768388999905, + "grad_norm": 1.4334250688552856, + "learning_rate": 4.904528764157173e-05, + "loss": 5.2735, + "step": 14840 + }, + { + "epoch": 0.08826363117328005, + "grad_norm": 1.4029041528701782, + "learning_rate": 4.904515978658556e-05, + "loss": 5.0549, + "step": 14841 + }, + { + "epoch": 0.08826957845656104, + "grad_norm": 1.355177879333496, + "learning_rate": 4.904503192320548e-05, + "loss": 5.2569, + "step": 14842 + }, + { + "epoch": 0.08827552573984204, + "grad_norm": 1.2063989639282227, + "learning_rate": 4.904490405143153e-05, + "loss": 5.2469, + "step": 14843 + }, + { + "epoch": 0.08828147302312303, + "grad_norm": 1.2290265560150146, + "learning_rate": 4.904477617126374e-05, + "loss": 5.255, + "step": 14844 + }, + { + "epoch": 0.08828742030640403, + "grad_norm": 1.0648494958877563, + "learning_rate": 4.904464828270218e-05, + "loss": 5.2423, + "step": 14845 + }, + { + "epoch": 0.08829336758968503, + "grad_norm": 1.362572431564331, + "learning_rate": 4.904452038574687e-05, + "loss": 5.3856, + "step": 14846 + }, + { + "epoch": 0.08829931487296602, + "grad_norm": 1.3004114627838135, + "learning_rate": 4.9044392480397886e-05, + "loss": 5.0672, + "step": 14847 + }, + { + "epoch": 0.08830526215624702, + "grad_norm": 1.4852789640426636, + "learning_rate": 4.904426456665523e-05, + "loss": 5.2145, + "step": 14848 + }, + { + "epoch": 0.08831120943952803, + "grad_norm": 1.4221493005752563, + "learning_rate": 4.9044136644518976e-05, + "loss": 5.4544, + "step": 14849 + }, + { + "epoch": 0.08831715672280901, + "grad_norm": 1.4444363117218018, + "learning_rate": 4.904400871398917e-05, + "loss": 5.3342, + "step": 14850 + }, + { + "epoch": 0.08832310400609002, + "grad_norm": 1.1723617315292358, + "learning_rate": 4.904388077506585e-05, + "loss": 5.3846, + "step": 14851 + }, + { + "epoch": 0.08832905128937102, + "grad_norm": 1.3458356857299805, + "learning_rate": 4.904375282774905e-05, + "loss": 5.3903, + "step": 14852 + }, + { + "epoch": 0.08833499857265201, + "grad_norm": 1.4839876890182495, + "learning_rate": 4.904362487203883e-05, + "loss": 5.0889, + "step": 14853 + }, + { + "epoch": 0.08834094585593301, + "grad_norm": 1.6487696170806885, + "learning_rate": 4.904349690793523e-05, + "loss": 5.0904, + "step": 14854 + }, + { + "epoch": 0.08834689313921401, + "grad_norm": 1.5201997756958008, + "learning_rate": 4.904336893543829e-05, + "loss": 4.9017, + "step": 14855 + }, + { + "epoch": 0.088352840422495, + "grad_norm": 1.5502886772155762, + "learning_rate": 4.904324095454806e-05, + "loss": 4.931, + "step": 14856 + }, + { + "epoch": 0.088358787705776, + "grad_norm": 1.4996228218078613, + "learning_rate": 4.904311296526458e-05, + "loss": 5.0773, + "step": 14857 + }, + { + "epoch": 0.088364734989057, + "grad_norm": 1.7004456520080566, + "learning_rate": 4.90429849675879e-05, + "loss": 4.9913, + "step": 14858 + }, + { + "epoch": 0.08837068227233799, + "grad_norm": 1.426007866859436, + "learning_rate": 4.904285696151806e-05, + "loss": 5.1312, + "step": 14859 + }, + { + "epoch": 0.088376629555619, + "grad_norm": 1.4049350023269653, + "learning_rate": 4.904272894705512e-05, + "loss": 5.0539, + "step": 14860 + }, + { + "epoch": 0.0883825768389, + "grad_norm": 1.558273434638977, + "learning_rate": 4.9042600924199096e-05, + "loss": 5.0822, + "step": 14861 + }, + { + "epoch": 0.08838852412218098, + "grad_norm": 1.6177934408187866, + "learning_rate": 4.9042472892950055e-05, + "loss": 5.1646, + "step": 14862 + }, + { + "epoch": 0.08839447140546199, + "grad_norm": 1.5152839422225952, + "learning_rate": 4.904234485330803e-05, + "loss": 5.0144, + "step": 14863 + }, + { + "epoch": 0.08840041868874299, + "grad_norm": 1.474231243133545, + "learning_rate": 4.904221680527308e-05, + "loss": 5.1063, + "step": 14864 + }, + { + "epoch": 0.08840636597202398, + "grad_norm": 1.5897177457809448, + "learning_rate": 4.904208874884523e-05, + "loss": 4.9724, + "step": 14865 + }, + { + "epoch": 0.08841231325530498, + "grad_norm": 1.604368805885315, + "learning_rate": 4.904196068402454e-05, + "loss": 4.8905, + "step": 14866 + }, + { + "epoch": 0.08841826053858598, + "grad_norm": 1.338458776473999, + "learning_rate": 4.904183261081105e-05, + "loss": 4.7829, + "step": 14867 + }, + { + "epoch": 0.08842420782186697, + "grad_norm": 1.62189781665802, + "learning_rate": 4.9041704529204806e-05, + "loss": 4.8025, + "step": 14868 + }, + { + "epoch": 0.08843015510514797, + "grad_norm": 1.555298089981079, + "learning_rate": 4.904157643920585e-05, + "loss": 4.9098, + "step": 14869 + }, + { + "epoch": 0.08843610238842897, + "grad_norm": 1.5110834836959839, + "learning_rate": 4.904144834081423e-05, + "loss": 4.8648, + "step": 14870 + }, + { + "epoch": 0.08844204967170996, + "grad_norm": 1.59073805809021, + "learning_rate": 4.904132023402999e-05, + "loss": 4.8997, + "step": 14871 + }, + { + "epoch": 0.08844799695499096, + "grad_norm": 1.5218732357025146, + "learning_rate": 4.904119211885316e-05, + "loss": 5.352, + "step": 14872 + }, + { + "epoch": 0.08845394423827196, + "grad_norm": 1.5263079404830933, + "learning_rate": 4.904106399528382e-05, + "loss": 4.8921, + "step": 14873 + }, + { + "epoch": 0.08845989152155295, + "grad_norm": 1.6151986122131348, + "learning_rate": 4.904093586332198e-05, + "loss": 5.0086, + "step": 14874 + }, + { + "epoch": 0.08846583880483395, + "grad_norm": 1.4971787929534912, + "learning_rate": 4.90408077229677e-05, + "loss": 5.0119, + "step": 14875 + }, + { + "epoch": 0.08847178608811494, + "grad_norm": 1.4897308349609375, + "learning_rate": 4.904067957422102e-05, + "loss": 5.0175, + "step": 14876 + }, + { + "epoch": 0.08847773337139594, + "grad_norm": 1.4023786783218384, + "learning_rate": 4.904055141708199e-05, + "loss": 5.0361, + "step": 14877 + }, + { + "epoch": 0.08848368065467695, + "grad_norm": 1.4664498567581177, + "learning_rate": 4.904042325155065e-05, + "loss": 4.9784, + "step": 14878 + }, + { + "epoch": 0.08848962793795793, + "grad_norm": 1.390824556350708, + "learning_rate": 4.904029507762704e-05, + "loss": 4.9922, + "step": 14879 + }, + { + "epoch": 0.08849557522123894, + "grad_norm": 1.9508315324783325, + "learning_rate": 4.904016689531122e-05, + "loss": 5.6352, + "step": 14880 + }, + { + "epoch": 0.08850152250451994, + "grad_norm": 1.4192322492599487, + "learning_rate": 4.904003870460323e-05, + "loss": 5.0654, + "step": 14881 + }, + { + "epoch": 0.08850746978780093, + "grad_norm": 1.5868372917175293, + "learning_rate": 4.903991050550311e-05, + "loss": 4.9631, + "step": 14882 + }, + { + "epoch": 0.08851341707108193, + "grad_norm": 1.405555009841919, + "learning_rate": 4.903978229801089e-05, + "loss": 5.1311, + "step": 14883 + }, + { + "epoch": 0.08851936435436293, + "grad_norm": 1.453817367553711, + "learning_rate": 4.9039654082126646e-05, + "loss": 5.0866, + "step": 14884 + }, + { + "epoch": 0.08852531163764392, + "grad_norm": 1.5051809549331665, + "learning_rate": 4.9039525857850404e-05, + "loss": 5.1606, + "step": 14885 + }, + { + "epoch": 0.08853125892092492, + "grad_norm": 1.5323255062103271, + "learning_rate": 4.9039397625182206e-05, + "loss": 5.1564, + "step": 14886 + }, + { + "epoch": 0.08853720620420592, + "grad_norm": 1.5018506050109863, + "learning_rate": 4.903926938412211e-05, + "loss": 4.9672, + "step": 14887 + }, + { + "epoch": 0.08854315348748691, + "grad_norm": 1.488289713859558, + "learning_rate": 4.903914113467015e-05, + "loss": 4.882, + "step": 14888 + }, + { + "epoch": 0.08854910077076791, + "grad_norm": 1.434045672416687, + "learning_rate": 4.903901287682637e-05, + "loss": 5.0748, + "step": 14889 + }, + { + "epoch": 0.08855504805404892, + "grad_norm": 1.5172244310379028, + "learning_rate": 4.903888461059083e-05, + "loss": 5.065, + "step": 14890 + }, + { + "epoch": 0.0885609953373299, + "grad_norm": 1.545283555984497, + "learning_rate": 4.903875633596356e-05, + "loss": 5.2187, + "step": 14891 + }, + { + "epoch": 0.0885669426206109, + "grad_norm": 1.3149688243865967, + "learning_rate": 4.90386280529446e-05, + "loss": 4.9977, + "step": 14892 + }, + { + "epoch": 0.08857288990389191, + "grad_norm": 1.4925106763839722, + "learning_rate": 4.903849976153401e-05, + "loss": 5.0622, + "step": 14893 + }, + { + "epoch": 0.0885788371871729, + "grad_norm": 1.6073296070098877, + "learning_rate": 4.903837146173183e-05, + "loss": 5.0823, + "step": 14894 + }, + { + "epoch": 0.0885847844704539, + "grad_norm": 1.2879148721694946, + "learning_rate": 4.9038243153538096e-05, + "loss": 5.1574, + "step": 14895 + }, + { + "epoch": 0.0885907317537349, + "grad_norm": 1.6396079063415527, + "learning_rate": 4.903811483695287e-05, + "loss": 5.1748, + "step": 14896 + }, + { + "epoch": 0.08859667903701589, + "grad_norm": 1.426180124282837, + "learning_rate": 4.903798651197618e-05, + "loss": 5.0374, + "step": 14897 + }, + { + "epoch": 0.08860262632029689, + "grad_norm": 1.3685684204101562, + "learning_rate": 4.9037858178608076e-05, + "loss": 4.9373, + "step": 14898 + }, + { + "epoch": 0.08860857360357789, + "grad_norm": 1.5495455265045166, + "learning_rate": 4.903772983684861e-05, + "loss": 5.0696, + "step": 14899 + }, + { + "epoch": 0.08861452088685888, + "grad_norm": 1.4423854351043701, + "learning_rate": 4.9037601486697815e-05, + "loss": 5.1359, + "step": 14900 + }, + { + "epoch": 0.08862046817013988, + "grad_norm": 1.4704400300979614, + "learning_rate": 4.9037473128155745e-05, + "loss": 5.0438, + "step": 14901 + }, + { + "epoch": 0.08862641545342088, + "grad_norm": 1.49704909324646, + "learning_rate": 4.903734476122244e-05, + "loss": 5.0305, + "step": 14902 + }, + { + "epoch": 0.08863236273670187, + "grad_norm": 1.3732075691223145, + "learning_rate": 4.903721638589795e-05, + "loss": 4.9659, + "step": 14903 + }, + { + "epoch": 0.08863831001998287, + "grad_norm": 1.5920335054397583, + "learning_rate": 4.903708800218231e-05, + "loss": 4.9936, + "step": 14904 + }, + { + "epoch": 0.08864425730326386, + "grad_norm": 1.6084437370300293, + "learning_rate": 4.9036959610075575e-05, + "loss": 5.0048, + "step": 14905 + }, + { + "epoch": 0.08865020458654486, + "grad_norm": 1.2329050302505493, + "learning_rate": 4.903683120957778e-05, + "loss": 4.9729, + "step": 14906 + }, + { + "epoch": 0.08865615186982587, + "grad_norm": 1.4001328945159912, + "learning_rate": 4.903670280068898e-05, + "loss": 4.9577, + "step": 14907 + }, + { + "epoch": 0.08866209915310685, + "grad_norm": 1.3499484062194824, + "learning_rate": 4.903657438340921e-05, + "loss": 4.8696, + "step": 14908 + }, + { + "epoch": 0.08866804643638786, + "grad_norm": 1.3606812953948975, + "learning_rate": 4.903644595773853e-05, + "loss": 4.9142, + "step": 14909 + }, + { + "epoch": 0.08867399371966886, + "grad_norm": 1.3275173902511597, + "learning_rate": 4.9036317523676964e-05, + "loss": 5.032, + "step": 14910 + }, + { + "epoch": 0.08867994100294985, + "grad_norm": 1.5485349893569946, + "learning_rate": 4.903618908122458e-05, + "loss": 4.9252, + "step": 14911 + }, + { + "epoch": 0.08868588828623085, + "grad_norm": 1.4325098991394043, + "learning_rate": 4.9036060630381395e-05, + "loss": 4.9971, + "step": 14912 + }, + { + "epoch": 0.08869183556951185, + "grad_norm": 1.4953216314315796, + "learning_rate": 4.903593217114748e-05, + "loss": 4.8228, + "step": 14913 + }, + { + "epoch": 0.08869778285279284, + "grad_norm": 1.4761654138565063, + "learning_rate": 4.9035803703522876e-05, + "loss": 4.8365, + "step": 14914 + }, + { + "epoch": 0.08870373013607384, + "grad_norm": 1.3572559356689453, + "learning_rate": 4.9035675227507615e-05, + "loss": 4.8409, + "step": 14915 + }, + { + "epoch": 0.08870967741935484, + "grad_norm": 1.3793766498565674, + "learning_rate": 4.903554674310175e-05, + "loss": 4.8748, + "step": 14916 + }, + { + "epoch": 0.08871562470263583, + "grad_norm": 1.2097266912460327, + "learning_rate": 4.9035418250305314e-05, + "loss": 4.9695, + "step": 14917 + }, + { + "epoch": 0.08872157198591683, + "grad_norm": 1.5097788572311401, + "learning_rate": 4.903528974911837e-05, + "loss": 4.9205, + "step": 14918 + }, + { + "epoch": 0.08872751926919784, + "grad_norm": 1.474219560623169, + "learning_rate": 4.903516123954095e-05, + "loss": 4.9382, + "step": 14919 + }, + { + "epoch": 0.08873346655247882, + "grad_norm": 1.4695779085159302, + "learning_rate": 4.903503272157311e-05, + "loss": 5.1486, + "step": 14920 + }, + { + "epoch": 0.08873941383575983, + "grad_norm": 1.6874669790267944, + "learning_rate": 4.903490419521488e-05, + "loss": 5.6441, + "step": 14921 + }, + { + "epoch": 0.08874536111904083, + "grad_norm": 1.5862348079681396, + "learning_rate": 4.903477566046632e-05, + "loss": 5.1457, + "step": 14922 + }, + { + "epoch": 0.08875130840232182, + "grad_norm": 1.5781593322753906, + "learning_rate": 4.903464711732747e-05, + "loss": 4.915, + "step": 14923 + }, + { + "epoch": 0.08875725568560282, + "grad_norm": 1.5252950191497803, + "learning_rate": 4.903451856579837e-05, + "loss": 5.0672, + "step": 14924 + }, + { + "epoch": 0.08876320296888382, + "grad_norm": 1.575958013534546, + "learning_rate": 4.9034390005879065e-05, + "loss": 4.9914, + "step": 14925 + }, + { + "epoch": 0.08876915025216481, + "grad_norm": 1.3837618827819824, + "learning_rate": 4.90342614375696e-05, + "loss": 5.1778, + "step": 14926 + }, + { + "epoch": 0.08877509753544581, + "grad_norm": 1.4716275930404663, + "learning_rate": 4.9034132860870036e-05, + "loss": 5.2625, + "step": 14927 + }, + { + "epoch": 0.08878104481872681, + "grad_norm": 1.2883623838424683, + "learning_rate": 4.90340042757804e-05, + "loss": 5.2357, + "step": 14928 + }, + { + "epoch": 0.0887869921020078, + "grad_norm": 1.521010398864746, + "learning_rate": 4.9033875682300736e-05, + "loss": 5.4941, + "step": 14929 + }, + { + "epoch": 0.0887929393852888, + "grad_norm": 1.5457875728607178, + "learning_rate": 4.903374708043109e-05, + "loss": 5.3108, + "step": 14930 + }, + { + "epoch": 0.0887988866685698, + "grad_norm": 1.4583250284194946, + "learning_rate": 4.903361847017152e-05, + "loss": 5.425, + "step": 14931 + }, + { + "epoch": 0.08880483395185079, + "grad_norm": 1.561854362487793, + "learning_rate": 4.903348985152206e-05, + "loss": 5.4267, + "step": 14932 + }, + { + "epoch": 0.0888107812351318, + "grad_norm": 1.6274350881576538, + "learning_rate": 4.9033361224482756e-05, + "loss": 5.3266, + "step": 14933 + }, + { + "epoch": 0.08881672851841278, + "grad_norm": 1.3476616144180298, + "learning_rate": 4.903323258905366e-05, + "loss": 5.248, + "step": 14934 + }, + { + "epoch": 0.08882267580169378, + "grad_norm": 1.3584541082382202, + "learning_rate": 4.90331039452348e-05, + "loss": 5.3101, + "step": 14935 + }, + { + "epoch": 0.08882862308497479, + "grad_norm": 1.5269302129745483, + "learning_rate": 4.903297529302624e-05, + "loss": 5.3451, + "step": 14936 + }, + { + "epoch": 0.08883457036825577, + "grad_norm": 1.5320923328399658, + "learning_rate": 4.903284663242801e-05, + "loss": 5.4289, + "step": 14937 + }, + { + "epoch": 0.08884051765153678, + "grad_norm": 1.5647650957107544, + "learning_rate": 4.9032717963440166e-05, + "loss": 5.2925, + "step": 14938 + }, + { + "epoch": 0.08884646493481778, + "grad_norm": 1.3379693031311035, + "learning_rate": 4.9032589286062744e-05, + "loss": 5.3314, + "step": 14939 + }, + { + "epoch": 0.08885241221809877, + "grad_norm": 1.5872068405151367, + "learning_rate": 4.90324606002958e-05, + "loss": 5.3521, + "step": 14940 + }, + { + "epoch": 0.08885835950137977, + "grad_norm": 1.473799228668213, + "learning_rate": 4.9032331906139373e-05, + "loss": 5.3697, + "step": 14941 + }, + { + "epoch": 0.08886430678466077, + "grad_norm": 2.2111928462982178, + "learning_rate": 4.90322032035935e-05, + "loss": 5.0139, + "step": 14942 + }, + { + "epoch": 0.08887025406794176, + "grad_norm": 1.386910319328308, + "learning_rate": 4.903207449265824e-05, + "loss": 5.3982, + "step": 14943 + }, + { + "epoch": 0.08887620135122276, + "grad_norm": 1.4972623586654663, + "learning_rate": 4.9031945773333624e-05, + "loss": 5.4207, + "step": 14944 + }, + { + "epoch": 0.08888214863450376, + "grad_norm": 1.6061536073684692, + "learning_rate": 4.903181704561971e-05, + "loss": 5.4265, + "step": 14945 + }, + { + "epoch": 0.08888809591778475, + "grad_norm": 1.5003243684768677, + "learning_rate": 4.903168830951653e-05, + "loss": 5.2323, + "step": 14946 + }, + { + "epoch": 0.08889404320106575, + "grad_norm": 1.4466320276260376, + "learning_rate": 4.9031559565024144e-05, + "loss": 5.3054, + "step": 14947 + }, + { + "epoch": 0.08889999048434676, + "grad_norm": 1.4495269060134888, + "learning_rate": 4.9031430812142584e-05, + "loss": 5.2725, + "step": 14948 + }, + { + "epoch": 0.08890593776762774, + "grad_norm": 1.2909798622131348, + "learning_rate": 4.9031302050871896e-05, + "loss": 5.13, + "step": 14949 + }, + { + "epoch": 0.08891188505090875, + "grad_norm": 1.368377685546875, + "learning_rate": 4.903117328121214e-05, + "loss": 5.0471, + "step": 14950 + }, + { + "epoch": 0.08891783233418975, + "grad_norm": 1.3496042490005493, + "learning_rate": 4.903104450316334e-05, + "loss": 5.1209, + "step": 14951 + }, + { + "epoch": 0.08892377961747074, + "grad_norm": 1.593047022819519, + "learning_rate": 4.9030915716725554e-05, + "loss": 5.2551, + "step": 14952 + }, + { + "epoch": 0.08892972690075174, + "grad_norm": 1.3550326824188232, + "learning_rate": 4.903078692189882e-05, + "loss": 5.2543, + "step": 14953 + }, + { + "epoch": 0.08893567418403274, + "grad_norm": 1.4302785396575928, + "learning_rate": 4.903065811868319e-05, + "loss": 5.2828, + "step": 14954 + }, + { + "epoch": 0.08894162146731373, + "grad_norm": 1.578244686126709, + "learning_rate": 4.903052930707871e-05, + "loss": 5.0593, + "step": 14955 + }, + { + "epoch": 0.08894756875059473, + "grad_norm": 1.248634696006775, + "learning_rate": 4.903040048708541e-05, + "loss": 5.0644, + "step": 14956 + }, + { + "epoch": 0.08895351603387573, + "grad_norm": 1.4040237665176392, + "learning_rate": 4.903027165870336e-05, + "loss": 5.0951, + "step": 14957 + }, + { + "epoch": 0.08895946331715672, + "grad_norm": 1.1941477060317993, + "learning_rate": 4.903014282193258e-05, + "loss": 5.0298, + "step": 14958 + }, + { + "epoch": 0.08896541060043772, + "grad_norm": 1.4292995929718018, + "learning_rate": 4.9030013976773125e-05, + "loss": 5.1567, + "step": 14959 + }, + { + "epoch": 0.08897135788371872, + "grad_norm": 1.4789859056472778, + "learning_rate": 4.902988512322505e-05, + "loss": 5.2172, + "step": 14960 + }, + { + "epoch": 0.08897730516699971, + "grad_norm": 2.160266876220703, + "learning_rate": 4.9029756261288376e-05, + "loss": 5.3458, + "step": 14961 + }, + { + "epoch": 0.08898325245028071, + "grad_norm": 1.8164606094360352, + "learning_rate": 4.902962739096317e-05, + "loss": 5.2795, + "step": 14962 + }, + { + "epoch": 0.0889891997335617, + "grad_norm": 2.0879664421081543, + "learning_rate": 4.902949851224947e-05, + "loss": 5.595, + "step": 14963 + }, + { + "epoch": 0.0889951470168427, + "grad_norm": 2.59543514251709, + "learning_rate": 4.9029369625147324e-05, + "loss": 5.3626, + "step": 14964 + }, + { + "epoch": 0.0890010943001237, + "grad_norm": 2.0679430961608887, + "learning_rate": 4.9029240729656764e-05, + "loss": 5.4222, + "step": 14965 + }, + { + "epoch": 0.0890070415834047, + "grad_norm": 1.90644109249115, + "learning_rate": 4.902911182577785e-05, + "loss": 6.1042, + "step": 14966 + }, + { + "epoch": 0.0890129888666857, + "grad_norm": 1.8565638065338135, + "learning_rate": 4.9028982913510626e-05, + "loss": 6.0312, + "step": 14967 + }, + { + "epoch": 0.0890189361499667, + "grad_norm": 1.717623233795166, + "learning_rate": 4.902885399285512e-05, + "loss": 5.794, + "step": 14968 + }, + { + "epoch": 0.08902488343324769, + "grad_norm": 2.2094457149505615, + "learning_rate": 4.90287250638114e-05, + "loss": 5.2517, + "step": 14969 + }, + { + "epoch": 0.08903083071652869, + "grad_norm": 2.2559561729431152, + "learning_rate": 4.9028596126379493e-05, + "loss": 5.2155, + "step": 14970 + }, + { + "epoch": 0.08903677799980969, + "grad_norm": 2.5394740104675293, + "learning_rate": 4.9028467180559455e-05, + "loss": 5.0829, + "step": 14971 + }, + { + "epoch": 0.08904272528309068, + "grad_norm": 1.9542546272277832, + "learning_rate": 4.902833822635133e-05, + "loss": 4.856, + "step": 14972 + }, + { + "epoch": 0.08904867256637168, + "grad_norm": 1.9541314840316772, + "learning_rate": 4.9028209263755154e-05, + "loss": 4.9858, + "step": 14973 + }, + { + "epoch": 0.08905461984965268, + "grad_norm": 1.8625229597091675, + "learning_rate": 4.9028080292770986e-05, + "loss": 4.976, + "step": 14974 + }, + { + "epoch": 0.08906056713293367, + "grad_norm": 2.254417657852173, + "learning_rate": 4.9027951313398855e-05, + "loss": 4.9765, + "step": 14975 + }, + { + "epoch": 0.08906651441621467, + "grad_norm": 2.3143160343170166, + "learning_rate": 4.902782232563882e-05, + "loss": 4.9562, + "step": 14976 + }, + { + "epoch": 0.08907246169949568, + "grad_norm": 2.320388078689575, + "learning_rate": 4.902769332949092e-05, + "loss": 4.9988, + "step": 14977 + }, + { + "epoch": 0.08907840898277666, + "grad_norm": 2.378101348876953, + "learning_rate": 4.90275643249552e-05, + "loss": 5.0869, + "step": 14978 + }, + { + "epoch": 0.08908435626605767, + "grad_norm": 2.5663437843322754, + "learning_rate": 4.90274353120317e-05, + "loss": 5.1124, + "step": 14979 + }, + { + "epoch": 0.08909030354933867, + "grad_norm": 2.2866733074188232, + "learning_rate": 4.902730629072048e-05, + "loss": 5.0564, + "step": 14980 + }, + { + "epoch": 0.08909625083261966, + "grad_norm": 2.060153007507324, + "learning_rate": 4.902717726102157e-05, + "loss": 4.9419, + "step": 14981 + }, + { + "epoch": 0.08910219811590066, + "grad_norm": 2.1555984020233154, + "learning_rate": 4.902704822293502e-05, + "loss": 4.6593, + "step": 14982 + }, + { + "epoch": 0.08910814539918166, + "grad_norm": 2.2045845985412598, + "learning_rate": 4.902691917646088e-05, + "loss": 4.6824, + "step": 14983 + }, + { + "epoch": 0.08911409268246265, + "grad_norm": 2.2891733646392822, + "learning_rate": 4.9026790121599185e-05, + "loss": 4.6378, + "step": 14984 + }, + { + "epoch": 0.08912003996574365, + "grad_norm": 2.0503318309783936, + "learning_rate": 4.902666105834999e-05, + "loss": 4.8051, + "step": 14985 + }, + { + "epoch": 0.08912598724902465, + "grad_norm": 2.2125399112701416, + "learning_rate": 4.9026531986713336e-05, + "loss": 5.0773, + "step": 14986 + }, + { + "epoch": 0.08913193453230564, + "grad_norm": 2.1177804470062256, + "learning_rate": 4.902640290668927e-05, + "loss": 5.0995, + "step": 14987 + }, + { + "epoch": 0.08913788181558664, + "grad_norm": 2.1028857231140137, + "learning_rate": 4.902627381827783e-05, + "loss": 4.3883, + "step": 14988 + }, + { + "epoch": 0.08914382909886764, + "grad_norm": 1.9426429271697998, + "learning_rate": 4.9026144721479065e-05, + "loss": 4.6539, + "step": 14989 + }, + { + "epoch": 0.08914977638214863, + "grad_norm": 2.2325892448425293, + "learning_rate": 4.902601561629302e-05, + "loss": 4.731, + "step": 14990 + }, + { + "epoch": 0.08915572366542963, + "grad_norm": 2.3903300762176514, + "learning_rate": 4.9025886502719756e-05, + "loss": 4.5786, + "step": 14991 + }, + { + "epoch": 0.08916167094871062, + "grad_norm": 2.368431806564331, + "learning_rate": 4.9025757380759284e-05, + "loss": 4.8904, + "step": 14992 + }, + { + "epoch": 0.08916761823199162, + "grad_norm": 2.1727442741394043, + "learning_rate": 4.902562825041168e-05, + "loss": 4.6276, + "step": 14993 + }, + { + "epoch": 0.08917356551527263, + "grad_norm": 2.2038626670837402, + "learning_rate": 4.9025499111676975e-05, + "loss": 4.7451, + "step": 14994 + }, + { + "epoch": 0.08917951279855361, + "grad_norm": 2.3933217525482178, + "learning_rate": 4.902536996455521e-05, + "loss": 4.8129, + "step": 14995 + }, + { + "epoch": 0.08918546008183462, + "grad_norm": 2.473212242126465, + "learning_rate": 4.902524080904645e-05, + "loss": 4.6171, + "step": 14996 + }, + { + "epoch": 0.08919140736511562, + "grad_norm": 2.2226645946502686, + "learning_rate": 4.902511164515071e-05, + "loss": 4.3847, + "step": 14997 + }, + { + "epoch": 0.0891973546483966, + "grad_norm": 2.0874104499816895, + "learning_rate": 4.9024982472868065e-05, + "loss": 4.801, + "step": 14998 + }, + { + "epoch": 0.08920330193167761, + "grad_norm": 1.9831374883651733, + "learning_rate": 4.902485329219854e-05, + "loss": 4.8995, + "step": 14999 + }, + { + "epoch": 0.08920924921495861, + "grad_norm": 2.1662073135375977, + "learning_rate": 4.9024724103142196e-05, + "loss": 4.7221, + "step": 15000 + }, + { + "epoch": 0.0892151964982396, + "grad_norm": 2.335336685180664, + "learning_rate": 4.902459490569906e-05, + "loss": 4.5051, + "step": 15001 + }, + { + "epoch": 0.0892211437815206, + "grad_norm": 2.2647337913513184, + "learning_rate": 4.902446569986919e-05, + "loss": 4.5274, + "step": 15002 + }, + { + "epoch": 0.0892270910648016, + "grad_norm": 2.1781129837036133, + "learning_rate": 4.9024336485652625e-05, + "loss": 4.5661, + "step": 15003 + }, + { + "epoch": 0.08923303834808259, + "grad_norm": 2.6452128887176514, + "learning_rate": 4.902420726304941e-05, + "loss": 5.0087, + "step": 15004 + }, + { + "epoch": 0.0892389856313636, + "grad_norm": 2.10276460647583, + "learning_rate": 4.90240780320596e-05, + "loss": 4.5003, + "step": 15005 + }, + { + "epoch": 0.0892449329146446, + "grad_norm": 2.1297876834869385, + "learning_rate": 4.902394879268323e-05, + "loss": 4.7603, + "step": 15006 + }, + { + "epoch": 0.08925088019792558, + "grad_norm": 2.288257122039795, + "learning_rate": 4.902381954492033e-05, + "loss": 4.7433, + "step": 15007 + }, + { + "epoch": 0.08925682748120659, + "grad_norm": 2.422492742538452, + "learning_rate": 4.902369028877098e-05, + "loss": 4.7823, + "step": 15008 + }, + { + "epoch": 0.08926277476448759, + "grad_norm": 2.4264109134674072, + "learning_rate": 4.9023561024235215e-05, + "loss": 4.9725, + "step": 15009 + }, + { + "epoch": 0.08926872204776858, + "grad_norm": 2.191776752471924, + "learning_rate": 4.902343175131307e-05, + "loss": 4.7893, + "step": 15010 + }, + { + "epoch": 0.08927466933104958, + "grad_norm": 2.0434861183166504, + "learning_rate": 4.9023302470004584e-05, + "loss": 5.3321, + "step": 15011 + }, + { + "epoch": 0.08928061661433058, + "grad_norm": 2.3108692169189453, + "learning_rate": 4.902317318030981e-05, + "loss": 4.848, + "step": 15012 + }, + { + "epoch": 0.08928656389761157, + "grad_norm": 1.8814477920532227, + "learning_rate": 4.9023043882228805e-05, + "loss": 4.9666, + "step": 15013 + }, + { + "epoch": 0.08929251118089257, + "grad_norm": 1.7109707593917847, + "learning_rate": 4.902291457576159e-05, + "loss": 5.0996, + "step": 15014 + }, + { + "epoch": 0.08929845846417357, + "grad_norm": 1.4246928691864014, + "learning_rate": 4.902278526090823e-05, + "loss": 5.1413, + "step": 15015 + }, + { + "epoch": 0.08930440574745456, + "grad_norm": 1.5714298486709595, + "learning_rate": 4.902265593766877e-05, + "loss": 5.4028, + "step": 15016 + }, + { + "epoch": 0.08931035303073556, + "grad_norm": 1.4553309679031372, + "learning_rate": 4.902252660604324e-05, + "loss": 5.1903, + "step": 15017 + }, + { + "epoch": 0.08931630031401656, + "grad_norm": 1.3266233205795288, + "learning_rate": 4.902239726603171e-05, + "loss": 5.1093, + "step": 15018 + }, + { + "epoch": 0.08932224759729755, + "grad_norm": 1.3145966529846191, + "learning_rate": 4.902226791763419e-05, + "loss": 5.0704, + "step": 15019 + }, + { + "epoch": 0.08932819488057855, + "grad_norm": 1.4367384910583496, + "learning_rate": 4.9022138560850754e-05, + "loss": 4.9669, + "step": 15020 + }, + { + "epoch": 0.08933414216385954, + "grad_norm": 1.4239497184753418, + "learning_rate": 4.902200919568144e-05, + "loss": 5.1035, + "step": 15021 + }, + { + "epoch": 0.08934008944714054, + "grad_norm": 1.323853611946106, + "learning_rate": 4.9021879822126284e-05, + "loss": 4.989, + "step": 15022 + }, + { + "epoch": 0.08934603673042155, + "grad_norm": 1.596498727798462, + "learning_rate": 4.9021750440185345e-05, + "loss": 5.0445, + "step": 15023 + }, + { + "epoch": 0.08935198401370253, + "grad_norm": 1.3866841793060303, + "learning_rate": 4.902162104985865e-05, + "loss": 4.9832, + "step": 15024 + }, + { + "epoch": 0.08935793129698354, + "grad_norm": 1.2495089769363403, + "learning_rate": 4.9021491651146265e-05, + "loss": 5.1337, + "step": 15025 + }, + { + "epoch": 0.08936387858026454, + "grad_norm": 1.2082443237304688, + "learning_rate": 4.902136224404822e-05, + "loss": 5.1038, + "step": 15026 + }, + { + "epoch": 0.08936982586354553, + "grad_norm": 1.5153082609176636, + "learning_rate": 4.9021232828564564e-05, + "loss": 5.122, + "step": 15027 + }, + { + "epoch": 0.08937577314682653, + "grad_norm": 1.5340677499771118, + "learning_rate": 4.902110340469536e-05, + "loss": 5.2675, + "step": 15028 + }, + { + "epoch": 0.08938172043010753, + "grad_norm": 1.9367091655731201, + "learning_rate": 4.9020973972440624e-05, + "loss": 5.4528, + "step": 15029 + }, + { + "epoch": 0.08938766771338852, + "grad_norm": 1.7637518644332886, + "learning_rate": 4.902084453180041e-05, + "loss": 5.4686, + "step": 15030 + }, + { + "epoch": 0.08939361499666952, + "grad_norm": 1.668220043182373, + "learning_rate": 4.902071508277477e-05, + "loss": 5.5889, + "step": 15031 + }, + { + "epoch": 0.08939956227995052, + "grad_norm": 2.0754151344299316, + "learning_rate": 4.902058562536375e-05, + "loss": 5.7398, + "step": 15032 + }, + { + "epoch": 0.08940550956323151, + "grad_norm": 1.9756910800933838, + "learning_rate": 4.902045615956739e-05, + "loss": 5.528, + "step": 15033 + }, + { + "epoch": 0.08941145684651251, + "grad_norm": 1.6614958047866821, + "learning_rate": 4.9020326685385735e-05, + "loss": 5.5761, + "step": 15034 + }, + { + "epoch": 0.08941740412979352, + "grad_norm": 2.0193135738372803, + "learning_rate": 4.902019720281884e-05, + "loss": 5.1836, + "step": 15035 + }, + { + "epoch": 0.0894233514130745, + "grad_norm": 2.164290428161621, + "learning_rate": 4.9020067711866735e-05, + "loss": 5.0216, + "step": 15036 + }, + { + "epoch": 0.0894292986963555, + "grad_norm": 2.3957648277282715, + "learning_rate": 4.901993821252947e-05, + "loss": 4.9631, + "step": 15037 + }, + { + "epoch": 0.08943524597963651, + "grad_norm": 2.204258680343628, + "learning_rate": 4.90198087048071e-05, + "loss": 4.774, + "step": 15038 + }, + { + "epoch": 0.0894411932629175, + "grad_norm": 1.7879102230072021, + "learning_rate": 4.9019679188699666e-05, + "loss": 5.716, + "step": 15039 + }, + { + "epoch": 0.0894471405461985, + "grad_norm": 1.6019984483718872, + "learning_rate": 4.9019549664207196e-05, + "loss": 5.3657, + "step": 15040 + }, + { + "epoch": 0.0894530878294795, + "grad_norm": 2.079514741897583, + "learning_rate": 4.901942013132976e-05, + "loss": 5.0526, + "step": 15041 + }, + { + "epoch": 0.08945903511276049, + "grad_norm": 1.9381201267242432, + "learning_rate": 4.901929059006739e-05, + "loss": 4.9585, + "step": 15042 + }, + { + "epoch": 0.08946498239604149, + "grad_norm": 1.6514472961425781, + "learning_rate": 4.9019161040420134e-05, + "loss": 5.4721, + "step": 15043 + }, + { + "epoch": 0.08947092967932249, + "grad_norm": 1.7294371128082275, + "learning_rate": 4.901903148238804e-05, + "loss": 5.4401, + "step": 15044 + }, + { + "epoch": 0.08947687696260348, + "grad_norm": 1.7769347429275513, + "learning_rate": 4.901890191597115e-05, + "loss": 5.4324, + "step": 15045 + }, + { + "epoch": 0.08948282424588448, + "grad_norm": 1.6517225503921509, + "learning_rate": 4.9018772341169505e-05, + "loss": 5.2967, + "step": 15046 + }, + { + "epoch": 0.08948877152916548, + "grad_norm": 1.5310052633285522, + "learning_rate": 4.901864275798316e-05, + "loss": 5.4017, + "step": 15047 + }, + { + "epoch": 0.08949471881244647, + "grad_norm": 1.9703199863433838, + "learning_rate": 4.9018513166412146e-05, + "loss": 4.9813, + "step": 15048 + }, + { + "epoch": 0.08950066609572747, + "grad_norm": 1.991087555885315, + "learning_rate": 4.901838356645652e-05, + "loss": 5.2911, + "step": 15049 + }, + { + "epoch": 0.08950661337900846, + "grad_norm": 1.7992926836013794, + "learning_rate": 4.9018253958116334e-05, + "loss": 5.2996, + "step": 15050 + }, + { + "epoch": 0.08951256066228946, + "grad_norm": 1.5164752006530762, + "learning_rate": 4.901812434139161e-05, + "loss": 5.8002, + "step": 15051 + }, + { + "epoch": 0.08951850794557047, + "grad_norm": 1.8143075704574585, + "learning_rate": 4.9017994716282415e-05, + "loss": 5.241, + "step": 15052 + }, + { + "epoch": 0.08952445522885145, + "grad_norm": 1.9806342124938965, + "learning_rate": 4.9017865082788785e-05, + "loss": 5.3656, + "step": 15053 + }, + { + "epoch": 0.08953040251213246, + "grad_norm": 2.403789520263672, + "learning_rate": 4.901773544091077e-05, + "loss": 5.1024, + "step": 15054 + }, + { + "epoch": 0.08953634979541346, + "grad_norm": 1.5903408527374268, + "learning_rate": 4.90176057906484e-05, + "loss": 5.3849, + "step": 15055 + }, + { + "epoch": 0.08954229707869445, + "grad_norm": 1.764125943183899, + "learning_rate": 4.901747613200175e-05, + "loss": 5.0757, + "step": 15056 + }, + { + "epoch": 0.08954824436197545, + "grad_norm": 2.1031241416931152, + "learning_rate": 4.901734646497084e-05, + "loss": 5.2114, + "step": 15057 + }, + { + "epoch": 0.08955419164525645, + "grad_norm": 1.9965282678604126, + "learning_rate": 4.901721678955571e-05, + "loss": 5.1136, + "step": 15058 + }, + { + "epoch": 0.08956013892853744, + "grad_norm": 1.9062676429748535, + "learning_rate": 4.9017087105756434e-05, + "loss": 4.9166, + "step": 15059 + }, + { + "epoch": 0.08956608621181844, + "grad_norm": 2.0963199138641357, + "learning_rate": 4.901695741357303e-05, + "loss": 4.7587, + "step": 15060 + }, + { + "epoch": 0.08957203349509944, + "grad_norm": 1.7062407732009888, + "learning_rate": 4.901682771300556e-05, + "loss": 5.3046, + "step": 15061 + }, + { + "epoch": 0.08957798077838043, + "grad_norm": 1.574013352394104, + "learning_rate": 4.9016698004054065e-05, + "loss": 5.3007, + "step": 15062 + }, + { + "epoch": 0.08958392806166143, + "grad_norm": 1.7540260553359985, + "learning_rate": 4.9016568286718586e-05, + "loss": 5.5824, + "step": 15063 + }, + { + "epoch": 0.08958987534494244, + "grad_norm": 1.4875624179840088, + "learning_rate": 4.901643856099917e-05, + "loss": 5.4569, + "step": 15064 + }, + { + "epoch": 0.08959582262822342, + "grad_norm": 1.6023603677749634, + "learning_rate": 4.901630882689586e-05, + "loss": 5.5397, + "step": 15065 + }, + { + "epoch": 0.08960176991150443, + "grad_norm": 2.1851913928985596, + "learning_rate": 4.9016179084408706e-05, + "loss": 4.9882, + "step": 15066 + }, + { + "epoch": 0.08960771719478543, + "grad_norm": 1.4636015892028809, + "learning_rate": 4.901604933353776e-05, + "loss": 5.4568, + "step": 15067 + }, + { + "epoch": 0.08961366447806642, + "grad_norm": 2.6841142177581787, + "learning_rate": 4.901591957428305e-05, + "loss": 5.8365, + "step": 15068 + }, + { + "epoch": 0.08961961176134742, + "grad_norm": 2.2015743255615234, + "learning_rate": 4.9015789806644643e-05, + "loss": 5.4798, + "step": 15069 + }, + { + "epoch": 0.08962555904462842, + "grad_norm": 2.3934903144836426, + "learning_rate": 4.901566003062256e-05, + "loss": 5.3355, + "step": 15070 + }, + { + "epoch": 0.08963150632790941, + "grad_norm": 2.418919801712036, + "learning_rate": 4.9015530246216866e-05, + "loss": 5.2546, + "step": 15071 + }, + { + "epoch": 0.08963745361119041, + "grad_norm": 2.2773303985595703, + "learning_rate": 4.90154004534276e-05, + "loss": 5.3306, + "step": 15072 + }, + { + "epoch": 0.08964340089447141, + "grad_norm": 2.09413743019104, + "learning_rate": 4.9015270652254796e-05, + "loss": 5.4715, + "step": 15073 + }, + { + "epoch": 0.0896493481777524, + "grad_norm": 1.8905339241027832, + "learning_rate": 4.901514084269852e-05, + "loss": 5.2248, + "step": 15074 + }, + { + "epoch": 0.0896552954610334, + "grad_norm": 1.7001872062683105, + "learning_rate": 4.9015011024758794e-05, + "loss": 5.2869, + "step": 15075 + }, + { + "epoch": 0.0896612427443144, + "grad_norm": 1.7953561544418335, + "learning_rate": 4.901488119843568e-05, + "loss": 5.2027, + "step": 15076 + }, + { + "epoch": 0.08966719002759539, + "grad_norm": 1.8996349573135376, + "learning_rate": 4.9014751363729225e-05, + "loss": 5.8168, + "step": 15077 + }, + { + "epoch": 0.0896731373108764, + "grad_norm": 1.6294323205947876, + "learning_rate": 4.901462152063946e-05, + "loss": 5.0331, + "step": 15078 + }, + { + "epoch": 0.08967908459415738, + "grad_norm": 1.4392082691192627, + "learning_rate": 4.901449166916645e-05, + "loss": 4.9094, + "step": 15079 + }, + { + "epoch": 0.08968503187743838, + "grad_norm": 1.6613532304763794, + "learning_rate": 4.9014361809310216e-05, + "loss": 5.1426, + "step": 15080 + }, + { + "epoch": 0.08969097916071939, + "grad_norm": 1.7502686977386475, + "learning_rate": 4.9014231941070823e-05, + "loss": 5.4298, + "step": 15081 + }, + { + "epoch": 0.08969692644400037, + "grad_norm": 1.9276418685913086, + "learning_rate": 4.9014102064448305e-05, + "loss": 5.8383, + "step": 15082 + }, + { + "epoch": 0.08970287372728138, + "grad_norm": 2.471407651901245, + "learning_rate": 4.901397217944272e-05, + "loss": 6.1879, + "step": 15083 + }, + { + "epoch": 0.08970882101056238, + "grad_norm": 2.0759341716766357, + "learning_rate": 4.90138422860541e-05, + "loss": 6.0929, + "step": 15084 + }, + { + "epoch": 0.08971476829384337, + "grad_norm": 1.6504180431365967, + "learning_rate": 4.9013712384282505e-05, + "loss": 6.0733, + "step": 15085 + }, + { + "epoch": 0.08972071557712437, + "grad_norm": 1.7268849611282349, + "learning_rate": 4.9013582474127965e-05, + "loss": 5.9707, + "step": 15086 + }, + { + "epoch": 0.08972666286040537, + "grad_norm": 1.8029861450195312, + "learning_rate": 4.901345255559053e-05, + "loss": 5.3645, + "step": 15087 + }, + { + "epoch": 0.08973261014368636, + "grad_norm": 1.8240137100219727, + "learning_rate": 4.9013322628670246e-05, + "loss": 5.4201, + "step": 15088 + }, + { + "epoch": 0.08973855742696736, + "grad_norm": 1.799771785736084, + "learning_rate": 4.901319269336716e-05, + "loss": 5.2043, + "step": 15089 + }, + { + "epoch": 0.08974450471024836, + "grad_norm": 1.6271024942398071, + "learning_rate": 4.901306274968131e-05, + "loss": 5.4118, + "step": 15090 + }, + { + "epoch": 0.08975045199352935, + "grad_norm": 1.4443042278289795, + "learning_rate": 4.9012932797612756e-05, + "loss": 5.5921, + "step": 15091 + }, + { + "epoch": 0.08975639927681035, + "grad_norm": 1.7174689769744873, + "learning_rate": 4.9012802837161535e-05, + "loss": 5.5233, + "step": 15092 + }, + { + "epoch": 0.08976234656009136, + "grad_norm": 1.7158472537994385, + "learning_rate": 4.901267286832769e-05, + "loss": 5.9171, + "step": 15093 + }, + { + "epoch": 0.08976829384337234, + "grad_norm": 1.691797137260437, + "learning_rate": 4.9012542891111275e-05, + "loss": 5.6207, + "step": 15094 + }, + { + "epoch": 0.08977424112665335, + "grad_norm": 1.7525362968444824, + "learning_rate": 4.901241290551233e-05, + "loss": 5.3468, + "step": 15095 + }, + { + "epoch": 0.08978018840993435, + "grad_norm": 1.6895235776901245, + "learning_rate": 4.901228291153089e-05, + "loss": 5.3567, + "step": 15096 + }, + { + "epoch": 0.08978613569321534, + "grad_norm": 1.6617051362991333, + "learning_rate": 4.9012152909167015e-05, + "loss": 5.6781, + "step": 15097 + }, + { + "epoch": 0.08979208297649634, + "grad_norm": 1.5234577655792236, + "learning_rate": 4.901202289842075e-05, + "loss": 5.6262, + "step": 15098 + }, + { + "epoch": 0.08979803025977734, + "grad_norm": 2.1545703411102295, + "learning_rate": 4.9011892879292125e-05, + "loss": 5.3112, + "step": 15099 + }, + { + "epoch": 0.08980397754305833, + "grad_norm": 2.246051073074341, + "learning_rate": 4.9011762851781204e-05, + "loss": 5.3783, + "step": 15100 + }, + { + "epoch": 0.08980992482633933, + "grad_norm": 2.000429630279541, + "learning_rate": 4.901163281588802e-05, + "loss": 5.2561, + "step": 15101 + }, + { + "epoch": 0.08981587210962033, + "grad_norm": 2.0881898403167725, + "learning_rate": 4.901150277161263e-05, + "loss": 5.3308, + "step": 15102 + }, + { + "epoch": 0.08982181939290132, + "grad_norm": 2.4498097896575928, + "learning_rate": 4.901137271895506e-05, + "loss": 5.8405, + "step": 15103 + }, + { + "epoch": 0.08982776667618232, + "grad_norm": 2.210160732269287, + "learning_rate": 4.901124265791538e-05, + "loss": 5.5462, + "step": 15104 + }, + { + "epoch": 0.08983371395946332, + "grad_norm": 2.366419553756714, + "learning_rate": 4.9011112588493625e-05, + "loss": 5.4069, + "step": 15105 + }, + { + "epoch": 0.08983966124274431, + "grad_norm": 1.812118649482727, + "learning_rate": 4.901098251068983e-05, + "loss": 5.9549, + "step": 15106 + }, + { + "epoch": 0.08984560852602531, + "grad_norm": 1.6506917476654053, + "learning_rate": 4.901085242450405e-05, + "loss": 5.762, + "step": 15107 + }, + { + "epoch": 0.0898515558093063, + "grad_norm": 1.8076404333114624, + "learning_rate": 4.901072232993633e-05, + "loss": 5.7841, + "step": 15108 + }, + { + "epoch": 0.0898575030925873, + "grad_norm": 2.51157546043396, + "learning_rate": 4.9010592226986716e-05, + "loss": 5.1544, + "step": 15109 + }, + { + "epoch": 0.0898634503758683, + "grad_norm": 1.9424755573272705, + "learning_rate": 4.901046211565526e-05, + "loss": 5.4587, + "step": 15110 + }, + { + "epoch": 0.0898693976591493, + "grad_norm": 1.998506784439087, + "learning_rate": 4.9010331995941995e-05, + "loss": 5.8242, + "step": 15111 + }, + { + "epoch": 0.0898753449424303, + "grad_norm": 1.8947205543518066, + "learning_rate": 4.901020186784697e-05, + "loss": 5.4488, + "step": 15112 + }, + { + "epoch": 0.0898812922257113, + "grad_norm": 1.905993938446045, + "learning_rate": 4.901007173137022e-05, + "loss": 5.3882, + "step": 15113 + }, + { + "epoch": 0.08988723950899229, + "grad_norm": 1.723973274230957, + "learning_rate": 4.900994158651182e-05, + "loss": 5.9411, + "step": 15114 + }, + { + "epoch": 0.08989318679227329, + "grad_norm": 1.747159719467163, + "learning_rate": 4.900981143327179e-05, + "loss": 5.8436, + "step": 15115 + }, + { + "epoch": 0.08989913407555429, + "grad_norm": 1.7400517463684082, + "learning_rate": 4.900968127165018e-05, + "loss": 5.7067, + "step": 15116 + }, + { + "epoch": 0.08990508135883528, + "grad_norm": 1.763750433921814, + "learning_rate": 4.900955110164704e-05, + "loss": 5.6198, + "step": 15117 + }, + { + "epoch": 0.08991102864211628, + "grad_norm": 1.9004894495010376, + "learning_rate": 4.9009420923262416e-05, + "loss": 5.0977, + "step": 15118 + }, + { + "epoch": 0.08991697592539728, + "grad_norm": 1.6853641271591187, + "learning_rate": 4.900929073649635e-05, + "loss": 5.5213, + "step": 15119 + }, + { + "epoch": 0.08992292320867827, + "grad_norm": 1.7032074928283691, + "learning_rate": 4.900916054134889e-05, + "loss": 5.3764, + "step": 15120 + }, + { + "epoch": 0.08992887049195927, + "grad_norm": 1.623089075088501, + "learning_rate": 4.9009030337820084e-05, + "loss": 5.525, + "step": 15121 + }, + { + "epoch": 0.08993481777524027, + "grad_norm": 1.6154295206069946, + "learning_rate": 4.900890012590996e-05, + "loss": 5.7378, + "step": 15122 + }, + { + "epoch": 0.08994076505852126, + "grad_norm": 1.8368462324142456, + "learning_rate": 4.900876990561859e-05, + "loss": 5.4768, + "step": 15123 + }, + { + "epoch": 0.08994671234180227, + "grad_norm": 1.7773829698562622, + "learning_rate": 4.9008639676946e-05, + "loss": 5.419, + "step": 15124 + }, + { + "epoch": 0.08995265962508327, + "grad_norm": 1.625287413597107, + "learning_rate": 4.9008509439892244e-05, + "loss": 5.4727, + "step": 15125 + }, + { + "epoch": 0.08995860690836426, + "grad_norm": 1.6234408617019653, + "learning_rate": 4.9008379194457364e-05, + "loss": 5.413, + "step": 15126 + }, + { + "epoch": 0.08996455419164526, + "grad_norm": 1.7441129684448242, + "learning_rate": 4.900824894064141e-05, + "loss": 5.2681, + "step": 15127 + }, + { + "epoch": 0.08997050147492626, + "grad_norm": 1.8756482601165771, + "learning_rate": 4.900811867844443e-05, + "loss": 5.5319, + "step": 15128 + }, + { + "epoch": 0.08997644875820725, + "grad_norm": 1.9200249910354614, + "learning_rate": 4.900798840786645e-05, + "loss": 4.7499, + "step": 15129 + }, + { + "epoch": 0.08998239604148825, + "grad_norm": 2.4838919639587402, + "learning_rate": 4.900785812890753e-05, + "loss": 5.0713, + "step": 15130 + }, + { + "epoch": 0.08998834332476925, + "grad_norm": 2.1441292762756348, + "learning_rate": 4.900772784156773e-05, + "loss": 4.9425, + "step": 15131 + }, + { + "epoch": 0.08999429060805024, + "grad_norm": 2.0838072299957275, + "learning_rate": 4.9007597545847066e-05, + "loss": 5.0632, + "step": 15132 + }, + { + "epoch": 0.09000023789133124, + "grad_norm": 1.630042314529419, + "learning_rate": 4.90074672417456e-05, + "loss": 5.2275, + "step": 15133 + }, + { + "epoch": 0.09000618517461224, + "grad_norm": 2.336031675338745, + "learning_rate": 4.900733692926338e-05, + "loss": 4.9596, + "step": 15134 + }, + { + "epoch": 0.09001213245789323, + "grad_norm": 2.414837598800659, + "learning_rate": 4.9007206608400446e-05, + "loss": 4.7405, + "step": 15135 + }, + { + "epoch": 0.09001807974117423, + "grad_norm": 2.2872564792633057, + "learning_rate": 4.900707627915684e-05, + "loss": 4.8294, + "step": 15136 + }, + { + "epoch": 0.09002402702445522, + "grad_norm": 2.474933624267578, + "learning_rate": 4.9006945941532615e-05, + "loss": 4.882, + "step": 15137 + }, + { + "epoch": 0.09002997430773622, + "grad_norm": 2.170109987258911, + "learning_rate": 4.900681559552781e-05, + "loss": 4.6778, + "step": 15138 + }, + { + "epoch": 0.09003592159101723, + "grad_norm": 2.1962943077087402, + "learning_rate": 4.900668524114248e-05, + "loss": 4.8201, + "step": 15139 + }, + { + "epoch": 0.09004186887429821, + "grad_norm": 2.46073317527771, + "learning_rate": 4.9006554878376656e-05, + "loss": 4.6929, + "step": 15140 + }, + { + "epoch": 0.09004781615757922, + "grad_norm": 2.4591431617736816, + "learning_rate": 4.90064245072304e-05, + "loss": 4.711, + "step": 15141 + }, + { + "epoch": 0.09005376344086022, + "grad_norm": 2.2225937843322754, + "learning_rate": 4.9006294127703745e-05, + "loss": 5.2556, + "step": 15142 + }, + { + "epoch": 0.0900597107241412, + "grad_norm": 2.3457517623901367, + "learning_rate": 4.900616373979674e-05, + "loss": 5.7773, + "step": 15143 + }, + { + "epoch": 0.09006565800742221, + "grad_norm": 2.226430892944336, + "learning_rate": 4.9006033343509436e-05, + "loss": 5.6364, + "step": 15144 + }, + { + "epoch": 0.09007160529070321, + "grad_norm": 2.1407759189605713, + "learning_rate": 4.900590293884186e-05, + "loss": 5.4202, + "step": 15145 + }, + { + "epoch": 0.0900775525739842, + "grad_norm": 1.7371548414230347, + "learning_rate": 4.9005772525794084e-05, + "loss": 5.5686, + "step": 15146 + }, + { + "epoch": 0.0900834998572652, + "grad_norm": 1.8759154081344604, + "learning_rate": 4.900564210436615e-05, + "loss": 5.4824, + "step": 15147 + }, + { + "epoch": 0.0900894471405462, + "grad_norm": 1.8595685958862305, + "learning_rate": 4.900551167455807e-05, + "loss": 5.6123, + "step": 15148 + }, + { + "epoch": 0.09009539442382719, + "grad_norm": 2.0119471549987793, + "learning_rate": 4.900538123636993e-05, + "loss": 5.5925, + "step": 15149 + }, + { + "epoch": 0.09010134170710819, + "grad_norm": 1.9375147819519043, + "learning_rate": 4.900525078980176e-05, + "loss": 5.5707, + "step": 15150 + }, + { + "epoch": 0.0901072889903892, + "grad_norm": 1.7323594093322754, + "learning_rate": 4.9005120334853595e-05, + "loss": 5.4133, + "step": 15151 + }, + { + "epoch": 0.09011323627367018, + "grad_norm": 1.7680727243423462, + "learning_rate": 4.90049898715255e-05, + "loss": 5.5954, + "step": 15152 + }, + { + "epoch": 0.09011918355695119, + "grad_norm": 1.8436721563339233, + "learning_rate": 4.9004859399817505e-05, + "loss": 5.5689, + "step": 15153 + }, + { + "epoch": 0.09012513084023219, + "grad_norm": 1.8080954551696777, + "learning_rate": 4.9004728919729664e-05, + "loss": 5.5266, + "step": 15154 + }, + { + "epoch": 0.09013107812351318, + "grad_norm": 2.2874748706817627, + "learning_rate": 4.900459843126202e-05, + "loss": 5.1985, + "step": 15155 + }, + { + "epoch": 0.09013702540679418, + "grad_norm": 1.8425899744033813, + "learning_rate": 4.900446793441462e-05, + "loss": 5.2856, + "step": 15156 + }, + { + "epoch": 0.09014297269007518, + "grad_norm": 1.6970654726028442, + "learning_rate": 4.900433742918751e-05, + "loss": 5.8597, + "step": 15157 + }, + { + "epoch": 0.09014891997335617, + "grad_norm": 2.3444008827209473, + "learning_rate": 4.9004206915580726e-05, + "loss": 4.4653, + "step": 15158 + }, + { + "epoch": 0.09015486725663717, + "grad_norm": 2.0390350818634033, + "learning_rate": 4.9004076393594325e-05, + "loss": 4.6565, + "step": 15159 + }, + { + "epoch": 0.09016081453991817, + "grad_norm": 2.0733320713043213, + "learning_rate": 4.900394586322835e-05, + "loss": 4.6052, + "step": 15160 + }, + { + "epoch": 0.09016676182319916, + "grad_norm": 1.9700855016708374, + "learning_rate": 4.9003815324482846e-05, + "loss": 4.7535, + "step": 15161 + }, + { + "epoch": 0.09017270910648016, + "grad_norm": 2.0294783115386963, + "learning_rate": 4.900368477735786e-05, + "loss": 5.4154, + "step": 15162 + }, + { + "epoch": 0.09017865638976116, + "grad_norm": 1.8937848806381226, + "learning_rate": 4.900355422185343e-05, + "loss": 5.3244, + "step": 15163 + }, + { + "epoch": 0.09018460367304215, + "grad_norm": 1.7404329776763916, + "learning_rate": 4.900342365796961e-05, + "loss": 5.887, + "step": 15164 + }, + { + "epoch": 0.09019055095632315, + "grad_norm": 1.5309412479400635, + "learning_rate": 4.9003293085706446e-05, + "loss": 5.4574, + "step": 15165 + }, + { + "epoch": 0.09019649823960414, + "grad_norm": 2.10003662109375, + "learning_rate": 4.9003162505063976e-05, + "loss": 5.2962, + "step": 15166 + }, + { + "epoch": 0.09020244552288514, + "grad_norm": 2.7704551219940186, + "learning_rate": 4.900303191604225e-05, + "loss": 4.6386, + "step": 15167 + }, + { + "epoch": 0.09020839280616615, + "grad_norm": 3.3551974296569824, + "learning_rate": 4.9002901318641314e-05, + "loss": 5.3348, + "step": 15168 + }, + { + "epoch": 0.09021434008944713, + "grad_norm": 2.8300132751464844, + "learning_rate": 4.9002770712861216e-05, + "loss": 5.2031, + "step": 15169 + }, + { + "epoch": 0.09022028737272814, + "grad_norm": 1.77587890625, + "learning_rate": 4.9002640098702005e-05, + "loss": 5.1371, + "step": 15170 + }, + { + "epoch": 0.09022623465600914, + "grad_norm": 1.694191575050354, + "learning_rate": 4.900250947616371e-05, + "loss": 5.7283, + "step": 15171 + }, + { + "epoch": 0.09023218193929013, + "grad_norm": 1.6392415761947632, + "learning_rate": 4.900237884524638e-05, + "loss": 5.3856, + "step": 15172 + }, + { + "epoch": 0.09023812922257113, + "grad_norm": 2.302626371383667, + "learning_rate": 4.900224820595008e-05, + "loss": 5.1007, + "step": 15173 + }, + { + "epoch": 0.09024407650585213, + "grad_norm": 2.296760082244873, + "learning_rate": 4.900211755827484e-05, + "loss": 5.0303, + "step": 15174 + }, + { + "epoch": 0.09025002378913312, + "grad_norm": 2.2914488315582275, + "learning_rate": 4.9001986902220706e-05, + "loss": 5.3176, + "step": 15175 + }, + { + "epoch": 0.09025597107241412, + "grad_norm": 2.084686756134033, + "learning_rate": 4.900185623778774e-05, + "loss": 5.2028, + "step": 15176 + }, + { + "epoch": 0.09026191835569512, + "grad_norm": 1.9465001821517944, + "learning_rate": 4.9001725564975953e-05, + "loss": 4.661, + "step": 15177 + }, + { + "epoch": 0.09026786563897611, + "grad_norm": 2.926347494125366, + "learning_rate": 4.900159488378542e-05, + "loss": 4.4579, + "step": 15178 + }, + { + "epoch": 0.09027381292225711, + "grad_norm": 2.6047539710998535, + "learning_rate": 4.900146419421619e-05, + "loss": 4.5486, + "step": 15179 + }, + { + "epoch": 0.09027976020553811, + "grad_norm": 2.4737868309020996, + "learning_rate": 4.9001333496268274e-05, + "loss": 4.3661, + "step": 15180 + }, + { + "epoch": 0.0902857074888191, + "grad_norm": 2.075547456741333, + "learning_rate": 4.900120278994176e-05, + "loss": 4.3157, + "step": 15181 + }, + { + "epoch": 0.0902916547721001, + "grad_norm": 2.509284019470215, + "learning_rate": 4.900107207523666e-05, + "loss": 4.2558, + "step": 15182 + }, + { + "epoch": 0.09029760205538111, + "grad_norm": 2.4345662593841553, + "learning_rate": 4.9000941352153046e-05, + "loss": 4.2932, + "step": 15183 + }, + { + "epoch": 0.0903035493386621, + "grad_norm": 2.214146137237549, + "learning_rate": 4.9000810620690945e-05, + "loss": 4.6953, + "step": 15184 + }, + { + "epoch": 0.0903094966219431, + "grad_norm": 2.197709083557129, + "learning_rate": 4.900067988085041e-05, + "loss": 4.7138, + "step": 15185 + }, + { + "epoch": 0.0903154439052241, + "grad_norm": 2.0381791591644287, + "learning_rate": 4.900054913263148e-05, + "loss": 6.1924, + "step": 15186 + }, + { + "epoch": 0.09032139118850509, + "grad_norm": 1.7017699480056763, + "learning_rate": 4.900041837603422e-05, + "loss": 6.1646, + "step": 15187 + }, + { + "epoch": 0.09032733847178609, + "grad_norm": 1.5804365873336792, + "learning_rate": 4.9000287611058645e-05, + "loss": 6.1757, + "step": 15188 + }, + { + "epoch": 0.09033328575506709, + "grad_norm": 1.6158896684646606, + "learning_rate": 4.9000156837704836e-05, + "loss": 6.1136, + "step": 15189 + }, + { + "epoch": 0.09033923303834808, + "grad_norm": 1.9524257183074951, + "learning_rate": 4.90000260559728e-05, + "loss": 5.43, + "step": 15190 + }, + { + "epoch": 0.09034518032162908, + "grad_norm": 1.835134744644165, + "learning_rate": 4.899989526586261e-05, + "loss": 6.0223, + "step": 15191 + }, + { + "epoch": 0.09035112760491008, + "grad_norm": 1.7213332653045654, + "learning_rate": 4.899976446737432e-05, + "loss": 5.7823, + "step": 15192 + }, + { + "epoch": 0.09035707488819107, + "grad_norm": 1.8744465112686157, + "learning_rate": 4.899963366050795e-05, + "loss": 5.0549, + "step": 15193 + }, + { + "epoch": 0.09036302217147207, + "grad_norm": 1.800979495048523, + "learning_rate": 4.899950284526355e-05, + "loss": 5.0726, + "step": 15194 + }, + { + "epoch": 0.09036896945475306, + "grad_norm": 1.7476063966751099, + "learning_rate": 4.899937202164118e-05, + "loss": 4.9177, + "step": 15195 + }, + { + "epoch": 0.09037491673803406, + "grad_norm": 1.5107455253601074, + "learning_rate": 4.899924118964087e-05, + "loss": 5.1873, + "step": 15196 + }, + { + "epoch": 0.09038086402131507, + "grad_norm": 1.4630497694015503, + "learning_rate": 4.899911034926267e-05, + "loss": 4.9166, + "step": 15197 + }, + { + "epoch": 0.09038681130459605, + "grad_norm": 1.519824743270874, + "learning_rate": 4.899897950050664e-05, + "loss": 4.9084, + "step": 15198 + }, + { + "epoch": 0.09039275858787706, + "grad_norm": 1.480298399925232, + "learning_rate": 4.899884864337281e-05, + "loss": 4.8724, + "step": 15199 + }, + { + "epoch": 0.09039870587115806, + "grad_norm": 1.549485445022583, + "learning_rate": 4.8998717777861224e-05, + "loss": 4.8378, + "step": 15200 + }, + { + "epoch": 0.09040465315443905, + "grad_norm": 1.6650373935699463, + "learning_rate": 4.8998586903971936e-05, + "loss": 4.9478, + "step": 15201 + }, + { + "epoch": 0.09041060043772005, + "grad_norm": 1.5880005359649658, + "learning_rate": 4.899845602170499e-05, + "loss": 4.7952, + "step": 15202 + }, + { + "epoch": 0.09041654772100105, + "grad_norm": 1.5553892850875854, + "learning_rate": 4.899832513106043e-05, + "loss": 4.9303, + "step": 15203 + }, + { + "epoch": 0.09042249500428204, + "grad_norm": 1.5907729864120483, + "learning_rate": 4.899819423203831e-05, + "loss": 4.7951, + "step": 15204 + }, + { + "epoch": 0.09042844228756304, + "grad_norm": 1.5885943174362183, + "learning_rate": 4.899806332463866e-05, + "loss": 4.8896, + "step": 15205 + }, + { + "epoch": 0.09043438957084404, + "grad_norm": 1.7483280897140503, + "learning_rate": 4.899793240886154e-05, + "loss": 5.6137, + "step": 15206 + }, + { + "epoch": 0.09044033685412503, + "grad_norm": 1.7883373498916626, + "learning_rate": 4.8997801484706984e-05, + "loss": 5.7183, + "step": 15207 + }, + { + "epoch": 0.09044628413740603, + "grad_norm": 1.7988712787628174, + "learning_rate": 4.8997670552175044e-05, + "loss": 5.7979, + "step": 15208 + }, + { + "epoch": 0.09045223142068703, + "grad_norm": 2.1793367862701416, + "learning_rate": 4.899753961126577e-05, + "loss": 5.3549, + "step": 15209 + }, + { + "epoch": 0.09045817870396802, + "grad_norm": 2.117983341217041, + "learning_rate": 4.8997408661979194e-05, + "loss": 5.1934, + "step": 15210 + }, + { + "epoch": 0.09046412598724902, + "grad_norm": 2.1799557209014893, + "learning_rate": 4.899727770431538e-05, + "loss": 5.2521, + "step": 15211 + }, + { + "epoch": 0.09047007327053003, + "grad_norm": 2.117403745651245, + "learning_rate": 4.8997146738274355e-05, + "loss": 5.3379, + "step": 15212 + }, + { + "epoch": 0.09047602055381102, + "grad_norm": 1.59669828414917, + "learning_rate": 4.899701576385619e-05, + "loss": 5.375, + "step": 15213 + }, + { + "epoch": 0.09048196783709202, + "grad_norm": 1.6929266452789307, + "learning_rate": 4.8996884781060907e-05, + "loss": 5.9243, + "step": 15214 + }, + { + "epoch": 0.09048791512037302, + "grad_norm": 1.8353838920593262, + "learning_rate": 4.899675378988855e-05, + "loss": 5.9216, + "step": 15215 + }, + { + "epoch": 0.09049386240365401, + "grad_norm": 1.6468323469161987, + "learning_rate": 4.899662279033918e-05, + "loss": 6.0171, + "step": 15216 + }, + { + "epoch": 0.09049980968693501, + "grad_norm": 1.4748890399932861, + "learning_rate": 4.899649178241284e-05, + "loss": 5.6775, + "step": 15217 + }, + { + "epoch": 0.09050575697021601, + "grad_norm": 1.8783589601516724, + "learning_rate": 4.8996360766109576e-05, + "loss": 5.7625, + "step": 15218 + }, + { + "epoch": 0.090511704253497, + "grad_norm": 1.7860721349716187, + "learning_rate": 4.8996229741429416e-05, + "loss": 5.7512, + "step": 15219 + }, + { + "epoch": 0.090517651536778, + "grad_norm": 1.7337830066680908, + "learning_rate": 4.899609870837243e-05, + "loss": 5.8233, + "step": 15220 + }, + { + "epoch": 0.090523598820059, + "grad_norm": 1.9256298542022705, + "learning_rate": 4.899596766693865e-05, + "loss": 5.8586, + "step": 15221 + }, + { + "epoch": 0.09052954610333999, + "grad_norm": 1.814205288887024, + "learning_rate": 4.8995836617128135e-05, + "loss": 5.4852, + "step": 15222 + }, + { + "epoch": 0.090535493386621, + "grad_norm": 1.8664608001708984, + "learning_rate": 4.899570555894091e-05, + "loss": 5.6847, + "step": 15223 + }, + { + "epoch": 0.09054144066990198, + "grad_norm": 1.8377459049224854, + "learning_rate": 4.899557449237704e-05, + "loss": 5.8869, + "step": 15224 + }, + { + "epoch": 0.09054738795318298, + "grad_norm": 1.788875937461853, + "learning_rate": 4.899544341743656e-05, + "loss": 5.4372, + "step": 15225 + }, + { + "epoch": 0.09055333523646399, + "grad_norm": 1.8490506410598755, + "learning_rate": 4.899531233411951e-05, + "loss": 6.1163, + "step": 15226 + }, + { + "epoch": 0.09055928251974497, + "grad_norm": 2.14841628074646, + "learning_rate": 4.8995181242425955e-05, + "loss": 6.1154, + "step": 15227 + }, + { + "epoch": 0.09056522980302598, + "grad_norm": 2.051154851913452, + "learning_rate": 4.899505014235593e-05, + "loss": 4.9326, + "step": 15228 + }, + { + "epoch": 0.09057117708630698, + "grad_norm": 2.071126937866211, + "learning_rate": 4.899491903390948e-05, + "loss": 4.8831, + "step": 15229 + }, + { + "epoch": 0.09057712436958797, + "grad_norm": 2.0155231952667236, + "learning_rate": 4.899478791708665e-05, + "loss": 4.87, + "step": 15230 + }, + { + "epoch": 0.09058307165286897, + "grad_norm": 1.946815013885498, + "learning_rate": 4.89946567918875e-05, + "loss": 4.8139, + "step": 15231 + }, + { + "epoch": 0.09058901893614997, + "grad_norm": 1.9526349306106567, + "learning_rate": 4.899452565831204e-05, + "loss": 4.7618, + "step": 15232 + }, + { + "epoch": 0.09059496621943096, + "grad_norm": 2.0434954166412354, + "learning_rate": 4.8994394516360355e-05, + "loss": 4.7617, + "step": 15233 + }, + { + "epoch": 0.09060091350271196, + "grad_norm": 2.0964083671569824, + "learning_rate": 4.8994263366032466e-05, + "loss": 4.6298, + "step": 15234 + }, + { + "epoch": 0.09060686078599296, + "grad_norm": 2.0333590507507324, + "learning_rate": 4.899413220732843e-05, + "loss": 4.6419, + "step": 15235 + }, + { + "epoch": 0.09061280806927395, + "grad_norm": 2.076993703842163, + "learning_rate": 4.89940010402483e-05, + "loss": 4.6163, + "step": 15236 + }, + { + "epoch": 0.09061875535255495, + "grad_norm": 1.767774224281311, + "learning_rate": 4.89938698647921e-05, + "loss": 5.2418, + "step": 15237 + }, + { + "epoch": 0.09062470263583595, + "grad_norm": 1.8380626440048218, + "learning_rate": 4.899373868095989e-05, + "loss": 5.3304, + "step": 15238 + }, + { + "epoch": 0.09063064991911694, + "grad_norm": 1.7332574129104614, + "learning_rate": 4.8993607488751716e-05, + "loss": 5.3528, + "step": 15239 + }, + { + "epoch": 0.09063659720239794, + "grad_norm": 1.8473124504089355, + "learning_rate": 4.8993476288167614e-05, + "loss": 5.5801, + "step": 15240 + }, + { + "epoch": 0.09064254448567895, + "grad_norm": 2.299206256866455, + "learning_rate": 4.899334507920765e-05, + "loss": 5.308, + "step": 15241 + }, + { + "epoch": 0.09064849176895994, + "grad_norm": 1.945417046546936, + "learning_rate": 4.899321386187185e-05, + "loss": 4.8894, + "step": 15242 + }, + { + "epoch": 0.09065443905224094, + "grad_norm": 2.328246831893921, + "learning_rate": 4.899308263616027e-05, + "loss": 5.0332, + "step": 15243 + }, + { + "epoch": 0.09066038633552194, + "grad_norm": 2.194546699523926, + "learning_rate": 4.899295140207295e-05, + "loss": 4.8891, + "step": 15244 + }, + { + "epoch": 0.09066633361880293, + "grad_norm": 2.078903913497925, + "learning_rate": 4.899282015960994e-05, + "loss": 5.0327, + "step": 15245 + }, + { + "epoch": 0.09067228090208393, + "grad_norm": 2.2129557132720947, + "learning_rate": 4.8992688908771285e-05, + "loss": 4.8806, + "step": 15246 + }, + { + "epoch": 0.09067822818536493, + "grad_norm": 2.3200979232788086, + "learning_rate": 4.8992557649557026e-05, + "loss": 4.9961, + "step": 15247 + }, + { + "epoch": 0.09068417546864592, + "grad_norm": 1.5829685926437378, + "learning_rate": 4.899242638196722e-05, + "loss": 5.4238, + "step": 15248 + }, + { + "epoch": 0.09069012275192692, + "grad_norm": 1.9085135459899902, + "learning_rate": 4.89922951060019e-05, + "loss": 5.0338, + "step": 15249 + }, + { + "epoch": 0.09069607003520792, + "grad_norm": 2.3000802993774414, + "learning_rate": 4.899216382166112e-05, + "loss": 4.9529, + "step": 15250 + }, + { + "epoch": 0.09070201731848891, + "grad_norm": 2.1610753536224365, + "learning_rate": 4.899203252894492e-05, + "loss": 4.9373, + "step": 15251 + }, + { + "epoch": 0.09070796460176991, + "grad_norm": 2.2821414470672607, + "learning_rate": 4.899190122785336e-05, + "loss": 5.2032, + "step": 15252 + }, + { + "epoch": 0.0907139118850509, + "grad_norm": 2.226741075515747, + "learning_rate": 4.899176991838646e-05, + "loss": 4.9354, + "step": 15253 + }, + { + "epoch": 0.0907198591683319, + "grad_norm": 2.0117716789245605, + "learning_rate": 4.899163860054429e-05, + "loss": 5.1179, + "step": 15254 + }, + { + "epoch": 0.0907258064516129, + "grad_norm": 1.6551730632781982, + "learning_rate": 4.8991507274326886e-05, + "loss": 5.6428, + "step": 15255 + }, + { + "epoch": 0.0907317537348939, + "grad_norm": 1.5236784219741821, + "learning_rate": 4.89913759397343e-05, + "loss": 5.4088, + "step": 15256 + }, + { + "epoch": 0.0907377010181749, + "grad_norm": 1.542356252670288, + "learning_rate": 4.899124459676656e-05, + "loss": 5.3383, + "step": 15257 + }, + { + "epoch": 0.0907436483014559, + "grad_norm": 1.5694434642791748, + "learning_rate": 4.899111324542374e-05, + "loss": 5.5202, + "step": 15258 + }, + { + "epoch": 0.09074959558473689, + "grad_norm": 1.459039568901062, + "learning_rate": 4.8990981885705856e-05, + "loss": 5.3481, + "step": 15259 + }, + { + "epoch": 0.09075554286801789, + "grad_norm": 1.4624565839767456, + "learning_rate": 4.899085051761297e-05, + "loss": 5.343, + "step": 15260 + }, + { + "epoch": 0.09076149015129889, + "grad_norm": 1.2748361825942993, + "learning_rate": 4.899071914114513e-05, + "loss": 5.1925, + "step": 15261 + }, + { + "epoch": 0.09076743743457988, + "grad_norm": 1.3813046216964722, + "learning_rate": 4.899058775630237e-05, + "loss": 4.9712, + "step": 15262 + }, + { + "epoch": 0.09077338471786088, + "grad_norm": 1.349108099937439, + "learning_rate": 4.8990456363084756e-05, + "loss": 4.9562, + "step": 15263 + }, + { + "epoch": 0.09077933200114188, + "grad_norm": 1.4744555950164795, + "learning_rate": 4.8990324961492316e-05, + "loss": 5.0014, + "step": 15264 + }, + { + "epoch": 0.09078527928442287, + "grad_norm": 1.4227643013000488, + "learning_rate": 4.8990193551525105e-05, + "loss": 5.076, + "step": 15265 + }, + { + "epoch": 0.09079122656770387, + "grad_norm": 1.4344059228897095, + "learning_rate": 4.8990062133183164e-05, + "loss": 5.2212, + "step": 15266 + }, + { + "epoch": 0.09079717385098487, + "grad_norm": 1.5858408212661743, + "learning_rate": 4.8989930706466534e-05, + "loss": 5.1893, + "step": 15267 + }, + { + "epoch": 0.09080312113426586, + "grad_norm": 1.6398282051086426, + "learning_rate": 4.898979927137527e-05, + "loss": 5.034, + "step": 15268 + }, + { + "epoch": 0.09080906841754686, + "grad_norm": 1.4295551776885986, + "learning_rate": 4.8989667827909416e-05, + "loss": 5.2761, + "step": 15269 + }, + { + "epoch": 0.09081501570082787, + "grad_norm": 1.4313840866088867, + "learning_rate": 4.898953637606902e-05, + "loss": 5.183, + "step": 15270 + }, + { + "epoch": 0.09082096298410886, + "grad_norm": 1.2977478504180908, + "learning_rate": 4.898940491585412e-05, + "loss": 5.1148, + "step": 15271 + }, + { + "epoch": 0.09082691026738986, + "grad_norm": 1.6052992343902588, + "learning_rate": 4.898927344726477e-05, + "loss": 5.3767, + "step": 15272 + }, + { + "epoch": 0.09083285755067086, + "grad_norm": 1.3184257745742798, + "learning_rate": 4.898914197030101e-05, + "loss": 5.3465, + "step": 15273 + }, + { + "epoch": 0.09083880483395185, + "grad_norm": 1.292985439300537, + "learning_rate": 4.898901048496289e-05, + "loss": 5.2478, + "step": 15274 + }, + { + "epoch": 0.09084475211723285, + "grad_norm": 1.1660702228546143, + "learning_rate": 4.898887899125045e-05, + "loss": 5.2655, + "step": 15275 + }, + { + "epoch": 0.09085069940051385, + "grad_norm": 1.2271296977996826, + "learning_rate": 4.8988747489163746e-05, + "loss": 5.2001, + "step": 15276 + }, + { + "epoch": 0.09085664668379484, + "grad_norm": 1.2237215042114258, + "learning_rate": 4.898861597870281e-05, + "loss": 5.213, + "step": 15277 + }, + { + "epoch": 0.09086259396707584, + "grad_norm": 1.3682539463043213, + "learning_rate": 4.898848445986771e-05, + "loss": 5.2174, + "step": 15278 + }, + { + "epoch": 0.09086854125035684, + "grad_norm": 1.2321406602859497, + "learning_rate": 4.8988352932658466e-05, + "loss": 5.1424, + "step": 15279 + }, + { + "epoch": 0.09087448853363783, + "grad_norm": 1.285792350769043, + "learning_rate": 4.898822139707514e-05, + "loss": 5.1438, + "step": 15280 + }, + { + "epoch": 0.09088043581691883, + "grad_norm": 1.137921690940857, + "learning_rate": 4.898808985311778e-05, + "loss": 5.159, + "step": 15281 + }, + { + "epoch": 0.09088638310019982, + "grad_norm": 1.2261563539505005, + "learning_rate": 4.898795830078641e-05, + "loss": 5.1176, + "step": 15282 + }, + { + "epoch": 0.09089233038348082, + "grad_norm": 1.1642104387283325, + "learning_rate": 4.89878267400811e-05, + "loss": 5.0887, + "step": 15283 + }, + { + "epoch": 0.09089827766676183, + "grad_norm": 1.3699917793273926, + "learning_rate": 4.898769517100189e-05, + "loss": 5.0048, + "step": 15284 + }, + { + "epoch": 0.09090422495004281, + "grad_norm": 1.6375452280044556, + "learning_rate": 4.898756359354882e-05, + "loss": 4.6914, + "step": 15285 + }, + { + "epoch": 0.09091017223332382, + "grad_norm": 1.5404956340789795, + "learning_rate": 4.8987432007721944e-05, + "loss": 4.8266, + "step": 15286 + }, + { + "epoch": 0.09091611951660482, + "grad_norm": 1.6747840642929077, + "learning_rate": 4.89873004135213e-05, + "loss": 4.697, + "step": 15287 + }, + { + "epoch": 0.0909220667998858, + "grad_norm": 1.3908432722091675, + "learning_rate": 4.8987168810946935e-05, + "loss": 4.9327, + "step": 15288 + }, + { + "epoch": 0.09092801408316681, + "grad_norm": 1.4933167695999146, + "learning_rate": 4.89870371999989e-05, + "loss": 4.6153, + "step": 15289 + }, + { + "epoch": 0.09093396136644781, + "grad_norm": 1.6259129047393799, + "learning_rate": 4.8986905580677234e-05, + "loss": 4.533, + "step": 15290 + }, + { + "epoch": 0.0909399086497288, + "grad_norm": 1.3692474365234375, + "learning_rate": 4.898677395298199e-05, + "loss": 4.6246, + "step": 15291 + }, + { + "epoch": 0.0909458559330098, + "grad_norm": 1.4951711893081665, + "learning_rate": 4.8986642316913214e-05, + "loss": 4.6677, + "step": 15292 + }, + { + "epoch": 0.0909518032162908, + "grad_norm": 1.5491467714309692, + "learning_rate": 4.8986510672470946e-05, + "loss": 4.9271, + "step": 15293 + }, + { + "epoch": 0.09095775049957179, + "grad_norm": 1.6902397871017456, + "learning_rate": 4.8986379019655235e-05, + "loss": 4.6467, + "step": 15294 + }, + { + "epoch": 0.09096369778285279, + "grad_norm": 1.5122796297073364, + "learning_rate": 4.898624735846613e-05, + "loss": 4.7103, + "step": 15295 + }, + { + "epoch": 0.0909696450661338, + "grad_norm": 1.5287622213363647, + "learning_rate": 4.898611568890367e-05, + "loss": 4.7461, + "step": 15296 + }, + { + "epoch": 0.09097559234941478, + "grad_norm": 1.4649391174316406, + "learning_rate": 4.898598401096791e-05, + "loss": 5.2472, + "step": 15297 + }, + { + "epoch": 0.09098153963269578, + "grad_norm": 1.7621572017669678, + "learning_rate": 4.898585232465889e-05, + "loss": 4.6864, + "step": 15298 + }, + { + "epoch": 0.09098748691597679, + "grad_norm": 1.6371783018112183, + "learning_rate": 4.898572062997665e-05, + "loss": 4.6091, + "step": 15299 + }, + { + "epoch": 0.09099343419925777, + "grad_norm": 1.28440523147583, + "learning_rate": 4.898558892692125e-05, + "loss": 5.0019, + "step": 15300 + }, + { + "epoch": 0.09099938148253878, + "grad_norm": 1.4753130674362183, + "learning_rate": 4.898545721549272e-05, + "loss": 5.3848, + "step": 15301 + }, + { + "epoch": 0.09100532876581978, + "grad_norm": 1.4267481565475464, + "learning_rate": 4.898532549569112e-05, + "loss": 5.1787, + "step": 15302 + }, + { + "epoch": 0.09101127604910077, + "grad_norm": 1.4724546670913696, + "learning_rate": 4.898519376751649e-05, + "loss": 5.2581, + "step": 15303 + }, + { + "epoch": 0.09101722333238177, + "grad_norm": 1.4417310953140259, + "learning_rate": 4.8985062030968875e-05, + "loss": 5.4829, + "step": 15304 + }, + { + "epoch": 0.09102317061566277, + "grad_norm": 1.1160683631896973, + "learning_rate": 4.898493028604833e-05, + "loss": 5.5287, + "step": 15305 + }, + { + "epoch": 0.09102911789894376, + "grad_norm": 1.2454899549484253, + "learning_rate": 4.8984798532754884e-05, + "loss": 5.2984, + "step": 15306 + }, + { + "epoch": 0.09103506518222476, + "grad_norm": 1.5732132196426392, + "learning_rate": 4.8984666771088596e-05, + "loss": 5.4998, + "step": 15307 + }, + { + "epoch": 0.09104101246550576, + "grad_norm": 1.6430423259735107, + "learning_rate": 4.8984535001049515e-05, + "loss": 5.4636, + "step": 15308 + }, + { + "epoch": 0.09104695974878675, + "grad_norm": 1.245288372039795, + "learning_rate": 4.898440322263768e-05, + "loss": 5.2874, + "step": 15309 + }, + { + "epoch": 0.09105290703206775, + "grad_norm": 1.4186644554138184, + "learning_rate": 4.898427143585312e-05, + "loss": 5.2275, + "step": 15310 + }, + { + "epoch": 0.09105885431534876, + "grad_norm": 1.3040757179260254, + "learning_rate": 4.8984139640695915e-05, + "loss": 5.2864, + "step": 15311 + }, + { + "epoch": 0.09106480159862974, + "grad_norm": 1.4106818437576294, + "learning_rate": 4.898400783716609e-05, + "loss": 5.5897, + "step": 15312 + }, + { + "epoch": 0.09107074888191075, + "grad_norm": 1.5596522092819214, + "learning_rate": 4.89838760252637e-05, + "loss": 5.4827, + "step": 15313 + }, + { + "epoch": 0.09107669616519173, + "grad_norm": 2.2576634883880615, + "learning_rate": 4.898374420498878e-05, + "loss": 5.1471, + "step": 15314 + }, + { + "epoch": 0.09108264344847274, + "grad_norm": 1.2749537229537964, + "learning_rate": 4.898361237634139e-05, + "loss": 5.2688, + "step": 15315 + }, + { + "epoch": 0.09108859073175374, + "grad_norm": 1.4171591997146606, + "learning_rate": 4.8983480539321566e-05, + "loss": 5.0796, + "step": 15316 + }, + { + "epoch": 0.09109453801503473, + "grad_norm": 1.2233314514160156, + "learning_rate": 4.898334869392936e-05, + "loss": 5.0992, + "step": 15317 + }, + { + "epoch": 0.09110048529831573, + "grad_norm": 1.4817143678665161, + "learning_rate": 4.8983216840164804e-05, + "loss": 5.2354, + "step": 15318 + }, + { + "epoch": 0.09110643258159673, + "grad_norm": 1.442088007926941, + "learning_rate": 4.898308497802796e-05, + "loss": 5.2177, + "step": 15319 + }, + { + "epoch": 0.09111237986487772, + "grad_norm": 1.3996042013168335, + "learning_rate": 4.898295310751887e-05, + "loss": 4.9938, + "step": 15320 + }, + { + "epoch": 0.09111832714815872, + "grad_norm": 1.3091521263122559, + "learning_rate": 4.8982821228637576e-05, + "loss": 4.9916, + "step": 15321 + }, + { + "epoch": 0.09112427443143972, + "grad_norm": 1.4807448387145996, + "learning_rate": 4.898268934138414e-05, + "loss": 4.9833, + "step": 15322 + }, + { + "epoch": 0.09113022171472071, + "grad_norm": 1.5992671251296997, + "learning_rate": 4.898255744575858e-05, + "loss": 5.1007, + "step": 15323 + }, + { + "epoch": 0.09113616899800171, + "grad_norm": 1.4472523927688599, + "learning_rate": 4.8982425541760954e-05, + "loss": 5.3123, + "step": 15324 + }, + { + "epoch": 0.09114211628128271, + "grad_norm": 1.2865816354751587, + "learning_rate": 4.898229362939132e-05, + "loss": 5.0817, + "step": 15325 + }, + { + "epoch": 0.0911480635645637, + "grad_norm": 1.477144479751587, + "learning_rate": 4.898216170864972e-05, + "loss": 5.1819, + "step": 15326 + }, + { + "epoch": 0.0911540108478447, + "grad_norm": 1.5831303596496582, + "learning_rate": 4.8982029779536184e-05, + "loss": 5.28, + "step": 15327 + }, + { + "epoch": 0.0911599581311257, + "grad_norm": 1.3366963863372803, + "learning_rate": 4.898189784205078e-05, + "loss": 5.3715, + "step": 15328 + }, + { + "epoch": 0.0911659054144067, + "grad_norm": 1.5603365898132324, + "learning_rate": 4.898176589619353e-05, + "loss": 5.2642, + "step": 15329 + }, + { + "epoch": 0.0911718526976877, + "grad_norm": 1.5105326175689697, + "learning_rate": 4.8981633941964506e-05, + "loss": 4.949, + "step": 15330 + }, + { + "epoch": 0.0911777999809687, + "grad_norm": 1.2074800729751587, + "learning_rate": 4.8981501979363734e-05, + "loss": 5.2847, + "step": 15331 + }, + { + "epoch": 0.09118374726424969, + "grad_norm": 1.4356200695037842, + "learning_rate": 4.898137000839127e-05, + "loss": 5.6169, + "step": 15332 + }, + { + "epoch": 0.09118969454753069, + "grad_norm": 1.5015919208526611, + "learning_rate": 4.8981238029047154e-05, + "loss": 5.1135, + "step": 15333 + }, + { + "epoch": 0.09119564183081169, + "grad_norm": 1.4902187585830688, + "learning_rate": 4.8981106041331434e-05, + "loss": 5.4406, + "step": 15334 + }, + { + "epoch": 0.09120158911409268, + "grad_norm": 1.2884581089019775, + "learning_rate": 4.898097404524416e-05, + "loss": 5.3493, + "step": 15335 + }, + { + "epoch": 0.09120753639737368, + "grad_norm": 1.4323054552078247, + "learning_rate": 4.898084204078539e-05, + "loss": 5.0939, + "step": 15336 + }, + { + "epoch": 0.09121348368065468, + "grad_norm": 1.6282861232757568, + "learning_rate": 4.898071002795514e-05, + "loss": 5.1857, + "step": 15337 + }, + { + "epoch": 0.09121943096393567, + "grad_norm": 1.3413678407669067, + "learning_rate": 4.898057800675347e-05, + "loss": 4.9581, + "step": 15338 + }, + { + "epoch": 0.09122537824721667, + "grad_norm": 1.5613822937011719, + "learning_rate": 4.898044597718044e-05, + "loss": 4.6401, + "step": 15339 + }, + { + "epoch": 0.09123132553049768, + "grad_norm": 1.4945799112319946, + "learning_rate": 4.898031393923608e-05, + "loss": 4.6649, + "step": 15340 + }, + { + "epoch": 0.09123727281377866, + "grad_norm": 1.6086750030517578, + "learning_rate": 4.898018189292043e-05, + "loss": 4.5996, + "step": 15341 + }, + { + "epoch": 0.09124322009705967, + "grad_norm": 1.3530272245407104, + "learning_rate": 4.898004983823355e-05, + "loss": 4.6511, + "step": 15342 + }, + { + "epoch": 0.09124916738034065, + "grad_norm": 1.5523587465286255, + "learning_rate": 4.897991777517549e-05, + "loss": 4.8099, + "step": 15343 + }, + { + "epoch": 0.09125511466362166, + "grad_norm": 1.6695882081985474, + "learning_rate": 4.8979785703746286e-05, + "loss": 5.2371, + "step": 15344 + }, + { + "epoch": 0.09126106194690266, + "grad_norm": 1.777717113494873, + "learning_rate": 4.897965362394599e-05, + "loss": 5.373, + "step": 15345 + }, + { + "epoch": 0.09126700923018365, + "grad_norm": 1.2890517711639404, + "learning_rate": 4.8979521535774636e-05, + "loss": 5.3851, + "step": 15346 + }, + { + "epoch": 0.09127295651346465, + "grad_norm": 1.3539687395095825, + "learning_rate": 4.897938943923228e-05, + "loss": 5.1218, + "step": 15347 + }, + { + "epoch": 0.09127890379674565, + "grad_norm": 1.4157010316848755, + "learning_rate": 4.8979257334318974e-05, + "loss": 4.9411, + "step": 15348 + }, + { + "epoch": 0.09128485108002664, + "grad_norm": 1.4856256246566772, + "learning_rate": 4.897912522103475e-05, + "loss": 5.1622, + "step": 15349 + }, + { + "epoch": 0.09129079836330764, + "grad_norm": 1.4729665517807007, + "learning_rate": 4.8978993099379666e-05, + "loss": 5.0901, + "step": 15350 + }, + { + "epoch": 0.09129674564658864, + "grad_norm": 1.376625895500183, + "learning_rate": 4.897886096935376e-05, + "loss": 4.8843, + "step": 15351 + }, + { + "epoch": 0.09130269292986963, + "grad_norm": 1.3019710779190063, + "learning_rate": 4.897872883095708e-05, + "loss": 4.9956, + "step": 15352 + }, + { + "epoch": 0.09130864021315063, + "grad_norm": 1.4751423597335815, + "learning_rate": 4.897859668418968e-05, + "loss": 5.4369, + "step": 15353 + }, + { + "epoch": 0.09131458749643163, + "grad_norm": 1.3563402891159058, + "learning_rate": 4.8978464529051595e-05, + "loss": 5.2071, + "step": 15354 + }, + { + "epoch": 0.09132053477971262, + "grad_norm": 1.7365561723709106, + "learning_rate": 4.8978332365542875e-05, + "loss": 4.8797, + "step": 15355 + }, + { + "epoch": 0.09132648206299362, + "grad_norm": 1.4001792669296265, + "learning_rate": 4.8978200193663565e-05, + "loss": 5.2549, + "step": 15356 + }, + { + "epoch": 0.09133242934627463, + "grad_norm": 1.5568649768829346, + "learning_rate": 4.897806801341371e-05, + "loss": 5.3805, + "step": 15357 + }, + { + "epoch": 0.09133837662955561, + "grad_norm": 1.4169847965240479, + "learning_rate": 4.897793582479337e-05, + "loss": 5.2655, + "step": 15358 + }, + { + "epoch": 0.09134432391283662, + "grad_norm": 1.3992067575454712, + "learning_rate": 4.897780362780258e-05, + "loss": 5.4284, + "step": 15359 + }, + { + "epoch": 0.09135027119611762, + "grad_norm": 1.2274264097213745, + "learning_rate": 4.8977671422441376e-05, + "loss": 5.2443, + "step": 15360 + }, + { + "epoch": 0.09135621847939861, + "grad_norm": 1.4754104614257812, + "learning_rate": 4.897753920870982e-05, + "loss": 5.3438, + "step": 15361 + }, + { + "epoch": 0.09136216576267961, + "grad_norm": 1.3993452787399292, + "learning_rate": 4.897740698660796e-05, + "loss": 5.2396, + "step": 15362 + }, + { + "epoch": 0.09136811304596061, + "grad_norm": 1.2840338945388794, + "learning_rate": 4.897727475613583e-05, + "loss": 5.2912, + "step": 15363 + }, + { + "epoch": 0.0913740603292416, + "grad_norm": 1.5234180688858032, + "learning_rate": 4.8977142517293474e-05, + "loss": 5.4197, + "step": 15364 + }, + { + "epoch": 0.0913800076125226, + "grad_norm": 1.6243525743484497, + "learning_rate": 4.897701027008095e-05, + "loss": 5.4358, + "step": 15365 + }, + { + "epoch": 0.0913859548958036, + "grad_norm": 1.277801513671875, + "learning_rate": 4.8976878014498306e-05, + "loss": 5.2801, + "step": 15366 + }, + { + "epoch": 0.09139190217908459, + "grad_norm": 1.5294082164764404, + "learning_rate": 4.897674575054557e-05, + "loss": 4.8257, + "step": 15367 + }, + { + "epoch": 0.0913978494623656, + "grad_norm": 1.7289122343063354, + "learning_rate": 4.897661347822281e-05, + "loss": 4.8155, + "step": 15368 + }, + { + "epoch": 0.0914037967456466, + "grad_norm": 1.5567346811294556, + "learning_rate": 4.897648119753006e-05, + "loss": 4.8245, + "step": 15369 + }, + { + "epoch": 0.09140974402892758, + "grad_norm": 1.4855397939682007, + "learning_rate": 4.8976348908467365e-05, + "loss": 4.7247, + "step": 15370 + }, + { + "epoch": 0.09141569131220859, + "grad_norm": 1.4355418682098389, + "learning_rate": 4.897621661103477e-05, + "loss": 5.0925, + "step": 15371 + }, + { + "epoch": 0.09142163859548957, + "grad_norm": 1.3165326118469238, + "learning_rate": 4.897608430523233e-05, + "loss": 5.3419, + "step": 15372 + }, + { + "epoch": 0.09142758587877058, + "grad_norm": 1.4930912256240845, + "learning_rate": 4.8975951991060084e-05, + "loss": 5.3267, + "step": 15373 + }, + { + "epoch": 0.09143353316205158, + "grad_norm": 1.2326771020889282, + "learning_rate": 4.897581966851809e-05, + "loss": 5.2902, + "step": 15374 + }, + { + "epoch": 0.09143948044533257, + "grad_norm": 1.1512086391448975, + "learning_rate": 4.897568733760638e-05, + "loss": 5.2362, + "step": 15375 + }, + { + "epoch": 0.09144542772861357, + "grad_norm": 2.2404119968414307, + "learning_rate": 4.8975554998325e-05, + "loss": 5.055, + "step": 15376 + }, + { + "epoch": 0.09145137501189457, + "grad_norm": 1.3026318550109863, + "learning_rate": 4.8975422650674005e-05, + "loss": 5.0192, + "step": 15377 + }, + { + "epoch": 0.09145732229517556, + "grad_norm": 1.5808472633361816, + "learning_rate": 4.897529029465344e-05, + "loss": 5.2429, + "step": 15378 + }, + { + "epoch": 0.09146326957845656, + "grad_norm": 1.5761525630950928, + "learning_rate": 4.897515793026335e-05, + "loss": 4.9123, + "step": 15379 + }, + { + "epoch": 0.09146921686173756, + "grad_norm": 1.488484501838684, + "learning_rate": 4.897502555750377e-05, + "loss": 4.8463, + "step": 15380 + }, + { + "epoch": 0.09147516414501855, + "grad_norm": 1.4662736654281616, + "learning_rate": 4.897489317637477e-05, + "loss": 5.3047, + "step": 15381 + }, + { + "epoch": 0.09148111142829955, + "grad_norm": 1.6454370021820068, + "learning_rate": 4.897476078687637e-05, + "loss": 5.2335, + "step": 15382 + }, + { + "epoch": 0.09148705871158055, + "grad_norm": 1.425868034362793, + "learning_rate": 4.8974628389008636e-05, + "loss": 5.2016, + "step": 15383 + }, + { + "epoch": 0.09149300599486154, + "grad_norm": 1.599349021911621, + "learning_rate": 4.8974495982771606e-05, + "loss": 5.4205, + "step": 15384 + }, + { + "epoch": 0.09149895327814254, + "grad_norm": 1.6200257539749146, + "learning_rate": 4.897436356816533e-05, + "loss": 5.5001, + "step": 15385 + }, + { + "epoch": 0.09150490056142355, + "grad_norm": 1.5314574241638184, + "learning_rate": 4.8974231145189844e-05, + "loss": 5.4711, + "step": 15386 + }, + { + "epoch": 0.09151084784470453, + "grad_norm": 1.507489562034607, + "learning_rate": 4.8974098713845206e-05, + "loss": 5.4001, + "step": 15387 + }, + { + "epoch": 0.09151679512798554, + "grad_norm": 1.4561303853988647, + "learning_rate": 4.897396627413146e-05, + "loss": 5.4566, + "step": 15388 + }, + { + "epoch": 0.09152274241126654, + "grad_norm": 1.3273184299468994, + "learning_rate": 4.897383382604865e-05, + "loss": 5.4665, + "step": 15389 + }, + { + "epoch": 0.09152868969454753, + "grad_norm": 1.370138168334961, + "learning_rate": 4.8973701369596814e-05, + "loss": 5.4319, + "step": 15390 + }, + { + "epoch": 0.09153463697782853, + "grad_norm": 1.4831699132919312, + "learning_rate": 4.897356890477601e-05, + "loss": 5.2734, + "step": 15391 + }, + { + "epoch": 0.09154058426110953, + "grad_norm": 1.3152328729629517, + "learning_rate": 4.897343643158629e-05, + "loss": 5.3573, + "step": 15392 + }, + { + "epoch": 0.09154653154439052, + "grad_norm": 1.635460376739502, + "learning_rate": 4.8973303950027684e-05, + "loss": 5.2433, + "step": 15393 + }, + { + "epoch": 0.09155247882767152, + "grad_norm": 1.5252761840820312, + "learning_rate": 4.897317146010024e-05, + "loss": 5.2164, + "step": 15394 + }, + { + "epoch": 0.09155842611095252, + "grad_norm": 1.600043773651123, + "learning_rate": 4.897303896180402e-05, + "loss": 5.4138, + "step": 15395 + }, + { + "epoch": 0.09156437339423351, + "grad_norm": 1.6243258714675903, + "learning_rate": 4.8972906455139056e-05, + "loss": 5.6129, + "step": 15396 + }, + { + "epoch": 0.09157032067751451, + "grad_norm": 1.2726150751113892, + "learning_rate": 4.89727739401054e-05, + "loss": 5.4639, + "step": 15397 + }, + { + "epoch": 0.09157626796079552, + "grad_norm": 2.1045331954956055, + "learning_rate": 4.897264141670309e-05, + "loss": 5.1875, + "step": 15398 + }, + { + "epoch": 0.0915822152440765, + "grad_norm": 2.1204488277435303, + "learning_rate": 4.897250888493218e-05, + "loss": 5.0401, + "step": 15399 + }, + { + "epoch": 0.0915881625273575, + "grad_norm": 1.794190526008606, + "learning_rate": 4.8972376344792716e-05, + "loss": 6.0581, + "step": 15400 + }, + { + "epoch": 0.0915941098106385, + "grad_norm": 2.050788402557373, + "learning_rate": 4.8972243796284746e-05, + "loss": 5.0138, + "step": 15401 + }, + { + "epoch": 0.0916000570939195, + "grad_norm": 2.1165850162506104, + "learning_rate": 4.897211123940831e-05, + "loss": 4.7077, + "step": 15402 + }, + { + "epoch": 0.0916060043772005, + "grad_norm": 1.9797117710113525, + "learning_rate": 4.8971978674163455e-05, + "loss": 4.8248, + "step": 15403 + }, + { + "epoch": 0.09161195166048149, + "grad_norm": 1.922232747077942, + "learning_rate": 4.8971846100550234e-05, + "loss": 4.7655, + "step": 15404 + }, + { + "epoch": 0.09161789894376249, + "grad_norm": 1.7310322523117065, + "learning_rate": 4.897171351856869e-05, + "loss": 5.425, + "step": 15405 + }, + { + "epoch": 0.09162384622704349, + "grad_norm": 1.9186078310012817, + "learning_rate": 4.897158092821887e-05, + "loss": 6.2449, + "step": 15406 + }, + { + "epoch": 0.09162979351032448, + "grad_norm": 1.7470628023147583, + "learning_rate": 4.897144832950081e-05, + "loss": 6.1586, + "step": 15407 + }, + { + "epoch": 0.09163574079360548, + "grad_norm": 1.7828420400619507, + "learning_rate": 4.897131572241457e-05, + "loss": 6.1068, + "step": 15408 + }, + { + "epoch": 0.09164168807688648, + "grad_norm": 1.8831984996795654, + "learning_rate": 4.897118310696019e-05, + "loss": 5.6989, + "step": 15409 + }, + { + "epoch": 0.09164763536016747, + "grad_norm": 1.6138192415237427, + "learning_rate": 4.8971050483137726e-05, + "loss": 5.8222, + "step": 15410 + }, + { + "epoch": 0.09165358264344847, + "grad_norm": 1.6921756267547607, + "learning_rate": 4.897091785094721e-05, + "loss": 5.8559, + "step": 15411 + }, + { + "epoch": 0.09165952992672947, + "grad_norm": 2.007937431335449, + "learning_rate": 4.8970785210388694e-05, + "loss": 5.4523, + "step": 15412 + }, + { + "epoch": 0.09166547721001046, + "grad_norm": 1.8820117712020874, + "learning_rate": 4.8970652561462224e-05, + "loss": 5.6293, + "step": 15413 + }, + { + "epoch": 0.09167142449329146, + "grad_norm": 2.0193300247192383, + "learning_rate": 4.897051990416785e-05, + "loss": 5.8481, + "step": 15414 + }, + { + "epoch": 0.09167737177657247, + "grad_norm": 2.3685405254364014, + "learning_rate": 4.897038723850561e-05, + "loss": 6.2884, + "step": 15415 + }, + { + "epoch": 0.09168331905985345, + "grad_norm": 2.001131534576416, + "learning_rate": 4.897025456447556e-05, + "loss": 5.6747, + "step": 15416 + }, + { + "epoch": 0.09168926634313446, + "grad_norm": 1.9729053974151611, + "learning_rate": 4.897012188207774e-05, + "loss": 5.9019, + "step": 15417 + }, + { + "epoch": 0.09169521362641546, + "grad_norm": 1.7620398998260498, + "learning_rate": 4.896998919131219e-05, + "loss": 5.9498, + "step": 15418 + }, + { + "epoch": 0.09170116090969645, + "grad_norm": 1.6993772983551025, + "learning_rate": 4.896985649217898e-05, + "loss": 5.973, + "step": 15419 + }, + { + "epoch": 0.09170710819297745, + "grad_norm": 1.6905665397644043, + "learning_rate": 4.896972378467813e-05, + "loss": 5.9729, + "step": 15420 + }, + { + "epoch": 0.09171305547625845, + "grad_norm": 1.710838794708252, + "learning_rate": 4.8969591068809706e-05, + "loss": 5.6661, + "step": 15421 + }, + { + "epoch": 0.09171900275953944, + "grad_norm": 1.9235612154006958, + "learning_rate": 4.896945834457374e-05, + "loss": 5.38, + "step": 15422 + }, + { + "epoch": 0.09172495004282044, + "grad_norm": 2.360656976699829, + "learning_rate": 4.896932561197028e-05, + "loss": 5.2199, + "step": 15423 + }, + { + "epoch": 0.09173089732610144, + "grad_norm": 2.403338670730591, + "learning_rate": 4.896919287099938e-05, + "loss": 5.1776, + "step": 15424 + }, + { + "epoch": 0.09173684460938243, + "grad_norm": 1.9474782943725586, + "learning_rate": 4.896906012166108e-05, + "loss": 5.0781, + "step": 15425 + }, + { + "epoch": 0.09174279189266343, + "grad_norm": 1.8974144458770752, + "learning_rate": 4.896892736395543e-05, + "loss": 5.1609, + "step": 15426 + }, + { + "epoch": 0.09174873917594444, + "grad_norm": 2.3854262828826904, + "learning_rate": 4.896879459788247e-05, + "loss": 5.2019, + "step": 15427 + }, + { + "epoch": 0.09175468645922542, + "grad_norm": 2.4181137084960938, + "learning_rate": 4.8968661823442264e-05, + "loss": 5.1216, + "step": 15428 + }, + { + "epoch": 0.09176063374250643, + "grad_norm": 2.266355514526367, + "learning_rate": 4.896852904063484e-05, + "loss": 5.0401, + "step": 15429 + }, + { + "epoch": 0.09176658102578741, + "grad_norm": 2.086296558380127, + "learning_rate": 4.896839624946025e-05, + "loss": 4.8601, + "step": 15430 + }, + { + "epoch": 0.09177252830906842, + "grad_norm": 1.943326473236084, + "learning_rate": 4.896826344991854e-05, + "loss": 4.9978, + "step": 15431 + }, + { + "epoch": 0.09177847559234942, + "grad_norm": 2.0165631771087646, + "learning_rate": 4.896813064200975e-05, + "loss": 5.0379, + "step": 15432 + }, + { + "epoch": 0.0917844228756304, + "grad_norm": 1.7142544984817505, + "learning_rate": 4.896799782573394e-05, + "loss": 5.7101, + "step": 15433 + }, + { + "epoch": 0.09179037015891141, + "grad_norm": 1.9000083208084106, + "learning_rate": 4.896786500109115e-05, + "loss": 5.9536, + "step": 15434 + }, + { + "epoch": 0.09179631744219241, + "grad_norm": 1.6976677179336548, + "learning_rate": 4.8967732168081426e-05, + "loss": 5.4408, + "step": 15435 + }, + { + "epoch": 0.0918022647254734, + "grad_norm": 1.7433068752288818, + "learning_rate": 4.8967599326704815e-05, + "loss": 5.831, + "step": 15436 + }, + { + "epoch": 0.0918082120087544, + "grad_norm": 1.484256625175476, + "learning_rate": 4.896746647696136e-05, + "loss": 5.943, + "step": 15437 + }, + { + "epoch": 0.0918141592920354, + "grad_norm": 2.2480883598327637, + "learning_rate": 4.8967333618851106e-05, + "loss": 5.6634, + "step": 15438 + }, + { + "epoch": 0.09182010657531639, + "grad_norm": 1.3530383110046387, + "learning_rate": 4.896720075237411e-05, + "loss": 5.8981, + "step": 15439 + }, + { + "epoch": 0.09182605385859739, + "grad_norm": 1.451636552810669, + "learning_rate": 4.896706787753041e-05, + "loss": 5.9803, + "step": 15440 + }, + { + "epoch": 0.0918320011418784, + "grad_norm": 1.5904042720794678, + "learning_rate": 4.896693499432006e-05, + "loss": 5.9692, + "step": 15441 + }, + { + "epoch": 0.09183794842515938, + "grad_norm": 1.3971885442733765, + "learning_rate": 4.896680210274309e-05, + "loss": 5.8612, + "step": 15442 + }, + { + "epoch": 0.09184389570844038, + "grad_norm": 1.325842022895813, + "learning_rate": 4.8966669202799564e-05, + "loss": 5.9081, + "step": 15443 + }, + { + "epoch": 0.09184984299172139, + "grad_norm": 1.4639033079147339, + "learning_rate": 4.8966536294489515e-05, + "loss": 5.8395, + "step": 15444 + }, + { + "epoch": 0.09185579027500237, + "grad_norm": 1.248425006866455, + "learning_rate": 4.896640337781301e-05, + "loss": 5.9016, + "step": 15445 + }, + { + "epoch": 0.09186173755828338, + "grad_norm": 1.4250134229660034, + "learning_rate": 4.896627045277007e-05, + "loss": 5.815, + "step": 15446 + }, + { + "epoch": 0.09186768484156438, + "grad_norm": 1.9178589582443237, + "learning_rate": 4.896613751936075e-05, + "loss": 5.9092, + "step": 15447 + }, + { + "epoch": 0.09187363212484537, + "grad_norm": 1.9218472242355347, + "learning_rate": 4.896600457758511e-05, + "loss": 5.7151, + "step": 15448 + }, + { + "epoch": 0.09187957940812637, + "grad_norm": 1.7698949575424194, + "learning_rate": 4.896587162744317e-05, + "loss": 5.709, + "step": 15449 + }, + { + "epoch": 0.09188552669140737, + "grad_norm": 2.5047290325164795, + "learning_rate": 4.8965738668935e-05, + "loss": 5.5417, + "step": 15450 + }, + { + "epoch": 0.09189147397468836, + "grad_norm": 1.9855560064315796, + "learning_rate": 4.896560570206065e-05, + "loss": 5.9572, + "step": 15451 + }, + { + "epoch": 0.09189742125796936, + "grad_norm": 1.8577516078948975, + "learning_rate": 4.896547272682014e-05, + "loss": 4.8775, + "step": 15452 + }, + { + "epoch": 0.09190336854125036, + "grad_norm": 1.8830385208129883, + "learning_rate": 4.896533974321353e-05, + "loss": 4.8617, + "step": 15453 + }, + { + "epoch": 0.09190931582453135, + "grad_norm": 1.5114052295684814, + "learning_rate": 4.896520675124087e-05, + "loss": 4.9485, + "step": 15454 + }, + { + "epoch": 0.09191526310781235, + "grad_norm": 1.6233285665512085, + "learning_rate": 4.8965073750902205e-05, + "loss": 5.1098, + "step": 15455 + }, + { + "epoch": 0.09192121039109336, + "grad_norm": 1.6900150775909424, + "learning_rate": 4.896494074219758e-05, + "loss": 6.025, + "step": 15456 + }, + { + "epoch": 0.09192715767437434, + "grad_norm": 1.3984570503234863, + "learning_rate": 4.8964807725127046e-05, + "loss": 5.888, + "step": 15457 + }, + { + "epoch": 0.09193310495765535, + "grad_norm": 1.7069528102874756, + "learning_rate": 4.896467469969064e-05, + "loss": 5.6435, + "step": 15458 + }, + { + "epoch": 0.09193905224093633, + "grad_norm": 1.641513705253601, + "learning_rate": 4.896454166588842e-05, + "loss": 5.5641, + "step": 15459 + }, + { + "epoch": 0.09194499952421734, + "grad_norm": 1.8448737859725952, + "learning_rate": 4.896440862372042e-05, + "loss": 5.5673, + "step": 15460 + }, + { + "epoch": 0.09195094680749834, + "grad_norm": 1.7696945667266846, + "learning_rate": 4.8964275573186694e-05, + "loss": 5.4383, + "step": 15461 + }, + { + "epoch": 0.09195689409077933, + "grad_norm": 2.7951743602752686, + "learning_rate": 4.8964142514287285e-05, + "loss": 4.2996, + "step": 15462 + }, + { + "epoch": 0.09196284137406033, + "grad_norm": 2.5503883361816406, + "learning_rate": 4.8964009447022246e-05, + "loss": 4.2864, + "step": 15463 + }, + { + "epoch": 0.09196878865734133, + "grad_norm": 2.2069225311279297, + "learning_rate": 4.896387637139161e-05, + "loss": 4.3818, + "step": 15464 + }, + { + "epoch": 0.09197473594062232, + "grad_norm": 2.34734845161438, + "learning_rate": 4.8963743287395444e-05, + "loss": 4.2951, + "step": 15465 + }, + { + "epoch": 0.09198068322390332, + "grad_norm": 2.2955567836761475, + "learning_rate": 4.896361019503378e-05, + "loss": 4.3349, + "step": 15466 + }, + { + "epoch": 0.09198663050718432, + "grad_norm": 2.3519480228424072, + "learning_rate": 4.8963477094306666e-05, + "loss": 4.2685, + "step": 15467 + }, + { + "epoch": 0.09199257779046531, + "grad_norm": 2.3862032890319824, + "learning_rate": 4.896334398521415e-05, + "loss": 4.1333, + "step": 15468 + }, + { + "epoch": 0.09199852507374631, + "grad_norm": 2.1290738582611084, + "learning_rate": 4.896321086775627e-05, + "loss": 4.7918, + "step": 15469 + }, + { + "epoch": 0.09200447235702731, + "grad_norm": 2.2130253314971924, + "learning_rate": 4.8963077741933095e-05, + "loss": 5.208, + "step": 15470 + }, + { + "epoch": 0.0920104196403083, + "grad_norm": 2.063810110092163, + "learning_rate": 4.896294460774464e-05, + "loss": 5.1891, + "step": 15471 + }, + { + "epoch": 0.0920163669235893, + "grad_norm": 2.068791627883911, + "learning_rate": 4.8962811465190984e-05, + "loss": 5.2855, + "step": 15472 + }, + { + "epoch": 0.0920223142068703, + "grad_norm": 1.8504056930541992, + "learning_rate": 4.896267831427215e-05, + "loss": 5.0159, + "step": 15473 + }, + { + "epoch": 0.0920282614901513, + "grad_norm": 2.150820255279541, + "learning_rate": 4.89625451549882e-05, + "loss": 5.7728, + "step": 15474 + }, + { + "epoch": 0.0920342087734323, + "grad_norm": 2.3655643463134766, + "learning_rate": 4.8962411987339165e-05, + "loss": 5.4863, + "step": 15475 + }, + { + "epoch": 0.0920401560567133, + "grad_norm": 1.509820818901062, + "learning_rate": 4.8962278811325105e-05, + "loss": 5.5682, + "step": 15476 + }, + { + "epoch": 0.09204610333999429, + "grad_norm": 1.8581949472427368, + "learning_rate": 4.896214562694605e-05, + "loss": 5.6875, + "step": 15477 + }, + { + "epoch": 0.09205205062327529, + "grad_norm": 2.028116464614868, + "learning_rate": 4.8962012434202075e-05, + "loss": 5.3495, + "step": 15478 + }, + { + "epoch": 0.09205799790655629, + "grad_norm": 1.9395058155059814, + "learning_rate": 4.89618792330932e-05, + "loss": 5.5616, + "step": 15479 + }, + { + "epoch": 0.09206394518983728, + "grad_norm": 1.9281854629516602, + "learning_rate": 4.896174602361948e-05, + "loss": 5.6449, + "step": 15480 + }, + { + "epoch": 0.09206989247311828, + "grad_norm": 1.7750074863433838, + "learning_rate": 4.896161280578097e-05, + "loss": 5.1178, + "step": 15481 + }, + { + "epoch": 0.09207583975639928, + "grad_norm": 2.0160205364227295, + "learning_rate": 4.89614795795777e-05, + "loss": 5.4698, + "step": 15482 + }, + { + "epoch": 0.09208178703968027, + "grad_norm": 2.0041770935058594, + "learning_rate": 4.896134634500972e-05, + "loss": 4.6989, + "step": 15483 + }, + { + "epoch": 0.09208773432296127, + "grad_norm": 1.9916999340057373, + "learning_rate": 4.896121310207708e-05, + "loss": 4.6296, + "step": 15484 + }, + { + "epoch": 0.09209368160624228, + "grad_norm": 1.62458336353302, + "learning_rate": 4.8961079850779845e-05, + "loss": 5.1147, + "step": 15485 + }, + { + "epoch": 0.09209962888952326, + "grad_norm": 1.8349764347076416, + "learning_rate": 4.8960946591118036e-05, + "loss": 5.3646, + "step": 15486 + }, + { + "epoch": 0.09210557617280427, + "grad_norm": 2.0250589847564697, + "learning_rate": 4.89608133230917e-05, + "loss": 5.7467, + "step": 15487 + }, + { + "epoch": 0.09211152345608525, + "grad_norm": 1.8945664167404175, + "learning_rate": 4.89606800467009e-05, + "loss": 5.5526, + "step": 15488 + }, + { + "epoch": 0.09211747073936626, + "grad_norm": 2.1056711673736572, + "learning_rate": 4.896054676194568e-05, + "loss": 4.8553, + "step": 15489 + }, + { + "epoch": 0.09212341802264726, + "grad_norm": 2.0394606590270996, + "learning_rate": 4.896041346882607e-05, + "loss": 5.4427, + "step": 15490 + }, + { + "epoch": 0.09212936530592825, + "grad_norm": 2.3078689575195312, + "learning_rate": 4.896028016734213e-05, + "loss": 5.3668, + "step": 15491 + }, + { + "epoch": 0.09213531258920925, + "grad_norm": 2.1227409839630127, + "learning_rate": 4.8960146857493904e-05, + "loss": 5.6314, + "step": 15492 + }, + { + "epoch": 0.09214125987249025, + "grad_norm": 2.156165838241577, + "learning_rate": 4.896001353928144e-05, + "loss": 5.5088, + "step": 15493 + }, + { + "epoch": 0.09214720715577124, + "grad_norm": 1.8915730714797974, + "learning_rate": 4.895988021270478e-05, + "loss": 5.5636, + "step": 15494 + }, + { + "epoch": 0.09215315443905224, + "grad_norm": 1.8041549921035767, + "learning_rate": 4.895974687776398e-05, + "loss": 5.5213, + "step": 15495 + }, + { + "epoch": 0.09215910172233324, + "grad_norm": 1.8982187509536743, + "learning_rate": 4.8959613534459074e-05, + "loss": 5.7038, + "step": 15496 + }, + { + "epoch": 0.09216504900561423, + "grad_norm": 1.9235600233078003, + "learning_rate": 4.895948018279012e-05, + "loss": 5.514, + "step": 15497 + }, + { + "epoch": 0.09217099628889523, + "grad_norm": 2.284212112426758, + "learning_rate": 4.895934682275715e-05, + "loss": 5.4624, + "step": 15498 + }, + { + "epoch": 0.09217694357217623, + "grad_norm": 2.770934820175171, + "learning_rate": 4.895921345436022e-05, + "loss": 4.7516, + "step": 15499 + }, + { + "epoch": 0.09218289085545722, + "grad_norm": 2.054158926010132, + "learning_rate": 4.895908007759939e-05, + "loss": 5.6444, + "step": 15500 + }, + { + "epoch": 0.09218883813873822, + "grad_norm": 2.352905511856079, + "learning_rate": 4.895894669247468e-05, + "loss": 4.7985, + "step": 15501 + }, + { + "epoch": 0.09219478542201923, + "grad_norm": 2.612039804458618, + "learning_rate": 4.895881329898615e-05, + "loss": 4.769, + "step": 15502 + }, + { + "epoch": 0.09220073270530021, + "grad_norm": 2.1274194717407227, + "learning_rate": 4.8958679897133854e-05, + "loss": 4.6185, + "step": 15503 + }, + { + "epoch": 0.09220667998858122, + "grad_norm": 2.2458853721618652, + "learning_rate": 4.895854648691782e-05, + "loss": 4.8576, + "step": 15504 + }, + { + "epoch": 0.09221262727186222, + "grad_norm": 2.415526866912842, + "learning_rate": 4.895841306833811e-05, + "loss": 4.999, + "step": 15505 + }, + { + "epoch": 0.0922185745551432, + "grad_norm": 1.8172876834869385, + "learning_rate": 4.8958279641394765e-05, + "loss": 5.1992, + "step": 15506 + }, + { + "epoch": 0.09222452183842421, + "grad_norm": 2.0568878650665283, + "learning_rate": 4.8958146206087826e-05, + "loss": 5.1348, + "step": 15507 + }, + { + "epoch": 0.09223046912170521, + "grad_norm": 2.152869701385498, + "learning_rate": 4.895801276241736e-05, + "loss": 4.9832, + "step": 15508 + }, + { + "epoch": 0.0922364164049862, + "grad_norm": 1.8191282749176025, + "learning_rate": 4.895787931038339e-05, + "loss": 5.3098, + "step": 15509 + }, + { + "epoch": 0.0922423636882672, + "grad_norm": 1.9511895179748535, + "learning_rate": 4.895774584998597e-05, + "loss": 5.5763, + "step": 15510 + }, + { + "epoch": 0.0922483109715482, + "grad_norm": 1.8735122680664062, + "learning_rate": 4.895761238122515e-05, + "loss": 5.3644, + "step": 15511 + }, + { + "epoch": 0.09225425825482919, + "grad_norm": 1.672721028327942, + "learning_rate": 4.895747890410098e-05, + "loss": 5.2794, + "step": 15512 + }, + { + "epoch": 0.0922602055381102, + "grad_norm": 1.5318527221679688, + "learning_rate": 4.89573454186135e-05, + "loss": 5.3575, + "step": 15513 + }, + { + "epoch": 0.0922661528213912, + "grad_norm": 1.8192704916000366, + "learning_rate": 4.895721192476275e-05, + "loss": 5.498, + "step": 15514 + }, + { + "epoch": 0.09227210010467218, + "grad_norm": 1.948249340057373, + "learning_rate": 4.895707842254879e-05, + "loss": 5.6955, + "step": 15515 + }, + { + "epoch": 0.09227804738795319, + "grad_norm": 2.1378414630889893, + "learning_rate": 4.895694491197166e-05, + "loss": 5.4999, + "step": 15516 + }, + { + "epoch": 0.09228399467123417, + "grad_norm": 2.057358980178833, + "learning_rate": 4.8956811393031414e-05, + "loss": 4.7234, + "step": 15517 + }, + { + "epoch": 0.09228994195451518, + "grad_norm": 1.9550749063491821, + "learning_rate": 4.895667786572809e-05, + "loss": 5.7611, + "step": 15518 + }, + { + "epoch": 0.09229588923779618, + "grad_norm": 2.120396852493286, + "learning_rate": 4.8956544330061734e-05, + "loss": 5.8707, + "step": 15519 + }, + { + "epoch": 0.09230183652107717, + "grad_norm": 1.8432284593582153, + "learning_rate": 4.8956410786032404e-05, + "loss": 5.7512, + "step": 15520 + }, + { + "epoch": 0.09230778380435817, + "grad_norm": 1.738993525505066, + "learning_rate": 4.895627723364013e-05, + "loss": 5.2099, + "step": 15521 + }, + { + "epoch": 0.09231373108763917, + "grad_norm": 1.4885916709899902, + "learning_rate": 4.895614367288497e-05, + "loss": 5.6817, + "step": 15522 + }, + { + "epoch": 0.09231967837092016, + "grad_norm": 1.9712351560592651, + "learning_rate": 4.895601010376697e-05, + "loss": 5.4247, + "step": 15523 + }, + { + "epoch": 0.09232562565420116, + "grad_norm": 1.6669690608978271, + "learning_rate": 4.895587652628617e-05, + "loss": 5.2189, + "step": 15524 + }, + { + "epoch": 0.09233157293748216, + "grad_norm": 2.1034297943115234, + "learning_rate": 4.895574294044262e-05, + "loss": 5.4772, + "step": 15525 + }, + { + "epoch": 0.09233752022076315, + "grad_norm": 2.3692588806152344, + "learning_rate": 4.895560934623637e-05, + "loss": 5.002, + "step": 15526 + }, + { + "epoch": 0.09234346750404415, + "grad_norm": 2.708406686782837, + "learning_rate": 4.8955475743667464e-05, + "loss": 4.9923, + "step": 15527 + }, + { + "epoch": 0.09234941478732515, + "grad_norm": 2.4986281394958496, + "learning_rate": 4.895534213273595e-05, + "loss": 4.7859, + "step": 15528 + }, + { + "epoch": 0.09235536207060614, + "grad_norm": 2.4715240001678467, + "learning_rate": 4.895520851344187e-05, + "loss": 5.2135, + "step": 15529 + }, + { + "epoch": 0.09236130935388714, + "grad_norm": 1.77085280418396, + "learning_rate": 4.895507488578528e-05, + "loss": 5.4675, + "step": 15530 + }, + { + "epoch": 0.09236725663716815, + "grad_norm": 1.4845975637435913, + "learning_rate": 4.8954941249766225e-05, + "loss": 5.8627, + "step": 15531 + }, + { + "epoch": 0.09237320392044913, + "grad_norm": 2.0753140449523926, + "learning_rate": 4.8954807605384734e-05, + "loss": 5.8246, + "step": 15532 + }, + { + "epoch": 0.09237915120373014, + "grad_norm": 1.5671929121017456, + "learning_rate": 4.895467395264088e-05, + "loss": 5.8189, + "step": 15533 + }, + { + "epoch": 0.09238509848701114, + "grad_norm": 1.749223232269287, + "learning_rate": 4.895454029153469e-05, + "loss": 5.9183, + "step": 15534 + }, + { + "epoch": 0.09239104577029213, + "grad_norm": 1.7186611890792847, + "learning_rate": 4.895440662206622e-05, + "loss": 5.84, + "step": 15535 + }, + { + "epoch": 0.09239699305357313, + "grad_norm": 1.654483437538147, + "learning_rate": 4.895427294423551e-05, + "loss": 5.4055, + "step": 15536 + }, + { + "epoch": 0.09240294033685413, + "grad_norm": 1.7109687328338623, + "learning_rate": 4.895413925804261e-05, + "loss": 5.3028, + "step": 15537 + }, + { + "epoch": 0.09240888762013512, + "grad_norm": 1.9221105575561523, + "learning_rate": 4.895400556348757e-05, + "loss": 5.2911, + "step": 15538 + }, + { + "epoch": 0.09241483490341612, + "grad_norm": 1.9464010000228882, + "learning_rate": 4.895387186057044e-05, + "loss": 5.5883, + "step": 15539 + }, + { + "epoch": 0.09242078218669712, + "grad_norm": 1.9429137706756592, + "learning_rate": 4.8953738149291254e-05, + "loss": 5.7164, + "step": 15540 + }, + { + "epoch": 0.09242672946997811, + "grad_norm": 1.7792669534683228, + "learning_rate": 4.8953604429650065e-05, + "loss": 5.7924, + "step": 15541 + }, + { + "epoch": 0.09243267675325911, + "grad_norm": 2.2124290466308594, + "learning_rate": 4.895347070164692e-05, + "loss": 5.4432, + "step": 15542 + }, + { + "epoch": 0.09243862403654012, + "grad_norm": 1.6349585056304932, + "learning_rate": 4.8953336965281873e-05, + "loss": 5.6975, + "step": 15543 + }, + { + "epoch": 0.0924445713198211, + "grad_norm": 2.01434063911438, + "learning_rate": 4.895320322055496e-05, + "loss": 5.3564, + "step": 15544 + }, + { + "epoch": 0.0924505186031021, + "grad_norm": 1.8110109567642212, + "learning_rate": 4.895306946746623e-05, + "loss": 5.3061, + "step": 15545 + }, + { + "epoch": 0.0924564658863831, + "grad_norm": 1.6687593460083008, + "learning_rate": 4.895293570601573e-05, + "loss": 5.4061, + "step": 15546 + }, + { + "epoch": 0.0924624131696641, + "grad_norm": 1.7488101720809937, + "learning_rate": 4.895280193620351e-05, + "loss": 5.4726, + "step": 15547 + }, + { + "epoch": 0.0924683604529451, + "grad_norm": 1.9059126377105713, + "learning_rate": 4.895266815802961e-05, + "loss": 5.9665, + "step": 15548 + }, + { + "epoch": 0.09247430773622609, + "grad_norm": 1.9732307195663452, + "learning_rate": 4.8952534371494084e-05, + "loss": 6.007, + "step": 15549 + }, + { + "epoch": 0.09248025501950709, + "grad_norm": 1.792325496673584, + "learning_rate": 4.895240057659697e-05, + "loss": 5.9466, + "step": 15550 + }, + { + "epoch": 0.09248620230278809, + "grad_norm": 1.7282743453979492, + "learning_rate": 4.895226677333833e-05, + "loss": 5.456, + "step": 15551 + }, + { + "epoch": 0.09249214958606908, + "grad_norm": 1.5014616250991821, + "learning_rate": 4.89521329617182e-05, + "loss": 5.0257, + "step": 15552 + }, + { + "epoch": 0.09249809686935008, + "grad_norm": 1.5420494079589844, + "learning_rate": 4.8951999141736624e-05, + "loss": 5.0657, + "step": 15553 + }, + { + "epoch": 0.09250404415263108, + "grad_norm": 1.4273606538772583, + "learning_rate": 4.895186531339365e-05, + "loss": 5.3431, + "step": 15554 + }, + { + "epoch": 0.09250999143591207, + "grad_norm": 1.9525657892227173, + "learning_rate": 4.895173147668933e-05, + "loss": 5.514, + "step": 15555 + }, + { + "epoch": 0.09251593871919307, + "grad_norm": 2.7004175186157227, + "learning_rate": 4.895159763162371e-05, + "loss": 5.3548, + "step": 15556 + }, + { + "epoch": 0.09252188600247407, + "grad_norm": 2.5703442096710205, + "learning_rate": 4.8951463778196835e-05, + "loss": 5.4275, + "step": 15557 + }, + { + "epoch": 0.09252783328575506, + "grad_norm": 2.4033594131469727, + "learning_rate": 4.895132991640875e-05, + "loss": 5.285, + "step": 15558 + }, + { + "epoch": 0.09253378056903606, + "grad_norm": 2.0295355319976807, + "learning_rate": 4.89511960462595e-05, + "loss": 5.1196, + "step": 15559 + }, + { + "epoch": 0.09253972785231707, + "grad_norm": 2.0739188194274902, + "learning_rate": 4.895106216774914e-05, + "loss": 4.7362, + "step": 15560 + }, + { + "epoch": 0.09254567513559805, + "grad_norm": 2.2429590225219727, + "learning_rate": 4.895092828087771e-05, + "loss": 5.0749, + "step": 15561 + }, + { + "epoch": 0.09255162241887906, + "grad_norm": 1.9738318920135498, + "learning_rate": 4.895079438564526e-05, + "loss": 5.6755, + "step": 15562 + }, + { + "epoch": 0.09255756970216006, + "grad_norm": 2.692275047302246, + "learning_rate": 4.895066048205183e-05, + "loss": 5.3146, + "step": 15563 + }, + { + "epoch": 0.09256351698544105, + "grad_norm": 2.774864912033081, + "learning_rate": 4.895052657009748e-05, + "loss": 5.1116, + "step": 15564 + }, + { + "epoch": 0.09256946426872205, + "grad_norm": 2.5513851642608643, + "learning_rate": 4.895039264978224e-05, + "loss": 5.0464, + "step": 15565 + }, + { + "epoch": 0.09257541155200305, + "grad_norm": 2.2035319805145264, + "learning_rate": 4.895025872110617e-05, + "loss": 5.1499, + "step": 15566 + }, + { + "epoch": 0.09258135883528404, + "grad_norm": 1.669402837753296, + "learning_rate": 4.8950124784069305e-05, + "loss": 5.5006, + "step": 15567 + }, + { + "epoch": 0.09258730611856504, + "grad_norm": 1.9433900117874146, + "learning_rate": 4.894999083867171e-05, + "loss": 5.1423, + "step": 15568 + }, + { + "epoch": 0.09259325340184604, + "grad_norm": 2.2401936054229736, + "learning_rate": 4.8949856884913416e-05, + "loss": 4.8937, + "step": 15569 + }, + { + "epoch": 0.09259920068512703, + "grad_norm": 2.094503164291382, + "learning_rate": 4.894972292279447e-05, + "loss": 4.8554, + "step": 15570 + }, + { + "epoch": 0.09260514796840803, + "grad_norm": 2.1677212715148926, + "learning_rate": 4.894958895231493e-05, + "loss": 4.7446, + "step": 15571 + }, + { + "epoch": 0.09261109525168904, + "grad_norm": 2.0262231826782227, + "learning_rate": 4.894945497347483e-05, + "loss": 4.8282, + "step": 15572 + }, + { + "epoch": 0.09261704253497002, + "grad_norm": 1.9491705894470215, + "learning_rate": 4.894932098627423e-05, + "loss": 4.9579, + "step": 15573 + }, + { + "epoch": 0.09262298981825103, + "grad_norm": 2.0898170471191406, + "learning_rate": 4.8949186990713165e-05, + "loss": 4.8197, + "step": 15574 + }, + { + "epoch": 0.09262893710153201, + "grad_norm": 1.8452088832855225, + "learning_rate": 4.894905298679169e-05, + "loss": 4.8359, + "step": 15575 + }, + { + "epoch": 0.09263488438481302, + "grad_norm": 2.1573541164398193, + "learning_rate": 4.894891897450984e-05, + "loss": 4.5882, + "step": 15576 + }, + { + "epoch": 0.09264083166809402, + "grad_norm": 2.1609156131744385, + "learning_rate": 4.894878495386768e-05, + "loss": 4.7556, + "step": 15577 + }, + { + "epoch": 0.092646778951375, + "grad_norm": 1.9062503576278687, + "learning_rate": 4.894865092486524e-05, + "loss": 4.6933, + "step": 15578 + }, + { + "epoch": 0.09265272623465601, + "grad_norm": 1.8876394033432007, + "learning_rate": 4.894851688750257e-05, + "loss": 4.7317, + "step": 15579 + }, + { + "epoch": 0.09265867351793701, + "grad_norm": 1.9106816053390503, + "learning_rate": 4.894838284177972e-05, + "loss": 4.7597, + "step": 15580 + }, + { + "epoch": 0.092664620801218, + "grad_norm": 1.8116264343261719, + "learning_rate": 4.894824878769674e-05, + "loss": 4.8865, + "step": 15581 + }, + { + "epoch": 0.092670568084499, + "grad_norm": 1.8492180109024048, + "learning_rate": 4.894811472525368e-05, + "loss": 4.7282, + "step": 15582 + }, + { + "epoch": 0.09267651536778, + "grad_norm": 1.9450536966323853, + "learning_rate": 4.894798065445058e-05, + "loss": 5.0777, + "step": 15583 + }, + { + "epoch": 0.09268246265106099, + "grad_norm": 2.2099180221557617, + "learning_rate": 4.894784657528748e-05, + "loss": 5.421, + "step": 15584 + }, + { + "epoch": 0.09268840993434199, + "grad_norm": 2.2239253520965576, + "learning_rate": 4.8947712487764436e-05, + "loss": 5.8346, + "step": 15585 + }, + { + "epoch": 0.092694357217623, + "grad_norm": 1.7867511510849, + "learning_rate": 4.894757839188149e-05, + "loss": 5.9306, + "step": 15586 + }, + { + "epoch": 0.09270030450090398, + "grad_norm": 1.6986007690429688, + "learning_rate": 4.89474442876387e-05, + "loss": 5.0704, + "step": 15587 + }, + { + "epoch": 0.09270625178418498, + "grad_norm": 1.7906185388565063, + "learning_rate": 4.89473101750361e-05, + "loss": 5.1951, + "step": 15588 + }, + { + "epoch": 0.09271219906746599, + "grad_norm": 1.7287026643753052, + "learning_rate": 4.894717605407374e-05, + "loss": 5.1736, + "step": 15589 + }, + { + "epoch": 0.09271814635074697, + "grad_norm": 1.6170624494552612, + "learning_rate": 4.8947041924751665e-05, + "loss": 5.5399, + "step": 15590 + }, + { + "epoch": 0.09272409363402798, + "grad_norm": 1.7556488513946533, + "learning_rate": 4.894690778706994e-05, + "loss": 5.574, + "step": 15591 + }, + { + "epoch": 0.09273004091730898, + "grad_norm": 2.346484899520874, + "learning_rate": 4.894677364102859e-05, + "loss": 5.0062, + "step": 15592 + }, + { + "epoch": 0.09273598820058997, + "grad_norm": 2.1376540660858154, + "learning_rate": 4.894663948662766e-05, + "loss": 5.1377, + "step": 15593 + }, + { + "epoch": 0.09274193548387097, + "grad_norm": 2.2489631175994873, + "learning_rate": 4.894650532386721e-05, + "loss": 5.1058, + "step": 15594 + }, + { + "epoch": 0.09274788276715197, + "grad_norm": 1.984281063079834, + "learning_rate": 4.8946371152747285e-05, + "loss": 5.1223, + "step": 15595 + }, + { + "epoch": 0.09275383005043296, + "grad_norm": 1.9387162923812866, + "learning_rate": 4.8946236973267935e-05, + "loss": 5.5121, + "step": 15596 + }, + { + "epoch": 0.09275977733371396, + "grad_norm": 1.8052873611450195, + "learning_rate": 4.894610278542919e-05, + "loss": 5.2101, + "step": 15597 + }, + { + "epoch": 0.09276572461699496, + "grad_norm": 2.558525562286377, + "learning_rate": 4.894596858923111e-05, + "loss": 4.6659, + "step": 15598 + }, + { + "epoch": 0.09277167190027595, + "grad_norm": 1.700897455215454, + "learning_rate": 4.8945834384673746e-05, + "loss": 5.4634, + "step": 15599 + }, + { + "epoch": 0.09277761918355695, + "grad_norm": 1.4691836833953857, + "learning_rate": 4.8945700171757134e-05, + "loss": 5.3873, + "step": 15600 + }, + { + "epoch": 0.09278356646683796, + "grad_norm": 1.4673740863800049, + "learning_rate": 4.894556595048132e-05, + "loss": 5.3917, + "step": 15601 + }, + { + "epoch": 0.09278951375011894, + "grad_norm": 1.6252011060714722, + "learning_rate": 4.894543172084637e-05, + "loss": 5.2003, + "step": 15602 + }, + { + "epoch": 0.09279546103339995, + "grad_norm": 1.6320288181304932, + "learning_rate": 4.89452974828523e-05, + "loss": 5.4821, + "step": 15603 + }, + { + "epoch": 0.09280140831668093, + "grad_norm": 2.1444239616394043, + "learning_rate": 4.8945163236499194e-05, + "loss": 5.9926, + "step": 15604 + }, + { + "epoch": 0.09280735559996194, + "grad_norm": 2.3000271320343018, + "learning_rate": 4.894502898178707e-05, + "loss": 4.7545, + "step": 15605 + }, + { + "epoch": 0.09281330288324294, + "grad_norm": 2.259962797164917, + "learning_rate": 4.894489471871597e-05, + "loss": 5.1292, + "step": 15606 + }, + { + "epoch": 0.09281925016652393, + "grad_norm": 2.5522921085357666, + "learning_rate": 4.8944760447285977e-05, + "loss": 5.1226, + "step": 15607 + }, + { + "epoch": 0.09282519744980493, + "grad_norm": 1.7621963024139404, + "learning_rate": 4.8944626167497096e-05, + "loss": 5.5405, + "step": 15608 + }, + { + "epoch": 0.09283114473308593, + "grad_norm": 1.6631364822387695, + "learning_rate": 4.894449187934941e-05, + "loss": 5.4332, + "step": 15609 + }, + { + "epoch": 0.09283709201636692, + "grad_norm": 1.695904016494751, + "learning_rate": 4.894435758284294e-05, + "loss": 5.4989, + "step": 15610 + }, + { + "epoch": 0.09284303929964792, + "grad_norm": 2.0772507190704346, + "learning_rate": 4.894422327797774e-05, + "loss": 5.0412, + "step": 15611 + }, + { + "epoch": 0.09284898658292892, + "grad_norm": 1.959685206413269, + "learning_rate": 4.894408896475386e-05, + "loss": 5.2749, + "step": 15612 + }, + { + "epoch": 0.09285493386620991, + "grad_norm": 2.0305607318878174, + "learning_rate": 4.894395464317135e-05, + "loss": 5.6227, + "step": 15613 + }, + { + "epoch": 0.09286088114949091, + "grad_norm": 1.7631112337112427, + "learning_rate": 4.894382031323026e-05, + "loss": 5.4396, + "step": 15614 + }, + { + "epoch": 0.09286682843277191, + "grad_norm": 1.8171305656433105, + "learning_rate": 4.894368597493062e-05, + "loss": 5.2498, + "step": 15615 + }, + { + "epoch": 0.0928727757160529, + "grad_norm": 2.123805522918701, + "learning_rate": 4.894355162827249e-05, + "loss": 5.8113, + "step": 15616 + }, + { + "epoch": 0.0928787229993339, + "grad_norm": 1.840071201324463, + "learning_rate": 4.894341727325591e-05, + "loss": 5.6394, + "step": 15617 + }, + { + "epoch": 0.0928846702826149, + "grad_norm": 1.7636733055114746, + "learning_rate": 4.8943282909880935e-05, + "loss": 5.5515, + "step": 15618 + }, + { + "epoch": 0.0928906175658959, + "grad_norm": 1.956026315689087, + "learning_rate": 4.89431485381476e-05, + "loss": 5.1716, + "step": 15619 + }, + { + "epoch": 0.0928965648491769, + "grad_norm": 2.2381720542907715, + "learning_rate": 4.894301415805597e-05, + "loss": 4.9692, + "step": 15620 + }, + { + "epoch": 0.0929025121324579, + "grad_norm": 2.178999423980713, + "learning_rate": 4.894287976960607e-05, + "loss": 4.9732, + "step": 15621 + }, + { + "epoch": 0.09290845941573889, + "grad_norm": 2.1932144165039062, + "learning_rate": 4.894274537279796e-05, + "loss": 4.9497, + "step": 15622 + }, + { + "epoch": 0.09291440669901989, + "grad_norm": 2.093252182006836, + "learning_rate": 4.894261096763169e-05, + "loss": 4.7642, + "step": 15623 + }, + { + "epoch": 0.09292035398230089, + "grad_norm": 1.785686731338501, + "learning_rate": 4.89424765541073e-05, + "loss": 5.1449, + "step": 15624 + }, + { + "epoch": 0.09292630126558188, + "grad_norm": 2.250986099243164, + "learning_rate": 4.894234213222484e-05, + "loss": 4.8503, + "step": 15625 + }, + { + "epoch": 0.09293224854886288, + "grad_norm": 1.8585362434387207, + "learning_rate": 4.8942207701984355e-05, + "loss": 4.582, + "step": 15626 + }, + { + "epoch": 0.09293819583214388, + "grad_norm": 2.080742597579956, + "learning_rate": 4.894207326338589e-05, + "loss": 4.4912, + "step": 15627 + }, + { + "epoch": 0.09294414311542487, + "grad_norm": 2.422774076461792, + "learning_rate": 4.8941938816429495e-05, + "loss": 4.4227, + "step": 15628 + }, + { + "epoch": 0.09295009039870587, + "grad_norm": 2.3304965496063232, + "learning_rate": 4.8941804361115215e-05, + "loss": 4.2265, + "step": 15629 + }, + { + "epoch": 0.09295603768198687, + "grad_norm": 2.619837522506714, + "learning_rate": 4.8941669897443105e-05, + "loss": 4.6812, + "step": 15630 + }, + { + "epoch": 0.09296198496526786, + "grad_norm": 2.4924118518829346, + "learning_rate": 4.89415354254132e-05, + "loss": 4.5081, + "step": 15631 + }, + { + "epoch": 0.09296793224854887, + "grad_norm": 2.5034751892089844, + "learning_rate": 4.894140094502556e-05, + "loss": 4.3356, + "step": 15632 + }, + { + "epoch": 0.09297387953182985, + "grad_norm": 2.599963665008545, + "learning_rate": 4.894126645628021e-05, + "loss": 4.6952, + "step": 15633 + }, + { + "epoch": 0.09297982681511086, + "grad_norm": 2.189516544342041, + "learning_rate": 4.894113195917722e-05, + "loss": 5.75, + "step": 15634 + }, + { + "epoch": 0.09298577409839186, + "grad_norm": 2.5768351554870605, + "learning_rate": 4.894099745371663e-05, + "loss": 5.9257, + "step": 15635 + }, + { + "epoch": 0.09299172138167285, + "grad_norm": 2.2909457683563232, + "learning_rate": 4.894086293989848e-05, + "loss": 5.484, + "step": 15636 + }, + { + "epoch": 0.09299766866495385, + "grad_norm": 2.0447487831115723, + "learning_rate": 4.894072841772282e-05, + "loss": 5.2952, + "step": 15637 + }, + { + "epoch": 0.09300361594823485, + "grad_norm": 1.8934963941574097, + "learning_rate": 4.894059388718971e-05, + "loss": 5.3498, + "step": 15638 + }, + { + "epoch": 0.09300956323151584, + "grad_norm": 1.9989632368087769, + "learning_rate": 4.894045934829919e-05, + "loss": 5.55, + "step": 15639 + }, + { + "epoch": 0.09301551051479684, + "grad_norm": 1.4955580234527588, + "learning_rate": 4.8940324801051285e-05, + "loss": 5.1978, + "step": 15640 + }, + { + "epoch": 0.09302145779807784, + "grad_norm": 1.8308879137039185, + "learning_rate": 4.8940190245446074e-05, + "loss": 5.5448, + "step": 15641 + }, + { + "epoch": 0.09302740508135883, + "grad_norm": 1.4997726678848267, + "learning_rate": 4.8940055681483576e-05, + "loss": 5.353, + "step": 15642 + }, + { + "epoch": 0.09303335236463983, + "grad_norm": 1.5643866062164307, + "learning_rate": 4.8939921109163864e-05, + "loss": 5.1456, + "step": 15643 + }, + { + "epoch": 0.09303929964792083, + "grad_norm": 1.8125799894332886, + "learning_rate": 4.8939786528486967e-05, + "loss": 5.3456, + "step": 15644 + }, + { + "epoch": 0.09304524693120182, + "grad_norm": 1.6802864074707031, + "learning_rate": 4.893965193945294e-05, + "loss": 5.279, + "step": 15645 + }, + { + "epoch": 0.09305119421448282, + "grad_norm": 1.4397536516189575, + "learning_rate": 4.893951734206182e-05, + "loss": 5.9849, + "step": 15646 + }, + { + "epoch": 0.09305714149776383, + "grad_norm": 1.618416428565979, + "learning_rate": 4.893938273631368e-05, + "loss": 5.231, + "step": 15647 + }, + { + "epoch": 0.09306308878104481, + "grad_norm": 1.4833893775939941, + "learning_rate": 4.8939248122208537e-05, + "loss": 5.2883, + "step": 15648 + }, + { + "epoch": 0.09306903606432582, + "grad_norm": 1.2709630727767944, + "learning_rate": 4.8939113499746446e-05, + "loss": 5.1042, + "step": 15649 + }, + { + "epoch": 0.09307498334760682, + "grad_norm": 1.2770884037017822, + "learning_rate": 4.893897886892747e-05, + "loss": 5.0682, + "step": 15650 + }, + { + "epoch": 0.0930809306308878, + "grad_norm": 1.4511629343032837, + "learning_rate": 4.893884422975163e-05, + "loss": 5.0904, + "step": 15651 + }, + { + "epoch": 0.09308687791416881, + "grad_norm": 1.7428641319274902, + "learning_rate": 4.8938709582219e-05, + "loss": 5.2569, + "step": 15652 + }, + { + "epoch": 0.09309282519744981, + "grad_norm": 1.5430729389190674, + "learning_rate": 4.89385749263296e-05, + "loss": 5.1698, + "step": 15653 + }, + { + "epoch": 0.0930987724807308, + "grad_norm": 1.6689143180847168, + "learning_rate": 4.8938440262083495e-05, + "loss": 5.1866, + "step": 15654 + }, + { + "epoch": 0.0931047197640118, + "grad_norm": 1.505698323249817, + "learning_rate": 4.8938305589480734e-05, + "loss": 5.1574, + "step": 15655 + }, + { + "epoch": 0.0931106670472928, + "grad_norm": 1.496547818183899, + "learning_rate": 4.8938170908521356e-05, + "loss": 5.1175, + "step": 15656 + }, + { + "epoch": 0.09311661433057379, + "grad_norm": 1.5257115364074707, + "learning_rate": 4.893803621920541e-05, + "loss": 5.1796, + "step": 15657 + }, + { + "epoch": 0.09312256161385479, + "grad_norm": 1.5880948305130005, + "learning_rate": 4.893790152153294e-05, + "loss": 5.1864, + "step": 15658 + }, + { + "epoch": 0.0931285088971358, + "grad_norm": 1.632869839668274, + "learning_rate": 4.8937766815503994e-05, + "loss": 5.1126, + "step": 15659 + }, + { + "epoch": 0.09313445618041678, + "grad_norm": 1.5902632474899292, + "learning_rate": 4.893763210111862e-05, + "loss": 5.0661, + "step": 15660 + }, + { + "epoch": 0.09314040346369779, + "grad_norm": 1.2780532836914062, + "learning_rate": 4.893749737837687e-05, + "loss": 5.2189, + "step": 15661 + }, + { + "epoch": 0.09314635074697877, + "grad_norm": 1.604551076889038, + "learning_rate": 4.8937362647278786e-05, + "loss": 5.4624, + "step": 15662 + }, + { + "epoch": 0.09315229803025978, + "grad_norm": 1.3654263019561768, + "learning_rate": 4.8937227907824424e-05, + "loss": 5.3875, + "step": 15663 + }, + { + "epoch": 0.09315824531354078, + "grad_norm": 1.3098255395889282, + "learning_rate": 4.893709316001381e-05, + "loss": 5.2158, + "step": 15664 + }, + { + "epoch": 0.09316419259682177, + "grad_norm": 1.4036632776260376, + "learning_rate": 4.893695840384701e-05, + "loss": 5.3808, + "step": 15665 + }, + { + "epoch": 0.09317013988010277, + "grad_norm": 1.772504210472107, + "learning_rate": 4.893682363932407e-05, + "loss": 5.4599, + "step": 15666 + }, + { + "epoch": 0.09317608716338377, + "grad_norm": 1.8509577512741089, + "learning_rate": 4.893668886644503e-05, + "loss": 5.223, + "step": 15667 + }, + { + "epoch": 0.09318203444666476, + "grad_norm": 1.7572264671325684, + "learning_rate": 4.893655408520993e-05, + "loss": 5.3276, + "step": 15668 + }, + { + "epoch": 0.09318798172994576, + "grad_norm": 1.7149637937545776, + "learning_rate": 4.8936419295618835e-05, + "loss": 5.3093, + "step": 15669 + }, + { + "epoch": 0.09319392901322676, + "grad_norm": 1.441741943359375, + "learning_rate": 4.893628449767178e-05, + "loss": 5.2237, + "step": 15670 + }, + { + "epoch": 0.09319987629650775, + "grad_norm": 1.4929050207138062, + "learning_rate": 4.893614969136882e-05, + "loss": 5.22, + "step": 15671 + }, + { + "epoch": 0.09320582357978875, + "grad_norm": 1.251057505607605, + "learning_rate": 4.893601487670999e-05, + "loss": 5.2417, + "step": 15672 + }, + { + "epoch": 0.09321177086306975, + "grad_norm": 1.313826560974121, + "learning_rate": 4.893588005369535e-05, + "loss": 5.1841, + "step": 15673 + }, + { + "epoch": 0.09321771814635074, + "grad_norm": 1.1993061304092407, + "learning_rate": 4.8935745222324935e-05, + "loss": 5.1649, + "step": 15674 + }, + { + "epoch": 0.09322366542963174, + "grad_norm": 1.4086672067642212, + "learning_rate": 4.8935610382598806e-05, + "loss": 5.1463, + "step": 15675 + }, + { + "epoch": 0.09322961271291275, + "grad_norm": 1.3089197874069214, + "learning_rate": 4.893547553451701e-05, + "loss": 5.1505, + "step": 15676 + }, + { + "epoch": 0.09323555999619373, + "grad_norm": 1.3332446813583374, + "learning_rate": 4.893534067807957e-05, + "loss": 5.1267, + "step": 15677 + }, + { + "epoch": 0.09324150727947474, + "grad_norm": 1.433020830154419, + "learning_rate": 4.893520581328656e-05, + "loss": 5.1689, + "step": 15678 + }, + { + "epoch": 0.09324745456275574, + "grad_norm": 1.4111361503601074, + "learning_rate": 4.893507094013801e-05, + "loss": 5.1288, + "step": 15679 + }, + { + "epoch": 0.09325340184603673, + "grad_norm": 1.551698923110962, + "learning_rate": 4.893493605863398e-05, + "loss": 5.0919, + "step": 15680 + }, + { + "epoch": 0.09325934912931773, + "grad_norm": 1.5479143857955933, + "learning_rate": 4.893480116877451e-05, + "loss": 4.9749, + "step": 15681 + }, + { + "epoch": 0.09326529641259873, + "grad_norm": 1.3716951608657837, + "learning_rate": 4.893466627055964e-05, + "loss": 5.2221, + "step": 15682 + }, + { + "epoch": 0.09327124369587972, + "grad_norm": 1.409462571144104, + "learning_rate": 4.893453136398943e-05, + "loss": 5.2131, + "step": 15683 + }, + { + "epoch": 0.09327719097916072, + "grad_norm": 1.3185720443725586, + "learning_rate": 4.8934396449063935e-05, + "loss": 5.094, + "step": 15684 + }, + { + "epoch": 0.09328313826244172, + "grad_norm": 1.5027118921279907, + "learning_rate": 4.8934261525783176e-05, + "loss": 5.0889, + "step": 15685 + }, + { + "epoch": 0.09328908554572271, + "grad_norm": 2.147268772125244, + "learning_rate": 4.8934126594147216e-05, + "loss": 4.9404, + "step": 15686 + }, + { + "epoch": 0.09329503282900371, + "grad_norm": 1.3361799716949463, + "learning_rate": 4.8933991654156096e-05, + "loss": 5.0744, + "step": 15687 + }, + { + "epoch": 0.09330098011228471, + "grad_norm": 1.6436421871185303, + "learning_rate": 4.893385670580988e-05, + "loss": 5.0633, + "step": 15688 + }, + { + "epoch": 0.0933069273955657, + "grad_norm": 1.5499234199523926, + "learning_rate": 4.8933721749108586e-05, + "loss": 4.8445, + "step": 15689 + }, + { + "epoch": 0.0933128746788467, + "grad_norm": 1.363355278968811, + "learning_rate": 4.893358678405229e-05, + "loss": 5.1135, + "step": 15690 + }, + { + "epoch": 0.0933188219621277, + "grad_norm": 1.4172797203063965, + "learning_rate": 4.893345181064102e-05, + "loss": 5.056, + "step": 15691 + }, + { + "epoch": 0.0933247692454087, + "grad_norm": 1.546329140663147, + "learning_rate": 4.893331682887483e-05, + "loss": 4.9756, + "step": 15692 + }, + { + "epoch": 0.0933307165286897, + "grad_norm": 1.5151170492172241, + "learning_rate": 4.893318183875376e-05, + "loss": 4.991, + "step": 15693 + }, + { + "epoch": 0.09333666381197069, + "grad_norm": 1.1936514377593994, + "learning_rate": 4.893304684027787e-05, + "loss": 5.0454, + "step": 15694 + }, + { + "epoch": 0.09334261109525169, + "grad_norm": 1.4055380821228027, + "learning_rate": 4.893291183344721e-05, + "loss": 5.0673, + "step": 15695 + }, + { + "epoch": 0.09334855837853269, + "grad_norm": 1.4087036848068237, + "learning_rate": 4.89327768182618e-05, + "loss": 4.9748, + "step": 15696 + }, + { + "epoch": 0.09335450566181368, + "grad_norm": 1.251237392425537, + "learning_rate": 4.893264179472171e-05, + "loss": 5.158, + "step": 15697 + }, + { + "epoch": 0.09336045294509468, + "grad_norm": 1.3806357383728027, + "learning_rate": 4.893250676282699e-05, + "loss": 5.2027, + "step": 15698 + }, + { + "epoch": 0.09336640022837568, + "grad_norm": 1.3959203958511353, + "learning_rate": 4.893237172257767e-05, + "loss": 5.1854, + "step": 15699 + }, + { + "epoch": 0.09337234751165667, + "grad_norm": 1.4886810779571533, + "learning_rate": 4.893223667397381e-05, + "loss": 5.2363, + "step": 15700 + }, + { + "epoch": 0.09337829479493767, + "grad_norm": 1.2987968921661377, + "learning_rate": 4.893210161701546e-05, + "loss": 5.2931, + "step": 15701 + }, + { + "epoch": 0.09338424207821867, + "grad_norm": 1.2594645023345947, + "learning_rate": 4.8931966551702644e-05, + "loss": 5.1346, + "step": 15702 + }, + { + "epoch": 0.09339018936149966, + "grad_norm": 1.5101357698440552, + "learning_rate": 4.893183147803544e-05, + "loss": 5.0369, + "step": 15703 + }, + { + "epoch": 0.09339613664478066, + "grad_norm": 1.4388933181762695, + "learning_rate": 4.8931696396013876e-05, + "loss": 5.0427, + "step": 15704 + }, + { + "epoch": 0.09340208392806167, + "grad_norm": 1.2890875339508057, + "learning_rate": 4.8931561305638006e-05, + "loss": 5.1602, + "step": 15705 + }, + { + "epoch": 0.09340803121134265, + "grad_norm": 1.3310670852661133, + "learning_rate": 4.893142620690787e-05, + "loss": 5.4886, + "step": 15706 + }, + { + "epoch": 0.09341397849462366, + "grad_norm": 1.0935169458389282, + "learning_rate": 4.893129109982353e-05, + "loss": 5.4634, + "step": 15707 + }, + { + "epoch": 0.09341992577790466, + "grad_norm": 1.4718440771102905, + "learning_rate": 4.893115598438501e-05, + "loss": 5.4917, + "step": 15708 + }, + { + "epoch": 0.09342587306118565, + "grad_norm": 1.4053934812545776, + "learning_rate": 4.8931020860592384e-05, + "loss": 5.1588, + "step": 15709 + }, + { + "epoch": 0.09343182034446665, + "grad_norm": 1.3130263090133667, + "learning_rate": 4.893088572844568e-05, + "loss": 5.0464, + "step": 15710 + }, + { + "epoch": 0.09343776762774765, + "grad_norm": 1.3342580795288086, + "learning_rate": 4.8930750587944955e-05, + "loss": 5.1464, + "step": 15711 + }, + { + "epoch": 0.09344371491102864, + "grad_norm": 1.3214285373687744, + "learning_rate": 4.893061543909024e-05, + "loss": 5.0867, + "step": 15712 + }, + { + "epoch": 0.09344966219430964, + "grad_norm": 1.2091466188430786, + "learning_rate": 4.893048028188161e-05, + "loss": 5.1403, + "step": 15713 + }, + { + "epoch": 0.09345560947759064, + "grad_norm": 1.421499490737915, + "learning_rate": 4.893034511631909e-05, + "loss": 5.1853, + "step": 15714 + }, + { + "epoch": 0.09346155676087163, + "grad_norm": 1.2093148231506348, + "learning_rate": 4.893020994240273e-05, + "loss": 5.0892, + "step": 15715 + }, + { + "epoch": 0.09346750404415263, + "grad_norm": 1.361080288887024, + "learning_rate": 4.893007476013258e-05, + "loss": 5.0855, + "step": 15716 + }, + { + "epoch": 0.09347345132743363, + "grad_norm": 1.31247079372406, + "learning_rate": 4.89299395695087e-05, + "loss": 5.1667, + "step": 15717 + }, + { + "epoch": 0.09347939861071462, + "grad_norm": 1.4052191972732544, + "learning_rate": 4.892980437053112e-05, + "loss": 4.9256, + "step": 15718 + }, + { + "epoch": 0.09348534589399562, + "grad_norm": 1.409225344657898, + "learning_rate": 4.8929669163199886e-05, + "loss": 4.7722, + "step": 15719 + }, + { + "epoch": 0.09349129317727661, + "grad_norm": 1.54015052318573, + "learning_rate": 4.892953394751505e-05, + "loss": 4.9331, + "step": 15720 + }, + { + "epoch": 0.09349724046055762, + "grad_norm": 1.313596487045288, + "learning_rate": 4.892939872347667e-05, + "loss": 5.0221, + "step": 15721 + }, + { + "epoch": 0.09350318774383862, + "grad_norm": 1.5266852378845215, + "learning_rate": 4.8929263491084785e-05, + "loss": 5.0261, + "step": 15722 + }, + { + "epoch": 0.0935091350271196, + "grad_norm": 1.409408450126648, + "learning_rate": 4.892912825033944e-05, + "loss": 5.1319, + "step": 15723 + }, + { + "epoch": 0.09351508231040061, + "grad_norm": 1.444326639175415, + "learning_rate": 4.892899300124067e-05, + "loss": 5.0043, + "step": 15724 + }, + { + "epoch": 0.09352102959368161, + "grad_norm": 1.6662111282348633, + "learning_rate": 4.8928857743788556e-05, + "loss": 5.22, + "step": 15725 + }, + { + "epoch": 0.0935269768769626, + "grad_norm": 1.5927739143371582, + "learning_rate": 4.8928722477983116e-05, + "loss": 5.1532, + "step": 15726 + }, + { + "epoch": 0.0935329241602436, + "grad_norm": 1.5560848712921143, + "learning_rate": 4.892858720382441e-05, + "loss": 4.8893, + "step": 15727 + }, + { + "epoch": 0.0935388714435246, + "grad_norm": 1.450135588645935, + "learning_rate": 4.892845192131247e-05, + "loss": 4.8116, + "step": 15728 + }, + { + "epoch": 0.09354481872680559, + "grad_norm": 1.3629002571105957, + "learning_rate": 4.892831663044736e-05, + "loss": 4.9439, + "step": 15729 + }, + { + "epoch": 0.09355076601008659, + "grad_norm": 1.5293892621994019, + "learning_rate": 4.892818133122913e-05, + "loss": 5.1726, + "step": 15730 + }, + { + "epoch": 0.0935567132933676, + "grad_norm": 1.193088412284851, + "learning_rate": 4.892804602365781e-05, + "loss": 5.3199, + "step": 15731 + }, + { + "epoch": 0.09356266057664858, + "grad_norm": 1.5575615167617798, + "learning_rate": 4.8927910707733456e-05, + "loss": 5.3426, + "step": 15732 + }, + { + "epoch": 0.09356860785992958, + "grad_norm": 1.4177138805389404, + "learning_rate": 4.892777538345612e-05, + "loss": 5.4028, + "step": 15733 + }, + { + "epoch": 0.09357455514321059, + "grad_norm": 1.4139392375946045, + "learning_rate": 4.892764005082584e-05, + "loss": 5.3854, + "step": 15734 + }, + { + "epoch": 0.09358050242649157, + "grad_norm": 1.5129605531692505, + "learning_rate": 4.892750470984267e-05, + "loss": 5.3614, + "step": 15735 + }, + { + "epoch": 0.09358644970977258, + "grad_norm": 1.23565673828125, + "learning_rate": 4.8927369360506665e-05, + "loss": 5.2379, + "step": 15736 + }, + { + "epoch": 0.09359239699305358, + "grad_norm": 1.4861465692520142, + "learning_rate": 4.892723400281785e-05, + "loss": 5.0968, + "step": 15737 + }, + { + "epoch": 0.09359834427633457, + "grad_norm": 1.4061464071273804, + "learning_rate": 4.892709863677629e-05, + "loss": 5.2947, + "step": 15738 + }, + { + "epoch": 0.09360429155961557, + "grad_norm": 1.2175462245941162, + "learning_rate": 4.892696326238203e-05, + "loss": 5.2828, + "step": 15739 + }, + { + "epoch": 0.09361023884289657, + "grad_norm": 1.398414969444275, + "learning_rate": 4.8926827879635104e-05, + "loss": 5.3281, + "step": 15740 + }, + { + "epoch": 0.09361618612617756, + "grad_norm": 1.438428282737732, + "learning_rate": 4.892669248853558e-05, + "loss": 5.2483, + "step": 15741 + }, + { + "epoch": 0.09362213340945856, + "grad_norm": 1.6579184532165527, + "learning_rate": 4.8926557089083494e-05, + "loss": 5.1275, + "step": 15742 + }, + { + "epoch": 0.09362808069273956, + "grad_norm": 1.2637989521026611, + "learning_rate": 4.892642168127889e-05, + "loss": 5.2276, + "step": 15743 + }, + { + "epoch": 0.09363402797602055, + "grad_norm": 1.383898377418518, + "learning_rate": 4.892628626512182e-05, + "loss": 5.3406, + "step": 15744 + }, + { + "epoch": 0.09363997525930155, + "grad_norm": 1.3794132471084595, + "learning_rate": 4.8926150840612325e-05, + "loss": 5.2309, + "step": 15745 + }, + { + "epoch": 0.09364592254258255, + "grad_norm": 1.3234885931015015, + "learning_rate": 4.8926015407750466e-05, + "loss": 5.3171, + "step": 15746 + }, + { + "epoch": 0.09365186982586354, + "grad_norm": 1.4807502031326294, + "learning_rate": 4.892587996653629e-05, + "loss": 5.3362, + "step": 15747 + }, + { + "epoch": 0.09365781710914454, + "grad_norm": 2.380307912826538, + "learning_rate": 4.892574451696982e-05, + "loss": 5.3103, + "step": 15748 + }, + { + "epoch": 0.09366376439242553, + "grad_norm": 1.5202600955963135, + "learning_rate": 4.892560905905113e-05, + "loss": 5.2225, + "step": 15749 + }, + { + "epoch": 0.09366971167570654, + "grad_norm": 1.34883451461792, + "learning_rate": 4.892547359278025e-05, + "loss": 5.1794, + "step": 15750 + }, + { + "epoch": 0.09367565895898754, + "grad_norm": 1.7073168754577637, + "learning_rate": 4.8925338118157235e-05, + "loss": 5.101, + "step": 15751 + }, + { + "epoch": 0.09368160624226853, + "grad_norm": 1.2718127965927124, + "learning_rate": 4.892520263518214e-05, + "loss": 5.3492, + "step": 15752 + }, + { + "epoch": 0.09368755352554953, + "grad_norm": 1.2247645854949951, + "learning_rate": 4.8925067143854993e-05, + "loss": 5.0841, + "step": 15753 + }, + { + "epoch": 0.09369350080883053, + "grad_norm": 1.4443535804748535, + "learning_rate": 4.892493164417586e-05, + "loss": 5.2866, + "step": 15754 + }, + { + "epoch": 0.09369944809211152, + "grad_norm": 1.2206883430480957, + "learning_rate": 4.8924796136144776e-05, + "loss": 5.116, + "step": 15755 + }, + { + "epoch": 0.09370539537539252, + "grad_norm": 1.4597479104995728, + "learning_rate": 4.89246606197618e-05, + "loss": 5.1501, + "step": 15756 + }, + { + "epoch": 0.09371134265867352, + "grad_norm": 1.4129786491394043, + "learning_rate": 4.892452509502697e-05, + "loss": 5.2618, + "step": 15757 + }, + { + "epoch": 0.09371728994195451, + "grad_norm": 1.382739543914795, + "learning_rate": 4.892438956194033e-05, + "loss": 5.2191, + "step": 15758 + }, + { + "epoch": 0.09372323722523551, + "grad_norm": 1.3665072917938232, + "learning_rate": 4.8924254020501934e-05, + "loss": 4.9739, + "step": 15759 + }, + { + "epoch": 0.09372918450851651, + "grad_norm": 1.3109017610549927, + "learning_rate": 4.892411847071183e-05, + "loss": 5.0648, + "step": 15760 + }, + { + "epoch": 0.0937351317917975, + "grad_norm": 1.5278202295303345, + "learning_rate": 4.892398291257007e-05, + "loss": 5.0215, + "step": 15761 + }, + { + "epoch": 0.0937410790750785, + "grad_norm": 1.4676958322525024, + "learning_rate": 4.8923847346076686e-05, + "loss": 5.442, + "step": 15762 + }, + { + "epoch": 0.0937470263583595, + "grad_norm": 1.4718897342681885, + "learning_rate": 4.892371177123174e-05, + "loss": 5.1484, + "step": 15763 + }, + { + "epoch": 0.0937529736416405, + "grad_norm": 1.2358952760696411, + "learning_rate": 4.8923576188035264e-05, + "loss": 5.3594, + "step": 15764 + }, + { + "epoch": 0.0937589209249215, + "grad_norm": 1.59844172000885, + "learning_rate": 4.8923440596487326e-05, + "loss": 5.221, + "step": 15765 + }, + { + "epoch": 0.0937648682082025, + "grad_norm": 1.4293478727340698, + "learning_rate": 4.892330499658795e-05, + "loss": 5.2211, + "step": 15766 + }, + { + "epoch": 0.09377081549148349, + "grad_norm": 1.167673110961914, + "learning_rate": 4.8923169388337204e-05, + "loss": 5.1274, + "step": 15767 + }, + { + "epoch": 0.09377676277476449, + "grad_norm": 1.4637590646743774, + "learning_rate": 4.892303377173512e-05, + "loss": 5.0781, + "step": 15768 + }, + { + "epoch": 0.09378271005804549, + "grad_norm": 1.383498191833496, + "learning_rate": 4.892289814678176e-05, + "loss": 5.003, + "step": 15769 + }, + { + "epoch": 0.09378865734132648, + "grad_norm": 1.5803290605545044, + "learning_rate": 4.892276251347716e-05, + "loss": 4.9609, + "step": 15770 + }, + { + "epoch": 0.09379460462460748, + "grad_norm": 1.5272483825683594, + "learning_rate": 4.892262687182137e-05, + "loss": 5.074, + "step": 15771 + }, + { + "epoch": 0.09380055190788848, + "grad_norm": 1.377105951309204, + "learning_rate": 4.8922491221814436e-05, + "loss": 5.011, + "step": 15772 + }, + { + "epoch": 0.09380649919116947, + "grad_norm": 1.2150218486785889, + "learning_rate": 4.8922355563456414e-05, + "loss": 5.172, + "step": 15773 + }, + { + "epoch": 0.09381244647445047, + "grad_norm": 1.379515290260315, + "learning_rate": 4.892221989674734e-05, + "loss": 5.229, + "step": 15774 + }, + { + "epoch": 0.09381839375773147, + "grad_norm": 1.5256911516189575, + "learning_rate": 4.892208422168727e-05, + "loss": 5.0163, + "step": 15775 + }, + { + "epoch": 0.09382434104101246, + "grad_norm": 1.645808458328247, + "learning_rate": 4.892194853827624e-05, + "loss": 5.1382, + "step": 15776 + }, + { + "epoch": 0.09383028832429346, + "grad_norm": 1.7437238693237305, + "learning_rate": 4.8921812846514315e-05, + "loss": 4.8078, + "step": 15777 + }, + { + "epoch": 0.09383623560757447, + "grad_norm": 1.384291410446167, + "learning_rate": 4.892167714640152e-05, + "loss": 5.1645, + "step": 15778 + }, + { + "epoch": 0.09384218289085546, + "grad_norm": 1.6412228345870972, + "learning_rate": 4.892154143793792e-05, + "loss": 5.0472, + "step": 15779 + }, + { + "epoch": 0.09384813017413646, + "grad_norm": 1.5364267826080322, + "learning_rate": 4.8921405721123555e-05, + "loss": 5.1357, + "step": 15780 + }, + { + "epoch": 0.09385407745741745, + "grad_norm": 1.4579834938049316, + "learning_rate": 4.892126999595849e-05, + "loss": 5.2047, + "step": 15781 + }, + { + "epoch": 0.09386002474069845, + "grad_norm": 1.4087393283843994, + "learning_rate": 4.8921134262442745e-05, + "loss": 5.3224, + "step": 15782 + }, + { + "epoch": 0.09386597202397945, + "grad_norm": 1.4741411209106445, + "learning_rate": 4.8920998520576376e-05, + "loss": 4.9882, + "step": 15783 + }, + { + "epoch": 0.09387191930726044, + "grad_norm": 1.488578200340271, + "learning_rate": 4.8920862770359434e-05, + "loss": 4.8698, + "step": 15784 + }, + { + "epoch": 0.09387786659054144, + "grad_norm": 1.4695780277252197, + "learning_rate": 4.892072701179197e-05, + "loss": 4.6841, + "step": 15785 + }, + { + "epoch": 0.09388381387382244, + "grad_norm": 1.2468496561050415, + "learning_rate": 4.892059124487402e-05, + "loss": 5.0962, + "step": 15786 + }, + { + "epoch": 0.09388976115710343, + "grad_norm": 1.1099787950515747, + "learning_rate": 4.8920455469605654e-05, + "loss": 5.0883, + "step": 15787 + }, + { + "epoch": 0.09389570844038443, + "grad_norm": 1.3954483270645142, + "learning_rate": 4.892031968598689e-05, + "loss": 4.9554, + "step": 15788 + }, + { + "epoch": 0.09390165572366543, + "grad_norm": 1.3176839351654053, + "learning_rate": 4.892018389401779e-05, + "loss": 5.1638, + "step": 15789 + }, + { + "epoch": 0.09390760300694642, + "grad_norm": 1.2406723499298096, + "learning_rate": 4.892004809369841e-05, + "loss": 5.0569, + "step": 15790 + }, + { + "epoch": 0.09391355029022742, + "grad_norm": 1.395556926727295, + "learning_rate": 4.891991228502878e-05, + "loss": 4.9179, + "step": 15791 + }, + { + "epoch": 0.09391949757350843, + "grad_norm": 1.3977546691894531, + "learning_rate": 4.891977646800896e-05, + "loss": 5.0045, + "step": 15792 + }, + { + "epoch": 0.09392544485678941, + "grad_norm": 1.5089846849441528, + "learning_rate": 4.891964064263899e-05, + "loss": 5.176, + "step": 15793 + }, + { + "epoch": 0.09393139214007042, + "grad_norm": 1.260077953338623, + "learning_rate": 4.891950480891893e-05, + "loss": 5.3789, + "step": 15794 + }, + { + "epoch": 0.09393733942335142, + "grad_norm": 1.3587939739227295, + "learning_rate": 4.891936896684881e-05, + "loss": 5.308, + "step": 15795 + }, + { + "epoch": 0.0939432867066324, + "grad_norm": 1.4004688262939453, + "learning_rate": 4.8919233116428684e-05, + "loss": 5.5232, + "step": 15796 + }, + { + "epoch": 0.09394923398991341, + "grad_norm": 1.3308182954788208, + "learning_rate": 4.89190972576586e-05, + "loss": 5.3944, + "step": 15797 + }, + { + "epoch": 0.09395518127319441, + "grad_norm": 1.3078187704086304, + "learning_rate": 4.891896139053861e-05, + "loss": 5.3146, + "step": 15798 + }, + { + "epoch": 0.0939611285564754, + "grad_norm": 1.3268121480941772, + "learning_rate": 4.891882551506875e-05, + "loss": 5.2966, + "step": 15799 + }, + { + "epoch": 0.0939670758397564, + "grad_norm": 1.424813985824585, + "learning_rate": 4.8918689631249095e-05, + "loss": 5.132, + "step": 15800 + }, + { + "epoch": 0.0939730231230374, + "grad_norm": 1.2917978763580322, + "learning_rate": 4.8918553739079656e-05, + "loss": 5.1889, + "step": 15801 + }, + { + "epoch": 0.09397897040631839, + "grad_norm": 1.377146601676941, + "learning_rate": 4.8918417838560506e-05, + "loss": 5.2749, + "step": 15802 + }, + { + "epoch": 0.09398491768959939, + "grad_norm": 1.2476272583007812, + "learning_rate": 4.891828192969167e-05, + "loss": 5.1367, + "step": 15803 + }, + { + "epoch": 0.0939908649728804, + "grad_norm": 1.423923373222351, + "learning_rate": 4.891814601247322e-05, + "loss": 5.1657, + "step": 15804 + }, + { + "epoch": 0.09399681225616138, + "grad_norm": 1.2762609720230103, + "learning_rate": 4.891801008690518e-05, + "loss": 5.2245, + "step": 15805 + }, + { + "epoch": 0.09400275953944238, + "grad_norm": 1.3098403215408325, + "learning_rate": 4.891787415298763e-05, + "loss": 5.1452, + "step": 15806 + }, + { + "epoch": 0.09400870682272339, + "grad_norm": 1.2892425060272217, + "learning_rate": 4.8917738210720586e-05, + "loss": 5.268, + "step": 15807 + }, + { + "epoch": 0.09401465410600438, + "grad_norm": 1.4667305946350098, + "learning_rate": 4.8917602260104105e-05, + "loss": 5.1666, + "step": 15808 + }, + { + "epoch": 0.09402060138928538, + "grad_norm": 1.289933204650879, + "learning_rate": 4.891746630113824e-05, + "loss": 5.1772, + "step": 15809 + }, + { + "epoch": 0.09402654867256637, + "grad_norm": 2.3923516273498535, + "learning_rate": 4.891733033382303e-05, + "loss": 5.0732, + "step": 15810 + }, + { + "epoch": 0.09403249595584737, + "grad_norm": 1.223607063293457, + "learning_rate": 4.8917194358158534e-05, + "loss": 5.1025, + "step": 15811 + }, + { + "epoch": 0.09403844323912837, + "grad_norm": 1.5959491729736328, + "learning_rate": 4.8917058374144785e-05, + "loss": 5.3244, + "step": 15812 + }, + { + "epoch": 0.09404439052240936, + "grad_norm": 1.2359555959701538, + "learning_rate": 4.8916922381781845e-05, + "loss": 4.8643, + "step": 15813 + }, + { + "epoch": 0.09405033780569036, + "grad_norm": 1.3971196413040161, + "learning_rate": 4.891678638106974e-05, + "loss": 5.0362, + "step": 15814 + }, + { + "epoch": 0.09405628508897136, + "grad_norm": 1.3501266241073608, + "learning_rate": 4.891665037200855e-05, + "loss": 4.8705, + "step": 15815 + }, + { + "epoch": 0.09406223237225235, + "grad_norm": 1.3506006002426147, + "learning_rate": 4.89165143545983e-05, + "loss": 4.9122, + "step": 15816 + }, + { + "epoch": 0.09406817965553335, + "grad_norm": 1.4444037675857544, + "learning_rate": 4.891637832883904e-05, + "loss": 4.8428, + "step": 15817 + }, + { + "epoch": 0.09407412693881435, + "grad_norm": 1.4757333993911743, + "learning_rate": 4.891624229473082e-05, + "loss": 5.1774, + "step": 15818 + }, + { + "epoch": 0.09408007422209534, + "grad_norm": 1.3660651445388794, + "learning_rate": 4.891610625227369e-05, + "loss": 5.2998, + "step": 15819 + }, + { + "epoch": 0.09408602150537634, + "grad_norm": 1.625279426574707, + "learning_rate": 4.891597020146769e-05, + "loss": 5.1365, + "step": 15820 + }, + { + "epoch": 0.09409196878865735, + "grad_norm": 1.5202007293701172, + "learning_rate": 4.891583414231287e-05, + "loss": 5.287, + "step": 15821 + }, + { + "epoch": 0.09409791607193833, + "grad_norm": 1.5217576026916504, + "learning_rate": 4.891569807480928e-05, + "loss": 5.3599, + "step": 15822 + }, + { + "epoch": 0.09410386335521934, + "grad_norm": 1.5446710586547852, + "learning_rate": 4.891556199895696e-05, + "loss": 5.1332, + "step": 15823 + }, + { + "epoch": 0.09410981063850034, + "grad_norm": 1.2877990007400513, + "learning_rate": 4.8915425914755973e-05, + "loss": 5.0756, + "step": 15824 + }, + { + "epoch": 0.09411575792178133, + "grad_norm": 1.3024258613586426, + "learning_rate": 4.891528982220636e-05, + "loss": 5.3293, + "step": 15825 + }, + { + "epoch": 0.09412170520506233, + "grad_norm": 1.3039882183074951, + "learning_rate": 4.8915153721308166e-05, + "loss": 5.1406, + "step": 15826 + }, + { + "epoch": 0.09412765248834333, + "grad_norm": 1.2524348497390747, + "learning_rate": 4.8915017612061435e-05, + "loss": 5.3044, + "step": 15827 + }, + { + "epoch": 0.09413359977162432, + "grad_norm": 1.2522565126419067, + "learning_rate": 4.8914881494466226e-05, + "loss": 5.1776, + "step": 15828 + }, + { + "epoch": 0.09413954705490532, + "grad_norm": 1.3882638216018677, + "learning_rate": 4.8914745368522566e-05, + "loss": 5.2296, + "step": 15829 + }, + { + "epoch": 0.09414549433818632, + "grad_norm": 1.5169535875320435, + "learning_rate": 4.891460923423052e-05, + "loss": 5.2058, + "step": 15830 + }, + { + "epoch": 0.09415144162146731, + "grad_norm": 1.2045719623565674, + "learning_rate": 4.891447309159014e-05, + "loss": 5.256, + "step": 15831 + }, + { + "epoch": 0.09415738890474831, + "grad_norm": 1.4639356136322021, + "learning_rate": 4.891433694060146e-05, + "loss": 5.1781, + "step": 15832 + }, + { + "epoch": 0.09416333618802931, + "grad_norm": 1.498923420906067, + "learning_rate": 4.891420078126453e-05, + "loss": 5.1777, + "step": 15833 + }, + { + "epoch": 0.0941692834713103, + "grad_norm": 1.163977861404419, + "learning_rate": 4.89140646135794e-05, + "loss": 4.9302, + "step": 15834 + }, + { + "epoch": 0.0941752307545913, + "grad_norm": 1.502808690071106, + "learning_rate": 4.8913928437546113e-05, + "loss": 5.1053, + "step": 15835 + }, + { + "epoch": 0.0941811780378723, + "grad_norm": 1.401517391204834, + "learning_rate": 4.891379225316473e-05, + "loss": 5.3156, + "step": 15836 + }, + { + "epoch": 0.0941871253211533, + "grad_norm": 1.328116774559021, + "learning_rate": 4.891365606043528e-05, + "loss": 5.2333, + "step": 15837 + }, + { + "epoch": 0.0941930726044343, + "grad_norm": 1.160243272781372, + "learning_rate": 4.891351985935782e-05, + "loss": 5.2575, + "step": 15838 + }, + { + "epoch": 0.09419901988771529, + "grad_norm": 1.1748963594436646, + "learning_rate": 4.8913383649932404e-05, + "loss": 5.0673, + "step": 15839 + }, + { + "epoch": 0.09420496717099629, + "grad_norm": 1.2916535139083862, + "learning_rate": 4.891324743215907e-05, + "loss": 5.135, + "step": 15840 + }, + { + "epoch": 0.09421091445427729, + "grad_norm": 1.302393913269043, + "learning_rate": 4.8913111206037865e-05, + "loss": 4.9814, + "step": 15841 + }, + { + "epoch": 0.09421686173755828, + "grad_norm": 1.273445963859558, + "learning_rate": 4.891297497156885e-05, + "loss": 4.9163, + "step": 15842 + }, + { + "epoch": 0.09422280902083928, + "grad_norm": 1.444884181022644, + "learning_rate": 4.8912838728752055e-05, + "loss": 4.9316, + "step": 15843 + }, + { + "epoch": 0.09422875630412028, + "grad_norm": 1.411985993385315, + "learning_rate": 4.891270247758753e-05, + "loss": 4.9222, + "step": 15844 + }, + { + "epoch": 0.09423470358740127, + "grad_norm": 1.3697528839111328, + "learning_rate": 4.891256621807533e-05, + "loss": 4.8398, + "step": 15845 + }, + { + "epoch": 0.09424065087068227, + "grad_norm": 1.385298728942871, + "learning_rate": 4.891242995021551e-05, + "loss": 4.8869, + "step": 15846 + }, + { + "epoch": 0.09424659815396327, + "grad_norm": 1.821768879890442, + "learning_rate": 4.8912293674008094e-05, + "loss": 5.178, + "step": 15847 + }, + { + "epoch": 0.09425254543724426, + "grad_norm": 1.8198026418685913, + "learning_rate": 4.891215738945315e-05, + "loss": 5.2892, + "step": 15848 + }, + { + "epoch": 0.09425849272052526, + "grad_norm": 1.4373536109924316, + "learning_rate": 4.891202109655072e-05, + "loss": 5.1203, + "step": 15849 + }, + { + "epoch": 0.09426444000380627, + "grad_norm": 1.2086896896362305, + "learning_rate": 4.8911884795300855e-05, + "loss": 4.8603, + "step": 15850 + }, + { + "epoch": 0.09427038728708725, + "grad_norm": 1.3166700601577759, + "learning_rate": 4.891174848570359e-05, + "loss": 4.917, + "step": 15851 + }, + { + "epoch": 0.09427633457036826, + "grad_norm": 1.5753637552261353, + "learning_rate": 4.891161216775898e-05, + "loss": 5.0197, + "step": 15852 + }, + { + "epoch": 0.09428228185364926, + "grad_norm": 1.5428698062896729, + "learning_rate": 4.891147584146708e-05, + "loss": 5.2048, + "step": 15853 + }, + { + "epoch": 0.09428822913693025, + "grad_norm": 1.3760755062103271, + "learning_rate": 4.8911339506827924e-05, + "loss": 5.2568, + "step": 15854 + }, + { + "epoch": 0.09429417642021125, + "grad_norm": 1.6683621406555176, + "learning_rate": 4.891120316384157e-05, + "loss": 4.8976, + "step": 15855 + }, + { + "epoch": 0.09430012370349225, + "grad_norm": 1.4224987030029297, + "learning_rate": 4.891106681250807e-05, + "loss": 4.9538, + "step": 15856 + }, + { + "epoch": 0.09430607098677324, + "grad_norm": 1.2851178646087646, + "learning_rate": 4.8910930452827454e-05, + "loss": 4.8972, + "step": 15857 + }, + { + "epoch": 0.09431201827005424, + "grad_norm": 1.6412112712860107, + "learning_rate": 4.891079408479978e-05, + "loss": 5.124, + "step": 15858 + }, + { + "epoch": 0.09431796555333524, + "grad_norm": 1.380089282989502, + "learning_rate": 4.891065770842509e-05, + "loss": 5.1155, + "step": 15859 + }, + { + "epoch": 0.09432391283661623, + "grad_norm": 1.3117294311523438, + "learning_rate": 4.891052132370344e-05, + "loss": 5.1968, + "step": 15860 + }, + { + "epoch": 0.09432986011989723, + "grad_norm": 1.5171841382980347, + "learning_rate": 4.891038493063488e-05, + "loss": 5.1029, + "step": 15861 + }, + { + "epoch": 0.09433580740317823, + "grad_norm": 1.4801427125930786, + "learning_rate": 4.8910248529219446e-05, + "loss": 5.1533, + "step": 15862 + }, + { + "epoch": 0.09434175468645922, + "grad_norm": 1.672522783279419, + "learning_rate": 4.8910112119457196e-05, + "loss": 5.3259, + "step": 15863 + }, + { + "epoch": 0.09434770196974022, + "grad_norm": 1.5151952505111694, + "learning_rate": 4.890997570134816e-05, + "loss": 5.2654, + "step": 15864 + }, + { + "epoch": 0.09435364925302123, + "grad_norm": 1.4178684949874878, + "learning_rate": 4.890983927489242e-05, + "loss": 5.2369, + "step": 15865 + }, + { + "epoch": 0.09435959653630221, + "grad_norm": 1.3673019409179688, + "learning_rate": 4.890970284008999e-05, + "loss": 5.2176, + "step": 15866 + }, + { + "epoch": 0.09436554381958322, + "grad_norm": 1.4063305854797363, + "learning_rate": 4.8909566396940934e-05, + "loss": 5.1189, + "step": 15867 + }, + { + "epoch": 0.0943714911028642, + "grad_norm": 1.277815818786621, + "learning_rate": 4.890942994544528e-05, + "loss": 5.2204, + "step": 15868 + }, + { + "epoch": 0.09437743838614521, + "grad_norm": 1.5394912958145142, + "learning_rate": 4.890929348560311e-05, + "loss": 5.1147, + "step": 15869 + }, + { + "epoch": 0.09438338566942621, + "grad_norm": 1.4091798067092896, + "learning_rate": 4.890915701741444e-05, + "loss": 5.1367, + "step": 15870 + }, + { + "epoch": 0.0943893329527072, + "grad_norm": 1.367828369140625, + "learning_rate": 4.8909020540879336e-05, + "loss": 5.1871, + "step": 15871 + }, + { + "epoch": 0.0943952802359882, + "grad_norm": 2.2413175106048584, + "learning_rate": 4.890888405599784e-05, + "loss": 5.0571, + "step": 15872 + }, + { + "epoch": 0.0944012275192692, + "grad_norm": 1.392906904220581, + "learning_rate": 4.8908747562769995e-05, + "loss": 4.9885, + "step": 15873 + }, + { + "epoch": 0.09440717480255019, + "grad_norm": 1.4517099857330322, + "learning_rate": 4.8908611061195865e-05, + "loss": 5.1596, + "step": 15874 + }, + { + "epoch": 0.09441312208583119, + "grad_norm": 1.663919448852539, + "learning_rate": 4.890847455127547e-05, + "loss": 5.0029, + "step": 15875 + }, + { + "epoch": 0.0944190693691122, + "grad_norm": 1.5252666473388672, + "learning_rate": 4.8908338033008885e-05, + "loss": 4.9596, + "step": 15876 + }, + { + "epoch": 0.09442501665239318, + "grad_norm": 1.613261103630066, + "learning_rate": 4.8908201506396143e-05, + "loss": 4.91, + "step": 15877 + }, + { + "epoch": 0.09443096393567418, + "grad_norm": 1.5182253122329712, + "learning_rate": 4.8908064971437295e-05, + "loss": 5.0564, + "step": 15878 + }, + { + "epoch": 0.09443691121895519, + "grad_norm": 1.4765241146087646, + "learning_rate": 4.8907928428132386e-05, + "loss": 5.0863, + "step": 15879 + }, + { + "epoch": 0.09444285850223617, + "grad_norm": 1.6401035785675049, + "learning_rate": 4.890779187648147e-05, + "loss": 4.9876, + "step": 15880 + }, + { + "epoch": 0.09444880578551718, + "grad_norm": 1.4818077087402344, + "learning_rate": 4.8907655316484594e-05, + "loss": 4.9361, + "step": 15881 + }, + { + "epoch": 0.09445475306879818, + "grad_norm": 1.4490398168563843, + "learning_rate": 4.89075187481418e-05, + "loss": 4.8991, + "step": 15882 + }, + { + "epoch": 0.09446070035207917, + "grad_norm": 1.2799785137176514, + "learning_rate": 4.890738217145313e-05, + "loss": 5.0147, + "step": 15883 + }, + { + "epoch": 0.09446664763536017, + "grad_norm": 1.416590929031372, + "learning_rate": 4.890724558641865e-05, + "loss": 5.0255, + "step": 15884 + }, + { + "epoch": 0.09447259491864117, + "grad_norm": 1.4365648031234741, + "learning_rate": 4.8907108993038395e-05, + "loss": 5.0262, + "step": 15885 + }, + { + "epoch": 0.09447854220192216, + "grad_norm": 1.367490530014038, + "learning_rate": 4.890697239131241e-05, + "loss": 4.9478, + "step": 15886 + }, + { + "epoch": 0.09448448948520316, + "grad_norm": 1.3645575046539307, + "learning_rate": 4.8906835781240754e-05, + "loss": 5.0751, + "step": 15887 + }, + { + "epoch": 0.09449043676848416, + "grad_norm": 1.4014960527420044, + "learning_rate": 4.8906699162823464e-05, + "loss": 4.9789, + "step": 15888 + }, + { + "epoch": 0.09449638405176515, + "grad_norm": 1.2261216640472412, + "learning_rate": 4.8906562536060596e-05, + "loss": 4.9619, + "step": 15889 + }, + { + "epoch": 0.09450233133504615, + "grad_norm": 1.3241546154022217, + "learning_rate": 4.890642590095219e-05, + "loss": 4.9947, + "step": 15890 + }, + { + "epoch": 0.09450827861832715, + "grad_norm": 1.337372899055481, + "learning_rate": 4.89062892574983e-05, + "loss": 4.9817, + "step": 15891 + }, + { + "epoch": 0.09451422590160814, + "grad_norm": 1.47610604763031, + "learning_rate": 4.8906152605698974e-05, + "loss": 4.9467, + "step": 15892 + }, + { + "epoch": 0.09452017318488914, + "grad_norm": 1.3533576726913452, + "learning_rate": 4.890601594555425e-05, + "loss": 4.9819, + "step": 15893 + }, + { + "epoch": 0.09452612046817015, + "grad_norm": 1.4445271492004395, + "learning_rate": 4.890587927706419e-05, + "loss": 4.9566, + "step": 15894 + }, + { + "epoch": 0.09453206775145113, + "grad_norm": 1.4600121974945068, + "learning_rate": 4.8905742600228834e-05, + "loss": 4.9341, + "step": 15895 + }, + { + "epoch": 0.09453801503473214, + "grad_norm": 1.2824327945709229, + "learning_rate": 4.8905605915048224e-05, + "loss": 5.0945, + "step": 15896 + }, + { + "epoch": 0.09454396231801313, + "grad_norm": 1.4806164503097534, + "learning_rate": 4.890546922152242e-05, + "loss": 5.1312, + "step": 15897 + }, + { + "epoch": 0.09454990960129413, + "grad_norm": 1.3514155149459839, + "learning_rate": 4.890533251965146e-05, + "loss": 4.9596, + "step": 15898 + }, + { + "epoch": 0.09455585688457513, + "grad_norm": 1.332749843597412, + "learning_rate": 4.89051958094354e-05, + "loss": 5.0649, + "step": 15899 + }, + { + "epoch": 0.09456180416785612, + "grad_norm": 1.310562014579773, + "learning_rate": 4.8905059090874284e-05, + "loss": 5.0977, + "step": 15900 + }, + { + "epoch": 0.09456775145113712, + "grad_norm": 1.342310905456543, + "learning_rate": 4.8904922363968153e-05, + "loss": 5.115, + "step": 15901 + }, + { + "epoch": 0.09457369873441812, + "grad_norm": 1.4810988903045654, + "learning_rate": 4.890478562871706e-05, + "loss": 5.1305, + "step": 15902 + }, + { + "epoch": 0.09457964601769911, + "grad_norm": 1.3064900636672974, + "learning_rate": 4.890464888512106e-05, + "loss": 5.1387, + "step": 15903 + }, + { + "epoch": 0.09458559330098011, + "grad_norm": 1.4571950435638428, + "learning_rate": 4.890451213318019e-05, + "loss": 5.1235, + "step": 15904 + }, + { + "epoch": 0.09459154058426111, + "grad_norm": 1.3964077234268188, + "learning_rate": 4.89043753728945e-05, + "loss": 5.0854, + "step": 15905 + }, + { + "epoch": 0.0945974878675421, + "grad_norm": 1.4404022693634033, + "learning_rate": 4.8904238604264044e-05, + "loss": 5.0991, + "step": 15906 + }, + { + "epoch": 0.0946034351508231, + "grad_norm": 1.3269283771514893, + "learning_rate": 4.890410182728886e-05, + "loss": 4.9299, + "step": 15907 + }, + { + "epoch": 0.0946093824341041, + "grad_norm": 1.4588782787322998, + "learning_rate": 4.8903965041969e-05, + "loss": 5.0992, + "step": 15908 + }, + { + "epoch": 0.0946153297173851, + "grad_norm": 1.2911858558654785, + "learning_rate": 4.8903828248304525e-05, + "loss": 5.0639, + "step": 15909 + }, + { + "epoch": 0.0946212770006661, + "grad_norm": 1.336695909500122, + "learning_rate": 4.8903691446295466e-05, + "loss": 5.1479, + "step": 15910 + }, + { + "epoch": 0.0946272242839471, + "grad_norm": 1.3052904605865479, + "learning_rate": 4.890355463594186e-05, + "loss": 5.049, + "step": 15911 + }, + { + "epoch": 0.09463317156722809, + "grad_norm": 1.3744491338729858, + "learning_rate": 4.890341781724379e-05, + "loss": 5.0709, + "step": 15912 + }, + { + "epoch": 0.09463911885050909, + "grad_norm": 1.5727102756500244, + "learning_rate": 4.890328099020127e-05, + "loss": 4.9857, + "step": 15913 + }, + { + "epoch": 0.09464506613379009, + "grad_norm": 1.5804322957992554, + "learning_rate": 4.890314415481437e-05, + "loss": 5.133, + "step": 15914 + }, + { + "epoch": 0.09465101341707108, + "grad_norm": 1.228421926498413, + "learning_rate": 4.8903007311083124e-05, + "loss": 4.9561, + "step": 15915 + }, + { + "epoch": 0.09465696070035208, + "grad_norm": 1.4680207967758179, + "learning_rate": 4.890287045900759e-05, + "loss": 5.0502, + "step": 15916 + }, + { + "epoch": 0.09466290798363308, + "grad_norm": 1.3447710275650024, + "learning_rate": 4.89027335985878e-05, + "loss": 5.1255, + "step": 15917 + }, + { + "epoch": 0.09466885526691407, + "grad_norm": 1.3510375022888184, + "learning_rate": 4.8902596729823825e-05, + "loss": 5.0936, + "step": 15918 + }, + { + "epoch": 0.09467480255019507, + "grad_norm": 1.3805617094039917, + "learning_rate": 4.89024598527157e-05, + "loss": 5.1146, + "step": 15919 + }, + { + "epoch": 0.09468074983347607, + "grad_norm": 1.568036437034607, + "learning_rate": 4.890232296726347e-05, + "loss": 5.0032, + "step": 15920 + }, + { + "epoch": 0.09468669711675706, + "grad_norm": 1.6060000658035278, + "learning_rate": 4.890218607346718e-05, + "loss": 5.017, + "step": 15921 + }, + { + "epoch": 0.09469264440003806, + "grad_norm": 1.498241901397705, + "learning_rate": 4.890204917132689e-05, + "loss": 5.1265, + "step": 15922 + }, + { + "epoch": 0.09469859168331907, + "grad_norm": 1.418135643005371, + "learning_rate": 4.8901912260842644e-05, + "loss": 5.1458, + "step": 15923 + }, + { + "epoch": 0.09470453896660005, + "grad_norm": 1.3306639194488525, + "learning_rate": 4.890177534201448e-05, + "loss": 5.1672, + "step": 15924 + }, + { + "epoch": 0.09471048624988106, + "grad_norm": 1.542938470840454, + "learning_rate": 4.890163841484246e-05, + "loss": 5.1511, + "step": 15925 + }, + { + "epoch": 0.09471643353316204, + "grad_norm": 1.3050166368484497, + "learning_rate": 4.890150147932662e-05, + "loss": 5.2615, + "step": 15926 + }, + { + "epoch": 0.09472238081644305, + "grad_norm": 1.3447345495224, + "learning_rate": 4.890136453546702e-05, + "loss": 5.2957, + "step": 15927 + }, + { + "epoch": 0.09472832809972405, + "grad_norm": 1.3270481824874878, + "learning_rate": 4.8901227583263695e-05, + "loss": 5.2751, + "step": 15928 + }, + { + "epoch": 0.09473427538300504, + "grad_norm": 1.3909003734588623, + "learning_rate": 4.890109062271669e-05, + "loss": 5.1162, + "step": 15929 + }, + { + "epoch": 0.09474022266628604, + "grad_norm": 1.4668915271759033, + "learning_rate": 4.890095365382608e-05, + "loss": 5.0313, + "step": 15930 + }, + { + "epoch": 0.09474616994956704, + "grad_norm": 1.2651780843734741, + "learning_rate": 4.890081667659188e-05, + "loss": 5.0576, + "step": 15931 + }, + { + "epoch": 0.09475211723284803, + "grad_norm": 1.5086911916732788, + "learning_rate": 4.8900679691014154e-05, + "loss": 4.9508, + "step": 15932 + }, + { + "epoch": 0.09475806451612903, + "grad_norm": 1.2698594331741333, + "learning_rate": 4.8900542697092956e-05, + "loss": 5.0183, + "step": 15933 + }, + { + "epoch": 0.09476401179941003, + "grad_norm": 2.691392183303833, + "learning_rate": 4.8900405694828313e-05, + "loss": 5.0997, + "step": 15934 + }, + { + "epoch": 0.09476995908269102, + "grad_norm": 1.3395452499389648, + "learning_rate": 4.8900268684220295e-05, + "loss": 5.2219, + "step": 15935 + }, + { + "epoch": 0.09477590636597202, + "grad_norm": 1.3485181331634521, + "learning_rate": 4.8900131665268934e-05, + "loss": 4.9594, + "step": 15936 + }, + { + "epoch": 0.09478185364925303, + "grad_norm": 1.2990431785583496, + "learning_rate": 4.889999463797429e-05, + "loss": 4.9492, + "step": 15937 + }, + { + "epoch": 0.09478780093253401, + "grad_norm": 1.2848893404006958, + "learning_rate": 4.8899857602336396e-05, + "loss": 4.9819, + "step": 15938 + }, + { + "epoch": 0.09479374821581502, + "grad_norm": 1.4666554927825928, + "learning_rate": 4.889972055835531e-05, + "loss": 4.9672, + "step": 15939 + }, + { + "epoch": 0.09479969549909602, + "grad_norm": 1.3356142044067383, + "learning_rate": 4.8899583506031085e-05, + "loss": 5.029, + "step": 15940 + }, + { + "epoch": 0.094805642782377, + "grad_norm": 1.561786413192749, + "learning_rate": 4.8899446445363765e-05, + "loss": 4.9071, + "step": 15941 + }, + { + "epoch": 0.09481159006565801, + "grad_norm": 1.4906450510025024, + "learning_rate": 4.889930937635339e-05, + "loss": 5.0832, + "step": 15942 + }, + { + "epoch": 0.09481753734893901, + "grad_norm": 1.5042341947555542, + "learning_rate": 4.889917229900001e-05, + "loss": 5.1069, + "step": 15943 + }, + { + "epoch": 0.09482348463222, + "grad_norm": 1.6562377214431763, + "learning_rate": 4.889903521330368e-05, + "loss": 5.0532, + "step": 15944 + }, + { + "epoch": 0.094829431915501, + "grad_norm": 1.1881135702133179, + "learning_rate": 4.889889811926445e-05, + "loss": 5.1159, + "step": 15945 + }, + { + "epoch": 0.094835379198782, + "grad_norm": 1.3550158739089966, + "learning_rate": 4.889876101688234e-05, + "loss": 5.0754, + "step": 15946 + }, + { + "epoch": 0.09484132648206299, + "grad_norm": 1.403874158859253, + "learning_rate": 4.8898623906157435e-05, + "loss": 5.405, + "step": 15947 + }, + { + "epoch": 0.09484727376534399, + "grad_norm": 1.4460557699203491, + "learning_rate": 4.889848678708977e-05, + "loss": 5.041, + "step": 15948 + }, + { + "epoch": 0.094853221048625, + "grad_norm": 1.4151064157485962, + "learning_rate": 4.889834965967939e-05, + "loss": 5.368, + "step": 15949 + }, + { + "epoch": 0.09485916833190598, + "grad_norm": 1.3388437032699585, + "learning_rate": 4.889821252392633e-05, + "loss": 5.2905, + "step": 15950 + }, + { + "epoch": 0.09486511561518698, + "grad_norm": 1.1941900253295898, + "learning_rate": 4.8898075379830665e-05, + "loss": 5.1499, + "step": 15951 + }, + { + "epoch": 0.09487106289846799, + "grad_norm": 1.4840821027755737, + "learning_rate": 4.889793822739243e-05, + "loss": 5.0461, + "step": 15952 + }, + { + "epoch": 0.09487701018174897, + "grad_norm": 1.4021552801132202, + "learning_rate": 4.889780106661166e-05, + "loss": 4.89, + "step": 15953 + }, + { + "epoch": 0.09488295746502998, + "grad_norm": 1.4893288612365723, + "learning_rate": 4.889766389748842e-05, + "loss": 4.9719, + "step": 15954 + }, + { + "epoch": 0.09488890474831096, + "grad_norm": 1.4530198574066162, + "learning_rate": 4.889752672002275e-05, + "loss": 5.3931, + "step": 15955 + }, + { + "epoch": 0.09489485203159197, + "grad_norm": 1.468037724494934, + "learning_rate": 4.88973895342147e-05, + "loss": 5.271, + "step": 15956 + }, + { + "epoch": 0.09490079931487297, + "grad_norm": 1.3074537515640259, + "learning_rate": 4.889725234006433e-05, + "loss": 5.202, + "step": 15957 + }, + { + "epoch": 0.09490674659815396, + "grad_norm": 1.3678735494613647, + "learning_rate": 4.889711513757166e-05, + "loss": 5.0821, + "step": 15958 + }, + { + "epoch": 0.09491269388143496, + "grad_norm": 1.3922240734100342, + "learning_rate": 4.889697792673676e-05, + "loss": 4.8938, + "step": 15959 + }, + { + "epoch": 0.09491864116471596, + "grad_norm": 1.3895872831344604, + "learning_rate": 4.8896840707559674e-05, + "loss": 4.8293, + "step": 15960 + }, + { + "epoch": 0.09492458844799695, + "grad_norm": 1.223599910736084, + "learning_rate": 4.889670348004045e-05, + "loss": 4.8528, + "step": 15961 + }, + { + "epoch": 0.09493053573127795, + "grad_norm": 1.4488904476165771, + "learning_rate": 4.889656624417913e-05, + "loss": 5.0107, + "step": 15962 + }, + { + "epoch": 0.09493648301455895, + "grad_norm": 1.5250918865203857, + "learning_rate": 4.889642899997576e-05, + "loss": 4.9114, + "step": 15963 + }, + { + "epoch": 0.09494243029783994, + "grad_norm": 1.4656517505645752, + "learning_rate": 4.88962917474304e-05, + "loss": 5.2163, + "step": 15964 + }, + { + "epoch": 0.09494837758112094, + "grad_norm": 1.316635251045227, + "learning_rate": 4.889615448654309e-05, + "loss": 5.1904, + "step": 15965 + }, + { + "epoch": 0.09495432486440195, + "grad_norm": 1.5920292139053345, + "learning_rate": 4.8896017217313886e-05, + "loss": 5.0858, + "step": 15966 + }, + { + "epoch": 0.09496027214768293, + "grad_norm": 1.5263009071350098, + "learning_rate": 4.889587993974282e-05, + "loss": 5.0594, + "step": 15967 + }, + { + "epoch": 0.09496621943096394, + "grad_norm": 1.4230486154556274, + "learning_rate": 4.889574265382996e-05, + "loss": 5.0712, + "step": 15968 + }, + { + "epoch": 0.09497216671424494, + "grad_norm": 1.9315528869628906, + "learning_rate": 4.889560535957533e-05, + "loss": 4.8489, + "step": 15969 + }, + { + "epoch": 0.09497811399752593, + "grad_norm": 1.3432739973068237, + "learning_rate": 4.8895468056979e-05, + "loss": 4.9722, + "step": 15970 + }, + { + "epoch": 0.09498406128080693, + "grad_norm": 1.191886067390442, + "learning_rate": 4.8895330746041e-05, + "loss": 4.9384, + "step": 15971 + }, + { + "epoch": 0.09499000856408793, + "grad_norm": 1.4204323291778564, + "learning_rate": 4.8895193426761396e-05, + "loss": 5.1063, + "step": 15972 + }, + { + "epoch": 0.09499595584736892, + "grad_norm": 1.319189429283142, + "learning_rate": 4.8895056099140224e-05, + "loss": 5.0643, + "step": 15973 + }, + { + "epoch": 0.09500190313064992, + "grad_norm": 1.2905625104904175, + "learning_rate": 4.8894918763177533e-05, + "loss": 5.0806, + "step": 15974 + }, + { + "epoch": 0.09500785041393092, + "grad_norm": 1.6914581060409546, + "learning_rate": 4.889478141887338e-05, + "loss": 4.9209, + "step": 15975 + }, + { + "epoch": 0.09501379769721191, + "grad_norm": 1.390061378479004, + "learning_rate": 4.8894644066227797e-05, + "loss": 5.1376, + "step": 15976 + }, + { + "epoch": 0.09501974498049291, + "grad_norm": 1.2711600065231323, + "learning_rate": 4.889450670524084e-05, + "loss": 5.2344, + "step": 15977 + }, + { + "epoch": 0.09502569226377391, + "grad_norm": 1.472398042678833, + "learning_rate": 4.889436933591256e-05, + "loss": 5.0605, + "step": 15978 + }, + { + "epoch": 0.0950316395470549, + "grad_norm": 1.483567714691162, + "learning_rate": 4.889423195824301e-05, + "loss": 4.9827, + "step": 15979 + }, + { + "epoch": 0.0950375868303359, + "grad_norm": 1.706921935081482, + "learning_rate": 4.889409457223222e-05, + "loss": 5.0692, + "step": 15980 + }, + { + "epoch": 0.0950435341136169, + "grad_norm": 1.7719398736953735, + "learning_rate": 4.889395717788026e-05, + "loss": 5.0985, + "step": 15981 + }, + { + "epoch": 0.0950494813968979, + "grad_norm": 1.6768114566802979, + "learning_rate": 4.889381977518715e-05, + "loss": 4.8838, + "step": 15982 + }, + { + "epoch": 0.0950554286801789, + "grad_norm": 1.5722233057022095, + "learning_rate": 4.889368236415296e-05, + "loss": 4.824, + "step": 15983 + }, + { + "epoch": 0.09506137596345988, + "grad_norm": 1.5722928047180176, + "learning_rate": 4.889354494477773e-05, + "loss": 5.3027, + "step": 15984 + }, + { + "epoch": 0.09506732324674089, + "grad_norm": 2.0003905296325684, + "learning_rate": 4.8893407517061526e-05, + "loss": 5.2216, + "step": 15985 + }, + { + "epoch": 0.09507327053002189, + "grad_norm": 1.390168309211731, + "learning_rate": 4.889327008100437e-05, + "loss": 5.358, + "step": 15986 + }, + { + "epoch": 0.09507921781330288, + "grad_norm": 1.545292854309082, + "learning_rate": 4.889313263660632e-05, + "loss": 5.5124, + "step": 15987 + }, + { + "epoch": 0.09508516509658388, + "grad_norm": 1.4416158199310303, + "learning_rate": 4.889299518386742e-05, + "loss": 5.0929, + "step": 15988 + }, + { + "epoch": 0.09509111237986488, + "grad_norm": 1.8936892747879028, + "learning_rate": 4.889285772278773e-05, + "loss": 4.9407, + "step": 15989 + }, + { + "epoch": 0.09509705966314587, + "grad_norm": 1.4762251377105713, + "learning_rate": 4.889272025336729e-05, + "loss": 5.05, + "step": 15990 + }, + { + "epoch": 0.09510300694642687, + "grad_norm": 1.4513001441955566, + "learning_rate": 4.8892582775606146e-05, + "loss": 5.2386, + "step": 15991 + }, + { + "epoch": 0.09510895422970787, + "grad_norm": 1.8999260663986206, + "learning_rate": 4.8892445289504345e-05, + "loss": 5.1524, + "step": 15992 + }, + { + "epoch": 0.09511490151298886, + "grad_norm": 1.5721614360809326, + "learning_rate": 4.8892307795061945e-05, + "loss": 5.2276, + "step": 15993 + }, + { + "epoch": 0.09512084879626986, + "grad_norm": 1.754425287246704, + "learning_rate": 4.889217029227898e-05, + "loss": 5.118, + "step": 15994 + }, + { + "epoch": 0.09512679607955087, + "grad_norm": 1.6336870193481445, + "learning_rate": 4.889203278115551e-05, + "loss": 5.2065, + "step": 15995 + }, + { + "epoch": 0.09513274336283185, + "grad_norm": 2.721186876296997, + "learning_rate": 4.889189526169157e-05, + "loss": 5.3698, + "step": 15996 + }, + { + "epoch": 0.09513869064611286, + "grad_norm": 1.3870679140090942, + "learning_rate": 4.889175773388722e-05, + "loss": 5.294, + "step": 15997 + }, + { + "epoch": 0.09514463792939386, + "grad_norm": 1.4010889530181885, + "learning_rate": 4.889162019774252e-05, + "loss": 5.2313, + "step": 15998 + }, + { + "epoch": 0.09515058521267485, + "grad_norm": 1.6322177648544312, + "learning_rate": 4.889148265325748e-05, + "loss": 5.2871, + "step": 15999 + }, + { + "epoch": 0.09515653249595585, + "grad_norm": 1.5373196601867676, + "learning_rate": 4.889134510043218e-05, + "loss": 5.4748, + "step": 16000 + }, + { + "epoch": 0.09516247977923685, + "grad_norm": 1.572461724281311, + "learning_rate": 4.889120753926666e-05, + "loss": 5.3634, + "step": 16001 + }, + { + "epoch": 0.09516842706251784, + "grad_norm": 1.3587132692337036, + "learning_rate": 4.889106996976096e-05, + "loss": 5.1399, + "step": 16002 + }, + { + "epoch": 0.09517437434579884, + "grad_norm": 1.1270248889923096, + "learning_rate": 4.889093239191514e-05, + "loss": 5.1845, + "step": 16003 + }, + { + "epoch": 0.09518032162907984, + "grad_norm": 1.5456722974777222, + "learning_rate": 4.889079480572924e-05, + "loss": 5.4895, + "step": 16004 + }, + { + "epoch": 0.09518626891236083, + "grad_norm": 1.2772669792175293, + "learning_rate": 4.8890657211203307e-05, + "loss": 5.5415, + "step": 16005 + }, + { + "epoch": 0.09519221619564183, + "grad_norm": 1.5249123573303223, + "learning_rate": 4.88905196083374e-05, + "loss": 5.2731, + "step": 16006 + }, + { + "epoch": 0.09519816347892283, + "grad_norm": 1.137450098991394, + "learning_rate": 4.889038199713155e-05, + "loss": 5.2232, + "step": 16007 + }, + { + "epoch": 0.09520411076220382, + "grad_norm": 1.4076485633850098, + "learning_rate": 4.889024437758582e-05, + "loss": 5.3428, + "step": 16008 + }, + { + "epoch": 0.09521005804548482, + "grad_norm": 1.3883590698242188, + "learning_rate": 4.889010674970026e-05, + "loss": 5.328, + "step": 16009 + }, + { + "epoch": 0.09521600532876583, + "grad_norm": 1.4320605993270874, + "learning_rate": 4.88899691134749e-05, + "loss": 5.1469, + "step": 16010 + }, + { + "epoch": 0.09522195261204681, + "grad_norm": 1.5601880550384521, + "learning_rate": 4.8889831468909795e-05, + "loss": 5.1063, + "step": 16011 + }, + { + "epoch": 0.09522789989532782, + "grad_norm": 1.4243980646133423, + "learning_rate": 4.8889693816005014e-05, + "loss": 5.067, + "step": 16012 + }, + { + "epoch": 0.0952338471786088, + "grad_norm": 1.3901020288467407, + "learning_rate": 4.8889556154760577e-05, + "loss": 4.9954, + "step": 16013 + }, + { + "epoch": 0.0952397944618898, + "grad_norm": 1.2067557573318481, + "learning_rate": 4.8889418485176544e-05, + "loss": 5.5485, + "step": 16014 + }, + { + "epoch": 0.09524574174517081, + "grad_norm": 1.6004818677902222, + "learning_rate": 4.888928080725296e-05, + "loss": 5.0334, + "step": 16015 + }, + { + "epoch": 0.0952516890284518, + "grad_norm": 1.42451810836792, + "learning_rate": 4.8889143120989864e-05, + "loss": 4.9913, + "step": 16016 + }, + { + "epoch": 0.0952576363117328, + "grad_norm": 1.528438925743103, + "learning_rate": 4.888900542638734e-05, + "loss": 4.9749, + "step": 16017 + }, + { + "epoch": 0.0952635835950138, + "grad_norm": 1.2179231643676758, + "learning_rate": 4.888886772344539e-05, + "loss": 5.0631, + "step": 16018 + }, + { + "epoch": 0.09526953087829479, + "grad_norm": 1.5069763660430908, + "learning_rate": 4.8888730012164085e-05, + "loss": 5.0739, + "step": 16019 + }, + { + "epoch": 0.09527547816157579, + "grad_norm": 1.3587465286254883, + "learning_rate": 4.888859229254348e-05, + "loss": 5.0924, + "step": 16020 + }, + { + "epoch": 0.0952814254448568, + "grad_norm": 1.412811517715454, + "learning_rate": 4.888845456458361e-05, + "loss": 5.0228, + "step": 16021 + }, + { + "epoch": 0.09528737272813778, + "grad_norm": 1.5316507816314697, + "learning_rate": 4.888831682828453e-05, + "loss": 4.9514, + "step": 16022 + }, + { + "epoch": 0.09529332001141878, + "grad_norm": 1.4402068853378296, + "learning_rate": 4.888817908364628e-05, + "loss": 4.9404, + "step": 16023 + }, + { + "epoch": 0.09529926729469979, + "grad_norm": 1.353027582168579, + "learning_rate": 4.888804133066892e-05, + "loss": 5.0359, + "step": 16024 + }, + { + "epoch": 0.09530521457798077, + "grad_norm": 1.4211509227752686, + "learning_rate": 4.8887903569352486e-05, + "loss": 5.2472, + "step": 16025 + }, + { + "epoch": 0.09531116186126178, + "grad_norm": 1.3640077114105225, + "learning_rate": 4.888776579969704e-05, + "loss": 5.4126, + "step": 16026 + }, + { + "epoch": 0.09531710914454278, + "grad_norm": 1.5627541542053223, + "learning_rate": 4.8887628021702616e-05, + "loss": 5.1019, + "step": 16027 + }, + { + "epoch": 0.09532305642782377, + "grad_norm": 1.788611650466919, + "learning_rate": 4.888749023536927e-05, + "loss": 4.9395, + "step": 16028 + }, + { + "epoch": 0.09532900371110477, + "grad_norm": 1.3194786310195923, + "learning_rate": 4.8887352440697044e-05, + "loss": 4.9888, + "step": 16029 + }, + { + "epoch": 0.09533495099438577, + "grad_norm": 1.3091423511505127, + "learning_rate": 4.888721463768598e-05, + "loss": 5.1328, + "step": 16030 + }, + { + "epoch": 0.09534089827766676, + "grad_norm": 1.2864805459976196, + "learning_rate": 4.8887076826336154e-05, + "loss": 5.2569, + "step": 16031 + }, + { + "epoch": 0.09534684556094776, + "grad_norm": 1.3800050020217896, + "learning_rate": 4.888693900664759e-05, + "loss": 5.0698, + "step": 16032 + }, + { + "epoch": 0.09535279284422876, + "grad_norm": 1.2338416576385498, + "learning_rate": 4.8886801178620347e-05, + "loss": 5.227, + "step": 16033 + }, + { + "epoch": 0.09535874012750975, + "grad_norm": 1.4023356437683105, + "learning_rate": 4.888666334225446e-05, + "loss": 5.2976, + "step": 16034 + }, + { + "epoch": 0.09536468741079075, + "grad_norm": 1.4695215225219727, + "learning_rate": 4.8886525497549994e-05, + "loss": 5.1062, + "step": 16035 + }, + { + "epoch": 0.09537063469407175, + "grad_norm": 1.3647410869598389, + "learning_rate": 4.888638764450698e-05, + "loss": 5.2613, + "step": 16036 + }, + { + "epoch": 0.09537658197735274, + "grad_norm": 1.3059413433074951, + "learning_rate": 4.8886249783125484e-05, + "loss": 5.1593, + "step": 16037 + }, + { + "epoch": 0.09538252926063374, + "grad_norm": 1.3861093521118164, + "learning_rate": 4.8886111913405544e-05, + "loss": 4.9149, + "step": 16038 + }, + { + "epoch": 0.09538847654391475, + "grad_norm": 1.4214578866958618, + "learning_rate": 4.88859740353472e-05, + "loss": 5.0443, + "step": 16039 + }, + { + "epoch": 0.09539442382719573, + "grad_norm": 1.3835242986679077, + "learning_rate": 4.888583614895052e-05, + "loss": 4.9516, + "step": 16040 + }, + { + "epoch": 0.09540037111047674, + "grad_norm": 1.47120201587677, + "learning_rate": 4.8885698254215526e-05, + "loss": 4.9673, + "step": 16041 + }, + { + "epoch": 0.09540631839375772, + "grad_norm": 1.4861125946044922, + "learning_rate": 4.8885560351142295e-05, + "loss": 4.8283, + "step": 16042 + }, + { + "epoch": 0.09541226567703873, + "grad_norm": 1.2469282150268555, + "learning_rate": 4.888542243973086e-05, + "loss": 5.164, + "step": 16043 + }, + { + "epoch": 0.09541821296031973, + "grad_norm": 1.2372372150421143, + "learning_rate": 4.888528451998127e-05, + "loss": 5.2986, + "step": 16044 + }, + { + "epoch": 0.09542416024360072, + "grad_norm": 1.370978593826294, + "learning_rate": 4.888514659189357e-05, + "loss": 5.2353, + "step": 16045 + }, + { + "epoch": 0.09543010752688172, + "grad_norm": 1.4328222274780273, + "learning_rate": 4.888500865546781e-05, + "loss": 5.3482, + "step": 16046 + }, + { + "epoch": 0.09543605481016272, + "grad_norm": 1.2651796340942383, + "learning_rate": 4.888487071070405e-05, + "loss": 5.3276, + "step": 16047 + }, + { + "epoch": 0.09544200209344371, + "grad_norm": 1.34639310836792, + "learning_rate": 4.8884732757602325e-05, + "loss": 5.108, + "step": 16048 + }, + { + "epoch": 0.09544794937672471, + "grad_norm": 1.2254658937454224, + "learning_rate": 4.888459479616269e-05, + "loss": 5.1569, + "step": 16049 + }, + { + "epoch": 0.09545389666000571, + "grad_norm": 1.2902439832687378, + "learning_rate": 4.888445682638518e-05, + "loss": 5.2215, + "step": 16050 + }, + { + "epoch": 0.0954598439432867, + "grad_norm": 1.572160243988037, + "learning_rate": 4.888431884826986e-05, + "loss": 5.1288, + "step": 16051 + }, + { + "epoch": 0.0954657912265677, + "grad_norm": 1.266427993774414, + "learning_rate": 4.888418086181676e-05, + "loss": 5.231, + "step": 16052 + }, + { + "epoch": 0.0954717385098487, + "grad_norm": 1.2186620235443115, + "learning_rate": 4.888404286702595e-05, + "loss": 5.113, + "step": 16053 + }, + { + "epoch": 0.0954776857931297, + "grad_norm": 1.386727213859558, + "learning_rate": 4.888390486389747e-05, + "loss": 5.0559, + "step": 16054 + }, + { + "epoch": 0.0954836330764107, + "grad_norm": 1.3253827095031738, + "learning_rate": 4.8883766852431354e-05, + "loss": 5.2569, + "step": 16055 + }, + { + "epoch": 0.0954895803596917, + "grad_norm": 1.219800591468811, + "learning_rate": 4.888362883262767e-05, + "loss": 5.0805, + "step": 16056 + }, + { + "epoch": 0.09549552764297269, + "grad_norm": 1.2425061464309692, + "learning_rate": 4.888349080448646e-05, + "loss": 5.1447, + "step": 16057 + }, + { + "epoch": 0.09550147492625369, + "grad_norm": 2.619645833969116, + "learning_rate": 4.888335276800777e-05, + "loss": 5.2419, + "step": 16058 + }, + { + "epoch": 0.09550742220953469, + "grad_norm": 1.3087180852890015, + "learning_rate": 4.888321472319164e-05, + "loss": 5.1895, + "step": 16059 + }, + { + "epoch": 0.09551336949281568, + "grad_norm": 1.1865695714950562, + "learning_rate": 4.888307667003813e-05, + "loss": 5.1791, + "step": 16060 + }, + { + "epoch": 0.09551931677609668, + "grad_norm": 1.2647303342819214, + "learning_rate": 4.8882938608547294e-05, + "loss": 5.1928, + "step": 16061 + }, + { + "epoch": 0.09552526405937768, + "grad_norm": 1.2161632776260376, + "learning_rate": 4.888280053871916e-05, + "loss": 5.1431, + "step": 16062 + }, + { + "epoch": 0.09553121134265867, + "grad_norm": 1.3904309272766113, + "learning_rate": 4.8882662460553784e-05, + "loss": 5.0658, + "step": 16063 + }, + { + "epoch": 0.09553715862593967, + "grad_norm": 1.4302258491516113, + "learning_rate": 4.888252437405123e-05, + "loss": 5.1838, + "step": 16064 + }, + { + "epoch": 0.09554310590922067, + "grad_norm": 1.4313236474990845, + "learning_rate": 4.888238627921152e-05, + "loss": 5.2108, + "step": 16065 + }, + { + "epoch": 0.09554905319250166, + "grad_norm": 1.485170602798462, + "learning_rate": 4.8882248176034726e-05, + "loss": 5.179, + "step": 16066 + }, + { + "epoch": 0.09555500047578266, + "grad_norm": 1.3742952346801758, + "learning_rate": 4.888211006452088e-05, + "loss": 5.0416, + "step": 16067 + }, + { + "epoch": 0.09556094775906367, + "grad_norm": 1.2600523233413696, + "learning_rate": 4.888197194467005e-05, + "loss": 5.0891, + "step": 16068 + }, + { + "epoch": 0.09556689504234465, + "grad_norm": 1.2905696630477905, + "learning_rate": 4.888183381648225e-05, + "loss": 5.1004, + "step": 16069 + }, + { + "epoch": 0.09557284232562566, + "grad_norm": 1.2373219728469849, + "learning_rate": 4.8881695679957565e-05, + "loss": 5.1549, + "step": 16070 + }, + { + "epoch": 0.09557878960890664, + "grad_norm": 1.43118155002594, + "learning_rate": 4.8881557535096014e-05, + "loss": 5.067, + "step": 16071 + }, + { + "epoch": 0.09558473689218765, + "grad_norm": 1.201025366783142, + "learning_rate": 4.888141938189767e-05, + "loss": 5.1304, + "step": 16072 + }, + { + "epoch": 0.09559068417546865, + "grad_norm": 1.3497222661972046, + "learning_rate": 4.888128122036256e-05, + "loss": 5.0802, + "step": 16073 + }, + { + "epoch": 0.09559663145874964, + "grad_norm": 1.3429580926895142, + "learning_rate": 4.888114305049074e-05, + "loss": 5.1033, + "step": 16074 + }, + { + "epoch": 0.09560257874203064, + "grad_norm": 1.212725281715393, + "learning_rate": 4.888100487228227e-05, + "loss": 5.0627, + "step": 16075 + }, + { + "epoch": 0.09560852602531164, + "grad_norm": 1.258507490158081, + "learning_rate": 4.8880866685737174e-05, + "loss": 5.1215, + "step": 16076 + }, + { + "epoch": 0.09561447330859263, + "grad_norm": 1.4401910305023193, + "learning_rate": 4.888072849085552e-05, + "loss": 4.9619, + "step": 16077 + }, + { + "epoch": 0.09562042059187363, + "grad_norm": 1.240682601928711, + "learning_rate": 4.888059028763735e-05, + "loss": 4.8384, + "step": 16078 + }, + { + "epoch": 0.09562636787515463, + "grad_norm": 1.5701509714126587, + "learning_rate": 4.888045207608272e-05, + "loss": 5.0756, + "step": 16079 + }, + { + "epoch": 0.09563231515843562, + "grad_norm": 2.0408403873443604, + "learning_rate": 4.888031385619166e-05, + "loss": 5.1615, + "step": 16080 + }, + { + "epoch": 0.09563826244171662, + "grad_norm": 1.8134169578552246, + "learning_rate": 4.8880175627964245e-05, + "loss": 5.2383, + "step": 16081 + }, + { + "epoch": 0.09564420972499763, + "grad_norm": 1.4934067726135254, + "learning_rate": 4.888003739140049e-05, + "loss": 5.1512, + "step": 16082 + }, + { + "epoch": 0.09565015700827861, + "grad_norm": 1.6359374523162842, + "learning_rate": 4.887989914650047e-05, + "loss": 5.1245, + "step": 16083 + }, + { + "epoch": 0.09565610429155962, + "grad_norm": 1.5446397066116333, + "learning_rate": 4.887976089326422e-05, + "loss": 4.9806, + "step": 16084 + }, + { + "epoch": 0.09566205157484062, + "grad_norm": 1.845180869102478, + "learning_rate": 4.8879622631691794e-05, + "loss": 5.0474, + "step": 16085 + }, + { + "epoch": 0.0956679988581216, + "grad_norm": 1.8755276203155518, + "learning_rate": 4.887948436178324e-05, + "loss": 5.0674, + "step": 16086 + }, + { + "epoch": 0.09567394614140261, + "grad_norm": 1.5596239566802979, + "learning_rate": 4.88793460835386e-05, + "loss": 5.0699, + "step": 16087 + }, + { + "epoch": 0.09567989342468361, + "grad_norm": 1.6092095375061035, + "learning_rate": 4.8879207796957935e-05, + "loss": 5.1184, + "step": 16088 + }, + { + "epoch": 0.0956858407079646, + "grad_norm": 1.6217916011810303, + "learning_rate": 4.887906950204127e-05, + "loss": 4.9607, + "step": 16089 + }, + { + "epoch": 0.0956917879912456, + "grad_norm": 1.5006567239761353, + "learning_rate": 4.8878931198788694e-05, + "loss": 4.7948, + "step": 16090 + }, + { + "epoch": 0.0956977352745266, + "grad_norm": 1.397647738456726, + "learning_rate": 4.887879288720021e-05, + "loss": 5.1067, + "step": 16091 + }, + { + "epoch": 0.09570368255780759, + "grad_norm": 1.5627835988998413, + "learning_rate": 4.8878654567275886e-05, + "loss": 4.9138, + "step": 16092 + }, + { + "epoch": 0.09570962984108859, + "grad_norm": 1.4590591192245483, + "learning_rate": 4.8878516239015784e-05, + "loss": 4.9132, + "step": 16093 + }, + { + "epoch": 0.0957155771243696, + "grad_norm": 1.347569465637207, + "learning_rate": 4.887837790241992e-05, + "loss": 4.9732, + "step": 16094 + }, + { + "epoch": 0.09572152440765058, + "grad_norm": 1.547169804573059, + "learning_rate": 4.887823955748838e-05, + "loss": 5.1336, + "step": 16095 + }, + { + "epoch": 0.09572747169093158, + "grad_norm": 1.3920515775680542, + "learning_rate": 4.887810120422118e-05, + "loss": 5.0738, + "step": 16096 + }, + { + "epoch": 0.09573341897421259, + "grad_norm": 1.4531773328781128, + "learning_rate": 4.8877962842618386e-05, + "loss": 5.0517, + "step": 16097 + }, + { + "epoch": 0.09573936625749357, + "grad_norm": 1.458679437637329, + "learning_rate": 4.887782447268004e-05, + "loss": 4.9291, + "step": 16098 + }, + { + "epoch": 0.09574531354077458, + "grad_norm": 1.6293518543243408, + "learning_rate": 4.8877686094406196e-05, + "loss": 4.7676, + "step": 16099 + }, + { + "epoch": 0.09575126082405556, + "grad_norm": 1.6756728887557983, + "learning_rate": 4.8877547707796895e-05, + "loss": 4.7426, + "step": 16100 + }, + { + "epoch": 0.09575720810733657, + "grad_norm": 1.7573354244232178, + "learning_rate": 4.8877409312852194e-05, + "loss": 4.6344, + "step": 16101 + }, + { + "epoch": 0.09576315539061757, + "grad_norm": 1.701581597328186, + "learning_rate": 4.8877270909572126e-05, + "loss": 4.8023, + "step": 16102 + }, + { + "epoch": 0.09576910267389856, + "grad_norm": 1.4811267852783203, + "learning_rate": 4.887713249795676e-05, + "loss": 4.9964, + "step": 16103 + }, + { + "epoch": 0.09577504995717956, + "grad_norm": 1.4324437379837036, + "learning_rate": 4.887699407800612e-05, + "loss": 4.9657, + "step": 16104 + }, + { + "epoch": 0.09578099724046056, + "grad_norm": 1.6630572080612183, + "learning_rate": 4.8876855649720285e-05, + "loss": 4.8689, + "step": 16105 + }, + { + "epoch": 0.09578694452374155, + "grad_norm": 1.8548660278320312, + "learning_rate": 4.887671721309928e-05, + "loss": 4.8775, + "step": 16106 + }, + { + "epoch": 0.09579289180702255, + "grad_norm": 1.5234023332595825, + "learning_rate": 4.887657876814316e-05, + "loss": 5.1495, + "step": 16107 + }, + { + "epoch": 0.09579883909030355, + "grad_norm": 1.5281673669815063, + "learning_rate": 4.8876440314851967e-05, + "loss": 4.8887, + "step": 16108 + }, + { + "epoch": 0.09580478637358454, + "grad_norm": 1.6189017295837402, + "learning_rate": 4.887630185322576e-05, + "loss": 4.7103, + "step": 16109 + }, + { + "epoch": 0.09581073365686554, + "grad_norm": 1.8149834871292114, + "learning_rate": 4.8876163383264584e-05, + "loss": 4.5674, + "step": 16110 + }, + { + "epoch": 0.09581668094014655, + "grad_norm": 1.6370511054992676, + "learning_rate": 4.887602490496848e-05, + "loss": 4.6307, + "step": 16111 + }, + { + "epoch": 0.09582262822342753, + "grad_norm": 1.603553056716919, + "learning_rate": 4.887588641833751e-05, + "loss": 4.597, + "step": 16112 + }, + { + "epoch": 0.09582857550670854, + "grad_norm": 1.6511812210083008, + "learning_rate": 4.887574792337171e-05, + "loss": 4.604, + "step": 16113 + }, + { + "epoch": 0.09583452278998954, + "grad_norm": 1.6924868822097778, + "learning_rate": 4.887560942007113e-05, + "loss": 4.6674, + "step": 16114 + }, + { + "epoch": 0.09584047007327053, + "grad_norm": 1.6445999145507812, + "learning_rate": 4.887547090843583e-05, + "loss": 4.492, + "step": 16115 + }, + { + "epoch": 0.09584641735655153, + "grad_norm": 2.282087564468384, + "learning_rate": 4.887533238846585e-05, + "loss": 5.7458, + "step": 16116 + }, + { + "epoch": 0.09585236463983253, + "grad_norm": 1.8790422677993774, + "learning_rate": 4.887519386016123e-05, + "loss": 5.6642, + "step": 16117 + }, + { + "epoch": 0.09585831192311352, + "grad_norm": 1.887954592704773, + "learning_rate": 4.887505532352203e-05, + "loss": 5.8485, + "step": 16118 + }, + { + "epoch": 0.09586425920639452, + "grad_norm": 1.8805441856384277, + "learning_rate": 4.88749167785483e-05, + "loss": 5.5941, + "step": 16119 + }, + { + "epoch": 0.09587020648967552, + "grad_norm": 2.141098976135254, + "learning_rate": 4.8874778225240076e-05, + "loss": 5.1748, + "step": 16120 + }, + { + "epoch": 0.09587615377295651, + "grad_norm": 1.560094952583313, + "learning_rate": 4.887463966359741e-05, + "loss": 5.625, + "step": 16121 + }, + { + "epoch": 0.09588210105623751, + "grad_norm": 1.6463109254837036, + "learning_rate": 4.887450109362036e-05, + "loss": 5.6568, + "step": 16122 + }, + { + "epoch": 0.09588804833951851, + "grad_norm": 1.5389329195022583, + "learning_rate": 4.887436251530898e-05, + "loss": 5.6461, + "step": 16123 + }, + { + "epoch": 0.0958939956227995, + "grad_norm": 1.4973753690719604, + "learning_rate": 4.8874223928663284e-05, + "loss": 5.3542, + "step": 16124 + }, + { + "epoch": 0.0958999429060805, + "grad_norm": 1.4039745330810547, + "learning_rate": 4.8874085333683364e-05, + "loss": 5.506, + "step": 16125 + }, + { + "epoch": 0.0959058901893615, + "grad_norm": 1.819114089012146, + "learning_rate": 4.8873946730369235e-05, + "loss": 5.2586, + "step": 16126 + }, + { + "epoch": 0.0959118374726425, + "grad_norm": 1.9034372568130493, + "learning_rate": 4.887380811872095e-05, + "loss": 5.1818, + "step": 16127 + }, + { + "epoch": 0.0959177847559235, + "grad_norm": 1.8390016555786133, + "learning_rate": 4.8873669498738584e-05, + "loss": 5.8263, + "step": 16128 + }, + { + "epoch": 0.09592373203920448, + "grad_norm": 1.780961275100708, + "learning_rate": 4.887353087042216e-05, + "loss": 5.801, + "step": 16129 + }, + { + "epoch": 0.09592967932248549, + "grad_norm": 1.8105396032333374, + "learning_rate": 4.887339223377173e-05, + "loss": 5.3426, + "step": 16130 + }, + { + "epoch": 0.09593562660576649, + "grad_norm": 1.9126670360565186, + "learning_rate": 4.887325358878735e-05, + "loss": 5.404, + "step": 16131 + }, + { + "epoch": 0.09594157388904748, + "grad_norm": 1.4767181873321533, + "learning_rate": 4.887311493546906e-05, + "loss": 5.5631, + "step": 16132 + }, + { + "epoch": 0.09594752117232848, + "grad_norm": 1.4779311418533325, + "learning_rate": 4.8872976273816904e-05, + "loss": 5.6407, + "step": 16133 + }, + { + "epoch": 0.09595346845560948, + "grad_norm": 1.9026421308517456, + "learning_rate": 4.8872837603830955e-05, + "loss": 5.4299, + "step": 16134 + }, + { + "epoch": 0.09595941573889047, + "grad_norm": 1.845184326171875, + "learning_rate": 4.887269892551123e-05, + "loss": 5.4873, + "step": 16135 + }, + { + "epoch": 0.09596536302217147, + "grad_norm": 2.49023175239563, + "learning_rate": 4.88725602388578e-05, + "loss": 4.1458, + "step": 16136 + }, + { + "epoch": 0.09597131030545247, + "grad_norm": 2.0831515789031982, + "learning_rate": 4.887242154387071e-05, + "loss": 5.0316, + "step": 16137 + }, + { + "epoch": 0.09597725758873346, + "grad_norm": 1.6316094398498535, + "learning_rate": 4.887228284055e-05, + "loss": 5.1289, + "step": 16138 + }, + { + "epoch": 0.09598320487201446, + "grad_norm": 2.025193214416504, + "learning_rate": 4.8872144128895724e-05, + "loss": 5.3065, + "step": 16139 + }, + { + "epoch": 0.09598915215529547, + "grad_norm": 2.077871322631836, + "learning_rate": 4.887200540890793e-05, + "loss": 5.1163, + "step": 16140 + }, + { + "epoch": 0.09599509943857645, + "grad_norm": 1.8450415134429932, + "learning_rate": 4.8871866680586666e-05, + "loss": 5.2638, + "step": 16141 + }, + { + "epoch": 0.09600104672185746, + "grad_norm": 1.676255464553833, + "learning_rate": 4.8871727943931974e-05, + "loss": 4.8191, + "step": 16142 + }, + { + "epoch": 0.09600699400513846, + "grad_norm": 1.6484187841415405, + "learning_rate": 4.8871589198943914e-05, + "loss": 5.3993, + "step": 16143 + }, + { + "epoch": 0.09601294128841945, + "grad_norm": 1.7061866521835327, + "learning_rate": 4.887145044562253e-05, + "loss": 5.2941, + "step": 16144 + }, + { + "epoch": 0.09601888857170045, + "grad_norm": 1.7628071308135986, + "learning_rate": 4.887131168396786e-05, + "loss": 5.2736, + "step": 16145 + }, + { + "epoch": 0.09602483585498145, + "grad_norm": 2.0107390880584717, + "learning_rate": 4.887117291397997e-05, + "loss": 5.1561, + "step": 16146 + }, + { + "epoch": 0.09603078313826244, + "grad_norm": 1.7889841794967651, + "learning_rate": 4.887103413565889e-05, + "loss": 6.0519, + "step": 16147 + }, + { + "epoch": 0.09603673042154344, + "grad_norm": 1.7982914447784424, + "learning_rate": 4.8870895349004686e-05, + "loss": 5.4913, + "step": 16148 + }, + { + "epoch": 0.09604267770482444, + "grad_norm": 1.8263020515441895, + "learning_rate": 4.88707565540174e-05, + "loss": 5.8516, + "step": 16149 + }, + { + "epoch": 0.09604862498810543, + "grad_norm": 1.642863392829895, + "learning_rate": 4.887061775069708e-05, + "loss": 5.5714, + "step": 16150 + }, + { + "epoch": 0.09605457227138643, + "grad_norm": 1.5696642398834229, + "learning_rate": 4.887047893904377e-05, + "loss": 5.4624, + "step": 16151 + }, + { + "epoch": 0.09606051955466743, + "grad_norm": 1.8895677328109741, + "learning_rate": 4.8870340119057536e-05, + "loss": 5.621, + "step": 16152 + }, + { + "epoch": 0.09606646683794842, + "grad_norm": 1.772875428199768, + "learning_rate": 4.8870201290738395e-05, + "loss": 5.5371, + "step": 16153 + }, + { + "epoch": 0.09607241412122942, + "grad_norm": 1.6763731241226196, + "learning_rate": 4.8870062454086415e-05, + "loss": 5.966, + "step": 16154 + }, + { + "epoch": 0.09607836140451043, + "grad_norm": 1.5911294221878052, + "learning_rate": 4.886992360910165e-05, + "loss": 5.3707, + "step": 16155 + }, + { + "epoch": 0.09608430868779141, + "grad_norm": 1.7060188055038452, + "learning_rate": 4.886978475578414e-05, + "loss": 5.5278, + "step": 16156 + }, + { + "epoch": 0.09609025597107242, + "grad_norm": 1.6456331014633179, + "learning_rate": 4.886964589413394e-05, + "loss": 5.5132, + "step": 16157 + }, + { + "epoch": 0.0960962032543534, + "grad_norm": 1.6736609935760498, + "learning_rate": 4.886950702415109e-05, + "loss": 5.245, + "step": 16158 + }, + { + "epoch": 0.0961021505376344, + "grad_norm": 1.5359262228012085, + "learning_rate": 4.886936814583564e-05, + "loss": 5.3893, + "step": 16159 + }, + { + "epoch": 0.09610809782091541, + "grad_norm": 1.5430463552474976, + "learning_rate": 4.886922925918763e-05, + "loss": 5.4257, + "step": 16160 + }, + { + "epoch": 0.0961140451041964, + "grad_norm": 1.940909743309021, + "learning_rate": 4.886909036420714e-05, + "loss": 5.0744, + "step": 16161 + }, + { + "epoch": 0.0961199923874774, + "grad_norm": 1.869372844696045, + "learning_rate": 4.886895146089418e-05, + "loss": 5.4901, + "step": 16162 + }, + { + "epoch": 0.0961259396707584, + "grad_norm": 1.794975996017456, + "learning_rate": 4.886881254924882e-05, + "loss": 5.5174, + "step": 16163 + }, + { + "epoch": 0.09613188695403939, + "grad_norm": 1.6314165592193604, + "learning_rate": 4.8868673629271105e-05, + "loss": 5.5883, + "step": 16164 + }, + { + "epoch": 0.09613783423732039, + "grad_norm": 1.7309901714324951, + "learning_rate": 4.886853470096108e-05, + "loss": 5.3881, + "step": 16165 + }, + { + "epoch": 0.09614378152060139, + "grad_norm": 1.7356623411178589, + "learning_rate": 4.88683957643188e-05, + "loss": 5.3578, + "step": 16166 + }, + { + "epoch": 0.09614972880388238, + "grad_norm": 2.302006244659424, + "learning_rate": 4.886825681934431e-05, + "loss": 5.7811, + "step": 16167 + }, + { + "epoch": 0.09615567608716338, + "grad_norm": 2.282381534576416, + "learning_rate": 4.8868117866037656e-05, + "loss": 5.8847, + "step": 16168 + }, + { + "epoch": 0.09616162337044439, + "grad_norm": 1.9158310890197754, + "learning_rate": 4.886797890439889e-05, + "loss": 5.7663, + "step": 16169 + }, + { + "epoch": 0.09616757065372537, + "grad_norm": 1.6491609811782837, + "learning_rate": 4.886783993442806e-05, + "loss": 5.9077, + "step": 16170 + }, + { + "epoch": 0.09617351793700638, + "grad_norm": 1.739547848701477, + "learning_rate": 4.886770095612521e-05, + "loss": 5.5126, + "step": 16171 + }, + { + "epoch": 0.09617946522028738, + "grad_norm": 1.534516453742981, + "learning_rate": 4.88675619694904e-05, + "loss": 5.372, + "step": 16172 + }, + { + "epoch": 0.09618541250356837, + "grad_norm": 1.8228504657745361, + "learning_rate": 4.8867422974523657e-05, + "loss": 5.4673, + "step": 16173 + }, + { + "epoch": 0.09619135978684937, + "grad_norm": 1.8887168169021606, + "learning_rate": 4.886728397122505e-05, + "loss": 5.5699, + "step": 16174 + }, + { + "epoch": 0.09619730707013037, + "grad_norm": 1.6889835596084595, + "learning_rate": 4.8867144959594626e-05, + "loss": 5.6244, + "step": 16175 + }, + { + "epoch": 0.09620325435341136, + "grad_norm": 1.7387192249298096, + "learning_rate": 4.8867005939632424e-05, + "loss": 5.7735, + "step": 16176 + }, + { + "epoch": 0.09620920163669236, + "grad_norm": 1.9036939144134521, + "learning_rate": 4.8866866911338494e-05, + "loss": 5.8873, + "step": 16177 + }, + { + "epoch": 0.09621514891997336, + "grad_norm": 1.6884106397628784, + "learning_rate": 4.886672787471289e-05, + "loss": 5.1366, + "step": 16178 + }, + { + "epoch": 0.09622109620325435, + "grad_norm": 1.5132830142974854, + "learning_rate": 4.886658882975566e-05, + "loss": 5.2964, + "step": 16179 + }, + { + "epoch": 0.09622704348653535, + "grad_norm": 1.7039000988006592, + "learning_rate": 4.886644977646685e-05, + "loss": 5.2287, + "step": 16180 + }, + { + "epoch": 0.09623299076981635, + "grad_norm": 1.6894882917404175, + "learning_rate": 4.886631071484651e-05, + "loss": 5.3205, + "step": 16181 + }, + { + "epoch": 0.09623893805309734, + "grad_norm": 2.303013324737549, + "learning_rate": 4.8866171644894684e-05, + "loss": 5.2701, + "step": 16182 + }, + { + "epoch": 0.09624488533637834, + "grad_norm": 1.6158491373062134, + "learning_rate": 4.886603256661142e-05, + "loss": 5.522, + "step": 16183 + }, + { + "epoch": 0.09625083261965935, + "grad_norm": 1.5886715650558472, + "learning_rate": 4.8865893479996776e-05, + "loss": 5.7498, + "step": 16184 + }, + { + "epoch": 0.09625677990294033, + "grad_norm": 2.007570505142212, + "learning_rate": 4.88657543850508e-05, + "loss": 5.3746, + "step": 16185 + }, + { + "epoch": 0.09626272718622134, + "grad_norm": 2.8191232681274414, + "learning_rate": 4.886561528177352e-05, + "loss": 4.9794, + "step": 16186 + }, + { + "epoch": 0.09626867446950232, + "grad_norm": 2.5193052291870117, + "learning_rate": 4.886547617016501e-05, + "loss": 4.982, + "step": 16187 + }, + { + "epoch": 0.09627462175278333, + "grad_norm": 1.8875666856765747, + "learning_rate": 4.8865337050225316e-05, + "loss": 5.1801, + "step": 16188 + }, + { + "epoch": 0.09628056903606433, + "grad_norm": 1.441834568977356, + "learning_rate": 4.8865197921954475e-05, + "loss": 5.2723, + "step": 16189 + }, + { + "epoch": 0.09628651631934532, + "grad_norm": 2.0356223583221436, + "learning_rate": 4.8865058785352536e-05, + "loss": 5.4185, + "step": 16190 + }, + { + "epoch": 0.09629246360262632, + "grad_norm": 2.03885817527771, + "learning_rate": 4.8864919640419554e-05, + "loss": 5.1636, + "step": 16191 + }, + { + "epoch": 0.09629841088590732, + "grad_norm": 2.118439197540283, + "learning_rate": 4.8864780487155576e-05, + "loss": 5.4012, + "step": 16192 + }, + { + "epoch": 0.09630435816918831, + "grad_norm": 1.8266710042953491, + "learning_rate": 4.886464132556064e-05, + "loss": 4.9442, + "step": 16193 + }, + { + "epoch": 0.09631030545246931, + "grad_norm": 1.646341323852539, + "learning_rate": 4.886450215563482e-05, + "loss": 5.1368, + "step": 16194 + }, + { + "epoch": 0.09631625273575031, + "grad_norm": 1.8833272457122803, + "learning_rate": 4.886436297737814e-05, + "loss": 5.279, + "step": 16195 + }, + { + "epoch": 0.0963222000190313, + "grad_norm": 1.9521067142486572, + "learning_rate": 4.8864223790790666e-05, + "loss": 5.6571, + "step": 16196 + }, + { + "epoch": 0.0963281473023123, + "grad_norm": 1.8902586698532104, + "learning_rate": 4.8864084595872427e-05, + "loss": 5.632, + "step": 16197 + }, + { + "epoch": 0.0963340945855933, + "grad_norm": 1.7994412183761597, + "learning_rate": 4.886394539262349e-05, + "loss": 5.574, + "step": 16198 + }, + { + "epoch": 0.0963400418688743, + "grad_norm": 1.751780390739441, + "learning_rate": 4.8863806181043895e-05, + "loss": 5.691, + "step": 16199 + }, + { + "epoch": 0.0963459891521553, + "grad_norm": 2.30880069732666, + "learning_rate": 4.8863666961133684e-05, + "loss": 5.7477, + "step": 16200 + }, + { + "epoch": 0.0963519364354363, + "grad_norm": 2.351921319961548, + "learning_rate": 4.8863527732892924e-05, + "loss": 5.8162, + "step": 16201 + }, + { + "epoch": 0.09635788371871729, + "grad_norm": 1.6124454736709595, + "learning_rate": 4.8863388496321636e-05, + "loss": 5.8105, + "step": 16202 + }, + { + "epoch": 0.09636383100199829, + "grad_norm": 1.4927148818969727, + "learning_rate": 4.886324925141991e-05, + "loss": 5.8246, + "step": 16203 + }, + { + "epoch": 0.09636977828527929, + "grad_norm": 1.71438729763031, + "learning_rate": 4.886310999818775e-05, + "loss": 5.798, + "step": 16204 + }, + { + "epoch": 0.09637572556856028, + "grad_norm": 1.9519150257110596, + "learning_rate": 4.886297073662523e-05, + "loss": 5.2815, + "step": 16205 + }, + { + "epoch": 0.09638167285184128, + "grad_norm": 1.7694860696792603, + "learning_rate": 4.88628314667324e-05, + "loss": 5.7564, + "step": 16206 + }, + { + "epoch": 0.09638762013512228, + "grad_norm": 1.658252477645874, + "learning_rate": 4.88626921885093e-05, + "loss": 5.6586, + "step": 16207 + }, + { + "epoch": 0.09639356741840327, + "grad_norm": 2.310295581817627, + "learning_rate": 4.886255290195598e-05, + "loss": 4.9317, + "step": 16208 + }, + { + "epoch": 0.09639951470168427, + "grad_norm": 2.239964246749878, + "learning_rate": 4.886241360707249e-05, + "loss": 5.3794, + "step": 16209 + }, + { + "epoch": 0.09640546198496527, + "grad_norm": 2.470205307006836, + "learning_rate": 4.886227430385887e-05, + "loss": 5.1755, + "step": 16210 + }, + { + "epoch": 0.09641140926824626, + "grad_norm": 2.208298683166504, + "learning_rate": 4.8862134992315185e-05, + "loss": 5.1296, + "step": 16211 + }, + { + "epoch": 0.09641735655152726, + "grad_norm": 2.112288475036621, + "learning_rate": 4.886199567244147e-05, + "loss": 5.0888, + "step": 16212 + }, + { + "epoch": 0.09642330383480827, + "grad_norm": 2.3725969791412354, + "learning_rate": 4.886185634423778e-05, + "loss": 5.0256, + "step": 16213 + }, + { + "epoch": 0.09642925111808925, + "grad_norm": 2.3314402103424072, + "learning_rate": 4.8861717007704164e-05, + "loss": 5.012, + "step": 16214 + }, + { + "epoch": 0.09643519840137026, + "grad_norm": 2.1015000343322754, + "learning_rate": 4.8861577662840676e-05, + "loss": 4.7244, + "step": 16215 + }, + { + "epoch": 0.09644114568465124, + "grad_norm": 2.335218906402588, + "learning_rate": 4.8861438309647344e-05, + "loss": 4.8442, + "step": 16216 + }, + { + "epoch": 0.09644709296793225, + "grad_norm": 2.249216079711914, + "learning_rate": 4.886129894812424e-05, + "loss": 5.2573, + "step": 16217 + }, + { + "epoch": 0.09645304025121325, + "grad_norm": 2.228283166885376, + "learning_rate": 4.8861159578271406e-05, + "loss": 4.7297, + "step": 16218 + }, + { + "epoch": 0.09645898753449424, + "grad_norm": 1.7820645570755005, + "learning_rate": 4.886102020008888e-05, + "loss": 4.8427, + "step": 16219 + }, + { + "epoch": 0.09646493481777524, + "grad_norm": 2.1911120414733887, + "learning_rate": 4.886088081357672e-05, + "loss": 4.9677, + "step": 16220 + }, + { + "epoch": 0.09647088210105624, + "grad_norm": 2.453758716583252, + "learning_rate": 4.8860741418734976e-05, + "loss": 4.9039, + "step": 16221 + }, + { + "epoch": 0.09647682938433723, + "grad_norm": 2.488105058670044, + "learning_rate": 4.886060201556369e-05, + "loss": 5.0211, + "step": 16222 + }, + { + "epoch": 0.09648277666761823, + "grad_norm": 2.2040843963623047, + "learning_rate": 4.8860462604062915e-05, + "loss": 5.1067, + "step": 16223 + }, + { + "epoch": 0.09648872395089923, + "grad_norm": 2.0934717655181885, + "learning_rate": 4.8860323184232695e-05, + "loss": 4.9648, + "step": 16224 + }, + { + "epoch": 0.09649467123418022, + "grad_norm": 2.3775415420532227, + "learning_rate": 4.886018375607309e-05, + "loss": 4.9459, + "step": 16225 + }, + { + "epoch": 0.09650061851746122, + "grad_norm": 2.4042131900787354, + "learning_rate": 4.886004431958414e-05, + "loss": 4.7845, + "step": 16226 + }, + { + "epoch": 0.09650656580074223, + "grad_norm": 2.34424090385437, + "learning_rate": 4.885990487476589e-05, + "loss": 5.012, + "step": 16227 + }, + { + "epoch": 0.09651251308402321, + "grad_norm": 2.2711172103881836, + "learning_rate": 4.8859765421618395e-05, + "loss": 4.906, + "step": 16228 + }, + { + "epoch": 0.09651846036730422, + "grad_norm": 2.4021360874176025, + "learning_rate": 4.8859625960141706e-05, + "loss": 4.916, + "step": 16229 + }, + { + "epoch": 0.09652440765058522, + "grad_norm": 1.9205279350280762, + "learning_rate": 4.885948649033587e-05, + "loss": 5.0469, + "step": 16230 + }, + { + "epoch": 0.0965303549338662, + "grad_norm": 2.226362466812134, + "learning_rate": 4.885934701220093e-05, + "loss": 4.9439, + "step": 16231 + }, + { + "epoch": 0.09653630221714721, + "grad_norm": 2.288909673690796, + "learning_rate": 4.885920752573694e-05, + "loss": 4.8271, + "step": 16232 + }, + { + "epoch": 0.09654224950042821, + "grad_norm": 2.132235050201416, + "learning_rate": 4.8859068030943943e-05, + "loss": 5.1891, + "step": 16233 + }, + { + "epoch": 0.0965481967837092, + "grad_norm": 2.080244541168213, + "learning_rate": 4.8858928527822e-05, + "loss": 4.9055, + "step": 16234 + }, + { + "epoch": 0.0965541440669902, + "grad_norm": 2.324211359024048, + "learning_rate": 4.8858789016371145e-05, + "loss": 5.2614, + "step": 16235 + }, + { + "epoch": 0.0965600913502712, + "grad_norm": 1.827802062034607, + "learning_rate": 4.8858649496591437e-05, + "loss": 4.8874, + "step": 16236 + }, + { + "epoch": 0.09656603863355219, + "grad_norm": 1.8670811653137207, + "learning_rate": 4.885850996848292e-05, + "loss": 5.2402, + "step": 16237 + }, + { + "epoch": 0.09657198591683319, + "grad_norm": 2.046444892883301, + "learning_rate": 4.885837043204564e-05, + "loss": 4.7029, + "step": 16238 + }, + { + "epoch": 0.0965779332001142, + "grad_norm": 2.007894992828369, + "learning_rate": 4.885823088727965e-05, + "loss": 5.6706, + "step": 16239 + }, + { + "epoch": 0.09658388048339518, + "grad_norm": 2.24422025680542, + "learning_rate": 4.8858091334185005e-05, + "loss": 5.9666, + "step": 16240 + }, + { + "epoch": 0.09658982776667618, + "grad_norm": 1.7045838832855225, + "learning_rate": 4.885795177276174e-05, + "loss": 5.3021, + "step": 16241 + }, + { + "epoch": 0.09659577504995719, + "grad_norm": 1.7880860567092896, + "learning_rate": 4.885781220300991e-05, + "loss": 4.9151, + "step": 16242 + }, + { + "epoch": 0.09660172233323817, + "grad_norm": 2.3720862865448, + "learning_rate": 4.885767262492957e-05, + "loss": 5.0868, + "step": 16243 + }, + { + "epoch": 0.09660766961651918, + "grad_norm": 1.8655211925506592, + "learning_rate": 4.8857533038520756e-05, + "loss": 5.5072, + "step": 16244 + }, + { + "epoch": 0.09661361689980018, + "grad_norm": 1.8259748220443726, + "learning_rate": 4.885739344378353e-05, + "loss": 5.5992, + "step": 16245 + }, + { + "epoch": 0.09661956418308117, + "grad_norm": 1.667145013809204, + "learning_rate": 4.885725384071793e-05, + "loss": 5.2069, + "step": 16246 + }, + { + "epoch": 0.09662551146636217, + "grad_norm": 1.8004356622695923, + "learning_rate": 4.8857114229324015e-05, + "loss": 5.232, + "step": 16247 + }, + { + "epoch": 0.09663145874964316, + "grad_norm": 1.8246740102767944, + "learning_rate": 4.8856974609601825e-05, + "loss": 5.185, + "step": 16248 + }, + { + "epoch": 0.09663740603292416, + "grad_norm": 1.7453134059906006, + "learning_rate": 4.885683498155141e-05, + "loss": 4.9118, + "step": 16249 + }, + { + "epoch": 0.09664335331620516, + "grad_norm": 1.76914381980896, + "learning_rate": 4.885669534517282e-05, + "loss": 4.6679, + "step": 16250 + }, + { + "epoch": 0.09664930059948615, + "grad_norm": 2.0119516849517822, + "learning_rate": 4.88565557004661e-05, + "loss": 4.6495, + "step": 16251 + }, + { + "epoch": 0.09665524788276715, + "grad_norm": 1.7628357410430908, + "learning_rate": 4.885641604743131e-05, + "loss": 4.7581, + "step": 16252 + }, + { + "epoch": 0.09666119516604815, + "grad_norm": 1.6456751823425293, + "learning_rate": 4.8856276386068486e-05, + "loss": 4.9539, + "step": 16253 + }, + { + "epoch": 0.09666714244932914, + "grad_norm": 1.8474618196487427, + "learning_rate": 4.885613671637769e-05, + "loss": 5.9248, + "step": 16254 + }, + { + "epoch": 0.09667308973261014, + "grad_norm": 2.1205222606658936, + "learning_rate": 4.885599703835896e-05, + "loss": 5.2783, + "step": 16255 + }, + { + "epoch": 0.09667903701589114, + "grad_norm": 1.7559815645217896, + "learning_rate": 4.885585735201235e-05, + "loss": 5.6276, + "step": 16256 + }, + { + "epoch": 0.09668498429917213, + "grad_norm": 1.5784190893173218, + "learning_rate": 4.885571765733789e-05, + "loss": 5.5933, + "step": 16257 + }, + { + "epoch": 0.09669093158245314, + "grad_norm": 1.7377841472625732, + "learning_rate": 4.885557795433567e-05, + "loss": 5.1234, + "step": 16258 + }, + { + "epoch": 0.09669687886573414, + "grad_norm": 1.6517775058746338, + "learning_rate": 4.88554382430057e-05, + "loss": 5.6291, + "step": 16259 + }, + { + "epoch": 0.09670282614901513, + "grad_norm": 1.8474104404449463, + "learning_rate": 4.885529852334805e-05, + "loss": 6.0357, + "step": 16260 + }, + { + "epoch": 0.09670877343229613, + "grad_norm": 1.6555463075637817, + "learning_rate": 4.8855158795362756e-05, + "loss": 5.9828, + "step": 16261 + }, + { + "epoch": 0.09671472071557713, + "grad_norm": 1.6003193855285645, + "learning_rate": 4.8855019059049876e-05, + "loss": 5.9705, + "step": 16262 + }, + { + "epoch": 0.09672066799885812, + "grad_norm": 1.4992772340774536, + "learning_rate": 4.885487931440945e-05, + "loss": 5.8604, + "step": 16263 + }, + { + "epoch": 0.09672661528213912, + "grad_norm": 1.8667478561401367, + "learning_rate": 4.885473956144154e-05, + "loss": 6.1141, + "step": 16264 + }, + { + "epoch": 0.09673256256542012, + "grad_norm": 1.7311911582946777, + "learning_rate": 4.8854599800146186e-05, + "loss": 5.4142, + "step": 16265 + }, + { + "epoch": 0.09673850984870111, + "grad_norm": 2.0519683361053467, + "learning_rate": 4.885446003052343e-05, + "loss": 5.4321, + "step": 16266 + }, + { + "epoch": 0.09674445713198211, + "grad_norm": 2.02132248878479, + "learning_rate": 4.8854320252573325e-05, + "loss": 5.4957, + "step": 16267 + }, + { + "epoch": 0.09675040441526311, + "grad_norm": 1.7282330989837646, + "learning_rate": 4.885418046629594e-05, + "loss": 5.4486, + "step": 16268 + }, + { + "epoch": 0.0967563516985441, + "grad_norm": 1.909114122390747, + "learning_rate": 4.885404067169129e-05, + "loss": 5.4782, + "step": 16269 + }, + { + "epoch": 0.0967622989818251, + "grad_norm": 1.897161602973938, + "learning_rate": 4.885390086875945e-05, + "loss": 5.8678, + "step": 16270 + }, + { + "epoch": 0.0967682462651061, + "grad_norm": 2.0866503715515137, + "learning_rate": 4.885376105750046e-05, + "loss": 5.0869, + "step": 16271 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 1.6914600133895874, + "learning_rate": 4.885362123791437e-05, + "loss": 5.3385, + "step": 16272 + }, + { + "epoch": 0.0967801408316681, + "grad_norm": 1.4390329122543335, + "learning_rate": 4.885348141000122e-05, + "loss": 5.8069, + "step": 16273 + }, + { + "epoch": 0.0967860881149491, + "grad_norm": 1.5077629089355469, + "learning_rate": 4.885334157376107e-05, + "loss": 5.6679, + "step": 16274 + }, + { + "epoch": 0.09679203539823009, + "grad_norm": 1.4550343751907349, + "learning_rate": 4.885320172919397e-05, + "loss": 5.7548, + "step": 16275 + }, + { + "epoch": 0.09679798268151109, + "grad_norm": 2.068070650100708, + "learning_rate": 4.8853061876299956e-05, + "loss": 4.9706, + "step": 16276 + }, + { + "epoch": 0.09680392996479208, + "grad_norm": 1.3487659692764282, + "learning_rate": 4.885292201507909e-05, + "loss": 5.6918, + "step": 16277 + }, + { + "epoch": 0.09680987724807308, + "grad_norm": 1.4306180477142334, + "learning_rate": 4.885278214553141e-05, + "loss": 5.6196, + "step": 16278 + }, + { + "epoch": 0.09681582453135408, + "grad_norm": 1.6410231590270996, + "learning_rate": 4.885264226765698e-05, + "loss": 5.0523, + "step": 16279 + }, + { + "epoch": 0.09682177181463507, + "grad_norm": 2.4701485633850098, + "learning_rate": 4.8852502381455825e-05, + "loss": 4.6255, + "step": 16280 + }, + { + "epoch": 0.09682771909791607, + "grad_norm": 2.5248069763183594, + "learning_rate": 4.885236248692802e-05, + "loss": 4.5055, + "step": 16281 + }, + { + "epoch": 0.09683366638119707, + "grad_norm": 2.1913154125213623, + "learning_rate": 4.8852222584073595e-05, + "loss": 4.748, + "step": 16282 + }, + { + "epoch": 0.09683961366447806, + "grad_norm": 1.951987385749817, + "learning_rate": 4.8852082672892606e-05, + "loss": 5.3871, + "step": 16283 + }, + { + "epoch": 0.09684556094775906, + "grad_norm": 2.007020950317383, + "learning_rate": 4.885194275338511e-05, + "loss": 6.1075, + "step": 16284 + }, + { + "epoch": 0.09685150823104006, + "grad_norm": 1.9821717739105225, + "learning_rate": 4.885180282555113e-05, + "loss": 5.1719, + "step": 16285 + }, + { + "epoch": 0.09685745551432105, + "grad_norm": 2.339564800262451, + "learning_rate": 4.885166288939074e-05, + "loss": 4.9518, + "step": 16286 + }, + { + "epoch": 0.09686340279760206, + "grad_norm": 2.1785504817962646, + "learning_rate": 4.8851522944903984e-05, + "loss": 4.9656, + "step": 16287 + }, + { + "epoch": 0.09686935008088306, + "grad_norm": 1.7723946571350098, + "learning_rate": 4.885138299209091e-05, + "loss": 6.1572, + "step": 16288 + }, + { + "epoch": 0.09687529736416405, + "grad_norm": 1.702458381652832, + "learning_rate": 4.885124303095156e-05, + "loss": 5.9616, + "step": 16289 + }, + { + "epoch": 0.09688124464744505, + "grad_norm": 2.279836893081665, + "learning_rate": 4.885110306148599e-05, + "loss": 5.4305, + "step": 16290 + }, + { + "epoch": 0.09688719193072605, + "grad_norm": 1.8569501638412476, + "learning_rate": 4.8850963083694244e-05, + "loss": 5.8019, + "step": 16291 + }, + { + "epoch": 0.09689313921400704, + "grad_norm": 1.8126327991485596, + "learning_rate": 4.885082309757637e-05, + "loss": 5.7076, + "step": 16292 + }, + { + "epoch": 0.09689908649728804, + "grad_norm": 1.7170337438583374, + "learning_rate": 4.8850683103132424e-05, + "loss": 5.9862, + "step": 16293 + }, + { + "epoch": 0.09690503378056904, + "grad_norm": 1.7631909847259521, + "learning_rate": 4.8850543100362454e-05, + "loss": 5.917, + "step": 16294 + }, + { + "epoch": 0.09691098106385003, + "grad_norm": 1.9938957691192627, + "learning_rate": 4.88504030892665e-05, + "loss": 5.5773, + "step": 16295 + }, + { + "epoch": 0.09691692834713103, + "grad_norm": 1.9459222555160522, + "learning_rate": 4.8850263069844623e-05, + "loss": 5.2847, + "step": 16296 + }, + { + "epoch": 0.09692287563041203, + "grad_norm": 1.8420277833938599, + "learning_rate": 4.8850123042096865e-05, + "loss": 5.5691, + "step": 16297 + }, + { + "epoch": 0.09692882291369302, + "grad_norm": 2.2592809200286865, + "learning_rate": 4.8849983006023267e-05, + "loss": 5.4666, + "step": 16298 + }, + { + "epoch": 0.09693477019697402, + "grad_norm": 2.080939292907715, + "learning_rate": 4.884984296162389e-05, + "loss": 5.243, + "step": 16299 + }, + { + "epoch": 0.09694071748025503, + "grad_norm": 1.648836374282837, + "learning_rate": 4.884970290889879e-05, + "loss": 5.8331, + "step": 16300 + }, + { + "epoch": 0.09694666476353601, + "grad_norm": 1.668505311012268, + "learning_rate": 4.884956284784799e-05, + "loss": 5.7523, + "step": 16301 + }, + { + "epoch": 0.09695261204681702, + "grad_norm": 1.5473688840866089, + "learning_rate": 4.8849422778471567e-05, + "loss": 5.5379, + "step": 16302 + }, + { + "epoch": 0.09695855933009802, + "grad_norm": 1.9258644580841064, + "learning_rate": 4.8849282700769545e-05, + "loss": 5.6405, + "step": 16303 + }, + { + "epoch": 0.096964506613379, + "grad_norm": 1.5651416778564453, + "learning_rate": 4.884914261474199e-05, + "loss": 6.1487, + "step": 16304 + }, + { + "epoch": 0.09697045389666001, + "grad_norm": 1.5289270877838135, + "learning_rate": 4.884900252038894e-05, + "loss": 5.6653, + "step": 16305 + }, + { + "epoch": 0.096976401179941, + "grad_norm": 1.8394510746002197, + "learning_rate": 4.8848862417710464e-05, + "loss": 4.9243, + "step": 16306 + }, + { + "epoch": 0.096982348463222, + "grad_norm": 1.7624824047088623, + "learning_rate": 4.8848722306706584e-05, + "loss": 5.7712, + "step": 16307 + }, + { + "epoch": 0.096988295746503, + "grad_norm": 1.7294182777404785, + "learning_rate": 4.8848582187377365e-05, + "loss": 5.5197, + "step": 16308 + }, + { + "epoch": 0.09699424302978399, + "grad_norm": 1.69902765750885, + "learning_rate": 4.8848442059722856e-05, + "loss": 5.6485, + "step": 16309 + }, + { + "epoch": 0.09700019031306499, + "grad_norm": 1.7867447137832642, + "learning_rate": 4.88483019237431e-05, + "loss": 5.4422, + "step": 16310 + }, + { + "epoch": 0.09700613759634599, + "grad_norm": 1.6588819026947021, + "learning_rate": 4.884816177943814e-05, + "loss": 5.4282, + "step": 16311 + }, + { + "epoch": 0.09701208487962698, + "grad_norm": 1.504918098449707, + "learning_rate": 4.884802162680804e-05, + "loss": 5.508, + "step": 16312 + }, + { + "epoch": 0.09701803216290798, + "grad_norm": 1.5852895975112915, + "learning_rate": 4.8847881465852846e-05, + "loss": 5.5567, + "step": 16313 + }, + { + "epoch": 0.09702397944618898, + "grad_norm": 1.5719797611236572, + "learning_rate": 4.88477412965726e-05, + "loss": 5.6284, + "step": 16314 + }, + { + "epoch": 0.09702992672946997, + "grad_norm": 1.4208050966262817, + "learning_rate": 4.884760111896735e-05, + "loss": 5.5653, + "step": 16315 + }, + { + "epoch": 0.09703587401275098, + "grad_norm": 1.567555546760559, + "learning_rate": 4.8847460933037156e-05, + "loss": 5.5144, + "step": 16316 + }, + { + "epoch": 0.09704182129603198, + "grad_norm": 1.9179699420928955, + "learning_rate": 4.884732073878205e-05, + "loss": 4.7947, + "step": 16317 + }, + { + "epoch": 0.09704776857931297, + "grad_norm": 2.5346062183380127, + "learning_rate": 4.88471805362021e-05, + "loss": 3.8315, + "step": 16318 + }, + { + "epoch": 0.09705371586259397, + "grad_norm": 2.585686683654785, + "learning_rate": 4.884704032529734e-05, + "loss": 3.7288, + "step": 16319 + }, + { + "epoch": 0.09705966314587497, + "grad_norm": 2.133723020553589, + "learning_rate": 4.8846900106067825e-05, + "loss": 3.6369, + "step": 16320 + }, + { + "epoch": 0.09706561042915596, + "grad_norm": 2.4039080142974854, + "learning_rate": 4.884675987851361e-05, + "loss": 3.9068, + "step": 16321 + }, + { + "epoch": 0.09707155771243696, + "grad_norm": 2.643489360809326, + "learning_rate": 4.884661964263473e-05, + "loss": 3.7793, + "step": 16322 + }, + { + "epoch": 0.09707750499571796, + "grad_norm": 2.485727071762085, + "learning_rate": 4.8846479398431244e-05, + "loss": 4.9789, + "step": 16323 + }, + { + "epoch": 0.09708345227899895, + "grad_norm": 2.8592441082000732, + "learning_rate": 4.8846339145903194e-05, + "loss": 4.0196, + "step": 16324 + }, + { + "epoch": 0.09708939956227995, + "grad_norm": 2.470813035964966, + "learning_rate": 4.884619888505064e-05, + "loss": 5.2308, + "step": 16325 + }, + { + "epoch": 0.09709534684556095, + "grad_norm": 2.3255081176757812, + "learning_rate": 4.884605861587362e-05, + "loss": 5.3535, + "step": 16326 + }, + { + "epoch": 0.09710129412884194, + "grad_norm": 2.1462676525115967, + "learning_rate": 4.8845918338372195e-05, + "loss": 5.2611, + "step": 16327 + }, + { + "epoch": 0.09710724141212294, + "grad_norm": 1.8838989734649658, + "learning_rate": 4.88457780525464e-05, + "loss": 5.8104, + "step": 16328 + }, + { + "epoch": 0.09711318869540395, + "grad_norm": 2.137746572494507, + "learning_rate": 4.884563775839629e-05, + "loss": 5.4702, + "step": 16329 + }, + { + "epoch": 0.09711913597868493, + "grad_norm": 1.8934431076049805, + "learning_rate": 4.884549745592192e-05, + "loss": 4.9703, + "step": 16330 + }, + { + "epoch": 0.09712508326196594, + "grad_norm": 2.409020185470581, + "learning_rate": 4.884535714512333e-05, + "loss": 5.6793, + "step": 16331 + }, + { + "epoch": 0.09713103054524694, + "grad_norm": 2.039520263671875, + "learning_rate": 4.884521682600056e-05, + "loss": 5.7809, + "step": 16332 + }, + { + "epoch": 0.09713697782852793, + "grad_norm": 3.1211516857147217, + "learning_rate": 4.884507649855369e-05, + "loss": 5.6195, + "step": 16333 + }, + { + "epoch": 0.09714292511180893, + "grad_norm": 1.9474505186080933, + "learning_rate": 4.884493616278274e-05, + "loss": 5.3064, + "step": 16334 + }, + { + "epoch": 0.09714887239508992, + "grad_norm": 1.7586307525634766, + "learning_rate": 4.884479581868777e-05, + "loss": 4.9531, + "step": 16335 + }, + { + "epoch": 0.09715481967837092, + "grad_norm": 1.6352753639221191, + "learning_rate": 4.884465546626883e-05, + "loss": 5.304, + "step": 16336 + }, + { + "epoch": 0.09716076696165192, + "grad_norm": 1.681362271308899, + "learning_rate": 4.884451510552597e-05, + "loss": 5.9167, + "step": 16337 + }, + { + "epoch": 0.09716671424493291, + "grad_norm": 1.7970985174179077, + "learning_rate": 4.8844374736459225e-05, + "loss": 6.122, + "step": 16338 + }, + { + "epoch": 0.09717266152821391, + "grad_norm": 1.5312799215316772, + "learning_rate": 4.8844234359068666e-05, + "loss": 4.903, + "step": 16339 + }, + { + "epoch": 0.09717860881149491, + "grad_norm": 1.7024787664413452, + "learning_rate": 4.884409397335432e-05, + "loss": 5.3306, + "step": 16340 + }, + { + "epoch": 0.0971845560947759, + "grad_norm": 3.000169515609741, + "learning_rate": 4.884395357931626e-05, + "loss": 4.9682, + "step": 16341 + }, + { + "epoch": 0.0971905033780569, + "grad_norm": 2.910048484802246, + "learning_rate": 4.884381317695452e-05, + "loss": 5.2385, + "step": 16342 + }, + { + "epoch": 0.0971964506613379, + "grad_norm": 2.1094155311584473, + "learning_rate": 4.8843672766269147e-05, + "loss": 5.1025, + "step": 16343 + }, + { + "epoch": 0.09720239794461889, + "grad_norm": 1.7918319702148438, + "learning_rate": 4.884353234726019e-05, + "loss": 5.2822, + "step": 16344 + }, + { + "epoch": 0.0972083452278999, + "grad_norm": 1.574461579322815, + "learning_rate": 4.884339191992771e-05, + "loss": 5.6254, + "step": 16345 + }, + { + "epoch": 0.0972142925111809, + "grad_norm": 2.0780746936798096, + "learning_rate": 4.884325148427175e-05, + "loss": 5.0641, + "step": 16346 + }, + { + "epoch": 0.09722023979446189, + "grad_norm": 2.30399227142334, + "learning_rate": 4.884311104029235e-05, + "loss": 4.9591, + "step": 16347 + }, + { + "epoch": 0.09722618707774289, + "grad_norm": 2.087993621826172, + "learning_rate": 4.884297058798957e-05, + "loss": 5.0514, + "step": 16348 + }, + { + "epoch": 0.09723213436102389, + "grad_norm": 2.0179786682128906, + "learning_rate": 4.884283012736345e-05, + "loss": 4.9632, + "step": 16349 + }, + { + "epoch": 0.09723808164430488, + "grad_norm": 2.4394171237945557, + "learning_rate": 4.8842689658414054e-05, + "loss": 4.6517, + "step": 16350 + }, + { + "epoch": 0.09724402892758588, + "grad_norm": 2.6895275115966797, + "learning_rate": 4.884254918114142e-05, + "loss": 4.726, + "step": 16351 + }, + { + "epoch": 0.09724997621086688, + "grad_norm": 1.5181125402450562, + "learning_rate": 4.884240869554559e-05, + "loss": 5.679, + "step": 16352 + }, + { + "epoch": 0.09725592349414787, + "grad_norm": 1.758475422859192, + "learning_rate": 4.884226820162662e-05, + "loss": 5.2323, + "step": 16353 + }, + { + "epoch": 0.09726187077742887, + "grad_norm": 2.0166938304901123, + "learning_rate": 4.884212769938457e-05, + "loss": 4.6912, + "step": 16354 + }, + { + "epoch": 0.09726781806070987, + "grad_norm": 2.1366612911224365, + "learning_rate": 4.8841987188819475e-05, + "loss": 4.4761, + "step": 16355 + }, + { + "epoch": 0.09727376534399086, + "grad_norm": 1.9595547914505005, + "learning_rate": 4.884184666993139e-05, + "loss": 4.5343, + "step": 16356 + }, + { + "epoch": 0.09727971262727186, + "grad_norm": 1.896043300628662, + "learning_rate": 4.884170614272037e-05, + "loss": 4.465, + "step": 16357 + }, + { + "epoch": 0.09728565991055287, + "grad_norm": 2.062506675720215, + "learning_rate": 4.884156560718645e-05, + "loss": 4.301, + "step": 16358 + }, + { + "epoch": 0.09729160719383385, + "grad_norm": 2.0816612243652344, + "learning_rate": 4.884142506332968e-05, + "loss": 4.5414, + "step": 16359 + }, + { + "epoch": 0.09729755447711486, + "grad_norm": 2.0095489025115967, + "learning_rate": 4.884128451115012e-05, + "loss": 4.3779, + "step": 16360 + }, + { + "epoch": 0.09730350176039586, + "grad_norm": 2.0766615867614746, + "learning_rate": 4.884114395064781e-05, + "loss": 4.3999, + "step": 16361 + }, + { + "epoch": 0.09730944904367685, + "grad_norm": 2.0266785621643066, + "learning_rate": 4.8841003381822805e-05, + "loss": 4.5122, + "step": 16362 + }, + { + "epoch": 0.09731539632695785, + "grad_norm": 1.9631284475326538, + "learning_rate": 4.884086280467516e-05, + "loss": 4.3061, + "step": 16363 + }, + { + "epoch": 0.09732134361023884, + "grad_norm": 2.2965009212493896, + "learning_rate": 4.8840722219204905e-05, + "loss": 4.3387, + "step": 16364 + }, + { + "epoch": 0.09732729089351984, + "grad_norm": 2.036365509033203, + "learning_rate": 4.8840581625412105e-05, + "loss": 4.3242, + "step": 16365 + }, + { + "epoch": 0.09733323817680084, + "grad_norm": 2.186131477355957, + "learning_rate": 4.88404410232968e-05, + "loss": 4.2517, + "step": 16366 + }, + { + "epoch": 0.09733918546008183, + "grad_norm": 2.2000489234924316, + "learning_rate": 4.884030041285905e-05, + "loss": 4.274, + "step": 16367 + }, + { + "epoch": 0.09734513274336283, + "grad_norm": 3.2708849906921387, + "learning_rate": 4.884015979409889e-05, + "loss": 4.9575, + "step": 16368 + }, + { + "epoch": 0.09735108002664383, + "grad_norm": 1.7634176015853882, + "learning_rate": 4.884001916701639e-05, + "loss": 4.63, + "step": 16369 + }, + { + "epoch": 0.09735702730992482, + "grad_norm": 2.297611713409424, + "learning_rate": 4.883987853161157e-05, + "loss": 4.3009, + "step": 16370 + }, + { + "epoch": 0.09736297459320582, + "grad_norm": 2.1840944290161133, + "learning_rate": 4.8839737887884507e-05, + "loss": 4.2232, + "step": 16371 + }, + { + "epoch": 0.09736892187648682, + "grad_norm": 2.1925270557403564, + "learning_rate": 4.8839597235835234e-05, + "loss": 4.1824, + "step": 16372 + }, + { + "epoch": 0.09737486915976781, + "grad_norm": 2.175720453262329, + "learning_rate": 4.88394565754638e-05, + "loss": 4.2619, + "step": 16373 + }, + { + "epoch": 0.09738081644304881, + "grad_norm": 2.282804489135742, + "learning_rate": 4.883931590677026e-05, + "loss": 4.2207, + "step": 16374 + }, + { + "epoch": 0.09738676372632982, + "grad_norm": 1.674668788909912, + "learning_rate": 4.883917522975466e-05, + "loss": 5.3627, + "step": 16375 + }, + { + "epoch": 0.0973927110096108, + "grad_norm": 1.6538902521133423, + "learning_rate": 4.883903454441705e-05, + "loss": 5.302, + "step": 16376 + }, + { + "epoch": 0.09739865829289181, + "grad_norm": 1.4267115592956543, + "learning_rate": 4.8838893850757485e-05, + "loss": 5.2545, + "step": 16377 + }, + { + "epoch": 0.09740460557617281, + "grad_norm": 1.3086082935333252, + "learning_rate": 4.8838753148776e-05, + "loss": 5.1538, + "step": 16378 + }, + { + "epoch": 0.0974105528594538, + "grad_norm": 1.4384034872055054, + "learning_rate": 4.883861243847266e-05, + "loss": 5.3925, + "step": 16379 + }, + { + "epoch": 0.0974165001427348, + "grad_norm": 1.4971977472305298, + "learning_rate": 4.88384717198475e-05, + "loss": 5.3966, + "step": 16380 + }, + { + "epoch": 0.0974224474260158, + "grad_norm": 1.517468810081482, + "learning_rate": 4.8838330992900584e-05, + "loss": 5.1097, + "step": 16381 + }, + { + "epoch": 0.09742839470929679, + "grad_norm": 1.388852596282959, + "learning_rate": 4.8838190257631944e-05, + "loss": 5.1066, + "step": 16382 + }, + { + "epoch": 0.09743434199257779, + "grad_norm": 1.2972341775894165, + "learning_rate": 4.8838049514041646e-05, + "loss": 5.0383, + "step": 16383 + }, + { + "epoch": 0.0974402892758588, + "grad_norm": 1.338291049003601, + "learning_rate": 4.883790876212972e-05, + "loss": 5.1339, + "step": 16384 + }, + { + "epoch": 0.09744623655913978, + "grad_norm": 1.4399670362472534, + "learning_rate": 4.883776800189624e-05, + "loss": 5.0542, + "step": 16385 + }, + { + "epoch": 0.09745218384242078, + "grad_norm": 1.5091251134872437, + "learning_rate": 4.8837627233341235e-05, + "loss": 4.9303, + "step": 16386 + }, + { + "epoch": 0.09745813112570179, + "grad_norm": 1.4728022813796997, + "learning_rate": 4.8837486456464764e-05, + "loss": 5.0902, + "step": 16387 + }, + { + "epoch": 0.09746407840898277, + "grad_norm": 1.454509973526001, + "learning_rate": 4.8837345671266865e-05, + "loss": 4.9227, + "step": 16388 + }, + { + "epoch": 0.09747002569226378, + "grad_norm": 1.431118130683899, + "learning_rate": 4.88372048777476e-05, + "loss": 5.0128, + "step": 16389 + }, + { + "epoch": 0.09747597297554478, + "grad_norm": 1.434967041015625, + "learning_rate": 4.8837064075907015e-05, + "loss": 5.1793, + "step": 16390 + }, + { + "epoch": 0.09748192025882577, + "grad_norm": 1.5077275037765503, + "learning_rate": 4.883692326574515e-05, + "loss": 5.1573, + "step": 16391 + }, + { + "epoch": 0.09748786754210677, + "grad_norm": 1.44413161277771, + "learning_rate": 4.883678244726208e-05, + "loss": 5.2297, + "step": 16392 + }, + { + "epoch": 0.09749381482538776, + "grad_norm": 1.606898546218872, + "learning_rate": 4.883664162045781e-05, + "loss": 4.9409, + "step": 16393 + }, + { + "epoch": 0.09749976210866876, + "grad_norm": 1.649034857749939, + "learning_rate": 4.883650078533243e-05, + "loss": 5.1519, + "step": 16394 + }, + { + "epoch": 0.09750570939194976, + "grad_norm": 1.5309730768203735, + "learning_rate": 4.883635994188597e-05, + "loss": 4.9568, + "step": 16395 + }, + { + "epoch": 0.09751165667523075, + "grad_norm": 1.8033829927444458, + "learning_rate": 4.883621909011848e-05, + "loss": 4.7442, + "step": 16396 + }, + { + "epoch": 0.09751760395851175, + "grad_norm": 1.653501272201538, + "learning_rate": 4.8836078230030016e-05, + "loss": 4.5672, + "step": 16397 + }, + { + "epoch": 0.09752355124179275, + "grad_norm": 1.686077356338501, + "learning_rate": 4.8835937361620624e-05, + "loss": 4.5819, + "step": 16398 + }, + { + "epoch": 0.09752949852507374, + "grad_norm": 1.5233088731765747, + "learning_rate": 4.883579648489035e-05, + "loss": 4.5191, + "step": 16399 + }, + { + "epoch": 0.09753544580835474, + "grad_norm": 1.6472907066345215, + "learning_rate": 4.883565559983925e-05, + "loss": 4.6418, + "step": 16400 + }, + { + "epoch": 0.09754139309163574, + "grad_norm": 1.817649483680725, + "learning_rate": 4.8835514706467364e-05, + "loss": 4.806, + "step": 16401 + }, + { + "epoch": 0.09754734037491673, + "grad_norm": 1.8404059410095215, + "learning_rate": 4.8835373804774754e-05, + "loss": 4.8169, + "step": 16402 + }, + { + "epoch": 0.09755328765819773, + "grad_norm": 1.5510175228118896, + "learning_rate": 4.883523289476145e-05, + "loss": 4.7987, + "step": 16403 + }, + { + "epoch": 0.09755923494147874, + "grad_norm": 1.4557734727859497, + "learning_rate": 4.8835091976427514e-05, + "loss": 4.7322, + "step": 16404 + }, + { + "epoch": 0.09756518222475973, + "grad_norm": 1.528123140335083, + "learning_rate": 4.8834951049773006e-05, + "loss": 4.7376, + "step": 16405 + }, + { + "epoch": 0.09757112950804073, + "grad_norm": 1.6215547323226929, + "learning_rate": 4.8834810114797944e-05, + "loss": 4.7679, + "step": 16406 + }, + { + "epoch": 0.09757707679132173, + "grad_norm": 1.4554566144943237, + "learning_rate": 4.883466917150241e-05, + "loss": 4.6452, + "step": 16407 + }, + { + "epoch": 0.09758302407460272, + "grad_norm": 1.5100599527359009, + "learning_rate": 4.883452821988644e-05, + "loss": 4.6957, + "step": 16408 + }, + { + "epoch": 0.09758897135788372, + "grad_norm": 1.7057833671569824, + "learning_rate": 4.8834387259950074e-05, + "loss": 4.7888, + "step": 16409 + }, + { + "epoch": 0.09759491864116472, + "grad_norm": 1.4016892910003662, + "learning_rate": 4.883424629169337e-05, + "loss": 4.769, + "step": 16410 + }, + { + "epoch": 0.09760086592444571, + "grad_norm": 1.5257891416549683, + "learning_rate": 4.883410531511638e-05, + "loss": 4.7443, + "step": 16411 + }, + { + "epoch": 0.09760681320772671, + "grad_norm": 1.3904502391815186, + "learning_rate": 4.883396433021916e-05, + "loss": 4.786, + "step": 16412 + }, + { + "epoch": 0.09761276049100771, + "grad_norm": 1.6081106662750244, + "learning_rate": 4.883382333700174e-05, + "loss": 4.5321, + "step": 16413 + }, + { + "epoch": 0.0976187077742887, + "grad_norm": 1.4291402101516724, + "learning_rate": 4.883368233546417e-05, + "loss": 4.5898, + "step": 16414 + }, + { + "epoch": 0.0976246550575697, + "grad_norm": 1.5700920820236206, + "learning_rate": 4.8833541325606524e-05, + "loss": 5.2177, + "step": 16415 + }, + { + "epoch": 0.0976306023408507, + "grad_norm": 1.5503007173538208, + "learning_rate": 4.8833400307428825e-05, + "loss": 5.3911, + "step": 16416 + }, + { + "epoch": 0.0976365496241317, + "grad_norm": 1.5890953540802002, + "learning_rate": 4.8833259280931135e-05, + "loss": 4.9426, + "step": 16417 + }, + { + "epoch": 0.0976424969074127, + "grad_norm": 1.5032304525375366, + "learning_rate": 4.8833118246113494e-05, + "loss": 4.6124, + "step": 16418 + }, + { + "epoch": 0.0976484441906937, + "grad_norm": 1.5300242900848389, + "learning_rate": 4.8832977202975964e-05, + "loss": 4.9323, + "step": 16419 + }, + { + "epoch": 0.09765439147397469, + "grad_norm": 1.7094424962997437, + "learning_rate": 4.883283615151859e-05, + "loss": 5.3205, + "step": 16420 + }, + { + "epoch": 0.09766033875725569, + "grad_norm": 1.8231004476547241, + "learning_rate": 4.883269509174142e-05, + "loss": 5.0414, + "step": 16421 + }, + { + "epoch": 0.09766628604053668, + "grad_norm": 1.7779520750045776, + "learning_rate": 4.8832554023644496e-05, + "loss": 4.9106, + "step": 16422 + }, + { + "epoch": 0.09767223332381768, + "grad_norm": 1.5394103527069092, + "learning_rate": 4.8832412947227875e-05, + "loss": 4.998, + "step": 16423 + }, + { + "epoch": 0.09767818060709868, + "grad_norm": 1.3814078569412231, + "learning_rate": 4.883227186249161e-05, + "loss": 4.9109, + "step": 16424 + }, + { + "epoch": 0.09768412789037967, + "grad_norm": 1.291040301322937, + "learning_rate": 4.8832130769435735e-05, + "loss": 5.3617, + "step": 16425 + }, + { + "epoch": 0.09769007517366067, + "grad_norm": 1.561249017715454, + "learning_rate": 4.883198966806032e-05, + "loss": 5.3041, + "step": 16426 + }, + { + "epoch": 0.09769602245694167, + "grad_norm": 1.7411010265350342, + "learning_rate": 4.883184855836539e-05, + "loss": 5.0816, + "step": 16427 + }, + { + "epoch": 0.09770196974022266, + "grad_norm": 1.6507155895233154, + "learning_rate": 4.8831707440351024e-05, + "loss": 5.1089, + "step": 16428 + }, + { + "epoch": 0.09770791702350366, + "grad_norm": 1.5242364406585693, + "learning_rate": 4.8831566314017254e-05, + "loss": 4.9718, + "step": 16429 + }, + { + "epoch": 0.09771386430678466, + "grad_norm": 2.3768868446350098, + "learning_rate": 4.883142517936412e-05, + "loss": 4.9333, + "step": 16430 + }, + { + "epoch": 0.09771981159006565, + "grad_norm": 1.2830429077148438, + "learning_rate": 4.8831284036391684e-05, + "loss": 4.9238, + "step": 16431 + }, + { + "epoch": 0.09772575887334665, + "grad_norm": 1.5065499544143677, + "learning_rate": 4.883114288509999e-05, + "loss": 5.0151, + "step": 16432 + }, + { + "epoch": 0.09773170615662766, + "grad_norm": 1.5989798307418823, + "learning_rate": 4.88310017254891e-05, + "loss": 5.0081, + "step": 16433 + }, + { + "epoch": 0.09773765343990864, + "grad_norm": 1.391644835472107, + "learning_rate": 4.883086055755905e-05, + "loss": 4.8942, + "step": 16434 + }, + { + "epoch": 0.09774360072318965, + "grad_norm": 1.4952952861785889, + "learning_rate": 4.883071938130989e-05, + "loss": 5.0018, + "step": 16435 + }, + { + "epoch": 0.09774954800647065, + "grad_norm": 1.522814393043518, + "learning_rate": 4.883057819674168e-05, + "loss": 5.2591, + "step": 16436 + }, + { + "epoch": 0.09775549528975164, + "grad_norm": 1.3879649639129639, + "learning_rate": 4.8830437003854454e-05, + "loss": 4.9136, + "step": 16437 + }, + { + "epoch": 0.09776144257303264, + "grad_norm": 1.3485056161880493, + "learning_rate": 4.883029580264827e-05, + "loss": 5.5159, + "step": 16438 + }, + { + "epoch": 0.09776738985631364, + "grad_norm": 1.475131869316101, + "learning_rate": 4.883015459312317e-05, + "loss": 5.4397, + "step": 16439 + }, + { + "epoch": 0.09777333713959463, + "grad_norm": 1.2736895084381104, + "learning_rate": 4.8830013375279215e-05, + "loss": 5.2867, + "step": 16440 + }, + { + "epoch": 0.09777928442287563, + "grad_norm": 1.456312656402588, + "learning_rate": 4.882987214911645e-05, + "loss": 5.3351, + "step": 16441 + }, + { + "epoch": 0.09778523170615663, + "grad_norm": 1.5312397480010986, + "learning_rate": 4.882973091463492e-05, + "loss": 5.3233, + "step": 16442 + }, + { + "epoch": 0.09779117898943762, + "grad_norm": 1.5735961198806763, + "learning_rate": 4.882958967183468e-05, + "loss": 4.9878, + "step": 16443 + }, + { + "epoch": 0.09779712627271862, + "grad_norm": 1.337172508239746, + "learning_rate": 4.882944842071577e-05, + "loss": 5.121, + "step": 16444 + }, + { + "epoch": 0.09780307355599963, + "grad_norm": 1.47593355178833, + "learning_rate": 4.882930716127826e-05, + "loss": 5.4733, + "step": 16445 + }, + { + "epoch": 0.09780902083928061, + "grad_norm": 1.4311164617538452, + "learning_rate": 4.882916589352217e-05, + "loss": 5.2215, + "step": 16446 + }, + { + "epoch": 0.09781496812256162, + "grad_norm": 1.3628556728363037, + "learning_rate": 4.882902461744757e-05, + "loss": 5.3611, + "step": 16447 + }, + { + "epoch": 0.09782091540584262, + "grad_norm": 1.5621687173843384, + "learning_rate": 4.882888333305451e-05, + "loss": 5.4407, + "step": 16448 + }, + { + "epoch": 0.0978268626891236, + "grad_norm": 1.570478081703186, + "learning_rate": 4.8828742040343024e-05, + "loss": 5.533, + "step": 16449 + }, + { + "epoch": 0.09783280997240461, + "grad_norm": 1.3725816011428833, + "learning_rate": 4.8828600739313174e-05, + "loss": 5.1467, + "step": 16450 + }, + { + "epoch": 0.0978387572556856, + "grad_norm": 1.4899497032165527, + "learning_rate": 4.8828459429965e-05, + "loss": 5.233, + "step": 16451 + }, + { + "epoch": 0.0978447045389666, + "grad_norm": 1.380609154701233, + "learning_rate": 4.882831811229857e-05, + "loss": 5.1484, + "step": 16452 + }, + { + "epoch": 0.0978506518222476, + "grad_norm": 1.2167932987213135, + "learning_rate": 4.882817678631391e-05, + "loss": 5.1687, + "step": 16453 + }, + { + "epoch": 0.09785659910552859, + "grad_norm": 1.5250643491744995, + "learning_rate": 4.882803545201108e-05, + "loss": 5.2395, + "step": 16454 + }, + { + "epoch": 0.09786254638880959, + "grad_norm": 1.4288511276245117, + "learning_rate": 4.882789410939013e-05, + "loss": 5.0532, + "step": 16455 + }, + { + "epoch": 0.09786849367209059, + "grad_norm": 1.6325379610061646, + "learning_rate": 4.8827752758451105e-05, + "loss": 5.2077, + "step": 16456 + }, + { + "epoch": 0.09787444095537158, + "grad_norm": 1.4227756261825562, + "learning_rate": 4.882761139919406e-05, + "loss": 5.0431, + "step": 16457 + }, + { + "epoch": 0.09788038823865258, + "grad_norm": 1.355039358139038, + "learning_rate": 4.8827470031619046e-05, + "loss": 4.9062, + "step": 16458 + }, + { + "epoch": 0.09788633552193358, + "grad_norm": 1.5071823596954346, + "learning_rate": 4.8827328655726113e-05, + "loss": 5.2632, + "step": 16459 + }, + { + "epoch": 0.09789228280521457, + "grad_norm": 1.411828637123108, + "learning_rate": 4.88271872715153e-05, + "loss": 5.343, + "step": 16460 + }, + { + "epoch": 0.09789823008849557, + "grad_norm": 1.419164776802063, + "learning_rate": 4.882704587898666e-05, + "loss": 5.1643, + "step": 16461 + }, + { + "epoch": 0.09790417737177658, + "grad_norm": 1.4997645616531372, + "learning_rate": 4.882690447814024e-05, + "loss": 5.1701, + "step": 16462 + }, + { + "epoch": 0.09791012465505756, + "grad_norm": 1.4251139163970947, + "learning_rate": 4.88267630689761e-05, + "loss": 5.0228, + "step": 16463 + }, + { + "epoch": 0.09791607193833857, + "grad_norm": 1.289102554321289, + "learning_rate": 4.882662165149429e-05, + "loss": 5.1934, + "step": 16464 + }, + { + "epoch": 0.09792201922161957, + "grad_norm": 1.1589713096618652, + "learning_rate": 4.882648022569484e-05, + "loss": 5.3388, + "step": 16465 + }, + { + "epoch": 0.09792796650490056, + "grad_norm": 1.1682082414627075, + "learning_rate": 4.8826338791577816e-05, + "loss": 5.2062, + "step": 16466 + }, + { + "epoch": 0.09793391378818156, + "grad_norm": 1.2263107299804688, + "learning_rate": 4.882619734914326e-05, + "loss": 5.414, + "step": 16467 + }, + { + "epoch": 0.09793986107146256, + "grad_norm": 1.2873631715774536, + "learning_rate": 4.882605589839123e-05, + "loss": 5.4286, + "step": 16468 + }, + { + "epoch": 0.09794580835474355, + "grad_norm": 1.2950979471206665, + "learning_rate": 4.882591443932177e-05, + "loss": 5.1603, + "step": 16469 + }, + { + "epoch": 0.09795175563802455, + "grad_norm": 1.5623066425323486, + "learning_rate": 4.882577297193493e-05, + "loss": 5.0778, + "step": 16470 + }, + { + "epoch": 0.09795770292130555, + "grad_norm": 1.5446339845657349, + "learning_rate": 4.882563149623076e-05, + "loss": 5.1451, + "step": 16471 + }, + { + "epoch": 0.09796365020458654, + "grad_norm": 1.599387526512146, + "learning_rate": 4.882549001220931e-05, + "loss": 5.4596, + "step": 16472 + }, + { + "epoch": 0.09796959748786754, + "grad_norm": 1.325596809387207, + "learning_rate": 4.882534851987062e-05, + "loss": 5.4639, + "step": 16473 + }, + { + "epoch": 0.09797554477114855, + "grad_norm": 1.3077852725982666, + "learning_rate": 4.8825207019214746e-05, + "loss": 5.3654, + "step": 16474 + }, + { + "epoch": 0.09798149205442953, + "grad_norm": 1.5500328540802002, + "learning_rate": 4.882506551024174e-05, + "loss": 4.946, + "step": 16475 + }, + { + "epoch": 0.09798743933771054, + "grad_norm": 1.6101415157318115, + "learning_rate": 4.8824923992951656e-05, + "loss": 4.9618, + "step": 16476 + }, + { + "epoch": 0.09799338662099154, + "grad_norm": 1.542837381362915, + "learning_rate": 4.882478246734453e-05, + "loss": 4.9959, + "step": 16477 + }, + { + "epoch": 0.09799933390427253, + "grad_norm": 1.5618165731430054, + "learning_rate": 4.8824640933420424e-05, + "loss": 5.1221, + "step": 16478 + }, + { + "epoch": 0.09800528118755353, + "grad_norm": 1.4425160884857178, + "learning_rate": 4.882449939117938e-05, + "loss": 5.1689, + "step": 16479 + }, + { + "epoch": 0.09801122847083452, + "grad_norm": 1.3621004819869995, + "learning_rate": 4.8824357840621445e-05, + "loss": 4.9975, + "step": 16480 + }, + { + "epoch": 0.09801717575411552, + "grad_norm": 1.5944523811340332, + "learning_rate": 4.882421628174668e-05, + "loss": 5.0296, + "step": 16481 + }, + { + "epoch": 0.09802312303739652, + "grad_norm": 1.391321063041687, + "learning_rate": 4.8824074714555125e-05, + "loss": 5.0139, + "step": 16482 + }, + { + "epoch": 0.09802907032067751, + "grad_norm": 1.2085964679718018, + "learning_rate": 4.882393313904683e-05, + "loss": 5.1125, + "step": 16483 + }, + { + "epoch": 0.09803501760395851, + "grad_norm": 1.391383409500122, + "learning_rate": 4.882379155522185e-05, + "loss": 5.2999, + "step": 16484 + }, + { + "epoch": 0.09804096488723951, + "grad_norm": 1.3748564720153809, + "learning_rate": 4.882364996308023e-05, + "loss": 5.3096, + "step": 16485 + }, + { + "epoch": 0.0980469121705205, + "grad_norm": 1.825728416442871, + "learning_rate": 4.8823508362622014e-05, + "loss": 5.3318, + "step": 16486 + }, + { + "epoch": 0.0980528594538015, + "grad_norm": 1.6402180194854736, + "learning_rate": 4.882336675384726e-05, + "loss": 5.155, + "step": 16487 + }, + { + "epoch": 0.0980588067370825, + "grad_norm": 1.343284249305725, + "learning_rate": 4.882322513675601e-05, + "loss": 4.9341, + "step": 16488 + }, + { + "epoch": 0.09806475402036349, + "grad_norm": 1.3958711624145508, + "learning_rate": 4.882308351134833e-05, + "loss": 4.9595, + "step": 16489 + }, + { + "epoch": 0.0980707013036445, + "grad_norm": 1.572996735572815, + "learning_rate": 4.882294187762425e-05, + "loss": 4.9666, + "step": 16490 + }, + { + "epoch": 0.0980766485869255, + "grad_norm": 1.6167391538619995, + "learning_rate": 4.882280023558383e-05, + "loss": 4.7387, + "step": 16491 + }, + { + "epoch": 0.09808259587020648, + "grad_norm": 2.474092483520508, + "learning_rate": 4.882265858522711e-05, + "loss": 5.1476, + "step": 16492 + }, + { + "epoch": 0.09808854315348749, + "grad_norm": 1.5375875234603882, + "learning_rate": 4.8822516926554155e-05, + "loss": 4.5832, + "step": 16493 + }, + { + "epoch": 0.09809449043676849, + "grad_norm": 1.6802133321762085, + "learning_rate": 4.8822375259565e-05, + "loss": 4.615, + "step": 16494 + }, + { + "epoch": 0.09810043772004948, + "grad_norm": 1.6709486246109009, + "learning_rate": 4.8822233584259703e-05, + "loss": 4.6586, + "step": 16495 + }, + { + "epoch": 0.09810638500333048, + "grad_norm": 1.5207875967025757, + "learning_rate": 4.882209190063831e-05, + "loss": 4.6748, + "step": 16496 + }, + { + "epoch": 0.09811233228661148, + "grad_norm": 1.4980802536010742, + "learning_rate": 4.882195020870087e-05, + "loss": 4.5326, + "step": 16497 + }, + { + "epoch": 0.09811827956989247, + "grad_norm": 1.473092794418335, + "learning_rate": 4.882180850844743e-05, + "loss": 4.6126, + "step": 16498 + }, + { + "epoch": 0.09812422685317347, + "grad_norm": 1.521147608757019, + "learning_rate": 4.8821666799878055e-05, + "loss": 4.6269, + "step": 16499 + }, + { + "epoch": 0.09813017413645447, + "grad_norm": 1.7371230125427246, + "learning_rate": 4.882152508299277e-05, + "loss": 4.6847, + "step": 16500 + }, + { + "epoch": 0.09813612141973546, + "grad_norm": 1.7222683429718018, + "learning_rate": 4.8821383357791636e-05, + "loss": 5.3943, + "step": 16501 + }, + { + "epoch": 0.09814206870301646, + "grad_norm": 1.523373007774353, + "learning_rate": 4.8821241624274705e-05, + "loss": 5.2822, + "step": 16502 + }, + { + "epoch": 0.09814801598629747, + "grad_norm": 1.365224838256836, + "learning_rate": 4.882109988244203e-05, + "loss": 5.1923, + "step": 16503 + }, + { + "epoch": 0.09815396326957845, + "grad_norm": 1.503907322883606, + "learning_rate": 4.882095813229365e-05, + "loss": 5.128, + "step": 16504 + }, + { + "epoch": 0.09815991055285946, + "grad_norm": 1.5996166467666626, + "learning_rate": 4.8820816373829625e-05, + "loss": 4.9296, + "step": 16505 + }, + { + "epoch": 0.09816585783614046, + "grad_norm": 1.373089075088501, + "learning_rate": 4.8820674607049994e-05, + "loss": 5.0614, + "step": 16506 + }, + { + "epoch": 0.09817180511942145, + "grad_norm": 1.3730735778808594, + "learning_rate": 4.882053283195481e-05, + "loss": 5.0374, + "step": 16507 + }, + { + "epoch": 0.09817775240270245, + "grad_norm": 1.2357912063598633, + "learning_rate": 4.882039104854413e-05, + "loss": 5.1513, + "step": 16508 + }, + { + "epoch": 0.09818369968598344, + "grad_norm": 1.402327299118042, + "learning_rate": 4.8820249256817995e-05, + "loss": 5.7344, + "step": 16509 + }, + { + "epoch": 0.09818964696926444, + "grad_norm": 1.3152369260787964, + "learning_rate": 4.882010745677645e-05, + "loss": 5.6755, + "step": 16510 + }, + { + "epoch": 0.09819559425254544, + "grad_norm": 1.409428358078003, + "learning_rate": 4.8819965648419565e-05, + "loss": 5.3562, + "step": 16511 + }, + { + "epoch": 0.09820154153582643, + "grad_norm": 1.3278082609176636, + "learning_rate": 4.881982383174737e-05, + "loss": 5.2401, + "step": 16512 + }, + { + "epoch": 0.09820748881910743, + "grad_norm": 1.287716269493103, + "learning_rate": 4.881968200675991e-05, + "loss": 4.9961, + "step": 16513 + }, + { + "epoch": 0.09821343610238843, + "grad_norm": 1.3444676399230957, + "learning_rate": 4.881954017345727e-05, + "loss": 5.5592, + "step": 16514 + }, + { + "epoch": 0.09821938338566942, + "grad_norm": 1.4815365076065063, + "learning_rate": 4.881939833183945e-05, + "loss": 5.5342, + "step": 16515 + }, + { + "epoch": 0.09822533066895042, + "grad_norm": 1.210050344467163, + "learning_rate": 4.8819256481906536e-05, + "loss": 5.5375, + "step": 16516 + }, + { + "epoch": 0.09823127795223142, + "grad_norm": 2.041801691055298, + "learning_rate": 4.881911462365857e-05, + "loss": 4.601, + "step": 16517 + }, + { + "epoch": 0.09823722523551241, + "grad_norm": 2.196315050125122, + "learning_rate": 4.881897275709558e-05, + "loss": 4.2376, + "step": 16518 + }, + { + "epoch": 0.09824317251879341, + "grad_norm": 2.1649539470672607, + "learning_rate": 4.881883088221765e-05, + "loss": 4.4159, + "step": 16519 + }, + { + "epoch": 0.09824911980207442, + "grad_norm": 2.02476167678833, + "learning_rate": 4.881868899902481e-05, + "loss": 4.4091, + "step": 16520 + }, + { + "epoch": 0.0982550670853554, + "grad_norm": 1.9262346029281616, + "learning_rate": 4.88185471075171e-05, + "loss": 4.4326, + "step": 16521 + }, + { + "epoch": 0.0982610143686364, + "grad_norm": 1.8461369276046753, + "learning_rate": 4.881840520769459e-05, + "loss": 4.1563, + "step": 16522 + }, + { + "epoch": 0.09826696165191741, + "grad_norm": 1.8261640071868896, + "learning_rate": 4.881826329955732e-05, + "loss": 4.3518, + "step": 16523 + }, + { + "epoch": 0.0982729089351984, + "grad_norm": 2.1533737182617188, + "learning_rate": 4.881812138310534e-05, + "loss": 4.292, + "step": 16524 + }, + { + "epoch": 0.0982788562184794, + "grad_norm": 2.11578369140625, + "learning_rate": 4.8817979458338705e-05, + "loss": 4.5411, + "step": 16525 + }, + { + "epoch": 0.0982848035017604, + "grad_norm": 1.8681827783584595, + "learning_rate": 4.881783752525745e-05, + "loss": 5.7264, + "step": 16526 + }, + { + "epoch": 0.09829075078504139, + "grad_norm": 1.98794424533844, + "learning_rate": 4.881769558386163e-05, + "loss": 5.4694, + "step": 16527 + }, + { + "epoch": 0.09829669806832239, + "grad_norm": 2.6389517784118652, + "learning_rate": 4.881755363415131e-05, + "loss": 5.0086, + "step": 16528 + }, + { + "epoch": 0.0983026453516034, + "grad_norm": 2.2565221786499023, + "learning_rate": 4.881741167612653e-05, + "loss": 4.9219, + "step": 16529 + }, + { + "epoch": 0.09830859263488438, + "grad_norm": 1.8296940326690674, + "learning_rate": 4.881726970978733e-05, + "loss": 4.9185, + "step": 16530 + }, + { + "epoch": 0.09831453991816538, + "grad_norm": 2.031334638595581, + "learning_rate": 4.8817127735133774e-05, + "loss": 4.8589, + "step": 16531 + }, + { + "epoch": 0.09832048720144639, + "grad_norm": 1.5883747339248657, + "learning_rate": 4.8816985752165904e-05, + "loss": 5.2695, + "step": 16532 + }, + { + "epoch": 0.09832643448472737, + "grad_norm": 1.4946906566619873, + "learning_rate": 4.8816843760883755e-05, + "loss": 5.6835, + "step": 16533 + }, + { + "epoch": 0.09833238176800838, + "grad_norm": 1.7901808023452759, + "learning_rate": 4.881670176128741e-05, + "loss": 6.1753, + "step": 16534 + }, + { + "epoch": 0.09833832905128938, + "grad_norm": 1.7249737977981567, + "learning_rate": 4.881655975337689e-05, + "loss": 5.86, + "step": 16535 + }, + { + "epoch": 0.09834427633457037, + "grad_norm": 1.8257695436477661, + "learning_rate": 4.8816417737152264e-05, + "loss": 5.1969, + "step": 16536 + }, + { + "epoch": 0.09835022361785137, + "grad_norm": 1.3712751865386963, + "learning_rate": 4.881627571261357e-05, + "loss": 5.7666, + "step": 16537 + }, + { + "epoch": 0.09835617090113236, + "grad_norm": 1.8865090608596802, + "learning_rate": 4.881613367976086e-05, + "loss": 4.8832, + "step": 16538 + }, + { + "epoch": 0.09836211818441336, + "grad_norm": 1.7155808210372925, + "learning_rate": 4.8815991638594175e-05, + "loss": 4.7248, + "step": 16539 + }, + { + "epoch": 0.09836806546769436, + "grad_norm": 1.6654868125915527, + "learning_rate": 4.8815849589113585e-05, + "loss": 4.7095, + "step": 16540 + }, + { + "epoch": 0.09837401275097535, + "grad_norm": 1.6152902841567993, + "learning_rate": 4.881570753131912e-05, + "loss": 5.2894, + "step": 16541 + }, + { + "epoch": 0.09837996003425635, + "grad_norm": 2.1657047271728516, + "learning_rate": 4.8815565465210835e-05, + "loss": 5.9782, + "step": 16542 + }, + { + "epoch": 0.09838590731753735, + "grad_norm": 1.801346778869629, + "learning_rate": 4.88154233907888e-05, + "loss": 5.6683, + "step": 16543 + }, + { + "epoch": 0.09839185460081834, + "grad_norm": 1.7916477918624878, + "learning_rate": 4.881528130805303e-05, + "loss": 5.7056, + "step": 16544 + }, + { + "epoch": 0.09839780188409934, + "grad_norm": 2.1006147861480713, + "learning_rate": 4.881513921700359e-05, + "loss": 5.6315, + "step": 16545 + }, + { + "epoch": 0.09840374916738034, + "grad_norm": 2.3291585445404053, + "learning_rate": 4.8814997117640535e-05, + "loss": 4.8996, + "step": 16546 + }, + { + "epoch": 0.09840969645066133, + "grad_norm": 1.9543695449829102, + "learning_rate": 4.8814855009963916e-05, + "loss": 5.1839, + "step": 16547 + }, + { + "epoch": 0.09841564373394233, + "grad_norm": 2.7100865840911865, + "learning_rate": 4.881471289397378e-05, + "loss": 5.1445, + "step": 16548 + }, + { + "epoch": 0.09842159101722334, + "grad_norm": 2.5749876499176025, + "learning_rate": 4.8814570769670165e-05, + "loss": 5.2023, + "step": 16549 + }, + { + "epoch": 0.09842753830050432, + "grad_norm": 2.079770088195801, + "learning_rate": 4.881442863705313e-05, + "loss": 5.1197, + "step": 16550 + }, + { + "epoch": 0.09843348558378533, + "grad_norm": 1.9495431184768677, + "learning_rate": 4.881428649612272e-05, + "loss": 4.8669, + "step": 16551 + }, + { + "epoch": 0.09843943286706633, + "grad_norm": 2.0918610095977783, + "learning_rate": 4.8814144346879e-05, + "loss": 5.0413, + "step": 16552 + }, + { + "epoch": 0.09844538015034732, + "grad_norm": 2.326662302017212, + "learning_rate": 4.8814002189322e-05, + "loss": 5.0085, + "step": 16553 + }, + { + "epoch": 0.09845132743362832, + "grad_norm": 2.3819150924682617, + "learning_rate": 4.881386002345178e-05, + "loss": 4.8364, + "step": 16554 + }, + { + "epoch": 0.09845727471690932, + "grad_norm": 2.6585230827331543, + "learning_rate": 4.881371784926839e-05, + "loss": 5.1722, + "step": 16555 + }, + { + "epoch": 0.09846322200019031, + "grad_norm": 2.209075689315796, + "learning_rate": 4.881357566677187e-05, + "loss": 5.0474, + "step": 16556 + }, + { + "epoch": 0.09846916928347131, + "grad_norm": 1.9725440740585327, + "learning_rate": 4.881343347596229e-05, + "loss": 5.0361, + "step": 16557 + }, + { + "epoch": 0.09847511656675231, + "grad_norm": 2.0074071884155273, + "learning_rate": 4.881329127683968e-05, + "loss": 5.5143, + "step": 16558 + }, + { + "epoch": 0.0984810638500333, + "grad_norm": 1.8329545259475708, + "learning_rate": 4.8813149069404093e-05, + "loss": 5.8843, + "step": 16559 + }, + { + "epoch": 0.0984870111333143, + "grad_norm": 2.2991678714752197, + "learning_rate": 4.881300685365558e-05, + "loss": 4.6178, + "step": 16560 + }, + { + "epoch": 0.0984929584165953, + "grad_norm": 2.7643637657165527, + "learning_rate": 4.881286462959419e-05, + "loss": 4.1381, + "step": 16561 + }, + { + "epoch": 0.0984989056998763, + "grad_norm": 2.5811941623687744, + "learning_rate": 4.8812722397219985e-05, + "loss": 3.8026, + "step": 16562 + }, + { + "epoch": 0.0985048529831573, + "grad_norm": 2.1111907958984375, + "learning_rate": 4.8812580156533e-05, + "loss": 4.0149, + "step": 16563 + }, + { + "epoch": 0.0985108002664383, + "grad_norm": 2.229973793029785, + "learning_rate": 4.8812437907533294e-05, + "loss": 4.24, + "step": 16564 + }, + { + "epoch": 0.09851674754971929, + "grad_norm": 1.6310914754867554, + "learning_rate": 4.8812295650220905e-05, + "loss": 5.9476, + "step": 16565 + }, + { + "epoch": 0.09852269483300029, + "grad_norm": 1.7397875785827637, + "learning_rate": 4.881215338459589e-05, + "loss": 5.8527, + "step": 16566 + }, + { + "epoch": 0.09852864211628128, + "grad_norm": 1.8279019594192505, + "learning_rate": 4.88120111106583e-05, + "loss": 5.5869, + "step": 16567 + }, + { + "epoch": 0.09853458939956228, + "grad_norm": 1.6956331729888916, + "learning_rate": 4.881186882840818e-05, + "loss": 5.6508, + "step": 16568 + }, + { + "epoch": 0.09854053668284328, + "grad_norm": 1.619205355644226, + "learning_rate": 4.881172653784559e-05, + "loss": 5.6502, + "step": 16569 + }, + { + "epoch": 0.09854648396612427, + "grad_norm": 1.4612733125686646, + "learning_rate": 4.881158423897057e-05, + "loss": 5.5937, + "step": 16570 + }, + { + "epoch": 0.09855243124940527, + "grad_norm": 1.4997358322143555, + "learning_rate": 4.8811441931783165e-05, + "loss": 5.5865, + "step": 16571 + }, + { + "epoch": 0.09855837853268627, + "grad_norm": 1.6516716480255127, + "learning_rate": 4.8811299616283434e-05, + "loss": 5.4031, + "step": 16572 + }, + { + "epoch": 0.09856432581596726, + "grad_norm": 1.5714633464813232, + "learning_rate": 4.881115729247143e-05, + "loss": 5.4543, + "step": 16573 + }, + { + "epoch": 0.09857027309924826, + "grad_norm": 1.4891443252563477, + "learning_rate": 4.881101496034719e-05, + "loss": 5.5687, + "step": 16574 + }, + { + "epoch": 0.09857622038252926, + "grad_norm": 1.3504915237426758, + "learning_rate": 4.8810872619910773e-05, + "loss": 5.5777, + "step": 16575 + }, + { + "epoch": 0.09858216766581025, + "grad_norm": 1.5825836658477783, + "learning_rate": 4.881073027116223e-05, + "loss": 5.547, + "step": 16576 + }, + { + "epoch": 0.09858811494909125, + "grad_norm": 1.4398233890533447, + "learning_rate": 4.8810587914101607e-05, + "loss": 5.4707, + "step": 16577 + }, + { + "epoch": 0.09859406223237226, + "grad_norm": 1.6776020526885986, + "learning_rate": 4.881044554872895e-05, + "loss": 5.4879, + "step": 16578 + }, + { + "epoch": 0.09860000951565324, + "grad_norm": 1.417771339416504, + "learning_rate": 4.8810303175044316e-05, + "loss": 5.5362, + "step": 16579 + }, + { + "epoch": 0.09860595679893425, + "grad_norm": 1.4919921159744263, + "learning_rate": 4.881016079304775e-05, + "loss": 5.5289, + "step": 16580 + }, + { + "epoch": 0.09861190408221525, + "grad_norm": 1.6195905208587646, + "learning_rate": 4.88100184027393e-05, + "loss": 5.467, + "step": 16581 + }, + { + "epoch": 0.09861785136549624, + "grad_norm": 1.5255846977233887, + "learning_rate": 4.880987600411902e-05, + "loss": 6.268, + "step": 16582 + }, + { + "epoch": 0.09862379864877724, + "grad_norm": 1.5051823854446411, + "learning_rate": 4.880973359718696e-05, + "loss": 6.024, + "step": 16583 + }, + { + "epoch": 0.09862974593205824, + "grad_norm": 2.455932378768921, + "learning_rate": 4.880959118194317e-05, + "loss": 5.0881, + "step": 16584 + }, + { + "epoch": 0.09863569321533923, + "grad_norm": 2.3916566371917725, + "learning_rate": 4.880944875838769e-05, + "loss": 5.0897, + "step": 16585 + }, + { + "epoch": 0.09864164049862023, + "grad_norm": 2.0487334728240967, + "learning_rate": 4.880930632652058e-05, + "loss": 5.603, + "step": 16586 + }, + { + "epoch": 0.09864758778190123, + "grad_norm": 1.9195282459259033, + "learning_rate": 4.880916388634189e-05, + "loss": 5.6492, + "step": 16587 + }, + { + "epoch": 0.09865353506518222, + "grad_norm": 1.743602991104126, + "learning_rate": 4.880902143785166e-05, + "loss": 5.7378, + "step": 16588 + }, + { + "epoch": 0.09865948234846322, + "grad_norm": 1.913156509399414, + "learning_rate": 4.880887898104996e-05, + "loss": 5.6267, + "step": 16589 + }, + { + "epoch": 0.09866542963174423, + "grad_norm": 1.8759669065475464, + "learning_rate": 4.880873651593681e-05, + "loss": 5.5593, + "step": 16590 + }, + { + "epoch": 0.09867137691502521, + "grad_norm": 1.8475536108016968, + "learning_rate": 4.880859404251229e-05, + "loss": 5.5021, + "step": 16591 + }, + { + "epoch": 0.09867732419830622, + "grad_norm": 1.5235642194747925, + "learning_rate": 4.880845156077643e-05, + "loss": 5.4692, + "step": 16592 + }, + { + "epoch": 0.09868327148158722, + "grad_norm": 1.8132069110870361, + "learning_rate": 4.8808309070729294e-05, + "loss": 5.6067, + "step": 16593 + }, + { + "epoch": 0.0986892187648682, + "grad_norm": 1.8001697063446045, + "learning_rate": 4.880816657237091e-05, + "loss": 5.749, + "step": 16594 + }, + { + "epoch": 0.09869516604814921, + "grad_norm": 1.8349007368087769, + "learning_rate": 4.8808024065701354e-05, + "loss": 5.6596, + "step": 16595 + }, + { + "epoch": 0.0987011133314302, + "grad_norm": 1.5677918195724487, + "learning_rate": 4.880788155072065e-05, + "loss": 5.725, + "step": 16596 + }, + { + "epoch": 0.0987070606147112, + "grad_norm": 1.8379719257354736, + "learning_rate": 4.880773902742887e-05, + "loss": 5.4325, + "step": 16597 + }, + { + "epoch": 0.0987130078979922, + "grad_norm": 1.8847566843032837, + "learning_rate": 4.880759649582605e-05, + "loss": 5.5737, + "step": 16598 + }, + { + "epoch": 0.09871895518127319, + "grad_norm": 2.398552417755127, + "learning_rate": 4.8807453955912244e-05, + "loss": 5.4192, + "step": 16599 + }, + { + "epoch": 0.09872490246455419, + "grad_norm": 1.990404486656189, + "learning_rate": 4.8807311407687494e-05, + "loss": 5.4624, + "step": 16600 + }, + { + "epoch": 0.09873084974783519, + "grad_norm": 1.533575177192688, + "learning_rate": 4.880716885115187e-05, + "loss": 5.8242, + "step": 16601 + }, + { + "epoch": 0.09873679703111618, + "grad_norm": 1.7357563972473145, + "learning_rate": 4.88070262863054e-05, + "loss": 5.9343, + "step": 16602 + }, + { + "epoch": 0.09874274431439718, + "grad_norm": 1.8504372835159302, + "learning_rate": 4.880688371314816e-05, + "loss": 5.6685, + "step": 16603 + }, + { + "epoch": 0.09874869159767818, + "grad_norm": 2.5040910243988037, + "learning_rate": 4.880674113168016e-05, + "loss": 5.1591, + "step": 16604 + }, + { + "epoch": 0.09875463888095917, + "grad_norm": 2.7820568084716797, + "learning_rate": 4.880659854190148e-05, + "loss": 5.0528, + "step": 16605 + }, + { + "epoch": 0.09876058616424017, + "grad_norm": 2.004427909851074, + "learning_rate": 4.8806455943812165e-05, + "loss": 5.6251, + "step": 16606 + }, + { + "epoch": 0.09876653344752118, + "grad_norm": 1.8053330183029175, + "learning_rate": 4.880631333741227e-05, + "loss": 5.5293, + "step": 16607 + }, + { + "epoch": 0.09877248073080216, + "grad_norm": 1.6708273887634277, + "learning_rate": 4.8806170722701824e-05, + "loss": 6.1215, + "step": 16608 + }, + { + "epoch": 0.09877842801408317, + "grad_norm": 1.6344959735870361, + "learning_rate": 4.88060280996809e-05, + "loss": 6.191, + "step": 16609 + }, + { + "epoch": 0.09878437529736417, + "grad_norm": 1.68915593624115, + "learning_rate": 4.880588546834953e-05, + "loss": 5.9302, + "step": 16610 + }, + { + "epoch": 0.09879032258064516, + "grad_norm": 2.108917236328125, + "learning_rate": 4.8805742828707777e-05, + "loss": 5.5227, + "step": 16611 + }, + { + "epoch": 0.09879626986392616, + "grad_norm": 1.7772480249404907, + "learning_rate": 4.8805600180755685e-05, + "loss": 5.5694, + "step": 16612 + }, + { + "epoch": 0.09880221714720716, + "grad_norm": 1.629629135131836, + "learning_rate": 4.8805457524493305e-05, + "loss": 5.7881, + "step": 16613 + }, + { + "epoch": 0.09880816443048815, + "grad_norm": 1.8985555171966553, + "learning_rate": 4.880531485992068e-05, + "loss": 5.5357, + "step": 16614 + }, + { + "epoch": 0.09881411171376915, + "grad_norm": 2.5329599380493164, + "learning_rate": 4.880517218703786e-05, + "loss": 4.8959, + "step": 16615 + }, + { + "epoch": 0.09882005899705015, + "grad_norm": 2.408377170562744, + "learning_rate": 4.8805029505844915e-05, + "loss": 4.9581, + "step": 16616 + }, + { + "epoch": 0.09882600628033114, + "grad_norm": 2.125190258026123, + "learning_rate": 4.880488681634187e-05, + "loss": 4.4116, + "step": 16617 + }, + { + "epoch": 0.09883195356361214, + "grad_norm": 2.153186082839966, + "learning_rate": 4.880474411852879e-05, + "loss": 4.2887, + "step": 16618 + }, + { + "epoch": 0.09883790084689315, + "grad_norm": 2.3961498737335205, + "learning_rate": 4.880460141240571e-05, + "loss": 4.6521, + "step": 16619 + }, + { + "epoch": 0.09884384813017413, + "grad_norm": 2.4282264709472656, + "learning_rate": 4.880445869797271e-05, + "loss": 4.6307, + "step": 16620 + }, + { + "epoch": 0.09884979541345514, + "grad_norm": 2.461005687713623, + "learning_rate": 4.88043159752298e-05, + "loss": 4.4234, + "step": 16621 + }, + { + "epoch": 0.09885574269673614, + "grad_norm": 2.5483081340789795, + "learning_rate": 4.8804173244177056e-05, + "loss": 4.2688, + "step": 16622 + }, + { + "epoch": 0.09886168998001713, + "grad_norm": 2.370413303375244, + "learning_rate": 4.8804030504814524e-05, + "loss": 4.4887, + "step": 16623 + }, + { + "epoch": 0.09886763726329813, + "grad_norm": 2.681118965148926, + "learning_rate": 4.880388775714225e-05, + "loss": 4.2941, + "step": 16624 + }, + { + "epoch": 0.09887358454657912, + "grad_norm": 2.1210896968841553, + "learning_rate": 4.8803745001160284e-05, + "loss": 5.1994, + "step": 16625 + }, + { + "epoch": 0.09887953182986012, + "grad_norm": 1.703626275062561, + "learning_rate": 4.880360223686867e-05, + "loss": 5.5578, + "step": 16626 + }, + { + "epoch": 0.09888547911314112, + "grad_norm": 1.5515342950820923, + "learning_rate": 4.8803459464267475e-05, + "loss": 5.6636, + "step": 16627 + }, + { + "epoch": 0.09889142639642211, + "grad_norm": 1.2145434617996216, + "learning_rate": 4.880331668335673e-05, + "loss": 5.3634, + "step": 16628 + }, + { + "epoch": 0.09889737367970311, + "grad_norm": 1.2893304824829102, + "learning_rate": 4.88031738941365e-05, + "loss": 5.5383, + "step": 16629 + }, + { + "epoch": 0.09890332096298411, + "grad_norm": 3.1206297874450684, + "learning_rate": 4.880303109660682e-05, + "loss": 4.9313, + "step": 16630 + }, + { + "epoch": 0.0989092682462651, + "grad_norm": 3.382498264312744, + "learning_rate": 4.8802888290767756e-05, + "loss": 4.4475, + "step": 16631 + }, + { + "epoch": 0.0989152155295461, + "grad_norm": 1.8280858993530273, + "learning_rate": 4.880274547661934e-05, + "loss": 5.6722, + "step": 16632 + }, + { + "epoch": 0.0989211628128271, + "grad_norm": 2.0412793159484863, + "learning_rate": 4.880260265416164e-05, + "loss": 5.3952, + "step": 16633 + }, + { + "epoch": 0.09892711009610809, + "grad_norm": 2.0702524185180664, + "learning_rate": 4.880245982339469e-05, + "loss": 5.2754, + "step": 16634 + }, + { + "epoch": 0.0989330573793891, + "grad_norm": 1.7081348896026611, + "learning_rate": 4.880231698431855e-05, + "loss": 5.8414, + "step": 16635 + }, + { + "epoch": 0.0989390046626701, + "grad_norm": 1.7762012481689453, + "learning_rate": 4.880217413693328e-05, + "loss": 6.0106, + "step": 16636 + }, + { + "epoch": 0.09894495194595108, + "grad_norm": 1.815253496170044, + "learning_rate": 4.8802031281238895e-05, + "loss": 5.9715, + "step": 16637 + }, + { + "epoch": 0.09895089922923209, + "grad_norm": 1.8652589321136475, + "learning_rate": 4.880188841723548e-05, + "loss": 5.9437, + "step": 16638 + }, + { + "epoch": 0.09895684651251309, + "grad_norm": 1.687664270401001, + "learning_rate": 4.8801745544923075e-05, + "loss": 6.0776, + "step": 16639 + }, + { + "epoch": 0.09896279379579408, + "grad_norm": 1.579231858253479, + "learning_rate": 4.880160266430171e-05, + "loss": 6.0486, + "step": 16640 + }, + { + "epoch": 0.09896874107907508, + "grad_norm": 1.711932897567749, + "learning_rate": 4.8801459775371464e-05, + "loss": 5.7954, + "step": 16641 + }, + { + "epoch": 0.09897468836235608, + "grad_norm": 2.022918939590454, + "learning_rate": 4.880131687813237e-05, + "loss": 5.4453, + "step": 16642 + }, + { + "epoch": 0.09898063564563707, + "grad_norm": 2.4682674407958984, + "learning_rate": 4.880117397258449e-05, + "loss": 5.2084, + "step": 16643 + }, + { + "epoch": 0.09898658292891807, + "grad_norm": 2.7558486461639404, + "learning_rate": 4.880103105872786e-05, + "loss": 4.8931, + "step": 16644 + }, + { + "epoch": 0.09899253021219907, + "grad_norm": 1.8757295608520508, + "learning_rate": 4.880088813656253e-05, + "loss": 5.4484, + "step": 16645 + }, + { + "epoch": 0.09899847749548006, + "grad_norm": 2.0811331272125244, + "learning_rate": 4.880074520608857e-05, + "loss": 5.8003, + "step": 16646 + }, + { + "epoch": 0.09900442477876106, + "grad_norm": 1.9147615432739258, + "learning_rate": 4.880060226730601e-05, + "loss": 5.869, + "step": 16647 + }, + { + "epoch": 0.09901037206204207, + "grad_norm": 1.974865436553955, + "learning_rate": 4.88004593202149e-05, + "loss": 5.5896, + "step": 16648 + }, + { + "epoch": 0.09901631934532305, + "grad_norm": 1.8365596532821655, + "learning_rate": 4.88003163648153e-05, + "loss": 5.5321, + "step": 16649 + }, + { + "epoch": 0.09902226662860406, + "grad_norm": 1.5927996635437012, + "learning_rate": 4.8800173401107255e-05, + "loss": 5.49, + "step": 16650 + }, + { + "epoch": 0.09902821391188506, + "grad_norm": 1.7566391229629517, + "learning_rate": 4.880003042909081e-05, + "loss": 5.49, + "step": 16651 + }, + { + "epoch": 0.09903416119516605, + "grad_norm": 1.718018651008606, + "learning_rate": 4.879988744876602e-05, + "loss": 5.4515, + "step": 16652 + }, + { + "epoch": 0.09904010847844705, + "grad_norm": 1.8946046829223633, + "learning_rate": 4.879974446013295e-05, + "loss": 4.9902, + "step": 16653 + }, + { + "epoch": 0.09904605576172804, + "grad_norm": 1.939060926437378, + "learning_rate": 4.879960146319162e-05, + "loss": 5.2067, + "step": 16654 + }, + { + "epoch": 0.09905200304500904, + "grad_norm": 1.6621825695037842, + "learning_rate": 4.8799458457942106e-05, + "loss": 5.0041, + "step": 16655 + }, + { + "epoch": 0.09905795032829004, + "grad_norm": 1.8790650367736816, + "learning_rate": 4.879931544438444e-05, + "loss": 4.6893, + "step": 16656 + }, + { + "epoch": 0.09906389761157103, + "grad_norm": 2.20035982131958, + "learning_rate": 4.879917242251868e-05, + "loss": 4.4463, + "step": 16657 + }, + { + "epoch": 0.09906984489485203, + "grad_norm": 1.4379361867904663, + "learning_rate": 4.879902939234487e-05, + "loss": 4.993, + "step": 16658 + }, + { + "epoch": 0.09907579217813303, + "grad_norm": 2.2738726139068604, + "learning_rate": 4.879888635386307e-05, + "loss": 5.108, + "step": 16659 + }, + { + "epoch": 0.09908173946141402, + "grad_norm": 2.0921952724456787, + "learning_rate": 4.8798743307073325e-05, + "loss": 5.3023, + "step": 16660 + }, + { + "epoch": 0.09908768674469502, + "grad_norm": 1.894437313079834, + "learning_rate": 4.8798600251975684e-05, + "loss": 5.2797, + "step": 16661 + }, + { + "epoch": 0.09909363402797602, + "grad_norm": 1.6831610202789307, + "learning_rate": 4.87984571885702e-05, + "loss": 5.3342, + "step": 16662 + }, + { + "epoch": 0.09909958131125701, + "grad_norm": 1.9177473783493042, + "learning_rate": 4.879831411685691e-05, + "loss": 5.2245, + "step": 16663 + }, + { + "epoch": 0.09910552859453801, + "grad_norm": 1.8289183378219604, + "learning_rate": 4.879817103683589e-05, + "loss": 5.2411, + "step": 16664 + }, + { + "epoch": 0.09911147587781902, + "grad_norm": 1.7047971487045288, + "learning_rate": 4.8798027948507166e-05, + "loss": 5.1896, + "step": 16665 + }, + { + "epoch": 0.0991174231611, + "grad_norm": 1.5395535230636597, + "learning_rate": 4.87978848518708e-05, + "loss": 5.0688, + "step": 16666 + }, + { + "epoch": 0.099123370444381, + "grad_norm": 1.652870535850525, + "learning_rate": 4.879774174692683e-05, + "loss": 5.1786, + "step": 16667 + }, + { + "epoch": 0.09912931772766201, + "grad_norm": 1.7581889629364014, + "learning_rate": 4.8797598633675326e-05, + "loss": 5.0549, + "step": 16668 + }, + { + "epoch": 0.099135265010943, + "grad_norm": 1.6056864261627197, + "learning_rate": 4.8797455512116315e-05, + "loss": 5.0516, + "step": 16669 + }, + { + "epoch": 0.099141212294224, + "grad_norm": 1.8067295551300049, + "learning_rate": 4.879731238224986e-05, + "loss": 5.0642, + "step": 16670 + }, + { + "epoch": 0.099147159577505, + "grad_norm": 1.7332173585891724, + "learning_rate": 4.8797169244076016e-05, + "loss": 5.0361, + "step": 16671 + }, + { + "epoch": 0.09915310686078599, + "grad_norm": 1.64972984790802, + "learning_rate": 4.879702609759482e-05, + "loss": 5.0521, + "step": 16672 + }, + { + "epoch": 0.09915905414406699, + "grad_norm": 1.8066579103469849, + "learning_rate": 4.879688294280633e-05, + "loss": 5.1431, + "step": 16673 + }, + { + "epoch": 0.09916500142734799, + "grad_norm": 2.093921661376953, + "learning_rate": 4.879673977971059e-05, + "loss": 5.4831, + "step": 16674 + }, + { + "epoch": 0.09917094871062898, + "grad_norm": 2.1563215255737305, + "learning_rate": 4.879659660830766e-05, + "loss": 5.4992, + "step": 16675 + }, + { + "epoch": 0.09917689599390998, + "grad_norm": 1.9041906595230103, + "learning_rate": 4.8796453428597585e-05, + "loss": 6.0952, + "step": 16676 + }, + { + "epoch": 0.09918284327719099, + "grad_norm": 1.7259836196899414, + "learning_rate": 4.879631024058041e-05, + "loss": 5.9602, + "step": 16677 + }, + { + "epoch": 0.09918879056047197, + "grad_norm": 2.075324058532715, + "learning_rate": 4.879616704425619e-05, + "loss": 5.1186, + "step": 16678 + }, + { + "epoch": 0.09919473784375298, + "grad_norm": 2.243378162384033, + "learning_rate": 4.8796023839624975e-05, + "loss": 4.8764, + "step": 16679 + }, + { + "epoch": 0.09920068512703398, + "grad_norm": 1.8717987537384033, + "learning_rate": 4.879588062668681e-05, + "loss": 5.6084, + "step": 16680 + }, + { + "epoch": 0.09920663241031497, + "grad_norm": 1.8316127061843872, + "learning_rate": 4.879573740544175e-05, + "loss": 5.5613, + "step": 16681 + }, + { + "epoch": 0.09921257969359597, + "grad_norm": 1.7016340494155884, + "learning_rate": 4.879559417588985e-05, + "loss": 5.5577, + "step": 16682 + }, + { + "epoch": 0.09921852697687697, + "grad_norm": 2.2173359394073486, + "learning_rate": 4.879545093803115e-05, + "loss": 4.9591, + "step": 16683 + }, + { + "epoch": 0.09922447426015796, + "grad_norm": 1.9507017135620117, + "learning_rate": 4.87953076918657e-05, + "loss": 5.6648, + "step": 16684 + }, + { + "epoch": 0.09923042154343896, + "grad_norm": 1.6124898195266724, + "learning_rate": 4.879516443739356e-05, + "loss": 6.0163, + "step": 16685 + }, + { + "epoch": 0.09923636882671995, + "grad_norm": 1.5823163986206055, + "learning_rate": 4.879502117461477e-05, + "loss": 5.868, + "step": 16686 + }, + { + "epoch": 0.09924231611000095, + "grad_norm": 1.608522653579712, + "learning_rate": 4.879487790352938e-05, + "loss": 5.7482, + "step": 16687 + }, + { + "epoch": 0.09924826339328195, + "grad_norm": 1.783008337020874, + "learning_rate": 4.879473462413745e-05, + "loss": 5.2352, + "step": 16688 + }, + { + "epoch": 0.09925421067656294, + "grad_norm": 1.8089349269866943, + "learning_rate": 4.8794591336439024e-05, + "loss": 5.1793, + "step": 16689 + }, + { + "epoch": 0.09926015795984394, + "grad_norm": 1.5393356084823608, + "learning_rate": 4.879444804043415e-05, + "loss": 5.4802, + "step": 16690 + }, + { + "epoch": 0.09926610524312494, + "grad_norm": 1.7046642303466797, + "learning_rate": 4.8794304736122886e-05, + "loss": 5.8368, + "step": 16691 + }, + { + "epoch": 0.09927205252640593, + "grad_norm": 1.7474054098129272, + "learning_rate": 4.879416142350527e-05, + "loss": 5.7578, + "step": 16692 + }, + { + "epoch": 0.09927799980968693, + "grad_norm": 1.9804757833480835, + "learning_rate": 4.879401810258136e-05, + "loss": 5.691, + "step": 16693 + }, + { + "epoch": 0.09928394709296794, + "grad_norm": 1.7752422094345093, + "learning_rate": 4.87938747733512e-05, + "loss": 5.2478, + "step": 16694 + }, + { + "epoch": 0.09928989437624892, + "grad_norm": 1.8842644691467285, + "learning_rate": 4.879373143581485e-05, + "loss": 5.2061, + "step": 16695 + }, + { + "epoch": 0.09929584165952993, + "grad_norm": 1.6537442207336426, + "learning_rate": 4.8793588089972355e-05, + "loss": 5.215, + "step": 16696 + }, + { + "epoch": 0.09930178894281093, + "grad_norm": 1.5108014345169067, + "learning_rate": 4.8793444735823755e-05, + "loss": 5.2327, + "step": 16697 + }, + { + "epoch": 0.09930773622609192, + "grad_norm": 1.4653078317642212, + "learning_rate": 4.8793301373369116e-05, + "loss": 5.219, + "step": 16698 + }, + { + "epoch": 0.09931368350937292, + "grad_norm": 1.3908593654632568, + "learning_rate": 4.879315800260848e-05, + "loss": 5.1597, + "step": 16699 + }, + { + "epoch": 0.09931963079265392, + "grad_norm": 1.3809629678726196, + "learning_rate": 4.87930146235419e-05, + "loss": 5.2364, + "step": 16700 + }, + { + "epoch": 0.09932557807593491, + "grad_norm": 1.741685152053833, + "learning_rate": 4.879287123616943e-05, + "loss": 5.7777, + "step": 16701 + }, + { + "epoch": 0.09933152535921591, + "grad_norm": 1.7733122110366821, + "learning_rate": 4.879272784049111e-05, + "loss": 5.4035, + "step": 16702 + }, + { + "epoch": 0.09933747264249691, + "grad_norm": 1.4871195554733276, + "learning_rate": 4.8792584436506985e-05, + "loss": 4.961, + "step": 16703 + }, + { + "epoch": 0.0993434199257779, + "grad_norm": 1.6865509748458862, + "learning_rate": 4.8792441024217115e-05, + "loss": 4.9876, + "step": 16704 + }, + { + "epoch": 0.0993493672090589, + "grad_norm": 1.6606428623199463, + "learning_rate": 4.879229760362156e-05, + "loss": 5.1431, + "step": 16705 + }, + { + "epoch": 0.0993553144923399, + "grad_norm": 1.6394522190093994, + "learning_rate": 4.879215417472036e-05, + "loss": 5.223, + "step": 16706 + }, + { + "epoch": 0.0993612617756209, + "grad_norm": 1.6220464706420898, + "learning_rate": 4.879201073751356e-05, + "loss": 5.322, + "step": 16707 + }, + { + "epoch": 0.0993672090589019, + "grad_norm": 1.4539369344711304, + "learning_rate": 4.879186729200121e-05, + "loss": 5.1935, + "step": 16708 + }, + { + "epoch": 0.0993731563421829, + "grad_norm": 1.7421495914459229, + "learning_rate": 4.8791723838183376e-05, + "loss": 5.0639, + "step": 16709 + }, + { + "epoch": 0.09937910362546389, + "grad_norm": 1.5782475471496582, + "learning_rate": 4.8791580376060085e-05, + "loss": 5.8221, + "step": 16710 + }, + { + "epoch": 0.09938505090874489, + "grad_norm": 1.6991766691207886, + "learning_rate": 4.879143690563141e-05, + "loss": 5.9037, + "step": 16711 + }, + { + "epoch": 0.09939099819202589, + "grad_norm": 1.7815147638320923, + "learning_rate": 4.879129342689739e-05, + "loss": 5.668, + "step": 16712 + }, + { + "epoch": 0.09939694547530688, + "grad_norm": 1.6047189235687256, + "learning_rate": 4.879114993985806e-05, + "loss": 5.3005, + "step": 16713 + }, + { + "epoch": 0.09940289275858788, + "grad_norm": 1.8050780296325684, + "learning_rate": 4.87910064445135e-05, + "loss": 5.4931, + "step": 16714 + }, + { + "epoch": 0.09940884004186887, + "grad_norm": 2.010920286178589, + "learning_rate": 4.8790862940863744e-05, + "loss": 5.6301, + "step": 16715 + }, + { + "epoch": 0.09941478732514987, + "grad_norm": 1.443099856376648, + "learning_rate": 4.879071942890884e-05, + "loss": 5.9498, + "step": 16716 + }, + { + "epoch": 0.09942073460843087, + "grad_norm": 1.777207612991333, + "learning_rate": 4.879057590864885e-05, + "loss": 5.2754, + "step": 16717 + }, + { + "epoch": 0.09942668189171186, + "grad_norm": 2.314602851867676, + "learning_rate": 4.87904323800838e-05, + "loss": 5.1447, + "step": 16718 + }, + { + "epoch": 0.09943262917499286, + "grad_norm": 1.4886807203292847, + "learning_rate": 4.879028884321377e-05, + "loss": 5.5389, + "step": 16719 + }, + { + "epoch": 0.09943857645827386, + "grad_norm": 1.4403626918792725, + "learning_rate": 4.879014529803879e-05, + "loss": 5.5377, + "step": 16720 + }, + { + "epoch": 0.09944452374155485, + "grad_norm": 1.570827841758728, + "learning_rate": 4.8790001744558916e-05, + "loss": 5.2541, + "step": 16721 + }, + { + "epoch": 0.09945047102483585, + "grad_norm": 1.6352084875106812, + "learning_rate": 4.87898581827742e-05, + "loss": 4.9031, + "step": 16722 + }, + { + "epoch": 0.09945641830811686, + "grad_norm": 1.864465594291687, + "learning_rate": 4.878971461268469e-05, + "loss": 4.8689, + "step": 16723 + }, + { + "epoch": 0.09946236559139784, + "grad_norm": 1.5618411302566528, + "learning_rate": 4.878957103429044e-05, + "loss": 5.4576, + "step": 16724 + }, + { + "epoch": 0.09946831287467885, + "grad_norm": 1.6910091638565063, + "learning_rate": 4.8789427447591486e-05, + "loss": 5.557, + "step": 16725 + }, + { + "epoch": 0.09947426015795985, + "grad_norm": 1.708056926727295, + "learning_rate": 4.8789283852587895e-05, + "loss": 5.5343, + "step": 16726 + }, + { + "epoch": 0.09948020744124084, + "grad_norm": 1.5828802585601807, + "learning_rate": 4.878914024927971e-05, + "loss": 5.3913, + "step": 16727 + }, + { + "epoch": 0.09948615472452184, + "grad_norm": 1.6802269220352173, + "learning_rate": 4.878899663766698e-05, + "loss": 5.4407, + "step": 16728 + }, + { + "epoch": 0.09949210200780284, + "grad_norm": 2.0542306900024414, + "learning_rate": 4.8788853017749766e-05, + "loss": 4.9265, + "step": 16729 + }, + { + "epoch": 0.09949804929108383, + "grad_norm": 2.035903215408325, + "learning_rate": 4.87887093895281e-05, + "loss": 5.1802, + "step": 16730 + }, + { + "epoch": 0.09950399657436483, + "grad_norm": 1.7885538339614868, + "learning_rate": 4.8788565753002044e-05, + "loss": 5.5238, + "step": 16731 + }, + { + "epoch": 0.09950994385764583, + "grad_norm": 1.606881022453308, + "learning_rate": 4.878842210817165e-05, + "loss": 5.805, + "step": 16732 + }, + { + "epoch": 0.09951589114092682, + "grad_norm": 1.6354256868362427, + "learning_rate": 4.8788278455036956e-05, + "loss": 5.7968, + "step": 16733 + }, + { + "epoch": 0.09952183842420782, + "grad_norm": 1.7537651062011719, + "learning_rate": 4.8788134793598024e-05, + "loss": 5.5945, + "step": 16734 + }, + { + "epoch": 0.09952778570748883, + "grad_norm": 2.149411678314209, + "learning_rate": 4.8787991123854895e-05, + "loss": 4.7458, + "step": 16735 + }, + { + "epoch": 0.09953373299076981, + "grad_norm": 1.9956060647964478, + "learning_rate": 4.878784744580763e-05, + "loss": 4.9471, + "step": 16736 + }, + { + "epoch": 0.09953968027405082, + "grad_norm": 2.0445396900177, + "learning_rate": 4.878770375945627e-05, + "loss": 4.9063, + "step": 16737 + }, + { + "epoch": 0.09954562755733182, + "grad_norm": 1.8563852310180664, + "learning_rate": 4.878756006480088e-05, + "loss": 5.8788, + "step": 16738 + }, + { + "epoch": 0.0995515748406128, + "grad_norm": 1.8931719064712524, + "learning_rate": 4.8787416361841474e-05, + "loss": 6.0917, + "step": 16739 + }, + { + "epoch": 0.09955752212389381, + "grad_norm": 2.062368869781494, + "learning_rate": 4.878727265057814e-05, + "loss": 5.0113, + "step": 16740 + }, + { + "epoch": 0.09956346940717481, + "grad_norm": 1.7274762392044067, + "learning_rate": 4.878712893101092e-05, + "loss": 5.7383, + "step": 16741 + }, + { + "epoch": 0.0995694166904558, + "grad_norm": 1.7377746105194092, + "learning_rate": 4.878698520313986e-05, + "loss": 5.5545, + "step": 16742 + }, + { + "epoch": 0.0995753639737368, + "grad_norm": 1.8383115530014038, + "learning_rate": 4.8786841466965e-05, + "loss": 5.2297, + "step": 16743 + }, + { + "epoch": 0.09958131125701779, + "grad_norm": 1.7715762853622437, + "learning_rate": 4.8786697722486405e-05, + "loss": 5.4735, + "step": 16744 + }, + { + "epoch": 0.09958725854029879, + "grad_norm": 1.8447803258895874, + "learning_rate": 4.878655396970412e-05, + "loss": 5.25, + "step": 16745 + }, + { + "epoch": 0.09959320582357979, + "grad_norm": 2.215622663497925, + "learning_rate": 4.878641020861819e-05, + "loss": 4.8387, + "step": 16746 + }, + { + "epoch": 0.09959915310686078, + "grad_norm": 1.71353018283844, + "learning_rate": 4.878626643922867e-05, + "loss": 5.6831, + "step": 16747 + }, + { + "epoch": 0.09960510039014178, + "grad_norm": 1.8424171209335327, + "learning_rate": 4.8786122661535616e-05, + "loss": 5.5785, + "step": 16748 + }, + { + "epoch": 0.09961104767342278, + "grad_norm": 1.8796172142028809, + "learning_rate": 4.8785978875539065e-05, + "loss": 5.5921, + "step": 16749 + }, + { + "epoch": 0.09961699495670377, + "grad_norm": 1.820435881614685, + "learning_rate": 4.878583508123908e-05, + "loss": 5.7645, + "step": 16750 + }, + { + "epoch": 0.09962294223998477, + "grad_norm": 1.9210152626037598, + "learning_rate": 4.87856912786357e-05, + "loss": 5.0471, + "step": 16751 + }, + { + "epoch": 0.09962888952326578, + "grad_norm": 1.4372605085372925, + "learning_rate": 4.878554746772899e-05, + "loss": 5.3131, + "step": 16752 + }, + { + "epoch": 0.09963483680654676, + "grad_norm": 1.8078817129135132, + "learning_rate": 4.878540364851898e-05, + "loss": 5.266, + "step": 16753 + }, + { + "epoch": 0.09964078408982777, + "grad_norm": 2.068875551223755, + "learning_rate": 4.878525982100575e-05, + "loss": 4.714, + "step": 16754 + }, + { + "epoch": 0.09964673137310877, + "grad_norm": 2.0813167095184326, + "learning_rate": 4.878511598518931e-05, + "loss": 4.5889, + "step": 16755 + }, + { + "epoch": 0.09965267865638976, + "grad_norm": 2.3035426139831543, + "learning_rate": 4.878497214106974e-05, + "loss": 4.8549, + "step": 16756 + }, + { + "epoch": 0.09965862593967076, + "grad_norm": 1.7791129350662231, + "learning_rate": 4.878482828864709e-05, + "loss": 5.2515, + "step": 16757 + }, + { + "epoch": 0.09966457322295176, + "grad_norm": 1.7512277364730835, + "learning_rate": 4.878468442792139e-05, + "loss": 5.8079, + "step": 16758 + }, + { + "epoch": 0.09967052050623275, + "grad_norm": 1.789523720741272, + "learning_rate": 4.878454055889271e-05, + "loss": 5.4302, + "step": 16759 + }, + { + "epoch": 0.09967646778951375, + "grad_norm": 1.72003173828125, + "learning_rate": 4.8784396681561086e-05, + "loss": 5.6425, + "step": 16760 + }, + { + "epoch": 0.09968241507279475, + "grad_norm": 2.0497727394104004, + "learning_rate": 4.878425279592658e-05, + "loss": 5.6608, + "step": 16761 + }, + { + "epoch": 0.09968836235607574, + "grad_norm": 1.7305432558059692, + "learning_rate": 4.878410890198923e-05, + "loss": 5.5431, + "step": 16762 + }, + { + "epoch": 0.09969430963935674, + "grad_norm": 1.708824634552002, + "learning_rate": 4.878396499974911e-05, + "loss": 5.1754, + "step": 16763 + }, + { + "epoch": 0.09970025692263774, + "grad_norm": 1.9238412380218506, + "learning_rate": 4.878382108920624e-05, + "loss": 5.0595, + "step": 16764 + }, + { + "epoch": 0.09970620420591873, + "grad_norm": 1.7634879350662231, + "learning_rate": 4.878367717036069e-05, + "loss": 5.5733, + "step": 16765 + }, + { + "epoch": 0.09971215148919974, + "grad_norm": 1.7330491542816162, + "learning_rate": 4.8783533243212495e-05, + "loss": 5.4314, + "step": 16766 + }, + { + "epoch": 0.09971809877248074, + "grad_norm": 1.4424408674240112, + "learning_rate": 4.878338930776172e-05, + "loss": 5.3059, + "step": 16767 + }, + { + "epoch": 0.09972404605576173, + "grad_norm": 1.4692374467849731, + "learning_rate": 4.878324536400841e-05, + "loss": 5.2838, + "step": 16768 + }, + { + "epoch": 0.09972999333904273, + "grad_norm": 1.3602346181869507, + "learning_rate": 4.878310141195262e-05, + "loss": 5.5587, + "step": 16769 + }, + { + "epoch": 0.09973594062232373, + "grad_norm": 1.3222168684005737, + "learning_rate": 4.878295745159438e-05, + "loss": 5.61, + "step": 16770 + }, + { + "epoch": 0.09974188790560472, + "grad_norm": 1.398383378982544, + "learning_rate": 4.878281348293377e-05, + "loss": 5.5348, + "step": 16771 + }, + { + "epoch": 0.09974783518888572, + "grad_norm": 1.4184808731079102, + "learning_rate": 4.878266950597081e-05, + "loss": 5.4425, + "step": 16772 + }, + { + "epoch": 0.09975378247216671, + "grad_norm": 1.2451627254486084, + "learning_rate": 4.878252552070558e-05, + "loss": 5.5105, + "step": 16773 + }, + { + "epoch": 0.09975972975544771, + "grad_norm": 1.4243760108947754, + "learning_rate": 4.878238152713811e-05, + "loss": 5.5839, + "step": 16774 + }, + { + "epoch": 0.09976567703872871, + "grad_norm": 1.1774061918258667, + "learning_rate": 4.878223752526846e-05, + "loss": 5.4785, + "step": 16775 + }, + { + "epoch": 0.0997716243220097, + "grad_norm": 1.2542285919189453, + "learning_rate": 4.8782093515096676e-05, + "loss": 5.4994, + "step": 16776 + }, + { + "epoch": 0.0997775716052907, + "grad_norm": 1.486611008644104, + "learning_rate": 4.878194949662281e-05, + "loss": 5.347, + "step": 16777 + }, + { + "epoch": 0.0997835188885717, + "grad_norm": 1.391717791557312, + "learning_rate": 4.878180546984691e-05, + "loss": 5.3397, + "step": 16778 + }, + { + "epoch": 0.09978946617185269, + "grad_norm": 1.819778323173523, + "learning_rate": 4.878166143476902e-05, + "loss": 5.4217, + "step": 16779 + }, + { + "epoch": 0.0997954134551337, + "grad_norm": 1.549660563468933, + "learning_rate": 4.8781517391389205e-05, + "loss": 5.5044, + "step": 16780 + }, + { + "epoch": 0.0998013607384147, + "grad_norm": 1.4923075437545776, + "learning_rate": 4.878137333970751e-05, + "loss": 5.4779, + "step": 16781 + }, + { + "epoch": 0.09980730802169568, + "grad_norm": 1.3846399784088135, + "learning_rate": 4.878122927972398e-05, + "loss": 5.8974, + "step": 16782 + }, + { + "epoch": 0.09981325530497669, + "grad_norm": 1.325563669204712, + "learning_rate": 4.878108521143867e-05, + "loss": 5.516, + "step": 16783 + }, + { + "epoch": 0.09981920258825769, + "grad_norm": 1.3482844829559326, + "learning_rate": 4.878094113485162e-05, + "loss": 5.4661, + "step": 16784 + }, + { + "epoch": 0.09982514987153868, + "grad_norm": 1.4238206148147583, + "learning_rate": 4.87807970499629e-05, + "loss": 5.5551, + "step": 16785 + }, + { + "epoch": 0.09983109715481968, + "grad_norm": 1.1277439594268799, + "learning_rate": 4.8780652956772544e-05, + "loss": 5.3611, + "step": 16786 + }, + { + "epoch": 0.09983704443810068, + "grad_norm": 1.2312495708465576, + "learning_rate": 4.878050885528061e-05, + "loss": 5.4233, + "step": 16787 + }, + { + "epoch": 0.09984299172138167, + "grad_norm": 1.3811876773834229, + "learning_rate": 4.878036474548715e-05, + "loss": 5.4336, + "step": 16788 + }, + { + "epoch": 0.09984893900466267, + "grad_norm": 1.211362361907959, + "learning_rate": 4.87802206273922e-05, + "loss": 4.9956, + "step": 16789 + }, + { + "epoch": 0.09985488628794367, + "grad_norm": 1.0385311841964722, + "learning_rate": 4.878007650099583e-05, + "loss": 5.4416, + "step": 16790 + }, + { + "epoch": 0.09986083357122466, + "grad_norm": 1.2311192750930786, + "learning_rate": 4.8779932366298074e-05, + "loss": 5.4814, + "step": 16791 + }, + { + "epoch": 0.09986678085450566, + "grad_norm": 1.6310219764709473, + "learning_rate": 4.8779788223299e-05, + "loss": 5.1746, + "step": 16792 + }, + { + "epoch": 0.09987272813778666, + "grad_norm": 1.4695444107055664, + "learning_rate": 4.877964407199864e-05, + "loss": 5.3724, + "step": 16793 + }, + { + "epoch": 0.09987867542106765, + "grad_norm": 1.8295196294784546, + "learning_rate": 4.877949991239705e-05, + "loss": 5.1085, + "step": 16794 + }, + { + "epoch": 0.09988462270434866, + "grad_norm": 1.5845080614089966, + "learning_rate": 4.877935574449428e-05, + "loss": 5.027, + "step": 16795 + }, + { + "epoch": 0.09989056998762966, + "grad_norm": 1.3743692636489868, + "learning_rate": 4.8779211568290395e-05, + "loss": 5.0717, + "step": 16796 + }, + { + "epoch": 0.09989651727091065, + "grad_norm": 1.3857053518295288, + "learning_rate": 4.877906738378542e-05, + "loss": 4.9698, + "step": 16797 + }, + { + "epoch": 0.09990246455419165, + "grad_norm": 1.3818373680114746, + "learning_rate": 4.8778923190979425e-05, + "loss": 4.8686, + "step": 16798 + }, + { + "epoch": 0.09990841183747265, + "grad_norm": 1.563095211982727, + "learning_rate": 4.877877898987245e-05, + "loss": 4.6804, + "step": 16799 + }, + { + "epoch": 0.09991435912075364, + "grad_norm": 1.3965919017791748, + "learning_rate": 4.877863478046455e-05, + "loss": 5.141, + "step": 16800 + }, + { + "epoch": 0.09992030640403464, + "grad_norm": 1.5473159551620483, + "learning_rate": 4.8778490562755775e-05, + "loss": 5.0796, + "step": 16801 + }, + { + "epoch": 0.09992625368731563, + "grad_norm": 2.548140525817871, + "learning_rate": 4.877834633674618e-05, + "loss": 4.9149, + "step": 16802 + }, + { + "epoch": 0.09993220097059663, + "grad_norm": 1.59461510181427, + "learning_rate": 4.87782021024358e-05, + "loss": 4.9048, + "step": 16803 + }, + { + "epoch": 0.09993814825387763, + "grad_norm": 1.49467134475708, + "learning_rate": 4.87780578598247e-05, + "loss": 5.2484, + "step": 16804 + }, + { + "epoch": 0.09994409553715862, + "grad_norm": 1.5844218730926514, + "learning_rate": 4.8777913608912926e-05, + "loss": 5.2107, + "step": 16805 + }, + { + "epoch": 0.09995004282043962, + "grad_norm": 1.465334415435791, + "learning_rate": 4.877776934970053e-05, + "loss": 5.4002, + "step": 16806 + }, + { + "epoch": 0.09995599010372062, + "grad_norm": 1.5409786701202393, + "learning_rate": 4.877762508218756e-05, + "loss": 5.6233, + "step": 16807 + }, + { + "epoch": 0.09996193738700161, + "grad_norm": 1.3813812732696533, + "learning_rate": 4.877748080637406e-05, + "loss": 5.3072, + "step": 16808 + }, + { + "epoch": 0.09996788467028261, + "grad_norm": 1.3815702199935913, + "learning_rate": 4.8777336522260095e-05, + "loss": 5.0923, + "step": 16809 + }, + { + "epoch": 0.09997383195356362, + "grad_norm": 1.6513910293579102, + "learning_rate": 4.87771922298457e-05, + "loss": 5.0482, + "step": 16810 + }, + { + "epoch": 0.0999797792368446, + "grad_norm": 1.6680731773376465, + "learning_rate": 4.8777047929130944e-05, + "loss": 4.984, + "step": 16811 + }, + { + "epoch": 0.0999857265201256, + "grad_norm": 1.4342384338378906, + "learning_rate": 4.8776903620115855e-05, + "loss": 5.2745, + "step": 16812 + }, + { + "epoch": 0.09999167380340661, + "grad_norm": 1.564255714416504, + "learning_rate": 4.87767593028005e-05, + "loss": 5.398, + "step": 16813 + }, + { + "epoch": 0.0999976210866876, + "grad_norm": 1.2767013311386108, + "learning_rate": 4.877661497718493e-05, + "loss": 5.0663, + "step": 16814 + }, + { + "epoch": 0.1000035683699686, + "grad_norm": 1.35418701171875, + "learning_rate": 4.877647064326918e-05, + "loss": 5.064, + "step": 16815 + }, + { + "epoch": 0.1000095156532496, + "grad_norm": 1.5754468441009521, + "learning_rate": 4.877632630105331e-05, + "loss": 5.1525, + "step": 16816 + }, + { + "epoch": 0.10001546293653059, + "grad_norm": 1.8457043170928955, + "learning_rate": 4.877618195053737e-05, + "loss": 5.3074, + "step": 16817 + }, + { + "epoch": 0.10002141021981159, + "grad_norm": 1.7238751649856567, + "learning_rate": 4.877603759172141e-05, + "loss": 5.3408, + "step": 16818 + }, + { + "epoch": 0.10002735750309259, + "grad_norm": 1.5342493057250977, + "learning_rate": 4.8775893224605486e-05, + "loss": 5.3495, + "step": 16819 + }, + { + "epoch": 0.10003330478637358, + "grad_norm": 1.4931390285491943, + "learning_rate": 4.877574884918964e-05, + "loss": 5.2617, + "step": 16820 + }, + { + "epoch": 0.10003925206965458, + "grad_norm": 1.5503534078598022, + "learning_rate": 4.877560446547393e-05, + "loss": 5.0805, + "step": 16821 + }, + { + "epoch": 0.10004519935293558, + "grad_norm": 1.480191707611084, + "learning_rate": 4.87754600734584e-05, + "loss": 5.1405, + "step": 16822 + }, + { + "epoch": 0.10005114663621657, + "grad_norm": 1.371559977531433, + "learning_rate": 4.87753156731431e-05, + "loss": 5.2313, + "step": 16823 + }, + { + "epoch": 0.10005709391949758, + "grad_norm": 1.2534080743789673, + "learning_rate": 4.8775171264528085e-05, + "loss": 5.3029, + "step": 16824 + }, + { + "epoch": 0.10006304120277858, + "grad_norm": 1.4513366222381592, + "learning_rate": 4.8775026847613406e-05, + "loss": 5.2663, + "step": 16825 + }, + { + "epoch": 0.10006898848605957, + "grad_norm": 1.4045735597610474, + "learning_rate": 4.8774882422399105e-05, + "loss": 5.2358, + "step": 16826 + }, + { + "epoch": 0.10007493576934057, + "grad_norm": 1.469664216041565, + "learning_rate": 4.877473798888524e-05, + "loss": 5.0215, + "step": 16827 + }, + { + "epoch": 0.10008088305262157, + "grad_norm": 1.4306927919387817, + "learning_rate": 4.8774593547071855e-05, + "loss": 4.8262, + "step": 16828 + }, + { + "epoch": 0.10008683033590256, + "grad_norm": 1.5118143558502197, + "learning_rate": 4.877444909695902e-05, + "loss": 4.8248, + "step": 16829 + }, + { + "epoch": 0.10009277761918356, + "grad_norm": 1.3022321462631226, + "learning_rate": 4.8774304638546754e-05, + "loss": 4.7268, + "step": 16830 + }, + { + "epoch": 0.10009872490246455, + "grad_norm": 1.468758463859558, + "learning_rate": 4.877416017183513e-05, + "loss": 4.8686, + "step": 16831 + }, + { + "epoch": 0.10010467218574555, + "grad_norm": 1.4958772659301758, + "learning_rate": 4.8774015696824196e-05, + "loss": 5.084, + "step": 16832 + }, + { + "epoch": 0.10011061946902655, + "grad_norm": 1.5816160440444946, + "learning_rate": 4.877387121351399e-05, + "loss": 5.1009, + "step": 16833 + }, + { + "epoch": 0.10011656675230754, + "grad_norm": 1.4751555919647217, + "learning_rate": 4.877372672190458e-05, + "loss": 5.1875, + "step": 16834 + }, + { + "epoch": 0.10012251403558854, + "grad_norm": 1.380433201789856, + "learning_rate": 4.8773582221996006e-05, + "loss": 5.3213, + "step": 16835 + }, + { + "epoch": 0.10012846131886954, + "grad_norm": 1.566112756729126, + "learning_rate": 4.877343771378832e-05, + "loss": 4.9251, + "step": 16836 + }, + { + "epoch": 0.10013440860215053, + "grad_norm": 1.4834301471710205, + "learning_rate": 4.8773293197281566e-05, + "loss": 4.7936, + "step": 16837 + }, + { + "epoch": 0.10014035588543153, + "grad_norm": 1.6053043603897095, + "learning_rate": 4.877314867247581e-05, + "loss": 4.8611, + "step": 16838 + }, + { + "epoch": 0.10014630316871254, + "grad_norm": 1.420598030090332, + "learning_rate": 4.877300413937109e-05, + "loss": 5.0481, + "step": 16839 + }, + { + "epoch": 0.10015225045199352, + "grad_norm": 1.474554181098938, + "learning_rate": 4.877285959796746e-05, + "loss": 5.0342, + "step": 16840 + }, + { + "epoch": 0.10015819773527453, + "grad_norm": 1.6535485982894897, + "learning_rate": 4.877271504826496e-05, + "loss": 5.4624, + "step": 16841 + }, + { + "epoch": 0.10016414501855553, + "grad_norm": 1.3873733282089233, + "learning_rate": 4.877257049026367e-05, + "loss": 5.1673, + "step": 16842 + }, + { + "epoch": 0.10017009230183652, + "grad_norm": 1.3890115022659302, + "learning_rate": 4.8772425923963606e-05, + "loss": 4.938, + "step": 16843 + }, + { + "epoch": 0.10017603958511752, + "grad_norm": 1.443969964981079, + "learning_rate": 4.8772281349364846e-05, + "loss": 4.8525, + "step": 16844 + }, + { + "epoch": 0.10018198686839852, + "grad_norm": 1.545344591140747, + "learning_rate": 4.877213676646742e-05, + "loss": 4.8682, + "step": 16845 + }, + { + "epoch": 0.10018793415167951, + "grad_norm": 1.6065396070480347, + "learning_rate": 4.877199217527138e-05, + "loss": 4.7394, + "step": 16846 + }, + { + "epoch": 0.10019388143496051, + "grad_norm": 1.444199800491333, + "learning_rate": 4.877184757577679e-05, + "loss": 4.7775, + "step": 16847 + }, + { + "epoch": 0.10019982871824151, + "grad_norm": 1.5434626340866089, + "learning_rate": 4.87717029679837e-05, + "loss": 4.6714, + "step": 16848 + }, + { + "epoch": 0.1002057760015225, + "grad_norm": 1.502533197402954, + "learning_rate": 4.877155835189215e-05, + "loss": 4.7591, + "step": 16849 + }, + { + "epoch": 0.1002117232848035, + "grad_norm": 1.6330854892730713, + "learning_rate": 4.877141372750219e-05, + "loss": 4.7426, + "step": 16850 + }, + { + "epoch": 0.1002176705680845, + "grad_norm": 1.658887267112732, + "learning_rate": 4.877126909481388e-05, + "loss": 4.7558, + "step": 16851 + }, + { + "epoch": 0.10022361785136549, + "grad_norm": 1.4569580554962158, + "learning_rate": 4.877112445382727e-05, + "loss": 4.7797, + "step": 16852 + }, + { + "epoch": 0.1002295651346465, + "grad_norm": 1.4903759956359863, + "learning_rate": 4.8770979804542394e-05, + "loss": 4.7895, + "step": 16853 + }, + { + "epoch": 0.1002355124179275, + "grad_norm": 1.638406753540039, + "learning_rate": 4.877083514695933e-05, + "loss": 4.7197, + "step": 16854 + }, + { + "epoch": 0.10024145970120849, + "grad_norm": 1.4558868408203125, + "learning_rate": 4.87706904810781e-05, + "loss": 4.7159, + "step": 16855 + }, + { + "epoch": 0.10024740698448949, + "grad_norm": 1.5545023679733276, + "learning_rate": 4.877054580689877e-05, + "loss": 4.7387, + "step": 16856 + }, + { + "epoch": 0.10025335426777049, + "grad_norm": 1.3767842054367065, + "learning_rate": 4.877040112442139e-05, + "loss": 4.7149, + "step": 16857 + }, + { + "epoch": 0.10025930155105148, + "grad_norm": 1.4483342170715332, + "learning_rate": 4.877025643364601e-05, + "loss": 4.7756, + "step": 16858 + }, + { + "epoch": 0.10026524883433248, + "grad_norm": 1.1949654817581177, + "learning_rate": 4.8770111734572673e-05, + "loss": 4.7883, + "step": 16859 + }, + { + "epoch": 0.10027119611761347, + "grad_norm": 1.430977463722229, + "learning_rate": 4.876996702720144e-05, + "loss": 5.0236, + "step": 16860 + }, + { + "epoch": 0.10027714340089447, + "grad_norm": 1.4976351261138916, + "learning_rate": 4.876982231153236e-05, + "loss": 5.1242, + "step": 16861 + }, + { + "epoch": 0.10028309068417547, + "grad_norm": 1.6913431882858276, + "learning_rate": 4.876967758756547e-05, + "loss": 5.3454, + "step": 16862 + }, + { + "epoch": 0.10028903796745646, + "grad_norm": 1.5901557207107544, + "learning_rate": 4.876953285530084e-05, + "loss": 5.2313, + "step": 16863 + }, + { + "epoch": 0.10029498525073746, + "grad_norm": 2.483757257461548, + "learning_rate": 4.8769388114738515e-05, + "loss": 4.9951, + "step": 16864 + }, + { + "epoch": 0.10030093253401846, + "grad_norm": 1.5647902488708496, + "learning_rate": 4.8769243365878536e-05, + "loss": 5.1029, + "step": 16865 + }, + { + "epoch": 0.10030687981729945, + "grad_norm": 1.5830740928649902, + "learning_rate": 4.8769098608720954e-05, + "loss": 5.1918, + "step": 16866 + }, + { + "epoch": 0.10031282710058045, + "grad_norm": 1.5231165885925293, + "learning_rate": 4.876895384326584e-05, + "loss": 5.0817, + "step": 16867 + }, + { + "epoch": 0.10031877438386146, + "grad_norm": 1.5266731977462769, + "learning_rate": 4.876880906951321e-05, + "loss": 4.9117, + "step": 16868 + }, + { + "epoch": 0.10032472166714244, + "grad_norm": 1.9662569761276245, + "learning_rate": 4.876866428746315e-05, + "loss": 4.8381, + "step": 16869 + }, + { + "epoch": 0.10033066895042345, + "grad_norm": 1.34932279586792, + "learning_rate": 4.876851949711569e-05, + "loss": 5.0781, + "step": 16870 + }, + { + "epoch": 0.10033661623370445, + "grad_norm": 1.3333275318145752, + "learning_rate": 4.876837469847089e-05, + "loss": 5.0527, + "step": 16871 + }, + { + "epoch": 0.10034256351698544, + "grad_norm": 1.3569806814193726, + "learning_rate": 4.876822989152879e-05, + "loss": 5.0854, + "step": 16872 + }, + { + "epoch": 0.10034851080026644, + "grad_norm": 1.4417848587036133, + "learning_rate": 4.876808507628945e-05, + "loss": 4.885, + "step": 16873 + }, + { + "epoch": 0.10035445808354744, + "grad_norm": 1.453704833984375, + "learning_rate": 4.876794025275292e-05, + "loss": 4.8919, + "step": 16874 + }, + { + "epoch": 0.10036040536682843, + "grad_norm": 1.392701268196106, + "learning_rate": 4.876779542091924e-05, + "loss": 5.0682, + "step": 16875 + }, + { + "epoch": 0.10036635265010943, + "grad_norm": 1.5623222589492798, + "learning_rate": 4.876765058078847e-05, + "loss": 5.0369, + "step": 16876 + }, + { + "epoch": 0.10037229993339043, + "grad_norm": 1.4053794145584106, + "learning_rate": 4.876750573236066e-05, + "loss": 4.9932, + "step": 16877 + }, + { + "epoch": 0.10037824721667142, + "grad_norm": 1.3282443284988403, + "learning_rate": 4.876736087563586e-05, + "loss": 5.0678, + "step": 16878 + }, + { + "epoch": 0.10038419449995242, + "grad_norm": 1.3737441301345825, + "learning_rate": 4.876721601061412e-05, + "loss": 5.1292, + "step": 16879 + }, + { + "epoch": 0.10039014178323342, + "grad_norm": 1.3209916353225708, + "learning_rate": 4.876707113729549e-05, + "loss": 5.0717, + "step": 16880 + }, + { + "epoch": 0.10039608906651441, + "grad_norm": 1.2051011323928833, + "learning_rate": 4.8766926255680026e-05, + "loss": 5.0075, + "step": 16881 + }, + { + "epoch": 0.10040203634979541, + "grad_norm": 1.260746955871582, + "learning_rate": 4.876678136576777e-05, + "loss": 4.8419, + "step": 16882 + }, + { + "epoch": 0.10040798363307642, + "grad_norm": 1.3981266021728516, + "learning_rate": 4.876663646755877e-05, + "loss": 4.8558, + "step": 16883 + }, + { + "epoch": 0.1004139309163574, + "grad_norm": 1.3491755723953247, + "learning_rate": 4.876649156105309e-05, + "loss": 4.7809, + "step": 16884 + }, + { + "epoch": 0.10041987819963841, + "grad_norm": 1.3315166234970093, + "learning_rate": 4.8766346646250774e-05, + "loss": 4.9221, + "step": 16885 + }, + { + "epoch": 0.10042582548291941, + "grad_norm": 1.250731348991394, + "learning_rate": 4.876620172315186e-05, + "loss": 4.8344, + "step": 16886 + }, + { + "epoch": 0.1004317727662004, + "grad_norm": 1.249316692352295, + "learning_rate": 4.876605679175642e-05, + "loss": 4.8441, + "step": 16887 + }, + { + "epoch": 0.1004377200494814, + "grad_norm": 1.3112961053848267, + "learning_rate": 4.87659118520645e-05, + "loss": 4.834, + "step": 16888 + }, + { + "epoch": 0.10044366733276239, + "grad_norm": 1.4331620931625366, + "learning_rate": 4.876576690407614e-05, + "loss": 4.9801, + "step": 16889 + }, + { + "epoch": 0.10044961461604339, + "grad_norm": 1.5304386615753174, + "learning_rate": 4.8765621947791396e-05, + "loss": 5.1799, + "step": 16890 + }, + { + "epoch": 0.10045556189932439, + "grad_norm": 1.3581719398498535, + "learning_rate": 4.8765476983210326e-05, + "loss": 5.1517, + "step": 16891 + }, + { + "epoch": 0.10046150918260538, + "grad_norm": 1.2568892240524292, + "learning_rate": 4.876533201033296e-05, + "loss": 5.0663, + "step": 16892 + }, + { + "epoch": 0.10046745646588638, + "grad_norm": 1.3863126039505005, + "learning_rate": 4.876518702915936e-05, + "loss": 4.9666, + "step": 16893 + }, + { + "epoch": 0.10047340374916738, + "grad_norm": 1.328078031539917, + "learning_rate": 4.87650420396896e-05, + "loss": 5.0049, + "step": 16894 + }, + { + "epoch": 0.10047935103244837, + "grad_norm": 1.252009630203247, + "learning_rate": 4.8764897041923696e-05, + "loss": 5.0709, + "step": 16895 + }, + { + "epoch": 0.10048529831572937, + "grad_norm": 1.4895809888839722, + "learning_rate": 4.876475203586171e-05, + "loss": 5.0922, + "step": 16896 + }, + { + "epoch": 0.10049124559901038, + "grad_norm": 1.363641619682312, + "learning_rate": 4.8764607021503696e-05, + "loss": 5.0233, + "step": 16897 + }, + { + "epoch": 0.10049719288229136, + "grad_norm": 1.5323866605758667, + "learning_rate": 4.876446199884971e-05, + "loss": 4.8705, + "step": 16898 + }, + { + "epoch": 0.10050314016557237, + "grad_norm": 1.4069478511810303, + "learning_rate": 4.8764316967899786e-05, + "loss": 5.0136, + "step": 16899 + }, + { + "epoch": 0.10050908744885337, + "grad_norm": 1.4166046380996704, + "learning_rate": 4.876417192865399e-05, + "loss": 5.0047, + "step": 16900 + }, + { + "epoch": 0.10051503473213436, + "grad_norm": 1.5298703908920288, + "learning_rate": 4.876402688111237e-05, + "loss": 5.0046, + "step": 16901 + }, + { + "epoch": 0.10052098201541536, + "grad_norm": 1.340071678161621, + "learning_rate": 4.876388182527497e-05, + "loss": 5.107, + "step": 16902 + }, + { + "epoch": 0.10052692929869636, + "grad_norm": 1.367415189743042, + "learning_rate": 4.876373676114184e-05, + "loss": 4.9292, + "step": 16903 + }, + { + "epoch": 0.10053287658197735, + "grad_norm": 1.3535525798797607, + "learning_rate": 4.876359168871304e-05, + "loss": 4.9801, + "step": 16904 + }, + { + "epoch": 0.10053882386525835, + "grad_norm": 1.2370539903640747, + "learning_rate": 4.8763446607988615e-05, + "loss": 4.9598, + "step": 16905 + }, + { + "epoch": 0.10054477114853935, + "grad_norm": 1.251837968826294, + "learning_rate": 4.876330151896862e-05, + "loss": 5.0506, + "step": 16906 + }, + { + "epoch": 0.10055071843182034, + "grad_norm": 1.3221372365951538, + "learning_rate": 4.8763156421653097e-05, + "loss": 5.4094, + "step": 16907 + }, + { + "epoch": 0.10055666571510134, + "grad_norm": 1.34721040725708, + "learning_rate": 4.87630113160421e-05, + "loss": 5.4361, + "step": 16908 + }, + { + "epoch": 0.10056261299838234, + "grad_norm": 1.2884198427200317, + "learning_rate": 4.876286620213568e-05, + "loss": 5.3518, + "step": 16909 + }, + { + "epoch": 0.10056856028166333, + "grad_norm": 1.259414553642273, + "learning_rate": 4.87627210799339e-05, + "loss": 5.2298, + "step": 16910 + }, + { + "epoch": 0.10057450756494433, + "grad_norm": 1.482032299041748, + "learning_rate": 4.8762575949436796e-05, + "loss": 5.3625, + "step": 16911 + }, + { + "epoch": 0.10058045484822534, + "grad_norm": 1.2673801183700562, + "learning_rate": 4.876243081064441e-05, + "loss": 5.2678, + "step": 16912 + }, + { + "epoch": 0.10058640213150633, + "grad_norm": 1.3014607429504395, + "learning_rate": 4.876228566355682e-05, + "loss": 5.2762, + "step": 16913 + }, + { + "epoch": 0.10059234941478733, + "grad_norm": 1.2084840536117554, + "learning_rate": 4.876214050817405e-05, + "loss": 5.1128, + "step": 16914 + }, + { + "epoch": 0.10059829669806833, + "grad_norm": 1.3497353792190552, + "learning_rate": 4.876199534449617e-05, + "loss": 5.1666, + "step": 16915 + }, + { + "epoch": 0.10060424398134932, + "grad_norm": 1.4095430374145508, + "learning_rate": 4.876185017252322e-05, + "loss": 5.0055, + "step": 16916 + }, + { + "epoch": 0.10061019126463032, + "grad_norm": 1.319938063621521, + "learning_rate": 4.876170499225525e-05, + "loss": 5.0628, + "step": 16917 + }, + { + "epoch": 0.10061613854791131, + "grad_norm": 1.2126001119613647, + "learning_rate": 4.876155980369232e-05, + "loss": 5.4244, + "step": 16918 + }, + { + "epoch": 0.10062208583119231, + "grad_norm": 1.0456511974334717, + "learning_rate": 4.876141460683448e-05, + "loss": 5.2556, + "step": 16919 + }, + { + "epoch": 0.10062803311447331, + "grad_norm": 1.2545825242996216, + "learning_rate": 4.8761269401681765e-05, + "loss": 5.1549, + "step": 16920 + }, + { + "epoch": 0.1006339803977543, + "grad_norm": 1.3613678216934204, + "learning_rate": 4.876112418823424e-05, + "loss": 5.0592, + "step": 16921 + }, + { + "epoch": 0.1006399276810353, + "grad_norm": 1.4963204860687256, + "learning_rate": 4.876097896649196e-05, + "loss": 5.1025, + "step": 16922 + }, + { + "epoch": 0.1006458749643163, + "grad_norm": 1.3221436738967896, + "learning_rate": 4.876083373645495e-05, + "loss": 5.2534, + "step": 16923 + }, + { + "epoch": 0.10065182224759729, + "grad_norm": 1.6041839122772217, + "learning_rate": 4.8760688498123294e-05, + "loss": 5.3351, + "step": 16924 + }, + { + "epoch": 0.1006577695308783, + "grad_norm": 1.4891480207443237, + "learning_rate": 4.876054325149702e-05, + "loss": 5.4782, + "step": 16925 + }, + { + "epoch": 0.1006637168141593, + "grad_norm": 2.101271867752075, + "learning_rate": 4.876039799657619e-05, + "loss": 5.3844, + "step": 16926 + }, + { + "epoch": 0.10066966409744028, + "grad_norm": 1.5637247562408447, + "learning_rate": 4.8760252733360845e-05, + "loss": 5.4488, + "step": 16927 + }, + { + "epoch": 0.10067561138072129, + "grad_norm": 1.5939668416976929, + "learning_rate": 4.8760107461851044e-05, + "loss": 5.3429, + "step": 16928 + }, + { + "epoch": 0.10068155866400229, + "grad_norm": 1.509945273399353, + "learning_rate": 4.875996218204684e-05, + "loss": 5.4501, + "step": 16929 + }, + { + "epoch": 0.10068750594728328, + "grad_norm": 1.553009271621704, + "learning_rate": 4.875981689394827e-05, + "loss": 5.4183, + "step": 16930 + }, + { + "epoch": 0.10069345323056428, + "grad_norm": 1.5002714395523071, + "learning_rate": 4.875967159755539e-05, + "loss": 5.2343, + "step": 16931 + }, + { + "epoch": 0.10069940051384528, + "grad_norm": 1.5027118921279907, + "learning_rate": 4.8759526292868266e-05, + "loss": 5.4414, + "step": 16932 + }, + { + "epoch": 0.10070534779712627, + "grad_norm": 1.38532555103302, + "learning_rate": 4.875938097988694e-05, + "loss": 5.4026, + "step": 16933 + }, + { + "epoch": 0.10071129508040727, + "grad_norm": 1.4190242290496826, + "learning_rate": 4.8759235658611445e-05, + "loss": 5.346, + "step": 16934 + }, + { + "epoch": 0.10071724236368827, + "grad_norm": 1.291375756263733, + "learning_rate": 4.875909032904186e-05, + "loss": 5.3715, + "step": 16935 + }, + { + "epoch": 0.10072318964696926, + "grad_norm": 1.5563501119613647, + "learning_rate": 4.8758944991178214e-05, + "loss": 5.2474, + "step": 16936 + }, + { + "epoch": 0.10072913693025026, + "grad_norm": 1.2936631441116333, + "learning_rate": 4.875879964502056e-05, + "loss": 5.2627, + "step": 16937 + }, + { + "epoch": 0.10073508421353126, + "grad_norm": 1.5020617246627808, + "learning_rate": 4.875865429056896e-05, + "loss": 5.2166, + "step": 16938 + }, + { + "epoch": 0.10074103149681225, + "grad_norm": 1.4830302000045776, + "learning_rate": 4.8758508927823464e-05, + "loss": 5.2558, + "step": 16939 + }, + { + "epoch": 0.10074697878009325, + "grad_norm": 1.4259967803955078, + "learning_rate": 4.8758363556784114e-05, + "loss": 5.3117, + "step": 16940 + }, + { + "epoch": 0.10075292606337426, + "grad_norm": 1.5735303163528442, + "learning_rate": 4.875821817745096e-05, + "loss": 5.2993, + "step": 16941 + }, + { + "epoch": 0.10075887334665524, + "grad_norm": 1.6409742832183838, + "learning_rate": 4.875807278982407e-05, + "loss": 5.4337, + "step": 16942 + }, + { + "epoch": 0.10076482062993625, + "grad_norm": 1.5159885883331299, + "learning_rate": 4.875792739390347e-05, + "loss": 5.4222, + "step": 16943 + }, + { + "epoch": 0.10077076791321725, + "grad_norm": 1.704200029373169, + "learning_rate": 4.875778198968923e-05, + "loss": 5.5248, + "step": 16944 + }, + { + "epoch": 0.10077671519649824, + "grad_norm": 1.8533267974853516, + "learning_rate": 4.875763657718139e-05, + "loss": 5.2155, + "step": 16945 + }, + { + "epoch": 0.10078266247977924, + "grad_norm": 1.3260399103164673, + "learning_rate": 4.8757491156380006e-05, + "loss": 5.3239, + "step": 16946 + }, + { + "epoch": 0.10078860976306023, + "grad_norm": 1.317050814628601, + "learning_rate": 4.875734572728513e-05, + "loss": 5.2346, + "step": 16947 + }, + { + "epoch": 0.10079455704634123, + "grad_norm": 1.5583351850509644, + "learning_rate": 4.875720028989681e-05, + "loss": 5.194, + "step": 16948 + }, + { + "epoch": 0.10080050432962223, + "grad_norm": 1.3424546718597412, + "learning_rate": 4.8757054844215094e-05, + "loss": 5.3616, + "step": 16949 + }, + { + "epoch": 0.10080645161290322, + "grad_norm": 1.3151681423187256, + "learning_rate": 4.875690939024004e-05, + "loss": 5.2183, + "step": 16950 + }, + { + "epoch": 0.10081239889618422, + "grad_norm": 1.441724419593811, + "learning_rate": 4.875676392797168e-05, + "loss": 5.3292, + "step": 16951 + }, + { + "epoch": 0.10081834617946522, + "grad_norm": 1.3751790523529053, + "learning_rate": 4.87566184574101e-05, + "loss": 5.1747, + "step": 16952 + }, + { + "epoch": 0.10082429346274621, + "grad_norm": 1.5188177824020386, + "learning_rate": 4.8756472978555314e-05, + "loss": 5.2291, + "step": 16953 + }, + { + "epoch": 0.10083024074602721, + "grad_norm": 1.2834105491638184, + "learning_rate": 4.87563274914074e-05, + "loss": 5.1655, + "step": 16954 + }, + { + "epoch": 0.10083618802930822, + "grad_norm": 1.3950659036636353, + "learning_rate": 4.8756181995966385e-05, + "loss": 5.2318, + "step": 16955 + }, + { + "epoch": 0.1008421353125892, + "grad_norm": 1.3544670343399048, + "learning_rate": 4.875603649223234e-05, + "loss": 5.026, + "step": 16956 + }, + { + "epoch": 0.1008480825958702, + "grad_norm": 1.4849059581756592, + "learning_rate": 4.875589098020531e-05, + "loss": 5.2139, + "step": 16957 + }, + { + "epoch": 0.10085402987915121, + "grad_norm": 1.2032678127288818, + "learning_rate": 4.875574545988534e-05, + "loss": 5.3103, + "step": 16958 + }, + { + "epoch": 0.1008599771624322, + "grad_norm": 1.4803698062896729, + "learning_rate": 4.875559993127249e-05, + "loss": 5.2546, + "step": 16959 + }, + { + "epoch": 0.1008659244457132, + "grad_norm": 1.374115228652954, + "learning_rate": 4.8755454394366795e-05, + "loss": 5.1654, + "step": 16960 + }, + { + "epoch": 0.1008718717289942, + "grad_norm": 1.420754075050354, + "learning_rate": 4.875530884916832e-05, + "loss": 5.3368, + "step": 16961 + }, + { + "epoch": 0.10087781901227519, + "grad_norm": 1.3919636011123657, + "learning_rate": 4.875516329567712e-05, + "loss": 5.3053, + "step": 16962 + }, + { + "epoch": 0.10088376629555619, + "grad_norm": 1.2697970867156982, + "learning_rate": 4.8755017733893235e-05, + "loss": 5.1771, + "step": 16963 + }, + { + "epoch": 0.10088971357883719, + "grad_norm": 1.3521144390106201, + "learning_rate": 4.8754872163816714e-05, + "loss": 5.3226, + "step": 16964 + }, + { + "epoch": 0.10089566086211818, + "grad_norm": 1.4171572923660278, + "learning_rate": 4.875472658544761e-05, + "loss": 5.17, + "step": 16965 + }, + { + "epoch": 0.10090160814539918, + "grad_norm": 1.1771302223205566, + "learning_rate": 4.875458099878598e-05, + "loss": 5.2938, + "step": 16966 + }, + { + "epoch": 0.10090755542868018, + "grad_norm": 1.3881202936172485, + "learning_rate": 4.875443540383188e-05, + "loss": 5.2567, + "step": 16967 + }, + { + "epoch": 0.10091350271196117, + "grad_norm": 1.3272387981414795, + "learning_rate": 4.875428980058534e-05, + "loss": 5.2459, + "step": 16968 + }, + { + "epoch": 0.10091944999524217, + "grad_norm": 1.227569341659546, + "learning_rate": 4.875414418904643e-05, + "loss": 5.4037, + "step": 16969 + }, + { + "epoch": 0.10092539727852318, + "grad_norm": 1.6725070476531982, + "learning_rate": 4.875399856921519e-05, + "loss": 4.957, + "step": 16970 + }, + { + "epoch": 0.10093134456180416, + "grad_norm": 1.2896990776062012, + "learning_rate": 4.8753852941091676e-05, + "loss": 5.0245, + "step": 16971 + }, + { + "epoch": 0.10093729184508517, + "grad_norm": 1.4771101474761963, + "learning_rate": 4.8753707304675935e-05, + "loss": 5.007, + "step": 16972 + }, + { + "epoch": 0.10094323912836617, + "grad_norm": 1.5898420810699463, + "learning_rate": 4.8753561659968025e-05, + "loss": 5.2144, + "step": 16973 + }, + { + "epoch": 0.10094918641164716, + "grad_norm": 1.3972615003585815, + "learning_rate": 4.875341600696799e-05, + "loss": 5.0019, + "step": 16974 + }, + { + "epoch": 0.10095513369492816, + "grad_norm": 1.3663748502731323, + "learning_rate": 4.875327034567588e-05, + "loss": 5.3281, + "step": 16975 + }, + { + "epoch": 0.10096108097820915, + "grad_norm": 1.4441343545913696, + "learning_rate": 4.875312467609175e-05, + "loss": 5.3224, + "step": 16976 + }, + { + "epoch": 0.10096702826149015, + "grad_norm": 1.409233570098877, + "learning_rate": 4.875297899821565e-05, + "loss": 5.1244, + "step": 16977 + }, + { + "epoch": 0.10097297554477115, + "grad_norm": 1.286838412284851, + "learning_rate": 4.875283331204763e-05, + "loss": 5.187, + "step": 16978 + }, + { + "epoch": 0.10097892282805214, + "grad_norm": 1.3722141981124878, + "learning_rate": 4.8752687617587744e-05, + "loss": 5.1052, + "step": 16979 + }, + { + "epoch": 0.10098487011133314, + "grad_norm": 1.464938998222351, + "learning_rate": 4.8752541914836034e-05, + "loss": 5.2428, + "step": 16980 + }, + { + "epoch": 0.10099081739461414, + "grad_norm": 1.5051358938217163, + "learning_rate": 4.875239620379256e-05, + "loss": 5.204, + "step": 16981 + }, + { + "epoch": 0.10099676467789513, + "grad_norm": 1.374108076095581, + "learning_rate": 4.875225048445737e-05, + "loss": 5.4567, + "step": 16982 + }, + { + "epoch": 0.10100271196117613, + "grad_norm": 1.482023000717163, + "learning_rate": 4.875210475683052e-05, + "loss": 5.3605, + "step": 16983 + }, + { + "epoch": 0.10100865924445714, + "grad_norm": 1.429819107055664, + "learning_rate": 4.8751959020912056e-05, + "loss": 5.3351, + "step": 16984 + }, + { + "epoch": 0.10101460652773812, + "grad_norm": 1.3165935277938843, + "learning_rate": 4.875181327670202e-05, + "loss": 5.2705, + "step": 16985 + }, + { + "epoch": 0.10102055381101913, + "grad_norm": 1.4560794830322266, + "learning_rate": 4.8751667524200474e-05, + "loss": 5.313, + "step": 16986 + }, + { + "epoch": 0.10102650109430013, + "grad_norm": 1.5268526077270508, + "learning_rate": 4.875152176340747e-05, + "loss": 5.2432, + "step": 16987 + }, + { + "epoch": 0.10103244837758112, + "grad_norm": 1.8486063480377197, + "learning_rate": 4.875137599432305e-05, + "loss": 5.4951, + "step": 16988 + }, + { + "epoch": 0.10103839566086212, + "grad_norm": 1.5344970226287842, + "learning_rate": 4.875123021694727e-05, + "loss": 4.7321, + "step": 16989 + }, + { + "epoch": 0.10104434294414312, + "grad_norm": 1.5000940561294556, + "learning_rate": 4.8751084431280186e-05, + "loss": 5.1539, + "step": 16990 + }, + { + "epoch": 0.10105029022742411, + "grad_norm": 1.3047879934310913, + "learning_rate": 4.875093863732184e-05, + "loss": 5.1549, + "step": 16991 + }, + { + "epoch": 0.10105623751070511, + "grad_norm": 1.3496383428573608, + "learning_rate": 4.875079283507229e-05, + "loss": 5.0896, + "step": 16992 + }, + { + "epoch": 0.10106218479398611, + "grad_norm": 1.3492714166641235, + "learning_rate": 4.875064702453158e-05, + "loss": 5.0242, + "step": 16993 + }, + { + "epoch": 0.1010681320772671, + "grad_norm": 1.3479794263839722, + "learning_rate": 4.8750501205699766e-05, + "loss": 4.9653, + "step": 16994 + }, + { + "epoch": 0.1010740793605481, + "grad_norm": 1.4737683534622192, + "learning_rate": 4.87503553785769e-05, + "loss": 5.0082, + "step": 16995 + }, + { + "epoch": 0.1010800266438291, + "grad_norm": 1.335184931755066, + "learning_rate": 4.8750209543163026e-05, + "loss": 5.0068, + "step": 16996 + }, + { + "epoch": 0.10108597392711009, + "grad_norm": 1.3982423543930054, + "learning_rate": 4.87500636994582e-05, + "loss": 4.9958, + "step": 16997 + }, + { + "epoch": 0.1010919212103911, + "grad_norm": 1.4706374406814575, + "learning_rate": 4.874991784746248e-05, + "loss": 4.9776, + "step": 16998 + }, + { + "epoch": 0.1010978684936721, + "grad_norm": 1.4456995725631714, + "learning_rate": 4.8749771987175896e-05, + "loss": 5.1226, + "step": 16999 + }, + { + "epoch": 0.10110381577695308, + "grad_norm": 1.3827359676361084, + "learning_rate": 4.874962611859853e-05, + "loss": 5.0648, + "step": 17000 + }, + { + "epoch": 0.10110976306023409, + "grad_norm": 1.4089758396148682, + "learning_rate": 4.874948024173039e-05, + "loss": 5.0511, + "step": 17001 + }, + { + "epoch": 0.10111571034351509, + "grad_norm": 1.5135823488235474, + "learning_rate": 4.874933435657157e-05, + "loss": 5.1586, + "step": 17002 + }, + { + "epoch": 0.10112165762679608, + "grad_norm": 1.3575700521469116, + "learning_rate": 4.87491884631221e-05, + "loss": 5.4172, + "step": 17003 + }, + { + "epoch": 0.10112760491007708, + "grad_norm": 1.6240919828414917, + "learning_rate": 4.874904256138203e-05, + "loss": 4.8663, + "step": 17004 + }, + { + "epoch": 0.10113355219335807, + "grad_norm": 1.517287254333496, + "learning_rate": 4.8748896651351415e-05, + "loss": 5.2746, + "step": 17005 + }, + { + "epoch": 0.10113949947663907, + "grad_norm": 1.359541893005371, + "learning_rate": 4.87487507330303e-05, + "loss": 5.2497, + "step": 17006 + }, + { + "epoch": 0.10114544675992007, + "grad_norm": 1.608406901359558, + "learning_rate": 4.8748604806418755e-05, + "loss": 5.2789, + "step": 17007 + }, + { + "epoch": 0.10115139404320106, + "grad_norm": 1.5752578973770142, + "learning_rate": 4.874845887151681e-05, + "loss": 5.1583, + "step": 17008 + }, + { + "epoch": 0.10115734132648206, + "grad_norm": 1.5864077806472778, + "learning_rate": 4.8748312928324524e-05, + "loss": 5.2091, + "step": 17009 + }, + { + "epoch": 0.10116328860976306, + "grad_norm": 1.4714727401733398, + "learning_rate": 4.874816697684195e-05, + "loss": 5.2404, + "step": 17010 + }, + { + "epoch": 0.10116923589304405, + "grad_norm": 1.4676539897918701, + "learning_rate": 4.874802101706913e-05, + "loss": 5.3318, + "step": 17011 + }, + { + "epoch": 0.10117518317632505, + "grad_norm": 1.3290908336639404, + "learning_rate": 4.874787504900612e-05, + "loss": 5.0484, + "step": 17012 + }, + { + "epoch": 0.10118113045960606, + "grad_norm": 1.2661367654800415, + "learning_rate": 4.8747729072652984e-05, + "loss": 5.1857, + "step": 17013 + }, + { + "epoch": 0.10118707774288704, + "grad_norm": 1.2540318965911865, + "learning_rate": 4.874758308800975e-05, + "loss": 5.3025, + "step": 17014 + }, + { + "epoch": 0.10119302502616805, + "grad_norm": 1.2353893518447876, + "learning_rate": 4.874743709507649e-05, + "loss": 5.3613, + "step": 17015 + }, + { + "epoch": 0.10119897230944905, + "grad_norm": 1.2193371057510376, + "learning_rate": 4.874729109385323e-05, + "loss": 5.3029, + "step": 17016 + }, + { + "epoch": 0.10120491959273004, + "grad_norm": 1.2443112134933472, + "learning_rate": 4.874714508434005e-05, + "loss": 5.3667, + "step": 17017 + }, + { + "epoch": 0.10121086687601104, + "grad_norm": 1.4194598197937012, + "learning_rate": 4.874699906653698e-05, + "loss": 5.5583, + "step": 17018 + }, + { + "epoch": 0.10121681415929204, + "grad_norm": 1.4791369438171387, + "learning_rate": 4.874685304044408e-05, + "loss": 5.2797, + "step": 17019 + }, + { + "epoch": 0.10122276144257303, + "grad_norm": 1.4528671503067017, + "learning_rate": 4.87467070060614e-05, + "loss": 5.1261, + "step": 17020 + }, + { + "epoch": 0.10122870872585403, + "grad_norm": 1.2694898843765259, + "learning_rate": 4.8746560963388985e-05, + "loss": 5.3817, + "step": 17021 + }, + { + "epoch": 0.10123465600913503, + "grad_norm": 1.6012862920761108, + "learning_rate": 4.8746414912426896e-05, + "loss": 4.962, + "step": 17022 + }, + { + "epoch": 0.10124060329241602, + "grad_norm": 1.6179730892181396, + "learning_rate": 4.874626885317518e-05, + "loss": 4.6365, + "step": 17023 + }, + { + "epoch": 0.10124655057569702, + "grad_norm": 1.4522144794464111, + "learning_rate": 4.8746122785633885e-05, + "loss": 4.8943, + "step": 17024 + }, + { + "epoch": 0.10125249785897802, + "grad_norm": 1.6087841987609863, + "learning_rate": 4.8745976709803064e-05, + "loss": 4.81, + "step": 17025 + }, + { + "epoch": 0.10125844514225901, + "grad_norm": 1.424810767173767, + "learning_rate": 4.8745830625682766e-05, + "loss": 4.8699, + "step": 17026 + }, + { + "epoch": 0.10126439242554001, + "grad_norm": 1.3316916227340698, + "learning_rate": 4.874568453327304e-05, + "loss": 5.0084, + "step": 17027 + }, + { + "epoch": 0.10127033970882102, + "grad_norm": 1.549833059310913, + "learning_rate": 4.8745538432573946e-05, + "loss": 4.748, + "step": 17028 + }, + { + "epoch": 0.101276286992102, + "grad_norm": 1.294263482093811, + "learning_rate": 4.874539232358553e-05, + "loss": 4.8004, + "step": 17029 + }, + { + "epoch": 0.101282234275383, + "grad_norm": 1.5209519863128662, + "learning_rate": 4.8745246206307845e-05, + "loss": 4.8187, + "step": 17030 + }, + { + "epoch": 0.10128818155866401, + "grad_norm": 1.5805583000183105, + "learning_rate": 4.874510008074094e-05, + "loss": 4.7126, + "step": 17031 + }, + { + "epoch": 0.101294128841945, + "grad_norm": 1.473693609237671, + "learning_rate": 4.8744953946884864e-05, + "loss": 4.86, + "step": 17032 + }, + { + "epoch": 0.101300076125226, + "grad_norm": 1.6662403345108032, + "learning_rate": 4.8744807804739664e-05, + "loss": 4.8903, + "step": 17033 + }, + { + "epoch": 0.10130602340850699, + "grad_norm": 1.5269529819488525, + "learning_rate": 4.87446616543054e-05, + "loss": 5.1061, + "step": 17034 + }, + { + "epoch": 0.10131197069178799, + "grad_norm": 1.3940715789794922, + "learning_rate": 4.8744515495582127e-05, + "loss": 5.3221, + "step": 17035 + }, + { + "epoch": 0.10131791797506899, + "grad_norm": 1.4603626728057861, + "learning_rate": 4.874436932856988e-05, + "loss": 5.2562, + "step": 17036 + }, + { + "epoch": 0.10132386525834998, + "grad_norm": 1.4601393938064575, + "learning_rate": 4.874422315326873e-05, + "loss": 5.1297, + "step": 17037 + }, + { + "epoch": 0.10132981254163098, + "grad_norm": 1.3284024000167847, + "learning_rate": 4.874407696967871e-05, + "loss": 5.2209, + "step": 17038 + }, + { + "epoch": 0.10133575982491198, + "grad_norm": 1.1924611330032349, + "learning_rate": 4.874393077779987e-05, + "loss": 5.265, + "step": 17039 + }, + { + "epoch": 0.10134170710819297, + "grad_norm": 1.1306421756744385, + "learning_rate": 4.874378457763228e-05, + "loss": 5.1637, + "step": 17040 + }, + { + "epoch": 0.10134765439147397, + "grad_norm": 1.414591908454895, + "learning_rate": 4.874363836917598e-05, + "loss": 5.1238, + "step": 17041 + }, + { + "epoch": 0.10135360167475498, + "grad_norm": 1.245263934135437, + "learning_rate": 4.8743492152431016e-05, + "loss": 5.1779, + "step": 17042 + }, + { + "epoch": 0.10135954895803596, + "grad_norm": 1.363484501838684, + "learning_rate": 4.874334592739745e-05, + "loss": 5.1328, + "step": 17043 + }, + { + "epoch": 0.10136549624131697, + "grad_norm": 1.3666833639144897, + "learning_rate": 4.8743199694075326e-05, + "loss": 5.2547, + "step": 17044 + }, + { + "epoch": 0.10137144352459797, + "grad_norm": 1.3848010301589966, + "learning_rate": 4.8743053452464694e-05, + "loss": 5.2745, + "step": 17045 + }, + { + "epoch": 0.10137739080787896, + "grad_norm": 1.4478403329849243, + "learning_rate": 4.87429072025656e-05, + "loss": 5.2069, + "step": 17046 + }, + { + "epoch": 0.10138333809115996, + "grad_norm": 1.5361924171447754, + "learning_rate": 4.8742760944378115e-05, + "loss": 5.1721, + "step": 17047 + }, + { + "epoch": 0.10138928537444096, + "grad_norm": 1.549049973487854, + "learning_rate": 4.874261467790227e-05, + "loss": 5.2525, + "step": 17048 + }, + { + "epoch": 0.10139523265772195, + "grad_norm": 1.484999656677246, + "learning_rate": 4.874246840313813e-05, + "loss": 5.2433, + "step": 17049 + }, + { + "epoch": 0.10140117994100295, + "grad_norm": 1.58607017993927, + "learning_rate": 4.8742322120085734e-05, + "loss": 4.9631, + "step": 17050 + }, + { + "epoch": 0.10140712722428395, + "grad_norm": 1.1922807693481445, + "learning_rate": 4.874217582874514e-05, + "loss": 5.1917, + "step": 17051 + }, + { + "epoch": 0.10141307450756494, + "grad_norm": 1.1538786888122559, + "learning_rate": 4.87420295291164e-05, + "loss": 5.0231, + "step": 17052 + }, + { + "epoch": 0.10141902179084594, + "grad_norm": 1.302758812904358, + "learning_rate": 4.874188322119956e-05, + "loss": 5.0292, + "step": 17053 + }, + { + "epoch": 0.10142496907412694, + "grad_norm": 1.2432395219802856, + "learning_rate": 4.874173690499467e-05, + "loss": 5.1671, + "step": 17054 + }, + { + "epoch": 0.10143091635740793, + "grad_norm": 1.3793164491653442, + "learning_rate": 4.8741590580501786e-05, + "loss": 5.2231, + "step": 17055 + }, + { + "epoch": 0.10143686364068893, + "grad_norm": 1.3487818241119385, + "learning_rate": 4.8741444247720966e-05, + "loss": 5.0464, + "step": 17056 + }, + { + "epoch": 0.10144281092396994, + "grad_norm": 1.512860894203186, + "learning_rate": 4.874129790665225e-05, + "loss": 4.8973, + "step": 17057 + }, + { + "epoch": 0.10144875820725092, + "grad_norm": 1.6202374696731567, + "learning_rate": 4.874115155729569e-05, + "loss": 5.0055, + "step": 17058 + }, + { + "epoch": 0.10145470549053193, + "grad_norm": 1.3453385829925537, + "learning_rate": 4.874100519965134e-05, + "loss": 4.7808, + "step": 17059 + }, + { + "epoch": 0.10146065277381293, + "grad_norm": 1.4613635540008545, + "learning_rate": 4.874085883371925e-05, + "loss": 4.8073, + "step": 17060 + }, + { + "epoch": 0.10146660005709392, + "grad_norm": 1.3086074590682983, + "learning_rate": 4.874071245949947e-05, + "loss": 4.9751, + "step": 17061 + }, + { + "epoch": 0.10147254734037492, + "grad_norm": 1.454784631729126, + "learning_rate": 4.8740566076992055e-05, + "loss": 5.2422, + "step": 17062 + }, + { + "epoch": 0.10147849462365591, + "grad_norm": 1.3406941890716553, + "learning_rate": 4.8740419686197054e-05, + "loss": 5.2342, + "step": 17063 + }, + { + "epoch": 0.10148444190693691, + "grad_norm": 1.3241393566131592, + "learning_rate": 4.8740273287114514e-05, + "loss": 5.2168, + "step": 17064 + }, + { + "epoch": 0.10149038919021791, + "grad_norm": 1.2292134761810303, + "learning_rate": 4.8740126879744495e-05, + "loss": 5.171, + "step": 17065 + }, + { + "epoch": 0.1014963364734989, + "grad_norm": 1.395484209060669, + "learning_rate": 4.8739980464087044e-05, + "loss": 5.1782, + "step": 17066 + }, + { + "epoch": 0.1015022837567799, + "grad_norm": 1.8667857646942139, + "learning_rate": 4.87398340401422e-05, + "loss": 5.7113, + "step": 17067 + }, + { + "epoch": 0.1015082310400609, + "grad_norm": 1.4775335788726807, + "learning_rate": 4.873968760791003e-05, + "loss": 5.2518, + "step": 17068 + }, + { + "epoch": 0.10151417832334189, + "grad_norm": 1.5058828592300415, + "learning_rate": 4.873954116739059e-05, + "loss": 5.3249, + "step": 17069 + }, + { + "epoch": 0.1015201256066229, + "grad_norm": 1.4806468486785889, + "learning_rate": 4.873939471858391e-05, + "loss": 5.1119, + "step": 17070 + }, + { + "epoch": 0.1015260728899039, + "grad_norm": 1.3866868019104004, + "learning_rate": 4.873924826149006e-05, + "loss": 5.1709, + "step": 17071 + }, + { + "epoch": 0.10153202017318488, + "grad_norm": 1.2337566614151, + "learning_rate": 4.8739101796109074e-05, + "loss": 5.2346, + "step": 17072 + }, + { + "epoch": 0.10153796745646589, + "grad_norm": 1.5977396965026855, + "learning_rate": 4.873895532244103e-05, + "loss": 5.4213, + "step": 17073 + }, + { + "epoch": 0.10154391473974689, + "grad_norm": 1.343363642692566, + "learning_rate": 4.873880884048595e-05, + "loss": 5.2865, + "step": 17074 + }, + { + "epoch": 0.10154986202302788, + "grad_norm": 1.4759324789047241, + "learning_rate": 4.87386623502439e-05, + "loss": 5.1743, + "step": 17075 + }, + { + "epoch": 0.10155580930630888, + "grad_norm": 1.2113150358200073, + "learning_rate": 4.873851585171493e-05, + "loss": 5.2218, + "step": 17076 + }, + { + "epoch": 0.10156175658958988, + "grad_norm": 1.3962153196334839, + "learning_rate": 4.873836934489908e-05, + "loss": 5.1031, + "step": 17077 + }, + { + "epoch": 0.10156770387287087, + "grad_norm": 1.410144329071045, + "learning_rate": 4.8738222829796424e-05, + "loss": 5.0662, + "step": 17078 + }, + { + "epoch": 0.10157365115615187, + "grad_norm": 1.224947452545166, + "learning_rate": 4.873807630640699e-05, + "loss": 5.1583, + "step": 17079 + }, + { + "epoch": 0.10157959843943287, + "grad_norm": 1.401877522468567, + "learning_rate": 4.873792977473084e-05, + "loss": 5.2688, + "step": 17080 + }, + { + "epoch": 0.10158554572271386, + "grad_norm": 1.3576874732971191, + "learning_rate": 4.873778323476802e-05, + "loss": 5.037, + "step": 17081 + }, + { + "epoch": 0.10159149300599486, + "grad_norm": 1.226619839668274, + "learning_rate": 4.8737636686518595e-05, + "loss": 5.0502, + "step": 17082 + }, + { + "epoch": 0.10159744028927586, + "grad_norm": 1.2307099103927612, + "learning_rate": 4.87374901299826e-05, + "loss": 5.0855, + "step": 17083 + }, + { + "epoch": 0.10160338757255685, + "grad_norm": 1.1481422185897827, + "learning_rate": 4.873734356516009e-05, + "loss": 5.2114, + "step": 17084 + }, + { + "epoch": 0.10160933485583785, + "grad_norm": 1.4645094871520996, + "learning_rate": 4.873719699205113e-05, + "loss": 5.1432, + "step": 17085 + }, + { + "epoch": 0.10161528213911886, + "grad_norm": 1.3309158086776733, + "learning_rate": 4.873705041065575e-05, + "loss": 5.1557, + "step": 17086 + }, + { + "epoch": 0.10162122942239984, + "grad_norm": 1.2546007633209229, + "learning_rate": 4.873690382097401e-05, + "loss": 5.324, + "step": 17087 + }, + { + "epoch": 0.10162717670568085, + "grad_norm": 1.33823561668396, + "learning_rate": 4.873675722300597e-05, + "loss": 5.1773, + "step": 17088 + }, + { + "epoch": 0.10163312398896185, + "grad_norm": 1.3027381896972656, + "learning_rate": 4.873661061675166e-05, + "loss": 5.4172, + "step": 17089 + }, + { + "epoch": 0.10163907127224284, + "grad_norm": 1.3852121829986572, + "learning_rate": 4.873646400221116e-05, + "loss": 5.1655, + "step": 17090 + }, + { + "epoch": 0.10164501855552384, + "grad_norm": 1.4345825910568237, + "learning_rate": 4.87363173793845e-05, + "loss": 4.9941, + "step": 17091 + }, + { + "epoch": 0.10165096583880483, + "grad_norm": 1.4016261100769043, + "learning_rate": 4.873617074827173e-05, + "loss": 4.9657, + "step": 17092 + }, + { + "epoch": 0.10165691312208583, + "grad_norm": 1.339082956314087, + "learning_rate": 4.8736024108872914e-05, + "loss": 5.0075, + "step": 17093 + }, + { + "epoch": 0.10166286040536683, + "grad_norm": 1.3223985433578491, + "learning_rate": 4.8735877461188094e-05, + "loss": 4.9656, + "step": 17094 + }, + { + "epoch": 0.10166880768864782, + "grad_norm": 1.4618138074874878, + "learning_rate": 4.8735730805217326e-05, + "loss": 5.0158, + "step": 17095 + }, + { + "epoch": 0.10167475497192882, + "grad_norm": 1.4075788259506226, + "learning_rate": 4.8735584140960666e-05, + "loss": 5.3668, + "step": 17096 + }, + { + "epoch": 0.10168070225520982, + "grad_norm": 1.2219016551971436, + "learning_rate": 4.873543746841815e-05, + "loss": 5.3549, + "step": 17097 + }, + { + "epoch": 0.10168664953849081, + "grad_norm": 1.4344584941864014, + "learning_rate": 4.873529078758985e-05, + "loss": 5.2044, + "step": 17098 + }, + { + "epoch": 0.10169259682177181, + "grad_norm": 1.3579001426696777, + "learning_rate": 4.8735144098475794e-05, + "loss": 5.1071, + "step": 17099 + }, + { + "epoch": 0.10169854410505282, + "grad_norm": 1.4645969867706299, + "learning_rate": 4.873499740107604e-05, + "loss": 5.0359, + "step": 17100 + }, + { + "epoch": 0.1017044913883338, + "grad_norm": 1.6800013780593872, + "learning_rate": 4.8734850695390654e-05, + "loss": 5.2085, + "step": 17101 + }, + { + "epoch": 0.1017104386716148, + "grad_norm": 1.678339958190918, + "learning_rate": 4.873470398141968e-05, + "loss": 5.1671, + "step": 17102 + }, + { + "epoch": 0.10171638595489581, + "grad_norm": 1.6498647928237915, + "learning_rate": 4.873455725916316e-05, + "loss": 5.2105, + "step": 17103 + }, + { + "epoch": 0.1017223332381768, + "grad_norm": 1.522147297859192, + "learning_rate": 4.873441052862115e-05, + "loss": 5.1215, + "step": 17104 + }, + { + "epoch": 0.1017282805214578, + "grad_norm": 1.3335652351379395, + "learning_rate": 4.87342637897937e-05, + "loss": 5.2504, + "step": 17105 + }, + { + "epoch": 0.1017342278047388, + "grad_norm": 1.1647717952728271, + "learning_rate": 4.873411704268087e-05, + "loss": 5.3183, + "step": 17106 + }, + { + "epoch": 0.10174017508801979, + "grad_norm": 1.3210188150405884, + "learning_rate": 4.8733970287282706e-05, + "loss": 5.399, + "step": 17107 + }, + { + "epoch": 0.10174612237130079, + "grad_norm": 1.2331137657165527, + "learning_rate": 4.873382352359925e-05, + "loss": 5.2521, + "step": 17108 + }, + { + "epoch": 0.10175206965458179, + "grad_norm": 1.245252251625061, + "learning_rate": 4.873367675163056e-05, + "loss": 5.2092, + "step": 17109 + }, + { + "epoch": 0.10175801693786278, + "grad_norm": 1.3423751592636108, + "learning_rate": 4.87335299713767e-05, + "loss": 4.918, + "step": 17110 + }, + { + "epoch": 0.10176396422114378, + "grad_norm": 1.8670060634613037, + "learning_rate": 4.87333831828377e-05, + "loss": 4.6559, + "step": 17111 + }, + { + "epoch": 0.10176991150442478, + "grad_norm": 1.54763925075531, + "learning_rate": 4.873323638601363e-05, + "loss": 5.2565, + "step": 17112 + }, + { + "epoch": 0.10177585878770577, + "grad_norm": 1.134102702140808, + "learning_rate": 4.8733089580904525e-05, + "loss": 5.2119, + "step": 17113 + }, + { + "epoch": 0.10178180607098677, + "grad_norm": 1.395027756690979, + "learning_rate": 4.873294276751045e-05, + "loss": 5.0732, + "step": 17114 + }, + { + "epoch": 0.10178775335426778, + "grad_norm": 1.104973554611206, + "learning_rate": 4.873279594583144e-05, + "loss": 5.0807, + "step": 17115 + }, + { + "epoch": 0.10179370063754876, + "grad_norm": 1.0554969310760498, + "learning_rate": 4.873264911586757e-05, + "loss": 5.0831, + "step": 17116 + }, + { + "epoch": 0.10179964792082977, + "grad_norm": 1.0598722696304321, + "learning_rate": 4.873250227761887e-05, + "loss": 5.1264, + "step": 17117 + }, + { + "epoch": 0.10180559520411077, + "grad_norm": 1.1047697067260742, + "learning_rate": 4.8732355431085395e-05, + "loss": 5.0687, + "step": 17118 + }, + { + "epoch": 0.10181154248739176, + "grad_norm": 1.5564457178115845, + "learning_rate": 4.87322085762672e-05, + "loss": 5.0063, + "step": 17119 + }, + { + "epoch": 0.10181748977067276, + "grad_norm": 1.5218400955200195, + "learning_rate": 4.8732061713164344e-05, + "loss": 5.3785, + "step": 17120 + }, + { + "epoch": 0.10182343705395375, + "grad_norm": 1.3067396879196167, + "learning_rate": 4.873191484177686e-05, + "loss": 5.4108, + "step": 17121 + }, + { + "epoch": 0.10182938433723475, + "grad_norm": 1.4401333332061768, + "learning_rate": 4.873176796210482e-05, + "loss": 5.5251, + "step": 17122 + }, + { + "epoch": 0.10183533162051575, + "grad_norm": 1.0483810901641846, + "learning_rate": 4.873162107414826e-05, + "loss": 5.4983, + "step": 17123 + }, + { + "epoch": 0.10184127890379674, + "grad_norm": 1.2637344598770142, + "learning_rate": 4.8731474177907244e-05, + "loss": 5.4487, + "step": 17124 + }, + { + "epoch": 0.10184722618707774, + "grad_norm": 1.314834475517273, + "learning_rate": 4.873132727338181e-05, + "loss": 5.228, + "step": 17125 + }, + { + "epoch": 0.10185317347035874, + "grad_norm": 1.354665756225586, + "learning_rate": 4.8731180360572e-05, + "loss": 5.3908, + "step": 17126 + }, + { + "epoch": 0.10185912075363973, + "grad_norm": 1.3690662384033203, + "learning_rate": 4.87310334394779e-05, + "loss": 5.0955, + "step": 17127 + }, + { + "epoch": 0.10186506803692073, + "grad_norm": 1.5240978002548218, + "learning_rate": 4.873088651009954e-05, + "loss": 5.2838, + "step": 17128 + }, + { + "epoch": 0.10187101532020174, + "grad_norm": 1.147658109664917, + "learning_rate": 4.8730739572436966e-05, + "loss": 5.3074, + "step": 17129 + }, + { + "epoch": 0.10187696260348272, + "grad_norm": 1.3384162187576294, + "learning_rate": 4.8730592626490235e-05, + "loss": 5.3677, + "step": 17130 + }, + { + "epoch": 0.10188290988676373, + "grad_norm": 1.3388500213623047, + "learning_rate": 4.87304456722594e-05, + "loss": 5.3151, + "step": 17131 + }, + { + "epoch": 0.10188885717004473, + "grad_norm": 1.215617060661316, + "learning_rate": 4.873029870974452e-05, + "loss": 4.9182, + "step": 17132 + }, + { + "epoch": 0.10189480445332572, + "grad_norm": 1.2983050346374512, + "learning_rate": 4.873015173894563e-05, + "loss": 5.142, + "step": 17133 + }, + { + "epoch": 0.10190075173660672, + "grad_norm": 1.3918750286102295, + "learning_rate": 4.873000475986279e-05, + "loss": 5.0548, + "step": 17134 + }, + { + "epoch": 0.10190669901988772, + "grad_norm": 1.3934828042984009, + "learning_rate": 4.8729857772496045e-05, + "loss": 5.1319, + "step": 17135 + }, + { + "epoch": 0.10191264630316871, + "grad_norm": 1.32583487033844, + "learning_rate": 4.872971077684546e-05, + "loss": 5.2762, + "step": 17136 + }, + { + "epoch": 0.10191859358644971, + "grad_norm": 1.295102834701538, + "learning_rate": 4.872956377291108e-05, + "loss": 5.2338, + "step": 17137 + }, + { + "epoch": 0.10192454086973071, + "grad_norm": 1.2840588092803955, + "learning_rate": 4.8729416760692946e-05, + "loss": 5.3957, + "step": 17138 + }, + { + "epoch": 0.1019304881530117, + "grad_norm": 1.371270775794983, + "learning_rate": 4.872926974019112e-05, + "loss": 5.5933, + "step": 17139 + }, + { + "epoch": 0.1019364354362927, + "grad_norm": 1.380387783050537, + "learning_rate": 4.872912271140565e-05, + "loss": 5.6628, + "step": 17140 + }, + { + "epoch": 0.1019423827195737, + "grad_norm": 1.3120551109313965, + "learning_rate": 4.8728975674336596e-05, + "loss": 5.6424, + "step": 17141 + }, + { + "epoch": 0.10194833000285469, + "grad_norm": 1.3965035676956177, + "learning_rate": 4.8728828628984003e-05, + "loss": 5.5413, + "step": 17142 + }, + { + "epoch": 0.1019542772861357, + "grad_norm": 1.5870885848999023, + "learning_rate": 4.872868157534791e-05, + "loss": 5.1952, + "step": 17143 + }, + { + "epoch": 0.1019602245694167, + "grad_norm": 1.584633231163025, + "learning_rate": 4.872853451342839e-05, + "loss": 5.1045, + "step": 17144 + }, + { + "epoch": 0.10196617185269768, + "grad_norm": 1.5781641006469727, + "learning_rate": 4.872838744322548e-05, + "loss": 4.9581, + "step": 17145 + }, + { + "epoch": 0.10197211913597869, + "grad_norm": 1.3683301210403442, + "learning_rate": 4.872824036473923e-05, + "loss": 4.9931, + "step": 17146 + }, + { + "epoch": 0.10197806641925969, + "grad_norm": 1.4182472229003906, + "learning_rate": 4.87280932779697e-05, + "loss": 4.7815, + "step": 17147 + }, + { + "epoch": 0.10198401370254068, + "grad_norm": 1.464609146118164, + "learning_rate": 4.872794618291694e-05, + "loss": 4.9158, + "step": 17148 + }, + { + "epoch": 0.10198996098582168, + "grad_norm": 1.4733667373657227, + "learning_rate": 4.872779907958099e-05, + "loss": 5.069, + "step": 17149 + }, + { + "epoch": 0.10199590826910268, + "grad_norm": 1.4454584121704102, + "learning_rate": 4.872765196796192e-05, + "loss": 5.1131, + "step": 17150 + }, + { + "epoch": 0.10200185555238367, + "grad_norm": 1.6175665855407715, + "learning_rate": 4.872750484805977e-05, + "loss": 4.9432, + "step": 17151 + }, + { + "epoch": 0.10200780283566467, + "grad_norm": 1.378569483757019, + "learning_rate": 4.872735771987459e-05, + "loss": 4.9243, + "step": 17152 + }, + { + "epoch": 0.10201375011894566, + "grad_norm": 1.452481985092163, + "learning_rate": 4.872721058340644e-05, + "loss": 4.8421, + "step": 17153 + }, + { + "epoch": 0.10201969740222666, + "grad_norm": 1.8265782594680786, + "learning_rate": 4.872706343865536e-05, + "loss": 5.2555, + "step": 17154 + }, + { + "epoch": 0.10202564468550766, + "grad_norm": 1.6913262605667114, + "learning_rate": 4.8726916285621414e-05, + "loss": 5.3829, + "step": 17155 + }, + { + "epoch": 0.10203159196878865, + "grad_norm": 1.6480923891067505, + "learning_rate": 4.8726769124304644e-05, + "loss": 5.4168, + "step": 17156 + }, + { + "epoch": 0.10203753925206965, + "grad_norm": 1.702602744102478, + "learning_rate": 4.8726621954705105e-05, + "loss": 5.4045, + "step": 17157 + }, + { + "epoch": 0.10204348653535066, + "grad_norm": 1.749205470085144, + "learning_rate": 4.8726474776822844e-05, + "loss": 5.5886, + "step": 17158 + }, + { + "epoch": 0.10204943381863164, + "grad_norm": 1.927309274673462, + "learning_rate": 4.8726327590657916e-05, + "loss": 5.5547, + "step": 17159 + }, + { + "epoch": 0.10205538110191265, + "grad_norm": 1.6493511199951172, + "learning_rate": 4.8726180396210374e-05, + "loss": 5.6764, + "step": 17160 + }, + { + "epoch": 0.10206132838519365, + "grad_norm": 1.7083081007003784, + "learning_rate": 4.8726033193480266e-05, + "loss": 5.5823, + "step": 17161 + }, + { + "epoch": 0.10206727566847464, + "grad_norm": 1.7882472276687622, + "learning_rate": 4.872588598246765e-05, + "loss": 5.4388, + "step": 17162 + }, + { + "epoch": 0.10207322295175564, + "grad_norm": 1.6043784618377686, + "learning_rate": 4.872573876317257e-05, + "loss": 5.6816, + "step": 17163 + }, + { + "epoch": 0.10207917023503664, + "grad_norm": 1.3449418544769287, + "learning_rate": 4.872559153559507e-05, + "loss": 5.5661, + "step": 17164 + }, + { + "epoch": 0.10208511751831763, + "grad_norm": 1.7593882083892822, + "learning_rate": 4.8725444299735226e-05, + "loss": 4.95, + "step": 17165 + }, + { + "epoch": 0.10209106480159863, + "grad_norm": 1.8593993186950684, + "learning_rate": 4.872529705559307e-05, + "loss": 5.3296, + "step": 17166 + }, + { + "epoch": 0.10209701208487963, + "grad_norm": 1.7530159950256348, + "learning_rate": 4.872514980316865e-05, + "loss": 5.4378, + "step": 17167 + }, + { + "epoch": 0.10210295936816062, + "grad_norm": 1.7487550973892212, + "learning_rate": 4.872500254246203e-05, + "loss": 5.3435, + "step": 17168 + }, + { + "epoch": 0.10210890665144162, + "grad_norm": 1.7868090867996216, + "learning_rate": 4.8724855273473256e-05, + "loss": 5.2266, + "step": 17169 + }, + { + "epoch": 0.10211485393472262, + "grad_norm": 1.6116459369659424, + "learning_rate": 4.872470799620238e-05, + "loss": 5.2394, + "step": 17170 + }, + { + "epoch": 0.10212080121800361, + "grad_norm": 1.6221721172332764, + "learning_rate": 4.872456071064946e-05, + "loss": 5.823, + "step": 17171 + }, + { + "epoch": 0.10212674850128461, + "grad_norm": 1.462540626525879, + "learning_rate": 4.872441341681454e-05, + "loss": 5.8816, + "step": 17172 + }, + { + "epoch": 0.10213269578456562, + "grad_norm": 1.3804352283477783, + "learning_rate": 4.872426611469766e-05, + "loss": 5.7982, + "step": 17173 + }, + { + "epoch": 0.1021386430678466, + "grad_norm": 1.7873106002807617, + "learning_rate": 4.872411880429889e-05, + "loss": 5.0282, + "step": 17174 + }, + { + "epoch": 0.1021445903511276, + "grad_norm": 1.9154506921768188, + "learning_rate": 4.8723971485618284e-05, + "loss": 4.8535, + "step": 17175 + }, + { + "epoch": 0.10215053763440861, + "grad_norm": 1.865502953529358, + "learning_rate": 4.872382415865587e-05, + "loss": 5.5282, + "step": 17176 + }, + { + "epoch": 0.1021564849176896, + "grad_norm": 1.8683371543884277, + "learning_rate": 4.872367682341173e-05, + "loss": 5.2973, + "step": 17177 + }, + { + "epoch": 0.1021624322009706, + "grad_norm": 1.8488374948501587, + "learning_rate": 4.872352947988589e-05, + "loss": 5.4094, + "step": 17178 + }, + { + "epoch": 0.1021683794842516, + "grad_norm": 1.6702567338943481, + "learning_rate": 4.872338212807841e-05, + "loss": 5.5705, + "step": 17179 + }, + { + "epoch": 0.10217432676753259, + "grad_norm": 1.6559606790542603, + "learning_rate": 4.8723234767989345e-05, + "loss": 5.6637, + "step": 17180 + }, + { + "epoch": 0.10218027405081359, + "grad_norm": 1.523253321647644, + "learning_rate": 4.872308739961875e-05, + "loss": 5.4033, + "step": 17181 + }, + { + "epoch": 0.10218622133409458, + "grad_norm": 1.4300789833068848, + "learning_rate": 4.8722940022966665e-05, + "loss": 5.7568, + "step": 17182 + }, + { + "epoch": 0.10219216861737558, + "grad_norm": 1.5076279640197754, + "learning_rate": 4.872279263803314e-05, + "loss": 4.9469, + "step": 17183 + }, + { + "epoch": 0.10219811590065658, + "grad_norm": 1.721596598625183, + "learning_rate": 4.872264524481824e-05, + "loss": 5.1595, + "step": 17184 + }, + { + "epoch": 0.10220406318393757, + "grad_norm": 1.5876305103302002, + "learning_rate": 4.872249784332201e-05, + "loss": 4.9964, + "step": 17185 + }, + { + "epoch": 0.10221001046721857, + "grad_norm": 1.6709486246109009, + "learning_rate": 4.87223504335445e-05, + "loss": 5.0299, + "step": 17186 + }, + { + "epoch": 0.10221595775049958, + "grad_norm": 1.586411952972412, + "learning_rate": 4.872220301548576e-05, + "loss": 4.9945, + "step": 17187 + }, + { + "epoch": 0.10222190503378056, + "grad_norm": 1.541045069694519, + "learning_rate": 4.872205558914585e-05, + "loss": 4.8789, + "step": 17188 + }, + { + "epoch": 0.10222785231706157, + "grad_norm": 1.8977370262145996, + "learning_rate": 4.872190815452481e-05, + "loss": 4.849, + "step": 17189 + }, + { + "epoch": 0.10223379960034257, + "grad_norm": 1.7448357343673706, + "learning_rate": 4.87217607116227e-05, + "loss": 4.7961, + "step": 17190 + }, + { + "epoch": 0.10223974688362356, + "grad_norm": 1.7249553203582764, + "learning_rate": 4.872161326043957e-05, + "loss": 4.7988, + "step": 17191 + }, + { + "epoch": 0.10224569416690456, + "grad_norm": 1.6894437074661255, + "learning_rate": 4.8721465800975465e-05, + "loss": 4.6713, + "step": 17192 + }, + { + "epoch": 0.10225164145018556, + "grad_norm": 1.5226197242736816, + "learning_rate": 4.8721318333230446e-05, + "loss": 4.8233, + "step": 17193 + }, + { + "epoch": 0.10225758873346655, + "grad_norm": 1.6511256694793701, + "learning_rate": 4.8721170857204554e-05, + "loss": 5.177, + "step": 17194 + }, + { + "epoch": 0.10226353601674755, + "grad_norm": 1.8213993310928345, + "learning_rate": 4.872102337289785e-05, + "loss": 5.2472, + "step": 17195 + }, + { + "epoch": 0.10226948330002855, + "grad_norm": 1.6683803796768188, + "learning_rate": 4.872087588031038e-05, + "loss": 4.7902, + "step": 17196 + }, + { + "epoch": 0.10227543058330954, + "grad_norm": 1.5809015035629272, + "learning_rate": 4.8720728379442204e-05, + "loss": 4.6288, + "step": 17197 + }, + { + "epoch": 0.10228137786659054, + "grad_norm": 1.7978498935699463, + "learning_rate": 4.872058087029336e-05, + "loss": 4.6638, + "step": 17198 + }, + { + "epoch": 0.10228732514987154, + "grad_norm": 1.74656081199646, + "learning_rate": 4.87204333528639e-05, + "loss": 5.652, + "step": 17199 + }, + { + "epoch": 0.10229327243315253, + "grad_norm": 1.6222811937332153, + "learning_rate": 4.87202858271539e-05, + "loss": 5.3951, + "step": 17200 + }, + { + "epoch": 0.10229921971643353, + "grad_norm": 1.8816531896591187, + "learning_rate": 4.8720138293163374e-05, + "loss": 5.728, + "step": 17201 + }, + { + "epoch": 0.10230516699971454, + "grad_norm": 1.5618531703948975, + "learning_rate": 4.871999075089241e-05, + "loss": 5.7162, + "step": 17202 + }, + { + "epoch": 0.10231111428299552, + "grad_norm": 1.4562182426452637, + "learning_rate": 4.871984320034103e-05, + "loss": 5.7563, + "step": 17203 + }, + { + "epoch": 0.10231706156627653, + "grad_norm": 1.8649898767471313, + "learning_rate": 4.87196956415093e-05, + "loss": 5.6333, + "step": 17204 + }, + { + "epoch": 0.10232300884955753, + "grad_norm": 1.7934935092926025, + "learning_rate": 4.871954807439727e-05, + "loss": 5.5804, + "step": 17205 + }, + { + "epoch": 0.10232895613283852, + "grad_norm": 1.5005213022232056, + "learning_rate": 4.8719400499005e-05, + "loss": 5.2471, + "step": 17206 + }, + { + "epoch": 0.10233490341611952, + "grad_norm": 1.5418996810913086, + "learning_rate": 4.871925291533252e-05, + "loss": 6.0574, + "step": 17207 + }, + { + "epoch": 0.10234085069940052, + "grad_norm": 1.3919132947921753, + "learning_rate": 4.87191053233799e-05, + "loss": 6.0048, + "step": 17208 + }, + { + "epoch": 0.10234679798268151, + "grad_norm": 1.9565762281417847, + "learning_rate": 4.8718957723147184e-05, + "loss": 4.9914, + "step": 17209 + }, + { + "epoch": 0.10235274526596251, + "grad_norm": 2.3950796127319336, + "learning_rate": 4.871881011463442e-05, + "loss": 5.7963, + "step": 17210 + }, + { + "epoch": 0.1023586925492435, + "grad_norm": 2.0693960189819336, + "learning_rate": 4.871866249784167e-05, + "loss": 5.4641, + "step": 17211 + }, + { + "epoch": 0.1023646398325245, + "grad_norm": 2.105893850326538, + "learning_rate": 4.871851487276898e-05, + "loss": 5.3983, + "step": 17212 + }, + { + "epoch": 0.1023705871158055, + "grad_norm": 2.171363115310669, + "learning_rate": 4.8718367239416404e-05, + "loss": 5.6619, + "step": 17213 + }, + { + "epoch": 0.10237653439908649, + "grad_norm": 2.141611099243164, + "learning_rate": 4.8718219597783984e-05, + "loss": 5.5488, + "step": 17214 + }, + { + "epoch": 0.1023824816823675, + "grad_norm": 1.8755214214324951, + "learning_rate": 4.871807194787178e-05, + "loss": 5.4888, + "step": 17215 + }, + { + "epoch": 0.1023884289656485, + "grad_norm": 2.0865023136138916, + "learning_rate": 4.871792428967984e-05, + "loss": 5.4645, + "step": 17216 + }, + { + "epoch": 0.10239437624892948, + "grad_norm": 1.9486721754074097, + "learning_rate": 4.871777662320823e-05, + "loss": 5.4057, + "step": 17217 + }, + { + "epoch": 0.10240032353221049, + "grad_norm": 2.109412670135498, + "learning_rate": 4.8717628948456976e-05, + "loss": 5.3768, + "step": 17218 + }, + { + "epoch": 0.10240627081549149, + "grad_norm": 2.202826499938965, + "learning_rate": 4.871748126542615e-05, + "loss": 5.4996, + "step": 17219 + }, + { + "epoch": 0.10241221809877248, + "grad_norm": 1.8646687269210815, + "learning_rate": 4.87173335741158e-05, + "loss": 5.5151, + "step": 17220 + }, + { + "epoch": 0.10241816538205348, + "grad_norm": 1.7966501712799072, + "learning_rate": 4.8717185874525964e-05, + "loss": 5.5548, + "step": 17221 + }, + { + "epoch": 0.10242411266533448, + "grad_norm": 1.9538966417312622, + "learning_rate": 4.8717038166656706e-05, + "loss": 5.6221, + "step": 17222 + }, + { + "epoch": 0.10243005994861547, + "grad_norm": 1.6085959672927856, + "learning_rate": 4.871689045050808e-05, + "loss": 5.2468, + "step": 17223 + }, + { + "epoch": 0.10243600723189647, + "grad_norm": 1.7573461532592773, + "learning_rate": 4.871674272608012e-05, + "loss": 5.5835, + "step": 17224 + }, + { + "epoch": 0.10244195451517747, + "grad_norm": 1.8237701654434204, + "learning_rate": 4.87165949933729e-05, + "loss": 5.3537, + "step": 17225 + }, + { + "epoch": 0.10244790179845846, + "grad_norm": 1.963970422744751, + "learning_rate": 4.8716447252386465e-05, + "loss": 5.5714, + "step": 17226 + }, + { + "epoch": 0.10245384908173946, + "grad_norm": 2.0216476917266846, + "learning_rate": 4.871629950312086e-05, + "loss": 5.4889, + "step": 17227 + }, + { + "epoch": 0.10245979636502046, + "grad_norm": 2.0271217823028564, + "learning_rate": 4.871615174557614e-05, + "loss": 5.5903, + "step": 17228 + }, + { + "epoch": 0.10246574364830145, + "grad_norm": 1.7717560529708862, + "learning_rate": 4.871600397975236e-05, + "loss": 5.3989, + "step": 17229 + }, + { + "epoch": 0.10247169093158245, + "grad_norm": 1.722076416015625, + "learning_rate": 4.8715856205649556e-05, + "loss": 5.526, + "step": 17230 + }, + { + "epoch": 0.10247763821486346, + "grad_norm": 2.124905586242676, + "learning_rate": 4.8715708423267805e-05, + "loss": 5.3835, + "step": 17231 + }, + { + "epoch": 0.10248358549814444, + "grad_norm": 2.2088522911071777, + "learning_rate": 4.8715560632607135e-05, + "loss": 5.5228, + "step": 17232 + }, + { + "epoch": 0.10248953278142545, + "grad_norm": 2.0236847400665283, + "learning_rate": 4.871541283366761e-05, + "loss": 5.3851, + "step": 17233 + }, + { + "epoch": 0.10249548006470645, + "grad_norm": 1.7546913623809814, + "learning_rate": 4.871526502644928e-05, + "loss": 5.2, + "step": 17234 + }, + { + "epoch": 0.10250142734798744, + "grad_norm": 1.9796072244644165, + "learning_rate": 4.87151172109522e-05, + "loss": 5.3873, + "step": 17235 + }, + { + "epoch": 0.10250737463126844, + "grad_norm": 1.5305960178375244, + "learning_rate": 4.8714969387176414e-05, + "loss": 5.1888, + "step": 17236 + }, + { + "epoch": 0.10251332191454944, + "grad_norm": 2.007124185562134, + "learning_rate": 4.871482155512198e-05, + "loss": 5.4024, + "step": 17237 + }, + { + "epoch": 0.10251926919783043, + "grad_norm": 1.8268414735794067, + "learning_rate": 4.871467371478894e-05, + "loss": 5.4289, + "step": 17238 + }, + { + "epoch": 0.10252521648111143, + "grad_norm": 1.9826276302337646, + "learning_rate": 4.871452586617736e-05, + "loss": 5.3222, + "step": 17239 + }, + { + "epoch": 0.10253116376439242, + "grad_norm": 1.7642468214035034, + "learning_rate": 4.8714378009287285e-05, + "loss": 5.3858, + "step": 17240 + }, + { + "epoch": 0.10253711104767342, + "grad_norm": 1.9604185819625854, + "learning_rate": 4.8714230144118764e-05, + "loss": 5.4142, + "step": 17241 + }, + { + "epoch": 0.10254305833095442, + "grad_norm": 2.333829402923584, + "learning_rate": 4.8714082270671844e-05, + "loss": 5.2124, + "step": 17242 + }, + { + "epoch": 0.10254900561423541, + "grad_norm": 1.996928095817566, + "learning_rate": 4.8713934388946593e-05, + "loss": 5.5055, + "step": 17243 + }, + { + "epoch": 0.10255495289751641, + "grad_norm": 2.2702581882476807, + "learning_rate": 4.871378649894304e-05, + "loss": 5.3477, + "step": 17244 + }, + { + "epoch": 0.10256090018079742, + "grad_norm": 1.9696896076202393, + "learning_rate": 4.871363860066126e-05, + "loss": 5.39, + "step": 17245 + }, + { + "epoch": 0.1025668474640784, + "grad_norm": 1.7752536535263062, + "learning_rate": 4.871349069410129e-05, + "loss": 5.326, + "step": 17246 + }, + { + "epoch": 0.1025727947473594, + "grad_norm": 1.798829197883606, + "learning_rate": 4.8713342779263184e-05, + "loss": 5.4066, + "step": 17247 + }, + { + "epoch": 0.10257874203064041, + "grad_norm": 1.975467562675476, + "learning_rate": 4.871319485614699e-05, + "loss": 5.4183, + "step": 17248 + }, + { + "epoch": 0.1025846893139214, + "grad_norm": 2.4021782875061035, + "learning_rate": 4.871304692475277e-05, + "loss": 5.3949, + "step": 17249 + }, + { + "epoch": 0.1025906365972024, + "grad_norm": 1.8973580598831177, + "learning_rate": 4.871289898508058e-05, + "loss": 5.437, + "step": 17250 + }, + { + "epoch": 0.1025965838804834, + "grad_norm": 2.3427937030792236, + "learning_rate": 4.8712751037130446e-05, + "loss": 5.4347, + "step": 17251 + }, + { + "epoch": 0.10260253116376439, + "grad_norm": 1.8699359893798828, + "learning_rate": 4.871260308090245e-05, + "loss": 5.3404, + "step": 17252 + }, + { + "epoch": 0.10260847844704539, + "grad_norm": 2.146106719970703, + "learning_rate": 4.871245511639661e-05, + "loss": 5.3664, + "step": 17253 + }, + { + "epoch": 0.10261442573032639, + "grad_norm": 2.0223419666290283, + "learning_rate": 4.871230714361302e-05, + "loss": 5.4117, + "step": 17254 + }, + { + "epoch": 0.10262037301360738, + "grad_norm": 2.036025047302246, + "learning_rate": 4.871215916255169e-05, + "loss": 5.4349, + "step": 17255 + }, + { + "epoch": 0.10262632029688838, + "grad_norm": 2.0085432529449463, + "learning_rate": 4.87120111732127e-05, + "loss": 5.4896, + "step": 17256 + }, + { + "epoch": 0.10263226758016938, + "grad_norm": 2.088165521621704, + "learning_rate": 4.871186317559609e-05, + "loss": 5.2516, + "step": 17257 + }, + { + "epoch": 0.10263821486345037, + "grad_norm": 1.7493584156036377, + "learning_rate": 4.871171516970191e-05, + "loss": 5.0744, + "step": 17258 + }, + { + "epoch": 0.10264416214673137, + "grad_norm": 1.9395314455032349, + "learning_rate": 4.8711567155530224e-05, + "loss": 5.2783, + "step": 17259 + }, + { + "epoch": 0.10265010943001238, + "grad_norm": 2.057565689086914, + "learning_rate": 4.871141913308107e-05, + "loss": 5.2501, + "step": 17260 + }, + { + "epoch": 0.10265605671329336, + "grad_norm": 2.159641742706299, + "learning_rate": 4.87112711023545e-05, + "loss": 5.2844, + "step": 17261 + }, + { + "epoch": 0.10266200399657437, + "grad_norm": 1.8931914567947388, + "learning_rate": 4.8711123063350575e-05, + "loss": 5.4454, + "step": 17262 + }, + { + "epoch": 0.10266795127985537, + "grad_norm": 1.9728927612304688, + "learning_rate": 4.871097501606934e-05, + "loss": 5.3719, + "step": 17263 + }, + { + "epoch": 0.10267389856313636, + "grad_norm": 1.8770530223846436, + "learning_rate": 4.8710826960510845e-05, + "loss": 5.4244, + "step": 17264 + }, + { + "epoch": 0.10267984584641736, + "grad_norm": 2.072201728820801, + "learning_rate": 4.871067889667516e-05, + "loss": 5.3282, + "step": 17265 + }, + { + "epoch": 0.10268579312969836, + "grad_norm": 2.16689133644104, + "learning_rate": 4.8710530824562304e-05, + "loss": 5.4205, + "step": 17266 + }, + { + "epoch": 0.10269174041297935, + "grad_norm": 2.017695903778076, + "learning_rate": 4.8710382744172354e-05, + "loss": 5.1803, + "step": 17267 + }, + { + "epoch": 0.10269768769626035, + "grad_norm": 1.8181023597717285, + "learning_rate": 4.871023465550535e-05, + "loss": 5.3418, + "step": 17268 + }, + { + "epoch": 0.10270363497954134, + "grad_norm": 1.9661909341812134, + "learning_rate": 4.871008655856136e-05, + "loss": 5.115, + "step": 17269 + }, + { + "epoch": 0.10270958226282234, + "grad_norm": 1.9482250213623047, + "learning_rate": 4.870993845334041e-05, + "loss": 5.0172, + "step": 17270 + }, + { + "epoch": 0.10271552954610334, + "grad_norm": 2.0916497707366943, + "learning_rate": 4.870979033984257e-05, + "loss": 5.4317, + "step": 17271 + }, + { + "epoch": 0.10272147682938433, + "grad_norm": 1.919918417930603, + "learning_rate": 4.8709642218067894e-05, + "loss": 5.3986, + "step": 17272 + }, + { + "epoch": 0.10272742411266533, + "grad_norm": 1.8286259174346924, + "learning_rate": 4.870949408801642e-05, + "loss": 5.1301, + "step": 17273 + }, + { + "epoch": 0.10273337139594634, + "grad_norm": 2.2312278747558594, + "learning_rate": 4.870934594968821e-05, + "loss": 5.0839, + "step": 17274 + }, + { + "epoch": 0.10273931867922732, + "grad_norm": 2.2795724868774414, + "learning_rate": 4.870919780308331e-05, + "loss": 5.3578, + "step": 17275 + }, + { + "epoch": 0.10274526596250833, + "grad_norm": 2.253885269165039, + "learning_rate": 4.870904964820178e-05, + "loss": 5.2482, + "step": 17276 + }, + { + "epoch": 0.10275121324578933, + "grad_norm": 1.9351953268051147, + "learning_rate": 4.870890148504366e-05, + "loss": 5.3657, + "step": 17277 + }, + { + "epoch": 0.10275716052907032, + "grad_norm": 2.072274923324585, + "learning_rate": 4.8708753313609004e-05, + "loss": 5.2433, + "step": 17278 + }, + { + "epoch": 0.10276310781235132, + "grad_norm": 2.0419273376464844, + "learning_rate": 4.8708605133897874e-05, + "loss": 5.27, + "step": 17279 + }, + { + "epoch": 0.10276905509563232, + "grad_norm": 2.156855821609497, + "learning_rate": 4.870845694591031e-05, + "loss": 5.1727, + "step": 17280 + }, + { + "epoch": 0.10277500237891331, + "grad_norm": 1.6552194356918335, + "learning_rate": 4.870830874964637e-05, + "loss": 5.0872, + "step": 17281 + }, + { + "epoch": 0.10278094966219431, + "grad_norm": 1.8167924880981445, + "learning_rate": 4.870816054510611e-05, + "loss": 5.2827, + "step": 17282 + }, + { + "epoch": 0.10278689694547531, + "grad_norm": 2.1617610454559326, + "learning_rate": 4.870801233228956e-05, + "loss": 5.1375, + "step": 17283 + }, + { + "epoch": 0.1027928442287563, + "grad_norm": 1.918817162513733, + "learning_rate": 4.87078641111968e-05, + "loss": 5.2945, + "step": 17284 + }, + { + "epoch": 0.1027987915120373, + "grad_norm": 1.5282881259918213, + "learning_rate": 4.870771588182788e-05, + "loss": 5.6653, + "step": 17285 + }, + { + "epoch": 0.1028047387953183, + "grad_norm": 1.7902590036392212, + "learning_rate": 4.8707567644182825e-05, + "loss": 5.6262, + "step": 17286 + }, + { + "epoch": 0.10281068607859929, + "grad_norm": 1.9451625347137451, + "learning_rate": 4.87074193982617e-05, + "loss": 5.1153, + "step": 17287 + }, + { + "epoch": 0.1028166333618803, + "grad_norm": 1.832401156425476, + "learning_rate": 4.870727114406457e-05, + "loss": 5.2928, + "step": 17288 + }, + { + "epoch": 0.1028225806451613, + "grad_norm": 1.645761251449585, + "learning_rate": 4.870712288159147e-05, + "loss": 5.649, + "step": 17289 + }, + { + "epoch": 0.10282852792844228, + "grad_norm": 1.6721855401992798, + "learning_rate": 4.8706974610842474e-05, + "loss": 5.7568, + "step": 17290 + }, + { + "epoch": 0.10283447521172329, + "grad_norm": 1.7489598989486694, + "learning_rate": 4.87068263318176e-05, + "loss": 5.6752, + "step": 17291 + }, + { + "epoch": 0.10284042249500429, + "grad_norm": 1.505332112312317, + "learning_rate": 4.870667804451693e-05, + "loss": 5.2993, + "step": 17292 + }, + { + "epoch": 0.10284636977828528, + "grad_norm": 1.3620814085006714, + "learning_rate": 4.870652974894049e-05, + "loss": 4.7225, + "step": 17293 + }, + { + "epoch": 0.10285231706156628, + "grad_norm": 2.1685922145843506, + "learning_rate": 4.8706381445088356e-05, + "loss": 4.8737, + "step": 17294 + }, + { + "epoch": 0.10285826434484728, + "grad_norm": 2.219942331314087, + "learning_rate": 4.8706233132960566e-05, + "loss": 5.7529, + "step": 17295 + }, + { + "epoch": 0.10286421162812827, + "grad_norm": 1.928809404373169, + "learning_rate": 4.8706084812557176e-05, + "loss": 5.803, + "step": 17296 + }, + { + "epoch": 0.10287015891140927, + "grad_norm": 1.8534711599349976, + "learning_rate": 4.870593648387823e-05, + "loss": 5.9403, + "step": 17297 + }, + { + "epoch": 0.10287610619469026, + "grad_norm": 2.2624459266662598, + "learning_rate": 4.87057881469238e-05, + "loss": 5.1227, + "step": 17298 + }, + { + "epoch": 0.10288205347797126, + "grad_norm": 2.4320240020751953, + "learning_rate": 4.870563980169391e-05, + "loss": 4.9701, + "step": 17299 + }, + { + "epoch": 0.10288800076125226, + "grad_norm": 2.664921760559082, + "learning_rate": 4.870549144818864e-05, + "loss": 4.8771, + "step": 17300 + }, + { + "epoch": 0.10289394804453325, + "grad_norm": 2.2558987140655518, + "learning_rate": 4.870534308640802e-05, + "loss": 5.0682, + "step": 17301 + }, + { + "epoch": 0.10289989532781425, + "grad_norm": 2.291553258895874, + "learning_rate": 4.870519471635211e-05, + "loss": 4.8481, + "step": 17302 + }, + { + "epoch": 0.10290584261109526, + "grad_norm": 1.9109137058258057, + "learning_rate": 4.870504633802096e-05, + "loss": 5.377, + "step": 17303 + }, + { + "epoch": 0.10291178989437624, + "grad_norm": 1.6809476613998413, + "learning_rate": 4.870489795141463e-05, + "loss": 5.5337, + "step": 17304 + }, + { + "epoch": 0.10291773717765725, + "grad_norm": 1.6410505771636963, + "learning_rate": 4.870474955653316e-05, + "loss": 5.5353, + "step": 17305 + }, + { + "epoch": 0.10292368446093825, + "grad_norm": 1.6310313940048218, + "learning_rate": 4.87046011533766e-05, + "loss": 5.4727, + "step": 17306 + }, + { + "epoch": 0.10292963174421924, + "grad_norm": 1.6450475454330444, + "learning_rate": 4.8704452741945015e-05, + "loss": 5.3677, + "step": 17307 + }, + { + "epoch": 0.10293557902750024, + "grad_norm": 1.7327302694320679, + "learning_rate": 4.870430432223846e-05, + "loss": 5.2964, + "step": 17308 + }, + { + "epoch": 0.10294152631078124, + "grad_norm": 2.837498426437378, + "learning_rate": 4.870415589425696e-05, + "loss": 4.7407, + "step": 17309 + }, + { + "epoch": 0.10294747359406223, + "grad_norm": 2.326399803161621, + "learning_rate": 4.8704007458000593e-05, + "loss": 4.8998, + "step": 17310 + }, + { + "epoch": 0.10295342087734323, + "grad_norm": 1.9505521059036255, + "learning_rate": 4.87038590134694e-05, + "loss": 5.438, + "step": 17311 + }, + { + "epoch": 0.10295936816062423, + "grad_norm": 1.690581202507019, + "learning_rate": 4.870371056066344e-05, + "loss": 5.4291, + "step": 17312 + }, + { + "epoch": 0.10296531544390522, + "grad_norm": 1.9977236986160278, + "learning_rate": 4.870356209958276e-05, + "loss": 5.81, + "step": 17313 + }, + { + "epoch": 0.10297126272718622, + "grad_norm": 1.7996702194213867, + "learning_rate": 4.8703413630227405e-05, + "loss": 5.7569, + "step": 17314 + }, + { + "epoch": 0.10297721001046722, + "grad_norm": 1.7594531774520874, + "learning_rate": 4.870326515259743e-05, + "loss": 5.9367, + "step": 17315 + }, + { + "epoch": 0.10298315729374821, + "grad_norm": 1.8434146642684937, + "learning_rate": 4.870311666669289e-05, + "loss": 5.1578, + "step": 17316 + }, + { + "epoch": 0.10298910457702921, + "grad_norm": 2.531515598297119, + "learning_rate": 4.870296817251385e-05, + "loss": 5.0574, + "step": 17317 + }, + { + "epoch": 0.10299505186031022, + "grad_norm": 2.2126452922821045, + "learning_rate": 4.870281967006034e-05, + "loss": 4.9034, + "step": 17318 + }, + { + "epoch": 0.1030009991435912, + "grad_norm": 2.391558885574341, + "learning_rate": 4.870267115933242e-05, + "loss": 4.9584, + "step": 17319 + }, + { + "epoch": 0.1030069464268722, + "grad_norm": 1.9653453826904297, + "learning_rate": 4.8702522640330145e-05, + "loss": 4.9569, + "step": 17320 + }, + { + "epoch": 0.10301289371015321, + "grad_norm": 2.0124504566192627, + "learning_rate": 4.870237411305356e-05, + "loss": 4.9237, + "step": 17321 + }, + { + "epoch": 0.1030188409934342, + "grad_norm": 1.9120689630508423, + "learning_rate": 4.8702225577502724e-05, + "loss": 4.9637, + "step": 17322 + }, + { + "epoch": 0.1030247882767152, + "grad_norm": 2.108009099960327, + "learning_rate": 4.8702077033677684e-05, + "loss": 4.9479, + "step": 17323 + }, + { + "epoch": 0.1030307355599962, + "grad_norm": 2.211385488510132, + "learning_rate": 4.8701928481578494e-05, + "loss": 4.9553, + "step": 17324 + }, + { + "epoch": 0.10303668284327719, + "grad_norm": 2.1452252864837646, + "learning_rate": 4.8701779921205215e-05, + "loss": 4.7809, + "step": 17325 + }, + { + "epoch": 0.10304263012655819, + "grad_norm": 2.126650810241699, + "learning_rate": 4.8701631352557874e-05, + "loss": 4.7027, + "step": 17326 + }, + { + "epoch": 0.10304857740983918, + "grad_norm": 1.9753129482269287, + "learning_rate": 4.870148277563655e-05, + "loss": 4.8073, + "step": 17327 + }, + { + "epoch": 0.10305452469312018, + "grad_norm": 2.013455867767334, + "learning_rate": 4.8701334190441284e-05, + "loss": 4.7989, + "step": 17328 + }, + { + "epoch": 0.10306047197640118, + "grad_norm": 2.2819676399230957, + "learning_rate": 4.8701185596972124e-05, + "loss": 4.7784, + "step": 17329 + }, + { + "epoch": 0.10306641925968217, + "grad_norm": 2.050511360168457, + "learning_rate": 4.870103699522912e-05, + "loss": 4.9621, + "step": 17330 + }, + { + "epoch": 0.10307236654296317, + "grad_norm": 2.422591209411621, + "learning_rate": 4.870088838521233e-05, + "loss": 4.7558, + "step": 17331 + }, + { + "epoch": 0.10307831382624418, + "grad_norm": 2.2109572887420654, + "learning_rate": 4.870073976692181e-05, + "loss": 4.7162, + "step": 17332 + }, + { + "epoch": 0.10308426110952516, + "grad_norm": 2.070526123046875, + "learning_rate": 4.8700591140357596e-05, + "loss": 4.9765, + "step": 17333 + }, + { + "epoch": 0.10309020839280617, + "grad_norm": 1.610152244567871, + "learning_rate": 4.870044250551976e-05, + "loss": 5.9361, + "step": 17334 + }, + { + "epoch": 0.10309615567608717, + "grad_norm": 1.8921641111373901, + "learning_rate": 4.870029386240834e-05, + "loss": 4.9423, + "step": 17335 + }, + { + "epoch": 0.10310210295936816, + "grad_norm": 2.07476806640625, + "learning_rate": 4.870014521102339e-05, + "loss": 4.7742, + "step": 17336 + }, + { + "epoch": 0.10310805024264916, + "grad_norm": 2.021850824356079, + "learning_rate": 4.869999655136498e-05, + "loss": 4.8182, + "step": 17337 + }, + { + "epoch": 0.10311399752593016, + "grad_norm": 1.5896223783493042, + "learning_rate": 4.869984788343314e-05, + "loss": 5.5694, + "step": 17338 + }, + { + "epoch": 0.10311994480921115, + "grad_norm": 1.1907202005386353, + "learning_rate": 4.869969920722792e-05, + "loss": 5.4427, + "step": 17339 + }, + { + "epoch": 0.10312589209249215, + "grad_norm": 1.56050443649292, + "learning_rate": 4.869955052274938e-05, + "loss": 5.2405, + "step": 17340 + }, + { + "epoch": 0.10313183937577315, + "grad_norm": 1.6611580848693848, + "learning_rate": 4.869940182999757e-05, + "loss": 5.1457, + "step": 17341 + }, + { + "epoch": 0.10313778665905414, + "grad_norm": 1.4664785861968994, + "learning_rate": 4.869925312897256e-05, + "loss": 5.2846, + "step": 17342 + }, + { + "epoch": 0.10314373394233514, + "grad_norm": 1.9751476049423218, + "learning_rate": 4.8699104419674366e-05, + "loss": 5.0283, + "step": 17343 + }, + { + "epoch": 0.10314968122561614, + "grad_norm": 1.715144157409668, + "learning_rate": 4.869895570210307e-05, + "loss": 4.8856, + "step": 17344 + }, + { + "epoch": 0.10315562850889713, + "grad_norm": 1.7803713083267212, + "learning_rate": 4.8698806976258704e-05, + "loss": 5.5573, + "step": 17345 + }, + { + "epoch": 0.10316157579217813, + "grad_norm": 1.4687060117721558, + "learning_rate": 4.8698658242141336e-05, + "loss": 5.2287, + "step": 17346 + }, + { + "epoch": 0.10316752307545914, + "grad_norm": 1.6236404180526733, + "learning_rate": 4.869850949975101e-05, + "loss": 5.1, + "step": 17347 + }, + { + "epoch": 0.10317347035874012, + "grad_norm": 1.6414464712142944, + "learning_rate": 4.869836074908778e-05, + "loss": 5.0884, + "step": 17348 + }, + { + "epoch": 0.10317941764202113, + "grad_norm": 1.5938411951065063, + "learning_rate": 4.86982119901517e-05, + "loss": 5.9405, + "step": 17349 + }, + { + "epoch": 0.10318536492530213, + "grad_norm": 1.7434169054031372, + "learning_rate": 4.869806322294282e-05, + "loss": 6.3698, + "step": 17350 + }, + { + "epoch": 0.10319131220858312, + "grad_norm": 1.4999836683273315, + "learning_rate": 4.8697914447461185e-05, + "loss": 5.4169, + "step": 17351 + }, + { + "epoch": 0.10319725949186412, + "grad_norm": 1.768048644065857, + "learning_rate": 4.869776566370686e-05, + "loss": 5.6703, + "step": 17352 + }, + { + "epoch": 0.10320320677514512, + "grad_norm": 1.734729528427124, + "learning_rate": 4.869761687167988e-05, + "loss": 5.6454, + "step": 17353 + }, + { + "epoch": 0.10320915405842611, + "grad_norm": 1.848308801651001, + "learning_rate": 4.869746807138031e-05, + "loss": 5.742, + "step": 17354 + }, + { + "epoch": 0.10321510134170711, + "grad_norm": 1.628144383430481, + "learning_rate": 4.8697319262808205e-05, + "loss": 5.6099, + "step": 17355 + }, + { + "epoch": 0.1032210486249881, + "grad_norm": 1.5005884170532227, + "learning_rate": 4.86971704459636e-05, + "loss": 5.5419, + "step": 17356 + }, + { + "epoch": 0.1032269959082691, + "grad_norm": 1.5255531072616577, + "learning_rate": 4.869702162084657e-05, + "loss": 5.4757, + "step": 17357 + }, + { + "epoch": 0.1032329431915501, + "grad_norm": 1.549132227897644, + "learning_rate": 4.869687278745715e-05, + "loss": 5.4757, + "step": 17358 + }, + { + "epoch": 0.10323889047483109, + "grad_norm": 1.6518296003341675, + "learning_rate": 4.869672394579539e-05, + "loss": 5.5803, + "step": 17359 + }, + { + "epoch": 0.10324483775811209, + "grad_norm": 2.3987839221954346, + "learning_rate": 4.869657509586136e-05, + "loss": 5.0978, + "step": 17360 + }, + { + "epoch": 0.1032507850413931, + "grad_norm": 1.7290594577789307, + "learning_rate": 4.869642623765509e-05, + "loss": 5.4998, + "step": 17361 + }, + { + "epoch": 0.10325673232467408, + "grad_norm": 1.6334084272384644, + "learning_rate": 4.869627737117665e-05, + "loss": 5.4695, + "step": 17362 + }, + { + "epoch": 0.10326267960795509, + "grad_norm": 1.609734296798706, + "learning_rate": 4.8696128496426074e-05, + "loss": 5.4406, + "step": 17363 + }, + { + "epoch": 0.10326862689123609, + "grad_norm": 1.7579066753387451, + "learning_rate": 4.869597961340343e-05, + "loss": 5.6412, + "step": 17364 + }, + { + "epoch": 0.10327457417451708, + "grad_norm": 1.8831701278686523, + "learning_rate": 4.869583072210877e-05, + "loss": 5.444, + "step": 17365 + }, + { + "epoch": 0.10328052145779808, + "grad_norm": 1.9597128629684448, + "learning_rate": 4.869568182254214e-05, + "loss": 5.2228, + "step": 17366 + }, + { + "epoch": 0.10328646874107908, + "grad_norm": 1.8867931365966797, + "learning_rate": 4.8695532914703584e-05, + "loss": 4.9979, + "step": 17367 + }, + { + "epoch": 0.10329241602436007, + "grad_norm": 1.5480263233184814, + "learning_rate": 4.869538399859317e-05, + "loss": 5.6457, + "step": 17368 + }, + { + "epoch": 0.10329836330764107, + "grad_norm": 1.6710255146026611, + "learning_rate": 4.869523507421093e-05, + "loss": 5.774, + "step": 17369 + }, + { + "epoch": 0.10330431059092207, + "grad_norm": 1.6559721231460571, + "learning_rate": 4.869508614155695e-05, + "loss": 5.5643, + "step": 17370 + }, + { + "epoch": 0.10331025787420306, + "grad_norm": 1.4451355934143066, + "learning_rate": 4.869493720063124e-05, + "loss": 5.4598, + "step": 17371 + }, + { + "epoch": 0.10331620515748406, + "grad_norm": 1.8376599550247192, + "learning_rate": 4.869478825143388e-05, + "loss": 4.7552, + "step": 17372 + }, + { + "epoch": 0.10332215244076506, + "grad_norm": 2.0193891525268555, + "learning_rate": 4.869463929396491e-05, + "loss": 4.5671, + "step": 17373 + }, + { + "epoch": 0.10332809972404605, + "grad_norm": 2.07692551612854, + "learning_rate": 4.869449032822439e-05, + "loss": 4.4776, + "step": 17374 + }, + { + "epoch": 0.10333404700732705, + "grad_norm": 1.820893406867981, + "learning_rate": 4.869434135421237e-05, + "loss": 5.4705, + "step": 17375 + }, + { + "epoch": 0.10333999429060806, + "grad_norm": 1.7207856178283691, + "learning_rate": 4.86941923719289e-05, + "loss": 4.8619, + "step": 17376 + }, + { + "epoch": 0.10334594157388904, + "grad_norm": 1.9348174333572388, + "learning_rate": 4.8694043381374026e-05, + "loss": 4.3723, + "step": 17377 + }, + { + "epoch": 0.10335188885717005, + "grad_norm": 1.8993666172027588, + "learning_rate": 4.869389438254781e-05, + "loss": 4.5442, + "step": 17378 + }, + { + "epoch": 0.10335783614045105, + "grad_norm": 1.9089124202728271, + "learning_rate": 4.869374537545031e-05, + "loss": 4.3347, + "step": 17379 + }, + { + "epoch": 0.10336378342373204, + "grad_norm": 1.8560502529144287, + "learning_rate": 4.869359636008155e-05, + "loss": 4.312, + "step": 17380 + }, + { + "epoch": 0.10336973070701304, + "grad_norm": 1.909680962562561, + "learning_rate": 4.8693447336441614e-05, + "loss": 4.3109, + "step": 17381 + }, + { + "epoch": 0.10337567799029404, + "grad_norm": 1.7769371271133423, + "learning_rate": 4.8693298304530535e-05, + "loss": 4.4442, + "step": 17382 + }, + { + "epoch": 0.10338162527357503, + "grad_norm": 2.080097198486328, + "learning_rate": 4.869314926434837e-05, + "loss": 4.339, + "step": 17383 + }, + { + "epoch": 0.10338757255685603, + "grad_norm": 1.8703278303146362, + "learning_rate": 4.8693000215895176e-05, + "loss": 4.4124, + "step": 17384 + }, + { + "epoch": 0.10339351984013702, + "grad_norm": 1.9553934335708618, + "learning_rate": 4.869285115917099e-05, + "loss": 4.3571, + "step": 17385 + }, + { + "epoch": 0.10339946712341802, + "grad_norm": 1.8989006280899048, + "learning_rate": 4.869270209417588e-05, + "loss": 4.4108, + "step": 17386 + }, + { + "epoch": 0.10340541440669902, + "grad_norm": 1.8347021341323853, + "learning_rate": 4.8692553020909896e-05, + "loss": 4.1529, + "step": 17387 + }, + { + "epoch": 0.10341136168998001, + "grad_norm": 1.9458621740341187, + "learning_rate": 4.869240393937309e-05, + "loss": 4.2392, + "step": 17388 + }, + { + "epoch": 0.10341730897326101, + "grad_norm": 1.8578664064407349, + "learning_rate": 4.86922548495655e-05, + "loss": 4.3238, + "step": 17389 + }, + { + "epoch": 0.10342325625654201, + "grad_norm": 1.9359874725341797, + "learning_rate": 4.869210575148719e-05, + "loss": 4.56, + "step": 17390 + }, + { + "epoch": 0.103429203539823, + "grad_norm": 2.0030486583709717, + "learning_rate": 4.869195664513822e-05, + "loss": 4.1571, + "step": 17391 + }, + { + "epoch": 0.103435150823104, + "grad_norm": 1.9431639909744263, + "learning_rate": 4.869180753051863e-05, + "loss": 4.2181, + "step": 17392 + }, + { + "epoch": 0.10344109810638501, + "grad_norm": 1.9171335697174072, + "learning_rate": 4.869165840762847e-05, + "loss": 4.3139, + "step": 17393 + }, + { + "epoch": 0.103447045389666, + "grad_norm": 1.9467666149139404, + "learning_rate": 4.86915092764678e-05, + "loss": 4.3906, + "step": 17394 + }, + { + "epoch": 0.103452992672947, + "grad_norm": 2.1354262828826904, + "learning_rate": 4.8691360137036666e-05, + "loss": 4.3407, + "step": 17395 + }, + { + "epoch": 0.103458939956228, + "grad_norm": 1.7994540929794312, + "learning_rate": 4.8691210989335126e-05, + "loss": 4.5767, + "step": 17396 + }, + { + "epoch": 0.10346488723950899, + "grad_norm": 1.8322330713272095, + "learning_rate": 4.869106183336323e-05, + "loss": 4.62, + "step": 17397 + }, + { + "epoch": 0.10347083452278999, + "grad_norm": 1.9874459505081177, + "learning_rate": 4.869091266912102e-05, + "loss": 4.2579, + "step": 17398 + }, + { + "epoch": 0.10347678180607099, + "grad_norm": 1.8300455808639526, + "learning_rate": 4.869076349660856e-05, + "loss": 4.3049, + "step": 17399 + }, + { + "epoch": 0.10348272908935198, + "grad_norm": 1.8731672763824463, + "learning_rate": 4.8690614315825914e-05, + "loss": 4.3241, + "step": 17400 + }, + { + "epoch": 0.10348867637263298, + "grad_norm": 1.8587061166763306, + "learning_rate": 4.86904651267731e-05, + "loss": 4.2513, + "step": 17401 + }, + { + "epoch": 0.10349462365591398, + "grad_norm": 1.8614505529403687, + "learning_rate": 4.86903159294502e-05, + "loss": 4.2877, + "step": 17402 + }, + { + "epoch": 0.10350057093919497, + "grad_norm": 1.7118782997131348, + "learning_rate": 4.869016672385725e-05, + "loss": 5.951, + "step": 17403 + }, + { + "epoch": 0.10350651822247597, + "grad_norm": 1.6701730489730835, + "learning_rate": 4.869001750999431e-05, + "loss": 5.8099, + "step": 17404 + }, + { + "epoch": 0.10351246550575698, + "grad_norm": 1.4960297346115112, + "learning_rate": 4.868986828786143e-05, + "loss": 5.7589, + "step": 17405 + }, + { + "epoch": 0.10351841278903796, + "grad_norm": 1.3732372522354126, + "learning_rate": 4.868971905745866e-05, + "loss": 5.8552, + "step": 17406 + }, + { + "epoch": 0.10352436007231897, + "grad_norm": 1.5108624696731567, + "learning_rate": 4.868956981878606e-05, + "loss": 5.82, + "step": 17407 + }, + { + "epoch": 0.10353030735559997, + "grad_norm": 1.8640809059143066, + "learning_rate": 4.868942057184367e-05, + "loss": 5.4388, + "step": 17408 + }, + { + "epoch": 0.10353625463888096, + "grad_norm": 2.082534074783325, + "learning_rate": 4.868927131663154e-05, + "loss": 4.3796, + "step": 17409 + }, + { + "epoch": 0.10354220192216196, + "grad_norm": 1.8963665962219238, + "learning_rate": 4.868912205314975e-05, + "loss": 5.6469, + "step": 17410 + }, + { + "epoch": 0.10354814920544296, + "grad_norm": 1.7797149419784546, + "learning_rate": 4.868897278139832e-05, + "loss": 5.6187, + "step": 17411 + }, + { + "epoch": 0.10355409648872395, + "grad_norm": 1.8464981317520142, + "learning_rate": 4.868882350137732e-05, + "loss": 4.8464, + "step": 17412 + }, + { + "epoch": 0.10356004377200495, + "grad_norm": 1.5401747226715088, + "learning_rate": 4.8688674213086794e-05, + "loss": 5.3547, + "step": 17413 + }, + { + "epoch": 0.10356599105528594, + "grad_norm": 1.4159618616104126, + "learning_rate": 4.868852491652679e-05, + "loss": 5.4428, + "step": 17414 + }, + { + "epoch": 0.10357193833856694, + "grad_norm": 1.6561527252197266, + "learning_rate": 4.868837561169738e-05, + "loss": 5.6467, + "step": 17415 + }, + { + "epoch": 0.10357788562184794, + "grad_norm": 1.659527063369751, + "learning_rate": 4.8688226298598586e-05, + "loss": 5.8631, + "step": 17416 + }, + { + "epoch": 0.10358383290512893, + "grad_norm": 1.8206923007965088, + "learning_rate": 4.868807697723049e-05, + "loss": 5.6475, + "step": 17417 + }, + { + "epoch": 0.10358978018840993, + "grad_norm": 1.9741102457046509, + "learning_rate": 4.868792764759312e-05, + "loss": 4.633, + "step": 17418 + }, + { + "epoch": 0.10359572747169093, + "grad_norm": 1.9505152702331543, + "learning_rate": 4.8687778309686546e-05, + "loss": 4.4024, + "step": 17419 + }, + { + "epoch": 0.10360167475497192, + "grad_norm": 1.7461168766021729, + "learning_rate": 4.868762896351082e-05, + "loss": 5.6505, + "step": 17420 + }, + { + "epoch": 0.10360762203825293, + "grad_norm": 1.6750074625015259, + "learning_rate": 4.868747960906598e-05, + "loss": 5.7747, + "step": 17421 + }, + { + "epoch": 0.10361356932153393, + "grad_norm": 1.5986868143081665, + "learning_rate": 4.8687330246352085e-05, + "loss": 5.2086, + "step": 17422 + }, + { + "epoch": 0.10361951660481492, + "grad_norm": 1.5743950605392456, + "learning_rate": 4.868718087536919e-05, + "loss": 5.6462, + "step": 17423 + }, + { + "epoch": 0.10362546388809592, + "grad_norm": 1.5192588567733765, + "learning_rate": 4.868703149611734e-05, + "loss": 5.5579, + "step": 17424 + }, + { + "epoch": 0.10363141117137692, + "grad_norm": 1.7356244325637817, + "learning_rate": 4.86868821085966e-05, + "loss": 5.5978, + "step": 17425 + }, + { + "epoch": 0.10363735845465791, + "grad_norm": 1.7366925477981567, + "learning_rate": 4.868673271280701e-05, + "loss": 5.3812, + "step": 17426 + }, + { + "epoch": 0.10364330573793891, + "grad_norm": 2.016662836074829, + "learning_rate": 4.868658330874862e-05, + "loss": 5.4003, + "step": 17427 + }, + { + "epoch": 0.10364925302121991, + "grad_norm": 2.022550582885742, + "learning_rate": 4.86864338964215e-05, + "loss": 5.191, + "step": 17428 + }, + { + "epoch": 0.1036552003045009, + "grad_norm": 1.8406000137329102, + "learning_rate": 4.868628447582568e-05, + "loss": 5.9494, + "step": 17429 + }, + { + "epoch": 0.1036611475877819, + "grad_norm": 1.7836806774139404, + "learning_rate": 4.868613504696123e-05, + "loss": 5.4606, + "step": 17430 + }, + { + "epoch": 0.1036670948710629, + "grad_norm": 1.6688835620880127, + "learning_rate": 4.86859856098282e-05, + "loss": 5.2287, + "step": 17431 + }, + { + "epoch": 0.10367304215434389, + "grad_norm": 1.7083512544631958, + "learning_rate": 4.868583616442663e-05, + "loss": 4.7133, + "step": 17432 + }, + { + "epoch": 0.1036789894376249, + "grad_norm": 1.8784829378128052, + "learning_rate": 4.8685686710756576e-05, + "loss": 4.8341, + "step": 17433 + }, + { + "epoch": 0.1036849367209059, + "grad_norm": 2.380962610244751, + "learning_rate": 4.8685537248818105e-05, + "loss": 4.6553, + "step": 17434 + }, + { + "epoch": 0.10369088400418688, + "grad_norm": 1.936126470565796, + "learning_rate": 4.868538777861125e-05, + "loss": 5.0645, + "step": 17435 + }, + { + "epoch": 0.10369683128746789, + "grad_norm": 1.9400380849838257, + "learning_rate": 4.8685238300136065e-05, + "loss": 4.9022, + "step": 17436 + }, + { + "epoch": 0.10370277857074889, + "grad_norm": 2.0275371074676514, + "learning_rate": 4.868508881339261e-05, + "loss": 4.8918, + "step": 17437 + }, + { + "epoch": 0.10370872585402988, + "grad_norm": 1.8734835386276245, + "learning_rate": 4.868493931838094e-05, + "loss": 4.9889, + "step": 17438 + }, + { + "epoch": 0.10371467313731088, + "grad_norm": 2.346519947052002, + "learning_rate": 4.868478981510111e-05, + "loss": 4.4857, + "step": 17439 + }, + { + "epoch": 0.10372062042059188, + "grad_norm": 2.4242961406707764, + "learning_rate": 4.868464030355315e-05, + "loss": 4.034, + "step": 17440 + }, + { + "epoch": 0.10372656770387287, + "grad_norm": 2.3877294063568115, + "learning_rate": 4.8684490783737133e-05, + "loss": 4.2761, + "step": 17441 + }, + { + "epoch": 0.10373251498715387, + "grad_norm": 1.832585096359253, + "learning_rate": 4.8684341255653107e-05, + "loss": 5.1485, + "step": 17442 + }, + { + "epoch": 0.10373846227043486, + "grad_norm": 2.0385608673095703, + "learning_rate": 4.868419171930112e-05, + "loss": 5.7793, + "step": 17443 + }, + { + "epoch": 0.10374440955371586, + "grad_norm": 1.8885849714279175, + "learning_rate": 4.8684042174681225e-05, + "loss": 5.9304, + "step": 17444 + }, + { + "epoch": 0.10375035683699686, + "grad_norm": 1.8748784065246582, + "learning_rate": 4.868389262179348e-05, + "loss": 5.3722, + "step": 17445 + }, + { + "epoch": 0.10375630412027785, + "grad_norm": 1.9851447343826294, + "learning_rate": 4.8683743060637924e-05, + "loss": 5.4734, + "step": 17446 + }, + { + "epoch": 0.10376225140355885, + "grad_norm": 2.387681245803833, + "learning_rate": 4.868359349121463e-05, + "loss": 4.7244, + "step": 17447 + }, + { + "epoch": 0.10376819868683985, + "grad_norm": 1.8236793279647827, + "learning_rate": 4.868344391352363e-05, + "loss": 5.0094, + "step": 17448 + }, + { + "epoch": 0.10377414597012084, + "grad_norm": 1.3649673461914062, + "learning_rate": 4.868329432756498e-05, + "loss": 5.3295, + "step": 17449 + }, + { + "epoch": 0.10378009325340184, + "grad_norm": 1.8916471004486084, + "learning_rate": 4.8683144733338746e-05, + "loss": 5.9443, + "step": 17450 + }, + { + "epoch": 0.10378604053668285, + "grad_norm": 1.8541333675384521, + "learning_rate": 4.868299513084497e-05, + "loss": 5.425, + "step": 17451 + }, + { + "epoch": 0.10379198781996384, + "grad_norm": 1.9708364009857178, + "learning_rate": 4.8682845520083695e-05, + "loss": 5.3254, + "step": 17452 + }, + { + "epoch": 0.10379793510324484, + "grad_norm": 1.7171103954315186, + "learning_rate": 4.8682695901054995e-05, + "loss": 5.3498, + "step": 17453 + }, + { + "epoch": 0.10380388238652584, + "grad_norm": 1.6002514362335205, + "learning_rate": 4.868254627375891e-05, + "loss": 5.1611, + "step": 17454 + }, + { + "epoch": 0.10380982966980683, + "grad_norm": 1.9245331287384033, + "learning_rate": 4.8682396638195486e-05, + "loss": 5.3348, + "step": 17455 + }, + { + "epoch": 0.10381577695308783, + "grad_norm": 1.4742863178253174, + "learning_rate": 4.8682246994364786e-05, + "loss": 5.7573, + "step": 17456 + }, + { + "epoch": 0.10382172423636883, + "grad_norm": 1.929343581199646, + "learning_rate": 4.8682097342266855e-05, + "loss": 5.8469, + "step": 17457 + }, + { + "epoch": 0.10382767151964982, + "grad_norm": 1.6212769746780396, + "learning_rate": 4.8681947681901754e-05, + "loss": 5.9121, + "step": 17458 + }, + { + "epoch": 0.10383361880293082, + "grad_norm": 1.6550590991973877, + "learning_rate": 4.868179801326952e-05, + "loss": 5.7114, + "step": 17459 + }, + { + "epoch": 0.10383956608621182, + "grad_norm": 1.671628475189209, + "learning_rate": 4.868164833637023e-05, + "loss": 5.3988, + "step": 17460 + }, + { + "epoch": 0.10384551336949281, + "grad_norm": 1.5833921432495117, + "learning_rate": 4.868149865120391e-05, + "loss": 5.1952, + "step": 17461 + }, + { + "epoch": 0.10385146065277381, + "grad_norm": 1.8280199766159058, + "learning_rate": 4.868134895777063e-05, + "loss": 5.4812, + "step": 17462 + }, + { + "epoch": 0.10385740793605482, + "grad_norm": 1.7413616180419922, + "learning_rate": 4.868119925607043e-05, + "loss": 5.4119, + "step": 17463 + }, + { + "epoch": 0.1038633552193358, + "grad_norm": 1.6645252704620361, + "learning_rate": 4.868104954610337e-05, + "loss": 5.3546, + "step": 17464 + }, + { + "epoch": 0.1038693025026168, + "grad_norm": 1.634175181388855, + "learning_rate": 4.86808998278695e-05, + "loss": 5.3119, + "step": 17465 + }, + { + "epoch": 0.10387524978589781, + "grad_norm": 1.5220096111297607, + "learning_rate": 4.868075010136887e-05, + "loss": 5.1345, + "step": 17466 + }, + { + "epoch": 0.1038811970691788, + "grad_norm": 1.3279895782470703, + "learning_rate": 4.8680600366601534e-05, + "loss": 5.0071, + "step": 17467 + }, + { + "epoch": 0.1038871443524598, + "grad_norm": 1.4460431337356567, + "learning_rate": 4.8680450623567555e-05, + "loss": 4.8219, + "step": 17468 + }, + { + "epoch": 0.1038930916357408, + "grad_norm": 1.7028027772903442, + "learning_rate": 4.868030087226697e-05, + "loss": 5.2679, + "step": 17469 + }, + { + "epoch": 0.10389903891902179, + "grad_norm": 1.7697324752807617, + "learning_rate": 4.8680151112699835e-05, + "loss": 5.504, + "step": 17470 + }, + { + "epoch": 0.10390498620230279, + "grad_norm": 1.4549357891082764, + "learning_rate": 4.86800013448662e-05, + "loss": 5.4475, + "step": 17471 + }, + { + "epoch": 0.10391093348558378, + "grad_norm": 1.7069107294082642, + "learning_rate": 4.867985156876613e-05, + "loss": 5.5878, + "step": 17472 + }, + { + "epoch": 0.10391688076886478, + "grad_norm": 1.8917819261550903, + "learning_rate": 4.867970178439967e-05, + "loss": 5.4449, + "step": 17473 + }, + { + "epoch": 0.10392282805214578, + "grad_norm": 1.7132060527801514, + "learning_rate": 4.8679551991766856e-05, + "loss": 5.7547, + "step": 17474 + }, + { + "epoch": 0.10392877533542677, + "grad_norm": 1.6535362005233765, + "learning_rate": 4.867940219086777e-05, + "loss": 5.9603, + "step": 17475 + }, + { + "epoch": 0.10393472261870777, + "grad_norm": 1.6559079885482788, + "learning_rate": 4.8679252381702443e-05, + "loss": 5.9673, + "step": 17476 + }, + { + "epoch": 0.10394066990198877, + "grad_norm": 1.5295041799545288, + "learning_rate": 4.867910256427093e-05, + "loss": 5.4502, + "step": 17477 + }, + { + "epoch": 0.10394661718526976, + "grad_norm": 1.8571394681930542, + "learning_rate": 4.8678952738573294e-05, + "loss": 6.1838, + "step": 17478 + }, + { + "epoch": 0.10395256446855076, + "grad_norm": 1.7148513793945312, + "learning_rate": 4.8678802904609576e-05, + "loss": 5.9624, + "step": 17479 + }, + { + "epoch": 0.10395851175183177, + "grad_norm": 1.7191139459609985, + "learning_rate": 4.867865306237983e-05, + "loss": 5.8591, + "step": 17480 + }, + { + "epoch": 0.10396445903511276, + "grad_norm": 1.526285171508789, + "learning_rate": 4.867850321188412e-05, + "loss": 5.988, + "step": 17481 + }, + { + "epoch": 0.10397040631839376, + "grad_norm": 1.5284392833709717, + "learning_rate": 4.867835335312249e-05, + "loss": 5.7212, + "step": 17482 + }, + { + "epoch": 0.10397635360167476, + "grad_norm": 1.5675333738327026, + "learning_rate": 4.8678203486094975e-05, + "loss": 5.5921, + "step": 17483 + }, + { + "epoch": 0.10398230088495575, + "grad_norm": 1.7697393894195557, + "learning_rate": 4.8678053610801654e-05, + "loss": 5.1748, + "step": 17484 + }, + { + "epoch": 0.10398824816823675, + "grad_norm": 1.5940029621124268, + "learning_rate": 4.867790372724257e-05, + "loss": 5.7108, + "step": 17485 + }, + { + "epoch": 0.10399419545151775, + "grad_norm": 2.0347743034362793, + "learning_rate": 4.867775383541777e-05, + "loss": 5.4253, + "step": 17486 + }, + { + "epoch": 0.10400014273479874, + "grad_norm": 2.1038641929626465, + "learning_rate": 4.867760393532732e-05, + "loss": 5.2362, + "step": 17487 + }, + { + "epoch": 0.10400609001807974, + "grad_norm": 2.2253377437591553, + "learning_rate": 4.867745402697126e-05, + "loss": 5.0801, + "step": 17488 + }, + { + "epoch": 0.10401203730136074, + "grad_norm": 1.8215906620025635, + "learning_rate": 4.867730411034964e-05, + "loss": 5.1438, + "step": 17489 + }, + { + "epoch": 0.10401798458464173, + "grad_norm": 1.5428386926651, + "learning_rate": 4.867715418546252e-05, + "loss": 5.0664, + "step": 17490 + }, + { + "epoch": 0.10402393186792273, + "grad_norm": 1.3886137008666992, + "learning_rate": 4.867700425230995e-05, + "loss": 4.992, + "step": 17491 + }, + { + "epoch": 0.10402987915120374, + "grad_norm": 1.4177032709121704, + "learning_rate": 4.867685431089199e-05, + "loss": 4.9245, + "step": 17492 + }, + { + "epoch": 0.10403582643448472, + "grad_norm": 1.2621585130691528, + "learning_rate": 4.867670436120867e-05, + "loss": 4.8902, + "step": 17493 + }, + { + "epoch": 0.10404177371776573, + "grad_norm": 1.4095661640167236, + "learning_rate": 4.867655440326007e-05, + "loss": 4.871, + "step": 17494 + }, + { + "epoch": 0.10404772100104673, + "grad_norm": 1.3117374181747437, + "learning_rate": 4.867640443704622e-05, + "loss": 4.9351, + "step": 17495 + }, + { + "epoch": 0.10405366828432772, + "grad_norm": 1.6237322092056274, + "learning_rate": 4.867625446256719e-05, + "loss": 5.4253, + "step": 17496 + }, + { + "epoch": 0.10405961556760872, + "grad_norm": 2.095696210861206, + "learning_rate": 4.867610447982302e-05, + "loss": 5.1793, + "step": 17497 + }, + { + "epoch": 0.10406556285088972, + "grad_norm": 3.627516508102417, + "learning_rate": 4.867595448881377e-05, + "loss": 5.1206, + "step": 17498 + }, + { + "epoch": 0.10407151013417071, + "grad_norm": 2.0525522232055664, + "learning_rate": 4.8675804489539477e-05, + "loss": 5.5922, + "step": 17499 + }, + { + "epoch": 0.10407745741745171, + "grad_norm": 1.6003656387329102, + "learning_rate": 4.867565448200022e-05, + "loss": 6.0267, + "step": 17500 + }, + { + "epoch": 0.1040834047007327, + "grad_norm": 1.4709582328796387, + "learning_rate": 4.8675504466196034e-05, + "loss": 5.55, + "step": 17501 + }, + { + "epoch": 0.1040893519840137, + "grad_norm": 1.5550457239151, + "learning_rate": 4.8675354442126966e-05, + "loss": 5.6857, + "step": 17502 + }, + { + "epoch": 0.1040952992672947, + "grad_norm": 1.6180169582366943, + "learning_rate": 4.8675204409793085e-05, + "loss": 5.3079, + "step": 17503 + }, + { + "epoch": 0.10410124655057569, + "grad_norm": 1.5625691413879395, + "learning_rate": 4.8675054369194426e-05, + "loss": 5.5965, + "step": 17504 + }, + { + "epoch": 0.10410719383385669, + "grad_norm": 1.4117538928985596, + "learning_rate": 4.8674904320331064e-05, + "loss": 5.7337, + "step": 17505 + }, + { + "epoch": 0.1041131411171377, + "grad_norm": 1.5518572330474854, + "learning_rate": 4.867475426320302e-05, + "loss": 5.5802, + "step": 17506 + }, + { + "epoch": 0.10411908840041868, + "grad_norm": 1.3276773691177368, + "learning_rate": 4.867460419781037e-05, + "loss": 6.0462, + "step": 17507 + }, + { + "epoch": 0.10412503568369968, + "grad_norm": 1.3660519123077393, + "learning_rate": 4.867445412415317e-05, + "loss": 6.0382, + "step": 17508 + }, + { + "epoch": 0.10413098296698069, + "grad_norm": 1.2959636449813843, + "learning_rate": 4.867430404223146e-05, + "loss": 5.8823, + "step": 17509 + }, + { + "epoch": 0.10413693025026168, + "grad_norm": 2.009265899658203, + "learning_rate": 4.867415395204528e-05, + "loss": 4.9889, + "step": 17510 + }, + { + "epoch": 0.10414287753354268, + "grad_norm": 1.3692728281021118, + "learning_rate": 4.8674003853594705e-05, + "loss": 5.2382, + "step": 17511 + }, + { + "epoch": 0.10414882481682368, + "grad_norm": 1.4074095487594604, + "learning_rate": 4.8673853746879785e-05, + "loss": 5.8241, + "step": 17512 + }, + { + "epoch": 0.10415477210010467, + "grad_norm": 1.2155077457427979, + "learning_rate": 4.867370363190057e-05, + "loss": 5.762, + "step": 17513 + }, + { + "epoch": 0.10416071938338567, + "grad_norm": 1.1142069101333618, + "learning_rate": 4.86735535086571e-05, + "loss": 5.7591, + "step": 17514 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 1.1758382320404053, + "learning_rate": 4.867340337714944e-05, + "loss": 5.6534, + "step": 17515 + }, + { + "epoch": 0.10417261394994766, + "grad_norm": 1.2154567241668701, + "learning_rate": 4.867325323737765e-05, + "loss": 5.7465, + "step": 17516 + }, + { + "epoch": 0.10417856123322866, + "grad_norm": 1.3033171892166138, + "learning_rate": 4.867310308934176e-05, + "loss": 5.7701, + "step": 17517 + }, + { + "epoch": 0.10418450851650966, + "grad_norm": 1.3491926193237305, + "learning_rate": 4.867295293304184e-05, + "loss": 5.7883, + "step": 17518 + }, + { + "epoch": 0.10419045579979065, + "grad_norm": 1.223988652229309, + "learning_rate": 4.867280276847793e-05, + "loss": 5.7065, + "step": 17519 + }, + { + "epoch": 0.10419640308307165, + "grad_norm": 1.3885700702667236, + "learning_rate": 4.867265259565009e-05, + "loss": 5.6934, + "step": 17520 + }, + { + "epoch": 0.10420235036635266, + "grad_norm": 1.1616452932357788, + "learning_rate": 4.867250241455837e-05, + "loss": 5.6958, + "step": 17521 + }, + { + "epoch": 0.10420829764963364, + "grad_norm": 1.2696330547332764, + "learning_rate": 4.867235222520283e-05, + "loss": 5.5534, + "step": 17522 + }, + { + "epoch": 0.10421424493291465, + "grad_norm": 1.3539372682571411, + "learning_rate": 4.8672202027583516e-05, + "loss": 5.8028, + "step": 17523 + }, + { + "epoch": 0.10422019221619565, + "grad_norm": 2.547095775604248, + "learning_rate": 4.867205182170048e-05, + "loss": 5.0223, + "step": 17524 + }, + { + "epoch": 0.10422613949947664, + "grad_norm": 1.7378231287002563, + "learning_rate": 4.8671901607553775e-05, + "loss": 5.8356, + "step": 17525 + }, + { + "epoch": 0.10423208678275764, + "grad_norm": 1.9287587404251099, + "learning_rate": 4.867175138514346e-05, + "loss": 5.9694, + "step": 17526 + }, + { + "epoch": 0.10423803406603864, + "grad_norm": 1.685260534286499, + "learning_rate": 4.867160115446957e-05, + "loss": 5.6962, + "step": 17527 + }, + { + "epoch": 0.10424398134931963, + "grad_norm": 1.594699501991272, + "learning_rate": 4.8671450915532176e-05, + "loss": 5.6139, + "step": 17528 + }, + { + "epoch": 0.10424992863260063, + "grad_norm": 1.5966441631317139, + "learning_rate": 4.867130066833132e-05, + "loss": 5.6369, + "step": 17529 + }, + { + "epoch": 0.10425587591588162, + "grad_norm": 1.701524019241333, + "learning_rate": 4.867115041286706e-05, + "loss": 5.6487, + "step": 17530 + }, + { + "epoch": 0.10426182319916262, + "grad_norm": 1.575536847114563, + "learning_rate": 4.8671000149139444e-05, + "loss": 5.5935, + "step": 17531 + }, + { + "epoch": 0.10426777048244362, + "grad_norm": 1.6812626123428345, + "learning_rate": 4.867084987714853e-05, + "loss": 5.4343, + "step": 17532 + }, + { + "epoch": 0.10427371776572461, + "grad_norm": 1.6122568845748901, + "learning_rate": 4.867069959689435e-05, + "loss": 5.5194, + "step": 17533 + }, + { + "epoch": 0.10427966504900561, + "grad_norm": 1.5337659120559692, + "learning_rate": 4.8670549308376996e-05, + "loss": 5.5248, + "step": 17534 + }, + { + "epoch": 0.10428561233228661, + "grad_norm": 1.45541250705719, + "learning_rate": 4.867039901159649e-05, + "loss": 5.6301, + "step": 17535 + }, + { + "epoch": 0.1042915596155676, + "grad_norm": 1.6674455404281616, + "learning_rate": 4.867024870655289e-05, + "loss": 6.1182, + "step": 17536 + }, + { + "epoch": 0.1042975068988486, + "grad_norm": 1.4686870574951172, + "learning_rate": 4.867009839324624e-05, + "loss": 5.9761, + "step": 17537 + }, + { + "epoch": 0.1043034541821296, + "grad_norm": 1.6447898149490356, + "learning_rate": 4.866994807167662e-05, + "loss": 5.4559, + "step": 17538 + }, + { + "epoch": 0.1043094014654106, + "grad_norm": 1.4841620922088623, + "learning_rate": 4.866979774184406e-05, + "loss": 5.4441, + "step": 17539 + }, + { + "epoch": 0.1043153487486916, + "grad_norm": 1.8813121318817139, + "learning_rate": 4.8669647403748616e-05, + "loss": 5.348, + "step": 17540 + }, + { + "epoch": 0.1043212960319726, + "grad_norm": 4.018791198730469, + "learning_rate": 4.866949705739035e-05, + "loss": 5.457, + "step": 17541 + }, + { + "epoch": 0.10432724331525359, + "grad_norm": 2.9932172298431396, + "learning_rate": 4.86693467027693e-05, + "loss": 5.2345, + "step": 17542 + }, + { + "epoch": 0.10433319059853459, + "grad_norm": 1.4329689741134644, + "learning_rate": 4.866919633988553e-05, + "loss": 5.8491, + "step": 17543 + }, + { + "epoch": 0.10433913788181559, + "grad_norm": 1.7308731079101562, + "learning_rate": 4.866904596873909e-05, + "loss": 5.5858, + "step": 17544 + }, + { + "epoch": 0.10434508516509658, + "grad_norm": 2.2066311836242676, + "learning_rate": 4.866889558933002e-05, + "loss": 4.7702, + "step": 17545 + }, + { + "epoch": 0.10435103244837758, + "grad_norm": 1.528171181678772, + "learning_rate": 4.866874520165839e-05, + "loss": 5.1622, + "step": 17546 + }, + { + "epoch": 0.10435697973165858, + "grad_norm": 1.8969347476959229, + "learning_rate": 4.866859480572424e-05, + "loss": 5.0091, + "step": 17547 + }, + { + "epoch": 0.10436292701493957, + "grad_norm": 1.6737502813339233, + "learning_rate": 4.8668444401527644e-05, + "loss": 5.7552, + "step": 17548 + }, + { + "epoch": 0.10436887429822057, + "grad_norm": 1.793411374092102, + "learning_rate": 4.8668293989068626e-05, + "loss": 5.7963, + "step": 17549 + }, + { + "epoch": 0.10437482158150158, + "grad_norm": 1.8675566911697388, + "learning_rate": 4.866814356834725e-05, + "loss": 4.7389, + "step": 17550 + }, + { + "epoch": 0.10438076886478256, + "grad_norm": 1.9145622253417969, + "learning_rate": 4.8667993139363574e-05, + "loss": 5.0921, + "step": 17551 + }, + { + "epoch": 0.10438671614806357, + "grad_norm": 1.6751158237457275, + "learning_rate": 4.866784270211764e-05, + "loss": 5.5547, + "step": 17552 + }, + { + "epoch": 0.10439266343134457, + "grad_norm": 1.754550576210022, + "learning_rate": 4.866769225660951e-05, + "loss": 5.6077, + "step": 17553 + }, + { + "epoch": 0.10439861071462556, + "grad_norm": 2.0323402881622314, + "learning_rate": 4.866754180283924e-05, + "loss": 5.1191, + "step": 17554 + }, + { + "epoch": 0.10440455799790656, + "grad_norm": 1.8000339269638062, + "learning_rate": 4.866739134080687e-05, + "loss": 5.1533, + "step": 17555 + }, + { + "epoch": 0.10441050528118756, + "grad_norm": 2.053093671798706, + "learning_rate": 4.866724087051245e-05, + "loss": 4.9985, + "step": 17556 + }, + { + "epoch": 0.10441645256446855, + "grad_norm": 1.6764185428619385, + "learning_rate": 4.866709039195605e-05, + "loss": 4.9674, + "step": 17557 + }, + { + "epoch": 0.10442239984774955, + "grad_norm": 1.6942695379257202, + "learning_rate": 4.866693990513772e-05, + "loss": 4.9319, + "step": 17558 + }, + { + "epoch": 0.10442834713103054, + "grad_norm": 1.5124322175979614, + "learning_rate": 4.8666789410057496e-05, + "loss": 5.1371, + "step": 17559 + }, + { + "epoch": 0.10443429441431154, + "grad_norm": 1.925757646560669, + "learning_rate": 4.866663890671545e-05, + "loss": 4.6366, + "step": 17560 + }, + { + "epoch": 0.10444024169759254, + "grad_norm": 2.0077321529388428, + "learning_rate": 4.866648839511161e-05, + "loss": 4.9993, + "step": 17561 + }, + { + "epoch": 0.10444618898087353, + "grad_norm": 2.1986982822418213, + "learning_rate": 4.866633787524605e-05, + "loss": 4.814, + "step": 17562 + }, + { + "epoch": 0.10445213626415453, + "grad_norm": 1.9967917203903198, + "learning_rate": 4.866618734711882e-05, + "loss": 4.5182, + "step": 17563 + }, + { + "epoch": 0.10445808354743553, + "grad_norm": 1.7663863897323608, + "learning_rate": 4.8666036810729965e-05, + "loss": 4.5589, + "step": 17564 + }, + { + "epoch": 0.10446403083071652, + "grad_norm": 1.7784098386764526, + "learning_rate": 4.8665886266079537e-05, + "loss": 4.6739, + "step": 17565 + }, + { + "epoch": 0.10446997811399752, + "grad_norm": 1.7143903970718384, + "learning_rate": 4.8665735713167596e-05, + "loss": 4.8434, + "step": 17566 + }, + { + "epoch": 0.10447592539727853, + "grad_norm": 2.018825054168701, + "learning_rate": 4.866558515199419e-05, + "loss": 4.5235, + "step": 17567 + }, + { + "epoch": 0.10448187268055951, + "grad_norm": 2.1135973930358887, + "learning_rate": 4.8665434582559374e-05, + "loss": 4.5048, + "step": 17568 + }, + { + "epoch": 0.10448781996384052, + "grad_norm": 2.097177028656006, + "learning_rate": 4.86652840048632e-05, + "loss": 4.7811, + "step": 17569 + }, + { + "epoch": 0.10449376724712152, + "grad_norm": 2.054049015045166, + "learning_rate": 4.866513341890572e-05, + "loss": 4.5964, + "step": 17570 + }, + { + "epoch": 0.10449971453040251, + "grad_norm": 1.9631117582321167, + "learning_rate": 4.866498282468699e-05, + "loss": 4.4055, + "step": 17571 + }, + { + "epoch": 0.10450566181368351, + "grad_norm": 2.079071521759033, + "learning_rate": 4.8664832222207055e-05, + "loss": 4.3743, + "step": 17572 + }, + { + "epoch": 0.10451160909696451, + "grad_norm": 1.8425450325012207, + "learning_rate": 4.8664681611465966e-05, + "loss": 4.411, + "step": 17573 + }, + { + "epoch": 0.1045175563802455, + "grad_norm": 1.812538743019104, + "learning_rate": 4.866453099246379e-05, + "loss": 4.3496, + "step": 17574 + }, + { + "epoch": 0.1045235036635265, + "grad_norm": 1.8823848962783813, + "learning_rate": 4.8664380365200566e-05, + "loss": 4.3613, + "step": 17575 + }, + { + "epoch": 0.1045294509468075, + "grad_norm": 1.6085865497589111, + "learning_rate": 4.8664229729676356e-05, + "loss": 4.5187, + "step": 17576 + }, + { + "epoch": 0.10453539823008849, + "grad_norm": 1.8719606399536133, + "learning_rate": 4.8664079085891204e-05, + "loss": 4.7276, + "step": 17577 + }, + { + "epoch": 0.1045413455133695, + "grad_norm": 1.7630116939544678, + "learning_rate": 4.866392843384517e-05, + "loss": 4.3749, + "step": 17578 + }, + { + "epoch": 0.1045472927966505, + "grad_norm": 1.8641449213027954, + "learning_rate": 4.86637777735383e-05, + "loss": 4.5781, + "step": 17579 + }, + { + "epoch": 0.10455324007993148, + "grad_norm": 1.8178362846374512, + "learning_rate": 4.8663627104970645e-05, + "loss": 4.3217, + "step": 17580 + }, + { + "epoch": 0.10455918736321249, + "grad_norm": 1.7655141353607178, + "learning_rate": 4.866347642814228e-05, + "loss": 4.4972, + "step": 17581 + }, + { + "epoch": 0.10456513464649349, + "grad_norm": 1.843266248703003, + "learning_rate": 4.8663325743053216e-05, + "loss": 4.5214, + "step": 17582 + }, + { + "epoch": 0.10457108192977448, + "grad_norm": 1.8023161888122559, + "learning_rate": 4.866317504970354e-05, + "loss": 4.3205, + "step": 17583 + }, + { + "epoch": 0.10457702921305548, + "grad_norm": 1.7845708131790161, + "learning_rate": 4.8663024348093296e-05, + "loss": 4.1439, + "step": 17584 + }, + { + "epoch": 0.10458297649633648, + "grad_norm": 2.0029754638671875, + "learning_rate": 4.866287363822253e-05, + "loss": 4.4627, + "step": 17585 + }, + { + "epoch": 0.10458892377961747, + "grad_norm": 1.6008789539337158, + "learning_rate": 4.8662722920091305e-05, + "loss": 4.5539, + "step": 17586 + }, + { + "epoch": 0.10459487106289847, + "grad_norm": 1.884207844734192, + "learning_rate": 4.8662572193699664e-05, + "loss": 4.1132, + "step": 17587 + }, + { + "epoch": 0.10460081834617946, + "grad_norm": 1.7014282941818237, + "learning_rate": 4.866242145904767e-05, + "loss": 4.9612, + "step": 17588 + }, + { + "epoch": 0.10460676562946046, + "grad_norm": 1.7388410568237305, + "learning_rate": 4.8662270716135364e-05, + "loss": 5.3079, + "step": 17589 + }, + { + "epoch": 0.10461271291274146, + "grad_norm": 1.6414510011672974, + "learning_rate": 4.8662119964962805e-05, + "loss": 5.5816, + "step": 17590 + }, + { + "epoch": 0.10461866019602245, + "grad_norm": 1.4039387702941895, + "learning_rate": 4.866196920553004e-05, + "loss": 5.0036, + "step": 17591 + }, + { + "epoch": 0.10462460747930345, + "grad_norm": 1.7621723413467407, + "learning_rate": 4.866181843783712e-05, + "loss": 5.3461, + "step": 17592 + }, + { + "epoch": 0.10463055476258445, + "grad_norm": 1.4525210857391357, + "learning_rate": 4.866166766188412e-05, + "loss": 5.2897, + "step": 17593 + }, + { + "epoch": 0.10463650204586544, + "grad_norm": 1.4203788042068481, + "learning_rate": 4.866151687767107e-05, + "loss": 5.2506, + "step": 17594 + }, + { + "epoch": 0.10464244932914644, + "grad_norm": 1.419097900390625, + "learning_rate": 4.866136608519803e-05, + "loss": 5.246, + "step": 17595 + }, + { + "epoch": 0.10464839661242745, + "grad_norm": 1.8866242170333862, + "learning_rate": 4.8661215284465047e-05, + "loss": 5.5259, + "step": 17596 + }, + { + "epoch": 0.10465434389570843, + "grad_norm": 1.5161887407302856, + "learning_rate": 4.866106447547218e-05, + "loss": 5.2219, + "step": 17597 + }, + { + "epoch": 0.10466029117898944, + "grad_norm": 1.3552051782608032, + "learning_rate": 4.866091365821948e-05, + "loss": 4.9473, + "step": 17598 + }, + { + "epoch": 0.10466623846227044, + "grad_norm": 1.3443762063980103, + "learning_rate": 4.8660762832707e-05, + "loss": 5.0027, + "step": 17599 + }, + { + "epoch": 0.10467218574555143, + "grad_norm": 1.5657448768615723, + "learning_rate": 4.866061199893479e-05, + "loss": 5.3873, + "step": 17600 + }, + { + "epoch": 0.10467813302883243, + "grad_norm": 1.177984595298767, + "learning_rate": 4.866046115690291e-05, + "loss": 4.8628, + "step": 17601 + }, + { + "epoch": 0.10468408031211343, + "grad_norm": 1.1911925077438354, + "learning_rate": 4.8660310306611405e-05, + "loss": 4.7862, + "step": 17602 + }, + { + "epoch": 0.10469002759539442, + "grad_norm": 1.238619327545166, + "learning_rate": 4.866015944806033e-05, + "loss": 4.6844, + "step": 17603 + }, + { + "epoch": 0.10469597487867542, + "grad_norm": 1.4151804447174072, + "learning_rate": 4.8660008581249736e-05, + "loss": 4.7824, + "step": 17604 + }, + { + "epoch": 0.10470192216195642, + "grad_norm": 1.1852803230285645, + "learning_rate": 4.8659857706179676e-05, + "loss": 4.8358, + "step": 17605 + }, + { + "epoch": 0.10470786944523741, + "grad_norm": 1.2641617059707642, + "learning_rate": 4.865970682285022e-05, + "loss": 4.688, + "step": 17606 + }, + { + "epoch": 0.10471381672851841, + "grad_norm": 1.3711220026016235, + "learning_rate": 4.865955593126138e-05, + "loss": 4.6552, + "step": 17607 + }, + { + "epoch": 0.10471976401179942, + "grad_norm": 1.5641502141952515, + "learning_rate": 4.865940503141325e-05, + "loss": 5.0781, + "step": 17608 + }, + { + "epoch": 0.1047257112950804, + "grad_norm": 1.5290453433990479, + "learning_rate": 4.865925412330586e-05, + "loss": 5.1347, + "step": 17609 + }, + { + "epoch": 0.1047316585783614, + "grad_norm": 1.6220836639404297, + "learning_rate": 4.8659103206939275e-05, + "loss": 5.2943, + "step": 17610 + }, + { + "epoch": 0.10473760586164241, + "grad_norm": 1.4212614297866821, + "learning_rate": 4.865895228231353e-05, + "loss": 5.2939, + "step": 17611 + }, + { + "epoch": 0.1047435531449234, + "grad_norm": 1.4920703172683716, + "learning_rate": 4.8658801349428696e-05, + "loss": 5.3314, + "step": 17612 + }, + { + "epoch": 0.1047495004282044, + "grad_norm": 1.4596521854400635, + "learning_rate": 4.865865040828482e-05, + "loss": 5.3082, + "step": 17613 + }, + { + "epoch": 0.1047554477114854, + "grad_norm": 1.2887258529663086, + "learning_rate": 4.865849945888195e-05, + "loss": 5.1002, + "step": 17614 + }, + { + "epoch": 0.10476139499476639, + "grad_norm": 1.3587419986724854, + "learning_rate": 4.8658348501220145e-05, + "loss": 4.9773, + "step": 17615 + }, + { + "epoch": 0.10476734227804739, + "grad_norm": 1.5476746559143066, + "learning_rate": 4.865819753529945e-05, + "loss": 5.0726, + "step": 17616 + }, + { + "epoch": 0.10477328956132839, + "grad_norm": 1.2820343971252441, + "learning_rate": 4.865804656111993e-05, + "loss": 5.0708, + "step": 17617 + }, + { + "epoch": 0.10477923684460938, + "grad_norm": 1.5396101474761963, + "learning_rate": 4.8657895578681634e-05, + "loss": 5.087, + "step": 17618 + }, + { + "epoch": 0.10478518412789038, + "grad_norm": 1.9199161529541016, + "learning_rate": 4.86577445879846e-05, + "loss": 4.9402, + "step": 17619 + }, + { + "epoch": 0.10479113141117137, + "grad_norm": 1.6283903121948242, + "learning_rate": 4.8657593589028894e-05, + "loss": 5.2045, + "step": 17620 + }, + { + "epoch": 0.10479707869445237, + "grad_norm": 1.350632905960083, + "learning_rate": 4.865744258181457e-05, + "loss": 5.2314, + "step": 17621 + }, + { + "epoch": 0.10480302597773337, + "grad_norm": 1.5528992414474487, + "learning_rate": 4.865729156634168e-05, + "loss": 4.9361, + "step": 17622 + }, + { + "epoch": 0.10480897326101436, + "grad_norm": 1.4698718786239624, + "learning_rate": 4.865714054261027e-05, + "loss": 5.6547, + "step": 17623 + }, + { + "epoch": 0.10481492054429536, + "grad_norm": 1.2905457019805908, + "learning_rate": 4.86569895106204e-05, + "loss": 5.5628, + "step": 17624 + }, + { + "epoch": 0.10482086782757637, + "grad_norm": 1.2559312582015991, + "learning_rate": 4.8656838470372116e-05, + "loss": 5.3106, + "step": 17625 + }, + { + "epoch": 0.10482681511085735, + "grad_norm": 1.2229273319244385, + "learning_rate": 4.8656687421865466e-05, + "loss": 5.1566, + "step": 17626 + }, + { + "epoch": 0.10483276239413836, + "grad_norm": 1.4148969650268555, + "learning_rate": 4.8656536365100524e-05, + "loss": 5.1785, + "step": 17627 + }, + { + "epoch": 0.10483870967741936, + "grad_norm": 1.4109671115875244, + "learning_rate": 4.865638530007732e-05, + "loss": 4.922, + "step": 17628 + }, + { + "epoch": 0.10484465696070035, + "grad_norm": 1.526160478591919, + "learning_rate": 4.865623422679593e-05, + "loss": 5.0734, + "step": 17629 + }, + { + "epoch": 0.10485060424398135, + "grad_norm": 1.5093508958816528, + "learning_rate": 4.865608314525638e-05, + "loss": 5.1926, + "step": 17630 + }, + { + "epoch": 0.10485655152726235, + "grad_norm": 1.4625009298324585, + "learning_rate": 4.8655932055458734e-05, + "loss": 5.1372, + "step": 17631 + }, + { + "epoch": 0.10486249881054334, + "grad_norm": 1.348502516746521, + "learning_rate": 4.865578095740305e-05, + "loss": 5.0275, + "step": 17632 + }, + { + "epoch": 0.10486844609382434, + "grad_norm": 1.4530283212661743, + "learning_rate": 4.865562985108938e-05, + "loss": 5.093, + "step": 17633 + }, + { + "epoch": 0.10487439337710534, + "grad_norm": 1.4871639013290405, + "learning_rate": 4.865547873651778e-05, + "loss": 5.0789, + "step": 17634 + }, + { + "epoch": 0.10488034066038633, + "grad_norm": 1.2314977645874023, + "learning_rate": 4.865532761368828e-05, + "loss": 5.0966, + "step": 17635 + }, + { + "epoch": 0.10488628794366733, + "grad_norm": 1.3988053798675537, + "learning_rate": 4.865517648260097e-05, + "loss": 5.4284, + "step": 17636 + }, + { + "epoch": 0.10489223522694834, + "grad_norm": 1.3434901237487793, + "learning_rate": 4.865502534325587e-05, + "loss": 5.3563, + "step": 17637 + }, + { + "epoch": 0.10489818251022932, + "grad_norm": 1.3380807638168335, + "learning_rate": 4.865487419565305e-05, + "loss": 5.3628, + "step": 17638 + }, + { + "epoch": 0.10490412979351033, + "grad_norm": 1.5222781896591187, + "learning_rate": 4.865472303979255e-05, + "loss": 5.2164, + "step": 17639 + }, + { + "epoch": 0.10491007707679133, + "grad_norm": 1.2916938066482544, + "learning_rate": 4.865457187567444e-05, + "loss": 5.1248, + "step": 17640 + }, + { + "epoch": 0.10491602436007232, + "grad_norm": 1.4988411664962769, + "learning_rate": 4.8654420703298755e-05, + "loss": 5.0932, + "step": 17641 + }, + { + "epoch": 0.10492197164335332, + "grad_norm": 1.2529023885726929, + "learning_rate": 4.8654269522665564e-05, + "loss": 5.1465, + "step": 17642 + }, + { + "epoch": 0.10492791892663432, + "grad_norm": 1.3913809061050415, + "learning_rate": 4.86541183337749e-05, + "loss": 5.0039, + "step": 17643 + }, + { + "epoch": 0.10493386620991531, + "grad_norm": 1.5128841400146484, + "learning_rate": 4.8653967136626836e-05, + "loss": 4.9937, + "step": 17644 + }, + { + "epoch": 0.10493981349319631, + "grad_norm": 1.3300340175628662, + "learning_rate": 4.865381593122142e-05, + "loss": 5.0521, + "step": 17645 + }, + { + "epoch": 0.10494576077647731, + "grad_norm": 1.6548517942428589, + "learning_rate": 4.86536647175587e-05, + "loss": 5.1361, + "step": 17646 + }, + { + "epoch": 0.1049517080597583, + "grad_norm": 1.2479137182235718, + "learning_rate": 4.865351349563873e-05, + "loss": 5.3129, + "step": 17647 + }, + { + "epoch": 0.1049576553430393, + "grad_norm": 1.3804575204849243, + "learning_rate": 4.8653362265461556e-05, + "loss": 4.9891, + "step": 17648 + }, + { + "epoch": 0.10496360262632029, + "grad_norm": 1.2821561098098755, + "learning_rate": 4.865321102702724e-05, + "loss": 5.0255, + "step": 17649 + }, + { + "epoch": 0.10496954990960129, + "grad_norm": 1.5715882778167725, + "learning_rate": 4.865305978033583e-05, + "loss": 4.9897, + "step": 17650 + }, + { + "epoch": 0.1049754971928823, + "grad_norm": 1.5910687446594238, + "learning_rate": 4.865290852538738e-05, + "loss": 5.1387, + "step": 17651 + }, + { + "epoch": 0.10498144447616328, + "grad_norm": 1.4188683032989502, + "learning_rate": 4.865275726218196e-05, + "loss": 5.3502, + "step": 17652 + }, + { + "epoch": 0.10498739175944428, + "grad_norm": 1.6032958030700684, + "learning_rate": 4.8652605990719594e-05, + "loss": 5.2716, + "step": 17653 + }, + { + "epoch": 0.10499333904272529, + "grad_norm": 1.4894942045211792, + "learning_rate": 4.8652454711000353e-05, + "loss": 5.237, + "step": 17654 + }, + { + "epoch": 0.10499928632600627, + "grad_norm": 1.5370794534683228, + "learning_rate": 4.8652303423024276e-05, + "loss": 5.0227, + "step": 17655 + }, + { + "epoch": 0.10500523360928728, + "grad_norm": 1.4100168943405151, + "learning_rate": 4.865215212679143e-05, + "loss": 5.0713, + "step": 17656 + }, + { + "epoch": 0.10501118089256828, + "grad_norm": 1.6180533170700073, + "learning_rate": 4.8652000822301856e-05, + "loss": 5.2041, + "step": 17657 + }, + { + "epoch": 0.10501712817584927, + "grad_norm": 1.2447609901428223, + "learning_rate": 4.865184950955562e-05, + "loss": 5.1073, + "step": 17658 + }, + { + "epoch": 0.10502307545913027, + "grad_norm": 1.4866548776626587, + "learning_rate": 4.865169818855277e-05, + "loss": 5.1287, + "step": 17659 + }, + { + "epoch": 0.10502902274241127, + "grad_norm": 1.33426034450531, + "learning_rate": 4.865154685929335e-05, + "loss": 5.1343, + "step": 17660 + }, + { + "epoch": 0.10503497002569226, + "grad_norm": 1.122551679611206, + "learning_rate": 4.865139552177742e-05, + "loss": 5.1267, + "step": 17661 + }, + { + "epoch": 0.10504091730897326, + "grad_norm": 1.787278175354004, + "learning_rate": 4.865124417600504e-05, + "loss": 5.4828, + "step": 17662 + }, + { + "epoch": 0.10504686459225426, + "grad_norm": 1.4937405586242676, + "learning_rate": 4.8651092821976246e-05, + "loss": 5.3467, + "step": 17663 + }, + { + "epoch": 0.10505281187553525, + "grad_norm": 1.395286202430725, + "learning_rate": 4.86509414596911e-05, + "loss": 5.1552, + "step": 17664 + }, + { + "epoch": 0.10505875915881625, + "grad_norm": 1.5284260511398315, + "learning_rate": 4.865079008914965e-05, + "loss": 5.2718, + "step": 17665 + }, + { + "epoch": 0.10506470644209726, + "grad_norm": 2.0051753520965576, + "learning_rate": 4.865063871035197e-05, + "loss": 5.1121, + "step": 17666 + }, + { + "epoch": 0.10507065372537824, + "grad_norm": 1.690699577331543, + "learning_rate": 4.8650487323298085e-05, + "loss": 5.1091, + "step": 17667 + }, + { + "epoch": 0.10507660100865925, + "grad_norm": 1.5275843143463135, + "learning_rate": 4.865033592798807e-05, + "loss": 5.3064, + "step": 17668 + }, + { + "epoch": 0.10508254829194025, + "grad_norm": 1.584038496017456, + "learning_rate": 4.865018452442195e-05, + "loss": 5.2598, + "step": 17669 + }, + { + "epoch": 0.10508849557522124, + "grad_norm": 1.8086310625076294, + "learning_rate": 4.865003311259981e-05, + "loss": 5.2229, + "step": 17670 + }, + { + "epoch": 0.10509444285850224, + "grad_norm": 1.805972695350647, + "learning_rate": 4.864988169252168e-05, + "loss": 5.1051, + "step": 17671 + }, + { + "epoch": 0.10510039014178324, + "grad_norm": 1.6209838390350342, + "learning_rate": 4.864973026418762e-05, + "loss": 5.1808, + "step": 17672 + }, + { + "epoch": 0.10510633742506423, + "grad_norm": 1.3997793197631836, + "learning_rate": 4.8649578827597684e-05, + "loss": 4.9167, + "step": 17673 + }, + { + "epoch": 0.10511228470834523, + "grad_norm": 1.368037462234497, + "learning_rate": 4.8649427382751925e-05, + "loss": 4.98, + "step": 17674 + }, + { + "epoch": 0.10511823199162623, + "grad_norm": 1.3904718160629272, + "learning_rate": 4.864927592965039e-05, + "loss": 4.8101, + "step": 17675 + }, + { + "epoch": 0.10512417927490722, + "grad_norm": 1.3237133026123047, + "learning_rate": 4.864912446829315e-05, + "loss": 5.1427, + "step": 17676 + }, + { + "epoch": 0.10513012655818822, + "grad_norm": 1.2642048597335815, + "learning_rate": 4.864897299868024e-05, + "loss": 5.2961, + "step": 17677 + }, + { + "epoch": 0.10513607384146921, + "grad_norm": 1.4357531070709229, + "learning_rate": 4.864882152081172e-05, + "loss": 5.4811, + "step": 17678 + }, + { + "epoch": 0.10514202112475021, + "grad_norm": 1.652321696281433, + "learning_rate": 4.864867003468763e-05, + "loss": 5.2172, + "step": 17679 + }, + { + "epoch": 0.10514796840803121, + "grad_norm": 1.6143925189971924, + "learning_rate": 4.864851854030804e-05, + "loss": 4.9856, + "step": 17680 + }, + { + "epoch": 0.1051539156913122, + "grad_norm": 1.637320637702942, + "learning_rate": 4.8648367037673e-05, + "loss": 4.9458, + "step": 17681 + }, + { + "epoch": 0.1051598629745932, + "grad_norm": 1.650970458984375, + "learning_rate": 4.864821552678256e-05, + "loss": 4.714, + "step": 17682 + }, + { + "epoch": 0.1051658102578742, + "grad_norm": 1.616098403930664, + "learning_rate": 4.864806400763676e-05, + "loss": 4.7064, + "step": 17683 + }, + { + "epoch": 0.1051717575411552, + "grad_norm": 1.6400461196899414, + "learning_rate": 4.864791248023568e-05, + "loss": 4.5955, + "step": 17684 + }, + { + "epoch": 0.1051777048244362, + "grad_norm": 1.3815523386001587, + "learning_rate": 4.8647760944579344e-05, + "loss": 4.7491, + "step": 17685 + }, + { + "epoch": 0.1051836521077172, + "grad_norm": 1.5695693492889404, + "learning_rate": 4.864760940066783e-05, + "loss": 4.6242, + "step": 17686 + }, + { + "epoch": 0.10518959939099819, + "grad_norm": 1.5861409902572632, + "learning_rate": 4.8647457848501174e-05, + "loss": 4.5859, + "step": 17687 + }, + { + "epoch": 0.10519554667427919, + "grad_norm": 1.637741208076477, + "learning_rate": 4.864730628807944e-05, + "loss": 4.6572, + "step": 17688 + }, + { + "epoch": 0.10520149395756019, + "grad_norm": 1.5806957483291626, + "learning_rate": 4.864715471940268e-05, + "loss": 4.8879, + "step": 17689 + }, + { + "epoch": 0.10520744124084118, + "grad_norm": 2.0158286094665527, + "learning_rate": 4.864700314247093e-05, + "loss": 5.5019, + "step": 17690 + }, + { + "epoch": 0.10521338852412218, + "grad_norm": 1.5022921562194824, + "learning_rate": 4.8646851557284256e-05, + "loss": 5.2029, + "step": 17691 + }, + { + "epoch": 0.10521933580740318, + "grad_norm": 1.8164446353912354, + "learning_rate": 4.864669996384272e-05, + "loss": 4.9258, + "step": 17692 + }, + { + "epoch": 0.10522528309068417, + "grad_norm": 1.6789724826812744, + "learning_rate": 4.864654836214636e-05, + "loss": 5.0876, + "step": 17693 + }, + { + "epoch": 0.10523123037396517, + "grad_norm": 1.778971552848816, + "learning_rate": 4.864639675219523e-05, + "loss": 5.1052, + "step": 17694 + }, + { + "epoch": 0.10523717765724618, + "grad_norm": 1.2401436567306519, + "learning_rate": 4.8646245133989396e-05, + "loss": 5.2536, + "step": 17695 + }, + { + "epoch": 0.10524312494052716, + "grad_norm": 1.6509275436401367, + "learning_rate": 4.8646093507528904e-05, + "loss": 4.9215, + "step": 17696 + }, + { + "epoch": 0.10524907222380817, + "grad_norm": 1.3725727796554565, + "learning_rate": 4.864594187281379e-05, + "loss": 5.5578, + "step": 17697 + }, + { + "epoch": 0.10525501950708917, + "grad_norm": 1.481040358543396, + "learning_rate": 4.864579022984413e-05, + "loss": 5.4683, + "step": 17698 + }, + { + "epoch": 0.10526096679037016, + "grad_norm": 1.4682444334030151, + "learning_rate": 4.864563857861998e-05, + "loss": 5.5076, + "step": 17699 + }, + { + "epoch": 0.10526691407365116, + "grad_norm": 1.2660551071166992, + "learning_rate": 4.864548691914137e-05, + "loss": 5.6092, + "step": 17700 + }, + { + "epoch": 0.10527286135693216, + "grad_norm": 1.266858458518982, + "learning_rate": 4.8645335251408366e-05, + "loss": 5.4373, + "step": 17701 + }, + { + "epoch": 0.10527880864021315, + "grad_norm": 1.5075262784957886, + "learning_rate": 4.8645183575421024e-05, + "loss": 5.3651, + "step": 17702 + }, + { + "epoch": 0.10528475592349415, + "grad_norm": 1.6108607053756714, + "learning_rate": 4.864503189117939e-05, + "loss": 5.3372, + "step": 17703 + }, + { + "epoch": 0.10529070320677515, + "grad_norm": 1.677874207496643, + "learning_rate": 4.8644880198683515e-05, + "loss": 4.9378, + "step": 17704 + }, + { + "epoch": 0.10529665049005614, + "grad_norm": 1.5847524404525757, + "learning_rate": 4.864472849793346e-05, + "loss": 5.2918, + "step": 17705 + }, + { + "epoch": 0.10530259777333714, + "grad_norm": 1.598244309425354, + "learning_rate": 4.864457678892927e-05, + "loss": 5.2408, + "step": 17706 + }, + { + "epoch": 0.10530854505661813, + "grad_norm": 1.4147340059280396, + "learning_rate": 4.8644425071671015e-05, + "loss": 5.2856, + "step": 17707 + }, + { + "epoch": 0.10531449233989913, + "grad_norm": 1.6057299375534058, + "learning_rate": 4.8644273346158734e-05, + "loss": 5.343, + "step": 17708 + }, + { + "epoch": 0.10532043962318013, + "grad_norm": 1.3503344058990479, + "learning_rate": 4.864412161239247e-05, + "loss": 5.4081, + "step": 17709 + }, + { + "epoch": 0.10532638690646112, + "grad_norm": 1.8316742181777954, + "learning_rate": 4.8643969870372295e-05, + "loss": 4.7925, + "step": 17710 + }, + { + "epoch": 0.10533233418974212, + "grad_norm": 2.1429593563079834, + "learning_rate": 4.864381812009825e-05, + "loss": 4.3519, + "step": 17711 + }, + { + "epoch": 0.10533828147302313, + "grad_norm": 1.9665764570236206, + "learning_rate": 4.8643666361570396e-05, + "loss": 4.388, + "step": 17712 + }, + { + "epoch": 0.10534422875630411, + "grad_norm": 1.7851755619049072, + "learning_rate": 4.864351459478878e-05, + "loss": 4.5242, + "step": 17713 + }, + { + "epoch": 0.10535017603958512, + "grad_norm": 1.8347305059432983, + "learning_rate": 4.864336281975346e-05, + "loss": 4.166, + "step": 17714 + }, + { + "epoch": 0.10535612332286612, + "grad_norm": 1.9413511753082275, + "learning_rate": 4.864321103646449e-05, + "loss": 4.0937, + "step": 17715 + }, + { + "epoch": 0.1053620706061471, + "grad_norm": 1.8122237920761108, + "learning_rate": 4.8643059244921904e-05, + "loss": 4.3812, + "step": 17716 + }, + { + "epoch": 0.10536801788942811, + "grad_norm": 2.0114996433258057, + "learning_rate": 4.864290744512578e-05, + "loss": 4.0728, + "step": 17717 + }, + { + "epoch": 0.10537396517270911, + "grad_norm": 1.8565599918365479, + "learning_rate": 4.8642755637076165e-05, + "loss": 4.2625, + "step": 17718 + }, + { + "epoch": 0.1053799124559901, + "grad_norm": 1.9136046171188354, + "learning_rate": 4.8642603820773105e-05, + "loss": 4.4933, + "step": 17719 + }, + { + "epoch": 0.1053858597392711, + "grad_norm": 1.8930033445358276, + "learning_rate": 4.864245199621666e-05, + "loss": 4.3249, + "step": 17720 + }, + { + "epoch": 0.1053918070225521, + "grad_norm": 1.7729578018188477, + "learning_rate": 4.864230016340687e-05, + "loss": 4.4736, + "step": 17721 + }, + { + "epoch": 0.10539775430583309, + "grad_norm": 2.1663360595703125, + "learning_rate": 4.864214832234381e-05, + "loss": 4.7505, + "step": 17722 + }, + { + "epoch": 0.1054037015891141, + "grad_norm": 1.9864879846572876, + "learning_rate": 4.864199647302751e-05, + "loss": 4.7233, + "step": 17723 + }, + { + "epoch": 0.1054096488723951, + "grad_norm": 2.031329870223999, + "learning_rate": 4.8641844615458035e-05, + "loss": 4.8218, + "step": 17724 + }, + { + "epoch": 0.10541559615567608, + "grad_norm": 2.0325984954833984, + "learning_rate": 4.864169274963544e-05, + "loss": 4.9383, + "step": 17725 + }, + { + "epoch": 0.10542154343895709, + "grad_norm": 1.9482324123382568, + "learning_rate": 4.864154087555977e-05, + "loss": 5.0849, + "step": 17726 + }, + { + "epoch": 0.10542749072223809, + "grad_norm": 1.6887640953063965, + "learning_rate": 4.864138899323108e-05, + "loss": 5.0216, + "step": 17727 + }, + { + "epoch": 0.10543343800551908, + "grad_norm": 2.0226924419403076, + "learning_rate": 4.864123710264944e-05, + "loss": 4.9241, + "step": 17728 + }, + { + "epoch": 0.10543938528880008, + "grad_norm": 1.647629976272583, + "learning_rate": 4.8641085203814873e-05, + "loss": 5.0318, + "step": 17729 + }, + { + "epoch": 0.10544533257208108, + "grad_norm": 1.766290545463562, + "learning_rate": 4.864093329672745e-05, + "loss": 4.9034, + "step": 17730 + }, + { + "epoch": 0.10545127985536207, + "grad_norm": 1.7573658227920532, + "learning_rate": 4.864078138138723e-05, + "loss": 4.7783, + "step": 17731 + }, + { + "epoch": 0.10545722713864307, + "grad_norm": 1.5503767728805542, + "learning_rate": 4.864062945779425e-05, + "loss": 5.1085, + "step": 17732 + }, + { + "epoch": 0.10546317442192407, + "grad_norm": 1.7276320457458496, + "learning_rate": 4.864047752594857e-05, + "loss": 4.8028, + "step": 17733 + }, + { + "epoch": 0.10546912170520506, + "grad_norm": 1.9654134511947632, + "learning_rate": 4.864032558585024e-05, + "loss": 5.1221, + "step": 17734 + }, + { + "epoch": 0.10547506898848606, + "grad_norm": 1.9654512405395508, + "learning_rate": 4.864017363749933e-05, + "loss": 5.0463, + "step": 17735 + }, + { + "epoch": 0.10548101627176705, + "grad_norm": 1.9071869850158691, + "learning_rate": 4.864002168089587e-05, + "loss": 5.0822, + "step": 17736 + }, + { + "epoch": 0.10548696355504805, + "grad_norm": 2.4190056324005127, + "learning_rate": 4.863986971603993e-05, + "loss": 5.7404, + "step": 17737 + }, + { + "epoch": 0.10549291083832905, + "grad_norm": 2.2098371982574463, + "learning_rate": 4.863971774293155e-05, + "loss": 5.9282, + "step": 17738 + }, + { + "epoch": 0.10549885812161004, + "grad_norm": 2.569831132888794, + "learning_rate": 4.8639565761570784e-05, + "loss": 4.3309, + "step": 17739 + }, + { + "epoch": 0.10550480540489104, + "grad_norm": 2.252847909927368, + "learning_rate": 4.8639413771957696e-05, + "loss": 4.185, + "step": 17740 + }, + { + "epoch": 0.10551075268817205, + "grad_norm": 2.3022215366363525, + "learning_rate": 4.8639261774092325e-05, + "loss": 4.3537, + "step": 17741 + }, + { + "epoch": 0.10551669997145303, + "grad_norm": 2.2695138454437256, + "learning_rate": 4.8639109767974745e-05, + "loss": 3.9806, + "step": 17742 + }, + { + "epoch": 0.10552264725473404, + "grad_norm": 2.1722588539123535, + "learning_rate": 4.8638957753604985e-05, + "loss": 3.9803, + "step": 17743 + }, + { + "epoch": 0.10552859453801504, + "grad_norm": 2.4385933876037598, + "learning_rate": 4.863880573098312e-05, + "loss": 4.0148, + "step": 17744 + }, + { + "epoch": 0.10553454182129603, + "grad_norm": 2.3186235427856445, + "learning_rate": 4.8638653700109184e-05, + "loss": 3.979, + "step": 17745 + }, + { + "epoch": 0.10554048910457703, + "grad_norm": 2.4591264724731445, + "learning_rate": 4.863850166098324e-05, + "loss": 3.9258, + "step": 17746 + }, + { + "epoch": 0.10554643638785803, + "grad_norm": 2.2619590759277344, + "learning_rate": 4.8638349613605336e-05, + "loss": 4.0571, + "step": 17747 + }, + { + "epoch": 0.10555238367113902, + "grad_norm": 2.393226146697998, + "learning_rate": 4.863819755797553e-05, + "loss": 4.0036, + "step": 17748 + }, + { + "epoch": 0.10555833095442002, + "grad_norm": 2.281846046447754, + "learning_rate": 4.8638045494093875e-05, + "loss": 3.9382, + "step": 17749 + }, + { + "epoch": 0.10556427823770102, + "grad_norm": 2.165407657623291, + "learning_rate": 4.8637893421960425e-05, + "loss": 4.0204, + "step": 17750 + }, + { + "epoch": 0.10557022552098201, + "grad_norm": 2.131829261779785, + "learning_rate": 4.863774134157523e-05, + "loss": 4.8661, + "step": 17751 + }, + { + "epoch": 0.10557617280426301, + "grad_norm": 2.0619029998779297, + "learning_rate": 4.863758925293834e-05, + "loss": 5.5522, + "step": 17752 + }, + { + "epoch": 0.10558212008754402, + "grad_norm": 1.6535427570343018, + "learning_rate": 4.863743715604981e-05, + "loss": 5.3463, + "step": 17753 + }, + { + "epoch": 0.105588067370825, + "grad_norm": 1.903904676437378, + "learning_rate": 4.86372850509097e-05, + "loss": 5.7202, + "step": 17754 + }, + { + "epoch": 0.105594014654106, + "grad_norm": 1.649357557296753, + "learning_rate": 4.863713293751806e-05, + "loss": 5.577, + "step": 17755 + }, + { + "epoch": 0.10559996193738701, + "grad_norm": 2.0812721252441406, + "learning_rate": 4.8636980815874936e-05, + "loss": 5.3164, + "step": 17756 + }, + { + "epoch": 0.105605909220668, + "grad_norm": 2.312357187271118, + "learning_rate": 4.8636828685980384e-05, + "loss": 5.3018, + "step": 17757 + }, + { + "epoch": 0.105611856503949, + "grad_norm": 2.1815388202667236, + "learning_rate": 4.863667654783447e-05, + "loss": 5.1509, + "step": 17758 + }, + { + "epoch": 0.10561780378723, + "grad_norm": 1.7500512599945068, + "learning_rate": 4.8636524401437225e-05, + "loss": 5.492, + "step": 17759 + }, + { + "epoch": 0.10562375107051099, + "grad_norm": 1.6850415468215942, + "learning_rate": 4.863637224678872e-05, + "loss": 5.5086, + "step": 17760 + }, + { + "epoch": 0.10562969835379199, + "grad_norm": 1.7222185134887695, + "learning_rate": 4.8636220083889e-05, + "loss": 5.4139, + "step": 17761 + }, + { + "epoch": 0.10563564563707299, + "grad_norm": 1.627914309501648, + "learning_rate": 4.8636067912738116e-05, + "loss": 5.5763, + "step": 17762 + }, + { + "epoch": 0.10564159292035398, + "grad_norm": 1.5884100198745728, + "learning_rate": 4.863591573333613e-05, + "loss": 5.544, + "step": 17763 + }, + { + "epoch": 0.10564754020363498, + "grad_norm": 1.4660178422927856, + "learning_rate": 4.8635763545683085e-05, + "loss": 5.4913, + "step": 17764 + }, + { + "epoch": 0.10565348748691597, + "grad_norm": 1.5240764617919922, + "learning_rate": 4.863561134977904e-05, + "loss": 5.4757, + "step": 17765 + }, + { + "epoch": 0.10565943477019697, + "grad_norm": 1.3686332702636719, + "learning_rate": 4.863545914562406e-05, + "loss": 5.4934, + "step": 17766 + }, + { + "epoch": 0.10566538205347797, + "grad_norm": 1.5429164171218872, + "learning_rate": 4.863530693321817e-05, + "loss": 5.3654, + "step": 17767 + }, + { + "epoch": 0.10567132933675896, + "grad_norm": 1.4237322807312012, + "learning_rate": 4.863515471256145e-05, + "loss": 5.4128, + "step": 17768 + }, + { + "epoch": 0.10567727662003996, + "grad_norm": 1.6438677310943604, + "learning_rate": 4.863500248365393e-05, + "loss": 5.3129, + "step": 17769 + }, + { + "epoch": 0.10568322390332097, + "grad_norm": 1.9208921194076538, + "learning_rate": 4.8634850246495675e-05, + "loss": 5.4889, + "step": 17770 + }, + { + "epoch": 0.10568917118660195, + "grad_norm": 1.6967288255691528, + "learning_rate": 4.863469800108675e-05, + "loss": 5.5301, + "step": 17771 + }, + { + "epoch": 0.10569511846988296, + "grad_norm": 1.5820802450180054, + "learning_rate": 4.8634545747427185e-05, + "loss": 5.4126, + "step": 17772 + }, + { + "epoch": 0.10570106575316396, + "grad_norm": 1.8280025720596313, + "learning_rate": 4.8634393485517046e-05, + "loss": 6.1201, + "step": 17773 + }, + { + "epoch": 0.10570701303644495, + "grad_norm": 1.809193730354309, + "learning_rate": 4.8634241215356394e-05, + "loss": 5.4123, + "step": 17774 + }, + { + "epoch": 0.10571296031972595, + "grad_norm": 1.596528172492981, + "learning_rate": 4.863408893694527e-05, + "loss": 5.6865, + "step": 17775 + }, + { + "epoch": 0.10571890760300695, + "grad_norm": 1.7726397514343262, + "learning_rate": 4.8633936650283715e-05, + "loss": 5.7298, + "step": 17776 + }, + { + "epoch": 0.10572485488628794, + "grad_norm": 1.5804529190063477, + "learning_rate": 4.863378435537182e-05, + "loss": 5.6051, + "step": 17777 + }, + { + "epoch": 0.10573080216956894, + "grad_norm": 1.5244919061660767, + "learning_rate": 4.8633632052209595e-05, + "loss": 5.7402, + "step": 17778 + }, + { + "epoch": 0.10573674945284994, + "grad_norm": 1.5003318786621094, + "learning_rate": 4.8633479740797117e-05, + "loss": 5.6978, + "step": 17779 + }, + { + "epoch": 0.10574269673613093, + "grad_norm": 1.7325289249420166, + "learning_rate": 4.863332742113444e-05, + "loss": 5.8616, + "step": 17780 + }, + { + "epoch": 0.10574864401941193, + "grad_norm": 1.8214267492294312, + "learning_rate": 4.863317509322161e-05, + "loss": 5.9213, + "step": 17781 + }, + { + "epoch": 0.10575459130269294, + "grad_norm": 1.7067787647247314, + "learning_rate": 4.863302275705869e-05, + "loss": 5.5518, + "step": 17782 + }, + { + "epoch": 0.10576053858597392, + "grad_norm": 1.8018234968185425, + "learning_rate": 4.863287041264571e-05, + "loss": 5.5241, + "step": 17783 + }, + { + "epoch": 0.10576648586925493, + "grad_norm": 1.7645032405853271, + "learning_rate": 4.863271805998275e-05, + "loss": 5.6471, + "step": 17784 + }, + { + "epoch": 0.10577243315253593, + "grad_norm": 1.6891655921936035, + "learning_rate": 4.8632565699069854e-05, + "loss": 5.9138, + "step": 17785 + }, + { + "epoch": 0.10577838043581692, + "grad_norm": 1.6546204090118408, + "learning_rate": 4.8632413329907076e-05, + "loss": 5.8511, + "step": 17786 + }, + { + "epoch": 0.10578432771909792, + "grad_norm": 1.864680528640747, + "learning_rate": 4.863226095249446e-05, + "loss": 5.7665, + "step": 17787 + }, + { + "epoch": 0.10579027500237892, + "grad_norm": 1.9052486419677734, + "learning_rate": 4.863210856683207e-05, + "loss": 5.6528, + "step": 17788 + }, + { + "epoch": 0.10579622228565991, + "grad_norm": 2.212982416152954, + "learning_rate": 4.8631956172919944e-05, + "loss": 5.2294, + "step": 17789 + }, + { + "epoch": 0.10580216956894091, + "grad_norm": 2.0703213214874268, + "learning_rate": 4.863180377075816e-05, + "loss": 4.9963, + "step": 17790 + }, + { + "epoch": 0.10580811685222191, + "grad_norm": 2.1718661785125732, + "learning_rate": 4.863165136034675e-05, + "loss": 5.1047, + "step": 17791 + }, + { + "epoch": 0.1058140641355029, + "grad_norm": 2.2078070640563965, + "learning_rate": 4.8631498941685774e-05, + "loss": 5.2682, + "step": 17792 + }, + { + "epoch": 0.1058200114187839, + "grad_norm": 2.187614917755127, + "learning_rate": 4.863134651477529e-05, + "loss": 4.9008, + "step": 17793 + }, + { + "epoch": 0.10582595870206489, + "grad_norm": 1.7202839851379395, + "learning_rate": 4.863119407961535e-05, + "loss": 5.1006, + "step": 17794 + }, + { + "epoch": 0.10583190598534589, + "grad_norm": 2.3109450340270996, + "learning_rate": 4.8631041636206e-05, + "loss": 4.8489, + "step": 17795 + }, + { + "epoch": 0.1058378532686269, + "grad_norm": 2.2688632011413574, + "learning_rate": 4.8630889184547295e-05, + "loss": 4.953, + "step": 17796 + }, + { + "epoch": 0.10584380055190788, + "grad_norm": 2.0636980533599854, + "learning_rate": 4.863073672463929e-05, + "loss": 4.9537, + "step": 17797 + }, + { + "epoch": 0.10584974783518888, + "grad_norm": 1.9752720594406128, + "learning_rate": 4.863058425648205e-05, + "loss": 4.8646, + "step": 17798 + }, + { + "epoch": 0.10585569511846989, + "grad_norm": 1.9784966707229614, + "learning_rate": 4.86304317800756e-05, + "loss": 5.1245, + "step": 17799 + }, + { + "epoch": 0.10586164240175087, + "grad_norm": 1.812218427658081, + "learning_rate": 4.863027929542002e-05, + "loss": 5.4367, + "step": 17800 + }, + { + "epoch": 0.10586758968503188, + "grad_norm": 1.8048956394195557, + "learning_rate": 4.863012680251536e-05, + "loss": 5.6052, + "step": 17801 + }, + { + "epoch": 0.10587353696831288, + "grad_norm": 1.9246432781219482, + "learning_rate": 4.862997430136166e-05, + "loss": 5.9335, + "step": 17802 + }, + { + "epoch": 0.10587948425159387, + "grad_norm": 1.5138533115386963, + "learning_rate": 4.862982179195897e-05, + "loss": 5.8785, + "step": 17803 + }, + { + "epoch": 0.10588543153487487, + "grad_norm": 1.4948742389678955, + "learning_rate": 4.862966927430737e-05, + "loss": 5.7478, + "step": 17804 + }, + { + "epoch": 0.10589137881815587, + "grad_norm": 1.4670746326446533, + "learning_rate": 4.862951674840689e-05, + "loss": 5.7397, + "step": 17805 + }, + { + "epoch": 0.10589732610143686, + "grad_norm": 1.4234925508499146, + "learning_rate": 4.862936421425759e-05, + "loss": 5.9919, + "step": 17806 + }, + { + "epoch": 0.10590327338471786, + "grad_norm": 1.8313277959823608, + "learning_rate": 4.862921167185953e-05, + "loss": 5.7289, + "step": 17807 + }, + { + "epoch": 0.10590922066799886, + "grad_norm": 1.7373311519622803, + "learning_rate": 4.8629059121212745e-05, + "loss": 5.7652, + "step": 17808 + }, + { + "epoch": 0.10591516795127985, + "grad_norm": 1.7706129550933838, + "learning_rate": 4.86289065623173e-05, + "loss": 5.4623, + "step": 17809 + }, + { + "epoch": 0.10592111523456085, + "grad_norm": 1.7332470417022705, + "learning_rate": 4.862875399517325e-05, + "loss": 5.5546, + "step": 17810 + }, + { + "epoch": 0.10592706251784186, + "grad_norm": 1.7493473291397095, + "learning_rate": 4.862860141978065e-05, + "loss": 5.2762, + "step": 17811 + }, + { + "epoch": 0.10593300980112284, + "grad_norm": 1.8064602613449097, + "learning_rate": 4.862844883613955e-05, + "loss": 5.2969, + "step": 17812 + }, + { + "epoch": 0.10593895708440385, + "grad_norm": 1.6318674087524414, + "learning_rate": 4.862829624425e-05, + "loss": 5.3229, + "step": 17813 + }, + { + "epoch": 0.10594490436768485, + "grad_norm": 1.7438777685165405, + "learning_rate": 4.8628143644112056e-05, + "loss": 5.3167, + "step": 17814 + }, + { + "epoch": 0.10595085165096584, + "grad_norm": 1.8095386028289795, + "learning_rate": 4.8627991035725774e-05, + "loss": 5.2744, + "step": 17815 + }, + { + "epoch": 0.10595679893424684, + "grad_norm": 1.8095691204071045, + "learning_rate": 4.86278384190912e-05, + "loss": 5.5105, + "step": 17816 + }, + { + "epoch": 0.10596274621752784, + "grad_norm": 1.858776569366455, + "learning_rate": 4.862768579420839e-05, + "loss": 5.4338, + "step": 17817 + }, + { + "epoch": 0.10596869350080883, + "grad_norm": 1.8224806785583496, + "learning_rate": 4.86275331610774e-05, + "loss": 5.6273, + "step": 17818 + }, + { + "epoch": 0.10597464078408983, + "grad_norm": 1.6850696802139282, + "learning_rate": 4.8627380519698284e-05, + "loss": 5.9963, + "step": 17819 + }, + { + "epoch": 0.10598058806737083, + "grad_norm": 1.4804600477218628, + "learning_rate": 4.86272278700711e-05, + "loss": 5.726, + "step": 17820 + }, + { + "epoch": 0.10598653535065182, + "grad_norm": 1.721027135848999, + "learning_rate": 4.862707521219589e-05, + "loss": 5.191, + "step": 17821 + }, + { + "epoch": 0.10599248263393282, + "grad_norm": 1.8109691143035889, + "learning_rate": 4.862692254607271e-05, + "loss": 4.926, + "step": 17822 + }, + { + "epoch": 0.10599842991721381, + "grad_norm": 1.7531434297561646, + "learning_rate": 4.862676987170162e-05, + "loss": 5.0376, + "step": 17823 + }, + { + "epoch": 0.10600437720049481, + "grad_norm": 1.6847648620605469, + "learning_rate": 4.8626617189082656e-05, + "loss": 5.0376, + "step": 17824 + }, + { + "epoch": 0.10601032448377581, + "grad_norm": 1.6512411832809448, + "learning_rate": 4.86264644982159e-05, + "loss": 5.087, + "step": 17825 + }, + { + "epoch": 0.1060162717670568, + "grad_norm": 1.6410924196243286, + "learning_rate": 4.8626311799101375e-05, + "loss": 5.6917, + "step": 17826 + }, + { + "epoch": 0.1060222190503378, + "grad_norm": 2.1565957069396973, + "learning_rate": 4.862615909173916e-05, + "loss": 4.619, + "step": 17827 + }, + { + "epoch": 0.1060281663336188, + "grad_norm": 1.8235310316085815, + "learning_rate": 4.86260063761293e-05, + "loss": 5.1155, + "step": 17828 + }, + { + "epoch": 0.1060341136168998, + "grad_norm": 1.7710633277893066, + "learning_rate": 4.862585365227184e-05, + "loss": 4.7845, + "step": 17829 + }, + { + "epoch": 0.1060400609001808, + "grad_norm": 2.174832820892334, + "learning_rate": 4.862570092016683e-05, + "loss": 4.6384, + "step": 17830 + }, + { + "epoch": 0.1060460081834618, + "grad_norm": 2.359682321548462, + "learning_rate": 4.862554817981434e-05, + "loss": 4.2191, + "step": 17831 + }, + { + "epoch": 0.10605195546674279, + "grad_norm": 2.4251585006713867, + "learning_rate": 4.8625395431214414e-05, + "loss": 4.0982, + "step": 17832 + }, + { + "epoch": 0.10605790275002379, + "grad_norm": 2.543009042739868, + "learning_rate": 4.86252426743671e-05, + "loss": 4.0773, + "step": 17833 + }, + { + "epoch": 0.10606385003330479, + "grad_norm": 2.6991419792175293, + "learning_rate": 4.862508990927247e-05, + "loss": 4.0209, + "step": 17834 + }, + { + "epoch": 0.10606979731658578, + "grad_norm": 2.354445695877075, + "learning_rate": 4.862493713593056e-05, + "loss": 3.9223, + "step": 17835 + }, + { + "epoch": 0.10607574459986678, + "grad_norm": 2.5119223594665527, + "learning_rate": 4.8624784354341426e-05, + "loss": 3.9006, + "step": 17836 + }, + { + "epoch": 0.10608169188314778, + "grad_norm": 2.717792272567749, + "learning_rate": 4.862463156450513e-05, + "loss": 4.3295, + "step": 17837 + }, + { + "epoch": 0.10608763916642877, + "grad_norm": 3.1779162883758545, + "learning_rate": 4.862447876642171e-05, + "loss": 4.3483, + "step": 17838 + }, + { + "epoch": 0.10609358644970977, + "grad_norm": 2.272994041442871, + "learning_rate": 4.8624325960091235e-05, + "loss": 4.2826, + "step": 17839 + }, + { + "epoch": 0.10609953373299078, + "grad_norm": 2.4689860343933105, + "learning_rate": 4.862417314551375e-05, + "loss": 4.9144, + "step": 17840 + }, + { + "epoch": 0.10610548101627176, + "grad_norm": 1.8101458549499512, + "learning_rate": 4.862402032268931e-05, + "loss": 5.9325, + "step": 17841 + }, + { + "epoch": 0.10611142829955277, + "grad_norm": 1.9994734525680542, + "learning_rate": 4.862386749161797e-05, + "loss": 5.5438, + "step": 17842 + }, + { + "epoch": 0.10611737558283377, + "grad_norm": 2.5475401878356934, + "learning_rate": 4.8623714652299786e-05, + "loss": 5.2262, + "step": 17843 + }, + { + "epoch": 0.10612332286611476, + "grad_norm": 2.286040782928467, + "learning_rate": 4.86235618047348e-05, + "loss": 5.065, + "step": 17844 + }, + { + "epoch": 0.10612927014939576, + "grad_norm": 1.788761854171753, + "learning_rate": 4.862340894892308e-05, + "loss": 5.5053, + "step": 17845 + }, + { + "epoch": 0.10613521743267676, + "grad_norm": 2.2951841354370117, + "learning_rate": 4.8623256084864663e-05, + "loss": 5.1262, + "step": 17846 + }, + { + "epoch": 0.10614116471595775, + "grad_norm": 1.962814211845398, + "learning_rate": 4.862310321255962e-05, + "loss": 5.8084, + "step": 17847 + }, + { + "epoch": 0.10614711199923875, + "grad_norm": 1.7888414859771729, + "learning_rate": 4.862295033200799e-05, + "loss": 5.2409, + "step": 17848 + }, + { + "epoch": 0.10615305928251975, + "grad_norm": 1.7108670473098755, + "learning_rate": 4.862279744320983e-05, + "loss": 5.6138, + "step": 17849 + }, + { + "epoch": 0.10615900656580074, + "grad_norm": 1.7636443376541138, + "learning_rate": 4.8622644546165196e-05, + "loss": 5.5664, + "step": 17850 + }, + { + "epoch": 0.10616495384908174, + "grad_norm": 1.7193186283111572, + "learning_rate": 4.8622491640874147e-05, + "loss": 5.7852, + "step": 17851 + }, + { + "epoch": 0.10617090113236273, + "grad_norm": 1.817215919494629, + "learning_rate": 4.8622338727336723e-05, + "loss": 5.5478, + "step": 17852 + }, + { + "epoch": 0.10617684841564373, + "grad_norm": 1.547817349433899, + "learning_rate": 4.8622185805552994e-05, + "loss": 5.5249, + "step": 17853 + }, + { + "epoch": 0.10618279569892473, + "grad_norm": 1.577528953552246, + "learning_rate": 4.862203287552299e-05, + "loss": 5.7268, + "step": 17854 + }, + { + "epoch": 0.10618874298220572, + "grad_norm": 1.4524853229522705, + "learning_rate": 4.862187993724679e-05, + "loss": 5.8539, + "step": 17855 + }, + { + "epoch": 0.10619469026548672, + "grad_norm": 1.6361198425292969, + "learning_rate": 4.8621726990724437e-05, + "loss": 5.0815, + "step": 17856 + }, + { + "epoch": 0.10620063754876773, + "grad_norm": 1.65043044090271, + "learning_rate": 4.862157403595598e-05, + "loss": 5.1938, + "step": 17857 + }, + { + "epoch": 0.10620658483204871, + "grad_norm": 1.6236746311187744, + "learning_rate": 4.8621421072941476e-05, + "loss": 5.5602, + "step": 17858 + }, + { + "epoch": 0.10621253211532972, + "grad_norm": 1.4648228883743286, + "learning_rate": 4.862126810168097e-05, + "loss": 5.3728, + "step": 17859 + }, + { + "epoch": 0.10621847939861072, + "grad_norm": 1.4803123474121094, + "learning_rate": 4.862111512217453e-05, + "loss": 5.58, + "step": 17860 + }, + { + "epoch": 0.1062244266818917, + "grad_norm": 1.320387840270996, + "learning_rate": 4.862096213442221e-05, + "loss": 5.0337, + "step": 17861 + }, + { + "epoch": 0.10623037396517271, + "grad_norm": 1.8309158086776733, + "learning_rate": 4.862080913842405e-05, + "loss": 4.3603, + "step": 17862 + }, + { + "epoch": 0.10623632124845371, + "grad_norm": 1.79231595993042, + "learning_rate": 4.86206561341801e-05, + "loss": 4.401, + "step": 17863 + }, + { + "epoch": 0.1062422685317347, + "grad_norm": 1.7894480228424072, + "learning_rate": 4.862050312169043e-05, + "loss": 4.4592, + "step": 17864 + }, + { + "epoch": 0.1062482158150157, + "grad_norm": 1.8271396160125732, + "learning_rate": 4.8620350100955095e-05, + "loss": 4.2442, + "step": 17865 + }, + { + "epoch": 0.1062541630982967, + "grad_norm": 2.03336238861084, + "learning_rate": 4.862019707197413e-05, + "loss": 4.6245, + "step": 17866 + }, + { + "epoch": 0.10626011038157769, + "grad_norm": 1.8034088611602783, + "learning_rate": 4.86200440347476e-05, + "loss": 4.5798, + "step": 17867 + }, + { + "epoch": 0.10626605766485869, + "grad_norm": 1.366013765335083, + "learning_rate": 4.861989098927556e-05, + "loss": 5.2409, + "step": 17868 + }, + { + "epoch": 0.1062720049481397, + "grad_norm": 1.603281855583191, + "learning_rate": 4.8619737935558054e-05, + "loss": 5.6699, + "step": 17869 + }, + { + "epoch": 0.10627795223142068, + "grad_norm": 1.6720329523086548, + "learning_rate": 4.861958487359515e-05, + "loss": 5.2162, + "step": 17870 + }, + { + "epoch": 0.10628389951470169, + "grad_norm": 2.5577762126922607, + "learning_rate": 4.861943180338689e-05, + "loss": 3.9116, + "step": 17871 + }, + { + "epoch": 0.10628984679798269, + "grad_norm": 2.6489310264587402, + "learning_rate": 4.861927872493332e-05, + "loss": 4.232, + "step": 17872 + }, + { + "epoch": 0.10629579408126368, + "grad_norm": 2.481381893157959, + "learning_rate": 4.861912563823451e-05, + "loss": 4.374, + "step": 17873 + }, + { + "epoch": 0.10630174136454468, + "grad_norm": 2.444721221923828, + "learning_rate": 4.861897254329052e-05, + "loss": 4.504, + "step": 17874 + }, + { + "epoch": 0.10630768864782568, + "grad_norm": 2.529085636138916, + "learning_rate": 4.8618819440101373e-05, + "loss": 4.1305, + "step": 17875 + }, + { + "epoch": 0.10631363593110667, + "grad_norm": 3.966379404067993, + "learning_rate": 4.861866632866715e-05, + "loss": 3.9104, + "step": 17876 + }, + { + "epoch": 0.10631958321438767, + "grad_norm": 2.408405065536499, + "learning_rate": 4.8618513208987895e-05, + "loss": 3.8762, + "step": 17877 + }, + { + "epoch": 0.10632553049766867, + "grad_norm": 2.41780686378479, + "learning_rate": 4.8618360081063654e-05, + "loss": 3.7665, + "step": 17878 + }, + { + "epoch": 0.10633147778094966, + "grad_norm": 2.60262393951416, + "learning_rate": 4.861820694489448e-05, + "loss": 4.067, + "step": 17879 + }, + { + "epoch": 0.10633742506423066, + "grad_norm": 2.624938726425171, + "learning_rate": 4.8618053800480456e-05, + "loss": 4.5653, + "step": 17880 + }, + { + "epoch": 0.10634337234751165, + "grad_norm": 2.783202886581421, + "learning_rate": 4.86179006478216e-05, + "loss": 4.4091, + "step": 17881 + }, + { + "epoch": 0.10634931963079265, + "grad_norm": 2.8269615173339844, + "learning_rate": 4.861774748691798e-05, + "loss": 3.949, + "step": 17882 + }, + { + "epoch": 0.10635526691407365, + "grad_norm": 2.82108998298645, + "learning_rate": 4.861759431776965e-05, + "loss": 3.8479, + "step": 17883 + }, + { + "epoch": 0.10636121419735464, + "grad_norm": 2.8543620109558105, + "learning_rate": 4.861744114037666e-05, + "loss": 3.4358, + "step": 17884 + }, + { + "epoch": 0.10636716148063564, + "grad_norm": 2.6492035388946533, + "learning_rate": 4.861728795473907e-05, + "loss": 3.6298, + "step": 17885 + }, + { + "epoch": 0.10637310876391665, + "grad_norm": 2.834181785583496, + "learning_rate": 4.861713476085693e-05, + "loss": 3.4125, + "step": 17886 + }, + { + "epoch": 0.10637905604719763, + "grad_norm": 3.447075605392456, + "learning_rate": 4.861698155873028e-05, + "loss": 3.5416, + "step": 17887 + }, + { + "epoch": 0.10638500333047864, + "grad_norm": 3.6009531021118164, + "learning_rate": 4.86168283483592e-05, + "loss": 4.1912, + "step": 17888 + }, + { + "epoch": 0.10639095061375964, + "grad_norm": 4.086645126342773, + "learning_rate": 4.861667512974372e-05, + "loss": 4.3999, + "step": 17889 + }, + { + "epoch": 0.10639689789704063, + "grad_norm": 3.673405408859253, + "learning_rate": 4.86165219028839e-05, + "loss": 4.3731, + "step": 17890 + }, + { + "epoch": 0.10640284518032163, + "grad_norm": 2.2896664142608643, + "learning_rate": 4.861636866777981e-05, + "loss": 5.5963, + "step": 17891 + }, + { + "epoch": 0.10640879246360263, + "grad_norm": 2.0481069087982178, + "learning_rate": 4.861621542443148e-05, + "loss": 5.7909, + "step": 17892 + }, + { + "epoch": 0.10641473974688362, + "grad_norm": 1.9108741283416748, + "learning_rate": 4.861606217283897e-05, + "loss": 5.3044, + "step": 17893 + }, + { + "epoch": 0.10642068703016462, + "grad_norm": 1.7842040061950684, + "learning_rate": 4.861590891300235e-05, + "loss": 5.3071, + "step": 17894 + }, + { + "epoch": 0.10642663431344562, + "grad_norm": 1.854777455329895, + "learning_rate": 4.861575564492164e-05, + "loss": 5.386, + "step": 17895 + }, + { + "epoch": 0.10643258159672661, + "grad_norm": 1.7286109924316406, + "learning_rate": 4.861560236859693e-05, + "loss": 5.5609, + "step": 17896 + }, + { + "epoch": 0.10643852888000761, + "grad_norm": 1.709408164024353, + "learning_rate": 4.861544908402825e-05, + "loss": 5.6772, + "step": 17897 + }, + { + "epoch": 0.10644447616328861, + "grad_norm": 1.9251428842544556, + "learning_rate": 4.861529579121567e-05, + "loss": 5.6114, + "step": 17898 + }, + { + "epoch": 0.1064504234465696, + "grad_norm": 1.6568808555603027, + "learning_rate": 4.8615142490159226e-05, + "loss": 5.4648, + "step": 17899 + }, + { + "epoch": 0.1064563707298506, + "grad_norm": 1.7793960571289062, + "learning_rate": 4.861498918085898e-05, + "loss": 5.4987, + "step": 17900 + }, + { + "epoch": 0.10646231801313161, + "grad_norm": 1.9044899940490723, + "learning_rate": 4.861483586331499e-05, + "loss": 5.7757, + "step": 17901 + }, + { + "epoch": 0.1064682652964126, + "grad_norm": 2.215278387069702, + "learning_rate": 4.86146825375273e-05, + "loss": 6.2767, + "step": 17902 + }, + { + "epoch": 0.1064742125796936, + "grad_norm": 1.8699604272842407, + "learning_rate": 4.861452920349597e-05, + "loss": 6.2987, + "step": 17903 + }, + { + "epoch": 0.1064801598629746, + "grad_norm": 1.634887456893921, + "learning_rate": 4.861437586122105e-05, + "loss": 6.2596, + "step": 17904 + }, + { + "epoch": 0.10648610714625559, + "grad_norm": 1.54149329662323, + "learning_rate": 4.86142225107026e-05, + "loss": 6.1988, + "step": 17905 + }, + { + "epoch": 0.10649205442953659, + "grad_norm": 1.5954409837722778, + "learning_rate": 4.861406915194067e-05, + "loss": 6.1052, + "step": 17906 + }, + { + "epoch": 0.10649800171281759, + "grad_norm": 1.8810808658599854, + "learning_rate": 4.86139157849353e-05, + "loss": 6.0318, + "step": 17907 + }, + { + "epoch": 0.10650394899609858, + "grad_norm": 1.4983458518981934, + "learning_rate": 4.861376240968656e-05, + "loss": 5.8614, + "step": 17908 + }, + { + "epoch": 0.10650989627937958, + "grad_norm": 1.5446088314056396, + "learning_rate": 4.8613609026194504e-05, + "loss": 5.623, + "step": 17909 + }, + { + "epoch": 0.10651584356266057, + "grad_norm": 1.7121042013168335, + "learning_rate": 4.861345563445918e-05, + "loss": 4.9258, + "step": 17910 + }, + { + "epoch": 0.10652179084594157, + "grad_norm": 2.002478837966919, + "learning_rate": 4.861330223448065e-05, + "loss": 5.285, + "step": 17911 + }, + { + "epoch": 0.10652773812922257, + "grad_norm": 1.7703490257263184, + "learning_rate": 4.8613148826258944e-05, + "loss": 5.2279, + "step": 17912 + }, + { + "epoch": 0.10653368541250356, + "grad_norm": 1.7763222455978394, + "learning_rate": 4.861299540979415e-05, + "loss": 4.8737, + "step": 17913 + }, + { + "epoch": 0.10653963269578456, + "grad_norm": 1.5921473503112793, + "learning_rate": 4.8612841985086296e-05, + "loss": 5.3756, + "step": 17914 + }, + { + "epoch": 0.10654557997906557, + "grad_norm": 1.810085654258728, + "learning_rate": 4.8612688552135435e-05, + "loss": 5.3784, + "step": 17915 + }, + { + "epoch": 0.10655152726234655, + "grad_norm": 2.2289364337921143, + "learning_rate": 4.8612535110941636e-05, + "loss": 5.0258, + "step": 17916 + }, + { + "epoch": 0.10655747454562756, + "grad_norm": 1.9337642192840576, + "learning_rate": 4.8612381661504946e-05, + "loss": 4.9943, + "step": 17917 + }, + { + "epoch": 0.10656342182890856, + "grad_norm": 1.5772477388381958, + "learning_rate": 4.861222820382542e-05, + "loss": 5.1188, + "step": 17918 + }, + { + "epoch": 0.10656936911218955, + "grad_norm": 1.6176950931549072, + "learning_rate": 4.8612074737903097e-05, + "loss": 5.0973, + "step": 17919 + }, + { + "epoch": 0.10657531639547055, + "grad_norm": 1.7878233194351196, + "learning_rate": 4.8611921263738045e-05, + "loss": 5.0342, + "step": 17920 + }, + { + "epoch": 0.10658126367875155, + "grad_norm": 1.7473089694976807, + "learning_rate": 4.861176778133033e-05, + "loss": 5.2844, + "step": 17921 + }, + { + "epoch": 0.10658721096203254, + "grad_norm": 2.472464084625244, + "learning_rate": 4.8611614290679975e-05, + "loss": 4.9654, + "step": 17922 + }, + { + "epoch": 0.10659315824531354, + "grad_norm": 2.5256218910217285, + "learning_rate": 4.861146079178706e-05, + "loss": 4.7885, + "step": 17923 + }, + { + "epoch": 0.10659910552859454, + "grad_norm": 2.2665674686431885, + "learning_rate": 4.861130728465162e-05, + "loss": 5.0838, + "step": 17924 + }, + { + "epoch": 0.10660505281187553, + "grad_norm": 1.6795161962509155, + "learning_rate": 4.861115376927372e-05, + "loss": 5.3174, + "step": 17925 + }, + { + "epoch": 0.10661100009515653, + "grad_norm": 1.5786751508712769, + "learning_rate": 4.8611000245653405e-05, + "loss": 5.1831, + "step": 17926 + }, + { + "epoch": 0.10661694737843753, + "grad_norm": 2.0238442420959473, + "learning_rate": 4.861084671379074e-05, + "loss": 5.7967, + "step": 17927 + }, + { + "epoch": 0.10662289466171852, + "grad_norm": 1.5760328769683838, + "learning_rate": 4.861069317368577e-05, + "loss": 5.5692, + "step": 17928 + }, + { + "epoch": 0.10662884194499953, + "grad_norm": 1.7190479040145874, + "learning_rate": 4.861053962533855e-05, + "loss": 5.4248, + "step": 17929 + }, + { + "epoch": 0.10663478922828053, + "grad_norm": 1.987444519996643, + "learning_rate": 4.861038606874914e-05, + "loss": 5.3845, + "step": 17930 + }, + { + "epoch": 0.10664073651156152, + "grad_norm": 2.3603975772857666, + "learning_rate": 4.8610232503917585e-05, + "loss": 4.9948, + "step": 17931 + }, + { + "epoch": 0.10664668379484252, + "grad_norm": 2.560696601867676, + "learning_rate": 4.861007893084394e-05, + "loss": 4.797, + "step": 17932 + }, + { + "epoch": 0.10665263107812352, + "grad_norm": 2.3494272232055664, + "learning_rate": 4.860992534952826e-05, + "loss": 4.81, + "step": 17933 + }, + { + "epoch": 0.10665857836140451, + "grad_norm": 2.1878998279571533, + "learning_rate": 4.86097717599706e-05, + "loss": 4.7863, + "step": 17934 + }, + { + "epoch": 0.10666452564468551, + "grad_norm": 2.123789072036743, + "learning_rate": 4.8609618162171016e-05, + "loss": 4.7846, + "step": 17935 + }, + { + "epoch": 0.10667047292796651, + "grad_norm": 2.307370662689209, + "learning_rate": 4.8609464556129555e-05, + "loss": 4.3901, + "step": 17936 + }, + { + "epoch": 0.1066764202112475, + "grad_norm": 1.8189514875411987, + "learning_rate": 4.8609310941846274e-05, + "loss": 5.2722, + "step": 17937 + }, + { + "epoch": 0.1066823674945285, + "grad_norm": 1.4699981212615967, + "learning_rate": 4.860915731932123e-05, + "loss": 5.7501, + "step": 17938 + }, + { + "epoch": 0.10668831477780949, + "grad_norm": 1.5624393224716187, + "learning_rate": 4.860900368855447e-05, + "loss": 5.6963, + "step": 17939 + }, + { + "epoch": 0.10669426206109049, + "grad_norm": 1.8463138341903687, + "learning_rate": 4.860885004954605e-05, + "loss": 5.3627, + "step": 17940 + }, + { + "epoch": 0.1067002093443715, + "grad_norm": 1.7627042531967163, + "learning_rate": 4.8608696402296025e-05, + "loss": 5.6548, + "step": 17941 + }, + { + "epoch": 0.10670615662765248, + "grad_norm": 1.631505012512207, + "learning_rate": 4.860854274680444e-05, + "loss": 5.7926, + "step": 17942 + }, + { + "epoch": 0.10671210391093348, + "grad_norm": 1.4491498470306396, + "learning_rate": 4.860838908307137e-05, + "loss": 5.5395, + "step": 17943 + }, + { + "epoch": 0.10671805119421449, + "grad_norm": 1.6210049390792847, + "learning_rate": 4.8608235411096845e-05, + "loss": 5.2768, + "step": 17944 + }, + { + "epoch": 0.10672399847749547, + "grad_norm": 1.4522534608840942, + "learning_rate": 4.860808173088094e-05, + "loss": 5.7723, + "step": 17945 + }, + { + "epoch": 0.10672994576077648, + "grad_norm": 2.0779013633728027, + "learning_rate": 4.860792804242369e-05, + "loss": 5.4679, + "step": 17946 + }, + { + "epoch": 0.10673589304405748, + "grad_norm": 2.248556137084961, + "learning_rate": 4.860777434572515e-05, + "loss": 5.5089, + "step": 17947 + }, + { + "epoch": 0.10674184032733847, + "grad_norm": 2.2192306518554688, + "learning_rate": 4.86076206407854e-05, + "loss": 5.4098, + "step": 17948 + }, + { + "epoch": 0.10674778761061947, + "grad_norm": 1.7523053884506226, + "learning_rate": 4.8607466927604455e-05, + "loss": 5.3223, + "step": 17949 + }, + { + "epoch": 0.10675373489390047, + "grad_norm": 1.8636107444763184, + "learning_rate": 4.8607313206182395e-05, + "loss": 5.339, + "step": 17950 + }, + { + "epoch": 0.10675968217718146, + "grad_norm": 1.9067093133926392, + "learning_rate": 4.860715947651926e-05, + "loss": 5.3779, + "step": 17951 + }, + { + "epoch": 0.10676562946046246, + "grad_norm": 1.850948452949524, + "learning_rate": 4.860700573861512e-05, + "loss": 5.3474, + "step": 17952 + }, + { + "epoch": 0.10677157674374346, + "grad_norm": 2.144895076751709, + "learning_rate": 4.8606851992470005e-05, + "loss": 5.3089, + "step": 17953 + }, + { + "epoch": 0.10677752402702445, + "grad_norm": 2.054420232772827, + "learning_rate": 4.860669823808399e-05, + "loss": 5.3653, + "step": 17954 + }, + { + "epoch": 0.10678347131030545, + "grad_norm": 1.94870126247406, + "learning_rate": 4.860654447545711e-05, + "loss": 5.2514, + "step": 17955 + }, + { + "epoch": 0.10678941859358645, + "grad_norm": 1.8006596565246582, + "learning_rate": 4.860639070458945e-05, + "loss": 5.2357, + "step": 17956 + }, + { + "epoch": 0.10679536587686744, + "grad_norm": 2.309035301208496, + "learning_rate": 4.860623692548103e-05, + "loss": 5.2681, + "step": 17957 + }, + { + "epoch": 0.10680131316014845, + "grad_norm": 2.402949571609497, + "learning_rate": 4.860608313813192e-05, + "loss": 5.549, + "step": 17958 + }, + { + "epoch": 0.10680726044342945, + "grad_norm": 1.724307894706726, + "learning_rate": 4.8605929342542164e-05, + "loss": 5.5283, + "step": 17959 + }, + { + "epoch": 0.10681320772671044, + "grad_norm": 1.8566054105758667, + "learning_rate": 4.860577553871183e-05, + "loss": 5.834, + "step": 17960 + }, + { + "epoch": 0.10681915500999144, + "grad_norm": 1.8882628679275513, + "learning_rate": 4.860562172664096e-05, + "loss": 5.7954, + "step": 17961 + }, + { + "epoch": 0.10682510229327244, + "grad_norm": 1.694075345993042, + "learning_rate": 4.860546790632961e-05, + "loss": 5.7573, + "step": 17962 + }, + { + "epoch": 0.10683104957655343, + "grad_norm": 1.8312102556228638, + "learning_rate": 4.860531407777783e-05, + "loss": 5.4479, + "step": 17963 + }, + { + "epoch": 0.10683699685983443, + "grad_norm": 1.6124730110168457, + "learning_rate": 4.860516024098569e-05, + "loss": 5.5356, + "step": 17964 + }, + { + "epoch": 0.10684294414311543, + "grad_norm": 2.3505187034606934, + "learning_rate": 4.8605006395953225e-05, + "loss": 5.6543, + "step": 17965 + }, + { + "epoch": 0.10684889142639642, + "grad_norm": 2.69331431388855, + "learning_rate": 4.86048525426805e-05, + "loss": 5.5359, + "step": 17966 + }, + { + "epoch": 0.10685483870967742, + "grad_norm": 2.095374822616577, + "learning_rate": 4.860469868116756e-05, + "loss": 5.5514, + "step": 17967 + }, + { + "epoch": 0.10686078599295841, + "grad_norm": 1.8596038818359375, + "learning_rate": 4.8604544811414465e-05, + "loss": 5.5171, + "step": 17968 + }, + { + "epoch": 0.10686673327623941, + "grad_norm": 2.215549945831299, + "learning_rate": 4.860439093342127e-05, + "loss": 5.3824, + "step": 17969 + }, + { + "epoch": 0.10687268055952041, + "grad_norm": 1.9737238883972168, + "learning_rate": 4.860423704718803e-05, + "loss": 5.4159, + "step": 17970 + }, + { + "epoch": 0.1068786278428014, + "grad_norm": 1.8673701286315918, + "learning_rate": 4.860408315271479e-05, + "loss": 5.421, + "step": 17971 + }, + { + "epoch": 0.1068845751260824, + "grad_norm": 1.905371069908142, + "learning_rate": 4.86039292500016e-05, + "loss": 5.4003, + "step": 17972 + }, + { + "epoch": 0.1068905224093634, + "grad_norm": 1.7888939380645752, + "learning_rate": 4.8603775339048534e-05, + "loss": 5.1581, + "step": 17973 + }, + { + "epoch": 0.1068964696926444, + "grad_norm": 1.7499796152114868, + "learning_rate": 4.8603621419855625e-05, + "loss": 5.1334, + "step": 17974 + }, + { + "epoch": 0.1069024169759254, + "grad_norm": 1.6159700155258179, + "learning_rate": 4.860346749242295e-05, + "loss": 5.1999, + "step": 17975 + }, + { + "epoch": 0.1069083642592064, + "grad_norm": 1.7355921268463135, + "learning_rate": 4.860331355675053e-05, + "loss": 5.3899, + "step": 17976 + }, + { + "epoch": 0.10691431154248739, + "grad_norm": 1.760110855102539, + "learning_rate": 4.860315961283846e-05, + "loss": 5.5386, + "step": 17977 + }, + { + "epoch": 0.10692025882576839, + "grad_norm": 1.605482816696167, + "learning_rate": 4.860300566068675e-05, + "loss": 5.5486, + "step": 17978 + }, + { + "epoch": 0.10692620610904939, + "grad_norm": 2.1792690753936768, + "learning_rate": 4.860285170029548e-05, + "loss": 4.8871, + "step": 17979 + }, + { + "epoch": 0.10693215339233038, + "grad_norm": 1.4513617753982544, + "learning_rate": 4.86026977316647e-05, + "loss": 5.1944, + "step": 17980 + }, + { + "epoch": 0.10693810067561138, + "grad_norm": 2.560112476348877, + "learning_rate": 4.860254375479446e-05, + "loss": 4.2504, + "step": 17981 + }, + { + "epoch": 0.10694404795889238, + "grad_norm": 2.035403251647949, + "learning_rate": 4.8602389769684816e-05, + "loss": 5.4479, + "step": 17982 + }, + { + "epoch": 0.10694999524217337, + "grad_norm": 1.8496562242507935, + "learning_rate": 4.8602235776335826e-05, + "loss": 5.4981, + "step": 17983 + }, + { + "epoch": 0.10695594252545437, + "grad_norm": 1.9541285037994385, + "learning_rate": 4.8602081774747536e-05, + "loss": 5.5772, + "step": 17984 + }, + { + "epoch": 0.10696188980873537, + "grad_norm": 1.674981951713562, + "learning_rate": 4.860192776492001e-05, + "loss": 5.3656, + "step": 17985 + }, + { + "epoch": 0.10696783709201636, + "grad_norm": 1.675601601600647, + "learning_rate": 4.860177374685328e-05, + "loss": 5.3382, + "step": 17986 + }, + { + "epoch": 0.10697378437529736, + "grad_norm": 1.8874675035476685, + "learning_rate": 4.860161972054743e-05, + "loss": 5.1908, + "step": 17987 + }, + { + "epoch": 0.10697973165857837, + "grad_norm": 2.267000675201416, + "learning_rate": 4.860146568600249e-05, + "loss": 5.4437, + "step": 17988 + }, + { + "epoch": 0.10698567894185936, + "grad_norm": 1.8062045574188232, + "learning_rate": 4.8601311643218526e-05, + "loss": 5.2315, + "step": 17989 + }, + { + "epoch": 0.10699162622514036, + "grad_norm": 1.9503196477890015, + "learning_rate": 4.8601157592195584e-05, + "loss": 5.3999, + "step": 17990 + }, + { + "epoch": 0.10699757350842136, + "grad_norm": 1.8589918613433838, + "learning_rate": 4.860100353293372e-05, + "loss": 5.694, + "step": 17991 + }, + { + "epoch": 0.10700352079170235, + "grad_norm": 1.69667649269104, + "learning_rate": 4.8600849465432995e-05, + "loss": 5.6146, + "step": 17992 + }, + { + "epoch": 0.10700946807498335, + "grad_norm": 1.6006754636764526, + "learning_rate": 4.8600695389693455e-05, + "loss": 5.2849, + "step": 17993 + }, + { + "epoch": 0.10701541535826435, + "grad_norm": 1.7502506971359253, + "learning_rate": 4.860054130571516e-05, + "loss": 4.9652, + "step": 17994 + }, + { + "epoch": 0.10702136264154534, + "grad_norm": 1.6936286687850952, + "learning_rate": 4.860038721349816e-05, + "loss": 5.2192, + "step": 17995 + }, + { + "epoch": 0.10702730992482634, + "grad_norm": 1.4757579565048218, + "learning_rate": 4.8600233113042496e-05, + "loss": 5.3917, + "step": 17996 + }, + { + "epoch": 0.10703325720810733, + "grad_norm": 1.4602460861206055, + "learning_rate": 4.8600079004348245e-05, + "loss": 5.5418, + "step": 17997 + }, + { + "epoch": 0.10703920449138833, + "grad_norm": 1.4150431156158447, + "learning_rate": 4.859992488741545e-05, + "loss": 5.6592, + "step": 17998 + }, + { + "epoch": 0.10704515177466933, + "grad_norm": 1.385908842086792, + "learning_rate": 4.859977076224416e-05, + "loss": 5.2818, + "step": 17999 + }, + { + "epoch": 0.10705109905795032, + "grad_norm": 1.3683747053146362, + "learning_rate": 4.8599616628834446e-05, + "loss": 5.2743, + "step": 18000 + }, + { + "epoch": 0.10705704634123132, + "grad_norm": 1.2521027326583862, + "learning_rate": 4.859946248718634e-05, + "loss": 5.1564, + "step": 18001 + }, + { + "epoch": 0.10706299362451233, + "grad_norm": 1.445575475692749, + "learning_rate": 4.8599308337299906e-05, + "loss": 5.0108, + "step": 18002 + }, + { + "epoch": 0.10706894090779331, + "grad_norm": 1.3680258989334106, + "learning_rate": 4.859915417917519e-05, + "loss": 5.2649, + "step": 18003 + }, + { + "epoch": 0.10707488819107432, + "grad_norm": 1.2142491340637207, + "learning_rate": 4.859900001281227e-05, + "loss": 5.1143, + "step": 18004 + }, + { + "epoch": 0.10708083547435532, + "grad_norm": 1.244157314300537, + "learning_rate": 4.859884583821117e-05, + "loss": 5.2321, + "step": 18005 + }, + { + "epoch": 0.1070867827576363, + "grad_norm": 1.4057670831680298, + "learning_rate": 4.859869165537196e-05, + "loss": 5.3419, + "step": 18006 + }, + { + "epoch": 0.10709273004091731, + "grad_norm": 1.3243392705917358, + "learning_rate": 4.859853746429469e-05, + "loss": 5.0217, + "step": 18007 + }, + { + "epoch": 0.10709867732419831, + "grad_norm": 1.3227713108062744, + "learning_rate": 4.8598383264979416e-05, + "loss": 5.055, + "step": 18008 + }, + { + "epoch": 0.1071046246074793, + "grad_norm": 1.3313336372375488, + "learning_rate": 4.8598229057426195e-05, + "loss": 5.1319, + "step": 18009 + }, + { + "epoch": 0.1071105718907603, + "grad_norm": 1.385715126991272, + "learning_rate": 4.8598074841635064e-05, + "loss": 4.9349, + "step": 18010 + }, + { + "epoch": 0.1071165191740413, + "grad_norm": 1.3244850635528564, + "learning_rate": 4.85979206176061e-05, + "loss": 4.9055, + "step": 18011 + }, + { + "epoch": 0.10712246645732229, + "grad_norm": 1.2922260761260986, + "learning_rate": 4.859776638533934e-05, + "loss": 5.0518, + "step": 18012 + }, + { + "epoch": 0.10712841374060329, + "grad_norm": 1.3371012210845947, + "learning_rate": 4.8597612144834845e-05, + "loss": 5.234, + "step": 18013 + }, + { + "epoch": 0.1071343610238843, + "grad_norm": 1.3367552757263184, + "learning_rate": 4.859745789609267e-05, + "loss": 4.9765, + "step": 18014 + }, + { + "epoch": 0.10714030830716528, + "grad_norm": 1.5067929029464722, + "learning_rate": 4.859730363911286e-05, + "loss": 5.235, + "step": 18015 + }, + { + "epoch": 0.10714625559044628, + "grad_norm": 1.3660157918930054, + "learning_rate": 4.859714937389548e-05, + "loss": 5.4104, + "step": 18016 + }, + { + "epoch": 0.10715220287372729, + "grad_norm": 1.3999029397964478, + "learning_rate": 4.859699510044057e-05, + "loss": 5.1603, + "step": 18017 + }, + { + "epoch": 0.10715815015700828, + "grad_norm": 1.6147737503051758, + "learning_rate": 4.8596840818748204e-05, + "loss": 5.0506, + "step": 18018 + }, + { + "epoch": 0.10716409744028928, + "grad_norm": 1.5618371963500977, + "learning_rate": 4.859668652881843e-05, + "loss": 5.1564, + "step": 18019 + }, + { + "epoch": 0.10717004472357028, + "grad_norm": 1.3786426782608032, + "learning_rate": 4.859653223065128e-05, + "loss": 5.1884, + "step": 18020 + }, + { + "epoch": 0.10717599200685127, + "grad_norm": 1.429489016532898, + "learning_rate": 4.859637792424683e-05, + "loss": 5.1556, + "step": 18021 + }, + { + "epoch": 0.10718193929013227, + "grad_norm": 1.3347980976104736, + "learning_rate": 4.859622360960513e-05, + "loss": 5.008, + "step": 18022 + }, + { + "epoch": 0.10718788657341327, + "grad_norm": 1.3850064277648926, + "learning_rate": 4.859606928672623e-05, + "loss": 5.0719, + "step": 18023 + }, + { + "epoch": 0.10719383385669426, + "grad_norm": 1.3279672861099243, + "learning_rate": 4.859591495561019e-05, + "loss": 5.0793, + "step": 18024 + }, + { + "epoch": 0.10719978113997526, + "grad_norm": 1.5108927488327026, + "learning_rate": 4.8595760616257056e-05, + "loss": 5.1067, + "step": 18025 + }, + { + "epoch": 0.10720572842325625, + "grad_norm": 1.2342565059661865, + "learning_rate": 4.859560626866689e-05, + "loss": 5.0298, + "step": 18026 + }, + { + "epoch": 0.10721167570653725, + "grad_norm": 1.2821179628372192, + "learning_rate": 4.859545191283974e-05, + "loss": 5.2185, + "step": 18027 + }, + { + "epoch": 0.10721762298981825, + "grad_norm": 1.11893630027771, + "learning_rate": 4.859529754877566e-05, + "loss": 5.1911, + "step": 18028 + }, + { + "epoch": 0.10722357027309924, + "grad_norm": 1.2202814817428589, + "learning_rate": 4.859514317647471e-05, + "loss": 5.028, + "step": 18029 + }, + { + "epoch": 0.10722951755638024, + "grad_norm": 1.3898543119430542, + "learning_rate": 4.859498879593694e-05, + "loss": 5.4019, + "step": 18030 + }, + { + "epoch": 0.10723546483966125, + "grad_norm": 1.2810478210449219, + "learning_rate": 4.859483440716239e-05, + "loss": 5.0634, + "step": 18031 + }, + { + "epoch": 0.10724141212294223, + "grad_norm": 1.4424680471420288, + "learning_rate": 4.859468001015114e-05, + "loss": 5.0058, + "step": 18032 + }, + { + "epoch": 0.10724735940622324, + "grad_norm": 1.4053739309310913, + "learning_rate": 4.859452560490323e-05, + "loss": 5.0174, + "step": 18033 + }, + { + "epoch": 0.10725330668950424, + "grad_norm": 1.2552763223648071, + "learning_rate": 4.859437119141871e-05, + "loss": 5.0222, + "step": 18034 + }, + { + "epoch": 0.10725925397278523, + "grad_norm": 1.3694052696228027, + "learning_rate": 4.859421676969764e-05, + "loss": 4.9663, + "step": 18035 + }, + { + "epoch": 0.10726520125606623, + "grad_norm": 1.3814043998718262, + "learning_rate": 4.859406233974007e-05, + "loss": 5.01, + "step": 18036 + }, + { + "epoch": 0.10727114853934723, + "grad_norm": 1.5185308456420898, + "learning_rate": 4.859390790154606e-05, + "loss": 4.9698, + "step": 18037 + }, + { + "epoch": 0.10727709582262822, + "grad_norm": 1.2509820461273193, + "learning_rate": 4.859375345511566e-05, + "loss": 5.1034, + "step": 18038 + }, + { + "epoch": 0.10728304310590922, + "grad_norm": 1.3478872776031494, + "learning_rate": 4.8593599000448926e-05, + "loss": 5.2459, + "step": 18039 + }, + { + "epoch": 0.10728899038919022, + "grad_norm": 1.3720686435699463, + "learning_rate": 4.859344453754591e-05, + "loss": 5.1671, + "step": 18040 + }, + { + "epoch": 0.10729493767247121, + "grad_norm": 1.3953602313995361, + "learning_rate": 4.859329006640666e-05, + "loss": 5.3221, + "step": 18041 + }, + { + "epoch": 0.10730088495575221, + "grad_norm": 1.4901010990142822, + "learning_rate": 4.859313558703125e-05, + "loss": 5.1694, + "step": 18042 + }, + { + "epoch": 0.10730683223903321, + "grad_norm": 1.4153228998184204, + "learning_rate": 4.859298109941971e-05, + "loss": 5.2721, + "step": 18043 + }, + { + "epoch": 0.1073127795223142, + "grad_norm": 1.34188711643219, + "learning_rate": 4.859282660357211e-05, + "loss": 5.3048, + "step": 18044 + }, + { + "epoch": 0.1073187268055952, + "grad_norm": 1.355832576751709, + "learning_rate": 4.859267209948849e-05, + "loss": 5.2908, + "step": 18045 + }, + { + "epoch": 0.1073246740888762, + "grad_norm": 1.1551882028579712, + "learning_rate": 4.859251758716891e-05, + "loss": 5.1681, + "step": 18046 + }, + { + "epoch": 0.1073306213721572, + "grad_norm": 1.1728358268737793, + "learning_rate": 4.8592363066613434e-05, + "loss": 5.1535, + "step": 18047 + }, + { + "epoch": 0.1073365686554382, + "grad_norm": 1.4180268049240112, + "learning_rate": 4.859220853782211e-05, + "loss": 4.6467, + "step": 18048 + }, + { + "epoch": 0.1073425159387192, + "grad_norm": 1.4042308330535889, + "learning_rate": 4.8592054000794984e-05, + "loss": 4.7348, + "step": 18049 + }, + { + "epoch": 0.10734846322200019, + "grad_norm": 1.2508533000946045, + "learning_rate": 4.859189945553211e-05, + "loss": 4.7797, + "step": 18050 + }, + { + "epoch": 0.10735441050528119, + "grad_norm": 1.2266274690628052, + "learning_rate": 4.859174490203355e-05, + "loss": 4.7223, + "step": 18051 + }, + { + "epoch": 0.10736035778856219, + "grad_norm": 1.3217378854751587, + "learning_rate": 4.8591590340299366e-05, + "loss": 4.82, + "step": 18052 + }, + { + "epoch": 0.10736630507184318, + "grad_norm": 1.3789056539535522, + "learning_rate": 4.8591435770329594e-05, + "loss": 5.3133, + "step": 18053 + }, + { + "epoch": 0.10737225235512418, + "grad_norm": 1.6090314388275146, + "learning_rate": 4.85912811921243e-05, + "loss": 5.2263, + "step": 18054 + }, + { + "epoch": 0.10737819963840518, + "grad_norm": 1.3780972957611084, + "learning_rate": 4.859112660568353e-05, + "loss": 5.3081, + "step": 18055 + }, + { + "epoch": 0.10738414692168617, + "grad_norm": 1.3518953323364258, + "learning_rate": 4.859097201100734e-05, + "loss": 5.3423, + "step": 18056 + }, + { + "epoch": 0.10739009420496717, + "grad_norm": 1.4160034656524658, + "learning_rate": 4.859081740809579e-05, + "loss": 5.3082, + "step": 18057 + }, + { + "epoch": 0.10739604148824816, + "grad_norm": 1.1970654726028442, + "learning_rate": 4.8590662796948924e-05, + "loss": 5.254, + "step": 18058 + }, + { + "epoch": 0.10740198877152916, + "grad_norm": 1.3175582885742188, + "learning_rate": 4.859050817756681e-05, + "loss": 5.2823, + "step": 18059 + }, + { + "epoch": 0.10740793605481017, + "grad_norm": 1.5136942863464355, + "learning_rate": 4.859035354994948e-05, + "loss": 5.2238, + "step": 18060 + }, + { + "epoch": 0.10741388333809115, + "grad_norm": 1.2552412748336792, + "learning_rate": 4.859019891409701e-05, + "loss": 5.0492, + "step": 18061 + }, + { + "epoch": 0.10741983062137216, + "grad_norm": 1.2873655557632446, + "learning_rate": 4.859004427000945e-05, + "loss": 4.9162, + "step": 18062 + }, + { + "epoch": 0.10742577790465316, + "grad_norm": 1.2441788911819458, + "learning_rate": 4.8589889617686834e-05, + "loss": 4.9769, + "step": 18063 + }, + { + "epoch": 0.10743172518793415, + "grad_norm": 1.4254180192947388, + "learning_rate": 4.8589734957129246e-05, + "loss": 4.9917, + "step": 18064 + }, + { + "epoch": 0.10743767247121515, + "grad_norm": 1.3922675848007202, + "learning_rate": 4.858958028833672e-05, + "loss": 4.9705, + "step": 18065 + }, + { + "epoch": 0.10744361975449615, + "grad_norm": 1.430801510810852, + "learning_rate": 4.858942561130932e-05, + "loss": 5.0772, + "step": 18066 + }, + { + "epoch": 0.10744956703777714, + "grad_norm": 1.3651894330978394, + "learning_rate": 4.8589270926047085e-05, + "loss": 4.8844, + "step": 18067 + }, + { + "epoch": 0.10745551432105814, + "grad_norm": 1.4133042097091675, + "learning_rate": 4.858911623255008e-05, + "loss": 4.9397, + "step": 18068 + }, + { + "epoch": 0.10746146160433914, + "grad_norm": 1.4437615871429443, + "learning_rate": 4.858896153081837e-05, + "loss": 4.9977, + "step": 18069 + }, + { + "epoch": 0.10746740888762013, + "grad_norm": 1.3420813083648682, + "learning_rate": 4.858880682085199e-05, + "loss": 4.9295, + "step": 18070 + }, + { + "epoch": 0.10747335617090113, + "grad_norm": 1.2613091468811035, + "learning_rate": 4.8588652102651e-05, + "loss": 5.3186, + "step": 18071 + }, + { + "epoch": 0.10747930345418213, + "grad_norm": 1.2117836475372314, + "learning_rate": 4.858849737621545e-05, + "loss": 5.207, + "step": 18072 + }, + { + "epoch": 0.10748525073746312, + "grad_norm": 1.3153164386749268, + "learning_rate": 4.85883426415454e-05, + "loss": 4.9786, + "step": 18073 + }, + { + "epoch": 0.10749119802074412, + "grad_norm": 1.2437881231307983, + "learning_rate": 4.858818789864091e-05, + "loss": 4.8748, + "step": 18074 + }, + { + "epoch": 0.10749714530402513, + "grad_norm": 1.2477847337722778, + "learning_rate": 4.858803314750203e-05, + "loss": 4.8874, + "step": 18075 + }, + { + "epoch": 0.10750309258730611, + "grad_norm": 1.342822790145874, + "learning_rate": 4.858787838812881e-05, + "loss": 4.8244, + "step": 18076 + }, + { + "epoch": 0.10750903987058712, + "grad_norm": 1.4947394132614136, + "learning_rate": 4.8587723620521306e-05, + "loss": 4.9091, + "step": 18077 + }, + { + "epoch": 0.10751498715386812, + "grad_norm": 1.388978362083435, + "learning_rate": 4.8587568844679566e-05, + "loss": 4.9075, + "step": 18078 + }, + { + "epoch": 0.10752093443714911, + "grad_norm": 1.5932878255844116, + "learning_rate": 4.8587414060603656e-05, + "loss": 4.8712, + "step": 18079 + }, + { + "epoch": 0.10752688172043011, + "grad_norm": 1.3746308088302612, + "learning_rate": 4.8587259268293616e-05, + "loss": 4.9187, + "step": 18080 + }, + { + "epoch": 0.10753282900371111, + "grad_norm": 1.2811295986175537, + "learning_rate": 4.858710446774951e-05, + "loss": 4.8643, + "step": 18081 + }, + { + "epoch": 0.1075387762869921, + "grad_norm": 1.4154548645019531, + "learning_rate": 4.858694965897139e-05, + "loss": 4.8802, + "step": 18082 + }, + { + "epoch": 0.1075447235702731, + "grad_norm": 1.3216148614883423, + "learning_rate": 4.8586794841959305e-05, + "loss": 5.0356, + "step": 18083 + }, + { + "epoch": 0.1075506708535541, + "grad_norm": 1.0971577167510986, + "learning_rate": 4.858664001671332e-05, + "loss": 5.2085, + "step": 18084 + }, + { + "epoch": 0.10755661813683509, + "grad_norm": 1.3257287740707397, + "learning_rate": 4.858648518323348e-05, + "loss": 5.1728, + "step": 18085 + }, + { + "epoch": 0.1075625654201161, + "grad_norm": 1.2429475784301758, + "learning_rate": 4.858633034151985e-05, + "loss": 5.1053, + "step": 18086 + }, + { + "epoch": 0.10756851270339708, + "grad_norm": 1.1196707487106323, + "learning_rate": 4.858617549157246e-05, + "loss": 5.074, + "step": 18087 + }, + { + "epoch": 0.10757445998667808, + "grad_norm": 1.1981266736984253, + "learning_rate": 4.858602063339139e-05, + "loss": 5.0093, + "step": 18088 + }, + { + "epoch": 0.10758040726995909, + "grad_norm": 1.3818682432174683, + "learning_rate": 4.858586576697668e-05, + "loss": 5.0184, + "step": 18089 + }, + { + "epoch": 0.10758635455324007, + "grad_norm": 1.303539752960205, + "learning_rate": 4.85857108923284e-05, + "loss": 5.1778, + "step": 18090 + }, + { + "epoch": 0.10759230183652108, + "grad_norm": 1.3990812301635742, + "learning_rate": 4.8585556009446576e-05, + "loss": 4.9785, + "step": 18091 + }, + { + "epoch": 0.10759824911980208, + "grad_norm": 1.2507104873657227, + "learning_rate": 4.858540111833129e-05, + "loss": 4.9024, + "step": 18092 + }, + { + "epoch": 0.10760419640308307, + "grad_norm": 1.2867792844772339, + "learning_rate": 4.858524621898257e-05, + "loss": 4.8847, + "step": 18093 + }, + { + "epoch": 0.10761014368636407, + "grad_norm": 1.1816591024398804, + "learning_rate": 4.8585091311400495e-05, + "loss": 4.9431, + "step": 18094 + }, + { + "epoch": 0.10761609096964507, + "grad_norm": 1.292284607887268, + "learning_rate": 4.85849363955851e-05, + "loss": 5.2273, + "step": 18095 + }, + { + "epoch": 0.10762203825292606, + "grad_norm": 1.3242478370666504, + "learning_rate": 4.8584781471536456e-05, + "loss": 5.093, + "step": 18096 + }, + { + "epoch": 0.10762798553620706, + "grad_norm": 1.211534857749939, + "learning_rate": 4.858462653925461e-05, + "loss": 5.0928, + "step": 18097 + }, + { + "epoch": 0.10763393281948806, + "grad_norm": 1.0469262599945068, + "learning_rate": 4.858447159873961e-05, + "loss": 5.0435, + "step": 18098 + }, + { + "epoch": 0.10763988010276905, + "grad_norm": 1.2352322340011597, + "learning_rate": 4.8584316649991514e-05, + "loss": 5.1899, + "step": 18099 + }, + { + "epoch": 0.10764582738605005, + "grad_norm": 1.2135246992111206, + "learning_rate": 4.8584161693010375e-05, + "loss": 5.1028, + "step": 18100 + }, + { + "epoch": 0.10765177466933105, + "grad_norm": 1.3525876998901367, + "learning_rate": 4.858400672779625e-05, + "loss": 5.0422, + "step": 18101 + }, + { + "epoch": 0.10765772195261204, + "grad_norm": 1.3221076726913452, + "learning_rate": 4.85838517543492e-05, + "loss": 5.1329, + "step": 18102 + }, + { + "epoch": 0.10766366923589304, + "grad_norm": 1.4856393337249756, + "learning_rate": 4.858369677266926e-05, + "loss": 4.6795, + "step": 18103 + }, + { + "epoch": 0.10766961651917405, + "grad_norm": 1.4690982103347778, + "learning_rate": 4.8583541782756495e-05, + "loss": 5.1234, + "step": 18104 + }, + { + "epoch": 0.10767556380245503, + "grad_norm": 1.2535064220428467, + "learning_rate": 4.8583386784610964e-05, + "loss": 5.1344, + "step": 18105 + }, + { + "epoch": 0.10768151108573604, + "grad_norm": 1.3537837266921997, + "learning_rate": 4.858323177823272e-05, + "loss": 5.228, + "step": 18106 + }, + { + "epoch": 0.10768745836901704, + "grad_norm": 1.2927895784378052, + "learning_rate": 4.8583076763621805e-05, + "loss": 5.2371, + "step": 18107 + }, + { + "epoch": 0.10769340565229803, + "grad_norm": 1.2356709241867065, + "learning_rate": 4.8582921740778284e-05, + "loss": 4.9056, + "step": 18108 + }, + { + "epoch": 0.10769935293557903, + "grad_norm": 1.266918420791626, + "learning_rate": 4.858276670970221e-05, + "loss": 5.2142, + "step": 18109 + }, + { + "epoch": 0.10770530021886003, + "grad_norm": 1.1703591346740723, + "learning_rate": 4.858261167039364e-05, + "loss": 5.1237, + "step": 18110 + }, + { + "epoch": 0.10771124750214102, + "grad_norm": 1.2324700355529785, + "learning_rate": 4.858245662285262e-05, + "loss": 5.1391, + "step": 18111 + }, + { + "epoch": 0.10771719478542202, + "grad_norm": 1.2764140367507935, + "learning_rate": 4.85823015670792e-05, + "loss": 5.1368, + "step": 18112 + }, + { + "epoch": 0.10772314206870302, + "grad_norm": 1.254909634590149, + "learning_rate": 4.8582146503073456e-05, + "loss": 5.002, + "step": 18113 + }, + { + "epoch": 0.10772908935198401, + "grad_norm": 1.3368279933929443, + "learning_rate": 4.858199143083542e-05, + "loss": 5.1365, + "step": 18114 + }, + { + "epoch": 0.10773503663526501, + "grad_norm": 1.3550091981887817, + "learning_rate": 4.8581836350365165e-05, + "loss": 5.1722, + "step": 18115 + }, + { + "epoch": 0.107740983918546, + "grad_norm": 1.6306661367416382, + "learning_rate": 4.858168126166272e-05, + "loss": 5.0883, + "step": 18116 + }, + { + "epoch": 0.107746931201827, + "grad_norm": 1.5143946409225464, + "learning_rate": 4.858152616472816e-05, + "loss": 5.1258, + "step": 18117 + }, + { + "epoch": 0.107752878485108, + "grad_norm": 1.6553763151168823, + "learning_rate": 4.858137105956153e-05, + "loss": 4.9596, + "step": 18118 + }, + { + "epoch": 0.107758825768389, + "grad_norm": 1.920473337173462, + "learning_rate": 4.8581215946162896e-05, + "loss": 5.2206, + "step": 18119 + }, + { + "epoch": 0.10776477305167, + "grad_norm": 1.8482425212860107, + "learning_rate": 4.85810608245323e-05, + "loss": 5.1515, + "step": 18120 + }, + { + "epoch": 0.107770720334951, + "grad_norm": 1.6005665063858032, + "learning_rate": 4.8580905694669794e-05, + "loss": 5.1383, + "step": 18121 + }, + { + "epoch": 0.10777666761823199, + "grad_norm": 1.2169783115386963, + "learning_rate": 4.858075055657544e-05, + "loss": 5.3538, + "step": 18122 + }, + { + "epoch": 0.10778261490151299, + "grad_norm": 1.3251442909240723, + "learning_rate": 4.858059541024929e-05, + "loss": 5.3116, + "step": 18123 + }, + { + "epoch": 0.10778856218479399, + "grad_norm": 1.2065789699554443, + "learning_rate": 4.858044025569139e-05, + "loss": 5.2334, + "step": 18124 + }, + { + "epoch": 0.10779450946807498, + "grad_norm": 1.5847411155700684, + "learning_rate": 4.858028509290181e-05, + "loss": 4.9114, + "step": 18125 + }, + { + "epoch": 0.10780045675135598, + "grad_norm": 1.373826503753662, + "learning_rate": 4.85801299218806e-05, + "loss": 5.0748, + "step": 18126 + }, + { + "epoch": 0.10780640403463698, + "grad_norm": 1.7349494695663452, + "learning_rate": 4.85799747426278e-05, + "loss": 5.0888, + "step": 18127 + }, + { + "epoch": 0.10781235131791797, + "grad_norm": 1.3385915756225586, + "learning_rate": 4.857981955514349e-05, + "loss": 5.1472, + "step": 18128 + }, + { + "epoch": 0.10781829860119897, + "grad_norm": 1.3666753768920898, + "learning_rate": 4.857966435942769e-05, + "loss": 5.0881, + "step": 18129 + }, + { + "epoch": 0.10782424588447997, + "grad_norm": 1.39078688621521, + "learning_rate": 4.857950915548048e-05, + "loss": 5.3867, + "step": 18130 + }, + { + "epoch": 0.10783019316776096, + "grad_norm": 1.4484905004501343, + "learning_rate": 4.857935394330192e-05, + "loss": 5.0516, + "step": 18131 + }, + { + "epoch": 0.10783614045104196, + "grad_norm": 1.526084542274475, + "learning_rate": 4.8579198722892034e-05, + "loss": 5.0424, + "step": 18132 + }, + { + "epoch": 0.10784208773432297, + "grad_norm": 1.4617003202438354, + "learning_rate": 4.8579043494250895e-05, + "loss": 5.0245, + "step": 18133 + }, + { + "epoch": 0.10784803501760395, + "grad_norm": 1.3335559368133545, + "learning_rate": 4.857888825737856e-05, + "loss": 4.9398, + "step": 18134 + }, + { + "epoch": 0.10785398230088496, + "grad_norm": 1.1473711729049683, + "learning_rate": 4.857873301227508e-05, + "loss": 5.1818, + "step": 18135 + }, + { + "epoch": 0.10785992958416596, + "grad_norm": 1.5986409187316895, + "learning_rate": 4.8578577758940504e-05, + "loss": 5.3518, + "step": 18136 + }, + { + "epoch": 0.10786587686744695, + "grad_norm": 1.6430408954620361, + "learning_rate": 4.857842249737489e-05, + "loss": 5.3052, + "step": 18137 + }, + { + "epoch": 0.10787182415072795, + "grad_norm": 1.5069605112075806, + "learning_rate": 4.8578267227578303e-05, + "loss": 5.3491, + "step": 18138 + }, + { + "epoch": 0.10787777143400895, + "grad_norm": 1.3385566473007202, + "learning_rate": 4.857811194955077e-05, + "loss": 5.3864, + "step": 18139 + }, + { + "epoch": 0.10788371871728994, + "grad_norm": 1.1956936120986938, + "learning_rate": 4.857795666329237e-05, + "loss": 5.1304, + "step": 18140 + }, + { + "epoch": 0.10788966600057094, + "grad_norm": 1.3437196016311646, + "learning_rate": 4.857780136880315e-05, + "loss": 5.1872, + "step": 18141 + }, + { + "epoch": 0.10789561328385194, + "grad_norm": 1.4649217128753662, + "learning_rate": 4.857764606608316e-05, + "loss": 5.4178, + "step": 18142 + }, + { + "epoch": 0.10790156056713293, + "grad_norm": 1.2196028232574463, + "learning_rate": 4.857749075513246e-05, + "loss": 5.1782, + "step": 18143 + }, + { + "epoch": 0.10790750785041393, + "grad_norm": 1.2016780376434326, + "learning_rate": 4.8577335435951096e-05, + "loss": 5.2293, + "step": 18144 + }, + { + "epoch": 0.10791345513369492, + "grad_norm": 1.3034183979034424, + "learning_rate": 4.857718010853914e-05, + "loss": 5.2886, + "step": 18145 + }, + { + "epoch": 0.10791940241697592, + "grad_norm": 1.1815390586853027, + "learning_rate": 4.857702477289663e-05, + "loss": 5.2637, + "step": 18146 + }, + { + "epoch": 0.10792534970025693, + "grad_norm": 1.328203558921814, + "learning_rate": 4.857686942902362e-05, + "loss": 5.3154, + "step": 18147 + }, + { + "epoch": 0.10793129698353791, + "grad_norm": 1.2995961904525757, + "learning_rate": 4.857671407692016e-05, + "loss": 5.3313, + "step": 18148 + }, + { + "epoch": 0.10793724426681892, + "grad_norm": 1.181191325187683, + "learning_rate": 4.8576558716586326e-05, + "loss": 5.2589, + "step": 18149 + }, + { + "epoch": 0.10794319155009992, + "grad_norm": 1.266570806503296, + "learning_rate": 4.8576403348022154e-05, + "loss": 5.1694, + "step": 18150 + }, + { + "epoch": 0.1079491388333809, + "grad_norm": 1.4107643365859985, + "learning_rate": 4.857624797122771e-05, + "loss": 5.1784, + "step": 18151 + }, + { + "epoch": 0.10795508611666191, + "grad_norm": 1.1809200048446655, + "learning_rate": 4.8576092586203024e-05, + "loss": 5.3081, + "step": 18152 + }, + { + "epoch": 0.10796103339994291, + "grad_norm": 1.179453730583191, + "learning_rate": 4.857593719294818e-05, + "loss": 5.2534, + "step": 18153 + }, + { + "epoch": 0.1079669806832239, + "grad_norm": 1.3677690029144287, + "learning_rate": 4.857578179146323e-05, + "loss": 5.4021, + "step": 18154 + }, + { + "epoch": 0.1079729279665049, + "grad_norm": 1.3077856302261353, + "learning_rate": 4.8575626381748196e-05, + "loss": 5.1766, + "step": 18155 + }, + { + "epoch": 0.1079788752497859, + "grad_norm": 1.075791835784912, + "learning_rate": 4.857547096380317e-05, + "loss": 5.163, + "step": 18156 + }, + { + "epoch": 0.10798482253306689, + "grad_norm": 1.2855931520462036, + "learning_rate": 4.8575315537628186e-05, + "loss": 5.157, + "step": 18157 + }, + { + "epoch": 0.10799076981634789, + "grad_norm": 1.1961009502410889, + "learning_rate": 4.8575160103223303e-05, + "loss": 5.1632, + "step": 18158 + }, + { + "epoch": 0.1079967170996289, + "grad_norm": 1.6419997215270996, + "learning_rate": 4.8575004660588574e-05, + "loss": 5.1575, + "step": 18159 + }, + { + "epoch": 0.10800266438290988, + "grad_norm": 1.5928575992584229, + "learning_rate": 4.857484920972405e-05, + "loss": 5.0818, + "step": 18160 + }, + { + "epoch": 0.10800861166619088, + "grad_norm": 1.3492580652236938, + "learning_rate": 4.85746937506298e-05, + "loss": 5.1529, + "step": 18161 + }, + { + "epoch": 0.10801455894947189, + "grad_norm": 1.543717861175537, + "learning_rate": 4.857453828330587e-05, + "loss": 5.6192, + "step": 18162 + }, + { + "epoch": 0.10802050623275287, + "grad_norm": 1.5657880306243896, + "learning_rate": 4.85743828077523e-05, + "loss": 5.6619, + "step": 18163 + }, + { + "epoch": 0.10802645351603388, + "grad_norm": 1.3861533403396606, + "learning_rate": 4.8574227323969164e-05, + "loss": 5.2147, + "step": 18164 + }, + { + "epoch": 0.10803240079931488, + "grad_norm": 1.3780323266983032, + "learning_rate": 4.85740718319565e-05, + "loss": 5.1112, + "step": 18165 + }, + { + "epoch": 0.10803834808259587, + "grad_norm": 1.5768086910247803, + "learning_rate": 4.857391633171438e-05, + "loss": 5.011, + "step": 18166 + }, + { + "epoch": 0.10804429536587687, + "grad_norm": 1.4504894018173218, + "learning_rate": 4.857376082324285e-05, + "loss": 4.9349, + "step": 18167 + }, + { + "epoch": 0.10805024264915787, + "grad_norm": 1.5084949731826782, + "learning_rate": 4.857360530654196e-05, + "loss": 4.9861, + "step": 18168 + }, + { + "epoch": 0.10805618993243886, + "grad_norm": 1.4052237272262573, + "learning_rate": 4.857344978161177e-05, + "loss": 5.0447, + "step": 18169 + }, + { + "epoch": 0.10806213721571986, + "grad_norm": 1.5666663646697998, + "learning_rate": 4.857329424845233e-05, + "loss": 5.3537, + "step": 18170 + }, + { + "epoch": 0.10806808449900086, + "grad_norm": 1.251293420791626, + "learning_rate": 4.8573138707063695e-05, + "loss": 5.0139, + "step": 18171 + }, + { + "epoch": 0.10807403178228185, + "grad_norm": 1.2570216655731201, + "learning_rate": 4.8572983157445926e-05, + "loss": 4.9959, + "step": 18172 + }, + { + "epoch": 0.10807997906556285, + "grad_norm": 1.5116729736328125, + "learning_rate": 4.857282759959907e-05, + "loss": 5.1592, + "step": 18173 + }, + { + "epoch": 0.10808592634884384, + "grad_norm": 1.518898367881775, + "learning_rate": 4.857267203352318e-05, + "loss": 5.3541, + "step": 18174 + }, + { + "epoch": 0.10809187363212484, + "grad_norm": 1.314247965812683, + "learning_rate": 4.857251645921832e-05, + "loss": 5.2249, + "step": 18175 + }, + { + "epoch": 0.10809782091540585, + "grad_norm": 1.378150224685669, + "learning_rate": 4.857236087668453e-05, + "loss": 5.0004, + "step": 18176 + }, + { + "epoch": 0.10810376819868683, + "grad_norm": 1.4453868865966797, + "learning_rate": 4.8572205285921876e-05, + "loss": 5.2717, + "step": 18177 + }, + { + "epoch": 0.10810971548196784, + "grad_norm": 1.3493587970733643, + "learning_rate": 4.857204968693041e-05, + "loss": 5.4044, + "step": 18178 + }, + { + "epoch": 0.10811566276524884, + "grad_norm": 1.3819094896316528, + "learning_rate": 4.857189407971019e-05, + "loss": 5.0641, + "step": 18179 + }, + { + "epoch": 0.10812161004852983, + "grad_norm": 1.337969422340393, + "learning_rate": 4.857173846426126e-05, + "loss": 4.9078, + "step": 18180 + }, + { + "epoch": 0.10812755733181083, + "grad_norm": 1.655778408050537, + "learning_rate": 4.857158284058367e-05, + "loss": 4.9192, + "step": 18181 + }, + { + "epoch": 0.10813350461509183, + "grad_norm": 1.3867977857589722, + "learning_rate": 4.85714272086775e-05, + "loss": 4.86, + "step": 18182 + }, + { + "epoch": 0.10813945189837282, + "grad_norm": 1.5444231033325195, + "learning_rate": 4.8571271568542786e-05, + "loss": 4.9745, + "step": 18183 + }, + { + "epoch": 0.10814539918165382, + "grad_norm": 1.470123052597046, + "learning_rate": 4.8571115920179576e-05, + "loss": 5.1311, + "step": 18184 + }, + { + "epoch": 0.10815134646493482, + "grad_norm": 1.3052124977111816, + "learning_rate": 4.8570960263587936e-05, + "loss": 5.0657, + "step": 18185 + }, + { + "epoch": 0.10815729374821581, + "grad_norm": 1.4197286367416382, + "learning_rate": 4.857080459876792e-05, + "loss": 5.0798, + "step": 18186 + }, + { + "epoch": 0.10816324103149681, + "grad_norm": 1.5119234323501587, + "learning_rate": 4.857064892571958e-05, + "loss": 5.2842, + "step": 18187 + }, + { + "epoch": 0.10816918831477781, + "grad_norm": 1.6037629842758179, + "learning_rate": 4.8570493244442974e-05, + "loss": 4.8785, + "step": 18188 + }, + { + "epoch": 0.1081751355980588, + "grad_norm": 1.6456643342971802, + "learning_rate": 4.857033755493814e-05, + "loss": 5.2566, + "step": 18189 + }, + { + "epoch": 0.1081810828813398, + "grad_norm": 1.5777020454406738, + "learning_rate": 4.8570181857205155e-05, + "loss": 4.9856, + "step": 18190 + }, + { + "epoch": 0.1081870301646208, + "grad_norm": 1.6042171716690063, + "learning_rate": 4.857002615124405e-05, + "loss": 4.9179, + "step": 18191 + }, + { + "epoch": 0.1081929774479018, + "grad_norm": 1.2339718341827393, + "learning_rate": 4.856987043705491e-05, + "loss": 4.9144, + "step": 18192 + }, + { + "epoch": 0.1081989247311828, + "grad_norm": 1.4531115293502808, + "learning_rate": 4.856971471463776e-05, + "loss": 5.0296, + "step": 18193 + }, + { + "epoch": 0.1082048720144638, + "grad_norm": 1.4179781675338745, + "learning_rate": 4.856955898399267e-05, + "loss": 5.268, + "step": 18194 + }, + { + "epoch": 0.10821081929774479, + "grad_norm": 1.5291078090667725, + "learning_rate": 4.856940324511969e-05, + "loss": 5.2433, + "step": 18195 + }, + { + "epoch": 0.10821676658102579, + "grad_norm": 1.5799169540405273, + "learning_rate": 4.856924749801888e-05, + "loss": 5.1906, + "step": 18196 + }, + { + "epoch": 0.10822271386430679, + "grad_norm": 1.4068591594696045, + "learning_rate": 4.8569091742690276e-05, + "loss": 5.2152, + "step": 18197 + }, + { + "epoch": 0.10822866114758778, + "grad_norm": 1.3728901147842407, + "learning_rate": 4.8568935979133953e-05, + "loss": 5.1717, + "step": 18198 + }, + { + "epoch": 0.10823460843086878, + "grad_norm": 1.524344563484192, + "learning_rate": 4.856878020734996e-05, + "loss": 5.0635, + "step": 18199 + }, + { + "epoch": 0.10824055571414978, + "grad_norm": 1.4725397825241089, + "learning_rate": 4.856862442733835e-05, + "loss": 5.2382, + "step": 18200 + }, + { + "epoch": 0.10824650299743077, + "grad_norm": 1.3467813730239868, + "learning_rate": 4.856846863909917e-05, + "loss": 5.0823, + "step": 18201 + }, + { + "epoch": 0.10825245028071177, + "grad_norm": 1.264833927154541, + "learning_rate": 4.856831284263249e-05, + "loss": 5.1763, + "step": 18202 + }, + { + "epoch": 0.10825839756399276, + "grad_norm": 1.2883045673370361, + "learning_rate": 4.856815703793836e-05, + "loss": 5.1207, + "step": 18203 + }, + { + "epoch": 0.10826434484727376, + "grad_norm": 1.309486746788025, + "learning_rate": 4.856800122501681e-05, + "loss": 5.0648, + "step": 18204 + }, + { + "epoch": 0.10827029213055477, + "grad_norm": 1.4473057985305786, + "learning_rate": 4.856784540386793e-05, + "loss": 4.9615, + "step": 18205 + }, + { + "epoch": 0.10827623941383575, + "grad_norm": 1.5151125192642212, + "learning_rate": 4.856768957449175e-05, + "loss": 5.2847, + "step": 18206 + }, + { + "epoch": 0.10828218669711676, + "grad_norm": 1.4859318733215332, + "learning_rate": 4.8567533736888336e-05, + "loss": 4.931, + "step": 18207 + }, + { + "epoch": 0.10828813398039776, + "grad_norm": 1.6516517400741577, + "learning_rate": 4.8567377891057745e-05, + "loss": 5.05, + "step": 18208 + }, + { + "epoch": 0.10829408126367875, + "grad_norm": 1.679347276687622, + "learning_rate": 4.8567222037000024e-05, + "loss": 5.2281, + "step": 18209 + }, + { + "epoch": 0.10830002854695975, + "grad_norm": 1.5119515657424927, + "learning_rate": 4.856706617471523e-05, + "loss": 4.9572, + "step": 18210 + }, + { + "epoch": 0.10830597583024075, + "grad_norm": 1.6819381713867188, + "learning_rate": 4.8566910304203404e-05, + "loss": 4.6228, + "step": 18211 + }, + { + "epoch": 0.10831192311352174, + "grad_norm": 1.7754294872283936, + "learning_rate": 4.856675442546462e-05, + "loss": 4.6851, + "step": 18212 + }, + { + "epoch": 0.10831787039680274, + "grad_norm": 1.455660343170166, + "learning_rate": 4.856659853849893e-05, + "loss": 5.059, + "step": 18213 + }, + { + "epoch": 0.10832381768008374, + "grad_norm": 1.358823299407959, + "learning_rate": 4.856644264330639e-05, + "loss": 5.0354, + "step": 18214 + }, + { + "epoch": 0.10832976496336473, + "grad_norm": 1.465482473373413, + "learning_rate": 4.856628673988703e-05, + "loss": 5.0441, + "step": 18215 + }, + { + "epoch": 0.10833571224664573, + "grad_norm": 1.3863260746002197, + "learning_rate": 4.8566130828240936e-05, + "loss": 5.0445, + "step": 18216 + }, + { + "epoch": 0.10834165952992673, + "grad_norm": 1.556997299194336, + "learning_rate": 4.856597490836815e-05, + "loss": 5.0629, + "step": 18217 + }, + { + "epoch": 0.10834760681320772, + "grad_norm": 1.3784066438674927, + "learning_rate": 4.856581898026872e-05, + "loss": 5.1894, + "step": 18218 + }, + { + "epoch": 0.10835355409648872, + "grad_norm": 1.4675719738006592, + "learning_rate": 4.856566304394271e-05, + "loss": 5.008, + "step": 18219 + }, + { + "epoch": 0.10835950137976973, + "grad_norm": 1.634920597076416, + "learning_rate": 4.856550709939016e-05, + "loss": 4.7707, + "step": 18220 + }, + { + "epoch": 0.10836544866305071, + "grad_norm": 1.83092200756073, + "learning_rate": 4.856535114661115e-05, + "loss": 4.8947, + "step": 18221 + }, + { + "epoch": 0.10837139594633172, + "grad_norm": 1.497359037399292, + "learning_rate": 4.856519518560571e-05, + "loss": 4.9656, + "step": 18222 + }, + { + "epoch": 0.10837734322961272, + "grad_norm": 1.3194255828857422, + "learning_rate": 4.856503921637391e-05, + "loss": 5.2374, + "step": 18223 + }, + { + "epoch": 0.1083832905128937, + "grad_norm": 1.3584619760513306, + "learning_rate": 4.8564883238915794e-05, + "loss": 5.1154, + "step": 18224 + }, + { + "epoch": 0.10838923779617471, + "grad_norm": 1.4173928499221802, + "learning_rate": 4.8564727253231416e-05, + "loss": 5.173, + "step": 18225 + }, + { + "epoch": 0.10839518507945571, + "grad_norm": 1.4110074043273926, + "learning_rate": 4.8564571259320844e-05, + "loss": 5.2409, + "step": 18226 + }, + { + "epoch": 0.1084011323627367, + "grad_norm": 1.4481827020645142, + "learning_rate": 4.856441525718412e-05, + "loss": 4.8533, + "step": 18227 + }, + { + "epoch": 0.1084070796460177, + "grad_norm": 1.4017881155014038, + "learning_rate": 4.85642592468213e-05, + "loss": 5.0483, + "step": 18228 + }, + { + "epoch": 0.1084130269292987, + "grad_norm": 1.3940458297729492, + "learning_rate": 4.8564103228232445e-05, + "loss": 5.0983, + "step": 18229 + }, + { + "epoch": 0.10841897421257969, + "grad_norm": 1.4414485692977905, + "learning_rate": 4.8563947201417604e-05, + "loss": 5.1561, + "step": 18230 + }, + { + "epoch": 0.1084249214958607, + "grad_norm": 1.3622056245803833, + "learning_rate": 4.856379116637683e-05, + "loss": 5.1773, + "step": 18231 + }, + { + "epoch": 0.10843086877914168, + "grad_norm": 1.3298035860061646, + "learning_rate": 4.856363512311019e-05, + "loss": 5.0742, + "step": 18232 + }, + { + "epoch": 0.10843681606242268, + "grad_norm": 1.3110575675964355, + "learning_rate": 4.856347907161771e-05, + "loss": 5.044, + "step": 18233 + }, + { + "epoch": 0.10844276334570369, + "grad_norm": 1.309591293334961, + "learning_rate": 4.856332301189948e-05, + "loss": 5.1313, + "step": 18234 + }, + { + "epoch": 0.10844871062898467, + "grad_norm": 1.2283830642700195, + "learning_rate": 4.856316694395552e-05, + "loss": 5.0777, + "step": 18235 + }, + { + "epoch": 0.10845465791226568, + "grad_norm": 1.1523172855377197, + "learning_rate": 4.856301086778592e-05, + "loss": 5.1245, + "step": 18236 + }, + { + "epoch": 0.10846060519554668, + "grad_norm": 1.3058217763900757, + "learning_rate": 4.85628547833907e-05, + "loss": 4.9649, + "step": 18237 + }, + { + "epoch": 0.10846655247882767, + "grad_norm": 1.239734172821045, + "learning_rate": 4.856269869076994e-05, + "loss": 5.0736, + "step": 18238 + }, + { + "epoch": 0.10847249976210867, + "grad_norm": 1.2624062299728394, + "learning_rate": 4.856254258992369e-05, + "loss": 5.0538, + "step": 18239 + }, + { + "epoch": 0.10847844704538967, + "grad_norm": 1.2172342538833618, + "learning_rate": 4.856238648085199e-05, + "loss": 5.0781, + "step": 18240 + }, + { + "epoch": 0.10848439432867066, + "grad_norm": 1.2534043788909912, + "learning_rate": 4.8562230363554906e-05, + "loss": 5.2148, + "step": 18241 + }, + { + "epoch": 0.10849034161195166, + "grad_norm": 1.3765602111816406, + "learning_rate": 4.85620742380325e-05, + "loss": 5.1274, + "step": 18242 + }, + { + "epoch": 0.10849628889523266, + "grad_norm": 1.4610897302627563, + "learning_rate": 4.856191810428481e-05, + "loss": 5.0356, + "step": 18243 + }, + { + "epoch": 0.10850223617851365, + "grad_norm": 1.4103399515151978, + "learning_rate": 4.8561761962311895e-05, + "loss": 5.0198, + "step": 18244 + }, + { + "epoch": 0.10850818346179465, + "grad_norm": 1.5159040689468384, + "learning_rate": 4.856160581211382e-05, + "loss": 5.0139, + "step": 18245 + }, + { + "epoch": 0.10851413074507565, + "grad_norm": 1.5071041584014893, + "learning_rate": 4.856144965369063e-05, + "loss": 4.9644, + "step": 18246 + }, + { + "epoch": 0.10852007802835664, + "grad_norm": 1.4504464864730835, + "learning_rate": 4.856129348704237e-05, + "loss": 5.041, + "step": 18247 + }, + { + "epoch": 0.10852602531163764, + "grad_norm": 1.2327022552490234, + "learning_rate": 4.856113731216911e-05, + "loss": 4.9775, + "step": 18248 + }, + { + "epoch": 0.10853197259491865, + "grad_norm": 2.013401508331299, + "learning_rate": 4.8560981129070914e-05, + "loss": 4.5814, + "step": 18249 + }, + { + "epoch": 0.10853791987819963, + "grad_norm": 1.7224215269088745, + "learning_rate": 4.8560824937747814e-05, + "loss": 5.3439, + "step": 18250 + }, + { + "epoch": 0.10854386716148064, + "grad_norm": 1.6198631525039673, + "learning_rate": 4.856066873819987e-05, + "loss": 5.0878, + "step": 18251 + }, + { + "epoch": 0.10854981444476164, + "grad_norm": 1.3257763385772705, + "learning_rate": 4.8560512530427146e-05, + "loss": 5.4697, + "step": 18252 + }, + { + "epoch": 0.10855576172804263, + "grad_norm": 1.6341005563735962, + "learning_rate": 4.856035631442969e-05, + "loss": 5.1383, + "step": 18253 + }, + { + "epoch": 0.10856170901132363, + "grad_norm": 1.4148058891296387, + "learning_rate": 4.8560200090207555e-05, + "loss": 5.3053, + "step": 18254 + }, + { + "epoch": 0.10856765629460463, + "grad_norm": 1.4810155630111694, + "learning_rate": 4.8560043857760796e-05, + "loss": 5.1222, + "step": 18255 + }, + { + "epoch": 0.10857360357788562, + "grad_norm": 1.4345650672912598, + "learning_rate": 4.8559887617089476e-05, + "loss": 5.2331, + "step": 18256 + }, + { + "epoch": 0.10857955086116662, + "grad_norm": 1.7319680452346802, + "learning_rate": 4.855973136819363e-05, + "loss": 4.6762, + "step": 18257 + }, + { + "epoch": 0.10858549814444762, + "grad_norm": 1.3632503747940063, + "learning_rate": 4.855957511107333e-05, + "loss": 4.8047, + "step": 18258 + }, + { + "epoch": 0.10859144542772861, + "grad_norm": 1.2798017263412476, + "learning_rate": 4.8559418845728636e-05, + "loss": 4.9368, + "step": 18259 + }, + { + "epoch": 0.10859739271100961, + "grad_norm": 1.539689540863037, + "learning_rate": 4.855926257215958e-05, + "loss": 4.8178, + "step": 18260 + }, + { + "epoch": 0.1086033399942906, + "grad_norm": 1.2351077795028687, + "learning_rate": 4.855910629036623e-05, + "loss": 5.0983, + "step": 18261 + }, + { + "epoch": 0.1086092872775716, + "grad_norm": 1.582154393196106, + "learning_rate": 4.855895000034865e-05, + "loss": 5.0563, + "step": 18262 + }, + { + "epoch": 0.1086152345608526, + "grad_norm": 1.3505899906158447, + "learning_rate": 4.855879370210688e-05, + "loss": 5.4024, + "step": 18263 + }, + { + "epoch": 0.1086211818441336, + "grad_norm": 1.236626148223877, + "learning_rate": 4.855863739564097e-05, + "loss": 5.4412, + "step": 18264 + }, + { + "epoch": 0.1086271291274146, + "grad_norm": 1.1207302808761597, + "learning_rate": 4.855848108095099e-05, + "loss": 5.3498, + "step": 18265 + }, + { + "epoch": 0.1086330764106956, + "grad_norm": 1.3238142728805542, + "learning_rate": 4.855832475803698e-05, + "loss": 4.9028, + "step": 18266 + }, + { + "epoch": 0.10863902369397659, + "grad_norm": 1.4837650060653687, + "learning_rate": 4.8558168426899006e-05, + "loss": 5.354, + "step": 18267 + }, + { + "epoch": 0.10864497097725759, + "grad_norm": 1.55657160282135, + "learning_rate": 4.8558012087537126e-05, + "loss": 5.4629, + "step": 18268 + }, + { + "epoch": 0.10865091826053859, + "grad_norm": 1.4918092489242554, + "learning_rate": 4.855785573995138e-05, + "loss": 5.046, + "step": 18269 + }, + { + "epoch": 0.10865686554381958, + "grad_norm": 1.5374544858932495, + "learning_rate": 4.855769938414183e-05, + "loss": 4.9571, + "step": 18270 + }, + { + "epoch": 0.10866281282710058, + "grad_norm": 1.360386610031128, + "learning_rate": 4.8557543020108537e-05, + "loss": 4.9482, + "step": 18271 + }, + { + "epoch": 0.10866876011038158, + "grad_norm": 1.2835793495178223, + "learning_rate": 4.855738664785154e-05, + "loss": 4.8301, + "step": 18272 + }, + { + "epoch": 0.10867470739366257, + "grad_norm": 1.453478217124939, + "learning_rate": 4.8557230267370915e-05, + "loss": 4.7873, + "step": 18273 + }, + { + "epoch": 0.10868065467694357, + "grad_norm": 1.4986752271652222, + "learning_rate": 4.855707387866669e-05, + "loss": 5.4533, + "step": 18274 + }, + { + "epoch": 0.10868660196022457, + "grad_norm": 1.574263572692871, + "learning_rate": 4.855691748173894e-05, + "loss": 5.0576, + "step": 18275 + }, + { + "epoch": 0.10869254924350556, + "grad_norm": 1.6014435291290283, + "learning_rate": 4.855676107658772e-05, + "loss": 4.8039, + "step": 18276 + }, + { + "epoch": 0.10869849652678656, + "grad_norm": 1.3822481632232666, + "learning_rate": 4.855660466321307e-05, + "loss": 4.9241, + "step": 18277 + }, + { + "epoch": 0.10870444381006757, + "grad_norm": 1.3199692964553833, + "learning_rate": 4.855644824161506e-05, + "loss": 4.842, + "step": 18278 + }, + { + "epoch": 0.10871039109334855, + "grad_norm": 1.340505599975586, + "learning_rate": 4.855629181179373e-05, + "loss": 4.8217, + "step": 18279 + }, + { + "epoch": 0.10871633837662956, + "grad_norm": 1.32645845413208, + "learning_rate": 4.8556135373749144e-05, + "loss": 4.9701, + "step": 18280 + }, + { + "epoch": 0.10872228565991056, + "grad_norm": 1.3629400730133057, + "learning_rate": 4.855597892748135e-05, + "loss": 5.2129, + "step": 18281 + }, + { + "epoch": 0.10872823294319155, + "grad_norm": 1.504604458808899, + "learning_rate": 4.8555822472990415e-05, + "loss": 4.988, + "step": 18282 + }, + { + "epoch": 0.10873418022647255, + "grad_norm": 1.514352560043335, + "learning_rate": 4.855566601027638e-05, + "loss": 4.8909, + "step": 18283 + }, + { + "epoch": 0.10874012750975355, + "grad_norm": 1.35514235496521, + "learning_rate": 4.85555095393393e-05, + "loss": 4.9441, + "step": 18284 + }, + { + "epoch": 0.10874607479303454, + "grad_norm": 1.1690728664398193, + "learning_rate": 4.8555353060179256e-05, + "loss": 5.3733, + "step": 18285 + }, + { + "epoch": 0.10875202207631554, + "grad_norm": 1.3280658721923828, + "learning_rate": 4.855519657279626e-05, + "loss": 5.4406, + "step": 18286 + }, + { + "epoch": 0.10875796935959654, + "grad_norm": 1.5852582454681396, + "learning_rate": 4.85550400771904e-05, + "loss": 5.176, + "step": 18287 + }, + { + "epoch": 0.10876391664287753, + "grad_norm": 1.233869194984436, + "learning_rate": 4.855488357336172e-05, + "loss": 5.2879, + "step": 18288 + }, + { + "epoch": 0.10876986392615853, + "grad_norm": 1.365251064300537, + "learning_rate": 4.855472706131027e-05, + "loss": 5.1592, + "step": 18289 + }, + { + "epoch": 0.10877581120943952, + "grad_norm": 1.6119641065597534, + "learning_rate": 4.8554570541036104e-05, + "loss": 5.0079, + "step": 18290 + }, + { + "epoch": 0.10878175849272052, + "grad_norm": 1.3233095407485962, + "learning_rate": 4.855441401253928e-05, + "loss": 5.3579, + "step": 18291 + }, + { + "epoch": 0.10878770577600153, + "grad_norm": 1.3345812559127808, + "learning_rate": 4.855425747581986e-05, + "loss": 5.1435, + "step": 18292 + }, + { + "epoch": 0.10879365305928251, + "grad_norm": 1.6694916486740112, + "learning_rate": 4.855410093087789e-05, + "loss": 5.0007, + "step": 18293 + }, + { + "epoch": 0.10879960034256352, + "grad_norm": 1.5835634469985962, + "learning_rate": 4.855394437771342e-05, + "loss": 4.9706, + "step": 18294 + }, + { + "epoch": 0.10880554762584452, + "grad_norm": 1.5465360879898071, + "learning_rate": 4.8553787816326526e-05, + "loss": 4.8983, + "step": 18295 + }, + { + "epoch": 0.1088114949091255, + "grad_norm": 1.4393326044082642, + "learning_rate": 4.855363124671723e-05, + "loss": 4.9365, + "step": 18296 + }, + { + "epoch": 0.10881744219240651, + "grad_norm": 1.5096935033798218, + "learning_rate": 4.8553474668885626e-05, + "loss": 4.8343, + "step": 18297 + }, + { + "epoch": 0.10882338947568751, + "grad_norm": 1.422397255897522, + "learning_rate": 4.8553318082831735e-05, + "loss": 4.9229, + "step": 18298 + }, + { + "epoch": 0.1088293367589685, + "grad_norm": 1.6444910764694214, + "learning_rate": 4.855316148855562e-05, + "loss": 5.0403, + "step": 18299 + }, + { + "epoch": 0.1088352840422495, + "grad_norm": 1.3621931076049805, + "learning_rate": 4.855300488605734e-05, + "loss": 4.9027, + "step": 18300 + }, + { + "epoch": 0.1088412313255305, + "grad_norm": 1.5086915493011475, + "learning_rate": 4.855284827533696e-05, + "loss": 4.95, + "step": 18301 + }, + { + "epoch": 0.10884717860881149, + "grad_norm": 1.7021756172180176, + "learning_rate": 4.855269165639451e-05, + "loss": 4.8245, + "step": 18302 + }, + { + "epoch": 0.10885312589209249, + "grad_norm": 1.6745699644088745, + "learning_rate": 4.855253502923007e-05, + "loss": 4.7832, + "step": 18303 + }, + { + "epoch": 0.1088590731753735, + "grad_norm": 1.2379045486450195, + "learning_rate": 4.8552378393843676e-05, + "loss": 5.0438, + "step": 18304 + }, + { + "epoch": 0.10886502045865448, + "grad_norm": 1.3999474048614502, + "learning_rate": 4.85522217502354e-05, + "loss": 5.0123, + "step": 18305 + }, + { + "epoch": 0.10887096774193548, + "grad_norm": 1.3539077043533325, + "learning_rate": 4.8552065098405276e-05, + "loss": 5.0722, + "step": 18306 + }, + { + "epoch": 0.10887691502521649, + "grad_norm": 1.3992128372192383, + "learning_rate": 4.8551908438353374e-05, + "loss": 4.9449, + "step": 18307 + }, + { + "epoch": 0.10888286230849747, + "grad_norm": 1.617443323135376, + "learning_rate": 4.8551751770079744e-05, + "loss": 5.1081, + "step": 18308 + }, + { + "epoch": 0.10888880959177848, + "grad_norm": 1.6027116775512695, + "learning_rate": 4.8551595093584446e-05, + "loss": 5.06, + "step": 18309 + }, + { + "epoch": 0.10889475687505948, + "grad_norm": 1.1488780975341797, + "learning_rate": 4.855143840886752e-05, + "loss": 5.1771, + "step": 18310 + }, + { + "epoch": 0.10890070415834047, + "grad_norm": 1.5683537721633911, + "learning_rate": 4.855128171592903e-05, + "loss": 5.1402, + "step": 18311 + }, + { + "epoch": 0.10890665144162147, + "grad_norm": 1.2840538024902344, + "learning_rate": 4.855112501476904e-05, + "loss": 5.2887, + "step": 18312 + }, + { + "epoch": 0.10891259872490247, + "grad_norm": 1.2311303615570068, + "learning_rate": 4.855096830538759e-05, + "loss": 5.2057, + "step": 18313 + }, + { + "epoch": 0.10891854600818346, + "grad_norm": 1.3655261993408203, + "learning_rate": 4.855081158778474e-05, + "loss": 5.3298, + "step": 18314 + }, + { + "epoch": 0.10892449329146446, + "grad_norm": 1.3405102491378784, + "learning_rate": 4.855065486196055e-05, + "loss": 5.3249, + "step": 18315 + }, + { + "epoch": 0.10893044057474546, + "grad_norm": 1.3816508054733276, + "learning_rate": 4.855049812791506e-05, + "loss": 5.2829, + "step": 18316 + }, + { + "epoch": 0.10893638785802645, + "grad_norm": 1.1929587125778198, + "learning_rate": 4.855034138564835e-05, + "loss": 5.5317, + "step": 18317 + }, + { + "epoch": 0.10894233514130745, + "grad_norm": 1.2426830530166626, + "learning_rate": 4.855018463516045e-05, + "loss": 5.263, + "step": 18318 + }, + { + "epoch": 0.10894828242458844, + "grad_norm": 1.3385604619979858, + "learning_rate": 4.855002787645141e-05, + "loss": 5.2531, + "step": 18319 + }, + { + "epoch": 0.10895422970786944, + "grad_norm": 1.2306677103042603, + "learning_rate": 4.8549871109521314e-05, + "loss": 5.245, + "step": 18320 + }, + { + "epoch": 0.10896017699115045, + "grad_norm": 1.3108047246932983, + "learning_rate": 4.85497143343702e-05, + "loss": 5.3063, + "step": 18321 + }, + { + "epoch": 0.10896612427443143, + "grad_norm": 1.3951044082641602, + "learning_rate": 4.8549557550998126e-05, + "loss": 5.4842, + "step": 18322 + }, + { + "epoch": 0.10897207155771244, + "grad_norm": 1.4618322849273682, + "learning_rate": 4.854940075940514e-05, + "loss": 5.5703, + "step": 18323 + }, + { + "epoch": 0.10897801884099344, + "grad_norm": 1.3512097597122192, + "learning_rate": 4.8549243959591304e-05, + "loss": 5.2615, + "step": 18324 + }, + { + "epoch": 0.10898396612427443, + "grad_norm": 1.261428713798523, + "learning_rate": 4.8549087151556675e-05, + "loss": 5.2617, + "step": 18325 + }, + { + "epoch": 0.10898991340755543, + "grad_norm": 1.5647974014282227, + "learning_rate": 4.854893033530129e-05, + "loss": 5.0529, + "step": 18326 + }, + { + "epoch": 0.10899586069083643, + "grad_norm": 1.3635188341140747, + "learning_rate": 4.8548773510825226e-05, + "loss": 5.1029, + "step": 18327 + }, + { + "epoch": 0.10900180797411742, + "grad_norm": 1.2746639251708984, + "learning_rate": 4.854861667812852e-05, + "loss": 5.1788, + "step": 18328 + }, + { + "epoch": 0.10900775525739842, + "grad_norm": 1.3292982578277588, + "learning_rate": 4.854845983721125e-05, + "loss": 5.2442, + "step": 18329 + }, + { + "epoch": 0.10901370254067942, + "grad_norm": 1.3015047311782837, + "learning_rate": 4.854830298807345e-05, + "loss": 5.2234, + "step": 18330 + }, + { + "epoch": 0.10901964982396041, + "grad_norm": 1.2642244100570679, + "learning_rate": 4.854814613071518e-05, + "loss": 5.1501, + "step": 18331 + }, + { + "epoch": 0.10902559710724141, + "grad_norm": 1.191630482673645, + "learning_rate": 4.8547989265136484e-05, + "loss": 5.1618, + "step": 18332 + }, + { + "epoch": 0.10903154439052241, + "grad_norm": 1.4171391725540161, + "learning_rate": 4.8547832391337445e-05, + "loss": 5.1431, + "step": 18333 + }, + { + "epoch": 0.1090374916738034, + "grad_norm": 1.3901907205581665, + "learning_rate": 4.854767550931809e-05, + "loss": 5.1464, + "step": 18334 + }, + { + "epoch": 0.1090434389570844, + "grad_norm": 1.5166548490524292, + "learning_rate": 4.854751861907849e-05, + "loss": 5.0841, + "step": 18335 + }, + { + "epoch": 0.1090493862403654, + "grad_norm": 1.3555935621261597, + "learning_rate": 4.854736172061869e-05, + "loss": 5.2947, + "step": 18336 + }, + { + "epoch": 0.1090553335236464, + "grad_norm": 1.1348215341567993, + "learning_rate": 4.854720481393875e-05, + "loss": 5.2813, + "step": 18337 + }, + { + "epoch": 0.1090612808069274, + "grad_norm": 1.3353219032287598, + "learning_rate": 4.8547047899038734e-05, + "loss": 5.2473, + "step": 18338 + }, + { + "epoch": 0.1090672280902084, + "grad_norm": 1.550512671470642, + "learning_rate": 4.854689097591868e-05, + "loss": 5.1364, + "step": 18339 + }, + { + "epoch": 0.10907317537348939, + "grad_norm": 1.5353589057922363, + "learning_rate": 4.8546734044578646e-05, + "loss": 5.0105, + "step": 18340 + }, + { + "epoch": 0.10907912265677039, + "grad_norm": 1.4025498628616333, + "learning_rate": 4.85465771050187e-05, + "loss": 5.0779, + "step": 18341 + }, + { + "epoch": 0.10908506994005139, + "grad_norm": 1.220438838005066, + "learning_rate": 4.8546420157238874e-05, + "loss": 5.0732, + "step": 18342 + }, + { + "epoch": 0.10909101722333238, + "grad_norm": 1.4058369398117065, + "learning_rate": 4.8546263201239245e-05, + "loss": 5.0838, + "step": 18343 + }, + { + "epoch": 0.10909696450661338, + "grad_norm": 1.4438905715942383, + "learning_rate": 4.854610623701986e-05, + "loss": 5.0449, + "step": 18344 + }, + { + "epoch": 0.10910291178989438, + "grad_norm": 1.536890983581543, + "learning_rate": 4.854594926458076e-05, + "loss": 4.9601, + "step": 18345 + }, + { + "epoch": 0.10910885907317537, + "grad_norm": 1.3566638231277466, + "learning_rate": 4.8545792283922025e-05, + "loss": 4.9283, + "step": 18346 + }, + { + "epoch": 0.10911480635645637, + "grad_norm": 1.3086943626403809, + "learning_rate": 4.8545635295043694e-05, + "loss": 5.0638, + "step": 18347 + }, + { + "epoch": 0.10912075363973736, + "grad_norm": 1.330124020576477, + "learning_rate": 4.854547829794582e-05, + "loss": 5.0944, + "step": 18348 + }, + { + "epoch": 0.10912670092301836, + "grad_norm": 1.4076783657073975, + "learning_rate": 4.854532129262848e-05, + "loss": 4.9725, + "step": 18349 + }, + { + "epoch": 0.10913264820629937, + "grad_norm": 1.380814552307129, + "learning_rate": 4.854516427909169e-05, + "loss": 5.0551, + "step": 18350 + }, + { + "epoch": 0.10913859548958035, + "grad_norm": 1.4243587255477905, + "learning_rate": 4.854500725733554e-05, + "loss": 5.103, + "step": 18351 + }, + { + "epoch": 0.10914454277286136, + "grad_norm": 1.438328742980957, + "learning_rate": 4.854485022736006e-05, + "loss": 5.1153, + "step": 18352 + }, + { + "epoch": 0.10915049005614236, + "grad_norm": 1.4602978229522705, + "learning_rate": 4.8544693189165324e-05, + "loss": 4.8916, + "step": 18353 + }, + { + "epoch": 0.10915643733942335, + "grad_norm": 1.548378586769104, + "learning_rate": 4.8544536142751385e-05, + "loss": 5.0205, + "step": 18354 + }, + { + "epoch": 0.10916238462270435, + "grad_norm": 1.33285653591156, + "learning_rate": 4.854437908811828e-05, + "loss": 4.9558, + "step": 18355 + }, + { + "epoch": 0.10916833190598535, + "grad_norm": 1.442918300628662, + "learning_rate": 4.854422202526609e-05, + "loss": 4.9119, + "step": 18356 + }, + { + "epoch": 0.10917427918926634, + "grad_norm": 1.498830795288086, + "learning_rate": 4.8544064954194836e-05, + "loss": 4.9787, + "step": 18357 + }, + { + "epoch": 0.10918022647254734, + "grad_norm": 1.422012209892273, + "learning_rate": 4.85439078749046e-05, + "loss": 5.0013, + "step": 18358 + }, + { + "epoch": 0.10918617375582834, + "grad_norm": 1.4635952711105347, + "learning_rate": 4.854375078739543e-05, + "loss": 4.8389, + "step": 18359 + }, + { + "epoch": 0.10919212103910933, + "grad_norm": 1.3973792791366577, + "learning_rate": 4.854359369166738e-05, + "loss": 4.9503, + "step": 18360 + }, + { + "epoch": 0.10919806832239033, + "grad_norm": 1.4016454219818115, + "learning_rate": 4.8543436587720504e-05, + "loss": 4.8533, + "step": 18361 + }, + { + "epoch": 0.10920401560567133, + "grad_norm": 1.215690016746521, + "learning_rate": 4.854327947555486e-05, + "loss": 5.0961, + "step": 18362 + }, + { + "epoch": 0.10920996288895232, + "grad_norm": 1.1589696407318115, + "learning_rate": 4.85431223551705e-05, + "loss": 4.8991, + "step": 18363 + }, + { + "epoch": 0.10921591017223332, + "grad_norm": 1.2894245386123657, + "learning_rate": 4.854296522656748e-05, + "loss": 5.0622, + "step": 18364 + }, + { + "epoch": 0.10922185745551433, + "grad_norm": 1.3525546789169312, + "learning_rate": 4.854280808974585e-05, + "loss": 5.1679, + "step": 18365 + }, + { + "epoch": 0.10922780473879531, + "grad_norm": 1.2055712938308716, + "learning_rate": 4.854265094470567e-05, + "loss": 5.2706, + "step": 18366 + }, + { + "epoch": 0.10923375202207632, + "grad_norm": 1.3646256923675537, + "learning_rate": 4.8542493791447e-05, + "loss": 5.2381, + "step": 18367 + }, + { + "epoch": 0.10923969930535732, + "grad_norm": 1.535840630531311, + "learning_rate": 4.8542336629969875e-05, + "loss": 5.0133, + "step": 18368 + }, + { + "epoch": 0.1092456465886383, + "grad_norm": 1.3226375579833984, + "learning_rate": 4.854217946027437e-05, + "loss": 4.9518, + "step": 18369 + }, + { + "epoch": 0.10925159387191931, + "grad_norm": 1.4403883218765259, + "learning_rate": 4.854202228236054e-05, + "loss": 5.1958, + "step": 18370 + }, + { + "epoch": 0.10925754115520031, + "grad_norm": 1.3661396503448486, + "learning_rate": 4.8541865096228426e-05, + "loss": 5.297, + "step": 18371 + }, + { + "epoch": 0.1092634884384813, + "grad_norm": 1.1291767358779907, + "learning_rate": 4.8541707901878096e-05, + "loss": 5.0954, + "step": 18372 + }, + { + "epoch": 0.1092694357217623, + "grad_norm": 1.414288878440857, + "learning_rate": 4.854155069930959e-05, + "loss": 5.0499, + "step": 18373 + }, + { + "epoch": 0.1092753830050433, + "grad_norm": 1.405760407447815, + "learning_rate": 4.8541393488522976e-05, + "loss": 5.004, + "step": 18374 + }, + { + "epoch": 0.10928133028832429, + "grad_norm": 1.2152272462844849, + "learning_rate": 4.854123626951831e-05, + "loss": 4.9798, + "step": 18375 + }, + { + "epoch": 0.10928727757160529, + "grad_norm": 1.3401811122894287, + "learning_rate": 4.854107904229564e-05, + "loss": 5.1179, + "step": 18376 + }, + { + "epoch": 0.10929322485488628, + "grad_norm": 1.036811113357544, + "learning_rate": 4.854092180685502e-05, + "loss": 5.129, + "step": 18377 + }, + { + "epoch": 0.10929917213816728, + "grad_norm": 1.380259394645691, + "learning_rate": 4.8540764563196506e-05, + "loss": 5.163, + "step": 18378 + }, + { + "epoch": 0.10930511942144829, + "grad_norm": 1.3078418970108032, + "learning_rate": 4.8540607311320156e-05, + "loss": 4.9882, + "step": 18379 + }, + { + "epoch": 0.10931106670472927, + "grad_norm": 1.2273530960083008, + "learning_rate": 4.854045005122603e-05, + "loss": 5.0736, + "step": 18380 + }, + { + "epoch": 0.10931701398801028, + "grad_norm": 1.1997276544570923, + "learning_rate": 4.8540292782914164e-05, + "loss": 4.9193, + "step": 18381 + }, + { + "epoch": 0.10932296127129128, + "grad_norm": 1.2119728326797485, + "learning_rate": 4.854013550638463e-05, + "loss": 4.9752, + "step": 18382 + }, + { + "epoch": 0.10932890855457227, + "grad_norm": 1.1508461236953735, + "learning_rate": 4.853997822163748e-05, + "loss": 4.8432, + "step": 18383 + }, + { + "epoch": 0.10933485583785327, + "grad_norm": 1.2142893075942993, + "learning_rate": 4.853982092867276e-05, + "loss": 5.0771, + "step": 18384 + }, + { + "epoch": 0.10934080312113427, + "grad_norm": 1.1016231775283813, + "learning_rate": 4.8539663627490536e-05, + "loss": 5.0918, + "step": 18385 + }, + { + "epoch": 0.10934675040441526, + "grad_norm": 1.2202482223510742, + "learning_rate": 4.8539506318090865e-05, + "loss": 5.1181, + "step": 18386 + }, + { + "epoch": 0.10935269768769626, + "grad_norm": 1.3560340404510498, + "learning_rate": 4.853934900047379e-05, + "loss": 5.1007, + "step": 18387 + }, + { + "epoch": 0.10935864497097726, + "grad_norm": 1.350473165512085, + "learning_rate": 4.8539191674639374e-05, + "loss": 5.1084, + "step": 18388 + }, + { + "epoch": 0.10936459225425825, + "grad_norm": 1.5102394819259644, + "learning_rate": 4.853903434058766e-05, + "loss": 5.0825, + "step": 18389 + }, + { + "epoch": 0.10937053953753925, + "grad_norm": 1.3704886436462402, + "learning_rate": 4.853887699831872e-05, + "loss": 5.1083, + "step": 18390 + }, + { + "epoch": 0.10937648682082025, + "grad_norm": 1.315167784690857, + "learning_rate": 4.8538719647832606e-05, + "loss": 4.9786, + "step": 18391 + }, + { + "epoch": 0.10938243410410124, + "grad_norm": 1.5208832025527954, + "learning_rate": 4.8538562289129356e-05, + "loss": 4.9011, + "step": 18392 + }, + { + "epoch": 0.10938838138738224, + "grad_norm": 1.3259782791137695, + "learning_rate": 4.8538404922209046e-05, + "loss": 4.9368, + "step": 18393 + }, + { + "epoch": 0.10939432867066325, + "grad_norm": 1.3342556953430176, + "learning_rate": 4.853824754707172e-05, + "loss": 4.9858, + "step": 18394 + }, + { + "epoch": 0.10940027595394423, + "grad_norm": 1.2291737794876099, + "learning_rate": 4.853809016371743e-05, + "loss": 5.0289, + "step": 18395 + }, + { + "epoch": 0.10940622323722524, + "grad_norm": 1.1539384126663208, + "learning_rate": 4.8537932772146245e-05, + "loss": 4.9444, + "step": 18396 + }, + { + "epoch": 0.10941217052050624, + "grad_norm": 1.2171412706375122, + "learning_rate": 4.8537775372358204e-05, + "loss": 4.9818, + "step": 18397 + }, + { + "epoch": 0.10941811780378723, + "grad_norm": 1.2133311033248901, + "learning_rate": 4.8537617964353374e-05, + "loss": 5.2647, + "step": 18398 + }, + { + "epoch": 0.10942406508706823, + "grad_norm": 1.2499877214431763, + "learning_rate": 4.8537460548131796e-05, + "loss": 5.4893, + "step": 18399 + }, + { + "epoch": 0.10943001237034923, + "grad_norm": 1.2127736806869507, + "learning_rate": 4.8537303123693545e-05, + "loss": 5.3607, + "step": 18400 + }, + { + "epoch": 0.10943595965363022, + "grad_norm": 1.3051133155822754, + "learning_rate": 4.853714569103865e-05, + "loss": 5.4531, + "step": 18401 + }, + { + "epoch": 0.10944190693691122, + "grad_norm": 1.3183389902114868, + "learning_rate": 4.85369882501672e-05, + "loss": 5.1784, + "step": 18402 + }, + { + "epoch": 0.10944785422019222, + "grad_norm": 1.5276503562927246, + "learning_rate": 4.853683080107922e-05, + "loss": 4.9092, + "step": 18403 + }, + { + "epoch": 0.10945380150347321, + "grad_norm": 1.519415259361267, + "learning_rate": 4.853667334377478e-05, + "loss": 4.7973, + "step": 18404 + }, + { + "epoch": 0.10945974878675421, + "grad_norm": 1.4063026905059814, + "learning_rate": 4.853651587825392e-05, + "loss": 4.7771, + "step": 18405 + }, + { + "epoch": 0.1094656960700352, + "grad_norm": 1.2753932476043701, + "learning_rate": 4.8536358404516715e-05, + "loss": 4.7902, + "step": 18406 + }, + { + "epoch": 0.1094716433533162, + "grad_norm": 1.5203404426574707, + "learning_rate": 4.8536200922563205e-05, + "loss": 4.961, + "step": 18407 + }, + { + "epoch": 0.1094775906365972, + "grad_norm": 1.4700336456298828, + "learning_rate": 4.8536043432393455e-05, + "loss": 5.0276, + "step": 18408 + }, + { + "epoch": 0.1094835379198782, + "grad_norm": 1.3945552110671997, + "learning_rate": 4.8535885934007506e-05, + "loss": 4.9641, + "step": 18409 + }, + { + "epoch": 0.1094894852031592, + "grad_norm": 1.1885923147201538, + "learning_rate": 4.853572842740544e-05, + "loss": 4.9162, + "step": 18410 + }, + { + "epoch": 0.1094954324864402, + "grad_norm": 1.414090871810913, + "learning_rate": 4.853557091258728e-05, + "loss": 4.9317, + "step": 18411 + }, + { + "epoch": 0.10950137976972119, + "grad_norm": 1.4395371675491333, + "learning_rate": 4.85354133895531e-05, + "loss": 4.7658, + "step": 18412 + }, + { + "epoch": 0.10950732705300219, + "grad_norm": 1.351665735244751, + "learning_rate": 4.8535255858302944e-05, + "loss": 4.9385, + "step": 18413 + }, + { + "epoch": 0.10951327433628319, + "grad_norm": 1.5085922479629517, + "learning_rate": 4.853509831883688e-05, + "loss": 5.0192, + "step": 18414 + }, + { + "epoch": 0.10951922161956418, + "grad_norm": 1.3413939476013184, + "learning_rate": 4.8534940771154954e-05, + "loss": 4.9193, + "step": 18415 + }, + { + "epoch": 0.10952516890284518, + "grad_norm": 1.532934546470642, + "learning_rate": 4.853478321525723e-05, + "loss": 4.9137, + "step": 18416 + }, + { + "epoch": 0.10953111618612618, + "grad_norm": 1.388016700744629, + "learning_rate": 4.8534625651143754e-05, + "loss": 4.9381, + "step": 18417 + }, + { + "epoch": 0.10953706346940717, + "grad_norm": 1.551255702972412, + "learning_rate": 4.853446807881458e-05, + "loss": 5.0973, + "step": 18418 + }, + { + "epoch": 0.10954301075268817, + "grad_norm": 1.4487138986587524, + "learning_rate": 4.853431049826976e-05, + "loss": 5.1313, + "step": 18419 + }, + { + "epoch": 0.10954895803596917, + "grad_norm": 1.467703104019165, + "learning_rate": 4.853415290950936e-05, + "loss": 5.0381, + "step": 18420 + }, + { + "epoch": 0.10955490531925016, + "grad_norm": 1.4529845714569092, + "learning_rate": 4.853399531253343e-05, + "loss": 4.9945, + "step": 18421 + }, + { + "epoch": 0.10956085260253116, + "grad_norm": 1.230872631072998, + "learning_rate": 4.8533837707342036e-05, + "loss": 5.0579, + "step": 18422 + }, + { + "epoch": 0.10956679988581217, + "grad_norm": 1.3668066263198853, + "learning_rate": 4.8533680093935206e-05, + "loss": 5.2567, + "step": 18423 + }, + { + "epoch": 0.10957274716909315, + "grad_norm": 1.3560447692871094, + "learning_rate": 4.853352247231302e-05, + "loss": 5.0152, + "step": 18424 + }, + { + "epoch": 0.10957869445237416, + "grad_norm": 1.4296886920928955, + "learning_rate": 4.8533364842475524e-05, + "loss": 5.1132, + "step": 18425 + }, + { + "epoch": 0.10958464173565516, + "grad_norm": 1.4232845306396484, + "learning_rate": 4.853320720442277e-05, + "loss": 5.0427, + "step": 18426 + }, + { + "epoch": 0.10959058901893615, + "grad_norm": 1.4019423723220825, + "learning_rate": 4.8533049558154826e-05, + "loss": 5.2369, + "step": 18427 + }, + { + "epoch": 0.10959653630221715, + "grad_norm": 1.5423427820205688, + "learning_rate": 4.853289190367173e-05, + "loss": 5.1053, + "step": 18428 + }, + { + "epoch": 0.10960248358549815, + "grad_norm": 1.5049951076507568, + "learning_rate": 4.8532734240973545e-05, + "loss": 5.3784, + "step": 18429 + }, + { + "epoch": 0.10960843086877914, + "grad_norm": 1.678328037261963, + "learning_rate": 4.853257657006033e-05, + "loss": 5.3021, + "step": 18430 + }, + { + "epoch": 0.10961437815206014, + "grad_norm": 1.5986173152923584, + "learning_rate": 4.853241889093213e-05, + "loss": 5.1686, + "step": 18431 + }, + { + "epoch": 0.10962032543534114, + "grad_norm": 1.5304551124572754, + "learning_rate": 4.853226120358901e-05, + "loss": 5.2319, + "step": 18432 + }, + { + "epoch": 0.10962627271862213, + "grad_norm": 1.609595775604248, + "learning_rate": 4.853210350803102e-05, + "loss": 5.0256, + "step": 18433 + }, + { + "epoch": 0.10963222000190313, + "grad_norm": 1.3506170511245728, + "learning_rate": 4.853194580425821e-05, + "loss": 5.0792, + "step": 18434 + }, + { + "epoch": 0.10963816728518412, + "grad_norm": 1.2946768999099731, + "learning_rate": 4.853178809227065e-05, + "loss": 5.0155, + "step": 18435 + }, + { + "epoch": 0.10964411456846512, + "grad_norm": 1.5691487789154053, + "learning_rate": 4.853163037206838e-05, + "loss": 5.1302, + "step": 18436 + }, + { + "epoch": 0.10965006185174613, + "grad_norm": 1.6740599870681763, + "learning_rate": 4.853147264365146e-05, + "loss": 5.2371, + "step": 18437 + }, + { + "epoch": 0.10965600913502711, + "grad_norm": 1.4822674989700317, + "learning_rate": 4.853131490701995e-05, + "loss": 5.0194, + "step": 18438 + }, + { + "epoch": 0.10966195641830812, + "grad_norm": 1.385177493095398, + "learning_rate": 4.853115716217389e-05, + "loss": 4.9444, + "step": 18439 + }, + { + "epoch": 0.10966790370158912, + "grad_norm": 1.3696002960205078, + "learning_rate": 4.853099940911337e-05, + "loss": 5.0557, + "step": 18440 + }, + { + "epoch": 0.1096738509848701, + "grad_norm": 1.6609543561935425, + "learning_rate": 4.8530841647838396e-05, + "loss": 4.9032, + "step": 18441 + }, + { + "epoch": 0.10967979826815111, + "grad_norm": 1.5938438177108765, + "learning_rate": 4.8530683878349056e-05, + "loss": 4.8639, + "step": 18442 + }, + { + "epoch": 0.10968574555143211, + "grad_norm": 1.4565002918243408, + "learning_rate": 4.85305261006454e-05, + "loss": 5.0483, + "step": 18443 + }, + { + "epoch": 0.1096916928347131, + "grad_norm": 1.5930250883102417, + "learning_rate": 4.853036831472749e-05, + "loss": 5.0751, + "step": 18444 + }, + { + "epoch": 0.1096976401179941, + "grad_norm": 1.5648735761642456, + "learning_rate": 4.853021052059536e-05, + "loss": 5.0991, + "step": 18445 + }, + { + "epoch": 0.1097035874012751, + "grad_norm": 1.4230155944824219, + "learning_rate": 4.8530052718249076e-05, + "loss": 5.098, + "step": 18446 + }, + { + "epoch": 0.10970953468455609, + "grad_norm": 1.4366841316223145, + "learning_rate": 4.85298949076887e-05, + "loss": 5.0975, + "step": 18447 + }, + { + "epoch": 0.10971548196783709, + "grad_norm": 1.437514066696167, + "learning_rate": 4.852973708891427e-05, + "loss": 5.0325, + "step": 18448 + }, + { + "epoch": 0.1097214292511181, + "grad_norm": 2.0367636680603027, + "learning_rate": 4.852957926192586e-05, + "loss": 5.2064, + "step": 18449 + }, + { + "epoch": 0.10972737653439908, + "grad_norm": 2.16357684135437, + "learning_rate": 4.852942142672352e-05, + "loss": 5.1532, + "step": 18450 + }, + { + "epoch": 0.10973332381768008, + "grad_norm": 1.6931402683258057, + "learning_rate": 4.8529263583307296e-05, + "loss": 5.2128, + "step": 18451 + }, + { + "epoch": 0.10973927110096109, + "grad_norm": 2.4651196002960205, + "learning_rate": 4.852910573167725e-05, + "loss": 4.798, + "step": 18452 + }, + { + "epoch": 0.10974521838424207, + "grad_norm": 1.7160784006118774, + "learning_rate": 4.852894787183344e-05, + "loss": 5.5087, + "step": 18453 + }, + { + "epoch": 0.10975116566752308, + "grad_norm": 1.478097915649414, + "learning_rate": 4.852879000377591e-05, + "loss": 5.6876, + "step": 18454 + }, + { + "epoch": 0.10975711295080408, + "grad_norm": 1.8612531423568726, + "learning_rate": 4.852863212750474e-05, + "loss": 5.2259, + "step": 18455 + }, + { + "epoch": 0.10976306023408507, + "grad_norm": 1.6869621276855469, + "learning_rate": 4.852847424301995e-05, + "loss": 5.5294, + "step": 18456 + }, + { + "epoch": 0.10976900751736607, + "grad_norm": 1.7378077507019043, + "learning_rate": 4.852831635032161e-05, + "loss": 5.4568, + "step": 18457 + }, + { + "epoch": 0.10977495480064707, + "grad_norm": 1.7788033485412598, + "learning_rate": 4.852815844940979e-05, + "loss": 5.2331, + "step": 18458 + }, + { + "epoch": 0.10978090208392806, + "grad_norm": 1.8730370998382568, + "learning_rate": 4.852800054028453e-05, + "loss": 4.9792, + "step": 18459 + }, + { + "epoch": 0.10978684936720906, + "grad_norm": 1.5126397609710693, + "learning_rate": 4.852784262294588e-05, + "loss": 5.3134, + "step": 18460 + }, + { + "epoch": 0.10979279665049006, + "grad_norm": 1.6687992811203003, + "learning_rate": 4.8527684697393914e-05, + "loss": 5.3296, + "step": 18461 + }, + { + "epoch": 0.10979874393377105, + "grad_norm": 1.6268471479415894, + "learning_rate": 4.852752676362867e-05, + "loss": 4.9804, + "step": 18462 + }, + { + "epoch": 0.10980469121705205, + "grad_norm": 1.7055017948150635, + "learning_rate": 4.8527368821650214e-05, + "loss": 5.0289, + "step": 18463 + }, + { + "epoch": 0.10981063850033304, + "grad_norm": 1.489247441291809, + "learning_rate": 4.852721087145859e-05, + "loss": 5.0428, + "step": 18464 + }, + { + "epoch": 0.10981658578361404, + "grad_norm": 1.7411161661148071, + "learning_rate": 4.8527052913053874e-05, + "loss": 5.1142, + "step": 18465 + }, + { + "epoch": 0.10982253306689505, + "grad_norm": 1.5776443481445312, + "learning_rate": 4.8526894946436094e-05, + "loss": 5.2881, + "step": 18466 + }, + { + "epoch": 0.10982848035017603, + "grad_norm": 1.342997431755066, + "learning_rate": 4.852673697160532e-05, + "loss": 5.0295, + "step": 18467 + }, + { + "epoch": 0.10983442763345704, + "grad_norm": 1.1686962842941284, + "learning_rate": 4.8526578988561606e-05, + "loss": 5.0607, + "step": 18468 + }, + { + "epoch": 0.10984037491673804, + "grad_norm": 1.578697681427002, + "learning_rate": 4.8526420997305006e-05, + "loss": 5.3291, + "step": 18469 + }, + { + "epoch": 0.10984632220001903, + "grad_norm": 1.5248758792877197, + "learning_rate": 4.8526262997835575e-05, + "loss": 5.1206, + "step": 18470 + }, + { + "epoch": 0.10985226948330003, + "grad_norm": 1.1425076723098755, + "learning_rate": 4.852610499015337e-05, + "loss": 5.1892, + "step": 18471 + }, + { + "epoch": 0.10985821676658103, + "grad_norm": 1.356423020362854, + "learning_rate": 4.852594697425844e-05, + "loss": 4.9477, + "step": 18472 + }, + { + "epoch": 0.10986416404986202, + "grad_norm": 1.3905398845672607, + "learning_rate": 4.852578895015085e-05, + "loss": 4.9084, + "step": 18473 + }, + { + "epoch": 0.10987011133314302, + "grad_norm": 1.3447619676589966, + "learning_rate": 4.8525630917830655e-05, + "loss": 4.9042, + "step": 18474 + }, + { + "epoch": 0.10987605861642402, + "grad_norm": 1.2110105752944946, + "learning_rate": 4.8525472877297893e-05, + "loss": 4.9669, + "step": 18475 + }, + { + "epoch": 0.10988200589970501, + "grad_norm": 1.480750560760498, + "learning_rate": 4.8525314828552646e-05, + "loss": 5.1071, + "step": 18476 + }, + { + "epoch": 0.10988795318298601, + "grad_norm": 1.2497118711471558, + "learning_rate": 4.852515677159495e-05, + "loss": 4.8868, + "step": 18477 + }, + { + "epoch": 0.10989390046626701, + "grad_norm": 1.4057846069335938, + "learning_rate": 4.8524998706424856e-05, + "loss": 5.1173, + "step": 18478 + }, + { + "epoch": 0.109899847749548, + "grad_norm": 1.3325163125991821, + "learning_rate": 4.8524840633042436e-05, + "loss": 5.1066, + "step": 18479 + }, + { + "epoch": 0.109905795032829, + "grad_norm": 1.333720326423645, + "learning_rate": 4.852468255144773e-05, + "loss": 5.1404, + "step": 18480 + }, + { + "epoch": 0.10991174231611, + "grad_norm": 1.3484537601470947, + "learning_rate": 4.852452446164081e-05, + "loss": 5.1284, + "step": 18481 + }, + { + "epoch": 0.109917689599391, + "grad_norm": 1.3348337411880493, + "learning_rate": 4.8524366363621716e-05, + "loss": 5.2056, + "step": 18482 + }, + { + "epoch": 0.109923636882672, + "grad_norm": 1.1838293075561523, + "learning_rate": 4.8524208257390504e-05, + "loss": 5.0488, + "step": 18483 + }, + { + "epoch": 0.109929584165953, + "grad_norm": 1.2820385694503784, + "learning_rate": 4.852405014294724e-05, + "loss": 5.1329, + "step": 18484 + }, + { + "epoch": 0.10993553144923399, + "grad_norm": 1.3892844915390015, + "learning_rate": 4.852389202029198e-05, + "loss": 5.0263, + "step": 18485 + }, + { + "epoch": 0.10994147873251499, + "grad_norm": 1.4780217409133911, + "learning_rate": 4.852373388942476e-05, + "loss": 5.0866, + "step": 18486 + }, + { + "epoch": 0.10994742601579599, + "grad_norm": 1.4181870222091675, + "learning_rate": 4.852357575034565e-05, + "loss": 5.1436, + "step": 18487 + }, + { + "epoch": 0.10995337329907698, + "grad_norm": 1.4174554347991943, + "learning_rate": 4.852341760305471e-05, + "loss": 5.132, + "step": 18488 + }, + { + "epoch": 0.10995932058235798, + "grad_norm": 1.2727283239364624, + "learning_rate": 4.852325944755198e-05, + "loss": 5.0171, + "step": 18489 + }, + { + "epoch": 0.10996526786563898, + "grad_norm": 1.2102142572402954, + "learning_rate": 4.852310128383753e-05, + "loss": 5.0183, + "step": 18490 + }, + { + "epoch": 0.10997121514891997, + "grad_norm": 1.254946231842041, + "learning_rate": 4.85229431119114e-05, + "loss": 5.105, + "step": 18491 + }, + { + "epoch": 0.10997716243220097, + "grad_norm": 1.4097338914871216, + "learning_rate": 4.8522784931773666e-05, + "loss": 4.953, + "step": 18492 + }, + { + "epoch": 0.10998310971548196, + "grad_norm": 1.368314504623413, + "learning_rate": 4.852262674342436e-05, + "loss": 4.9527, + "step": 18493 + }, + { + "epoch": 0.10998905699876296, + "grad_norm": 1.3907700777053833, + "learning_rate": 4.8522468546863554e-05, + "loss": 4.9416, + "step": 18494 + }, + { + "epoch": 0.10999500428204396, + "grad_norm": 1.2113755941390991, + "learning_rate": 4.852231034209129e-05, + "loss": 4.8552, + "step": 18495 + }, + { + "epoch": 0.11000095156532495, + "grad_norm": 1.3752022981643677, + "learning_rate": 4.852215212910763e-05, + "loss": 4.9314, + "step": 18496 + }, + { + "epoch": 0.11000689884860596, + "grad_norm": 1.243531584739685, + "learning_rate": 4.852199390791264e-05, + "loss": 4.925, + "step": 18497 + }, + { + "epoch": 0.11001284613188696, + "grad_norm": 1.3528475761413574, + "learning_rate": 4.852183567850636e-05, + "loss": 4.8643, + "step": 18498 + }, + { + "epoch": 0.11001879341516795, + "grad_norm": 1.4653394222259521, + "learning_rate": 4.8521677440888845e-05, + "loss": 4.8894, + "step": 18499 + }, + { + "epoch": 0.11002474069844895, + "grad_norm": 1.3524682521820068, + "learning_rate": 4.852151919506016e-05, + "loss": 4.7458, + "step": 18500 + }, + { + "epoch": 0.11003068798172995, + "grad_norm": 1.3654247522354126, + "learning_rate": 4.852136094102036e-05, + "loss": 4.7971, + "step": 18501 + }, + { + "epoch": 0.11003663526501094, + "grad_norm": 1.395735740661621, + "learning_rate": 4.85212026787695e-05, + "loss": 4.7677, + "step": 18502 + }, + { + "epoch": 0.11004258254829194, + "grad_norm": 1.4467344284057617, + "learning_rate": 4.8521044408307616e-05, + "loss": 4.726, + "step": 18503 + }, + { + "epoch": 0.11004852983157294, + "grad_norm": 1.276580572128296, + "learning_rate": 4.852088612963478e-05, + "loss": 4.8145, + "step": 18504 + }, + { + "epoch": 0.11005447711485393, + "grad_norm": 1.4406812191009521, + "learning_rate": 4.852072784275106e-05, + "loss": 4.7942, + "step": 18505 + }, + { + "epoch": 0.11006042439813493, + "grad_norm": 1.4281691312789917, + "learning_rate": 4.8520569547656483e-05, + "loss": 4.9745, + "step": 18506 + }, + { + "epoch": 0.11006637168141593, + "grad_norm": 1.3521541357040405, + "learning_rate": 4.852041124435112e-05, + "loss": 4.8335, + "step": 18507 + }, + { + "epoch": 0.11007231896469692, + "grad_norm": 1.2510555982589722, + "learning_rate": 4.852025293283503e-05, + "loss": 4.8868, + "step": 18508 + }, + { + "epoch": 0.11007826624797792, + "grad_norm": 1.3792724609375, + "learning_rate": 4.852009461310826e-05, + "loss": 4.9388, + "step": 18509 + }, + { + "epoch": 0.11008421353125893, + "grad_norm": 1.3494830131530762, + "learning_rate": 4.851993628517086e-05, + "loss": 4.8536, + "step": 18510 + }, + { + "epoch": 0.11009016081453991, + "grad_norm": 1.2981318235397339, + "learning_rate": 4.851977794902291e-05, + "loss": 4.8479, + "step": 18511 + }, + { + "epoch": 0.11009610809782092, + "grad_norm": 1.3305935859680176, + "learning_rate": 4.851961960466444e-05, + "loss": 4.9893, + "step": 18512 + }, + { + "epoch": 0.11010205538110192, + "grad_norm": 1.3141270875930786, + "learning_rate": 4.851946125209551e-05, + "loss": 4.8349, + "step": 18513 + }, + { + "epoch": 0.1101080026643829, + "grad_norm": 1.2411303520202637, + "learning_rate": 4.851930289131619e-05, + "loss": 4.8698, + "step": 18514 + }, + { + "epoch": 0.11011394994766391, + "grad_norm": 1.520176887512207, + "learning_rate": 4.851914452232651e-05, + "loss": 4.7576, + "step": 18515 + }, + { + "epoch": 0.11011989723094491, + "grad_norm": 1.3073054552078247, + "learning_rate": 4.851898614512655e-05, + "loss": 4.8974, + "step": 18516 + }, + { + "epoch": 0.1101258445142259, + "grad_norm": 1.4703196287155151, + "learning_rate": 4.8518827759716354e-05, + "loss": 5.0947, + "step": 18517 + }, + { + "epoch": 0.1101317917975069, + "grad_norm": 1.3140865564346313, + "learning_rate": 4.851866936609597e-05, + "loss": 5.4125, + "step": 18518 + }, + { + "epoch": 0.1101377390807879, + "grad_norm": 1.2075819969177246, + "learning_rate": 4.8518510964265465e-05, + "loss": 5.2993, + "step": 18519 + }, + { + "epoch": 0.11014368636406889, + "grad_norm": 1.6519954204559326, + "learning_rate": 4.85183525542249e-05, + "loss": 5.6638, + "step": 18520 + }, + { + "epoch": 0.11014963364734989, + "grad_norm": 2.118663787841797, + "learning_rate": 4.851819413597432e-05, + "loss": 5.5422, + "step": 18521 + }, + { + "epoch": 0.1101555809306309, + "grad_norm": 1.902429461479187, + "learning_rate": 4.851803570951377e-05, + "loss": 5.3244, + "step": 18522 + }, + { + "epoch": 0.11016152821391188, + "grad_norm": 2.593628406524658, + "learning_rate": 4.8517877274843315e-05, + "loss": 5.0554, + "step": 18523 + }, + { + "epoch": 0.11016747549719288, + "grad_norm": 2.6404380798339844, + "learning_rate": 4.851771883196302e-05, + "loss": 4.9789, + "step": 18524 + }, + { + "epoch": 0.11017342278047387, + "grad_norm": 2.08564829826355, + "learning_rate": 4.8517560380872934e-05, + "loss": 4.9616, + "step": 18525 + }, + { + "epoch": 0.11017937006375488, + "grad_norm": 2.306739091873169, + "learning_rate": 4.8517401921573114e-05, + "loss": 4.9368, + "step": 18526 + }, + { + "epoch": 0.11018531734703588, + "grad_norm": 3.0212862491607666, + "learning_rate": 4.85172434540636e-05, + "loss": 4.6379, + "step": 18527 + }, + { + "epoch": 0.11019126463031687, + "grad_norm": 2.554163694381714, + "learning_rate": 4.851708497834446e-05, + "loss": 4.6958, + "step": 18528 + }, + { + "epoch": 0.11019721191359787, + "grad_norm": 2.354631185531616, + "learning_rate": 4.851692649441576e-05, + "loss": 4.7904, + "step": 18529 + }, + { + "epoch": 0.11020315919687887, + "grad_norm": 1.5072609186172485, + "learning_rate": 4.851676800227754e-05, + "loss": 5.5862, + "step": 18530 + }, + { + "epoch": 0.11020910648015986, + "grad_norm": 1.5677906274795532, + "learning_rate": 4.851660950192986e-05, + "loss": 5.8712, + "step": 18531 + }, + { + "epoch": 0.11021505376344086, + "grad_norm": 1.7329411506652832, + "learning_rate": 4.851645099337276e-05, + "loss": 5.4559, + "step": 18532 + }, + { + "epoch": 0.11022100104672186, + "grad_norm": 2.187192916870117, + "learning_rate": 4.851629247660633e-05, + "loss": 5.2172, + "step": 18533 + }, + { + "epoch": 0.11022694833000285, + "grad_norm": 2.5248184204101562, + "learning_rate": 4.851613395163059e-05, + "loss": 4.7283, + "step": 18534 + }, + { + "epoch": 0.11023289561328385, + "grad_norm": 1.897926926612854, + "learning_rate": 4.8515975418445625e-05, + "loss": 5.0609, + "step": 18535 + }, + { + "epoch": 0.11023884289656485, + "grad_norm": 1.6827658414840698, + "learning_rate": 4.851581687705147e-05, + "loss": 5.2637, + "step": 18536 + }, + { + "epoch": 0.11024479017984584, + "grad_norm": 1.6638895273208618, + "learning_rate": 4.8515658327448184e-05, + "loss": 5.3758, + "step": 18537 + }, + { + "epoch": 0.11025073746312684, + "grad_norm": 1.3794528245925903, + "learning_rate": 4.8515499769635824e-05, + "loss": 5.1398, + "step": 18538 + }, + { + "epoch": 0.11025668474640785, + "grad_norm": 1.7829253673553467, + "learning_rate": 4.8515341203614454e-05, + "loss": 5.8449, + "step": 18539 + }, + { + "epoch": 0.11026263202968883, + "grad_norm": 1.9193391799926758, + "learning_rate": 4.85151826293841e-05, + "loss": 5.6113, + "step": 18540 + }, + { + "epoch": 0.11026857931296984, + "grad_norm": 1.9315286874771118, + "learning_rate": 4.851502404694486e-05, + "loss": 5.4341, + "step": 18541 + }, + { + "epoch": 0.11027452659625084, + "grad_norm": 1.8884371519088745, + "learning_rate": 4.851486545629677e-05, + "loss": 5.0711, + "step": 18542 + }, + { + "epoch": 0.11028047387953183, + "grad_norm": 2.104315996170044, + "learning_rate": 4.8514706857439866e-05, + "loss": 4.7431, + "step": 18543 + }, + { + "epoch": 0.11028642116281283, + "grad_norm": 1.9781455993652344, + "learning_rate": 4.8514548250374234e-05, + "loss": 4.9088, + "step": 18544 + }, + { + "epoch": 0.11029236844609383, + "grad_norm": 2.0802392959594727, + "learning_rate": 4.851438963509991e-05, + "loss": 4.8418, + "step": 18545 + }, + { + "epoch": 0.11029831572937482, + "grad_norm": 2.1856627464294434, + "learning_rate": 4.851423101161696e-05, + "loss": 5.5758, + "step": 18546 + }, + { + "epoch": 0.11030426301265582, + "grad_norm": 1.578050971031189, + "learning_rate": 4.851407237992543e-05, + "loss": 5.2795, + "step": 18547 + }, + { + "epoch": 0.11031021029593682, + "grad_norm": 2.241647720336914, + "learning_rate": 4.8513913740025376e-05, + "loss": 4.7807, + "step": 18548 + }, + { + "epoch": 0.11031615757921781, + "grad_norm": 2.102911949157715, + "learning_rate": 4.851375509191687e-05, + "loss": 5.1933, + "step": 18549 + }, + { + "epoch": 0.11032210486249881, + "grad_norm": 1.7198251485824585, + "learning_rate": 4.851359643559995e-05, + "loss": 5.273, + "step": 18550 + }, + { + "epoch": 0.11032805214577981, + "grad_norm": 1.6389858722686768, + "learning_rate": 4.8513437771074675e-05, + "loss": 5.7741, + "step": 18551 + }, + { + "epoch": 0.1103339994290608, + "grad_norm": 1.3120185136795044, + "learning_rate": 4.8513279098341106e-05, + "loss": 5.6433, + "step": 18552 + }, + { + "epoch": 0.1103399467123418, + "grad_norm": 2.6182525157928467, + "learning_rate": 4.8513120417399286e-05, + "loss": 5.2905, + "step": 18553 + }, + { + "epoch": 0.11034589399562279, + "grad_norm": 2.8740553855895996, + "learning_rate": 4.851296172824928e-05, + "loss": 5.0364, + "step": 18554 + }, + { + "epoch": 0.1103518412789038, + "grad_norm": 2.126779794692993, + "learning_rate": 4.851280303089115e-05, + "loss": 4.8801, + "step": 18555 + }, + { + "epoch": 0.1103577885621848, + "grad_norm": 2.2658486366271973, + "learning_rate": 4.851264432532493e-05, + "loss": 5.0411, + "step": 18556 + }, + { + "epoch": 0.11036373584546579, + "grad_norm": 2.2387850284576416, + "learning_rate": 4.8512485611550706e-05, + "loss": 5.048, + "step": 18557 + }, + { + "epoch": 0.11036968312874679, + "grad_norm": 2.5402557849884033, + "learning_rate": 4.851232688956851e-05, + "loss": 5.2581, + "step": 18558 + }, + { + "epoch": 0.11037563041202779, + "grad_norm": 1.9275699853897095, + "learning_rate": 4.8512168159378396e-05, + "loss": 5.765, + "step": 18559 + }, + { + "epoch": 0.11038157769530878, + "grad_norm": 1.6632050275802612, + "learning_rate": 4.8512009420980434e-05, + "loss": 5.9928, + "step": 18560 + }, + { + "epoch": 0.11038752497858978, + "grad_norm": 1.9383779764175415, + "learning_rate": 4.851185067437467e-05, + "loss": 5.306, + "step": 18561 + }, + { + "epoch": 0.11039347226187078, + "grad_norm": 1.6358258724212646, + "learning_rate": 4.851169191956117e-05, + "loss": 5.4039, + "step": 18562 + }, + { + "epoch": 0.11039941954515177, + "grad_norm": 1.625636339187622, + "learning_rate": 4.851153315653997e-05, + "loss": 5.5028, + "step": 18563 + }, + { + "epoch": 0.11040536682843277, + "grad_norm": 1.8142133951187134, + "learning_rate": 4.8511374385311134e-05, + "loss": 5.3636, + "step": 18564 + }, + { + "epoch": 0.11041131411171377, + "grad_norm": 1.778742790222168, + "learning_rate": 4.8511215605874724e-05, + "loss": 5.9869, + "step": 18565 + }, + { + "epoch": 0.11041726139499476, + "grad_norm": 1.7027266025543213, + "learning_rate": 4.8511056818230795e-05, + "loss": 5.9855, + "step": 18566 + }, + { + "epoch": 0.11042320867827576, + "grad_norm": 1.8098080158233643, + "learning_rate": 4.85108980223794e-05, + "loss": 5.3241, + "step": 18567 + }, + { + "epoch": 0.11042915596155677, + "grad_norm": 2.058525562286377, + "learning_rate": 4.851073921832059e-05, + "loss": 5.3369, + "step": 18568 + }, + { + "epoch": 0.11043510324483775, + "grad_norm": 1.6393969058990479, + "learning_rate": 4.851058040605443e-05, + "loss": 5.234, + "step": 18569 + }, + { + "epoch": 0.11044105052811876, + "grad_norm": 1.7245092391967773, + "learning_rate": 4.8510421585580954e-05, + "loss": 5.3252, + "step": 18570 + }, + { + "epoch": 0.11044699781139976, + "grad_norm": 1.7108781337738037, + "learning_rate": 4.851026275690025e-05, + "loss": 5.342, + "step": 18571 + }, + { + "epoch": 0.11045294509468075, + "grad_norm": 1.6860250234603882, + "learning_rate": 4.8510103920012354e-05, + "loss": 5.1265, + "step": 18572 + }, + { + "epoch": 0.11045889237796175, + "grad_norm": 1.4939595460891724, + "learning_rate": 4.850994507491731e-05, + "loss": 4.995, + "step": 18573 + }, + { + "epoch": 0.11046483966124275, + "grad_norm": 1.6137492656707764, + "learning_rate": 4.85097862216152e-05, + "loss": 5.0099, + "step": 18574 + }, + { + "epoch": 0.11047078694452374, + "grad_norm": 1.8155491352081299, + "learning_rate": 4.850962736010606e-05, + "loss": 4.965, + "step": 18575 + }, + { + "epoch": 0.11047673422780474, + "grad_norm": 1.6313834190368652, + "learning_rate": 4.8509468490389955e-05, + "loss": 5.1881, + "step": 18576 + }, + { + "epoch": 0.11048268151108574, + "grad_norm": 1.9885855913162231, + "learning_rate": 4.850930961246694e-05, + "loss": 4.9172, + "step": 18577 + }, + { + "epoch": 0.11048862879436673, + "grad_norm": 1.7815529108047485, + "learning_rate": 4.850915072633706e-05, + "loss": 5.2431, + "step": 18578 + }, + { + "epoch": 0.11049457607764773, + "grad_norm": 1.496060848236084, + "learning_rate": 4.8508991832000384e-05, + "loss": 5.0222, + "step": 18579 + }, + { + "epoch": 0.11050052336092873, + "grad_norm": 1.76019287109375, + "learning_rate": 4.850883292945696e-05, + "loss": 5.1522, + "step": 18580 + }, + { + "epoch": 0.11050647064420972, + "grad_norm": 1.6975457668304443, + "learning_rate": 4.8508674018706845e-05, + "loss": 5.0687, + "step": 18581 + }, + { + "epoch": 0.11051241792749072, + "grad_norm": 2.056002378463745, + "learning_rate": 4.85085150997501e-05, + "loss": 5.0267, + "step": 18582 + }, + { + "epoch": 0.11051836521077171, + "grad_norm": 1.8109005689620972, + "learning_rate": 4.850835617258677e-05, + "loss": 5.7661, + "step": 18583 + }, + { + "epoch": 0.11052431249405271, + "grad_norm": 1.762326717376709, + "learning_rate": 4.850819723721692e-05, + "loss": 5.8038, + "step": 18584 + }, + { + "epoch": 0.11053025977733372, + "grad_norm": 1.5169013738632202, + "learning_rate": 4.85080382936406e-05, + "loss": 5.7988, + "step": 18585 + }, + { + "epoch": 0.1105362070606147, + "grad_norm": 1.7740446329116821, + "learning_rate": 4.850787934185786e-05, + "loss": 5.5388, + "step": 18586 + }, + { + "epoch": 0.11054215434389571, + "grad_norm": 1.560950756072998, + "learning_rate": 4.850772038186877e-05, + "loss": 5.406, + "step": 18587 + }, + { + "epoch": 0.11054810162717671, + "grad_norm": 1.6391148567199707, + "learning_rate": 4.850756141367338e-05, + "loss": 5.4669, + "step": 18588 + }, + { + "epoch": 0.1105540489104577, + "grad_norm": 1.5571023225784302, + "learning_rate": 4.8507402437271734e-05, + "loss": 5.6556, + "step": 18589 + }, + { + "epoch": 0.1105599961937387, + "grad_norm": 1.5374432802200317, + "learning_rate": 4.85072434526639e-05, + "loss": 5.7617, + "step": 18590 + }, + { + "epoch": 0.1105659434770197, + "grad_norm": 1.4683212041854858, + "learning_rate": 4.850708445984993e-05, + "loss": 5.5074, + "step": 18591 + }, + { + "epoch": 0.11057189076030069, + "grad_norm": 1.6689101457595825, + "learning_rate": 4.850692545882988e-05, + "loss": 5.3259, + "step": 18592 + }, + { + "epoch": 0.11057783804358169, + "grad_norm": 1.394108533859253, + "learning_rate": 4.85067664496038e-05, + "loss": 5.1686, + "step": 18593 + }, + { + "epoch": 0.1105837853268627, + "grad_norm": 1.7093585729599, + "learning_rate": 4.850660743217176e-05, + "loss": 5.6622, + "step": 18594 + }, + { + "epoch": 0.11058973261014368, + "grad_norm": 1.6189805269241333, + "learning_rate": 4.85064484065338e-05, + "loss": 5.6855, + "step": 18595 + }, + { + "epoch": 0.11059567989342468, + "grad_norm": 1.5303481817245483, + "learning_rate": 4.850628937268999e-05, + "loss": 5.8242, + "step": 18596 + }, + { + "epoch": 0.11060162717670569, + "grad_norm": 1.6557955741882324, + "learning_rate": 4.850613033064037e-05, + "loss": 5.4924, + "step": 18597 + }, + { + "epoch": 0.11060757445998667, + "grad_norm": 1.5280576944351196, + "learning_rate": 4.8505971280385e-05, + "loss": 5.6122, + "step": 18598 + }, + { + "epoch": 0.11061352174326768, + "grad_norm": 1.3656830787658691, + "learning_rate": 4.8505812221923945e-05, + "loss": 5.5282, + "step": 18599 + }, + { + "epoch": 0.11061946902654868, + "grad_norm": 1.3605096340179443, + "learning_rate": 4.850565315525725e-05, + "loss": 5.0747, + "step": 18600 + }, + { + "epoch": 0.11062541630982967, + "grad_norm": 2.120056390762329, + "learning_rate": 4.850549408038498e-05, + "loss": 5.1559, + "step": 18601 + }, + { + "epoch": 0.11063136359311067, + "grad_norm": 2.14626145362854, + "learning_rate": 4.850533499730718e-05, + "loss": 4.9778, + "step": 18602 + }, + { + "epoch": 0.11063731087639167, + "grad_norm": 2.1857240200042725, + "learning_rate": 4.8505175906023916e-05, + "loss": 4.8555, + "step": 18603 + }, + { + "epoch": 0.11064325815967266, + "grad_norm": 1.6636399030685425, + "learning_rate": 4.850501680653523e-05, + "loss": 5.3488, + "step": 18604 + }, + { + "epoch": 0.11064920544295366, + "grad_norm": 1.669511079788208, + "learning_rate": 4.8504857698841185e-05, + "loss": 5.2697, + "step": 18605 + }, + { + "epoch": 0.11065515272623466, + "grad_norm": 2.1935081481933594, + "learning_rate": 4.850469858294184e-05, + "loss": 4.4319, + "step": 18606 + }, + { + "epoch": 0.11066110000951565, + "grad_norm": 2.2359724044799805, + "learning_rate": 4.850453945883725e-05, + "loss": 4.2343, + "step": 18607 + }, + { + "epoch": 0.11066704729279665, + "grad_norm": 2.278247594833374, + "learning_rate": 4.850438032652747e-05, + "loss": 4.4955, + "step": 18608 + }, + { + "epoch": 0.11067299457607765, + "grad_norm": 2.3036160469055176, + "learning_rate": 4.850422118601254e-05, + "loss": 4.9122, + "step": 18609 + }, + { + "epoch": 0.11067894185935864, + "grad_norm": 2.3913469314575195, + "learning_rate": 4.850406203729254e-05, + "loss": 4.4703, + "step": 18610 + }, + { + "epoch": 0.11068488914263964, + "grad_norm": 1.9795238971710205, + "learning_rate": 4.8503902880367516e-05, + "loss": 4.7099, + "step": 18611 + }, + { + "epoch": 0.11069083642592063, + "grad_norm": 2.3990728855133057, + "learning_rate": 4.850374371523752e-05, + "loss": 4.3833, + "step": 18612 + }, + { + "epoch": 0.11069678370920163, + "grad_norm": 2.429461717605591, + "learning_rate": 4.850358454190261e-05, + "loss": 4.4279, + "step": 18613 + }, + { + "epoch": 0.11070273099248264, + "grad_norm": 2.598304271697998, + "learning_rate": 4.8503425360362845e-05, + "loss": 4.4376, + "step": 18614 + }, + { + "epoch": 0.11070867827576363, + "grad_norm": 2.3201403617858887, + "learning_rate": 4.850326617061827e-05, + "loss": 4.6822, + "step": 18615 + }, + { + "epoch": 0.11071462555904463, + "grad_norm": 1.8401033878326416, + "learning_rate": 4.8503106972668956e-05, + "loss": 5.1109, + "step": 18616 + }, + { + "epoch": 0.11072057284232563, + "grad_norm": 1.772309422492981, + "learning_rate": 4.850294776651494e-05, + "loss": 5.7237, + "step": 18617 + }, + { + "epoch": 0.11072652012560662, + "grad_norm": 1.7160669565200806, + "learning_rate": 4.8502788552156295e-05, + "loss": 5.7218, + "step": 18618 + }, + { + "epoch": 0.11073246740888762, + "grad_norm": 1.5467272996902466, + "learning_rate": 4.850262932959306e-05, + "loss": 5.4169, + "step": 18619 + }, + { + "epoch": 0.11073841469216862, + "grad_norm": 1.3382668495178223, + "learning_rate": 4.8502470098825316e-05, + "loss": 5.1243, + "step": 18620 + }, + { + "epoch": 0.11074436197544961, + "grad_norm": 1.3461776971817017, + "learning_rate": 4.850231085985309e-05, + "loss": 4.9412, + "step": 18621 + }, + { + "epoch": 0.11075030925873061, + "grad_norm": 1.4207700490951538, + "learning_rate": 4.850215161267646e-05, + "loss": 5.4449, + "step": 18622 + }, + { + "epoch": 0.11075625654201161, + "grad_norm": 1.7271502017974854, + "learning_rate": 4.8501992357295454e-05, + "loss": 5.4579, + "step": 18623 + }, + { + "epoch": 0.1107622038252926, + "grad_norm": 1.753090500831604, + "learning_rate": 4.8501833093710156e-05, + "loss": 5.7577, + "step": 18624 + }, + { + "epoch": 0.1107681511085736, + "grad_norm": 1.3730309009552002, + "learning_rate": 4.850167382192062e-05, + "loss": 5.3646, + "step": 18625 + }, + { + "epoch": 0.1107740983918546, + "grad_norm": 1.4723306894302368, + "learning_rate": 4.8501514541926883e-05, + "loss": 4.8234, + "step": 18626 + }, + { + "epoch": 0.1107800456751356, + "grad_norm": 1.3944339752197266, + "learning_rate": 4.850135525372901e-05, + "loss": 4.805, + "step": 18627 + }, + { + "epoch": 0.1107859929584166, + "grad_norm": 1.1402732133865356, + "learning_rate": 4.850119595732706e-05, + "loss": 4.9865, + "step": 18628 + }, + { + "epoch": 0.1107919402416976, + "grad_norm": 1.0595287084579468, + "learning_rate": 4.850103665272108e-05, + "loss": 4.9961, + "step": 18629 + }, + { + "epoch": 0.11079788752497859, + "grad_norm": 1.445143699645996, + "learning_rate": 4.8500877339911136e-05, + "loss": 5.2089, + "step": 18630 + }, + { + "epoch": 0.11080383480825959, + "grad_norm": 2.2014050483703613, + "learning_rate": 4.8500718018897275e-05, + "loss": 4.7445, + "step": 18631 + }, + { + "epoch": 0.11080978209154059, + "grad_norm": 2.117194890975952, + "learning_rate": 4.850055868967956e-05, + "loss": 4.8755, + "step": 18632 + }, + { + "epoch": 0.11081572937482158, + "grad_norm": 1.82968008518219, + "learning_rate": 4.850039935225804e-05, + "loss": 4.8852, + "step": 18633 + }, + { + "epoch": 0.11082167665810258, + "grad_norm": 1.613770842552185, + "learning_rate": 4.8500240006632766e-05, + "loss": 5.1053, + "step": 18634 + }, + { + "epoch": 0.11082762394138358, + "grad_norm": 1.8672553300857544, + "learning_rate": 4.850008065280381e-05, + "loss": 4.7134, + "step": 18635 + }, + { + "epoch": 0.11083357122466457, + "grad_norm": 1.9933403730392456, + "learning_rate": 4.849992129077122e-05, + "loss": 4.7544, + "step": 18636 + }, + { + "epoch": 0.11083951850794557, + "grad_norm": 1.8642876148223877, + "learning_rate": 4.849976192053505e-05, + "loss": 4.6598, + "step": 18637 + }, + { + "epoch": 0.11084546579122657, + "grad_norm": 1.8983674049377441, + "learning_rate": 4.849960254209536e-05, + "loss": 4.7403, + "step": 18638 + }, + { + "epoch": 0.11085141307450756, + "grad_norm": 1.9882328510284424, + "learning_rate": 4.849944315545219e-05, + "loss": 5.0105, + "step": 18639 + }, + { + "epoch": 0.11085736035778856, + "grad_norm": 1.7971723079681396, + "learning_rate": 4.8499283760605614e-05, + "loss": 5.6138, + "step": 18640 + }, + { + "epoch": 0.11086330764106955, + "grad_norm": 1.5002641677856445, + "learning_rate": 4.849912435755568e-05, + "loss": 5.7336, + "step": 18641 + }, + { + "epoch": 0.11086925492435055, + "grad_norm": 1.412880778312683, + "learning_rate": 4.8498964946302436e-05, + "loss": 5.532, + "step": 18642 + }, + { + "epoch": 0.11087520220763156, + "grad_norm": 1.6482197046279907, + "learning_rate": 4.849880552684596e-05, + "loss": 5.5432, + "step": 18643 + }, + { + "epoch": 0.11088114949091255, + "grad_norm": 1.5852200984954834, + "learning_rate": 4.849864609918629e-05, + "loss": 5.3577, + "step": 18644 + }, + { + "epoch": 0.11088709677419355, + "grad_norm": 1.540536642074585, + "learning_rate": 4.849848666332348e-05, + "loss": 5.4983, + "step": 18645 + }, + { + "epoch": 0.11089304405747455, + "grad_norm": 1.7822679281234741, + "learning_rate": 4.849832721925759e-05, + "loss": 5.1427, + "step": 18646 + }, + { + "epoch": 0.11089899134075554, + "grad_norm": 1.722977638244629, + "learning_rate": 4.8498167766988685e-05, + "loss": 5.2759, + "step": 18647 + }, + { + "epoch": 0.11090493862403654, + "grad_norm": 1.7543476819992065, + "learning_rate": 4.8498008306516806e-05, + "loss": 5.2616, + "step": 18648 + }, + { + "epoch": 0.11091088590731754, + "grad_norm": 1.4882584810256958, + "learning_rate": 4.8497848837842016e-05, + "loss": 5.3781, + "step": 18649 + }, + { + "epoch": 0.11091683319059853, + "grad_norm": 1.7358192205429077, + "learning_rate": 4.849768936096437e-05, + "loss": 5.5262, + "step": 18650 + }, + { + "epoch": 0.11092278047387953, + "grad_norm": 1.6070705652236938, + "learning_rate": 4.849752987588393e-05, + "loss": 5.0576, + "step": 18651 + }, + { + "epoch": 0.11092872775716053, + "grad_norm": 1.7641521692276, + "learning_rate": 4.8497370382600736e-05, + "loss": 5.21, + "step": 18652 + }, + { + "epoch": 0.11093467504044152, + "grad_norm": 1.8225789070129395, + "learning_rate": 4.849721088111485e-05, + "loss": 6.2734, + "step": 18653 + }, + { + "epoch": 0.11094062232372252, + "grad_norm": 1.8502428531646729, + "learning_rate": 4.849705137142634e-05, + "loss": 5.8298, + "step": 18654 + }, + { + "epoch": 0.11094656960700353, + "grad_norm": 1.4959850311279297, + "learning_rate": 4.8496891853535255e-05, + "loss": 5.4667, + "step": 18655 + }, + { + "epoch": 0.11095251689028451, + "grad_norm": 1.7957161664962769, + "learning_rate": 4.849673232744164e-05, + "loss": 5.3483, + "step": 18656 + }, + { + "epoch": 0.11095846417356552, + "grad_norm": 1.448737382888794, + "learning_rate": 4.8496572793145554e-05, + "loss": 5.4568, + "step": 18657 + }, + { + "epoch": 0.11096441145684652, + "grad_norm": 1.5068676471710205, + "learning_rate": 4.8496413250647065e-05, + "loss": 5.7089, + "step": 18658 + }, + { + "epoch": 0.1109703587401275, + "grad_norm": 1.5162447690963745, + "learning_rate": 4.849625369994622e-05, + "loss": 5.6042, + "step": 18659 + }, + { + "epoch": 0.11097630602340851, + "grad_norm": 1.81594979763031, + "learning_rate": 4.8496094141043076e-05, + "loss": 5.5301, + "step": 18660 + }, + { + "epoch": 0.11098225330668951, + "grad_norm": 1.9147114753723145, + "learning_rate": 4.8495934573937684e-05, + "loss": 4.6335, + "step": 18661 + }, + { + "epoch": 0.1109882005899705, + "grad_norm": 1.4161462783813477, + "learning_rate": 4.8495774998630106e-05, + "loss": 4.9868, + "step": 18662 + }, + { + "epoch": 0.1109941478732515, + "grad_norm": 1.5652790069580078, + "learning_rate": 4.8495615415120396e-05, + "loss": 5.6954, + "step": 18663 + }, + { + "epoch": 0.1110000951565325, + "grad_norm": 1.5217374563217163, + "learning_rate": 4.8495455823408616e-05, + "loss": 5.4338, + "step": 18664 + }, + { + "epoch": 0.11100604243981349, + "grad_norm": 1.3335540294647217, + "learning_rate": 4.8495296223494805e-05, + "loss": 5.4751, + "step": 18665 + }, + { + "epoch": 0.11101198972309449, + "grad_norm": 1.8903460502624512, + "learning_rate": 4.849513661537903e-05, + "loss": 4.9481, + "step": 18666 + }, + { + "epoch": 0.1110179370063755, + "grad_norm": 1.814666748046875, + "learning_rate": 4.849497699906135e-05, + "loss": 5.1422, + "step": 18667 + }, + { + "epoch": 0.11102388428965648, + "grad_norm": 1.7838057279586792, + "learning_rate": 4.8494817374541816e-05, + "loss": 5.3991, + "step": 18668 + }, + { + "epoch": 0.11102983157293748, + "grad_norm": 1.665671944618225, + "learning_rate": 4.849465774182048e-05, + "loss": 5.5362, + "step": 18669 + }, + { + "epoch": 0.11103577885621847, + "grad_norm": 2.255326509475708, + "learning_rate": 4.8494498100897415e-05, + "loss": 5.3161, + "step": 18670 + }, + { + "epoch": 0.11104172613949947, + "grad_norm": 1.7641721963882446, + "learning_rate": 4.849433845177265e-05, + "loss": 5.0422, + "step": 18671 + }, + { + "epoch": 0.11104767342278048, + "grad_norm": 1.4214074611663818, + "learning_rate": 4.8494178794446256e-05, + "loss": 5.2417, + "step": 18672 + }, + { + "epoch": 0.11105362070606146, + "grad_norm": 1.6417256593704224, + "learning_rate": 4.849401912891829e-05, + "loss": 5.262, + "step": 18673 + }, + { + "epoch": 0.11105956798934247, + "grad_norm": 1.4238179922103882, + "learning_rate": 4.84938594551888e-05, + "loss": 5.9754, + "step": 18674 + }, + { + "epoch": 0.11106551527262347, + "grad_norm": 1.9513673782348633, + "learning_rate": 4.849369977325785e-05, + "loss": 5.8917, + "step": 18675 + }, + { + "epoch": 0.11107146255590446, + "grad_norm": 1.625225305557251, + "learning_rate": 4.849354008312549e-05, + "loss": 5.7142, + "step": 18676 + }, + { + "epoch": 0.11107740983918546, + "grad_norm": 1.5306450128555298, + "learning_rate": 4.849338038479178e-05, + "loss": 5.3206, + "step": 18677 + }, + { + "epoch": 0.11108335712246646, + "grad_norm": 2.7895541191101074, + "learning_rate": 4.849322067825677e-05, + "loss": 4.3585, + "step": 18678 + }, + { + "epoch": 0.11108930440574745, + "grad_norm": 2.2688374519348145, + "learning_rate": 4.849306096352052e-05, + "loss": 4.4967, + "step": 18679 + }, + { + "epoch": 0.11109525168902845, + "grad_norm": 2.1710267066955566, + "learning_rate": 4.849290124058309e-05, + "loss": 4.0673, + "step": 18680 + }, + { + "epoch": 0.11110119897230945, + "grad_norm": 2.235142707824707, + "learning_rate": 4.849274150944453e-05, + "loss": 3.8198, + "step": 18681 + }, + { + "epoch": 0.11110714625559044, + "grad_norm": 2.328324317932129, + "learning_rate": 4.849258177010489e-05, + "loss": 4.008, + "step": 18682 + }, + { + "epoch": 0.11111309353887144, + "grad_norm": 2.2681312561035156, + "learning_rate": 4.849242202256424e-05, + "loss": 4.1541, + "step": 18683 + }, + { + "epoch": 0.11111904082215245, + "grad_norm": 2.5430855751037598, + "learning_rate": 4.849226226682262e-05, + "loss": 4.3177, + "step": 18684 + }, + { + "epoch": 0.11112498810543343, + "grad_norm": 2.1995978355407715, + "learning_rate": 4.84921025028801e-05, + "loss": 4.5792, + "step": 18685 + }, + { + "epoch": 0.11113093538871444, + "grad_norm": 1.9515454769134521, + "learning_rate": 4.849194273073673e-05, + "loss": 4.8759, + "step": 18686 + }, + { + "epoch": 0.11113688267199544, + "grad_norm": 2.484431028366089, + "learning_rate": 4.849178295039257e-05, + "loss": 4.1916, + "step": 18687 + }, + { + "epoch": 0.11114282995527643, + "grad_norm": 2.356790065765381, + "learning_rate": 4.8491623161847665e-05, + "loss": 4.38, + "step": 18688 + }, + { + "epoch": 0.11114877723855743, + "grad_norm": 2.414517879486084, + "learning_rate": 4.849146336510207e-05, + "loss": 4.3739, + "step": 18689 + }, + { + "epoch": 0.11115472452183843, + "grad_norm": 2.4129765033721924, + "learning_rate": 4.849130356015587e-05, + "loss": 4.0384, + "step": 18690 + }, + { + "epoch": 0.11116067180511942, + "grad_norm": 2.146932363510132, + "learning_rate": 4.8491143747009074e-05, + "loss": 4.4045, + "step": 18691 + }, + { + "epoch": 0.11116661908840042, + "grad_norm": 2.1945905685424805, + "learning_rate": 4.8490983925661776e-05, + "loss": 5.1674, + "step": 18692 + }, + { + "epoch": 0.11117256637168142, + "grad_norm": 2.2188448905944824, + "learning_rate": 4.849082409611402e-05, + "loss": 4.628, + "step": 18693 + }, + { + "epoch": 0.11117851365496241, + "grad_norm": 1.7684906721115112, + "learning_rate": 4.8490664258365847e-05, + "loss": 5.236, + "step": 18694 + }, + { + "epoch": 0.11118446093824341, + "grad_norm": 2.0367350578308105, + "learning_rate": 4.849050441241734e-05, + "loss": 5.6408, + "step": 18695 + }, + { + "epoch": 0.11119040822152441, + "grad_norm": 2.0829811096191406, + "learning_rate": 4.849034455826853e-05, + "loss": 5.5519, + "step": 18696 + }, + { + "epoch": 0.1111963555048054, + "grad_norm": 1.7884539365768433, + "learning_rate": 4.8490184695919486e-05, + "loss": 5.2345, + "step": 18697 + }, + { + "epoch": 0.1112023027880864, + "grad_norm": 1.8792423009872437, + "learning_rate": 4.849002482537026e-05, + "loss": 4.7622, + "step": 18698 + }, + { + "epoch": 0.11120825007136739, + "grad_norm": 1.7493008375167847, + "learning_rate": 4.8489864946620914e-05, + "loss": 5.295, + "step": 18699 + }, + { + "epoch": 0.1112141973546484, + "grad_norm": 1.60455322265625, + "learning_rate": 4.84897050596715e-05, + "loss": 5.5708, + "step": 18700 + }, + { + "epoch": 0.1112201446379294, + "grad_norm": 1.4326173067092896, + "learning_rate": 4.848954516452206e-05, + "loss": 5.9185, + "step": 18701 + }, + { + "epoch": 0.11122609192121038, + "grad_norm": 1.6318118572235107, + "learning_rate": 4.8489385261172685e-05, + "loss": 5.6545, + "step": 18702 + }, + { + "epoch": 0.11123203920449139, + "grad_norm": 1.4083906412124634, + "learning_rate": 4.848922534962339e-05, + "loss": 5.4776, + "step": 18703 + }, + { + "epoch": 0.11123798648777239, + "grad_norm": 1.222609519958496, + "learning_rate": 4.8489065429874256e-05, + "loss": 5.5094, + "step": 18704 + }, + { + "epoch": 0.11124393377105338, + "grad_norm": 1.6955020427703857, + "learning_rate": 4.848890550192533e-05, + "loss": 5.0516, + "step": 18705 + }, + { + "epoch": 0.11124988105433438, + "grad_norm": 1.3875632286071777, + "learning_rate": 4.848874556577667e-05, + "loss": 5.5321, + "step": 18706 + }, + { + "epoch": 0.11125582833761538, + "grad_norm": 1.2538158893585205, + "learning_rate": 4.848858562142833e-05, + "loss": 5.464, + "step": 18707 + }, + { + "epoch": 0.11126177562089637, + "grad_norm": 1.7350475788116455, + "learning_rate": 4.8488425668880366e-05, + "loss": 5.2815, + "step": 18708 + }, + { + "epoch": 0.11126772290417737, + "grad_norm": 1.543989658355713, + "learning_rate": 4.848826570813284e-05, + "loss": 5.4817, + "step": 18709 + }, + { + "epoch": 0.11127367018745837, + "grad_norm": 1.3931440114974976, + "learning_rate": 4.8488105739185807e-05, + "loss": 5.7652, + "step": 18710 + }, + { + "epoch": 0.11127961747073936, + "grad_norm": 1.4630471467971802, + "learning_rate": 4.8487945762039314e-05, + "loss": 5.4886, + "step": 18711 + }, + { + "epoch": 0.11128556475402036, + "grad_norm": 1.338161826133728, + "learning_rate": 4.848778577669342e-05, + "loss": 5.2021, + "step": 18712 + }, + { + "epoch": 0.11129151203730137, + "grad_norm": 1.4282599687576294, + "learning_rate": 4.8487625783148186e-05, + "loss": 5.2767, + "step": 18713 + }, + { + "epoch": 0.11129745932058235, + "grad_norm": 1.4386523962020874, + "learning_rate": 4.848746578140366e-05, + "loss": 5.7286, + "step": 18714 + }, + { + "epoch": 0.11130340660386336, + "grad_norm": 1.2272754907608032, + "learning_rate": 4.84873057714599e-05, + "loss": 5.3609, + "step": 18715 + }, + { + "epoch": 0.11130935388714436, + "grad_norm": 1.8362592458724976, + "learning_rate": 4.848714575331697e-05, + "loss": 5.0494, + "step": 18716 + }, + { + "epoch": 0.11131530117042535, + "grad_norm": 2.098970651626587, + "learning_rate": 4.848698572697492e-05, + "loss": 4.8282, + "step": 18717 + }, + { + "epoch": 0.11132124845370635, + "grad_norm": 2.2145583629608154, + "learning_rate": 4.84868256924338e-05, + "loss": 4.4621, + "step": 18718 + }, + { + "epoch": 0.11132719573698735, + "grad_norm": 1.8036415576934814, + "learning_rate": 4.848666564969368e-05, + "loss": 5.374, + "step": 18719 + }, + { + "epoch": 0.11133314302026834, + "grad_norm": 1.5794750452041626, + "learning_rate": 4.8486505598754605e-05, + "loss": 5.6246, + "step": 18720 + }, + { + "epoch": 0.11133909030354934, + "grad_norm": 1.637068510055542, + "learning_rate": 4.848634553961664e-05, + "loss": 5.4506, + "step": 18721 + }, + { + "epoch": 0.11134503758683034, + "grad_norm": 1.6928807497024536, + "learning_rate": 4.8486185472279824e-05, + "loss": 5.2405, + "step": 18722 + }, + { + "epoch": 0.11135098487011133, + "grad_norm": 2.0931332111358643, + "learning_rate": 4.848602539674422e-05, + "loss": 4.9366, + "step": 18723 + }, + { + "epoch": 0.11135693215339233, + "grad_norm": 1.4645583629608154, + "learning_rate": 4.848586531300989e-05, + "loss": 5.0677, + "step": 18724 + }, + { + "epoch": 0.11136287943667333, + "grad_norm": 1.7817938327789307, + "learning_rate": 4.8485705221076896e-05, + "loss": 5.5975, + "step": 18725 + }, + { + "epoch": 0.11136882671995432, + "grad_norm": 1.7167946100234985, + "learning_rate": 4.848554512094528e-05, + "loss": 5.829, + "step": 18726 + }, + { + "epoch": 0.11137477400323532, + "grad_norm": 1.723574161529541, + "learning_rate": 4.8485385012615106e-05, + "loss": 5.2702, + "step": 18727 + }, + { + "epoch": 0.11138072128651631, + "grad_norm": 1.4848002195358276, + "learning_rate": 4.848522489608642e-05, + "loss": 5.6739, + "step": 18728 + }, + { + "epoch": 0.11138666856979731, + "grad_norm": 1.798085331916809, + "learning_rate": 4.848506477135929e-05, + "loss": 5.7314, + "step": 18729 + }, + { + "epoch": 0.11139261585307832, + "grad_norm": 1.7033846378326416, + "learning_rate": 4.848490463843376e-05, + "loss": 5.531, + "step": 18730 + }, + { + "epoch": 0.1113985631363593, + "grad_norm": 1.64686119556427, + "learning_rate": 4.8484744497309896e-05, + "loss": 5.8325, + "step": 18731 + }, + { + "epoch": 0.1114045104196403, + "grad_norm": 1.9923123121261597, + "learning_rate": 4.8484584347987755e-05, + "loss": 5.9614, + "step": 18732 + }, + { + "epoch": 0.11141045770292131, + "grad_norm": 1.768896460533142, + "learning_rate": 4.8484424190467385e-05, + "loss": 5.9892, + "step": 18733 + }, + { + "epoch": 0.1114164049862023, + "grad_norm": 1.5981477499008179, + "learning_rate": 4.848426402474885e-05, + "loss": 5.6239, + "step": 18734 + }, + { + "epoch": 0.1114223522694833, + "grad_norm": 1.8919446468353271, + "learning_rate": 4.848410385083219e-05, + "loss": 5.7437, + "step": 18735 + }, + { + "epoch": 0.1114282995527643, + "grad_norm": 2.2705752849578857, + "learning_rate": 4.848394366871748e-05, + "loss": 4.5999, + "step": 18736 + }, + { + "epoch": 0.11143424683604529, + "grad_norm": 1.8626762628555298, + "learning_rate": 4.848378347840476e-05, + "loss": 5.5706, + "step": 18737 + }, + { + "epoch": 0.11144019411932629, + "grad_norm": 1.5893161296844482, + "learning_rate": 4.84836232798941e-05, + "loss": 5.4011, + "step": 18738 + }, + { + "epoch": 0.1114461414026073, + "grad_norm": 1.3441518545150757, + "learning_rate": 4.8483463073185554e-05, + "loss": 5.2412, + "step": 18739 + }, + { + "epoch": 0.11145208868588828, + "grad_norm": 1.6281975507736206, + "learning_rate": 4.848330285827917e-05, + "loss": 5.4281, + "step": 18740 + }, + { + "epoch": 0.11145803596916928, + "grad_norm": 2.1942298412323, + "learning_rate": 4.8483142635175e-05, + "loss": 5.6202, + "step": 18741 + }, + { + "epoch": 0.11146398325245029, + "grad_norm": 2.086764097213745, + "learning_rate": 4.848298240387311e-05, + "loss": 5.665, + "step": 18742 + }, + { + "epoch": 0.11146993053573127, + "grad_norm": 2.0656285285949707, + "learning_rate": 4.848282216437356e-05, + "loss": 5.5196, + "step": 18743 + }, + { + "epoch": 0.11147587781901228, + "grad_norm": 1.5579513311386108, + "learning_rate": 4.84826619166764e-05, + "loss": 5.7366, + "step": 18744 + }, + { + "epoch": 0.11148182510229328, + "grad_norm": 1.7952065467834473, + "learning_rate": 4.848250166078168e-05, + "loss": 5.8041, + "step": 18745 + }, + { + "epoch": 0.11148777238557427, + "grad_norm": 1.3523657321929932, + "learning_rate": 4.848234139668947e-05, + "loss": 5.6628, + "step": 18746 + }, + { + "epoch": 0.11149371966885527, + "grad_norm": 1.6833933591842651, + "learning_rate": 4.848218112439981e-05, + "loss": 5.5285, + "step": 18747 + }, + { + "epoch": 0.11149966695213627, + "grad_norm": 1.308733344078064, + "learning_rate": 4.848202084391276e-05, + "loss": 5.9953, + "step": 18748 + }, + { + "epoch": 0.11150561423541726, + "grad_norm": 1.3434252738952637, + "learning_rate": 4.848186055522838e-05, + "loss": 5.8267, + "step": 18749 + }, + { + "epoch": 0.11151156151869826, + "grad_norm": 1.6250263452529907, + "learning_rate": 4.848170025834673e-05, + "loss": 4.964, + "step": 18750 + }, + { + "epoch": 0.11151750880197926, + "grad_norm": 1.4924334287643433, + "learning_rate": 4.848153995326786e-05, + "loss": 4.9072, + "step": 18751 + }, + { + "epoch": 0.11152345608526025, + "grad_norm": 1.5650702714920044, + "learning_rate": 4.8481379639991826e-05, + "loss": 5.8793, + "step": 18752 + }, + { + "epoch": 0.11152940336854125, + "grad_norm": 1.488553762435913, + "learning_rate": 4.848121931851868e-05, + "loss": 5.823, + "step": 18753 + }, + { + "epoch": 0.11153535065182225, + "grad_norm": 1.5356508493423462, + "learning_rate": 4.848105898884849e-05, + "loss": 5.7632, + "step": 18754 + }, + { + "epoch": 0.11154129793510324, + "grad_norm": 1.5389797687530518, + "learning_rate": 4.8480898650981296e-05, + "loss": 5.8662, + "step": 18755 + }, + { + "epoch": 0.11154724521838424, + "grad_norm": 1.3963713645935059, + "learning_rate": 4.848073830491717e-05, + "loss": 5.5647, + "step": 18756 + }, + { + "epoch": 0.11155319250166523, + "grad_norm": 1.3739324808120728, + "learning_rate": 4.848057795065617e-05, + "loss": 5.6686, + "step": 18757 + }, + { + "epoch": 0.11155913978494623, + "grad_norm": 1.2932708263397217, + "learning_rate": 4.848041758819833e-05, + "loss": 5.6567, + "step": 18758 + }, + { + "epoch": 0.11156508706822724, + "grad_norm": 1.3388581275939941, + "learning_rate": 4.848025721754372e-05, + "loss": 5.6111, + "step": 18759 + }, + { + "epoch": 0.11157103435150822, + "grad_norm": 1.28604257106781, + "learning_rate": 4.84800968386924e-05, + "loss": 5.633, + "step": 18760 + }, + { + "epoch": 0.11157698163478923, + "grad_norm": 2.0710771083831787, + "learning_rate": 4.847993645164441e-05, + "loss": 5.1686, + "step": 18761 + }, + { + "epoch": 0.11158292891807023, + "grad_norm": 1.8022092580795288, + "learning_rate": 4.847977605639983e-05, + "loss": 5.6373, + "step": 18762 + }, + { + "epoch": 0.11158887620135122, + "grad_norm": 1.7080397605895996, + "learning_rate": 4.84796156529587e-05, + "loss": 5.5389, + "step": 18763 + }, + { + "epoch": 0.11159482348463222, + "grad_norm": 1.3582305908203125, + "learning_rate": 4.847945524132107e-05, + "loss": 5.5574, + "step": 18764 + }, + { + "epoch": 0.11160077076791322, + "grad_norm": 1.9037936925888062, + "learning_rate": 4.8479294821487015e-05, + "loss": 5.2108, + "step": 18765 + }, + { + "epoch": 0.11160671805119421, + "grad_norm": 1.6884709596633911, + "learning_rate": 4.8479134393456576e-05, + "loss": 5.2462, + "step": 18766 + }, + { + "epoch": 0.11161266533447521, + "grad_norm": 1.720261812210083, + "learning_rate": 4.8478973957229813e-05, + "loss": 5.5132, + "step": 18767 + }, + { + "epoch": 0.11161861261775621, + "grad_norm": 2.1769275665283203, + "learning_rate": 4.847881351280679e-05, + "loss": 5.1169, + "step": 18768 + }, + { + "epoch": 0.1116245599010372, + "grad_norm": 1.8593683242797852, + "learning_rate": 4.847865306018754e-05, + "loss": 4.8812, + "step": 18769 + }, + { + "epoch": 0.1116305071843182, + "grad_norm": 1.9496150016784668, + "learning_rate": 4.8478492599372147e-05, + "loss": 4.8244, + "step": 18770 + }, + { + "epoch": 0.1116364544675992, + "grad_norm": 1.584330677986145, + "learning_rate": 4.8478332130360655e-05, + "loss": 4.769, + "step": 18771 + }, + { + "epoch": 0.1116424017508802, + "grad_norm": 1.5987087488174438, + "learning_rate": 4.8478171653153116e-05, + "loss": 4.8385, + "step": 18772 + }, + { + "epoch": 0.1116483490341612, + "grad_norm": 1.919463038444519, + "learning_rate": 4.847801116774959e-05, + "loss": 4.7365, + "step": 18773 + }, + { + "epoch": 0.1116542963174422, + "grad_norm": 1.8708561658859253, + "learning_rate": 4.847785067415014e-05, + "loss": 4.9067, + "step": 18774 + }, + { + "epoch": 0.11166024360072319, + "grad_norm": 1.778316617012024, + "learning_rate": 4.8477690172354804e-05, + "loss": 4.8213, + "step": 18775 + }, + { + "epoch": 0.11166619088400419, + "grad_norm": 1.7170525789260864, + "learning_rate": 4.8477529662363655e-05, + "loss": 4.7115, + "step": 18776 + }, + { + "epoch": 0.11167213816728519, + "grad_norm": 1.6704293489456177, + "learning_rate": 4.847736914417674e-05, + "loss": 4.5814, + "step": 18777 + }, + { + "epoch": 0.11167808545056618, + "grad_norm": 1.7422312498092651, + "learning_rate": 4.847720861779412e-05, + "loss": 4.6206, + "step": 18778 + }, + { + "epoch": 0.11168403273384718, + "grad_norm": 1.7162894010543823, + "learning_rate": 4.8477048083215845e-05, + "loss": 4.6421, + "step": 18779 + }, + { + "epoch": 0.11168998001712818, + "grad_norm": 1.7825870513916016, + "learning_rate": 4.847688754044199e-05, + "loss": 4.6899, + "step": 18780 + }, + { + "epoch": 0.11169592730040917, + "grad_norm": 1.8103221654891968, + "learning_rate": 4.8476726989472577e-05, + "loss": 4.5619, + "step": 18781 + }, + { + "epoch": 0.11170187458369017, + "grad_norm": 1.8276532888412476, + "learning_rate": 4.847656643030769e-05, + "loss": 4.3429, + "step": 18782 + }, + { + "epoch": 0.11170782186697117, + "grad_norm": 1.7625696659088135, + "learning_rate": 4.847640586294737e-05, + "loss": 4.4154, + "step": 18783 + }, + { + "epoch": 0.11171376915025216, + "grad_norm": 1.842450499534607, + "learning_rate": 4.8476245287391684e-05, + "loss": 4.6279, + "step": 18784 + }, + { + "epoch": 0.11171971643353316, + "grad_norm": 1.879961371421814, + "learning_rate": 4.847608470364069e-05, + "loss": 4.4906, + "step": 18785 + }, + { + "epoch": 0.11172566371681415, + "grad_norm": 1.5556871891021729, + "learning_rate": 4.847592411169443e-05, + "loss": 5.0258, + "step": 18786 + }, + { + "epoch": 0.11173161100009515, + "grad_norm": 1.8000839948654175, + "learning_rate": 4.8475763511552965e-05, + "loss": 4.4746, + "step": 18787 + }, + { + "epoch": 0.11173755828337616, + "grad_norm": 1.4234516620635986, + "learning_rate": 4.847560290321636e-05, + "loss": 5.4744, + "step": 18788 + }, + { + "epoch": 0.11174350556665714, + "grad_norm": 1.5717182159423828, + "learning_rate": 4.847544228668466e-05, + "loss": 5.4368, + "step": 18789 + }, + { + "epoch": 0.11174945284993815, + "grad_norm": 1.3514728546142578, + "learning_rate": 4.847528166195793e-05, + "loss": 5.3036, + "step": 18790 + }, + { + "epoch": 0.11175540013321915, + "grad_norm": 1.4620373249053955, + "learning_rate": 4.847512102903621e-05, + "loss": 5.2206, + "step": 18791 + }, + { + "epoch": 0.11176134741650014, + "grad_norm": 1.3034706115722656, + "learning_rate": 4.847496038791958e-05, + "loss": 5.3359, + "step": 18792 + }, + { + "epoch": 0.11176729469978114, + "grad_norm": 1.599876046180725, + "learning_rate": 4.847479973860808e-05, + "loss": 5.1282, + "step": 18793 + }, + { + "epoch": 0.11177324198306214, + "grad_norm": 1.4783935546875, + "learning_rate": 4.847463908110177e-05, + "loss": 5.1958, + "step": 18794 + }, + { + "epoch": 0.11177918926634313, + "grad_norm": 1.5132538080215454, + "learning_rate": 4.84744784154007e-05, + "loss": 5.0166, + "step": 18795 + }, + { + "epoch": 0.11178513654962413, + "grad_norm": 1.9335131645202637, + "learning_rate": 4.847431774150495e-05, + "loss": 4.8899, + "step": 18796 + }, + { + "epoch": 0.11179108383290513, + "grad_norm": 1.5765737295150757, + "learning_rate": 4.847415705941454e-05, + "loss": 5.2848, + "step": 18797 + }, + { + "epoch": 0.11179703111618612, + "grad_norm": 1.7239350080490112, + "learning_rate": 4.847399636912955e-05, + "loss": 5.0606, + "step": 18798 + }, + { + "epoch": 0.11180297839946712, + "grad_norm": 1.5246455669403076, + "learning_rate": 4.847383567065004e-05, + "loss": 5.0829, + "step": 18799 + }, + { + "epoch": 0.11180892568274813, + "grad_norm": 1.3902997970581055, + "learning_rate": 4.847367496397604e-05, + "loss": 5.2729, + "step": 18800 + }, + { + "epoch": 0.11181487296602911, + "grad_norm": 1.426282286643982, + "learning_rate": 4.8473514249107634e-05, + "loss": 5.2259, + "step": 18801 + }, + { + "epoch": 0.11182082024931012, + "grad_norm": 1.4425853490829468, + "learning_rate": 4.847335352604486e-05, + "loss": 4.923, + "step": 18802 + }, + { + "epoch": 0.11182676753259112, + "grad_norm": 1.26097571849823, + "learning_rate": 4.8473192794787786e-05, + "loss": 4.9122, + "step": 18803 + }, + { + "epoch": 0.1118327148158721, + "grad_norm": 1.4102699756622314, + "learning_rate": 4.847303205533646e-05, + "loss": 4.9641, + "step": 18804 + }, + { + "epoch": 0.11183866209915311, + "grad_norm": 1.3965771198272705, + "learning_rate": 4.847287130769094e-05, + "loss": 4.9832, + "step": 18805 + }, + { + "epoch": 0.11184460938243411, + "grad_norm": 1.3588200807571411, + "learning_rate": 4.8472710551851284e-05, + "loss": 5.0502, + "step": 18806 + }, + { + "epoch": 0.1118505566657151, + "grad_norm": 1.394020676612854, + "learning_rate": 4.847254978781755e-05, + "loss": 4.9699, + "step": 18807 + }, + { + "epoch": 0.1118565039489961, + "grad_norm": 1.4548087120056152, + "learning_rate": 4.8472389015589794e-05, + "loss": 4.9112, + "step": 18808 + }, + { + "epoch": 0.1118624512322771, + "grad_norm": 1.4359081983566284, + "learning_rate": 4.847222823516806e-05, + "loss": 4.9284, + "step": 18809 + }, + { + "epoch": 0.11186839851555809, + "grad_norm": 1.3159685134887695, + "learning_rate": 4.847206744655242e-05, + "loss": 4.9661, + "step": 18810 + }, + { + "epoch": 0.11187434579883909, + "grad_norm": 1.5037652254104614, + "learning_rate": 4.847190664974292e-05, + "loss": 5.0318, + "step": 18811 + }, + { + "epoch": 0.1118802930821201, + "grad_norm": 1.7603816986083984, + "learning_rate": 4.8471745844739624e-05, + "loss": 5.0486, + "step": 18812 + }, + { + "epoch": 0.11188624036540108, + "grad_norm": 1.6205053329467773, + "learning_rate": 4.847158503154259e-05, + "loss": 5.0587, + "step": 18813 + }, + { + "epoch": 0.11189218764868208, + "grad_norm": 1.559334635734558, + "learning_rate": 4.847142421015185e-05, + "loss": 5.1514, + "step": 18814 + }, + { + "epoch": 0.11189813493196307, + "grad_norm": 1.4896910190582275, + "learning_rate": 4.8471263380567495e-05, + "loss": 5.2103, + "step": 18815 + }, + { + "epoch": 0.11190408221524407, + "grad_norm": 1.43007493019104, + "learning_rate": 4.847110254278956e-05, + "loss": 5.0152, + "step": 18816 + }, + { + "epoch": 0.11191002949852508, + "grad_norm": 1.3567081689834595, + "learning_rate": 4.84709416968181e-05, + "loss": 4.7193, + "step": 18817 + }, + { + "epoch": 0.11191597678180606, + "grad_norm": 1.3283864259719849, + "learning_rate": 4.8470780842653186e-05, + "loss": 4.8559, + "step": 18818 + }, + { + "epoch": 0.11192192406508707, + "grad_norm": 1.5427826642990112, + "learning_rate": 4.8470619980294854e-05, + "loss": 5.1406, + "step": 18819 + }, + { + "epoch": 0.11192787134836807, + "grad_norm": 1.4549115896224976, + "learning_rate": 4.847045910974318e-05, + "loss": 5.0377, + "step": 18820 + }, + { + "epoch": 0.11193381863164906, + "grad_norm": 1.3822715282440186, + "learning_rate": 4.84702982309982e-05, + "loss": 4.9279, + "step": 18821 + }, + { + "epoch": 0.11193976591493006, + "grad_norm": 1.290756106376648, + "learning_rate": 4.8470137344059996e-05, + "loss": 4.9631, + "step": 18822 + }, + { + "epoch": 0.11194571319821106, + "grad_norm": 1.8070625066757202, + "learning_rate": 4.84699764489286e-05, + "loss": 5.0103, + "step": 18823 + }, + { + "epoch": 0.11195166048149205, + "grad_norm": 1.6692131757736206, + "learning_rate": 4.846981554560408e-05, + "loss": 5.1265, + "step": 18824 + }, + { + "epoch": 0.11195760776477305, + "grad_norm": 1.7644426822662354, + "learning_rate": 4.8469654634086495e-05, + "loss": 5.0712, + "step": 18825 + }, + { + "epoch": 0.11196355504805405, + "grad_norm": 1.5689074993133545, + "learning_rate": 4.8469493714375893e-05, + "loss": 5.0551, + "step": 18826 + }, + { + "epoch": 0.11196950233133504, + "grad_norm": 1.610300064086914, + "learning_rate": 4.846933278647233e-05, + "loss": 5.0746, + "step": 18827 + }, + { + "epoch": 0.11197544961461604, + "grad_norm": 1.2828009128570557, + "learning_rate": 4.846917185037586e-05, + "loss": 5.0645, + "step": 18828 + }, + { + "epoch": 0.11198139689789705, + "grad_norm": 1.386265516281128, + "learning_rate": 4.846901090608655e-05, + "loss": 5.1885, + "step": 18829 + }, + { + "epoch": 0.11198734418117803, + "grad_norm": 1.446359634399414, + "learning_rate": 4.846884995360446e-05, + "loss": 5.3245, + "step": 18830 + }, + { + "epoch": 0.11199329146445904, + "grad_norm": 1.4347827434539795, + "learning_rate": 4.846868899292962e-05, + "loss": 5.379, + "step": 18831 + }, + { + "epoch": 0.11199923874774004, + "grad_norm": 1.7589528560638428, + "learning_rate": 4.846852802406212e-05, + "loss": 5.2726, + "step": 18832 + }, + { + "epoch": 0.11200518603102103, + "grad_norm": 1.4316980838775635, + "learning_rate": 4.846836704700199e-05, + "loss": 5.5424, + "step": 18833 + }, + { + "epoch": 0.11201113331430203, + "grad_norm": 1.202364444732666, + "learning_rate": 4.84682060617493e-05, + "loss": 5.4271, + "step": 18834 + }, + { + "epoch": 0.11201708059758303, + "grad_norm": 1.282231330871582, + "learning_rate": 4.8468045068304094e-05, + "loss": 5.4895, + "step": 18835 + }, + { + "epoch": 0.11202302788086402, + "grad_norm": 1.8428497314453125, + "learning_rate": 4.846788406666644e-05, + "loss": 4.9924, + "step": 18836 + }, + { + "epoch": 0.11202897516414502, + "grad_norm": 1.8442119359970093, + "learning_rate": 4.846772305683639e-05, + "loss": 4.6735, + "step": 18837 + }, + { + "epoch": 0.11203492244742602, + "grad_norm": 1.7083659172058105, + "learning_rate": 4.846756203881401e-05, + "loss": 4.8064, + "step": 18838 + }, + { + "epoch": 0.11204086973070701, + "grad_norm": 1.5663195848464966, + "learning_rate": 4.8467401012599336e-05, + "loss": 5.095, + "step": 18839 + }, + { + "epoch": 0.11204681701398801, + "grad_norm": 1.7466095685958862, + "learning_rate": 4.846723997819244e-05, + "loss": 4.7633, + "step": 18840 + }, + { + "epoch": 0.11205276429726901, + "grad_norm": 1.73336660861969, + "learning_rate": 4.846707893559336e-05, + "loss": 4.8776, + "step": 18841 + }, + { + "epoch": 0.11205871158055, + "grad_norm": 1.726456880569458, + "learning_rate": 4.8466917884802175e-05, + "loss": 4.845, + "step": 18842 + }, + { + "epoch": 0.112064658863831, + "grad_norm": 1.733583927154541, + "learning_rate": 4.8466756825818934e-05, + "loss": 4.8272, + "step": 18843 + }, + { + "epoch": 0.11207060614711199, + "grad_norm": 1.8252346515655518, + "learning_rate": 4.8466595758643684e-05, + "loss": 4.7088, + "step": 18844 + }, + { + "epoch": 0.112076553430393, + "grad_norm": 1.6071163415908813, + "learning_rate": 4.8466434683276495e-05, + "loss": 4.7085, + "step": 18845 + }, + { + "epoch": 0.112082500713674, + "grad_norm": 1.8407503366470337, + "learning_rate": 4.846627359971741e-05, + "loss": 4.6885, + "step": 18846 + }, + { + "epoch": 0.11208844799695498, + "grad_norm": 1.5426356792449951, + "learning_rate": 4.84661125079665e-05, + "loss": 4.7252, + "step": 18847 + }, + { + "epoch": 0.11209439528023599, + "grad_norm": 1.8290139436721802, + "learning_rate": 4.84659514080238e-05, + "loss": 4.9314, + "step": 18848 + }, + { + "epoch": 0.11210034256351699, + "grad_norm": 1.73724365234375, + "learning_rate": 4.846579029988939e-05, + "loss": 4.7618, + "step": 18849 + }, + { + "epoch": 0.11210628984679798, + "grad_norm": 2.0577304363250732, + "learning_rate": 4.8465629183563314e-05, + "loss": 4.8118, + "step": 18850 + }, + { + "epoch": 0.11211223713007898, + "grad_norm": 1.8696433305740356, + "learning_rate": 4.846546805904562e-05, + "loss": 4.6813, + "step": 18851 + }, + { + "epoch": 0.11211818441335998, + "grad_norm": 1.6597977876663208, + "learning_rate": 4.846530692633638e-05, + "loss": 4.5187, + "step": 18852 + }, + { + "epoch": 0.11212413169664097, + "grad_norm": 1.6595630645751953, + "learning_rate": 4.846514578543564e-05, + "loss": 5.012, + "step": 18853 + }, + { + "epoch": 0.11213007897992197, + "grad_norm": 2.2116329669952393, + "learning_rate": 4.846498463634347e-05, + "loss": 5.1757, + "step": 18854 + }, + { + "epoch": 0.11213602626320297, + "grad_norm": 1.8592875003814697, + "learning_rate": 4.846482347905991e-05, + "loss": 6.0403, + "step": 18855 + }, + { + "epoch": 0.11214197354648396, + "grad_norm": 1.7812080383300781, + "learning_rate": 4.846466231358502e-05, + "loss": 5.974, + "step": 18856 + }, + { + "epoch": 0.11214792082976496, + "grad_norm": 1.8986600637435913, + "learning_rate": 4.846450113991886e-05, + "loss": 5.3866, + "step": 18857 + }, + { + "epoch": 0.11215386811304597, + "grad_norm": 2.4542179107666016, + "learning_rate": 4.846433995806148e-05, + "loss": 4.863, + "step": 18858 + }, + { + "epoch": 0.11215981539632695, + "grad_norm": 2.1604816913604736, + "learning_rate": 4.846417876801295e-05, + "loss": 5.219, + "step": 18859 + }, + { + "epoch": 0.11216576267960796, + "grad_norm": 2.325782060623169, + "learning_rate": 4.846401756977331e-05, + "loss": 5.1454, + "step": 18860 + }, + { + "epoch": 0.11217170996288896, + "grad_norm": 2.3508334159851074, + "learning_rate": 4.846385636334263e-05, + "loss": 5.1318, + "step": 18861 + }, + { + "epoch": 0.11217765724616995, + "grad_norm": 2.2381060123443604, + "learning_rate": 4.846369514872096e-05, + "loss": 5.0676, + "step": 18862 + }, + { + "epoch": 0.11218360452945095, + "grad_norm": 2.3624770641326904, + "learning_rate": 4.8463533925908355e-05, + "loss": 5.0251, + "step": 18863 + }, + { + "epoch": 0.11218955181273195, + "grad_norm": 1.9950919151306152, + "learning_rate": 4.846337269490487e-05, + "loss": 5.0396, + "step": 18864 + }, + { + "epoch": 0.11219549909601294, + "grad_norm": 1.829410433769226, + "learning_rate": 4.8463211455710574e-05, + "loss": 4.9327, + "step": 18865 + }, + { + "epoch": 0.11220144637929394, + "grad_norm": 1.8879605531692505, + "learning_rate": 4.846305020832551e-05, + "loss": 4.8902, + "step": 18866 + }, + { + "epoch": 0.11220739366257494, + "grad_norm": 1.89055335521698, + "learning_rate": 4.846288895274973e-05, + "loss": 4.9219, + "step": 18867 + }, + { + "epoch": 0.11221334094585593, + "grad_norm": 2.224971055984497, + "learning_rate": 4.84627276889833e-05, + "loss": 5.0164, + "step": 18868 + }, + { + "epoch": 0.11221928822913693, + "grad_norm": 2.1675336360931396, + "learning_rate": 4.8462566417026276e-05, + "loss": 5.0082, + "step": 18869 + }, + { + "epoch": 0.11222523551241793, + "grad_norm": 1.885236144065857, + "learning_rate": 4.8462405136878714e-05, + "loss": 5.1484, + "step": 18870 + }, + { + "epoch": 0.11223118279569892, + "grad_norm": 1.3037774562835693, + "learning_rate": 4.846224384854067e-05, + "loss": 5.64, + "step": 18871 + }, + { + "epoch": 0.11223713007897992, + "grad_norm": 1.6506762504577637, + "learning_rate": 4.846208255201219e-05, + "loss": 5.6067, + "step": 18872 + }, + { + "epoch": 0.11224307736226091, + "grad_norm": 1.4294368028640747, + "learning_rate": 4.8461921247293344e-05, + "loss": 5.67, + "step": 18873 + }, + { + "epoch": 0.11224902464554191, + "grad_norm": 1.6201854944229126, + "learning_rate": 4.846175993438419e-05, + "loss": 5.6093, + "step": 18874 + }, + { + "epoch": 0.11225497192882292, + "grad_norm": 1.5683603286743164, + "learning_rate": 4.846159861328478e-05, + "loss": 5.6129, + "step": 18875 + }, + { + "epoch": 0.1122609192121039, + "grad_norm": 1.5446193218231201, + "learning_rate": 4.8461437283995156e-05, + "loss": 5.6063, + "step": 18876 + }, + { + "epoch": 0.1122668664953849, + "grad_norm": 1.477872371673584, + "learning_rate": 4.846127594651539e-05, + "loss": 5.6291, + "step": 18877 + }, + { + "epoch": 0.11227281377866591, + "grad_norm": 1.477872371673584, + "learning_rate": 4.846111460084554e-05, + "loss": 5.6282, + "step": 18878 + }, + { + "epoch": 0.1122787610619469, + "grad_norm": 1.4379156827926636, + "learning_rate": 4.846095324698565e-05, + "loss": 5.5451, + "step": 18879 + }, + { + "epoch": 0.1122847083452279, + "grad_norm": 1.4940646886825562, + "learning_rate": 4.8460791884935785e-05, + "loss": 5.4705, + "step": 18880 + }, + { + "epoch": 0.1122906556285089, + "grad_norm": 1.4625567197799683, + "learning_rate": 4.8460630514696e-05, + "loss": 5.5428, + "step": 18881 + }, + { + "epoch": 0.11229660291178989, + "grad_norm": 1.7899153232574463, + "learning_rate": 4.846046913626636e-05, + "loss": 5.7665, + "step": 18882 + }, + { + "epoch": 0.11230255019507089, + "grad_norm": 2.1002516746520996, + "learning_rate": 4.8460307749646906e-05, + "loss": 6.1132, + "step": 18883 + }, + { + "epoch": 0.11230849747835189, + "grad_norm": 1.8406580686569214, + "learning_rate": 4.84601463548377e-05, + "loss": 5.5207, + "step": 18884 + }, + { + "epoch": 0.11231444476163288, + "grad_norm": 1.6287425756454468, + "learning_rate": 4.84599849518388e-05, + "loss": 5.931, + "step": 18885 + }, + { + "epoch": 0.11232039204491388, + "grad_norm": 1.4447002410888672, + "learning_rate": 4.845982354065027e-05, + "loss": 5.6181, + "step": 18886 + }, + { + "epoch": 0.11232633932819489, + "grad_norm": 1.6555171012878418, + "learning_rate": 4.845966212127215e-05, + "loss": 5.1448, + "step": 18887 + }, + { + "epoch": 0.11233228661147587, + "grad_norm": 2.0948448181152344, + "learning_rate": 4.84595006937045e-05, + "loss": 5.3695, + "step": 18888 + }, + { + "epoch": 0.11233823389475688, + "grad_norm": 1.6369346380233765, + "learning_rate": 4.845933925794739e-05, + "loss": 5.5859, + "step": 18889 + }, + { + "epoch": 0.11234418117803788, + "grad_norm": 1.4660474061965942, + "learning_rate": 4.845917781400086e-05, + "loss": 5.6121, + "step": 18890 + }, + { + "epoch": 0.11235012846131887, + "grad_norm": 1.6739449501037598, + "learning_rate": 4.845901636186497e-05, + "loss": 5.6874, + "step": 18891 + }, + { + "epoch": 0.11235607574459987, + "grad_norm": 1.4542694091796875, + "learning_rate": 4.8458854901539794e-05, + "loss": 5.5956, + "step": 18892 + }, + { + "epoch": 0.11236202302788087, + "grad_norm": 1.3305023908615112, + "learning_rate": 4.8458693433025365e-05, + "loss": 5.658, + "step": 18893 + }, + { + "epoch": 0.11236797031116186, + "grad_norm": 1.8081300258636475, + "learning_rate": 4.845853195632175e-05, + "loss": 4.8563, + "step": 18894 + }, + { + "epoch": 0.11237391759444286, + "grad_norm": 1.8959764242172241, + "learning_rate": 4.8458370471429e-05, + "loss": 5.3051, + "step": 18895 + }, + { + "epoch": 0.11237986487772386, + "grad_norm": 1.9471427202224731, + "learning_rate": 4.845820897834718e-05, + "loss": 5.8181, + "step": 18896 + }, + { + "epoch": 0.11238581216100485, + "grad_norm": 1.6311548948287964, + "learning_rate": 4.845804747707634e-05, + "loss": 5.7714, + "step": 18897 + }, + { + "epoch": 0.11239175944428585, + "grad_norm": 1.830788493156433, + "learning_rate": 4.845788596761653e-05, + "loss": 5.9535, + "step": 18898 + }, + { + "epoch": 0.11239770672756685, + "grad_norm": 1.7896127700805664, + "learning_rate": 4.8457724449967836e-05, + "loss": 5.5385, + "step": 18899 + }, + { + "epoch": 0.11240365401084784, + "grad_norm": 1.5098718404769897, + "learning_rate": 4.845756292413027e-05, + "loss": 5.4067, + "step": 18900 + }, + { + "epoch": 0.11240960129412884, + "grad_norm": 1.9224756956100464, + "learning_rate": 4.845740139010392e-05, + "loss": 5.4863, + "step": 18901 + }, + { + "epoch": 0.11241554857740983, + "grad_norm": 2.1158740520477295, + "learning_rate": 4.845723984788884e-05, + "loss": 5.0745, + "step": 18902 + }, + { + "epoch": 0.11242149586069083, + "grad_norm": 2.292292594909668, + "learning_rate": 4.845707829748507e-05, + "loss": 4.9248, + "step": 18903 + }, + { + "epoch": 0.11242744314397184, + "grad_norm": 2.312593698501587, + "learning_rate": 4.8456916738892675e-05, + "loss": 4.9712, + "step": 18904 + }, + { + "epoch": 0.11243339042725282, + "grad_norm": 1.7302945852279663, + "learning_rate": 4.8456755172111725e-05, + "loss": 5.0814, + "step": 18905 + }, + { + "epoch": 0.11243933771053383, + "grad_norm": 1.3441206216812134, + "learning_rate": 4.845659359714225e-05, + "loss": 5.6563, + "step": 18906 + }, + { + "epoch": 0.11244528499381483, + "grad_norm": 1.5126272439956665, + "learning_rate": 4.845643201398433e-05, + "loss": 5.607, + "step": 18907 + }, + { + "epoch": 0.11245123227709582, + "grad_norm": 1.438795804977417, + "learning_rate": 4.845627042263801e-05, + "loss": 5.5287, + "step": 18908 + }, + { + "epoch": 0.11245717956037682, + "grad_norm": 1.6724447011947632, + "learning_rate": 4.845610882310335e-05, + "loss": 5.361, + "step": 18909 + }, + { + "epoch": 0.11246312684365782, + "grad_norm": 1.7267217636108398, + "learning_rate": 4.845594721538041e-05, + "loss": 5.6361, + "step": 18910 + }, + { + "epoch": 0.11246907412693881, + "grad_norm": 1.7616380453109741, + "learning_rate": 4.845578559946923e-05, + "loss": 5.2538, + "step": 18911 + }, + { + "epoch": 0.11247502141021981, + "grad_norm": 1.8318467140197754, + "learning_rate": 4.845562397536988e-05, + "loss": 4.8236, + "step": 18912 + }, + { + "epoch": 0.11248096869350081, + "grad_norm": 2.4882378578186035, + "learning_rate": 4.8455462343082415e-05, + "loss": 4.5624, + "step": 18913 + }, + { + "epoch": 0.1124869159767818, + "grad_norm": 2.5109870433807373, + "learning_rate": 4.845530070260689e-05, + "loss": 4.7906, + "step": 18914 + }, + { + "epoch": 0.1124928632600628, + "grad_norm": 2.2084672451019287, + "learning_rate": 4.845513905394336e-05, + "loss": 4.5304, + "step": 18915 + }, + { + "epoch": 0.1124988105433438, + "grad_norm": 2.4276058673858643, + "learning_rate": 4.8454977397091885e-05, + "loss": 4.3753, + "step": 18916 + }, + { + "epoch": 0.1125047578266248, + "grad_norm": 2.5022165775299072, + "learning_rate": 4.845481573205252e-05, + "loss": 4.1849, + "step": 18917 + }, + { + "epoch": 0.1125107051099058, + "grad_norm": 2.511643171310425, + "learning_rate": 4.845465405882532e-05, + "loss": 4.4007, + "step": 18918 + }, + { + "epoch": 0.1125166523931868, + "grad_norm": 2.598860263824463, + "learning_rate": 4.845449237741034e-05, + "loss": 4.6015, + "step": 18919 + }, + { + "epoch": 0.11252259967646779, + "grad_norm": 2.339555263519287, + "learning_rate": 4.845433068780765e-05, + "loss": 4.4123, + "step": 18920 + }, + { + "epoch": 0.11252854695974879, + "grad_norm": 2.286858320236206, + "learning_rate": 4.845416899001729e-05, + "loss": 4.3709, + "step": 18921 + }, + { + "epoch": 0.11253449424302979, + "grad_norm": 2.431622266769409, + "learning_rate": 4.845400728403932e-05, + "loss": 4.2162, + "step": 18922 + }, + { + "epoch": 0.11254044152631078, + "grad_norm": 2.7147364616394043, + "learning_rate": 4.8453845569873796e-05, + "loss": 4.3949, + "step": 18923 + }, + { + "epoch": 0.11254638880959178, + "grad_norm": 2.4738264083862305, + "learning_rate": 4.8453683847520784e-05, + "loss": 4.2671, + "step": 18924 + }, + { + "epoch": 0.11255233609287278, + "grad_norm": 2.007298707962036, + "learning_rate": 4.8453522116980325e-05, + "loss": 4.9317, + "step": 18925 + }, + { + "epoch": 0.11255828337615377, + "grad_norm": 1.8057860136032104, + "learning_rate": 4.8453360378252486e-05, + "loss": 5.4763, + "step": 18926 + }, + { + "epoch": 0.11256423065943477, + "grad_norm": 1.913892149925232, + "learning_rate": 4.845319863133733e-05, + "loss": 5.3112, + "step": 18927 + }, + { + "epoch": 0.11257017794271577, + "grad_norm": 1.6226540803909302, + "learning_rate": 4.845303687623489e-05, + "loss": 5.7164, + "step": 18928 + }, + { + "epoch": 0.11257612522599676, + "grad_norm": 1.7885600328445435, + "learning_rate": 4.8452875112945253e-05, + "loss": 5.7746, + "step": 18929 + }, + { + "epoch": 0.11258207250927776, + "grad_norm": 1.5598177909851074, + "learning_rate": 4.8452713341468444e-05, + "loss": 5.7843, + "step": 18930 + }, + { + "epoch": 0.11258801979255875, + "grad_norm": 1.517059564590454, + "learning_rate": 4.845255156180455e-05, + "loss": 5.7777, + "step": 18931 + }, + { + "epoch": 0.11259396707583975, + "grad_norm": 1.2515442371368408, + "learning_rate": 4.84523897739536e-05, + "loss": 5.7443, + "step": 18932 + }, + { + "epoch": 0.11259991435912076, + "grad_norm": 1.4970554113388062, + "learning_rate": 4.845222797791566e-05, + "loss": 5.6157, + "step": 18933 + }, + { + "epoch": 0.11260586164240174, + "grad_norm": 1.632620930671692, + "learning_rate": 4.8452066173690804e-05, + "loss": 5.0715, + "step": 18934 + }, + { + "epoch": 0.11261180892568275, + "grad_norm": 1.9634324312210083, + "learning_rate": 4.845190436127907e-05, + "loss": 5.3624, + "step": 18935 + }, + { + "epoch": 0.11261775620896375, + "grad_norm": 1.663560152053833, + "learning_rate": 4.8451742540680514e-05, + "loss": 5.4324, + "step": 18936 + }, + { + "epoch": 0.11262370349224474, + "grad_norm": 1.560684323310852, + "learning_rate": 4.84515807118952e-05, + "loss": 4.8426, + "step": 18937 + }, + { + "epoch": 0.11262965077552574, + "grad_norm": 1.5759334564208984, + "learning_rate": 4.8451418874923185e-05, + "loss": 5.6239, + "step": 18938 + }, + { + "epoch": 0.11263559805880674, + "grad_norm": 1.8501811027526855, + "learning_rate": 4.8451257029764504e-05, + "loss": 5.1734, + "step": 18939 + }, + { + "epoch": 0.11264154534208773, + "grad_norm": 1.811924934387207, + "learning_rate": 4.845109517641925e-05, + "loss": 5.2778, + "step": 18940 + }, + { + "epoch": 0.11264749262536873, + "grad_norm": 1.9684933423995972, + "learning_rate": 4.845093331488746e-05, + "loss": 5.3673, + "step": 18941 + }, + { + "epoch": 0.11265343990864973, + "grad_norm": 2.1155457496643066, + "learning_rate": 4.8450771445169185e-05, + "loss": 4.6955, + "step": 18942 + }, + { + "epoch": 0.11265938719193072, + "grad_norm": 2.117941379547119, + "learning_rate": 4.8450609567264495e-05, + "loss": 4.4051, + "step": 18943 + }, + { + "epoch": 0.11266533447521172, + "grad_norm": 1.9649946689605713, + "learning_rate": 4.845044768117343e-05, + "loss": 5.0204, + "step": 18944 + }, + { + "epoch": 0.11267128175849273, + "grad_norm": 1.898119568824768, + "learning_rate": 4.845028578689606e-05, + "loss": 4.9994, + "step": 18945 + }, + { + "epoch": 0.11267722904177371, + "grad_norm": 2.4376771450042725, + "learning_rate": 4.845012388443244e-05, + "loss": 4.6852, + "step": 18946 + }, + { + "epoch": 0.11268317632505472, + "grad_norm": 2.593094825744629, + "learning_rate": 4.844996197378262e-05, + "loss": 4.3845, + "step": 18947 + }, + { + "epoch": 0.11268912360833572, + "grad_norm": 2.6004302501678467, + "learning_rate": 4.844980005494666e-05, + "loss": 4.2989, + "step": 18948 + }, + { + "epoch": 0.1126950708916167, + "grad_norm": 2.4045653343200684, + "learning_rate": 4.844963812792462e-05, + "loss": 4.411, + "step": 18949 + }, + { + "epoch": 0.11270101817489771, + "grad_norm": 2.2256572246551514, + "learning_rate": 4.8449476192716555e-05, + "loss": 4.423, + "step": 18950 + }, + { + "epoch": 0.11270696545817871, + "grad_norm": 2.110077142715454, + "learning_rate": 4.844931424932252e-05, + "loss": 4.2971, + "step": 18951 + }, + { + "epoch": 0.1127129127414597, + "grad_norm": 1.8960111141204834, + "learning_rate": 4.844915229774257e-05, + "loss": 5.0758, + "step": 18952 + }, + { + "epoch": 0.1127188600247407, + "grad_norm": 1.998542308807373, + "learning_rate": 4.844899033797676e-05, + "loss": 4.8565, + "step": 18953 + }, + { + "epoch": 0.1127248073080217, + "grad_norm": 1.7070491313934326, + "learning_rate": 4.8448828370025156e-05, + "loss": 5.4684, + "step": 18954 + }, + { + "epoch": 0.11273075459130269, + "grad_norm": 2.062570095062256, + "learning_rate": 4.8448666393887806e-05, + "loss": 5.5384, + "step": 18955 + }, + { + "epoch": 0.11273670187458369, + "grad_norm": 1.8782148361206055, + "learning_rate": 4.844850440956476e-05, + "loss": 5.0373, + "step": 18956 + }, + { + "epoch": 0.1127426491578647, + "grad_norm": 2.3674817085266113, + "learning_rate": 4.8448342417056096e-05, + "loss": 5.1999, + "step": 18957 + }, + { + "epoch": 0.11274859644114568, + "grad_norm": 2.2243809700012207, + "learning_rate": 4.844818041636186e-05, + "loss": 5.3275, + "step": 18958 + }, + { + "epoch": 0.11275454372442668, + "grad_norm": 2.2929039001464844, + "learning_rate": 4.8448018407482096e-05, + "loss": 5.3958, + "step": 18959 + }, + { + "epoch": 0.11276049100770767, + "grad_norm": 2.0325045585632324, + "learning_rate": 4.844785639041688e-05, + "loss": 4.6686, + "step": 18960 + }, + { + "epoch": 0.11276643829098867, + "grad_norm": 1.8510624170303345, + "learning_rate": 4.8447694365166255e-05, + "loss": 4.9134, + "step": 18961 + }, + { + "epoch": 0.11277238557426968, + "grad_norm": 1.7537583112716675, + "learning_rate": 4.844753233173027e-05, + "loss": 5.0618, + "step": 18962 + }, + { + "epoch": 0.11277833285755066, + "grad_norm": 1.9293370246887207, + "learning_rate": 4.844737029010901e-05, + "loss": 4.8716, + "step": 18963 + }, + { + "epoch": 0.11278428014083167, + "grad_norm": 1.6931575536727905, + "learning_rate": 4.844720824030251e-05, + "loss": 5.4606, + "step": 18964 + }, + { + "epoch": 0.11279022742411267, + "grad_norm": 1.970825433731079, + "learning_rate": 4.8447046182310836e-05, + "loss": 5.2482, + "step": 18965 + }, + { + "epoch": 0.11279617470739366, + "grad_norm": 1.4842323064804077, + "learning_rate": 4.844688411613404e-05, + "loss": 5.972, + "step": 18966 + }, + { + "epoch": 0.11280212199067466, + "grad_norm": 1.84175705909729, + "learning_rate": 4.8446722041772174e-05, + "loss": 4.7696, + "step": 18967 + }, + { + "epoch": 0.11280806927395566, + "grad_norm": 1.8980286121368408, + "learning_rate": 4.84465599592253e-05, + "loss": 4.5125, + "step": 18968 + }, + { + "epoch": 0.11281401655723665, + "grad_norm": 1.7349838018417358, + "learning_rate": 4.844639786849348e-05, + "loss": 4.581, + "step": 18969 + }, + { + "epoch": 0.11281996384051765, + "grad_norm": 1.5894320011138916, + "learning_rate": 4.844623576957675e-05, + "loss": 4.9205, + "step": 18970 + }, + { + "epoch": 0.11282591112379865, + "grad_norm": 1.8740227222442627, + "learning_rate": 4.84460736624752e-05, + "loss": 4.938, + "step": 18971 + }, + { + "epoch": 0.11283185840707964, + "grad_norm": 1.744537591934204, + "learning_rate": 4.8445911547188854e-05, + "loss": 5.5215, + "step": 18972 + }, + { + "epoch": 0.11283780569036064, + "grad_norm": 1.5465041399002075, + "learning_rate": 4.844574942371779e-05, + "loss": 5.3607, + "step": 18973 + }, + { + "epoch": 0.11284375297364165, + "grad_norm": 1.8417413234710693, + "learning_rate": 4.8445587292062056e-05, + "loss": 5.632, + "step": 18974 + }, + { + "epoch": 0.11284970025692263, + "grad_norm": 1.7401045560836792, + "learning_rate": 4.8445425152221704e-05, + "loss": 5.5514, + "step": 18975 + }, + { + "epoch": 0.11285564754020364, + "grad_norm": 1.6192666292190552, + "learning_rate": 4.8445263004196805e-05, + "loss": 5.2694, + "step": 18976 + }, + { + "epoch": 0.11286159482348464, + "grad_norm": 1.842510461807251, + "learning_rate": 4.84451008479874e-05, + "loss": 5.3429, + "step": 18977 + }, + { + "epoch": 0.11286754210676563, + "grad_norm": 1.4824966192245483, + "learning_rate": 4.8444938683593554e-05, + "loss": 5.5212, + "step": 18978 + }, + { + "epoch": 0.11287348939004663, + "grad_norm": 1.7926548719406128, + "learning_rate": 4.8444776511015324e-05, + "loss": 4.8687, + "step": 18979 + }, + { + "epoch": 0.11287943667332763, + "grad_norm": 1.7114008665084839, + "learning_rate": 4.844461433025277e-05, + "loss": 4.7459, + "step": 18980 + }, + { + "epoch": 0.11288538395660862, + "grad_norm": 1.8884011507034302, + "learning_rate": 4.844445214130594e-05, + "loss": 5.1957, + "step": 18981 + }, + { + "epoch": 0.11289133123988962, + "grad_norm": 1.6901582479476929, + "learning_rate": 4.844428994417489e-05, + "loss": 5.3349, + "step": 18982 + }, + { + "epoch": 0.11289727852317062, + "grad_norm": 1.7148336172103882, + "learning_rate": 4.844412773885968e-05, + "loss": 5.4903, + "step": 18983 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 1.478767991065979, + "learning_rate": 4.844396552536037e-05, + "loss": 5.6339, + "step": 18984 + }, + { + "epoch": 0.11290917308973261, + "grad_norm": 1.5679733753204346, + "learning_rate": 4.844380330367701e-05, + "loss": 5.4722, + "step": 18985 + }, + { + "epoch": 0.11291512037301361, + "grad_norm": 1.718564510345459, + "learning_rate": 4.844364107380966e-05, + "loss": 5.2826, + "step": 18986 + }, + { + "epoch": 0.1129210676562946, + "grad_norm": 1.6757621765136719, + "learning_rate": 4.844347883575839e-05, + "loss": 5.7454, + "step": 18987 + }, + { + "epoch": 0.1129270149395756, + "grad_norm": 1.9370322227478027, + "learning_rate": 4.844331658952324e-05, + "loss": 4.6631, + "step": 18988 + }, + { + "epoch": 0.1129329622228566, + "grad_norm": 1.9932162761688232, + "learning_rate": 4.844315433510426e-05, + "loss": 4.7486, + "step": 18989 + }, + { + "epoch": 0.1129389095061376, + "grad_norm": 2.0191309452056885, + "learning_rate": 4.844299207250152e-05, + "loss": 4.6999, + "step": 18990 + }, + { + "epoch": 0.1129448567894186, + "grad_norm": 1.971913456916809, + "learning_rate": 4.8442829801715074e-05, + "loss": 4.7345, + "step": 18991 + }, + { + "epoch": 0.11295080407269958, + "grad_norm": 1.8503371477127075, + "learning_rate": 4.844266752274498e-05, + "loss": 4.5352, + "step": 18992 + }, + { + "epoch": 0.11295675135598059, + "grad_norm": 2.0024712085723877, + "learning_rate": 4.8442505235591294e-05, + "loss": 4.6513, + "step": 18993 + }, + { + "epoch": 0.11296269863926159, + "grad_norm": 1.645996332168579, + "learning_rate": 4.844234294025407e-05, + "loss": 4.816, + "step": 18994 + }, + { + "epoch": 0.11296864592254258, + "grad_norm": 1.6649290323257446, + "learning_rate": 4.844218063673337e-05, + "loss": 5.1471, + "step": 18995 + }, + { + "epoch": 0.11297459320582358, + "grad_norm": 1.4211794137954712, + "learning_rate": 4.844201832502924e-05, + "loss": 5.0807, + "step": 18996 + }, + { + "epoch": 0.11298054048910458, + "grad_norm": 1.6982463598251343, + "learning_rate": 4.844185600514175e-05, + "loss": 4.9912, + "step": 18997 + }, + { + "epoch": 0.11298648777238557, + "grad_norm": 1.5852501392364502, + "learning_rate": 4.844169367707095e-05, + "loss": 5.4541, + "step": 18998 + }, + { + "epoch": 0.11299243505566657, + "grad_norm": 1.787331223487854, + "learning_rate": 4.844153134081689e-05, + "loss": 5.4295, + "step": 18999 + }, + { + "epoch": 0.11299838233894757, + "grad_norm": 1.5758492946624756, + "learning_rate": 4.844136899637964e-05, + "loss": 5.2601, + "step": 19000 + }, + { + "epoch": 0.11300432962222856, + "grad_norm": 1.5441172122955322, + "learning_rate": 4.844120664375925e-05, + "loss": 4.882, + "step": 19001 + }, + { + "epoch": 0.11301027690550956, + "grad_norm": 1.6587432622909546, + "learning_rate": 4.8441044282955774e-05, + "loss": 4.8311, + "step": 19002 + }, + { + "epoch": 0.11301622418879056, + "grad_norm": 1.6563838720321655, + "learning_rate": 4.844088191396927e-05, + "loss": 4.87, + "step": 19003 + }, + { + "epoch": 0.11302217147207155, + "grad_norm": 1.7367866039276123, + "learning_rate": 4.84407195367998e-05, + "loss": 5.2984, + "step": 19004 + }, + { + "epoch": 0.11302811875535256, + "grad_norm": 2.3307883739471436, + "learning_rate": 4.844055715144742e-05, + "loss": 4.8798, + "step": 19005 + }, + { + "epoch": 0.11303406603863356, + "grad_norm": 2.601762294769287, + "learning_rate": 4.844039475791218e-05, + "loss": 4.8156, + "step": 19006 + }, + { + "epoch": 0.11304001332191455, + "grad_norm": 2.372610330581665, + "learning_rate": 4.844023235619414e-05, + "loss": 4.0715, + "step": 19007 + }, + { + "epoch": 0.11304596060519555, + "grad_norm": 2.16119384765625, + "learning_rate": 4.8440069946293356e-05, + "loss": 4.2701, + "step": 19008 + }, + { + "epoch": 0.11305190788847655, + "grad_norm": 2.1576502323150635, + "learning_rate": 4.843990752820989e-05, + "loss": 4.1302, + "step": 19009 + }, + { + "epoch": 0.11305785517175754, + "grad_norm": 2.122025489807129, + "learning_rate": 4.843974510194379e-05, + "loss": 4.0969, + "step": 19010 + }, + { + "epoch": 0.11306380245503854, + "grad_norm": 2.1929194927215576, + "learning_rate": 4.843958266749512e-05, + "loss": 4.2054, + "step": 19011 + }, + { + "epoch": 0.11306974973831954, + "grad_norm": 2.6305301189422607, + "learning_rate": 4.843942022486393e-05, + "loss": 4.3942, + "step": 19012 + }, + { + "epoch": 0.11307569702160053, + "grad_norm": 2.5355119705200195, + "learning_rate": 4.843925777405028e-05, + "loss": 4.4392, + "step": 19013 + }, + { + "epoch": 0.11308164430488153, + "grad_norm": 2.5040411949157715, + "learning_rate": 4.843909531505424e-05, + "loss": 4.221, + "step": 19014 + }, + { + "epoch": 0.11308759158816253, + "grad_norm": 2.15824556350708, + "learning_rate": 4.843893284787584e-05, + "loss": 4.8255, + "step": 19015 + }, + { + "epoch": 0.11309353887144352, + "grad_norm": 1.6300889253616333, + "learning_rate": 4.8438770372515155e-05, + "loss": 5.3668, + "step": 19016 + }, + { + "epoch": 0.11309948615472452, + "grad_norm": 1.745676875114441, + "learning_rate": 4.8438607888972245e-05, + "loss": 5.2858, + "step": 19017 + }, + { + "epoch": 0.11310543343800553, + "grad_norm": 1.6511434316635132, + "learning_rate": 4.8438445397247146e-05, + "loss": 5.2856, + "step": 19018 + }, + { + "epoch": 0.11311138072128651, + "grad_norm": 1.6282720565795898, + "learning_rate": 4.843828289733994e-05, + "loss": 5.7748, + "step": 19019 + }, + { + "epoch": 0.11311732800456752, + "grad_norm": 1.6303821802139282, + "learning_rate": 4.843812038925066e-05, + "loss": 5.3627, + "step": 19020 + }, + { + "epoch": 0.1131232752878485, + "grad_norm": 1.5684829950332642, + "learning_rate": 4.843795787297938e-05, + "loss": 5.6563, + "step": 19021 + }, + { + "epoch": 0.1131292225711295, + "grad_norm": 1.9084935188293457, + "learning_rate": 4.843779534852615e-05, + "loss": 5.7084, + "step": 19022 + }, + { + "epoch": 0.11313516985441051, + "grad_norm": 1.5176855325698853, + "learning_rate": 4.843763281589103e-05, + "loss": 5.7602, + "step": 19023 + }, + { + "epoch": 0.1131411171376915, + "grad_norm": 1.3877767324447632, + "learning_rate": 4.843747027507407e-05, + "loss": 5.4914, + "step": 19024 + }, + { + "epoch": 0.1131470644209725, + "grad_norm": 2.0801119804382324, + "learning_rate": 4.843730772607533e-05, + "loss": 4.8814, + "step": 19025 + }, + { + "epoch": 0.1131530117042535, + "grad_norm": 1.9673620462417603, + "learning_rate": 4.8437145168894874e-05, + "loss": 4.9423, + "step": 19026 + }, + { + "epoch": 0.11315895898753449, + "grad_norm": 1.5284085273742676, + "learning_rate": 4.8436982603532755e-05, + "loss": 5.0471, + "step": 19027 + }, + { + "epoch": 0.11316490627081549, + "grad_norm": 1.870762825012207, + "learning_rate": 4.8436820029989023e-05, + "loss": 4.9376, + "step": 19028 + }, + { + "epoch": 0.11317085355409649, + "grad_norm": 1.9094692468643188, + "learning_rate": 4.843665744826374e-05, + "loss": 4.8677, + "step": 19029 + }, + { + "epoch": 0.11317680083737748, + "grad_norm": 1.6463623046875, + "learning_rate": 4.8436494858356964e-05, + "loss": 5.3397, + "step": 19030 + }, + { + "epoch": 0.11318274812065848, + "grad_norm": 1.8127562999725342, + "learning_rate": 4.8436332260268745e-05, + "loss": 5.1626, + "step": 19031 + }, + { + "epoch": 0.11318869540393948, + "grad_norm": 1.5196025371551514, + "learning_rate": 4.8436169653999144e-05, + "loss": 5.1213, + "step": 19032 + }, + { + "epoch": 0.11319464268722047, + "grad_norm": 1.8930630683898926, + "learning_rate": 4.843600703954823e-05, + "loss": 4.8268, + "step": 19033 + }, + { + "epoch": 0.11320058997050148, + "grad_norm": 2.1579136848449707, + "learning_rate": 4.843584441691603e-05, + "loss": 5.6111, + "step": 19034 + }, + { + "epoch": 0.11320653725378248, + "grad_norm": 1.7644915580749512, + "learning_rate": 4.8435681786102624e-05, + "loss": 5.5762, + "step": 19035 + }, + { + "epoch": 0.11321248453706347, + "grad_norm": 1.5442852973937988, + "learning_rate": 4.843551914710808e-05, + "loss": 5.6486, + "step": 19036 + }, + { + "epoch": 0.11321843182034447, + "grad_norm": 1.823852777481079, + "learning_rate": 4.843535649993242e-05, + "loss": 5.6581, + "step": 19037 + }, + { + "epoch": 0.11322437910362547, + "grad_norm": 1.5850268602371216, + "learning_rate": 4.8435193844575726e-05, + "loss": 5.6351, + "step": 19038 + }, + { + "epoch": 0.11323032638690646, + "grad_norm": 1.6234556436538696, + "learning_rate": 4.843503118103805e-05, + "loss": 5.5462, + "step": 19039 + }, + { + "epoch": 0.11323627367018746, + "grad_norm": 1.602618932723999, + "learning_rate": 4.843486850931944e-05, + "loss": 5.2935, + "step": 19040 + }, + { + "epoch": 0.11324222095346846, + "grad_norm": 1.6808282136917114, + "learning_rate": 4.843470582941997e-05, + "loss": 5.2254, + "step": 19041 + }, + { + "epoch": 0.11324816823674945, + "grad_norm": 1.6311568021774292, + "learning_rate": 4.8434543141339674e-05, + "loss": 5.1894, + "step": 19042 + }, + { + "epoch": 0.11325411552003045, + "grad_norm": 1.5836867094039917, + "learning_rate": 4.843438044507863e-05, + "loss": 5.6344, + "step": 19043 + }, + { + "epoch": 0.11326006280331145, + "grad_norm": 1.5654397010803223, + "learning_rate": 4.843421774063688e-05, + "loss": 5.2902, + "step": 19044 + }, + { + "epoch": 0.11326601008659244, + "grad_norm": 2.3957626819610596, + "learning_rate": 4.843405502801449e-05, + "loss": 4.812, + "step": 19045 + }, + { + "epoch": 0.11327195736987344, + "grad_norm": 2.123473644256592, + "learning_rate": 4.843389230721151e-05, + "loss": 4.6399, + "step": 19046 + }, + { + "epoch": 0.11327790465315445, + "grad_norm": 1.6691471338272095, + "learning_rate": 4.8433729578228007e-05, + "loss": 4.9337, + "step": 19047 + }, + { + "epoch": 0.11328385193643543, + "grad_norm": 1.6179373264312744, + "learning_rate": 4.8433566841064025e-05, + "loss": 5.1002, + "step": 19048 + }, + { + "epoch": 0.11328979921971644, + "grad_norm": 1.658995270729065, + "learning_rate": 4.843340409571963e-05, + "loss": 5.0397, + "step": 19049 + }, + { + "epoch": 0.11329574650299742, + "grad_norm": 2.0216362476348877, + "learning_rate": 4.843324134219488e-05, + "loss": 5.3112, + "step": 19050 + }, + { + "epoch": 0.11330169378627843, + "grad_norm": 2.0376546382904053, + "learning_rate": 4.843307858048982e-05, + "loss": 5.087, + "step": 19051 + }, + { + "epoch": 0.11330764106955943, + "grad_norm": 2.2038021087646484, + "learning_rate": 4.8432915810604516e-05, + "loss": 4.951, + "step": 19052 + }, + { + "epoch": 0.11331358835284042, + "grad_norm": 1.8985834121704102, + "learning_rate": 4.843275303253903e-05, + "loss": 5.522, + "step": 19053 + }, + { + "epoch": 0.11331953563612142, + "grad_norm": 1.9047077894210815, + "learning_rate": 4.8432590246293404e-05, + "loss": 5.8387, + "step": 19054 + }, + { + "epoch": 0.11332548291940242, + "grad_norm": 1.508352279663086, + "learning_rate": 4.8432427451867704e-05, + "loss": 5.7969, + "step": 19055 + }, + { + "epoch": 0.11333143020268341, + "grad_norm": 1.631695032119751, + "learning_rate": 4.8432264649261984e-05, + "loss": 5.3562, + "step": 19056 + }, + { + "epoch": 0.11333737748596441, + "grad_norm": 1.673411250114441, + "learning_rate": 4.8432101838476305e-05, + "loss": 5.3286, + "step": 19057 + }, + { + "epoch": 0.11334332476924541, + "grad_norm": 2.697946071624756, + "learning_rate": 4.843193901951072e-05, + "loss": 5.0634, + "step": 19058 + }, + { + "epoch": 0.1133492720525264, + "grad_norm": 2.5914673805236816, + "learning_rate": 4.843177619236529e-05, + "loss": 4.8294, + "step": 19059 + }, + { + "epoch": 0.1133552193358074, + "grad_norm": 1.8503727912902832, + "learning_rate": 4.843161335704007e-05, + "loss": 5.1436, + "step": 19060 + }, + { + "epoch": 0.1133611666190884, + "grad_norm": 1.7629435062408447, + "learning_rate": 4.843145051353511e-05, + "loss": 5.1822, + "step": 19061 + }, + { + "epoch": 0.11336711390236939, + "grad_norm": 1.826360821723938, + "learning_rate": 4.843128766185048e-05, + "loss": 5.5151, + "step": 19062 + }, + { + "epoch": 0.1133730611856504, + "grad_norm": 2.0347046852111816, + "learning_rate": 4.843112480198623e-05, + "loss": 4.7732, + "step": 19063 + }, + { + "epoch": 0.1133790084689314, + "grad_norm": 2.037482738494873, + "learning_rate": 4.843096193394241e-05, + "loss": 4.6475, + "step": 19064 + }, + { + "epoch": 0.11338495575221239, + "grad_norm": 2.1152050495147705, + "learning_rate": 4.8430799057719076e-05, + "loss": 4.531, + "step": 19065 + }, + { + "epoch": 0.11339090303549339, + "grad_norm": 2.303982734680176, + "learning_rate": 4.8430636173316306e-05, + "loss": 4.6317, + "step": 19066 + }, + { + "epoch": 0.11339685031877439, + "grad_norm": 2.3326570987701416, + "learning_rate": 4.843047328073414e-05, + "loss": 4.736, + "step": 19067 + }, + { + "epoch": 0.11340279760205538, + "grad_norm": 2.371316909790039, + "learning_rate": 4.8430310379972634e-05, + "loss": 4.806, + "step": 19068 + }, + { + "epoch": 0.11340874488533638, + "grad_norm": 2.5370912551879883, + "learning_rate": 4.8430147471031855e-05, + "loss": 4.7867, + "step": 19069 + }, + { + "epoch": 0.11341469216861738, + "grad_norm": 2.456982135772705, + "learning_rate": 4.842998455391185e-05, + "loss": 4.6942, + "step": 19070 + }, + { + "epoch": 0.11342063945189837, + "grad_norm": 2.526287078857422, + "learning_rate": 4.842982162861268e-05, + "loss": 4.7333, + "step": 19071 + }, + { + "epoch": 0.11342658673517937, + "grad_norm": 2.2763514518737793, + "learning_rate": 4.84296586951344e-05, + "loss": 4.712, + "step": 19072 + }, + { + "epoch": 0.11343253401846037, + "grad_norm": 2.330958366394043, + "learning_rate": 4.842949575347707e-05, + "loss": 4.5875, + "step": 19073 + }, + { + "epoch": 0.11343848130174136, + "grad_norm": 2.390018939971924, + "learning_rate": 4.8429332803640745e-05, + "loss": 4.6941, + "step": 19074 + }, + { + "epoch": 0.11344442858502236, + "grad_norm": 2.279719829559326, + "learning_rate": 4.842916984562548e-05, + "loss": 4.6216, + "step": 19075 + }, + { + "epoch": 0.11345037586830337, + "grad_norm": 2.2815043926239014, + "learning_rate": 4.842900687943133e-05, + "loss": 4.5667, + "step": 19076 + }, + { + "epoch": 0.11345632315158435, + "grad_norm": 2.301231861114502, + "learning_rate": 4.842884390505836e-05, + "loss": 4.5451, + "step": 19077 + }, + { + "epoch": 0.11346227043486536, + "grad_norm": 2.1763200759887695, + "learning_rate": 4.842868092250662e-05, + "loss": 4.5937, + "step": 19078 + }, + { + "epoch": 0.11346821771814634, + "grad_norm": 2.2151448726654053, + "learning_rate": 4.842851793177618e-05, + "loss": 4.8341, + "step": 19079 + }, + { + "epoch": 0.11347416500142735, + "grad_norm": 2.3094639778137207, + "learning_rate": 4.8428354932867085e-05, + "loss": 4.7308, + "step": 19080 + }, + { + "epoch": 0.11348011228470835, + "grad_norm": 1.5218987464904785, + "learning_rate": 4.8428191925779385e-05, + "loss": 5.2701, + "step": 19081 + }, + { + "epoch": 0.11348605956798934, + "grad_norm": 1.3781639337539673, + "learning_rate": 4.842802891051315e-05, + "loss": 5.6873, + "step": 19082 + }, + { + "epoch": 0.11349200685127034, + "grad_norm": 1.814702033996582, + "learning_rate": 4.842786588706842e-05, + "loss": 5.7713, + "step": 19083 + }, + { + "epoch": 0.11349795413455134, + "grad_norm": 1.5691754817962646, + "learning_rate": 4.842770285544528e-05, + "loss": 5.7115, + "step": 19084 + }, + { + "epoch": 0.11350390141783233, + "grad_norm": 1.962762713432312, + "learning_rate": 4.8427539815643766e-05, + "loss": 5.4159, + "step": 19085 + }, + { + "epoch": 0.11350984870111333, + "grad_norm": 1.6766527891159058, + "learning_rate": 4.842737676766393e-05, + "loss": 5.6007, + "step": 19086 + }, + { + "epoch": 0.11351579598439433, + "grad_norm": 1.782934308052063, + "learning_rate": 4.8427213711505844e-05, + "loss": 5.982, + "step": 19087 + }, + { + "epoch": 0.11352174326767532, + "grad_norm": 1.5706422328948975, + "learning_rate": 4.842705064716957e-05, + "loss": 5.5125, + "step": 19088 + }, + { + "epoch": 0.11352769055095632, + "grad_norm": 2.4957141876220703, + "learning_rate": 4.842688757465515e-05, + "loss": 4.5386, + "step": 19089 + }, + { + "epoch": 0.11353363783423732, + "grad_norm": 2.1444833278656006, + "learning_rate": 4.842672449396264e-05, + "loss": 4.6108, + "step": 19090 + }, + { + "epoch": 0.11353958511751831, + "grad_norm": 2.4586305618286133, + "learning_rate": 4.8426561405092106e-05, + "loss": 4.7453, + "step": 19091 + }, + { + "epoch": 0.11354553240079931, + "grad_norm": 2.228759765625, + "learning_rate": 4.8426398308043605e-05, + "loss": 4.662, + "step": 19092 + }, + { + "epoch": 0.11355147968408032, + "grad_norm": 2.029172420501709, + "learning_rate": 4.8426235202817184e-05, + "loss": 4.6389, + "step": 19093 + }, + { + "epoch": 0.1135574269673613, + "grad_norm": 2.1887340545654297, + "learning_rate": 4.842607208941291e-05, + "loss": 4.6852, + "step": 19094 + }, + { + "epoch": 0.11356337425064231, + "grad_norm": 1.7664849758148193, + "learning_rate": 4.842590896783084e-05, + "loss": 5.2435, + "step": 19095 + }, + { + "epoch": 0.11356932153392331, + "grad_norm": 1.5581247806549072, + "learning_rate": 4.8425745838071016e-05, + "loss": 5.6828, + "step": 19096 + }, + { + "epoch": 0.1135752688172043, + "grad_norm": 1.570602297782898, + "learning_rate": 4.842558270013352e-05, + "loss": 5.7011, + "step": 19097 + }, + { + "epoch": 0.1135812161004853, + "grad_norm": 1.4669830799102783, + "learning_rate": 4.842541955401838e-05, + "loss": 5.4361, + "step": 19098 + }, + { + "epoch": 0.1135871633837663, + "grad_norm": 1.199173927307129, + "learning_rate": 4.842525639972568e-05, + "loss": 5.5198, + "step": 19099 + }, + { + "epoch": 0.11359311066704729, + "grad_norm": 1.1747777462005615, + "learning_rate": 4.842509323725546e-05, + "loss": 5.6252, + "step": 19100 + }, + { + "epoch": 0.11359905795032829, + "grad_norm": 1.4497981071472168, + "learning_rate": 4.8424930066607784e-05, + "loss": 5.4295, + "step": 19101 + }, + { + "epoch": 0.1136050052336093, + "grad_norm": 1.485688328742981, + "learning_rate": 4.8424766887782704e-05, + "loss": 5.1248, + "step": 19102 + }, + { + "epoch": 0.11361095251689028, + "grad_norm": 1.419149398803711, + "learning_rate": 4.842460370078028e-05, + "loss": 5.0604, + "step": 19103 + }, + { + "epoch": 0.11361689980017128, + "grad_norm": 1.622096300125122, + "learning_rate": 4.842444050560058e-05, + "loss": 5.4429, + "step": 19104 + }, + { + "epoch": 0.11362284708345229, + "grad_norm": 1.2471072673797607, + "learning_rate": 4.8424277302243636e-05, + "loss": 5.3636, + "step": 19105 + }, + { + "epoch": 0.11362879436673327, + "grad_norm": 1.3416316509246826, + "learning_rate": 4.842411409070952e-05, + "loss": 5.1415, + "step": 19106 + }, + { + "epoch": 0.11363474165001428, + "grad_norm": 1.3691420555114746, + "learning_rate": 4.8423950870998293e-05, + "loss": 5.3286, + "step": 19107 + }, + { + "epoch": 0.11364068893329526, + "grad_norm": 1.2382487058639526, + "learning_rate": 4.842378764311e-05, + "loss": 5.4391, + "step": 19108 + }, + { + "epoch": 0.11364663621657627, + "grad_norm": 1.1729276180267334, + "learning_rate": 4.842362440704471e-05, + "loss": 5.4158, + "step": 19109 + }, + { + "epoch": 0.11365258349985727, + "grad_norm": 1.2451897859573364, + "learning_rate": 4.842346116280247e-05, + "loss": 5.2487, + "step": 19110 + }, + { + "epoch": 0.11365853078313826, + "grad_norm": 1.255652666091919, + "learning_rate": 4.8423297910383354e-05, + "loss": 5.2759, + "step": 19111 + }, + { + "epoch": 0.11366447806641926, + "grad_norm": 1.170296549797058, + "learning_rate": 4.8423134649787394e-05, + "loss": 5.1508, + "step": 19112 + }, + { + "epoch": 0.11367042534970026, + "grad_norm": 1.3954061269760132, + "learning_rate": 4.842297138101467e-05, + "loss": 5.3102, + "step": 19113 + }, + { + "epoch": 0.11367637263298125, + "grad_norm": 1.2746593952178955, + "learning_rate": 4.842280810406522e-05, + "loss": 5.2587, + "step": 19114 + }, + { + "epoch": 0.11368231991626225, + "grad_norm": 1.3224173784255981, + "learning_rate": 4.8422644818939114e-05, + "loss": 5.1927, + "step": 19115 + }, + { + "epoch": 0.11368826719954325, + "grad_norm": 1.0930812358856201, + "learning_rate": 4.84224815256364e-05, + "loss": 5.1676, + "step": 19116 + }, + { + "epoch": 0.11369421448282424, + "grad_norm": 1.3805547952651978, + "learning_rate": 4.842231822415715e-05, + "loss": 5.066, + "step": 19117 + }, + { + "epoch": 0.11370016176610524, + "grad_norm": 1.3455450534820557, + "learning_rate": 4.84221549145014e-05, + "loss": 4.9656, + "step": 19118 + }, + { + "epoch": 0.11370610904938624, + "grad_norm": 1.442218542098999, + "learning_rate": 4.842199159666922e-05, + "loss": 4.9094, + "step": 19119 + }, + { + "epoch": 0.11371205633266723, + "grad_norm": 1.435941457748413, + "learning_rate": 4.8421828270660665e-05, + "loss": 5.1035, + "step": 19120 + }, + { + "epoch": 0.11371800361594823, + "grad_norm": 1.2507586479187012, + "learning_rate": 4.84216649364758e-05, + "loss": 5.2395, + "step": 19121 + }, + { + "epoch": 0.11372395089922924, + "grad_norm": 1.3616739511489868, + "learning_rate": 4.842150159411466e-05, + "loss": 5.2082, + "step": 19122 + }, + { + "epoch": 0.11372989818251023, + "grad_norm": 1.2988322973251343, + "learning_rate": 4.842133824357732e-05, + "loss": 5.1271, + "step": 19123 + }, + { + "epoch": 0.11373584546579123, + "grad_norm": 1.2761636972427368, + "learning_rate": 4.842117488486384e-05, + "loss": 5.1724, + "step": 19124 + }, + { + "epoch": 0.11374179274907223, + "grad_norm": 1.2834585905075073, + "learning_rate": 4.842101151797426e-05, + "loss": 5.2256, + "step": 19125 + }, + { + "epoch": 0.11374774003235322, + "grad_norm": 1.2074506282806396, + "learning_rate": 4.8420848142908655e-05, + "loss": 5.2704, + "step": 19126 + }, + { + "epoch": 0.11375368731563422, + "grad_norm": 1.355292797088623, + "learning_rate": 4.842068475966707e-05, + "loss": 5.1109, + "step": 19127 + }, + { + "epoch": 0.11375963459891522, + "grad_norm": 1.1144691705703735, + "learning_rate": 4.8420521368249565e-05, + "loss": 5.0903, + "step": 19128 + }, + { + "epoch": 0.11376558188219621, + "grad_norm": 1.3889878988265991, + "learning_rate": 4.84203579686562e-05, + "loss": 5.1289, + "step": 19129 + }, + { + "epoch": 0.11377152916547721, + "grad_norm": 1.1302597522735596, + "learning_rate": 4.8420194560887035e-05, + "loss": 4.9211, + "step": 19130 + }, + { + "epoch": 0.11377747644875821, + "grad_norm": 1.1715654134750366, + "learning_rate": 4.8420031144942115e-05, + "loss": 5.2239, + "step": 19131 + }, + { + "epoch": 0.1137834237320392, + "grad_norm": 1.327021837234497, + "learning_rate": 4.84198677208215e-05, + "loss": 5.2941, + "step": 19132 + }, + { + "epoch": 0.1137893710153202, + "grad_norm": 1.3442116975784302, + "learning_rate": 4.841970428852526e-05, + "loss": 5.1752, + "step": 19133 + }, + { + "epoch": 0.1137953182986012, + "grad_norm": 1.207207202911377, + "learning_rate": 4.841954084805344e-05, + "loss": 4.9607, + "step": 19134 + }, + { + "epoch": 0.1138012655818822, + "grad_norm": 1.1609065532684326, + "learning_rate": 4.8419377399406104e-05, + "loss": 5.0458, + "step": 19135 + }, + { + "epoch": 0.1138072128651632, + "grad_norm": 1.365605115890503, + "learning_rate": 4.84192139425833e-05, + "loss": 5.0884, + "step": 19136 + }, + { + "epoch": 0.11381316014844418, + "grad_norm": 1.5192269086837769, + "learning_rate": 4.8419050477585096e-05, + "loss": 5.4803, + "step": 19137 + }, + { + "epoch": 0.11381910743172519, + "grad_norm": 1.187456488609314, + "learning_rate": 4.841888700441153e-05, + "loss": 5.4595, + "step": 19138 + }, + { + "epoch": 0.11382505471500619, + "grad_norm": 1.1836395263671875, + "learning_rate": 4.841872352306268e-05, + "loss": 5.27, + "step": 19139 + }, + { + "epoch": 0.11383100199828718, + "grad_norm": 1.353762149810791, + "learning_rate": 4.841856003353861e-05, + "loss": 5.4646, + "step": 19140 + }, + { + "epoch": 0.11383694928156818, + "grad_norm": 1.4854416847229004, + "learning_rate": 4.8418396535839344e-05, + "loss": 5.2894, + "step": 19141 + }, + { + "epoch": 0.11384289656484918, + "grad_norm": 1.3731143474578857, + "learning_rate": 4.841823302996496e-05, + "loss": 4.7512, + "step": 19142 + }, + { + "epoch": 0.11384884384813017, + "grad_norm": 1.3945658206939697, + "learning_rate": 4.841806951591552e-05, + "loss": 4.9625, + "step": 19143 + }, + { + "epoch": 0.11385479113141117, + "grad_norm": 1.2692869901657104, + "learning_rate": 4.841790599369107e-05, + "loss": 5.2245, + "step": 19144 + }, + { + "epoch": 0.11386073841469217, + "grad_norm": 1.3667423725128174, + "learning_rate": 4.8417742463291674e-05, + "loss": 5.202, + "step": 19145 + }, + { + "epoch": 0.11386668569797316, + "grad_norm": 1.2639939785003662, + "learning_rate": 4.8417578924717377e-05, + "loss": 5.4378, + "step": 19146 + }, + { + "epoch": 0.11387263298125416, + "grad_norm": 1.327867865562439, + "learning_rate": 4.8417415377968255e-05, + "loss": 5.1632, + "step": 19147 + }, + { + "epoch": 0.11387858026453516, + "grad_norm": 1.2095093727111816, + "learning_rate": 4.841725182304435e-05, + "loss": 4.9969, + "step": 19148 + }, + { + "epoch": 0.11388452754781615, + "grad_norm": 1.3395425081253052, + "learning_rate": 4.841708825994573e-05, + "loss": 5.1797, + "step": 19149 + }, + { + "epoch": 0.11389047483109715, + "grad_norm": 1.4817496538162231, + "learning_rate": 4.841692468867244e-05, + "loss": 5.1126, + "step": 19150 + }, + { + "epoch": 0.11389642211437816, + "grad_norm": 1.3066308498382568, + "learning_rate": 4.8416761109224547e-05, + "loss": 5.2692, + "step": 19151 + }, + { + "epoch": 0.11390236939765915, + "grad_norm": 1.444701075553894, + "learning_rate": 4.84165975216021e-05, + "loss": 5.0525, + "step": 19152 + }, + { + "epoch": 0.11390831668094015, + "grad_norm": 1.2720032930374146, + "learning_rate": 4.8416433925805165e-05, + "loss": 5.138, + "step": 19153 + }, + { + "epoch": 0.11391426396422115, + "grad_norm": 1.2228437662124634, + "learning_rate": 4.84162703218338e-05, + "loss": 5.028, + "step": 19154 + }, + { + "epoch": 0.11392021124750214, + "grad_norm": 1.1950013637542725, + "learning_rate": 4.841610670968805e-05, + "loss": 5.0873, + "step": 19155 + }, + { + "epoch": 0.11392615853078314, + "grad_norm": 1.3538236618041992, + "learning_rate": 4.8415943089367976e-05, + "loss": 5.0039, + "step": 19156 + }, + { + "epoch": 0.11393210581406414, + "grad_norm": 1.3344488143920898, + "learning_rate": 4.841577946087364e-05, + "loss": 5.0215, + "step": 19157 + }, + { + "epoch": 0.11393805309734513, + "grad_norm": 1.7098866701126099, + "learning_rate": 4.841561582420511e-05, + "loss": 5.5719, + "step": 19158 + }, + { + "epoch": 0.11394400038062613, + "grad_norm": 1.3574185371398926, + "learning_rate": 4.841545217936241e-05, + "loss": 4.8491, + "step": 19159 + }, + { + "epoch": 0.11394994766390713, + "grad_norm": 1.447292447090149, + "learning_rate": 4.8415288526345634e-05, + "loss": 4.8632, + "step": 19160 + }, + { + "epoch": 0.11395589494718812, + "grad_norm": 1.6439673900604248, + "learning_rate": 4.841512486515481e-05, + "loss": 5.282, + "step": 19161 + }, + { + "epoch": 0.11396184223046912, + "grad_norm": 1.3063132762908936, + "learning_rate": 4.841496119579002e-05, + "loss": 5.0399, + "step": 19162 + }, + { + "epoch": 0.11396778951375013, + "grad_norm": 1.4244173765182495, + "learning_rate": 4.8414797518251296e-05, + "loss": 4.7731, + "step": 19163 + }, + { + "epoch": 0.11397373679703111, + "grad_norm": 1.225203514099121, + "learning_rate": 4.841463383253872e-05, + "loss": 4.8294, + "step": 19164 + }, + { + "epoch": 0.11397968408031212, + "grad_norm": 1.2978007793426514, + "learning_rate": 4.8414470138652334e-05, + "loss": 4.6336, + "step": 19165 + }, + { + "epoch": 0.1139856313635931, + "grad_norm": 1.306591272354126, + "learning_rate": 4.8414306436592194e-05, + "loss": 4.8267, + "step": 19166 + }, + { + "epoch": 0.1139915786468741, + "grad_norm": 1.1227960586547852, + "learning_rate": 4.841414272635837e-05, + "loss": 4.7438, + "step": 19167 + }, + { + "epoch": 0.11399752593015511, + "grad_norm": 1.3674911260604858, + "learning_rate": 4.8413979007950905e-05, + "loss": 4.8127, + "step": 19168 + }, + { + "epoch": 0.1140034732134361, + "grad_norm": 1.3923397064208984, + "learning_rate": 4.841381528136986e-05, + "loss": 5.1568, + "step": 19169 + }, + { + "epoch": 0.1140094204967171, + "grad_norm": 1.2014738321304321, + "learning_rate": 4.84136515466153e-05, + "loss": 5.0116, + "step": 19170 + }, + { + "epoch": 0.1140153677799981, + "grad_norm": 1.3564008474349976, + "learning_rate": 4.841348780368726e-05, + "loss": 5.1181, + "step": 19171 + }, + { + "epoch": 0.11402131506327909, + "grad_norm": 1.1918834447860718, + "learning_rate": 4.841332405258583e-05, + "loss": 5.0854, + "step": 19172 + }, + { + "epoch": 0.11402726234656009, + "grad_norm": 1.2056841850280762, + "learning_rate": 4.8413160293311047e-05, + "loss": 4.825, + "step": 19173 + }, + { + "epoch": 0.11403320962984109, + "grad_norm": 1.3841508626937866, + "learning_rate": 4.841299652586298e-05, + "loss": 4.7543, + "step": 19174 + }, + { + "epoch": 0.11403915691312208, + "grad_norm": 1.511307716369629, + "learning_rate": 4.841283275024166e-05, + "loss": 4.9821, + "step": 19175 + }, + { + "epoch": 0.11404510419640308, + "grad_norm": 1.2577831745147705, + "learning_rate": 4.8412668966447175e-05, + "loss": 5.0138, + "step": 19176 + }, + { + "epoch": 0.11405105147968408, + "grad_norm": 1.442159652709961, + "learning_rate": 4.841250517447956e-05, + "loss": 5.0066, + "step": 19177 + }, + { + "epoch": 0.11405699876296507, + "grad_norm": 1.3029484748840332, + "learning_rate": 4.841234137433889e-05, + "loss": 4.9229, + "step": 19178 + }, + { + "epoch": 0.11406294604624607, + "grad_norm": 1.3138917684555054, + "learning_rate": 4.841217756602521e-05, + "loss": 4.6262, + "step": 19179 + }, + { + "epoch": 0.11406889332952708, + "grad_norm": 1.2164885997772217, + "learning_rate": 4.841201374953857e-05, + "loss": 4.7952, + "step": 19180 + }, + { + "epoch": 0.11407484061280806, + "grad_norm": 1.4247347116470337, + "learning_rate": 4.8411849924879046e-05, + "loss": 5.0066, + "step": 19181 + }, + { + "epoch": 0.11408078789608907, + "grad_norm": 1.236006736755371, + "learning_rate": 4.8411686092046695e-05, + "loss": 4.6585, + "step": 19182 + }, + { + "epoch": 0.11408673517937007, + "grad_norm": 1.2381118535995483, + "learning_rate": 4.841152225104156e-05, + "loss": 5.0935, + "step": 19183 + }, + { + "epoch": 0.11409268246265106, + "grad_norm": 1.3557883501052856, + "learning_rate": 4.84113584018637e-05, + "loss": 5.1536, + "step": 19184 + }, + { + "epoch": 0.11409862974593206, + "grad_norm": 1.3191505670547485, + "learning_rate": 4.8411194544513184e-05, + "loss": 5.2857, + "step": 19185 + }, + { + "epoch": 0.11410457702921306, + "grad_norm": 1.2058855295181274, + "learning_rate": 4.841103067899006e-05, + "loss": 5.142, + "step": 19186 + }, + { + "epoch": 0.11411052431249405, + "grad_norm": 1.163136601448059, + "learning_rate": 4.8410866805294384e-05, + "loss": 5.1891, + "step": 19187 + }, + { + "epoch": 0.11411647159577505, + "grad_norm": 1.3245770931243896, + "learning_rate": 4.841070292342622e-05, + "loss": 5.0629, + "step": 19188 + }, + { + "epoch": 0.11412241887905605, + "grad_norm": 1.13837730884552, + "learning_rate": 4.841053903338562e-05, + "loss": 5.1045, + "step": 19189 + }, + { + "epoch": 0.11412836616233704, + "grad_norm": 1.4724907875061035, + "learning_rate": 4.8410375135172646e-05, + "loss": 5.01, + "step": 19190 + }, + { + "epoch": 0.11413431344561804, + "grad_norm": 1.3786016702651978, + "learning_rate": 4.841021122878735e-05, + "loss": 5.0188, + "step": 19191 + }, + { + "epoch": 0.11414026072889905, + "grad_norm": 1.2996101379394531, + "learning_rate": 4.841004731422979e-05, + "loss": 4.954, + "step": 19192 + }, + { + "epoch": 0.11414620801218003, + "grad_norm": 1.297892451286316, + "learning_rate": 4.840988339150002e-05, + "loss": 4.9841, + "step": 19193 + }, + { + "epoch": 0.11415215529546104, + "grad_norm": 1.3011624813079834, + "learning_rate": 4.84097194605981e-05, + "loss": 4.8547, + "step": 19194 + }, + { + "epoch": 0.11415810257874202, + "grad_norm": 1.2169194221496582, + "learning_rate": 4.8409555521524096e-05, + "loss": 4.8801, + "step": 19195 + }, + { + "epoch": 0.11416404986202303, + "grad_norm": 1.4189658164978027, + "learning_rate": 4.8409391574278065e-05, + "loss": 4.9521, + "step": 19196 + }, + { + "epoch": 0.11416999714530403, + "grad_norm": 1.4178590774536133, + "learning_rate": 4.840922761886004e-05, + "loss": 4.7847, + "step": 19197 + }, + { + "epoch": 0.11417594442858502, + "grad_norm": 1.395585536956787, + "learning_rate": 4.8409063655270105e-05, + "loss": 5.0404, + "step": 19198 + }, + { + "epoch": 0.11418189171186602, + "grad_norm": 1.4803121089935303, + "learning_rate": 4.840889968350831e-05, + "loss": 4.8851, + "step": 19199 + }, + { + "epoch": 0.11418783899514702, + "grad_norm": 1.4736177921295166, + "learning_rate": 4.84087357035747e-05, + "loss": 4.9127, + "step": 19200 + }, + { + "epoch": 0.11419378627842801, + "grad_norm": 1.2947148084640503, + "learning_rate": 4.8408571715469354e-05, + "loss": 4.9169, + "step": 19201 + }, + { + "epoch": 0.11419973356170901, + "grad_norm": 1.2428392171859741, + "learning_rate": 4.840840771919232e-05, + "loss": 5.2759, + "step": 19202 + }, + { + "epoch": 0.11420568084499001, + "grad_norm": 1.2743968963623047, + "learning_rate": 4.840824371474364e-05, + "loss": 5.2273, + "step": 19203 + }, + { + "epoch": 0.114211628128271, + "grad_norm": 1.3068950176239014, + "learning_rate": 4.840807970212339e-05, + "loss": 5.3455, + "step": 19204 + }, + { + "epoch": 0.114217575411552, + "grad_norm": 1.2238211631774902, + "learning_rate": 4.8407915681331614e-05, + "loss": 5.024, + "step": 19205 + }, + { + "epoch": 0.114223522694833, + "grad_norm": 1.1461126804351807, + "learning_rate": 4.8407751652368384e-05, + "loss": 5.2113, + "step": 19206 + }, + { + "epoch": 0.11422946997811399, + "grad_norm": 1.2286972999572754, + "learning_rate": 4.840758761523375e-05, + "loss": 5.006, + "step": 19207 + }, + { + "epoch": 0.114235417261395, + "grad_norm": 1.3054790496826172, + "learning_rate": 4.840742356992777e-05, + "loss": 5.0592, + "step": 19208 + }, + { + "epoch": 0.114241364544676, + "grad_norm": 1.2426046133041382, + "learning_rate": 4.84072595164505e-05, + "loss": 5.1058, + "step": 19209 + }, + { + "epoch": 0.11424731182795698, + "grad_norm": 1.325263261795044, + "learning_rate": 4.840709545480199e-05, + "loss": 5.0528, + "step": 19210 + }, + { + "epoch": 0.11425325911123799, + "grad_norm": 1.1753286123275757, + "learning_rate": 4.840693138498231e-05, + "loss": 5.2193, + "step": 19211 + }, + { + "epoch": 0.11425920639451899, + "grad_norm": 1.486204743385315, + "learning_rate": 4.8406767306991515e-05, + "loss": 5.0389, + "step": 19212 + }, + { + "epoch": 0.11426515367779998, + "grad_norm": 1.344887614250183, + "learning_rate": 4.8406603220829655e-05, + "loss": 5.0072, + "step": 19213 + }, + { + "epoch": 0.11427110096108098, + "grad_norm": 1.270340919494629, + "learning_rate": 4.840643912649679e-05, + "loss": 5.0154, + "step": 19214 + }, + { + "epoch": 0.11427704824436198, + "grad_norm": 1.390960454940796, + "learning_rate": 4.8406275023992983e-05, + "loss": 5.0803, + "step": 19215 + }, + { + "epoch": 0.11428299552764297, + "grad_norm": 1.2927583456039429, + "learning_rate": 4.8406110913318294e-05, + "loss": 5.04, + "step": 19216 + }, + { + "epoch": 0.11428894281092397, + "grad_norm": 1.3101180791854858, + "learning_rate": 4.840594679447275e-05, + "loss": 4.9988, + "step": 19217 + }, + { + "epoch": 0.11429489009420497, + "grad_norm": 1.2187588214874268, + "learning_rate": 4.8405782667456454e-05, + "loss": 5.1006, + "step": 19218 + }, + { + "epoch": 0.11430083737748596, + "grad_norm": 1.3578346967697144, + "learning_rate": 4.840561853226944e-05, + "loss": 5.0528, + "step": 19219 + }, + { + "epoch": 0.11430678466076696, + "grad_norm": 1.8960474729537964, + "learning_rate": 4.840545438891176e-05, + "loss": 5.323, + "step": 19220 + }, + { + "epoch": 0.11431273194404797, + "grad_norm": 1.3410239219665527, + "learning_rate": 4.840529023738348e-05, + "loss": 5.1488, + "step": 19221 + }, + { + "epoch": 0.11431867922732895, + "grad_norm": 1.381373405456543, + "learning_rate": 4.840512607768465e-05, + "loss": 5.1477, + "step": 19222 + }, + { + "epoch": 0.11432462651060996, + "grad_norm": 1.4095546007156372, + "learning_rate": 4.8404961909815336e-05, + "loss": 5.1515, + "step": 19223 + }, + { + "epoch": 0.11433057379389094, + "grad_norm": 1.254451870918274, + "learning_rate": 4.840479773377559e-05, + "loss": 5.1276, + "step": 19224 + }, + { + "epoch": 0.11433652107717195, + "grad_norm": 1.3001519441604614, + "learning_rate": 4.840463354956548e-05, + "loss": 5.1561, + "step": 19225 + }, + { + "epoch": 0.11434246836045295, + "grad_norm": 1.231469750404358, + "learning_rate": 4.840446935718505e-05, + "loss": 4.963, + "step": 19226 + }, + { + "epoch": 0.11434841564373394, + "grad_norm": 1.323225736618042, + "learning_rate": 4.840430515663435e-05, + "loss": 5.0998, + "step": 19227 + }, + { + "epoch": 0.11435436292701494, + "grad_norm": 1.2244281768798828, + "learning_rate": 4.8404140947913456e-05, + "loss": 5.0727, + "step": 19228 + }, + { + "epoch": 0.11436031021029594, + "grad_norm": 1.2634974718093872, + "learning_rate": 4.840397673102242e-05, + "loss": 5.2049, + "step": 19229 + }, + { + "epoch": 0.11436625749357693, + "grad_norm": 1.5431766510009766, + "learning_rate": 4.84038125059613e-05, + "loss": 5.1387, + "step": 19230 + }, + { + "epoch": 0.11437220477685793, + "grad_norm": 1.485696792602539, + "learning_rate": 4.8403648272730145e-05, + "loss": 4.7971, + "step": 19231 + }, + { + "epoch": 0.11437815206013893, + "grad_norm": 1.4774583578109741, + "learning_rate": 4.840348403132902e-05, + "loss": 4.8967, + "step": 19232 + }, + { + "epoch": 0.11438409934341992, + "grad_norm": 1.1903584003448486, + "learning_rate": 4.840331978175798e-05, + "loss": 4.8827, + "step": 19233 + }, + { + "epoch": 0.11439004662670092, + "grad_norm": 1.3851109743118286, + "learning_rate": 4.840315552401708e-05, + "loss": 4.8348, + "step": 19234 + }, + { + "epoch": 0.11439599390998192, + "grad_norm": 1.3834025859832764, + "learning_rate": 4.840299125810639e-05, + "loss": 4.9392, + "step": 19235 + }, + { + "epoch": 0.11440194119326291, + "grad_norm": 1.2576985359191895, + "learning_rate": 4.840282698402595e-05, + "loss": 4.9092, + "step": 19236 + }, + { + "epoch": 0.11440788847654391, + "grad_norm": 1.2408863306045532, + "learning_rate": 4.840266270177583e-05, + "loss": 4.9041, + "step": 19237 + }, + { + "epoch": 0.11441383575982492, + "grad_norm": 1.4397286176681519, + "learning_rate": 4.840249841135608e-05, + "loss": 4.9588, + "step": 19238 + }, + { + "epoch": 0.1144197830431059, + "grad_norm": 1.3446424007415771, + "learning_rate": 4.840233411276676e-05, + "loss": 4.9757, + "step": 19239 + }, + { + "epoch": 0.1144257303263869, + "grad_norm": 1.2520800828933716, + "learning_rate": 4.840216980600793e-05, + "loss": 4.9746, + "step": 19240 + }, + { + "epoch": 0.11443167760966791, + "grad_norm": 1.2509692907333374, + "learning_rate": 4.840200549107963e-05, + "loss": 5.063, + "step": 19241 + }, + { + "epoch": 0.1144376248929489, + "grad_norm": 1.3295235633850098, + "learning_rate": 4.840184116798194e-05, + "loss": 5.02, + "step": 19242 + }, + { + "epoch": 0.1144435721762299, + "grad_norm": 1.3346072435379028, + "learning_rate": 4.8401676836714916e-05, + "loss": 5.0393, + "step": 19243 + }, + { + "epoch": 0.1144495194595109, + "grad_norm": 1.6711392402648926, + "learning_rate": 4.84015124972786e-05, + "loss": 5.0856, + "step": 19244 + }, + { + "epoch": 0.11445546674279189, + "grad_norm": 1.2785863876342773, + "learning_rate": 4.8401348149673065e-05, + "loss": 5.1181, + "step": 19245 + }, + { + "epoch": 0.11446141402607289, + "grad_norm": 1.4998282194137573, + "learning_rate": 4.8401183793898354e-05, + "loss": 5.0101, + "step": 19246 + }, + { + "epoch": 0.1144673613093539, + "grad_norm": 1.4768141508102417, + "learning_rate": 4.840101942995454e-05, + "loss": 4.8256, + "step": 19247 + }, + { + "epoch": 0.11447330859263488, + "grad_norm": 1.3829854726791382, + "learning_rate": 4.840085505784167e-05, + "loss": 4.8298, + "step": 19248 + }, + { + "epoch": 0.11447925587591588, + "grad_norm": 1.2079180479049683, + "learning_rate": 4.840069067755979e-05, + "loss": 4.9054, + "step": 19249 + }, + { + "epoch": 0.11448520315919689, + "grad_norm": 1.464245080947876, + "learning_rate": 4.8400526289108984e-05, + "loss": 4.8943, + "step": 19250 + }, + { + "epoch": 0.11449115044247787, + "grad_norm": 1.400992512702942, + "learning_rate": 4.840036189248929e-05, + "loss": 4.754, + "step": 19251 + }, + { + "epoch": 0.11449709772575888, + "grad_norm": 1.41909921169281, + "learning_rate": 4.840019748770077e-05, + "loss": 4.9179, + "step": 19252 + }, + { + "epoch": 0.11450304500903986, + "grad_norm": 1.3990073204040527, + "learning_rate": 4.840003307474349e-05, + "loss": 4.7989, + "step": 19253 + }, + { + "epoch": 0.11450899229232087, + "grad_norm": 1.2858465909957886, + "learning_rate": 4.8399868653617497e-05, + "loss": 4.7556, + "step": 19254 + }, + { + "epoch": 0.11451493957560187, + "grad_norm": 1.2721470594406128, + "learning_rate": 4.8399704224322854e-05, + "loss": 4.8441, + "step": 19255 + }, + { + "epoch": 0.11452088685888286, + "grad_norm": 1.2352218627929688, + "learning_rate": 4.839953978685961e-05, + "loss": 4.753, + "step": 19256 + }, + { + "epoch": 0.11452683414216386, + "grad_norm": 1.3000402450561523, + "learning_rate": 4.8399375341227834e-05, + "loss": 4.7634, + "step": 19257 + }, + { + "epoch": 0.11453278142544486, + "grad_norm": 1.2934285402297974, + "learning_rate": 4.839921088742757e-05, + "loss": 4.8047, + "step": 19258 + }, + { + "epoch": 0.11453872870872585, + "grad_norm": 1.5773643255233765, + "learning_rate": 4.839904642545889e-05, + "loss": 4.8588, + "step": 19259 + }, + { + "epoch": 0.11454467599200685, + "grad_norm": 1.3872511386871338, + "learning_rate": 4.8398881955321844e-05, + "loss": 5.0781, + "step": 19260 + }, + { + "epoch": 0.11455062327528785, + "grad_norm": 1.403011679649353, + "learning_rate": 4.839871747701649e-05, + "loss": 5.1375, + "step": 19261 + }, + { + "epoch": 0.11455657055856884, + "grad_norm": 1.2086342573165894, + "learning_rate": 4.839855299054289e-05, + "loss": 5.1052, + "step": 19262 + }, + { + "epoch": 0.11456251784184984, + "grad_norm": 1.3916890621185303, + "learning_rate": 4.8398388495901085e-05, + "loss": 5.0687, + "step": 19263 + }, + { + "epoch": 0.11456846512513084, + "grad_norm": 1.4591625928878784, + "learning_rate": 4.839822399309115e-05, + "loss": 5.0098, + "step": 19264 + }, + { + "epoch": 0.11457441240841183, + "grad_norm": 1.3421653509140015, + "learning_rate": 4.839805948211314e-05, + "loss": 4.9511, + "step": 19265 + }, + { + "epoch": 0.11458035969169283, + "grad_norm": 1.3959892988204956, + "learning_rate": 4.83978949629671e-05, + "loss": 5.0206, + "step": 19266 + }, + { + "epoch": 0.11458630697497384, + "grad_norm": 1.3058884143829346, + "learning_rate": 4.839773043565311e-05, + "loss": 5.0885, + "step": 19267 + }, + { + "epoch": 0.11459225425825482, + "grad_norm": 1.452760100364685, + "learning_rate": 4.839756590017121e-05, + "loss": 4.9945, + "step": 19268 + }, + { + "epoch": 0.11459820154153583, + "grad_norm": 1.4445050954818726, + "learning_rate": 4.8397401356521454e-05, + "loss": 4.8128, + "step": 19269 + }, + { + "epoch": 0.11460414882481683, + "grad_norm": 1.2491203546524048, + "learning_rate": 4.8397236804703916e-05, + "loss": 4.7355, + "step": 19270 + }, + { + "epoch": 0.11461009610809782, + "grad_norm": 1.3198809623718262, + "learning_rate": 4.839707224471864e-05, + "loss": 4.7621, + "step": 19271 + }, + { + "epoch": 0.11461604339137882, + "grad_norm": 1.4831585884094238, + "learning_rate": 4.8396907676565686e-05, + "loss": 4.7393, + "step": 19272 + }, + { + "epoch": 0.11462199067465982, + "grad_norm": 1.2767844200134277, + "learning_rate": 4.839674310024512e-05, + "loss": 4.8063, + "step": 19273 + }, + { + "epoch": 0.11462793795794081, + "grad_norm": 1.4342589378356934, + "learning_rate": 4.839657851575698e-05, + "loss": 4.7615, + "step": 19274 + }, + { + "epoch": 0.11463388524122181, + "grad_norm": 1.30052649974823, + "learning_rate": 4.839641392310135e-05, + "loss": 4.7389, + "step": 19275 + }, + { + "epoch": 0.11463983252450281, + "grad_norm": 1.3592944145202637, + "learning_rate": 4.8396249322278266e-05, + "loss": 4.704, + "step": 19276 + }, + { + "epoch": 0.1146457798077838, + "grad_norm": 1.1905149221420288, + "learning_rate": 4.83960847132878e-05, + "loss": 4.7189, + "step": 19277 + }, + { + "epoch": 0.1146517270910648, + "grad_norm": 1.4920209646224976, + "learning_rate": 4.8395920096129996e-05, + "loss": 4.8844, + "step": 19278 + }, + { + "epoch": 0.1146576743743458, + "grad_norm": 1.486556887626648, + "learning_rate": 4.839575547080491e-05, + "loss": 4.9462, + "step": 19279 + }, + { + "epoch": 0.1146636216576268, + "grad_norm": 1.500434160232544, + "learning_rate": 4.839559083731262e-05, + "loss": 4.9118, + "step": 19280 + }, + { + "epoch": 0.1146695689409078, + "grad_norm": 1.5061683654785156, + "learning_rate": 4.839542619565317e-05, + "loss": 4.7921, + "step": 19281 + }, + { + "epoch": 0.11467551622418878, + "grad_norm": 1.587161660194397, + "learning_rate": 4.839526154582662e-05, + "loss": 5.1129, + "step": 19282 + }, + { + "epoch": 0.11468146350746979, + "grad_norm": 1.3225055932998657, + "learning_rate": 4.839509688783302e-05, + "loss": 4.8538, + "step": 19283 + }, + { + "epoch": 0.11468741079075079, + "grad_norm": 1.3121862411499023, + "learning_rate": 4.839493222167244e-05, + "loss": 4.8695, + "step": 19284 + }, + { + "epoch": 0.11469335807403178, + "grad_norm": 1.4202474355697632, + "learning_rate": 4.839476754734492e-05, + "loss": 4.8628, + "step": 19285 + }, + { + "epoch": 0.11469930535731278, + "grad_norm": 1.283316969871521, + "learning_rate": 4.8394602864850534e-05, + "loss": 4.8431, + "step": 19286 + }, + { + "epoch": 0.11470525264059378, + "grad_norm": 1.3255420923233032, + "learning_rate": 4.839443817418934e-05, + "loss": 4.9993, + "step": 19287 + }, + { + "epoch": 0.11471119992387477, + "grad_norm": 1.3569047451019287, + "learning_rate": 4.8394273475361386e-05, + "loss": 4.9478, + "step": 19288 + }, + { + "epoch": 0.11471714720715577, + "grad_norm": 1.2374382019042969, + "learning_rate": 4.839410876836673e-05, + "loss": 5.1119, + "step": 19289 + }, + { + "epoch": 0.11472309449043677, + "grad_norm": 1.3518184423446655, + "learning_rate": 4.839394405320543e-05, + "loss": 5.2506, + "step": 19290 + }, + { + "epoch": 0.11472904177371776, + "grad_norm": 1.2599278688430786, + "learning_rate": 4.839377932987755e-05, + "loss": 5.208, + "step": 19291 + }, + { + "epoch": 0.11473498905699876, + "grad_norm": 1.3122080564498901, + "learning_rate": 4.839361459838314e-05, + "loss": 5.2356, + "step": 19292 + }, + { + "epoch": 0.11474093634027976, + "grad_norm": 1.1587629318237305, + "learning_rate": 4.839344985872226e-05, + "loss": 5.2469, + "step": 19293 + }, + { + "epoch": 0.11474688362356075, + "grad_norm": 1.2733700275421143, + "learning_rate": 4.839328511089498e-05, + "loss": 5.2365, + "step": 19294 + }, + { + "epoch": 0.11475283090684175, + "grad_norm": 1.3206977844238281, + "learning_rate": 4.8393120354901334e-05, + "loss": 5.2242, + "step": 19295 + }, + { + "epoch": 0.11475877819012276, + "grad_norm": 1.1924374103546143, + "learning_rate": 4.83929555907414e-05, + "loss": 5.2916, + "step": 19296 + }, + { + "epoch": 0.11476472547340374, + "grad_norm": 1.2989557981491089, + "learning_rate": 4.8392790818415215e-05, + "loss": 5.173, + "step": 19297 + }, + { + "epoch": 0.11477067275668475, + "grad_norm": 1.3470929861068726, + "learning_rate": 4.839262603792286e-05, + "loss": 5.2309, + "step": 19298 + }, + { + "epoch": 0.11477662003996575, + "grad_norm": 1.1529438495635986, + "learning_rate": 4.8392461249264376e-05, + "loss": 5.2373, + "step": 19299 + }, + { + "epoch": 0.11478256732324674, + "grad_norm": 1.1988370418548584, + "learning_rate": 4.839229645243982e-05, + "loss": 5.2067, + "step": 19300 + }, + { + "epoch": 0.11478851460652774, + "grad_norm": 1.3069959878921509, + "learning_rate": 4.839213164744926e-05, + "loss": 5.1413, + "step": 19301 + }, + { + "epoch": 0.11479446188980874, + "grad_norm": 1.230211615562439, + "learning_rate": 4.839196683429275e-05, + "loss": 5.2076, + "step": 19302 + }, + { + "epoch": 0.11480040917308973, + "grad_norm": 1.3232944011688232, + "learning_rate": 4.839180201297034e-05, + "loss": 5.2077, + "step": 19303 + }, + { + "epoch": 0.11480635645637073, + "grad_norm": 1.2436466217041016, + "learning_rate": 4.839163718348211e-05, + "loss": 5.1646, + "step": 19304 + }, + { + "epoch": 0.11481230373965173, + "grad_norm": 1.160416841506958, + "learning_rate": 4.8391472345828085e-05, + "loss": 5.0582, + "step": 19305 + }, + { + "epoch": 0.11481825102293272, + "grad_norm": 1.3895483016967773, + "learning_rate": 4.8391307500008344e-05, + "loss": 5.2516, + "step": 19306 + }, + { + "epoch": 0.11482419830621372, + "grad_norm": 1.5018577575683594, + "learning_rate": 4.8391142646022935e-05, + "loss": 5.4308, + "step": 19307 + }, + { + "epoch": 0.11483014558949473, + "grad_norm": 1.5278204679489136, + "learning_rate": 4.8390977783871925e-05, + "loss": 5.2238, + "step": 19308 + }, + { + "epoch": 0.11483609287277571, + "grad_norm": 1.5735019445419312, + "learning_rate": 4.839081291355536e-05, + "loss": 5.4874, + "step": 19309 + }, + { + "epoch": 0.11484204015605672, + "grad_norm": 1.4098745584487915, + "learning_rate": 4.839064803507332e-05, + "loss": 5.082, + "step": 19310 + }, + { + "epoch": 0.1148479874393377, + "grad_norm": 1.47605299949646, + "learning_rate": 4.8390483148425824e-05, + "loss": 5.0869, + "step": 19311 + }, + { + "epoch": 0.1148539347226187, + "grad_norm": 1.442550778388977, + "learning_rate": 4.8390318253612966e-05, + "loss": 5.1232, + "step": 19312 + }, + { + "epoch": 0.11485988200589971, + "grad_norm": 1.1225110292434692, + "learning_rate": 4.8390153350634785e-05, + "loss": 5.0782, + "step": 19313 + }, + { + "epoch": 0.1148658292891807, + "grad_norm": 1.329656720161438, + "learning_rate": 4.838998843949135e-05, + "loss": 4.9912, + "step": 19314 + }, + { + "epoch": 0.1148717765724617, + "grad_norm": 1.6484954357147217, + "learning_rate": 4.8389823520182704e-05, + "loss": 4.785, + "step": 19315 + }, + { + "epoch": 0.1148777238557427, + "grad_norm": 1.46773099899292, + "learning_rate": 4.838965859270891e-05, + "loss": 4.7835, + "step": 19316 + }, + { + "epoch": 0.11488367113902369, + "grad_norm": 1.717592477798462, + "learning_rate": 4.838949365707004e-05, + "loss": 5.1603, + "step": 19317 + }, + { + "epoch": 0.11488961842230469, + "grad_norm": 1.7265046834945679, + "learning_rate": 4.838932871326613e-05, + "loss": 4.9057, + "step": 19318 + }, + { + "epoch": 0.11489556570558569, + "grad_norm": 1.6203346252441406, + "learning_rate": 4.838916376129725e-05, + "loss": 4.8206, + "step": 19319 + }, + { + "epoch": 0.11490151298886668, + "grad_norm": 1.2972123622894287, + "learning_rate": 4.838899880116345e-05, + "loss": 4.7026, + "step": 19320 + }, + { + "epoch": 0.11490746027214768, + "grad_norm": 1.4215303659439087, + "learning_rate": 4.838883383286479e-05, + "loss": 4.7032, + "step": 19321 + }, + { + "epoch": 0.11491340755542868, + "grad_norm": 1.442439317703247, + "learning_rate": 4.838866885640134e-05, + "loss": 4.6853, + "step": 19322 + }, + { + "epoch": 0.11491935483870967, + "grad_norm": 1.3752079010009766, + "learning_rate": 4.838850387177315e-05, + "loss": 4.6842, + "step": 19323 + }, + { + "epoch": 0.11492530212199067, + "grad_norm": 1.4834825992584229, + "learning_rate": 4.838833887898026e-05, + "loss": 4.6455, + "step": 19324 + }, + { + "epoch": 0.11493124940527168, + "grad_norm": 1.3493545055389404, + "learning_rate": 4.8388173878022743e-05, + "loss": 4.5489, + "step": 19325 + }, + { + "epoch": 0.11493719668855266, + "grad_norm": 1.5903066396713257, + "learning_rate": 4.838800886890067e-05, + "loss": 4.5574, + "step": 19326 + }, + { + "epoch": 0.11494314397183367, + "grad_norm": 1.3842332363128662, + "learning_rate": 4.8387843851614076e-05, + "loss": 4.7516, + "step": 19327 + }, + { + "epoch": 0.11494909125511467, + "grad_norm": 1.5355647802352905, + "learning_rate": 4.838767882616303e-05, + "loss": 4.5984, + "step": 19328 + }, + { + "epoch": 0.11495503853839566, + "grad_norm": 1.6534103155136108, + "learning_rate": 4.838751379254759e-05, + "loss": 4.7761, + "step": 19329 + }, + { + "epoch": 0.11496098582167666, + "grad_norm": 1.7028656005859375, + "learning_rate": 4.83873487507678e-05, + "loss": 5.0164, + "step": 19330 + }, + { + "epoch": 0.11496693310495766, + "grad_norm": 1.7165244817733765, + "learning_rate": 4.838718370082374e-05, + "loss": 5.1044, + "step": 19331 + }, + { + "epoch": 0.11497288038823865, + "grad_norm": 1.3272297382354736, + "learning_rate": 4.838701864271545e-05, + "loss": 5.0072, + "step": 19332 + }, + { + "epoch": 0.11497882767151965, + "grad_norm": 1.553613543510437, + "learning_rate": 4.8386853576442994e-05, + "loss": 4.945, + "step": 19333 + }, + { + "epoch": 0.11498477495480065, + "grad_norm": 1.4403818845748901, + "learning_rate": 4.8386688502006425e-05, + "loss": 5.0661, + "step": 19334 + }, + { + "epoch": 0.11499072223808164, + "grad_norm": 1.5347598791122437, + "learning_rate": 4.8386523419405814e-05, + "loss": 5.0603, + "step": 19335 + }, + { + "epoch": 0.11499666952136264, + "grad_norm": 1.3777856826782227, + "learning_rate": 4.83863583286412e-05, + "loss": 5.112, + "step": 19336 + }, + { + "epoch": 0.11500261680464365, + "grad_norm": 1.794287919998169, + "learning_rate": 4.8386193229712654e-05, + "loss": 5.1972, + "step": 19337 + }, + { + "epoch": 0.11500856408792463, + "grad_norm": 1.3142359256744385, + "learning_rate": 4.8386028122620234e-05, + "loss": 5.3577, + "step": 19338 + }, + { + "epoch": 0.11501451137120564, + "grad_norm": 1.0925400257110596, + "learning_rate": 4.838586300736399e-05, + "loss": 5.2094, + "step": 19339 + }, + { + "epoch": 0.11502045865448662, + "grad_norm": 1.6456180810928345, + "learning_rate": 4.838569788394398e-05, + "loss": 4.8287, + "step": 19340 + }, + { + "epoch": 0.11502640593776763, + "grad_norm": 1.2811404466629028, + "learning_rate": 4.8385532752360265e-05, + "loss": 5.0659, + "step": 19341 + }, + { + "epoch": 0.11503235322104863, + "grad_norm": 1.392863154411316, + "learning_rate": 4.83853676126129e-05, + "loss": 5.2655, + "step": 19342 + }, + { + "epoch": 0.11503830050432962, + "grad_norm": 1.2255772352218628, + "learning_rate": 4.838520246470195e-05, + "loss": 5.0422, + "step": 19343 + }, + { + "epoch": 0.11504424778761062, + "grad_norm": 1.735661506652832, + "learning_rate": 4.8385037308627465e-05, + "loss": 6.0562, + "step": 19344 + }, + { + "epoch": 0.11505019507089162, + "grad_norm": 1.2034478187561035, + "learning_rate": 4.838487214438951e-05, + "loss": 4.9773, + "step": 19345 + }, + { + "epoch": 0.11505614235417261, + "grad_norm": 1.2786695957183838, + "learning_rate": 4.838470697198813e-05, + "loss": 4.8771, + "step": 19346 + }, + { + "epoch": 0.11506208963745361, + "grad_norm": 1.2345244884490967, + "learning_rate": 4.8384541791423394e-05, + "loss": 5.0098, + "step": 19347 + }, + { + "epoch": 0.11506803692073461, + "grad_norm": 1.3156319856643677, + "learning_rate": 4.838437660269536e-05, + "loss": 5.1089, + "step": 19348 + }, + { + "epoch": 0.1150739842040156, + "grad_norm": 1.3406500816345215, + "learning_rate": 4.838421140580407e-05, + "loss": 4.8374, + "step": 19349 + }, + { + "epoch": 0.1150799314872966, + "grad_norm": 1.412318468093872, + "learning_rate": 4.83840462007496e-05, + "loss": 4.9074, + "step": 19350 + }, + { + "epoch": 0.1150858787705776, + "grad_norm": 1.3075577020645142, + "learning_rate": 4.8383880987532004e-05, + "loss": 4.9694, + "step": 19351 + }, + { + "epoch": 0.11509182605385859, + "grad_norm": 1.178300380706787, + "learning_rate": 4.838371576615134e-05, + "loss": 4.9863, + "step": 19352 + }, + { + "epoch": 0.1150977733371396, + "grad_norm": 1.5120453834533691, + "learning_rate": 4.838355053660765e-05, + "loss": 4.8766, + "step": 19353 + }, + { + "epoch": 0.1151037206204206, + "grad_norm": 1.4834094047546387, + "learning_rate": 4.8383385298901014e-05, + "loss": 4.9724, + "step": 19354 + }, + { + "epoch": 0.11510966790370158, + "grad_norm": 1.561998724937439, + "learning_rate": 4.8383220053031475e-05, + "loss": 4.9239, + "step": 19355 + }, + { + "epoch": 0.11511561518698259, + "grad_norm": 1.4366774559020996, + "learning_rate": 4.83830547989991e-05, + "loss": 4.8052, + "step": 19356 + }, + { + "epoch": 0.11512156247026359, + "grad_norm": 1.2530354261398315, + "learning_rate": 4.8382889536803936e-05, + "loss": 5.0115, + "step": 19357 + }, + { + "epoch": 0.11512750975354458, + "grad_norm": 1.4827991724014282, + "learning_rate": 4.838272426644606e-05, + "loss": 5.1592, + "step": 19358 + }, + { + "epoch": 0.11513345703682558, + "grad_norm": 1.5874660015106201, + "learning_rate": 4.83825589879255e-05, + "loss": 5.0255, + "step": 19359 + }, + { + "epoch": 0.11513940432010658, + "grad_norm": 1.4771748781204224, + "learning_rate": 4.8382393701242335e-05, + "loss": 5.1537, + "step": 19360 + }, + { + "epoch": 0.11514535160338757, + "grad_norm": 1.4980419874191284, + "learning_rate": 4.8382228406396625e-05, + "loss": 5.0109, + "step": 19361 + }, + { + "epoch": 0.11515129888666857, + "grad_norm": 1.5008245706558228, + "learning_rate": 4.8382063103388405e-05, + "loss": 5.1644, + "step": 19362 + }, + { + "epoch": 0.11515724616994957, + "grad_norm": 1.425648808479309, + "learning_rate": 4.838189779221777e-05, + "loss": 4.8298, + "step": 19363 + }, + { + "epoch": 0.11516319345323056, + "grad_norm": 1.4478559494018555, + "learning_rate": 4.8381732472884744e-05, + "loss": 5.2984, + "step": 19364 + }, + { + "epoch": 0.11516914073651156, + "grad_norm": 1.5071446895599365, + "learning_rate": 4.83815671453894e-05, + "loss": 4.9557, + "step": 19365 + }, + { + "epoch": 0.11517508801979257, + "grad_norm": 1.6358442306518555, + "learning_rate": 4.8381401809731785e-05, + "loss": 4.7956, + "step": 19366 + }, + { + "epoch": 0.11518103530307355, + "grad_norm": 1.5035837888717651, + "learning_rate": 4.838123646591197e-05, + "loss": 4.816, + "step": 19367 + }, + { + "epoch": 0.11518698258635456, + "grad_norm": 1.4265867471694946, + "learning_rate": 4.838107111393e-05, + "loss": 4.7911, + "step": 19368 + }, + { + "epoch": 0.11519292986963554, + "grad_norm": 1.489668369293213, + "learning_rate": 4.838090575378595e-05, + "loss": 4.8403, + "step": 19369 + }, + { + "epoch": 0.11519887715291655, + "grad_norm": 1.4454714059829712, + "learning_rate": 4.838074038547986e-05, + "loss": 4.8848, + "step": 19370 + }, + { + "epoch": 0.11520482443619755, + "grad_norm": 1.42531418800354, + "learning_rate": 4.83805750090118e-05, + "loss": 5.0249, + "step": 19371 + }, + { + "epoch": 0.11521077171947854, + "grad_norm": 1.4370076656341553, + "learning_rate": 4.8380409624381826e-05, + "loss": 4.9219, + "step": 19372 + }, + { + "epoch": 0.11521671900275954, + "grad_norm": 1.543291449546814, + "learning_rate": 4.838024423158999e-05, + "loss": 4.9835, + "step": 19373 + }, + { + "epoch": 0.11522266628604054, + "grad_norm": 1.2460718154907227, + "learning_rate": 4.838007883063634e-05, + "loss": 5.0426, + "step": 19374 + }, + { + "epoch": 0.11522861356932153, + "grad_norm": 1.5159900188446045, + "learning_rate": 4.837991342152096e-05, + "loss": 5.0214, + "step": 19375 + }, + { + "epoch": 0.11523456085260253, + "grad_norm": 1.3800876140594482, + "learning_rate": 4.837974800424389e-05, + "loss": 4.7606, + "step": 19376 + }, + { + "epoch": 0.11524050813588353, + "grad_norm": 1.509788155555725, + "learning_rate": 4.8379582578805197e-05, + "loss": 4.9886, + "step": 19377 + }, + { + "epoch": 0.11524645541916452, + "grad_norm": 1.292523741722107, + "learning_rate": 4.837941714520492e-05, + "loss": 5.1574, + "step": 19378 + }, + { + "epoch": 0.11525240270244552, + "grad_norm": 1.351827621459961, + "learning_rate": 4.837925170344314e-05, + "loss": 5.3133, + "step": 19379 + }, + { + "epoch": 0.11525834998572652, + "grad_norm": 1.4871753454208374, + "learning_rate": 4.83790862535199e-05, + "loss": 4.843, + "step": 19380 + }, + { + "epoch": 0.11526429726900751, + "grad_norm": 1.6031657457351685, + "learning_rate": 4.8378920795435264e-05, + "loss": 4.8244, + "step": 19381 + }, + { + "epoch": 0.11527024455228851, + "grad_norm": 1.3754857778549194, + "learning_rate": 4.8378755329189294e-05, + "loss": 4.8421, + "step": 19382 + }, + { + "epoch": 0.11527619183556952, + "grad_norm": 1.5428962707519531, + "learning_rate": 4.837858985478203e-05, + "loss": 4.9472, + "step": 19383 + }, + { + "epoch": 0.1152821391188505, + "grad_norm": 1.45586097240448, + "learning_rate": 4.837842437221356e-05, + "loss": 4.874, + "step": 19384 + }, + { + "epoch": 0.1152880864021315, + "grad_norm": 1.5139529705047607, + "learning_rate": 4.837825888148391e-05, + "loss": 4.8867, + "step": 19385 + }, + { + "epoch": 0.11529403368541251, + "grad_norm": 1.6341979503631592, + "learning_rate": 4.837809338259315e-05, + "loss": 4.8476, + "step": 19386 + }, + { + "epoch": 0.1152999809686935, + "grad_norm": 1.45046865940094, + "learning_rate": 4.837792787554134e-05, + "loss": 5.0273, + "step": 19387 + }, + { + "epoch": 0.1153059282519745, + "grad_norm": 1.2840397357940674, + "learning_rate": 4.8377762360328547e-05, + "loss": 5.1717, + "step": 19388 + }, + { + "epoch": 0.1153118755352555, + "grad_norm": 1.4211467504501343, + "learning_rate": 4.8377596836954805e-05, + "loss": 5.021, + "step": 19389 + }, + { + "epoch": 0.11531782281853649, + "grad_norm": 1.3885877132415771, + "learning_rate": 4.837743130542019e-05, + "loss": 5.2158, + "step": 19390 + }, + { + "epoch": 0.11532377010181749, + "grad_norm": 1.2344088554382324, + "learning_rate": 4.837726576572476e-05, + "loss": 5.212, + "step": 19391 + }, + { + "epoch": 0.11532971738509849, + "grad_norm": 1.1903822422027588, + "learning_rate": 4.837710021786857e-05, + "loss": 5.3071, + "step": 19392 + }, + { + "epoch": 0.11533566466837948, + "grad_norm": 1.4263699054718018, + "learning_rate": 4.837693466185167e-05, + "loss": 5.1472, + "step": 19393 + }, + { + "epoch": 0.11534161195166048, + "grad_norm": 1.201027512550354, + "learning_rate": 4.837676909767412e-05, + "loss": 5.1779, + "step": 19394 + }, + { + "epoch": 0.11534755923494149, + "grad_norm": 1.2903262376785278, + "learning_rate": 4.8376603525335995e-05, + "loss": 5.038, + "step": 19395 + }, + { + "epoch": 0.11535350651822247, + "grad_norm": 1.3125475645065308, + "learning_rate": 4.837643794483733e-05, + "loss": 4.8948, + "step": 19396 + }, + { + "epoch": 0.11535945380150348, + "grad_norm": 1.1773933172225952, + "learning_rate": 4.837627235617819e-05, + "loss": 5.0854, + "step": 19397 + }, + { + "epoch": 0.11536540108478446, + "grad_norm": 1.2542996406555176, + "learning_rate": 4.837610675935864e-05, + "loss": 5.1329, + "step": 19398 + }, + { + "epoch": 0.11537134836806547, + "grad_norm": 1.1876561641693115, + "learning_rate": 4.837594115437873e-05, + "loss": 4.9757, + "step": 19399 + }, + { + "epoch": 0.11537729565134647, + "grad_norm": 1.2957814931869507, + "learning_rate": 4.837577554123852e-05, + "loss": 5.1203, + "step": 19400 + }, + { + "epoch": 0.11538324293462746, + "grad_norm": 1.2537682056427002, + "learning_rate": 4.837560991993807e-05, + "loss": 4.975, + "step": 19401 + }, + { + "epoch": 0.11538919021790846, + "grad_norm": 1.1898986101150513, + "learning_rate": 4.837544429047743e-05, + "loss": 4.9028, + "step": 19402 + }, + { + "epoch": 0.11539513750118946, + "grad_norm": 1.4129477739334106, + "learning_rate": 4.837527865285667e-05, + "loss": 4.7576, + "step": 19403 + }, + { + "epoch": 0.11540108478447045, + "grad_norm": 1.5386319160461426, + "learning_rate": 4.837511300707585e-05, + "loss": 4.9332, + "step": 19404 + }, + { + "epoch": 0.11540703206775145, + "grad_norm": 1.3597557544708252, + "learning_rate": 4.8374947353135e-05, + "loss": 4.8007, + "step": 19405 + }, + { + "epoch": 0.11541297935103245, + "grad_norm": 1.8251479864120483, + "learning_rate": 4.837478169103421e-05, + "loss": 5.048, + "step": 19406 + }, + { + "epoch": 0.11541892663431344, + "grad_norm": 1.488844871520996, + "learning_rate": 4.8374616020773523e-05, + "loss": 4.855, + "step": 19407 + }, + { + "epoch": 0.11542487391759444, + "grad_norm": 1.1640641689300537, + "learning_rate": 4.8374450342352996e-05, + "loss": 4.7714, + "step": 19408 + }, + { + "epoch": 0.11543082120087544, + "grad_norm": 1.1133109331130981, + "learning_rate": 4.8374284655772696e-05, + "loss": 4.849, + "step": 19409 + }, + { + "epoch": 0.11543676848415643, + "grad_norm": 1.2767143249511719, + "learning_rate": 4.837411896103266e-05, + "loss": 4.8078, + "step": 19410 + }, + { + "epoch": 0.11544271576743743, + "grad_norm": 1.2564034461975098, + "learning_rate": 4.837395325813298e-05, + "loss": 4.8602, + "step": 19411 + }, + { + "epoch": 0.11544866305071844, + "grad_norm": 1.2702561616897583, + "learning_rate": 4.837378754707369e-05, + "loss": 4.9148, + "step": 19412 + }, + { + "epoch": 0.11545461033399942, + "grad_norm": 1.1960140466690063, + "learning_rate": 4.8373621827854845e-05, + "loss": 4.9242, + "step": 19413 + }, + { + "epoch": 0.11546055761728043, + "grad_norm": 1.3663053512573242, + "learning_rate": 4.837345610047651e-05, + "loss": 4.9837, + "step": 19414 + }, + { + "epoch": 0.11546650490056143, + "grad_norm": 1.340897560119629, + "learning_rate": 4.837329036493875e-05, + "loss": 4.8059, + "step": 19415 + }, + { + "epoch": 0.11547245218384242, + "grad_norm": 1.326195478439331, + "learning_rate": 4.8373124621241616e-05, + "loss": 4.7115, + "step": 19416 + }, + { + "epoch": 0.11547839946712342, + "grad_norm": 1.2291951179504395, + "learning_rate": 4.837295886938516e-05, + "loss": 5.0075, + "step": 19417 + }, + { + "epoch": 0.11548434675040442, + "grad_norm": 1.3071776628494263, + "learning_rate": 4.837279310936945e-05, + "loss": 4.7839, + "step": 19418 + }, + { + "epoch": 0.11549029403368541, + "grad_norm": 1.4331681728363037, + "learning_rate": 4.837262734119453e-05, + "loss": 4.7494, + "step": 19419 + }, + { + "epoch": 0.11549624131696641, + "grad_norm": 1.4209895133972168, + "learning_rate": 4.837246156486048e-05, + "loss": 4.8538, + "step": 19420 + }, + { + "epoch": 0.11550218860024741, + "grad_norm": 1.2397242784500122, + "learning_rate": 4.837229578036734e-05, + "loss": 4.7616, + "step": 19421 + }, + { + "epoch": 0.1155081358835284, + "grad_norm": 1.2271560430526733, + "learning_rate": 4.837212998771517e-05, + "loss": 4.7361, + "step": 19422 + }, + { + "epoch": 0.1155140831668094, + "grad_norm": 1.3334344625473022, + "learning_rate": 4.837196418690403e-05, + "loss": 4.8971, + "step": 19423 + }, + { + "epoch": 0.1155200304500904, + "grad_norm": 1.3195756673812866, + "learning_rate": 4.837179837793398e-05, + "loss": 4.8944, + "step": 19424 + }, + { + "epoch": 0.1155259777333714, + "grad_norm": 1.4583542346954346, + "learning_rate": 4.837163256080508e-05, + "loss": 4.7857, + "step": 19425 + }, + { + "epoch": 0.1155319250166524, + "grad_norm": 1.5155558586120605, + "learning_rate": 4.837146673551739e-05, + "loss": 4.7728, + "step": 19426 + }, + { + "epoch": 0.1155378722999334, + "grad_norm": 1.3582627773284912, + "learning_rate": 4.837130090207095e-05, + "loss": 4.7065, + "step": 19427 + }, + { + "epoch": 0.11554381958321439, + "grad_norm": 1.2635151147842407, + "learning_rate": 4.837113506046584e-05, + "loss": 4.882, + "step": 19428 + }, + { + "epoch": 0.11554976686649539, + "grad_norm": 1.417083501815796, + "learning_rate": 4.83709692107021e-05, + "loss": 4.8928, + "step": 19429 + }, + { + "epoch": 0.11555571414977638, + "grad_norm": 1.4780973196029663, + "learning_rate": 4.8370803352779806e-05, + "loss": 4.9458, + "step": 19430 + }, + { + "epoch": 0.11556166143305738, + "grad_norm": 1.2949103116989136, + "learning_rate": 4.8370637486699e-05, + "loss": 4.8753, + "step": 19431 + }, + { + "epoch": 0.11556760871633838, + "grad_norm": 1.4755308628082275, + "learning_rate": 4.8370471612459744e-05, + "loss": 4.7886, + "step": 19432 + }, + { + "epoch": 0.11557355599961937, + "grad_norm": 1.4527158737182617, + "learning_rate": 4.8370305730062095e-05, + "loss": 4.8442, + "step": 19433 + }, + { + "epoch": 0.11557950328290037, + "grad_norm": 1.3422110080718994, + "learning_rate": 4.8370139839506124e-05, + "loss": 4.9745, + "step": 19434 + }, + { + "epoch": 0.11558545056618137, + "grad_norm": 1.5843584537506104, + "learning_rate": 4.836997394079187e-05, + "loss": 4.8432, + "step": 19435 + }, + { + "epoch": 0.11559139784946236, + "grad_norm": 1.3267780542373657, + "learning_rate": 4.836980803391941e-05, + "loss": 4.7816, + "step": 19436 + }, + { + "epoch": 0.11559734513274336, + "grad_norm": 1.3092966079711914, + "learning_rate": 4.836964211888878e-05, + "loss": 5.0283, + "step": 19437 + }, + { + "epoch": 0.11560329241602436, + "grad_norm": 1.4653512239456177, + "learning_rate": 4.836947619570005e-05, + "loss": 4.9265, + "step": 19438 + }, + { + "epoch": 0.11560923969930535, + "grad_norm": 1.344672441482544, + "learning_rate": 4.836931026435328e-05, + "loss": 5.0426, + "step": 19439 + }, + { + "epoch": 0.11561518698258635, + "grad_norm": 1.3949403762817383, + "learning_rate": 4.836914432484853e-05, + "loss": 5.1539, + "step": 19440 + }, + { + "epoch": 0.11562113426586736, + "grad_norm": 1.3876662254333496, + "learning_rate": 4.836897837718585e-05, + "loss": 4.9346, + "step": 19441 + }, + { + "epoch": 0.11562708154914834, + "grad_norm": 1.3399412631988525, + "learning_rate": 4.83688124213653e-05, + "loss": 4.8688, + "step": 19442 + }, + { + "epoch": 0.11563302883242935, + "grad_norm": 1.3819881677627563, + "learning_rate": 4.836864645738694e-05, + "loss": 4.9527, + "step": 19443 + }, + { + "epoch": 0.11563897611571035, + "grad_norm": 1.509074091911316, + "learning_rate": 4.8368480485250825e-05, + "loss": 4.9273, + "step": 19444 + }, + { + "epoch": 0.11564492339899134, + "grad_norm": 1.2591453790664673, + "learning_rate": 4.836831450495701e-05, + "loss": 4.9065, + "step": 19445 + }, + { + "epoch": 0.11565087068227234, + "grad_norm": 1.4065910577774048, + "learning_rate": 4.836814851650557e-05, + "loss": 4.9699, + "step": 19446 + }, + { + "epoch": 0.11565681796555334, + "grad_norm": 1.3355581760406494, + "learning_rate": 4.836798251989655e-05, + "loss": 5.1639, + "step": 19447 + }, + { + "epoch": 0.11566276524883433, + "grad_norm": 1.3715496063232422, + "learning_rate": 4.836781651513e-05, + "loss": 4.855, + "step": 19448 + }, + { + "epoch": 0.11566871253211533, + "grad_norm": 1.569305658340454, + "learning_rate": 4.836765050220599e-05, + "loss": 4.6329, + "step": 19449 + }, + { + "epoch": 0.11567465981539633, + "grad_norm": 1.3613293170928955, + "learning_rate": 4.836748448112458e-05, + "loss": 4.9897, + "step": 19450 + }, + { + "epoch": 0.11568060709867732, + "grad_norm": 1.2653577327728271, + "learning_rate": 4.836731845188581e-05, + "loss": 4.9819, + "step": 19451 + }, + { + "epoch": 0.11568655438195832, + "grad_norm": 1.5030022859573364, + "learning_rate": 4.836715241448976e-05, + "loss": 4.8387, + "step": 19452 + }, + { + "epoch": 0.11569250166523933, + "grad_norm": 1.2560715675354004, + "learning_rate": 4.836698636893647e-05, + "loss": 5.0862, + "step": 19453 + }, + { + "epoch": 0.11569844894852031, + "grad_norm": 1.1981379985809326, + "learning_rate": 4.836682031522602e-05, + "loss": 4.7682, + "step": 19454 + }, + { + "epoch": 0.11570439623180132, + "grad_norm": 1.3572615385055542, + "learning_rate": 4.8366654253358444e-05, + "loss": 4.9008, + "step": 19455 + }, + { + "epoch": 0.11571034351508232, + "grad_norm": 1.2542002201080322, + "learning_rate": 4.8366488183333816e-05, + "loss": 4.911, + "step": 19456 + }, + { + "epoch": 0.1157162907983633, + "grad_norm": 1.4759174585342407, + "learning_rate": 4.8366322105152186e-05, + "loss": 4.789, + "step": 19457 + }, + { + "epoch": 0.11572223808164431, + "grad_norm": 1.2307411432266235, + "learning_rate": 4.8366156018813616e-05, + "loss": 4.9556, + "step": 19458 + }, + { + "epoch": 0.1157281853649253, + "grad_norm": 1.240334153175354, + "learning_rate": 4.836598992431816e-05, + "loss": 4.9996, + "step": 19459 + }, + { + "epoch": 0.1157341326482063, + "grad_norm": 1.3100368976593018, + "learning_rate": 4.8365823821665876e-05, + "loss": 5.0693, + "step": 19460 + }, + { + "epoch": 0.1157400799314873, + "grad_norm": 1.0904709100723267, + "learning_rate": 4.8365657710856835e-05, + "loss": 5.0327, + "step": 19461 + }, + { + "epoch": 0.11574602721476829, + "grad_norm": 1.3847914934158325, + "learning_rate": 4.836549159189108e-05, + "loss": 5.0512, + "step": 19462 + }, + { + "epoch": 0.11575197449804929, + "grad_norm": 1.2307064533233643, + "learning_rate": 4.836532546476866e-05, + "loss": 5.0687, + "step": 19463 + }, + { + "epoch": 0.11575792178133029, + "grad_norm": 1.3900285959243774, + "learning_rate": 4.836515932948966e-05, + "loss": 5.1044, + "step": 19464 + }, + { + "epoch": 0.11576386906461128, + "grad_norm": 1.2194246053695679, + "learning_rate": 4.836499318605412e-05, + "loss": 5.0412, + "step": 19465 + }, + { + "epoch": 0.11576981634789228, + "grad_norm": 1.3460240364074707, + "learning_rate": 4.83648270344621e-05, + "loss": 5.14, + "step": 19466 + }, + { + "epoch": 0.11577576363117328, + "grad_norm": 1.2739115953445435, + "learning_rate": 4.8364660874713664e-05, + "loss": 5.0782, + "step": 19467 + }, + { + "epoch": 0.11578171091445427, + "grad_norm": 1.987092137336731, + "learning_rate": 4.836449470680887e-05, + "loss": 4.8106, + "step": 19468 + }, + { + "epoch": 0.11578765819773527, + "grad_norm": 1.3820792436599731, + "learning_rate": 4.8364328530747765e-05, + "loss": 5.3549, + "step": 19469 + }, + { + "epoch": 0.11579360548101628, + "grad_norm": 1.5276916027069092, + "learning_rate": 4.836416234653042e-05, + "loss": 5.3479, + "step": 19470 + }, + { + "epoch": 0.11579955276429726, + "grad_norm": 1.5292818546295166, + "learning_rate": 4.836399615415688e-05, + "loss": 5.2627, + "step": 19471 + }, + { + "epoch": 0.11580550004757827, + "grad_norm": 1.5759434700012207, + "learning_rate": 4.836382995362722e-05, + "loss": 5.2925, + "step": 19472 + }, + { + "epoch": 0.11581144733085927, + "grad_norm": 1.3807876110076904, + "learning_rate": 4.836366374494148e-05, + "loss": 5.0794, + "step": 19473 + }, + { + "epoch": 0.11581739461414026, + "grad_norm": 1.3631199598312378, + "learning_rate": 4.836349752809973e-05, + "loss": 5.0606, + "step": 19474 + }, + { + "epoch": 0.11582334189742126, + "grad_norm": 1.5250667333602905, + "learning_rate": 4.836333130310202e-05, + "loss": 5.1799, + "step": 19475 + }, + { + "epoch": 0.11582928918070226, + "grad_norm": 1.4191410541534424, + "learning_rate": 4.836316506994842e-05, + "loss": 5.2812, + "step": 19476 + }, + { + "epoch": 0.11583523646398325, + "grad_norm": 1.5502076148986816, + "learning_rate": 4.8362998828638975e-05, + "loss": 5.3503, + "step": 19477 + }, + { + "epoch": 0.11584118374726425, + "grad_norm": 1.441786766052246, + "learning_rate": 4.836283257917375e-05, + "loss": 5.1526, + "step": 19478 + }, + { + "epoch": 0.11584713103054525, + "grad_norm": 1.3994730710983276, + "learning_rate": 4.83626663215528e-05, + "loss": 5.1969, + "step": 19479 + }, + { + "epoch": 0.11585307831382624, + "grad_norm": 1.5141762495040894, + "learning_rate": 4.836250005577619e-05, + "loss": 5.099, + "step": 19480 + }, + { + "epoch": 0.11585902559710724, + "grad_norm": 1.4504029750823975, + "learning_rate": 4.836233378184397e-05, + "loss": 5.5225, + "step": 19481 + }, + { + "epoch": 0.11586497288038825, + "grad_norm": 1.3617264032363892, + "learning_rate": 4.8362167499756194e-05, + "loss": 5.3426, + "step": 19482 + }, + { + "epoch": 0.11587092016366923, + "grad_norm": 1.3681023120880127, + "learning_rate": 4.8362001209512934e-05, + "loss": 5.3476, + "step": 19483 + }, + { + "epoch": 0.11587686744695024, + "grad_norm": 1.050550937652588, + "learning_rate": 4.836183491111424e-05, + "loss": 5.1338, + "step": 19484 + }, + { + "epoch": 0.11588281473023124, + "grad_norm": 1.386715054512024, + "learning_rate": 4.836166860456017e-05, + "loss": 5.2761, + "step": 19485 + }, + { + "epoch": 0.11588876201351223, + "grad_norm": 1.2128262519836426, + "learning_rate": 4.836150228985078e-05, + "loss": 5.165, + "step": 19486 + }, + { + "epoch": 0.11589470929679323, + "grad_norm": 1.224721074104309, + "learning_rate": 4.836133596698614e-05, + "loss": 5.1631, + "step": 19487 + }, + { + "epoch": 0.11590065658007422, + "grad_norm": 1.2348668575286865, + "learning_rate": 4.8361169635966285e-05, + "loss": 5.3206, + "step": 19488 + }, + { + "epoch": 0.11590660386335522, + "grad_norm": 1.1665185689926147, + "learning_rate": 4.836100329679129e-05, + "loss": 5.3162, + "step": 19489 + }, + { + "epoch": 0.11591255114663622, + "grad_norm": 1.2063257694244385, + "learning_rate": 4.836083694946122e-05, + "loss": 5.0348, + "step": 19490 + }, + { + "epoch": 0.11591849842991721, + "grad_norm": 1.5199745893478394, + "learning_rate": 4.836067059397612e-05, + "loss": 5.0793, + "step": 19491 + }, + { + "epoch": 0.11592444571319821, + "grad_norm": 1.2285770177841187, + "learning_rate": 4.8360504230336044e-05, + "loss": 5.1478, + "step": 19492 + }, + { + "epoch": 0.11593039299647921, + "grad_norm": 1.3429020643234253, + "learning_rate": 4.836033785854107e-05, + "loss": 5.3225, + "step": 19493 + }, + { + "epoch": 0.1159363402797602, + "grad_norm": 1.3870415687561035, + "learning_rate": 4.836017147859123e-05, + "loss": 5.2711, + "step": 19494 + }, + { + "epoch": 0.1159422875630412, + "grad_norm": 1.3311539888381958, + "learning_rate": 4.8360005090486603e-05, + "loss": 5.1778, + "step": 19495 + }, + { + "epoch": 0.1159482348463222, + "grad_norm": 1.1331884860992432, + "learning_rate": 4.8359838694227236e-05, + "loss": 5.1435, + "step": 19496 + }, + { + "epoch": 0.11595418212960319, + "grad_norm": 1.427506685256958, + "learning_rate": 4.83596722898132e-05, + "loss": 5.2153, + "step": 19497 + }, + { + "epoch": 0.1159601294128842, + "grad_norm": 1.4716016054153442, + "learning_rate": 4.835950587724453e-05, + "loss": 4.9599, + "step": 19498 + }, + { + "epoch": 0.1159660766961652, + "grad_norm": 1.073724389076233, + "learning_rate": 4.8359339456521305e-05, + "loss": 5.3481, + "step": 19499 + }, + { + "epoch": 0.11597202397944618, + "grad_norm": 1.1965457201004028, + "learning_rate": 4.835917302764358e-05, + "loss": 5.128, + "step": 19500 + }, + { + "epoch": 0.11597797126272719, + "grad_norm": 1.2589031457901, + "learning_rate": 4.83590065906114e-05, + "loss": 5.1952, + "step": 19501 + }, + { + "epoch": 0.11598391854600819, + "grad_norm": 1.5062520503997803, + "learning_rate": 4.8358840145424835e-05, + "loss": 5.3431, + "step": 19502 + }, + { + "epoch": 0.11598986582928918, + "grad_norm": 1.3464981317520142, + "learning_rate": 4.8358673692083944e-05, + "loss": 5.187, + "step": 19503 + }, + { + "epoch": 0.11599581311257018, + "grad_norm": 1.195157766342163, + "learning_rate": 4.8358507230588776e-05, + "loss": 5.4018, + "step": 19504 + }, + { + "epoch": 0.11600176039585118, + "grad_norm": 1.185371994972229, + "learning_rate": 4.83583407609394e-05, + "loss": 5.3204, + "step": 19505 + }, + { + "epoch": 0.11600770767913217, + "grad_norm": 1.1011184453964233, + "learning_rate": 4.835817428313586e-05, + "loss": 5.2426, + "step": 19506 + }, + { + "epoch": 0.11601365496241317, + "grad_norm": 1.2706186771392822, + "learning_rate": 4.835800779717823e-05, + "loss": 5.3277, + "step": 19507 + }, + { + "epoch": 0.11601960224569417, + "grad_norm": 1.23444664478302, + "learning_rate": 4.8357841303066564e-05, + "loss": 5.304, + "step": 19508 + }, + { + "epoch": 0.11602554952897516, + "grad_norm": 1.3166215419769287, + "learning_rate": 4.8357674800800915e-05, + "loss": 5.1755, + "step": 19509 + }, + { + "epoch": 0.11603149681225616, + "grad_norm": 1.0634559392929077, + "learning_rate": 4.835750829038134e-05, + "loss": 5.2188, + "step": 19510 + }, + { + "epoch": 0.11603744409553716, + "grad_norm": 1.0847052335739136, + "learning_rate": 4.8357341771807894e-05, + "loss": 5.1993, + "step": 19511 + }, + { + "epoch": 0.11604339137881815, + "grad_norm": 1.2893394231796265, + "learning_rate": 4.8357175245080645e-05, + "loss": 5.278, + "step": 19512 + }, + { + "epoch": 0.11604933866209916, + "grad_norm": 1.1346744298934937, + "learning_rate": 4.8357008710199653e-05, + "loss": 5.0915, + "step": 19513 + }, + { + "epoch": 0.11605528594538016, + "grad_norm": 1.2405723333358765, + "learning_rate": 4.835684216716497e-05, + "loss": 5.3274, + "step": 19514 + }, + { + "epoch": 0.11606123322866115, + "grad_norm": 1.2367215156555176, + "learning_rate": 4.8356675615976646e-05, + "loss": 5.3145, + "step": 19515 + }, + { + "epoch": 0.11606718051194215, + "grad_norm": 1.23695969581604, + "learning_rate": 4.835650905663476e-05, + "loss": 5.1454, + "step": 19516 + }, + { + "epoch": 0.11607312779522314, + "grad_norm": 1.649644136428833, + "learning_rate": 4.835634248913935e-05, + "loss": 4.9684, + "step": 19517 + }, + { + "epoch": 0.11607907507850414, + "grad_norm": 1.3828257322311401, + "learning_rate": 4.835617591349049e-05, + "loss": 4.8913, + "step": 19518 + }, + { + "epoch": 0.11608502236178514, + "grad_norm": 1.4446587562561035, + "learning_rate": 4.8356009329688215e-05, + "loss": 4.9248, + "step": 19519 + }, + { + "epoch": 0.11609096964506613, + "grad_norm": 1.4149401187896729, + "learning_rate": 4.835584273773261e-05, + "loss": 5.0446, + "step": 19520 + }, + { + "epoch": 0.11609691692834713, + "grad_norm": 1.4073368310928345, + "learning_rate": 4.835567613762372e-05, + "loss": 5.1451, + "step": 19521 + }, + { + "epoch": 0.11610286421162813, + "grad_norm": 1.438539743423462, + "learning_rate": 4.835550952936161e-05, + "loss": 5.3629, + "step": 19522 + }, + { + "epoch": 0.11610881149490912, + "grad_norm": 1.4686654806137085, + "learning_rate": 4.835534291294632e-05, + "loss": 5.4386, + "step": 19523 + }, + { + "epoch": 0.11611475877819012, + "grad_norm": 1.3416131734848022, + "learning_rate": 4.835517628837793e-05, + "loss": 5.4625, + "step": 19524 + }, + { + "epoch": 0.11612070606147112, + "grad_norm": 1.38942551612854, + "learning_rate": 4.835500965565649e-05, + "loss": 5.2164, + "step": 19525 + }, + { + "epoch": 0.11612665334475211, + "grad_norm": 1.157583475112915, + "learning_rate": 4.835484301478205e-05, + "loss": 4.931, + "step": 19526 + }, + { + "epoch": 0.11613260062803311, + "grad_norm": 1.1182529926300049, + "learning_rate": 4.835467636575468e-05, + "loss": 5.0804, + "step": 19527 + }, + { + "epoch": 0.11613854791131412, + "grad_norm": 1.1087690591812134, + "learning_rate": 4.835450970857444e-05, + "loss": 4.9112, + "step": 19528 + }, + { + "epoch": 0.1161444951945951, + "grad_norm": 1.1217858791351318, + "learning_rate": 4.8354343043241374e-05, + "loss": 4.8775, + "step": 19529 + }, + { + "epoch": 0.1161504424778761, + "grad_norm": 1.703722596168518, + "learning_rate": 4.8354176369755556e-05, + "loss": 5.0991, + "step": 19530 + }, + { + "epoch": 0.11615638976115711, + "grad_norm": 1.5027599334716797, + "learning_rate": 4.8354009688117026e-05, + "loss": 5.3486, + "step": 19531 + }, + { + "epoch": 0.1161623370444381, + "grad_norm": 1.3976017236709595, + "learning_rate": 4.835384299832586e-05, + "loss": 5.3045, + "step": 19532 + }, + { + "epoch": 0.1161682843277191, + "grad_norm": 1.4341175556182861, + "learning_rate": 4.83536763003821e-05, + "loss": 5.2463, + "step": 19533 + }, + { + "epoch": 0.1161742316110001, + "grad_norm": 1.248632550239563, + "learning_rate": 4.835350959428582e-05, + "loss": 5.1573, + "step": 19534 + }, + { + "epoch": 0.11618017889428109, + "grad_norm": 1.2873725891113281, + "learning_rate": 4.835334288003707e-05, + "loss": 5.3115, + "step": 19535 + }, + { + "epoch": 0.11618612617756209, + "grad_norm": 1.4359512329101562, + "learning_rate": 4.835317615763591e-05, + "loss": 5.1134, + "step": 19536 + }, + { + "epoch": 0.11619207346084309, + "grad_norm": 1.3092215061187744, + "learning_rate": 4.8353009427082395e-05, + "loss": 5.2955, + "step": 19537 + }, + { + "epoch": 0.11619802074412408, + "grad_norm": 1.292256474494934, + "learning_rate": 4.8352842688376585e-05, + "loss": 5.2163, + "step": 19538 + }, + { + "epoch": 0.11620396802740508, + "grad_norm": 1.2327983379364014, + "learning_rate": 4.8352675941518545e-05, + "loss": 5.2785, + "step": 19539 + }, + { + "epoch": 0.11620991531068608, + "grad_norm": 1.3402459621429443, + "learning_rate": 4.835250918650832e-05, + "loss": 5.2474, + "step": 19540 + }, + { + "epoch": 0.11621586259396707, + "grad_norm": 1.4312702417373657, + "learning_rate": 4.835234242334598e-05, + "loss": 5.1451, + "step": 19541 + }, + { + "epoch": 0.11622180987724808, + "grad_norm": 1.4165308475494385, + "learning_rate": 4.8352175652031576e-05, + "loss": 5.2241, + "step": 19542 + }, + { + "epoch": 0.11622775716052908, + "grad_norm": 1.1984010934829712, + "learning_rate": 4.835200887256517e-05, + "loss": 5.2084, + "step": 19543 + }, + { + "epoch": 0.11623370444381007, + "grad_norm": 1.277029275894165, + "learning_rate": 4.835184208494682e-05, + "loss": 5.1136, + "step": 19544 + }, + { + "epoch": 0.11623965172709107, + "grad_norm": 1.4002219438552856, + "learning_rate": 4.8351675289176586e-05, + "loss": 5.1313, + "step": 19545 + }, + { + "epoch": 0.11624559901037206, + "grad_norm": 1.397129774093628, + "learning_rate": 4.835150848525452e-05, + "loss": 5.2001, + "step": 19546 + }, + { + "epoch": 0.11625154629365306, + "grad_norm": 1.3968653678894043, + "learning_rate": 4.8351341673180686e-05, + "loss": 5.1292, + "step": 19547 + }, + { + "epoch": 0.11625749357693406, + "grad_norm": 1.298600435256958, + "learning_rate": 4.8351174852955125e-05, + "loss": 5.1185, + "step": 19548 + }, + { + "epoch": 0.11626344086021505, + "grad_norm": 1.119382619857788, + "learning_rate": 4.835100802457793e-05, + "loss": 5.2052, + "step": 19549 + }, + { + "epoch": 0.11626938814349605, + "grad_norm": 1.2555358409881592, + "learning_rate": 4.835084118804913e-05, + "loss": 5.2604, + "step": 19550 + }, + { + "epoch": 0.11627533542677705, + "grad_norm": 1.293525218963623, + "learning_rate": 4.835067434336879e-05, + "loss": 5.1402, + "step": 19551 + }, + { + "epoch": 0.11628128271005804, + "grad_norm": 1.3321988582611084, + "learning_rate": 4.8350507490536976e-05, + "loss": 5.0959, + "step": 19552 + }, + { + "epoch": 0.11628722999333904, + "grad_norm": 1.3231252431869507, + "learning_rate": 4.835034062955374e-05, + "loss": 5.0461, + "step": 19553 + }, + { + "epoch": 0.11629317727662004, + "grad_norm": 1.2743831872940063, + "learning_rate": 4.835017376041914e-05, + "loss": 5.1215, + "step": 19554 + }, + { + "epoch": 0.11629912455990103, + "grad_norm": 1.3750208616256714, + "learning_rate": 4.835000688313323e-05, + "loss": 5.0459, + "step": 19555 + }, + { + "epoch": 0.11630507184318203, + "grad_norm": 1.394209861755371, + "learning_rate": 4.834983999769609e-05, + "loss": 5.1577, + "step": 19556 + }, + { + "epoch": 0.11631101912646304, + "grad_norm": 1.2393178939819336, + "learning_rate": 4.834967310410775e-05, + "loss": 5.1217, + "step": 19557 + }, + { + "epoch": 0.11631696640974402, + "grad_norm": 1.2668427228927612, + "learning_rate": 4.834950620236829e-05, + "loss": 5.0266, + "step": 19558 + }, + { + "epoch": 0.11632291369302503, + "grad_norm": 1.4088828563690186, + "learning_rate": 4.834933929247775e-05, + "loss": 4.8089, + "step": 19559 + }, + { + "epoch": 0.11632886097630603, + "grad_norm": 1.2668780088424683, + "learning_rate": 4.83491723744362e-05, + "loss": 5.2791, + "step": 19560 + }, + { + "epoch": 0.11633480825958702, + "grad_norm": 1.3243741989135742, + "learning_rate": 4.834900544824369e-05, + "loss": 5.1743, + "step": 19561 + }, + { + "epoch": 0.11634075554286802, + "grad_norm": 1.497856616973877, + "learning_rate": 4.834883851390029e-05, + "loss": 4.8667, + "step": 19562 + }, + { + "epoch": 0.11634670282614902, + "grad_norm": 1.426867961883545, + "learning_rate": 4.834867157140605e-05, + "loss": 4.9758, + "step": 19563 + }, + { + "epoch": 0.11635265010943001, + "grad_norm": 1.4427236318588257, + "learning_rate": 4.834850462076103e-05, + "loss": 5.45, + "step": 19564 + }, + { + "epoch": 0.11635859739271101, + "grad_norm": 1.4465901851654053, + "learning_rate": 4.834833766196528e-05, + "loss": 5.0877, + "step": 19565 + }, + { + "epoch": 0.11636454467599201, + "grad_norm": 1.76282799243927, + "learning_rate": 4.834817069501888e-05, + "loss": 5.0607, + "step": 19566 + }, + { + "epoch": 0.116370491959273, + "grad_norm": 1.4688469171524048, + "learning_rate": 4.8348003719921864e-05, + "loss": 4.9929, + "step": 19567 + }, + { + "epoch": 0.116376439242554, + "grad_norm": 1.576390266418457, + "learning_rate": 4.834783673667431e-05, + "loss": 5.7283, + "step": 19568 + }, + { + "epoch": 0.116382386525835, + "grad_norm": 1.517745852470398, + "learning_rate": 4.834766974527626e-05, + "loss": 5.3711, + "step": 19569 + }, + { + "epoch": 0.11638833380911599, + "grad_norm": 1.5122108459472656, + "learning_rate": 4.834750274572778e-05, + "loss": 5.6297, + "step": 19570 + }, + { + "epoch": 0.116394281092397, + "grad_norm": 1.9188055992126465, + "learning_rate": 4.8347335738028934e-05, + "loss": 5.0911, + "step": 19571 + }, + { + "epoch": 0.116400228375678, + "grad_norm": 1.7408324480056763, + "learning_rate": 4.834716872217977e-05, + "loss": 5.1396, + "step": 19572 + }, + { + "epoch": 0.11640617565895899, + "grad_norm": 1.7669044733047485, + "learning_rate": 4.834700169818035e-05, + "loss": 5.1463, + "step": 19573 + }, + { + "epoch": 0.11641212294223999, + "grad_norm": 1.7838845252990723, + "learning_rate": 4.834683466603074e-05, + "loss": 5.3486, + "step": 19574 + }, + { + "epoch": 0.11641807022552098, + "grad_norm": 1.8427141904830933, + "learning_rate": 4.834666762573098e-05, + "loss": 5.1454, + "step": 19575 + }, + { + "epoch": 0.11642401750880198, + "grad_norm": 1.8620864152908325, + "learning_rate": 4.8346500577281145e-05, + "loss": 4.9462, + "step": 19576 + }, + { + "epoch": 0.11642996479208298, + "grad_norm": 1.7334544658660889, + "learning_rate": 4.834633352068129e-05, + "loss": 4.9012, + "step": 19577 + }, + { + "epoch": 0.11643591207536397, + "grad_norm": 1.7202188968658447, + "learning_rate": 4.834616645593147e-05, + "loss": 5.2577, + "step": 19578 + }, + { + "epoch": 0.11644185935864497, + "grad_norm": 1.5666993856430054, + "learning_rate": 4.834599938303174e-05, + "loss": 4.9502, + "step": 19579 + }, + { + "epoch": 0.11644780664192597, + "grad_norm": 1.5880829095840454, + "learning_rate": 4.834583230198217e-05, + "loss": 5.1193, + "step": 19580 + }, + { + "epoch": 0.11645375392520696, + "grad_norm": 1.7851444482803345, + "learning_rate": 4.834566521278281e-05, + "loss": 5.1411, + "step": 19581 + }, + { + "epoch": 0.11645970120848796, + "grad_norm": 1.8817992210388184, + "learning_rate": 4.834549811543371e-05, + "loss": 5.1773, + "step": 19582 + }, + { + "epoch": 0.11646564849176896, + "grad_norm": 1.8055325746536255, + "learning_rate": 4.834533100993495e-05, + "loss": 4.8526, + "step": 19583 + }, + { + "epoch": 0.11647159577504995, + "grad_norm": 1.501705527305603, + "learning_rate": 4.834516389628657e-05, + "loss": 4.9943, + "step": 19584 + }, + { + "epoch": 0.11647754305833095, + "grad_norm": 1.8224765062332153, + "learning_rate": 4.8344996774488635e-05, + "loss": 5.3321, + "step": 19585 + }, + { + "epoch": 0.11648349034161196, + "grad_norm": 1.7806826829910278, + "learning_rate": 4.83448296445412e-05, + "loss": 5.1565, + "step": 19586 + }, + { + "epoch": 0.11648943762489294, + "grad_norm": 1.64619779586792, + "learning_rate": 4.8344662506444334e-05, + "loss": 4.9259, + "step": 19587 + }, + { + "epoch": 0.11649538490817395, + "grad_norm": 1.7176555395126343, + "learning_rate": 4.834449536019808e-05, + "loss": 4.9173, + "step": 19588 + }, + { + "epoch": 0.11650133219145495, + "grad_norm": 1.7485530376434326, + "learning_rate": 4.834432820580251e-05, + "loss": 4.9548, + "step": 19589 + }, + { + "epoch": 0.11650727947473594, + "grad_norm": 1.8407695293426514, + "learning_rate": 4.834416104325767e-05, + "loss": 5.5323, + "step": 19590 + }, + { + "epoch": 0.11651322675801694, + "grad_norm": 1.37450110912323, + "learning_rate": 4.834399387256363e-05, + "loss": 5.0058, + "step": 19591 + }, + { + "epoch": 0.11651917404129794, + "grad_norm": 1.6784085035324097, + "learning_rate": 4.834382669372044e-05, + "loss": 5.0886, + "step": 19592 + }, + { + "epoch": 0.11652512132457893, + "grad_norm": 1.9228695631027222, + "learning_rate": 4.834365950672816e-05, + "loss": 5.5382, + "step": 19593 + }, + { + "epoch": 0.11653106860785993, + "grad_norm": 1.7998968362808228, + "learning_rate": 4.834349231158685e-05, + "loss": 5.3286, + "step": 19594 + }, + { + "epoch": 0.11653701589114093, + "grad_norm": 1.9077783823013306, + "learning_rate": 4.8343325108296574e-05, + "loss": 4.9033, + "step": 19595 + }, + { + "epoch": 0.11654296317442192, + "grad_norm": 1.3677197694778442, + "learning_rate": 4.834315789685738e-05, + "loss": 5.4146, + "step": 19596 + }, + { + "epoch": 0.11654891045770292, + "grad_norm": 1.5490330457687378, + "learning_rate": 4.834299067726933e-05, + "loss": 5.8435, + "step": 19597 + }, + { + "epoch": 0.11655485774098392, + "grad_norm": 1.7260395288467407, + "learning_rate": 4.8342823449532484e-05, + "loss": 4.9687, + "step": 19598 + }, + { + "epoch": 0.11656080502426491, + "grad_norm": 1.5140855312347412, + "learning_rate": 4.83426562136469e-05, + "loss": 4.8185, + "step": 19599 + }, + { + "epoch": 0.11656675230754591, + "grad_norm": 1.7183781862258911, + "learning_rate": 4.834248896961263e-05, + "loss": 4.954, + "step": 19600 + }, + { + "epoch": 0.11657269959082692, + "grad_norm": 1.3909941911697388, + "learning_rate": 4.834232171742975e-05, + "loss": 5.3393, + "step": 19601 + }, + { + "epoch": 0.1165786468741079, + "grad_norm": 1.437046766281128, + "learning_rate": 4.83421544570983e-05, + "loss": 5.5486, + "step": 19602 + }, + { + "epoch": 0.11658459415738891, + "grad_norm": 1.4513304233551025, + "learning_rate": 4.8341987188618344e-05, + "loss": 5.6754, + "step": 19603 + }, + { + "epoch": 0.1165905414406699, + "grad_norm": 1.7366830110549927, + "learning_rate": 4.8341819911989936e-05, + "loss": 5.5651, + "step": 19604 + }, + { + "epoch": 0.1165964887239509, + "grad_norm": 1.7084081172943115, + "learning_rate": 4.834165262721315e-05, + "loss": 5.5237, + "step": 19605 + }, + { + "epoch": 0.1166024360072319, + "grad_norm": 1.588749647140503, + "learning_rate": 4.834148533428803e-05, + "loss": 5.5371, + "step": 19606 + }, + { + "epoch": 0.11660838329051289, + "grad_norm": 1.6907262802124023, + "learning_rate": 4.834131803321464e-05, + "loss": 5.3998, + "step": 19607 + }, + { + "epoch": 0.11661433057379389, + "grad_norm": 1.676530122756958, + "learning_rate": 4.834115072399304e-05, + "loss": 5.1636, + "step": 19608 + }, + { + "epoch": 0.11662027785707489, + "grad_norm": 1.6379070281982422, + "learning_rate": 4.834098340662327e-05, + "loss": 5.4196, + "step": 19609 + }, + { + "epoch": 0.11662622514035588, + "grad_norm": 1.6794102191925049, + "learning_rate": 4.8340816081105424e-05, + "loss": 5.3671, + "step": 19610 + }, + { + "epoch": 0.11663217242363688, + "grad_norm": 1.7833147048950195, + "learning_rate": 4.834064874743953e-05, + "loss": 5.3417, + "step": 19611 + }, + { + "epoch": 0.11663811970691788, + "grad_norm": 1.649409532546997, + "learning_rate": 4.834048140562566e-05, + "loss": 5.2781, + "step": 19612 + }, + { + "epoch": 0.11664406699019887, + "grad_norm": 1.6082829236984253, + "learning_rate": 4.834031405566387e-05, + "loss": 5.1188, + "step": 19613 + }, + { + "epoch": 0.11665001427347987, + "grad_norm": 1.6651804447174072, + "learning_rate": 4.834014669755421e-05, + "loss": 5.1683, + "step": 19614 + }, + { + "epoch": 0.11665596155676088, + "grad_norm": 1.715795636177063, + "learning_rate": 4.8339979331296755e-05, + "loss": 5.2491, + "step": 19615 + }, + { + "epoch": 0.11666190884004186, + "grad_norm": 1.6809749603271484, + "learning_rate": 4.8339811956891546e-05, + "loss": 5.0614, + "step": 19616 + }, + { + "epoch": 0.11666785612332287, + "grad_norm": 1.563790202140808, + "learning_rate": 4.833964457433865e-05, + "loss": 5.231, + "step": 19617 + }, + { + "epoch": 0.11667380340660387, + "grad_norm": 1.464647650718689, + "learning_rate": 4.8339477183638136e-05, + "loss": 5.0405, + "step": 19618 + }, + { + "epoch": 0.11667975068988486, + "grad_norm": 1.989701509475708, + "learning_rate": 4.8339309784790043e-05, + "loss": 5.4454, + "step": 19619 + }, + { + "epoch": 0.11668569797316586, + "grad_norm": 2.438558340072632, + "learning_rate": 4.833914237779444e-05, + "loss": 5.7298, + "step": 19620 + }, + { + "epoch": 0.11669164525644686, + "grad_norm": 1.7590994834899902, + "learning_rate": 4.833897496265139e-05, + "loss": 5.4473, + "step": 19621 + }, + { + "epoch": 0.11669759253972785, + "grad_norm": 2.1040074825286865, + "learning_rate": 4.833880753936093e-05, + "loss": 5.2399, + "step": 19622 + }, + { + "epoch": 0.11670353982300885, + "grad_norm": 1.7136433124542236, + "learning_rate": 4.8338640107923146e-05, + "loss": 5.21, + "step": 19623 + }, + { + "epoch": 0.11670948710628985, + "grad_norm": 1.5797784328460693, + "learning_rate": 4.8338472668338074e-05, + "loss": 5.3555, + "step": 19624 + }, + { + "epoch": 0.11671543438957084, + "grad_norm": 1.512645959854126, + "learning_rate": 4.833830522060579e-05, + "loss": 5.4964, + "step": 19625 + }, + { + "epoch": 0.11672138167285184, + "grad_norm": 1.9328651428222656, + "learning_rate": 4.833813776472634e-05, + "loss": 5.9072, + "step": 19626 + }, + { + "epoch": 0.11672732895613284, + "grad_norm": 1.882068395614624, + "learning_rate": 4.8337970300699795e-05, + "loss": 5.4304, + "step": 19627 + }, + { + "epoch": 0.11673327623941383, + "grad_norm": 2.1347815990448, + "learning_rate": 4.83378028285262e-05, + "loss": 5.1286, + "step": 19628 + }, + { + "epoch": 0.11673922352269483, + "grad_norm": 2.0237247943878174, + "learning_rate": 4.833763534820562e-05, + "loss": 5.113, + "step": 19629 + }, + { + "epoch": 0.11674517080597584, + "grad_norm": 1.5656205415725708, + "learning_rate": 4.833746785973811e-05, + "loss": 4.8452, + "step": 19630 + }, + { + "epoch": 0.11675111808925683, + "grad_norm": 2.268324613571167, + "learning_rate": 4.833730036312374e-05, + "loss": 5.7184, + "step": 19631 + }, + { + "epoch": 0.11675706537253783, + "grad_norm": 2.1705756187438965, + "learning_rate": 4.833713285836255e-05, + "loss": 5.6489, + "step": 19632 + }, + { + "epoch": 0.11676301265581882, + "grad_norm": 1.7976182699203491, + "learning_rate": 4.833696534545461e-05, + "loss": 5.7016, + "step": 19633 + }, + { + "epoch": 0.11676895993909982, + "grad_norm": 1.2853381633758545, + "learning_rate": 4.8336797824399976e-05, + "loss": 5.654, + "step": 19634 + }, + { + "epoch": 0.11677490722238082, + "grad_norm": 1.8741413354873657, + "learning_rate": 4.833663029519871e-05, + "loss": 5.6735, + "step": 19635 + }, + { + "epoch": 0.11678085450566181, + "grad_norm": 1.4911704063415527, + "learning_rate": 4.8336462757850864e-05, + "loss": 5.3877, + "step": 19636 + }, + { + "epoch": 0.11678680178894281, + "grad_norm": 1.7979151010513306, + "learning_rate": 4.8336295212356506e-05, + "loss": 5.5677, + "step": 19637 + }, + { + "epoch": 0.11679274907222381, + "grad_norm": 2.036970376968384, + "learning_rate": 4.8336127658715677e-05, + "loss": 5.4768, + "step": 19638 + }, + { + "epoch": 0.1167986963555048, + "grad_norm": 1.9423377513885498, + "learning_rate": 4.833596009692846e-05, + "loss": 5.4021, + "step": 19639 + }, + { + "epoch": 0.1168046436387858, + "grad_norm": 1.5860786437988281, + "learning_rate": 4.8335792526994894e-05, + "loss": 5.3363, + "step": 19640 + }, + { + "epoch": 0.1168105909220668, + "grad_norm": 1.5712209939956665, + "learning_rate": 4.833562494891504e-05, + "loss": 5.432, + "step": 19641 + }, + { + "epoch": 0.11681653820534779, + "grad_norm": 1.3889914751052856, + "learning_rate": 4.833545736268897e-05, + "loss": 5.3272, + "step": 19642 + }, + { + "epoch": 0.1168224854886288, + "grad_norm": 1.607134461402893, + "learning_rate": 4.8335289768316726e-05, + "loss": 5.9617, + "step": 19643 + }, + { + "epoch": 0.1168284327719098, + "grad_norm": 1.6738252639770508, + "learning_rate": 4.8335122165798376e-05, + "loss": 5.6361, + "step": 19644 + }, + { + "epoch": 0.11683438005519078, + "grad_norm": 1.6006174087524414, + "learning_rate": 4.8334954555133974e-05, + "loss": 5.7384, + "step": 19645 + }, + { + "epoch": 0.11684032733847179, + "grad_norm": 1.7018747329711914, + "learning_rate": 4.833478693632358e-05, + "loss": 5.0784, + "step": 19646 + }, + { + "epoch": 0.11684627462175279, + "grad_norm": 1.7542921304702759, + "learning_rate": 4.833461930936726e-05, + "loss": 5.2674, + "step": 19647 + }, + { + "epoch": 0.11685222190503378, + "grad_norm": 1.6434245109558105, + "learning_rate": 4.8334451674265055e-05, + "loss": 4.7117, + "step": 19648 + }, + { + "epoch": 0.11685816918831478, + "grad_norm": 1.7878485918045044, + "learning_rate": 4.8334284031017044e-05, + "loss": 4.8068, + "step": 19649 + }, + { + "epoch": 0.11686411647159578, + "grad_norm": 1.7029922008514404, + "learning_rate": 4.833411637962327e-05, + "loss": 4.9168, + "step": 19650 + }, + { + "epoch": 0.11687006375487677, + "grad_norm": 1.8004266023635864, + "learning_rate": 4.83339487200838e-05, + "loss": 4.9931, + "step": 19651 + }, + { + "epoch": 0.11687601103815777, + "grad_norm": 1.7843881845474243, + "learning_rate": 4.833378105239869e-05, + "loss": 5.0786, + "step": 19652 + }, + { + "epoch": 0.11688195832143877, + "grad_norm": 1.697993278503418, + "learning_rate": 4.833361337656799e-05, + "loss": 5.188, + "step": 19653 + }, + { + "epoch": 0.11688790560471976, + "grad_norm": 1.8484392166137695, + "learning_rate": 4.833344569259177e-05, + "loss": 5.4858, + "step": 19654 + }, + { + "epoch": 0.11689385288800076, + "grad_norm": 1.6850509643554688, + "learning_rate": 4.833327800047009e-05, + "loss": 5.7946, + "step": 19655 + }, + { + "epoch": 0.11689980017128176, + "grad_norm": 1.709845781326294, + "learning_rate": 4.8333110300203e-05, + "loss": 6.0674, + "step": 19656 + }, + { + "epoch": 0.11690574745456275, + "grad_norm": 1.6634660959243774, + "learning_rate": 4.833294259179057e-05, + "loss": 5.8038, + "step": 19657 + }, + { + "epoch": 0.11691169473784375, + "grad_norm": 1.6274930238723755, + "learning_rate": 4.833277487523283e-05, + "loss": 5.6752, + "step": 19658 + }, + { + "epoch": 0.11691764202112476, + "grad_norm": 1.5415219068527222, + "learning_rate": 4.833260715052988e-05, + "loss": 5.4002, + "step": 19659 + }, + { + "epoch": 0.11692358930440575, + "grad_norm": 1.6023998260498047, + "learning_rate": 4.833243941768175e-05, + "loss": 5.2429, + "step": 19660 + }, + { + "epoch": 0.11692953658768675, + "grad_norm": 1.4608384370803833, + "learning_rate": 4.8332271676688515e-05, + "loss": 5.5144, + "step": 19661 + }, + { + "epoch": 0.11693548387096774, + "grad_norm": 1.700076937675476, + "learning_rate": 4.833210392755021e-05, + "loss": 5.6356, + "step": 19662 + }, + { + "epoch": 0.11694143115424874, + "grad_norm": 1.415705919265747, + "learning_rate": 4.833193617026692e-05, + "loss": 5.6977, + "step": 19663 + }, + { + "epoch": 0.11694737843752974, + "grad_norm": 1.620815634727478, + "learning_rate": 4.833176840483868e-05, + "loss": 5.8967, + "step": 19664 + }, + { + "epoch": 0.11695332572081073, + "grad_norm": 1.4221736192703247, + "learning_rate": 4.833160063126558e-05, + "loss": 5.5351, + "step": 19665 + }, + { + "epoch": 0.11695927300409173, + "grad_norm": 1.460254192352295, + "learning_rate": 4.833143284954764e-05, + "loss": 5.327, + "step": 19666 + }, + { + "epoch": 0.11696522028737273, + "grad_norm": 1.8340283632278442, + "learning_rate": 4.833126505968495e-05, + "loss": 5.199, + "step": 19667 + }, + { + "epoch": 0.11697116757065372, + "grad_norm": 1.4036595821380615, + "learning_rate": 4.8331097261677555e-05, + "loss": 5.185, + "step": 19668 + }, + { + "epoch": 0.11697711485393472, + "grad_norm": 1.5454041957855225, + "learning_rate": 4.833092945552551e-05, + "loss": 5.3545, + "step": 19669 + }, + { + "epoch": 0.11698306213721572, + "grad_norm": 1.4965288639068604, + "learning_rate": 4.8330761641228886e-05, + "loss": 5.2993, + "step": 19670 + }, + { + "epoch": 0.11698900942049671, + "grad_norm": 2.4290192127227783, + "learning_rate": 4.833059381878773e-05, + "loss": 5.2738, + "step": 19671 + }, + { + "epoch": 0.11699495670377771, + "grad_norm": 2.502086877822876, + "learning_rate": 4.8330425988202097e-05, + "loss": 5.3218, + "step": 19672 + }, + { + "epoch": 0.11700090398705872, + "grad_norm": 2.1629221439361572, + "learning_rate": 4.833025814947206e-05, + "loss": 5.304, + "step": 19673 + }, + { + "epoch": 0.1170068512703397, + "grad_norm": 2.096604824066162, + "learning_rate": 4.8330090302597675e-05, + "loss": 5.3423, + "step": 19674 + }, + { + "epoch": 0.1170127985536207, + "grad_norm": 2.2843055725097656, + "learning_rate": 4.832992244757899e-05, + "loss": 5.2463, + "step": 19675 + }, + { + "epoch": 0.11701874583690171, + "grad_norm": 2.1538522243499756, + "learning_rate": 4.8329754584416074e-05, + "loss": 5.0529, + "step": 19676 + }, + { + "epoch": 0.1170246931201827, + "grad_norm": 1.763832688331604, + "learning_rate": 4.832958671310898e-05, + "loss": 5.105, + "step": 19677 + }, + { + "epoch": 0.1170306404034637, + "grad_norm": 2.048945426940918, + "learning_rate": 4.832941883365777e-05, + "loss": 5.1724, + "step": 19678 + }, + { + "epoch": 0.1170365876867447, + "grad_norm": 2.324202537536621, + "learning_rate": 4.83292509460625e-05, + "loss": 5.1574, + "step": 19679 + }, + { + "epoch": 0.11704253497002569, + "grad_norm": 2.447587728500366, + "learning_rate": 4.8329083050323235e-05, + "loss": 5.2401, + "step": 19680 + }, + { + "epoch": 0.11704848225330669, + "grad_norm": 2.212921380996704, + "learning_rate": 4.832891514644002e-05, + "loss": 5.1122, + "step": 19681 + }, + { + "epoch": 0.11705442953658769, + "grad_norm": 2.1183717250823975, + "learning_rate": 4.832874723441292e-05, + "loss": 4.985, + "step": 19682 + }, + { + "epoch": 0.11706037681986868, + "grad_norm": 2.1509101390838623, + "learning_rate": 4.8328579314242006e-05, + "loss": 5.1369, + "step": 19683 + }, + { + "epoch": 0.11706632410314968, + "grad_norm": 1.9071851968765259, + "learning_rate": 4.832841138592732e-05, + "loss": 5.0454, + "step": 19684 + }, + { + "epoch": 0.11707227138643068, + "grad_norm": 2.262612819671631, + "learning_rate": 4.8328243449468926e-05, + "loss": 5.0763, + "step": 19685 + }, + { + "epoch": 0.11707821866971167, + "grad_norm": 2.073665142059326, + "learning_rate": 4.8328075504866874e-05, + "loss": 5.0779, + "step": 19686 + }, + { + "epoch": 0.11708416595299267, + "grad_norm": 1.9270633459091187, + "learning_rate": 4.832790755212124e-05, + "loss": 4.8148, + "step": 19687 + }, + { + "epoch": 0.11709011323627368, + "grad_norm": 1.9167968034744263, + "learning_rate": 4.832773959123208e-05, + "loss": 4.8027, + "step": 19688 + }, + { + "epoch": 0.11709606051955466, + "grad_norm": 2.0495805740356445, + "learning_rate": 4.8327571622199444e-05, + "loss": 4.9483, + "step": 19689 + }, + { + "epoch": 0.11710200780283567, + "grad_norm": 2.203997850418091, + "learning_rate": 4.83274036450234e-05, + "loss": 5.1086, + "step": 19690 + }, + { + "epoch": 0.11710795508611666, + "grad_norm": 2.0023131370544434, + "learning_rate": 4.8327235659703984e-05, + "loss": 5.0601, + "step": 19691 + }, + { + "epoch": 0.11711390236939766, + "grad_norm": 2.3212523460388184, + "learning_rate": 4.832706766624128e-05, + "loss": 4.9391, + "step": 19692 + }, + { + "epoch": 0.11711984965267866, + "grad_norm": 2.2633869647979736, + "learning_rate": 4.8326899664635336e-05, + "loss": 5.0262, + "step": 19693 + }, + { + "epoch": 0.11712579693595965, + "grad_norm": 2.2608723640441895, + "learning_rate": 4.832673165488622e-05, + "loss": 4.9814, + "step": 19694 + }, + { + "epoch": 0.11713174421924065, + "grad_norm": 2.0270745754241943, + "learning_rate": 4.8326563636993975e-05, + "loss": 4.9321, + "step": 19695 + }, + { + "epoch": 0.11713769150252165, + "grad_norm": 2.1299290657043457, + "learning_rate": 4.832639561095867e-05, + "loss": 4.8248, + "step": 19696 + }, + { + "epoch": 0.11714363878580264, + "grad_norm": 2.1891887187957764, + "learning_rate": 4.8326227576780355e-05, + "loss": 4.963, + "step": 19697 + }, + { + "epoch": 0.11714958606908364, + "grad_norm": 2.35532546043396, + "learning_rate": 4.8326059534459114e-05, + "loss": 4.8617, + "step": 19698 + }, + { + "epoch": 0.11715553335236464, + "grad_norm": 2.215864658355713, + "learning_rate": 4.8325891483994964e-05, + "loss": 5.1467, + "step": 19699 + }, + { + "epoch": 0.11716148063564563, + "grad_norm": 1.7004871368408203, + "learning_rate": 4.8325723425387996e-05, + "loss": 4.8682, + "step": 19700 + }, + { + "epoch": 0.11716742791892663, + "grad_norm": 2.537426471710205, + "learning_rate": 4.832555535863826e-05, + "loss": 5.0373, + "step": 19701 + }, + { + "epoch": 0.11717337520220764, + "grad_norm": 2.3324837684631348, + "learning_rate": 4.832538728374581e-05, + "loss": 4.9261, + "step": 19702 + }, + { + "epoch": 0.11717932248548862, + "grad_norm": 2.107374906539917, + "learning_rate": 4.832521920071071e-05, + "loss": 5.0036, + "step": 19703 + }, + { + "epoch": 0.11718526976876963, + "grad_norm": 2.0933899879455566, + "learning_rate": 4.8325051109533024e-05, + "loss": 5.086, + "step": 19704 + }, + { + "epoch": 0.11719121705205063, + "grad_norm": 1.9250128269195557, + "learning_rate": 4.8324883010212794e-05, + "loss": 4.9056, + "step": 19705 + }, + { + "epoch": 0.11719716433533162, + "grad_norm": 2.0679538249969482, + "learning_rate": 4.832471490275009e-05, + "loss": 5.0291, + "step": 19706 + }, + { + "epoch": 0.11720311161861262, + "grad_norm": 2.1115055084228516, + "learning_rate": 4.8324546787144974e-05, + "loss": 4.8649, + "step": 19707 + }, + { + "epoch": 0.11720905890189362, + "grad_norm": 2.123899459838867, + "learning_rate": 4.832437866339749e-05, + "loss": 4.9011, + "step": 19708 + }, + { + "epoch": 0.11721500618517461, + "grad_norm": 2.2809536457061768, + "learning_rate": 4.832421053150772e-05, + "loss": 5.1844, + "step": 19709 + }, + { + "epoch": 0.11722095346845561, + "grad_norm": 2.04567551612854, + "learning_rate": 4.83240423914757e-05, + "loss": 4.8685, + "step": 19710 + }, + { + "epoch": 0.11722690075173661, + "grad_norm": 1.5762519836425781, + "learning_rate": 4.8323874243301495e-05, + "loss": 5.4069, + "step": 19711 + }, + { + "epoch": 0.1172328480350176, + "grad_norm": 1.719250202178955, + "learning_rate": 4.832370608698518e-05, + "loss": 5.6127, + "step": 19712 + }, + { + "epoch": 0.1172387953182986, + "grad_norm": 1.6808120012283325, + "learning_rate": 4.8323537922526785e-05, + "loss": 5.5401, + "step": 19713 + }, + { + "epoch": 0.1172447426015796, + "grad_norm": 1.6794480085372925, + "learning_rate": 4.832336974992639e-05, + "loss": 5.6679, + "step": 19714 + }, + { + "epoch": 0.11725068988486059, + "grad_norm": 1.7805535793304443, + "learning_rate": 4.832320156918405e-05, + "loss": 5.5025, + "step": 19715 + }, + { + "epoch": 0.1172566371681416, + "grad_norm": 2.1433472633361816, + "learning_rate": 4.832303338029982e-05, + "loss": 5.2425, + "step": 19716 + }, + { + "epoch": 0.1172625844514226, + "grad_norm": 1.5449565649032593, + "learning_rate": 4.832286518327376e-05, + "loss": 5.3278, + "step": 19717 + }, + { + "epoch": 0.11726853173470358, + "grad_norm": 1.7341786623001099, + "learning_rate": 4.832269697810592e-05, + "loss": 5.3393, + "step": 19718 + }, + { + "epoch": 0.11727447901798459, + "grad_norm": 1.4936028718948364, + "learning_rate": 4.832252876479638e-05, + "loss": 5.0499, + "step": 19719 + }, + { + "epoch": 0.11728042630126558, + "grad_norm": 1.7648371458053589, + "learning_rate": 4.832236054334518e-05, + "loss": 5.3585, + "step": 19720 + }, + { + "epoch": 0.11728637358454658, + "grad_norm": 1.8131940364837646, + "learning_rate": 4.832219231375238e-05, + "loss": 5.2496, + "step": 19721 + }, + { + "epoch": 0.11729232086782758, + "grad_norm": 1.5939579010009766, + "learning_rate": 4.832202407601806e-05, + "loss": 5.2294, + "step": 19722 + }, + { + "epoch": 0.11729826815110857, + "grad_norm": 1.6752222776412964, + "learning_rate": 4.832185583014225e-05, + "loss": 5.2679, + "step": 19723 + }, + { + "epoch": 0.11730421543438957, + "grad_norm": 1.4784640073776245, + "learning_rate": 4.832168757612502e-05, + "loss": 5.1567, + "step": 19724 + }, + { + "epoch": 0.11731016271767057, + "grad_norm": 1.5112851858139038, + "learning_rate": 4.8321519313966436e-05, + "loss": 5.0304, + "step": 19725 + }, + { + "epoch": 0.11731611000095156, + "grad_norm": 1.5895473957061768, + "learning_rate": 4.832135104366654e-05, + "loss": 5.0681, + "step": 19726 + }, + { + "epoch": 0.11732205728423256, + "grad_norm": 1.510641098022461, + "learning_rate": 4.832118276522541e-05, + "loss": 5.0667, + "step": 19727 + }, + { + "epoch": 0.11732800456751356, + "grad_norm": 1.7403017282485962, + "learning_rate": 4.83210144786431e-05, + "loss": 4.9199, + "step": 19728 + }, + { + "epoch": 0.11733395185079455, + "grad_norm": 2.239452600479126, + "learning_rate": 4.832084618391966e-05, + "loss": 5.2846, + "step": 19729 + }, + { + "epoch": 0.11733989913407555, + "grad_norm": 1.977001428604126, + "learning_rate": 4.8320677881055154e-05, + "loss": 4.9573, + "step": 19730 + }, + { + "epoch": 0.11734584641735656, + "grad_norm": 2.2819485664367676, + "learning_rate": 4.8320509570049633e-05, + "loss": 4.6549, + "step": 19731 + }, + { + "epoch": 0.11735179370063754, + "grad_norm": 2.3943941593170166, + "learning_rate": 4.832034125090317e-05, + "loss": 4.8411, + "step": 19732 + }, + { + "epoch": 0.11735774098391855, + "grad_norm": 2.5439767837524414, + "learning_rate": 4.832017292361582e-05, + "loss": 4.7305, + "step": 19733 + }, + { + "epoch": 0.11736368826719955, + "grad_norm": 2.21797251701355, + "learning_rate": 4.8320004588187636e-05, + "loss": 4.8963, + "step": 19734 + }, + { + "epoch": 0.11736963555048054, + "grad_norm": 1.9822254180908203, + "learning_rate": 4.831983624461868e-05, + "loss": 4.8062, + "step": 19735 + }, + { + "epoch": 0.11737558283376154, + "grad_norm": 2.56172513961792, + "learning_rate": 4.8319667892909004e-05, + "loss": 4.6495, + "step": 19736 + }, + { + "epoch": 0.11738153011704254, + "grad_norm": 2.3328988552093506, + "learning_rate": 4.831949953305868e-05, + "loss": 4.3587, + "step": 19737 + }, + { + "epoch": 0.11738747740032353, + "grad_norm": 2.4720728397369385, + "learning_rate": 4.831933116506775e-05, + "loss": 4.5648, + "step": 19738 + }, + { + "epoch": 0.11739342468360453, + "grad_norm": 2.3738696575164795, + "learning_rate": 4.831916278893629e-05, + "loss": 4.391, + "step": 19739 + }, + { + "epoch": 0.11739937196688553, + "grad_norm": 2.400050640106201, + "learning_rate": 4.831899440466435e-05, + "loss": 4.5792, + "step": 19740 + }, + { + "epoch": 0.11740531925016652, + "grad_norm": 1.7596909999847412, + "learning_rate": 4.831882601225199e-05, + "loss": 4.8026, + "step": 19741 + }, + { + "epoch": 0.11741126653344752, + "grad_norm": 2.2190558910369873, + "learning_rate": 4.831865761169927e-05, + "loss": 4.578, + "step": 19742 + }, + { + "epoch": 0.11741721381672852, + "grad_norm": 2.468982458114624, + "learning_rate": 4.831848920300624e-05, + "loss": 4.3132, + "step": 19743 + }, + { + "epoch": 0.11742316110000951, + "grad_norm": 2.1495306491851807, + "learning_rate": 4.831832078617298e-05, + "loss": 4.5307, + "step": 19744 + }, + { + "epoch": 0.11742910838329051, + "grad_norm": 2.2298312187194824, + "learning_rate": 4.831815236119953e-05, + "loss": 4.3435, + "step": 19745 + }, + { + "epoch": 0.11743505566657152, + "grad_norm": 2.0968551635742188, + "learning_rate": 4.831798392808595e-05, + "loss": 4.4348, + "step": 19746 + }, + { + "epoch": 0.1174410029498525, + "grad_norm": 2.2520592212677, + "learning_rate": 4.831781548683231e-05, + "loss": 4.4347, + "step": 19747 + }, + { + "epoch": 0.1174469502331335, + "grad_norm": 2.5319058895111084, + "learning_rate": 4.8317647037438655e-05, + "loss": 4.3817, + "step": 19748 + }, + { + "epoch": 0.1174528975164145, + "grad_norm": 2.186539649963379, + "learning_rate": 4.8317478579905054e-05, + "loss": 4.6415, + "step": 19749 + }, + { + "epoch": 0.1174588447996955, + "grad_norm": 2.472963571548462, + "learning_rate": 4.8317310114231554e-05, + "loss": 4.4495, + "step": 19750 + }, + { + "epoch": 0.1174647920829765, + "grad_norm": 2.3692901134490967, + "learning_rate": 4.831714164041823e-05, + "loss": 4.3571, + "step": 19751 + }, + { + "epoch": 0.11747073936625749, + "grad_norm": 1.8001717329025269, + "learning_rate": 4.831697315846513e-05, + "loss": 5.3843, + "step": 19752 + }, + { + "epoch": 0.11747668664953849, + "grad_norm": 1.6087725162506104, + "learning_rate": 4.8316804668372315e-05, + "loss": 5.7155, + "step": 19753 + }, + { + "epoch": 0.11748263393281949, + "grad_norm": 1.5348961353302002, + "learning_rate": 4.8316636170139845e-05, + "loss": 4.8697, + "step": 19754 + }, + { + "epoch": 0.11748858121610048, + "grad_norm": 1.790076494216919, + "learning_rate": 4.831646766376778e-05, + "loss": 5.708, + "step": 19755 + }, + { + "epoch": 0.11749452849938148, + "grad_norm": 1.8615236282348633, + "learning_rate": 4.831629914925617e-05, + "loss": 5.3669, + "step": 19756 + }, + { + "epoch": 0.11750047578266248, + "grad_norm": 1.5969476699829102, + "learning_rate": 4.8316130626605096e-05, + "loss": 5.4041, + "step": 19757 + }, + { + "epoch": 0.11750642306594347, + "grad_norm": 1.5471712350845337, + "learning_rate": 4.8315962095814584e-05, + "loss": 5.5293, + "step": 19758 + }, + { + "epoch": 0.11751237034922447, + "grad_norm": 1.6281818151474, + "learning_rate": 4.831579355688472e-05, + "loss": 5.51, + "step": 19759 + }, + { + "epoch": 0.11751831763250548, + "grad_norm": 1.5264689922332764, + "learning_rate": 4.831562500981555e-05, + "loss": 4.9906, + "step": 19760 + }, + { + "epoch": 0.11752426491578646, + "grad_norm": 1.8446382284164429, + "learning_rate": 4.8315456454607145e-05, + "loss": 4.8351, + "step": 19761 + }, + { + "epoch": 0.11753021219906747, + "grad_norm": 2.0462918281555176, + "learning_rate": 4.8315287891259545e-05, + "loss": 4.7906, + "step": 19762 + }, + { + "epoch": 0.11753615948234847, + "grad_norm": 1.664975643157959, + "learning_rate": 4.831511931977282e-05, + "loss": 5.4149, + "step": 19763 + }, + { + "epoch": 0.11754210676562946, + "grad_norm": 1.8824998140335083, + "learning_rate": 4.831495074014703e-05, + "loss": 5.2587, + "step": 19764 + }, + { + "epoch": 0.11754805404891046, + "grad_norm": 1.6167455911636353, + "learning_rate": 4.8314782152382235e-05, + "loss": 5.3213, + "step": 19765 + }, + { + "epoch": 0.11755400133219146, + "grad_norm": 1.686562180519104, + "learning_rate": 4.831461355647848e-05, + "loss": 5.3497, + "step": 19766 + }, + { + "epoch": 0.11755994861547245, + "grad_norm": 1.7332249879837036, + "learning_rate": 4.831444495243584e-05, + "loss": 5.3139, + "step": 19767 + }, + { + "epoch": 0.11756589589875345, + "grad_norm": 1.6482213735580444, + "learning_rate": 4.8314276340254375e-05, + "loss": 5.5488, + "step": 19768 + }, + { + "epoch": 0.11757184318203445, + "grad_norm": 1.6714067459106445, + "learning_rate": 4.8314107719934134e-05, + "loss": 4.7354, + "step": 19769 + }, + { + "epoch": 0.11757779046531544, + "grad_norm": 1.5826655626296997, + "learning_rate": 4.8313939091475166e-05, + "loss": 5.5232, + "step": 19770 + }, + { + "epoch": 0.11758373774859644, + "grad_norm": 1.4177565574645996, + "learning_rate": 4.831377045487756e-05, + "loss": 5.4262, + "step": 19771 + }, + { + "epoch": 0.11758968503187744, + "grad_norm": 1.4056715965270996, + "learning_rate": 4.831360181014135e-05, + "loss": 5.6306, + "step": 19772 + }, + { + "epoch": 0.11759563231515843, + "grad_norm": 1.7903814315795898, + "learning_rate": 4.83134331572666e-05, + "loss": 4.5016, + "step": 19773 + }, + { + "epoch": 0.11760157959843943, + "grad_norm": 1.8719782829284668, + "learning_rate": 4.831326449625337e-05, + "loss": 4.3561, + "step": 19774 + }, + { + "epoch": 0.11760752688172044, + "grad_norm": 2.0182130336761475, + "learning_rate": 4.831309582710173e-05, + "loss": 4.3988, + "step": 19775 + }, + { + "epoch": 0.11761347416500142, + "grad_norm": 1.828475832939148, + "learning_rate": 4.8312927149811726e-05, + "loss": 4.4127, + "step": 19776 + }, + { + "epoch": 0.11761942144828243, + "grad_norm": 1.8332375288009644, + "learning_rate": 4.831275846438341e-05, + "loss": 4.3285, + "step": 19777 + }, + { + "epoch": 0.11762536873156341, + "grad_norm": 1.7542626857757568, + "learning_rate": 4.831258977081686e-05, + "loss": 5.4412, + "step": 19778 + }, + { + "epoch": 0.11763131601484442, + "grad_norm": 1.9277591705322266, + "learning_rate": 4.831242106911212e-05, + "loss": 4.1537, + "step": 19779 + }, + { + "epoch": 0.11763726329812542, + "grad_norm": 1.943296194076538, + "learning_rate": 4.8312252359269265e-05, + "loss": 4.448, + "step": 19780 + }, + { + "epoch": 0.11764321058140641, + "grad_norm": 1.8032363653182983, + "learning_rate": 4.831208364128834e-05, + "loss": 4.9847, + "step": 19781 + }, + { + "epoch": 0.11764915786468741, + "grad_norm": 1.9383130073547363, + "learning_rate": 4.83119149151694e-05, + "loss": 4.7231, + "step": 19782 + }, + { + "epoch": 0.11765510514796841, + "grad_norm": 1.8854987621307373, + "learning_rate": 4.831174618091252e-05, + "loss": 4.1493, + "step": 19783 + }, + { + "epoch": 0.1176610524312494, + "grad_norm": 1.932180404663086, + "learning_rate": 4.831157743851775e-05, + "loss": 4.0519, + "step": 19784 + }, + { + "epoch": 0.1176669997145304, + "grad_norm": 1.885292887687683, + "learning_rate": 4.831140868798514e-05, + "loss": 4.1593, + "step": 19785 + }, + { + "epoch": 0.1176729469978114, + "grad_norm": 1.8257746696472168, + "learning_rate": 4.8311239929314764e-05, + "loss": 4.3896, + "step": 19786 + }, + { + "epoch": 0.11767889428109239, + "grad_norm": 1.9383732080459595, + "learning_rate": 4.831107116250667e-05, + "loss": 4.1973, + "step": 19787 + }, + { + "epoch": 0.1176848415643734, + "grad_norm": 1.9942466020584106, + "learning_rate": 4.831090238756093e-05, + "loss": 4.3542, + "step": 19788 + }, + { + "epoch": 0.1176907888476544, + "grad_norm": 1.5551074743270874, + "learning_rate": 4.831073360447759e-05, + "loss": 4.9338, + "step": 19789 + }, + { + "epoch": 0.11769673613093538, + "grad_norm": 1.5898525714874268, + "learning_rate": 4.831056481325672e-05, + "loss": 4.8582, + "step": 19790 + }, + { + "epoch": 0.11770268341421639, + "grad_norm": 1.7175228595733643, + "learning_rate": 4.831039601389836e-05, + "loss": 4.6618, + "step": 19791 + }, + { + "epoch": 0.11770863069749739, + "grad_norm": 2.3165528774261475, + "learning_rate": 4.8310227206402594e-05, + "loss": 4.8579, + "step": 19792 + }, + { + "epoch": 0.11771457798077838, + "grad_norm": 1.4406440258026123, + "learning_rate": 4.8310058390769464e-05, + "loss": 5.6443, + "step": 19793 + }, + { + "epoch": 0.11772052526405938, + "grad_norm": 1.6670812368392944, + "learning_rate": 4.8309889566999037e-05, + "loss": 5.2096, + "step": 19794 + }, + { + "epoch": 0.11772647254734038, + "grad_norm": 1.6150201559066772, + "learning_rate": 4.8309720735091354e-05, + "loss": 5.2055, + "step": 19795 + }, + { + "epoch": 0.11773241983062137, + "grad_norm": 1.7714163064956665, + "learning_rate": 4.83095518950465e-05, + "loss": 5.9145, + "step": 19796 + }, + { + "epoch": 0.11773836711390237, + "grad_norm": 1.3608043193817139, + "learning_rate": 4.8309383046864526e-05, + "loss": 5.1546, + "step": 19797 + }, + { + "epoch": 0.11774431439718337, + "grad_norm": 1.2962807416915894, + "learning_rate": 4.830921419054548e-05, + "loss": 5.3574, + "step": 19798 + }, + { + "epoch": 0.11775026168046436, + "grad_norm": 2.0007364749908447, + "learning_rate": 4.8309045326089434e-05, + "loss": 5.0939, + "step": 19799 + }, + { + "epoch": 0.11775620896374536, + "grad_norm": 1.6526695489883423, + "learning_rate": 4.830887645349644e-05, + "loss": 5.7498, + "step": 19800 + }, + { + "epoch": 0.11776215624702636, + "grad_norm": 1.4990460872650146, + "learning_rate": 4.830870757276655e-05, + "loss": 5.2728, + "step": 19801 + }, + { + "epoch": 0.11776810353030735, + "grad_norm": 2.182511806488037, + "learning_rate": 4.830853868389984e-05, + "loss": 5.1598, + "step": 19802 + }, + { + "epoch": 0.11777405081358835, + "grad_norm": 2.515284538269043, + "learning_rate": 4.8308369786896354e-05, + "loss": 5.1378, + "step": 19803 + }, + { + "epoch": 0.11777999809686936, + "grad_norm": 1.9783490896224976, + "learning_rate": 4.830820088175616e-05, + "loss": 4.9242, + "step": 19804 + }, + { + "epoch": 0.11778594538015034, + "grad_norm": 1.790901780128479, + "learning_rate": 4.8308031968479315e-05, + "loss": 5.1156, + "step": 19805 + }, + { + "epoch": 0.11779189266343135, + "grad_norm": 1.751846432685852, + "learning_rate": 4.830786304706587e-05, + "loss": 5.2306, + "step": 19806 + }, + { + "epoch": 0.11779783994671233, + "grad_norm": 1.588497519493103, + "learning_rate": 4.83076941175159e-05, + "loss": 5.3987, + "step": 19807 + }, + { + "epoch": 0.11780378722999334, + "grad_norm": 1.9150582551956177, + "learning_rate": 4.830752517982945e-05, + "loss": 4.977, + "step": 19808 + }, + { + "epoch": 0.11780973451327434, + "grad_norm": 1.706708312034607, + "learning_rate": 4.8307356234006584e-05, + "loss": 5.0455, + "step": 19809 + }, + { + "epoch": 0.11781568179655533, + "grad_norm": 1.9373780488967896, + "learning_rate": 4.830718728004736e-05, + "loss": 5.0547, + "step": 19810 + }, + { + "epoch": 0.11782162907983633, + "grad_norm": 1.6948046684265137, + "learning_rate": 4.830701831795184e-05, + "loss": 5.0943, + "step": 19811 + }, + { + "epoch": 0.11782757636311733, + "grad_norm": 1.630083680152893, + "learning_rate": 4.8306849347720087e-05, + "loss": 5.6369, + "step": 19812 + }, + { + "epoch": 0.11783352364639832, + "grad_norm": 1.4906461238861084, + "learning_rate": 4.830668036935214e-05, + "loss": 5.2921, + "step": 19813 + }, + { + "epoch": 0.11783947092967932, + "grad_norm": 1.6434717178344727, + "learning_rate": 4.8306511382848076e-05, + "loss": 5.3473, + "step": 19814 + }, + { + "epoch": 0.11784541821296032, + "grad_norm": 1.5606834888458252, + "learning_rate": 4.8306342388207956e-05, + "loss": 5.3031, + "step": 19815 + }, + { + "epoch": 0.11785136549624131, + "grad_norm": 2.157352924346924, + "learning_rate": 4.830617338543183e-05, + "loss": 4.4939, + "step": 19816 + }, + { + "epoch": 0.11785731277952231, + "grad_norm": 2.49686598777771, + "learning_rate": 4.830600437451975e-05, + "loss": 4.506, + "step": 19817 + }, + { + "epoch": 0.11786326006280332, + "grad_norm": 1.943969964981079, + "learning_rate": 4.830583535547179e-05, + "loss": 4.411, + "step": 19818 + }, + { + "epoch": 0.1178692073460843, + "grad_norm": 1.9092329740524292, + "learning_rate": 4.830566632828801e-05, + "loss": 4.4121, + "step": 19819 + }, + { + "epoch": 0.1178751546293653, + "grad_norm": 1.7568551301956177, + "learning_rate": 4.830549729296846e-05, + "loss": 4.317, + "step": 19820 + }, + { + "epoch": 0.11788110191264631, + "grad_norm": 1.788150429725647, + "learning_rate": 4.83053282495132e-05, + "loss": 4.2928, + "step": 19821 + }, + { + "epoch": 0.1178870491959273, + "grad_norm": 1.9792863130569458, + "learning_rate": 4.830515919792229e-05, + "loss": 4.3219, + "step": 19822 + }, + { + "epoch": 0.1178929964792083, + "grad_norm": 2.2407681941986084, + "learning_rate": 4.8304990138195795e-05, + "loss": 4.296, + "step": 19823 + }, + { + "epoch": 0.1178989437624893, + "grad_norm": 1.993288516998291, + "learning_rate": 4.830482107033377e-05, + "loss": 4.2922, + "step": 19824 + }, + { + "epoch": 0.11790489104577029, + "grad_norm": 2.1966097354888916, + "learning_rate": 4.8304651994336264e-05, + "loss": 4.1215, + "step": 19825 + }, + { + "epoch": 0.11791083832905129, + "grad_norm": 1.569989562034607, + "learning_rate": 4.8304482910203345e-05, + "loss": 5.5432, + "step": 19826 + }, + { + "epoch": 0.11791678561233229, + "grad_norm": 1.522828459739685, + "learning_rate": 4.8304313817935075e-05, + "loss": 5.465, + "step": 19827 + }, + { + "epoch": 0.11792273289561328, + "grad_norm": 1.9455969333648682, + "learning_rate": 4.830414471753151e-05, + "loss": 5.1462, + "step": 19828 + }, + { + "epoch": 0.11792868017889428, + "grad_norm": 1.8587162494659424, + "learning_rate": 4.830397560899271e-05, + "loss": 5.1987, + "step": 19829 + }, + { + "epoch": 0.11793462746217528, + "grad_norm": 2.1671674251556396, + "learning_rate": 4.830380649231873e-05, + "loss": 5.3333, + "step": 19830 + }, + { + "epoch": 0.11794057474545627, + "grad_norm": 1.8267066478729248, + "learning_rate": 4.8303637367509636e-05, + "loss": 5.5306, + "step": 19831 + }, + { + "epoch": 0.11794652202873727, + "grad_norm": 1.80419921875, + "learning_rate": 4.830346823456548e-05, + "loss": 5.3077, + "step": 19832 + }, + { + "epoch": 0.11795246931201828, + "grad_norm": 1.9116721153259277, + "learning_rate": 4.830329909348632e-05, + "loss": 4.8531, + "step": 19833 + }, + { + "epoch": 0.11795841659529926, + "grad_norm": 1.9208347797393799, + "learning_rate": 4.830312994427223e-05, + "loss": 4.9645, + "step": 19834 + }, + { + "epoch": 0.11796436387858027, + "grad_norm": 1.8385374546051025, + "learning_rate": 4.8302960786923246e-05, + "loss": 4.7095, + "step": 19835 + }, + { + "epoch": 0.11797031116186125, + "grad_norm": 1.9271587133407593, + "learning_rate": 4.830279162143945e-05, + "loss": 4.5788, + "step": 19836 + }, + { + "epoch": 0.11797625844514226, + "grad_norm": 2.0168333053588867, + "learning_rate": 4.8302622447820885e-05, + "loss": 4.7595, + "step": 19837 + }, + { + "epoch": 0.11798220572842326, + "grad_norm": 1.9674837589263916, + "learning_rate": 4.8302453266067616e-05, + "loss": 4.674, + "step": 19838 + }, + { + "epoch": 0.11798815301170425, + "grad_norm": 1.944601058959961, + "learning_rate": 4.830228407617969e-05, + "loss": 4.6683, + "step": 19839 + }, + { + "epoch": 0.11799410029498525, + "grad_norm": 1.8970340490341187, + "learning_rate": 4.83021148781572e-05, + "loss": 5.2577, + "step": 19840 + }, + { + "epoch": 0.11800004757826625, + "grad_norm": 2.035505533218384, + "learning_rate": 4.8301945672000164e-05, + "loss": 4.7872, + "step": 19841 + }, + { + "epoch": 0.11800599486154724, + "grad_norm": 2.4211058616638184, + "learning_rate": 4.830177645770867e-05, + "loss": 4.9424, + "step": 19842 + }, + { + "epoch": 0.11801194214482824, + "grad_norm": 2.080132484436035, + "learning_rate": 4.830160723528276e-05, + "loss": 4.7908, + "step": 19843 + }, + { + "epoch": 0.11801788942810924, + "grad_norm": 3.5975728034973145, + "learning_rate": 4.83014380047225e-05, + "loss": 5.3434, + "step": 19844 + }, + { + "epoch": 0.11802383671139023, + "grad_norm": 1.6917449235916138, + "learning_rate": 4.830126876602795e-05, + "loss": 5.2593, + "step": 19845 + }, + { + "epoch": 0.11802978399467123, + "grad_norm": 1.8179433345794678, + "learning_rate": 4.8301099519199173e-05, + "loss": 5.9407, + "step": 19846 + }, + { + "epoch": 0.11803573127795224, + "grad_norm": 1.652653694152832, + "learning_rate": 4.8300930264236216e-05, + "loss": 5.505, + "step": 19847 + }, + { + "epoch": 0.11804167856123322, + "grad_norm": 1.6400798559188843, + "learning_rate": 4.830076100113915e-05, + "loss": 5.7281, + "step": 19848 + }, + { + "epoch": 0.11804762584451423, + "grad_norm": 1.865049123764038, + "learning_rate": 4.830059172990802e-05, + "loss": 5.4562, + "step": 19849 + }, + { + "epoch": 0.11805357312779523, + "grad_norm": 1.68345308303833, + "learning_rate": 4.8300422450542906e-05, + "loss": 5.3027, + "step": 19850 + }, + { + "epoch": 0.11805952041107622, + "grad_norm": 2.1790804862976074, + "learning_rate": 4.8300253163043855e-05, + "loss": 4.5531, + "step": 19851 + }, + { + "epoch": 0.11806546769435722, + "grad_norm": 2.63421368598938, + "learning_rate": 4.8300083867410915e-05, + "loss": 4.0978, + "step": 19852 + }, + { + "epoch": 0.11807141497763822, + "grad_norm": 1.8692448139190674, + "learning_rate": 4.829991456364417e-05, + "loss": 5.5482, + "step": 19853 + }, + { + "epoch": 0.11807736226091921, + "grad_norm": 1.684128761291504, + "learning_rate": 4.829974525174365e-05, + "loss": 5.5612, + "step": 19854 + }, + { + "epoch": 0.11808330954420021, + "grad_norm": 1.5720278024673462, + "learning_rate": 4.829957593170944e-05, + "loss": 5.6787, + "step": 19855 + }, + { + "epoch": 0.11808925682748121, + "grad_norm": 1.834423303604126, + "learning_rate": 4.829940660354159e-05, + "loss": 4.5591, + "step": 19856 + }, + { + "epoch": 0.1180952041107622, + "grad_norm": 1.7370680570602417, + "learning_rate": 4.829923726724015e-05, + "loss": 5.1643, + "step": 19857 + }, + { + "epoch": 0.1181011513940432, + "grad_norm": 2.1546318531036377, + "learning_rate": 4.829906792280519e-05, + "loss": 4.5788, + "step": 19858 + }, + { + "epoch": 0.1181070986773242, + "grad_norm": 2.5604169368743896, + "learning_rate": 4.829889857023677e-05, + "loss": 3.1948, + "step": 19859 + }, + { + "epoch": 0.11811304596060519, + "grad_norm": 2.072169780731201, + "learning_rate": 4.829872920953494e-05, + "loss": 3.9707, + "step": 19860 + }, + { + "epoch": 0.1181189932438862, + "grad_norm": 1.7981303930282593, + "learning_rate": 4.829855984069976e-05, + "loss": 5.8413, + "step": 19861 + }, + { + "epoch": 0.1181249405271672, + "grad_norm": 1.621327519416809, + "learning_rate": 4.8298390463731305e-05, + "loss": 5.4867, + "step": 19862 + }, + { + "epoch": 0.11813088781044818, + "grad_norm": 1.5245294570922852, + "learning_rate": 4.829822107862962e-05, + "loss": 5.7148, + "step": 19863 + }, + { + "epoch": 0.11813683509372919, + "grad_norm": 2.2656896114349365, + "learning_rate": 4.8298051685394765e-05, + "loss": 5.6678, + "step": 19864 + }, + { + "epoch": 0.11814278237701017, + "grad_norm": 1.8529094457626343, + "learning_rate": 4.8297882284026805e-05, + "loss": 5.4445, + "step": 19865 + }, + { + "epoch": 0.11814872966029118, + "grad_norm": 1.5151565074920654, + "learning_rate": 4.829771287452579e-05, + "loss": 5.2794, + "step": 19866 + }, + { + "epoch": 0.11815467694357218, + "grad_norm": 1.8492248058319092, + "learning_rate": 4.829754345689178e-05, + "loss": 5.0797, + "step": 19867 + }, + { + "epoch": 0.11816062422685317, + "grad_norm": 2.7612802982330322, + "learning_rate": 4.829737403112484e-05, + "loss": 5.1486, + "step": 19868 + }, + { + "epoch": 0.11816657151013417, + "grad_norm": 1.9457459449768066, + "learning_rate": 4.8297204597225035e-05, + "loss": 5.6507, + "step": 19869 + }, + { + "epoch": 0.11817251879341517, + "grad_norm": 1.6429107189178467, + "learning_rate": 4.829703515519242e-05, + "loss": 5.8414, + "step": 19870 + }, + { + "epoch": 0.11817846607669616, + "grad_norm": 1.556187391281128, + "learning_rate": 4.829686570502704e-05, + "loss": 5.9028, + "step": 19871 + }, + { + "epoch": 0.11818441335997716, + "grad_norm": 1.451532006263733, + "learning_rate": 4.8296696246728965e-05, + "loss": 5.8497, + "step": 19872 + }, + { + "epoch": 0.11819036064325816, + "grad_norm": 1.7325583696365356, + "learning_rate": 4.8296526780298256e-05, + "loss": 5.3531, + "step": 19873 + }, + { + "epoch": 0.11819630792653915, + "grad_norm": 1.784332275390625, + "learning_rate": 4.829635730573497e-05, + "loss": 5.6025, + "step": 19874 + }, + { + "epoch": 0.11820225520982015, + "grad_norm": 1.6109933853149414, + "learning_rate": 4.829618782303917e-05, + "loss": 5.5626, + "step": 19875 + }, + { + "epoch": 0.11820820249310116, + "grad_norm": 1.6639639139175415, + "learning_rate": 4.8296018332210905e-05, + "loss": 5.5679, + "step": 19876 + }, + { + "epoch": 0.11821414977638214, + "grad_norm": 1.8205533027648926, + "learning_rate": 4.829584883325025e-05, + "loss": 5.448, + "step": 19877 + }, + { + "epoch": 0.11822009705966315, + "grad_norm": 1.6450576782226562, + "learning_rate": 4.829567932615725e-05, + "loss": 5.5966, + "step": 19878 + }, + { + "epoch": 0.11822604434294415, + "grad_norm": 1.456151008605957, + "learning_rate": 4.829550981093196e-05, + "loss": 5.5194, + "step": 19879 + }, + { + "epoch": 0.11823199162622514, + "grad_norm": 1.6064491271972656, + "learning_rate": 4.829534028757446e-05, + "loss": 5.6929, + "step": 19880 + }, + { + "epoch": 0.11823793890950614, + "grad_norm": 1.438132405281067, + "learning_rate": 4.829517075608479e-05, + "loss": 5.6738, + "step": 19881 + }, + { + "epoch": 0.11824388619278714, + "grad_norm": 2.503048896789551, + "learning_rate": 4.8295001216463024e-05, + "loss": 4.9929, + "step": 19882 + }, + { + "epoch": 0.11824983347606813, + "grad_norm": 2.3379812240600586, + "learning_rate": 4.829483166870921e-05, + "loss": 4.7947, + "step": 19883 + }, + { + "epoch": 0.11825578075934913, + "grad_norm": 2.055328130722046, + "learning_rate": 4.829466211282341e-05, + "loss": 5.3265, + "step": 19884 + }, + { + "epoch": 0.11826172804263013, + "grad_norm": 1.7393126487731934, + "learning_rate": 4.829449254880569e-05, + "loss": 5.0483, + "step": 19885 + }, + { + "epoch": 0.11826767532591112, + "grad_norm": 2.3054347038269043, + "learning_rate": 4.829432297665609e-05, + "loss": 4.9002, + "step": 19886 + }, + { + "epoch": 0.11827362260919212, + "grad_norm": 2.434323310852051, + "learning_rate": 4.82941533963747e-05, + "loss": 4.8013, + "step": 19887 + }, + { + "epoch": 0.11827956989247312, + "grad_norm": 2.0834875106811523, + "learning_rate": 4.829398380796155e-05, + "loss": 4.786, + "step": 19888 + }, + { + "epoch": 0.11828551717575411, + "grad_norm": 1.6682358980178833, + "learning_rate": 4.829381421141671e-05, + "loss": 5.6843, + "step": 19889 + }, + { + "epoch": 0.11829146445903511, + "grad_norm": 1.8787375688552856, + "learning_rate": 4.829364460674025e-05, + "loss": 5.5191, + "step": 19890 + }, + { + "epoch": 0.11829741174231612, + "grad_norm": 1.7496438026428223, + "learning_rate": 4.829347499393221e-05, + "loss": 5.6968, + "step": 19891 + }, + { + "epoch": 0.1183033590255971, + "grad_norm": 1.5585973262786865, + "learning_rate": 4.829330537299266e-05, + "loss": 5.5588, + "step": 19892 + }, + { + "epoch": 0.1183093063088781, + "grad_norm": 1.8294848203659058, + "learning_rate": 4.8293135743921664e-05, + "loss": 5.2407, + "step": 19893 + }, + { + "epoch": 0.11831525359215911, + "grad_norm": 1.4877654314041138, + "learning_rate": 4.829296610671927e-05, + "loss": 5.5383, + "step": 19894 + }, + { + "epoch": 0.1183212008754401, + "grad_norm": 1.5250638723373413, + "learning_rate": 4.829279646138554e-05, + "loss": 5.6443, + "step": 19895 + }, + { + "epoch": 0.1183271481587211, + "grad_norm": 1.5662062168121338, + "learning_rate": 4.829262680792054e-05, + "loss": 5.5409, + "step": 19896 + }, + { + "epoch": 0.11833309544200209, + "grad_norm": 1.1783791780471802, + "learning_rate": 4.829245714632432e-05, + "loss": 5.6169, + "step": 19897 + }, + { + "epoch": 0.11833904272528309, + "grad_norm": 1.4960299730300903, + "learning_rate": 4.829228747659695e-05, + "loss": 5.7195, + "step": 19898 + }, + { + "epoch": 0.11834499000856409, + "grad_norm": 1.437047004699707, + "learning_rate": 4.829211779873848e-05, + "loss": 5.7229, + "step": 19899 + }, + { + "epoch": 0.11835093729184508, + "grad_norm": 1.4095619916915894, + "learning_rate": 4.829194811274897e-05, + "loss": 5.7227, + "step": 19900 + }, + { + "epoch": 0.11835688457512608, + "grad_norm": 1.5694538354873657, + "learning_rate": 4.829177841862849e-05, + "loss": 5.356, + "step": 19901 + }, + { + "epoch": 0.11836283185840708, + "grad_norm": 1.7124476432800293, + "learning_rate": 4.829160871637708e-05, + "loss": 4.9185, + "step": 19902 + }, + { + "epoch": 0.11836877914168807, + "grad_norm": 2.2423064708709717, + "learning_rate": 4.829143900599481e-05, + "loss": 5.4345, + "step": 19903 + }, + { + "epoch": 0.11837472642496907, + "grad_norm": 1.8333791494369507, + "learning_rate": 4.829126928748175e-05, + "loss": 5.3666, + "step": 19904 + }, + { + "epoch": 0.11838067370825008, + "grad_norm": 1.5184969902038574, + "learning_rate": 4.8291099560837936e-05, + "loss": 5.4372, + "step": 19905 + }, + { + "epoch": 0.11838662099153106, + "grad_norm": 1.628544807434082, + "learning_rate": 4.829092982606345e-05, + "loss": 5.2682, + "step": 19906 + }, + { + "epoch": 0.11839256827481207, + "grad_norm": 1.5791584253311157, + "learning_rate": 4.829076008315834e-05, + "loss": 5.2149, + "step": 19907 + }, + { + "epoch": 0.11839851555809307, + "grad_norm": 1.299560546875, + "learning_rate": 4.8290590332122656e-05, + "loss": 5.1735, + "step": 19908 + }, + { + "epoch": 0.11840446284137406, + "grad_norm": 1.343913197517395, + "learning_rate": 4.829042057295647e-05, + "loss": 5.2344, + "step": 19909 + }, + { + "epoch": 0.11841041012465506, + "grad_norm": 1.2621396780014038, + "learning_rate": 4.829025080565985e-05, + "loss": 5.2982, + "step": 19910 + }, + { + "epoch": 0.11841635740793606, + "grad_norm": 1.2189174890518188, + "learning_rate": 4.829008103023284e-05, + "loss": 5.3347, + "step": 19911 + }, + { + "epoch": 0.11842230469121705, + "grad_norm": 1.2917883396148682, + "learning_rate": 4.82899112466755e-05, + "loss": 5.0745, + "step": 19912 + }, + { + "epoch": 0.11842825197449805, + "grad_norm": 1.2382320165634155, + "learning_rate": 4.828974145498789e-05, + "loss": 5.1999, + "step": 19913 + }, + { + "epoch": 0.11843419925777905, + "grad_norm": 1.398218035697937, + "learning_rate": 4.828957165517007e-05, + "loss": 5.4944, + "step": 19914 + }, + { + "epoch": 0.11844014654106004, + "grad_norm": 1.448901653289795, + "learning_rate": 4.8289401847222115e-05, + "loss": 5.4645, + "step": 19915 + }, + { + "epoch": 0.11844609382434104, + "grad_norm": 1.4628182649612427, + "learning_rate": 4.828923203114406e-05, + "loss": 5.003, + "step": 19916 + }, + { + "epoch": 0.11845204110762204, + "grad_norm": 1.3390740156173706, + "learning_rate": 4.828906220693598e-05, + "loss": 5.3482, + "step": 19917 + }, + { + "epoch": 0.11845798839090303, + "grad_norm": 1.539097547531128, + "learning_rate": 4.8288892374597925e-05, + "loss": 5.304, + "step": 19918 + }, + { + "epoch": 0.11846393567418403, + "grad_norm": 1.4011404514312744, + "learning_rate": 4.828872253412996e-05, + "loss": 5.2073, + "step": 19919 + }, + { + "epoch": 0.11846988295746504, + "grad_norm": 1.4064414501190186, + "learning_rate": 4.828855268553214e-05, + "loss": 5.2316, + "step": 19920 + }, + { + "epoch": 0.11847583024074602, + "grad_norm": 1.5808193683624268, + "learning_rate": 4.828838282880452e-05, + "loss": 5.211, + "step": 19921 + }, + { + "epoch": 0.11848177752402703, + "grad_norm": 1.5043809413909912, + "learning_rate": 4.828821296394718e-05, + "loss": 5.0564, + "step": 19922 + }, + { + "epoch": 0.11848772480730803, + "grad_norm": 1.2494529485702515, + "learning_rate": 4.828804309096016e-05, + "loss": 5.1523, + "step": 19923 + }, + { + "epoch": 0.11849367209058902, + "grad_norm": 1.4186055660247803, + "learning_rate": 4.8287873209843524e-05, + "loss": 4.9103, + "step": 19924 + }, + { + "epoch": 0.11849961937387002, + "grad_norm": 1.6093229055404663, + "learning_rate": 4.828770332059733e-05, + "loss": 4.9215, + "step": 19925 + }, + { + "epoch": 0.118505566657151, + "grad_norm": 1.5125865936279297, + "learning_rate": 4.8287533423221643e-05, + "loss": 5.0515, + "step": 19926 + }, + { + "epoch": 0.11851151394043201, + "grad_norm": 1.5410135984420776, + "learning_rate": 4.828736351771652e-05, + "loss": 4.9576, + "step": 19927 + }, + { + "epoch": 0.11851746122371301, + "grad_norm": 1.5431303977966309, + "learning_rate": 4.828719360408201e-05, + "loss": 5.1606, + "step": 19928 + }, + { + "epoch": 0.118523408506994, + "grad_norm": 1.4709242582321167, + "learning_rate": 4.828702368231819e-05, + "loss": 4.7685, + "step": 19929 + }, + { + "epoch": 0.118529355790275, + "grad_norm": 1.173568606376648, + "learning_rate": 4.828685375242511e-05, + "loss": 4.7591, + "step": 19930 + }, + { + "epoch": 0.118535303073556, + "grad_norm": 1.3113515377044678, + "learning_rate": 4.828668381440283e-05, + "loss": 4.786, + "step": 19931 + }, + { + "epoch": 0.11854125035683699, + "grad_norm": 1.4658124446868896, + "learning_rate": 4.828651386825141e-05, + "loss": 4.7776, + "step": 19932 + }, + { + "epoch": 0.118547197640118, + "grad_norm": 1.3406554460525513, + "learning_rate": 4.828634391397091e-05, + "loss": 5.0733, + "step": 19933 + }, + { + "epoch": 0.118553144923399, + "grad_norm": 1.2102482318878174, + "learning_rate": 4.828617395156138e-05, + "loss": 5.0069, + "step": 19934 + }, + { + "epoch": 0.11855909220667998, + "grad_norm": 0.989989697933197, + "learning_rate": 4.828600398102289e-05, + "loss": 4.759, + "step": 19935 + }, + { + "epoch": 0.11856503948996099, + "grad_norm": 1.2296501398086548, + "learning_rate": 4.82858340023555e-05, + "loss": 4.6269, + "step": 19936 + }, + { + "epoch": 0.11857098677324199, + "grad_norm": 1.5649582147598267, + "learning_rate": 4.828566401555926e-05, + "loss": 5.0196, + "step": 19937 + }, + { + "epoch": 0.11857693405652298, + "grad_norm": 1.2393609285354614, + "learning_rate": 4.8285494020634245e-05, + "loss": 5.059, + "step": 19938 + }, + { + "epoch": 0.11858288133980398, + "grad_norm": 1.450697422027588, + "learning_rate": 4.82853240175805e-05, + "loss": 5.1143, + "step": 19939 + }, + { + "epoch": 0.11858882862308498, + "grad_norm": 1.4795258045196533, + "learning_rate": 4.8285154006398084e-05, + "loss": 5.075, + "step": 19940 + }, + { + "epoch": 0.11859477590636597, + "grad_norm": 1.5858484506607056, + "learning_rate": 4.828498398708707e-05, + "loss": 5.0665, + "step": 19941 + }, + { + "epoch": 0.11860072318964697, + "grad_norm": 1.3411937952041626, + "learning_rate": 4.82848139596475e-05, + "loss": 4.9864, + "step": 19942 + }, + { + "epoch": 0.11860667047292797, + "grad_norm": 1.4348468780517578, + "learning_rate": 4.828464392407945e-05, + "loss": 4.904, + "step": 19943 + }, + { + "epoch": 0.11861261775620896, + "grad_norm": 1.4753068685531616, + "learning_rate": 4.8284473880382967e-05, + "loss": 5.0784, + "step": 19944 + }, + { + "epoch": 0.11861856503948996, + "grad_norm": 1.379059076309204, + "learning_rate": 4.828430382855811e-05, + "loss": 4.9782, + "step": 19945 + }, + { + "epoch": 0.11862451232277096, + "grad_norm": 1.444729208946228, + "learning_rate": 4.828413376860495e-05, + "loss": 5.5804, + "step": 19946 + }, + { + "epoch": 0.11863045960605195, + "grad_norm": 1.3467416763305664, + "learning_rate": 4.8283963700523535e-05, + "loss": 5.3278, + "step": 19947 + }, + { + "epoch": 0.11863640688933295, + "grad_norm": 1.5206544399261475, + "learning_rate": 4.8283793624313936e-05, + "loss": 5.01, + "step": 19948 + }, + { + "epoch": 0.11864235417261396, + "grad_norm": 1.394729733467102, + "learning_rate": 4.8283623539976195e-05, + "loss": 5.2139, + "step": 19949 + }, + { + "epoch": 0.11864830145589494, + "grad_norm": 1.3675029277801514, + "learning_rate": 4.8283453447510394e-05, + "loss": 5.4559, + "step": 19950 + }, + { + "epoch": 0.11865424873917595, + "grad_norm": 1.1950232982635498, + "learning_rate": 4.828328334691657e-05, + "loss": 5.2233, + "step": 19951 + }, + { + "epoch": 0.11866019602245695, + "grad_norm": 1.3517179489135742, + "learning_rate": 4.82831132381948e-05, + "loss": 5.0519, + "step": 19952 + }, + { + "epoch": 0.11866614330573794, + "grad_norm": 1.4184643030166626, + "learning_rate": 4.828294312134512e-05, + "loss": 4.8722, + "step": 19953 + }, + { + "epoch": 0.11867209058901894, + "grad_norm": 1.4558582305908203, + "learning_rate": 4.828277299636762e-05, + "loss": 5.3876, + "step": 19954 + }, + { + "epoch": 0.11867803787229993, + "grad_norm": 1.4617977142333984, + "learning_rate": 4.8282602863262345e-05, + "loss": 5.4784, + "step": 19955 + }, + { + "epoch": 0.11868398515558093, + "grad_norm": 1.4997669458389282, + "learning_rate": 4.828243272202935e-05, + "loss": 5.2556, + "step": 19956 + }, + { + "epoch": 0.11868993243886193, + "grad_norm": 1.2730913162231445, + "learning_rate": 4.8282262572668696e-05, + "loss": 5.3194, + "step": 19957 + }, + { + "epoch": 0.11869587972214292, + "grad_norm": 1.4149047136306763, + "learning_rate": 4.8282092415180444e-05, + "loss": 5.5139, + "step": 19958 + }, + { + "epoch": 0.11870182700542392, + "grad_norm": 1.2510145902633667, + "learning_rate": 4.828192224956466e-05, + "loss": 5.2486, + "step": 19959 + }, + { + "epoch": 0.11870777428870492, + "grad_norm": 1.2229409217834473, + "learning_rate": 4.828175207582139e-05, + "loss": 5.2391, + "step": 19960 + }, + { + "epoch": 0.11871372157198591, + "grad_norm": 1.3316899538040161, + "learning_rate": 4.828158189395071e-05, + "loss": 5.2928, + "step": 19961 + }, + { + "epoch": 0.11871966885526691, + "grad_norm": 1.4331640005111694, + "learning_rate": 4.828141170395266e-05, + "loss": 5.3311, + "step": 19962 + }, + { + "epoch": 0.11872561613854792, + "grad_norm": 1.3313428163528442, + "learning_rate": 4.828124150582732e-05, + "loss": 5.2203, + "step": 19963 + }, + { + "epoch": 0.1187315634218289, + "grad_norm": 1.6505075693130493, + "learning_rate": 4.828107129957473e-05, + "loss": 4.8604, + "step": 19964 + }, + { + "epoch": 0.1187375107051099, + "grad_norm": 1.3544394969940186, + "learning_rate": 4.828090108519496e-05, + "loss": 5.17, + "step": 19965 + }, + { + "epoch": 0.11874345798839091, + "grad_norm": 1.3194384574890137, + "learning_rate": 4.828073086268808e-05, + "loss": 5.2197, + "step": 19966 + }, + { + "epoch": 0.1187494052716719, + "grad_norm": 1.4014582633972168, + "learning_rate": 4.8280560632054126e-05, + "loss": 5.2865, + "step": 19967 + }, + { + "epoch": 0.1187553525549529, + "grad_norm": 1.5148218870162964, + "learning_rate": 4.828039039329317e-05, + "loss": 5.3765, + "step": 19968 + }, + { + "epoch": 0.1187612998382339, + "grad_norm": 1.3657969236373901, + "learning_rate": 4.828022014640527e-05, + "loss": 4.9787, + "step": 19969 + }, + { + "epoch": 0.11876724712151489, + "grad_norm": 1.547717571258545, + "learning_rate": 4.828004989139049e-05, + "loss": 5.0538, + "step": 19970 + }, + { + "epoch": 0.11877319440479589, + "grad_norm": 1.5132863521575928, + "learning_rate": 4.827987962824888e-05, + "loss": 5.0301, + "step": 19971 + }, + { + "epoch": 0.11877914168807689, + "grad_norm": 1.4020887613296509, + "learning_rate": 4.827970935698051e-05, + "loss": 4.9646, + "step": 19972 + }, + { + "epoch": 0.11878508897135788, + "grad_norm": 1.4983519315719604, + "learning_rate": 4.8279539077585424e-05, + "loss": 5.2266, + "step": 19973 + }, + { + "epoch": 0.11879103625463888, + "grad_norm": 1.3545745611190796, + "learning_rate": 4.82793687900637e-05, + "loss": 5.108, + "step": 19974 + }, + { + "epoch": 0.11879698353791988, + "grad_norm": 1.4865717887878418, + "learning_rate": 4.827919849441539e-05, + "loss": 5.257, + "step": 19975 + }, + { + "epoch": 0.11880293082120087, + "grad_norm": 1.4389182329177856, + "learning_rate": 4.8279028190640546e-05, + "loss": 4.976, + "step": 19976 + }, + { + "epoch": 0.11880887810448187, + "grad_norm": 1.2823866605758667, + "learning_rate": 4.827885787873924e-05, + "loss": 4.7617, + "step": 19977 + }, + { + "epoch": 0.11881482538776288, + "grad_norm": 1.369992971420288, + "learning_rate": 4.8278687558711525e-05, + "loss": 4.7165, + "step": 19978 + }, + { + "epoch": 0.11882077267104386, + "grad_norm": 1.2873594760894775, + "learning_rate": 4.827851723055745e-05, + "loss": 4.6705, + "step": 19979 + }, + { + "epoch": 0.11882671995432487, + "grad_norm": 1.3779295682907104, + "learning_rate": 4.827834689427709e-05, + "loss": 4.9752, + "step": 19980 + }, + { + "epoch": 0.11883266723760587, + "grad_norm": 1.5264688730239868, + "learning_rate": 4.82781765498705e-05, + "loss": 5.0295, + "step": 19981 + }, + { + "epoch": 0.11883861452088686, + "grad_norm": 1.6745606660842896, + "learning_rate": 4.827800619733774e-05, + "loss": 5.4265, + "step": 19982 + }, + { + "epoch": 0.11884456180416786, + "grad_norm": 1.5993295907974243, + "learning_rate": 4.8277835836678874e-05, + "loss": 5.0611, + "step": 19983 + }, + { + "epoch": 0.11885050908744885, + "grad_norm": 1.6451520919799805, + "learning_rate": 4.827766546789395e-05, + "loss": 4.9504, + "step": 19984 + }, + { + "epoch": 0.11885645637072985, + "grad_norm": 1.4769519567489624, + "learning_rate": 4.827749509098304e-05, + "loss": 5.1324, + "step": 19985 + }, + { + "epoch": 0.11886240365401085, + "grad_norm": 1.6930506229400635, + "learning_rate": 4.827732470594619e-05, + "loss": 5.134, + "step": 19986 + }, + { + "epoch": 0.11886835093729184, + "grad_norm": 1.1951912641525269, + "learning_rate": 4.827715431278347e-05, + "loss": 5.2521, + "step": 19987 + }, + { + "epoch": 0.11887429822057284, + "grad_norm": 1.3520997762680054, + "learning_rate": 4.827698391149493e-05, + "loss": 5.1791, + "step": 19988 + }, + { + "epoch": 0.11888024550385384, + "grad_norm": 1.3710130453109741, + "learning_rate": 4.8276813502080644e-05, + "loss": 5.1179, + "step": 19989 + }, + { + "epoch": 0.11888619278713483, + "grad_norm": 1.4977210760116577, + "learning_rate": 4.827664308454066e-05, + "loss": 5.1492, + "step": 19990 + }, + { + "epoch": 0.11889214007041583, + "grad_norm": 1.2681607007980347, + "learning_rate": 4.8276472658875035e-05, + "loss": 5.1178, + "step": 19991 + }, + { + "epoch": 0.11889808735369684, + "grad_norm": 1.2606865167617798, + "learning_rate": 4.827630222508385e-05, + "loss": 5.2796, + "step": 19992 + }, + { + "epoch": 0.11890403463697782, + "grad_norm": 1.477273941040039, + "learning_rate": 4.827613178316713e-05, + "loss": 5.251, + "step": 19993 + }, + { + "epoch": 0.11890998192025883, + "grad_norm": 1.4194386005401611, + "learning_rate": 4.8275961333124956e-05, + "loss": 5.157, + "step": 19994 + }, + { + "epoch": 0.11891592920353983, + "grad_norm": 1.2693103551864624, + "learning_rate": 4.8275790874957396e-05, + "loss": 5.2037, + "step": 19995 + }, + { + "epoch": 0.11892187648682082, + "grad_norm": 1.2035702466964722, + "learning_rate": 4.8275620408664487e-05, + "loss": 5.1613, + "step": 19996 + }, + { + "epoch": 0.11892782377010182, + "grad_norm": 1.1674199104309082, + "learning_rate": 4.8275449934246295e-05, + "loss": 5.2415, + "step": 19997 + }, + { + "epoch": 0.11893377105338282, + "grad_norm": 1.5064369440078735, + "learning_rate": 4.8275279451702895e-05, + "loss": 5.2025, + "step": 19998 + }, + { + "epoch": 0.11893971833666381, + "grad_norm": 1.3770934343338013, + "learning_rate": 4.827510896103433e-05, + "loss": 5.0804, + "step": 19999 + }, + { + "epoch": 0.11894566561994481, + "grad_norm": 1.4852590560913086, + "learning_rate": 4.827493846224067e-05, + "loss": 5.0169, + "step": 20000 + }, + { + "epoch": 0.11895161290322581, + "grad_norm": 1.3760627508163452, + "learning_rate": 4.8274767955321966e-05, + "loss": 5.245, + "step": 20001 + }, + { + "epoch": 0.1189575601865068, + "grad_norm": 1.4135125875473022, + "learning_rate": 4.827459744027828e-05, + "loss": 5.1599, + "step": 20002 + }, + { + "epoch": 0.1189635074697878, + "grad_norm": 1.352949857711792, + "learning_rate": 4.8274426917109675e-05, + "loss": 5.187, + "step": 20003 + }, + { + "epoch": 0.1189694547530688, + "grad_norm": 1.279439091682434, + "learning_rate": 4.82742563858162e-05, + "loss": 5.1369, + "step": 20004 + }, + { + "epoch": 0.11897540203634979, + "grad_norm": 1.6078580617904663, + "learning_rate": 4.8274085846397935e-05, + "loss": 5.097, + "step": 20005 + }, + { + "epoch": 0.1189813493196308, + "grad_norm": 1.4414268732070923, + "learning_rate": 4.827391529885492e-05, + "loss": 5.1412, + "step": 20006 + }, + { + "epoch": 0.1189872966029118, + "grad_norm": 1.249731421470642, + "learning_rate": 4.827374474318722e-05, + "loss": 5.002, + "step": 20007 + }, + { + "epoch": 0.11899324388619278, + "grad_norm": 1.5977002382278442, + "learning_rate": 4.82735741793949e-05, + "loss": 5.0387, + "step": 20008 + }, + { + "epoch": 0.11899919116947379, + "grad_norm": 1.5115478038787842, + "learning_rate": 4.8273403607478016e-05, + "loss": 4.9497, + "step": 20009 + }, + { + "epoch": 0.11900513845275479, + "grad_norm": 1.433825135231018, + "learning_rate": 4.8273233027436625e-05, + "loss": 4.9818, + "step": 20010 + }, + { + "epoch": 0.11901108573603578, + "grad_norm": 1.51628839969635, + "learning_rate": 4.827306243927079e-05, + "loss": 4.8819, + "step": 20011 + }, + { + "epoch": 0.11901703301931678, + "grad_norm": 1.3780534267425537, + "learning_rate": 4.8272891842980564e-05, + "loss": 5.18, + "step": 20012 + }, + { + "epoch": 0.11902298030259777, + "grad_norm": 1.2616275548934937, + "learning_rate": 4.8272721238566023e-05, + "loss": 5.549, + "step": 20013 + }, + { + "epoch": 0.11902892758587877, + "grad_norm": 1.2978616952896118, + "learning_rate": 4.8272550626027204e-05, + "loss": 5.4608, + "step": 20014 + }, + { + "epoch": 0.11903487486915977, + "grad_norm": 1.2539299726486206, + "learning_rate": 4.827238000536418e-05, + "loss": 5.5612, + "step": 20015 + }, + { + "epoch": 0.11904082215244076, + "grad_norm": 1.4023045301437378, + "learning_rate": 4.827220937657702e-05, + "loss": 5.2669, + "step": 20016 + }, + { + "epoch": 0.11904676943572176, + "grad_norm": 1.4386683702468872, + "learning_rate": 4.827203873966576e-05, + "loss": 5.0703, + "step": 20017 + }, + { + "epoch": 0.11905271671900276, + "grad_norm": 1.5248057842254639, + "learning_rate": 4.827186809463048e-05, + "loss": 5.0376, + "step": 20018 + }, + { + "epoch": 0.11905866400228375, + "grad_norm": 1.4410630464553833, + "learning_rate": 4.827169744147122e-05, + "loss": 5.1396, + "step": 20019 + }, + { + "epoch": 0.11906461128556475, + "grad_norm": 1.7917122840881348, + "learning_rate": 4.827152678018806e-05, + "loss": 5.1673, + "step": 20020 + }, + { + "epoch": 0.11907055856884576, + "grad_norm": 1.739169716835022, + "learning_rate": 4.827135611078105e-05, + "loss": 5.6848, + "step": 20021 + }, + { + "epoch": 0.11907650585212674, + "grad_norm": 1.6629457473754883, + "learning_rate": 4.827118543325024e-05, + "loss": 5.7335, + "step": 20022 + }, + { + "epoch": 0.11908245313540775, + "grad_norm": 1.634628176689148, + "learning_rate": 4.827101474759571e-05, + "loss": 5.7718, + "step": 20023 + }, + { + "epoch": 0.11908840041868875, + "grad_norm": 1.299861192703247, + "learning_rate": 4.827084405381751e-05, + "loss": 5.6917, + "step": 20024 + }, + { + "epoch": 0.11909434770196974, + "grad_norm": 1.3863619565963745, + "learning_rate": 4.82706733519157e-05, + "loss": 5.7363, + "step": 20025 + }, + { + "epoch": 0.11910029498525074, + "grad_norm": 2.3500845432281494, + "learning_rate": 4.827050264189033e-05, + "loss": 5.192, + "step": 20026 + }, + { + "epoch": 0.11910624226853174, + "grad_norm": 1.426633358001709, + "learning_rate": 4.827033192374147e-05, + "loss": 5.5643, + "step": 20027 + }, + { + "epoch": 0.11911218955181273, + "grad_norm": 1.4728987216949463, + "learning_rate": 4.8270161197469175e-05, + "loss": 5.6323, + "step": 20028 + }, + { + "epoch": 0.11911813683509373, + "grad_norm": 1.66750168800354, + "learning_rate": 4.826999046307352e-05, + "loss": 5.4327, + "step": 20029 + }, + { + "epoch": 0.11912408411837473, + "grad_norm": 1.4894248247146606, + "learning_rate": 4.8269819720554545e-05, + "loss": 5.4332, + "step": 20030 + }, + { + "epoch": 0.11913003140165572, + "grad_norm": 1.5166181325912476, + "learning_rate": 4.826964896991231e-05, + "loss": 5.5467, + "step": 20031 + }, + { + "epoch": 0.11913597868493672, + "grad_norm": 1.2947237491607666, + "learning_rate": 4.826947821114689e-05, + "loss": 5.5116, + "step": 20032 + }, + { + "epoch": 0.11914192596821772, + "grad_norm": 1.3890970945358276, + "learning_rate": 4.8269307444258326e-05, + "loss": 5.5459, + "step": 20033 + }, + { + "epoch": 0.11914787325149871, + "grad_norm": 1.496099591255188, + "learning_rate": 4.8269136669246695e-05, + "loss": 5.5533, + "step": 20034 + }, + { + "epoch": 0.11915382053477971, + "grad_norm": 1.4115175008773804, + "learning_rate": 4.8268965886112045e-05, + "loss": 5.4898, + "step": 20035 + }, + { + "epoch": 0.11915976781806072, + "grad_norm": 1.3803601264953613, + "learning_rate": 4.826879509485444e-05, + "loss": 5.598, + "step": 20036 + }, + { + "epoch": 0.1191657151013417, + "grad_norm": 1.7235617637634277, + "learning_rate": 4.826862429547394e-05, + "loss": 5.5489, + "step": 20037 + }, + { + "epoch": 0.1191716623846227, + "grad_norm": 1.726289987564087, + "learning_rate": 4.82684534879706e-05, + "loss": 5.5461, + "step": 20038 + }, + { + "epoch": 0.11917760966790371, + "grad_norm": 1.593349814414978, + "learning_rate": 4.826828267234449e-05, + "loss": 5.3594, + "step": 20039 + }, + { + "epoch": 0.1191835569511847, + "grad_norm": 2.3147101402282715, + "learning_rate": 4.826811184859566e-05, + "loss": 4.6888, + "step": 20040 + }, + { + "epoch": 0.1191895042344657, + "grad_norm": 2.1485888957977295, + "learning_rate": 4.826794101672417e-05, + "loss": 4.6874, + "step": 20041 + }, + { + "epoch": 0.11919545151774669, + "grad_norm": 2.5710601806640625, + "learning_rate": 4.826777017673009e-05, + "loss": 4.6524, + "step": 20042 + }, + { + "epoch": 0.11920139880102769, + "grad_norm": 2.314556121826172, + "learning_rate": 4.826759932861346e-05, + "loss": 4.3273, + "step": 20043 + }, + { + "epoch": 0.11920734608430869, + "grad_norm": 2.060617208480835, + "learning_rate": 4.826742847237436e-05, + "loss": 4.6601, + "step": 20044 + }, + { + "epoch": 0.11921329336758968, + "grad_norm": 1.9709726572036743, + "learning_rate": 4.826725760801284e-05, + "loss": 6.1007, + "step": 20045 + }, + { + "epoch": 0.11921924065087068, + "grad_norm": 2.0907840728759766, + "learning_rate": 4.826708673552895e-05, + "loss": 6.0386, + "step": 20046 + }, + { + "epoch": 0.11922518793415168, + "grad_norm": 2.02783203125, + "learning_rate": 4.826691585492278e-05, + "loss": 5.4651, + "step": 20047 + }, + { + "epoch": 0.11923113521743267, + "grad_norm": 1.8326990604400635, + "learning_rate": 4.826674496619435e-05, + "loss": 5.7342, + "step": 20048 + }, + { + "epoch": 0.11923708250071367, + "grad_norm": 1.8395801782608032, + "learning_rate": 4.8266574069343753e-05, + "loss": 5.657, + "step": 20049 + }, + { + "epoch": 0.11924302978399468, + "grad_norm": 1.5144078731536865, + "learning_rate": 4.826640316437103e-05, + "loss": 5.6856, + "step": 20050 + }, + { + "epoch": 0.11924897706727566, + "grad_norm": 1.6133313179016113, + "learning_rate": 4.826623225127626e-05, + "loss": 5.114, + "step": 20051 + }, + { + "epoch": 0.11925492435055667, + "grad_norm": 2.0678884983062744, + "learning_rate": 4.826606133005947e-05, + "loss": 5.6642, + "step": 20052 + }, + { + "epoch": 0.11926087163383767, + "grad_norm": 1.7214683294296265, + "learning_rate": 4.8265890400720744e-05, + "loss": 5.8689, + "step": 20053 + }, + { + "epoch": 0.11926681891711866, + "grad_norm": 1.7670868635177612, + "learning_rate": 4.826571946326014e-05, + "loss": 5.6504, + "step": 20054 + }, + { + "epoch": 0.11927276620039966, + "grad_norm": 1.6336724758148193, + "learning_rate": 4.82655485176777e-05, + "loss": 5.7624, + "step": 20055 + }, + { + "epoch": 0.11927871348368066, + "grad_norm": 1.6147593259811401, + "learning_rate": 4.8265377563973514e-05, + "loss": 5.8398, + "step": 20056 + }, + { + "epoch": 0.11928466076696165, + "grad_norm": 1.6203758716583252, + "learning_rate": 4.8265206602147614e-05, + "loss": 5.3793, + "step": 20057 + }, + { + "epoch": 0.11929060805024265, + "grad_norm": 1.8295884132385254, + "learning_rate": 4.8265035632200084e-05, + "loss": 5.0185, + "step": 20058 + }, + { + "epoch": 0.11929655533352365, + "grad_norm": 1.6802337169647217, + "learning_rate": 4.826486465413096e-05, + "loss": 5.8104, + "step": 20059 + }, + { + "epoch": 0.11930250261680464, + "grad_norm": 1.9276031255722046, + "learning_rate": 4.826469366794031e-05, + "loss": 5.2106, + "step": 20060 + }, + { + "epoch": 0.11930844990008564, + "grad_norm": 1.9589072465896606, + "learning_rate": 4.8264522673628205e-05, + "loss": 5.2336, + "step": 20061 + }, + { + "epoch": 0.11931439718336664, + "grad_norm": 3.45713472366333, + "learning_rate": 4.826435167119469e-05, + "loss": 5.7015, + "step": 20062 + }, + { + "epoch": 0.11932034446664763, + "grad_norm": 3.057732343673706, + "learning_rate": 4.826418066063983e-05, + "loss": 4.2376, + "step": 20063 + }, + { + "epoch": 0.11932629174992863, + "grad_norm": 2.9540810585021973, + "learning_rate": 4.8264009641963684e-05, + "loss": 4.1357, + "step": 20064 + }, + { + "epoch": 0.11933223903320964, + "grad_norm": 2.707113027572632, + "learning_rate": 4.826383861516632e-05, + "loss": 3.7255, + "step": 20065 + }, + { + "epoch": 0.11933818631649062, + "grad_norm": 2.488718032836914, + "learning_rate": 4.8263667580247784e-05, + "loss": 3.7309, + "step": 20066 + }, + { + "epoch": 0.11934413359977163, + "grad_norm": 2.6351873874664307, + "learning_rate": 4.826349653720814e-05, + "loss": 3.5953, + "step": 20067 + }, + { + "epoch": 0.11935008088305263, + "grad_norm": 2.866333246231079, + "learning_rate": 4.826332548604745e-05, + "loss": 3.8627, + "step": 20068 + }, + { + "epoch": 0.11935602816633362, + "grad_norm": 1.5446399450302124, + "learning_rate": 4.8263154426765777e-05, + "loss": 5.3014, + "step": 20069 + }, + { + "epoch": 0.11936197544961462, + "grad_norm": 1.7273021936416626, + "learning_rate": 4.8262983359363176e-05, + "loss": 5.6102, + "step": 20070 + }, + { + "epoch": 0.1193679227328956, + "grad_norm": 1.4169118404388428, + "learning_rate": 4.826281228383971e-05, + "loss": 5.6831, + "step": 20071 + }, + { + "epoch": 0.11937387001617661, + "grad_norm": 1.7140129804611206, + "learning_rate": 4.826264120019544e-05, + "loss": 5.6609, + "step": 20072 + }, + { + "epoch": 0.11937981729945761, + "grad_norm": 1.4560796022415161, + "learning_rate": 4.8262470108430414e-05, + "loss": 5.6279, + "step": 20073 + }, + { + "epoch": 0.1193857645827386, + "grad_norm": 1.6894809007644653, + "learning_rate": 4.8262299008544697e-05, + "loss": 5.192, + "step": 20074 + }, + { + "epoch": 0.1193917118660196, + "grad_norm": 2.995307683944702, + "learning_rate": 4.826212790053836e-05, + "loss": 4.9009, + "step": 20075 + }, + { + "epoch": 0.1193976591493006, + "grad_norm": 2.9559946060180664, + "learning_rate": 4.826195678441145e-05, + "loss": 4.8801, + "step": 20076 + }, + { + "epoch": 0.11940360643258159, + "grad_norm": 2.550973653793335, + "learning_rate": 4.826178566016403e-05, + "loss": 4.7061, + "step": 20077 + }, + { + "epoch": 0.11940955371586259, + "grad_norm": 2.0249550342559814, + "learning_rate": 4.826161452779617e-05, + "loss": 5.0315, + "step": 20078 + }, + { + "epoch": 0.1194155009991436, + "grad_norm": 1.6208853721618652, + "learning_rate": 4.826144338730791e-05, + "loss": 5.3685, + "step": 20079 + }, + { + "epoch": 0.11942144828242458, + "grad_norm": 1.6138144731521606, + "learning_rate": 4.826127223869933e-05, + "loss": 5.3098, + "step": 20080 + }, + { + "epoch": 0.11942739556570559, + "grad_norm": 1.6347969770431519, + "learning_rate": 4.8261101081970476e-05, + "loss": 5.7519, + "step": 20081 + }, + { + "epoch": 0.11943334284898659, + "grad_norm": 1.6273889541625977, + "learning_rate": 4.8260929917121403e-05, + "loss": 5.5083, + "step": 20082 + }, + { + "epoch": 0.11943929013226758, + "grad_norm": 1.7236882448196411, + "learning_rate": 4.826075874415219e-05, + "loss": 5.3613, + "step": 20083 + }, + { + "epoch": 0.11944523741554858, + "grad_norm": 1.5177632570266724, + "learning_rate": 4.826058756306289e-05, + "loss": 5.4234, + "step": 20084 + }, + { + "epoch": 0.11945118469882958, + "grad_norm": 1.9017301797866821, + "learning_rate": 4.826041637385354e-05, + "loss": 4.6868, + "step": 20085 + }, + { + "epoch": 0.11945713198211057, + "grad_norm": 1.8880805969238281, + "learning_rate": 4.826024517652425e-05, + "loss": 4.4478, + "step": 20086 + }, + { + "epoch": 0.11946307926539157, + "grad_norm": 1.5617226362228394, + "learning_rate": 4.826007397107503e-05, + "loss": 5.3775, + "step": 20087 + }, + { + "epoch": 0.11946902654867257, + "grad_norm": 1.836101770401001, + "learning_rate": 4.825990275750595e-05, + "loss": 5.33, + "step": 20088 + }, + { + "epoch": 0.11947497383195356, + "grad_norm": 1.6876533031463623, + "learning_rate": 4.825973153581709e-05, + "loss": 5.3164, + "step": 20089 + }, + { + "epoch": 0.11948092111523456, + "grad_norm": 1.7182306051254272, + "learning_rate": 4.82595603060085e-05, + "loss": 5.3545, + "step": 20090 + }, + { + "epoch": 0.11948686839851556, + "grad_norm": 2.160414934158325, + "learning_rate": 4.825938906808023e-05, + "loss": 4.3744, + "step": 20091 + }, + { + "epoch": 0.11949281568179655, + "grad_norm": 1.4865752458572388, + "learning_rate": 4.825921782203236e-05, + "loss": 5.455, + "step": 20092 + }, + { + "epoch": 0.11949876296507755, + "grad_norm": 1.550986409187317, + "learning_rate": 4.825904656786492e-05, + "loss": 5.4879, + "step": 20093 + }, + { + "epoch": 0.11950471024835856, + "grad_norm": 1.473037838935852, + "learning_rate": 4.8258875305577996e-05, + "loss": 5.3964, + "step": 20094 + }, + { + "epoch": 0.11951065753163954, + "grad_norm": 1.6714228391647339, + "learning_rate": 4.825870403517164e-05, + "loss": 5.0215, + "step": 20095 + }, + { + "epoch": 0.11951660481492055, + "grad_norm": 1.7555420398712158, + "learning_rate": 4.8258532756645905e-05, + "loss": 4.9852, + "step": 20096 + }, + { + "epoch": 0.11952255209820155, + "grad_norm": 1.562729835510254, + "learning_rate": 4.825836147000086e-05, + "loss": 4.5928, + "step": 20097 + }, + { + "epoch": 0.11952849938148254, + "grad_norm": 1.7901209592819214, + "learning_rate": 4.825819017523656e-05, + "loss": 5.3176, + "step": 20098 + }, + { + "epoch": 0.11953444666476354, + "grad_norm": 1.605578064918518, + "learning_rate": 4.825801887235307e-05, + "loss": 5.3162, + "step": 20099 + }, + { + "epoch": 0.11954039394804453, + "grad_norm": 1.9077202081680298, + "learning_rate": 4.8257847561350445e-05, + "loss": 5.3378, + "step": 20100 + }, + { + "epoch": 0.11954634123132553, + "grad_norm": 1.9171262979507446, + "learning_rate": 4.825767624222875e-05, + "loss": 5.2585, + "step": 20101 + }, + { + "epoch": 0.11955228851460653, + "grad_norm": 1.5661342144012451, + "learning_rate": 4.825750491498803e-05, + "loss": 5.3421, + "step": 20102 + }, + { + "epoch": 0.11955823579788752, + "grad_norm": 2.188962697982788, + "learning_rate": 4.825733357962836e-05, + "loss": 4.8925, + "step": 20103 + }, + { + "epoch": 0.11956418308116852, + "grad_norm": 1.4218099117279053, + "learning_rate": 4.82571622361498e-05, + "loss": 5.3497, + "step": 20104 + }, + { + "epoch": 0.11957013036444952, + "grad_norm": 1.6142303943634033, + "learning_rate": 4.82569908845524e-05, + "loss": 5.1657, + "step": 20105 + }, + { + "epoch": 0.11957607764773051, + "grad_norm": 1.9385474920272827, + "learning_rate": 4.8256819524836224e-05, + "loss": 5.0509, + "step": 20106 + }, + { + "epoch": 0.11958202493101151, + "grad_norm": 2.077528953552246, + "learning_rate": 4.825664815700134e-05, + "loss": 5.1879, + "step": 20107 + }, + { + "epoch": 0.11958797221429252, + "grad_norm": 2.158764123916626, + "learning_rate": 4.825647678104779e-05, + "loss": 4.9595, + "step": 20108 + }, + { + "epoch": 0.1195939194975735, + "grad_norm": 2.0398664474487305, + "learning_rate": 4.825630539697565e-05, + "loss": 4.9156, + "step": 20109 + }, + { + "epoch": 0.1195998667808545, + "grad_norm": 2.0280275344848633, + "learning_rate": 4.825613400478497e-05, + "loss": 4.8655, + "step": 20110 + }, + { + "epoch": 0.11960581406413551, + "grad_norm": 2.0311338901519775, + "learning_rate": 4.8255962604475816e-05, + "loss": 4.8953, + "step": 20111 + }, + { + "epoch": 0.1196117613474165, + "grad_norm": 2.334346055984497, + "learning_rate": 4.825579119604825e-05, + "loss": 5.0044, + "step": 20112 + }, + { + "epoch": 0.1196177086306975, + "grad_norm": 2.272148847579956, + "learning_rate": 4.825561977950233e-05, + "loss": 4.8911, + "step": 20113 + }, + { + "epoch": 0.1196236559139785, + "grad_norm": 2.0724244117736816, + "learning_rate": 4.8255448354838104e-05, + "loss": 5.3492, + "step": 20114 + }, + { + "epoch": 0.11962960319725949, + "grad_norm": 1.7691513299942017, + "learning_rate": 4.8255276922055644e-05, + "loss": 5.5727, + "step": 20115 + }, + { + "epoch": 0.11963555048054049, + "grad_norm": 1.9434363842010498, + "learning_rate": 4.8255105481155004e-05, + "loss": 5.4564, + "step": 20116 + }, + { + "epoch": 0.11964149776382149, + "grad_norm": 1.623660683631897, + "learning_rate": 4.825493403213626e-05, + "loss": 5.2862, + "step": 20117 + }, + { + "epoch": 0.11964744504710248, + "grad_norm": 1.6246039867401123, + "learning_rate": 4.8254762574999446e-05, + "loss": 5.3627, + "step": 20118 + }, + { + "epoch": 0.11965339233038348, + "grad_norm": 1.689290165901184, + "learning_rate": 4.825459110974464e-05, + "loss": 4.6902, + "step": 20119 + }, + { + "epoch": 0.11965933961366448, + "grad_norm": 1.487697720527649, + "learning_rate": 4.825441963637189e-05, + "loss": 4.7598, + "step": 20120 + }, + { + "epoch": 0.11966528689694547, + "grad_norm": 1.7388331890106201, + "learning_rate": 4.825424815488126e-05, + "loss": 4.709, + "step": 20121 + }, + { + "epoch": 0.11967123418022647, + "grad_norm": 1.9586225748062134, + "learning_rate": 4.8254076665272826e-05, + "loss": 4.4625, + "step": 20122 + }, + { + "epoch": 0.11967718146350748, + "grad_norm": 1.9228769540786743, + "learning_rate": 4.825390516754662e-05, + "loss": 4.1447, + "step": 20123 + }, + { + "epoch": 0.11968312874678846, + "grad_norm": 1.8852907419204712, + "learning_rate": 4.825373366170273e-05, + "loss": 4.2618, + "step": 20124 + }, + { + "epoch": 0.11968907603006947, + "grad_norm": 1.8267028331756592, + "learning_rate": 4.825356214774119e-05, + "loss": 4.4095, + "step": 20125 + }, + { + "epoch": 0.11969502331335047, + "grad_norm": 1.8847311735153198, + "learning_rate": 4.825339062566208e-05, + "loss": 4.1904, + "step": 20126 + }, + { + "epoch": 0.11970097059663146, + "grad_norm": 2.0036990642547607, + "learning_rate": 4.825321909546545e-05, + "loss": 4.2348, + "step": 20127 + }, + { + "epoch": 0.11970691787991246, + "grad_norm": 1.8992520570755005, + "learning_rate": 4.825304755715136e-05, + "loss": 4.3038, + "step": 20128 + }, + { + "epoch": 0.11971286516319345, + "grad_norm": 1.8314359188079834, + "learning_rate": 4.8252876010719874e-05, + "loss": 4.102, + "step": 20129 + }, + { + "epoch": 0.11971881244647445, + "grad_norm": 1.9093595743179321, + "learning_rate": 4.825270445617104e-05, + "loss": 4.0307, + "step": 20130 + }, + { + "epoch": 0.11972475972975545, + "grad_norm": 2.1645400524139404, + "learning_rate": 4.8252532893504936e-05, + "loss": 4.2032, + "step": 20131 + }, + { + "epoch": 0.11973070701303644, + "grad_norm": 2.0268661975860596, + "learning_rate": 4.8252361322721605e-05, + "loss": 4.7705, + "step": 20132 + }, + { + "epoch": 0.11973665429631744, + "grad_norm": 1.8852148056030273, + "learning_rate": 4.825218974382113e-05, + "loss": 4.8969, + "step": 20133 + }, + { + "epoch": 0.11974260157959844, + "grad_norm": 1.9107592105865479, + "learning_rate": 4.825201815680354e-05, + "loss": 5.2587, + "step": 20134 + }, + { + "epoch": 0.11974854886287943, + "grad_norm": 1.6433600187301636, + "learning_rate": 4.825184656166892e-05, + "loss": 5.1954, + "step": 20135 + }, + { + "epoch": 0.11975449614616043, + "grad_norm": 1.4135210514068604, + "learning_rate": 4.825167495841731e-05, + "loss": 5.0398, + "step": 20136 + }, + { + "epoch": 0.11976044342944143, + "grad_norm": 1.9514580965042114, + "learning_rate": 4.825150334704879e-05, + "loss": 4.3527, + "step": 20137 + }, + { + "epoch": 0.11976639071272242, + "grad_norm": 1.8811348676681519, + "learning_rate": 4.825133172756341e-05, + "loss": 4.2798, + "step": 20138 + }, + { + "epoch": 0.11977233799600343, + "grad_norm": 1.8210500478744507, + "learning_rate": 4.825116009996123e-05, + "loss": 4.666, + "step": 20139 + }, + { + "epoch": 0.11977828527928443, + "grad_norm": 1.8773581981658936, + "learning_rate": 4.825098846424231e-05, + "loss": 4.9104, + "step": 20140 + }, + { + "epoch": 0.11978423256256542, + "grad_norm": 1.517233967781067, + "learning_rate": 4.825081682040671e-05, + "loss": 5.5915, + "step": 20141 + }, + { + "epoch": 0.11979017984584642, + "grad_norm": 1.6219067573547363, + "learning_rate": 4.825064516845449e-05, + "loss": 5.6538, + "step": 20142 + }, + { + "epoch": 0.11979612712912742, + "grad_norm": 1.4977927207946777, + "learning_rate": 4.8250473508385707e-05, + "loss": 5.3499, + "step": 20143 + }, + { + "epoch": 0.11980207441240841, + "grad_norm": 1.5381087064743042, + "learning_rate": 4.8250301840200424e-05, + "loss": 5.6666, + "step": 20144 + }, + { + "epoch": 0.11980802169568941, + "grad_norm": 1.5895806550979614, + "learning_rate": 4.82501301638987e-05, + "loss": 5.2099, + "step": 20145 + }, + { + "epoch": 0.11981396897897041, + "grad_norm": 1.7511320114135742, + "learning_rate": 4.8249958479480603e-05, + "loss": 4.622, + "step": 20146 + }, + { + "epoch": 0.1198199162622514, + "grad_norm": 1.8109928369522095, + "learning_rate": 4.824978678694618e-05, + "loss": 4.4156, + "step": 20147 + }, + { + "epoch": 0.1198258635455324, + "grad_norm": 1.474926471710205, + "learning_rate": 4.8249615086295494e-05, + "loss": 5.4845, + "step": 20148 + }, + { + "epoch": 0.1198318108288134, + "grad_norm": 1.8301719427108765, + "learning_rate": 4.824944337752861e-05, + "loss": 5.1814, + "step": 20149 + }, + { + "epoch": 0.11983775811209439, + "grad_norm": 1.8549950122833252, + "learning_rate": 4.824927166064559e-05, + "loss": 5.2944, + "step": 20150 + }, + { + "epoch": 0.1198437053953754, + "grad_norm": 1.7832791805267334, + "learning_rate": 4.8249099935646494e-05, + "loss": 5.7594, + "step": 20151 + }, + { + "epoch": 0.1198496526786564, + "grad_norm": 1.5706509351730347, + "learning_rate": 4.8248928202531366e-05, + "loss": 5.4607, + "step": 20152 + }, + { + "epoch": 0.11985559996193738, + "grad_norm": 1.6395286321640015, + "learning_rate": 4.824875646130028e-05, + "loss": 5.3338, + "step": 20153 + }, + { + "epoch": 0.11986154724521839, + "grad_norm": 1.9523805379867554, + "learning_rate": 4.824858471195329e-05, + "loss": 5.1205, + "step": 20154 + }, + { + "epoch": 0.11986749452849939, + "grad_norm": 2.45190691947937, + "learning_rate": 4.824841295449047e-05, + "loss": 4.5387, + "step": 20155 + }, + { + "epoch": 0.11987344181178038, + "grad_norm": 2.2806150913238525, + "learning_rate": 4.8248241188911856e-05, + "loss": 4.8134, + "step": 20156 + }, + { + "epoch": 0.11987938909506138, + "grad_norm": 2.230710029602051, + "learning_rate": 4.8248069415217534e-05, + "loss": 4.7386, + "step": 20157 + }, + { + "epoch": 0.11988533637834237, + "grad_norm": 2.13611102104187, + "learning_rate": 4.8247897633407546e-05, + "loss": 4.6519, + "step": 20158 + }, + { + "epoch": 0.11989128366162337, + "grad_norm": 1.7644202709197998, + "learning_rate": 4.824772584348196e-05, + "loss": 5.5343, + "step": 20159 + }, + { + "epoch": 0.11989723094490437, + "grad_norm": 1.8997445106506348, + "learning_rate": 4.824755404544083e-05, + "loss": 5.2135, + "step": 20160 + }, + { + "epoch": 0.11990317822818536, + "grad_norm": 1.8288135528564453, + "learning_rate": 4.824738223928421e-05, + "loss": 4.9554, + "step": 20161 + }, + { + "epoch": 0.11990912551146636, + "grad_norm": 1.795866847038269, + "learning_rate": 4.824721042501218e-05, + "loss": 5.6791, + "step": 20162 + }, + { + "epoch": 0.11991507279474736, + "grad_norm": 2.3721072673797607, + "learning_rate": 4.824703860262479e-05, + "loss": 5.4931, + "step": 20163 + }, + { + "epoch": 0.11992102007802835, + "grad_norm": 2.415207862854004, + "learning_rate": 4.824686677212209e-05, + "loss": 5.3801, + "step": 20164 + }, + { + "epoch": 0.11992696736130935, + "grad_norm": 2.411116600036621, + "learning_rate": 4.824669493350415e-05, + "loss": 5.1122, + "step": 20165 + }, + { + "epoch": 0.11993291464459035, + "grad_norm": 1.928256869316101, + "learning_rate": 4.824652308677104e-05, + "loss": 5.1627, + "step": 20166 + }, + { + "epoch": 0.11993886192787134, + "grad_norm": 1.9031376838684082, + "learning_rate": 4.8246351231922803e-05, + "loss": 5.014, + "step": 20167 + }, + { + "epoch": 0.11994480921115235, + "grad_norm": 1.8143563270568848, + "learning_rate": 4.82461793689595e-05, + "loss": 4.8921, + "step": 20168 + }, + { + "epoch": 0.11995075649443335, + "grad_norm": 1.7218538522720337, + "learning_rate": 4.824600749788121e-05, + "loss": 4.83, + "step": 20169 + }, + { + "epoch": 0.11995670377771434, + "grad_norm": 1.8235888481140137, + "learning_rate": 4.824583561868796e-05, + "loss": 5.0709, + "step": 20170 + }, + { + "epoch": 0.11996265106099534, + "grad_norm": 2.404656410217285, + "learning_rate": 4.8245663731379845e-05, + "loss": 4.7555, + "step": 20171 + }, + { + "epoch": 0.11996859834427634, + "grad_norm": 2.0463438034057617, + "learning_rate": 4.82454918359569e-05, + "loss": 5.2582, + "step": 20172 + }, + { + "epoch": 0.11997454562755733, + "grad_norm": 1.9073017835617065, + "learning_rate": 4.82453199324192e-05, + "loss": 5.794, + "step": 20173 + }, + { + "epoch": 0.11998049291083833, + "grad_norm": 1.856101632118225, + "learning_rate": 4.8245148020766796e-05, + "loss": 5.8569, + "step": 20174 + }, + { + "epoch": 0.11998644019411933, + "grad_norm": 1.6862335205078125, + "learning_rate": 4.8244976100999745e-05, + "loss": 5.7762, + "step": 20175 + }, + { + "epoch": 0.11999238747740032, + "grad_norm": 1.8727613687515259, + "learning_rate": 4.824480417311812e-05, + "loss": 5.5417, + "step": 20176 + }, + { + "epoch": 0.11999833476068132, + "grad_norm": 2.2967453002929688, + "learning_rate": 4.8244632237121964e-05, + "loss": 5.3268, + "step": 20177 + }, + { + "epoch": 0.12000428204396232, + "grad_norm": 2.1443405151367188, + "learning_rate": 4.824446029301136e-05, + "loss": 5.1333, + "step": 20178 + }, + { + "epoch": 0.12001022932724331, + "grad_norm": 1.7855141162872314, + "learning_rate": 4.824428834078635e-05, + "loss": 5.2781, + "step": 20179 + }, + { + "epoch": 0.12001617661052431, + "grad_norm": 1.880510926246643, + "learning_rate": 4.8244116380447e-05, + "loss": 5.1012, + "step": 20180 + }, + { + "epoch": 0.12002212389380532, + "grad_norm": 1.6733261346817017, + "learning_rate": 4.824394441199337e-05, + "loss": 5.3, + "step": 20181 + }, + { + "epoch": 0.1200280711770863, + "grad_norm": 1.781132459640503, + "learning_rate": 4.824377243542552e-05, + "loss": 5.7102, + "step": 20182 + }, + { + "epoch": 0.1200340184603673, + "grad_norm": 1.779144287109375, + "learning_rate": 4.82436004507435e-05, + "loss": 5.694, + "step": 20183 + }, + { + "epoch": 0.12003996574364831, + "grad_norm": 1.6547144651412964, + "learning_rate": 4.824342845794739e-05, + "loss": 5.4852, + "step": 20184 + }, + { + "epoch": 0.1200459130269293, + "grad_norm": 1.8403137922286987, + "learning_rate": 4.824325645703723e-05, + "loss": 5.9584, + "step": 20185 + }, + { + "epoch": 0.1200518603102103, + "grad_norm": 1.738139271736145, + "learning_rate": 4.8243084448013095e-05, + "loss": 5.903, + "step": 20186 + }, + { + "epoch": 0.12005780759349129, + "grad_norm": 1.7819492816925049, + "learning_rate": 4.824291243087504e-05, + "loss": 5.587, + "step": 20187 + }, + { + "epoch": 0.12006375487677229, + "grad_norm": 1.5876322984695435, + "learning_rate": 4.824274040562313e-05, + "loss": 5.1007, + "step": 20188 + }, + { + "epoch": 0.12006970216005329, + "grad_norm": 1.6465766429901123, + "learning_rate": 4.824256837225741e-05, + "loss": 4.9674, + "step": 20189 + }, + { + "epoch": 0.12007564944333428, + "grad_norm": 1.5593008995056152, + "learning_rate": 4.824239633077795e-05, + "loss": 4.8428, + "step": 20190 + }, + { + "epoch": 0.12008159672661528, + "grad_norm": 1.9153317213058472, + "learning_rate": 4.8242224281184814e-05, + "loss": 5.7613, + "step": 20191 + }, + { + "epoch": 0.12008754400989628, + "grad_norm": 1.727364182472229, + "learning_rate": 4.8242052223478055e-05, + "loss": 5.8612, + "step": 20192 + }, + { + "epoch": 0.12009349129317727, + "grad_norm": 1.567190408706665, + "learning_rate": 4.8241880157657736e-05, + "loss": 5.9975, + "step": 20193 + }, + { + "epoch": 0.12009943857645827, + "grad_norm": 1.549182415008545, + "learning_rate": 4.824170808372391e-05, + "loss": 5.9723, + "step": 20194 + }, + { + "epoch": 0.12010538585973927, + "grad_norm": 1.6152268648147583, + "learning_rate": 4.824153600167666e-05, + "loss": 5.9953, + "step": 20195 + }, + { + "epoch": 0.12011133314302026, + "grad_norm": 1.5206012725830078, + "learning_rate": 4.824136391151602e-05, + "loss": 5.7435, + "step": 20196 + }, + { + "epoch": 0.12011728042630127, + "grad_norm": 1.719746470451355, + "learning_rate": 4.824119181324206e-05, + "loss": 5.6181, + "step": 20197 + }, + { + "epoch": 0.12012322770958227, + "grad_norm": 1.53969407081604, + "learning_rate": 4.824101970685484e-05, + "loss": 5.2699, + "step": 20198 + }, + { + "epoch": 0.12012917499286326, + "grad_norm": 1.6543430089950562, + "learning_rate": 4.824084759235442e-05, + "loss": 5.3316, + "step": 20199 + }, + { + "epoch": 0.12013512227614426, + "grad_norm": 1.8182042837142944, + "learning_rate": 4.8240675469740856e-05, + "loss": 5.4494, + "step": 20200 + }, + { + "epoch": 0.12014106955942526, + "grad_norm": 1.5531221628189087, + "learning_rate": 4.824050333901422e-05, + "loss": 5.3292, + "step": 20201 + }, + { + "epoch": 0.12014701684270625, + "grad_norm": 1.4964851140975952, + "learning_rate": 4.8240331200174564e-05, + "loss": 5.391, + "step": 20202 + }, + { + "epoch": 0.12015296412598725, + "grad_norm": 1.5492072105407715, + "learning_rate": 4.824015905322195e-05, + "loss": 5.373, + "step": 20203 + }, + { + "epoch": 0.12015891140926825, + "grad_norm": 1.733115792274475, + "learning_rate": 4.823998689815643e-05, + "loss": 5.6997, + "step": 20204 + }, + { + "epoch": 0.12016485869254924, + "grad_norm": 1.8122310638427734, + "learning_rate": 4.8239814734978074e-05, + "loss": 5.4116, + "step": 20205 + }, + { + "epoch": 0.12017080597583024, + "grad_norm": 1.9058727025985718, + "learning_rate": 4.8239642563686934e-05, + "loss": 4.9749, + "step": 20206 + }, + { + "epoch": 0.12017675325911124, + "grad_norm": 1.5442882776260376, + "learning_rate": 4.823947038428308e-05, + "loss": 5.6342, + "step": 20207 + }, + { + "epoch": 0.12018270054239223, + "grad_norm": 1.5593653917312622, + "learning_rate": 4.823929819676657e-05, + "loss": 5.7084, + "step": 20208 + }, + { + "epoch": 0.12018864782567323, + "grad_norm": 1.5067681074142456, + "learning_rate": 4.823912600113746e-05, + "loss": 5.2455, + "step": 20209 + }, + { + "epoch": 0.12019459510895424, + "grad_norm": 1.7560538053512573, + "learning_rate": 4.82389537973958e-05, + "loss": 5.5733, + "step": 20210 + }, + { + "epoch": 0.12020054239223522, + "grad_norm": 1.6941232681274414, + "learning_rate": 4.823878158554167e-05, + "loss": 5.4642, + "step": 20211 + }, + { + "epoch": 0.12020648967551623, + "grad_norm": 1.531043529510498, + "learning_rate": 4.8238609365575124e-05, + "loss": 5.1859, + "step": 20212 + }, + { + "epoch": 0.12021243695879723, + "grad_norm": 1.8201080560684204, + "learning_rate": 4.823843713749622e-05, + "loss": 5.1331, + "step": 20213 + }, + { + "epoch": 0.12021838424207822, + "grad_norm": 1.6585347652435303, + "learning_rate": 4.823826490130501e-05, + "loss": 5.6017, + "step": 20214 + }, + { + "epoch": 0.12022433152535922, + "grad_norm": 1.7156457901000977, + "learning_rate": 4.8238092657001566e-05, + "loss": 5.4022, + "step": 20215 + }, + { + "epoch": 0.1202302788086402, + "grad_norm": 1.474266529083252, + "learning_rate": 4.823792040458595e-05, + "loss": 5.6352, + "step": 20216 + }, + { + "epoch": 0.12023622609192121, + "grad_norm": 1.4047836065292358, + "learning_rate": 4.8237748144058206e-05, + "loss": 5.7834, + "step": 20217 + }, + { + "epoch": 0.12024217337520221, + "grad_norm": 1.4172712564468384, + "learning_rate": 4.823757587541841e-05, + "loss": 5.7711, + "step": 20218 + }, + { + "epoch": 0.1202481206584832, + "grad_norm": 1.6180040836334229, + "learning_rate": 4.823740359866661e-05, + "loss": 4.9208, + "step": 20219 + }, + { + "epoch": 0.1202540679417642, + "grad_norm": 1.917434573173523, + "learning_rate": 4.8237231313802875e-05, + "loss": 5.0108, + "step": 20220 + }, + { + "epoch": 0.1202600152250452, + "grad_norm": 1.6807219982147217, + "learning_rate": 4.823705902082727e-05, + "loss": 4.8156, + "step": 20221 + }, + { + "epoch": 0.12026596250832619, + "grad_norm": 1.7759804725646973, + "learning_rate": 4.823688671973984e-05, + "loss": 4.9253, + "step": 20222 + }, + { + "epoch": 0.12027190979160719, + "grad_norm": 1.667723536491394, + "learning_rate": 4.8236714410540664e-05, + "loss": 5.3166, + "step": 20223 + }, + { + "epoch": 0.1202778570748882, + "grad_norm": 2.089888334274292, + "learning_rate": 4.823654209322977e-05, + "loss": 4.5147, + "step": 20224 + }, + { + "epoch": 0.12028380435816918, + "grad_norm": 1.878585934638977, + "learning_rate": 4.823636976780725e-05, + "loss": 5.2102, + "step": 20225 + }, + { + "epoch": 0.12028975164145018, + "grad_norm": 1.758644461631775, + "learning_rate": 4.8236197434273164e-05, + "loss": 5.7388, + "step": 20226 + }, + { + "epoch": 0.12029569892473119, + "grad_norm": 1.8373035192489624, + "learning_rate": 4.823602509262755e-05, + "loss": 5.0102, + "step": 20227 + }, + { + "epoch": 0.12030164620801218, + "grad_norm": 1.697994589805603, + "learning_rate": 4.8235852742870486e-05, + "loss": 4.8272, + "step": 20228 + }, + { + "epoch": 0.12030759349129318, + "grad_norm": 1.8276288509368896, + "learning_rate": 4.823568038500202e-05, + "loss": 5.2316, + "step": 20229 + }, + { + "epoch": 0.12031354077457418, + "grad_norm": 1.691236972808838, + "learning_rate": 4.823550801902222e-05, + "loss": 5.2957, + "step": 20230 + }, + { + "epoch": 0.12031948805785517, + "grad_norm": 1.5625227689743042, + "learning_rate": 4.823533564493115e-05, + "loss": 5.0525, + "step": 20231 + }, + { + "epoch": 0.12032543534113617, + "grad_norm": 1.927823543548584, + "learning_rate": 4.823516326272886e-05, + "loss": 5.1367, + "step": 20232 + }, + { + "epoch": 0.12033138262441717, + "grad_norm": 1.649434208869934, + "learning_rate": 4.823499087241541e-05, + "loss": 4.8151, + "step": 20233 + }, + { + "epoch": 0.12033732990769816, + "grad_norm": 1.660487413406372, + "learning_rate": 4.8234818473990866e-05, + "loss": 4.8875, + "step": 20234 + }, + { + "epoch": 0.12034327719097916, + "grad_norm": 1.584165096282959, + "learning_rate": 4.823464606745529e-05, + "loss": 5.4909, + "step": 20235 + }, + { + "epoch": 0.12034922447426016, + "grad_norm": 1.6812808513641357, + "learning_rate": 4.823447365280874e-05, + "loss": 5.1194, + "step": 20236 + }, + { + "epoch": 0.12035517175754115, + "grad_norm": 1.6096045970916748, + "learning_rate": 4.823430123005127e-05, + "loss": 4.974, + "step": 20237 + }, + { + "epoch": 0.12036111904082215, + "grad_norm": 1.9969391822814941, + "learning_rate": 4.8234128799182954e-05, + "loss": 4.4403, + "step": 20238 + }, + { + "epoch": 0.12036706632410316, + "grad_norm": 1.7902976274490356, + "learning_rate": 4.8233956360203836e-05, + "loss": 5.0718, + "step": 20239 + }, + { + "epoch": 0.12037301360738414, + "grad_norm": 1.7156457901000977, + "learning_rate": 4.8233783913113985e-05, + "loss": 5.0892, + "step": 20240 + }, + { + "epoch": 0.12037896089066515, + "grad_norm": 2.1590521335601807, + "learning_rate": 4.823361145791346e-05, + "loss": 5.3385, + "step": 20241 + }, + { + "epoch": 0.12038490817394615, + "grad_norm": 1.7091206312179565, + "learning_rate": 4.8233438994602325e-05, + "loss": 5.1961, + "step": 20242 + }, + { + "epoch": 0.12039085545722714, + "grad_norm": 1.3705766201019287, + "learning_rate": 4.823326652318063e-05, + "loss": 5.023, + "step": 20243 + }, + { + "epoch": 0.12039680274050814, + "grad_norm": 1.2733731269836426, + "learning_rate": 4.8233094043648456e-05, + "loss": 5.2236, + "step": 20244 + }, + { + "epoch": 0.12040275002378913, + "grad_norm": 1.3697882890701294, + "learning_rate": 4.823292155600583e-05, + "loss": 5.3146, + "step": 20245 + }, + { + "epoch": 0.12040869730707013, + "grad_norm": 1.4292283058166504, + "learning_rate": 4.8232749060252846e-05, + "loss": 5.2777, + "step": 20246 + }, + { + "epoch": 0.12041464459035113, + "grad_norm": 1.5285491943359375, + "learning_rate": 4.823257655638954e-05, + "loss": 5.3465, + "step": 20247 + }, + { + "epoch": 0.12042059187363212, + "grad_norm": 1.6307164430618286, + "learning_rate": 4.823240404441598e-05, + "loss": 5.2863, + "step": 20248 + }, + { + "epoch": 0.12042653915691312, + "grad_norm": 1.4112886190414429, + "learning_rate": 4.823223152433224e-05, + "loss": 5.3082, + "step": 20249 + }, + { + "epoch": 0.12043248644019412, + "grad_norm": 1.4699361324310303, + "learning_rate": 4.823205899613836e-05, + "loss": 5.2161, + "step": 20250 + }, + { + "epoch": 0.12043843372347511, + "grad_norm": 1.3991621732711792, + "learning_rate": 4.823188645983441e-05, + "loss": 5.2493, + "step": 20251 + }, + { + "epoch": 0.12044438100675611, + "grad_norm": 1.4673911333084106, + "learning_rate": 4.8231713915420446e-05, + "loss": 5.1592, + "step": 20252 + }, + { + "epoch": 0.12045032829003711, + "grad_norm": 1.3782176971435547, + "learning_rate": 4.8231541362896534e-05, + "loss": 5.3296, + "step": 20253 + }, + { + "epoch": 0.1204562755733181, + "grad_norm": 1.5209922790527344, + "learning_rate": 4.823136880226272e-05, + "loss": 5.4215, + "step": 20254 + }, + { + "epoch": 0.1204622228565991, + "grad_norm": 1.3906199932098389, + "learning_rate": 4.823119623351909e-05, + "loss": 5.2263, + "step": 20255 + }, + { + "epoch": 0.1204681701398801, + "grad_norm": 1.4061380624771118, + "learning_rate": 4.823102365666568e-05, + "loss": 5.2252, + "step": 20256 + }, + { + "epoch": 0.1204741174231611, + "grad_norm": 1.3005892038345337, + "learning_rate": 4.8230851071702564e-05, + "loss": 5.2015, + "step": 20257 + }, + { + "epoch": 0.1204800647064421, + "grad_norm": 1.4949315786361694, + "learning_rate": 4.8230678478629796e-05, + "loss": 4.9753, + "step": 20258 + }, + { + "epoch": 0.1204860119897231, + "grad_norm": 1.5322837829589844, + "learning_rate": 4.823050587744744e-05, + "loss": 5.1862, + "step": 20259 + }, + { + "epoch": 0.12049195927300409, + "grad_norm": 1.379016637802124, + "learning_rate": 4.8230333268155556e-05, + "loss": 5.0689, + "step": 20260 + }, + { + "epoch": 0.12049790655628509, + "grad_norm": 1.2959635257720947, + "learning_rate": 4.8230160650754205e-05, + "loss": 5.1079, + "step": 20261 + }, + { + "epoch": 0.12050385383956609, + "grad_norm": 1.3587706089019775, + "learning_rate": 4.8229988025243436e-05, + "loss": 5.2024, + "step": 20262 + }, + { + "epoch": 0.12050980112284708, + "grad_norm": 1.3031280040740967, + "learning_rate": 4.822981539162332e-05, + "loss": 5.1008, + "step": 20263 + }, + { + "epoch": 0.12051574840612808, + "grad_norm": 1.315364956855774, + "learning_rate": 4.822964274989392e-05, + "loss": 4.8122, + "step": 20264 + }, + { + "epoch": 0.12052169568940908, + "grad_norm": 1.3627794981002808, + "learning_rate": 4.8229470100055293e-05, + "loss": 5.0851, + "step": 20265 + }, + { + "epoch": 0.12052764297269007, + "grad_norm": 1.4490907192230225, + "learning_rate": 4.822929744210749e-05, + "loss": 4.7956, + "step": 20266 + }, + { + "epoch": 0.12053359025597107, + "grad_norm": 1.1658390760421753, + "learning_rate": 4.8229124776050584e-05, + "loss": 5.0365, + "step": 20267 + }, + { + "epoch": 0.12053953753925208, + "grad_norm": 1.2844047546386719, + "learning_rate": 4.822895210188463e-05, + "loss": 5.3005, + "step": 20268 + }, + { + "epoch": 0.12054548482253306, + "grad_norm": 1.5759227275848389, + "learning_rate": 4.822877941960969e-05, + "loss": 5.0768, + "step": 20269 + }, + { + "epoch": 0.12055143210581407, + "grad_norm": 1.457592248916626, + "learning_rate": 4.822860672922582e-05, + "loss": 5.1662, + "step": 20270 + }, + { + "epoch": 0.12055737938909507, + "grad_norm": 1.2711186408996582, + "learning_rate": 4.8228434030733086e-05, + "loss": 5.3703, + "step": 20271 + }, + { + "epoch": 0.12056332667237606, + "grad_norm": 1.300824522972107, + "learning_rate": 4.822826132413155e-05, + "loss": 5.2529, + "step": 20272 + }, + { + "epoch": 0.12056927395565706, + "grad_norm": 1.2395694255828857, + "learning_rate": 4.822808860942126e-05, + "loss": 5.3225, + "step": 20273 + }, + { + "epoch": 0.12057522123893805, + "grad_norm": 1.491053581237793, + "learning_rate": 4.822791588660229e-05, + "loss": 5.5039, + "step": 20274 + }, + { + "epoch": 0.12058116852221905, + "grad_norm": 1.4981472492218018, + "learning_rate": 4.8227743155674684e-05, + "loss": 4.8774, + "step": 20275 + }, + { + "epoch": 0.12058711580550005, + "grad_norm": 1.4627505540847778, + "learning_rate": 4.822757041663852e-05, + "loss": 4.9165, + "step": 20276 + }, + { + "epoch": 0.12059306308878104, + "grad_norm": 1.5328632593154907, + "learning_rate": 4.8227397669493856e-05, + "loss": 4.8773, + "step": 20277 + }, + { + "epoch": 0.12059901037206204, + "grad_norm": 1.314146876335144, + "learning_rate": 4.822722491424074e-05, + "loss": 5.0159, + "step": 20278 + }, + { + "epoch": 0.12060495765534304, + "grad_norm": 1.435636043548584, + "learning_rate": 4.822705215087925e-05, + "loss": 5.1621, + "step": 20279 + }, + { + "epoch": 0.12061090493862403, + "grad_norm": 1.3141332864761353, + "learning_rate": 4.822687937940943e-05, + "loss": 5.3143, + "step": 20280 + }, + { + "epoch": 0.12061685222190503, + "grad_norm": 1.3140829801559448, + "learning_rate": 4.822670659983134e-05, + "loss": 5.3171, + "step": 20281 + }, + { + "epoch": 0.12062279950518603, + "grad_norm": 1.5490076541900635, + "learning_rate": 4.8226533812145056e-05, + "loss": 5.1932, + "step": 20282 + }, + { + "epoch": 0.12062874678846702, + "grad_norm": 1.4878573417663574, + "learning_rate": 4.822636101635063e-05, + "loss": 5.1662, + "step": 20283 + }, + { + "epoch": 0.12063469407174802, + "grad_norm": 1.519872784614563, + "learning_rate": 4.822618821244811e-05, + "loss": 5.0641, + "step": 20284 + }, + { + "epoch": 0.12064064135502903, + "grad_norm": 1.430929183959961, + "learning_rate": 4.822601540043757e-05, + "loss": 4.9086, + "step": 20285 + }, + { + "epoch": 0.12064658863831002, + "grad_norm": 1.483995795249939, + "learning_rate": 4.822584258031908e-05, + "loss": 4.992, + "step": 20286 + }, + { + "epoch": 0.12065253592159102, + "grad_norm": 1.3074853420257568, + "learning_rate": 4.822566975209269e-05, + "loss": 4.9514, + "step": 20287 + }, + { + "epoch": 0.12065848320487202, + "grad_norm": 1.6032319068908691, + "learning_rate": 4.822549691575844e-05, + "loss": 4.8495, + "step": 20288 + }, + { + "epoch": 0.12066443048815301, + "grad_norm": 1.2918034791946411, + "learning_rate": 4.822532407131641e-05, + "loss": 5.0728, + "step": 20289 + }, + { + "epoch": 0.12067037777143401, + "grad_norm": 1.3000357151031494, + "learning_rate": 4.8225151218766675e-05, + "loss": 5.0898, + "step": 20290 + }, + { + "epoch": 0.12067632505471501, + "grad_norm": 1.3674614429473877, + "learning_rate": 4.8224978358109274e-05, + "loss": 4.8252, + "step": 20291 + }, + { + "epoch": 0.120682272337996, + "grad_norm": 1.1932893991470337, + "learning_rate": 4.822480548934427e-05, + "loss": 4.9946, + "step": 20292 + }, + { + "epoch": 0.120688219621277, + "grad_norm": 1.1052628755569458, + "learning_rate": 4.822463261247173e-05, + "loss": 5.0293, + "step": 20293 + }, + { + "epoch": 0.120694166904558, + "grad_norm": 1.1658306121826172, + "learning_rate": 4.82244597274917e-05, + "loss": 4.9417, + "step": 20294 + }, + { + "epoch": 0.12070011418783899, + "grad_norm": 1.1357192993164062, + "learning_rate": 4.822428683440426e-05, + "loss": 4.9448, + "step": 20295 + }, + { + "epoch": 0.12070606147112, + "grad_norm": 1.0769197940826416, + "learning_rate": 4.822411393320946e-05, + "loss": 4.8676, + "step": 20296 + }, + { + "epoch": 0.120712008754401, + "grad_norm": 1.4339419603347778, + "learning_rate": 4.8223941023907366e-05, + "loss": 5.0648, + "step": 20297 + }, + { + "epoch": 0.12071795603768198, + "grad_norm": 1.6009191274642944, + "learning_rate": 4.822376810649803e-05, + "loss": 5.2228, + "step": 20298 + }, + { + "epoch": 0.12072390332096299, + "grad_norm": 1.5266865491867065, + "learning_rate": 4.8223595180981515e-05, + "loss": 5.1399, + "step": 20299 + }, + { + "epoch": 0.12072985060424399, + "grad_norm": 1.6861037015914917, + "learning_rate": 4.822342224735788e-05, + "loss": 4.9326, + "step": 20300 + }, + { + "epoch": 0.12073579788752498, + "grad_norm": 1.4925029277801514, + "learning_rate": 4.8223249305627204e-05, + "loss": 4.9586, + "step": 20301 + }, + { + "epoch": 0.12074174517080598, + "grad_norm": 1.3088650703430176, + "learning_rate": 4.822307635578952e-05, + "loss": 5.1486, + "step": 20302 + }, + { + "epoch": 0.12074769245408697, + "grad_norm": 1.5702837705612183, + "learning_rate": 4.82229033978449e-05, + "loss": 4.788, + "step": 20303 + }, + { + "epoch": 0.12075363973736797, + "grad_norm": 1.5717079639434814, + "learning_rate": 4.8222730431793406e-05, + "loss": 4.6354, + "step": 20304 + }, + { + "epoch": 0.12075958702064897, + "grad_norm": 1.4520710706710815, + "learning_rate": 4.822255745763509e-05, + "loss": 4.6995, + "step": 20305 + }, + { + "epoch": 0.12076553430392996, + "grad_norm": 1.57894766330719, + "learning_rate": 4.822238447537003e-05, + "loss": 4.6355, + "step": 20306 + }, + { + "epoch": 0.12077148158721096, + "grad_norm": 1.5820640325546265, + "learning_rate": 4.822221148499827e-05, + "loss": 4.6993, + "step": 20307 + }, + { + "epoch": 0.12077742887049196, + "grad_norm": 1.5759177207946777, + "learning_rate": 4.822203848651987e-05, + "loss": 4.5678, + "step": 20308 + }, + { + "epoch": 0.12078337615377295, + "grad_norm": 1.5758824348449707, + "learning_rate": 4.822186547993491e-05, + "loss": 4.547, + "step": 20309 + }, + { + "epoch": 0.12078932343705395, + "grad_norm": 1.6604961156845093, + "learning_rate": 4.822169246524343e-05, + "loss": 4.6418, + "step": 20310 + }, + { + "epoch": 0.12079527072033495, + "grad_norm": 1.6913725137710571, + "learning_rate": 4.8221519442445496e-05, + "loss": 4.5329, + "step": 20311 + }, + { + "epoch": 0.12080121800361594, + "grad_norm": 1.6500364542007446, + "learning_rate": 4.822134641154117e-05, + "loss": 4.6701, + "step": 20312 + }, + { + "epoch": 0.12080716528689694, + "grad_norm": 1.6819617748260498, + "learning_rate": 4.822117337253051e-05, + "loss": 4.619, + "step": 20313 + }, + { + "epoch": 0.12081311257017795, + "grad_norm": 1.27179753780365, + "learning_rate": 4.8221000325413576e-05, + "loss": 5.091, + "step": 20314 + }, + { + "epoch": 0.12081905985345893, + "grad_norm": 1.357703447341919, + "learning_rate": 4.822082727019044e-05, + "loss": 4.9313, + "step": 20315 + }, + { + "epoch": 0.12082500713673994, + "grad_norm": 1.2419538497924805, + "learning_rate": 4.8220654206861144e-05, + "loss": 4.9511, + "step": 20316 + }, + { + "epoch": 0.12083095442002094, + "grad_norm": 1.4506672620773315, + "learning_rate": 4.822048113542576e-05, + "loss": 5.1608, + "step": 20317 + }, + { + "epoch": 0.12083690170330193, + "grad_norm": 1.597922921180725, + "learning_rate": 4.8220308055884345e-05, + "loss": 5.1663, + "step": 20318 + }, + { + "epoch": 0.12084284898658293, + "grad_norm": 1.2692219018936157, + "learning_rate": 4.822013496823696e-05, + "loss": 5.0838, + "step": 20319 + }, + { + "epoch": 0.12084879626986393, + "grad_norm": 1.427439570426941, + "learning_rate": 4.8219961872483674e-05, + "loss": 4.9863, + "step": 20320 + }, + { + "epoch": 0.12085474355314492, + "grad_norm": 1.3992658853530884, + "learning_rate": 4.821978876862453e-05, + "loss": 5.1907, + "step": 20321 + }, + { + "epoch": 0.12086069083642592, + "grad_norm": 1.3777414560317993, + "learning_rate": 4.8219615656659605e-05, + "loss": 5.137, + "step": 20322 + }, + { + "epoch": 0.12086663811970692, + "grad_norm": 1.3394333124160767, + "learning_rate": 4.821944253658895e-05, + "loss": 5.1222, + "step": 20323 + }, + { + "epoch": 0.12087258540298791, + "grad_norm": 1.3054091930389404, + "learning_rate": 4.8219269408412625e-05, + "loss": 4.9626, + "step": 20324 + }, + { + "epoch": 0.12087853268626891, + "grad_norm": 1.3209751844406128, + "learning_rate": 4.8219096272130696e-05, + "loss": 5.0408, + "step": 20325 + }, + { + "epoch": 0.12088447996954992, + "grad_norm": 1.3860117197036743, + "learning_rate": 4.821892312774322e-05, + "loss": 4.9667, + "step": 20326 + }, + { + "epoch": 0.1208904272528309, + "grad_norm": 1.2468161582946777, + "learning_rate": 4.821874997525025e-05, + "loss": 5.1203, + "step": 20327 + }, + { + "epoch": 0.1208963745361119, + "grad_norm": 1.221932291984558, + "learning_rate": 4.821857681465186e-05, + "loss": 4.9117, + "step": 20328 + }, + { + "epoch": 0.12090232181939291, + "grad_norm": 1.2188096046447754, + "learning_rate": 4.8218403645948105e-05, + "loss": 4.7647, + "step": 20329 + }, + { + "epoch": 0.1209082691026739, + "grad_norm": 1.4023007154464722, + "learning_rate": 4.8218230469139044e-05, + "loss": 4.9038, + "step": 20330 + }, + { + "epoch": 0.1209142163859549, + "grad_norm": 1.4733843803405762, + "learning_rate": 4.821805728422474e-05, + "loss": 4.9782, + "step": 20331 + }, + { + "epoch": 0.12092016366923589, + "grad_norm": 1.405462384223938, + "learning_rate": 4.821788409120525e-05, + "loss": 5.0028, + "step": 20332 + }, + { + "epoch": 0.12092611095251689, + "grad_norm": 1.4103752374649048, + "learning_rate": 4.821771089008064e-05, + "loss": 4.8219, + "step": 20333 + }, + { + "epoch": 0.12093205823579789, + "grad_norm": 1.403225064277649, + "learning_rate": 4.821753768085096e-05, + "loss": 4.9024, + "step": 20334 + }, + { + "epoch": 0.12093800551907888, + "grad_norm": 1.3480467796325684, + "learning_rate": 4.821736446351629e-05, + "loss": 4.9341, + "step": 20335 + }, + { + "epoch": 0.12094395280235988, + "grad_norm": 1.4869621992111206, + "learning_rate": 4.821719123807667e-05, + "loss": 5.6448, + "step": 20336 + }, + { + "epoch": 0.12094990008564088, + "grad_norm": 1.3473197221755981, + "learning_rate": 4.821701800453217e-05, + "loss": 4.9512, + "step": 20337 + }, + { + "epoch": 0.12095584736892187, + "grad_norm": 1.378721833229065, + "learning_rate": 4.821684476288285e-05, + "loss": 5.0146, + "step": 20338 + }, + { + "epoch": 0.12096179465220287, + "grad_norm": 1.2590171098709106, + "learning_rate": 4.821667151312876e-05, + "loss": 4.8453, + "step": 20339 + }, + { + "epoch": 0.12096774193548387, + "grad_norm": 1.3700976371765137, + "learning_rate": 4.821649825526998e-05, + "loss": 4.989, + "step": 20340 + }, + { + "epoch": 0.12097368921876486, + "grad_norm": 1.2956781387329102, + "learning_rate": 4.821632498930656e-05, + "loss": 5.1885, + "step": 20341 + }, + { + "epoch": 0.12097963650204586, + "grad_norm": 1.5004302263259888, + "learning_rate": 4.821615171523856e-05, + "loss": 4.8978, + "step": 20342 + }, + { + "epoch": 0.12098558378532687, + "grad_norm": 1.4427284002304077, + "learning_rate": 4.821597843306603e-05, + "loss": 5.0771, + "step": 20343 + }, + { + "epoch": 0.12099153106860785, + "grad_norm": 1.2329649925231934, + "learning_rate": 4.8215805142789054e-05, + "loss": 5.1695, + "step": 20344 + }, + { + "epoch": 0.12099747835188886, + "grad_norm": 1.521106243133545, + "learning_rate": 4.8215631844407685e-05, + "loss": 4.8117, + "step": 20345 + }, + { + "epoch": 0.12100342563516986, + "grad_norm": 1.4634170532226562, + "learning_rate": 4.8215458537921966e-05, + "loss": 4.8144, + "step": 20346 + }, + { + "epoch": 0.12100937291845085, + "grad_norm": 1.2921918630599976, + "learning_rate": 4.821528522333197e-05, + "loss": 5.0769, + "step": 20347 + }, + { + "epoch": 0.12101532020173185, + "grad_norm": 1.5667484998703003, + "learning_rate": 4.821511190063777e-05, + "loss": 4.7748, + "step": 20348 + }, + { + "epoch": 0.12102126748501285, + "grad_norm": 1.3546236753463745, + "learning_rate": 4.8214938569839405e-05, + "loss": 5.0496, + "step": 20349 + }, + { + "epoch": 0.12102721476829384, + "grad_norm": 1.354236125946045, + "learning_rate": 4.821476523093695e-05, + "loss": 4.9173, + "step": 20350 + }, + { + "epoch": 0.12103316205157484, + "grad_norm": 1.3883708715438843, + "learning_rate": 4.821459188393046e-05, + "loss": 5.0093, + "step": 20351 + }, + { + "epoch": 0.12103910933485584, + "grad_norm": 1.5914138555526733, + "learning_rate": 4.8214418528819995e-05, + "loss": 4.7995, + "step": 20352 + }, + { + "epoch": 0.12104505661813683, + "grad_norm": 1.3804936408996582, + "learning_rate": 4.821424516560561e-05, + "loss": 5.0071, + "step": 20353 + }, + { + "epoch": 0.12105100390141783, + "grad_norm": 1.4783899784088135, + "learning_rate": 4.8214071794287376e-05, + "loss": 4.9744, + "step": 20354 + }, + { + "epoch": 0.12105695118469884, + "grad_norm": 1.480790376663208, + "learning_rate": 4.821389841486535e-05, + "loss": 4.9975, + "step": 20355 + }, + { + "epoch": 0.12106289846797982, + "grad_norm": 1.852853536605835, + "learning_rate": 4.82137250273396e-05, + "loss": 5.069, + "step": 20356 + }, + { + "epoch": 0.12106884575126083, + "grad_norm": 1.623017430305481, + "learning_rate": 4.821355163171016e-05, + "loss": 4.9939, + "step": 20357 + }, + { + "epoch": 0.12107479303454183, + "grad_norm": 1.526219367980957, + "learning_rate": 4.8213378227977123e-05, + "loss": 5.1281, + "step": 20358 + }, + { + "epoch": 0.12108074031782282, + "grad_norm": 1.574321985244751, + "learning_rate": 4.8213204816140536e-05, + "loss": 5.0241, + "step": 20359 + }, + { + "epoch": 0.12108668760110382, + "grad_norm": 1.5978455543518066, + "learning_rate": 4.8213031396200446e-05, + "loss": 5.0107, + "step": 20360 + }, + { + "epoch": 0.12109263488438482, + "grad_norm": 1.509109616279602, + "learning_rate": 4.821285796815694e-05, + "loss": 5.0056, + "step": 20361 + }, + { + "epoch": 0.12109858216766581, + "grad_norm": 1.4923186302185059, + "learning_rate": 4.8212684532010054e-05, + "loss": 5.0412, + "step": 20362 + }, + { + "epoch": 0.12110452945094681, + "grad_norm": 1.7046619653701782, + "learning_rate": 4.8212511087759874e-05, + "loss": 4.8996, + "step": 20363 + }, + { + "epoch": 0.1211104767342278, + "grad_norm": 1.7599172592163086, + "learning_rate": 4.8212337635406435e-05, + "loss": 4.9979, + "step": 20364 + }, + { + "epoch": 0.1211164240175088, + "grad_norm": 1.6309099197387695, + "learning_rate": 4.821216417494982e-05, + "loss": 4.9639, + "step": 20365 + }, + { + "epoch": 0.1211223713007898, + "grad_norm": 1.7311389446258545, + "learning_rate": 4.821199070639006e-05, + "loss": 4.9296, + "step": 20366 + }, + { + "epoch": 0.12112831858407079, + "grad_norm": 1.480536699295044, + "learning_rate": 4.8211817229727246e-05, + "loss": 4.9338, + "step": 20367 + }, + { + "epoch": 0.12113426586735179, + "grad_norm": 1.4267778396606445, + "learning_rate": 4.821164374496143e-05, + "loss": 4.8954, + "step": 20368 + }, + { + "epoch": 0.1211402131506328, + "grad_norm": 1.3726919889450073, + "learning_rate": 4.821147025209266e-05, + "loss": 4.8362, + "step": 20369 + }, + { + "epoch": 0.12114616043391378, + "grad_norm": 1.5158253908157349, + "learning_rate": 4.821129675112101e-05, + "loss": 5.0629, + "step": 20370 + }, + { + "epoch": 0.12115210771719478, + "grad_norm": 1.2002782821655273, + "learning_rate": 4.8211123242046535e-05, + "loss": 4.7668, + "step": 20371 + }, + { + "epoch": 0.12115805500047579, + "grad_norm": 1.123113751411438, + "learning_rate": 4.821094972486929e-05, + "loss": 5.0103, + "step": 20372 + }, + { + "epoch": 0.12116400228375677, + "grad_norm": 1.360532283782959, + "learning_rate": 4.821077619958936e-05, + "loss": 5.0503, + "step": 20373 + }, + { + "epoch": 0.12116994956703778, + "grad_norm": 1.3912672996520996, + "learning_rate": 4.821060266620677e-05, + "loss": 4.9326, + "step": 20374 + }, + { + "epoch": 0.12117589685031878, + "grad_norm": 1.2644896507263184, + "learning_rate": 4.821042912472161e-05, + "loss": 4.9584, + "step": 20375 + }, + { + "epoch": 0.12118184413359977, + "grad_norm": 1.1967086791992188, + "learning_rate": 4.821025557513392e-05, + "loss": 4.8954, + "step": 20376 + }, + { + "epoch": 0.12118779141688077, + "grad_norm": 1.353725552558899, + "learning_rate": 4.821008201744378e-05, + "loss": 4.8438, + "step": 20377 + }, + { + "epoch": 0.12119373870016177, + "grad_norm": 1.239682912826538, + "learning_rate": 4.820990845165123e-05, + "loss": 4.9624, + "step": 20378 + }, + { + "epoch": 0.12119968598344276, + "grad_norm": 1.1952159404754639, + "learning_rate": 4.820973487775634e-05, + "loss": 4.9254, + "step": 20379 + }, + { + "epoch": 0.12120563326672376, + "grad_norm": 1.4531627893447876, + "learning_rate": 4.820956129575918e-05, + "loss": 4.9487, + "step": 20380 + }, + { + "epoch": 0.12121158055000476, + "grad_norm": 1.2653759717941284, + "learning_rate": 4.8209387705659805e-05, + "loss": 4.7916, + "step": 20381 + }, + { + "epoch": 0.12121752783328575, + "grad_norm": 1.3156383037567139, + "learning_rate": 4.820921410745826e-05, + "loss": 5.0585, + "step": 20382 + }, + { + "epoch": 0.12122347511656675, + "grad_norm": 1.536216139793396, + "learning_rate": 4.820904050115462e-05, + "loss": 4.849, + "step": 20383 + }, + { + "epoch": 0.12122942239984776, + "grad_norm": 1.6567318439483643, + "learning_rate": 4.820886688674895e-05, + "loss": 4.6508, + "step": 20384 + }, + { + "epoch": 0.12123536968312874, + "grad_norm": 1.4173903465270996, + "learning_rate": 4.82086932642413e-05, + "loss": 4.8919, + "step": 20385 + }, + { + "epoch": 0.12124131696640975, + "grad_norm": 1.4352593421936035, + "learning_rate": 4.820851963363174e-05, + "loss": 4.7546, + "step": 20386 + }, + { + "epoch": 0.12124726424969075, + "grad_norm": 1.538988471031189, + "learning_rate": 4.8208345994920326e-05, + "loss": 4.7707, + "step": 20387 + }, + { + "epoch": 0.12125321153297174, + "grad_norm": 1.3959681987762451, + "learning_rate": 4.820817234810711e-05, + "loss": 4.5633, + "step": 20388 + }, + { + "epoch": 0.12125915881625274, + "grad_norm": 1.3972582817077637, + "learning_rate": 4.820799869319217e-05, + "loss": 4.5165, + "step": 20389 + }, + { + "epoch": 0.12126510609953374, + "grad_norm": 1.770070195198059, + "learning_rate": 4.820782503017555e-05, + "loss": 4.9679, + "step": 20390 + }, + { + "epoch": 0.12127105338281473, + "grad_norm": 1.6822887659072876, + "learning_rate": 4.820765135905732e-05, + "loss": 4.9589, + "step": 20391 + }, + { + "epoch": 0.12127700066609573, + "grad_norm": 1.6352055072784424, + "learning_rate": 4.820747767983754e-05, + "loss": 5.0389, + "step": 20392 + }, + { + "epoch": 0.12128294794937672, + "grad_norm": 1.4803529977798462, + "learning_rate": 4.8207303992516274e-05, + "loss": 5.1071, + "step": 20393 + }, + { + "epoch": 0.12128889523265772, + "grad_norm": 1.5575767755508423, + "learning_rate": 4.820713029709357e-05, + "loss": 5.2072, + "step": 20394 + }, + { + "epoch": 0.12129484251593872, + "grad_norm": 1.3417006731033325, + "learning_rate": 4.82069565935695e-05, + "loss": 5.1991, + "step": 20395 + }, + { + "epoch": 0.12130078979921971, + "grad_norm": 1.3577461242675781, + "learning_rate": 4.820678288194412e-05, + "loss": 5.3225, + "step": 20396 + }, + { + "epoch": 0.12130673708250071, + "grad_norm": 1.3763153553009033, + "learning_rate": 4.8206609162217494e-05, + "loss": 5.0247, + "step": 20397 + }, + { + "epoch": 0.12131268436578171, + "grad_norm": 1.7175389528274536, + "learning_rate": 4.8206435434389675e-05, + "loss": 5.2964, + "step": 20398 + }, + { + "epoch": 0.1213186316490627, + "grad_norm": 1.4921340942382812, + "learning_rate": 4.820626169846073e-05, + "loss": 4.781, + "step": 20399 + }, + { + "epoch": 0.1213245789323437, + "grad_norm": 1.3149629831314087, + "learning_rate": 4.8206087954430725e-05, + "loss": 5.2148, + "step": 20400 + }, + { + "epoch": 0.1213305262156247, + "grad_norm": 1.5960938930511475, + "learning_rate": 4.8205914202299715e-05, + "loss": 5.4152, + "step": 20401 + }, + { + "epoch": 0.1213364734989057, + "grad_norm": 1.4101301431655884, + "learning_rate": 4.8205740442067757e-05, + "loss": 5.2033, + "step": 20402 + }, + { + "epoch": 0.1213424207821867, + "grad_norm": 1.2584593296051025, + "learning_rate": 4.820556667373492e-05, + "loss": 5.1183, + "step": 20403 + }, + { + "epoch": 0.1213483680654677, + "grad_norm": 1.365639090538025, + "learning_rate": 4.820539289730125e-05, + "loss": 4.9446, + "step": 20404 + }, + { + "epoch": 0.12135431534874869, + "grad_norm": 1.6515495777130127, + "learning_rate": 4.820521911276682e-05, + "loss": 4.9002, + "step": 20405 + }, + { + "epoch": 0.12136026263202969, + "grad_norm": 1.2481954097747803, + "learning_rate": 4.8205045320131684e-05, + "loss": 5.3332, + "step": 20406 + }, + { + "epoch": 0.12136620991531069, + "grad_norm": 1.4952552318572998, + "learning_rate": 4.820487151939591e-05, + "loss": 4.7492, + "step": 20407 + }, + { + "epoch": 0.12137215719859168, + "grad_norm": 1.4472140073776245, + "learning_rate": 4.8204697710559556e-05, + "loss": 5.158, + "step": 20408 + }, + { + "epoch": 0.12137810448187268, + "grad_norm": 1.2544384002685547, + "learning_rate": 4.8204523893622685e-05, + "loss": 5.2041, + "step": 20409 + }, + { + "epoch": 0.12138405176515368, + "grad_norm": 1.1283172369003296, + "learning_rate": 4.820435006858535e-05, + "loss": 5.246, + "step": 20410 + }, + { + "epoch": 0.12138999904843467, + "grad_norm": 1.1113736629486084, + "learning_rate": 4.8204176235447617e-05, + "loss": 5.2116, + "step": 20411 + }, + { + "epoch": 0.12139594633171567, + "grad_norm": 1.2103666067123413, + "learning_rate": 4.820400239420955e-05, + "loss": 5.4421, + "step": 20412 + }, + { + "epoch": 0.12140189361499668, + "grad_norm": 1.2054588794708252, + "learning_rate": 4.82038285448712e-05, + "loss": 5.2503, + "step": 20413 + }, + { + "epoch": 0.12140784089827766, + "grad_norm": 1.568247675895691, + "learning_rate": 4.820365468743263e-05, + "loss": 4.9009, + "step": 20414 + }, + { + "epoch": 0.12141378818155867, + "grad_norm": 1.7106029987335205, + "learning_rate": 4.820348082189391e-05, + "loss": 4.7826, + "step": 20415 + }, + { + "epoch": 0.12141973546483967, + "grad_norm": 1.4479119777679443, + "learning_rate": 4.8203306948255095e-05, + "loss": 5.0084, + "step": 20416 + }, + { + "epoch": 0.12142568274812066, + "grad_norm": 1.467880368232727, + "learning_rate": 4.820313306651624e-05, + "loss": 5.2477, + "step": 20417 + }, + { + "epoch": 0.12143163003140166, + "grad_norm": 1.4088891744613647, + "learning_rate": 4.820295917667742e-05, + "loss": 5.1151, + "step": 20418 + }, + { + "epoch": 0.12143757731468266, + "grad_norm": 1.2838404178619385, + "learning_rate": 4.820278527873868e-05, + "loss": 5.6774, + "step": 20419 + }, + { + "epoch": 0.12144352459796365, + "grad_norm": 1.2146633863449097, + "learning_rate": 4.820261137270009e-05, + "loss": 5.0487, + "step": 20420 + }, + { + "epoch": 0.12144947188124465, + "grad_norm": 1.5603777170181274, + "learning_rate": 4.820243745856171e-05, + "loss": 5.0569, + "step": 20421 + }, + { + "epoch": 0.12145541916452564, + "grad_norm": 1.4454957246780396, + "learning_rate": 4.8202263536323586e-05, + "loss": 4.9556, + "step": 20422 + }, + { + "epoch": 0.12146136644780664, + "grad_norm": 1.4732788801193237, + "learning_rate": 4.820208960598581e-05, + "loss": 5.0095, + "step": 20423 + }, + { + "epoch": 0.12146731373108764, + "grad_norm": 1.4078243970870972, + "learning_rate": 4.820191566754841e-05, + "loss": 5.2642, + "step": 20424 + }, + { + "epoch": 0.12147326101436863, + "grad_norm": 1.2497223615646362, + "learning_rate": 4.820174172101147e-05, + "loss": 5.0792, + "step": 20425 + }, + { + "epoch": 0.12147920829764963, + "grad_norm": 1.5479954481124878, + "learning_rate": 4.8201567766375034e-05, + "loss": 4.9157, + "step": 20426 + }, + { + "epoch": 0.12148515558093063, + "grad_norm": 1.4266546964645386, + "learning_rate": 4.8201393803639175e-05, + "loss": 4.8776, + "step": 20427 + }, + { + "epoch": 0.12149110286421162, + "grad_norm": 1.3757798671722412, + "learning_rate": 4.8201219832803946e-05, + "loss": 4.8253, + "step": 20428 + }, + { + "epoch": 0.12149705014749262, + "grad_norm": 1.3386640548706055, + "learning_rate": 4.8201045853869416e-05, + "loss": 4.7895, + "step": 20429 + }, + { + "epoch": 0.12150299743077363, + "grad_norm": 1.3968008756637573, + "learning_rate": 4.820087186683564e-05, + "loss": 4.7238, + "step": 20430 + }, + { + "epoch": 0.12150894471405461, + "grad_norm": 1.4070801734924316, + "learning_rate": 4.820069787170267e-05, + "loss": 4.9614, + "step": 20431 + }, + { + "epoch": 0.12151489199733562, + "grad_norm": 1.4280625581741333, + "learning_rate": 4.820052386847059e-05, + "loss": 4.6765, + "step": 20432 + }, + { + "epoch": 0.12152083928061662, + "grad_norm": 1.3841910362243652, + "learning_rate": 4.820034985713944e-05, + "loss": 4.8008, + "step": 20433 + }, + { + "epoch": 0.1215267865638976, + "grad_norm": 1.3555341958999634, + "learning_rate": 4.820017583770928e-05, + "loss": 4.7967, + "step": 20434 + }, + { + "epoch": 0.12153273384717861, + "grad_norm": 1.3651773929595947, + "learning_rate": 4.820000181018019e-05, + "loss": 4.9003, + "step": 20435 + }, + { + "epoch": 0.12153868113045961, + "grad_norm": 1.364749789237976, + "learning_rate": 4.8199827774552215e-05, + "loss": 4.9834, + "step": 20436 + }, + { + "epoch": 0.1215446284137406, + "grad_norm": 1.384041428565979, + "learning_rate": 4.8199653730825424e-05, + "loss": 4.9997, + "step": 20437 + }, + { + "epoch": 0.1215505756970216, + "grad_norm": 1.544909954071045, + "learning_rate": 4.8199479678999867e-05, + "loss": 4.7909, + "step": 20438 + }, + { + "epoch": 0.1215565229803026, + "grad_norm": 1.4012216329574585, + "learning_rate": 4.819930561907561e-05, + "loss": 4.7359, + "step": 20439 + }, + { + "epoch": 0.12156247026358359, + "grad_norm": 1.2876297235488892, + "learning_rate": 4.819913155105272e-05, + "loss": 4.5025, + "step": 20440 + }, + { + "epoch": 0.1215684175468646, + "grad_norm": 1.5008763074874878, + "learning_rate": 4.819895747493125e-05, + "loss": 4.4486, + "step": 20441 + }, + { + "epoch": 0.1215743648301456, + "grad_norm": 1.3206987380981445, + "learning_rate": 4.8198783390711264e-05, + "loss": 4.4767, + "step": 20442 + }, + { + "epoch": 0.12158031211342658, + "grad_norm": 1.3569231033325195, + "learning_rate": 4.819860929839283e-05, + "loss": 4.6665, + "step": 20443 + }, + { + "epoch": 0.12158625939670759, + "grad_norm": 1.3377808332443237, + "learning_rate": 4.8198435197975986e-05, + "loss": 4.6109, + "step": 20444 + }, + { + "epoch": 0.12159220667998859, + "grad_norm": 1.5400346517562866, + "learning_rate": 4.8198261089460824e-05, + "loss": 4.2635, + "step": 20445 + }, + { + "epoch": 0.12159815396326958, + "grad_norm": 1.6329059600830078, + "learning_rate": 4.8198086972847376e-05, + "loss": 4.3745, + "step": 20446 + }, + { + "epoch": 0.12160410124655058, + "grad_norm": 1.4274183511734009, + "learning_rate": 4.819791284813573e-05, + "loss": 4.4103, + "step": 20447 + }, + { + "epoch": 0.12161004852983158, + "grad_norm": 1.4671530723571777, + "learning_rate": 4.8197738715325916e-05, + "loss": 4.3995, + "step": 20448 + }, + { + "epoch": 0.12161599581311257, + "grad_norm": 1.3783891201019287, + "learning_rate": 4.819756457441802e-05, + "loss": 4.3874, + "step": 20449 + }, + { + "epoch": 0.12162194309639357, + "grad_norm": 1.4054951667785645, + "learning_rate": 4.819739042541209e-05, + "loss": 4.3307, + "step": 20450 + }, + { + "epoch": 0.12162789037967456, + "grad_norm": 1.5449576377868652, + "learning_rate": 4.81972162683082e-05, + "loss": 4.8499, + "step": 20451 + }, + { + "epoch": 0.12163383766295556, + "grad_norm": 1.3887544870376587, + "learning_rate": 4.8197042103106394e-05, + "loss": 4.622, + "step": 20452 + }, + { + "epoch": 0.12163978494623656, + "grad_norm": 1.319422960281372, + "learning_rate": 4.819686792980673e-05, + "loss": 4.5172, + "step": 20453 + }, + { + "epoch": 0.12164573222951755, + "grad_norm": 1.3681663274765015, + "learning_rate": 4.8196693748409296e-05, + "loss": 4.8121, + "step": 20454 + }, + { + "epoch": 0.12165167951279855, + "grad_norm": 1.250482439994812, + "learning_rate": 4.819651955891413e-05, + "loss": 4.8792, + "step": 20455 + }, + { + "epoch": 0.12165762679607955, + "grad_norm": 1.3297876119613647, + "learning_rate": 4.819634536132129e-05, + "loss": 5.1069, + "step": 20456 + }, + { + "epoch": 0.12166357407936054, + "grad_norm": 1.3733534812927246, + "learning_rate": 4.819617115563086e-05, + "loss": 4.6061, + "step": 20457 + }, + { + "epoch": 0.12166952136264154, + "grad_norm": 1.287663459777832, + "learning_rate": 4.819599694184288e-05, + "loss": 4.9407, + "step": 20458 + }, + { + "epoch": 0.12167546864592255, + "grad_norm": 1.4198147058486938, + "learning_rate": 4.8195822719957416e-05, + "loss": 4.5361, + "step": 20459 + }, + { + "epoch": 0.12168141592920353, + "grad_norm": 1.7429990768432617, + "learning_rate": 4.819564848997453e-05, + "loss": 4.6604, + "step": 20460 + }, + { + "epoch": 0.12168736321248454, + "grad_norm": 1.4298913478851318, + "learning_rate": 4.819547425189429e-05, + "loss": 4.7415, + "step": 20461 + }, + { + "epoch": 0.12169331049576554, + "grad_norm": 1.3519923686981201, + "learning_rate": 4.8195300005716736e-05, + "loss": 5.3706, + "step": 20462 + }, + { + "epoch": 0.12169925777904653, + "grad_norm": 1.1476925611495972, + "learning_rate": 4.819512575144195e-05, + "loss": 5.4474, + "step": 20463 + }, + { + "epoch": 0.12170520506232753, + "grad_norm": 1.2756370306015015, + "learning_rate": 4.819495148906999e-05, + "loss": 4.9747, + "step": 20464 + }, + { + "epoch": 0.12171115234560853, + "grad_norm": 1.3161675930023193, + "learning_rate": 4.8194777218600906e-05, + "loss": 4.7093, + "step": 20465 + }, + { + "epoch": 0.12171709962888952, + "grad_norm": 1.4928854703903198, + "learning_rate": 4.8194602940034766e-05, + "loss": 4.7517, + "step": 20466 + }, + { + "epoch": 0.12172304691217052, + "grad_norm": 1.426684856414795, + "learning_rate": 4.819442865337163e-05, + "loss": 4.8639, + "step": 20467 + }, + { + "epoch": 0.12172899419545152, + "grad_norm": 1.368988037109375, + "learning_rate": 4.819425435861156e-05, + "loss": 4.8532, + "step": 20468 + }, + { + "epoch": 0.12173494147873251, + "grad_norm": 1.492031455039978, + "learning_rate": 4.819408005575461e-05, + "loss": 4.5139, + "step": 20469 + }, + { + "epoch": 0.12174088876201351, + "grad_norm": 1.6340793371200562, + "learning_rate": 4.819390574480085e-05, + "loss": 4.4042, + "step": 20470 + }, + { + "epoch": 0.12174683604529452, + "grad_norm": 1.5353302955627441, + "learning_rate": 4.819373142575034e-05, + "loss": 5.1097, + "step": 20471 + }, + { + "epoch": 0.1217527833285755, + "grad_norm": 1.5314761400222778, + "learning_rate": 4.8193557098603134e-05, + "loss": 4.7689, + "step": 20472 + }, + { + "epoch": 0.1217587306118565, + "grad_norm": 1.4626027345657349, + "learning_rate": 4.8193382763359295e-05, + "loss": 4.434, + "step": 20473 + }, + { + "epoch": 0.12176467789513751, + "grad_norm": 1.621871829032898, + "learning_rate": 4.8193208420018885e-05, + "loss": 4.5098, + "step": 20474 + }, + { + "epoch": 0.1217706251784185, + "grad_norm": 1.5429425239562988, + "learning_rate": 4.819303406858198e-05, + "loss": 4.4547, + "step": 20475 + }, + { + "epoch": 0.1217765724616995, + "grad_norm": 1.5002613067626953, + "learning_rate": 4.819285970904861e-05, + "loss": 4.6906, + "step": 20476 + }, + { + "epoch": 0.1217825197449805, + "grad_norm": 1.2322206497192383, + "learning_rate": 4.819268534141886e-05, + "loss": 5.049, + "step": 20477 + }, + { + "epoch": 0.12178846702826149, + "grad_norm": 1.2598546743392944, + "learning_rate": 4.819251096569278e-05, + "loss": 5.2906, + "step": 20478 + }, + { + "epoch": 0.12179441431154249, + "grad_norm": 1.2702369689941406, + "learning_rate": 4.8192336581870436e-05, + "loss": 5.1828, + "step": 20479 + }, + { + "epoch": 0.12180036159482348, + "grad_norm": 1.3816938400268555, + "learning_rate": 4.819216218995189e-05, + "loss": 5.1083, + "step": 20480 + }, + { + "epoch": 0.12180630887810448, + "grad_norm": 1.2958251237869263, + "learning_rate": 4.819198778993719e-05, + "loss": 5.1715, + "step": 20481 + }, + { + "epoch": 0.12181225616138548, + "grad_norm": 1.2317209243774414, + "learning_rate": 4.819181338182641e-05, + "loss": 5.1969, + "step": 20482 + }, + { + "epoch": 0.12181820344466647, + "grad_norm": 1.362483263015747, + "learning_rate": 4.819163896561961e-05, + "loss": 5.0893, + "step": 20483 + }, + { + "epoch": 0.12182415072794747, + "grad_norm": 1.1019991636276245, + "learning_rate": 4.819146454131685e-05, + "loss": 5.411, + "step": 20484 + }, + { + "epoch": 0.12183009801122847, + "grad_norm": 1.3575057983398438, + "learning_rate": 4.8191290108918184e-05, + "loss": 5.1797, + "step": 20485 + }, + { + "epoch": 0.12183604529450946, + "grad_norm": 1.4110307693481445, + "learning_rate": 4.8191115668423685e-05, + "loss": 5.3108, + "step": 20486 + }, + { + "epoch": 0.12184199257779046, + "grad_norm": 1.3322244882583618, + "learning_rate": 4.819094121983341e-05, + "loss": 5.238, + "step": 20487 + }, + { + "epoch": 0.12184793986107147, + "grad_norm": 1.3466796875, + "learning_rate": 4.819076676314741e-05, + "loss": 5.2786, + "step": 20488 + }, + { + "epoch": 0.12185388714435245, + "grad_norm": 1.4118572473526, + "learning_rate": 4.819059229836575e-05, + "loss": 5.0254, + "step": 20489 + }, + { + "epoch": 0.12185983442763346, + "grad_norm": 1.6264641284942627, + "learning_rate": 4.81904178254885e-05, + "loss": 4.8822, + "step": 20490 + }, + { + "epoch": 0.12186578171091446, + "grad_norm": 1.325591802597046, + "learning_rate": 4.8190243344515705e-05, + "loss": 5.5997, + "step": 20491 + }, + { + "epoch": 0.12187172899419545, + "grad_norm": 1.5424168109893799, + "learning_rate": 4.8190068855447444e-05, + "loss": 5.2096, + "step": 20492 + }, + { + "epoch": 0.12187767627747645, + "grad_norm": 1.3096263408660889, + "learning_rate": 4.818989435828377e-05, + "loss": 5.1026, + "step": 20493 + }, + { + "epoch": 0.12188362356075745, + "grad_norm": 1.3479657173156738, + "learning_rate": 4.8189719853024746e-05, + "loss": 5.0403, + "step": 20494 + }, + { + "epoch": 0.12188957084403844, + "grad_norm": 1.1970547437667847, + "learning_rate": 4.818954533967043e-05, + "loss": 5.06, + "step": 20495 + }, + { + "epoch": 0.12189551812731944, + "grad_norm": 1.3364722728729248, + "learning_rate": 4.818937081822088e-05, + "loss": 5.0216, + "step": 20496 + }, + { + "epoch": 0.12190146541060044, + "grad_norm": 1.2553714513778687, + "learning_rate": 4.818919628867615e-05, + "loss": 4.9662, + "step": 20497 + }, + { + "epoch": 0.12190741269388143, + "grad_norm": 1.270330786705017, + "learning_rate": 4.818902175103633e-05, + "loss": 4.8526, + "step": 20498 + }, + { + "epoch": 0.12191335997716243, + "grad_norm": 1.4872468709945679, + "learning_rate": 4.818884720530145e-05, + "loss": 4.9435, + "step": 20499 + }, + { + "epoch": 0.12191930726044344, + "grad_norm": 1.3152670860290527, + "learning_rate": 4.818867265147159e-05, + "loss": 5.1301, + "step": 20500 + }, + { + "epoch": 0.12192525454372442, + "grad_norm": 1.210864543914795, + "learning_rate": 4.8188498089546794e-05, + "loss": 5.1465, + "step": 20501 + }, + { + "epoch": 0.12193120182700543, + "grad_norm": 1.276159644126892, + "learning_rate": 4.818832351952715e-05, + "loss": 5.0847, + "step": 20502 + }, + { + "epoch": 0.12193714911028643, + "grad_norm": 1.449988842010498, + "learning_rate": 4.8188148941412684e-05, + "loss": 5.1143, + "step": 20503 + }, + { + "epoch": 0.12194309639356742, + "grad_norm": 1.241921305656433, + "learning_rate": 4.818797435520348e-05, + "loss": 5.067, + "step": 20504 + }, + { + "epoch": 0.12194904367684842, + "grad_norm": 1.3087794780731201, + "learning_rate": 4.81877997608996e-05, + "loss": 5.121, + "step": 20505 + }, + { + "epoch": 0.12195499096012942, + "grad_norm": 1.2226066589355469, + "learning_rate": 4.8187625158501095e-05, + "loss": 5.1879, + "step": 20506 + }, + { + "epoch": 0.12196093824341041, + "grad_norm": 1.2744648456573486, + "learning_rate": 4.8187450548008025e-05, + "loss": 5.1308, + "step": 20507 + }, + { + "epoch": 0.12196688552669141, + "grad_norm": 1.3409245014190674, + "learning_rate": 4.8187275929420464e-05, + "loss": 5.0914, + "step": 20508 + }, + { + "epoch": 0.1219728328099724, + "grad_norm": 1.2840641736984253, + "learning_rate": 4.818710130273846e-05, + "loss": 5.0818, + "step": 20509 + }, + { + "epoch": 0.1219787800932534, + "grad_norm": 1.4204998016357422, + "learning_rate": 4.818692666796207e-05, + "loss": 5.4553, + "step": 20510 + }, + { + "epoch": 0.1219847273765344, + "grad_norm": 1.3061211109161377, + "learning_rate": 4.818675202509137e-05, + "loss": 5.1777, + "step": 20511 + }, + { + "epoch": 0.12199067465981539, + "grad_norm": 1.3137598037719727, + "learning_rate": 4.818657737412642e-05, + "loss": 5.1156, + "step": 20512 + }, + { + "epoch": 0.12199662194309639, + "grad_norm": 1.1616209745407104, + "learning_rate": 4.818640271506727e-05, + "loss": 5.3169, + "step": 20513 + }, + { + "epoch": 0.1220025692263774, + "grad_norm": 1.270844578742981, + "learning_rate": 4.8186228047914e-05, + "loss": 5.3005, + "step": 20514 + }, + { + "epoch": 0.12200851650965838, + "grad_norm": 1.4955285787582397, + "learning_rate": 4.818605337266664e-05, + "loss": 5.1762, + "step": 20515 + }, + { + "epoch": 0.12201446379293938, + "grad_norm": 1.3431698083877563, + "learning_rate": 4.818587868932527e-05, + "loss": 4.9477, + "step": 20516 + }, + { + "epoch": 0.12202041107622039, + "grad_norm": 1.3437286615371704, + "learning_rate": 4.818570399788995e-05, + "loss": 4.7787, + "step": 20517 + }, + { + "epoch": 0.12202635835950137, + "grad_norm": 1.3840901851654053, + "learning_rate": 4.818552929836074e-05, + "loss": 5.0749, + "step": 20518 + }, + { + "epoch": 0.12203230564278238, + "grad_norm": 1.3907465934753418, + "learning_rate": 4.8185354590737707e-05, + "loss": 4.9084, + "step": 20519 + }, + { + "epoch": 0.12203825292606338, + "grad_norm": 1.360065221786499, + "learning_rate": 4.818517987502091e-05, + "loss": 4.9323, + "step": 20520 + }, + { + "epoch": 0.12204420020934437, + "grad_norm": 1.1924186944961548, + "learning_rate": 4.818500515121039e-05, + "loss": 4.8237, + "step": 20521 + }, + { + "epoch": 0.12205014749262537, + "grad_norm": 1.6362069845199585, + "learning_rate": 4.818483041930624e-05, + "loss": 4.6073, + "step": 20522 + }, + { + "epoch": 0.12205609477590637, + "grad_norm": 1.4413504600524902, + "learning_rate": 4.81846556793085e-05, + "loss": 4.7733, + "step": 20523 + }, + { + "epoch": 0.12206204205918736, + "grad_norm": 1.5076016187667847, + "learning_rate": 4.818448093121723e-05, + "loss": 5.4376, + "step": 20524 + }, + { + "epoch": 0.12206798934246836, + "grad_norm": 1.5311039686203003, + "learning_rate": 4.818430617503251e-05, + "loss": 5.1398, + "step": 20525 + }, + { + "epoch": 0.12207393662574936, + "grad_norm": 1.4373403787612915, + "learning_rate": 4.818413141075438e-05, + "loss": 4.897, + "step": 20526 + }, + { + "epoch": 0.12207988390903035, + "grad_norm": 1.4221818447113037, + "learning_rate": 4.818395663838291e-05, + "loss": 5.223, + "step": 20527 + }, + { + "epoch": 0.12208583119231135, + "grad_norm": 1.2606967687606812, + "learning_rate": 4.818378185791817e-05, + "loss": 4.7242, + "step": 20528 + }, + { + "epoch": 0.12209177847559236, + "grad_norm": 1.2508289813995361, + "learning_rate": 4.818360706936019e-05, + "loss": 4.623, + "step": 20529 + }, + { + "epoch": 0.12209772575887334, + "grad_norm": 1.3701050281524658, + "learning_rate": 4.8183432272709065e-05, + "loss": 4.6716, + "step": 20530 + }, + { + "epoch": 0.12210367304215435, + "grad_norm": 1.5785399675369263, + "learning_rate": 4.818325746796485e-05, + "loss": 4.5495, + "step": 20531 + }, + { + "epoch": 0.12210962032543535, + "grad_norm": 1.4542807340621948, + "learning_rate": 4.8183082655127584e-05, + "loss": 4.6848, + "step": 20532 + }, + { + "epoch": 0.12211556760871634, + "grad_norm": 1.2740551233291626, + "learning_rate": 4.818290783419736e-05, + "loss": 4.7792, + "step": 20533 + }, + { + "epoch": 0.12212151489199734, + "grad_norm": 1.2965741157531738, + "learning_rate": 4.8182733005174205e-05, + "loss": 4.7552, + "step": 20534 + }, + { + "epoch": 0.12212746217527834, + "grad_norm": 1.3440501689910889, + "learning_rate": 4.8182558168058215e-05, + "loss": 5.0506, + "step": 20535 + }, + { + "epoch": 0.12213340945855933, + "grad_norm": 1.3767000436782837, + "learning_rate": 4.8182383322849415e-05, + "loss": 5.0523, + "step": 20536 + }, + { + "epoch": 0.12213935674184033, + "grad_norm": 1.4770883321762085, + "learning_rate": 4.81822084695479e-05, + "loss": 5.117, + "step": 20537 + }, + { + "epoch": 0.12214530402512132, + "grad_norm": 1.4463403224945068, + "learning_rate": 4.818203360815371e-05, + "loss": 5.0566, + "step": 20538 + }, + { + "epoch": 0.12215125130840232, + "grad_norm": 1.5590862035751343, + "learning_rate": 4.8181858738666905e-05, + "loss": 5.1184, + "step": 20539 + }, + { + "epoch": 0.12215719859168332, + "grad_norm": 1.2578922510147095, + "learning_rate": 4.818168386108756e-05, + "loss": 5.0364, + "step": 20540 + }, + { + "epoch": 0.12216314587496431, + "grad_norm": 1.363750696182251, + "learning_rate": 4.8181508975415727e-05, + "loss": 5.1133, + "step": 20541 + }, + { + "epoch": 0.12216909315824531, + "grad_norm": 1.5973013639450073, + "learning_rate": 4.8181334081651474e-05, + "loss": 4.9659, + "step": 20542 + }, + { + "epoch": 0.12217504044152631, + "grad_norm": 1.4429646730422974, + "learning_rate": 4.818115917979485e-05, + "loss": 5.1669, + "step": 20543 + }, + { + "epoch": 0.1221809877248073, + "grad_norm": 1.4704759120941162, + "learning_rate": 4.818098426984592e-05, + "loss": 5.1613, + "step": 20544 + }, + { + "epoch": 0.1221869350080883, + "grad_norm": 1.3613824844360352, + "learning_rate": 4.8180809351804756e-05, + "loss": 5.2524, + "step": 20545 + }, + { + "epoch": 0.1221928822913693, + "grad_norm": 1.199265480041504, + "learning_rate": 4.8180634425671404e-05, + "loss": 5.1596, + "step": 20546 + }, + { + "epoch": 0.1221988295746503, + "grad_norm": 1.3537240028381348, + "learning_rate": 4.818045949144594e-05, + "loss": 5.1456, + "step": 20547 + }, + { + "epoch": 0.1222047768579313, + "grad_norm": 1.4804584980010986, + "learning_rate": 4.818028454912841e-05, + "loss": 5.0443, + "step": 20548 + }, + { + "epoch": 0.1222107241412123, + "grad_norm": 1.3245832920074463, + "learning_rate": 4.8180109598718884e-05, + "loss": 4.9495, + "step": 20549 + }, + { + "epoch": 0.12221667142449329, + "grad_norm": 1.5168079137802124, + "learning_rate": 4.817993464021742e-05, + "loss": 4.8094, + "step": 20550 + }, + { + "epoch": 0.12222261870777429, + "grad_norm": 1.4146143198013306, + "learning_rate": 4.817975967362408e-05, + "loss": 5.0319, + "step": 20551 + }, + { + "epoch": 0.12222856599105529, + "grad_norm": 1.30800199508667, + "learning_rate": 4.817958469893893e-05, + "loss": 4.6641, + "step": 20552 + }, + { + "epoch": 0.12223451327433628, + "grad_norm": 1.1652897596359253, + "learning_rate": 4.8179409716162026e-05, + "loss": 4.8978, + "step": 20553 + }, + { + "epoch": 0.12224046055761728, + "grad_norm": 1.4594627618789673, + "learning_rate": 4.817923472529343e-05, + "loss": 5.0124, + "step": 20554 + }, + { + "epoch": 0.12224640784089828, + "grad_norm": 1.2955336570739746, + "learning_rate": 4.81790597263332e-05, + "loss": 5.0336, + "step": 20555 + }, + { + "epoch": 0.12225235512417927, + "grad_norm": 1.3508485555648804, + "learning_rate": 4.8178884719281395e-05, + "loss": 4.8695, + "step": 20556 + }, + { + "epoch": 0.12225830240746027, + "grad_norm": 1.363410472869873, + "learning_rate": 4.8178709704138094e-05, + "loss": 4.9162, + "step": 20557 + }, + { + "epoch": 0.12226424969074128, + "grad_norm": 1.4330451488494873, + "learning_rate": 4.817853468090333e-05, + "loss": 4.8993, + "step": 20558 + }, + { + "epoch": 0.12227019697402226, + "grad_norm": 1.3630226850509644, + "learning_rate": 4.817835964957719e-05, + "loss": 4.9196, + "step": 20559 + }, + { + "epoch": 0.12227614425730327, + "grad_norm": 1.4265079498291016, + "learning_rate": 4.817818461015972e-05, + "loss": 4.8966, + "step": 20560 + }, + { + "epoch": 0.12228209154058427, + "grad_norm": 1.4709514379501343, + "learning_rate": 4.817800956265098e-05, + "loss": 4.7685, + "step": 20561 + }, + { + "epoch": 0.12228803882386526, + "grad_norm": 1.1047412157058716, + "learning_rate": 4.8177834507051044e-05, + "loss": 4.8495, + "step": 20562 + }, + { + "epoch": 0.12229398610714626, + "grad_norm": 1.302027940750122, + "learning_rate": 4.817765944335996e-05, + "loss": 4.9414, + "step": 20563 + }, + { + "epoch": 0.12229993339042726, + "grad_norm": 1.2321425676345825, + "learning_rate": 4.8177484371577796e-05, + "loss": 4.8089, + "step": 20564 + }, + { + "epoch": 0.12230588067370825, + "grad_norm": 1.5107663869857788, + "learning_rate": 4.8177309291704616e-05, + "loss": 4.8964, + "step": 20565 + }, + { + "epoch": 0.12231182795698925, + "grad_norm": 1.4476573467254639, + "learning_rate": 4.817713420374047e-05, + "loss": 5.1385, + "step": 20566 + }, + { + "epoch": 0.12231777524027024, + "grad_norm": 1.7367160320281982, + "learning_rate": 4.817695910768544e-05, + "loss": 4.7051, + "step": 20567 + }, + { + "epoch": 0.12232372252355124, + "grad_norm": 1.7436206340789795, + "learning_rate": 4.817678400353955e-05, + "loss": 5.0161, + "step": 20568 + }, + { + "epoch": 0.12232966980683224, + "grad_norm": 1.667702317237854, + "learning_rate": 4.8176608891302905e-05, + "loss": 4.7507, + "step": 20569 + }, + { + "epoch": 0.12233561709011323, + "grad_norm": 1.3754125833511353, + "learning_rate": 4.817643377097554e-05, + "loss": 4.9623, + "step": 20570 + }, + { + "epoch": 0.12234156437339423, + "grad_norm": 1.539730191230774, + "learning_rate": 4.817625864255751e-05, + "loss": 4.9798, + "step": 20571 + }, + { + "epoch": 0.12234751165667523, + "grad_norm": 1.2995619773864746, + "learning_rate": 4.81760835060489e-05, + "loss": 4.9225, + "step": 20572 + }, + { + "epoch": 0.12235345893995622, + "grad_norm": 1.4950238466262817, + "learning_rate": 4.817590836144975e-05, + "loss": 5.0578, + "step": 20573 + }, + { + "epoch": 0.12235940622323722, + "grad_norm": 1.5506999492645264, + "learning_rate": 4.8175733208760144e-05, + "loss": 4.7418, + "step": 20574 + }, + { + "epoch": 0.12236535350651823, + "grad_norm": 2.153271198272705, + "learning_rate": 4.817555804798012e-05, + "loss": 4.8025, + "step": 20575 + }, + { + "epoch": 0.12237130078979921, + "grad_norm": 1.4991137981414795, + "learning_rate": 4.817538287910974e-05, + "loss": 4.9943, + "step": 20576 + }, + { + "epoch": 0.12237724807308022, + "grad_norm": 1.3596469163894653, + "learning_rate": 4.8175207702149085e-05, + "loss": 5.4109, + "step": 20577 + }, + { + "epoch": 0.12238319535636122, + "grad_norm": 1.182950735092163, + "learning_rate": 4.81750325170982e-05, + "loss": 5.4844, + "step": 20578 + }, + { + "epoch": 0.1223891426396422, + "grad_norm": 1.2713780403137207, + "learning_rate": 4.817485732395715e-05, + "loss": 5.3333, + "step": 20579 + }, + { + "epoch": 0.12239508992292321, + "grad_norm": 1.396163821220398, + "learning_rate": 4.8174682122726e-05, + "loss": 5.1666, + "step": 20580 + }, + { + "epoch": 0.12240103720620421, + "grad_norm": 1.3530118465423584, + "learning_rate": 4.81745069134048e-05, + "loss": 5.055, + "step": 20581 + }, + { + "epoch": 0.1224069844894852, + "grad_norm": 1.1625109910964966, + "learning_rate": 4.8174331695993626e-05, + "loss": 5.2553, + "step": 20582 + }, + { + "epoch": 0.1224129317727662, + "grad_norm": 1.4428709745407104, + "learning_rate": 4.817415647049253e-05, + "loss": 5.1255, + "step": 20583 + }, + { + "epoch": 0.1224188790560472, + "grad_norm": 1.674591064453125, + "learning_rate": 4.8173981236901574e-05, + "loss": 4.7623, + "step": 20584 + }, + { + "epoch": 0.12242482633932819, + "grad_norm": 1.4691076278686523, + "learning_rate": 4.817380599522083e-05, + "loss": 5.1077, + "step": 20585 + }, + { + "epoch": 0.12243077362260919, + "grad_norm": 1.0224462747573853, + "learning_rate": 4.817363074545034e-05, + "loss": 5.1022, + "step": 20586 + }, + { + "epoch": 0.1224367209058902, + "grad_norm": 1.3090193271636963, + "learning_rate": 4.817345548759018e-05, + "loss": 5.121, + "step": 20587 + }, + { + "epoch": 0.12244266818917118, + "grad_norm": 1.028120756149292, + "learning_rate": 4.81732802216404e-05, + "loss": 5.2709, + "step": 20588 + }, + { + "epoch": 0.12244861547245219, + "grad_norm": 1.3667192459106445, + "learning_rate": 4.817310494760107e-05, + "loss": 5.075, + "step": 20589 + }, + { + "epoch": 0.12245456275573319, + "grad_norm": 1.3145662546157837, + "learning_rate": 4.8172929665472255e-05, + "loss": 5.1258, + "step": 20590 + }, + { + "epoch": 0.12246051003901418, + "grad_norm": 1.2744371891021729, + "learning_rate": 4.8172754375254e-05, + "loss": 5.0155, + "step": 20591 + }, + { + "epoch": 0.12246645732229518, + "grad_norm": 1.4647456407546997, + "learning_rate": 4.817257907694638e-05, + "loss": 5.0325, + "step": 20592 + }, + { + "epoch": 0.12247240460557618, + "grad_norm": 1.1393789052963257, + "learning_rate": 4.817240377054945e-05, + "loss": 5.1304, + "step": 20593 + }, + { + "epoch": 0.12247835188885717, + "grad_norm": 1.3927806615829468, + "learning_rate": 4.817222845606328e-05, + "loss": 5.0588, + "step": 20594 + }, + { + "epoch": 0.12248429917213817, + "grad_norm": 1.3344571590423584, + "learning_rate": 4.817205313348792e-05, + "loss": 5.0428, + "step": 20595 + }, + { + "epoch": 0.12249024645541916, + "grad_norm": 0.9816542267799377, + "learning_rate": 4.817187780282343e-05, + "loss": 5.0046, + "step": 20596 + }, + { + "epoch": 0.12249619373870016, + "grad_norm": 1.1602904796600342, + "learning_rate": 4.817170246406989e-05, + "loss": 5.0372, + "step": 20597 + }, + { + "epoch": 0.12250214102198116, + "grad_norm": 1.2147279977798462, + "learning_rate": 4.817152711722733e-05, + "loss": 4.999, + "step": 20598 + }, + { + "epoch": 0.12250808830526215, + "grad_norm": 1.3654884099960327, + "learning_rate": 4.817135176229585e-05, + "loss": 5.0635, + "step": 20599 + }, + { + "epoch": 0.12251403558854315, + "grad_norm": 1.3051310777664185, + "learning_rate": 4.817117639927547e-05, + "loss": 5.0137, + "step": 20600 + }, + { + "epoch": 0.12251998287182415, + "grad_norm": 1.2217040061950684, + "learning_rate": 4.8171001028166284e-05, + "loss": 4.7167, + "step": 20601 + }, + { + "epoch": 0.12252593015510514, + "grad_norm": 1.3541781902313232, + "learning_rate": 4.8170825648968345e-05, + "loss": 4.9244, + "step": 20602 + }, + { + "epoch": 0.12253187743838614, + "grad_norm": 1.2899030447006226, + "learning_rate": 4.81706502616817e-05, + "loss": 5.0452, + "step": 20603 + }, + { + "epoch": 0.12253782472166715, + "grad_norm": 1.4059736728668213, + "learning_rate": 4.817047486630643e-05, + "loss": 4.9318, + "step": 20604 + }, + { + "epoch": 0.12254377200494813, + "grad_norm": 1.6990517377853394, + "learning_rate": 4.817029946284257e-05, + "loss": 4.5067, + "step": 20605 + }, + { + "epoch": 0.12254971928822914, + "grad_norm": 1.4028486013412476, + "learning_rate": 4.817012405129021e-05, + "loss": 5.0994, + "step": 20606 + }, + { + "epoch": 0.12255566657151014, + "grad_norm": 1.5692994594573975, + "learning_rate": 4.8169948631649395e-05, + "loss": 4.742, + "step": 20607 + }, + { + "epoch": 0.12256161385479113, + "grad_norm": 1.4501662254333496, + "learning_rate": 4.81697732039202e-05, + "loss": 4.9951, + "step": 20608 + }, + { + "epoch": 0.12256756113807213, + "grad_norm": 1.2898585796356201, + "learning_rate": 4.816959776810267e-05, + "loss": 5.2756, + "step": 20609 + }, + { + "epoch": 0.12257350842135313, + "grad_norm": 1.2808797359466553, + "learning_rate": 4.8169422324196867e-05, + "loss": 5.043, + "step": 20610 + }, + { + "epoch": 0.12257945570463412, + "grad_norm": 1.6888319253921509, + "learning_rate": 4.816924687220287e-05, + "loss": 4.6803, + "step": 20611 + }, + { + "epoch": 0.12258540298791512, + "grad_norm": 1.6619288921356201, + "learning_rate": 4.8169071412120716e-05, + "loss": 4.7334, + "step": 20612 + }, + { + "epoch": 0.12259135027119612, + "grad_norm": 1.4474331140518188, + "learning_rate": 4.816889594395049e-05, + "loss": 4.8519, + "step": 20613 + }, + { + "epoch": 0.12259729755447711, + "grad_norm": 1.519037127494812, + "learning_rate": 4.816872046769223e-05, + "loss": 4.7864, + "step": 20614 + }, + { + "epoch": 0.12260324483775811, + "grad_norm": 1.4860186576843262, + "learning_rate": 4.816854498334602e-05, + "loss": 4.7542, + "step": 20615 + }, + { + "epoch": 0.12260919212103912, + "grad_norm": 1.3120838403701782, + "learning_rate": 4.81683694909119e-05, + "loss": 4.6539, + "step": 20616 + }, + { + "epoch": 0.1226151394043201, + "grad_norm": 1.4509785175323486, + "learning_rate": 4.816819399038995e-05, + "loss": 5.105, + "step": 20617 + }, + { + "epoch": 0.1226210866876011, + "grad_norm": 1.428066372871399, + "learning_rate": 4.816801848178022e-05, + "loss": 5.1138, + "step": 20618 + }, + { + "epoch": 0.12262703397088211, + "grad_norm": 1.3920371532440186, + "learning_rate": 4.816784296508277e-05, + "loss": 5.0398, + "step": 20619 + }, + { + "epoch": 0.1226329812541631, + "grad_norm": 1.258225679397583, + "learning_rate": 4.816766744029767e-05, + "loss": 4.7204, + "step": 20620 + }, + { + "epoch": 0.1226389285374441, + "grad_norm": 1.4209269285202026, + "learning_rate": 4.816749190742498e-05, + "loss": 4.6532, + "step": 20621 + }, + { + "epoch": 0.1226448758207251, + "grad_norm": 1.6276925802230835, + "learning_rate": 4.816731636646475e-05, + "loss": 4.7025, + "step": 20622 + }, + { + "epoch": 0.12265082310400609, + "grad_norm": 1.3714722394943237, + "learning_rate": 4.8167140817417055e-05, + "loss": 5.1781, + "step": 20623 + }, + { + "epoch": 0.12265677038728709, + "grad_norm": 1.397017240524292, + "learning_rate": 4.816696526028195e-05, + "loss": 5.2097, + "step": 20624 + }, + { + "epoch": 0.12266271767056808, + "grad_norm": 1.2807291746139526, + "learning_rate": 4.8166789695059486e-05, + "loss": 5.1588, + "step": 20625 + }, + { + "epoch": 0.12266866495384908, + "grad_norm": 1.301222562789917, + "learning_rate": 4.816661412174976e-05, + "loss": 5.0906, + "step": 20626 + }, + { + "epoch": 0.12267461223713008, + "grad_norm": 1.6813510656356812, + "learning_rate": 4.816643854035279e-05, + "loss": 4.4956, + "step": 20627 + }, + { + "epoch": 0.12268055952041107, + "grad_norm": 1.7415688037872314, + "learning_rate": 4.816626295086865e-05, + "loss": 4.4246, + "step": 20628 + }, + { + "epoch": 0.12268650680369207, + "grad_norm": 1.9389246702194214, + "learning_rate": 4.816608735329742e-05, + "loss": 4.4231, + "step": 20629 + }, + { + "epoch": 0.12269245408697307, + "grad_norm": 1.7021642923355103, + "learning_rate": 4.816591174763914e-05, + "loss": 4.5314, + "step": 20630 + }, + { + "epoch": 0.12269840137025406, + "grad_norm": 1.889491081237793, + "learning_rate": 4.8165736133893876e-05, + "loss": 4.384, + "step": 20631 + }, + { + "epoch": 0.12270434865353506, + "grad_norm": 1.8447821140289307, + "learning_rate": 4.816556051206171e-05, + "loss": 4.5086, + "step": 20632 + }, + { + "epoch": 0.12271029593681607, + "grad_norm": 1.7669256925582886, + "learning_rate": 4.8165384882142674e-05, + "loss": 4.4537, + "step": 20633 + }, + { + "epoch": 0.12271624322009705, + "grad_norm": 1.8175028562545776, + "learning_rate": 4.8165209244136846e-05, + "loss": 4.4478, + "step": 20634 + }, + { + "epoch": 0.12272219050337806, + "grad_norm": 1.7047181129455566, + "learning_rate": 4.816503359804427e-05, + "loss": 4.7366, + "step": 20635 + }, + { + "epoch": 0.12272813778665906, + "grad_norm": 1.4321893453598022, + "learning_rate": 4.816485794386504e-05, + "loss": 4.9958, + "step": 20636 + }, + { + "epoch": 0.12273408506994005, + "grad_norm": 1.3354036808013916, + "learning_rate": 4.816468228159918e-05, + "loss": 4.906, + "step": 20637 + }, + { + "epoch": 0.12274003235322105, + "grad_norm": 1.281680703163147, + "learning_rate": 4.8164506611246784e-05, + "loss": 4.884, + "step": 20638 + }, + { + "epoch": 0.12274597963650205, + "grad_norm": 1.32127046585083, + "learning_rate": 4.8164330932807885e-05, + "loss": 4.8039, + "step": 20639 + }, + { + "epoch": 0.12275192691978304, + "grad_norm": 1.2233742475509644, + "learning_rate": 4.816415524628257e-05, + "loss": 4.8872, + "step": 20640 + }, + { + "epoch": 0.12275787420306404, + "grad_norm": 1.4896177053451538, + "learning_rate": 4.816397955167088e-05, + "loss": 5.0379, + "step": 20641 + }, + { + "epoch": 0.12276382148634504, + "grad_norm": 1.389992594718933, + "learning_rate": 4.8163803848972886e-05, + "loss": 5.1364, + "step": 20642 + }, + { + "epoch": 0.12276976876962603, + "grad_norm": 1.4248872995376587, + "learning_rate": 4.8163628138188645e-05, + "loss": 5.3152, + "step": 20643 + }, + { + "epoch": 0.12277571605290703, + "grad_norm": 1.3105376958847046, + "learning_rate": 4.816345241931822e-05, + "loss": 4.9878, + "step": 20644 + }, + { + "epoch": 0.12278166333618803, + "grad_norm": 1.3307970762252808, + "learning_rate": 4.816327669236167e-05, + "loss": 4.9105, + "step": 20645 + }, + { + "epoch": 0.12278761061946902, + "grad_norm": 1.9464685916900635, + "learning_rate": 4.816310095731907e-05, + "loss": 5.2259, + "step": 20646 + }, + { + "epoch": 0.12279355790275003, + "grad_norm": 1.4600616693496704, + "learning_rate": 4.816292521419046e-05, + "loss": 4.7044, + "step": 20647 + }, + { + "epoch": 0.12279950518603103, + "grad_norm": 1.202574610710144, + "learning_rate": 4.816274946297592e-05, + "loss": 5.1854, + "step": 20648 + }, + { + "epoch": 0.12280545246931202, + "grad_norm": 1.5569230318069458, + "learning_rate": 4.81625737036755e-05, + "loss": 4.8316, + "step": 20649 + }, + { + "epoch": 0.12281139975259302, + "grad_norm": 1.3303078413009644, + "learning_rate": 4.8162397936289264e-05, + "loss": 4.891, + "step": 20650 + }, + { + "epoch": 0.12281734703587402, + "grad_norm": 1.2397204637527466, + "learning_rate": 4.816222216081728e-05, + "loss": 4.8077, + "step": 20651 + }, + { + "epoch": 0.12282329431915501, + "grad_norm": 1.29647696018219, + "learning_rate": 4.8162046377259594e-05, + "loss": 4.7518, + "step": 20652 + }, + { + "epoch": 0.12282924160243601, + "grad_norm": 1.4492244720458984, + "learning_rate": 4.816187058561629e-05, + "loss": 4.6352, + "step": 20653 + }, + { + "epoch": 0.122835188885717, + "grad_norm": 1.2785146236419678, + "learning_rate": 4.81616947858874e-05, + "loss": 4.9128, + "step": 20654 + }, + { + "epoch": 0.122841136168998, + "grad_norm": 1.2652465105056763, + "learning_rate": 4.8161518978073016e-05, + "loss": 5.1555, + "step": 20655 + }, + { + "epoch": 0.122847083452279, + "grad_norm": 1.5048694610595703, + "learning_rate": 4.816134316217318e-05, + "loss": 5.0648, + "step": 20656 + }, + { + "epoch": 0.12285303073555999, + "grad_norm": 1.3626654148101807, + "learning_rate": 4.816116733818795e-05, + "loss": 5.0668, + "step": 20657 + }, + { + "epoch": 0.12285897801884099, + "grad_norm": 1.614112377166748, + "learning_rate": 4.816099150611741e-05, + "loss": 4.9234, + "step": 20658 + }, + { + "epoch": 0.122864925302122, + "grad_norm": 1.9453253746032715, + "learning_rate": 4.81608156659616e-05, + "loss": 4.7709, + "step": 20659 + }, + { + "epoch": 0.12287087258540298, + "grad_norm": 1.7604261636734009, + "learning_rate": 4.816063981772059e-05, + "loss": 4.8153, + "step": 20660 + }, + { + "epoch": 0.12287681986868398, + "grad_norm": 1.473319172859192, + "learning_rate": 4.8160463961394436e-05, + "loss": 4.9552, + "step": 20661 + }, + { + "epoch": 0.12288276715196499, + "grad_norm": 1.332900881767273, + "learning_rate": 4.8160288096983207e-05, + "loss": 5.1753, + "step": 20662 + }, + { + "epoch": 0.12288871443524597, + "grad_norm": 1.438464641571045, + "learning_rate": 4.816011222448696e-05, + "loss": 5.0386, + "step": 20663 + }, + { + "epoch": 0.12289466171852698, + "grad_norm": 1.4369616508483887, + "learning_rate": 4.8159936343905756e-05, + "loss": 5.1144, + "step": 20664 + }, + { + "epoch": 0.12290060900180798, + "grad_norm": 1.307914137840271, + "learning_rate": 4.8159760455239656e-05, + "loss": 5.0308, + "step": 20665 + }, + { + "epoch": 0.12290655628508897, + "grad_norm": 1.4199682474136353, + "learning_rate": 4.815958455848872e-05, + "loss": 4.9803, + "step": 20666 + }, + { + "epoch": 0.12291250356836997, + "grad_norm": 1.2451025247573853, + "learning_rate": 4.815940865365303e-05, + "loss": 5.0328, + "step": 20667 + }, + { + "epoch": 0.12291845085165097, + "grad_norm": 1.2542675733566284, + "learning_rate": 4.8159232740732615e-05, + "loss": 5.0961, + "step": 20668 + }, + { + "epoch": 0.12292439813493196, + "grad_norm": 1.4102520942687988, + "learning_rate": 4.815905681972756e-05, + "loss": 5.1512, + "step": 20669 + }, + { + "epoch": 0.12293034541821296, + "grad_norm": 1.7003612518310547, + "learning_rate": 4.81588808906379e-05, + "loss": 5.6308, + "step": 20670 + }, + { + "epoch": 0.12293629270149396, + "grad_norm": 1.7957112789154053, + "learning_rate": 4.815870495346373e-05, + "loss": 5.2033, + "step": 20671 + }, + { + "epoch": 0.12294223998477495, + "grad_norm": 1.8667526245117188, + "learning_rate": 4.815852900820509e-05, + "loss": 5.3148, + "step": 20672 + }, + { + "epoch": 0.12294818726805595, + "grad_norm": 1.5151188373565674, + "learning_rate": 4.815835305486205e-05, + "loss": 5.1791, + "step": 20673 + }, + { + "epoch": 0.12295413455133695, + "grad_norm": 1.842624545097351, + "learning_rate": 4.8158177093434666e-05, + "loss": 4.7996, + "step": 20674 + }, + { + "epoch": 0.12296008183461794, + "grad_norm": 1.6197025775909424, + "learning_rate": 4.815800112392299e-05, + "loss": 4.9929, + "step": 20675 + }, + { + "epoch": 0.12296602911789895, + "grad_norm": 1.4609524011611938, + "learning_rate": 4.8157825146327113e-05, + "loss": 4.961, + "step": 20676 + }, + { + "epoch": 0.12297197640117995, + "grad_norm": 1.479789137840271, + "learning_rate": 4.8157649160647065e-05, + "loss": 5.3686, + "step": 20677 + }, + { + "epoch": 0.12297792368446094, + "grad_norm": 2.120084524154663, + "learning_rate": 4.815747316688293e-05, + "loss": 4.8741, + "step": 20678 + }, + { + "epoch": 0.12298387096774194, + "grad_norm": 1.2068350315093994, + "learning_rate": 4.815729716503476e-05, + "loss": 5.5907, + "step": 20679 + }, + { + "epoch": 0.12298981825102294, + "grad_norm": 1.9006667137145996, + "learning_rate": 4.815712115510261e-05, + "loss": 5.0154, + "step": 20680 + }, + { + "epoch": 0.12299576553430393, + "grad_norm": 1.7368868589401245, + "learning_rate": 4.815694513708656e-05, + "loss": 5.1994, + "step": 20681 + }, + { + "epoch": 0.12300171281758493, + "grad_norm": 1.8622910976409912, + "learning_rate": 4.815676911098665e-05, + "loss": 4.7889, + "step": 20682 + }, + { + "epoch": 0.12300766010086592, + "grad_norm": 1.7475686073303223, + "learning_rate": 4.815659307680295e-05, + "loss": 5.1067, + "step": 20683 + }, + { + "epoch": 0.12301360738414692, + "grad_norm": 1.7088334560394287, + "learning_rate": 4.815641703453553e-05, + "loss": 4.8665, + "step": 20684 + }, + { + "epoch": 0.12301955466742792, + "grad_norm": 1.4785330295562744, + "learning_rate": 4.815624098418444e-05, + "loss": 5.417, + "step": 20685 + }, + { + "epoch": 0.12302550195070891, + "grad_norm": 1.5346219539642334, + "learning_rate": 4.8156064925749745e-05, + "loss": 5.4747, + "step": 20686 + }, + { + "epoch": 0.12303144923398991, + "grad_norm": 1.7572461366653442, + "learning_rate": 4.815588885923151e-05, + "loss": 5.021, + "step": 20687 + }, + { + "epoch": 0.12303739651727091, + "grad_norm": 1.57370126247406, + "learning_rate": 4.815571278462979e-05, + "loss": 5.5248, + "step": 20688 + }, + { + "epoch": 0.1230433438005519, + "grad_norm": 1.7549457550048828, + "learning_rate": 4.815553670194465e-05, + "loss": 5.346, + "step": 20689 + }, + { + "epoch": 0.1230492910838329, + "grad_norm": 1.7188549041748047, + "learning_rate": 4.8155360611176156e-05, + "loss": 5.4671, + "step": 20690 + }, + { + "epoch": 0.1230552383671139, + "grad_norm": 2.358586311340332, + "learning_rate": 4.815518451232436e-05, + "loss": 4.4753, + "step": 20691 + }, + { + "epoch": 0.1230611856503949, + "grad_norm": 2.2453999519348145, + "learning_rate": 4.815500840538933e-05, + "loss": 4.5065, + "step": 20692 + }, + { + "epoch": 0.1230671329336759, + "grad_norm": 1.505689263343811, + "learning_rate": 4.8154832290371123e-05, + "loss": 5.2223, + "step": 20693 + }, + { + "epoch": 0.1230730802169569, + "grad_norm": 1.5649336576461792, + "learning_rate": 4.8154656167269804e-05, + "loss": 5.3686, + "step": 20694 + }, + { + "epoch": 0.12307902750023789, + "grad_norm": 1.8131600618362427, + "learning_rate": 4.815448003608544e-05, + "loss": 5.5532, + "step": 20695 + }, + { + "epoch": 0.12308497478351889, + "grad_norm": 1.7565428018569946, + "learning_rate": 4.815430389681808e-05, + "loss": 5.4619, + "step": 20696 + }, + { + "epoch": 0.12309092206679989, + "grad_norm": 1.708799958229065, + "learning_rate": 4.815412774946779e-05, + "loss": 5.5746, + "step": 20697 + }, + { + "epoch": 0.12309686935008088, + "grad_norm": 1.6220203638076782, + "learning_rate": 4.815395159403464e-05, + "loss": 5.1071, + "step": 20698 + }, + { + "epoch": 0.12310281663336188, + "grad_norm": 1.5516228675842285, + "learning_rate": 4.8153775430518676e-05, + "loss": 5.3921, + "step": 20699 + }, + { + "epoch": 0.12310876391664288, + "grad_norm": 1.7192966938018799, + "learning_rate": 4.815359925891998e-05, + "loss": 5.2339, + "step": 20700 + }, + { + "epoch": 0.12311471119992387, + "grad_norm": 1.3066575527191162, + "learning_rate": 4.815342307923859e-05, + "loss": 4.998, + "step": 20701 + }, + { + "epoch": 0.12312065848320487, + "grad_norm": 1.49882173538208, + "learning_rate": 4.815324689147459e-05, + "loss": 5.0493, + "step": 20702 + }, + { + "epoch": 0.12312660576648587, + "grad_norm": 1.5100362300872803, + "learning_rate": 4.815307069562802e-05, + "loss": 5.7113, + "step": 20703 + }, + { + "epoch": 0.12313255304976686, + "grad_norm": 1.7987116575241089, + "learning_rate": 4.815289449169896e-05, + "loss": 4.3582, + "step": 20704 + }, + { + "epoch": 0.12313850033304787, + "grad_norm": 1.7036083936691284, + "learning_rate": 4.815271827968746e-05, + "loss": 5.0769, + "step": 20705 + }, + { + "epoch": 0.12314444761632887, + "grad_norm": 1.8392287492752075, + "learning_rate": 4.8152542059593584e-05, + "loss": 4.6458, + "step": 20706 + }, + { + "epoch": 0.12315039489960986, + "grad_norm": 1.7489079236984253, + "learning_rate": 4.81523658314174e-05, + "loss": 4.9117, + "step": 20707 + }, + { + "epoch": 0.12315634218289086, + "grad_norm": 2.2490482330322266, + "learning_rate": 4.8152189595158965e-05, + "loss": 5.2912, + "step": 20708 + }, + { + "epoch": 0.12316228946617186, + "grad_norm": 1.6101025342941284, + "learning_rate": 4.815201335081834e-05, + "loss": 4.9382, + "step": 20709 + }, + { + "epoch": 0.12316823674945285, + "grad_norm": 1.7892024517059326, + "learning_rate": 4.815183709839558e-05, + "loss": 5.0046, + "step": 20710 + }, + { + "epoch": 0.12317418403273385, + "grad_norm": 1.5614895820617676, + "learning_rate": 4.815166083789076e-05, + "loss": 5.5325, + "step": 20711 + }, + { + "epoch": 0.12318013131601484, + "grad_norm": 1.4775935411453247, + "learning_rate": 4.815148456930392e-05, + "loss": 5.0981, + "step": 20712 + }, + { + "epoch": 0.12318607859929584, + "grad_norm": 1.3652704954147339, + "learning_rate": 4.815130829263515e-05, + "loss": 4.9632, + "step": 20713 + }, + { + "epoch": 0.12319202588257684, + "grad_norm": 1.7767298221588135, + "learning_rate": 4.815113200788449e-05, + "loss": 4.5071, + "step": 20714 + }, + { + "epoch": 0.12319797316585783, + "grad_norm": 1.8673535585403442, + "learning_rate": 4.815095571505202e-05, + "loss": 4.3313, + "step": 20715 + }, + { + "epoch": 0.12320392044913883, + "grad_norm": 1.6682900190353394, + "learning_rate": 4.8150779414137775e-05, + "loss": 5.2341, + "step": 20716 + }, + { + "epoch": 0.12320986773241983, + "grad_norm": 1.6456630229949951, + "learning_rate": 4.815060310514184e-05, + "loss": 5.3823, + "step": 20717 + }, + { + "epoch": 0.12321581501570082, + "grad_norm": 1.9971877336502075, + "learning_rate": 4.8150426788064265e-05, + "loss": 5.1093, + "step": 20718 + }, + { + "epoch": 0.12322176229898182, + "grad_norm": 1.6881333589553833, + "learning_rate": 4.815025046290512e-05, + "loss": 5.1788, + "step": 20719 + }, + { + "epoch": 0.12322770958226283, + "grad_norm": 1.6873126029968262, + "learning_rate": 4.815007412966446e-05, + "loss": 5.4508, + "step": 20720 + }, + { + "epoch": 0.12323365686554381, + "grad_norm": 1.5401923656463623, + "learning_rate": 4.814989778834235e-05, + "loss": 5.3638, + "step": 20721 + }, + { + "epoch": 0.12323960414882482, + "grad_norm": 1.3972458839416504, + "learning_rate": 4.814972143893885e-05, + "loss": 5.3096, + "step": 20722 + }, + { + "epoch": 0.12324555143210582, + "grad_norm": 1.7662227153778076, + "learning_rate": 4.8149545081454015e-05, + "loss": 5.7959, + "step": 20723 + }, + { + "epoch": 0.1232514987153868, + "grad_norm": 1.5072314739227295, + "learning_rate": 4.814936871588792e-05, + "loss": 5.6857, + "step": 20724 + }, + { + "epoch": 0.12325744599866781, + "grad_norm": 1.6628614664077759, + "learning_rate": 4.814919234224062e-05, + "loss": 5.4054, + "step": 20725 + }, + { + "epoch": 0.12326339328194881, + "grad_norm": 1.7059345245361328, + "learning_rate": 4.814901596051217e-05, + "loss": 5.3205, + "step": 20726 + }, + { + "epoch": 0.1232693405652298, + "grad_norm": 1.5989772081375122, + "learning_rate": 4.814883957070264e-05, + "loss": 5.0841, + "step": 20727 + }, + { + "epoch": 0.1232752878485108, + "grad_norm": 1.3816654682159424, + "learning_rate": 4.814866317281209e-05, + "loss": 4.9146, + "step": 20728 + }, + { + "epoch": 0.1232812351317918, + "grad_norm": 1.3992705345153809, + "learning_rate": 4.814848676684058e-05, + "loss": 4.8416, + "step": 20729 + }, + { + "epoch": 0.12328718241507279, + "grad_norm": 1.7377054691314697, + "learning_rate": 4.814831035278818e-05, + "loss": 5.3636, + "step": 20730 + }, + { + "epoch": 0.12329312969835379, + "grad_norm": 2.1461470127105713, + "learning_rate": 4.814813393065494e-05, + "loss": 5.7162, + "step": 20731 + }, + { + "epoch": 0.1232990769816348, + "grad_norm": 1.7310097217559814, + "learning_rate": 4.814795750044092e-05, + "loss": 5.7005, + "step": 20732 + }, + { + "epoch": 0.12330502426491578, + "grad_norm": 1.678813099861145, + "learning_rate": 4.814778106214619e-05, + "loss": 5.8184, + "step": 20733 + }, + { + "epoch": 0.12331097154819678, + "grad_norm": 1.7520476579666138, + "learning_rate": 4.814760461577081e-05, + "loss": 5.5746, + "step": 20734 + }, + { + "epoch": 0.12331691883147779, + "grad_norm": 1.6140379905700684, + "learning_rate": 4.8147428161314846e-05, + "loss": 5.4311, + "step": 20735 + }, + { + "epoch": 0.12332286611475878, + "grad_norm": 1.5862205028533936, + "learning_rate": 4.814725169877834e-05, + "loss": 5.5008, + "step": 20736 + }, + { + "epoch": 0.12332881339803978, + "grad_norm": 1.5568691492080688, + "learning_rate": 4.814707522816138e-05, + "loss": 5.5164, + "step": 20737 + }, + { + "epoch": 0.12333476068132078, + "grad_norm": 1.245606780052185, + "learning_rate": 4.814689874946401e-05, + "loss": 5.4217, + "step": 20738 + }, + { + "epoch": 0.12334070796460177, + "grad_norm": 1.3054754734039307, + "learning_rate": 4.8146722262686294e-05, + "loss": 5.4749, + "step": 20739 + }, + { + "epoch": 0.12334665524788277, + "grad_norm": 1.5772032737731934, + "learning_rate": 4.81465457678283e-05, + "loss": 5.7249, + "step": 20740 + }, + { + "epoch": 0.12335260253116376, + "grad_norm": 1.469688057899475, + "learning_rate": 4.814636926489009e-05, + "loss": 5.8515, + "step": 20741 + }, + { + "epoch": 0.12335854981444476, + "grad_norm": 2.3438186645507812, + "learning_rate": 4.814619275387172e-05, + "loss": 4.7599, + "step": 20742 + }, + { + "epoch": 0.12336449709772576, + "grad_norm": 2.4038238525390625, + "learning_rate": 4.814601623477325e-05, + "loss": 4.5717, + "step": 20743 + }, + { + "epoch": 0.12337044438100675, + "grad_norm": 2.773898124694824, + "learning_rate": 4.8145839707594745e-05, + "loss": 4.4889, + "step": 20744 + }, + { + "epoch": 0.12337639166428775, + "grad_norm": 2.863701820373535, + "learning_rate": 4.814566317233626e-05, + "loss": 4.5076, + "step": 20745 + }, + { + "epoch": 0.12338233894756875, + "grad_norm": 2.066301107406616, + "learning_rate": 4.8145486628997875e-05, + "loss": 4.8112, + "step": 20746 + }, + { + "epoch": 0.12338828623084974, + "grad_norm": 2.307910680770874, + "learning_rate": 4.814531007757963e-05, + "loss": 4.3896, + "step": 20747 + }, + { + "epoch": 0.12339423351413074, + "grad_norm": 2.2435505390167236, + "learning_rate": 4.81451335180816e-05, + "loss": 4.6403, + "step": 20748 + }, + { + "epoch": 0.12340018079741175, + "grad_norm": 2.4653170108795166, + "learning_rate": 4.814495695050385e-05, + "loss": 4.4737, + "step": 20749 + }, + { + "epoch": 0.12340612808069273, + "grad_norm": 2.3770196437835693, + "learning_rate": 4.814478037484643e-05, + "loss": 4.4951, + "step": 20750 + }, + { + "epoch": 0.12341207536397374, + "grad_norm": 1.8455066680908203, + "learning_rate": 4.81446037911094e-05, + "loss": 5.2646, + "step": 20751 + }, + { + "epoch": 0.12341802264725474, + "grad_norm": 1.6683069467544556, + "learning_rate": 4.814442719929283e-05, + "loss": 5.4287, + "step": 20752 + }, + { + "epoch": 0.12342396993053573, + "grad_norm": 1.4904793500900269, + "learning_rate": 4.814425059939679e-05, + "loss": 4.9993, + "step": 20753 + }, + { + "epoch": 0.12342991721381673, + "grad_norm": 1.5601847171783447, + "learning_rate": 4.8144073991421326e-05, + "loss": 5.1637, + "step": 20754 + }, + { + "epoch": 0.12343586449709773, + "grad_norm": 1.8937057256698608, + "learning_rate": 4.8143897375366496e-05, + "loss": 4.6928, + "step": 20755 + }, + { + "epoch": 0.12344181178037872, + "grad_norm": 1.8150557279586792, + "learning_rate": 4.814372075123238e-05, + "loss": 5.8257, + "step": 20756 + }, + { + "epoch": 0.12344775906365972, + "grad_norm": 1.537091612815857, + "learning_rate": 4.814354411901902e-05, + "loss": 5.0506, + "step": 20757 + }, + { + "epoch": 0.12345370634694072, + "grad_norm": 1.9722800254821777, + "learning_rate": 4.8143367478726495e-05, + "loss": 4.2019, + "step": 20758 + }, + { + "epoch": 0.12345965363022171, + "grad_norm": 1.9497390985488892, + "learning_rate": 4.8143190830354865e-05, + "loss": 4.2974, + "step": 20759 + }, + { + "epoch": 0.12346560091350271, + "grad_norm": 1.877036690711975, + "learning_rate": 4.814301417390418e-05, + "loss": 4.1039, + "step": 20760 + }, + { + "epoch": 0.12347154819678371, + "grad_norm": 1.932218313217163, + "learning_rate": 4.814283750937451e-05, + "loss": 4.3427, + "step": 20761 + }, + { + "epoch": 0.1234774954800647, + "grad_norm": 2.175657272338867, + "learning_rate": 4.814266083676591e-05, + "loss": 4.6891, + "step": 20762 + }, + { + "epoch": 0.1234834427633457, + "grad_norm": 1.7364848852157593, + "learning_rate": 4.8142484156078456e-05, + "loss": 4.4825, + "step": 20763 + }, + { + "epoch": 0.1234893900466267, + "grad_norm": 1.7598278522491455, + "learning_rate": 4.8142307467312184e-05, + "loss": 4.0782, + "step": 20764 + }, + { + "epoch": 0.1234953373299077, + "grad_norm": 1.9056943655014038, + "learning_rate": 4.814213077046719e-05, + "loss": 4.245, + "step": 20765 + }, + { + "epoch": 0.1235012846131887, + "grad_norm": 1.8974699974060059, + "learning_rate": 4.8141954065543506e-05, + "loss": 4.0707, + "step": 20766 + }, + { + "epoch": 0.1235072318964697, + "grad_norm": 1.9884151220321655, + "learning_rate": 4.814177735254121e-05, + "loss": 4.1443, + "step": 20767 + }, + { + "epoch": 0.12351317917975069, + "grad_norm": 1.952216625213623, + "learning_rate": 4.814160063146035e-05, + "loss": 4.6248, + "step": 20768 + }, + { + "epoch": 0.12351912646303169, + "grad_norm": 2.537240743637085, + "learning_rate": 4.814142390230101e-05, + "loss": 4.8936, + "step": 20769 + }, + { + "epoch": 0.12352507374631268, + "grad_norm": 1.6106029748916626, + "learning_rate": 4.814124716506322e-05, + "loss": 5.9498, + "step": 20770 + }, + { + "epoch": 0.12353102102959368, + "grad_norm": 2.3211259841918945, + "learning_rate": 4.814107041974707e-05, + "loss": 4.634, + "step": 20771 + }, + { + "epoch": 0.12353696831287468, + "grad_norm": 2.1425933837890625, + "learning_rate": 4.814089366635261e-05, + "loss": 4.9106, + "step": 20772 + }, + { + "epoch": 0.12354291559615567, + "grad_norm": 1.9194071292877197, + "learning_rate": 4.814071690487991e-05, + "loss": 4.9044, + "step": 20773 + }, + { + "epoch": 0.12354886287943667, + "grad_norm": 2.2048282623291016, + "learning_rate": 4.814054013532902e-05, + "loss": 4.7123, + "step": 20774 + }, + { + "epoch": 0.12355481016271767, + "grad_norm": 2.1015446186065674, + "learning_rate": 4.8140363357700004e-05, + "loss": 4.6005, + "step": 20775 + }, + { + "epoch": 0.12356075744599866, + "grad_norm": 2.133510112762451, + "learning_rate": 4.814018657199293e-05, + "loss": 5.1534, + "step": 20776 + }, + { + "epoch": 0.12356670472927966, + "grad_norm": 2.050220012664795, + "learning_rate": 4.814000977820785e-05, + "loss": 4.8997, + "step": 20777 + }, + { + "epoch": 0.12357265201256067, + "grad_norm": 2.0189473628997803, + "learning_rate": 4.8139832976344836e-05, + "loss": 4.6096, + "step": 20778 + }, + { + "epoch": 0.12357859929584165, + "grad_norm": 2.515733242034912, + "learning_rate": 4.813965616640395e-05, + "loss": 4.7096, + "step": 20779 + }, + { + "epoch": 0.12358454657912266, + "grad_norm": 2.062140941619873, + "learning_rate": 4.813947934838524e-05, + "loss": 4.8037, + "step": 20780 + }, + { + "epoch": 0.12359049386240366, + "grad_norm": 2.0707905292510986, + "learning_rate": 4.8139302522288776e-05, + "loss": 5.3148, + "step": 20781 + }, + { + "epoch": 0.12359644114568465, + "grad_norm": 2.0126004219055176, + "learning_rate": 4.813912568811463e-05, + "loss": 5.522, + "step": 20782 + }, + { + "epoch": 0.12360238842896565, + "grad_norm": 1.9760699272155762, + "learning_rate": 4.8138948845862855e-05, + "loss": 5.2751, + "step": 20783 + }, + { + "epoch": 0.12360833571224665, + "grad_norm": 1.6164956092834473, + "learning_rate": 4.81387719955335e-05, + "loss": 5.4444, + "step": 20784 + }, + { + "epoch": 0.12361428299552764, + "grad_norm": 1.7360550165176392, + "learning_rate": 4.8138595137126645e-05, + "loss": 4.7908, + "step": 20785 + }, + { + "epoch": 0.12362023027880864, + "grad_norm": 1.691304087638855, + "learning_rate": 4.813841827064235e-05, + "loss": 5.4206, + "step": 20786 + }, + { + "epoch": 0.12362617756208964, + "grad_norm": 1.685165524482727, + "learning_rate": 4.813824139608066e-05, + "loss": 4.457, + "step": 20787 + }, + { + "epoch": 0.12363212484537063, + "grad_norm": 2.114884376525879, + "learning_rate": 4.813806451344166e-05, + "loss": 4.8126, + "step": 20788 + }, + { + "epoch": 0.12363807212865163, + "grad_norm": 2.084394693374634, + "learning_rate": 4.81378876227254e-05, + "loss": 4.6486, + "step": 20789 + }, + { + "epoch": 0.12364401941193263, + "grad_norm": 1.901607871055603, + "learning_rate": 4.813771072393194e-05, + "loss": 4.3079, + "step": 20790 + }, + { + "epoch": 0.12364996669521362, + "grad_norm": 1.8139945268630981, + "learning_rate": 4.8137533817061345e-05, + "loss": 4.2445, + "step": 20791 + }, + { + "epoch": 0.12365591397849462, + "grad_norm": 1.8131442070007324, + "learning_rate": 4.8137356902113674e-05, + "loss": 4.1701, + "step": 20792 + }, + { + "epoch": 0.12366186126177563, + "grad_norm": 1.7977681159973145, + "learning_rate": 4.8137179979088995e-05, + "loss": 4.1976, + "step": 20793 + }, + { + "epoch": 0.12366780854505662, + "grad_norm": 1.78773832321167, + "learning_rate": 4.813700304798736e-05, + "loss": 4.0982, + "step": 20794 + }, + { + "epoch": 0.12367375582833762, + "grad_norm": 1.9300304651260376, + "learning_rate": 4.8136826108808844e-05, + "loss": 4.0887, + "step": 20795 + }, + { + "epoch": 0.12367970311161862, + "grad_norm": 1.8883346319198608, + "learning_rate": 4.813664916155349e-05, + "loss": 5.0699, + "step": 20796 + }, + { + "epoch": 0.12368565039489961, + "grad_norm": 1.9141865968704224, + "learning_rate": 4.813647220622137e-05, + "loss": 4.6133, + "step": 20797 + }, + { + "epoch": 0.12369159767818061, + "grad_norm": 2.074240207672119, + "learning_rate": 4.813629524281256e-05, + "loss": 4.2272, + "step": 20798 + }, + { + "epoch": 0.12369754496146161, + "grad_norm": 1.9218412637710571, + "learning_rate": 4.81361182713271e-05, + "loss": 4.2612, + "step": 20799 + }, + { + "epoch": 0.1237034922447426, + "grad_norm": 2.3334543704986572, + "learning_rate": 4.8135941291765066e-05, + "loss": 5.4561, + "step": 20800 + }, + { + "epoch": 0.1237094395280236, + "grad_norm": 2.1329383850097656, + "learning_rate": 4.8135764304126504e-05, + "loss": 4.8373, + "step": 20801 + }, + { + "epoch": 0.12371538681130459, + "grad_norm": 2.2241666316986084, + "learning_rate": 4.81355873084115e-05, + "loss": 4.5995, + "step": 20802 + }, + { + "epoch": 0.12372133409458559, + "grad_norm": 1.448601245880127, + "learning_rate": 4.8135410304620086e-05, + "loss": 6.0327, + "step": 20803 + }, + { + "epoch": 0.1237272813778666, + "grad_norm": 2.05168080329895, + "learning_rate": 4.8135233292752344e-05, + "loss": 4.8944, + "step": 20804 + }, + { + "epoch": 0.12373322866114758, + "grad_norm": 1.9282878637313843, + "learning_rate": 4.813505627280834e-05, + "loss": 5.1704, + "step": 20805 + }, + { + "epoch": 0.12373917594442858, + "grad_norm": 1.892562747001648, + "learning_rate": 4.813487924478812e-05, + "loss": 5.3674, + "step": 20806 + }, + { + "epoch": 0.12374512322770959, + "grad_norm": 1.866495132446289, + "learning_rate": 4.813470220869175e-05, + "loss": 5.3585, + "step": 20807 + }, + { + "epoch": 0.12375107051099057, + "grad_norm": 1.8725072145462036, + "learning_rate": 4.81345251645193e-05, + "loss": 5.0933, + "step": 20808 + }, + { + "epoch": 0.12375701779427158, + "grad_norm": 1.486983299255371, + "learning_rate": 4.8134348112270825e-05, + "loss": 5.1869, + "step": 20809 + }, + { + "epoch": 0.12376296507755258, + "grad_norm": 1.5050567388534546, + "learning_rate": 4.813417105194639e-05, + "loss": 5.1382, + "step": 20810 + }, + { + "epoch": 0.12376891236083357, + "grad_norm": 1.629869818687439, + "learning_rate": 4.813399398354605e-05, + "loss": 5.3847, + "step": 20811 + }, + { + "epoch": 0.12377485964411457, + "grad_norm": 1.749213695526123, + "learning_rate": 4.813381690706987e-05, + "loss": 4.8655, + "step": 20812 + }, + { + "epoch": 0.12378080692739557, + "grad_norm": 1.734803318977356, + "learning_rate": 4.813363982251792e-05, + "loss": 5.2059, + "step": 20813 + }, + { + "epoch": 0.12378675421067656, + "grad_norm": 1.8050858974456787, + "learning_rate": 4.813346272989024e-05, + "loss": 5.1364, + "step": 20814 + }, + { + "epoch": 0.12379270149395756, + "grad_norm": 1.6926177740097046, + "learning_rate": 4.813328562918692e-05, + "loss": 4.969, + "step": 20815 + }, + { + "epoch": 0.12379864877723856, + "grad_norm": 1.9767627716064453, + "learning_rate": 4.813310852040801e-05, + "loss": 5.1043, + "step": 20816 + }, + { + "epoch": 0.12380459606051955, + "grad_norm": 1.5432230234146118, + "learning_rate": 4.813293140355357e-05, + "loss": 5.0858, + "step": 20817 + }, + { + "epoch": 0.12381054334380055, + "grad_norm": 1.5301191806793213, + "learning_rate": 4.813275427862366e-05, + "loss": 5.2312, + "step": 20818 + }, + { + "epoch": 0.12381649062708155, + "grad_norm": 1.6347124576568604, + "learning_rate": 4.813257714561835e-05, + "loss": 5.1701, + "step": 20819 + }, + { + "epoch": 0.12382243791036254, + "grad_norm": 2.1260578632354736, + "learning_rate": 4.813240000453769e-05, + "loss": 5.3055, + "step": 20820 + }, + { + "epoch": 0.12382838519364354, + "grad_norm": 2.0905344486236572, + "learning_rate": 4.813222285538175e-05, + "loss": 5.1265, + "step": 20821 + }, + { + "epoch": 0.12383433247692455, + "grad_norm": 1.8773592710494995, + "learning_rate": 4.81320456981506e-05, + "loss": 5.1409, + "step": 20822 + }, + { + "epoch": 0.12384027976020553, + "grad_norm": 1.9149075746536255, + "learning_rate": 4.8131868532844275e-05, + "loss": 5.1855, + "step": 20823 + }, + { + "epoch": 0.12384622704348654, + "grad_norm": 2.0494494438171387, + "learning_rate": 4.813169135946286e-05, + "loss": 5.2561, + "step": 20824 + }, + { + "epoch": 0.12385217432676754, + "grad_norm": 1.9590463638305664, + "learning_rate": 4.8131514178006417e-05, + "loss": 5.0764, + "step": 20825 + }, + { + "epoch": 0.12385812161004853, + "grad_norm": 2.5940022468566895, + "learning_rate": 4.8131336988475e-05, + "loss": 4.42, + "step": 20826 + }, + { + "epoch": 0.12386406889332953, + "grad_norm": 2.135793924331665, + "learning_rate": 4.8131159790868665e-05, + "loss": 4.653, + "step": 20827 + }, + { + "epoch": 0.12387001617661053, + "grad_norm": 2.1380679607391357, + "learning_rate": 4.813098258518748e-05, + "loss": 4.7332, + "step": 20828 + }, + { + "epoch": 0.12387596345989152, + "grad_norm": 2.264723300933838, + "learning_rate": 4.8130805371431513e-05, + "loss": 4.8735, + "step": 20829 + }, + { + "epoch": 0.12388191074317252, + "grad_norm": 2.4449269771575928, + "learning_rate": 4.813062814960082e-05, + "loss": 3.6335, + "step": 20830 + }, + { + "epoch": 0.12388785802645351, + "grad_norm": 2.5718894004821777, + "learning_rate": 4.813045091969547e-05, + "loss": 3.8212, + "step": 20831 + }, + { + "epoch": 0.12389380530973451, + "grad_norm": 1.9600555896759033, + "learning_rate": 4.813027368171551e-05, + "loss": 5.7456, + "step": 20832 + }, + { + "epoch": 0.12389975259301551, + "grad_norm": 2.032362699508667, + "learning_rate": 4.813009643566101e-05, + "loss": 5.3087, + "step": 20833 + }, + { + "epoch": 0.1239056998762965, + "grad_norm": 2.0349206924438477, + "learning_rate": 4.8129919181532036e-05, + "loss": 5.0988, + "step": 20834 + }, + { + "epoch": 0.1239116471595775, + "grad_norm": 2.811582565307617, + "learning_rate": 4.812974191932864e-05, + "loss": 4.4085, + "step": 20835 + }, + { + "epoch": 0.1239175944428585, + "grad_norm": 1.8748958110809326, + "learning_rate": 4.8129564649050904e-05, + "loss": 5.3469, + "step": 20836 + }, + { + "epoch": 0.1239235417261395, + "grad_norm": 2.162895917892456, + "learning_rate": 4.8129387370698865e-05, + "loss": 5.4258, + "step": 20837 + }, + { + "epoch": 0.1239294890094205, + "grad_norm": 1.60780668258667, + "learning_rate": 4.8129210084272596e-05, + "loss": 5.4865, + "step": 20838 + }, + { + "epoch": 0.1239354362927015, + "grad_norm": 1.8906630277633667, + "learning_rate": 4.812903278977216e-05, + "loss": 5.3286, + "step": 20839 + }, + { + "epoch": 0.12394138357598249, + "grad_norm": 1.5469995737075806, + "learning_rate": 4.812885548719762e-05, + "loss": 5.4524, + "step": 20840 + }, + { + "epoch": 0.12394733085926349, + "grad_norm": 1.632104754447937, + "learning_rate": 4.8128678176549034e-05, + "loss": 5.4239, + "step": 20841 + }, + { + "epoch": 0.12395327814254449, + "grad_norm": 1.9250766038894653, + "learning_rate": 4.812850085782646e-05, + "loss": 5.3333, + "step": 20842 + }, + { + "epoch": 0.12395922542582548, + "grad_norm": 1.5831308364868164, + "learning_rate": 4.8128323531029974e-05, + "loss": 5.38, + "step": 20843 + }, + { + "epoch": 0.12396517270910648, + "grad_norm": 1.8450974225997925, + "learning_rate": 4.812814619615963e-05, + "loss": 5.1909, + "step": 20844 + }, + { + "epoch": 0.12397111999238748, + "grad_norm": 1.990018367767334, + "learning_rate": 4.8127968853215485e-05, + "loss": 5.2392, + "step": 20845 + }, + { + "epoch": 0.12397706727566847, + "grad_norm": 1.7380045652389526, + "learning_rate": 4.812779150219761e-05, + "loss": 5.4486, + "step": 20846 + }, + { + "epoch": 0.12398301455894947, + "grad_norm": 1.6080845594406128, + "learning_rate": 4.812761414310605e-05, + "loss": 6.0048, + "step": 20847 + }, + { + "epoch": 0.12398896184223047, + "grad_norm": 1.2336721420288086, + "learning_rate": 4.8127436775940884e-05, + "loss": 5.8988, + "step": 20848 + }, + { + "epoch": 0.12399490912551146, + "grad_norm": 1.3851333856582642, + "learning_rate": 4.8127259400702173e-05, + "loss": 6.0162, + "step": 20849 + }, + { + "epoch": 0.12400085640879246, + "grad_norm": 1.3938422203063965, + "learning_rate": 4.8127082017389965e-05, + "loss": 5.9186, + "step": 20850 + }, + { + "epoch": 0.12400680369207347, + "grad_norm": 1.6463207006454468, + "learning_rate": 4.812690462600434e-05, + "loss": 5.9684, + "step": 20851 + }, + { + "epoch": 0.12401275097535445, + "grad_norm": 1.4180574417114258, + "learning_rate": 4.8126727226545353e-05, + "loss": 5.9383, + "step": 20852 + }, + { + "epoch": 0.12401869825863546, + "grad_norm": 1.3431847095489502, + "learning_rate": 4.8126549819013065e-05, + "loss": 5.862, + "step": 20853 + }, + { + "epoch": 0.12402464554191646, + "grad_norm": 1.3493611812591553, + "learning_rate": 4.812637240340753e-05, + "loss": 5.8796, + "step": 20854 + }, + { + "epoch": 0.12403059282519745, + "grad_norm": 1.2833929061889648, + "learning_rate": 4.812619497972882e-05, + "loss": 5.7322, + "step": 20855 + }, + { + "epoch": 0.12403654010847845, + "grad_norm": 1.4494770765304565, + "learning_rate": 4.8126017547977e-05, + "loss": 5.6871, + "step": 20856 + }, + { + "epoch": 0.12404248739175945, + "grad_norm": 1.9750009775161743, + "learning_rate": 4.812584010815212e-05, + "loss": 5.4744, + "step": 20857 + }, + { + "epoch": 0.12404843467504044, + "grad_norm": 2.2873501777648926, + "learning_rate": 4.812566266025425e-05, + "loss": 4.7326, + "step": 20858 + }, + { + "epoch": 0.12405438195832144, + "grad_norm": 2.3699395656585693, + "learning_rate": 4.8125485204283446e-05, + "loss": 5.1084, + "step": 20859 + }, + { + "epoch": 0.12406032924160243, + "grad_norm": 2.3874311447143555, + "learning_rate": 4.812530774023978e-05, + "loss": 4.7226, + "step": 20860 + }, + { + "epoch": 0.12406627652488343, + "grad_norm": 1.6285946369171143, + "learning_rate": 4.8125130268123305e-05, + "loss": 5.4695, + "step": 20861 + }, + { + "epoch": 0.12407222380816443, + "grad_norm": 1.5346466302871704, + "learning_rate": 4.8124952787934096e-05, + "loss": 5.5105, + "step": 20862 + }, + { + "epoch": 0.12407817109144542, + "grad_norm": 1.7935290336608887, + "learning_rate": 4.8124775299672195e-05, + "loss": 5.2028, + "step": 20863 + }, + { + "epoch": 0.12408411837472642, + "grad_norm": 1.7893015146255493, + "learning_rate": 4.812459780333767e-05, + "loss": 5.1571, + "step": 20864 + }, + { + "epoch": 0.12409006565800743, + "grad_norm": 1.6904758214950562, + "learning_rate": 4.8124420298930596e-05, + "loss": 6.0024, + "step": 20865 + }, + { + "epoch": 0.12409601294128841, + "grad_norm": 1.7721166610717773, + "learning_rate": 4.812424278645102e-05, + "loss": 5.8716, + "step": 20866 + }, + { + "epoch": 0.12410196022456942, + "grad_norm": 1.5822969675064087, + "learning_rate": 4.812406526589901e-05, + "loss": 5.7984, + "step": 20867 + }, + { + "epoch": 0.12410790750785042, + "grad_norm": 1.713592290878296, + "learning_rate": 4.8123887737274634e-05, + "loss": 5.7348, + "step": 20868 + }, + { + "epoch": 0.1241138547911314, + "grad_norm": 1.54501473903656, + "learning_rate": 4.812371020057794e-05, + "loss": 5.7012, + "step": 20869 + }, + { + "epoch": 0.12411980207441241, + "grad_norm": 1.2782925367355347, + "learning_rate": 4.8123532655809e-05, + "loss": 5.6171, + "step": 20870 + }, + { + "epoch": 0.12412574935769341, + "grad_norm": 1.357879638671875, + "learning_rate": 4.812335510296787e-05, + "loss": 5.7021, + "step": 20871 + }, + { + "epoch": 0.1241316966409744, + "grad_norm": 1.468440294265747, + "learning_rate": 4.812317754205462e-05, + "loss": 5.6863, + "step": 20872 + }, + { + "epoch": 0.1241376439242554, + "grad_norm": 2.965566396713257, + "learning_rate": 4.812299997306931e-05, + "loss": 5.3282, + "step": 20873 + }, + { + "epoch": 0.1241435912075364, + "grad_norm": 3.3760321140289307, + "learning_rate": 4.8122822396012e-05, + "loss": 5.0464, + "step": 20874 + }, + { + "epoch": 0.12414953849081739, + "grad_norm": 2.340055465698242, + "learning_rate": 4.8122644810882746e-05, + "loss": 4.6466, + "step": 20875 + }, + { + "epoch": 0.12415548577409839, + "grad_norm": 1.5659359693527222, + "learning_rate": 4.8122467217681615e-05, + "loss": 5.5262, + "step": 20876 + }, + { + "epoch": 0.1241614330573794, + "grad_norm": 1.9036263227462769, + "learning_rate": 4.812228961640868e-05, + "loss": 5.7474, + "step": 20877 + }, + { + "epoch": 0.12416738034066038, + "grad_norm": 1.8488661050796509, + "learning_rate": 4.812211200706398e-05, + "loss": 5.6901, + "step": 20878 + }, + { + "epoch": 0.12417332762394138, + "grad_norm": 1.7501896619796753, + "learning_rate": 4.8121934389647594e-05, + "loss": 5.9729, + "step": 20879 + }, + { + "epoch": 0.12417927490722239, + "grad_norm": 1.7495286464691162, + "learning_rate": 4.812175676415957e-05, + "loss": 5.4282, + "step": 20880 + }, + { + "epoch": 0.12418522219050337, + "grad_norm": 1.8494720458984375, + "learning_rate": 4.8121579130600005e-05, + "loss": 5.6148, + "step": 20881 + }, + { + "epoch": 0.12419116947378438, + "grad_norm": 1.860341191291809, + "learning_rate": 4.812140148896892e-05, + "loss": 5.6192, + "step": 20882 + }, + { + "epoch": 0.12419711675706538, + "grad_norm": 1.845438003540039, + "learning_rate": 4.8121223839266386e-05, + "loss": 5.4989, + "step": 20883 + }, + { + "epoch": 0.12420306404034637, + "grad_norm": 1.7625926733016968, + "learning_rate": 4.812104618149248e-05, + "loss": 5.4833, + "step": 20884 + }, + { + "epoch": 0.12420901132362737, + "grad_norm": 1.4869773387908936, + "learning_rate": 4.812086851564725e-05, + "loss": 5.6437, + "step": 20885 + }, + { + "epoch": 0.12421495860690837, + "grad_norm": 1.528306245803833, + "learning_rate": 4.812069084173077e-05, + "loss": 5.4938, + "step": 20886 + }, + { + "epoch": 0.12422090589018936, + "grad_norm": 1.28203284740448, + "learning_rate": 4.81205131597431e-05, + "loss": 5.5411, + "step": 20887 + }, + { + "epoch": 0.12422685317347036, + "grad_norm": 1.9413608312606812, + "learning_rate": 4.8120335469684285e-05, + "loss": 5.4842, + "step": 20888 + }, + { + "epoch": 0.12423280045675135, + "grad_norm": 1.8776315450668335, + "learning_rate": 4.812015777155441e-05, + "loss": 5.495, + "step": 20889 + }, + { + "epoch": 0.12423874774003235, + "grad_norm": 1.941171646118164, + "learning_rate": 4.8119980065353524e-05, + "loss": 5.7711, + "step": 20890 + }, + { + "epoch": 0.12424469502331335, + "grad_norm": 1.8312263488769531, + "learning_rate": 4.811980235108169e-05, + "loss": 5.5998, + "step": 20891 + }, + { + "epoch": 0.12425064230659434, + "grad_norm": 1.6940878629684448, + "learning_rate": 4.811962462873897e-05, + "loss": 5.9089, + "step": 20892 + }, + { + "epoch": 0.12425658958987534, + "grad_norm": 1.8769567012786865, + "learning_rate": 4.811944689832543e-05, + "loss": 5.5854, + "step": 20893 + }, + { + "epoch": 0.12426253687315635, + "grad_norm": 1.8289974927902222, + "learning_rate": 4.811926915984113e-05, + "loss": 5.4698, + "step": 20894 + }, + { + "epoch": 0.12426848415643733, + "grad_norm": 2.343961000442505, + "learning_rate": 4.811909141328613e-05, + "loss": 4.4474, + "step": 20895 + }, + { + "epoch": 0.12427443143971834, + "grad_norm": 1.9822384119033813, + "learning_rate": 4.8118913658660504e-05, + "loss": 4.9353, + "step": 20896 + }, + { + "epoch": 0.12428037872299934, + "grad_norm": 2.3056247234344482, + "learning_rate": 4.811873589596429e-05, + "loss": 4.7128, + "step": 20897 + }, + { + "epoch": 0.12428632600628033, + "grad_norm": 2.205653667449951, + "learning_rate": 4.811855812519758e-05, + "loss": 4.08, + "step": 20898 + }, + { + "epoch": 0.12429227328956133, + "grad_norm": 2.0141141414642334, + "learning_rate": 4.81183803463604e-05, + "loss": 4.2903, + "step": 20899 + }, + { + "epoch": 0.12429822057284233, + "grad_norm": 2.2912099361419678, + "learning_rate": 4.811820255945285e-05, + "loss": 4.7582, + "step": 20900 + }, + { + "epoch": 0.12430416785612332, + "grad_norm": 2.1577751636505127, + "learning_rate": 4.8118024764474965e-05, + "loss": 4.757, + "step": 20901 + }, + { + "epoch": 0.12431011513940432, + "grad_norm": 2.2851569652557373, + "learning_rate": 4.811784696142682e-05, + "loss": 3.9403, + "step": 20902 + }, + { + "epoch": 0.12431606242268532, + "grad_norm": 2.256500720977783, + "learning_rate": 4.8117669150308474e-05, + "loss": 4.3498, + "step": 20903 + }, + { + "epoch": 0.12432200970596631, + "grad_norm": 2.1631035804748535, + "learning_rate": 4.811749133111999e-05, + "loss": 4.6171, + "step": 20904 + }, + { + "epoch": 0.12432795698924731, + "grad_norm": 2.360530138015747, + "learning_rate": 4.811731350386142e-05, + "loss": 4.5958, + "step": 20905 + }, + { + "epoch": 0.12433390427252831, + "grad_norm": 2.031268835067749, + "learning_rate": 4.8117135668532845e-05, + "loss": 4.4466, + "step": 20906 + }, + { + "epoch": 0.1243398515558093, + "grad_norm": 1.7367441654205322, + "learning_rate": 4.811695782513431e-05, + "loss": 4.8605, + "step": 20907 + }, + { + "epoch": 0.1243457988390903, + "grad_norm": 2.5067267417907715, + "learning_rate": 4.8116779973665886e-05, + "loss": 4.0849, + "step": 20908 + }, + { + "epoch": 0.1243517461223713, + "grad_norm": 1.5404255390167236, + "learning_rate": 4.811660211412763e-05, + "loss": 4.4511, + "step": 20909 + }, + { + "epoch": 0.1243576934056523, + "grad_norm": 1.4191818237304688, + "learning_rate": 4.8116424246519606e-05, + "loss": 4.4274, + "step": 20910 + }, + { + "epoch": 0.1243636406889333, + "grad_norm": 1.4610079526901245, + "learning_rate": 4.811624637084189e-05, + "loss": 4.4112, + "step": 20911 + }, + { + "epoch": 0.1243695879722143, + "grad_norm": 1.3842167854309082, + "learning_rate": 4.811606848709452e-05, + "loss": 4.3019, + "step": 20912 + }, + { + "epoch": 0.12437553525549529, + "grad_norm": 1.4025331735610962, + "learning_rate": 4.811589059527757e-05, + "loss": 4.251, + "step": 20913 + }, + { + "epoch": 0.12438148253877629, + "grad_norm": 1.5034327507019043, + "learning_rate": 4.81157126953911e-05, + "loss": 4.1553, + "step": 20914 + }, + { + "epoch": 0.12438742982205729, + "grad_norm": 1.5153253078460693, + "learning_rate": 4.811553478743518e-05, + "loss": 4.1264, + "step": 20915 + }, + { + "epoch": 0.12439337710533828, + "grad_norm": 1.4300923347473145, + "learning_rate": 4.811535687140987e-05, + "loss": 4.2653, + "step": 20916 + }, + { + "epoch": 0.12439932438861928, + "grad_norm": 1.4667567014694214, + "learning_rate": 4.811517894731521e-05, + "loss": 4.2216, + "step": 20917 + }, + { + "epoch": 0.12440527167190027, + "grad_norm": 1.6324750185012817, + "learning_rate": 4.81150010151513e-05, + "loss": 4.3083, + "step": 20918 + }, + { + "epoch": 0.12441121895518127, + "grad_norm": 1.507516622543335, + "learning_rate": 4.8114823074918165e-05, + "loss": 4.1369, + "step": 20919 + }, + { + "epoch": 0.12441716623846227, + "grad_norm": 1.5365220308303833, + "learning_rate": 4.8114645126615886e-05, + "loss": 4.061, + "step": 20920 + }, + { + "epoch": 0.12442311352174326, + "grad_norm": 1.3880743980407715, + "learning_rate": 4.811446717024453e-05, + "loss": 4.2464, + "step": 20921 + }, + { + "epoch": 0.12442906080502426, + "grad_norm": 1.619391918182373, + "learning_rate": 4.8114289205804155e-05, + "loss": 4.0032, + "step": 20922 + }, + { + "epoch": 0.12443500808830527, + "grad_norm": 1.5912760496139526, + "learning_rate": 4.811411123329481e-05, + "loss": 3.9996, + "step": 20923 + }, + { + "epoch": 0.12444095537158625, + "grad_norm": 1.6042509078979492, + "learning_rate": 4.811393325271657e-05, + "loss": 3.9225, + "step": 20924 + }, + { + "epoch": 0.12444690265486726, + "grad_norm": 1.4620057344436646, + "learning_rate": 4.8113755264069505e-05, + "loss": 4.4391, + "step": 20925 + }, + { + "epoch": 0.12445284993814826, + "grad_norm": 1.6154197454452515, + "learning_rate": 4.811357726735366e-05, + "loss": 4.1254, + "step": 20926 + }, + { + "epoch": 0.12445879722142925, + "grad_norm": 1.520150065422058, + "learning_rate": 4.8113399262569104e-05, + "loss": 4.7638, + "step": 20927 + }, + { + "epoch": 0.12446474450471025, + "grad_norm": 1.5869375467300415, + "learning_rate": 4.81132212497159e-05, + "loss": 4.047, + "step": 20928 + }, + { + "epoch": 0.12447069178799125, + "grad_norm": 1.610819697380066, + "learning_rate": 4.8113043228794105e-05, + "loss": 4.0823, + "step": 20929 + }, + { + "epoch": 0.12447663907127224, + "grad_norm": 1.4962780475616455, + "learning_rate": 4.811286519980379e-05, + "loss": 5.4004, + "step": 20930 + }, + { + "epoch": 0.12448258635455324, + "grad_norm": 1.382641077041626, + "learning_rate": 4.811268716274501e-05, + "loss": 5.3129, + "step": 20931 + }, + { + "epoch": 0.12448853363783424, + "grad_norm": 1.3323496580123901, + "learning_rate": 4.811250911761783e-05, + "loss": 5.2123, + "step": 20932 + }, + { + "epoch": 0.12449448092111523, + "grad_norm": 1.4375461339950562, + "learning_rate": 4.811233106442231e-05, + "loss": 5.4249, + "step": 20933 + }, + { + "epoch": 0.12450042820439623, + "grad_norm": 1.6861125230789185, + "learning_rate": 4.811215300315852e-05, + "loss": 5.0697, + "step": 20934 + }, + { + "epoch": 0.12450637548767723, + "grad_norm": 1.52859365940094, + "learning_rate": 4.811197493382651e-05, + "loss": 5.5925, + "step": 20935 + }, + { + "epoch": 0.12451232277095822, + "grad_norm": 1.4931366443634033, + "learning_rate": 4.811179685642635e-05, + "loss": 5.4442, + "step": 20936 + }, + { + "epoch": 0.12451827005423922, + "grad_norm": 1.3825764656066895, + "learning_rate": 4.8111618770958104e-05, + "loss": 5.5773, + "step": 20937 + }, + { + "epoch": 0.12452421733752023, + "grad_norm": 1.3441286087036133, + "learning_rate": 4.811144067742183e-05, + "loss": 5.5421, + "step": 20938 + }, + { + "epoch": 0.12453016462080121, + "grad_norm": 1.2910594940185547, + "learning_rate": 4.811126257581758e-05, + "loss": 5.3507, + "step": 20939 + }, + { + "epoch": 0.12453611190408222, + "grad_norm": 1.3505282402038574, + "learning_rate": 4.811108446614544e-05, + "loss": 5.5285, + "step": 20940 + }, + { + "epoch": 0.12454205918736322, + "grad_norm": 1.4562500715255737, + "learning_rate": 4.811090634840546e-05, + "loss": 5.3592, + "step": 20941 + }, + { + "epoch": 0.1245480064706442, + "grad_norm": 1.4702924489974976, + "learning_rate": 4.8110728222597694e-05, + "loss": 5.2603, + "step": 20942 + }, + { + "epoch": 0.12455395375392521, + "grad_norm": 1.6397823095321655, + "learning_rate": 4.811055008872222e-05, + "loss": 5.222, + "step": 20943 + }, + { + "epoch": 0.12455990103720621, + "grad_norm": 1.5603538751602173, + "learning_rate": 4.811037194677908e-05, + "loss": 5.2075, + "step": 20944 + }, + { + "epoch": 0.1245658483204872, + "grad_norm": 1.3349683284759521, + "learning_rate": 4.811019379676835e-05, + "loss": 5.2903, + "step": 20945 + }, + { + "epoch": 0.1245717956037682, + "grad_norm": 1.348935842514038, + "learning_rate": 4.8110015638690096e-05, + "loss": 5.4688, + "step": 20946 + }, + { + "epoch": 0.12457774288704919, + "grad_norm": 1.4173049926757812, + "learning_rate": 4.810983747254437e-05, + "loss": 5.0299, + "step": 20947 + }, + { + "epoch": 0.12458369017033019, + "grad_norm": 1.3553805351257324, + "learning_rate": 4.8109659298331244e-05, + "loss": 5.0798, + "step": 20948 + }, + { + "epoch": 0.1245896374536112, + "grad_norm": 1.3770824670791626, + "learning_rate": 4.810948111605077e-05, + "loss": 4.807, + "step": 20949 + }, + { + "epoch": 0.12459558473689218, + "grad_norm": 1.3450689315795898, + "learning_rate": 4.810930292570302e-05, + "loss": 4.8061, + "step": 20950 + }, + { + "epoch": 0.12460153202017318, + "grad_norm": 1.4118422269821167, + "learning_rate": 4.8109124727288044e-05, + "loss": 5.203, + "step": 20951 + }, + { + "epoch": 0.12460747930345419, + "grad_norm": 1.4127706289291382, + "learning_rate": 4.810894652080592e-05, + "loss": 5.104, + "step": 20952 + }, + { + "epoch": 0.12461342658673517, + "grad_norm": 1.2636264562606812, + "learning_rate": 4.810876830625669e-05, + "loss": 4.9306, + "step": 20953 + }, + { + "epoch": 0.12461937387001618, + "grad_norm": 1.3846913576126099, + "learning_rate": 4.810859008364044e-05, + "loss": 4.8095, + "step": 20954 + }, + { + "epoch": 0.12462532115329718, + "grad_norm": 1.6017072200775146, + "learning_rate": 4.8108411852957216e-05, + "loss": 4.9926, + "step": 20955 + }, + { + "epoch": 0.12463126843657817, + "grad_norm": 1.5098768472671509, + "learning_rate": 4.8108233614207075e-05, + "loss": 5.3204, + "step": 20956 + }, + { + "epoch": 0.12463721571985917, + "grad_norm": 1.1792641878128052, + "learning_rate": 4.8108055367390097e-05, + "loss": 4.7596, + "step": 20957 + }, + { + "epoch": 0.12464316300314017, + "grad_norm": 1.3787871599197388, + "learning_rate": 4.8107877112506336e-05, + "loss": 5.0914, + "step": 20958 + }, + { + "epoch": 0.12464911028642116, + "grad_norm": 1.3097307682037354, + "learning_rate": 4.8107698849555846e-05, + "loss": 4.8154, + "step": 20959 + }, + { + "epoch": 0.12465505756970216, + "grad_norm": 1.4452660083770752, + "learning_rate": 4.810752057853871e-05, + "loss": 5.1395, + "step": 20960 + }, + { + "epoch": 0.12466100485298316, + "grad_norm": 1.4970120191574097, + "learning_rate": 4.8107342299454974e-05, + "loss": 4.8164, + "step": 20961 + }, + { + "epoch": 0.12466695213626415, + "grad_norm": 1.4092109203338623, + "learning_rate": 4.810716401230469e-05, + "loss": 4.9219, + "step": 20962 + }, + { + "epoch": 0.12467289941954515, + "grad_norm": 1.5558546781539917, + "learning_rate": 4.810698571708795e-05, + "loss": 4.8639, + "step": 20963 + }, + { + "epoch": 0.12467884670282615, + "grad_norm": 1.3631898164749146, + "learning_rate": 4.810680741380479e-05, + "loss": 5.2145, + "step": 20964 + }, + { + "epoch": 0.12468479398610714, + "grad_norm": 1.608810544013977, + "learning_rate": 4.8106629102455286e-05, + "loss": 5.2486, + "step": 20965 + }, + { + "epoch": 0.12469074126938814, + "grad_norm": 1.573190689086914, + "learning_rate": 4.81064507830395e-05, + "loss": 5.2476, + "step": 20966 + }, + { + "epoch": 0.12469668855266915, + "grad_norm": 1.5032795667648315, + "learning_rate": 4.810627245555748e-05, + "loss": 5.1557, + "step": 20967 + }, + { + "epoch": 0.12470263583595013, + "grad_norm": 1.3919012546539307, + "learning_rate": 4.810609412000931e-05, + "loss": 5.2812, + "step": 20968 + }, + { + "epoch": 0.12470858311923114, + "grad_norm": 1.417431354522705, + "learning_rate": 4.810591577639504e-05, + "loss": 5.3173, + "step": 20969 + }, + { + "epoch": 0.12471453040251214, + "grad_norm": 1.2135869264602661, + "learning_rate": 4.8105737424714724e-05, + "loss": 5.3511, + "step": 20970 + }, + { + "epoch": 0.12472047768579313, + "grad_norm": 1.3142472505569458, + "learning_rate": 4.810555906496844e-05, + "loss": 5.225, + "step": 20971 + }, + { + "epoch": 0.12472642496907413, + "grad_norm": 1.4344936609268188, + "learning_rate": 4.810538069715625e-05, + "loss": 5.5032, + "step": 20972 + }, + { + "epoch": 0.12473237225235513, + "grad_norm": 1.214281439781189, + "learning_rate": 4.81052023212782e-05, + "loss": 5.4466, + "step": 20973 + }, + { + "epoch": 0.12473831953563612, + "grad_norm": 1.5831886529922485, + "learning_rate": 4.810502393733437e-05, + "loss": 4.6211, + "step": 20974 + }, + { + "epoch": 0.12474426681891712, + "grad_norm": 1.6281508207321167, + "learning_rate": 4.8104845545324816e-05, + "loss": 4.6212, + "step": 20975 + }, + { + "epoch": 0.12475021410219811, + "grad_norm": 1.5753840208053589, + "learning_rate": 4.810466714524959e-05, + "loss": 4.7089, + "step": 20976 + }, + { + "epoch": 0.12475616138547911, + "grad_norm": 1.355692744255066, + "learning_rate": 4.810448873710877e-05, + "loss": 5.0399, + "step": 20977 + }, + { + "epoch": 0.12476210866876011, + "grad_norm": 1.27257239818573, + "learning_rate": 4.810431032090241e-05, + "loss": 4.7091, + "step": 20978 + }, + { + "epoch": 0.1247680559520411, + "grad_norm": 1.532210350036621, + "learning_rate": 4.810413189663058e-05, + "loss": 4.6682, + "step": 20979 + }, + { + "epoch": 0.1247740032353221, + "grad_norm": 1.4075580835342407, + "learning_rate": 4.810395346429333e-05, + "loss": 4.5135, + "step": 20980 + }, + { + "epoch": 0.1247799505186031, + "grad_norm": 1.3797897100448608, + "learning_rate": 4.810377502389073e-05, + "loss": 4.5548, + "step": 20981 + }, + { + "epoch": 0.1247858978018841, + "grad_norm": 1.4484235048294067, + "learning_rate": 4.810359657542284e-05, + "loss": 4.5336, + "step": 20982 + }, + { + "epoch": 0.1247918450851651, + "grad_norm": 1.4712706804275513, + "learning_rate": 4.810341811888972e-05, + "loss": 4.6805, + "step": 20983 + }, + { + "epoch": 0.1247977923684461, + "grad_norm": 1.548684000968933, + "learning_rate": 4.8103239654291444e-05, + "loss": 4.6239, + "step": 20984 + }, + { + "epoch": 0.12480373965172709, + "grad_norm": 1.481542944908142, + "learning_rate": 4.810306118162806e-05, + "loss": 4.981, + "step": 20985 + }, + { + "epoch": 0.12480968693500809, + "grad_norm": 1.423977017402649, + "learning_rate": 4.810288270089963e-05, + "loss": 5.1813, + "step": 20986 + }, + { + "epoch": 0.12481563421828909, + "grad_norm": 1.2712557315826416, + "learning_rate": 4.810270421210623e-05, + "loss": 5.1499, + "step": 20987 + }, + { + "epoch": 0.12482158150157008, + "grad_norm": 1.4444210529327393, + "learning_rate": 4.810252571524791e-05, + "loss": 5.1801, + "step": 20988 + }, + { + "epoch": 0.12482752878485108, + "grad_norm": 1.2743985652923584, + "learning_rate": 4.810234721032475e-05, + "loss": 5.1433, + "step": 20989 + }, + { + "epoch": 0.12483347606813208, + "grad_norm": 1.4066376686096191, + "learning_rate": 4.810216869733679e-05, + "loss": 5.1821, + "step": 20990 + }, + { + "epoch": 0.12483942335141307, + "grad_norm": 1.362889051437378, + "learning_rate": 4.81019901762841e-05, + "loss": 5.2135, + "step": 20991 + }, + { + "epoch": 0.12484537063469407, + "grad_norm": 1.2178412675857544, + "learning_rate": 4.810181164716674e-05, + "loss": 5.3131, + "step": 20992 + }, + { + "epoch": 0.12485131791797507, + "grad_norm": 1.7444922924041748, + "learning_rate": 4.8101633109984786e-05, + "loss": 4.8666, + "step": 20993 + }, + { + "epoch": 0.12485726520125606, + "grad_norm": 1.4151227474212646, + "learning_rate": 4.810145456473828e-05, + "loss": 5.0585, + "step": 20994 + }, + { + "epoch": 0.12486321248453706, + "grad_norm": 1.2906028032302856, + "learning_rate": 4.81012760114273e-05, + "loss": 5.1402, + "step": 20995 + }, + { + "epoch": 0.12486915976781807, + "grad_norm": 1.4265183210372925, + "learning_rate": 4.8101097450051906e-05, + "loss": 5.184, + "step": 20996 + }, + { + "epoch": 0.12487510705109905, + "grad_norm": 1.499804139137268, + "learning_rate": 4.8100918880612154e-05, + "loss": 4.9952, + "step": 20997 + }, + { + "epoch": 0.12488105433438006, + "grad_norm": 1.5296711921691895, + "learning_rate": 4.810074030310812e-05, + "loss": 4.9743, + "step": 20998 + }, + { + "epoch": 0.12488700161766106, + "grad_norm": 1.4345946311950684, + "learning_rate": 4.810056171753984e-05, + "loss": 4.9107, + "step": 20999 + }, + { + "epoch": 0.12489294890094205, + "grad_norm": 1.501966953277588, + "learning_rate": 4.81003831239074e-05, + "loss": 4.8123, + "step": 21000 + }, + { + "epoch": 0.12489889618422305, + "grad_norm": 1.1865864992141724, + "learning_rate": 4.810020452221086e-05, + "loss": 5.1614, + "step": 21001 + }, + { + "epoch": 0.12490484346750405, + "grad_norm": 1.345996379852295, + "learning_rate": 4.810002591245027e-05, + "loss": 4.9784, + "step": 21002 + }, + { + "epoch": 0.12491079075078504, + "grad_norm": 1.2252000570297241, + "learning_rate": 4.80998472946257e-05, + "loss": 4.9433, + "step": 21003 + }, + { + "epoch": 0.12491673803406604, + "grad_norm": 1.4540387392044067, + "learning_rate": 4.809966866873722e-05, + "loss": 4.8608, + "step": 21004 + }, + { + "epoch": 0.12492268531734703, + "grad_norm": 1.382969617843628, + "learning_rate": 4.809949003478488e-05, + "loss": 4.8168, + "step": 21005 + }, + { + "epoch": 0.12492863260062803, + "grad_norm": 1.3642408847808838, + "learning_rate": 4.809931139276874e-05, + "loss": 4.9262, + "step": 21006 + }, + { + "epoch": 0.12493457988390903, + "grad_norm": 1.1903620958328247, + "learning_rate": 4.809913274268887e-05, + "loss": 5.1817, + "step": 21007 + }, + { + "epoch": 0.12494052716719002, + "grad_norm": 1.3020774126052856, + "learning_rate": 4.809895408454534e-05, + "loss": 4.956, + "step": 21008 + }, + { + "epoch": 0.12494647445047102, + "grad_norm": 1.3209398984909058, + "learning_rate": 4.80987754183382e-05, + "loss": 4.9542, + "step": 21009 + }, + { + "epoch": 0.12495242173375203, + "grad_norm": 1.2684825658798218, + "learning_rate": 4.809859674406752e-05, + "loss": 5.2919, + "step": 21010 + }, + { + "epoch": 0.12495836901703301, + "grad_norm": 1.271053671836853, + "learning_rate": 4.809841806173335e-05, + "loss": 5.1397, + "step": 21011 + }, + { + "epoch": 0.12496431630031402, + "grad_norm": 1.2137185335159302, + "learning_rate": 4.809823937133576e-05, + "loss": 5.1874, + "step": 21012 + }, + { + "epoch": 0.12497026358359502, + "grad_norm": 1.2429122924804688, + "learning_rate": 4.8098060672874825e-05, + "loss": 5.0626, + "step": 21013 + }, + { + "epoch": 0.124976210866876, + "grad_norm": 1.3292062282562256, + "learning_rate": 4.809788196635058e-05, + "loss": 4.9019, + "step": 21014 + }, + { + "epoch": 0.12498215815015701, + "grad_norm": 1.3801854848861694, + "learning_rate": 4.8097703251763115e-05, + "loss": 4.8948, + "step": 21015 + }, + { + "epoch": 0.12498810543343801, + "grad_norm": 1.1259671449661255, + "learning_rate": 4.8097524529112484e-05, + "loss": 4.8041, + "step": 21016 + }, + { + "epoch": 0.124994052716719, + "grad_norm": 1.145451307296753, + "learning_rate": 4.809734579839873e-05, + "loss": 5.0012, + "step": 21017 + }, + { + "epoch": 0.125, + "grad_norm": 2.0128631591796875, + "learning_rate": 4.8097167059621945e-05, + "loss": 5.5174, + "step": 21018 + }, + { + "epoch": 0.125005947283281, + "grad_norm": 1.2371736764907837, + "learning_rate": 4.8096988312782174e-05, + "loss": 4.9491, + "step": 21019 + }, + { + "epoch": 0.125011894566562, + "grad_norm": 1.4009771347045898, + "learning_rate": 4.809680955787948e-05, + "loss": 4.8699, + "step": 21020 + }, + { + "epoch": 0.125017841849843, + "grad_norm": 1.2181386947631836, + "learning_rate": 4.809663079491393e-05, + "loss": 4.8258, + "step": 21021 + }, + { + "epoch": 0.12502378913312398, + "grad_norm": 1.3663759231567383, + "learning_rate": 4.809645202388559e-05, + "loss": 5.085, + "step": 21022 + }, + { + "epoch": 0.125029736416405, + "grad_norm": 1.4783004522323608, + "learning_rate": 4.809627324479451e-05, + "loss": 5.0309, + "step": 21023 + }, + { + "epoch": 0.12503568369968598, + "grad_norm": 1.5568218231201172, + "learning_rate": 4.809609445764076e-05, + "loss": 5.217, + "step": 21024 + }, + { + "epoch": 0.12504163098296697, + "grad_norm": 1.42091965675354, + "learning_rate": 4.80959156624244e-05, + "loss": 5.1213, + "step": 21025 + }, + { + "epoch": 0.125047578266248, + "grad_norm": 1.5361231565475464, + "learning_rate": 4.8095736859145504e-05, + "loss": 5.1539, + "step": 21026 + }, + { + "epoch": 0.12505352554952898, + "grad_norm": 1.4799479246139526, + "learning_rate": 4.809555804780411e-05, + "loss": 5.0524, + "step": 21027 + }, + { + "epoch": 0.12505947283280996, + "grad_norm": 1.379309892654419, + "learning_rate": 4.809537922840031e-05, + "loss": 4.8477, + "step": 21028 + }, + { + "epoch": 0.12506542011609098, + "grad_norm": 1.3503345251083374, + "learning_rate": 4.809520040093415e-05, + "loss": 5.3253, + "step": 21029 + }, + { + "epoch": 0.12507136739937197, + "grad_norm": 1.1925950050354004, + "learning_rate": 4.8095021565405684e-05, + "loss": 5.2129, + "step": 21030 + }, + { + "epoch": 0.12507731468265296, + "grad_norm": 1.433516025543213, + "learning_rate": 4.809484272181499e-05, + "loss": 5.1091, + "step": 21031 + }, + { + "epoch": 0.12508326196593397, + "grad_norm": 1.3334667682647705, + "learning_rate": 4.809466387016213e-05, + "loss": 5.3445, + "step": 21032 + }, + { + "epoch": 0.12508920924921496, + "grad_norm": 1.270871877670288, + "learning_rate": 4.809448501044715e-05, + "loss": 5.1455, + "step": 21033 + }, + { + "epoch": 0.12509515653249595, + "grad_norm": 1.2028634548187256, + "learning_rate": 4.8094306142670145e-05, + "loss": 5.1721, + "step": 21034 + }, + { + "epoch": 0.12510110381577697, + "grad_norm": 1.537757396697998, + "learning_rate": 4.809412726683114e-05, + "loss": 5.1853, + "step": 21035 + }, + { + "epoch": 0.12510705109905795, + "grad_norm": 1.3350294828414917, + "learning_rate": 4.809394838293021e-05, + "loss": 5.0725, + "step": 21036 + }, + { + "epoch": 0.12511299838233894, + "grad_norm": 1.3986246585845947, + "learning_rate": 4.8093769490967434e-05, + "loss": 5.1176, + "step": 21037 + }, + { + "epoch": 0.12511894566561996, + "grad_norm": 1.3993934392929077, + "learning_rate": 4.809359059094285e-05, + "loss": 5.1085, + "step": 21038 + }, + { + "epoch": 0.12512489294890095, + "grad_norm": 1.6875231266021729, + "learning_rate": 4.8093411682856535e-05, + "loss": 5.134, + "step": 21039 + }, + { + "epoch": 0.12513084023218193, + "grad_norm": 1.2966142892837524, + "learning_rate": 4.809323276670855e-05, + "loss": 5.1509, + "step": 21040 + }, + { + "epoch": 0.12513678751546295, + "grad_norm": 1.3994536399841309, + "learning_rate": 4.8093053842498956e-05, + "loss": 4.8962, + "step": 21041 + }, + { + "epoch": 0.12514273479874394, + "grad_norm": 1.3936022520065308, + "learning_rate": 4.809287491022782e-05, + "loss": 4.908, + "step": 21042 + }, + { + "epoch": 0.12514868208202493, + "grad_norm": 1.9262713193893433, + "learning_rate": 4.80926959698952e-05, + "loss": 5.0856, + "step": 21043 + }, + { + "epoch": 0.12515462936530594, + "grad_norm": 1.3765772581100464, + "learning_rate": 4.809251702150115e-05, + "loss": 5.0438, + "step": 21044 + }, + { + "epoch": 0.12516057664858693, + "grad_norm": 1.4509775638580322, + "learning_rate": 4.809233806504575e-05, + "loss": 5.2001, + "step": 21045 + }, + { + "epoch": 0.12516652393186792, + "grad_norm": 1.6581740379333496, + "learning_rate": 4.809215910052904e-05, + "loss": 4.7155, + "step": 21046 + }, + { + "epoch": 0.12517247121514893, + "grad_norm": 1.5386825799942017, + "learning_rate": 4.8091980127951115e-05, + "loss": 4.6354, + "step": 21047 + }, + { + "epoch": 0.12517841849842992, + "grad_norm": 1.3021749258041382, + "learning_rate": 4.8091801147312e-05, + "loss": 5.2241, + "step": 21048 + }, + { + "epoch": 0.1251843657817109, + "grad_norm": 1.3396178483963013, + "learning_rate": 4.809162215861179e-05, + "loss": 5.2361, + "step": 21049 + }, + { + "epoch": 0.1251903130649919, + "grad_norm": 1.381496548652649, + "learning_rate": 4.809144316185052e-05, + "loss": 5.3347, + "step": 21050 + }, + { + "epoch": 0.12519626034827291, + "grad_norm": 1.4430748224258423, + "learning_rate": 4.809126415702828e-05, + "loss": 4.895, + "step": 21051 + }, + { + "epoch": 0.1252022076315539, + "grad_norm": 1.2426742315292358, + "learning_rate": 4.809108514414511e-05, + "loss": 4.9085, + "step": 21052 + }, + { + "epoch": 0.1252081549148349, + "grad_norm": 1.224529504776001, + "learning_rate": 4.8090906123201085e-05, + "loss": 5.1997, + "step": 21053 + }, + { + "epoch": 0.1252141021981159, + "grad_norm": 1.295866847038269, + "learning_rate": 4.809072709419626e-05, + "loss": 5.5419, + "step": 21054 + }, + { + "epoch": 0.1252200494813969, + "grad_norm": 1.7327667474746704, + "learning_rate": 4.80905480571307e-05, + "loss": 5.1902, + "step": 21055 + }, + { + "epoch": 0.12522599676467788, + "grad_norm": 1.4727381467819214, + "learning_rate": 4.809036901200447e-05, + "loss": 4.9909, + "step": 21056 + }, + { + "epoch": 0.1252319440479589, + "grad_norm": 1.5449626445770264, + "learning_rate": 4.8090189958817626e-05, + "loss": 4.8721, + "step": 21057 + }, + { + "epoch": 0.1252378913312399, + "grad_norm": 1.563591718673706, + "learning_rate": 4.809001089757024e-05, + "loss": 5.0417, + "step": 21058 + }, + { + "epoch": 0.12524383861452087, + "grad_norm": 1.3692893981933594, + "learning_rate": 4.808983182826237e-05, + "loss": 4.9748, + "step": 21059 + }, + { + "epoch": 0.1252497858978019, + "grad_norm": 1.3994625806808472, + "learning_rate": 4.8089652750894074e-05, + "loss": 5.1823, + "step": 21060 + }, + { + "epoch": 0.12525573318108288, + "grad_norm": 1.3998682498931885, + "learning_rate": 4.8089473665465425e-05, + "loss": 5.2272, + "step": 21061 + }, + { + "epoch": 0.12526168046436387, + "grad_norm": 1.4436434507369995, + "learning_rate": 4.808929457197647e-05, + "loss": 5.4049, + "step": 21062 + }, + { + "epoch": 0.12526762774764488, + "grad_norm": 1.2826770544052124, + "learning_rate": 4.8089115470427294e-05, + "loss": 5.2065, + "step": 21063 + }, + { + "epoch": 0.12527357503092587, + "grad_norm": 1.4545691013336182, + "learning_rate": 4.808893636081794e-05, + "loss": 5.1212, + "step": 21064 + }, + { + "epoch": 0.12527952231420686, + "grad_norm": 1.70439875125885, + "learning_rate": 4.808875724314847e-05, + "loss": 4.9993, + "step": 21065 + }, + { + "epoch": 0.12528546959748788, + "grad_norm": 1.5612056255340576, + "learning_rate": 4.8088578117418965e-05, + "loss": 5.1109, + "step": 21066 + }, + { + "epoch": 0.12529141688076886, + "grad_norm": 1.3385684490203857, + "learning_rate": 4.808839898362947e-05, + "loss": 5.3485, + "step": 21067 + }, + { + "epoch": 0.12529736416404985, + "grad_norm": 1.4440029859542847, + "learning_rate": 4.808821984178006e-05, + "loss": 5.3289, + "step": 21068 + }, + { + "epoch": 0.12530331144733087, + "grad_norm": 1.4780069589614868, + "learning_rate": 4.808804069187078e-05, + "loss": 5.4379, + "step": 21069 + }, + { + "epoch": 0.12530925873061186, + "grad_norm": 1.4137150049209595, + "learning_rate": 4.808786153390171e-05, + "loss": 5.4666, + "step": 21070 + }, + { + "epoch": 0.12531520601389284, + "grad_norm": 1.3870670795440674, + "learning_rate": 4.80876823678729e-05, + "loss": 5.4342, + "step": 21071 + }, + { + "epoch": 0.12532115329717386, + "grad_norm": 1.3641326427459717, + "learning_rate": 4.808750319378442e-05, + "loss": 5.148, + "step": 21072 + }, + { + "epoch": 0.12532710058045485, + "grad_norm": 1.3099322319030762, + "learning_rate": 4.808732401163634e-05, + "loss": 5.1237, + "step": 21073 + }, + { + "epoch": 0.12533304786373584, + "grad_norm": 1.4198615550994873, + "learning_rate": 4.808714482142871e-05, + "loss": 5.5755, + "step": 21074 + }, + { + "epoch": 0.12533899514701685, + "grad_norm": 1.1760785579681396, + "learning_rate": 4.80869656231616e-05, + "loss": 5.5684, + "step": 21075 + }, + { + "epoch": 0.12534494243029784, + "grad_norm": 1.2611156702041626, + "learning_rate": 4.8086786416835054e-05, + "loss": 5.3834, + "step": 21076 + }, + { + "epoch": 0.12535088971357883, + "grad_norm": 1.085659384727478, + "learning_rate": 4.808660720244916e-05, + "loss": 5.2553, + "step": 21077 + }, + { + "epoch": 0.12535683699685984, + "grad_norm": 1.2537906169891357, + "learning_rate": 4.808642798000397e-05, + "loss": 5.3423, + "step": 21078 + }, + { + "epoch": 0.12536278428014083, + "grad_norm": 1.0891891717910767, + "learning_rate": 4.808624874949954e-05, + "loss": 5.4889, + "step": 21079 + }, + { + "epoch": 0.12536873156342182, + "grad_norm": 1.976110577583313, + "learning_rate": 4.808606951093595e-05, + "loss": 5.6103, + "step": 21080 + }, + { + "epoch": 0.12537467884670284, + "grad_norm": 1.3253698348999023, + "learning_rate": 4.808589026431324e-05, + "loss": 5.4673, + "step": 21081 + }, + { + "epoch": 0.12538062612998382, + "grad_norm": 1.4394372701644897, + "learning_rate": 4.808571100963149e-05, + "loss": 5.5256, + "step": 21082 + }, + { + "epoch": 0.1253865734132648, + "grad_norm": 1.45836341381073, + "learning_rate": 4.808553174689076e-05, + "loss": 4.5206, + "step": 21083 + }, + { + "epoch": 0.12539252069654583, + "grad_norm": 1.5719448328018188, + "learning_rate": 4.8085352476091105e-05, + "loss": 4.0577, + "step": 21084 + }, + { + "epoch": 0.12539846797982682, + "grad_norm": 1.3744319677352905, + "learning_rate": 4.808517319723259e-05, + "loss": 4.3965, + "step": 21085 + }, + { + "epoch": 0.1254044152631078, + "grad_norm": 1.4404634237289429, + "learning_rate": 4.8084993910315286e-05, + "loss": 4.3534, + "step": 21086 + }, + { + "epoch": 0.12541036254638882, + "grad_norm": 1.696215033531189, + "learning_rate": 4.8084814615339244e-05, + "loss": 5.4743, + "step": 21087 + }, + { + "epoch": 0.1254163098296698, + "grad_norm": 2.3401246070861816, + "learning_rate": 4.808463531230454e-05, + "loss": 4.3249, + "step": 21088 + }, + { + "epoch": 0.1254222571129508, + "grad_norm": 2.673963785171509, + "learning_rate": 4.808445600121122e-05, + "loss": 4.0038, + "step": 21089 + }, + { + "epoch": 0.1254282043962318, + "grad_norm": 2.551712989807129, + "learning_rate": 4.808427668205935e-05, + "loss": 4.0593, + "step": 21090 + }, + { + "epoch": 0.1254341516795128, + "grad_norm": 2.224776029586792, + "learning_rate": 4.8084097354849004e-05, + "loss": 4.4923, + "step": 21091 + }, + { + "epoch": 0.1254400989627938, + "grad_norm": 2.8964626789093018, + "learning_rate": 4.808391801958024e-05, + "loss": 4.8955, + "step": 21092 + }, + { + "epoch": 0.1254460462460748, + "grad_norm": 2.647202491760254, + "learning_rate": 4.808373867625312e-05, + "loss": 4.315, + "step": 21093 + }, + { + "epoch": 0.1254519935293558, + "grad_norm": 2.852851152420044, + "learning_rate": 4.80835593248677e-05, + "loss": 4.6153, + "step": 21094 + }, + { + "epoch": 0.12545794081263678, + "grad_norm": 1.5732487440109253, + "learning_rate": 4.808337996542405e-05, + "loss": 5.7685, + "step": 21095 + }, + { + "epoch": 0.1254638880959178, + "grad_norm": 1.764635682106018, + "learning_rate": 4.808320059792223e-05, + "loss": 5.8056, + "step": 21096 + }, + { + "epoch": 0.12546983537919879, + "grad_norm": 3.040402889251709, + "learning_rate": 4.80830212223623e-05, + "loss": 4.3029, + "step": 21097 + }, + { + "epoch": 0.12547578266247977, + "grad_norm": 2.3675732612609863, + "learning_rate": 4.8082841838744335e-05, + "loss": 4.2356, + "step": 21098 + }, + { + "epoch": 0.1254817299457608, + "grad_norm": 2.153254747390747, + "learning_rate": 4.808266244706838e-05, + "loss": 4.1071, + "step": 21099 + }, + { + "epoch": 0.12548767722904178, + "grad_norm": 2.181788921356201, + "learning_rate": 4.808248304733451e-05, + "loss": 4.1941, + "step": 21100 + }, + { + "epoch": 0.12549362451232277, + "grad_norm": 2.416555881500244, + "learning_rate": 4.808230363954278e-05, + "loss": 4.0926, + "step": 21101 + }, + { + "epoch": 0.12549957179560378, + "grad_norm": 1.7010666131973267, + "learning_rate": 4.808212422369327e-05, + "loss": 5.3639, + "step": 21102 + }, + { + "epoch": 0.12550551907888477, + "grad_norm": 1.4592742919921875, + "learning_rate": 4.808194479978601e-05, + "loss": 5.5641, + "step": 21103 + }, + { + "epoch": 0.12551146636216576, + "grad_norm": 1.5593754053115845, + "learning_rate": 4.808176536782109e-05, + "loss": 5.4008, + "step": 21104 + }, + { + "epoch": 0.12551741364544677, + "grad_norm": 1.7061179876327515, + "learning_rate": 4.8081585927798565e-05, + "loss": 5.6922, + "step": 21105 + }, + { + "epoch": 0.12552336092872776, + "grad_norm": 1.8220082521438599, + "learning_rate": 4.808140647971849e-05, + "loss": 5.4052, + "step": 21106 + }, + { + "epoch": 0.12552930821200875, + "grad_norm": 1.5218451023101807, + "learning_rate": 4.808122702358095e-05, + "loss": 5.4067, + "step": 21107 + }, + { + "epoch": 0.12553525549528974, + "grad_norm": 1.6590322256088257, + "learning_rate": 4.808104755938598e-05, + "loss": 5.5558, + "step": 21108 + }, + { + "epoch": 0.12554120277857075, + "grad_norm": 1.751290202140808, + "learning_rate": 4.808086808713366e-05, + "loss": 5.5584, + "step": 21109 + }, + { + "epoch": 0.12554715006185174, + "grad_norm": 1.6635403633117676, + "learning_rate": 4.8080688606824035e-05, + "loss": 5.4828, + "step": 21110 + }, + { + "epoch": 0.12555309734513273, + "grad_norm": 1.4710462093353271, + "learning_rate": 4.80805091184572e-05, + "loss": 5.4251, + "step": 21111 + }, + { + "epoch": 0.12555904462841375, + "grad_norm": 1.7598154544830322, + "learning_rate": 4.808032962203318e-05, + "loss": 5.5093, + "step": 21112 + }, + { + "epoch": 0.12556499191169473, + "grad_norm": 1.5128235816955566, + "learning_rate": 4.8080150117552057e-05, + "loss": 5.5069, + "step": 21113 + }, + { + "epoch": 0.12557093919497572, + "grad_norm": 1.5336002111434937, + "learning_rate": 4.80799706050139e-05, + "loss": 5.461, + "step": 21114 + }, + { + "epoch": 0.12557688647825674, + "grad_norm": 1.80903160572052, + "learning_rate": 4.807979108441876e-05, + "loss": 5.5894, + "step": 21115 + }, + { + "epoch": 0.12558283376153773, + "grad_norm": 1.8075919151306152, + "learning_rate": 4.8079611555766706e-05, + "loss": 5.4132, + "step": 21116 + }, + { + "epoch": 0.12558878104481871, + "grad_norm": 1.8319743871688843, + "learning_rate": 4.8079432019057794e-05, + "loss": 5.4409, + "step": 21117 + }, + { + "epoch": 0.12559472832809973, + "grad_norm": 1.7753643989562988, + "learning_rate": 4.8079252474292095e-05, + "loss": 5.425, + "step": 21118 + }, + { + "epoch": 0.12560067561138072, + "grad_norm": 1.614693522453308, + "learning_rate": 4.807907292146967e-05, + "loss": 5.2583, + "step": 21119 + }, + { + "epoch": 0.1256066228946617, + "grad_norm": 1.7520705461502075, + "learning_rate": 4.807889336059057e-05, + "loss": 5.5297, + "step": 21120 + }, + { + "epoch": 0.12561257017794272, + "grad_norm": 1.478826642036438, + "learning_rate": 4.8078713791654875e-05, + "loss": 5.8051, + "step": 21121 + }, + { + "epoch": 0.1256185174612237, + "grad_norm": 1.5645164251327515, + "learning_rate": 4.807853421466263e-05, + "loss": 5.6658, + "step": 21122 + }, + { + "epoch": 0.1256244647445047, + "grad_norm": 1.6254135370254517, + "learning_rate": 4.807835462961392e-05, + "loss": 5.2885, + "step": 21123 + }, + { + "epoch": 0.12563041202778572, + "grad_norm": 1.4290140867233276, + "learning_rate": 4.807817503650879e-05, + "loss": 5.6284, + "step": 21124 + }, + { + "epoch": 0.1256363593110667, + "grad_norm": 1.541447401046753, + "learning_rate": 4.8077995435347304e-05, + "loss": 5.8538, + "step": 21125 + }, + { + "epoch": 0.1256423065943477, + "grad_norm": 1.4778785705566406, + "learning_rate": 4.8077815826129526e-05, + "loss": 5.7019, + "step": 21126 + }, + { + "epoch": 0.1256482538776287, + "grad_norm": 1.5369840860366821, + "learning_rate": 4.807763620885552e-05, + "loss": 5.7164, + "step": 21127 + }, + { + "epoch": 0.1256542011609097, + "grad_norm": 1.5266817808151245, + "learning_rate": 4.807745658352536e-05, + "loss": 5.6203, + "step": 21128 + }, + { + "epoch": 0.12566014844419068, + "grad_norm": 1.4452829360961914, + "learning_rate": 4.8077276950139085e-05, + "loss": 5.7994, + "step": 21129 + }, + { + "epoch": 0.1256660957274717, + "grad_norm": 1.3619974851608276, + "learning_rate": 4.8077097308696786e-05, + "loss": 5.6703, + "step": 21130 + }, + { + "epoch": 0.1256720430107527, + "grad_norm": 1.1146374940872192, + "learning_rate": 4.80769176591985e-05, + "loss": 5.6631, + "step": 21131 + }, + { + "epoch": 0.12567799029403368, + "grad_norm": 1.2224622964859009, + "learning_rate": 4.8076738001644305e-05, + "loss": 5.5511, + "step": 21132 + }, + { + "epoch": 0.1256839375773147, + "grad_norm": 1.530564308166504, + "learning_rate": 4.807655833603426e-05, + "loss": 5.6201, + "step": 21133 + }, + { + "epoch": 0.12568988486059568, + "grad_norm": 1.5123308897018433, + "learning_rate": 4.807637866236842e-05, + "loss": 5.3411, + "step": 21134 + }, + { + "epoch": 0.12569583214387667, + "grad_norm": 1.4682310819625854, + "learning_rate": 4.807619898064686e-05, + "loss": 5.7009, + "step": 21135 + }, + { + "epoch": 0.12570177942715768, + "grad_norm": 1.7714731693267822, + "learning_rate": 4.8076019290869634e-05, + "loss": 5.8286, + "step": 21136 + }, + { + "epoch": 0.12570772671043867, + "grad_norm": 1.6663479804992676, + "learning_rate": 4.8075839593036814e-05, + "loss": 5.8158, + "step": 21137 + }, + { + "epoch": 0.12571367399371966, + "grad_norm": 1.458070158958435, + "learning_rate": 4.8075659887148454e-05, + "loss": 5.6954, + "step": 21138 + }, + { + "epoch": 0.12571962127700068, + "grad_norm": 2.572174072265625, + "learning_rate": 4.807548017320462e-05, + "loss": 4.715, + "step": 21139 + }, + { + "epoch": 0.12572556856028166, + "grad_norm": 2.4615628719329834, + "learning_rate": 4.8075300451205375e-05, + "loss": 4.8458, + "step": 21140 + }, + { + "epoch": 0.12573151584356265, + "grad_norm": 2.193739175796509, + "learning_rate": 4.807512072115078e-05, + "loss": 4.8746, + "step": 21141 + }, + { + "epoch": 0.12573746312684367, + "grad_norm": 1.9279803037643433, + "learning_rate": 4.80749409830409e-05, + "loss": 5.3174, + "step": 21142 + }, + { + "epoch": 0.12574341041012466, + "grad_norm": 2.0332345962524414, + "learning_rate": 4.807476123687579e-05, + "loss": 4.6696, + "step": 21143 + }, + { + "epoch": 0.12574935769340564, + "grad_norm": 2.1900224685668945, + "learning_rate": 4.8074581482655525e-05, + "loss": 4.7911, + "step": 21144 + }, + { + "epoch": 0.12575530497668666, + "grad_norm": 2.1232707500457764, + "learning_rate": 4.807440172038016e-05, + "loss": 4.4891, + "step": 21145 + }, + { + "epoch": 0.12576125225996765, + "grad_norm": 2.2046613693237305, + "learning_rate": 4.807422195004976e-05, + "loss": 5.1136, + "step": 21146 + }, + { + "epoch": 0.12576719954324864, + "grad_norm": 1.9693876504898071, + "learning_rate": 4.807404217166439e-05, + "loss": 5.7068, + "step": 21147 + }, + { + "epoch": 0.12577314682652965, + "grad_norm": 1.8561034202575684, + "learning_rate": 4.807386238522411e-05, + "loss": 5.6435, + "step": 21148 + }, + { + "epoch": 0.12577909410981064, + "grad_norm": 1.7676606178283691, + "learning_rate": 4.8073682590728974e-05, + "loss": 5.0934, + "step": 21149 + }, + { + "epoch": 0.12578504139309163, + "grad_norm": 1.729425311088562, + "learning_rate": 4.8073502788179064e-05, + "loss": 5.4891, + "step": 21150 + }, + { + "epoch": 0.12579098867637264, + "grad_norm": 1.5410076379776, + "learning_rate": 4.807332297757443e-05, + "loss": 5.919, + "step": 21151 + }, + { + "epoch": 0.12579693595965363, + "grad_norm": 1.5089081525802612, + "learning_rate": 4.8073143158915134e-05, + "loss": 5.9701, + "step": 21152 + }, + { + "epoch": 0.12580288324293462, + "grad_norm": 1.476559042930603, + "learning_rate": 4.807296333220125e-05, + "loss": 5.7351, + "step": 21153 + }, + { + "epoch": 0.12580883052621564, + "grad_norm": 2.055143117904663, + "learning_rate": 4.807278349743283e-05, + "loss": 5.4949, + "step": 21154 + }, + { + "epoch": 0.12581477780949663, + "grad_norm": 1.5232601165771484, + "learning_rate": 4.807260365460994e-05, + "loss": 5.3052, + "step": 21155 + }, + { + "epoch": 0.1258207250927776, + "grad_norm": 1.832310676574707, + "learning_rate": 4.807242380373264e-05, + "loss": 5.2832, + "step": 21156 + }, + { + "epoch": 0.12582667237605863, + "grad_norm": 1.8327937126159668, + "learning_rate": 4.807224394480099e-05, + "loss": 5.482, + "step": 21157 + }, + { + "epoch": 0.12583261965933962, + "grad_norm": 1.7728074789047241, + "learning_rate": 4.8072064077815065e-05, + "loss": 5.2636, + "step": 21158 + }, + { + "epoch": 0.1258385669426206, + "grad_norm": 1.6927982568740845, + "learning_rate": 4.8071884202774916e-05, + "loss": 5.369, + "step": 21159 + }, + { + "epoch": 0.12584451422590162, + "grad_norm": 1.8296928405761719, + "learning_rate": 4.8071704319680616e-05, + "loss": 5.4939, + "step": 21160 + }, + { + "epoch": 0.1258504615091826, + "grad_norm": 1.5497393608093262, + "learning_rate": 4.8071524428532224e-05, + "loss": 5.1909, + "step": 21161 + }, + { + "epoch": 0.1258564087924636, + "grad_norm": 1.8332972526550293, + "learning_rate": 4.807134452932979e-05, + "loss": 5.1555, + "step": 21162 + }, + { + "epoch": 0.1258623560757446, + "grad_norm": 1.856772780418396, + "learning_rate": 4.80711646220734e-05, + "loss": 5.1182, + "step": 21163 + }, + { + "epoch": 0.1258683033590256, + "grad_norm": 1.6313568353652954, + "learning_rate": 4.80709847067631e-05, + "loss": 5.0921, + "step": 21164 + }, + { + "epoch": 0.1258742506423066, + "grad_norm": 1.6753991842269897, + "learning_rate": 4.807080478339896e-05, + "loss": 5.1176, + "step": 21165 + }, + { + "epoch": 0.12588019792558758, + "grad_norm": 1.554154396057129, + "learning_rate": 4.807062485198104e-05, + "loss": 5.0849, + "step": 21166 + }, + { + "epoch": 0.1258861452088686, + "grad_norm": 1.9408693313598633, + "learning_rate": 4.8070444912509394e-05, + "loss": 4.9181, + "step": 21167 + }, + { + "epoch": 0.12589209249214958, + "grad_norm": 1.7222824096679688, + "learning_rate": 4.80702649649841e-05, + "loss": 5.6235, + "step": 21168 + }, + { + "epoch": 0.12589803977543057, + "grad_norm": 1.8301146030426025, + "learning_rate": 4.807008500940522e-05, + "loss": 5.3885, + "step": 21169 + }, + { + "epoch": 0.1259039870587116, + "grad_norm": 1.7527635097503662, + "learning_rate": 4.806990504577281e-05, + "loss": 5.3772, + "step": 21170 + }, + { + "epoch": 0.12590993434199257, + "grad_norm": 1.7983075380325317, + "learning_rate": 4.806972507408693e-05, + "loss": 5.7616, + "step": 21171 + }, + { + "epoch": 0.12591588162527356, + "grad_norm": 1.6842983961105347, + "learning_rate": 4.8069545094347653e-05, + "loss": 5.8808, + "step": 21172 + }, + { + "epoch": 0.12592182890855458, + "grad_norm": 1.8382412195205688, + "learning_rate": 4.806936510655503e-05, + "loss": 5.4304, + "step": 21173 + }, + { + "epoch": 0.12592777619183557, + "grad_norm": 1.833301305770874, + "learning_rate": 4.8069185110709133e-05, + "loss": 5.4221, + "step": 21174 + }, + { + "epoch": 0.12593372347511655, + "grad_norm": 1.52051842212677, + "learning_rate": 4.8069005106810025e-05, + "loss": 5.4133, + "step": 21175 + }, + { + "epoch": 0.12593967075839757, + "grad_norm": 1.5269474983215332, + "learning_rate": 4.806882509485776e-05, + "loss": 5.5549, + "step": 21176 + }, + { + "epoch": 0.12594561804167856, + "grad_norm": 1.8116832971572876, + "learning_rate": 4.806864507485241e-05, + "loss": 5.2989, + "step": 21177 + }, + { + "epoch": 0.12595156532495955, + "grad_norm": 1.7355883121490479, + "learning_rate": 4.806846504679403e-05, + "loss": 5.3839, + "step": 21178 + }, + { + "epoch": 0.12595751260824056, + "grad_norm": 1.7445424795150757, + "learning_rate": 4.806828501068269e-05, + "loss": 4.982, + "step": 21179 + }, + { + "epoch": 0.12596345989152155, + "grad_norm": 2.445030689239502, + "learning_rate": 4.806810496651845e-05, + "loss": 4.2665, + "step": 21180 + }, + { + "epoch": 0.12596940717480254, + "grad_norm": 2.6840837001800537, + "learning_rate": 4.8067924914301377e-05, + "loss": 3.9739, + "step": 21181 + }, + { + "epoch": 0.12597535445808355, + "grad_norm": 2.431506872177124, + "learning_rate": 4.806774485403153e-05, + "loss": 3.9235, + "step": 21182 + }, + { + "epoch": 0.12598130174136454, + "grad_norm": 3.124319076538086, + "learning_rate": 4.806756478570896e-05, + "loss": 3.7692, + "step": 21183 + }, + { + "epoch": 0.12598724902464553, + "grad_norm": 2.8702549934387207, + "learning_rate": 4.806738470933375e-05, + "loss": 3.6848, + "step": 21184 + }, + { + "epoch": 0.12599319630792655, + "grad_norm": 2.6687517166137695, + "learning_rate": 4.8067204624905954e-05, + "loss": 3.5655, + "step": 21185 + }, + { + "epoch": 0.12599914359120754, + "grad_norm": 2.3944084644317627, + "learning_rate": 4.806702453242563e-05, + "loss": 3.6176, + "step": 21186 + }, + { + "epoch": 0.12600509087448852, + "grad_norm": 2.565718173980713, + "learning_rate": 4.8066844431892856e-05, + "loss": 3.6557, + "step": 21187 + }, + { + "epoch": 0.12601103815776954, + "grad_norm": 2.9165117740631104, + "learning_rate": 4.806666432330768e-05, + "loss": 3.4013, + "step": 21188 + }, + { + "epoch": 0.12601698544105053, + "grad_norm": 3.232210397720337, + "learning_rate": 4.806648420667017e-05, + "loss": 4.8954, + "step": 21189 + }, + { + "epoch": 0.12602293272433152, + "grad_norm": 3.2784297466278076, + "learning_rate": 4.8066304081980384e-05, + "loss": 4.7801, + "step": 21190 + }, + { + "epoch": 0.12602888000761253, + "grad_norm": 2.8707523345947266, + "learning_rate": 4.8066123949238396e-05, + "loss": 4.7461, + "step": 21191 + }, + { + "epoch": 0.12603482729089352, + "grad_norm": 2.3808538913726807, + "learning_rate": 4.8065943808444255e-05, + "loss": 4.5148, + "step": 21192 + }, + { + "epoch": 0.1260407745741745, + "grad_norm": 2.2710814476013184, + "learning_rate": 4.806576365959804e-05, + "loss": 4.522, + "step": 21193 + }, + { + "epoch": 0.12604672185745552, + "grad_norm": 2.2108187675476074, + "learning_rate": 4.80655835026998e-05, + "loss": 4.7575, + "step": 21194 + }, + { + "epoch": 0.1260526691407365, + "grad_norm": 2.1496641635894775, + "learning_rate": 4.80654033377496e-05, + "loss": 4.6543, + "step": 21195 + }, + { + "epoch": 0.1260586164240175, + "grad_norm": 1.9770373106002808, + "learning_rate": 4.806522316474752e-05, + "loss": 4.59, + "step": 21196 + }, + { + "epoch": 0.12606456370729852, + "grad_norm": 1.8799597024917603, + "learning_rate": 4.80650429836936e-05, + "loss": 4.598, + "step": 21197 + }, + { + "epoch": 0.1260705109905795, + "grad_norm": 1.846724510192871, + "learning_rate": 4.8064862794587903e-05, + "loss": 4.4912, + "step": 21198 + }, + { + "epoch": 0.1260764582738605, + "grad_norm": 1.7821966409683228, + "learning_rate": 4.806468259743051e-05, + "loss": 4.4898, + "step": 21199 + }, + { + "epoch": 0.1260824055571415, + "grad_norm": 1.7804360389709473, + "learning_rate": 4.806450239222148e-05, + "loss": 4.5324, + "step": 21200 + }, + { + "epoch": 0.1260883528404225, + "grad_norm": 1.705761194229126, + "learning_rate": 4.8064322178960864e-05, + "loss": 4.7046, + "step": 21201 + }, + { + "epoch": 0.12609430012370348, + "grad_norm": 2.41103458404541, + "learning_rate": 4.8064141957648726e-05, + "loss": 5.1943, + "step": 21202 + }, + { + "epoch": 0.1261002474069845, + "grad_norm": 2.3028182983398438, + "learning_rate": 4.806396172828515e-05, + "loss": 5.0494, + "step": 21203 + }, + { + "epoch": 0.1261061946902655, + "grad_norm": 2.1674535274505615, + "learning_rate": 4.806378149087016e-05, + "loss": 5.3104, + "step": 21204 + }, + { + "epoch": 0.12611214197354648, + "grad_norm": 1.9217156171798706, + "learning_rate": 4.8063601245403864e-05, + "loss": 5.2403, + "step": 21205 + }, + { + "epoch": 0.1261180892568275, + "grad_norm": 2.097116231918335, + "learning_rate": 4.806342099188629e-05, + "loss": 5.3471, + "step": 21206 + }, + { + "epoch": 0.12612403654010848, + "grad_norm": 1.8356170654296875, + "learning_rate": 4.806324073031751e-05, + "loss": 5.2168, + "step": 21207 + }, + { + "epoch": 0.12612998382338947, + "grad_norm": 2.2306652069091797, + "learning_rate": 4.806306046069761e-05, + "loss": 5.1406, + "step": 21208 + }, + { + "epoch": 0.12613593110667048, + "grad_norm": 1.8946762084960938, + "learning_rate": 4.8062880183026624e-05, + "loss": 5.072, + "step": 21209 + }, + { + "epoch": 0.12614187838995147, + "grad_norm": 2.0963854789733887, + "learning_rate": 4.806269989730462e-05, + "loss": 5.2702, + "step": 21210 + }, + { + "epoch": 0.12614782567323246, + "grad_norm": 1.859677791595459, + "learning_rate": 4.806251960353167e-05, + "loss": 5.1133, + "step": 21211 + }, + { + "epoch": 0.12615377295651348, + "grad_norm": 1.9993607997894287, + "learning_rate": 4.806233930170783e-05, + "loss": 5.1201, + "step": 21212 + }, + { + "epoch": 0.12615972023979447, + "grad_norm": 1.7218701839447021, + "learning_rate": 4.8062158991833176e-05, + "loss": 5.0055, + "step": 21213 + }, + { + "epoch": 0.12616566752307545, + "grad_norm": 1.9172027111053467, + "learning_rate": 4.806197867390775e-05, + "loss": 4.955, + "step": 21214 + }, + { + "epoch": 0.12617161480635647, + "grad_norm": 2.0665276050567627, + "learning_rate": 4.8061798347931627e-05, + "loss": 4.842, + "step": 21215 + }, + { + "epoch": 0.12617756208963746, + "grad_norm": 1.932822346687317, + "learning_rate": 4.806161801390486e-05, + "loss": 4.5687, + "step": 21216 + }, + { + "epoch": 0.12618350937291845, + "grad_norm": 1.7978770732879639, + "learning_rate": 4.806143767182754e-05, + "loss": 4.6994, + "step": 21217 + }, + { + "epoch": 0.12618945665619946, + "grad_norm": 1.9298393726348877, + "learning_rate": 4.80612573216997e-05, + "loss": 4.8935, + "step": 21218 + }, + { + "epoch": 0.12619540393948045, + "grad_norm": 1.8706467151641846, + "learning_rate": 4.806107696352141e-05, + "loss": 4.699, + "step": 21219 + }, + { + "epoch": 0.12620135122276144, + "grad_norm": 1.946582317352295, + "learning_rate": 4.806089659729274e-05, + "loss": 4.9519, + "step": 21220 + }, + { + "epoch": 0.12620729850604245, + "grad_norm": 2.1021311283111572, + "learning_rate": 4.806071622301375e-05, + "loss": 4.8315, + "step": 21221 + }, + { + "epoch": 0.12621324578932344, + "grad_norm": 2.110234022140503, + "learning_rate": 4.8060535840684504e-05, + "loss": 4.6524, + "step": 21222 + }, + { + "epoch": 0.12621919307260443, + "grad_norm": 2.1723785400390625, + "learning_rate": 4.806035545030506e-05, + "loss": 4.7154, + "step": 21223 + }, + { + "epoch": 0.12622514035588542, + "grad_norm": 1.8978101015090942, + "learning_rate": 4.806017505187548e-05, + "loss": 4.6743, + "step": 21224 + }, + { + "epoch": 0.12623108763916643, + "grad_norm": 2.0092225074768066, + "learning_rate": 4.8059994645395833e-05, + "loss": 4.9198, + "step": 21225 + }, + { + "epoch": 0.12623703492244742, + "grad_norm": 1.935624122619629, + "learning_rate": 4.8059814230866184e-05, + "loss": 4.7253, + "step": 21226 + }, + { + "epoch": 0.1262429822057284, + "grad_norm": 1.9758509397506714, + "learning_rate": 4.80596338082866e-05, + "loss": 4.6388, + "step": 21227 + }, + { + "epoch": 0.12624892948900943, + "grad_norm": 2.0389976501464844, + "learning_rate": 4.805945337765712e-05, + "loss": 4.7527, + "step": 21228 + }, + { + "epoch": 0.12625487677229041, + "grad_norm": 2.0781445503234863, + "learning_rate": 4.805927293897783e-05, + "loss": 4.7985, + "step": 21229 + }, + { + "epoch": 0.1262608240555714, + "grad_norm": 2.0403099060058594, + "learning_rate": 4.8059092492248786e-05, + "loss": 5.1442, + "step": 21230 + }, + { + "epoch": 0.12626677133885242, + "grad_norm": 2.141681432723999, + "learning_rate": 4.805891203747005e-05, + "loss": 5.1191, + "step": 21231 + }, + { + "epoch": 0.1262727186221334, + "grad_norm": 2.159761905670166, + "learning_rate": 4.805873157464169e-05, + "loss": 5.2995, + "step": 21232 + }, + { + "epoch": 0.1262786659054144, + "grad_norm": 2.568081855773926, + "learning_rate": 4.805855110376376e-05, + "loss": 5.4263, + "step": 21233 + }, + { + "epoch": 0.1262846131886954, + "grad_norm": 1.8911200761795044, + "learning_rate": 4.8058370624836336e-05, + "loss": 5.3457, + "step": 21234 + }, + { + "epoch": 0.1262905604719764, + "grad_norm": 2.3370580673217773, + "learning_rate": 4.805819013785946e-05, + "loss": 4.8342, + "step": 21235 + }, + { + "epoch": 0.1262965077552574, + "grad_norm": 2.669029474258423, + "learning_rate": 4.805800964283322e-05, + "loss": 4.9175, + "step": 21236 + }, + { + "epoch": 0.1263024550385384, + "grad_norm": 1.9824459552764893, + "learning_rate": 4.8057829139757657e-05, + "loss": 4.6509, + "step": 21237 + }, + { + "epoch": 0.1263084023218194, + "grad_norm": 1.9576833248138428, + "learning_rate": 4.805764862863286e-05, + "loss": 5.4197, + "step": 21238 + }, + { + "epoch": 0.12631434960510038, + "grad_norm": 1.9594717025756836, + "learning_rate": 4.805746810945886e-05, + "loss": 5.7506, + "step": 21239 + }, + { + "epoch": 0.1263202968883814, + "grad_norm": 2.063676357269287, + "learning_rate": 4.8057287582235746e-05, + "loss": 5.6675, + "step": 21240 + }, + { + "epoch": 0.12632624417166238, + "grad_norm": 1.9354885816574097, + "learning_rate": 4.805710704696356e-05, + "loss": 5.1697, + "step": 21241 + }, + { + "epoch": 0.12633219145494337, + "grad_norm": 1.9859137535095215, + "learning_rate": 4.8056926503642384e-05, + "loss": 4.9055, + "step": 21242 + }, + { + "epoch": 0.1263381387382244, + "grad_norm": 2.1015024185180664, + "learning_rate": 4.805674595227228e-05, + "loss": 4.4961, + "step": 21243 + }, + { + "epoch": 0.12634408602150538, + "grad_norm": 2.225673198699951, + "learning_rate": 4.805656539285329e-05, + "loss": 4.2943, + "step": 21244 + }, + { + "epoch": 0.12635003330478636, + "grad_norm": 1.9753731489181519, + "learning_rate": 4.8056384825385495e-05, + "loss": 4.401, + "step": 21245 + }, + { + "epoch": 0.12635598058806738, + "grad_norm": 1.693865180015564, + "learning_rate": 4.805620424986896e-05, + "loss": 4.2992, + "step": 21246 + }, + { + "epoch": 0.12636192787134837, + "grad_norm": 2.0757269859313965, + "learning_rate": 4.805602366630374e-05, + "loss": 4.4564, + "step": 21247 + }, + { + "epoch": 0.12636787515462936, + "grad_norm": 1.559611201286316, + "learning_rate": 4.80558430746899e-05, + "loss": 5.95, + "step": 21248 + }, + { + "epoch": 0.12637382243791037, + "grad_norm": 1.7863824367523193, + "learning_rate": 4.80556624750275e-05, + "loss": 5.2208, + "step": 21249 + }, + { + "epoch": 0.12637976972119136, + "grad_norm": 1.7766302824020386, + "learning_rate": 4.805548186731661e-05, + "loss": 4.9666, + "step": 21250 + }, + { + "epoch": 0.12638571700447235, + "grad_norm": 1.5633225440979004, + "learning_rate": 4.805530125155728e-05, + "loss": 4.7051, + "step": 21251 + }, + { + "epoch": 0.12639166428775336, + "grad_norm": 1.795332431793213, + "learning_rate": 4.80551206277496e-05, + "loss": 4.624, + "step": 21252 + }, + { + "epoch": 0.12639761157103435, + "grad_norm": 2.2065796852111816, + "learning_rate": 4.805493999589361e-05, + "loss": 4.2034, + "step": 21253 + }, + { + "epoch": 0.12640355885431534, + "grad_norm": 2.0833165645599365, + "learning_rate": 4.805475935598937e-05, + "loss": 4.3267, + "step": 21254 + }, + { + "epoch": 0.12640950613759636, + "grad_norm": 2.591543436050415, + "learning_rate": 4.8054578708036954e-05, + "loss": 4.5015, + "step": 21255 + }, + { + "epoch": 0.12641545342087734, + "grad_norm": 1.7929967641830444, + "learning_rate": 4.805439805203643e-05, + "loss": 5.1193, + "step": 21256 + }, + { + "epoch": 0.12642140070415833, + "grad_norm": 1.632691740989685, + "learning_rate": 4.805421738798785e-05, + "loss": 4.728, + "step": 21257 + }, + { + "epoch": 0.12642734798743935, + "grad_norm": 1.844673752784729, + "learning_rate": 4.8054036715891284e-05, + "loss": 4.8617, + "step": 21258 + }, + { + "epoch": 0.12643329527072034, + "grad_norm": 1.7764726877212524, + "learning_rate": 4.805385603574678e-05, + "loss": 5.0102, + "step": 21259 + }, + { + "epoch": 0.12643924255400132, + "grad_norm": 1.7257095575332642, + "learning_rate": 4.8053675347554425e-05, + "loss": 5.4136, + "step": 21260 + }, + { + "epoch": 0.12644518983728234, + "grad_norm": 1.9378974437713623, + "learning_rate": 4.805349465131427e-05, + "loss": 4.8102, + "step": 21261 + }, + { + "epoch": 0.12645113712056333, + "grad_norm": 2.1207330226898193, + "learning_rate": 4.805331394702637e-05, + "loss": 5.137, + "step": 21262 + }, + { + "epoch": 0.12645708440384432, + "grad_norm": 2.630957841873169, + "learning_rate": 4.8053133234690806e-05, + "loss": 3.9948, + "step": 21263 + }, + { + "epoch": 0.12646303168712533, + "grad_norm": 2.5051863193511963, + "learning_rate": 4.805295251430762e-05, + "loss": 3.7358, + "step": 21264 + }, + { + "epoch": 0.12646897897040632, + "grad_norm": 2.4558019638061523, + "learning_rate": 4.805277178587689e-05, + "loss": 4.1314, + "step": 21265 + }, + { + "epoch": 0.1264749262536873, + "grad_norm": 2.1878461837768555, + "learning_rate": 4.805259104939869e-05, + "loss": 5.1189, + "step": 21266 + }, + { + "epoch": 0.12648087353696832, + "grad_norm": 2.303126811981201, + "learning_rate": 4.805241030487305e-05, + "loss": 4.4202, + "step": 21267 + }, + { + "epoch": 0.1264868208202493, + "grad_norm": 2.4533417224884033, + "learning_rate": 4.805222955230006e-05, + "loss": 4.4752, + "step": 21268 + }, + { + "epoch": 0.1264927681035303, + "grad_norm": 2.4850356578826904, + "learning_rate": 4.805204879167977e-05, + "loss": 4.0938, + "step": 21269 + }, + { + "epoch": 0.12649871538681132, + "grad_norm": 2.622119665145874, + "learning_rate": 4.805186802301226e-05, + "loss": 3.5693, + "step": 21270 + }, + { + "epoch": 0.1265046626700923, + "grad_norm": 2.5546908378601074, + "learning_rate": 4.8051687246297574e-05, + "loss": 4.1895, + "step": 21271 + }, + { + "epoch": 0.1265106099533733, + "grad_norm": 2.6318092346191406, + "learning_rate": 4.805150646153578e-05, + "loss": 4.5214, + "step": 21272 + }, + { + "epoch": 0.1265165572366543, + "grad_norm": 2.380413770675659, + "learning_rate": 4.805132566872694e-05, + "loss": 4.601, + "step": 21273 + }, + { + "epoch": 0.1265225045199353, + "grad_norm": 2.652449369430542, + "learning_rate": 4.805114486787112e-05, + "loss": 4.7164, + "step": 21274 + }, + { + "epoch": 0.12652845180321629, + "grad_norm": 2.6453335285186768, + "learning_rate": 4.8050964058968394e-05, + "loss": 4.8007, + "step": 21275 + }, + { + "epoch": 0.1265343990864973, + "grad_norm": 2.226515054702759, + "learning_rate": 4.8050783242018805e-05, + "loss": 4.7653, + "step": 21276 + }, + { + "epoch": 0.1265403463697783, + "grad_norm": 2.678157091140747, + "learning_rate": 4.805060241702243e-05, + "loss": 4.8511, + "step": 21277 + }, + { + "epoch": 0.12654629365305928, + "grad_norm": 2.2161943912506104, + "learning_rate": 4.8050421583979324e-05, + "loss": 4.6734, + "step": 21278 + }, + { + "epoch": 0.1265522409363403, + "grad_norm": 2.242539882659912, + "learning_rate": 4.805024074288956e-05, + "loss": 4.5445, + "step": 21279 + }, + { + "epoch": 0.12655818821962128, + "grad_norm": 1.9599577188491821, + "learning_rate": 4.805005989375319e-05, + "loss": 4.7331, + "step": 21280 + }, + { + "epoch": 0.12656413550290227, + "grad_norm": 2.1399378776550293, + "learning_rate": 4.8049879036570286e-05, + "loss": 4.1747, + "step": 21281 + }, + { + "epoch": 0.12657008278618326, + "grad_norm": 2.202322244644165, + "learning_rate": 4.8049698171340904e-05, + "loss": 4.3195, + "step": 21282 + }, + { + "epoch": 0.12657603006946427, + "grad_norm": 2.071727991104126, + "learning_rate": 4.8049517298065115e-05, + "loss": 4.3142, + "step": 21283 + }, + { + "epoch": 0.12658197735274526, + "grad_norm": 1.8801134824752808, + "learning_rate": 4.8049336416742974e-05, + "loss": 4.2353, + "step": 21284 + }, + { + "epoch": 0.12658792463602625, + "grad_norm": 1.8937469720840454, + "learning_rate": 4.804915552737455e-05, + "loss": 4.1141, + "step": 21285 + }, + { + "epoch": 0.12659387191930727, + "grad_norm": 1.8500044345855713, + "learning_rate": 4.8048974629959906e-05, + "loss": 4.0509, + "step": 21286 + }, + { + "epoch": 0.12659981920258825, + "grad_norm": 1.8931934833526611, + "learning_rate": 4.8048793724499095e-05, + "loss": 4.1905, + "step": 21287 + }, + { + "epoch": 0.12660576648586924, + "grad_norm": 1.6579469442367554, + "learning_rate": 4.8048612810992196e-05, + "loss": 4.8032, + "step": 21288 + }, + { + "epoch": 0.12661171376915026, + "grad_norm": 1.7402268648147583, + "learning_rate": 4.804843188943926e-05, + "loss": 5.363, + "step": 21289 + }, + { + "epoch": 0.12661766105243125, + "grad_norm": 1.6550151109695435, + "learning_rate": 4.804825095984036e-05, + "loss": 5.4504, + "step": 21290 + }, + { + "epoch": 0.12662360833571223, + "grad_norm": 1.5498002767562866, + "learning_rate": 4.8048070022195546e-05, + "loss": 5.2858, + "step": 21291 + }, + { + "epoch": 0.12662955561899325, + "grad_norm": 1.6577101945877075, + "learning_rate": 4.804788907650489e-05, + "loss": 5.0535, + "step": 21292 + }, + { + "epoch": 0.12663550290227424, + "grad_norm": 1.5144888162612915, + "learning_rate": 4.804770812276845e-05, + "loss": 5.0564, + "step": 21293 + }, + { + "epoch": 0.12664145018555523, + "grad_norm": 1.7675977945327759, + "learning_rate": 4.804752716098631e-05, + "loss": 4.9044, + "step": 21294 + }, + { + "epoch": 0.12664739746883624, + "grad_norm": 1.6419012546539307, + "learning_rate": 4.8047346191158506e-05, + "loss": 5.1735, + "step": 21295 + }, + { + "epoch": 0.12665334475211723, + "grad_norm": 1.9034998416900635, + "learning_rate": 4.8047165213285106e-05, + "loss": 5.762, + "step": 21296 + }, + { + "epoch": 0.12665929203539822, + "grad_norm": 2.2357866764068604, + "learning_rate": 4.8046984227366186e-05, + "loss": 5.0351, + "step": 21297 + }, + { + "epoch": 0.12666523931867923, + "grad_norm": 1.528701663017273, + "learning_rate": 4.8046803233401796e-05, + "loss": 5.3659, + "step": 21298 + }, + { + "epoch": 0.12667118660196022, + "grad_norm": 1.5450912714004517, + "learning_rate": 4.8046622231392015e-05, + "loss": 5.4961, + "step": 21299 + }, + { + "epoch": 0.1266771338852412, + "grad_norm": 2.459630012512207, + "learning_rate": 4.804644122133689e-05, + "loss": 4.6308, + "step": 21300 + }, + { + "epoch": 0.12668308116852223, + "grad_norm": 1.8703144788742065, + "learning_rate": 4.8046260203236494e-05, + "loss": 4.6424, + "step": 21301 + }, + { + "epoch": 0.12668902845180322, + "grad_norm": 1.4294613599777222, + "learning_rate": 4.804607917709088e-05, + "loss": 5.5703, + "step": 21302 + }, + { + "epoch": 0.1266949757350842, + "grad_norm": 1.6063963174819946, + "learning_rate": 4.804589814290012e-05, + "loss": 5.6344, + "step": 21303 + }, + { + "epoch": 0.12670092301836522, + "grad_norm": 2.1621460914611816, + "learning_rate": 4.8045717100664275e-05, + "loss": 5.1798, + "step": 21304 + }, + { + "epoch": 0.1267068703016462, + "grad_norm": 2.187513828277588, + "learning_rate": 4.804553605038341e-05, + "loss": 4.4837, + "step": 21305 + }, + { + "epoch": 0.1267128175849272, + "grad_norm": 2.5205118656158447, + "learning_rate": 4.804535499205759e-05, + "loss": 4.5554, + "step": 21306 + }, + { + "epoch": 0.1267187648682082, + "grad_norm": 2.196026563644409, + "learning_rate": 4.804517392568687e-05, + "loss": 4.5849, + "step": 21307 + }, + { + "epoch": 0.1267247121514892, + "grad_norm": 2.152150869369507, + "learning_rate": 4.804499285127132e-05, + "loss": 4.4153, + "step": 21308 + }, + { + "epoch": 0.1267306594347702, + "grad_norm": 2.398475170135498, + "learning_rate": 4.8044811768811e-05, + "loss": 4.1129, + "step": 21309 + }, + { + "epoch": 0.1267366067180512, + "grad_norm": 2.4291298389434814, + "learning_rate": 4.8044630678305976e-05, + "loss": 4.4199, + "step": 21310 + }, + { + "epoch": 0.1267425540013322, + "grad_norm": 2.6893248558044434, + "learning_rate": 4.80444495797563e-05, + "loss": 4.419, + "step": 21311 + }, + { + "epoch": 0.12674850128461318, + "grad_norm": 2.369361400604248, + "learning_rate": 4.804426847316206e-05, + "loss": 4.3434, + "step": 21312 + }, + { + "epoch": 0.1267544485678942, + "grad_norm": 2.206676721572876, + "learning_rate": 4.804408735852329e-05, + "loss": 4.2195, + "step": 21313 + }, + { + "epoch": 0.12676039585117518, + "grad_norm": 2.3347322940826416, + "learning_rate": 4.8043906235840074e-05, + "loss": 4.352, + "step": 21314 + }, + { + "epoch": 0.12676634313445617, + "grad_norm": 2.4026732444763184, + "learning_rate": 4.804372510511247e-05, + "loss": 4.0351, + "step": 21315 + }, + { + "epoch": 0.1267722904177372, + "grad_norm": 2.3547754287719727, + "learning_rate": 4.8043543966340546e-05, + "loss": 4.1292, + "step": 21316 + }, + { + "epoch": 0.12677823770101818, + "grad_norm": 2.3924174308776855, + "learning_rate": 4.804336281952434e-05, + "loss": 4.138, + "step": 21317 + }, + { + "epoch": 0.12678418498429916, + "grad_norm": 2.063361883163452, + "learning_rate": 4.804318166466395e-05, + "loss": 4.1288, + "step": 21318 + }, + { + "epoch": 0.12679013226758018, + "grad_norm": 2.1719813346862793, + "learning_rate": 4.8043000501759415e-05, + "loss": 4.3262, + "step": 21319 + }, + { + "epoch": 0.12679607955086117, + "grad_norm": 2.3787803649902344, + "learning_rate": 4.8042819330810803e-05, + "loss": 4.448, + "step": 21320 + }, + { + "epoch": 0.12680202683414216, + "grad_norm": 2.369344472885132, + "learning_rate": 4.80426381518182e-05, + "loss": 4.4237, + "step": 21321 + }, + { + "epoch": 0.12680797411742317, + "grad_norm": 1.9213550090789795, + "learning_rate": 4.804245696478163e-05, + "loss": 4.8805, + "step": 21322 + }, + { + "epoch": 0.12681392140070416, + "grad_norm": 2.1709017753601074, + "learning_rate": 4.804227576970118e-05, + "loss": 5.7745, + "step": 21323 + }, + { + "epoch": 0.12681986868398515, + "grad_norm": 2.1823856830596924, + "learning_rate": 4.8042094566576925e-05, + "loss": 5.561, + "step": 21324 + }, + { + "epoch": 0.12682581596726616, + "grad_norm": 2.403367519378662, + "learning_rate": 4.80419133554089e-05, + "loss": 5.6699, + "step": 21325 + }, + { + "epoch": 0.12683176325054715, + "grad_norm": 1.8335449695587158, + "learning_rate": 4.8041732136197184e-05, + "loss": 5.5058, + "step": 21326 + }, + { + "epoch": 0.12683771053382814, + "grad_norm": 1.7406642436981201, + "learning_rate": 4.804155090894183e-05, + "loss": 5.6536, + "step": 21327 + }, + { + "epoch": 0.12684365781710916, + "grad_norm": 2.160098075866699, + "learning_rate": 4.804136967364291e-05, + "loss": 5.4742, + "step": 21328 + }, + { + "epoch": 0.12684960510039014, + "grad_norm": 1.5187212228775024, + "learning_rate": 4.804118843030049e-05, + "loss": 5.2908, + "step": 21329 + }, + { + "epoch": 0.12685555238367113, + "grad_norm": 1.387417197227478, + "learning_rate": 4.804100717891463e-05, + "loss": 5.3319, + "step": 21330 + }, + { + "epoch": 0.12686149966695215, + "grad_norm": 1.3029687404632568, + "learning_rate": 4.80408259194854e-05, + "loss": 5.4069, + "step": 21331 + }, + { + "epoch": 0.12686744695023314, + "grad_norm": 1.7097088098526, + "learning_rate": 4.804064465201284e-05, + "loss": 4.8422, + "step": 21332 + }, + { + "epoch": 0.12687339423351413, + "grad_norm": 1.7519829273223877, + "learning_rate": 4.804046337649704e-05, + "loss": 5.4513, + "step": 21333 + }, + { + "epoch": 0.12687934151679514, + "grad_norm": 1.5313260555267334, + "learning_rate": 4.8040282092938046e-05, + "loss": 4.8656, + "step": 21334 + }, + { + "epoch": 0.12688528880007613, + "grad_norm": 1.629780888557434, + "learning_rate": 4.804010080133593e-05, + "loss": 4.8751, + "step": 21335 + }, + { + "epoch": 0.12689123608335712, + "grad_norm": 1.7247028350830078, + "learning_rate": 4.8039919501690756e-05, + "loss": 4.7207, + "step": 21336 + }, + { + "epoch": 0.12689718336663813, + "grad_norm": 1.517016887664795, + "learning_rate": 4.803973819400258e-05, + "loss": 5.0604, + "step": 21337 + }, + { + "epoch": 0.12690313064991912, + "grad_norm": 1.4583669900894165, + "learning_rate": 4.8039556878271475e-05, + "loss": 5.0638, + "step": 21338 + }, + { + "epoch": 0.1269090779332001, + "grad_norm": 1.725014567375183, + "learning_rate": 4.803937555449749e-05, + "loss": 5.5831, + "step": 21339 + }, + { + "epoch": 0.1269150252164811, + "grad_norm": 1.4144753217697144, + "learning_rate": 4.803919422268071e-05, + "loss": 5.3899, + "step": 21340 + }, + { + "epoch": 0.1269209724997621, + "grad_norm": 1.4197511672973633, + "learning_rate": 4.803901288282117e-05, + "loss": 5.4904, + "step": 21341 + }, + { + "epoch": 0.1269269197830431, + "grad_norm": 1.5491420030593872, + "learning_rate": 4.803883153491896e-05, + "loss": 5.5008, + "step": 21342 + }, + { + "epoch": 0.1269328670663241, + "grad_norm": 1.4152858257293701, + "learning_rate": 4.803865017897412e-05, + "loss": 5.5328, + "step": 21343 + }, + { + "epoch": 0.1269388143496051, + "grad_norm": 1.6931630373001099, + "learning_rate": 4.803846881498674e-05, + "loss": 5.4435, + "step": 21344 + }, + { + "epoch": 0.1269447616328861, + "grad_norm": 1.4955002069473267, + "learning_rate": 4.803828744295686e-05, + "loss": 5.3631, + "step": 21345 + }, + { + "epoch": 0.12695070891616708, + "grad_norm": 1.5340615510940552, + "learning_rate": 4.803810606288455e-05, + "loss": 5.4711, + "step": 21346 + }, + { + "epoch": 0.1269566561994481, + "grad_norm": 1.4584442377090454, + "learning_rate": 4.803792467476988e-05, + "loss": 5.512, + "step": 21347 + }, + { + "epoch": 0.1269626034827291, + "grad_norm": 1.663875699043274, + "learning_rate": 4.803774327861291e-05, + "loss": 5.5867, + "step": 21348 + }, + { + "epoch": 0.12696855076601007, + "grad_norm": 1.4865331649780273, + "learning_rate": 4.8037561874413696e-05, + "loss": 5.0047, + "step": 21349 + }, + { + "epoch": 0.1269744980492911, + "grad_norm": 1.5889533758163452, + "learning_rate": 4.803738046217231e-05, + "loss": 4.9325, + "step": 21350 + }, + { + "epoch": 0.12698044533257208, + "grad_norm": 1.7473856210708618, + "learning_rate": 4.8037199041888814e-05, + "loss": 4.9296, + "step": 21351 + }, + { + "epoch": 0.12698639261585307, + "grad_norm": 1.9395428895950317, + "learning_rate": 4.8037017613563265e-05, + "loss": 5.5787, + "step": 21352 + }, + { + "epoch": 0.12699233989913408, + "grad_norm": 1.8723230361938477, + "learning_rate": 4.8036836177195734e-05, + "loss": 5.2864, + "step": 21353 + }, + { + "epoch": 0.12699828718241507, + "grad_norm": 1.8751366138458252, + "learning_rate": 4.8036654732786276e-05, + "loss": 4.9116, + "step": 21354 + }, + { + "epoch": 0.12700423446569606, + "grad_norm": 1.6620196104049683, + "learning_rate": 4.803647328033497e-05, + "loss": 5.1592, + "step": 21355 + }, + { + "epoch": 0.12701018174897707, + "grad_norm": 2.01167631149292, + "learning_rate": 4.803629181984187e-05, + "loss": 5.2254, + "step": 21356 + }, + { + "epoch": 0.12701612903225806, + "grad_norm": 1.6565442085266113, + "learning_rate": 4.803611035130703e-05, + "loss": 5.2454, + "step": 21357 + }, + { + "epoch": 0.12702207631553905, + "grad_norm": 1.3379613161087036, + "learning_rate": 4.803592887473053e-05, + "loss": 5.3203, + "step": 21358 + }, + { + "epoch": 0.12702802359882007, + "grad_norm": 1.580633282661438, + "learning_rate": 4.8035747390112415e-05, + "loss": 5.2555, + "step": 21359 + }, + { + "epoch": 0.12703397088210105, + "grad_norm": 1.9735597372055054, + "learning_rate": 4.803556589745276e-05, + "loss": 5.6899, + "step": 21360 + }, + { + "epoch": 0.12703991816538204, + "grad_norm": 1.6550042629241943, + "learning_rate": 4.8035384396751636e-05, + "loss": 4.8188, + "step": 21361 + }, + { + "epoch": 0.12704586544866306, + "grad_norm": 1.598645567893982, + "learning_rate": 4.803520288800909e-05, + "loss": 5.0498, + "step": 21362 + }, + { + "epoch": 0.12705181273194405, + "grad_norm": 1.5990798473358154, + "learning_rate": 4.80350213712252e-05, + "loss": 5.0563, + "step": 21363 + }, + { + "epoch": 0.12705776001522504, + "grad_norm": 1.5130763053894043, + "learning_rate": 4.803483984640001e-05, + "loss": 5.2562, + "step": 21364 + }, + { + "epoch": 0.12706370729850605, + "grad_norm": 1.5498485565185547, + "learning_rate": 4.803465831353361e-05, + "loss": 5.551, + "step": 21365 + }, + { + "epoch": 0.12706965458178704, + "grad_norm": 1.819954752922058, + "learning_rate": 4.803447677262603e-05, + "loss": 4.5888, + "step": 21366 + }, + { + "epoch": 0.12707560186506803, + "grad_norm": 1.5863771438598633, + "learning_rate": 4.8034295223677374e-05, + "loss": 5.108, + "step": 21367 + }, + { + "epoch": 0.12708154914834904, + "grad_norm": 1.6637874841690063, + "learning_rate": 4.803411366668767e-05, + "loss": 5.3476, + "step": 21368 + }, + { + "epoch": 0.12708749643163003, + "grad_norm": 1.5182580947875977, + "learning_rate": 4.8033932101657e-05, + "loss": 5.6559, + "step": 21369 + }, + { + "epoch": 0.12709344371491102, + "grad_norm": 1.725801706314087, + "learning_rate": 4.803375052858542e-05, + "loss": 4.6643, + "step": 21370 + }, + { + "epoch": 0.12709939099819204, + "grad_norm": 1.6476885080337524, + "learning_rate": 4.803356894747299e-05, + "loss": 4.6574, + "step": 21371 + }, + { + "epoch": 0.12710533828147302, + "grad_norm": 1.520213007926941, + "learning_rate": 4.803338735831979e-05, + "loss": 5.3691, + "step": 21372 + }, + { + "epoch": 0.127111285564754, + "grad_norm": 1.4914368391036987, + "learning_rate": 4.803320576112586e-05, + "loss": 5.2913, + "step": 21373 + }, + { + "epoch": 0.12711723284803503, + "grad_norm": 1.254329800605774, + "learning_rate": 4.803302415589128e-05, + "loss": 5.3926, + "step": 21374 + }, + { + "epoch": 0.12712318013131602, + "grad_norm": 1.909441351890564, + "learning_rate": 4.8032842542616116e-05, + "loss": 4.6179, + "step": 21375 + }, + { + "epoch": 0.127129127414597, + "grad_norm": 1.7123392820358276, + "learning_rate": 4.803266092130042e-05, + "loss": 5.1276, + "step": 21376 + }, + { + "epoch": 0.12713507469787802, + "grad_norm": 1.717854380607605, + "learning_rate": 4.8032479291944265e-05, + "loss": 5.3377, + "step": 21377 + }, + { + "epoch": 0.127141021981159, + "grad_norm": 1.7636181116104126, + "learning_rate": 4.80322976545477e-05, + "loss": 5.3434, + "step": 21378 + }, + { + "epoch": 0.12714696926444, + "grad_norm": 1.6754179000854492, + "learning_rate": 4.80321160091108e-05, + "loss": 5.3604, + "step": 21379 + }, + { + "epoch": 0.127152916547721, + "grad_norm": 1.4759787321090698, + "learning_rate": 4.803193435563364e-05, + "loss": 5.267, + "step": 21380 + }, + { + "epoch": 0.127158863831002, + "grad_norm": 1.8769867420196533, + "learning_rate": 4.803175269411625e-05, + "loss": 5.2666, + "step": 21381 + }, + { + "epoch": 0.127164811114283, + "grad_norm": 1.7843588590621948, + "learning_rate": 4.803157102455873e-05, + "loss": 5.1529, + "step": 21382 + }, + { + "epoch": 0.127170758397564, + "grad_norm": 1.7799369096755981, + "learning_rate": 4.803138934696111e-05, + "loss": 4.9332, + "step": 21383 + }, + { + "epoch": 0.127176705680845, + "grad_norm": 1.8240329027175903, + "learning_rate": 4.803120766132348e-05, + "loss": 4.8369, + "step": 21384 + }, + { + "epoch": 0.12718265296412598, + "grad_norm": 1.7379107475280762, + "learning_rate": 4.8031025967645895e-05, + "loss": 4.6134, + "step": 21385 + }, + { + "epoch": 0.127188600247407, + "grad_norm": 1.9912395477294922, + "learning_rate": 4.8030844265928414e-05, + "loss": 4.5456, + "step": 21386 + }, + { + "epoch": 0.12719454753068798, + "grad_norm": 1.762600302696228, + "learning_rate": 4.80306625561711e-05, + "loss": 5.4269, + "step": 21387 + }, + { + "epoch": 0.12720049481396897, + "grad_norm": 1.9208531379699707, + "learning_rate": 4.8030480838374027e-05, + "loss": 5.542, + "step": 21388 + }, + { + "epoch": 0.12720644209725, + "grad_norm": 1.8121410608291626, + "learning_rate": 4.803029911253725e-05, + "loss": 5.7218, + "step": 21389 + }, + { + "epoch": 0.12721238938053098, + "grad_norm": 2.0130512714385986, + "learning_rate": 4.803011737866082e-05, + "loss": 5.4736, + "step": 21390 + }, + { + "epoch": 0.12721833666381197, + "grad_norm": 1.4087759256362915, + "learning_rate": 4.802993563674483e-05, + "loss": 5.5634, + "step": 21391 + }, + { + "epoch": 0.12722428394709298, + "grad_norm": 1.640550971031189, + "learning_rate": 4.8029753886789316e-05, + "loss": 5.6422, + "step": 21392 + }, + { + "epoch": 0.12723023123037397, + "grad_norm": 1.58751380443573, + "learning_rate": 4.802957212879436e-05, + "loss": 5.2661, + "step": 21393 + }, + { + "epoch": 0.12723617851365496, + "grad_norm": 1.536847472190857, + "learning_rate": 4.802939036276002e-05, + "loss": 5.475, + "step": 21394 + }, + { + "epoch": 0.12724212579693597, + "grad_norm": 1.8386236429214478, + "learning_rate": 4.802920858868635e-05, + "loss": 5.4889, + "step": 21395 + }, + { + "epoch": 0.12724807308021696, + "grad_norm": 1.7268786430358887, + "learning_rate": 4.802902680657343e-05, + "loss": 5.2129, + "step": 21396 + }, + { + "epoch": 0.12725402036349795, + "grad_norm": 1.5081709623336792, + "learning_rate": 4.8028845016421306e-05, + "loss": 5.0437, + "step": 21397 + }, + { + "epoch": 0.12725996764677894, + "grad_norm": 1.3470754623413086, + "learning_rate": 4.802866321823006e-05, + "loss": 5.2242, + "step": 21398 + }, + { + "epoch": 0.12726591493005995, + "grad_norm": 1.2352057695388794, + "learning_rate": 4.802848141199974e-05, + "loss": 4.6926, + "step": 21399 + }, + { + "epoch": 0.12727186221334094, + "grad_norm": 1.4411710500717163, + "learning_rate": 4.802829959773041e-05, + "loss": 5.098, + "step": 21400 + }, + { + "epoch": 0.12727780949662193, + "grad_norm": 1.3453952074050903, + "learning_rate": 4.802811777542214e-05, + "loss": 5.0484, + "step": 21401 + }, + { + "epoch": 0.12728375677990295, + "grad_norm": 1.4602265357971191, + "learning_rate": 4.8027935945074995e-05, + "loss": 5.167, + "step": 21402 + }, + { + "epoch": 0.12728970406318393, + "grad_norm": 1.4542255401611328, + "learning_rate": 4.802775410668904e-05, + "loss": 5.0701, + "step": 21403 + }, + { + "epoch": 0.12729565134646492, + "grad_norm": 1.4398037195205688, + "learning_rate": 4.802757226026433e-05, + "loss": 5.0809, + "step": 21404 + }, + { + "epoch": 0.12730159862974594, + "grad_norm": 1.3027135133743286, + "learning_rate": 4.8027390405800935e-05, + "loss": 5.1283, + "step": 21405 + }, + { + "epoch": 0.12730754591302693, + "grad_norm": 1.3704328536987305, + "learning_rate": 4.802720854329891e-05, + "loss": 5.0886, + "step": 21406 + }, + { + "epoch": 0.12731349319630791, + "grad_norm": 1.2771658897399902, + "learning_rate": 4.802702667275833e-05, + "loss": 4.968, + "step": 21407 + }, + { + "epoch": 0.12731944047958893, + "grad_norm": 1.3370757102966309, + "learning_rate": 4.802684479417925e-05, + "loss": 5.2742, + "step": 21408 + }, + { + "epoch": 0.12732538776286992, + "grad_norm": 1.2101991176605225, + "learning_rate": 4.802666290756174e-05, + "loss": 5.3125, + "step": 21409 + }, + { + "epoch": 0.1273313350461509, + "grad_norm": 1.327354907989502, + "learning_rate": 4.8026481012905854e-05, + "loss": 5.0784, + "step": 21410 + }, + { + "epoch": 0.12733728232943192, + "grad_norm": 1.2267961502075195, + "learning_rate": 4.802629911021166e-05, + "loss": 5.0666, + "step": 21411 + }, + { + "epoch": 0.1273432296127129, + "grad_norm": 1.2195243835449219, + "learning_rate": 4.8026117199479224e-05, + "loss": 5.1941, + "step": 21412 + }, + { + "epoch": 0.1273491768959939, + "grad_norm": 1.1964733600616455, + "learning_rate": 4.8025935280708616e-05, + "loss": 5.0561, + "step": 21413 + }, + { + "epoch": 0.12735512417927491, + "grad_norm": 1.148831844329834, + "learning_rate": 4.802575335389989e-05, + "loss": 4.9592, + "step": 21414 + }, + { + "epoch": 0.1273610714625559, + "grad_norm": 1.2319111824035645, + "learning_rate": 4.802557141905311e-05, + "loss": 5.0165, + "step": 21415 + }, + { + "epoch": 0.1273670187458369, + "grad_norm": 1.324744462966919, + "learning_rate": 4.802538947616834e-05, + "loss": 4.9402, + "step": 21416 + }, + { + "epoch": 0.1273729660291179, + "grad_norm": 1.1551966667175293, + "learning_rate": 4.802520752524564e-05, + "loss": 5.1849, + "step": 21417 + }, + { + "epoch": 0.1273789133123989, + "grad_norm": 1.2087135314941406, + "learning_rate": 4.802502556628508e-05, + "loss": 5.1082, + "step": 21418 + }, + { + "epoch": 0.12738486059567988, + "grad_norm": 1.1568787097930908, + "learning_rate": 4.8024843599286726e-05, + "loss": 5.1379, + "step": 21419 + }, + { + "epoch": 0.1273908078789609, + "grad_norm": 1.2819747924804688, + "learning_rate": 4.802466162425063e-05, + "loss": 5.2054, + "step": 21420 + }, + { + "epoch": 0.1273967551622419, + "grad_norm": 1.3548219203948975, + "learning_rate": 4.8024479641176866e-05, + "loss": 4.8277, + "step": 21421 + }, + { + "epoch": 0.12740270244552288, + "grad_norm": 1.3331178426742554, + "learning_rate": 4.80242976500655e-05, + "loss": 4.991, + "step": 21422 + }, + { + "epoch": 0.1274086497288039, + "grad_norm": 1.3595576286315918, + "learning_rate": 4.8024115650916584e-05, + "loss": 4.8734, + "step": 21423 + }, + { + "epoch": 0.12741459701208488, + "grad_norm": 1.310585856437683, + "learning_rate": 4.802393364373019e-05, + "loss": 4.9281, + "step": 21424 + }, + { + "epoch": 0.12742054429536587, + "grad_norm": 1.3193553686141968, + "learning_rate": 4.8023751628506374e-05, + "loss": 4.9819, + "step": 21425 + }, + { + "epoch": 0.12742649157864688, + "grad_norm": 1.2952460050582886, + "learning_rate": 4.8023569605245204e-05, + "loss": 4.9577, + "step": 21426 + }, + { + "epoch": 0.12743243886192787, + "grad_norm": 1.376548409461975, + "learning_rate": 4.802338757394674e-05, + "loss": 5.2219, + "step": 21427 + }, + { + "epoch": 0.12743838614520886, + "grad_norm": 1.1417921781539917, + "learning_rate": 4.802320553461106e-05, + "loss": 5.0234, + "step": 21428 + }, + { + "epoch": 0.12744433342848988, + "grad_norm": 1.2543314695358276, + "learning_rate": 4.8023023487238214e-05, + "loss": 4.9921, + "step": 21429 + }, + { + "epoch": 0.12745028071177086, + "grad_norm": 1.4437085390090942, + "learning_rate": 4.802284143182827e-05, + "loss": 4.8699, + "step": 21430 + }, + { + "epoch": 0.12745622799505185, + "grad_norm": 1.137539267539978, + "learning_rate": 4.802265936838128e-05, + "loss": 5.1073, + "step": 21431 + }, + { + "epoch": 0.12746217527833287, + "grad_norm": 1.4179331064224243, + "learning_rate": 4.802247729689733e-05, + "loss": 5.0073, + "step": 21432 + }, + { + "epoch": 0.12746812256161386, + "grad_norm": 1.5519764423370361, + "learning_rate": 4.802229521737646e-05, + "loss": 4.9426, + "step": 21433 + }, + { + "epoch": 0.12747406984489484, + "grad_norm": 1.440847396850586, + "learning_rate": 4.8022113129818754e-05, + "loss": 5.2137, + "step": 21434 + }, + { + "epoch": 0.12748001712817586, + "grad_norm": 1.2741557359695435, + "learning_rate": 4.802193103422426e-05, + "loss": 4.966, + "step": 21435 + }, + { + "epoch": 0.12748596441145685, + "grad_norm": 1.5297214984893799, + "learning_rate": 4.8021748930593045e-05, + "loss": 5.006, + "step": 21436 + }, + { + "epoch": 0.12749191169473784, + "grad_norm": 1.2509713172912598, + "learning_rate": 4.802156681892518e-05, + "loss": 5.0719, + "step": 21437 + }, + { + "epoch": 0.12749785897801885, + "grad_norm": 1.2376511096954346, + "learning_rate": 4.802138469922073e-05, + "loss": 4.8896, + "step": 21438 + }, + { + "epoch": 0.12750380626129984, + "grad_norm": 1.311804175376892, + "learning_rate": 4.802120257147974e-05, + "loss": 5.0292, + "step": 21439 + }, + { + "epoch": 0.12750975354458083, + "grad_norm": 1.2717031240463257, + "learning_rate": 4.802102043570229e-05, + "loss": 5.157, + "step": 21440 + }, + { + "epoch": 0.12751570082786184, + "grad_norm": 1.2967960834503174, + "learning_rate": 4.8020838291888445e-05, + "loss": 5.1289, + "step": 21441 + }, + { + "epoch": 0.12752164811114283, + "grad_norm": 1.2796543836593628, + "learning_rate": 4.802065614003826e-05, + "loss": 5.0702, + "step": 21442 + }, + { + "epoch": 0.12752759539442382, + "grad_norm": 1.4490569829940796, + "learning_rate": 4.80204739801518e-05, + "loss": 5.1, + "step": 21443 + }, + { + "epoch": 0.12753354267770484, + "grad_norm": 1.1721242666244507, + "learning_rate": 4.8020291812229136e-05, + "loss": 5.1237, + "step": 21444 + }, + { + "epoch": 0.12753948996098582, + "grad_norm": 1.3185924291610718, + "learning_rate": 4.8020109636270316e-05, + "loss": 5.0208, + "step": 21445 + }, + { + "epoch": 0.1275454372442668, + "grad_norm": 1.4432177543640137, + "learning_rate": 4.801992745227543e-05, + "loss": 5.0235, + "step": 21446 + }, + { + "epoch": 0.12755138452754783, + "grad_norm": 1.3810619115829468, + "learning_rate": 4.801974526024451e-05, + "loss": 4.8893, + "step": 21447 + }, + { + "epoch": 0.12755733181082882, + "grad_norm": 1.3421547412872314, + "learning_rate": 4.8019563060177634e-05, + "loss": 4.9605, + "step": 21448 + }, + { + "epoch": 0.1275632790941098, + "grad_norm": 1.304095983505249, + "learning_rate": 4.8019380852074875e-05, + "loss": 4.9489, + "step": 21449 + }, + { + "epoch": 0.12756922637739082, + "grad_norm": 1.3935438394546509, + "learning_rate": 4.801919863593629e-05, + "loss": 4.9097, + "step": 21450 + }, + { + "epoch": 0.1275751736606718, + "grad_norm": 1.1719253063201904, + "learning_rate": 4.801901641176193e-05, + "loss": 4.9922, + "step": 21451 + }, + { + "epoch": 0.1275811209439528, + "grad_norm": 1.8718456029891968, + "learning_rate": 4.801883417955188e-05, + "loss": 5.409, + "step": 21452 + }, + { + "epoch": 0.1275870682272338, + "grad_norm": 1.1837137937545776, + "learning_rate": 4.801865193930618e-05, + "loss": 4.967, + "step": 21453 + }, + { + "epoch": 0.1275930155105148, + "grad_norm": 1.2643749713897705, + "learning_rate": 4.801846969102491e-05, + "loss": 4.7932, + "step": 21454 + }, + { + "epoch": 0.1275989627937958, + "grad_norm": 1.2207399606704712, + "learning_rate": 4.801828743470814e-05, + "loss": 4.9634, + "step": 21455 + }, + { + "epoch": 0.12760491007707678, + "grad_norm": 1.2489538192749023, + "learning_rate": 4.801810517035592e-05, + "loss": 5.1077, + "step": 21456 + }, + { + "epoch": 0.1276108573603578, + "grad_norm": 1.3879250288009644, + "learning_rate": 4.801792289796832e-05, + "loss": 5.225, + "step": 21457 + }, + { + "epoch": 0.12761680464363878, + "grad_norm": 1.4891397953033447, + "learning_rate": 4.8017740617545385e-05, + "loss": 5.1288, + "step": 21458 + }, + { + "epoch": 0.12762275192691977, + "grad_norm": 1.555528998374939, + "learning_rate": 4.801755832908721e-05, + "loss": 5.1875, + "step": 21459 + }, + { + "epoch": 0.12762869921020079, + "grad_norm": 1.287625789642334, + "learning_rate": 4.8017376032593834e-05, + "loss": 5.1934, + "step": 21460 + }, + { + "epoch": 0.12763464649348177, + "grad_norm": 1.4907346963882446, + "learning_rate": 4.801719372806533e-05, + "loss": 5.169, + "step": 21461 + }, + { + "epoch": 0.12764059377676276, + "grad_norm": 1.2776025533676147, + "learning_rate": 4.801701141550177e-05, + "loss": 5.2178, + "step": 21462 + }, + { + "epoch": 0.12764654106004378, + "grad_norm": 1.4319080114364624, + "learning_rate": 4.80168290949032e-05, + "loss": 5.159, + "step": 21463 + }, + { + "epoch": 0.12765248834332477, + "grad_norm": 1.4323997497558594, + "learning_rate": 4.80166467662697e-05, + "loss": 5.227, + "step": 21464 + }, + { + "epoch": 0.12765843562660575, + "grad_norm": 1.409071445465088, + "learning_rate": 4.8016464429601326e-05, + "loss": 5.0025, + "step": 21465 + }, + { + "epoch": 0.12766438290988677, + "grad_norm": 1.42705500125885, + "learning_rate": 4.801628208489814e-05, + "loss": 5.0332, + "step": 21466 + }, + { + "epoch": 0.12767033019316776, + "grad_norm": 1.2235654592514038, + "learning_rate": 4.801609973216021e-05, + "loss": 5.0734, + "step": 21467 + }, + { + "epoch": 0.12767627747644875, + "grad_norm": 1.2238860130310059, + "learning_rate": 4.8015917371387595e-05, + "loss": 4.9804, + "step": 21468 + }, + { + "epoch": 0.12768222475972976, + "grad_norm": 1.4584438800811768, + "learning_rate": 4.801573500258036e-05, + "loss": 5.162, + "step": 21469 + }, + { + "epoch": 0.12768817204301075, + "grad_norm": 1.236396074295044, + "learning_rate": 4.8015552625738566e-05, + "loss": 5.1374, + "step": 21470 + }, + { + "epoch": 0.12769411932629174, + "grad_norm": 1.472617745399475, + "learning_rate": 4.801537024086229e-05, + "loss": 5.0376, + "step": 21471 + }, + { + "epoch": 0.12770006660957275, + "grad_norm": 1.2870211601257324, + "learning_rate": 4.801518784795158e-05, + "loss": 4.9798, + "step": 21472 + }, + { + "epoch": 0.12770601389285374, + "grad_norm": 1.3299795389175415, + "learning_rate": 4.801500544700651e-05, + "loss": 4.9588, + "step": 21473 + }, + { + "epoch": 0.12771196117613473, + "grad_norm": 1.474135398864746, + "learning_rate": 4.8014823038027134e-05, + "loss": 5.015, + "step": 21474 + }, + { + "epoch": 0.12771790845941575, + "grad_norm": 1.6452490091323853, + "learning_rate": 4.8014640621013524e-05, + "loss": 5.0075, + "step": 21475 + }, + { + "epoch": 0.12772385574269673, + "grad_norm": 1.3577489852905273, + "learning_rate": 4.801445819596574e-05, + "loss": 4.9675, + "step": 21476 + }, + { + "epoch": 0.12772980302597772, + "grad_norm": 1.2642143964767456, + "learning_rate": 4.801427576288384e-05, + "loss": 5.0593, + "step": 21477 + }, + { + "epoch": 0.12773575030925874, + "grad_norm": 1.5256940126419067, + "learning_rate": 4.801409332176791e-05, + "loss": 4.8987, + "step": 21478 + }, + { + "epoch": 0.12774169759253973, + "grad_norm": 1.667886734008789, + "learning_rate": 4.801391087261798e-05, + "loss": 4.7562, + "step": 21479 + }, + { + "epoch": 0.12774764487582072, + "grad_norm": 1.3564702272415161, + "learning_rate": 4.801372841543415e-05, + "loss": 5.2975, + "step": 21480 + }, + { + "epoch": 0.12775359215910173, + "grad_norm": 1.607532262802124, + "learning_rate": 4.801354595021645e-05, + "loss": 4.9578, + "step": 21481 + }, + { + "epoch": 0.12775953944238272, + "grad_norm": 1.2633382081985474, + "learning_rate": 4.801336347696496e-05, + "loss": 5.1104, + "step": 21482 + }, + { + "epoch": 0.1277654867256637, + "grad_norm": 1.4292182922363281, + "learning_rate": 4.801318099567975e-05, + "loss": 4.9637, + "step": 21483 + }, + { + "epoch": 0.12777143400894472, + "grad_norm": 1.1797621250152588, + "learning_rate": 4.8012998506360874e-05, + "loss": 5.403, + "step": 21484 + }, + { + "epoch": 0.1277773812922257, + "grad_norm": 1.3704683780670166, + "learning_rate": 4.801281600900839e-05, + "loss": 4.9852, + "step": 21485 + }, + { + "epoch": 0.1277833285755067, + "grad_norm": 1.4775960445404053, + "learning_rate": 4.8012633503622384e-05, + "loss": 5.2049, + "step": 21486 + }, + { + "epoch": 0.12778927585878772, + "grad_norm": 1.5056041479110718, + "learning_rate": 4.801245099020289e-05, + "loss": 4.9782, + "step": 21487 + }, + { + "epoch": 0.1277952231420687, + "grad_norm": 1.3562772274017334, + "learning_rate": 4.801226846875e-05, + "loss": 5.0427, + "step": 21488 + }, + { + "epoch": 0.1278011704253497, + "grad_norm": 1.346339464187622, + "learning_rate": 4.801208593926376e-05, + "loss": 5.2215, + "step": 21489 + }, + { + "epoch": 0.1278071177086307, + "grad_norm": 1.3189916610717773, + "learning_rate": 4.801190340174424e-05, + "loss": 5.2097, + "step": 21490 + }, + { + "epoch": 0.1278130649919117, + "grad_norm": 1.466374397277832, + "learning_rate": 4.80117208561915e-05, + "loss": 4.8106, + "step": 21491 + }, + { + "epoch": 0.12781901227519268, + "grad_norm": 1.4882310628890991, + "learning_rate": 4.801153830260561e-05, + "loss": 5.1702, + "step": 21492 + }, + { + "epoch": 0.1278249595584737, + "grad_norm": 1.4080910682678223, + "learning_rate": 4.801135574098662e-05, + "loss": 5.0508, + "step": 21493 + }, + { + "epoch": 0.1278309068417547, + "grad_norm": 1.366672396659851, + "learning_rate": 4.801117317133461e-05, + "loss": 4.8692, + "step": 21494 + }, + { + "epoch": 0.12783685412503568, + "grad_norm": 1.3347315788269043, + "learning_rate": 4.801099059364963e-05, + "loss": 5.2327, + "step": 21495 + }, + { + "epoch": 0.1278428014083167, + "grad_norm": 1.434276819229126, + "learning_rate": 4.8010808007931765e-05, + "loss": 4.7217, + "step": 21496 + }, + { + "epoch": 0.12784874869159768, + "grad_norm": 1.2148855924606323, + "learning_rate": 4.801062541418105e-05, + "loss": 5.2082, + "step": 21497 + }, + { + "epoch": 0.12785469597487867, + "grad_norm": 1.4282805919647217, + "learning_rate": 4.801044281239758e-05, + "loss": 4.8627, + "step": 21498 + }, + { + "epoch": 0.12786064325815968, + "grad_norm": 1.309984564781189, + "learning_rate": 4.8010260202581394e-05, + "loss": 5.0809, + "step": 21499 + }, + { + "epoch": 0.12786659054144067, + "grad_norm": 1.2769159078598022, + "learning_rate": 4.801007758473256e-05, + "loss": 5.0357, + "step": 21500 + }, + { + "epoch": 0.12787253782472166, + "grad_norm": 1.4789204597473145, + "learning_rate": 4.800989495885115e-05, + "loss": 5.0572, + "step": 21501 + }, + { + "epoch": 0.12787848510800268, + "grad_norm": 1.2763663530349731, + "learning_rate": 4.8009712324937216e-05, + "loss": 5.2331, + "step": 21502 + }, + { + "epoch": 0.12788443239128366, + "grad_norm": 1.237911581993103, + "learning_rate": 4.800952968299084e-05, + "loss": 5.1217, + "step": 21503 + }, + { + "epoch": 0.12789037967456465, + "grad_norm": 1.3204708099365234, + "learning_rate": 4.800934703301206e-05, + "loss": 5.0503, + "step": 21504 + }, + { + "epoch": 0.12789632695784567, + "grad_norm": 1.2918440103530884, + "learning_rate": 4.800916437500097e-05, + "loss": 5.1229, + "step": 21505 + }, + { + "epoch": 0.12790227424112666, + "grad_norm": 1.2793703079223633, + "learning_rate": 4.8008981708957614e-05, + "loss": 4.9075, + "step": 21506 + }, + { + "epoch": 0.12790822152440764, + "grad_norm": 1.177607536315918, + "learning_rate": 4.8008799034882054e-05, + "loss": 4.89, + "step": 21507 + }, + { + "epoch": 0.12791416880768866, + "grad_norm": 0.9703904986381531, + "learning_rate": 4.800861635277437e-05, + "loss": 5.0141, + "step": 21508 + }, + { + "epoch": 0.12792011609096965, + "grad_norm": 1.2512762546539307, + "learning_rate": 4.800843366263461e-05, + "loss": 4.953, + "step": 21509 + }, + { + "epoch": 0.12792606337425064, + "grad_norm": 1.3279083967208862, + "learning_rate": 4.8008250964462846e-05, + "loss": 5.0179, + "step": 21510 + }, + { + "epoch": 0.12793201065753165, + "grad_norm": 1.3790103197097778, + "learning_rate": 4.8008068258259144e-05, + "loss": 4.9531, + "step": 21511 + }, + { + "epoch": 0.12793795794081264, + "grad_norm": 1.2640241384506226, + "learning_rate": 4.800788554402355e-05, + "loss": 5.0281, + "step": 21512 + }, + { + "epoch": 0.12794390522409363, + "grad_norm": 1.2616617679595947, + "learning_rate": 4.800770282175615e-05, + "loss": 5.1131, + "step": 21513 + }, + { + "epoch": 0.12794985250737465, + "grad_norm": 1.7765449285507202, + "learning_rate": 4.800752009145699e-05, + "loss": 5.3388, + "step": 21514 + }, + { + "epoch": 0.12795579979065563, + "grad_norm": 1.4468929767608643, + "learning_rate": 4.800733735312615e-05, + "loss": 4.9308, + "step": 21515 + }, + { + "epoch": 0.12796174707393662, + "grad_norm": 1.286733865737915, + "learning_rate": 4.800715460676369e-05, + "loss": 5.0407, + "step": 21516 + }, + { + "epoch": 0.1279676943572176, + "grad_norm": 1.3074883222579956, + "learning_rate": 4.8006971852369665e-05, + "loss": 5.0364, + "step": 21517 + }, + { + "epoch": 0.12797364164049863, + "grad_norm": 1.2966744899749756, + "learning_rate": 4.8006789089944144e-05, + "loss": 5.0411, + "step": 21518 + }, + { + "epoch": 0.1279795889237796, + "grad_norm": 1.4764792919158936, + "learning_rate": 4.800660631948719e-05, + "loss": 5.0178, + "step": 21519 + }, + { + "epoch": 0.1279855362070606, + "grad_norm": 1.3073668479919434, + "learning_rate": 4.800642354099887e-05, + "loss": 4.8384, + "step": 21520 + }, + { + "epoch": 0.12799148349034162, + "grad_norm": 1.433164119720459, + "learning_rate": 4.800624075447924e-05, + "loss": 4.4844, + "step": 21521 + }, + { + "epoch": 0.1279974307736226, + "grad_norm": 1.435656189918518, + "learning_rate": 4.8006057959928375e-05, + "loss": 4.7067, + "step": 21522 + }, + { + "epoch": 0.1280033780569036, + "grad_norm": 1.2541238069534302, + "learning_rate": 4.800587515734632e-05, + "loss": 4.76, + "step": 21523 + }, + { + "epoch": 0.1280093253401846, + "grad_norm": 1.3341822624206543, + "learning_rate": 4.8005692346733166e-05, + "loss": 4.9485, + "step": 21524 + }, + { + "epoch": 0.1280152726234656, + "grad_norm": 1.1761771440505981, + "learning_rate": 4.8005509528088963e-05, + "loss": 4.9416, + "step": 21525 + }, + { + "epoch": 0.1280212199067466, + "grad_norm": 1.490059494972229, + "learning_rate": 4.8005326701413764e-05, + "loss": 4.5864, + "step": 21526 + }, + { + "epoch": 0.1280271671900276, + "grad_norm": 1.4474053382873535, + "learning_rate": 4.8005143866707656e-05, + "loss": 4.3612, + "step": 21527 + }, + { + "epoch": 0.1280331144733086, + "grad_norm": 1.4138057231903076, + "learning_rate": 4.800496102397068e-05, + "loss": 4.7795, + "step": 21528 + }, + { + "epoch": 0.12803906175658958, + "grad_norm": 1.3671265840530396, + "learning_rate": 4.8004778173202915e-05, + "loss": 4.8096, + "step": 21529 + }, + { + "epoch": 0.1280450090398706, + "grad_norm": 1.3463077545166016, + "learning_rate": 4.800459531440441e-05, + "loss": 4.4858, + "step": 21530 + }, + { + "epoch": 0.12805095632315158, + "grad_norm": 1.2250823974609375, + "learning_rate": 4.800441244757525e-05, + "loss": 4.7394, + "step": 21531 + }, + { + "epoch": 0.12805690360643257, + "grad_norm": 1.4103713035583496, + "learning_rate": 4.800422957271548e-05, + "loss": 4.8084, + "step": 21532 + }, + { + "epoch": 0.1280628508897136, + "grad_norm": 1.3920261859893799, + "learning_rate": 4.800404668982518e-05, + "loss": 4.9744, + "step": 21533 + }, + { + "epoch": 0.12806879817299457, + "grad_norm": 1.2541594505310059, + "learning_rate": 4.8003863798904395e-05, + "loss": 5.024, + "step": 21534 + }, + { + "epoch": 0.12807474545627556, + "grad_norm": 1.2717599868774414, + "learning_rate": 4.80036808999532e-05, + "loss": 4.9402, + "step": 21535 + }, + { + "epoch": 0.12808069273955658, + "grad_norm": 1.168628215789795, + "learning_rate": 4.8003497992971656e-05, + "loss": 4.8391, + "step": 21536 + }, + { + "epoch": 0.12808664002283757, + "grad_norm": 1.2135813236236572, + "learning_rate": 4.800331507795984e-05, + "loss": 4.9725, + "step": 21537 + }, + { + "epoch": 0.12809258730611855, + "grad_norm": 1.2984068393707275, + "learning_rate": 4.8003132154917795e-05, + "loss": 4.8946, + "step": 21538 + }, + { + "epoch": 0.12809853458939957, + "grad_norm": 1.1610583066940308, + "learning_rate": 4.8002949223845595e-05, + "loss": 4.9362, + "step": 21539 + }, + { + "epoch": 0.12810448187268056, + "grad_norm": 1.357981562614441, + "learning_rate": 4.8002766284743306e-05, + "loss": 4.6936, + "step": 21540 + }, + { + "epoch": 0.12811042915596155, + "grad_norm": 1.437784194946289, + "learning_rate": 4.800258333761098e-05, + "loss": 4.7942, + "step": 21541 + }, + { + "epoch": 0.12811637643924256, + "grad_norm": 1.364261507987976, + "learning_rate": 4.8002400382448704e-05, + "loss": 4.763, + "step": 21542 + }, + { + "epoch": 0.12812232372252355, + "grad_norm": 1.3244688510894775, + "learning_rate": 4.800221741925652e-05, + "loss": 4.8804, + "step": 21543 + }, + { + "epoch": 0.12812827100580454, + "grad_norm": 1.5480263233184814, + "learning_rate": 4.80020344480345e-05, + "loss": 4.6523, + "step": 21544 + }, + { + "epoch": 0.12813421828908556, + "grad_norm": 1.2875494956970215, + "learning_rate": 4.800185146878271e-05, + "loss": 4.6137, + "step": 21545 + }, + { + "epoch": 0.12814016557236654, + "grad_norm": 1.1969667673110962, + "learning_rate": 4.80016684815012e-05, + "loss": 5.1034, + "step": 21546 + }, + { + "epoch": 0.12814611285564753, + "grad_norm": 1.3188492059707642, + "learning_rate": 4.8001485486190064e-05, + "loss": 5.078, + "step": 21547 + }, + { + "epoch": 0.12815206013892855, + "grad_norm": 1.2246590852737427, + "learning_rate": 4.800130248284934e-05, + "loss": 5.0404, + "step": 21548 + }, + { + "epoch": 0.12815800742220954, + "grad_norm": 1.2853569984436035, + "learning_rate": 4.800111947147909e-05, + "loss": 4.9271, + "step": 21549 + }, + { + "epoch": 0.12816395470549052, + "grad_norm": 1.1865004301071167, + "learning_rate": 4.8000936452079395e-05, + "loss": 4.8657, + "step": 21550 + }, + { + "epoch": 0.12816990198877154, + "grad_norm": 1.4134557247161865, + "learning_rate": 4.8000753424650306e-05, + "loss": 4.5964, + "step": 21551 + }, + { + "epoch": 0.12817584927205253, + "grad_norm": 1.3943791389465332, + "learning_rate": 4.8000570389191894e-05, + "loss": 4.7792, + "step": 21552 + }, + { + "epoch": 0.12818179655533352, + "grad_norm": 1.5506455898284912, + "learning_rate": 4.8000387345704225e-05, + "loss": 4.767, + "step": 21553 + }, + { + "epoch": 0.12818774383861453, + "grad_norm": 1.516860008239746, + "learning_rate": 4.8000204294187356e-05, + "loss": 4.8412, + "step": 21554 + }, + { + "epoch": 0.12819369112189552, + "grad_norm": 1.3515304327011108, + "learning_rate": 4.8000021234641345e-05, + "loss": 4.7443, + "step": 21555 + }, + { + "epoch": 0.1281996384051765, + "grad_norm": 1.4094910621643066, + "learning_rate": 4.7999838167066276e-05, + "loss": 4.8343, + "step": 21556 + }, + { + "epoch": 0.12820558568845752, + "grad_norm": 1.3746453523635864, + "learning_rate": 4.7999655091462195e-05, + "loss": 4.6913, + "step": 21557 + }, + { + "epoch": 0.1282115329717385, + "grad_norm": 1.4625654220581055, + "learning_rate": 4.799947200782917e-05, + "loss": 4.8412, + "step": 21558 + }, + { + "epoch": 0.1282174802550195, + "grad_norm": 1.3790411949157715, + "learning_rate": 4.7999288916167275e-05, + "loss": 4.5777, + "step": 21559 + }, + { + "epoch": 0.12822342753830052, + "grad_norm": 1.4020804166793823, + "learning_rate": 4.799910581647656e-05, + "loss": 4.8728, + "step": 21560 + }, + { + "epoch": 0.1282293748215815, + "grad_norm": 1.2850565910339355, + "learning_rate": 4.799892270875709e-05, + "loss": 4.9687, + "step": 21561 + }, + { + "epoch": 0.1282353221048625, + "grad_norm": 1.4895892143249512, + "learning_rate": 4.799873959300894e-05, + "loss": 4.9786, + "step": 21562 + }, + { + "epoch": 0.1282412693881435, + "grad_norm": 1.149808406829834, + "learning_rate": 4.799855646923217e-05, + "loss": 4.9924, + "step": 21563 + }, + { + "epoch": 0.1282472166714245, + "grad_norm": 1.3952314853668213, + "learning_rate": 4.799837333742684e-05, + "loss": 4.9225, + "step": 21564 + }, + { + "epoch": 0.12825316395470548, + "grad_norm": 1.271844744682312, + "learning_rate": 4.799819019759301e-05, + "loss": 4.9967, + "step": 21565 + }, + { + "epoch": 0.1282591112379865, + "grad_norm": 1.3351553678512573, + "learning_rate": 4.799800704973075e-05, + "loss": 4.9089, + "step": 21566 + }, + { + "epoch": 0.1282650585212675, + "grad_norm": 1.2077351808547974, + "learning_rate": 4.799782389384013e-05, + "loss": 4.8948, + "step": 21567 + }, + { + "epoch": 0.12827100580454848, + "grad_norm": 1.6159747838974, + "learning_rate": 4.79976407299212e-05, + "loss": 4.6636, + "step": 21568 + }, + { + "epoch": 0.1282769530878295, + "grad_norm": 1.4904805421829224, + "learning_rate": 4.7997457557974035e-05, + "loss": 4.8164, + "step": 21569 + }, + { + "epoch": 0.12828290037111048, + "grad_norm": 1.2312726974487305, + "learning_rate": 4.79972743779987e-05, + "loss": 4.8022, + "step": 21570 + }, + { + "epoch": 0.12828884765439147, + "grad_norm": 1.3150570392608643, + "learning_rate": 4.799709118999525e-05, + "loss": 4.7237, + "step": 21571 + }, + { + "epoch": 0.12829479493767248, + "grad_norm": 1.441749930381775, + "learning_rate": 4.799690799396375e-05, + "loss": 4.8704, + "step": 21572 + }, + { + "epoch": 0.12830074222095347, + "grad_norm": 1.4237558841705322, + "learning_rate": 4.799672478990427e-05, + "loss": 4.9428, + "step": 21573 + }, + { + "epoch": 0.12830668950423446, + "grad_norm": 1.5440024137496948, + "learning_rate": 4.7996541577816867e-05, + "loss": 4.7546, + "step": 21574 + }, + { + "epoch": 0.12831263678751545, + "grad_norm": 1.2962610721588135, + "learning_rate": 4.799635835770161e-05, + "loss": 4.9324, + "step": 21575 + }, + { + "epoch": 0.12831858407079647, + "grad_norm": 2.1041312217712402, + "learning_rate": 4.799617512955857e-05, + "loss": 5.2894, + "step": 21576 + }, + { + "epoch": 0.12832453135407745, + "grad_norm": 1.3591945171356201, + "learning_rate": 4.7995991893387796e-05, + "loss": 4.6942, + "step": 21577 + }, + { + "epoch": 0.12833047863735844, + "grad_norm": 1.2474287748336792, + "learning_rate": 4.799580864918936e-05, + "loss": 5.0003, + "step": 21578 + }, + { + "epoch": 0.12833642592063946, + "grad_norm": 1.4604638814926147, + "learning_rate": 4.7995625396963326e-05, + "loss": 4.8608, + "step": 21579 + }, + { + "epoch": 0.12834237320392045, + "grad_norm": 1.5033100843429565, + "learning_rate": 4.7995442136709755e-05, + "loss": 4.9221, + "step": 21580 + }, + { + "epoch": 0.12834832048720143, + "grad_norm": 1.4712806940078735, + "learning_rate": 4.799525886842872e-05, + "loss": 4.9657, + "step": 21581 + }, + { + "epoch": 0.12835426777048245, + "grad_norm": 1.4505717754364014, + "learning_rate": 4.799507559212026e-05, + "loss": 4.7913, + "step": 21582 + }, + { + "epoch": 0.12836021505376344, + "grad_norm": 1.6151630878448486, + "learning_rate": 4.7994892307784466e-05, + "loss": 4.6494, + "step": 21583 + }, + { + "epoch": 0.12836616233704443, + "grad_norm": 1.5356489419937134, + "learning_rate": 4.79947090154214e-05, + "loss": 4.5596, + "step": 21584 + }, + { + "epoch": 0.12837210962032544, + "grad_norm": 1.5046836137771606, + "learning_rate": 4.7994525715031114e-05, + "loss": 4.6486, + "step": 21585 + }, + { + "epoch": 0.12837805690360643, + "grad_norm": 1.413750171661377, + "learning_rate": 4.799434240661367e-05, + "loss": 4.8878, + "step": 21586 + }, + { + "epoch": 0.12838400418688742, + "grad_norm": 1.3955304622650146, + "learning_rate": 4.799415909016915e-05, + "loss": 5.1577, + "step": 21587 + }, + { + "epoch": 0.12838995147016843, + "grad_norm": 1.5791069269180298, + "learning_rate": 4.79939757656976e-05, + "loss": 5.1712, + "step": 21588 + }, + { + "epoch": 0.12839589875344942, + "grad_norm": 1.3384202718734741, + "learning_rate": 4.799379243319909e-05, + "loss": 5.1534, + "step": 21589 + }, + { + "epoch": 0.1284018460367304, + "grad_norm": 1.4390661716461182, + "learning_rate": 4.7993609092673684e-05, + "loss": 5.3616, + "step": 21590 + }, + { + "epoch": 0.12840779332001143, + "grad_norm": 1.3923462629318237, + "learning_rate": 4.799342574412145e-05, + "loss": 5.2225, + "step": 21591 + }, + { + "epoch": 0.12841374060329241, + "grad_norm": 1.2241096496582031, + "learning_rate": 4.799324238754245e-05, + "loss": 5.2419, + "step": 21592 + }, + { + "epoch": 0.1284196878865734, + "grad_norm": 1.3041672706604004, + "learning_rate": 4.799305902293674e-05, + "loss": 5.0903, + "step": 21593 + }, + { + "epoch": 0.12842563516985442, + "grad_norm": 1.2822580337524414, + "learning_rate": 4.799287565030439e-05, + "loss": 5.1304, + "step": 21594 + }, + { + "epoch": 0.1284315824531354, + "grad_norm": 1.4155261516571045, + "learning_rate": 4.7992692269645475e-05, + "loss": 5.2332, + "step": 21595 + }, + { + "epoch": 0.1284375297364164, + "grad_norm": 1.4972230195999146, + "learning_rate": 4.799250888096004e-05, + "loss": 5.0588, + "step": 21596 + }, + { + "epoch": 0.1284434770196974, + "grad_norm": 1.3301728963851929, + "learning_rate": 4.799232548424816e-05, + "loss": 5.0401, + "step": 21597 + }, + { + "epoch": 0.1284494243029784, + "grad_norm": 1.2775028944015503, + "learning_rate": 4.799214207950989e-05, + "loss": 4.877, + "step": 21598 + }, + { + "epoch": 0.1284553715862594, + "grad_norm": 1.1996419429779053, + "learning_rate": 4.799195866674532e-05, + "loss": 4.9223, + "step": 21599 + }, + { + "epoch": 0.1284613188695404, + "grad_norm": 1.1330626010894775, + "learning_rate": 4.7991775245954477e-05, + "loss": 4.9224, + "step": 21600 + }, + { + "epoch": 0.1284672661528214, + "grad_norm": 1.3013830184936523, + "learning_rate": 4.7991591817137446e-05, + "loss": 5.1005, + "step": 21601 + }, + { + "epoch": 0.12847321343610238, + "grad_norm": 1.2901992797851562, + "learning_rate": 4.79914083802943e-05, + "loss": 4.9554, + "step": 21602 + }, + { + "epoch": 0.1284791607193834, + "grad_norm": 1.4342957735061646, + "learning_rate": 4.799122493542507e-05, + "loss": 4.9685, + "step": 21603 + }, + { + "epoch": 0.12848510800266438, + "grad_norm": 1.2227423191070557, + "learning_rate": 4.7991041482529856e-05, + "loss": 4.9219, + "step": 21604 + }, + { + "epoch": 0.12849105528594537, + "grad_norm": 1.2947163581848145, + "learning_rate": 4.7990858021608705e-05, + "loss": 4.9747, + "step": 21605 + }, + { + "epoch": 0.1284970025692264, + "grad_norm": 1.2928695678710938, + "learning_rate": 4.799067455266168e-05, + "loss": 5.0456, + "step": 21606 + }, + { + "epoch": 0.12850294985250738, + "grad_norm": 1.461930513381958, + "learning_rate": 4.799049107568885e-05, + "loss": 4.8518, + "step": 21607 + }, + { + "epoch": 0.12850889713578836, + "grad_norm": 1.4009983539581299, + "learning_rate": 4.799030759069028e-05, + "loss": 4.8761, + "step": 21608 + }, + { + "epoch": 0.12851484441906938, + "grad_norm": 1.2762218713760376, + "learning_rate": 4.799012409766602e-05, + "loss": 4.8551, + "step": 21609 + }, + { + "epoch": 0.12852079170235037, + "grad_norm": 1.3359547853469849, + "learning_rate": 4.7989940596616156e-05, + "loss": 4.7933, + "step": 21610 + }, + { + "epoch": 0.12852673898563136, + "grad_norm": 1.4515223503112793, + "learning_rate": 4.7989757087540735e-05, + "loss": 4.8432, + "step": 21611 + }, + { + "epoch": 0.12853268626891237, + "grad_norm": 1.445410966873169, + "learning_rate": 4.7989573570439825e-05, + "loss": 5.0115, + "step": 21612 + }, + { + "epoch": 0.12853863355219336, + "grad_norm": 1.4424355030059814, + "learning_rate": 4.79893900453135e-05, + "loss": 4.9, + "step": 21613 + }, + { + "epoch": 0.12854458083547435, + "grad_norm": 1.2938885688781738, + "learning_rate": 4.798920651216182e-05, + "loss": 4.7918, + "step": 21614 + }, + { + "epoch": 0.12855052811875536, + "grad_norm": 1.3097805976867676, + "learning_rate": 4.798902297098484e-05, + "loss": 4.7449, + "step": 21615 + }, + { + "epoch": 0.12855647540203635, + "grad_norm": 1.5416840314865112, + "learning_rate": 4.798883942178263e-05, + "loss": 5.3092, + "step": 21616 + }, + { + "epoch": 0.12856242268531734, + "grad_norm": 1.339882493019104, + "learning_rate": 4.798865586455525e-05, + "loss": 5.2832, + "step": 21617 + }, + { + "epoch": 0.12856836996859836, + "grad_norm": 1.2793277502059937, + "learning_rate": 4.7988472299302764e-05, + "loss": 4.9532, + "step": 21618 + }, + { + "epoch": 0.12857431725187934, + "grad_norm": 1.3368133306503296, + "learning_rate": 4.7988288726025254e-05, + "loss": 5.0795, + "step": 21619 + }, + { + "epoch": 0.12858026453516033, + "grad_norm": 1.4083633422851562, + "learning_rate": 4.7988105144722764e-05, + "loss": 5.3231, + "step": 21620 + }, + { + "epoch": 0.12858621181844135, + "grad_norm": 1.4018146991729736, + "learning_rate": 4.7987921555395356e-05, + "loss": 5.0031, + "step": 21621 + }, + { + "epoch": 0.12859215910172234, + "grad_norm": 1.2982511520385742, + "learning_rate": 4.798773795804311e-05, + "loss": 4.9553, + "step": 21622 + }, + { + "epoch": 0.12859810638500332, + "grad_norm": 1.2939512729644775, + "learning_rate": 4.798755435266607e-05, + "loss": 4.9096, + "step": 21623 + }, + { + "epoch": 0.12860405366828434, + "grad_norm": 1.2920591831207275, + "learning_rate": 4.7987370739264334e-05, + "loss": 4.8198, + "step": 21624 + }, + { + "epoch": 0.12861000095156533, + "grad_norm": 1.537635326385498, + "learning_rate": 4.798718711783793e-05, + "loss": 4.9656, + "step": 21625 + }, + { + "epoch": 0.12861594823484632, + "grad_norm": 1.4374878406524658, + "learning_rate": 4.798700348838694e-05, + "loss": 5.022, + "step": 21626 + }, + { + "epoch": 0.12862189551812733, + "grad_norm": 1.4768397808074951, + "learning_rate": 4.798681985091142e-05, + "loss": 5.1965, + "step": 21627 + }, + { + "epoch": 0.12862784280140832, + "grad_norm": 1.370009183883667, + "learning_rate": 4.798663620541145e-05, + "loss": 5.049, + "step": 21628 + }, + { + "epoch": 0.1286337900846893, + "grad_norm": 1.309531569480896, + "learning_rate": 4.7986452551887076e-05, + "loss": 4.9583, + "step": 21629 + }, + { + "epoch": 0.12863973736797032, + "grad_norm": 1.3303570747375488, + "learning_rate": 4.7986268890338365e-05, + "loss": 5.0708, + "step": 21630 + }, + { + "epoch": 0.1286456846512513, + "grad_norm": 1.389640212059021, + "learning_rate": 4.7986085220765385e-05, + "loss": 5.0744, + "step": 21631 + }, + { + "epoch": 0.1286516319345323, + "grad_norm": 1.198508620262146, + "learning_rate": 4.798590154316821e-05, + "loss": 5.0152, + "step": 21632 + }, + { + "epoch": 0.1286575792178133, + "grad_norm": 1.3534667491912842, + "learning_rate": 4.7985717857546886e-05, + "loss": 5.0292, + "step": 21633 + }, + { + "epoch": 0.1286635265010943, + "grad_norm": 1.4618093967437744, + "learning_rate": 4.798553416390149e-05, + "loss": 5.0733, + "step": 21634 + }, + { + "epoch": 0.1286694737843753, + "grad_norm": 1.4006026983261108, + "learning_rate": 4.798535046223207e-05, + "loss": 5.0071, + "step": 21635 + }, + { + "epoch": 0.12867542106765628, + "grad_norm": 1.4667402505874634, + "learning_rate": 4.7985166752538714e-05, + "loss": 4.8829, + "step": 21636 + }, + { + "epoch": 0.1286813683509373, + "grad_norm": 1.2916743755340576, + "learning_rate": 4.798498303482147e-05, + "loss": 4.9049, + "step": 21637 + }, + { + "epoch": 0.12868731563421829, + "grad_norm": 1.400270700454712, + "learning_rate": 4.798479930908041e-05, + "loss": 5.1051, + "step": 21638 + }, + { + "epoch": 0.12869326291749927, + "grad_norm": 1.3317632675170898, + "learning_rate": 4.798461557531558e-05, + "loss": 4.7864, + "step": 21639 + }, + { + "epoch": 0.1286992102007803, + "grad_norm": 1.1226558685302734, + "learning_rate": 4.7984431833527074e-05, + "loss": 4.8598, + "step": 21640 + }, + { + "epoch": 0.12870515748406128, + "grad_norm": 1.2921690940856934, + "learning_rate": 4.7984248083714934e-05, + "loss": 4.8687, + "step": 21641 + }, + { + "epoch": 0.12871110476734227, + "grad_norm": 1.2811640501022339, + "learning_rate": 4.798406432587923e-05, + "loss": 4.7438, + "step": 21642 + }, + { + "epoch": 0.12871705205062328, + "grad_norm": 1.1892732381820679, + "learning_rate": 4.7983880560020026e-05, + "loss": 4.681, + "step": 21643 + }, + { + "epoch": 0.12872299933390427, + "grad_norm": 1.3800525665283203, + "learning_rate": 4.7983696786137386e-05, + "loss": 4.9215, + "step": 21644 + }, + { + "epoch": 0.12872894661718526, + "grad_norm": 1.2753770351409912, + "learning_rate": 4.7983513004231385e-05, + "loss": 5.0006, + "step": 21645 + }, + { + "epoch": 0.12873489390046627, + "grad_norm": 1.494894027709961, + "learning_rate": 4.7983329214302064e-05, + "loss": 4.9356, + "step": 21646 + }, + { + "epoch": 0.12874084118374726, + "grad_norm": 1.3660098314285278, + "learning_rate": 4.7983145416349505e-05, + "loss": 5.3071, + "step": 21647 + }, + { + "epoch": 0.12874678846702825, + "grad_norm": 1.3494385480880737, + "learning_rate": 4.798296161037377e-05, + "loss": 5.3493, + "step": 21648 + }, + { + "epoch": 0.12875273575030927, + "grad_norm": 1.2632153034210205, + "learning_rate": 4.798277779637492e-05, + "loss": 4.9825, + "step": 21649 + }, + { + "epoch": 0.12875868303359025, + "grad_norm": 1.3519765138626099, + "learning_rate": 4.7982593974353015e-05, + "loss": 4.9032, + "step": 21650 + }, + { + "epoch": 0.12876463031687124, + "grad_norm": 1.3728691339492798, + "learning_rate": 4.798241014430813e-05, + "loss": 5.0458, + "step": 21651 + }, + { + "epoch": 0.12877057760015226, + "grad_norm": 1.326675295829773, + "learning_rate": 4.798222630624032e-05, + "loss": 4.9129, + "step": 21652 + }, + { + "epoch": 0.12877652488343325, + "grad_norm": 1.4878405332565308, + "learning_rate": 4.798204246014965e-05, + "loss": 5.1253, + "step": 21653 + }, + { + "epoch": 0.12878247216671423, + "grad_norm": 1.322288990020752, + "learning_rate": 4.798185860603619e-05, + "loss": 5.1333, + "step": 21654 + }, + { + "epoch": 0.12878841944999525, + "grad_norm": 1.496812343597412, + "learning_rate": 4.7981674743899995e-05, + "loss": 5.0263, + "step": 21655 + }, + { + "epoch": 0.12879436673327624, + "grad_norm": 1.4336779117584229, + "learning_rate": 4.7981490873741144e-05, + "loss": 5.1177, + "step": 21656 + }, + { + "epoch": 0.12880031401655723, + "grad_norm": 1.380751132965088, + "learning_rate": 4.7981306995559684e-05, + "loss": 5.0884, + "step": 21657 + }, + { + "epoch": 0.12880626129983824, + "grad_norm": 1.3929660320281982, + "learning_rate": 4.798112310935569e-05, + "loss": 5.3662, + "step": 21658 + }, + { + "epoch": 0.12881220858311923, + "grad_norm": 1.2857346534729004, + "learning_rate": 4.798093921512923e-05, + "loss": 5.2264, + "step": 21659 + }, + { + "epoch": 0.12881815586640022, + "grad_norm": 1.2468816041946411, + "learning_rate": 4.798075531288035e-05, + "loss": 4.8248, + "step": 21660 + }, + { + "epoch": 0.12882410314968123, + "grad_norm": 1.43264901638031, + "learning_rate": 4.798057140260913e-05, + "loss": 5.3999, + "step": 21661 + }, + { + "epoch": 0.12883005043296222, + "grad_norm": 1.3590344190597534, + "learning_rate": 4.798038748431563e-05, + "loss": 5.1312, + "step": 21662 + }, + { + "epoch": 0.1288359977162432, + "grad_norm": 1.4812084436416626, + "learning_rate": 4.7980203557999915e-05, + "loss": 4.7615, + "step": 21663 + }, + { + "epoch": 0.12884194499952423, + "grad_norm": 1.4256600141525269, + "learning_rate": 4.798001962366205e-05, + "loss": 4.8678, + "step": 21664 + }, + { + "epoch": 0.12884789228280522, + "grad_norm": 1.1849418878555298, + "learning_rate": 4.7979835681302095e-05, + "loss": 4.8823, + "step": 21665 + }, + { + "epoch": 0.1288538395660862, + "grad_norm": 1.395228385925293, + "learning_rate": 4.7979651730920116e-05, + "loss": 4.682, + "step": 21666 + }, + { + "epoch": 0.12885978684936722, + "grad_norm": 1.2800064086914062, + "learning_rate": 4.7979467772516186e-05, + "loss": 4.7797, + "step": 21667 + }, + { + "epoch": 0.1288657341326482, + "grad_norm": 1.3429536819458008, + "learning_rate": 4.7979283806090346e-05, + "loss": 4.7517, + "step": 21668 + }, + { + "epoch": 0.1288716814159292, + "grad_norm": 1.359732747077942, + "learning_rate": 4.797909983164269e-05, + "loss": 4.7123, + "step": 21669 + }, + { + "epoch": 0.1288776286992102, + "grad_norm": 1.2731539011001587, + "learning_rate": 4.7978915849173254e-05, + "loss": 4.7211, + "step": 21670 + }, + { + "epoch": 0.1288835759824912, + "grad_norm": 1.3688287734985352, + "learning_rate": 4.797873185868213e-05, + "loss": 4.7257, + "step": 21671 + }, + { + "epoch": 0.1288895232657722, + "grad_norm": 1.4043165445327759, + "learning_rate": 4.797854786016936e-05, + "loss": 4.8099, + "step": 21672 + }, + { + "epoch": 0.1288954705490532, + "grad_norm": 1.3721412420272827, + "learning_rate": 4.797836385363502e-05, + "loss": 4.7698, + "step": 21673 + }, + { + "epoch": 0.1289014178323342, + "grad_norm": 1.4348787069320679, + "learning_rate": 4.797817983907917e-05, + "loss": 4.7587, + "step": 21674 + }, + { + "epoch": 0.12890736511561518, + "grad_norm": 1.133793592453003, + "learning_rate": 4.797799581650187e-05, + "loss": 4.8101, + "step": 21675 + }, + { + "epoch": 0.1289133123988962, + "grad_norm": 1.3624104261398315, + "learning_rate": 4.797781178590319e-05, + "loss": 4.7416, + "step": 21676 + }, + { + "epoch": 0.12891925968217718, + "grad_norm": 1.5194214582443237, + "learning_rate": 4.7977627747283196e-05, + "loss": 4.6894, + "step": 21677 + }, + { + "epoch": 0.12892520696545817, + "grad_norm": 1.3625789880752563, + "learning_rate": 4.7977443700641954e-05, + "loss": 4.8029, + "step": 21678 + }, + { + "epoch": 0.1289311542487392, + "grad_norm": 1.2961907386779785, + "learning_rate": 4.797725964597952e-05, + "loss": 4.718, + "step": 21679 + }, + { + "epoch": 0.12893710153202018, + "grad_norm": 1.4091925621032715, + "learning_rate": 4.797707558329596e-05, + "loss": 4.7604, + "step": 21680 + }, + { + "epoch": 0.12894304881530116, + "grad_norm": 1.2274402379989624, + "learning_rate": 4.797689151259134e-05, + "loss": 4.8241, + "step": 21681 + }, + { + "epoch": 0.12894899609858218, + "grad_norm": 1.3694384098052979, + "learning_rate": 4.797670743386573e-05, + "loss": 4.7724, + "step": 21682 + }, + { + "epoch": 0.12895494338186317, + "grad_norm": 1.3621066808700562, + "learning_rate": 4.7976523347119184e-05, + "loss": 4.685, + "step": 21683 + }, + { + "epoch": 0.12896089066514416, + "grad_norm": 1.418641209602356, + "learning_rate": 4.7976339252351766e-05, + "loss": 4.7379, + "step": 21684 + }, + { + "epoch": 0.12896683794842517, + "grad_norm": 1.3113913536071777, + "learning_rate": 4.797615514956355e-05, + "loss": 4.7922, + "step": 21685 + }, + { + "epoch": 0.12897278523170616, + "grad_norm": 1.3266078233718872, + "learning_rate": 4.79759710387546e-05, + "loss": 4.7116, + "step": 21686 + }, + { + "epoch": 0.12897873251498715, + "grad_norm": 1.5212455987930298, + "learning_rate": 4.7975786919924975e-05, + "loss": 4.8422, + "step": 21687 + }, + { + "epoch": 0.12898467979826816, + "grad_norm": 1.225883960723877, + "learning_rate": 4.797560279307473e-05, + "loss": 4.8641, + "step": 21688 + }, + { + "epoch": 0.12899062708154915, + "grad_norm": 1.451951026916504, + "learning_rate": 4.797541865820395e-05, + "loss": 4.7685, + "step": 21689 + }, + { + "epoch": 0.12899657436483014, + "grad_norm": 1.3755689859390259, + "learning_rate": 4.7975234515312694e-05, + "loss": 4.7828, + "step": 21690 + }, + { + "epoch": 0.12900252164811113, + "grad_norm": 1.2667524814605713, + "learning_rate": 4.797505036440101e-05, + "loss": 4.6897, + "step": 21691 + }, + { + "epoch": 0.12900846893139215, + "grad_norm": 1.4491240978240967, + "learning_rate": 4.797486620546898e-05, + "loss": 4.8052, + "step": 21692 + }, + { + "epoch": 0.12901441621467313, + "grad_norm": 1.21664559841156, + "learning_rate": 4.797468203851665e-05, + "loss": 4.712, + "step": 21693 + }, + { + "epoch": 0.12902036349795412, + "grad_norm": 1.3836992979049683, + "learning_rate": 4.797449786354411e-05, + "loss": 4.6642, + "step": 21694 + }, + { + "epoch": 0.12902631078123514, + "grad_norm": 1.4487723112106323, + "learning_rate": 4.79743136805514e-05, + "loss": 4.7088, + "step": 21695 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 1.2507479190826416, + "learning_rate": 4.79741294895386e-05, + "loss": 4.8429, + "step": 21696 + }, + { + "epoch": 0.1290382053477971, + "grad_norm": 1.231549620628357, + "learning_rate": 4.7973945290505766e-05, + "loss": 4.9336, + "step": 21697 + }, + { + "epoch": 0.12904415263107813, + "grad_norm": 1.2317709922790527, + "learning_rate": 4.797376108345297e-05, + "loss": 4.6885, + "step": 21698 + }, + { + "epoch": 0.12905009991435912, + "grad_norm": 1.2158896923065186, + "learning_rate": 4.797357686838026e-05, + "loss": 4.7068, + "step": 21699 + }, + { + "epoch": 0.1290560471976401, + "grad_norm": 1.5054548978805542, + "learning_rate": 4.7973392645287726e-05, + "loss": 4.8568, + "step": 21700 + }, + { + "epoch": 0.12906199448092112, + "grad_norm": 1.1551764011383057, + "learning_rate": 4.7973208414175406e-05, + "loss": 4.6746, + "step": 21701 + }, + { + "epoch": 0.1290679417642021, + "grad_norm": 1.3304046392440796, + "learning_rate": 4.7973024175043386e-05, + "loss": 4.8012, + "step": 21702 + }, + { + "epoch": 0.1290738890474831, + "grad_norm": 1.4763063192367554, + "learning_rate": 4.797283992789172e-05, + "loss": 4.7412, + "step": 21703 + }, + { + "epoch": 0.12907983633076411, + "grad_norm": 1.247894287109375, + "learning_rate": 4.797265567272047e-05, + "loss": 4.7786, + "step": 21704 + }, + { + "epoch": 0.1290857836140451, + "grad_norm": 1.3668314218521118, + "learning_rate": 4.79724714095297e-05, + "loss": 4.7728, + "step": 21705 + }, + { + "epoch": 0.1290917308973261, + "grad_norm": 1.3727326393127441, + "learning_rate": 4.7972287138319477e-05, + "loss": 4.8493, + "step": 21706 + }, + { + "epoch": 0.1290976781806071, + "grad_norm": 1.3531663417816162, + "learning_rate": 4.797210285908987e-05, + "loss": 4.7598, + "step": 21707 + }, + { + "epoch": 0.1291036254638881, + "grad_norm": 1.4899832010269165, + "learning_rate": 4.797191857184094e-05, + "loss": 4.7274, + "step": 21708 + }, + { + "epoch": 0.12910957274716908, + "grad_norm": 1.3908995389938354, + "learning_rate": 4.7971734276572744e-05, + "loss": 4.9911, + "step": 21709 + }, + { + "epoch": 0.1291155200304501, + "grad_norm": 1.621774435043335, + "learning_rate": 4.7971549973285357e-05, + "loss": 5.0285, + "step": 21710 + }, + { + "epoch": 0.1291214673137311, + "grad_norm": 1.415650725364685, + "learning_rate": 4.797136566197884e-05, + "loss": 5.0959, + "step": 21711 + }, + { + "epoch": 0.12912741459701207, + "grad_norm": 1.4947463274002075, + "learning_rate": 4.797118134265326e-05, + "loss": 4.9473, + "step": 21712 + }, + { + "epoch": 0.1291333618802931, + "grad_norm": 1.45017409324646, + "learning_rate": 4.7970997015308674e-05, + "loss": 4.9928, + "step": 21713 + }, + { + "epoch": 0.12913930916357408, + "grad_norm": 1.2527333498001099, + "learning_rate": 4.7970812679945145e-05, + "loss": 4.9771, + "step": 21714 + }, + { + "epoch": 0.12914525644685507, + "grad_norm": 1.457526683807373, + "learning_rate": 4.797062833656275e-05, + "loss": 5.0143, + "step": 21715 + }, + { + "epoch": 0.12915120373013608, + "grad_norm": 1.1785821914672852, + "learning_rate": 4.7970443985161546e-05, + "loss": 5.1313, + "step": 21716 + }, + { + "epoch": 0.12915715101341707, + "grad_norm": 1.3593906164169312, + "learning_rate": 4.79702596257416e-05, + "loss": 5.0906, + "step": 21717 + }, + { + "epoch": 0.12916309829669806, + "grad_norm": 1.3789628744125366, + "learning_rate": 4.797007525830296e-05, + "loss": 5.0352, + "step": 21718 + }, + { + "epoch": 0.12916904557997907, + "grad_norm": 1.282631278038025, + "learning_rate": 4.796989088284571e-05, + "loss": 4.9973, + "step": 21719 + }, + { + "epoch": 0.12917499286326006, + "grad_norm": 1.2933098077774048, + "learning_rate": 4.796970649936991e-05, + "loss": 5.0783, + "step": 21720 + }, + { + "epoch": 0.12918094014654105, + "grad_norm": 1.3595205545425415, + "learning_rate": 4.796952210787563e-05, + "loss": 5.158, + "step": 21721 + }, + { + "epoch": 0.12918688742982207, + "grad_norm": 1.3962088823318481, + "learning_rate": 4.796933770836293e-05, + "loss": 4.9939, + "step": 21722 + }, + { + "epoch": 0.12919283471310306, + "grad_norm": 1.382554292678833, + "learning_rate": 4.796915330083186e-05, + "loss": 4.8864, + "step": 21723 + }, + { + "epoch": 0.12919878199638404, + "grad_norm": 1.3807674646377563, + "learning_rate": 4.7968968885282495e-05, + "loss": 5.0454, + "step": 21724 + }, + { + "epoch": 0.12920472927966506, + "grad_norm": 1.276809811592102, + "learning_rate": 4.7968784461714905e-05, + "loss": 5.1221, + "step": 21725 + }, + { + "epoch": 0.12921067656294605, + "grad_norm": 1.230714201927185, + "learning_rate": 4.796860003012915e-05, + "loss": 5.0382, + "step": 21726 + }, + { + "epoch": 0.12921662384622704, + "grad_norm": 1.2899225950241089, + "learning_rate": 4.796841559052529e-05, + "loss": 4.8591, + "step": 21727 + }, + { + "epoch": 0.12922257112950805, + "grad_norm": 1.3561869859695435, + "learning_rate": 4.79682311429034e-05, + "loss": 4.8811, + "step": 21728 + }, + { + "epoch": 0.12922851841278904, + "grad_norm": 1.600656270980835, + "learning_rate": 4.796804668726353e-05, + "loss": 4.9317, + "step": 21729 + }, + { + "epoch": 0.12923446569607003, + "grad_norm": 1.4110677242279053, + "learning_rate": 4.7967862223605756e-05, + "loss": 5.0964, + "step": 21730 + }, + { + "epoch": 0.12924041297935104, + "grad_norm": 1.2293707132339478, + "learning_rate": 4.796767775193014e-05, + "loss": 5.2952, + "step": 21731 + }, + { + "epoch": 0.12924636026263203, + "grad_norm": 1.4413278102874756, + "learning_rate": 4.796749327223674e-05, + "loss": 4.9628, + "step": 21732 + }, + { + "epoch": 0.12925230754591302, + "grad_norm": 1.4178003072738647, + "learning_rate": 4.7967308784525635e-05, + "loss": 4.7142, + "step": 21733 + }, + { + "epoch": 0.12925825482919404, + "grad_norm": 1.2427667379379272, + "learning_rate": 4.7967124288796875e-05, + "loss": 5.2655, + "step": 21734 + }, + { + "epoch": 0.12926420211247502, + "grad_norm": 1.3278542757034302, + "learning_rate": 4.796693978505052e-05, + "loss": 5.0156, + "step": 21735 + }, + { + "epoch": 0.129270149395756, + "grad_norm": 1.3728119134902954, + "learning_rate": 4.7966755273286656e-05, + "loss": 5.4176, + "step": 21736 + }, + { + "epoch": 0.12927609667903703, + "grad_norm": 1.344072937965393, + "learning_rate": 4.796657075350533e-05, + "loss": 4.8808, + "step": 21737 + }, + { + "epoch": 0.12928204396231802, + "grad_norm": 1.2877874374389648, + "learning_rate": 4.796638622570661e-05, + "loss": 5.0312, + "step": 21738 + }, + { + "epoch": 0.129287991245599, + "grad_norm": 1.3147602081298828, + "learning_rate": 4.7966201689890566e-05, + "loss": 5.0241, + "step": 21739 + }, + { + "epoch": 0.12929393852888002, + "grad_norm": 1.3858917951583862, + "learning_rate": 4.796601714605726e-05, + "loss": 4.468, + "step": 21740 + }, + { + "epoch": 0.129299885812161, + "grad_norm": 1.4089725017547607, + "learning_rate": 4.7965832594206747e-05, + "loss": 4.587, + "step": 21741 + }, + { + "epoch": 0.129305833095442, + "grad_norm": 1.4754424095153809, + "learning_rate": 4.796564803433911e-05, + "loss": 4.8697, + "step": 21742 + }, + { + "epoch": 0.129311780378723, + "grad_norm": 1.557544231414795, + "learning_rate": 4.796546346645439e-05, + "loss": 5.058, + "step": 21743 + }, + { + "epoch": 0.129317727662004, + "grad_norm": 1.3962191343307495, + "learning_rate": 4.7965278890552666e-05, + "loss": 5.172, + "step": 21744 + }, + { + "epoch": 0.129323674945285, + "grad_norm": 1.4976222515106201, + "learning_rate": 4.796509430663401e-05, + "loss": 5.2878, + "step": 21745 + }, + { + "epoch": 0.129329622228566, + "grad_norm": 1.3315789699554443, + "learning_rate": 4.796490971469847e-05, + "loss": 5.0468, + "step": 21746 + }, + { + "epoch": 0.129335569511847, + "grad_norm": 1.3718360662460327, + "learning_rate": 4.796472511474611e-05, + "loss": 4.9696, + "step": 21747 + }, + { + "epoch": 0.12934151679512798, + "grad_norm": 1.4873707294464111, + "learning_rate": 4.7964540506777014e-05, + "loss": 4.9281, + "step": 21748 + }, + { + "epoch": 0.12934746407840897, + "grad_norm": 1.3806785345077515, + "learning_rate": 4.7964355890791226e-05, + "loss": 5.1646, + "step": 21749 + }, + { + "epoch": 0.12935341136168998, + "grad_norm": 1.4873976707458496, + "learning_rate": 4.796417126678883e-05, + "loss": 5.1125, + "step": 21750 + }, + { + "epoch": 0.12935935864497097, + "grad_norm": 1.3314671516418457, + "learning_rate": 4.7963986634769864e-05, + "loss": 5.0819, + "step": 21751 + }, + { + "epoch": 0.12936530592825196, + "grad_norm": 1.2392772436141968, + "learning_rate": 4.796380199473442e-05, + "loss": 5.0049, + "step": 21752 + }, + { + "epoch": 0.12937125321153298, + "grad_norm": 1.4799960851669312, + "learning_rate": 4.7963617346682544e-05, + "loss": 4.8518, + "step": 21753 + }, + { + "epoch": 0.12937720049481397, + "grad_norm": 1.5646624565124512, + "learning_rate": 4.796343269061431e-05, + "loss": 4.5612, + "step": 21754 + }, + { + "epoch": 0.12938314777809495, + "grad_norm": 1.5001260042190552, + "learning_rate": 4.796324802652977e-05, + "loss": 4.8736, + "step": 21755 + }, + { + "epoch": 0.12938909506137597, + "grad_norm": 1.4235304594039917, + "learning_rate": 4.7963063354429004e-05, + "loss": 4.9256, + "step": 21756 + }, + { + "epoch": 0.12939504234465696, + "grad_norm": 1.3335869312286377, + "learning_rate": 4.7962878674312075e-05, + "loss": 4.7066, + "step": 21757 + }, + { + "epoch": 0.12940098962793795, + "grad_norm": 1.2664694786071777, + "learning_rate": 4.7962693986179036e-05, + "loss": 4.7202, + "step": 21758 + }, + { + "epoch": 0.12940693691121896, + "grad_norm": 1.2120671272277832, + "learning_rate": 4.7962509290029954e-05, + "loss": 4.8417, + "step": 21759 + }, + { + "epoch": 0.12941288419449995, + "grad_norm": 1.3657382726669312, + "learning_rate": 4.7962324585864906e-05, + "loss": 4.6566, + "step": 21760 + }, + { + "epoch": 0.12941883147778094, + "grad_norm": 1.3212461471557617, + "learning_rate": 4.7962139873683944e-05, + "loss": 4.8251, + "step": 21761 + }, + { + "epoch": 0.12942477876106195, + "grad_norm": 1.9045685529708862, + "learning_rate": 4.7961955153487137e-05, + "loss": 4.5268, + "step": 21762 + }, + { + "epoch": 0.12943072604434294, + "grad_norm": 1.536188006401062, + "learning_rate": 4.7961770425274545e-05, + "loss": 4.8356, + "step": 21763 + }, + { + "epoch": 0.12943667332762393, + "grad_norm": 1.4966436624526978, + "learning_rate": 4.796158568904624e-05, + "loss": 4.485, + "step": 21764 + }, + { + "epoch": 0.12944262061090495, + "grad_norm": 1.377543568611145, + "learning_rate": 4.796140094480228e-05, + "loss": 4.7828, + "step": 21765 + }, + { + "epoch": 0.12944856789418593, + "grad_norm": 1.6093590259552002, + "learning_rate": 4.796121619254273e-05, + "loss": 4.6621, + "step": 21766 + }, + { + "epoch": 0.12945451517746692, + "grad_norm": 1.4633464813232422, + "learning_rate": 4.796103143226767e-05, + "loss": 4.7979, + "step": 21767 + }, + { + "epoch": 0.12946046246074794, + "grad_norm": 1.332219123840332, + "learning_rate": 4.7960846663977136e-05, + "loss": 4.8313, + "step": 21768 + }, + { + "epoch": 0.12946640974402893, + "grad_norm": 1.2190324068069458, + "learning_rate": 4.796066188767121e-05, + "loss": 4.6559, + "step": 21769 + }, + { + "epoch": 0.12947235702730991, + "grad_norm": 1.4958453178405762, + "learning_rate": 4.796047710334996e-05, + "loss": 4.7633, + "step": 21770 + }, + { + "epoch": 0.12947830431059093, + "grad_norm": 1.2693027257919312, + "learning_rate": 4.796029231101344e-05, + "loss": 4.7291, + "step": 21771 + }, + { + "epoch": 0.12948425159387192, + "grad_norm": 1.2988125085830688, + "learning_rate": 4.7960107510661725e-05, + "loss": 4.7817, + "step": 21772 + }, + { + "epoch": 0.1294901988771529, + "grad_norm": 1.355332374572754, + "learning_rate": 4.7959922702294866e-05, + "loss": 4.6112, + "step": 21773 + }, + { + "epoch": 0.12949614616043392, + "grad_norm": 1.3531986474990845, + "learning_rate": 4.7959737885912934e-05, + "loss": 4.7711, + "step": 21774 + }, + { + "epoch": 0.1295020934437149, + "grad_norm": 1.275888204574585, + "learning_rate": 4.7959553061516004e-05, + "loss": 4.9089, + "step": 21775 + }, + { + "epoch": 0.1295080407269959, + "grad_norm": 1.4016762971878052, + "learning_rate": 4.795936822910413e-05, + "loss": 4.8768, + "step": 21776 + }, + { + "epoch": 0.12951398801027691, + "grad_norm": 1.5274311304092407, + "learning_rate": 4.795918338867737e-05, + "loss": 4.7434, + "step": 21777 + }, + { + "epoch": 0.1295199352935579, + "grad_norm": 1.4976401329040527, + "learning_rate": 4.79589985402358e-05, + "loss": 4.992, + "step": 21778 + }, + { + "epoch": 0.1295258825768389, + "grad_norm": 1.5180116891860962, + "learning_rate": 4.795881368377948e-05, + "loss": 5.1312, + "step": 21779 + }, + { + "epoch": 0.1295318298601199, + "grad_norm": 1.3271901607513428, + "learning_rate": 4.795862881930848e-05, + "loss": 5.1021, + "step": 21780 + }, + { + "epoch": 0.1295377771434009, + "grad_norm": 1.5069388151168823, + "learning_rate": 4.795844394682286e-05, + "loss": 4.8872, + "step": 21781 + }, + { + "epoch": 0.12954372442668188, + "grad_norm": 1.4247567653656006, + "learning_rate": 4.795825906632267e-05, + "loss": 5.0028, + "step": 21782 + }, + { + "epoch": 0.1295496717099629, + "grad_norm": 1.4976978302001953, + "learning_rate": 4.795807417780801e-05, + "loss": 5.0181, + "step": 21783 + }, + { + "epoch": 0.1295556189932439, + "grad_norm": 1.291518211364746, + "learning_rate": 4.7957889281278913e-05, + "loss": 4.8314, + "step": 21784 + }, + { + "epoch": 0.12956156627652488, + "grad_norm": 1.352803349494934, + "learning_rate": 4.7957704376735455e-05, + "loss": 4.916, + "step": 21785 + }, + { + "epoch": 0.1295675135598059, + "grad_norm": 1.3911688327789307, + "learning_rate": 4.7957519464177695e-05, + "loss": 5.1256, + "step": 21786 + }, + { + "epoch": 0.12957346084308688, + "grad_norm": 1.2493035793304443, + "learning_rate": 4.795733454360571e-05, + "loss": 4.8268, + "step": 21787 + }, + { + "epoch": 0.12957940812636787, + "grad_norm": 1.4249591827392578, + "learning_rate": 4.7957149615019547e-05, + "loss": 4.8414, + "step": 21788 + }, + { + "epoch": 0.12958535540964888, + "grad_norm": 1.5388774871826172, + "learning_rate": 4.795696467841929e-05, + "loss": 4.6288, + "step": 21789 + }, + { + "epoch": 0.12959130269292987, + "grad_norm": 1.1780091524124146, + "learning_rate": 4.795677973380499e-05, + "loss": 4.5712, + "step": 21790 + }, + { + "epoch": 0.12959724997621086, + "grad_norm": 1.2415392398834229, + "learning_rate": 4.7956594781176716e-05, + "loss": 4.8536, + "step": 21791 + }, + { + "epoch": 0.12960319725949188, + "grad_norm": 1.2828611135482788, + "learning_rate": 4.795640982053453e-05, + "loss": 5.1549, + "step": 21792 + }, + { + "epoch": 0.12960914454277286, + "grad_norm": 1.5143916606903076, + "learning_rate": 4.79562248518785e-05, + "loss": 5.2302, + "step": 21793 + }, + { + "epoch": 0.12961509182605385, + "grad_norm": 1.3260207176208496, + "learning_rate": 4.795603987520869e-05, + "loss": 4.9272, + "step": 21794 + }, + { + "epoch": 0.12962103910933487, + "grad_norm": 1.2133897542953491, + "learning_rate": 4.795585489052516e-05, + "loss": 4.8229, + "step": 21795 + }, + { + "epoch": 0.12962698639261586, + "grad_norm": 1.5181169509887695, + "learning_rate": 4.795566989782798e-05, + "loss": 4.8024, + "step": 21796 + }, + { + "epoch": 0.12963293367589684, + "grad_norm": 1.3889726400375366, + "learning_rate": 4.795548489711722e-05, + "loss": 4.5859, + "step": 21797 + }, + { + "epoch": 0.12963888095917786, + "grad_norm": 1.543861985206604, + "learning_rate": 4.7955299888392924e-05, + "loss": 4.7782, + "step": 21798 + }, + { + "epoch": 0.12964482824245885, + "grad_norm": 1.4648151397705078, + "learning_rate": 4.795511487165518e-05, + "loss": 4.9949, + "step": 21799 + }, + { + "epoch": 0.12965077552573984, + "grad_norm": 1.2487531900405884, + "learning_rate": 4.795492984690404e-05, + "loss": 5.0329, + "step": 21800 + }, + { + "epoch": 0.12965672280902085, + "grad_norm": 1.503164529800415, + "learning_rate": 4.795474481413957e-05, + "loss": 4.7723, + "step": 21801 + }, + { + "epoch": 0.12966267009230184, + "grad_norm": 1.3406294584274292, + "learning_rate": 4.795455977336184e-05, + "loss": 4.9541, + "step": 21802 + }, + { + "epoch": 0.12966861737558283, + "grad_norm": 1.4314171075820923, + "learning_rate": 4.795437472457091e-05, + "loss": 5.018, + "step": 21803 + }, + { + "epoch": 0.12967456465886384, + "grad_norm": 1.3255850076675415, + "learning_rate": 4.795418966776683e-05, + "loss": 4.7675, + "step": 21804 + }, + { + "epoch": 0.12968051194214483, + "grad_norm": 1.6132442951202393, + "learning_rate": 4.7954004602949697e-05, + "loss": 4.8068, + "step": 21805 + }, + { + "epoch": 0.12968645922542582, + "grad_norm": 1.25650954246521, + "learning_rate": 4.7953819530119555e-05, + "loss": 4.8709, + "step": 21806 + }, + { + "epoch": 0.1296924065087068, + "grad_norm": 1.3686168193817139, + "learning_rate": 4.795363444927646e-05, + "loss": 4.8815, + "step": 21807 + }, + { + "epoch": 0.12969835379198782, + "grad_norm": 1.250143051147461, + "learning_rate": 4.79534493604205e-05, + "loss": 4.9077, + "step": 21808 + }, + { + "epoch": 0.1297043010752688, + "grad_norm": 1.421834111213684, + "learning_rate": 4.795326426355173e-05, + "loss": 4.806, + "step": 21809 + }, + { + "epoch": 0.1297102483585498, + "grad_norm": 1.3038170337677002, + "learning_rate": 4.795307915867021e-05, + "loss": 5.0142, + "step": 21810 + }, + { + "epoch": 0.12971619564183082, + "grad_norm": 1.390637993812561, + "learning_rate": 4.7952894045776e-05, + "loss": 4.8802, + "step": 21811 + }, + { + "epoch": 0.1297221429251118, + "grad_norm": 1.3310891389846802, + "learning_rate": 4.7952708924869184e-05, + "loss": 4.7995, + "step": 21812 + }, + { + "epoch": 0.1297280902083928, + "grad_norm": 1.243156909942627, + "learning_rate": 4.79525237959498e-05, + "loss": 4.6147, + "step": 21813 + }, + { + "epoch": 0.1297340374916738, + "grad_norm": 1.522707223892212, + "learning_rate": 4.7952338659017934e-05, + "loss": 4.6666, + "step": 21814 + }, + { + "epoch": 0.1297399847749548, + "grad_norm": 1.3331211805343628, + "learning_rate": 4.795215351407365e-05, + "loss": 4.7236, + "step": 21815 + }, + { + "epoch": 0.12974593205823579, + "grad_norm": 1.3704382181167603, + "learning_rate": 4.7951968361116996e-05, + "loss": 5.299, + "step": 21816 + }, + { + "epoch": 0.1297518793415168, + "grad_norm": 1.4870846271514893, + "learning_rate": 4.7951783200148055e-05, + "loss": 5.2623, + "step": 21817 + }, + { + "epoch": 0.1297578266247978, + "grad_norm": 1.4282408952713013, + "learning_rate": 4.795159803116688e-05, + "loss": 5.075, + "step": 21818 + }, + { + "epoch": 0.12976377390807878, + "grad_norm": 1.408409595489502, + "learning_rate": 4.795141285417354e-05, + "loss": 4.7274, + "step": 21819 + }, + { + "epoch": 0.1297697211913598, + "grad_norm": 1.4432475566864014, + "learning_rate": 4.79512276691681e-05, + "loss": 4.8196, + "step": 21820 + }, + { + "epoch": 0.12977566847464078, + "grad_norm": 1.6136623620986938, + "learning_rate": 4.7951042476150624e-05, + "loss": 4.7634, + "step": 21821 + }, + { + "epoch": 0.12978161575792177, + "grad_norm": 1.13461434841156, + "learning_rate": 4.795085727512117e-05, + "loss": 4.9421, + "step": 21822 + }, + { + "epoch": 0.12978756304120279, + "grad_norm": 1.2107611894607544, + "learning_rate": 4.795067206607981e-05, + "loss": 5.1572, + "step": 21823 + }, + { + "epoch": 0.12979351032448377, + "grad_norm": 1.8843787908554077, + "learning_rate": 4.795048684902661e-05, + "loss": 5.4081, + "step": 21824 + }, + { + "epoch": 0.12979945760776476, + "grad_norm": 1.192597508430481, + "learning_rate": 4.7950301623961633e-05, + "loss": 4.9609, + "step": 21825 + }, + { + "epoch": 0.12980540489104578, + "grad_norm": 1.4349040985107422, + "learning_rate": 4.795011639088495e-05, + "loss": 4.72, + "step": 21826 + }, + { + "epoch": 0.12981135217432677, + "grad_norm": 1.8054217100143433, + "learning_rate": 4.79499311497966e-05, + "loss": 5.5003, + "step": 21827 + }, + { + "epoch": 0.12981729945760775, + "grad_norm": 1.521070122718811, + "learning_rate": 4.794974590069669e-05, + "loss": 5.5325, + "step": 21828 + }, + { + "epoch": 0.12982324674088877, + "grad_norm": 1.936892032623291, + "learning_rate": 4.794956064358524e-05, + "loss": 4.6644, + "step": 21829 + }, + { + "epoch": 0.12982919402416976, + "grad_norm": 1.9401378631591797, + "learning_rate": 4.794937537846234e-05, + "loss": 4.7442, + "step": 21830 + }, + { + "epoch": 0.12983514130745075, + "grad_norm": 1.3924851417541504, + "learning_rate": 4.794919010532806e-05, + "loss": 4.9434, + "step": 21831 + }, + { + "epoch": 0.12984108859073176, + "grad_norm": 1.3180463314056396, + "learning_rate": 4.794900482418244e-05, + "loss": 4.9098, + "step": 21832 + }, + { + "epoch": 0.12984703587401275, + "grad_norm": 1.3872355222702026, + "learning_rate": 4.7948819535025565e-05, + "loss": 4.8212, + "step": 21833 + }, + { + "epoch": 0.12985298315729374, + "grad_norm": 1.2868075370788574, + "learning_rate": 4.79486342378575e-05, + "loss": 4.7609, + "step": 21834 + }, + { + "epoch": 0.12985893044057475, + "grad_norm": 1.4286006689071655, + "learning_rate": 4.79484489326783e-05, + "loss": 4.828, + "step": 21835 + }, + { + "epoch": 0.12986487772385574, + "grad_norm": 1.3485580682754517, + "learning_rate": 4.794826361948804e-05, + "loss": 4.7596, + "step": 21836 + }, + { + "epoch": 0.12987082500713673, + "grad_norm": 1.469319224357605, + "learning_rate": 4.794807829828677e-05, + "loss": 4.8431, + "step": 21837 + }, + { + "epoch": 0.12987677229041775, + "grad_norm": 1.4626957178115845, + "learning_rate": 4.794789296907457e-05, + "loss": 4.7884, + "step": 21838 + }, + { + "epoch": 0.12988271957369873, + "grad_norm": 1.2266536951065063, + "learning_rate": 4.794770763185149e-05, + "loss": 4.8359, + "step": 21839 + }, + { + "epoch": 0.12988866685697972, + "grad_norm": 1.2295827865600586, + "learning_rate": 4.794752228661761e-05, + "loss": 4.6327, + "step": 21840 + }, + { + "epoch": 0.12989461414026074, + "grad_norm": 1.4784702062606812, + "learning_rate": 4.794733693337298e-05, + "loss": 4.8363, + "step": 21841 + }, + { + "epoch": 0.12990056142354173, + "grad_norm": 1.6527009010314941, + "learning_rate": 4.794715157211767e-05, + "loss": 5.0696, + "step": 21842 + }, + { + "epoch": 0.12990650870682272, + "grad_norm": 1.7082421779632568, + "learning_rate": 4.7946966202851754e-05, + "loss": 4.8249, + "step": 21843 + }, + { + "epoch": 0.12991245599010373, + "grad_norm": 1.5493143796920776, + "learning_rate": 4.794678082557529e-05, + "loss": 4.9604, + "step": 21844 + }, + { + "epoch": 0.12991840327338472, + "grad_norm": 1.631940245628357, + "learning_rate": 4.7946595440288335e-05, + "loss": 4.6672, + "step": 21845 + }, + { + "epoch": 0.1299243505566657, + "grad_norm": 1.3021342754364014, + "learning_rate": 4.794641004699096e-05, + "loss": 4.821, + "step": 21846 + }, + { + "epoch": 0.12993029783994672, + "grad_norm": 1.331272006034851, + "learning_rate": 4.794622464568324e-05, + "loss": 5.1398, + "step": 21847 + }, + { + "epoch": 0.1299362451232277, + "grad_norm": 1.5635039806365967, + "learning_rate": 4.794603923636522e-05, + "loss": 5.0405, + "step": 21848 + }, + { + "epoch": 0.1299421924065087, + "grad_norm": 1.412961721420288, + "learning_rate": 4.794585381903698e-05, + "loss": 5.1334, + "step": 21849 + }, + { + "epoch": 0.12994813968978972, + "grad_norm": 1.0943198204040527, + "learning_rate": 4.794566839369857e-05, + "loss": 5.1978, + "step": 21850 + }, + { + "epoch": 0.1299540869730707, + "grad_norm": 1.6458427906036377, + "learning_rate": 4.794548296035007e-05, + "loss": 4.6475, + "step": 21851 + }, + { + "epoch": 0.1299600342563517, + "grad_norm": 1.37641179561615, + "learning_rate": 4.794529751899155e-05, + "loss": 5.0094, + "step": 21852 + }, + { + "epoch": 0.1299659815396327, + "grad_norm": 1.6493875980377197, + "learning_rate": 4.7945112069623054e-05, + "loss": 4.9748, + "step": 21853 + }, + { + "epoch": 0.1299719288229137, + "grad_norm": 1.4612071514129639, + "learning_rate": 4.794492661224466e-05, + "loss": 5.1217, + "step": 21854 + }, + { + "epoch": 0.12997787610619468, + "grad_norm": 1.4929149150848389, + "learning_rate": 4.7944741146856425e-05, + "loss": 4.916, + "step": 21855 + }, + { + "epoch": 0.1299838233894757, + "grad_norm": 1.5030015707015991, + "learning_rate": 4.794455567345842e-05, + "loss": 5.1206, + "step": 21856 + }, + { + "epoch": 0.1299897706727567, + "grad_norm": 1.3132811784744263, + "learning_rate": 4.79443701920507e-05, + "loss": 5.1996, + "step": 21857 + }, + { + "epoch": 0.12999571795603768, + "grad_norm": 1.3515914678573608, + "learning_rate": 4.794418470263335e-05, + "loss": 4.8565, + "step": 21858 + }, + { + "epoch": 0.1300016652393187, + "grad_norm": 1.3780977725982666, + "learning_rate": 4.7943999205206414e-05, + "loss": 4.9207, + "step": 21859 + }, + { + "epoch": 0.13000761252259968, + "grad_norm": 1.3044095039367676, + "learning_rate": 4.794381369976997e-05, + "loss": 5.0898, + "step": 21860 + }, + { + "epoch": 0.13001355980588067, + "grad_norm": 1.3406704664230347, + "learning_rate": 4.7943628186324076e-05, + "loss": 4.942, + "step": 21861 + }, + { + "epoch": 0.13001950708916168, + "grad_norm": 1.2654430866241455, + "learning_rate": 4.7943442664868795e-05, + "loss": 5.2096, + "step": 21862 + }, + { + "epoch": 0.13002545437244267, + "grad_norm": 1.313717007637024, + "learning_rate": 4.79432571354042e-05, + "loss": 4.9946, + "step": 21863 + }, + { + "epoch": 0.13003140165572366, + "grad_norm": 1.0787066221237183, + "learning_rate": 4.794307159793035e-05, + "loss": 4.9556, + "step": 21864 + }, + { + "epoch": 0.13003734893900465, + "grad_norm": 1.3731575012207031, + "learning_rate": 4.794288605244731e-05, + "loss": 4.904, + "step": 21865 + }, + { + "epoch": 0.13004329622228566, + "grad_norm": 1.4843237400054932, + "learning_rate": 4.794270049895514e-05, + "loss": 5.1451, + "step": 21866 + }, + { + "epoch": 0.13004924350556665, + "grad_norm": 1.3293545246124268, + "learning_rate": 4.794251493745392e-05, + "loss": 5.1794, + "step": 21867 + }, + { + "epoch": 0.13005519078884764, + "grad_norm": 1.6757280826568604, + "learning_rate": 4.79423293679437e-05, + "loss": 4.9797, + "step": 21868 + }, + { + "epoch": 0.13006113807212866, + "grad_norm": 1.7158734798431396, + "learning_rate": 4.794214379042456e-05, + "loss": 4.7833, + "step": 21869 + }, + { + "epoch": 0.13006708535540965, + "grad_norm": 2.164602756500244, + "learning_rate": 4.794195820489654e-05, + "loss": 4.4662, + "step": 21870 + }, + { + "epoch": 0.13007303263869063, + "grad_norm": 1.5726985931396484, + "learning_rate": 4.794177261135972e-05, + "loss": 5.3064, + "step": 21871 + }, + { + "epoch": 0.13007897992197165, + "grad_norm": 1.3667716979980469, + "learning_rate": 4.794158700981417e-05, + "loss": 5.0881, + "step": 21872 + }, + { + "epoch": 0.13008492720525264, + "grad_norm": 1.5155465602874756, + "learning_rate": 4.794140140025994e-05, + "loss": 4.95, + "step": 21873 + }, + { + "epoch": 0.13009087448853363, + "grad_norm": 1.4024773836135864, + "learning_rate": 4.794121578269712e-05, + "loss": 5.1932, + "step": 21874 + }, + { + "epoch": 0.13009682177181464, + "grad_norm": 1.3104946613311768, + "learning_rate": 4.7941030157125746e-05, + "loss": 5.1143, + "step": 21875 + }, + { + "epoch": 0.13010276905509563, + "grad_norm": 1.3269513845443726, + "learning_rate": 4.79408445235459e-05, + "loss": 5.1411, + "step": 21876 + }, + { + "epoch": 0.13010871633837662, + "grad_norm": 1.3147937059402466, + "learning_rate": 4.7940658881957645e-05, + "loss": 5.0444, + "step": 21877 + }, + { + "epoch": 0.13011466362165763, + "grad_norm": 1.125897765159607, + "learning_rate": 4.794047323236104e-05, + "loss": 5.0522, + "step": 21878 + }, + { + "epoch": 0.13012061090493862, + "grad_norm": 1.331945776939392, + "learning_rate": 4.794028757475615e-05, + "loss": 5.1433, + "step": 21879 + }, + { + "epoch": 0.1301265581882196, + "grad_norm": 1.206411361694336, + "learning_rate": 4.794010190914304e-05, + "loss": 4.7293, + "step": 21880 + }, + { + "epoch": 0.13013250547150063, + "grad_norm": 1.6212915182113647, + "learning_rate": 4.793991623552179e-05, + "loss": 4.5976, + "step": 21881 + }, + { + "epoch": 0.13013845275478161, + "grad_norm": 1.4009672403335571, + "learning_rate": 4.793973055389244e-05, + "loss": 4.8846, + "step": 21882 + }, + { + "epoch": 0.1301444000380626, + "grad_norm": 1.5049399137496948, + "learning_rate": 4.793954486425507e-05, + "loss": 4.7785, + "step": 21883 + }, + { + "epoch": 0.13015034732134362, + "grad_norm": 1.496751070022583, + "learning_rate": 4.7939359166609746e-05, + "loss": 4.5957, + "step": 21884 + }, + { + "epoch": 0.1301562946046246, + "grad_norm": 1.7572035789489746, + "learning_rate": 4.7939173460956525e-05, + "loss": 4.8929, + "step": 21885 + }, + { + "epoch": 0.1301622418879056, + "grad_norm": 1.593353271484375, + "learning_rate": 4.793898774729548e-05, + "loss": 5.6704, + "step": 21886 + }, + { + "epoch": 0.1301681891711866, + "grad_norm": 1.4550076723098755, + "learning_rate": 4.7938802025626665e-05, + "loss": 5.6588, + "step": 21887 + }, + { + "epoch": 0.1301741364544676, + "grad_norm": 1.6618671417236328, + "learning_rate": 4.793861629595015e-05, + "loss": 5.6571, + "step": 21888 + }, + { + "epoch": 0.1301800837377486, + "grad_norm": 1.4493645429611206, + "learning_rate": 4.793843055826601e-05, + "loss": 5.4406, + "step": 21889 + }, + { + "epoch": 0.1301860310210296, + "grad_norm": 1.5164732933044434, + "learning_rate": 4.793824481257429e-05, + "loss": 5.4872, + "step": 21890 + }, + { + "epoch": 0.1301919783043106, + "grad_norm": 1.5956424474716187, + "learning_rate": 4.793805905887508e-05, + "loss": 4.7702, + "step": 21891 + }, + { + "epoch": 0.13019792558759158, + "grad_norm": 1.850864291191101, + "learning_rate": 4.7937873297168425e-05, + "loss": 4.6842, + "step": 21892 + }, + { + "epoch": 0.1302038728708726, + "grad_norm": 1.637451171875, + "learning_rate": 4.793768752745439e-05, + "loss": 5.2488, + "step": 21893 + }, + { + "epoch": 0.13020982015415358, + "grad_norm": 1.5980913639068604, + "learning_rate": 4.793750174973305e-05, + "loss": 5.4026, + "step": 21894 + }, + { + "epoch": 0.13021576743743457, + "grad_norm": 1.7420471906661987, + "learning_rate": 4.793731596400446e-05, + "loss": 5.2409, + "step": 21895 + }, + { + "epoch": 0.1302217147207156, + "grad_norm": 2.749483346939087, + "learning_rate": 4.7937130170268694e-05, + "loss": 5.3401, + "step": 21896 + }, + { + "epoch": 0.13022766200399657, + "grad_norm": 2.610828399658203, + "learning_rate": 4.793694436852581e-05, + "loss": 5.0967, + "step": 21897 + }, + { + "epoch": 0.13023360928727756, + "grad_norm": 2.5725367069244385, + "learning_rate": 4.793675855877588e-05, + "loss": 5.1184, + "step": 21898 + }, + { + "epoch": 0.13023955657055858, + "grad_norm": 2.438526153564453, + "learning_rate": 4.793657274101896e-05, + "loss": 5.1315, + "step": 21899 + }, + { + "epoch": 0.13024550385383957, + "grad_norm": 2.2574191093444824, + "learning_rate": 4.793638691525513e-05, + "loss": 4.9999, + "step": 21900 + }, + { + "epoch": 0.13025145113712056, + "grad_norm": 1.9024723768234253, + "learning_rate": 4.7936201081484434e-05, + "loss": 5.1766, + "step": 21901 + }, + { + "epoch": 0.13025739842040157, + "grad_norm": 2.2040951251983643, + "learning_rate": 4.793601523970695e-05, + "loss": 4.9261, + "step": 21902 + }, + { + "epoch": 0.13026334570368256, + "grad_norm": 2.333158016204834, + "learning_rate": 4.7935829389922736e-05, + "loss": 4.9423, + "step": 21903 + }, + { + "epoch": 0.13026929298696355, + "grad_norm": 2.2712838649749756, + "learning_rate": 4.793564353213187e-05, + "loss": 4.7511, + "step": 21904 + }, + { + "epoch": 0.13027524027024456, + "grad_norm": 2.119046211242676, + "learning_rate": 4.79354576663344e-05, + "loss": 4.7284, + "step": 21905 + }, + { + "epoch": 0.13028118755352555, + "grad_norm": 2.3056483268737793, + "learning_rate": 4.79352717925304e-05, + "loss": 4.8627, + "step": 21906 + }, + { + "epoch": 0.13028713483680654, + "grad_norm": 2.2767837047576904, + "learning_rate": 4.793508591071993e-05, + "loss": 4.7924, + "step": 21907 + }, + { + "epoch": 0.13029308212008756, + "grad_norm": 2.138441324234009, + "learning_rate": 4.793490002090306e-05, + "loss": 4.747, + "step": 21908 + }, + { + "epoch": 0.13029902940336854, + "grad_norm": 1.9595372676849365, + "learning_rate": 4.793471412307986e-05, + "loss": 4.6861, + "step": 21909 + }, + { + "epoch": 0.13030497668664953, + "grad_norm": 2.207357883453369, + "learning_rate": 4.793452821725039e-05, + "loss": 4.4727, + "step": 21910 + }, + { + "epoch": 0.13031092396993055, + "grad_norm": 1.9506596326828003, + "learning_rate": 4.7934342303414704e-05, + "loss": 4.4445, + "step": 21911 + }, + { + "epoch": 0.13031687125321154, + "grad_norm": 2.0946574211120605, + "learning_rate": 4.793415638157288e-05, + "loss": 4.4556, + "step": 21912 + }, + { + "epoch": 0.13032281853649252, + "grad_norm": 2.7089650630950928, + "learning_rate": 4.793397045172497e-05, + "loss": 4.3106, + "step": 21913 + }, + { + "epoch": 0.13032876581977354, + "grad_norm": 2.6837174892425537, + "learning_rate": 4.793378451387106e-05, + "loss": 4.4133, + "step": 21914 + }, + { + "epoch": 0.13033471310305453, + "grad_norm": 2.28702712059021, + "learning_rate": 4.7933598568011207e-05, + "loss": 4.4326, + "step": 21915 + }, + { + "epoch": 0.13034066038633552, + "grad_norm": 2.172691583633423, + "learning_rate": 4.793341261414546e-05, + "loss": 4.6047, + "step": 21916 + }, + { + "epoch": 0.13034660766961653, + "grad_norm": 2.202906608581543, + "learning_rate": 4.79332266522739e-05, + "loss": 4.6857, + "step": 21917 + }, + { + "epoch": 0.13035255495289752, + "grad_norm": 1.7617685794830322, + "learning_rate": 4.793304068239658e-05, + "loss": 4.4888, + "step": 21918 + }, + { + "epoch": 0.1303585022361785, + "grad_norm": 2.2866454124450684, + "learning_rate": 4.7932854704513586e-05, + "loss": 4.5558, + "step": 21919 + }, + { + "epoch": 0.13036444951945952, + "grad_norm": 2.0338642597198486, + "learning_rate": 4.793266871862496e-05, + "loss": 5.2769, + "step": 21920 + }, + { + "epoch": 0.1303703968027405, + "grad_norm": 2.0302703380584717, + "learning_rate": 4.793248272473078e-05, + "loss": 4.5903, + "step": 21921 + }, + { + "epoch": 0.1303763440860215, + "grad_norm": 2.1618101596832275, + "learning_rate": 4.793229672283111e-05, + "loss": 4.9971, + "step": 21922 + }, + { + "epoch": 0.1303822913693025, + "grad_norm": 2.0446085929870605, + "learning_rate": 4.7932110712926004e-05, + "loss": 5.286, + "step": 21923 + }, + { + "epoch": 0.1303882386525835, + "grad_norm": 1.544705867767334, + "learning_rate": 4.793192469501554e-05, + "loss": 5.5509, + "step": 21924 + }, + { + "epoch": 0.1303941859358645, + "grad_norm": 1.5994058847427368, + "learning_rate": 4.7931738669099776e-05, + "loss": 5.5891, + "step": 21925 + }, + { + "epoch": 0.13040013321914548, + "grad_norm": 1.5866730213165283, + "learning_rate": 4.793155263517878e-05, + "loss": 5.3539, + "step": 21926 + }, + { + "epoch": 0.1304060805024265, + "grad_norm": 1.5843631029129028, + "learning_rate": 4.793136659325262e-05, + "loss": 5.5528, + "step": 21927 + }, + { + "epoch": 0.13041202778570748, + "grad_norm": 1.8037461042404175, + "learning_rate": 4.7931180543321354e-05, + "loss": 4.9484, + "step": 21928 + }, + { + "epoch": 0.13041797506898847, + "grad_norm": 1.8021430969238281, + "learning_rate": 4.793099448538505e-05, + "loss": 5.2239, + "step": 21929 + }, + { + "epoch": 0.1304239223522695, + "grad_norm": 1.9063239097595215, + "learning_rate": 4.793080841944377e-05, + "loss": 5.0627, + "step": 21930 + }, + { + "epoch": 0.13042986963555048, + "grad_norm": 1.8546555042266846, + "learning_rate": 4.7930622345497575e-05, + "loss": 4.8691, + "step": 21931 + }, + { + "epoch": 0.13043581691883147, + "grad_norm": 1.7901126146316528, + "learning_rate": 4.793043626354655e-05, + "loss": 4.8975, + "step": 21932 + }, + { + "epoch": 0.13044176420211248, + "grad_norm": 1.7083008289337158, + "learning_rate": 4.793025017359074e-05, + "loss": 4.8176, + "step": 21933 + }, + { + "epoch": 0.13044771148539347, + "grad_norm": 1.7584604024887085, + "learning_rate": 4.793006407563022e-05, + "loss": 5.2551, + "step": 21934 + }, + { + "epoch": 0.13045365876867446, + "grad_norm": 1.6731703281402588, + "learning_rate": 4.792987796966505e-05, + "loss": 5.0456, + "step": 21935 + }, + { + "epoch": 0.13045960605195547, + "grad_norm": 1.6340082883834839, + "learning_rate": 4.7929691855695294e-05, + "loss": 5.5061, + "step": 21936 + }, + { + "epoch": 0.13046555333523646, + "grad_norm": 1.7354822158813477, + "learning_rate": 4.792950573372102e-05, + "loss": 5.7164, + "step": 21937 + }, + { + "epoch": 0.13047150061851745, + "grad_norm": 1.6100409030914307, + "learning_rate": 4.79293196037423e-05, + "loss": 5.2427, + "step": 21938 + }, + { + "epoch": 0.13047744790179847, + "grad_norm": 2.603156328201294, + "learning_rate": 4.7929133465759184e-05, + "loss": 4.1146, + "step": 21939 + }, + { + "epoch": 0.13048339518507945, + "grad_norm": 2.518183946609497, + "learning_rate": 4.7928947319771746e-05, + "loss": 4.2918, + "step": 21940 + }, + { + "epoch": 0.13048934246836044, + "grad_norm": 1.7518165111541748, + "learning_rate": 4.792876116578004e-05, + "loss": 5.9257, + "step": 21941 + }, + { + "epoch": 0.13049528975164146, + "grad_norm": 1.8118661642074585, + "learning_rate": 4.792857500378416e-05, + "loss": 5.8985, + "step": 21942 + }, + { + "epoch": 0.13050123703492245, + "grad_norm": 1.5877163410186768, + "learning_rate": 4.792838883378414e-05, + "loss": 6.0572, + "step": 21943 + }, + { + "epoch": 0.13050718431820343, + "grad_norm": 1.313362956047058, + "learning_rate": 4.7928202655780055e-05, + "loss": 5.7739, + "step": 21944 + }, + { + "epoch": 0.13051313160148445, + "grad_norm": 1.5902273654937744, + "learning_rate": 4.792801646977198e-05, + "loss": 6.021, + "step": 21945 + }, + { + "epoch": 0.13051907888476544, + "grad_norm": 1.8784877061843872, + "learning_rate": 4.792783027575996e-05, + "loss": 5.0933, + "step": 21946 + }, + { + "epoch": 0.13052502616804643, + "grad_norm": 1.7743972539901733, + "learning_rate": 4.7927644073744076e-05, + "loss": 5.1168, + "step": 21947 + }, + { + "epoch": 0.13053097345132744, + "grad_norm": 2.0093095302581787, + "learning_rate": 4.792745786372439e-05, + "loss": 5.7441, + "step": 21948 + }, + { + "epoch": 0.13053692073460843, + "grad_norm": 2.0483853816986084, + "learning_rate": 4.7927271645700966e-05, + "loss": 5.4851, + "step": 21949 + }, + { + "epoch": 0.13054286801788942, + "grad_norm": 1.7858600616455078, + "learning_rate": 4.792708541967386e-05, + "loss": 5.4308, + "step": 21950 + }, + { + "epoch": 0.13054881530117043, + "grad_norm": 1.578202247619629, + "learning_rate": 4.7926899185643155e-05, + "loss": 5.4409, + "step": 21951 + }, + { + "epoch": 0.13055476258445142, + "grad_norm": 1.5763752460479736, + "learning_rate": 4.7926712943608895e-05, + "loss": 5.438, + "step": 21952 + }, + { + "epoch": 0.1305607098677324, + "grad_norm": 1.4117366075515747, + "learning_rate": 4.792652669357117e-05, + "loss": 5.3256, + "step": 21953 + }, + { + "epoch": 0.13056665715101343, + "grad_norm": 1.8186451196670532, + "learning_rate": 4.792634043553003e-05, + "loss": 5.4336, + "step": 21954 + }, + { + "epoch": 0.13057260443429441, + "grad_norm": 1.8576366901397705, + "learning_rate": 4.7926154169485536e-05, + "loss": 5.5133, + "step": 21955 + }, + { + "epoch": 0.1305785517175754, + "grad_norm": 1.81550931930542, + "learning_rate": 4.7925967895437754e-05, + "loss": 5.3673, + "step": 21956 + }, + { + "epoch": 0.13058449900085642, + "grad_norm": 1.5518393516540527, + "learning_rate": 4.7925781613386765e-05, + "loss": 5.3788, + "step": 21957 + }, + { + "epoch": 0.1305904462841374, + "grad_norm": 1.726492166519165, + "learning_rate": 4.7925595323332615e-05, + "loss": 5.4759, + "step": 21958 + }, + { + "epoch": 0.1305963935674184, + "grad_norm": 1.6105836629867554, + "learning_rate": 4.792540902527538e-05, + "loss": 5.3339, + "step": 21959 + }, + { + "epoch": 0.1306023408506994, + "grad_norm": 1.6900887489318848, + "learning_rate": 4.792522271921512e-05, + "loss": 5.457, + "step": 21960 + }, + { + "epoch": 0.1306082881339804, + "grad_norm": 1.6158493757247925, + "learning_rate": 4.79250364051519e-05, + "loss": 5.4049, + "step": 21961 + }, + { + "epoch": 0.1306142354172614, + "grad_norm": 1.5123624801635742, + "learning_rate": 4.792485008308579e-05, + "loss": 5.3611, + "step": 21962 + }, + { + "epoch": 0.1306201827005424, + "grad_norm": 1.4421589374542236, + "learning_rate": 4.792466375301685e-05, + "loss": 5.3816, + "step": 21963 + }, + { + "epoch": 0.1306261299838234, + "grad_norm": 1.6167370080947876, + "learning_rate": 4.792447741494514e-05, + "loss": 5.3484, + "step": 21964 + }, + { + "epoch": 0.13063207726710438, + "grad_norm": 1.5235882997512817, + "learning_rate": 4.7924291068870745e-05, + "loss": 5.4756, + "step": 21965 + }, + { + "epoch": 0.1306380245503854, + "grad_norm": 1.5585761070251465, + "learning_rate": 4.7924104714793705e-05, + "loss": 4.9743, + "step": 21966 + }, + { + "epoch": 0.13064397183366638, + "grad_norm": 1.6565943956375122, + "learning_rate": 4.79239183527141e-05, + "loss": 4.9801, + "step": 21967 + }, + { + "epoch": 0.13064991911694737, + "grad_norm": 1.449012041091919, + "learning_rate": 4.7923731982631993e-05, + "loss": 5.2166, + "step": 21968 + }, + { + "epoch": 0.1306558664002284, + "grad_norm": 1.7511426210403442, + "learning_rate": 4.792354560454745e-05, + "loss": 4.7892, + "step": 21969 + }, + { + "epoch": 0.13066181368350938, + "grad_norm": 1.8433175086975098, + "learning_rate": 4.7923359218460535e-05, + "loss": 5.1481, + "step": 21970 + }, + { + "epoch": 0.13066776096679036, + "grad_norm": 1.4407368898391724, + "learning_rate": 4.792317282437131e-05, + "loss": 5.3282, + "step": 21971 + }, + { + "epoch": 0.13067370825007138, + "grad_norm": 1.7756870985031128, + "learning_rate": 4.7922986422279836e-05, + "loss": 4.9934, + "step": 21972 + }, + { + "epoch": 0.13067965553335237, + "grad_norm": 1.6745517253875732, + "learning_rate": 4.7922800012186197e-05, + "loss": 4.9524, + "step": 21973 + }, + { + "epoch": 0.13068560281663336, + "grad_norm": 1.6869374513626099, + "learning_rate": 4.792261359409044e-05, + "loss": 5.0163, + "step": 21974 + }, + { + "epoch": 0.13069155009991437, + "grad_norm": 1.810007929801941, + "learning_rate": 4.7922427167992635e-05, + "loss": 5.7507, + "step": 21975 + }, + { + "epoch": 0.13069749738319536, + "grad_norm": 1.438236951828003, + "learning_rate": 4.792224073389284e-05, + "loss": 5.6271, + "step": 21976 + }, + { + "epoch": 0.13070344466647635, + "grad_norm": 1.7424002885818481, + "learning_rate": 4.7922054291791135e-05, + "loss": 5.4101, + "step": 21977 + }, + { + "epoch": 0.13070939194975736, + "grad_norm": 1.6832276582717896, + "learning_rate": 4.7921867841687576e-05, + "loss": 5.5323, + "step": 21978 + }, + { + "epoch": 0.13071533923303835, + "grad_norm": 1.4542639255523682, + "learning_rate": 4.792168138358223e-05, + "loss": 5.6003, + "step": 21979 + }, + { + "epoch": 0.13072128651631934, + "grad_norm": 1.5791352987289429, + "learning_rate": 4.7921494917475164e-05, + "loss": 4.448, + "step": 21980 + }, + { + "epoch": 0.13072723379960036, + "grad_norm": 1.7216298580169678, + "learning_rate": 4.792130844336644e-05, + "loss": 5.2205, + "step": 21981 + }, + { + "epoch": 0.13073318108288134, + "grad_norm": 1.7315418720245361, + "learning_rate": 4.792112196125612e-05, + "loss": 5.617, + "step": 21982 + }, + { + "epoch": 0.13073912836616233, + "grad_norm": 1.6149991750717163, + "learning_rate": 4.792093547114428e-05, + "loss": 5.1341, + "step": 21983 + }, + { + "epoch": 0.13074507564944332, + "grad_norm": 1.8531928062438965, + "learning_rate": 4.792074897303097e-05, + "loss": 5.384, + "step": 21984 + }, + { + "epoch": 0.13075102293272434, + "grad_norm": 1.869070053100586, + "learning_rate": 4.792056246691627e-05, + "loss": 5.428, + "step": 21985 + }, + { + "epoch": 0.13075697021600532, + "grad_norm": 1.715179204940796, + "learning_rate": 4.792037595280024e-05, + "loss": 5.5358, + "step": 21986 + }, + { + "epoch": 0.1307629174992863, + "grad_norm": 2.155991315841675, + "learning_rate": 4.792018943068294e-05, + "loss": 4.9676, + "step": 21987 + }, + { + "epoch": 0.13076886478256733, + "grad_norm": 1.9201817512512207, + "learning_rate": 4.7920002900564434e-05, + "loss": 5.1021, + "step": 21988 + }, + { + "epoch": 0.13077481206584832, + "grad_norm": 1.8021970987319946, + "learning_rate": 4.79198163624448e-05, + "loss": 5.233, + "step": 21989 + }, + { + "epoch": 0.1307807593491293, + "grad_norm": 2.034694194793701, + "learning_rate": 4.7919629816324093e-05, + "loss": 5.7133, + "step": 21990 + }, + { + "epoch": 0.13078670663241032, + "grad_norm": 1.7929306030273438, + "learning_rate": 4.791944326220238e-05, + "loss": 5.1922, + "step": 21991 + }, + { + "epoch": 0.1307926539156913, + "grad_norm": 1.6092936992645264, + "learning_rate": 4.791925670007972e-05, + "loss": 4.8169, + "step": 21992 + }, + { + "epoch": 0.1307986011989723, + "grad_norm": 1.6994092464447021, + "learning_rate": 4.791907012995619e-05, + "loss": 4.7869, + "step": 21993 + }, + { + "epoch": 0.1308045484822533, + "grad_norm": 1.7823549509048462, + "learning_rate": 4.791888355183185e-05, + "loss": 5.1608, + "step": 21994 + }, + { + "epoch": 0.1308104957655343, + "grad_norm": 1.9024605751037598, + "learning_rate": 4.7918696965706764e-05, + "loss": 4.016, + "step": 21995 + }, + { + "epoch": 0.1308164430488153, + "grad_norm": 1.8696129322052002, + "learning_rate": 4.7918510371580993e-05, + "loss": 4.3457, + "step": 21996 + }, + { + "epoch": 0.1308223903320963, + "grad_norm": 1.8359664678573608, + "learning_rate": 4.791832376945461e-05, + "loss": 4.1822, + "step": 21997 + }, + { + "epoch": 0.1308283376153773, + "grad_norm": 1.867409586906433, + "learning_rate": 4.791813715932768e-05, + "loss": 4.0156, + "step": 21998 + }, + { + "epoch": 0.13083428489865828, + "grad_norm": 1.729768991470337, + "learning_rate": 4.7917950541200264e-05, + "loss": 5.4221, + "step": 21999 + }, + { + "epoch": 0.1308402321819393, + "grad_norm": 1.8171114921569824, + "learning_rate": 4.791776391507242e-05, + "loss": 4.1685, + "step": 22000 + }, + { + "epoch": 0.13084617946522029, + "grad_norm": 1.8626638650894165, + "learning_rate": 4.7917577280944234e-05, + "loss": 4.1981, + "step": 22001 + }, + { + "epoch": 0.13085212674850127, + "grad_norm": 1.9804152250289917, + "learning_rate": 4.791739063881575e-05, + "loss": 4.1258, + "step": 22002 + }, + { + "epoch": 0.1308580740317823, + "grad_norm": 2.6114773750305176, + "learning_rate": 4.791720398868704e-05, + "loss": 4.0207, + "step": 22003 + }, + { + "epoch": 0.13086402131506328, + "grad_norm": 2.1169519424438477, + "learning_rate": 4.791701733055818e-05, + "loss": 4.0134, + "step": 22004 + }, + { + "epoch": 0.13086996859834427, + "grad_norm": 2.318971872329712, + "learning_rate": 4.791683066442922e-05, + "loss": 4.1341, + "step": 22005 + }, + { + "epoch": 0.13087591588162528, + "grad_norm": 2.1771652698516846, + "learning_rate": 4.7916643990300234e-05, + "loss": 4.5816, + "step": 22006 + }, + { + "epoch": 0.13088186316490627, + "grad_norm": 2.327596426010132, + "learning_rate": 4.791645730817128e-05, + "loss": 5.3562, + "step": 22007 + }, + { + "epoch": 0.13088781044818726, + "grad_norm": 2.3558785915374756, + "learning_rate": 4.7916270618042434e-05, + "loss": 4.055, + "step": 22008 + }, + { + "epoch": 0.13089375773146827, + "grad_norm": 2.07840633392334, + "learning_rate": 4.791608391991374e-05, + "loss": 4.4366, + "step": 22009 + }, + { + "epoch": 0.13089970501474926, + "grad_norm": 2.4755849838256836, + "learning_rate": 4.79158972137853e-05, + "loss": 5.5616, + "step": 22010 + }, + { + "epoch": 0.13090565229803025, + "grad_norm": 1.8745293617248535, + "learning_rate": 4.791571049965714e-05, + "loss": 5.1908, + "step": 22011 + }, + { + "epoch": 0.13091159958131127, + "grad_norm": 1.8463020324707031, + "learning_rate": 4.791552377752935e-05, + "loss": 5.64, + "step": 22012 + }, + { + "epoch": 0.13091754686459225, + "grad_norm": 1.7283350229263306, + "learning_rate": 4.791533704740199e-05, + "loss": 5.191, + "step": 22013 + }, + { + "epoch": 0.13092349414787324, + "grad_norm": 2.290731191635132, + "learning_rate": 4.7915150309275115e-05, + "loss": 4.7131, + "step": 22014 + }, + { + "epoch": 0.13092944143115426, + "grad_norm": 2.1718969345092773, + "learning_rate": 4.7914963563148794e-05, + "loss": 4.6983, + "step": 22015 + }, + { + "epoch": 0.13093538871443525, + "grad_norm": 2.179349184036255, + "learning_rate": 4.791477680902311e-05, + "loss": 4.7265, + "step": 22016 + }, + { + "epoch": 0.13094133599771623, + "grad_norm": 1.7619205713272095, + "learning_rate": 4.79145900468981e-05, + "loss": 5.3916, + "step": 22017 + }, + { + "epoch": 0.13094728328099725, + "grad_norm": 1.827709674835205, + "learning_rate": 4.7914403276773855e-05, + "loss": 5.4988, + "step": 22018 + }, + { + "epoch": 0.13095323056427824, + "grad_norm": 1.768192172050476, + "learning_rate": 4.7914216498650424e-05, + "loss": 5.3605, + "step": 22019 + }, + { + "epoch": 0.13095917784755923, + "grad_norm": 1.6903995275497437, + "learning_rate": 4.791402971252788e-05, + "loss": 5.3919, + "step": 22020 + }, + { + "epoch": 0.13096512513084024, + "grad_norm": 1.5048458576202393, + "learning_rate": 4.791384291840628e-05, + "loss": 5.43, + "step": 22021 + }, + { + "epoch": 0.13097107241412123, + "grad_norm": 1.6317448616027832, + "learning_rate": 4.7913656116285685e-05, + "loss": 5.4964, + "step": 22022 + }, + { + "epoch": 0.13097701969740222, + "grad_norm": 1.775623083114624, + "learning_rate": 4.791346930616619e-05, + "loss": 5.4068, + "step": 22023 + }, + { + "epoch": 0.13098296698068324, + "grad_norm": 1.7148652076721191, + "learning_rate": 4.7913282488047826e-05, + "loss": 5.4362, + "step": 22024 + }, + { + "epoch": 0.13098891426396422, + "grad_norm": 1.6784619092941284, + "learning_rate": 4.7913095661930675e-05, + "loss": 5.3668, + "step": 22025 + }, + { + "epoch": 0.1309948615472452, + "grad_norm": 1.671555757522583, + "learning_rate": 4.79129088278148e-05, + "loss": 5.264, + "step": 22026 + }, + { + "epoch": 0.13100080883052623, + "grad_norm": 1.5523961782455444, + "learning_rate": 4.791272198570027e-05, + "loss": 5.1395, + "step": 22027 + }, + { + "epoch": 0.13100675611380722, + "grad_norm": 1.8762462139129639, + "learning_rate": 4.7912535135587134e-05, + "loss": 5.1099, + "step": 22028 + }, + { + "epoch": 0.1310127033970882, + "grad_norm": 1.7621192932128906, + "learning_rate": 4.7912348277475474e-05, + "loss": 5.0033, + "step": 22029 + }, + { + "epoch": 0.13101865068036922, + "grad_norm": 1.6044316291809082, + "learning_rate": 4.791216141136535e-05, + "loss": 5.2646, + "step": 22030 + }, + { + "epoch": 0.1310245979636502, + "grad_norm": 2.3852479457855225, + "learning_rate": 4.791197453725683e-05, + "loss": 4.7932, + "step": 22031 + }, + { + "epoch": 0.1310305452469312, + "grad_norm": 2.259331703186035, + "learning_rate": 4.7911787655149975e-05, + "loss": 4.8083, + "step": 22032 + }, + { + "epoch": 0.1310364925302122, + "grad_norm": 2.167745351791382, + "learning_rate": 4.791160076504485e-05, + "loss": 4.852, + "step": 22033 + }, + { + "epoch": 0.1310424398134932, + "grad_norm": 1.8246276378631592, + "learning_rate": 4.791141386694152e-05, + "loss": 5.1364, + "step": 22034 + }, + { + "epoch": 0.1310483870967742, + "grad_norm": 1.820461630821228, + "learning_rate": 4.791122696084006e-05, + "loss": 4.9647, + "step": 22035 + }, + { + "epoch": 0.1310543343800552, + "grad_norm": 1.6964235305786133, + "learning_rate": 4.791104004674052e-05, + "loss": 5.4281, + "step": 22036 + }, + { + "epoch": 0.1310602816633362, + "grad_norm": 1.8432056903839111, + "learning_rate": 4.791085312464297e-05, + "loss": 5.1905, + "step": 22037 + }, + { + "epoch": 0.13106622894661718, + "grad_norm": 1.9929230213165283, + "learning_rate": 4.7910666194547485e-05, + "loss": 5.0115, + "step": 22038 + }, + { + "epoch": 0.1310721762298982, + "grad_norm": 1.70926034450531, + "learning_rate": 4.791047925645412e-05, + "loss": 5.299, + "step": 22039 + }, + { + "epoch": 0.13107812351317918, + "grad_norm": 1.5090575218200684, + "learning_rate": 4.791029231036295e-05, + "loss": 5.4832, + "step": 22040 + }, + { + "epoch": 0.13108407079646017, + "grad_norm": 1.9068914651870728, + "learning_rate": 4.7910105356274025e-05, + "loss": 4.6246, + "step": 22041 + }, + { + "epoch": 0.13109001807974116, + "grad_norm": 1.9232919216156006, + "learning_rate": 4.7909918394187425e-05, + "loss": 4.7151, + "step": 22042 + }, + { + "epoch": 0.13109596536302218, + "grad_norm": 1.973927617073059, + "learning_rate": 4.790973142410321e-05, + "loss": 4.4912, + "step": 22043 + }, + { + "epoch": 0.13110191264630316, + "grad_norm": 1.554721474647522, + "learning_rate": 4.7909544446021434e-05, + "loss": 5.211, + "step": 22044 + }, + { + "epoch": 0.13110785992958415, + "grad_norm": 1.8059271574020386, + "learning_rate": 4.7909357459942185e-05, + "loss": 5.2998, + "step": 22045 + }, + { + "epoch": 0.13111380721286517, + "grad_norm": 1.7360923290252686, + "learning_rate": 4.79091704658655e-05, + "loss": 5.58, + "step": 22046 + }, + { + "epoch": 0.13111975449614616, + "grad_norm": 1.627770185470581, + "learning_rate": 4.790898346379148e-05, + "loss": 5.7186, + "step": 22047 + }, + { + "epoch": 0.13112570177942715, + "grad_norm": 1.6354387998580933, + "learning_rate": 4.790879645372016e-05, + "loss": 5.5099, + "step": 22048 + }, + { + "epoch": 0.13113164906270816, + "grad_norm": 1.6667500734329224, + "learning_rate": 4.790860943565161e-05, + "loss": 5.4328, + "step": 22049 + }, + { + "epoch": 0.13113759634598915, + "grad_norm": 1.7549245357513428, + "learning_rate": 4.790842240958591e-05, + "loss": 5.4191, + "step": 22050 + }, + { + "epoch": 0.13114354362927014, + "grad_norm": 1.5705612897872925, + "learning_rate": 4.790823537552311e-05, + "loss": 5.254, + "step": 22051 + }, + { + "epoch": 0.13114949091255115, + "grad_norm": 1.438839316368103, + "learning_rate": 4.790804833346329e-05, + "loss": 5.4708, + "step": 22052 + }, + { + "epoch": 0.13115543819583214, + "grad_norm": 1.8666369915008545, + "learning_rate": 4.790786128340651e-05, + "loss": 5.8635, + "step": 22053 + }, + { + "epoch": 0.13116138547911313, + "grad_norm": 2.1541588306427, + "learning_rate": 4.7907674225352815e-05, + "loss": 5.4732, + "step": 22054 + }, + { + "epoch": 0.13116733276239415, + "grad_norm": 1.6082664728164673, + "learning_rate": 4.79074871593023e-05, + "loss": 5.3902, + "step": 22055 + }, + { + "epoch": 0.13117328004567513, + "grad_norm": 1.7293864488601685, + "learning_rate": 4.790730008525502e-05, + "loss": 5.3317, + "step": 22056 + }, + { + "epoch": 0.13117922732895612, + "grad_norm": 1.830518126487732, + "learning_rate": 4.790711300321104e-05, + "loss": 5.3786, + "step": 22057 + }, + { + "epoch": 0.13118517461223714, + "grad_norm": 2.368182897567749, + "learning_rate": 4.790692591317041e-05, + "loss": 5.8, + "step": 22058 + }, + { + "epoch": 0.13119112189551813, + "grad_norm": 2.27848482131958, + "learning_rate": 4.7906738815133216e-05, + "loss": 5.4954, + "step": 22059 + }, + { + "epoch": 0.13119706917879911, + "grad_norm": 1.6672909259796143, + "learning_rate": 4.790655170909952e-05, + "loss": 5.2937, + "step": 22060 + }, + { + "epoch": 0.13120301646208013, + "grad_norm": 1.9788751602172852, + "learning_rate": 4.790636459506938e-05, + "loss": 5.1761, + "step": 22061 + }, + { + "epoch": 0.13120896374536112, + "grad_norm": 2.8215107917785645, + "learning_rate": 4.7906177473042865e-05, + "loss": 4.9236, + "step": 22062 + }, + { + "epoch": 0.1312149110286421, + "grad_norm": 2.0486905574798584, + "learning_rate": 4.790599034302004e-05, + "loss": 5.2273, + "step": 22063 + }, + { + "epoch": 0.13122085831192312, + "grad_norm": 1.9029892683029175, + "learning_rate": 4.790580320500097e-05, + "loss": 4.7737, + "step": 22064 + }, + { + "epoch": 0.1312268055952041, + "grad_norm": 2.052060842514038, + "learning_rate": 4.790561605898572e-05, + "loss": 4.7055, + "step": 22065 + }, + { + "epoch": 0.1312327528784851, + "grad_norm": 2.3215537071228027, + "learning_rate": 4.790542890497436e-05, + "loss": 4.6687, + "step": 22066 + }, + { + "epoch": 0.13123870016176611, + "grad_norm": 1.9903185367584229, + "learning_rate": 4.790524174296694e-05, + "loss": 4.5768, + "step": 22067 + }, + { + "epoch": 0.1312446474450471, + "grad_norm": 1.9112823009490967, + "learning_rate": 4.790505457296355e-05, + "loss": 4.664, + "step": 22068 + }, + { + "epoch": 0.1312505947283281, + "grad_norm": 2.09714412689209, + "learning_rate": 4.790486739496424e-05, + "loss": 4.4941, + "step": 22069 + }, + { + "epoch": 0.1312565420116091, + "grad_norm": 1.986820936203003, + "learning_rate": 4.7904680208969073e-05, + "loss": 4.8173, + "step": 22070 + }, + { + "epoch": 0.1312624892948901, + "grad_norm": 1.8170347213745117, + "learning_rate": 4.790449301497812e-05, + "loss": 4.78, + "step": 22071 + }, + { + "epoch": 0.13126843657817108, + "grad_norm": 1.7738579511642456, + "learning_rate": 4.790430581299145e-05, + "loss": 5.3492, + "step": 22072 + }, + { + "epoch": 0.1312743838614521, + "grad_norm": 1.9075175523757935, + "learning_rate": 4.7904118603009115e-05, + "loss": 4.4672, + "step": 22073 + }, + { + "epoch": 0.1312803311447331, + "grad_norm": 1.9848250150680542, + "learning_rate": 4.790393138503119e-05, + "loss": 4.2157, + "step": 22074 + }, + { + "epoch": 0.13128627842801407, + "grad_norm": 1.7980430126190186, + "learning_rate": 4.7903744159057745e-05, + "loss": 4.2482, + "step": 22075 + }, + { + "epoch": 0.1312922257112951, + "grad_norm": 1.8066810369491577, + "learning_rate": 4.7903556925088835e-05, + "loss": 4.0731, + "step": 22076 + }, + { + "epoch": 0.13129817299457608, + "grad_norm": 1.901912808418274, + "learning_rate": 4.790336968312453e-05, + "loss": 4.0677, + "step": 22077 + }, + { + "epoch": 0.13130412027785707, + "grad_norm": 1.8650418519973755, + "learning_rate": 4.79031824331649e-05, + "loss": 4.0593, + "step": 22078 + }, + { + "epoch": 0.13131006756113808, + "grad_norm": 1.8098959922790527, + "learning_rate": 4.7902995175210003e-05, + "loss": 4.1248, + "step": 22079 + }, + { + "epoch": 0.13131601484441907, + "grad_norm": 1.7840689420700073, + "learning_rate": 4.790280790925991e-05, + "loss": 4.1299, + "step": 22080 + }, + { + "epoch": 0.13132196212770006, + "grad_norm": 1.847676157951355, + "learning_rate": 4.7902620635314676e-05, + "loss": 3.9775, + "step": 22081 + }, + { + "epoch": 0.13132790941098108, + "grad_norm": 1.970070719718933, + "learning_rate": 4.7902433353374374e-05, + "loss": 3.9744, + "step": 22082 + }, + { + "epoch": 0.13133385669426206, + "grad_norm": 1.7709019184112549, + "learning_rate": 4.790224606343908e-05, + "loss": 3.9691, + "step": 22083 + }, + { + "epoch": 0.13133980397754305, + "grad_norm": 2.0055277347564697, + "learning_rate": 4.790205876550884e-05, + "loss": 4.0181, + "step": 22084 + }, + { + "epoch": 0.13134575126082407, + "grad_norm": 1.8686769008636475, + "learning_rate": 4.790187145958372e-05, + "loss": 3.9445, + "step": 22085 + }, + { + "epoch": 0.13135169854410506, + "grad_norm": 1.8052544593811035, + "learning_rate": 4.790168414566381e-05, + "loss": 4.3716, + "step": 22086 + }, + { + "epoch": 0.13135764582738604, + "grad_norm": 1.730320692062378, + "learning_rate": 4.790149682374915e-05, + "loss": 5.8462, + "step": 22087 + }, + { + "epoch": 0.13136359311066706, + "grad_norm": 1.8372067213058472, + "learning_rate": 4.790130949383982e-05, + "loss": 6.0599, + "step": 22088 + }, + { + "epoch": 0.13136954039394805, + "grad_norm": 1.505204200744629, + "learning_rate": 4.7901122155935874e-05, + "loss": 5.9626, + "step": 22089 + }, + { + "epoch": 0.13137548767722904, + "grad_norm": 2.126800537109375, + "learning_rate": 4.790093481003738e-05, + "loss": 5.3673, + "step": 22090 + }, + { + "epoch": 0.13138143496051005, + "grad_norm": 1.5778108835220337, + "learning_rate": 4.7900747456144415e-05, + "loss": 5.4421, + "step": 22091 + }, + { + "epoch": 0.13138738224379104, + "grad_norm": 1.4741785526275635, + "learning_rate": 4.7900560094257024e-05, + "loss": 5.5546, + "step": 22092 + }, + { + "epoch": 0.13139332952707203, + "grad_norm": 1.3331834077835083, + "learning_rate": 4.7900372724375295e-05, + "loss": 5.592, + "step": 22093 + }, + { + "epoch": 0.13139927681035304, + "grad_norm": 2.421566963195801, + "learning_rate": 4.790018534649927e-05, + "loss": 5.1022, + "step": 22094 + }, + { + "epoch": 0.13140522409363403, + "grad_norm": 1.761720895767212, + "learning_rate": 4.789999796062904e-05, + "loss": 5.2071, + "step": 22095 + }, + { + "epoch": 0.13141117137691502, + "grad_norm": 1.5059387683868408, + "learning_rate": 4.789981056676465e-05, + "loss": 5.3767, + "step": 22096 + }, + { + "epoch": 0.13141711866019604, + "grad_norm": 1.5319740772247314, + "learning_rate": 4.7899623164906176e-05, + "loss": 5.6233, + "step": 22097 + }, + { + "epoch": 0.13142306594347702, + "grad_norm": 1.7106443643569946, + "learning_rate": 4.789943575505368e-05, + "loss": 5.5583, + "step": 22098 + }, + { + "epoch": 0.131429013226758, + "grad_norm": 1.4288161993026733, + "learning_rate": 4.7899248337207227e-05, + "loss": 5.4574, + "step": 22099 + }, + { + "epoch": 0.131434960510039, + "grad_norm": 1.7327675819396973, + "learning_rate": 4.789906091136688e-05, + "loss": 5.3935, + "step": 22100 + }, + { + "epoch": 0.13144090779332002, + "grad_norm": 1.7318532466888428, + "learning_rate": 4.7898873477532716e-05, + "loss": 5.0156, + "step": 22101 + }, + { + "epoch": 0.131446855076601, + "grad_norm": 1.4947113990783691, + "learning_rate": 4.789868603570478e-05, + "loss": 5.2255, + "step": 22102 + }, + { + "epoch": 0.131452802359882, + "grad_norm": 2.454650402069092, + "learning_rate": 4.789849858588316e-05, + "loss": 5.0697, + "step": 22103 + }, + { + "epoch": 0.131458749643163, + "grad_norm": 2.0269839763641357, + "learning_rate": 4.789831112806791e-05, + "loss": 5.3687, + "step": 22104 + }, + { + "epoch": 0.131464696926444, + "grad_norm": 1.89911687374115, + "learning_rate": 4.7898123662259084e-05, + "loss": 5.1816, + "step": 22105 + }, + { + "epoch": 0.13147064420972498, + "grad_norm": 1.7952163219451904, + "learning_rate": 4.789793618845677e-05, + "loss": 5.1441, + "step": 22106 + }, + { + "epoch": 0.131476591493006, + "grad_norm": 1.458935022354126, + "learning_rate": 4.789774870666102e-05, + "loss": 4.8489, + "step": 22107 + }, + { + "epoch": 0.131482538776287, + "grad_norm": 1.5516583919525146, + "learning_rate": 4.78975612168719e-05, + "loss": 4.9763, + "step": 22108 + }, + { + "epoch": 0.13148848605956798, + "grad_norm": 1.525307297706604, + "learning_rate": 4.789737371908948e-05, + "loss": 5.5826, + "step": 22109 + }, + { + "epoch": 0.131494433342849, + "grad_norm": 1.516675353050232, + "learning_rate": 4.7897186213313824e-05, + "loss": 5.7384, + "step": 22110 + }, + { + "epoch": 0.13150038062612998, + "grad_norm": 1.3918993473052979, + "learning_rate": 4.7896998699545e-05, + "loss": 5.9798, + "step": 22111 + }, + { + "epoch": 0.13150632790941097, + "grad_norm": 1.7346227169036865, + "learning_rate": 4.789681117778307e-05, + "loss": 5.4939, + "step": 22112 + }, + { + "epoch": 0.13151227519269199, + "grad_norm": 1.784882664680481, + "learning_rate": 4.7896623648028094e-05, + "loss": 5.5369, + "step": 22113 + }, + { + "epoch": 0.13151822247597297, + "grad_norm": 1.5360532999038696, + "learning_rate": 4.789643611028015e-05, + "loss": 5.5539, + "step": 22114 + }, + { + "epoch": 0.13152416975925396, + "grad_norm": 1.3865541219711304, + "learning_rate": 4.789624856453929e-05, + "loss": 5.6192, + "step": 22115 + }, + { + "epoch": 0.13153011704253498, + "grad_norm": 1.8362021446228027, + "learning_rate": 4.7896061010805596e-05, + "loss": 5.6915, + "step": 22116 + }, + { + "epoch": 0.13153606432581597, + "grad_norm": 1.607771635055542, + "learning_rate": 4.789587344907911e-05, + "loss": 5.4442, + "step": 22117 + }, + { + "epoch": 0.13154201160909695, + "grad_norm": 1.5097888708114624, + "learning_rate": 4.789568587935992e-05, + "loss": 5.84, + "step": 22118 + }, + { + "epoch": 0.13154795889237797, + "grad_norm": 1.4404877424240112, + "learning_rate": 4.789549830164809e-05, + "loss": 5.7407, + "step": 22119 + }, + { + "epoch": 0.13155390617565896, + "grad_norm": 1.5682063102722168, + "learning_rate": 4.7895310715943665e-05, + "loss": 5.3026, + "step": 22120 + }, + { + "epoch": 0.13155985345893995, + "grad_norm": 1.6435290575027466, + "learning_rate": 4.789512312224672e-05, + "loss": 5.7749, + "step": 22121 + }, + { + "epoch": 0.13156580074222096, + "grad_norm": 1.7454910278320312, + "learning_rate": 4.7894935520557335e-05, + "loss": 5.5817, + "step": 22122 + }, + { + "epoch": 0.13157174802550195, + "grad_norm": 1.9168800115585327, + "learning_rate": 4.789474791087556e-05, + "loss": 4.3752, + "step": 22123 + }, + { + "epoch": 0.13157769530878294, + "grad_norm": 2.1051509380340576, + "learning_rate": 4.789456029320147e-05, + "loss": 3.6253, + "step": 22124 + }, + { + "epoch": 0.13158364259206395, + "grad_norm": 2.0902812480926514, + "learning_rate": 4.789437266753512e-05, + "loss": 4.039, + "step": 22125 + }, + { + "epoch": 0.13158958987534494, + "grad_norm": 1.804121971130371, + "learning_rate": 4.789418503387658e-05, + "loss": 3.6551, + "step": 22126 + }, + { + "epoch": 0.13159553715862593, + "grad_norm": 1.992370367050171, + "learning_rate": 4.789399739222592e-05, + "loss": 3.6387, + "step": 22127 + }, + { + "epoch": 0.13160148444190695, + "grad_norm": 2.0625061988830566, + "learning_rate": 4.7893809742583204e-05, + "loss": 3.943, + "step": 22128 + }, + { + "epoch": 0.13160743172518793, + "grad_norm": 2.021989107131958, + "learning_rate": 4.789362208494849e-05, + "loss": 4.0269, + "step": 22129 + }, + { + "epoch": 0.13161337900846892, + "grad_norm": 2.037161350250244, + "learning_rate": 4.7893434419321856e-05, + "loss": 5.3085, + "step": 22130 + }, + { + "epoch": 0.13161932629174994, + "grad_norm": 1.8836485147476196, + "learning_rate": 4.7893246745703355e-05, + "loss": 4.7337, + "step": 22131 + }, + { + "epoch": 0.13162527357503093, + "grad_norm": 1.5900107622146606, + "learning_rate": 4.789305906409306e-05, + "loss": 5.0772, + "step": 22132 + }, + { + "epoch": 0.13163122085831191, + "grad_norm": 1.627558946609497, + "learning_rate": 4.789287137449103e-05, + "loss": 5.1703, + "step": 22133 + }, + { + "epoch": 0.13163716814159293, + "grad_norm": 1.8517992496490479, + "learning_rate": 4.7892683676897344e-05, + "loss": 5.173, + "step": 22134 + }, + { + "epoch": 0.13164311542487392, + "grad_norm": 1.2436500787734985, + "learning_rate": 4.789249597131205e-05, + "loss": 4.956, + "step": 22135 + }, + { + "epoch": 0.1316490627081549, + "grad_norm": 1.5156265497207642, + "learning_rate": 4.789230825773523e-05, + "loss": 5.6121, + "step": 22136 + }, + { + "epoch": 0.13165500999143592, + "grad_norm": 1.3742187023162842, + "learning_rate": 4.789212053616694e-05, + "loss": 5.2186, + "step": 22137 + }, + { + "epoch": 0.1316609572747169, + "grad_norm": 1.3079794645309448, + "learning_rate": 4.7891932806607245e-05, + "loss": 5.4108, + "step": 22138 + }, + { + "epoch": 0.1316669045579979, + "grad_norm": 1.5291730165481567, + "learning_rate": 4.789174506905621e-05, + "loss": 5.1516, + "step": 22139 + }, + { + "epoch": 0.13167285184127892, + "grad_norm": 1.3465576171875, + "learning_rate": 4.7891557323513904e-05, + "loss": 4.9797, + "step": 22140 + }, + { + "epoch": 0.1316787991245599, + "grad_norm": 1.228513479232788, + "learning_rate": 4.789136956998039e-05, + "loss": 5.0119, + "step": 22141 + }, + { + "epoch": 0.1316847464078409, + "grad_norm": 1.4027810096740723, + "learning_rate": 4.789118180845574e-05, + "loss": 5.2781, + "step": 22142 + }, + { + "epoch": 0.1316906936911219, + "grad_norm": 1.371072769165039, + "learning_rate": 4.789099403894002e-05, + "loss": 5.1414, + "step": 22143 + }, + { + "epoch": 0.1316966409744029, + "grad_norm": 1.264255404472351, + "learning_rate": 4.7890806261433286e-05, + "loss": 4.9926, + "step": 22144 + }, + { + "epoch": 0.13170258825768388, + "grad_norm": 1.351501226425171, + "learning_rate": 4.78906184759356e-05, + "loss": 5.1473, + "step": 22145 + }, + { + "epoch": 0.1317085355409649, + "grad_norm": 1.4877911806106567, + "learning_rate": 4.7890430682447046e-05, + "loss": 5.2634, + "step": 22146 + }, + { + "epoch": 0.1317144828242459, + "grad_norm": 1.3446416854858398, + "learning_rate": 4.7890242880967675e-05, + "loss": 5.197, + "step": 22147 + }, + { + "epoch": 0.13172043010752688, + "grad_norm": 1.2246133089065552, + "learning_rate": 4.789005507149756e-05, + "loss": 5.1262, + "step": 22148 + }, + { + "epoch": 0.1317263773908079, + "grad_norm": 1.3092166185379028, + "learning_rate": 4.7889867254036755e-05, + "loss": 5.0157, + "step": 22149 + }, + { + "epoch": 0.13173232467408888, + "grad_norm": 1.3076307773590088, + "learning_rate": 4.788967942858534e-05, + "loss": 5.159, + "step": 22150 + }, + { + "epoch": 0.13173827195736987, + "grad_norm": 1.3207625150680542, + "learning_rate": 4.788949159514338e-05, + "loss": 5.1559, + "step": 22151 + }, + { + "epoch": 0.13174421924065088, + "grad_norm": 1.4235469102859497, + "learning_rate": 4.788930375371092e-05, + "loss": 4.9426, + "step": 22152 + }, + { + "epoch": 0.13175016652393187, + "grad_norm": 1.4294525384902954, + "learning_rate": 4.7889115904288054e-05, + "loss": 5.0116, + "step": 22153 + }, + { + "epoch": 0.13175611380721286, + "grad_norm": 1.3456943035125732, + "learning_rate": 4.788892804687483e-05, + "loss": 4.9962, + "step": 22154 + }, + { + "epoch": 0.13176206109049388, + "grad_norm": 1.368545651435852, + "learning_rate": 4.788874018147132e-05, + "loss": 5.1523, + "step": 22155 + }, + { + "epoch": 0.13176800837377486, + "grad_norm": 1.2844034433364868, + "learning_rate": 4.788855230807758e-05, + "loss": 4.879, + "step": 22156 + }, + { + "epoch": 0.13177395565705585, + "grad_norm": 1.3061450719833374, + "learning_rate": 4.788836442669369e-05, + "loss": 4.9011, + "step": 22157 + }, + { + "epoch": 0.13177990294033684, + "grad_norm": 1.4233042001724243, + "learning_rate": 4.788817653731971e-05, + "loss": 4.8821, + "step": 22158 + }, + { + "epoch": 0.13178585022361786, + "grad_norm": 1.4013172388076782, + "learning_rate": 4.788798863995569e-05, + "loss": 4.8431, + "step": 22159 + }, + { + "epoch": 0.13179179750689884, + "grad_norm": 1.2786699533462524, + "learning_rate": 4.7887800734601716e-05, + "loss": 4.6884, + "step": 22160 + }, + { + "epoch": 0.13179774479017983, + "grad_norm": 1.408245325088501, + "learning_rate": 4.7887612821257855e-05, + "loss": 5.2191, + "step": 22161 + }, + { + "epoch": 0.13180369207346085, + "grad_norm": 1.5876145362854004, + "learning_rate": 4.788742489992416e-05, + "loss": 5.459, + "step": 22162 + }, + { + "epoch": 0.13180963935674184, + "grad_norm": 1.4462308883666992, + "learning_rate": 4.7887236970600705e-05, + "loss": 5.2757, + "step": 22163 + }, + { + "epoch": 0.13181558664002282, + "grad_norm": 1.288514494895935, + "learning_rate": 4.7887049033287546e-05, + "loss": 5.1, + "step": 22164 + }, + { + "epoch": 0.13182153392330384, + "grad_norm": 1.387949824333191, + "learning_rate": 4.788686108798476e-05, + "loss": 4.9212, + "step": 22165 + }, + { + "epoch": 0.13182748120658483, + "grad_norm": 1.534636378288269, + "learning_rate": 4.7886673134692404e-05, + "loss": 4.7585, + "step": 22166 + }, + { + "epoch": 0.13183342848986582, + "grad_norm": 1.464815378189087, + "learning_rate": 4.788648517341054e-05, + "loss": 5.121, + "step": 22167 + }, + { + "epoch": 0.13183937577314683, + "grad_norm": 1.2842152118682861, + "learning_rate": 4.788629720413925e-05, + "loss": 5.1032, + "step": 22168 + }, + { + "epoch": 0.13184532305642782, + "grad_norm": 1.5626686811447144, + "learning_rate": 4.7886109226878595e-05, + "loss": 4.9001, + "step": 22169 + }, + { + "epoch": 0.1318512703397088, + "grad_norm": 1.4019660949707031, + "learning_rate": 4.788592124162863e-05, + "loss": 5.2157, + "step": 22170 + }, + { + "epoch": 0.13185721762298983, + "grad_norm": 1.1018543243408203, + "learning_rate": 4.788573324838942e-05, + "loss": 5.5623, + "step": 22171 + }, + { + "epoch": 0.1318631649062708, + "grad_norm": 1.4074633121490479, + "learning_rate": 4.788554524716105e-05, + "loss": 5.0306, + "step": 22172 + }, + { + "epoch": 0.1318691121895518, + "grad_norm": 1.4724953174591064, + "learning_rate": 4.788535723794356e-05, + "loss": 5.033, + "step": 22173 + }, + { + "epoch": 0.13187505947283282, + "grad_norm": 1.359288215637207, + "learning_rate": 4.788516922073703e-05, + "loss": 4.918, + "step": 22174 + }, + { + "epoch": 0.1318810067561138, + "grad_norm": 1.3733046054840088, + "learning_rate": 4.788498119554152e-05, + "loss": 4.9631, + "step": 22175 + }, + { + "epoch": 0.1318869540393948, + "grad_norm": 1.1926368474960327, + "learning_rate": 4.7884793162357114e-05, + "loss": 4.8628, + "step": 22176 + }, + { + "epoch": 0.1318929013226758, + "grad_norm": 1.1444061994552612, + "learning_rate": 4.788460512118386e-05, + "loss": 4.8978, + "step": 22177 + }, + { + "epoch": 0.1318988486059568, + "grad_norm": 1.3945989608764648, + "learning_rate": 4.7884417072021814e-05, + "loss": 4.9901, + "step": 22178 + }, + { + "epoch": 0.13190479588923779, + "grad_norm": 1.4278130531311035, + "learning_rate": 4.7884229014871063e-05, + "loss": 4.8705, + "step": 22179 + }, + { + "epoch": 0.1319107431725188, + "grad_norm": 1.4391251802444458, + "learning_rate": 4.788404094973167e-05, + "loss": 4.8575, + "step": 22180 + }, + { + "epoch": 0.1319166904557998, + "grad_norm": 1.435241460800171, + "learning_rate": 4.788385287660369e-05, + "loss": 4.8571, + "step": 22181 + }, + { + "epoch": 0.13192263773908078, + "grad_norm": 1.2841169834136963, + "learning_rate": 4.788366479548718e-05, + "loss": 4.8738, + "step": 22182 + }, + { + "epoch": 0.1319285850223618, + "grad_norm": 1.318769931793213, + "learning_rate": 4.7883476706382236e-05, + "loss": 5.1381, + "step": 22183 + }, + { + "epoch": 0.13193453230564278, + "grad_norm": 1.398940920829773, + "learning_rate": 4.78832886092889e-05, + "loss": 4.8094, + "step": 22184 + }, + { + "epoch": 0.13194047958892377, + "grad_norm": 1.373937726020813, + "learning_rate": 4.788310050420725e-05, + "loss": 5.0183, + "step": 22185 + }, + { + "epoch": 0.1319464268722048, + "grad_norm": 1.2899675369262695, + "learning_rate": 4.788291239113734e-05, + "loss": 5.3211, + "step": 22186 + }, + { + "epoch": 0.13195237415548577, + "grad_norm": 1.2992362976074219, + "learning_rate": 4.788272427007924e-05, + "loss": 5.2411, + "step": 22187 + }, + { + "epoch": 0.13195832143876676, + "grad_norm": 1.3528488874435425, + "learning_rate": 4.7882536141033025e-05, + "loss": 5.272, + "step": 22188 + }, + { + "epoch": 0.13196426872204778, + "grad_norm": 1.0530016422271729, + "learning_rate": 4.7882348003998746e-05, + "loss": 5.1516, + "step": 22189 + }, + { + "epoch": 0.13197021600532877, + "grad_norm": 1.3447175025939941, + "learning_rate": 4.7882159858976486e-05, + "loss": 5.0007, + "step": 22190 + }, + { + "epoch": 0.13197616328860975, + "grad_norm": 1.531227946281433, + "learning_rate": 4.788197170596629e-05, + "loss": 5.0506, + "step": 22191 + }, + { + "epoch": 0.13198211057189077, + "grad_norm": 1.3458744287490845, + "learning_rate": 4.788178354496823e-05, + "loss": 4.931, + "step": 22192 + }, + { + "epoch": 0.13198805785517176, + "grad_norm": 1.380890965461731, + "learning_rate": 4.788159537598239e-05, + "loss": 5.2813, + "step": 22193 + }, + { + "epoch": 0.13199400513845275, + "grad_norm": 1.387640118598938, + "learning_rate": 4.788140719900881e-05, + "loss": 5.1234, + "step": 22194 + }, + { + "epoch": 0.13199995242173376, + "grad_norm": 1.304620623588562, + "learning_rate": 4.788121901404757e-05, + "loss": 4.988, + "step": 22195 + }, + { + "epoch": 0.13200589970501475, + "grad_norm": 1.3828579187393188, + "learning_rate": 4.7881030821098736e-05, + "loss": 5.2552, + "step": 22196 + }, + { + "epoch": 0.13201184698829574, + "grad_norm": 1.4819931983947754, + "learning_rate": 4.788084262016237e-05, + "loss": 4.9094, + "step": 22197 + }, + { + "epoch": 0.13201779427157675, + "grad_norm": 1.4570109844207764, + "learning_rate": 4.788065441123853e-05, + "loss": 5.0518, + "step": 22198 + }, + { + "epoch": 0.13202374155485774, + "grad_norm": 1.4303123950958252, + "learning_rate": 4.7880466194327305e-05, + "loss": 4.773, + "step": 22199 + }, + { + "epoch": 0.13202968883813873, + "grad_norm": 1.5727583169937134, + "learning_rate": 4.788027796942874e-05, + "loss": 4.458, + "step": 22200 + }, + { + "epoch": 0.13203563612141975, + "grad_norm": 1.5693985223770142, + "learning_rate": 4.78800897365429e-05, + "loss": 4.4378, + "step": 22201 + }, + { + "epoch": 0.13204158340470074, + "grad_norm": 1.4328757524490356, + "learning_rate": 4.787990149566987e-05, + "loss": 4.3503, + "step": 22202 + }, + { + "epoch": 0.13204753068798172, + "grad_norm": 1.4490034580230713, + "learning_rate": 4.787971324680969e-05, + "loss": 4.3476, + "step": 22203 + }, + { + "epoch": 0.13205347797126274, + "grad_norm": 1.4600367546081543, + "learning_rate": 4.7879524989962446e-05, + "loss": 4.3052, + "step": 22204 + }, + { + "epoch": 0.13205942525454373, + "grad_norm": 1.5479463338851929, + "learning_rate": 4.787933672512819e-05, + "loss": 4.3291, + "step": 22205 + }, + { + "epoch": 0.13206537253782472, + "grad_norm": 1.6317998170852661, + "learning_rate": 4.7879148452306986e-05, + "loss": 4.2697, + "step": 22206 + }, + { + "epoch": 0.13207131982110573, + "grad_norm": 1.5387004613876343, + "learning_rate": 4.787896017149892e-05, + "loss": 4.3413, + "step": 22207 + }, + { + "epoch": 0.13207726710438672, + "grad_norm": 1.5556374788284302, + "learning_rate": 4.7878771882704046e-05, + "loss": 4.2002, + "step": 22208 + }, + { + "epoch": 0.1320832143876677, + "grad_norm": 1.626752495765686, + "learning_rate": 4.787858358592243e-05, + "loss": 4.2729, + "step": 22209 + }, + { + "epoch": 0.13208916167094872, + "grad_norm": 1.3982586860656738, + "learning_rate": 4.7878395281154134e-05, + "loss": 4.2138, + "step": 22210 + }, + { + "epoch": 0.1320951089542297, + "grad_norm": 1.5739530324935913, + "learning_rate": 4.787820696839922e-05, + "loss": 4.1526, + "step": 22211 + }, + { + "epoch": 0.1321010562375107, + "grad_norm": 1.458217978477478, + "learning_rate": 4.787801864765777e-05, + "loss": 4.2584, + "step": 22212 + }, + { + "epoch": 0.13210700352079172, + "grad_norm": 1.4696205854415894, + "learning_rate": 4.787783031892984e-05, + "loss": 4.2042, + "step": 22213 + }, + { + "epoch": 0.1321129508040727, + "grad_norm": 1.729152798652649, + "learning_rate": 4.7877641982215485e-05, + "loss": 4.4817, + "step": 22214 + }, + { + "epoch": 0.1321188980873537, + "grad_norm": 1.7412737607955933, + "learning_rate": 4.787745363751479e-05, + "loss": 4.4568, + "step": 22215 + }, + { + "epoch": 0.13212484537063468, + "grad_norm": 1.6463770866394043, + "learning_rate": 4.787726528482781e-05, + "loss": 4.4503, + "step": 22216 + }, + { + "epoch": 0.1321307926539157, + "grad_norm": 1.5496896505355835, + "learning_rate": 4.7877076924154617e-05, + "loss": 4.3863, + "step": 22217 + }, + { + "epoch": 0.13213673993719668, + "grad_norm": 1.6521345376968384, + "learning_rate": 4.787688855549527e-05, + "loss": 4.3847, + "step": 22218 + }, + { + "epoch": 0.13214268722047767, + "grad_norm": 1.6477288007736206, + "learning_rate": 4.7876700178849836e-05, + "loss": 4.3939, + "step": 22219 + }, + { + "epoch": 0.1321486345037587, + "grad_norm": 1.6795778274536133, + "learning_rate": 4.787651179421838e-05, + "loss": 4.1722, + "step": 22220 + }, + { + "epoch": 0.13215458178703968, + "grad_norm": 1.5795823335647583, + "learning_rate": 4.787632340160098e-05, + "loss": 4.2125, + "step": 22221 + }, + { + "epoch": 0.13216052907032066, + "grad_norm": 1.6583930253982544, + "learning_rate": 4.7876135000997686e-05, + "loss": 4.2013, + "step": 22222 + }, + { + "epoch": 0.13216647635360168, + "grad_norm": 1.4495878219604492, + "learning_rate": 4.7875946592408575e-05, + "loss": 4.1335, + "step": 22223 + }, + { + "epoch": 0.13217242363688267, + "grad_norm": 1.5657227039337158, + "learning_rate": 4.78757581758337e-05, + "loss": 4.1514, + "step": 22224 + }, + { + "epoch": 0.13217837092016366, + "grad_norm": 1.7183332443237305, + "learning_rate": 4.787556975127313e-05, + "loss": 4.7715, + "step": 22225 + }, + { + "epoch": 0.13218431820344467, + "grad_norm": 2.1822710037231445, + "learning_rate": 4.7875381318726945e-05, + "loss": 4.9383, + "step": 22226 + }, + { + "epoch": 0.13219026548672566, + "grad_norm": 1.9633662700653076, + "learning_rate": 4.787519287819519e-05, + "loss": 4.9601, + "step": 22227 + }, + { + "epoch": 0.13219621277000665, + "grad_norm": 1.6858619451522827, + "learning_rate": 4.787500442967795e-05, + "loss": 5.0091, + "step": 22228 + }, + { + "epoch": 0.13220216005328767, + "grad_norm": 1.5447601079940796, + "learning_rate": 4.787481597317528e-05, + "loss": 4.8372, + "step": 22229 + }, + { + "epoch": 0.13220810733656865, + "grad_norm": 1.4934616088867188, + "learning_rate": 4.787462750868725e-05, + "loss": 4.9812, + "step": 22230 + }, + { + "epoch": 0.13221405461984964, + "grad_norm": 1.4039883613586426, + "learning_rate": 4.787443903621393e-05, + "loss": 4.829, + "step": 22231 + }, + { + "epoch": 0.13222000190313066, + "grad_norm": 1.5184186697006226, + "learning_rate": 4.787425055575536e-05, + "loss": 4.8379, + "step": 22232 + }, + { + "epoch": 0.13222594918641165, + "grad_norm": 1.3783762454986572, + "learning_rate": 4.787406206731164e-05, + "loss": 4.9209, + "step": 22233 + }, + { + "epoch": 0.13223189646969263, + "grad_norm": 1.360772967338562, + "learning_rate": 4.787387357088282e-05, + "loss": 5.0036, + "step": 22234 + }, + { + "epoch": 0.13223784375297365, + "grad_norm": 1.4753018617630005, + "learning_rate": 4.787368506646897e-05, + "loss": 5.3268, + "step": 22235 + }, + { + "epoch": 0.13224379103625464, + "grad_norm": 1.3295317888259888, + "learning_rate": 4.787349655407014e-05, + "loss": 5.3096, + "step": 22236 + }, + { + "epoch": 0.13224973831953563, + "grad_norm": 1.4120566844940186, + "learning_rate": 4.787330803368642e-05, + "loss": 4.9041, + "step": 22237 + }, + { + "epoch": 0.13225568560281664, + "grad_norm": 1.3822401762008667, + "learning_rate": 4.787311950531787e-05, + "loss": 5.0089, + "step": 22238 + }, + { + "epoch": 0.13226163288609763, + "grad_norm": 1.0574642419815063, + "learning_rate": 4.7872930968964535e-05, + "loss": 5.528, + "step": 22239 + }, + { + "epoch": 0.13226758016937862, + "grad_norm": 1.4523993730545044, + "learning_rate": 4.78727424246265e-05, + "loss": 5.3844, + "step": 22240 + }, + { + "epoch": 0.13227352745265963, + "grad_norm": 1.283956527709961, + "learning_rate": 4.787255387230383e-05, + "loss": 5.226, + "step": 22241 + }, + { + "epoch": 0.13227947473594062, + "grad_norm": 1.621275782585144, + "learning_rate": 4.7872365311996594e-05, + "loss": 4.7797, + "step": 22242 + }, + { + "epoch": 0.1322854220192216, + "grad_norm": 1.327376365661621, + "learning_rate": 4.787217674370484e-05, + "loss": 4.9057, + "step": 22243 + }, + { + "epoch": 0.13229136930250263, + "grad_norm": 1.5311939716339111, + "learning_rate": 4.787198816742865e-05, + "loss": 5.0076, + "step": 22244 + }, + { + "epoch": 0.13229731658578361, + "grad_norm": 1.3926832675933838, + "learning_rate": 4.7871799583168085e-05, + "loss": 4.9328, + "step": 22245 + }, + { + "epoch": 0.1323032638690646, + "grad_norm": 1.2381867170333862, + "learning_rate": 4.787161099092321e-05, + "loss": 5.1678, + "step": 22246 + }, + { + "epoch": 0.13230921115234562, + "grad_norm": 1.1969068050384521, + "learning_rate": 4.78714223906941e-05, + "loss": 5.5106, + "step": 22247 + }, + { + "epoch": 0.1323151584356266, + "grad_norm": 1.2368844747543335, + "learning_rate": 4.7871233782480804e-05, + "loss": 5.4105, + "step": 22248 + }, + { + "epoch": 0.1323211057189076, + "grad_norm": 1.45974862575531, + "learning_rate": 4.78710451662834e-05, + "loss": 4.9328, + "step": 22249 + }, + { + "epoch": 0.1323270530021886, + "grad_norm": 1.2457060813903809, + "learning_rate": 4.787085654210195e-05, + "loss": 5.225, + "step": 22250 + }, + { + "epoch": 0.1323330002854696, + "grad_norm": 1.4274303913116455, + "learning_rate": 4.787066790993652e-05, + "loss": 4.8785, + "step": 22251 + }, + { + "epoch": 0.1323389475687506, + "grad_norm": 1.3072400093078613, + "learning_rate": 4.7870479269787174e-05, + "loss": 4.871, + "step": 22252 + }, + { + "epoch": 0.1323448948520316, + "grad_norm": 1.2442991733551025, + "learning_rate": 4.787029062165398e-05, + "loss": 4.8374, + "step": 22253 + }, + { + "epoch": 0.1323508421353126, + "grad_norm": 1.3584920167922974, + "learning_rate": 4.787010196553701e-05, + "loss": 5.2427, + "step": 22254 + }, + { + "epoch": 0.13235678941859358, + "grad_norm": 1.560067892074585, + "learning_rate": 4.786991330143632e-05, + "loss": 4.8689, + "step": 22255 + }, + { + "epoch": 0.1323627367018746, + "grad_norm": 1.3197054862976074, + "learning_rate": 4.786972462935198e-05, + "loss": 4.8326, + "step": 22256 + }, + { + "epoch": 0.13236868398515558, + "grad_norm": 1.2790191173553467, + "learning_rate": 4.786953594928405e-05, + "loss": 4.7454, + "step": 22257 + }, + { + "epoch": 0.13237463126843657, + "grad_norm": 1.6187344789505005, + "learning_rate": 4.7869347261232606e-05, + "loss": 5.5456, + "step": 22258 + }, + { + "epoch": 0.1323805785517176, + "grad_norm": 1.3327410221099854, + "learning_rate": 4.786915856519771e-05, + "loss": 4.834, + "step": 22259 + }, + { + "epoch": 0.13238652583499858, + "grad_norm": 1.2602509260177612, + "learning_rate": 4.786896986117943e-05, + "loss": 5.1677, + "step": 22260 + }, + { + "epoch": 0.13239247311827956, + "grad_norm": 1.4382299184799194, + "learning_rate": 4.786878114917782e-05, + "loss": 5.0591, + "step": 22261 + }, + { + "epoch": 0.13239842040156058, + "grad_norm": 1.4061304330825806, + "learning_rate": 4.786859242919296e-05, + "loss": 5.0161, + "step": 22262 + }, + { + "epoch": 0.13240436768484157, + "grad_norm": 1.4143967628479004, + "learning_rate": 4.7868403701224905e-05, + "loss": 4.7625, + "step": 22263 + }, + { + "epoch": 0.13241031496812256, + "grad_norm": 1.4221394062042236, + "learning_rate": 4.786821496527374e-05, + "loss": 4.8579, + "step": 22264 + }, + { + "epoch": 0.13241626225140357, + "grad_norm": 1.3852332830429077, + "learning_rate": 4.78680262213395e-05, + "loss": 4.6081, + "step": 22265 + }, + { + "epoch": 0.13242220953468456, + "grad_norm": 1.2698066234588623, + "learning_rate": 4.786783746942228e-05, + "loss": 4.7903, + "step": 22266 + }, + { + "epoch": 0.13242815681796555, + "grad_norm": 1.2313082218170166, + "learning_rate": 4.7867648709522136e-05, + "loss": 4.8353, + "step": 22267 + }, + { + "epoch": 0.13243410410124656, + "grad_norm": 1.3578218221664429, + "learning_rate": 4.7867459941639124e-05, + "loss": 5.2778, + "step": 22268 + }, + { + "epoch": 0.13244005138452755, + "grad_norm": 1.5034034252166748, + "learning_rate": 4.786727116577332e-05, + "loss": 5.2208, + "step": 22269 + }, + { + "epoch": 0.13244599866780854, + "grad_norm": 1.621207356452942, + "learning_rate": 4.786708238192479e-05, + "loss": 4.8394, + "step": 22270 + }, + { + "epoch": 0.13245194595108956, + "grad_norm": 1.471311092376709, + "learning_rate": 4.7866893590093595e-05, + "loss": 4.8942, + "step": 22271 + }, + { + "epoch": 0.13245789323437054, + "grad_norm": 1.3276898860931396, + "learning_rate": 4.7866704790279806e-05, + "loss": 4.833, + "step": 22272 + }, + { + "epoch": 0.13246384051765153, + "grad_norm": 1.484650731086731, + "learning_rate": 4.786651598248349e-05, + "loss": 5.0415, + "step": 22273 + }, + { + "epoch": 0.13246978780093252, + "grad_norm": 1.3327105045318604, + "learning_rate": 4.7866327166704703e-05, + "loss": 5.2227, + "step": 22274 + }, + { + "epoch": 0.13247573508421354, + "grad_norm": 1.4387754201889038, + "learning_rate": 4.7866138342943525e-05, + "loss": 5.1764, + "step": 22275 + }, + { + "epoch": 0.13248168236749452, + "grad_norm": 1.3406511545181274, + "learning_rate": 4.786594951120001e-05, + "loss": 5.2711, + "step": 22276 + }, + { + "epoch": 0.1324876296507755, + "grad_norm": 1.3859505653381348, + "learning_rate": 4.7865760671474224e-05, + "loss": 5.1102, + "step": 22277 + }, + { + "epoch": 0.13249357693405653, + "grad_norm": 1.517545461654663, + "learning_rate": 4.7865571823766245e-05, + "loss": 5.1275, + "step": 22278 + }, + { + "epoch": 0.13249952421733752, + "grad_norm": 1.720278263092041, + "learning_rate": 4.7865382968076125e-05, + "loss": 5.0902, + "step": 22279 + }, + { + "epoch": 0.1325054715006185, + "grad_norm": 1.543717622756958, + "learning_rate": 4.786519410440394e-05, + "loss": 5.1094, + "step": 22280 + }, + { + "epoch": 0.13251141878389952, + "grad_norm": 1.2068023681640625, + "learning_rate": 4.786500523274975e-05, + "loss": 5.1791, + "step": 22281 + }, + { + "epoch": 0.1325173660671805, + "grad_norm": 1.426169991493225, + "learning_rate": 4.786481635311362e-05, + "loss": 5.2155, + "step": 22282 + }, + { + "epoch": 0.1325233133504615, + "grad_norm": 1.4624898433685303, + "learning_rate": 4.7864627465495626e-05, + "loss": 4.8741, + "step": 22283 + }, + { + "epoch": 0.1325292606337425, + "grad_norm": 1.2942382097244263, + "learning_rate": 4.786443856989582e-05, + "loss": 5.4888, + "step": 22284 + }, + { + "epoch": 0.1325352079170235, + "grad_norm": 1.2372108697891235, + "learning_rate": 4.786424966631428e-05, + "loss": 5.1907, + "step": 22285 + }, + { + "epoch": 0.1325411552003045, + "grad_norm": 1.368546962738037, + "learning_rate": 4.7864060754751064e-05, + "loss": 5.1653, + "step": 22286 + }, + { + "epoch": 0.1325471024835855, + "grad_norm": 1.6052632331848145, + "learning_rate": 4.786387183520624e-05, + "loss": 5.2139, + "step": 22287 + }, + { + "epoch": 0.1325530497668665, + "grad_norm": 1.4893959760665894, + "learning_rate": 4.7863682907679874e-05, + "loss": 4.9972, + "step": 22288 + }, + { + "epoch": 0.13255899705014748, + "grad_norm": 1.370919942855835, + "learning_rate": 4.786349397217204e-05, + "loss": 5.315, + "step": 22289 + }, + { + "epoch": 0.1325649443334285, + "grad_norm": 1.7138948440551758, + "learning_rate": 4.786330502868279e-05, + "loss": 5.4063, + "step": 22290 + }, + { + "epoch": 0.13257089161670949, + "grad_norm": 1.4117851257324219, + "learning_rate": 4.786311607721219e-05, + "loss": 5.3601, + "step": 22291 + }, + { + "epoch": 0.13257683889999047, + "grad_norm": 2.5631167888641357, + "learning_rate": 4.786292711776033e-05, + "loss": 3.8547, + "step": 22292 + }, + { + "epoch": 0.1325827861832715, + "grad_norm": 2.4507203102111816, + "learning_rate": 4.786273815032724e-05, + "loss": 3.9096, + "step": 22293 + }, + { + "epoch": 0.13258873346655248, + "grad_norm": 2.384136915206909, + "learning_rate": 4.7862549174913014e-05, + "loss": 4.0437, + "step": 22294 + }, + { + "epoch": 0.13259468074983347, + "grad_norm": 2.215449094772339, + "learning_rate": 4.786236019151771e-05, + "loss": 3.9703, + "step": 22295 + }, + { + "epoch": 0.13260062803311448, + "grad_norm": 2.1639139652252197, + "learning_rate": 4.786217120014138e-05, + "loss": 3.5108, + "step": 22296 + }, + { + "epoch": 0.13260657531639547, + "grad_norm": 2.2001569271087646, + "learning_rate": 4.786198220078412e-05, + "loss": 3.3189, + "step": 22297 + }, + { + "epoch": 0.13261252259967646, + "grad_norm": 2.1637179851531982, + "learning_rate": 4.7861793193445964e-05, + "loss": 3.3301, + "step": 22298 + }, + { + "epoch": 0.13261846988295747, + "grad_norm": 2.12546443939209, + "learning_rate": 4.7861604178127e-05, + "loss": 3.4002, + "step": 22299 + }, + { + "epoch": 0.13262441716623846, + "grad_norm": 1.632663369178772, + "learning_rate": 4.7861415154827285e-05, + "loss": 5.6516, + "step": 22300 + }, + { + "epoch": 0.13263036444951945, + "grad_norm": 1.6801213026046753, + "learning_rate": 4.786122612354688e-05, + "loss": 5.5013, + "step": 22301 + }, + { + "epoch": 0.13263631173280047, + "grad_norm": 1.5306708812713623, + "learning_rate": 4.7861037084285866e-05, + "loss": 5.6885, + "step": 22302 + }, + { + "epoch": 0.13264225901608145, + "grad_norm": 1.553322196006775, + "learning_rate": 4.7860848037044294e-05, + "loss": 5.499, + "step": 22303 + }, + { + "epoch": 0.13264820629936244, + "grad_norm": 1.5508325099945068, + "learning_rate": 4.7860658981822234e-05, + "loss": 5.522, + "step": 22304 + }, + { + "epoch": 0.13265415358264346, + "grad_norm": 1.4522117376327515, + "learning_rate": 4.786046991861976e-05, + "loss": 5.616, + "step": 22305 + }, + { + "epoch": 0.13266010086592445, + "grad_norm": 1.5596072673797607, + "learning_rate": 4.7860280847436926e-05, + "loss": 5.5323, + "step": 22306 + }, + { + "epoch": 0.13266604814920543, + "grad_norm": 1.8776074647903442, + "learning_rate": 4.7860091768273806e-05, + "loss": 5.4604, + "step": 22307 + }, + { + "epoch": 0.13267199543248645, + "grad_norm": 1.97171151638031, + "learning_rate": 4.785990268113048e-05, + "loss": 5.2305, + "step": 22308 + }, + { + "epoch": 0.13267794271576744, + "grad_norm": 1.35499107837677, + "learning_rate": 4.785971358600698e-05, + "loss": 4.8288, + "step": 22309 + }, + { + "epoch": 0.13268388999904843, + "grad_norm": 1.5026946067810059, + "learning_rate": 4.785952448290339e-05, + "loss": 4.6641, + "step": 22310 + }, + { + "epoch": 0.13268983728232944, + "grad_norm": 1.6728490591049194, + "learning_rate": 4.785933537181978e-05, + "loss": 4.8855, + "step": 22311 + }, + { + "epoch": 0.13269578456561043, + "grad_norm": 1.834144115447998, + "learning_rate": 4.7859146252756213e-05, + "loss": 4.5688, + "step": 22312 + }, + { + "epoch": 0.13270173184889142, + "grad_norm": 2.314073085784912, + "learning_rate": 4.7858957125712753e-05, + "loss": 5.3503, + "step": 22313 + }, + { + "epoch": 0.13270767913217243, + "grad_norm": 1.7270644903182983, + "learning_rate": 4.785876799068947e-05, + "loss": 5.6763, + "step": 22314 + }, + { + "epoch": 0.13271362641545342, + "grad_norm": 1.929304599761963, + "learning_rate": 4.785857884768643e-05, + "loss": 5.1659, + "step": 22315 + }, + { + "epoch": 0.1327195736987344, + "grad_norm": 1.8507132530212402, + "learning_rate": 4.785838969670369e-05, + "loss": 5.0806, + "step": 22316 + }, + { + "epoch": 0.13272552098201543, + "grad_norm": 1.6761378049850464, + "learning_rate": 4.785820053774133e-05, + "loss": 5.2008, + "step": 22317 + }, + { + "epoch": 0.13273146826529642, + "grad_norm": 1.521119475364685, + "learning_rate": 4.785801137079939e-05, + "loss": 5.0448, + "step": 22318 + }, + { + "epoch": 0.1327374155485774, + "grad_norm": 1.6237796545028687, + "learning_rate": 4.785782219587797e-05, + "loss": 5.0451, + "step": 22319 + }, + { + "epoch": 0.13274336283185842, + "grad_norm": 1.4166826009750366, + "learning_rate": 4.785763301297712e-05, + "loss": 5.0055, + "step": 22320 + }, + { + "epoch": 0.1327493101151394, + "grad_norm": 1.7093290090560913, + "learning_rate": 4.7857443822096905e-05, + "loss": 4.9528, + "step": 22321 + }, + { + "epoch": 0.1327552573984204, + "grad_norm": 1.7715668678283691, + "learning_rate": 4.785725462323739e-05, + "loss": 5.1638, + "step": 22322 + }, + { + "epoch": 0.1327612046817014, + "grad_norm": 1.8321062326431274, + "learning_rate": 4.785706541639865e-05, + "loss": 5.1916, + "step": 22323 + }, + { + "epoch": 0.1327671519649824, + "grad_norm": 1.6878079175949097, + "learning_rate": 4.7856876201580736e-05, + "loss": 5.1106, + "step": 22324 + }, + { + "epoch": 0.1327730992482634, + "grad_norm": 1.5275590419769287, + "learning_rate": 4.7856686978783725e-05, + "loss": 5.1073, + "step": 22325 + }, + { + "epoch": 0.1327790465315444, + "grad_norm": 1.6648119688034058, + "learning_rate": 4.7856497748007684e-05, + "loss": 5.3244, + "step": 22326 + }, + { + "epoch": 0.1327849938148254, + "grad_norm": 1.693325400352478, + "learning_rate": 4.7856308509252674e-05, + "loss": 5.596, + "step": 22327 + }, + { + "epoch": 0.13279094109810638, + "grad_norm": 2.6629621982574463, + "learning_rate": 4.785611926251876e-05, + "loss": 4.1305, + "step": 22328 + }, + { + "epoch": 0.1327968883813874, + "grad_norm": 2.4292843341827393, + "learning_rate": 4.785593000780602e-05, + "loss": 4.5656, + "step": 22329 + }, + { + "epoch": 0.13280283566466838, + "grad_norm": 1.5317484140396118, + "learning_rate": 4.78557407451145e-05, + "loss": 5.6828, + "step": 22330 + }, + { + "epoch": 0.13280878294794937, + "grad_norm": 1.59109365940094, + "learning_rate": 4.7855551474444285e-05, + "loss": 5.7914, + "step": 22331 + }, + { + "epoch": 0.13281473023123036, + "grad_norm": 1.359665036201477, + "learning_rate": 4.7855362195795425e-05, + "loss": 5.6294, + "step": 22332 + }, + { + "epoch": 0.13282067751451138, + "grad_norm": 1.327269196510315, + "learning_rate": 4.7855172909168003e-05, + "loss": 5.7178, + "step": 22333 + }, + { + "epoch": 0.13282662479779236, + "grad_norm": 1.4080103635787964, + "learning_rate": 4.785498361456207e-05, + "loss": 5.8786, + "step": 22334 + }, + { + "epoch": 0.13283257208107335, + "grad_norm": 1.393926978111267, + "learning_rate": 4.78547943119777e-05, + "loss": 5.4177, + "step": 22335 + }, + { + "epoch": 0.13283851936435437, + "grad_norm": 1.6050227880477905, + "learning_rate": 4.785460500141495e-05, + "loss": 5.5235, + "step": 22336 + }, + { + "epoch": 0.13284446664763536, + "grad_norm": 1.5462367534637451, + "learning_rate": 4.785441568287391e-05, + "loss": 6.1101, + "step": 22337 + }, + { + "epoch": 0.13285041393091634, + "grad_norm": 1.5062382221221924, + "learning_rate": 4.785422635635462e-05, + "loss": 5.8075, + "step": 22338 + }, + { + "epoch": 0.13285636121419736, + "grad_norm": 1.7419465780258179, + "learning_rate": 4.785403702185716e-05, + "loss": 5.8189, + "step": 22339 + }, + { + "epoch": 0.13286230849747835, + "grad_norm": 1.754164218902588, + "learning_rate": 4.785384767938158e-05, + "loss": 5.6446, + "step": 22340 + }, + { + "epoch": 0.13286825578075934, + "grad_norm": 1.3769707679748535, + "learning_rate": 4.785365832892797e-05, + "loss": 5.7689, + "step": 22341 + }, + { + "epoch": 0.13287420306404035, + "grad_norm": 1.6358861923217773, + "learning_rate": 4.7853468970496386e-05, + "loss": 5.4568, + "step": 22342 + }, + { + "epoch": 0.13288015034732134, + "grad_norm": 1.567083477973938, + "learning_rate": 4.7853279604086883e-05, + "loss": 5.4124, + "step": 22343 + }, + { + "epoch": 0.13288609763060233, + "grad_norm": 1.3793751001358032, + "learning_rate": 4.785309022969954e-05, + "loss": 5.5976, + "step": 22344 + }, + { + "epoch": 0.13289204491388334, + "grad_norm": 1.5371218919754028, + "learning_rate": 4.7852900847334414e-05, + "loss": 5.2898, + "step": 22345 + }, + { + "epoch": 0.13289799219716433, + "grad_norm": 2.1502809524536133, + "learning_rate": 4.785271145699158e-05, + "loss": 4.1536, + "step": 22346 + }, + { + "epoch": 0.13290393948044532, + "grad_norm": 1.9648473262786865, + "learning_rate": 4.785252205867111e-05, + "loss": 4.1755, + "step": 22347 + }, + { + "epoch": 0.13290988676372634, + "grad_norm": 1.874877691268921, + "learning_rate": 4.785233265237305e-05, + "loss": 4.1043, + "step": 22348 + }, + { + "epoch": 0.13291583404700733, + "grad_norm": 1.924109935760498, + "learning_rate": 4.785214323809748e-05, + "loss": 4.0551, + "step": 22349 + }, + { + "epoch": 0.1329217813302883, + "grad_norm": 1.8653898239135742, + "learning_rate": 4.785195381584446e-05, + "loss": 4.0712, + "step": 22350 + }, + { + "epoch": 0.13292772861356933, + "grad_norm": 1.8480240106582642, + "learning_rate": 4.785176438561406e-05, + "loss": 4.0729, + "step": 22351 + }, + { + "epoch": 0.13293367589685032, + "grad_norm": 1.7229113578796387, + "learning_rate": 4.785157494740635e-05, + "loss": 3.9822, + "step": 22352 + }, + { + "epoch": 0.1329396231801313, + "grad_norm": 1.9756056070327759, + "learning_rate": 4.7851385501221385e-05, + "loss": 3.8667, + "step": 22353 + }, + { + "epoch": 0.13294557046341232, + "grad_norm": 1.9121302366256714, + "learning_rate": 4.785119604705924e-05, + "loss": 4.0157, + "step": 22354 + }, + { + "epoch": 0.1329515177466933, + "grad_norm": 1.999444842338562, + "learning_rate": 4.785100658491998e-05, + "loss": 4.0511, + "step": 22355 + }, + { + "epoch": 0.1329574650299743, + "grad_norm": 1.8992079496383667, + "learning_rate": 4.785081711480367e-05, + "loss": 3.9595, + "step": 22356 + }, + { + "epoch": 0.1329634123132553, + "grad_norm": 1.8835148811340332, + "learning_rate": 4.785062763671037e-05, + "loss": 3.9891, + "step": 22357 + }, + { + "epoch": 0.1329693595965363, + "grad_norm": 1.8938409090042114, + "learning_rate": 4.785043815064015e-05, + "loss": 3.927, + "step": 22358 + }, + { + "epoch": 0.1329753068798173, + "grad_norm": 1.8824357986450195, + "learning_rate": 4.785024865659309e-05, + "loss": 4.0438, + "step": 22359 + }, + { + "epoch": 0.1329812541630983, + "grad_norm": 1.9158250093460083, + "learning_rate": 4.785005915456924e-05, + "loss": 4.0448, + "step": 22360 + }, + { + "epoch": 0.1329872014463793, + "grad_norm": 1.7421679496765137, + "learning_rate": 4.784986964456867e-05, + "loss": 3.9869, + "step": 22361 + }, + { + "epoch": 0.13299314872966028, + "grad_norm": 1.7917057275772095, + "learning_rate": 4.784968012659145e-05, + "loss": 3.9976, + "step": 22362 + }, + { + "epoch": 0.1329990960129413, + "grad_norm": 1.9387284517288208, + "learning_rate": 4.784949060063764e-05, + "loss": 4.3383, + "step": 22363 + }, + { + "epoch": 0.1330050432962223, + "grad_norm": 2.60548996925354, + "learning_rate": 4.78493010667073e-05, + "loss": 4.5527, + "step": 22364 + }, + { + "epoch": 0.13301099057950327, + "grad_norm": 2.440361976623535, + "learning_rate": 4.784911152480051e-05, + "loss": 4.7931, + "step": 22365 + }, + { + "epoch": 0.1330169378627843, + "grad_norm": 2.4233226776123047, + "learning_rate": 4.784892197491734e-05, + "loss": 4.5482, + "step": 22366 + }, + { + "epoch": 0.13302288514606528, + "grad_norm": 2.3421928882598877, + "learning_rate": 4.7848732417057836e-05, + "loss": 4.6708, + "step": 22367 + }, + { + "epoch": 0.13302883242934627, + "grad_norm": 1.9476850032806396, + "learning_rate": 4.784854285122208e-05, + "loss": 4.5518, + "step": 22368 + }, + { + "epoch": 0.13303477971262728, + "grad_norm": 2.015965223312378, + "learning_rate": 4.784835327741013e-05, + "loss": 4.5258, + "step": 22369 + }, + { + "epoch": 0.13304072699590827, + "grad_norm": 2.28434157371521, + "learning_rate": 4.784816369562206e-05, + "loss": 4.6413, + "step": 22370 + }, + { + "epoch": 0.13304667427918926, + "grad_norm": 1.9141323566436768, + "learning_rate": 4.784797410585794e-05, + "loss": 4.7134, + "step": 22371 + }, + { + "epoch": 0.13305262156247027, + "grad_norm": 2.2627341747283936, + "learning_rate": 4.7847784508117815e-05, + "loss": 4.512, + "step": 22372 + }, + { + "epoch": 0.13305856884575126, + "grad_norm": 2.2111268043518066, + "learning_rate": 4.784759490240177e-05, + "loss": 4.6105, + "step": 22373 + }, + { + "epoch": 0.13306451612903225, + "grad_norm": 2.4321610927581787, + "learning_rate": 4.7847405288709864e-05, + "loss": 5.1333, + "step": 22374 + }, + { + "epoch": 0.13307046341231327, + "grad_norm": 2.49605131149292, + "learning_rate": 4.7847215667042165e-05, + "loss": 5.2355, + "step": 22375 + }, + { + "epoch": 0.13307641069559425, + "grad_norm": 2.2517080307006836, + "learning_rate": 4.784702603739874e-05, + "loss": 5.3007, + "step": 22376 + }, + { + "epoch": 0.13308235797887524, + "grad_norm": 1.807502269744873, + "learning_rate": 4.784683639977966e-05, + "loss": 5.2645, + "step": 22377 + }, + { + "epoch": 0.13308830526215626, + "grad_norm": 1.9133596420288086, + "learning_rate": 4.784664675418497e-05, + "loss": 5.3313, + "step": 22378 + }, + { + "epoch": 0.13309425254543725, + "grad_norm": 1.823691725730896, + "learning_rate": 4.7846457100614774e-05, + "loss": 5.5637, + "step": 22379 + }, + { + "epoch": 0.13310019982871824, + "grad_norm": 1.769579291343689, + "learning_rate": 4.78462674390691e-05, + "loss": 5.3217, + "step": 22380 + }, + { + "epoch": 0.13310614711199925, + "grad_norm": 1.576685905456543, + "learning_rate": 4.784607776954804e-05, + "loss": 5.5387, + "step": 22381 + }, + { + "epoch": 0.13311209439528024, + "grad_norm": 1.5737719535827637, + "learning_rate": 4.784588809205164e-05, + "loss": 5.269, + "step": 22382 + }, + { + "epoch": 0.13311804167856123, + "grad_norm": 1.6323963403701782, + "learning_rate": 4.784569840657998e-05, + "loss": 5.156, + "step": 22383 + }, + { + "epoch": 0.13312398896184224, + "grad_norm": 2.5943386554718018, + "learning_rate": 4.784550871313312e-05, + "loss": 5.0882, + "step": 22384 + }, + { + "epoch": 0.13312993624512323, + "grad_norm": 1.5392063856124878, + "learning_rate": 4.784531901171113e-05, + "loss": 5.0303, + "step": 22385 + }, + { + "epoch": 0.13313588352840422, + "grad_norm": 1.7257198095321655, + "learning_rate": 4.784512930231408e-05, + "loss": 5.3784, + "step": 22386 + }, + { + "epoch": 0.13314183081168524, + "grad_norm": 1.7736787796020508, + "learning_rate": 4.784493958494203e-05, + "loss": 5.256, + "step": 22387 + }, + { + "epoch": 0.13314777809496622, + "grad_norm": 1.575386643409729, + "learning_rate": 4.784474985959505e-05, + "loss": 5.1247, + "step": 22388 + }, + { + "epoch": 0.1331537253782472, + "grad_norm": 1.6164257526397705, + "learning_rate": 4.7844560126273195e-05, + "loss": 5.553, + "step": 22389 + }, + { + "epoch": 0.13315967266152823, + "grad_norm": 1.515674114227295, + "learning_rate": 4.7844370384976546e-05, + "loss": 5.556, + "step": 22390 + }, + { + "epoch": 0.13316561994480922, + "grad_norm": 1.5831459760665894, + "learning_rate": 4.784418063570516e-05, + "loss": 5.2649, + "step": 22391 + }, + { + "epoch": 0.1331715672280902, + "grad_norm": 1.5372157096862793, + "learning_rate": 4.7843990878459114e-05, + "loss": 5.1961, + "step": 22392 + }, + { + "epoch": 0.1331775145113712, + "grad_norm": 1.5881307125091553, + "learning_rate": 4.784380111323846e-05, + "loss": 5.5521, + "step": 22393 + }, + { + "epoch": 0.1331834617946522, + "grad_norm": 1.7717739343643188, + "learning_rate": 4.784361134004327e-05, + "loss": 5.4407, + "step": 22394 + }, + { + "epoch": 0.1331894090779332, + "grad_norm": 1.7472600936889648, + "learning_rate": 4.784342155887362e-05, + "loss": 5.1055, + "step": 22395 + }, + { + "epoch": 0.13319535636121418, + "grad_norm": 1.8296018838882446, + "learning_rate": 4.784323176972956e-05, + "loss": 4.596, + "step": 22396 + }, + { + "epoch": 0.1332013036444952, + "grad_norm": 1.6303856372833252, + "learning_rate": 4.784304197261117e-05, + "loss": 5.4028, + "step": 22397 + }, + { + "epoch": 0.1332072509277762, + "grad_norm": 1.4000413417816162, + "learning_rate": 4.78428521675185e-05, + "loss": 5.8166, + "step": 22398 + }, + { + "epoch": 0.13321319821105718, + "grad_norm": 1.4396088123321533, + "learning_rate": 4.7842662354451634e-05, + "loss": 5.4439, + "step": 22399 + }, + { + "epoch": 0.1332191454943382, + "grad_norm": 1.580919623374939, + "learning_rate": 4.7842472533410635e-05, + "loss": 5.3089, + "step": 22400 + }, + { + "epoch": 0.13322509277761918, + "grad_norm": 1.7976210117340088, + "learning_rate": 4.7842282704395545e-05, + "loss": 5.1538, + "step": 22401 + }, + { + "epoch": 0.13323104006090017, + "grad_norm": 1.7573418617248535, + "learning_rate": 4.784209286740647e-05, + "loss": 5.3701, + "step": 22402 + }, + { + "epoch": 0.13323698734418118, + "grad_norm": 1.6944206953048706, + "learning_rate": 4.784190302244345e-05, + "loss": 4.8349, + "step": 22403 + }, + { + "epoch": 0.13324293462746217, + "grad_norm": 1.9255948066711426, + "learning_rate": 4.7841713169506555e-05, + "loss": 5.2077, + "step": 22404 + }, + { + "epoch": 0.13324888191074316, + "grad_norm": 1.7583602666854858, + "learning_rate": 4.784152330859586e-05, + "loss": 4.9968, + "step": 22405 + }, + { + "epoch": 0.13325482919402418, + "grad_norm": 1.6917812824249268, + "learning_rate": 4.784133343971142e-05, + "loss": 5.3295, + "step": 22406 + }, + { + "epoch": 0.13326077647730517, + "grad_norm": 1.5531493425369263, + "learning_rate": 4.784114356285331e-05, + "loss": 5.2978, + "step": 22407 + }, + { + "epoch": 0.13326672376058615, + "grad_norm": 1.5347543954849243, + "learning_rate": 4.7840953678021586e-05, + "loss": 5.2922, + "step": 22408 + }, + { + "epoch": 0.13327267104386717, + "grad_norm": 1.3059866428375244, + "learning_rate": 4.7840763785216323e-05, + "loss": 5.2255, + "step": 22409 + }, + { + "epoch": 0.13327861832714816, + "grad_norm": 1.2207573652267456, + "learning_rate": 4.784057388443759e-05, + "loss": 4.9595, + "step": 22410 + }, + { + "epoch": 0.13328456561042915, + "grad_norm": 1.9115726947784424, + "learning_rate": 4.784038397568545e-05, + "loss": 5.0465, + "step": 22411 + }, + { + "epoch": 0.13329051289371016, + "grad_norm": 1.907443642616272, + "learning_rate": 4.7840194058959965e-05, + "loss": 4.5429, + "step": 22412 + }, + { + "epoch": 0.13329646017699115, + "grad_norm": 1.7891590595245361, + "learning_rate": 4.78400041342612e-05, + "loss": 4.5718, + "step": 22413 + }, + { + "epoch": 0.13330240746027214, + "grad_norm": 1.7904539108276367, + "learning_rate": 4.7839814201589234e-05, + "loss": 4.7077, + "step": 22414 + }, + { + "epoch": 0.13330835474355315, + "grad_norm": 1.8562805652618408, + "learning_rate": 4.783962426094411e-05, + "loss": 4.8559, + "step": 22415 + }, + { + "epoch": 0.13331430202683414, + "grad_norm": 1.7840648889541626, + "learning_rate": 4.7839434312325924e-05, + "loss": 4.5559, + "step": 22416 + }, + { + "epoch": 0.13332024931011513, + "grad_norm": 1.8956695795059204, + "learning_rate": 4.783924435573472e-05, + "loss": 4.6933, + "step": 22417 + }, + { + "epoch": 0.13332619659339615, + "grad_norm": 1.798685073852539, + "learning_rate": 4.783905439117058e-05, + "loss": 4.5131, + "step": 22418 + }, + { + "epoch": 0.13333214387667713, + "grad_norm": 1.8377288579940796, + "learning_rate": 4.7838864418633554e-05, + "loss": 4.4986, + "step": 22419 + }, + { + "epoch": 0.13333809115995812, + "grad_norm": 1.8382439613342285, + "learning_rate": 4.783867443812372e-05, + "loss": 5.1565, + "step": 22420 + }, + { + "epoch": 0.13334403844323914, + "grad_norm": 2.030796766281128, + "learning_rate": 4.783848444964114e-05, + "loss": 5.4532, + "step": 22421 + }, + { + "epoch": 0.13334998572652013, + "grad_norm": 2.020561695098877, + "learning_rate": 4.7838294453185886e-05, + "loss": 5.4529, + "step": 22422 + }, + { + "epoch": 0.13335593300980111, + "grad_norm": 1.8092904090881348, + "learning_rate": 4.783810444875801e-05, + "loss": 5.4092, + "step": 22423 + }, + { + "epoch": 0.13336188029308213, + "grad_norm": 1.7571618556976318, + "learning_rate": 4.78379144363576e-05, + "loss": 5.5134, + "step": 22424 + }, + { + "epoch": 0.13336782757636312, + "grad_norm": 1.8572049140930176, + "learning_rate": 4.7837724415984694e-05, + "loss": 5.1786, + "step": 22425 + }, + { + "epoch": 0.1333737748596441, + "grad_norm": 2.3944039344787598, + "learning_rate": 4.783753438763938e-05, + "loss": 4.7667, + "step": 22426 + }, + { + "epoch": 0.13337972214292512, + "grad_norm": 1.9377988576889038, + "learning_rate": 4.7837344351321725e-05, + "loss": 5.6523, + "step": 22427 + }, + { + "epoch": 0.1333856694262061, + "grad_norm": 1.7981183528900146, + "learning_rate": 4.783715430703178e-05, + "loss": 5.5374, + "step": 22428 + }, + { + "epoch": 0.1333916167094871, + "grad_norm": 1.6658248901367188, + "learning_rate": 4.783696425476963e-05, + "loss": 5.5128, + "step": 22429 + }, + { + "epoch": 0.13339756399276811, + "grad_norm": 1.6594502925872803, + "learning_rate": 4.783677419453533e-05, + "loss": 5.5225, + "step": 22430 + }, + { + "epoch": 0.1334035112760491, + "grad_norm": 1.6250741481781006, + "learning_rate": 4.7836584126328945e-05, + "loss": 5.4027, + "step": 22431 + }, + { + "epoch": 0.1334094585593301, + "grad_norm": 1.633254885673523, + "learning_rate": 4.783639405015054e-05, + "loss": 5.3856, + "step": 22432 + }, + { + "epoch": 0.1334154058426111, + "grad_norm": 1.5948752164840698, + "learning_rate": 4.783620396600019e-05, + "loss": 5.5501, + "step": 22433 + }, + { + "epoch": 0.1334213531258921, + "grad_norm": 2.007847547531128, + "learning_rate": 4.783601387387796e-05, + "loss": 4.878, + "step": 22434 + }, + { + "epoch": 0.13342730040917308, + "grad_norm": 2.4036359786987305, + "learning_rate": 4.783582377378391e-05, + "loss": 3.8348, + "step": 22435 + }, + { + "epoch": 0.1334332476924541, + "grad_norm": 2.7686264514923096, + "learning_rate": 4.783563366571811e-05, + "loss": 3.13, + "step": 22436 + }, + { + "epoch": 0.1334391949757351, + "grad_norm": 2.4651095867156982, + "learning_rate": 4.7835443549680625e-05, + "loss": 2.9104, + "step": 22437 + }, + { + "epoch": 0.13344514225901608, + "grad_norm": 2.57837176322937, + "learning_rate": 4.7835253425671526e-05, + "loss": 3.1145, + "step": 22438 + }, + { + "epoch": 0.1334510895422971, + "grad_norm": 2.804194688796997, + "learning_rate": 4.783506329369087e-05, + "loss": 3.7685, + "step": 22439 + }, + { + "epoch": 0.13345703682557808, + "grad_norm": 2.5836985111236572, + "learning_rate": 4.783487315373874e-05, + "loss": 3.383, + "step": 22440 + }, + { + "epoch": 0.13346298410885907, + "grad_norm": 2.5800416469573975, + "learning_rate": 4.7834683005815184e-05, + "loss": 3.345, + "step": 22441 + }, + { + "epoch": 0.13346893139214008, + "grad_norm": 2.695234775543213, + "learning_rate": 4.7834492849920275e-05, + "loss": 3.7905, + "step": 22442 + }, + { + "epoch": 0.13347487867542107, + "grad_norm": 2.075918436050415, + "learning_rate": 4.783430268605409e-05, + "loss": 4.3114, + "step": 22443 + }, + { + "epoch": 0.13348082595870206, + "grad_norm": 2.221691131591797, + "learning_rate": 4.7834112514216676e-05, + "loss": 5.5658, + "step": 22444 + }, + { + "epoch": 0.13348677324198308, + "grad_norm": 1.9432377815246582, + "learning_rate": 4.783392233440811e-05, + "loss": 5.2566, + "step": 22445 + }, + { + "epoch": 0.13349272052526406, + "grad_norm": 1.9735411405563354, + "learning_rate": 4.783373214662846e-05, + "loss": 4.2656, + "step": 22446 + }, + { + "epoch": 0.13349866780854505, + "grad_norm": 1.8616423606872559, + "learning_rate": 4.783354195087779e-05, + "loss": 4.2018, + "step": 22447 + }, + { + "epoch": 0.13350461509182607, + "grad_norm": 1.9751770496368408, + "learning_rate": 4.783335174715617e-05, + "loss": 4.1716, + "step": 22448 + }, + { + "epoch": 0.13351056237510706, + "grad_norm": 2.053149461746216, + "learning_rate": 4.7833161535463656e-05, + "loss": 4.0603, + "step": 22449 + }, + { + "epoch": 0.13351650965838804, + "grad_norm": 1.8129456043243408, + "learning_rate": 4.7832971315800325e-05, + "loss": 4.098, + "step": 22450 + }, + { + "epoch": 0.13352245694166903, + "grad_norm": 1.8842658996582031, + "learning_rate": 4.783278108816624e-05, + "loss": 4.1225, + "step": 22451 + }, + { + "epoch": 0.13352840422495005, + "grad_norm": 1.9037132263183594, + "learning_rate": 4.783259085256146e-05, + "loss": 4.0953, + "step": 22452 + }, + { + "epoch": 0.13353435150823104, + "grad_norm": 1.8058161735534668, + "learning_rate": 4.7832400608986074e-05, + "loss": 3.9189, + "step": 22453 + }, + { + "epoch": 0.13354029879151202, + "grad_norm": 1.899573564529419, + "learning_rate": 4.7832210357440124e-05, + "loss": 4.063, + "step": 22454 + }, + { + "epoch": 0.13354624607479304, + "grad_norm": 1.8507969379425049, + "learning_rate": 4.783202009792368e-05, + "loss": 4.1139, + "step": 22455 + }, + { + "epoch": 0.13355219335807403, + "grad_norm": 1.861315369606018, + "learning_rate": 4.783182983043681e-05, + "loss": 4.1063, + "step": 22456 + }, + { + "epoch": 0.13355814064135502, + "grad_norm": 1.9481399059295654, + "learning_rate": 4.7831639554979603e-05, + "loss": 4.1103, + "step": 22457 + }, + { + "epoch": 0.13356408792463603, + "grad_norm": 1.9315237998962402, + "learning_rate": 4.7831449271552086e-05, + "loss": 4.0723, + "step": 22458 + }, + { + "epoch": 0.13357003520791702, + "grad_norm": 1.951989769935608, + "learning_rate": 4.783125898015436e-05, + "loss": 4.3063, + "step": 22459 + }, + { + "epoch": 0.133575982491198, + "grad_norm": 1.8107032775878906, + "learning_rate": 4.783106868078647e-05, + "loss": 4.1869, + "step": 22460 + }, + { + "epoch": 0.13358192977447902, + "grad_norm": 1.8079946041107178, + "learning_rate": 4.7830878373448495e-05, + "loss": 4.2569, + "step": 22461 + }, + { + "epoch": 0.13358787705776, + "grad_norm": 1.9094295501708984, + "learning_rate": 4.7830688058140494e-05, + "loss": 4.8144, + "step": 22462 + }, + { + "epoch": 0.133593824341041, + "grad_norm": 1.9410862922668457, + "learning_rate": 4.7830497734862536e-05, + "loss": 4.6606, + "step": 22463 + }, + { + "epoch": 0.13359977162432202, + "grad_norm": 1.832387089729309, + "learning_rate": 4.783030740361469e-05, + "loss": 4.774, + "step": 22464 + }, + { + "epoch": 0.133605718907603, + "grad_norm": 1.8661162853240967, + "learning_rate": 4.783011706439701e-05, + "loss": 5.0414, + "step": 22465 + }, + { + "epoch": 0.133611666190884, + "grad_norm": 1.6019399166107178, + "learning_rate": 4.782992671720958e-05, + "loss": 5.1333, + "step": 22466 + }, + { + "epoch": 0.133617613474165, + "grad_norm": 1.539556860923767, + "learning_rate": 4.7829736362052455e-05, + "loss": 5.5576, + "step": 22467 + }, + { + "epoch": 0.133623560757446, + "grad_norm": 1.6988813877105713, + "learning_rate": 4.7829545998925704e-05, + "loss": 5.5953, + "step": 22468 + }, + { + "epoch": 0.13362950804072699, + "grad_norm": 1.77605140209198, + "learning_rate": 4.78293556278294e-05, + "loss": 5.1917, + "step": 22469 + }, + { + "epoch": 0.133635455324008, + "grad_norm": 1.958486557006836, + "learning_rate": 4.78291652487636e-05, + "loss": 5.141, + "step": 22470 + }, + { + "epoch": 0.133641402607289, + "grad_norm": 1.4875729084014893, + "learning_rate": 4.7828974861728374e-05, + "loss": 5.551, + "step": 22471 + }, + { + "epoch": 0.13364734989056998, + "grad_norm": 1.5118046998977661, + "learning_rate": 4.7828784466723795e-05, + "loss": 5.8965, + "step": 22472 + }, + { + "epoch": 0.133653297173851, + "grad_norm": 1.7107024192810059, + "learning_rate": 4.7828594063749924e-05, + "loss": 5.444, + "step": 22473 + }, + { + "epoch": 0.13365924445713198, + "grad_norm": 2.211569309234619, + "learning_rate": 4.7828403652806814e-05, + "loss": 4.6709, + "step": 22474 + }, + { + "epoch": 0.13366519174041297, + "grad_norm": 1.5755807161331177, + "learning_rate": 4.782821323389455e-05, + "loss": 5.481, + "step": 22475 + }, + { + "epoch": 0.13367113902369399, + "grad_norm": 1.5715577602386475, + "learning_rate": 4.782802280701319e-05, + "loss": 5.4475, + "step": 22476 + }, + { + "epoch": 0.13367708630697497, + "grad_norm": 1.483229160308838, + "learning_rate": 4.782783237216281e-05, + "loss": 5.287, + "step": 22477 + }, + { + "epoch": 0.13368303359025596, + "grad_norm": 1.6031765937805176, + "learning_rate": 4.782764192934347e-05, + "loss": 4.9328, + "step": 22478 + }, + { + "epoch": 0.13368898087353698, + "grad_norm": 1.5472909212112427, + "learning_rate": 4.782745147855523e-05, + "loss": 5.4962, + "step": 22479 + }, + { + "epoch": 0.13369492815681797, + "grad_norm": 1.5153834819793701, + "learning_rate": 4.7827261019798164e-05, + "loss": 5.2488, + "step": 22480 + }, + { + "epoch": 0.13370087544009895, + "grad_norm": 1.8485814332962036, + "learning_rate": 4.782707055307233e-05, + "loss": 4.6998, + "step": 22481 + }, + { + "epoch": 0.13370682272337997, + "grad_norm": 1.6526838541030884, + "learning_rate": 4.782688007837781e-05, + "loss": 4.7843, + "step": 22482 + }, + { + "epoch": 0.13371277000666096, + "grad_norm": 1.6769697666168213, + "learning_rate": 4.782668959571467e-05, + "loss": 4.8344, + "step": 22483 + }, + { + "epoch": 0.13371871728994195, + "grad_norm": 1.6509302854537964, + "learning_rate": 4.782649910508296e-05, + "loss": 5.0646, + "step": 22484 + }, + { + "epoch": 0.13372466457322296, + "grad_norm": 1.58712637424469, + "learning_rate": 4.782630860648275e-05, + "loss": 4.841, + "step": 22485 + }, + { + "epoch": 0.13373061185650395, + "grad_norm": 1.7171813249588013, + "learning_rate": 4.782611809991412e-05, + "loss": 5.5934, + "step": 22486 + }, + { + "epoch": 0.13373655913978494, + "grad_norm": 1.598689079284668, + "learning_rate": 4.782592758537712e-05, + "loss": 5.5131, + "step": 22487 + }, + { + "epoch": 0.13374250642306595, + "grad_norm": 1.652279019355774, + "learning_rate": 4.782573706287183e-05, + "loss": 4.9244, + "step": 22488 + }, + { + "epoch": 0.13374845370634694, + "grad_norm": 1.733337163925171, + "learning_rate": 4.782554653239831e-05, + "loss": 5.1153, + "step": 22489 + }, + { + "epoch": 0.13375440098962793, + "grad_norm": 1.3961280584335327, + "learning_rate": 4.782535599395662e-05, + "loss": 5.1146, + "step": 22490 + }, + { + "epoch": 0.13376034827290895, + "grad_norm": 1.371650218963623, + "learning_rate": 4.782516544754685e-05, + "loss": 4.9608, + "step": 22491 + }, + { + "epoch": 0.13376629555618993, + "grad_norm": 1.738678216934204, + "learning_rate": 4.782497489316904e-05, + "loss": 5.384, + "step": 22492 + }, + { + "epoch": 0.13377224283947092, + "grad_norm": 1.899530291557312, + "learning_rate": 4.7824784330823266e-05, + "loss": 5.479, + "step": 22493 + }, + { + "epoch": 0.13377819012275194, + "grad_norm": 1.6108837127685547, + "learning_rate": 4.782459376050959e-05, + "loss": 5.4919, + "step": 22494 + }, + { + "epoch": 0.13378413740603293, + "grad_norm": 1.688045859336853, + "learning_rate": 4.78244031822281e-05, + "loss": 5.5093, + "step": 22495 + }, + { + "epoch": 0.13379008468931392, + "grad_norm": 1.526538610458374, + "learning_rate": 4.782421259597884e-05, + "loss": 5.4022, + "step": 22496 + }, + { + "epoch": 0.13379603197259493, + "grad_norm": 1.5651198625564575, + "learning_rate": 4.7824022001761884e-05, + "loss": 5.3737, + "step": 22497 + }, + { + "epoch": 0.13380197925587592, + "grad_norm": 1.6090896129608154, + "learning_rate": 4.7823831399577296e-05, + "loss": 5.3482, + "step": 22498 + }, + { + "epoch": 0.1338079265391569, + "grad_norm": 1.5139176845550537, + "learning_rate": 4.782364078942514e-05, + "loss": 5.2195, + "step": 22499 + }, + { + "epoch": 0.13381387382243792, + "grad_norm": 1.468328833580017, + "learning_rate": 4.782345017130549e-05, + "loss": 5.4421, + "step": 22500 + }, + { + "epoch": 0.1338198211057189, + "grad_norm": 1.4803540706634521, + "learning_rate": 4.782325954521841e-05, + "loss": 5.8645, + "step": 22501 + }, + { + "epoch": 0.1338257683889999, + "grad_norm": 1.5472211837768555, + "learning_rate": 4.782306891116397e-05, + "loss": 5.5739, + "step": 22502 + }, + { + "epoch": 0.13383171567228092, + "grad_norm": 1.5523242950439453, + "learning_rate": 4.782287826914223e-05, + "loss": 5.4971, + "step": 22503 + }, + { + "epoch": 0.1338376629555619, + "grad_norm": 1.6459407806396484, + "learning_rate": 4.7822687619153264e-05, + "loss": 5.5006, + "step": 22504 + }, + { + "epoch": 0.1338436102388429, + "grad_norm": 1.9664801359176636, + "learning_rate": 4.782249696119712e-05, + "loss": 4.908, + "step": 22505 + }, + { + "epoch": 0.1338495575221239, + "grad_norm": 1.757797360420227, + "learning_rate": 4.782230629527389e-05, + "loss": 5.3259, + "step": 22506 + }, + { + "epoch": 0.1338555048054049, + "grad_norm": 1.734212040901184, + "learning_rate": 4.7822115621383626e-05, + "loss": 4.9526, + "step": 22507 + }, + { + "epoch": 0.13386145208868588, + "grad_norm": 1.7347631454467773, + "learning_rate": 4.7821924939526386e-05, + "loss": 4.9416, + "step": 22508 + }, + { + "epoch": 0.13386739937196687, + "grad_norm": 1.6283304691314697, + "learning_rate": 4.782173424970226e-05, + "loss": 5.1706, + "step": 22509 + }, + { + "epoch": 0.1338733466552479, + "grad_norm": 1.6665587425231934, + "learning_rate": 4.7821543551911294e-05, + "loss": 5.6977, + "step": 22510 + }, + { + "epoch": 0.13387929393852888, + "grad_norm": 1.5051319599151611, + "learning_rate": 4.7821352846153576e-05, + "loss": 5.7575, + "step": 22511 + }, + { + "epoch": 0.13388524122180986, + "grad_norm": 1.966944932937622, + "learning_rate": 4.7821162132429154e-05, + "loss": 4.8996, + "step": 22512 + }, + { + "epoch": 0.13389118850509088, + "grad_norm": 2.669949769973755, + "learning_rate": 4.782097141073809e-05, + "loss": 3.7917, + "step": 22513 + }, + { + "epoch": 0.13389713578837187, + "grad_norm": 2.743389844894409, + "learning_rate": 4.782078068108048e-05, + "loss": 3.658, + "step": 22514 + }, + { + "epoch": 0.13390308307165286, + "grad_norm": 2.8011279106140137, + "learning_rate": 4.782058994345635e-05, + "loss": 3.4269, + "step": 22515 + }, + { + "epoch": 0.13390903035493387, + "grad_norm": 2.332318067550659, + "learning_rate": 4.78203991978658e-05, + "loss": 3.7318, + "step": 22516 + }, + { + "epoch": 0.13391497763821486, + "grad_norm": 2.1522371768951416, + "learning_rate": 4.782020844430888e-05, + "loss": 3.912, + "step": 22517 + }, + { + "epoch": 0.13392092492149585, + "grad_norm": 1.7325389385223389, + "learning_rate": 4.782001768278567e-05, + "loss": 5.2602, + "step": 22518 + }, + { + "epoch": 0.13392687220477686, + "grad_norm": 1.872207522392273, + "learning_rate": 4.7819826913296216e-05, + "loss": 5.3663, + "step": 22519 + }, + { + "epoch": 0.13393281948805785, + "grad_norm": 1.86244535446167, + "learning_rate": 4.78196361358406e-05, + "loss": 5.382, + "step": 22520 + }, + { + "epoch": 0.13393876677133884, + "grad_norm": 1.6984341144561768, + "learning_rate": 4.781944535041889e-05, + "loss": 5.2243, + "step": 22521 + }, + { + "epoch": 0.13394471405461986, + "grad_norm": 1.7697153091430664, + "learning_rate": 4.781925455703114e-05, + "loss": 5.2368, + "step": 22522 + }, + { + "epoch": 0.13395066133790084, + "grad_norm": 2.323636293411255, + "learning_rate": 4.781906375567743e-05, + "loss": 4.7709, + "step": 22523 + }, + { + "epoch": 0.13395660862118183, + "grad_norm": 2.2196481227874756, + "learning_rate": 4.781887294635782e-05, + "loss": 4.8089, + "step": 22524 + }, + { + "epoch": 0.13396255590446285, + "grad_norm": 1.8148611783981323, + "learning_rate": 4.7818682129072365e-05, + "loss": 4.673, + "step": 22525 + }, + { + "epoch": 0.13396850318774384, + "grad_norm": 1.9306626319885254, + "learning_rate": 4.7818491303821155e-05, + "loss": 5.3217, + "step": 22526 + }, + { + "epoch": 0.13397445047102483, + "grad_norm": 1.9646215438842773, + "learning_rate": 4.781830047060425e-05, + "loss": 4.9239, + "step": 22527 + }, + { + "epoch": 0.13398039775430584, + "grad_norm": 1.7711313962936401, + "learning_rate": 4.7818109629421706e-05, + "loss": 5.1977, + "step": 22528 + }, + { + "epoch": 0.13398634503758683, + "grad_norm": 1.5714713335037231, + "learning_rate": 4.781791878027359e-05, + "loss": 5.1759, + "step": 22529 + }, + { + "epoch": 0.13399229232086782, + "grad_norm": 1.573440670967102, + "learning_rate": 4.781772792315998e-05, + "loss": 5.2892, + "step": 22530 + }, + { + "epoch": 0.13399823960414883, + "grad_norm": 1.484643816947937, + "learning_rate": 4.781753705808094e-05, + "loss": 5.2751, + "step": 22531 + }, + { + "epoch": 0.13400418688742982, + "grad_norm": 1.484236240386963, + "learning_rate": 4.781734618503653e-05, + "loss": 5.1928, + "step": 22532 + }, + { + "epoch": 0.1340101341707108, + "grad_norm": 1.6469415426254272, + "learning_rate": 4.781715530402682e-05, + "loss": 4.9161, + "step": 22533 + }, + { + "epoch": 0.13401608145399183, + "grad_norm": 1.736928939819336, + "learning_rate": 4.781696441505188e-05, + "loss": 5.2132, + "step": 22534 + }, + { + "epoch": 0.1340220287372728, + "grad_norm": 1.6927560567855835, + "learning_rate": 4.781677351811177e-05, + "loss": 5.1001, + "step": 22535 + }, + { + "epoch": 0.1340279760205538, + "grad_norm": 1.4961135387420654, + "learning_rate": 4.7816582613206564e-05, + "loss": 4.8025, + "step": 22536 + }, + { + "epoch": 0.13403392330383482, + "grad_norm": 1.6069209575653076, + "learning_rate": 4.7816391700336315e-05, + "loss": 5.1449, + "step": 22537 + }, + { + "epoch": 0.1340398705871158, + "grad_norm": 1.9168766736984253, + "learning_rate": 4.781620077950111e-05, + "loss": 5.1479, + "step": 22538 + }, + { + "epoch": 0.1340458178703968, + "grad_norm": 1.545693278312683, + "learning_rate": 4.7816009850701e-05, + "loss": 5.1445, + "step": 22539 + }, + { + "epoch": 0.1340517651536778, + "grad_norm": 2.524106740951538, + "learning_rate": 4.781581891393606e-05, + "loss": 4.3988, + "step": 22540 + }, + { + "epoch": 0.1340577124369588, + "grad_norm": 3.073733329772949, + "learning_rate": 4.781562796920635e-05, + "loss": 4.8931, + "step": 22541 + }, + { + "epoch": 0.1340636597202398, + "grad_norm": 2.1566405296325684, + "learning_rate": 4.7815437016511936e-05, + "loss": 4.9778, + "step": 22542 + }, + { + "epoch": 0.1340696070035208, + "grad_norm": 1.6103532314300537, + "learning_rate": 4.78152460558529e-05, + "loss": 5.0521, + "step": 22543 + }, + { + "epoch": 0.1340755542868018, + "grad_norm": 2.068673849105835, + "learning_rate": 4.781505508722929e-05, + "loss": 4.4481, + "step": 22544 + }, + { + "epoch": 0.13408150157008278, + "grad_norm": 2.2658448219299316, + "learning_rate": 4.7814864110641175e-05, + "loss": 4.5904, + "step": 22545 + }, + { + "epoch": 0.1340874488533638, + "grad_norm": 1.6960278749465942, + "learning_rate": 4.781467312608864e-05, + "loss": 5.4661, + "step": 22546 + }, + { + "epoch": 0.13409339613664478, + "grad_norm": 1.7006616592407227, + "learning_rate": 4.781448213357173e-05, + "loss": 5.338, + "step": 22547 + }, + { + "epoch": 0.13409934341992577, + "grad_norm": 1.6810702085494995, + "learning_rate": 4.7814291133090515e-05, + "loss": 5.6328, + "step": 22548 + }, + { + "epoch": 0.1341052907032068, + "grad_norm": 1.788943886756897, + "learning_rate": 4.781410012464508e-05, + "loss": 4.7265, + "step": 22549 + }, + { + "epoch": 0.13411123798648777, + "grad_norm": 1.8539581298828125, + "learning_rate": 4.781390910823547e-05, + "loss": 5.0821, + "step": 22550 + }, + { + "epoch": 0.13411718526976876, + "grad_norm": 1.548677682876587, + "learning_rate": 4.781371808386176e-05, + "loss": 5.4704, + "step": 22551 + }, + { + "epoch": 0.13412313255304978, + "grad_norm": 1.4806692600250244, + "learning_rate": 4.781352705152402e-05, + "loss": 5.5085, + "step": 22552 + }, + { + "epoch": 0.13412907983633077, + "grad_norm": 1.5281784534454346, + "learning_rate": 4.781333601122231e-05, + "loss": 5.0698, + "step": 22553 + }, + { + "epoch": 0.13413502711961175, + "grad_norm": 1.681803822517395, + "learning_rate": 4.78131449629567e-05, + "loss": 4.6259, + "step": 22554 + }, + { + "epoch": 0.13414097440289277, + "grad_norm": 1.9039119482040405, + "learning_rate": 4.781295390672726e-05, + "loss": 4.967, + "step": 22555 + }, + { + "epoch": 0.13414692168617376, + "grad_norm": 1.3885890245437622, + "learning_rate": 4.781276284253405e-05, + "loss": 4.5992, + "step": 22556 + }, + { + "epoch": 0.13415286896945475, + "grad_norm": 1.5828464031219482, + "learning_rate": 4.781257177037714e-05, + "loss": 4.6859, + "step": 22557 + }, + { + "epoch": 0.13415881625273576, + "grad_norm": 1.6242060661315918, + "learning_rate": 4.78123806902566e-05, + "loss": 4.7105, + "step": 22558 + }, + { + "epoch": 0.13416476353601675, + "grad_norm": 1.6682454347610474, + "learning_rate": 4.781218960217249e-05, + "loss": 4.8545, + "step": 22559 + }, + { + "epoch": 0.13417071081929774, + "grad_norm": 1.8982216119766235, + "learning_rate": 4.781199850612489e-05, + "loss": 5.4946, + "step": 22560 + }, + { + "epoch": 0.13417665810257876, + "grad_norm": 1.916904330253601, + "learning_rate": 4.781180740211384e-05, + "loss": 5.7877, + "step": 22561 + }, + { + "epoch": 0.13418260538585974, + "grad_norm": 2.1762099266052246, + "learning_rate": 4.781161629013944e-05, + "loss": 5.7918, + "step": 22562 + }, + { + "epoch": 0.13418855266914073, + "grad_norm": 1.7190003395080566, + "learning_rate": 4.7811425170201726e-05, + "loss": 5.5881, + "step": 22563 + }, + { + "epoch": 0.13419449995242175, + "grad_norm": 1.5587143898010254, + "learning_rate": 4.781123404230079e-05, + "loss": 5.5391, + "step": 22564 + }, + { + "epoch": 0.13420044723570274, + "grad_norm": 1.8347082138061523, + "learning_rate": 4.7811042906436684e-05, + "loss": 5.7366, + "step": 22565 + }, + { + "epoch": 0.13420639451898372, + "grad_norm": 1.5644575357437134, + "learning_rate": 4.7810851762609484e-05, + "loss": 5.6529, + "step": 22566 + }, + { + "epoch": 0.1342123418022647, + "grad_norm": 1.6571894884109497, + "learning_rate": 4.7810660610819246e-05, + "loss": 5.1555, + "step": 22567 + }, + { + "epoch": 0.13421828908554573, + "grad_norm": 1.8291380405426025, + "learning_rate": 4.7810469451066045e-05, + "loss": 5.878, + "step": 22568 + }, + { + "epoch": 0.13422423636882672, + "grad_norm": 1.8254185914993286, + "learning_rate": 4.781027828334994e-05, + "loss": 5.0244, + "step": 22569 + }, + { + "epoch": 0.1342301836521077, + "grad_norm": 1.5728260278701782, + "learning_rate": 4.7810087107671e-05, + "loss": 5.8684, + "step": 22570 + }, + { + "epoch": 0.13423613093538872, + "grad_norm": 1.4518792629241943, + "learning_rate": 4.7809895924029303e-05, + "loss": 6.0868, + "step": 22571 + }, + { + "epoch": 0.1342420782186697, + "grad_norm": 1.5205591917037964, + "learning_rate": 4.7809704732424905e-05, + "loss": 5.3721, + "step": 22572 + }, + { + "epoch": 0.1342480255019507, + "grad_norm": 1.7081562280654907, + "learning_rate": 4.7809513532857876e-05, + "loss": 4.9758, + "step": 22573 + }, + { + "epoch": 0.1342539727852317, + "grad_norm": 1.4048930406570435, + "learning_rate": 4.7809322325328275e-05, + "loss": 5.4701, + "step": 22574 + }, + { + "epoch": 0.1342599200685127, + "grad_norm": 1.5663319826126099, + "learning_rate": 4.780913110983618e-05, + "loss": 5.1094, + "step": 22575 + }, + { + "epoch": 0.1342658673517937, + "grad_norm": 1.6008634567260742, + "learning_rate": 4.780893988638165e-05, + "loss": 5.2138, + "step": 22576 + }, + { + "epoch": 0.1342718146350747, + "grad_norm": 1.5711628198623657, + "learning_rate": 4.780874865496475e-05, + "loss": 5.7172, + "step": 22577 + }, + { + "epoch": 0.1342777619183557, + "grad_norm": 1.799984335899353, + "learning_rate": 4.7808557415585566e-05, + "loss": 4.8959, + "step": 22578 + }, + { + "epoch": 0.13428370920163668, + "grad_norm": 1.7693933248519897, + "learning_rate": 4.7808366168244137e-05, + "loss": 5.376, + "step": 22579 + }, + { + "epoch": 0.1342896564849177, + "grad_norm": 2.1041815280914307, + "learning_rate": 4.780817491294055e-05, + "loss": 5.4672, + "step": 22580 + }, + { + "epoch": 0.13429560376819868, + "grad_norm": 1.8219122886657715, + "learning_rate": 4.780798364967486e-05, + "loss": 5.6201, + "step": 22581 + }, + { + "epoch": 0.13430155105147967, + "grad_norm": 1.5907140970230103, + "learning_rate": 4.780779237844715e-05, + "loss": 5.2499, + "step": 22582 + }, + { + "epoch": 0.1343074983347607, + "grad_norm": 1.388074278831482, + "learning_rate": 4.780760109925746e-05, + "loss": 5.535, + "step": 22583 + }, + { + "epoch": 0.13431344561804168, + "grad_norm": 1.4996978044509888, + "learning_rate": 4.780740981210588e-05, + "loss": 5.2713, + "step": 22584 + }, + { + "epoch": 0.13431939290132267, + "grad_norm": 1.591178059577942, + "learning_rate": 4.780721851699247e-05, + "loss": 5.2211, + "step": 22585 + }, + { + "epoch": 0.13432534018460368, + "grad_norm": 1.5548349618911743, + "learning_rate": 4.780702721391729e-05, + "loss": 5.2867, + "step": 22586 + }, + { + "epoch": 0.13433128746788467, + "grad_norm": 1.5549981594085693, + "learning_rate": 4.780683590288042e-05, + "loss": 5.3627, + "step": 22587 + }, + { + "epoch": 0.13433723475116566, + "grad_norm": 1.4587602615356445, + "learning_rate": 4.780664458388191e-05, + "loss": 5.2031, + "step": 22588 + }, + { + "epoch": 0.13434318203444667, + "grad_norm": 1.836823582649231, + "learning_rate": 4.7806453256921846e-05, + "loss": 4.9802, + "step": 22589 + }, + { + "epoch": 0.13434912931772766, + "grad_norm": 1.5445985794067383, + "learning_rate": 4.780626192200027e-05, + "loss": 4.8789, + "step": 22590 + }, + { + "epoch": 0.13435507660100865, + "grad_norm": 1.5032085180282593, + "learning_rate": 4.780607057911728e-05, + "loss": 4.936, + "step": 22591 + }, + { + "epoch": 0.13436102388428967, + "grad_norm": 1.5628653764724731, + "learning_rate": 4.780587922827292e-05, + "loss": 4.9026, + "step": 22592 + }, + { + "epoch": 0.13436697116757065, + "grad_norm": 2.011505126953125, + "learning_rate": 4.7805687869467265e-05, + "loss": 4.5883, + "step": 22593 + }, + { + "epoch": 0.13437291845085164, + "grad_norm": 1.824877142906189, + "learning_rate": 4.780549650270038e-05, + "loss": 4.7637, + "step": 22594 + }, + { + "epoch": 0.13437886573413266, + "grad_norm": 1.3882604837417603, + "learning_rate": 4.780530512797232e-05, + "loss": 5.1455, + "step": 22595 + }, + { + "epoch": 0.13438481301741365, + "grad_norm": 1.6364738941192627, + "learning_rate": 4.780511374528318e-05, + "loss": 4.7607, + "step": 22596 + }, + { + "epoch": 0.13439076030069463, + "grad_norm": 1.6384764909744263, + "learning_rate": 4.7804922354633004e-05, + "loss": 4.8959, + "step": 22597 + }, + { + "epoch": 0.13439670758397565, + "grad_norm": 1.53514564037323, + "learning_rate": 4.780473095602186e-05, + "loss": 4.9072, + "step": 22598 + }, + { + "epoch": 0.13440265486725664, + "grad_norm": 1.5599232912063599, + "learning_rate": 4.780453954944983e-05, + "loss": 5.0727, + "step": 22599 + }, + { + "epoch": 0.13440860215053763, + "grad_norm": 1.6296029090881348, + "learning_rate": 4.780434813491696e-05, + "loss": 5.1448, + "step": 22600 + }, + { + "epoch": 0.13441454943381864, + "grad_norm": 1.8083057403564453, + "learning_rate": 4.780415671242334e-05, + "loss": 5.0841, + "step": 22601 + }, + { + "epoch": 0.13442049671709963, + "grad_norm": 1.668716311454773, + "learning_rate": 4.780396528196902e-05, + "loss": 5.0684, + "step": 22602 + }, + { + "epoch": 0.13442644400038062, + "grad_norm": 1.5879114866256714, + "learning_rate": 4.7803773843554065e-05, + "loss": 5.3685, + "step": 22603 + }, + { + "epoch": 0.13443239128366163, + "grad_norm": 1.6570247411727905, + "learning_rate": 4.780358239717855e-05, + "loss": 5.2864, + "step": 22604 + }, + { + "epoch": 0.13443833856694262, + "grad_norm": 1.5763763189315796, + "learning_rate": 4.780339094284254e-05, + "loss": 5.1896, + "step": 22605 + }, + { + "epoch": 0.1344442858502236, + "grad_norm": 1.6956191062927246, + "learning_rate": 4.7803199480546105e-05, + "loss": 5.1213, + "step": 22606 + }, + { + "epoch": 0.13445023313350463, + "grad_norm": 1.64959716796875, + "learning_rate": 4.780300801028931e-05, + "loss": 4.8764, + "step": 22607 + }, + { + "epoch": 0.13445618041678561, + "grad_norm": 1.7988736629486084, + "learning_rate": 4.7802816532072216e-05, + "loss": 4.7578, + "step": 22608 + }, + { + "epoch": 0.1344621277000666, + "grad_norm": 1.6349395513534546, + "learning_rate": 4.78026250458949e-05, + "loss": 5.5973, + "step": 22609 + }, + { + "epoch": 0.13446807498334762, + "grad_norm": 1.7561520338058472, + "learning_rate": 4.7802433551757416e-05, + "loss": 4.5933, + "step": 22610 + }, + { + "epoch": 0.1344740222666286, + "grad_norm": 1.7918694019317627, + "learning_rate": 4.780224204965984e-05, + "loss": 4.6726, + "step": 22611 + }, + { + "epoch": 0.1344799695499096, + "grad_norm": 1.6543810367584229, + "learning_rate": 4.780205053960224e-05, + "loss": 5.0966, + "step": 22612 + }, + { + "epoch": 0.1344859168331906, + "grad_norm": 1.4896337985992432, + "learning_rate": 4.7801859021584685e-05, + "loss": 4.9243, + "step": 22613 + }, + { + "epoch": 0.1344918641164716, + "grad_norm": 1.6509222984313965, + "learning_rate": 4.780166749560723e-05, + "loss": 5.0023, + "step": 22614 + }, + { + "epoch": 0.1344978113997526, + "grad_norm": 1.7909302711486816, + "learning_rate": 4.7801475961669944e-05, + "loss": 4.8274, + "step": 22615 + }, + { + "epoch": 0.1345037586830336, + "grad_norm": 1.7640331983566284, + "learning_rate": 4.780128441977291e-05, + "loss": 4.7262, + "step": 22616 + }, + { + "epoch": 0.1345097059663146, + "grad_norm": 1.6381694078445435, + "learning_rate": 4.780109286991617e-05, + "loss": 4.9027, + "step": 22617 + }, + { + "epoch": 0.13451565324959558, + "grad_norm": 1.830243468284607, + "learning_rate": 4.780090131209981e-05, + "loss": 4.837, + "step": 22618 + }, + { + "epoch": 0.1345216005328766, + "grad_norm": 1.6413569450378418, + "learning_rate": 4.780070974632389e-05, + "loss": 4.6675, + "step": 22619 + }, + { + "epoch": 0.13452754781615758, + "grad_norm": 1.7041996717453003, + "learning_rate": 4.780051817258848e-05, + "loss": 4.4556, + "step": 22620 + }, + { + "epoch": 0.13453349509943857, + "grad_norm": 1.6706191301345825, + "learning_rate": 4.780032659089364e-05, + "loss": 5.29, + "step": 22621 + }, + { + "epoch": 0.1345394423827196, + "grad_norm": 1.6883933544158936, + "learning_rate": 4.780013500123945e-05, + "loss": 5.2777, + "step": 22622 + }, + { + "epoch": 0.13454538966600058, + "grad_norm": 1.6006532907485962, + "learning_rate": 4.779994340362596e-05, + "loss": 5.1652, + "step": 22623 + }, + { + "epoch": 0.13455133694928156, + "grad_norm": 1.5645374059677124, + "learning_rate": 4.779975179805325e-05, + "loss": 5.0746, + "step": 22624 + }, + { + "epoch": 0.13455728423256255, + "grad_norm": 1.4294723272323608, + "learning_rate": 4.7799560184521384e-05, + "loss": 5.1747, + "step": 22625 + }, + { + "epoch": 0.13456323151584357, + "grad_norm": 1.5289671421051025, + "learning_rate": 4.7799368563030424e-05, + "loss": 5.0096, + "step": 22626 + }, + { + "epoch": 0.13456917879912456, + "grad_norm": 1.4476962089538574, + "learning_rate": 4.779917693358044e-05, + "loss": 5.1043, + "step": 22627 + }, + { + "epoch": 0.13457512608240554, + "grad_norm": 1.647494912147522, + "learning_rate": 4.7798985296171494e-05, + "loss": 5.2014, + "step": 22628 + }, + { + "epoch": 0.13458107336568656, + "grad_norm": 1.6972601413726807, + "learning_rate": 4.7798793650803665e-05, + "loss": 5.1526, + "step": 22629 + }, + { + "epoch": 0.13458702064896755, + "grad_norm": 1.7442299127578735, + "learning_rate": 4.779860199747701e-05, + "loss": 5.3699, + "step": 22630 + }, + { + "epoch": 0.13459296793224854, + "grad_norm": 1.5356593132019043, + "learning_rate": 4.77984103361916e-05, + "loss": 5.164, + "step": 22631 + }, + { + "epoch": 0.13459891521552955, + "grad_norm": 1.4700989723205566, + "learning_rate": 4.77982186669475e-05, + "loss": 4.7305, + "step": 22632 + }, + { + "epoch": 0.13460486249881054, + "grad_norm": 1.4296282529830933, + "learning_rate": 4.779802698974477e-05, + "loss": 4.7196, + "step": 22633 + }, + { + "epoch": 0.13461080978209153, + "grad_norm": 1.4722986221313477, + "learning_rate": 4.7797835304583494e-05, + "loss": 4.763, + "step": 22634 + }, + { + "epoch": 0.13461675706537254, + "grad_norm": 1.4767835140228271, + "learning_rate": 4.779764361146373e-05, + "loss": 4.6168, + "step": 22635 + }, + { + "epoch": 0.13462270434865353, + "grad_norm": 1.5353070497512817, + "learning_rate": 4.779745191038554e-05, + "loss": 4.8458, + "step": 22636 + }, + { + "epoch": 0.13462865163193452, + "grad_norm": 1.6942658424377441, + "learning_rate": 4.779726020134899e-05, + "loss": 4.8253, + "step": 22637 + }, + { + "epoch": 0.13463459891521554, + "grad_norm": 1.3153749704360962, + "learning_rate": 4.779706848435416e-05, + "loss": 4.6095, + "step": 22638 + }, + { + "epoch": 0.13464054619849652, + "grad_norm": 1.5381252765655518, + "learning_rate": 4.779687675940111e-05, + "loss": 4.202, + "step": 22639 + }, + { + "epoch": 0.1346464934817775, + "grad_norm": 1.5490522384643555, + "learning_rate": 4.779668502648989e-05, + "loss": 4.9204, + "step": 22640 + }, + { + "epoch": 0.13465244076505853, + "grad_norm": 1.518019676208496, + "learning_rate": 4.7796493285620604e-05, + "loss": 5.3894, + "step": 22641 + }, + { + "epoch": 0.13465838804833952, + "grad_norm": 1.635918378829956, + "learning_rate": 4.7796301536793284e-05, + "loss": 4.3345, + "step": 22642 + }, + { + "epoch": 0.1346643353316205, + "grad_norm": 1.7409108877182007, + "learning_rate": 4.779610978000802e-05, + "loss": 4.2783, + "step": 22643 + }, + { + "epoch": 0.13467028261490152, + "grad_norm": 1.7899144887924194, + "learning_rate": 4.7795918015264865e-05, + "loss": 4.8578, + "step": 22644 + }, + { + "epoch": 0.1346762298981825, + "grad_norm": 1.6725822687149048, + "learning_rate": 4.779572624256389e-05, + "loss": 4.7902, + "step": 22645 + }, + { + "epoch": 0.1346821771814635, + "grad_norm": 1.8630287647247314, + "learning_rate": 4.7795534461905165e-05, + "loss": 4.5775, + "step": 22646 + }, + { + "epoch": 0.1346881244647445, + "grad_norm": 1.6607400178909302, + "learning_rate": 4.779534267328875e-05, + "loss": 4.7948, + "step": 22647 + }, + { + "epoch": 0.1346940717480255, + "grad_norm": 1.5015220642089844, + "learning_rate": 4.7795150876714726e-05, + "loss": 4.3331, + "step": 22648 + }, + { + "epoch": 0.1347000190313065, + "grad_norm": 1.5176305770874023, + "learning_rate": 4.779495907218314e-05, + "loss": 4.7168, + "step": 22649 + }, + { + "epoch": 0.1347059663145875, + "grad_norm": 1.8669017553329468, + "learning_rate": 4.7794767259694076e-05, + "loss": 4.6268, + "step": 22650 + }, + { + "epoch": 0.1347119135978685, + "grad_norm": 1.795281171798706, + "learning_rate": 4.7794575439247586e-05, + "loss": 4.6233, + "step": 22651 + }, + { + "epoch": 0.13471786088114948, + "grad_norm": 1.9019118547439575, + "learning_rate": 4.779438361084375e-05, + "loss": 4.9087, + "step": 22652 + }, + { + "epoch": 0.1347238081644305, + "grad_norm": 1.8863301277160645, + "learning_rate": 4.779419177448263e-05, + "loss": 4.6571, + "step": 22653 + }, + { + "epoch": 0.13472975544771149, + "grad_norm": 1.7758681774139404, + "learning_rate": 4.779399993016429e-05, + "loss": 4.7445, + "step": 22654 + }, + { + "epoch": 0.13473570273099247, + "grad_norm": 1.8668162822723389, + "learning_rate": 4.7793808077888804e-05, + "loss": 4.8334, + "step": 22655 + }, + { + "epoch": 0.1347416500142735, + "grad_norm": 1.8495571613311768, + "learning_rate": 4.7793616217656235e-05, + "loss": 4.7865, + "step": 22656 + }, + { + "epoch": 0.13474759729755448, + "grad_norm": 2.0655038356781006, + "learning_rate": 4.779342434946665e-05, + "loss": 4.6479, + "step": 22657 + }, + { + "epoch": 0.13475354458083547, + "grad_norm": 1.8008273839950562, + "learning_rate": 4.7793232473320116e-05, + "loss": 4.8482, + "step": 22658 + }, + { + "epoch": 0.13475949186411648, + "grad_norm": 1.8431730270385742, + "learning_rate": 4.7793040589216695e-05, + "loss": 4.5315, + "step": 22659 + }, + { + "epoch": 0.13476543914739747, + "grad_norm": 1.7335654497146606, + "learning_rate": 4.779284869715647e-05, + "loss": 5.2788, + "step": 22660 + }, + { + "epoch": 0.13477138643067846, + "grad_norm": 1.6339887380599976, + "learning_rate": 4.779265679713949e-05, + "loss": 4.9113, + "step": 22661 + }, + { + "epoch": 0.13477733371395947, + "grad_norm": 1.746029019355774, + "learning_rate": 4.7792464889165825e-05, + "loss": 5.3739, + "step": 22662 + }, + { + "epoch": 0.13478328099724046, + "grad_norm": 1.6831165552139282, + "learning_rate": 4.7792272973235554e-05, + "loss": 5.2394, + "step": 22663 + }, + { + "epoch": 0.13478922828052145, + "grad_norm": 1.629170298576355, + "learning_rate": 4.7792081049348737e-05, + "loss": 5.0894, + "step": 22664 + }, + { + "epoch": 0.13479517556380247, + "grad_norm": 1.71427321434021, + "learning_rate": 4.779188911750543e-05, + "loss": 4.9391, + "step": 22665 + }, + { + "epoch": 0.13480112284708345, + "grad_norm": 1.6911921501159668, + "learning_rate": 4.779169717770572e-05, + "loss": 4.965, + "step": 22666 + }, + { + "epoch": 0.13480707013036444, + "grad_norm": 1.6597939729690552, + "learning_rate": 4.779150522994965e-05, + "loss": 5.1885, + "step": 22667 + }, + { + "epoch": 0.13481301741364546, + "grad_norm": 1.8732246160507202, + "learning_rate": 4.779131327423732e-05, + "loss": 4.7274, + "step": 22668 + }, + { + "epoch": 0.13481896469692645, + "grad_norm": 1.6462973356246948, + "learning_rate": 4.7791121310568765e-05, + "loss": 5.0614, + "step": 22669 + }, + { + "epoch": 0.13482491198020743, + "grad_norm": 1.5832293033599854, + "learning_rate": 4.7790929338944065e-05, + "loss": 5.4794, + "step": 22670 + }, + { + "epoch": 0.13483085926348845, + "grad_norm": 1.8505337238311768, + "learning_rate": 4.7790737359363293e-05, + "loss": 5.3381, + "step": 22671 + }, + { + "epoch": 0.13483680654676944, + "grad_norm": 1.4535889625549316, + "learning_rate": 4.7790545371826504e-05, + "loss": 5.1247, + "step": 22672 + }, + { + "epoch": 0.13484275383005043, + "grad_norm": 2.478214979171753, + "learning_rate": 4.779035337633377e-05, + "loss": 5.2909, + "step": 22673 + }, + { + "epoch": 0.13484870111333144, + "grad_norm": 1.3034166097640991, + "learning_rate": 4.7790161372885176e-05, + "loss": 5.36, + "step": 22674 + }, + { + "epoch": 0.13485464839661243, + "grad_norm": 1.6429485082626343, + "learning_rate": 4.778996936148076e-05, + "loss": 5.5559, + "step": 22675 + }, + { + "epoch": 0.13486059567989342, + "grad_norm": 1.7537177801132202, + "learning_rate": 4.77897773421206e-05, + "loss": 5.3665, + "step": 22676 + }, + { + "epoch": 0.13486654296317444, + "grad_norm": 1.7982977628707886, + "learning_rate": 4.778958531480476e-05, + "loss": 5.5078, + "step": 22677 + }, + { + "epoch": 0.13487249024645542, + "grad_norm": 1.5147206783294678, + "learning_rate": 4.7789393279533315e-05, + "loss": 5.6726, + "step": 22678 + }, + { + "epoch": 0.1348784375297364, + "grad_norm": 1.405532956123352, + "learning_rate": 4.778920123630634e-05, + "loss": 5.4188, + "step": 22679 + }, + { + "epoch": 0.13488438481301743, + "grad_norm": 1.4880021810531616, + "learning_rate": 4.778900918512387e-05, + "loss": 5.4478, + "step": 22680 + }, + { + "epoch": 0.13489033209629842, + "grad_norm": 1.4672034978866577, + "learning_rate": 4.7788817125986006e-05, + "loss": 5.2975, + "step": 22681 + }, + { + "epoch": 0.1348962793795794, + "grad_norm": 1.5284076929092407, + "learning_rate": 4.77886250588928e-05, + "loss": 5.008, + "step": 22682 + }, + { + "epoch": 0.1349022266628604, + "grad_norm": 1.6853814125061035, + "learning_rate": 4.778843298384431e-05, + "loss": 4.5719, + "step": 22683 + }, + { + "epoch": 0.1349081739461414, + "grad_norm": 1.8264626264572144, + "learning_rate": 4.778824090084063e-05, + "loss": 4.7764, + "step": 22684 + }, + { + "epoch": 0.1349141212294224, + "grad_norm": 1.3100756406784058, + "learning_rate": 4.77880488098818e-05, + "loss": 4.9967, + "step": 22685 + }, + { + "epoch": 0.13492006851270338, + "grad_norm": 1.5330268144607544, + "learning_rate": 4.7787856710967895e-05, + "loss": 4.6979, + "step": 22686 + }, + { + "epoch": 0.1349260157959844, + "grad_norm": 1.5872783660888672, + "learning_rate": 4.778766460409899e-05, + "loss": 4.9115, + "step": 22687 + }, + { + "epoch": 0.1349319630792654, + "grad_norm": 1.7895172834396362, + "learning_rate": 4.778747248927515e-05, + "loss": 4.9802, + "step": 22688 + }, + { + "epoch": 0.13493791036254638, + "grad_norm": 1.7277544736862183, + "learning_rate": 4.778728036649643e-05, + "loss": 5.2551, + "step": 22689 + }, + { + "epoch": 0.1349438576458274, + "grad_norm": 1.6623975038528442, + "learning_rate": 4.778708823576291e-05, + "loss": 5.4733, + "step": 22690 + }, + { + "epoch": 0.13494980492910838, + "grad_norm": 1.5472412109375, + "learning_rate": 4.7786896097074655e-05, + "loss": 5.3827, + "step": 22691 + }, + { + "epoch": 0.13495575221238937, + "grad_norm": 1.5824527740478516, + "learning_rate": 4.778670395043173e-05, + "loss": 5.1529, + "step": 22692 + }, + { + "epoch": 0.13496169949567038, + "grad_norm": 1.702009916305542, + "learning_rate": 4.77865117958342e-05, + "loss": 4.9916, + "step": 22693 + }, + { + "epoch": 0.13496764677895137, + "grad_norm": 1.653401255607605, + "learning_rate": 4.778631963328214e-05, + "loss": 5.3644, + "step": 22694 + }, + { + "epoch": 0.13497359406223236, + "grad_norm": 1.7365010976791382, + "learning_rate": 4.7786127462775604e-05, + "loss": 5.6488, + "step": 22695 + }, + { + "epoch": 0.13497954134551338, + "grad_norm": 1.749050498008728, + "learning_rate": 4.778593528431467e-05, + "loss": 5.6256, + "step": 22696 + }, + { + "epoch": 0.13498548862879436, + "grad_norm": 1.8504292964935303, + "learning_rate": 4.7785743097899394e-05, + "loss": 5.3972, + "step": 22697 + }, + { + "epoch": 0.13499143591207535, + "grad_norm": 1.6481549739837646, + "learning_rate": 4.7785550903529864e-05, + "loss": 5.2532, + "step": 22698 + }, + { + "epoch": 0.13499738319535637, + "grad_norm": 1.6081243753433228, + "learning_rate": 4.778535870120612e-05, + "loss": 5.2455, + "step": 22699 + }, + { + "epoch": 0.13500333047863736, + "grad_norm": 1.7087515592575073, + "learning_rate": 4.7785166490928246e-05, + "loss": 5.3115, + "step": 22700 + }, + { + "epoch": 0.13500927776191834, + "grad_norm": 1.626558780670166, + "learning_rate": 4.7784974272696314e-05, + "loss": 4.9586, + "step": 22701 + }, + { + "epoch": 0.13501522504519936, + "grad_norm": 1.5453464984893799, + "learning_rate": 4.778478204651038e-05, + "loss": 5.3882, + "step": 22702 + }, + { + "epoch": 0.13502117232848035, + "grad_norm": 1.602817416191101, + "learning_rate": 4.778458981237051e-05, + "loss": 5.1293, + "step": 22703 + }, + { + "epoch": 0.13502711961176134, + "grad_norm": 1.642824411392212, + "learning_rate": 4.778439757027677e-05, + "loss": 5.25, + "step": 22704 + }, + { + "epoch": 0.13503306689504235, + "grad_norm": 1.544092059135437, + "learning_rate": 4.7784205320229245e-05, + "loss": 5.4593, + "step": 22705 + }, + { + "epoch": 0.13503901417832334, + "grad_norm": 1.5194666385650635, + "learning_rate": 4.778401306222798e-05, + "loss": 5.1281, + "step": 22706 + }, + { + "epoch": 0.13504496146160433, + "grad_norm": 1.5252684354782104, + "learning_rate": 4.778382079627305e-05, + "loss": 5.2614, + "step": 22707 + }, + { + "epoch": 0.13505090874488535, + "grad_norm": 1.3341602087020874, + "learning_rate": 4.778362852236453e-05, + "loss": 5.6714, + "step": 22708 + }, + { + "epoch": 0.13505685602816633, + "grad_norm": 1.4264339208602905, + "learning_rate": 4.7783436240502475e-05, + "loss": 5.5506, + "step": 22709 + }, + { + "epoch": 0.13506280331144732, + "grad_norm": 1.7837181091308594, + "learning_rate": 4.778324395068696e-05, + "loss": 5.4757, + "step": 22710 + }, + { + "epoch": 0.13506875059472834, + "grad_norm": 1.6878288984298706, + "learning_rate": 4.7783051652918054e-05, + "loss": 5.4745, + "step": 22711 + }, + { + "epoch": 0.13507469787800933, + "grad_norm": 1.4143346548080444, + "learning_rate": 4.778285934719582e-05, + "loss": 5.5602, + "step": 22712 + }, + { + "epoch": 0.1350806451612903, + "grad_norm": 1.4829423427581787, + "learning_rate": 4.778266703352032e-05, + "loss": 5.4767, + "step": 22713 + }, + { + "epoch": 0.13508659244457133, + "grad_norm": 1.5431561470031738, + "learning_rate": 4.778247471189163e-05, + "loss": 5.532, + "step": 22714 + }, + { + "epoch": 0.13509253972785232, + "grad_norm": 1.6398223638534546, + "learning_rate": 4.7782282382309814e-05, + "loss": 5.4421, + "step": 22715 + }, + { + "epoch": 0.1350984870111333, + "grad_norm": 1.7385345697402954, + "learning_rate": 4.778209004477494e-05, + "loss": 4.9767, + "step": 22716 + }, + { + "epoch": 0.13510443429441432, + "grad_norm": 1.659159541130066, + "learning_rate": 4.7781897699287066e-05, + "loss": 5.2567, + "step": 22717 + }, + { + "epoch": 0.1351103815776953, + "grad_norm": 1.665582299232483, + "learning_rate": 4.7781705345846274e-05, + "loss": 4.9557, + "step": 22718 + }, + { + "epoch": 0.1351163288609763, + "grad_norm": 1.603225827217102, + "learning_rate": 4.7781512984452614e-05, + "loss": 5.3373, + "step": 22719 + }, + { + "epoch": 0.13512227614425731, + "grad_norm": 2.11853289604187, + "learning_rate": 4.7781320615106176e-05, + "loss": 4.9767, + "step": 22720 + }, + { + "epoch": 0.1351282234275383, + "grad_norm": 1.463710069656372, + "learning_rate": 4.7781128237807006e-05, + "loss": 5.0996, + "step": 22721 + }, + { + "epoch": 0.1351341707108193, + "grad_norm": 1.785783290863037, + "learning_rate": 4.7780935852555186e-05, + "loss": 5.0664, + "step": 22722 + }, + { + "epoch": 0.1351401179941003, + "grad_norm": 1.6467021703720093, + "learning_rate": 4.778074345935078e-05, + "loss": 5.0879, + "step": 22723 + }, + { + "epoch": 0.1351460652773813, + "grad_norm": 1.7273554801940918, + "learning_rate": 4.7780551058193834e-05, + "loss": 5.1165, + "step": 22724 + }, + { + "epoch": 0.13515201256066228, + "grad_norm": 1.7785577774047852, + "learning_rate": 4.7780358649084443e-05, + "loss": 4.9459, + "step": 22725 + }, + { + "epoch": 0.1351579598439433, + "grad_norm": 1.6499429941177368, + "learning_rate": 4.7780166232022674e-05, + "loss": 5.3581, + "step": 22726 + }, + { + "epoch": 0.1351639071272243, + "grad_norm": 1.651881217956543, + "learning_rate": 4.777997380700857e-05, + "loss": 5.215, + "step": 22727 + }, + { + "epoch": 0.13516985441050527, + "grad_norm": 1.726369857788086, + "learning_rate": 4.7779781374042215e-05, + "loss": 4.8891, + "step": 22728 + }, + { + "epoch": 0.1351758016937863, + "grad_norm": 1.5628979206085205, + "learning_rate": 4.7779588933123675e-05, + "loss": 5.0173, + "step": 22729 + }, + { + "epoch": 0.13518174897706728, + "grad_norm": 2.179954767227173, + "learning_rate": 4.777939648425302e-05, + "loss": 5.0088, + "step": 22730 + }, + { + "epoch": 0.13518769626034827, + "grad_norm": 1.5813510417938232, + "learning_rate": 4.777920402743031e-05, + "loss": 5.064, + "step": 22731 + }, + { + "epoch": 0.13519364354362928, + "grad_norm": 1.4100569486618042, + "learning_rate": 4.7779011562655616e-05, + "loss": 5.5696, + "step": 22732 + }, + { + "epoch": 0.13519959082691027, + "grad_norm": 1.4252601861953735, + "learning_rate": 4.7778819089929e-05, + "loss": 5.4797, + "step": 22733 + }, + { + "epoch": 0.13520553811019126, + "grad_norm": 1.5482890605926514, + "learning_rate": 4.7778626609250546e-05, + "loss": 5.7168, + "step": 22734 + }, + { + "epoch": 0.13521148539347227, + "grad_norm": 1.7441178560256958, + "learning_rate": 4.77784341206203e-05, + "loss": 5.5385, + "step": 22735 + }, + { + "epoch": 0.13521743267675326, + "grad_norm": 1.5903903245925903, + "learning_rate": 4.777824162403833e-05, + "loss": 5.4181, + "step": 22736 + }, + { + "epoch": 0.13522337996003425, + "grad_norm": 1.6240642070770264, + "learning_rate": 4.777804911950472e-05, + "loss": 5.5071, + "step": 22737 + }, + { + "epoch": 0.13522932724331527, + "grad_norm": 1.4418225288391113, + "learning_rate": 4.7777856607019536e-05, + "loss": 5.6326, + "step": 22738 + }, + { + "epoch": 0.13523527452659626, + "grad_norm": 1.618449330329895, + "learning_rate": 4.7777664086582823e-05, + "loss": 5.4445, + "step": 22739 + }, + { + "epoch": 0.13524122180987724, + "grad_norm": 1.7598767280578613, + "learning_rate": 4.777747155819467e-05, + "loss": 5.3207, + "step": 22740 + }, + { + "epoch": 0.13524716909315823, + "grad_norm": 1.707531213760376, + "learning_rate": 4.7777279021855134e-05, + "loss": 5.2888, + "step": 22741 + }, + { + "epoch": 0.13525311637643925, + "grad_norm": 1.8292144536972046, + "learning_rate": 4.777708647756429e-05, + "loss": 4.897, + "step": 22742 + }, + { + "epoch": 0.13525906365972024, + "grad_norm": 1.893703818321228, + "learning_rate": 4.77768939253222e-05, + "loss": 4.8088, + "step": 22743 + }, + { + "epoch": 0.13526501094300122, + "grad_norm": 1.6884989738464355, + "learning_rate": 4.777670136512893e-05, + "loss": 5.183, + "step": 22744 + }, + { + "epoch": 0.13527095822628224, + "grad_norm": 1.8513271808624268, + "learning_rate": 4.777650879698454e-05, + "loss": 4.6775, + "step": 22745 + }, + { + "epoch": 0.13527690550956323, + "grad_norm": 1.5597106218338013, + "learning_rate": 4.777631622088912e-05, + "loss": 5.268, + "step": 22746 + }, + { + "epoch": 0.13528285279284422, + "grad_norm": 1.6159777641296387, + "learning_rate": 4.777612363684272e-05, + "loss": 5.223, + "step": 22747 + }, + { + "epoch": 0.13528880007612523, + "grad_norm": 1.6712334156036377, + "learning_rate": 4.777593104484541e-05, + "loss": 5.1676, + "step": 22748 + }, + { + "epoch": 0.13529474735940622, + "grad_norm": 1.4349523782730103, + "learning_rate": 4.7775738444897253e-05, + "loss": 5.3066, + "step": 22749 + }, + { + "epoch": 0.1353006946426872, + "grad_norm": 1.6191719770431519, + "learning_rate": 4.7775545836998324e-05, + "loss": 5.2426, + "step": 22750 + }, + { + "epoch": 0.13530664192596822, + "grad_norm": 1.8324687480926514, + "learning_rate": 4.777535322114869e-05, + "loss": 5.2352, + "step": 22751 + }, + { + "epoch": 0.1353125892092492, + "grad_norm": 1.5355842113494873, + "learning_rate": 4.777516059734841e-05, + "loss": 5.5875, + "step": 22752 + }, + { + "epoch": 0.1353185364925302, + "grad_norm": 1.6957530975341797, + "learning_rate": 4.777496796559756e-05, + "loss": 5.4624, + "step": 22753 + }, + { + "epoch": 0.13532448377581122, + "grad_norm": 1.6195729970932007, + "learning_rate": 4.7774775325896205e-05, + "loss": 5.2686, + "step": 22754 + }, + { + "epoch": 0.1353304310590922, + "grad_norm": 1.429439663887024, + "learning_rate": 4.7774582678244406e-05, + "loss": 5.3407, + "step": 22755 + }, + { + "epoch": 0.1353363783423732, + "grad_norm": 1.4609668254852295, + "learning_rate": 4.777439002264225e-05, + "loss": 5.4332, + "step": 22756 + }, + { + "epoch": 0.1353423256256542, + "grad_norm": 1.3537366390228271, + "learning_rate": 4.7774197359089765e-05, + "loss": 5.4353, + "step": 22757 + }, + { + "epoch": 0.1353482729089352, + "grad_norm": 1.6953861713409424, + "learning_rate": 4.7774004687587057e-05, + "loss": 5.1824, + "step": 22758 + }, + { + "epoch": 0.13535422019221618, + "grad_norm": 1.3835570812225342, + "learning_rate": 4.7773812008134186e-05, + "loss": 5.1748, + "step": 22759 + }, + { + "epoch": 0.1353601674754972, + "grad_norm": 1.94771146774292, + "learning_rate": 4.7773619320731206e-05, + "loss": 4.7599, + "step": 22760 + }, + { + "epoch": 0.1353661147587782, + "grad_norm": 1.56703782081604, + "learning_rate": 4.777342662537819e-05, + "loss": 5.4686, + "step": 22761 + }, + { + "epoch": 0.13537206204205918, + "grad_norm": 1.627790093421936, + "learning_rate": 4.77732339220752e-05, + "loss": 5.4504, + "step": 22762 + }, + { + "epoch": 0.1353780093253402, + "grad_norm": 1.5668286085128784, + "learning_rate": 4.777304121082232e-05, + "loss": 5.5147, + "step": 22763 + }, + { + "epoch": 0.13538395660862118, + "grad_norm": 1.7350172996520996, + "learning_rate": 4.7772848491619606e-05, + "loss": 5.1803, + "step": 22764 + }, + { + "epoch": 0.13538990389190217, + "grad_norm": 1.700966715812683, + "learning_rate": 4.7772655764467124e-05, + "loss": 5.1222, + "step": 22765 + }, + { + "epoch": 0.13539585117518319, + "grad_norm": 1.7613048553466797, + "learning_rate": 4.777246302936494e-05, + "loss": 5.1391, + "step": 22766 + }, + { + "epoch": 0.13540179845846417, + "grad_norm": 1.7095452547073364, + "learning_rate": 4.777227028631312e-05, + "loss": 5.112, + "step": 22767 + }, + { + "epoch": 0.13540774574174516, + "grad_norm": 1.8310586214065552, + "learning_rate": 4.7772077535311744e-05, + "loss": 5.0404, + "step": 22768 + }, + { + "epoch": 0.13541369302502618, + "grad_norm": 1.7058879137039185, + "learning_rate": 4.777188477636087e-05, + "loss": 5.1165, + "step": 22769 + }, + { + "epoch": 0.13541964030830717, + "grad_norm": 1.7806624174118042, + "learning_rate": 4.7771692009460565e-05, + "loss": 5.0711, + "step": 22770 + }, + { + "epoch": 0.13542558759158815, + "grad_norm": 1.8086166381835938, + "learning_rate": 4.777149923461089e-05, + "loss": 4.7757, + "step": 22771 + }, + { + "epoch": 0.13543153487486917, + "grad_norm": 1.9984580278396606, + "learning_rate": 4.777130645181194e-05, + "loss": 4.918, + "step": 22772 + }, + { + "epoch": 0.13543748215815016, + "grad_norm": 1.6648451089859009, + "learning_rate": 4.777111366106375e-05, + "loss": 5.0051, + "step": 22773 + }, + { + "epoch": 0.13544342944143115, + "grad_norm": 1.6590383052825928, + "learning_rate": 4.77709208623664e-05, + "loss": 5.6166, + "step": 22774 + }, + { + "epoch": 0.13544937672471216, + "grad_norm": 1.4530583620071411, + "learning_rate": 4.777072805571995e-05, + "loss": 5.6772, + "step": 22775 + }, + { + "epoch": 0.13545532400799315, + "grad_norm": 1.5310078859329224, + "learning_rate": 4.777053524112448e-05, + "loss": 4.9965, + "step": 22776 + }, + { + "epoch": 0.13546127129127414, + "grad_norm": 1.5363576412200928, + "learning_rate": 4.777034241858005e-05, + "loss": 5.2144, + "step": 22777 + }, + { + "epoch": 0.13546721857455515, + "grad_norm": 1.7318395376205444, + "learning_rate": 4.7770149588086735e-05, + "loss": 5.2367, + "step": 22778 + }, + { + "epoch": 0.13547316585783614, + "grad_norm": 1.567736268043518, + "learning_rate": 4.776995674964459e-05, + "loss": 5.4778, + "step": 22779 + }, + { + "epoch": 0.13547911314111713, + "grad_norm": 1.879223108291626, + "learning_rate": 4.7769763903253685e-05, + "loss": 4.8963, + "step": 22780 + }, + { + "epoch": 0.13548506042439815, + "grad_norm": 1.6292016506195068, + "learning_rate": 4.77695710489141e-05, + "loss": 5.2529, + "step": 22781 + }, + { + "epoch": 0.13549100770767913, + "grad_norm": 1.4838228225708008, + "learning_rate": 4.7769378186625885e-05, + "loss": 5.5594, + "step": 22782 + }, + { + "epoch": 0.13549695499096012, + "grad_norm": 1.4567928314208984, + "learning_rate": 4.776918531638912e-05, + "loss": 5.5789, + "step": 22783 + }, + { + "epoch": 0.13550290227424114, + "grad_norm": 1.6464484930038452, + "learning_rate": 4.776899243820386e-05, + "loss": 5.4319, + "step": 22784 + }, + { + "epoch": 0.13550884955752213, + "grad_norm": 1.501028060913086, + "learning_rate": 4.776879955207019e-05, + "loss": 5.5543, + "step": 22785 + }, + { + "epoch": 0.13551479684080311, + "grad_norm": 1.6811163425445557, + "learning_rate": 4.776860665798816e-05, + "loss": 5.4512, + "step": 22786 + }, + { + "epoch": 0.13552074412408413, + "grad_norm": 1.762147068977356, + "learning_rate": 4.7768413755957854e-05, + "loss": 5.6262, + "step": 22787 + }, + { + "epoch": 0.13552669140736512, + "grad_norm": 1.846987009048462, + "learning_rate": 4.7768220845979315e-05, + "loss": 5.4735, + "step": 22788 + }, + { + "epoch": 0.1355326386906461, + "grad_norm": 1.9326568841934204, + "learning_rate": 4.776802792805264e-05, + "loss": 5.3295, + "step": 22789 + }, + { + "epoch": 0.13553858597392712, + "grad_norm": 1.5496313571929932, + "learning_rate": 4.7767835002177874e-05, + "loss": 5.4742, + "step": 22790 + }, + { + "epoch": 0.1355445332572081, + "grad_norm": 1.3328933715820312, + "learning_rate": 4.776764206835509e-05, + "loss": 5.5611, + "step": 22791 + }, + { + "epoch": 0.1355504805404891, + "grad_norm": 1.3349891901016235, + "learning_rate": 4.776744912658437e-05, + "loss": 5.5732, + "step": 22792 + }, + { + "epoch": 0.13555642782377011, + "grad_norm": 1.510608434677124, + "learning_rate": 4.776725617686576e-05, + "loss": 5.4108, + "step": 22793 + }, + { + "epoch": 0.1355623751070511, + "grad_norm": 1.4556225538253784, + "learning_rate": 4.776706321919934e-05, + "loss": 5.5154, + "step": 22794 + }, + { + "epoch": 0.1355683223903321, + "grad_norm": 1.7231537103652954, + "learning_rate": 4.776687025358516e-05, + "loss": 5.4437, + "step": 22795 + }, + { + "epoch": 0.1355742696736131, + "grad_norm": 1.6234036684036255, + "learning_rate": 4.7766677280023314e-05, + "loss": 5.2642, + "step": 22796 + }, + { + "epoch": 0.1355802169568941, + "grad_norm": 1.6550066471099854, + "learning_rate": 4.776648429851385e-05, + "loss": 5.3577, + "step": 22797 + }, + { + "epoch": 0.13558616424017508, + "grad_norm": 1.5199332237243652, + "learning_rate": 4.776629130905684e-05, + "loss": 4.9679, + "step": 22798 + }, + { + "epoch": 0.13559211152345607, + "grad_norm": 1.5900238752365112, + "learning_rate": 4.776609831165236e-05, + "loss": 5.5357, + "step": 22799 + }, + { + "epoch": 0.1355980588067371, + "grad_norm": 1.4585398435592651, + "learning_rate": 4.776590530630047e-05, + "loss": 5.4191, + "step": 22800 + }, + { + "epoch": 0.13560400609001808, + "grad_norm": 1.4049118757247925, + "learning_rate": 4.7765712293001234e-05, + "loss": 5.4423, + "step": 22801 + }, + { + "epoch": 0.13560995337329906, + "grad_norm": 1.5287877321243286, + "learning_rate": 4.7765519271754726e-05, + "loss": 5.4635, + "step": 22802 + }, + { + "epoch": 0.13561590065658008, + "grad_norm": 1.4761078357696533, + "learning_rate": 4.776532624256101e-05, + "loss": 5.394, + "step": 22803 + }, + { + "epoch": 0.13562184793986107, + "grad_norm": 1.523536205291748, + "learning_rate": 4.776513320542015e-05, + "loss": 5.4171, + "step": 22804 + }, + { + "epoch": 0.13562779522314206, + "grad_norm": 1.701953411102295, + "learning_rate": 4.7764940160332214e-05, + "loss": 5.336, + "step": 22805 + }, + { + "epoch": 0.13563374250642307, + "grad_norm": 1.5426260232925415, + "learning_rate": 4.7764747107297284e-05, + "loss": 5.5175, + "step": 22806 + }, + { + "epoch": 0.13563968978970406, + "grad_norm": 1.5670596361160278, + "learning_rate": 4.776455404631541e-05, + "loss": 5.4254, + "step": 22807 + }, + { + "epoch": 0.13564563707298505, + "grad_norm": 1.4388494491577148, + "learning_rate": 4.7764360977386666e-05, + "loss": 5.3282, + "step": 22808 + }, + { + "epoch": 0.13565158435626606, + "grad_norm": 1.4222092628479004, + "learning_rate": 4.776416790051111e-05, + "loss": 5.5187, + "step": 22809 + }, + { + "epoch": 0.13565753163954705, + "grad_norm": 1.604407787322998, + "learning_rate": 4.776397481568883e-05, + "loss": 5.3026, + "step": 22810 + }, + { + "epoch": 0.13566347892282804, + "grad_norm": 1.4160562753677368, + "learning_rate": 4.776378172291988e-05, + "loss": 5.2925, + "step": 22811 + }, + { + "epoch": 0.13566942620610906, + "grad_norm": 1.543260931968689, + "learning_rate": 4.776358862220433e-05, + "loss": 5.4234, + "step": 22812 + }, + { + "epoch": 0.13567537348939004, + "grad_norm": 1.6589266061782837, + "learning_rate": 4.776339551354224e-05, + "loss": 5.0677, + "step": 22813 + }, + { + "epoch": 0.13568132077267103, + "grad_norm": 1.5909267663955688, + "learning_rate": 4.7763202396933696e-05, + "loss": 5.145, + "step": 22814 + }, + { + "epoch": 0.13568726805595205, + "grad_norm": 1.4697500467300415, + "learning_rate": 4.776300927237873e-05, + "loss": 5.2856, + "step": 22815 + }, + { + "epoch": 0.13569321533923304, + "grad_norm": 1.895766019821167, + "learning_rate": 4.7762816139877456e-05, + "loss": 5.3554, + "step": 22816 + }, + { + "epoch": 0.13569916262251402, + "grad_norm": 1.8093748092651367, + "learning_rate": 4.7762622999429905e-05, + "loss": 4.9482, + "step": 22817 + }, + { + "epoch": 0.13570510990579504, + "grad_norm": 1.6899988651275635, + "learning_rate": 4.776242985103616e-05, + "loss": 5.1788, + "step": 22818 + }, + { + "epoch": 0.13571105718907603, + "grad_norm": 1.8199821710586548, + "learning_rate": 4.7762236694696294e-05, + "loss": 4.9181, + "step": 22819 + }, + { + "epoch": 0.13571700447235702, + "grad_norm": 1.7687036991119385, + "learning_rate": 4.776204353041036e-05, + "loss": 4.9925, + "step": 22820 + }, + { + "epoch": 0.13572295175563803, + "grad_norm": 1.705419659614563, + "learning_rate": 4.776185035817843e-05, + "loss": 5.0644, + "step": 22821 + }, + { + "epoch": 0.13572889903891902, + "grad_norm": 1.7805287837982178, + "learning_rate": 4.7761657178000575e-05, + "loss": 5.1567, + "step": 22822 + }, + { + "epoch": 0.1357348463222, + "grad_norm": 1.4791945219039917, + "learning_rate": 4.776146398987686e-05, + "loss": 5.2834, + "step": 22823 + }, + { + "epoch": 0.13574079360548102, + "grad_norm": 1.546128749847412, + "learning_rate": 4.776127079380735e-05, + "loss": 4.8066, + "step": 22824 + }, + { + "epoch": 0.135746740888762, + "grad_norm": 1.6163334846496582, + "learning_rate": 4.776107758979212e-05, + "loss": 5.1771, + "step": 22825 + }, + { + "epoch": 0.135752688172043, + "grad_norm": 1.6902676820755005, + "learning_rate": 4.776088437783123e-05, + "loss": 4.9249, + "step": 22826 + }, + { + "epoch": 0.13575863545532402, + "grad_norm": 1.4966270923614502, + "learning_rate": 4.776069115792475e-05, + "loss": 5.6609, + "step": 22827 + }, + { + "epoch": 0.135764582738605, + "grad_norm": 1.6107707023620605, + "learning_rate": 4.7760497930072754e-05, + "loss": 5.4167, + "step": 22828 + }, + { + "epoch": 0.135770530021886, + "grad_norm": 1.5773305892944336, + "learning_rate": 4.77603046942753e-05, + "loss": 5.4044, + "step": 22829 + }, + { + "epoch": 0.135776477305167, + "grad_norm": 1.6871259212493896, + "learning_rate": 4.7760111450532454e-05, + "loss": 5.5288, + "step": 22830 + }, + { + "epoch": 0.135782424588448, + "grad_norm": 1.4027100801467896, + "learning_rate": 4.77599181988443e-05, + "loss": 5.265, + "step": 22831 + }, + { + "epoch": 0.13578837187172899, + "grad_norm": 1.7435009479522705, + "learning_rate": 4.775972493921088e-05, + "loss": 5.3546, + "step": 22832 + }, + { + "epoch": 0.13579431915501, + "grad_norm": 1.4834927320480347, + "learning_rate": 4.7759531671632286e-05, + "loss": 5.168, + "step": 22833 + }, + { + "epoch": 0.135800266438291, + "grad_norm": 1.6468613147735596, + "learning_rate": 4.775933839610857e-05, + "loss": 5.0984, + "step": 22834 + }, + { + "epoch": 0.13580621372157198, + "grad_norm": 1.6906235218048096, + "learning_rate": 4.77591451126398e-05, + "loss": 5.0563, + "step": 22835 + }, + { + "epoch": 0.135812161004853, + "grad_norm": 1.2667183876037598, + "learning_rate": 4.775895182122605e-05, + "loss": 5.7256, + "step": 22836 + }, + { + "epoch": 0.13581810828813398, + "grad_norm": 1.381974697113037, + "learning_rate": 4.775875852186739e-05, + "loss": 5.6773, + "step": 22837 + }, + { + "epoch": 0.13582405557141497, + "grad_norm": 1.395326018333435, + "learning_rate": 4.775856521456388e-05, + "loss": 5.4884, + "step": 22838 + }, + { + "epoch": 0.13583000285469599, + "grad_norm": 1.4601794481277466, + "learning_rate": 4.775837189931559e-05, + "loss": 5.6866, + "step": 22839 + }, + { + "epoch": 0.13583595013797697, + "grad_norm": 1.3722656965255737, + "learning_rate": 4.7758178576122584e-05, + "loss": 5.7885, + "step": 22840 + }, + { + "epoch": 0.13584189742125796, + "grad_norm": 1.5126278400421143, + "learning_rate": 4.775798524498494e-05, + "loss": 5.5806, + "step": 22841 + }, + { + "epoch": 0.13584784470453898, + "grad_norm": 1.465306282043457, + "learning_rate": 4.7757791905902714e-05, + "loss": 5.5597, + "step": 22842 + }, + { + "epoch": 0.13585379198781997, + "grad_norm": 1.7111048698425293, + "learning_rate": 4.775759855887598e-05, + "loss": 5.3431, + "step": 22843 + }, + { + "epoch": 0.13585973927110095, + "grad_norm": 1.7369952201843262, + "learning_rate": 4.7757405203904796e-05, + "loss": 5.4373, + "step": 22844 + }, + { + "epoch": 0.13586568655438197, + "grad_norm": 1.571898341178894, + "learning_rate": 4.7757211840989246e-05, + "loss": 5.4751, + "step": 22845 + }, + { + "epoch": 0.13587163383766296, + "grad_norm": 1.6752384901046753, + "learning_rate": 4.775701847012938e-05, + "loss": 5.3411, + "step": 22846 + }, + { + "epoch": 0.13587758112094395, + "grad_norm": 1.3036680221557617, + "learning_rate": 4.775682509132529e-05, + "loss": 5.6136, + "step": 22847 + }, + { + "epoch": 0.13588352840422496, + "grad_norm": 1.60060453414917, + "learning_rate": 4.775663170457701e-05, + "loss": 5.3134, + "step": 22848 + }, + { + "epoch": 0.13588947568750595, + "grad_norm": 1.746317982673645, + "learning_rate": 4.775643830988463e-05, + "loss": 5.1176, + "step": 22849 + }, + { + "epoch": 0.13589542297078694, + "grad_norm": 1.5190258026123047, + "learning_rate": 4.775624490724822e-05, + "loss": 5.2673, + "step": 22850 + }, + { + "epoch": 0.13590137025406795, + "grad_norm": 1.5572645664215088, + "learning_rate": 4.775605149666783e-05, + "loss": 5.7732, + "step": 22851 + }, + { + "epoch": 0.13590731753734894, + "grad_norm": 1.6563985347747803, + "learning_rate": 4.775585807814354e-05, + "loss": 5.3757, + "step": 22852 + }, + { + "epoch": 0.13591326482062993, + "grad_norm": 1.583486795425415, + "learning_rate": 4.775566465167541e-05, + "loss": 5.5406, + "step": 22853 + }, + { + "epoch": 0.13591921210391095, + "grad_norm": 1.9212104082107544, + "learning_rate": 4.7755471217263525e-05, + "loss": 5.5629, + "step": 22854 + }, + { + "epoch": 0.13592515938719194, + "grad_norm": 1.5397447347640991, + "learning_rate": 4.775527777490793e-05, + "loss": 5.5745, + "step": 22855 + }, + { + "epoch": 0.13593110667047292, + "grad_norm": 1.4469612836837769, + "learning_rate": 4.775508432460871e-05, + "loss": 5.5762, + "step": 22856 + }, + { + "epoch": 0.13593705395375394, + "grad_norm": 1.6050552129745483, + "learning_rate": 4.775489086636592e-05, + "loss": 5.2207, + "step": 22857 + }, + { + "epoch": 0.13594300123703493, + "grad_norm": 1.5991270542144775, + "learning_rate": 4.7754697400179636e-05, + "loss": 5.3331, + "step": 22858 + }, + { + "epoch": 0.13594894852031592, + "grad_norm": 1.8474901914596558, + "learning_rate": 4.775450392604992e-05, + "loss": 5.3208, + "step": 22859 + }, + { + "epoch": 0.1359548958035969, + "grad_norm": 1.6865973472595215, + "learning_rate": 4.7754310443976844e-05, + "loss": 5.2557, + "step": 22860 + }, + { + "epoch": 0.13596084308687792, + "grad_norm": 1.9411492347717285, + "learning_rate": 4.775411695396047e-05, + "loss": 5.2765, + "step": 22861 + }, + { + "epoch": 0.1359667903701589, + "grad_norm": 1.6263481378555298, + "learning_rate": 4.775392345600087e-05, + "loss": 5.2767, + "step": 22862 + }, + { + "epoch": 0.1359727376534399, + "grad_norm": 1.7159794569015503, + "learning_rate": 4.7753729950098116e-05, + "loss": 5.5175, + "step": 22863 + }, + { + "epoch": 0.1359786849367209, + "grad_norm": 1.6026562452316284, + "learning_rate": 4.7753536436252266e-05, + "loss": 5.3517, + "step": 22864 + }, + { + "epoch": 0.1359846322200019, + "grad_norm": 1.4052190780639648, + "learning_rate": 4.775334291446339e-05, + "loss": 5.3153, + "step": 22865 + }, + { + "epoch": 0.1359905795032829, + "grad_norm": 1.4030534029006958, + "learning_rate": 4.7753149384731556e-05, + "loss": 5.3798, + "step": 22866 + }, + { + "epoch": 0.1359965267865639, + "grad_norm": 1.5234447717666626, + "learning_rate": 4.775295584705683e-05, + "loss": 5.2717, + "step": 22867 + }, + { + "epoch": 0.1360024740698449, + "grad_norm": 1.6578015089035034, + "learning_rate": 4.775276230143929e-05, + "loss": 5.2482, + "step": 22868 + }, + { + "epoch": 0.13600842135312588, + "grad_norm": 1.427674651145935, + "learning_rate": 4.775256874787899e-05, + "loss": 5.3303, + "step": 22869 + }, + { + "epoch": 0.1360143686364069, + "grad_norm": 1.610268473625183, + "learning_rate": 4.7752375186376006e-05, + "loss": 5.4775, + "step": 22870 + }, + { + "epoch": 0.13602031591968788, + "grad_norm": 1.7097511291503906, + "learning_rate": 4.7752181616930404e-05, + "loss": 5.2721, + "step": 22871 + }, + { + "epoch": 0.13602626320296887, + "grad_norm": 1.6628022193908691, + "learning_rate": 4.775198803954225e-05, + "loss": 5.2049, + "step": 22872 + }, + { + "epoch": 0.1360322104862499, + "grad_norm": 1.6983882188796997, + "learning_rate": 4.7751794454211615e-05, + "loss": 5.1596, + "step": 22873 + }, + { + "epoch": 0.13603815776953088, + "grad_norm": 1.6148128509521484, + "learning_rate": 4.775160086093856e-05, + "loss": 5.3958, + "step": 22874 + }, + { + "epoch": 0.13604410505281186, + "grad_norm": 1.6220009326934814, + "learning_rate": 4.7751407259723155e-05, + "loss": 5.2774, + "step": 22875 + }, + { + "epoch": 0.13605005233609288, + "grad_norm": 1.5017454624176025, + "learning_rate": 4.7751213650565464e-05, + "loss": 5.303, + "step": 22876 + }, + { + "epoch": 0.13605599961937387, + "grad_norm": 1.6734380722045898, + "learning_rate": 4.7751020033465566e-05, + "loss": 5.3784, + "step": 22877 + }, + { + "epoch": 0.13606194690265486, + "grad_norm": 1.8177162408828735, + "learning_rate": 4.775082640842352e-05, + "loss": 5.4498, + "step": 22878 + }, + { + "epoch": 0.13606789418593587, + "grad_norm": 1.6287364959716797, + "learning_rate": 4.7750632775439396e-05, + "loss": 5.3252, + "step": 22879 + }, + { + "epoch": 0.13607384146921686, + "grad_norm": 1.5242222547531128, + "learning_rate": 4.7750439134513267e-05, + "loss": 5.2287, + "step": 22880 + }, + { + "epoch": 0.13607978875249785, + "grad_norm": 1.4447482824325562, + "learning_rate": 4.775024548564519e-05, + "loss": 5.3725, + "step": 22881 + }, + { + "epoch": 0.13608573603577886, + "grad_norm": 1.4994373321533203, + "learning_rate": 4.775005182883523e-05, + "loss": 5.4844, + "step": 22882 + }, + { + "epoch": 0.13609168331905985, + "grad_norm": 1.541668176651001, + "learning_rate": 4.774985816408347e-05, + "loss": 5.4171, + "step": 22883 + }, + { + "epoch": 0.13609763060234084, + "grad_norm": 1.4670990705490112, + "learning_rate": 4.7749664491389965e-05, + "loss": 5.4372, + "step": 22884 + }, + { + "epoch": 0.13610357788562186, + "grad_norm": 1.686318039894104, + "learning_rate": 4.7749470810754796e-05, + "loss": 5.1164, + "step": 22885 + }, + { + "epoch": 0.13610952516890285, + "grad_norm": 1.4744656085968018, + "learning_rate": 4.7749277122178015e-05, + "loss": 5.3787, + "step": 22886 + }, + { + "epoch": 0.13611547245218383, + "grad_norm": 1.498948574066162, + "learning_rate": 4.77490834256597e-05, + "loss": 5.2837, + "step": 22887 + }, + { + "epoch": 0.13612141973546485, + "grad_norm": 1.4990612268447876, + "learning_rate": 4.774888972119991e-05, + "loss": 5.3503, + "step": 22888 + }, + { + "epoch": 0.13612736701874584, + "grad_norm": 1.6973026990890503, + "learning_rate": 4.774869600879872e-05, + "loss": 5.2776, + "step": 22889 + }, + { + "epoch": 0.13613331430202683, + "grad_norm": 1.5271309614181519, + "learning_rate": 4.7748502288456193e-05, + "loss": 5.3318, + "step": 22890 + }, + { + "epoch": 0.13613926158530784, + "grad_norm": 1.5284117460250854, + "learning_rate": 4.7748308560172406e-05, + "loss": 5.2975, + "step": 22891 + }, + { + "epoch": 0.13614520886858883, + "grad_norm": 1.45162034034729, + "learning_rate": 4.774811482394741e-05, + "loss": 5.1825, + "step": 22892 + }, + { + "epoch": 0.13615115615186982, + "grad_norm": 1.558273434638977, + "learning_rate": 4.774792107978129e-05, + "loss": 5.1004, + "step": 22893 + }, + { + "epoch": 0.13615710343515083, + "grad_norm": 1.576781988143921, + "learning_rate": 4.77477273276741e-05, + "loss": 5.4028, + "step": 22894 + }, + { + "epoch": 0.13616305071843182, + "grad_norm": 1.3964447975158691, + "learning_rate": 4.7747533567625916e-05, + "loss": 5.4402, + "step": 22895 + }, + { + "epoch": 0.1361689980017128, + "grad_norm": 1.7266137599945068, + "learning_rate": 4.77473397996368e-05, + "loss": 4.9304, + "step": 22896 + }, + { + "epoch": 0.13617494528499383, + "grad_norm": 1.573444128036499, + "learning_rate": 4.774714602370683e-05, + "loss": 4.9736, + "step": 22897 + }, + { + "epoch": 0.13618089256827481, + "grad_norm": 1.7123498916625977, + "learning_rate": 4.774695223983606e-05, + "loss": 5.3678, + "step": 22898 + }, + { + "epoch": 0.1361868398515558, + "grad_norm": 1.8102420568466187, + "learning_rate": 4.7746758448024566e-05, + "loss": 5.2433, + "step": 22899 + }, + { + "epoch": 0.13619278713483682, + "grad_norm": 1.5984879732131958, + "learning_rate": 4.774656464827242e-05, + "loss": 5.2601, + "step": 22900 + }, + { + "epoch": 0.1361987344181178, + "grad_norm": 1.8117280006408691, + "learning_rate": 4.7746370840579666e-05, + "loss": 5.1488, + "step": 22901 + }, + { + "epoch": 0.1362046817013988, + "grad_norm": 1.6972469091415405, + "learning_rate": 4.7746177024946405e-05, + "loss": 5.337, + "step": 22902 + }, + { + "epoch": 0.1362106289846798, + "grad_norm": 1.4006030559539795, + "learning_rate": 4.7745983201372685e-05, + "loss": 5.4563, + "step": 22903 + }, + { + "epoch": 0.1362165762679608, + "grad_norm": 1.7627719640731812, + "learning_rate": 4.774578936985857e-05, + "loss": 5.0125, + "step": 22904 + }, + { + "epoch": 0.1362225235512418, + "grad_norm": 1.3935896158218384, + "learning_rate": 4.774559553040415e-05, + "loss": 5.2413, + "step": 22905 + }, + { + "epoch": 0.1362284708345228, + "grad_norm": 1.3300725221633911, + "learning_rate": 4.7745401683009464e-05, + "loss": 5.391, + "step": 22906 + }, + { + "epoch": 0.1362344181178038, + "grad_norm": 1.5094577074050903, + "learning_rate": 4.7745207827674596e-05, + "loss": 6.0553, + "step": 22907 + }, + { + "epoch": 0.13624036540108478, + "grad_norm": 1.3816832304000854, + "learning_rate": 4.774501396439961e-05, + "loss": 5.9914, + "step": 22908 + }, + { + "epoch": 0.1362463126843658, + "grad_norm": 1.5488735437393188, + "learning_rate": 4.774482009318458e-05, + "loss": 5.5686, + "step": 22909 + }, + { + "epoch": 0.13625225996764678, + "grad_norm": 1.7096377611160278, + "learning_rate": 4.774462621402957e-05, + "loss": 5.0948, + "step": 22910 + }, + { + "epoch": 0.13625820725092777, + "grad_norm": 1.8099161386489868, + "learning_rate": 4.7744432326934644e-05, + "loss": 5.3055, + "step": 22911 + }, + { + "epoch": 0.1362641545342088, + "grad_norm": 1.5320358276367188, + "learning_rate": 4.7744238431899864e-05, + "loss": 5.467, + "step": 22912 + }, + { + "epoch": 0.13627010181748977, + "grad_norm": 1.928933024406433, + "learning_rate": 4.774404452892531e-05, + "loss": 4.9311, + "step": 22913 + }, + { + "epoch": 0.13627604910077076, + "grad_norm": 1.912596344947815, + "learning_rate": 4.7743850618011046e-05, + "loss": 5.1982, + "step": 22914 + }, + { + "epoch": 0.13628199638405178, + "grad_norm": 1.6227478981018066, + "learning_rate": 4.774365669915714e-05, + "loss": 5.3649, + "step": 22915 + }, + { + "epoch": 0.13628794366733277, + "grad_norm": 1.8333683013916016, + "learning_rate": 4.7743462772363656e-05, + "loss": 4.7404, + "step": 22916 + }, + { + "epoch": 0.13629389095061376, + "grad_norm": 1.6802351474761963, + "learning_rate": 4.7743268837630665e-05, + "loss": 5.2044, + "step": 22917 + }, + { + "epoch": 0.13629983823389474, + "grad_norm": 1.76273775100708, + "learning_rate": 4.774307489495823e-05, + "loss": 4.7032, + "step": 22918 + }, + { + "epoch": 0.13630578551717576, + "grad_norm": 1.8272813558578491, + "learning_rate": 4.7742880944346427e-05, + "loss": 4.6324, + "step": 22919 + }, + { + "epoch": 0.13631173280045675, + "grad_norm": 2.327012777328491, + "learning_rate": 4.7742686985795316e-05, + "loss": 4.3851, + "step": 22920 + }, + { + "epoch": 0.13631768008373774, + "grad_norm": 2.035224199295044, + "learning_rate": 4.7742493019304965e-05, + "loss": 4.2965, + "step": 22921 + }, + { + "epoch": 0.13632362736701875, + "grad_norm": 2.3920044898986816, + "learning_rate": 4.774229904487546e-05, + "loss": 4.237, + "step": 22922 + }, + { + "epoch": 0.13632957465029974, + "grad_norm": 2.3279507160186768, + "learning_rate": 4.7742105062506835e-05, + "loss": 4.3676, + "step": 22923 + }, + { + "epoch": 0.13633552193358073, + "grad_norm": 2.360509157180786, + "learning_rate": 4.7741911072199185e-05, + "loss": 4.1116, + "step": 22924 + }, + { + "epoch": 0.13634146921686174, + "grad_norm": 2.3977739810943604, + "learning_rate": 4.7741717073952573e-05, + "loss": 4.4254, + "step": 22925 + }, + { + "epoch": 0.13634741650014273, + "grad_norm": 2.2043890953063965, + "learning_rate": 4.774152306776706e-05, + "loss": 4.3602, + "step": 22926 + }, + { + "epoch": 0.13635336378342372, + "grad_norm": 2.264444589614868, + "learning_rate": 4.7741329053642714e-05, + "loss": 4.3561, + "step": 22927 + }, + { + "epoch": 0.13635931106670474, + "grad_norm": 1.9636424779891968, + "learning_rate": 4.7741135031579596e-05, + "loss": 4.9631, + "step": 22928 + }, + { + "epoch": 0.13636525834998572, + "grad_norm": 1.9803466796875, + "learning_rate": 4.77409410015778e-05, + "loss": 4.4919, + "step": 22929 + }, + { + "epoch": 0.1363712056332667, + "grad_norm": 2.3046467304229736, + "learning_rate": 4.774074696363736e-05, + "loss": 4.7812, + "step": 22930 + }, + { + "epoch": 0.13637715291654773, + "grad_norm": 1.8447179794311523, + "learning_rate": 4.774055291775837e-05, + "loss": 4.7631, + "step": 22931 + }, + { + "epoch": 0.13638310019982872, + "grad_norm": 1.7349412441253662, + "learning_rate": 4.774035886394089e-05, + "loss": 4.7341, + "step": 22932 + }, + { + "epoch": 0.1363890474831097, + "grad_norm": 1.751775860786438, + "learning_rate": 4.774016480218498e-05, + "loss": 4.9051, + "step": 22933 + }, + { + "epoch": 0.13639499476639072, + "grad_norm": 1.6568492650985718, + "learning_rate": 4.773997073249071e-05, + "loss": 4.9236, + "step": 22934 + }, + { + "epoch": 0.1364009420496717, + "grad_norm": 1.6315816640853882, + "learning_rate": 4.773977665485816e-05, + "loss": 5.0631, + "step": 22935 + }, + { + "epoch": 0.1364068893329527, + "grad_norm": 1.7680082321166992, + "learning_rate": 4.773958256928739e-05, + "loss": 4.7632, + "step": 22936 + }, + { + "epoch": 0.1364128366162337, + "grad_norm": 1.656140923500061, + "learning_rate": 4.773938847577846e-05, + "loss": 4.7978, + "step": 22937 + }, + { + "epoch": 0.1364187838995147, + "grad_norm": 1.9236876964569092, + "learning_rate": 4.773919437433144e-05, + "loss": 4.5575, + "step": 22938 + }, + { + "epoch": 0.1364247311827957, + "grad_norm": 1.98481023311615, + "learning_rate": 4.773900026494641e-05, + "loss": 4.4456, + "step": 22939 + }, + { + "epoch": 0.1364306784660767, + "grad_norm": 1.494399070739746, + "learning_rate": 4.773880614762343e-05, + "loss": 5.3057, + "step": 22940 + }, + { + "epoch": 0.1364366257493577, + "grad_norm": 1.972229242324829, + "learning_rate": 4.773861202236257e-05, + "loss": 4.3849, + "step": 22941 + }, + { + "epoch": 0.13644257303263868, + "grad_norm": 2.0766615867614746, + "learning_rate": 4.773841788916389e-05, + "loss": 4.4249, + "step": 22942 + }, + { + "epoch": 0.1364485203159197, + "grad_norm": 1.9418238401412964, + "learning_rate": 4.773822374802747e-05, + "loss": 4.577, + "step": 22943 + }, + { + "epoch": 0.13645446759920069, + "grad_norm": 2.066725254058838, + "learning_rate": 4.773802959895336e-05, + "loss": 4.3563, + "step": 22944 + }, + { + "epoch": 0.13646041488248167, + "grad_norm": 2.948639154434204, + "learning_rate": 4.773783544194165e-05, + "loss": 3.2644, + "step": 22945 + }, + { + "epoch": 0.1364663621657627, + "grad_norm": 2.065586805343628, + "learning_rate": 4.7737641276992385e-05, + "loss": 5.9715, + "step": 22946 + }, + { + "epoch": 0.13647230944904368, + "grad_norm": 2.169130325317383, + "learning_rate": 4.7737447104105645e-05, + "loss": 4.9516, + "step": 22947 + }, + { + "epoch": 0.13647825673232467, + "grad_norm": 2.4133553504943848, + "learning_rate": 4.773725292328151e-05, + "loss": 5.2266, + "step": 22948 + }, + { + "epoch": 0.13648420401560568, + "grad_norm": 2.4718146324157715, + "learning_rate": 4.773705873452002e-05, + "loss": 5.1842, + "step": 22949 + }, + { + "epoch": 0.13649015129888667, + "grad_norm": 1.8822194337844849, + "learning_rate": 4.773686453782127e-05, + "loss": 4.9297, + "step": 22950 + }, + { + "epoch": 0.13649609858216766, + "grad_norm": 1.8627861738204956, + "learning_rate": 4.773667033318531e-05, + "loss": 4.682, + "step": 22951 + }, + { + "epoch": 0.13650204586544867, + "grad_norm": 2.1915957927703857, + "learning_rate": 4.773647612061222e-05, + "loss": 4.5292, + "step": 22952 + }, + { + "epoch": 0.13650799314872966, + "grad_norm": 2.182401657104492, + "learning_rate": 4.773628190010205e-05, + "loss": 4.6416, + "step": 22953 + }, + { + "epoch": 0.13651394043201065, + "grad_norm": 2.020988941192627, + "learning_rate": 4.773608767165488e-05, + "loss": 4.5698, + "step": 22954 + }, + { + "epoch": 0.13651988771529167, + "grad_norm": 1.5788037776947021, + "learning_rate": 4.773589343527078e-05, + "loss": 5.0962, + "step": 22955 + }, + { + "epoch": 0.13652583499857265, + "grad_norm": 1.929002285003662, + "learning_rate": 4.773569919094982e-05, + "loss": 4.7789, + "step": 22956 + }, + { + "epoch": 0.13653178228185364, + "grad_norm": 1.4314018487930298, + "learning_rate": 4.773550493869206e-05, + "loss": 5.1814, + "step": 22957 + }, + { + "epoch": 0.13653772956513466, + "grad_norm": 1.3779473304748535, + "learning_rate": 4.7735310678497566e-05, + "loss": 5.3468, + "step": 22958 + }, + { + "epoch": 0.13654367684841565, + "grad_norm": 1.543843150138855, + "learning_rate": 4.773511641036641e-05, + "loss": 5.2539, + "step": 22959 + }, + { + "epoch": 0.13654962413169663, + "grad_norm": 1.3671090602874756, + "learning_rate": 4.773492213429866e-05, + "loss": 5.2174, + "step": 22960 + }, + { + "epoch": 0.13655557141497765, + "grad_norm": 1.6130348443984985, + "learning_rate": 4.7734727850294386e-05, + "loss": 5.3554, + "step": 22961 + }, + { + "epoch": 0.13656151869825864, + "grad_norm": 1.4536763429641724, + "learning_rate": 4.773453355835365e-05, + "loss": 5.1686, + "step": 22962 + }, + { + "epoch": 0.13656746598153963, + "grad_norm": 1.4020705223083496, + "learning_rate": 4.773433925847652e-05, + "loss": 5.1832, + "step": 22963 + }, + { + "epoch": 0.13657341326482064, + "grad_norm": 1.5963356494903564, + "learning_rate": 4.773414495066308e-05, + "loss": 5.2799, + "step": 22964 + }, + { + "epoch": 0.13657936054810163, + "grad_norm": 1.235477328300476, + "learning_rate": 4.773395063491338e-05, + "loss": 5.3078, + "step": 22965 + }, + { + "epoch": 0.13658530783138262, + "grad_norm": 1.658551812171936, + "learning_rate": 4.7733756311227484e-05, + "loss": 4.8935, + "step": 22966 + }, + { + "epoch": 0.13659125511466363, + "grad_norm": 1.3750555515289307, + "learning_rate": 4.773356197960548e-05, + "loss": 5.4716, + "step": 22967 + }, + { + "epoch": 0.13659720239794462, + "grad_norm": 1.368320107460022, + "learning_rate": 4.773336764004742e-05, + "loss": 5.3549, + "step": 22968 + }, + { + "epoch": 0.1366031496812256, + "grad_norm": 1.6175824403762817, + "learning_rate": 4.773317329255337e-05, + "loss": 5.6482, + "step": 22969 + }, + { + "epoch": 0.13660909696450663, + "grad_norm": 1.5855069160461426, + "learning_rate": 4.7732978937123404e-05, + "loss": 4.8048, + "step": 22970 + }, + { + "epoch": 0.13661504424778761, + "grad_norm": 1.2763618230819702, + "learning_rate": 4.77327845737576e-05, + "loss": 5.3114, + "step": 22971 + }, + { + "epoch": 0.1366209915310686, + "grad_norm": 1.296797275543213, + "learning_rate": 4.773259020245601e-05, + "loss": 5.2154, + "step": 22972 + }, + { + "epoch": 0.13662693881434962, + "grad_norm": 1.6255276203155518, + "learning_rate": 4.7732395823218714e-05, + "loss": 4.7173, + "step": 22973 + }, + { + "epoch": 0.1366328860976306, + "grad_norm": 1.6712839603424072, + "learning_rate": 4.7732201436045764e-05, + "loss": 4.7129, + "step": 22974 + }, + { + "epoch": 0.1366388333809116, + "grad_norm": 1.3639626502990723, + "learning_rate": 4.773200704093724e-05, + "loss": 5.3616, + "step": 22975 + }, + { + "epoch": 0.13664478066419258, + "grad_norm": 1.5322916507720947, + "learning_rate": 4.773181263789321e-05, + "loss": 4.9117, + "step": 22976 + }, + { + "epoch": 0.1366507279474736, + "grad_norm": 1.5231655836105347, + "learning_rate": 4.7731618226913735e-05, + "loss": 5.3278, + "step": 22977 + }, + { + "epoch": 0.1366566752307546, + "grad_norm": 1.610016942024231, + "learning_rate": 4.7731423807998896e-05, + "loss": 4.8782, + "step": 22978 + }, + { + "epoch": 0.13666262251403558, + "grad_norm": 1.578951358795166, + "learning_rate": 4.773122938114875e-05, + "loss": 5.4874, + "step": 22979 + }, + { + "epoch": 0.1366685697973166, + "grad_norm": 1.7087042331695557, + "learning_rate": 4.773103494636335e-05, + "loss": 5.4259, + "step": 22980 + }, + { + "epoch": 0.13667451708059758, + "grad_norm": 1.4179787635803223, + "learning_rate": 4.773084050364279e-05, + "loss": 5.3227, + "step": 22981 + }, + { + "epoch": 0.13668046436387857, + "grad_norm": 1.6982066631317139, + "learning_rate": 4.773064605298714e-05, + "loss": 4.9789, + "step": 22982 + }, + { + "epoch": 0.13668641164715958, + "grad_norm": 1.6331787109375, + "learning_rate": 4.773045159439644e-05, + "loss": 5.3524, + "step": 22983 + }, + { + "epoch": 0.13669235893044057, + "grad_norm": 1.5722705125808716, + "learning_rate": 4.773025712787078e-05, + "loss": 5.2852, + "step": 22984 + }, + { + "epoch": 0.13669830621372156, + "grad_norm": 1.553524136543274, + "learning_rate": 4.773006265341023e-05, + "loss": 5.3803, + "step": 22985 + }, + { + "epoch": 0.13670425349700258, + "grad_norm": 1.6696399450302124, + "learning_rate": 4.772986817101484e-05, + "loss": 5.1719, + "step": 22986 + }, + { + "epoch": 0.13671020078028356, + "grad_norm": 1.468403935432434, + "learning_rate": 4.772967368068469e-05, + "loss": 5.3468, + "step": 22987 + }, + { + "epoch": 0.13671614806356455, + "grad_norm": 1.5586446523666382, + "learning_rate": 4.772947918241985e-05, + "loss": 5.3733, + "step": 22988 + }, + { + "epoch": 0.13672209534684557, + "grad_norm": 1.549392819404602, + "learning_rate": 4.7729284676220385e-05, + "loss": 5.4622, + "step": 22989 + }, + { + "epoch": 0.13672804263012656, + "grad_norm": 1.4469774961471558, + "learning_rate": 4.772909016208636e-05, + "loss": 5.3998, + "step": 22990 + }, + { + "epoch": 0.13673398991340754, + "grad_norm": 1.3361252546310425, + "learning_rate": 4.7728895640017833e-05, + "loss": 5.1723, + "step": 22991 + }, + { + "epoch": 0.13673993719668856, + "grad_norm": 1.5584652423858643, + "learning_rate": 4.7728701110014894e-05, + "loss": 5.03, + "step": 22992 + }, + { + "epoch": 0.13674588447996955, + "grad_norm": 1.319245457649231, + "learning_rate": 4.7728506572077594e-05, + "loss": 5.0349, + "step": 22993 + }, + { + "epoch": 0.13675183176325054, + "grad_norm": 1.6574468612670898, + "learning_rate": 4.7728312026206015e-05, + "loss": 5.3401, + "step": 22994 + }, + { + "epoch": 0.13675777904653155, + "grad_norm": 1.564598560333252, + "learning_rate": 4.772811747240022e-05, + "loss": 5.3047, + "step": 22995 + }, + { + "epoch": 0.13676372632981254, + "grad_norm": 1.5692095756530762, + "learning_rate": 4.772792291066026e-05, + "loss": 5.1632, + "step": 22996 + }, + { + "epoch": 0.13676967361309353, + "grad_norm": 1.3904811143875122, + "learning_rate": 4.772772834098622e-05, + "loss": 5.2429, + "step": 22997 + }, + { + "epoch": 0.13677562089637454, + "grad_norm": 1.6455345153808594, + "learning_rate": 4.7727533763378175e-05, + "loss": 5.164, + "step": 22998 + }, + { + "epoch": 0.13678156817965553, + "grad_norm": 1.384092092514038, + "learning_rate": 4.772733917783618e-05, + "loss": 4.9753, + "step": 22999 + }, + { + "epoch": 0.13678751546293652, + "grad_norm": 1.5056332349777222, + "learning_rate": 4.77271445843603e-05, + "loss": 5.008, + "step": 23000 + }, + { + "epoch": 0.13679346274621754, + "grad_norm": 1.6766334772109985, + "learning_rate": 4.772694998295061e-05, + "loss": 5.2156, + "step": 23001 + }, + { + "epoch": 0.13679941002949852, + "grad_norm": 1.517899513244629, + "learning_rate": 4.772675537360718e-05, + "loss": 5.4637, + "step": 23002 + }, + { + "epoch": 0.1368053573127795, + "grad_norm": 1.539090633392334, + "learning_rate": 4.772656075633007e-05, + "loss": 4.9678, + "step": 23003 + }, + { + "epoch": 0.13681130459606053, + "grad_norm": 1.5403459072113037, + "learning_rate": 4.772636613111936e-05, + "loss": 5.1884, + "step": 23004 + }, + { + "epoch": 0.13681725187934152, + "grad_norm": 1.4680373668670654, + "learning_rate": 4.7726171497975106e-05, + "loss": 5.118, + "step": 23005 + }, + { + "epoch": 0.1368231991626225, + "grad_norm": 1.6800905466079712, + "learning_rate": 4.7725976856897376e-05, + "loss": 5.5796, + "step": 23006 + }, + { + "epoch": 0.13682914644590352, + "grad_norm": 1.6708084344863892, + "learning_rate": 4.7725782207886246e-05, + "loss": 4.8021, + "step": 23007 + }, + { + "epoch": 0.1368350937291845, + "grad_norm": 1.3744218349456787, + "learning_rate": 4.772558755094177e-05, + "loss": 5.2993, + "step": 23008 + }, + { + "epoch": 0.1368410410124655, + "grad_norm": 1.6822494268417358, + "learning_rate": 4.772539288606405e-05, + "loss": 4.8643, + "step": 23009 + }, + { + "epoch": 0.1368469882957465, + "grad_norm": 1.7003953456878662, + "learning_rate": 4.772519821325311e-05, + "loss": 5.0189, + "step": 23010 + }, + { + "epoch": 0.1368529355790275, + "grad_norm": 1.5518492460250854, + "learning_rate": 4.772500353250905e-05, + "loss": 5.2159, + "step": 23011 + }, + { + "epoch": 0.1368588828623085, + "grad_norm": 1.64122474193573, + "learning_rate": 4.772480884383191e-05, + "loss": 4.8965, + "step": 23012 + }, + { + "epoch": 0.1368648301455895, + "grad_norm": 1.6162265539169312, + "learning_rate": 4.772461414722179e-05, + "loss": 5.1521, + "step": 23013 + }, + { + "epoch": 0.1368707774288705, + "grad_norm": 1.7200851440429688, + "learning_rate": 4.7724419442678736e-05, + "loss": 5.1694, + "step": 23014 + }, + { + "epoch": 0.13687672471215148, + "grad_norm": 1.4717456102371216, + "learning_rate": 4.772422473020283e-05, + "loss": 5.1999, + "step": 23015 + }, + { + "epoch": 0.1368826719954325, + "grad_norm": 1.6320459842681885, + "learning_rate": 4.772403000979412e-05, + "loss": 4.9127, + "step": 23016 + }, + { + "epoch": 0.13688861927871349, + "grad_norm": 1.5466574430465698, + "learning_rate": 4.772383528145269e-05, + "loss": 5.0589, + "step": 23017 + }, + { + "epoch": 0.13689456656199447, + "grad_norm": 1.7745109796524048, + "learning_rate": 4.77236405451786e-05, + "loss": 4.9933, + "step": 23018 + }, + { + "epoch": 0.1369005138452755, + "grad_norm": 1.4493471384048462, + "learning_rate": 4.772344580097193e-05, + "loss": 5.0996, + "step": 23019 + }, + { + "epoch": 0.13690646112855648, + "grad_norm": 1.4859240055084229, + "learning_rate": 4.7723251048832734e-05, + "loss": 5.2686, + "step": 23020 + }, + { + "epoch": 0.13691240841183747, + "grad_norm": 1.6349207162857056, + "learning_rate": 4.7723056288761084e-05, + "loss": 5.1644, + "step": 23021 + }, + { + "epoch": 0.13691835569511848, + "grad_norm": 1.5266002416610718, + "learning_rate": 4.772286152075706e-05, + "loss": 4.988, + "step": 23022 + }, + { + "epoch": 0.13692430297839947, + "grad_norm": 1.592774748802185, + "learning_rate": 4.772266674482071e-05, + "loss": 5.2701, + "step": 23023 + }, + { + "epoch": 0.13693025026168046, + "grad_norm": 1.4789998531341553, + "learning_rate": 4.772247196095211e-05, + "loss": 5.1181, + "step": 23024 + }, + { + "epoch": 0.13693619754496147, + "grad_norm": 1.4374842643737793, + "learning_rate": 4.772227716915134e-05, + "loss": 4.5882, + "step": 23025 + }, + { + "epoch": 0.13694214482824246, + "grad_norm": 1.682689905166626, + "learning_rate": 4.772208236941845e-05, + "loss": 5.5035, + "step": 23026 + }, + { + "epoch": 0.13694809211152345, + "grad_norm": 1.5549851655960083, + "learning_rate": 4.772188756175352e-05, + "loss": 5.5484, + "step": 23027 + }, + { + "epoch": 0.13695403939480447, + "grad_norm": 1.5018965005874634, + "learning_rate": 4.772169274615661e-05, + "loss": 5.0517, + "step": 23028 + }, + { + "epoch": 0.13695998667808545, + "grad_norm": 1.648807168006897, + "learning_rate": 4.77214979226278e-05, + "loss": 5.1527, + "step": 23029 + }, + { + "epoch": 0.13696593396136644, + "grad_norm": 1.6059378385543823, + "learning_rate": 4.772130309116714e-05, + "loss": 5.0003, + "step": 23030 + }, + { + "epoch": 0.13697188124464746, + "grad_norm": 1.368412971496582, + "learning_rate": 4.772110825177472e-05, + "loss": 5.1025, + "step": 23031 + }, + { + "epoch": 0.13697782852792845, + "grad_norm": 1.627031922340393, + "learning_rate": 4.7720913404450576e-05, + "loss": 5.1206, + "step": 23032 + }, + { + "epoch": 0.13698377581120944, + "grad_norm": 1.654307246208191, + "learning_rate": 4.772071854919481e-05, + "loss": 4.8403, + "step": 23033 + }, + { + "epoch": 0.13698972309449042, + "grad_norm": 1.658847451210022, + "learning_rate": 4.772052368600748e-05, + "loss": 5.2089, + "step": 23034 + }, + { + "epoch": 0.13699567037777144, + "grad_norm": 1.6999456882476807, + "learning_rate": 4.772032881488864e-05, + "loss": 5.2022, + "step": 23035 + }, + { + "epoch": 0.13700161766105243, + "grad_norm": 1.2880116701126099, + "learning_rate": 4.772013393583837e-05, + "loss": 5.4331, + "step": 23036 + }, + { + "epoch": 0.13700756494433342, + "grad_norm": 1.4780634641647339, + "learning_rate": 4.7719939048856735e-05, + "loss": 5.034, + "step": 23037 + }, + { + "epoch": 0.13701351222761443, + "grad_norm": 1.5058658123016357, + "learning_rate": 4.771974415394381e-05, + "loss": 5.4403, + "step": 23038 + }, + { + "epoch": 0.13701945951089542, + "grad_norm": 1.4378021955490112, + "learning_rate": 4.771954925109965e-05, + "loss": 5.1769, + "step": 23039 + }, + { + "epoch": 0.1370254067941764, + "grad_norm": 1.6010862588882446, + "learning_rate": 4.7719354340324337e-05, + "loss": 5.4728, + "step": 23040 + }, + { + "epoch": 0.13703135407745742, + "grad_norm": 1.6916764974594116, + "learning_rate": 4.7719159421617924e-05, + "loss": 4.9489, + "step": 23041 + }, + { + "epoch": 0.1370373013607384, + "grad_norm": 1.4737353324890137, + "learning_rate": 4.771896449498049e-05, + "loss": 4.8795, + "step": 23042 + }, + { + "epoch": 0.1370432486440194, + "grad_norm": 1.5808194875717163, + "learning_rate": 4.7718769560412105e-05, + "loss": 4.8375, + "step": 23043 + }, + { + "epoch": 0.13704919592730042, + "grad_norm": 1.3700640201568604, + "learning_rate": 4.771857461791283e-05, + "loss": 4.8135, + "step": 23044 + }, + { + "epoch": 0.1370551432105814, + "grad_norm": 1.1938998699188232, + "learning_rate": 4.7718379667482735e-05, + "loss": 4.8199, + "step": 23045 + }, + { + "epoch": 0.1370610904938624, + "grad_norm": 1.3598859310150146, + "learning_rate": 4.7718184709121885e-05, + "loss": 4.6871, + "step": 23046 + }, + { + "epoch": 0.1370670377771434, + "grad_norm": 1.2303695678710938, + "learning_rate": 4.7717989742830354e-05, + "loss": 4.7421, + "step": 23047 + }, + { + "epoch": 0.1370729850604244, + "grad_norm": 1.2872051000595093, + "learning_rate": 4.77177947686082e-05, + "loss": 4.6669, + "step": 23048 + }, + { + "epoch": 0.13707893234370538, + "grad_norm": 1.2523759603500366, + "learning_rate": 4.771759978645551e-05, + "loss": 4.6359, + "step": 23049 + }, + { + "epoch": 0.1370848796269864, + "grad_norm": 1.2552485466003418, + "learning_rate": 4.771740479637234e-05, + "loss": 4.7362, + "step": 23050 + }, + { + "epoch": 0.1370908269102674, + "grad_norm": 1.434870958328247, + "learning_rate": 4.771720979835875e-05, + "loss": 4.812, + "step": 23051 + }, + { + "epoch": 0.13709677419354838, + "grad_norm": 1.6004719734191895, + "learning_rate": 4.771701479241483e-05, + "loss": 5.1579, + "step": 23052 + }, + { + "epoch": 0.1371027214768294, + "grad_norm": 1.5982462167739868, + "learning_rate": 4.7716819778540625e-05, + "loss": 4.8038, + "step": 23053 + }, + { + "epoch": 0.13710866876011038, + "grad_norm": 1.7509288787841797, + "learning_rate": 4.7716624756736215e-05, + "loss": 5.091, + "step": 23054 + }, + { + "epoch": 0.13711461604339137, + "grad_norm": 1.729748010635376, + "learning_rate": 4.7716429727001665e-05, + "loss": 5.0755, + "step": 23055 + }, + { + "epoch": 0.13712056332667238, + "grad_norm": 1.6167495250701904, + "learning_rate": 4.7716234689337044e-05, + "loss": 5.0602, + "step": 23056 + }, + { + "epoch": 0.13712651060995337, + "grad_norm": 1.7035753726959229, + "learning_rate": 4.771603964374242e-05, + "loss": 5.1877, + "step": 23057 + }, + { + "epoch": 0.13713245789323436, + "grad_norm": 1.5923055410385132, + "learning_rate": 4.7715844590217865e-05, + "loss": 4.6043, + "step": 23058 + }, + { + "epoch": 0.13713840517651538, + "grad_norm": 1.551894187927246, + "learning_rate": 4.771564952876344e-05, + "loss": 5.0746, + "step": 23059 + }, + { + "epoch": 0.13714435245979636, + "grad_norm": 1.8965848684310913, + "learning_rate": 4.771545445937921e-05, + "loss": 4.6152, + "step": 23060 + }, + { + "epoch": 0.13715029974307735, + "grad_norm": 1.630903720855713, + "learning_rate": 4.771525938206527e-05, + "loss": 5.3948, + "step": 23061 + }, + { + "epoch": 0.13715624702635837, + "grad_norm": 1.7285772562026978, + "learning_rate": 4.771506429682166e-05, + "loss": 5.2535, + "step": 23062 + }, + { + "epoch": 0.13716219430963936, + "grad_norm": 1.789049506187439, + "learning_rate": 4.771486920364844e-05, + "loss": 4.7232, + "step": 23063 + }, + { + "epoch": 0.13716814159292035, + "grad_norm": 1.6774955987930298, + "learning_rate": 4.7714674102545706e-05, + "loss": 5.424, + "step": 23064 + }, + { + "epoch": 0.13717408887620136, + "grad_norm": 1.9038479328155518, + "learning_rate": 4.771447899351351e-05, + "loss": 4.7868, + "step": 23065 + }, + { + "epoch": 0.13718003615948235, + "grad_norm": 1.906087875366211, + "learning_rate": 4.771428387655192e-05, + "loss": 4.7115, + "step": 23066 + }, + { + "epoch": 0.13718598344276334, + "grad_norm": 1.786908745765686, + "learning_rate": 4.771408875166103e-05, + "loss": 4.6741, + "step": 23067 + }, + { + "epoch": 0.13719193072604435, + "grad_norm": 1.8421779870986938, + "learning_rate": 4.771389361884086e-05, + "loss": 4.9338, + "step": 23068 + }, + { + "epoch": 0.13719787800932534, + "grad_norm": 1.8146562576293945, + "learning_rate": 4.7713698478091516e-05, + "loss": 4.5556, + "step": 23069 + }, + { + "epoch": 0.13720382529260633, + "grad_norm": 1.4694918394088745, + "learning_rate": 4.7713503329413056e-05, + "loss": 5.611, + "step": 23070 + }, + { + "epoch": 0.13720977257588735, + "grad_norm": 1.553694248199463, + "learning_rate": 4.771330817280554e-05, + "loss": 5.6062, + "step": 23071 + }, + { + "epoch": 0.13721571985916833, + "grad_norm": 1.293204426765442, + "learning_rate": 4.771311300826905e-05, + "loss": 5.7157, + "step": 23072 + }, + { + "epoch": 0.13722166714244932, + "grad_norm": 1.369480013847351, + "learning_rate": 4.771291783580364e-05, + "loss": 5.754, + "step": 23073 + }, + { + "epoch": 0.13722761442573034, + "grad_norm": 1.7480628490447998, + "learning_rate": 4.771272265540939e-05, + "loss": 5.4179, + "step": 23074 + }, + { + "epoch": 0.13723356170901133, + "grad_norm": 1.604788064956665, + "learning_rate": 4.771252746708636e-05, + "loss": 5.3766, + "step": 23075 + }, + { + "epoch": 0.13723950899229231, + "grad_norm": 1.721793532371521, + "learning_rate": 4.7712332270834626e-05, + "loss": 4.9839, + "step": 23076 + }, + { + "epoch": 0.13724545627557333, + "grad_norm": 1.528327226638794, + "learning_rate": 4.771213706665425e-05, + "loss": 5.427, + "step": 23077 + }, + { + "epoch": 0.13725140355885432, + "grad_norm": 1.425625205039978, + "learning_rate": 4.7711941854545295e-05, + "loss": 5.6246, + "step": 23078 + }, + { + "epoch": 0.1372573508421353, + "grad_norm": 1.9369326829910278, + "learning_rate": 4.771174663450784e-05, + "loss": 5.5038, + "step": 23079 + }, + { + "epoch": 0.13726329812541632, + "grad_norm": 1.906792163848877, + "learning_rate": 4.771155140654195e-05, + "loss": 5.5361, + "step": 23080 + }, + { + "epoch": 0.1372692454086973, + "grad_norm": 1.7495099306106567, + "learning_rate": 4.7711356170647694e-05, + "loss": 4.9809, + "step": 23081 + }, + { + "epoch": 0.1372751926919783, + "grad_norm": 1.5589921474456787, + "learning_rate": 4.771116092682514e-05, + "loss": 4.9627, + "step": 23082 + }, + { + "epoch": 0.13728113997525931, + "grad_norm": 1.7177824974060059, + "learning_rate": 4.771096567507435e-05, + "loss": 5.403, + "step": 23083 + }, + { + "epoch": 0.1372870872585403, + "grad_norm": 1.5470298528671265, + "learning_rate": 4.7710770415395395e-05, + "loss": 6.0237, + "step": 23084 + }, + { + "epoch": 0.1372930345418213, + "grad_norm": 1.5613659620285034, + "learning_rate": 4.771057514778835e-05, + "loss": 5.7272, + "step": 23085 + }, + { + "epoch": 0.1372989818251023, + "grad_norm": 1.7003729343414307, + "learning_rate": 4.771037987225328e-05, + "loss": 4.9305, + "step": 23086 + }, + { + "epoch": 0.1373049291083833, + "grad_norm": 2.587393283843994, + "learning_rate": 4.771018458879023e-05, + "loss": 4.9075, + "step": 23087 + }, + { + "epoch": 0.13731087639166428, + "grad_norm": 2.208308696746826, + "learning_rate": 4.770998929739931e-05, + "loss": 4.9141, + "step": 23088 + }, + { + "epoch": 0.1373168236749453, + "grad_norm": 2.0532326698303223, + "learning_rate": 4.770979399808057e-05, + "loss": 5.0574, + "step": 23089 + }, + { + "epoch": 0.1373227709582263, + "grad_norm": 1.86672043800354, + "learning_rate": 4.770959869083406e-05, + "loss": 4.9269, + "step": 23090 + }, + { + "epoch": 0.13732871824150727, + "grad_norm": 1.8310163021087646, + "learning_rate": 4.7709403375659874e-05, + "loss": 4.901, + "step": 23091 + }, + { + "epoch": 0.13733466552478826, + "grad_norm": 1.8886011838912964, + "learning_rate": 4.7709208052558065e-05, + "loss": 4.8325, + "step": 23092 + }, + { + "epoch": 0.13734061280806928, + "grad_norm": 1.9192320108413696, + "learning_rate": 4.770901272152871e-05, + "loss": 4.8783, + "step": 23093 + }, + { + "epoch": 0.13734656009135027, + "grad_norm": 2.0797886848449707, + "learning_rate": 4.770881738257187e-05, + "loss": 4.6473, + "step": 23094 + }, + { + "epoch": 0.13735250737463126, + "grad_norm": 2.2008140087127686, + "learning_rate": 4.770862203568762e-05, + "loss": 4.7291, + "step": 23095 + }, + { + "epoch": 0.13735845465791227, + "grad_norm": 2.002549886703491, + "learning_rate": 4.770842668087602e-05, + "loss": 4.5471, + "step": 23096 + }, + { + "epoch": 0.13736440194119326, + "grad_norm": 1.7748942375183105, + "learning_rate": 4.770823131813714e-05, + "loss": 4.5844, + "step": 23097 + }, + { + "epoch": 0.13737034922447425, + "grad_norm": 2.128469467163086, + "learning_rate": 4.7708035947471065e-05, + "loss": 4.7365, + "step": 23098 + }, + { + "epoch": 0.13737629650775526, + "grad_norm": 1.9279344081878662, + "learning_rate": 4.770784056887784e-05, + "loss": 4.5673, + "step": 23099 + }, + { + "epoch": 0.13738224379103625, + "grad_norm": 1.896638035774231, + "learning_rate": 4.770764518235754e-05, + "loss": 4.5956, + "step": 23100 + }, + { + "epoch": 0.13738819107431724, + "grad_norm": 2.4768176078796387, + "learning_rate": 4.770744978791024e-05, + "loss": 4.5071, + "step": 23101 + }, + { + "epoch": 0.13739413835759826, + "grad_norm": 2.0828697681427, + "learning_rate": 4.7707254385536e-05, + "loss": 4.5681, + "step": 23102 + }, + { + "epoch": 0.13740008564087924, + "grad_norm": 2.197688579559326, + "learning_rate": 4.7707058975234895e-05, + "loss": 4.5111, + "step": 23103 + }, + { + "epoch": 0.13740603292416023, + "grad_norm": 2.0053935050964355, + "learning_rate": 4.7706863557007e-05, + "loss": 4.5441, + "step": 23104 + }, + { + "epoch": 0.13741198020744125, + "grad_norm": 2.247901439666748, + "learning_rate": 4.770666813085236e-05, + "loss": 4.5538, + "step": 23105 + }, + { + "epoch": 0.13741792749072224, + "grad_norm": 2.1666789054870605, + "learning_rate": 4.770647269677106e-05, + "loss": 4.7712, + "step": 23106 + }, + { + "epoch": 0.13742387477400322, + "grad_norm": 2.0191304683685303, + "learning_rate": 4.770627725476317e-05, + "loss": 4.5244, + "step": 23107 + }, + { + "epoch": 0.13742982205728424, + "grad_norm": 1.9388200044631958, + "learning_rate": 4.770608180482874e-05, + "loss": 4.6272, + "step": 23108 + }, + { + "epoch": 0.13743576934056523, + "grad_norm": 2.0467464923858643, + "learning_rate": 4.7705886346967865e-05, + "loss": 4.5852, + "step": 23109 + }, + { + "epoch": 0.13744171662384622, + "grad_norm": 2.0310070514678955, + "learning_rate": 4.770569088118059e-05, + "loss": 4.3915, + "step": 23110 + }, + { + "epoch": 0.13744766390712723, + "grad_norm": 2.1216657161712646, + "learning_rate": 4.770549540746701e-05, + "loss": 4.4549, + "step": 23111 + }, + { + "epoch": 0.13745361119040822, + "grad_norm": 1.9715701341629028, + "learning_rate": 4.770529992582715e-05, + "loss": 4.8822, + "step": 23112 + }, + { + "epoch": 0.1374595584736892, + "grad_norm": 2.0956320762634277, + "learning_rate": 4.7705104436261124e-05, + "loss": 5.3927, + "step": 23113 + }, + { + "epoch": 0.13746550575697022, + "grad_norm": 1.6396405696868896, + "learning_rate": 4.770490893876898e-05, + "loss": 5.5089, + "step": 23114 + }, + { + "epoch": 0.1374714530402512, + "grad_norm": 1.8379572629928589, + "learning_rate": 4.7704713433350777e-05, + "loss": 5.9133, + "step": 23115 + }, + { + "epoch": 0.1374774003235322, + "grad_norm": 1.6787012815475464, + "learning_rate": 4.7704517920006594e-05, + "loss": 5.4497, + "step": 23116 + }, + { + "epoch": 0.13748334760681322, + "grad_norm": 1.6657997369766235, + "learning_rate": 4.77043223987365e-05, + "loss": 5.2093, + "step": 23117 + }, + { + "epoch": 0.1374892948900942, + "grad_norm": 1.7581418752670288, + "learning_rate": 4.7704126869540565e-05, + "loss": 6.4119, + "step": 23118 + }, + { + "epoch": 0.1374952421733752, + "grad_norm": 1.4436302185058594, + "learning_rate": 4.770393133241885e-05, + "loss": 6.3299, + "step": 23119 + }, + { + "epoch": 0.1375011894566562, + "grad_norm": 1.6737406253814697, + "learning_rate": 4.7703735787371434e-05, + "loss": 5.8634, + "step": 23120 + }, + { + "epoch": 0.1375071367399372, + "grad_norm": 1.5715806484222412, + "learning_rate": 4.7703540234398375e-05, + "loss": 5.7896, + "step": 23121 + }, + { + "epoch": 0.13751308402321819, + "grad_norm": 1.8452152013778687, + "learning_rate": 4.7703344673499744e-05, + "loss": 5.8868, + "step": 23122 + }, + { + "epoch": 0.1375190313064992, + "grad_norm": 1.6291402578353882, + "learning_rate": 4.770314910467561e-05, + "loss": 5.8256, + "step": 23123 + }, + { + "epoch": 0.1375249785897802, + "grad_norm": 1.4301279783248901, + "learning_rate": 4.770295352792604e-05, + "loss": 5.7982, + "step": 23124 + }, + { + "epoch": 0.13753092587306118, + "grad_norm": 1.5949046611785889, + "learning_rate": 4.770275794325111e-05, + "loss": 5.5606, + "step": 23125 + }, + { + "epoch": 0.1375368731563422, + "grad_norm": 1.572860598564148, + "learning_rate": 4.770256235065087e-05, + "loss": 5.1636, + "step": 23126 + }, + { + "epoch": 0.13754282043962318, + "grad_norm": 1.4339121580123901, + "learning_rate": 4.7702366750125405e-05, + "loss": 5.1374, + "step": 23127 + }, + { + "epoch": 0.13754876772290417, + "grad_norm": 1.4290729761123657, + "learning_rate": 4.770217114167478e-05, + "loss": 5.7268, + "step": 23128 + }, + { + "epoch": 0.13755471500618519, + "grad_norm": 1.1300958395004272, + "learning_rate": 4.7701975525299066e-05, + "loss": 5.6887, + "step": 23129 + }, + { + "epoch": 0.13756066228946617, + "grad_norm": 1.1974701881408691, + "learning_rate": 4.7701779900998325e-05, + "loss": 5.6763, + "step": 23130 + }, + { + "epoch": 0.13756660957274716, + "grad_norm": 1.3675005435943604, + "learning_rate": 4.7701584268772614e-05, + "loss": 5.6558, + "step": 23131 + }, + { + "epoch": 0.13757255685602818, + "grad_norm": 1.3302583694458008, + "learning_rate": 4.770138862862203e-05, + "loss": 5.6915, + "step": 23132 + }, + { + "epoch": 0.13757850413930917, + "grad_norm": 1.3415045738220215, + "learning_rate": 4.770119298054662e-05, + "loss": 5.6922, + "step": 23133 + }, + { + "epoch": 0.13758445142259015, + "grad_norm": 1.229663372039795, + "learning_rate": 4.770099732454646e-05, + "loss": 5.7799, + "step": 23134 + }, + { + "epoch": 0.13759039870587117, + "grad_norm": 1.3245000839233398, + "learning_rate": 4.7700801660621614e-05, + "loss": 5.7848, + "step": 23135 + }, + { + "epoch": 0.13759634598915216, + "grad_norm": 1.2835783958435059, + "learning_rate": 4.770060598877215e-05, + "loss": 5.5999, + "step": 23136 + }, + { + "epoch": 0.13760229327243315, + "grad_norm": 1.9270732402801514, + "learning_rate": 4.770041030899814e-05, + "loss": 4.8701, + "step": 23137 + }, + { + "epoch": 0.13760824055571416, + "grad_norm": 1.8123419284820557, + "learning_rate": 4.7700214621299656e-05, + "loss": 5.3828, + "step": 23138 + }, + { + "epoch": 0.13761418783899515, + "grad_norm": 2.0436434745788574, + "learning_rate": 4.770001892567676e-05, + "loss": 4.6098, + "step": 23139 + }, + { + "epoch": 0.13762013512227614, + "grad_norm": 1.4343012571334839, + "learning_rate": 4.769982322212953e-05, + "loss": 5.5587, + "step": 23140 + }, + { + "epoch": 0.13762608240555715, + "grad_norm": 1.266640067100525, + "learning_rate": 4.769962751065801e-05, + "loss": 5.626, + "step": 23141 + }, + { + "epoch": 0.13763202968883814, + "grad_norm": 1.9386495351791382, + "learning_rate": 4.7699431791262296e-05, + "loss": 4.7212, + "step": 23142 + }, + { + "epoch": 0.13763797697211913, + "grad_norm": 2.270129919052124, + "learning_rate": 4.769923606394244e-05, + "loss": 4.7609, + "step": 23143 + }, + { + "epoch": 0.13764392425540015, + "grad_norm": 2.0305488109588623, + "learning_rate": 4.7699040328698516e-05, + "loss": 4.8083, + "step": 23144 + }, + { + "epoch": 0.13764987153868113, + "grad_norm": 2.1791486740112305, + "learning_rate": 4.769884458553059e-05, + "loss": 4.834, + "step": 23145 + }, + { + "epoch": 0.13765581882196212, + "grad_norm": 2.152580738067627, + "learning_rate": 4.769864883443873e-05, + "loss": 4.5418, + "step": 23146 + }, + { + "epoch": 0.13766176610524314, + "grad_norm": 2.2850470542907715, + "learning_rate": 4.769845307542301e-05, + "loss": 4.9344, + "step": 23147 + }, + { + "epoch": 0.13766771338852413, + "grad_norm": 1.745813012123108, + "learning_rate": 4.76982573084835e-05, + "loss": 4.9631, + "step": 23148 + }, + { + "epoch": 0.13767366067180511, + "grad_norm": 1.5848993062973022, + "learning_rate": 4.769806153362025e-05, + "loss": 5.3936, + "step": 23149 + }, + { + "epoch": 0.1376796079550861, + "grad_norm": 1.5276480913162231, + "learning_rate": 4.7697865750833356e-05, + "loss": 5.7806, + "step": 23150 + }, + { + "epoch": 0.13768555523836712, + "grad_norm": 1.3464304208755493, + "learning_rate": 4.769766996012286e-05, + "loss": 5.5572, + "step": 23151 + }, + { + "epoch": 0.1376915025216481, + "grad_norm": 1.375168800354004, + "learning_rate": 4.769747416148885e-05, + "loss": 5.6109, + "step": 23152 + }, + { + "epoch": 0.1376974498049291, + "grad_norm": 1.3537193536758423, + "learning_rate": 4.769727835493138e-05, + "loss": 5.5257, + "step": 23153 + }, + { + "epoch": 0.1377033970882101, + "grad_norm": 1.6656006574630737, + "learning_rate": 4.769708254045053e-05, + "loss": 5.3327, + "step": 23154 + }, + { + "epoch": 0.1377093443714911, + "grad_norm": 1.6092736721038818, + "learning_rate": 4.769688671804635e-05, + "loss": 5.7785, + "step": 23155 + }, + { + "epoch": 0.1377152916547721, + "grad_norm": 1.5005303621292114, + "learning_rate": 4.7696690887718934e-05, + "loss": 5.4944, + "step": 23156 + }, + { + "epoch": 0.1377212389380531, + "grad_norm": 1.6100717782974243, + "learning_rate": 4.7696495049468336e-05, + "loss": 5.3767, + "step": 23157 + }, + { + "epoch": 0.1377271862213341, + "grad_norm": 1.5637480020523071, + "learning_rate": 4.7696299203294626e-05, + "loss": 5.3981, + "step": 23158 + }, + { + "epoch": 0.13773313350461508, + "grad_norm": 1.6407819986343384, + "learning_rate": 4.769610334919787e-05, + "loss": 5.4328, + "step": 23159 + }, + { + "epoch": 0.1377390807878961, + "grad_norm": 1.8828953504562378, + "learning_rate": 4.7695907487178146e-05, + "loss": 5.5127, + "step": 23160 + }, + { + "epoch": 0.13774502807117708, + "grad_norm": 1.5160561800003052, + "learning_rate": 4.7695711617235506e-05, + "loss": 5.3309, + "step": 23161 + }, + { + "epoch": 0.13775097535445807, + "grad_norm": 1.4901509284973145, + "learning_rate": 4.769551573937003e-05, + "loss": 5.4584, + "step": 23162 + }, + { + "epoch": 0.1377569226377391, + "grad_norm": 1.3983137607574463, + "learning_rate": 4.769531985358179e-05, + "loss": 5.6738, + "step": 23163 + }, + { + "epoch": 0.13776286992102008, + "grad_norm": 1.7664490938186646, + "learning_rate": 4.7695123959870834e-05, + "loss": 5.513, + "step": 23164 + }, + { + "epoch": 0.13776881720430106, + "grad_norm": 1.4650641679763794, + "learning_rate": 4.7694928058237255e-05, + "loss": 4.9959, + "step": 23165 + }, + { + "epoch": 0.13777476448758208, + "grad_norm": 1.5515252351760864, + "learning_rate": 4.7694732148681106e-05, + "loss": 5.1419, + "step": 23166 + }, + { + "epoch": 0.13778071177086307, + "grad_norm": 1.459083914756775, + "learning_rate": 4.769453623120247e-05, + "loss": 5.3639, + "step": 23167 + }, + { + "epoch": 0.13778665905414406, + "grad_norm": 1.6032545566558838, + "learning_rate": 4.76943403058014e-05, + "loss": 5.3822, + "step": 23168 + }, + { + "epoch": 0.13779260633742507, + "grad_norm": 1.5436428785324097, + "learning_rate": 4.769414437247797e-05, + "loss": 5.0313, + "step": 23169 + }, + { + "epoch": 0.13779855362070606, + "grad_norm": 1.2577800750732422, + "learning_rate": 4.769394843123225e-05, + "loss": 4.8907, + "step": 23170 + }, + { + "epoch": 0.13780450090398705, + "grad_norm": 1.4654191732406616, + "learning_rate": 4.769375248206431e-05, + "loss": 5.0346, + "step": 23171 + }, + { + "epoch": 0.13781044818726806, + "grad_norm": 1.9576739072799683, + "learning_rate": 4.769355652497421e-05, + "loss": 5.4, + "step": 23172 + }, + { + "epoch": 0.13781639547054905, + "grad_norm": 1.7060799598693848, + "learning_rate": 4.7693360559962027e-05, + "loss": 4.9668, + "step": 23173 + }, + { + "epoch": 0.13782234275383004, + "grad_norm": 1.4705651998519897, + "learning_rate": 4.769316458702782e-05, + "loss": 5.2053, + "step": 23174 + }, + { + "epoch": 0.13782829003711106, + "grad_norm": 1.806314468383789, + "learning_rate": 4.769296860617167e-05, + "loss": 5.5297, + "step": 23175 + }, + { + "epoch": 0.13783423732039204, + "grad_norm": 1.7741440534591675, + "learning_rate": 4.769277261739364e-05, + "loss": 5.569, + "step": 23176 + }, + { + "epoch": 0.13784018460367303, + "grad_norm": 1.4956278800964355, + "learning_rate": 4.7692576620693796e-05, + "loss": 5.2616, + "step": 23177 + }, + { + "epoch": 0.13784613188695405, + "grad_norm": 1.4668684005737305, + "learning_rate": 4.7692380616072205e-05, + "loss": 5.551, + "step": 23178 + }, + { + "epoch": 0.13785207917023504, + "grad_norm": 1.9172862768173218, + "learning_rate": 4.769218460352894e-05, + "loss": 5.072, + "step": 23179 + }, + { + "epoch": 0.13785802645351602, + "grad_norm": 2.3610761165618896, + "learning_rate": 4.769198858306407e-05, + "loss": 4.5511, + "step": 23180 + }, + { + "epoch": 0.13786397373679704, + "grad_norm": 2.099209785461426, + "learning_rate": 4.769179255467766e-05, + "loss": 5.1829, + "step": 23181 + }, + { + "epoch": 0.13786992102007803, + "grad_norm": 1.8222076892852783, + "learning_rate": 4.7691596518369776e-05, + "loss": 5.1451, + "step": 23182 + }, + { + "epoch": 0.13787586830335902, + "grad_norm": 2.129558563232422, + "learning_rate": 4.769140047414049e-05, + "loss": 4.574, + "step": 23183 + }, + { + "epoch": 0.13788181558664003, + "grad_norm": 2.3188533782958984, + "learning_rate": 4.7691204421989876e-05, + "loss": 4.4604, + "step": 23184 + }, + { + "epoch": 0.13788776286992102, + "grad_norm": 2.2996792793273926, + "learning_rate": 4.7691008361918e-05, + "loss": 4.6119, + "step": 23185 + }, + { + "epoch": 0.137893710153202, + "grad_norm": 2.164652109146118, + "learning_rate": 4.769081229392492e-05, + "loss": 4.6286, + "step": 23186 + }, + { + "epoch": 0.13789965743648303, + "grad_norm": 1.9271842241287231, + "learning_rate": 4.769061621801071e-05, + "loss": 4.947, + "step": 23187 + }, + { + "epoch": 0.137905604719764, + "grad_norm": 1.8559855222702026, + "learning_rate": 4.769042013417545e-05, + "loss": 5.1969, + "step": 23188 + }, + { + "epoch": 0.137911552003045, + "grad_norm": 1.8955408334732056, + "learning_rate": 4.769022404241919e-05, + "loss": 5.0117, + "step": 23189 + }, + { + "epoch": 0.13791749928632602, + "grad_norm": 2.333242177963257, + "learning_rate": 4.769002794274201e-05, + "loss": 4.4839, + "step": 23190 + }, + { + "epoch": 0.137923446569607, + "grad_norm": 1.6732560396194458, + "learning_rate": 4.768983183514397e-05, + "loss": 5.2458, + "step": 23191 + }, + { + "epoch": 0.137929393852888, + "grad_norm": 1.6078556776046753, + "learning_rate": 4.768963571962516e-05, + "loss": 5.616, + "step": 23192 + }, + { + "epoch": 0.137935341136169, + "grad_norm": 1.7516095638275146, + "learning_rate": 4.768943959618562e-05, + "loss": 5.3052, + "step": 23193 + }, + { + "epoch": 0.13794128841945, + "grad_norm": 1.5200318098068237, + "learning_rate": 4.7689243464825425e-05, + "loss": 5.664, + "step": 23194 + }, + { + "epoch": 0.13794723570273099, + "grad_norm": 1.3212077617645264, + "learning_rate": 4.7689047325544664e-05, + "loss": 5.4562, + "step": 23195 + }, + { + "epoch": 0.137953182986012, + "grad_norm": 1.3307675123214722, + "learning_rate": 4.7688851178343386e-05, + "loss": 5.2517, + "step": 23196 + }, + { + "epoch": 0.137959130269293, + "grad_norm": 1.5186207294464111, + "learning_rate": 4.768865502322166e-05, + "loss": 5.654, + "step": 23197 + }, + { + "epoch": 0.13796507755257398, + "grad_norm": 1.6482549905776978, + "learning_rate": 4.7688458860179564e-05, + "loss": 5.3282, + "step": 23198 + }, + { + "epoch": 0.137971024835855, + "grad_norm": 1.4418150186538696, + "learning_rate": 4.768826268921717e-05, + "loss": 5.5913, + "step": 23199 + }, + { + "epoch": 0.13797697211913598, + "grad_norm": 1.5591225624084473, + "learning_rate": 4.768806651033452e-05, + "loss": 5.8459, + "step": 23200 + }, + { + "epoch": 0.13798291940241697, + "grad_norm": 1.3476347923278809, + "learning_rate": 4.768787032353171e-05, + "loss": 5.3597, + "step": 23201 + }, + { + "epoch": 0.137988866685698, + "grad_norm": 1.4543404579162598, + "learning_rate": 4.76876741288088e-05, + "loss": 5.4525, + "step": 23202 + }, + { + "epoch": 0.13799481396897897, + "grad_norm": 1.3845150470733643, + "learning_rate": 4.7687477926165846e-05, + "loss": 5.6559, + "step": 23203 + }, + { + "epoch": 0.13800076125225996, + "grad_norm": 1.303808569908142, + "learning_rate": 4.768728171560294e-05, + "loss": 5.8732, + "step": 23204 + }, + { + "epoch": 0.13800670853554098, + "grad_norm": 1.422867774963379, + "learning_rate": 4.768708549712013e-05, + "loss": 5.217, + "step": 23205 + }, + { + "epoch": 0.13801265581882197, + "grad_norm": 1.558089017868042, + "learning_rate": 4.7686889270717506e-05, + "loss": 5.6403, + "step": 23206 + }, + { + "epoch": 0.13801860310210295, + "grad_norm": 1.5510298013687134, + "learning_rate": 4.7686693036395115e-05, + "loss": 5.6199, + "step": 23207 + }, + { + "epoch": 0.13802455038538394, + "grad_norm": 1.2693150043487549, + "learning_rate": 4.768649679415303e-05, + "loss": 5.7368, + "step": 23208 + }, + { + "epoch": 0.13803049766866496, + "grad_norm": 1.5053805112838745, + "learning_rate": 4.768630054399132e-05, + "loss": 5.4941, + "step": 23209 + }, + { + "epoch": 0.13803644495194595, + "grad_norm": 2.5151054859161377, + "learning_rate": 4.768610428591007e-05, + "loss": 4.5744, + "step": 23210 + }, + { + "epoch": 0.13804239223522694, + "grad_norm": 2.1085267066955566, + "learning_rate": 4.768590801990933e-05, + "loss": 4.5849, + "step": 23211 + }, + { + "epoch": 0.13804833951850795, + "grad_norm": 2.0741498470306396, + "learning_rate": 4.7685711745989174e-05, + "loss": 4.5745, + "step": 23212 + }, + { + "epoch": 0.13805428680178894, + "grad_norm": 2.0066654682159424, + "learning_rate": 4.7685515464149664e-05, + "loss": 4.6388, + "step": 23213 + }, + { + "epoch": 0.13806023408506993, + "grad_norm": 1.9224933385849, + "learning_rate": 4.7685319174390885e-05, + "loss": 4.5382, + "step": 23214 + }, + { + "epoch": 0.13806618136835094, + "grad_norm": 2.2363088130950928, + "learning_rate": 4.7685122876712896e-05, + "loss": 4.5825, + "step": 23215 + }, + { + "epoch": 0.13807212865163193, + "grad_norm": 2.1900362968444824, + "learning_rate": 4.768492657111576e-05, + "loss": 4.5519, + "step": 23216 + }, + { + "epoch": 0.13807807593491292, + "grad_norm": 2.0702250003814697, + "learning_rate": 4.768473025759955e-05, + "loss": 4.5917, + "step": 23217 + }, + { + "epoch": 0.13808402321819394, + "grad_norm": 2.000380277633667, + "learning_rate": 4.768453393616433e-05, + "loss": 4.8847, + "step": 23218 + }, + { + "epoch": 0.13808997050147492, + "grad_norm": 2.0710175037384033, + "learning_rate": 4.768433760681018e-05, + "loss": 4.5455, + "step": 23219 + }, + { + "epoch": 0.1380959177847559, + "grad_norm": 2.1148219108581543, + "learning_rate": 4.7684141269537165e-05, + "loss": 4.5109, + "step": 23220 + }, + { + "epoch": 0.13810186506803693, + "grad_norm": 1.7681657075881958, + "learning_rate": 4.768394492434535e-05, + "loss": 4.8899, + "step": 23221 + }, + { + "epoch": 0.13810781235131792, + "grad_norm": 2.032696008682251, + "learning_rate": 4.76837485712348e-05, + "loss": 5.2375, + "step": 23222 + }, + { + "epoch": 0.1381137596345989, + "grad_norm": 2.0016825199127197, + "learning_rate": 4.7683552210205585e-05, + "loss": 4.9066, + "step": 23223 + }, + { + "epoch": 0.13811970691787992, + "grad_norm": 2.1309103965759277, + "learning_rate": 4.7683355841257784e-05, + "loss": 4.4317, + "step": 23224 + }, + { + "epoch": 0.1381256542011609, + "grad_norm": 1.9037781953811646, + "learning_rate": 4.768315946439145e-05, + "loss": 5.0218, + "step": 23225 + }, + { + "epoch": 0.1381316014844419, + "grad_norm": 2.3080644607543945, + "learning_rate": 4.768296307960666e-05, + "loss": 5.2226, + "step": 23226 + }, + { + "epoch": 0.1381375487677229, + "grad_norm": 2.1073081493377686, + "learning_rate": 4.7682766686903494e-05, + "loss": 5.2403, + "step": 23227 + }, + { + "epoch": 0.1381434960510039, + "grad_norm": 1.7865220308303833, + "learning_rate": 4.768257028628199e-05, + "loss": 5.1642, + "step": 23228 + }, + { + "epoch": 0.1381494433342849, + "grad_norm": 1.7039834260940552, + "learning_rate": 4.768237387774225e-05, + "loss": 5.1943, + "step": 23229 + }, + { + "epoch": 0.1381553906175659, + "grad_norm": 1.714506983757019, + "learning_rate": 4.768217746128432e-05, + "loss": 5.0952, + "step": 23230 + }, + { + "epoch": 0.1381613379008469, + "grad_norm": 1.7183910608291626, + "learning_rate": 4.768198103690827e-05, + "loss": 5.0447, + "step": 23231 + }, + { + "epoch": 0.13816728518412788, + "grad_norm": 1.776077151298523, + "learning_rate": 4.768178460461419e-05, + "loss": 5.1296, + "step": 23232 + }, + { + "epoch": 0.1381732324674089, + "grad_norm": 1.7849907875061035, + "learning_rate": 4.7681588164402124e-05, + "loss": 4.7961, + "step": 23233 + }, + { + "epoch": 0.13817917975068988, + "grad_norm": 1.403860330581665, + "learning_rate": 4.768139171627216e-05, + "loss": 5.4794, + "step": 23234 + }, + { + "epoch": 0.13818512703397087, + "grad_norm": 1.5944229364395142, + "learning_rate": 4.7681195260224344e-05, + "loss": 4.973, + "step": 23235 + }, + { + "epoch": 0.1381910743172519, + "grad_norm": 2.196274518966675, + "learning_rate": 4.7680998796258764e-05, + "loss": 5.1835, + "step": 23236 + }, + { + "epoch": 0.13819702160053288, + "grad_norm": 1.5403459072113037, + "learning_rate": 4.768080232437548e-05, + "loss": 5.828, + "step": 23237 + }, + { + "epoch": 0.13820296888381386, + "grad_norm": 1.9711260795593262, + "learning_rate": 4.768060584457456e-05, + "loss": 5.4937, + "step": 23238 + }, + { + "epoch": 0.13820891616709488, + "grad_norm": 1.6869981288909912, + "learning_rate": 4.7680409356856075e-05, + "loss": 5.3298, + "step": 23239 + }, + { + "epoch": 0.13821486345037587, + "grad_norm": 2.4224069118499756, + "learning_rate": 4.7680212861220096e-05, + "loss": 4.9544, + "step": 23240 + }, + { + "epoch": 0.13822081073365686, + "grad_norm": 1.905261754989624, + "learning_rate": 4.768001635766669e-05, + "loss": 4.852, + "step": 23241 + }, + { + "epoch": 0.13822675801693787, + "grad_norm": 1.7081589698791504, + "learning_rate": 4.7679819846195925e-05, + "loss": 5.2201, + "step": 23242 + }, + { + "epoch": 0.13823270530021886, + "grad_norm": 1.5893620252609253, + "learning_rate": 4.767962332680786e-05, + "loss": 4.9691, + "step": 23243 + }, + { + "epoch": 0.13823865258349985, + "grad_norm": 1.7598754167556763, + "learning_rate": 4.767942679950258e-05, + "loss": 4.9661, + "step": 23244 + }, + { + "epoch": 0.13824459986678087, + "grad_norm": 1.6882308721542358, + "learning_rate": 4.767923026428015e-05, + "loss": 5.3529, + "step": 23245 + }, + { + "epoch": 0.13825054715006185, + "grad_norm": 1.6711715459823608, + "learning_rate": 4.767903372114063e-05, + "loss": 5.3288, + "step": 23246 + }, + { + "epoch": 0.13825649443334284, + "grad_norm": 1.5780813694000244, + "learning_rate": 4.76788371700841e-05, + "loss": 5.5583, + "step": 23247 + }, + { + "epoch": 0.13826244171662386, + "grad_norm": 1.9719429016113281, + "learning_rate": 4.767864061111061e-05, + "loss": 5.2821, + "step": 23248 + }, + { + "epoch": 0.13826838899990485, + "grad_norm": 1.6447231769561768, + "learning_rate": 4.767844404422025e-05, + "loss": 6.0166, + "step": 23249 + }, + { + "epoch": 0.13827433628318583, + "grad_norm": 1.6587456464767456, + "learning_rate": 4.767824746941307e-05, + "loss": 5.4081, + "step": 23250 + }, + { + "epoch": 0.13828028356646685, + "grad_norm": 1.9438105821609497, + "learning_rate": 4.767805088668916e-05, + "loss": 5.4436, + "step": 23251 + }, + { + "epoch": 0.13828623084974784, + "grad_norm": 2.1185503005981445, + "learning_rate": 4.767785429604857e-05, + "loss": 4.8413, + "step": 23252 + }, + { + "epoch": 0.13829217813302883, + "grad_norm": 2.176520347595215, + "learning_rate": 4.767765769749138e-05, + "loss": 4.9092, + "step": 23253 + }, + { + "epoch": 0.13829812541630984, + "grad_norm": 2.020982503890991, + "learning_rate": 4.767746109101765e-05, + "loss": 4.9179, + "step": 23254 + }, + { + "epoch": 0.13830407269959083, + "grad_norm": 1.6086227893829346, + "learning_rate": 4.767726447662746e-05, + "loss": 5.1998, + "step": 23255 + }, + { + "epoch": 0.13831001998287182, + "grad_norm": 1.8750804662704468, + "learning_rate": 4.767706785432087e-05, + "loss": 4.6858, + "step": 23256 + }, + { + "epoch": 0.13831596726615283, + "grad_norm": 1.7748466730117798, + "learning_rate": 4.767687122409794e-05, + "loss": 4.5468, + "step": 23257 + }, + { + "epoch": 0.13832191454943382, + "grad_norm": 1.94595205783844, + "learning_rate": 4.767667458595875e-05, + "loss": 4.6902, + "step": 23258 + }, + { + "epoch": 0.1383278618327148, + "grad_norm": 1.7588400840759277, + "learning_rate": 4.7676477939903375e-05, + "loss": 5.8701, + "step": 23259 + }, + { + "epoch": 0.13833380911599583, + "grad_norm": 1.8222272396087646, + "learning_rate": 4.7676281285931866e-05, + "loss": 4.6879, + "step": 23260 + }, + { + "epoch": 0.13833975639927681, + "grad_norm": 1.7244281768798828, + "learning_rate": 4.767608462404431e-05, + "loss": 5.0215, + "step": 23261 + }, + { + "epoch": 0.1383457036825578, + "grad_norm": 1.5756913423538208, + "learning_rate": 4.767588795424077e-05, + "loss": 5.9537, + "step": 23262 + }, + { + "epoch": 0.13835165096583882, + "grad_norm": 1.6441105604171753, + "learning_rate": 4.767569127652131e-05, + "loss": 5.9245, + "step": 23263 + }, + { + "epoch": 0.1383575982491198, + "grad_norm": 1.5573482513427734, + "learning_rate": 4.767549459088599e-05, + "loss": 5.6705, + "step": 23264 + }, + { + "epoch": 0.1383635455324008, + "grad_norm": 1.65425705909729, + "learning_rate": 4.767529789733489e-05, + "loss": 5.8664, + "step": 23265 + }, + { + "epoch": 0.13836949281568178, + "grad_norm": 1.665283441543579, + "learning_rate": 4.767510119586809e-05, + "loss": 5.7634, + "step": 23266 + }, + { + "epoch": 0.1383754400989628, + "grad_norm": 1.4760838747024536, + "learning_rate": 4.767490448648564e-05, + "loss": 5.7739, + "step": 23267 + }, + { + "epoch": 0.1383813873822438, + "grad_norm": 1.649942398071289, + "learning_rate": 4.7674707769187616e-05, + "loss": 5.7518, + "step": 23268 + }, + { + "epoch": 0.13838733466552477, + "grad_norm": 1.5092672109603882, + "learning_rate": 4.7674511043974084e-05, + "loss": 5.7706, + "step": 23269 + }, + { + "epoch": 0.1383932819488058, + "grad_norm": 2.5008256435394287, + "learning_rate": 4.767431431084512e-05, + "loss": 4.6023, + "step": 23270 + }, + { + "epoch": 0.13839922923208678, + "grad_norm": 2.4018449783325195, + "learning_rate": 4.767411756980078e-05, + "loss": 4.7872, + "step": 23271 + }, + { + "epoch": 0.13840517651536777, + "grad_norm": 1.7928224802017212, + "learning_rate": 4.7673920820841136e-05, + "loss": 5.2731, + "step": 23272 + }, + { + "epoch": 0.13841112379864878, + "grad_norm": 1.844249963760376, + "learning_rate": 4.767372406396627e-05, + "loss": 5.2441, + "step": 23273 + }, + { + "epoch": 0.13841707108192977, + "grad_norm": 2.160876989364624, + "learning_rate": 4.7673527299176236e-05, + "loss": 4.5445, + "step": 23274 + }, + { + "epoch": 0.13842301836521076, + "grad_norm": 1.6312650442123413, + "learning_rate": 4.767333052647112e-05, + "loss": 5.0418, + "step": 23275 + }, + { + "epoch": 0.13842896564849178, + "grad_norm": 1.6567429304122925, + "learning_rate": 4.7673133745850965e-05, + "loss": 5.2882, + "step": 23276 + }, + { + "epoch": 0.13843491293177276, + "grad_norm": 1.8484638929367065, + "learning_rate": 4.767293695731585e-05, + "loss": 5.3432, + "step": 23277 + }, + { + "epoch": 0.13844086021505375, + "grad_norm": 1.8447157144546509, + "learning_rate": 4.767274016086586e-05, + "loss": 5.3307, + "step": 23278 + }, + { + "epoch": 0.13844680749833477, + "grad_norm": 1.6714428663253784, + "learning_rate": 4.767254335650104e-05, + "loss": 5.3053, + "step": 23279 + }, + { + "epoch": 0.13845275478161576, + "grad_norm": 1.7423646450042725, + "learning_rate": 4.7672346544221474e-05, + "loss": 5.3129, + "step": 23280 + }, + { + "epoch": 0.13845870206489674, + "grad_norm": 1.5770469903945923, + "learning_rate": 4.7672149724027224e-05, + "loss": 5.2806, + "step": 23281 + }, + { + "epoch": 0.13846464934817776, + "grad_norm": 1.5982024669647217, + "learning_rate": 4.7671952895918365e-05, + "loss": 5.4873, + "step": 23282 + }, + { + "epoch": 0.13847059663145875, + "grad_norm": 1.9240913391113281, + "learning_rate": 4.767175605989496e-05, + "loss": 5.8309, + "step": 23283 + }, + { + "epoch": 0.13847654391473974, + "grad_norm": 1.612429141998291, + "learning_rate": 4.7671559215957075e-05, + "loss": 5.4479, + "step": 23284 + }, + { + "epoch": 0.13848249119802075, + "grad_norm": 1.5843868255615234, + "learning_rate": 4.7671362364104785e-05, + "loss": 5.5509, + "step": 23285 + }, + { + "epoch": 0.13848843848130174, + "grad_norm": 2.3811614513397217, + "learning_rate": 4.767116550433816e-05, + "loss": 5.4695, + "step": 23286 + }, + { + "epoch": 0.13849438576458273, + "grad_norm": 2.6257996559143066, + "learning_rate": 4.767096863665726e-05, + "loss": 5.0195, + "step": 23287 + }, + { + "epoch": 0.13850033304786374, + "grad_norm": 1.8920071125030518, + "learning_rate": 4.7670771761062164e-05, + "loss": 5.2023, + "step": 23288 + }, + { + "epoch": 0.13850628033114473, + "grad_norm": 1.52253258228302, + "learning_rate": 4.767057487755293e-05, + "loss": 5.6985, + "step": 23289 + }, + { + "epoch": 0.13851222761442572, + "grad_norm": 2.240440845489502, + "learning_rate": 4.767037798612964e-05, + "loss": 5.1073, + "step": 23290 + }, + { + "epoch": 0.13851817489770674, + "grad_norm": 2.127216100692749, + "learning_rate": 4.7670181086792354e-05, + "loss": 5.1885, + "step": 23291 + }, + { + "epoch": 0.13852412218098772, + "grad_norm": 2.128519058227539, + "learning_rate": 4.766998417954114e-05, + "loss": 4.9388, + "step": 23292 + }, + { + "epoch": 0.1385300694642687, + "grad_norm": 1.87863290309906, + "learning_rate": 4.7669787264376066e-05, + "loss": 4.8293, + "step": 23293 + }, + { + "epoch": 0.13853601674754973, + "grad_norm": 2.03975510597229, + "learning_rate": 4.766959034129721e-05, + "loss": 4.9168, + "step": 23294 + }, + { + "epoch": 0.13854196403083072, + "grad_norm": 2.0336341857910156, + "learning_rate": 4.766939341030463e-05, + "loss": 4.9715, + "step": 23295 + }, + { + "epoch": 0.1385479113141117, + "grad_norm": 1.943743348121643, + "learning_rate": 4.7669196471398396e-05, + "loss": 4.7709, + "step": 23296 + }, + { + "epoch": 0.13855385859739272, + "grad_norm": 2.1629462242126465, + "learning_rate": 4.766899952457858e-05, + "loss": 4.7499, + "step": 23297 + }, + { + "epoch": 0.1385598058806737, + "grad_norm": 2.200531005859375, + "learning_rate": 4.7668802569845256e-05, + "loss": 4.8418, + "step": 23298 + }, + { + "epoch": 0.1385657531639547, + "grad_norm": 2.038649797439575, + "learning_rate": 4.766860560719849e-05, + "loss": 5.2351, + "step": 23299 + }, + { + "epoch": 0.1385717004472357, + "grad_norm": 1.8091388940811157, + "learning_rate": 4.766840863663834e-05, + "loss": 5.3526, + "step": 23300 + }, + { + "epoch": 0.1385776477305167, + "grad_norm": 1.9351911544799805, + "learning_rate": 4.7668211658164884e-05, + "loss": 4.813, + "step": 23301 + }, + { + "epoch": 0.1385835950137977, + "grad_norm": 2.0985751152038574, + "learning_rate": 4.766801467177819e-05, + "loss": 4.7762, + "step": 23302 + }, + { + "epoch": 0.1385895422970787, + "grad_norm": 2.023658275604248, + "learning_rate": 4.766781767747833e-05, + "loss": 4.8076, + "step": 23303 + }, + { + "epoch": 0.1385954895803597, + "grad_norm": 1.7464020252227783, + "learning_rate": 4.7667620675265364e-05, + "loss": 5.2537, + "step": 23304 + }, + { + "epoch": 0.13860143686364068, + "grad_norm": 1.7812929153442383, + "learning_rate": 4.7667423665139364e-05, + "loss": 4.8896, + "step": 23305 + }, + { + "epoch": 0.1386073841469217, + "grad_norm": 2.0042948722839355, + "learning_rate": 4.76672266471004e-05, + "loss": 4.7254, + "step": 23306 + }, + { + "epoch": 0.13861333143020269, + "grad_norm": 1.8378963470458984, + "learning_rate": 4.7667029621148554e-05, + "loss": 4.9849, + "step": 23307 + }, + { + "epoch": 0.13861927871348367, + "grad_norm": 2.1476621627807617, + "learning_rate": 4.7666832587283873e-05, + "loss": 4.5167, + "step": 23308 + }, + { + "epoch": 0.1386252259967647, + "grad_norm": 1.8289295434951782, + "learning_rate": 4.7666635545506434e-05, + "loss": 4.8841, + "step": 23309 + }, + { + "epoch": 0.13863117328004568, + "grad_norm": 1.7215977907180786, + "learning_rate": 4.766643849581631e-05, + "loss": 5.0148, + "step": 23310 + }, + { + "epoch": 0.13863712056332667, + "grad_norm": 1.464308261871338, + "learning_rate": 4.7666241438213566e-05, + "loss": 5.2551, + "step": 23311 + }, + { + "epoch": 0.13864306784660768, + "grad_norm": 1.655523657798767, + "learning_rate": 4.766604437269827e-05, + "loss": 5.604, + "step": 23312 + }, + { + "epoch": 0.13864901512988867, + "grad_norm": 1.9533252716064453, + "learning_rate": 4.766584729927049e-05, + "loss": 5.6238, + "step": 23313 + }, + { + "epoch": 0.13865496241316966, + "grad_norm": 1.8174513578414917, + "learning_rate": 4.7665650217930305e-05, + "loss": 5.6806, + "step": 23314 + }, + { + "epoch": 0.13866090969645067, + "grad_norm": 1.58940851688385, + "learning_rate": 4.766545312867776e-05, + "loss": 5.5066, + "step": 23315 + }, + { + "epoch": 0.13866685697973166, + "grad_norm": 1.5862720012664795, + "learning_rate": 4.766525603151295e-05, + "loss": 5.352, + "step": 23316 + }, + { + "epoch": 0.13867280426301265, + "grad_norm": 1.7878305912017822, + "learning_rate": 4.7665058926435934e-05, + "loss": 5.4043, + "step": 23317 + }, + { + "epoch": 0.13867875154629367, + "grad_norm": 1.3984423875808716, + "learning_rate": 4.766486181344678e-05, + "loss": 5.8719, + "step": 23318 + }, + { + "epoch": 0.13868469882957465, + "grad_norm": 1.6912389993667603, + "learning_rate": 4.7664664692545555e-05, + "loss": 5.6587, + "step": 23319 + }, + { + "epoch": 0.13869064611285564, + "grad_norm": 1.593245506286621, + "learning_rate": 4.766446756373233e-05, + "loss": 5.424, + "step": 23320 + }, + { + "epoch": 0.13869659339613666, + "grad_norm": 1.5353487730026245, + "learning_rate": 4.766427042700717e-05, + "loss": 5.7179, + "step": 23321 + }, + { + "epoch": 0.13870254067941765, + "grad_norm": 1.4989358186721802, + "learning_rate": 4.766407328237016e-05, + "loss": 6.1919, + "step": 23322 + }, + { + "epoch": 0.13870848796269863, + "grad_norm": 1.292460322380066, + "learning_rate": 4.766387612982134e-05, + "loss": 5.8265, + "step": 23323 + }, + { + "epoch": 0.13871443524597965, + "grad_norm": 1.4890642166137695, + "learning_rate": 4.766367896936081e-05, + "loss": 5.1671, + "step": 23324 + }, + { + "epoch": 0.13872038252926064, + "grad_norm": 1.7513198852539062, + "learning_rate": 4.766348180098861e-05, + "loss": 4.908, + "step": 23325 + }, + { + "epoch": 0.13872632981254163, + "grad_norm": 1.503311038017273, + "learning_rate": 4.766328462470483e-05, + "loss": 5.661, + "step": 23326 + }, + { + "epoch": 0.13873227709582261, + "grad_norm": 2.333216667175293, + "learning_rate": 4.766308744050953e-05, + "loss": 4.5921, + "step": 23327 + }, + { + "epoch": 0.13873822437910363, + "grad_norm": 2.1495418548583984, + "learning_rate": 4.7662890248402786e-05, + "loss": 4.8017, + "step": 23328 + }, + { + "epoch": 0.13874417166238462, + "grad_norm": 1.4922517538070679, + "learning_rate": 4.766269304838466e-05, + "loss": 5.3407, + "step": 23329 + }, + { + "epoch": 0.1387501189456656, + "grad_norm": 1.5760530233383179, + "learning_rate": 4.7662495840455214e-05, + "loss": 5.1536, + "step": 23330 + }, + { + "epoch": 0.13875606622894662, + "grad_norm": 1.432483434677124, + "learning_rate": 4.7662298624614524e-05, + "loss": 4.405, + "step": 23331 + }, + { + "epoch": 0.1387620135122276, + "grad_norm": 1.5221575498580933, + "learning_rate": 4.766210140086267e-05, + "loss": 4.5132, + "step": 23332 + }, + { + "epoch": 0.1387679607955086, + "grad_norm": 1.7520684003829956, + "learning_rate": 4.76619041691997e-05, + "loss": 4.5229, + "step": 23333 + }, + { + "epoch": 0.13877390807878962, + "grad_norm": 1.8210954666137695, + "learning_rate": 4.76617069296257e-05, + "loss": 4.7207, + "step": 23334 + }, + { + "epoch": 0.1387798553620706, + "grad_norm": 1.5682491064071655, + "learning_rate": 4.7661509682140734e-05, + "loss": 4.5045, + "step": 23335 + }, + { + "epoch": 0.1387858026453516, + "grad_norm": 1.7219401597976685, + "learning_rate": 4.7661312426744865e-05, + "loss": 4.4846, + "step": 23336 + }, + { + "epoch": 0.1387917499286326, + "grad_norm": 1.590681791305542, + "learning_rate": 4.766111516343816e-05, + "loss": 4.2617, + "step": 23337 + }, + { + "epoch": 0.1387976972119136, + "grad_norm": 1.533359408378601, + "learning_rate": 4.76609178922207e-05, + "loss": 4.4746, + "step": 23338 + }, + { + "epoch": 0.13880364449519458, + "grad_norm": 1.5994545221328735, + "learning_rate": 4.7660720613092555e-05, + "loss": 4.5712, + "step": 23339 + }, + { + "epoch": 0.1388095917784756, + "grad_norm": 1.472655177116394, + "learning_rate": 4.766052332605377e-05, + "loss": 4.3592, + "step": 23340 + }, + { + "epoch": 0.1388155390617566, + "grad_norm": 1.5625941753387451, + "learning_rate": 4.7660326031104445e-05, + "loss": 4.2859, + "step": 23341 + }, + { + "epoch": 0.13882148634503758, + "grad_norm": 2.1194114685058594, + "learning_rate": 4.766012872824464e-05, + "loss": 5.0237, + "step": 23342 + }, + { + "epoch": 0.1388274336283186, + "grad_norm": 1.699491262435913, + "learning_rate": 4.7659931417474404e-05, + "loss": 5.4558, + "step": 23343 + }, + { + "epoch": 0.13883338091159958, + "grad_norm": 1.7734466791152954, + "learning_rate": 4.765973409879382e-05, + "loss": 4.5118, + "step": 23344 + }, + { + "epoch": 0.13883932819488057, + "grad_norm": 1.7193443775177002, + "learning_rate": 4.765953677220296e-05, + "loss": 5.7915, + "step": 23345 + }, + { + "epoch": 0.13884527547816158, + "grad_norm": 1.6994706392288208, + "learning_rate": 4.765933943770189e-05, + "loss": 5.2722, + "step": 23346 + }, + { + "epoch": 0.13885122276144257, + "grad_norm": 2.1580300331115723, + "learning_rate": 4.765914209529068e-05, + "loss": 5.2697, + "step": 23347 + }, + { + "epoch": 0.13885717004472356, + "grad_norm": 2.437685012817383, + "learning_rate": 4.765894474496939e-05, + "loss": 5.2533, + "step": 23348 + }, + { + "epoch": 0.13886311732800458, + "grad_norm": 2.2965760231018066, + "learning_rate": 4.7658747386738113e-05, + "loss": 5.3419, + "step": 23349 + }, + { + "epoch": 0.13886906461128556, + "grad_norm": 2.0520517826080322, + "learning_rate": 4.765855002059689e-05, + "loss": 5.1966, + "step": 23350 + }, + { + "epoch": 0.13887501189456655, + "grad_norm": 2.043931484222412, + "learning_rate": 4.76583526465458e-05, + "loss": 5.1984, + "step": 23351 + }, + { + "epoch": 0.13888095917784757, + "grad_norm": 1.9283409118652344, + "learning_rate": 4.765815526458491e-05, + "loss": 4.6806, + "step": 23352 + }, + { + "epoch": 0.13888690646112856, + "grad_norm": 1.8964955806732178, + "learning_rate": 4.76579578747143e-05, + "loss": 4.9367, + "step": 23353 + }, + { + "epoch": 0.13889285374440954, + "grad_norm": 1.8109381198883057, + "learning_rate": 4.765776047693403e-05, + "loss": 4.7777, + "step": 23354 + }, + { + "epoch": 0.13889880102769056, + "grad_norm": 2.0096335411071777, + "learning_rate": 4.765756307124417e-05, + "loss": 4.9217, + "step": 23355 + }, + { + "epoch": 0.13890474831097155, + "grad_norm": 1.8210729360580444, + "learning_rate": 4.765736565764479e-05, + "loss": 4.8393, + "step": 23356 + }, + { + "epoch": 0.13891069559425254, + "grad_norm": 2.1033902168273926, + "learning_rate": 4.7657168236135954e-05, + "loss": 5.043, + "step": 23357 + }, + { + "epoch": 0.13891664287753355, + "grad_norm": 2.0610570907592773, + "learning_rate": 4.7656970806717736e-05, + "loss": 5.0493, + "step": 23358 + }, + { + "epoch": 0.13892259016081454, + "grad_norm": 2.169670343399048, + "learning_rate": 4.765677336939021e-05, + "loss": 5.2321, + "step": 23359 + }, + { + "epoch": 0.13892853744409553, + "grad_norm": 2.198686122894287, + "learning_rate": 4.7656575924153426e-05, + "loss": 5.2698, + "step": 23360 + }, + { + "epoch": 0.13893448472737654, + "grad_norm": 1.9425220489501953, + "learning_rate": 4.7656378471007476e-05, + "loss": 4.9435, + "step": 23361 + }, + { + "epoch": 0.13894043201065753, + "grad_norm": 1.936712384223938, + "learning_rate": 4.765618100995241e-05, + "loss": 4.6584, + "step": 23362 + }, + { + "epoch": 0.13894637929393852, + "grad_norm": 1.7941532135009766, + "learning_rate": 4.765598354098831e-05, + "loss": 4.6791, + "step": 23363 + }, + { + "epoch": 0.13895232657721954, + "grad_norm": 2.0149965286254883, + "learning_rate": 4.765578606411524e-05, + "loss": 5.1019, + "step": 23364 + }, + { + "epoch": 0.13895827386050053, + "grad_norm": 1.9302345514297485, + "learning_rate": 4.7655588579333265e-05, + "loss": 5.1168, + "step": 23365 + }, + { + "epoch": 0.1389642211437815, + "grad_norm": 2.0851333141326904, + "learning_rate": 4.7655391086642465e-05, + "loss": 5.0517, + "step": 23366 + }, + { + "epoch": 0.13897016842706253, + "grad_norm": 1.9221385717391968, + "learning_rate": 4.7655193586042904e-05, + "loss": 5.1486, + "step": 23367 + }, + { + "epoch": 0.13897611571034352, + "grad_norm": 1.9929136037826538, + "learning_rate": 4.765499607753464e-05, + "loss": 5.1288, + "step": 23368 + }, + { + "epoch": 0.1389820629936245, + "grad_norm": 1.8818596601486206, + "learning_rate": 4.765479856111775e-05, + "loss": 4.8252, + "step": 23369 + }, + { + "epoch": 0.13898801027690552, + "grad_norm": 1.748961091041565, + "learning_rate": 4.765460103679231e-05, + "loss": 4.7829, + "step": 23370 + }, + { + "epoch": 0.1389939575601865, + "grad_norm": 1.8021109104156494, + "learning_rate": 4.765440350455838e-05, + "loss": 4.7424, + "step": 23371 + }, + { + "epoch": 0.1389999048434675, + "grad_norm": 2.1486730575561523, + "learning_rate": 4.765420596441603e-05, + "loss": 4.6696, + "step": 23372 + }, + { + "epoch": 0.1390058521267485, + "grad_norm": 1.9908959865570068, + "learning_rate": 4.765400841636534e-05, + "loss": 4.5644, + "step": 23373 + }, + { + "epoch": 0.1390117994100295, + "grad_norm": 2.021198272705078, + "learning_rate": 4.765381086040636e-05, + "loss": 5.2841, + "step": 23374 + }, + { + "epoch": 0.1390177466933105, + "grad_norm": 2.0757644176483154, + "learning_rate": 4.765361329653918e-05, + "loss": 5.0479, + "step": 23375 + }, + { + "epoch": 0.1390236939765915, + "grad_norm": 2.6452016830444336, + "learning_rate": 4.7653415724763844e-05, + "loss": 4.5668, + "step": 23376 + }, + { + "epoch": 0.1390296412598725, + "grad_norm": 1.8536683320999146, + "learning_rate": 4.7653218145080436e-05, + "loss": 4.6049, + "step": 23377 + }, + { + "epoch": 0.13903558854315348, + "grad_norm": 2.1392767429351807, + "learning_rate": 4.765302055748903e-05, + "loss": 4.5307, + "step": 23378 + }, + { + "epoch": 0.1390415358264345, + "grad_norm": 2.0592446327209473, + "learning_rate": 4.765282296198968e-05, + "loss": 4.7421, + "step": 23379 + }, + { + "epoch": 0.1390474831097155, + "grad_norm": 1.9982407093048096, + "learning_rate": 4.765262535858248e-05, + "loss": 4.5699, + "step": 23380 + }, + { + "epoch": 0.13905343039299647, + "grad_norm": 1.6928536891937256, + "learning_rate": 4.765242774726747e-05, + "loss": 5.0689, + "step": 23381 + }, + { + "epoch": 0.1390593776762775, + "grad_norm": 2.1993813514709473, + "learning_rate": 4.765223012804474e-05, + "loss": 4.8268, + "step": 23382 + }, + { + "epoch": 0.13906532495955848, + "grad_norm": 1.711241364479065, + "learning_rate": 4.765203250091434e-05, + "loss": 5.7443, + "step": 23383 + }, + { + "epoch": 0.13907127224283947, + "grad_norm": 1.862398386001587, + "learning_rate": 4.765183486587636e-05, + "loss": 5.3367, + "step": 23384 + }, + { + "epoch": 0.13907721952612045, + "grad_norm": 1.95891273021698, + "learning_rate": 4.765163722293084e-05, + "loss": 5.6618, + "step": 23385 + }, + { + "epoch": 0.13908316680940147, + "grad_norm": 2.362205743789673, + "learning_rate": 4.765143957207789e-05, + "loss": 5.1168, + "step": 23386 + }, + { + "epoch": 0.13908911409268246, + "grad_norm": 1.7440927028656006, + "learning_rate": 4.7651241913317545e-05, + "loss": 4.858, + "step": 23387 + }, + { + "epoch": 0.13909506137596345, + "grad_norm": 1.7432098388671875, + "learning_rate": 4.765104424664989e-05, + "loss": 4.9096, + "step": 23388 + }, + { + "epoch": 0.13910100865924446, + "grad_norm": 1.7505769729614258, + "learning_rate": 4.765084657207498e-05, + "loss": 5.0255, + "step": 23389 + }, + { + "epoch": 0.13910695594252545, + "grad_norm": 1.5105990171432495, + "learning_rate": 4.76506488895929e-05, + "loss": 5.2811, + "step": 23390 + }, + { + "epoch": 0.13911290322580644, + "grad_norm": 1.6876368522644043, + "learning_rate": 4.765045119920372e-05, + "loss": 5.6723, + "step": 23391 + }, + { + "epoch": 0.13911885050908745, + "grad_norm": 1.6542494297027588, + "learning_rate": 4.7650253500907494e-05, + "loss": 5.1409, + "step": 23392 + }, + { + "epoch": 0.13912479779236844, + "grad_norm": 2.0412867069244385, + "learning_rate": 4.76500557947043e-05, + "loss": 4.8772, + "step": 23393 + }, + { + "epoch": 0.13913074507564943, + "grad_norm": 1.8121492862701416, + "learning_rate": 4.76498580805942e-05, + "loss": 5.1079, + "step": 23394 + }, + { + "epoch": 0.13913669235893045, + "grad_norm": 1.576653003692627, + "learning_rate": 4.764966035857727e-05, + "loss": 4.9576, + "step": 23395 + }, + { + "epoch": 0.13914263964221144, + "grad_norm": 1.5891642570495605, + "learning_rate": 4.764946262865358e-05, + "loss": 4.8846, + "step": 23396 + }, + { + "epoch": 0.13914858692549242, + "grad_norm": 1.7079927921295166, + "learning_rate": 4.7649264890823195e-05, + "loss": 5.0182, + "step": 23397 + }, + { + "epoch": 0.13915453420877344, + "grad_norm": 1.6532564163208008, + "learning_rate": 4.764906714508619e-05, + "loss": 4.8068, + "step": 23398 + }, + { + "epoch": 0.13916048149205443, + "grad_norm": 1.5107650756835938, + "learning_rate": 4.764886939144263e-05, + "loss": 5.3482, + "step": 23399 + }, + { + "epoch": 0.13916642877533542, + "grad_norm": 1.666096806526184, + "learning_rate": 4.764867162989258e-05, + "loss": 5.1747, + "step": 23400 + }, + { + "epoch": 0.13917237605861643, + "grad_norm": 1.864372730255127, + "learning_rate": 4.764847386043611e-05, + "loss": 4.3209, + "step": 23401 + }, + { + "epoch": 0.13917832334189742, + "grad_norm": 2.2691080570220947, + "learning_rate": 4.7648276083073295e-05, + "loss": 4.5254, + "step": 23402 + }, + { + "epoch": 0.1391842706251784, + "grad_norm": 2.0673935413360596, + "learning_rate": 4.76480782978042e-05, + "loss": 4.639, + "step": 23403 + }, + { + "epoch": 0.13919021790845942, + "grad_norm": 1.9274605512619019, + "learning_rate": 4.76478805046289e-05, + "loss": 4.579, + "step": 23404 + }, + { + "epoch": 0.1391961651917404, + "grad_norm": 1.5076278448104858, + "learning_rate": 4.7647682703547455e-05, + "loss": 4.9522, + "step": 23405 + }, + { + "epoch": 0.1392021124750214, + "grad_norm": 2.005662202835083, + "learning_rate": 4.7647484894559936e-05, + "loss": 4.3399, + "step": 23406 + }, + { + "epoch": 0.13920805975830242, + "grad_norm": 1.9292556047439575, + "learning_rate": 4.7647287077666414e-05, + "loss": 4.4166, + "step": 23407 + }, + { + "epoch": 0.1392140070415834, + "grad_norm": 1.7474818229675293, + "learning_rate": 4.764708925286696e-05, + "loss": 4.3355, + "step": 23408 + }, + { + "epoch": 0.1392199543248644, + "grad_norm": 1.9833084344863892, + "learning_rate": 4.764689142016164e-05, + "loss": 4.3388, + "step": 23409 + }, + { + "epoch": 0.1392259016081454, + "grad_norm": 1.7962874174118042, + "learning_rate": 4.764669357955053e-05, + "loss": 5.3199, + "step": 23410 + }, + { + "epoch": 0.1392318488914264, + "grad_norm": 1.6865921020507812, + "learning_rate": 4.764649573103368e-05, + "loss": 5.3787, + "step": 23411 + }, + { + "epoch": 0.13923779617470738, + "grad_norm": 1.2966182231903076, + "learning_rate": 4.7646297874611185e-05, + "loss": 5.0989, + "step": 23412 + }, + { + "epoch": 0.1392437434579884, + "grad_norm": 1.732437252998352, + "learning_rate": 4.76461000102831e-05, + "loss": 5.6207, + "step": 23413 + }, + { + "epoch": 0.1392496907412694, + "grad_norm": 1.567841649055481, + "learning_rate": 4.7645902138049494e-05, + "loss": 5.3921, + "step": 23414 + }, + { + "epoch": 0.13925563802455038, + "grad_norm": 1.7841026782989502, + "learning_rate": 4.764570425791043e-05, + "loss": 5.7206, + "step": 23415 + }, + { + "epoch": 0.1392615853078314, + "grad_norm": 2.0582776069641113, + "learning_rate": 4.764550636986599e-05, + "loss": 4.7812, + "step": 23416 + }, + { + "epoch": 0.13926753259111238, + "grad_norm": 1.5891739130020142, + "learning_rate": 4.764530847391624e-05, + "loss": 5.3211, + "step": 23417 + }, + { + "epoch": 0.13927347987439337, + "grad_norm": 1.4662810564041138, + "learning_rate": 4.764511057006125e-05, + "loss": 5.6385, + "step": 23418 + }, + { + "epoch": 0.13927942715767438, + "grad_norm": 1.6601322889328003, + "learning_rate": 4.764491265830108e-05, + "loss": 5.7947, + "step": 23419 + }, + { + "epoch": 0.13928537444095537, + "grad_norm": 1.5726239681243896, + "learning_rate": 4.7644714738635796e-05, + "loss": 5.6488, + "step": 23420 + }, + { + "epoch": 0.13929132172423636, + "grad_norm": 2.0315866470336914, + "learning_rate": 4.7644516811065494e-05, + "loss": 5.3196, + "step": 23421 + }, + { + "epoch": 0.13929726900751738, + "grad_norm": 2.3560190200805664, + "learning_rate": 4.764431887559022e-05, + "loss": 5.0898, + "step": 23422 + }, + { + "epoch": 0.13930321629079837, + "grad_norm": 1.6240613460540771, + "learning_rate": 4.764412093221004e-05, + "loss": 4.9766, + "step": 23423 + }, + { + "epoch": 0.13930916357407935, + "grad_norm": 1.9657840728759766, + "learning_rate": 4.764392298092504e-05, + "loss": 5.5328, + "step": 23424 + }, + { + "epoch": 0.13931511085736037, + "grad_norm": 1.8219939470291138, + "learning_rate": 4.764372502173527e-05, + "loss": 5.3713, + "step": 23425 + }, + { + "epoch": 0.13932105814064136, + "grad_norm": 1.6808767318725586, + "learning_rate": 4.764352705464082e-05, + "loss": 5.4753, + "step": 23426 + }, + { + "epoch": 0.13932700542392235, + "grad_norm": 1.6270160675048828, + "learning_rate": 4.764332907964175e-05, + "loss": 5.6609, + "step": 23427 + }, + { + "epoch": 0.13933295270720336, + "grad_norm": 1.5609904527664185, + "learning_rate": 4.764313109673812e-05, + "loss": 5.6954, + "step": 23428 + }, + { + "epoch": 0.13933889999048435, + "grad_norm": 1.5029795169830322, + "learning_rate": 4.764293310593001e-05, + "loss": 5.6655, + "step": 23429 + }, + { + "epoch": 0.13934484727376534, + "grad_norm": 1.6427209377288818, + "learning_rate": 4.7642735107217484e-05, + "loss": 4.9946, + "step": 23430 + }, + { + "epoch": 0.13935079455704635, + "grad_norm": 1.5815205574035645, + "learning_rate": 4.764253710060062e-05, + "loss": 5.4891, + "step": 23431 + }, + { + "epoch": 0.13935674184032734, + "grad_norm": 1.7551064491271973, + "learning_rate": 4.764233908607947e-05, + "loss": 5.4036, + "step": 23432 + }, + { + "epoch": 0.13936268912360833, + "grad_norm": 1.62980055809021, + "learning_rate": 4.7642141063654114e-05, + "loss": 5.4836, + "step": 23433 + }, + { + "epoch": 0.13936863640688935, + "grad_norm": 1.836366891860962, + "learning_rate": 4.7641943033324634e-05, + "loss": 5.4079, + "step": 23434 + }, + { + "epoch": 0.13937458369017033, + "grad_norm": 1.710744857788086, + "learning_rate": 4.764174499509107e-05, + "loss": 5.2859, + "step": 23435 + }, + { + "epoch": 0.13938053097345132, + "grad_norm": 1.6887309551239014, + "learning_rate": 4.7641546948953515e-05, + "loss": 5.4671, + "step": 23436 + }, + { + "epoch": 0.13938647825673234, + "grad_norm": 1.6997935771942139, + "learning_rate": 4.764134889491203e-05, + "loss": 5.2601, + "step": 23437 + }, + { + "epoch": 0.13939242554001333, + "grad_norm": 1.560526967048645, + "learning_rate": 4.764115083296668e-05, + "loss": 5.795, + "step": 23438 + }, + { + "epoch": 0.13939837282329431, + "grad_norm": 1.4518390893936157, + "learning_rate": 4.7640952763117544e-05, + "loss": 5.3885, + "step": 23439 + }, + { + "epoch": 0.13940432010657533, + "grad_norm": 1.698185920715332, + "learning_rate": 4.7640754685364675e-05, + "loss": 5.053, + "step": 23440 + }, + { + "epoch": 0.13941026738985632, + "grad_norm": 1.7422363758087158, + "learning_rate": 4.764055659970816e-05, + "loss": 5.1586, + "step": 23441 + }, + { + "epoch": 0.1394162146731373, + "grad_norm": 1.7014398574829102, + "learning_rate": 4.7640358506148065e-05, + "loss": 5.2313, + "step": 23442 + }, + { + "epoch": 0.1394221619564183, + "grad_norm": 1.6611777544021606, + "learning_rate": 4.764016040468444e-05, + "loss": 5.1691, + "step": 23443 + }, + { + "epoch": 0.1394281092396993, + "grad_norm": 1.6166971921920776, + "learning_rate": 4.763996229531739e-05, + "loss": 5.2217, + "step": 23444 + }, + { + "epoch": 0.1394340565229803, + "grad_norm": 1.9434369802474976, + "learning_rate": 4.763976417804694e-05, + "loss": 4.4322, + "step": 23445 + }, + { + "epoch": 0.1394400038062613, + "grad_norm": 3.2407455444335938, + "learning_rate": 4.7639566052873197e-05, + "loss": 3.3762, + "step": 23446 + }, + { + "epoch": 0.1394459510895423, + "grad_norm": 1.8475316762924194, + "learning_rate": 4.7639367919796215e-05, + "loss": 5.2435, + "step": 23447 + }, + { + "epoch": 0.1394518983728233, + "grad_norm": 1.7297134399414062, + "learning_rate": 4.763916977881606e-05, + "loss": 5.2485, + "step": 23448 + }, + { + "epoch": 0.13945784565610428, + "grad_norm": 1.720375657081604, + "learning_rate": 4.76389716299328e-05, + "loss": 5.1242, + "step": 23449 + }, + { + "epoch": 0.1394637929393853, + "grad_norm": 1.729045033454895, + "learning_rate": 4.763877347314652e-05, + "loss": 5.0312, + "step": 23450 + }, + { + "epoch": 0.13946974022266628, + "grad_norm": 1.817941427230835, + "learning_rate": 4.7638575308457266e-05, + "loss": 4.5856, + "step": 23451 + }, + { + "epoch": 0.13947568750594727, + "grad_norm": 2.7483971118927, + "learning_rate": 4.763837713586513e-05, + "loss": 3.3044, + "step": 23452 + }, + { + "epoch": 0.1394816347892283, + "grad_norm": 2.3746731281280518, + "learning_rate": 4.763817895537017e-05, + "loss": 3.0149, + "step": 23453 + }, + { + "epoch": 0.13948758207250928, + "grad_norm": 2.6971354484558105, + "learning_rate": 4.763798076697244e-05, + "loss": 3.7174, + "step": 23454 + }, + { + "epoch": 0.13949352935579026, + "grad_norm": 2.457082986831665, + "learning_rate": 4.763778257067205e-05, + "loss": 2.8548, + "step": 23455 + }, + { + "epoch": 0.13949947663907128, + "grad_norm": 2.4862163066864014, + "learning_rate": 4.7637584366469024e-05, + "loss": 2.6084, + "step": 23456 + }, + { + "epoch": 0.13950542392235227, + "grad_norm": 2.847895622253418, + "learning_rate": 4.763738615436346e-05, + "loss": 4.1775, + "step": 23457 + }, + { + "epoch": 0.13951137120563326, + "grad_norm": 2.827467918395996, + "learning_rate": 4.763718793435541e-05, + "loss": 4.0248, + "step": 23458 + }, + { + "epoch": 0.13951731848891427, + "grad_norm": 2.9717519283294678, + "learning_rate": 4.763698970644496e-05, + "loss": 3.8032, + "step": 23459 + }, + { + "epoch": 0.13952326577219526, + "grad_norm": 2.6418726444244385, + "learning_rate": 4.7636791470632166e-05, + "loss": 3.7307, + "step": 23460 + }, + { + "epoch": 0.13952921305547625, + "grad_norm": 2.789552927017212, + "learning_rate": 4.763659322691711e-05, + "loss": 3.458, + "step": 23461 + }, + { + "epoch": 0.13953516033875726, + "grad_norm": 2.3144681453704834, + "learning_rate": 4.7636394975299845e-05, + "loss": 4.1631, + "step": 23462 + }, + { + "epoch": 0.13954110762203825, + "grad_norm": 3.1292171478271484, + "learning_rate": 4.7636196715780454e-05, + "loss": 3.3234, + "step": 23463 + }, + { + "epoch": 0.13954705490531924, + "grad_norm": 3.2646241188049316, + "learning_rate": 4.763599844835899e-05, + "loss": 3.4951, + "step": 23464 + }, + { + "epoch": 0.13955300218860026, + "grad_norm": 3.3047688007354736, + "learning_rate": 4.7635800173035545e-05, + "loss": 3.6349, + "step": 23465 + }, + { + "epoch": 0.13955894947188124, + "grad_norm": 2.6160805225372314, + "learning_rate": 4.763560188981017e-05, + "loss": 3.8286, + "step": 23466 + }, + { + "epoch": 0.13956489675516223, + "grad_norm": 2.5719079971313477, + "learning_rate": 4.763540359868294e-05, + "loss": 3.7716, + "step": 23467 + }, + { + "epoch": 0.13957084403844325, + "grad_norm": 2.6471214294433594, + "learning_rate": 4.763520529965393e-05, + "loss": 3.4606, + "step": 23468 + }, + { + "epoch": 0.13957679132172424, + "grad_norm": 2.581679344177246, + "learning_rate": 4.7635006992723194e-05, + "loss": 3.5469, + "step": 23469 + }, + { + "epoch": 0.13958273860500522, + "grad_norm": 2.3326828479766846, + "learning_rate": 4.763480867789082e-05, + "loss": 3.7371, + "step": 23470 + }, + { + "epoch": 0.13958868588828624, + "grad_norm": 2.46588397026062, + "learning_rate": 4.763461035515686e-05, + "loss": 3.5972, + "step": 23471 + }, + { + "epoch": 0.13959463317156723, + "grad_norm": 2.3971428871154785, + "learning_rate": 4.76344120245214e-05, + "loss": 3.9445, + "step": 23472 + }, + { + "epoch": 0.13960058045484822, + "grad_norm": 1.8938592672348022, + "learning_rate": 4.7634213685984494e-05, + "loss": 5.1934, + "step": 23473 + }, + { + "epoch": 0.13960652773812923, + "grad_norm": 1.4792579412460327, + "learning_rate": 4.763401533954622e-05, + "loss": 5.5867, + "step": 23474 + }, + { + "epoch": 0.13961247502141022, + "grad_norm": 1.9039497375488281, + "learning_rate": 4.763381698520665e-05, + "loss": 4.9615, + "step": 23475 + }, + { + "epoch": 0.1396184223046912, + "grad_norm": 2.2181084156036377, + "learning_rate": 4.7633618622965845e-05, + "loss": 5.107, + "step": 23476 + }, + { + "epoch": 0.13962436958797222, + "grad_norm": 1.618551254272461, + "learning_rate": 4.7633420252823876e-05, + "loss": 4.8326, + "step": 23477 + }, + { + "epoch": 0.1396303168712532, + "grad_norm": 1.7516298294067383, + "learning_rate": 4.763322187478081e-05, + "loss": 5.0812, + "step": 23478 + }, + { + "epoch": 0.1396362641545342, + "grad_norm": 2.385951042175293, + "learning_rate": 4.7633023488836726e-05, + "loss": 4.2155, + "step": 23479 + }, + { + "epoch": 0.13964221143781522, + "grad_norm": 2.1702630519866943, + "learning_rate": 4.7632825094991686e-05, + "loss": 4.1257, + "step": 23480 + }, + { + "epoch": 0.1396481587210962, + "grad_norm": 1.9801292419433594, + "learning_rate": 4.763262669324576e-05, + "loss": 3.7386, + "step": 23481 + }, + { + "epoch": 0.1396541060043772, + "grad_norm": 2.250795602798462, + "learning_rate": 4.7632428283599016e-05, + "loss": 3.7169, + "step": 23482 + }, + { + "epoch": 0.1396600532876582, + "grad_norm": 2.124037027359009, + "learning_rate": 4.763222986605153e-05, + "loss": 3.7271, + "step": 23483 + }, + { + "epoch": 0.1396660005709392, + "grad_norm": 1.7976130247116089, + "learning_rate": 4.763203144060336e-05, + "loss": 3.9943, + "step": 23484 + }, + { + "epoch": 0.13967194785422019, + "grad_norm": 1.8421905040740967, + "learning_rate": 4.763183300725459e-05, + "loss": 4.1526, + "step": 23485 + }, + { + "epoch": 0.1396778951375012, + "grad_norm": 2.166212797164917, + "learning_rate": 4.763163456600527e-05, + "loss": 4.0225, + "step": 23486 + }, + { + "epoch": 0.1396838424207822, + "grad_norm": 2.2913808822631836, + "learning_rate": 4.763143611685549e-05, + "loss": 4.1125, + "step": 23487 + }, + { + "epoch": 0.13968978970406318, + "grad_norm": 2.20432448387146, + "learning_rate": 4.7631237659805307e-05, + "loss": 3.8297, + "step": 23488 + }, + { + "epoch": 0.1396957369873442, + "grad_norm": 2.323784351348877, + "learning_rate": 4.7631039194854785e-05, + "loss": 3.9128, + "step": 23489 + }, + { + "epoch": 0.13970168427062518, + "grad_norm": 2.22320294380188, + "learning_rate": 4.7630840722004014e-05, + "loss": 3.2834, + "step": 23490 + }, + { + "epoch": 0.13970763155390617, + "grad_norm": 2.2063205242156982, + "learning_rate": 4.763064224125304e-05, + "loss": 3.2472, + "step": 23491 + }, + { + "epoch": 0.13971357883718719, + "grad_norm": 2.1124684810638428, + "learning_rate": 4.763044375260195e-05, + "loss": 3.4765, + "step": 23492 + }, + { + "epoch": 0.13971952612046817, + "grad_norm": 2.2450273036956787, + "learning_rate": 4.7630245256050796e-05, + "loss": 3.586, + "step": 23493 + }, + { + "epoch": 0.13972547340374916, + "grad_norm": 2.821563243865967, + "learning_rate": 4.7630046751599665e-05, + "loss": 3.9152, + "step": 23494 + }, + { + "epoch": 0.13973142068703018, + "grad_norm": 2.623655319213867, + "learning_rate": 4.762984823924862e-05, + "loss": 5.2159, + "step": 23495 + }, + { + "epoch": 0.13973736797031117, + "grad_norm": 2.5610146522521973, + "learning_rate": 4.762964971899773e-05, + "loss": 5.0813, + "step": 23496 + }, + { + "epoch": 0.13974331525359215, + "grad_norm": 2.434995651245117, + "learning_rate": 4.7629451190847055e-05, + "loss": 4.651, + "step": 23497 + }, + { + "epoch": 0.13974926253687317, + "grad_norm": 2.0094375610351562, + "learning_rate": 4.7629252654796675e-05, + "loss": 5.6776, + "step": 23498 + }, + { + "epoch": 0.13975520982015416, + "grad_norm": 2.568547248840332, + "learning_rate": 4.7629054110846664e-05, + "loss": 3.2979, + "step": 23499 + }, + { + "epoch": 0.13976115710343515, + "grad_norm": 1.9725669622421265, + "learning_rate": 4.7628855558997074e-05, + "loss": 5.6269, + "step": 23500 + }, + { + "epoch": 0.13976710438671613, + "grad_norm": 1.6308250427246094, + "learning_rate": 4.7628656999247986e-05, + "loss": 5.7476, + "step": 23501 + }, + { + "epoch": 0.13977305166999715, + "grad_norm": 2.4110774993896484, + "learning_rate": 4.762845843159947e-05, + "loss": 4.8208, + "step": 23502 + }, + { + "epoch": 0.13977899895327814, + "grad_norm": 2.9670233726501465, + "learning_rate": 4.762825985605159e-05, + "loss": 3.3216, + "step": 23503 + }, + { + "epoch": 0.13978494623655913, + "grad_norm": 2.9264471530914307, + "learning_rate": 4.762806127260443e-05, + "loss": 3.12, + "step": 23504 + }, + { + "epoch": 0.13979089351984014, + "grad_norm": 2.983513593673706, + "learning_rate": 4.7627862681258037e-05, + "loss": 3.2355, + "step": 23505 + }, + { + "epoch": 0.13979684080312113, + "grad_norm": 2.5023698806762695, + "learning_rate": 4.7627664082012494e-05, + "loss": 3.6619, + "step": 23506 + }, + { + "epoch": 0.13980278808640212, + "grad_norm": 2.691542625427246, + "learning_rate": 4.762746547486786e-05, + "loss": 2.9562, + "step": 23507 + }, + { + "epoch": 0.13980873536968313, + "grad_norm": 2.487741470336914, + "learning_rate": 4.762726685982421e-05, + "loss": 3.6212, + "step": 23508 + }, + { + "epoch": 0.13981468265296412, + "grad_norm": 2.5798730850219727, + "learning_rate": 4.762706823688163e-05, + "loss": 3.6246, + "step": 23509 + }, + { + "epoch": 0.1398206299362451, + "grad_norm": 2.8465988636016846, + "learning_rate": 4.762686960604017e-05, + "loss": 3.3039, + "step": 23510 + }, + { + "epoch": 0.13982657721952613, + "grad_norm": 2.70969820022583, + "learning_rate": 4.7626670967299897e-05, + "loss": 2.3823, + "step": 23511 + }, + { + "epoch": 0.13983252450280712, + "grad_norm": 2.3834662437438965, + "learning_rate": 4.762647232066089e-05, + "loss": 2.8856, + "step": 23512 + }, + { + "epoch": 0.1398384717860881, + "grad_norm": 2.694798231124878, + "learning_rate": 4.762627366612321e-05, + "loss": 4.3653, + "step": 23513 + }, + { + "epoch": 0.13984441906936912, + "grad_norm": 2.6196436882019043, + "learning_rate": 4.7626075003686944e-05, + "loss": 4.5615, + "step": 23514 + }, + { + "epoch": 0.1398503663526501, + "grad_norm": 2.6196036338806152, + "learning_rate": 4.7625876333352136e-05, + "loss": 3.4767, + "step": 23515 + }, + { + "epoch": 0.1398563136359311, + "grad_norm": 2.32704496383667, + "learning_rate": 4.762567765511888e-05, + "loss": 3.7236, + "step": 23516 + }, + { + "epoch": 0.1398622609192121, + "grad_norm": 2.7415919303894043, + "learning_rate": 4.7625478968987226e-05, + "loss": 3.2248, + "step": 23517 + }, + { + "epoch": 0.1398682082024931, + "grad_norm": 2.402270555496216, + "learning_rate": 4.7625280274957254e-05, + "loss": 3.5112, + "step": 23518 + }, + { + "epoch": 0.1398741554857741, + "grad_norm": 2.722087860107422, + "learning_rate": 4.762508157302903e-05, + "loss": 3.5728, + "step": 23519 + }, + { + "epoch": 0.1398801027690551, + "grad_norm": 2.2336719036102295, + "learning_rate": 4.7624882863202626e-05, + "loss": 4.361, + "step": 23520 + }, + { + "epoch": 0.1398860500523361, + "grad_norm": 1.687203288078308, + "learning_rate": 4.7624684145478106e-05, + "loss": 5.2352, + "step": 23521 + }, + { + "epoch": 0.13989199733561708, + "grad_norm": 2.0672800540924072, + "learning_rate": 4.762448541985553e-05, + "loss": 5.0935, + "step": 23522 + }, + { + "epoch": 0.1398979446188981, + "grad_norm": 1.9521383047103882, + "learning_rate": 4.7624286686335e-05, + "loss": 5.1912, + "step": 23523 + }, + { + "epoch": 0.13990389190217908, + "grad_norm": 1.8050906658172607, + "learning_rate": 4.762408794491656e-05, + "loss": 5.2494, + "step": 23524 + }, + { + "epoch": 0.13990983918546007, + "grad_norm": 1.7029122114181519, + "learning_rate": 4.762388919560028e-05, + "loss": 5.2882, + "step": 23525 + }, + { + "epoch": 0.1399157864687411, + "grad_norm": 2.089055299758911, + "learning_rate": 4.7623690438386234e-05, + "loss": 5.1689, + "step": 23526 + }, + { + "epoch": 0.13992173375202208, + "grad_norm": 1.8083282709121704, + "learning_rate": 4.7623491673274503e-05, + "loss": 5.2078, + "step": 23527 + }, + { + "epoch": 0.13992768103530306, + "grad_norm": 1.6455740928649902, + "learning_rate": 4.7623292900265126e-05, + "loss": 4.6492, + "step": 23528 + }, + { + "epoch": 0.13993362831858408, + "grad_norm": 1.7084187269210815, + "learning_rate": 4.76230941193582e-05, + "loss": 4.5537, + "step": 23529 + }, + { + "epoch": 0.13993957560186507, + "grad_norm": 1.5048147439956665, + "learning_rate": 4.762289533055379e-05, + "loss": 4.3823, + "step": 23530 + }, + { + "epoch": 0.13994552288514606, + "grad_norm": 1.6451318264007568, + "learning_rate": 4.762269653385196e-05, + "loss": 4.4546, + "step": 23531 + }, + { + "epoch": 0.13995147016842707, + "grad_norm": 1.4565141201019287, + "learning_rate": 4.762249772925278e-05, + "loss": 4.5148, + "step": 23532 + }, + { + "epoch": 0.13995741745170806, + "grad_norm": 1.4664920568466187, + "learning_rate": 4.7622298916756316e-05, + "loss": 4.4532, + "step": 23533 + }, + { + "epoch": 0.13996336473498905, + "grad_norm": 1.5902373790740967, + "learning_rate": 4.762210009636264e-05, + "loss": 4.4744, + "step": 23534 + }, + { + "epoch": 0.13996931201827006, + "grad_norm": 1.6029250621795654, + "learning_rate": 4.762190126807182e-05, + "loss": 4.4635, + "step": 23535 + }, + { + "epoch": 0.13997525930155105, + "grad_norm": 1.49099862575531, + "learning_rate": 4.7621702431883943e-05, + "loss": 4.4079, + "step": 23536 + }, + { + "epoch": 0.13998120658483204, + "grad_norm": 1.5527629852294922, + "learning_rate": 4.762150358779905e-05, + "loss": 4.4034, + "step": 23537 + }, + { + "epoch": 0.13998715386811306, + "grad_norm": 1.4014298915863037, + "learning_rate": 4.762130473581723e-05, + "loss": 4.5512, + "step": 23538 + }, + { + "epoch": 0.13999310115139404, + "grad_norm": 1.4211797714233398, + "learning_rate": 4.762110587593854e-05, + "loss": 4.3554, + "step": 23539 + }, + { + "epoch": 0.13999904843467503, + "grad_norm": 1.305879831314087, + "learning_rate": 4.762090700816306e-05, + "loss": 4.5469, + "step": 23540 + }, + { + "epoch": 0.14000499571795605, + "grad_norm": 1.6035869121551514, + "learning_rate": 4.762070813249085e-05, + "loss": 4.2506, + "step": 23541 + }, + { + "epoch": 0.14001094300123704, + "grad_norm": 2.48470139503479, + "learning_rate": 4.7620509248922e-05, + "loss": 4.4341, + "step": 23542 + }, + { + "epoch": 0.14001689028451803, + "grad_norm": 2.1328017711639404, + "learning_rate": 4.7620310357456546e-05, + "loss": 4.8064, + "step": 23543 + }, + { + "epoch": 0.14002283756779904, + "grad_norm": 2.631490707397461, + "learning_rate": 4.7620111458094586e-05, + "loss": 4.9828, + "step": 23544 + }, + { + "epoch": 0.14002878485108003, + "grad_norm": 2.4217545986175537, + "learning_rate": 4.761991255083617e-05, + "loss": 3.7975, + "step": 23545 + }, + { + "epoch": 0.14003473213436102, + "grad_norm": 2.1837475299835205, + "learning_rate": 4.7619713635681384e-05, + "loss": 3.7627, + "step": 23546 + }, + { + "epoch": 0.14004067941764203, + "grad_norm": 2.188026189804077, + "learning_rate": 4.7619514712630284e-05, + "loss": 3.6425, + "step": 23547 + }, + { + "epoch": 0.14004662670092302, + "grad_norm": 2.157501697540283, + "learning_rate": 4.761931578168295e-05, + "loss": 3.2671, + "step": 23548 + }, + { + "epoch": 0.140052573984204, + "grad_norm": 2.28362774848938, + "learning_rate": 4.7619116842839446e-05, + "loss": 3.9765, + "step": 23549 + }, + { + "epoch": 0.14005852126748503, + "grad_norm": 2.1072418689727783, + "learning_rate": 4.7618917896099844e-05, + "loss": 3.8694, + "step": 23550 + }, + { + "epoch": 0.140064468550766, + "grad_norm": 2.061612367630005, + "learning_rate": 4.76187189414642e-05, + "loss": 3.6775, + "step": 23551 + }, + { + "epoch": 0.140070415834047, + "grad_norm": 2.153618812561035, + "learning_rate": 4.761851997893261e-05, + "loss": 3.2189, + "step": 23552 + }, + { + "epoch": 0.14007636311732802, + "grad_norm": 2.211912155151367, + "learning_rate": 4.761832100850512e-05, + "loss": 4.0855, + "step": 23553 + }, + { + "epoch": 0.140082310400609, + "grad_norm": 2.109023094177246, + "learning_rate": 4.761812203018181e-05, + "loss": 3.1532, + "step": 23554 + }, + { + "epoch": 0.14008825768389, + "grad_norm": 2.056579113006592, + "learning_rate": 4.7617923043962745e-05, + "loss": 3.3965, + "step": 23555 + }, + { + "epoch": 0.140094204967171, + "grad_norm": 2.6552531719207764, + "learning_rate": 4.761772404984799e-05, + "loss": 4.8136, + "step": 23556 + }, + { + "epoch": 0.140100152250452, + "grad_norm": 2.873891592025757, + "learning_rate": 4.7617525047837634e-05, + "loss": 5.1014, + "step": 23557 + }, + { + "epoch": 0.140106099533733, + "grad_norm": 2.9486472606658936, + "learning_rate": 4.761732603793173e-05, + "loss": 4.9751, + "step": 23558 + }, + { + "epoch": 0.14011204681701397, + "grad_norm": 1.6354721784591675, + "learning_rate": 4.761712702013035e-05, + "loss": 5.6091, + "step": 23559 + }, + { + "epoch": 0.140117994100295, + "grad_norm": 1.766449213027954, + "learning_rate": 4.761692799443357e-05, + "loss": 5.6621, + "step": 23560 + }, + { + "epoch": 0.14012394138357598, + "grad_norm": 1.9253995418548584, + "learning_rate": 4.7616728960841444e-05, + "loss": 5.0477, + "step": 23561 + }, + { + "epoch": 0.14012988866685697, + "grad_norm": 1.5569409132003784, + "learning_rate": 4.761652991935406e-05, + "loss": 5.2989, + "step": 23562 + }, + { + "epoch": 0.14013583595013798, + "grad_norm": 1.395662784576416, + "learning_rate": 4.761633086997147e-05, + "loss": 5.2249, + "step": 23563 + }, + { + "epoch": 0.14014178323341897, + "grad_norm": 1.9045140743255615, + "learning_rate": 4.761613181269376e-05, + "loss": 5.5549, + "step": 23564 + }, + { + "epoch": 0.14014773051669996, + "grad_norm": 2.0041518211364746, + "learning_rate": 4.761593274752099e-05, + "loss": 5.7419, + "step": 23565 + }, + { + "epoch": 0.14015367779998097, + "grad_norm": 1.983040452003479, + "learning_rate": 4.761573367445323e-05, + "loss": 5.761, + "step": 23566 + }, + { + "epoch": 0.14015962508326196, + "grad_norm": 1.6701973676681519, + "learning_rate": 4.761553459349055e-05, + "loss": 5.8376, + "step": 23567 + }, + { + "epoch": 0.14016557236654295, + "grad_norm": 1.3928866386413574, + "learning_rate": 4.761533550463303e-05, + "loss": 5.7623, + "step": 23568 + }, + { + "epoch": 0.14017151964982397, + "grad_norm": 1.5971790552139282, + "learning_rate": 4.761513640788072e-05, + "loss": 5.6896, + "step": 23569 + }, + { + "epoch": 0.14017746693310495, + "grad_norm": 1.655540943145752, + "learning_rate": 4.76149373032337e-05, + "loss": 5.7311, + "step": 23570 + }, + { + "epoch": 0.14018341421638594, + "grad_norm": 1.6018282175064087, + "learning_rate": 4.761473819069204e-05, + "loss": 5.6966, + "step": 23571 + }, + { + "epoch": 0.14018936149966696, + "grad_norm": 2.0446600914001465, + "learning_rate": 4.7614539070255816e-05, + "loss": 4.7235, + "step": 23572 + }, + { + "epoch": 0.14019530878294795, + "grad_norm": 1.6043277978897095, + "learning_rate": 4.761433994192508e-05, + "loss": 5.1602, + "step": 23573 + }, + { + "epoch": 0.14020125606622894, + "grad_norm": 1.7339102029800415, + "learning_rate": 4.761414080569992e-05, + "loss": 4.6082, + "step": 23574 + }, + { + "epoch": 0.14020720334950995, + "grad_norm": 1.9234665632247925, + "learning_rate": 4.761394166158039e-05, + "loss": 5.1365, + "step": 23575 + }, + { + "epoch": 0.14021315063279094, + "grad_norm": 1.7816582918167114, + "learning_rate": 4.7613742509566574e-05, + "loss": 5.4685, + "step": 23576 + }, + { + "epoch": 0.14021909791607193, + "grad_norm": 2.230858564376831, + "learning_rate": 4.7613543349658526e-05, + "loss": 4.0433, + "step": 23577 + }, + { + "epoch": 0.14022504519935294, + "grad_norm": 2.088791847229004, + "learning_rate": 4.761334418185633e-05, + "loss": 4.0262, + "step": 23578 + }, + { + "epoch": 0.14023099248263393, + "grad_norm": 2.2880146503448486, + "learning_rate": 4.761314500616004e-05, + "loss": 3.8526, + "step": 23579 + }, + { + "epoch": 0.14023693976591492, + "grad_norm": 1.428227186203003, + "learning_rate": 4.7612945822569744e-05, + "loss": 5.538, + "step": 23580 + }, + { + "epoch": 0.14024288704919594, + "grad_norm": 1.5487463474273682, + "learning_rate": 4.76127466310855e-05, + "loss": 4.9206, + "step": 23581 + }, + { + "epoch": 0.14024883433247692, + "grad_norm": 1.7598581314086914, + "learning_rate": 4.761254743170738e-05, + "loss": 4.8871, + "step": 23582 + }, + { + "epoch": 0.1402547816157579, + "grad_norm": 1.8421943187713623, + "learning_rate": 4.7612348224435457e-05, + "loss": 4.9177, + "step": 23583 + }, + { + "epoch": 0.14026072889903893, + "grad_norm": 1.9214147329330444, + "learning_rate": 4.761214900926979e-05, + "loss": 5.148, + "step": 23584 + }, + { + "epoch": 0.14026667618231992, + "grad_norm": 1.8675332069396973, + "learning_rate": 4.761194978621045e-05, + "loss": 5.455, + "step": 23585 + }, + { + "epoch": 0.1402726234656009, + "grad_norm": 1.7940279245376587, + "learning_rate": 4.761175055525753e-05, + "loss": 5.3608, + "step": 23586 + }, + { + "epoch": 0.14027857074888192, + "grad_norm": 1.526066541671753, + "learning_rate": 4.761155131641107e-05, + "loss": 5.3672, + "step": 23587 + }, + { + "epoch": 0.1402845180321629, + "grad_norm": 1.7407697439193726, + "learning_rate": 4.761135206967115e-05, + "loss": 5.4809, + "step": 23588 + }, + { + "epoch": 0.1402904653154439, + "grad_norm": 1.8562800884246826, + "learning_rate": 4.761115281503784e-05, + "loss": 5.3086, + "step": 23589 + }, + { + "epoch": 0.1402964125987249, + "grad_norm": 1.7709288597106934, + "learning_rate": 4.7610953552511216e-05, + "loss": 4.8511, + "step": 23590 + }, + { + "epoch": 0.1403023598820059, + "grad_norm": 1.6407638788223267, + "learning_rate": 4.761075428209134e-05, + "loss": 4.7137, + "step": 23591 + }, + { + "epoch": 0.1403083071652869, + "grad_norm": 1.8322784900665283, + "learning_rate": 4.761055500377828e-05, + "loss": 5.1288, + "step": 23592 + }, + { + "epoch": 0.1403142544485679, + "grad_norm": 2.5631179809570312, + "learning_rate": 4.761035571757211e-05, + "loss": 3.9808, + "step": 23593 + }, + { + "epoch": 0.1403202017318489, + "grad_norm": 2.5823936462402344, + "learning_rate": 4.7610156423472895e-05, + "loss": 4.0532, + "step": 23594 + }, + { + "epoch": 0.14032614901512988, + "grad_norm": 3.3013498783111572, + "learning_rate": 4.760995712148072e-05, + "loss": 3.5222, + "step": 23595 + }, + { + "epoch": 0.1403320962984109, + "grad_norm": 2.8877291679382324, + "learning_rate": 4.760975781159563e-05, + "loss": 3.4662, + "step": 23596 + }, + { + "epoch": 0.14033804358169188, + "grad_norm": 2.757053852081299, + "learning_rate": 4.760955849381771e-05, + "loss": 2.9554, + "step": 23597 + }, + { + "epoch": 0.14034399086497287, + "grad_norm": 2.611163854598999, + "learning_rate": 4.760935916814703e-05, + "loss": 3.0722, + "step": 23598 + }, + { + "epoch": 0.1403499381482539, + "grad_norm": 2.5141069889068604, + "learning_rate": 4.760915983458366e-05, + "loss": 2.9377, + "step": 23599 + }, + { + "epoch": 0.14035588543153488, + "grad_norm": 2.88659930229187, + "learning_rate": 4.7608960493127655e-05, + "loss": 2.7086, + "step": 23600 + }, + { + "epoch": 0.14036183271481587, + "grad_norm": 1.4970325231552124, + "learning_rate": 4.7608761143779103e-05, + "loss": 5.279, + "step": 23601 + }, + { + "epoch": 0.14036777999809688, + "grad_norm": 1.883097767829895, + "learning_rate": 4.760856178653806e-05, + "loss": 4.9675, + "step": 23602 + }, + { + "epoch": 0.14037372728137787, + "grad_norm": 1.8045644760131836, + "learning_rate": 4.760836242140461e-05, + "loss": 4.9739, + "step": 23603 + }, + { + "epoch": 0.14037967456465886, + "grad_norm": 2.2752342224121094, + "learning_rate": 4.760816304837881e-05, + "loss": 5.1278, + "step": 23604 + }, + { + "epoch": 0.14038562184793987, + "grad_norm": 1.8345577716827393, + "learning_rate": 4.760796366746074e-05, + "loss": 5.232, + "step": 23605 + }, + { + "epoch": 0.14039156913122086, + "grad_norm": 1.6739290952682495, + "learning_rate": 4.760776427865046e-05, + "loss": 5.1867, + "step": 23606 + }, + { + "epoch": 0.14039751641450185, + "grad_norm": 1.8607251644134521, + "learning_rate": 4.760756488194803e-05, + "loss": 5.1918, + "step": 23607 + }, + { + "epoch": 0.14040346369778287, + "grad_norm": 1.852330207824707, + "learning_rate": 4.760736547735355e-05, + "loss": 5.1462, + "step": 23608 + }, + { + "epoch": 0.14040941098106385, + "grad_norm": 1.738235354423523, + "learning_rate": 4.760716606486706e-05, + "loss": 5.1607, + "step": 23609 + }, + { + "epoch": 0.14041535826434484, + "grad_norm": 1.7101359367370605, + "learning_rate": 4.760696664448865e-05, + "loss": 5.1047, + "step": 23610 + }, + { + "epoch": 0.14042130554762586, + "grad_norm": 1.618538737297058, + "learning_rate": 4.760676721621838e-05, + "loss": 5.034, + "step": 23611 + }, + { + "epoch": 0.14042725283090685, + "grad_norm": 1.5971029996871948, + "learning_rate": 4.760656778005632e-05, + "loss": 5.0689, + "step": 23612 + }, + { + "epoch": 0.14043320011418783, + "grad_norm": 1.7599228620529175, + "learning_rate": 4.760636833600254e-05, + "loss": 5.0584, + "step": 23613 + }, + { + "epoch": 0.14043914739746885, + "grad_norm": 1.7093656063079834, + "learning_rate": 4.7606168884057114e-05, + "loss": 5.0887, + "step": 23614 + }, + { + "epoch": 0.14044509468074984, + "grad_norm": 1.77159583568573, + "learning_rate": 4.760596942422011e-05, + "loss": 4.9885, + "step": 23615 + }, + { + "epoch": 0.14045104196403083, + "grad_norm": 1.6793224811553955, + "learning_rate": 4.7605769956491586e-05, + "loss": 5.7858, + "step": 23616 + }, + { + "epoch": 0.14045698924731181, + "grad_norm": 2.0000784397125244, + "learning_rate": 4.7605570480871624e-05, + "loss": 5.1434, + "step": 23617 + }, + { + "epoch": 0.14046293653059283, + "grad_norm": 1.777692437171936, + "learning_rate": 4.760537099736029e-05, + "loss": 5.237, + "step": 23618 + }, + { + "epoch": 0.14046888381387382, + "grad_norm": 1.7709475755691528, + "learning_rate": 4.760517150595766e-05, + "loss": 5.1844, + "step": 23619 + }, + { + "epoch": 0.1404748310971548, + "grad_norm": 1.5300654172897339, + "learning_rate": 4.76049720066638e-05, + "loss": 5.4657, + "step": 23620 + }, + { + "epoch": 0.14048077838043582, + "grad_norm": 1.5757399797439575, + "learning_rate": 4.7604772499478767e-05, + "loss": 5.7018, + "step": 23621 + }, + { + "epoch": 0.1404867256637168, + "grad_norm": 1.572698712348938, + "learning_rate": 4.760457298440265e-05, + "loss": 5.5974, + "step": 23622 + }, + { + "epoch": 0.1404926729469978, + "grad_norm": 1.7017083168029785, + "learning_rate": 4.760437346143551e-05, + "loss": 5.6591, + "step": 23623 + }, + { + "epoch": 0.14049862023027881, + "grad_norm": 1.496193528175354, + "learning_rate": 4.760417393057741e-05, + "loss": 5.603, + "step": 23624 + }, + { + "epoch": 0.1405045675135598, + "grad_norm": 1.5156760215759277, + "learning_rate": 4.760397439182843e-05, + "loss": 5.5561, + "step": 23625 + }, + { + "epoch": 0.1405105147968408, + "grad_norm": 1.520276665687561, + "learning_rate": 4.760377484518864e-05, + "loss": 5.6208, + "step": 23626 + }, + { + "epoch": 0.1405164620801218, + "grad_norm": 1.6519960165023804, + "learning_rate": 4.760357529065811e-05, + "loss": 5.6191, + "step": 23627 + }, + { + "epoch": 0.1405224093634028, + "grad_norm": 1.6115814447402954, + "learning_rate": 4.760337572823689e-05, + "loss": 5.6622, + "step": 23628 + }, + { + "epoch": 0.14052835664668378, + "grad_norm": 1.6744813919067383, + "learning_rate": 4.760317615792508e-05, + "loss": 4.9525, + "step": 23629 + }, + { + "epoch": 0.1405343039299648, + "grad_norm": 1.8949360847473145, + "learning_rate": 4.7602976579722725e-05, + "loss": 5.2284, + "step": 23630 + }, + { + "epoch": 0.1405402512132458, + "grad_norm": 1.7098066806793213, + "learning_rate": 4.760277699362991e-05, + "loss": 5.6612, + "step": 23631 + }, + { + "epoch": 0.14054619849652678, + "grad_norm": 2.258535861968994, + "learning_rate": 4.76025773996467e-05, + "loss": 5.3049, + "step": 23632 + }, + { + "epoch": 0.1405521457798078, + "grad_norm": 1.713905692100525, + "learning_rate": 4.760237779777316e-05, + "loss": 6.081, + "step": 23633 + }, + { + "epoch": 0.14055809306308878, + "grad_norm": 1.744905710220337, + "learning_rate": 4.760217818800936e-05, + "loss": 5.6269, + "step": 23634 + }, + { + "epoch": 0.14056404034636977, + "grad_norm": 2.032653570175171, + "learning_rate": 4.760197857035538e-05, + "loss": 4.8417, + "step": 23635 + }, + { + "epoch": 0.14056998762965078, + "grad_norm": 1.9457743167877197, + "learning_rate": 4.7601778944811275e-05, + "loss": 4.6145, + "step": 23636 + }, + { + "epoch": 0.14057593491293177, + "grad_norm": 2.0428082942962646, + "learning_rate": 4.760157931137713e-05, + "loss": 4.7341, + "step": 23637 + }, + { + "epoch": 0.14058188219621276, + "grad_norm": 1.8817776441574097, + "learning_rate": 4.7601379670053006e-05, + "loss": 4.4932, + "step": 23638 + }, + { + "epoch": 0.14058782947949378, + "grad_norm": 1.9882752895355225, + "learning_rate": 4.760118002083897e-05, + "loss": 4.5001, + "step": 23639 + }, + { + "epoch": 0.14059377676277476, + "grad_norm": 1.6730908155441284, + "learning_rate": 4.760098036373509e-05, + "loss": 4.2396, + "step": 23640 + }, + { + "epoch": 0.14059972404605575, + "grad_norm": 1.9490888118743896, + "learning_rate": 4.760078069874145e-05, + "loss": 4.2708, + "step": 23641 + }, + { + "epoch": 0.14060567132933677, + "grad_norm": 1.8162645101547241, + "learning_rate": 4.7600581025858114e-05, + "loss": 4.2507, + "step": 23642 + }, + { + "epoch": 0.14061161861261776, + "grad_norm": 1.9260125160217285, + "learning_rate": 4.760038134508514e-05, + "loss": 4.4647, + "step": 23643 + }, + { + "epoch": 0.14061756589589874, + "grad_norm": 1.892685055732727, + "learning_rate": 4.7600181656422616e-05, + "loss": 4.1241, + "step": 23644 + }, + { + "epoch": 0.14062351317917976, + "grad_norm": 1.625123143196106, + "learning_rate": 4.75999819598706e-05, + "loss": 4.3582, + "step": 23645 + }, + { + "epoch": 0.14062946046246075, + "grad_norm": 1.841758131980896, + "learning_rate": 4.759978225542916e-05, + "loss": 4.3403, + "step": 23646 + }, + { + "epoch": 0.14063540774574174, + "grad_norm": 1.8946552276611328, + "learning_rate": 4.759958254309837e-05, + "loss": 4.5008, + "step": 23647 + }, + { + "epoch": 0.14064135502902275, + "grad_norm": 1.7985520362854004, + "learning_rate": 4.75993828228783e-05, + "loss": 4.4869, + "step": 23648 + }, + { + "epoch": 0.14064730231230374, + "grad_norm": 1.823662519454956, + "learning_rate": 4.759918309476902e-05, + "loss": 4.6177, + "step": 23649 + }, + { + "epoch": 0.14065324959558473, + "grad_norm": 1.94038724899292, + "learning_rate": 4.75989833587706e-05, + "loss": 4.4979, + "step": 23650 + }, + { + "epoch": 0.14065919687886574, + "grad_norm": 1.9023078680038452, + "learning_rate": 4.75987836148831e-05, + "loss": 4.3507, + "step": 23651 + }, + { + "epoch": 0.14066514416214673, + "grad_norm": 1.917851448059082, + "learning_rate": 4.7598583863106606e-05, + "loss": 4.1841, + "step": 23652 + }, + { + "epoch": 0.14067109144542772, + "grad_norm": 1.8332593441009521, + "learning_rate": 4.759838410344117e-05, + "loss": 4.4705, + "step": 23653 + }, + { + "epoch": 0.14067703872870874, + "grad_norm": 1.7567338943481445, + "learning_rate": 4.759818433588689e-05, + "loss": 4.5008, + "step": 23654 + }, + { + "epoch": 0.14068298601198972, + "grad_norm": 1.9399288892745972, + "learning_rate": 4.75979845604438e-05, + "loss": 4.3969, + "step": 23655 + }, + { + "epoch": 0.1406889332952707, + "grad_norm": 1.7779430150985718, + "learning_rate": 4.7597784777112e-05, + "loss": 4.3292, + "step": 23656 + }, + { + "epoch": 0.14069488057855173, + "grad_norm": 1.802742600440979, + "learning_rate": 4.759758498589153e-05, + "loss": 5.0038, + "step": 23657 + }, + { + "epoch": 0.14070082786183272, + "grad_norm": 2.5247714519500732, + "learning_rate": 4.759738518678249e-05, + "loss": 5.0153, + "step": 23658 + }, + { + "epoch": 0.1407067751451137, + "grad_norm": 3.0549800395965576, + "learning_rate": 4.759718537978494e-05, + "loss": 4.6653, + "step": 23659 + }, + { + "epoch": 0.14071272242839472, + "grad_norm": 2.7805356979370117, + "learning_rate": 4.7596985564898935e-05, + "loss": 4.4669, + "step": 23660 + }, + { + "epoch": 0.1407186697116757, + "grad_norm": 2.404932737350464, + "learning_rate": 4.759678574212456e-05, + "loss": 4.6932, + "step": 23661 + }, + { + "epoch": 0.1407246169949567, + "grad_norm": 2.2168543338775635, + "learning_rate": 4.7596585911461875e-05, + "loss": 4.397, + "step": 23662 + }, + { + "epoch": 0.1407305642782377, + "grad_norm": 2.423726797103882, + "learning_rate": 4.759638607291097e-05, + "loss": 4.3534, + "step": 23663 + }, + { + "epoch": 0.1407365115615187, + "grad_norm": 2.1283328533172607, + "learning_rate": 4.759618622647188e-05, + "loss": 4.9248, + "step": 23664 + }, + { + "epoch": 0.1407424588447997, + "grad_norm": 1.6989446878433228, + "learning_rate": 4.7595986372144716e-05, + "loss": 5.4656, + "step": 23665 + }, + { + "epoch": 0.1407484061280807, + "grad_norm": 1.7057443857192993, + "learning_rate": 4.759578650992951e-05, + "loss": 5.193, + "step": 23666 + }, + { + "epoch": 0.1407543534113617, + "grad_norm": 2.3968324661254883, + "learning_rate": 4.7595586639826364e-05, + "loss": 5.132, + "step": 23667 + }, + { + "epoch": 0.14076030069464268, + "grad_norm": 1.7770966291427612, + "learning_rate": 4.7595386761835314e-05, + "loss": 4.8487, + "step": 23668 + }, + { + "epoch": 0.1407662479779237, + "grad_norm": 1.8165397644042969, + "learning_rate": 4.759518687595646e-05, + "loss": 4.9981, + "step": 23669 + }, + { + "epoch": 0.14077219526120469, + "grad_norm": 1.4801784753799438, + "learning_rate": 4.759498698218986e-05, + "loss": 5.0204, + "step": 23670 + }, + { + "epoch": 0.14077814254448567, + "grad_norm": 1.6488209962844849, + "learning_rate": 4.759478708053557e-05, + "loss": 4.9349, + "step": 23671 + }, + { + "epoch": 0.1407840898277667, + "grad_norm": 1.5207561254501343, + "learning_rate": 4.759458717099369e-05, + "loss": 4.9986, + "step": 23672 + }, + { + "epoch": 0.14079003711104768, + "grad_norm": 1.5029826164245605, + "learning_rate": 4.7594387253564263e-05, + "loss": 4.9708, + "step": 23673 + }, + { + "epoch": 0.14079598439432867, + "grad_norm": 1.6697144508361816, + "learning_rate": 4.7594187328247375e-05, + "loss": 4.9915, + "step": 23674 + }, + { + "epoch": 0.14080193167760965, + "grad_norm": 1.7437782287597656, + "learning_rate": 4.7593987395043085e-05, + "loss": 5.068, + "step": 23675 + }, + { + "epoch": 0.14080787896089067, + "grad_norm": 1.8639456033706665, + "learning_rate": 4.7593787453951475e-05, + "loss": 4.9861, + "step": 23676 + }, + { + "epoch": 0.14081382624417166, + "grad_norm": 1.7246698141098022, + "learning_rate": 4.75935875049726e-05, + "loss": 4.9547, + "step": 23677 + }, + { + "epoch": 0.14081977352745265, + "grad_norm": 1.764772891998291, + "learning_rate": 4.759338754810654e-05, + "loss": 4.7823, + "step": 23678 + }, + { + "epoch": 0.14082572081073366, + "grad_norm": 1.3609477281570435, + "learning_rate": 4.759318758335336e-05, + "loss": 4.9039, + "step": 23679 + }, + { + "epoch": 0.14083166809401465, + "grad_norm": 1.4477577209472656, + "learning_rate": 4.759298761071313e-05, + "loss": 4.7816, + "step": 23680 + }, + { + "epoch": 0.14083761537729564, + "grad_norm": 1.6295807361602783, + "learning_rate": 4.759278763018592e-05, + "loss": 4.641, + "step": 23681 + }, + { + "epoch": 0.14084356266057665, + "grad_norm": 1.7831028699874878, + "learning_rate": 4.7592587641771806e-05, + "loss": 4.8989, + "step": 23682 + }, + { + "epoch": 0.14084950994385764, + "grad_norm": 1.7806429862976074, + "learning_rate": 4.7592387645470845e-05, + "loss": 4.9344, + "step": 23683 + }, + { + "epoch": 0.14085545722713863, + "grad_norm": 2.0284979343414307, + "learning_rate": 4.759218764128313e-05, + "loss": 5.7399, + "step": 23684 + }, + { + "epoch": 0.14086140451041965, + "grad_norm": 1.853495717048645, + "learning_rate": 4.7591987629208706e-05, + "loss": 4.8495, + "step": 23685 + }, + { + "epoch": 0.14086735179370063, + "grad_norm": 1.6907382011413574, + "learning_rate": 4.759178760924765e-05, + "loss": 4.8365, + "step": 23686 + }, + { + "epoch": 0.14087329907698162, + "grad_norm": 1.7131983041763306, + "learning_rate": 4.7591587581400045e-05, + "loss": 4.8217, + "step": 23687 + }, + { + "epoch": 0.14087924636026264, + "grad_norm": 1.6896579265594482, + "learning_rate": 4.759138754566595e-05, + "loss": 5.4568, + "step": 23688 + }, + { + "epoch": 0.14088519364354363, + "grad_norm": 1.7312794923782349, + "learning_rate": 4.759118750204542e-05, + "loss": 5.7501, + "step": 23689 + }, + { + "epoch": 0.14089114092682462, + "grad_norm": 1.494137167930603, + "learning_rate": 4.759098745053855e-05, + "loss": 5.526, + "step": 23690 + }, + { + "epoch": 0.14089708821010563, + "grad_norm": 2.2159650325775146, + "learning_rate": 4.75907873911454e-05, + "loss": 5.3686, + "step": 23691 + }, + { + "epoch": 0.14090303549338662, + "grad_norm": 2.0564072132110596, + "learning_rate": 4.759058732386603e-05, + "loss": 5.2311, + "step": 23692 + }, + { + "epoch": 0.1409089827766676, + "grad_norm": 2.5233311653137207, + "learning_rate": 4.759038724870053e-05, + "loss": 4.7775, + "step": 23693 + }, + { + "epoch": 0.14091493005994862, + "grad_norm": 2.180325984954834, + "learning_rate": 4.7590187165648956e-05, + "loss": 4.8106, + "step": 23694 + }, + { + "epoch": 0.1409208773432296, + "grad_norm": 2.1391143798828125, + "learning_rate": 4.758998707471138e-05, + "loss": 4.741, + "step": 23695 + }, + { + "epoch": 0.1409268246265106, + "grad_norm": 1.9628124237060547, + "learning_rate": 4.758978697588787e-05, + "loss": 4.7177, + "step": 23696 + }, + { + "epoch": 0.14093277190979162, + "grad_norm": 2.1324729919433594, + "learning_rate": 4.7589586869178506e-05, + "loss": 4.8006, + "step": 23697 + }, + { + "epoch": 0.1409387191930726, + "grad_norm": 1.9791810512542725, + "learning_rate": 4.758938675458335e-05, + "loss": 4.6171, + "step": 23698 + }, + { + "epoch": 0.1409446664763536, + "grad_norm": 1.8566325902938843, + "learning_rate": 4.758918663210247e-05, + "loss": 5.0375, + "step": 23699 + }, + { + "epoch": 0.1409506137596346, + "grad_norm": 2.3218674659729004, + "learning_rate": 4.758898650173593e-05, + "loss": 5.2169, + "step": 23700 + }, + { + "epoch": 0.1409565610429156, + "grad_norm": 2.0162737369537354, + "learning_rate": 4.7588786363483816e-05, + "loss": 4.8988, + "step": 23701 + }, + { + "epoch": 0.14096250832619658, + "grad_norm": 2.1534879207611084, + "learning_rate": 4.7588586217346197e-05, + "loss": 4.9911, + "step": 23702 + }, + { + "epoch": 0.1409684556094776, + "grad_norm": 2.16445255279541, + "learning_rate": 4.7588386063323134e-05, + "loss": 4.9501, + "step": 23703 + }, + { + "epoch": 0.1409744028927586, + "grad_norm": 1.9189707040786743, + "learning_rate": 4.7588185901414684e-05, + "loss": 4.9125, + "step": 23704 + }, + { + "epoch": 0.14098035017603958, + "grad_norm": 2.1000189781188965, + "learning_rate": 4.7587985731620945e-05, + "loss": 5.002, + "step": 23705 + }, + { + "epoch": 0.1409862974593206, + "grad_norm": 2.0911948680877686, + "learning_rate": 4.7587785553941974e-05, + "loss": 5.0206, + "step": 23706 + }, + { + "epoch": 0.14099224474260158, + "grad_norm": 1.9519456624984741, + "learning_rate": 4.758758536837783e-05, + "loss": 4.5715, + "step": 23707 + }, + { + "epoch": 0.14099819202588257, + "grad_norm": 2.1036672592163086, + "learning_rate": 4.75873851749286e-05, + "loss": 4.7427, + "step": 23708 + }, + { + "epoch": 0.14100413930916358, + "grad_norm": 1.6662368774414062, + "learning_rate": 4.7587184973594354e-05, + "loss": 5.1132, + "step": 23709 + }, + { + "epoch": 0.14101008659244457, + "grad_norm": 1.5314775705337524, + "learning_rate": 4.758698476437514e-05, + "loss": 5.6674, + "step": 23710 + }, + { + "epoch": 0.14101603387572556, + "grad_norm": 1.7167651653289795, + "learning_rate": 4.7586784547271056e-05, + "loss": 5.74, + "step": 23711 + }, + { + "epoch": 0.14102198115900658, + "grad_norm": 1.6126611232757568, + "learning_rate": 4.758658432228216e-05, + "loss": 5.7798, + "step": 23712 + }, + { + "epoch": 0.14102792844228756, + "grad_norm": 1.5236903429031372, + "learning_rate": 4.758638408940851e-05, + "loss": 5.3924, + "step": 23713 + }, + { + "epoch": 0.14103387572556855, + "grad_norm": 1.7352653741836548, + "learning_rate": 4.758618384865019e-05, + "loss": 5.3551, + "step": 23714 + }, + { + "epoch": 0.14103982300884957, + "grad_norm": 2.1185758113861084, + "learning_rate": 4.758598360000727e-05, + "loss": 4.5986, + "step": 23715 + }, + { + "epoch": 0.14104577029213056, + "grad_norm": 2.0252137184143066, + "learning_rate": 4.758578334347981e-05, + "loss": 5.5963, + "step": 23716 + }, + { + "epoch": 0.14105171757541154, + "grad_norm": 2.1225454807281494, + "learning_rate": 4.75855830790679e-05, + "loss": 5.1949, + "step": 23717 + }, + { + "epoch": 0.14105766485869256, + "grad_norm": 2.7703025341033936, + "learning_rate": 4.7585382806771585e-05, + "loss": 4.4741, + "step": 23718 + }, + { + "epoch": 0.14106361214197355, + "grad_norm": 1.6570090055465698, + "learning_rate": 4.758518252659094e-05, + "loss": 4.8543, + "step": 23719 + }, + { + "epoch": 0.14106955942525454, + "grad_norm": 1.759743571281433, + "learning_rate": 4.7584982238526053e-05, + "loss": 4.7901, + "step": 23720 + }, + { + "epoch": 0.14107550670853555, + "grad_norm": 1.562591314315796, + "learning_rate": 4.7584781942576976e-05, + "loss": 5.351, + "step": 23721 + }, + { + "epoch": 0.14108145399181654, + "grad_norm": 1.279597520828247, + "learning_rate": 4.758458163874379e-05, + "loss": 6.0303, + "step": 23722 + }, + { + "epoch": 0.14108740127509753, + "grad_norm": 1.3173538446426392, + "learning_rate": 4.758438132702656e-05, + "loss": 6.015, + "step": 23723 + }, + { + "epoch": 0.14109334855837855, + "grad_norm": 1.4862935543060303, + "learning_rate": 4.7584181007425354e-05, + "loss": 5.6649, + "step": 23724 + }, + { + "epoch": 0.14109929584165953, + "grad_norm": 1.8398306369781494, + "learning_rate": 4.7583980679940244e-05, + "loss": 5.3897, + "step": 23725 + }, + { + "epoch": 0.14110524312494052, + "grad_norm": 2.02359676361084, + "learning_rate": 4.758378034457129e-05, + "loss": 5.8195, + "step": 23726 + }, + { + "epoch": 0.14111119040822154, + "grad_norm": 2.131068706512451, + "learning_rate": 4.758358000131858e-05, + "loss": 5.693, + "step": 23727 + }, + { + "epoch": 0.14111713769150253, + "grad_norm": 2.144928455352783, + "learning_rate": 4.7583379650182184e-05, + "loss": 5.4745, + "step": 23728 + }, + { + "epoch": 0.1411230849747835, + "grad_norm": 2.043093681335449, + "learning_rate": 4.758317929116215e-05, + "loss": 5.5877, + "step": 23729 + }, + { + "epoch": 0.14112903225806453, + "grad_norm": 1.7879455089569092, + "learning_rate": 4.758297892425857e-05, + "loss": 5.5822, + "step": 23730 + }, + { + "epoch": 0.14113497954134552, + "grad_norm": 1.6113840341567993, + "learning_rate": 4.7582778549471494e-05, + "loss": 5.2861, + "step": 23731 + }, + { + "epoch": 0.1411409268246265, + "grad_norm": 1.6712645292282104, + "learning_rate": 4.7582578166801015e-05, + "loss": 5.1185, + "step": 23732 + }, + { + "epoch": 0.1411468741079075, + "grad_norm": 1.6905531883239746, + "learning_rate": 4.758237777624719e-05, + "loss": 5.3339, + "step": 23733 + }, + { + "epoch": 0.1411528213911885, + "grad_norm": 2.058136224746704, + "learning_rate": 4.758217737781009e-05, + "loss": 4.6243, + "step": 23734 + }, + { + "epoch": 0.1411587686744695, + "grad_norm": 1.9609389305114746, + "learning_rate": 4.758197697148978e-05, + "loss": 4.7675, + "step": 23735 + }, + { + "epoch": 0.1411647159577505, + "grad_norm": 1.947270154953003, + "learning_rate": 4.758177655728634e-05, + "loss": 4.6854, + "step": 23736 + }, + { + "epoch": 0.1411706632410315, + "grad_norm": 2.0735461711883545, + "learning_rate": 4.7581576135199834e-05, + "loss": 4.9539, + "step": 23737 + }, + { + "epoch": 0.1411766105243125, + "grad_norm": 2.0236589908599854, + "learning_rate": 4.758137570523033e-05, + "loss": 5.0488, + "step": 23738 + }, + { + "epoch": 0.14118255780759348, + "grad_norm": 2.1183953285217285, + "learning_rate": 4.7581175267377906e-05, + "loss": 4.9358, + "step": 23739 + }, + { + "epoch": 0.1411885050908745, + "grad_norm": 2.0142831802368164, + "learning_rate": 4.758097482164262e-05, + "loss": 4.8333, + "step": 23740 + }, + { + "epoch": 0.14119445237415548, + "grad_norm": 2.204681634902954, + "learning_rate": 4.758077436802455e-05, + "loss": 4.8852, + "step": 23741 + }, + { + "epoch": 0.14120039965743647, + "grad_norm": 2.216187000274658, + "learning_rate": 4.7580573906523774e-05, + "loss": 5.0268, + "step": 23742 + }, + { + "epoch": 0.1412063469407175, + "grad_norm": 2.1434781551361084, + "learning_rate": 4.7580373437140343e-05, + "loss": 4.9048, + "step": 23743 + }, + { + "epoch": 0.14121229422399847, + "grad_norm": 1.8260117769241333, + "learning_rate": 4.758017295987435e-05, + "loss": 5.0481, + "step": 23744 + }, + { + "epoch": 0.14121824150727946, + "grad_norm": 2.2184064388275146, + "learning_rate": 4.757997247472584e-05, + "loss": 4.8967, + "step": 23745 + }, + { + "epoch": 0.14122418879056048, + "grad_norm": 1.8644381761550903, + "learning_rate": 4.75797719816949e-05, + "loss": 5.1945, + "step": 23746 + }, + { + "epoch": 0.14123013607384147, + "grad_norm": 2.0591354370117188, + "learning_rate": 4.757957148078159e-05, + "loss": 4.8916, + "step": 23747 + }, + { + "epoch": 0.14123608335712245, + "grad_norm": 2.429004669189453, + "learning_rate": 4.7579370971985986e-05, + "loss": 4.555, + "step": 23748 + }, + { + "epoch": 0.14124203064040347, + "grad_norm": 2.451037883758545, + "learning_rate": 4.757917045530816e-05, + "loss": 4.663, + "step": 23749 + }, + { + "epoch": 0.14124797792368446, + "grad_norm": 1.8227989673614502, + "learning_rate": 4.7578969930748176e-05, + "loss": 5.6976, + "step": 23750 + }, + { + "epoch": 0.14125392520696545, + "grad_norm": 1.8706707954406738, + "learning_rate": 4.757876939830611e-05, + "loss": 6.0974, + "step": 23751 + }, + { + "epoch": 0.14125987249024646, + "grad_norm": 1.7714571952819824, + "learning_rate": 4.7578568857982025e-05, + "loss": 5.5516, + "step": 23752 + }, + { + "epoch": 0.14126581977352745, + "grad_norm": 2.067776679992676, + "learning_rate": 4.7578368309776e-05, + "loss": 5.296, + "step": 23753 + }, + { + "epoch": 0.14127176705680844, + "grad_norm": 1.9231433868408203, + "learning_rate": 4.7578167753688095e-05, + "loss": 5.1286, + "step": 23754 + }, + { + "epoch": 0.14127771434008946, + "grad_norm": 2.0858731269836426, + "learning_rate": 4.7577967189718386e-05, + "loss": 4.717, + "step": 23755 + }, + { + "epoch": 0.14128366162337044, + "grad_norm": 2.173215627670288, + "learning_rate": 4.757776661786694e-05, + "loss": 4.6995, + "step": 23756 + }, + { + "epoch": 0.14128960890665143, + "grad_norm": 2.008244037628174, + "learning_rate": 4.7577566038133834e-05, + "loss": 4.4147, + "step": 23757 + }, + { + "epoch": 0.14129555618993245, + "grad_norm": 1.9767186641693115, + "learning_rate": 4.757736545051913e-05, + "loss": 4.9901, + "step": 23758 + }, + { + "epoch": 0.14130150347321344, + "grad_norm": 1.860136866569519, + "learning_rate": 4.7577164855022905e-05, + "loss": 4.7252, + "step": 23759 + }, + { + "epoch": 0.14130745075649442, + "grad_norm": 1.9243319034576416, + "learning_rate": 4.757696425164522e-05, + "loss": 4.6387, + "step": 23760 + }, + { + "epoch": 0.14131339803977544, + "grad_norm": 1.9811434745788574, + "learning_rate": 4.7576763640386155e-05, + "loss": 4.7365, + "step": 23761 + }, + { + "epoch": 0.14131934532305643, + "grad_norm": 2.1552014350891113, + "learning_rate": 4.757656302124577e-05, + "loss": 4.4764, + "step": 23762 + }, + { + "epoch": 0.14132529260633742, + "grad_norm": 1.8660786151885986, + "learning_rate": 4.757636239422414e-05, + "loss": 4.6108, + "step": 23763 + }, + { + "epoch": 0.14133123988961843, + "grad_norm": 2.0548014640808105, + "learning_rate": 4.757616175932134e-05, + "loss": 4.3871, + "step": 23764 + }, + { + "epoch": 0.14133718717289942, + "grad_norm": 2.107966184616089, + "learning_rate": 4.757596111653743e-05, + "loss": 4.3013, + "step": 23765 + }, + { + "epoch": 0.1413431344561804, + "grad_norm": 2.062649726867676, + "learning_rate": 4.757576046587249e-05, + "loss": 4.3352, + "step": 23766 + }, + { + "epoch": 0.14134908173946142, + "grad_norm": 1.9424866437911987, + "learning_rate": 4.7575559807326584e-05, + "loss": 4.5538, + "step": 23767 + }, + { + "epoch": 0.1413550290227424, + "grad_norm": 1.9787993431091309, + "learning_rate": 4.757535914089978e-05, + "loss": 4.7105, + "step": 23768 + }, + { + "epoch": 0.1413609763060234, + "grad_norm": 2.3590548038482666, + "learning_rate": 4.7575158466592154e-05, + "loss": 4.5962, + "step": 23769 + }, + { + "epoch": 0.14136692358930442, + "grad_norm": 2.3521318435668945, + "learning_rate": 4.757495778440377e-05, + "loss": 4.8107, + "step": 23770 + }, + { + "epoch": 0.1413728708725854, + "grad_norm": 2.079169273376465, + "learning_rate": 4.7574757094334696e-05, + "loss": 4.6617, + "step": 23771 + }, + { + "epoch": 0.1413788181558664, + "grad_norm": 2.020505428314209, + "learning_rate": 4.757455639638502e-05, + "loss": 4.9402, + "step": 23772 + }, + { + "epoch": 0.1413847654391474, + "grad_norm": 1.8023982048034668, + "learning_rate": 4.75743556905548e-05, + "loss": 5.7173, + "step": 23773 + }, + { + "epoch": 0.1413907127224284, + "grad_norm": 1.471612572669983, + "learning_rate": 4.75741549768441e-05, + "loss": 5.6359, + "step": 23774 + }, + { + "epoch": 0.14139666000570938, + "grad_norm": 1.691918969154358, + "learning_rate": 4.7573954255252996e-05, + "loss": 5.6043, + "step": 23775 + }, + { + "epoch": 0.1414026072889904, + "grad_norm": 1.5347981452941895, + "learning_rate": 4.757375352578156e-05, + "loss": 5.9488, + "step": 23776 + }, + { + "epoch": 0.1414085545722714, + "grad_norm": 1.6003544330596924, + "learning_rate": 4.757355278842985e-05, + "loss": 5.4831, + "step": 23777 + }, + { + "epoch": 0.14141450185555238, + "grad_norm": 1.868674397468567, + "learning_rate": 4.757335204319796e-05, + "loss": 5.3372, + "step": 23778 + }, + { + "epoch": 0.1414204491388334, + "grad_norm": 1.827628254890442, + "learning_rate": 4.7573151290085935e-05, + "loss": 5.2977, + "step": 23779 + }, + { + "epoch": 0.14142639642211438, + "grad_norm": 1.80328369140625, + "learning_rate": 4.757295052909386e-05, + "loss": 5.2484, + "step": 23780 + }, + { + "epoch": 0.14143234370539537, + "grad_norm": 1.7244900465011597, + "learning_rate": 4.7572749760221815e-05, + "loss": 5.341, + "step": 23781 + }, + { + "epoch": 0.14143829098867639, + "grad_norm": 1.6203787326812744, + "learning_rate": 4.757254898346984e-05, + "loss": 5.1993, + "step": 23782 + }, + { + "epoch": 0.14144423827195737, + "grad_norm": 1.7411043643951416, + "learning_rate": 4.7572348198838026e-05, + "loss": 5.177, + "step": 23783 + }, + { + "epoch": 0.14145018555523836, + "grad_norm": 1.6770362854003906, + "learning_rate": 4.7572147406326435e-05, + "loss": 5.2169, + "step": 23784 + }, + { + "epoch": 0.14145613283851938, + "grad_norm": 1.6283633708953857, + "learning_rate": 4.7571946605935146e-05, + "loss": 5.1338, + "step": 23785 + }, + { + "epoch": 0.14146208012180037, + "grad_norm": 1.601276159286499, + "learning_rate": 4.7571745797664215e-05, + "loss": 5.0783, + "step": 23786 + }, + { + "epoch": 0.14146802740508135, + "grad_norm": 1.7484774589538574, + "learning_rate": 4.757154498151373e-05, + "loss": 5.106, + "step": 23787 + }, + { + "epoch": 0.14147397468836237, + "grad_norm": 1.8326083421707153, + "learning_rate": 4.7571344157483744e-05, + "loss": 5.0202, + "step": 23788 + }, + { + "epoch": 0.14147992197164336, + "grad_norm": 1.7564448118209839, + "learning_rate": 4.757114332557434e-05, + "loss": 5.0854, + "step": 23789 + }, + { + "epoch": 0.14148586925492435, + "grad_norm": 1.776414394378662, + "learning_rate": 4.757094248578558e-05, + "loss": 5.049, + "step": 23790 + }, + { + "epoch": 0.14149181653820536, + "grad_norm": 1.6053420305252075, + "learning_rate": 4.757074163811754e-05, + "loss": 5.1644, + "step": 23791 + }, + { + "epoch": 0.14149776382148635, + "grad_norm": 1.9419928789138794, + "learning_rate": 4.7570540782570295e-05, + "loss": 5.6868, + "step": 23792 + }, + { + "epoch": 0.14150371110476734, + "grad_norm": 1.8629308938980103, + "learning_rate": 4.757033991914389e-05, + "loss": 5.6614, + "step": 23793 + }, + { + "epoch": 0.14150965838804833, + "grad_norm": 1.745348572731018, + "learning_rate": 4.757013904783842e-05, + "loss": 5.6742, + "step": 23794 + }, + { + "epoch": 0.14151560567132934, + "grad_norm": 1.8093681335449219, + "learning_rate": 4.756993816865396e-05, + "loss": 5.8902, + "step": 23795 + }, + { + "epoch": 0.14152155295461033, + "grad_norm": 1.8000177145004272, + "learning_rate": 4.7569737281590554e-05, + "loss": 5.7025, + "step": 23796 + }, + { + "epoch": 0.14152750023789132, + "grad_norm": 1.7782033681869507, + "learning_rate": 4.756953638664829e-05, + "loss": 5.492, + "step": 23797 + }, + { + "epoch": 0.14153344752117233, + "grad_norm": 1.7651612758636475, + "learning_rate": 4.756933548382723e-05, + "loss": 4.8989, + "step": 23798 + }, + { + "epoch": 0.14153939480445332, + "grad_norm": 2.0286474227905273, + "learning_rate": 4.756913457312745e-05, + "loss": 4.5672, + "step": 23799 + }, + { + "epoch": 0.1415453420877343, + "grad_norm": 2.361325740814209, + "learning_rate": 4.756893365454902e-05, + "loss": 4.6471, + "step": 23800 + }, + { + "epoch": 0.14155128937101533, + "grad_norm": 1.8565771579742432, + "learning_rate": 4.756873272809202e-05, + "loss": 4.589, + "step": 23801 + }, + { + "epoch": 0.14155723665429631, + "grad_norm": 1.895958662033081, + "learning_rate": 4.756853179375649e-05, + "loss": 4.4608, + "step": 23802 + }, + { + "epoch": 0.1415631839375773, + "grad_norm": 2.103283166885376, + "learning_rate": 4.756833085154252e-05, + "loss": 4.3885, + "step": 23803 + }, + { + "epoch": 0.14156913122085832, + "grad_norm": 2.0823607444763184, + "learning_rate": 4.756812990145019e-05, + "loss": 4.307, + "step": 23804 + }, + { + "epoch": 0.1415750785041393, + "grad_norm": 1.852010726928711, + "learning_rate": 4.7567928943479546e-05, + "loss": 4.7289, + "step": 23805 + }, + { + "epoch": 0.1415810257874203, + "grad_norm": 1.6223875284194946, + "learning_rate": 4.7567727977630685e-05, + "loss": 5.5772, + "step": 23806 + }, + { + "epoch": 0.1415869730707013, + "grad_norm": 1.9508872032165527, + "learning_rate": 4.756752700390366e-05, + "loss": 5.3001, + "step": 23807 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 1.6098484992980957, + "learning_rate": 4.756732602229853e-05, + "loss": 5.2318, + "step": 23808 + }, + { + "epoch": 0.1415988676372633, + "grad_norm": 1.4197050333023071, + "learning_rate": 4.7567125032815394e-05, + "loss": 4.9752, + "step": 23809 + }, + { + "epoch": 0.1416048149205443, + "grad_norm": 1.5325055122375488, + "learning_rate": 4.7566924035454305e-05, + "loss": 5.0156, + "step": 23810 + }, + { + "epoch": 0.1416107622038253, + "grad_norm": 1.7188338041305542, + "learning_rate": 4.7566723030215335e-05, + "loss": 5.3756, + "step": 23811 + }, + { + "epoch": 0.14161670948710628, + "grad_norm": 1.779646396636963, + "learning_rate": 4.756652201709856e-05, + "loss": 5.3844, + "step": 23812 + }, + { + "epoch": 0.1416226567703873, + "grad_norm": 1.913001298904419, + "learning_rate": 4.756632099610404e-05, + "loss": 5.2604, + "step": 23813 + }, + { + "epoch": 0.14162860405366828, + "grad_norm": 1.5379444360733032, + "learning_rate": 4.7566119967231846e-05, + "loss": 5.4184, + "step": 23814 + }, + { + "epoch": 0.14163455133694927, + "grad_norm": 2.5433242321014404, + "learning_rate": 4.756591893048206e-05, + "loss": 5.076, + "step": 23815 + }, + { + "epoch": 0.1416404986202303, + "grad_norm": 2.0431840419769287, + "learning_rate": 4.756571788585474e-05, + "loss": 5.0766, + "step": 23816 + }, + { + "epoch": 0.14164644590351128, + "grad_norm": 2.5103769302368164, + "learning_rate": 4.7565516833349964e-05, + "loss": 4.9539, + "step": 23817 + }, + { + "epoch": 0.14165239318679226, + "grad_norm": 1.563063383102417, + "learning_rate": 4.75653157729678e-05, + "loss": 5.4752, + "step": 23818 + }, + { + "epoch": 0.14165834047007328, + "grad_norm": 1.8695935010910034, + "learning_rate": 4.756511470470832e-05, + "loss": 5.4486, + "step": 23819 + }, + { + "epoch": 0.14166428775335427, + "grad_norm": 2.092947244644165, + "learning_rate": 4.756491362857158e-05, + "loss": 5.1404, + "step": 23820 + }, + { + "epoch": 0.14167023503663526, + "grad_norm": 1.8582149744033813, + "learning_rate": 4.756471254455768e-05, + "loss": 5.0814, + "step": 23821 + }, + { + "epoch": 0.14167618231991627, + "grad_norm": 3.3430545330047607, + "learning_rate": 4.756451145266666e-05, + "loss": 5.2346, + "step": 23822 + }, + { + "epoch": 0.14168212960319726, + "grad_norm": 2.023859977722168, + "learning_rate": 4.75643103528986e-05, + "loss": 5.1639, + "step": 23823 + }, + { + "epoch": 0.14168807688647825, + "grad_norm": 2.0848581790924072, + "learning_rate": 4.756410924525358e-05, + "loss": 5.1314, + "step": 23824 + }, + { + "epoch": 0.14169402416975926, + "grad_norm": 2.2708516120910645, + "learning_rate": 4.7563908129731663e-05, + "loss": 5.1218, + "step": 23825 + }, + { + "epoch": 0.14169997145304025, + "grad_norm": 1.9105170965194702, + "learning_rate": 4.7563707006332905e-05, + "loss": 5.0428, + "step": 23826 + }, + { + "epoch": 0.14170591873632124, + "grad_norm": 1.9914016723632812, + "learning_rate": 4.75635058750574e-05, + "loss": 5.0497, + "step": 23827 + }, + { + "epoch": 0.14171186601960226, + "grad_norm": 1.9820994138717651, + "learning_rate": 4.756330473590521e-05, + "loss": 5.1161, + "step": 23828 + }, + { + "epoch": 0.14171781330288324, + "grad_norm": 1.7676537036895752, + "learning_rate": 4.75631035888764e-05, + "loss": 5.0291, + "step": 23829 + }, + { + "epoch": 0.14172376058616423, + "grad_norm": 1.9614083766937256, + "learning_rate": 4.7562902433971046e-05, + "loss": 5.3574, + "step": 23830 + }, + { + "epoch": 0.14172970786944525, + "grad_norm": 1.4212971925735474, + "learning_rate": 4.756270127118921e-05, + "loss": 5.8053, + "step": 23831 + }, + { + "epoch": 0.14173565515272624, + "grad_norm": 1.6015945672988892, + "learning_rate": 4.7562500100530984e-05, + "loss": 5.9339, + "step": 23832 + }, + { + "epoch": 0.14174160243600722, + "grad_norm": 1.6133309602737427, + "learning_rate": 4.7562298921996405e-05, + "loss": 5.4939, + "step": 23833 + }, + { + "epoch": 0.14174754971928824, + "grad_norm": 1.514958381652832, + "learning_rate": 4.7562097735585565e-05, + "loss": 5.649, + "step": 23834 + }, + { + "epoch": 0.14175349700256923, + "grad_norm": 1.912479281425476, + "learning_rate": 4.756189654129853e-05, + "loss": 5.5304, + "step": 23835 + }, + { + "epoch": 0.14175944428585022, + "grad_norm": 2.149765968322754, + "learning_rate": 4.756169533913538e-05, + "loss": 5.4228, + "step": 23836 + }, + { + "epoch": 0.14176539156913123, + "grad_norm": 1.8468290567398071, + "learning_rate": 4.756149412909616e-05, + "loss": 5.4605, + "step": 23837 + }, + { + "epoch": 0.14177133885241222, + "grad_norm": 1.670300841331482, + "learning_rate": 4.756129291118097e-05, + "loss": 5.4537, + "step": 23838 + }, + { + "epoch": 0.1417772861356932, + "grad_norm": 1.8857238292694092, + "learning_rate": 4.756109168538985e-05, + "loss": 5.2654, + "step": 23839 + }, + { + "epoch": 0.14178323341897422, + "grad_norm": 1.9114692211151123, + "learning_rate": 4.7560890451722894e-05, + "loss": 5.3255, + "step": 23840 + }, + { + "epoch": 0.1417891807022552, + "grad_norm": 1.654356598854065, + "learning_rate": 4.7560689210180164e-05, + "loss": 5.2983, + "step": 23841 + }, + { + "epoch": 0.1417951279855362, + "grad_norm": 1.9302277565002441, + "learning_rate": 4.7560487960761734e-05, + "loss": 5.7902, + "step": 23842 + }, + { + "epoch": 0.14180107526881722, + "grad_norm": 1.8009575605392456, + "learning_rate": 4.7560286703467674e-05, + "loss": 5.7359, + "step": 23843 + }, + { + "epoch": 0.1418070225520982, + "grad_norm": 1.4472894668579102, + "learning_rate": 4.7560085438298043e-05, + "loss": 5.813, + "step": 23844 + }, + { + "epoch": 0.1418129698353792, + "grad_norm": 1.6131559610366821, + "learning_rate": 4.755988416525292e-05, + "loss": 5.7525, + "step": 23845 + }, + { + "epoch": 0.1418189171186602, + "grad_norm": 1.4684244394302368, + "learning_rate": 4.755968288433237e-05, + "loss": 5.7649, + "step": 23846 + }, + { + "epoch": 0.1418248644019412, + "grad_norm": 1.369974970817566, + "learning_rate": 4.755948159553647e-05, + "loss": 5.666, + "step": 23847 + }, + { + "epoch": 0.14183081168522219, + "grad_norm": 1.6687818765640259, + "learning_rate": 4.755928029886529e-05, + "loss": 5.5685, + "step": 23848 + }, + { + "epoch": 0.1418367589685032, + "grad_norm": 2.011798858642578, + "learning_rate": 4.755907899431891e-05, + "loss": 6.0011, + "step": 23849 + }, + { + "epoch": 0.1418427062517842, + "grad_norm": 2.1938908100128174, + "learning_rate": 4.7558877681897376e-05, + "loss": 5.4987, + "step": 23850 + }, + { + "epoch": 0.14184865353506518, + "grad_norm": 1.9103244543075562, + "learning_rate": 4.7558676361600774e-05, + "loss": 5.5061, + "step": 23851 + }, + { + "epoch": 0.14185460081834617, + "grad_norm": 1.850809097290039, + "learning_rate": 4.7558475033429165e-05, + "loss": 5.4346, + "step": 23852 + }, + { + "epoch": 0.14186054810162718, + "grad_norm": 1.6861615180969238, + "learning_rate": 4.755827369738263e-05, + "loss": 5.5082, + "step": 23853 + }, + { + "epoch": 0.14186649538490817, + "grad_norm": 1.532423496246338, + "learning_rate": 4.7558072353461236e-05, + "loss": 5.704, + "step": 23854 + }, + { + "epoch": 0.14187244266818916, + "grad_norm": 1.6446877717971802, + "learning_rate": 4.755787100166506e-05, + "loss": 5.7046, + "step": 23855 + }, + { + "epoch": 0.14187838995147017, + "grad_norm": 1.599294662475586, + "learning_rate": 4.7557669641994144e-05, + "loss": 5.7324, + "step": 23856 + }, + { + "epoch": 0.14188433723475116, + "grad_norm": 1.8838186264038086, + "learning_rate": 4.7557468274448594e-05, + "loss": 5.5496, + "step": 23857 + }, + { + "epoch": 0.14189028451803215, + "grad_norm": 1.8579468727111816, + "learning_rate": 4.7557266899028464e-05, + "loss": 5.6645, + "step": 23858 + }, + { + "epoch": 0.14189623180131317, + "grad_norm": 2.02162766456604, + "learning_rate": 4.7557065515733815e-05, + "loss": 5.7992, + "step": 23859 + }, + { + "epoch": 0.14190217908459415, + "grad_norm": 1.559417486190796, + "learning_rate": 4.755686412456474e-05, + "loss": 5.6176, + "step": 23860 + }, + { + "epoch": 0.14190812636787514, + "grad_norm": 1.5074375867843628, + "learning_rate": 4.755666272552129e-05, + "loss": 5.3933, + "step": 23861 + }, + { + "epoch": 0.14191407365115616, + "grad_norm": 1.521987795829773, + "learning_rate": 4.755646131860354e-05, + "loss": 5.834, + "step": 23862 + }, + { + "epoch": 0.14192002093443715, + "grad_norm": 1.7396782636642456, + "learning_rate": 4.755625990381157e-05, + "loss": 5.149, + "step": 23863 + }, + { + "epoch": 0.14192596821771813, + "grad_norm": 1.7040945291519165, + "learning_rate": 4.755605848114544e-05, + "loss": 5.1569, + "step": 23864 + }, + { + "epoch": 0.14193191550099915, + "grad_norm": 1.7336739301681519, + "learning_rate": 4.7555857050605217e-05, + "loss": 5.1509, + "step": 23865 + }, + { + "epoch": 0.14193786278428014, + "grad_norm": 1.6548901796340942, + "learning_rate": 4.755565561219099e-05, + "loss": 4.9829, + "step": 23866 + }, + { + "epoch": 0.14194381006756113, + "grad_norm": 1.9203529357910156, + "learning_rate": 4.7555454165902804e-05, + "loss": 4.8946, + "step": 23867 + }, + { + "epoch": 0.14194975735084214, + "grad_norm": 1.8711525201797485, + "learning_rate": 4.755525271174074e-05, + "loss": 4.9691, + "step": 23868 + }, + { + "epoch": 0.14195570463412313, + "grad_norm": 1.8115698099136353, + "learning_rate": 4.755505124970488e-05, + "loss": 4.7342, + "step": 23869 + }, + { + "epoch": 0.14196165191740412, + "grad_norm": 1.996324896812439, + "learning_rate": 4.7554849779795284e-05, + "loss": 4.8892, + "step": 23870 + }, + { + "epoch": 0.14196759920068514, + "grad_norm": 1.7132238149642944, + "learning_rate": 4.7554648302012015e-05, + "loss": 4.7785, + "step": 23871 + }, + { + "epoch": 0.14197354648396612, + "grad_norm": 1.8130909204483032, + "learning_rate": 4.755444681635516e-05, + "loss": 4.9106, + "step": 23872 + }, + { + "epoch": 0.1419794937672471, + "grad_norm": 1.8058964014053345, + "learning_rate": 4.755424532282478e-05, + "loss": 4.7486, + "step": 23873 + }, + { + "epoch": 0.14198544105052813, + "grad_norm": 3.171724557876587, + "learning_rate": 4.755404382142094e-05, + "loss": 4.7696, + "step": 23874 + }, + { + "epoch": 0.14199138833380912, + "grad_norm": 1.99362313747406, + "learning_rate": 4.755384231214372e-05, + "loss": 4.6704, + "step": 23875 + }, + { + "epoch": 0.1419973356170901, + "grad_norm": 1.3904173374176025, + "learning_rate": 4.755364079499318e-05, + "loss": 5.6621, + "step": 23876 + }, + { + "epoch": 0.14200328290037112, + "grad_norm": 1.4735981225967407, + "learning_rate": 4.7553439269969415e-05, + "loss": 5.5464, + "step": 23877 + }, + { + "epoch": 0.1420092301836521, + "grad_norm": 1.3085891008377075, + "learning_rate": 4.755323773707246e-05, + "loss": 5.4913, + "step": 23878 + }, + { + "epoch": 0.1420151774669331, + "grad_norm": 1.627657175064087, + "learning_rate": 4.755303619630241e-05, + "loss": 5.4001, + "step": 23879 + }, + { + "epoch": 0.1420211247502141, + "grad_norm": 1.8672151565551758, + "learning_rate": 4.755283464765933e-05, + "loss": 5.5518, + "step": 23880 + }, + { + "epoch": 0.1420270720334951, + "grad_norm": 1.8344969749450684, + "learning_rate": 4.755263309114328e-05, + "loss": 5.2819, + "step": 23881 + }, + { + "epoch": 0.1420330193167761, + "grad_norm": 1.8662999868392944, + "learning_rate": 4.755243152675434e-05, + "loss": 5.3128, + "step": 23882 + }, + { + "epoch": 0.1420389666000571, + "grad_norm": 1.6729795932769775, + "learning_rate": 4.755222995449259e-05, + "loss": 5.1282, + "step": 23883 + }, + { + "epoch": 0.1420449138833381, + "grad_norm": 2.925039529800415, + "learning_rate": 4.7552028374358074e-05, + "loss": 4.9187, + "step": 23884 + }, + { + "epoch": 0.14205086116661908, + "grad_norm": 2.414885997772217, + "learning_rate": 4.755182678635089e-05, + "loss": 5.219, + "step": 23885 + }, + { + "epoch": 0.1420568084499001, + "grad_norm": 1.7273744344711304, + "learning_rate": 4.7551625190471095e-05, + "loss": 5.1296, + "step": 23886 + }, + { + "epoch": 0.14206275573318108, + "grad_norm": 1.691588044166565, + "learning_rate": 4.755142358671876e-05, + "loss": 5.3328, + "step": 23887 + }, + { + "epoch": 0.14206870301646207, + "grad_norm": 1.6644389629364014, + "learning_rate": 4.755122197509395e-05, + "loss": 6.162, + "step": 23888 + }, + { + "epoch": 0.1420746502997431, + "grad_norm": 1.7232459783554077, + "learning_rate": 4.7551020355596744e-05, + "loss": 6.1469, + "step": 23889 + }, + { + "epoch": 0.14208059758302408, + "grad_norm": 1.4883437156677246, + "learning_rate": 4.7550818728227206e-05, + "loss": 6.1803, + "step": 23890 + }, + { + "epoch": 0.14208654486630506, + "grad_norm": 1.4301148653030396, + "learning_rate": 4.7550617092985425e-05, + "loss": 6.0918, + "step": 23891 + }, + { + "epoch": 0.14209249214958608, + "grad_norm": 1.4922714233398438, + "learning_rate": 4.755041544987144e-05, + "loss": 5.8328, + "step": 23892 + }, + { + "epoch": 0.14209843943286707, + "grad_norm": 1.9683314561843872, + "learning_rate": 4.7550213798885345e-05, + "loss": 5.3362, + "step": 23893 + }, + { + "epoch": 0.14210438671614806, + "grad_norm": 1.841512680053711, + "learning_rate": 4.755001214002721e-05, + "loss": 5.1776, + "step": 23894 + }, + { + "epoch": 0.14211033399942907, + "grad_norm": 1.615190863609314, + "learning_rate": 4.7549810473297085e-05, + "loss": 5.4266, + "step": 23895 + }, + { + "epoch": 0.14211628128271006, + "grad_norm": 1.728252649307251, + "learning_rate": 4.7549608798695065e-05, + "loss": 5.5736, + "step": 23896 + }, + { + "epoch": 0.14212222856599105, + "grad_norm": 1.5590336322784424, + "learning_rate": 4.75494071162212e-05, + "loss": 5.4725, + "step": 23897 + }, + { + "epoch": 0.14212817584927206, + "grad_norm": 1.5246217250823975, + "learning_rate": 4.7549205425875585e-05, + "loss": 5.3707, + "step": 23898 + }, + { + "epoch": 0.14213412313255305, + "grad_norm": 1.4803682565689087, + "learning_rate": 4.754900372765826e-05, + "loss": 5.5735, + "step": 23899 + }, + { + "epoch": 0.14214007041583404, + "grad_norm": 1.633510947227478, + "learning_rate": 4.7548802021569315e-05, + "loss": 5.3334, + "step": 23900 + }, + { + "epoch": 0.14214601769911506, + "grad_norm": 1.9321861267089844, + "learning_rate": 4.754860030760882e-05, + "loss": 5.3384, + "step": 23901 + }, + { + "epoch": 0.14215196498239605, + "grad_norm": 1.858965516090393, + "learning_rate": 4.7548398585776844e-05, + "loss": 5.4072, + "step": 23902 + }, + { + "epoch": 0.14215791226567703, + "grad_norm": 1.7266136407852173, + "learning_rate": 4.754819685607345e-05, + "loss": 5.3865, + "step": 23903 + }, + { + "epoch": 0.14216385954895805, + "grad_norm": 1.579783320426941, + "learning_rate": 4.754799511849871e-05, + "loss": 5.3524, + "step": 23904 + }, + { + "epoch": 0.14216980683223904, + "grad_norm": 1.5112273693084717, + "learning_rate": 4.7547793373052704e-05, + "loss": 5.3411, + "step": 23905 + }, + { + "epoch": 0.14217575411552003, + "grad_norm": 1.5031278133392334, + "learning_rate": 4.754759161973549e-05, + "loss": 5.3782, + "step": 23906 + }, + { + "epoch": 0.14218170139880104, + "grad_norm": 1.581784963607788, + "learning_rate": 4.7547389858547155e-05, + "loss": 5.2722, + "step": 23907 + }, + { + "epoch": 0.14218764868208203, + "grad_norm": 1.350386619567871, + "learning_rate": 4.754718808948775e-05, + "loss": 5.5733, + "step": 23908 + }, + { + "epoch": 0.14219359596536302, + "grad_norm": 1.5469433069229126, + "learning_rate": 4.754698631255736e-05, + "loss": 5.7556, + "step": 23909 + }, + { + "epoch": 0.142199543248644, + "grad_norm": 1.5234500169754028, + "learning_rate": 4.754678452775604e-05, + "loss": 5.9086, + "step": 23910 + }, + { + "epoch": 0.14220549053192502, + "grad_norm": 1.4361084699630737, + "learning_rate": 4.754658273508388e-05, + "loss": 5.7659, + "step": 23911 + }, + { + "epoch": 0.142211437815206, + "grad_norm": 1.5128140449523926, + "learning_rate": 4.754638093454094e-05, + "loss": 5.7307, + "step": 23912 + }, + { + "epoch": 0.142217385098487, + "grad_norm": 1.4324685335159302, + "learning_rate": 4.754617912612729e-05, + "loss": 5.4717, + "step": 23913 + }, + { + "epoch": 0.14222333238176801, + "grad_norm": 1.8225339651107788, + "learning_rate": 4.7545977309843004e-05, + "loss": 5.3876, + "step": 23914 + }, + { + "epoch": 0.142229279665049, + "grad_norm": 1.6822171211242676, + "learning_rate": 4.754577548568815e-05, + "loss": 5.5243, + "step": 23915 + }, + { + "epoch": 0.14223522694833, + "grad_norm": 1.7231889963150024, + "learning_rate": 4.754557365366279e-05, + "loss": 5.9398, + "step": 23916 + }, + { + "epoch": 0.142241174231611, + "grad_norm": 1.6815425157546997, + "learning_rate": 4.754537181376702e-05, + "loss": 6.0264, + "step": 23917 + }, + { + "epoch": 0.142247121514892, + "grad_norm": 1.599161148071289, + "learning_rate": 4.754516996600088e-05, + "loss": 6.0783, + "step": 23918 + }, + { + "epoch": 0.14225306879817298, + "grad_norm": 1.565960168838501, + "learning_rate": 4.7544968110364455e-05, + "loss": 6.2248, + "step": 23919 + }, + { + "epoch": 0.142259016081454, + "grad_norm": 1.5778778791427612, + "learning_rate": 4.754476624685782e-05, + "loss": 6.1216, + "step": 23920 + }, + { + "epoch": 0.142264963364735, + "grad_norm": 1.6303963661193848, + "learning_rate": 4.754456437548104e-05, + "loss": 5.9956, + "step": 23921 + }, + { + "epoch": 0.14227091064801597, + "grad_norm": 1.6119714975357056, + "learning_rate": 4.754436249623418e-05, + "loss": 5.4221, + "step": 23922 + }, + { + "epoch": 0.142276857931297, + "grad_norm": 1.9543877840042114, + "learning_rate": 4.754416060911732e-05, + "loss": 5.3631, + "step": 23923 + }, + { + "epoch": 0.14228280521457798, + "grad_norm": 1.90111243724823, + "learning_rate": 4.754395871413052e-05, + "loss": 5.3828, + "step": 23924 + }, + { + "epoch": 0.14228875249785897, + "grad_norm": 1.6575809717178345, + "learning_rate": 4.754375681127386e-05, + "loss": 5.1258, + "step": 23925 + }, + { + "epoch": 0.14229469978113998, + "grad_norm": 1.5518983602523804, + "learning_rate": 4.7543554900547416e-05, + "loss": 5.2144, + "step": 23926 + }, + { + "epoch": 0.14230064706442097, + "grad_norm": 1.604325532913208, + "learning_rate": 4.754335298195124e-05, + "loss": 5.1447, + "step": 23927 + }, + { + "epoch": 0.14230659434770196, + "grad_norm": 1.6287504434585571, + "learning_rate": 4.754315105548542e-05, + "loss": 5.1267, + "step": 23928 + }, + { + "epoch": 0.14231254163098297, + "grad_norm": 1.5111888647079468, + "learning_rate": 4.7542949121150014e-05, + "loss": 5.1122, + "step": 23929 + }, + { + "epoch": 0.14231848891426396, + "grad_norm": 1.4685728549957275, + "learning_rate": 4.75427471789451e-05, + "loss": 5.5366, + "step": 23930 + }, + { + "epoch": 0.14232443619754495, + "grad_norm": 2.1167118549346924, + "learning_rate": 4.754254522887074e-05, + "loss": 5.0426, + "step": 23931 + }, + { + "epoch": 0.14233038348082597, + "grad_norm": 1.7412205934524536, + "learning_rate": 4.754234327092702e-05, + "loss": 5.1454, + "step": 23932 + }, + { + "epoch": 0.14233633076410696, + "grad_norm": 2.290722608566284, + "learning_rate": 4.754214130511399e-05, + "loss": 4.7253, + "step": 23933 + }, + { + "epoch": 0.14234227804738794, + "grad_norm": 2.460817813873291, + "learning_rate": 4.754193933143174e-05, + "loss": 4.762, + "step": 23934 + }, + { + "epoch": 0.14234822533066896, + "grad_norm": 2.2080838680267334, + "learning_rate": 4.754173734988032e-05, + "loss": 4.6405, + "step": 23935 + }, + { + "epoch": 0.14235417261394995, + "grad_norm": 2.475855588912964, + "learning_rate": 4.7541535360459825e-05, + "loss": 4.6213, + "step": 23936 + }, + { + "epoch": 0.14236011989723094, + "grad_norm": 2.1748647689819336, + "learning_rate": 4.754133336317031e-05, + "loss": 4.5461, + "step": 23937 + }, + { + "epoch": 0.14236606718051195, + "grad_norm": 2.1339731216430664, + "learning_rate": 4.754113135801185e-05, + "loss": 4.6366, + "step": 23938 + }, + { + "epoch": 0.14237201446379294, + "grad_norm": 2.142465353012085, + "learning_rate": 4.754092934498451e-05, + "loss": 4.6129, + "step": 23939 + }, + { + "epoch": 0.14237796174707393, + "grad_norm": 2.1925458908081055, + "learning_rate": 4.754072732408836e-05, + "loss": 4.6171, + "step": 23940 + }, + { + "epoch": 0.14238390903035494, + "grad_norm": 2.1470870971679688, + "learning_rate": 4.7540525295323483e-05, + "loss": 4.4577, + "step": 23941 + }, + { + "epoch": 0.14238985631363593, + "grad_norm": 1.7223306894302368, + "learning_rate": 4.754032325868994e-05, + "loss": 5.7355, + "step": 23942 + }, + { + "epoch": 0.14239580359691692, + "grad_norm": 1.8489956855773926, + "learning_rate": 4.7540121214187805e-05, + "loss": 5.9877, + "step": 23943 + }, + { + "epoch": 0.14240175088019794, + "grad_norm": 1.8920329809188843, + "learning_rate": 4.7539919161817134e-05, + "loss": 5.6751, + "step": 23944 + }, + { + "epoch": 0.14240769816347892, + "grad_norm": 1.642392635345459, + "learning_rate": 4.753971710157802e-05, + "loss": 5.3404, + "step": 23945 + }, + { + "epoch": 0.1424136454467599, + "grad_norm": 1.681997537612915, + "learning_rate": 4.753951503347053e-05, + "loss": 5.2964, + "step": 23946 + }, + { + "epoch": 0.14241959273004093, + "grad_norm": 1.767589807510376, + "learning_rate": 4.753931295749472e-05, + "loss": 5.2843, + "step": 23947 + }, + { + "epoch": 0.14242554001332192, + "grad_norm": 1.7100127935409546, + "learning_rate": 4.7539110873650674e-05, + "loss": 5.3869, + "step": 23948 + }, + { + "epoch": 0.1424314872966029, + "grad_norm": 1.5660570859909058, + "learning_rate": 4.7538908781938453e-05, + "loss": 5.3994, + "step": 23949 + }, + { + "epoch": 0.14243743457988392, + "grad_norm": 1.8509501218795776, + "learning_rate": 4.7538706682358124e-05, + "loss": 5.8575, + "step": 23950 + }, + { + "epoch": 0.1424433818631649, + "grad_norm": 1.5773848295211792, + "learning_rate": 4.753850457490978e-05, + "loss": 5.8548, + "step": 23951 + }, + { + "epoch": 0.1424493291464459, + "grad_norm": 1.4020990133285522, + "learning_rate": 4.753830245959347e-05, + "loss": 5.6696, + "step": 23952 + }, + { + "epoch": 0.1424552764297269, + "grad_norm": 1.7756813764572144, + "learning_rate": 4.753810033640928e-05, + "loss": 5.3623, + "step": 23953 + }, + { + "epoch": 0.1424612237130079, + "grad_norm": 1.9046579599380493, + "learning_rate": 4.7537898205357255e-05, + "loss": 5.4078, + "step": 23954 + }, + { + "epoch": 0.1424671709962889, + "grad_norm": 1.6977450847625732, + "learning_rate": 4.753769606643749e-05, + "loss": 5.4418, + "step": 23955 + }, + { + "epoch": 0.1424731182795699, + "grad_norm": 1.6306700706481934, + "learning_rate": 4.753749391965005e-05, + "loss": 5.6299, + "step": 23956 + }, + { + "epoch": 0.1424790655628509, + "grad_norm": 1.8286629915237427, + "learning_rate": 4.7537291764995006e-05, + "loss": 5.7271, + "step": 23957 + }, + { + "epoch": 0.14248501284613188, + "grad_norm": 1.5603896379470825, + "learning_rate": 4.753708960247242e-05, + "loss": 5.645, + "step": 23958 + }, + { + "epoch": 0.1424909601294129, + "grad_norm": 1.6031434535980225, + "learning_rate": 4.7536887432082375e-05, + "loss": 5.6604, + "step": 23959 + }, + { + "epoch": 0.14249690741269389, + "grad_norm": 1.6950321197509766, + "learning_rate": 4.753668525382493e-05, + "loss": 5.7467, + "step": 23960 + }, + { + "epoch": 0.14250285469597487, + "grad_norm": 1.367156744003296, + "learning_rate": 4.753648306770017e-05, + "loss": 5.8554, + "step": 23961 + }, + { + "epoch": 0.1425088019792559, + "grad_norm": 1.6769720315933228, + "learning_rate": 4.753628087370815e-05, + "loss": 5.7408, + "step": 23962 + }, + { + "epoch": 0.14251474926253688, + "grad_norm": 2.3092730045318604, + "learning_rate": 4.753607867184894e-05, + "loss": 4.3284, + "step": 23963 + }, + { + "epoch": 0.14252069654581787, + "grad_norm": 1.8199213743209839, + "learning_rate": 4.753587646212263e-05, + "loss": 4.9928, + "step": 23964 + }, + { + "epoch": 0.14252664382909888, + "grad_norm": 1.5818908214569092, + "learning_rate": 4.753567424452927e-05, + "loss": 5.4382, + "step": 23965 + }, + { + "epoch": 0.14253259111237987, + "grad_norm": 1.6112592220306396, + "learning_rate": 4.753547201906895e-05, + "loss": 5.6344, + "step": 23966 + }, + { + "epoch": 0.14253853839566086, + "grad_norm": 1.530733585357666, + "learning_rate": 4.753526978574172e-05, + "loss": 5.6788, + "step": 23967 + }, + { + "epoch": 0.14254448567894185, + "grad_norm": 1.4186383485794067, + "learning_rate": 4.7535067544547664e-05, + "loss": 5.5129, + "step": 23968 + }, + { + "epoch": 0.14255043296222286, + "grad_norm": 1.3288373947143555, + "learning_rate": 4.753486529548684e-05, + "loss": 5.4413, + "step": 23969 + }, + { + "epoch": 0.14255638024550385, + "grad_norm": 1.3416498899459839, + "learning_rate": 4.7534663038559335e-05, + "loss": 5.6757, + "step": 23970 + }, + { + "epoch": 0.14256232752878484, + "grad_norm": 1.2552043199539185, + "learning_rate": 4.7534460773765215e-05, + "loss": 5.4015, + "step": 23971 + }, + { + "epoch": 0.14256827481206585, + "grad_norm": 1.7393593788146973, + "learning_rate": 4.7534258501104544e-05, + "loss": 5.8824, + "step": 23972 + }, + { + "epoch": 0.14257422209534684, + "grad_norm": 1.5608609914779663, + "learning_rate": 4.75340562205774e-05, + "loss": 5.7623, + "step": 23973 + }, + { + "epoch": 0.14258016937862783, + "grad_norm": 1.484365463256836, + "learning_rate": 4.753385393218384e-05, + "loss": 5.6563, + "step": 23974 + }, + { + "epoch": 0.14258611666190885, + "grad_norm": 1.5432020425796509, + "learning_rate": 4.753365163592395e-05, + "loss": 5.6214, + "step": 23975 + }, + { + "epoch": 0.14259206394518983, + "grad_norm": 1.3963783979415894, + "learning_rate": 4.7533449331797797e-05, + "loss": 5.5315, + "step": 23976 + }, + { + "epoch": 0.14259801122847082, + "grad_norm": 1.778178095817566, + "learning_rate": 4.753324701980545e-05, + "loss": 5.8467, + "step": 23977 + }, + { + "epoch": 0.14260395851175184, + "grad_norm": 1.717940330505371, + "learning_rate": 4.753304469994698e-05, + "loss": 5.6369, + "step": 23978 + }, + { + "epoch": 0.14260990579503283, + "grad_norm": 1.7598493099212646, + "learning_rate": 4.753284237222245e-05, + "loss": 5.2906, + "step": 23979 + }, + { + "epoch": 0.14261585307831381, + "grad_norm": 2.1206471920013428, + "learning_rate": 4.753264003663194e-05, + "loss": 4.5855, + "step": 23980 + }, + { + "epoch": 0.14262180036159483, + "grad_norm": 2.1312971115112305, + "learning_rate": 4.7532437693175525e-05, + "loss": 4.6795, + "step": 23981 + }, + { + "epoch": 0.14262774764487582, + "grad_norm": 2.6566877365112305, + "learning_rate": 4.753223534185326e-05, + "loss": 4.6831, + "step": 23982 + }, + { + "epoch": 0.1426336949281568, + "grad_norm": 2.5692079067230225, + "learning_rate": 4.753203298266523e-05, + "loss": 4.3662, + "step": 23983 + }, + { + "epoch": 0.14263964221143782, + "grad_norm": 2.2617204189300537, + "learning_rate": 4.75318306156115e-05, + "loss": 4.5077, + "step": 23984 + }, + { + "epoch": 0.1426455894947188, + "grad_norm": 2.3445560932159424, + "learning_rate": 4.753162824069214e-05, + "loss": 4.3449, + "step": 23985 + }, + { + "epoch": 0.1426515367779998, + "grad_norm": 2.193120002746582, + "learning_rate": 4.7531425857907216e-05, + "loss": 4.3601, + "step": 23986 + }, + { + "epoch": 0.14265748406128081, + "grad_norm": 2.3515334129333496, + "learning_rate": 4.753122346725681e-05, + "loss": 4.411, + "step": 23987 + }, + { + "epoch": 0.1426634313445618, + "grad_norm": 2.286971092224121, + "learning_rate": 4.7531021068740986e-05, + "loss": 4.4801, + "step": 23988 + }, + { + "epoch": 0.1426693786278428, + "grad_norm": 2.30155873298645, + "learning_rate": 4.7530818662359814e-05, + "loss": 4.4121, + "step": 23989 + }, + { + "epoch": 0.1426753259111238, + "grad_norm": 2.151796340942383, + "learning_rate": 4.7530616248113364e-05, + "loss": 4.4185, + "step": 23990 + }, + { + "epoch": 0.1426812731944048, + "grad_norm": 2.6092782020568848, + "learning_rate": 4.7530413826001706e-05, + "loss": 4.5183, + "step": 23991 + }, + { + "epoch": 0.14268722047768578, + "grad_norm": 2.3881771564483643, + "learning_rate": 4.7530211396024926e-05, + "loss": 4.5246, + "step": 23992 + }, + { + "epoch": 0.1426931677609668, + "grad_norm": 2.921297550201416, + "learning_rate": 4.753000895818307e-05, + "loss": 4.5855, + "step": 23993 + }, + { + "epoch": 0.1426991150442478, + "grad_norm": 2.039461135864258, + "learning_rate": 4.752980651247623e-05, + "loss": 5.3866, + "step": 23994 + }, + { + "epoch": 0.14270506232752878, + "grad_norm": 2.6810874938964844, + "learning_rate": 4.752960405890446e-05, + "loss": 4.3992, + "step": 23995 + }, + { + "epoch": 0.1427110096108098, + "grad_norm": 2.366675615310669, + "learning_rate": 4.752940159746784e-05, + "loss": 4.3981, + "step": 23996 + }, + { + "epoch": 0.14271695689409078, + "grad_norm": 2.446672201156616, + "learning_rate": 4.7529199128166435e-05, + "loss": 4.3428, + "step": 23997 + }, + { + "epoch": 0.14272290417737177, + "grad_norm": 2.686692476272583, + "learning_rate": 4.7528996651000325e-05, + "loss": 4.4006, + "step": 23998 + }, + { + "epoch": 0.14272885146065278, + "grad_norm": 2.577341318130493, + "learning_rate": 4.752879416596957e-05, + "loss": 4.3635, + "step": 23999 + }, + { + "epoch": 0.14273479874393377, + "grad_norm": 2.0183050632476807, + "learning_rate": 4.752859167307425e-05, + "loss": 4.402, + "step": 24000 + }, + { + "epoch": 0.14274074602721476, + "grad_norm": 2.062704563140869, + "learning_rate": 4.7528389172314434e-05, + "loss": 4.3103, + "step": 24001 + }, + { + "epoch": 0.14274669331049578, + "grad_norm": 2.3112356662750244, + "learning_rate": 4.752818666369019e-05, + "loss": 4.5129, + "step": 24002 + }, + { + "epoch": 0.14275264059377676, + "grad_norm": 2.3484156131744385, + "learning_rate": 4.752798414720158e-05, + "loss": 4.2367, + "step": 24003 + }, + { + "epoch": 0.14275858787705775, + "grad_norm": 2.142179250717163, + "learning_rate": 4.752778162284869e-05, + "loss": 4.8016, + "step": 24004 + }, + { + "epoch": 0.14276453516033877, + "grad_norm": 2.076201915740967, + "learning_rate": 4.752757909063158e-05, + "loss": 5.2754, + "step": 24005 + }, + { + "epoch": 0.14277048244361976, + "grad_norm": 1.7873663902282715, + "learning_rate": 4.752737655055033e-05, + "loss": 5.3064, + "step": 24006 + }, + { + "epoch": 0.14277642972690074, + "grad_norm": 1.863776445388794, + "learning_rate": 4.7527174002605e-05, + "loss": 5.045, + "step": 24007 + }, + { + "epoch": 0.14278237701018176, + "grad_norm": 1.9370598793029785, + "learning_rate": 4.752697144679567e-05, + "loss": 5.037, + "step": 24008 + }, + { + "epoch": 0.14278832429346275, + "grad_norm": 1.967492938041687, + "learning_rate": 4.7526768883122405e-05, + "loss": 4.9898, + "step": 24009 + }, + { + "epoch": 0.14279427157674374, + "grad_norm": 1.6309136152267456, + "learning_rate": 4.7526566311585285e-05, + "loss": 5.0752, + "step": 24010 + }, + { + "epoch": 0.14280021886002475, + "grad_norm": 1.6783781051635742, + "learning_rate": 4.7526363732184365e-05, + "loss": 4.7746, + "step": 24011 + }, + { + "epoch": 0.14280616614330574, + "grad_norm": 1.4897167682647705, + "learning_rate": 4.752616114491972e-05, + "loss": 5.1681, + "step": 24012 + }, + { + "epoch": 0.14281211342658673, + "grad_norm": 1.4138036966323853, + "learning_rate": 4.752595854979144e-05, + "loss": 5.351, + "step": 24013 + }, + { + "epoch": 0.14281806070986774, + "grad_norm": 1.4653584957122803, + "learning_rate": 4.7525755946799566e-05, + "loss": 5.1754, + "step": 24014 + }, + { + "epoch": 0.14282400799314873, + "grad_norm": 1.7669284343719482, + "learning_rate": 4.752555333594419e-05, + "loss": 5.2409, + "step": 24015 + }, + { + "epoch": 0.14282995527642972, + "grad_norm": 2.478325366973877, + "learning_rate": 4.752535071722538e-05, + "loss": 5.7027, + "step": 24016 + }, + { + "epoch": 0.14283590255971074, + "grad_norm": 1.3903100490570068, + "learning_rate": 4.75251480906432e-05, + "loss": 5.371, + "step": 24017 + }, + { + "epoch": 0.14284184984299172, + "grad_norm": 1.5938868522644043, + "learning_rate": 4.752494545619772e-05, + "loss": 5.0741, + "step": 24018 + }, + { + "epoch": 0.1428477971262727, + "grad_norm": 1.4633463621139526, + "learning_rate": 4.752474281388901e-05, + "loss": 5.2562, + "step": 24019 + }, + { + "epoch": 0.14285374440955373, + "grad_norm": 1.5575978755950928, + "learning_rate": 4.7524540163717155e-05, + "loss": 5.7142, + "step": 24020 + }, + { + "epoch": 0.14285969169283472, + "grad_norm": 1.857527732849121, + "learning_rate": 4.7524337505682216e-05, + "loss": 5.6595, + "step": 24021 + }, + { + "epoch": 0.1428656389761157, + "grad_norm": 1.6097089052200317, + "learning_rate": 4.752413483978426e-05, + "loss": 5.2562, + "step": 24022 + }, + { + "epoch": 0.14287158625939672, + "grad_norm": 1.8765082359313965, + "learning_rate": 4.752393216602335e-05, + "loss": 4.511, + "step": 24023 + }, + { + "epoch": 0.1428775335426777, + "grad_norm": 1.5626455545425415, + "learning_rate": 4.752372948439959e-05, + "loss": 4.8816, + "step": 24024 + }, + { + "epoch": 0.1428834808259587, + "grad_norm": 1.4234426021575928, + "learning_rate": 4.7523526794913015e-05, + "loss": 5.1271, + "step": 24025 + }, + { + "epoch": 0.14288942810923969, + "grad_norm": 1.4709553718566895, + "learning_rate": 4.7523324097563706e-05, + "loss": 5.2034, + "step": 24026 + }, + { + "epoch": 0.1428953753925207, + "grad_norm": 1.7568445205688477, + "learning_rate": 4.752312139235175e-05, + "loss": 4.7914, + "step": 24027 + }, + { + "epoch": 0.1429013226758017, + "grad_norm": 1.711824893951416, + "learning_rate": 4.752291867927719e-05, + "loss": 4.6601, + "step": 24028 + }, + { + "epoch": 0.14290726995908268, + "grad_norm": 1.6301651000976562, + "learning_rate": 4.752271595834012e-05, + "loss": 4.9326, + "step": 24029 + }, + { + "epoch": 0.1429132172423637, + "grad_norm": 1.5549229383468628, + "learning_rate": 4.752251322954061e-05, + "loss": 5.1706, + "step": 24030 + }, + { + "epoch": 0.14291916452564468, + "grad_norm": 1.5638782978057861, + "learning_rate": 4.752231049287871e-05, + "loss": 4.9079, + "step": 24031 + }, + { + "epoch": 0.14292511180892567, + "grad_norm": 1.6099932193756104, + "learning_rate": 4.752210774835451e-05, + "loss": 4.7565, + "step": 24032 + }, + { + "epoch": 0.14293105909220669, + "grad_norm": 1.5388545989990234, + "learning_rate": 4.752190499596808e-05, + "loss": 4.792, + "step": 24033 + }, + { + "epoch": 0.14293700637548767, + "grad_norm": 1.4083584547042847, + "learning_rate": 4.752170223571948e-05, + "loss": 4.8608, + "step": 24034 + }, + { + "epoch": 0.14294295365876866, + "grad_norm": 1.5718214511871338, + "learning_rate": 4.752149946760879e-05, + "loss": 4.7874, + "step": 24035 + }, + { + "epoch": 0.14294890094204968, + "grad_norm": 1.5951184034347534, + "learning_rate": 4.752129669163607e-05, + "loss": 4.7581, + "step": 24036 + }, + { + "epoch": 0.14295484822533067, + "grad_norm": 1.5525321960449219, + "learning_rate": 4.7521093907801404e-05, + "loss": 4.5684, + "step": 24037 + }, + { + "epoch": 0.14296079550861165, + "grad_norm": 1.6149049997329712, + "learning_rate": 4.7520891116104856e-05, + "loss": 4.4343, + "step": 24038 + }, + { + "epoch": 0.14296674279189267, + "grad_norm": 1.624150037765503, + "learning_rate": 4.752068831654649e-05, + "loss": 4.4697, + "step": 24039 + }, + { + "epoch": 0.14297269007517366, + "grad_norm": 1.3906975984573364, + "learning_rate": 4.75204855091264e-05, + "loss": 4.4062, + "step": 24040 + }, + { + "epoch": 0.14297863735845465, + "grad_norm": 1.6626862287521362, + "learning_rate": 4.7520282693844623e-05, + "loss": 4.9593, + "step": 24041 + }, + { + "epoch": 0.14298458464173566, + "grad_norm": 1.8431484699249268, + "learning_rate": 4.752007987070126e-05, + "loss": 5.3581, + "step": 24042 + }, + { + "epoch": 0.14299053192501665, + "grad_norm": 1.7550246715545654, + "learning_rate": 4.751987703969637e-05, + "loss": 5.3909, + "step": 24043 + }, + { + "epoch": 0.14299647920829764, + "grad_norm": 1.6016278266906738, + "learning_rate": 4.7519674200830015e-05, + "loss": 5.1732, + "step": 24044 + }, + { + "epoch": 0.14300242649157865, + "grad_norm": 1.4594265222549438, + "learning_rate": 4.7519471354102285e-05, + "loss": 5.0859, + "step": 24045 + }, + { + "epoch": 0.14300837377485964, + "grad_norm": 1.7040293216705322, + "learning_rate": 4.751926849951323e-05, + "loss": 5.1476, + "step": 24046 + }, + { + "epoch": 0.14301432105814063, + "grad_norm": 1.4739158153533936, + "learning_rate": 4.7519065637062934e-05, + "loss": 5.3691, + "step": 24047 + }, + { + "epoch": 0.14302026834142165, + "grad_norm": 1.5245054960250854, + "learning_rate": 4.751886276675147e-05, + "loss": 5.4395, + "step": 24048 + }, + { + "epoch": 0.14302621562470264, + "grad_norm": 1.678786039352417, + "learning_rate": 4.75186598885789e-05, + "loss": 4.826, + "step": 24049 + }, + { + "epoch": 0.14303216290798362, + "grad_norm": 1.9114538431167603, + "learning_rate": 4.7518457002545305e-05, + "loss": 5.1483, + "step": 24050 + }, + { + "epoch": 0.14303811019126464, + "grad_norm": 1.5139118432998657, + "learning_rate": 4.751825410865074e-05, + "loss": 5.1349, + "step": 24051 + }, + { + "epoch": 0.14304405747454563, + "grad_norm": 1.4199074506759644, + "learning_rate": 4.7518051206895286e-05, + "loss": 5.0579, + "step": 24052 + }, + { + "epoch": 0.14305000475782662, + "grad_norm": 1.570027470588684, + "learning_rate": 4.751784829727902e-05, + "loss": 4.9915, + "step": 24053 + }, + { + "epoch": 0.14305595204110763, + "grad_norm": 1.476340651512146, + "learning_rate": 4.7517645379802e-05, + "loss": 5.4808, + "step": 24054 + }, + { + "epoch": 0.14306189932438862, + "grad_norm": 1.7526558637619019, + "learning_rate": 4.75174424544643e-05, + "loss": 5.3816, + "step": 24055 + }, + { + "epoch": 0.1430678466076696, + "grad_norm": 1.846692681312561, + "learning_rate": 4.7517239521266e-05, + "loss": 5.6713, + "step": 24056 + }, + { + "epoch": 0.14307379389095062, + "grad_norm": 1.5340349674224854, + "learning_rate": 4.751703658020716e-05, + "loss": 5.6456, + "step": 24057 + }, + { + "epoch": 0.1430797411742316, + "grad_norm": 1.6693123579025269, + "learning_rate": 4.751683363128786e-05, + "loss": 5.5229, + "step": 24058 + }, + { + "epoch": 0.1430856884575126, + "grad_norm": 1.7673590183258057, + "learning_rate": 4.751663067450816e-05, + "loss": 4.9188, + "step": 24059 + }, + { + "epoch": 0.14309163574079362, + "grad_norm": 1.8243883848190308, + "learning_rate": 4.751642770986814e-05, + "loss": 4.5658, + "step": 24060 + }, + { + "epoch": 0.1430975830240746, + "grad_norm": 2.394139051437378, + "learning_rate": 4.7516224737367866e-05, + "loss": 4.101, + "step": 24061 + }, + { + "epoch": 0.1431035303073556, + "grad_norm": 2.0918843746185303, + "learning_rate": 4.7516021757007414e-05, + "loss": 4.03, + "step": 24062 + }, + { + "epoch": 0.1431094775906366, + "grad_norm": 2.129743814468384, + "learning_rate": 4.751581876878685e-05, + "loss": 4.1339, + "step": 24063 + }, + { + "epoch": 0.1431154248739176, + "grad_norm": 2.1546170711517334, + "learning_rate": 4.751561577270624e-05, + "loss": 4.4471, + "step": 24064 + }, + { + "epoch": 0.14312137215719858, + "grad_norm": 1.9738941192626953, + "learning_rate": 4.751541276876567e-05, + "loss": 5.8276, + "step": 24065 + }, + { + "epoch": 0.1431273194404796, + "grad_norm": 1.9925949573516846, + "learning_rate": 4.7515209756965196e-05, + "loss": 5.2116, + "step": 24066 + }, + { + "epoch": 0.1431332667237606, + "grad_norm": 1.761315941810608, + "learning_rate": 4.75150067373049e-05, + "loss": 5.0048, + "step": 24067 + }, + { + "epoch": 0.14313921400704158, + "grad_norm": 1.7744289636611938, + "learning_rate": 4.751480370978485e-05, + "loss": 5.2451, + "step": 24068 + }, + { + "epoch": 0.1431451612903226, + "grad_norm": 1.4490324258804321, + "learning_rate": 4.7514600674405106e-05, + "loss": 5.704, + "step": 24069 + }, + { + "epoch": 0.14315110857360358, + "grad_norm": 1.4389432668685913, + "learning_rate": 4.751439763116575e-05, + "loss": 5.6274, + "step": 24070 + }, + { + "epoch": 0.14315705585688457, + "grad_norm": 2.0219969749450684, + "learning_rate": 4.751419458006685e-05, + "loss": 4.2387, + "step": 24071 + }, + { + "epoch": 0.14316300314016558, + "grad_norm": 1.6722300052642822, + "learning_rate": 4.751399152110848e-05, + "loss": 4.7426, + "step": 24072 + }, + { + "epoch": 0.14316895042344657, + "grad_norm": 1.461065411567688, + "learning_rate": 4.751378845429071e-05, + "loss": 5.4895, + "step": 24073 + }, + { + "epoch": 0.14317489770672756, + "grad_norm": 1.3877815008163452, + "learning_rate": 4.75135853796136e-05, + "loss": 5.6264, + "step": 24074 + }, + { + "epoch": 0.14318084499000858, + "grad_norm": 1.3981953859329224, + "learning_rate": 4.751338229707724e-05, + "loss": 5.4467, + "step": 24075 + }, + { + "epoch": 0.14318679227328956, + "grad_norm": 1.3032608032226562, + "learning_rate": 4.751317920668169e-05, + "loss": 5.5902, + "step": 24076 + }, + { + "epoch": 0.14319273955657055, + "grad_norm": 1.477534532546997, + "learning_rate": 4.751297610842701e-05, + "loss": 5.6286, + "step": 24077 + }, + { + "epoch": 0.14319868683985157, + "grad_norm": 1.5056313276290894, + "learning_rate": 4.75127730023133e-05, + "loss": 5.5233, + "step": 24078 + }, + { + "epoch": 0.14320463412313256, + "grad_norm": 1.6936917304992676, + "learning_rate": 4.75125698883406e-05, + "loss": 4.9877, + "step": 24079 + }, + { + "epoch": 0.14321058140641355, + "grad_norm": 1.5967860221862793, + "learning_rate": 4.7512366766509004e-05, + "loss": 5.1782, + "step": 24080 + }, + { + "epoch": 0.14321652868969456, + "grad_norm": 1.4995664358139038, + "learning_rate": 4.751216363681857e-05, + "loss": 5.3016, + "step": 24081 + }, + { + "epoch": 0.14322247597297555, + "grad_norm": 1.6829060316085815, + "learning_rate": 4.751196049926937e-05, + "loss": 5.228, + "step": 24082 + }, + { + "epoch": 0.14322842325625654, + "grad_norm": 2.151371955871582, + "learning_rate": 4.7511757353861475e-05, + "loss": 5.1807, + "step": 24083 + }, + { + "epoch": 0.14323437053953753, + "grad_norm": 2.1892330646514893, + "learning_rate": 4.751155420059497e-05, + "loss": 5.3542, + "step": 24084 + }, + { + "epoch": 0.14324031782281854, + "grad_norm": 2.0016772747039795, + "learning_rate": 4.75113510394699e-05, + "loss": 4.9516, + "step": 24085 + }, + { + "epoch": 0.14324626510609953, + "grad_norm": 1.8935182094573975, + "learning_rate": 4.751114787048635e-05, + "loss": 5.0342, + "step": 24086 + }, + { + "epoch": 0.14325221238938052, + "grad_norm": 2.004809617996216, + "learning_rate": 4.75109446936444e-05, + "loss": 4.2826, + "step": 24087 + }, + { + "epoch": 0.14325815967266153, + "grad_norm": 1.8340208530426025, + "learning_rate": 4.7510741508944115e-05, + "loss": 4.9323, + "step": 24088 + }, + { + "epoch": 0.14326410695594252, + "grad_norm": 1.769805908203125, + "learning_rate": 4.7510538316385545e-05, + "loss": 5.3595, + "step": 24089 + }, + { + "epoch": 0.1432700542392235, + "grad_norm": 1.5973625183105469, + "learning_rate": 4.75103351159688e-05, + "loss": 5.6195, + "step": 24090 + }, + { + "epoch": 0.14327600152250453, + "grad_norm": 1.5248761177062988, + "learning_rate": 4.751013190769391e-05, + "loss": 5.3578, + "step": 24091 + }, + { + "epoch": 0.14328194880578551, + "grad_norm": 1.5317707061767578, + "learning_rate": 4.750992869156098e-05, + "loss": 5.2791, + "step": 24092 + }, + { + "epoch": 0.1432878960890665, + "grad_norm": 1.9778176546096802, + "learning_rate": 4.750972546757005e-05, + "loss": 5.1077, + "step": 24093 + }, + { + "epoch": 0.14329384337234752, + "grad_norm": 1.7787549495697021, + "learning_rate": 4.750952223572123e-05, + "loss": 5.1073, + "step": 24094 + }, + { + "epoch": 0.1432997906556285, + "grad_norm": 1.6317193508148193, + "learning_rate": 4.750931899601455e-05, + "loss": 5.3686, + "step": 24095 + }, + { + "epoch": 0.1433057379389095, + "grad_norm": 1.7646535634994507, + "learning_rate": 4.7509115748450106e-05, + "loss": 5.4542, + "step": 24096 + }, + { + "epoch": 0.1433116852221905, + "grad_norm": 1.679877519607544, + "learning_rate": 4.750891249302796e-05, + "loss": 5.7126, + "step": 24097 + }, + { + "epoch": 0.1433176325054715, + "grad_norm": 1.3325512409210205, + "learning_rate": 4.750870922974819e-05, + "loss": 5.512, + "step": 24098 + }, + { + "epoch": 0.1433235797887525, + "grad_norm": 1.443447470664978, + "learning_rate": 4.750850595861086e-05, + "loss": 5.4712, + "step": 24099 + }, + { + "epoch": 0.1433295270720335, + "grad_norm": 1.5300956964492798, + "learning_rate": 4.7508302679616044e-05, + "loss": 5.2247, + "step": 24100 + }, + { + "epoch": 0.1433354743553145, + "grad_norm": 1.4438292980194092, + "learning_rate": 4.750809939276381e-05, + "loss": 5.3292, + "step": 24101 + }, + { + "epoch": 0.14334142163859548, + "grad_norm": 1.5861626863479614, + "learning_rate": 4.750789609805423e-05, + "loss": 5.1881, + "step": 24102 + }, + { + "epoch": 0.1433473689218765, + "grad_norm": 1.4352222681045532, + "learning_rate": 4.750769279548738e-05, + "loss": 5.3461, + "step": 24103 + }, + { + "epoch": 0.14335331620515748, + "grad_norm": 1.4064099788665771, + "learning_rate": 4.750748948506332e-05, + "loss": 5.1699, + "step": 24104 + }, + { + "epoch": 0.14335926348843847, + "grad_norm": 1.2421483993530273, + "learning_rate": 4.7507286166782136e-05, + "loss": 5.3811, + "step": 24105 + }, + { + "epoch": 0.1433652107717195, + "grad_norm": 1.430109977722168, + "learning_rate": 4.750708284064389e-05, + "loss": 5.3169, + "step": 24106 + }, + { + "epoch": 0.14337115805500047, + "grad_norm": 1.4107475280761719, + "learning_rate": 4.750687950664865e-05, + "loss": 5.1744, + "step": 24107 + }, + { + "epoch": 0.14337710533828146, + "grad_norm": 1.4888633489608765, + "learning_rate": 4.750667616479649e-05, + "loss": 5.0892, + "step": 24108 + }, + { + "epoch": 0.14338305262156248, + "grad_norm": 1.5325970649719238, + "learning_rate": 4.7506472815087486e-05, + "loss": 4.8421, + "step": 24109 + }, + { + "epoch": 0.14338899990484347, + "grad_norm": 1.806287407875061, + "learning_rate": 4.75062694575217e-05, + "loss": 5.459, + "step": 24110 + }, + { + "epoch": 0.14339494718812446, + "grad_norm": 1.8281558752059937, + "learning_rate": 4.750606609209921e-05, + "loss": 4.7275, + "step": 24111 + }, + { + "epoch": 0.14340089447140547, + "grad_norm": 1.3527547121047974, + "learning_rate": 4.750586271882009e-05, + "loss": 5.4797, + "step": 24112 + }, + { + "epoch": 0.14340684175468646, + "grad_norm": 1.719956874847412, + "learning_rate": 4.75056593376844e-05, + "loss": 5.1069, + "step": 24113 + }, + { + "epoch": 0.14341278903796745, + "grad_norm": 1.484231948852539, + "learning_rate": 4.750545594869222e-05, + "loss": 5.2246, + "step": 24114 + }, + { + "epoch": 0.14341873632124846, + "grad_norm": 1.7525322437286377, + "learning_rate": 4.7505252551843615e-05, + "loss": 5.2036, + "step": 24115 + }, + { + "epoch": 0.14342468360452945, + "grad_norm": 1.6943596601486206, + "learning_rate": 4.7505049147138656e-05, + "loss": 5.6783, + "step": 24116 + }, + { + "epoch": 0.14343063088781044, + "grad_norm": 1.619377851486206, + "learning_rate": 4.750484573457743e-05, + "loss": 5.4861, + "step": 24117 + }, + { + "epoch": 0.14343657817109146, + "grad_norm": 1.9882891178131104, + "learning_rate": 4.750464231415998e-05, + "loss": 5.1085, + "step": 24118 + }, + { + "epoch": 0.14344252545437244, + "grad_norm": 1.4033042192459106, + "learning_rate": 4.75044388858864e-05, + "loss": 5.2776, + "step": 24119 + }, + { + "epoch": 0.14344847273765343, + "grad_norm": 1.2633885145187378, + "learning_rate": 4.750423544975675e-05, + "loss": 5.3406, + "step": 24120 + }, + { + "epoch": 0.14345442002093445, + "grad_norm": 1.4787468910217285, + "learning_rate": 4.7504032005771105e-05, + "loss": 5.5417, + "step": 24121 + }, + { + "epoch": 0.14346036730421544, + "grad_norm": 1.6677738428115845, + "learning_rate": 4.750382855392953e-05, + "loss": 5.39, + "step": 24122 + }, + { + "epoch": 0.14346631458749642, + "grad_norm": 1.6277536153793335, + "learning_rate": 4.750362509423211e-05, + "loss": 5.443, + "step": 24123 + }, + { + "epoch": 0.14347226187077744, + "grad_norm": 1.7157353162765503, + "learning_rate": 4.75034216266789e-05, + "loss": 5.6696, + "step": 24124 + }, + { + "epoch": 0.14347820915405843, + "grad_norm": 1.6321076154708862, + "learning_rate": 4.750321815126998e-05, + "loss": 5.4125, + "step": 24125 + }, + { + "epoch": 0.14348415643733942, + "grad_norm": 1.3769804239273071, + "learning_rate": 4.750301466800542e-05, + "loss": 5.5333, + "step": 24126 + }, + { + "epoch": 0.14349010372062043, + "grad_norm": 1.6320770978927612, + "learning_rate": 4.7502811176885286e-05, + "loss": 5.062, + "step": 24127 + }, + { + "epoch": 0.14349605100390142, + "grad_norm": 1.8570098876953125, + "learning_rate": 4.750260767790966e-05, + "loss": 4.8349, + "step": 24128 + }, + { + "epoch": 0.1435019982871824, + "grad_norm": 1.6399726867675781, + "learning_rate": 4.7502404171078604e-05, + "loss": 5.0899, + "step": 24129 + }, + { + "epoch": 0.14350794557046342, + "grad_norm": 1.6327539682388306, + "learning_rate": 4.7502200656392184e-05, + "loss": 5.4722, + "step": 24130 + }, + { + "epoch": 0.1435138928537444, + "grad_norm": 1.887136697769165, + "learning_rate": 4.750199713385048e-05, + "loss": 5.2569, + "step": 24131 + }, + { + "epoch": 0.1435198401370254, + "grad_norm": 1.8090238571166992, + "learning_rate": 4.750179360345357e-05, + "loss": 5.252, + "step": 24132 + }, + { + "epoch": 0.14352578742030642, + "grad_norm": 1.7913198471069336, + "learning_rate": 4.750159006520152e-05, + "loss": 5.2661, + "step": 24133 + }, + { + "epoch": 0.1435317347035874, + "grad_norm": 2.239309310913086, + "learning_rate": 4.7501386519094385e-05, + "loss": 5.1478, + "step": 24134 + }, + { + "epoch": 0.1435376819868684, + "grad_norm": 2.179140090942383, + "learning_rate": 4.750118296513225e-05, + "loss": 4.9088, + "step": 24135 + }, + { + "epoch": 0.1435436292701494, + "grad_norm": 1.629287838935852, + "learning_rate": 4.7500979403315186e-05, + "loss": 5.0642, + "step": 24136 + }, + { + "epoch": 0.1435495765534304, + "grad_norm": 1.598783254623413, + "learning_rate": 4.750077583364326e-05, + "loss": 5.7616, + "step": 24137 + }, + { + "epoch": 0.14355552383671139, + "grad_norm": 1.792859435081482, + "learning_rate": 4.750057225611656e-05, + "loss": 6.1022, + "step": 24138 + }, + { + "epoch": 0.1435614711199924, + "grad_norm": 1.728210687637329, + "learning_rate": 4.750036867073513e-05, + "loss": 5.904, + "step": 24139 + }, + { + "epoch": 0.1435674184032734, + "grad_norm": 1.9541816711425781, + "learning_rate": 4.7500165077499056e-05, + "loss": 5.3199, + "step": 24140 + }, + { + "epoch": 0.14357336568655438, + "grad_norm": 1.6042431592941284, + "learning_rate": 4.7499961476408405e-05, + "loss": 5.5277, + "step": 24141 + }, + { + "epoch": 0.14357931296983537, + "grad_norm": 1.50521719455719, + "learning_rate": 4.749975786746325e-05, + "loss": 5.4995, + "step": 24142 + }, + { + "epoch": 0.14358526025311638, + "grad_norm": 1.2425066232681274, + "learning_rate": 4.749955425066366e-05, + "loss": 5.6135, + "step": 24143 + }, + { + "epoch": 0.14359120753639737, + "grad_norm": 1.3020912408828735, + "learning_rate": 4.749935062600971e-05, + "loss": 5.5885, + "step": 24144 + }, + { + "epoch": 0.14359715481967836, + "grad_norm": 1.8732852935791016, + "learning_rate": 4.749914699350148e-05, + "loss": 5.3004, + "step": 24145 + }, + { + "epoch": 0.14360310210295937, + "grad_norm": 1.5296770334243774, + "learning_rate": 4.749894335313901e-05, + "loss": 5.5932, + "step": 24146 + }, + { + "epoch": 0.14360904938624036, + "grad_norm": 1.6563706398010254, + "learning_rate": 4.749873970492241e-05, + "loss": 5.4436, + "step": 24147 + }, + { + "epoch": 0.14361499666952135, + "grad_norm": 1.5168625116348267, + "learning_rate": 4.749853604885172e-05, + "loss": 5.5198, + "step": 24148 + }, + { + "epoch": 0.14362094395280237, + "grad_norm": 1.8161656856536865, + "learning_rate": 4.749833238492703e-05, + "loss": 5.3261, + "step": 24149 + }, + { + "epoch": 0.14362689123608335, + "grad_norm": 1.6286919116973877, + "learning_rate": 4.749812871314841e-05, + "loss": 5.3505, + "step": 24150 + }, + { + "epoch": 0.14363283851936434, + "grad_norm": 1.6236040592193604, + "learning_rate": 4.749792503351591e-05, + "loss": 5.4271, + "step": 24151 + }, + { + "epoch": 0.14363878580264536, + "grad_norm": 1.8177775144577026, + "learning_rate": 4.749772134602963e-05, + "loss": 5.2076, + "step": 24152 + }, + { + "epoch": 0.14364473308592635, + "grad_norm": 1.8818564414978027, + "learning_rate": 4.7497517650689616e-05, + "loss": 5.2685, + "step": 24153 + }, + { + "epoch": 0.14365068036920733, + "grad_norm": 1.7166740894317627, + "learning_rate": 4.749731394749596e-05, + "loss": 5.0742, + "step": 24154 + }, + { + "epoch": 0.14365662765248835, + "grad_norm": 1.6446893215179443, + "learning_rate": 4.749711023644873e-05, + "loss": 5.0406, + "step": 24155 + }, + { + "epoch": 0.14366257493576934, + "grad_norm": 1.5812546014785767, + "learning_rate": 4.749690651754798e-05, + "loss": 5.1155, + "step": 24156 + }, + { + "epoch": 0.14366852221905033, + "grad_norm": 1.8002673387527466, + "learning_rate": 4.749670279079379e-05, + "loss": 4.8509, + "step": 24157 + }, + { + "epoch": 0.14367446950233134, + "grad_norm": 1.6835267543792725, + "learning_rate": 4.749649905618624e-05, + "loss": 4.8694, + "step": 24158 + }, + { + "epoch": 0.14368041678561233, + "grad_norm": 1.605454683303833, + "learning_rate": 4.74962953137254e-05, + "loss": 4.926, + "step": 24159 + }, + { + "epoch": 0.14368636406889332, + "grad_norm": 1.6154637336730957, + "learning_rate": 4.749609156341133e-05, + "loss": 5.0548, + "step": 24160 + }, + { + "epoch": 0.14369231135217433, + "grad_norm": 1.7472615242004395, + "learning_rate": 4.74958878052441e-05, + "loss": 5.2218, + "step": 24161 + }, + { + "epoch": 0.14369825863545532, + "grad_norm": 1.80000901222229, + "learning_rate": 4.7495684039223795e-05, + "loss": 5.5268, + "step": 24162 + }, + { + "epoch": 0.1437042059187363, + "grad_norm": 1.6673831939697266, + "learning_rate": 4.749548026535048e-05, + "loss": 4.9823, + "step": 24163 + }, + { + "epoch": 0.14371015320201733, + "grad_norm": 1.5900602340698242, + "learning_rate": 4.749527648362422e-05, + "loss": 4.9122, + "step": 24164 + }, + { + "epoch": 0.14371610048529831, + "grad_norm": 1.538674235343933, + "learning_rate": 4.74950726940451e-05, + "loss": 4.887, + "step": 24165 + }, + { + "epoch": 0.1437220477685793, + "grad_norm": 1.5512803792953491, + "learning_rate": 4.749486889661318e-05, + "loss": 5.106, + "step": 24166 + }, + { + "epoch": 0.14372799505186032, + "grad_norm": 1.6589990854263306, + "learning_rate": 4.7494665091328524e-05, + "loss": 5.1019, + "step": 24167 + }, + { + "epoch": 0.1437339423351413, + "grad_norm": 1.3078352212905884, + "learning_rate": 4.7494461278191225e-05, + "loss": 5.5803, + "step": 24168 + }, + { + "epoch": 0.1437398896184223, + "grad_norm": 1.2839313745498657, + "learning_rate": 4.7494257457201333e-05, + "loss": 5.2538, + "step": 24169 + }, + { + "epoch": 0.1437458369017033, + "grad_norm": 1.6686280965805054, + "learning_rate": 4.749405362835894e-05, + "loss": 4.6737, + "step": 24170 + }, + { + "epoch": 0.1437517841849843, + "grad_norm": 1.6385589838027954, + "learning_rate": 4.7493849791664094e-05, + "loss": 5.3224, + "step": 24171 + }, + { + "epoch": 0.1437577314682653, + "grad_norm": 1.5661671161651611, + "learning_rate": 4.749364594711688e-05, + "loss": 5.4675, + "step": 24172 + }, + { + "epoch": 0.1437636787515463, + "grad_norm": 1.481903314590454, + "learning_rate": 4.749344209471737e-05, + "loss": 5.6801, + "step": 24173 + }, + { + "epoch": 0.1437696260348273, + "grad_norm": 1.6317354440689087, + "learning_rate": 4.749323823446562e-05, + "loss": 5.2531, + "step": 24174 + }, + { + "epoch": 0.14377557331810828, + "grad_norm": 1.7542403936386108, + "learning_rate": 4.749303436636173e-05, + "loss": 4.9242, + "step": 24175 + }, + { + "epoch": 0.1437815206013893, + "grad_norm": 1.7798454761505127, + "learning_rate": 4.7492830490405746e-05, + "loss": 5.0939, + "step": 24176 + }, + { + "epoch": 0.14378746788467028, + "grad_norm": 1.3787469863891602, + "learning_rate": 4.7492626606597744e-05, + "loss": 5.2257, + "step": 24177 + }, + { + "epoch": 0.14379341516795127, + "grad_norm": 1.7178335189819336, + "learning_rate": 4.7492422714937806e-05, + "loss": 4.5083, + "step": 24178 + }, + { + "epoch": 0.1437993624512323, + "grad_norm": 1.559964656829834, + "learning_rate": 4.7492218815425996e-05, + "loss": 5.3788, + "step": 24179 + }, + { + "epoch": 0.14380530973451328, + "grad_norm": 3.269479990005493, + "learning_rate": 4.749201490806238e-05, + "loss": 4.0238, + "step": 24180 + }, + { + "epoch": 0.14381125701779426, + "grad_norm": 1.696169137954712, + "learning_rate": 4.749181099284703e-05, + "loss": 5.7992, + "step": 24181 + }, + { + "epoch": 0.14381720430107528, + "grad_norm": 1.563265085220337, + "learning_rate": 4.749160706978003e-05, + "loss": 5.2459, + "step": 24182 + }, + { + "epoch": 0.14382315158435627, + "grad_norm": 1.6364827156066895, + "learning_rate": 4.7491403138861435e-05, + "loss": 5.2826, + "step": 24183 + }, + { + "epoch": 0.14382909886763726, + "grad_norm": 1.82567298412323, + "learning_rate": 4.749119920009132e-05, + "loss": 4.8079, + "step": 24184 + }, + { + "epoch": 0.14383504615091827, + "grad_norm": 1.3982584476470947, + "learning_rate": 4.7490995253469774e-05, + "loss": 5.4093, + "step": 24185 + }, + { + "epoch": 0.14384099343419926, + "grad_norm": 1.349155068397522, + "learning_rate": 4.749079129899684e-05, + "loss": 5.3707, + "step": 24186 + }, + { + "epoch": 0.14384694071748025, + "grad_norm": 1.4101881980895996, + "learning_rate": 4.749058733667261e-05, + "loss": 4.9554, + "step": 24187 + }, + { + "epoch": 0.14385288800076126, + "grad_norm": 1.1910806894302368, + "learning_rate": 4.749038336649715e-05, + "loss": 5.0658, + "step": 24188 + }, + { + "epoch": 0.14385883528404225, + "grad_norm": 1.5315760374069214, + "learning_rate": 4.749017938847052e-05, + "loss": 5.4716, + "step": 24189 + }, + { + "epoch": 0.14386478256732324, + "grad_norm": 1.1762129068374634, + "learning_rate": 4.7489975402592814e-05, + "loss": 5.6235, + "step": 24190 + }, + { + "epoch": 0.14387072985060426, + "grad_norm": 1.2317709922790527, + "learning_rate": 4.748977140886408e-05, + "loss": 5.8842, + "step": 24191 + }, + { + "epoch": 0.14387667713388524, + "grad_norm": 1.439610481262207, + "learning_rate": 4.7489567407284405e-05, + "loss": 5.4157, + "step": 24192 + }, + { + "epoch": 0.14388262441716623, + "grad_norm": 1.842933177947998, + "learning_rate": 4.7489363397853854e-05, + "loss": 5.1555, + "step": 24193 + }, + { + "epoch": 0.14388857170044725, + "grad_norm": 1.887911081314087, + "learning_rate": 4.748915938057249e-05, + "loss": 5.5591, + "step": 24194 + }, + { + "epoch": 0.14389451898372824, + "grad_norm": 1.7697376012802124, + "learning_rate": 4.7488955355440404e-05, + "loss": 5.5617, + "step": 24195 + }, + { + "epoch": 0.14390046626700922, + "grad_norm": 1.5946240425109863, + "learning_rate": 4.7488751322457655e-05, + "loss": 5.3901, + "step": 24196 + }, + { + "epoch": 0.14390641355029024, + "grad_norm": 1.7462904453277588, + "learning_rate": 4.7488547281624306e-05, + "loss": 5.3187, + "step": 24197 + }, + { + "epoch": 0.14391236083357123, + "grad_norm": 1.7388325929641724, + "learning_rate": 4.7488343232940445e-05, + "loss": 5.0042, + "step": 24198 + }, + { + "epoch": 0.14391830811685222, + "grad_norm": 1.5990902185440063, + "learning_rate": 4.7488139176406135e-05, + "loss": 5.1336, + "step": 24199 + }, + { + "epoch": 0.1439242554001332, + "grad_norm": 1.7063771486282349, + "learning_rate": 4.748793511202145e-05, + "loss": 5.6073, + "step": 24200 + }, + { + "epoch": 0.14393020268341422, + "grad_norm": 1.5042674541473389, + "learning_rate": 4.748773103978645e-05, + "loss": 5.6617, + "step": 24201 + }, + { + "epoch": 0.1439361499666952, + "grad_norm": 1.4366991519927979, + "learning_rate": 4.7487526959701225e-05, + "loss": 5.3679, + "step": 24202 + }, + { + "epoch": 0.1439420972499762, + "grad_norm": 1.571524977684021, + "learning_rate": 4.748732287176584e-05, + "loss": 5.5487, + "step": 24203 + }, + { + "epoch": 0.1439480445332572, + "grad_norm": 1.3584872484207153, + "learning_rate": 4.748711877598036e-05, + "loss": 5.3332, + "step": 24204 + }, + { + "epoch": 0.1439539918165382, + "grad_norm": 1.4718894958496094, + "learning_rate": 4.748691467234484e-05, + "loss": 5.3985, + "step": 24205 + }, + { + "epoch": 0.1439599390998192, + "grad_norm": 1.5978455543518066, + "learning_rate": 4.748671056085939e-05, + "loss": 5.6351, + "step": 24206 + }, + { + "epoch": 0.1439658863831002, + "grad_norm": 2.2037017345428467, + "learning_rate": 4.748650644152406e-05, + "loss": 4.9972, + "step": 24207 + }, + { + "epoch": 0.1439718336663812, + "grad_norm": 1.7493484020233154, + "learning_rate": 4.748630231433891e-05, + "loss": 4.8863, + "step": 24208 + }, + { + "epoch": 0.14397778094966218, + "grad_norm": 1.7967579364776611, + "learning_rate": 4.748609817930405e-05, + "loss": 5.5271, + "step": 24209 + }, + { + "epoch": 0.1439837282329432, + "grad_norm": 1.3049358129501343, + "learning_rate": 4.7485894036419505e-05, + "loss": 5.631, + "step": 24210 + }, + { + "epoch": 0.14398967551622419, + "grad_norm": 2.1333138942718506, + "learning_rate": 4.7485689885685366e-05, + "loss": 4.3777, + "step": 24211 + }, + { + "epoch": 0.14399562279950517, + "grad_norm": 1.7402033805847168, + "learning_rate": 4.748548572710172e-05, + "loss": 5.0069, + "step": 24212 + }, + { + "epoch": 0.1440015700827862, + "grad_norm": 1.5663232803344727, + "learning_rate": 4.748528156066861e-05, + "loss": 5.8514, + "step": 24213 + }, + { + "epoch": 0.14400751736606718, + "grad_norm": 1.5079457759857178, + "learning_rate": 4.748507738638612e-05, + "loss": 5.771, + "step": 24214 + }, + { + "epoch": 0.14401346464934817, + "grad_norm": 1.407939076423645, + "learning_rate": 4.7484873204254334e-05, + "loss": 5.405, + "step": 24215 + }, + { + "epoch": 0.14401941193262918, + "grad_norm": 1.6172797679901123, + "learning_rate": 4.7484669014273296e-05, + "loss": 5.3918, + "step": 24216 + }, + { + "epoch": 0.14402535921591017, + "grad_norm": 1.52508544921875, + "learning_rate": 4.74844648164431e-05, + "loss": 5.3287, + "step": 24217 + }, + { + "epoch": 0.14403130649919116, + "grad_norm": 1.6615005731582642, + "learning_rate": 4.7484260610763806e-05, + "loss": 5.3211, + "step": 24218 + }, + { + "epoch": 0.14403725378247217, + "grad_norm": 1.7896537780761719, + "learning_rate": 4.74840563972355e-05, + "loss": 5.3131, + "step": 24219 + }, + { + "epoch": 0.14404320106575316, + "grad_norm": 1.665890097618103, + "learning_rate": 4.748385217585823e-05, + "loss": 5.4934, + "step": 24220 + }, + { + "epoch": 0.14404914834903415, + "grad_norm": 1.9217110872268677, + "learning_rate": 4.7483647946632085e-05, + "loss": 4.9057, + "step": 24221 + }, + { + "epoch": 0.14405509563231517, + "grad_norm": 1.3658103942871094, + "learning_rate": 4.748344370955713e-05, + "loss": 5.3585, + "step": 24222 + }, + { + "epoch": 0.14406104291559615, + "grad_norm": 1.3099697828292847, + "learning_rate": 4.748323946463343e-05, + "loss": 5.7427, + "step": 24223 + }, + { + "epoch": 0.14406699019887714, + "grad_norm": 1.5619271993637085, + "learning_rate": 4.7483035211861075e-05, + "loss": 5.4217, + "step": 24224 + }, + { + "epoch": 0.14407293748215816, + "grad_norm": 1.6359944343566895, + "learning_rate": 4.748283095124012e-05, + "loss": 5.0194, + "step": 24225 + }, + { + "epoch": 0.14407888476543915, + "grad_norm": 1.5773736238479614, + "learning_rate": 4.748262668277064e-05, + "loss": 5.0422, + "step": 24226 + }, + { + "epoch": 0.14408483204872014, + "grad_norm": 1.4909980297088623, + "learning_rate": 4.748242240645271e-05, + "loss": 5.6089, + "step": 24227 + }, + { + "epoch": 0.14409077933200115, + "grad_norm": 1.3489822149276733, + "learning_rate": 4.74822181222864e-05, + "loss": 5.6137, + "step": 24228 + }, + { + "epoch": 0.14409672661528214, + "grad_norm": 1.3335795402526855, + "learning_rate": 4.748201383027178e-05, + "loss": 5.4704, + "step": 24229 + }, + { + "epoch": 0.14410267389856313, + "grad_norm": 1.2519936561584473, + "learning_rate": 4.748180953040891e-05, + "loss": 5.5211, + "step": 24230 + }, + { + "epoch": 0.14410862118184414, + "grad_norm": 1.3223121166229248, + "learning_rate": 4.748160522269788e-05, + "loss": 5.897, + "step": 24231 + }, + { + "epoch": 0.14411456846512513, + "grad_norm": 1.3471014499664307, + "learning_rate": 4.748140090713876e-05, + "loss": 5.5012, + "step": 24232 + }, + { + "epoch": 0.14412051574840612, + "grad_norm": 1.7432321310043335, + "learning_rate": 4.74811965837316e-05, + "loss": 5.5286, + "step": 24233 + }, + { + "epoch": 0.14412646303168714, + "grad_norm": 1.4858758449554443, + "learning_rate": 4.74809922524765e-05, + "loss": 5.0719, + "step": 24234 + }, + { + "epoch": 0.14413241031496812, + "grad_norm": 1.3750518560409546, + "learning_rate": 4.7480787913373515e-05, + "loss": 5.63, + "step": 24235 + }, + { + "epoch": 0.1441383575982491, + "grad_norm": 1.3795223236083984, + "learning_rate": 4.7480583566422723e-05, + "loss": 5.5985, + "step": 24236 + }, + { + "epoch": 0.14414430488153013, + "grad_norm": 1.5779204368591309, + "learning_rate": 4.7480379211624185e-05, + "loss": 5.4503, + "step": 24237 + }, + { + "epoch": 0.14415025216481112, + "grad_norm": 1.5513705015182495, + "learning_rate": 4.7480174848977974e-05, + "loss": 5.6559, + "step": 24238 + }, + { + "epoch": 0.1441561994480921, + "grad_norm": 1.3171751499176025, + "learning_rate": 4.747997047848417e-05, + "loss": 5.7664, + "step": 24239 + }, + { + "epoch": 0.14416214673137312, + "grad_norm": 1.4049638509750366, + "learning_rate": 4.7479766100142855e-05, + "loss": 5.7167, + "step": 24240 + }, + { + "epoch": 0.1441680940146541, + "grad_norm": 1.5657798051834106, + "learning_rate": 4.747956171395407e-05, + "loss": 5.3544, + "step": 24241 + }, + { + "epoch": 0.1441740412979351, + "grad_norm": 1.7015857696533203, + "learning_rate": 4.747935731991791e-05, + "loss": 5.2192, + "step": 24242 + }, + { + "epoch": 0.1441799885812161, + "grad_norm": 1.396626591682434, + "learning_rate": 4.7479152918034433e-05, + "loss": 5.6169, + "step": 24243 + }, + { + "epoch": 0.1441859358644971, + "grad_norm": 1.5319141149520874, + "learning_rate": 4.7478948508303714e-05, + "loss": 5.5103, + "step": 24244 + }, + { + "epoch": 0.1441918831477781, + "grad_norm": 1.878131628036499, + "learning_rate": 4.747874409072583e-05, + "loss": 5.0926, + "step": 24245 + }, + { + "epoch": 0.1441978304310591, + "grad_norm": 1.3702614307403564, + "learning_rate": 4.7478539665300845e-05, + "loss": 5.5891, + "step": 24246 + }, + { + "epoch": 0.1442037777143401, + "grad_norm": 1.729227066040039, + "learning_rate": 4.7478335232028845e-05, + "loss": 5.4893, + "step": 24247 + }, + { + "epoch": 0.14420972499762108, + "grad_norm": 1.356343150138855, + "learning_rate": 4.747813079090988e-05, + "loss": 5.3913, + "step": 24248 + }, + { + "epoch": 0.1442156722809021, + "grad_norm": 1.6735188961029053, + "learning_rate": 4.7477926341944036e-05, + "loss": 5.1161, + "step": 24249 + }, + { + "epoch": 0.14422161956418308, + "grad_norm": 1.6281756162643433, + "learning_rate": 4.7477721885131376e-05, + "loss": 5.0971, + "step": 24250 + }, + { + "epoch": 0.14422756684746407, + "grad_norm": 1.789338231086731, + "learning_rate": 4.747751742047199e-05, + "loss": 5.0477, + "step": 24251 + }, + { + "epoch": 0.1442335141307451, + "grad_norm": 2.3384926319122314, + "learning_rate": 4.7477312947965915e-05, + "loss": 4.5108, + "step": 24252 + }, + { + "epoch": 0.14423946141402608, + "grad_norm": 2.1642465591430664, + "learning_rate": 4.7477108467613255e-05, + "loss": 4.6503, + "step": 24253 + }, + { + "epoch": 0.14424540869730706, + "grad_norm": 2.0242364406585693, + "learning_rate": 4.747690397941406e-05, + "loss": 4.7346, + "step": 24254 + }, + { + "epoch": 0.14425135598058808, + "grad_norm": 2.543030023574829, + "learning_rate": 4.7476699483368414e-05, + "loss": 4.4076, + "step": 24255 + }, + { + "epoch": 0.14425730326386907, + "grad_norm": 2.274937391281128, + "learning_rate": 4.747649497947638e-05, + "loss": 4.5464, + "step": 24256 + }, + { + "epoch": 0.14426325054715006, + "grad_norm": 2.695321798324585, + "learning_rate": 4.747629046773805e-05, + "loss": 4.5794, + "step": 24257 + }, + { + "epoch": 0.14426919783043107, + "grad_norm": 2.2838776111602783, + "learning_rate": 4.7476085948153465e-05, + "loss": 4.6079, + "step": 24258 + }, + { + "epoch": 0.14427514511371206, + "grad_norm": 2.1405718326568604, + "learning_rate": 4.7475881420722714e-05, + "loss": 4.4428, + "step": 24259 + }, + { + "epoch": 0.14428109239699305, + "grad_norm": 2.17814302444458, + "learning_rate": 4.747567688544586e-05, + "loss": 4.3945, + "step": 24260 + }, + { + "epoch": 0.14428703968027404, + "grad_norm": 2.24731183052063, + "learning_rate": 4.747547234232299e-05, + "loss": 4.4622, + "step": 24261 + }, + { + "epoch": 0.14429298696355505, + "grad_norm": 2.2340478897094727, + "learning_rate": 4.747526779135416e-05, + "loss": 4.3968, + "step": 24262 + }, + { + "epoch": 0.14429893424683604, + "grad_norm": 2.1889898777008057, + "learning_rate": 4.747506323253944e-05, + "loss": 4.4357, + "step": 24263 + }, + { + "epoch": 0.14430488153011703, + "grad_norm": 2.30887770652771, + "learning_rate": 4.747485866587891e-05, + "loss": 4.3798, + "step": 24264 + }, + { + "epoch": 0.14431082881339805, + "grad_norm": 1.8898377418518066, + "learning_rate": 4.7474654091372645e-05, + "loss": 4.759, + "step": 24265 + }, + { + "epoch": 0.14431677609667903, + "grad_norm": 1.8610650300979614, + "learning_rate": 4.747444950902071e-05, + "loss": 5.2619, + "step": 24266 + }, + { + "epoch": 0.14432272337996002, + "grad_norm": 2.0524682998657227, + "learning_rate": 4.747424491882317e-05, + "loss": 5.1975, + "step": 24267 + }, + { + "epoch": 0.14432867066324104, + "grad_norm": 1.9053709506988525, + "learning_rate": 4.7474040320780114e-05, + "loss": 4.9233, + "step": 24268 + }, + { + "epoch": 0.14433461794652203, + "grad_norm": 1.8127448558807373, + "learning_rate": 4.747383571489159e-05, + "loss": 5.4335, + "step": 24269 + }, + { + "epoch": 0.14434056522980301, + "grad_norm": 1.6836609840393066, + "learning_rate": 4.747363110115769e-05, + "loss": 5.3978, + "step": 24270 + }, + { + "epoch": 0.14434651251308403, + "grad_norm": 1.5606380701065063, + "learning_rate": 4.747342647957848e-05, + "loss": 5.4756, + "step": 24271 + }, + { + "epoch": 0.14435245979636502, + "grad_norm": 1.5684814453125, + "learning_rate": 4.747322185015402e-05, + "loss": 5.2942, + "step": 24272 + }, + { + "epoch": 0.144358407079646, + "grad_norm": 1.4253596067428589, + "learning_rate": 4.7473017212884395e-05, + "loss": 5.3061, + "step": 24273 + }, + { + "epoch": 0.14436435436292702, + "grad_norm": 1.5249817371368408, + "learning_rate": 4.747281256776968e-05, + "loss": 5.2824, + "step": 24274 + }, + { + "epoch": 0.144370301646208, + "grad_norm": 1.7111622095108032, + "learning_rate": 4.747260791480992e-05, + "loss": 5.3591, + "step": 24275 + }, + { + "epoch": 0.144376248929489, + "grad_norm": 1.6259697675704956, + "learning_rate": 4.7472403254005216e-05, + "loss": 5.6083, + "step": 24276 + }, + { + "epoch": 0.14438219621277001, + "grad_norm": 1.7138687372207642, + "learning_rate": 4.7472198585355634e-05, + "loss": 5.45, + "step": 24277 + }, + { + "epoch": 0.144388143496051, + "grad_norm": 1.55049729347229, + "learning_rate": 4.7471993908861226e-05, + "loss": 5.413, + "step": 24278 + }, + { + "epoch": 0.144394090779332, + "grad_norm": 1.619774580001831, + "learning_rate": 4.7471789224522086e-05, + "loss": 5.4499, + "step": 24279 + }, + { + "epoch": 0.144400038062613, + "grad_norm": 1.4726954698562622, + "learning_rate": 4.747158453233828e-05, + "loss": 5.3787, + "step": 24280 + }, + { + "epoch": 0.144405985345894, + "grad_norm": 1.5688132047653198, + "learning_rate": 4.7471379832309865e-05, + "loss": 5.0952, + "step": 24281 + }, + { + "epoch": 0.14441193262917498, + "grad_norm": 1.5431749820709229, + "learning_rate": 4.747117512443693e-05, + "loss": 5.4646, + "step": 24282 + }, + { + "epoch": 0.144417879912456, + "grad_norm": 1.5271220207214355, + "learning_rate": 4.747097040871954e-05, + "loss": 4.7074, + "step": 24283 + }, + { + "epoch": 0.144423827195737, + "grad_norm": 1.49335777759552, + "learning_rate": 4.7470765685157765e-05, + "loss": 5.1271, + "step": 24284 + }, + { + "epoch": 0.14442977447901797, + "grad_norm": 1.624834418296814, + "learning_rate": 4.7470560953751675e-05, + "loss": 4.7448, + "step": 24285 + }, + { + "epoch": 0.144435721762299, + "grad_norm": 1.4151476621627808, + "learning_rate": 4.7470356214501355e-05, + "loss": 5.2011, + "step": 24286 + }, + { + "epoch": 0.14444166904557998, + "grad_norm": 1.4529035091400146, + "learning_rate": 4.747015146740685e-05, + "loss": 5.2849, + "step": 24287 + }, + { + "epoch": 0.14444761632886097, + "grad_norm": 1.43472158908844, + "learning_rate": 4.746994671246826e-05, + "loss": 5.2655, + "step": 24288 + }, + { + "epoch": 0.14445356361214198, + "grad_norm": 1.2202403545379639, + "learning_rate": 4.7469741949685645e-05, + "loss": 5.3629, + "step": 24289 + }, + { + "epoch": 0.14445951089542297, + "grad_norm": 1.5001815557479858, + "learning_rate": 4.746953717905906e-05, + "loss": 5.3728, + "step": 24290 + }, + { + "epoch": 0.14446545817870396, + "grad_norm": 1.3214131593704224, + "learning_rate": 4.7469332400588604e-05, + "loss": 5.2877, + "step": 24291 + }, + { + "epoch": 0.14447140546198498, + "grad_norm": 1.5443751811981201, + "learning_rate": 4.7469127614274334e-05, + "loss": 5.2852, + "step": 24292 + }, + { + "epoch": 0.14447735274526596, + "grad_norm": 1.63779616355896, + "learning_rate": 4.746892282011632e-05, + "loss": 5.1985, + "step": 24293 + }, + { + "epoch": 0.14448330002854695, + "grad_norm": 1.4744620323181152, + "learning_rate": 4.7468718018114644e-05, + "loss": 5.4607, + "step": 24294 + }, + { + "epoch": 0.14448924731182797, + "grad_norm": 1.6099250316619873, + "learning_rate": 4.7468513208269366e-05, + "loss": 5.3546, + "step": 24295 + }, + { + "epoch": 0.14449519459510896, + "grad_norm": 1.692960262298584, + "learning_rate": 4.746830839058056e-05, + "loss": 5.2117, + "step": 24296 + }, + { + "epoch": 0.14450114187838994, + "grad_norm": 2.379516363143921, + "learning_rate": 4.746810356504831e-05, + "loss": 4.3924, + "step": 24297 + }, + { + "epoch": 0.14450708916167096, + "grad_norm": 1.5348504781723022, + "learning_rate": 4.7467898731672665e-05, + "loss": 5.556, + "step": 24298 + }, + { + "epoch": 0.14451303644495195, + "grad_norm": 1.65830397605896, + "learning_rate": 4.746769389045371e-05, + "loss": 5.26, + "step": 24299 + }, + { + "epoch": 0.14451898372823294, + "grad_norm": 1.6785426139831543, + "learning_rate": 4.746748904139152e-05, + "loss": 5.6459, + "step": 24300 + }, + { + "epoch": 0.14452493101151395, + "grad_norm": 1.8990434408187866, + "learning_rate": 4.746728418448616e-05, + "loss": 5.7021, + "step": 24301 + }, + { + "epoch": 0.14453087829479494, + "grad_norm": 1.5564841032028198, + "learning_rate": 4.7467079319737706e-05, + "loss": 5.1878, + "step": 24302 + }, + { + "epoch": 0.14453682557807593, + "grad_norm": 1.5522741079330444, + "learning_rate": 4.7466874447146226e-05, + "loss": 5.356, + "step": 24303 + }, + { + "epoch": 0.14454277286135694, + "grad_norm": 1.5835893154144287, + "learning_rate": 4.746666956671179e-05, + "loss": 5.1861, + "step": 24304 + }, + { + "epoch": 0.14454872014463793, + "grad_norm": 1.5514174699783325, + "learning_rate": 4.746646467843447e-05, + "loss": 4.9673, + "step": 24305 + }, + { + "epoch": 0.14455466742791892, + "grad_norm": 1.5682575702667236, + "learning_rate": 4.746625978231435e-05, + "loss": 4.8175, + "step": 24306 + }, + { + "epoch": 0.14456061471119994, + "grad_norm": 1.7369959354400635, + "learning_rate": 4.746605487835148e-05, + "loss": 4.8891, + "step": 24307 + }, + { + "epoch": 0.14456656199448092, + "grad_norm": 1.5230990648269653, + "learning_rate": 4.7465849966545945e-05, + "loss": 4.7425, + "step": 24308 + }, + { + "epoch": 0.1445725092777619, + "grad_norm": 1.696858525276184, + "learning_rate": 4.7465645046897814e-05, + "loss": 5.2665, + "step": 24309 + }, + { + "epoch": 0.14457845656104293, + "grad_norm": 1.3940263986587524, + "learning_rate": 4.7465440119407153e-05, + "loss": 4.9574, + "step": 24310 + }, + { + "epoch": 0.14458440384432392, + "grad_norm": 1.6118072271347046, + "learning_rate": 4.7465235184074046e-05, + "loss": 4.6531, + "step": 24311 + }, + { + "epoch": 0.1445903511276049, + "grad_norm": 1.671982765197754, + "learning_rate": 4.746503024089856e-05, + "loss": 4.6481, + "step": 24312 + }, + { + "epoch": 0.14459629841088592, + "grad_norm": 1.74351167678833, + "learning_rate": 4.746482528988076e-05, + "loss": 4.6964, + "step": 24313 + }, + { + "epoch": 0.1446022456941669, + "grad_norm": 1.8614739179611206, + "learning_rate": 4.746462033102072e-05, + "loss": 4.6784, + "step": 24314 + }, + { + "epoch": 0.1446081929774479, + "grad_norm": 1.4908361434936523, + "learning_rate": 4.746441536431851e-05, + "loss": 4.5367, + "step": 24315 + }, + { + "epoch": 0.1446141402607289, + "grad_norm": 1.6088496446609497, + "learning_rate": 4.746421038977421e-05, + "loss": 4.6425, + "step": 24316 + }, + { + "epoch": 0.1446200875440099, + "grad_norm": 1.674081563949585, + "learning_rate": 4.746400540738789e-05, + "loss": 4.4158, + "step": 24317 + }, + { + "epoch": 0.1446260348272909, + "grad_norm": 1.8151460886001587, + "learning_rate": 4.746380041715961e-05, + "loss": 4.6386, + "step": 24318 + }, + { + "epoch": 0.14463198211057188, + "grad_norm": 1.9019746780395508, + "learning_rate": 4.7463595419089456e-05, + "loss": 5.501, + "step": 24319 + }, + { + "epoch": 0.1446379293938529, + "grad_norm": 1.4574391841888428, + "learning_rate": 4.746339041317749e-05, + "loss": 5.4025, + "step": 24320 + }, + { + "epoch": 0.14464387667713388, + "grad_norm": 1.6762443780899048, + "learning_rate": 4.746318539942378e-05, + "loss": 5.4696, + "step": 24321 + }, + { + "epoch": 0.14464982396041487, + "grad_norm": 1.6373367309570312, + "learning_rate": 4.746298037782841e-05, + "loss": 5.3375, + "step": 24322 + }, + { + "epoch": 0.14465577124369589, + "grad_norm": 2.50252103805542, + "learning_rate": 4.7462775348391455e-05, + "loss": 4.5236, + "step": 24323 + }, + { + "epoch": 0.14466171852697687, + "grad_norm": 2.569896936416626, + "learning_rate": 4.7462570311112965e-05, + "loss": 4.5617, + "step": 24324 + }, + { + "epoch": 0.14466766581025786, + "grad_norm": 2.6712963581085205, + "learning_rate": 4.7462365265993024e-05, + "loss": 4.552, + "step": 24325 + }, + { + "epoch": 0.14467361309353888, + "grad_norm": 2.3401951789855957, + "learning_rate": 4.7462160213031705e-05, + "loss": 4.306, + "step": 24326 + }, + { + "epoch": 0.14467956037681987, + "grad_norm": 2.5915024280548096, + "learning_rate": 4.746195515222908e-05, + "loss": 4.2392, + "step": 24327 + }, + { + "epoch": 0.14468550766010085, + "grad_norm": 1.6202619075775146, + "learning_rate": 4.746175008358522e-05, + "loss": 5.2185, + "step": 24328 + }, + { + "epoch": 0.14469145494338187, + "grad_norm": 1.3534679412841797, + "learning_rate": 4.746154500710019e-05, + "loss": 5.3462, + "step": 24329 + }, + { + "epoch": 0.14469740222666286, + "grad_norm": 1.6344646215438843, + "learning_rate": 4.746133992277407e-05, + "loss": 5.2465, + "step": 24330 + }, + { + "epoch": 0.14470334950994385, + "grad_norm": 1.4203686714172363, + "learning_rate": 4.7461134830606924e-05, + "loss": 5.3623, + "step": 24331 + }, + { + "epoch": 0.14470929679322486, + "grad_norm": 1.3993933200836182, + "learning_rate": 4.7460929730598834e-05, + "loss": 5.3726, + "step": 24332 + }, + { + "epoch": 0.14471524407650585, + "grad_norm": 1.804283618927002, + "learning_rate": 4.746072462274986e-05, + "loss": 4.8216, + "step": 24333 + }, + { + "epoch": 0.14472119135978684, + "grad_norm": 1.5801303386688232, + "learning_rate": 4.746051950706009e-05, + "loss": 5.1036, + "step": 24334 + }, + { + "epoch": 0.14472713864306785, + "grad_norm": 1.525907278060913, + "learning_rate": 4.746031438352957e-05, + "loss": 4.743, + "step": 24335 + }, + { + "epoch": 0.14473308592634884, + "grad_norm": 1.6091197729110718, + "learning_rate": 4.746010925215839e-05, + "loss": 5.347, + "step": 24336 + }, + { + "epoch": 0.14473903320962983, + "grad_norm": 1.6794999837875366, + "learning_rate": 4.7459904112946626e-05, + "loss": 4.7244, + "step": 24337 + }, + { + "epoch": 0.14474498049291085, + "grad_norm": 1.5076384544372559, + "learning_rate": 4.745969896589434e-05, + "loss": 4.591, + "step": 24338 + }, + { + "epoch": 0.14475092777619183, + "grad_norm": 1.6222561597824097, + "learning_rate": 4.74594938110016e-05, + "loss": 4.7175, + "step": 24339 + }, + { + "epoch": 0.14475687505947282, + "grad_norm": 1.6383036375045776, + "learning_rate": 4.745928864826848e-05, + "loss": 5.5165, + "step": 24340 + }, + { + "epoch": 0.14476282234275384, + "grad_norm": 1.4812443256378174, + "learning_rate": 4.745908347769507e-05, + "loss": 5.4227, + "step": 24341 + }, + { + "epoch": 0.14476876962603483, + "grad_norm": 1.4673051834106445, + "learning_rate": 4.7458878299281406e-05, + "loss": 5.1107, + "step": 24342 + }, + { + "epoch": 0.14477471690931581, + "grad_norm": 1.3475501537322998, + "learning_rate": 4.745867311302759e-05, + "loss": 4.9491, + "step": 24343 + }, + { + "epoch": 0.14478066419259683, + "grad_norm": 1.437537670135498, + "learning_rate": 4.745846791893368e-05, + "loss": 4.985, + "step": 24344 + }, + { + "epoch": 0.14478661147587782, + "grad_norm": 1.3732200860977173, + "learning_rate": 4.745826271699975e-05, + "loss": 4.9058, + "step": 24345 + }, + { + "epoch": 0.1447925587591588, + "grad_norm": 1.2727693319320679, + "learning_rate": 4.7458057507225875e-05, + "loss": 4.9508, + "step": 24346 + }, + { + "epoch": 0.14479850604243982, + "grad_norm": 1.0993971824645996, + "learning_rate": 4.7457852289612125e-05, + "loss": 5.125, + "step": 24347 + }, + { + "epoch": 0.1448044533257208, + "grad_norm": 1.325086236000061, + "learning_rate": 4.745764706415857e-05, + "loss": 5.4091, + "step": 24348 + }, + { + "epoch": 0.1448104006090018, + "grad_norm": 1.378989815711975, + "learning_rate": 4.745744183086528e-05, + "loss": 5.472, + "step": 24349 + }, + { + "epoch": 0.14481634789228282, + "grad_norm": 1.347360372543335, + "learning_rate": 4.745723658973233e-05, + "loss": 5.4071, + "step": 24350 + }, + { + "epoch": 0.1448222951755638, + "grad_norm": 1.367734670639038, + "learning_rate": 4.74570313407598e-05, + "loss": 5.3895, + "step": 24351 + }, + { + "epoch": 0.1448282424588448, + "grad_norm": 1.4136337041854858, + "learning_rate": 4.745682608394774e-05, + "loss": 5.4637, + "step": 24352 + }, + { + "epoch": 0.1448341897421258, + "grad_norm": 1.475825548171997, + "learning_rate": 4.745662081929624e-05, + "loss": 5.3135, + "step": 24353 + }, + { + "epoch": 0.1448401370254068, + "grad_norm": 1.6745150089263916, + "learning_rate": 4.745641554680537e-05, + "loss": 4.9959, + "step": 24354 + }, + { + "epoch": 0.14484608430868778, + "grad_norm": 1.7860320806503296, + "learning_rate": 4.7456210266475185e-05, + "loss": 5.054, + "step": 24355 + }, + { + "epoch": 0.1448520315919688, + "grad_norm": 1.456579327583313, + "learning_rate": 4.745600497830577e-05, + "loss": 5.2742, + "step": 24356 + }, + { + "epoch": 0.1448579788752498, + "grad_norm": 1.5492240190505981, + "learning_rate": 4.745579968229721e-05, + "loss": 5.0763, + "step": 24357 + }, + { + "epoch": 0.14486392615853078, + "grad_norm": 1.5172940492630005, + "learning_rate": 4.7455594378449554e-05, + "loss": 5.3951, + "step": 24358 + }, + { + "epoch": 0.1448698734418118, + "grad_norm": 1.5349613428115845, + "learning_rate": 4.7455389066762876e-05, + "loss": 5.5627, + "step": 24359 + }, + { + "epoch": 0.14487582072509278, + "grad_norm": 1.8341866731643677, + "learning_rate": 4.745518374723726e-05, + "loss": 5.2127, + "step": 24360 + }, + { + "epoch": 0.14488176800837377, + "grad_norm": 1.4852558374404907, + "learning_rate": 4.745497841987277e-05, + "loss": 5.2764, + "step": 24361 + }, + { + "epoch": 0.14488771529165478, + "grad_norm": 1.6629048585891724, + "learning_rate": 4.745477308466948e-05, + "loss": 5.0675, + "step": 24362 + }, + { + "epoch": 0.14489366257493577, + "grad_norm": 1.7459721565246582, + "learning_rate": 4.745456774162746e-05, + "loss": 5.0865, + "step": 24363 + }, + { + "epoch": 0.14489960985821676, + "grad_norm": 1.9257551431655884, + "learning_rate": 4.745436239074678e-05, + "loss": 4.9022, + "step": 24364 + }, + { + "epoch": 0.14490555714149778, + "grad_norm": 1.9146925210952759, + "learning_rate": 4.745415703202752e-05, + "loss": 5.3591, + "step": 24365 + }, + { + "epoch": 0.14491150442477876, + "grad_norm": 1.5624557733535767, + "learning_rate": 4.7453951665469744e-05, + "loss": 5.2383, + "step": 24366 + }, + { + "epoch": 0.14491745170805975, + "grad_norm": 1.4265660047531128, + "learning_rate": 4.745374629107352e-05, + "loss": 5.5559, + "step": 24367 + }, + { + "epoch": 0.14492339899134077, + "grad_norm": 2.072206497192383, + "learning_rate": 4.7453540908838934e-05, + "loss": 4.6001, + "step": 24368 + }, + { + "epoch": 0.14492934627462176, + "grad_norm": 2.144535779953003, + "learning_rate": 4.745333551876604e-05, + "loss": 4.6874, + "step": 24369 + }, + { + "epoch": 0.14493529355790274, + "grad_norm": 2.311624050140381, + "learning_rate": 4.745313012085492e-05, + "loss": 5.2824, + "step": 24370 + }, + { + "epoch": 0.14494124084118376, + "grad_norm": 1.6523234844207764, + "learning_rate": 4.745292471510565e-05, + "loss": 5.447, + "step": 24371 + }, + { + "epoch": 0.14494718812446475, + "grad_norm": 1.480470895767212, + "learning_rate": 4.745271930151829e-05, + "loss": 5.4511, + "step": 24372 + }, + { + "epoch": 0.14495313540774574, + "grad_norm": 1.6797810792922974, + "learning_rate": 4.7452513880092917e-05, + "loss": 5.258, + "step": 24373 + }, + { + "epoch": 0.14495908269102675, + "grad_norm": 1.541110873222351, + "learning_rate": 4.7452308450829615e-05, + "loss": 5.4877, + "step": 24374 + }, + { + "epoch": 0.14496502997430774, + "grad_norm": 1.8961621522903442, + "learning_rate": 4.745210301372843e-05, + "loss": 5.5844, + "step": 24375 + }, + { + "epoch": 0.14497097725758873, + "grad_norm": 1.8623143434524536, + "learning_rate": 4.745189756878945e-05, + "loss": 5.6454, + "step": 24376 + }, + { + "epoch": 0.14497692454086972, + "grad_norm": 1.6899166107177734, + "learning_rate": 4.745169211601276e-05, + "loss": 5.3369, + "step": 24377 + }, + { + "epoch": 0.14498287182415073, + "grad_norm": 1.7222342491149902, + "learning_rate": 4.74514866553984e-05, + "loss": 5.5805, + "step": 24378 + }, + { + "epoch": 0.14498881910743172, + "grad_norm": 1.7649940252304077, + "learning_rate": 4.745128118694646e-05, + "loss": 5.5568, + "step": 24379 + }, + { + "epoch": 0.1449947663907127, + "grad_norm": 1.9492725133895874, + "learning_rate": 4.745107571065701e-05, + "loss": 5.2019, + "step": 24380 + }, + { + "epoch": 0.14500071367399373, + "grad_norm": 1.6403963565826416, + "learning_rate": 4.745087022653013e-05, + "loss": 5.0867, + "step": 24381 + }, + { + "epoch": 0.1450066609572747, + "grad_norm": 1.6921762228012085, + "learning_rate": 4.7450664734565875e-05, + "loss": 4.823, + "step": 24382 + }, + { + "epoch": 0.1450126082405557, + "grad_norm": 1.7539616823196411, + "learning_rate": 4.745045923476432e-05, + "loss": 5.0268, + "step": 24383 + }, + { + "epoch": 0.14501855552383672, + "grad_norm": 1.7073962688446045, + "learning_rate": 4.745025372712555e-05, + "loss": 5.3082, + "step": 24384 + }, + { + "epoch": 0.1450245028071177, + "grad_norm": 1.457963228225708, + "learning_rate": 4.7450048211649626e-05, + "loss": 5.1926, + "step": 24385 + }, + { + "epoch": 0.1450304500903987, + "grad_norm": 1.7305623292922974, + "learning_rate": 4.744984268833662e-05, + "loss": 5.3563, + "step": 24386 + }, + { + "epoch": 0.1450363973736797, + "grad_norm": 1.4888592958450317, + "learning_rate": 4.744963715718661e-05, + "loss": 5.3478, + "step": 24387 + }, + { + "epoch": 0.1450423446569607, + "grad_norm": 1.7059639692306519, + "learning_rate": 4.744943161819966e-05, + "loss": 5.3782, + "step": 24388 + }, + { + "epoch": 0.14504829194024169, + "grad_norm": 1.539562463760376, + "learning_rate": 4.744922607137585e-05, + "loss": 5.4259, + "step": 24389 + }, + { + "epoch": 0.1450542392235227, + "grad_norm": 1.6427409648895264, + "learning_rate": 4.7449020516715245e-05, + "loss": 5.2505, + "step": 24390 + }, + { + "epoch": 0.1450601865068037, + "grad_norm": 1.5506988763809204, + "learning_rate": 4.7448814954217916e-05, + "loss": 5.214, + "step": 24391 + }, + { + "epoch": 0.14506613379008468, + "grad_norm": 1.7298414707183838, + "learning_rate": 4.744860938388395e-05, + "loss": 5.4361, + "step": 24392 + }, + { + "epoch": 0.1450720810733657, + "grad_norm": 1.6383203268051147, + "learning_rate": 4.744840380571339e-05, + "loss": 5.2703, + "step": 24393 + }, + { + "epoch": 0.14507802835664668, + "grad_norm": 1.6193071603775024, + "learning_rate": 4.744819821970633e-05, + "loss": 5.1414, + "step": 24394 + }, + { + "epoch": 0.14508397563992767, + "grad_norm": 1.6779429912567139, + "learning_rate": 4.7447992625862835e-05, + "loss": 5.1886, + "step": 24395 + }, + { + "epoch": 0.1450899229232087, + "grad_norm": 1.7938344478607178, + "learning_rate": 4.7447787024182975e-05, + "loss": 5.4733, + "step": 24396 + }, + { + "epoch": 0.14509587020648967, + "grad_norm": 1.7248293161392212, + "learning_rate": 4.7447581414666834e-05, + "loss": 5.484, + "step": 24397 + }, + { + "epoch": 0.14510181748977066, + "grad_norm": 1.7749347686767578, + "learning_rate": 4.744737579731447e-05, + "loss": 5.3896, + "step": 24398 + }, + { + "epoch": 0.14510776477305168, + "grad_norm": 1.6471116542816162, + "learning_rate": 4.744717017212595e-05, + "loss": 5.4622, + "step": 24399 + }, + { + "epoch": 0.14511371205633267, + "grad_norm": 1.7347856760025024, + "learning_rate": 4.7446964539101366e-05, + "loss": 5.5045, + "step": 24400 + }, + { + "epoch": 0.14511965933961365, + "grad_norm": 1.7716923952102661, + "learning_rate": 4.744675889824078e-05, + "loss": 5.3604, + "step": 24401 + }, + { + "epoch": 0.14512560662289467, + "grad_norm": 1.8484928607940674, + "learning_rate": 4.7446553249544253e-05, + "loss": 5.4746, + "step": 24402 + }, + { + "epoch": 0.14513155390617566, + "grad_norm": 1.7685359716415405, + "learning_rate": 4.7446347593011875e-05, + "loss": 5.4244, + "step": 24403 + }, + { + "epoch": 0.14513750118945665, + "grad_norm": 1.6140607595443726, + "learning_rate": 4.74461419286437e-05, + "loss": 5.4337, + "step": 24404 + }, + { + "epoch": 0.14514344847273766, + "grad_norm": 1.8425545692443848, + "learning_rate": 4.744593625643982e-05, + "loss": 4.7602, + "step": 24405 + }, + { + "epoch": 0.14514939575601865, + "grad_norm": 1.7787073850631714, + "learning_rate": 4.7445730576400284e-05, + "loss": 4.7792, + "step": 24406 + }, + { + "epoch": 0.14515534303929964, + "grad_norm": 1.7401658296585083, + "learning_rate": 4.7445524888525185e-05, + "loss": 5.1436, + "step": 24407 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 1.9028658866882324, + "learning_rate": 4.744531919281457e-05, + "loss": 5.2477, + "step": 24408 + }, + { + "epoch": 0.14516723760586164, + "grad_norm": 1.86017644405365, + "learning_rate": 4.7445113489268544e-05, + "loss": 5.593, + "step": 24409 + }, + { + "epoch": 0.14517318488914263, + "grad_norm": 1.551146149635315, + "learning_rate": 4.744490777788716e-05, + "loss": 5.7147, + "step": 24410 + }, + { + "epoch": 0.14517913217242365, + "grad_norm": 1.689828634262085, + "learning_rate": 4.744470205867048e-05, + "loss": 5.7174, + "step": 24411 + }, + { + "epoch": 0.14518507945570464, + "grad_norm": 1.6940490007400513, + "learning_rate": 4.744449633161859e-05, + "loss": 5.4586, + "step": 24412 + }, + { + "epoch": 0.14519102673898562, + "grad_norm": 1.6582127809524536, + "learning_rate": 4.7444290596731555e-05, + "loss": 5.4499, + "step": 24413 + }, + { + "epoch": 0.14519697402226664, + "grad_norm": 1.5289736986160278, + "learning_rate": 4.7444084854009454e-05, + "loss": 5.3323, + "step": 24414 + }, + { + "epoch": 0.14520292130554763, + "grad_norm": 1.597364068031311, + "learning_rate": 4.744387910345235e-05, + "loss": 5.2472, + "step": 24415 + }, + { + "epoch": 0.14520886858882862, + "grad_norm": 1.567718505859375, + "learning_rate": 4.7443673345060325e-05, + "loss": 5.1505, + "step": 24416 + }, + { + "epoch": 0.14521481587210963, + "grad_norm": 1.6296337842941284, + "learning_rate": 4.7443467578833446e-05, + "loss": 5.5358, + "step": 24417 + }, + { + "epoch": 0.14522076315539062, + "grad_norm": 1.5341614484786987, + "learning_rate": 4.744326180477179e-05, + "loss": 5.4139, + "step": 24418 + }, + { + "epoch": 0.1452267104386716, + "grad_norm": 1.6611801385879517, + "learning_rate": 4.744305602287541e-05, + "loss": 5.3999, + "step": 24419 + }, + { + "epoch": 0.14523265772195262, + "grad_norm": 1.4712778329849243, + "learning_rate": 4.74428502331444e-05, + "loss": 5.5498, + "step": 24420 + }, + { + "epoch": 0.1452386050052336, + "grad_norm": 1.6814862489700317, + "learning_rate": 4.744264443557882e-05, + "loss": 5.3511, + "step": 24421 + }, + { + "epoch": 0.1452445522885146, + "grad_norm": 1.512871265411377, + "learning_rate": 4.7442438630178746e-05, + "loss": 5.2377, + "step": 24422 + }, + { + "epoch": 0.14525049957179562, + "grad_norm": 1.4311738014221191, + "learning_rate": 4.744223281694424e-05, + "loss": 5.49, + "step": 24423 + }, + { + "epoch": 0.1452564468550766, + "grad_norm": 1.4469417333602905, + "learning_rate": 4.744202699587539e-05, + "loss": 5.2427, + "step": 24424 + }, + { + "epoch": 0.1452623941383576, + "grad_norm": 1.4444100856781006, + "learning_rate": 4.744182116697226e-05, + "loss": 5.263, + "step": 24425 + }, + { + "epoch": 0.1452683414216386, + "grad_norm": 1.4034851789474487, + "learning_rate": 4.744161533023492e-05, + "loss": 5.2735, + "step": 24426 + }, + { + "epoch": 0.1452742887049196, + "grad_norm": 1.637856364250183, + "learning_rate": 4.7441409485663444e-05, + "loss": 5.0982, + "step": 24427 + }, + { + "epoch": 0.14528023598820058, + "grad_norm": 1.7255091667175293, + "learning_rate": 4.7441203633257915e-05, + "loss": 4.9104, + "step": 24428 + }, + { + "epoch": 0.1452861832714816, + "grad_norm": 2.115915536880493, + "learning_rate": 4.744099777301838e-05, + "loss": 4.9661, + "step": 24429 + }, + { + "epoch": 0.1452921305547626, + "grad_norm": 1.8747011423110962, + "learning_rate": 4.7440791904944926e-05, + "loss": 5.2122, + "step": 24430 + }, + { + "epoch": 0.14529807783804358, + "grad_norm": 1.7300605773925781, + "learning_rate": 4.744058602903763e-05, + "loss": 5.1689, + "step": 24431 + }, + { + "epoch": 0.1453040251213246, + "grad_norm": 1.4435160160064697, + "learning_rate": 4.744038014529655e-05, + "loss": 5.2636, + "step": 24432 + }, + { + "epoch": 0.14530997240460558, + "grad_norm": 1.6441041231155396, + "learning_rate": 4.744017425372177e-05, + "loss": 5.2737, + "step": 24433 + }, + { + "epoch": 0.14531591968788657, + "grad_norm": 1.5537841320037842, + "learning_rate": 4.743996835431336e-05, + "loss": 5.1661, + "step": 24434 + }, + { + "epoch": 0.14532186697116756, + "grad_norm": 1.5431783199310303, + "learning_rate": 4.743976244707138e-05, + "loss": 5.0257, + "step": 24435 + }, + { + "epoch": 0.14532781425444857, + "grad_norm": 1.6137834787368774, + "learning_rate": 4.7439556531995914e-05, + "loss": 4.9459, + "step": 24436 + }, + { + "epoch": 0.14533376153772956, + "grad_norm": 1.6870076656341553, + "learning_rate": 4.743935060908703e-05, + "loss": 5.0615, + "step": 24437 + }, + { + "epoch": 0.14533970882101055, + "grad_norm": 1.7536146640777588, + "learning_rate": 4.74391446783448e-05, + "loss": 5.041, + "step": 24438 + }, + { + "epoch": 0.14534565610429157, + "grad_norm": 1.8259520530700684, + "learning_rate": 4.7438938739769304e-05, + "loss": 5.0222, + "step": 24439 + }, + { + "epoch": 0.14535160338757255, + "grad_norm": 1.9656455516815186, + "learning_rate": 4.74387327933606e-05, + "loss": 5.3352, + "step": 24440 + }, + { + "epoch": 0.14535755067085354, + "grad_norm": 2.096452236175537, + "learning_rate": 4.743852683911877e-05, + "loss": 5.4241, + "step": 24441 + }, + { + "epoch": 0.14536349795413456, + "grad_norm": 1.6562155485153198, + "learning_rate": 4.743832087704388e-05, + "loss": 6.0049, + "step": 24442 + }, + { + "epoch": 0.14536944523741555, + "grad_norm": 1.538763165473938, + "learning_rate": 4.7438114907136e-05, + "loss": 5.4588, + "step": 24443 + }, + { + "epoch": 0.14537539252069653, + "grad_norm": 1.835303783416748, + "learning_rate": 4.7437908929395216e-05, + "loss": 5.1866, + "step": 24444 + }, + { + "epoch": 0.14538133980397755, + "grad_norm": 1.6841330528259277, + "learning_rate": 4.743770294382158e-05, + "loss": 5.51, + "step": 24445 + }, + { + "epoch": 0.14538728708725854, + "grad_norm": 1.775283694267273, + "learning_rate": 4.743749695041517e-05, + "loss": 5.5482, + "step": 24446 + }, + { + "epoch": 0.14539323437053953, + "grad_norm": 1.5169485807418823, + "learning_rate": 4.7437290949176074e-05, + "loss": 5.5175, + "step": 24447 + }, + { + "epoch": 0.14539918165382054, + "grad_norm": 1.3337781429290771, + "learning_rate": 4.743708494010435e-05, + "loss": 5.8864, + "step": 24448 + }, + { + "epoch": 0.14540512893710153, + "grad_norm": 1.5488650798797607, + "learning_rate": 4.743687892320006e-05, + "loss": 5.9374, + "step": 24449 + }, + { + "epoch": 0.14541107622038252, + "grad_norm": 1.7683097124099731, + "learning_rate": 4.74366728984633e-05, + "loss": 5.2741, + "step": 24450 + }, + { + "epoch": 0.14541702350366353, + "grad_norm": 1.750689148902893, + "learning_rate": 4.743646686589413e-05, + "loss": 5.5179, + "step": 24451 + }, + { + "epoch": 0.14542297078694452, + "grad_norm": 1.8411931991577148, + "learning_rate": 4.7436260825492604e-05, + "loss": 5.2341, + "step": 24452 + }, + { + "epoch": 0.1454289180702255, + "grad_norm": 1.8112800121307373, + "learning_rate": 4.7436054777258824e-05, + "loss": 5.2025, + "step": 24453 + }, + { + "epoch": 0.14543486535350653, + "grad_norm": 1.5593929290771484, + "learning_rate": 4.743584872119285e-05, + "loss": 5.4906, + "step": 24454 + }, + { + "epoch": 0.14544081263678751, + "grad_norm": 1.683072805404663, + "learning_rate": 4.743564265729475e-05, + "loss": 5.279, + "step": 24455 + }, + { + "epoch": 0.1454467599200685, + "grad_norm": 1.6395639181137085, + "learning_rate": 4.74354365855646e-05, + "loss": 5.9672, + "step": 24456 + }, + { + "epoch": 0.14545270720334952, + "grad_norm": 1.5672929286956787, + "learning_rate": 4.743523050600247e-05, + "loss": 5.3588, + "step": 24457 + }, + { + "epoch": 0.1454586544866305, + "grad_norm": 1.7329927682876587, + "learning_rate": 4.7435024418608434e-05, + "loss": 5.1456, + "step": 24458 + }, + { + "epoch": 0.1454646017699115, + "grad_norm": 1.7443114519119263, + "learning_rate": 4.7434818323382554e-05, + "loss": 5.0256, + "step": 24459 + }, + { + "epoch": 0.1454705490531925, + "grad_norm": 1.6770588159561157, + "learning_rate": 4.7434612220324926e-05, + "loss": 5.0028, + "step": 24460 + }, + { + "epoch": 0.1454764963364735, + "grad_norm": 1.7134469747543335, + "learning_rate": 4.74344061094356e-05, + "loss": 5.0299, + "step": 24461 + }, + { + "epoch": 0.1454824436197545, + "grad_norm": 1.55935537815094, + "learning_rate": 4.743419999071465e-05, + "loss": 5.0422, + "step": 24462 + }, + { + "epoch": 0.1454883909030355, + "grad_norm": 1.722185730934143, + "learning_rate": 4.743399386416216e-05, + "loss": 4.9558, + "step": 24463 + }, + { + "epoch": 0.1454943381863165, + "grad_norm": 1.6128919124603271, + "learning_rate": 4.743378772977819e-05, + "loss": 4.903, + "step": 24464 + }, + { + "epoch": 0.14550028546959748, + "grad_norm": 1.6574269533157349, + "learning_rate": 4.7433581587562816e-05, + "loss": 4.9092, + "step": 24465 + }, + { + "epoch": 0.1455062327528785, + "grad_norm": 1.6132055521011353, + "learning_rate": 4.7433375437516116e-05, + "loss": 4.8561, + "step": 24466 + }, + { + "epoch": 0.14551218003615948, + "grad_norm": 1.7846872806549072, + "learning_rate": 4.743316927963814e-05, + "loss": 5.3115, + "step": 24467 + }, + { + "epoch": 0.14551812731944047, + "grad_norm": 1.787424087524414, + "learning_rate": 4.7432963113929e-05, + "loss": 5.2607, + "step": 24468 + }, + { + "epoch": 0.1455240746027215, + "grad_norm": 1.9011743068695068, + "learning_rate": 4.743275694038873e-05, + "loss": 4.989, + "step": 24469 + }, + { + "epoch": 0.14553002188600248, + "grad_norm": 1.7853960990905762, + "learning_rate": 4.7432550759017415e-05, + "loss": 5.066, + "step": 24470 + }, + { + "epoch": 0.14553596916928346, + "grad_norm": 2.131143569946289, + "learning_rate": 4.7432344569815134e-05, + "loss": 5.0322, + "step": 24471 + }, + { + "epoch": 0.14554191645256448, + "grad_norm": 1.7870924472808838, + "learning_rate": 4.743213837278195e-05, + "loss": 4.8767, + "step": 24472 + }, + { + "epoch": 0.14554786373584547, + "grad_norm": 1.8804802894592285, + "learning_rate": 4.743193216791795e-05, + "loss": 5.0155, + "step": 24473 + }, + { + "epoch": 0.14555381101912646, + "grad_norm": 2.4177560806274414, + "learning_rate": 4.7431725955223175e-05, + "loss": 4.6521, + "step": 24474 + }, + { + "epoch": 0.14555975830240747, + "grad_norm": 2.3657360076904297, + "learning_rate": 4.743151973469773e-05, + "loss": 4.5406, + "step": 24475 + }, + { + "epoch": 0.14556570558568846, + "grad_norm": 2.233304977416992, + "learning_rate": 4.743131350634167e-05, + "loss": 4.6725, + "step": 24476 + }, + { + "epoch": 0.14557165286896945, + "grad_norm": 2.314302921295166, + "learning_rate": 4.743110727015506e-05, + "loss": 4.2326, + "step": 24477 + }, + { + "epoch": 0.14557760015225046, + "grad_norm": 2.272599220275879, + "learning_rate": 4.7430901026137996e-05, + "loss": 4.2031, + "step": 24478 + }, + { + "epoch": 0.14558354743553145, + "grad_norm": 1.7667213678359985, + "learning_rate": 4.743069477429053e-05, + "loss": 5.0108, + "step": 24479 + }, + { + "epoch": 0.14558949471881244, + "grad_norm": 2.192775011062622, + "learning_rate": 4.7430488514612746e-05, + "loss": 4.0625, + "step": 24480 + }, + { + "epoch": 0.14559544200209346, + "grad_norm": 2.4205431938171387, + "learning_rate": 4.743028224710471e-05, + "loss": 4.1039, + "step": 24481 + }, + { + "epoch": 0.14560138928537444, + "grad_norm": 2.1844823360443115, + "learning_rate": 4.743007597176649e-05, + "loss": 3.9408, + "step": 24482 + }, + { + "epoch": 0.14560733656865543, + "grad_norm": 2.3235034942626953, + "learning_rate": 4.742986968859816e-05, + "loss": 4.0957, + "step": 24483 + }, + { + "epoch": 0.14561328385193645, + "grad_norm": 2.3802473545074463, + "learning_rate": 4.742966339759979e-05, + "loss": 4.2864, + "step": 24484 + }, + { + "epoch": 0.14561923113521744, + "grad_norm": 2.2253031730651855, + "learning_rate": 4.742945709877147e-05, + "loss": 4.1559, + "step": 24485 + }, + { + "epoch": 0.14562517841849842, + "grad_norm": 2.559008836746216, + "learning_rate": 4.742925079211324e-05, + "loss": 4.0356, + "step": 24486 + }, + { + "epoch": 0.14563112570177944, + "grad_norm": 2.222951889038086, + "learning_rate": 4.7429044477625206e-05, + "loss": 4.0193, + "step": 24487 + }, + { + "epoch": 0.14563707298506043, + "grad_norm": 1.9578197002410889, + "learning_rate": 4.742883815530742e-05, + "loss": 4.8917, + "step": 24488 + }, + { + "epoch": 0.14564302026834142, + "grad_norm": 1.8768174648284912, + "learning_rate": 4.742863182515996e-05, + "loss": 4.8987, + "step": 24489 + }, + { + "epoch": 0.14564896755162243, + "grad_norm": 2.0520718097686768, + "learning_rate": 4.7428425487182895e-05, + "loss": 5.2806, + "step": 24490 + }, + { + "epoch": 0.14565491483490342, + "grad_norm": 1.7171385288238525, + "learning_rate": 4.74282191413763e-05, + "loss": 4.801, + "step": 24491 + }, + { + "epoch": 0.1456608621181844, + "grad_norm": 1.5739022493362427, + "learning_rate": 4.742801278774024e-05, + "loss": 5.5888, + "step": 24492 + }, + { + "epoch": 0.1456668094014654, + "grad_norm": 1.6728390455245972, + "learning_rate": 4.742780642627479e-05, + "loss": 5.0339, + "step": 24493 + }, + { + "epoch": 0.1456727566847464, + "grad_norm": 1.5647993087768555, + "learning_rate": 4.7427600056980035e-05, + "loss": 4.859, + "step": 24494 + }, + { + "epoch": 0.1456787039680274, + "grad_norm": 1.8099721670150757, + "learning_rate": 4.7427393679856026e-05, + "loss": 5.4872, + "step": 24495 + }, + { + "epoch": 0.1456846512513084, + "grad_norm": 1.7053685188293457, + "learning_rate": 4.742718729490285e-05, + "loss": 5.0992, + "step": 24496 + }, + { + "epoch": 0.1456905985345894, + "grad_norm": 1.57960045337677, + "learning_rate": 4.742698090212058e-05, + "loss": 5.3847, + "step": 24497 + }, + { + "epoch": 0.1456965458178704, + "grad_norm": 1.6272963285446167, + "learning_rate": 4.7426774501509275e-05, + "loss": 5.2833, + "step": 24498 + }, + { + "epoch": 0.14570249310115138, + "grad_norm": 1.8782978057861328, + "learning_rate": 4.742656809306902e-05, + "loss": 5.2527, + "step": 24499 + }, + { + "epoch": 0.1457084403844324, + "grad_norm": 1.6581416130065918, + "learning_rate": 4.742636167679988e-05, + "loss": 5.4469, + "step": 24500 + }, + { + "epoch": 0.14571438766771339, + "grad_norm": 1.4809743165969849, + "learning_rate": 4.742615525270193e-05, + "loss": 5.5264, + "step": 24501 + }, + { + "epoch": 0.14572033495099437, + "grad_norm": 1.7145473957061768, + "learning_rate": 4.742594882077523e-05, + "loss": 5.3418, + "step": 24502 + }, + { + "epoch": 0.1457262822342754, + "grad_norm": 1.5335949659347534, + "learning_rate": 4.742574238101988e-05, + "loss": 5.3467, + "step": 24503 + }, + { + "epoch": 0.14573222951755638, + "grad_norm": 1.4682936668395996, + "learning_rate": 4.742553593343593e-05, + "loss": 5.3817, + "step": 24504 + }, + { + "epoch": 0.14573817680083737, + "grad_norm": 1.3231433629989624, + "learning_rate": 4.742532947802345e-05, + "loss": 5.4963, + "step": 24505 + }, + { + "epoch": 0.14574412408411838, + "grad_norm": 1.4141665697097778, + "learning_rate": 4.7425123014782525e-05, + "loss": 5.6261, + "step": 24506 + }, + { + "epoch": 0.14575007136739937, + "grad_norm": 1.5164703130722046, + "learning_rate": 4.742491654371322e-05, + "loss": 5.8411, + "step": 24507 + }, + { + "epoch": 0.14575601865068036, + "grad_norm": 1.309892177581787, + "learning_rate": 4.7424710064815606e-05, + "loss": 5.497, + "step": 24508 + }, + { + "epoch": 0.14576196593396137, + "grad_norm": 1.9315495491027832, + "learning_rate": 4.742450357808976e-05, + "loss": 5.5718, + "step": 24509 + }, + { + "epoch": 0.14576791321724236, + "grad_norm": 1.3881922960281372, + "learning_rate": 4.742429708353575e-05, + "loss": 5.6583, + "step": 24510 + }, + { + "epoch": 0.14577386050052335, + "grad_norm": 1.186221957206726, + "learning_rate": 4.7424090581153654e-05, + "loss": 5.5111, + "step": 24511 + }, + { + "epoch": 0.14577980778380437, + "grad_norm": 1.5839451551437378, + "learning_rate": 4.742388407094354e-05, + "loss": 5.285, + "step": 24512 + }, + { + "epoch": 0.14578575506708535, + "grad_norm": 1.659534215927124, + "learning_rate": 4.7423677552905474e-05, + "loss": 5.2722, + "step": 24513 + }, + { + "epoch": 0.14579170235036634, + "grad_norm": 1.530068278312683, + "learning_rate": 4.742347102703953e-05, + "loss": 5.6943, + "step": 24514 + }, + { + "epoch": 0.14579764963364736, + "grad_norm": 1.966497540473938, + "learning_rate": 4.7423264493345794e-05, + "loss": 5.3509, + "step": 24515 + }, + { + "epoch": 0.14580359691692835, + "grad_norm": 2.2554593086242676, + "learning_rate": 4.7423057951824325e-05, + "loss": 4.8778, + "step": 24516 + }, + { + "epoch": 0.14580954420020933, + "grad_norm": 1.746324062347412, + "learning_rate": 4.7422851402475195e-05, + "loss": 5.2867, + "step": 24517 + }, + { + "epoch": 0.14581549148349035, + "grad_norm": 1.5312012434005737, + "learning_rate": 4.7422644845298484e-05, + "loss": 5.3472, + "step": 24518 + }, + { + "epoch": 0.14582143876677134, + "grad_norm": 1.8742462396621704, + "learning_rate": 4.742243828029426e-05, + "loss": 5.2399, + "step": 24519 + }, + { + "epoch": 0.14582738605005233, + "grad_norm": 1.563302993774414, + "learning_rate": 4.7422231707462585e-05, + "loss": 5.3742, + "step": 24520 + }, + { + "epoch": 0.14583333333333334, + "grad_norm": 1.7737884521484375, + "learning_rate": 4.7422025126803545e-05, + "loss": 5.6674, + "step": 24521 + }, + { + "epoch": 0.14583928061661433, + "grad_norm": 1.9887245893478394, + "learning_rate": 4.742181853831721e-05, + "loss": 5.3851, + "step": 24522 + }, + { + "epoch": 0.14584522789989532, + "grad_norm": 1.773938775062561, + "learning_rate": 4.7421611942003654e-05, + "loss": 5.22, + "step": 24523 + }, + { + "epoch": 0.14585117518317633, + "grad_norm": 1.733723521232605, + "learning_rate": 4.742140533786294e-05, + "loss": 5.0786, + "step": 24524 + }, + { + "epoch": 0.14585712246645732, + "grad_norm": 1.7058782577514648, + "learning_rate": 4.742119872589514e-05, + "loss": 5.214, + "step": 24525 + }, + { + "epoch": 0.1458630697497383, + "grad_norm": 1.7503206729888916, + "learning_rate": 4.742099210610034e-05, + "loss": 5.3132, + "step": 24526 + }, + { + "epoch": 0.14586901703301933, + "grad_norm": 1.9028650522232056, + "learning_rate": 4.7420785478478596e-05, + "loss": 5.3016, + "step": 24527 + }, + { + "epoch": 0.14587496431630032, + "grad_norm": 1.7530872821807861, + "learning_rate": 4.742057884302999e-05, + "loss": 5.199, + "step": 24528 + }, + { + "epoch": 0.1458809115995813, + "grad_norm": 1.8776800632476807, + "learning_rate": 4.7420372199754595e-05, + "loss": 5.0358, + "step": 24529 + }, + { + "epoch": 0.14588685888286232, + "grad_norm": 1.6402316093444824, + "learning_rate": 4.7420165548652474e-05, + "loss": 5.0548, + "step": 24530 + }, + { + "epoch": 0.1458928061661433, + "grad_norm": 1.9277185201644897, + "learning_rate": 4.741995888972371e-05, + "loss": 5.0196, + "step": 24531 + }, + { + "epoch": 0.1458987534494243, + "grad_norm": 1.7798771858215332, + "learning_rate": 4.7419752222968364e-05, + "loss": 5.0015, + "step": 24532 + }, + { + "epoch": 0.1459047007327053, + "grad_norm": 1.6921379566192627, + "learning_rate": 4.741954554838652e-05, + "loss": 5.0044, + "step": 24533 + }, + { + "epoch": 0.1459106480159863, + "grad_norm": 1.5286321640014648, + "learning_rate": 4.741933886597825e-05, + "loss": 5.2836, + "step": 24534 + }, + { + "epoch": 0.1459165952992673, + "grad_norm": 1.5439866781234741, + "learning_rate": 4.741913217574361e-05, + "loss": 5.645, + "step": 24535 + }, + { + "epoch": 0.1459225425825483, + "grad_norm": 1.8537307977676392, + "learning_rate": 4.741892547768269e-05, + "loss": 5.7112, + "step": 24536 + }, + { + "epoch": 0.1459284898658293, + "grad_norm": 1.458747386932373, + "learning_rate": 4.741871877179554e-05, + "loss": 5.3639, + "step": 24537 + }, + { + "epoch": 0.14593443714911028, + "grad_norm": 1.8507471084594727, + "learning_rate": 4.7418512058082255e-05, + "loss": 4.7947, + "step": 24538 + }, + { + "epoch": 0.1459403844323913, + "grad_norm": 1.8104653358459473, + "learning_rate": 4.74183053365429e-05, + "loss": 4.9444, + "step": 24539 + }, + { + "epoch": 0.14594633171567228, + "grad_norm": 1.8392473459243774, + "learning_rate": 4.741809860717755e-05, + "loss": 4.6432, + "step": 24540 + }, + { + "epoch": 0.14595227899895327, + "grad_norm": 1.8322739601135254, + "learning_rate": 4.7417891869986274e-05, + "loss": 4.8165, + "step": 24541 + }, + { + "epoch": 0.1459582262822343, + "grad_norm": 1.7574645280838013, + "learning_rate": 4.741768512496914e-05, + "loss": 4.5592, + "step": 24542 + }, + { + "epoch": 0.14596417356551528, + "grad_norm": 1.6960285902023315, + "learning_rate": 4.7417478372126223e-05, + "loss": 4.8203, + "step": 24543 + }, + { + "epoch": 0.14597012084879626, + "grad_norm": 1.624930739402771, + "learning_rate": 4.741727161145759e-05, + "loss": 4.7056, + "step": 24544 + }, + { + "epoch": 0.14597606813207728, + "grad_norm": 1.6901119947433472, + "learning_rate": 4.741706484296333e-05, + "loss": 4.8837, + "step": 24545 + }, + { + "epoch": 0.14598201541535827, + "grad_norm": 1.6677742004394531, + "learning_rate": 4.74168580666435e-05, + "loss": 5.777, + "step": 24546 + }, + { + "epoch": 0.14598796269863926, + "grad_norm": 1.9622048139572144, + "learning_rate": 4.741665128249818e-05, + "loss": 5.1728, + "step": 24547 + }, + { + "epoch": 0.14599390998192027, + "grad_norm": 2.1024181842803955, + "learning_rate": 4.7416444490527435e-05, + "loss": 5.1417, + "step": 24548 + }, + { + "epoch": 0.14599985726520126, + "grad_norm": 1.9071123600006104, + "learning_rate": 4.7416237690731336e-05, + "loss": 5.1996, + "step": 24549 + }, + { + "epoch": 0.14600580454848225, + "grad_norm": 2.404794931411743, + "learning_rate": 4.741603088310997e-05, + "loss": 5.2283, + "step": 24550 + }, + { + "epoch": 0.14601175183176324, + "grad_norm": 1.6359655857086182, + "learning_rate": 4.74158240676634e-05, + "loss": 5.3233, + "step": 24551 + }, + { + "epoch": 0.14601769911504425, + "grad_norm": 2.5952274799346924, + "learning_rate": 4.7415617244391686e-05, + "loss": 4.9227, + "step": 24552 + }, + { + "epoch": 0.14602364639832524, + "grad_norm": 1.709825038909912, + "learning_rate": 4.7415410413294914e-05, + "loss": 5.2745, + "step": 24553 + }, + { + "epoch": 0.14602959368160623, + "grad_norm": 1.709489345550537, + "learning_rate": 4.741520357437316e-05, + "loss": 5.0694, + "step": 24554 + }, + { + "epoch": 0.14603554096488724, + "grad_norm": 1.6386815309524536, + "learning_rate": 4.7414996727626484e-05, + "loss": 5.1265, + "step": 24555 + }, + { + "epoch": 0.14604148824816823, + "grad_norm": 1.4357349872589111, + "learning_rate": 4.741478987305497e-05, + "loss": 5.149, + "step": 24556 + }, + { + "epoch": 0.14604743553144922, + "grad_norm": 1.951442003250122, + "learning_rate": 4.741458301065868e-05, + "loss": 5.0956, + "step": 24557 + }, + { + "epoch": 0.14605338281473024, + "grad_norm": 2.0688650608062744, + "learning_rate": 4.7414376140437696e-05, + "loss": 4.8894, + "step": 24558 + }, + { + "epoch": 0.14605933009801123, + "grad_norm": 1.6985790729522705, + "learning_rate": 4.741416926239208e-05, + "loss": 4.9548, + "step": 24559 + }, + { + "epoch": 0.1460652773812922, + "grad_norm": 1.5429292917251587, + "learning_rate": 4.7413962376521906e-05, + "loss": 4.9634, + "step": 24560 + }, + { + "epoch": 0.14607122466457323, + "grad_norm": 1.5821011066436768, + "learning_rate": 4.741375548282726e-05, + "loss": 5.3701, + "step": 24561 + }, + { + "epoch": 0.14607717194785422, + "grad_norm": 1.5868496894836426, + "learning_rate": 4.7413548581308196e-05, + "loss": 5.0315, + "step": 24562 + }, + { + "epoch": 0.1460831192311352, + "grad_norm": 1.471294641494751, + "learning_rate": 4.74133416719648e-05, + "loss": 4.9128, + "step": 24563 + }, + { + "epoch": 0.14608906651441622, + "grad_norm": 1.4862011671066284, + "learning_rate": 4.7413134754797126e-05, + "loss": 4.8533, + "step": 24564 + }, + { + "epoch": 0.1460950137976972, + "grad_norm": 1.47359037399292, + "learning_rate": 4.741292782980527e-05, + "loss": 4.8428, + "step": 24565 + }, + { + "epoch": 0.1461009610809782, + "grad_norm": 1.4886908531188965, + "learning_rate": 4.741272089698928e-05, + "loss": 4.8365, + "step": 24566 + }, + { + "epoch": 0.1461069083642592, + "grad_norm": 1.561625599861145, + "learning_rate": 4.741251395634925e-05, + "loss": 4.9553, + "step": 24567 + }, + { + "epoch": 0.1461128556475402, + "grad_norm": 1.5089234113693237, + "learning_rate": 4.741230700788524e-05, + "loss": 4.7997, + "step": 24568 + }, + { + "epoch": 0.1461188029308212, + "grad_norm": 1.5985972881317139, + "learning_rate": 4.741210005159733e-05, + "loss": 4.8006, + "step": 24569 + }, + { + "epoch": 0.1461247502141022, + "grad_norm": 1.5302664041519165, + "learning_rate": 4.741189308748558e-05, + "loss": 4.7809, + "step": 24570 + }, + { + "epoch": 0.1461306974973832, + "grad_norm": 1.5156875848770142, + "learning_rate": 4.7411686115550074e-05, + "loss": 4.6965, + "step": 24571 + }, + { + "epoch": 0.14613664478066418, + "grad_norm": 1.6026439666748047, + "learning_rate": 4.741147913579088e-05, + "loss": 4.9386, + "step": 24572 + }, + { + "epoch": 0.1461425920639452, + "grad_norm": 1.849469542503357, + "learning_rate": 4.7411272148208067e-05, + "loss": 5.7675, + "step": 24573 + }, + { + "epoch": 0.1461485393472262, + "grad_norm": 1.9813694953918457, + "learning_rate": 4.7411065152801716e-05, + "loss": 5.3741, + "step": 24574 + }, + { + "epoch": 0.14615448663050717, + "grad_norm": 2.459035634994507, + "learning_rate": 4.741085814957189e-05, + "loss": 4.6126, + "step": 24575 + }, + { + "epoch": 0.1461604339137882, + "grad_norm": 2.858220100402832, + "learning_rate": 4.741065113851867e-05, + "loss": 4.1891, + "step": 24576 + }, + { + "epoch": 0.14616638119706918, + "grad_norm": 2.2826805114746094, + "learning_rate": 4.741044411964212e-05, + "loss": 4.4009, + "step": 24577 + }, + { + "epoch": 0.14617232848035017, + "grad_norm": 2.0174343585968018, + "learning_rate": 4.741023709294231e-05, + "loss": 4.946, + "step": 24578 + }, + { + "epoch": 0.14617827576363118, + "grad_norm": 2.0307867527008057, + "learning_rate": 4.741003005841932e-05, + "loss": 5.0872, + "step": 24579 + }, + { + "epoch": 0.14618422304691217, + "grad_norm": 2.147662878036499, + "learning_rate": 4.740982301607323e-05, + "loss": 4.648, + "step": 24580 + }, + { + "epoch": 0.14619017033019316, + "grad_norm": 2.7005789279937744, + "learning_rate": 4.740961596590409e-05, + "loss": 5.0555, + "step": 24581 + }, + { + "epoch": 0.14619611761347417, + "grad_norm": 2.3652596473693848, + "learning_rate": 4.740940890791199e-05, + "loss": 4.7969, + "step": 24582 + }, + { + "epoch": 0.14620206489675516, + "grad_norm": 2.5925567150115967, + "learning_rate": 4.7409201842097e-05, + "loss": 4.7544, + "step": 24583 + }, + { + "epoch": 0.14620801218003615, + "grad_norm": 1.9309169054031372, + "learning_rate": 4.740899476845918e-05, + "loss": 5.0901, + "step": 24584 + }, + { + "epoch": 0.14621395946331717, + "grad_norm": 2.6501107215881348, + "learning_rate": 4.740878768699861e-05, + "loss": 5.1449, + "step": 24585 + }, + { + "epoch": 0.14621990674659816, + "grad_norm": 2.3010451793670654, + "learning_rate": 4.7408580597715376e-05, + "loss": 5.276, + "step": 24586 + }, + { + "epoch": 0.14622585402987914, + "grad_norm": 1.8606983423233032, + "learning_rate": 4.740837350060953e-05, + "loss": 5.1453, + "step": 24587 + }, + { + "epoch": 0.14623180131316016, + "grad_norm": 2.0047266483306885, + "learning_rate": 4.740816639568115e-05, + "loss": 4.8976, + "step": 24588 + }, + { + "epoch": 0.14623774859644115, + "grad_norm": 2.4806363582611084, + "learning_rate": 4.740795928293032e-05, + "loss": 4.1182, + "step": 24589 + }, + { + "epoch": 0.14624369587972214, + "grad_norm": 2.560715675354004, + "learning_rate": 4.74077521623571e-05, + "loss": 4.4461, + "step": 24590 + }, + { + "epoch": 0.14624964316300315, + "grad_norm": 2.3709921836853027, + "learning_rate": 4.740754503396156e-05, + "loss": 4.5193, + "step": 24591 + }, + { + "epoch": 0.14625559044628414, + "grad_norm": 2.1095876693725586, + "learning_rate": 4.7407337897743784e-05, + "loss": 4.881, + "step": 24592 + }, + { + "epoch": 0.14626153772956513, + "grad_norm": 1.6448874473571777, + "learning_rate": 4.740713075370383e-05, + "loss": 5.0707, + "step": 24593 + }, + { + "epoch": 0.14626748501284614, + "grad_norm": 1.9237885475158691, + "learning_rate": 4.740692360184178e-05, + "loss": 5.0708, + "step": 24594 + }, + { + "epoch": 0.14627343229612713, + "grad_norm": 1.7685006856918335, + "learning_rate": 4.740671644215771e-05, + "loss": 5.0034, + "step": 24595 + }, + { + "epoch": 0.14627937957940812, + "grad_norm": 1.999850869178772, + "learning_rate": 4.740650927465169e-05, + "loss": 5.1153, + "step": 24596 + }, + { + "epoch": 0.14628532686268914, + "grad_norm": 2.0358314514160156, + "learning_rate": 4.740630209932378e-05, + "loss": 5.0567, + "step": 24597 + }, + { + "epoch": 0.14629127414597012, + "grad_norm": 1.883933424949646, + "learning_rate": 4.740609491617407e-05, + "loss": 5.0562, + "step": 24598 + }, + { + "epoch": 0.1462972214292511, + "grad_norm": 2.0172266960144043, + "learning_rate": 4.740588772520261e-05, + "loss": 5.0597, + "step": 24599 + }, + { + "epoch": 0.14630316871253213, + "grad_norm": 1.798579216003418, + "learning_rate": 4.74056805264095e-05, + "loss": 4.9391, + "step": 24600 + }, + { + "epoch": 0.14630911599581312, + "grad_norm": 1.8433833122253418, + "learning_rate": 4.7405473319794794e-05, + "loss": 5.0088, + "step": 24601 + }, + { + "epoch": 0.1463150632790941, + "grad_norm": 1.7729485034942627, + "learning_rate": 4.7405266105358564e-05, + "loss": 4.8909, + "step": 24602 + }, + { + "epoch": 0.14632101056237512, + "grad_norm": 1.9823477268218994, + "learning_rate": 4.740505888310089e-05, + "loss": 5.0547, + "step": 24603 + }, + { + "epoch": 0.1463269578456561, + "grad_norm": 2.0508856773376465, + "learning_rate": 4.740485165302184e-05, + "loss": 5.0857, + "step": 24604 + }, + { + "epoch": 0.1463329051289371, + "grad_norm": 2.0253899097442627, + "learning_rate": 4.740464441512149e-05, + "loss": 4.9882, + "step": 24605 + }, + { + "epoch": 0.1463388524122181, + "grad_norm": 1.977512001991272, + "learning_rate": 4.740443716939991e-05, + "loss": 4.8881, + "step": 24606 + }, + { + "epoch": 0.1463447996954991, + "grad_norm": 1.8985627889633179, + "learning_rate": 4.7404229915857175e-05, + "loss": 5.0182, + "step": 24607 + }, + { + "epoch": 0.1463507469787801, + "grad_norm": 2.009416103363037, + "learning_rate": 4.7404022654493355e-05, + "loss": 4.7361, + "step": 24608 + }, + { + "epoch": 0.14635669426206108, + "grad_norm": 2.3150322437286377, + "learning_rate": 4.7403815385308514e-05, + "loss": 4.2706, + "step": 24609 + }, + { + "epoch": 0.1463626415453421, + "grad_norm": 2.10493540763855, + "learning_rate": 4.740360810830275e-05, + "loss": 4.2009, + "step": 24610 + }, + { + "epoch": 0.14636858882862308, + "grad_norm": 2.019585132598877, + "learning_rate": 4.7403400823476094e-05, + "loss": 4.2991, + "step": 24611 + }, + { + "epoch": 0.14637453611190407, + "grad_norm": 1.966424584388733, + "learning_rate": 4.740319353082866e-05, + "loss": 5.0383, + "step": 24612 + }, + { + "epoch": 0.14638048339518508, + "grad_norm": 2.048212766647339, + "learning_rate": 4.740298623036049e-05, + "loss": 5.0623, + "step": 24613 + }, + { + "epoch": 0.14638643067846607, + "grad_norm": 2.318051338195801, + "learning_rate": 4.740277892207168e-05, + "loss": 5.7096, + "step": 24614 + }, + { + "epoch": 0.14639237796174706, + "grad_norm": 1.6807061433792114, + "learning_rate": 4.740257160596229e-05, + "loss": 4.9725, + "step": 24615 + }, + { + "epoch": 0.14639832524502808, + "grad_norm": 1.968828558921814, + "learning_rate": 4.7402364282032386e-05, + "loss": 4.9904, + "step": 24616 + }, + { + "epoch": 0.14640427252830907, + "grad_norm": 1.8591229915618896, + "learning_rate": 4.740215695028205e-05, + "loss": 4.9013, + "step": 24617 + }, + { + "epoch": 0.14641021981159005, + "grad_norm": 1.8735779523849487, + "learning_rate": 4.740194961071136e-05, + "loss": 5.0174, + "step": 24618 + }, + { + "epoch": 0.14641616709487107, + "grad_norm": 1.9068244695663452, + "learning_rate": 4.740174226332037e-05, + "loss": 4.9578, + "step": 24619 + }, + { + "epoch": 0.14642211437815206, + "grad_norm": 2.136747360229492, + "learning_rate": 4.740153490810917e-05, + "loss": 4.953, + "step": 24620 + }, + { + "epoch": 0.14642806166143305, + "grad_norm": 2.1197381019592285, + "learning_rate": 4.740132754507782e-05, + "loss": 5.1238, + "step": 24621 + }, + { + "epoch": 0.14643400894471406, + "grad_norm": 1.8754642009735107, + "learning_rate": 4.740112017422641e-05, + "loss": 4.9628, + "step": 24622 + }, + { + "epoch": 0.14643995622799505, + "grad_norm": 1.8816076517105103, + "learning_rate": 4.740091279555499e-05, + "loss": 4.8295, + "step": 24623 + }, + { + "epoch": 0.14644590351127604, + "grad_norm": 1.7956056594848633, + "learning_rate": 4.740070540906365e-05, + "loss": 4.7985, + "step": 24624 + }, + { + "epoch": 0.14645185079455705, + "grad_norm": 2.021692991256714, + "learning_rate": 4.740049801475245e-05, + "loss": 4.9583, + "step": 24625 + }, + { + "epoch": 0.14645779807783804, + "grad_norm": 1.69369637966156, + "learning_rate": 4.7400290612621465e-05, + "loss": 4.9205, + "step": 24626 + }, + { + "epoch": 0.14646374536111903, + "grad_norm": 1.7640669345855713, + "learning_rate": 4.740008320267077e-05, + "loss": 5.0191, + "step": 24627 + }, + { + "epoch": 0.14646969264440005, + "grad_norm": 2.0161068439483643, + "learning_rate": 4.739987578490045e-05, + "loss": 5.1847, + "step": 24628 + }, + { + "epoch": 0.14647563992768103, + "grad_norm": 1.8745818138122559, + "learning_rate": 4.7399668359310555e-05, + "loss": 5.0221, + "step": 24629 + }, + { + "epoch": 0.14648158721096202, + "grad_norm": 1.8857629299163818, + "learning_rate": 4.7399460925901164e-05, + "loss": 5.0957, + "step": 24630 + }, + { + "epoch": 0.14648753449424304, + "grad_norm": 1.7315385341644287, + "learning_rate": 4.739925348467236e-05, + "loss": 5.1935, + "step": 24631 + }, + { + "epoch": 0.14649348177752403, + "grad_norm": 1.968795657157898, + "learning_rate": 4.7399046035624204e-05, + "loss": 5.2074, + "step": 24632 + }, + { + "epoch": 0.14649942906080501, + "grad_norm": 1.889760971069336, + "learning_rate": 4.739883857875677e-05, + "loss": 4.7733, + "step": 24633 + }, + { + "epoch": 0.14650537634408603, + "grad_norm": 1.9310023784637451, + "learning_rate": 4.739863111407013e-05, + "loss": 5.0259, + "step": 24634 + }, + { + "epoch": 0.14651132362736702, + "grad_norm": 1.807829737663269, + "learning_rate": 4.739842364156437e-05, + "loss": 4.8263, + "step": 24635 + }, + { + "epoch": 0.146517270910648, + "grad_norm": 1.8053529262542725, + "learning_rate": 4.739821616123955e-05, + "loss": 4.8213, + "step": 24636 + }, + { + "epoch": 0.14652321819392902, + "grad_norm": 1.9432908296585083, + "learning_rate": 4.739800867309574e-05, + "loss": 4.8625, + "step": 24637 + }, + { + "epoch": 0.14652916547721, + "grad_norm": 1.5960321426391602, + "learning_rate": 4.739780117713302e-05, + "loss": 4.6592, + "step": 24638 + }, + { + "epoch": 0.146535112760491, + "grad_norm": 1.9232900142669678, + "learning_rate": 4.739759367335145e-05, + "loss": 4.8859, + "step": 24639 + }, + { + "epoch": 0.14654106004377201, + "grad_norm": 1.8403369188308716, + "learning_rate": 4.739738616175112e-05, + "loss": 4.7934, + "step": 24640 + }, + { + "epoch": 0.146547007327053, + "grad_norm": 1.6142429113388062, + "learning_rate": 4.7397178642332095e-05, + "loss": 4.7553, + "step": 24641 + }, + { + "epoch": 0.146552954610334, + "grad_norm": 1.7207775115966797, + "learning_rate": 4.7396971115094445e-05, + "loss": 4.5229, + "step": 24642 + }, + { + "epoch": 0.146558901893615, + "grad_norm": 1.651342511177063, + "learning_rate": 4.739676358003824e-05, + "loss": 4.7882, + "step": 24643 + }, + { + "epoch": 0.146564849176896, + "grad_norm": 1.5380842685699463, + "learning_rate": 4.7396556037163556e-05, + "loss": 5.1114, + "step": 24644 + }, + { + "epoch": 0.14657079646017698, + "grad_norm": 1.7868518829345703, + "learning_rate": 4.739634848647047e-05, + "loss": 6.0014, + "step": 24645 + }, + { + "epoch": 0.146576743743458, + "grad_norm": 1.7771759033203125, + "learning_rate": 4.7396140927959045e-05, + "loss": 6.0391, + "step": 24646 + }, + { + "epoch": 0.146582691026739, + "grad_norm": 1.7818456888198853, + "learning_rate": 4.739593336162936e-05, + "loss": 5.431, + "step": 24647 + }, + { + "epoch": 0.14658863831001998, + "grad_norm": 1.6585869789123535, + "learning_rate": 4.7395725787481496e-05, + "loss": 5.4888, + "step": 24648 + }, + { + "epoch": 0.146594585593301, + "grad_norm": 1.448287010192871, + "learning_rate": 4.73955182055155e-05, + "loss": 5.5616, + "step": 24649 + }, + { + "epoch": 0.14660053287658198, + "grad_norm": 1.600519061088562, + "learning_rate": 4.739531061573147e-05, + "loss": 5.4446, + "step": 24650 + }, + { + "epoch": 0.14660648015986297, + "grad_norm": 1.5828067064285278, + "learning_rate": 4.7395103018129464e-05, + "loss": 5.7003, + "step": 24651 + }, + { + "epoch": 0.14661242744314398, + "grad_norm": 2.0968759059906006, + "learning_rate": 4.739489541270956e-05, + "loss": 5.4655, + "step": 24652 + }, + { + "epoch": 0.14661837472642497, + "grad_norm": 2.287879467010498, + "learning_rate": 4.739468779947183e-05, + "loss": 5.182, + "step": 24653 + }, + { + "epoch": 0.14662432200970596, + "grad_norm": 1.9258517026901245, + "learning_rate": 4.7394480178416344e-05, + "loss": 5.6223, + "step": 24654 + }, + { + "epoch": 0.14663026929298698, + "grad_norm": 1.9016472101211548, + "learning_rate": 4.7394272549543183e-05, + "loss": 5.304, + "step": 24655 + }, + { + "epoch": 0.14663621657626796, + "grad_norm": 1.4872523546218872, + "learning_rate": 4.739406491285241e-05, + "loss": 5.4679, + "step": 24656 + }, + { + "epoch": 0.14664216385954895, + "grad_norm": 1.6542940139770508, + "learning_rate": 4.73938572683441e-05, + "loss": 5.4644, + "step": 24657 + }, + { + "epoch": 0.14664811114282997, + "grad_norm": 2.210514545440674, + "learning_rate": 4.739364961601832e-05, + "loss": 4.6455, + "step": 24658 + }, + { + "epoch": 0.14665405842611096, + "grad_norm": 2.3305461406707764, + "learning_rate": 4.739344195587515e-05, + "loss": 4.571, + "step": 24659 + }, + { + "epoch": 0.14666000570939194, + "grad_norm": 2.243680238723755, + "learning_rate": 4.739323428791467e-05, + "loss": 4.5274, + "step": 24660 + }, + { + "epoch": 0.14666595299267296, + "grad_norm": 2.1816461086273193, + "learning_rate": 4.739302661213693e-05, + "loss": 4.4871, + "step": 24661 + }, + { + "epoch": 0.14667190027595395, + "grad_norm": 2.0428659915924072, + "learning_rate": 4.739281892854203e-05, + "loss": 4.3641, + "step": 24662 + }, + { + "epoch": 0.14667784755923494, + "grad_norm": 1.902016043663025, + "learning_rate": 4.739261123713001e-05, + "loss": 4.42, + "step": 24663 + }, + { + "epoch": 0.14668379484251595, + "grad_norm": 2.382110118865967, + "learning_rate": 4.7392403537900974e-05, + "loss": 4.3784, + "step": 24664 + }, + { + "epoch": 0.14668974212579694, + "grad_norm": 2.014251470565796, + "learning_rate": 4.739219583085498e-05, + "loss": 4.583, + "step": 24665 + }, + { + "epoch": 0.14669568940907793, + "grad_norm": 2.268214464187622, + "learning_rate": 4.7391988115992106e-05, + "loss": 4.4803, + "step": 24666 + }, + { + "epoch": 0.14670163669235892, + "grad_norm": 2.19326114654541, + "learning_rate": 4.7391780393312405e-05, + "loss": 4.5751, + "step": 24667 + }, + { + "epoch": 0.14670758397563993, + "grad_norm": 2.1453635692596436, + "learning_rate": 4.739157266281597e-05, + "loss": 4.8723, + "step": 24668 + }, + { + "epoch": 0.14671353125892092, + "grad_norm": 1.788976788520813, + "learning_rate": 4.739136492450288e-05, + "loss": 5.3339, + "step": 24669 + }, + { + "epoch": 0.1467194785422019, + "grad_norm": 2.523129940032959, + "learning_rate": 4.739115717837319e-05, + "loss": 4.314, + "step": 24670 + }, + { + "epoch": 0.14672542582548292, + "grad_norm": 2.2541866302490234, + "learning_rate": 4.739094942442698e-05, + "loss": 4.5228, + "step": 24671 + }, + { + "epoch": 0.1467313731087639, + "grad_norm": 2.5569868087768555, + "learning_rate": 4.739074166266431e-05, + "loss": 4.6268, + "step": 24672 + }, + { + "epoch": 0.1467373203920449, + "grad_norm": 1.9912770986557007, + "learning_rate": 4.739053389308528e-05, + "loss": 4.642, + "step": 24673 + }, + { + "epoch": 0.14674326767532592, + "grad_norm": 1.8588427305221558, + "learning_rate": 4.739032611568993e-05, + "loss": 5.2527, + "step": 24674 + }, + { + "epoch": 0.1467492149586069, + "grad_norm": 1.9020613431930542, + "learning_rate": 4.7390118330478356e-05, + "loss": 5.4926, + "step": 24675 + }, + { + "epoch": 0.1467551622418879, + "grad_norm": 2.319058895111084, + "learning_rate": 4.7389910537450624e-05, + "loss": 5.1275, + "step": 24676 + }, + { + "epoch": 0.1467611095251689, + "grad_norm": 1.7051849365234375, + "learning_rate": 4.7389702736606804e-05, + "loss": 5.599, + "step": 24677 + }, + { + "epoch": 0.1467670568084499, + "grad_norm": 1.7340635061264038, + "learning_rate": 4.738949492794696e-05, + "loss": 5.3359, + "step": 24678 + }, + { + "epoch": 0.14677300409173089, + "grad_norm": 1.5634024143218994, + "learning_rate": 4.738928711147119e-05, + "loss": 5.2585, + "step": 24679 + }, + { + "epoch": 0.1467789513750119, + "grad_norm": 1.559401035308838, + "learning_rate": 4.738907928717955e-05, + "loss": 5.297, + "step": 24680 + }, + { + "epoch": 0.1467848986582929, + "grad_norm": 1.5967936515808105, + "learning_rate": 4.738887145507211e-05, + "loss": 5.2068, + "step": 24681 + }, + { + "epoch": 0.14679084594157388, + "grad_norm": 1.6294320821762085, + "learning_rate": 4.7388663615148945e-05, + "loss": 5.1878, + "step": 24682 + }, + { + "epoch": 0.1467967932248549, + "grad_norm": 1.4520001411437988, + "learning_rate": 4.7388455767410135e-05, + "loss": 5.0777, + "step": 24683 + }, + { + "epoch": 0.14680274050813588, + "grad_norm": 1.3392236232757568, + "learning_rate": 4.738824791185573e-05, + "loss": 5.2396, + "step": 24684 + }, + { + "epoch": 0.14680868779141687, + "grad_norm": 1.467822551727295, + "learning_rate": 4.738804004848584e-05, + "loss": 5.253, + "step": 24685 + }, + { + "epoch": 0.14681463507469789, + "grad_norm": 1.5025224685668945, + "learning_rate": 4.7387832177300504e-05, + "loss": 5.386, + "step": 24686 + }, + { + "epoch": 0.14682058235797887, + "grad_norm": 1.6178737878799438, + "learning_rate": 4.73876242982998e-05, + "loss": 5.2601, + "step": 24687 + }, + { + "epoch": 0.14682652964125986, + "grad_norm": 1.4832427501678467, + "learning_rate": 4.7387416411483825e-05, + "loss": 5.0987, + "step": 24688 + }, + { + "epoch": 0.14683247692454088, + "grad_norm": 1.4726454019546509, + "learning_rate": 4.738720851685263e-05, + "loss": 5.3468, + "step": 24689 + }, + { + "epoch": 0.14683842420782187, + "grad_norm": 1.5659757852554321, + "learning_rate": 4.7387000614406284e-05, + "loss": 5.1591, + "step": 24690 + }, + { + "epoch": 0.14684437149110285, + "grad_norm": 1.7832130193710327, + "learning_rate": 4.7386792704144875e-05, + "loss": 5.126, + "step": 24691 + }, + { + "epoch": 0.14685031877438387, + "grad_norm": 1.6943825483322144, + "learning_rate": 4.738658478606846e-05, + "loss": 5.4705, + "step": 24692 + }, + { + "epoch": 0.14685626605766486, + "grad_norm": 1.4877350330352783, + "learning_rate": 4.738637686017713e-05, + "loss": 5.3479, + "step": 24693 + }, + { + "epoch": 0.14686221334094585, + "grad_norm": 2.306101083755493, + "learning_rate": 4.738616892647094e-05, + "loss": 4.4746, + "step": 24694 + }, + { + "epoch": 0.14686816062422686, + "grad_norm": 2.2277164459228516, + "learning_rate": 4.7385960984949976e-05, + "loss": 4.4995, + "step": 24695 + }, + { + "epoch": 0.14687410790750785, + "grad_norm": 1.535406231880188, + "learning_rate": 4.738575303561429e-05, + "loss": 5.3042, + "step": 24696 + }, + { + "epoch": 0.14688005519078884, + "grad_norm": 1.7974361181259155, + "learning_rate": 4.738554507846398e-05, + "loss": 5.3804, + "step": 24697 + }, + { + "epoch": 0.14688600247406985, + "grad_norm": 1.9455167055130005, + "learning_rate": 4.7385337113499104e-05, + "loss": 4.9782, + "step": 24698 + }, + { + "epoch": 0.14689194975735084, + "grad_norm": 2.486859083175659, + "learning_rate": 4.738512914071974e-05, + "loss": 4.5543, + "step": 24699 + }, + { + "epoch": 0.14689789704063183, + "grad_norm": 2.1134984493255615, + "learning_rate": 4.738492116012596e-05, + "loss": 4.3281, + "step": 24700 + }, + { + "epoch": 0.14690384432391285, + "grad_norm": 2.081852674484253, + "learning_rate": 4.7384713171717833e-05, + "loss": 4.3307, + "step": 24701 + }, + { + "epoch": 0.14690979160719383, + "grad_norm": 2.3121731281280518, + "learning_rate": 4.7384505175495435e-05, + "loss": 4.4791, + "step": 24702 + }, + { + "epoch": 0.14691573889047482, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.738429717145883e-05, + "loss": 4.5165, + "step": 24703 + }, + { + "epoch": 0.14692168617375584, + "grad_norm": 1.7863034009933472, + "learning_rate": 4.7384089159608115e-05, + "loss": 4.8086, + "step": 24704 + }, + { + "epoch": 0.14692763345703683, + "grad_norm": 2.0969200134277344, + "learning_rate": 4.7383881139943335e-05, + "loss": 4.7512, + "step": 24705 + }, + { + "epoch": 0.14693358074031782, + "grad_norm": 1.9164679050445557, + "learning_rate": 4.738367311246458e-05, + "loss": 4.5249, + "step": 24706 + }, + { + "epoch": 0.14693952802359883, + "grad_norm": 1.8215450048446655, + "learning_rate": 4.738346507717191e-05, + "loss": 4.7016, + "step": 24707 + }, + { + "epoch": 0.14694547530687982, + "grad_norm": 1.7830946445465088, + "learning_rate": 4.7383257034065395e-05, + "loss": 4.6173, + "step": 24708 + }, + { + "epoch": 0.1469514225901608, + "grad_norm": 1.7251957654953003, + "learning_rate": 4.7383048983145126e-05, + "loss": 4.9539, + "step": 24709 + }, + { + "epoch": 0.14695736987344182, + "grad_norm": 1.6763554811477661, + "learning_rate": 4.738284092441117e-05, + "loss": 4.8123, + "step": 24710 + }, + { + "epoch": 0.1469633171567228, + "grad_norm": 1.5693418979644775, + "learning_rate": 4.738263285786358e-05, + "loss": 4.6586, + "step": 24711 + }, + { + "epoch": 0.1469692644400038, + "grad_norm": 2.5585360527038574, + "learning_rate": 4.738242478350247e-05, + "loss": 4.2875, + "step": 24712 + }, + { + "epoch": 0.14697521172328482, + "grad_norm": 2.41618275642395, + "learning_rate": 4.738221670132786e-05, + "loss": 4.3448, + "step": 24713 + }, + { + "epoch": 0.1469811590065658, + "grad_norm": 2.233074903488159, + "learning_rate": 4.7382008611339867e-05, + "loss": 5.2453, + "step": 24714 + }, + { + "epoch": 0.1469871062898468, + "grad_norm": 1.7833389043807983, + "learning_rate": 4.738180051353854e-05, + "loss": 4.9964, + "step": 24715 + }, + { + "epoch": 0.1469930535731278, + "grad_norm": 1.7970653772354126, + "learning_rate": 4.738159240792396e-05, + "loss": 4.5124, + "step": 24716 + }, + { + "epoch": 0.1469990008564088, + "grad_norm": 2.1043243408203125, + "learning_rate": 4.738138429449619e-05, + "loss": 4.3681, + "step": 24717 + }, + { + "epoch": 0.14700494813968978, + "grad_norm": 1.5849015712738037, + "learning_rate": 4.738117617325532e-05, + "loss": 4.7756, + "step": 24718 + }, + { + "epoch": 0.1470108954229708, + "grad_norm": 1.5067150592803955, + "learning_rate": 4.73809680442014e-05, + "loss": 4.6255, + "step": 24719 + }, + { + "epoch": 0.1470168427062518, + "grad_norm": 1.5583860874176025, + "learning_rate": 4.7380759907334524e-05, + "loss": 4.7671, + "step": 24720 + }, + { + "epoch": 0.14702278998953278, + "grad_norm": 1.9732975959777832, + "learning_rate": 4.7380551762654755e-05, + "loss": 4.514, + "step": 24721 + }, + { + "epoch": 0.1470287372728138, + "grad_norm": 2.2196953296661377, + "learning_rate": 4.738034361016217e-05, + "loss": 4.2897, + "step": 24722 + }, + { + "epoch": 0.14703468455609478, + "grad_norm": 2.3124115467071533, + "learning_rate": 4.738013544985683e-05, + "loss": 4.2081, + "step": 24723 + }, + { + "epoch": 0.14704063183937577, + "grad_norm": 2.4807839393615723, + "learning_rate": 4.737992728173882e-05, + "loss": 4.5975, + "step": 24724 + }, + { + "epoch": 0.14704657912265678, + "grad_norm": 1.6757773160934448, + "learning_rate": 4.737971910580821e-05, + "loss": 5.6665, + "step": 24725 + }, + { + "epoch": 0.14705252640593777, + "grad_norm": 1.9433516263961792, + "learning_rate": 4.7379510922065074e-05, + "loss": 5.0243, + "step": 24726 + }, + { + "epoch": 0.14705847368921876, + "grad_norm": 2.392778158187866, + "learning_rate": 4.737930273050948e-05, + "loss": 4.7769, + "step": 24727 + }, + { + "epoch": 0.14706442097249975, + "grad_norm": 2.730144739151001, + "learning_rate": 4.73790945311415e-05, + "loss": 4.8214, + "step": 24728 + }, + { + "epoch": 0.14707036825578076, + "grad_norm": 1.9504640102386475, + "learning_rate": 4.7378886323961205e-05, + "loss": 4.8057, + "step": 24729 + }, + { + "epoch": 0.14707631553906175, + "grad_norm": 1.7174079418182373, + "learning_rate": 4.7378678108968675e-05, + "loss": 5.1865, + "step": 24730 + }, + { + "epoch": 0.14708226282234274, + "grad_norm": 2.109645128250122, + "learning_rate": 4.737846988616399e-05, + "loss": 5.1682, + "step": 24731 + }, + { + "epoch": 0.14708821010562376, + "grad_norm": 1.9357048273086548, + "learning_rate": 4.7378261655547204e-05, + "loss": 5.0972, + "step": 24732 + }, + { + "epoch": 0.14709415738890474, + "grad_norm": 1.4660345315933228, + "learning_rate": 4.73780534171184e-05, + "loss": 5.7247, + "step": 24733 + }, + { + "epoch": 0.14710010467218573, + "grad_norm": 1.8927645683288574, + "learning_rate": 4.7377845170877644e-05, + "loss": 5.241, + "step": 24734 + }, + { + "epoch": 0.14710605195546675, + "grad_norm": 1.1164909601211548, + "learning_rate": 4.737763691682502e-05, + "loss": 5.4844, + "step": 24735 + }, + { + "epoch": 0.14711199923874774, + "grad_norm": 1.5676599740982056, + "learning_rate": 4.7377428654960584e-05, + "loss": 5.0659, + "step": 24736 + }, + { + "epoch": 0.14711794652202873, + "grad_norm": 2.404731273651123, + "learning_rate": 4.737722038528443e-05, + "loss": 4.5183, + "step": 24737 + }, + { + "epoch": 0.14712389380530974, + "grad_norm": 1.9689422845840454, + "learning_rate": 4.7377012107796615e-05, + "loss": 4.9564, + "step": 24738 + }, + { + "epoch": 0.14712984108859073, + "grad_norm": 2.320307970046997, + "learning_rate": 4.737680382249721e-05, + "loss": 4.4609, + "step": 24739 + }, + { + "epoch": 0.14713578837187172, + "grad_norm": 1.8649024963378906, + "learning_rate": 4.7376595529386305e-05, + "loss": 4.7436, + "step": 24740 + }, + { + "epoch": 0.14714173565515273, + "grad_norm": 2.112926721572876, + "learning_rate": 4.7376387228463956e-05, + "loss": 4.6949, + "step": 24741 + }, + { + "epoch": 0.14714768293843372, + "grad_norm": 2.237760543823242, + "learning_rate": 4.737617891973024e-05, + "loss": 4.5927, + "step": 24742 + }, + { + "epoch": 0.1471536302217147, + "grad_norm": 2.115577220916748, + "learning_rate": 4.737597060318524e-05, + "loss": 4.4007, + "step": 24743 + }, + { + "epoch": 0.14715957750499573, + "grad_norm": 2.0081801414489746, + "learning_rate": 4.737576227882901e-05, + "loss": 4.3844, + "step": 24744 + }, + { + "epoch": 0.1471655247882767, + "grad_norm": 2.1995346546173096, + "learning_rate": 4.737555394666163e-05, + "loss": 4.4581, + "step": 24745 + }, + { + "epoch": 0.1471714720715577, + "grad_norm": 2.2637784481048584, + "learning_rate": 4.7375345606683184e-05, + "loss": 4.4969, + "step": 24746 + }, + { + "epoch": 0.14717741935483872, + "grad_norm": 2.4739608764648438, + "learning_rate": 4.737513725889373e-05, + "loss": 4.521, + "step": 24747 + }, + { + "epoch": 0.1471833666381197, + "grad_norm": 1.6418421268463135, + "learning_rate": 4.737492890329335e-05, + "loss": 5.1064, + "step": 24748 + }, + { + "epoch": 0.1471893139214007, + "grad_norm": 1.9451549053192139, + "learning_rate": 4.737472053988212e-05, + "loss": 4.6824, + "step": 24749 + }, + { + "epoch": 0.1471952612046817, + "grad_norm": 1.9891009330749512, + "learning_rate": 4.7374512168660094e-05, + "loss": 5.2228, + "step": 24750 + }, + { + "epoch": 0.1472012084879627, + "grad_norm": 2.1582279205322266, + "learning_rate": 4.737430378962736e-05, + "loss": 5.7231, + "step": 24751 + }, + { + "epoch": 0.1472071557712437, + "grad_norm": 1.8569883108139038, + "learning_rate": 4.737409540278399e-05, + "loss": 5.3307, + "step": 24752 + }, + { + "epoch": 0.1472131030545247, + "grad_norm": 1.4937759637832642, + "learning_rate": 4.737388700813006e-05, + "loss": 5.3213, + "step": 24753 + }, + { + "epoch": 0.1472190503378057, + "grad_norm": 1.6692577600479126, + "learning_rate": 4.737367860566563e-05, + "loss": 5.2426, + "step": 24754 + }, + { + "epoch": 0.14722499762108668, + "grad_norm": 2.3550398349761963, + "learning_rate": 4.737347019539078e-05, + "loss": 4.7053, + "step": 24755 + }, + { + "epoch": 0.1472309449043677, + "grad_norm": 2.122601270675659, + "learning_rate": 4.737326177730559e-05, + "loss": 4.9372, + "step": 24756 + }, + { + "epoch": 0.14723689218764868, + "grad_norm": 1.429738163948059, + "learning_rate": 4.737305335141012e-05, + "loss": 4.7637, + "step": 24757 + }, + { + "epoch": 0.14724283947092967, + "grad_norm": 1.6185976266860962, + "learning_rate": 4.7372844917704445e-05, + "loss": 4.6184, + "step": 24758 + }, + { + "epoch": 0.1472487867542107, + "grad_norm": 1.495154619216919, + "learning_rate": 4.737263647618865e-05, + "loss": 4.4256, + "step": 24759 + }, + { + "epoch": 0.14725473403749167, + "grad_norm": 1.366437554359436, + "learning_rate": 4.737242802686279e-05, + "loss": 4.5822, + "step": 24760 + }, + { + "epoch": 0.14726068132077266, + "grad_norm": 2.3462178707122803, + "learning_rate": 4.737221956972695e-05, + "loss": 4.9419, + "step": 24761 + }, + { + "epoch": 0.14726662860405368, + "grad_norm": 2.846083402633667, + "learning_rate": 4.73720111047812e-05, + "loss": 4.6403, + "step": 24762 + }, + { + "epoch": 0.14727257588733467, + "grad_norm": 2.388052463531494, + "learning_rate": 4.7371802632025605e-05, + "loss": 4.5375, + "step": 24763 + }, + { + "epoch": 0.14727852317061566, + "grad_norm": 1.4230948686599731, + "learning_rate": 4.7371594151460254e-05, + "loss": 4.6451, + "step": 24764 + }, + { + "epoch": 0.14728447045389667, + "grad_norm": 1.2602354288101196, + "learning_rate": 4.737138566308521e-05, + "loss": 4.4927, + "step": 24765 + }, + { + "epoch": 0.14729041773717766, + "grad_norm": 1.9645811319351196, + "learning_rate": 4.737117716690054e-05, + "loss": 4.875, + "step": 24766 + }, + { + "epoch": 0.14729636502045865, + "grad_norm": 2.729315757751465, + "learning_rate": 4.7370968662906325e-05, + "loss": 4.0048, + "step": 24767 + }, + { + "epoch": 0.14730231230373966, + "grad_norm": 2.797999382019043, + "learning_rate": 4.7370760151102635e-05, + "loss": 4.3436, + "step": 24768 + }, + { + "epoch": 0.14730825958702065, + "grad_norm": 2.058621406555176, + "learning_rate": 4.737055163148955e-05, + "loss": 4.4137, + "step": 24769 + }, + { + "epoch": 0.14731420687030164, + "grad_norm": 1.9290826320648193, + "learning_rate": 4.737034310406713e-05, + "loss": 4.4751, + "step": 24770 + }, + { + "epoch": 0.14732015415358266, + "grad_norm": 2.316140651702881, + "learning_rate": 4.737013456883546e-05, + "loss": 4.4009, + "step": 24771 + }, + { + "epoch": 0.14732610143686364, + "grad_norm": 2.326529026031494, + "learning_rate": 4.7369926025794606e-05, + "loss": 4.4272, + "step": 24772 + }, + { + "epoch": 0.14733204872014463, + "grad_norm": 2.089818239212036, + "learning_rate": 4.736971747494464e-05, + "loss": 4.4192, + "step": 24773 + }, + { + "epoch": 0.14733799600342565, + "grad_norm": 1.714152455329895, + "learning_rate": 4.736950891628564e-05, + "loss": 5.1404, + "step": 24774 + }, + { + "epoch": 0.14734394328670664, + "grad_norm": 2.01911997795105, + "learning_rate": 4.736930034981767e-05, + "loss": 4.7116, + "step": 24775 + }, + { + "epoch": 0.14734989056998762, + "grad_norm": 2.0275747776031494, + "learning_rate": 4.736909177554081e-05, + "loss": 4.4249, + "step": 24776 + }, + { + "epoch": 0.14735583785326864, + "grad_norm": 1.9515576362609863, + "learning_rate": 4.7368883193455135e-05, + "loss": 4.3968, + "step": 24777 + }, + { + "epoch": 0.14736178513654963, + "grad_norm": 1.6079367399215698, + "learning_rate": 4.736867460356071e-05, + "loss": 4.3927, + "step": 24778 + }, + { + "epoch": 0.14736773241983062, + "grad_norm": 1.856449842453003, + "learning_rate": 4.736846600585761e-05, + "loss": 4.4231, + "step": 24779 + }, + { + "epoch": 0.14737367970311163, + "grad_norm": 1.7405143976211548, + "learning_rate": 4.7368257400345915e-05, + "loss": 5.4894, + "step": 24780 + }, + { + "epoch": 0.14737962698639262, + "grad_norm": 1.6344300508499146, + "learning_rate": 4.736804878702569e-05, + "loss": 5.5489, + "step": 24781 + }, + { + "epoch": 0.1473855742696736, + "grad_norm": 1.693015694618225, + "learning_rate": 4.7367840165897014e-05, + "loss": 5.6432, + "step": 24782 + }, + { + "epoch": 0.14739152155295462, + "grad_norm": 1.5487139225006104, + "learning_rate": 4.736763153695995e-05, + "loss": 4.6316, + "step": 24783 + }, + { + "epoch": 0.1473974688362356, + "grad_norm": 1.5867420434951782, + "learning_rate": 4.736742290021458e-05, + "loss": 4.3782, + "step": 24784 + }, + { + "epoch": 0.1474034161195166, + "grad_norm": 1.7892907857894897, + "learning_rate": 4.736721425566097e-05, + "loss": 4.413, + "step": 24785 + }, + { + "epoch": 0.1474093634027976, + "grad_norm": 1.7791600227355957, + "learning_rate": 4.7367005603299206e-05, + "loss": 4.9471, + "step": 24786 + }, + { + "epoch": 0.1474153106860786, + "grad_norm": 1.5871254205703735, + "learning_rate": 4.736679694312934e-05, + "loss": 5.6475, + "step": 24787 + }, + { + "epoch": 0.1474212579693596, + "grad_norm": 1.5154014825820923, + "learning_rate": 4.7366588275151465e-05, + "loss": 5.6038, + "step": 24788 + }, + { + "epoch": 0.14742720525264058, + "grad_norm": 1.4058479070663452, + "learning_rate": 4.736637959936564e-05, + "loss": 5.4371, + "step": 24789 + }, + { + "epoch": 0.1474331525359216, + "grad_norm": 1.5023268461227417, + "learning_rate": 4.7366170915771946e-05, + "loss": 5.6043, + "step": 24790 + }, + { + "epoch": 0.14743909981920258, + "grad_norm": 1.573081135749817, + "learning_rate": 4.7365962224370445e-05, + "loss": 4.6014, + "step": 24791 + }, + { + "epoch": 0.14744504710248357, + "grad_norm": 1.413909673690796, + "learning_rate": 4.7365753525161225e-05, + "loss": 5.1478, + "step": 24792 + }, + { + "epoch": 0.1474509943857646, + "grad_norm": 1.6636765003204346, + "learning_rate": 4.736554481814435e-05, + "loss": 5.3099, + "step": 24793 + }, + { + "epoch": 0.14745694166904558, + "grad_norm": 1.4575749635696411, + "learning_rate": 4.7365336103319904e-05, + "loss": 4.7067, + "step": 24794 + }, + { + "epoch": 0.14746288895232657, + "grad_norm": 1.4840314388275146, + "learning_rate": 4.736512738068793e-05, + "loss": 5.3591, + "step": 24795 + }, + { + "epoch": 0.14746883623560758, + "grad_norm": 1.8716658353805542, + "learning_rate": 4.736491865024853e-05, + "loss": 4.9905, + "step": 24796 + }, + { + "epoch": 0.14747478351888857, + "grad_norm": 1.5661007165908813, + "learning_rate": 4.736470991200178e-05, + "loss": 5.5725, + "step": 24797 + }, + { + "epoch": 0.14748073080216956, + "grad_norm": 1.7020787000656128, + "learning_rate": 4.736450116594773e-05, + "loss": 4.97, + "step": 24798 + }, + { + "epoch": 0.14748667808545057, + "grad_norm": 1.7010732889175415, + "learning_rate": 4.736429241208646e-05, + "loss": 5.0832, + "step": 24799 + }, + { + "epoch": 0.14749262536873156, + "grad_norm": 2.984389305114746, + "learning_rate": 4.7364083650418057e-05, + "loss": 4.5466, + "step": 24800 + }, + { + "epoch": 0.14749857265201255, + "grad_norm": 1.8300197124481201, + "learning_rate": 4.7363874880942574e-05, + "loss": 4.9772, + "step": 24801 + }, + { + "epoch": 0.14750451993529357, + "grad_norm": 1.685394048690796, + "learning_rate": 4.73636661036601e-05, + "loss": 5.0689, + "step": 24802 + }, + { + "epoch": 0.14751046721857455, + "grad_norm": 1.559996485710144, + "learning_rate": 4.7363457318570695e-05, + "loss": 5.1496, + "step": 24803 + }, + { + "epoch": 0.14751641450185554, + "grad_norm": 1.5654375553131104, + "learning_rate": 4.736324852567444e-05, + "loss": 5.1427, + "step": 24804 + }, + { + "epoch": 0.14752236178513656, + "grad_norm": 2.0388715267181396, + "learning_rate": 4.736303972497141e-05, + "loss": 4.6176, + "step": 24805 + }, + { + "epoch": 0.14752830906841755, + "grad_norm": 2.139695882797241, + "learning_rate": 4.736283091646167e-05, + "loss": 4.7746, + "step": 24806 + }, + { + "epoch": 0.14753425635169853, + "grad_norm": 1.6551018953323364, + "learning_rate": 4.73626221001453e-05, + "loss": 5.3522, + "step": 24807 + }, + { + "epoch": 0.14754020363497955, + "grad_norm": 1.6643954515457153, + "learning_rate": 4.7362413276022364e-05, + "loss": 5.5479, + "step": 24808 + }, + { + "epoch": 0.14754615091826054, + "grad_norm": 1.6942282915115356, + "learning_rate": 4.7362204444092947e-05, + "loss": 5.2971, + "step": 24809 + }, + { + "epoch": 0.14755209820154153, + "grad_norm": 2.1273419857025146, + "learning_rate": 4.736199560435711e-05, + "loss": 5.1465, + "step": 24810 + }, + { + "epoch": 0.14755804548482254, + "grad_norm": 2.1430892944335938, + "learning_rate": 4.736178675681493e-05, + "loss": 4.9944, + "step": 24811 + }, + { + "epoch": 0.14756399276810353, + "grad_norm": 2.1971189975738525, + "learning_rate": 4.736157790146649e-05, + "loss": 5.2348, + "step": 24812 + }, + { + "epoch": 0.14756994005138452, + "grad_norm": 1.7993513345718384, + "learning_rate": 4.7361369038311855e-05, + "loss": 5.0186, + "step": 24813 + }, + { + "epoch": 0.14757588733466553, + "grad_norm": 1.8296352624893188, + "learning_rate": 4.7361160167351085e-05, + "loss": 4.9939, + "step": 24814 + }, + { + "epoch": 0.14758183461794652, + "grad_norm": 1.6994922161102295, + "learning_rate": 4.7360951288584276e-05, + "loss": 5.0838, + "step": 24815 + }, + { + "epoch": 0.1475877819012275, + "grad_norm": 1.8526664972305298, + "learning_rate": 4.736074240201148e-05, + "loss": 4.9977, + "step": 24816 + }, + { + "epoch": 0.14759372918450853, + "grad_norm": 1.6255830526351929, + "learning_rate": 4.736053350763279e-05, + "loss": 5.111, + "step": 24817 + }, + { + "epoch": 0.14759967646778951, + "grad_norm": 1.6871737241744995, + "learning_rate": 4.736032460544826e-05, + "loss": 4.8522, + "step": 24818 + }, + { + "epoch": 0.1476056237510705, + "grad_norm": 1.8430577516555786, + "learning_rate": 4.7360115695457975e-05, + "loss": 4.9312, + "step": 24819 + }, + { + "epoch": 0.14761157103435152, + "grad_norm": 1.6737143993377686, + "learning_rate": 4.735990677766201e-05, + "loss": 4.7894, + "step": 24820 + }, + { + "epoch": 0.1476175183176325, + "grad_norm": 1.648138403892517, + "learning_rate": 4.7359697852060425e-05, + "loss": 4.8173, + "step": 24821 + }, + { + "epoch": 0.1476234656009135, + "grad_norm": 1.8230416774749756, + "learning_rate": 4.73594889186533e-05, + "loss": 5.0618, + "step": 24822 + }, + { + "epoch": 0.1476294128841945, + "grad_norm": 1.928932547569275, + "learning_rate": 4.735927997744072e-05, + "loss": 4.8846, + "step": 24823 + }, + { + "epoch": 0.1476353601674755, + "grad_norm": 1.8593389987945557, + "learning_rate": 4.735907102842273e-05, + "loss": 5.0283, + "step": 24824 + }, + { + "epoch": 0.1476413074507565, + "grad_norm": 1.988168478012085, + "learning_rate": 4.735886207159943e-05, + "loss": 5.0253, + "step": 24825 + }, + { + "epoch": 0.1476472547340375, + "grad_norm": 1.6367772817611694, + "learning_rate": 4.7358653106970885e-05, + "loss": 4.9296, + "step": 24826 + }, + { + "epoch": 0.1476532020173185, + "grad_norm": 1.7799687385559082, + "learning_rate": 4.7358444134537154e-05, + "loss": 4.5257, + "step": 24827 + }, + { + "epoch": 0.14765914930059948, + "grad_norm": 1.8706213235855103, + "learning_rate": 4.735823515429833e-05, + "loss": 4.9739, + "step": 24828 + }, + { + "epoch": 0.1476650965838805, + "grad_norm": 1.7662311792373657, + "learning_rate": 4.7358026166254476e-05, + "loss": 4.9545, + "step": 24829 + }, + { + "epoch": 0.14767104386716148, + "grad_norm": 1.6466079950332642, + "learning_rate": 4.7357817170405664e-05, + "loss": 4.8203, + "step": 24830 + }, + { + "epoch": 0.14767699115044247, + "grad_norm": 1.7296116352081299, + "learning_rate": 4.7357608166751965e-05, + "loss": 4.7575, + "step": 24831 + }, + { + "epoch": 0.1476829384337235, + "grad_norm": 1.6118981838226318, + "learning_rate": 4.735739915529346e-05, + "loss": 4.6546, + "step": 24832 + }, + { + "epoch": 0.14768888571700448, + "grad_norm": 1.7108652591705322, + "learning_rate": 4.735719013603022e-05, + "loss": 5.5278, + "step": 24833 + }, + { + "epoch": 0.14769483300028546, + "grad_norm": 1.583243727684021, + "learning_rate": 4.735698110896232e-05, + "loss": 5.5526, + "step": 24834 + }, + { + "epoch": 0.14770078028356648, + "grad_norm": 1.9354965686798096, + "learning_rate": 4.735677207408982e-05, + "loss": 4.9137, + "step": 24835 + }, + { + "epoch": 0.14770672756684747, + "grad_norm": 2.2551913261413574, + "learning_rate": 4.7356563031412805e-05, + "loss": 5.105, + "step": 24836 + }, + { + "epoch": 0.14771267485012846, + "grad_norm": 1.8324413299560547, + "learning_rate": 4.7356353980931344e-05, + "loss": 5.1002, + "step": 24837 + }, + { + "epoch": 0.14771862213340947, + "grad_norm": 1.7993746995925903, + "learning_rate": 4.7356144922645504e-05, + "loss": 5.0061, + "step": 24838 + }, + { + "epoch": 0.14772456941669046, + "grad_norm": 1.6633015871047974, + "learning_rate": 4.735593585655538e-05, + "loss": 5.6399, + "step": 24839 + }, + { + "epoch": 0.14773051669997145, + "grad_norm": 1.6153156757354736, + "learning_rate": 4.735572678266102e-05, + "loss": 5.845, + "step": 24840 + }, + { + "epoch": 0.14773646398325246, + "grad_norm": 1.5680739879608154, + "learning_rate": 4.7355517700962506e-05, + "loss": 4.9451, + "step": 24841 + }, + { + "epoch": 0.14774241126653345, + "grad_norm": 1.7775828838348389, + "learning_rate": 4.735530861145992e-05, + "loss": 5.3363, + "step": 24842 + }, + { + "epoch": 0.14774835854981444, + "grad_norm": 1.5199836492538452, + "learning_rate": 4.7355099514153316e-05, + "loss": 5.2147, + "step": 24843 + }, + { + "epoch": 0.14775430583309543, + "grad_norm": 1.5332800149917603, + "learning_rate": 4.7354890409042783e-05, + "loss": 5.2439, + "step": 24844 + }, + { + "epoch": 0.14776025311637644, + "grad_norm": 2.0724799633026123, + "learning_rate": 4.735468129612839e-05, + "loss": 5.0292, + "step": 24845 + }, + { + "epoch": 0.14776620039965743, + "grad_norm": 2.5946760177612305, + "learning_rate": 4.73544721754102e-05, + "loss": 4.973, + "step": 24846 + }, + { + "epoch": 0.14777214768293842, + "grad_norm": 1.9194954633712769, + "learning_rate": 4.735426304688831e-05, + "loss": 4.7452, + "step": 24847 + }, + { + "epoch": 0.14777809496621944, + "grad_norm": 1.38433039188385, + "learning_rate": 4.735405391056277e-05, + "loss": 5.5551, + "step": 24848 + }, + { + "epoch": 0.14778404224950042, + "grad_norm": 1.8728227615356445, + "learning_rate": 4.735384476643366e-05, + "loss": 5.3088, + "step": 24849 + }, + { + "epoch": 0.1477899895327814, + "grad_norm": 1.6192907094955444, + "learning_rate": 4.7353635614501054e-05, + "loss": 5.3365, + "step": 24850 + }, + { + "epoch": 0.14779593681606243, + "grad_norm": 1.4671828746795654, + "learning_rate": 4.735342645476503e-05, + "loss": 5.5339, + "step": 24851 + }, + { + "epoch": 0.14780188409934342, + "grad_norm": 1.924024224281311, + "learning_rate": 4.7353217287225646e-05, + "loss": 5.2287, + "step": 24852 + }, + { + "epoch": 0.1478078313826244, + "grad_norm": 1.6585190296173096, + "learning_rate": 4.735300811188299e-05, + "loss": 5.124, + "step": 24853 + }, + { + "epoch": 0.14781377866590542, + "grad_norm": 1.6820423603057861, + "learning_rate": 4.735279892873713e-05, + "loss": 5.4088, + "step": 24854 + }, + { + "epoch": 0.1478197259491864, + "grad_norm": 1.5978790521621704, + "learning_rate": 4.7352589737788134e-05, + "loss": 5.8087, + "step": 24855 + }, + { + "epoch": 0.1478256732324674, + "grad_norm": 1.6521705389022827, + "learning_rate": 4.735238053903609e-05, + "loss": 5.2014, + "step": 24856 + }, + { + "epoch": 0.1478316205157484, + "grad_norm": 1.6667120456695557, + "learning_rate": 4.7352171332481056e-05, + "loss": 5.1015, + "step": 24857 + }, + { + "epoch": 0.1478375677990294, + "grad_norm": 1.7318087816238403, + "learning_rate": 4.735196211812311e-05, + "loss": 5.4063, + "step": 24858 + }, + { + "epoch": 0.1478435150823104, + "grad_norm": 1.7706724405288696, + "learning_rate": 4.735175289596232e-05, + "loss": 5.0941, + "step": 24859 + }, + { + "epoch": 0.1478494623655914, + "grad_norm": 1.5582432746887207, + "learning_rate": 4.7351543665998764e-05, + "loss": 5.2643, + "step": 24860 + }, + { + "epoch": 0.1478554096488724, + "grad_norm": 1.5588469505310059, + "learning_rate": 4.735133442823252e-05, + "loss": 5.5234, + "step": 24861 + }, + { + "epoch": 0.14786135693215338, + "grad_norm": 2.5532615184783936, + "learning_rate": 4.735112518266366e-05, + "loss": 4.5405, + "step": 24862 + }, + { + "epoch": 0.1478673042154344, + "grad_norm": 1.5495831966400146, + "learning_rate": 4.735091592929224e-05, + "loss": 5.5153, + "step": 24863 + }, + { + "epoch": 0.14787325149871539, + "grad_norm": 1.4878839254379272, + "learning_rate": 4.7350706668118356e-05, + "loss": 5.2186, + "step": 24864 + }, + { + "epoch": 0.14787919878199637, + "grad_norm": 1.4914618730545044, + "learning_rate": 4.735049739914207e-05, + "loss": 5.3108, + "step": 24865 + }, + { + "epoch": 0.1478851460652774, + "grad_norm": 1.6413542032241821, + "learning_rate": 4.735028812236345e-05, + "loss": 5.2726, + "step": 24866 + }, + { + "epoch": 0.14789109334855838, + "grad_norm": 1.6650172472000122, + "learning_rate": 4.735007883778259e-05, + "loss": 5.3186, + "step": 24867 + }, + { + "epoch": 0.14789704063183937, + "grad_norm": 1.5289151668548584, + "learning_rate": 4.734986954539954e-05, + "loss": 5.1124, + "step": 24868 + }, + { + "epoch": 0.14790298791512038, + "grad_norm": 1.5151697397232056, + "learning_rate": 4.734966024521438e-05, + "loss": 5.495, + "step": 24869 + }, + { + "epoch": 0.14790893519840137, + "grad_norm": 1.3832122087478638, + "learning_rate": 4.734945093722718e-05, + "loss": 5.426, + "step": 24870 + }, + { + "epoch": 0.14791488248168236, + "grad_norm": 1.6117453575134277, + "learning_rate": 4.7349241621438023e-05, + "loss": 5.2548, + "step": 24871 + }, + { + "epoch": 0.14792082976496337, + "grad_norm": 1.5391991138458252, + "learning_rate": 4.734903229784698e-05, + "loss": 4.7025, + "step": 24872 + }, + { + "epoch": 0.14792677704824436, + "grad_norm": 1.649274468421936, + "learning_rate": 4.734882296645411e-05, + "loss": 5.4152, + "step": 24873 + }, + { + "epoch": 0.14793272433152535, + "grad_norm": 1.7147942781448364, + "learning_rate": 4.734861362725951e-05, + "loss": 5.4865, + "step": 24874 + }, + { + "epoch": 0.14793867161480637, + "grad_norm": 1.4434807300567627, + "learning_rate": 4.734840428026324e-05, + "loss": 5.5211, + "step": 24875 + }, + { + "epoch": 0.14794461889808735, + "grad_norm": 1.4886515140533447, + "learning_rate": 4.7348194925465364e-05, + "loss": 5.197, + "step": 24876 + }, + { + "epoch": 0.14795056618136834, + "grad_norm": 1.3683615922927856, + "learning_rate": 4.734798556286596e-05, + "loss": 4.9886, + "step": 24877 + }, + { + "epoch": 0.14795651346464936, + "grad_norm": 1.4986892938613892, + "learning_rate": 4.734777619246512e-05, + "loss": 5.0067, + "step": 24878 + }, + { + "epoch": 0.14796246074793035, + "grad_norm": 1.8438472747802734, + "learning_rate": 4.734756681426289e-05, + "loss": 5.2865, + "step": 24879 + }, + { + "epoch": 0.14796840803121133, + "grad_norm": 1.710975170135498, + "learning_rate": 4.734735742825935e-05, + "loss": 5.1215, + "step": 24880 + }, + { + "epoch": 0.14797435531449235, + "grad_norm": 2.074619770050049, + "learning_rate": 4.7347148034454594e-05, + "loss": 4.5968, + "step": 24881 + }, + { + "epoch": 0.14798030259777334, + "grad_norm": 2.5662643909454346, + "learning_rate": 4.7346938632848676e-05, + "loss": 4.3404, + "step": 24882 + }, + { + "epoch": 0.14798624988105433, + "grad_norm": 1.6698600053787231, + "learning_rate": 4.7346729223441665e-05, + "loss": 5.2027, + "step": 24883 + }, + { + "epoch": 0.14799219716433534, + "grad_norm": 2.1604435443878174, + "learning_rate": 4.7346519806233644e-05, + "loss": 4.4595, + "step": 24884 + }, + { + "epoch": 0.14799814444761633, + "grad_norm": 2.7507572174072266, + "learning_rate": 4.734631038122469e-05, + "loss": 3.1764, + "step": 24885 + }, + { + "epoch": 0.14800409173089732, + "grad_norm": 2.8016562461853027, + "learning_rate": 4.734610094841487e-05, + "loss": 3.8763, + "step": 24886 + }, + { + "epoch": 0.14801003901417834, + "grad_norm": 2.9202160835266113, + "learning_rate": 4.7345891507804253e-05, + "loss": 3.6681, + "step": 24887 + }, + { + "epoch": 0.14801598629745932, + "grad_norm": 3.071167230606079, + "learning_rate": 4.7345682059392914e-05, + "loss": 3.027, + "step": 24888 + }, + { + "epoch": 0.1480219335807403, + "grad_norm": 2.7173242568969727, + "learning_rate": 4.734547260318093e-05, + "loss": 3.3615, + "step": 24889 + }, + { + "epoch": 0.14802788086402133, + "grad_norm": 2.1972641944885254, + "learning_rate": 4.7345263139168375e-05, + "loss": 4.8097, + "step": 24890 + }, + { + "epoch": 0.14803382814730232, + "grad_norm": 2.031700849533081, + "learning_rate": 4.7345053667355324e-05, + "loss": 5.1153, + "step": 24891 + }, + { + "epoch": 0.1480397754305833, + "grad_norm": 2.627568483352661, + "learning_rate": 4.734484418774183e-05, + "loss": 4.3777, + "step": 24892 + }, + { + "epoch": 0.14804572271386432, + "grad_norm": 2.2821667194366455, + "learning_rate": 4.734463470032799e-05, + "loss": 4.4845, + "step": 24893 + }, + { + "epoch": 0.1480516699971453, + "grad_norm": 1.8525490760803223, + "learning_rate": 4.7344425205113875e-05, + "loss": 5.4187, + "step": 24894 + }, + { + "epoch": 0.1480576172804263, + "grad_norm": 2.0583372116088867, + "learning_rate": 4.7344215702099546e-05, + "loss": 4.4807, + "step": 24895 + }, + { + "epoch": 0.1480635645637073, + "grad_norm": 1.7403303384780884, + "learning_rate": 4.734400619128509e-05, + "loss": 5.5355, + "step": 24896 + }, + { + "epoch": 0.1480695118469883, + "grad_norm": 2.953425645828247, + "learning_rate": 4.734379667267056e-05, + "loss": 4.0136, + "step": 24897 + }, + { + "epoch": 0.1480754591302693, + "grad_norm": 2.8318042755126953, + "learning_rate": 4.7343587146256044e-05, + "loss": 3.5818, + "step": 24898 + }, + { + "epoch": 0.1480814064135503, + "grad_norm": 1.6144517660140991, + "learning_rate": 4.7343377612041615e-05, + "loss": 4.789, + "step": 24899 + }, + { + "epoch": 0.1480873536968313, + "grad_norm": 1.639545202255249, + "learning_rate": 4.734316807002734e-05, + "loss": 5.1812, + "step": 24900 + }, + { + "epoch": 0.14809330098011228, + "grad_norm": 1.7593424320220947, + "learning_rate": 4.734295852021331e-05, + "loss": 5.0547, + "step": 24901 + }, + { + "epoch": 0.14809924826339327, + "grad_norm": 1.6794737577438354, + "learning_rate": 4.734274896259957e-05, + "loss": 5.125, + "step": 24902 + }, + { + "epoch": 0.14810519554667428, + "grad_norm": 1.5941787958145142, + "learning_rate": 4.734253939718621e-05, + "loss": 5.0559, + "step": 24903 + }, + { + "epoch": 0.14811114282995527, + "grad_norm": 1.9701952934265137, + "learning_rate": 4.7342329823973304e-05, + "loss": 4.7468, + "step": 24904 + }, + { + "epoch": 0.14811709011323626, + "grad_norm": 1.8744746446609497, + "learning_rate": 4.734212024296092e-05, + "loss": 5.2544, + "step": 24905 + }, + { + "epoch": 0.14812303739651728, + "grad_norm": 1.5343592166900635, + "learning_rate": 4.734191065414913e-05, + "loss": 5.1794, + "step": 24906 + }, + { + "epoch": 0.14812898467979826, + "grad_norm": 1.509623408317566, + "learning_rate": 4.734170105753801e-05, + "loss": 5.4512, + "step": 24907 + }, + { + "epoch": 0.14813493196307925, + "grad_norm": 1.4235179424285889, + "learning_rate": 4.734149145312764e-05, + "loss": 5.4535, + "step": 24908 + }, + { + "epoch": 0.14814087924636027, + "grad_norm": 1.4011653661727905, + "learning_rate": 4.7341281840918076e-05, + "loss": 5.4248, + "step": 24909 + }, + { + "epoch": 0.14814682652964126, + "grad_norm": 1.3742294311523438, + "learning_rate": 4.734107222090941e-05, + "loss": 5.3076, + "step": 24910 + }, + { + "epoch": 0.14815277381292224, + "grad_norm": 1.4808472394943237, + "learning_rate": 4.73408625931017e-05, + "loss": 5.4432, + "step": 24911 + }, + { + "epoch": 0.14815872109620326, + "grad_norm": 1.3847295045852661, + "learning_rate": 4.734065295749502e-05, + "loss": 5.4678, + "step": 24912 + }, + { + "epoch": 0.14816466837948425, + "grad_norm": 1.4962565898895264, + "learning_rate": 4.734044331408947e-05, + "loss": 5.6803, + "step": 24913 + }, + { + "epoch": 0.14817061566276524, + "grad_norm": 1.7258118391036987, + "learning_rate": 4.734023366288508e-05, + "loss": 4.933, + "step": 24914 + }, + { + "epoch": 0.14817656294604625, + "grad_norm": 1.7875369787216187, + "learning_rate": 4.7340024003881955e-05, + "loss": 4.9978, + "step": 24915 + }, + { + "epoch": 0.14818251022932724, + "grad_norm": 1.5841879844665527, + "learning_rate": 4.733981433708016e-05, + "loss": 5.1718, + "step": 24916 + }, + { + "epoch": 0.14818845751260823, + "grad_norm": 1.4346718788146973, + "learning_rate": 4.733960466247976e-05, + "loss": 4.6579, + "step": 24917 + }, + { + "epoch": 0.14819440479588925, + "grad_norm": 1.4387844800949097, + "learning_rate": 4.7339394980080844e-05, + "loss": 5.012, + "step": 24918 + }, + { + "epoch": 0.14820035207917023, + "grad_norm": 1.7081257104873657, + "learning_rate": 4.733918528988347e-05, + "loss": 5.4316, + "step": 24919 + }, + { + "epoch": 0.14820629936245122, + "grad_norm": 1.7600195407867432, + "learning_rate": 4.733897559188771e-05, + "loss": 5.309, + "step": 24920 + }, + { + "epoch": 0.14821224664573224, + "grad_norm": 1.7399616241455078, + "learning_rate": 4.733876588609366e-05, + "loss": 5.1796, + "step": 24921 + }, + { + "epoch": 0.14821819392901323, + "grad_norm": 1.7843348979949951, + "learning_rate": 4.733855617250137e-05, + "loss": 5.0371, + "step": 24922 + }, + { + "epoch": 0.1482241412122942, + "grad_norm": 1.6706308126449585, + "learning_rate": 4.733834645111092e-05, + "loss": 5.1058, + "step": 24923 + }, + { + "epoch": 0.14823008849557523, + "grad_norm": 2.6056525707244873, + "learning_rate": 4.733813672192239e-05, + "loss": 4.5804, + "step": 24924 + }, + { + "epoch": 0.14823603577885622, + "grad_norm": 1.836887001991272, + "learning_rate": 4.733792698493584e-05, + "loss": 5.0871, + "step": 24925 + }, + { + "epoch": 0.1482419830621372, + "grad_norm": 1.8913605213165283, + "learning_rate": 4.733771724015135e-05, + "loss": 5.4228, + "step": 24926 + }, + { + "epoch": 0.14824793034541822, + "grad_norm": 1.7032699584960938, + "learning_rate": 4.7337507487569e-05, + "loss": 5.5599, + "step": 24927 + }, + { + "epoch": 0.1482538776286992, + "grad_norm": 1.6115164756774902, + "learning_rate": 4.733729772718885e-05, + "loss": 5.5348, + "step": 24928 + }, + { + "epoch": 0.1482598249119802, + "grad_norm": 1.563080906867981, + "learning_rate": 4.733708795901098e-05, + "loss": 5.4334, + "step": 24929 + }, + { + "epoch": 0.14826577219526121, + "grad_norm": 1.6452966928482056, + "learning_rate": 4.733687818303547e-05, + "loss": 5.7378, + "step": 24930 + }, + { + "epoch": 0.1482717194785422, + "grad_norm": 1.602687120437622, + "learning_rate": 4.7336668399262386e-05, + "loss": 5.7311, + "step": 24931 + }, + { + "epoch": 0.1482776667618232, + "grad_norm": 1.6656992435455322, + "learning_rate": 4.73364586076918e-05, + "loss": 5.3285, + "step": 24932 + }, + { + "epoch": 0.1482836140451042, + "grad_norm": 2.0401406288146973, + "learning_rate": 4.7336248808323786e-05, + "loss": 4.9655, + "step": 24933 + }, + { + "epoch": 0.1482895613283852, + "grad_norm": 2.536595582962036, + "learning_rate": 4.733603900115842e-05, + "loss": 4.6622, + "step": 24934 + }, + { + "epoch": 0.14829550861166618, + "grad_norm": 1.5609594583511353, + "learning_rate": 4.7335829186195766e-05, + "loss": 5.2326, + "step": 24935 + }, + { + "epoch": 0.1483014558949472, + "grad_norm": 1.6761829853057861, + "learning_rate": 4.733561936343591e-05, + "loss": 5.4059, + "step": 24936 + }, + { + "epoch": 0.1483074031782282, + "grad_norm": 1.1501821279525757, + "learning_rate": 4.733540953287893e-05, + "loss": 4.8906, + "step": 24937 + }, + { + "epoch": 0.14831335046150917, + "grad_norm": 1.6217314004898071, + "learning_rate": 4.733519969452488e-05, + "loss": 4.8381, + "step": 24938 + }, + { + "epoch": 0.1483192977447902, + "grad_norm": 1.8240901231765747, + "learning_rate": 4.733498984837384e-05, + "loss": 5.4137, + "step": 24939 + }, + { + "epoch": 0.14832524502807118, + "grad_norm": 1.7012525796890259, + "learning_rate": 4.733477999442589e-05, + "loss": 5.4581, + "step": 24940 + }, + { + "epoch": 0.14833119231135217, + "grad_norm": 1.3260048627853394, + "learning_rate": 4.73345701326811e-05, + "loss": 5.6434, + "step": 24941 + }, + { + "epoch": 0.14833713959463318, + "grad_norm": 1.6175122261047363, + "learning_rate": 4.7334360263139536e-05, + "loss": 5.5073, + "step": 24942 + }, + { + "epoch": 0.14834308687791417, + "grad_norm": 1.890405535697937, + "learning_rate": 4.7334150385801276e-05, + "loss": 5.059, + "step": 24943 + }, + { + "epoch": 0.14834903416119516, + "grad_norm": 2.121887683868408, + "learning_rate": 4.733394050066641e-05, + "loss": 4.7292, + "step": 24944 + }, + { + "epoch": 0.14835498144447617, + "grad_norm": 2.054938316345215, + "learning_rate": 4.7333730607734985e-05, + "loss": 4.7551, + "step": 24945 + }, + { + "epoch": 0.14836092872775716, + "grad_norm": 1.853046178817749, + "learning_rate": 4.733352070700708e-05, + "loss": 4.7807, + "step": 24946 + }, + { + "epoch": 0.14836687601103815, + "grad_norm": 1.926611304283142, + "learning_rate": 4.733331079848279e-05, + "loss": 5.026, + "step": 24947 + }, + { + "epoch": 0.14837282329431917, + "grad_norm": 1.9281972646713257, + "learning_rate": 4.7333100882162164e-05, + "loss": 5.0131, + "step": 24948 + }, + { + "epoch": 0.14837877057760016, + "grad_norm": 2.158128499984741, + "learning_rate": 4.733289095804527e-05, + "loss": 4.8987, + "step": 24949 + }, + { + "epoch": 0.14838471786088114, + "grad_norm": 1.9640719890594482, + "learning_rate": 4.7332681026132216e-05, + "loss": 4.868, + "step": 24950 + }, + { + "epoch": 0.14839066514416216, + "grad_norm": 2.0871901512145996, + "learning_rate": 4.7332471086423045e-05, + "loss": 4.8542, + "step": 24951 + }, + { + "epoch": 0.14839661242744315, + "grad_norm": 2.2361068725585938, + "learning_rate": 4.7332261138917836e-05, + "loss": 4.9536, + "step": 24952 + }, + { + "epoch": 0.14840255971072414, + "grad_norm": 2.3177475929260254, + "learning_rate": 4.7332051183616665e-05, + "loss": 4.9228, + "step": 24953 + }, + { + "epoch": 0.14840850699400515, + "grad_norm": 2.0412709712982178, + "learning_rate": 4.733184122051961e-05, + "loss": 4.888, + "step": 24954 + }, + { + "epoch": 0.14841445427728614, + "grad_norm": 1.904599666595459, + "learning_rate": 4.733163124962674e-05, + "loss": 4.842, + "step": 24955 + }, + { + "epoch": 0.14842040156056713, + "grad_norm": 2.3957440853118896, + "learning_rate": 4.733142127093813e-05, + "loss": 4.7589, + "step": 24956 + }, + { + "epoch": 0.14842634884384814, + "grad_norm": 1.966145634651184, + "learning_rate": 4.733121128445384e-05, + "loss": 4.5783, + "step": 24957 + }, + { + "epoch": 0.14843229612712913, + "grad_norm": 2.230134963989258, + "learning_rate": 4.7331001290173966e-05, + "loss": 4.6108, + "step": 24958 + }, + { + "epoch": 0.14843824341041012, + "grad_norm": 1.9063829183578491, + "learning_rate": 4.7330791288098565e-05, + "loss": 4.765, + "step": 24959 + }, + { + "epoch": 0.1484441906936911, + "grad_norm": 2.0853664875030518, + "learning_rate": 4.7330581278227716e-05, + "loss": 6.0523, + "step": 24960 + }, + { + "epoch": 0.14845013797697212, + "grad_norm": 2.0823090076446533, + "learning_rate": 4.7330371260561494e-05, + "loss": 6.1014, + "step": 24961 + }, + { + "epoch": 0.1484560852602531, + "grad_norm": 1.7553062438964844, + "learning_rate": 4.733016123509997e-05, + "loss": 5.5322, + "step": 24962 + }, + { + "epoch": 0.1484620325435341, + "grad_norm": 1.7482306957244873, + "learning_rate": 4.7329951201843217e-05, + "loss": 5.5981, + "step": 24963 + }, + { + "epoch": 0.14846797982681512, + "grad_norm": 1.7615885734558105, + "learning_rate": 4.732974116079131e-05, + "loss": 5.447, + "step": 24964 + }, + { + "epoch": 0.1484739271100961, + "grad_norm": 1.645790696144104, + "learning_rate": 4.732953111194432e-05, + "loss": 5.4439, + "step": 24965 + }, + { + "epoch": 0.1484798743933771, + "grad_norm": 1.8099596500396729, + "learning_rate": 4.7329321055302326e-05, + "loss": 5.1291, + "step": 24966 + }, + { + "epoch": 0.1484858216766581, + "grad_norm": 1.8523690700531006, + "learning_rate": 4.732911099086539e-05, + "loss": 4.9296, + "step": 24967 + }, + { + "epoch": 0.1484917689599391, + "grad_norm": 1.7897992134094238, + "learning_rate": 4.732890091863359e-05, + "loss": 5.1764, + "step": 24968 + }, + { + "epoch": 0.14849771624322008, + "grad_norm": 1.8922818899154663, + "learning_rate": 4.7328690838607e-05, + "loss": 5.1548, + "step": 24969 + }, + { + "epoch": 0.1485036635265011, + "grad_norm": 1.9169872999191284, + "learning_rate": 4.73284807507857e-05, + "loss": 5.0837, + "step": 24970 + }, + { + "epoch": 0.1485096108097821, + "grad_norm": 1.649895429611206, + "learning_rate": 4.732827065516976e-05, + "loss": 5.2689, + "step": 24971 + }, + { + "epoch": 0.14851555809306308, + "grad_norm": 1.638153076171875, + "learning_rate": 4.732806055175925e-05, + "loss": 5.5579, + "step": 24972 + }, + { + "epoch": 0.1485215053763441, + "grad_norm": 1.6101715564727783, + "learning_rate": 4.7327850440554244e-05, + "loss": 5.5632, + "step": 24973 + }, + { + "epoch": 0.14852745265962508, + "grad_norm": 1.5299588441848755, + "learning_rate": 4.7327640321554815e-05, + "loss": 5.6415, + "step": 24974 + }, + { + "epoch": 0.14853339994290607, + "grad_norm": 1.508520245552063, + "learning_rate": 4.732743019476104e-05, + "loss": 5.1519, + "step": 24975 + }, + { + "epoch": 0.14853934722618709, + "grad_norm": 1.760366439819336, + "learning_rate": 4.732722006017299e-05, + "loss": 4.2604, + "step": 24976 + }, + { + "epoch": 0.14854529450946807, + "grad_norm": 1.6827213764190674, + "learning_rate": 4.732700991779073e-05, + "loss": 4.2258, + "step": 24977 + }, + { + "epoch": 0.14855124179274906, + "grad_norm": 1.576389193534851, + "learning_rate": 4.732679976761435e-05, + "loss": 4.2854, + "step": 24978 + }, + { + "epoch": 0.14855718907603008, + "grad_norm": 1.592392921447754, + "learning_rate": 4.732658960964391e-05, + "loss": 4.2775, + "step": 24979 + }, + { + "epoch": 0.14856313635931107, + "grad_norm": 1.6771488189697266, + "learning_rate": 4.7326379443879495e-05, + "loss": 4.3001, + "step": 24980 + }, + { + "epoch": 0.14856908364259205, + "grad_norm": 1.584578037261963, + "learning_rate": 4.732616927032117e-05, + "loss": 4.1592, + "step": 24981 + }, + { + "epoch": 0.14857503092587307, + "grad_norm": 1.7568552494049072, + "learning_rate": 4.732595908896901e-05, + "loss": 4.1514, + "step": 24982 + }, + { + "epoch": 0.14858097820915406, + "grad_norm": 1.6334513425827026, + "learning_rate": 4.732574889982309e-05, + "loss": 4.1319, + "step": 24983 + }, + { + "epoch": 0.14858692549243505, + "grad_norm": 1.7330750226974487, + "learning_rate": 4.732553870288347e-05, + "loss": 4.1036, + "step": 24984 + }, + { + "epoch": 0.14859287277571606, + "grad_norm": 1.7719300985336304, + "learning_rate": 4.732532849815024e-05, + "loss": 5.2043, + "step": 24985 + }, + { + "epoch": 0.14859882005899705, + "grad_norm": 2.9879441261291504, + "learning_rate": 4.732511828562347e-05, + "loss": 3.8784, + "step": 24986 + }, + { + "epoch": 0.14860476734227804, + "grad_norm": 1.9443185329437256, + "learning_rate": 4.732490806530324e-05, + "loss": 5.5898, + "step": 24987 + }, + { + "epoch": 0.14861071462555905, + "grad_norm": 1.800279140472412, + "learning_rate": 4.73246978371896e-05, + "loss": 5.465, + "step": 24988 + }, + { + "epoch": 0.14861666190884004, + "grad_norm": 1.9028568267822266, + "learning_rate": 4.732448760128265e-05, + "loss": 4.8782, + "step": 24989 + }, + { + "epoch": 0.14862260919212103, + "grad_norm": 2.79314923286438, + "learning_rate": 4.732427735758245e-05, + "loss": 4.5421, + "step": 24990 + }, + { + "epoch": 0.14862855647540205, + "grad_norm": 2.4686412811279297, + "learning_rate": 4.7324067106089074e-05, + "loss": 4.4616, + "step": 24991 + }, + { + "epoch": 0.14863450375868303, + "grad_norm": 1.8359897136688232, + "learning_rate": 4.73238568468026e-05, + "loss": 4.8081, + "step": 24992 + }, + { + "epoch": 0.14864045104196402, + "grad_norm": 2.3388144969940186, + "learning_rate": 4.732364657972309e-05, + "loss": 4.527, + "step": 24993 + }, + { + "epoch": 0.14864639832524504, + "grad_norm": 2.888598680496216, + "learning_rate": 4.7323436304850634e-05, + "loss": 4.1855, + "step": 24994 + }, + { + "epoch": 0.14865234560852603, + "grad_norm": 3.1639111042022705, + "learning_rate": 4.7323226022185296e-05, + "loss": 4.0865, + "step": 24995 + }, + { + "epoch": 0.14865829289180701, + "grad_norm": 2.8708033561706543, + "learning_rate": 4.732301573172715e-05, + "loss": 3.8629, + "step": 24996 + }, + { + "epoch": 0.14866424017508803, + "grad_norm": 2.667426347732544, + "learning_rate": 4.732280543347627e-05, + "loss": 4.0511, + "step": 24997 + }, + { + "epoch": 0.14867018745836902, + "grad_norm": 2.5031850337982178, + "learning_rate": 4.7322595127432725e-05, + "loss": 4.2035, + "step": 24998 + }, + { + "epoch": 0.14867613474165, + "grad_norm": 2.4356188774108887, + "learning_rate": 4.7322384813596595e-05, + "loss": 3.8996, + "step": 24999 + }, + { + "epoch": 0.14868208202493102, + "grad_norm": 2.334566354751587, + "learning_rate": 4.732217449196795e-05, + "loss": 4.2353, + "step": 25000 + }, + { + "epoch": 0.148688029308212, + "grad_norm": 2.357844591140747, + "learning_rate": 4.732196416254686e-05, + "loss": 4.3695, + "step": 25001 + }, + { + "epoch": 0.148693976591493, + "grad_norm": 2.4662234783172607, + "learning_rate": 4.7321753825333416e-05, + "loss": 3.9325, + "step": 25002 + }, + { + "epoch": 0.14869992387477401, + "grad_norm": 1.840820074081421, + "learning_rate": 4.7321543480327666e-05, + "loss": 5.1156, + "step": 25003 + }, + { + "epoch": 0.148705871158055, + "grad_norm": 1.9830942153930664, + "learning_rate": 4.73213331275297e-05, + "loss": 4.6774, + "step": 25004 + }, + { + "epoch": 0.148711818441336, + "grad_norm": 1.6185516119003296, + "learning_rate": 4.732112276693959e-05, + "loss": 4.6241, + "step": 25005 + }, + { + "epoch": 0.148717765724617, + "grad_norm": 1.8661324977874756, + "learning_rate": 4.7320912398557403e-05, + "loss": 4.6107, + "step": 25006 + }, + { + "epoch": 0.148723713007898, + "grad_norm": 1.750866174697876, + "learning_rate": 4.7320702022383226e-05, + "loss": 4.7134, + "step": 25007 + }, + { + "epoch": 0.14872966029117898, + "grad_norm": 1.7875406742095947, + "learning_rate": 4.7320491638417105e-05, + "loss": 4.6935, + "step": 25008 + }, + { + "epoch": 0.14873560757446, + "grad_norm": 1.6559946537017822, + "learning_rate": 4.732028124665915e-05, + "loss": 4.7556, + "step": 25009 + }, + { + "epoch": 0.148741554857741, + "grad_norm": 2.075535535812378, + "learning_rate": 4.7320070847109396e-05, + "loss": 4.6646, + "step": 25010 + }, + { + "epoch": 0.14874750214102198, + "grad_norm": 2.1029436588287354, + "learning_rate": 4.731986043976795e-05, + "loss": 5.0169, + "step": 25011 + }, + { + "epoch": 0.148753449424303, + "grad_norm": 1.9193171262741089, + "learning_rate": 4.7319650024634866e-05, + "loss": 5.236, + "step": 25012 + }, + { + "epoch": 0.14875939670758398, + "grad_norm": 1.6295948028564453, + "learning_rate": 4.731943960171022e-05, + "loss": 5.3538, + "step": 25013 + }, + { + "epoch": 0.14876534399086497, + "grad_norm": 1.5699677467346191, + "learning_rate": 4.73192291709941e-05, + "loss": 5.5413, + "step": 25014 + }, + { + "epoch": 0.14877129127414598, + "grad_norm": 2.8893580436706543, + "learning_rate": 4.7319018732486555e-05, + "loss": 4.5995, + "step": 25015 + }, + { + "epoch": 0.14877723855742697, + "grad_norm": 2.366352081298828, + "learning_rate": 4.731880828618768e-05, + "loss": 4.5993, + "step": 25016 + }, + { + "epoch": 0.14878318584070796, + "grad_norm": 2.1206884384155273, + "learning_rate": 4.731859783209753e-05, + "loss": 4.2081, + "step": 25017 + }, + { + "epoch": 0.14878913312398895, + "grad_norm": 2.4171648025512695, + "learning_rate": 4.73183873702162e-05, + "loss": 4.287, + "step": 25018 + }, + { + "epoch": 0.14879508040726996, + "grad_norm": 1.9675270318984985, + "learning_rate": 4.7318176900543744e-05, + "loss": 4.5648, + "step": 25019 + }, + { + "epoch": 0.14880102769055095, + "grad_norm": 1.750753402709961, + "learning_rate": 4.731796642308024e-05, + "loss": 5.6165, + "step": 25020 + }, + { + "epoch": 0.14880697497383194, + "grad_norm": 1.7137641906738281, + "learning_rate": 4.731775593782577e-05, + "loss": 5.1204, + "step": 25021 + }, + { + "epoch": 0.14881292225711296, + "grad_norm": 1.4377870559692383, + "learning_rate": 4.73175454447804e-05, + "loss": 5.4076, + "step": 25022 + }, + { + "epoch": 0.14881886954039394, + "grad_norm": 1.3382959365844727, + "learning_rate": 4.7317334943944204e-05, + "loss": 5.444, + "step": 25023 + }, + { + "epoch": 0.14882481682367493, + "grad_norm": 1.0098121166229248, + "learning_rate": 4.731712443531726e-05, + "loss": 5.2913, + "step": 25024 + }, + { + "epoch": 0.14883076410695595, + "grad_norm": 0.897736132144928, + "learning_rate": 4.7316913918899644e-05, + "loss": 5.2909, + "step": 25025 + }, + { + "epoch": 0.14883671139023694, + "grad_norm": 1.1516233682632446, + "learning_rate": 4.731670339469141e-05, + "loss": 5.3357, + "step": 25026 + }, + { + "epoch": 0.14884265867351792, + "grad_norm": 1.7736589908599854, + "learning_rate": 4.731649286269265e-05, + "loss": 5.1258, + "step": 25027 + }, + { + "epoch": 0.14884860595679894, + "grad_norm": 1.8994569778442383, + "learning_rate": 4.731628232290344e-05, + "loss": 5.5661, + "step": 25028 + }, + { + "epoch": 0.14885455324007993, + "grad_norm": 1.7552026510238647, + "learning_rate": 4.731607177532384e-05, + "loss": 5.3648, + "step": 25029 + }, + { + "epoch": 0.14886050052336092, + "grad_norm": 2.8771791458129883, + "learning_rate": 4.731586121995393e-05, + "loss": 4.6516, + "step": 25030 + }, + { + "epoch": 0.14886644780664193, + "grad_norm": 2.073287010192871, + "learning_rate": 4.731565065679379e-05, + "loss": 4.8374, + "step": 25031 + }, + { + "epoch": 0.14887239508992292, + "grad_norm": 1.6661057472229004, + "learning_rate": 4.7315440085843476e-05, + "loss": 5.0031, + "step": 25032 + }, + { + "epoch": 0.1488783423732039, + "grad_norm": 2.286806106567383, + "learning_rate": 4.7315229507103084e-05, + "loss": 4.3394, + "step": 25033 + }, + { + "epoch": 0.14888428965648492, + "grad_norm": 2.3657538890838623, + "learning_rate": 4.7315018920572666e-05, + "loss": 4.4455, + "step": 25034 + }, + { + "epoch": 0.1488902369397659, + "grad_norm": 2.1653788089752197, + "learning_rate": 4.7314808326252316e-05, + "loss": 4.5676, + "step": 25035 + }, + { + "epoch": 0.1488961842230469, + "grad_norm": 1.853837251663208, + "learning_rate": 4.731459772414208e-05, + "loss": 4.4169, + "step": 25036 + }, + { + "epoch": 0.14890213150632792, + "grad_norm": 2.1202454566955566, + "learning_rate": 4.7314387114242064e-05, + "loss": 4.4917, + "step": 25037 + }, + { + "epoch": 0.1489080787896089, + "grad_norm": 2.1203508377075195, + "learning_rate": 4.731417649655232e-05, + "loss": 4.2212, + "step": 25038 + }, + { + "epoch": 0.1489140260728899, + "grad_norm": 2.220571994781494, + "learning_rate": 4.731396587107293e-05, + "loss": 4.3678, + "step": 25039 + }, + { + "epoch": 0.1489199733561709, + "grad_norm": 1.9346973896026611, + "learning_rate": 4.731375523780397e-05, + "loss": 3.9189, + "step": 25040 + }, + { + "epoch": 0.1489259206394519, + "grad_norm": 2.1453700065612793, + "learning_rate": 4.731354459674549e-05, + "loss": 5.4543, + "step": 25041 + }, + { + "epoch": 0.14893186792273289, + "grad_norm": 2.7248880863189697, + "learning_rate": 4.73133339478976e-05, + "loss": 4.419, + "step": 25042 + }, + { + "epoch": 0.1489378152060139, + "grad_norm": 2.675060510635376, + "learning_rate": 4.731312329126035e-05, + "loss": 4.2858, + "step": 25043 + }, + { + "epoch": 0.1489437624892949, + "grad_norm": 2.5627496242523193, + "learning_rate": 4.731291262683382e-05, + "loss": 4.3065, + "step": 25044 + }, + { + "epoch": 0.14894970977257588, + "grad_norm": 2.238367795944214, + "learning_rate": 4.7312701954618086e-05, + "loss": 4.1853, + "step": 25045 + }, + { + "epoch": 0.1489556570558569, + "grad_norm": 2.144697427749634, + "learning_rate": 4.731249127461321e-05, + "loss": 4.5655, + "step": 25046 + }, + { + "epoch": 0.14896160433913788, + "grad_norm": 1.676389455795288, + "learning_rate": 4.731228058681928e-05, + "loss": 4.8332, + "step": 25047 + }, + { + "epoch": 0.14896755162241887, + "grad_norm": 2.7558321952819824, + "learning_rate": 4.7312069891236364e-05, + "loss": 3.5354, + "step": 25048 + }, + { + "epoch": 0.14897349890569989, + "grad_norm": 1.8224084377288818, + "learning_rate": 4.731185918786453e-05, + "loss": 4.8105, + "step": 25049 + }, + { + "epoch": 0.14897944618898087, + "grad_norm": 1.8380038738250732, + "learning_rate": 4.731164847670386e-05, + "loss": 4.8584, + "step": 25050 + }, + { + "epoch": 0.14898539347226186, + "grad_norm": 1.6260594129562378, + "learning_rate": 4.7311437757754425e-05, + "loss": 4.5548, + "step": 25051 + }, + { + "epoch": 0.14899134075554288, + "grad_norm": 1.490978717803955, + "learning_rate": 4.731122703101629e-05, + "loss": 4.7144, + "step": 25052 + }, + { + "epoch": 0.14899728803882387, + "grad_norm": 2.054363489151001, + "learning_rate": 4.731101629648954e-05, + "loss": 4.9561, + "step": 25053 + }, + { + "epoch": 0.14900323532210485, + "grad_norm": 2.431696891784668, + "learning_rate": 4.7310805554174255e-05, + "loss": 4.6347, + "step": 25054 + }, + { + "epoch": 0.14900918260538587, + "grad_norm": 2.9854423999786377, + "learning_rate": 4.7310594804070485e-05, + "loss": 4.3526, + "step": 25055 + }, + { + "epoch": 0.14901512988866686, + "grad_norm": 2.859827756881714, + "learning_rate": 4.731038404617832e-05, + "loss": 4.3427, + "step": 25056 + }, + { + "epoch": 0.14902107717194785, + "grad_norm": 2.866624593734741, + "learning_rate": 4.731017328049784e-05, + "loss": 4.4747, + "step": 25057 + }, + { + "epoch": 0.14902702445522886, + "grad_norm": 2.0833802223205566, + "learning_rate": 4.730996250702909e-05, + "loss": 4.1979, + "step": 25058 + }, + { + "epoch": 0.14903297173850985, + "grad_norm": 2.095679521560669, + "learning_rate": 4.7309751725772176e-05, + "loss": 4.2466, + "step": 25059 + }, + { + "epoch": 0.14903891902179084, + "grad_norm": 2.3466885089874268, + "learning_rate": 4.730954093672716e-05, + "loss": 4.3074, + "step": 25060 + }, + { + "epoch": 0.14904486630507185, + "grad_norm": 2.1188759803771973, + "learning_rate": 4.730933013989411e-05, + "loss": 4.2482, + "step": 25061 + }, + { + "epoch": 0.14905081358835284, + "grad_norm": 2.1638059616088867, + "learning_rate": 4.73091193352731e-05, + "loss": 4.1506, + "step": 25062 + }, + { + "epoch": 0.14905676087163383, + "grad_norm": 2.035240650177002, + "learning_rate": 4.7308908522864215e-05, + "loss": 4.4322, + "step": 25063 + }, + { + "epoch": 0.14906270815491485, + "grad_norm": 2.375912666320801, + "learning_rate": 4.730869770266751e-05, + "loss": 5.3206, + "step": 25064 + }, + { + "epoch": 0.14906865543819584, + "grad_norm": 1.8899742364883423, + "learning_rate": 4.7308486874683075e-05, + "loss": 5.1336, + "step": 25065 + }, + { + "epoch": 0.14907460272147682, + "grad_norm": 1.7068132162094116, + "learning_rate": 4.730827603891098e-05, + "loss": 5.0085, + "step": 25066 + }, + { + "epoch": 0.14908055000475784, + "grad_norm": 1.737470269203186, + "learning_rate": 4.730806519535129e-05, + "loss": 5.9056, + "step": 25067 + }, + { + "epoch": 0.14908649728803883, + "grad_norm": 1.251652717590332, + "learning_rate": 4.730785434400409e-05, + "loss": 5.3772, + "step": 25068 + }, + { + "epoch": 0.14909244457131982, + "grad_norm": 1.2134002447128296, + "learning_rate": 4.730764348486945e-05, + "loss": 5.4202, + "step": 25069 + }, + { + "epoch": 0.14909839185460083, + "grad_norm": 1.028356671333313, + "learning_rate": 4.730743261794743e-05, + "loss": 5.4883, + "step": 25070 + }, + { + "epoch": 0.14910433913788182, + "grad_norm": 1.3931416273117065, + "learning_rate": 4.730722174323813e-05, + "loss": 5.3274, + "step": 25071 + }, + { + "epoch": 0.1491102864211628, + "grad_norm": 1.2539725303649902, + "learning_rate": 4.7307010860741607e-05, + "loss": 5.2628, + "step": 25072 + }, + { + "epoch": 0.14911623370444382, + "grad_norm": 1.2422703504562378, + "learning_rate": 4.730679997045793e-05, + "loss": 5.1639, + "step": 25073 + }, + { + "epoch": 0.1491221809877248, + "grad_norm": 1.4616423845291138, + "learning_rate": 4.730658907238719e-05, + "loss": 5.0979, + "step": 25074 + }, + { + "epoch": 0.1491281282710058, + "grad_norm": 1.2968275547027588, + "learning_rate": 4.730637816652944e-05, + "loss": 5.0785, + "step": 25075 + }, + { + "epoch": 0.1491340755542868, + "grad_norm": 1.304254412651062, + "learning_rate": 4.730616725288477e-05, + "loss": 5.4885, + "step": 25076 + }, + { + "epoch": 0.1491400228375678, + "grad_norm": 2.3498852252960205, + "learning_rate": 4.730595633145324e-05, + "loss": 5.3064, + "step": 25077 + }, + { + "epoch": 0.1491459701208488, + "grad_norm": 1.7321240901947021, + "learning_rate": 4.730574540223493e-05, + "loss": 5.1844, + "step": 25078 + }, + { + "epoch": 0.14915191740412978, + "grad_norm": 1.903198480606079, + "learning_rate": 4.730553446522993e-05, + "loss": 5.5481, + "step": 25079 + }, + { + "epoch": 0.1491578646874108, + "grad_norm": 1.659658670425415, + "learning_rate": 4.7305323520438285e-05, + "loss": 5.3265, + "step": 25080 + }, + { + "epoch": 0.14916381197069178, + "grad_norm": 1.4510316848754883, + "learning_rate": 4.7305112567860085e-05, + "loss": 5.2607, + "step": 25081 + }, + { + "epoch": 0.14916975925397277, + "grad_norm": 1.5634890794754028, + "learning_rate": 4.73049016074954e-05, + "loss": 5.1961, + "step": 25082 + }, + { + "epoch": 0.1491757065372538, + "grad_norm": 1.5400700569152832, + "learning_rate": 4.730469063934431e-05, + "loss": 5.6441, + "step": 25083 + }, + { + "epoch": 0.14918165382053478, + "grad_norm": 1.814353108406067, + "learning_rate": 4.730447966340688e-05, + "loss": 5.1855, + "step": 25084 + }, + { + "epoch": 0.14918760110381576, + "grad_norm": 2.3644423484802246, + "learning_rate": 4.7304268679683184e-05, + "loss": 4.5312, + "step": 25085 + }, + { + "epoch": 0.14919354838709678, + "grad_norm": 2.6960058212280273, + "learning_rate": 4.73040576881733e-05, + "loss": 4.3128, + "step": 25086 + }, + { + "epoch": 0.14919949567037777, + "grad_norm": 2.50162410736084, + "learning_rate": 4.73038466888773e-05, + "loss": 4.3356, + "step": 25087 + }, + { + "epoch": 0.14920544295365876, + "grad_norm": 1.938988208770752, + "learning_rate": 4.730363568179526e-05, + "loss": 4.6391, + "step": 25088 + }, + { + "epoch": 0.14921139023693977, + "grad_norm": 2.0165152549743652, + "learning_rate": 4.730342466692725e-05, + "loss": 5.3267, + "step": 25089 + }, + { + "epoch": 0.14921733752022076, + "grad_norm": 2.3626153469085693, + "learning_rate": 4.7303213644273345e-05, + "loss": 5.2551, + "step": 25090 + }, + { + "epoch": 0.14922328480350175, + "grad_norm": 2.1070075035095215, + "learning_rate": 4.730300261383361e-05, + "loss": 5.2231, + "step": 25091 + }, + { + "epoch": 0.14922923208678276, + "grad_norm": 1.6806228160858154, + "learning_rate": 4.7302791575608144e-05, + "loss": 5.5844, + "step": 25092 + }, + { + "epoch": 0.14923517937006375, + "grad_norm": 2.149728298187256, + "learning_rate": 4.7302580529596985e-05, + "loss": 4.7185, + "step": 25093 + }, + { + "epoch": 0.14924112665334474, + "grad_norm": 1.93796968460083, + "learning_rate": 4.730236947580024e-05, + "loss": 4.7622, + "step": 25094 + }, + { + "epoch": 0.14924707393662576, + "grad_norm": 1.7360033988952637, + "learning_rate": 4.7302158414217964e-05, + "loss": 4.7068, + "step": 25095 + }, + { + "epoch": 0.14925302121990675, + "grad_norm": 1.712073564529419, + "learning_rate": 4.730194734485023e-05, + "loss": 4.8146, + "step": 25096 + }, + { + "epoch": 0.14925896850318773, + "grad_norm": 1.789083480834961, + "learning_rate": 4.730173626769712e-05, + "loss": 4.774, + "step": 25097 + }, + { + "epoch": 0.14926491578646875, + "grad_norm": 1.9072470664978027, + "learning_rate": 4.730152518275871e-05, + "loss": 4.9099, + "step": 25098 + }, + { + "epoch": 0.14927086306974974, + "grad_norm": 1.7209197282791138, + "learning_rate": 4.730131409003506e-05, + "loss": 4.7141, + "step": 25099 + }, + { + "epoch": 0.14927681035303073, + "grad_norm": 1.8528800010681152, + "learning_rate": 4.730110298952625e-05, + "loss": 4.9741, + "step": 25100 + }, + { + "epoch": 0.14928275763631174, + "grad_norm": 1.9865680932998657, + "learning_rate": 4.7300891881232365e-05, + "loss": 4.9079, + "step": 25101 + }, + { + "epoch": 0.14928870491959273, + "grad_norm": 2.1327319145202637, + "learning_rate": 4.730068076515346e-05, + "loss": 4.9929, + "step": 25102 + }, + { + "epoch": 0.14929465220287372, + "grad_norm": 1.856972336769104, + "learning_rate": 4.730046964128962e-05, + "loss": 4.935, + "step": 25103 + }, + { + "epoch": 0.14930059948615473, + "grad_norm": 1.9982047080993652, + "learning_rate": 4.7300258509640924e-05, + "loss": 5.1254, + "step": 25104 + }, + { + "epoch": 0.14930654676943572, + "grad_norm": 1.866350531578064, + "learning_rate": 4.730004737020743e-05, + "loss": 5.0198, + "step": 25105 + }, + { + "epoch": 0.1493124940527167, + "grad_norm": 1.8669421672821045, + "learning_rate": 4.729983622298922e-05, + "loss": 4.817, + "step": 25106 + }, + { + "epoch": 0.14931844133599773, + "grad_norm": 2.3156704902648926, + "learning_rate": 4.7299625067986366e-05, + "loss": 4.9341, + "step": 25107 + }, + { + "epoch": 0.14932438861927871, + "grad_norm": 2.304932117462158, + "learning_rate": 4.7299413905198956e-05, + "loss": 4.908, + "step": 25108 + }, + { + "epoch": 0.1493303359025597, + "grad_norm": 2.0287182331085205, + "learning_rate": 4.7299202734627035e-05, + "loss": 4.9244, + "step": 25109 + }, + { + "epoch": 0.14933628318584072, + "grad_norm": 2.554980754852295, + "learning_rate": 4.72989915562707e-05, + "loss": 4.7163, + "step": 25110 + }, + { + "epoch": 0.1493422304691217, + "grad_norm": 2.76092791557312, + "learning_rate": 4.7298780370130014e-05, + "loss": 5.293, + "step": 25111 + }, + { + "epoch": 0.1493481777524027, + "grad_norm": 2.203293561935425, + "learning_rate": 4.729856917620506e-05, + "loss": 4.8891, + "step": 25112 + }, + { + "epoch": 0.1493541250356837, + "grad_norm": 2.2550253868103027, + "learning_rate": 4.7298357974495905e-05, + "loss": 5.1578, + "step": 25113 + }, + { + "epoch": 0.1493600723189647, + "grad_norm": 2.41914963722229, + "learning_rate": 4.7298146765002624e-05, + "loss": 5.0363, + "step": 25114 + }, + { + "epoch": 0.1493660196022457, + "grad_norm": 2.058586359024048, + "learning_rate": 4.729793554772528e-05, + "loss": 4.9537, + "step": 25115 + }, + { + "epoch": 0.1493719668855267, + "grad_norm": 2.3880207538604736, + "learning_rate": 4.729772432266397e-05, + "loss": 4.9701, + "step": 25116 + }, + { + "epoch": 0.1493779141688077, + "grad_norm": 2.012542247772217, + "learning_rate": 4.7297513089818745e-05, + "loss": 5.0596, + "step": 25117 + }, + { + "epoch": 0.14938386145208868, + "grad_norm": 1.9091664552688599, + "learning_rate": 4.72973018491897e-05, + "loss": 5.0199, + "step": 25118 + }, + { + "epoch": 0.1493898087353697, + "grad_norm": 1.9325292110443115, + "learning_rate": 4.7297090600776886e-05, + "loss": 4.9442, + "step": 25119 + }, + { + "epoch": 0.14939575601865068, + "grad_norm": 2.106926918029785, + "learning_rate": 4.729687934458039e-05, + "loss": 4.8628, + "step": 25120 + }, + { + "epoch": 0.14940170330193167, + "grad_norm": 1.7365446090698242, + "learning_rate": 4.729666808060029e-05, + "loss": 4.8492, + "step": 25121 + }, + { + "epoch": 0.1494076505852127, + "grad_norm": 1.9125512838363647, + "learning_rate": 4.729645680883665e-05, + "loss": 4.9389, + "step": 25122 + }, + { + "epoch": 0.14941359786849367, + "grad_norm": 2.0423247814178467, + "learning_rate": 4.729624552928954e-05, + "loss": 4.8626, + "step": 25123 + }, + { + "epoch": 0.14941954515177466, + "grad_norm": 1.9502712488174438, + "learning_rate": 4.729603424195905e-05, + "loss": 5.0237, + "step": 25124 + }, + { + "epoch": 0.14942549243505568, + "grad_norm": 2.0014281272888184, + "learning_rate": 4.7295822946845245e-05, + "loss": 4.9913, + "step": 25125 + }, + { + "epoch": 0.14943143971833667, + "grad_norm": 1.9854202270507812, + "learning_rate": 4.7295611643948204e-05, + "loss": 4.9394, + "step": 25126 + }, + { + "epoch": 0.14943738700161766, + "grad_norm": 1.7897859811782837, + "learning_rate": 4.729540033326798e-05, + "loss": 4.9434, + "step": 25127 + }, + { + "epoch": 0.14944333428489867, + "grad_norm": 2.092635154724121, + "learning_rate": 4.7295189014804676e-05, + "loss": 4.9032, + "step": 25128 + }, + { + "epoch": 0.14944928156817966, + "grad_norm": 1.9637115001678467, + "learning_rate": 4.729497768855834e-05, + "loss": 4.7775, + "step": 25129 + }, + { + "epoch": 0.14945522885146065, + "grad_norm": 1.8016657829284668, + "learning_rate": 4.729476635452906e-05, + "loss": 4.791, + "step": 25130 + }, + { + "epoch": 0.14946117613474166, + "grad_norm": 2.326096534729004, + "learning_rate": 4.7294555012716915e-05, + "loss": 5.3299, + "step": 25131 + }, + { + "epoch": 0.14946712341802265, + "grad_norm": 2.1310572624206543, + "learning_rate": 4.7294343663121965e-05, + "loss": 5.1919, + "step": 25132 + }, + { + "epoch": 0.14947307070130364, + "grad_norm": 2.3155853748321533, + "learning_rate": 4.72941323057443e-05, + "loss": 5.0858, + "step": 25133 + }, + { + "epoch": 0.14947901798458466, + "grad_norm": 2.049995183944702, + "learning_rate": 4.729392094058397e-05, + "loss": 5.065, + "step": 25134 + }, + { + "epoch": 0.14948496526786564, + "grad_norm": 1.8955172300338745, + "learning_rate": 4.729370956764107e-05, + "loss": 5.1361, + "step": 25135 + }, + { + "epoch": 0.14949091255114663, + "grad_norm": 3.226020336151123, + "learning_rate": 4.729349818691567e-05, + "loss": 4.7323, + "step": 25136 + }, + { + "epoch": 0.14949685983442762, + "grad_norm": 3.1648058891296387, + "learning_rate": 4.7293286798407833e-05, + "loss": 4.6663, + "step": 25137 + }, + { + "epoch": 0.14950280711770864, + "grad_norm": 2.2341058254241943, + "learning_rate": 4.729307540211764e-05, + "loss": 4.584, + "step": 25138 + }, + { + "epoch": 0.14950875440098962, + "grad_norm": 2.088019609451294, + "learning_rate": 4.729286399804517e-05, + "loss": 4.5618, + "step": 25139 + }, + { + "epoch": 0.1495147016842706, + "grad_norm": 1.8777929544448853, + "learning_rate": 4.729265258619048e-05, + "loss": 5.0011, + "step": 25140 + }, + { + "epoch": 0.14952064896755163, + "grad_norm": 2.080986261367798, + "learning_rate": 4.729244116655366e-05, + "loss": 5.6192, + "step": 25141 + }, + { + "epoch": 0.14952659625083262, + "grad_norm": 1.9895329475402832, + "learning_rate": 4.729222973913479e-05, + "loss": 5.8569, + "step": 25142 + }, + { + "epoch": 0.1495325435341136, + "grad_norm": 2.0990312099456787, + "learning_rate": 4.7292018303933924e-05, + "loss": 5.772, + "step": 25143 + }, + { + "epoch": 0.14953849081739462, + "grad_norm": 1.8530125617980957, + "learning_rate": 4.7291806860951145e-05, + "loss": 5.9042, + "step": 25144 + }, + { + "epoch": 0.1495444381006756, + "grad_norm": 1.7631386518478394, + "learning_rate": 4.7291595410186526e-05, + "loss": 5.7611, + "step": 25145 + }, + { + "epoch": 0.1495503853839566, + "grad_norm": 1.4668217897415161, + "learning_rate": 4.729138395164015e-05, + "loss": 5.4997, + "step": 25146 + }, + { + "epoch": 0.1495563326672376, + "grad_norm": 1.2580885887145996, + "learning_rate": 4.729117248531206e-05, + "loss": 5.5554, + "step": 25147 + }, + { + "epoch": 0.1495622799505186, + "grad_norm": 1.612502932548523, + "learning_rate": 4.7290961011202375e-05, + "loss": 5.0982, + "step": 25148 + }, + { + "epoch": 0.1495682272337996, + "grad_norm": 1.6753286123275757, + "learning_rate": 4.729074952931114e-05, + "loss": 4.9553, + "step": 25149 + }, + { + "epoch": 0.1495741745170806, + "grad_norm": 1.530179738998413, + "learning_rate": 4.729053803963843e-05, + "loss": 4.9314, + "step": 25150 + }, + { + "epoch": 0.1495801218003616, + "grad_norm": 1.5077494382858276, + "learning_rate": 4.729032654218433e-05, + "loss": 5.4957, + "step": 25151 + }, + { + "epoch": 0.14958606908364258, + "grad_norm": 1.6995402574539185, + "learning_rate": 4.72901150369489e-05, + "loss": 5.7406, + "step": 25152 + }, + { + "epoch": 0.1495920163669236, + "grad_norm": 1.4611583948135376, + "learning_rate": 4.728990352393222e-05, + "loss": 5.5632, + "step": 25153 + }, + { + "epoch": 0.14959796365020459, + "grad_norm": 1.775568962097168, + "learning_rate": 4.728969200313437e-05, + "loss": 5.1666, + "step": 25154 + }, + { + "epoch": 0.14960391093348557, + "grad_norm": 1.6890829801559448, + "learning_rate": 4.728948047455541e-05, + "loss": 5.1776, + "step": 25155 + }, + { + "epoch": 0.1496098582167666, + "grad_norm": 1.7455476522445679, + "learning_rate": 4.728926893819544e-05, + "loss": 5.0308, + "step": 25156 + }, + { + "epoch": 0.14961580550004758, + "grad_norm": 2.0798380374908447, + "learning_rate": 4.72890573940545e-05, + "loss": 4.8164, + "step": 25157 + }, + { + "epoch": 0.14962175278332857, + "grad_norm": 2.0280489921569824, + "learning_rate": 4.728884584213269e-05, + "loss": 4.7693, + "step": 25158 + }, + { + "epoch": 0.14962770006660958, + "grad_norm": 1.9629135131835938, + "learning_rate": 4.728863428243008e-05, + "loss": 4.9072, + "step": 25159 + }, + { + "epoch": 0.14963364734989057, + "grad_norm": 2.1143929958343506, + "learning_rate": 4.7288422714946724e-05, + "loss": 4.6828, + "step": 25160 + }, + { + "epoch": 0.14963959463317156, + "grad_norm": 1.9618384838104248, + "learning_rate": 4.7288211139682715e-05, + "loss": 5.0383, + "step": 25161 + }, + { + "epoch": 0.14964554191645257, + "grad_norm": 1.8829975128173828, + "learning_rate": 4.728799955663812e-05, + "loss": 5.5072, + "step": 25162 + }, + { + "epoch": 0.14965148919973356, + "grad_norm": 1.5670249462127686, + "learning_rate": 4.728778796581302e-05, + "loss": 5.1815, + "step": 25163 + }, + { + "epoch": 0.14965743648301455, + "grad_norm": 2.0932981967926025, + "learning_rate": 4.728757636720748e-05, + "loss": 5.0871, + "step": 25164 + }, + { + "epoch": 0.14966338376629557, + "grad_norm": 1.5827875137329102, + "learning_rate": 4.728736476082158e-05, + "loss": 5.0983, + "step": 25165 + }, + { + "epoch": 0.14966933104957655, + "grad_norm": 1.7353198528289795, + "learning_rate": 4.728715314665538e-05, + "loss": 4.8113, + "step": 25166 + }, + { + "epoch": 0.14967527833285754, + "grad_norm": 1.6395387649536133, + "learning_rate": 4.728694152470898e-05, + "loss": 4.9403, + "step": 25167 + }, + { + "epoch": 0.14968122561613856, + "grad_norm": 1.8546936511993408, + "learning_rate": 4.7286729894982434e-05, + "loss": 4.9092, + "step": 25168 + }, + { + "epoch": 0.14968717289941955, + "grad_norm": 1.5432714223861694, + "learning_rate": 4.728651825747582e-05, + "loss": 4.8257, + "step": 25169 + }, + { + "epoch": 0.14969312018270053, + "grad_norm": 1.6309102773666382, + "learning_rate": 4.728630661218921e-05, + "loss": 5.5829, + "step": 25170 + }, + { + "epoch": 0.14969906746598155, + "grad_norm": 1.8060203790664673, + "learning_rate": 4.7286094959122685e-05, + "loss": 5.3099, + "step": 25171 + }, + { + "epoch": 0.14970501474926254, + "grad_norm": 1.8817297220230103, + "learning_rate": 4.728588329827631e-05, + "loss": 5.5393, + "step": 25172 + }, + { + "epoch": 0.14971096203254353, + "grad_norm": 1.806970477104187, + "learning_rate": 4.728567162965017e-05, + "loss": 5.8567, + "step": 25173 + }, + { + "epoch": 0.14971690931582454, + "grad_norm": 1.6101081371307373, + "learning_rate": 4.728545995324433e-05, + "loss": 5.5389, + "step": 25174 + }, + { + "epoch": 0.14972285659910553, + "grad_norm": 1.5525349378585815, + "learning_rate": 4.7285248269058854e-05, + "loss": 5.6075, + "step": 25175 + }, + { + "epoch": 0.14972880388238652, + "grad_norm": 1.543853998184204, + "learning_rate": 4.7285036577093844e-05, + "loss": 5.5287, + "step": 25176 + }, + { + "epoch": 0.14973475116566753, + "grad_norm": 1.5811434984207153, + "learning_rate": 4.728482487734935e-05, + "loss": 5.5584, + "step": 25177 + }, + { + "epoch": 0.14974069844894852, + "grad_norm": 1.2957634925842285, + "learning_rate": 4.728461316982546e-05, + "loss": 5.5264, + "step": 25178 + }, + { + "epoch": 0.1497466457322295, + "grad_norm": 1.3600691556930542, + "learning_rate": 4.728440145452224e-05, + "loss": 5.5781, + "step": 25179 + }, + { + "epoch": 0.14975259301551053, + "grad_norm": 1.3423492908477783, + "learning_rate": 4.7284189731439764e-05, + "loss": 5.535, + "step": 25180 + }, + { + "epoch": 0.14975854029879151, + "grad_norm": 1.5586212873458862, + "learning_rate": 4.7283978000578107e-05, + "loss": 5.0746, + "step": 25181 + }, + { + "epoch": 0.1497644875820725, + "grad_norm": 1.8833614587783813, + "learning_rate": 4.7283766261937346e-05, + "loss": 4.6121, + "step": 25182 + }, + { + "epoch": 0.14977043486535352, + "grad_norm": 1.8890469074249268, + "learning_rate": 4.728355451551755e-05, + "loss": 4.5572, + "step": 25183 + }, + { + "epoch": 0.1497763821486345, + "grad_norm": 1.7143722772598267, + "learning_rate": 4.728334276131879e-05, + "loss": 4.5289, + "step": 25184 + }, + { + "epoch": 0.1497823294319155, + "grad_norm": 1.766708493232727, + "learning_rate": 4.728313099934115e-05, + "loss": 4.6957, + "step": 25185 + }, + { + "epoch": 0.1497882767151965, + "grad_norm": 1.8504046201705933, + "learning_rate": 4.72829192295847e-05, + "loss": 4.8764, + "step": 25186 + }, + { + "epoch": 0.1497942239984775, + "grad_norm": 2.0711238384246826, + "learning_rate": 4.728270745204951e-05, + "loss": 4.9157, + "step": 25187 + }, + { + "epoch": 0.1498001712817585, + "grad_norm": 2.0366387367248535, + "learning_rate": 4.728249566673567e-05, + "loss": 4.9295, + "step": 25188 + }, + { + "epoch": 0.1498061185650395, + "grad_norm": 1.7883682250976562, + "learning_rate": 4.728228387364323e-05, + "loss": 5.1173, + "step": 25189 + }, + { + "epoch": 0.1498120658483205, + "grad_norm": 1.8308504819869995, + "learning_rate": 4.7282072072772276e-05, + "loss": 5.0593, + "step": 25190 + }, + { + "epoch": 0.14981801313160148, + "grad_norm": 1.5662436485290527, + "learning_rate": 4.728186026412288e-05, + "loss": 5.1499, + "step": 25191 + }, + { + "epoch": 0.1498239604148825, + "grad_norm": 1.8079571723937988, + "learning_rate": 4.728164844769511e-05, + "loss": 4.948, + "step": 25192 + }, + { + "epoch": 0.14982990769816348, + "grad_norm": 1.681217908859253, + "learning_rate": 4.728143662348906e-05, + "loss": 5.3433, + "step": 25193 + }, + { + "epoch": 0.14983585498144447, + "grad_norm": 1.5585112571716309, + "learning_rate": 4.7281224791504784e-05, + "loss": 5.6366, + "step": 25194 + }, + { + "epoch": 0.14984180226472546, + "grad_norm": 1.8676329851150513, + "learning_rate": 4.7281012951742364e-05, + "loss": 5.1824, + "step": 25195 + }, + { + "epoch": 0.14984774954800648, + "grad_norm": 2.227149248123169, + "learning_rate": 4.728080110420188e-05, + "loss": 5.0203, + "step": 25196 + }, + { + "epoch": 0.14985369683128746, + "grad_norm": 1.6362202167510986, + "learning_rate": 4.728058924888339e-05, + "loss": 5.1942, + "step": 25197 + }, + { + "epoch": 0.14985964411456845, + "grad_norm": 1.9886643886566162, + "learning_rate": 4.7280377385786976e-05, + "loss": 5.4607, + "step": 25198 + }, + { + "epoch": 0.14986559139784947, + "grad_norm": 1.8965426683425903, + "learning_rate": 4.728016551491271e-05, + "loss": 5.4426, + "step": 25199 + }, + { + "epoch": 0.14987153868113046, + "grad_norm": 1.7106379270553589, + "learning_rate": 4.7279953636260677e-05, + "loss": 5.2894, + "step": 25200 + }, + { + "epoch": 0.14987748596441144, + "grad_norm": 1.5771503448486328, + "learning_rate": 4.727974174983093e-05, + "loss": 5.7972, + "step": 25201 + }, + { + "epoch": 0.14988343324769246, + "grad_norm": 1.4394875764846802, + "learning_rate": 4.727952985562357e-05, + "loss": 5.4622, + "step": 25202 + }, + { + "epoch": 0.14988938053097345, + "grad_norm": 1.421237826347351, + "learning_rate": 4.727931795363864e-05, + "loss": 5.5927, + "step": 25203 + }, + { + "epoch": 0.14989532781425444, + "grad_norm": 1.4579883813858032, + "learning_rate": 4.727910604387624e-05, + "loss": 5.6534, + "step": 25204 + }, + { + "epoch": 0.14990127509753545, + "grad_norm": 1.5861623287200928, + "learning_rate": 4.727889412633644e-05, + "loss": 5.423, + "step": 25205 + }, + { + "epoch": 0.14990722238081644, + "grad_norm": 1.1634724140167236, + "learning_rate": 4.72786822010193e-05, + "loss": 5.5339, + "step": 25206 + }, + { + "epoch": 0.14991316966409743, + "grad_norm": 1.3486993312835693, + "learning_rate": 4.72784702679249e-05, + "loss": 5.572, + "step": 25207 + }, + { + "epoch": 0.14991911694737844, + "grad_norm": 1.1783596277236938, + "learning_rate": 4.727825832705333e-05, + "loss": 5.4949, + "step": 25208 + }, + { + "epoch": 0.14992506423065943, + "grad_norm": 1.405774712562561, + "learning_rate": 4.727804637840464e-05, + "loss": 5.4044, + "step": 25209 + }, + { + "epoch": 0.14993101151394042, + "grad_norm": 1.4211558103561401, + "learning_rate": 4.727783442197891e-05, + "loss": 5.3778, + "step": 25210 + }, + { + "epoch": 0.14993695879722144, + "grad_norm": 1.572511076927185, + "learning_rate": 4.727762245777623e-05, + "loss": 5.4308, + "step": 25211 + }, + { + "epoch": 0.14994290608050242, + "grad_norm": 1.4699571132659912, + "learning_rate": 4.727741048579665e-05, + "loss": 5.3195, + "step": 25212 + }, + { + "epoch": 0.1499488533637834, + "grad_norm": 1.231878399848938, + "learning_rate": 4.727719850604026e-05, + "loss": 5.2663, + "step": 25213 + }, + { + "epoch": 0.14995480064706443, + "grad_norm": 1.3779250383377075, + "learning_rate": 4.7276986518507136e-05, + "loss": 5.1489, + "step": 25214 + }, + { + "epoch": 0.14996074793034542, + "grad_norm": 2.058643341064453, + "learning_rate": 4.7276774523197334e-05, + "loss": 5.4943, + "step": 25215 + }, + { + "epoch": 0.1499666952136264, + "grad_norm": 2.3679542541503906, + "learning_rate": 4.727656252011095e-05, + "loss": 4.688, + "step": 25216 + }, + { + "epoch": 0.14997264249690742, + "grad_norm": 2.2339799404144287, + "learning_rate": 4.727635050924805e-05, + "loss": 5.1016, + "step": 25217 + }, + { + "epoch": 0.1499785897801884, + "grad_norm": 1.536407709121704, + "learning_rate": 4.72761384906087e-05, + "loss": 5.2741, + "step": 25218 + }, + { + "epoch": 0.1499845370634694, + "grad_norm": 1.6192244291305542, + "learning_rate": 4.7275926464192985e-05, + "loss": 5.0808, + "step": 25219 + }, + { + "epoch": 0.1499904843467504, + "grad_norm": 1.6183874607086182, + "learning_rate": 4.727571443000097e-05, + "loss": 5.4735, + "step": 25220 + }, + { + "epoch": 0.1499964316300314, + "grad_norm": 1.5945466756820679, + "learning_rate": 4.7275502388032736e-05, + "loss": 5.7213, + "step": 25221 + }, + { + "epoch": 0.1500023789133124, + "grad_norm": 1.455883264541626, + "learning_rate": 4.727529033828835e-05, + "loss": 5.588, + "step": 25222 + }, + { + "epoch": 0.1500083261965934, + "grad_norm": 1.6111440658569336, + "learning_rate": 4.727507828076789e-05, + "loss": 5.0907, + "step": 25223 + }, + { + "epoch": 0.1500142734798744, + "grad_norm": 1.6382368803024292, + "learning_rate": 4.727486621547144e-05, + "loss": 5.2271, + "step": 25224 + }, + { + "epoch": 0.15002022076315538, + "grad_norm": 1.637136697769165, + "learning_rate": 4.7274654142399056e-05, + "loss": 4.9102, + "step": 25225 + }, + { + "epoch": 0.1500261680464364, + "grad_norm": 1.8395768404006958, + "learning_rate": 4.727444206155082e-05, + "loss": 5.0519, + "step": 25226 + }, + { + "epoch": 0.15003211532971739, + "grad_norm": 1.7471513748168945, + "learning_rate": 4.727422997292681e-05, + "loss": 5.2439, + "step": 25227 + }, + { + "epoch": 0.15003806261299837, + "grad_norm": 2.3117516040802, + "learning_rate": 4.72740178765271e-05, + "loss": 5.1935, + "step": 25228 + }, + { + "epoch": 0.1500440098962794, + "grad_norm": 2.0054478645324707, + "learning_rate": 4.727380577235175e-05, + "loss": 5.2919, + "step": 25229 + }, + { + "epoch": 0.15004995717956038, + "grad_norm": 1.9058947563171387, + "learning_rate": 4.727359366040085e-05, + "loss": 4.8624, + "step": 25230 + }, + { + "epoch": 0.15005590446284137, + "grad_norm": 1.746030569076538, + "learning_rate": 4.727338154067447e-05, + "loss": 4.9731, + "step": 25231 + }, + { + "epoch": 0.15006185174612238, + "grad_norm": 1.693912386894226, + "learning_rate": 4.727316941317268e-05, + "loss": 4.948, + "step": 25232 + }, + { + "epoch": 0.15006779902940337, + "grad_norm": 1.742431640625, + "learning_rate": 4.727295727789556e-05, + "loss": 4.9891, + "step": 25233 + }, + { + "epoch": 0.15007374631268436, + "grad_norm": 2.8610570430755615, + "learning_rate": 4.7272745134843175e-05, + "loss": 3.9769, + "step": 25234 + }, + { + "epoch": 0.15007969359596537, + "grad_norm": 1.6757450103759766, + "learning_rate": 4.72725329840156e-05, + "loss": 5.4376, + "step": 25235 + }, + { + "epoch": 0.15008564087924636, + "grad_norm": 1.6358832120895386, + "learning_rate": 4.727232082541293e-05, + "loss": 5.6665, + "step": 25236 + }, + { + "epoch": 0.15009158816252735, + "grad_norm": 1.8907593488693237, + "learning_rate": 4.727210865903522e-05, + "loss": 5.4225, + "step": 25237 + }, + { + "epoch": 0.15009753544580837, + "grad_norm": 1.5822373628616333, + "learning_rate": 4.727189648488254e-05, + "loss": 5.5356, + "step": 25238 + }, + { + "epoch": 0.15010348272908935, + "grad_norm": 1.626504898071289, + "learning_rate": 4.7271684302954974e-05, + "loss": 5.2066, + "step": 25239 + }, + { + "epoch": 0.15010943001237034, + "grad_norm": 1.7297816276550293, + "learning_rate": 4.727147211325259e-05, + "loss": 5.109, + "step": 25240 + }, + { + "epoch": 0.15011537729565136, + "grad_norm": 1.6709920167922974, + "learning_rate": 4.727125991577547e-05, + "loss": 5.2468, + "step": 25241 + }, + { + "epoch": 0.15012132457893235, + "grad_norm": 1.5390464067459106, + "learning_rate": 4.727104771052368e-05, + "loss": 5.237, + "step": 25242 + }, + { + "epoch": 0.15012727186221334, + "grad_norm": 1.4673635959625244, + "learning_rate": 4.72708354974973e-05, + "loss": 5.2971, + "step": 25243 + }, + { + "epoch": 0.15013321914549435, + "grad_norm": 1.6094917058944702, + "learning_rate": 4.7270623276696394e-05, + "loss": 5.3539, + "step": 25244 + }, + { + "epoch": 0.15013916642877534, + "grad_norm": 1.697434902191162, + "learning_rate": 4.727041104812105e-05, + "loss": 4.9796, + "step": 25245 + }, + { + "epoch": 0.15014511371205633, + "grad_norm": 1.7680538892745972, + "learning_rate": 4.727019881177134e-05, + "loss": 5.0622, + "step": 25246 + }, + { + "epoch": 0.15015106099533734, + "grad_norm": 1.6313658952713013, + "learning_rate": 4.7269986567647324e-05, + "loss": 5.0507, + "step": 25247 + }, + { + "epoch": 0.15015700827861833, + "grad_norm": 1.6400883197784424, + "learning_rate": 4.72697743157491e-05, + "loss": 4.9752, + "step": 25248 + }, + { + "epoch": 0.15016295556189932, + "grad_norm": 1.6866703033447266, + "learning_rate": 4.726956205607671e-05, + "loss": 5.2475, + "step": 25249 + }, + { + "epoch": 0.15016890284518034, + "grad_norm": 1.5988578796386719, + "learning_rate": 4.7269349788630255e-05, + "loss": 4.9963, + "step": 25250 + }, + { + "epoch": 0.15017485012846132, + "grad_norm": 1.8661000728607178, + "learning_rate": 4.7269137513409796e-05, + "loss": 4.7149, + "step": 25251 + }, + { + "epoch": 0.1501807974117423, + "grad_norm": 1.5544322729110718, + "learning_rate": 4.726892523041541e-05, + "loss": 5.0037, + "step": 25252 + }, + { + "epoch": 0.1501867446950233, + "grad_norm": 1.6971745491027832, + "learning_rate": 4.726871293964718e-05, + "loss": 5.1207, + "step": 25253 + }, + { + "epoch": 0.15019269197830432, + "grad_norm": 1.508044958114624, + "learning_rate": 4.726850064110517e-05, + "loss": 5.3578, + "step": 25254 + }, + { + "epoch": 0.1501986392615853, + "grad_norm": 1.7235703468322754, + "learning_rate": 4.726828833478946e-05, + "loss": 5.3506, + "step": 25255 + }, + { + "epoch": 0.1502045865448663, + "grad_norm": 1.7117946147918701, + "learning_rate": 4.726807602070011e-05, + "loss": 5.0023, + "step": 25256 + }, + { + "epoch": 0.1502105338281473, + "grad_norm": 1.6594294309616089, + "learning_rate": 4.726786369883721e-05, + "loss": 4.8674, + "step": 25257 + }, + { + "epoch": 0.1502164811114283, + "grad_norm": 1.7046406269073486, + "learning_rate": 4.7267651369200825e-05, + "loss": 4.9614, + "step": 25258 + }, + { + "epoch": 0.15022242839470928, + "grad_norm": 1.6488447189331055, + "learning_rate": 4.726743903179104e-05, + "loss": 5.0612, + "step": 25259 + }, + { + "epoch": 0.1502283756779903, + "grad_norm": 1.5859414339065552, + "learning_rate": 4.726722668660792e-05, + "loss": 4.9399, + "step": 25260 + }, + { + "epoch": 0.1502343229612713, + "grad_norm": 2.1271414756774902, + "learning_rate": 4.726701433365154e-05, + "loss": 5.0729, + "step": 25261 + }, + { + "epoch": 0.15024027024455228, + "grad_norm": 1.9313926696777344, + "learning_rate": 4.726680197292198e-05, + "loss": 5.271, + "step": 25262 + }, + { + "epoch": 0.1502462175278333, + "grad_norm": 1.933329463005066, + "learning_rate": 4.72665896044193e-05, + "loss": 5.0125, + "step": 25263 + }, + { + "epoch": 0.15025216481111428, + "grad_norm": 1.7074263095855713, + "learning_rate": 4.726637722814359e-05, + "loss": 4.8612, + "step": 25264 + }, + { + "epoch": 0.15025811209439527, + "grad_norm": 2.2242465019226074, + "learning_rate": 4.7266164844094915e-05, + "loss": 4.5163, + "step": 25265 + }, + { + "epoch": 0.15026405937767628, + "grad_norm": 1.5982950925827026, + "learning_rate": 4.726595245227336e-05, + "loss": 5.2747, + "step": 25266 + }, + { + "epoch": 0.15027000666095727, + "grad_norm": 2.0305862426757812, + "learning_rate": 4.726574005267898e-05, + "loss": 4.6378, + "step": 25267 + }, + { + "epoch": 0.15027595394423826, + "grad_norm": 1.7604337930679321, + "learning_rate": 4.726552764531187e-05, + "loss": 5.0755, + "step": 25268 + }, + { + "epoch": 0.15028190122751928, + "grad_norm": 1.9310117959976196, + "learning_rate": 4.7265315230172087e-05, + "loss": 4.5722, + "step": 25269 + }, + { + "epoch": 0.15028784851080026, + "grad_norm": 1.7772380113601685, + "learning_rate": 4.726510280725972e-05, + "loss": 4.8739, + "step": 25270 + }, + { + "epoch": 0.15029379579408125, + "grad_norm": 1.635905385017395, + "learning_rate": 4.7264890376574824e-05, + "loss": 4.8656, + "step": 25271 + }, + { + "epoch": 0.15029974307736227, + "grad_norm": 1.7308213710784912, + "learning_rate": 4.7264677938117496e-05, + "loss": 4.8062, + "step": 25272 + }, + { + "epoch": 0.15030569036064326, + "grad_norm": 1.751625895500183, + "learning_rate": 4.7264465491887786e-05, + "loss": 4.9999, + "step": 25273 + }, + { + "epoch": 0.15031163764392425, + "grad_norm": 1.9022659063339233, + "learning_rate": 4.726425303788579e-05, + "loss": 4.3717, + "step": 25274 + }, + { + "epoch": 0.15031758492720526, + "grad_norm": 1.6903055906295776, + "learning_rate": 4.7264040576111576e-05, + "loss": 4.6601, + "step": 25275 + }, + { + "epoch": 0.15032353221048625, + "grad_norm": 1.7622424364089966, + "learning_rate": 4.726382810656521e-05, + "loss": 4.711, + "step": 25276 + }, + { + "epoch": 0.15032947949376724, + "grad_norm": 1.6687418222427368, + "learning_rate": 4.726361562924678e-05, + "loss": 4.8469, + "step": 25277 + }, + { + "epoch": 0.15033542677704825, + "grad_norm": 1.6430240869522095, + "learning_rate": 4.7263403144156334e-05, + "loss": 4.7209, + "step": 25278 + }, + { + "epoch": 0.15034137406032924, + "grad_norm": 1.8600574731826782, + "learning_rate": 4.726319065129398e-05, + "loss": 4.465, + "step": 25279 + }, + { + "epoch": 0.15034732134361023, + "grad_norm": 1.4847289323806763, + "learning_rate": 4.7262978150659776e-05, + "loss": 5.3048, + "step": 25280 + }, + { + "epoch": 0.15035326862689125, + "grad_norm": 1.5062929391860962, + "learning_rate": 4.726276564225379e-05, + "loss": 5.0202, + "step": 25281 + }, + { + "epoch": 0.15035921591017223, + "grad_norm": 1.999292254447937, + "learning_rate": 4.7262553126076106e-05, + "loss": 4.2882, + "step": 25282 + }, + { + "epoch": 0.15036516319345322, + "grad_norm": 1.7813308238983154, + "learning_rate": 4.7262340602126794e-05, + "loss": 4.7198, + "step": 25283 + }, + { + "epoch": 0.15037111047673424, + "grad_norm": 1.8029576539993286, + "learning_rate": 4.726212807040593e-05, + "loss": 4.9741, + "step": 25284 + }, + { + "epoch": 0.15037705776001523, + "grad_norm": 1.629035472869873, + "learning_rate": 4.726191553091358e-05, + "loss": 5.1917, + "step": 25285 + }, + { + "epoch": 0.15038300504329621, + "grad_norm": 1.54799222946167, + "learning_rate": 4.726170298364983e-05, + "loss": 4.9093, + "step": 25286 + }, + { + "epoch": 0.15038895232657723, + "grad_norm": 1.8892208337783813, + "learning_rate": 4.726149042861475e-05, + "loss": 4.2702, + "step": 25287 + }, + { + "epoch": 0.15039489960985822, + "grad_norm": 1.7078487873077393, + "learning_rate": 4.726127786580842e-05, + "loss": 4.2082, + "step": 25288 + }, + { + "epoch": 0.1504008468931392, + "grad_norm": 1.818529725074768, + "learning_rate": 4.72610652952309e-05, + "loss": 4.5002, + "step": 25289 + }, + { + "epoch": 0.15040679417642022, + "grad_norm": 1.600824236869812, + "learning_rate": 4.726085271688227e-05, + "loss": 4.8372, + "step": 25290 + }, + { + "epoch": 0.1504127414597012, + "grad_norm": 1.6711620092391968, + "learning_rate": 4.726064013076261e-05, + "loss": 4.8079, + "step": 25291 + }, + { + "epoch": 0.1504186887429822, + "grad_norm": 1.7478057146072388, + "learning_rate": 4.7260427536871985e-05, + "loss": 4.7123, + "step": 25292 + }, + { + "epoch": 0.15042463602626321, + "grad_norm": 1.6385493278503418, + "learning_rate": 4.726021493521048e-05, + "loss": 4.8043, + "step": 25293 + }, + { + "epoch": 0.1504305833095442, + "grad_norm": 1.6353743076324463, + "learning_rate": 4.7260002325778165e-05, + "loss": 4.7891, + "step": 25294 + }, + { + "epoch": 0.1504365305928252, + "grad_norm": 1.8076624870300293, + "learning_rate": 4.725978970857511e-05, + "loss": 4.502, + "step": 25295 + }, + { + "epoch": 0.1504424778761062, + "grad_norm": 2.979780673980713, + "learning_rate": 4.72595770836014e-05, + "loss": 3.7136, + "step": 25296 + }, + { + "epoch": 0.1504484251593872, + "grad_norm": 1.698283314704895, + "learning_rate": 4.7259364450857096e-05, + "loss": 4.9292, + "step": 25297 + }, + { + "epoch": 0.15045437244266818, + "grad_norm": 1.577962040901184, + "learning_rate": 4.725915181034228e-05, + "loss": 5.177, + "step": 25298 + }, + { + "epoch": 0.1504603197259492, + "grad_norm": 1.7820360660552979, + "learning_rate": 4.725893916205702e-05, + "loss": 4.6215, + "step": 25299 + }, + { + "epoch": 0.1504662670092302, + "grad_norm": 1.8856147527694702, + "learning_rate": 4.7258726506001396e-05, + "loss": 4.49, + "step": 25300 + }, + { + "epoch": 0.15047221429251117, + "grad_norm": 1.6485686302185059, + "learning_rate": 4.7258513842175475e-05, + "loss": 5.7732, + "step": 25301 + }, + { + "epoch": 0.1504781615757922, + "grad_norm": 2.143477439880371, + "learning_rate": 4.725830117057935e-05, + "loss": 4.8915, + "step": 25302 + }, + { + "epoch": 0.15048410885907318, + "grad_norm": 1.6669731140136719, + "learning_rate": 4.725808849121307e-05, + "loss": 5.1107, + "step": 25303 + }, + { + "epoch": 0.15049005614235417, + "grad_norm": 1.6642520427703857, + "learning_rate": 4.725787580407673e-05, + "loss": 4.5454, + "step": 25304 + }, + { + "epoch": 0.15049600342563518, + "grad_norm": 1.7125663757324219, + "learning_rate": 4.725766310917039e-05, + "loss": 4.7463, + "step": 25305 + }, + { + "epoch": 0.15050195070891617, + "grad_norm": 1.7411010265350342, + "learning_rate": 4.725745040649413e-05, + "loss": 4.643, + "step": 25306 + }, + { + "epoch": 0.15050789799219716, + "grad_norm": 1.8865814208984375, + "learning_rate": 4.725723769604803e-05, + "loss": 4.5555, + "step": 25307 + }, + { + "epoch": 0.15051384527547818, + "grad_norm": 1.6867681741714478, + "learning_rate": 4.725702497783215e-05, + "loss": 4.7334, + "step": 25308 + }, + { + "epoch": 0.15051979255875916, + "grad_norm": 1.5820156335830688, + "learning_rate": 4.7256812251846576e-05, + "loss": 5.5799, + "step": 25309 + }, + { + "epoch": 0.15052573984204015, + "grad_norm": 1.772575855255127, + "learning_rate": 4.725659951809138e-05, + "loss": 5.0303, + "step": 25310 + }, + { + "epoch": 0.15053168712532114, + "grad_norm": 1.7370164394378662, + "learning_rate": 4.725638677656663e-05, + "loss": 4.6378, + "step": 25311 + }, + { + "epoch": 0.15053763440860216, + "grad_norm": 1.6597602367401123, + "learning_rate": 4.725617402727241e-05, + "loss": 4.6918, + "step": 25312 + }, + { + "epoch": 0.15054358169188314, + "grad_norm": 1.6710939407348633, + "learning_rate": 4.725596127020879e-05, + "loss": 4.5664, + "step": 25313 + }, + { + "epoch": 0.15054952897516413, + "grad_norm": 1.7546216249465942, + "learning_rate": 4.725574850537584e-05, + "loss": 4.8903, + "step": 25314 + }, + { + "epoch": 0.15055547625844515, + "grad_norm": 1.8587819337844849, + "learning_rate": 4.725553573277365e-05, + "loss": 4.9894, + "step": 25315 + }, + { + "epoch": 0.15056142354172614, + "grad_norm": 1.3700711727142334, + "learning_rate": 4.725532295240227e-05, + "loss": 5.2452, + "step": 25316 + }, + { + "epoch": 0.15056737082500712, + "grad_norm": 1.7877662181854248, + "learning_rate": 4.725511016426179e-05, + "loss": 4.214, + "step": 25317 + }, + { + "epoch": 0.15057331810828814, + "grad_norm": 1.8162602186203003, + "learning_rate": 4.725489736835228e-05, + "loss": 4.9041, + "step": 25318 + }, + { + "epoch": 0.15057926539156913, + "grad_norm": 1.6758408546447754, + "learning_rate": 4.725468456467381e-05, + "loss": 4.3246, + "step": 25319 + }, + { + "epoch": 0.15058521267485012, + "grad_norm": 1.5553221702575684, + "learning_rate": 4.725447175322647e-05, + "loss": 5.1303, + "step": 25320 + }, + { + "epoch": 0.15059115995813113, + "grad_norm": 1.5233205556869507, + "learning_rate": 4.725425893401032e-05, + "loss": 5.4629, + "step": 25321 + }, + { + "epoch": 0.15059710724141212, + "grad_norm": 1.5840942859649658, + "learning_rate": 4.725404610702544e-05, + "loss": 5.12, + "step": 25322 + }, + { + "epoch": 0.1506030545246931, + "grad_norm": 1.787832260131836, + "learning_rate": 4.72538332722719e-05, + "loss": 5.2794, + "step": 25323 + }, + { + "epoch": 0.15060900180797412, + "grad_norm": 1.725203275680542, + "learning_rate": 4.725362042974978e-05, + "loss": 5.1121, + "step": 25324 + }, + { + "epoch": 0.1506149490912551, + "grad_norm": 1.5242986679077148, + "learning_rate": 4.725340757945914e-05, + "loss": 5.2826, + "step": 25325 + }, + { + "epoch": 0.1506208963745361, + "grad_norm": 1.9072916507720947, + "learning_rate": 4.725319472140007e-05, + "loss": 5.134, + "step": 25326 + }, + { + "epoch": 0.15062684365781712, + "grad_norm": 1.5604580640792847, + "learning_rate": 4.725298185557265e-05, + "loss": 5.1551, + "step": 25327 + }, + { + "epoch": 0.1506327909410981, + "grad_norm": 1.7541977167129517, + "learning_rate": 4.725276898197694e-05, + "loss": 4.6415, + "step": 25328 + }, + { + "epoch": 0.1506387382243791, + "grad_norm": 1.6959171295166016, + "learning_rate": 4.725255610061301e-05, + "loss": 4.9428, + "step": 25329 + }, + { + "epoch": 0.1506446855076601, + "grad_norm": 1.8614954948425293, + "learning_rate": 4.725234321148095e-05, + "loss": 5.2815, + "step": 25330 + }, + { + "epoch": 0.1506506327909411, + "grad_norm": 2.654698610305786, + "learning_rate": 4.725213031458082e-05, + "loss": 4.4367, + "step": 25331 + }, + { + "epoch": 0.15065658007422209, + "grad_norm": 2.4033470153808594, + "learning_rate": 4.7251917409912705e-05, + "loss": 4.6682, + "step": 25332 + }, + { + "epoch": 0.1506625273575031, + "grad_norm": 2.164626121520996, + "learning_rate": 4.725170449747668e-05, + "loss": 4.8865, + "step": 25333 + }, + { + "epoch": 0.1506684746407841, + "grad_norm": 2.046325445175171, + "learning_rate": 4.725149157727281e-05, + "loss": 4.9494, + "step": 25334 + }, + { + "epoch": 0.15067442192406508, + "grad_norm": 1.8939987421035767, + "learning_rate": 4.7251278649301175e-05, + "loss": 4.7641, + "step": 25335 + }, + { + "epoch": 0.1506803692073461, + "grad_norm": 1.6845778226852417, + "learning_rate": 4.725106571356185e-05, + "loss": 4.8831, + "step": 25336 + }, + { + "epoch": 0.15068631649062708, + "grad_norm": 1.7191179990768433, + "learning_rate": 4.7250852770054905e-05, + "loss": 4.9732, + "step": 25337 + }, + { + "epoch": 0.15069226377390807, + "grad_norm": 2.061174154281616, + "learning_rate": 4.725063981878042e-05, + "loss": 4.2263, + "step": 25338 + }, + { + "epoch": 0.15069821105718909, + "grad_norm": 2.3144235610961914, + "learning_rate": 4.7250426859738464e-05, + "loss": 4.2848, + "step": 25339 + }, + { + "epoch": 0.15070415834047007, + "grad_norm": 2.0103487968444824, + "learning_rate": 4.7250213892929115e-05, + "loss": 4.178, + "step": 25340 + }, + { + "epoch": 0.15071010562375106, + "grad_norm": 2.093339443206787, + "learning_rate": 4.725000091835245e-05, + "loss": 4.3689, + "step": 25341 + }, + { + "epoch": 0.15071605290703208, + "grad_norm": 2.085618495941162, + "learning_rate": 4.724978793600853e-05, + "loss": 4.1158, + "step": 25342 + }, + { + "epoch": 0.15072200019031307, + "grad_norm": 2.2095706462860107, + "learning_rate": 4.7249574945897445e-05, + "loss": 4.3338, + "step": 25343 + }, + { + "epoch": 0.15072794747359405, + "grad_norm": 2.169772148132324, + "learning_rate": 4.7249361948019267e-05, + "loss": 4.63, + "step": 25344 + }, + { + "epoch": 0.15073389475687507, + "grad_norm": 2.5633938312530518, + "learning_rate": 4.7249148942374054e-05, + "loss": 4.954, + "step": 25345 + }, + { + "epoch": 0.15073984204015606, + "grad_norm": 2.181420087814331, + "learning_rate": 4.72489359289619e-05, + "loss": 4.5234, + "step": 25346 + }, + { + "epoch": 0.15074578932343705, + "grad_norm": 2.265392541885376, + "learning_rate": 4.724872290778288e-05, + "loss": 4.1063, + "step": 25347 + }, + { + "epoch": 0.15075173660671806, + "grad_norm": 1.8531908988952637, + "learning_rate": 4.7248509878837054e-05, + "loss": 4.7115, + "step": 25348 + }, + { + "epoch": 0.15075768388999905, + "grad_norm": 2.096639633178711, + "learning_rate": 4.724829684212451e-05, + "loss": 4.2179, + "step": 25349 + }, + { + "epoch": 0.15076363117328004, + "grad_norm": 1.99870765209198, + "learning_rate": 4.72480837976453e-05, + "loss": 4.3259, + "step": 25350 + }, + { + "epoch": 0.15076957845656105, + "grad_norm": 2.024890422821045, + "learning_rate": 4.724787074539953e-05, + "loss": 4.1168, + "step": 25351 + }, + { + "epoch": 0.15077552573984204, + "grad_norm": 2.2805378437042236, + "learning_rate": 4.724765768538725e-05, + "loss": 4.3184, + "step": 25352 + }, + { + "epoch": 0.15078147302312303, + "grad_norm": 2.2098236083984375, + "learning_rate": 4.7247444617608535e-05, + "loss": 4.3815, + "step": 25353 + }, + { + "epoch": 0.15078742030640405, + "grad_norm": 2.6324753761291504, + "learning_rate": 4.724723154206348e-05, + "loss": 4.3017, + "step": 25354 + }, + { + "epoch": 0.15079336758968503, + "grad_norm": 3.0926623344421387, + "learning_rate": 4.724701845875215e-05, + "loss": 4.6768, + "step": 25355 + }, + { + "epoch": 0.15079931487296602, + "grad_norm": 2.8633837699890137, + "learning_rate": 4.7246805367674603e-05, + "loss": 4.3765, + "step": 25356 + }, + { + "epoch": 0.15080526215624704, + "grad_norm": 2.4857215881347656, + "learning_rate": 4.7246592268830924e-05, + "loss": 4.3245, + "step": 25357 + }, + { + "epoch": 0.15081120943952803, + "grad_norm": 3.3124706745147705, + "learning_rate": 4.72463791622212e-05, + "loss": 4.1451, + "step": 25358 + }, + { + "epoch": 0.15081715672280901, + "grad_norm": 2.3086657524108887, + "learning_rate": 4.724616604784549e-05, + "loss": 4.5879, + "step": 25359 + }, + { + "epoch": 0.15082310400609003, + "grad_norm": 2.082601308822632, + "learning_rate": 4.724595292570387e-05, + "loss": 5.1047, + "step": 25360 + }, + { + "epoch": 0.15082905128937102, + "grad_norm": 1.6798832416534424, + "learning_rate": 4.7245739795796426e-05, + "loss": 4.7877, + "step": 25361 + }, + { + "epoch": 0.150834998572652, + "grad_norm": 2.76798152923584, + "learning_rate": 4.724552665812322e-05, + "loss": 4.1044, + "step": 25362 + }, + { + "epoch": 0.15084094585593302, + "grad_norm": 2.7487802505493164, + "learning_rate": 4.724531351268433e-05, + "loss": 4.4089, + "step": 25363 + }, + { + "epoch": 0.150846893139214, + "grad_norm": 2.2958571910858154, + "learning_rate": 4.7245100359479833e-05, + "loss": 4.1923, + "step": 25364 + }, + { + "epoch": 0.150852840422495, + "grad_norm": 2.200896978378296, + "learning_rate": 4.7244887198509805e-05, + "loss": 4.3105, + "step": 25365 + }, + { + "epoch": 0.15085878770577602, + "grad_norm": 2.0711123943328857, + "learning_rate": 4.7244674029774307e-05, + "loss": 4.3327, + "step": 25366 + }, + { + "epoch": 0.150864734989057, + "grad_norm": 1.8481465578079224, + "learning_rate": 4.724446085327342e-05, + "loss": 4.7603, + "step": 25367 + }, + { + "epoch": 0.150870682272338, + "grad_norm": 1.5740338563919067, + "learning_rate": 4.7244247669007234e-05, + "loss": 4.7191, + "step": 25368 + }, + { + "epoch": 0.15087662955561898, + "grad_norm": 1.4988723993301392, + "learning_rate": 4.724403447697581e-05, + "loss": 4.6288, + "step": 25369 + }, + { + "epoch": 0.1508825768389, + "grad_norm": 1.8862982988357544, + "learning_rate": 4.7243821277179213e-05, + "loss": 4.6308, + "step": 25370 + }, + { + "epoch": 0.15088852412218098, + "grad_norm": 1.6412887573242188, + "learning_rate": 4.7243608069617534e-05, + "loss": 5.1476, + "step": 25371 + }, + { + "epoch": 0.15089447140546197, + "grad_norm": 1.58519446849823, + "learning_rate": 4.7243394854290847e-05, + "loss": 5.6586, + "step": 25372 + }, + { + "epoch": 0.150900418688743, + "grad_norm": 1.5548374652862549, + "learning_rate": 4.724318163119921e-05, + "loss": 5.4283, + "step": 25373 + }, + { + "epoch": 0.15090636597202398, + "grad_norm": 1.456405758857727, + "learning_rate": 4.724296840034271e-05, + "loss": 5.3778, + "step": 25374 + }, + { + "epoch": 0.15091231325530496, + "grad_norm": 1.2034344673156738, + "learning_rate": 4.7242755161721424e-05, + "loss": 5.1189, + "step": 25375 + }, + { + "epoch": 0.15091826053858598, + "grad_norm": 2.2144997119903564, + "learning_rate": 4.724254191533543e-05, + "loss": 4.7091, + "step": 25376 + }, + { + "epoch": 0.15092420782186697, + "grad_norm": 2.322824239730835, + "learning_rate": 4.7242328661184774e-05, + "loss": 4.3568, + "step": 25377 + }, + { + "epoch": 0.15093015510514796, + "grad_norm": 2.832406997680664, + "learning_rate": 4.7242115399269567e-05, + "loss": 4.156, + "step": 25378 + }, + { + "epoch": 0.15093610238842897, + "grad_norm": 2.5387492179870605, + "learning_rate": 4.724190212958986e-05, + "loss": 4.2464, + "step": 25379 + }, + { + "epoch": 0.15094204967170996, + "grad_norm": 2.3497941493988037, + "learning_rate": 4.724168885214574e-05, + "loss": 4.2937, + "step": 25380 + }, + { + "epoch": 0.15094799695499095, + "grad_norm": 1.9066410064697266, + "learning_rate": 4.724147556693727e-05, + "loss": 4.3862, + "step": 25381 + }, + { + "epoch": 0.15095394423827196, + "grad_norm": 1.981546401977539, + "learning_rate": 4.724126227396454e-05, + "loss": 4.2936, + "step": 25382 + }, + { + "epoch": 0.15095989152155295, + "grad_norm": 1.7924445867538452, + "learning_rate": 4.7241048973227604e-05, + "loss": 5.173, + "step": 25383 + }, + { + "epoch": 0.15096583880483394, + "grad_norm": 1.985730528831482, + "learning_rate": 4.724083566472655e-05, + "loss": 4.6256, + "step": 25384 + }, + { + "epoch": 0.15097178608811496, + "grad_norm": 1.7368820905685425, + "learning_rate": 4.7240622348461457e-05, + "loss": 5.2259, + "step": 25385 + }, + { + "epoch": 0.15097773337139594, + "grad_norm": 1.761334776878357, + "learning_rate": 4.724040902443239e-05, + "loss": 4.8674, + "step": 25386 + }, + { + "epoch": 0.15098368065467693, + "grad_norm": 2.460028886795044, + "learning_rate": 4.724019569263942e-05, + "loss": 4.6597, + "step": 25387 + }, + { + "epoch": 0.15098962793795795, + "grad_norm": 2.524463176727295, + "learning_rate": 4.723998235308263e-05, + "loss": 4.2823, + "step": 25388 + }, + { + "epoch": 0.15099557522123894, + "grad_norm": 2.211486577987671, + "learning_rate": 4.723976900576209e-05, + "loss": 4.2802, + "step": 25389 + }, + { + "epoch": 0.15100152250451992, + "grad_norm": 2.323294162750244, + "learning_rate": 4.723955565067788e-05, + "loss": 4.2044, + "step": 25390 + }, + { + "epoch": 0.15100746978780094, + "grad_norm": 2.0671331882476807, + "learning_rate": 4.723934228783007e-05, + "loss": 4.2368, + "step": 25391 + }, + { + "epoch": 0.15101341707108193, + "grad_norm": 2.4726204872131348, + "learning_rate": 4.723912891721874e-05, + "loss": 3.9728, + "step": 25392 + }, + { + "epoch": 0.15101936435436292, + "grad_norm": 2.278228998184204, + "learning_rate": 4.7238915538843954e-05, + "loss": 4.0742, + "step": 25393 + }, + { + "epoch": 0.15102531163764393, + "grad_norm": 2.3213517665863037, + "learning_rate": 4.7238702152705794e-05, + "loss": 4.2124, + "step": 25394 + }, + { + "epoch": 0.15103125892092492, + "grad_norm": 1.7494871616363525, + "learning_rate": 4.7238488758804334e-05, + "loss": 5.1252, + "step": 25395 + }, + { + "epoch": 0.1510372062042059, + "grad_norm": 1.8289192914962769, + "learning_rate": 4.723827535713965e-05, + "loss": 4.9194, + "step": 25396 + }, + { + "epoch": 0.15104315348748693, + "grad_norm": 1.7058460712432861, + "learning_rate": 4.723806194771181e-05, + "loss": 5.2878, + "step": 25397 + }, + { + "epoch": 0.1510491007707679, + "grad_norm": 2.0224595069885254, + "learning_rate": 4.723784853052089e-05, + "loss": 4.4899, + "step": 25398 + }, + { + "epoch": 0.1510550480540489, + "grad_norm": 2.4246976375579834, + "learning_rate": 4.723763510556697e-05, + "loss": 3.9646, + "step": 25399 + }, + { + "epoch": 0.15106099533732992, + "grad_norm": 2.473158597946167, + "learning_rate": 4.723742167285012e-05, + "loss": 4.1942, + "step": 25400 + }, + { + "epoch": 0.1510669426206109, + "grad_norm": 3.9526100158691406, + "learning_rate": 4.723720823237041e-05, + "loss": 3.6103, + "step": 25401 + }, + { + "epoch": 0.1510728899038919, + "grad_norm": 3.6537516117095947, + "learning_rate": 4.723699478412793e-05, + "loss": 4.2312, + "step": 25402 + }, + { + "epoch": 0.1510788371871729, + "grad_norm": 1.5094470977783203, + "learning_rate": 4.7236781328122745e-05, + "loss": 5.577, + "step": 25403 + }, + { + "epoch": 0.1510847844704539, + "grad_norm": 1.7783223390579224, + "learning_rate": 4.7236567864354924e-05, + "loss": 5.6923, + "step": 25404 + }, + { + "epoch": 0.15109073175373489, + "grad_norm": 1.8453465700149536, + "learning_rate": 4.723635439282455e-05, + "loss": 5.3975, + "step": 25405 + }, + { + "epoch": 0.1510966790370159, + "grad_norm": 1.7783082723617554, + "learning_rate": 4.723614091353169e-05, + "loss": 5.2236, + "step": 25406 + }, + { + "epoch": 0.1511026263202969, + "grad_norm": 1.6507834196090698, + "learning_rate": 4.723592742647643e-05, + "loss": 5.3565, + "step": 25407 + }, + { + "epoch": 0.15110857360357788, + "grad_norm": 1.4875059127807617, + "learning_rate": 4.723571393165883e-05, + "loss": 5.5752, + "step": 25408 + }, + { + "epoch": 0.1511145208868589, + "grad_norm": 1.6694411039352417, + "learning_rate": 4.7235500429078985e-05, + "loss": 5.4707, + "step": 25409 + }, + { + "epoch": 0.15112046817013988, + "grad_norm": 1.7157987356185913, + "learning_rate": 4.723528691873694e-05, + "loss": 5.3777, + "step": 25410 + }, + { + "epoch": 0.15112641545342087, + "grad_norm": 2.611750602722168, + "learning_rate": 4.72350734006328e-05, + "loss": 3.1969, + "step": 25411 + }, + { + "epoch": 0.1511323627367019, + "grad_norm": 2.0207319259643555, + "learning_rate": 4.7234859874766614e-05, + "loss": 4.8871, + "step": 25412 + }, + { + "epoch": 0.15113831001998287, + "grad_norm": 2.598403215408325, + "learning_rate": 4.723464634113847e-05, + "loss": 4.9404, + "step": 25413 + }, + { + "epoch": 0.15114425730326386, + "grad_norm": 1.764269232749939, + "learning_rate": 4.723443279974845e-05, + "loss": 5.2649, + "step": 25414 + }, + { + "epoch": 0.15115020458654488, + "grad_norm": 1.8783745765686035, + "learning_rate": 4.723421925059661e-05, + "loss": 4.8755, + "step": 25415 + }, + { + "epoch": 0.15115615186982587, + "grad_norm": 1.497833251953125, + "learning_rate": 4.7234005693683035e-05, + "loss": 5.0806, + "step": 25416 + }, + { + "epoch": 0.15116209915310685, + "grad_norm": 1.6030247211456299, + "learning_rate": 4.72337921290078e-05, + "loss": 5.0388, + "step": 25417 + }, + { + "epoch": 0.15116804643638787, + "grad_norm": 1.7181298732757568, + "learning_rate": 4.723357855657098e-05, + "loss": 4.8316, + "step": 25418 + }, + { + "epoch": 0.15117399371966886, + "grad_norm": 1.4665559530258179, + "learning_rate": 4.7233364976372644e-05, + "loss": 5.5005, + "step": 25419 + }, + { + "epoch": 0.15117994100294985, + "grad_norm": 3.3794503211975098, + "learning_rate": 4.723315138841287e-05, + "loss": 3.9864, + "step": 25420 + }, + { + "epoch": 0.15118588828623086, + "grad_norm": 1.7290079593658447, + "learning_rate": 4.723293779269173e-05, + "loss": 5.3736, + "step": 25421 + }, + { + "epoch": 0.15119183556951185, + "grad_norm": 1.995943307876587, + "learning_rate": 4.723272418920931e-05, + "loss": 4.8142, + "step": 25422 + }, + { + "epoch": 0.15119778285279284, + "grad_norm": 1.8627694845199585, + "learning_rate": 4.7232510577965674e-05, + "loss": 5.2348, + "step": 25423 + }, + { + "epoch": 0.15120373013607386, + "grad_norm": 1.5469872951507568, + "learning_rate": 4.72322969589609e-05, + "loss": 5.1102, + "step": 25424 + }, + { + "epoch": 0.15120967741935484, + "grad_norm": 1.503350853919983, + "learning_rate": 4.723208333219505e-05, + "loss": 5.2009, + "step": 25425 + }, + { + "epoch": 0.15121562470263583, + "grad_norm": 1.5141102075576782, + "learning_rate": 4.7231869697668214e-05, + "loss": 5.4231, + "step": 25426 + }, + { + "epoch": 0.15122157198591682, + "grad_norm": 1.5022274255752563, + "learning_rate": 4.723165605538046e-05, + "loss": 5.1454, + "step": 25427 + }, + { + "epoch": 0.15122751926919784, + "grad_norm": 1.2774550914764404, + "learning_rate": 4.7231442405331874e-05, + "loss": 5.4048, + "step": 25428 + }, + { + "epoch": 0.15123346655247882, + "grad_norm": 1.4588242769241333, + "learning_rate": 4.723122874752251e-05, + "loss": 5.1466, + "step": 25429 + }, + { + "epoch": 0.1512394138357598, + "grad_norm": 1.6666613817214966, + "learning_rate": 4.7231015081952454e-05, + "loss": 5.6505, + "step": 25430 + }, + { + "epoch": 0.15124536111904083, + "grad_norm": 3.1419155597686768, + "learning_rate": 4.72308014086218e-05, + "loss": 5.1714, + "step": 25431 + }, + { + "epoch": 0.15125130840232182, + "grad_norm": 1.8372479677200317, + "learning_rate": 4.723058772753058e-05, + "loss": 5.3135, + "step": 25432 + }, + { + "epoch": 0.1512572556856028, + "grad_norm": 1.4300392866134644, + "learning_rate": 4.7230374038678895e-05, + "loss": 5.4404, + "step": 25433 + }, + { + "epoch": 0.15126320296888382, + "grad_norm": 1.4411662817001343, + "learning_rate": 4.723016034206682e-05, + "loss": 5.4341, + "step": 25434 + }, + { + "epoch": 0.1512691502521648, + "grad_norm": 1.4989326000213623, + "learning_rate": 4.7229946637694425e-05, + "loss": 5.3632, + "step": 25435 + }, + { + "epoch": 0.1512750975354458, + "grad_norm": 1.2930675745010376, + "learning_rate": 4.7229732925561785e-05, + "loss": 5.1667, + "step": 25436 + }, + { + "epoch": 0.1512810448187268, + "grad_norm": 1.6399480104446411, + "learning_rate": 4.722951920566898e-05, + "loss": 5.0464, + "step": 25437 + }, + { + "epoch": 0.1512869921020078, + "grad_norm": 1.6308560371398926, + "learning_rate": 4.722930547801608e-05, + "loss": 5.416, + "step": 25438 + }, + { + "epoch": 0.1512929393852888, + "grad_norm": 1.8431388139724731, + "learning_rate": 4.722909174260316e-05, + "loss": 5.6069, + "step": 25439 + }, + { + "epoch": 0.1512988866685698, + "grad_norm": 1.964154601097107, + "learning_rate": 4.722887799943028e-05, + "loss": 5.845, + "step": 25440 + }, + { + "epoch": 0.1513048339518508, + "grad_norm": 1.731370210647583, + "learning_rate": 4.722866424849753e-05, + "loss": 5.3155, + "step": 25441 + }, + { + "epoch": 0.15131078123513178, + "grad_norm": 1.9794760942459106, + "learning_rate": 4.7228450489805e-05, + "loss": 4.8395, + "step": 25442 + }, + { + "epoch": 0.1513167285184128, + "grad_norm": 2.016857862472534, + "learning_rate": 4.7228236723352735e-05, + "loss": 4.5546, + "step": 25443 + }, + { + "epoch": 0.15132267580169378, + "grad_norm": 1.9085549116134644, + "learning_rate": 4.722802294914083e-05, + "loss": 4.7848, + "step": 25444 + }, + { + "epoch": 0.15132862308497477, + "grad_norm": 1.5769025087356567, + "learning_rate": 4.7227809167169345e-05, + "loss": 5.1207, + "step": 25445 + }, + { + "epoch": 0.1513345703682558, + "grad_norm": 1.4327126741409302, + "learning_rate": 4.7227595377438364e-05, + "loss": 5.323, + "step": 25446 + }, + { + "epoch": 0.15134051765153678, + "grad_norm": 1.536750316619873, + "learning_rate": 4.722738157994796e-05, + "loss": 4.812, + "step": 25447 + }, + { + "epoch": 0.15134646493481776, + "grad_norm": 1.6312404870986938, + "learning_rate": 4.72271677746982e-05, + "loss": 4.8753, + "step": 25448 + }, + { + "epoch": 0.15135241221809878, + "grad_norm": 1.3323699235916138, + "learning_rate": 4.722695396168917e-05, + "loss": 5.6005, + "step": 25449 + }, + { + "epoch": 0.15135835950137977, + "grad_norm": 1.5522531270980835, + "learning_rate": 4.722674014092094e-05, + "loss": 5.3848, + "step": 25450 + }, + { + "epoch": 0.15136430678466076, + "grad_norm": 1.5421935319900513, + "learning_rate": 4.722652631239358e-05, + "loss": 5.4136, + "step": 25451 + }, + { + "epoch": 0.15137025406794177, + "grad_norm": 1.564570665359497, + "learning_rate": 4.722631247610718e-05, + "loss": 5.3169, + "step": 25452 + }, + { + "epoch": 0.15137620135122276, + "grad_norm": 1.7175198793411255, + "learning_rate": 4.72260986320618e-05, + "loss": 4.5904, + "step": 25453 + }, + { + "epoch": 0.15138214863450375, + "grad_norm": 1.5852707624435425, + "learning_rate": 4.722588478025751e-05, + "loss": 4.8459, + "step": 25454 + }, + { + "epoch": 0.15138809591778477, + "grad_norm": 1.4209281206130981, + "learning_rate": 4.7225670920694404e-05, + "loss": 5.4134, + "step": 25455 + }, + { + "epoch": 0.15139404320106575, + "grad_norm": 1.4841557741165161, + "learning_rate": 4.722545705337254e-05, + "loss": 5.0996, + "step": 25456 + }, + { + "epoch": 0.15139999048434674, + "grad_norm": 1.4958367347717285, + "learning_rate": 4.7225243178292e-05, + "loss": 4.5363, + "step": 25457 + }, + { + "epoch": 0.15140593776762776, + "grad_norm": 1.6424293518066406, + "learning_rate": 4.722502929545286e-05, + "loss": 5.0227, + "step": 25458 + }, + { + "epoch": 0.15141188505090875, + "grad_norm": 1.687121868133545, + "learning_rate": 4.722481540485519e-05, + "loss": 4.9662, + "step": 25459 + }, + { + "epoch": 0.15141783233418973, + "grad_norm": 1.6748243570327759, + "learning_rate": 4.722460150649907e-05, + "loss": 4.4443, + "step": 25460 + }, + { + "epoch": 0.15142377961747075, + "grad_norm": 2.2483417987823486, + "learning_rate": 4.722438760038456e-05, + "loss": 4.8411, + "step": 25461 + }, + { + "epoch": 0.15142972690075174, + "grad_norm": 1.6556822061538696, + "learning_rate": 4.7224173686511754e-05, + "loss": 5.1596, + "step": 25462 + }, + { + "epoch": 0.15143567418403273, + "grad_norm": 1.6137731075286865, + "learning_rate": 4.722395976488072e-05, + "loss": 4.6538, + "step": 25463 + }, + { + "epoch": 0.15144162146731374, + "grad_norm": 1.7086783647537231, + "learning_rate": 4.722374583549153e-05, + "loss": 5.2168, + "step": 25464 + }, + { + "epoch": 0.15144756875059473, + "grad_norm": 1.668527603149414, + "learning_rate": 4.7223531898344256e-05, + "loss": 5.138, + "step": 25465 + }, + { + "epoch": 0.15145351603387572, + "grad_norm": 2.2906320095062256, + "learning_rate": 4.722331795343899e-05, + "loss": 4.6954, + "step": 25466 + }, + { + "epoch": 0.15145946331715673, + "grad_norm": 2.410048246383667, + "learning_rate": 4.722310400077578e-05, + "loss": 4.5377, + "step": 25467 + }, + { + "epoch": 0.15146541060043772, + "grad_norm": 1.7885384559631348, + "learning_rate": 4.722289004035471e-05, + "loss": 4.8978, + "step": 25468 + }, + { + "epoch": 0.1514713578837187, + "grad_norm": 1.5193252563476562, + "learning_rate": 4.7222676072175866e-05, + "loss": 5.2818, + "step": 25469 + }, + { + "epoch": 0.15147730516699973, + "grad_norm": 2.0139195919036865, + "learning_rate": 4.7222462096239314e-05, + "loss": 4.1632, + "step": 25470 + }, + { + "epoch": 0.15148325245028071, + "grad_norm": 2.007025718688965, + "learning_rate": 4.7222248112545133e-05, + "loss": 4.0832, + "step": 25471 + }, + { + "epoch": 0.1514891997335617, + "grad_norm": 2.2270402908325195, + "learning_rate": 4.722203412109339e-05, + "loss": 4.2317, + "step": 25472 + }, + { + "epoch": 0.15149514701684272, + "grad_norm": 2.0418808460235596, + "learning_rate": 4.722182012188417e-05, + "loss": 4.1849, + "step": 25473 + }, + { + "epoch": 0.1515010943001237, + "grad_norm": 2.087785243988037, + "learning_rate": 4.722160611491754e-05, + "loss": 4.1218, + "step": 25474 + }, + { + "epoch": 0.1515070415834047, + "grad_norm": 2.303571939468384, + "learning_rate": 4.7221392100193575e-05, + "loss": 3.9614, + "step": 25475 + }, + { + "epoch": 0.1515129888666857, + "grad_norm": 1.9516772031784058, + "learning_rate": 4.722117807771235e-05, + "loss": 3.9619, + "step": 25476 + }, + { + "epoch": 0.1515189361499667, + "grad_norm": 1.9611634016036987, + "learning_rate": 4.722096404747395e-05, + "loss": 3.9133, + "step": 25477 + }, + { + "epoch": 0.1515248834332477, + "grad_norm": 1.9254827499389648, + "learning_rate": 4.722075000947843e-05, + "loss": 3.877, + "step": 25478 + }, + { + "epoch": 0.1515308307165287, + "grad_norm": 1.803846001625061, + "learning_rate": 4.722053596372588e-05, + "loss": 3.8338, + "step": 25479 + }, + { + "epoch": 0.1515367779998097, + "grad_norm": 1.829439401626587, + "learning_rate": 4.722032191021637e-05, + "loss": 3.8183, + "step": 25480 + }, + { + "epoch": 0.15154272528309068, + "grad_norm": 1.7955585718154907, + "learning_rate": 4.722010784894998e-05, + "loss": 4.6821, + "step": 25481 + }, + { + "epoch": 0.1515486725663717, + "grad_norm": 2.9624781608581543, + "learning_rate": 4.7219893779926775e-05, + "loss": 3.9385, + "step": 25482 + }, + { + "epoch": 0.15155461984965268, + "grad_norm": 1.8687463998794556, + "learning_rate": 4.721967970314684e-05, + "loss": 4.0364, + "step": 25483 + }, + { + "epoch": 0.15156056713293367, + "grad_norm": 1.9090644121170044, + "learning_rate": 4.721946561861024e-05, + "loss": 3.8046, + "step": 25484 + }, + { + "epoch": 0.15156651441621466, + "grad_norm": 1.9757955074310303, + "learning_rate": 4.721925152631706e-05, + "loss": 3.943, + "step": 25485 + }, + { + "epoch": 0.15157246169949568, + "grad_norm": 1.9161666631698608, + "learning_rate": 4.7219037426267356e-05, + "loss": 3.8818, + "step": 25486 + }, + { + "epoch": 0.15157840898277666, + "grad_norm": 1.8484982252120972, + "learning_rate": 4.7218823318461226e-05, + "loss": 4.0713, + "step": 25487 + }, + { + "epoch": 0.15158435626605765, + "grad_norm": 1.6787267923355103, + "learning_rate": 4.7218609202898726e-05, + "loss": 5.7814, + "step": 25488 + }, + { + "epoch": 0.15159030354933867, + "grad_norm": 1.6946018934249878, + "learning_rate": 4.7218395079579946e-05, + "loss": 5.9241, + "step": 25489 + }, + { + "epoch": 0.15159625083261966, + "grad_norm": 1.5210212469100952, + "learning_rate": 4.721818094850495e-05, + "loss": 6.0828, + "step": 25490 + }, + { + "epoch": 0.15160219811590064, + "grad_norm": 1.7792625427246094, + "learning_rate": 4.721796680967382e-05, + "loss": 6.241, + "step": 25491 + }, + { + "epoch": 0.15160814539918166, + "grad_norm": 1.5366078615188599, + "learning_rate": 4.7217752663086626e-05, + "loss": 5.7111, + "step": 25492 + }, + { + "epoch": 0.15161409268246265, + "grad_norm": 1.5193569660186768, + "learning_rate": 4.721753850874344e-05, + "loss": 5.3155, + "step": 25493 + }, + { + "epoch": 0.15162003996574364, + "grad_norm": 1.9060078859329224, + "learning_rate": 4.7217324346644356e-05, + "loss": 5.368, + "step": 25494 + }, + { + "epoch": 0.15162598724902465, + "grad_norm": 1.4217309951782227, + "learning_rate": 4.7217110176789416e-05, + "loss": 5.4781, + "step": 25495 + }, + { + "epoch": 0.15163193453230564, + "grad_norm": 1.561132550239563, + "learning_rate": 4.7216895999178725e-05, + "loss": 5.3316, + "step": 25496 + }, + { + "epoch": 0.15163788181558663, + "grad_norm": 1.397314429283142, + "learning_rate": 4.7216681813812335e-05, + "loss": 5.4047, + "step": 25497 + }, + { + "epoch": 0.15164382909886764, + "grad_norm": 1.3138307332992554, + "learning_rate": 4.7216467620690335e-05, + "loss": 5.3706, + "step": 25498 + }, + { + "epoch": 0.15164977638214863, + "grad_norm": 1.4298443794250488, + "learning_rate": 4.7216253419812794e-05, + "loss": 5.3704, + "step": 25499 + }, + { + "epoch": 0.15165572366542962, + "grad_norm": 1.703792929649353, + "learning_rate": 4.72160392111798e-05, + "loss": 5.2468, + "step": 25500 + }, + { + "epoch": 0.15166167094871064, + "grad_norm": 1.566309928894043, + "learning_rate": 4.72158249947914e-05, + "loss": 5.5153, + "step": 25501 + }, + { + "epoch": 0.15166761823199162, + "grad_norm": 1.3141274452209473, + "learning_rate": 4.721561077064769e-05, + "loss": 5.6254, + "step": 25502 + }, + { + "epoch": 0.1516735655152726, + "grad_norm": 1.4979000091552734, + "learning_rate": 4.721539653874874e-05, + "loss": 5.4936, + "step": 25503 + }, + { + "epoch": 0.15167951279855363, + "grad_norm": 1.694068193435669, + "learning_rate": 4.721518229909463e-05, + "loss": 5.6601, + "step": 25504 + }, + { + "epoch": 0.15168546008183462, + "grad_norm": 1.8887871503829956, + "learning_rate": 4.721496805168543e-05, + "loss": 4.8596, + "step": 25505 + }, + { + "epoch": 0.1516914073651156, + "grad_norm": 2.5169517993927, + "learning_rate": 4.721475379652121e-05, + "loss": 4.0797, + "step": 25506 + }, + { + "epoch": 0.15169735464839662, + "grad_norm": 2.4206509590148926, + "learning_rate": 4.7214539533602046e-05, + "loss": 3.9878, + "step": 25507 + }, + { + "epoch": 0.1517033019316776, + "grad_norm": 2.054685354232788, + "learning_rate": 4.7214325262928013e-05, + "loss": 3.948, + "step": 25508 + }, + { + "epoch": 0.1517092492149586, + "grad_norm": 1.4626624584197998, + "learning_rate": 4.721411098449919e-05, + "loss": 5.4617, + "step": 25509 + }, + { + "epoch": 0.1517151964982396, + "grad_norm": 1.7592542171478271, + "learning_rate": 4.721389669831566e-05, + "loss": 5.4125, + "step": 25510 + }, + { + "epoch": 0.1517211437815206, + "grad_norm": 1.669419288635254, + "learning_rate": 4.721368240437748e-05, + "loss": 5.4718, + "step": 25511 + }, + { + "epoch": 0.1517270910648016, + "grad_norm": 1.0741300582885742, + "learning_rate": 4.721346810268473e-05, + "loss": 5.5668, + "step": 25512 + }, + { + "epoch": 0.1517330383480826, + "grad_norm": 1.41902494430542, + "learning_rate": 4.72132537932375e-05, + "loss": 5.5451, + "step": 25513 + }, + { + "epoch": 0.1517389856313636, + "grad_norm": 1.7693331241607666, + "learning_rate": 4.721303947603584e-05, + "loss": 5.7588, + "step": 25514 + }, + { + "epoch": 0.15174493291464458, + "grad_norm": 1.7695659399032593, + "learning_rate": 4.7212825151079844e-05, + "loss": 5.6659, + "step": 25515 + }, + { + "epoch": 0.1517508801979256, + "grad_norm": 1.5901025533676147, + "learning_rate": 4.7212610818369586e-05, + "loss": 5.3805, + "step": 25516 + }, + { + "epoch": 0.15175682748120659, + "grad_norm": 1.8363381624221802, + "learning_rate": 4.721239647790512e-05, + "loss": 5.808, + "step": 25517 + }, + { + "epoch": 0.15176277476448757, + "grad_norm": 1.7976000308990479, + "learning_rate": 4.721218212968655e-05, + "loss": 5.7034, + "step": 25518 + }, + { + "epoch": 0.1517687220477686, + "grad_norm": 1.7203330993652344, + "learning_rate": 4.721196777371393e-05, + "loss": 5.4174, + "step": 25519 + }, + { + "epoch": 0.15177466933104958, + "grad_norm": 1.6678218841552734, + "learning_rate": 4.7211753409987344e-05, + "loss": 5.4002, + "step": 25520 + }, + { + "epoch": 0.15178061661433057, + "grad_norm": 1.3932818174362183, + "learning_rate": 4.721153903850686e-05, + "loss": 5.7598, + "step": 25521 + }, + { + "epoch": 0.15178656389761158, + "grad_norm": 1.4975392818450928, + "learning_rate": 4.721132465927256e-05, + "loss": 5.2991, + "step": 25522 + }, + { + "epoch": 0.15179251118089257, + "grad_norm": 1.5375689268112183, + "learning_rate": 4.721111027228452e-05, + "loss": 5.7456, + "step": 25523 + }, + { + "epoch": 0.15179845846417356, + "grad_norm": 1.6894830465316772, + "learning_rate": 4.72108958775428e-05, + "loss": 5.1867, + "step": 25524 + }, + { + "epoch": 0.15180440574745457, + "grad_norm": 1.569059133529663, + "learning_rate": 4.72106814750475e-05, + "loss": 5.4544, + "step": 25525 + }, + { + "epoch": 0.15181035303073556, + "grad_norm": 1.5884952545166016, + "learning_rate": 4.721046706479867e-05, + "loss": 5.1496, + "step": 25526 + }, + { + "epoch": 0.15181630031401655, + "grad_norm": 1.552410364151001, + "learning_rate": 4.721025264679639e-05, + "loss": 5.0916, + "step": 25527 + }, + { + "epoch": 0.15182224759729757, + "grad_norm": 1.5972039699554443, + "learning_rate": 4.721003822104076e-05, + "loss": 5.2073, + "step": 25528 + }, + { + "epoch": 0.15182819488057855, + "grad_norm": 1.6742616891860962, + "learning_rate": 4.720982378753182e-05, + "loss": 5.4851, + "step": 25529 + }, + { + "epoch": 0.15183414216385954, + "grad_norm": 1.4974780082702637, + "learning_rate": 4.7209609346269665e-05, + "loss": 5.4444, + "step": 25530 + }, + { + "epoch": 0.15184008944714056, + "grad_norm": 1.5599150657653809, + "learning_rate": 4.7209394897254363e-05, + "loss": 4.8842, + "step": 25531 + }, + { + "epoch": 0.15184603673042155, + "grad_norm": 1.3979945182800293, + "learning_rate": 4.7209180440485986e-05, + "loss": 5.2836, + "step": 25532 + }, + { + "epoch": 0.15185198401370253, + "grad_norm": 1.3515275716781616, + "learning_rate": 4.720896597596462e-05, + "loss": 5.3011, + "step": 25533 + }, + { + "epoch": 0.15185793129698355, + "grad_norm": 1.7592774629592896, + "learning_rate": 4.720875150369034e-05, + "loss": 5.0874, + "step": 25534 + }, + { + "epoch": 0.15186387858026454, + "grad_norm": 1.5977163314819336, + "learning_rate": 4.72085370236632e-05, + "loss": 4.7678, + "step": 25535 + }, + { + "epoch": 0.15186982586354553, + "grad_norm": 1.3309252262115479, + "learning_rate": 4.7208322535883295e-05, + "loss": 4.9821, + "step": 25536 + }, + { + "epoch": 0.15187577314682654, + "grad_norm": 1.5985299348831177, + "learning_rate": 4.720810804035069e-05, + "loss": 5.1845, + "step": 25537 + }, + { + "epoch": 0.15188172043010753, + "grad_norm": 1.6021031141281128, + "learning_rate": 4.7207893537065475e-05, + "loss": 5.1628, + "step": 25538 + }, + { + "epoch": 0.15188766771338852, + "grad_norm": 1.6445283889770508, + "learning_rate": 4.7207679026027704e-05, + "loss": 4.7933, + "step": 25539 + }, + { + "epoch": 0.15189361499666953, + "grad_norm": 1.6480634212493896, + "learning_rate": 4.7207464507237474e-05, + "loss": 4.7912, + "step": 25540 + }, + { + "epoch": 0.15189956227995052, + "grad_norm": 1.7439652681350708, + "learning_rate": 4.720724998069483e-05, + "loss": 4.5412, + "step": 25541 + }, + { + "epoch": 0.1519055095632315, + "grad_norm": 1.5786992311477661, + "learning_rate": 4.720703544639988e-05, + "loss": 4.8873, + "step": 25542 + }, + { + "epoch": 0.1519114568465125, + "grad_norm": 1.3782871961593628, + "learning_rate": 4.7206820904352675e-05, + "loss": 4.5825, + "step": 25543 + }, + { + "epoch": 0.15191740412979352, + "grad_norm": 1.8048298358917236, + "learning_rate": 4.72066063545533e-05, + "loss": 4.746, + "step": 25544 + }, + { + "epoch": 0.1519233514130745, + "grad_norm": 1.4801894426345825, + "learning_rate": 4.7206391797001826e-05, + "loss": 4.8802, + "step": 25545 + }, + { + "epoch": 0.1519292986963555, + "grad_norm": 1.7984564304351807, + "learning_rate": 4.7206177231698333e-05, + "loss": 4.7674, + "step": 25546 + }, + { + "epoch": 0.1519352459796365, + "grad_norm": 1.7244421243667603, + "learning_rate": 4.72059626586429e-05, + "loss": 5.2729, + "step": 25547 + }, + { + "epoch": 0.1519411932629175, + "grad_norm": 1.2454429864883423, + "learning_rate": 4.7205748077835584e-05, + "loss": 4.9657, + "step": 25548 + }, + { + "epoch": 0.15194714054619848, + "grad_norm": 1.5179264545440674, + "learning_rate": 4.720553348927647e-05, + "loss": 5.2248, + "step": 25549 + }, + { + "epoch": 0.1519530878294795, + "grad_norm": 1.6204310655593872, + "learning_rate": 4.7205318892965636e-05, + "loss": 4.7349, + "step": 25550 + }, + { + "epoch": 0.1519590351127605, + "grad_norm": 1.6427180767059326, + "learning_rate": 4.7205104288903156e-05, + "loss": 4.9733, + "step": 25551 + }, + { + "epoch": 0.15196498239604148, + "grad_norm": 1.7110134363174438, + "learning_rate": 4.7204889677089104e-05, + "loss": 5.1714, + "step": 25552 + }, + { + "epoch": 0.1519709296793225, + "grad_norm": 1.6110901832580566, + "learning_rate": 4.7204675057523556e-05, + "loss": 5.409, + "step": 25553 + }, + { + "epoch": 0.15197687696260348, + "grad_norm": 1.7748627662658691, + "learning_rate": 4.720446043020658e-05, + "loss": 5.443, + "step": 25554 + }, + { + "epoch": 0.15198282424588447, + "grad_norm": 1.574576497077942, + "learning_rate": 4.720424579513826e-05, + "loss": 4.9988, + "step": 25555 + }, + { + "epoch": 0.15198877152916548, + "grad_norm": 1.4916949272155762, + "learning_rate": 4.720403115231867e-05, + "loss": 4.9242, + "step": 25556 + }, + { + "epoch": 0.15199471881244647, + "grad_norm": 1.4862215518951416, + "learning_rate": 4.7203816501747875e-05, + "loss": 5.2778, + "step": 25557 + }, + { + "epoch": 0.15200066609572746, + "grad_norm": 1.445859670639038, + "learning_rate": 4.720360184342597e-05, + "loss": 5.6821, + "step": 25558 + }, + { + "epoch": 0.15200661337900848, + "grad_norm": 1.5154931545257568, + "learning_rate": 4.7203387177353006e-05, + "loss": 5.1821, + "step": 25559 + }, + { + "epoch": 0.15201256066228946, + "grad_norm": 1.1950480937957764, + "learning_rate": 4.720317250352907e-05, + "loss": 5.55, + "step": 25560 + }, + { + "epoch": 0.15201850794557045, + "grad_norm": 1.4134416580200195, + "learning_rate": 4.720295782195423e-05, + "loss": 5.7252, + "step": 25561 + }, + { + "epoch": 0.15202445522885147, + "grad_norm": 1.5440611839294434, + "learning_rate": 4.720274313262858e-05, + "loss": 5.5527, + "step": 25562 + }, + { + "epoch": 0.15203040251213246, + "grad_norm": 1.3670108318328857, + "learning_rate": 4.720252843555217e-05, + "loss": 5.459, + "step": 25563 + }, + { + "epoch": 0.15203634979541344, + "grad_norm": 1.4591896533966064, + "learning_rate": 4.7202313730725094e-05, + "loss": 5.4654, + "step": 25564 + }, + { + "epoch": 0.15204229707869446, + "grad_norm": 1.675755500793457, + "learning_rate": 4.7202099018147414e-05, + "loss": 5.4915, + "step": 25565 + }, + { + "epoch": 0.15204824436197545, + "grad_norm": 1.9771230220794678, + "learning_rate": 4.720188429781922e-05, + "loss": 4.8577, + "step": 25566 + }, + { + "epoch": 0.15205419164525644, + "grad_norm": 1.3904792070388794, + "learning_rate": 4.720166956974057e-05, + "loss": 5.4445, + "step": 25567 + }, + { + "epoch": 0.15206013892853745, + "grad_norm": 1.4478521347045898, + "learning_rate": 4.720145483391155e-05, + "loss": 5.1729, + "step": 25568 + }, + { + "epoch": 0.15206608621181844, + "grad_norm": 2.138211250305176, + "learning_rate": 4.720124009033223e-05, + "loss": 4.0202, + "step": 25569 + }, + { + "epoch": 0.15207203349509943, + "grad_norm": 2.1613049507141113, + "learning_rate": 4.720102533900268e-05, + "loss": 4.0708, + "step": 25570 + }, + { + "epoch": 0.15207798077838044, + "grad_norm": 2.3467164039611816, + "learning_rate": 4.7200810579922996e-05, + "loss": 4.0428, + "step": 25571 + }, + { + "epoch": 0.15208392806166143, + "grad_norm": 2.0889739990234375, + "learning_rate": 4.720059581309323e-05, + "loss": 4.1653, + "step": 25572 + }, + { + "epoch": 0.15208987534494242, + "grad_norm": 1.611956238746643, + "learning_rate": 4.720038103851346e-05, + "loss": 5.3328, + "step": 25573 + }, + { + "epoch": 0.15209582262822344, + "grad_norm": 1.3318549394607544, + "learning_rate": 4.7200166256183776e-05, + "loss": 5.4102, + "step": 25574 + }, + { + "epoch": 0.15210176991150443, + "grad_norm": 1.674455165863037, + "learning_rate": 4.7199951466104234e-05, + "loss": 5.21, + "step": 25575 + }, + { + "epoch": 0.1521077171947854, + "grad_norm": 1.4780274629592896, + "learning_rate": 4.7199736668274924e-05, + "loss": 5.3385, + "step": 25576 + }, + { + "epoch": 0.15211366447806643, + "grad_norm": 1.7735114097595215, + "learning_rate": 4.719952186269592e-05, + "loss": 4.8768, + "step": 25577 + }, + { + "epoch": 0.15211961176134742, + "grad_norm": 1.6420248746871948, + "learning_rate": 4.719930704936728e-05, + "loss": 5.2584, + "step": 25578 + }, + { + "epoch": 0.1521255590446284, + "grad_norm": 1.970648169517517, + "learning_rate": 4.71990922282891e-05, + "loss": 4.4764, + "step": 25579 + }, + { + "epoch": 0.15213150632790942, + "grad_norm": 1.4318586587905884, + "learning_rate": 4.719887739946145e-05, + "loss": 5.5169, + "step": 25580 + }, + { + "epoch": 0.1521374536111904, + "grad_norm": 1.7637288570404053, + "learning_rate": 4.719866256288439e-05, + "loss": 5.1493, + "step": 25581 + }, + { + "epoch": 0.1521434008944714, + "grad_norm": 1.7159098386764526, + "learning_rate": 4.719844771855801e-05, + "loss": 5.3964, + "step": 25582 + }, + { + "epoch": 0.1521493481777524, + "grad_norm": 1.6556905508041382, + "learning_rate": 4.719823286648238e-05, + "loss": 5.3116, + "step": 25583 + }, + { + "epoch": 0.1521552954610334, + "grad_norm": 1.5177308320999146, + "learning_rate": 4.7198018006657584e-05, + "loss": 5.8963, + "step": 25584 + }, + { + "epoch": 0.1521612427443144, + "grad_norm": 1.960729718208313, + "learning_rate": 4.719780313908368e-05, + "loss": 5.266, + "step": 25585 + }, + { + "epoch": 0.1521671900275954, + "grad_norm": 1.6893891096115112, + "learning_rate": 4.719758826376076e-05, + "loss": 5.3618, + "step": 25586 + }, + { + "epoch": 0.1521731373108764, + "grad_norm": 1.5606249570846558, + "learning_rate": 4.719737338068889e-05, + "loss": 5.8684, + "step": 25587 + }, + { + "epoch": 0.15217908459415738, + "grad_norm": 1.6435186862945557, + "learning_rate": 4.7197158489868143e-05, + "loss": 4.9082, + "step": 25588 + }, + { + "epoch": 0.1521850318774384, + "grad_norm": 1.9077845811843872, + "learning_rate": 4.71969435912986e-05, + "loss": 4.0132, + "step": 25589 + }, + { + "epoch": 0.1521909791607194, + "grad_norm": 1.4427006244659424, + "learning_rate": 4.719672868498034e-05, + "loss": 5.5848, + "step": 25590 + }, + { + "epoch": 0.15219692644400037, + "grad_norm": 1.671826958656311, + "learning_rate": 4.719651377091342e-05, + "loss": 5.0797, + "step": 25591 + }, + { + "epoch": 0.1522028737272814, + "grad_norm": 1.8073980808258057, + "learning_rate": 4.719629884909793e-05, + "loss": 3.8879, + "step": 25592 + }, + { + "epoch": 0.15220882101056238, + "grad_norm": 1.8267574310302734, + "learning_rate": 4.719608391953394e-05, + "loss": 3.8104, + "step": 25593 + }, + { + "epoch": 0.15221476829384337, + "grad_norm": 1.8598294258117676, + "learning_rate": 4.7195868982221526e-05, + "loss": 3.6587, + "step": 25594 + }, + { + "epoch": 0.15222071557712438, + "grad_norm": 1.705465316772461, + "learning_rate": 4.7195654037160765e-05, + "loss": 3.9886, + "step": 25595 + }, + { + "epoch": 0.15222666286040537, + "grad_norm": 1.8253175020217896, + "learning_rate": 4.7195439084351734e-05, + "loss": 3.9031, + "step": 25596 + }, + { + "epoch": 0.15223261014368636, + "grad_norm": 1.718245506286621, + "learning_rate": 4.71952241237945e-05, + "loss": 4.2814, + "step": 25597 + }, + { + "epoch": 0.15223855742696737, + "grad_norm": 1.7115817070007324, + "learning_rate": 4.719500915548914e-05, + "loss": 4.748, + "step": 25598 + }, + { + "epoch": 0.15224450471024836, + "grad_norm": 1.53532874584198, + "learning_rate": 4.719479417943574e-05, + "loss": 5.499, + "step": 25599 + }, + { + "epoch": 0.15225045199352935, + "grad_norm": 1.854274868965149, + "learning_rate": 4.719457919563436e-05, + "loss": 4.1188, + "step": 25600 + }, + { + "epoch": 0.15225639927681037, + "grad_norm": 2.001619338989258, + "learning_rate": 4.7194364204085085e-05, + "loss": 3.89, + "step": 25601 + }, + { + "epoch": 0.15226234656009136, + "grad_norm": 1.9772802591323853, + "learning_rate": 4.7194149204787986e-05, + "loss": 3.8764, + "step": 25602 + }, + { + "epoch": 0.15226829384337234, + "grad_norm": 1.9361356496810913, + "learning_rate": 4.719393419774314e-05, + "loss": 5.0285, + "step": 25603 + }, + { + "epoch": 0.15227424112665333, + "grad_norm": 1.6824191808700562, + "learning_rate": 4.719371918295061e-05, + "loss": 5.2847, + "step": 25604 + }, + { + "epoch": 0.15228018840993435, + "grad_norm": 2.423736095428467, + "learning_rate": 4.7193504160410495e-05, + "loss": 4.087, + "step": 25605 + }, + { + "epoch": 0.15228613569321534, + "grad_norm": 1.711818814277649, + "learning_rate": 4.719328913012285e-05, + "loss": 5.0702, + "step": 25606 + }, + { + "epoch": 0.15229208297649632, + "grad_norm": 2.406665325164795, + "learning_rate": 4.7193074092087765e-05, + "loss": 4.1674, + "step": 25607 + }, + { + "epoch": 0.15229803025977734, + "grad_norm": 2.0252084732055664, + "learning_rate": 4.71928590463053e-05, + "loss": 3.9202, + "step": 25608 + }, + { + "epoch": 0.15230397754305833, + "grad_norm": 1.6908705234527588, + "learning_rate": 4.7192643992775534e-05, + "loss": 4.5446, + "step": 25609 + }, + { + "epoch": 0.15230992482633932, + "grad_norm": 1.2706576585769653, + "learning_rate": 4.719242893149855e-05, + "loss": 5.6578, + "step": 25610 + }, + { + "epoch": 0.15231587210962033, + "grad_norm": 1.380682349205017, + "learning_rate": 4.719221386247442e-05, + "loss": 5.6256, + "step": 25611 + }, + { + "epoch": 0.15232181939290132, + "grad_norm": 1.6104844808578491, + "learning_rate": 4.7191998785703214e-05, + "loss": 5.5271, + "step": 25612 + }, + { + "epoch": 0.1523277666761823, + "grad_norm": 1.5654959678649902, + "learning_rate": 4.719178370118502e-05, + "loss": 5.0767, + "step": 25613 + }, + { + "epoch": 0.15233371395946332, + "grad_norm": 1.7980438470840454, + "learning_rate": 4.719156860891989e-05, + "loss": 4.6667, + "step": 25614 + }, + { + "epoch": 0.1523396612427443, + "grad_norm": 1.6443228721618652, + "learning_rate": 4.719135350890792e-05, + "loss": 4.2763, + "step": 25615 + }, + { + "epoch": 0.1523456085260253, + "grad_norm": 1.442205548286438, + "learning_rate": 4.719113840114918e-05, + "loss": 5.0442, + "step": 25616 + }, + { + "epoch": 0.15235155580930632, + "grad_norm": 1.5215251445770264, + "learning_rate": 4.719092328564374e-05, + "loss": 5.2175, + "step": 25617 + }, + { + "epoch": 0.1523575030925873, + "grad_norm": 1.4463436603546143, + "learning_rate": 4.7190708162391677e-05, + "loss": 5.6153, + "step": 25618 + }, + { + "epoch": 0.1523634503758683, + "grad_norm": 1.624923825263977, + "learning_rate": 4.719049303139307e-05, + "loss": 5.4211, + "step": 25619 + }, + { + "epoch": 0.1523693976591493, + "grad_norm": 1.5821541547775269, + "learning_rate": 4.719027789264799e-05, + "loss": 5.7905, + "step": 25620 + }, + { + "epoch": 0.1523753449424303, + "grad_norm": 1.6683502197265625, + "learning_rate": 4.719006274615651e-05, + "loss": 5.112, + "step": 25621 + }, + { + "epoch": 0.15238129222571128, + "grad_norm": 1.3617998361587524, + "learning_rate": 4.7189847591918714e-05, + "loss": 5.3799, + "step": 25622 + }, + { + "epoch": 0.1523872395089923, + "grad_norm": 1.5106703042984009, + "learning_rate": 4.718963242993466e-05, + "loss": 4.9833, + "step": 25623 + }, + { + "epoch": 0.1523931867922733, + "grad_norm": 1.7020819187164307, + "learning_rate": 4.718941726020445e-05, + "loss": 4.2403, + "step": 25624 + }, + { + "epoch": 0.15239913407555428, + "grad_norm": 1.5678812265396118, + "learning_rate": 4.7189202082728133e-05, + "loss": 5.0985, + "step": 25625 + }, + { + "epoch": 0.1524050813588353, + "grad_norm": 1.4727619886398315, + "learning_rate": 4.71889868975058e-05, + "loss": 4.9088, + "step": 25626 + }, + { + "epoch": 0.15241102864211628, + "grad_norm": 1.5460275411605835, + "learning_rate": 4.7188771704537515e-05, + "loss": 5.2766, + "step": 25627 + }, + { + "epoch": 0.15241697592539727, + "grad_norm": 1.5763301849365234, + "learning_rate": 4.7188556503823366e-05, + "loss": 4.9134, + "step": 25628 + }, + { + "epoch": 0.15242292320867828, + "grad_norm": 1.8980252742767334, + "learning_rate": 4.718834129536341e-05, + "loss": 4.9331, + "step": 25629 + }, + { + "epoch": 0.15242887049195927, + "grad_norm": 2.768523693084717, + "learning_rate": 4.7188126079157744e-05, + "loss": 4.3952, + "step": 25630 + }, + { + "epoch": 0.15243481777524026, + "grad_norm": 2.6490437984466553, + "learning_rate": 4.718791085520643e-05, + "loss": 4.1387, + "step": 25631 + }, + { + "epoch": 0.15244076505852128, + "grad_norm": 1.806143879890442, + "learning_rate": 4.718769562350955e-05, + "loss": 4.7686, + "step": 25632 + }, + { + "epoch": 0.15244671234180227, + "grad_norm": 1.6871095895767212, + "learning_rate": 4.718748038406717e-05, + "loss": 5.3937, + "step": 25633 + }, + { + "epoch": 0.15245265962508325, + "grad_norm": 2.2100014686584473, + "learning_rate": 4.7187265136879364e-05, + "loss": 4.7869, + "step": 25634 + }, + { + "epoch": 0.15245860690836427, + "grad_norm": 1.978220820426941, + "learning_rate": 4.7187049881946224e-05, + "loss": 4.4701, + "step": 25635 + }, + { + "epoch": 0.15246455419164526, + "grad_norm": 1.8031092882156372, + "learning_rate": 4.718683461926781e-05, + "loss": 4.5107, + "step": 25636 + }, + { + "epoch": 0.15247050147492625, + "grad_norm": 1.795417308807373, + "learning_rate": 4.7186619348844196e-05, + "loss": 5.2659, + "step": 25637 + }, + { + "epoch": 0.15247644875820726, + "grad_norm": 2.3051810264587402, + "learning_rate": 4.718640407067547e-05, + "loss": 4.5413, + "step": 25638 + }, + { + "epoch": 0.15248239604148825, + "grad_norm": 1.983340859413147, + "learning_rate": 4.71861887847617e-05, + "loss": 4.5167, + "step": 25639 + }, + { + "epoch": 0.15248834332476924, + "grad_norm": 1.7354977130889893, + "learning_rate": 4.718597349110295e-05, + "loss": 4.5704, + "step": 25640 + }, + { + "epoch": 0.15249429060805025, + "grad_norm": 1.9091737270355225, + "learning_rate": 4.7185758189699313e-05, + "loss": 4.4381, + "step": 25641 + }, + { + "epoch": 0.15250023789133124, + "grad_norm": 1.8753962516784668, + "learning_rate": 4.718554288055086e-05, + "loss": 4.445, + "step": 25642 + }, + { + "epoch": 0.15250618517461223, + "grad_norm": 1.7315021753311157, + "learning_rate": 4.718532756365765e-05, + "loss": 4.7802, + "step": 25643 + }, + { + "epoch": 0.15251213245789325, + "grad_norm": 1.4017493724822998, + "learning_rate": 4.718511223901979e-05, + "loss": 5.3923, + "step": 25644 + }, + { + "epoch": 0.15251807974117423, + "grad_norm": 1.8367207050323486, + "learning_rate": 4.7184896906637326e-05, + "loss": 4.6229, + "step": 25645 + }, + { + "epoch": 0.15252402702445522, + "grad_norm": 2.3250296115875244, + "learning_rate": 4.718468156651035e-05, + "loss": 4.6332, + "step": 25646 + }, + { + "epoch": 0.15252997430773624, + "grad_norm": 2.047855854034424, + "learning_rate": 4.7184466218638925e-05, + "loss": 4.5316, + "step": 25647 + }, + { + "epoch": 0.15253592159101723, + "grad_norm": 1.9817044734954834, + "learning_rate": 4.7184250863023125e-05, + "loss": 4.3888, + "step": 25648 + }, + { + "epoch": 0.15254186887429821, + "grad_norm": 1.889957308769226, + "learning_rate": 4.718403549966305e-05, + "loss": 4.6436, + "step": 25649 + }, + { + "epoch": 0.15254781615757923, + "grad_norm": 1.4799065589904785, + "learning_rate": 4.718382012855874e-05, + "loss": 4.7965, + "step": 25650 + }, + { + "epoch": 0.15255376344086022, + "grad_norm": 2.046947717666626, + "learning_rate": 4.7183604749710296e-05, + "loss": 4.3206, + "step": 25651 + }, + { + "epoch": 0.1525597107241412, + "grad_norm": 1.970746636390686, + "learning_rate": 4.718338936311778e-05, + "loss": 4.3668, + "step": 25652 + }, + { + "epoch": 0.15256565800742222, + "grad_norm": 1.889931321144104, + "learning_rate": 4.718317396878128e-05, + "loss": 4.3436, + "step": 25653 + }, + { + "epoch": 0.1525716052907032, + "grad_norm": 2.0069503784179688, + "learning_rate": 4.7182958566700865e-05, + "loss": 4.5258, + "step": 25654 + }, + { + "epoch": 0.1525775525739842, + "grad_norm": 2.222224712371826, + "learning_rate": 4.7182743156876596e-05, + "loss": 4.362, + "step": 25655 + }, + { + "epoch": 0.15258349985726521, + "grad_norm": 2.2478747367858887, + "learning_rate": 4.718252773930857e-05, + "loss": 4.7401, + "step": 25656 + }, + { + "epoch": 0.1525894471405462, + "grad_norm": 2.224696636199951, + "learning_rate": 4.718231231399685e-05, + "loss": 4.5413, + "step": 25657 + }, + { + "epoch": 0.1525953944238272, + "grad_norm": 1.9385725259780884, + "learning_rate": 4.718209688094152e-05, + "loss": 4.7279, + "step": 25658 + }, + { + "epoch": 0.1526013417071082, + "grad_norm": 2.030127763748169, + "learning_rate": 4.718188144014264e-05, + "loss": 4.4943, + "step": 25659 + }, + { + "epoch": 0.1526072889903892, + "grad_norm": 2.115994453430176, + "learning_rate": 4.7181665991600296e-05, + "loss": 4.5709, + "step": 25660 + }, + { + "epoch": 0.15261323627367018, + "grad_norm": 1.6957606077194214, + "learning_rate": 4.718145053531456e-05, + "loss": 4.8779, + "step": 25661 + }, + { + "epoch": 0.15261918355695117, + "grad_norm": 1.9567986726760864, + "learning_rate": 4.718123507128551e-05, + "loss": 4.5541, + "step": 25662 + }, + { + "epoch": 0.1526251308402322, + "grad_norm": 2.147771120071411, + "learning_rate": 4.718101959951323e-05, + "loss": 4.5141, + "step": 25663 + }, + { + "epoch": 0.15263107812351318, + "grad_norm": 2.1374590396881104, + "learning_rate": 4.7180804119997774e-05, + "loss": 4.3474, + "step": 25664 + }, + { + "epoch": 0.15263702540679416, + "grad_norm": 2.060826539993286, + "learning_rate": 4.718058863273923e-05, + "loss": 4.4178, + "step": 25665 + }, + { + "epoch": 0.15264297269007518, + "grad_norm": 1.9931002855300903, + "learning_rate": 4.7180373137737673e-05, + "loss": 4.3213, + "step": 25666 + }, + { + "epoch": 0.15264891997335617, + "grad_norm": 1.3702372312545776, + "learning_rate": 4.718015763499318e-05, + "loss": 5.0551, + "step": 25667 + }, + { + "epoch": 0.15265486725663716, + "grad_norm": 1.8524867296218872, + "learning_rate": 4.7179942124505814e-05, + "loss": 5.0618, + "step": 25668 + }, + { + "epoch": 0.15266081453991817, + "grad_norm": 1.876756191253662, + "learning_rate": 4.717972660627567e-05, + "loss": 4.2719, + "step": 25669 + }, + { + "epoch": 0.15266676182319916, + "grad_norm": 2.0334908962249756, + "learning_rate": 4.7179511080302804e-05, + "loss": 4.5764, + "step": 25670 + }, + { + "epoch": 0.15267270910648015, + "grad_norm": 2.554891347885132, + "learning_rate": 4.717929554658731e-05, + "loss": 4.6706, + "step": 25671 + }, + { + "epoch": 0.15267865638976116, + "grad_norm": 2.032592296600342, + "learning_rate": 4.717908000512925e-05, + "loss": 4.9648, + "step": 25672 + }, + { + "epoch": 0.15268460367304215, + "grad_norm": 1.6153349876403809, + "learning_rate": 4.7178864455928696e-05, + "loss": 5.2224, + "step": 25673 + }, + { + "epoch": 0.15269055095632314, + "grad_norm": 2.0942156314849854, + "learning_rate": 4.7178648898985734e-05, + "loss": 4.6427, + "step": 25674 + }, + { + "epoch": 0.15269649823960416, + "grad_norm": 1.9911080598831177, + "learning_rate": 4.717843333430043e-05, + "loss": 4.3348, + "step": 25675 + }, + { + "epoch": 0.15270244552288514, + "grad_norm": 2.017202377319336, + "learning_rate": 4.7178217761872866e-05, + "loss": 4.5306, + "step": 25676 + }, + { + "epoch": 0.15270839280616613, + "grad_norm": 1.9934179782867432, + "learning_rate": 4.7178002181703116e-05, + "loss": 4.7443, + "step": 25677 + }, + { + "epoch": 0.15271434008944715, + "grad_norm": 1.9597182273864746, + "learning_rate": 4.717778659379126e-05, + "loss": 4.5526, + "step": 25678 + }, + { + "epoch": 0.15272028737272814, + "grad_norm": 1.3593907356262207, + "learning_rate": 4.717757099813737e-05, + "loss": 5.5802, + "step": 25679 + }, + { + "epoch": 0.15272623465600912, + "grad_norm": 2.0012102127075195, + "learning_rate": 4.717735539474151e-05, + "loss": 5.0289, + "step": 25680 + }, + { + "epoch": 0.15273218193929014, + "grad_norm": 1.5621830224990845, + "learning_rate": 4.7177139783603765e-05, + "loss": 4.9388, + "step": 25681 + }, + { + "epoch": 0.15273812922257113, + "grad_norm": 1.502643346786499, + "learning_rate": 4.717692416472421e-05, + "loss": 5.3317, + "step": 25682 + }, + { + "epoch": 0.15274407650585212, + "grad_norm": 1.6496142148971558, + "learning_rate": 4.717670853810292e-05, + "loss": 5.9642, + "step": 25683 + }, + { + "epoch": 0.15275002378913313, + "grad_norm": 1.7263692617416382, + "learning_rate": 4.717649290373997e-05, + "loss": 4.9383, + "step": 25684 + }, + { + "epoch": 0.15275597107241412, + "grad_norm": 1.4914296865463257, + "learning_rate": 4.7176277261635434e-05, + "loss": 5.2599, + "step": 25685 + }, + { + "epoch": 0.1527619183556951, + "grad_norm": 1.3947960138320923, + "learning_rate": 4.71760616117894e-05, + "loss": 5.3177, + "step": 25686 + }, + { + "epoch": 0.15276786563897612, + "grad_norm": 1.6703267097473145, + "learning_rate": 4.717584595420192e-05, + "loss": 5.0309, + "step": 25687 + }, + { + "epoch": 0.1527738129222571, + "grad_norm": 1.622600793838501, + "learning_rate": 4.7175630288873083e-05, + "loss": 5.2554, + "step": 25688 + }, + { + "epoch": 0.1527797602055381, + "grad_norm": 1.678843379020691, + "learning_rate": 4.717541461580297e-05, + "loss": 5.012, + "step": 25689 + }, + { + "epoch": 0.15278570748881912, + "grad_norm": 2.2063186168670654, + "learning_rate": 4.717519893499164e-05, + "loss": 4.4479, + "step": 25690 + }, + { + "epoch": 0.1527916547721001, + "grad_norm": 2.0667500495910645, + "learning_rate": 4.717498324643918e-05, + "loss": 4.7081, + "step": 25691 + }, + { + "epoch": 0.1527976020553811, + "grad_norm": 2.192436695098877, + "learning_rate": 4.717476755014566e-05, + "loss": 4.7662, + "step": 25692 + }, + { + "epoch": 0.1528035493386621, + "grad_norm": 1.4742953777313232, + "learning_rate": 4.7174551846111165e-05, + "loss": 5.5788, + "step": 25693 + }, + { + "epoch": 0.1528094966219431, + "grad_norm": 1.7715102434158325, + "learning_rate": 4.7174336134335765e-05, + "loss": 5.203, + "step": 25694 + }, + { + "epoch": 0.15281544390522409, + "grad_norm": 2.406721353530884, + "learning_rate": 4.717412041481952e-05, + "loss": 4.7807, + "step": 25695 + }, + { + "epoch": 0.1528213911885051, + "grad_norm": 1.765756607055664, + "learning_rate": 4.7173904687562525e-05, + "loss": 5.2479, + "step": 25696 + }, + { + "epoch": 0.1528273384717861, + "grad_norm": 1.6135215759277344, + "learning_rate": 4.7173688952564856e-05, + "loss": 5.4787, + "step": 25697 + }, + { + "epoch": 0.15283328575506708, + "grad_norm": 1.5617319345474243, + "learning_rate": 4.7173473209826566e-05, + "loss": 5.02, + "step": 25698 + }, + { + "epoch": 0.1528392330383481, + "grad_norm": 1.4704324007034302, + "learning_rate": 4.7173257459347756e-05, + "loss": 5.1675, + "step": 25699 + }, + { + "epoch": 0.15284518032162908, + "grad_norm": 1.8787862062454224, + "learning_rate": 4.7173041701128496e-05, + "loss": 4.7247, + "step": 25700 + }, + { + "epoch": 0.15285112760491007, + "grad_norm": 3.8647372722625732, + "learning_rate": 4.7172825935168845e-05, + "loss": 3.5335, + "step": 25701 + }, + { + "epoch": 0.15285707488819109, + "grad_norm": 3.6721291542053223, + "learning_rate": 4.717261016146889e-05, + "loss": 2.8843, + "step": 25702 + }, + { + "epoch": 0.15286302217147207, + "grad_norm": 2.0848543643951416, + "learning_rate": 4.717239438002872e-05, + "loss": 4.4863, + "step": 25703 + }, + { + "epoch": 0.15286896945475306, + "grad_norm": 1.7783108949661255, + "learning_rate": 4.717217859084838e-05, + "loss": 5.2903, + "step": 25704 + }, + { + "epoch": 0.15287491673803408, + "grad_norm": 2.006303548812866, + "learning_rate": 4.717196279392797e-05, + "loss": 4.3923, + "step": 25705 + }, + { + "epoch": 0.15288086402131507, + "grad_norm": 2.4214632511138916, + "learning_rate": 4.7171746989267553e-05, + "loss": 3.3506, + "step": 25706 + }, + { + "epoch": 0.15288681130459605, + "grad_norm": 2.8976924419403076, + "learning_rate": 4.7171531176867214e-05, + "loss": 3.2211, + "step": 25707 + }, + { + "epoch": 0.15289275858787707, + "grad_norm": 3.2015345096588135, + "learning_rate": 4.717131535672702e-05, + "loss": 2.8205, + "step": 25708 + }, + { + "epoch": 0.15289870587115806, + "grad_norm": 3.559465169906616, + "learning_rate": 4.7171099528847044e-05, + "loss": 2.8882, + "step": 25709 + }, + { + "epoch": 0.15290465315443905, + "grad_norm": 3.3753960132598877, + "learning_rate": 4.717088369322737e-05, + "loss": 2.6752, + "step": 25710 + }, + { + "epoch": 0.15291060043772006, + "grad_norm": 2.129783868789673, + "learning_rate": 4.717066784986806e-05, + "loss": 3.9983, + "step": 25711 + }, + { + "epoch": 0.15291654772100105, + "grad_norm": 1.797956943511963, + "learning_rate": 4.7170451998769214e-05, + "loss": 4.8075, + "step": 25712 + }, + { + "epoch": 0.15292249500428204, + "grad_norm": 3.3450467586517334, + "learning_rate": 4.717023613993089e-05, + "loss": 4.177, + "step": 25713 + }, + { + "epoch": 0.15292844228756305, + "grad_norm": 2.303511381149292, + "learning_rate": 4.7170020273353164e-05, + "loss": 4.471, + "step": 25714 + }, + { + "epoch": 0.15293438957084404, + "grad_norm": 1.4113452434539795, + "learning_rate": 4.7169804399036105e-05, + "loss": 5.4846, + "step": 25715 + }, + { + "epoch": 0.15294033685412503, + "grad_norm": 1.7091588973999023, + "learning_rate": 4.71695885169798e-05, + "loss": 4.8856, + "step": 25716 + }, + { + "epoch": 0.15294628413740605, + "grad_norm": 2.783010244369507, + "learning_rate": 4.7169372627184326e-05, + "loss": 4.3426, + "step": 25717 + }, + { + "epoch": 0.15295223142068703, + "grad_norm": 1.4658305644989014, + "learning_rate": 4.716915672964975e-05, + "loss": 5.3191, + "step": 25718 + }, + { + "epoch": 0.15295817870396802, + "grad_norm": 1.2862242460250854, + "learning_rate": 4.716894082437615e-05, + "loss": 5.3939, + "step": 25719 + }, + { + "epoch": 0.152964125987249, + "grad_norm": 1.4098438024520874, + "learning_rate": 4.71687249113636e-05, + "loss": 5.4493, + "step": 25720 + }, + { + "epoch": 0.15297007327053003, + "grad_norm": 1.4778176546096802, + "learning_rate": 4.7168508990612183e-05, + "loss": 5.2679, + "step": 25721 + }, + { + "epoch": 0.15297602055381102, + "grad_norm": 1.5448487997055054, + "learning_rate": 4.716829306212196e-05, + "loss": 5.1446, + "step": 25722 + }, + { + "epoch": 0.152981967837092, + "grad_norm": 1.3638159036636353, + "learning_rate": 4.716807712589302e-05, + "loss": 5.1152, + "step": 25723 + }, + { + "epoch": 0.15298791512037302, + "grad_norm": 1.7068208456039429, + "learning_rate": 4.716786118192543e-05, + "loss": 5.1389, + "step": 25724 + }, + { + "epoch": 0.152993862403654, + "grad_norm": 1.8191746473312378, + "learning_rate": 4.716764523021928e-05, + "loss": 5.2305, + "step": 25725 + }, + { + "epoch": 0.152999809686935, + "grad_norm": 1.6970409154891968, + "learning_rate": 4.716742927077462e-05, + "loss": 5.1097, + "step": 25726 + }, + { + "epoch": 0.153005756970216, + "grad_norm": 1.5453951358795166, + "learning_rate": 4.716721330359155e-05, + "loss": 5.2614, + "step": 25727 + }, + { + "epoch": 0.153011704253497, + "grad_norm": 1.5335613489151, + "learning_rate": 4.7166997328670125e-05, + "loss": 4.8482, + "step": 25728 + }, + { + "epoch": 0.153017651536778, + "grad_norm": 1.6566481590270996, + "learning_rate": 4.716678134601044e-05, + "loss": 4.9346, + "step": 25729 + }, + { + "epoch": 0.153023598820059, + "grad_norm": 1.7899013757705688, + "learning_rate": 4.716656535561256e-05, + "loss": 5.0877, + "step": 25730 + }, + { + "epoch": 0.15302954610334, + "grad_norm": 2.1659116744995117, + "learning_rate": 4.716634935747655e-05, + "loss": 4.6431, + "step": 25731 + }, + { + "epoch": 0.15303549338662098, + "grad_norm": 1.914923071861267, + "learning_rate": 4.71661333516025e-05, + "loss": 4.9001, + "step": 25732 + }, + { + "epoch": 0.153041440669902, + "grad_norm": 1.9240248203277588, + "learning_rate": 4.7165917337990495e-05, + "loss": 4.7709, + "step": 25733 + }, + { + "epoch": 0.15304738795318298, + "grad_norm": 1.6446973085403442, + "learning_rate": 4.7165701316640585e-05, + "loss": 4.9816, + "step": 25734 + }, + { + "epoch": 0.15305333523646397, + "grad_norm": 1.7971409559249878, + "learning_rate": 4.716548528755286e-05, + "loss": 5.0082, + "step": 25735 + }, + { + "epoch": 0.153059282519745, + "grad_norm": 1.3862462043762207, + "learning_rate": 4.716526925072739e-05, + "loss": 5.0245, + "step": 25736 + }, + { + "epoch": 0.15306522980302598, + "grad_norm": 2.157005548477173, + "learning_rate": 4.716505320616425e-05, + "loss": 4.573, + "step": 25737 + }, + { + "epoch": 0.15307117708630696, + "grad_norm": 2.4460175037384033, + "learning_rate": 4.716483715386354e-05, + "loss": 4.0872, + "step": 25738 + }, + { + "epoch": 0.15307712436958798, + "grad_norm": 1.7140263319015503, + "learning_rate": 4.7164621093825294e-05, + "loss": 4.5421, + "step": 25739 + }, + { + "epoch": 0.15308307165286897, + "grad_norm": 1.684173583984375, + "learning_rate": 4.7164405026049616e-05, + "loss": 4.5274, + "step": 25740 + }, + { + "epoch": 0.15308901893614996, + "grad_norm": 1.9424148797988892, + "learning_rate": 4.716418895053657e-05, + "loss": 4.2669, + "step": 25741 + }, + { + "epoch": 0.15309496621943097, + "grad_norm": 1.576071858406067, + "learning_rate": 4.716397286728623e-05, + "loss": 4.9536, + "step": 25742 + }, + { + "epoch": 0.15310091350271196, + "grad_norm": 1.8285739421844482, + "learning_rate": 4.7163756776298686e-05, + "loss": 4.9322, + "step": 25743 + }, + { + "epoch": 0.15310686078599295, + "grad_norm": 2.058610200881958, + "learning_rate": 4.7163540677574004e-05, + "loss": 4.4565, + "step": 25744 + }, + { + "epoch": 0.15311280806927396, + "grad_norm": 2.106513261795044, + "learning_rate": 4.716332457111226e-05, + "loss": 4.0534, + "step": 25745 + }, + { + "epoch": 0.15311875535255495, + "grad_norm": 1.821857213973999, + "learning_rate": 4.716310845691351e-05, + "loss": 4.5302, + "step": 25746 + }, + { + "epoch": 0.15312470263583594, + "grad_norm": 1.5679446458816528, + "learning_rate": 4.716289233497787e-05, + "loss": 4.9452, + "step": 25747 + }, + { + "epoch": 0.15313064991911696, + "grad_norm": 1.612362027168274, + "learning_rate": 4.716267620530538e-05, + "loss": 5.0074, + "step": 25748 + }, + { + "epoch": 0.15313659720239794, + "grad_norm": 1.6841483116149902, + "learning_rate": 4.716246006789613e-05, + "loss": 5.0202, + "step": 25749 + }, + { + "epoch": 0.15314254448567893, + "grad_norm": 1.7533215284347534, + "learning_rate": 4.7162243922750196e-05, + "loss": 4.6901, + "step": 25750 + }, + { + "epoch": 0.15314849176895995, + "grad_norm": 2.2937755584716797, + "learning_rate": 4.716202776986766e-05, + "loss": 4.0934, + "step": 25751 + }, + { + "epoch": 0.15315443905224094, + "grad_norm": 2.413012742996216, + "learning_rate": 4.7161811609248576e-05, + "loss": 4.0128, + "step": 25752 + }, + { + "epoch": 0.15316038633552193, + "grad_norm": 2.481255054473877, + "learning_rate": 4.7161595440893035e-05, + "loss": 4.4044, + "step": 25753 + }, + { + "epoch": 0.15316633361880294, + "grad_norm": 1.8999838829040527, + "learning_rate": 4.7161379264801115e-05, + "loss": 4.2328, + "step": 25754 + }, + { + "epoch": 0.15317228090208393, + "grad_norm": 2.3453261852264404, + "learning_rate": 4.7161163080972884e-05, + "loss": 4.283, + "step": 25755 + }, + { + "epoch": 0.15317822818536492, + "grad_norm": 1.6733421087265015, + "learning_rate": 4.716094688940842e-05, + "loss": 4.7254, + "step": 25756 + }, + { + "epoch": 0.15318417546864593, + "grad_norm": 1.5302658081054688, + "learning_rate": 4.7160730690107794e-05, + "loss": 4.9403, + "step": 25757 + }, + { + "epoch": 0.15319012275192692, + "grad_norm": 1.6725687980651855, + "learning_rate": 4.716051448307109e-05, + "loss": 4.699, + "step": 25758 + }, + { + "epoch": 0.1531960700352079, + "grad_norm": 2.067267894744873, + "learning_rate": 4.716029826829839e-05, + "loss": 4.0136, + "step": 25759 + }, + { + "epoch": 0.15320201731848893, + "grad_norm": 2.2834413051605225, + "learning_rate": 4.716008204578975e-05, + "loss": 4.1914, + "step": 25760 + }, + { + "epoch": 0.1532079646017699, + "grad_norm": 1.9917986392974854, + "learning_rate": 4.715986581554524e-05, + "loss": 4.2899, + "step": 25761 + }, + { + "epoch": 0.1532139118850509, + "grad_norm": 1.6681551933288574, + "learning_rate": 4.715964957756497e-05, + "loss": 4.7627, + "step": 25762 + }, + { + "epoch": 0.15321985916833192, + "grad_norm": 2.005560874938965, + "learning_rate": 4.715943333184899e-05, + "loss": 4.1686, + "step": 25763 + }, + { + "epoch": 0.1532258064516129, + "grad_norm": 1.7380902767181396, + "learning_rate": 4.715921707839738e-05, + "loss": 4.4208, + "step": 25764 + }, + { + "epoch": 0.1532317537348939, + "grad_norm": 2.6380422115325928, + "learning_rate": 4.7159000817210205e-05, + "loss": 4.9835, + "step": 25765 + }, + { + "epoch": 0.1532377010181749, + "grad_norm": 2.4079694747924805, + "learning_rate": 4.715878454828757e-05, + "loss": 4.5758, + "step": 25766 + }, + { + "epoch": 0.1532436483014559, + "grad_norm": 1.7469686269760132, + "learning_rate": 4.715856827162952e-05, + "loss": 4.8894, + "step": 25767 + }, + { + "epoch": 0.1532495955847369, + "grad_norm": 1.7569485902786255, + "learning_rate": 4.715835198723615e-05, + "loss": 5.0324, + "step": 25768 + }, + { + "epoch": 0.1532555428680179, + "grad_norm": 1.9182626008987427, + "learning_rate": 4.715813569510752e-05, + "loss": 4.2196, + "step": 25769 + }, + { + "epoch": 0.1532614901512989, + "grad_norm": 1.8836737871170044, + "learning_rate": 4.715791939524372e-05, + "loss": 4.3797, + "step": 25770 + }, + { + "epoch": 0.15326743743457988, + "grad_norm": 1.5073226690292358, + "learning_rate": 4.7157703087644816e-05, + "loss": 4.7137, + "step": 25771 + }, + { + "epoch": 0.1532733847178609, + "grad_norm": 1.764160394668579, + "learning_rate": 4.715748677231089e-05, + "loss": 4.784, + "step": 25772 + }, + { + "epoch": 0.15327933200114188, + "grad_norm": 1.5940345525741577, + "learning_rate": 4.715727044924201e-05, + "loss": 4.7749, + "step": 25773 + }, + { + "epoch": 0.15328527928442287, + "grad_norm": 1.9873480796813965, + "learning_rate": 4.715705411843826e-05, + "loss": 4.7084, + "step": 25774 + }, + { + "epoch": 0.1532912265677039, + "grad_norm": 2.712846279144287, + "learning_rate": 4.715683777989971e-05, + "loss": 4.8726, + "step": 25775 + }, + { + "epoch": 0.15329717385098487, + "grad_norm": 1.9030331373214722, + "learning_rate": 4.7156621433626434e-05, + "loss": 4.6475, + "step": 25776 + }, + { + "epoch": 0.15330312113426586, + "grad_norm": 1.9939697980880737, + "learning_rate": 4.715640507961852e-05, + "loss": 4.8202, + "step": 25777 + }, + { + "epoch": 0.15330906841754685, + "grad_norm": 1.5398924350738525, + "learning_rate": 4.715618871787602e-05, + "loss": 4.9801, + "step": 25778 + }, + { + "epoch": 0.15331501570082787, + "grad_norm": 1.5413012504577637, + "learning_rate": 4.7155972348399034e-05, + "loss": 4.7795, + "step": 25779 + }, + { + "epoch": 0.15332096298410886, + "grad_norm": 1.6835294961929321, + "learning_rate": 4.7155755971187625e-05, + "loss": 4.5937, + "step": 25780 + }, + { + "epoch": 0.15332691026738984, + "grad_norm": 1.4007564783096313, + "learning_rate": 4.715553958624187e-05, + "loss": 5.5904, + "step": 25781 + }, + { + "epoch": 0.15333285755067086, + "grad_norm": 1.6113498210906982, + "learning_rate": 4.715532319356184e-05, + "loss": 5.1083, + "step": 25782 + }, + { + "epoch": 0.15333880483395185, + "grad_norm": 1.9218871593475342, + "learning_rate": 4.715510679314762e-05, + "loss": 4.6371, + "step": 25783 + }, + { + "epoch": 0.15334475211723284, + "grad_norm": 1.4686646461486816, + "learning_rate": 4.715489038499928e-05, + "loss": 5.0536, + "step": 25784 + }, + { + "epoch": 0.15335069940051385, + "grad_norm": 1.5875191688537598, + "learning_rate": 4.71546739691169e-05, + "loss": 4.5976, + "step": 25785 + }, + { + "epoch": 0.15335664668379484, + "grad_norm": 1.5260745286941528, + "learning_rate": 4.7154457545500554e-05, + "loss": 4.6875, + "step": 25786 + }, + { + "epoch": 0.15336259396707583, + "grad_norm": 1.8652924299240112, + "learning_rate": 4.715424111415031e-05, + "loss": 4.846, + "step": 25787 + }, + { + "epoch": 0.15336854125035684, + "grad_norm": 1.3980404138565063, + "learning_rate": 4.715402467506625e-05, + "loss": 5.2552, + "step": 25788 + }, + { + "epoch": 0.15337448853363783, + "grad_norm": 1.6307755708694458, + "learning_rate": 4.715380822824845e-05, + "loss": 5.1316, + "step": 25789 + }, + { + "epoch": 0.15338043581691882, + "grad_norm": 1.9057358503341675, + "learning_rate": 4.715359177369698e-05, + "loss": 4.6232, + "step": 25790 + }, + { + "epoch": 0.15338638310019984, + "grad_norm": 1.260809302330017, + "learning_rate": 4.715337531141193e-05, + "loss": 5.1614, + "step": 25791 + }, + { + "epoch": 0.15339233038348082, + "grad_norm": 2.7115111351013184, + "learning_rate": 4.7153158841393354e-05, + "loss": 3.6292, + "step": 25792 + }, + { + "epoch": 0.1533982776667618, + "grad_norm": 1.296697974205017, + "learning_rate": 4.715294236364135e-05, + "loss": 5.5909, + "step": 25793 + }, + { + "epoch": 0.15340422495004283, + "grad_norm": 1.466179370880127, + "learning_rate": 4.7152725878155975e-05, + "loss": 5.3005, + "step": 25794 + }, + { + "epoch": 0.15341017223332382, + "grad_norm": 1.5478910207748413, + "learning_rate": 4.715250938493732e-05, + "loss": 4.9116, + "step": 25795 + }, + { + "epoch": 0.1534161195166048, + "grad_norm": 1.371853232383728, + "learning_rate": 4.715229288398544e-05, + "loss": 5.2196, + "step": 25796 + }, + { + "epoch": 0.15342206679988582, + "grad_norm": 1.4444376230239868, + "learning_rate": 4.715207637530043e-05, + "loss": 4.9255, + "step": 25797 + }, + { + "epoch": 0.1534280140831668, + "grad_norm": 1.3257986307144165, + "learning_rate": 4.715185985888236e-05, + "loss": 4.9662, + "step": 25798 + }, + { + "epoch": 0.1534339613664478, + "grad_norm": 1.4831913709640503, + "learning_rate": 4.71516433347313e-05, + "loss": 4.9466, + "step": 25799 + }, + { + "epoch": 0.1534399086497288, + "grad_norm": 1.8146830797195435, + "learning_rate": 4.715142680284734e-05, + "loss": 4.711, + "step": 25800 + }, + { + "epoch": 0.1534458559330098, + "grad_norm": 1.73066246509552, + "learning_rate": 4.7151210263230536e-05, + "loss": 4.4107, + "step": 25801 + }, + { + "epoch": 0.1534518032162908, + "grad_norm": 2.014646291732788, + "learning_rate": 4.715099371588098e-05, + "loss": 4.6119, + "step": 25802 + }, + { + "epoch": 0.1534577504995718, + "grad_norm": 2.1739413738250732, + "learning_rate": 4.715077716079874e-05, + "loss": 4.4887, + "step": 25803 + }, + { + "epoch": 0.1534636977828528, + "grad_norm": 1.4722633361816406, + "learning_rate": 4.7150560597983895e-05, + "loss": 5.0312, + "step": 25804 + }, + { + "epoch": 0.15346964506613378, + "grad_norm": 1.654250144958496, + "learning_rate": 4.715034402743651e-05, + "loss": 4.8815, + "step": 25805 + }, + { + "epoch": 0.1534755923494148, + "grad_norm": 1.6598440408706665, + "learning_rate": 4.715012744915668e-05, + "loss": 4.3904, + "step": 25806 + }, + { + "epoch": 0.15348153963269578, + "grad_norm": 1.5754339694976807, + "learning_rate": 4.714991086314445e-05, + "loss": 4.4223, + "step": 25807 + }, + { + "epoch": 0.15348748691597677, + "grad_norm": 1.800657033920288, + "learning_rate": 4.714969426939994e-05, + "loss": 4.5314, + "step": 25808 + }, + { + "epoch": 0.1534934341992578, + "grad_norm": 1.8917250633239746, + "learning_rate": 4.714947766792318e-05, + "loss": 4.4049, + "step": 25809 + }, + { + "epoch": 0.15349938148253878, + "grad_norm": 1.9953207969665527, + "learning_rate": 4.714926105871428e-05, + "loss": 4.3155, + "step": 25810 + }, + { + "epoch": 0.15350532876581977, + "grad_norm": 1.7314120531082153, + "learning_rate": 4.714904444177329e-05, + "loss": 4.3324, + "step": 25811 + }, + { + "epoch": 0.15351127604910078, + "grad_norm": 1.577124834060669, + "learning_rate": 4.7148827817100306e-05, + "loss": 4.6899, + "step": 25812 + }, + { + "epoch": 0.15351722333238177, + "grad_norm": 1.6661646366119385, + "learning_rate": 4.714861118469539e-05, + "loss": 4.9735, + "step": 25813 + }, + { + "epoch": 0.15352317061566276, + "grad_norm": 1.8606276512145996, + "learning_rate": 4.714839454455863e-05, + "loss": 5.2351, + "step": 25814 + }, + { + "epoch": 0.15352911789894377, + "grad_norm": 2.0107643604278564, + "learning_rate": 4.7148177896690085e-05, + "loss": 4.4152, + "step": 25815 + }, + { + "epoch": 0.15353506518222476, + "grad_norm": 1.6447992324829102, + "learning_rate": 4.7147961241089846e-05, + "loss": 4.5391, + "step": 25816 + }, + { + "epoch": 0.15354101246550575, + "grad_norm": 1.6666457653045654, + "learning_rate": 4.714774457775798e-05, + "loss": 4.5104, + "step": 25817 + }, + { + "epoch": 0.15354695974878677, + "grad_norm": 1.7214492559432983, + "learning_rate": 4.714752790669457e-05, + "loss": 5.0634, + "step": 25818 + }, + { + "epoch": 0.15355290703206775, + "grad_norm": 1.5697379112243652, + "learning_rate": 4.714731122789968e-05, + "loss": 4.8279, + "step": 25819 + }, + { + "epoch": 0.15355885431534874, + "grad_norm": 2.531752109527588, + "learning_rate": 4.7147094541373395e-05, + "loss": 3.9172, + "step": 25820 + }, + { + "epoch": 0.15356480159862976, + "grad_norm": 1.5037142038345337, + "learning_rate": 4.714687784711579e-05, + "loss": 4.7534, + "step": 25821 + }, + { + "epoch": 0.15357074888191075, + "grad_norm": 1.5798907279968262, + "learning_rate": 4.714666114512693e-05, + "loss": 4.6779, + "step": 25822 + }, + { + "epoch": 0.15357669616519173, + "grad_norm": 1.5223065614700317, + "learning_rate": 4.714644443540691e-05, + "loss": 4.8612, + "step": 25823 + }, + { + "epoch": 0.15358264344847275, + "grad_norm": 1.7736209630966187, + "learning_rate": 4.714622771795579e-05, + "loss": 4.9765, + "step": 25824 + }, + { + "epoch": 0.15358859073175374, + "grad_norm": 1.5920718908309937, + "learning_rate": 4.714601099277365e-05, + "loss": 5.2479, + "step": 25825 + }, + { + "epoch": 0.15359453801503473, + "grad_norm": 1.7325233221054077, + "learning_rate": 4.7145794259860576e-05, + "loss": 4.9202, + "step": 25826 + }, + { + "epoch": 0.15360048529831574, + "grad_norm": 1.6514594554901123, + "learning_rate": 4.714557751921662e-05, + "loss": 4.9212, + "step": 25827 + }, + { + "epoch": 0.15360643258159673, + "grad_norm": 1.731692910194397, + "learning_rate": 4.714536077084188e-05, + "loss": 4.8916, + "step": 25828 + }, + { + "epoch": 0.15361237986487772, + "grad_norm": 1.7444603443145752, + "learning_rate": 4.714514401473642e-05, + "loss": 4.4659, + "step": 25829 + }, + { + "epoch": 0.15361832714815873, + "grad_norm": 1.7847130298614502, + "learning_rate": 4.714492725090033e-05, + "loss": 4.3516, + "step": 25830 + }, + { + "epoch": 0.15362427443143972, + "grad_norm": 1.6140960454940796, + "learning_rate": 4.714471047933366e-05, + "loss": 4.3894, + "step": 25831 + }, + { + "epoch": 0.1536302217147207, + "grad_norm": 1.5573277473449707, + "learning_rate": 4.714449370003651e-05, + "loss": 5.0749, + "step": 25832 + }, + { + "epoch": 0.15363616899800173, + "grad_norm": 1.7352724075317383, + "learning_rate": 4.7144276913008936e-05, + "loss": 4.6311, + "step": 25833 + }, + { + "epoch": 0.15364211628128271, + "grad_norm": 2.1136815547943115, + "learning_rate": 4.714406011825103e-05, + "loss": 3.9239, + "step": 25834 + }, + { + "epoch": 0.1536480635645637, + "grad_norm": 1.5329402685165405, + "learning_rate": 4.7143843315762856e-05, + "loss": 5.0124, + "step": 25835 + }, + { + "epoch": 0.1536540108478447, + "grad_norm": 1.6305334568023682, + "learning_rate": 4.7143626505544504e-05, + "loss": 5.3047, + "step": 25836 + }, + { + "epoch": 0.1536599581311257, + "grad_norm": 1.6582584381103516, + "learning_rate": 4.714340968759604e-05, + "loss": 4.909, + "step": 25837 + }, + { + "epoch": 0.1536659054144067, + "grad_norm": 1.581274151802063, + "learning_rate": 4.7143192861917536e-05, + "loss": 4.8241, + "step": 25838 + }, + { + "epoch": 0.15367185269768768, + "grad_norm": 1.6180393695831299, + "learning_rate": 4.7142976028509076e-05, + "loss": 4.6608, + "step": 25839 + }, + { + "epoch": 0.1536777999809687, + "grad_norm": 1.8333182334899902, + "learning_rate": 4.714275918737073e-05, + "loss": 5.3005, + "step": 25840 + }, + { + "epoch": 0.1536837472642497, + "grad_norm": 1.6652151346206665, + "learning_rate": 4.714254233850257e-05, + "loss": 4.5989, + "step": 25841 + }, + { + "epoch": 0.15368969454753068, + "grad_norm": 1.7609338760375977, + "learning_rate": 4.714232548190468e-05, + "loss": 5.2105, + "step": 25842 + }, + { + "epoch": 0.1536956418308117, + "grad_norm": 1.6076292991638184, + "learning_rate": 4.714210861757714e-05, + "loss": 5.32, + "step": 25843 + }, + { + "epoch": 0.15370158911409268, + "grad_norm": 1.6114000082015991, + "learning_rate": 4.7141891745520005e-05, + "loss": 5.1365, + "step": 25844 + }, + { + "epoch": 0.15370753639737367, + "grad_norm": 1.9237120151519775, + "learning_rate": 4.714167486573337e-05, + "loss": 4.8821, + "step": 25845 + }, + { + "epoch": 0.15371348368065468, + "grad_norm": 1.7089736461639404, + "learning_rate": 4.7141457978217315e-05, + "loss": 4.8468, + "step": 25846 + }, + { + "epoch": 0.15371943096393567, + "grad_norm": 1.6240943670272827, + "learning_rate": 4.71412410829719e-05, + "loss": 4.9153, + "step": 25847 + }, + { + "epoch": 0.15372537824721666, + "grad_norm": 1.4397730827331543, + "learning_rate": 4.7141024179997205e-05, + "loss": 5.0853, + "step": 25848 + }, + { + "epoch": 0.15373132553049768, + "grad_norm": 1.6480834484100342, + "learning_rate": 4.714080726929331e-05, + "loss": 4.6492, + "step": 25849 + }, + { + "epoch": 0.15373727281377866, + "grad_norm": 1.702221155166626, + "learning_rate": 4.714059035086028e-05, + "loss": 4.5677, + "step": 25850 + }, + { + "epoch": 0.15374322009705965, + "grad_norm": 1.5285601615905762, + "learning_rate": 4.7140373424698206e-05, + "loss": 4.621, + "step": 25851 + }, + { + "epoch": 0.15374916738034067, + "grad_norm": 2.0238354206085205, + "learning_rate": 4.7140156490807156e-05, + "loss": 4.6883, + "step": 25852 + }, + { + "epoch": 0.15375511466362166, + "grad_norm": 2.392547845840454, + "learning_rate": 4.713993954918721e-05, + "loss": 4.7537, + "step": 25853 + }, + { + "epoch": 0.15376106194690264, + "grad_norm": 2.639981746673584, + "learning_rate": 4.713972259983843e-05, + "loss": 3.958, + "step": 25854 + }, + { + "epoch": 0.15376700923018366, + "grad_norm": 2.11757755279541, + "learning_rate": 4.713950564276091e-05, + "loss": 5.0082, + "step": 25855 + }, + { + "epoch": 0.15377295651346465, + "grad_norm": 2.032003879547119, + "learning_rate": 4.713928867795471e-05, + "loss": 4.9212, + "step": 25856 + }, + { + "epoch": 0.15377890379674564, + "grad_norm": 1.7791013717651367, + "learning_rate": 4.713907170541991e-05, + "loss": 4.925, + "step": 25857 + }, + { + "epoch": 0.15378485108002665, + "grad_norm": 1.8376729488372803, + "learning_rate": 4.71388547251566e-05, + "loss": 5.1545, + "step": 25858 + }, + { + "epoch": 0.15379079836330764, + "grad_norm": 1.7532944679260254, + "learning_rate": 4.7138637737164836e-05, + "loss": 5.1329, + "step": 25859 + }, + { + "epoch": 0.15379674564658863, + "grad_norm": 2.4505176544189453, + "learning_rate": 4.7138420741444704e-05, + "loss": 4.8803, + "step": 25860 + }, + { + "epoch": 0.15380269292986964, + "grad_norm": 2.4481520652770996, + "learning_rate": 4.7138203737996283e-05, + "loss": 4.9071, + "step": 25861 + }, + { + "epoch": 0.15380864021315063, + "grad_norm": 1.805619716644287, + "learning_rate": 4.7137986726819636e-05, + "loss": 4.9145, + "step": 25862 + }, + { + "epoch": 0.15381458749643162, + "grad_norm": 1.353178858757019, + "learning_rate": 4.7137769707914856e-05, + "loss": 4.8159, + "step": 25863 + }, + { + "epoch": 0.15382053477971264, + "grad_norm": 2.1220030784606934, + "learning_rate": 4.7137552681282006e-05, + "loss": 4.7573, + "step": 25864 + }, + { + "epoch": 0.15382648206299362, + "grad_norm": 1.7052141427993774, + "learning_rate": 4.713733564692116e-05, + "loss": 5.0372, + "step": 25865 + }, + { + "epoch": 0.1538324293462746, + "grad_norm": 1.5306216478347778, + "learning_rate": 4.71371186048324e-05, + "loss": 5.0694, + "step": 25866 + }, + { + "epoch": 0.15383837662955563, + "grad_norm": 1.5422348976135254, + "learning_rate": 4.713690155501581e-05, + "loss": 5.1864, + "step": 25867 + }, + { + "epoch": 0.15384432391283662, + "grad_norm": 1.5703792572021484, + "learning_rate": 4.7136684497471444e-05, + "loss": 5.1686, + "step": 25868 + }, + { + "epoch": 0.1538502711961176, + "grad_norm": 1.6716407537460327, + "learning_rate": 4.7136467432199396e-05, + "loss": 5.2515, + "step": 25869 + }, + { + "epoch": 0.15385621847939862, + "grad_norm": 1.5796306133270264, + "learning_rate": 4.713625035919974e-05, + "loss": 5.0068, + "step": 25870 + }, + { + "epoch": 0.1538621657626796, + "grad_norm": 1.6445972919464111, + "learning_rate": 4.713603327847254e-05, + "loss": 4.9683, + "step": 25871 + }, + { + "epoch": 0.1538681130459606, + "grad_norm": 1.588665246963501, + "learning_rate": 4.713581619001788e-05, + "loss": 4.9913, + "step": 25872 + }, + { + "epoch": 0.1538740603292416, + "grad_norm": 1.5067355632781982, + "learning_rate": 4.713559909383584e-05, + "loss": 5.1648, + "step": 25873 + }, + { + "epoch": 0.1538800076125226, + "grad_norm": 1.6328977346420288, + "learning_rate": 4.713538198992649e-05, + "loss": 4.9316, + "step": 25874 + }, + { + "epoch": 0.1538859548958036, + "grad_norm": 1.6389905214309692, + "learning_rate": 4.7135164878289903e-05, + "loss": 5.1095, + "step": 25875 + }, + { + "epoch": 0.1538919021790846, + "grad_norm": 1.5004593133926392, + "learning_rate": 4.713494775892616e-05, + "loss": 4.8718, + "step": 25876 + }, + { + "epoch": 0.1538978494623656, + "grad_norm": 1.7928706407546997, + "learning_rate": 4.713473063183534e-05, + "loss": 5.1074, + "step": 25877 + }, + { + "epoch": 0.15390379674564658, + "grad_norm": 1.4132859706878662, + "learning_rate": 4.713451349701751e-05, + "loss": 5.2395, + "step": 25878 + }, + { + "epoch": 0.1539097440289276, + "grad_norm": 1.7291496992111206, + "learning_rate": 4.7134296354472754e-05, + "loss": 5.2648, + "step": 25879 + }, + { + "epoch": 0.15391569131220859, + "grad_norm": 1.6724679470062256, + "learning_rate": 4.713407920420114e-05, + "loss": 5.2074, + "step": 25880 + }, + { + "epoch": 0.15392163859548957, + "grad_norm": 1.5899326801300049, + "learning_rate": 4.713386204620275e-05, + "loss": 5.0018, + "step": 25881 + }, + { + "epoch": 0.1539275858787706, + "grad_norm": 1.5092980861663818, + "learning_rate": 4.7133644880477656e-05, + "loss": 5.2861, + "step": 25882 + }, + { + "epoch": 0.15393353316205158, + "grad_norm": 1.5518758296966553, + "learning_rate": 4.7133427707025935e-05, + "loss": 5.2302, + "step": 25883 + }, + { + "epoch": 0.15393948044533257, + "grad_norm": 1.8629082441329956, + "learning_rate": 4.713321052584766e-05, + "loss": 4.8252, + "step": 25884 + }, + { + "epoch": 0.15394542772861358, + "grad_norm": 1.618132472038269, + "learning_rate": 4.713299333694291e-05, + "loss": 5.0853, + "step": 25885 + }, + { + "epoch": 0.15395137501189457, + "grad_norm": 1.494831919670105, + "learning_rate": 4.713277614031177e-05, + "loss": 5.1517, + "step": 25886 + }, + { + "epoch": 0.15395732229517556, + "grad_norm": 1.6972736120224, + "learning_rate": 4.71325589359543e-05, + "loss": 5.3104, + "step": 25887 + }, + { + "epoch": 0.15396326957845657, + "grad_norm": 1.8251672983169556, + "learning_rate": 4.713234172387058e-05, + "loss": 5.0705, + "step": 25888 + }, + { + "epoch": 0.15396921686173756, + "grad_norm": 1.4835257530212402, + "learning_rate": 4.7132124504060696e-05, + "loss": 4.5481, + "step": 25889 + }, + { + "epoch": 0.15397516414501855, + "grad_norm": 1.447768211364746, + "learning_rate": 4.713190727652471e-05, + "loss": 4.7023, + "step": 25890 + }, + { + "epoch": 0.15398111142829957, + "grad_norm": 1.581663727760315, + "learning_rate": 4.71316900412627e-05, + "loss": 4.5446, + "step": 25891 + }, + { + "epoch": 0.15398705871158055, + "grad_norm": 1.5457055568695068, + "learning_rate": 4.7131472798274754e-05, + "loss": 4.8265, + "step": 25892 + }, + { + "epoch": 0.15399300599486154, + "grad_norm": 1.5043967962265015, + "learning_rate": 4.713125554756093e-05, + "loss": 5.2398, + "step": 25893 + }, + { + "epoch": 0.15399895327814253, + "grad_norm": 1.3700400590896606, + "learning_rate": 4.7131038289121324e-05, + "loss": 4.9516, + "step": 25894 + }, + { + "epoch": 0.15400490056142355, + "grad_norm": 1.4897541999816895, + "learning_rate": 4.713082102295599e-05, + "loss": 4.9884, + "step": 25895 + }, + { + "epoch": 0.15401084784470453, + "grad_norm": 1.560887098312378, + "learning_rate": 4.713060374906503e-05, + "loss": 4.8639, + "step": 25896 + }, + { + "epoch": 0.15401679512798552, + "grad_norm": 1.542069911956787, + "learning_rate": 4.7130386467448495e-05, + "loss": 4.7692, + "step": 25897 + }, + { + "epoch": 0.15402274241126654, + "grad_norm": 1.7924245595932007, + "learning_rate": 4.7130169178106465e-05, + "loss": 4.6172, + "step": 25898 + }, + { + "epoch": 0.15402868969454753, + "grad_norm": 1.4520066976547241, + "learning_rate": 4.7129951881039033e-05, + "loss": 4.9518, + "step": 25899 + }, + { + "epoch": 0.15403463697782852, + "grad_norm": 1.4653339385986328, + "learning_rate": 4.7129734576246255e-05, + "loss": 5.0738, + "step": 25900 + }, + { + "epoch": 0.15404058426110953, + "grad_norm": 1.2604494094848633, + "learning_rate": 4.7129517263728224e-05, + "loss": 5.0677, + "step": 25901 + }, + { + "epoch": 0.15404653154439052, + "grad_norm": 1.4956402778625488, + "learning_rate": 4.7129299943485e-05, + "loss": 5.0547, + "step": 25902 + }, + { + "epoch": 0.1540524788276715, + "grad_norm": 1.3395041227340698, + "learning_rate": 4.712908261551667e-05, + "loss": 4.9042, + "step": 25903 + }, + { + "epoch": 0.15405842611095252, + "grad_norm": 1.4592647552490234, + "learning_rate": 4.7128865279823304e-05, + "loss": 4.8363, + "step": 25904 + }, + { + "epoch": 0.1540643733942335, + "grad_norm": 1.339340329170227, + "learning_rate": 4.712864793640498e-05, + "loss": 4.8916, + "step": 25905 + }, + { + "epoch": 0.1540703206775145, + "grad_norm": 1.5001643896102905, + "learning_rate": 4.7128430585261775e-05, + "loss": 5.1015, + "step": 25906 + }, + { + "epoch": 0.15407626796079552, + "grad_norm": 1.3876299858093262, + "learning_rate": 4.7128213226393756e-05, + "loss": 5.0368, + "step": 25907 + }, + { + "epoch": 0.1540822152440765, + "grad_norm": 1.4904955625534058, + "learning_rate": 4.712799585980101e-05, + "loss": 5.0785, + "step": 25908 + }, + { + "epoch": 0.1540881625273575, + "grad_norm": 1.4284460544586182, + "learning_rate": 4.712777848548362e-05, + "loss": 5.0015, + "step": 25909 + }, + { + "epoch": 0.1540941098106385, + "grad_norm": 1.4823048114776611, + "learning_rate": 4.712756110344164e-05, + "loss": 4.9969, + "step": 25910 + }, + { + "epoch": 0.1541000570939195, + "grad_norm": 1.5989056825637817, + "learning_rate": 4.712734371367516e-05, + "loss": 5.4401, + "step": 25911 + }, + { + "epoch": 0.15410600437720048, + "grad_norm": 1.475415587425232, + "learning_rate": 4.7127126316184256e-05, + "loss": 5.3553, + "step": 25912 + }, + { + "epoch": 0.1541119516604815, + "grad_norm": 1.3556677103042603, + "learning_rate": 4.712690891096899e-05, + "loss": 5.4228, + "step": 25913 + }, + { + "epoch": 0.1541178989437625, + "grad_norm": 1.4386837482452393, + "learning_rate": 4.712669149802946e-05, + "loss": 5.387, + "step": 25914 + }, + { + "epoch": 0.15412384622704348, + "grad_norm": 1.4365500211715698, + "learning_rate": 4.712647407736573e-05, + "loss": 4.8597, + "step": 25915 + }, + { + "epoch": 0.1541297935103245, + "grad_norm": 1.5703059434890747, + "learning_rate": 4.712625664897788e-05, + "loss": 5.2659, + "step": 25916 + }, + { + "epoch": 0.15413574079360548, + "grad_norm": 1.5057390928268433, + "learning_rate": 4.712603921286597e-05, + "loss": 4.9931, + "step": 25917 + }, + { + "epoch": 0.15414168807688647, + "grad_norm": 1.2982683181762695, + "learning_rate": 4.712582176903009e-05, + "loss": 5.5226, + "step": 25918 + }, + { + "epoch": 0.15414763536016748, + "grad_norm": 1.4120944738388062, + "learning_rate": 4.712560431747032e-05, + "loss": 5.4037, + "step": 25919 + }, + { + "epoch": 0.15415358264344847, + "grad_norm": 1.3634661436080933, + "learning_rate": 4.712538685818673e-05, + "loss": 5.521, + "step": 25920 + }, + { + "epoch": 0.15415952992672946, + "grad_norm": 1.3352160453796387, + "learning_rate": 4.7125169391179394e-05, + "loss": 5.2938, + "step": 25921 + }, + { + "epoch": 0.15416547721001048, + "grad_norm": 1.3874114751815796, + "learning_rate": 4.712495191644839e-05, + "loss": 5.272, + "step": 25922 + }, + { + "epoch": 0.15417142449329146, + "grad_norm": 1.5225552320480347, + "learning_rate": 4.712473443399379e-05, + "loss": 5.3211, + "step": 25923 + }, + { + "epoch": 0.15417737177657245, + "grad_norm": 1.4493452310562134, + "learning_rate": 4.712451694381568e-05, + "loss": 5.2799, + "step": 25924 + }, + { + "epoch": 0.15418331905985347, + "grad_norm": 1.3240947723388672, + "learning_rate": 4.712429944591413e-05, + "loss": 5.441, + "step": 25925 + }, + { + "epoch": 0.15418926634313446, + "grad_norm": 1.2881836891174316, + "learning_rate": 4.712408194028921e-05, + "loss": 5.4478, + "step": 25926 + }, + { + "epoch": 0.15419521362641544, + "grad_norm": 1.4163159132003784, + "learning_rate": 4.712386442694101e-05, + "loss": 5.252, + "step": 25927 + }, + { + "epoch": 0.15420116090969646, + "grad_norm": 1.4597609043121338, + "learning_rate": 4.712364690586959e-05, + "loss": 5.4359, + "step": 25928 + }, + { + "epoch": 0.15420710819297745, + "grad_norm": 1.31305992603302, + "learning_rate": 4.7123429377075036e-05, + "loss": 5.3141, + "step": 25929 + }, + { + "epoch": 0.15421305547625844, + "grad_norm": 1.1765657663345337, + "learning_rate": 4.712321184055742e-05, + "loss": 5.1828, + "step": 25930 + }, + { + "epoch": 0.15421900275953945, + "grad_norm": 1.3116487264633179, + "learning_rate": 4.7122994296316824e-05, + "loss": 5.4107, + "step": 25931 + }, + { + "epoch": 0.15422495004282044, + "grad_norm": 1.3636351823806763, + "learning_rate": 4.712277674435331e-05, + "loss": 5.3273, + "step": 25932 + }, + { + "epoch": 0.15423089732610143, + "grad_norm": 1.4326391220092773, + "learning_rate": 4.712255918466697e-05, + "loss": 5.4123, + "step": 25933 + }, + { + "epoch": 0.15423684460938245, + "grad_norm": 1.3996350765228271, + "learning_rate": 4.712234161725788e-05, + "loss": 5.3111, + "step": 25934 + }, + { + "epoch": 0.15424279189266343, + "grad_norm": 1.5358290672302246, + "learning_rate": 4.712212404212609e-05, + "loss": 5.4522, + "step": 25935 + }, + { + "epoch": 0.15424873917594442, + "grad_norm": 1.3900970220565796, + "learning_rate": 4.7121906459271716e-05, + "loss": 5.6671, + "step": 25936 + }, + { + "epoch": 0.15425468645922544, + "grad_norm": 1.5113252401351929, + "learning_rate": 4.71216888686948e-05, + "loss": 5.0736, + "step": 25937 + }, + { + "epoch": 0.15426063374250643, + "grad_norm": 1.434477686882019, + "learning_rate": 4.7121471270395434e-05, + "loss": 5.259, + "step": 25938 + }, + { + "epoch": 0.1542665810257874, + "grad_norm": 1.4467335939407349, + "learning_rate": 4.712125366437369e-05, + "loss": 5.3382, + "step": 25939 + }, + { + "epoch": 0.15427252830906843, + "grad_norm": 1.6080671548843384, + "learning_rate": 4.712103605062965e-05, + "loss": 5.1767, + "step": 25940 + }, + { + "epoch": 0.15427847559234942, + "grad_norm": 1.497689962387085, + "learning_rate": 4.712081842916338e-05, + "loss": 4.884, + "step": 25941 + }, + { + "epoch": 0.1542844228756304, + "grad_norm": 1.691441535949707, + "learning_rate": 4.712060079997496e-05, + "loss": 5.2065, + "step": 25942 + }, + { + "epoch": 0.15429037015891142, + "grad_norm": 1.4759876728057861, + "learning_rate": 4.712038316306447e-05, + "loss": 5.17, + "step": 25943 + }, + { + "epoch": 0.1542963174421924, + "grad_norm": 1.4109833240509033, + "learning_rate": 4.712016551843198e-05, + "loss": 5.1986, + "step": 25944 + }, + { + "epoch": 0.1543022647254734, + "grad_norm": 1.4481924772262573, + "learning_rate": 4.7119947866077566e-05, + "loss": 4.9301, + "step": 25945 + }, + { + "epoch": 0.15430821200875441, + "grad_norm": 1.4721769094467163, + "learning_rate": 4.711973020600131e-05, + "loss": 5.123, + "step": 25946 + }, + { + "epoch": 0.1543141592920354, + "grad_norm": 1.6822638511657715, + "learning_rate": 4.711951253820329e-05, + "loss": 5.122, + "step": 25947 + }, + { + "epoch": 0.1543201065753164, + "grad_norm": 1.6047651767730713, + "learning_rate": 4.711929486268357e-05, + "loss": 5.1417, + "step": 25948 + }, + { + "epoch": 0.1543260538585974, + "grad_norm": 1.4773536920547485, + "learning_rate": 4.711907717944224e-05, + "loss": 4.9562, + "step": 25949 + }, + { + "epoch": 0.1543320011418784, + "grad_norm": 1.4373167753219604, + "learning_rate": 4.711885948847936e-05, + "loss": 5.3515, + "step": 25950 + }, + { + "epoch": 0.15433794842515938, + "grad_norm": 1.4517033100128174, + "learning_rate": 4.711864178979501e-05, + "loss": 5.0668, + "step": 25951 + }, + { + "epoch": 0.15434389570844037, + "grad_norm": 1.7582489252090454, + "learning_rate": 4.711842408338929e-05, + "loss": 4.7104, + "step": 25952 + }, + { + "epoch": 0.1543498429917214, + "grad_norm": 1.6162217855453491, + "learning_rate": 4.711820636926224e-05, + "loss": 4.7747, + "step": 25953 + }, + { + "epoch": 0.15435579027500237, + "grad_norm": 1.7326339483261108, + "learning_rate": 4.711798864741396e-05, + "loss": 4.818, + "step": 25954 + }, + { + "epoch": 0.15436173755828336, + "grad_norm": 1.642146110534668, + "learning_rate": 4.711777091784452e-05, + "loss": 4.7517, + "step": 25955 + }, + { + "epoch": 0.15436768484156438, + "grad_norm": 1.5122802257537842, + "learning_rate": 4.711755318055399e-05, + "loss": 5.0139, + "step": 25956 + }, + { + "epoch": 0.15437363212484537, + "grad_norm": 1.7299772500991821, + "learning_rate": 4.711733543554245e-05, + "loss": 4.9988, + "step": 25957 + }, + { + "epoch": 0.15437957940812636, + "grad_norm": 1.5812711715698242, + "learning_rate": 4.711711768280998e-05, + "loss": 4.7134, + "step": 25958 + }, + { + "epoch": 0.15438552669140737, + "grad_norm": 1.5953545570373535, + "learning_rate": 4.711689992235665e-05, + "loss": 4.9644, + "step": 25959 + }, + { + "epoch": 0.15439147397468836, + "grad_norm": 1.7964719533920288, + "learning_rate": 4.711668215418255e-05, + "loss": 4.8476, + "step": 25960 + }, + { + "epoch": 0.15439742125796935, + "grad_norm": 1.6458512544631958, + "learning_rate": 4.711646437828773e-05, + "loss": 4.8117, + "step": 25961 + }, + { + "epoch": 0.15440336854125036, + "grad_norm": 1.4821311235427856, + "learning_rate": 4.711624659467229e-05, + "loss": 4.8647, + "step": 25962 + }, + { + "epoch": 0.15440931582453135, + "grad_norm": 1.4640769958496094, + "learning_rate": 4.711602880333629e-05, + "loss": 5.0038, + "step": 25963 + }, + { + "epoch": 0.15441526310781234, + "grad_norm": 1.7705153226852417, + "learning_rate": 4.711581100427981e-05, + "loss": 5.12, + "step": 25964 + }, + { + "epoch": 0.15442121039109336, + "grad_norm": 1.7333801984786987, + "learning_rate": 4.711559319750294e-05, + "loss": 4.9785, + "step": 25965 + }, + { + "epoch": 0.15442715767437434, + "grad_norm": 1.6170109510421753, + "learning_rate": 4.711537538300574e-05, + "loss": 4.9764, + "step": 25966 + }, + { + "epoch": 0.15443310495765533, + "grad_norm": 1.4895650148391724, + "learning_rate": 4.7115157560788295e-05, + "loss": 4.5585, + "step": 25967 + }, + { + "epoch": 0.15443905224093635, + "grad_norm": 1.6678147315979004, + "learning_rate": 4.711493973085067e-05, + "loss": 4.7897, + "step": 25968 + }, + { + "epoch": 0.15444499952421734, + "grad_norm": 1.537511944770813, + "learning_rate": 4.7114721893192945e-05, + "loss": 4.8845, + "step": 25969 + }, + { + "epoch": 0.15445094680749832, + "grad_norm": 1.7167041301727295, + "learning_rate": 4.711450404781521e-05, + "loss": 4.9126, + "step": 25970 + }, + { + "epoch": 0.15445689409077934, + "grad_norm": 1.763170599937439, + "learning_rate": 4.711428619471752e-05, + "loss": 4.6864, + "step": 25971 + }, + { + "epoch": 0.15446284137406033, + "grad_norm": 1.4620569944381714, + "learning_rate": 4.7114068333899964e-05, + "loss": 4.744, + "step": 25972 + }, + { + "epoch": 0.15446878865734132, + "grad_norm": 1.6106908321380615, + "learning_rate": 4.711385046536262e-05, + "loss": 5.2037, + "step": 25973 + }, + { + "epoch": 0.15447473594062233, + "grad_norm": 2.173444986343384, + "learning_rate": 4.711363258910556e-05, + "loss": 4.8086, + "step": 25974 + }, + { + "epoch": 0.15448068322390332, + "grad_norm": 2.0350496768951416, + "learning_rate": 4.711341470512885e-05, + "loss": 4.7291, + "step": 25975 + }, + { + "epoch": 0.1544866305071843, + "grad_norm": 1.9148650169372559, + "learning_rate": 4.7113196813432584e-05, + "loss": 4.7627, + "step": 25976 + }, + { + "epoch": 0.15449257779046532, + "grad_norm": 1.9944121837615967, + "learning_rate": 4.711297891401683e-05, + "loss": 4.8124, + "step": 25977 + }, + { + "epoch": 0.1544985250737463, + "grad_norm": 1.515162706375122, + "learning_rate": 4.7112761006881655e-05, + "loss": 4.8781, + "step": 25978 + }, + { + "epoch": 0.1545044723570273, + "grad_norm": 1.7549412250518799, + "learning_rate": 4.711254309202715e-05, + "loss": 4.9173, + "step": 25979 + }, + { + "epoch": 0.15451041964030832, + "grad_norm": 1.5914033651351929, + "learning_rate": 4.711232516945338e-05, + "loss": 5.012, + "step": 25980 + }, + { + "epoch": 0.1545163669235893, + "grad_norm": 1.7436847686767578, + "learning_rate": 4.711210723916043e-05, + "loss": 4.4552, + "step": 25981 + }, + { + "epoch": 0.1545223142068703, + "grad_norm": 1.5679067373275757, + "learning_rate": 4.711188930114837e-05, + "loss": 4.9158, + "step": 25982 + }, + { + "epoch": 0.1545282614901513, + "grad_norm": 1.5164258480072021, + "learning_rate": 4.711167135541727e-05, + "loss": 4.2524, + "step": 25983 + }, + { + "epoch": 0.1545342087734323, + "grad_norm": 1.7215555906295776, + "learning_rate": 4.711145340196723e-05, + "loss": 4.4035, + "step": 25984 + }, + { + "epoch": 0.15454015605671328, + "grad_norm": 1.8671064376831055, + "learning_rate": 4.7111235440798303e-05, + "loss": 4.6875, + "step": 25985 + }, + { + "epoch": 0.1545461033399943, + "grad_norm": 1.760772705078125, + "learning_rate": 4.7111017471910566e-05, + "loss": 4.7645, + "step": 25986 + }, + { + "epoch": 0.1545520506232753, + "grad_norm": 1.8126411437988281, + "learning_rate": 4.7110799495304115e-05, + "loss": 5.1524, + "step": 25987 + }, + { + "epoch": 0.15455799790655628, + "grad_norm": 1.6593974828720093, + "learning_rate": 4.7110581510979e-05, + "loss": 5.1902, + "step": 25988 + }, + { + "epoch": 0.1545639451898373, + "grad_norm": 1.721921443939209, + "learning_rate": 4.711036351893532e-05, + "loss": 5.0316, + "step": 25989 + }, + { + "epoch": 0.15456989247311828, + "grad_norm": 2.030829668045044, + "learning_rate": 4.7110145519173135e-05, + "loss": 4.9087, + "step": 25990 + }, + { + "epoch": 0.15457583975639927, + "grad_norm": 1.6568117141723633, + "learning_rate": 4.710992751169252e-05, + "loss": 4.3814, + "step": 25991 + }, + { + "epoch": 0.15458178703968029, + "grad_norm": 1.667718768119812, + "learning_rate": 4.7109709496493565e-05, + "loss": 4.8191, + "step": 25992 + }, + { + "epoch": 0.15458773432296127, + "grad_norm": 1.6483817100524902, + "learning_rate": 4.710949147357634e-05, + "loss": 5.055, + "step": 25993 + }, + { + "epoch": 0.15459368160624226, + "grad_norm": 1.703580617904663, + "learning_rate": 4.710927344294092e-05, + "loss": 5.0259, + "step": 25994 + }, + { + "epoch": 0.15459962888952328, + "grad_norm": 1.512531042098999, + "learning_rate": 4.710905540458737e-05, + "loss": 5.1221, + "step": 25995 + }, + { + "epoch": 0.15460557617280427, + "grad_norm": 1.4010028839111328, + "learning_rate": 4.710883735851579e-05, + "loss": 5.2263, + "step": 25996 + }, + { + "epoch": 0.15461152345608525, + "grad_norm": 1.694629192352295, + "learning_rate": 4.710861930472624e-05, + "loss": 4.9348, + "step": 25997 + }, + { + "epoch": 0.15461747073936627, + "grad_norm": 1.5974243879318237, + "learning_rate": 4.710840124321879e-05, + "loss": 5.1262, + "step": 25998 + }, + { + "epoch": 0.15462341802264726, + "grad_norm": 1.6333894729614258, + "learning_rate": 4.7108183173993535e-05, + "loss": 4.6557, + "step": 25999 + }, + { + "epoch": 0.15462936530592825, + "grad_norm": 1.660767674446106, + "learning_rate": 4.710796509705054e-05, + "loss": 4.9764, + "step": 26000 + }, + { + "epoch": 0.15463531258920926, + "grad_norm": 1.5514689683914185, + "learning_rate": 4.710774701238989e-05, + "loss": 4.8895, + "step": 26001 + }, + { + "epoch": 0.15464125987249025, + "grad_norm": 1.7753626108169556, + "learning_rate": 4.7107528920011645e-05, + "loss": 5.1251, + "step": 26002 + }, + { + "epoch": 0.15464720715577124, + "grad_norm": 1.5963994264602661, + "learning_rate": 4.7107310819915895e-05, + "loss": 4.9678, + "step": 26003 + }, + { + "epoch": 0.15465315443905225, + "grad_norm": 1.7098819017410278, + "learning_rate": 4.7107092712102706e-05, + "loss": 4.7313, + "step": 26004 + }, + { + "epoch": 0.15465910172233324, + "grad_norm": 1.7636046409606934, + "learning_rate": 4.710687459657216e-05, + "loss": 4.7752, + "step": 26005 + }, + { + "epoch": 0.15466504900561423, + "grad_norm": 1.5514246225357056, + "learning_rate": 4.7106656473324336e-05, + "loss": 4.6835, + "step": 26006 + }, + { + "epoch": 0.15467099628889525, + "grad_norm": 1.6040410995483398, + "learning_rate": 4.7106438342359303e-05, + "loss": 4.8096, + "step": 26007 + }, + { + "epoch": 0.15467694357217623, + "grad_norm": 1.622213363647461, + "learning_rate": 4.7106220203677144e-05, + "loss": 5.0896, + "step": 26008 + }, + { + "epoch": 0.15468289085545722, + "grad_norm": 1.6227675676345825, + "learning_rate": 4.710600205727793e-05, + "loss": 5.0895, + "step": 26009 + }, + { + "epoch": 0.1546888381387382, + "grad_norm": 1.6498078107833862, + "learning_rate": 4.710578390316174e-05, + "loss": 4.8625, + "step": 26010 + }, + { + "epoch": 0.15469478542201923, + "grad_norm": 1.6175272464752197, + "learning_rate": 4.710556574132865e-05, + "loss": 4.9729, + "step": 26011 + }, + { + "epoch": 0.15470073270530021, + "grad_norm": 1.5892902612686157, + "learning_rate": 4.7105347571778735e-05, + "loss": 4.755, + "step": 26012 + }, + { + "epoch": 0.1547066799885812, + "grad_norm": 1.4750880002975464, + "learning_rate": 4.710512939451207e-05, + "loss": 4.7497, + "step": 26013 + }, + { + "epoch": 0.15471262727186222, + "grad_norm": 1.5363775491714478, + "learning_rate": 4.710491120952874e-05, + "loss": 5.1039, + "step": 26014 + }, + { + "epoch": 0.1547185745551432, + "grad_norm": 1.5225108861923218, + "learning_rate": 4.71046930168288e-05, + "loss": 4.782, + "step": 26015 + }, + { + "epoch": 0.1547245218384242, + "grad_norm": 1.6348788738250732, + "learning_rate": 4.7104474816412345e-05, + "loss": 4.9252, + "step": 26016 + }, + { + "epoch": 0.1547304691217052, + "grad_norm": 1.6000639200210571, + "learning_rate": 4.7104256608279454e-05, + "loss": 4.9286, + "step": 26017 + }, + { + "epoch": 0.1547364164049862, + "grad_norm": 1.4785354137420654, + "learning_rate": 4.710403839243018e-05, + "loss": 4.7383, + "step": 26018 + }, + { + "epoch": 0.1547423636882672, + "grad_norm": 1.548176884651184, + "learning_rate": 4.710382016886463e-05, + "loss": 4.7526, + "step": 26019 + }, + { + "epoch": 0.1547483109715482, + "grad_norm": 1.537049651145935, + "learning_rate": 4.710360193758287e-05, + "loss": 4.6532, + "step": 26020 + }, + { + "epoch": 0.1547542582548292, + "grad_norm": 1.4506211280822754, + "learning_rate": 4.710338369858495e-05, + "loss": 5.1028, + "step": 26021 + }, + { + "epoch": 0.15476020553811018, + "grad_norm": 1.4539066553115845, + "learning_rate": 4.710316545187098e-05, + "loss": 5.0396, + "step": 26022 + }, + { + "epoch": 0.1547661528213912, + "grad_norm": 1.408674716949463, + "learning_rate": 4.7102947197441016e-05, + "loss": 5.2779, + "step": 26023 + }, + { + "epoch": 0.15477210010467218, + "grad_norm": 1.5732898712158203, + "learning_rate": 4.710272893529515e-05, + "loss": 5.1519, + "step": 26024 + }, + { + "epoch": 0.15477804738795317, + "grad_norm": 1.5260519981384277, + "learning_rate": 4.710251066543344e-05, + "loss": 5.056, + "step": 26025 + }, + { + "epoch": 0.1547839946712342, + "grad_norm": 1.4518004655838013, + "learning_rate": 4.710229238785598e-05, + "loss": 4.9322, + "step": 26026 + }, + { + "epoch": 0.15478994195451518, + "grad_norm": 1.6032034158706665, + "learning_rate": 4.7102074102562835e-05, + "loss": 5.0368, + "step": 26027 + }, + { + "epoch": 0.15479588923779616, + "grad_norm": 1.6396820545196533, + "learning_rate": 4.7101855809554085e-05, + "loss": 4.4808, + "step": 26028 + }, + { + "epoch": 0.15480183652107718, + "grad_norm": 1.6207085847854614, + "learning_rate": 4.710163750882981e-05, + "loss": 4.5206, + "step": 26029 + }, + { + "epoch": 0.15480778380435817, + "grad_norm": 1.5769189596176147, + "learning_rate": 4.7101419200390073e-05, + "loss": 4.4192, + "step": 26030 + }, + { + "epoch": 0.15481373108763916, + "grad_norm": 1.4689233303070068, + "learning_rate": 4.710120088423496e-05, + "loss": 4.8726, + "step": 26031 + }, + { + "epoch": 0.15481967837092017, + "grad_norm": 1.3557206392288208, + "learning_rate": 4.710098256036455e-05, + "loss": 5.1076, + "step": 26032 + }, + { + "epoch": 0.15482562565420116, + "grad_norm": 1.561497688293457, + "learning_rate": 4.710076422877891e-05, + "loss": 4.6845, + "step": 26033 + }, + { + "epoch": 0.15483157293748215, + "grad_norm": 1.6871447563171387, + "learning_rate": 4.710054588947813e-05, + "loss": 4.8231, + "step": 26034 + }, + { + "epoch": 0.15483752022076316, + "grad_norm": 1.7153793573379517, + "learning_rate": 4.710032754246228e-05, + "loss": 4.767, + "step": 26035 + }, + { + "epoch": 0.15484346750404415, + "grad_norm": 1.6859761476516724, + "learning_rate": 4.710010918773142e-05, + "loss": 4.6774, + "step": 26036 + }, + { + "epoch": 0.15484941478732514, + "grad_norm": 1.4598466157913208, + "learning_rate": 4.709989082528565e-05, + "loss": 4.8141, + "step": 26037 + }, + { + "epoch": 0.15485536207060616, + "grad_norm": 1.572952389717102, + "learning_rate": 4.709967245512504e-05, + "loss": 5.0215, + "step": 26038 + }, + { + "epoch": 0.15486130935388714, + "grad_norm": 1.6656177043914795, + "learning_rate": 4.7099454077249655e-05, + "loss": 4.5755, + "step": 26039 + }, + { + "epoch": 0.15486725663716813, + "grad_norm": 1.4872766733169556, + "learning_rate": 4.709923569165958e-05, + "loss": 4.9086, + "step": 26040 + }, + { + "epoch": 0.15487320392044915, + "grad_norm": 1.603215217590332, + "learning_rate": 4.70990172983549e-05, + "loss": 4.8528, + "step": 26041 + }, + { + "epoch": 0.15487915120373014, + "grad_norm": 1.5077006816864014, + "learning_rate": 4.7098798897335664e-05, + "loss": 4.8544, + "step": 26042 + }, + { + "epoch": 0.15488509848701112, + "grad_norm": 1.515825629234314, + "learning_rate": 4.709858048860197e-05, + "loss": 4.7793, + "step": 26043 + }, + { + "epoch": 0.15489104577029214, + "grad_norm": 1.472776174545288, + "learning_rate": 4.7098362072153904e-05, + "loss": 4.8047, + "step": 26044 + }, + { + "epoch": 0.15489699305357313, + "grad_norm": 1.5982736349105835, + "learning_rate": 4.709814364799151e-05, + "loss": 4.9911, + "step": 26045 + }, + { + "epoch": 0.15490294033685412, + "grad_norm": 1.3136348724365234, + "learning_rate": 4.709792521611489e-05, + "loss": 5.3009, + "step": 26046 + }, + { + "epoch": 0.15490888762013513, + "grad_norm": 1.6178503036499023, + "learning_rate": 4.709770677652412e-05, + "loss": 4.7873, + "step": 26047 + }, + { + "epoch": 0.15491483490341612, + "grad_norm": 1.544202446937561, + "learning_rate": 4.709748832921926e-05, + "loss": 4.645, + "step": 26048 + }, + { + "epoch": 0.1549207821866971, + "grad_norm": 1.359904408454895, + "learning_rate": 4.70972698742004e-05, + "loss": 5.0246, + "step": 26049 + }, + { + "epoch": 0.15492672946997812, + "grad_norm": 1.4320893287658691, + "learning_rate": 4.7097051411467606e-05, + "loss": 5.0227, + "step": 26050 + }, + { + "epoch": 0.1549326767532591, + "grad_norm": 1.7229030132293701, + "learning_rate": 4.7096832941020963e-05, + "loss": 5.2792, + "step": 26051 + }, + { + "epoch": 0.1549386240365401, + "grad_norm": 1.672554850578308, + "learning_rate": 4.709661446286054e-05, + "loss": 4.9227, + "step": 26052 + }, + { + "epoch": 0.15494457131982112, + "grad_norm": 1.5159001350402832, + "learning_rate": 4.709639597698642e-05, + "loss": 4.7464, + "step": 26053 + }, + { + "epoch": 0.1549505186031021, + "grad_norm": 1.5735573768615723, + "learning_rate": 4.7096177483398676e-05, + "loss": 5.2281, + "step": 26054 + }, + { + "epoch": 0.1549564658863831, + "grad_norm": 1.4174078702926636, + "learning_rate": 4.709595898209739e-05, + "loss": 5.138, + "step": 26055 + }, + { + "epoch": 0.1549624131696641, + "grad_norm": 1.3748446702957153, + "learning_rate": 4.7095740473082626e-05, + "loss": 5.2084, + "step": 26056 + }, + { + "epoch": 0.1549683604529451, + "grad_norm": 1.5169907808303833, + "learning_rate": 4.709552195635447e-05, + "loss": 5.3272, + "step": 26057 + }, + { + "epoch": 0.15497430773622609, + "grad_norm": 1.6235400438308716, + "learning_rate": 4.7095303431912994e-05, + "loss": 5.2201, + "step": 26058 + }, + { + "epoch": 0.1549802550195071, + "grad_norm": 1.571418046951294, + "learning_rate": 4.709508489975828e-05, + "loss": 5.3584, + "step": 26059 + }, + { + "epoch": 0.1549862023027881, + "grad_norm": 1.690524697303772, + "learning_rate": 4.70948663598904e-05, + "loss": 5.3091, + "step": 26060 + }, + { + "epoch": 0.15499214958606908, + "grad_norm": 1.6778768301010132, + "learning_rate": 4.7094647812309424e-05, + "loss": 4.8765, + "step": 26061 + }, + { + "epoch": 0.1549980968693501, + "grad_norm": 1.6365214586257935, + "learning_rate": 4.709442925701544e-05, + "loss": 5.4826, + "step": 26062 + }, + { + "epoch": 0.15500404415263108, + "grad_norm": 1.4799535274505615, + "learning_rate": 4.709421069400851e-05, + "loss": 5.5668, + "step": 26063 + }, + { + "epoch": 0.15500999143591207, + "grad_norm": 1.5750006437301636, + "learning_rate": 4.7093992123288734e-05, + "loss": 5.235, + "step": 26064 + }, + { + "epoch": 0.15501593871919309, + "grad_norm": 1.8067607879638672, + "learning_rate": 4.7093773544856165e-05, + "loss": 5.2708, + "step": 26065 + }, + { + "epoch": 0.15502188600247407, + "grad_norm": 1.4780645370483398, + "learning_rate": 4.709355495871088e-05, + "loss": 5.1626, + "step": 26066 + }, + { + "epoch": 0.15502783328575506, + "grad_norm": 1.5702919960021973, + "learning_rate": 4.709333636485298e-05, + "loss": 5.2306, + "step": 26067 + }, + { + "epoch": 0.15503378056903608, + "grad_norm": 1.7658028602600098, + "learning_rate": 4.7093117763282515e-05, + "loss": 4.9352, + "step": 26068 + }, + { + "epoch": 0.15503972785231707, + "grad_norm": 1.69098961353302, + "learning_rate": 4.709289915399957e-05, + "loss": 4.7679, + "step": 26069 + }, + { + "epoch": 0.15504567513559805, + "grad_norm": 1.704026460647583, + "learning_rate": 4.709268053700423e-05, + "loss": 4.6209, + "step": 26070 + }, + { + "epoch": 0.15505162241887904, + "grad_norm": 1.4715653657913208, + "learning_rate": 4.709246191229656e-05, + "loss": 5.1664, + "step": 26071 + }, + { + "epoch": 0.15505756970216006, + "grad_norm": 1.5663673877716064, + "learning_rate": 4.7092243279876634e-05, + "loss": 5.3833, + "step": 26072 + }, + { + "epoch": 0.15506351698544105, + "grad_norm": 1.4647293090820312, + "learning_rate": 4.709202463974454e-05, + "loss": 5.2766, + "step": 26073 + }, + { + "epoch": 0.15506946426872203, + "grad_norm": 1.5950292348861694, + "learning_rate": 4.7091805991900344e-05, + "loss": 5.2686, + "step": 26074 + }, + { + "epoch": 0.15507541155200305, + "grad_norm": 1.593206524848938, + "learning_rate": 4.709158733634413e-05, + "loss": 4.9969, + "step": 26075 + }, + { + "epoch": 0.15508135883528404, + "grad_norm": 1.5884050130844116, + "learning_rate": 4.7091368673075975e-05, + "loss": 4.9804, + "step": 26076 + }, + { + "epoch": 0.15508730611856503, + "grad_norm": 1.5333365201950073, + "learning_rate": 4.709115000209594e-05, + "loss": 4.6808, + "step": 26077 + }, + { + "epoch": 0.15509325340184604, + "grad_norm": 1.4642858505249023, + "learning_rate": 4.7090931323404116e-05, + "loss": 4.6828, + "step": 26078 + }, + { + "epoch": 0.15509920068512703, + "grad_norm": 2.0302491188049316, + "learning_rate": 4.709071263700059e-05, + "loss": 4.5523, + "step": 26079 + }, + { + "epoch": 0.15510514796840802, + "grad_norm": 1.6798481941223145, + "learning_rate": 4.709049394288541e-05, + "loss": 5.1286, + "step": 26080 + }, + { + "epoch": 0.15511109525168904, + "grad_norm": 1.5074591636657715, + "learning_rate": 4.7090275241058676e-05, + "loss": 5.3037, + "step": 26081 + }, + { + "epoch": 0.15511704253497002, + "grad_norm": 1.7001566886901855, + "learning_rate": 4.709005653152044e-05, + "loss": 5.0217, + "step": 26082 + }, + { + "epoch": 0.155122989818251, + "grad_norm": 1.84412682056427, + "learning_rate": 4.708983781427081e-05, + "loss": 4.5579, + "step": 26083 + }, + { + "epoch": 0.15512893710153203, + "grad_norm": 1.770264744758606, + "learning_rate": 4.708961908930984e-05, + "loss": 4.7394, + "step": 26084 + }, + { + "epoch": 0.15513488438481302, + "grad_norm": 1.7658874988555908, + "learning_rate": 4.7089400356637615e-05, + "loss": 4.9278, + "step": 26085 + }, + { + "epoch": 0.155140831668094, + "grad_norm": 1.5701930522918701, + "learning_rate": 4.7089181616254204e-05, + "loss": 4.7227, + "step": 26086 + }, + { + "epoch": 0.15514677895137502, + "grad_norm": 1.5790002346038818, + "learning_rate": 4.708896286815969e-05, + "loss": 4.9207, + "step": 26087 + }, + { + "epoch": 0.155152726234656, + "grad_norm": 1.8411163091659546, + "learning_rate": 4.7088744112354146e-05, + "loss": 3.8647, + "step": 26088 + }, + { + "epoch": 0.155158673517937, + "grad_norm": 1.813536524772644, + "learning_rate": 4.708852534883765e-05, + "loss": 4.1148, + "step": 26089 + }, + { + "epoch": 0.155164620801218, + "grad_norm": 1.6122519969940186, + "learning_rate": 4.708830657761028e-05, + "loss": 4.9749, + "step": 26090 + }, + { + "epoch": 0.155170568084499, + "grad_norm": 1.9105713367462158, + "learning_rate": 4.70880877986721e-05, + "loss": 4.9895, + "step": 26091 + }, + { + "epoch": 0.15517651536778, + "grad_norm": 1.849824070930481, + "learning_rate": 4.7087869012023215e-05, + "loss": 5.5382, + "step": 26092 + }, + { + "epoch": 0.155182462651061, + "grad_norm": 2.346090793609619, + "learning_rate": 4.708765021766367e-05, + "loss": 5.6398, + "step": 26093 + }, + { + "epoch": 0.155188409934342, + "grad_norm": 1.8905435800552368, + "learning_rate": 4.7087431415593555e-05, + "loss": 5.6089, + "step": 26094 + }, + { + "epoch": 0.15519435721762298, + "grad_norm": 1.6987192630767822, + "learning_rate": 4.7087212605812944e-05, + "loss": 5.4127, + "step": 26095 + }, + { + "epoch": 0.155200304500904, + "grad_norm": 1.7915600538253784, + "learning_rate": 4.708699378832193e-05, + "loss": 4.9027, + "step": 26096 + }, + { + "epoch": 0.15520625178418498, + "grad_norm": 1.5736148357391357, + "learning_rate": 4.708677496312056e-05, + "loss": 5.1403, + "step": 26097 + }, + { + "epoch": 0.15521219906746597, + "grad_norm": 1.6473568677902222, + "learning_rate": 4.708655613020893e-05, + "loss": 5.0299, + "step": 26098 + }, + { + "epoch": 0.155218146350747, + "grad_norm": 1.733720064163208, + "learning_rate": 4.708633728958711e-05, + "loss": 5.0153, + "step": 26099 + }, + { + "epoch": 0.15522409363402798, + "grad_norm": 1.842244267463684, + "learning_rate": 4.708611844125518e-05, + "loss": 4.7, + "step": 26100 + }, + { + "epoch": 0.15523004091730896, + "grad_norm": 1.8227342367172241, + "learning_rate": 4.708589958521321e-05, + "loss": 4.4889, + "step": 26101 + }, + { + "epoch": 0.15523598820058998, + "grad_norm": 1.7300339937210083, + "learning_rate": 4.708568072146129e-05, + "loss": 5.0326, + "step": 26102 + }, + { + "epoch": 0.15524193548387097, + "grad_norm": 2.0854434967041016, + "learning_rate": 4.708546184999948e-05, + "loss": 5.6966, + "step": 26103 + }, + { + "epoch": 0.15524788276715196, + "grad_norm": 1.5393275022506714, + "learning_rate": 4.708524297082786e-05, + "loss": 5.5777, + "step": 26104 + }, + { + "epoch": 0.15525383005043297, + "grad_norm": 1.7765403985977173, + "learning_rate": 4.7085024083946514e-05, + "loss": 5.7488, + "step": 26105 + }, + { + "epoch": 0.15525977733371396, + "grad_norm": 1.668286919593811, + "learning_rate": 4.708480518935552e-05, + "loss": 5.3823, + "step": 26106 + }, + { + "epoch": 0.15526572461699495, + "grad_norm": 1.7656164169311523, + "learning_rate": 4.708458628705494e-05, + "loss": 5.1098, + "step": 26107 + }, + { + "epoch": 0.15527167190027596, + "grad_norm": 1.6078004837036133, + "learning_rate": 4.708436737704486e-05, + "loss": 4.8957, + "step": 26108 + }, + { + "epoch": 0.15527761918355695, + "grad_norm": 1.5649595260620117, + "learning_rate": 4.7084148459325364e-05, + "loss": 5.4546, + "step": 26109 + }, + { + "epoch": 0.15528356646683794, + "grad_norm": 1.7555382251739502, + "learning_rate": 4.7083929533896506e-05, + "loss": 5.6428, + "step": 26110 + }, + { + "epoch": 0.15528951375011896, + "grad_norm": 1.7282280921936035, + "learning_rate": 4.708371060075839e-05, + "loss": 5.4197, + "step": 26111 + }, + { + "epoch": 0.15529546103339995, + "grad_norm": 1.8044626712799072, + "learning_rate": 4.708349165991107e-05, + "loss": 5.4676, + "step": 26112 + }, + { + "epoch": 0.15530140831668093, + "grad_norm": 1.6488827466964722, + "learning_rate": 4.7083272711354634e-05, + "loss": 5.2725, + "step": 26113 + }, + { + "epoch": 0.15530735559996195, + "grad_norm": 1.9291478395462036, + "learning_rate": 4.7083053755089155e-05, + "loss": 5.2565, + "step": 26114 + }, + { + "epoch": 0.15531330288324294, + "grad_norm": 1.9248192310333252, + "learning_rate": 4.708283479111471e-05, + "loss": 5.2514, + "step": 26115 + }, + { + "epoch": 0.15531925016652393, + "grad_norm": 1.9327218532562256, + "learning_rate": 4.708261581943137e-05, + "loss": 5.0833, + "step": 26116 + }, + { + "epoch": 0.15532519744980494, + "grad_norm": 1.952842354774475, + "learning_rate": 4.708239684003923e-05, + "loss": 5.0989, + "step": 26117 + }, + { + "epoch": 0.15533114473308593, + "grad_norm": 1.7923991680145264, + "learning_rate": 4.7082177852938344e-05, + "loss": 4.8204, + "step": 26118 + }, + { + "epoch": 0.15533709201636692, + "grad_norm": 1.761819839477539, + "learning_rate": 4.708195885812881e-05, + "loss": 5.1966, + "step": 26119 + }, + { + "epoch": 0.15534303929964793, + "grad_norm": 2.061192035675049, + "learning_rate": 4.7081739855610674e-05, + "loss": 4.7254, + "step": 26120 + }, + { + "epoch": 0.15534898658292892, + "grad_norm": 1.7219372987747192, + "learning_rate": 4.708152084538404e-05, + "loss": 5.008, + "step": 26121 + }, + { + "epoch": 0.1553549338662099, + "grad_norm": 1.836690068244934, + "learning_rate": 4.708130182744898e-05, + "loss": 4.8645, + "step": 26122 + }, + { + "epoch": 0.15536088114949093, + "grad_norm": 1.6488652229309082, + "learning_rate": 4.708108280180556e-05, + "loss": 5.1588, + "step": 26123 + }, + { + "epoch": 0.15536682843277191, + "grad_norm": 1.7643523216247559, + "learning_rate": 4.708086376845386e-05, + "loss": 4.9774, + "step": 26124 + }, + { + "epoch": 0.1553727757160529, + "grad_norm": 1.7396107912063599, + "learning_rate": 4.7080644727393967e-05, + "loss": 5.1542, + "step": 26125 + }, + { + "epoch": 0.15537872299933392, + "grad_norm": 1.723271131515503, + "learning_rate": 4.708042567862594e-05, + "loss": 4.5029, + "step": 26126 + }, + { + "epoch": 0.1553846702826149, + "grad_norm": 1.7824338674545288, + "learning_rate": 4.708020662214987e-05, + "loss": 4.8107, + "step": 26127 + }, + { + "epoch": 0.1553906175658959, + "grad_norm": 1.6587624549865723, + "learning_rate": 4.707998755796582e-05, + "loss": 5.0076, + "step": 26128 + }, + { + "epoch": 0.15539656484917688, + "grad_norm": 1.6058495044708252, + "learning_rate": 4.7079768486073884e-05, + "loss": 4.8512, + "step": 26129 + }, + { + "epoch": 0.1554025121324579, + "grad_norm": 1.6286768913269043, + "learning_rate": 4.707954940647412e-05, + "loss": 5.0587, + "step": 26130 + }, + { + "epoch": 0.1554084594157389, + "grad_norm": 1.5808156728744507, + "learning_rate": 4.707933031916662e-05, + "loss": 5.0254, + "step": 26131 + }, + { + "epoch": 0.15541440669901987, + "grad_norm": 1.7283897399902344, + "learning_rate": 4.707911122415145e-05, + "loss": 5.1255, + "step": 26132 + }, + { + "epoch": 0.1554203539823009, + "grad_norm": 1.9916651248931885, + "learning_rate": 4.70788921214287e-05, + "loss": 4.9384, + "step": 26133 + }, + { + "epoch": 0.15542630126558188, + "grad_norm": 1.5505808591842651, + "learning_rate": 4.7078673010998425e-05, + "loss": 5.0284, + "step": 26134 + }, + { + "epoch": 0.15543224854886287, + "grad_norm": 1.8529605865478516, + "learning_rate": 4.707845389286072e-05, + "loss": 5.1745, + "step": 26135 + }, + { + "epoch": 0.15543819583214388, + "grad_norm": 1.5921772718429565, + "learning_rate": 4.707823476701565e-05, + "loss": 5.1941, + "step": 26136 + }, + { + "epoch": 0.15544414311542487, + "grad_norm": 1.676703691482544, + "learning_rate": 4.70780156334633e-05, + "loss": 4.9678, + "step": 26137 + }, + { + "epoch": 0.15545009039870586, + "grad_norm": 1.5701407194137573, + "learning_rate": 4.707779649220374e-05, + "loss": 4.8332, + "step": 26138 + }, + { + "epoch": 0.15545603768198687, + "grad_norm": 1.4418753385543823, + "learning_rate": 4.707757734323706e-05, + "loss": 4.9294, + "step": 26139 + }, + { + "epoch": 0.15546198496526786, + "grad_norm": 1.4596991539001465, + "learning_rate": 4.707735818656331e-05, + "loss": 4.874, + "step": 26140 + }, + { + "epoch": 0.15546793224854885, + "grad_norm": 1.475049376487732, + "learning_rate": 4.707713902218259e-05, + "loss": 5.0269, + "step": 26141 + }, + { + "epoch": 0.15547387953182987, + "grad_norm": 1.4616882801055908, + "learning_rate": 4.7076919850094966e-05, + "loss": 5.0152, + "step": 26142 + }, + { + "epoch": 0.15547982681511086, + "grad_norm": 1.5477145910263062, + "learning_rate": 4.707670067030052e-05, + "loss": 4.9596, + "step": 26143 + }, + { + "epoch": 0.15548577409839184, + "grad_norm": 1.6296616792678833, + "learning_rate": 4.707648148279933e-05, + "loss": 4.7555, + "step": 26144 + }, + { + "epoch": 0.15549172138167286, + "grad_norm": 2.044677257537842, + "learning_rate": 4.707626228759147e-05, + "loss": 4.2117, + "step": 26145 + }, + { + "epoch": 0.15549766866495385, + "grad_norm": 1.8100709915161133, + "learning_rate": 4.7076043084677e-05, + "loss": 4.5057, + "step": 26146 + }, + { + "epoch": 0.15550361594823484, + "grad_norm": 1.698901653289795, + "learning_rate": 4.7075823874056026e-05, + "loss": 4.6707, + "step": 26147 + }, + { + "epoch": 0.15550956323151585, + "grad_norm": 1.5637656450271606, + "learning_rate": 4.70756046557286e-05, + "loss": 4.871, + "step": 26148 + }, + { + "epoch": 0.15551551051479684, + "grad_norm": 1.5465519428253174, + "learning_rate": 4.707538542969481e-05, + "loss": 4.6844, + "step": 26149 + }, + { + "epoch": 0.15552145779807783, + "grad_norm": 1.6268285512924194, + "learning_rate": 4.7075166195954736e-05, + "loss": 5.046, + "step": 26150 + }, + { + "epoch": 0.15552740508135884, + "grad_norm": 1.6071034669876099, + "learning_rate": 4.707494695450845e-05, + "loss": 4.9576, + "step": 26151 + }, + { + "epoch": 0.15553335236463983, + "grad_norm": 1.4627524614334106, + "learning_rate": 4.707472770535603e-05, + "loss": 5.0786, + "step": 26152 + }, + { + "epoch": 0.15553929964792082, + "grad_norm": 1.7464107275009155, + "learning_rate": 4.707450844849754e-05, + "loss": 5.0383, + "step": 26153 + }, + { + "epoch": 0.15554524693120184, + "grad_norm": 1.7528932094573975, + "learning_rate": 4.7074289183933077e-05, + "loss": 4.7332, + "step": 26154 + }, + { + "epoch": 0.15555119421448282, + "grad_norm": 1.9061720371246338, + "learning_rate": 4.70740699116627e-05, + "loss": 4.5108, + "step": 26155 + }, + { + "epoch": 0.1555571414977638, + "grad_norm": 1.6121511459350586, + "learning_rate": 4.70738506316865e-05, + "loss": 4.9586, + "step": 26156 + }, + { + "epoch": 0.15556308878104483, + "grad_norm": 1.622747778892517, + "learning_rate": 4.707363134400454e-05, + "loss": 5.0985, + "step": 26157 + }, + { + "epoch": 0.15556903606432582, + "grad_norm": 1.4669454097747803, + "learning_rate": 4.707341204861691e-05, + "loss": 4.9397, + "step": 26158 + }, + { + "epoch": 0.1555749833476068, + "grad_norm": 1.4583669900894165, + "learning_rate": 4.707319274552368e-05, + "loss": 5.0822, + "step": 26159 + }, + { + "epoch": 0.15558093063088782, + "grad_norm": 1.9358830451965332, + "learning_rate": 4.707297343472492e-05, + "loss": 4.9557, + "step": 26160 + }, + { + "epoch": 0.1555868779141688, + "grad_norm": 1.7523856163024902, + "learning_rate": 4.707275411622072e-05, + "loss": 4.5959, + "step": 26161 + }, + { + "epoch": 0.1555928251974498, + "grad_norm": 1.7858316898345947, + "learning_rate": 4.707253479001114e-05, + "loss": 5.1765, + "step": 26162 + }, + { + "epoch": 0.1555987724807308, + "grad_norm": 1.7400814294815063, + "learning_rate": 4.707231545609627e-05, + "loss": 5.4312, + "step": 26163 + }, + { + "epoch": 0.1556047197640118, + "grad_norm": 1.6235188245773315, + "learning_rate": 4.7072096114476186e-05, + "loss": 5.1745, + "step": 26164 + }, + { + "epoch": 0.1556106670472928, + "grad_norm": 1.6003834009170532, + "learning_rate": 4.7071876765150963e-05, + "loss": 4.9194, + "step": 26165 + }, + { + "epoch": 0.1556166143305738, + "grad_norm": 1.7427910566329956, + "learning_rate": 4.7071657408120675e-05, + "loss": 5.1942, + "step": 26166 + }, + { + "epoch": 0.1556225616138548, + "grad_norm": 1.5763969421386719, + "learning_rate": 4.7071438043385395e-05, + "loss": 4.9424, + "step": 26167 + }, + { + "epoch": 0.15562850889713578, + "grad_norm": 1.6284310817718506, + "learning_rate": 4.7071218670945206e-05, + "loss": 5.4415, + "step": 26168 + }, + { + "epoch": 0.1556344561804168, + "grad_norm": 1.3858957290649414, + "learning_rate": 4.707099929080019e-05, + "loss": 5.6362, + "step": 26169 + }, + { + "epoch": 0.15564040346369779, + "grad_norm": 1.4326859712600708, + "learning_rate": 4.70707799029504e-05, + "loss": 5.2872, + "step": 26170 + }, + { + "epoch": 0.15564635074697877, + "grad_norm": 1.6624369621276855, + "learning_rate": 4.7070560507395944e-05, + "loss": 5.1741, + "step": 26171 + }, + { + "epoch": 0.1556522980302598, + "grad_norm": 2.4475722312927246, + "learning_rate": 4.707034110413688e-05, + "loss": 4.8206, + "step": 26172 + }, + { + "epoch": 0.15565824531354078, + "grad_norm": 2.2583391666412354, + "learning_rate": 4.707012169317329e-05, + "loss": 4.6716, + "step": 26173 + }, + { + "epoch": 0.15566419259682177, + "grad_norm": 2.161346197128296, + "learning_rate": 4.706990227450524e-05, + "loss": 4.5228, + "step": 26174 + }, + { + "epoch": 0.15567013988010278, + "grad_norm": 1.550593614578247, + "learning_rate": 4.7069682848132815e-05, + "loss": 5.1581, + "step": 26175 + }, + { + "epoch": 0.15567608716338377, + "grad_norm": 1.524939775466919, + "learning_rate": 4.70694634140561e-05, + "loss": 5.6605, + "step": 26176 + }, + { + "epoch": 0.15568203444666476, + "grad_norm": 2.134462833404541, + "learning_rate": 4.7069243972275155e-05, + "loss": 4.9063, + "step": 26177 + }, + { + "epoch": 0.15568798172994577, + "grad_norm": 2.2610831260681152, + "learning_rate": 4.7069024522790075e-05, + "loss": 4.4764, + "step": 26178 + }, + { + "epoch": 0.15569392901322676, + "grad_norm": 2.4277896881103516, + "learning_rate": 4.706880506560092e-05, + "loss": 4.7747, + "step": 26179 + }, + { + "epoch": 0.15569987629650775, + "grad_norm": 2.5465261936187744, + "learning_rate": 4.706858560070777e-05, + "loss": 4.7831, + "step": 26180 + }, + { + "epoch": 0.15570582357978877, + "grad_norm": 2.4795758724212646, + "learning_rate": 4.706836612811071e-05, + "loss": 4.6256, + "step": 26181 + }, + { + "epoch": 0.15571177086306975, + "grad_norm": 2.624998092651367, + "learning_rate": 4.7068146647809805e-05, + "loss": 4.5916, + "step": 26182 + }, + { + "epoch": 0.15571771814635074, + "grad_norm": 2.1440951824188232, + "learning_rate": 4.706792715980515e-05, + "loss": 4.5955, + "step": 26183 + }, + { + "epoch": 0.15572366542963176, + "grad_norm": 2.386084794998169, + "learning_rate": 4.70677076640968e-05, + "loss": 3.9781, + "step": 26184 + }, + { + "epoch": 0.15572961271291275, + "grad_norm": 2.271477699279785, + "learning_rate": 4.7067488160684844e-05, + "loss": 4.3557, + "step": 26185 + }, + { + "epoch": 0.15573555999619373, + "grad_norm": 2.227630853652954, + "learning_rate": 4.706726864956935e-05, + "loss": 4.117, + "step": 26186 + }, + { + "epoch": 0.15574150727947472, + "grad_norm": 2.1777312755584717, + "learning_rate": 4.7067049130750414e-05, + "loss": 4.4695, + "step": 26187 + }, + { + "epoch": 0.15574745456275574, + "grad_norm": 2.131826162338257, + "learning_rate": 4.7066829604228094e-05, + "loss": 4.185, + "step": 26188 + }, + { + "epoch": 0.15575340184603673, + "grad_norm": 1.9766490459442139, + "learning_rate": 4.706661007000246e-05, + "loss": 5.6452, + "step": 26189 + }, + { + "epoch": 0.15575934912931771, + "grad_norm": 2.088787078857422, + "learning_rate": 4.706639052807361e-05, + "loss": 4.6965, + "step": 26190 + }, + { + "epoch": 0.15576529641259873, + "grad_norm": 2.012974262237549, + "learning_rate": 4.7066170978441616e-05, + "loss": 4.4508, + "step": 26191 + }, + { + "epoch": 0.15577124369587972, + "grad_norm": 2.473616123199463, + "learning_rate": 4.706595142110654e-05, + "loss": 4.4842, + "step": 26192 + }, + { + "epoch": 0.1557771909791607, + "grad_norm": 2.5314011573791504, + "learning_rate": 4.7065731856068475e-05, + "loss": 4.5175, + "step": 26193 + }, + { + "epoch": 0.15578313826244172, + "grad_norm": 2.0637693405151367, + "learning_rate": 4.7065512283327484e-05, + "loss": 4.8803, + "step": 26194 + }, + { + "epoch": 0.1557890855457227, + "grad_norm": 2.659450054168701, + "learning_rate": 4.706529270288366e-05, + "loss": 4.7659, + "step": 26195 + }, + { + "epoch": 0.1557950328290037, + "grad_norm": 1.741438865661621, + "learning_rate": 4.706507311473707e-05, + "loss": 5.5987, + "step": 26196 + }, + { + "epoch": 0.15580098011228471, + "grad_norm": 1.621771216392517, + "learning_rate": 4.706485351888778e-05, + "loss": 5.477, + "step": 26197 + }, + { + "epoch": 0.1558069273955657, + "grad_norm": 1.8086066246032715, + "learning_rate": 4.706463391533589e-05, + "loss": 5.4196, + "step": 26198 + }, + { + "epoch": 0.1558128746788467, + "grad_norm": 1.4268287420272827, + "learning_rate": 4.706441430408145e-05, + "loss": 5.8321, + "step": 26199 + }, + { + "epoch": 0.1558188219621277, + "grad_norm": 1.5565332174301147, + "learning_rate": 4.7064194685124564e-05, + "loss": 5.5548, + "step": 26200 + }, + { + "epoch": 0.1558247692454087, + "grad_norm": 1.7371162176132202, + "learning_rate": 4.706397505846529e-05, + "loss": 5.4536, + "step": 26201 + }, + { + "epoch": 0.15583071652868968, + "grad_norm": 1.6265679597854614, + "learning_rate": 4.706375542410371e-05, + "loss": 4.7589, + "step": 26202 + }, + { + "epoch": 0.1558366638119707, + "grad_norm": 1.5395931005477905, + "learning_rate": 4.70635357820399e-05, + "loss": 5.2809, + "step": 26203 + }, + { + "epoch": 0.1558426110952517, + "grad_norm": 1.5577752590179443, + "learning_rate": 4.7063316132273937e-05, + "loss": 5.2526, + "step": 26204 + }, + { + "epoch": 0.15584855837853268, + "grad_norm": 1.3954623937606812, + "learning_rate": 4.706309647480591e-05, + "loss": 5.3674, + "step": 26205 + }, + { + "epoch": 0.1558545056618137, + "grad_norm": 1.7251001596450806, + "learning_rate": 4.706287680963587e-05, + "loss": 5.2069, + "step": 26206 + }, + { + "epoch": 0.15586045294509468, + "grad_norm": 1.8611587285995483, + "learning_rate": 4.706265713676391e-05, + "loss": 5.2805, + "step": 26207 + }, + { + "epoch": 0.15586640022837567, + "grad_norm": 1.5871427059173584, + "learning_rate": 4.706243745619011e-05, + "loss": 5.2921, + "step": 26208 + }, + { + "epoch": 0.15587234751165668, + "grad_norm": 1.6353893280029297, + "learning_rate": 4.706221776791454e-05, + "loss": 5.3425, + "step": 26209 + }, + { + "epoch": 0.15587829479493767, + "grad_norm": 1.6304540634155273, + "learning_rate": 4.7061998071937274e-05, + "loss": 5.3577, + "step": 26210 + }, + { + "epoch": 0.15588424207821866, + "grad_norm": 1.6434270143508911, + "learning_rate": 4.706177836825839e-05, + "loss": 5.4573, + "step": 26211 + }, + { + "epoch": 0.15589018936149968, + "grad_norm": 1.6281068325042725, + "learning_rate": 4.7061558656877976e-05, + "loss": 4.8948, + "step": 26212 + }, + { + "epoch": 0.15589613664478066, + "grad_norm": 1.7287936210632324, + "learning_rate": 4.70613389377961e-05, + "loss": 5.2005, + "step": 26213 + }, + { + "epoch": 0.15590208392806165, + "grad_norm": 1.8355118036270142, + "learning_rate": 4.706111921101283e-05, + "loss": 5.456, + "step": 26214 + }, + { + "epoch": 0.15590803121134267, + "grad_norm": 1.5891990661621094, + "learning_rate": 4.7060899476528253e-05, + "loss": 5.1405, + "step": 26215 + }, + { + "epoch": 0.15591397849462366, + "grad_norm": 1.5852643251419067, + "learning_rate": 4.706067973434244e-05, + "loss": 5.5963, + "step": 26216 + }, + { + "epoch": 0.15591992577790464, + "grad_norm": 2.340528726577759, + "learning_rate": 4.706045998445548e-05, + "loss": 4.6047, + "step": 26217 + }, + { + "epoch": 0.15592587306118566, + "grad_norm": 1.872802495956421, + "learning_rate": 4.706024022686744e-05, + "loss": 4.7129, + "step": 26218 + }, + { + "epoch": 0.15593182034446665, + "grad_norm": 1.6725971698760986, + "learning_rate": 4.706002046157839e-05, + "loss": 5.2416, + "step": 26219 + }, + { + "epoch": 0.15593776762774764, + "grad_norm": 1.6346997022628784, + "learning_rate": 4.705980068858843e-05, + "loss": 5.0625, + "step": 26220 + }, + { + "epoch": 0.15594371491102865, + "grad_norm": 1.8969260454177856, + "learning_rate": 4.705958090789761e-05, + "loss": 4.6915, + "step": 26221 + }, + { + "epoch": 0.15594966219430964, + "grad_norm": 1.6025121212005615, + "learning_rate": 4.705936111950602e-05, + "loss": 4.9978, + "step": 26222 + }, + { + "epoch": 0.15595560947759063, + "grad_norm": 1.406001329421997, + "learning_rate": 4.705914132341374e-05, + "loss": 5.7913, + "step": 26223 + }, + { + "epoch": 0.15596155676087164, + "grad_norm": 2.1708552837371826, + "learning_rate": 4.7058921519620834e-05, + "loss": 5.1468, + "step": 26224 + }, + { + "epoch": 0.15596750404415263, + "grad_norm": 2.216993808746338, + "learning_rate": 4.705870170812739e-05, + "loss": 5.1279, + "step": 26225 + }, + { + "epoch": 0.15597345132743362, + "grad_norm": 1.7173157930374146, + "learning_rate": 4.705848188893348e-05, + "loss": 5.1289, + "step": 26226 + }, + { + "epoch": 0.15597939861071464, + "grad_norm": 1.6096726655960083, + "learning_rate": 4.705826206203918e-05, + "loss": 5.5078, + "step": 26227 + }, + { + "epoch": 0.15598534589399562, + "grad_norm": 1.8224303722381592, + "learning_rate": 4.705804222744458e-05, + "loss": 5.4791, + "step": 26228 + }, + { + "epoch": 0.1559912931772766, + "grad_norm": 1.722948431968689, + "learning_rate": 4.705782238514973e-05, + "loss": 5.1473, + "step": 26229 + }, + { + "epoch": 0.15599724046055763, + "grad_norm": 1.7583675384521484, + "learning_rate": 4.705760253515473e-05, + "loss": 5.5127, + "step": 26230 + }, + { + "epoch": 0.15600318774383862, + "grad_norm": 1.5635607242584229, + "learning_rate": 4.705738267745965e-05, + "loss": 5.417, + "step": 26231 + }, + { + "epoch": 0.1560091350271196, + "grad_norm": 1.570145606994629, + "learning_rate": 4.705716281206456e-05, + "loss": 5.266, + "step": 26232 + }, + { + "epoch": 0.15601508231040062, + "grad_norm": 1.6425197124481201, + "learning_rate": 4.705694293896955e-05, + "loss": 4.7162, + "step": 26233 + }, + { + "epoch": 0.1560210295936816, + "grad_norm": 1.6312974691390991, + "learning_rate": 4.705672305817468e-05, + "loss": 4.8861, + "step": 26234 + }, + { + "epoch": 0.1560269768769626, + "grad_norm": 1.6320679187774658, + "learning_rate": 4.7056503169680046e-05, + "loss": 5.2133, + "step": 26235 + }, + { + "epoch": 0.1560329241602436, + "grad_norm": 1.6294546127319336, + "learning_rate": 4.705628327348571e-05, + "loss": 5.7012, + "step": 26236 + }, + { + "epoch": 0.1560388714435246, + "grad_norm": 1.472088098526001, + "learning_rate": 4.705606336959175e-05, + "loss": 5.404, + "step": 26237 + }, + { + "epoch": 0.1560448187268056, + "grad_norm": 1.5214602947235107, + "learning_rate": 4.705584345799825e-05, + "loss": 5.3916, + "step": 26238 + }, + { + "epoch": 0.1560507660100866, + "grad_norm": 1.45046067237854, + "learning_rate": 4.705562353870528e-05, + "loss": 5.2275, + "step": 26239 + }, + { + "epoch": 0.1560567132933676, + "grad_norm": 1.5730977058410645, + "learning_rate": 4.705540361171292e-05, + "loss": 5.4597, + "step": 26240 + }, + { + "epoch": 0.15606266057664858, + "grad_norm": 1.6403652429580688, + "learning_rate": 4.7055183677021254e-05, + "loss": 4.7476, + "step": 26241 + }, + { + "epoch": 0.1560686078599296, + "grad_norm": 2.0256097316741943, + "learning_rate": 4.705496373463034e-05, + "loss": 4.7275, + "step": 26242 + }, + { + "epoch": 0.15607455514321059, + "grad_norm": 2.1107068061828613, + "learning_rate": 4.7054743784540265e-05, + "loss": 4.7459, + "step": 26243 + }, + { + "epoch": 0.15608050242649157, + "grad_norm": 1.4644510746002197, + "learning_rate": 4.705452382675112e-05, + "loss": 5.3951, + "step": 26244 + }, + { + "epoch": 0.15608644970977256, + "grad_norm": 1.4154125452041626, + "learning_rate": 4.705430386126296e-05, + "loss": 5.5351, + "step": 26245 + }, + { + "epoch": 0.15609239699305358, + "grad_norm": 1.4124795198440552, + "learning_rate": 4.7054083888075875e-05, + "loss": 5.3797, + "step": 26246 + }, + { + "epoch": 0.15609834427633457, + "grad_norm": 1.6197364330291748, + "learning_rate": 4.705386390718993e-05, + "loss": 5.3903, + "step": 26247 + }, + { + "epoch": 0.15610429155961555, + "grad_norm": 1.5693352222442627, + "learning_rate": 4.7053643918605216e-05, + "loss": 5.4997, + "step": 26248 + }, + { + "epoch": 0.15611023884289657, + "grad_norm": 1.4047479629516602, + "learning_rate": 4.70534239223218e-05, + "loss": 5.0258, + "step": 26249 + }, + { + "epoch": 0.15611618612617756, + "grad_norm": 1.7006193399429321, + "learning_rate": 4.705320391833976e-05, + "loss": 4.9798, + "step": 26250 + }, + { + "epoch": 0.15612213340945855, + "grad_norm": 1.7294094562530518, + "learning_rate": 4.705298390665917e-05, + "loss": 5.5811, + "step": 26251 + }, + { + "epoch": 0.15612808069273956, + "grad_norm": 1.4665381908416748, + "learning_rate": 4.705276388728013e-05, + "loss": 5.5117, + "step": 26252 + }, + { + "epoch": 0.15613402797602055, + "grad_norm": 1.4549496173858643, + "learning_rate": 4.705254386020268e-05, + "loss": 5.6141, + "step": 26253 + }, + { + "epoch": 0.15613997525930154, + "grad_norm": 1.4019516706466675, + "learning_rate": 4.705232382542691e-05, + "loss": 5.6525, + "step": 26254 + }, + { + "epoch": 0.15614592254258255, + "grad_norm": 1.3660154342651367, + "learning_rate": 4.705210378295292e-05, + "loss": 5.4377, + "step": 26255 + }, + { + "epoch": 0.15615186982586354, + "grad_norm": 1.5590531826019287, + "learning_rate": 4.7051883732780755e-05, + "loss": 5.5679, + "step": 26256 + }, + { + "epoch": 0.15615781710914453, + "grad_norm": 2.126138687133789, + "learning_rate": 4.7051663674910514e-05, + "loss": 4.8662, + "step": 26257 + }, + { + "epoch": 0.15616376439242555, + "grad_norm": 1.5536115169525146, + "learning_rate": 4.705144360934226e-05, + "loss": 4.97, + "step": 26258 + }, + { + "epoch": 0.15616971167570654, + "grad_norm": 2.0653862953186035, + "learning_rate": 4.705122353607607e-05, + "loss": 4.8683, + "step": 26259 + }, + { + "epoch": 0.15617565895898752, + "grad_norm": 1.872904658317566, + "learning_rate": 4.705100345511204e-05, + "loss": 4.8923, + "step": 26260 + }, + { + "epoch": 0.15618160624226854, + "grad_norm": 2.112368583679199, + "learning_rate": 4.7050783366450224e-05, + "loss": 4.7857, + "step": 26261 + }, + { + "epoch": 0.15618755352554953, + "grad_norm": 1.4000160694122314, + "learning_rate": 4.7050563270090704e-05, + "loss": 5.2055, + "step": 26262 + }, + { + "epoch": 0.15619350080883052, + "grad_norm": 1.4316319227218628, + "learning_rate": 4.705034316603356e-05, + "loss": 5.5257, + "step": 26263 + }, + { + "epoch": 0.15619944809211153, + "grad_norm": 1.4394290447235107, + "learning_rate": 4.705012305427887e-05, + "loss": 5.2702, + "step": 26264 + }, + { + "epoch": 0.15620539537539252, + "grad_norm": 2.0612921714782715, + "learning_rate": 4.704990293482672e-05, + "loss": 4.964, + "step": 26265 + }, + { + "epoch": 0.1562113426586735, + "grad_norm": 1.7573301792144775, + "learning_rate": 4.704968280767716e-05, + "loss": 5.1509, + "step": 26266 + }, + { + "epoch": 0.15621728994195452, + "grad_norm": 1.546891450881958, + "learning_rate": 4.70494626728303e-05, + "loss": 5.3226, + "step": 26267 + }, + { + "epoch": 0.1562232372252355, + "grad_norm": 1.672478437423706, + "learning_rate": 4.7049242530286195e-05, + "loss": 4.998, + "step": 26268 + }, + { + "epoch": 0.1562291845085165, + "grad_norm": 1.943877100944519, + "learning_rate": 4.704902238004492e-05, + "loss": 4.6489, + "step": 26269 + }, + { + "epoch": 0.15623513179179752, + "grad_norm": 2.779040813446045, + "learning_rate": 4.704880222210657e-05, + "loss": 3.8466, + "step": 26270 + }, + { + "epoch": 0.1562410790750785, + "grad_norm": 2.8241045475006104, + "learning_rate": 4.7048582056471205e-05, + "loss": 4.026, + "step": 26271 + }, + { + "epoch": 0.1562470263583595, + "grad_norm": 1.6769524812698364, + "learning_rate": 4.70483618831389e-05, + "loss": 4.6255, + "step": 26272 + }, + { + "epoch": 0.1562529736416405, + "grad_norm": 1.4940049648284912, + "learning_rate": 4.704814170210975e-05, + "loss": 4.7496, + "step": 26273 + }, + { + "epoch": 0.1562589209249215, + "grad_norm": 1.6519593000411987, + "learning_rate": 4.704792151338382e-05, + "loss": 4.7485, + "step": 26274 + }, + { + "epoch": 0.15626486820820248, + "grad_norm": 2.30234956741333, + "learning_rate": 4.704770131696119e-05, + "loss": 4.6089, + "step": 26275 + }, + { + "epoch": 0.1562708154914835, + "grad_norm": 1.6795179843902588, + "learning_rate": 4.704748111284193e-05, + "loss": 5.2412, + "step": 26276 + }, + { + "epoch": 0.1562767627747645, + "grad_norm": 2.194812536239624, + "learning_rate": 4.7047260901026124e-05, + "loss": 5.156, + "step": 26277 + }, + { + "epoch": 0.15628271005804548, + "grad_norm": 2.5557010173797607, + "learning_rate": 4.704704068151385e-05, + "loss": 4.5438, + "step": 26278 + }, + { + "epoch": 0.1562886573413265, + "grad_norm": 1.95830237865448, + "learning_rate": 4.704682045430518e-05, + "loss": 4.6183, + "step": 26279 + }, + { + "epoch": 0.15629460462460748, + "grad_norm": 2.1255557537078857, + "learning_rate": 4.704660021940019e-05, + "loss": 4.5619, + "step": 26280 + }, + { + "epoch": 0.15630055190788847, + "grad_norm": 1.6092948913574219, + "learning_rate": 4.704637997679896e-05, + "loss": 5.64, + "step": 26281 + }, + { + "epoch": 0.15630649919116948, + "grad_norm": 2.1546456813812256, + "learning_rate": 4.704615972650157e-05, + "loss": 4.9573, + "step": 26282 + }, + { + "epoch": 0.15631244647445047, + "grad_norm": 2.154639959335327, + "learning_rate": 4.7045939468508095e-05, + "loss": 4.4704, + "step": 26283 + }, + { + "epoch": 0.15631839375773146, + "grad_norm": 1.819509744644165, + "learning_rate": 4.7045719202818605e-05, + "loss": 4.6245, + "step": 26284 + }, + { + "epoch": 0.15632434104101248, + "grad_norm": 2.337667942047119, + "learning_rate": 4.704549892943318e-05, + "loss": 4.4268, + "step": 26285 + }, + { + "epoch": 0.15633028832429346, + "grad_norm": 2.308842658996582, + "learning_rate": 4.704527864835191e-05, + "loss": 4.7084, + "step": 26286 + }, + { + "epoch": 0.15633623560757445, + "grad_norm": 1.664182424545288, + "learning_rate": 4.704505835957486e-05, + "loss": 5.2576, + "step": 26287 + }, + { + "epoch": 0.15634218289085547, + "grad_norm": 1.7331715822219849, + "learning_rate": 4.7044838063102096e-05, + "loss": 5.3069, + "step": 26288 + }, + { + "epoch": 0.15634813017413646, + "grad_norm": 1.4833427667617798, + "learning_rate": 4.7044617758933714e-05, + "loss": 4.8484, + "step": 26289 + }, + { + "epoch": 0.15635407745741745, + "grad_norm": 2.975609064102173, + "learning_rate": 4.704439744706978e-05, + "loss": 5.5747, + "step": 26290 + }, + { + "epoch": 0.15636002474069846, + "grad_norm": 1.8256950378417969, + "learning_rate": 4.704417712751038e-05, + "loss": 5.2464, + "step": 26291 + }, + { + "epoch": 0.15636597202397945, + "grad_norm": 1.5019065141677856, + "learning_rate": 4.7043956800255585e-05, + "loss": 5.5261, + "step": 26292 + }, + { + "epoch": 0.15637191930726044, + "grad_norm": 1.4906537532806396, + "learning_rate": 4.7043736465305464e-05, + "loss": 5.38, + "step": 26293 + }, + { + "epoch": 0.15637786659054145, + "grad_norm": 1.601969599723816, + "learning_rate": 4.704351612266012e-05, + "loss": 5.2111, + "step": 26294 + }, + { + "epoch": 0.15638381387382244, + "grad_norm": 1.5806862115859985, + "learning_rate": 4.70432957723196e-05, + "loss": 5.5473, + "step": 26295 + }, + { + "epoch": 0.15638976115710343, + "grad_norm": 1.5971914529800415, + "learning_rate": 4.7043075414283986e-05, + "loss": 5.4841, + "step": 26296 + }, + { + "epoch": 0.15639570844038445, + "grad_norm": 1.6458126306533813, + "learning_rate": 4.704285504855337e-05, + "loss": 5.3215, + "step": 26297 + }, + { + "epoch": 0.15640165572366543, + "grad_norm": 1.5553637742996216, + "learning_rate": 4.704263467512782e-05, + "loss": 5.4461, + "step": 26298 + }, + { + "epoch": 0.15640760300694642, + "grad_norm": 1.447519063949585, + "learning_rate": 4.704241429400742e-05, + "loss": 5.3617, + "step": 26299 + }, + { + "epoch": 0.15641355029022744, + "grad_norm": 1.5533196926116943, + "learning_rate": 4.704219390519223e-05, + "loss": 4.8446, + "step": 26300 + }, + { + "epoch": 0.15641949757350843, + "grad_norm": 1.5320333242416382, + "learning_rate": 4.7041973508682344e-05, + "loss": 5.3333, + "step": 26301 + }, + { + "epoch": 0.15642544485678941, + "grad_norm": 1.6192045211791992, + "learning_rate": 4.704175310447784e-05, + "loss": 5.221, + "step": 26302 + }, + { + "epoch": 0.1564313921400704, + "grad_norm": 1.4964373111724854, + "learning_rate": 4.704153269257878e-05, + "loss": 5.3061, + "step": 26303 + }, + { + "epoch": 0.15643733942335142, + "grad_norm": 1.6173138618469238, + "learning_rate": 4.704131227298525e-05, + "loss": 5.3485, + "step": 26304 + }, + { + "epoch": 0.1564432867066324, + "grad_norm": 1.511825680732727, + "learning_rate": 4.704109184569733e-05, + "loss": 5.2024, + "step": 26305 + }, + { + "epoch": 0.1564492339899134, + "grad_norm": 1.5368350744247437, + "learning_rate": 4.704087141071508e-05, + "loss": 5.3867, + "step": 26306 + }, + { + "epoch": 0.1564551812731944, + "grad_norm": 1.612384557723999, + "learning_rate": 4.7040650968038605e-05, + "loss": 5.1923, + "step": 26307 + }, + { + "epoch": 0.1564611285564754, + "grad_norm": 1.5889664888381958, + "learning_rate": 4.704043051766795e-05, + "loss": 5.0457, + "step": 26308 + }, + { + "epoch": 0.1564670758397564, + "grad_norm": 1.5363719463348389, + "learning_rate": 4.704021005960322e-05, + "loss": 5.3852, + "step": 26309 + }, + { + "epoch": 0.1564730231230374, + "grad_norm": 1.5099613666534424, + "learning_rate": 4.703998959384447e-05, + "loss": 5.8659, + "step": 26310 + }, + { + "epoch": 0.1564789704063184, + "grad_norm": 1.5517312288284302, + "learning_rate": 4.70397691203918e-05, + "loss": 6.0298, + "step": 26311 + }, + { + "epoch": 0.15648491768959938, + "grad_norm": 1.616828441619873, + "learning_rate": 4.703954863924527e-05, + "loss": 4.8686, + "step": 26312 + }, + { + "epoch": 0.1564908649728804, + "grad_norm": 1.4939557313919067, + "learning_rate": 4.703932815040496e-05, + "loss": 5.3872, + "step": 26313 + }, + { + "epoch": 0.15649681225616138, + "grad_norm": 1.444994568824768, + "learning_rate": 4.7039107653870954e-05, + "loss": 5.38, + "step": 26314 + }, + { + "epoch": 0.15650275953944237, + "grad_norm": 1.7697070837020874, + "learning_rate": 4.7038887149643304e-05, + "loss": 5.6994, + "step": 26315 + }, + { + "epoch": 0.1565087068227234, + "grad_norm": 1.628763198852539, + "learning_rate": 4.703866663772213e-05, + "loss": 5.5986, + "step": 26316 + }, + { + "epoch": 0.15651465410600438, + "grad_norm": 1.5433357954025269, + "learning_rate": 4.703844611810747e-05, + "loss": 5.5968, + "step": 26317 + }, + { + "epoch": 0.15652060138928536, + "grad_norm": 1.452527403831482, + "learning_rate": 4.7038225590799424e-05, + "loss": 5.5669, + "step": 26318 + }, + { + "epoch": 0.15652654867256638, + "grad_norm": 1.6079583168029785, + "learning_rate": 4.703800505579806e-05, + "loss": 5.2624, + "step": 26319 + }, + { + "epoch": 0.15653249595584737, + "grad_norm": 1.4639090299606323, + "learning_rate": 4.703778451310345e-05, + "loss": 5.4219, + "step": 26320 + }, + { + "epoch": 0.15653844323912836, + "grad_norm": 1.7064789533615112, + "learning_rate": 4.703756396271568e-05, + "loss": 5.055, + "step": 26321 + }, + { + "epoch": 0.15654439052240937, + "grad_norm": 1.596901297569275, + "learning_rate": 4.7037343404634824e-05, + "loss": 6.4061, + "step": 26322 + }, + { + "epoch": 0.15655033780569036, + "grad_norm": 1.4072599411010742, + "learning_rate": 4.703712283886097e-05, + "loss": 5.4348, + "step": 26323 + }, + { + "epoch": 0.15655628508897135, + "grad_norm": 1.4027669429779053, + "learning_rate": 4.703690226539417e-05, + "loss": 5.285, + "step": 26324 + }, + { + "epoch": 0.15656223237225236, + "grad_norm": 1.3492887020111084, + "learning_rate": 4.703668168423452e-05, + "loss": 5.2334, + "step": 26325 + }, + { + "epoch": 0.15656817965553335, + "grad_norm": 1.5650583505630493, + "learning_rate": 4.703646109538209e-05, + "loss": 5.3706, + "step": 26326 + }, + { + "epoch": 0.15657412693881434, + "grad_norm": 1.549395203590393, + "learning_rate": 4.703624049883696e-05, + "loss": 5.3483, + "step": 26327 + }, + { + "epoch": 0.15658007422209536, + "grad_norm": 1.5657979249954224, + "learning_rate": 4.70360198945992e-05, + "loss": 5.2897, + "step": 26328 + }, + { + "epoch": 0.15658602150537634, + "grad_norm": 1.3859858512878418, + "learning_rate": 4.7035799282668906e-05, + "loss": 5.3292, + "step": 26329 + }, + { + "epoch": 0.15659196878865733, + "grad_norm": 1.8330230712890625, + "learning_rate": 4.7035578663046136e-05, + "loss": 5.6592, + "step": 26330 + }, + { + "epoch": 0.15659791607193835, + "grad_norm": 1.6347804069519043, + "learning_rate": 4.703535803573097e-05, + "loss": 5.5734, + "step": 26331 + }, + { + "epoch": 0.15660386335521934, + "grad_norm": 1.615646481513977, + "learning_rate": 4.7035137400723496e-05, + "loss": 5.8483, + "step": 26332 + }, + { + "epoch": 0.15660981063850032, + "grad_norm": 1.7376673221588135, + "learning_rate": 4.703491675802378e-05, + "loss": 5.327, + "step": 26333 + }, + { + "epoch": 0.15661575792178134, + "grad_norm": 2.2167186737060547, + "learning_rate": 4.70346961076319e-05, + "loss": 4.6295, + "step": 26334 + }, + { + "epoch": 0.15662170520506233, + "grad_norm": 1.8190215826034546, + "learning_rate": 4.703447544954794e-05, + "loss": 4.6977, + "step": 26335 + }, + { + "epoch": 0.15662765248834332, + "grad_norm": 1.8056445121765137, + "learning_rate": 4.703425478377197e-05, + "loss": 4.7828, + "step": 26336 + }, + { + "epoch": 0.15663359977162433, + "grad_norm": 1.3003071546554565, + "learning_rate": 4.7034034110304056e-05, + "loss": 5.3244, + "step": 26337 + }, + { + "epoch": 0.15663954705490532, + "grad_norm": 1.5494154691696167, + "learning_rate": 4.703381342914431e-05, + "loss": 5.2614, + "step": 26338 + }, + { + "epoch": 0.1566454943381863, + "grad_norm": 1.4443477392196655, + "learning_rate": 4.703359274029278e-05, + "loss": 5.6987, + "step": 26339 + }, + { + "epoch": 0.15665144162146732, + "grad_norm": 1.6877416372299194, + "learning_rate": 4.703337204374955e-05, + "loss": 5.0908, + "step": 26340 + }, + { + "epoch": 0.1566573889047483, + "grad_norm": 1.7778805494308472, + "learning_rate": 4.703315133951469e-05, + "loss": 5.067, + "step": 26341 + }, + { + "epoch": 0.1566633361880293, + "grad_norm": 1.8032246828079224, + "learning_rate": 4.703293062758829e-05, + "loss": 5.2325, + "step": 26342 + }, + { + "epoch": 0.15666928347131032, + "grad_norm": 1.6244032382965088, + "learning_rate": 4.703270990797042e-05, + "loss": 4.7988, + "step": 26343 + }, + { + "epoch": 0.1566752307545913, + "grad_norm": 2.212272882461548, + "learning_rate": 4.7032489180661154e-05, + "loss": 4.6136, + "step": 26344 + }, + { + "epoch": 0.1566811780378723, + "grad_norm": 1.4413294792175293, + "learning_rate": 4.703226844566059e-05, + "loss": 5.1378, + "step": 26345 + }, + { + "epoch": 0.1566871253211533, + "grad_norm": 1.7251073122024536, + "learning_rate": 4.703204770296877e-05, + "loss": 4.8629, + "step": 26346 + }, + { + "epoch": 0.1566930726044343, + "grad_norm": 1.8171210289001465, + "learning_rate": 4.70318269525858e-05, + "loss": 4.8487, + "step": 26347 + }, + { + "epoch": 0.15669901988771529, + "grad_norm": 1.7784240245819092, + "learning_rate": 4.703160619451175e-05, + "loss": 5.3187, + "step": 26348 + }, + { + "epoch": 0.1567049671709963, + "grad_norm": 1.7092580795288086, + "learning_rate": 4.703138542874669e-05, + "loss": 5.0771, + "step": 26349 + }, + { + "epoch": 0.1567109144542773, + "grad_norm": 1.4181660413742065, + "learning_rate": 4.7031164655290695e-05, + "loss": 5.3487, + "step": 26350 + }, + { + "epoch": 0.15671686173755828, + "grad_norm": 1.6292651891708374, + "learning_rate": 4.703094387414385e-05, + "loss": 5.2079, + "step": 26351 + }, + { + "epoch": 0.1567228090208393, + "grad_norm": 1.5617179870605469, + "learning_rate": 4.703072308530624e-05, + "loss": 5.3438, + "step": 26352 + }, + { + "epoch": 0.15672875630412028, + "grad_norm": 1.8505250215530396, + "learning_rate": 4.703050228877792e-05, + "loss": 5.223, + "step": 26353 + }, + { + "epoch": 0.15673470358740127, + "grad_norm": 1.2503677606582642, + "learning_rate": 4.7030281484558984e-05, + "loss": 4.7168, + "step": 26354 + }, + { + "epoch": 0.15674065087068229, + "grad_norm": 1.4453564882278442, + "learning_rate": 4.70300606726495e-05, + "loss": 5.3493, + "step": 26355 + }, + { + "epoch": 0.15674659815396327, + "grad_norm": 1.305949091911316, + "learning_rate": 4.702983985304956e-05, + "loss": 5.0599, + "step": 26356 + }, + { + "epoch": 0.15675254543724426, + "grad_norm": 2.160369634628296, + "learning_rate": 4.702961902575923e-05, + "loss": 4.2452, + "step": 26357 + }, + { + "epoch": 0.15675849272052528, + "grad_norm": 4.334263324737549, + "learning_rate": 4.7029398190778574e-05, + "loss": 2.7403, + "step": 26358 + }, + { + "epoch": 0.15676444000380627, + "grad_norm": 2.7898688316345215, + "learning_rate": 4.702917734810769e-05, + "loss": 2.7024, + "step": 26359 + }, + { + "epoch": 0.15677038728708725, + "grad_norm": 2.939950466156006, + "learning_rate": 4.702895649774665e-05, + "loss": 2.5659, + "step": 26360 + }, + { + "epoch": 0.15677633457036824, + "grad_norm": 2.2159571647644043, + "learning_rate": 4.702873563969553e-05, + "loss": 4.2729, + "step": 26361 + }, + { + "epoch": 0.15678228185364926, + "grad_norm": 1.4781655073165894, + "learning_rate": 4.7028514773954404e-05, + "loss": 4.7654, + "step": 26362 + }, + { + "epoch": 0.15678822913693025, + "grad_norm": 3.3153202533721924, + "learning_rate": 4.702829390052335e-05, + "loss": 4.055, + "step": 26363 + }, + { + "epoch": 0.15679417642021123, + "grad_norm": 4.366955757141113, + "learning_rate": 4.7028073019402446e-05, + "loss": 2.463, + "step": 26364 + }, + { + "epoch": 0.15680012370349225, + "grad_norm": 3.7748520374298096, + "learning_rate": 4.702785213059177e-05, + "loss": 2.8617, + "step": 26365 + }, + { + "epoch": 0.15680607098677324, + "grad_norm": 3.252652645111084, + "learning_rate": 4.7027631234091394e-05, + "loss": 2.8654, + "step": 26366 + }, + { + "epoch": 0.15681201827005423, + "grad_norm": 3.4591829776763916, + "learning_rate": 4.7027410329901414e-05, + "loss": 3.3268, + "step": 26367 + }, + { + "epoch": 0.15681796555333524, + "grad_norm": 2.971773624420166, + "learning_rate": 4.702718941802188e-05, + "loss": 2.835, + "step": 26368 + }, + { + "epoch": 0.15682391283661623, + "grad_norm": 2.8094983100891113, + "learning_rate": 4.7026968498452884e-05, + "loss": 3.5431, + "step": 26369 + }, + { + "epoch": 0.15682986011989722, + "grad_norm": 3.014570474624634, + "learning_rate": 4.7026747571194496e-05, + "loss": 3.2034, + "step": 26370 + }, + { + "epoch": 0.15683580740317823, + "grad_norm": 3.1913933753967285, + "learning_rate": 4.7026526636246805e-05, + "loss": 2.944, + "step": 26371 + }, + { + "epoch": 0.15684175468645922, + "grad_norm": 3.0981903076171875, + "learning_rate": 4.7026305693609884e-05, + "loss": 3.1399, + "step": 26372 + }, + { + "epoch": 0.1568477019697402, + "grad_norm": 2.7449357509613037, + "learning_rate": 4.70260847432838e-05, + "loss": 2.9713, + "step": 26373 + }, + { + "epoch": 0.15685364925302123, + "grad_norm": 2.5030126571655273, + "learning_rate": 4.7025863785268645e-05, + "loss": 4.1367, + "step": 26374 + }, + { + "epoch": 0.15685959653630221, + "grad_norm": 1.7585763931274414, + "learning_rate": 4.7025642819564476e-05, + "loss": 5.4266, + "step": 26375 + }, + { + "epoch": 0.1568655438195832, + "grad_norm": 1.6513370275497437, + "learning_rate": 4.702542184617139e-05, + "loss": 5.4329, + "step": 26376 + }, + { + "epoch": 0.15687149110286422, + "grad_norm": 1.381144404411316, + "learning_rate": 4.702520086508946e-05, + "loss": 5.2046, + "step": 26377 + }, + { + "epoch": 0.1568774383861452, + "grad_norm": 1.9510244131088257, + "learning_rate": 4.702497987631875e-05, + "loss": 5.365, + "step": 26378 + }, + { + "epoch": 0.1568833856694262, + "grad_norm": 2.6427478790283203, + "learning_rate": 4.702475887985936e-05, + "loss": 4.8551, + "step": 26379 + }, + { + "epoch": 0.1568893329527072, + "grad_norm": 1.9253584146499634, + "learning_rate": 4.702453787571135e-05, + "loss": 4.7738, + "step": 26380 + }, + { + "epoch": 0.1568952802359882, + "grad_norm": 1.9647809267044067, + "learning_rate": 4.7024316863874795e-05, + "loss": 5.0153, + "step": 26381 + }, + { + "epoch": 0.1569012275192692, + "grad_norm": 1.7858566045761108, + "learning_rate": 4.7024095844349786e-05, + "loss": 5.4806, + "step": 26382 + }, + { + "epoch": 0.1569071748025502, + "grad_norm": 1.5491056442260742, + "learning_rate": 4.7023874817136395e-05, + "loss": 5.1898, + "step": 26383 + }, + { + "epoch": 0.1569131220858312, + "grad_norm": 1.4932126998901367, + "learning_rate": 4.702365378223469e-05, + "loss": 5.3636, + "step": 26384 + }, + { + "epoch": 0.15691906936911218, + "grad_norm": 1.5436698198318481, + "learning_rate": 4.702343273964475e-05, + "loss": 5.2469, + "step": 26385 + }, + { + "epoch": 0.1569250166523932, + "grad_norm": 1.9735430479049683, + "learning_rate": 4.7023211689366666e-05, + "loss": 5.111, + "step": 26386 + }, + { + "epoch": 0.15693096393567418, + "grad_norm": 1.4643042087554932, + "learning_rate": 4.70229906314005e-05, + "loss": 4.9215, + "step": 26387 + }, + { + "epoch": 0.15693691121895517, + "grad_norm": 2.3229660987854004, + "learning_rate": 4.7022769565746345e-05, + "loss": 4.7726, + "step": 26388 + }, + { + "epoch": 0.1569428585022362, + "grad_norm": 4.978843688964844, + "learning_rate": 4.7022548492404264e-05, + "loss": 4.1208, + "step": 26389 + }, + { + "epoch": 0.15694880578551718, + "grad_norm": 4.040123462677002, + "learning_rate": 4.702232741137434e-05, + "loss": 4.6272, + "step": 26390 + }, + { + "epoch": 0.15695475306879816, + "grad_norm": 1.6977242231369019, + "learning_rate": 4.7022106322656643e-05, + "loss": 5.0605, + "step": 26391 + }, + { + "epoch": 0.15696070035207918, + "grad_norm": 2.055257558822632, + "learning_rate": 4.702188522625126e-05, + "loss": 4.9685, + "step": 26392 + }, + { + "epoch": 0.15696664763536017, + "grad_norm": 1.5921961069107056, + "learning_rate": 4.7021664122158264e-05, + "loss": 5.1433, + "step": 26393 + }, + { + "epoch": 0.15697259491864116, + "grad_norm": 1.5311743021011353, + "learning_rate": 4.7021443010377734e-05, + "loss": 5.2865, + "step": 26394 + }, + { + "epoch": 0.15697854220192217, + "grad_norm": 1.4683947563171387, + "learning_rate": 4.702122189090975e-05, + "loss": 5.2697, + "step": 26395 + }, + { + "epoch": 0.15698448948520316, + "grad_norm": 1.5425411462783813, + "learning_rate": 4.702100076375438e-05, + "loss": 5.5033, + "step": 26396 + }, + { + "epoch": 0.15699043676848415, + "grad_norm": 1.8671424388885498, + "learning_rate": 4.70207796289117e-05, + "loss": 4.544, + "step": 26397 + }, + { + "epoch": 0.15699638405176516, + "grad_norm": 2.107107400894165, + "learning_rate": 4.70205584863818e-05, + "loss": 4.2386, + "step": 26398 + }, + { + "epoch": 0.15700233133504615, + "grad_norm": 1.6025463342666626, + "learning_rate": 4.7020337336164746e-05, + "loss": 5.742, + "step": 26399 + }, + { + "epoch": 0.15700827861832714, + "grad_norm": 1.4157508611679077, + "learning_rate": 4.702011617826063e-05, + "loss": 6.2568, + "step": 26400 + }, + { + "epoch": 0.15701422590160816, + "grad_norm": 1.4367010593414307, + "learning_rate": 4.701989501266951e-05, + "loss": 6.0992, + "step": 26401 + }, + { + "epoch": 0.15702017318488914, + "grad_norm": 1.7271238565444946, + "learning_rate": 4.7019673839391476e-05, + "loss": 4.9925, + "step": 26402 + }, + { + "epoch": 0.15702612046817013, + "grad_norm": 1.4689936637878418, + "learning_rate": 4.70194526584266e-05, + "loss": 5.1224, + "step": 26403 + }, + { + "epoch": 0.15703206775145115, + "grad_norm": 1.816994071006775, + "learning_rate": 4.701923146977496e-05, + "loss": 4.5333, + "step": 26404 + }, + { + "epoch": 0.15703801503473214, + "grad_norm": 1.6789166927337646, + "learning_rate": 4.7019010273436634e-05, + "loss": 4.9303, + "step": 26405 + }, + { + "epoch": 0.15704396231801313, + "grad_norm": 1.8921838998794556, + "learning_rate": 4.70187890694117e-05, + "loss": 4.3924, + "step": 26406 + }, + { + "epoch": 0.15704990960129414, + "grad_norm": 2.397531270980835, + "learning_rate": 4.701856785770024e-05, + "loss": 3.317, + "step": 26407 + }, + { + "epoch": 0.15705585688457513, + "grad_norm": 2.1896491050720215, + "learning_rate": 4.7018346638302314e-05, + "loss": 4.2621, + "step": 26408 + }, + { + "epoch": 0.15706180416785612, + "grad_norm": 1.5073274374008179, + "learning_rate": 4.7018125411218014e-05, + "loss": 5.238, + "step": 26409 + }, + { + "epoch": 0.15706775145113713, + "grad_norm": 1.672512173652649, + "learning_rate": 4.701790417644741e-05, + "loss": 5.0822, + "step": 26410 + }, + { + "epoch": 0.15707369873441812, + "grad_norm": 1.6251648664474487, + "learning_rate": 4.701768293399059e-05, + "loss": 5.3444, + "step": 26411 + }, + { + "epoch": 0.1570796460176991, + "grad_norm": 1.8805150985717773, + "learning_rate": 4.701746168384763e-05, + "loss": 4.8765, + "step": 26412 + }, + { + "epoch": 0.15708559330098013, + "grad_norm": 1.7325724363327026, + "learning_rate": 4.701724042601859e-05, + "loss": 5.3281, + "step": 26413 + }, + { + "epoch": 0.1570915405842611, + "grad_norm": 1.5105476379394531, + "learning_rate": 4.701701916050357e-05, + "loss": 5.2577, + "step": 26414 + }, + { + "epoch": 0.1570974878675421, + "grad_norm": 1.766034722328186, + "learning_rate": 4.701679788730263e-05, + "loss": 4.8186, + "step": 26415 + }, + { + "epoch": 0.15710343515082312, + "grad_norm": 1.5909993648529053, + "learning_rate": 4.701657660641585e-05, + "loss": 4.9077, + "step": 26416 + }, + { + "epoch": 0.1571093824341041, + "grad_norm": 1.663878083229065, + "learning_rate": 4.7016355317843316e-05, + "loss": 5.3196, + "step": 26417 + }, + { + "epoch": 0.1571153297173851, + "grad_norm": 1.8101507425308228, + "learning_rate": 4.7016134021585095e-05, + "loss": 4.7219, + "step": 26418 + }, + { + "epoch": 0.15712127700066608, + "grad_norm": 1.3929054737091064, + "learning_rate": 4.7015912717641276e-05, + "loss": 5.169, + "step": 26419 + }, + { + "epoch": 0.1571272242839471, + "grad_norm": 1.6896204948425293, + "learning_rate": 4.701569140601192e-05, + "loss": 4.9141, + "step": 26420 + }, + { + "epoch": 0.15713317156722809, + "grad_norm": 2.3035976886749268, + "learning_rate": 4.7015470086697124e-05, + "loss": 4.4289, + "step": 26421 + }, + { + "epoch": 0.15713911885050907, + "grad_norm": 1.8286256790161133, + "learning_rate": 4.701524875969695e-05, + "loss": 4.7177, + "step": 26422 + }, + { + "epoch": 0.1571450661337901, + "grad_norm": 1.7254390716552734, + "learning_rate": 4.701502742501147e-05, + "loss": 3.99, + "step": 26423 + }, + { + "epoch": 0.15715101341707108, + "grad_norm": 1.6733616590499878, + "learning_rate": 4.701480608264078e-05, + "loss": 5.4146, + "step": 26424 + }, + { + "epoch": 0.15715696070035207, + "grad_norm": 2.167525291442871, + "learning_rate": 4.701458473258496e-05, + "loss": 5.751, + "step": 26425 + }, + { + "epoch": 0.15716290798363308, + "grad_norm": 1.5784038305282593, + "learning_rate": 4.7014363374844064e-05, + "loss": 5.2341, + "step": 26426 + }, + { + "epoch": 0.15716885526691407, + "grad_norm": 1.6087944507598877, + "learning_rate": 4.7014142009418176e-05, + "loss": 4.6644, + "step": 26427 + }, + { + "epoch": 0.15717480255019506, + "grad_norm": 2.1396427154541016, + "learning_rate": 4.701392063630739e-05, + "loss": 4.7034, + "step": 26428 + }, + { + "epoch": 0.15718074983347607, + "grad_norm": 2.069359540939331, + "learning_rate": 4.701369925551177e-05, + "loss": 4.1612, + "step": 26429 + }, + { + "epoch": 0.15718669711675706, + "grad_norm": 2.0008041858673096, + "learning_rate": 4.7013477867031385e-05, + "loss": 4.3536, + "step": 26430 + }, + { + "epoch": 0.15719264440003805, + "grad_norm": 1.9997189044952393, + "learning_rate": 4.701325647086633e-05, + "loss": 4.4613, + "step": 26431 + }, + { + "epoch": 0.15719859168331907, + "grad_norm": 1.625603437423706, + "learning_rate": 4.701303506701667e-05, + "loss": 4.63, + "step": 26432 + }, + { + "epoch": 0.15720453896660005, + "grad_norm": 1.5895150899887085, + "learning_rate": 4.701281365548249e-05, + "loss": 4.884, + "step": 26433 + }, + { + "epoch": 0.15721048624988104, + "grad_norm": 1.6569048166275024, + "learning_rate": 4.7012592236263865e-05, + "loss": 4.5834, + "step": 26434 + }, + { + "epoch": 0.15721643353316206, + "grad_norm": 1.9942916631698608, + "learning_rate": 4.7012370809360874e-05, + "loss": 4.8536, + "step": 26435 + }, + { + "epoch": 0.15722238081644305, + "grad_norm": 1.7535972595214844, + "learning_rate": 4.701214937477359e-05, + "loss": 4.9008, + "step": 26436 + }, + { + "epoch": 0.15722832809972404, + "grad_norm": 1.9767074584960938, + "learning_rate": 4.7011927932502085e-05, + "loss": 5.4972, + "step": 26437 + }, + { + "epoch": 0.15723427538300505, + "grad_norm": 1.6117023229599, + "learning_rate": 4.701170648254645e-05, + "loss": 5.2583, + "step": 26438 + }, + { + "epoch": 0.15724022266628604, + "grad_norm": 1.6277034282684326, + "learning_rate": 4.7011485024906754e-05, + "loss": 5.0635, + "step": 26439 + }, + { + "epoch": 0.15724616994956703, + "grad_norm": 1.5075265169143677, + "learning_rate": 4.701126355958308e-05, + "loss": 5.2974, + "step": 26440 + }, + { + "epoch": 0.15725211723284804, + "grad_norm": 1.377233862876892, + "learning_rate": 4.70110420865755e-05, + "loss": 5.0643, + "step": 26441 + }, + { + "epoch": 0.15725806451612903, + "grad_norm": 1.5468838214874268, + "learning_rate": 4.7010820605884085e-05, + "loss": 5.0746, + "step": 26442 + }, + { + "epoch": 0.15726401179941002, + "grad_norm": 1.864901065826416, + "learning_rate": 4.701059911750893e-05, + "loss": 5.0492, + "step": 26443 + }, + { + "epoch": 0.15726995908269104, + "grad_norm": 2.086214542388916, + "learning_rate": 4.70103776214501e-05, + "loss": 4.8566, + "step": 26444 + }, + { + "epoch": 0.15727590636597202, + "grad_norm": 1.571226716041565, + "learning_rate": 4.701015611770767e-05, + "loss": 4.7567, + "step": 26445 + }, + { + "epoch": 0.157281853649253, + "grad_norm": 2.299607753753662, + "learning_rate": 4.7009934606281726e-05, + "loss": 4.8576, + "step": 26446 + }, + { + "epoch": 0.15728780093253403, + "grad_norm": 2.019814968109131, + "learning_rate": 4.7009713087172335e-05, + "loss": 4.6524, + "step": 26447 + }, + { + "epoch": 0.15729374821581502, + "grad_norm": 1.8718371391296387, + "learning_rate": 4.700949156037959e-05, + "loss": 4.6629, + "step": 26448 + }, + { + "epoch": 0.157299695499096, + "grad_norm": 1.9023678302764893, + "learning_rate": 4.700927002590355e-05, + "loss": 4.8558, + "step": 26449 + }, + { + "epoch": 0.15730564278237702, + "grad_norm": 1.8519774675369263, + "learning_rate": 4.700904848374431e-05, + "loss": 4.8498, + "step": 26450 + }, + { + "epoch": 0.157311590065658, + "grad_norm": 2.1003715991973877, + "learning_rate": 4.7008826933901937e-05, + "loss": 4.9443, + "step": 26451 + }, + { + "epoch": 0.157317537348939, + "grad_norm": 1.8350003957748413, + "learning_rate": 4.7008605376376504e-05, + "loss": 4.9194, + "step": 26452 + }, + { + "epoch": 0.15732348463222, + "grad_norm": 1.9740381240844727, + "learning_rate": 4.70083838111681e-05, + "loss": 5.035, + "step": 26453 + }, + { + "epoch": 0.157329431915501, + "grad_norm": 1.8660650253295898, + "learning_rate": 4.700816223827679e-05, + "loss": 4.7712, + "step": 26454 + }, + { + "epoch": 0.157335379198782, + "grad_norm": 2.6117658615112305, + "learning_rate": 4.700794065770266e-05, + "loss": 4.0286, + "step": 26455 + }, + { + "epoch": 0.157341326482063, + "grad_norm": 2.0968191623687744, + "learning_rate": 4.700771906944579e-05, + "loss": 4.505, + "step": 26456 + }, + { + "epoch": 0.157347273765344, + "grad_norm": 2.0062074661254883, + "learning_rate": 4.700749747350624e-05, + "loss": 4.806, + "step": 26457 + }, + { + "epoch": 0.15735322104862498, + "grad_norm": 1.8398696184158325, + "learning_rate": 4.700727586988412e-05, + "loss": 4.799, + "step": 26458 + }, + { + "epoch": 0.157359168331906, + "grad_norm": 1.8096837997436523, + "learning_rate": 4.7007054258579474e-05, + "loss": 5.0503, + "step": 26459 + }, + { + "epoch": 0.15736511561518698, + "grad_norm": 1.735893726348877, + "learning_rate": 4.7006832639592396e-05, + "loss": 5.037, + "step": 26460 + }, + { + "epoch": 0.15737106289846797, + "grad_norm": 1.9189250469207764, + "learning_rate": 4.7006611012922966e-05, + "loss": 5.3352, + "step": 26461 + }, + { + "epoch": 0.157377010181749, + "grad_norm": 2.387317657470703, + "learning_rate": 4.7006389378571246e-05, + "loss": 4.055, + "step": 26462 + }, + { + "epoch": 0.15738295746502998, + "grad_norm": 2.414651870727539, + "learning_rate": 4.7006167736537323e-05, + "loss": 3.7756, + "step": 26463 + }, + { + "epoch": 0.15738890474831096, + "grad_norm": 2.497237205505371, + "learning_rate": 4.700594608682127e-05, + "loss": 3.7823, + "step": 26464 + }, + { + "epoch": 0.15739485203159198, + "grad_norm": 2.2141029834747314, + "learning_rate": 4.700572442942318e-05, + "loss": 4.1131, + "step": 26465 + }, + { + "epoch": 0.15740079931487297, + "grad_norm": 1.8615038394927979, + "learning_rate": 4.700550276434312e-05, + "loss": 4.8686, + "step": 26466 + }, + { + "epoch": 0.15740674659815396, + "grad_norm": 1.7082819938659668, + "learning_rate": 4.700528109158115e-05, + "loss": 5.2237, + "step": 26467 + }, + { + "epoch": 0.15741269388143497, + "grad_norm": 1.8039544820785522, + "learning_rate": 4.700505941113739e-05, + "loss": 4.5243, + "step": 26468 + }, + { + "epoch": 0.15741864116471596, + "grad_norm": 1.874585509300232, + "learning_rate": 4.700483772301187e-05, + "loss": 4.7674, + "step": 26469 + }, + { + "epoch": 0.15742458844799695, + "grad_norm": 2.083904266357422, + "learning_rate": 4.70046160272047e-05, + "loss": 4.8949, + "step": 26470 + }, + { + "epoch": 0.15743053573127797, + "grad_norm": 1.3937793970108032, + "learning_rate": 4.700439432371593e-05, + "loss": 5.6113, + "step": 26471 + }, + { + "epoch": 0.15743648301455895, + "grad_norm": 1.924481987953186, + "learning_rate": 4.700417261254567e-05, + "loss": 5.1439, + "step": 26472 + }, + { + "epoch": 0.15744243029783994, + "grad_norm": 1.6527281999588013, + "learning_rate": 4.700395089369397e-05, + "loss": 5.6962, + "step": 26473 + }, + { + "epoch": 0.15744837758112096, + "grad_norm": 1.5053030252456665, + "learning_rate": 4.700372916716093e-05, + "loss": 4.7299, + "step": 26474 + }, + { + "epoch": 0.15745432486440195, + "grad_norm": 1.2048367261886597, + "learning_rate": 4.7003507432946604e-05, + "loss": 5.5429, + "step": 26475 + }, + { + "epoch": 0.15746027214768293, + "grad_norm": 1.3451159000396729, + "learning_rate": 4.700328569105108e-05, + "loss": 5.5326, + "step": 26476 + }, + { + "epoch": 0.15746621943096392, + "grad_norm": 1.4441956281661987, + "learning_rate": 4.700306394147445e-05, + "loss": 5.5795, + "step": 26477 + }, + { + "epoch": 0.15747216671424494, + "grad_norm": 1.5551849603652954, + "learning_rate": 4.700284218421676e-05, + "loss": 5.2977, + "step": 26478 + }, + { + "epoch": 0.15747811399752593, + "grad_norm": 1.713437795639038, + "learning_rate": 4.7002620419278115e-05, + "loss": 5.242, + "step": 26479 + }, + { + "epoch": 0.15748406128080691, + "grad_norm": 1.4137530326843262, + "learning_rate": 4.7002398646658586e-05, + "loss": 5.2396, + "step": 26480 + }, + { + "epoch": 0.15749000856408793, + "grad_norm": 1.846640706062317, + "learning_rate": 4.700217686635824e-05, + "loss": 4.926, + "step": 26481 + }, + { + "epoch": 0.15749595584736892, + "grad_norm": 2.2699780464172363, + "learning_rate": 4.7001955078377156e-05, + "loss": 3.8352, + "step": 26482 + }, + { + "epoch": 0.1575019031306499, + "grad_norm": 1.959821105003357, + "learning_rate": 4.700173328271543e-05, + "loss": 4.7261, + "step": 26483 + }, + { + "epoch": 0.15750785041393092, + "grad_norm": 1.5478743314743042, + "learning_rate": 4.700151147937312e-05, + "loss": 5.463, + "step": 26484 + }, + { + "epoch": 0.1575137976972119, + "grad_norm": 1.835830807685852, + "learning_rate": 4.7001289668350314e-05, + "loss": 4.9938, + "step": 26485 + }, + { + "epoch": 0.1575197449804929, + "grad_norm": 2.1762354373931885, + "learning_rate": 4.700106784964708e-05, + "loss": 4.0548, + "step": 26486 + }, + { + "epoch": 0.15752569226377391, + "grad_norm": 1.8922265768051147, + "learning_rate": 4.70008460232635e-05, + "loss": 4.1947, + "step": 26487 + }, + { + "epoch": 0.1575316395470549, + "grad_norm": 1.6450932025909424, + "learning_rate": 4.7000624189199646e-05, + "loss": 5.014, + "step": 26488 + }, + { + "epoch": 0.1575375868303359, + "grad_norm": 1.5196298360824585, + "learning_rate": 4.7000402347455616e-05, + "loss": 5.332, + "step": 26489 + }, + { + "epoch": 0.1575435341136169, + "grad_norm": 1.665044903755188, + "learning_rate": 4.700018049803146e-05, + "loss": 4.992, + "step": 26490 + }, + { + "epoch": 0.1575494813968979, + "grad_norm": 1.4281147718429565, + "learning_rate": 4.6999958640927275e-05, + "loss": 4.9014, + "step": 26491 + }, + { + "epoch": 0.15755542868017888, + "grad_norm": 1.4559162855148315, + "learning_rate": 4.6999736776143135e-05, + "loss": 4.9361, + "step": 26492 + }, + { + "epoch": 0.1575613759634599, + "grad_norm": 1.7235175371170044, + "learning_rate": 4.699951490367911e-05, + "loss": 5.2429, + "step": 26493 + }, + { + "epoch": 0.1575673232467409, + "grad_norm": 1.5422228574752808, + "learning_rate": 4.699929302353528e-05, + "loss": 5.5294, + "step": 26494 + }, + { + "epoch": 0.15757327053002188, + "grad_norm": 1.6905406713485718, + "learning_rate": 4.699907113571173e-05, + "loss": 5.0958, + "step": 26495 + }, + { + "epoch": 0.1575792178133029, + "grad_norm": 1.8692830801010132, + "learning_rate": 4.699884924020853e-05, + "loss": 4.7711, + "step": 26496 + }, + { + "epoch": 0.15758516509658388, + "grad_norm": 1.7128182649612427, + "learning_rate": 4.699862733702575e-05, + "loss": 5.344, + "step": 26497 + }, + { + "epoch": 0.15759111237986487, + "grad_norm": 1.7795850038528442, + "learning_rate": 4.6998405426163486e-05, + "loss": 5.044, + "step": 26498 + }, + { + "epoch": 0.15759705966314588, + "grad_norm": 1.8591927289962769, + "learning_rate": 4.6998183507621804e-05, + "loss": 5.7269, + "step": 26499 + }, + { + "epoch": 0.15760300694642687, + "grad_norm": 1.7289692163467407, + "learning_rate": 4.6997961581400785e-05, + "loss": 5.295, + "step": 26500 + }, + { + "epoch": 0.15760895422970786, + "grad_norm": 2.03056001663208, + "learning_rate": 4.699773964750049e-05, + "loss": 4.9402, + "step": 26501 + }, + { + "epoch": 0.15761490151298888, + "grad_norm": 1.7518073320388794, + "learning_rate": 4.699751770592104e-05, + "loss": 4.8934, + "step": 26502 + }, + { + "epoch": 0.15762084879626986, + "grad_norm": 1.7724835872650146, + "learning_rate": 4.6997295756662465e-05, + "loss": 4.6237, + "step": 26503 + }, + { + "epoch": 0.15762679607955085, + "grad_norm": 1.475229263305664, + "learning_rate": 4.699707379972485e-05, + "loss": 5.2655, + "step": 26504 + }, + { + "epoch": 0.15763274336283187, + "grad_norm": 1.4267539978027344, + "learning_rate": 4.69968518351083e-05, + "loss": 5.2016, + "step": 26505 + }, + { + "epoch": 0.15763869064611286, + "grad_norm": 2.1211252212524414, + "learning_rate": 4.699662986281288e-05, + "loss": 4.1632, + "step": 26506 + }, + { + "epoch": 0.15764463792939384, + "grad_norm": 2.0549299716949463, + "learning_rate": 4.699640788283866e-05, + "loss": 4.0886, + "step": 26507 + }, + { + "epoch": 0.15765058521267486, + "grad_norm": 2.210500717163086, + "learning_rate": 4.699618589518572e-05, + "loss": 4.3042, + "step": 26508 + }, + { + "epoch": 0.15765653249595585, + "grad_norm": 2.2884981632232666, + "learning_rate": 4.699596389985413e-05, + "loss": 4.178, + "step": 26509 + }, + { + "epoch": 0.15766247977923684, + "grad_norm": 2.24526047706604, + "learning_rate": 4.699574189684399e-05, + "loss": 4.2319, + "step": 26510 + }, + { + "epoch": 0.15766842706251785, + "grad_norm": 2.401103973388672, + "learning_rate": 4.699551988615535e-05, + "loss": 4.1215, + "step": 26511 + }, + { + "epoch": 0.15767437434579884, + "grad_norm": 2.3012118339538574, + "learning_rate": 4.699529786778831e-05, + "loss": 4.3254, + "step": 26512 + }, + { + "epoch": 0.15768032162907983, + "grad_norm": 1.963396668434143, + "learning_rate": 4.699507584174294e-05, + "loss": 4.4707, + "step": 26513 + }, + { + "epoch": 0.15768626891236084, + "grad_norm": 2.3375425338745117, + "learning_rate": 4.699485380801931e-05, + "loss": 4.2861, + "step": 26514 + }, + { + "epoch": 0.15769221619564183, + "grad_norm": 2.189077377319336, + "learning_rate": 4.699463176661751e-05, + "loss": 4.3273, + "step": 26515 + }, + { + "epoch": 0.15769816347892282, + "grad_norm": 1.8198938369750977, + "learning_rate": 4.699440971753761e-05, + "loss": 4.6847, + "step": 26516 + }, + { + "epoch": 0.15770411076220384, + "grad_norm": 1.646579623222351, + "learning_rate": 4.699418766077969e-05, + "loss": 5.126, + "step": 26517 + }, + { + "epoch": 0.15771005804548482, + "grad_norm": 2.0718090534210205, + "learning_rate": 4.6993965596343825e-05, + "loss": 4.5059, + "step": 26518 + }, + { + "epoch": 0.1577160053287658, + "grad_norm": 1.6022831201553345, + "learning_rate": 4.699374352423009e-05, + "loss": 5.5119, + "step": 26519 + }, + { + "epoch": 0.15772195261204683, + "grad_norm": 1.3838839530944824, + "learning_rate": 4.699352144443857e-05, + "loss": 5.0512, + "step": 26520 + }, + { + "epoch": 0.15772789989532782, + "grad_norm": 1.3122941255569458, + "learning_rate": 4.699329935696934e-05, + "loss": 5.1832, + "step": 26521 + }, + { + "epoch": 0.1577338471786088, + "grad_norm": 1.6332945823669434, + "learning_rate": 4.699307726182247e-05, + "loss": 5.081, + "step": 26522 + }, + { + "epoch": 0.15773979446188982, + "grad_norm": 1.5045149326324463, + "learning_rate": 4.699285515899805e-05, + "loss": 5.2076, + "step": 26523 + }, + { + "epoch": 0.1577457417451708, + "grad_norm": 1.4530036449432373, + "learning_rate": 4.699263304849615e-05, + "loss": 5.3623, + "step": 26524 + }, + { + "epoch": 0.1577516890284518, + "grad_norm": 1.6600695848464966, + "learning_rate": 4.699241093031685e-05, + "loss": 5.5862, + "step": 26525 + }, + { + "epoch": 0.1577576363117328, + "grad_norm": 1.6276617050170898, + "learning_rate": 4.6992188804460225e-05, + "loss": 5.282, + "step": 26526 + }, + { + "epoch": 0.1577635835950138, + "grad_norm": 1.7213892936706543, + "learning_rate": 4.6991966670926355e-05, + "loss": 5.4613, + "step": 26527 + }, + { + "epoch": 0.1577695308782948, + "grad_norm": 1.63749361038208, + "learning_rate": 4.6991744529715316e-05, + "loss": 5.4498, + "step": 26528 + }, + { + "epoch": 0.1577754781615758, + "grad_norm": 1.5182081460952759, + "learning_rate": 4.6991522380827184e-05, + "loss": 5.3962, + "step": 26529 + }, + { + "epoch": 0.1577814254448568, + "grad_norm": 1.6695536375045776, + "learning_rate": 4.699130022426204e-05, + "loss": 5.1221, + "step": 26530 + }, + { + "epoch": 0.15778737272813778, + "grad_norm": 1.4350519180297852, + "learning_rate": 4.6991078060019966e-05, + "loss": 5.319, + "step": 26531 + }, + { + "epoch": 0.1577933200114188, + "grad_norm": 1.2092465162277222, + "learning_rate": 4.699085588810103e-05, + "loss": 5.4316, + "step": 26532 + }, + { + "epoch": 0.15779926729469979, + "grad_norm": 1.474252700805664, + "learning_rate": 4.6990633708505304e-05, + "loss": 5.6559, + "step": 26533 + }, + { + "epoch": 0.15780521457798077, + "grad_norm": 1.6271101236343384, + "learning_rate": 4.699041152123289e-05, + "loss": 5.7491, + "step": 26534 + }, + { + "epoch": 0.1578111618612618, + "grad_norm": 1.6184288263320923, + "learning_rate": 4.699018932628384e-05, + "loss": 5.3195, + "step": 26535 + }, + { + "epoch": 0.15781710914454278, + "grad_norm": 1.3626726865768433, + "learning_rate": 4.698996712365825e-05, + "loss": 5.2913, + "step": 26536 + }, + { + "epoch": 0.15782305642782377, + "grad_norm": 2.3408188819885254, + "learning_rate": 4.6989744913356185e-05, + "loss": 4.774, + "step": 26537 + }, + { + "epoch": 0.15782900371110475, + "grad_norm": 1.500992774963379, + "learning_rate": 4.698952269537773e-05, + "loss": 5.5717, + "step": 26538 + }, + { + "epoch": 0.15783495099438577, + "grad_norm": 1.393517017364502, + "learning_rate": 4.6989300469722955e-05, + "loss": 6.1478, + "step": 26539 + }, + { + "epoch": 0.15784089827766676, + "grad_norm": 1.6048024892807007, + "learning_rate": 4.698907823639195e-05, + "loss": 5.5076, + "step": 26540 + }, + { + "epoch": 0.15784684556094775, + "grad_norm": 1.7231130599975586, + "learning_rate": 4.698885599538478e-05, + "loss": 5.1799, + "step": 26541 + }, + { + "epoch": 0.15785279284422876, + "grad_norm": 1.4809112548828125, + "learning_rate": 4.6988633746701525e-05, + "loss": 5.146, + "step": 26542 + }, + { + "epoch": 0.15785874012750975, + "grad_norm": 1.6530802249908447, + "learning_rate": 4.6988411490342266e-05, + "loss": 5.3245, + "step": 26543 + }, + { + "epoch": 0.15786468741079074, + "grad_norm": 1.5264098644256592, + "learning_rate": 4.6988189226307087e-05, + "loss": 5.3715, + "step": 26544 + }, + { + "epoch": 0.15787063469407175, + "grad_norm": 1.3241318464279175, + "learning_rate": 4.6987966954596054e-05, + "loss": 5.387, + "step": 26545 + }, + { + "epoch": 0.15787658197735274, + "grad_norm": 1.6130857467651367, + "learning_rate": 4.698774467520924e-05, + "loss": 5.2902, + "step": 26546 + }, + { + "epoch": 0.15788252926063373, + "grad_norm": 1.4999042749404907, + "learning_rate": 4.698752238814674e-05, + "loss": 5.2129, + "step": 26547 + }, + { + "epoch": 0.15788847654391475, + "grad_norm": 1.4773963689804077, + "learning_rate": 4.698730009340863e-05, + "loss": 5.7722, + "step": 26548 + }, + { + "epoch": 0.15789442382719573, + "grad_norm": 1.666413426399231, + "learning_rate": 4.698707779099497e-05, + "loss": 5.7418, + "step": 26549 + }, + { + "epoch": 0.15790037111047672, + "grad_norm": 1.4869890213012695, + "learning_rate": 4.698685548090585e-05, + "loss": 4.8418, + "step": 26550 + }, + { + "epoch": 0.15790631839375774, + "grad_norm": 1.6295100450515747, + "learning_rate": 4.698663316314135e-05, + "loss": 4.7722, + "step": 26551 + }, + { + "epoch": 0.15791226567703873, + "grad_norm": 1.5449434518814087, + "learning_rate": 4.698641083770154e-05, + "loss": 5.0621, + "step": 26552 + }, + { + "epoch": 0.15791821296031971, + "grad_norm": 1.6735725402832031, + "learning_rate": 4.6986188504586507e-05, + "loss": 5.5605, + "step": 26553 + }, + { + "epoch": 0.15792416024360073, + "grad_norm": 1.6270878314971924, + "learning_rate": 4.698596616379631e-05, + "loss": 5.279, + "step": 26554 + }, + { + "epoch": 0.15793010752688172, + "grad_norm": 1.6335285902023315, + "learning_rate": 4.698574381533105e-05, + "loss": 5.398, + "step": 26555 + }, + { + "epoch": 0.1579360548101627, + "grad_norm": 2.2176520824432373, + "learning_rate": 4.698552145919079e-05, + "loss": 4.9806, + "step": 26556 + }, + { + "epoch": 0.15794200209344372, + "grad_norm": 1.8645645380020142, + "learning_rate": 4.6985299095375615e-05, + "loss": 5.2633, + "step": 26557 + }, + { + "epoch": 0.1579479493767247, + "grad_norm": 1.708526372909546, + "learning_rate": 4.698507672388559e-05, + "loss": 5.0308, + "step": 26558 + }, + { + "epoch": 0.1579538966600057, + "grad_norm": 2.148980140686035, + "learning_rate": 4.698485434472081e-05, + "loss": 4.5213, + "step": 26559 + }, + { + "epoch": 0.15795984394328672, + "grad_norm": 2.402442693710327, + "learning_rate": 4.6984631957881346e-05, + "loss": 4.4377, + "step": 26560 + }, + { + "epoch": 0.1579657912265677, + "grad_norm": 2.298003911972046, + "learning_rate": 4.698440956336727e-05, + "loss": 4.5809, + "step": 26561 + }, + { + "epoch": 0.1579717385098487, + "grad_norm": 2.53639554977417, + "learning_rate": 4.698418716117867e-05, + "loss": 4.1869, + "step": 26562 + }, + { + "epoch": 0.1579776857931297, + "grad_norm": 2.0686380863189697, + "learning_rate": 4.698396475131561e-05, + "loss": 4.413, + "step": 26563 + }, + { + "epoch": 0.1579836330764107, + "grad_norm": 1.8968595266342163, + "learning_rate": 4.698374233377818e-05, + "loss": 4.9939, + "step": 26564 + }, + { + "epoch": 0.15798958035969168, + "grad_norm": 1.8896044492721558, + "learning_rate": 4.698351990856645e-05, + "loss": 4.6383, + "step": 26565 + }, + { + "epoch": 0.1579955276429727, + "grad_norm": 1.7179672718048096, + "learning_rate": 4.6983297475680496e-05, + "loss": 5.5635, + "step": 26566 + }, + { + "epoch": 0.1580014749262537, + "grad_norm": 1.6506478786468506, + "learning_rate": 4.6983075035120404e-05, + "loss": 5.1821, + "step": 26567 + }, + { + "epoch": 0.15800742220953468, + "grad_norm": 2.180238723754883, + "learning_rate": 4.698285258688625e-05, + "loss": 4.1298, + "step": 26568 + }, + { + "epoch": 0.1580133694928157, + "grad_norm": 2.208676338195801, + "learning_rate": 4.698263013097811e-05, + "loss": 4.3238, + "step": 26569 + }, + { + "epoch": 0.15801931677609668, + "grad_norm": 1.694823145866394, + "learning_rate": 4.6982407667396055e-05, + "loss": 5.3418, + "step": 26570 + }, + { + "epoch": 0.15802526405937767, + "grad_norm": 1.7310692071914673, + "learning_rate": 4.6982185196140174e-05, + "loss": 5.4066, + "step": 26571 + }, + { + "epoch": 0.15803121134265868, + "grad_norm": 2.302055597305298, + "learning_rate": 4.698196271721054e-05, + "loss": 4.1817, + "step": 26572 + }, + { + "epoch": 0.15803715862593967, + "grad_norm": 1.872363567352295, + "learning_rate": 4.698174023060722e-05, + "loss": 4.6733, + "step": 26573 + }, + { + "epoch": 0.15804310590922066, + "grad_norm": 2.134537696838379, + "learning_rate": 4.698151773633032e-05, + "loss": 4.3211, + "step": 26574 + }, + { + "epoch": 0.15804905319250168, + "grad_norm": 2.4381020069122314, + "learning_rate": 4.698129523437989e-05, + "loss": 4.2212, + "step": 26575 + }, + { + "epoch": 0.15805500047578266, + "grad_norm": 1.6739851236343384, + "learning_rate": 4.6981072724756e-05, + "loss": 5.3057, + "step": 26576 + }, + { + "epoch": 0.15806094775906365, + "grad_norm": 1.8092267513275146, + "learning_rate": 4.6980850207458765e-05, + "loss": 4.7359, + "step": 26577 + }, + { + "epoch": 0.15806689504234467, + "grad_norm": 1.6420230865478516, + "learning_rate": 4.6980627682488235e-05, + "loss": 5.086, + "step": 26578 + }, + { + "epoch": 0.15807284232562566, + "grad_norm": 1.8741960525512695, + "learning_rate": 4.6980405149844494e-05, + "loss": 4.7842, + "step": 26579 + }, + { + "epoch": 0.15807878960890664, + "grad_norm": 2.6539900302886963, + "learning_rate": 4.698018260952763e-05, + "loss": 3.809, + "step": 26580 + }, + { + "epoch": 0.15808473689218766, + "grad_norm": 1.8262064456939697, + "learning_rate": 4.69799600615377e-05, + "loss": 4.8959, + "step": 26581 + }, + { + "epoch": 0.15809068417546865, + "grad_norm": 1.7090948820114136, + "learning_rate": 4.6979737505874796e-05, + "loss": 4.7723, + "step": 26582 + }, + { + "epoch": 0.15809663145874964, + "grad_norm": 1.5634857416152954, + "learning_rate": 4.6979514942539e-05, + "loss": 4.7533, + "step": 26583 + }, + { + "epoch": 0.15810257874203065, + "grad_norm": 1.6470197439193726, + "learning_rate": 4.697929237153037e-05, + "loss": 5.2194, + "step": 26584 + }, + { + "epoch": 0.15810852602531164, + "grad_norm": 2.060804605484009, + "learning_rate": 4.697906979284901e-05, + "loss": 4.3637, + "step": 26585 + }, + { + "epoch": 0.15811447330859263, + "grad_norm": 2.065943717956543, + "learning_rate": 4.697884720649498e-05, + "loss": 4.8908, + "step": 26586 + }, + { + "epoch": 0.15812042059187364, + "grad_norm": 1.5104914903640747, + "learning_rate": 4.697862461246836e-05, + "loss": 5.7029, + "step": 26587 + }, + { + "epoch": 0.15812636787515463, + "grad_norm": 1.593296766281128, + "learning_rate": 4.697840201076922e-05, + "loss": 5.7005, + "step": 26588 + }, + { + "epoch": 0.15813231515843562, + "grad_norm": 1.6516765356063843, + "learning_rate": 4.697817940139766e-05, + "loss": 5.3843, + "step": 26589 + }, + { + "epoch": 0.15813826244171664, + "grad_norm": 1.3671473264694214, + "learning_rate": 4.697795678435374e-05, + "loss": 5.4862, + "step": 26590 + }, + { + "epoch": 0.15814420972499763, + "grad_norm": 1.4163672924041748, + "learning_rate": 4.697773415963754e-05, + "loss": 5.4793, + "step": 26591 + }, + { + "epoch": 0.1581501570082786, + "grad_norm": 1.5477086305618286, + "learning_rate": 4.697751152724914e-05, + "loss": 5.2835, + "step": 26592 + }, + { + "epoch": 0.15815610429155963, + "grad_norm": 1.6029425859451294, + "learning_rate": 4.697728888718862e-05, + "loss": 5.3689, + "step": 26593 + }, + { + "epoch": 0.15816205157484062, + "grad_norm": 1.5130633115768433, + "learning_rate": 4.697706623945605e-05, + "loss": 6.1627, + "step": 26594 + }, + { + "epoch": 0.1581679988581216, + "grad_norm": 1.5171791315078735, + "learning_rate": 4.697684358405152e-05, + "loss": 4.9849, + "step": 26595 + }, + { + "epoch": 0.1581739461414026, + "grad_norm": 1.449781894683838, + "learning_rate": 4.69766209209751e-05, + "loss": 5.5273, + "step": 26596 + }, + { + "epoch": 0.1581798934246836, + "grad_norm": 1.430094838142395, + "learning_rate": 4.697639825022687e-05, + "loss": 5.6825, + "step": 26597 + }, + { + "epoch": 0.1581858407079646, + "grad_norm": 1.2635716199874878, + "learning_rate": 4.69761755718069e-05, + "loss": 5.2177, + "step": 26598 + }, + { + "epoch": 0.15819178799124559, + "grad_norm": 2.20355224609375, + "learning_rate": 4.697595288571528e-05, + "loss": 4.6664, + "step": 26599 + }, + { + "epoch": 0.1581977352745266, + "grad_norm": 1.586509108543396, + "learning_rate": 4.6975730191952086e-05, + "loss": 5.056, + "step": 26600 + }, + { + "epoch": 0.1582036825578076, + "grad_norm": 1.4773000478744507, + "learning_rate": 4.697550749051738e-05, + "loss": 5.2931, + "step": 26601 + }, + { + "epoch": 0.15820962984108858, + "grad_norm": 1.4557143449783325, + "learning_rate": 4.697528478141125e-05, + "loss": 4.9378, + "step": 26602 + }, + { + "epoch": 0.1582155771243696, + "grad_norm": 1.5859819650650024, + "learning_rate": 4.697506206463379e-05, + "loss": 5.1998, + "step": 26603 + }, + { + "epoch": 0.15822152440765058, + "grad_norm": 1.5068250894546509, + "learning_rate": 4.697483934018505e-05, + "loss": 5.2748, + "step": 26604 + }, + { + "epoch": 0.15822747169093157, + "grad_norm": 1.5842232704162598, + "learning_rate": 4.697461660806513e-05, + "loss": 5.326, + "step": 26605 + }, + { + "epoch": 0.1582334189742126, + "grad_norm": 1.5164762735366821, + "learning_rate": 4.697439386827409e-05, + "loss": 5.2282, + "step": 26606 + }, + { + "epoch": 0.15823936625749357, + "grad_norm": 1.5359309911727905, + "learning_rate": 4.697417112081203e-05, + "loss": 5.3723, + "step": 26607 + }, + { + "epoch": 0.15824531354077456, + "grad_norm": 1.560502529144287, + "learning_rate": 4.6973948365678996e-05, + "loss": 5.0822, + "step": 26608 + }, + { + "epoch": 0.15825126082405558, + "grad_norm": 1.5915874242782593, + "learning_rate": 4.69737256028751e-05, + "loss": 5.2849, + "step": 26609 + }, + { + "epoch": 0.15825720810733657, + "grad_norm": 1.613585352897644, + "learning_rate": 4.697350283240039e-05, + "loss": 5.1898, + "step": 26610 + }, + { + "epoch": 0.15826315539061755, + "grad_norm": 1.5696673393249512, + "learning_rate": 4.6973280054254966e-05, + "loss": 5.2518, + "step": 26611 + }, + { + "epoch": 0.15826910267389857, + "grad_norm": 1.2109240293502808, + "learning_rate": 4.697305726843889e-05, + "loss": 5.4032, + "step": 26612 + }, + { + "epoch": 0.15827504995717956, + "grad_norm": 1.47042715549469, + "learning_rate": 4.697283447495225e-05, + "loss": 5.1456, + "step": 26613 + }, + { + "epoch": 0.15828099724046055, + "grad_norm": 1.3937478065490723, + "learning_rate": 4.697261167379512e-05, + "loss": 5.3592, + "step": 26614 + }, + { + "epoch": 0.15828694452374156, + "grad_norm": 1.6204369068145752, + "learning_rate": 4.6972388864967574e-05, + "loss": 5.2882, + "step": 26615 + }, + { + "epoch": 0.15829289180702255, + "grad_norm": 1.654252290725708, + "learning_rate": 4.69721660484697e-05, + "loss": 5.2655, + "step": 26616 + }, + { + "epoch": 0.15829883909030354, + "grad_norm": 1.583075761795044, + "learning_rate": 4.6971943224301576e-05, + "loss": 5.097, + "step": 26617 + }, + { + "epoch": 0.15830478637358456, + "grad_norm": 1.3745534420013428, + "learning_rate": 4.697172039246326e-05, + "loss": 5.1911, + "step": 26618 + }, + { + "epoch": 0.15831073365686554, + "grad_norm": 1.662632703781128, + "learning_rate": 4.697149755295485e-05, + "loss": 4.9032, + "step": 26619 + }, + { + "epoch": 0.15831668094014653, + "grad_norm": 1.3548792600631714, + "learning_rate": 4.697127470577642e-05, + "loss": 5.3656, + "step": 26620 + }, + { + "epoch": 0.15832262822342755, + "grad_norm": 1.2697865962982178, + "learning_rate": 4.697105185092804e-05, + "loss": 5.2743, + "step": 26621 + }, + { + "epoch": 0.15832857550670854, + "grad_norm": 1.424477458000183, + "learning_rate": 4.69708289884098e-05, + "loss": 5.1278, + "step": 26622 + }, + { + "epoch": 0.15833452278998952, + "grad_norm": 1.5525426864624023, + "learning_rate": 4.697060611822176e-05, + "loss": 5.2804, + "step": 26623 + }, + { + "epoch": 0.15834047007327054, + "grad_norm": 1.5966732501983643, + "learning_rate": 4.697038324036401e-05, + "loss": 5.3546, + "step": 26624 + }, + { + "epoch": 0.15834641735655153, + "grad_norm": 1.4296703338623047, + "learning_rate": 4.6970160354836634e-05, + "loss": 5.1681, + "step": 26625 + }, + { + "epoch": 0.15835236463983252, + "grad_norm": 1.5928189754486084, + "learning_rate": 4.69699374616397e-05, + "loss": 5.2565, + "step": 26626 + }, + { + "epoch": 0.15835831192311353, + "grad_norm": 1.437814712524414, + "learning_rate": 4.696971456077328e-05, + "loss": 5.1813, + "step": 26627 + }, + { + "epoch": 0.15836425920639452, + "grad_norm": 1.4782744646072388, + "learning_rate": 4.696949165223747e-05, + "loss": 5.365, + "step": 26628 + }, + { + "epoch": 0.1583702064896755, + "grad_norm": 1.5123037099838257, + "learning_rate": 4.696926873603233e-05, + "loss": 5.255, + "step": 26629 + }, + { + "epoch": 0.15837615377295652, + "grad_norm": 1.4208122491836548, + "learning_rate": 4.696904581215795e-05, + "loss": 5.0531, + "step": 26630 + }, + { + "epoch": 0.1583821010562375, + "grad_norm": 1.4333672523498535, + "learning_rate": 4.69688228806144e-05, + "loss": 5.1035, + "step": 26631 + }, + { + "epoch": 0.1583880483395185, + "grad_norm": 1.3645392656326294, + "learning_rate": 4.696859994140176e-05, + "loss": 5.0107, + "step": 26632 + }, + { + "epoch": 0.15839399562279952, + "grad_norm": 1.6100040674209595, + "learning_rate": 4.6968376994520116e-05, + "loss": 5.054, + "step": 26633 + }, + { + "epoch": 0.1583999429060805, + "grad_norm": 1.431036353111267, + "learning_rate": 4.696815403996953e-05, + "loss": 5.4406, + "step": 26634 + }, + { + "epoch": 0.1584058901893615, + "grad_norm": 1.6785353422164917, + "learning_rate": 4.6967931077750096e-05, + "loss": 5.7861, + "step": 26635 + }, + { + "epoch": 0.1584118374726425, + "grad_norm": 1.549333095550537, + "learning_rate": 4.6967708107861876e-05, + "loss": 5.6662, + "step": 26636 + }, + { + "epoch": 0.1584177847559235, + "grad_norm": 1.5669690370559692, + "learning_rate": 4.696748513030496e-05, + "loss": 5.3213, + "step": 26637 + }, + { + "epoch": 0.15842373203920448, + "grad_norm": 1.6420881748199463, + "learning_rate": 4.696726214507942e-05, + "loss": 5.2381, + "step": 26638 + }, + { + "epoch": 0.1584296793224855, + "grad_norm": 1.811171293258667, + "learning_rate": 4.6967039152185345e-05, + "loss": 5.3656, + "step": 26639 + }, + { + "epoch": 0.1584356266057665, + "grad_norm": 1.7578849792480469, + "learning_rate": 4.696681615162279e-05, + "loss": 4.8774, + "step": 26640 + }, + { + "epoch": 0.15844157388904748, + "grad_norm": 2.0880799293518066, + "learning_rate": 4.696659314339185e-05, + "loss": 4.945, + "step": 26641 + }, + { + "epoch": 0.1584475211723285, + "grad_norm": 1.4735814332962036, + "learning_rate": 4.6966370127492603e-05, + "loss": 5.5415, + "step": 26642 + }, + { + "epoch": 0.15845346845560948, + "grad_norm": 1.7141392230987549, + "learning_rate": 4.696614710392512e-05, + "loss": 4.8197, + "step": 26643 + }, + { + "epoch": 0.15845941573889047, + "grad_norm": 1.9631140232086182, + "learning_rate": 4.696592407268949e-05, + "loss": 5.0147, + "step": 26644 + }, + { + "epoch": 0.15846536302217148, + "grad_norm": 2.1569128036499023, + "learning_rate": 4.696570103378577e-05, + "loss": 4.8175, + "step": 26645 + }, + { + "epoch": 0.15847131030545247, + "grad_norm": 2.07602596282959, + "learning_rate": 4.696547798721406e-05, + "loss": 5.0289, + "step": 26646 + }, + { + "epoch": 0.15847725758873346, + "grad_norm": 1.5406705141067505, + "learning_rate": 4.696525493297443e-05, + "loss": 5.1569, + "step": 26647 + }, + { + "epoch": 0.15848320487201448, + "grad_norm": 1.630928635597229, + "learning_rate": 4.696503187106695e-05, + "loss": 5.4698, + "step": 26648 + }, + { + "epoch": 0.15848915215529547, + "grad_norm": 1.5992403030395508, + "learning_rate": 4.696480880149171e-05, + "loss": 5.296, + "step": 26649 + }, + { + "epoch": 0.15849509943857645, + "grad_norm": 1.8908748626708984, + "learning_rate": 4.696458572424878e-05, + "loss": 5.0706, + "step": 26650 + }, + { + "epoch": 0.15850104672185747, + "grad_norm": 2.149810552597046, + "learning_rate": 4.6964362639338236e-05, + "loss": 4.8138, + "step": 26651 + }, + { + "epoch": 0.15850699400513846, + "grad_norm": 2.049520254135132, + "learning_rate": 4.696413954676016e-05, + "loss": 4.9173, + "step": 26652 + }, + { + "epoch": 0.15851294128841945, + "grad_norm": 2.03076434135437, + "learning_rate": 4.6963916446514634e-05, + "loss": 4.889, + "step": 26653 + }, + { + "epoch": 0.15851888857170043, + "grad_norm": 1.8261823654174805, + "learning_rate": 4.696369333860173e-05, + "loss": 4.7856, + "step": 26654 + }, + { + "epoch": 0.15852483585498145, + "grad_norm": 1.864707112312317, + "learning_rate": 4.6963470223021535e-05, + "loss": 4.8419, + "step": 26655 + }, + { + "epoch": 0.15853078313826244, + "grad_norm": 1.9796535968780518, + "learning_rate": 4.696324709977411e-05, + "loss": 4.7506, + "step": 26656 + }, + { + "epoch": 0.15853673042154343, + "grad_norm": 1.9936281442642212, + "learning_rate": 4.696302396885954e-05, + "loss": 4.8773, + "step": 26657 + }, + { + "epoch": 0.15854267770482444, + "grad_norm": 1.790238618850708, + "learning_rate": 4.696280083027791e-05, + "loss": 4.7998, + "step": 26658 + }, + { + "epoch": 0.15854862498810543, + "grad_norm": 1.9320149421691895, + "learning_rate": 4.69625776840293e-05, + "loss": 4.7113, + "step": 26659 + }, + { + "epoch": 0.15855457227138642, + "grad_norm": 1.6032037734985352, + "learning_rate": 4.696235453011377e-05, + "loss": 5.2848, + "step": 26660 + }, + { + "epoch": 0.15856051955466743, + "grad_norm": 1.8947795629501343, + "learning_rate": 4.696213136853141e-05, + "loss": 4.7212, + "step": 26661 + }, + { + "epoch": 0.15856646683794842, + "grad_norm": 2.017988681793213, + "learning_rate": 4.69619081992823e-05, + "loss": 4.8043, + "step": 26662 + }, + { + "epoch": 0.1585724141212294, + "grad_norm": 2.114877223968506, + "learning_rate": 4.696168502236652e-05, + "loss": 4.6626, + "step": 26663 + }, + { + "epoch": 0.15857836140451043, + "grad_norm": 2.029026985168457, + "learning_rate": 4.6961461837784134e-05, + "loss": 4.5315, + "step": 26664 + }, + { + "epoch": 0.15858430868779141, + "grad_norm": 2.052255630493164, + "learning_rate": 4.696123864553523e-05, + "loss": 4.6957, + "step": 26665 + }, + { + "epoch": 0.1585902559710724, + "grad_norm": 1.9599274396896362, + "learning_rate": 4.696101544561989e-05, + "loss": 4.792, + "step": 26666 + }, + { + "epoch": 0.15859620325435342, + "grad_norm": 2.1609420776367188, + "learning_rate": 4.6960792238038184e-05, + "loss": 4.8083, + "step": 26667 + }, + { + "epoch": 0.1586021505376344, + "grad_norm": 2.0834262371063232, + "learning_rate": 4.696056902279019e-05, + "loss": 4.7683, + "step": 26668 + }, + { + "epoch": 0.1586080978209154, + "grad_norm": 2.0544068813323975, + "learning_rate": 4.6960345799875995e-05, + "loss": 4.7, + "step": 26669 + }, + { + "epoch": 0.1586140451041964, + "grad_norm": 2.036548137664795, + "learning_rate": 4.696012256929566e-05, + "loss": 4.5653, + "step": 26670 + }, + { + "epoch": 0.1586199923874774, + "grad_norm": 1.7801802158355713, + "learning_rate": 4.6959899331049276e-05, + "loss": 4.7215, + "step": 26671 + }, + { + "epoch": 0.1586259396707584, + "grad_norm": 2.0025057792663574, + "learning_rate": 4.695967608513692e-05, + "loss": 4.6259, + "step": 26672 + }, + { + "epoch": 0.1586318869540394, + "grad_norm": 2.0719566345214844, + "learning_rate": 4.695945283155867e-05, + "loss": 4.7383, + "step": 26673 + }, + { + "epoch": 0.1586378342373204, + "grad_norm": 2.0565052032470703, + "learning_rate": 4.69592295703146e-05, + "loss": 5.2066, + "step": 26674 + }, + { + "epoch": 0.15864378152060138, + "grad_norm": 1.7758921384811401, + "learning_rate": 4.695900630140479e-05, + "loss": 5.5619, + "step": 26675 + }, + { + "epoch": 0.1586497288038824, + "grad_norm": 1.799654483795166, + "learning_rate": 4.695878302482931e-05, + "loss": 5.0901, + "step": 26676 + }, + { + "epoch": 0.15865567608716338, + "grad_norm": 1.785900592803955, + "learning_rate": 4.695855974058826e-05, + "loss": 4.9323, + "step": 26677 + }, + { + "epoch": 0.15866162337044437, + "grad_norm": 1.9525444507598877, + "learning_rate": 4.695833644868169e-05, + "loss": 4.7603, + "step": 26678 + }, + { + "epoch": 0.1586675706537254, + "grad_norm": 1.9197458028793335, + "learning_rate": 4.69581131491097e-05, + "loss": 4.636, + "step": 26679 + }, + { + "epoch": 0.15867351793700638, + "grad_norm": 2.3043594360351562, + "learning_rate": 4.695788984187236e-05, + "loss": 4.4529, + "step": 26680 + }, + { + "epoch": 0.15867946522028736, + "grad_norm": 1.687930703163147, + "learning_rate": 4.6957666526969744e-05, + "loss": 4.9549, + "step": 26681 + }, + { + "epoch": 0.15868541250356838, + "grad_norm": 1.5754574537277222, + "learning_rate": 4.6957443204401935e-05, + "loss": 5.4364, + "step": 26682 + }, + { + "epoch": 0.15869135978684937, + "grad_norm": 1.5300992727279663, + "learning_rate": 4.6957219874169013e-05, + "loss": 5.3151, + "step": 26683 + }, + { + "epoch": 0.15869730707013036, + "grad_norm": 1.7758506536483765, + "learning_rate": 4.695699653627105e-05, + "loss": 5.2053, + "step": 26684 + }, + { + "epoch": 0.15870325435341137, + "grad_norm": 1.5882158279418945, + "learning_rate": 4.6956773190708116e-05, + "loss": 4.8202, + "step": 26685 + }, + { + "epoch": 0.15870920163669236, + "grad_norm": 1.5649267435073853, + "learning_rate": 4.695654983748031e-05, + "loss": 4.3946, + "step": 26686 + }, + { + "epoch": 0.15871514891997335, + "grad_norm": 1.5999925136566162, + "learning_rate": 4.6956326476587696e-05, + "loss": 4.3512, + "step": 26687 + }, + { + "epoch": 0.15872109620325436, + "grad_norm": 1.699987530708313, + "learning_rate": 4.6956103108030356e-05, + "loss": 4.7479, + "step": 26688 + }, + { + "epoch": 0.15872704348653535, + "grad_norm": 1.4755208492279053, + "learning_rate": 4.695587973180837e-05, + "loss": 5.1206, + "step": 26689 + }, + { + "epoch": 0.15873299076981634, + "grad_norm": 1.7642509937286377, + "learning_rate": 4.6955656347921813e-05, + "loss": 5.3179, + "step": 26690 + }, + { + "epoch": 0.15873893805309736, + "grad_norm": 1.5511635541915894, + "learning_rate": 4.695543295637076e-05, + "loss": 4.4365, + "step": 26691 + }, + { + "epoch": 0.15874488533637834, + "grad_norm": 1.5347273349761963, + "learning_rate": 4.6955209557155286e-05, + "loss": 4.368, + "step": 26692 + }, + { + "epoch": 0.15875083261965933, + "grad_norm": 1.5347685813903809, + "learning_rate": 4.695498615027549e-05, + "loss": 4.2812, + "step": 26693 + }, + { + "epoch": 0.15875677990294035, + "grad_norm": 1.5469902753829956, + "learning_rate": 4.6954762735731425e-05, + "loss": 4.4445, + "step": 26694 + }, + { + "epoch": 0.15876272718622134, + "grad_norm": 1.4887003898620605, + "learning_rate": 4.695453931352318e-05, + "loss": 4.3584, + "step": 26695 + }, + { + "epoch": 0.15876867446950232, + "grad_norm": 1.5207375288009644, + "learning_rate": 4.695431588365084e-05, + "loss": 4.7219, + "step": 26696 + }, + { + "epoch": 0.15877462175278334, + "grad_norm": 1.7801141738891602, + "learning_rate": 4.695409244611447e-05, + "loss": 5.0328, + "step": 26697 + }, + { + "epoch": 0.15878056903606433, + "grad_norm": 1.8171552419662476, + "learning_rate": 4.695386900091415e-05, + "loss": 5.4286, + "step": 26698 + }, + { + "epoch": 0.15878651631934532, + "grad_norm": 1.698379397392273, + "learning_rate": 4.695364554804996e-05, + "loss": 4.7824, + "step": 26699 + }, + { + "epoch": 0.15879246360262633, + "grad_norm": 1.5824103355407715, + "learning_rate": 4.695342208752199e-05, + "loss": 4.2949, + "step": 26700 + }, + { + "epoch": 0.15879841088590732, + "grad_norm": 1.5890088081359863, + "learning_rate": 4.6953198619330295e-05, + "loss": 4.3689, + "step": 26701 + }, + { + "epoch": 0.1588043581691883, + "grad_norm": 1.6158654689788818, + "learning_rate": 4.6952975143474975e-05, + "loss": 4.7294, + "step": 26702 + }, + { + "epoch": 0.15881030545246932, + "grad_norm": 1.5613304376602173, + "learning_rate": 4.695275165995609e-05, + "loss": 5.4067, + "step": 26703 + }, + { + "epoch": 0.1588162527357503, + "grad_norm": 1.5085046291351318, + "learning_rate": 4.695252816877373e-05, + "loss": 5.4355, + "step": 26704 + }, + { + "epoch": 0.1588222000190313, + "grad_norm": 1.6180028915405273, + "learning_rate": 4.695230466992797e-05, + "loss": 5.0541, + "step": 26705 + }, + { + "epoch": 0.15882814730231232, + "grad_norm": 1.8564766645431519, + "learning_rate": 4.695208116341888e-05, + "loss": 5.3307, + "step": 26706 + }, + { + "epoch": 0.1588340945855933, + "grad_norm": 1.762041449546814, + "learning_rate": 4.6951857649246555e-05, + "loss": 5.2526, + "step": 26707 + }, + { + "epoch": 0.1588400418688743, + "grad_norm": 1.5610746145248413, + "learning_rate": 4.695163412741106e-05, + "loss": 5.0561, + "step": 26708 + }, + { + "epoch": 0.1588459891521553, + "grad_norm": 1.6463086605072021, + "learning_rate": 4.695141059791247e-05, + "loss": 4.9357, + "step": 26709 + }, + { + "epoch": 0.1588519364354363, + "grad_norm": 1.794967532157898, + "learning_rate": 4.695118706075088e-05, + "loss": 5.4786, + "step": 26710 + }, + { + "epoch": 0.15885788371871729, + "grad_norm": 1.6720161437988281, + "learning_rate": 4.695096351592635e-05, + "loss": 5.4702, + "step": 26711 + }, + { + "epoch": 0.15886383100199827, + "grad_norm": 1.6844518184661865, + "learning_rate": 4.6950739963438975e-05, + "loss": 5.2407, + "step": 26712 + }, + { + "epoch": 0.1588697782852793, + "grad_norm": 1.7027579545974731, + "learning_rate": 4.695051640328881e-05, + "loss": 5.1027, + "step": 26713 + }, + { + "epoch": 0.15887572556856028, + "grad_norm": 2.385157823562622, + "learning_rate": 4.695029283547595e-05, + "loss": 4.6013, + "step": 26714 + }, + { + "epoch": 0.15888167285184127, + "grad_norm": 1.7393914461135864, + "learning_rate": 4.6950069260000475e-05, + "loss": 4.9536, + "step": 26715 + }, + { + "epoch": 0.15888762013512228, + "grad_norm": 1.5079336166381836, + "learning_rate": 4.694984567686246e-05, + "loss": 4.9043, + "step": 26716 + }, + { + "epoch": 0.15889356741840327, + "grad_norm": 1.3347656726837158, + "learning_rate": 4.694962208606197e-05, + "loss": 5.6922, + "step": 26717 + }, + { + "epoch": 0.15889951470168426, + "grad_norm": 1.8166699409484863, + "learning_rate": 4.6949398487599106e-05, + "loss": 5.3646, + "step": 26718 + }, + { + "epoch": 0.15890546198496527, + "grad_norm": 1.7105693817138672, + "learning_rate": 4.694917488147392e-05, + "loss": 5.3915, + "step": 26719 + }, + { + "epoch": 0.15891140926824626, + "grad_norm": 2.5474836826324463, + "learning_rate": 4.6948951267686514e-05, + "loss": 4.7226, + "step": 26720 + }, + { + "epoch": 0.15891735655152725, + "grad_norm": 2.544551372528076, + "learning_rate": 4.694872764623696e-05, + "loss": 4.5184, + "step": 26721 + }, + { + "epoch": 0.15892330383480827, + "grad_norm": 1.6088052988052368, + "learning_rate": 4.6948504017125316e-05, + "loss": 4.913, + "step": 26722 + }, + { + "epoch": 0.15892925111808925, + "grad_norm": 2.0992431640625, + "learning_rate": 4.6948280380351686e-05, + "loss": 4.7943, + "step": 26723 + }, + { + "epoch": 0.15893519840137024, + "grad_norm": 2.4200751781463623, + "learning_rate": 4.6948056735916135e-05, + "loss": 4.6055, + "step": 26724 + }, + { + "epoch": 0.15894114568465126, + "grad_norm": 1.802924633026123, + "learning_rate": 4.694783308381875e-05, + "loss": 4.2752, + "step": 26725 + }, + { + "epoch": 0.15894709296793225, + "grad_norm": 1.8056386709213257, + "learning_rate": 4.694760942405959e-05, + "loss": 5.0256, + "step": 26726 + }, + { + "epoch": 0.15895304025121323, + "grad_norm": 1.5216751098632812, + "learning_rate": 4.694738575663876e-05, + "loss": 4.9218, + "step": 26727 + }, + { + "epoch": 0.15895898753449425, + "grad_norm": 2.3782224655151367, + "learning_rate": 4.694716208155632e-05, + "loss": 4.7504, + "step": 26728 + }, + { + "epoch": 0.15896493481777524, + "grad_norm": 2.0227694511413574, + "learning_rate": 4.694693839881236e-05, + "loss": 4.5376, + "step": 26729 + }, + { + "epoch": 0.15897088210105623, + "grad_norm": 2.289433240890503, + "learning_rate": 4.694671470840693e-05, + "loss": 4.4428, + "step": 26730 + }, + { + "epoch": 0.15897682938433724, + "grad_norm": 2.2303051948547363, + "learning_rate": 4.694649101034015e-05, + "loss": 4.326, + "step": 26731 + }, + { + "epoch": 0.15898277666761823, + "grad_norm": 1.6835930347442627, + "learning_rate": 4.6946267304612067e-05, + "loss": 4.7231, + "step": 26732 + }, + { + "epoch": 0.15898872395089922, + "grad_norm": 1.6131420135498047, + "learning_rate": 4.694604359122277e-05, + "loss": 5.5532, + "step": 26733 + }, + { + "epoch": 0.15899467123418023, + "grad_norm": 1.4710865020751953, + "learning_rate": 4.6945819870172335e-05, + "loss": 5.3341, + "step": 26734 + }, + { + "epoch": 0.15900061851746122, + "grad_norm": 1.5708924531936646, + "learning_rate": 4.694559614146085e-05, + "loss": 4.9195, + "step": 26735 + }, + { + "epoch": 0.1590065658007422, + "grad_norm": 1.5540367364883423, + "learning_rate": 4.6945372405088374e-05, + "loss": 5.2529, + "step": 26736 + }, + { + "epoch": 0.15901251308402323, + "grad_norm": 1.8328397274017334, + "learning_rate": 4.6945148661054995e-05, + "loss": 5.0446, + "step": 26737 + }, + { + "epoch": 0.15901846036730422, + "grad_norm": 1.9213111400604248, + "learning_rate": 4.694492490936079e-05, + "loss": 4.505, + "step": 26738 + }, + { + "epoch": 0.1590244076505852, + "grad_norm": 1.6417537927627563, + "learning_rate": 4.694470115000584e-05, + "loss": 5.064, + "step": 26739 + }, + { + "epoch": 0.15903035493386622, + "grad_norm": 1.4690046310424805, + "learning_rate": 4.6944477382990224e-05, + "loss": 5.1935, + "step": 26740 + }, + { + "epoch": 0.1590363022171472, + "grad_norm": 1.6286424398422241, + "learning_rate": 4.694425360831402e-05, + "loss": 4.8251, + "step": 26741 + }, + { + "epoch": 0.1590422495004282, + "grad_norm": 1.6581510305404663, + "learning_rate": 4.6944029825977296e-05, + "loss": 4.9166, + "step": 26742 + }, + { + "epoch": 0.1590481967837092, + "grad_norm": 1.4425448179244995, + "learning_rate": 4.694380603598015e-05, + "loss": 4.9857, + "step": 26743 + }, + { + "epoch": 0.1590541440669902, + "grad_norm": 1.6443709135055542, + "learning_rate": 4.694358223832263e-05, + "loss": 4.3642, + "step": 26744 + }, + { + "epoch": 0.1590600913502712, + "grad_norm": 1.8886995315551758, + "learning_rate": 4.6943358433004856e-05, + "loss": 4.2237, + "step": 26745 + }, + { + "epoch": 0.1590660386335522, + "grad_norm": 1.779401421546936, + "learning_rate": 4.6943134620026865e-05, + "loss": 3.8314, + "step": 26746 + }, + { + "epoch": 0.1590719859168332, + "grad_norm": 1.9053362607955933, + "learning_rate": 4.6942910799388755e-05, + "loss": 4.0761, + "step": 26747 + }, + { + "epoch": 0.15907793320011418, + "grad_norm": 1.7256511449813843, + "learning_rate": 4.694268697109061e-05, + "loss": 5.4427, + "step": 26748 + }, + { + "epoch": 0.1590838804833952, + "grad_norm": 1.7450202703475952, + "learning_rate": 4.6942463135132484e-05, + "loss": 4.535, + "step": 26749 + }, + { + "epoch": 0.15908982776667618, + "grad_norm": 1.4825485944747925, + "learning_rate": 4.6942239291514486e-05, + "loss": 4.4373, + "step": 26750 + }, + { + "epoch": 0.15909577504995717, + "grad_norm": 1.5326381921768188, + "learning_rate": 4.6942015440236673e-05, + "loss": 4.3876, + "step": 26751 + }, + { + "epoch": 0.1591017223332382, + "grad_norm": 1.7042746543884277, + "learning_rate": 4.694179158129913e-05, + "loss": 4.6091, + "step": 26752 + }, + { + "epoch": 0.15910766961651918, + "grad_norm": 1.584315299987793, + "learning_rate": 4.6941567714701926e-05, + "loss": 4.5937, + "step": 26753 + }, + { + "epoch": 0.15911361689980016, + "grad_norm": 1.5627310276031494, + "learning_rate": 4.694134384044516e-05, + "loss": 4.719, + "step": 26754 + }, + { + "epoch": 0.15911956418308118, + "grad_norm": 1.726309061050415, + "learning_rate": 4.694111995852889e-05, + "loss": 4.8064, + "step": 26755 + }, + { + "epoch": 0.15912551146636217, + "grad_norm": 1.6186972856521606, + "learning_rate": 4.6940896068953204e-05, + "loss": 5.215, + "step": 26756 + }, + { + "epoch": 0.15913145874964316, + "grad_norm": 1.7018485069274902, + "learning_rate": 4.694067217171818e-05, + "loss": 5.1681, + "step": 26757 + }, + { + "epoch": 0.15913740603292417, + "grad_norm": 1.8986917734146118, + "learning_rate": 4.694044826682389e-05, + "loss": 5.1551, + "step": 26758 + }, + { + "epoch": 0.15914335331620516, + "grad_norm": 1.6398760080337524, + "learning_rate": 4.694022435427042e-05, + "loss": 4.8223, + "step": 26759 + }, + { + "epoch": 0.15914930059948615, + "grad_norm": 1.5714781284332275, + "learning_rate": 4.694000043405784e-05, + "loss": 4.6631, + "step": 26760 + }, + { + "epoch": 0.15915524788276716, + "grad_norm": 1.9300872087478638, + "learning_rate": 4.6939776506186234e-05, + "loss": 4.8107, + "step": 26761 + }, + { + "epoch": 0.15916119516604815, + "grad_norm": 1.8684272766113281, + "learning_rate": 4.6939552570655674e-05, + "loss": 4.9762, + "step": 26762 + }, + { + "epoch": 0.15916714244932914, + "grad_norm": 1.6835062503814697, + "learning_rate": 4.693932862746625e-05, + "loss": 4.8015, + "step": 26763 + }, + { + "epoch": 0.15917308973261016, + "grad_norm": 1.5635250806808472, + "learning_rate": 4.693910467661803e-05, + "loss": 4.7763, + "step": 26764 + }, + { + "epoch": 0.15917903701589114, + "grad_norm": 1.584123134613037, + "learning_rate": 4.69388807181111e-05, + "loss": 4.7093, + "step": 26765 + }, + { + "epoch": 0.15918498429917213, + "grad_norm": 1.597011685371399, + "learning_rate": 4.693865675194553e-05, + "loss": 4.7376, + "step": 26766 + }, + { + "epoch": 0.15919093158245315, + "grad_norm": 1.5018924474716187, + "learning_rate": 4.693843277812141e-05, + "loss": 4.5752, + "step": 26767 + }, + { + "epoch": 0.15919687886573414, + "grad_norm": 1.5398659706115723, + "learning_rate": 4.6938208796638796e-05, + "loss": 4.3835, + "step": 26768 + }, + { + "epoch": 0.15920282614901513, + "grad_norm": 1.753659963607788, + "learning_rate": 4.693798480749778e-05, + "loss": 4.5366, + "step": 26769 + }, + { + "epoch": 0.1592087734322961, + "grad_norm": 1.6807688474655151, + "learning_rate": 4.693776081069845e-05, + "loss": 4.5043, + "step": 26770 + }, + { + "epoch": 0.15921472071557713, + "grad_norm": 1.547088384628296, + "learning_rate": 4.6937536806240865e-05, + "loss": 4.4129, + "step": 26771 + }, + { + "epoch": 0.15922066799885812, + "grad_norm": 1.6225403547286987, + "learning_rate": 4.693731279412512e-05, + "loss": 4.3027, + "step": 26772 + }, + { + "epoch": 0.1592266152821391, + "grad_norm": 1.521183967590332, + "learning_rate": 4.693708877435128e-05, + "loss": 4.3267, + "step": 26773 + }, + { + "epoch": 0.15923256256542012, + "grad_norm": 1.503652572631836, + "learning_rate": 4.693686474691944e-05, + "loss": 4.5069, + "step": 26774 + }, + { + "epoch": 0.1592385098487011, + "grad_norm": 1.3765262365341187, + "learning_rate": 4.693664071182965e-05, + "loss": 4.8385, + "step": 26775 + }, + { + "epoch": 0.1592444571319821, + "grad_norm": 1.552372932434082, + "learning_rate": 4.6936416669082015e-05, + "loss": 4.7109, + "step": 26776 + }, + { + "epoch": 0.1592504044152631, + "grad_norm": 1.5098180770874023, + "learning_rate": 4.693619261867661e-05, + "loss": 4.6682, + "step": 26777 + }, + { + "epoch": 0.1592563516985441, + "grad_norm": 1.7043485641479492, + "learning_rate": 4.69359685606135e-05, + "loss": 4.7291, + "step": 26778 + }, + { + "epoch": 0.1592622989818251, + "grad_norm": 1.342060923576355, + "learning_rate": 4.693574449489277e-05, + "loss": 4.4172, + "step": 26779 + }, + { + "epoch": 0.1592682462651061, + "grad_norm": 1.5385740995407104, + "learning_rate": 4.6935520421514494e-05, + "loss": 4.1767, + "step": 26780 + }, + { + "epoch": 0.1592741935483871, + "grad_norm": 1.3378406763076782, + "learning_rate": 4.6935296340478764e-05, + "loss": 4.419, + "step": 26781 + }, + { + "epoch": 0.15928014083166808, + "grad_norm": 1.5734392404556274, + "learning_rate": 4.693507225178564e-05, + "loss": 4.3342, + "step": 26782 + }, + { + "epoch": 0.1592860881149491, + "grad_norm": 1.9071681499481201, + "learning_rate": 4.6934848155435216e-05, + "loss": 4.4808, + "step": 26783 + }, + { + "epoch": 0.1592920353982301, + "grad_norm": 1.4852991104125977, + "learning_rate": 4.693462405142755e-05, + "loss": 5.2923, + "step": 26784 + }, + { + "epoch": 0.15929798268151107, + "grad_norm": 1.7078371047973633, + "learning_rate": 4.6934399939762746e-05, + "loss": 4.5363, + "step": 26785 + }, + { + "epoch": 0.1593039299647921, + "grad_norm": 1.731362223625183, + "learning_rate": 4.693417582044087e-05, + "loss": 4.3905, + "step": 26786 + }, + { + "epoch": 0.15930987724807308, + "grad_norm": 1.7854750156402588, + "learning_rate": 4.6933951693462e-05, + "loss": 4.6509, + "step": 26787 + }, + { + "epoch": 0.15931582453135407, + "grad_norm": 1.804178237915039, + "learning_rate": 4.69337275588262e-05, + "loss": 4.5157, + "step": 26788 + }, + { + "epoch": 0.15932177181463508, + "grad_norm": 1.9014322757720947, + "learning_rate": 4.693350341653358e-05, + "loss": 4.5673, + "step": 26789 + }, + { + "epoch": 0.15932771909791607, + "grad_norm": 2.1549782752990723, + "learning_rate": 4.693327926658418e-05, + "loss": 4.6754, + "step": 26790 + }, + { + "epoch": 0.15933366638119706, + "grad_norm": 1.9609428644180298, + "learning_rate": 4.693305510897812e-05, + "loss": 4.6832, + "step": 26791 + }, + { + "epoch": 0.15933961366447807, + "grad_norm": 2.0541574954986572, + "learning_rate": 4.693283094371545e-05, + "loss": 4.3928, + "step": 26792 + }, + { + "epoch": 0.15934556094775906, + "grad_norm": 2.151719331741333, + "learning_rate": 4.693260677079625e-05, + "loss": 4.2179, + "step": 26793 + }, + { + "epoch": 0.15935150823104005, + "grad_norm": 1.6300101280212402, + "learning_rate": 4.693238259022062e-05, + "loss": 5.202, + "step": 26794 + }, + { + "epoch": 0.15935745551432107, + "grad_norm": 1.860836148262024, + "learning_rate": 4.69321584019886e-05, + "loss": 4.7327, + "step": 26795 + }, + { + "epoch": 0.15936340279760206, + "grad_norm": 1.7627391815185547, + "learning_rate": 4.6931934206100304e-05, + "loss": 5.0884, + "step": 26796 + }, + { + "epoch": 0.15936935008088304, + "grad_norm": 1.6358652114868164, + "learning_rate": 4.693171000255579e-05, + "loss": 5.1218, + "step": 26797 + }, + { + "epoch": 0.15937529736416406, + "grad_norm": 1.938833475112915, + "learning_rate": 4.693148579135514e-05, + "loss": 5.0097, + "step": 26798 + }, + { + "epoch": 0.15938124464744505, + "grad_norm": 1.6986185312271118, + "learning_rate": 4.6931261572498445e-05, + "loss": 5.0552, + "step": 26799 + }, + { + "epoch": 0.15938719193072604, + "grad_norm": 1.9049108028411865, + "learning_rate": 4.693103734598576e-05, + "loss": 4.5521, + "step": 26800 + }, + { + "epoch": 0.15939313921400705, + "grad_norm": 1.723593831062317, + "learning_rate": 4.693081311181719e-05, + "loss": 4.624, + "step": 26801 + }, + { + "epoch": 0.15939908649728804, + "grad_norm": 1.8977972269058228, + "learning_rate": 4.693058886999279e-05, + "loss": 4.508, + "step": 26802 + }, + { + "epoch": 0.15940503378056903, + "grad_norm": 1.8587881326675415, + "learning_rate": 4.6930364620512656e-05, + "loss": 4.5824, + "step": 26803 + }, + { + "epoch": 0.15941098106385004, + "grad_norm": 2.033412456512451, + "learning_rate": 4.693014036337685e-05, + "loss": 4.2831, + "step": 26804 + }, + { + "epoch": 0.15941692834713103, + "grad_norm": 1.7461220026016235, + "learning_rate": 4.692991609858547e-05, + "loss": 4.3987, + "step": 26805 + }, + { + "epoch": 0.15942287563041202, + "grad_norm": 1.5717246532440186, + "learning_rate": 4.692969182613857e-05, + "loss": 4.4173, + "step": 26806 + }, + { + "epoch": 0.15942882291369304, + "grad_norm": 1.825589656829834, + "learning_rate": 4.692946754603625e-05, + "loss": 4.5616, + "step": 26807 + }, + { + "epoch": 0.15943477019697402, + "grad_norm": 1.5404088497161865, + "learning_rate": 4.6929243258278576e-05, + "loss": 5.393, + "step": 26808 + }, + { + "epoch": 0.159440717480255, + "grad_norm": 2.0158777236938477, + "learning_rate": 4.692901896286563e-05, + "loss": 4.7878, + "step": 26809 + }, + { + "epoch": 0.15944666476353603, + "grad_norm": 2.152909755706787, + "learning_rate": 4.6928794659797494e-05, + "loss": 4.1923, + "step": 26810 + }, + { + "epoch": 0.15945261204681702, + "grad_norm": 2.1839582920074463, + "learning_rate": 4.692857034907423e-05, + "loss": 4.4213, + "step": 26811 + }, + { + "epoch": 0.159458559330098, + "grad_norm": 1.7359018325805664, + "learning_rate": 4.6928346030695934e-05, + "loss": 4.4409, + "step": 26812 + }, + { + "epoch": 0.15946450661337902, + "grad_norm": 1.6525425910949707, + "learning_rate": 4.692812170466269e-05, + "loss": 5.0243, + "step": 26813 + }, + { + "epoch": 0.15947045389666, + "grad_norm": 1.471819519996643, + "learning_rate": 4.692789737097455e-05, + "loss": 5.5855, + "step": 26814 + }, + { + "epoch": 0.159476401179941, + "grad_norm": 1.4903481006622314, + "learning_rate": 4.692767302963162e-05, + "loss": 5.4807, + "step": 26815 + }, + { + "epoch": 0.159482348463222, + "grad_norm": 1.6658556461334229, + "learning_rate": 4.6927448680633954e-05, + "loss": 5.2928, + "step": 26816 + }, + { + "epoch": 0.159488295746503, + "grad_norm": 1.8180750608444214, + "learning_rate": 4.692722432398166e-05, + "loss": 5.0372, + "step": 26817 + }, + { + "epoch": 0.159494243029784, + "grad_norm": 1.4245752096176147, + "learning_rate": 4.692699995967478e-05, + "loss": 4.9285, + "step": 26818 + }, + { + "epoch": 0.159500190313065, + "grad_norm": 1.5879698991775513, + "learning_rate": 4.692677558771342e-05, + "loss": 4.7327, + "step": 26819 + }, + { + "epoch": 0.159506137596346, + "grad_norm": 2.3847367763519287, + "learning_rate": 4.692655120809764e-05, + "loss": 4.0357, + "step": 26820 + }, + { + "epoch": 0.15951208487962698, + "grad_norm": 2.5753002166748047, + "learning_rate": 4.692632682082754e-05, + "loss": 3.9462, + "step": 26821 + }, + { + "epoch": 0.159518032162908, + "grad_norm": 2.6524651050567627, + "learning_rate": 4.6926102425903185e-05, + "loss": 4.1065, + "step": 26822 + }, + { + "epoch": 0.15952397944618898, + "grad_norm": 2.808206558227539, + "learning_rate": 4.692587802332464e-05, + "loss": 4.112, + "step": 26823 + }, + { + "epoch": 0.15952992672946997, + "grad_norm": 1.5214722156524658, + "learning_rate": 4.692565361309201e-05, + "loss": 5.4128, + "step": 26824 + }, + { + "epoch": 0.159535874012751, + "grad_norm": 2.1168901920318604, + "learning_rate": 4.692542919520536e-05, + "loss": 4.1342, + "step": 26825 + }, + { + "epoch": 0.15954182129603198, + "grad_norm": 2.5575170516967773, + "learning_rate": 4.692520476966477e-05, + "loss": 4.0117, + "step": 26826 + }, + { + "epoch": 0.15954776857931297, + "grad_norm": 2.9047164916992188, + "learning_rate": 4.6924980336470314e-05, + "loss": 4.1555, + "step": 26827 + }, + { + "epoch": 0.15955371586259395, + "grad_norm": 2.678936719894409, + "learning_rate": 4.6924755895622076e-05, + "loss": 4.0008, + "step": 26828 + }, + { + "epoch": 0.15955966314587497, + "grad_norm": 2.4771978855133057, + "learning_rate": 4.692453144712014e-05, + "loss": 4.1707, + "step": 26829 + }, + { + "epoch": 0.15956561042915596, + "grad_norm": 2.1536855697631836, + "learning_rate": 4.6924306990964564e-05, + "loss": 4.1883, + "step": 26830 + }, + { + "epoch": 0.15957155771243695, + "grad_norm": 1.8136900663375854, + "learning_rate": 4.692408252715544e-05, + "loss": 4.8374, + "step": 26831 + }, + { + "epoch": 0.15957750499571796, + "grad_norm": 2.4778616428375244, + "learning_rate": 4.692385805569285e-05, + "loss": 3.9603, + "step": 26832 + }, + { + "epoch": 0.15958345227899895, + "grad_norm": 1.9646393060684204, + "learning_rate": 4.692363357657686e-05, + "loss": 4.2872, + "step": 26833 + }, + { + "epoch": 0.15958939956227994, + "grad_norm": 2.0261855125427246, + "learning_rate": 4.6923409089807566e-05, + "loss": 4.2673, + "step": 26834 + }, + { + "epoch": 0.15959534684556095, + "grad_norm": 2.361943244934082, + "learning_rate": 4.692318459538503e-05, + "loss": 3.9284, + "step": 26835 + }, + { + "epoch": 0.15960129412884194, + "grad_norm": 1.9567387104034424, + "learning_rate": 4.6922960093309334e-05, + "loss": 4.366, + "step": 26836 + }, + { + "epoch": 0.15960724141212293, + "grad_norm": 2.046351432800293, + "learning_rate": 4.692273558358057e-05, + "loss": 4.1074, + "step": 26837 + }, + { + "epoch": 0.15961318869540395, + "grad_norm": 1.9861648082733154, + "learning_rate": 4.6922511066198796e-05, + "loss": 4.1299, + "step": 26838 + }, + { + "epoch": 0.15961913597868493, + "grad_norm": 2.061688184738159, + "learning_rate": 4.692228654116411e-05, + "loss": 4.056, + "step": 26839 + }, + { + "epoch": 0.15962508326196592, + "grad_norm": 2.4299874305725098, + "learning_rate": 4.692206200847656e-05, + "loss": 3.8725, + "step": 26840 + }, + { + "epoch": 0.15963103054524694, + "grad_norm": 2.0996625423431396, + "learning_rate": 4.692183746813626e-05, + "loss": 3.9208, + "step": 26841 + }, + { + "epoch": 0.15963697782852793, + "grad_norm": 1.4910566806793213, + "learning_rate": 4.6921612920143276e-05, + "loss": 5.4869, + "step": 26842 + }, + { + "epoch": 0.15964292511180891, + "grad_norm": 2.304666042327881, + "learning_rate": 4.692138836449768e-05, + "loss": 4.3594, + "step": 26843 + }, + { + "epoch": 0.15964887239508993, + "grad_norm": 2.0998356342315674, + "learning_rate": 4.6921163801199553e-05, + "loss": 4.184, + "step": 26844 + }, + { + "epoch": 0.15965481967837092, + "grad_norm": 2.05517315864563, + "learning_rate": 4.692093923024897e-05, + "loss": 4.0709, + "step": 26845 + }, + { + "epoch": 0.1596607669616519, + "grad_norm": 1.7358988523483276, + "learning_rate": 4.692071465164601e-05, + "loss": 4.8628, + "step": 26846 + }, + { + "epoch": 0.15966671424493292, + "grad_norm": 2.173988103866577, + "learning_rate": 4.6920490065390766e-05, + "loss": 4.2944, + "step": 26847 + }, + { + "epoch": 0.1596726615282139, + "grad_norm": 1.41978919506073, + "learning_rate": 4.69202654714833e-05, + "loss": 4.9699, + "step": 26848 + }, + { + "epoch": 0.1596786088114949, + "grad_norm": 1.748255968093872, + "learning_rate": 4.6920040869923695e-05, + "loss": 3.9938, + "step": 26849 + }, + { + "epoch": 0.15968455609477591, + "grad_norm": 1.7858299016952515, + "learning_rate": 4.691981626071204e-05, + "loss": 4.7106, + "step": 26850 + }, + { + "epoch": 0.1596905033780569, + "grad_norm": 1.575324296951294, + "learning_rate": 4.691959164384839e-05, + "loss": 5.4768, + "step": 26851 + }, + { + "epoch": 0.1596964506613379, + "grad_norm": 1.383719801902771, + "learning_rate": 4.691936701933285e-05, + "loss": 5.154, + "step": 26852 + }, + { + "epoch": 0.1597023979446189, + "grad_norm": 1.559497356414795, + "learning_rate": 4.6919142387165476e-05, + "loss": 5.4081, + "step": 26853 + }, + { + "epoch": 0.1597083452278999, + "grad_norm": 2.3833580017089844, + "learning_rate": 4.691891774734636e-05, + "loss": 4.3001, + "step": 26854 + }, + { + "epoch": 0.15971429251118088, + "grad_norm": 1.5790619850158691, + "learning_rate": 4.6918693099875575e-05, + "loss": 5.1468, + "step": 26855 + }, + { + "epoch": 0.1597202397944619, + "grad_norm": 2.088935613632202, + "learning_rate": 4.69184684447532e-05, + "loss": 4.6097, + "step": 26856 + }, + { + "epoch": 0.1597261870777429, + "grad_norm": 1.7923367023468018, + "learning_rate": 4.691824378197931e-05, + "loss": 4.2733, + "step": 26857 + }, + { + "epoch": 0.15973213436102388, + "grad_norm": 1.583054780960083, + "learning_rate": 4.691801911155399e-05, + "loss": 4.7933, + "step": 26858 + }, + { + "epoch": 0.1597380816443049, + "grad_norm": 1.6564888954162598, + "learning_rate": 4.691779443347733e-05, + "loss": 4.6326, + "step": 26859 + }, + { + "epoch": 0.15974402892758588, + "grad_norm": 1.4905378818511963, + "learning_rate": 4.691756974774938e-05, + "loss": 4.8904, + "step": 26860 + }, + { + "epoch": 0.15974997621086687, + "grad_norm": 1.6564618349075317, + "learning_rate": 4.6917345054370234e-05, + "loss": 4.6245, + "step": 26861 + }, + { + "epoch": 0.15975592349414788, + "grad_norm": 1.262850284576416, + "learning_rate": 4.691712035333996e-05, + "loss": 5.584, + "step": 26862 + }, + { + "epoch": 0.15976187077742887, + "grad_norm": 1.54867684841156, + "learning_rate": 4.691689564465867e-05, + "loss": 5.543, + "step": 26863 + }, + { + "epoch": 0.15976781806070986, + "grad_norm": 1.470517635345459, + "learning_rate": 4.69166709283264e-05, + "loss": 5.5524, + "step": 26864 + }, + { + "epoch": 0.15977376534399088, + "grad_norm": 1.5773262977600098, + "learning_rate": 4.6916446204343245e-05, + "loss": 4.9904, + "step": 26865 + }, + { + "epoch": 0.15977971262727186, + "grad_norm": 1.5984915494918823, + "learning_rate": 4.6916221472709295e-05, + "loss": 4.7114, + "step": 26866 + }, + { + "epoch": 0.15978565991055285, + "grad_norm": 1.4829813241958618, + "learning_rate": 4.691599673342462e-05, + "loss": 4.9843, + "step": 26867 + }, + { + "epoch": 0.15979160719383387, + "grad_norm": 1.7312453985214233, + "learning_rate": 4.691577198648929e-05, + "loss": 4.2701, + "step": 26868 + }, + { + "epoch": 0.15979755447711486, + "grad_norm": 1.4807355403900146, + "learning_rate": 4.691554723190339e-05, + "loss": 4.7952, + "step": 26869 + }, + { + "epoch": 0.15980350176039584, + "grad_norm": 1.3604083061218262, + "learning_rate": 4.6915322469667e-05, + "loss": 5.1496, + "step": 26870 + }, + { + "epoch": 0.15980944904367686, + "grad_norm": 1.5444153547286987, + "learning_rate": 4.69150976997802e-05, + "loss": 5.791, + "step": 26871 + }, + { + "epoch": 0.15981539632695785, + "grad_norm": 1.617533564567566, + "learning_rate": 4.691487292224306e-05, + "loss": 5.5533, + "step": 26872 + }, + { + "epoch": 0.15982134361023884, + "grad_norm": 1.5946470499038696, + "learning_rate": 4.691464813705567e-05, + "loss": 5.5958, + "step": 26873 + }, + { + "epoch": 0.15982729089351985, + "grad_norm": 1.862707495689392, + "learning_rate": 4.691442334421809e-05, + "loss": 4.8171, + "step": 26874 + }, + { + "epoch": 0.15983323817680084, + "grad_norm": 1.355368971824646, + "learning_rate": 4.6914198543730425e-05, + "loss": 5.5431, + "step": 26875 + }, + { + "epoch": 0.15983918546008183, + "grad_norm": 1.4658385515213013, + "learning_rate": 4.6913973735592744e-05, + "loss": 5.3588, + "step": 26876 + }, + { + "epoch": 0.15984513274336284, + "grad_norm": 1.4573192596435547, + "learning_rate": 4.6913748919805115e-05, + "loss": 5.5454, + "step": 26877 + }, + { + "epoch": 0.15985108002664383, + "grad_norm": 1.495696783065796, + "learning_rate": 4.691352409636762e-05, + "loss": 5.5131, + "step": 26878 + }, + { + "epoch": 0.15985702730992482, + "grad_norm": 1.474161148071289, + "learning_rate": 4.691329926528034e-05, + "loss": 5.6235, + "step": 26879 + }, + { + "epoch": 0.15986297459320584, + "grad_norm": 1.5069948434829712, + "learning_rate": 4.6913074426543355e-05, + "loss": 5.3926, + "step": 26880 + }, + { + "epoch": 0.15986892187648682, + "grad_norm": 1.4088873863220215, + "learning_rate": 4.691284958015674e-05, + "loss": 5.2991, + "step": 26881 + }, + { + "epoch": 0.1598748691597678, + "grad_norm": 1.483222484588623, + "learning_rate": 4.691262472612058e-05, + "loss": 5.205, + "step": 26882 + }, + { + "epoch": 0.15988081644304883, + "grad_norm": 1.5325754880905151, + "learning_rate": 4.6912399864434953e-05, + "loss": 5.261, + "step": 26883 + }, + { + "epoch": 0.15988676372632982, + "grad_norm": 1.4159071445465088, + "learning_rate": 4.691217499509992e-05, + "loss": 5.2486, + "step": 26884 + }, + { + "epoch": 0.1598927110096108, + "grad_norm": 1.514702320098877, + "learning_rate": 4.6911950118115584e-05, + "loss": 5.332, + "step": 26885 + }, + { + "epoch": 0.1598986582928918, + "grad_norm": 1.757711410522461, + "learning_rate": 4.6911725233482005e-05, + "loss": 4.5752, + "step": 26886 + }, + { + "epoch": 0.1599046055761728, + "grad_norm": 1.6628808975219727, + "learning_rate": 4.691150034119928e-05, + "loss": 4.8776, + "step": 26887 + }, + { + "epoch": 0.1599105528594538, + "grad_norm": 1.6468075513839722, + "learning_rate": 4.691127544126746e-05, + "loss": 4.7613, + "step": 26888 + }, + { + "epoch": 0.15991650014273479, + "grad_norm": 1.603371262550354, + "learning_rate": 4.6911050533686656e-05, + "loss": 4.8145, + "step": 26889 + }, + { + "epoch": 0.1599224474260158, + "grad_norm": 1.4971832036972046, + "learning_rate": 4.6910825618456925e-05, + "loss": 5.5747, + "step": 26890 + }, + { + "epoch": 0.1599283947092968, + "grad_norm": 1.6911252737045288, + "learning_rate": 4.691060069557836e-05, + "loss": 5.5936, + "step": 26891 + }, + { + "epoch": 0.15993434199257778, + "grad_norm": 1.4903403520584106, + "learning_rate": 4.6910375765051016e-05, + "loss": 5.6195, + "step": 26892 + }, + { + "epoch": 0.1599402892758588, + "grad_norm": 1.8719216585159302, + "learning_rate": 4.6910150826874986e-05, + "loss": 4.818, + "step": 26893 + }, + { + "epoch": 0.15994623655913978, + "grad_norm": 1.7679294347763062, + "learning_rate": 4.690992588105036e-05, + "loss": 4.9175, + "step": 26894 + }, + { + "epoch": 0.15995218384242077, + "grad_norm": 1.8319326639175415, + "learning_rate": 4.69097009275772e-05, + "loss": 5.7222, + "step": 26895 + }, + { + "epoch": 0.15995813112570179, + "grad_norm": 1.6714746952056885, + "learning_rate": 4.690947596645559e-05, + "loss": 5.2146, + "step": 26896 + }, + { + "epoch": 0.15996407840898277, + "grad_norm": 1.6124671697616577, + "learning_rate": 4.690925099768561e-05, + "loss": 5.3234, + "step": 26897 + }, + { + "epoch": 0.15997002569226376, + "grad_norm": 1.546627402305603, + "learning_rate": 4.6909026021267336e-05, + "loss": 5.4278, + "step": 26898 + }, + { + "epoch": 0.15997597297554478, + "grad_norm": 1.492988109588623, + "learning_rate": 4.690880103720084e-05, + "loss": 5.5902, + "step": 26899 + }, + { + "epoch": 0.15998192025882577, + "grad_norm": 1.4887235164642334, + "learning_rate": 4.690857604548622e-05, + "loss": 5.5054, + "step": 26900 + }, + { + "epoch": 0.15998786754210675, + "grad_norm": 1.6349844932556152, + "learning_rate": 4.690835104612353e-05, + "loss": 5.4657, + "step": 26901 + }, + { + "epoch": 0.15999381482538777, + "grad_norm": 1.5228698253631592, + "learning_rate": 4.690812603911287e-05, + "loss": 5.3062, + "step": 26902 + }, + { + "epoch": 0.15999976210866876, + "grad_norm": 2.3719773292541504, + "learning_rate": 4.69079010244543e-05, + "loss": 4.3533, + "step": 26903 + }, + { + "epoch": 0.16000570939194975, + "grad_norm": 1.7740064859390259, + "learning_rate": 4.690767600214792e-05, + "loss": 4.8227, + "step": 26904 + }, + { + "epoch": 0.16001165667523076, + "grad_norm": 1.5493906736373901, + "learning_rate": 4.690745097219379e-05, + "loss": 5.2635, + "step": 26905 + }, + { + "epoch": 0.16001760395851175, + "grad_norm": 1.5318504571914673, + "learning_rate": 4.6907225934592e-05, + "loss": 5.1352, + "step": 26906 + }, + { + "epoch": 0.16002355124179274, + "grad_norm": 1.6286877393722534, + "learning_rate": 4.6907000889342626e-05, + "loss": 5.122, + "step": 26907 + }, + { + "epoch": 0.16002949852507375, + "grad_norm": 1.7091056108474731, + "learning_rate": 4.6906775836445735e-05, + "loss": 4.8629, + "step": 26908 + }, + { + "epoch": 0.16003544580835474, + "grad_norm": 1.8141852617263794, + "learning_rate": 4.6906550775901417e-05, + "loss": 5.0909, + "step": 26909 + }, + { + "epoch": 0.16004139309163573, + "grad_norm": 1.5500266551971436, + "learning_rate": 4.690632570770975e-05, + "loss": 5.3479, + "step": 26910 + }, + { + "epoch": 0.16004734037491675, + "grad_norm": 1.6703251600265503, + "learning_rate": 4.690610063187081e-05, + "loss": 5.264, + "step": 26911 + }, + { + "epoch": 0.16005328765819773, + "grad_norm": 1.2872283458709717, + "learning_rate": 4.690587554838468e-05, + "loss": 5.3643, + "step": 26912 + }, + { + "epoch": 0.16005923494147872, + "grad_norm": 1.456085443496704, + "learning_rate": 4.6905650457251435e-05, + "loss": 5.4866, + "step": 26913 + }, + { + "epoch": 0.16006518222475974, + "grad_norm": 1.560021996498108, + "learning_rate": 4.690542535847115e-05, + "loss": 5.3858, + "step": 26914 + }, + { + "epoch": 0.16007112950804073, + "grad_norm": 1.4462066888809204, + "learning_rate": 4.690520025204391e-05, + "loss": 5.2111, + "step": 26915 + }, + { + "epoch": 0.16007707679132172, + "grad_norm": 1.5655597448349, + "learning_rate": 4.6904975137969786e-05, + "loss": 5.2547, + "step": 26916 + }, + { + "epoch": 0.16008302407460273, + "grad_norm": 1.3707412481307983, + "learning_rate": 4.6904750016248865e-05, + "loss": 5.3997, + "step": 26917 + }, + { + "epoch": 0.16008897135788372, + "grad_norm": 1.7030435800552368, + "learning_rate": 4.690452488688123e-05, + "loss": 5.4115, + "step": 26918 + }, + { + "epoch": 0.1600949186411647, + "grad_norm": 1.4965012073516846, + "learning_rate": 4.690429974986694e-05, + "loss": 4.9977, + "step": 26919 + }, + { + "epoch": 0.16010086592444572, + "grad_norm": 1.3461761474609375, + "learning_rate": 4.69040746052061e-05, + "loss": 5.3629, + "step": 26920 + }, + { + "epoch": 0.1601068132077267, + "grad_norm": 1.3323198556900024, + "learning_rate": 4.690384945289875e-05, + "loss": 5.3162, + "step": 26921 + }, + { + "epoch": 0.1601127604910077, + "grad_norm": 1.6808300018310547, + "learning_rate": 4.690362429294501e-05, + "loss": 5.0513, + "step": 26922 + }, + { + "epoch": 0.16011870777428872, + "grad_norm": 1.659193515777588, + "learning_rate": 4.690339912534494e-05, + "loss": 5.2587, + "step": 26923 + }, + { + "epoch": 0.1601246550575697, + "grad_norm": 1.7092478275299072, + "learning_rate": 4.690317395009861e-05, + "loss": 5.1897, + "step": 26924 + }, + { + "epoch": 0.1601306023408507, + "grad_norm": 1.5868886709213257, + "learning_rate": 4.6902948767206115e-05, + "loss": 4.7132, + "step": 26925 + }, + { + "epoch": 0.1601365496241317, + "grad_norm": 1.584676742553711, + "learning_rate": 4.690272357666753e-05, + "loss": 4.8759, + "step": 26926 + }, + { + "epoch": 0.1601424969074127, + "grad_norm": 1.6470085382461548, + "learning_rate": 4.690249837848293e-05, + "loss": 4.9947, + "step": 26927 + }, + { + "epoch": 0.16014844419069368, + "grad_norm": 1.4562335014343262, + "learning_rate": 4.690227317265239e-05, + "loss": 5.1101, + "step": 26928 + }, + { + "epoch": 0.1601543914739747, + "grad_norm": 1.4088939428329468, + "learning_rate": 4.690204795917599e-05, + "loss": 5.3212, + "step": 26929 + }, + { + "epoch": 0.1601603387572557, + "grad_norm": 1.4988348484039307, + "learning_rate": 4.6901822738053816e-05, + "loss": 4.9456, + "step": 26930 + }, + { + "epoch": 0.16016628604053668, + "grad_norm": 1.608365535736084, + "learning_rate": 4.690159750928594e-05, + "loss": 5.082, + "step": 26931 + }, + { + "epoch": 0.1601722333238177, + "grad_norm": 1.5603444576263428, + "learning_rate": 4.6901372272872445e-05, + "loss": 5.4297, + "step": 26932 + }, + { + "epoch": 0.16017818060709868, + "grad_norm": 1.6907488107681274, + "learning_rate": 4.690114702881341e-05, + "loss": 4.9653, + "step": 26933 + }, + { + "epoch": 0.16018412789037967, + "grad_norm": 1.566992998123169, + "learning_rate": 4.69009217771089e-05, + "loss": 5.2261, + "step": 26934 + }, + { + "epoch": 0.16019007517366068, + "grad_norm": 1.4666292667388916, + "learning_rate": 4.690069651775901e-05, + "loss": 5.0251, + "step": 26935 + }, + { + "epoch": 0.16019602245694167, + "grad_norm": 1.5898406505584717, + "learning_rate": 4.690047125076382e-05, + "loss": 5.1041, + "step": 26936 + }, + { + "epoch": 0.16020196974022266, + "grad_norm": 1.3918042182922363, + "learning_rate": 4.6900245976123396e-05, + "loss": 5.3757, + "step": 26937 + }, + { + "epoch": 0.16020791702350368, + "grad_norm": 1.390620231628418, + "learning_rate": 4.690002069383782e-05, + "loss": 5.2667, + "step": 26938 + }, + { + "epoch": 0.16021386430678466, + "grad_norm": 1.4058221578598022, + "learning_rate": 4.6899795403907174e-05, + "loss": 5.8193, + "step": 26939 + }, + { + "epoch": 0.16021981159006565, + "grad_norm": 1.7895981073379517, + "learning_rate": 4.689957010633154e-05, + "loss": 4.9949, + "step": 26940 + }, + { + "epoch": 0.16022575887334667, + "grad_norm": 1.6591132879257202, + "learning_rate": 4.689934480111099e-05, + "loss": 5.0723, + "step": 26941 + }, + { + "epoch": 0.16023170615662766, + "grad_norm": 1.6578445434570312, + "learning_rate": 4.6899119488245605e-05, + "loss": 5.0636, + "step": 26942 + }, + { + "epoch": 0.16023765343990864, + "grad_norm": 1.7342137098312378, + "learning_rate": 4.6898894167735464e-05, + "loss": 4.9476, + "step": 26943 + }, + { + "epoch": 0.16024360072318963, + "grad_norm": 1.7774765491485596, + "learning_rate": 4.689866883958065e-05, + "loss": 5.04, + "step": 26944 + }, + { + "epoch": 0.16024954800647065, + "grad_norm": 1.519485354423523, + "learning_rate": 4.689844350378122e-05, + "loss": 5.353, + "step": 26945 + }, + { + "epoch": 0.16025549528975164, + "grad_norm": 1.7019078731536865, + "learning_rate": 4.6898218160337286e-05, + "loss": 5.2927, + "step": 26946 + }, + { + "epoch": 0.16026144257303263, + "grad_norm": 1.6364177465438843, + "learning_rate": 4.6897992809248903e-05, + "loss": 5.3286, + "step": 26947 + }, + { + "epoch": 0.16026738985631364, + "grad_norm": 1.5034300088882446, + "learning_rate": 4.6897767450516164e-05, + "loss": 5.1647, + "step": 26948 + }, + { + "epoch": 0.16027333713959463, + "grad_norm": 1.4327138662338257, + "learning_rate": 4.6897542084139135e-05, + "loss": 5.1381, + "step": 26949 + }, + { + "epoch": 0.16027928442287562, + "grad_norm": 1.666137456893921, + "learning_rate": 4.68973167101179e-05, + "loss": 4.7333, + "step": 26950 + }, + { + "epoch": 0.16028523170615663, + "grad_norm": 1.6748521327972412, + "learning_rate": 4.689709132845254e-05, + "loss": 4.8698, + "step": 26951 + }, + { + "epoch": 0.16029117898943762, + "grad_norm": 1.7348641157150269, + "learning_rate": 4.689686593914313e-05, + "loss": 5.0501, + "step": 26952 + }, + { + "epoch": 0.1602971262727186, + "grad_norm": 1.6517002582550049, + "learning_rate": 4.689664054218975e-05, + "loss": 4.9992, + "step": 26953 + }, + { + "epoch": 0.16030307355599963, + "grad_norm": 1.9717700481414795, + "learning_rate": 4.689641513759249e-05, + "loss": 4.6581, + "step": 26954 + }, + { + "epoch": 0.1603090208392806, + "grad_norm": 1.9283233880996704, + "learning_rate": 4.68961897253514e-05, + "loss": 4.1993, + "step": 26955 + }, + { + "epoch": 0.1603149681225616, + "grad_norm": 2.814549446105957, + "learning_rate": 4.689596430546659e-05, + "loss": 4.2436, + "step": 26956 + }, + { + "epoch": 0.16032091540584262, + "grad_norm": 1.8716390132904053, + "learning_rate": 4.689573887793811e-05, + "loss": 4.7558, + "step": 26957 + }, + { + "epoch": 0.1603268626891236, + "grad_norm": 1.5305246114730835, + "learning_rate": 4.689551344276607e-05, + "loss": 5.0986, + "step": 26958 + }, + { + "epoch": 0.1603328099724046, + "grad_norm": 1.7304683923721313, + "learning_rate": 4.689528799995052e-05, + "loss": 4.8627, + "step": 26959 + }, + { + "epoch": 0.1603387572556856, + "grad_norm": 1.6693211793899536, + "learning_rate": 4.6895062549491564e-05, + "loss": 4.6759, + "step": 26960 + }, + { + "epoch": 0.1603447045389666, + "grad_norm": 1.6889755725860596, + "learning_rate": 4.6894837091389256e-05, + "loss": 4.6676, + "step": 26961 + }, + { + "epoch": 0.1603506518222476, + "grad_norm": 1.7085540294647217, + "learning_rate": 4.6894611625643695e-05, + "loss": 5.2494, + "step": 26962 + }, + { + "epoch": 0.1603565991055286, + "grad_norm": 1.7167129516601562, + "learning_rate": 4.689438615225494e-05, + "loss": 4.7013, + "step": 26963 + }, + { + "epoch": 0.1603625463888096, + "grad_norm": 1.6896833181381226, + "learning_rate": 4.689416067122309e-05, + "loss": 5.0363, + "step": 26964 + }, + { + "epoch": 0.16036849367209058, + "grad_norm": 1.4529087543487549, + "learning_rate": 4.6893935182548215e-05, + "loss": 5.2665, + "step": 26965 + }, + { + "epoch": 0.1603744409553716, + "grad_norm": 1.630214810371399, + "learning_rate": 4.689370968623039e-05, + "loss": 5.3018, + "step": 26966 + }, + { + "epoch": 0.16038038823865258, + "grad_norm": 1.4638413190841675, + "learning_rate": 4.6893484182269697e-05, + "loss": 5.4105, + "step": 26967 + }, + { + "epoch": 0.16038633552193357, + "grad_norm": 1.7969051599502563, + "learning_rate": 4.689325867066622e-05, + "loss": 5.3511, + "step": 26968 + }, + { + "epoch": 0.1603922828052146, + "grad_norm": 1.65691339969635, + "learning_rate": 4.689303315142003e-05, + "loss": 5.158, + "step": 26969 + }, + { + "epoch": 0.16039823008849557, + "grad_norm": 1.391390085220337, + "learning_rate": 4.689280762453121e-05, + "loss": 5.2721, + "step": 26970 + }, + { + "epoch": 0.16040417737177656, + "grad_norm": 1.699019193649292, + "learning_rate": 4.689258208999983e-05, + "loss": 5.0995, + "step": 26971 + }, + { + "epoch": 0.16041012465505758, + "grad_norm": 1.7829947471618652, + "learning_rate": 4.6892356547825984e-05, + "loss": 4.9086, + "step": 26972 + }, + { + "epoch": 0.16041607193833857, + "grad_norm": 1.7381236553192139, + "learning_rate": 4.689213099800974e-05, + "loss": 4.9298, + "step": 26973 + }, + { + "epoch": 0.16042201922161956, + "grad_norm": 1.273488998413086, + "learning_rate": 4.689190544055118e-05, + "loss": 5.1877, + "step": 26974 + }, + { + "epoch": 0.16042796650490057, + "grad_norm": 1.5737167596817017, + "learning_rate": 4.689167987545038e-05, + "loss": 5.229, + "step": 26975 + }, + { + "epoch": 0.16043391378818156, + "grad_norm": 1.4660385847091675, + "learning_rate": 4.6891454302707414e-05, + "loss": 5.3256, + "step": 26976 + }, + { + "epoch": 0.16043986107146255, + "grad_norm": 1.7380048036575317, + "learning_rate": 4.6891228722322375e-05, + "loss": 4.3369, + "step": 26977 + }, + { + "epoch": 0.16044580835474356, + "grad_norm": 1.686514139175415, + "learning_rate": 4.6891003134295336e-05, + "loss": 4.9901, + "step": 26978 + }, + { + "epoch": 0.16045175563802455, + "grad_norm": 1.8255095481872559, + "learning_rate": 4.689077753862637e-05, + "loss": 4.7844, + "step": 26979 + }, + { + "epoch": 0.16045770292130554, + "grad_norm": 1.7652206420898438, + "learning_rate": 4.689055193531556e-05, + "loss": 5.2592, + "step": 26980 + }, + { + "epoch": 0.16046365020458656, + "grad_norm": 2.122629165649414, + "learning_rate": 4.6890326324362985e-05, + "loss": 4.9435, + "step": 26981 + }, + { + "epoch": 0.16046959748786754, + "grad_norm": 2.0414109230041504, + "learning_rate": 4.689010070576872e-05, + "loss": 4.8267, + "step": 26982 + }, + { + "epoch": 0.16047554477114853, + "grad_norm": 1.8635056018829346, + "learning_rate": 4.6889875079532855e-05, + "loss": 5.0768, + "step": 26983 + }, + { + "epoch": 0.16048149205442955, + "grad_norm": 1.649129033088684, + "learning_rate": 4.688964944565546e-05, + "loss": 5.1536, + "step": 26984 + }, + { + "epoch": 0.16048743933771054, + "grad_norm": 1.6211038827896118, + "learning_rate": 4.688942380413661e-05, + "loss": 5.1866, + "step": 26985 + }, + { + "epoch": 0.16049338662099152, + "grad_norm": 1.862961769104004, + "learning_rate": 4.6889198154976387e-05, + "loss": 4.9439, + "step": 26986 + }, + { + "epoch": 0.16049933390427254, + "grad_norm": 2.02945613861084, + "learning_rate": 4.6888972498174874e-05, + "loss": 4.8791, + "step": 26987 + }, + { + "epoch": 0.16050528118755353, + "grad_norm": 2.434349536895752, + "learning_rate": 4.688874683373215e-05, + "loss": 4.9336, + "step": 26988 + }, + { + "epoch": 0.16051122847083452, + "grad_norm": 1.6819970607757568, + "learning_rate": 4.6888521161648284e-05, + "loss": 4.9917, + "step": 26989 + }, + { + "epoch": 0.16051717575411553, + "grad_norm": 1.7764739990234375, + "learning_rate": 4.688829548192337e-05, + "loss": 5.274, + "step": 26990 + }, + { + "epoch": 0.16052312303739652, + "grad_norm": 1.4962623119354248, + "learning_rate": 4.6888069794557465e-05, + "loss": 5.0699, + "step": 26991 + }, + { + "epoch": 0.1605290703206775, + "grad_norm": 1.7750627994537354, + "learning_rate": 4.688784409955067e-05, + "loss": 4.9197, + "step": 26992 + }, + { + "epoch": 0.16053501760395852, + "grad_norm": 1.7030991315841675, + "learning_rate": 4.6887618396903055e-05, + "loss": 5.1113, + "step": 26993 + }, + { + "epoch": 0.1605409648872395, + "grad_norm": 1.7158962488174438, + "learning_rate": 4.68873926866147e-05, + "loss": 5.2175, + "step": 26994 + }, + { + "epoch": 0.1605469121705205, + "grad_norm": 1.5792635679244995, + "learning_rate": 4.6887166968685684e-05, + "loss": 5.2031, + "step": 26995 + }, + { + "epoch": 0.16055285945380152, + "grad_norm": 1.6441086530685425, + "learning_rate": 4.688694124311607e-05, + "loss": 4.669, + "step": 26996 + }, + { + "epoch": 0.1605588067370825, + "grad_norm": 1.4879902601242065, + "learning_rate": 4.688671550990597e-05, + "loss": 5.2163, + "step": 26997 + }, + { + "epoch": 0.1605647540203635, + "grad_norm": 1.7525761127471924, + "learning_rate": 4.688648976905543e-05, + "loss": 4.6094, + "step": 26998 + }, + { + "epoch": 0.1605707013036445, + "grad_norm": 1.500331163406372, + "learning_rate": 4.6886264020564544e-05, + "loss": 5.0793, + "step": 26999 + }, + { + "epoch": 0.1605766485869255, + "grad_norm": 1.505900502204895, + "learning_rate": 4.688603826443339e-05, + "loss": 4.9562, + "step": 27000 + }, + { + "epoch": 0.16058259587020648, + "grad_norm": 1.558977723121643, + "learning_rate": 4.688581250066205e-05, + "loss": 4.8143, + "step": 27001 + }, + { + "epoch": 0.1605885431534875, + "grad_norm": 1.4914512634277344, + "learning_rate": 4.6885586729250596e-05, + "loss": 4.624, + "step": 27002 + }, + { + "epoch": 0.1605944904367685, + "grad_norm": 1.482251763343811, + "learning_rate": 4.688536095019911e-05, + "loss": 4.87, + "step": 27003 + }, + { + "epoch": 0.16060043772004948, + "grad_norm": 1.4962702989578247, + "learning_rate": 4.688513516350767e-05, + "loss": 5.1775, + "step": 27004 + }, + { + "epoch": 0.16060638500333047, + "grad_norm": 1.71797513961792, + "learning_rate": 4.688490936917636e-05, + "loss": 5.3413, + "step": 27005 + }, + { + "epoch": 0.16061233228661148, + "grad_norm": 1.5410555601119995, + "learning_rate": 4.688468356720525e-05, + "loss": 5.399, + "step": 27006 + }, + { + "epoch": 0.16061827956989247, + "grad_norm": 1.597773551940918, + "learning_rate": 4.6884457757594424e-05, + "loss": 5.4056, + "step": 27007 + }, + { + "epoch": 0.16062422685317346, + "grad_norm": 1.3013349771499634, + "learning_rate": 4.688423194034396e-05, + "loss": 5.6953, + "step": 27008 + }, + { + "epoch": 0.16063017413645447, + "grad_norm": 1.557054877281189, + "learning_rate": 4.6884006115453935e-05, + "loss": 5.078, + "step": 27009 + }, + { + "epoch": 0.16063612141973546, + "grad_norm": 1.5944912433624268, + "learning_rate": 4.688378028292443e-05, + "loss": 5.0212, + "step": 27010 + }, + { + "epoch": 0.16064206870301645, + "grad_norm": 1.45020592212677, + "learning_rate": 4.6883554442755526e-05, + "loss": 4.9653, + "step": 27011 + }, + { + "epoch": 0.16064801598629747, + "grad_norm": 1.7178733348846436, + "learning_rate": 4.68833285949473e-05, + "loss": 5.2027, + "step": 27012 + }, + { + "epoch": 0.16065396326957845, + "grad_norm": 1.574744462966919, + "learning_rate": 4.688310273949983e-05, + "loss": 5.3929, + "step": 27013 + }, + { + "epoch": 0.16065991055285944, + "grad_norm": 1.511526107788086, + "learning_rate": 4.688287687641319e-05, + "loss": 4.9275, + "step": 27014 + }, + { + "epoch": 0.16066585783614046, + "grad_norm": 1.5261460542678833, + "learning_rate": 4.688265100568747e-05, + "loss": 5.193, + "step": 27015 + }, + { + "epoch": 0.16067180511942145, + "grad_norm": 1.3765456676483154, + "learning_rate": 4.688242512732274e-05, + "loss": 5.006, + "step": 27016 + }, + { + "epoch": 0.16067775240270243, + "grad_norm": 1.4258984327316284, + "learning_rate": 4.688219924131908e-05, + "loss": 5.0301, + "step": 27017 + }, + { + "epoch": 0.16068369968598345, + "grad_norm": 1.6083779335021973, + "learning_rate": 4.688197334767657e-05, + "loss": 5.0202, + "step": 27018 + }, + { + "epoch": 0.16068964696926444, + "grad_norm": 1.3578145503997803, + "learning_rate": 4.6881747446395285e-05, + "loss": 5.0357, + "step": 27019 + }, + { + "epoch": 0.16069559425254543, + "grad_norm": 1.5515062808990479, + "learning_rate": 4.6881521537475316e-05, + "loss": 4.7463, + "step": 27020 + }, + { + "epoch": 0.16070154153582644, + "grad_norm": 1.5254274606704712, + "learning_rate": 4.688129562091673e-05, + "loss": 5.0846, + "step": 27021 + }, + { + "epoch": 0.16070748881910743, + "grad_norm": 1.6628260612487793, + "learning_rate": 4.6881069696719615e-05, + "loss": 4.7732, + "step": 27022 + }, + { + "epoch": 0.16071343610238842, + "grad_norm": 1.5955768823623657, + "learning_rate": 4.6880843764884044e-05, + "loss": 4.7582, + "step": 27023 + }, + { + "epoch": 0.16071938338566943, + "grad_norm": 1.4915939569473267, + "learning_rate": 4.6880617825410086e-05, + "loss": 4.7503, + "step": 27024 + }, + { + "epoch": 0.16072533066895042, + "grad_norm": 1.6703109741210938, + "learning_rate": 4.6880391878297836e-05, + "loss": 4.393, + "step": 27025 + }, + { + "epoch": 0.1607312779522314, + "grad_norm": 1.6725270748138428, + "learning_rate": 4.688016592354737e-05, + "loss": 5.2538, + "step": 27026 + }, + { + "epoch": 0.16073722523551243, + "grad_norm": 1.820046305656433, + "learning_rate": 4.687993996115876e-05, + "loss": 4.7337, + "step": 27027 + }, + { + "epoch": 0.16074317251879341, + "grad_norm": 1.7842957973480225, + "learning_rate": 4.6879713991132096e-05, + "loss": 4.8615, + "step": 27028 + }, + { + "epoch": 0.1607491198020744, + "grad_norm": 1.9226150512695312, + "learning_rate": 4.687948801346745e-05, + "loss": 4.3828, + "step": 27029 + }, + { + "epoch": 0.16075506708535542, + "grad_norm": 1.3625149726867676, + "learning_rate": 4.6879262028164895e-05, + "loss": 4.962, + "step": 27030 + }, + { + "epoch": 0.1607610143686364, + "grad_norm": 1.6589162349700928, + "learning_rate": 4.687903603522452e-05, + "loss": 4.373, + "step": 27031 + }, + { + "epoch": 0.1607669616519174, + "grad_norm": 1.5190513134002686, + "learning_rate": 4.6878810034646395e-05, + "loss": 5.3889, + "step": 27032 + }, + { + "epoch": 0.1607729089351984, + "grad_norm": 1.4899837970733643, + "learning_rate": 4.6878584026430604e-05, + "loss": 4.6972, + "step": 27033 + }, + { + "epoch": 0.1607788562184794, + "grad_norm": 1.7779310941696167, + "learning_rate": 4.6878358010577226e-05, + "loss": 5.0265, + "step": 27034 + }, + { + "epoch": 0.1607848035017604, + "grad_norm": 1.7755082845687866, + "learning_rate": 4.687813198708634e-05, + "loss": 4.7129, + "step": 27035 + }, + { + "epoch": 0.1607907507850414, + "grad_norm": 1.986676573753357, + "learning_rate": 4.6877905955958024e-05, + "loss": 4.5315, + "step": 27036 + }, + { + "epoch": 0.1607966980683224, + "grad_norm": 1.727644443511963, + "learning_rate": 4.687767991719235e-05, + "loss": 4.5498, + "step": 27037 + }, + { + "epoch": 0.16080264535160338, + "grad_norm": 1.936285138130188, + "learning_rate": 4.687745387078942e-05, + "loss": 4.2741, + "step": 27038 + }, + { + "epoch": 0.1608085926348844, + "grad_norm": 1.7781955003738403, + "learning_rate": 4.687722781674928e-05, + "loss": 5.0867, + "step": 27039 + }, + { + "epoch": 0.16081453991816538, + "grad_norm": 1.7659040689468384, + "learning_rate": 4.687700175507204e-05, + "loss": 5.2197, + "step": 27040 + }, + { + "epoch": 0.16082048720144637, + "grad_norm": 1.8074475526809692, + "learning_rate": 4.6876775685757755e-05, + "loss": 4.8669, + "step": 27041 + }, + { + "epoch": 0.1608264344847274, + "grad_norm": 1.8640440702438354, + "learning_rate": 4.687654960880652e-05, + "loss": 4.2379, + "step": 27042 + }, + { + "epoch": 0.16083238176800838, + "grad_norm": 2.278597831726074, + "learning_rate": 4.6876323524218405e-05, + "loss": 4.4334, + "step": 27043 + }, + { + "epoch": 0.16083832905128936, + "grad_norm": 1.7002289295196533, + "learning_rate": 4.6876097431993486e-05, + "loss": 4.9251, + "step": 27044 + }, + { + "epoch": 0.16084427633457038, + "grad_norm": 1.626347303390503, + "learning_rate": 4.687587133213186e-05, + "loss": 5.3526, + "step": 27045 + }, + { + "epoch": 0.16085022361785137, + "grad_norm": 1.6184710264205933, + "learning_rate": 4.687564522463358e-05, + "loss": 4.9963, + "step": 27046 + }, + { + "epoch": 0.16085617090113236, + "grad_norm": 1.9560445547103882, + "learning_rate": 4.687541910949874e-05, + "loss": 4.3859, + "step": 27047 + }, + { + "epoch": 0.16086211818441337, + "grad_norm": 1.8181273937225342, + "learning_rate": 4.687519298672743e-05, + "loss": 4.7349, + "step": 27048 + }, + { + "epoch": 0.16086806546769436, + "grad_norm": 1.76878023147583, + "learning_rate": 4.68749668563197e-05, + "loss": 4.6734, + "step": 27049 + }, + { + "epoch": 0.16087401275097535, + "grad_norm": 1.6105148792266846, + "learning_rate": 4.6874740718275655e-05, + "loss": 4.7374, + "step": 27050 + }, + { + "epoch": 0.16087996003425636, + "grad_norm": 1.7216439247131348, + "learning_rate": 4.687451457259536e-05, + "loss": 4.7108, + "step": 27051 + }, + { + "epoch": 0.16088590731753735, + "grad_norm": 1.591200828552246, + "learning_rate": 4.68742884192789e-05, + "loss": 4.8113, + "step": 27052 + }, + { + "epoch": 0.16089185460081834, + "grad_norm": 1.8275965452194214, + "learning_rate": 4.687406225832635e-05, + "loss": 4.765, + "step": 27053 + }, + { + "epoch": 0.16089780188409936, + "grad_norm": 1.796170949935913, + "learning_rate": 4.68738360897378e-05, + "loss": 4.5656, + "step": 27054 + }, + { + "epoch": 0.16090374916738034, + "grad_norm": 1.6721670627593994, + "learning_rate": 4.6873609913513307e-05, + "loss": 4.7761, + "step": 27055 + }, + { + "epoch": 0.16090969645066133, + "grad_norm": 1.577500820159912, + "learning_rate": 4.687338372965296e-05, + "loss": 4.6552, + "step": 27056 + }, + { + "epoch": 0.16091564373394235, + "grad_norm": 1.4649289846420288, + "learning_rate": 4.687315753815685e-05, + "loss": 4.7041, + "step": 27057 + }, + { + "epoch": 0.16092159101722334, + "grad_norm": 1.5088578462600708, + "learning_rate": 4.687293133902505e-05, + "loss": 4.9058, + "step": 27058 + }, + { + "epoch": 0.16092753830050432, + "grad_norm": 1.5987037420272827, + "learning_rate": 4.687270513225763e-05, + "loss": 4.6935, + "step": 27059 + }, + { + "epoch": 0.16093348558378534, + "grad_norm": 1.6780216693878174, + "learning_rate": 4.687247891785468e-05, + "loss": 4.6561, + "step": 27060 + }, + { + "epoch": 0.16093943286706633, + "grad_norm": 1.678200125694275, + "learning_rate": 4.6872252695816265e-05, + "loss": 4.7769, + "step": 27061 + }, + { + "epoch": 0.16094538015034732, + "grad_norm": 1.7499932050704956, + "learning_rate": 4.687202646614248e-05, + "loss": 4.8831, + "step": 27062 + }, + { + "epoch": 0.1609513274336283, + "grad_norm": 1.5174812078475952, + "learning_rate": 4.687180022883339e-05, + "loss": 5.3915, + "step": 27063 + }, + { + "epoch": 0.16095727471690932, + "grad_norm": 1.6853543519973755, + "learning_rate": 4.6871573983889084e-05, + "loss": 5.0194, + "step": 27064 + }, + { + "epoch": 0.1609632220001903, + "grad_norm": 1.590044379234314, + "learning_rate": 4.6871347731309634e-05, + "loss": 4.8239, + "step": 27065 + }, + { + "epoch": 0.1609691692834713, + "grad_norm": 1.6128438711166382, + "learning_rate": 4.6871121471095124e-05, + "loss": 4.418, + "step": 27066 + }, + { + "epoch": 0.1609751165667523, + "grad_norm": 1.5933514833450317, + "learning_rate": 4.6870895203245635e-05, + "loss": 4.5319, + "step": 27067 + }, + { + "epoch": 0.1609810638500333, + "grad_norm": 2.0290753841400146, + "learning_rate": 4.687066892776124e-05, + "loss": 4.2566, + "step": 27068 + }, + { + "epoch": 0.1609870111333143, + "grad_norm": 1.7339308261871338, + "learning_rate": 4.687044264464202e-05, + "loss": 4.7884, + "step": 27069 + }, + { + "epoch": 0.1609929584165953, + "grad_norm": 1.3594622611999512, + "learning_rate": 4.6870216353888056e-05, + "loss": 5.2241, + "step": 27070 + }, + { + "epoch": 0.1609989056998763, + "grad_norm": 1.599043607711792, + "learning_rate": 4.6869990055499424e-05, + "loss": 4.7043, + "step": 27071 + }, + { + "epoch": 0.16100485298315728, + "grad_norm": 1.6405742168426514, + "learning_rate": 4.686976374947621e-05, + "loss": 4.7731, + "step": 27072 + }, + { + "epoch": 0.1610108002664383, + "grad_norm": 1.544199824333191, + "learning_rate": 4.686953743581848e-05, + "loss": 4.3322, + "step": 27073 + }, + { + "epoch": 0.16101674754971929, + "grad_norm": 1.5622215270996094, + "learning_rate": 4.686931111452633e-05, + "loss": 4.4059, + "step": 27074 + }, + { + "epoch": 0.16102269483300027, + "grad_norm": 1.472733497619629, + "learning_rate": 4.6869084785599814e-05, + "loss": 4.5119, + "step": 27075 + }, + { + "epoch": 0.1610286421162813, + "grad_norm": 1.6917856931686401, + "learning_rate": 4.686885844903904e-05, + "loss": 4.4056, + "step": 27076 + }, + { + "epoch": 0.16103458939956228, + "grad_norm": 1.67365300655365, + "learning_rate": 4.6868632104844066e-05, + "loss": 4.6975, + "step": 27077 + }, + { + "epoch": 0.16104053668284327, + "grad_norm": 1.7588708400726318, + "learning_rate": 4.6868405753014974e-05, + "loss": 4.5234, + "step": 27078 + }, + { + "epoch": 0.16104648396612428, + "grad_norm": 1.703722596168518, + "learning_rate": 4.686817939355186e-05, + "loss": 4.8189, + "step": 27079 + }, + { + "epoch": 0.16105243124940527, + "grad_norm": 1.9225337505340576, + "learning_rate": 4.686795302645478e-05, + "loss": 4.6807, + "step": 27080 + }, + { + "epoch": 0.16105837853268626, + "grad_norm": 1.9755665063858032, + "learning_rate": 4.686772665172383e-05, + "loss": 4.6981, + "step": 27081 + }, + { + "epoch": 0.16106432581596727, + "grad_norm": 1.8112698793411255, + "learning_rate": 4.6867500269359084e-05, + "loss": 4.6576, + "step": 27082 + }, + { + "epoch": 0.16107027309924826, + "grad_norm": 1.5739562511444092, + "learning_rate": 4.686727387936062e-05, + "loss": 4.8203, + "step": 27083 + }, + { + "epoch": 0.16107622038252925, + "grad_norm": 1.6816823482513428, + "learning_rate": 4.686704748172851e-05, + "loss": 4.9051, + "step": 27084 + }, + { + "epoch": 0.16108216766581027, + "grad_norm": 1.9315879344940186, + "learning_rate": 4.6866821076462844e-05, + "loss": 4.9205, + "step": 27085 + }, + { + "epoch": 0.16108811494909125, + "grad_norm": 1.9262312650680542, + "learning_rate": 4.686659466356369e-05, + "loss": 4.8491, + "step": 27086 + }, + { + "epoch": 0.16109406223237224, + "grad_norm": 2.244142532348633, + "learning_rate": 4.686636824303114e-05, + "loss": 4.1662, + "step": 27087 + }, + { + "epoch": 0.16110000951565326, + "grad_norm": 1.8732181787490845, + "learning_rate": 4.6866141814865266e-05, + "loss": 4.6906, + "step": 27088 + }, + { + "epoch": 0.16110595679893425, + "grad_norm": 1.7964503765106201, + "learning_rate": 4.686591537906615e-05, + "loss": 4.8282, + "step": 27089 + }, + { + "epoch": 0.16111190408221523, + "grad_norm": 1.828946828842163, + "learning_rate": 4.686568893563387e-05, + "loss": 4.6226, + "step": 27090 + }, + { + "epoch": 0.16111785136549625, + "grad_norm": 1.6230894327163696, + "learning_rate": 4.68654624845685e-05, + "loss": 4.9008, + "step": 27091 + }, + { + "epoch": 0.16112379864877724, + "grad_norm": 1.7094733715057373, + "learning_rate": 4.686523602587012e-05, + "loss": 4.4854, + "step": 27092 + }, + { + "epoch": 0.16112974593205823, + "grad_norm": 1.5419751405715942, + "learning_rate": 4.6865009559538815e-05, + "loss": 4.7452, + "step": 27093 + }, + { + "epoch": 0.16113569321533924, + "grad_norm": 1.7994260787963867, + "learning_rate": 4.686478308557466e-05, + "loss": 4.798, + "step": 27094 + }, + { + "epoch": 0.16114164049862023, + "grad_norm": 1.5732755661010742, + "learning_rate": 4.6864556603977736e-05, + "loss": 5.0714, + "step": 27095 + }, + { + "epoch": 0.16114758778190122, + "grad_norm": 1.7569549083709717, + "learning_rate": 4.686433011474812e-05, + "loss": 5.1888, + "step": 27096 + }, + { + "epoch": 0.16115353506518224, + "grad_norm": 1.5478622913360596, + "learning_rate": 4.6864103617885895e-05, + "loss": 5.1684, + "step": 27097 + }, + { + "epoch": 0.16115948234846322, + "grad_norm": 1.80837082862854, + "learning_rate": 4.6863877113391136e-05, + "loss": 5.0916, + "step": 27098 + }, + { + "epoch": 0.1611654296317442, + "grad_norm": 1.6820951700210571, + "learning_rate": 4.686365060126392e-05, + "loss": 5.0685, + "step": 27099 + }, + { + "epoch": 0.16117137691502523, + "grad_norm": 1.6210129261016846, + "learning_rate": 4.686342408150434e-05, + "loss": 4.591, + "step": 27100 + }, + { + "epoch": 0.16117732419830622, + "grad_norm": 1.7377861738204956, + "learning_rate": 4.6863197554112455e-05, + "loss": 4.7656, + "step": 27101 + }, + { + "epoch": 0.1611832714815872, + "grad_norm": 1.5875985622406006, + "learning_rate": 4.686297101908835e-05, + "loss": 5.003, + "step": 27102 + }, + { + "epoch": 0.16118921876486822, + "grad_norm": 1.6775810718536377, + "learning_rate": 4.686274447643212e-05, + "loss": 5.269, + "step": 27103 + }, + { + "epoch": 0.1611951660481492, + "grad_norm": 1.7519687414169312, + "learning_rate": 4.6862517926143826e-05, + "loss": 5.3185, + "step": 27104 + }, + { + "epoch": 0.1612011133314302, + "grad_norm": 1.6947530508041382, + "learning_rate": 4.6862291368223554e-05, + "loss": 5.0105, + "step": 27105 + }, + { + "epoch": 0.1612070606147112, + "grad_norm": 1.6445891857147217, + "learning_rate": 4.686206480267138e-05, + "loss": 4.6697, + "step": 27106 + }, + { + "epoch": 0.1612130078979922, + "grad_norm": 1.7407753467559814, + "learning_rate": 4.6861838229487385e-05, + "loss": 4.6508, + "step": 27107 + }, + { + "epoch": 0.1612189551812732, + "grad_norm": 1.7013847827911377, + "learning_rate": 4.686161164867164e-05, + "loss": 4.6613, + "step": 27108 + }, + { + "epoch": 0.1612249024645542, + "grad_norm": 1.5500074625015259, + "learning_rate": 4.686138506022425e-05, + "loss": 4.5501, + "step": 27109 + }, + { + "epoch": 0.1612308497478352, + "grad_norm": 1.7138715982437134, + "learning_rate": 4.686115846414526e-05, + "loss": 5.1747, + "step": 27110 + }, + { + "epoch": 0.16123679703111618, + "grad_norm": 1.6952149868011475, + "learning_rate": 4.686093186043478e-05, + "loss": 5.6011, + "step": 27111 + }, + { + "epoch": 0.1612427443143972, + "grad_norm": 1.4229787588119507, + "learning_rate": 4.6860705249092864e-05, + "loss": 5.2581, + "step": 27112 + }, + { + "epoch": 0.16124869159767818, + "grad_norm": 1.5605623722076416, + "learning_rate": 4.68604786301196e-05, + "loss": 4.8483, + "step": 27113 + }, + { + "epoch": 0.16125463888095917, + "grad_norm": 1.7442682981491089, + "learning_rate": 4.686025200351508e-05, + "loss": 5.1217, + "step": 27114 + }, + { + "epoch": 0.1612605861642402, + "grad_norm": 1.8555563688278198, + "learning_rate": 4.6860025369279365e-05, + "loss": 4.8616, + "step": 27115 + }, + { + "epoch": 0.16126653344752118, + "grad_norm": 1.525015115737915, + "learning_rate": 4.685979872741254e-05, + "loss": 5.5315, + "step": 27116 + }, + { + "epoch": 0.16127248073080216, + "grad_norm": 1.656496524810791, + "learning_rate": 4.685957207791468e-05, + "loss": 5.081, + "step": 27117 + }, + { + "epoch": 0.16127842801408318, + "grad_norm": 1.717789649963379, + "learning_rate": 4.685934542078588e-05, + "loss": 5.0375, + "step": 27118 + }, + { + "epoch": 0.16128437529736417, + "grad_norm": 1.4504932165145874, + "learning_rate": 4.6859118756026205e-05, + "loss": 5.5946, + "step": 27119 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 1.7576172351837158, + "learning_rate": 4.685889208363573e-05, + "loss": 5.2869, + "step": 27120 + }, + { + "epoch": 0.16129626986392614, + "grad_norm": 1.7422624826431274, + "learning_rate": 4.685866540361456e-05, + "loss": 5.5119, + "step": 27121 + }, + { + "epoch": 0.16130221714720716, + "grad_norm": 1.8503597974777222, + "learning_rate": 4.685843871596274e-05, + "loss": 5.2748, + "step": 27122 + }, + { + "epoch": 0.16130816443048815, + "grad_norm": 1.4682457447052002, + "learning_rate": 4.685821202068037e-05, + "loss": 5.1808, + "step": 27123 + }, + { + "epoch": 0.16131411171376914, + "grad_norm": 1.6852977275848389, + "learning_rate": 4.685798531776752e-05, + "loss": 5.0024, + "step": 27124 + }, + { + "epoch": 0.16132005899705015, + "grad_norm": 1.3914788961410522, + "learning_rate": 4.6857758607224275e-05, + "loss": 5.6072, + "step": 27125 + }, + { + "epoch": 0.16132600628033114, + "grad_norm": 1.3304249048233032, + "learning_rate": 4.6857531889050716e-05, + "loss": 5.6519, + "step": 27126 + }, + { + "epoch": 0.16133195356361213, + "grad_norm": 1.4981189966201782, + "learning_rate": 4.6857305163246915e-05, + "loss": 5.377, + "step": 27127 + }, + { + "epoch": 0.16133790084689315, + "grad_norm": 1.6323606967926025, + "learning_rate": 4.685707842981295e-05, + "loss": 5.3525, + "step": 27128 + }, + { + "epoch": 0.16134384813017413, + "grad_norm": 1.7571280002593994, + "learning_rate": 4.685685168874892e-05, + "loss": 5.7243, + "step": 27129 + }, + { + "epoch": 0.16134979541345512, + "grad_norm": 1.4080052375793457, + "learning_rate": 4.685662494005487e-05, + "loss": 5.368, + "step": 27130 + }, + { + "epoch": 0.16135574269673614, + "grad_norm": 1.3173414468765259, + "learning_rate": 4.685639818373091e-05, + "loss": 5.6447, + "step": 27131 + }, + { + "epoch": 0.16136168998001713, + "grad_norm": 1.6236382722854614, + "learning_rate": 4.685617141977711e-05, + "loss": 5.4868, + "step": 27132 + }, + { + "epoch": 0.1613676372632981, + "grad_norm": 1.4955110549926758, + "learning_rate": 4.6855944648193535e-05, + "loss": 5.6484, + "step": 27133 + }, + { + "epoch": 0.16137358454657913, + "grad_norm": 1.408130407333374, + "learning_rate": 4.685571786898028e-05, + "loss": 5.4925, + "step": 27134 + }, + { + "epoch": 0.16137953182986012, + "grad_norm": 1.2188119888305664, + "learning_rate": 4.685549108213742e-05, + "loss": 5.459, + "step": 27135 + }, + { + "epoch": 0.1613854791131411, + "grad_norm": 1.5991405248641968, + "learning_rate": 4.685526428766503e-05, + "loss": 5.3962, + "step": 27136 + }, + { + "epoch": 0.16139142639642212, + "grad_norm": 1.3470097780227661, + "learning_rate": 4.68550374855632e-05, + "loss": 5.4446, + "step": 27137 + }, + { + "epoch": 0.1613973736797031, + "grad_norm": 1.439078450202942, + "learning_rate": 4.685481067583201e-05, + "loss": 5.5934, + "step": 27138 + }, + { + "epoch": 0.1614033209629841, + "grad_norm": 1.5195162296295166, + "learning_rate": 4.6854583858471514e-05, + "loss": 5.5948, + "step": 27139 + }, + { + "epoch": 0.16140926824626511, + "grad_norm": 1.3565785884857178, + "learning_rate": 4.6854357033481815e-05, + "loss": 5.4467, + "step": 27140 + }, + { + "epoch": 0.1614152155295461, + "grad_norm": 1.3213258981704712, + "learning_rate": 4.685413020086299e-05, + "loss": 4.7896, + "step": 27141 + }, + { + "epoch": 0.1614211628128271, + "grad_norm": 1.6580665111541748, + "learning_rate": 4.6853903360615106e-05, + "loss": 4.9581, + "step": 27142 + }, + { + "epoch": 0.1614271100961081, + "grad_norm": 1.5277694463729858, + "learning_rate": 4.685367651273825e-05, + "loss": 5.3508, + "step": 27143 + }, + { + "epoch": 0.1614330573793891, + "grad_norm": 1.6369842290878296, + "learning_rate": 4.685344965723251e-05, + "loss": 5.2761, + "step": 27144 + }, + { + "epoch": 0.16143900466267008, + "grad_norm": 1.6954752206802368, + "learning_rate": 4.685322279409795e-05, + "loss": 5.1258, + "step": 27145 + }, + { + "epoch": 0.1614449519459511, + "grad_norm": 1.5073758363723755, + "learning_rate": 4.6852995923334664e-05, + "loss": 5.2927, + "step": 27146 + }, + { + "epoch": 0.1614508992292321, + "grad_norm": 1.5817281007766724, + "learning_rate": 4.685276904494271e-05, + "loss": 5.208, + "step": 27147 + }, + { + "epoch": 0.16145684651251307, + "grad_norm": 1.4444465637207031, + "learning_rate": 4.685254215892219e-05, + "loss": 5.0568, + "step": 27148 + }, + { + "epoch": 0.1614627937957941, + "grad_norm": 1.6507529020309448, + "learning_rate": 4.6852315265273175e-05, + "loss": 4.4881, + "step": 27149 + }, + { + "epoch": 0.16146874107907508, + "grad_norm": 1.3630253076553345, + "learning_rate": 4.685208836399573e-05, + "loss": 4.4938, + "step": 27150 + }, + { + "epoch": 0.16147468836235607, + "grad_norm": 1.5907013416290283, + "learning_rate": 4.685186145508996e-05, + "loss": 4.6613, + "step": 27151 + }, + { + "epoch": 0.16148063564563708, + "grad_norm": 1.4582465887069702, + "learning_rate": 4.6851634538555925e-05, + "loss": 4.8144, + "step": 27152 + }, + { + "epoch": 0.16148658292891807, + "grad_norm": 1.5481383800506592, + "learning_rate": 4.685140761439371e-05, + "loss": 5.2502, + "step": 27153 + }, + { + "epoch": 0.16149253021219906, + "grad_norm": 1.523826003074646, + "learning_rate": 4.685118068260339e-05, + "loss": 5.6317, + "step": 27154 + }, + { + "epoch": 0.16149847749548008, + "grad_norm": 1.502137541770935, + "learning_rate": 4.6850953743185055e-05, + "loss": 5.12, + "step": 27155 + }, + { + "epoch": 0.16150442477876106, + "grad_norm": 1.5802767276763916, + "learning_rate": 4.6850726796138776e-05, + "loss": 4.8374, + "step": 27156 + }, + { + "epoch": 0.16151037206204205, + "grad_norm": 1.6513301134109497, + "learning_rate": 4.685049984146463e-05, + "loss": 5.0668, + "step": 27157 + }, + { + "epoch": 0.16151631934532307, + "grad_norm": 1.5628081560134888, + "learning_rate": 4.6850272879162714e-05, + "loss": 4.7497, + "step": 27158 + }, + { + "epoch": 0.16152226662860406, + "grad_norm": 1.4100914001464844, + "learning_rate": 4.685004590923308e-05, + "loss": 5.606, + "step": 27159 + }, + { + "epoch": 0.16152821391188504, + "grad_norm": 1.3457648754119873, + "learning_rate": 4.684981893167583e-05, + "loss": 5.5325, + "step": 27160 + }, + { + "epoch": 0.16153416119516606, + "grad_norm": 1.6010215282440186, + "learning_rate": 4.684959194649102e-05, + "loss": 5.5653, + "step": 27161 + }, + { + "epoch": 0.16154010847844705, + "grad_norm": 1.8687788248062134, + "learning_rate": 4.684936495367875e-05, + "loss": 5.2795, + "step": 27162 + }, + { + "epoch": 0.16154605576172804, + "grad_norm": 2.1888749599456787, + "learning_rate": 4.68491379532391e-05, + "loss": 5.0313, + "step": 27163 + }, + { + "epoch": 0.16155200304500905, + "grad_norm": 1.466637372970581, + "learning_rate": 4.684891094517214e-05, + "loss": 5.221, + "step": 27164 + }, + { + "epoch": 0.16155795032829004, + "grad_norm": 1.518754482269287, + "learning_rate": 4.684868392947794e-05, + "loss": 5.037, + "step": 27165 + }, + { + "epoch": 0.16156389761157103, + "grad_norm": 1.550714373588562, + "learning_rate": 4.68484569061566e-05, + "loss": 4.8986, + "step": 27166 + }, + { + "epoch": 0.16156984489485204, + "grad_norm": 1.5226268768310547, + "learning_rate": 4.6848229875208186e-05, + "loss": 5.2425, + "step": 27167 + }, + { + "epoch": 0.16157579217813303, + "grad_norm": 1.4854047298431396, + "learning_rate": 4.684800283663279e-05, + "loss": 5.0766, + "step": 27168 + }, + { + "epoch": 0.16158173946141402, + "grad_norm": 1.6625477075576782, + "learning_rate": 4.684777579043047e-05, + "loss": 5.1967, + "step": 27169 + }, + { + "epoch": 0.16158768674469504, + "grad_norm": 1.409916877746582, + "learning_rate": 4.684754873660132e-05, + "loss": 5.0735, + "step": 27170 + }, + { + "epoch": 0.16159363402797602, + "grad_norm": 1.4444838762283325, + "learning_rate": 4.684732167514542e-05, + "loss": 5.013, + "step": 27171 + }, + { + "epoch": 0.161599581311257, + "grad_norm": 1.5226528644561768, + "learning_rate": 4.684709460606284e-05, + "loss": 4.9328, + "step": 27172 + }, + { + "epoch": 0.16160552859453803, + "grad_norm": 1.7353004217147827, + "learning_rate": 4.6846867529353664e-05, + "loss": 4.9422, + "step": 27173 + }, + { + "epoch": 0.16161147587781902, + "grad_norm": 1.613166093826294, + "learning_rate": 4.6846640445017974e-05, + "loss": 5.0545, + "step": 27174 + }, + { + "epoch": 0.1616174231611, + "grad_norm": 1.7949568033218384, + "learning_rate": 4.684641335305585e-05, + "loss": 4.944, + "step": 27175 + }, + { + "epoch": 0.16162337044438102, + "grad_norm": 1.508563756942749, + "learning_rate": 4.684618625346737e-05, + "loss": 5.2551, + "step": 27176 + }, + { + "epoch": 0.161629317727662, + "grad_norm": 1.6090425252914429, + "learning_rate": 4.6845959146252605e-05, + "loss": 5.0839, + "step": 27177 + }, + { + "epoch": 0.161635265010943, + "grad_norm": 1.6595830917358398, + "learning_rate": 4.6845732031411646e-05, + "loss": 5.2307, + "step": 27178 + }, + { + "epoch": 0.16164121229422398, + "grad_norm": 1.787662386894226, + "learning_rate": 4.684550490894457e-05, + "loss": 5.3956, + "step": 27179 + }, + { + "epoch": 0.161647159577505, + "grad_norm": 1.5315039157867432, + "learning_rate": 4.684527777885145e-05, + "loss": 5.8196, + "step": 27180 + }, + { + "epoch": 0.161653106860786, + "grad_norm": 2.004093647003174, + "learning_rate": 4.684505064113238e-05, + "loss": 4.9922, + "step": 27181 + }, + { + "epoch": 0.16165905414406698, + "grad_norm": 1.655718445777893, + "learning_rate": 4.684482349578742e-05, + "loss": 5.0178, + "step": 27182 + }, + { + "epoch": 0.161665001427348, + "grad_norm": 1.721838116645813, + "learning_rate": 4.6844596342816654e-05, + "loss": 4.8412, + "step": 27183 + }, + { + "epoch": 0.16167094871062898, + "grad_norm": 1.6883397102355957, + "learning_rate": 4.684436918222017e-05, + "loss": 4.602, + "step": 27184 + }, + { + "epoch": 0.16167689599390997, + "grad_norm": 1.4376475811004639, + "learning_rate": 4.6844142013998035e-05, + "loss": 4.7408, + "step": 27185 + }, + { + "epoch": 0.16168284327719099, + "grad_norm": 1.5542229413986206, + "learning_rate": 4.684391483815035e-05, + "loss": 5.384, + "step": 27186 + }, + { + "epoch": 0.16168879056047197, + "grad_norm": 1.4321660995483398, + "learning_rate": 4.6843687654677163e-05, + "loss": 5.3393, + "step": 27187 + }, + { + "epoch": 0.16169473784375296, + "grad_norm": 1.7398759126663208, + "learning_rate": 4.684346046357858e-05, + "loss": 5.2492, + "step": 27188 + }, + { + "epoch": 0.16170068512703398, + "grad_norm": 1.802462100982666, + "learning_rate": 4.684323326485467e-05, + "loss": 5.8437, + "step": 27189 + }, + { + "epoch": 0.16170663241031497, + "grad_norm": 1.5931847095489502, + "learning_rate": 4.684300605850551e-05, + "loss": 5.6417, + "step": 27190 + }, + { + "epoch": 0.16171257969359595, + "grad_norm": 1.6900547742843628, + "learning_rate": 4.684277884453119e-05, + "loss": 4.4741, + "step": 27191 + }, + { + "epoch": 0.16171852697687697, + "grad_norm": 1.5422314405441284, + "learning_rate": 4.684255162293178e-05, + "loss": 4.5219, + "step": 27192 + }, + { + "epoch": 0.16172447426015796, + "grad_norm": 1.816253662109375, + "learning_rate": 4.6842324393707354e-05, + "loss": 4.5676, + "step": 27193 + }, + { + "epoch": 0.16173042154343895, + "grad_norm": 1.3935781717300415, + "learning_rate": 4.6842097156858e-05, + "loss": 5.5091, + "step": 27194 + }, + { + "epoch": 0.16173636882671996, + "grad_norm": 1.7103323936462402, + "learning_rate": 4.6841869912383794e-05, + "loss": 5.3831, + "step": 27195 + }, + { + "epoch": 0.16174231611000095, + "grad_norm": 1.4029678106307983, + "learning_rate": 4.6841642660284826e-05, + "loss": 5.2129, + "step": 27196 + }, + { + "epoch": 0.16174826339328194, + "grad_norm": 1.7814414501190186, + "learning_rate": 4.684141540056116e-05, + "loss": 5.3053, + "step": 27197 + }, + { + "epoch": 0.16175421067656295, + "grad_norm": 1.5314795970916748, + "learning_rate": 4.684118813321288e-05, + "loss": 5.3863, + "step": 27198 + }, + { + "epoch": 0.16176015795984394, + "grad_norm": 1.5359210968017578, + "learning_rate": 4.6840960858240065e-05, + "loss": 5.1326, + "step": 27199 + }, + { + "epoch": 0.16176610524312493, + "grad_norm": 1.5624679327011108, + "learning_rate": 4.68407335756428e-05, + "loss": 4.8275, + "step": 27200 + }, + { + "epoch": 0.16177205252640595, + "grad_norm": 1.4580225944519043, + "learning_rate": 4.6840506285421165e-05, + "loss": 4.8576, + "step": 27201 + }, + { + "epoch": 0.16177799980968693, + "grad_norm": 1.687788724899292, + "learning_rate": 4.684027898757523e-05, + "loss": 4.8731, + "step": 27202 + }, + { + "epoch": 0.16178394709296792, + "grad_norm": 1.882171869277954, + "learning_rate": 4.684005168210508e-05, + "loss": 4.8912, + "step": 27203 + }, + { + "epoch": 0.16178989437624894, + "grad_norm": 1.513374924659729, + "learning_rate": 4.6839824369010795e-05, + "loss": 5.2447, + "step": 27204 + }, + { + "epoch": 0.16179584165952993, + "grad_norm": 1.7321797609329224, + "learning_rate": 4.683959704829245e-05, + "loss": 5.0003, + "step": 27205 + }, + { + "epoch": 0.16180178894281091, + "grad_norm": 1.677239179611206, + "learning_rate": 4.683936971995013e-05, + "loss": 5.4732, + "step": 27206 + }, + { + "epoch": 0.16180773622609193, + "grad_norm": 1.615957498550415, + "learning_rate": 4.6839142383983906e-05, + "loss": 5.4448, + "step": 27207 + }, + { + "epoch": 0.16181368350937292, + "grad_norm": 1.4981861114501953, + "learning_rate": 4.6838915040393874e-05, + "loss": 5.4369, + "step": 27208 + }, + { + "epoch": 0.1618196307926539, + "grad_norm": 1.5658632516860962, + "learning_rate": 4.683868768918009e-05, + "loss": 5.474, + "step": 27209 + }, + { + "epoch": 0.16182557807593492, + "grad_norm": 1.469122052192688, + "learning_rate": 4.6838460330342646e-05, + "loss": 5.3001, + "step": 27210 + }, + { + "epoch": 0.1618315253592159, + "grad_norm": 1.5378628969192505, + "learning_rate": 4.683823296388163e-05, + "loss": 4.8535, + "step": 27211 + }, + { + "epoch": 0.1618374726424969, + "grad_norm": 1.6330792903900146, + "learning_rate": 4.6838005589797106e-05, + "loss": 4.812, + "step": 27212 + }, + { + "epoch": 0.16184341992577791, + "grad_norm": 1.89853036403656, + "learning_rate": 4.683777820808917e-05, + "loss": 5.1666, + "step": 27213 + }, + { + "epoch": 0.1618493672090589, + "grad_norm": 1.5161629915237427, + "learning_rate": 4.683755081875788e-05, + "loss": 5.1444, + "step": 27214 + }, + { + "epoch": 0.1618553144923399, + "grad_norm": 1.6083909273147583, + "learning_rate": 4.683732342180333e-05, + "loss": 5.1403, + "step": 27215 + }, + { + "epoch": 0.1618612617756209, + "grad_norm": 1.5731655359268188, + "learning_rate": 4.68370960172256e-05, + "loss": 5.1038, + "step": 27216 + }, + { + "epoch": 0.1618672090589019, + "grad_norm": 1.8221924304962158, + "learning_rate": 4.6836868605024756e-05, + "loss": 4.8889, + "step": 27217 + }, + { + "epoch": 0.16187315634218288, + "grad_norm": 1.7264484167099, + "learning_rate": 4.683664118520089e-05, + "loss": 5.2846, + "step": 27218 + }, + { + "epoch": 0.1618791036254639, + "grad_norm": 1.6429424285888672, + "learning_rate": 4.683641375775409e-05, + "loss": 5.1433, + "step": 27219 + }, + { + "epoch": 0.1618850509087449, + "grad_norm": 1.6444041728973389, + "learning_rate": 4.683618632268441e-05, + "loss": 5.7116, + "step": 27220 + }, + { + "epoch": 0.16189099819202588, + "grad_norm": 1.595996379852295, + "learning_rate": 4.683595887999195e-05, + "loss": 5.4419, + "step": 27221 + }, + { + "epoch": 0.1618969454753069, + "grad_norm": 1.489001989364624, + "learning_rate": 4.6835731429676776e-05, + "loss": 5.2004, + "step": 27222 + }, + { + "epoch": 0.16190289275858788, + "grad_norm": 1.6208230257034302, + "learning_rate": 4.683550397173898e-05, + "loss": 5.2405, + "step": 27223 + }, + { + "epoch": 0.16190884004186887, + "grad_norm": 1.7584507465362549, + "learning_rate": 4.683527650617863e-05, + "loss": 4.5921, + "step": 27224 + }, + { + "epoch": 0.16191478732514988, + "grad_norm": 1.8459594249725342, + "learning_rate": 4.683504903299581e-05, + "loss": 4.6269, + "step": 27225 + }, + { + "epoch": 0.16192073460843087, + "grad_norm": 2.055671453475952, + "learning_rate": 4.683482155219061e-05, + "loss": 4.8219, + "step": 27226 + }, + { + "epoch": 0.16192668189171186, + "grad_norm": 1.8772468566894531, + "learning_rate": 4.683459406376309e-05, + "loss": 4.9343, + "step": 27227 + }, + { + "epoch": 0.16193262917499288, + "grad_norm": 1.8033567667007446, + "learning_rate": 4.683436656771334e-05, + "loss": 4.5181, + "step": 27228 + }, + { + "epoch": 0.16193857645827386, + "grad_norm": 1.8112131357192993, + "learning_rate": 4.6834139064041436e-05, + "loss": 4.6479, + "step": 27229 + }, + { + "epoch": 0.16194452374155485, + "grad_norm": 1.958721399307251, + "learning_rate": 4.6833911552747466e-05, + "loss": 4.3747, + "step": 27230 + }, + { + "epoch": 0.16195047102483587, + "grad_norm": 1.9740078449249268, + "learning_rate": 4.683368403383151e-05, + "loss": 4.5357, + "step": 27231 + }, + { + "epoch": 0.16195641830811686, + "grad_norm": 1.8071064949035645, + "learning_rate": 4.683345650729362e-05, + "loss": 4.2025, + "step": 27232 + }, + { + "epoch": 0.16196236559139784, + "grad_norm": 2.067153215408325, + "learning_rate": 4.6833228973133914e-05, + "loss": 4.7599, + "step": 27233 + }, + { + "epoch": 0.16196831287467886, + "grad_norm": 2.219170570373535, + "learning_rate": 4.683300143135244e-05, + "loss": 4.8643, + "step": 27234 + }, + { + "epoch": 0.16197426015795985, + "grad_norm": 1.8077818155288696, + "learning_rate": 4.68327738819493e-05, + "loss": 4.9781, + "step": 27235 + }, + { + "epoch": 0.16198020744124084, + "grad_norm": 2.1170096397399902, + "learning_rate": 4.683254632492456e-05, + "loss": 4.5507, + "step": 27236 + }, + { + "epoch": 0.16198615472452182, + "grad_norm": 1.9441372156143188, + "learning_rate": 4.6832318760278306e-05, + "loss": 4.2419, + "step": 27237 + }, + { + "epoch": 0.16199210200780284, + "grad_norm": 2.261038064956665, + "learning_rate": 4.6832091188010615e-05, + "loss": 4.8287, + "step": 27238 + }, + { + "epoch": 0.16199804929108383, + "grad_norm": 1.906253457069397, + "learning_rate": 4.6831863608121565e-05, + "loss": 4.7154, + "step": 27239 + }, + { + "epoch": 0.16200399657436482, + "grad_norm": 1.7181471586227417, + "learning_rate": 4.683163602061124e-05, + "loss": 4.8286, + "step": 27240 + }, + { + "epoch": 0.16200994385764583, + "grad_norm": 1.6163973808288574, + "learning_rate": 4.683140842547971e-05, + "loss": 5.1988, + "step": 27241 + }, + { + "epoch": 0.16201589114092682, + "grad_norm": 1.8723608255386353, + "learning_rate": 4.6831180822727064e-05, + "loss": 4.6135, + "step": 27242 + }, + { + "epoch": 0.1620218384242078, + "grad_norm": 1.557589054107666, + "learning_rate": 4.683095321235338e-05, + "loss": 4.7632, + "step": 27243 + }, + { + "epoch": 0.16202778570748883, + "grad_norm": 1.3284127712249756, + "learning_rate": 4.683072559435873e-05, + "loss": 4.0683, + "step": 27244 + }, + { + "epoch": 0.1620337329907698, + "grad_norm": 1.5295307636260986, + "learning_rate": 4.68304979687432e-05, + "loss": 4.2219, + "step": 27245 + }, + { + "epoch": 0.1620396802740508, + "grad_norm": 2.0153698921203613, + "learning_rate": 4.683027033550687e-05, + "loss": 4.8334, + "step": 27246 + }, + { + "epoch": 0.16204562755733182, + "grad_norm": 1.3090236186981201, + "learning_rate": 4.683004269464983e-05, + "loss": 5.1588, + "step": 27247 + }, + { + "epoch": 0.1620515748406128, + "grad_norm": 1.4936387538909912, + "learning_rate": 4.6829815046172136e-05, + "loss": 5.2226, + "step": 27248 + }, + { + "epoch": 0.1620575221238938, + "grad_norm": 1.6028317213058472, + "learning_rate": 4.682958739007388e-05, + "loss": 5.0174, + "step": 27249 + }, + { + "epoch": 0.1620634694071748, + "grad_norm": 1.221101999282837, + "learning_rate": 4.6829359726355144e-05, + "loss": 5.3307, + "step": 27250 + }, + { + "epoch": 0.1620694166904558, + "grad_norm": 1.348512887954712, + "learning_rate": 4.6829132055016e-05, + "loss": 5.4754, + "step": 27251 + }, + { + "epoch": 0.16207536397373679, + "grad_norm": 1.506373643875122, + "learning_rate": 4.682890437605654e-05, + "loss": 5.0422, + "step": 27252 + }, + { + "epoch": 0.1620813112570178, + "grad_norm": 1.7753325700759888, + "learning_rate": 4.6828676689476825e-05, + "loss": 5.0218, + "step": 27253 + }, + { + "epoch": 0.1620872585402988, + "grad_norm": 1.5221372842788696, + "learning_rate": 4.6828448995276945e-05, + "loss": 5.1423, + "step": 27254 + }, + { + "epoch": 0.16209320582357978, + "grad_norm": 1.7772079706192017, + "learning_rate": 4.682822129345699e-05, + "loss": 4.8782, + "step": 27255 + }, + { + "epoch": 0.1620991531068608, + "grad_norm": 1.495651125907898, + "learning_rate": 4.6827993584017014e-05, + "loss": 5.2042, + "step": 27256 + }, + { + "epoch": 0.16210510039014178, + "grad_norm": 1.5901660919189453, + "learning_rate": 4.682776586695712e-05, + "loss": 5.5121, + "step": 27257 + }, + { + "epoch": 0.16211104767342277, + "grad_norm": 1.7442855834960938, + "learning_rate": 4.6827538142277373e-05, + "loss": 4.9278, + "step": 27258 + }, + { + "epoch": 0.16211699495670379, + "grad_norm": 2.777273416519165, + "learning_rate": 4.682731040997786e-05, + "loss": 4.6258, + "step": 27259 + }, + { + "epoch": 0.16212294223998477, + "grad_norm": 1.8470478057861328, + "learning_rate": 4.6827082670058655e-05, + "loss": 4.87, + "step": 27260 + }, + { + "epoch": 0.16212888952326576, + "grad_norm": 1.545902132987976, + "learning_rate": 4.6826854922519844e-05, + "loss": 4.8776, + "step": 27261 + }, + { + "epoch": 0.16213483680654678, + "grad_norm": 1.5720170736312866, + "learning_rate": 4.682662716736151e-05, + "loss": 4.9046, + "step": 27262 + }, + { + "epoch": 0.16214078408982777, + "grad_norm": 1.6243836879730225, + "learning_rate": 4.682639940458372e-05, + "loss": 5.0243, + "step": 27263 + }, + { + "epoch": 0.16214673137310875, + "grad_norm": 2.738065719604492, + "learning_rate": 4.682617163418656e-05, + "loss": 4.1899, + "step": 27264 + }, + { + "epoch": 0.16215267865638977, + "grad_norm": 4.745233058929443, + "learning_rate": 4.682594385617011e-05, + "loss": 3.0819, + "step": 27265 + }, + { + "epoch": 0.16215862593967076, + "grad_norm": 4.1978936195373535, + "learning_rate": 4.6825716070534444e-05, + "loss": 3.1755, + "step": 27266 + }, + { + "epoch": 0.16216457322295175, + "grad_norm": 2.8367183208465576, + "learning_rate": 4.682548827727965e-05, + "loss": 3.53, + "step": 27267 + }, + { + "epoch": 0.16217052050623276, + "grad_norm": 1.7866027355194092, + "learning_rate": 4.6825260476405805e-05, + "loss": 4.6173, + "step": 27268 + }, + { + "epoch": 0.16217646778951375, + "grad_norm": 1.7661093473434448, + "learning_rate": 4.6825032667912984e-05, + "loss": 5.0541, + "step": 27269 + }, + { + "epoch": 0.16218241507279474, + "grad_norm": 1.9146814346313477, + "learning_rate": 4.682480485180127e-05, + "loss": 4.9121, + "step": 27270 + }, + { + "epoch": 0.16218836235607575, + "grad_norm": 2.8185949325561523, + "learning_rate": 4.682457702807075e-05, + "loss": 2.9822, + "step": 27271 + }, + { + "epoch": 0.16219430963935674, + "grad_norm": 3.360478162765503, + "learning_rate": 4.682434919672148e-05, + "loss": 2.2526, + "step": 27272 + }, + { + "epoch": 0.16220025692263773, + "grad_norm": 3.5563254356384277, + "learning_rate": 4.682412135775357e-05, + "loss": 3.3203, + "step": 27273 + }, + { + "epoch": 0.16220620420591875, + "grad_norm": 2.84264874458313, + "learning_rate": 4.682389351116707e-05, + "loss": 3.1093, + "step": 27274 + }, + { + "epoch": 0.16221215148919974, + "grad_norm": 2.6400508880615234, + "learning_rate": 4.682366565696208e-05, + "loss": 4.1745, + "step": 27275 + }, + { + "epoch": 0.16221809877248072, + "grad_norm": 2.5986385345458984, + "learning_rate": 4.682343779513868e-05, + "loss": 5.5863, + "step": 27276 + }, + { + "epoch": 0.16222404605576174, + "grad_norm": 2.3456249237060547, + "learning_rate": 4.6823209925696945e-05, + "loss": 4.3825, + "step": 27277 + }, + { + "epoch": 0.16222999333904273, + "grad_norm": 1.909117341041565, + "learning_rate": 4.682298204863694e-05, + "loss": 4.9451, + "step": 27278 + }, + { + "epoch": 0.16223594062232372, + "grad_norm": 1.6204262971878052, + "learning_rate": 4.682275416395877e-05, + "loss": 5.0483, + "step": 27279 + }, + { + "epoch": 0.16224188790560473, + "grad_norm": 1.5689494609832764, + "learning_rate": 4.68225262716625e-05, + "loss": 5.0821, + "step": 27280 + }, + { + "epoch": 0.16224783518888572, + "grad_norm": 1.553642749786377, + "learning_rate": 4.682229837174821e-05, + "loss": 5.3247, + "step": 27281 + }, + { + "epoch": 0.1622537824721667, + "grad_norm": 2.1375479698181152, + "learning_rate": 4.682207046421597e-05, + "loss": 4.4596, + "step": 27282 + }, + { + "epoch": 0.16225972975544772, + "grad_norm": 2.6894989013671875, + "learning_rate": 4.682184254906589e-05, + "loss": 4.1466, + "step": 27283 + }, + { + "epoch": 0.1622656770387287, + "grad_norm": 2.0883328914642334, + "learning_rate": 4.6821614626298015e-05, + "loss": 4.1182, + "step": 27284 + }, + { + "epoch": 0.1622716243220097, + "grad_norm": 2.263207197189331, + "learning_rate": 4.6821386695912444e-05, + "loss": 4.1029, + "step": 27285 + }, + { + "epoch": 0.16227757160529072, + "grad_norm": 2.2623839378356934, + "learning_rate": 4.6821158757909255e-05, + "loss": 4.0745, + "step": 27286 + }, + { + "epoch": 0.1622835188885717, + "grad_norm": 1.7428866624832153, + "learning_rate": 4.682093081228852e-05, + "loss": 4.7707, + "step": 27287 + }, + { + "epoch": 0.1622894661718527, + "grad_norm": 2.0418710708618164, + "learning_rate": 4.682070285905033e-05, + "loss": 4.5464, + "step": 27288 + }, + { + "epoch": 0.1622954134551337, + "grad_norm": 2.421755313873291, + "learning_rate": 4.682047489819475e-05, + "loss": 3.9835, + "step": 27289 + }, + { + "epoch": 0.1623013607384147, + "grad_norm": 2.3179736137390137, + "learning_rate": 4.682024692972188e-05, + "loss": 3.8936, + "step": 27290 + }, + { + "epoch": 0.16230730802169568, + "grad_norm": 2.144463300704956, + "learning_rate": 4.682001895363177e-05, + "loss": 4.123, + "step": 27291 + }, + { + "epoch": 0.1623132553049767, + "grad_norm": 1.8054444789886475, + "learning_rate": 4.681979096992454e-05, + "loss": 4.5947, + "step": 27292 + }, + { + "epoch": 0.1623192025882577, + "grad_norm": 1.9559820890426636, + "learning_rate": 4.681956297860023e-05, + "loss": 4.1805, + "step": 27293 + }, + { + "epoch": 0.16232514987153868, + "grad_norm": 2.253756284713745, + "learning_rate": 4.6819334979658934e-05, + "loss": 3.7279, + "step": 27294 + }, + { + "epoch": 0.16233109715481966, + "grad_norm": 2.1193337440490723, + "learning_rate": 4.681910697310074e-05, + "loss": 3.646, + "step": 27295 + }, + { + "epoch": 0.16233704443810068, + "grad_norm": 2.2527666091918945, + "learning_rate": 4.681887895892572e-05, + "loss": 4.0891, + "step": 27296 + }, + { + "epoch": 0.16234299172138167, + "grad_norm": 2.255565643310547, + "learning_rate": 4.681865093713396e-05, + "loss": 3.8497, + "step": 27297 + }, + { + "epoch": 0.16234893900466266, + "grad_norm": 2.3153398036956787, + "learning_rate": 4.681842290772552e-05, + "loss": 3.787, + "step": 27298 + }, + { + "epoch": 0.16235488628794367, + "grad_norm": 2.7600228786468506, + "learning_rate": 4.681819487070051e-05, + "loss": 4.0376, + "step": 27299 + }, + { + "epoch": 0.16236083357122466, + "grad_norm": 1.8102682828903198, + "learning_rate": 4.681796682605898e-05, + "loss": 4.1901, + "step": 27300 + }, + { + "epoch": 0.16236678085450565, + "grad_norm": 2.125884771347046, + "learning_rate": 4.6817738773801035e-05, + "loss": 4.4809, + "step": 27301 + }, + { + "epoch": 0.16237272813778666, + "grad_norm": 2.308034658432007, + "learning_rate": 4.681751071392674e-05, + "loss": 3.7836, + "step": 27302 + }, + { + "epoch": 0.16237867542106765, + "grad_norm": 2.2197370529174805, + "learning_rate": 4.6817282646436166e-05, + "loss": 3.7033, + "step": 27303 + }, + { + "epoch": 0.16238462270434864, + "grad_norm": 1.7763569355010986, + "learning_rate": 4.681705457132942e-05, + "loss": 4.7483, + "step": 27304 + }, + { + "epoch": 0.16239056998762966, + "grad_norm": 2.2781457901000977, + "learning_rate": 4.681682648860656e-05, + "loss": 3.5617, + "step": 27305 + }, + { + "epoch": 0.16239651727091065, + "grad_norm": 2.257497787475586, + "learning_rate": 4.6816598398267664e-05, + "loss": 3.7756, + "step": 27306 + }, + { + "epoch": 0.16240246455419163, + "grad_norm": 2.277405261993408, + "learning_rate": 4.681637030031283e-05, + "loss": 3.6759, + "step": 27307 + }, + { + "epoch": 0.16240841183747265, + "grad_norm": 2.160238265991211, + "learning_rate": 4.681614219474212e-05, + "loss": 3.568, + "step": 27308 + }, + { + "epoch": 0.16241435912075364, + "grad_norm": 2.0068106651306152, + "learning_rate": 4.6815914081555624e-05, + "loss": 3.7039, + "step": 27309 + }, + { + "epoch": 0.16242030640403463, + "grad_norm": 3.0893945693969727, + "learning_rate": 4.681568596075341e-05, + "loss": 3.8708, + "step": 27310 + }, + { + "epoch": 0.16242625368731564, + "grad_norm": 2.5544440746307373, + "learning_rate": 4.681545783233557e-05, + "loss": 3.9529, + "step": 27311 + }, + { + "epoch": 0.16243220097059663, + "grad_norm": 1.7706321477890015, + "learning_rate": 4.681522969630218e-05, + "loss": 6.004, + "step": 27312 + }, + { + "epoch": 0.16243814825387762, + "grad_norm": 2.0155160427093506, + "learning_rate": 4.681500155265332e-05, + "loss": 4.1088, + "step": 27313 + }, + { + "epoch": 0.16244409553715863, + "grad_norm": 2.436854124069214, + "learning_rate": 4.681477340138907e-05, + "loss": 3.7281, + "step": 27314 + }, + { + "epoch": 0.16245004282043962, + "grad_norm": 1.7717199325561523, + "learning_rate": 4.68145452425095e-05, + "loss": 4.7058, + "step": 27315 + }, + { + "epoch": 0.1624559901037206, + "grad_norm": 1.8537521362304688, + "learning_rate": 4.6814317076014705e-05, + "loss": 5.5633, + "step": 27316 + }, + { + "epoch": 0.16246193738700163, + "grad_norm": 1.4485749006271362, + "learning_rate": 4.681408890190475e-05, + "loss": 6.1646, + "step": 27317 + }, + { + "epoch": 0.16246788467028261, + "grad_norm": 1.7619411945343018, + "learning_rate": 4.681386072017973e-05, + "loss": 4.9872, + "step": 27318 + }, + { + "epoch": 0.1624738319535636, + "grad_norm": 1.3868266344070435, + "learning_rate": 4.681363253083971e-05, + "loss": 5.337, + "step": 27319 + }, + { + "epoch": 0.16247977923684462, + "grad_norm": 2.339705467224121, + "learning_rate": 4.681340433388478e-05, + "loss": 4.1131, + "step": 27320 + }, + { + "epoch": 0.1624857265201256, + "grad_norm": 2.4623711109161377, + "learning_rate": 4.681317612931502e-05, + "loss": 4.0167, + "step": 27321 + }, + { + "epoch": 0.1624916738034066, + "grad_norm": 2.06557559967041, + "learning_rate": 4.68129479171305e-05, + "loss": 4.4482, + "step": 27322 + }, + { + "epoch": 0.1624976210866876, + "grad_norm": 1.9864208698272705, + "learning_rate": 4.681271969733131e-05, + "loss": 4.5421, + "step": 27323 + }, + { + "epoch": 0.1625035683699686, + "grad_norm": 2.29591703414917, + "learning_rate": 4.6812491469917516e-05, + "loss": 4.4407, + "step": 27324 + }, + { + "epoch": 0.1625095156532496, + "grad_norm": 1.9640796184539795, + "learning_rate": 4.681226323488921e-05, + "loss": 4.3113, + "step": 27325 + }, + { + "epoch": 0.1625154629365306, + "grad_norm": 1.6823822259902954, + "learning_rate": 4.6812034992246464e-05, + "loss": 5.3048, + "step": 27326 + }, + { + "epoch": 0.1625214102198116, + "grad_norm": 1.7765403985977173, + "learning_rate": 4.681180674198937e-05, + "loss": 4.7484, + "step": 27327 + }, + { + "epoch": 0.16252735750309258, + "grad_norm": 2.8496274948120117, + "learning_rate": 4.6811578484118e-05, + "loss": 2.9788, + "step": 27328 + }, + { + "epoch": 0.1625333047863736, + "grad_norm": 2.600203514099121, + "learning_rate": 4.681135021863243e-05, + "loss": 3.6706, + "step": 27329 + }, + { + "epoch": 0.16253925206965458, + "grad_norm": 2.3449292182922363, + "learning_rate": 4.681112194553274e-05, + "loss": 3.1501, + "step": 27330 + }, + { + "epoch": 0.16254519935293557, + "grad_norm": 2.6009342670440674, + "learning_rate": 4.681089366481902e-05, + "loss": 3.3097, + "step": 27331 + }, + { + "epoch": 0.1625511466362166, + "grad_norm": 2.4977009296417236, + "learning_rate": 4.681066537649134e-05, + "loss": 3.2114, + "step": 27332 + }, + { + "epoch": 0.16255709391949758, + "grad_norm": 1.9522204399108887, + "learning_rate": 4.681043708054978e-05, + "loss": 4.9502, + "step": 27333 + }, + { + "epoch": 0.16256304120277856, + "grad_norm": 1.8254719972610474, + "learning_rate": 4.6810208776994425e-05, + "loss": 5.1497, + "step": 27334 + }, + { + "epoch": 0.16256898848605958, + "grad_norm": 2.9470701217651367, + "learning_rate": 4.680998046582535e-05, + "loss": 3.1034, + "step": 27335 + }, + { + "epoch": 0.16257493576934057, + "grad_norm": 3.033200979232788, + "learning_rate": 4.680975214704263e-05, + "loss": 3.1627, + "step": 27336 + }, + { + "epoch": 0.16258088305262156, + "grad_norm": 2.9590744972229004, + "learning_rate": 4.680952382064636e-05, + "loss": 3.6219, + "step": 27337 + }, + { + "epoch": 0.16258683033590257, + "grad_norm": 1.759320616722107, + "learning_rate": 4.680929548663661e-05, + "loss": 5.0067, + "step": 27338 + }, + { + "epoch": 0.16259277761918356, + "grad_norm": 1.7571178674697876, + "learning_rate": 4.680906714501345e-05, + "loss": 4.9829, + "step": 27339 + }, + { + "epoch": 0.16259872490246455, + "grad_norm": 1.7212225198745728, + "learning_rate": 4.680883879577698e-05, + "loss": 4.854, + "step": 27340 + }, + { + "epoch": 0.16260467218574556, + "grad_norm": 1.732384204864502, + "learning_rate": 4.680861043892727e-05, + "loss": 4.7023, + "step": 27341 + }, + { + "epoch": 0.16261061946902655, + "grad_norm": 1.8678463697433472, + "learning_rate": 4.680838207446439e-05, + "loss": 5.3755, + "step": 27342 + }, + { + "epoch": 0.16261656675230754, + "grad_norm": 1.6973927021026611, + "learning_rate": 4.680815370238843e-05, + "loss": 4.678, + "step": 27343 + }, + { + "epoch": 0.16262251403558856, + "grad_norm": 1.6274856328964233, + "learning_rate": 4.680792532269948e-05, + "loss": 4.8053, + "step": 27344 + }, + { + "epoch": 0.16262846131886954, + "grad_norm": 1.6367913484573364, + "learning_rate": 4.6807696935397604e-05, + "loss": 4.8855, + "step": 27345 + }, + { + "epoch": 0.16263440860215053, + "grad_norm": 1.5021651983261108, + "learning_rate": 4.680746854048288e-05, + "loss": 5.318, + "step": 27346 + }, + { + "epoch": 0.16264035588543155, + "grad_norm": 1.329917073249817, + "learning_rate": 4.68072401379554e-05, + "loss": 6.0137, + "step": 27347 + }, + { + "epoch": 0.16264630316871254, + "grad_norm": 1.6316022872924805, + "learning_rate": 4.680701172781524e-05, + "loss": 5.8787, + "step": 27348 + }, + { + "epoch": 0.16265225045199352, + "grad_norm": 1.640479564666748, + "learning_rate": 4.6806783310062476e-05, + "loss": 5.568, + "step": 27349 + }, + { + "epoch": 0.16265819773527454, + "grad_norm": 1.6600250005722046, + "learning_rate": 4.680655488469718e-05, + "loss": 5.3461, + "step": 27350 + }, + { + "epoch": 0.16266414501855553, + "grad_norm": 1.7950623035430908, + "learning_rate": 4.680632645171945e-05, + "loss": 4.8529, + "step": 27351 + }, + { + "epoch": 0.16267009230183652, + "grad_norm": 1.732972502708435, + "learning_rate": 4.6806098011129356e-05, + "loss": 4.8085, + "step": 27352 + }, + { + "epoch": 0.1626760395851175, + "grad_norm": 1.7508574724197388, + "learning_rate": 4.680586956292698e-05, + "loss": 4.9188, + "step": 27353 + }, + { + "epoch": 0.16268198686839852, + "grad_norm": 1.521814227104187, + "learning_rate": 4.6805641107112395e-05, + "loss": 4.6616, + "step": 27354 + }, + { + "epoch": 0.1626879341516795, + "grad_norm": 1.7594850063323975, + "learning_rate": 4.6805412643685684e-05, + "loss": 4.6634, + "step": 27355 + }, + { + "epoch": 0.1626938814349605, + "grad_norm": 1.5281226634979248, + "learning_rate": 4.6805184172646944e-05, + "loss": 5.0508, + "step": 27356 + }, + { + "epoch": 0.1626998287182415, + "grad_norm": 1.3342808485031128, + "learning_rate": 4.6804955693996225e-05, + "loss": 5.605, + "step": 27357 + }, + { + "epoch": 0.1627057760015225, + "grad_norm": 1.5639429092407227, + "learning_rate": 4.680472720773362e-05, + "loss": 5.0959, + "step": 27358 + }, + { + "epoch": 0.1627117232848035, + "grad_norm": 1.661442756652832, + "learning_rate": 4.680449871385922e-05, + "loss": 4.8981, + "step": 27359 + }, + { + "epoch": 0.1627176705680845, + "grad_norm": 1.601442813873291, + "learning_rate": 4.6804270212373094e-05, + "loss": 4.8313, + "step": 27360 + }, + { + "epoch": 0.1627236178513655, + "grad_norm": 1.5367902517318726, + "learning_rate": 4.6804041703275315e-05, + "loss": 4.8772, + "step": 27361 + }, + { + "epoch": 0.16272956513464648, + "grad_norm": 1.5161237716674805, + "learning_rate": 4.680381318656597e-05, + "loss": 4.7877, + "step": 27362 + }, + { + "epoch": 0.1627355124179275, + "grad_norm": 1.790384292602539, + "learning_rate": 4.680358466224515e-05, + "loss": 5.2596, + "step": 27363 + }, + { + "epoch": 0.16274145970120849, + "grad_norm": 1.6441622972488403, + "learning_rate": 4.6803356130312915e-05, + "loss": 5.3774, + "step": 27364 + }, + { + "epoch": 0.16274740698448947, + "grad_norm": 1.4816210269927979, + "learning_rate": 4.680312759076935e-05, + "loss": 5.4754, + "step": 27365 + }, + { + "epoch": 0.1627533542677705, + "grad_norm": 1.5345895290374756, + "learning_rate": 4.680289904361454e-05, + "loss": 5.2805, + "step": 27366 + }, + { + "epoch": 0.16275930155105148, + "grad_norm": 1.3760472536087036, + "learning_rate": 4.680267048884857e-05, + "loss": 5.327, + "step": 27367 + }, + { + "epoch": 0.16276524883433247, + "grad_norm": 2.4343063831329346, + "learning_rate": 4.680244192647151e-05, + "loss": 4.8059, + "step": 27368 + }, + { + "epoch": 0.16277119611761348, + "grad_norm": 2.8197708129882812, + "learning_rate": 4.6802213356483444e-05, + "loss": 4.1087, + "step": 27369 + }, + { + "epoch": 0.16277714340089447, + "grad_norm": 3.0709099769592285, + "learning_rate": 4.680198477888445e-05, + "loss": 4.1441, + "step": 27370 + }, + { + "epoch": 0.16278309068417546, + "grad_norm": 2.8608505725860596, + "learning_rate": 4.680175619367461e-05, + "loss": 4.3136, + "step": 27371 + }, + { + "epoch": 0.16278903796745647, + "grad_norm": 2.9403672218322754, + "learning_rate": 4.6801527600854e-05, + "loss": 3.903, + "step": 27372 + }, + { + "epoch": 0.16279498525073746, + "grad_norm": 1.7551895380020142, + "learning_rate": 4.6801299000422696e-05, + "loss": 5.0392, + "step": 27373 + }, + { + "epoch": 0.16280093253401845, + "grad_norm": 1.862855076789856, + "learning_rate": 4.680107039238079e-05, + "loss": 4.712, + "step": 27374 + }, + { + "epoch": 0.16280687981729947, + "grad_norm": 1.6673380136489868, + "learning_rate": 4.680084177672835e-05, + "loss": 5.1954, + "step": 27375 + }, + { + "epoch": 0.16281282710058045, + "grad_norm": 1.3807284832000732, + "learning_rate": 4.680061315346547e-05, + "loss": 5.7525, + "step": 27376 + }, + { + "epoch": 0.16281877438386144, + "grad_norm": 1.6106042861938477, + "learning_rate": 4.680038452259222e-05, + "loss": 6.1879, + "step": 27377 + }, + { + "epoch": 0.16282472166714246, + "grad_norm": 1.3592698574066162, + "learning_rate": 4.6800155884108674e-05, + "loss": 5.725, + "step": 27378 + }, + { + "epoch": 0.16283066895042345, + "grad_norm": 1.7938450574874878, + "learning_rate": 4.679992723801493e-05, + "loss": 4.8694, + "step": 27379 + }, + { + "epoch": 0.16283661623370443, + "grad_norm": 2.0678904056549072, + "learning_rate": 4.679969858431105e-05, + "loss": 5.0753, + "step": 27380 + }, + { + "epoch": 0.16284256351698545, + "grad_norm": 2.147873640060425, + "learning_rate": 4.679946992299712e-05, + "loss": 5.2131, + "step": 27381 + }, + { + "epoch": 0.16284851080026644, + "grad_norm": 1.7163617610931396, + "learning_rate": 4.679924125407322e-05, + "loss": 5.2478, + "step": 27382 + }, + { + "epoch": 0.16285445808354743, + "grad_norm": 2.040842056274414, + "learning_rate": 4.679901257753943e-05, + "loss": 5.2402, + "step": 27383 + }, + { + "epoch": 0.16286040536682844, + "grad_norm": 1.8307139873504639, + "learning_rate": 4.6798783893395834e-05, + "loss": 4.5761, + "step": 27384 + }, + { + "epoch": 0.16286635265010943, + "grad_norm": 1.4522336721420288, + "learning_rate": 4.67985552016425e-05, + "loss": 4.7127, + "step": 27385 + }, + { + "epoch": 0.16287229993339042, + "grad_norm": 1.8996527194976807, + "learning_rate": 4.679832650227952e-05, + "loss": 4.7754, + "step": 27386 + }, + { + "epoch": 0.16287824721667143, + "grad_norm": 2.1785221099853516, + "learning_rate": 4.679809779530697e-05, + "loss": 4.9305, + "step": 27387 + }, + { + "epoch": 0.16288419449995242, + "grad_norm": 2.266005754470825, + "learning_rate": 4.679786908072493e-05, + "loss": 5.1013, + "step": 27388 + }, + { + "epoch": 0.1628901417832334, + "grad_norm": 2.08335542678833, + "learning_rate": 4.679764035853348e-05, + "loss": 5.0172, + "step": 27389 + }, + { + "epoch": 0.16289608906651443, + "grad_norm": 2.1042888164520264, + "learning_rate": 4.679741162873269e-05, + "loss": 5.0088, + "step": 27390 + }, + { + "epoch": 0.16290203634979541, + "grad_norm": 2.0641071796417236, + "learning_rate": 4.679718289132266e-05, + "loss": 4.9374, + "step": 27391 + }, + { + "epoch": 0.1629079836330764, + "grad_norm": 1.855651617050171, + "learning_rate": 4.6796954146303454e-05, + "loss": 5.0419, + "step": 27392 + }, + { + "epoch": 0.16291393091635742, + "grad_norm": 1.8837964534759521, + "learning_rate": 4.679672539367516e-05, + "loss": 5.0203, + "step": 27393 + }, + { + "epoch": 0.1629198781996384, + "grad_norm": 1.9748656749725342, + "learning_rate": 4.679649663343785e-05, + "loss": 5.0305, + "step": 27394 + }, + { + "epoch": 0.1629258254829194, + "grad_norm": 2.2613768577575684, + "learning_rate": 4.67962678655916e-05, + "loss": 4.9047, + "step": 27395 + }, + { + "epoch": 0.1629317727662004, + "grad_norm": 1.583208441734314, + "learning_rate": 4.6796039090136514e-05, + "loss": 4.6715, + "step": 27396 + }, + { + "epoch": 0.1629377200494814, + "grad_norm": 1.6698166131973267, + "learning_rate": 4.679581030707265e-05, + "loss": 5.3792, + "step": 27397 + }, + { + "epoch": 0.1629436673327624, + "grad_norm": 1.778937816619873, + "learning_rate": 4.679558151640009e-05, + "loss": 5.682, + "step": 27398 + }, + { + "epoch": 0.1629496146160434, + "grad_norm": 1.7441314458847046, + "learning_rate": 4.679535271811892e-05, + "loss": 5.2928, + "step": 27399 + }, + { + "epoch": 0.1629555618993244, + "grad_norm": 2.2535476684570312, + "learning_rate": 4.679512391222922e-05, + "loss": 4.9041, + "step": 27400 + }, + { + "epoch": 0.16296150918260538, + "grad_norm": 2.237154483795166, + "learning_rate": 4.679489509873106e-05, + "loss": 4.8852, + "step": 27401 + }, + { + "epoch": 0.1629674564658864, + "grad_norm": 1.7429604530334473, + "learning_rate": 4.679466627762454e-05, + "loss": 4.7548, + "step": 27402 + }, + { + "epoch": 0.16297340374916738, + "grad_norm": 2.02030086517334, + "learning_rate": 4.6794437448909723e-05, + "loss": 4.8708, + "step": 27403 + }, + { + "epoch": 0.16297935103244837, + "grad_norm": 1.5148401260375977, + "learning_rate": 4.6794208612586684e-05, + "loss": 4.8774, + "step": 27404 + }, + { + "epoch": 0.1629852983157294, + "grad_norm": 1.9291085004806519, + "learning_rate": 4.679397976865552e-05, + "loss": 4.7936, + "step": 27405 + }, + { + "epoch": 0.16299124559901038, + "grad_norm": 2.0261623859405518, + "learning_rate": 4.67937509171163e-05, + "loss": 4.5639, + "step": 27406 + }, + { + "epoch": 0.16299719288229136, + "grad_norm": 2.1595592498779297, + "learning_rate": 4.679352205796911e-05, + "loss": 4.7767, + "step": 27407 + }, + { + "epoch": 0.16300314016557238, + "grad_norm": 1.7030655145645142, + "learning_rate": 4.679329319121403e-05, + "loss": 4.9251, + "step": 27408 + }, + { + "epoch": 0.16300908744885337, + "grad_norm": 1.5864980220794678, + "learning_rate": 4.679306431685112e-05, + "loss": 5.0048, + "step": 27409 + }, + { + "epoch": 0.16301503473213436, + "grad_norm": 1.695307970046997, + "learning_rate": 4.679283543488049e-05, + "loss": 5.1882, + "step": 27410 + }, + { + "epoch": 0.16302098201541534, + "grad_norm": 1.4839437007904053, + "learning_rate": 4.6792606545302206e-05, + "loss": 5.3838, + "step": 27411 + }, + { + "epoch": 0.16302692929869636, + "grad_norm": 1.883641242980957, + "learning_rate": 4.6792377648116346e-05, + "loss": 4.9213, + "step": 27412 + }, + { + "epoch": 0.16303287658197735, + "grad_norm": 2.2560174465179443, + "learning_rate": 4.6792148743322985e-05, + "loss": 4.2573, + "step": 27413 + }, + { + "epoch": 0.16303882386525834, + "grad_norm": 2.452279567718506, + "learning_rate": 4.6791919830922225e-05, + "loss": 4.526, + "step": 27414 + }, + { + "epoch": 0.16304477114853935, + "grad_norm": 2.429499387741089, + "learning_rate": 4.679169091091412e-05, + "loss": 4.1269, + "step": 27415 + }, + { + "epoch": 0.16305071843182034, + "grad_norm": 1.7020376920700073, + "learning_rate": 4.6791461983298764e-05, + "loss": 5.367, + "step": 27416 + }, + { + "epoch": 0.16305666571510133, + "grad_norm": 1.6802117824554443, + "learning_rate": 4.679123304807623e-05, + "loss": 5.628, + "step": 27417 + }, + { + "epoch": 0.16306261299838234, + "grad_norm": 1.5536737442016602, + "learning_rate": 4.6791004105246606e-05, + "loss": 4.4013, + "step": 27418 + }, + { + "epoch": 0.16306856028166333, + "grad_norm": 1.6626231670379639, + "learning_rate": 4.6790775154809966e-05, + "loss": 5.1377, + "step": 27419 + }, + { + "epoch": 0.16307450756494432, + "grad_norm": 1.4954432249069214, + "learning_rate": 4.6790546196766395e-05, + "loss": 4.8278, + "step": 27420 + }, + { + "epoch": 0.16308045484822534, + "grad_norm": 2.2759921550750732, + "learning_rate": 4.679031723111597e-05, + "loss": 4.0856, + "step": 27421 + }, + { + "epoch": 0.16308640213150633, + "grad_norm": 2.298222541809082, + "learning_rate": 4.679008825785877e-05, + "loss": 4.169, + "step": 27422 + }, + { + "epoch": 0.1630923494147873, + "grad_norm": 2.435786247253418, + "learning_rate": 4.678985927699486e-05, + "loss": 3.9992, + "step": 27423 + }, + { + "epoch": 0.16309829669806833, + "grad_norm": 2.273677110671997, + "learning_rate": 4.678963028852436e-05, + "loss": 3.689, + "step": 27424 + }, + { + "epoch": 0.16310424398134932, + "grad_norm": 2.1706488132476807, + "learning_rate": 4.6789401292447306e-05, + "loss": 3.7752, + "step": 27425 + }, + { + "epoch": 0.1631101912646303, + "grad_norm": 1.7838464975357056, + "learning_rate": 4.6789172288763804e-05, + "loss": 4.863, + "step": 27426 + }, + { + "epoch": 0.16311613854791132, + "grad_norm": 2.0465335845947266, + "learning_rate": 4.678894327747393e-05, + "loss": 4.8415, + "step": 27427 + }, + { + "epoch": 0.1631220858311923, + "grad_norm": 2.5023603439331055, + "learning_rate": 4.678871425857775e-05, + "loss": 3.8268, + "step": 27428 + }, + { + "epoch": 0.1631280331144733, + "grad_norm": 3.1593286991119385, + "learning_rate": 4.6788485232075366e-05, + "loss": 3.8232, + "step": 27429 + }, + { + "epoch": 0.1631339803977543, + "grad_norm": 2.5644307136535645, + "learning_rate": 4.6788256197966847e-05, + "loss": 3.4984, + "step": 27430 + }, + { + "epoch": 0.1631399276810353, + "grad_norm": 2.0135555267333984, + "learning_rate": 4.678802715625227e-05, + "loss": 4.1888, + "step": 27431 + }, + { + "epoch": 0.1631458749643163, + "grad_norm": 2.4584031105041504, + "learning_rate": 4.678779810693171e-05, + "loss": 4.2168, + "step": 27432 + }, + { + "epoch": 0.1631518222475973, + "grad_norm": 3.071559429168701, + "learning_rate": 4.678756905000526e-05, + "loss": 4.191, + "step": 27433 + }, + { + "epoch": 0.1631577695308783, + "grad_norm": 2.8028981685638428, + "learning_rate": 4.6787339985473e-05, + "loss": 3.9579, + "step": 27434 + }, + { + "epoch": 0.16316371681415928, + "grad_norm": 1.8563295602798462, + "learning_rate": 4.6787110913335006e-05, + "loss": 4.7058, + "step": 27435 + }, + { + "epoch": 0.1631696640974403, + "grad_norm": 1.576141357421875, + "learning_rate": 4.678688183359135e-05, + "loss": 5.2126, + "step": 27436 + }, + { + "epoch": 0.16317561138072129, + "grad_norm": 1.715032935142517, + "learning_rate": 4.6786652746242124e-05, + "loss": 5.1945, + "step": 27437 + }, + { + "epoch": 0.16318155866400227, + "grad_norm": 1.5476752519607544, + "learning_rate": 4.67864236512874e-05, + "loss": 5.523, + "step": 27438 + }, + { + "epoch": 0.1631875059472833, + "grad_norm": 1.4861894845962524, + "learning_rate": 4.6786194548727255e-05, + "loss": 5.4119, + "step": 27439 + }, + { + "epoch": 0.16319345323056428, + "grad_norm": 1.3097593784332275, + "learning_rate": 4.6785965438561784e-05, + "loss": 5.4008, + "step": 27440 + }, + { + "epoch": 0.16319940051384527, + "grad_norm": 1.733404517173767, + "learning_rate": 4.678573632079105e-05, + "loss": 4.4261, + "step": 27441 + }, + { + "epoch": 0.16320534779712628, + "grad_norm": 1.4431440830230713, + "learning_rate": 4.678550719541514e-05, + "loss": 3.8523, + "step": 27442 + }, + { + "epoch": 0.16321129508040727, + "grad_norm": 1.5869112014770508, + "learning_rate": 4.678527806243415e-05, + "loss": 5.0346, + "step": 27443 + }, + { + "epoch": 0.16321724236368826, + "grad_norm": 1.7510712146759033, + "learning_rate": 4.6785048921848127e-05, + "loss": 5.2022, + "step": 27444 + }, + { + "epoch": 0.16322318964696927, + "grad_norm": 2.5091726779937744, + "learning_rate": 4.678481977365717e-05, + "loss": 4.3526, + "step": 27445 + }, + { + "epoch": 0.16322913693025026, + "grad_norm": 2.355930805206299, + "learning_rate": 4.6784590617861365e-05, + "loss": 3.9097, + "step": 27446 + }, + { + "epoch": 0.16323508421353125, + "grad_norm": 2.104262113571167, + "learning_rate": 4.678436145446078e-05, + "loss": 3.9491, + "step": 27447 + }, + { + "epoch": 0.16324103149681227, + "grad_norm": 2.6814212799072266, + "learning_rate": 4.678413228345551e-05, + "loss": 3.9986, + "step": 27448 + }, + { + "epoch": 0.16324697878009325, + "grad_norm": 2.017530679702759, + "learning_rate": 4.678390310484561e-05, + "loss": 4.0997, + "step": 27449 + }, + { + "epoch": 0.16325292606337424, + "grad_norm": 2.437260389328003, + "learning_rate": 4.6783673918631175e-05, + "loss": 4.2466, + "step": 27450 + }, + { + "epoch": 0.16325887334665526, + "grad_norm": 2.4225821495056152, + "learning_rate": 4.67834447248123e-05, + "loss": 4.0411, + "step": 27451 + }, + { + "epoch": 0.16326482062993625, + "grad_norm": 1.833397388458252, + "learning_rate": 4.6783215523389035e-05, + "loss": 4.5873, + "step": 27452 + }, + { + "epoch": 0.16327076791321724, + "grad_norm": 1.7432091236114502, + "learning_rate": 4.6782986314361477e-05, + "loss": 5.3351, + "step": 27453 + }, + { + "epoch": 0.16327671519649825, + "grad_norm": 1.8234552145004272, + "learning_rate": 4.6782757097729704e-05, + "loss": 5.3769, + "step": 27454 + }, + { + "epoch": 0.16328266247977924, + "grad_norm": 1.7435389757156372, + "learning_rate": 4.67825278734938e-05, + "loss": 4.6875, + "step": 27455 + }, + { + "epoch": 0.16328860976306023, + "grad_norm": 2.265040874481201, + "learning_rate": 4.678229864165383e-05, + "loss": 4.6138, + "step": 27456 + }, + { + "epoch": 0.16329455704634124, + "grad_norm": 2.105421304702759, + "learning_rate": 4.678206940220989e-05, + "loss": 4.7799, + "step": 27457 + }, + { + "epoch": 0.16330050432962223, + "grad_norm": 1.9669932126998901, + "learning_rate": 4.678184015516206e-05, + "loss": 4.3826, + "step": 27458 + }, + { + "epoch": 0.16330645161290322, + "grad_norm": 2.2020108699798584, + "learning_rate": 4.6781610900510406e-05, + "loss": 4.7784, + "step": 27459 + }, + { + "epoch": 0.16331239889618424, + "grad_norm": 2.0246944427490234, + "learning_rate": 4.678138163825503e-05, + "loss": 4.5324, + "step": 27460 + }, + { + "epoch": 0.16331834617946522, + "grad_norm": 2.0522918701171875, + "learning_rate": 4.678115236839599e-05, + "loss": 4.1903, + "step": 27461 + }, + { + "epoch": 0.1633242934627462, + "grad_norm": 2.0524399280548096, + "learning_rate": 4.678092309093337e-05, + "loss": 4.5542, + "step": 27462 + }, + { + "epoch": 0.16333024074602723, + "grad_norm": 2.0562379360198975, + "learning_rate": 4.678069380586726e-05, + "loss": 4.6572, + "step": 27463 + }, + { + "epoch": 0.16333618802930822, + "grad_norm": 1.931517481803894, + "learning_rate": 4.678046451319774e-05, + "loss": 4.3204, + "step": 27464 + }, + { + "epoch": 0.1633421353125892, + "grad_norm": 1.852124810218811, + "learning_rate": 4.678023521292487e-05, + "loss": 4.5307, + "step": 27465 + }, + { + "epoch": 0.16334808259587022, + "grad_norm": 1.690384030342102, + "learning_rate": 4.6780005905048764e-05, + "loss": 5.1771, + "step": 27466 + }, + { + "epoch": 0.1633540298791512, + "grad_norm": 1.7573405504226685, + "learning_rate": 4.6779776589569466e-05, + "loss": 4.894, + "step": 27467 + }, + { + "epoch": 0.1633599771624322, + "grad_norm": 2.139704942703247, + "learning_rate": 4.677954726648708e-05, + "loss": 4.7212, + "step": 27468 + }, + { + "epoch": 0.1633659244457132, + "grad_norm": 1.9621661901474, + "learning_rate": 4.677931793580168e-05, + "loss": 4.6083, + "step": 27469 + }, + { + "epoch": 0.1633718717289942, + "grad_norm": 1.9202685356140137, + "learning_rate": 4.6779088597513346e-05, + "loss": 5.3296, + "step": 27470 + }, + { + "epoch": 0.1633778190122752, + "grad_norm": 1.6269041299819946, + "learning_rate": 4.677885925162216e-05, + "loss": 5.4541, + "step": 27471 + }, + { + "epoch": 0.16338376629555618, + "grad_norm": 1.928564190864563, + "learning_rate": 4.677862989812819e-05, + "loss": 4.8419, + "step": 27472 + }, + { + "epoch": 0.1633897135788372, + "grad_norm": 2.1393957138061523, + "learning_rate": 4.677840053703153e-05, + "loss": 4.5768, + "step": 27473 + }, + { + "epoch": 0.16339566086211818, + "grad_norm": 2.2332470417022705, + "learning_rate": 4.677817116833225e-05, + "loss": 4.7571, + "step": 27474 + }, + { + "epoch": 0.16340160814539917, + "grad_norm": 1.7523399591445923, + "learning_rate": 4.6777941792030446e-05, + "loss": 5.0372, + "step": 27475 + }, + { + "epoch": 0.16340755542868018, + "grad_norm": 1.5460946559906006, + "learning_rate": 4.677771240812619e-05, + "loss": 5.1194, + "step": 27476 + }, + { + "epoch": 0.16341350271196117, + "grad_norm": 1.6920409202575684, + "learning_rate": 4.677748301661954e-05, + "loss": 5.0852, + "step": 27477 + }, + { + "epoch": 0.16341944999524216, + "grad_norm": 1.5086921453475952, + "learning_rate": 4.677725361751061e-05, + "loss": 5.2414, + "step": 27478 + }, + { + "epoch": 0.16342539727852318, + "grad_norm": 1.4637200832366943, + "learning_rate": 4.6777024210799465e-05, + "loss": 4.9873, + "step": 27479 + }, + { + "epoch": 0.16343134456180416, + "grad_norm": 1.6477910280227661, + "learning_rate": 4.677679479648618e-05, + "loss": 5.2834, + "step": 27480 + }, + { + "epoch": 0.16343729184508515, + "grad_norm": 1.7025471925735474, + "learning_rate": 4.6776565374570844e-05, + "loss": 5.3655, + "step": 27481 + }, + { + "epoch": 0.16344323912836617, + "grad_norm": 1.8360841274261475, + "learning_rate": 4.677633594505354e-05, + "loss": 4.4539, + "step": 27482 + }, + { + "epoch": 0.16344918641164716, + "grad_norm": 2.10629940032959, + "learning_rate": 4.6776106507934336e-05, + "loss": 4.2894, + "step": 27483 + }, + { + "epoch": 0.16345513369492815, + "grad_norm": 1.706100583076477, + "learning_rate": 4.677587706321333e-05, + "loss": 4.7572, + "step": 27484 + }, + { + "epoch": 0.16346108097820916, + "grad_norm": 1.518978238105774, + "learning_rate": 4.677564761089057e-05, + "loss": 5.8137, + "step": 27485 + }, + { + "epoch": 0.16346702826149015, + "grad_norm": 1.903784155845642, + "learning_rate": 4.677541815096617e-05, + "loss": 4.7093, + "step": 27486 + }, + { + "epoch": 0.16347297554477114, + "grad_norm": 1.9231067895889282, + "learning_rate": 4.677518868344019e-05, + "loss": 4.6492, + "step": 27487 + }, + { + "epoch": 0.16347892282805215, + "grad_norm": 1.5489968061447144, + "learning_rate": 4.6774959208312717e-05, + "loss": 5.1375, + "step": 27488 + }, + { + "epoch": 0.16348487011133314, + "grad_norm": 1.6851353645324707, + "learning_rate": 4.677472972558383e-05, + "loss": 5.3354, + "step": 27489 + }, + { + "epoch": 0.16349081739461413, + "grad_norm": 1.6556458473205566, + "learning_rate": 4.6774500235253614e-05, + "loss": 4.4959, + "step": 27490 + }, + { + "epoch": 0.16349676467789515, + "grad_norm": 1.8800296783447266, + "learning_rate": 4.6774270737322145e-05, + "loss": 4.0961, + "step": 27491 + }, + { + "epoch": 0.16350271196117613, + "grad_norm": 1.847226858139038, + "learning_rate": 4.67740412317895e-05, + "loss": 4.0567, + "step": 27492 + }, + { + "epoch": 0.16350865924445712, + "grad_norm": 1.8994855880737305, + "learning_rate": 4.6773811718655766e-05, + "loss": 4.8829, + "step": 27493 + }, + { + "epoch": 0.16351460652773814, + "grad_norm": 1.6551505327224731, + "learning_rate": 4.677358219792102e-05, + "loss": 5.0247, + "step": 27494 + }, + { + "epoch": 0.16352055381101913, + "grad_norm": 1.6510465145111084, + "learning_rate": 4.6773352669585336e-05, + "loss": 5.2324, + "step": 27495 + }, + { + "epoch": 0.16352650109430011, + "grad_norm": 1.851661205291748, + "learning_rate": 4.67731231336488e-05, + "loss": 4.1622, + "step": 27496 + }, + { + "epoch": 0.16353244837758113, + "grad_norm": 1.9479695558547974, + "learning_rate": 4.67728935901115e-05, + "loss": 3.9269, + "step": 27497 + }, + { + "epoch": 0.16353839566086212, + "grad_norm": 1.8207287788391113, + "learning_rate": 4.67726640389735e-05, + "loss": 3.8434, + "step": 27498 + }, + { + "epoch": 0.1635443429441431, + "grad_norm": 1.8698455095291138, + "learning_rate": 4.677243448023489e-05, + "loss": 3.9786, + "step": 27499 + }, + { + "epoch": 0.16355029022742412, + "grad_norm": 1.8257921934127808, + "learning_rate": 4.6772204913895746e-05, + "loss": 3.947, + "step": 27500 + }, + { + "epoch": 0.1635562375107051, + "grad_norm": 1.6152242422103882, + "learning_rate": 4.6771975339956155e-05, + "loss": 4.4898, + "step": 27501 + }, + { + "epoch": 0.1635621847939861, + "grad_norm": 1.956666350364685, + "learning_rate": 4.6771745758416185e-05, + "loss": 4.8584, + "step": 27502 + }, + { + "epoch": 0.16356813207726711, + "grad_norm": 1.8477699756622314, + "learning_rate": 4.677151616927593e-05, + "loss": 5.0331, + "step": 27503 + }, + { + "epoch": 0.1635740793605481, + "grad_norm": 1.705209732055664, + "learning_rate": 4.677128657253545e-05, + "loss": 4.193, + "step": 27504 + }, + { + "epoch": 0.1635800266438291, + "grad_norm": 1.8259029388427734, + "learning_rate": 4.677105696819486e-05, + "loss": 3.8187, + "step": 27505 + }, + { + "epoch": 0.1635859739271101, + "grad_norm": 1.633556604385376, + "learning_rate": 4.677082735625421e-05, + "loss": 3.8045, + "step": 27506 + }, + { + "epoch": 0.1635919212103911, + "grad_norm": 1.7349916696548462, + "learning_rate": 4.677059773671358e-05, + "loss": 4.1425, + "step": 27507 + }, + { + "epoch": 0.16359786849367208, + "grad_norm": 1.8932249546051025, + "learning_rate": 4.677036810957307e-05, + "loss": 4.838, + "step": 27508 + }, + { + "epoch": 0.1636038157769531, + "grad_norm": 1.6211893558502197, + "learning_rate": 4.677013847483275e-05, + "loss": 5.2038, + "step": 27509 + }, + { + "epoch": 0.1636097630602341, + "grad_norm": 1.7109664678573608, + "learning_rate": 4.6769908832492694e-05, + "loss": 4.8308, + "step": 27510 + }, + { + "epoch": 0.16361571034351508, + "grad_norm": 1.603644847869873, + "learning_rate": 4.6769679182553e-05, + "loss": 4.8959, + "step": 27511 + }, + { + "epoch": 0.1636216576267961, + "grad_norm": 1.6871256828308105, + "learning_rate": 4.676944952501372e-05, + "loss": 4.7762, + "step": 27512 + }, + { + "epoch": 0.16362760491007708, + "grad_norm": 1.5820897817611694, + "learning_rate": 4.676921985987496e-05, + "loss": 4.4533, + "step": 27513 + }, + { + "epoch": 0.16363355219335807, + "grad_norm": 1.6850042343139648, + "learning_rate": 4.676899018713678e-05, + "loss": 4.7149, + "step": 27514 + }, + { + "epoch": 0.16363949947663908, + "grad_norm": 1.6211190223693848, + "learning_rate": 4.676876050679928e-05, + "loss": 5.1372, + "step": 27515 + }, + { + "epoch": 0.16364544675992007, + "grad_norm": 1.7970921993255615, + "learning_rate": 4.676853081886252e-05, + "loss": 4.9738, + "step": 27516 + }, + { + "epoch": 0.16365139404320106, + "grad_norm": 1.9819167852401733, + "learning_rate": 4.67683011233266e-05, + "loss": 4.9069, + "step": 27517 + }, + { + "epoch": 0.16365734132648208, + "grad_norm": 1.9208866357803345, + "learning_rate": 4.6768071420191596e-05, + "loss": 4.6224, + "step": 27518 + }, + { + "epoch": 0.16366328860976306, + "grad_norm": 1.4924341440200806, + "learning_rate": 4.676784170945757e-05, + "loss": 4.4268, + "step": 27519 + }, + { + "epoch": 0.16366923589304405, + "grad_norm": 1.5947877168655396, + "learning_rate": 4.676761199112462e-05, + "loss": 4.231, + "step": 27520 + }, + { + "epoch": 0.16367518317632507, + "grad_norm": 1.4336072206497192, + "learning_rate": 4.676738226519283e-05, + "loss": 4.7233, + "step": 27521 + }, + { + "epoch": 0.16368113045960606, + "grad_norm": 1.496932864189148, + "learning_rate": 4.676715253166226e-05, + "loss": 4.2295, + "step": 27522 + }, + { + "epoch": 0.16368707774288704, + "grad_norm": 1.3215701580047607, + "learning_rate": 4.6766922790533005e-05, + "loss": 4.2627, + "step": 27523 + }, + { + "epoch": 0.16369302502616806, + "grad_norm": 1.524957299232483, + "learning_rate": 4.676669304180514e-05, + "loss": 4.5299, + "step": 27524 + }, + { + "epoch": 0.16369897230944905, + "grad_norm": 2.0174505710601807, + "learning_rate": 4.676646328547876e-05, + "loss": 4.8986, + "step": 27525 + }, + { + "epoch": 0.16370491959273004, + "grad_norm": 1.6895251274108887, + "learning_rate": 4.676623352155392e-05, + "loss": 4.6933, + "step": 27526 + }, + { + "epoch": 0.16371086687601105, + "grad_norm": 1.3915743827819824, + "learning_rate": 4.676600375003072e-05, + "loss": 4.3735, + "step": 27527 + }, + { + "epoch": 0.16371681415929204, + "grad_norm": 2.5097527503967285, + "learning_rate": 4.6765773970909224e-05, + "loss": 4.7227, + "step": 27528 + }, + { + "epoch": 0.16372276144257303, + "grad_norm": 1.4059836864471436, + "learning_rate": 4.676554418418953e-05, + "loss": 4.3861, + "step": 27529 + }, + { + "epoch": 0.16372870872585402, + "grad_norm": 1.5270711183547974, + "learning_rate": 4.6765314389871704e-05, + "loss": 4.4302, + "step": 27530 + }, + { + "epoch": 0.16373465600913503, + "grad_norm": 1.8292162418365479, + "learning_rate": 4.676508458795583e-05, + "loss": 4.697, + "step": 27531 + }, + { + "epoch": 0.16374060329241602, + "grad_norm": 1.8712737560272217, + "learning_rate": 4.6764854778442e-05, + "loss": 4.6228, + "step": 27532 + }, + { + "epoch": 0.163746550575697, + "grad_norm": 1.551424503326416, + "learning_rate": 4.6764624961330274e-05, + "loss": 5.1146, + "step": 27533 + }, + { + "epoch": 0.16375249785897802, + "grad_norm": 1.522362232208252, + "learning_rate": 4.6764395136620745e-05, + "loss": 4.8196, + "step": 27534 + }, + { + "epoch": 0.163758445142259, + "grad_norm": 2.196622371673584, + "learning_rate": 4.676416530431349e-05, + "loss": 4.6695, + "step": 27535 + }, + { + "epoch": 0.16376439242554, + "grad_norm": 1.7196024656295776, + "learning_rate": 4.676393546440859e-05, + "loss": 4.3153, + "step": 27536 + }, + { + "epoch": 0.16377033970882102, + "grad_norm": 1.841454267501831, + "learning_rate": 4.676370561690613e-05, + "loss": 3.9704, + "step": 27537 + }, + { + "epoch": 0.163776286992102, + "grad_norm": 1.8239476680755615, + "learning_rate": 4.6763475761806185e-05, + "loss": 3.9419, + "step": 27538 + }, + { + "epoch": 0.163782234275383, + "grad_norm": 1.8012974262237549, + "learning_rate": 4.6763245899108834e-05, + "loss": 3.9246, + "step": 27539 + }, + { + "epoch": 0.163788181558664, + "grad_norm": 1.7155267000198364, + "learning_rate": 4.676301602881415e-05, + "loss": 4.7766, + "step": 27540 + }, + { + "epoch": 0.163794128841945, + "grad_norm": 1.986662745475769, + "learning_rate": 4.676278615092223e-05, + "loss": 4.5932, + "step": 27541 + }, + { + "epoch": 0.16380007612522599, + "grad_norm": 1.7661755084991455, + "learning_rate": 4.676255626543314e-05, + "loss": 4.2295, + "step": 27542 + }, + { + "epoch": 0.163806023408507, + "grad_norm": 1.7953100204467773, + "learning_rate": 4.676232637234698e-05, + "loss": 3.7245, + "step": 27543 + }, + { + "epoch": 0.163811970691788, + "grad_norm": 1.8963271379470825, + "learning_rate": 4.6762096471663805e-05, + "loss": 3.7599, + "step": 27544 + }, + { + "epoch": 0.16381791797506898, + "grad_norm": 1.8365765810012817, + "learning_rate": 4.676186656338371e-05, + "loss": 3.8955, + "step": 27545 + }, + { + "epoch": 0.16382386525835, + "grad_norm": 1.7611230611801147, + "learning_rate": 4.676163664750677e-05, + "loss": 3.7164, + "step": 27546 + }, + { + "epoch": 0.16382981254163098, + "grad_norm": 1.6881484985351562, + "learning_rate": 4.676140672403307e-05, + "loss": 3.905, + "step": 27547 + }, + { + "epoch": 0.16383575982491197, + "grad_norm": 1.655831217765808, + "learning_rate": 4.676117679296269e-05, + "loss": 4.9185, + "step": 27548 + }, + { + "epoch": 0.16384170710819299, + "grad_norm": 1.602988839149475, + "learning_rate": 4.6760946854295707e-05, + "loss": 5.228, + "step": 27549 + }, + { + "epoch": 0.16384765439147397, + "grad_norm": 1.6523774862289429, + "learning_rate": 4.67607169080322e-05, + "loss": 5.2095, + "step": 27550 + }, + { + "epoch": 0.16385360167475496, + "grad_norm": 2.0141515731811523, + "learning_rate": 4.676048695417224e-05, + "loss": 5.2764, + "step": 27551 + }, + { + "epoch": 0.16385954895803598, + "grad_norm": 1.824358344078064, + "learning_rate": 4.676025699271594e-05, + "loss": 4.4083, + "step": 27552 + }, + { + "epoch": 0.16386549624131697, + "grad_norm": 1.90078604221344, + "learning_rate": 4.676002702366334e-05, + "loss": 4.3142, + "step": 27553 + }, + { + "epoch": 0.16387144352459795, + "grad_norm": 2.1593260765075684, + "learning_rate": 4.6759797047014554e-05, + "loss": 4.8884, + "step": 27554 + }, + { + "epoch": 0.16387739080787897, + "grad_norm": 1.6608953475952148, + "learning_rate": 4.675956706276965e-05, + "loss": 5.0272, + "step": 27555 + }, + { + "epoch": 0.16388333809115996, + "grad_norm": 1.6689786911010742, + "learning_rate": 4.67593370709287e-05, + "loss": 4.8278, + "step": 27556 + }, + { + "epoch": 0.16388928537444095, + "grad_norm": 1.5720055103302002, + "learning_rate": 4.675910707149178e-05, + "loss": 4.9288, + "step": 27557 + }, + { + "epoch": 0.16389523265772196, + "grad_norm": 1.6609811782836914, + "learning_rate": 4.675887706445899e-05, + "loss": 4.9233, + "step": 27558 + }, + { + "epoch": 0.16390117994100295, + "grad_norm": 1.7448883056640625, + "learning_rate": 4.6758647049830405e-05, + "loss": 4.8793, + "step": 27559 + }, + { + "epoch": 0.16390712722428394, + "grad_norm": 1.728389859199524, + "learning_rate": 4.6758417027606094e-05, + "loss": 5.2122, + "step": 27560 + }, + { + "epoch": 0.16391307450756495, + "grad_norm": 1.4038145542144775, + "learning_rate": 4.675818699778615e-05, + "loss": 5.1715, + "step": 27561 + }, + { + "epoch": 0.16391902179084594, + "grad_norm": 1.7425341606140137, + "learning_rate": 4.675795696037064e-05, + "loss": 5.3856, + "step": 27562 + }, + { + "epoch": 0.16392496907412693, + "grad_norm": 1.6463298797607422, + "learning_rate": 4.675772691535966e-05, + "loss": 4.8584, + "step": 27563 + }, + { + "epoch": 0.16393091635740795, + "grad_norm": 1.8424142599105835, + "learning_rate": 4.675749686275328e-05, + "loss": 4.7667, + "step": 27564 + }, + { + "epoch": 0.16393686364068893, + "grad_norm": 2.32179594039917, + "learning_rate": 4.675726680255158e-05, + "loss": 4.2014, + "step": 27565 + }, + { + "epoch": 0.16394281092396992, + "grad_norm": 2.380255699157715, + "learning_rate": 4.675703673475464e-05, + "loss": 4.5618, + "step": 27566 + }, + { + "epoch": 0.16394875820725094, + "grad_norm": 1.846535563468933, + "learning_rate": 4.675680665936255e-05, + "loss": 4.9291, + "step": 27567 + }, + { + "epoch": 0.16395470549053193, + "grad_norm": 1.9701546430587769, + "learning_rate": 4.675657657637538e-05, + "loss": 4.4594, + "step": 27568 + }, + { + "epoch": 0.16396065277381291, + "grad_norm": 2.15051007270813, + "learning_rate": 4.675634648579322e-05, + "loss": 4.0397, + "step": 27569 + }, + { + "epoch": 0.16396660005709393, + "grad_norm": 1.7181464433670044, + "learning_rate": 4.6756116387616136e-05, + "loss": 5.0483, + "step": 27570 + }, + { + "epoch": 0.16397254734037492, + "grad_norm": 1.3659751415252686, + "learning_rate": 4.675588628184422e-05, + "loss": 5.0627, + "step": 27571 + }, + { + "epoch": 0.1639784946236559, + "grad_norm": 1.7381535768508911, + "learning_rate": 4.6755656168477553e-05, + "loss": 4.8013, + "step": 27572 + }, + { + "epoch": 0.16398444190693692, + "grad_norm": 1.9152921438217163, + "learning_rate": 4.6755426047516205e-05, + "loss": 4.5437, + "step": 27573 + }, + { + "epoch": 0.1639903891902179, + "grad_norm": 1.449018955230713, + "learning_rate": 4.675519591896026e-05, + "loss": 5.046, + "step": 27574 + }, + { + "epoch": 0.1639963364734989, + "grad_norm": 2.2243831157684326, + "learning_rate": 4.675496578280981e-05, + "loss": 4.0585, + "step": 27575 + }, + { + "epoch": 0.16400228375677992, + "grad_norm": 1.9781684875488281, + "learning_rate": 4.675473563906492e-05, + "loss": 4.6334, + "step": 27576 + }, + { + "epoch": 0.1640082310400609, + "grad_norm": 1.9873735904693604, + "learning_rate": 4.675450548772568e-05, + "loss": 4.6854, + "step": 27577 + }, + { + "epoch": 0.1640141783233419, + "grad_norm": 1.914959192276001, + "learning_rate": 4.675427532879216e-05, + "loss": 4.7866, + "step": 27578 + }, + { + "epoch": 0.1640201256066229, + "grad_norm": 1.8510034084320068, + "learning_rate": 4.675404516226446e-05, + "loss": 4.4274, + "step": 27579 + }, + { + "epoch": 0.1640260728899039, + "grad_norm": 1.726172924041748, + "learning_rate": 4.6753814988142644e-05, + "loss": 4.4166, + "step": 27580 + }, + { + "epoch": 0.16403202017318488, + "grad_norm": 1.7206041812896729, + "learning_rate": 4.6753584806426786e-05, + "loss": 4.3724, + "step": 27581 + }, + { + "epoch": 0.1640379674564659, + "grad_norm": 1.9253183603286743, + "learning_rate": 4.6753354617116987e-05, + "loss": 3.8641, + "step": 27582 + }, + { + "epoch": 0.1640439147397469, + "grad_norm": 1.9023802280426025, + "learning_rate": 4.6753124420213306e-05, + "loss": 4.231, + "step": 27583 + }, + { + "epoch": 0.16404986202302788, + "grad_norm": 2.092531442642212, + "learning_rate": 4.675289421571584e-05, + "loss": 4.4025, + "step": 27584 + }, + { + "epoch": 0.1640558093063089, + "grad_norm": 2.0559768676757812, + "learning_rate": 4.675266400362466e-05, + "loss": 4.4643, + "step": 27585 + }, + { + "epoch": 0.16406175658958988, + "grad_norm": 2.1016385555267334, + "learning_rate": 4.6752433783939855e-05, + "loss": 4.4391, + "step": 27586 + }, + { + "epoch": 0.16406770387287087, + "grad_norm": 2.07698130607605, + "learning_rate": 4.67522035566615e-05, + "loss": 4.483, + "step": 27587 + }, + { + "epoch": 0.16407365115615186, + "grad_norm": 2.172579288482666, + "learning_rate": 4.6751973321789675e-05, + "loss": 4.2118, + "step": 27588 + }, + { + "epoch": 0.16407959843943287, + "grad_norm": 2.1808786392211914, + "learning_rate": 4.675174307932446e-05, + "loss": 4.4722, + "step": 27589 + }, + { + "epoch": 0.16408554572271386, + "grad_norm": 2.163482427597046, + "learning_rate": 4.675151282926593e-05, + "loss": 4.747, + "step": 27590 + }, + { + "epoch": 0.16409149300599485, + "grad_norm": 2.431328773498535, + "learning_rate": 4.675128257161418e-05, + "loss": 4.0239, + "step": 27591 + }, + { + "epoch": 0.16409744028927586, + "grad_norm": 2.2003822326660156, + "learning_rate": 4.675105230636928e-05, + "loss": 4.2945, + "step": 27592 + }, + { + "epoch": 0.16410338757255685, + "grad_norm": 1.8259824514389038, + "learning_rate": 4.675082203353131e-05, + "loss": 4.3246, + "step": 27593 + }, + { + "epoch": 0.16410933485583784, + "grad_norm": 2.02915358543396, + "learning_rate": 4.6750591753100356e-05, + "loss": 5.5056, + "step": 27594 + }, + { + "epoch": 0.16411528213911886, + "grad_norm": 2.2010276317596436, + "learning_rate": 4.675036146507649e-05, + "loss": 5.3688, + "step": 27595 + }, + { + "epoch": 0.16412122942239984, + "grad_norm": 1.8411953449249268, + "learning_rate": 4.6750131169459806e-05, + "loss": 5.6156, + "step": 27596 + }, + { + "epoch": 0.16412717670568083, + "grad_norm": 1.8446851968765259, + "learning_rate": 4.674990086625037e-05, + "loss": 5.1344, + "step": 27597 + }, + { + "epoch": 0.16413312398896185, + "grad_norm": 1.7121134996414185, + "learning_rate": 4.674967055544827e-05, + "loss": 5.164, + "step": 27598 + }, + { + "epoch": 0.16413907127224284, + "grad_norm": 1.68525230884552, + "learning_rate": 4.6749440237053574e-05, + "loss": 4.9757, + "step": 27599 + }, + { + "epoch": 0.16414501855552383, + "grad_norm": 1.9436984062194824, + "learning_rate": 4.6749209911066396e-05, + "loss": 4.4168, + "step": 27600 + }, + { + "epoch": 0.16415096583880484, + "grad_norm": 1.9261338710784912, + "learning_rate": 4.6748979577486774e-05, + "loss": 4.5949, + "step": 27601 + }, + { + "epoch": 0.16415691312208583, + "grad_norm": 1.4877192974090576, + "learning_rate": 4.6748749236314816e-05, + "loss": 5.0274, + "step": 27602 + }, + { + "epoch": 0.16416286040536682, + "grad_norm": 2.030029296875, + "learning_rate": 4.674851888755059e-05, + "loss": 5.3301, + "step": 27603 + }, + { + "epoch": 0.16416880768864783, + "grad_norm": 1.4313018321990967, + "learning_rate": 4.674828853119418e-05, + "loss": 4.9408, + "step": 27604 + }, + { + "epoch": 0.16417475497192882, + "grad_norm": 1.4011638164520264, + "learning_rate": 4.674805816724568e-05, + "loss": 5.2628, + "step": 27605 + }, + { + "epoch": 0.1641807022552098, + "grad_norm": 1.6607071161270142, + "learning_rate": 4.674782779570514e-05, + "loss": 5.0739, + "step": 27606 + }, + { + "epoch": 0.16418664953849083, + "grad_norm": 2.07830810546875, + "learning_rate": 4.674759741657267e-05, + "loss": 4.7624, + "step": 27607 + }, + { + "epoch": 0.1641925968217718, + "grad_norm": 2.402186870574951, + "learning_rate": 4.674736702984833e-05, + "loss": 4.2407, + "step": 27608 + }, + { + "epoch": 0.1641985441050528, + "grad_norm": 2.498345136642456, + "learning_rate": 4.674713663553222e-05, + "loss": 4.2357, + "step": 27609 + }, + { + "epoch": 0.16420449138833382, + "grad_norm": 2.4307384490966797, + "learning_rate": 4.67469062336244e-05, + "loss": 4.4379, + "step": 27610 + }, + { + "epoch": 0.1642104386716148, + "grad_norm": 1.721940279006958, + "learning_rate": 4.6746675824124964e-05, + "loss": 4.4393, + "step": 27611 + }, + { + "epoch": 0.1642163859548958, + "grad_norm": 1.9504097700119019, + "learning_rate": 4.674644540703399e-05, + "loss": 4.753, + "step": 27612 + }, + { + "epoch": 0.1642223332381768, + "grad_norm": 2.2953338623046875, + "learning_rate": 4.674621498235155e-05, + "loss": 4.7865, + "step": 27613 + }, + { + "epoch": 0.1642282805214578, + "grad_norm": 2.291163921356201, + "learning_rate": 4.674598455007773e-05, + "loss": 4.7659, + "step": 27614 + }, + { + "epoch": 0.16423422780473879, + "grad_norm": 2.1821818351745605, + "learning_rate": 4.674575411021262e-05, + "loss": 4.0771, + "step": 27615 + }, + { + "epoch": 0.1642401750880198, + "grad_norm": 2.2602016925811768, + "learning_rate": 4.6745523662756286e-05, + "loss": 4.2426, + "step": 27616 + }, + { + "epoch": 0.1642461223713008, + "grad_norm": 1.443772792816162, + "learning_rate": 4.674529320770882e-05, + "loss": 5.2936, + "step": 27617 + }, + { + "epoch": 0.16425206965458178, + "grad_norm": 2.0360827445983887, + "learning_rate": 4.674506274507029e-05, + "loss": 5.3444, + "step": 27618 + }, + { + "epoch": 0.1642580169378628, + "grad_norm": 1.7705327272415161, + "learning_rate": 4.6744832274840786e-05, + "loss": 5.0619, + "step": 27619 + }, + { + "epoch": 0.16426396422114378, + "grad_norm": 2.3405168056488037, + "learning_rate": 4.6744601797020384e-05, + "loss": 4.0113, + "step": 27620 + }, + { + "epoch": 0.16426991150442477, + "grad_norm": 1.6145120859146118, + "learning_rate": 4.674437131160917e-05, + "loss": 4.87, + "step": 27621 + }, + { + "epoch": 0.1642758587877058, + "grad_norm": 1.7102009057998657, + "learning_rate": 4.674414081860722e-05, + "loss": 5.2878, + "step": 27622 + }, + { + "epoch": 0.16428180607098677, + "grad_norm": 1.5974667072296143, + "learning_rate": 4.674391031801461e-05, + "loss": 5.1225, + "step": 27623 + }, + { + "epoch": 0.16428775335426776, + "grad_norm": 1.7934401035308838, + "learning_rate": 4.674367980983143e-05, + "loss": 5.1496, + "step": 27624 + }, + { + "epoch": 0.16429370063754878, + "grad_norm": 1.625554084777832, + "learning_rate": 4.674344929405775e-05, + "loss": 4.9198, + "step": 27625 + }, + { + "epoch": 0.16429964792082977, + "grad_norm": 1.5650711059570312, + "learning_rate": 4.674321877069366e-05, + "loss": 5.6505, + "step": 27626 + }, + { + "epoch": 0.16430559520411075, + "grad_norm": 1.8613455295562744, + "learning_rate": 4.674298823973924e-05, + "loss": 5.6026, + "step": 27627 + }, + { + "epoch": 0.16431154248739177, + "grad_norm": 1.617720603942871, + "learning_rate": 4.674275770119457e-05, + "loss": 5.4009, + "step": 27628 + }, + { + "epoch": 0.16431748977067276, + "grad_norm": 1.937449336051941, + "learning_rate": 4.6742527155059724e-05, + "loss": 4.8275, + "step": 27629 + }, + { + "epoch": 0.16432343705395375, + "grad_norm": 2.541095733642578, + "learning_rate": 4.674229660133479e-05, + "loss": 4.0442, + "step": 27630 + }, + { + "epoch": 0.16432938433723476, + "grad_norm": 2.760444402694702, + "learning_rate": 4.674206604001984e-05, + "loss": 3.19, + "step": 27631 + }, + { + "epoch": 0.16433533162051575, + "grad_norm": 2.561680316925049, + "learning_rate": 4.674183547111496e-05, + "loss": 3.9053, + "step": 27632 + }, + { + "epoch": 0.16434127890379674, + "grad_norm": 2.6636784076690674, + "learning_rate": 4.6741604894620225e-05, + "loss": 4.1, + "step": 27633 + }, + { + "epoch": 0.16434722618707776, + "grad_norm": 2.010796070098877, + "learning_rate": 4.674137431053573e-05, + "loss": 4.5599, + "step": 27634 + }, + { + "epoch": 0.16435317347035874, + "grad_norm": 2.131115198135376, + "learning_rate": 4.674114371886154e-05, + "loss": 4.2314, + "step": 27635 + }, + { + "epoch": 0.16435912075363973, + "grad_norm": 2.2468631267547607, + "learning_rate": 4.674091311959774e-05, + "loss": 4.3132, + "step": 27636 + }, + { + "epoch": 0.16436506803692075, + "grad_norm": 2.325503349304199, + "learning_rate": 4.674068251274442e-05, + "loss": 3.7301, + "step": 27637 + }, + { + "epoch": 0.16437101532020174, + "grad_norm": 2.631612777709961, + "learning_rate": 4.6740451898301646e-05, + "loss": 3.6578, + "step": 27638 + }, + { + "epoch": 0.16437696260348272, + "grad_norm": 2.2272074222564697, + "learning_rate": 4.67402212762695e-05, + "loss": 4.1707, + "step": 27639 + }, + { + "epoch": 0.16438290988676374, + "grad_norm": 1.6620466709136963, + "learning_rate": 4.673999064664808e-05, + "loss": 5.1998, + "step": 27640 + }, + { + "epoch": 0.16438885717004473, + "grad_norm": 2.39687442779541, + "learning_rate": 4.673976000943745e-05, + "loss": 4.99, + "step": 27641 + }, + { + "epoch": 0.16439480445332572, + "grad_norm": 2.3301122188568115, + "learning_rate": 4.673952936463769e-05, + "loss": 4.7562, + "step": 27642 + }, + { + "epoch": 0.16440075173660673, + "grad_norm": 2.335031509399414, + "learning_rate": 4.6739298712248887e-05, + "loss": 4.6406, + "step": 27643 + }, + { + "epoch": 0.16440669901988772, + "grad_norm": 2.3373608589172363, + "learning_rate": 4.6739068052271115e-05, + "loss": 4.555, + "step": 27644 + }, + { + "epoch": 0.1644126463031687, + "grad_norm": 1.887984037399292, + "learning_rate": 4.6738837384704463e-05, + "loss": 5.0687, + "step": 27645 + }, + { + "epoch": 0.1644185935864497, + "grad_norm": 2.8348052501678467, + "learning_rate": 4.673860670954901e-05, + "loss": 5.0324, + "step": 27646 + }, + { + "epoch": 0.1644245408697307, + "grad_norm": 2.3812403678894043, + "learning_rate": 4.673837602680483e-05, + "loss": 5.1471, + "step": 27647 + }, + { + "epoch": 0.1644304881530117, + "grad_norm": 2.797342300415039, + "learning_rate": 4.673814533647201e-05, + "loss": 4.9506, + "step": 27648 + }, + { + "epoch": 0.1644364354362927, + "grad_norm": 2.2026922702789307, + "learning_rate": 4.673791463855063e-05, + "loss": 4.8893, + "step": 27649 + }, + { + "epoch": 0.1644423827195737, + "grad_norm": 1.6675883531570435, + "learning_rate": 4.6737683933040766e-05, + "loss": 5.247, + "step": 27650 + }, + { + "epoch": 0.1644483300028547, + "grad_norm": 1.771507978439331, + "learning_rate": 4.6737453219942495e-05, + "loss": 5.0371, + "step": 27651 + }, + { + "epoch": 0.16445427728613568, + "grad_norm": 1.753718614578247, + "learning_rate": 4.6737222499255914e-05, + "loss": 4.9673, + "step": 27652 + }, + { + "epoch": 0.1644602245694167, + "grad_norm": 2.460538387298584, + "learning_rate": 4.673699177098109e-05, + "loss": 5.0578, + "step": 27653 + }, + { + "epoch": 0.16446617185269768, + "grad_norm": 2.2908952236175537, + "learning_rate": 4.6736761035118104e-05, + "loss": 4.9473, + "step": 27654 + }, + { + "epoch": 0.16447211913597867, + "grad_norm": 2.1169328689575195, + "learning_rate": 4.673653029166704e-05, + "loss": 4.8466, + "step": 27655 + }, + { + "epoch": 0.1644780664192597, + "grad_norm": 1.8647359609603882, + "learning_rate": 4.673629954062797e-05, + "loss": 4.9256, + "step": 27656 + }, + { + "epoch": 0.16448401370254068, + "grad_norm": 2.2176151275634766, + "learning_rate": 4.6736068782001e-05, + "loss": 5.1344, + "step": 27657 + }, + { + "epoch": 0.16448996098582166, + "grad_norm": 2.300567626953125, + "learning_rate": 4.6735838015786185e-05, + "loss": 4.9018, + "step": 27658 + }, + { + "epoch": 0.16449590826910268, + "grad_norm": 2.458017110824585, + "learning_rate": 4.673560724198361e-05, + "loss": 5.2333, + "step": 27659 + }, + { + "epoch": 0.16450185555238367, + "grad_norm": 2.418851852416992, + "learning_rate": 4.673537646059336e-05, + "loss": 5.0428, + "step": 27660 + }, + { + "epoch": 0.16450780283566466, + "grad_norm": 2.163425922393799, + "learning_rate": 4.673514567161551e-05, + "loss": 5.2115, + "step": 27661 + }, + { + "epoch": 0.16451375011894567, + "grad_norm": 2.171957492828369, + "learning_rate": 4.673491487505015e-05, + "loss": 5.1336, + "step": 27662 + }, + { + "epoch": 0.16451969740222666, + "grad_norm": 1.6024816036224365, + "learning_rate": 4.6734684070897364e-05, + "loss": 5.2832, + "step": 27663 + }, + { + "epoch": 0.16452564468550765, + "grad_norm": 2.581366777420044, + "learning_rate": 4.673445325915722e-05, + "loss": 4.2245, + "step": 27664 + }, + { + "epoch": 0.16453159196878867, + "grad_norm": 2.65466570854187, + "learning_rate": 4.67342224398298e-05, + "loss": 3.8786, + "step": 27665 + }, + { + "epoch": 0.16453753925206965, + "grad_norm": 1.909327745437622, + "learning_rate": 4.673399161291519e-05, + "loss": 5.2398, + "step": 27666 + }, + { + "epoch": 0.16454348653535064, + "grad_norm": 2.0884993076324463, + "learning_rate": 4.673376077841346e-05, + "loss": 4.8081, + "step": 27667 + }, + { + "epoch": 0.16454943381863166, + "grad_norm": 2.1802215576171875, + "learning_rate": 4.67335299363247e-05, + "loss": 4.9251, + "step": 27668 + }, + { + "epoch": 0.16455538110191265, + "grad_norm": 2.281020402908325, + "learning_rate": 4.6733299086648996e-05, + "loss": 4.2682, + "step": 27669 + }, + { + "epoch": 0.16456132838519363, + "grad_norm": 2.34698224067688, + "learning_rate": 4.673306822938642e-05, + "loss": 3.8815, + "step": 27670 + }, + { + "epoch": 0.16456727566847465, + "grad_norm": 2.84965181350708, + "learning_rate": 4.673283736453705e-05, + "loss": 3.8124, + "step": 27671 + }, + { + "epoch": 0.16457322295175564, + "grad_norm": 2.604818344116211, + "learning_rate": 4.673260649210098e-05, + "loss": 3.8991, + "step": 27672 + }, + { + "epoch": 0.16457917023503663, + "grad_norm": 2.5472776889801025, + "learning_rate": 4.673237561207827e-05, + "loss": 3.8003, + "step": 27673 + }, + { + "epoch": 0.16458511751831764, + "grad_norm": 1.9040625095367432, + "learning_rate": 4.673214472446902e-05, + "loss": 4.1075, + "step": 27674 + }, + { + "epoch": 0.16459106480159863, + "grad_norm": 1.5493569374084473, + "learning_rate": 4.6731913829273303e-05, + "loss": 5.5934, + "step": 27675 + }, + { + "epoch": 0.16459701208487962, + "grad_norm": 1.683307409286499, + "learning_rate": 4.67316829264912e-05, + "loss": 5.3139, + "step": 27676 + }, + { + "epoch": 0.16460295936816063, + "grad_norm": 1.5558831691741943, + "learning_rate": 4.673145201612279e-05, + "loss": 5.331, + "step": 27677 + }, + { + "epoch": 0.16460890665144162, + "grad_norm": 1.7119014263153076, + "learning_rate": 4.673122109816815e-05, + "loss": 5.4438, + "step": 27678 + }, + { + "epoch": 0.1646148539347226, + "grad_norm": 2.4084794521331787, + "learning_rate": 4.673099017262737e-05, + "loss": 4.2357, + "step": 27679 + }, + { + "epoch": 0.16462080121800363, + "grad_norm": 1.8065168857574463, + "learning_rate": 4.673075923950053e-05, + "loss": 4.4894, + "step": 27680 + }, + { + "epoch": 0.16462674850128461, + "grad_norm": 1.5240797996520996, + "learning_rate": 4.673052829878769e-05, + "loss": 4.6992, + "step": 27681 + }, + { + "epoch": 0.1646326957845656, + "grad_norm": 1.9197040796279907, + "learning_rate": 4.673029735048896e-05, + "loss": 5.0591, + "step": 27682 + }, + { + "epoch": 0.16463864306784662, + "grad_norm": 1.5522626638412476, + "learning_rate": 4.673006639460441e-05, + "loss": 5.2923, + "step": 27683 + }, + { + "epoch": 0.1646445903511276, + "grad_norm": 1.663277506828308, + "learning_rate": 4.6729835431134115e-05, + "loss": 5.0555, + "step": 27684 + }, + { + "epoch": 0.1646505376344086, + "grad_norm": 1.5276461839675903, + "learning_rate": 4.672960446007816e-05, + "loss": 5.1765, + "step": 27685 + }, + { + "epoch": 0.1646564849176896, + "grad_norm": 1.5308914184570312, + "learning_rate": 4.672937348143662e-05, + "loss": 4.371, + "step": 27686 + }, + { + "epoch": 0.1646624322009706, + "grad_norm": 1.6172471046447754, + "learning_rate": 4.672914249520958e-05, + "loss": 4.8801, + "step": 27687 + }, + { + "epoch": 0.1646683794842516, + "grad_norm": 1.523914098739624, + "learning_rate": 4.6728911501397124e-05, + "loss": 4.3999, + "step": 27688 + }, + { + "epoch": 0.1646743267675326, + "grad_norm": 1.2214871644973755, + "learning_rate": 4.672868049999933e-05, + "loss": 3.7981, + "step": 27689 + }, + { + "epoch": 0.1646802740508136, + "grad_norm": 1.340168833732605, + "learning_rate": 4.672844949101628e-05, + "loss": 4.5471, + "step": 27690 + }, + { + "epoch": 0.16468622133409458, + "grad_norm": 1.8667452335357666, + "learning_rate": 4.672821847444805e-05, + "loss": 4.3881, + "step": 27691 + }, + { + "epoch": 0.1646921686173756, + "grad_norm": 3.047363042831421, + "learning_rate": 4.672798745029472e-05, + "loss": 3.7606, + "step": 27692 + }, + { + "epoch": 0.16469811590065658, + "grad_norm": 1.8616588115692139, + "learning_rate": 4.672775641855638e-05, + "loss": 5.0264, + "step": 27693 + }, + { + "epoch": 0.16470406318393757, + "grad_norm": 1.9045435190200806, + "learning_rate": 4.67275253792331e-05, + "loss": 4.5934, + "step": 27694 + }, + { + "epoch": 0.1647100104672186, + "grad_norm": 1.9803951978683472, + "learning_rate": 4.672729433232497e-05, + "loss": 4.0846, + "step": 27695 + }, + { + "epoch": 0.16471595775049958, + "grad_norm": 1.797312617301941, + "learning_rate": 4.672706327783206e-05, + "loss": 4.5876, + "step": 27696 + }, + { + "epoch": 0.16472190503378056, + "grad_norm": 1.954188585281372, + "learning_rate": 4.672683221575446e-05, + "loss": 4.3985, + "step": 27697 + }, + { + "epoch": 0.16472785231706158, + "grad_norm": 2.246690273284912, + "learning_rate": 4.6726601146092255e-05, + "loss": 4.1058, + "step": 27698 + }, + { + "epoch": 0.16473379960034257, + "grad_norm": 2.160576343536377, + "learning_rate": 4.67263700688455e-05, + "loss": 4.0139, + "step": 27699 + }, + { + "epoch": 0.16473974688362356, + "grad_norm": 2.5650711059570312, + "learning_rate": 4.672613898401431e-05, + "loss": 3.5785, + "step": 27700 + }, + { + "epoch": 0.16474569416690457, + "grad_norm": 2.6694283485412598, + "learning_rate": 4.6725907891598744e-05, + "loss": 3.4553, + "step": 27701 + }, + { + "epoch": 0.16475164145018556, + "grad_norm": 1.8965697288513184, + "learning_rate": 4.672567679159888e-05, + "loss": 3.8723, + "step": 27702 + }, + { + "epoch": 0.16475758873346655, + "grad_norm": 2.0568554401397705, + "learning_rate": 4.6725445684014824e-05, + "loss": 4.6828, + "step": 27703 + }, + { + "epoch": 0.16476353601674754, + "grad_norm": 1.7810505628585815, + "learning_rate": 4.672521456884663e-05, + "loss": 5.5463, + "step": 27704 + }, + { + "epoch": 0.16476948330002855, + "grad_norm": 1.8636524677276611, + "learning_rate": 4.6724983446094385e-05, + "loss": 4.9334, + "step": 27705 + }, + { + "epoch": 0.16477543058330954, + "grad_norm": 2.172565460205078, + "learning_rate": 4.6724752315758174e-05, + "loss": 5.4723, + "step": 27706 + }, + { + "epoch": 0.16478137786659053, + "grad_norm": 2.461881637573242, + "learning_rate": 4.672452117783808e-05, + "loss": 4.6138, + "step": 27707 + }, + { + "epoch": 0.16478732514987154, + "grad_norm": 2.3633780479431152, + "learning_rate": 4.672429003233418e-05, + "loss": 4.2636, + "step": 27708 + }, + { + "epoch": 0.16479327243315253, + "grad_norm": 2.4033286571502686, + "learning_rate": 4.6724058879246546e-05, + "loss": 3.716, + "step": 27709 + }, + { + "epoch": 0.16479921971643352, + "grad_norm": 2.201249599456787, + "learning_rate": 4.672382771857527e-05, + "loss": 4.9046, + "step": 27710 + }, + { + "epoch": 0.16480516699971454, + "grad_norm": 2.0308284759521484, + "learning_rate": 4.672359655032044e-05, + "loss": 4.255, + "step": 27711 + }, + { + "epoch": 0.16481111428299552, + "grad_norm": 2.46120023727417, + "learning_rate": 4.672336537448212e-05, + "loss": 3.7853, + "step": 27712 + }, + { + "epoch": 0.1648170615662765, + "grad_norm": 2.130208969116211, + "learning_rate": 4.6723134191060404e-05, + "loss": 3.8114, + "step": 27713 + }, + { + "epoch": 0.16482300884955753, + "grad_norm": 2.138585329055786, + "learning_rate": 4.672290300005536e-05, + "loss": 4.6266, + "step": 27714 + }, + { + "epoch": 0.16482895613283852, + "grad_norm": 1.8015727996826172, + "learning_rate": 4.6722671801467074e-05, + "loss": 4.2178, + "step": 27715 + }, + { + "epoch": 0.1648349034161195, + "grad_norm": 2.3047871589660645, + "learning_rate": 4.672244059529564e-05, + "loss": 4.258, + "step": 27716 + }, + { + "epoch": 0.16484085069940052, + "grad_norm": 2.022953987121582, + "learning_rate": 4.672220938154111e-05, + "loss": 3.7605, + "step": 27717 + }, + { + "epoch": 0.1648467979826815, + "grad_norm": 2.3721368312835693, + "learning_rate": 4.672197816020358e-05, + "loss": 3.6132, + "step": 27718 + }, + { + "epoch": 0.1648527452659625, + "grad_norm": 1.9578886032104492, + "learning_rate": 4.672174693128314e-05, + "loss": 3.9983, + "step": 27719 + }, + { + "epoch": 0.1648586925492435, + "grad_norm": 2.0287981033325195, + "learning_rate": 4.672151569477987e-05, + "loss": 3.8297, + "step": 27720 + }, + { + "epoch": 0.1648646398325245, + "grad_norm": 2.1453230381011963, + "learning_rate": 4.672128445069383e-05, + "loss": 3.5676, + "step": 27721 + }, + { + "epoch": 0.1648705871158055, + "grad_norm": 2.209982395172119, + "learning_rate": 4.672105319902512e-05, + "loss": 3.6304, + "step": 27722 + }, + { + "epoch": 0.1648765343990865, + "grad_norm": 2.1707348823547363, + "learning_rate": 4.672082193977382e-05, + "loss": 3.679, + "step": 27723 + }, + { + "epoch": 0.1648824816823675, + "grad_norm": 1.9688754081726074, + "learning_rate": 4.672059067293999e-05, + "loss": 4.235, + "step": 27724 + }, + { + "epoch": 0.16488842896564848, + "grad_norm": 1.988599419593811, + "learning_rate": 4.672035939852374e-05, + "loss": 3.8704, + "step": 27725 + }, + { + "epoch": 0.1648943762489295, + "grad_norm": 1.7759329080581665, + "learning_rate": 4.672012811652513e-05, + "loss": 4.5621, + "step": 27726 + }, + { + "epoch": 0.16490032353221049, + "grad_norm": 1.9790258407592773, + "learning_rate": 4.6719896826944255e-05, + "loss": 4.2214, + "step": 27727 + }, + { + "epoch": 0.16490627081549147, + "grad_norm": 1.6736228466033936, + "learning_rate": 4.671966552978118e-05, + "loss": 4.555, + "step": 27728 + }, + { + "epoch": 0.1649122180987725, + "grad_norm": 2.4587225914001465, + "learning_rate": 4.6719434225036e-05, + "loss": 4.4134, + "step": 27729 + }, + { + "epoch": 0.16491816538205348, + "grad_norm": 1.5891488790512085, + "learning_rate": 4.671920291270879e-05, + "loss": 5.4629, + "step": 27730 + }, + { + "epoch": 0.16492411266533447, + "grad_norm": 1.5606093406677246, + "learning_rate": 4.671897159279962e-05, + "loss": 4.5045, + "step": 27731 + }, + { + "epoch": 0.16493005994861548, + "grad_norm": 2.5481436252593994, + "learning_rate": 4.6718740265308595e-05, + "loss": 3.2812, + "step": 27732 + }, + { + "epoch": 0.16493600723189647, + "grad_norm": 2.602802276611328, + "learning_rate": 4.671850893023577e-05, + "loss": 3.082, + "step": 27733 + }, + { + "epoch": 0.16494195451517746, + "grad_norm": 2.3786399364471436, + "learning_rate": 4.6718277587581246e-05, + "loss": 3.5956, + "step": 27734 + }, + { + "epoch": 0.16494790179845847, + "grad_norm": 1.5555487871170044, + "learning_rate": 4.67180462373451e-05, + "loss": 5.2082, + "step": 27735 + }, + { + "epoch": 0.16495384908173946, + "grad_norm": 1.6801286935806274, + "learning_rate": 4.67178148795274e-05, + "loss": 5.3879, + "step": 27736 + }, + { + "epoch": 0.16495979636502045, + "grad_norm": 1.3999351263046265, + "learning_rate": 4.671758351412824e-05, + "loss": 4.9347, + "step": 27737 + }, + { + "epoch": 0.16496574364830147, + "grad_norm": 2.48246693611145, + "learning_rate": 4.6717352141147696e-05, + "loss": 3.5764, + "step": 27738 + }, + { + "epoch": 0.16497169093158245, + "grad_norm": 2.5625791549682617, + "learning_rate": 4.6717120760585844e-05, + "loss": 2.841, + "step": 27739 + }, + { + "epoch": 0.16497763821486344, + "grad_norm": 2.025188684463501, + "learning_rate": 4.6716889372442775e-05, + "loss": 3.9643, + "step": 27740 + }, + { + "epoch": 0.16498358549814446, + "grad_norm": 2.5314674377441406, + "learning_rate": 4.671665797671856e-05, + "loss": 1.9703, + "step": 27741 + }, + { + "epoch": 0.16498953278142545, + "grad_norm": 2.7406599521636963, + "learning_rate": 4.671642657341329e-05, + "loss": 3.0071, + "step": 27742 + }, + { + "epoch": 0.16499548006470643, + "grad_norm": 2.0033769607543945, + "learning_rate": 4.671619516252703e-05, + "loss": 4.5621, + "step": 27743 + }, + { + "epoch": 0.16500142734798745, + "grad_norm": 1.587997555732727, + "learning_rate": 4.6715963744059874e-05, + "loss": 4.9265, + "step": 27744 + }, + { + "epoch": 0.16500737463126844, + "grad_norm": 1.6401697397232056, + "learning_rate": 4.6715732318011905e-05, + "loss": 4.6801, + "step": 27745 + }, + { + "epoch": 0.16501332191454943, + "grad_norm": 2.994272232055664, + "learning_rate": 4.671550088438319e-05, + "loss": 2.0322, + "step": 27746 + }, + { + "epoch": 0.16501926919783044, + "grad_norm": 3.038865089416504, + "learning_rate": 4.671526944317382e-05, + "loss": 2.0138, + "step": 27747 + }, + { + "epoch": 0.16502521648111143, + "grad_norm": 2.9179296493530273, + "learning_rate": 4.671503799438388e-05, + "loss": 3.2955, + "step": 27748 + }, + { + "epoch": 0.16503116376439242, + "grad_norm": 1.7475281953811646, + "learning_rate": 4.6714806538013446e-05, + "loss": 5.4316, + "step": 27749 + }, + { + "epoch": 0.16503711104767343, + "grad_norm": 1.4781032800674438, + "learning_rate": 4.6714575074062596e-05, + "loss": 5.4519, + "step": 27750 + }, + { + "epoch": 0.16504305833095442, + "grad_norm": 1.3684823513031006, + "learning_rate": 4.6714343602531404e-05, + "loss": 5.3335, + "step": 27751 + }, + { + "epoch": 0.1650490056142354, + "grad_norm": 1.6875170469284058, + "learning_rate": 4.6714112123419965e-05, + "loss": 5.0396, + "step": 27752 + }, + { + "epoch": 0.16505495289751643, + "grad_norm": 1.6213173866271973, + "learning_rate": 4.6713880636728346e-05, + "loss": 4.763, + "step": 27753 + }, + { + "epoch": 0.16506090018079742, + "grad_norm": 1.5345633029937744, + "learning_rate": 4.6713649142456644e-05, + "loss": 4.9192, + "step": 27754 + }, + { + "epoch": 0.1650668474640784, + "grad_norm": 1.9773199558258057, + "learning_rate": 4.671341764060493e-05, + "loss": 4.7158, + "step": 27755 + }, + { + "epoch": 0.16507279474735942, + "grad_norm": 1.786027193069458, + "learning_rate": 4.6713186131173284e-05, + "loss": 5.6319, + "step": 27756 + }, + { + "epoch": 0.1650787420306404, + "grad_norm": 1.5743745565414429, + "learning_rate": 4.6712954614161797e-05, + "loss": 5.5466, + "step": 27757 + }, + { + "epoch": 0.1650846893139214, + "grad_norm": 1.6003429889678955, + "learning_rate": 4.671272308957053e-05, + "loss": 5.5166, + "step": 27758 + }, + { + "epoch": 0.1650906365972024, + "grad_norm": 1.567775011062622, + "learning_rate": 4.6712491557399585e-05, + "loss": 5.1731, + "step": 27759 + }, + { + "epoch": 0.1650965838804834, + "grad_norm": 1.7042558193206787, + "learning_rate": 4.671226001764903e-05, + "loss": 4.7854, + "step": 27760 + }, + { + "epoch": 0.1651025311637644, + "grad_norm": 2.414813280105591, + "learning_rate": 4.6712028470318946e-05, + "loss": 3.969, + "step": 27761 + }, + { + "epoch": 0.16510847844704538, + "grad_norm": 2.2361044883728027, + "learning_rate": 4.671179691540942e-05, + "loss": 4.0416, + "step": 27762 + }, + { + "epoch": 0.1651144257303264, + "grad_norm": 1.4998681545257568, + "learning_rate": 4.6711565352920526e-05, + "loss": 4.0769, + "step": 27763 + }, + { + "epoch": 0.16512037301360738, + "grad_norm": 1.8944214582443237, + "learning_rate": 4.6711333782852364e-05, + "loss": 3.9101, + "step": 27764 + }, + { + "epoch": 0.16512632029688837, + "grad_norm": 2.432981252670288, + "learning_rate": 4.671110220520498e-05, + "loss": 3.7838, + "step": 27765 + }, + { + "epoch": 0.16513226758016938, + "grad_norm": 2.3724024295806885, + "learning_rate": 4.6710870619978486e-05, + "loss": 4.0045, + "step": 27766 + }, + { + "epoch": 0.16513821486345037, + "grad_norm": 2.136061429977417, + "learning_rate": 4.671063902717295e-05, + "loss": 4.3335, + "step": 27767 + }, + { + "epoch": 0.16514416214673136, + "grad_norm": 2.263643264770508, + "learning_rate": 4.671040742678845e-05, + "loss": 4.417, + "step": 27768 + }, + { + "epoch": 0.16515010943001238, + "grad_norm": 2.2661242485046387, + "learning_rate": 4.671017581882507e-05, + "loss": 3.9163, + "step": 27769 + }, + { + "epoch": 0.16515605671329336, + "grad_norm": 1.9908580780029297, + "learning_rate": 4.6709944203282905e-05, + "loss": 4.5396, + "step": 27770 + }, + { + "epoch": 0.16516200399657435, + "grad_norm": 1.7676030397415161, + "learning_rate": 4.6709712580162014e-05, + "loss": 4.3445, + "step": 27771 + }, + { + "epoch": 0.16516795127985537, + "grad_norm": 2.308959722518921, + "learning_rate": 4.670948094946248e-05, + "loss": 3.3659, + "step": 27772 + }, + { + "epoch": 0.16517389856313636, + "grad_norm": 2.0675418376922607, + "learning_rate": 4.67092493111844e-05, + "loss": 3.5967, + "step": 27773 + }, + { + "epoch": 0.16517984584641734, + "grad_norm": 2.192368268966675, + "learning_rate": 4.670901766532784e-05, + "loss": 3.7969, + "step": 27774 + }, + { + "epoch": 0.16518579312969836, + "grad_norm": 2.0077974796295166, + "learning_rate": 4.670878601189289e-05, + "loss": 3.43, + "step": 27775 + }, + { + "epoch": 0.16519174041297935, + "grad_norm": 2.169884443283081, + "learning_rate": 4.670855435087963e-05, + "loss": 4.8072, + "step": 27776 + }, + { + "epoch": 0.16519768769626034, + "grad_norm": 2.4910061359405518, + "learning_rate": 4.670832268228813e-05, + "loss": 3.5874, + "step": 27777 + }, + { + "epoch": 0.16520363497954135, + "grad_norm": 2.0694758892059326, + "learning_rate": 4.670809100611848e-05, + "loss": 4.3965, + "step": 27778 + }, + { + "epoch": 0.16520958226282234, + "grad_norm": 1.5337821245193481, + "learning_rate": 4.670785932237076e-05, + "loss": 4.8369, + "step": 27779 + }, + { + "epoch": 0.16521552954610333, + "grad_norm": 1.8797821998596191, + "learning_rate": 4.670762763104506e-05, + "loss": 5.2661, + "step": 27780 + }, + { + "epoch": 0.16522147682938434, + "grad_norm": 1.6902687549591064, + "learning_rate": 4.670739593214144e-05, + "loss": 5.4648, + "step": 27781 + }, + { + "epoch": 0.16522742411266533, + "grad_norm": 1.485190987586975, + "learning_rate": 4.670716422565999e-05, + "loss": 4.9547, + "step": 27782 + }, + { + "epoch": 0.16523337139594632, + "grad_norm": 1.7863634824752808, + "learning_rate": 4.670693251160081e-05, + "loss": 4.9542, + "step": 27783 + }, + { + "epoch": 0.16523931867922734, + "grad_norm": 1.7533354759216309, + "learning_rate": 4.670670078996395e-05, + "loss": 4.7394, + "step": 27784 + }, + { + "epoch": 0.16524526596250833, + "grad_norm": 1.7423986196517944, + "learning_rate": 4.670646906074951e-05, + "loss": 4.8273, + "step": 27785 + }, + { + "epoch": 0.1652512132457893, + "grad_norm": 1.3752869367599487, + "learning_rate": 4.670623732395756e-05, + "loss": 5.0926, + "step": 27786 + }, + { + "epoch": 0.16525716052907033, + "grad_norm": 1.5826581716537476, + "learning_rate": 4.67060055795882e-05, + "loss": 5.167, + "step": 27787 + }, + { + "epoch": 0.16526310781235132, + "grad_norm": 1.5029367208480835, + "learning_rate": 4.6705773827641485e-05, + "loss": 5.145, + "step": 27788 + }, + { + "epoch": 0.1652690550956323, + "grad_norm": 1.720220923423767, + "learning_rate": 4.670554206811751e-05, + "loss": 5.2389, + "step": 27789 + }, + { + "epoch": 0.16527500237891332, + "grad_norm": 1.8043471574783325, + "learning_rate": 4.6705310301016355e-05, + "loss": 5.0942, + "step": 27790 + }, + { + "epoch": 0.1652809496621943, + "grad_norm": 1.7888808250427246, + "learning_rate": 4.670507852633811e-05, + "loss": 5.2764, + "step": 27791 + }, + { + "epoch": 0.1652868969454753, + "grad_norm": 1.6223100423812866, + "learning_rate": 4.6704846744082835e-05, + "loss": 5.2812, + "step": 27792 + }, + { + "epoch": 0.1652928442287563, + "grad_norm": 1.5120409727096558, + "learning_rate": 4.670461495425063e-05, + "loss": 5.2022, + "step": 27793 + }, + { + "epoch": 0.1652987915120373, + "grad_norm": 1.5947920083999634, + "learning_rate": 4.670438315684156e-05, + "loss": 5.2711, + "step": 27794 + }, + { + "epoch": 0.1653047387953183, + "grad_norm": 1.6690993309020996, + "learning_rate": 4.6704151351855716e-05, + "loss": 4.8284, + "step": 27795 + }, + { + "epoch": 0.1653106860785993, + "grad_norm": 1.4904134273529053, + "learning_rate": 4.670391953929318e-05, + "loss": 5.2171, + "step": 27796 + }, + { + "epoch": 0.1653166333618803, + "grad_norm": 1.556333065032959, + "learning_rate": 4.6703687719154034e-05, + "loss": 5.6598, + "step": 27797 + }, + { + "epoch": 0.16532258064516128, + "grad_norm": 1.55083167552948, + "learning_rate": 4.670345589143835e-05, + "loss": 5.5919, + "step": 27798 + }, + { + "epoch": 0.1653285279284423, + "grad_norm": 1.9281244277954102, + "learning_rate": 4.670322405614621e-05, + "loss": 5.3313, + "step": 27799 + }, + { + "epoch": 0.1653344752117233, + "grad_norm": 1.4666374921798706, + "learning_rate": 4.670299221327771e-05, + "loss": 5.0905, + "step": 27800 + }, + { + "epoch": 0.16534042249500427, + "grad_norm": 1.8032478094100952, + "learning_rate": 4.670276036283291e-05, + "loss": 4.9322, + "step": 27801 + }, + { + "epoch": 0.1653463697782853, + "grad_norm": 1.7652195692062378, + "learning_rate": 4.67025285048119e-05, + "loss": 4.6763, + "step": 27802 + }, + { + "epoch": 0.16535231706156628, + "grad_norm": 1.7903348207473755, + "learning_rate": 4.6702296639214766e-05, + "loss": 4.491, + "step": 27803 + }, + { + "epoch": 0.16535826434484727, + "grad_norm": 1.6135162115097046, + "learning_rate": 4.6702064766041584e-05, + "loss": 4.3593, + "step": 27804 + }, + { + "epoch": 0.16536421162812828, + "grad_norm": 1.5779284238815308, + "learning_rate": 4.670183288529243e-05, + "loss": 4.3606, + "step": 27805 + }, + { + "epoch": 0.16537015891140927, + "grad_norm": 1.6469144821166992, + "learning_rate": 4.67016009969674e-05, + "loss": 4.3772, + "step": 27806 + }, + { + "epoch": 0.16537610619469026, + "grad_norm": 2.209540367126465, + "learning_rate": 4.670136910106656e-05, + "loss": 5.1859, + "step": 27807 + }, + { + "epoch": 0.16538205347797127, + "grad_norm": 2.5719592571258545, + "learning_rate": 4.670113719758999e-05, + "loss": 5.1312, + "step": 27808 + }, + { + "epoch": 0.16538800076125226, + "grad_norm": 2.1322646141052246, + "learning_rate": 4.670090528653779e-05, + "loss": 5.5602, + "step": 27809 + }, + { + "epoch": 0.16539394804453325, + "grad_norm": 1.8350342512130737, + "learning_rate": 4.670067336791002e-05, + "loss": 5.6963, + "step": 27810 + }, + { + "epoch": 0.16539989532781427, + "grad_norm": 1.6520220041275024, + "learning_rate": 4.670044144170677e-05, + "loss": 5.8053, + "step": 27811 + }, + { + "epoch": 0.16540584261109526, + "grad_norm": 1.559950590133667, + "learning_rate": 4.670020950792812e-05, + "loss": 5.5382, + "step": 27812 + }, + { + "epoch": 0.16541178989437624, + "grad_norm": 1.7970432043075562, + "learning_rate": 4.669997756657415e-05, + "loss": 4.7823, + "step": 27813 + }, + { + "epoch": 0.16541773717765726, + "grad_norm": 1.8613402843475342, + "learning_rate": 4.6699745617644945e-05, + "loss": 5.3559, + "step": 27814 + }, + { + "epoch": 0.16542368446093825, + "grad_norm": 2.660762310028076, + "learning_rate": 4.669951366114058e-05, + "loss": 4.7255, + "step": 27815 + }, + { + "epoch": 0.16542963174421924, + "grad_norm": 2.8636231422424316, + "learning_rate": 4.669928169706114e-05, + "loss": 4.8591, + "step": 27816 + }, + { + "epoch": 0.16543557902750025, + "grad_norm": 1.6894406080245972, + "learning_rate": 4.669904972540671e-05, + "loss": 5.0576, + "step": 27817 + }, + { + "epoch": 0.16544152631078124, + "grad_norm": 2.539830207824707, + "learning_rate": 4.669881774617736e-05, + "loss": 4.9346, + "step": 27818 + }, + { + "epoch": 0.16544747359406223, + "grad_norm": 2.0870940685272217, + "learning_rate": 4.669858575937318e-05, + "loss": 5.0034, + "step": 27819 + }, + { + "epoch": 0.16545342087734322, + "grad_norm": 1.6307538747787476, + "learning_rate": 4.669835376499425e-05, + "loss": 5.4536, + "step": 27820 + }, + { + "epoch": 0.16545936816062423, + "grad_norm": 1.1906611919403076, + "learning_rate": 4.669812176304064e-05, + "loss": 5.5965, + "step": 27821 + }, + { + "epoch": 0.16546531544390522, + "grad_norm": 1.5987422466278076, + "learning_rate": 4.669788975351245e-05, + "loss": 5.4403, + "step": 27822 + }, + { + "epoch": 0.1654712627271862, + "grad_norm": 2.267430543899536, + "learning_rate": 4.669765773640974e-05, + "loss": 5.0344, + "step": 27823 + }, + { + "epoch": 0.16547721001046722, + "grad_norm": 2.2842605113983154, + "learning_rate": 4.669742571173261e-05, + "loss": 4.481, + "step": 27824 + }, + { + "epoch": 0.1654831572937482, + "grad_norm": 1.5940486192703247, + "learning_rate": 4.6697193679481135e-05, + "loss": 5.1313, + "step": 27825 + }, + { + "epoch": 0.1654891045770292, + "grad_norm": 1.9549680948257446, + "learning_rate": 4.6696961639655386e-05, + "loss": 5.1298, + "step": 27826 + }, + { + "epoch": 0.16549505186031022, + "grad_norm": 2.387866497039795, + "learning_rate": 4.6696729592255454e-05, + "loss": 4.9029, + "step": 27827 + }, + { + "epoch": 0.1655009991435912, + "grad_norm": 1.6883118152618408, + "learning_rate": 4.669649753728142e-05, + "loss": 5.1273, + "step": 27828 + }, + { + "epoch": 0.1655069464268722, + "grad_norm": 1.6538794040679932, + "learning_rate": 4.669626547473336e-05, + "loss": 5.2022, + "step": 27829 + }, + { + "epoch": 0.1655128937101532, + "grad_norm": 1.7652950286865234, + "learning_rate": 4.669603340461136e-05, + "loss": 5.5397, + "step": 27830 + }, + { + "epoch": 0.1655188409934342, + "grad_norm": 1.6421597003936768, + "learning_rate": 4.66958013269155e-05, + "loss": 4.9982, + "step": 27831 + }, + { + "epoch": 0.16552478827671518, + "grad_norm": 1.5024685859680176, + "learning_rate": 4.669556924164586e-05, + "loss": 5.6933, + "step": 27832 + }, + { + "epoch": 0.1655307355599962, + "grad_norm": 1.4680891036987305, + "learning_rate": 4.669533714880252e-05, + "loss": 5.3262, + "step": 27833 + }, + { + "epoch": 0.1655366828432772, + "grad_norm": 1.375623345375061, + "learning_rate": 4.669510504838556e-05, + "loss": 5.2673, + "step": 27834 + }, + { + "epoch": 0.16554263012655818, + "grad_norm": 2.1354503631591797, + "learning_rate": 4.669487294039506e-05, + "loss": 4.2156, + "step": 27835 + }, + { + "epoch": 0.1655485774098392, + "grad_norm": 1.5564913749694824, + "learning_rate": 4.669464082483112e-05, + "loss": 4.7238, + "step": 27836 + }, + { + "epoch": 0.16555452469312018, + "grad_norm": 1.6255192756652832, + "learning_rate": 4.669440870169379e-05, + "loss": 5.6043, + "step": 27837 + }, + { + "epoch": 0.16556047197640117, + "grad_norm": 1.6268353462219238, + "learning_rate": 4.6694176570983174e-05, + "loss": 5.3919, + "step": 27838 + }, + { + "epoch": 0.16556641925968218, + "grad_norm": 1.5626128911972046, + "learning_rate": 4.669394443269933e-05, + "loss": 5.5142, + "step": 27839 + }, + { + "epoch": 0.16557236654296317, + "grad_norm": 1.5001987218856812, + "learning_rate": 4.669371228684237e-05, + "loss": 4.7294, + "step": 27840 + }, + { + "epoch": 0.16557831382624416, + "grad_norm": 1.5922046899795532, + "learning_rate": 4.669348013341235e-05, + "loss": 4.9363, + "step": 27841 + }, + { + "epoch": 0.16558426110952518, + "grad_norm": 1.555086374282837, + "learning_rate": 4.669324797240937e-05, + "loss": 4.6704, + "step": 27842 + }, + { + "epoch": 0.16559020839280617, + "grad_norm": 1.711538553237915, + "learning_rate": 4.66930158038335e-05, + "loss": 4.673, + "step": 27843 + }, + { + "epoch": 0.16559615567608715, + "grad_norm": 1.7905937433242798, + "learning_rate": 4.669278362768481e-05, + "loss": 4.5295, + "step": 27844 + }, + { + "epoch": 0.16560210295936817, + "grad_norm": 1.8714954853057861, + "learning_rate": 4.669255144396341e-05, + "loss": 4.699, + "step": 27845 + }, + { + "epoch": 0.16560805024264916, + "grad_norm": 1.6783734560012817, + "learning_rate": 4.669231925266935e-05, + "loss": 5.5447, + "step": 27846 + }, + { + "epoch": 0.16561399752593015, + "grad_norm": 1.3632158041000366, + "learning_rate": 4.669208705380273e-05, + "loss": 5.5541, + "step": 27847 + }, + { + "epoch": 0.16561994480921116, + "grad_norm": 1.6476699113845825, + "learning_rate": 4.669185484736362e-05, + "loss": 4.5751, + "step": 27848 + }, + { + "epoch": 0.16562589209249215, + "grad_norm": 1.630963921546936, + "learning_rate": 4.669162263335212e-05, + "loss": 5.3621, + "step": 27849 + }, + { + "epoch": 0.16563183937577314, + "grad_norm": 1.4858328104019165, + "learning_rate": 4.66913904117683e-05, + "loss": 5.3973, + "step": 27850 + }, + { + "epoch": 0.16563778665905415, + "grad_norm": 1.7069036960601807, + "learning_rate": 4.669115818261223e-05, + "loss": 5.0102, + "step": 27851 + }, + { + "epoch": 0.16564373394233514, + "grad_norm": 1.4385701417922974, + "learning_rate": 4.6690925945884e-05, + "loss": 5.4805, + "step": 27852 + }, + { + "epoch": 0.16564968122561613, + "grad_norm": 1.6895365715026855, + "learning_rate": 4.66906937015837e-05, + "loss": 4.9834, + "step": 27853 + }, + { + "epoch": 0.16565562850889715, + "grad_norm": 2.1618361473083496, + "learning_rate": 4.66904614497114e-05, + "loss": 4.6309, + "step": 27854 + }, + { + "epoch": 0.16566157579217813, + "grad_norm": 2.331005811691284, + "learning_rate": 4.669022919026718e-05, + "loss": 4.1853, + "step": 27855 + }, + { + "epoch": 0.16566752307545912, + "grad_norm": 1.7161813974380493, + "learning_rate": 4.668999692325113e-05, + "loss": 4.5842, + "step": 27856 + }, + { + "epoch": 0.16567347035874014, + "grad_norm": 2.117947816848755, + "learning_rate": 4.668976464866332e-05, + "loss": 4.6009, + "step": 27857 + }, + { + "epoch": 0.16567941764202113, + "grad_norm": 1.6272234916687012, + "learning_rate": 4.6689532366503846e-05, + "loss": 4.8592, + "step": 27858 + }, + { + "epoch": 0.16568536492530211, + "grad_norm": 1.9852404594421387, + "learning_rate": 4.6689300076772776e-05, + "loss": 4.363, + "step": 27859 + }, + { + "epoch": 0.16569131220858313, + "grad_norm": 1.6235220432281494, + "learning_rate": 4.6689067779470194e-05, + "loss": 4.6625, + "step": 27860 + }, + { + "epoch": 0.16569725949186412, + "grad_norm": 1.7212275266647339, + "learning_rate": 4.668883547459618e-05, + "loss": 4.7013, + "step": 27861 + }, + { + "epoch": 0.1657032067751451, + "grad_norm": 2.5496368408203125, + "learning_rate": 4.6688603162150824e-05, + "loss": 4.0435, + "step": 27862 + }, + { + "epoch": 0.16570915405842612, + "grad_norm": 2.681366443634033, + "learning_rate": 4.66883708421342e-05, + "loss": 4.4567, + "step": 27863 + }, + { + "epoch": 0.1657151013417071, + "grad_norm": 2.2227134704589844, + "learning_rate": 4.668813851454639e-05, + "loss": 4.5467, + "step": 27864 + }, + { + "epoch": 0.1657210486249881, + "grad_norm": 2.413037061691284, + "learning_rate": 4.668790617938748e-05, + "loss": 4.1955, + "step": 27865 + }, + { + "epoch": 0.16572699590826911, + "grad_norm": 2.749058723449707, + "learning_rate": 4.668767383665753e-05, + "loss": 4.1209, + "step": 27866 + }, + { + "epoch": 0.1657329431915501, + "grad_norm": 2.075108528137207, + "learning_rate": 4.668744148635665e-05, + "loss": 4.2322, + "step": 27867 + }, + { + "epoch": 0.1657388904748311, + "grad_norm": 1.7476239204406738, + "learning_rate": 4.66872091284849e-05, + "loss": 4.7075, + "step": 27868 + }, + { + "epoch": 0.1657448377581121, + "grad_norm": 1.7722108364105225, + "learning_rate": 4.6686976763042376e-05, + "loss": 4.7211, + "step": 27869 + }, + { + "epoch": 0.1657507850413931, + "grad_norm": 1.57614266872406, + "learning_rate": 4.668674439002915e-05, + "loss": 4.8495, + "step": 27870 + }, + { + "epoch": 0.16575673232467408, + "grad_norm": 1.5763459205627441, + "learning_rate": 4.6686512009445306e-05, + "loss": 5.1311, + "step": 27871 + }, + { + "epoch": 0.1657626796079551, + "grad_norm": 1.5253850221633911, + "learning_rate": 4.6686279621290925e-05, + "loss": 5.3513, + "step": 27872 + }, + { + "epoch": 0.1657686268912361, + "grad_norm": 1.8837103843688965, + "learning_rate": 4.668604722556609e-05, + "loss": 4.9349, + "step": 27873 + }, + { + "epoch": 0.16577457417451708, + "grad_norm": 1.7190310955047607, + "learning_rate": 4.668581482227087e-05, + "loss": 5.4962, + "step": 27874 + }, + { + "epoch": 0.1657805214577981, + "grad_norm": 1.6501142978668213, + "learning_rate": 4.668558241140537e-05, + "loss": 5.0092, + "step": 27875 + }, + { + "epoch": 0.16578646874107908, + "grad_norm": 2.03367018699646, + "learning_rate": 4.668534999296965e-05, + "loss": 5.2323, + "step": 27876 + }, + { + "epoch": 0.16579241602436007, + "grad_norm": 2.455427885055542, + "learning_rate": 4.66851175669638e-05, + "loss": 4.2927, + "step": 27877 + }, + { + "epoch": 0.16579836330764108, + "grad_norm": 2.443146228790283, + "learning_rate": 4.668488513338789e-05, + "loss": 4.3029, + "step": 27878 + }, + { + "epoch": 0.16580431059092207, + "grad_norm": 2.656646251678467, + "learning_rate": 4.6684652692242026e-05, + "loss": 4.2249, + "step": 27879 + }, + { + "epoch": 0.16581025787420306, + "grad_norm": 2.4562222957611084, + "learning_rate": 4.668442024352626e-05, + "loss": 4.5162, + "step": 27880 + }, + { + "epoch": 0.16581620515748405, + "grad_norm": 2.8980703353881836, + "learning_rate": 4.6684187787240695e-05, + "loss": 4.1083, + "step": 27881 + }, + { + "epoch": 0.16582215244076506, + "grad_norm": 2.5985610485076904, + "learning_rate": 4.668395532338541e-05, + "loss": 4.1557, + "step": 27882 + }, + { + "epoch": 0.16582809972404605, + "grad_norm": 2.4054651260375977, + "learning_rate": 4.6683722851960465e-05, + "loss": 4.2334, + "step": 27883 + }, + { + "epoch": 0.16583404700732704, + "grad_norm": 2.0977237224578857, + "learning_rate": 4.668349037296597e-05, + "loss": 4.5715, + "step": 27884 + }, + { + "epoch": 0.16583999429060806, + "grad_norm": 2.0701677799224854, + "learning_rate": 4.6683257886401985e-05, + "loss": 4.7195, + "step": 27885 + }, + { + "epoch": 0.16584594157388904, + "grad_norm": 1.9294004440307617, + "learning_rate": 4.6683025392268597e-05, + "loss": 4.6521, + "step": 27886 + }, + { + "epoch": 0.16585188885717003, + "grad_norm": 2.1713595390319824, + "learning_rate": 4.66827928905659e-05, + "loss": 4.7052, + "step": 27887 + }, + { + "epoch": 0.16585783614045105, + "grad_norm": 2.835434913635254, + "learning_rate": 4.668256038129395e-05, + "loss": 4.4006, + "step": 27888 + }, + { + "epoch": 0.16586378342373204, + "grad_norm": 2.466986894607544, + "learning_rate": 4.668232786445285e-05, + "loss": 4.3107, + "step": 27889 + }, + { + "epoch": 0.16586973070701302, + "grad_norm": 1.7013013362884521, + "learning_rate": 4.6682095340042675e-05, + "loss": 4.4813, + "step": 27890 + }, + { + "epoch": 0.16587567799029404, + "grad_norm": 1.7486096620559692, + "learning_rate": 4.66818628080635e-05, + "loss": 4.6227, + "step": 27891 + }, + { + "epoch": 0.16588162527357503, + "grad_norm": 1.6579736471176147, + "learning_rate": 4.6681630268515407e-05, + "loss": 4.7124, + "step": 27892 + }, + { + "epoch": 0.16588757255685602, + "grad_norm": 1.5885511636734009, + "learning_rate": 4.668139772139849e-05, + "loss": 4.6244, + "step": 27893 + }, + { + "epoch": 0.16589351984013703, + "grad_norm": 1.5703203678131104, + "learning_rate": 4.668116516671282e-05, + "loss": 4.7233, + "step": 27894 + }, + { + "epoch": 0.16589946712341802, + "grad_norm": 1.6852905750274658, + "learning_rate": 4.668093260445847e-05, + "loss": 5.0091, + "step": 27895 + }, + { + "epoch": 0.165905414406699, + "grad_norm": 1.7425652742385864, + "learning_rate": 4.668070003463554e-05, + "loss": 5.0271, + "step": 27896 + }, + { + "epoch": 0.16591136168998002, + "grad_norm": 1.7271431684494019, + "learning_rate": 4.6680467457244104e-05, + "loss": 4.525, + "step": 27897 + }, + { + "epoch": 0.165917308973261, + "grad_norm": 1.8759088516235352, + "learning_rate": 4.668023487228423e-05, + "loss": 4.4729, + "step": 27898 + }, + { + "epoch": 0.165923256256542, + "grad_norm": 1.5073447227478027, + "learning_rate": 4.668000227975602e-05, + "loss": 4.8768, + "step": 27899 + }, + { + "epoch": 0.16592920353982302, + "grad_norm": 1.3689100742340088, + "learning_rate": 4.667976967965954e-05, + "loss": 5.1424, + "step": 27900 + }, + { + "epoch": 0.165935150823104, + "grad_norm": 1.7475918531417847, + "learning_rate": 4.6679537071994874e-05, + "loss": 4.7103, + "step": 27901 + }, + { + "epoch": 0.165941098106385, + "grad_norm": 1.5559403896331787, + "learning_rate": 4.6679304456762107e-05, + "loss": 5.0524, + "step": 27902 + }, + { + "epoch": 0.165947045389666, + "grad_norm": 1.7627094984054565, + "learning_rate": 4.667907183396132e-05, + "loss": 4.9901, + "step": 27903 + }, + { + "epoch": 0.165952992672947, + "grad_norm": 1.8173136711120605, + "learning_rate": 4.667883920359259e-05, + "loss": 4.6419, + "step": 27904 + }, + { + "epoch": 0.16595893995622799, + "grad_norm": 2.0207037925720215, + "learning_rate": 4.667860656565601e-05, + "loss": 5.2537, + "step": 27905 + }, + { + "epoch": 0.165964887239509, + "grad_norm": 1.6715987920761108, + "learning_rate": 4.6678373920151646e-05, + "loss": 5.0337, + "step": 27906 + }, + { + "epoch": 0.16597083452279, + "grad_norm": 1.6425293684005737, + "learning_rate": 4.667814126707959e-05, + "loss": 5.0065, + "step": 27907 + }, + { + "epoch": 0.16597678180607098, + "grad_norm": 1.8118547201156616, + "learning_rate": 4.667790860643991e-05, + "loss": 4.9293, + "step": 27908 + }, + { + "epoch": 0.165982729089352, + "grad_norm": 1.5994832515716553, + "learning_rate": 4.66776759382327e-05, + "loss": 5.2379, + "step": 27909 + }, + { + "epoch": 0.16598867637263298, + "grad_norm": 1.6475836038589478, + "learning_rate": 4.667744326245804e-05, + "loss": 5.4609, + "step": 27910 + }, + { + "epoch": 0.16599462365591397, + "grad_norm": 1.4168953895568848, + "learning_rate": 4.6677210579116e-05, + "loss": 5.5907, + "step": 27911 + }, + { + "epoch": 0.16600057093919499, + "grad_norm": 1.46638822555542, + "learning_rate": 4.667697788820669e-05, + "loss": 5.4639, + "step": 27912 + }, + { + "epoch": 0.16600651822247597, + "grad_norm": 1.6889718770980835, + "learning_rate": 4.667674518973015e-05, + "loss": 5.4013, + "step": 27913 + }, + { + "epoch": 0.16601246550575696, + "grad_norm": 1.8182064294815063, + "learning_rate": 4.6676512483686495e-05, + "loss": 4.7796, + "step": 27914 + }, + { + "epoch": 0.16601841278903798, + "grad_norm": 1.6663529872894287, + "learning_rate": 4.6676279770075784e-05, + "loss": 4.8987, + "step": 27915 + }, + { + "epoch": 0.16602436007231897, + "grad_norm": 1.762170672416687, + "learning_rate": 4.6676047048898115e-05, + "loss": 4.8513, + "step": 27916 + }, + { + "epoch": 0.16603030735559995, + "grad_norm": 1.6480133533477783, + "learning_rate": 4.6675814320153554e-05, + "loss": 4.7579, + "step": 27917 + }, + { + "epoch": 0.16603625463888097, + "grad_norm": 1.698567271232605, + "learning_rate": 4.66755815838422e-05, + "loss": 4.8489, + "step": 27918 + }, + { + "epoch": 0.16604220192216196, + "grad_norm": 1.5158538818359375, + "learning_rate": 4.667534883996412e-05, + "loss": 4.878, + "step": 27919 + }, + { + "epoch": 0.16604814920544295, + "grad_norm": 2.1120738983154297, + "learning_rate": 4.66751160885194e-05, + "loss": 4.8203, + "step": 27920 + }, + { + "epoch": 0.16605409648872396, + "grad_norm": 1.8523337841033936, + "learning_rate": 4.667488332950812e-05, + "loss": 4.79, + "step": 27921 + }, + { + "epoch": 0.16606004377200495, + "grad_norm": 1.9057866334915161, + "learning_rate": 4.6674650562930364e-05, + "loss": 4.55, + "step": 27922 + }, + { + "epoch": 0.16606599105528594, + "grad_norm": 1.690329670906067, + "learning_rate": 4.6674417788786206e-05, + "loss": 4.8434, + "step": 27923 + }, + { + "epoch": 0.16607193833856695, + "grad_norm": 1.796695590019226, + "learning_rate": 4.667418500707574e-05, + "loss": 4.8883, + "step": 27924 + }, + { + "epoch": 0.16607788562184794, + "grad_norm": 1.9424879550933838, + "learning_rate": 4.6673952217799035e-05, + "loss": 4.2556, + "step": 27925 + }, + { + "epoch": 0.16608383290512893, + "grad_norm": 1.886226773262024, + "learning_rate": 4.6673719420956176e-05, + "loss": 4.5911, + "step": 27926 + }, + { + "epoch": 0.16608978018840995, + "grad_norm": 2.1246280670166016, + "learning_rate": 4.6673486616547254e-05, + "loss": 4.5398, + "step": 27927 + }, + { + "epoch": 0.16609572747169093, + "grad_norm": 2.219155788421631, + "learning_rate": 4.667325380457233e-05, + "loss": 4.6747, + "step": 27928 + }, + { + "epoch": 0.16610167475497192, + "grad_norm": 2.0169975757598877, + "learning_rate": 4.66730209850315e-05, + "loss": 4.7622, + "step": 27929 + }, + { + "epoch": 0.16610762203825294, + "grad_norm": 1.884619116783142, + "learning_rate": 4.667278815792485e-05, + "loss": 5.0192, + "step": 27930 + }, + { + "epoch": 0.16611356932153393, + "grad_norm": 1.8539994955062866, + "learning_rate": 4.6672555323252446e-05, + "loss": 4.2732, + "step": 27931 + }, + { + "epoch": 0.16611951660481492, + "grad_norm": 2.045879364013672, + "learning_rate": 4.667232248101439e-05, + "loss": 3.8245, + "step": 27932 + }, + { + "epoch": 0.16612546388809593, + "grad_norm": 2.005019426345825, + "learning_rate": 4.667208963121073e-05, + "loss": 3.9687, + "step": 27933 + }, + { + "epoch": 0.16613141117137692, + "grad_norm": 1.7998180389404297, + "learning_rate": 4.667185677384158e-05, + "loss": 3.84, + "step": 27934 + }, + { + "epoch": 0.1661373584546579, + "grad_norm": 1.9813350439071655, + "learning_rate": 4.6671623908907e-05, + "loss": 3.7387, + "step": 27935 + }, + { + "epoch": 0.16614330573793892, + "grad_norm": 1.9212778806686401, + "learning_rate": 4.6671391036407086e-05, + "loss": 3.48, + "step": 27936 + }, + { + "epoch": 0.1661492530212199, + "grad_norm": 1.9081000089645386, + "learning_rate": 4.667115815634192e-05, + "loss": 3.4218, + "step": 27937 + }, + { + "epoch": 0.1661552003045009, + "grad_norm": 2.209960699081421, + "learning_rate": 4.667092526871156e-05, + "loss": 3.7272, + "step": 27938 + }, + { + "epoch": 0.1661611475877819, + "grad_norm": 2.3802664279937744, + "learning_rate": 4.6670692373516124e-05, + "loss": 3.6476, + "step": 27939 + }, + { + "epoch": 0.1661670948710629, + "grad_norm": 2.359929323196411, + "learning_rate": 4.667045947075566e-05, + "loss": 3.7406, + "step": 27940 + }, + { + "epoch": 0.1661730421543439, + "grad_norm": 2.242333173751831, + "learning_rate": 4.6670226560430266e-05, + "loss": 3.8315, + "step": 27941 + }, + { + "epoch": 0.16617898943762488, + "grad_norm": 1.7727068662643433, + "learning_rate": 4.6669993642540017e-05, + "loss": 4.6083, + "step": 27942 + }, + { + "epoch": 0.1661849367209059, + "grad_norm": 2.2704246044158936, + "learning_rate": 4.6669760717085e-05, + "loss": 3.947, + "step": 27943 + }, + { + "epoch": 0.16619088400418688, + "grad_norm": 2.550279140472412, + "learning_rate": 4.6669527784065295e-05, + "loss": 3.5335, + "step": 27944 + }, + { + "epoch": 0.16619683128746787, + "grad_norm": 2.455237627029419, + "learning_rate": 4.666929484348097e-05, + "loss": 3.5817, + "step": 27945 + }, + { + "epoch": 0.1662027785707489, + "grad_norm": 1.9026764631271362, + "learning_rate": 4.666906189533213e-05, + "loss": 3.4742, + "step": 27946 + }, + { + "epoch": 0.16620872585402988, + "grad_norm": 1.9334417581558228, + "learning_rate": 4.6668828939618845e-05, + "loss": 3.3938, + "step": 27947 + }, + { + "epoch": 0.16621467313731086, + "grad_norm": 1.9052705764770508, + "learning_rate": 4.666859597634119e-05, + "loss": 4.0506, + "step": 27948 + }, + { + "epoch": 0.16622062042059188, + "grad_norm": 1.702767252922058, + "learning_rate": 4.666836300549926e-05, + "loss": 5.1613, + "step": 27949 + }, + { + "epoch": 0.16622656770387287, + "grad_norm": 2.1399359703063965, + "learning_rate": 4.666813002709312e-05, + "loss": 4.9766, + "step": 27950 + }, + { + "epoch": 0.16623251498715386, + "grad_norm": 2.493435859680176, + "learning_rate": 4.666789704112286e-05, + "loss": 4.2058, + "step": 27951 + }, + { + "epoch": 0.16623846227043487, + "grad_norm": 2.689168930053711, + "learning_rate": 4.666766404758857e-05, + "loss": 3.7151, + "step": 27952 + }, + { + "epoch": 0.16624440955371586, + "grad_norm": 2.172666311264038, + "learning_rate": 4.666743104649031e-05, + "loss": 3.6916, + "step": 27953 + }, + { + "epoch": 0.16625035683699685, + "grad_norm": 1.551274299621582, + "learning_rate": 4.6667198037828173e-05, + "loss": 4.9331, + "step": 27954 + }, + { + "epoch": 0.16625630412027786, + "grad_norm": 1.7849092483520508, + "learning_rate": 4.666696502160226e-05, + "loss": 5.288, + "step": 27955 + }, + { + "epoch": 0.16626225140355885, + "grad_norm": 1.8850775957107544, + "learning_rate": 4.6666731997812614e-05, + "loss": 5.1946, + "step": 27956 + }, + { + "epoch": 0.16626819868683984, + "grad_norm": 1.4710248708724976, + "learning_rate": 4.666649896645934e-05, + "loss": 5.2753, + "step": 27957 + }, + { + "epoch": 0.16627414597012086, + "grad_norm": 1.4987525939941406, + "learning_rate": 4.6666265927542516e-05, + "loss": 5.3751, + "step": 27958 + }, + { + "epoch": 0.16628009325340184, + "grad_norm": 1.5894343852996826, + "learning_rate": 4.666603288106223e-05, + "loss": 5.1087, + "step": 27959 + }, + { + "epoch": 0.16628604053668283, + "grad_norm": 1.491098165512085, + "learning_rate": 4.666579982701855e-05, + "loss": 5.1876, + "step": 27960 + }, + { + "epoch": 0.16629198781996385, + "grad_norm": 1.6574211120605469, + "learning_rate": 4.666556676541156e-05, + "loss": 5.1677, + "step": 27961 + }, + { + "epoch": 0.16629793510324484, + "grad_norm": 1.3962849378585815, + "learning_rate": 4.666533369624135e-05, + "loss": 4.6312, + "step": 27962 + }, + { + "epoch": 0.16630388238652583, + "grad_norm": 1.3819752931594849, + "learning_rate": 4.6665100619507986e-05, + "loss": 5.1794, + "step": 27963 + }, + { + "epoch": 0.16630982966980684, + "grad_norm": 1.392821192741394, + "learning_rate": 4.666486753521157e-05, + "loss": 5.192, + "step": 27964 + }, + { + "epoch": 0.16631577695308783, + "grad_norm": 1.3655375242233276, + "learning_rate": 4.6664634443352176e-05, + "loss": 5.0533, + "step": 27965 + }, + { + "epoch": 0.16632172423636882, + "grad_norm": 1.7046358585357666, + "learning_rate": 4.6664401343929864e-05, + "loss": 4.7244, + "step": 27966 + }, + { + "epoch": 0.16632767151964983, + "grad_norm": 1.8924365043640137, + "learning_rate": 4.6664168236944747e-05, + "loss": 4.8182, + "step": 27967 + }, + { + "epoch": 0.16633361880293082, + "grad_norm": 1.7032650709152222, + "learning_rate": 4.666393512239689e-05, + "loss": 4.6594, + "step": 27968 + }, + { + "epoch": 0.1663395660862118, + "grad_norm": 2.0425281524658203, + "learning_rate": 4.666370200028638e-05, + "loss": 4.0096, + "step": 27969 + }, + { + "epoch": 0.16634551336949283, + "grad_norm": 2.4013113975524902, + "learning_rate": 4.666346887061329e-05, + "loss": 3.7662, + "step": 27970 + }, + { + "epoch": 0.1663514606527738, + "grad_norm": 1.8698662519454956, + "learning_rate": 4.666323573337771e-05, + "loss": 4.2575, + "step": 27971 + }, + { + "epoch": 0.1663574079360548, + "grad_norm": 1.5415421724319458, + "learning_rate": 4.666300258857972e-05, + "loss": 4.739, + "step": 27972 + }, + { + "epoch": 0.16636335521933582, + "grad_norm": 1.79619562625885, + "learning_rate": 4.666276943621939e-05, + "loss": 4.7542, + "step": 27973 + }, + { + "epoch": 0.1663693025026168, + "grad_norm": 1.5592199563980103, + "learning_rate": 4.666253627629682e-05, + "loss": 4.5968, + "step": 27974 + }, + { + "epoch": 0.1663752497858978, + "grad_norm": 1.7237550020217896, + "learning_rate": 4.666230310881208e-05, + "loss": 4.6581, + "step": 27975 + }, + { + "epoch": 0.1663811970691788, + "grad_norm": 1.6247119903564453, + "learning_rate": 4.6662069933765255e-05, + "loss": 4.6803, + "step": 27976 + }, + { + "epoch": 0.1663871443524598, + "grad_norm": 1.6257696151733398, + "learning_rate": 4.666183675115643e-05, + "loss": 4.7591, + "step": 27977 + }, + { + "epoch": 0.1663930916357408, + "grad_norm": 1.6353588104248047, + "learning_rate": 4.666160356098567e-05, + "loss": 4.3821, + "step": 27978 + }, + { + "epoch": 0.1663990389190218, + "grad_norm": 1.7179335355758667, + "learning_rate": 4.666137036325308e-05, + "loss": 4.6386, + "step": 27979 + }, + { + "epoch": 0.1664049862023028, + "grad_norm": 1.6724573373794556, + "learning_rate": 4.6661137157958716e-05, + "loss": 4.596, + "step": 27980 + }, + { + "epoch": 0.16641093348558378, + "grad_norm": 1.8331623077392578, + "learning_rate": 4.666090394510269e-05, + "loss": 4.6706, + "step": 27981 + }, + { + "epoch": 0.1664168807688648, + "grad_norm": 1.5815516710281372, + "learning_rate": 4.666067072468505e-05, + "loss": 4.5764, + "step": 27982 + }, + { + "epoch": 0.16642282805214578, + "grad_norm": 1.6047725677490234, + "learning_rate": 4.66604374967059e-05, + "loss": 4.4228, + "step": 27983 + }, + { + "epoch": 0.16642877533542677, + "grad_norm": 2.057325601577759, + "learning_rate": 4.666020426116531e-05, + "loss": 3.886, + "step": 27984 + }, + { + "epoch": 0.1664347226187078, + "grad_norm": 2.2633588314056396, + "learning_rate": 4.6659971018063375e-05, + "loss": 4.2796, + "step": 27985 + }, + { + "epoch": 0.16644066990198877, + "grad_norm": 1.9848732948303223, + "learning_rate": 4.6659737767400166e-05, + "loss": 4.4349, + "step": 27986 + }, + { + "epoch": 0.16644661718526976, + "grad_norm": 1.8116247653961182, + "learning_rate": 4.6659504509175764e-05, + "loss": 4.5313, + "step": 27987 + }, + { + "epoch": 0.16645256446855078, + "grad_norm": 1.8909553289413452, + "learning_rate": 4.665927124339026e-05, + "loss": 4.5166, + "step": 27988 + }, + { + "epoch": 0.16645851175183177, + "grad_norm": 1.6827013492584229, + "learning_rate": 4.665903797004371e-05, + "loss": 4.7353, + "step": 27989 + }, + { + "epoch": 0.16646445903511276, + "grad_norm": 1.8081045150756836, + "learning_rate": 4.6658804689136227e-05, + "loss": 4.743, + "step": 27990 + }, + { + "epoch": 0.16647040631839377, + "grad_norm": 1.7859995365142822, + "learning_rate": 4.665857140066788e-05, + "loss": 4.6476, + "step": 27991 + }, + { + "epoch": 0.16647635360167476, + "grad_norm": 2.158141613006592, + "learning_rate": 4.665833810463874e-05, + "loss": 4.1541, + "step": 27992 + }, + { + "epoch": 0.16648230088495575, + "grad_norm": 2.059534788131714, + "learning_rate": 4.665810480104891e-05, + "loss": 4.2993, + "step": 27993 + }, + { + "epoch": 0.16648824816823676, + "grad_norm": 2.0945677757263184, + "learning_rate": 4.665787148989845e-05, + "loss": 4.5941, + "step": 27994 + }, + { + "epoch": 0.16649419545151775, + "grad_norm": 1.8792952299118042, + "learning_rate": 4.6657638171187455e-05, + "loss": 4.5735, + "step": 27995 + }, + { + "epoch": 0.16650014273479874, + "grad_norm": 1.7018059492111206, + "learning_rate": 4.665740484491601e-05, + "loss": 4.6591, + "step": 27996 + }, + { + "epoch": 0.16650609001807973, + "grad_norm": 1.6992706060409546, + "learning_rate": 4.6657171511084175e-05, + "loss": 4.512, + "step": 27997 + }, + { + "epoch": 0.16651203730136074, + "grad_norm": 1.7492562532424927, + "learning_rate": 4.6656938169692054e-05, + "loss": 4.6722, + "step": 27998 + }, + { + "epoch": 0.16651798458464173, + "grad_norm": 1.6457120180130005, + "learning_rate": 4.665670482073972e-05, + "loss": 4.5632, + "step": 27999 + }, + { + "epoch": 0.16652393186792272, + "grad_norm": 1.9052523374557495, + "learning_rate": 4.6656471464227246e-05, + "loss": 4.5678, + "step": 28000 + }, + { + "epoch": 0.16652987915120374, + "grad_norm": 1.7932218313217163, + "learning_rate": 4.665623810015473e-05, + "loss": 4.5433, + "step": 28001 + }, + { + "epoch": 0.16653582643448472, + "grad_norm": 1.7252825498580933, + "learning_rate": 4.665600472852224e-05, + "loss": 4.3902, + "step": 28002 + }, + { + "epoch": 0.1665417737177657, + "grad_norm": 1.810628890991211, + "learning_rate": 4.665577134932986e-05, + "loss": 4.242, + "step": 28003 + }, + { + "epoch": 0.16654772100104673, + "grad_norm": 1.7332589626312256, + "learning_rate": 4.6655537962577676e-05, + "loss": 4.2713, + "step": 28004 + }, + { + "epoch": 0.16655366828432772, + "grad_norm": 1.720533847808838, + "learning_rate": 4.6655304568265776e-05, + "loss": 4.3828, + "step": 28005 + }, + { + "epoch": 0.1665596155676087, + "grad_norm": 1.680240511894226, + "learning_rate": 4.665507116639423e-05, + "loss": 4.4578, + "step": 28006 + }, + { + "epoch": 0.16656556285088972, + "grad_norm": 1.6451648473739624, + "learning_rate": 4.665483775696311e-05, + "loss": 4.4493, + "step": 28007 + }, + { + "epoch": 0.1665715101341707, + "grad_norm": 1.8150712251663208, + "learning_rate": 4.665460433997252e-05, + "loss": 4.353, + "step": 28008 + }, + { + "epoch": 0.1665774574174517, + "grad_norm": 1.6858443021774292, + "learning_rate": 4.665437091542253e-05, + "loss": 4.2929, + "step": 28009 + }, + { + "epoch": 0.1665834047007327, + "grad_norm": 1.7269021272659302, + "learning_rate": 4.665413748331322e-05, + "loss": 4.2856, + "step": 28010 + }, + { + "epoch": 0.1665893519840137, + "grad_norm": 1.6517678499221802, + "learning_rate": 4.665390404364468e-05, + "loss": 4.977, + "step": 28011 + }, + { + "epoch": 0.1665952992672947, + "grad_norm": 1.8300232887268066, + "learning_rate": 4.665367059641698e-05, + "loss": 4.3829, + "step": 28012 + }, + { + "epoch": 0.1666012465505757, + "grad_norm": 1.7685927152633667, + "learning_rate": 4.6653437141630215e-05, + "loss": 4.3178, + "step": 28013 + }, + { + "epoch": 0.1666071938338567, + "grad_norm": 1.944615125656128, + "learning_rate": 4.665320367928445e-05, + "loss": 4.2248, + "step": 28014 + }, + { + "epoch": 0.16661314111713768, + "grad_norm": 2.097490072250366, + "learning_rate": 4.6652970209379775e-05, + "loss": 4.2814, + "step": 28015 + }, + { + "epoch": 0.1666190884004187, + "grad_norm": 1.5824095010757446, + "learning_rate": 4.665273673191628e-05, + "loss": 4.2074, + "step": 28016 + }, + { + "epoch": 0.16662503568369968, + "grad_norm": 1.6979020833969116, + "learning_rate": 4.665250324689403e-05, + "loss": 4.3534, + "step": 28017 + }, + { + "epoch": 0.16663098296698067, + "grad_norm": 1.7754404544830322, + "learning_rate": 4.6652269754313125e-05, + "loss": 4.3066, + "step": 28018 + }, + { + "epoch": 0.1666369302502617, + "grad_norm": 1.8645826578140259, + "learning_rate": 4.665203625417363e-05, + "loss": 4.1896, + "step": 28019 + }, + { + "epoch": 0.16664287753354268, + "grad_norm": 1.8967339992523193, + "learning_rate": 4.6651802746475633e-05, + "loss": 4.4092, + "step": 28020 + }, + { + "epoch": 0.16664882481682367, + "grad_norm": 1.76931893825531, + "learning_rate": 4.665156923121922e-05, + "loss": 4.5632, + "step": 28021 + }, + { + "epoch": 0.16665477210010468, + "grad_norm": 2.338927745819092, + "learning_rate": 4.665133570840446e-05, + "loss": 4.2858, + "step": 28022 + }, + { + "epoch": 0.16666071938338567, + "grad_norm": 1.747149109840393, + "learning_rate": 4.665110217803145e-05, + "loss": 4.6505, + "step": 28023 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 2.8555750846862793, + "learning_rate": 4.6650868640100254e-05, + "loss": 4.4681, + "step": 28024 + }, + { + "epoch": 0.16667261394994767, + "grad_norm": 2.284841299057007, + "learning_rate": 4.665063509461097e-05, + "loss": 3.9607, + "step": 28025 + }, + { + "epoch": 0.16667856123322866, + "grad_norm": 2.51346492767334, + "learning_rate": 4.6650401541563673e-05, + "loss": 3.8373, + "step": 28026 + }, + { + "epoch": 0.16668450851650965, + "grad_norm": 2.33309006690979, + "learning_rate": 4.6650167980958444e-05, + "loss": 3.8783, + "step": 28027 + }, + { + "epoch": 0.16669045579979067, + "grad_norm": 1.886756420135498, + "learning_rate": 4.664993441279536e-05, + "loss": 4.4607, + "step": 28028 + }, + { + "epoch": 0.16669640308307165, + "grad_norm": 1.6356357336044312, + "learning_rate": 4.664970083707452e-05, + "loss": 4.2901, + "step": 28029 + }, + { + "epoch": 0.16670235036635264, + "grad_norm": 2.366969108581543, + "learning_rate": 4.664946725379598e-05, + "loss": 3.7561, + "step": 28030 + }, + { + "epoch": 0.16670829764963366, + "grad_norm": 2.387471914291382, + "learning_rate": 4.664923366295984e-05, + "loss": 3.745, + "step": 28031 + }, + { + "epoch": 0.16671424493291465, + "grad_norm": 2.0741076469421387, + "learning_rate": 4.664900006456617e-05, + "loss": 3.6031, + "step": 28032 + }, + { + "epoch": 0.16672019221619563, + "grad_norm": 2.5359690189361572, + "learning_rate": 4.6648766458615066e-05, + "loss": 3.8495, + "step": 28033 + }, + { + "epoch": 0.16672613949947665, + "grad_norm": 2.423198938369751, + "learning_rate": 4.6648532845106604e-05, + "loss": 3.5664, + "step": 28034 + }, + { + "epoch": 0.16673208678275764, + "grad_norm": 1.7040067911148071, + "learning_rate": 4.664829922404086e-05, + "loss": 4.4474, + "step": 28035 + }, + { + "epoch": 0.16673803406603863, + "grad_norm": 1.8603174686431885, + "learning_rate": 4.664806559541791e-05, + "loss": 4.7263, + "step": 28036 + }, + { + "epoch": 0.16674398134931964, + "grad_norm": 1.7510238885879517, + "learning_rate": 4.664783195923785e-05, + "loss": 4.5566, + "step": 28037 + }, + { + "epoch": 0.16674992863260063, + "grad_norm": 1.6786305904388428, + "learning_rate": 4.6647598315500764e-05, + "loss": 4.5139, + "step": 28038 + }, + { + "epoch": 0.16675587591588162, + "grad_norm": 1.7382848262786865, + "learning_rate": 4.664736466420671e-05, + "loss": 4.5474, + "step": 28039 + }, + { + "epoch": 0.16676182319916263, + "grad_norm": 2.1977128982543945, + "learning_rate": 4.664713100535579e-05, + "loss": 3.8657, + "step": 28040 + }, + { + "epoch": 0.16676777048244362, + "grad_norm": 2.147538185119629, + "learning_rate": 4.664689733894808e-05, + "loss": 4.3258, + "step": 28041 + }, + { + "epoch": 0.1667737177657246, + "grad_norm": 1.6165980100631714, + "learning_rate": 4.6646663664983667e-05, + "loss": 5.6764, + "step": 28042 + }, + { + "epoch": 0.16677966504900563, + "grad_norm": 1.5513676404953003, + "learning_rate": 4.664642998346263e-05, + "loss": 5.286, + "step": 28043 + }, + { + "epoch": 0.16678561233228661, + "grad_norm": 2.4869754314422607, + "learning_rate": 4.664619629438504e-05, + "loss": 3.9925, + "step": 28044 + }, + { + "epoch": 0.1667915596155676, + "grad_norm": 1.9613736867904663, + "learning_rate": 4.6645962597750985e-05, + "loss": 4.3125, + "step": 28045 + }, + { + "epoch": 0.16679750689884862, + "grad_norm": 2.345310688018799, + "learning_rate": 4.664572889356055e-05, + "loss": 4.5785, + "step": 28046 + }, + { + "epoch": 0.1668034541821296, + "grad_norm": 1.6253316402435303, + "learning_rate": 4.664549518181382e-05, + "loss": 4.9924, + "step": 28047 + }, + { + "epoch": 0.1668094014654106, + "grad_norm": 1.7358524799346924, + "learning_rate": 4.664526146251087e-05, + "loss": 4.7523, + "step": 28048 + }, + { + "epoch": 0.1668153487486916, + "grad_norm": 1.6783114671707153, + "learning_rate": 4.664502773565178e-05, + "loss": 5.1525, + "step": 28049 + }, + { + "epoch": 0.1668212960319726, + "grad_norm": 1.102388858795166, + "learning_rate": 4.664479400123663e-05, + "loss": 5.5695, + "step": 28050 + }, + { + "epoch": 0.1668272433152536, + "grad_norm": 1.6548655033111572, + "learning_rate": 4.664456025926551e-05, + "loss": 4.7331, + "step": 28051 + }, + { + "epoch": 0.1668331905985346, + "grad_norm": 1.6468528509140015, + "learning_rate": 4.66443265097385e-05, + "loss": 4.7818, + "step": 28052 + }, + { + "epoch": 0.1668391378818156, + "grad_norm": 1.623849630355835, + "learning_rate": 4.664409275265568e-05, + "loss": 4.9336, + "step": 28053 + }, + { + "epoch": 0.16684508516509658, + "grad_norm": 1.4946188926696777, + "learning_rate": 4.664385898801713e-05, + "loss": 4.9361, + "step": 28054 + }, + { + "epoch": 0.16685103244837757, + "grad_norm": 1.6323179006576538, + "learning_rate": 4.664362521582293e-05, + "loss": 4.9529, + "step": 28055 + }, + { + "epoch": 0.16685697973165858, + "grad_norm": 1.579441785812378, + "learning_rate": 4.6643391436073165e-05, + "loss": 4.5593, + "step": 28056 + }, + { + "epoch": 0.16686292701493957, + "grad_norm": 1.578658103942871, + "learning_rate": 4.664315764876791e-05, + "loss": 4.5736, + "step": 28057 + }, + { + "epoch": 0.16686887429822056, + "grad_norm": 1.661720633506775, + "learning_rate": 4.664292385390726e-05, + "loss": 4.9137, + "step": 28058 + }, + { + "epoch": 0.16687482158150158, + "grad_norm": 1.9020450115203857, + "learning_rate": 4.664269005149129e-05, + "loss": 5.3975, + "step": 28059 + }, + { + "epoch": 0.16688076886478256, + "grad_norm": 2.0544557571411133, + "learning_rate": 4.664245624152007e-05, + "loss": 5.3485, + "step": 28060 + }, + { + "epoch": 0.16688671614806355, + "grad_norm": 1.8861839771270752, + "learning_rate": 4.664222242399371e-05, + "loss": 5.3917, + "step": 28061 + }, + { + "epoch": 0.16689266343134457, + "grad_norm": 1.7115676403045654, + "learning_rate": 4.6641988598912256e-05, + "loss": 5.3101, + "step": 28062 + }, + { + "epoch": 0.16689861071462556, + "grad_norm": 2.6457252502441406, + "learning_rate": 4.6641754766275815e-05, + "loss": 4.8323, + "step": 28063 + }, + { + "epoch": 0.16690455799790654, + "grad_norm": 1.9158306121826172, + "learning_rate": 4.664152092608446e-05, + "loss": 4.902, + "step": 28064 + }, + { + "epoch": 0.16691050528118756, + "grad_norm": 1.5592490434646606, + "learning_rate": 4.664128707833828e-05, + "loss": 4.85, + "step": 28065 + }, + { + "epoch": 0.16691645256446855, + "grad_norm": 1.8784046173095703, + "learning_rate": 4.664105322303734e-05, + "loss": 4.9118, + "step": 28066 + }, + { + "epoch": 0.16692239984774954, + "grad_norm": 1.8043493032455444, + "learning_rate": 4.6640819360181734e-05, + "loss": 4.8248, + "step": 28067 + }, + { + "epoch": 0.16692834713103055, + "grad_norm": 1.925399661064148, + "learning_rate": 4.664058548977155e-05, + "loss": 4.8808, + "step": 28068 + }, + { + "epoch": 0.16693429441431154, + "grad_norm": 2.1420938968658447, + "learning_rate": 4.664035161180686e-05, + "loss": 4.5251, + "step": 28069 + }, + { + "epoch": 0.16694024169759253, + "grad_norm": 1.3386578559875488, + "learning_rate": 4.664011772628774e-05, + "loss": 5.0788, + "step": 28070 + }, + { + "epoch": 0.16694618898087354, + "grad_norm": 1.7500650882720947, + "learning_rate": 4.663988383321427e-05, + "loss": 4.6332, + "step": 28071 + }, + { + "epoch": 0.16695213626415453, + "grad_norm": 1.6339102983474731, + "learning_rate": 4.6639649932586555e-05, + "loss": 4.9342, + "step": 28072 + }, + { + "epoch": 0.16695808354743552, + "grad_norm": 1.634045124053955, + "learning_rate": 4.6639416024404655e-05, + "loss": 4.8166, + "step": 28073 + }, + { + "epoch": 0.16696403083071654, + "grad_norm": 1.6168557405471802, + "learning_rate": 4.663918210866866e-05, + "loss": 4.9086, + "step": 28074 + }, + { + "epoch": 0.16696997811399752, + "grad_norm": 1.7027981281280518, + "learning_rate": 4.663894818537865e-05, + "loss": 4.7404, + "step": 28075 + }, + { + "epoch": 0.1669759253972785, + "grad_norm": 1.621127724647522, + "learning_rate": 4.663871425453471e-05, + "loss": 4.8458, + "step": 28076 + }, + { + "epoch": 0.16698187268055953, + "grad_norm": 1.524674415588379, + "learning_rate": 4.663848031613691e-05, + "loss": 4.8977, + "step": 28077 + }, + { + "epoch": 0.16698781996384052, + "grad_norm": 1.3619705438613892, + "learning_rate": 4.663824637018535e-05, + "loss": 4.6809, + "step": 28078 + }, + { + "epoch": 0.1669937672471215, + "grad_norm": 1.6202057600021362, + "learning_rate": 4.66380124166801e-05, + "loss": 4.2993, + "step": 28079 + }, + { + "epoch": 0.16699971453040252, + "grad_norm": 1.6400598287582397, + "learning_rate": 4.663777845562124e-05, + "loss": 4.498, + "step": 28080 + }, + { + "epoch": 0.1670056618136835, + "grad_norm": 1.6038521528244019, + "learning_rate": 4.663754448700885e-05, + "loss": 4.5864, + "step": 28081 + }, + { + "epoch": 0.1670116090969645, + "grad_norm": 1.6111528873443604, + "learning_rate": 4.663731051084303e-05, + "loss": 4.77, + "step": 28082 + }, + { + "epoch": 0.1670175563802455, + "grad_norm": 1.7978882789611816, + "learning_rate": 4.663707652712384e-05, + "loss": 4.2634, + "step": 28083 + }, + { + "epoch": 0.1670235036635265, + "grad_norm": 1.8533109426498413, + "learning_rate": 4.6636842535851374e-05, + "loss": 4.6601, + "step": 28084 + }, + { + "epoch": 0.1670294509468075, + "grad_norm": 1.776292324066162, + "learning_rate": 4.663660853702571e-05, + "loss": 4.4957, + "step": 28085 + }, + { + "epoch": 0.1670353982300885, + "grad_norm": 2.15081524848938, + "learning_rate": 4.663637453064692e-05, + "loss": 4.2726, + "step": 28086 + }, + { + "epoch": 0.1670413455133695, + "grad_norm": 1.4586591720581055, + "learning_rate": 4.6636140516715104e-05, + "loss": 5.4757, + "step": 28087 + }, + { + "epoch": 0.16704729279665048, + "grad_norm": 1.4819058179855347, + "learning_rate": 4.663590649523033e-05, + "loss": 5.6895, + "step": 28088 + }, + { + "epoch": 0.1670532400799315, + "grad_norm": 1.8194465637207031, + "learning_rate": 4.663567246619269e-05, + "loss": 4.5697, + "step": 28089 + }, + { + "epoch": 0.16705918736321249, + "grad_norm": 1.8187286853790283, + "learning_rate": 4.663543842960226e-05, + "loss": 4.4745, + "step": 28090 + }, + { + "epoch": 0.16706513464649347, + "grad_norm": 1.7815576791763306, + "learning_rate": 4.663520438545912e-05, + "loss": 4.7309, + "step": 28091 + }, + { + "epoch": 0.1670710819297745, + "grad_norm": 1.9799631834030151, + "learning_rate": 4.663497033376335e-05, + "loss": 4.5429, + "step": 28092 + }, + { + "epoch": 0.16707702921305548, + "grad_norm": 1.7019764184951782, + "learning_rate": 4.663473627451504e-05, + "loss": 4.402, + "step": 28093 + }, + { + "epoch": 0.16708297649633647, + "grad_norm": 1.9056285619735718, + "learning_rate": 4.663450220771427e-05, + "loss": 4.3428, + "step": 28094 + }, + { + "epoch": 0.16708892377961748, + "grad_norm": 1.877556562423706, + "learning_rate": 4.663426813336112e-05, + "loss": 4.4579, + "step": 28095 + }, + { + "epoch": 0.16709487106289847, + "grad_norm": 1.6415005922317505, + "learning_rate": 4.663403405145565e-05, + "loss": 5.1392, + "step": 28096 + }, + { + "epoch": 0.16710081834617946, + "grad_norm": 2.0315005779266357, + "learning_rate": 4.663379996199798e-05, + "loss": 4.5666, + "step": 28097 + }, + { + "epoch": 0.16710676562946047, + "grad_norm": 1.744367241859436, + "learning_rate": 4.663356586498817e-05, + "loss": 4.6629, + "step": 28098 + }, + { + "epoch": 0.16711271291274146, + "grad_norm": 1.8645330667495728, + "learning_rate": 4.663333176042631e-05, + "loss": 4.2716, + "step": 28099 + }, + { + "epoch": 0.16711866019602245, + "grad_norm": 1.6384168863296509, + "learning_rate": 4.6633097648312476e-05, + "loss": 4.1565, + "step": 28100 + }, + { + "epoch": 0.16712460747930347, + "grad_norm": 2.0455496311187744, + "learning_rate": 4.663286352864675e-05, + "loss": 4.3342, + "step": 28101 + }, + { + "epoch": 0.16713055476258445, + "grad_norm": 1.689454197883606, + "learning_rate": 4.663262940142921e-05, + "loss": 5.1503, + "step": 28102 + }, + { + "epoch": 0.16713650204586544, + "grad_norm": 1.7138323783874512, + "learning_rate": 4.663239526665995e-05, + "loss": 4.3616, + "step": 28103 + }, + { + "epoch": 0.16714244932914646, + "grad_norm": 2.171147584915161, + "learning_rate": 4.663216112433904e-05, + "loss": 4.3054, + "step": 28104 + }, + { + "epoch": 0.16714839661242745, + "grad_norm": 2.5418312549591064, + "learning_rate": 4.663192697446657e-05, + "loss": 3.387, + "step": 28105 + }, + { + "epoch": 0.16715434389570843, + "grad_norm": 1.5790460109710693, + "learning_rate": 4.6631692817042615e-05, + "loss": 5.2555, + "step": 28106 + }, + { + "epoch": 0.16716029117898945, + "grad_norm": 1.4285277128219604, + "learning_rate": 4.663145865206726e-05, + "loss": 5.2408, + "step": 28107 + }, + { + "epoch": 0.16716623846227044, + "grad_norm": 1.3292522430419922, + "learning_rate": 4.663122447954058e-05, + "loss": 5.1494, + "step": 28108 + }, + { + "epoch": 0.16717218574555143, + "grad_norm": 1.7032718658447266, + "learning_rate": 4.663099029946267e-05, + "loss": 4.7939, + "step": 28109 + }, + { + "epoch": 0.16717813302883244, + "grad_norm": 1.6049028635025024, + "learning_rate": 4.6630756111833605e-05, + "loss": 4.9406, + "step": 28110 + }, + { + "epoch": 0.16718408031211343, + "grad_norm": 1.4805787801742554, + "learning_rate": 4.663052191665347e-05, + "loss": 4.9251, + "step": 28111 + }, + { + "epoch": 0.16719002759539442, + "grad_norm": 1.585306167602539, + "learning_rate": 4.663028771392234e-05, + "loss": 5.3119, + "step": 28112 + }, + { + "epoch": 0.1671959748786754, + "grad_norm": 1.5918222665786743, + "learning_rate": 4.663005350364029e-05, + "loss": 5.1405, + "step": 28113 + }, + { + "epoch": 0.16720192216195642, + "grad_norm": 1.5273454189300537, + "learning_rate": 4.6629819285807426e-05, + "loss": 4.9654, + "step": 28114 + }, + { + "epoch": 0.1672078694452374, + "grad_norm": 2.3424551486968994, + "learning_rate": 4.662958506042381e-05, + "loss": 4.6364, + "step": 28115 + }, + { + "epoch": 0.1672138167285184, + "grad_norm": 1.5244309902191162, + "learning_rate": 4.6629350827489527e-05, + "loss": 5.1469, + "step": 28116 + }, + { + "epoch": 0.16721976401179942, + "grad_norm": 1.6393519639968872, + "learning_rate": 4.662911658700466e-05, + "loss": 5.3803, + "step": 28117 + }, + { + "epoch": 0.1672257112950804, + "grad_norm": 1.6506540775299072, + "learning_rate": 4.662888233896929e-05, + "loss": 5.2188, + "step": 28118 + }, + { + "epoch": 0.1672316585783614, + "grad_norm": 1.481735110282898, + "learning_rate": 4.6628648083383516e-05, + "loss": 5.4692, + "step": 28119 + }, + { + "epoch": 0.1672376058616424, + "grad_norm": 1.5239784717559814, + "learning_rate": 4.662841382024739e-05, + "loss": 5.5937, + "step": 28120 + }, + { + "epoch": 0.1672435531449234, + "grad_norm": 1.7525306940078735, + "learning_rate": 4.662817954956101e-05, + "loss": 5.2913, + "step": 28121 + }, + { + "epoch": 0.16724950042820438, + "grad_norm": 1.5808900594711304, + "learning_rate": 4.662794527132446e-05, + "loss": 4.8341, + "step": 28122 + }, + { + "epoch": 0.1672554477114854, + "grad_norm": 1.7503292560577393, + "learning_rate": 4.662771098553782e-05, + "loss": 4.6066, + "step": 28123 + }, + { + "epoch": 0.1672613949947664, + "grad_norm": 2.0583229064941406, + "learning_rate": 4.662747669220116e-05, + "loss": 4.747, + "step": 28124 + }, + { + "epoch": 0.16726734227804738, + "grad_norm": 1.8209635019302368, + "learning_rate": 4.662724239131458e-05, + "loss": 4.6837, + "step": 28125 + }, + { + "epoch": 0.1672732895613284, + "grad_norm": 1.3161481618881226, + "learning_rate": 4.662700808287815e-05, + "loss": 5.1877, + "step": 28126 + }, + { + "epoch": 0.16727923684460938, + "grad_norm": 1.492100715637207, + "learning_rate": 4.662677376689195e-05, + "loss": 5.0719, + "step": 28127 + }, + { + "epoch": 0.16728518412789037, + "grad_norm": 1.5123339891433716, + "learning_rate": 4.662653944335608e-05, + "loss": 5.2237, + "step": 28128 + }, + { + "epoch": 0.16729113141117138, + "grad_norm": 1.3963336944580078, + "learning_rate": 4.66263051122706e-05, + "loss": 5.5465, + "step": 28129 + }, + { + "epoch": 0.16729707869445237, + "grad_norm": 1.4128196239471436, + "learning_rate": 4.662607077363559e-05, + "loss": 5.4236, + "step": 28130 + }, + { + "epoch": 0.16730302597773336, + "grad_norm": 1.5107556581497192, + "learning_rate": 4.662583642745116e-05, + "loss": 5.411, + "step": 28131 + }, + { + "epoch": 0.16730897326101438, + "grad_norm": 1.4282488822937012, + "learning_rate": 4.662560207371737e-05, + "loss": 5.4301, + "step": 28132 + }, + { + "epoch": 0.16731492054429536, + "grad_norm": 1.7082507610321045, + "learning_rate": 4.6625367712434295e-05, + "loss": 5.2167, + "step": 28133 + }, + { + "epoch": 0.16732086782757635, + "grad_norm": 1.4769392013549805, + "learning_rate": 4.662513334360204e-05, + "loss": 4.8894, + "step": 28134 + }, + { + "epoch": 0.16732681511085737, + "grad_norm": 1.6305506229400635, + "learning_rate": 4.6624898967220664e-05, + "loss": 5.2891, + "step": 28135 + }, + { + "epoch": 0.16733276239413836, + "grad_norm": 1.4358271360397339, + "learning_rate": 4.662466458329027e-05, + "loss": 5.4362, + "step": 28136 + }, + { + "epoch": 0.16733870967741934, + "grad_norm": 1.3945128917694092, + "learning_rate": 4.662443019181092e-05, + "loss": 5.4208, + "step": 28137 + }, + { + "epoch": 0.16734465696070036, + "grad_norm": 1.3432549238204956, + "learning_rate": 4.662419579278271e-05, + "loss": 5.4326, + "step": 28138 + }, + { + "epoch": 0.16735060424398135, + "grad_norm": 1.3106540441513062, + "learning_rate": 4.662396138620571e-05, + "loss": 5.554, + "step": 28139 + }, + { + "epoch": 0.16735655152726234, + "grad_norm": 1.449013590812683, + "learning_rate": 4.662372697208002e-05, + "loss": 5.3896, + "step": 28140 + }, + { + "epoch": 0.16736249881054335, + "grad_norm": 1.2621738910675049, + "learning_rate": 4.66234925504057e-05, + "loss": 5.5235, + "step": 28141 + }, + { + "epoch": 0.16736844609382434, + "grad_norm": 1.5813289880752563, + "learning_rate": 4.6623258121182845e-05, + "loss": 5.4607, + "step": 28142 + }, + { + "epoch": 0.16737439337710533, + "grad_norm": 1.4719443321228027, + "learning_rate": 4.662302368441154e-05, + "loss": 5.2416, + "step": 28143 + }, + { + "epoch": 0.16738034066038635, + "grad_norm": 1.3261717557907104, + "learning_rate": 4.662278924009185e-05, + "loss": 5.2426, + "step": 28144 + }, + { + "epoch": 0.16738628794366733, + "grad_norm": 1.409119725227356, + "learning_rate": 4.6622554788223874e-05, + "loss": 4.8306, + "step": 28145 + }, + { + "epoch": 0.16739223522694832, + "grad_norm": 1.3746771812438965, + "learning_rate": 4.662232032880769e-05, + "loss": 5.3939, + "step": 28146 + }, + { + "epoch": 0.16739818251022934, + "grad_norm": 1.5453044176101685, + "learning_rate": 4.662208586184337e-05, + "loss": 5.2989, + "step": 28147 + }, + { + "epoch": 0.16740412979351033, + "grad_norm": 2.140986919403076, + "learning_rate": 4.6621851387331003e-05, + "loss": 4.7526, + "step": 28148 + }, + { + "epoch": 0.1674100770767913, + "grad_norm": 1.305344820022583, + "learning_rate": 4.662161690527068e-05, + "loss": 5.3339, + "step": 28149 + }, + { + "epoch": 0.16741602436007233, + "grad_norm": 1.200656533241272, + "learning_rate": 4.662138241566247e-05, + "loss": 5.2464, + "step": 28150 + }, + { + "epoch": 0.16742197164335332, + "grad_norm": 1.2441010475158691, + "learning_rate": 4.6621147918506457e-05, + "loss": 5.4545, + "step": 28151 + }, + { + "epoch": 0.1674279189266343, + "grad_norm": 1.6146814823150635, + "learning_rate": 4.662091341380272e-05, + "loss": 4.9968, + "step": 28152 + }, + { + "epoch": 0.16743386620991532, + "grad_norm": 1.2502530813217163, + "learning_rate": 4.6620678901551354e-05, + "loss": 5.3297, + "step": 28153 + }, + { + "epoch": 0.1674398134931963, + "grad_norm": 1.5260026454925537, + "learning_rate": 4.662044438175243e-05, + "loss": 5.2643, + "step": 28154 + }, + { + "epoch": 0.1674457607764773, + "grad_norm": 1.2725012302398682, + "learning_rate": 4.662020985440603e-05, + "loss": 5.4469, + "step": 28155 + }, + { + "epoch": 0.16745170805975831, + "grad_norm": 1.717331051826477, + "learning_rate": 4.661997531951224e-05, + "loss": 5.2711, + "step": 28156 + }, + { + "epoch": 0.1674576553430393, + "grad_norm": 1.6104686260223389, + "learning_rate": 4.661974077707114e-05, + "loss": 5.0773, + "step": 28157 + }, + { + "epoch": 0.1674636026263203, + "grad_norm": 1.568558692932129, + "learning_rate": 4.661950622708281e-05, + "loss": 4.4339, + "step": 28158 + }, + { + "epoch": 0.1674695499096013, + "grad_norm": 1.5101975202560425, + "learning_rate": 4.661927166954734e-05, + "loss": 3.9035, + "step": 28159 + }, + { + "epoch": 0.1674754971928823, + "grad_norm": 1.6529417037963867, + "learning_rate": 4.66190371044648e-05, + "loss": 3.8917, + "step": 28160 + }, + { + "epoch": 0.16748144447616328, + "grad_norm": 1.2637635469436646, + "learning_rate": 4.6618802531835285e-05, + "loss": 5.2091, + "step": 28161 + }, + { + "epoch": 0.1674873917594443, + "grad_norm": 1.4303425550460815, + "learning_rate": 4.661856795165886e-05, + "loss": 5.368, + "step": 28162 + }, + { + "epoch": 0.1674933390427253, + "grad_norm": 1.8119208812713623, + "learning_rate": 4.661833336393562e-05, + "loss": 4.257, + "step": 28163 + }, + { + "epoch": 0.16749928632600627, + "grad_norm": 2.0059077739715576, + "learning_rate": 4.661809876866564e-05, + "loss": 4.225, + "step": 28164 + }, + { + "epoch": 0.1675052336092873, + "grad_norm": 1.87846839427948, + "learning_rate": 4.6617864165849005e-05, + "loss": 4.182, + "step": 28165 + }, + { + "epoch": 0.16751118089256828, + "grad_norm": 1.5655750036239624, + "learning_rate": 4.66176295554858e-05, + "loss": 5.441, + "step": 28166 + }, + { + "epoch": 0.16751712817584927, + "grad_norm": 1.735921025276184, + "learning_rate": 4.661739493757611e-05, + "loss": 5.1415, + "step": 28167 + }, + { + "epoch": 0.16752307545913028, + "grad_norm": 1.6819477081298828, + "learning_rate": 4.661716031212e-05, + "loss": 5.2213, + "step": 28168 + }, + { + "epoch": 0.16752902274241127, + "grad_norm": 1.5038045644760132, + "learning_rate": 4.661692567911756e-05, + "loss": 4.3357, + "step": 28169 + }, + { + "epoch": 0.16753497002569226, + "grad_norm": 1.8683745861053467, + "learning_rate": 4.6616691038568885e-05, + "loss": 4.5498, + "step": 28170 + }, + { + "epoch": 0.16754091730897325, + "grad_norm": 1.6156747341156006, + "learning_rate": 4.661645639047405e-05, + "loss": 4.7422, + "step": 28171 + }, + { + "epoch": 0.16754686459225426, + "grad_norm": 1.8638094663619995, + "learning_rate": 4.661622173483312e-05, + "loss": 4.4363, + "step": 28172 + }, + { + "epoch": 0.16755281187553525, + "grad_norm": 1.800417184829712, + "learning_rate": 4.6615987071646194e-05, + "loss": 4.355, + "step": 28173 + }, + { + "epoch": 0.16755875915881624, + "grad_norm": 1.765234351158142, + "learning_rate": 4.661575240091336e-05, + "loss": 4.3521, + "step": 28174 + }, + { + "epoch": 0.16756470644209726, + "grad_norm": 1.7296849489212036, + "learning_rate": 4.661551772263468e-05, + "loss": 4.8884, + "step": 28175 + }, + { + "epoch": 0.16757065372537824, + "grad_norm": 1.609222650527954, + "learning_rate": 4.661528303681025e-05, + "loss": 4.6088, + "step": 28176 + }, + { + "epoch": 0.16757660100865923, + "grad_norm": 1.910651445388794, + "learning_rate": 4.6615048343440145e-05, + "loss": 4.3531, + "step": 28177 + }, + { + "epoch": 0.16758254829194025, + "grad_norm": 1.6934939622879028, + "learning_rate": 4.6614813642524454e-05, + "loss": 4.1895, + "step": 28178 + }, + { + "epoch": 0.16758849557522124, + "grad_norm": 1.630308985710144, + "learning_rate": 4.6614578934063244e-05, + "loss": 4.5883, + "step": 28179 + }, + { + "epoch": 0.16759444285850222, + "grad_norm": 1.4629896879196167, + "learning_rate": 4.6614344218056624e-05, + "loss": 4.4655, + "step": 28180 + }, + { + "epoch": 0.16760039014178324, + "grad_norm": 1.522980809211731, + "learning_rate": 4.6614109494504654e-05, + "loss": 5.3745, + "step": 28181 + }, + { + "epoch": 0.16760633742506423, + "grad_norm": 1.3758256435394287, + "learning_rate": 4.661387476340742e-05, + "loss": 5.4374, + "step": 28182 + }, + { + "epoch": 0.16761228470834522, + "grad_norm": 1.4767520427703857, + "learning_rate": 4.661364002476501e-05, + "loss": 5.4039, + "step": 28183 + }, + { + "epoch": 0.16761823199162623, + "grad_norm": 1.3167197704315186, + "learning_rate": 4.661340527857749e-05, + "loss": 5.3886, + "step": 28184 + }, + { + "epoch": 0.16762417927490722, + "grad_norm": 1.8137489557266235, + "learning_rate": 4.661317052484496e-05, + "loss": 4.6928, + "step": 28185 + }, + { + "epoch": 0.1676301265581882, + "grad_norm": 1.7553741931915283, + "learning_rate": 4.66129357635675e-05, + "loss": 5.0159, + "step": 28186 + }, + { + "epoch": 0.16763607384146922, + "grad_norm": 1.341352939605713, + "learning_rate": 4.661270099474518e-05, + "loss": 5.4529, + "step": 28187 + }, + { + "epoch": 0.1676420211247502, + "grad_norm": 1.553514003753662, + "learning_rate": 4.661246621837809e-05, + "loss": 5.1907, + "step": 28188 + }, + { + "epoch": 0.1676479684080312, + "grad_norm": 1.4974607229232788, + "learning_rate": 4.661223143446631e-05, + "loss": 5.2143, + "step": 28189 + }, + { + "epoch": 0.16765391569131222, + "grad_norm": 1.5769060850143433, + "learning_rate": 4.661199664300993e-05, + "loss": 5.1265, + "step": 28190 + }, + { + "epoch": 0.1676598629745932, + "grad_norm": 1.4753527641296387, + "learning_rate": 4.6611761844009026e-05, + "loss": 4.974, + "step": 28191 + }, + { + "epoch": 0.1676658102578742, + "grad_norm": 1.5406947135925293, + "learning_rate": 4.661152703746368e-05, + "loss": 4.8269, + "step": 28192 + }, + { + "epoch": 0.1676717575411552, + "grad_norm": 1.864577054977417, + "learning_rate": 4.661129222337397e-05, + "loss": 4.505, + "step": 28193 + }, + { + "epoch": 0.1676777048244362, + "grad_norm": 1.561606526374817, + "learning_rate": 4.6611057401739976e-05, + "loss": 4.6992, + "step": 28194 + }, + { + "epoch": 0.16768365210771718, + "grad_norm": 1.6339094638824463, + "learning_rate": 4.661082257256179e-05, + "loss": 4.8973, + "step": 28195 + }, + { + "epoch": 0.1676895993909982, + "grad_norm": 1.8106483221054077, + "learning_rate": 4.661058773583949e-05, + "loss": 4.5909, + "step": 28196 + }, + { + "epoch": 0.1676955466742792, + "grad_norm": 1.6181379556655884, + "learning_rate": 4.661035289157316e-05, + "loss": 5.225, + "step": 28197 + }, + { + "epoch": 0.16770149395756018, + "grad_norm": 1.8745672702789307, + "learning_rate": 4.6610118039762876e-05, + "loss": 4.6381, + "step": 28198 + }, + { + "epoch": 0.1677074412408412, + "grad_norm": 1.6809148788452148, + "learning_rate": 4.6609883180408717e-05, + "loss": 4.879, + "step": 28199 + }, + { + "epoch": 0.16771338852412218, + "grad_norm": 1.6960088014602661, + "learning_rate": 4.660964831351078e-05, + "loss": 4.8171, + "step": 28200 + }, + { + "epoch": 0.16771933580740317, + "grad_norm": 1.8078324794769287, + "learning_rate": 4.660941343906913e-05, + "loss": 4.4722, + "step": 28201 + }, + { + "epoch": 0.16772528309068419, + "grad_norm": 1.6765756607055664, + "learning_rate": 4.660917855708386e-05, + "loss": 4.3086, + "step": 28202 + }, + { + "epoch": 0.16773123037396517, + "grad_norm": 1.608927845954895, + "learning_rate": 4.660894366755505e-05, + "loss": 4.5967, + "step": 28203 + }, + { + "epoch": 0.16773717765724616, + "grad_norm": 2.0235023498535156, + "learning_rate": 4.660870877048278e-05, + "loss": 4.4936, + "step": 28204 + }, + { + "epoch": 0.16774312494052718, + "grad_norm": 1.6895809173583984, + "learning_rate": 4.660847386586713e-05, + "loss": 4.9949, + "step": 28205 + }, + { + "epoch": 0.16774907222380817, + "grad_norm": 1.6481704711914062, + "learning_rate": 4.660823895370819e-05, + "loss": 5.2061, + "step": 28206 + }, + { + "epoch": 0.16775501950708915, + "grad_norm": 1.5078449249267578, + "learning_rate": 4.660800403400604e-05, + "loss": 5.0231, + "step": 28207 + }, + { + "epoch": 0.16776096679037017, + "grad_norm": 1.6977524757385254, + "learning_rate": 4.660776910676076e-05, + "loss": 4.9922, + "step": 28208 + }, + { + "epoch": 0.16776691407365116, + "grad_norm": 1.826011300086975, + "learning_rate": 4.6607534171972425e-05, + "loss": 4.2673, + "step": 28209 + }, + { + "epoch": 0.16777286135693215, + "grad_norm": 2.544302463531494, + "learning_rate": 4.660729922964112e-05, + "loss": 4.3124, + "step": 28210 + }, + { + "epoch": 0.16777880864021316, + "grad_norm": 1.7719815969467163, + "learning_rate": 4.660706427976693e-05, + "loss": 4.249, + "step": 28211 + }, + { + "epoch": 0.16778475592349415, + "grad_norm": 1.6741911172866821, + "learning_rate": 4.660682932234994e-05, + "loss": 4.3522, + "step": 28212 + }, + { + "epoch": 0.16779070320677514, + "grad_norm": 1.6827515363693237, + "learning_rate": 4.660659435739023e-05, + "loss": 4.3316, + "step": 28213 + }, + { + "epoch": 0.16779665049005615, + "grad_norm": 1.722598671913147, + "learning_rate": 4.6606359384887884e-05, + "loss": 4.3367, + "step": 28214 + }, + { + "epoch": 0.16780259777333714, + "grad_norm": 1.7667568922042847, + "learning_rate": 4.660612440484298e-05, + "loss": 4.2754, + "step": 28215 + }, + { + "epoch": 0.16780854505661813, + "grad_norm": 1.7074247598648071, + "learning_rate": 4.6605889417255596e-05, + "loss": 4.2489, + "step": 28216 + }, + { + "epoch": 0.16781449233989915, + "grad_norm": 1.8784146308898926, + "learning_rate": 4.6605654422125836e-05, + "loss": 4.4672, + "step": 28217 + }, + { + "epoch": 0.16782043962318013, + "grad_norm": 1.909641981124878, + "learning_rate": 4.660541941945374e-05, + "loss": 4.5413, + "step": 28218 + }, + { + "epoch": 0.16782638690646112, + "grad_norm": 1.4848551750183105, + "learning_rate": 4.660518440923943e-05, + "loss": 4.6922, + "step": 28219 + }, + { + "epoch": 0.16783233418974214, + "grad_norm": 1.5976632833480835, + "learning_rate": 4.6604949391482974e-05, + "loss": 4.7525, + "step": 28220 + }, + { + "epoch": 0.16783828147302313, + "grad_norm": 1.609236478805542, + "learning_rate": 4.6604714366184455e-05, + "loss": 5.1537, + "step": 28221 + }, + { + "epoch": 0.16784422875630411, + "grad_norm": 1.4178111553192139, + "learning_rate": 4.660447933334394e-05, + "loss": 5.0935, + "step": 28222 + }, + { + "epoch": 0.16785017603958513, + "grad_norm": 1.7521015405654907, + "learning_rate": 4.660424429296154e-05, + "loss": 4.6712, + "step": 28223 + }, + { + "epoch": 0.16785612332286612, + "grad_norm": 1.8282933235168457, + "learning_rate": 4.660400924503731e-05, + "loss": 5.8207, + "step": 28224 + }, + { + "epoch": 0.1678620706061471, + "grad_norm": 1.5437854528427124, + "learning_rate": 4.6603774189571345e-05, + "loss": 5.751, + "step": 28225 + }, + { + "epoch": 0.16786801788942812, + "grad_norm": 1.723281979560852, + "learning_rate": 4.660353912656373e-05, + "loss": 4.6481, + "step": 28226 + }, + { + "epoch": 0.1678739651727091, + "grad_norm": 1.718805193901062, + "learning_rate": 4.6603304056014545e-05, + "loss": 5.4971, + "step": 28227 + }, + { + "epoch": 0.1678799124559901, + "grad_norm": 1.6174219846725464, + "learning_rate": 4.660306897792387e-05, + "loss": 4.475, + "step": 28228 + }, + { + "epoch": 0.1678858597392711, + "grad_norm": 1.8539583683013916, + "learning_rate": 4.660283389229178e-05, + "loss": 4.3182, + "step": 28229 + }, + { + "epoch": 0.1678918070225521, + "grad_norm": 1.6682637929916382, + "learning_rate": 4.660259879911837e-05, + "loss": 4.5625, + "step": 28230 + }, + { + "epoch": 0.1678977543058331, + "grad_norm": 1.825737714767456, + "learning_rate": 4.660236369840371e-05, + "loss": 4.1975, + "step": 28231 + }, + { + "epoch": 0.16790370158911408, + "grad_norm": 1.6130248308181763, + "learning_rate": 4.6602128590147894e-05, + "loss": 5.6634, + "step": 28232 + }, + { + "epoch": 0.1679096488723951, + "grad_norm": 1.6243139505386353, + "learning_rate": 4.660189347435099e-05, + "loss": 4.972, + "step": 28233 + }, + { + "epoch": 0.16791559615567608, + "grad_norm": 1.5760700702667236, + "learning_rate": 4.66016583510131e-05, + "loss": 4.7272, + "step": 28234 + }, + { + "epoch": 0.16792154343895707, + "grad_norm": 1.2500736713409424, + "learning_rate": 4.660142322013429e-05, + "loss": 4.469, + "step": 28235 + }, + { + "epoch": 0.1679274907222381, + "grad_norm": 1.3888235092163086, + "learning_rate": 4.660118808171464e-05, + "loss": 5.3952, + "step": 28236 + }, + { + "epoch": 0.16793343800551908, + "grad_norm": 1.3789753913879395, + "learning_rate": 4.660095293575424e-05, + "loss": 5.8424, + "step": 28237 + }, + { + "epoch": 0.16793938528880006, + "grad_norm": 1.1890273094177246, + "learning_rate": 4.660071778225317e-05, + "loss": 5.9341, + "step": 28238 + }, + { + "epoch": 0.16794533257208108, + "grad_norm": 1.3315849304199219, + "learning_rate": 4.660048262121152e-05, + "loss": 5.9202, + "step": 28239 + }, + { + "epoch": 0.16795127985536207, + "grad_norm": 1.5866754055023193, + "learning_rate": 4.6600247452629365e-05, + "loss": 5.1867, + "step": 28240 + }, + { + "epoch": 0.16795722713864306, + "grad_norm": 1.842445969581604, + "learning_rate": 4.660001227650678e-05, + "loss": 4.4602, + "step": 28241 + }, + { + "epoch": 0.16796317442192407, + "grad_norm": 1.7466117143630981, + "learning_rate": 4.6599777092843855e-05, + "loss": 4.696, + "step": 28242 + }, + { + "epoch": 0.16796912170520506, + "grad_norm": 1.5599199533462524, + "learning_rate": 4.6599541901640665e-05, + "loss": 4.5027, + "step": 28243 + }, + { + "epoch": 0.16797506898848605, + "grad_norm": 1.3156886100769043, + "learning_rate": 4.6599306702897304e-05, + "loss": 4.2991, + "step": 28244 + }, + { + "epoch": 0.16798101627176706, + "grad_norm": 1.372679352760315, + "learning_rate": 4.659907149661386e-05, + "loss": 4.6257, + "step": 28245 + }, + { + "epoch": 0.16798696355504805, + "grad_norm": 1.599493384361267, + "learning_rate": 4.659883628279039e-05, + "loss": 4.4781, + "step": 28246 + }, + { + "epoch": 0.16799291083832904, + "grad_norm": 1.516619324684143, + "learning_rate": 4.6598601061426986e-05, + "loss": 4.4817, + "step": 28247 + }, + { + "epoch": 0.16799885812161006, + "grad_norm": 1.6319454908370972, + "learning_rate": 4.6598365832523736e-05, + "loss": 4.4314, + "step": 28248 + }, + { + "epoch": 0.16800480540489104, + "grad_norm": 1.5013442039489746, + "learning_rate": 4.6598130596080726e-05, + "loss": 4.3608, + "step": 28249 + }, + { + "epoch": 0.16801075268817203, + "grad_norm": 1.5573625564575195, + "learning_rate": 4.659789535209803e-05, + "loss": 4.38, + "step": 28250 + }, + { + "epoch": 0.16801669997145305, + "grad_norm": 1.5244330167770386, + "learning_rate": 4.659766010057574e-05, + "loss": 4.4152, + "step": 28251 + }, + { + "epoch": 0.16802264725473404, + "grad_norm": 2.792175054550171, + "learning_rate": 4.659742484151391e-05, + "loss": 3.7226, + "step": 28252 + }, + { + "epoch": 0.16802859453801502, + "grad_norm": 2.0370240211486816, + "learning_rate": 4.6597189574912654e-05, + "loss": 4.1552, + "step": 28253 + }, + { + "epoch": 0.16803454182129604, + "grad_norm": 1.6263444423675537, + "learning_rate": 4.6596954300772044e-05, + "loss": 4.7215, + "step": 28254 + }, + { + "epoch": 0.16804048910457703, + "grad_norm": 1.6130170822143555, + "learning_rate": 4.659671901909215e-05, + "loss": 4.5078, + "step": 28255 + }, + { + "epoch": 0.16804643638785802, + "grad_norm": 1.3925176858901978, + "learning_rate": 4.659648372987308e-05, + "loss": 4.6085, + "step": 28256 + }, + { + "epoch": 0.16805238367113903, + "grad_norm": 1.4680298566818237, + "learning_rate": 4.6596248433114886e-05, + "loss": 4.4605, + "step": 28257 + }, + { + "epoch": 0.16805833095442002, + "grad_norm": 1.9639580249786377, + "learning_rate": 4.659601312881767e-05, + "loss": 4.1688, + "step": 28258 + }, + { + "epoch": 0.168064278237701, + "grad_norm": 1.7880107164382935, + "learning_rate": 4.6595777816981515e-05, + "loss": 4.3835, + "step": 28259 + }, + { + "epoch": 0.16807022552098203, + "grad_norm": 1.8420106172561646, + "learning_rate": 4.659554249760649e-05, + "loss": 4.4068, + "step": 28260 + }, + { + "epoch": 0.168076172804263, + "grad_norm": 1.7331891059875488, + "learning_rate": 4.659530717069269e-05, + "loss": 4.2069, + "step": 28261 + }, + { + "epoch": 0.168082120087544, + "grad_norm": 1.6757560968399048, + "learning_rate": 4.659507183624019e-05, + "loss": 4.7915, + "step": 28262 + }, + { + "epoch": 0.16808806737082502, + "grad_norm": 1.6277943849563599, + "learning_rate": 4.6594836494249066e-05, + "loss": 4.431, + "step": 28263 + }, + { + "epoch": 0.168094014654106, + "grad_norm": 1.9865028858184814, + "learning_rate": 4.6594601144719406e-05, + "loss": 4.8244, + "step": 28264 + }, + { + "epoch": 0.168099961937387, + "grad_norm": 1.818390130996704, + "learning_rate": 4.659436578765131e-05, + "loss": 4.7089, + "step": 28265 + }, + { + "epoch": 0.168105909220668, + "grad_norm": 1.3201099634170532, + "learning_rate": 4.6594130423044836e-05, + "loss": 4.8117, + "step": 28266 + }, + { + "epoch": 0.168111856503949, + "grad_norm": 1.7755099534988403, + "learning_rate": 4.6593895050900074e-05, + "loss": 4.4389, + "step": 28267 + }, + { + "epoch": 0.16811780378722999, + "grad_norm": 1.6653193235397339, + "learning_rate": 4.65936596712171e-05, + "loss": 4.3489, + "step": 28268 + }, + { + "epoch": 0.168123751070511, + "grad_norm": 1.4699918031692505, + "learning_rate": 4.6593424283996004e-05, + "loss": 4.935, + "step": 28269 + }, + { + "epoch": 0.168129698353792, + "grad_norm": 1.8290356397628784, + "learning_rate": 4.659318888923687e-05, + "loss": 5.1348, + "step": 28270 + }, + { + "epoch": 0.16813564563707298, + "grad_norm": 1.7782410383224487, + "learning_rate": 4.6592953486939784e-05, + "loss": 5.1601, + "step": 28271 + }, + { + "epoch": 0.168141592920354, + "grad_norm": 1.8384326696395874, + "learning_rate": 4.6592718077104814e-05, + "loss": 4.7923, + "step": 28272 + }, + { + "epoch": 0.16814754020363498, + "grad_norm": 1.6723445653915405, + "learning_rate": 4.659248265973205e-05, + "loss": 4.9497, + "step": 28273 + }, + { + "epoch": 0.16815348748691597, + "grad_norm": 1.4820493459701538, + "learning_rate": 4.6592247234821575e-05, + "loss": 4.3104, + "step": 28274 + }, + { + "epoch": 0.16815943477019699, + "grad_norm": 1.4215086698532104, + "learning_rate": 4.659201180237346e-05, + "loss": 4.5723, + "step": 28275 + }, + { + "epoch": 0.16816538205347797, + "grad_norm": 1.6446219682693481, + "learning_rate": 4.6591776362387804e-05, + "loss": 4.6208, + "step": 28276 + }, + { + "epoch": 0.16817132933675896, + "grad_norm": 1.6352293491363525, + "learning_rate": 4.6591540914864686e-05, + "loss": 5.03, + "step": 28277 + }, + { + "epoch": 0.16817727662003998, + "grad_norm": 1.59463369846344, + "learning_rate": 4.659130545980418e-05, + "loss": 4.5116, + "step": 28278 + }, + { + "epoch": 0.16818322390332097, + "grad_norm": 1.8565449714660645, + "learning_rate": 4.659106999720637e-05, + "loss": 4.4572, + "step": 28279 + }, + { + "epoch": 0.16818917118660195, + "grad_norm": 1.7354021072387695, + "learning_rate": 4.659083452707135e-05, + "loss": 4.9343, + "step": 28280 + }, + { + "epoch": 0.16819511846988297, + "grad_norm": 1.8169907331466675, + "learning_rate": 4.659059904939918e-05, + "loss": 4.6285, + "step": 28281 + }, + { + "epoch": 0.16820106575316396, + "grad_norm": 1.6343300342559814, + "learning_rate": 4.659036356418996e-05, + "loss": 4.6125, + "step": 28282 + }, + { + "epoch": 0.16820701303644495, + "grad_norm": 1.5487629175186157, + "learning_rate": 4.659012807144377e-05, + "loss": 4.5907, + "step": 28283 + }, + { + "epoch": 0.16821296031972596, + "grad_norm": 1.4640655517578125, + "learning_rate": 4.658989257116069e-05, + "loss": 4.4199, + "step": 28284 + }, + { + "epoch": 0.16821890760300695, + "grad_norm": 1.4370266199111938, + "learning_rate": 4.65896570633408e-05, + "loss": 4.5677, + "step": 28285 + }, + { + "epoch": 0.16822485488628794, + "grad_norm": 1.6564301252365112, + "learning_rate": 4.658942154798418e-05, + "loss": 4.5189, + "step": 28286 + }, + { + "epoch": 0.16823080216956893, + "grad_norm": 1.6301320791244507, + "learning_rate": 4.658918602509091e-05, + "loss": 4.9653, + "step": 28287 + }, + { + "epoch": 0.16823674945284994, + "grad_norm": 1.5462539196014404, + "learning_rate": 4.6588950494661096e-05, + "loss": 5.011, + "step": 28288 + }, + { + "epoch": 0.16824269673613093, + "grad_norm": 1.7004579305648804, + "learning_rate": 4.658871495669479e-05, + "loss": 4.7863, + "step": 28289 + }, + { + "epoch": 0.16824864401941192, + "grad_norm": 1.47449791431427, + "learning_rate": 4.658847941119209e-05, + "loss": 4.8344, + "step": 28290 + }, + { + "epoch": 0.16825459130269294, + "grad_norm": 1.7310223579406738, + "learning_rate": 4.658824385815308e-05, + "loss": 4.5996, + "step": 28291 + }, + { + "epoch": 0.16826053858597392, + "grad_norm": 1.5716323852539062, + "learning_rate": 4.658800829757782e-05, + "loss": 4.6623, + "step": 28292 + }, + { + "epoch": 0.1682664858692549, + "grad_norm": 1.8458023071289062, + "learning_rate": 4.6587772729466426e-05, + "loss": 4.8966, + "step": 28293 + }, + { + "epoch": 0.16827243315253593, + "grad_norm": 1.4939119815826416, + "learning_rate": 4.658753715381896e-05, + "loss": 4.9607, + "step": 28294 + }, + { + "epoch": 0.16827838043581692, + "grad_norm": 1.6060224771499634, + "learning_rate": 4.658730157063551e-05, + "loss": 4.9144, + "step": 28295 + }, + { + "epoch": 0.1682843277190979, + "grad_norm": 1.6743205785751343, + "learning_rate": 4.658706597991615e-05, + "loss": 5.1634, + "step": 28296 + }, + { + "epoch": 0.16829027500237892, + "grad_norm": 1.6277934312820435, + "learning_rate": 4.658683038166097e-05, + "loss": 4.5367, + "step": 28297 + }, + { + "epoch": 0.1682962222856599, + "grad_norm": 2.8272674083709717, + "learning_rate": 4.658659477587005e-05, + "loss": 4.5467, + "step": 28298 + }, + { + "epoch": 0.1683021695689409, + "grad_norm": 2.199181318283081, + "learning_rate": 4.658635916254348e-05, + "loss": 4.595, + "step": 28299 + }, + { + "epoch": 0.1683081168522219, + "grad_norm": 1.860811710357666, + "learning_rate": 4.6586123541681324e-05, + "loss": 4.6934, + "step": 28300 + }, + { + "epoch": 0.1683140641355029, + "grad_norm": 1.5959035158157349, + "learning_rate": 4.6585887913283685e-05, + "loss": 4.5346, + "step": 28301 + }, + { + "epoch": 0.1683200114187839, + "grad_norm": 1.503235936164856, + "learning_rate": 4.658565227735063e-05, + "loss": 4.7135, + "step": 28302 + }, + { + "epoch": 0.1683259587020649, + "grad_norm": 1.5272914171218872, + "learning_rate": 4.658541663388225e-05, + "loss": 4.507, + "step": 28303 + }, + { + "epoch": 0.1683319059853459, + "grad_norm": 1.7282012701034546, + "learning_rate": 4.6585180982878615e-05, + "loss": 4.4787, + "step": 28304 + }, + { + "epoch": 0.16833785326862688, + "grad_norm": 1.6522059440612793, + "learning_rate": 4.6584945324339823e-05, + "loss": 4.5825, + "step": 28305 + }, + { + "epoch": 0.1683438005519079, + "grad_norm": 1.3752492666244507, + "learning_rate": 4.6584709658265955e-05, + "loss": 4.7064, + "step": 28306 + }, + { + "epoch": 0.16834974783518888, + "grad_norm": 2.415187358856201, + "learning_rate": 4.6584473984657086e-05, + "loss": 4.1959, + "step": 28307 + }, + { + "epoch": 0.16835569511846987, + "grad_norm": 1.545029640197754, + "learning_rate": 4.6584238303513295e-05, + "loss": 4.426, + "step": 28308 + }, + { + "epoch": 0.1683616424017509, + "grad_norm": 1.6749895811080933, + "learning_rate": 4.6584002614834666e-05, + "loss": 5.19, + "step": 28309 + }, + { + "epoch": 0.16836758968503188, + "grad_norm": 1.5567103624343872, + "learning_rate": 4.65837669186213e-05, + "loss": 4.854, + "step": 28310 + }, + { + "epoch": 0.16837353696831286, + "grad_norm": 1.2138694524765015, + "learning_rate": 4.658353121487324e-05, + "loss": 4.6035, + "step": 28311 + }, + { + "epoch": 0.16837948425159388, + "grad_norm": 1.4592459201812744, + "learning_rate": 4.658329550359061e-05, + "loss": 4.6315, + "step": 28312 + }, + { + "epoch": 0.16838543153487487, + "grad_norm": 1.5305829048156738, + "learning_rate": 4.658305978477348e-05, + "loss": 4.9041, + "step": 28313 + }, + { + "epoch": 0.16839137881815586, + "grad_norm": 2.0584359169006348, + "learning_rate": 4.658282405842191e-05, + "loss": 3.7849, + "step": 28314 + }, + { + "epoch": 0.16839732610143687, + "grad_norm": 3.1896352767944336, + "learning_rate": 4.658258832453601e-05, + "loss": 3.9083, + "step": 28315 + }, + { + "epoch": 0.16840327338471786, + "grad_norm": 2.942909002304077, + "learning_rate": 4.658235258311584e-05, + "loss": 3.6764, + "step": 28316 + }, + { + "epoch": 0.16840922066799885, + "grad_norm": 3.2764618396759033, + "learning_rate": 4.65821168341615e-05, + "loss": 3.8794, + "step": 28317 + }, + { + "epoch": 0.16841516795127986, + "grad_norm": 2.8366522789001465, + "learning_rate": 4.6581881077673074e-05, + "loss": 4.8133, + "step": 28318 + }, + { + "epoch": 0.16842111523456085, + "grad_norm": 1.551155686378479, + "learning_rate": 4.658164531365063e-05, + "loss": 4.7024, + "step": 28319 + }, + { + "epoch": 0.16842706251784184, + "grad_norm": 2.4063937664031982, + "learning_rate": 4.6581409542094255e-05, + "loss": 3.2516, + "step": 28320 + }, + { + "epoch": 0.16843300980112286, + "grad_norm": 2.5758605003356934, + "learning_rate": 4.658117376300404e-05, + "loss": 3.5301, + "step": 28321 + }, + { + "epoch": 0.16843895708440385, + "grad_norm": 2.643880605697632, + "learning_rate": 4.658093797638005e-05, + "loss": 3.2137, + "step": 28322 + }, + { + "epoch": 0.16844490436768483, + "grad_norm": 2.6048755645751953, + "learning_rate": 4.658070218222238e-05, + "loss": 3.3595, + "step": 28323 + }, + { + "epoch": 0.16845085165096585, + "grad_norm": 2.677281141281128, + "learning_rate": 4.6580466380531116e-05, + "loss": 4.0526, + "step": 28324 + }, + { + "epoch": 0.16845679893424684, + "grad_norm": 2.1559438705444336, + "learning_rate": 4.658023057130633e-05, + "loss": 3.6773, + "step": 28325 + }, + { + "epoch": 0.16846274621752783, + "grad_norm": 2.271451711654663, + "learning_rate": 4.6579994754548105e-05, + "loss": 3.3233, + "step": 28326 + }, + { + "epoch": 0.16846869350080884, + "grad_norm": 2.6819088459014893, + "learning_rate": 4.657975893025653e-05, + "loss": 3.0184, + "step": 28327 + }, + { + "epoch": 0.16847464078408983, + "grad_norm": 2.7791247367858887, + "learning_rate": 4.6579523098431686e-05, + "loss": 3.4093, + "step": 28328 + }, + { + "epoch": 0.16848058806737082, + "grad_norm": 2.7528347969055176, + "learning_rate": 4.6579287259073654e-05, + "loss": 3.0479, + "step": 28329 + }, + { + "epoch": 0.16848653535065183, + "grad_norm": 2.3715124130249023, + "learning_rate": 4.657905141218252e-05, + "loss": 3.7365, + "step": 28330 + }, + { + "epoch": 0.16849248263393282, + "grad_norm": 1.9896430969238281, + "learning_rate": 4.657881555775835e-05, + "loss": 4.6336, + "step": 28331 + }, + { + "epoch": 0.1684984299172138, + "grad_norm": 1.6838959455490112, + "learning_rate": 4.657857969580124e-05, + "loss": 4.8033, + "step": 28332 + }, + { + "epoch": 0.16850437720049483, + "grad_norm": 1.7189829349517822, + "learning_rate": 4.6578343826311274e-05, + "loss": 4.721, + "step": 28333 + }, + { + "epoch": 0.16851032448377581, + "grad_norm": 2.3129501342773438, + "learning_rate": 4.657810794928854e-05, + "loss": 3.626, + "step": 28334 + }, + { + "epoch": 0.1685162717670568, + "grad_norm": 3.216485023498535, + "learning_rate": 4.6577872064733094e-05, + "loss": 3.2259, + "step": 28335 + }, + { + "epoch": 0.16852221905033782, + "grad_norm": 2.995213031768799, + "learning_rate": 4.657763617264506e-05, + "loss": 3.2364, + "step": 28336 + }, + { + "epoch": 0.1685281663336188, + "grad_norm": 2.6219449043273926, + "learning_rate": 4.6577400273024474e-05, + "loss": 4.2354, + "step": 28337 + }, + { + "epoch": 0.1685341136168998, + "grad_norm": 1.6310757398605347, + "learning_rate": 4.657716436587145e-05, + "loss": 5.3334, + "step": 28338 + }, + { + "epoch": 0.1685400609001808, + "grad_norm": 2.375399589538574, + "learning_rate": 4.657692845118605e-05, + "loss": 4.4366, + "step": 28339 + }, + { + "epoch": 0.1685460081834618, + "grad_norm": 1.874076247215271, + "learning_rate": 4.657669252896838e-05, + "loss": 5.2293, + "step": 28340 + }, + { + "epoch": 0.1685519554667428, + "grad_norm": 1.8757516145706177, + "learning_rate": 4.657645659921851e-05, + "loss": 4.6433, + "step": 28341 + }, + { + "epoch": 0.1685579027500238, + "grad_norm": 1.6679904460906982, + "learning_rate": 4.6576220661936514e-05, + "loss": 4.591, + "step": 28342 + }, + { + "epoch": 0.1685638500333048, + "grad_norm": 1.5081669092178345, + "learning_rate": 4.6575984717122487e-05, + "loss": 4.9147, + "step": 28343 + }, + { + "epoch": 0.16856979731658578, + "grad_norm": 1.4801992177963257, + "learning_rate": 4.657574876477651e-05, + "loss": 5.3181, + "step": 28344 + }, + { + "epoch": 0.1685757445998668, + "grad_norm": 1.5100293159484863, + "learning_rate": 4.657551280489865e-05, + "loss": 4.6282, + "step": 28345 + }, + { + "epoch": 0.16858169188314778, + "grad_norm": 1.5850365161895752, + "learning_rate": 4.6575276837489016e-05, + "loss": 4.566, + "step": 28346 + }, + { + "epoch": 0.16858763916642877, + "grad_norm": 1.9910119771957397, + "learning_rate": 4.657504086254766e-05, + "loss": 5.1222, + "step": 28347 + }, + { + "epoch": 0.16859358644970976, + "grad_norm": 1.8456346988677979, + "learning_rate": 4.65748048800747e-05, + "loss": 4.7977, + "step": 28348 + }, + { + "epoch": 0.16859953373299078, + "grad_norm": 2.4570720195770264, + "learning_rate": 4.657456889007018e-05, + "loss": 4.6518, + "step": 28349 + }, + { + "epoch": 0.16860548101627176, + "grad_norm": 2.76509952545166, + "learning_rate": 4.657433289253421e-05, + "loss": 4.2894, + "step": 28350 + }, + { + "epoch": 0.16861142829955275, + "grad_norm": 2.61690616607666, + "learning_rate": 4.657409688746686e-05, + "loss": 4.1016, + "step": 28351 + }, + { + "epoch": 0.16861737558283377, + "grad_norm": 2.678689479827881, + "learning_rate": 4.6573860874868214e-05, + "loss": 4.4325, + "step": 28352 + }, + { + "epoch": 0.16862332286611476, + "grad_norm": 2.1475918292999268, + "learning_rate": 4.657362485473836e-05, + "loss": 4.8043, + "step": 28353 + }, + { + "epoch": 0.16862927014939574, + "grad_norm": 1.7649880647659302, + "learning_rate": 4.657338882707738e-05, + "loss": 5.5315, + "step": 28354 + }, + { + "epoch": 0.16863521743267676, + "grad_norm": 2.451415538787842, + "learning_rate": 4.657315279188534e-05, + "loss": 4.4149, + "step": 28355 + }, + { + "epoch": 0.16864116471595775, + "grad_norm": 2.628056764602661, + "learning_rate": 4.657291674916234e-05, + "loss": 3.9996, + "step": 28356 + }, + { + "epoch": 0.16864711199923874, + "grad_norm": 2.5917954444885254, + "learning_rate": 4.657268069890847e-05, + "loss": 4.1523, + "step": 28357 + }, + { + "epoch": 0.16865305928251975, + "grad_norm": 2.5339810848236084, + "learning_rate": 4.657244464112379e-05, + "loss": 4.1835, + "step": 28358 + }, + { + "epoch": 0.16865900656580074, + "grad_norm": 2.5512847900390625, + "learning_rate": 4.657220857580839e-05, + "loss": 4.2205, + "step": 28359 + }, + { + "epoch": 0.16866495384908173, + "grad_norm": 1.9828633069992065, + "learning_rate": 4.657197250296236e-05, + "loss": 4.5812, + "step": 28360 + }, + { + "epoch": 0.16867090113236274, + "grad_norm": 1.9058914184570312, + "learning_rate": 4.657173642258578e-05, + "loss": 4.9579, + "step": 28361 + }, + { + "epoch": 0.16867684841564373, + "grad_norm": 2.473252534866333, + "learning_rate": 4.657150033467872e-05, + "loss": 4.2123, + "step": 28362 + }, + { + "epoch": 0.16868279569892472, + "grad_norm": 2.2516047954559326, + "learning_rate": 4.657126423924128e-05, + "loss": 4.2096, + "step": 28363 + }, + { + "epoch": 0.16868874298220574, + "grad_norm": 2.4706156253814697, + "learning_rate": 4.657102813627353e-05, + "loss": 4.0615, + "step": 28364 + }, + { + "epoch": 0.16869469026548672, + "grad_norm": 2.5827410221099854, + "learning_rate": 4.657079202577556e-05, + "loss": 4.4003, + "step": 28365 + }, + { + "epoch": 0.1687006375487677, + "grad_norm": 1.812254548072815, + "learning_rate": 4.657055590774745e-05, + "loss": 4.7705, + "step": 28366 + }, + { + "epoch": 0.16870658483204873, + "grad_norm": 1.5623784065246582, + "learning_rate": 4.6570319782189284e-05, + "loss": 5.3618, + "step": 28367 + }, + { + "epoch": 0.16871253211532972, + "grad_norm": 1.9756156206130981, + "learning_rate": 4.657008364910114e-05, + "loss": 5.0061, + "step": 28368 + }, + { + "epoch": 0.1687184793986107, + "grad_norm": 2.592015027999878, + "learning_rate": 4.65698475084831e-05, + "loss": 4.771, + "step": 28369 + }, + { + "epoch": 0.16872442668189172, + "grad_norm": 1.7394741773605347, + "learning_rate": 4.656961136033525e-05, + "loss": 5.4057, + "step": 28370 + }, + { + "epoch": 0.1687303739651727, + "grad_norm": 1.712748646736145, + "learning_rate": 4.656937520465767e-05, + "loss": 5.242, + "step": 28371 + }, + { + "epoch": 0.1687363212484537, + "grad_norm": 1.794945240020752, + "learning_rate": 4.6569139041450446e-05, + "loss": 5.1821, + "step": 28372 + }, + { + "epoch": 0.1687422685317347, + "grad_norm": 1.6122878789901733, + "learning_rate": 4.656890287071366e-05, + "loss": 5.3729, + "step": 28373 + }, + { + "epoch": 0.1687482158150157, + "grad_norm": 1.6189091205596924, + "learning_rate": 4.656866669244739e-05, + "loss": 5.5319, + "step": 28374 + }, + { + "epoch": 0.1687541630982967, + "grad_norm": 1.4604097604751587, + "learning_rate": 4.6568430506651715e-05, + "loss": 5.7885, + "step": 28375 + }, + { + "epoch": 0.1687601103815777, + "grad_norm": 1.4060790538787842, + "learning_rate": 4.656819431332673e-05, + "loss": 5.8022, + "step": 28376 + }, + { + "epoch": 0.1687660576648587, + "grad_norm": 1.4350751638412476, + "learning_rate": 4.6567958112472515e-05, + "loss": 5.8437, + "step": 28377 + }, + { + "epoch": 0.16877200494813968, + "grad_norm": 1.572094202041626, + "learning_rate": 4.656772190408914e-05, + "loss": 5.2559, + "step": 28378 + }, + { + "epoch": 0.1687779522314207, + "grad_norm": 1.5529630184173584, + "learning_rate": 4.656748568817671e-05, + "loss": 5.325, + "step": 28379 + }, + { + "epoch": 0.16878389951470169, + "grad_norm": 1.5496705770492554, + "learning_rate": 4.656724946473528e-05, + "loss": 5.2824, + "step": 28380 + }, + { + "epoch": 0.16878984679798267, + "grad_norm": 1.4349329471588135, + "learning_rate": 4.656701323376496e-05, + "loss": 5.3192, + "step": 28381 + }, + { + "epoch": 0.1687957940812637, + "grad_norm": 1.391747236251831, + "learning_rate": 4.6566776995265804e-05, + "loss": 5.2476, + "step": 28382 + }, + { + "epoch": 0.16880174136454468, + "grad_norm": 1.3532518148422241, + "learning_rate": 4.6566540749237916e-05, + "loss": 5.1795, + "step": 28383 + }, + { + "epoch": 0.16880768864782567, + "grad_norm": 1.4906384944915771, + "learning_rate": 4.656630449568137e-05, + "loss": 5.3211, + "step": 28384 + }, + { + "epoch": 0.16881363593110668, + "grad_norm": 1.560478687286377, + "learning_rate": 4.656606823459625e-05, + "loss": 5.2823, + "step": 28385 + }, + { + "epoch": 0.16881958321438767, + "grad_norm": 1.6834107637405396, + "learning_rate": 4.656583196598264e-05, + "loss": 5.206, + "step": 28386 + }, + { + "epoch": 0.16882553049766866, + "grad_norm": 1.4601906538009644, + "learning_rate": 4.656559568984062e-05, + "loss": 5.2269, + "step": 28387 + }, + { + "epoch": 0.16883147778094967, + "grad_norm": 1.7208976745605469, + "learning_rate": 4.656535940617027e-05, + "loss": 5.3731, + "step": 28388 + }, + { + "epoch": 0.16883742506423066, + "grad_norm": 1.6507620811462402, + "learning_rate": 4.656512311497168e-05, + "loss": 5.544, + "step": 28389 + }, + { + "epoch": 0.16884337234751165, + "grad_norm": 1.7269225120544434, + "learning_rate": 4.6564886816244926e-05, + "loss": 5.5757, + "step": 28390 + }, + { + "epoch": 0.16884931963079267, + "grad_norm": 1.8436660766601562, + "learning_rate": 4.6564650509990096e-05, + "loss": 5.2549, + "step": 28391 + }, + { + "epoch": 0.16885526691407365, + "grad_norm": 2.2432281970977783, + "learning_rate": 4.656441419620727e-05, + "loss": 4.788, + "step": 28392 + }, + { + "epoch": 0.16886121419735464, + "grad_norm": 1.6931114196777344, + "learning_rate": 4.656417787489652e-05, + "loss": 4.9039, + "step": 28393 + }, + { + "epoch": 0.16886716148063566, + "grad_norm": 1.6208950281143188, + "learning_rate": 4.656394154605795e-05, + "loss": 5.2821, + "step": 28394 + }, + { + "epoch": 0.16887310876391665, + "grad_norm": 2.725078821182251, + "learning_rate": 4.656370520969162e-05, + "loss": 4.3892, + "step": 28395 + }, + { + "epoch": 0.16887905604719763, + "grad_norm": 3.6109495162963867, + "learning_rate": 4.6563468865797636e-05, + "loss": 4.1935, + "step": 28396 + }, + { + "epoch": 0.16888500333047865, + "grad_norm": 1.9827744960784912, + "learning_rate": 4.656323251437606e-05, + "loss": 5.1187, + "step": 28397 + }, + { + "epoch": 0.16889095061375964, + "grad_norm": 1.8615485429763794, + "learning_rate": 4.6562996155426985e-05, + "loss": 5.6777, + "step": 28398 + }, + { + "epoch": 0.16889689789704063, + "grad_norm": 1.7114287614822388, + "learning_rate": 4.6562759788950484e-05, + "loss": 5.5126, + "step": 28399 + }, + { + "epoch": 0.16890284518032164, + "grad_norm": 1.672108769416809, + "learning_rate": 4.656252341494666e-05, + "loss": 5.2453, + "step": 28400 + }, + { + "epoch": 0.16890879246360263, + "grad_norm": 1.7363505363464355, + "learning_rate": 4.656228703341556e-05, + "loss": 5.1452, + "step": 28401 + }, + { + "epoch": 0.16891473974688362, + "grad_norm": 1.6358929872512817, + "learning_rate": 4.656205064435731e-05, + "loss": 4.7812, + "step": 28402 + }, + { + "epoch": 0.16892068703016463, + "grad_norm": 1.5269345045089722, + "learning_rate": 4.656181424777196e-05, + "loss": 4.9725, + "step": 28403 + }, + { + "epoch": 0.16892663431344562, + "grad_norm": 1.8694361448287964, + "learning_rate": 4.656157784365961e-05, + "loss": 4.8145, + "step": 28404 + }, + { + "epoch": 0.1689325815967266, + "grad_norm": 1.6409978866577148, + "learning_rate": 4.6561341432020335e-05, + "loss": 4.8409, + "step": 28405 + }, + { + "epoch": 0.1689385288800076, + "grad_norm": 1.586323618888855, + "learning_rate": 4.656110501285421e-05, + "loss": 4.9883, + "step": 28406 + }, + { + "epoch": 0.16894447616328861, + "grad_norm": 1.936805009841919, + "learning_rate": 4.656086858616133e-05, + "loss": 4.8728, + "step": 28407 + }, + { + "epoch": 0.1689504234465696, + "grad_norm": 2.4873859882354736, + "learning_rate": 4.656063215194178e-05, + "loss": 4.3402, + "step": 28408 + }, + { + "epoch": 0.1689563707298506, + "grad_norm": 2.295729637145996, + "learning_rate": 4.6560395710195624e-05, + "loss": 4.2334, + "step": 28409 + }, + { + "epoch": 0.1689623180131316, + "grad_norm": 2.2564427852630615, + "learning_rate": 4.6560159260922966e-05, + "loss": 4.6056, + "step": 28410 + }, + { + "epoch": 0.1689682652964126, + "grad_norm": 1.5321199893951416, + "learning_rate": 4.655992280412388e-05, + "loss": 5.7092, + "step": 28411 + }, + { + "epoch": 0.16897421257969358, + "grad_norm": 1.4915989637374878, + "learning_rate": 4.655968633979844e-05, + "loss": 5.5028, + "step": 28412 + }, + { + "epoch": 0.1689801598629746, + "grad_norm": 1.6282528638839722, + "learning_rate": 4.655944986794675e-05, + "loss": 5.405, + "step": 28413 + }, + { + "epoch": 0.1689861071462556, + "grad_norm": 1.5174504518508911, + "learning_rate": 4.6559213388568865e-05, + "loss": 5.2818, + "step": 28414 + }, + { + "epoch": 0.16899205442953658, + "grad_norm": 1.6792948246002197, + "learning_rate": 4.6558976901664885e-05, + "loss": 5.4466, + "step": 28415 + }, + { + "epoch": 0.1689980017128176, + "grad_norm": 1.5633111000061035, + "learning_rate": 4.655874040723489e-05, + "loss": 5.3313, + "step": 28416 + }, + { + "epoch": 0.16900394899609858, + "grad_norm": 1.6550037860870361, + "learning_rate": 4.655850390527896e-05, + "loss": 5.3279, + "step": 28417 + }, + { + "epoch": 0.16900989627937957, + "grad_norm": 1.6670206785202026, + "learning_rate": 4.6558267395797186e-05, + "loss": 5.0354, + "step": 28418 + }, + { + "epoch": 0.16901584356266058, + "grad_norm": 1.577187180519104, + "learning_rate": 4.6558030878789635e-05, + "loss": 4.9382, + "step": 28419 + }, + { + "epoch": 0.16902179084594157, + "grad_norm": 1.5832712650299072, + "learning_rate": 4.65577943542564e-05, + "loss": 5.3036, + "step": 28420 + }, + { + "epoch": 0.16902773812922256, + "grad_norm": 1.4962387084960938, + "learning_rate": 4.655755782219756e-05, + "loss": 5.3586, + "step": 28421 + }, + { + "epoch": 0.16903368541250358, + "grad_norm": 1.2843531370162964, + "learning_rate": 4.655732128261321e-05, + "loss": 5.3972, + "step": 28422 + }, + { + "epoch": 0.16903963269578456, + "grad_norm": 1.1370457410812378, + "learning_rate": 4.6557084735503406e-05, + "loss": 5.2004, + "step": 28423 + }, + { + "epoch": 0.16904557997906555, + "grad_norm": 2.759056329727173, + "learning_rate": 4.655684818086825e-05, + "loss": 4.5741, + "step": 28424 + }, + { + "epoch": 0.16905152726234657, + "grad_norm": 2.7487027645111084, + "learning_rate": 4.655661161870783e-05, + "loss": 4.1308, + "step": 28425 + }, + { + "epoch": 0.16905747454562756, + "grad_norm": 2.479084014892578, + "learning_rate": 4.655637504902221e-05, + "loss": 4.2166, + "step": 28426 + }, + { + "epoch": 0.16906342182890854, + "grad_norm": 2.667968511581421, + "learning_rate": 4.65561384718115e-05, + "loss": 4.1276, + "step": 28427 + }, + { + "epoch": 0.16906936911218956, + "grad_norm": 2.6374669075012207, + "learning_rate": 4.655590188707575e-05, + "loss": 3.7747, + "step": 28428 + }, + { + "epoch": 0.16907531639547055, + "grad_norm": 2.0448408126831055, + "learning_rate": 4.655566529481505e-05, + "loss": 4.7242, + "step": 28429 + }, + { + "epoch": 0.16908126367875154, + "grad_norm": 2.416416645050049, + "learning_rate": 4.65554286950295e-05, + "loss": 4.3241, + "step": 28430 + }, + { + "epoch": 0.16908721096203255, + "grad_norm": 2.018310308456421, + "learning_rate": 4.6555192087719175e-05, + "loss": 4.2137, + "step": 28431 + }, + { + "epoch": 0.16909315824531354, + "grad_norm": 2.2149248123168945, + "learning_rate": 4.655495547288415e-05, + "loss": 4.2518, + "step": 28432 + }, + { + "epoch": 0.16909910552859453, + "grad_norm": 2.190190553665161, + "learning_rate": 4.655471885052452e-05, + "loss": 4.0488, + "step": 28433 + }, + { + "epoch": 0.16910505281187554, + "grad_norm": 2.146759033203125, + "learning_rate": 4.6554482220640347e-05, + "loss": 4.005, + "step": 28434 + }, + { + "epoch": 0.16911100009515653, + "grad_norm": 1.7445921897888184, + "learning_rate": 4.655424558323174e-05, + "loss": 4.5846, + "step": 28435 + }, + { + "epoch": 0.16911694737843752, + "grad_norm": 1.924498200416565, + "learning_rate": 4.655400893829876e-05, + "loss": 4.4729, + "step": 28436 + }, + { + "epoch": 0.16912289466171854, + "grad_norm": 2.297170877456665, + "learning_rate": 4.65537722858415e-05, + "loss": 4.0639, + "step": 28437 + }, + { + "epoch": 0.16912884194499953, + "grad_norm": 2.254561424255371, + "learning_rate": 4.6553535625860044e-05, + "loss": 3.6444, + "step": 28438 + }, + { + "epoch": 0.1691347892282805, + "grad_norm": 2.3372230529785156, + "learning_rate": 4.655329895835447e-05, + "loss": 3.9905, + "step": 28439 + }, + { + "epoch": 0.16914073651156153, + "grad_norm": 2.376207113265991, + "learning_rate": 4.655306228332486e-05, + "loss": 3.9777, + "step": 28440 + }, + { + "epoch": 0.16914668379484252, + "grad_norm": 1.6520785093307495, + "learning_rate": 4.65528256007713e-05, + "loss": 4.9314, + "step": 28441 + }, + { + "epoch": 0.1691526310781235, + "grad_norm": 1.93073308467865, + "learning_rate": 4.6552588910693876e-05, + "loss": 5.1317, + "step": 28442 + }, + { + "epoch": 0.16915857836140452, + "grad_norm": 1.5278276205062866, + "learning_rate": 4.655235221309266e-05, + "loss": 5.2949, + "step": 28443 + }, + { + "epoch": 0.1691645256446855, + "grad_norm": 1.5671179294586182, + "learning_rate": 4.6552115507967744e-05, + "loss": 4.8824, + "step": 28444 + }, + { + "epoch": 0.1691704729279665, + "grad_norm": 1.6631091833114624, + "learning_rate": 4.6551878795319204e-05, + "loss": 4.6696, + "step": 28445 + }, + { + "epoch": 0.1691764202112475, + "grad_norm": 1.9113469123840332, + "learning_rate": 4.655164207514713e-05, + "loss": 4.2842, + "step": 28446 + }, + { + "epoch": 0.1691823674945285, + "grad_norm": 1.8953512907028198, + "learning_rate": 4.655140534745159e-05, + "loss": 5.3818, + "step": 28447 + }, + { + "epoch": 0.1691883147778095, + "grad_norm": 1.7372487783432007, + "learning_rate": 4.6551168612232685e-05, + "loss": 5.2441, + "step": 28448 + }, + { + "epoch": 0.1691942620610905, + "grad_norm": 1.8049054145812988, + "learning_rate": 4.655093186949049e-05, + "loss": 5.2056, + "step": 28449 + }, + { + "epoch": 0.1692002093443715, + "grad_norm": 2.019453763961792, + "learning_rate": 4.6550695119225086e-05, + "loss": 5.4237, + "step": 28450 + }, + { + "epoch": 0.16920615662765248, + "grad_norm": 1.3187928199768066, + "learning_rate": 4.6550458361436554e-05, + "loss": 5.2069, + "step": 28451 + }, + { + "epoch": 0.1692121039109335, + "grad_norm": 2.054603099822998, + "learning_rate": 4.655022159612499e-05, + "loss": 4.4155, + "step": 28452 + }, + { + "epoch": 0.16921805119421449, + "grad_norm": 2.41377854347229, + "learning_rate": 4.6549984823290454e-05, + "loss": 3.613, + "step": 28453 + }, + { + "epoch": 0.16922399847749547, + "grad_norm": 1.9458948373794556, + "learning_rate": 4.654974804293305e-05, + "loss": 3.6051, + "step": 28454 + }, + { + "epoch": 0.1692299457607765, + "grad_norm": 1.7371017932891846, + "learning_rate": 4.6549511255052844e-05, + "loss": 5.1229, + "step": 28455 + }, + { + "epoch": 0.16923589304405748, + "grad_norm": 1.3374329805374146, + "learning_rate": 4.654927445964993e-05, + "loss": 5.7105, + "step": 28456 + }, + { + "epoch": 0.16924184032733847, + "grad_norm": 1.453912377357483, + "learning_rate": 4.654903765672439e-05, + "loss": 5.7225, + "step": 28457 + }, + { + "epoch": 0.16924778761061948, + "grad_norm": 1.984152913093567, + "learning_rate": 4.65488008462763e-05, + "loss": 4.874, + "step": 28458 + }, + { + "epoch": 0.16925373489390047, + "grad_norm": 1.618017554283142, + "learning_rate": 4.6548564028305746e-05, + "loss": 4.6159, + "step": 28459 + }, + { + "epoch": 0.16925968217718146, + "grad_norm": 2.104875087738037, + "learning_rate": 4.654832720281281e-05, + "loss": 3.9827, + "step": 28460 + }, + { + "epoch": 0.16926562946046247, + "grad_norm": 1.9092068672180176, + "learning_rate": 4.654809036979758e-05, + "loss": 3.8551, + "step": 28461 + }, + { + "epoch": 0.16927157674374346, + "grad_norm": 1.6868946552276611, + "learning_rate": 4.6547853529260135e-05, + "loss": 5.6583, + "step": 28462 + }, + { + "epoch": 0.16927752402702445, + "grad_norm": 2.0791547298431396, + "learning_rate": 4.6547616681200544e-05, + "loss": 4.7682, + "step": 28463 + }, + { + "epoch": 0.16928347131030544, + "grad_norm": 2.254826307296753, + "learning_rate": 4.654737982561892e-05, + "loss": 3.7339, + "step": 28464 + }, + { + "epoch": 0.16928941859358645, + "grad_norm": 1.6225947141647339, + "learning_rate": 4.6547142962515314e-05, + "loss": 4.8278, + "step": 28465 + }, + { + "epoch": 0.16929536587686744, + "grad_norm": 1.8425785303115845, + "learning_rate": 4.654690609188983e-05, + "loss": 4.0161, + "step": 28466 + }, + { + "epoch": 0.16930131316014843, + "grad_norm": 1.9367843866348267, + "learning_rate": 4.6546669213742545e-05, + "loss": 3.794, + "step": 28467 + }, + { + "epoch": 0.16930726044342945, + "grad_norm": 1.988096833229065, + "learning_rate": 4.654643232807354e-05, + "loss": 3.7874, + "step": 28468 + }, + { + "epoch": 0.16931320772671044, + "grad_norm": 1.84897780418396, + "learning_rate": 4.6546195434882895e-05, + "loss": 3.8368, + "step": 28469 + }, + { + "epoch": 0.16931915500999142, + "grad_norm": 1.7867851257324219, + "learning_rate": 4.65459585341707e-05, + "loss": 3.7485, + "step": 28470 + }, + { + "epoch": 0.16932510229327244, + "grad_norm": 1.8112739324569702, + "learning_rate": 4.654572162593703e-05, + "loss": 3.7541, + "step": 28471 + }, + { + "epoch": 0.16933104957655343, + "grad_norm": 1.7835328578948975, + "learning_rate": 4.6545484710181974e-05, + "loss": 3.8461, + "step": 28472 + }, + { + "epoch": 0.16933699685983442, + "grad_norm": 1.7823615074157715, + "learning_rate": 4.6545247786905614e-05, + "loss": 3.7878, + "step": 28473 + }, + { + "epoch": 0.16934294414311543, + "grad_norm": 1.8897929191589355, + "learning_rate": 4.654501085610802e-05, + "loss": 3.8613, + "step": 28474 + }, + { + "epoch": 0.16934889142639642, + "grad_norm": 1.9433989524841309, + "learning_rate": 4.654477391778929e-05, + "loss": 3.7189, + "step": 28475 + }, + { + "epoch": 0.1693548387096774, + "grad_norm": 1.688061237335205, + "learning_rate": 4.6544536971949504e-05, + "loss": 4.1471, + "step": 28476 + }, + { + "epoch": 0.16936078599295842, + "grad_norm": 1.9753577709197998, + "learning_rate": 4.654430001858874e-05, + "loss": 4.1729, + "step": 28477 + }, + { + "epoch": 0.1693667332762394, + "grad_norm": 1.6471655368804932, + "learning_rate": 4.654406305770709e-05, + "loss": 5.4232, + "step": 28478 + }, + { + "epoch": 0.1693726805595204, + "grad_norm": 1.5919240713119507, + "learning_rate": 4.6543826089304626e-05, + "loss": 5.6299, + "step": 28479 + }, + { + "epoch": 0.16937862784280142, + "grad_norm": 1.505886435508728, + "learning_rate": 4.6543589113381434e-05, + "loss": 5.472, + "step": 28480 + }, + { + "epoch": 0.1693845751260824, + "grad_norm": 1.3407920598983765, + "learning_rate": 4.65433521299376e-05, + "loss": 5.4519, + "step": 28481 + }, + { + "epoch": 0.1693905224093634, + "grad_norm": 1.785452127456665, + "learning_rate": 4.65431151389732e-05, + "loss": 5.0539, + "step": 28482 + }, + { + "epoch": 0.1693964696926444, + "grad_norm": 1.6076501607894897, + "learning_rate": 4.654287814048833e-05, + "loss": 5.5523, + "step": 28483 + }, + { + "epoch": 0.1694024169759254, + "grad_norm": 1.7751826047897339, + "learning_rate": 4.654264113448306e-05, + "loss": 5.3904, + "step": 28484 + }, + { + "epoch": 0.16940836425920638, + "grad_norm": 2.516270160675049, + "learning_rate": 4.6542404120957465e-05, + "loss": 3.6737, + "step": 28485 + }, + { + "epoch": 0.1694143115424874, + "grad_norm": 2.094210386276245, + "learning_rate": 4.654216709991165e-05, + "loss": 3.3822, + "step": 28486 + }, + { + "epoch": 0.1694202588257684, + "grad_norm": 1.9401110410690308, + "learning_rate": 4.6541930071345685e-05, + "loss": 3.3866, + "step": 28487 + }, + { + "epoch": 0.16942620610904938, + "grad_norm": 1.6965755224227905, + "learning_rate": 4.654169303525966e-05, + "loss": 4.8492, + "step": 28488 + }, + { + "epoch": 0.1694321533923304, + "grad_norm": 2.676941156387329, + "learning_rate": 4.654145599165365e-05, + "loss": 4.4578, + "step": 28489 + }, + { + "epoch": 0.16943810067561138, + "grad_norm": 2.53593111038208, + "learning_rate": 4.654121894052773e-05, + "loss": 3.9574, + "step": 28490 + }, + { + "epoch": 0.16944404795889237, + "grad_norm": 2.355025053024292, + "learning_rate": 4.6540981881882006e-05, + "loss": 4.0911, + "step": 28491 + }, + { + "epoch": 0.16944999524217338, + "grad_norm": 2.2941341400146484, + "learning_rate": 4.654074481571654e-05, + "loss": 4.3186, + "step": 28492 + }, + { + "epoch": 0.16945594252545437, + "grad_norm": 2.2436282634735107, + "learning_rate": 4.654050774203143e-05, + "loss": 4.0785, + "step": 28493 + }, + { + "epoch": 0.16946188980873536, + "grad_norm": 2.8532540798187256, + "learning_rate": 4.6540270660826744e-05, + "loss": 3.2517, + "step": 28494 + }, + { + "epoch": 0.16946783709201638, + "grad_norm": 2.7810893058776855, + "learning_rate": 4.6540033572102575e-05, + "loss": 3.462, + "step": 28495 + }, + { + "epoch": 0.16947378437529736, + "grad_norm": 2.5841453075408936, + "learning_rate": 4.6539796475859004e-05, + "loss": 4.4611, + "step": 28496 + }, + { + "epoch": 0.16947973165857835, + "grad_norm": 2.433039903640747, + "learning_rate": 4.653955937209611e-05, + "loss": 3.7666, + "step": 28497 + }, + { + "epoch": 0.16948567894185937, + "grad_norm": 1.7830419540405273, + "learning_rate": 4.6539322260813984e-05, + "loss": 4.9613, + "step": 28498 + }, + { + "epoch": 0.16949162622514036, + "grad_norm": 1.8452028036117554, + "learning_rate": 4.653908514201269e-05, + "loss": 5.0721, + "step": 28499 + }, + { + "epoch": 0.16949757350842135, + "grad_norm": 1.9641203880310059, + "learning_rate": 4.6538848015692336e-05, + "loss": 4.2726, + "step": 28500 + }, + { + "epoch": 0.16950352079170236, + "grad_norm": 2.1620960235595703, + "learning_rate": 4.6538610881853e-05, + "loss": 3.9638, + "step": 28501 + }, + { + "epoch": 0.16950946807498335, + "grad_norm": 1.977523922920227, + "learning_rate": 4.6538373740494737e-05, + "loss": 4.0448, + "step": 28502 + }, + { + "epoch": 0.16951541535826434, + "grad_norm": 1.7069354057312012, + "learning_rate": 4.653813659161766e-05, + "loss": 4.053, + "step": 28503 + }, + { + "epoch": 0.16952136264154535, + "grad_norm": 1.8894158601760864, + "learning_rate": 4.653789943522184e-05, + "loss": 4.1357, + "step": 28504 + }, + { + "epoch": 0.16952730992482634, + "grad_norm": 1.8103679418563843, + "learning_rate": 4.6537662271307366e-05, + "loss": 3.8426, + "step": 28505 + }, + { + "epoch": 0.16953325720810733, + "grad_norm": 1.6966679096221924, + "learning_rate": 4.653742509987431e-05, + "loss": 3.9686, + "step": 28506 + }, + { + "epoch": 0.16953920449138835, + "grad_norm": 1.8758342266082764, + "learning_rate": 4.653718792092278e-05, + "loss": 3.7168, + "step": 28507 + }, + { + "epoch": 0.16954515177466933, + "grad_norm": 1.738481879234314, + "learning_rate": 4.6536950734452824e-05, + "loss": 4.0376, + "step": 28508 + }, + { + "epoch": 0.16955109905795032, + "grad_norm": 1.8814899921417236, + "learning_rate": 4.653671354046454e-05, + "loss": 3.7981, + "step": 28509 + }, + { + "epoch": 0.16955704634123134, + "grad_norm": 1.7275527715682983, + "learning_rate": 4.653647633895801e-05, + "loss": 3.7576, + "step": 28510 + }, + { + "epoch": 0.16956299362451233, + "grad_norm": 1.5637880563735962, + "learning_rate": 4.6536239129933326e-05, + "loss": 5.5343, + "step": 28511 + }, + { + "epoch": 0.16956894090779331, + "grad_norm": 1.6974562406539917, + "learning_rate": 4.653600191339056e-05, + "loss": 5.9386, + "step": 28512 + }, + { + "epoch": 0.16957488819107433, + "grad_norm": 2.0787951946258545, + "learning_rate": 4.65357646893298e-05, + "loss": 5.6018, + "step": 28513 + }, + { + "epoch": 0.16958083547435532, + "grad_norm": 2.0893337726593018, + "learning_rate": 4.653552745775113e-05, + "loss": 5.5357, + "step": 28514 + }, + { + "epoch": 0.1695867827576363, + "grad_norm": 2.1055009365081787, + "learning_rate": 4.6535290218654624e-05, + "loss": 5.6448, + "step": 28515 + }, + { + "epoch": 0.16959273004091732, + "grad_norm": 2.247347116470337, + "learning_rate": 4.653505297204037e-05, + "loss": 4.0233, + "step": 28516 + }, + { + "epoch": 0.1695986773241983, + "grad_norm": 1.5102436542510986, + "learning_rate": 4.653481571790846e-05, + "loss": 5.1274, + "step": 28517 + }, + { + "epoch": 0.1696046246074793, + "grad_norm": 1.5515743494033813, + "learning_rate": 4.653457845625896e-05, + "loss": 6.1905, + "step": 28518 + }, + { + "epoch": 0.16961057189076031, + "grad_norm": 1.5858293771743774, + "learning_rate": 4.6534341187091965e-05, + "loss": 5.2316, + "step": 28519 + }, + { + "epoch": 0.1696165191740413, + "grad_norm": 3.305469274520874, + "learning_rate": 4.653410391040755e-05, + "loss": 4.022, + "step": 28520 + }, + { + "epoch": 0.1696224664573223, + "grad_norm": 1.6751025915145874, + "learning_rate": 4.6533866626205805e-05, + "loss": 5.2442, + "step": 28521 + }, + { + "epoch": 0.16962841374060328, + "grad_norm": 1.777486801147461, + "learning_rate": 4.653362933448681e-05, + "loss": 5.0407, + "step": 28522 + }, + { + "epoch": 0.1696343610238843, + "grad_norm": 1.5896446704864502, + "learning_rate": 4.653339203525065e-05, + "loss": 4.807, + "step": 28523 + }, + { + "epoch": 0.16964030830716528, + "grad_norm": 1.9087060689926147, + "learning_rate": 4.65331547284974e-05, + "loss": 5.0863, + "step": 28524 + }, + { + "epoch": 0.16964625559044627, + "grad_norm": 1.7064319849014282, + "learning_rate": 4.653291741422715e-05, + "loss": 5.2761, + "step": 28525 + }, + { + "epoch": 0.1696522028737273, + "grad_norm": 1.5838422775268555, + "learning_rate": 4.6532680092439986e-05, + "loss": 5.316, + "step": 28526 + }, + { + "epoch": 0.16965815015700828, + "grad_norm": 1.702512264251709, + "learning_rate": 4.653244276313598e-05, + "loss": 5.2548, + "step": 28527 + }, + { + "epoch": 0.16966409744028926, + "grad_norm": 1.4088670015335083, + "learning_rate": 4.6532205426315215e-05, + "loss": 5.1767, + "step": 28528 + }, + { + "epoch": 0.16967004472357028, + "grad_norm": 2.7728757858276367, + "learning_rate": 4.653196808197779e-05, + "loss": 4.5771, + "step": 28529 + }, + { + "epoch": 0.16967599200685127, + "grad_norm": 2.977949857711792, + "learning_rate": 4.653173073012377e-05, + "loss": 4.2778, + "step": 28530 + }, + { + "epoch": 0.16968193929013226, + "grad_norm": 2.986652374267578, + "learning_rate": 4.6531493370753254e-05, + "loss": 4.1076, + "step": 28531 + }, + { + "epoch": 0.16968788657341327, + "grad_norm": 2.596334934234619, + "learning_rate": 4.6531256003866305e-05, + "loss": 3.6769, + "step": 28532 + }, + { + "epoch": 0.16969383385669426, + "grad_norm": 2.381591796875, + "learning_rate": 4.653101862946303e-05, + "loss": 3.9261, + "step": 28533 + }, + { + "epoch": 0.16969978113997525, + "grad_norm": 2.287313938140869, + "learning_rate": 4.653078124754349e-05, + "loss": 4.4583, + "step": 28534 + }, + { + "epoch": 0.16970572842325626, + "grad_norm": 1.716257929801941, + "learning_rate": 4.6530543858107776e-05, + "loss": 5.1735, + "step": 28535 + }, + { + "epoch": 0.16971167570653725, + "grad_norm": 1.5777500867843628, + "learning_rate": 4.6530306461155976e-05, + "loss": 4.958, + "step": 28536 + }, + { + "epoch": 0.16971762298981824, + "grad_norm": 1.6747970581054688, + "learning_rate": 4.653006905668817e-05, + "loss": 4.6559, + "step": 28537 + }, + { + "epoch": 0.16972357027309926, + "grad_norm": 1.8283017873764038, + "learning_rate": 4.652983164470444e-05, + "loss": 4.4711, + "step": 28538 + }, + { + "epoch": 0.16972951755638024, + "grad_norm": 2.753277063369751, + "learning_rate": 4.652959422520485e-05, + "loss": 3.9467, + "step": 28539 + }, + { + "epoch": 0.16973546483966123, + "grad_norm": 1.993268370628357, + "learning_rate": 4.652935679818952e-05, + "loss": 4.8315, + "step": 28540 + }, + { + "epoch": 0.16974141212294225, + "grad_norm": 1.7056300640106201, + "learning_rate": 4.652911936365851e-05, + "loss": 5.6509, + "step": 28541 + }, + { + "epoch": 0.16974735940622324, + "grad_norm": 1.6653499603271484, + "learning_rate": 4.6528881921611904e-05, + "loss": 5.5002, + "step": 28542 + }, + { + "epoch": 0.16975330668950422, + "grad_norm": 1.5368744134902954, + "learning_rate": 4.6528644472049795e-05, + "loss": 5.0847, + "step": 28543 + }, + { + "epoch": 0.16975925397278524, + "grad_norm": 1.597609043121338, + "learning_rate": 4.6528407014972255e-05, + "loss": 5.4779, + "step": 28544 + }, + { + "epoch": 0.16976520125606623, + "grad_norm": 1.5362802743911743, + "learning_rate": 4.6528169550379364e-05, + "loss": 4.931, + "step": 28545 + }, + { + "epoch": 0.16977114853934722, + "grad_norm": 1.4700133800506592, + "learning_rate": 4.652793207827122e-05, + "loss": 5.6209, + "step": 28546 + }, + { + "epoch": 0.16977709582262823, + "grad_norm": 2.0117483139038086, + "learning_rate": 4.652769459864788e-05, + "loss": 4.7425, + "step": 28547 + }, + { + "epoch": 0.16978304310590922, + "grad_norm": 1.4520665407180786, + "learning_rate": 4.652745711150946e-05, + "loss": 5.135, + "step": 28548 + }, + { + "epoch": 0.1697889903891902, + "grad_norm": 1.5992931127548218, + "learning_rate": 4.6527219616856036e-05, + "loss": 5.2732, + "step": 28549 + }, + { + "epoch": 0.16979493767247122, + "grad_norm": 1.689389944076538, + "learning_rate": 4.6526982114687666e-05, + "loss": 5.1537, + "step": 28550 + }, + { + "epoch": 0.1698008849557522, + "grad_norm": 1.5059309005737305, + "learning_rate": 4.652674460500446e-05, + "loss": 4.9021, + "step": 28551 + }, + { + "epoch": 0.1698068322390332, + "grad_norm": 2.6482186317443848, + "learning_rate": 4.652650708780648e-05, + "loss": 4.9221, + "step": 28552 + }, + { + "epoch": 0.16981277952231422, + "grad_norm": 1.7961699962615967, + "learning_rate": 4.652626956309382e-05, + "loss": 5.3804, + "step": 28553 + }, + { + "epoch": 0.1698187268055952, + "grad_norm": 1.704698085784912, + "learning_rate": 4.652603203086656e-05, + "loss": 5.775, + "step": 28554 + }, + { + "epoch": 0.1698246740888762, + "grad_norm": 1.7374398708343506, + "learning_rate": 4.65257944911248e-05, + "loss": 5.6455, + "step": 28555 + }, + { + "epoch": 0.1698306213721572, + "grad_norm": 1.5410466194152832, + "learning_rate": 4.652555694386859e-05, + "loss": 5.7316, + "step": 28556 + }, + { + "epoch": 0.1698365686554382, + "grad_norm": 1.5294291973114014, + "learning_rate": 4.652531938909804e-05, + "loss": 5.0427, + "step": 28557 + }, + { + "epoch": 0.16984251593871919, + "grad_norm": 2.2420549392700195, + "learning_rate": 4.652508182681322e-05, + "loss": 3.8954, + "step": 28558 + }, + { + "epoch": 0.1698484632220002, + "grad_norm": 1.640631079673767, + "learning_rate": 4.652484425701422e-05, + "loss": 5.2021, + "step": 28559 + }, + { + "epoch": 0.1698544105052812, + "grad_norm": 1.3961762189865112, + "learning_rate": 4.652460667970111e-05, + "loss": 4.6562, + "step": 28560 + }, + { + "epoch": 0.16986035778856218, + "grad_norm": 1.408497929573059, + "learning_rate": 4.6524369094873985e-05, + "loss": 5.2449, + "step": 28561 + }, + { + "epoch": 0.1698663050718432, + "grad_norm": 1.544072151184082, + "learning_rate": 4.6524131502532934e-05, + "loss": 5.1623, + "step": 28562 + }, + { + "epoch": 0.16987225235512418, + "grad_norm": 1.4092038869857788, + "learning_rate": 4.652389390267802e-05, + "loss": 5.1672, + "step": 28563 + }, + { + "epoch": 0.16987819963840517, + "grad_norm": 1.533828616142273, + "learning_rate": 4.6523656295309346e-05, + "loss": 5.1873, + "step": 28564 + }, + { + "epoch": 0.16988414692168619, + "grad_norm": 1.690058946609497, + "learning_rate": 4.6523418680426986e-05, + "loss": 5.1518, + "step": 28565 + }, + { + "epoch": 0.16989009420496717, + "grad_norm": 1.192253828048706, + "learning_rate": 4.652318105803102e-05, + "loss": 5.1708, + "step": 28566 + }, + { + "epoch": 0.16989604148824816, + "grad_norm": 1.6222058534622192, + "learning_rate": 4.6522943428121526e-05, + "loss": 5.2261, + "step": 28567 + }, + { + "epoch": 0.16990198877152918, + "grad_norm": 1.9990545511245728, + "learning_rate": 4.65227057906986e-05, + "loss": 5.1013, + "step": 28568 + }, + { + "epoch": 0.16990793605481017, + "grad_norm": 1.929602861404419, + "learning_rate": 4.652246814576233e-05, + "loss": 4.8618, + "step": 28569 + }, + { + "epoch": 0.16991388333809115, + "grad_norm": 1.3916577100753784, + "learning_rate": 4.6522230493312777e-05, + "loss": 4.929, + "step": 28570 + }, + { + "epoch": 0.16991983062137217, + "grad_norm": 1.7045917510986328, + "learning_rate": 4.6521992833350036e-05, + "loss": 4.925, + "step": 28571 + }, + { + "epoch": 0.16992577790465316, + "grad_norm": 1.68044114112854, + "learning_rate": 4.6521755165874194e-05, + "loss": 5.3032, + "step": 28572 + }, + { + "epoch": 0.16993172518793415, + "grad_norm": 1.747460126876831, + "learning_rate": 4.652151749088533e-05, + "loss": 5.1043, + "step": 28573 + }, + { + "epoch": 0.16993767247121516, + "grad_norm": 1.7225557565689087, + "learning_rate": 4.6521279808383526e-05, + "loss": 4.7359, + "step": 28574 + }, + { + "epoch": 0.16994361975449615, + "grad_norm": 1.9875255823135376, + "learning_rate": 4.652104211836886e-05, + "loss": 3.912, + "step": 28575 + }, + { + "epoch": 0.16994956703777714, + "grad_norm": 1.898094654083252, + "learning_rate": 4.652080442084142e-05, + "loss": 4.012, + "step": 28576 + }, + { + "epoch": 0.16995551432105815, + "grad_norm": 1.8791594505310059, + "learning_rate": 4.65205667158013e-05, + "loss": 3.8007, + "step": 28577 + }, + { + "epoch": 0.16996146160433914, + "grad_norm": 1.85286545753479, + "learning_rate": 4.652032900324857e-05, + "loss": 3.8686, + "step": 28578 + }, + { + "epoch": 0.16996740888762013, + "grad_norm": 1.8084555864334106, + "learning_rate": 4.652009128318331e-05, + "loss": 3.8287, + "step": 28579 + }, + { + "epoch": 0.16997335617090112, + "grad_norm": 1.8365230560302734, + "learning_rate": 4.651985355560562e-05, + "loss": 3.8072, + "step": 28580 + }, + { + "epoch": 0.16997930345418213, + "grad_norm": 1.8318002223968506, + "learning_rate": 4.651961582051555e-05, + "loss": 3.5751, + "step": 28581 + }, + { + "epoch": 0.16998525073746312, + "grad_norm": 2.9217238426208496, + "learning_rate": 4.651937807791322e-05, + "loss": 4.3074, + "step": 28582 + }, + { + "epoch": 0.1699911980207441, + "grad_norm": 1.8495897054672241, + "learning_rate": 4.651914032779869e-05, + "loss": 3.5268, + "step": 28583 + }, + { + "epoch": 0.16999714530402513, + "grad_norm": 1.7885898351669312, + "learning_rate": 4.651890257017206e-05, + "loss": 3.2383, + "step": 28584 + }, + { + "epoch": 0.17000309258730611, + "grad_norm": 1.9159060716629028, + "learning_rate": 4.6518664805033395e-05, + "loss": 3.7259, + "step": 28585 + }, + { + "epoch": 0.1700090398705871, + "grad_norm": 1.733549952507019, + "learning_rate": 4.6518427032382793e-05, + "loss": 5.1259, + "step": 28586 + }, + { + "epoch": 0.17001498715386812, + "grad_norm": 2.508037805557251, + "learning_rate": 4.651818925222033e-05, + "loss": 3.8367, + "step": 28587 + }, + { + "epoch": 0.1700209344371491, + "grad_norm": 2.5397400856018066, + "learning_rate": 4.651795146454608e-05, + "loss": 3.4588, + "step": 28588 + }, + { + "epoch": 0.1700268817204301, + "grad_norm": 2.3859269618988037, + "learning_rate": 4.651771366936015e-05, + "loss": 3.3977, + "step": 28589 + }, + { + "epoch": 0.1700328290037111, + "grad_norm": 1.8520206212997437, + "learning_rate": 4.65174758666626e-05, + "loss": 4.0797, + "step": 28590 + }, + { + "epoch": 0.1700387762869921, + "grad_norm": 2.0465288162231445, + "learning_rate": 4.651723805645352e-05, + "loss": 3.2528, + "step": 28591 + }, + { + "epoch": 0.1700447235702731, + "grad_norm": 2.100496530532837, + "learning_rate": 4.651700023873299e-05, + "loss": 2.9472, + "step": 28592 + }, + { + "epoch": 0.1700506708535541, + "grad_norm": 2.4353413581848145, + "learning_rate": 4.6516762413501106e-05, + "loss": 3.161, + "step": 28593 + }, + { + "epoch": 0.1700566181368351, + "grad_norm": 2.609565019607544, + "learning_rate": 4.651652458075794e-05, + "loss": 3.5234, + "step": 28594 + }, + { + "epoch": 0.17006256542011608, + "grad_norm": 2.2567410469055176, + "learning_rate": 4.651628674050358e-05, + "loss": 3.5863, + "step": 28595 + }, + { + "epoch": 0.1700685127033971, + "grad_norm": 2.6345736980438232, + "learning_rate": 4.6516048892738104e-05, + "loss": 3.5194, + "step": 28596 + }, + { + "epoch": 0.17007445998667808, + "grad_norm": 1.9039238691329956, + "learning_rate": 4.65158110374616e-05, + "loss": 4.0329, + "step": 28597 + }, + { + "epoch": 0.17008040726995907, + "grad_norm": 1.6507738828659058, + "learning_rate": 4.6515573174674143e-05, + "loss": 4.9022, + "step": 28598 + }, + { + "epoch": 0.1700863545532401, + "grad_norm": 1.6945186853408813, + "learning_rate": 4.651533530437583e-05, + "loss": 4.9487, + "step": 28599 + }, + { + "epoch": 0.17009230183652108, + "grad_norm": 1.8337676525115967, + "learning_rate": 4.651509742656673e-05, + "loss": 5.1238, + "step": 28600 + }, + { + "epoch": 0.17009824911980206, + "grad_norm": 1.4968239068984985, + "learning_rate": 4.651485954124694e-05, + "loss": 4.782, + "step": 28601 + }, + { + "epoch": 0.17010419640308308, + "grad_norm": 1.8200058937072754, + "learning_rate": 4.651462164841652e-05, + "loss": 5.3675, + "step": 28602 + }, + { + "epoch": 0.17011014368636407, + "grad_norm": 1.788134217262268, + "learning_rate": 4.6514383748075575e-05, + "loss": 4.6486, + "step": 28603 + }, + { + "epoch": 0.17011609096964506, + "grad_norm": 1.6064730882644653, + "learning_rate": 4.6514145840224184e-05, + "loss": 4.4153, + "step": 28604 + }, + { + "epoch": 0.17012203825292607, + "grad_norm": 1.4705356359481812, + "learning_rate": 4.651390792486242e-05, + "loss": 4.7254, + "step": 28605 + }, + { + "epoch": 0.17012798553620706, + "grad_norm": 1.5670931339263916, + "learning_rate": 4.6513670001990385e-05, + "loss": 5.0288, + "step": 28606 + }, + { + "epoch": 0.17013393281948805, + "grad_norm": 1.9141185283660889, + "learning_rate": 4.651343207160814e-05, + "loss": 5.0111, + "step": 28607 + }, + { + "epoch": 0.17013988010276906, + "grad_norm": 1.485753059387207, + "learning_rate": 4.6513194133715776e-05, + "loss": 5.0013, + "step": 28608 + }, + { + "epoch": 0.17014582738605005, + "grad_norm": 1.6797868013381958, + "learning_rate": 4.651295618831338e-05, + "loss": 5.0576, + "step": 28609 + }, + { + "epoch": 0.17015177466933104, + "grad_norm": 2.6057140827178955, + "learning_rate": 4.651271823540104e-05, + "loss": 3.9116, + "step": 28610 + }, + { + "epoch": 0.17015772195261206, + "grad_norm": 2.83886456489563, + "learning_rate": 4.651248027497883e-05, + "loss": 4.3674, + "step": 28611 + }, + { + "epoch": 0.17016366923589304, + "grad_norm": 2.470137596130371, + "learning_rate": 4.6512242307046834e-05, + "loss": 4.5506, + "step": 28612 + }, + { + "epoch": 0.17016961651917403, + "grad_norm": 2.0518956184387207, + "learning_rate": 4.6512004331605134e-05, + "loss": 4.9991, + "step": 28613 + }, + { + "epoch": 0.17017556380245505, + "grad_norm": 2.012444257736206, + "learning_rate": 4.6511766348653816e-05, + "loss": 4.6678, + "step": 28614 + }, + { + "epoch": 0.17018151108573604, + "grad_norm": 2.152315616607666, + "learning_rate": 4.651152835819297e-05, + "loss": 3.7695, + "step": 28615 + }, + { + "epoch": 0.17018745836901703, + "grad_norm": 2.255277156829834, + "learning_rate": 4.6511290360222664e-05, + "loss": 3.861, + "step": 28616 + }, + { + "epoch": 0.17019340565229804, + "grad_norm": 2.317800998687744, + "learning_rate": 4.651105235474299e-05, + "loss": 3.813, + "step": 28617 + }, + { + "epoch": 0.17019935293557903, + "grad_norm": 2.330914258956909, + "learning_rate": 4.651081434175403e-05, + "loss": 3.6723, + "step": 28618 + }, + { + "epoch": 0.17020530021886002, + "grad_norm": 2.112302541732788, + "learning_rate": 4.651057632125587e-05, + "loss": 3.6212, + "step": 28619 + }, + { + "epoch": 0.17021124750214103, + "grad_norm": 1.9216437339782715, + "learning_rate": 4.651033829324859e-05, + "loss": 4.3208, + "step": 28620 + }, + { + "epoch": 0.17021719478542202, + "grad_norm": 1.9902441501617432, + "learning_rate": 4.651010025773227e-05, + "loss": 4.7577, + "step": 28621 + }, + { + "epoch": 0.170223142068703, + "grad_norm": 1.7886050939559937, + "learning_rate": 4.6509862214707e-05, + "loss": 4.494, + "step": 28622 + }, + { + "epoch": 0.17022908935198403, + "grad_norm": 1.8544505834579468, + "learning_rate": 4.650962416417285e-05, + "loss": 5.4149, + "step": 28623 + }, + { + "epoch": 0.170235036635265, + "grad_norm": 1.682219386100769, + "learning_rate": 4.650938610612992e-05, + "loss": 5.434, + "step": 28624 + }, + { + "epoch": 0.170240983918546, + "grad_norm": 2.096231698989868, + "learning_rate": 4.650914804057829e-05, + "loss": 4.3005, + "step": 28625 + }, + { + "epoch": 0.17024693120182702, + "grad_norm": 2.311213970184326, + "learning_rate": 4.650890996751803e-05, + "loss": 3.7311, + "step": 28626 + }, + { + "epoch": 0.170252878485108, + "grad_norm": 1.9578297138214111, + "learning_rate": 4.650867188694924e-05, + "loss": 4.6696, + "step": 28627 + }, + { + "epoch": 0.170258825768389, + "grad_norm": 2.9123547077178955, + "learning_rate": 4.650843379887199e-05, + "loss": 3.8884, + "step": 28628 + }, + { + "epoch": 0.17026477305167, + "grad_norm": 2.6703314781188965, + "learning_rate": 4.650819570328636e-05, + "loss": 3.9453, + "step": 28629 + }, + { + "epoch": 0.170270720334951, + "grad_norm": 1.7576513290405273, + "learning_rate": 4.6507957600192454e-05, + "loss": 4.8754, + "step": 28630 + }, + { + "epoch": 0.17027666761823199, + "grad_norm": 1.6122910976409912, + "learning_rate": 4.650771948959033e-05, + "loss": 5.0507, + "step": 28631 + }, + { + "epoch": 0.170282614901513, + "grad_norm": 1.5017814636230469, + "learning_rate": 4.650748137148009e-05, + "loss": 4.9571, + "step": 28632 + }, + { + "epoch": 0.170288562184794, + "grad_norm": 1.4443883895874023, + "learning_rate": 4.6507243245861815e-05, + "loss": 4.524, + "step": 28633 + }, + { + "epoch": 0.17029450946807498, + "grad_norm": 1.8001708984375, + "learning_rate": 4.650700511273558e-05, + "loss": 4.8942, + "step": 28634 + }, + { + "epoch": 0.170300456751356, + "grad_norm": 2.039597749710083, + "learning_rate": 4.650676697210147e-05, + "loss": 5.0357, + "step": 28635 + }, + { + "epoch": 0.17030640403463698, + "grad_norm": 1.7828583717346191, + "learning_rate": 4.650652882395957e-05, + "loss": 4.8489, + "step": 28636 + }, + { + "epoch": 0.17031235131791797, + "grad_norm": 2.0128636360168457, + "learning_rate": 4.650629066830996e-05, + "loss": 4.3581, + "step": 28637 + }, + { + "epoch": 0.17031829860119896, + "grad_norm": 1.6843047142028809, + "learning_rate": 4.650605250515273e-05, + "loss": 5.2302, + "step": 28638 + }, + { + "epoch": 0.17032424588447997, + "grad_norm": 1.6175137758255005, + "learning_rate": 4.650581433448796e-05, + "loss": 5.2985, + "step": 28639 + }, + { + "epoch": 0.17033019316776096, + "grad_norm": 1.982064962387085, + "learning_rate": 4.6505576156315734e-05, + "loss": 4.8775, + "step": 28640 + }, + { + "epoch": 0.17033614045104195, + "grad_norm": 1.9722973108291626, + "learning_rate": 4.650533797063613e-05, + "loss": 4.6054, + "step": 28641 + }, + { + "epoch": 0.17034208773432297, + "grad_norm": 2.2383551597595215, + "learning_rate": 4.650509977744923e-05, + "loss": 4.2201, + "step": 28642 + }, + { + "epoch": 0.17034803501760395, + "grad_norm": 1.647186040878296, + "learning_rate": 4.650486157675513e-05, + "loss": 4.8552, + "step": 28643 + }, + { + "epoch": 0.17035398230088494, + "grad_norm": 2.658078193664551, + "learning_rate": 4.650462336855391e-05, + "loss": 4.0346, + "step": 28644 + }, + { + "epoch": 0.17035992958416596, + "grad_norm": 1.9004065990447998, + "learning_rate": 4.650438515284564e-05, + "loss": 4.7588, + "step": 28645 + }, + { + "epoch": 0.17036587686744695, + "grad_norm": 1.6584961414337158, + "learning_rate": 4.650414692963041e-05, + "loss": 5.0345, + "step": 28646 + }, + { + "epoch": 0.17037182415072794, + "grad_norm": 1.6760051250457764, + "learning_rate": 4.650390869890831e-05, + "loss": 5.2614, + "step": 28647 + }, + { + "epoch": 0.17037777143400895, + "grad_norm": 1.538028597831726, + "learning_rate": 4.650367046067942e-05, + "loss": 5.3746, + "step": 28648 + }, + { + "epoch": 0.17038371871728994, + "grad_norm": 1.592532992362976, + "learning_rate": 4.650343221494381e-05, + "loss": 5.2738, + "step": 28649 + }, + { + "epoch": 0.17038966600057093, + "grad_norm": 1.472048044204712, + "learning_rate": 4.650319396170158e-05, + "loss": 5.1399, + "step": 28650 + }, + { + "epoch": 0.17039561328385194, + "grad_norm": 1.570019245147705, + "learning_rate": 4.650295570095281e-05, + "loss": 5.199, + "step": 28651 + }, + { + "epoch": 0.17040156056713293, + "grad_norm": 1.82230806350708, + "learning_rate": 4.6502717432697577e-05, + "loss": 5.1108, + "step": 28652 + }, + { + "epoch": 0.17040750785041392, + "grad_norm": 1.9128144979476929, + "learning_rate": 4.650247915693596e-05, + "loss": 5.1805, + "step": 28653 + }, + { + "epoch": 0.17041345513369494, + "grad_norm": 1.683923363685608, + "learning_rate": 4.650224087366806e-05, + "loss": 5.203, + "step": 28654 + }, + { + "epoch": 0.17041940241697592, + "grad_norm": 1.5329160690307617, + "learning_rate": 4.6502002582893944e-05, + "loss": 4.8658, + "step": 28655 + }, + { + "epoch": 0.1704253497002569, + "grad_norm": 2.3513686656951904, + "learning_rate": 4.65017642846137e-05, + "loss": 4.9593, + "step": 28656 + }, + { + "epoch": 0.17043129698353793, + "grad_norm": 1.7208911180496216, + "learning_rate": 4.650152597882742e-05, + "loss": 5.2315, + "step": 28657 + }, + { + "epoch": 0.17043724426681892, + "grad_norm": 1.7835557460784912, + "learning_rate": 4.650128766553518e-05, + "loss": 5.2212, + "step": 28658 + }, + { + "epoch": 0.1704431915500999, + "grad_norm": 2.004202365875244, + "learning_rate": 4.650104934473705e-05, + "loss": 4.8766, + "step": 28659 + }, + { + "epoch": 0.17044913883338092, + "grad_norm": 1.7374918460845947, + "learning_rate": 4.650081101643314e-05, + "loss": 5.3659, + "step": 28660 + }, + { + "epoch": 0.1704550861166619, + "grad_norm": 1.5580469369888306, + "learning_rate": 4.650057268062351e-05, + "loss": 5.012, + "step": 28661 + }, + { + "epoch": 0.1704610333999429, + "grad_norm": 1.7098673582077026, + "learning_rate": 4.650033433730826e-05, + "loss": 5.0506, + "step": 28662 + }, + { + "epoch": 0.1704669806832239, + "grad_norm": 1.7775324583053589, + "learning_rate": 4.6500095986487454e-05, + "loss": 5.3536, + "step": 28663 + }, + { + "epoch": 0.1704729279665049, + "grad_norm": 1.7413294315338135, + "learning_rate": 4.649985762816119e-05, + "loss": 5.2773, + "step": 28664 + }, + { + "epoch": 0.1704788752497859, + "grad_norm": 1.791043996810913, + "learning_rate": 4.649961926232955e-05, + "loss": 5.1409, + "step": 28665 + }, + { + "epoch": 0.1704848225330669, + "grad_norm": 1.8042404651641846, + "learning_rate": 4.649938088899262e-05, + "loss": 5.3099, + "step": 28666 + }, + { + "epoch": 0.1704907698163479, + "grad_norm": 2.329183340072632, + "learning_rate": 4.649914250815047e-05, + "loss": 4.631, + "step": 28667 + }, + { + "epoch": 0.17049671709962888, + "grad_norm": 2.9833004474639893, + "learning_rate": 4.64989041198032e-05, + "loss": 5.1604, + "step": 28668 + }, + { + "epoch": 0.1705026643829099, + "grad_norm": 3.150871992111206, + "learning_rate": 4.649866572395088e-05, + "loss": 5.0831, + "step": 28669 + }, + { + "epoch": 0.17050861166619088, + "grad_norm": 1.6283338069915771, + "learning_rate": 4.64984273205936e-05, + "loss": 5.1733, + "step": 28670 + }, + { + "epoch": 0.17051455894947187, + "grad_norm": 1.6267815828323364, + "learning_rate": 4.649818890973143e-05, + "loss": 5.3692, + "step": 28671 + }, + { + "epoch": 0.1705205062327529, + "grad_norm": 1.638006567955017, + "learning_rate": 4.649795049136448e-05, + "loss": 5.5058, + "step": 28672 + }, + { + "epoch": 0.17052645351603388, + "grad_norm": 1.605161428451538, + "learning_rate": 4.649771206549281e-05, + "loss": 4.9665, + "step": 28673 + }, + { + "epoch": 0.17053240079931486, + "grad_norm": 1.762798547744751, + "learning_rate": 4.649747363211652e-05, + "loss": 4.6831, + "step": 28674 + }, + { + "epoch": 0.17053834808259588, + "grad_norm": 2.23942494392395, + "learning_rate": 4.649723519123567e-05, + "loss": 4.6154, + "step": 28675 + }, + { + "epoch": 0.17054429536587687, + "grad_norm": 1.6567063331604004, + "learning_rate": 4.649699674285036e-05, + "loss": 5.0949, + "step": 28676 + }, + { + "epoch": 0.17055024264915786, + "grad_norm": 1.4644149541854858, + "learning_rate": 4.649675828696067e-05, + "loss": 5.5432, + "step": 28677 + }, + { + "epoch": 0.17055618993243887, + "grad_norm": 1.7737239599227905, + "learning_rate": 4.6496519823566695e-05, + "loss": 5.0056, + "step": 28678 + }, + { + "epoch": 0.17056213721571986, + "grad_norm": 2.3689754009246826, + "learning_rate": 4.64962813526685e-05, + "loss": 3.7473, + "step": 28679 + }, + { + "epoch": 0.17056808449900085, + "grad_norm": 2.3994569778442383, + "learning_rate": 4.649604287426618e-05, + "loss": 3.7447, + "step": 28680 + }, + { + "epoch": 0.17057403178228187, + "grad_norm": 2.2940452098846436, + "learning_rate": 4.64958043883598e-05, + "loss": 3.623, + "step": 28681 + }, + { + "epoch": 0.17057997906556285, + "grad_norm": 2.1584625244140625, + "learning_rate": 4.6495565894949466e-05, + "loss": 3.5711, + "step": 28682 + }, + { + "epoch": 0.17058592634884384, + "grad_norm": 1.7486004829406738, + "learning_rate": 4.649532739403526e-05, + "loss": 4.4838, + "step": 28683 + }, + { + "epoch": 0.17059187363212486, + "grad_norm": 1.8745564222335815, + "learning_rate": 4.6495088885617245e-05, + "loss": 4.6985, + "step": 28684 + }, + { + "epoch": 0.17059782091540585, + "grad_norm": 1.6774717569351196, + "learning_rate": 4.6494850369695517e-05, + "loss": 4.9845, + "step": 28685 + }, + { + "epoch": 0.17060376819868683, + "grad_norm": 1.6051801443099976, + "learning_rate": 4.649461184627017e-05, + "loss": 5.085, + "step": 28686 + }, + { + "epoch": 0.17060971548196785, + "grad_norm": 1.9558120965957642, + "learning_rate": 4.649437331534126e-05, + "loss": 5.7887, + "step": 28687 + }, + { + "epoch": 0.17061566276524884, + "grad_norm": 2.1222105026245117, + "learning_rate": 4.649413477690889e-05, + "loss": 3.9971, + "step": 28688 + }, + { + "epoch": 0.17062161004852983, + "grad_norm": 2.5469319820404053, + "learning_rate": 4.6493896230973147e-05, + "loss": 3.3402, + "step": 28689 + }, + { + "epoch": 0.17062755733181084, + "grad_norm": 1.747454285621643, + "learning_rate": 4.6493657677534107e-05, + "loss": 4.5433, + "step": 28690 + }, + { + "epoch": 0.17063350461509183, + "grad_norm": 2.327911138534546, + "learning_rate": 4.6493419116591845e-05, + "loss": 5.1279, + "step": 28691 + }, + { + "epoch": 0.17063945189837282, + "grad_norm": 1.96173894405365, + "learning_rate": 4.649318054814646e-05, + "loss": 4.6642, + "step": 28692 + }, + { + "epoch": 0.17064539918165383, + "grad_norm": 2.74940824508667, + "learning_rate": 4.6492941972198026e-05, + "loss": 4.9272, + "step": 28693 + }, + { + "epoch": 0.17065134646493482, + "grad_norm": 2.1249771118164062, + "learning_rate": 4.649270338874663e-05, + "loss": 4.8603, + "step": 28694 + }, + { + "epoch": 0.1706572937482158, + "grad_norm": 1.5566577911376953, + "learning_rate": 4.6492464797792344e-05, + "loss": 5.0004, + "step": 28695 + }, + { + "epoch": 0.1706632410314968, + "grad_norm": 1.5969873666763306, + "learning_rate": 4.649222619933527e-05, + "loss": 5.1347, + "step": 28696 + }, + { + "epoch": 0.17066918831477781, + "grad_norm": 1.894946813583374, + "learning_rate": 4.649198759337548e-05, + "loss": 5.1455, + "step": 28697 + }, + { + "epoch": 0.1706751355980588, + "grad_norm": 1.7214184999465942, + "learning_rate": 4.6491748979913056e-05, + "loss": 5.2916, + "step": 28698 + }, + { + "epoch": 0.1706810828813398, + "grad_norm": 1.8061472177505493, + "learning_rate": 4.649151035894809e-05, + "loss": 4.8581, + "step": 28699 + }, + { + "epoch": 0.1706870301646208, + "grad_norm": 2.3920493125915527, + "learning_rate": 4.649127173048066e-05, + "loss": 4.8851, + "step": 28700 + }, + { + "epoch": 0.1706929774479018, + "grad_norm": 1.7309520244598389, + "learning_rate": 4.649103309451084e-05, + "loss": 4.5377, + "step": 28701 + }, + { + "epoch": 0.17069892473118278, + "grad_norm": 1.757692813873291, + "learning_rate": 4.6490794451038725e-05, + "loss": 4.9765, + "step": 28702 + }, + { + "epoch": 0.1707048720144638, + "grad_norm": 2.2090845108032227, + "learning_rate": 4.64905558000644e-05, + "loss": 4.741, + "step": 28703 + }, + { + "epoch": 0.1707108192977448, + "grad_norm": 1.7464302778244019, + "learning_rate": 4.649031714158794e-05, + "loss": 4.9167, + "step": 28704 + }, + { + "epoch": 0.17071676658102578, + "grad_norm": 1.4639854431152344, + "learning_rate": 4.649007847560944e-05, + "loss": 5.1732, + "step": 28705 + }, + { + "epoch": 0.1707227138643068, + "grad_norm": 1.8633160591125488, + "learning_rate": 4.648983980212896e-05, + "loss": 4.3169, + "step": 28706 + }, + { + "epoch": 0.17072866114758778, + "grad_norm": 1.645669937133789, + "learning_rate": 4.648960112114662e-05, + "loss": 5.3615, + "step": 28707 + }, + { + "epoch": 0.17073460843086877, + "grad_norm": 1.802817702293396, + "learning_rate": 4.648936243266246e-05, + "loss": 4.6081, + "step": 28708 + }, + { + "epoch": 0.17074055571414978, + "grad_norm": 1.6780096292495728, + "learning_rate": 4.648912373667661e-05, + "loss": 4.8164, + "step": 28709 + }, + { + "epoch": 0.17074650299743077, + "grad_norm": 1.6830222606658936, + "learning_rate": 4.648888503318911e-05, + "loss": 5.1217, + "step": 28710 + }, + { + "epoch": 0.17075245028071176, + "grad_norm": 1.9091911315917969, + "learning_rate": 4.648864632220007e-05, + "loss": 4.6718, + "step": 28711 + }, + { + "epoch": 0.17075839756399278, + "grad_norm": 1.7040106058120728, + "learning_rate": 4.6488407603709566e-05, + "loss": 5.3872, + "step": 28712 + }, + { + "epoch": 0.17076434484727376, + "grad_norm": 1.5387471914291382, + "learning_rate": 4.648816887771768e-05, + "loss": 4.999, + "step": 28713 + }, + { + "epoch": 0.17077029213055475, + "grad_norm": 1.6032272577285767, + "learning_rate": 4.648793014422449e-05, + "loss": 5.3291, + "step": 28714 + }, + { + "epoch": 0.17077623941383577, + "grad_norm": 2.1550817489624023, + "learning_rate": 4.6487691403230096e-05, + "loss": 4.4169, + "step": 28715 + }, + { + "epoch": 0.17078218669711676, + "grad_norm": 1.632123589515686, + "learning_rate": 4.648745265473457e-05, + "loss": 4.8016, + "step": 28716 + }, + { + "epoch": 0.17078813398039774, + "grad_norm": 1.9822715520858765, + "learning_rate": 4.6487213898737986e-05, + "loss": 4.8404, + "step": 28717 + }, + { + "epoch": 0.17079408126367876, + "grad_norm": 1.4587271213531494, + "learning_rate": 4.648697513524044e-05, + "loss": 5.195, + "step": 28718 + }, + { + "epoch": 0.17080002854695975, + "grad_norm": 1.4583262205123901, + "learning_rate": 4.648673636424202e-05, + "loss": 5.331, + "step": 28719 + }, + { + "epoch": 0.17080597583024074, + "grad_norm": 1.508599877357483, + "learning_rate": 4.648649758574279e-05, + "loss": 5.3316, + "step": 28720 + }, + { + "epoch": 0.17081192311352175, + "grad_norm": 1.5801657438278198, + "learning_rate": 4.648625879974287e-05, + "loss": 4.9691, + "step": 28721 + }, + { + "epoch": 0.17081787039680274, + "grad_norm": 1.383544921875, + "learning_rate": 4.648602000624229e-05, + "loss": 4.8747, + "step": 28722 + }, + { + "epoch": 0.17082381768008373, + "grad_norm": 1.6122874021530151, + "learning_rate": 4.648578120524118e-05, + "loss": 4.8057, + "step": 28723 + }, + { + "epoch": 0.17082976496336474, + "grad_norm": 1.7532804012298584, + "learning_rate": 4.64855423967396e-05, + "loss": 4.7074, + "step": 28724 + }, + { + "epoch": 0.17083571224664573, + "grad_norm": 1.440300703048706, + "learning_rate": 4.648530358073764e-05, + "loss": 4.6827, + "step": 28725 + }, + { + "epoch": 0.17084165952992672, + "grad_norm": 1.4043488502502441, + "learning_rate": 4.648506475723539e-05, + "loss": 5.1083, + "step": 28726 + }, + { + "epoch": 0.17084760681320774, + "grad_norm": 2.273939609527588, + "learning_rate": 4.6484825926232914e-05, + "loss": 4.3264, + "step": 28727 + }, + { + "epoch": 0.17085355409648872, + "grad_norm": 2.029352903366089, + "learning_rate": 4.6484587087730316e-05, + "loss": 4.2814, + "step": 28728 + }, + { + "epoch": 0.1708595013797697, + "grad_norm": 1.6527879238128662, + "learning_rate": 4.648434824172767e-05, + "loss": 4.6651, + "step": 28729 + }, + { + "epoch": 0.17086544866305073, + "grad_norm": 1.6313071250915527, + "learning_rate": 4.648410938822505e-05, + "loss": 5.202, + "step": 28730 + }, + { + "epoch": 0.17087139594633172, + "grad_norm": 1.706916332244873, + "learning_rate": 4.648387052722256e-05, + "loss": 5.1041, + "step": 28731 + }, + { + "epoch": 0.1708773432296127, + "grad_norm": 1.8511303663253784, + "learning_rate": 4.6483631658720265e-05, + "loss": 4.7474, + "step": 28732 + }, + { + "epoch": 0.17088329051289372, + "grad_norm": 2.102651357650757, + "learning_rate": 4.648339278271826e-05, + "loss": 4.7116, + "step": 28733 + }, + { + "epoch": 0.1708892377961747, + "grad_norm": 1.5868231058120728, + "learning_rate": 4.648315389921662e-05, + "loss": 4.8723, + "step": 28734 + }, + { + "epoch": 0.1708951850794557, + "grad_norm": 1.5616002082824707, + "learning_rate": 4.648291500821544e-05, + "loss": 4.7078, + "step": 28735 + }, + { + "epoch": 0.1709011323627367, + "grad_norm": 1.8076444864273071, + "learning_rate": 4.6482676109714804e-05, + "loss": 4.0856, + "step": 28736 + }, + { + "epoch": 0.1709070796460177, + "grad_norm": 2.5661611557006836, + "learning_rate": 4.6482437203714766e-05, + "loss": 4.0065, + "step": 28737 + }, + { + "epoch": 0.1709130269292987, + "grad_norm": 1.9630448818206787, + "learning_rate": 4.648219829021545e-05, + "loss": 4.3436, + "step": 28738 + }, + { + "epoch": 0.1709189742125797, + "grad_norm": 1.588693618774414, + "learning_rate": 4.648195936921691e-05, + "loss": 4.8528, + "step": 28739 + }, + { + "epoch": 0.1709249214958607, + "grad_norm": 1.6260273456573486, + "learning_rate": 4.6481720440719246e-05, + "loss": 4.9007, + "step": 28740 + }, + { + "epoch": 0.17093086877914168, + "grad_norm": 1.4332720041275024, + "learning_rate": 4.648148150472253e-05, + "loss": 4.6039, + "step": 28741 + }, + { + "epoch": 0.1709368160624227, + "grad_norm": 1.5845040082931519, + "learning_rate": 4.648124256122686e-05, + "loss": 4.6129, + "step": 28742 + }, + { + "epoch": 0.17094276334570369, + "grad_norm": 1.9368457794189453, + "learning_rate": 4.6481003610232296e-05, + "loss": 4.4027, + "step": 28743 + }, + { + "epoch": 0.17094871062898467, + "grad_norm": 2.4336676597595215, + "learning_rate": 4.648076465173894e-05, + "loss": 3.9717, + "step": 28744 + }, + { + "epoch": 0.1709546579122657, + "grad_norm": 2.120758056640625, + "learning_rate": 4.648052568574688e-05, + "loss": 3.4959, + "step": 28745 + }, + { + "epoch": 0.17096060519554668, + "grad_norm": 2.1304919719696045, + "learning_rate": 4.648028671225618e-05, + "loss": 3.6002, + "step": 28746 + }, + { + "epoch": 0.17096655247882767, + "grad_norm": 2.2495477199554443, + "learning_rate": 4.648004773126694e-05, + "loss": 3.8202, + "step": 28747 + }, + { + "epoch": 0.17097249976210868, + "grad_norm": 2.0952799320220947, + "learning_rate": 4.647980874277924e-05, + "loss": 4.0671, + "step": 28748 + }, + { + "epoch": 0.17097844704538967, + "grad_norm": 2.260267972946167, + "learning_rate": 4.6479569746793154e-05, + "loss": 4.004, + "step": 28749 + }, + { + "epoch": 0.17098439432867066, + "grad_norm": 1.6694860458374023, + "learning_rate": 4.647933074330878e-05, + "loss": 4.6784, + "step": 28750 + }, + { + "epoch": 0.17099034161195167, + "grad_norm": 1.8118653297424316, + "learning_rate": 4.647909173232618e-05, + "loss": 4.4819, + "step": 28751 + }, + { + "epoch": 0.17099628889523266, + "grad_norm": 1.6766449213027954, + "learning_rate": 4.647885271384546e-05, + "loss": 4.5391, + "step": 28752 + }, + { + "epoch": 0.17100223617851365, + "grad_norm": 2.1435959339141846, + "learning_rate": 4.6478613687866696e-05, + "loss": 3.5559, + "step": 28753 + }, + { + "epoch": 0.17100818346179464, + "grad_norm": 2.2521913051605225, + "learning_rate": 4.647837465438997e-05, + "loss": 3.434, + "step": 28754 + }, + { + "epoch": 0.17101413074507565, + "grad_norm": 2.012451171875, + "learning_rate": 4.6478135613415366e-05, + "loss": 3.7475, + "step": 28755 + }, + { + "epoch": 0.17102007802835664, + "grad_norm": 2.383465528488159, + "learning_rate": 4.6477896564942956e-05, + "loss": 4.2333, + "step": 28756 + }, + { + "epoch": 0.17102602531163763, + "grad_norm": 2.0753815174102783, + "learning_rate": 4.647765750897284e-05, + "loss": 3.9532, + "step": 28757 + }, + { + "epoch": 0.17103197259491865, + "grad_norm": 2.0559349060058594, + "learning_rate": 4.64774184455051e-05, + "loss": 3.8132, + "step": 28758 + }, + { + "epoch": 0.17103791987819963, + "grad_norm": 2.2562434673309326, + "learning_rate": 4.6477179374539814e-05, + "loss": 3.9445, + "step": 28759 + }, + { + "epoch": 0.17104386716148062, + "grad_norm": 1.9799115657806396, + "learning_rate": 4.6476940296077065e-05, + "loss": 4.0676, + "step": 28760 + }, + { + "epoch": 0.17104981444476164, + "grad_norm": 2.034501552581787, + "learning_rate": 4.6476701210116935e-05, + "loss": 3.5055, + "step": 28761 + }, + { + "epoch": 0.17105576172804263, + "grad_norm": 2.2014403343200684, + "learning_rate": 4.6476462116659514e-05, + "loss": 3.7419, + "step": 28762 + }, + { + "epoch": 0.17106170901132361, + "grad_norm": 2.271733522415161, + "learning_rate": 4.6476223015704875e-05, + "loss": 3.5206, + "step": 28763 + }, + { + "epoch": 0.17106765629460463, + "grad_norm": 2.144587278366089, + "learning_rate": 4.647598390725312e-05, + "loss": 3.4963, + "step": 28764 + }, + { + "epoch": 0.17107360357788562, + "grad_norm": 1.8896453380584717, + "learning_rate": 4.647574479130432e-05, + "loss": 3.6917, + "step": 28765 + }, + { + "epoch": 0.1710795508611666, + "grad_norm": 2.5320651531219482, + "learning_rate": 4.6475505667858556e-05, + "loss": 3.4057, + "step": 28766 + }, + { + "epoch": 0.17108549814444762, + "grad_norm": 2.5660650730133057, + "learning_rate": 4.647526653691591e-05, + "loss": 3.5343, + "step": 28767 + }, + { + "epoch": 0.1710914454277286, + "grad_norm": 2.016521453857422, + "learning_rate": 4.647502739847647e-05, + "loss": 5.0209, + "step": 28768 + }, + { + "epoch": 0.1710973927110096, + "grad_norm": 2.098594903945923, + "learning_rate": 4.6474788252540323e-05, + "loss": 3.4916, + "step": 28769 + }, + { + "epoch": 0.17110333999429062, + "grad_norm": 2.502556562423706, + "learning_rate": 4.6474549099107555e-05, + "loss": 3.6106, + "step": 28770 + }, + { + "epoch": 0.1711092872775716, + "grad_norm": 2.3364086151123047, + "learning_rate": 4.647430993817824e-05, + "loss": 3.6718, + "step": 28771 + }, + { + "epoch": 0.1711152345608526, + "grad_norm": 2.453624963760376, + "learning_rate": 4.647407076975247e-05, + "loss": 4.0256, + "step": 28772 + }, + { + "epoch": 0.1711211818441336, + "grad_norm": 2.250152826309204, + "learning_rate": 4.647383159383031e-05, + "loss": 3.8149, + "step": 28773 + }, + { + "epoch": 0.1711271291274146, + "grad_norm": 2.2971277236938477, + "learning_rate": 4.6473592410411864e-05, + "loss": 4.0557, + "step": 28774 + }, + { + "epoch": 0.17113307641069558, + "grad_norm": 2.2991559505462646, + "learning_rate": 4.647335321949721e-05, + "loss": 3.9136, + "step": 28775 + }, + { + "epoch": 0.1711390236939766, + "grad_norm": 2.220536708831787, + "learning_rate": 4.647311402108643e-05, + "loss": 4.0714, + "step": 28776 + }, + { + "epoch": 0.1711449709772576, + "grad_norm": 2.1241915225982666, + "learning_rate": 4.647287481517961e-05, + "loss": 3.5843, + "step": 28777 + }, + { + "epoch": 0.17115091826053858, + "grad_norm": 2.195129632949829, + "learning_rate": 4.647263560177683e-05, + "loss": 3.5294, + "step": 28778 + }, + { + "epoch": 0.1711568655438196, + "grad_norm": 2.3440191745758057, + "learning_rate": 4.647239638087817e-05, + "loss": 3.6608, + "step": 28779 + }, + { + "epoch": 0.17116281282710058, + "grad_norm": 2.478482246398926, + "learning_rate": 4.6472157152483726e-05, + "loss": 3.8389, + "step": 28780 + }, + { + "epoch": 0.17116876011038157, + "grad_norm": 2.488262414932251, + "learning_rate": 4.647191791659357e-05, + "loss": 3.3664, + "step": 28781 + }, + { + "epoch": 0.17117470739366258, + "grad_norm": 1.9902031421661377, + "learning_rate": 4.6471678673207784e-05, + "loss": 3.4656, + "step": 28782 + }, + { + "epoch": 0.17118065467694357, + "grad_norm": 1.7979692220687866, + "learning_rate": 4.647143942232647e-05, + "loss": 4.1077, + "step": 28783 + }, + { + "epoch": 0.17118660196022456, + "grad_norm": 2.0550832748413086, + "learning_rate": 4.647120016394969e-05, + "loss": 5.0827, + "step": 28784 + }, + { + "epoch": 0.17119254924350558, + "grad_norm": 2.58035945892334, + "learning_rate": 4.647096089807753e-05, + "loss": 3.3431, + "step": 28785 + }, + { + "epoch": 0.17119849652678656, + "grad_norm": 2.9299840927124023, + "learning_rate": 4.647072162471009e-05, + "loss": 4.3467, + "step": 28786 + }, + { + "epoch": 0.17120444381006755, + "grad_norm": 2.9246139526367188, + "learning_rate": 4.6470482343847434e-05, + "loss": 4.5002, + "step": 28787 + }, + { + "epoch": 0.17121039109334857, + "grad_norm": 2.434800148010254, + "learning_rate": 4.647024305548966e-05, + "loss": 4.39, + "step": 28788 + }, + { + "epoch": 0.17121633837662956, + "grad_norm": 2.0700294971466064, + "learning_rate": 4.647000375963685e-05, + "loss": 3.6275, + "step": 28789 + }, + { + "epoch": 0.17122228565991054, + "grad_norm": 2.0739026069641113, + "learning_rate": 4.6469764456289075e-05, + "loss": 3.294, + "step": 28790 + }, + { + "epoch": 0.17122823294319156, + "grad_norm": 2.158195972442627, + "learning_rate": 4.646952514544643e-05, + "loss": 3.0345, + "step": 28791 + }, + { + "epoch": 0.17123418022647255, + "grad_norm": 2.25756573677063, + "learning_rate": 4.6469285827109e-05, + "loss": 3.4395, + "step": 28792 + }, + { + "epoch": 0.17124012750975354, + "grad_norm": 1.756030559539795, + "learning_rate": 4.646904650127686e-05, + "loss": 4.57, + "step": 28793 + }, + { + "epoch": 0.17124607479303455, + "grad_norm": 1.7527079582214355, + "learning_rate": 4.6468807167950096e-05, + "loss": 4.8592, + "step": 28794 + }, + { + "epoch": 0.17125202207631554, + "grad_norm": 2.0758533477783203, + "learning_rate": 4.646856782712879e-05, + "loss": 3.6941, + "step": 28795 + }, + { + "epoch": 0.17125796935959653, + "grad_norm": 1.977253794670105, + "learning_rate": 4.646832847881304e-05, + "loss": 3.3686, + "step": 28796 + }, + { + "epoch": 0.17126391664287755, + "grad_norm": 2.0132908821105957, + "learning_rate": 4.646808912300291e-05, + "loss": 3.3937, + "step": 28797 + }, + { + "epoch": 0.17126986392615853, + "grad_norm": 1.8328338861465454, + "learning_rate": 4.646784975969849e-05, + "loss": 3.4359, + "step": 28798 + }, + { + "epoch": 0.17127581120943952, + "grad_norm": 1.7316343784332275, + "learning_rate": 4.646761038889987e-05, + "loss": 4.062, + "step": 28799 + }, + { + "epoch": 0.17128175849272054, + "grad_norm": 1.98564875125885, + "learning_rate": 4.646737101060713e-05, + "loss": 3.9671, + "step": 28800 + }, + { + "epoch": 0.17128770577600153, + "grad_norm": 1.4254114627838135, + "learning_rate": 4.646713162482035e-05, + "loss": 5.6623, + "step": 28801 + }, + { + "epoch": 0.1712936530592825, + "grad_norm": 1.7182563543319702, + "learning_rate": 4.646689223153962e-05, + "loss": 3.7951, + "step": 28802 + }, + { + "epoch": 0.17129960034256353, + "grad_norm": 1.9816060066223145, + "learning_rate": 4.646665283076502e-05, + "loss": 3.1926, + "step": 28803 + }, + { + "epoch": 0.17130554762584452, + "grad_norm": 1.9026448726654053, + "learning_rate": 4.646641342249663e-05, + "loss": 3.4481, + "step": 28804 + }, + { + "epoch": 0.1713114949091255, + "grad_norm": 1.9280551671981812, + "learning_rate": 4.646617400673453e-05, + "loss": 3.7474, + "step": 28805 + }, + { + "epoch": 0.17131744219240652, + "grad_norm": 1.9468990564346313, + "learning_rate": 4.646593458347882e-05, + "loss": 3.6522, + "step": 28806 + }, + { + "epoch": 0.1713233894756875, + "grad_norm": 1.8785784244537354, + "learning_rate": 4.646569515272957e-05, + "loss": 4.4277, + "step": 28807 + }, + { + "epoch": 0.1713293367589685, + "grad_norm": 2.5380280017852783, + "learning_rate": 4.6465455714486875e-05, + "loss": 4.7558, + "step": 28808 + }, + { + "epoch": 0.1713352840422495, + "grad_norm": 2.311422824859619, + "learning_rate": 4.64652162687508e-05, + "loss": 4.5887, + "step": 28809 + }, + { + "epoch": 0.1713412313255305, + "grad_norm": 2.215386390686035, + "learning_rate": 4.646497681552144e-05, + "loss": 4.6318, + "step": 28810 + }, + { + "epoch": 0.1713471786088115, + "grad_norm": 2.1793322563171387, + "learning_rate": 4.646473735479889e-05, + "loss": 4.8652, + "step": 28811 + }, + { + "epoch": 0.1713531258920925, + "grad_norm": 1.6395008563995361, + "learning_rate": 4.646449788658321e-05, + "loss": 5.1602, + "step": 28812 + }, + { + "epoch": 0.1713590731753735, + "grad_norm": 1.781542181968689, + "learning_rate": 4.646425841087451e-05, + "loss": 5.5992, + "step": 28813 + }, + { + "epoch": 0.17136502045865448, + "grad_norm": 1.7979416847229004, + "learning_rate": 4.6464018927672846e-05, + "loss": 5.4619, + "step": 28814 + }, + { + "epoch": 0.17137096774193547, + "grad_norm": 1.5196144580841064, + "learning_rate": 4.646377943697832e-05, + "loss": 5.5668, + "step": 28815 + }, + { + "epoch": 0.1713769150252165, + "grad_norm": 1.849569320678711, + "learning_rate": 4.6463539938791e-05, + "loss": 5.2762, + "step": 28816 + }, + { + "epoch": 0.17138286230849747, + "grad_norm": 2.4651362895965576, + "learning_rate": 4.6463300433111e-05, + "loss": 4.2121, + "step": 28817 + }, + { + "epoch": 0.17138880959177846, + "grad_norm": 2.2481956481933594, + "learning_rate": 4.646306091993837e-05, + "loss": 4.2369, + "step": 28818 + }, + { + "epoch": 0.17139475687505948, + "grad_norm": 1.5985668897628784, + "learning_rate": 4.646282139927321e-05, + "loss": 5.0238, + "step": 28819 + }, + { + "epoch": 0.17140070415834047, + "grad_norm": 1.5861318111419678, + "learning_rate": 4.64625818711156e-05, + "loss": 4.6181, + "step": 28820 + }, + { + "epoch": 0.17140665144162145, + "grad_norm": 1.5382401943206787, + "learning_rate": 4.646234233546562e-05, + "loss": 4.9682, + "step": 28821 + }, + { + "epoch": 0.17141259872490247, + "grad_norm": 1.604730248451233, + "learning_rate": 4.646210279232337e-05, + "loss": 5.2491, + "step": 28822 + }, + { + "epoch": 0.17141854600818346, + "grad_norm": 1.83149254322052, + "learning_rate": 4.6461863241688914e-05, + "loss": 5.514, + "step": 28823 + }, + { + "epoch": 0.17142449329146445, + "grad_norm": 2.151071786880493, + "learning_rate": 4.6461623683562336e-05, + "loss": 4.6684, + "step": 28824 + }, + { + "epoch": 0.17143044057474546, + "grad_norm": 1.934921145439148, + "learning_rate": 4.646138411794374e-05, + "loss": 4.5529, + "step": 28825 + }, + { + "epoch": 0.17143638785802645, + "grad_norm": 3.118504047393799, + "learning_rate": 4.646114454483319e-05, + "loss": 3.8805, + "step": 28826 + }, + { + "epoch": 0.17144233514130744, + "grad_norm": 2.784353733062744, + "learning_rate": 4.6460904964230776e-05, + "loss": 3.7983, + "step": 28827 + }, + { + "epoch": 0.17144828242458846, + "grad_norm": 2.2608816623687744, + "learning_rate": 4.6460665376136586e-05, + "loss": 4.0043, + "step": 28828 + }, + { + "epoch": 0.17145422970786944, + "grad_norm": 2.0400445461273193, + "learning_rate": 4.6460425780550695e-05, + "loss": 4.3601, + "step": 28829 + }, + { + "epoch": 0.17146017699115043, + "grad_norm": 1.7697999477386475, + "learning_rate": 4.64601861774732e-05, + "loss": 5.0038, + "step": 28830 + }, + { + "epoch": 0.17146612427443145, + "grad_norm": 1.916419267654419, + "learning_rate": 4.645994656690417e-05, + "loss": 3.8579, + "step": 28831 + }, + { + "epoch": 0.17147207155771244, + "grad_norm": 1.8474862575531006, + "learning_rate": 4.6459706948843687e-05, + "loss": 4.528, + "step": 28832 + }, + { + "epoch": 0.17147801884099342, + "grad_norm": 1.532090425491333, + "learning_rate": 4.645946732329185e-05, + "loss": 5.7598, + "step": 28833 + }, + { + "epoch": 0.17148396612427444, + "grad_norm": 1.4666064977645874, + "learning_rate": 4.645922769024873e-05, + "loss": 5.3868, + "step": 28834 + }, + { + "epoch": 0.17148991340755543, + "grad_norm": 1.5077399015426636, + "learning_rate": 4.645898804971442e-05, + "loss": 5.1645, + "step": 28835 + }, + { + "epoch": 0.17149586069083642, + "grad_norm": 1.5031183958053589, + "learning_rate": 4.6458748401689e-05, + "loss": 4.6318, + "step": 28836 + }, + { + "epoch": 0.17150180797411743, + "grad_norm": 1.9876207113265991, + "learning_rate": 4.6458508746172544e-05, + "loss": 3.7609, + "step": 28837 + }, + { + "epoch": 0.17150775525739842, + "grad_norm": 1.9552377462387085, + "learning_rate": 4.6458269083165155e-05, + "loss": 3.7297, + "step": 28838 + }, + { + "epoch": 0.1715137025406794, + "grad_norm": 1.7688027620315552, + "learning_rate": 4.64580294126669e-05, + "loss": 4.2031, + "step": 28839 + }, + { + "epoch": 0.17151964982396042, + "grad_norm": 1.7358896732330322, + "learning_rate": 4.645778973467787e-05, + "loss": 5.3203, + "step": 28840 + }, + { + "epoch": 0.1715255971072414, + "grad_norm": 1.6685024499893188, + "learning_rate": 4.645755004919814e-05, + "loss": 4.1383, + "step": 28841 + }, + { + "epoch": 0.1715315443905224, + "grad_norm": 1.7474262714385986, + "learning_rate": 4.645731035622781e-05, + "loss": 4.3956, + "step": 28842 + }, + { + "epoch": 0.17153749167380342, + "grad_norm": 2.3153438568115234, + "learning_rate": 4.6457070655766956e-05, + "loss": 3.6617, + "step": 28843 + }, + { + "epoch": 0.1715434389570844, + "grad_norm": 1.6651357412338257, + "learning_rate": 4.645683094781565e-05, + "loss": 3.7946, + "step": 28844 + }, + { + "epoch": 0.1715493862403654, + "grad_norm": 1.8230834007263184, + "learning_rate": 4.645659123237399e-05, + "loss": 3.6286, + "step": 28845 + }, + { + "epoch": 0.1715553335236464, + "grad_norm": 1.724862813949585, + "learning_rate": 4.645635150944206e-05, + "loss": 3.8681, + "step": 28846 + }, + { + "epoch": 0.1715612808069274, + "grad_norm": 1.7765378952026367, + "learning_rate": 4.645611177901994e-05, + "loss": 3.9172, + "step": 28847 + }, + { + "epoch": 0.17156722809020838, + "grad_norm": 1.7206759452819824, + "learning_rate": 4.645587204110771e-05, + "loss": 3.8603, + "step": 28848 + }, + { + "epoch": 0.1715731753734894, + "grad_norm": 1.9421840906143188, + "learning_rate": 4.645563229570546e-05, + "loss": 3.5207, + "step": 28849 + }, + { + "epoch": 0.1715791226567704, + "grad_norm": 1.9873075485229492, + "learning_rate": 4.645539254281327e-05, + "loss": 4.0805, + "step": 28850 + }, + { + "epoch": 0.17158506994005138, + "grad_norm": 1.7919063568115234, + "learning_rate": 4.645515278243122e-05, + "loss": 4.1832, + "step": 28851 + }, + { + "epoch": 0.1715910172233324, + "grad_norm": 1.6959470510482788, + "learning_rate": 4.6454913014559395e-05, + "loss": 4.135, + "step": 28852 + }, + { + "epoch": 0.17159696450661338, + "grad_norm": 2.2556352615356445, + "learning_rate": 4.645467323919789e-05, + "loss": 3.9897, + "step": 28853 + }, + { + "epoch": 0.17160291178989437, + "grad_norm": 2.394732713699341, + "learning_rate": 4.645443345634678e-05, + "loss": 4.0581, + "step": 28854 + }, + { + "epoch": 0.17160885907317538, + "grad_norm": 1.7620495557785034, + "learning_rate": 4.6454193666006144e-05, + "loss": 3.6301, + "step": 28855 + }, + { + "epoch": 0.17161480635645637, + "grad_norm": 2.046990394592285, + "learning_rate": 4.645395386817607e-05, + "loss": 3.6809, + "step": 28856 + }, + { + "epoch": 0.17162075363973736, + "grad_norm": 1.8854444026947021, + "learning_rate": 4.6453714062856645e-05, + "loss": 3.8665, + "step": 28857 + }, + { + "epoch": 0.17162670092301838, + "grad_norm": 1.952010989189148, + "learning_rate": 4.645347425004795e-05, + "loss": 3.9584, + "step": 28858 + }, + { + "epoch": 0.17163264820629937, + "grad_norm": 2.7259037494659424, + "learning_rate": 4.645323442975007e-05, + "loss": 4.1483, + "step": 28859 + }, + { + "epoch": 0.17163859548958035, + "grad_norm": 2.6531686782836914, + "learning_rate": 4.645299460196309e-05, + "loss": 4.2874, + "step": 28860 + }, + { + "epoch": 0.17164454277286137, + "grad_norm": 2.204883337020874, + "learning_rate": 4.645275476668708e-05, + "loss": 4.6409, + "step": 28861 + }, + { + "epoch": 0.17165049005614236, + "grad_norm": 1.8465254306793213, + "learning_rate": 4.645251492392214e-05, + "loss": 3.6078, + "step": 28862 + }, + { + "epoch": 0.17165643733942335, + "grad_norm": 1.6021015644073486, + "learning_rate": 4.645227507366835e-05, + "loss": 3.9142, + "step": 28863 + }, + { + "epoch": 0.17166238462270436, + "grad_norm": 1.9014915227890015, + "learning_rate": 4.645203521592579e-05, + "loss": 4.5439, + "step": 28864 + }, + { + "epoch": 0.17166833190598535, + "grad_norm": 2.176541805267334, + "learning_rate": 4.645179535069455e-05, + "loss": 4.0324, + "step": 28865 + }, + { + "epoch": 0.17167427918926634, + "grad_norm": 1.6138490438461304, + "learning_rate": 4.645155547797472e-05, + "loss": 5.2606, + "step": 28866 + }, + { + "epoch": 0.17168022647254735, + "grad_norm": 1.5091575384140015, + "learning_rate": 4.645131559776635e-05, + "loss": 4.8829, + "step": 28867 + }, + { + "epoch": 0.17168617375582834, + "grad_norm": 2.131401777267456, + "learning_rate": 4.645107571006957e-05, + "loss": 5.1779, + "step": 28868 + }, + { + "epoch": 0.17169212103910933, + "grad_norm": 1.871749758720398, + "learning_rate": 4.645083581488443e-05, + "loss": 4.8126, + "step": 28869 + }, + { + "epoch": 0.17169806832239035, + "grad_norm": 1.825909972190857, + "learning_rate": 4.6450595912211026e-05, + "loss": 4.4965, + "step": 28870 + }, + { + "epoch": 0.17170401560567133, + "grad_norm": 1.546570897102356, + "learning_rate": 4.645035600204944e-05, + "loss": 4.8261, + "step": 28871 + }, + { + "epoch": 0.17170996288895232, + "grad_norm": 1.6035295724868774, + "learning_rate": 4.6450116084399753e-05, + "loss": 4.8019, + "step": 28872 + }, + { + "epoch": 0.1717159101722333, + "grad_norm": 1.6257683038711548, + "learning_rate": 4.644987615926206e-05, + "loss": 4.6993, + "step": 28873 + }, + { + "epoch": 0.17172185745551433, + "grad_norm": 1.6006081104278564, + "learning_rate": 4.6449636226636427e-05, + "loss": 4.7575, + "step": 28874 + }, + { + "epoch": 0.17172780473879531, + "grad_norm": 1.9441580772399902, + "learning_rate": 4.6449396286522954e-05, + "loss": 4.4509, + "step": 28875 + }, + { + "epoch": 0.1717337520220763, + "grad_norm": 2.2355899810791016, + "learning_rate": 4.6449156338921716e-05, + "loss": 3.3666, + "step": 28876 + }, + { + "epoch": 0.17173969930535732, + "grad_norm": 1.863898754119873, + "learning_rate": 4.644891638383281e-05, + "loss": 3.4932, + "step": 28877 + }, + { + "epoch": 0.1717456465886383, + "grad_norm": 1.505720615386963, + "learning_rate": 4.64486764212563e-05, + "loss": 4.3892, + "step": 28878 + }, + { + "epoch": 0.1717515938719193, + "grad_norm": 2.197970151901245, + "learning_rate": 4.644843645119228e-05, + "loss": 4.5169, + "step": 28879 + }, + { + "epoch": 0.1717575411552003, + "grad_norm": 2.1132233142852783, + "learning_rate": 4.644819647364082e-05, + "loss": 3.9246, + "step": 28880 + }, + { + "epoch": 0.1717634884384813, + "grad_norm": 2.273036479949951, + "learning_rate": 4.644795648860203e-05, + "loss": 4.0134, + "step": 28881 + }, + { + "epoch": 0.1717694357217623, + "grad_norm": 2.3725993633270264, + "learning_rate": 4.6447716496075975e-05, + "loss": 3.9562, + "step": 28882 + }, + { + "epoch": 0.1717753830050433, + "grad_norm": 1.6925543546676636, + "learning_rate": 4.6447476496062745e-05, + "loss": 5.22, + "step": 28883 + }, + { + "epoch": 0.1717813302883243, + "grad_norm": 1.7216755151748657, + "learning_rate": 4.644723648856243e-05, + "loss": 4.2907, + "step": 28884 + }, + { + "epoch": 0.17178727757160528, + "grad_norm": 1.9896382093429565, + "learning_rate": 4.64469964735751e-05, + "loss": 3.4634, + "step": 28885 + }, + { + "epoch": 0.1717932248548863, + "grad_norm": 1.924800992012024, + "learning_rate": 4.6446756451100844e-05, + "loss": 3.627, + "step": 28886 + }, + { + "epoch": 0.17179917213816728, + "grad_norm": 2.1140928268432617, + "learning_rate": 4.644651642113975e-05, + "loss": 3.8234, + "step": 28887 + }, + { + "epoch": 0.17180511942144827, + "grad_norm": 1.9103795289993286, + "learning_rate": 4.644627638369189e-05, + "loss": 3.7129, + "step": 28888 + }, + { + "epoch": 0.1718110667047293, + "grad_norm": 2.002732038497925, + "learning_rate": 4.6446036338757363e-05, + "loss": 3.741, + "step": 28889 + }, + { + "epoch": 0.17181701398801028, + "grad_norm": 1.6863858699798584, + "learning_rate": 4.644579628633625e-05, + "loss": 4.3454, + "step": 28890 + }, + { + "epoch": 0.17182296127129126, + "grad_norm": 1.5118045806884766, + "learning_rate": 4.6445556226428625e-05, + "loss": 5.1573, + "step": 28891 + }, + { + "epoch": 0.17182890855457228, + "grad_norm": 2.336212158203125, + "learning_rate": 4.644531615903458e-05, + "loss": 3.7499, + "step": 28892 + }, + { + "epoch": 0.17183485583785327, + "grad_norm": 1.5706313848495483, + "learning_rate": 4.6445076084154195e-05, + "loss": 4.5392, + "step": 28893 + }, + { + "epoch": 0.17184080312113426, + "grad_norm": 1.9531837701797485, + "learning_rate": 4.644483600178756e-05, + "loss": 3.72, + "step": 28894 + }, + { + "epoch": 0.17184675040441527, + "grad_norm": 1.652535080909729, + "learning_rate": 4.644459591193475e-05, + "loss": 4.6445, + "step": 28895 + }, + { + "epoch": 0.17185269768769626, + "grad_norm": 1.856799840927124, + "learning_rate": 4.644435581459585e-05, + "loss": 3.6899, + "step": 28896 + }, + { + "epoch": 0.17185864497097725, + "grad_norm": 1.8917557001113892, + "learning_rate": 4.644411570977096e-05, + "loss": 3.7475, + "step": 28897 + }, + { + "epoch": 0.17186459225425826, + "grad_norm": 1.7784960269927979, + "learning_rate": 4.644387559746014e-05, + "loss": 3.6315, + "step": 28898 + }, + { + "epoch": 0.17187053953753925, + "grad_norm": 1.8464044332504272, + "learning_rate": 4.644363547766348e-05, + "loss": 4.0489, + "step": 28899 + }, + { + "epoch": 0.17187648682082024, + "grad_norm": 1.8629194498062134, + "learning_rate": 4.6443395350381084e-05, + "loss": 3.755, + "step": 28900 + }, + { + "epoch": 0.17188243410410126, + "grad_norm": 1.774107813835144, + "learning_rate": 4.644315521561301e-05, + "loss": 3.6051, + "step": 28901 + }, + { + "epoch": 0.17188838138738224, + "grad_norm": 1.6542714834213257, + "learning_rate": 4.644291507335935e-05, + "loss": 3.622, + "step": 28902 + }, + { + "epoch": 0.17189432867066323, + "grad_norm": 1.7980518341064453, + "learning_rate": 4.64426749236202e-05, + "loss": 3.7703, + "step": 28903 + }, + { + "epoch": 0.17190027595394425, + "grad_norm": 1.771996021270752, + "learning_rate": 4.644243476639563e-05, + "loss": 3.8511, + "step": 28904 + }, + { + "epoch": 0.17190622323722524, + "grad_norm": 1.9656630754470825, + "learning_rate": 4.644219460168572e-05, + "loss": 5.0433, + "step": 28905 + }, + { + "epoch": 0.17191217052050622, + "grad_norm": 1.7453303337097168, + "learning_rate": 4.6441954429490564e-05, + "loss": 4.3733, + "step": 28906 + }, + { + "epoch": 0.17191811780378724, + "grad_norm": 1.8528467416763306, + "learning_rate": 4.644171424981025e-05, + "loss": 3.7542, + "step": 28907 + }, + { + "epoch": 0.17192406508706823, + "grad_norm": 1.8916527032852173, + "learning_rate": 4.6441474062644844e-05, + "loss": 3.726, + "step": 28908 + }, + { + "epoch": 0.17193001237034922, + "grad_norm": 1.8707592487335205, + "learning_rate": 4.644123386799445e-05, + "loss": 3.77, + "step": 28909 + }, + { + "epoch": 0.17193595965363023, + "grad_norm": 1.7839124202728271, + "learning_rate": 4.644099366585914e-05, + "loss": 3.8036, + "step": 28910 + }, + { + "epoch": 0.17194190693691122, + "grad_norm": 2.1418814659118652, + "learning_rate": 4.6440753456239e-05, + "loss": 3.83, + "step": 28911 + }, + { + "epoch": 0.1719478542201922, + "grad_norm": 1.7159006595611572, + "learning_rate": 4.644051323913412e-05, + "loss": 3.6423, + "step": 28912 + }, + { + "epoch": 0.17195380150347322, + "grad_norm": 2.0046510696411133, + "learning_rate": 4.644027301454457e-05, + "loss": 3.6761, + "step": 28913 + }, + { + "epoch": 0.1719597487867542, + "grad_norm": 1.8171806335449219, + "learning_rate": 4.6440032782470446e-05, + "loss": 3.6621, + "step": 28914 + }, + { + "epoch": 0.1719656960700352, + "grad_norm": 1.813620924949646, + "learning_rate": 4.6439792542911826e-05, + "loss": 3.6249, + "step": 28915 + }, + { + "epoch": 0.17197164335331622, + "grad_norm": 1.8341031074523926, + "learning_rate": 4.64395522958688e-05, + "loss": 4.1758, + "step": 28916 + }, + { + "epoch": 0.1719775906365972, + "grad_norm": 2.3422980308532715, + "learning_rate": 4.643931204134144e-05, + "loss": 4.0642, + "step": 28917 + }, + { + "epoch": 0.1719835379198782, + "grad_norm": 2.2799339294433594, + "learning_rate": 4.643907177932985e-05, + "loss": 3.5248, + "step": 28918 + }, + { + "epoch": 0.1719894852031592, + "grad_norm": 2.3583829402923584, + "learning_rate": 4.643883150983409e-05, + "loss": 3.4972, + "step": 28919 + }, + { + "epoch": 0.1719954324864402, + "grad_norm": 2.667558431625366, + "learning_rate": 4.6438591232854265e-05, + "loss": 3.3926, + "step": 28920 + }, + { + "epoch": 0.17200137976972119, + "grad_norm": 2.2808713912963867, + "learning_rate": 4.6438350948390444e-05, + "loss": 3.2806, + "step": 28921 + }, + { + "epoch": 0.1720073270530022, + "grad_norm": 2.0563879013061523, + "learning_rate": 4.6438110656442713e-05, + "loss": 4.4691, + "step": 28922 + }, + { + "epoch": 0.1720132743362832, + "grad_norm": 1.8717663288116455, + "learning_rate": 4.643787035701116e-05, + "loss": 4.8282, + "step": 28923 + }, + { + "epoch": 0.17201922161956418, + "grad_norm": 2.2592520713806152, + "learning_rate": 4.643763005009588e-05, + "loss": 3.6768, + "step": 28924 + }, + { + "epoch": 0.1720251689028452, + "grad_norm": 2.2937116622924805, + "learning_rate": 4.643738973569693e-05, + "loss": 3.5727, + "step": 28925 + }, + { + "epoch": 0.17203111618612618, + "grad_norm": 2.3913755416870117, + "learning_rate": 4.643714941381441e-05, + "loss": 3.6011, + "step": 28926 + }, + { + "epoch": 0.17203706346940717, + "grad_norm": 2.3368663787841797, + "learning_rate": 4.643690908444841e-05, + "loss": 3.6664, + "step": 28927 + }, + { + "epoch": 0.17204301075268819, + "grad_norm": 1.4821833372116089, + "learning_rate": 4.6436668747599005e-05, + "loss": 5.495, + "step": 28928 + }, + { + "epoch": 0.17204895803596917, + "grad_norm": 1.8062217235565186, + "learning_rate": 4.643642840326627e-05, + "loss": 4.632, + "step": 28929 + }, + { + "epoch": 0.17205490531925016, + "grad_norm": 2.0992000102996826, + "learning_rate": 4.6436188051450314e-05, + "loss": 4.1965, + "step": 28930 + }, + { + "epoch": 0.17206085260253115, + "grad_norm": 1.6724803447723389, + "learning_rate": 4.6435947692151207e-05, + "loss": 5.1407, + "step": 28931 + }, + { + "epoch": 0.17206679988581217, + "grad_norm": 2.1039113998413086, + "learning_rate": 4.6435707325369024e-05, + "loss": 4.9189, + "step": 28932 + }, + { + "epoch": 0.17207274716909315, + "grad_norm": 1.7378982305526733, + "learning_rate": 4.6435466951103853e-05, + "loss": 5.0936, + "step": 28933 + }, + { + "epoch": 0.17207869445237414, + "grad_norm": 1.7237809896469116, + "learning_rate": 4.643522656935579e-05, + "loss": 5.175, + "step": 28934 + }, + { + "epoch": 0.17208464173565516, + "grad_norm": 1.5770435333251953, + "learning_rate": 4.6434986180124904e-05, + "loss": 5.0878, + "step": 28935 + }, + { + "epoch": 0.17209058901893615, + "grad_norm": 1.5708106756210327, + "learning_rate": 4.6434745783411294e-05, + "loss": 5.185, + "step": 28936 + }, + { + "epoch": 0.17209653630221713, + "grad_norm": 1.840494990348816, + "learning_rate": 4.643450537921503e-05, + "loss": 5.0293, + "step": 28937 + }, + { + "epoch": 0.17210248358549815, + "grad_norm": 1.9380584955215454, + "learning_rate": 4.64342649675362e-05, + "loss": 4.6983, + "step": 28938 + }, + { + "epoch": 0.17210843086877914, + "grad_norm": 1.6215778589248657, + "learning_rate": 4.64340245483749e-05, + "loss": 5.1622, + "step": 28939 + }, + { + "epoch": 0.17211437815206013, + "grad_norm": 2.1743335723876953, + "learning_rate": 4.6433784121731196e-05, + "loss": 4.2748, + "step": 28940 + }, + { + "epoch": 0.17212032543534114, + "grad_norm": 2.269792318344116, + "learning_rate": 4.643354368760517e-05, + "loss": 3.682, + "step": 28941 + }, + { + "epoch": 0.17212627271862213, + "grad_norm": 1.956141471862793, + "learning_rate": 4.643330324599693e-05, + "loss": 4.4543, + "step": 28942 + }, + { + "epoch": 0.17213222000190312, + "grad_norm": 1.5037137269973755, + "learning_rate": 4.6433062796906544e-05, + "loss": 5.4757, + "step": 28943 + }, + { + "epoch": 0.17213816728518413, + "grad_norm": 2.0092952251434326, + "learning_rate": 4.643282234033409e-05, + "loss": 3.9942, + "step": 28944 + }, + { + "epoch": 0.17214411456846512, + "grad_norm": 2.0670738220214844, + "learning_rate": 4.643258187627967e-05, + "loss": 3.2918, + "step": 28945 + }, + { + "epoch": 0.1721500618517461, + "grad_norm": 2.011192560195923, + "learning_rate": 4.643234140474334e-05, + "loss": 3.6096, + "step": 28946 + }, + { + "epoch": 0.17215600913502713, + "grad_norm": 2.221064805984497, + "learning_rate": 4.643210092572522e-05, + "loss": 4.0979, + "step": 28947 + }, + { + "epoch": 0.17216195641830812, + "grad_norm": 2.543839931488037, + "learning_rate": 4.643186043922536e-05, + "loss": 3.8645, + "step": 28948 + }, + { + "epoch": 0.1721679037015891, + "grad_norm": 1.8699936866760254, + "learning_rate": 4.6431619945243866e-05, + "loss": 3.8908, + "step": 28949 + }, + { + "epoch": 0.17217385098487012, + "grad_norm": 1.6603435277938843, + "learning_rate": 4.6431379443780815e-05, + "loss": 4.9394, + "step": 28950 + }, + { + "epoch": 0.1721797982681511, + "grad_norm": 2.0914523601531982, + "learning_rate": 4.643113893483629e-05, + "loss": 3.1328, + "step": 28951 + }, + { + "epoch": 0.1721857455514321, + "grad_norm": 2.469694137573242, + "learning_rate": 4.6430898418410373e-05, + "loss": 3.5583, + "step": 28952 + }, + { + "epoch": 0.1721916928347131, + "grad_norm": 2.5100619792938232, + "learning_rate": 4.643065789450315e-05, + "loss": 3.7234, + "step": 28953 + }, + { + "epoch": 0.1721976401179941, + "grad_norm": 2.565922737121582, + "learning_rate": 4.643041736311471e-05, + "loss": 3.3566, + "step": 28954 + }, + { + "epoch": 0.1722035874012751, + "grad_norm": 2.454882860183716, + "learning_rate": 4.643017682424513e-05, + "loss": 3.6576, + "step": 28955 + }, + { + "epoch": 0.1722095346845561, + "grad_norm": 1.6239404678344727, + "learning_rate": 4.64299362778945e-05, + "loss": 4.7344, + "step": 28956 + }, + { + "epoch": 0.1722154819678371, + "grad_norm": 1.6332730054855347, + "learning_rate": 4.6429695724062906e-05, + "loss": 4.9091, + "step": 28957 + }, + { + "epoch": 0.17222142925111808, + "grad_norm": 1.495293378829956, + "learning_rate": 4.642945516275041e-05, + "loss": 4.7336, + "step": 28958 + }, + { + "epoch": 0.1722273765343991, + "grad_norm": 1.531150460243225, + "learning_rate": 4.6429214593957125e-05, + "loss": 4.7503, + "step": 28959 + }, + { + "epoch": 0.17223332381768008, + "grad_norm": 1.2761198282241821, + "learning_rate": 4.642897401768312e-05, + "loss": 4.6507, + "step": 28960 + }, + { + "epoch": 0.17223927110096107, + "grad_norm": 1.366808295249939, + "learning_rate": 4.642873343392848e-05, + "loss": 4.7195, + "step": 28961 + }, + { + "epoch": 0.1722452183842421, + "grad_norm": 2.072298765182495, + "learning_rate": 4.6428492842693295e-05, + "loss": 4.3342, + "step": 28962 + }, + { + "epoch": 0.17225116566752308, + "grad_norm": 2.4667413234710693, + "learning_rate": 4.642825224397764e-05, + "loss": 3.3579, + "step": 28963 + }, + { + "epoch": 0.17225711295080406, + "grad_norm": 2.5743234157562256, + "learning_rate": 4.64280116377816e-05, + "loss": 3.559, + "step": 28964 + }, + { + "epoch": 0.17226306023408508, + "grad_norm": 2.4581592082977295, + "learning_rate": 4.6427771024105274e-05, + "loss": 3.6332, + "step": 28965 + }, + { + "epoch": 0.17226900751736607, + "grad_norm": 2.156362533569336, + "learning_rate": 4.642753040294873e-05, + "loss": 4.5459, + "step": 28966 + }, + { + "epoch": 0.17227495480064706, + "grad_norm": 2.2250757217407227, + "learning_rate": 4.642728977431205e-05, + "loss": 3.7909, + "step": 28967 + }, + { + "epoch": 0.17228090208392807, + "grad_norm": 2.06371808052063, + "learning_rate": 4.642704913819533e-05, + "loss": 5.3105, + "step": 28968 + }, + { + "epoch": 0.17228684936720906, + "grad_norm": 2.0080556869506836, + "learning_rate": 4.642680849459865e-05, + "loss": 5.2019, + "step": 28969 + }, + { + "epoch": 0.17229279665049005, + "grad_norm": 1.4533225297927856, + "learning_rate": 4.642656784352209e-05, + "loss": 5.3035, + "step": 28970 + }, + { + "epoch": 0.17229874393377106, + "grad_norm": 1.8252445459365845, + "learning_rate": 4.642632718496573e-05, + "loss": 4.5186, + "step": 28971 + }, + { + "epoch": 0.17230469121705205, + "grad_norm": 2.125659465789795, + "learning_rate": 4.642608651892967e-05, + "loss": 4.5968, + "step": 28972 + }, + { + "epoch": 0.17231063850033304, + "grad_norm": 1.7049205303192139, + "learning_rate": 4.6425845845413984e-05, + "loss": 5.2613, + "step": 28973 + }, + { + "epoch": 0.17231658578361406, + "grad_norm": 1.818495512008667, + "learning_rate": 4.642560516441875e-05, + "loss": 4.5706, + "step": 28974 + }, + { + "epoch": 0.17232253306689505, + "grad_norm": 1.4389350414276123, + "learning_rate": 4.6425364475944065e-05, + "loss": 5.3398, + "step": 28975 + }, + { + "epoch": 0.17232848035017603, + "grad_norm": 1.3256508111953735, + "learning_rate": 4.6425123779990005e-05, + "loss": 5.0498, + "step": 28976 + }, + { + "epoch": 0.17233442763345705, + "grad_norm": 1.3190927505493164, + "learning_rate": 4.642488307655666e-05, + "loss": 5.1833, + "step": 28977 + }, + { + "epoch": 0.17234037491673804, + "grad_norm": 1.6174373626708984, + "learning_rate": 4.64246423656441e-05, + "loss": 4.7737, + "step": 28978 + }, + { + "epoch": 0.17234632220001903, + "grad_norm": 1.3956570625305176, + "learning_rate": 4.6424401647252425e-05, + "loss": 5.0439, + "step": 28979 + }, + { + "epoch": 0.17235226948330004, + "grad_norm": 1.3336056470870972, + "learning_rate": 4.642416092138171e-05, + "loss": 5.526, + "step": 28980 + }, + { + "epoch": 0.17235821676658103, + "grad_norm": 1.9870527982711792, + "learning_rate": 4.642392018803204e-05, + "loss": 4.6277, + "step": 28981 + }, + { + "epoch": 0.17236416404986202, + "grad_norm": 1.9504579305648804, + "learning_rate": 4.64236794472035e-05, + "loss": 4.6113, + "step": 28982 + }, + { + "epoch": 0.17237011133314303, + "grad_norm": 1.7667953968048096, + "learning_rate": 4.642343869889618e-05, + "loss": 5.0653, + "step": 28983 + }, + { + "epoch": 0.17237605861642402, + "grad_norm": 1.6792775392532349, + "learning_rate": 4.642319794311016e-05, + "loss": 5.1556, + "step": 28984 + }, + { + "epoch": 0.172382005899705, + "grad_norm": 1.7935463190078735, + "learning_rate": 4.642295717984551e-05, + "loss": 4.4604, + "step": 28985 + }, + { + "epoch": 0.17238795318298603, + "grad_norm": 1.8608596324920654, + "learning_rate": 4.642271640910235e-05, + "loss": 5.1865, + "step": 28986 + }, + { + "epoch": 0.172393900466267, + "grad_norm": 1.7945232391357422, + "learning_rate": 4.642247563088073e-05, + "loss": 4.8413, + "step": 28987 + }, + { + "epoch": 0.172399847749548, + "grad_norm": 1.6362812519073486, + "learning_rate": 4.6422234845180734e-05, + "loss": 5.4072, + "step": 28988 + }, + { + "epoch": 0.172405795032829, + "grad_norm": 1.7283893823623657, + "learning_rate": 4.642199405200247e-05, + "loss": 5.2463, + "step": 28989 + }, + { + "epoch": 0.17241174231611, + "grad_norm": 2.589603900909424, + "learning_rate": 4.6421753251346004e-05, + "loss": 4.0614, + "step": 28990 + }, + { + "epoch": 0.172417689599391, + "grad_norm": 1.785037875175476, + "learning_rate": 4.642151244321143e-05, + "loss": 4.7127, + "step": 28991 + }, + { + "epoch": 0.17242363688267198, + "grad_norm": 1.5093384981155396, + "learning_rate": 4.6421271627598826e-05, + "loss": 5.2746, + "step": 28992 + }, + { + "epoch": 0.172429584165953, + "grad_norm": 1.4697469472885132, + "learning_rate": 4.642103080450828e-05, + "loss": 5.2444, + "step": 28993 + }, + { + "epoch": 0.172435531449234, + "grad_norm": 1.5588436126708984, + "learning_rate": 4.642078997393986e-05, + "loss": 5.3832, + "step": 28994 + }, + { + "epoch": 0.17244147873251497, + "grad_norm": 1.4939788579940796, + "learning_rate": 4.642054913589368e-05, + "loss": 5.5868, + "step": 28995 + }, + { + "epoch": 0.172447426015796, + "grad_norm": 1.8973298072814941, + "learning_rate": 4.6420308290369795e-05, + "loss": 5.3981, + "step": 28996 + }, + { + "epoch": 0.17245337329907698, + "grad_norm": 1.7295379638671875, + "learning_rate": 4.642006743736831e-05, + "loss": 4.8308, + "step": 28997 + }, + { + "epoch": 0.17245932058235797, + "grad_norm": 1.519732117652893, + "learning_rate": 4.641982657688929e-05, + "loss": 5.423, + "step": 28998 + }, + { + "epoch": 0.17246526786563898, + "grad_norm": 1.6511726379394531, + "learning_rate": 4.641958570893284e-05, + "loss": 5.2029, + "step": 28999 + }, + { + "epoch": 0.17247121514891997, + "grad_norm": 1.5355091094970703, + "learning_rate": 4.641934483349903e-05, + "loss": 5.3556, + "step": 29000 + }, + { + "epoch": 0.17247716243220096, + "grad_norm": 1.562451720237732, + "learning_rate": 4.641910395058795e-05, + "loss": 5.3171, + "step": 29001 + }, + { + "epoch": 0.17248310971548197, + "grad_norm": 1.4412742853164673, + "learning_rate": 4.6418863060199684e-05, + "loss": 5.1771, + "step": 29002 + }, + { + "epoch": 0.17248905699876296, + "grad_norm": 1.5048646926879883, + "learning_rate": 4.6418622162334315e-05, + "loss": 5.3242, + "step": 29003 + }, + { + "epoch": 0.17249500428204395, + "grad_norm": 1.4204987287521362, + "learning_rate": 4.641838125699192e-05, + "loss": 5.3281, + "step": 29004 + }, + { + "epoch": 0.17250095156532497, + "grad_norm": 1.5606169700622559, + "learning_rate": 4.641814034417259e-05, + "loss": 5.0594, + "step": 29005 + }, + { + "epoch": 0.17250689884860596, + "grad_norm": 1.5690323114395142, + "learning_rate": 4.641789942387641e-05, + "loss": 5.2602, + "step": 29006 + }, + { + "epoch": 0.17251284613188694, + "grad_norm": 1.4904906749725342, + "learning_rate": 4.641765849610347e-05, + "loss": 5.2554, + "step": 29007 + }, + { + "epoch": 0.17251879341516796, + "grad_norm": 1.8319326639175415, + "learning_rate": 4.641741756085384e-05, + "loss": 4.5856, + "step": 29008 + }, + { + "epoch": 0.17252474069844895, + "grad_norm": 1.984311819076538, + "learning_rate": 4.6417176618127614e-05, + "loss": 5.2343, + "step": 29009 + }, + { + "epoch": 0.17253068798172994, + "grad_norm": 1.8066591024398804, + "learning_rate": 4.6416935667924864e-05, + "loss": 5.6382, + "step": 29010 + }, + { + "epoch": 0.17253663526501095, + "grad_norm": 1.3843746185302734, + "learning_rate": 4.641669471024569e-05, + "loss": 5.4115, + "step": 29011 + }, + { + "epoch": 0.17254258254829194, + "grad_norm": 1.6255708932876587, + "learning_rate": 4.6416453745090164e-05, + "loss": 5.1379, + "step": 29012 + }, + { + "epoch": 0.17254852983157293, + "grad_norm": 1.4723587036132812, + "learning_rate": 4.641621277245838e-05, + "loss": 5.1829, + "step": 29013 + }, + { + "epoch": 0.17255447711485394, + "grad_norm": 1.7830013036727905, + "learning_rate": 4.641597179235042e-05, + "loss": 4.8646, + "step": 29014 + }, + { + "epoch": 0.17256042439813493, + "grad_norm": 1.6139211654663086, + "learning_rate": 4.641573080476636e-05, + "loss": 5.3989, + "step": 29015 + }, + { + "epoch": 0.17256637168141592, + "grad_norm": 2.9187774658203125, + "learning_rate": 4.641548980970629e-05, + "loss": 3.3579, + "step": 29016 + }, + { + "epoch": 0.17257231896469694, + "grad_norm": 1.4265162944793701, + "learning_rate": 4.6415248807170296e-05, + "loss": 5.1783, + "step": 29017 + }, + { + "epoch": 0.17257826624797792, + "grad_norm": 1.3095968961715698, + "learning_rate": 4.641500779715846e-05, + "loss": 5.6357, + "step": 29018 + }, + { + "epoch": 0.1725842135312589, + "grad_norm": 1.3929443359375, + "learning_rate": 4.641476677967087e-05, + "loss": 5.3234, + "step": 29019 + }, + { + "epoch": 0.17259016081453993, + "grad_norm": 1.6466419696807861, + "learning_rate": 4.64145257547076e-05, + "loss": 5.5066, + "step": 29020 + }, + { + "epoch": 0.17259610809782092, + "grad_norm": 1.4895389080047607, + "learning_rate": 4.6414284722268745e-05, + "loss": 5.0983, + "step": 29021 + }, + { + "epoch": 0.1726020553811019, + "grad_norm": 1.6978981494903564, + "learning_rate": 4.641404368235438e-05, + "loss": 5.3724, + "step": 29022 + }, + { + "epoch": 0.17260800266438292, + "grad_norm": 1.7038211822509766, + "learning_rate": 4.641380263496459e-05, + "loss": 5.2525, + "step": 29023 + }, + { + "epoch": 0.1726139499476639, + "grad_norm": 1.4917408227920532, + "learning_rate": 4.641356158009947e-05, + "loss": 4.9793, + "step": 29024 + }, + { + "epoch": 0.1726198972309449, + "grad_norm": 1.6916602849960327, + "learning_rate": 4.6413320517759094e-05, + "loss": 5.0735, + "step": 29025 + }, + { + "epoch": 0.1726258445142259, + "grad_norm": 1.4852558374404907, + "learning_rate": 4.6413079447943556e-05, + "loss": 5.27, + "step": 29026 + }, + { + "epoch": 0.1726317917975069, + "grad_norm": 1.6030479669570923, + "learning_rate": 4.6412838370652925e-05, + "loss": 5.2712, + "step": 29027 + }, + { + "epoch": 0.1726377390807879, + "grad_norm": 1.5208861827850342, + "learning_rate": 4.6412597285887296e-05, + "loss": 5.4238, + "step": 29028 + }, + { + "epoch": 0.1726436863640689, + "grad_norm": 1.8001056909561157, + "learning_rate": 4.6412356193646744e-05, + "loss": 5.433, + "step": 29029 + }, + { + "epoch": 0.1726496336473499, + "grad_norm": 1.570449948310852, + "learning_rate": 4.641211509393136e-05, + "loss": 5.3843, + "step": 29030 + }, + { + "epoch": 0.17265558093063088, + "grad_norm": 1.4007776975631714, + "learning_rate": 4.641187398674124e-05, + "loss": 5.213, + "step": 29031 + }, + { + "epoch": 0.1726615282139119, + "grad_norm": 1.7244693040847778, + "learning_rate": 4.641163287207645e-05, + "loss": 4.342, + "step": 29032 + }, + { + "epoch": 0.17266747549719288, + "grad_norm": 1.752119779586792, + "learning_rate": 4.6411391749937076e-05, + "loss": 5.2256, + "step": 29033 + }, + { + "epoch": 0.17267342278047387, + "grad_norm": 1.7031835317611694, + "learning_rate": 4.6411150620323214e-05, + "loss": 5.3993, + "step": 29034 + }, + { + "epoch": 0.1726793700637549, + "grad_norm": 1.6741119623184204, + "learning_rate": 4.641090948323493e-05, + "loss": 5.3929, + "step": 29035 + }, + { + "epoch": 0.17268531734703588, + "grad_norm": 1.5801132917404175, + "learning_rate": 4.6410668338672326e-05, + "loss": 5.5049, + "step": 29036 + }, + { + "epoch": 0.17269126463031687, + "grad_norm": 1.6885874271392822, + "learning_rate": 4.641042718663548e-05, + "loss": 5.4284, + "step": 29037 + }, + { + "epoch": 0.17269721191359788, + "grad_norm": 2.0031561851501465, + "learning_rate": 4.6410186027124475e-05, + "loss": 5.064, + "step": 29038 + }, + { + "epoch": 0.17270315919687887, + "grad_norm": 1.9345756769180298, + "learning_rate": 4.640994486013939e-05, + "loss": 4.902, + "step": 29039 + }, + { + "epoch": 0.17270910648015986, + "grad_norm": 1.7898815870285034, + "learning_rate": 4.640970368568032e-05, + "loss": 4.576, + "step": 29040 + }, + { + "epoch": 0.17271505376344087, + "grad_norm": 1.7370834350585938, + "learning_rate": 4.640946250374734e-05, + "loss": 4.2676, + "step": 29041 + }, + { + "epoch": 0.17272100104672186, + "grad_norm": 1.3820379972457886, + "learning_rate": 4.640922131434054e-05, + "loss": 4.1509, + "step": 29042 + }, + { + "epoch": 0.17272694833000285, + "grad_norm": 1.507027506828308, + "learning_rate": 4.640898011746e-05, + "loss": 4.8934, + "step": 29043 + }, + { + "epoch": 0.17273289561328387, + "grad_norm": 1.7124078273773193, + "learning_rate": 4.640873891310581e-05, + "loss": 5.0756, + "step": 29044 + }, + { + "epoch": 0.17273884289656485, + "grad_norm": 1.5267462730407715, + "learning_rate": 4.6408497701278045e-05, + "loss": 5.2387, + "step": 29045 + }, + { + "epoch": 0.17274479017984584, + "grad_norm": 1.560703158378601, + "learning_rate": 4.64082564819768e-05, + "loss": 4.8667, + "step": 29046 + }, + { + "epoch": 0.17275073746312683, + "grad_norm": 1.5322329998016357, + "learning_rate": 4.6408015255202145e-05, + "loss": 5.013, + "step": 29047 + }, + { + "epoch": 0.17275668474640785, + "grad_norm": 1.675746202468872, + "learning_rate": 4.640777402095419e-05, + "loss": 4.8509, + "step": 29048 + }, + { + "epoch": 0.17276263202968883, + "grad_norm": 1.6513665914535522, + "learning_rate": 4.640753277923299e-05, + "loss": 4.9737, + "step": 29049 + }, + { + "epoch": 0.17276857931296982, + "grad_norm": 1.7950671911239624, + "learning_rate": 4.640729153003864e-05, + "loss": 4.3243, + "step": 29050 + }, + { + "epoch": 0.17277452659625084, + "grad_norm": 1.7763174772262573, + "learning_rate": 4.6407050273371225e-05, + "loss": 4.3468, + "step": 29051 + }, + { + "epoch": 0.17278047387953183, + "grad_norm": 1.7274105548858643, + "learning_rate": 4.640680900923083e-05, + "loss": 4.3678, + "step": 29052 + }, + { + "epoch": 0.17278642116281281, + "grad_norm": 1.8083571195602417, + "learning_rate": 4.640656773761755e-05, + "loss": 4.0583, + "step": 29053 + }, + { + "epoch": 0.17279236844609383, + "grad_norm": 1.5555697679519653, + "learning_rate": 4.640632645853145e-05, + "loss": 4.9759, + "step": 29054 + }, + { + "epoch": 0.17279831572937482, + "grad_norm": 1.5617389678955078, + "learning_rate": 4.640608517197263e-05, + "loss": 4.9137, + "step": 29055 + }, + { + "epoch": 0.1728042630126558, + "grad_norm": 1.549464225769043, + "learning_rate": 4.640584387794115e-05, + "loss": 5.158, + "step": 29056 + }, + { + "epoch": 0.17281021029593682, + "grad_norm": 1.7087653875350952, + "learning_rate": 4.6405602576437126e-05, + "loss": 5.136, + "step": 29057 + }, + { + "epoch": 0.1728161575792178, + "grad_norm": 1.5118201971054077, + "learning_rate": 4.640536126746062e-05, + "loss": 5.1956, + "step": 29058 + }, + { + "epoch": 0.1728221048624988, + "grad_norm": 1.6387808322906494, + "learning_rate": 4.640511995101173e-05, + "loss": 5.0441, + "step": 29059 + }, + { + "epoch": 0.17282805214577981, + "grad_norm": 1.652024745941162, + "learning_rate": 4.640487862709053e-05, + "loss": 4.9147, + "step": 29060 + }, + { + "epoch": 0.1728339994290608, + "grad_norm": 1.6259782314300537, + "learning_rate": 4.640463729569711e-05, + "loss": 4.2755, + "step": 29061 + }, + { + "epoch": 0.1728399467123418, + "grad_norm": 1.6286218166351318, + "learning_rate": 4.640439595683155e-05, + "loss": 4.6328, + "step": 29062 + }, + { + "epoch": 0.1728458939956228, + "grad_norm": 1.7396693229675293, + "learning_rate": 4.6404154610493934e-05, + "loss": 4.5711, + "step": 29063 + }, + { + "epoch": 0.1728518412789038, + "grad_norm": 1.4926822185516357, + "learning_rate": 4.640391325668435e-05, + "loss": 5.118, + "step": 29064 + }, + { + "epoch": 0.17285778856218478, + "grad_norm": 2.454763650894165, + "learning_rate": 4.6403671895402884e-05, + "loss": 4.817, + "step": 29065 + }, + { + "epoch": 0.1728637358454658, + "grad_norm": 1.6225837469100952, + "learning_rate": 4.640343052664962e-05, + "loss": 4.9953, + "step": 29066 + }, + { + "epoch": 0.1728696831287468, + "grad_norm": 1.8164595365524292, + "learning_rate": 4.640318915042463e-05, + "loss": 4.9384, + "step": 29067 + }, + { + "epoch": 0.17287563041202778, + "grad_norm": 1.4794782400131226, + "learning_rate": 4.640294776672801e-05, + "loss": 5.2635, + "step": 29068 + }, + { + "epoch": 0.1728815776953088, + "grad_norm": 1.6981302499771118, + "learning_rate": 4.640270637555985e-05, + "loss": 5.283, + "step": 29069 + }, + { + "epoch": 0.17288752497858978, + "grad_norm": 1.8669052124023438, + "learning_rate": 4.640246497692022e-05, + "loss": 4.303, + "step": 29070 + }, + { + "epoch": 0.17289347226187077, + "grad_norm": 1.8505442142486572, + "learning_rate": 4.640222357080921e-05, + "loss": 4.6573, + "step": 29071 + }, + { + "epoch": 0.17289941954515178, + "grad_norm": 1.6368263959884644, + "learning_rate": 4.640198215722691e-05, + "loss": 4.5301, + "step": 29072 + }, + { + "epoch": 0.17290536682843277, + "grad_norm": 1.665531039237976, + "learning_rate": 4.640174073617339e-05, + "loss": 5.2184, + "step": 29073 + }, + { + "epoch": 0.17291131411171376, + "grad_norm": 1.663392186164856, + "learning_rate": 4.640149930764875e-05, + "loss": 4.1373, + "step": 29074 + }, + { + "epoch": 0.17291726139499478, + "grad_norm": 1.8580307960510254, + "learning_rate": 4.640125787165307e-05, + "loss": 4.4035, + "step": 29075 + }, + { + "epoch": 0.17292320867827576, + "grad_norm": 1.5936819314956665, + "learning_rate": 4.640101642818643e-05, + "loss": 5.145, + "step": 29076 + }, + { + "epoch": 0.17292915596155675, + "grad_norm": 1.7124170064926147, + "learning_rate": 4.6400774977248915e-05, + "loss": 4.1569, + "step": 29077 + }, + { + "epoch": 0.17293510324483777, + "grad_norm": 2.51955509185791, + "learning_rate": 4.6400533518840614e-05, + "loss": 3.8795, + "step": 29078 + }, + { + "epoch": 0.17294105052811876, + "grad_norm": 1.6238064765930176, + "learning_rate": 4.6400292052961604e-05, + "loss": 5.0575, + "step": 29079 + }, + { + "epoch": 0.17294699781139974, + "grad_norm": 1.7471083402633667, + "learning_rate": 4.6400050579611974e-05, + "loss": 4.1607, + "step": 29080 + }, + { + "epoch": 0.17295294509468076, + "grad_norm": 1.7179365158081055, + "learning_rate": 4.639980909879181e-05, + "loss": 4.2253, + "step": 29081 + }, + { + "epoch": 0.17295889237796175, + "grad_norm": 1.6772149801254272, + "learning_rate": 4.639956761050119e-05, + "loss": 4.0833, + "step": 29082 + }, + { + "epoch": 0.17296483966124274, + "grad_norm": 1.6395635604858398, + "learning_rate": 4.639932611474021e-05, + "loss": 4.3961, + "step": 29083 + }, + { + "epoch": 0.17297078694452375, + "grad_norm": 1.5897985696792603, + "learning_rate": 4.6399084611508935e-05, + "loss": 4.5272, + "step": 29084 + }, + { + "epoch": 0.17297673422780474, + "grad_norm": 1.5276799201965332, + "learning_rate": 4.639884310080746e-05, + "loss": 5.037, + "step": 29085 + }, + { + "epoch": 0.17298268151108573, + "grad_norm": 1.5612523555755615, + "learning_rate": 4.639860158263588e-05, + "loss": 5.2272, + "step": 29086 + }, + { + "epoch": 0.17298862879436674, + "grad_norm": 1.7078372240066528, + "learning_rate": 4.639836005699426e-05, + "loss": 4.2294, + "step": 29087 + }, + { + "epoch": 0.17299457607764773, + "grad_norm": 1.643798828125, + "learning_rate": 4.63981185238827e-05, + "loss": 4.1974, + "step": 29088 + }, + { + "epoch": 0.17300052336092872, + "grad_norm": 1.7256457805633545, + "learning_rate": 4.639787698330128e-05, + "loss": 4.3683, + "step": 29089 + }, + { + "epoch": 0.17300647064420974, + "grad_norm": 1.9199156761169434, + "learning_rate": 4.6397635435250076e-05, + "loss": 4.3005, + "step": 29090 + }, + { + "epoch": 0.17301241792749072, + "grad_norm": 1.927114486694336, + "learning_rate": 4.6397393879729176e-05, + "loss": 3.53, + "step": 29091 + }, + { + "epoch": 0.1730183652107717, + "grad_norm": 1.5402168035507202, + "learning_rate": 4.639715231673868e-05, + "loss": 5.048, + "step": 29092 + }, + { + "epoch": 0.17302431249405273, + "grad_norm": 1.4014962911605835, + "learning_rate": 4.6396910746278646e-05, + "loss": 4.9029, + "step": 29093 + }, + { + "epoch": 0.17303025977733372, + "grad_norm": 1.3504273891448975, + "learning_rate": 4.639666916834918e-05, + "loss": 4.9728, + "step": 29094 + }, + { + "epoch": 0.1730362070606147, + "grad_norm": 1.4277746677398682, + "learning_rate": 4.639642758295035e-05, + "loss": 4.9853, + "step": 29095 + }, + { + "epoch": 0.17304215434389572, + "grad_norm": 1.664764165878296, + "learning_rate": 4.639618599008225e-05, + "loss": 4.9195, + "step": 29096 + }, + { + "epoch": 0.1730481016271767, + "grad_norm": 1.7788653373718262, + "learning_rate": 4.639594438974497e-05, + "loss": 4.6073, + "step": 29097 + }, + { + "epoch": 0.1730540489104577, + "grad_norm": 1.543224573135376, + "learning_rate": 4.639570278193858e-05, + "loss": 4.5988, + "step": 29098 + }, + { + "epoch": 0.1730599961937387, + "grad_norm": 1.8790651559829712, + "learning_rate": 4.639546116666317e-05, + "loss": 4.3982, + "step": 29099 + }, + { + "epoch": 0.1730659434770197, + "grad_norm": 1.6308414936065674, + "learning_rate": 4.639521954391883e-05, + "loss": 4.8477, + "step": 29100 + }, + { + "epoch": 0.1730718907603007, + "grad_norm": 1.7135157585144043, + "learning_rate": 4.639497791370564e-05, + "loss": 5.0111, + "step": 29101 + }, + { + "epoch": 0.1730778380435817, + "grad_norm": 1.9777605533599854, + "learning_rate": 4.639473627602369e-05, + "loss": 5.2615, + "step": 29102 + }, + { + "epoch": 0.1730837853268627, + "grad_norm": 1.8689080476760864, + "learning_rate": 4.639449463087304e-05, + "loss": 5.4032, + "step": 29103 + }, + { + "epoch": 0.17308973261014368, + "grad_norm": 1.8719011545181274, + "learning_rate": 4.6394252978253814e-05, + "loss": 4.7377, + "step": 29104 + }, + { + "epoch": 0.17309567989342467, + "grad_norm": 2.0242390632629395, + "learning_rate": 4.6394011318166066e-05, + "loss": 4.3017, + "step": 29105 + }, + { + "epoch": 0.17310162717670569, + "grad_norm": 1.6117249727249146, + "learning_rate": 4.639376965060989e-05, + "loss": 4.5215, + "step": 29106 + }, + { + "epoch": 0.17310757445998667, + "grad_norm": 1.9272388219833374, + "learning_rate": 4.639352797558536e-05, + "loss": 4.4802, + "step": 29107 + }, + { + "epoch": 0.17311352174326766, + "grad_norm": 1.7987074851989746, + "learning_rate": 4.639328629309259e-05, + "loss": 4.4009, + "step": 29108 + }, + { + "epoch": 0.17311946902654868, + "grad_norm": 1.8932039737701416, + "learning_rate": 4.639304460313163e-05, + "loss": 4.3668, + "step": 29109 + }, + { + "epoch": 0.17312541630982967, + "grad_norm": 2.2508416175842285, + "learning_rate": 4.639280290570258e-05, + "loss": 4.9557, + "step": 29110 + }, + { + "epoch": 0.17313136359311065, + "grad_norm": 2.086383104324341, + "learning_rate": 4.639256120080553e-05, + "loss": 5.0933, + "step": 29111 + }, + { + "epoch": 0.17313731087639167, + "grad_norm": 1.7917490005493164, + "learning_rate": 4.639231948844056e-05, + "loss": 5.2057, + "step": 29112 + }, + { + "epoch": 0.17314325815967266, + "grad_norm": 1.8576172590255737, + "learning_rate": 4.639207776860774e-05, + "loss": 4.4434, + "step": 29113 + }, + { + "epoch": 0.17314920544295365, + "grad_norm": 1.746186375617981, + "learning_rate": 4.639183604130717e-05, + "loss": 4.2003, + "step": 29114 + }, + { + "epoch": 0.17315515272623466, + "grad_norm": 2.03523588180542, + "learning_rate": 4.639159430653894e-05, + "loss": 4.2907, + "step": 29115 + }, + { + "epoch": 0.17316110000951565, + "grad_norm": 2.0713725090026855, + "learning_rate": 4.639135256430312e-05, + "loss": 4.3741, + "step": 29116 + }, + { + "epoch": 0.17316704729279664, + "grad_norm": 2.745671510696411, + "learning_rate": 4.63911108145998e-05, + "loss": 4.6313, + "step": 29117 + }, + { + "epoch": 0.17317299457607765, + "grad_norm": 1.9662394523620605, + "learning_rate": 4.639086905742906e-05, + "loss": 4.2027, + "step": 29118 + }, + { + "epoch": 0.17317894185935864, + "grad_norm": 1.7448909282684326, + "learning_rate": 4.6390627292791e-05, + "loss": 4.9481, + "step": 29119 + }, + { + "epoch": 0.17318488914263963, + "grad_norm": 1.684590458869934, + "learning_rate": 4.639038552068569e-05, + "loss": 4.8794, + "step": 29120 + }, + { + "epoch": 0.17319083642592065, + "grad_norm": 1.8462331295013428, + "learning_rate": 4.639014374111321e-05, + "loss": 3.9728, + "step": 29121 + }, + { + "epoch": 0.17319678370920163, + "grad_norm": 1.9657787084579468, + "learning_rate": 4.638990195407366e-05, + "loss": 4.0798, + "step": 29122 + }, + { + "epoch": 0.17320273099248262, + "grad_norm": 1.7591108083724976, + "learning_rate": 4.638966015956711e-05, + "loss": 3.9714, + "step": 29123 + }, + { + "epoch": 0.17320867827576364, + "grad_norm": 1.6764097213745117, + "learning_rate": 4.638941835759365e-05, + "loss": 4.7804, + "step": 29124 + }, + { + "epoch": 0.17321462555904463, + "grad_norm": 1.7766660451889038, + "learning_rate": 4.638917654815336e-05, + "loss": 4.8408, + "step": 29125 + }, + { + "epoch": 0.17322057284232562, + "grad_norm": 1.7548637390136719, + "learning_rate": 4.638893473124634e-05, + "loss": 4.9905, + "step": 29126 + }, + { + "epoch": 0.17322652012560663, + "grad_norm": 1.933996319770813, + "learning_rate": 4.6388692906872664e-05, + "loss": 4.757, + "step": 29127 + }, + { + "epoch": 0.17323246740888762, + "grad_norm": 1.6957604885101318, + "learning_rate": 4.638845107503241e-05, + "loss": 5.1555, + "step": 29128 + }, + { + "epoch": 0.1732384146921686, + "grad_norm": 1.7500252723693848, + "learning_rate": 4.638820923572567e-05, + "loss": 4.9637, + "step": 29129 + }, + { + "epoch": 0.17324436197544962, + "grad_norm": 1.8749233484268188, + "learning_rate": 4.638796738895253e-05, + "loss": 4.0375, + "step": 29130 + }, + { + "epoch": 0.1732503092587306, + "grad_norm": 2.124462842941284, + "learning_rate": 4.6387725534713066e-05, + "loss": 4.6226, + "step": 29131 + }, + { + "epoch": 0.1732562565420116, + "grad_norm": 1.877875804901123, + "learning_rate": 4.6387483673007375e-05, + "loss": 4.572, + "step": 29132 + }, + { + "epoch": 0.17326220382529262, + "grad_norm": 1.7845820188522339, + "learning_rate": 4.6387241803835535e-05, + "loss": 4.4978, + "step": 29133 + }, + { + "epoch": 0.1732681511085736, + "grad_norm": 1.5177055597305298, + "learning_rate": 4.638699992719762e-05, + "loss": 4.6488, + "step": 29134 + }, + { + "epoch": 0.1732740983918546, + "grad_norm": 1.6078678369522095, + "learning_rate": 4.6386758043093736e-05, + "loss": 4.5668, + "step": 29135 + }, + { + "epoch": 0.1732800456751356, + "grad_norm": 1.640980839729309, + "learning_rate": 4.638651615152395e-05, + "loss": 4.8367, + "step": 29136 + }, + { + "epoch": 0.1732859929584166, + "grad_norm": 1.4911829233169556, + "learning_rate": 4.638627425248835e-05, + "loss": 4.6056, + "step": 29137 + }, + { + "epoch": 0.17329194024169758, + "grad_norm": 1.7402757406234741, + "learning_rate": 4.6386032345987026e-05, + "loss": 4.6695, + "step": 29138 + }, + { + "epoch": 0.1732978875249786, + "grad_norm": 1.7571971416473389, + "learning_rate": 4.638579043202006e-05, + "loss": 4.3587, + "step": 29139 + }, + { + "epoch": 0.1733038348082596, + "grad_norm": 1.9201890230178833, + "learning_rate": 4.6385548510587527e-05, + "loss": 4.6875, + "step": 29140 + }, + { + "epoch": 0.17330978209154058, + "grad_norm": 1.61739182472229, + "learning_rate": 4.638530658168954e-05, + "loss": 4.354, + "step": 29141 + }, + { + "epoch": 0.1733157293748216, + "grad_norm": 1.530254602432251, + "learning_rate": 4.6385064645326144e-05, + "loss": 5.1195, + "step": 29142 + }, + { + "epoch": 0.17332167665810258, + "grad_norm": 1.604181170463562, + "learning_rate": 4.638482270149745e-05, + "loss": 4.5733, + "step": 29143 + }, + { + "epoch": 0.17332762394138357, + "grad_norm": 1.5250577926635742, + "learning_rate": 4.638458075020353e-05, + "loss": 5.0787, + "step": 29144 + }, + { + "epoch": 0.17333357122466458, + "grad_norm": 1.539345383644104, + "learning_rate": 4.638433879144448e-05, + "loss": 4.5644, + "step": 29145 + }, + { + "epoch": 0.17333951850794557, + "grad_norm": 1.4076765775680542, + "learning_rate": 4.6384096825220376e-05, + "loss": 4.8226, + "step": 29146 + }, + { + "epoch": 0.17334546579122656, + "grad_norm": 1.5576672554016113, + "learning_rate": 4.6383854851531304e-05, + "loss": 4.8671, + "step": 29147 + }, + { + "epoch": 0.17335141307450758, + "grad_norm": 1.4902443885803223, + "learning_rate": 4.638361287037735e-05, + "loss": 5.0003, + "step": 29148 + }, + { + "epoch": 0.17335736035778856, + "grad_norm": 1.3985077142715454, + "learning_rate": 4.63833708817586e-05, + "loss": 5.297, + "step": 29149 + }, + { + "epoch": 0.17336330764106955, + "grad_norm": 1.798403263092041, + "learning_rate": 4.638312888567513e-05, + "loss": 4.8625, + "step": 29150 + }, + { + "epoch": 0.17336925492435057, + "grad_norm": 1.5843340158462524, + "learning_rate": 4.638288688212704e-05, + "loss": 4.7577, + "step": 29151 + }, + { + "epoch": 0.17337520220763156, + "grad_norm": 1.5716784000396729, + "learning_rate": 4.63826448711144e-05, + "loss": 5.1091, + "step": 29152 + }, + { + "epoch": 0.17338114949091255, + "grad_norm": 1.7493597269058228, + "learning_rate": 4.6382402852637294e-05, + "loss": 4.8566, + "step": 29153 + }, + { + "epoch": 0.17338709677419356, + "grad_norm": 1.6974579095840454, + "learning_rate": 4.638216082669582e-05, + "loss": 4.8687, + "step": 29154 + }, + { + "epoch": 0.17339304405747455, + "grad_norm": 1.6314281225204468, + "learning_rate": 4.6381918793290055e-05, + "loss": 4.8077, + "step": 29155 + }, + { + "epoch": 0.17339899134075554, + "grad_norm": 1.6575573682785034, + "learning_rate": 4.6381676752420076e-05, + "loss": 4.9225, + "step": 29156 + }, + { + "epoch": 0.17340493862403655, + "grad_norm": 1.4562337398529053, + "learning_rate": 4.638143470408598e-05, + "loss": 5.056, + "step": 29157 + }, + { + "epoch": 0.17341088590731754, + "grad_norm": 1.6989314556121826, + "learning_rate": 4.638119264828784e-05, + "loss": 5.0006, + "step": 29158 + }, + { + "epoch": 0.17341683319059853, + "grad_norm": 1.6114591360092163, + "learning_rate": 4.638095058502575e-05, + "loss": 4.7174, + "step": 29159 + }, + { + "epoch": 0.17342278047387955, + "grad_norm": 1.8833446502685547, + "learning_rate": 4.6380708514299794e-05, + "loss": 4.6826, + "step": 29160 + }, + { + "epoch": 0.17342872775716053, + "grad_norm": 1.8556679487228394, + "learning_rate": 4.638046643611006e-05, + "loss": 4.6246, + "step": 29161 + }, + { + "epoch": 0.17343467504044152, + "grad_norm": 1.8661102056503296, + "learning_rate": 4.6380224350456615e-05, + "loss": 4.4789, + "step": 29162 + }, + { + "epoch": 0.1734406223237225, + "grad_norm": 1.7095074653625488, + "learning_rate": 4.637998225733956e-05, + "loss": 4.923, + "step": 29163 + }, + { + "epoch": 0.17344656960700353, + "grad_norm": 1.34967041015625, + "learning_rate": 4.6379740156758966e-05, + "loss": 4.797, + "step": 29164 + }, + { + "epoch": 0.1734525168902845, + "grad_norm": 1.7319891452789307, + "learning_rate": 4.637949804871493e-05, + "loss": 4.5764, + "step": 29165 + }, + { + "epoch": 0.1734584641735655, + "grad_norm": 1.7644058465957642, + "learning_rate": 4.637925593320754e-05, + "loss": 4.5592, + "step": 29166 + }, + { + "epoch": 0.17346441145684652, + "grad_norm": 1.773938775062561, + "learning_rate": 4.637901381023686e-05, + "loss": 5.0608, + "step": 29167 + }, + { + "epoch": 0.1734703587401275, + "grad_norm": 1.7514781951904297, + "learning_rate": 4.637877167980299e-05, + "loss": 4.6467, + "step": 29168 + }, + { + "epoch": 0.1734763060234085, + "grad_norm": 1.6960844993591309, + "learning_rate": 4.637852954190602e-05, + "loss": 4.4893, + "step": 29169 + }, + { + "epoch": 0.1734822533066895, + "grad_norm": 1.687488317489624, + "learning_rate": 4.6378287396546024e-05, + "loss": 4.5032, + "step": 29170 + }, + { + "epoch": 0.1734882005899705, + "grad_norm": 1.71660315990448, + "learning_rate": 4.6378045243723084e-05, + "loss": 4.9538, + "step": 29171 + }, + { + "epoch": 0.1734941478732515, + "grad_norm": 1.8937394618988037, + "learning_rate": 4.637780308343729e-05, + "loss": 4.6157, + "step": 29172 + }, + { + "epoch": 0.1735000951565325, + "grad_norm": 1.8577438592910767, + "learning_rate": 4.637756091568873e-05, + "loss": 4.5289, + "step": 29173 + }, + { + "epoch": 0.1735060424398135, + "grad_norm": 1.6964426040649414, + "learning_rate": 4.637731874047748e-05, + "loss": 4.6735, + "step": 29174 + }, + { + "epoch": 0.17351198972309448, + "grad_norm": 1.708333134651184, + "learning_rate": 4.637707655780363e-05, + "loss": 4.7042, + "step": 29175 + }, + { + "epoch": 0.1735179370063755, + "grad_norm": 2.0699708461761475, + "learning_rate": 4.637683436766726e-05, + "loss": 4.259, + "step": 29176 + }, + { + "epoch": 0.17352388428965648, + "grad_norm": 1.9782260656356812, + "learning_rate": 4.637659217006846e-05, + "loss": 4.2724, + "step": 29177 + }, + { + "epoch": 0.17352983157293747, + "grad_norm": 1.8892062902450562, + "learning_rate": 4.6376349965007316e-05, + "loss": 4.0619, + "step": 29178 + }, + { + "epoch": 0.1735357788562185, + "grad_norm": 3.4207348823547363, + "learning_rate": 4.637610775248391e-05, + "loss": 4.0752, + "step": 29179 + }, + { + "epoch": 0.17354172613949947, + "grad_norm": 2.4128661155700684, + "learning_rate": 4.6375865532498316e-05, + "loss": 3.7859, + "step": 29180 + }, + { + "epoch": 0.17354767342278046, + "grad_norm": 1.7334697246551514, + "learning_rate": 4.6375623305050635e-05, + "loss": 4.586, + "step": 29181 + }, + { + "epoch": 0.17355362070606148, + "grad_norm": 2.0362465381622314, + "learning_rate": 4.6375381070140946e-05, + "loss": 4.2091, + "step": 29182 + }, + { + "epoch": 0.17355956798934247, + "grad_norm": 1.7851359844207764, + "learning_rate": 4.637513882776933e-05, + "loss": 4.1567, + "step": 29183 + }, + { + "epoch": 0.17356551527262346, + "grad_norm": 1.9078037738800049, + "learning_rate": 4.637489657793588e-05, + "loss": 4.0716, + "step": 29184 + }, + { + "epoch": 0.17357146255590447, + "grad_norm": 1.7366207838058472, + "learning_rate": 4.6374654320640666e-05, + "loss": 4.3262, + "step": 29185 + }, + { + "epoch": 0.17357740983918546, + "grad_norm": 1.8948423862457275, + "learning_rate": 4.6374412055883785e-05, + "loss": 4.1564, + "step": 29186 + }, + { + "epoch": 0.17358335712246645, + "grad_norm": 1.9613217115402222, + "learning_rate": 4.637416978366532e-05, + "loss": 4.1586, + "step": 29187 + }, + { + "epoch": 0.17358930440574746, + "grad_norm": 2.4783365726470947, + "learning_rate": 4.637392750398535e-05, + "loss": 3.6734, + "step": 29188 + }, + { + "epoch": 0.17359525168902845, + "grad_norm": 2.1660149097442627, + "learning_rate": 4.637368521684396e-05, + "loss": 3.7469, + "step": 29189 + }, + { + "epoch": 0.17360119897230944, + "grad_norm": 2.462066650390625, + "learning_rate": 4.637344292224124e-05, + "loss": 3.6566, + "step": 29190 + }, + { + "epoch": 0.17360714625559046, + "grad_norm": 1.8963021039962769, + "learning_rate": 4.637320062017727e-05, + "loss": 4.0244, + "step": 29191 + }, + { + "epoch": 0.17361309353887144, + "grad_norm": 1.9739018678665161, + "learning_rate": 4.6372958310652135e-05, + "loss": 4.1696, + "step": 29192 + }, + { + "epoch": 0.17361904082215243, + "grad_norm": 1.9879587888717651, + "learning_rate": 4.637271599366593e-05, + "loss": 4.7111, + "step": 29193 + }, + { + "epoch": 0.17362498810543345, + "grad_norm": 1.8292521238327026, + "learning_rate": 4.637247366921872e-05, + "loss": 4.6283, + "step": 29194 + }, + { + "epoch": 0.17363093538871444, + "grad_norm": 1.5309460163116455, + "learning_rate": 4.6372231337310605e-05, + "loss": 4.6252, + "step": 29195 + }, + { + "epoch": 0.17363688267199542, + "grad_norm": 1.8792744874954224, + "learning_rate": 4.637198899794167e-05, + "loss": 4.2226, + "step": 29196 + }, + { + "epoch": 0.17364282995527644, + "grad_norm": 2.1824088096618652, + "learning_rate": 4.6371746651111985e-05, + "loss": 4.2028, + "step": 29197 + }, + { + "epoch": 0.17364877723855743, + "grad_norm": 2.0413753986358643, + "learning_rate": 4.637150429682165e-05, + "loss": 4.1982, + "step": 29198 + }, + { + "epoch": 0.17365472452183842, + "grad_norm": 1.6897474527359009, + "learning_rate": 4.637126193507074e-05, + "loss": 4.5085, + "step": 29199 + }, + { + "epoch": 0.17366067180511943, + "grad_norm": 1.6577891111373901, + "learning_rate": 4.637101956585935e-05, + "loss": 4.6212, + "step": 29200 + }, + { + "epoch": 0.17366661908840042, + "grad_norm": 1.6855782270431519, + "learning_rate": 4.637077718918755e-05, + "loss": 4.7156, + "step": 29201 + }, + { + "epoch": 0.1736725663716814, + "grad_norm": 2.017664909362793, + "learning_rate": 4.637053480505543e-05, + "loss": 4.5439, + "step": 29202 + }, + { + "epoch": 0.17367851365496242, + "grad_norm": 1.7421058416366577, + "learning_rate": 4.637029241346309e-05, + "loss": 4.3292, + "step": 29203 + }, + { + "epoch": 0.1736844609382434, + "grad_norm": 1.6741775274276733, + "learning_rate": 4.6370050014410594e-05, + "loss": 4.3136, + "step": 29204 + }, + { + "epoch": 0.1736904082215244, + "grad_norm": 1.9777534008026123, + "learning_rate": 4.636980760789803e-05, + "loss": 4.1499, + "step": 29205 + }, + { + "epoch": 0.17369635550480542, + "grad_norm": 2.133716583251953, + "learning_rate": 4.6369565193925505e-05, + "loss": 4.2251, + "step": 29206 + }, + { + "epoch": 0.1737023027880864, + "grad_norm": 2.047595739364624, + "learning_rate": 4.636932277249306e-05, + "loss": 4.0876, + "step": 29207 + }, + { + "epoch": 0.1737082500713674, + "grad_norm": 1.9693220853805542, + "learning_rate": 4.636908034360082e-05, + "loss": 3.8007, + "step": 29208 + }, + { + "epoch": 0.1737141973546484, + "grad_norm": 1.7148840427398682, + "learning_rate": 4.6368837907248855e-05, + "loss": 4.3048, + "step": 29209 + }, + { + "epoch": 0.1737201446379294, + "grad_norm": 1.4605804681777954, + "learning_rate": 4.6368595463437246e-05, + "loss": 4.402, + "step": 29210 + }, + { + "epoch": 0.17372609192121038, + "grad_norm": 1.8033897876739502, + "learning_rate": 4.636835301216608e-05, + "loss": 4.491, + "step": 29211 + }, + { + "epoch": 0.1737320392044914, + "grad_norm": 1.6581388711929321, + "learning_rate": 4.636811055343545e-05, + "loss": 4.5847, + "step": 29212 + }, + { + "epoch": 0.1737379864877724, + "grad_norm": 1.7046984434127808, + "learning_rate": 4.636786808724542e-05, + "loss": 4.7485, + "step": 29213 + }, + { + "epoch": 0.17374393377105338, + "grad_norm": 1.735479474067688, + "learning_rate": 4.6367625613596096e-05, + "loss": 4.8771, + "step": 29214 + }, + { + "epoch": 0.1737498810543344, + "grad_norm": 1.781473994255066, + "learning_rate": 4.636738313248756e-05, + "loss": 4.4308, + "step": 29215 + }, + { + "epoch": 0.17375582833761538, + "grad_norm": 1.945377230644226, + "learning_rate": 4.636714064391988e-05, + "loss": 3.9839, + "step": 29216 + }, + { + "epoch": 0.17376177562089637, + "grad_norm": 1.9880878925323486, + "learning_rate": 4.6366898147893165e-05, + "loss": 4.1544, + "step": 29217 + }, + { + "epoch": 0.17376772290417739, + "grad_norm": 1.9976726770401, + "learning_rate": 4.6366655644407475e-05, + "loss": 4.4061, + "step": 29218 + }, + { + "epoch": 0.17377367018745837, + "grad_norm": 2.0192174911499023, + "learning_rate": 4.6366413133462915e-05, + "loss": 4.3094, + "step": 29219 + }, + { + "epoch": 0.17377961747073936, + "grad_norm": 1.9302101135253906, + "learning_rate": 4.636617061505956e-05, + "loss": 4.7673, + "step": 29220 + }, + { + "epoch": 0.17378556475402035, + "grad_norm": 1.6863242387771606, + "learning_rate": 4.636592808919749e-05, + "loss": 4.7641, + "step": 29221 + }, + { + "epoch": 0.17379151203730137, + "grad_norm": 1.8345664739608765, + "learning_rate": 4.63656855558768e-05, + "loss": 4.6849, + "step": 29222 + }, + { + "epoch": 0.17379745932058235, + "grad_norm": 1.5179288387298584, + "learning_rate": 4.636544301509756e-05, + "loss": 4.7481, + "step": 29223 + }, + { + "epoch": 0.17380340660386334, + "grad_norm": 1.82593834400177, + "learning_rate": 4.6365200466859876e-05, + "loss": 4.7234, + "step": 29224 + }, + { + "epoch": 0.17380935388714436, + "grad_norm": 1.7959182262420654, + "learning_rate": 4.636495791116382e-05, + "loss": 5.0005, + "step": 29225 + }, + { + "epoch": 0.17381530117042535, + "grad_norm": 2.36141037940979, + "learning_rate": 4.636471534800947e-05, + "loss": 4.1279, + "step": 29226 + }, + { + "epoch": 0.17382124845370633, + "grad_norm": 1.8446800708770752, + "learning_rate": 4.636447277739693e-05, + "loss": 4.3379, + "step": 29227 + }, + { + "epoch": 0.17382719573698735, + "grad_norm": 1.9190828800201416, + "learning_rate": 4.636423019932626e-05, + "loss": 4.3296, + "step": 29228 + }, + { + "epoch": 0.17383314302026834, + "grad_norm": 1.863991379737854, + "learning_rate": 4.636398761379756e-05, + "loss": 4.3733, + "step": 29229 + }, + { + "epoch": 0.17383909030354933, + "grad_norm": 1.7630629539489746, + "learning_rate": 4.636374502081092e-05, + "loss": 4.3829, + "step": 29230 + }, + { + "epoch": 0.17384503758683034, + "grad_norm": 1.554083228111267, + "learning_rate": 4.636350242036642e-05, + "loss": 4.6883, + "step": 29231 + }, + { + "epoch": 0.17385098487011133, + "grad_norm": 1.6765477657318115, + "learning_rate": 4.6363259812464135e-05, + "loss": 4.5129, + "step": 29232 + }, + { + "epoch": 0.17385693215339232, + "grad_norm": 1.6007416248321533, + "learning_rate": 4.636301719710416e-05, + "loss": 4.561, + "step": 29233 + }, + { + "epoch": 0.17386287943667333, + "grad_norm": 1.6795105934143066, + "learning_rate": 4.6362774574286575e-05, + "loss": 4.6389, + "step": 29234 + }, + { + "epoch": 0.17386882671995432, + "grad_norm": 1.6491032838821411, + "learning_rate": 4.6362531944011464e-05, + "loss": 4.3857, + "step": 29235 + }, + { + "epoch": 0.1738747740032353, + "grad_norm": 2.123032569885254, + "learning_rate": 4.636228930627892e-05, + "loss": 3.8423, + "step": 29236 + }, + { + "epoch": 0.17388072128651633, + "grad_norm": 2.0041513442993164, + "learning_rate": 4.636204666108902e-05, + "loss": 3.1621, + "step": 29237 + }, + { + "epoch": 0.17388666856979731, + "grad_norm": 1.6654435396194458, + "learning_rate": 4.636180400844185e-05, + "loss": 4.3272, + "step": 29238 + }, + { + "epoch": 0.1738926158530783, + "grad_norm": 1.553393006324768, + "learning_rate": 4.636156134833749e-05, + "loss": 4.9542, + "step": 29239 + }, + { + "epoch": 0.17389856313635932, + "grad_norm": 1.6511328220367432, + "learning_rate": 4.6361318680776035e-05, + "loss": 5.0055, + "step": 29240 + }, + { + "epoch": 0.1739045104196403, + "grad_norm": 1.8133567571640015, + "learning_rate": 4.6361076005757554e-05, + "loss": 4.4575, + "step": 29241 + }, + { + "epoch": 0.1739104577029213, + "grad_norm": 2.6649341583251953, + "learning_rate": 4.636083332328215e-05, + "loss": 4.1054, + "step": 29242 + }, + { + "epoch": 0.1739164049862023, + "grad_norm": 2.676636219024658, + "learning_rate": 4.63605906333499e-05, + "loss": 3.5847, + "step": 29243 + }, + { + "epoch": 0.1739223522694833, + "grad_norm": 2.376490592956543, + "learning_rate": 4.636034793596089e-05, + "loss": 3.9051, + "step": 29244 + }, + { + "epoch": 0.1739282995527643, + "grad_norm": 1.6567094326019287, + "learning_rate": 4.63601052311152e-05, + "loss": 5.1711, + "step": 29245 + }, + { + "epoch": 0.1739342468360453, + "grad_norm": 1.981115698814392, + "learning_rate": 4.6359862518812924e-05, + "loss": 3.8426, + "step": 29246 + }, + { + "epoch": 0.1739401941193263, + "grad_norm": 1.640690565109253, + "learning_rate": 4.6359619799054136e-05, + "loss": 4.3196, + "step": 29247 + }, + { + "epoch": 0.17394614140260728, + "grad_norm": 1.6027098894119263, + "learning_rate": 4.635937707183892e-05, + "loss": 5.2091, + "step": 29248 + }, + { + "epoch": 0.1739520886858883, + "grad_norm": 1.732526183128357, + "learning_rate": 4.6359134337167375e-05, + "loss": 5.0799, + "step": 29249 + }, + { + "epoch": 0.17395803596916928, + "grad_norm": 1.7720987796783447, + "learning_rate": 4.635889159503957e-05, + "loss": 4.9359, + "step": 29250 + }, + { + "epoch": 0.17396398325245027, + "grad_norm": 1.60392427444458, + "learning_rate": 4.63586488454556e-05, + "loss": 4.8213, + "step": 29251 + }, + { + "epoch": 0.1739699305357313, + "grad_norm": 1.4416741132736206, + "learning_rate": 4.635840608841555e-05, + "loss": 5.1283, + "step": 29252 + }, + { + "epoch": 0.17397587781901228, + "grad_norm": 1.9322450160980225, + "learning_rate": 4.63581633239195e-05, + "loss": 4.4477, + "step": 29253 + }, + { + "epoch": 0.17398182510229326, + "grad_norm": 1.661475658416748, + "learning_rate": 4.635792055196753e-05, + "loss": 4.6993, + "step": 29254 + }, + { + "epoch": 0.17398777238557428, + "grad_norm": 1.7771600484848022, + "learning_rate": 4.635767777255973e-05, + "loss": 4.4883, + "step": 29255 + }, + { + "epoch": 0.17399371966885527, + "grad_norm": 1.8131498098373413, + "learning_rate": 4.635743498569619e-05, + "loss": 3.9214, + "step": 29256 + }, + { + "epoch": 0.17399966695213626, + "grad_norm": 1.6624927520751953, + "learning_rate": 4.635719219137699e-05, + "loss": 4.9492, + "step": 29257 + }, + { + "epoch": 0.17400561423541727, + "grad_norm": 2.7123286724090576, + "learning_rate": 4.6356949389602214e-05, + "loss": 4.7048, + "step": 29258 + }, + { + "epoch": 0.17401156151869826, + "grad_norm": 2.078057050704956, + "learning_rate": 4.6356706580371945e-05, + "loss": 4.5294, + "step": 29259 + }, + { + "epoch": 0.17401750880197925, + "grad_norm": 1.738935947418213, + "learning_rate": 4.6356463763686275e-05, + "loss": 4.7332, + "step": 29260 + }, + { + "epoch": 0.17402345608526026, + "grad_norm": 1.8803629875183105, + "learning_rate": 4.635622093954528e-05, + "loss": 4.9347, + "step": 29261 + }, + { + "epoch": 0.17402940336854125, + "grad_norm": 1.3738025426864624, + "learning_rate": 4.635597810794905e-05, + "loss": 5.4709, + "step": 29262 + }, + { + "epoch": 0.17403535065182224, + "grad_norm": 1.6917965412139893, + "learning_rate": 4.635573526889767e-05, + "loss": 4.5494, + "step": 29263 + }, + { + "epoch": 0.17404129793510326, + "grad_norm": 1.9916536808013916, + "learning_rate": 4.6355492422391226e-05, + "loss": 4.4302, + "step": 29264 + }, + { + "epoch": 0.17404724521838424, + "grad_norm": 1.8959016799926758, + "learning_rate": 4.63552495684298e-05, + "loss": 4.1595, + "step": 29265 + }, + { + "epoch": 0.17405319250166523, + "grad_norm": 1.7730271816253662, + "learning_rate": 4.635500670701347e-05, + "loss": 4.6212, + "step": 29266 + }, + { + "epoch": 0.17405913978494625, + "grad_norm": 1.9785410165786743, + "learning_rate": 4.635476383814233e-05, + "loss": 4.6885, + "step": 29267 + }, + { + "epoch": 0.17406508706822724, + "grad_norm": 1.915924310684204, + "learning_rate": 4.6354520961816475e-05, + "loss": 4.4186, + "step": 29268 + }, + { + "epoch": 0.17407103435150822, + "grad_norm": 1.6227480173110962, + "learning_rate": 4.6354278078035964e-05, + "loss": 4.6483, + "step": 29269 + }, + { + "epoch": 0.17407698163478924, + "grad_norm": 1.6679190397262573, + "learning_rate": 4.635403518680089e-05, + "loss": 4.9393, + "step": 29270 + }, + { + "epoch": 0.17408292891807023, + "grad_norm": 1.3380484580993652, + "learning_rate": 4.6353792288111353e-05, + "loss": 5.1539, + "step": 29271 + }, + { + "epoch": 0.17408887620135122, + "grad_norm": 1.3670740127563477, + "learning_rate": 4.635354938196743e-05, + "loss": 5.1949, + "step": 29272 + }, + { + "epoch": 0.17409482348463223, + "grad_norm": 1.288189172744751, + "learning_rate": 4.63533064683692e-05, + "loss": 5.4657, + "step": 29273 + }, + { + "epoch": 0.17410077076791322, + "grad_norm": 1.4686154127120972, + "learning_rate": 4.635306354731675e-05, + "loss": 5.5222, + "step": 29274 + }, + { + "epoch": 0.1741067180511942, + "grad_norm": 1.4154938459396362, + "learning_rate": 4.635282061881017e-05, + "loss": 5.261, + "step": 29275 + }, + { + "epoch": 0.17411266533447523, + "grad_norm": 1.7723246812820435, + "learning_rate": 4.635257768284953e-05, + "loss": 5.1817, + "step": 29276 + }, + { + "epoch": 0.1741186126177562, + "grad_norm": 1.7621451616287231, + "learning_rate": 4.635233473943494e-05, + "loss": 5.2426, + "step": 29277 + }, + { + "epoch": 0.1741245599010372, + "grad_norm": 1.6899840831756592, + "learning_rate": 4.6352091788566466e-05, + "loss": 4.5392, + "step": 29278 + }, + { + "epoch": 0.17413050718431822, + "grad_norm": 1.7704182863235474, + "learning_rate": 4.6351848830244195e-05, + "loss": 4.4345, + "step": 29279 + }, + { + "epoch": 0.1741364544675992, + "grad_norm": 1.9371867179870605, + "learning_rate": 4.635160586446821e-05, + "loss": 4.1621, + "step": 29280 + }, + { + "epoch": 0.1741424017508802, + "grad_norm": 1.771759271621704, + "learning_rate": 4.63513628912386e-05, + "loss": 4.7001, + "step": 29281 + }, + { + "epoch": 0.17414834903416118, + "grad_norm": 2.212144136428833, + "learning_rate": 4.635111991055546e-05, + "loss": 3.7101, + "step": 29282 + }, + { + "epoch": 0.1741542963174422, + "grad_norm": 2.0476841926574707, + "learning_rate": 4.6350876922418864e-05, + "loss": 3.4412, + "step": 29283 + }, + { + "epoch": 0.17416024360072319, + "grad_norm": 1.849636197090149, + "learning_rate": 4.635063392682889e-05, + "loss": 4.553, + "step": 29284 + }, + { + "epoch": 0.17416619088400417, + "grad_norm": 1.9307612180709839, + "learning_rate": 4.6350390923785634e-05, + "loss": 3.7483, + "step": 29285 + }, + { + "epoch": 0.1741721381672852, + "grad_norm": 1.9862045049667358, + "learning_rate": 4.6350147913289176e-05, + "loss": 4.3754, + "step": 29286 + }, + { + "epoch": 0.17417808545056618, + "grad_norm": 1.7079651355743408, + "learning_rate": 4.63499048953396e-05, + "loss": 5.3671, + "step": 29287 + }, + { + "epoch": 0.17418403273384717, + "grad_norm": 1.8182214498519897, + "learning_rate": 4.6349661869937e-05, + "loss": 5.148, + "step": 29288 + }, + { + "epoch": 0.17418998001712818, + "grad_norm": 1.7571437358856201, + "learning_rate": 4.6349418837081445e-05, + "loss": 4.4641, + "step": 29289 + }, + { + "epoch": 0.17419592730040917, + "grad_norm": 1.6432558298110962, + "learning_rate": 4.6349175796773026e-05, + "loss": 4.6966, + "step": 29290 + }, + { + "epoch": 0.17420187458369016, + "grad_norm": 1.729112148284912, + "learning_rate": 4.634893274901184e-05, + "loss": 4.6125, + "step": 29291 + }, + { + "epoch": 0.17420782186697117, + "grad_norm": 1.5376940965652466, + "learning_rate": 4.6348689693797954e-05, + "loss": 4.4921, + "step": 29292 + }, + { + "epoch": 0.17421376915025216, + "grad_norm": 1.8997972011566162, + "learning_rate": 4.634844663113147e-05, + "loss": 4.4163, + "step": 29293 + }, + { + "epoch": 0.17421971643353315, + "grad_norm": 1.6643134355545044, + "learning_rate": 4.634820356101246e-05, + "loss": 4.7624, + "step": 29294 + }, + { + "epoch": 0.17422566371681417, + "grad_norm": 1.4758678674697876, + "learning_rate": 4.6347960483441013e-05, + "loss": 5.3261, + "step": 29295 + }, + { + "epoch": 0.17423161100009515, + "grad_norm": 1.7518540620803833, + "learning_rate": 4.6347717398417203e-05, + "loss": 4.4916, + "step": 29296 + }, + { + "epoch": 0.17423755828337614, + "grad_norm": 1.6143438816070557, + "learning_rate": 4.634747430594114e-05, + "loss": 4.336, + "step": 29297 + }, + { + "epoch": 0.17424350556665716, + "grad_norm": 1.6077839136123657, + "learning_rate": 4.634723120601289e-05, + "loss": 4.5486, + "step": 29298 + }, + { + "epoch": 0.17424945284993815, + "grad_norm": 1.9146685600280762, + "learning_rate": 4.634698809863254e-05, + "loss": 5.1115, + "step": 29299 + }, + { + "epoch": 0.17425540013321913, + "grad_norm": 1.6625542640686035, + "learning_rate": 4.634674498380018e-05, + "loss": 4.653, + "step": 29300 + }, + { + "epoch": 0.17426134741650015, + "grad_norm": 1.7577245235443115, + "learning_rate": 4.634650186151589e-05, + "loss": 4.8305, + "step": 29301 + }, + { + "epoch": 0.17426729469978114, + "grad_norm": 1.5614792108535767, + "learning_rate": 4.6346258731779755e-05, + "loss": 4.8553, + "step": 29302 + }, + { + "epoch": 0.17427324198306213, + "grad_norm": 1.5734407901763916, + "learning_rate": 4.634601559459186e-05, + "loss": 4.9925, + "step": 29303 + }, + { + "epoch": 0.17427918926634314, + "grad_norm": 1.855974555015564, + "learning_rate": 4.6345772449952293e-05, + "loss": 4.7862, + "step": 29304 + }, + { + "epoch": 0.17428513654962413, + "grad_norm": 2.7702269554138184, + "learning_rate": 4.6345529297861146e-05, + "loss": 3.766, + "step": 29305 + }, + { + "epoch": 0.17429108383290512, + "grad_norm": 2.7763569355010986, + "learning_rate": 4.634528613831848e-05, + "loss": 3.343, + "step": 29306 + }, + { + "epoch": 0.17429703111618614, + "grad_norm": 3.1644763946533203, + "learning_rate": 4.6345042971324406e-05, + "loss": 3.4067, + "step": 29307 + }, + { + "epoch": 0.17430297839946712, + "grad_norm": 2.487724781036377, + "learning_rate": 4.6344799796878996e-05, + "loss": 3.226, + "step": 29308 + }, + { + "epoch": 0.1743089256827481, + "grad_norm": 2.340416431427002, + "learning_rate": 4.634455661498234e-05, + "loss": 3.2899, + "step": 29309 + }, + { + "epoch": 0.17431487296602913, + "grad_norm": 1.6526988744735718, + "learning_rate": 4.634431342563451e-05, + "loss": 4.9105, + "step": 29310 + }, + { + "epoch": 0.17432082024931012, + "grad_norm": 2.876229763031006, + "learning_rate": 4.6344070228835614e-05, + "loss": 3.4319, + "step": 29311 + }, + { + "epoch": 0.1743267675325911, + "grad_norm": 2.176748037338257, + "learning_rate": 4.6343827024585716e-05, + "loss": 3.2444, + "step": 29312 + }, + { + "epoch": 0.17433271481587212, + "grad_norm": 2.6688148975372314, + "learning_rate": 4.6343583812884904e-05, + "loss": 3.3417, + "step": 29313 + }, + { + "epoch": 0.1743386620991531, + "grad_norm": 2.5572376251220703, + "learning_rate": 4.634334059373328e-05, + "loss": 4.0048, + "step": 29314 + }, + { + "epoch": 0.1743446093824341, + "grad_norm": 2.3012688159942627, + "learning_rate": 4.6343097367130905e-05, + "loss": 3.5363, + "step": 29315 + }, + { + "epoch": 0.1743505566657151, + "grad_norm": 1.9870244264602661, + "learning_rate": 4.6342854133077875e-05, + "loss": 4.0843, + "step": 29316 + }, + { + "epoch": 0.1743565039489961, + "grad_norm": 2.538632392883301, + "learning_rate": 4.6342610891574276e-05, + "loss": 3.3337, + "step": 29317 + }, + { + "epoch": 0.1743624512322771, + "grad_norm": 2.8932511806488037, + "learning_rate": 4.63423676426202e-05, + "loss": 3.2887, + "step": 29318 + }, + { + "epoch": 0.1743683985155581, + "grad_norm": 2.55438494682312, + "learning_rate": 4.634212438621572e-05, + "loss": 3.6218, + "step": 29319 + }, + { + "epoch": 0.1743743457988391, + "grad_norm": 2.505047082901001, + "learning_rate": 4.634188112236092e-05, + "loss": 3.182, + "step": 29320 + }, + { + "epoch": 0.17438029308212008, + "grad_norm": 2.8068132400512695, + "learning_rate": 4.63416378510559e-05, + "loss": 3.5654, + "step": 29321 + }, + { + "epoch": 0.1743862403654011, + "grad_norm": 1.9296205043792725, + "learning_rate": 4.6341394572300725e-05, + "loss": 4.492, + "step": 29322 + }, + { + "epoch": 0.17439218764868208, + "grad_norm": 1.6537705659866333, + "learning_rate": 4.63411512860955e-05, + "loss": 5.0017, + "step": 29323 + }, + { + "epoch": 0.17439813493196307, + "grad_norm": 1.8064972162246704, + "learning_rate": 4.634090799244028e-05, + "loss": 4.5991, + "step": 29324 + }, + { + "epoch": 0.1744040822152441, + "grad_norm": 1.7944536209106445, + "learning_rate": 4.634066469133519e-05, + "loss": 4.6755, + "step": 29325 + }, + { + "epoch": 0.17441002949852508, + "grad_norm": 2.222592830657959, + "learning_rate": 4.6340421382780286e-05, + "loss": 4.3161, + "step": 29326 + }, + { + "epoch": 0.17441597678180606, + "grad_norm": 2.1058638095855713, + "learning_rate": 4.634017806677567e-05, + "loss": 4.077, + "step": 29327 + }, + { + "epoch": 0.17442192406508708, + "grad_norm": 1.8931814432144165, + "learning_rate": 4.63399347433214e-05, + "loss": 4.2838, + "step": 29328 + }, + { + "epoch": 0.17442787134836807, + "grad_norm": 1.7035942077636719, + "learning_rate": 4.6339691412417586e-05, + "loss": 4.8623, + "step": 29329 + }, + { + "epoch": 0.17443381863164906, + "grad_norm": 1.7701468467712402, + "learning_rate": 4.6339448074064314e-05, + "loss": 4.9063, + "step": 29330 + }, + { + "epoch": 0.17443976591493007, + "grad_norm": 1.7608574628829956, + "learning_rate": 4.633920472826165e-05, + "loss": 4.42, + "step": 29331 + }, + { + "epoch": 0.17444571319821106, + "grad_norm": 2.5129191875457764, + "learning_rate": 4.633896137500971e-05, + "loss": 3.2521, + "step": 29332 + }, + { + "epoch": 0.17445166048149205, + "grad_norm": 1.477378010749817, + "learning_rate": 4.6338718014308534e-05, + "loss": 5.6654, + "step": 29333 + }, + { + "epoch": 0.17445760776477306, + "grad_norm": 1.6242940425872803, + "learning_rate": 4.633847464615825e-05, + "loss": 5.392, + "step": 29334 + }, + { + "epoch": 0.17446355504805405, + "grad_norm": 1.827919602394104, + "learning_rate": 4.633823127055892e-05, + "loss": 4.8818, + "step": 29335 + }, + { + "epoch": 0.17446950233133504, + "grad_norm": 1.6197007894515991, + "learning_rate": 4.633798788751063e-05, + "loss": 4.983, + "step": 29336 + }, + { + "epoch": 0.17447544961461606, + "grad_norm": 1.703899621963501, + "learning_rate": 4.633774449701347e-05, + "loss": 4.9122, + "step": 29337 + }, + { + "epoch": 0.17448139689789705, + "grad_norm": 1.7812259197235107, + "learning_rate": 4.633750109906753e-05, + "loss": 4.6429, + "step": 29338 + }, + { + "epoch": 0.17448734418117803, + "grad_norm": 1.6351381540298462, + "learning_rate": 4.633725769367288e-05, + "loss": 4.9078, + "step": 29339 + }, + { + "epoch": 0.17449329146445902, + "grad_norm": 1.7403061389923096, + "learning_rate": 4.633701428082962e-05, + "loss": 4.6946, + "step": 29340 + }, + { + "epoch": 0.17449923874774004, + "grad_norm": 1.8006681203842163, + "learning_rate": 4.633677086053783e-05, + "loss": 4.2984, + "step": 29341 + }, + { + "epoch": 0.17450518603102103, + "grad_norm": 1.7105704545974731, + "learning_rate": 4.633652743279759e-05, + "loss": 4.4426, + "step": 29342 + }, + { + "epoch": 0.174511133314302, + "grad_norm": 1.7440415620803833, + "learning_rate": 4.6336283997608984e-05, + "loss": 4.4029, + "step": 29343 + }, + { + "epoch": 0.17451708059758303, + "grad_norm": 1.7197996377944946, + "learning_rate": 4.633604055497211e-05, + "loss": 4.263, + "step": 29344 + }, + { + "epoch": 0.17452302788086402, + "grad_norm": 1.7282319068908691, + "learning_rate": 4.633579710488704e-05, + "loss": 4.546, + "step": 29345 + }, + { + "epoch": 0.174528975164145, + "grad_norm": 1.7449449300765991, + "learning_rate": 4.633555364735387e-05, + "loss": 5.1083, + "step": 29346 + }, + { + "epoch": 0.17453492244742602, + "grad_norm": 1.645507574081421, + "learning_rate": 4.633531018237267e-05, + "loss": 4.1636, + "step": 29347 + }, + { + "epoch": 0.174540869730707, + "grad_norm": 1.671286702156067, + "learning_rate": 4.6335066709943534e-05, + "loss": 4.5991, + "step": 29348 + }, + { + "epoch": 0.174546817013988, + "grad_norm": 1.5074694156646729, + "learning_rate": 4.6334823230066554e-05, + "loss": 4.5064, + "step": 29349 + }, + { + "epoch": 0.17455276429726901, + "grad_norm": 1.7285078763961792, + "learning_rate": 4.63345797427418e-05, + "loss": 4.561, + "step": 29350 + }, + { + "epoch": 0.17455871158055, + "grad_norm": 1.9212089776992798, + "learning_rate": 4.6334336247969376e-05, + "loss": 4.2444, + "step": 29351 + }, + { + "epoch": 0.174564658863831, + "grad_norm": 1.6223878860473633, + "learning_rate": 4.633409274574935e-05, + "loss": 4.8405, + "step": 29352 + }, + { + "epoch": 0.174570606147112, + "grad_norm": 1.7474267482757568, + "learning_rate": 4.6333849236081805e-05, + "loss": 4.5651, + "step": 29353 + }, + { + "epoch": 0.174576553430393, + "grad_norm": 1.6735780239105225, + "learning_rate": 4.6333605718966844e-05, + "loss": 4.1536, + "step": 29354 + }, + { + "epoch": 0.17458250071367398, + "grad_norm": 1.7096998691558838, + "learning_rate": 4.633336219440453e-05, + "loss": 4.4034, + "step": 29355 + }, + { + "epoch": 0.174588447996955, + "grad_norm": 1.7881802320480347, + "learning_rate": 4.633311866239497e-05, + "loss": 4.308, + "step": 29356 + }, + { + "epoch": 0.174594395280236, + "grad_norm": 1.4776397943496704, + "learning_rate": 4.6332875122938236e-05, + "loss": 5.1879, + "step": 29357 + }, + { + "epoch": 0.17460034256351697, + "grad_norm": 1.499626636505127, + "learning_rate": 4.6332631576034414e-05, + "loss": 5.1217, + "step": 29358 + }, + { + "epoch": 0.174606289846798, + "grad_norm": 1.5779564380645752, + "learning_rate": 4.6332388021683594e-05, + "loss": 5.1155, + "step": 29359 + }, + { + "epoch": 0.17461223713007898, + "grad_norm": 1.5778738260269165, + "learning_rate": 4.633214445988585e-05, + "loss": 5.0889, + "step": 29360 + }, + { + "epoch": 0.17461818441335997, + "grad_norm": 1.4342097043991089, + "learning_rate": 4.633190089064128e-05, + "loss": 5.1313, + "step": 29361 + }, + { + "epoch": 0.17462413169664098, + "grad_norm": 1.977306604385376, + "learning_rate": 4.6331657313949975e-05, + "loss": 4.3384, + "step": 29362 + }, + { + "epoch": 0.17463007897992197, + "grad_norm": 1.7359813451766968, + "learning_rate": 4.633141372981199e-05, + "loss": 4.9874, + "step": 29363 + }, + { + "epoch": 0.17463602626320296, + "grad_norm": 1.5922671556472778, + "learning_rate": 4.6331170138227435e-05, + "loss": 5.1194, + "step": 29364 + }, + { + "epoch": 0.17464197354648398, + "grad_norm": 1.8139041662216187, + "learning_rate": 4.63309265391964e-05, + "loss": 5.0046, + "step": 29365 + }, + { + "epoch": 0.17464792082976496, + "grad_norm": 1.6782366037368774, + "learning_rate": 4.633068293271895e-05, + "loss": 5.056, + "step": 29366 + }, + { + "epoch": 0.17465386811304595, + "grad_norm": 1.6051324605941772, + "learning_rate": 4.6330439318795174e-05, + "loss": 5.1002, + "step": 29367 + }, + { + "epoch": 0.17465981539632697, + "grad_norm": 1.6109590530395508, + "learning_rate": 4.633019569742517e-05, + "loss": 4.9802, + "step": 29368 + }, + { + "epoch": 0.17466576267960796, + "grad_norm": 1.5063222646713257, + "learning_rate": 4.6329952068609005e-05, + "loss": 5.5857, + "step": 29369 + }, + { + "epoch": 0.17467170996288894, + "grad_norm": 1.6874276399612427, + "learning_rate": 4.632970843234678e-05, + "loss": 5.161, + "step": 29370 + }, + { + "epoch": 0.17467765724616996, + "grad_norm": 1.8858634233474731, + "learning_rate": 4.6329464788638576e-05, + "loss": 4.6397, + "step": 29371 + }, + { + "epoch": 0.17468360452945095, + "grad_norm": 2.004140853881836, + "learning_rate": 4.632922113748447e-05, + "loss": 4.5306, + "step": 29372 + }, + { + "epoch": 0.17468955181273194, + "grad_norm": 1.278494954109192, + "learning_rate": 4.632897747888456e-05, + "loss": 5.032, + "step": 29373 + }, + { + "epoch": 0.17469549909601295, + "grad_norm": 1.7012786865234375, + "learning_rate": 4.6328733812838925e-05, + "loss": 5.1362, + "step": 29374 + }, + { + "epoch": 0.17470144637929394, + "grad_norm": 1.6155195236206055, + "learning_rate": 4.632849013934765e-05, + "loss": 5.4839, + "step": 29375 + }, + { + "epoch": 0.17470739366257493, + "grad_norm": 1.3312060832977295, + "learning_rate": 4.6328246458410816e-05, + "loss": 5.521, + "step": 29376 + }, + { + "epoch": 0.17471334094585594, + "grad_norm": 1.6347986459732056, + "learning_rate": 4.632800277002851e-05, + "loss": 5.1883, + "step": 29377 + }, + { + "epoch": 0.17471928822913693, + "grad_norm": 1.6213163137435913, + "learning_rate": 4.632775907420082e-05, + "loss": 5.1724, + "step": 29378 + }, + { + "epoch": 0.17472523551241792, + "grad_norm": 2.0514700412750244, + "learning_rate": 4.632751537092783e-05, + "loss": 3.6934, + "step": 29379 + }, + { + "epoch": 0.17473118279569894, + "grad_norm": 1.4713187217712402, + "learning_rate": 4.6327271660209626e-05, + "loss": 4.7456, + "step": 29380 + }, + { + "epoch": 0.17473713007897992, + "grad_norm": 1.5584750175476074, + "learning_rate": 4.6327027942046286e-05, + "loss": 5.0259, + "step": 29381 + }, + { + "epoch": 0.1747430773622609, + "grad_norm": 1.7405140399932861, + "learning_rate": 4.632678421643791e-05, + "loss": 5.1115, + "step": 29382 + }, + { + "epoch": 0.17474902464554193, + "grad_norm": 1.7233058214187622, + "learning_rate": 4.632654048338457e-05, + "loss": 5.3849, + "step": 29383 + }, + { + "epoch": 0.17475497192882292, + "grad_norm": 1.7387725114822388, + "learning_rate": 4.6326296742886356e-05, + "loss": 5.4367, + "step": 29384 + }, + { + "epoch": 0.1747609192121039, + "grad_norm": 1.7022291421890259, + "learning_rate": 4.632605299494335e-05, + "loss": 5.1317, + "step": 29385 + }, + { + "epoch": 0.17476686649538492, + "grad_norm": 1.7683387994766235, + "learning_rate": 4.632580923955564e-05, + "loss": 4.4575, + "step": 29386 + }, + { + "epoch": 0.1747728137786659, + "grad_norm": 1.4611074924468994, + "learning_rate": 4.632556547672331e-05, + "loss": 4.7676, + "step": 29387 + }, + { + "epoch": 0.1747787610619469, + "grad_norm": 1.9123033285140991, + "learning_rate": 4.632532170644644e-05, + "loss": 4.966, + "step": 29388 + }, + { + "epoch": 0.1747847083452279, + "grad_norm": 1.857445478439331, + "learning_rate": 4.632507792872513e-05, + "loss": 4.0338, + "step": 29389 + }, + { + "epoch": 0.1747906556285089, + "grad_norm": 2.620339870452881, + "learning_rate": 4.632483414355945e-05, + "loss": 3.4506, + "step": 29390 + }, + { + "epoch": 0.1747966029117899, + "grad_norm": 2.141939401626587, + "learning_rate": 4.6324590350949494e-05, + "loss": 4.516, + "step": 29391 + }, + { + "epoch": 0.1748025501950709, + "grad_norm": 1.5560227632522583, + "learning_rate": 4.632434655089535e-05, + "loss": 4.8785, + "step": 29392 + }, + { + "epoch": 0.1748084974783519, + "grad_norm": 1.640221357345581, + "learning_rate": 4.632410274339708e-05, + "loss": 4.9614, + "step": 29393 + }, + { + "epoch": 0.17481444476163288, + "grad_norm": 1.6104960441589355, + "learning_rate": 4.63238589284548e-05, + "loss": 4.7536, + "step": 29394 + }, + { + "epoch": 0.1748203920449139, + "grad_norm": 1.599259853363037, + "learning_rate": 4.6323615106068575e-05, + "loss": 5.0939, + "step": 29395 + }, + { + "epoch": 0.17482633932819489, + "grad_norm": 1.630430817604065, + "learning_rate": 4.6323371276238496e-05, + "loss": 4.8851, + "step": 29396 + }, + { + "epoch": 0.17483228661147587, + "grad_norm": 1.6281993389129639, + "learning_rate": 4.632312743896465e-05, + "loss": 4.8152, + "step": 29397 + }, + { + "epoch": 0.17483823389475686, + "grad_norm": 1.7055253982543945, + "learning_rate": 4.632288359424712e-05, + "loss": 4.2515, + "step": 29398 + }, + { + "epoch": 0.17484418117803788, + "grad_norm": 1.739365577697754, + "learning_rate": 4.6322639742085995e-05, + "loss": 4.5137, + "step": 29399 + }, + { + "epoch": 0.17485012846131887, + "grad_norm": 1.7686853408813477, + "learning_rate": 4.632239588248135e-05, + "loss": 5.307, + "step": 29400 + }, + { + "epoch": 0.17485607574459985, + "grad_norm": 1.369730830192566, + "learning_rate": 4.632215201543328e-05, + "loss": 5.3096, + "step": 29401 + }, + { + "epoch": 0.17486202302788087, + "grad_norm": 1.6965676546096802, + "learning_rate": 4.6321908140941874e-05, + "loss": 4.9252, + "step": 29402 + }, + { + "epoch": 0.17486797031116186, + "grad_norm": 1.797540307044983, + "learning_rate": 4.63216642590072e-05, + "loss": 4.4397, + "step": 29403 + }, + { + "epoch": 0.17487391759444285, + "grad_norm": 1.7250994443893433, + "learning_rate": 4.632142036962936e-05, + "loss": 4.4416, + "step": 29404 + }, + { + "epoch": 0.17487986487772386, + "grad_norm": 1.649828314781189, + "learning_rate": 4.632117647280843e-05, + "loss": 4.4497, + "step": 29405 + }, + { + "epoch": 0.17488581216100485, + "grad_norm": 1.7073628902435303, + "learning_rate": 4.632093256854449e-05, + "loss": 4.3074, + "step": 29406 + }, + { + "epoch": 0.17489175944428584, + "grad_norm": 1.6241555213928223, + "learning_rate": 4.632068865683765e-05, + "loss": 4.1219, + "step": 29407 + }, + { + "epoch": 0.17489770672756685, + "grad_norm": 1.356092929840088, + "learning_rate": 4.6320444737687965e-05, + "loss": 4.5548, + "step": 29408 + }, + { + "epoch": 0.17490365401084784, + "grad_norm": 1.5094983577728271, + "learning_rate": 4.632020081109554e-05, + "loss": 5.0598, + "step": 29409 + }, + { + "epoch": 0.17490960129412883, + "grad_norm": 1.596183180809021, + "learning_rate": 4.6319956877060445e-05, + "loss": 5.0795, + "step": 29410 + }, + { + "epoch": 0.17491554857740985, + "grad_norm": 1.7887545824050903, + "learning_rate": 4.6319712935582784e-05, + "loss": 4.9287, + "step": 29411 + }, + { + "epoch": 0.17492149586069083, + "grad_norm": 1.4806302785873413, + "learning_rate": 4.631946898666262e-05, + "loss": 5.0627, + "step": 29412 + }, + { + "epoch": 0.17492744314397182, + "grad_norm": 1.5581897497177124, + "learning_rate": 4.631922503030005e-05, + "loss": 5.2001, + "step": 29413 + }, + { + "epoch": 0.17493339042725284, + "grad_norm": 1.614473819732666, + "learning_rate": 4.631898106649517e-05, + "loss": 4.396, + "step": 29414 + }, + { + "epoch": 0.17493933771053383, + "grad_norm": 1.9394686222076416, + "learning_rate": 4.6318737095248044e-05, + "loss": 3.9614, + "step": 29415 + }, + { + "epoch": 0.17494528499381481, + "grad_norm": 1.6874741315841675, + "learning_rate": 4.631849311655877e-05, + "loss": 4.4714, + "step": 29416 + }, + { + "epoch": 0.17495123227709583, + "grad_norm": 1.8840105533599854, + "learning_rate": 4.6318249130427435e-05, + "loss": 4.51, + "step": 29417 + }, + { + "epoch": 0.17495717956037682, + "grad_norm": 1.7205270528793335, + "learning_rate": 4.631800513685412e-05, + "loss": 4.554, + "step": 29418 + }, + { + "epoch": 0.1749631268436578, + "grad_norm": 1.449798583984375, + "learning_rate": 4.6317761135838896e-05, + "loss": 5.0114, + "step": 29419 + }, + { + "epoch": 0.17496907412693882, + "grad_norm": 1.6449236869812012, + "learning_rate": 4.631751712738187e-05, + "loss": 5.7704, + "step": 29420 + }, + { + "epoch": 0.1749750214102198, + "grad_norm": 1.5362746715545654, + "learning_rate": 4.631727311148312e-05, + "loss": 5.6398, + "step": 29421 + }, + { + "epoch": 0.1749809686935008, + "grad_norm": 1.6383920907974243, + "learning_rate": 4.6317029088142726e-05, + "loss": 5.2901, + "step": 29422 + }, + { + "epoch": 0.17498691597678181, + "grad_norm": 1.8682830333709717, + "learning_rate": 4.631678505736079e-05, + "loss": 4.2822, + "step": 29423 + }, + { + "epoch": 0.1749928632600628, + "grad_norm": 1.9640558958053589, + "learning_rate": 4.631654101913737e-05, + "loss": 4.121, + "step": 29424 + }, + { + "epoch": 0.1749988105433438, + "grad_norm": 1.569744348526001, + "learning_rate": 4.6316296973472576e-05, + "loss": 4.3937, + "step": 29425 + }, + { + "epoch": 0.1750047578266248, + "grad_norm": 1.524356484413147, + "learning_rate": 4.6316052920366475e-05, + "loss": 4.8107, + "step": 29426 + }, + { + "epoch": 0.1750107051099058, + "grad_norm": 1.7055494785308838, + "learning_rate": 4.6315808859819164e-05, + "loss": 4.8751, + "step": 29427 + }, + { + "epoch": 0.17501665239318678, + "grad_norm": 1.683262586593628, + "learning_rate": 4.631556479183072e-05, + "loss": 5.4053, + "step": 29428 + }, + { + "epoch": 0.1750225996764678, + "grad_norm": 1.7124066352844238, + "learning_rate": 4.6315320716401244e-05, + "loss": 5.0109, + "step": 29429 + }, + { + "epoch": 0.1750285469597488, + "grad_norm": 1.6951466798782349, + "learning_rate": 4.63150766335308e-05, + "loss": 5.4747, + "step": 29430 + }, + { + "epoch": 0.17503449424302978, + "grad_norm": 1.5457607507705688, + "learning_rate": 4.631483254321949e-05, + "loss": 4.8729, + "step": 29431 + }, + { + "epoch": 0.1750404415263108, + "grad_norm": 1.5366050004959106, + "learning_rate": 4.6314588445467386e-05, + "loss": 5.0268, + "step": 29432 + }, + { + "epoch": 0.17504638880959178, + "grad_norm": 1.6533615589141846, + "learning_rate": 4.6314344340274573e-05, + "loss": 4.7626, + "step": 29433 + }, + { + "epoch": 0.17505233609287277, + "grad_norm": 1.559486746788025, + "learning_rate": 4.631410022764115e-05, + "loss": 5.0673, + "step": 29434 + }, + { + "epoch": 0.17505828337615378, + "grad_norm": 1.534456729888916, + "learning_rate": 4.63138561075672e-05, + "loss": 5.5142, + "step": 29435 + }, + { + "epoch": 0.17506423065943477, + "grad_norm": 1.641667366027832, + "learning_rate": 4.63136119800528e-05, + "loss": 4.7032, + "step": 29436 + }, + { + "epoch": 0.17507017794271576, + "grad_norm": 1.4128551483154297, + "learning_rate": 4.631336784509803e-05, + "loss": 4.8777, + "step": 29437 + }, + { + "epoch": 0.17507612522599678, + "grad_norm": 1.4912710189819336, + "learning_rate": 4.6313123702703e-05, + "loss": 4.866, + "step": 29438 + }, + { + "epoch": 0.17508207250927776, + "grad_norm": 1.381341576576233, + "learning_rate": 4.631287955286776e-05, + "loss": 4.6116, + "step": 29439 + }, + { + "epoch": 0.17508801979255875, + "grad_norm": 1.4270753860473633, + "learning_rate": 4.631263539559243e-05, + "loss": 5.0519, + "step": 29440 + }, + { + "epoch": 0.17509396707583977, + "grad_norm": 1.4962128400802612, + "learning_rate": 4.6312391230877074e-05, + "loss": 4.6934, + "step": 29441 + }, + { + "epoch": 0.17509991435912076, + "grad_norm": 1.3959366083145142, + "learning_rate": 4.631214705872178e-05, + "loss": 4.9172, + "step": 29442 + }, + { + "epoch": 0.17510586164240174, + "grad_norm": 1.5014355182647705, + "learning_rate": 4.631190287912663e-05, + "loss": 4.8429, + "step": 29443 + }, + { + "epoch": 0.17511180892568276, + "grad_norm": 1.584879994392395, + "learning_rate": 4.631165869209172e-05, + "loss": 5.1186, + "step": 29444 + }, + { + "epoch": 0.17511775620896375, + "grad_norm": 1.6547553539276123, + "learning_rate": 4.6311414497617135e-05, + "loss": 4.9739, + "step": 29445 + }, + { + "epoch": 0.17512370349224474, + "grad_norm": 1.4584704637527466, + "learning_rate": 4.631117029570295e-05, + "loss": 4.927, + "step": 29446 + }, + { + "epoch": 0.17512965077552575, + "grad_norm": 1.5092477798461914, + "learning_rate": 4.631092608634926e-05, + "loss": 4.9163, + "step": 29447 + }, + { + "epoch": 0.17513559805880674, + "grad_norm": 1.466023564338684, + "learning_rate": 4.631068186955614e-05, + "loss": 4.9867, + "step": 29448 + }, + { + "epoch": 0.17514154534208773, + "grad_norm": 1.8561779260635376, + "learning_rate": 4.6310437645323676e-05, + "loss": 4.6118, + "step": 29449 + }, + { + "epoch": 0.17514749262536874, + "grad_norm": 2.27844500541687, + "learning_rate": 4.631019341365197e-05, + "loss": 4.4978, + "step": 29450 + }, + { + "epoch": 0.17515343990864973, + "grad_norm": 1.7874199151992798, + "learning_rate": 4.6309949174541096e-05, + "loss": 3.7357, + "step": 29451 + }, + { + "epoch": 0.17515938719193072, + "grad_norm": 1.6950316429138184, + "learning_rate": 4.6309704927991136e-05, + "loss": 4.1866, + "step": 29452 + }, + { + "epoch": 0.17516533447521174, + "grad_norm": 1.6692928075790405, + "learning_rate": 4.630946067400217e-05, + "loss": 3.9566, + "step": 29453 + }, + { + "epoch": 0.17517128175849273, + "grad_norm": 1.680684208869934, + "learning_rate": 4.63092164125743e-05, + "loss": 4.0473, + "step": 29454 + }, + { + "epoch": 0.1751772290417737, + "grad_norm": 1.7636792659759521, + "learning_rate": 4.6308972143707606e-05, + "loss": 4.161, + "step": 29455 + }, + { + "epoch": 0.1751831763250547, + "grad_norm": 1.7277029752731323, + "learning_rate": 4.6308727867402165e-05, + "loss": 4.6943, + "step": 29456 + }, + { + "epoch": 0.17518912360833572, + "grad_norm": 1.7087599039077759, + "learning_rate": 4.630848358365807e-05, + "loss": 4.9239, + "step": 29457 + }, + { + "epoch": 0.1751950708916167, + "grad_norm": 1.8207015991210938, + "learning_rate": 4.63082392924754e-05, + "loss": 4.8358, + "step": 29458 + }, + { + "epoch": 0.1752010181748977, + "grad_norm": 1.9595861434936523, + "learning_rate": 4.6307994993854245e-05, + "loss": 4.3975, + "step": 29459 + }, + { + "epoch": 0.1752069654581787, + "grad_norm": 2.330233335494995, + "learning_rate": 4.630775068779469e-05, + "loss": 3.9516, + "step": 29460 + }, + { + "epoch": 0.1752129127414597, + "grad_norm": 1.801896572113037, + "learning_rate": 4.630750637429682e-05, + "loss": 4.3272, + "step": 29461 + }, + { + "epoch": 0.17521886002474069, + "grad_norm": 1.8079783916473389, + "learning_rate": 4.630726205336071e-05, + "loss": 4.4698, + "step": 29462 + }, + { + "epoch": 0.1752248073080217, + "grad_norm": 1.7742640972137451, + "learning_rate": 4.6307017724986466e-05, + "loss": 4.5466, + "step": 29463 + }, + { + "epoch": 0.1752307545913027, + "grad_norm": 1.5979267358779907, + "learning_rate": 4.6306773389174154e-05, + "loss": 4.497, + "step": 29464 + }, + { + "epoch": 0.17523670187458368, + "grad_norm": 1.6667109727859497, + "learning_rate": 4.630652904592388e-05, + "loss": 5.338, + "step": 29465 + }, + { + "epoch": 0.1752426491578647, + "grad_norm": 1.5170248746871948, + "learning_rate": 4.63062846952357e-05, + "loss": 4.6994, + "step": 29466 + }, + { + "epoch": 0.17524859644114568, + "grad_norm": 1.597468376159668, + "learning_rate": 4.630604033710974e-05, + "loss": 4.1865, + "step": 29467 + }, + { + "epoch": 0.17525454372442667, + "grad_norm": 1.638096809387207, + "learning_rate": 4.630579597154604e-05, + "loss": 4.2936, + "step": 29468 + }, + { + "epoch": 0.17526049100770769, + "grad_norm": 1.5512175559997559, + "learning_rate": 4.630555159854472e-05, + "loss": 4.6191, + "step": 29469 + }, + { + "epoch": 0.17526643829098867, + "grad_norm": 1.57890784740448, + "learning_rate": 4.630530721810584e-05, + "loss": 4.9381, + "step": 29470 + }, + { + "epoch": 0.17527238557426966, + "grad_norm": 1.7156378030776978, + "learning_rate": 4.63050628302295e-05, + "loss": 5.022, + "step": 29471 + }, + { + "epoch": 0.17527833285755068, + "grad_norm": 1.6688953638076782, + "learning_rate": 4.630481843491579e-05, + "loss": 4.5509, + "step": 29472 + }, + { + "epoch": 0.17528428014083167, + "grad_norm": 1.835450530052185, + "learning_rate": 4.630457403216478e-05, + "loss": 4.6413, + "step": 29473 + }, + { + "epoch": 0.17529022742411265, + "grad_norm": 1.2935006618499756, + "learning_rate": 4.6304329621976574e-05, + "loss": 4.9823, + "step": 29474 + }, + { + "epoch": 0.17529617470739367, + "grad_norm": 2.152981758117676, + "learning_rate": 4.6304085204351234e-05, + "loss": 4.6183, + "step": 29475 + }, + { + "epoch": 0.17530212199067466, + "grad_norm": 1.6258760690689087, + "learning_rate": 4.630384077928886e-05, + "loss": 4.9874, + "step": 29476 + }, + { + "epoch": 0.17530806927395565, + "grad_norm": 1.6755950450897217, + "learning_rate": 4.630359634678954e-05, + "loss": 5.089, + "step": 29477 + }, + { + "epoch": 0.17531401655723666, + "grad_norm": 1.7208611965179443, + "learning_rate": 4.6303351906853355e-05, + "loss": 5.3393, + "step": 29478 + }, + { + "epoch": 0.17531996384051765, + "grad_norm": 1.5461162328720093, + "learning_rate": 4.630310745948039e-05, + "loss": 5.2263, + "step": 29479 + }, + { + "epoch": 0.17532591112379864, + "grad_norm": 1.9592080116271973, + "learning_rate": 4.630286300467073e-05, + "loss": 4.1235, + "step": 29480 + }, + { + "epoch": 0.17533185840707965, + "grad_norm": 1.8409465551376343, + "learning_rate": 4.630261854242446e-05, + "loss": 4.8235, + "step": 29481 + }, + { + "epoch": 0.17533780569036064, + "grad_norm": 1.6198770999908447, + "learning_rate": 4.630237407274166e-05, + "loss": 5.5198, + "step": 29482 + }, + { + "epoch": 0.17534375297364163, + "grad_norm": 1.692572832107544, + "learning_rate": 4.630212959562243e-05, + "loss": 4.8526, + "step": 29483 + }, + { + "epoch": 0.17534970025692265, + "grad_norm": 1.7479051351547241, + "learning_rate": 4.6301885111066847e-05, + "loss": 4.8774, + "step": 29484 + }, + { + "epoch": 0.17535564754020364, + "grad_norm": 2.0946943759918213, + "learning_rate": 4.630164061907499e-05, + "loss": 4.4918, + "step": 29485 + }, + { + "epoch": 0.17536159482348462, + "grad_norm": 1.702415943145752, + "learning_rate": 4.6301396119646954e-05, + "loss": 4.424, + "step": 29486 + }, + { + "epoch": 0.17536754210676564, + "grad_norm": 1.4786335229873657, + "learning_rate": 4.630115161278282e-05, + "loss": 5.5655, + "step": 29487 + }, + { + "epoch": 0.17537348939004663, + "grad_norm": 1.5471251010894775, + "learning_rate": 4.630090709848267e-05, + "loss": 5.2839, + "step": 29488 + }, + { + "epoch": 0.17537943667332762, + "grad_norm": 1.8128043413162231, + "learning_rate": 4.6300662576746595e-05, + "loss": 4.7968, + "step": 29489 + }, + { + "epoch": 0.17538538395660863, + "grad_norm": 1.6280453205108643, + "learning_rate": 4.630041804757469e-05, + "loss": 4.7266, + "step": 29490 + }, + { + "epoch": 0.17539133123988962, + "grad_norm": 1.6138848066329956, + "learning_rate": 4.6300173510967015e-05, + "loss": 4.3718, + "step": 29491 + }, + { + "epoch": 0.1753972785231706, + "grad_norm": 1.6392838954925537, + "learning_rate": 4.6299928966923675e-05, + "loss": 4.7491, + "step": 29492 + }, + { + "epoch": 0.17540322580645162, + "grad_norm": 1.722277283668518, + "learning_rate": 4.629968441544475e-05, + "loss": 4.4053, + "step": 29493 + }, + { + "epoch": 0.1754091730897326, + "grad_norm": 1.4803645610809326, + "learning_rate": 4.629943985653032e-05, + "loss": 4.5624, + "step": 29494 + }, + { + "epoch": 0.1754151203730136, + "grad_norm": 1.696871042251587, + "learning_rate": 4.629919529018048e-05, + "loss": 4.2274, + "step": 29495 + }, + { + "epoch": 0.17542106765629462, + "grad_norm": 2.0104081630706787, + "learning_rate": 4.629895071639531e-05, + "loss": 4.954, + "step": 29496 + }, + { + "epoch": 0.1754270149395756, + "grad_norm": 1.91762113571167, + "learning_rate": 4.62987061351749e-05, + "loss": 4.5869, + "step": 29497 + }, + { + "epoch": 0.1754329622228566, + "grad_norm": 2.0672197341918945, + "learning_rate": 4.629846154651932e-05, + "loss": 4.3838, + "step": 29498 + }, + { + "epoch": 0.1754389095061376, + "grad_norm": 1.9841183423995972, + "learning_rate": 4.629821695042869e-05, + "loss": 5.2067, + "step": 29499 + }, + { + "epoch": 0.1754448567894186, + "grad_norm": 1.850253701210022, + "learning_rate": 4.6297972346903055e-05, + "loss": 4.7302, + "step": 29500 + }, + { + "epoch": 0.17545080407269958, + "grad_norm": 1.4990947246551514, + "learning_rate": 4.629772773594252e-05, + "loss": 4.9005, + "step": 29501 + }, + { + "epoch": 0.1754567513559806, + "grad_norm": 1.5953363180160522, + "learning_rate": 4.629748311754717e-05, + "loss": 4.9025, + "step": 29502 + }, + { + "epoch": 0.1754626986392616, + "grad_norm": 1.5136396884918213, + "learning_rate": 4.6297238491717085e-05, + "loss": 4.835, + "step": 29503 + }, + { + "epoch": 0.17546864592254258, + "grad_norm": 1.7335329055786133, + "learning_rate": 4.6296993858452356e-05, + "loss": 4.7231, + "step": 29504 + }, + { + "epoch": 0.1754745932058236, + "grad_norm": 1.5969070196151733, + "learning_rate": 4.629674921775307e-05, + "loss": 4.7903, + "step": 29505 + }, + { + "epoch": 0.17548054048910458, + "grad_norm": 1.7393018007278442, + "learning_rate": 4.62965045696193e-05, + "loss": 5.2468, + "step": 29506 + }, + { + "epoch": 0.17548648777238557, + "grad_norm": 1.4993494749069214, + "learning_rate": 4.629625991405116e-05, + "loss": 5.0639, + "step": 29507 + }, + { + "epoch": 0.17549243505566658, + "grad_norm": 1.559507966041565, + "learning_rate": 4.62960152510487e-05, + "loss": 5.2718, + "step": 29508 + }, + { + "epoch": 0.17549838233894757, + "grad_norm": 1.6528722047805786, + "learning_rate": 4.629577058061202e-05, + "loss": 5.0881, + "step": 29509 + }, + { + "epoch": 0.17550432962222856, + "grad_norm": 1.5357880592346191, + "learning_rate": 4.629552590274121e-05, + "loss": 4.5841, + "step": 29510 + }, + { + "epoch": 0.17551027690550958, + "grad_norm": 1.7293065786361694, + "learning_rate": 4.629528121743635e-05, + "loss": 4.6718, + "step": 29511 + }, + { + "epoch": 0.17551622418879056, + "grad_norm": 2.699164390563965, + "learning_rate": 4.6295036524697536e-05, + "loss": 4.1491, + "step": 29512 + }, + { + "epoch": 0.17552217147207155, + "grad_norm": 1.5221933126449585, + "learning_rate": 4.629479182452483e-05, + "loss": 4.8606, + "step": 29513 + }, + { + "epoch": 0.17552811875535254, + "grad_norm": 1.5474234819412231, + "learning_rate": 4.629454711691835e-05, + "loss": 4.7198, + "step": 29514 + }, + { + "epoch": 0.17553406603863356, + "grad_norm": 1.5748153924942017, + "learning_rate": 4.629430240187816e-05, + "loss": 4.9429, + "step": 29515 + }, + { + "epoch": 0.17554001332191455, + "grad_norm": 1.5812437534332275, + "learning_rate": 4.629405767940434e-05, + "loss": 4.7219, + "step": 29516 + }, + { + "epoch": 0.17554596060519553, + "grad_norm": 1.572482943534851, + "learning_rate": 4.629381294949698e-05, + "loss": 4.9071, + "step": 29517 + }, + { + "epoch": 0.17555190788847655, + "grad_norm": 1.8683935403823853, + "learning_rate": 4.629356821215618e-05, + "loss": 4.539, + "step": 29518 + }, + { + "epoch": 0.17555785517175754, + "grad_norm": 3.200904607772827, + "learning_rate": 4.629332346738201e-05, + "loss": 4.2734, + "step": 29519 + }, + { + "epoch": 0.17556380245503853, + "grad_norm": 2.051896572113037, + "learning_rate": 4.629307871517457e-05, + "loss": 5.0986, + "step": 29520 + }, + { + "epoch": 0.17556974973831954, + "grad_norm": 1.7927826642990112, + "learning_rate": 4.6292833955533926e-05, + "loss": 4.6581, + "step": 29521 + }, + { + "epoch": 0.17557569702160053, + "grad_norm": 1.6184303760528564, + "learning_rate": 4.629258918846018e-05, + "loss": 4.8106, + "step": 29522 + }, + { + "epoch": 0.17558164430488152, + "grad_norm": 1.4969747066497803, + "learning_rate": 4.62923444139534e-05, + "loss": 5.2787, + "step": 29523 + }, + { + "epoch": 0.17558759158816253, + "grad_norm": 1.471805214881897, + "learning_rate": 4.6292099632013695e-05, + "loss": 5.3599, + "step": 29524 + }, + { + "epoch": 0.17559353887144352, + "grad_norm": 1.3968273401260376, + "learning_rate": 4.629185484264113e-05, + "loss": 4.9754, + "step": 29525 + }, + { + "epoch": 0.1755994861547245, + "grad_norm": 1.627172589302063, + "learning_rate": 4.629161004583581e-05, + "loss": 4.3703, + "step": 29526 + }, + { + "epoch": 0.17560543343800553, + "grad_norm": 1.5334340333938599, + "learning_rate": 4.62913652415978e-05, + "loss": 4.8447, + "step": 29527 + }, + { + "epoch": 0.17561138072128651, + "grad_norm": 1.552454948425293, + "learning_rate": 4.6291120429927194e-05, + "loss": 4.823, + "step": 29528 + }, + { + "epoch": 0.1756173280045675, + "grad_norm": 1.4378019571304321, + "learning_rate": 4.629087561082408e-05, + "loss": 5.019, + "step": 29529 + }, + { + "epoch": 0.17562327528784852, + "grad_norm": 1.513752818107605, + "learning_rate": 4.6290630784288544e-05, + "loss": 4.7146, + "step": 29530 + }, + { + "epoch": 0.1756292225711295, + "grad_norm": 1.5130308866500854, + "learning_rate": 4.629038595032066e-05, + "loss": 4.5687, + "step": 29531 + }, + { + "epoch": 0.1756351698544105, + "grad_norm": 1.6177191734313965, + "learning_rate": 4.6290141108920534e-05, + "loss": 4.49, + "step": 29532 + }, + { + "epoch": 0.1756411171376915, + "grad_norm": 1.6133641004562378, + "learning_rate": 4.628989626008823e-05, + "loss": 4.6966, + "step": 29533 + }, + { + "epoch": 0.1756470644209725, + "grad_norm": 1.5740238428115845, + "learning_rate": 4.628965140382385e-05, + "loss": 4.8149, + "step": 29534 + }, + { + "epoch": 0.1756530117042535, + "grad_norm": 1.4787334203720093, + "learning_rate": 4.6289406540127466e-05, + "loss": 4.7759, + "step": 29535 + }, + { + "epoch": 0.1756589589875345, + "grad_norm": 1.5558816194534302, + "learning_rate": 4.628916166899917e-05, + "loss": 5.0831, + "step": 29536 + }, + { + "epoch": 0.1756649062708155, + "grad_norm": 1.3332229852676392, + "learning_rate": 4.628891679043905e-05, + "loss": 4.9866, + "step": 29537 + }, + { + "epoch": 0.17567085355409648, + "grad_norm": 1.5539603233337402, + "learning_rate": 4.6288671904447195e-05, + "loss": 4.96, + "step": 29538 + }, + { + "epoch": 0.1756768008373775, + "grad_norm": 1.4858051538467407, + "learning_rate": 4.628842701102368e-05, + "loss": 4.9161, + "step": 29539 + }, + { + "epoch": 0.17568274812065848, + "grad_norm": 1.6222684383392334, + "learning_rate": 4.62881821101686e-05, + "loss": 4.9328, + "step": 29540 + }, + { + "epoch": 0.17568869540393947, + "grad_norm": 1.6516577005386353, + "learning_rate": 4.6287937201882025e-05, + "loss": 4.7577, + "step": 29541 + }, + { + "epoch": 0.1756946426872205, + "grad_norm": 1.7349826097488403, + "learning_rate": 4.6287692286164056e-05, + "loss": 4.5927, + "step": 29542 + }, + { + "epoch": 0.17570058997050148, + "grad_norm": 1.4014586210250854, + "learning_rate": 4.6287447363014776e-05, + "loss": 4.8835, + "step": 29543 + }, + { + "epoch": 0.17570653725378246, + "grad_norm": 1.5037766695022583, + "learning_rate": 4.6287202432434265e-05, + "loss": 4.9221, + "step": 29544 + }, + { + "epoch": 0.17571248453706348, + "grad_norm": 1.5138404369354248, + "learning_rate": 4.628695749442261e-05, + "loss": 4.5962, + "step": 29545 + }, + { + "epoch": 0.17571843182034447, + "grad_norm": 1.5634385347366333, + "learning_rate": 4.6286712548979907e-05, + "loss": 5.2178, + "step": 29546 + }, + { + "epoch": 0.17572437910362546, + "grad_norm": 1.6049305200576782, + "learning_rate": 4.628646759610622e-05, + "loss": 5.1726, + "step": 29547 + }, + { + "epoch": 0.17573032638690647, + "grad_norm": 1.6202237606048584, + "learning_rate": 4.628622263580166e-05, + "loss": 4.8598, + "step": 29548 + }, + { + "epoch": 0.17573627367018746, + "grad_norm": 1.4801881313323975, + "learning_rate": 4.628597766806629e-05, + "loss": 4.9164, + "step": 29549 + }, + { + "epoch": 0.17574222095346845, + "grad_norm": 1.5014153718948364, + "learning_rate": 4.628573269290021e-05, + "loss": 4.3787, + "step": 29550 + }, + { + "epoch": 0.17574816823674946, + "grad_norm": 1.5468509197235107, + "learning_rate": 4.62854877103035e-05, + "loss": 4.9178, + "step": 29551 + }, + { + "epoch": 0.17575411552003045, + "grad_norm": 1.4622128009796143, + "learning_rate": 4.628524272027624e-05, + "loss": 4.8219, + "step": 29552 + }, + { + "epoch": 0.17576006280331144, + "grad_norm": 1.6060843467712402, + "learning_rate": 4.628499772281853e-05, + "loss": 4.869, + "step": 29553 + }, + { + "epoch": 0.17576601008659246, + "grad_norm": 1.7407468557357788, + "learning_rate": 4.628475271793044e-05, + "loss": 4.7171, + "step": 29554 + }, + { + "epoch": 0.17577195736987344, + "grad_norm": 1.5435397624969482, + "learning_rate": 4.628450770561207e-05, + "loss": 4.6929, + "step": 29555 + }, + { + "epoch": 0.17577790465315443, + "grad_norm": 1.5211220979690552, + "learning_rate": 4.628426268586349e-05, + "loss": 4.6811, + "step": 29556 + }, + { + "epoch": 0.17578385193643545, + "grad_norm": 1.3432724475860596, + "learning_rate": 4.6284017658684796e-05, + "loss": 4.8499, + "step": 29557 + }, + { + "epoch": 0.17578979921971644, + "grad_norm": 1.6592440605163574, + "learning_rate": 4.628377262407608e-05, + "loss": 4.4278, + "step": 29558 + }, + { + "epoch": 0.17579574650299742, + "grad_norm": 1.5314370393753052, + "learning_rate": 4.6283527582037415e-05, + "loss": 5.0514, + "step": 29559 + }, + { + "epoch": 0.17580169378627844, + "grad_norm": 1.8792412281036377, + "learning_rate": 4.6283282532568884e-05, + "loss": 4.3201, + "step": 29560 + }, + { + "epoch": 0.17580764106955943, + "grad_norm": 1.726537823677063, + "learning_rate": 4.628303747567058e-05, + "loss": 4.4524, + "step": 29561 + }, + { + "epoch": 0.17581358835284042, + "grad_norm": 1.5222519636154175, + "learning_rate": 4.628279241134259e-05, + "loss": 4.7075, + "step": 29562 + }, + { + "epoch": 0.17581953563612143, + "grad_norm": 1.6036890745162964, + "learning_rate": 4.6282547339585e-05, + "loss": 4.6974, + "step": 29563 + }, + { + "epoch": 0.17582548291940242, + "grad_norm": 1.6295074224472046, + "learning_rate": 4.628230226039789e-05, + "loss": 4.4021, + "step": 29564 + }, + { + "epoch": 0.1758314302026834, + "grad_norm": 2.6549839973449707, + "learning_rate": 4.628205717378135e-05, + "loss": 3.8639, + "step": 29565 + }, + { + "epoch": 0.17583737748596442, + "grad_norm": 2.752455234527588, + "learning_rate": 4.628181207973547e-05, + "loss": 3.745, + "step": 29566 + }, + { + "epoch": 0.1758433247692454, + "grad_norm": 2.4327378273010254, + "learning_rate": 4.6281566978260314e-05, + "loss": 3.4675, + "step": 29567 + }, + { + "epoch": 0.1758492720525264, + "grad_norm": 2.2893288135528564, + "learning_rate": 4.628132186935599e-05, + "loss": 3.4223, + "step": 29568 + }, + { + "epoch": 0.17585521933580742, + "grad_norm": 2.6514787673950195, + "learning_rate": 4.628107675302258e-05, + "loss": 3.6378, + "step": 29569 + }, + { + "epoch": 0.1758611666190884, + "grad_norm": 1.501243233680725, + "learning_rate": 4.628083162926016e-05, + "loss": 4.9402, + "step": 29570 + }, + { + "epoch": 0.1758671139023694, + "grad_norm": 2.5400307178497314, + "learning_rate": 4.6280586498068824e-05, + "loss": 3.9097, + "step": 29571 + }, + { + "epoch": 0.17587306118565038, + "grad_norm": 3.0715131759643555, + "learning_rate": 4.628034135944865e-05, + "loss": 3.8084, + "step": 29572 + }, + { + "epoch": 0.1758790084689314, + "grad_norm": 2.320291042327881, + "learning_rate": 4.628009621339974e-05, + "loss": 3.743, + "step": 29573 + }, + { + "epoch": 0.17588495575221239, + "grad_norm": 2.653029441833496, + "learning_rate": 4.627985105992216e-05, + "loss": 3.5106, + "step": 29574 + }, + { + "epoch": 0.17589090303549337, + "grad_norm": 2.5279390811920166, + "learning_rate": 4.6279605899016007e-05, + "loss": 3.6074, + "step": 29575 + }, + { + "epoch": 0.1758968503187744, + "grad_norm": 2.6520915031433105, + "learning_rate": 4.6279360730681364e-05, + "loss": 3.5559, + "step": 29576 + }, + { + "epoch": 0.17590279760205538, + "grad_norm": 1.5509624481201172, + "learning_rate": 4.627911555491831e-05, + "loss": 4.8954, + "step": 29577 + }, + { + "epoch": 0.17590874488533637, + "grad_norm": 2.044759750366211, + "learning_rate": 4.627887037172695e-05, + "loss": 3.7401, + "step": 29578 + }, + { + "epoch": 0.17591469216861738, + "grad_norm": 2.512817144393921, + "learning_rate": 4.6278625181107336e-05, + "loss": 3.3898, + "step": 29579 + }, + { + "epoch": 0.17592063945189837, + "grad_norm": 2.3796133995056152, + "learning_rate": 4.627837998305959e-05, + "loss": 3.5277, + "step": 29580 + }, + { + "epoch": 0.17592658673517936, + "grad_norm": 2.6435763835906982, + "learning_rate": 4.6278134777583774e-05, + "loss": 3.6078, + "step": 29581 + }, + { + "epoch": 0.17593253401846037, + "grad_norm": 1.9326622486114502, + "learning_rate": 4.6277889564679986e-05, + "loss": 4.3017, + "step": 29582 + }, + { + "epoch": 0.17593848130174136, + "grad_norm": 2.0501444339752197, + "learning_rate": 4.62776443443483e-05, + "loss": 4.2909, + "step": 29583 + }, + { + "epoch": 0.17594442858502235, + "grad_norm": 2.1053049564361572, + "learning_rate": 4.6277399116588816e-05, + "loss": 3.4639, + "step": 29584 + }, + { + "epoch": 0.17595037586830337, + "grad_norm": 2.2305474281311035, + "learning_rate": 4.627715388140161e-05, + "loss": 3.6551, + "step": 29585 + }, + { + "epoch": 0.17595632315158435, + "grad_norm": 2.328937292098999, + "learning_rate": 4.6276908638786766e-05, + "loss": 3.2528, + "step": 29586 + }, + { + "epoch": 0.17596227043486534, + "grad_norm": 3.2846357822418213, + "learning_rate": 4.627666338874437e-05, + "loss": 3.7581, + "step": 29587 + }, + { + "epoch": 0.17596821771814636, + "grad_norm": 2.145848512649536, + "learning_rate": 4.627641813127452e-05, + "loss": 3.6736, + "step": 29588 + }, + { + "epoch": 0.17597416500142735, + "grad_norm": 2.367215871810913, + "learning_rate": 4.627617286637729e-05, + "loss": 3.3043, + "step": 29589 + }, + { + "epoch": 0.17598011228470833, + "grad_norm": 2.314913272857666, + "learning_rate": 4.627592759405276e-05, + "loss": 3.3871, + "step": 29590 + }, + { + "epoch": 0.17598605956798935, + "grad_norm": 2.3208961486816406, + "learning_rate": 4.627568231430103e-05, + "loss": 3.3427, + "step": 29591 + }, + { + "epoch": 0.17599200685127034, + "grad_norm": 2.2277936935424805, + "learning_rate": 4.627543702712218e-05, + "loss": 3.4393, + "step": 29592 + }, + { + "epoch": 0.17599795413455133, + "grad_norm": 2.6522443294525146, + "learning_rate": 4.627519173251629e-05, + "loss": 3.4554, + "step": 29593 + }, + { + "epoch": 0.17600390141783234, + "grad_norm": 1.6064810752868652, + "learning_rate": 4.6274946430483454e-05, + "loss": 5.2487, + "step": 29594 + }, + { + "epoch": 0.17600984870111333, + "grad_norm": 2.488597869873047, + "learning_rate": 4.627470112102375e-05, + "loss": 3.8507, + "step": 29595 + }, + { + "epoch": 0.17601579598439432, + "grad_norm": 2.4922280311584473, + "learning_rate": 4.627445580413727e-05, + "loss": 3.901, + "step": 29596 + }, + { + "epoch": 0.17602174326767533, + "grad_norm": 2.5545835494995117, + "learning_rate": 4.62742104798241e-05, + "loss": 3.7327, + "step": 29597 + }, + { + "epoch": 0.17602769055095632, + "grad_norm": 2.674534559249878, + "learning_rate": 4.627396514808432e-05, + "loss": 3.6846, + "step": 29598 + }, + { + "epoch": 0.1760336378342373, + "grad_norm": 2.51946759223938, + "learning_rate": 4.627371980891801e-05, + "loss": 3.504, + "step": 29599 + }, + { + "epoch": 0.17603958511751833, + "grad_norm": 1.584033489227295, + "learning_rate": 4.6273474462325286e-05, + "loss": 4.9813, + "step": 29600 + }, + { + "epoch": 0.17604553240079931, + "grad_norm": 1.5800496339797974, + "learning_rate": 4.6273229108306195e-05, + "loss": 5.6641, + "step": 29601 + }, + { + "epoch": 0.1760514796840803, + "grad_norm": 1.5663219690322876, + "learning_rate": 4.627298374686084e-05, + "loss": 5.6077, + "step": 29602 + }, + { + "epoch": 0.17605742696736132, + "grad_norm": 1.5315394401550293, + "learning_rate": 4.627273837798932e-05, + "loss": 5.3647, + "step": 29603 + }, + { + "epoch": 0.1760633742506423, + "grad_norm": 1.6742242574691772, + "learning_rate": 4.627249300169169e-05, + "loss": 5.2066, + "step": 29604 + }, + { + "epoch": 0.1760693215339233, + "grad_norm": 1.6399402618408203, + "learning_rate": 4.627224761796806e-05, + "loss": 5.0195, + "step": 29605 + }, + { + "epoch": 0.1760752688172043, + "grad_norm": 1.7168047428131104, + "learning_rate": 4.627200222681851e-05, + "loss": 5.3056, + "step": 29606 + }, + { + "epoch": 0.1760812161004853, + "grad_norm": 1.6890738010406494, + "learning_rate": 4.627175682824312e-05, + "loss": 5.1811, + "step": 29607 + }, + { + "epoch": 0.1760871633837663, + "grad_norm": 1.7669142484664917, + "learning_rate": 4.627151142224198e-05, + "loss": 5.2459, + "step": 29608 + }, + { + "epoch": 0.1760931106670473, + "grad_norm": 1.4989925622940063, + "learning_rate": 4.627126600881517e-05, + "loss": 5.092, + "step": 29609 + }, + { + "epoch": 0.1760990579503283, + "grad_norm": 1.4541029930114746, + "learning_rate": 4.627102058796279e-05, + "loss": 5.0705, + "step": 29610 + }, + { + "epoch": 0.17610500523360928, + "grad_norm": 2.039470911026001, + "learning_rate": 4.627077515968492e-05, + "loss": 4.1636, + "step": 29611 + }, + { + "epoch": 0.1761109525168903, + "grad_norm": 3.1738526821136475, + "learning_rate": 4.6270529723981635e-05, + "loss": 2.1184, + "step": 29612 + }, + { + "epoch": 0.17611689980017128, + "grad_norm": 1.7128700017929077, + "learning_rate": 4.6270284280853024e-05, + "loss": 5.7775, + "step": 29613 + }, + { + "epoch": 0.17612284708345227, + "grad_norm": 1.7605071067810059, + "learning_rate": 4.627003883029918e-05, + "loss": 5.6578, + "step": 29614 + }, + { + "epoch": 0.1761287943667333, + "grad_norm": 1.6726125478744507, + "learning_rate": 4.6269793372320186e-05, + "loss": 5.3621, + "step": 29615 + }, + { + "epoch": 0.17613474165001428, + "grad_norm": 1.6924387216567993, + "learning_rate": 4.626954790691612e-05, + "loss": 5.2866, + "step": 29616 + }, + { + "epoch": 0.17614068893329526, + "grad_norm": 1.705000400543213, + "learning_rate": 4.6269302434087085e-05, + "loss": 5.009, + "step": 29617 + }, + { + "epoch": 0.17614663621657628, + "grad_norm": 1.6577481031417847, + "learning_rate": 4.6269056953833157e-05, + "loss": 5.4761, + "step": 29618 + }, + { + "epoch": 0.17615258349985727, + "grad_norm": 1.635854721069336, + "learning_rate": 4.6268811466154415e-05, + "loss": 5.3624, + "step": 29619 + }, + { + "epoch": 0.17615853078313826, + "grad_norm": 1.6608973741531372, + "learning_rate": 4.626856597105095e-05, + "loss": 5.4398, + "step": 29620 + }, + { + "epoch": 0.17616447806641927, + "grad_norm": 1.5028787851333618, + "learning_rate": 4.626832046852285e-05, + "loss": 5.3025, + "step": 29621 + }, + { + "epoch": 0.17617042534970026, + "grad_norm": 2.694622278213501, + "learning_rate": 4.62680749585702e-05, + "loss": 2.389, + "step": 29622 + }, + { + "epoch": 0.17617637263298125, + "grad_norm": 1.6484723091125488, + "learning_rate": 4.6267829441193086e-05, + "loss": 4.871, + "step": 29623 + }, + { + "epoch": 0.17618231991626226, + "grad_norm": 1.6752315759658813, + "learning_rate": 4.626758391639159e-05, + "loss": 5.1089, + "step": 29624 + }, + { + "epoch": 0.17618826719954325, + "grad_norm": 1.8165408372879028, + "learning_rate": 4.62673383841658e-05, + "loss": 5.1408, + "step": 29625 + }, + { + "epoch": 0.17619421448282424, + "grad_norm": 1.7555296421051025, + "learning_rate": 4.6267092844515804e-05, + "loss": 5.2196, + "step": 29626 + }, + { + "epoch": 0.17620016176610526, + "grad_norm": 1.6462376117706299, + "learning_rate": 4.626684729744168e-05, + "loss": 5.2127, + "step": 29627 + }, + { + "epoch": 0.17620610904938624, + "grad_norm": 1.7403783798217773, + "learning_rate": 4.6266601742943526e-05, + "loss": 5.1372, + "step": 29628 + }, + { + "epoch": 0.17621205633266723, + "grad_norm": 2.6064391136169434, + "learning_rate": 4.626635618102142e-05, + "loss": 5.3963, + "step": 29629 + }, + { + "epoch": 0.17621800361594822, + "grad_norm": 1.4826772212982178, + "learning_rate": 4.6266110611675446e-05, + "loss": 5.7049, + "step": 29630 + }, + { + "epoch": 0.17622395089922924, + "grad_norm": 1.685837984085083, + "learning_rate": 4.62658650349057e-05, + "loss": 5.117, + "step": 29631 + }, + { + "epoch": 0.17622989818251023, + "grad_norm": 1.5930708646774292, + "learning_rate": 4.626561945071225e-05, + "loss": 5.1709, + "step": 29632 + }, + { + "epoch": 0.1762358454657912, + "grad_norm": 1.7052996158599854, + "learning_rate": 4.6265373859095197e-05, + "loss": 5.3743, + "step": 29633 + }, + { + "epoch": 0.17624179274907223, + "grad_norm": 1.9218865633010864, + "learning_rate": 4.626512826005462e-05, + "loss": 5.0207, + "step": 29634 + }, + { + "epoch": 0.17624774003235322, + "grad_norm": 2.1410880088806152, + "learning_rate": 4.62648826535906e-05, + "loss": 4.7898, + "step": 29635 + }, + { + "epoch": 0.1762536873156342, + "grad_norm": 3.278724431991577, + "learning_rate": 4.626463703970324e-05, + "loss": 3.7456, + "step": 29636 + }, + { + "epoch": 0.17625963459891522, + "grad_norm": 1.6557966470718384, + "learning_rate": 4.6264391418392615e-05, + "loss": 5.1905, + "step": 29637 + }, + { + "epoch": 0.1762655818821962, + "grad_norm": 1.3662563562393188, + "learning_rate": 4.6264145789658804e-05, + "loss": 5.2232, + "step": 29638 + }, + { + "epoch": 0.1762715291654772, + "grad_norm": 1.5638326406478882, + "learning_rate": 4.62639001535019e-05, + "loss": 5.0933, + "step": 29639 + }, + { + "epoch": 0.1762774764487582, + "grad_norm": 1.81962251663208, + "learning_rate": 4.6263654509921996e-05, + "loss": 4.6625, + "step": 29640 + }, + { + "epoch": 0.1762834237320392, + "grad_norm": 1.5421823263168335, + "learning_rate": 4.626340885891916e-05, + "loss": 5.0372, + "step": 29641 + }, + { + "epoch": 0.1762893710153202, + "grad_norm": 1.8756135702133179, + "learning_rate": 4.626316320049349e-05, + "loss": 5.224, + "step": 29642 + }, + { + "epoch": 0.1762953182986012, + "grad_norm": 1.617411494255066, + "learning_rate": 4.6262917534645076e-05, + "loss": 5.3449, + "step": 29643 + }, + { + "epoch": 0.1763012655818822, + "grad_norm": 1.3965401649475098, + "learning_rate": 4.626267186137399e-05, + "loss": 5.4929, + "step": 29644 + }, + { + "epoch": 0.17630721286516318, + "grad_norm": 1.4743956327438354, + "learning_rate": 4.626242618068033e-05, + "loss": 5.3105, + "step": 29645 + }, + { + "epoch": 0.1763131601484442, + "grad_norm": 1.5603059530258179, + "learning_rate": 4.626218049256417e-05, + "loss": 5.2059, + "step": 29646 + }, + { + "epoch": 0.17631910743172519, + "grad_norm": 1.5562357902526855, + "learning_rate": 4.626193479702561e-05, + "loss": 5.0752, + "step": 29647 + }, + { + "epoch": 0.17632505471500617, + "grad_norm": 1.4330555200576782, + "learning_rate": 4.6261689094064724e-05, + "loss": 5.0991, + "step": 29648 + }, + { + "epoch": 0.1763310019982872, + "grad_norm": 1.636109709739685, + "learning_rate": 4.62614433836816e-05, + "loss": 5.2, + "step": 29649 + }, + { + "epoch": 0.17633694928156818, + "grad_norm": 1.4994865655899048, + "learning_rate": 4.626119766587633e-05, + "loss": 5.4368, + "step": 29650 + }, + { + "epoch": 0.17634289656484917, + "grad_norm": 1.5928007364273071, + "learning_rate": 4.6260951940648996e-05, + "loss": 5.3432, + "step": 29651 + }, + { + "epoch": 0.17634884384813018, + "grad_norm": 2.4773452281951904, + "learning_rate": 4.626070620799968e-05, + "loss": 4.6023, + "step": 29652 + }, + { + "epoch": 0.17635479113141117, + "grad_norm": 1.4862966537475586, + "learning_rate": 4.626046046792847e-05, + "loss": 5.2271, + "step": 29653 + }, + { + "epoch": 0.17636073841469216, + "grad_norm": 1.659691333770752, + "learning_rate": 4.626021472043546e-05, + "loss": 5.1621, + "step": 29654 + }, + { + "epoch": 0.17636668569797317, + "grad_norm": 1.708454966545105, + "learning_rate": 4.625996896552073e-05, + "loss": 4.9272, + "step": 29655 + }, + { + "epoch": 0.17637263298125416, + "grad_norm": 1.7151225805282593, + "learning_rate": 4.625972320318435e-05, + "loss": 5.0272, + "step": 29656 + }, + { + "epoch": 0.17637858026453515, + "grad_norm": 1.635591983795166, + "learning_rate": 4.625947743342644e-05, + "loss": 5.1541, + "step": 29657 + }, + { + "epoch": 0.17638452754781617, + "grad_norm": 1.6878983974456787, + "learning_rate": 4.625923165624705e-05, + "loss": 5.1822, + "step": 29658 + }, + { + "epoch": 0.17639047483109715, + "grad_norm": 1.5905377864837646, + "learning_rate": 4.625898587164628e-05, + "loss": 4.9331, + "step": 29659 + }, + { + "epoch": 0.17639642211437814, + "grad_norm": 1.5988421440124512, + "learning_rate": 4.625874007962423e-05, + "loss": 4.811, + "step": 29660 + }, + { + "epoch": 0.17640236939765916, + "grad_norm": 1.725674033164978, + "learning_rate": 4.625849428018096e-05, + "loss": 4.95, + "step": 29661 + }, + { + "epoch": 0.17640831668094015, + "grad_norm": 1.6319259405136108, + "learning_rate": 4.625824847331658e-05, + "loss": 4.8133, + "step": 29662 + }, + { + "epoch": 0.17641426396422114, + "grad_norm": 1.6534069776535034, + "learning_rate": 4.625800265903116e-05, + "loss": 4.8914, + "step": 29663 + }, + { + "epoch": 0.17642021124750215, + "grad_norm": 1.6242649555206299, + "learning_rate": 4.6257756837324793e-05, + "loss": 5.1348, + "step": 29664 + }, + { + "epoch": 0.17642615853078314, + "grad_norm": 1.59992253780365, + "learning_rate": 4.625751100819757e-05, + "loss": 5.5775, + "step": 29665 + }, + { + "epoch": 0.17643210581406413, + "grad_norm": 1.8516936302185059, + "learning_rate": 4.625726517164956e-05, + "loss": 4.8874, + "step": 29666 + }, + { + "epoch": 0.17643805309734514, + "grad_norm": 2.0659658908843994, + "learning_rate": 4.625701932768086e-05, + "loss": 4.8295, + "step": 29667 + }, + { + "epoch": 0.17644400038062613, + "grad_norm": 1.914340615272522, + "learning_rate": 4.625677347629156e-05, + "loss": 4.8001, + "step": 29668 + }, + { + "epoch": 0.17644994766390712, + "grad_norm": 1.76264226436615, + "learning_rate": 4.6256527617481734e-05, + "loss": 5.0296, + "step": 29669 + }, + { + "epoch": 0.17645589494718814, + "grad_norm": 2.414245367050171, + "learning_rate": 4.625628175125147e-05, + "loss": 4.4596, + "step": 29670 + }, + { + "epoch": 0.17646184223046912, + "grad_norm": 2.4253740310668945, + "learning_rate": 4.625603587760087e-05, + "loss": 4.8557, + "step": 29671 + }, + { + "epoch": 0.1764677895137501, + "grad_norm": 1.5761579275131226, + "learning_rate": 4.6255789996529995e-05, + "loss": 5.3967, + "step": 29672 + }, + { + "epoch": 0.17647373679703113, + "grad_norm": 1.6232905387878418, + "learning_rate": 4.625554410803895e-05, + "loss": 5.2305, + "step": 29673 + }, + { + "epoch": 0.17647968408031212, + "grad_norm": 1.5074714422225952, + "learning_rate": 4.6255298212127806e-05, + "loss": 5.0091, + "step": 29674 + }, + { + "epoch": 0.1764856313635931, + "grad_norm": 1.4851216077804565, + "learning_rate": 4.625505230879667e-05, + "loss": 5.3812, + "step": 29675 + }, + { + "epoch": 0.17649157864687412, + "grad_norm": 1.5750563144683838, + "learning_rate": 4.62548063980456e-05, + "loss": 5.1194, + "step": 29676 + }, + { + "epoch": 0.1764975259301551, + "grad_norm": 1.6650339365005493, + "learning_rate": 4.625456047987471e-05, + "loss": 5.7083, + "step": 29677 + }, + { + "epoch": 0.1765034732134361, + "grad_norm": 1.6024653911590576, + "learning_rate": 4.625431455428407e-05, + "loss": 5.435, + "step": 29678 + }, + { + "epoch": 0.1765094204967171, + "grad_norm": 2.434255361557007, + "learning_rate": 4.625406862127376e-05, + "loss": 4.4856, + "step": 29679 + }, + { + "epoch": 0.1765153677799981, + "grad_norm": 2.248991012573242, + "learning_rate": 4.6253822680843885e-05, + "loss": 4.5724, + "step": 29680 + }, + { + "epoch": 0.1765213150632791, + "grad_norm": 2.187962293624878, + "learning_rate": 4.625357673299451e-05, + "loss": 4.7556, + "step": 29681 + }, + { + "epoch": 0.1765272623465601, + "grad_norm": 1.6530205011367798, + "learning_rate": 4.625333077772574e-05, + "loss": 5.1289, + "step": 29682 + }, + { + "epoch": 0.1765332096298411, + "grad_norm": 1.3826985359191895, + "learning_rate": 4.625308481503765e-05, + "loss": 5.2029, + "step": 29683 + }, + { + "epoch": 0.17653915691312208, + "grad_norm": 1.4573781490325928, + "learning_rate": 4.625283884493032e-05, + "loss": 5.1572, + "step": 29684 + }, + { + "epoch": 0.1765451041964031, + "grad_norm": 1.4935249090194702, + "learning_rate": 4.6252592867403856e-05, + "loss": 5.0828, + "step": 29685 + }, + { + "epoch": 0.17655105147968408, + "grad_norm": 1.6328359842300415, + "learning_rate": 4.625234688245832e-05, + "loss": 5.1604, + "step": 29686 + }, + { + "epoch": 0.17655699876296507, + "grad_norm": 1.4190014600753784, + "learning_rate": 4.6252100890093816e-05, + "loss": 4.9567, + "step": 29687 + }, + { + "epoch": 0.17656294604624606, + "grad_norm": 1.7209579944610596, + "learning_rate": 4.625185489031042e-05, + "loss": 4.412, + "step": 29688 + }, + { + "epoch": 0.17656889332952708, + "grad_norm": 1.5644607543945312, + "learning_rate": 4.625160888310822e-05, + "loss": 4.9651, + "step": 29689 + }, + { + "epoch": 0.17657484061280806, + "grad_norm": 1.498563289642334, + "learning_rate": 4.62513628684873e-05, + "loss": 5.4318, + "step": 29690 + }, + { + "epoch": 0.17658078789608905, + "grad_norm": 1.4302527904510498, + "learning_rate": 4.625111684644776e-05, + "loss": 4.9763, + "step": 29691 + }, + { + "epoch": 0.17658673517937007, + "grad_norm": 1.5234086513519287, + "learning_rate": 4.6250870816989664e-05, + "loss": 4.9747, + "step": 29692 + }, + { + "epoch": 0.17659268246265106, + "grad_norm": 1.611867904663086, + "learning_rate": 4.6250624780113116e-05, + "loss": 4.8275, + "step": 29693 + }, + { + "epoch": 0.17659862974593205, + "grad_norm": 2.0380537509918213, + "learning_rate": 4.625037873581819e-05, + "loss": 5.1795, + "step": 29694 + }, + { + "epoch": 0.17660457702921306, + "grad_norm": 1.433166742324829, + "learning_rate": 4.625013268410498e-05, + "loss": 5.3237, + "step": 29695 + }, + { + "epoch": 0.17661052431249405, + "grad_norm": 1.8627065420150757, + "learning_rate": 4.6249886624973564e-05, + "loss": 5.28, + "step": 29696 + }, + { + "epoch": 0.17661647159577504, + "grad_norm": 1.572050929069519, + "learning_rate": 4.6249640558424036e-05, + "loss": 5.3744, + "step": 29697 + }, + { + "epoch": 0.17662241887905605, + "grad_norm": 3.271996021270752, + "learning_rate": 4.624939448445648e-05, + "loss": 3.856, + "step": 29698 + }, + { + "epoch": 0.17662836616233704, + "grad_norm": 1.7473957538604736, + "learning_rate": 4.624914840307098e-05, + "loss": 4.7745, + "step": 29699 + }, + { + "epoch": 0.17663431344561803, + "grad_norm": 1.5957887172698975, + "learning_rate": 4.62489023142676e-05, + "loss": 5.3401, + "step": 29700 + }, + { + "epoch": 0.17664026072889905, + "grad_norm": 1.519698977470398, + "learning_rate": 4.624865621804647e-05, + "loss": 5.2996, + "step": 29701 + }, + { + "epoch": 0.17664620801218003, + "grad_norm": 1.4777617454528809, + "learning_rate": 4.624841011440765e-05, + "loss": 5.2181, + "step": 29702 + }, + { + "epoch": 0.17665215529546102, + "grad_norm": 1.5206866264343262, + "learning_rate": 4.624816400335123e-05, + "loss": 5.3529, + "step": 29703 + }, + { + "epoch": 0.17665810257874204, + "grad_norm": 1.6352920532226562, + "learning_rate": 4.6247917884877296e-05, + "loss": 5.3274, + "step": 29704 + }, + { + "epoch": 0.17666404986202303, + "grad_norm": 1.572554111480713, + "learning_rate": 4.6247671758985934e-05, + "loss": 5.3941, + "step": 29705 + }, + { + "epoch": 0.17666999714530401, + "grad_norm": 2.0956475734710693, + "learning_rate": 4.624742562567722e-05, + "loss": 4.0032, + "step": 29706 + }, + { + "epoch": 0.17667594442858503, + "grad_norm": 1.382948398590088, + "learning_rate": 4.624717948495126e-05, + "loss": 5.539, + "step": 29707 + }, + { + "epoch": 0.17668189171186602, + "grad_norm": 1.406977653503418, + "learning_rate": 4.6246933336808126e-05, + "loss": 5.5437, + "step": 29708 + }, + { + "epoch": 0.176687838995147, + "grad_norm": 1.6577895879745483, + "learning_rate": 4.62466871812479e-05, + "loss": 5.1155, + "step": 29709 + }, + { + "epoch": 0.17669378627842802, + "grad_norm": 1.9551897048950195, + "learning_rate": 4.624644101827069e-05, + "loss": 4.6531, + "step": 29710 + }, + { + "epoch": 0.176699733561709, + "grad_norm": 2.409532308578491, + "learning_rate": 4.624619484787655e-05, + "loss": 4.5918, + "step": 29711 + }, + { + "epoch": 0.17670568084499, + "grad_norm": 1.8758010864257812, + "learning_rate": 4.6245948670065594e-05, + "loss": 4.9051, + "step": 29712 + }, + { + "epoch": 0.17671162812827101, + "grad_norm": 1.777886152267456, + "learning_rate": 4.6245702484837894e-05, + "loss": 5.1955, + "step": 29713 + }, + { + "epoch": 0.176717575411552, + "grad_norm": 1.6413220167160034, + "learning_rate": 4.624545629219354e-05, + "loss": 4.9031, + "step": 29714 + }, + { + "epoch": 0.176723522694833, + "grad_norm": 1.7025271654129028, + "learning_rate": 4.624521009213262e-05, + "loss": 5.2195, + "step": 29715 + }, + { + "epoch": 0.176729469978114, + "grad_norm": 1.4530411958694458, + "learning_rate": 4.6244963884655204e-05, + "loss": 5.3771, + "step": 29716 + }, + { + "epoch": 0.176735417261395, + "grad_norm": 1.4960378408432007, + "learning_rate": 4.62447176697614e-05, + "loss": 5.1019, + "step": 29717 + }, + { + "epoch": 0.17674136454467598, + "grad_norm": 1.863013505935669, + "learning_rate": 4.624447144745129e-05, + "loss": 4.7721, + "step": 29718 + }, + { + "epoch": 0.176747311827957, + "grad_norm": 1.7837802171707153, + "learning_rate": 4.624422521772495e-05, + "loss": 5.2047, + "step": 29719 + }, + { + "epoch": 0.176753259111238, + "grad_norm": 2.3820879459381104, + "learning_rate": 4.6243978980582456e-05, + "loss": 4.7627, + "step": 29720 + }, + { + "epoch": 0.17675920639451898, + "grad_norm": 2.2981441020965576, + "learning_rate": 4.6243732736023926e-05, + "loss": 4.7149, + "step": 29721 + }, + { + "epoch": 0.1767651536778, + "grad_norm": 1.916215181350708, + "learning_rate": 4.6243486484049426e-05, + "loss": 4.6663, + "step": 29722 + }, + { + "epoch": 0.17677110096108098, + "grad_norm": 1.7512091398239136, + "learning_rate": 4.624324022465904e-05, + "loss": 5.0612, + "step": 29723 + }, + { + "epoch": 0.17677704824436197, + "grad_norm": 1.513918161392212, + "learning_rate": 4.6242993957852855e-05, + "loss": 5.131, + "step": 29724 + }, + { + "epoch": 0.17678299552764298, + "grad_norm": 1.5861341953277588, + "learning_rate": 4.6242747683630966e-05, + "loss": 5.1035, + "step": 29725 + }, + { + "epoch": 0.17678894281092397, + "grad_norm": 1.5094410181045532, + "learning_rate": 4.6242501401993454e-05, + "loss": 5.0484, + "step": 29726 + }, + { + "epoch": 0.17679489009420496, + "grad_norm": 1.5102661848068237, + "learning_rate": 4.6242255112940405e-05, + "loss": 5.0001, + "step": 29727 + }, + { + "epoch": 0.17680083737748598, + "grad_norm": 1.8255689144134521, + "learning_rate": 4.62420088164719e-05, + "loss": 5.4749, + "step": 29728 + }, + { + "epoch": 0.17680678466076696, + "grad_norm": 1.9394241571426392, + "learning_rate": 4.624176251258803e-05, + "loss": 5.3997, + "step": 29729 + }, + { + "epoch": 0.17681273194404795, + "grad_norm": 1.6546714305877686, + "learning_rate": 4.624151620128888e-05, + "loss": 5.396, + "step": 29730 + }, + { + "epoch": 0.17681867922732897, + "grad_norm": 1.55864679813385, + "learning_rate": 4.6241269882574534e-05, + "loss": 5.145, + "step": 29731 + }, + { + "epoch": 0.17682462651060996, + "grad_norm": 1.5503425598144531, + "learning_rate": 4.6241023556445084e-05, + "loss": 5.0982, + "step": 29732 + }, + { + "epoch": 0.17683057379389094, + "grad_norm": 1.6777262687683105, + "learning_rate": 4.624077722290061e-05, + "loss": 4.8005, + "step": 29733 + }, + { + "epoch": 0.17683652107717196, + "grad_norm": 1.4268922805786133, + "learning_rate": 4.62405308819412e-05, + "loss": 5.0045, + "step": 29734 + }, + { + "epoch": 0.17684246836045295, + "grad_norm": 1.7886883020401, + "learning_rate": 4.6240284533566946e-05, + "loss": 4.8464, + "step": 29735 + }, + { + "epoch": 0.17684841564373394, + "grad_norm": 1.5553979873657227, + "learning_rate": 4.624003817777792e-05, + "loss": 5.3561, + "step": 29736 + }, + { + "epoch": 0.17685436292701495, + "grad_norm": 1.508204698562622, + "learning_rate": 4.6239791814574224e-05, + "loss": 5.3903, + "step": 29737 + }, + { + "epoch": 0.17686031021029594, + "grad_norm": 1.3388547897338867, + "learning_rate": 4.623954544395593e-05, + "loss": 5.488, + "step": 29738 + }, + { + "epoch": 0.17686625749357693, + "grad_norm": 1.518465280532837, + "learning_rate": 4.623929906592313e-05, + "loss": 5.4595, + "step": 29739 + }, + { + "epoch": 0.17687220477685794, + "grad_norm": 1.5171095132827759, + "learning_rate": 4.623905268047592e-05, + "loss": 5.6942, + "step": 29740 + }, + { + "epoch": 0.17687815206013893, + "grad_norm": 1.4345729351043701, + "learning_rate": 4.623880628761436e-05, + "loss": 5.598, + "step": 29741 + }, + { + "epoch": 0.17688409934341992, + "grad_norm": 1.3692567348480225, + "learning_rate": 4.623855988733856e-05, + "loss": 5.8299, + "step": 29742 + }, + { + "epoch": 0.17689004662670094, + "grad_norm": 1.6717381477355957, + "learning_rate": 4.62383134796486e-05, + "loss": 4.9299, + "step": 29743 + }, + { + "epoch": 0.17689599390998192, + "grad_norm": 1.6725213527679443, + "learning_rate": 4.6238067064544565e-05, + "loss": 4.8448, + "step": 29744 + }, + { + "epoch": 0.1769019411932629, + "grad_norm": 1.885776400566101, + "learning_rate": 4.623782064202653e-05, + "loss": 4.8159, + "step": 29745 + }, + { + "epoch": 0.17690788847654393, + "grad_norm": 1.7408405542373657, + "learning_rate": 4.6237574212094605e-05, + "loss": 5.3162, + "step": 29746 + }, + { + "epoch": 0.17691383575982492, + "grad_norm": 1.4585955142974854, + "learning_rate": 4.6237327774748856e-05, + "loss": 5.933, + "step": 29747 + }, + { + "epoch": 0.1769197830431059, + "grad_norm": 1.6204352378845215, + "learning_rate": 4.623708132998937e-05, + "loss": 5.4457, + "step": 29748 + }, + { + "epoch": 0.1769257303263869, + "grad_norm": 1.4227222204208374, + "learning_rate": 4.623683487781625e-05, + "loss": 5.387, + "step": 29749 + }, + { + "epoch": 0.1769316776096679, + "grad_norm": 1.4104609489440918, + "learning_rate": 4.623658841822956e-05, + "loss": 5.5075, + "step": 29750 + }, + { + "epoch": 0.1769376248929489, + "grad_norm": 2.1077404022216797, + "learning_rate": 4.6236341951229406e-05, + "loss": 4.343, + "step": 29751 + }, + { + "epoch": 0.17694357217622989, + "grad_norm": 1.820806622505188, + "learning_rate": 4.6236095476815855e-05, + "loss": 4.8388, + "step": 29752 + }, + { + "epoch": 0.1769495194595109, + "grad_norm": 1.6640592813491821, + "learning_rate": 4.623584899498901e-05, + "loss": 5.129, + "step": 29753 + }, + { + "epoch": 0.1769554667427919, + "grad_norm": 1.6439399719238281, + "learning_rate": 4.623560250574894e-05, + "loss": 5.1712, + "step": 29754 + }, + { + "epoch": 0.17696141402607288, + "grad_norm": 1.6510851383209229, + "learning_rate": 4.623535600909575e-05, + "loss": 5.1796, + "step": 29755 + }, + { + "epoch": 0.1769673613093539, + "grad_norm": 1.8089758157730103, + "learning_rate": 4.6235109505029515e-05, + "loss": 4.5897, + "step": 29756 + }, + { + "epoch": 0.17697330859263488, + "grad_norm": 1.734377384185791, + "learning_rate": 4.6234862993550324e-05, + "loss": 5.1078, + "step": 29757 + }, + { + "epoch": 0.17697925587591587, + "grad_norm": 1.7873172760009766, + "learning_rate": 4.623461647465825e-05, + "loss": 5.3811, + "step": 29758 + }, + { + "epoch": 0.17698520315919689, + "grad_norm": 2.1304049491882324, + "learning_rate": 4.623436994835341e-05, + "loss": 4.6419, + "step": 29759 + }, + { + "epoch": 0.17699115044247787, + "grad_norm": 2.734135150909424, + "learning_rate": 4.6234123414635856e-05, + "loss": 4.4103, + "step": 29760 + }, + { + "epoch": 0.17699709772575886, + "grad_norm": 1.9526289701461792, + "learning_rate": 4.6233876873505694e-05, + "loss": 4.4495, + "step": 29761 + }, + { + "epoch": 0.17700304500903988, + "grad_norm": 1.7902294397354126, + "learning_rate": 4.6233630324963004e-05, + "loss": 4.9202, + "step": 29762 + }, + { + "epoch": 0.17700899229232087, + "grad_norm": 2.161142587661743, + "learning_rate": 4.6233383769007874e-05, + "loss": 4.1941, + "step": 29763 + }, + { + "epoch": 0.17701493957560185, + "grad_norm": 2.3652687072753906, + "learning_rate": 4.6233137205640386e-05, + "loss": 4.085, + "step": 29764 + }, + { + "epoch": 0.17702088685888287, + "grad_norm": 2.204157829284668, + "learning_rate": 4.6232890634860635e-05, + "loss": 3.9856, + "step": 29765 + }, + { + "epoch": 0.17702683414216386, + "grad_norm": 2.5543384552001953, + "learning_rate": 4.6232644056668695e-05, + "loss": 4.1421, + "step": 29766 + }, + { + "epoch": 0.17703278142544485, + "grad_norm": 2.0842933654785156, + "learning_rate": 4.623239747106466e-05, + "loss": 3.8326, + "step": 29767 + }, + { + "epoch": 0.17703872870872586, + "grad_norm": 1.953341007232666, + "learning_rate": 4.623215087804862e-05, + "loss": 4.0444, + "step": 29768 + }, + { + "epoch": 0.17704467599200685, + "grad_norm": 2.1980764865875244, + "learning_rate": 4.6231904277620644e-05, + "loss": 4.2192, + "step": 29769 + }, + { + "epoch": 0.17705062327528784, + "grad_norm": 2.225207567214966, + "learning_rate": 4.6231657669780836e-05, + "loss": 4.7365, + "step": 29770 + }, + { + "epoch": 0.17705657055856885, + "grad_norm": 2.128333330154419, + "learning_rate": 4.623141105452928e-05, + "loss": 5.6755, + "step": 29771 + }, + { + "epoch": 0.17706251784184984, + "grad_norm": 1.8886544704437256, + "learning_rate": 4.623116443186605e-05, + "loss": 4.9885, + "step": 29772 + }, + { + "epoch": 0.17706846512513083, + "grad_norm": 3.213632345199585, + "learning_rate": 4.623091780179125e-05, + "loss": 3.1388, + "step": 29773 + }, + { + "epoch": 0.17707441240841185, + "grad_norm": 2.6279642581939697, + "learning_rate": 4.623067116430495e-05, + "loss": 4.1536, + "step": 29774 + }, + { + "epoch": 0.17708035969169283, + "grad_norm": 1.6456087827682495, + "learning_rate": 4.623042451940724e-05, + "loss": 5.1824, + "step": 29775 + }, + { + "epoch": 0.17708630697497382, + "grad_norm": 1.8505003452301025, + "learning_rate": 4.623017786709821e-05, + "loss": 5.1548, + "step": 29776 + }, + { + "epoch": 0.17709225425825484, + "grad_norm": 1.5285630226135254, + "learning_rate": 4.622993120737794e-05, + "loss": 5.1444, + "step": 29777 + }, + { + "epoch": 0.17709820154153583, + "grad_norm": 1.6634210348129272, + "learning_rate": 4.622968454024652e-05, + "loss": 5.3108, + "step": 29778 + }, + { + "epoch": 0.17710414882481681, + "grad_norm": 1.6948342323303223, + "learning_rate": 4.622943786570405e-05, + "loss": 5.0025, + "step": 29779 + }, + { + "epoch": 0.17711009610809783, + "grad_norm": 2.1120948791503906, + "learning_rate": 4.6229191183750594e-05, + "loss": 4.6668, + "step": 29780 + }, + { + "epoch": 0.17711604339137882, + "grad_norm": 5.567571640014648, + "learning_rate": 4.622894449438624e-05, + "loss": 4.7644, + "step": 29781 + }, + { + "epoch": 0.1771219906746598, + "grad_norm": 4.830391883850098, + "learning_rate": 4.622869779761109e-05, + "loss": 4.5086, + "step": 29782 + }, + { + "epoch": 0.17712793795794082, + "grad_norm": 3.956571578979492, + "learning_rate": 4.622845109342522e-05, + "loss": 4.311, + "step": 29783 + }, + { + "epoch": 0.1771338852412218, + "grad_norm": 3.274723529815674, + "learning_rate": 4.622820438182871e-05, + "loss": 4.3097, + "step": 29784 + }, + { + "epoch": 0.1771398325245028, + "grad_norm": 2.478320360183716, + "learning_rate": 4.6227957662821666e-05, + "loss": 4.4818, + "step": 29785 + }, + { + "epoch": 0.17714577980778382, + "grad_norm": 1.271023154258728, + "learning_rate": 4.6227710936404144e-05, + "loss": 5.4578, + "step": 29786 + }, + { + "epoch": 0.1771517270910648, + "grad_norm": 1.687338948249817, + "learning_rate": 4.622746420257626e-05, + "loss": 5.0832, + "step": 29787 + }, + { + "epoch": 0.1771576743743458, + "grad_norm": 1.6693392992019653, + "learning_rate": 4.6227217461338084e-05, + "loss": 5.23, + "step": 29788 + }, + { + "epoch": 0.1771636216576268, + "grad_norm": 1.884928822517395, + "learning_rate": 4.622697071268971e-05, + "loss": 4.4254, + "step": 29789 + }, + { + "epoch": 0.1771695689409078, + "grad_norm": 1.8463094234466553, + "learning_rate": 4.622672395663121e-05, + "loss": 4.3649, + "step": 29790 + }, + { + "epoch": 0.17717551622418878, + "grad_norm": 1.5451326370239258, + "learning_rate": 4.6226477193162685e-05, + "loss": 4.7212, + "step": 29791 + }, + { + "epoch": 0.1771814635074698, + "grad_norm": 1.6390217542648315, + "learning_rate": 4.622623042228422e-05, + "loss": 5.5775, + "step": 29792 + }, + { + "epoch": 0.1771874107907508, + "grad_norm": 1.553244709968567, + "learning_rate": 4.62259836439959e-05, + "loss": 5.5905, + "step": 29793 + }, + { + "epoch": 0.17719335807403178, + "grad_norm": 1.398796558380127, + "learning_rate": 4.62257368582978e-05, + "loss": 5.5597, + "step": 29794 + }, + { + "epoch": 0.1771993053573128, + "grad_norm": 1.6612623929977417, + "learning_rate": 4.622549006519001e-05, + "loss": 4.9175, + "step": 29795 + }, + { + "epoch": 0.17720525264059378, + "grad_norm": 1.7774828672409058, + "learning_rate": 4.622524326467263e-05, + "loss": 5.2457, + "step": 29796 + }, + { + "epoch": 0.17721119992387477, + "grad_norm": 1.447310447692871, + "learning_rate": 4.622499645674574e-05, + "loss": 4.6974, + "step": 29797 + }, + { + "epoch": 0.17721714720715578, + "grad_norm": 1.8368786573410034, + "learning_rate": 4.6224749641409417e-05, + "loss": 4.7698, + "step": 29798 + }, + { + "epoch": 0.17722309449043677, + "grad_norm": 1.7796480655670166, + "learning_rate": 4.622450281866375e-05, + "loss": 5.0171, + "step": 29799 + }, + { + "epoch": 0.17722904177371776, + "grad_norm": 1.584720492362976, + "learning_rate": 4.6224255988508836e-05, + "loss": 5.5296, + "step": 29800 + }, + { + "epoch": 0.17723498905699878, + "grad_norm": 1.7539535760879517, + "learning_rate": 4.622400915094475e-05, + "loss": 5.5441, + "step": 29801 + }, + { + "epoch": 0.17724093634027976, + "grad_norm": 1.608579397201538, + "learning_rate": 4.6223762305971576e-05, + "loss": 5.3746, + "step": 29802 + }, + { + "epoch": 0.17724688362356075, + "grad_norm": 1.7146000862121582, + "learning_rate": 4.622351545358942e-05, + "loss": 5.2776, + "step": 29803 + }, + { + "epoch": 0.17725283090684177, + "grad_norm": 1.741254448890686, + "learning_rate": 4.622326859379834e-05, + "loss": 5.14, + "step": 29804 + }, + { + "epoch": 0.17725877819012276, + "grad_norm": 1.669607162475586, + "learning_rate": 4.6223021726598434e-05, + "loss": 4.9702, + "step": 29805 + }, + { + "epoch": 0.17726472547340374, + "grad_norm": 1.817954659461975, + "learning_rate": 4.62227748519898e-05, + "loss": 5.1888, + "step": 29806 + }, + { + "epoch": 0.17727067275668473, + "grad_norm": 1.7606234550476074, + "learning_rate": 4.6222527969972516e-05, + "loss": 5.1171, + "step": 29807 + }, + { + "epoch": 0.17727662003996575, + "grad_norm": 1.6854933500289917, + "learning_rate": 4.622228108054666e-05, + "loss": 4.9143, + "step": 29808 + }, + { + "epoch": 0.17728256732324674, + "grad_norm": 1.801241159439087, + "learning_rate": 4.622203418371233e-05, + "loss": 4.5452, + "step": 29809 + }, + { + "epoch": 0.17728851460652773, + "grad_norm": 1.7132951021194458, + "learning_rate": 4.6221787279469606e-05, + "loss": 5.6643, + "step": 29810 + }, + { + "epoch": 0.17729446188980874, + "grad_norm": 1.5202804803848267, + "learning_rate": 4.6221540367818576e-05, + "loss": 5.7674, + "step": 29811 + }, + { + "epoch": 0.17730040917308973, + "grad_norm": 1.3772656917572021, + "learning_rate": 4.622129344875932e-05, + "loss": 5.4231, + "step": 29812 + }, + { + "epoch": 0.17730635645637072, + "grad_norm": 1.7075127363204956, + "learning_rate": 4.6221046522291936e-05, + "loss": 5.1009, + "step": 29813 + }, + { + "epoch": 0.17731230373965173, + "grad_norm": 1.6497002840042114, + "learning_rate": 4.622079958841651e-05, + "loss": 5.2202, + "step": 29814 + }, + { + "epoch": 0.17731825102293272, + "grad_norm": 1.796449065208435, + "learning_rate": 4.622055264713311e-05, + "loss": 4.9304, + "step": 29815 + }, + { + "epoch": 0.1773241983062137, + "grad_norm": 1.6709007024765015, + "learning_rate": 4.6220305698441836e-05, + "loss": 4.9885, + "step": 29816 + }, + { + "epoch": 0.17733014558949473, + "grad_norm": 1.4689090251922607, + "learning_rate": 4.622005874234278e-05, + "loss": 4.9051, + "step": 29817 + }, + { + "epoch": 0.1773360928727757, + "grad_norm": 1.7701568603515625, + "learning_rate": 4.621981177883601e-05, + "loss": 4.8309, + "step": 29818 + }, + { + "epoch": 0.1773420401560567, + "grad_norm": 1.6992321014404297, + "learning_rate": 4.621956480792163e-05, + "loss": 4.7161, + "step": 29819 + }, + { + "epoch": 0.17734798743933772, + "grad_norm": 1.7641901969909668, + "learning_rate": 4.6219317829599715e-05, + "loss": 4.5102, + "step": 29820 + }, + { + "epoch": 0.1773539347226187, + "grad_norm": 1.9778741598129272, + "learning_rate": 4.621907084387036e-05, + "loss": 5.0063, + "step": 29821 + }, + { + "epoch": 0.1773598820058997, + "grad_norm": 2.4267444610595703, + "learning_rate": 4.6218823850733636e-05, + "loss": 4.6155, + "step": 29822 + }, + { + "epoch": 0.1773658292891807, + "grad_norm": 1.8586831092834473, + "learning_rate": 4.6218576850189655e-05, + "loss": 5.1348, + "step": 29823 + }, + { + "epoch": 0.1773717765724617, + "grad_norm": 2.0853071212768555, + "learning_rate": 4.621832984223849e-05, + "loss": 4.9064, + "step": 29824 + }, + { + "epoch": 0.17737772385574269, + "grad_norm": 1.9400508403778076, + "learning_rate": 4.6218082826880205e-05, + "loss": 5.0123, + "step": 29825 + }, + { + "epoch": 0.1773836711390237, + "grad_norm": 1.6919422149658203, + "learning_rate": 4.621783580411492e-05, + "loss": 4.8755, + "step": 29826 + }, + { + "epoch": 0.1773896184223047, + "grad_norm": 2.295384407043457, + "learning_rate": 4.621758877394271e-05, + "loss": 3.9202, + "step": 29827 + }, + { + "epoch": 0.17739556570558568, + "grad_norm": 2.417031764984131, + "learning_rate": 4.621734173636365e-05, + "loss": 4.441, + "step": 29828 + }, + { + "epoch": 0.1774015129888667, + "grad_norm": 3.097060203552246, + "learning_rate": 4.6217094691377835e-05, + "loss": 4.6754, + "step": 29829 + }, + { + "epoch": 0.17740746027214768, + "grad_norm": 2.9717020988464355, + "learning_rate": 4.621684763898536e-05, + "loss": 4.7217, + "step": 29830 + }, + { + "epoch": 0.17741340755542867, + "grad_norm": 1.9695039987564087, + "learning_rate": 4.62166005791863e-05, + "loss": 5.0213, + "step": 29831 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 1.6653083562850952, + "learning_rate": 4.621635351198074e-05, + "loss": 4.7739, + "step": 29832 + }, + { + "epoch": 0.17742530212199067, + "grad_norm": 1.9583450555801392, + "learning_rate": 4.621610643736878e-05, + "loss": 5.0863, + "step": 29833 + }, + { + "epoch": 0.17743124940527166, + "grad_norm": 2.460378646850586, + "learning_rate": 4.621585935535049e-05, + "loss": 4.5889, + "step": 29834 + }, + { + "epoch": 0.17743719668855268, + "grad_norm": 2.478996992111206, + "learning_rate": 4.621561226592596e-05, + "loss": 3.7157, + "step": 29835 + }, + { + "epoch": 0.17744314397183367, + "grad_norm": 2.4852869510650635, + "learning_rate": 4.6215365169095283e-05, + "loss": 3.674, + "step": 29836 + }, + { + "epoch": 0.17744909125511465, + "grad_norm": 3.0013065338134766, + "learning_rate": 4.621511806485853e-05, + "loss": 3.6854, + "step": 29837 + }, + { + "epoch": 0.17745503853839567, + "grad_norm": 2.071744918823242, + "learning_rate": 4.621487095321581e-05, + "loss": 4.2681, + "step": 29838 + }, + { + "epoch": 0.17746098582167666, + "grad_norm": 1.7033419609069824, + "learning_rate": 4.62146238341672e-05, + "loss": 5.4001, + "step": 29839 + }, + { + "epoch": 0.17746693310495765, + "grad_norm": 1.9189993143081665, + "learning_rate": 4.621437670771278e-05, + "loss": 4.9708, + "step": 29840 + }, + { + "epoch": 0.17747288038823866, + "grad_norm": 1.924280047416687, + "learning_rate": 4.621412957385264e-05, + "loss": 4.5928, + "step": 29841 + }, + { + "epoch": 0.17747882767151965, + "grad_norm": 1.6338030099868774, + "learning_rate": 4.621388243258686e-05, + "loss": 4.6546, + "step": 29842 + }, + { + "epoch": 0.17748477495480064, + "grad_norm": 1.6776630878448486, + "learning_rate": 4.621363528391555e-05, + "loss": 5.0897, + "step": 29843 + }, + { + "epoch": 0.17749072223808166, + "grad_norm": 2.56796932220459, + "learning_rate": 4.621338812783877e-05, + "loss": 4.1294, + "step": 29844 + }, + { + "epoch": 0.17749666952136264, + "grad_norm": 2.6277754306793213, + "learning_rate": 4.621314096435661e-05, + "loss": 4.2364, + "step": 29845 + }, + { + "epoch": 0.17750261680464363, + "grad_norm": 2.836585760116577, + "learning_rate": 4.621289379346916e-05, + "loss": 4.4706, + "step": 29846 + }, + { + "epoch": 0.17750856408792465, + "grad_norm": 2.3705074787139893, + "learning_rate": 4.6212646615176514e-05, + "loss": 4.245, + "step": 29847 + }, + { + "epoch": 0.17751451137120564, + "grad_norm": 1.7258014678955078, + "learning_rate": 4.621239942947875e-05, + "loss": 5.338, + "step": 29848 + }, + { + "epoch": 0.17752045865448662, + "grad_norm": 1.5844351053237915, + "learning_rate": 4.621215223637596e-05, + "loss": 4.85, + "step": 29849 + }, + { + "epoch": 0.17752640593776764, + "grad_norm": 1.7583924531936646, + "learning_rate": 4.6211905035868224e-05, + "loss": 4.1059, + "step": 29850 + }, + { + "epoch": 0.17753235322104863, + "grad_norm": 1.7784796953201294, + "learning_rate": 4.621165782795564e-05, + "loss": 4.1206, + "step": 29851 + }, + { + "epoch": 0.17753830050432962, + "grad_norm": 2.0315020084381104, + "learning_rate": 4.6211410612638273e-05, + "loss": 4.8268, + "step": 29852 + }, + { + "epoch": 0.17754424778761063, + "grad_norm": 2.137162923812866, + "learning_rate": 4.621116338991622e-05, + "loss": 4.6874, + "step": 29853 + }, + { + "epoch": 0.17755019507089162, + "grad_norm": 2.5275580883026123, + "learning_rate": 4.621091615978957e-05, + "loss": 4.4036, + "step": 29854 + }, + { + "epoch": 0.1775561423541726, + "grad_norm": 2.170762062072754, + "learning_rate": 4.621066892225842e-05, + "loss": 4.8377, + "step": 29855 + }, + { + "epoch": 0.17756208963745362, + "grad_norm": 1.592443823814392, + "learning_rate": 4.6210421677322833e-05, + "loss": 4.4257, + "step": 29856 + }, + { + "epoch": 0.1775680369207346, + "grad_norm": 1.479036569595337, + "learning_rate": 4.6210174424982914e-05, + "loss": 4.0881, + "step": 29857 + }, + { + "epoch": 0.1775739842040156, + "grad_norm": 1.5338127613067627, + "learning_rate": 4.620992716523874e-05, + "loss": 4.1253, + "step": 29858 + }, + { + "epoch": 0.17757993148729662, + "grad_norm": 1.6100810766220093, + "learning_rate": 4.62096798980904e-05, + "loss": 4.224, + "step": 29859 + }, + { + "epoch": 0.1775858787705776, + "grad_norm": 1.6029894351959229, + "learning_rate": 4.6209432623537984e-05, + "loss": 4.3054, + "step": 29860 + }, + { + "epoch": 0.1775918260538586, + "grad_norm": 1.5900243520736694, + "learning_rate": 4.620918534158157e-05, + "loss": 5.0011, + "step": 29861 + }, + { + "epoch": 0.1775977733371396, + "grad_norm": 1.4453150033950806, + "learning_rate": 4.620893805222124e-05, + "loss": 5.035, + "step": 29862 + }, + { + "epoch": 0.1776037206204206, + "grad_norm": 1.2561450004577637, + "learning_rate": 4.62086907554571e-05, + "loss": 5.0042, + "step": 29863 + }, + { + "epoch": 0.17760966790370158, + "grad_norm": 1.6127535104751587, + "learning_rate": 4.620844345128923e-05, + "loss": 5.1504, + "step": 29864 + }, + { + "epoch": 0.17761561518698257, + "grad_norm": 1.505549430847168, + "learning_rate": 4.6208196139717697e-05, + "loss": 5.2917, + "step": 29865 + }, + { + "epoch": 0.1776215624702636, + "grad_norm": 1.652327537536621, + "learning_rate": 4.620794882074261e-05, + "loss": 4.7241, + "step": 29866 + }, + { + "epoch": 0.17762750975354458, + "grad_norm": 1.750353217124939, + "learning_rate": 4.620770149436405e-05, + "loss": 4.1058, + "step": 29867 + }, + { + "epoch": 0.17763345703682556, + "grad_norm": 1.6184377670288086, + "learning_rate": 4.6207454160582094e-05, + "loss": 4.0415, + "step": 29868 + }, + { + "epoch": 0.17763940432010658, + "grad_norm": 1.493651270866394, + "learning_rate": 4.6207206819396834e-05, + "loss": 4.3537, + "step": 29869 + }, + { + "epoch": 0.17764535160338757, + "grad_norm": 1.4839292764663696, + "learning_rate": 4.6206959470808364e-05, + "loss": 4.7692, + "step": 29870 + }, + { + "epoch": 0.17765129888666856, + "grad_norm": 1.726027488708496, + "learning_rate": 4.620671211481676e-05, + "loss": 4.971, + "step": 29871 + }, + { + "epoch": 0.17765724616994957, + "grad_norm": 1.6284557580947876, + "learning_rate": 4.6206464751422105e-05, + "loss": 4.4246, + "step": 29872 + }, + { + "epoch": 0.17766319345323056, + "grad_norm": 2.2713751792907715, + "learning_rate": 4.6206217380624505e-05, + "loss": 4.3045, + "step": 29873 + }, + { + "epoch": 0.17766914073651155, + "grad_norm": 1.8392630815505981, + "learning_rate": 4.620597000242403e-05, + "loss": 4.1344, + "step": 29874 + }, + { + "epoch": 0.17767508801979257, + "grad_norm": 1.5239953994750977, + "learning_rate": 4.620572261682077e-05, + "loss": 3.9802, + "step": 29875 + }, + { + "epoch": 0.17768103530307355, + "grad_norm": 1.6723328828811646, + "learning_rate": 4.6205475223814804e-05, + "loss": 4.0901, + "step": 29876 + }, + { + "epoch": 0.17768698258635454, + "grad_norm": 1.555239200592041, + "learning_rate": 4.620522782340623e-05, + "loss": 3.9096, + "step": 29877 + }, + { + "epoch": 0.17769292986963556, + "grad_norm": 1.8839585781097412, + "learning_rate": 4.620498041559513e-05, + "loss": 4.9657, + "step": 29878 + }, + { + "epoch": 0.17769887715291655, + "grad_norm": 1.9911398887634277, + "learning_rate": 4.620473300038159e-05, + "loss": 4.5497, + "step": 29879 + }, + { + "epoch": 0.17770482443619753, + "grad_norm": 2.2058022022247314, + "learning_rate": 4.62044855777657e-05, + "loss": 3.7231, + "step": 29880 + }, + { + "epoch": 0.17771077171947855, + "grad_norm": 2.0669283866882324, + "learning_rate": 4.6204238147747535e-05, + "loss": 3.8466, + "step": 29881 + }, + { + "epoch": 0.17771671900275954, + "grad_norm": 2.122668981552124, + "learning_rate": 4.62039907103272e-05, + "loss": 3.5758, + "step": 29882 + }, + { + "epoch": 0.17772266628604053, + "grad_norm": 2.091607093811035, + "learning_rate": 4.6203743265504765e-05, + "loss": 3.3965, + "step": 29883 + }, + { + "epoch": 0.17772861356932154, + "grad_norm": 2.204787492752075, + "learning_rate": 4.620349581328033e-05, + "loss": 4.3546, + "step": 29884 + }, + { + "epoch": 0.17773456085260253, + "grad_norm": 1.5886098146438599, + "learning_rate": 4.620324835365396e-05, + "loss": 5.0842, + "step": 29885 + }, + { + "epoch": 0.17774050813588352, + "grad_norm": 1.6993340253829956, + "learning_rate": 4.6203000886625766e-05, + "loss": 4.8315, + "step": 29886 + }, + { + "epoch": 0.17774645541916453, + "grad_norm": 1.6817113161087036, + "learning_rate": 4.620275341219582e-05, + "loss": 4.9972, + "step": 29887 + }, + { + "epoch": 0.17775240270244552, + "grad_norm": 1.7113308906555176, + "learning_rate": 4.620250593036421e-05, + "loss": 4.8823, + "step": 29888 + }, + { + "epoch": 0.1777583499857265, + "grad_norm": 1.7548478841781616, + "learning_rate": 4.620225844113103e-05, + "loss": 5.121, + "step": 29889 + }, + { + "epoch": 0.17776429726900753, + "grad_norm": 1.8111287355422974, + "learning_rate": 4.6202010944496356e-05, + "loss": 4.8074, + "step": 29890 + }, + { + "epoch": 0.17777024455228851, + "grad_norm": 1.279390573501587, + "learning_rate": 4.620176344046028e-05, + "loss": 4.6303, + "step": 29891 + }, + { + "epoch": 0.1777761918355695, + "grad_norm": 1.2164942026138306, + "learning_rate": 4.620151592902288e-05, + "loss": 4.6222, + "step": 29892 + }, + { + "epoch": 0.17778213911885052, + "grad_norm": 1.5320428609848022, + "learning_rate": 4.620126841018426e-05, + "loss": 4.9938, + "step": 29893 + }, + { + "epoch": 0.1777880864021315, + "grad_norm": 1.5564218759536743, + "learning_rate": 4.620102088394449e-05, + "loss": 4.961, + "step": 29894 + }, + { + "epoch": 0.1777940336854125, + "grad_norm": 1.5532233715057373, + "learning_rate": 4.6200773350303675e-05, + "loss": 4.8086, + "step": 29895 + }, + { + "epoch": 0.1777999809686935, + "grad_norm": 1.9697725772857666, + "learning_rate": 4.620052580926187e-05, + "loss": 4.6753, + "step": 29896 + }, + { + "epoch": 0.1778059282519745, + "grad_norm": 2.0587549209594727, + "learning_rate": 4.62002782608192e-05, + "loss": 5.3824, + "step": 29897 + }, + { + "epoch": 0.1778118755352555, + "grad_norm": 1.5464704036712646, + "learning_rate": 4.620003070497572e-05, + "loss": 5.2827, + "step": 29898 + }, + { + "epoch": 0.1778178228185365, + "grad_norm": 2.052751064300537, + "learning_rate": 4.619978314173152e-05, + "loss": 4.8924, + "step": 29899 + }, + { + "epoch": 0.1778237701018175, + "grad_norm": 1.857614517211914, + "learning_rate": 4.619953557108671e-05, + "loss": 4.9826, + "step": 29900 + }, + { + "epoch": 0.17782971738509848, + "grad_norm": 1.5344221591949463, + "learning_rate": 4.619928799304136e-05, + "loss": 5.0715, + "step": 29901 + }, + { + "epoch": 0.1778356646683795, + "grad_norm": 1.6682283878326416, + "learning_rate": 4.619904040759555e-05, + "loss": 5.5025, + "step": 29902 + }, + { + "epoch": 0.17784161195166048, + "grad_norm": 1.8382456302642822, + "learning_rate": 4.619879281474938e-05, + "loss": 5.0428, + "step": 29903 + }, + { + "epoch": 0.17784755923494147, + "grad_norm": 1.5137388706207275, + "learning_rate": 4.619854521450293e-05, + "loss": 5.1731, + "step": 29904 + }, + { + "epoch": 0.1778535065182225, + "grad_norm": 1.5241427421569824, + "learning_rate": 4.619829760685628e-05, + "loss": 5.11, + "step": 29905 + }, + { + "epoch": 0.17785945380150348, + "grad_norm": 1.6426124572753906, + "learning_rate": 4.6198049991809534e-05, + "loss": 5.0386, + "step": 29906 + }, + { + "epoch": 0.17786540108478446, + "grad_norm": 1.240784764289856, + "learning_rate": 4.6197802369362756e-05, + "loss": 4.9999, + "step": 29907 + }, + { + "epoch": 0.17787134836806548, + "grad_norm": 1.7629567384719849, + "learning_rate": 4.6197554739516054e-05, + "loss": 5.1035, + "step": 29908 + }, + { + "epoch": 0.17787729565134647, + "grad_norm": 1.7833048105239868, + "learning_rate": 4.61973071022695e-05, + "loss": 5.2879, + "step": 29909 + }, + { + "epoch": 0.17788324293462746, + "grad_norm": 1.6848218441009521, + "learning_rate": 4.619705945762318e-05, + "loss": 5.1269, + "step": 29910 + }, + { + "epoch": 0.17788919021790847, + "grad_norm": 1.917606234550476, + "learning_rate": 4.61968118055772e-05, + "loss": 4.422, + "step": 29911 + }, + { + "epoch": 0.17789513750118946, + "grad_norm": 2.092909336090088, + "learning_rate": 4.619656414613162e-05, + "loss": 4.4046, + "step": 29912 + }, + { + "epoch": 0.17790108478447045, + "grad_norm": 1.580072283744812, + "learning_rate": 4.6196316479286547e-05, + "loss": 5.117, + "step": 29913 + }, + { + "epoch": 0.17790703206775146, + "grad_norm": 1.5650675296783447, + "learning_rate": 4.619606880504205e-05, + "loss": 5.0848, + "step": 29914 + }, + { + "epoch": 0.17791297935103245, + "grad_norm": 1.5918974876403809, + "learning_rate": 4.619582112339823e-05, + "loss": 5.108, + "step": 29915 + }, + { + "epoch": 0.17791892663431344, + "grad_norm": 1.6393519639968872, + "learning_rate": 4.619557343435516e-05, + "loss": 5.1883, + "step": 29916 + }, + { + "epoch": 0.17792487391759446, + "grad_norm": 1.6605910062789917, + "learning_rate": 4.619532573791294e-05, + "loss": 5.3422, + "step": 29917 + }, + { + "epoch": 0.17793082120087544, + "grad_norm": 1.618237853050232, + "learning_rate": 4.619507803407166e-05, + "loss": 5.3366, + "step": 29918 + }, + { + "epoch": 0.17793676848415643, + "grad_norm": 1.7383369207382202, + "learning_rate": 4.6194830322831384e-05, + "loss": 5.2423, + "step": 29919 + }, + { + "epoch": 0.17794271576743745, + "grad_norm": 1.7745330333709717, + "learning_rate": 4.619458260419222e-05, + "loss": 5.5013, + "step": 29920 + }, + { + "epoch": 0.17794866305071844, + "grad_norm": 1.64639151096344, + "learning_rate": 4.6194334878154244e-05, + "loss": 5.6739, + "step": 29921 + }, + { + "epoch": 0.17795461033399942, + "grad_norm": 1.6652768850326538, + "learning_rate": 4.619408714471754e-05, + "loss": 5.5507, + "step": 29922 + }, + { + "epoch": 0.1779605576172804, + "grad_norm": 1.8969260454177856, + "learning_rate": 4.61938394038822e-05, + "loss": 4.7228, + "step": 29923 + }, + { + "epoch": 0.17796650490056143, + "grad_norm": 2.7471752166748047, + "learning_rate": 4.619359165564832e-05, + "loss": 3.7551, + "step": 29924 + }, + { + "epoch": 0.17797245218384242, + "grad_norm": 1.68784499168396, + "learning_rate": 4.6193343900015964e-05, + "loss": 4.6853, + "step": 29925 + }, + { + "epoch": 0.1779783994671234, + "grad_norm": 1.6362453699111938, + "learning_rate": 4.619309613698523e-05, + "loss": 4.665, + "step": 29926 + }, + { + "epoch": 0.17798434675040442, + "grad_norm": 1.737727165222168, + "learning_rate": 4.619284836655621e-05, + "loss": 4.9511, + "step": 29927 + }, + { + "epoch": 0.1779902940336854, + "grad_norm": 1.4916706085205078, + "learning_rate": 4.6192600588728985e-05, + "loss": 4.9043, + "step": 29928 + }, + { + "epoch": 0.1779962413169664, + "grad_norm": 1.6925257444381714, + "learning_rate": 4.619235280350365e-05, + "loss": 4.764, + "step": 29929 + }, + { + "epoch": 0.1780021886002474, + "grad_norm": 1.525317668914795, + "learning_rate": 4.619210501088027e-05, + "loss": 4.5491, + "step": 29930 + }, + { + "epoch": 0.1780081358835284, + "grad_norm": 1.771481990814209, + "learning_rate": 4.619185721085895e-05, + "loss": 4.7972, + "step": 29931 + }, + { + "epoch": 0.1780140831668094, + "grad_norm": 2.018819808959961, + "learning_rate": 4.619160940343977e-05, + "loss": 3.8428, + "step": 29932 + }, + { + "epoch": 0.1780200304500904, + "grad_norm": 1.7792484760284424, + "learning_rate": 4.6191361588622825e-05, + "loss": 4.9156, + "step": 29933 + }, + { + "epoch": 0.1780259777333714, + "grad_norm": 1.8811469078063965, + "learning_rate": 4.619111376640819e-05, + "loss": 4.0915, + "step": 29934 + }, + { + "epoch": 0.17803192501665238, + "grad_norm": 1.7818450927734375, + "learning_rate": 4.619086593679596e-05, + "loss": 5.1882, + "step": 29935 + }, + { + "epoch": 0.1780378722999334, + "grad_norm": 1.587109088897705, + "learning_rate": 4.619061809978621e-05, + "loss": 4.8753, + "step": 29936 + }, + { + "epoch": 0.17804381958321439, + "grad_norm": 1.6229913234710693, + "learning_rate": 4.619037025537904e-05, + "loss": 4.5926, + "step": 29937 + }, + { + "epoch": 0.17804976686649537, + "grad_norm": 2.0784964561462402, + "learning_rate": 4.619012240357452e-05, + "loss": 3.6958, + "step": 29938 + }, + { + "epoch": 0.1780557141497764, + "grad_norm": 1.829585313796997, + "learning_rate": 4.6189874544372766e-05, + "loss": 3.5768, + "step": 29939 + }, + { + "epoch": 0.17806166143305738, + "grad_norm": 2.243161201477051, + "learning_rate": 4.6189626677773837e-05, + "loss": 3.6418, + "step": 29940 + }, + { + "epoch": 0.17806760871633837, + "grad_norm": 1.8179738521575928, + "learning_rate": 4.618937880377782e-05, + "loss": 3.6718, + "step": 29941 + }, + { + "epoch": 0.17807355599961938, + "grad_norm": 1.7654396295547485, + "learning_rate": 4.618913092238482e-05, + "loss": 4.4997, + "step": 29942 + }, + { + "epoch": 0.17807950328290037, + "grad_norm": 1.615114688873291, + "learning_rate": 4.6188883033594907e-05, + "loss": 4.7439, + "step": 29943 + }, + { + "epoch": 0.17808545056618136, + "grad_norm": 1.2790718078613281, + "learning_rate": 4.6188635137408174e-05, + "loss": 4.6724, + "step": 29944 + }, + { + "epoch": 0.17809139784946237, + "grad_norm": 1.6814706325531006, + "learning_rate": 4.6188387233824717e-05, + "loss": 4.9715, + "step": 29945 + }, + { + "epoch": 0.17809734513274336, + "grad_norm": 2.3926637172698975, + "learning_rate": 4.61881393228446e-05, + "loss": 3.682, + "step": 29946 + }, + { + "epoch": 0.17810329241602435, + "grad_norm": 1.4340671300888062, + "learning_rate": 4.618789140446793e-05, + "loss": 4.586, + "step": 29947 + }, + { + "epoch": 0.17810923969930537, + "grad_norm": 1.6323633193969727, + "learning_rate": 4.6187643478694784e-05, + "loss": 4.7435, + "step": 29948 + }, + { + "epoch": 0.17811518698258635, + "grad_norm": 1.6034373044967651, + "learning_rate": 4.618739554552526e-05, + "loss": 4.9142, + "step": 29949 + }, + { + "epoch": 0.17812113426586734, + "grad_norm": 1.599575161933899, + "learning_rate": 4.618714760495943e-05, + "loss": 4.7991, + "step": 29950 + }, + { + "epoch": 0.17812708154914836, + "grad_norm": 1.7768034934997559, + "learning_rate": 4.618689965699737e-05, + "loss": 4.9267, + "step": 29951 + }, + { + "epoch": 0.17813302883242935, + "grad_norm": 1.8471229076385498, + "learning_rate": 4.6186651701639195e-05, + "loss": 4.4194, + "step": 29952 + }, + { + "epoch": 0.17813897611571033, + "grad_norm": 2.222182512283325, + "learning_rate": 4.6186403738884984e-05, + "loss": 4.1248, + "step": 29953 + }, + { + "epoch": 0.17814492339899135, + "grad_norm": 2.373452663421631, + "learning_rate": 4.6186155768734806e-05, + "loss": 4.3799, + "step": 29954 + }, + { + "epoch": 0.17815087068227234, + "grad_norm": 2.6431610584259033, + "learning_rate": 4.618590779118877e-05, + "loss": 4.4425, + "step": 29955 + }, + { + "epoch": 0.17815681796555333, + "grad_norm": 2.160435676574707, + "learning_rate": 4.618565980624695e-05, + "loss": 4.3708, + "step": 29956 + }, + { + "epoch": 0.17816276524883434, + "grad_norm": 2.0715856552124023, + "learning_rate": 4.618541181390943e-05, + "loss": 4.7181, + "step": 29957 + }, + { + "epoch": 0.17816871253211533, + "grad_norm": 2.107534408569336, + "learning_rate": 4.618516381417631e-05, + "loss": 3.9446, + "step": 29958 + }, + { + "epoch": 0.17817465981539632, + "grad_norm": 2.215634822845459, + "learning_rate": 4.618491580704766e-05, + "loss": 4.3066, + "step": 29959 + }, + { + "epoch": 0.17818060709867733, + "grad_norm": 1.760855793952942, + "learning_rate": 4.618466779252359e-05, + "loss": 4.757, + "step": 29960 + }, + { + "epoch": 0.17818655438195832, + "grad_norm": 1.6130295991897583, + "learning_rate": 4.618441977060415e-05, + "loss": 5.0813, + "step": 29961 + }, + { + "epoch": 0.1781925016652393, + "grad_norm": 1.4686352014541626, + "learning_rate": 4.6184171741289454e-05, + "loss": 4.5848, + "step": 29962 + }, + { + "epoch": 0.17819844894852033, + "grad_norm": 1.5685728788375854, + "learning_rate": 4.618392370457959e-05, + "loss": 4.5756, + "step": 29963 + }, + { + "epoch": 0.17820439623180132, + "grad_norm": 1.7625272274017334, + "learning_rate": 4.618367566047463e-05, + "loss": 4.4729, + "step": 29964 + }, + { + "epoch": 0.1782103435150823, + "grad_norm": 2.350189685821533, + "learning_rate": 4.618342760897467e-05, + "loss": 4.2178, + "step": 29965 + }, + { + "epoch": 0.17821629079836332, + "grad_norm": 2.462435007095337, + "learning_rate": 4.6183179550079796e-05, + "loss": 4.5618, + "step": 29966 + }, + { + "epoch": 0.1782222380816443, + "grad_norm": 2.354248523712158, + "learning_rate": 4.618293148379009e-05, + "loss": 4.4869, + "step": 29967 + }, + { + "epoch": 0.1782281853649253, + "grad_norm": 2.1047489643096924, + "learning_rate": 4.6182683410105646e-05, + "loss": 4.3849, + "step": 29968 + }, + { + "epoch": 0.1782341326482063, + "grad_norm": 1.859437108039856, + "learning_rate": 4.618243532902655e-05, + "loss": 4.3603, + "step": 29969 + }, + { + "epoch": 0.1782400799314873, + "grad_norm": 2.014723539352417, + "learning_rate": 4.6182187240552875e-05, + "loss": 5.363, + "step": 29970 + }, + { + "epoch": 0.1782460272147683, + "grad_norm": 1.637157917022705, + "learning_rate": 4.618193914468472e-05, + "loss": 5.0457, + "step": 29971 + }, + { + "epoch": 0.1782519744980493, + "grad_norm": 2.200927734375, + "learning_rate": 4.618169104142217e-05, + "loss": 4.9131, + "step": 29972 + }, + { + "epoch": 0.1782579217813303, + "grad_norm": 2.0116817951202393, + "learning_rate": 4.6181442930765305e-05, + "loss": 4.8401, + "step": 29973 + }, + { + "epoch": 0.17826386906461128, + "grad_norm": 1.9755736589431763, + "learning_rate": 4.618119481271422e-05, + "loss": 4.8402, + "step": 29974 + }, + { + "epoch": 0.1782698163478923, + "grad_norm": 1.954923152923584, + "learning_rate": 4.618094668726901e-05, + "loss": 4.7746, + "step": 29975 + }, + { + "epoch": 0.17827576363117328, + "grad_norm": 2.0195765495300293, + "learning_rate": 4.6180698554429737e-05, + "loss": 4.4359, + "step": 29976 + }, + { + "epoch": 0.17828171091445427, + "grad_norm": 1.9346232414245605, + "learning_rate": 4.618045041419651e-05, + "loss": 5.132, + "step": 29977 + }, + { + "epoch": 0.1782876581977353, + "grad_norm": 1.880932331085205, + "learning_rate": 4.6180202266569394e-05, + "loss": 5.26, + "step": 29978 + }, + { + "epoch": 0.17829360548101628, + "grad_norm": 1.8841670751571655, + "learning_rate": 4.6179954111548495e-05, + "loss": 4.7878, + "step": 29979 + }, + { + "epoch": 0.17829955276429726, + "grad_norm": 1.9039348363876343, + "learning_rate": 4.61797059491339e-05, + "loss": 4.8547, + "step": 29980 + }, + { + "epoch": 0.17830550004757825, + "grad_norm": 2.0296382904052734, + "learning_rate": 4.617945777932568e-05, + "loss": 5.0599, + "step": 29981 + }, + { + "epoch": 0.17831144733085927, + "grad_norm": 1.8153882026672363, + "learning_rate": 4.617920960212393e-05, + "loss": 5.0123, + "step": 29982 + }, + { + "epoch": 0.17831739461414026, + "grad_norm": 1.5454435348510742, + "learning_rate": 4.617896141752874e-05, + "loss": 4.4975, + "step": 29983 + }, + { + "epoch": 0.17832334189742124, + "grad_norm": 1.5883069038391113, + "learning_rate": 4.6178713225540196e-05, + "loss": 4.8825, + "step": 29984 + }, + { + "epoch": 0.17832928918070226, + "grad_norm": 1.58603036403656, + "learning_rate": 4.617846502615837e-05, + "loss": 5.3068, + "step": 29985 + }, + { + "epoch": 0.17833523646398325, + "grad_norm": 1.6731973886489868, + "learning_rate": 4.6178216819383374e-05, + "loss": 5.5331, + "step": 29986 + }, + { + "epoch": 0.17834118374726424, + "grad_norm": 1.6074113845825195, + "learning_rate": 4.6177968605215276e-05, + "loss": 5.5162, + "step": 29987 + }, + { + "epoch": 0.17834713103054525, + "grad_norm": 1.4040982723236084, + "learning_rate": 4.6177720383654166e-05, + "loss": 5.3135, + "step": 29988 + }, + { + "epoch": 0.17835307831382624, + "grad_norm": 1.6419864892959595, + "learning_rate": 4.617747215470014e-05, + "loss": 4.9229, + "step": 29989 + }, + { + "epoch": 0.17835902559710723, + "grad_norm": 1.7256529331207275, + "learning_rate": 4.617722391835327e-05, + "loss": 5.0782, + "step": 29990 + }, + { + "epoch": 0.17836497288038825, + "grad_norm": 1.7224550247192383, + "learning_rate": 4.617697567461365e-05, + "loss": 4.8078, + "step": 29991 + }, + { + "epoch": 0.17837092016366923, + "grad_norm": 1.63644278049469, + "learning_rate": 4.617672742348137e-05, + "loss": 5.2103, + "step": 29992 + }, + { + "epoch": 0.17837686744695022, + "grad_norm": 1.9455114603042603, + "learning_rate": 4.617647916495651e-05, + "loss": 5.3372, + "step": 29993 + }, + { + "epoch": 0.17838281473023124, + "grad_norm": 1.6073265075683594, + "learning_rate": 4.6176230899039166e-05, + "loss": 4.4093, + "step": 29994 + }, + { + "epoch": 0.17838876201351223, + "grad_norm": 2.0087218284606934, + "learning_rate": 4.6175982625729405e-05, + "loss": 5.0169, + "step": 29995 + }, + { + "epoch": 0.1783947092967932, + "grad_norm": 2.3341264724731445, + "learning_rate": 4.617573434502734e-05, + "loss": 4.271, + "step": 29996 + }, + { + "epoch": 0.17840065658007423, + "grad_norm": 1.6453101634979248, + "learning_rate": 4.617548605693305e-05, + "loss": 5.0354, + "step": 29997 + }, + { + "epoch": 0.17840660386335522, + "grad_norm": 1.6747314929962158, + "learning_rate": 4.61752377614466e-05, + "loss": 5.0349, + "step": 29998 + }, + { + "epoch": 0.1784125511466362, + "grad_norm": 1.7050796747207642, + "learning_rate": 4.617498945856811e-05, + "loss": 4.753, + "step": 29999 + }, + { + "epoch": 0.17841849842991722, + "grad_norm": 1.7062735557556152, + "learning_rate": 4.617474114829764e-05, + "loss": 5.1345, + "step": 30000 + }, + { + "epoch": 0.1784244457131982, + "grad_norm": 1.802368402481079, + "learning_rate": 4.6174492830635285e-05, + "loss": 4.4919, + "step": 30001 + }, + { + "epoch": 0.1784303929964792, + "grad_norm": 1.7409639358520508, + "learning_rate": 4.6174244505581135e-05, + "loss": 4.9005, + "step": 30002 + }, + { + "epoch": 0.1784363402797602, + "grad_norm": 1.6387557983398438, + "learning_rate": 4.617399617313528e-05, + "loss": 5.3176, + "step": 30003 + }, + { + "epoch": 0.1784422875630412, + "grad_norm": 2.3082478046417236, + "learning_rate": 4.617374783329779e-05, + "loss": 4.0222, + "step": 30004 + }, + { + "epoch": 0.1784482348463222, + "grad_norm": 2.5321269035339355, + "learning_rate": 4.617349948606878e-05, + "loss": 4.4114, + "step": 30005 + }, + { + "epoch": 0.1784541821296032, + "grad_norm": 1.9945601224899292, + "learning_rate": 4.6173251131448305e-05, + "loss": 5.0146, + "step": 30006 + }, + { + "epoch": 0.1784601294128842, + "grad_norm": 1.408103108406067, + "learning_rate": 4.6173002769436474e-05, + "loss": 5.2415, + "step": 30007 + }, + { + "epoch": 0.17846607669616518, + "grad_norm": 2.4887290000915527, + "learning_rate": 4.6172754400033366e-05, + "loss": 4.3658, + "step": 30008 + }, + { + "epoch": 0.1784720239794462, + "grad_norm": 1.4949021339416504, + "learning_rate": 4.617250602323907e-05, + "loss": 5.0803, + "step": 30009 + }, + { + "epoch": 0.1784779712627272, + "grad_norm": 1.5946985483169556, + "learning_rate": 4.617225763905367e-05, + "loss": 4.5409, + "step": 30010 + }, + { + "epoch": 0.17848391854600817, + "grad_norm": 1.618841528892517, + "learning_rate": 4.6172009247477246e-05, + "loss": 4.9119, + "step": 30011 + }, + { + "epoch": 0.1784898658292892, + "grad_norm": 1.4818013906478882, + "learning_rate": 4.61717608485099e-05, + "loss": 5.0194, + "step": 30012 + }, + { + "epoch": 0.17849581311257018, + "grad_norm": 2.2418477535247803, + "learning_rate": 4.617151244215171e-05, + "loss": 3.8263, + "step": 30013 + }, + { + "epoch": 0.17850176039585117, + "grad_norm": 2.023987054824829, + "learning_rate": 4.617126402840277e-05, + "loss": 5.4911, + "step": 30014 + }, + { + "epoch": 0.17850770767913218, + "grad_norm": 1.9841099977493286, + "learning_rate": 4.6171015607263144e-05, + "loss": 5.3778, + "step": 30015 + }, + { + "epoch": 0.17851365496241317, + "grad_norm": 1.896392822265625, + "learning_rate": 4.617076717873295e-05, + "loss": 4.7488, + "step": 30016 + }, + { + "epoch": 0.17851960224569416, + "grad_norm": 1.816318154335022, + "learning_rate": 4.6170518742812255e-05, + "loss": 4.1921, + "step": 30017 + }, + { + "epoch": 0.17852554952897517, + "grad_norm": 1.8096336126327515, + "learning_rate": 4.617027029950115e-05, + "loss": 4.1089, + "step": 30018 + }, + { + "epoch": 0.17853149681225616, + "grad_norm": 2.236724853515625, + "learning_rate": 4.617002184879973e-05, + "loss": 3.7496, + "step": 30019 + }, + { + "epoch": 0.17853744409553715, + "grad_norm": 2.349423885345459, + "learning_rate": 4.616977339070806e-05, + "loss": 3.8509, + "step": 30020 + }, + { + "epoch": 0.17854339137881817, + "grad_norm": 2.2639737129211426, + "learning_rate": 4.616952492522625e-05, + "loss": 3.5414, + "step": 30021 + }, + { + "epoch": 0.17854933866209916, + "grad_norm": 2.1458024978637695, + "learning_rate": 4.6169276452354374e-05, + "loss": 3.9498, + "step": 30022 + }, + { + "epoch": 0.17855528594538014, + "grad_norm": 1.7704306840896606, + "learning_rate": 4.616902797209253e-05, + "loss": 4.4054, + "step": 30023 + }, + { + "epoch": 0.17856123322866116, + "grad_norm": 2.261296033859253, + "learning_rate": 4.616877948444078e-05, + "loss": 4.5599, + "step": 30024 + }, + { + "epoch": 0.17856718051194215, + "grad_norm": 2.4894723892211914, + "learning_rate": 4.616853098939924e-05, + "loss": 3.9811, + "step": 30025 + }, + { + "epoch": 0.17857312779522314, + "grad_norm": 2.1360697746276855, + "learning_rate": 4.616828248696798e-05, + "loss": 4.2385, + "step": 30026 + }, + { + "epoch": 0.17857907507850415, + "grad_norm": 2.185976505279541, + "learning_rate": 4.61680339771471e-05, + "loss": 4.8738, + "step": 30027 + }, + { + "epoch": 0.17858502236178514, + "grad_norm": 2.6245265007019043, + "learning_rate": 4.6167785459936676e-05, + "loss": 4.2775, + "step": 30028 + }, + { + "epoch": 0.17859096964506613, + "grad_norm": 1.9567252397537231, + "learning_rate": 4.616753693533679e-05, + "loss": 4.7655, + "step": 30029 + }, + { + "epoch": 0.17859691692834714, + "grad_norm": 1.832485556602478, + "learning_rate": 4.616728840334754e-05, + "loss": 5.0603, + "step": 30030 + }, + { + "epoch": 0.17860286421162813, + "grad_norm": 1.8482451438903809, + "learning_rate": 4.6167039863969005e-05, + "loss": 4.6997, + "step": 30031 + }, + { + "epoch": 0.17860881149490912, + "grad_norm": 1.7290279865264893, + "learning_rate": 4.616679131720128e-05, + "loss": 5.1665, + "step": 30032 + }, + { + "epoch": 0.17861475877819014, + "grad_norm": 2.0203309059143066, + "learning_rate": 4.616654276304444e-05, + "loss": 4.5307, + "step": 30033 + }, + { + "epoch": 0.17862070606147112, + "grad_norm": 2.3280582427978516, + "learning_rate": 4.616629420149858e-05, + "loss": 4.1431, + "step": 30034 + }, + { + "epoch": 0.1786266533447521, + "grad_norm": 1.745954155921936, + "learning_rate": 4.616604563256379e-05, + "loss": 5.0325, + "step": 30035 + }, + { + "epoch": 0.17863260062803313, + "grad_norm": 1.4526299238204956, + "learning_rate": 4.616579705624016e-05, + "loss": 5.62, + "step": 30036 + }, + { + "epoch": 0.17863854791131412, + "grad_norm": 1.6712018251419067, + "learning_rate": 4.616554847252775e-05, + "loss": 5.0503, + "step": 30037 + }, + { + "epoch": 0.1786444951945951, + "grad_norm": 1.5935488939285278, + "learning_rate": 4.616529988142668e-05, + "loss": 5.1876, + "step": 30038 + }, + { + "epoch": 0.1786504424778761, + "grad_norm": 1.4841454029083252, + "learning_rate": 4.616505128293701e-05, + "loss": 5.5045, + "step": 30039 + }, + { + "epoch": 0.1786563897611571, + "grad_norm": 1.7214070558547974, + "learning_rate": 4.616480267705885e-05, + "loss": 5.1553, + "step": 30040 + }, + { + "epoch": 0.1786623370444381, + "grad_norm": 1.625107765197754, + "learning_rate": 4.6164554063792277e-05, + "loss": 5.0406, + "step": 30041 + }, + { + "epoch": 0.17866828432771908, + "grad_norm": 1.5284959077835083, + "learning_rate": 4.616430544313737e-05, + "loss": 5.7409, + "step": 30042 + }, + { + "epoch": 0.1786742316110001, + "grad_norm": 1.4745396375656128, + "learning_rate": 4.616405681509423e-05, + "loss": 5.9447, + "step": 30043 + }, + { + "epoch": 0.1786801788942811, + "grad_norm": 1.6352115869522095, + "learning_rate": 4.616380817966293e-05, + "loss": 5.405, + "step": 30044 + }, + { + "epoch": 0.17868612617756208, + "grad_norm": 1.5231393575668335, + "learning_rate": 4.616355953684356e-05, + "loss": 5.8282, + "step": 30045 + }, + { + "epoch": 0.1786920734608431, + "grad_norm": 1.5378559827804565, + "learning_rate": 4.6163310886636216e-05, + "loss": 5.4119, + "step": 30046 + }, + { + "epoch": 0.17869802074412408, + "grad_norm": 1.7744802236557007, + "learning_rate": 4.6163062229040976e-05, + "loss": 4.6722, + "step": 30047 + }, + { + "epoch": 0.17870396802740507, + "grad_norm": 1.6727073192596436, + "learning_rate": 4.616281356405793e-05, + "loss": 4.9073, + "step": 30048 + }, + { + "epoch": 0.17870991531068608, + "grad_norm": 1.7499542236328125, + "learning_rate": 4.616256489168717e-05, + "loss": 5.1734, + "step": 30049 + }, + { + "epoch": 0.17871586259396707, + "grad_norm": 1.6163703203201294, + "learning_rate": 4.616231621192877e-05, + "loss": 5.2352, + "step": 30050 + }, + { + "epoch": 0.17872180987724806, + "grad_norm": 1.8054791688919067, + "learning_rate": 4.6162067524782826e-05, + "loss": 4.7604, + "step": 30051 + }, + { + "epoch": 0.17872775716052908, + "grad_norm": 1.5915356874465942, + "learning_rate": 4.616181883024942e-05, + "loss": 4.9966, + "step": 30052 + }, + { + "epoch": 0.17873370444381007, + "grad_norm": 1.6951193809509277, + "learning_rate": 4.616157012832865e-05, + "loss": 5.2401, + "step": 30053 + }, + { + "epoch": 0.17873965172709105, + "grad_norm": 1.6987075805664062, + "learning_rate": 4.6161321419020584e-05, + "loss": 5.212, + "step": 30054 + }, + { + "epoch": 0.17874559901037207, + "grad_norm": 1.8731896877288818, + "learning_rate": 4.616107270232533e-05, + "loss": 5.1996, + "step": 30055 + }, + { + "epoch": 0.17875154629365306, + "grad_norm": 2.243042469024658, + "learning_rate": 4.6160823978242955e-05, + "loss": 5.0162, + "step": 30056 + }, + { + "epoch": 0.17875749357693405, + "grad_norm": 1.7902021408081055, + "learning_rate": 4.616057524677356e-05, + "loss": 5.046, + "step": 30057 + }, + { + "epoch": 0.17876344086021506, + "grad_norm": 1.591950535774231, + "learning_rate": 4.6160326507917225e-05, + "loss": 5.1167, + "step": 30058 + }, + { + "epoch": 0.17876938814349605, + "grad_norm": 1.8238025903701782, + "learning_rate": 4.616007776167404e-05, + "loss": 4.6017, + "step": 30059 + }, + { + "epoch": 0.17877533542677704, + "grad_norm": 1.719621181488037, + "learning_rate": 4.6159829008044086e-05, + "loss": 4.7455, + "step": 30060 + }, + { + "epoch": 0.17878128271005805, + "grad_norm": 1.6752606630325317, + "learning_rate": 4.6159580247027465e-05, + "loss": 4.965, + "step": 30061 + }, + { + "epoch": 0.17878722999333904, + "grad_norm": 1.6750445365905762, + "learning_rate": 4.615933147862424e-05, + "loss": 5.0824, + "step": 30062 + }, + { + "epoch": 0.17879317727662003, + "grad_norm": 1.6843575239181519, + "learning_rate": 4.615908270283452e-05, + "loss": 4.9444, + "step": 30063 + }, + { + "epoch": 0.17879912455990105, + "grad_norm": 1.3926664590835571, + "learning_rate": 4.6158833919658385e-05, + "loss": 5.8095, + "step": 30064 + }, + { + "epoch": 0.17880507184318203, + "grad_norm": 1.8290106058120728, + "learning_rate": 4.615858512909591e-05, + "loss": 5.1141, + "step": 30065 + }, + { + "epoch": 0.17881101912646302, + "grad_norm": 1.7168490886688232, + "learning_rate": 4.61583363311472e-05, + "loss": 4.4733, + "step": 30066 + }, + { + "epoch": 0.17881696640974404, + "grad_norm": 1.5453979969024658, + "learning_rate": 4.615808752581233e-05, + "loss": 4.3735, + "step": 30067 + }, + { + "epoch": 0.17882291369302503, + "grad_norm": 2.376648426055908, + "learning_rate": 4.615783871309139e-05, + "loss": 4.3287, + "step": 30068 + }, + { + "epoch": 0.17882886097630601, + "grad_norm": 1.7454547882080078, + "learning_rate": 4.615758989298447e-05, + "loss": 4.9139, + "step": 30069 + }, + { + "epoch": 0.17883480825958703, + "grad_norm": 2.697049617767334, + "learning_rate": 4.6157341065491644e-05, + "loss": 4.5206, + "step": 30070 + }, + { + "epoch": 0.17884075554286802, + "grad_norm": 2.183265447616577, + "learning_rate": 4.615709223061302e-05, + "loss": 4.5157, + "step": 30071 + }, + { + "epoch": 0.178846702826149, + "grad_norm": 2.572007179260254, + "learning_rate": 4.615684338834867e-05, + "loss": 4.4589, + "step": 30072 + }, + { + "epoch": 0.17885265010943002, + "grad_norm": 2.5697407722473145, + "learning_rate": 4.6156594538698685e-05, + "loss": 4.4577, + "step": 30073 + }, + { + "epoch": 0.178858597392711, + "grad_norm": 1.9497699737548828, + "learning_rate": 4.615634568166315e-05, + "loss": 4.691, + "step": 30074 + }, + { + "epoch": 0.178864544675992, + "grad_norm": 1.5829882621765137, + "learning_rate": 4.6156096817242154e-05, + "loss": 5.3589, + "step": 30075 + }, + { + "epoch": 0.17887049195927301, + "grad_norm": 1.837938904762268, + "learning_rate": 4.6155847945435785e-05, + "loss": 5.1354, + "step": 30076 + }, + { + "epoch": 0.178876439242554, + "grad_norm": 1.7852935791015625, + "learning_rate": 4.615559906624412e-05, + "loss": 5.3022, + "step": 30077 + }, + { + "epoch": 0.178882386525835, + "grad_norm": 1.8897148370742798, + "learning_rate": 4.615535017966726e-05, + "loss": 5.3356, + "step": 30078 + }, + { + "epoch": 0.178888333809116, + "grad_norm": 1.8716245889663696, + "learning_rate": 4.615510128570529e-05, + "loss": 4.8567, + "step": 30079 + }, + { + "epoch": 0.178894281092397, + "grad_norm": 1.5220413208007812, + "learning_rate": 4.6154852384358286e-05, + "loss": 4.9667, + "step": 30080 + }, + { + "epoch": 0.17890022837567798, + "grad_norm": 1.7942893505096436, + "learning_rate": 4.615460347562635e-05, + "loss": 4.7159, + "step": 30081 + }, + { + "epoch": 0.178906175658959, + "grad_norm": 1.6901856660842896, + "learning_rate": 4.615435455950955e-05, + "loss": 4.7713, + "step": 30082 + }, + { + "epoch": 0.17891212294224, + "grad_norm": 2.3212149143218994, + "learning_rate": 4.615410563600799e-05, + "loss": 4.2903, + "step": 30083 + }, + { + "epoch": 0.17891807022552098, + "grad_norm": 2.5704152584075928, + "learning_rate": 4.6153856705121744e-05, + "loss": 4.0653, + "step": 30084 + }, + { + "epoch": 0.178924017508802, + "grad_norm": 2.5747878551483154, + "learning_rate": 4.6153607766850915e-05, + "loss": 4.176, + "step": 30085 + }, + { + "epoch": 0.17892996479208298, + "grad_norm": 2.633906841278076, + "learning_rate": 4.615335882119557e-05, + "loss": 4.2643, + "step": 30086 + }, + { + "epoch": 0.17893591207536397, + "grad_norm": 2.129531145095825, + "learning_rate": 4.615310986815581e-05, + "loss": 4.2056, + "step": 30087 + }, + { + "epoch": 0.17894185935864498, + "grad_norm": 1.725446343421936, + "learning_rate": 4.615286090773172e-05, + "loss": 5.0645, + "step": 30088 + }, + { + "epoch": 0.17894780664192597, + "grad_norm": 1.3943272829055786, + "learning_rate": 4.6152611939923384e-05, + "loss": 4.9102, + "step": 30089 + }, + { + "epoch": 0.17895375392520696, + "grad_norm": 1.5813884735107422, + "learning_rate": 4.615236296473089e-05, + "loss": 4.9637, + "step": 30090 + }, + { + "epoch": 0.17895970120848798, + "grad_norm": 1.795130968093872, + "learning_rate": 4.6152113982154323e-05, + "loss": 4.4949, + "step": 30091 + }, + { + "epoch": 0.17896564849176896, + "grad_norm": 1.626152753829956, + "learning_rate": 4.615186499219377e-05, + "loss": 4.6137, + "step": 30092 + }, + { + "epoch": 0.17897159577504995, + "grad_norm": 1.7427598237991333, + "learning_rate": 4.6151615994849326e-05, + "loss": 4.6726, + "step": 30093 + }, + { + "epoch": 0.17897754305833097, + "grad_norm": 1.6865589618682861, + "learning_rate": 4.6151366990121065e-05, + "loss": 5.307, + "step": 30094 + }, + { + "epoch": 0.17898349034161196, + "grad_norm": 1.4603716135025024, + "learning_rate": 4.615111797800908e-05, + "loss": 5.0782, + "step": 30095 + }, + { + "epoch": 0.17898943762489294, + "grad_norm": 1.6204586029052734, + "learning_rate": 4.615086895851346e-05, + "loss": 4.6813, + "step": 30096 + }, + { + "epoch": 0.17899538490817393, + "grad_norm": 1.6653324365615845, + "learning_rate": 4.615061993163429e-05, + "loss": 4.9387, + "step": 30097 + }, + { + "epoch": 0.17900133219145495, + "grad_norm": 1.4770258665084839, + "learning_rate": 4.6150370897371664e-05, + "loss": 5.3575, + "step": 30098 + }, + { + "epoch": 0.17900727947473594, + "grad_norm": 1.7126123905181885, + "learning_rate": 4.615012185572565e-05, + "loss": 5.2374, + "step": 30099 + }, + { + "epoch": 0.17901322675801692, + "grad_norm": 1.6398087739944458, + "learning_rate": 4.614987280669635e-05, + "loss": 5.3829, + "step": 30100 + }, + { + "epoch": 0.17901917404129794, + "grad_norm": 1.651924729347229, + "learning_rate": 4.6149623750283854e-05, + "loss": 5.2819, + "step": 30101 + }, + { + "epoch": 0.17902512132457893, + "grad_norm": 1.441523790359497, + "learning_rate": 4.6149374686488245e-05, + "loss": 5.1186, + "step": 30102 + }, + { + "epoch": 0.17903106860785992, + "grad_norm": 1.5080089569091797, + "learning_rate": 4.61491256153096e-05, + "loss": 5.4575, + "step": 30103 + }, + { + "epoch": 0.17903701589114093, + "grad_norm": 1.8069994449615479, + "learning_rate": 4.6148876536748017e-05, + "loss": 4.7502, + "step": 30104 + }, + { + "epoch": 0.17904296317442192, + "grad_norm": 1.6729295253753662, + "learning_rate": 4.6148627450803573e-05, + "loss": 5.6077, + "step": 30105 + }, + { + "epoch": 0.1790489104577029, + "grad_norm": 1.703140377998352, + "learning_rate": 4.614837835747637e-05, + "loss": 5.0991, + "step": 30106 + }, + { + "epoch": 0.17905485774098392, + "grad_norm": 1.6417967081069946, + "learning_rate": 4.614812925676648e-05, + "loss": 5.0502, + "step": 30107 + }, + { + "epoch": 0.1790608050242649, + "grad_norm": 1.5912690162658691, + "learning_rate": 4.6147880148674006e-05, + "loss": 5.1349, + "step": 30108 + }, + { + "epoch": 0.1790667523075459, + "grad_norm": 1.4695717096328735, + "learning_rate": 4.6147631033199026e-05, + "loss": 5.1189, + "step": 30109 + }, + { + "epoch": 0.17907269959082692, + "grad_norm": 1.865962266921997, + "learning_rate": 4.614738191034161e-05, + "loss": 4.9236, + "step": 30110 + }, + { + "epoch": 0.1790786468741079, + "grad_norm": 1.6190448999404907, + "learning_rate": 4.614713278010188e-05, + "loss": 5.6018, + "step": 30111 + }, + { + "epoch": 0.1790845941573889, + "grad_norm": 1.6233062744140625, + "learning_rate": 4.614688364247989e-05, + "loss": 5.5866, + "step": 30112 + }, + { + "epoch": 0.1790905414406699, + "grad_norm": 1.8465989828109741, + "learning_rate": 4.614663449747575e-05, + "loss": 4.9602, + "step": 30113 + }, + { + "epoch": 0.1790964887239509, + "grad_norm": 2.509408950805664, + "learning_rate": 4.614638534508954e-05, + "loss": 4.4523, + "step": 30114 + }, + { + "epoch": 0.17910243600723189, + "grad_norm": 2.0963387489318848, + "learning_rate": 4.6146136185321336e-05, + "loss": 4.6302, + "step": 30115 + }, + { + "epoch": 0.1791083832905129, + "grad_norm": 2.2663495540618896, + "learning_rate": 4.614588701817124e-05, + "loss": 4.4798, + "step": 30116 + }, + { + "epoch": 0.1791143305737939, + "grad_norm": 3.107478380203247, + "learning_rate": 4.6145637843639336e-05, + "loss": 4.4151, + "step": 30117 + }, + { + "epoch": 0.17912027785707488, + "grad_norm": 2.5192575454711914, + "learning_rate": 4.614538866172571e-05, + "loss": 4.3541, + "step": 30118 + }, + { + "epoch": 0.1791262251403559, + "grad_norm": 2.0473275184631348, + "learning_rate": 4.614513947243044e-05, + "loss": 4.9031, + "step": 30119 + }, + { + "epoch": 0.17913217242363688, + "grad_norm": 2.1869711875915527, + "learning_rate": 4.6144890275753614e-05, + "loss": 4.4643, + "step": 30120 + }, + { + "epoch": 0.17913811970691787, + "grad_norm": 2.027974843978882, + "learning_rate": 4.614464107169534e-05, + "loss": 4.4752, + "step": 30121 + }, + { + "epoch": 0.17914406699019889, + "grad_norm": 1.6507370471954346, + "learning_rate": 4.614439186025569e-05, + "loss": 5.1633, + "step": 30122 + }, + { + "epoch": 0.17915001427347987, + "grad_norm": 1.7081741094589233, + "learning_rate": 4.614414264143474e-05, + "loss": 5.2202, + "step": 30123 + }, + { + "epoch": 0.17915596155676086, + "grad_norm": 1.6631501913070679, + "learning_rate": 4.61438934152326e-05, + "loss": 5.3157, + "step": 30124 + }, + { + "epoch": 0.17916190884004188, + "grad_norm": 1.4147378206253052, + "learning_rate": 4.6143644181649336e-05, + "loss": 5.4174, + "step": 30125 + }, + { + "epoch": 0.17916785612332287, + "grad_norm": 2.0424649715423584, + "learning_rate": 4.614339494068505e-05, + "loss": 4.5279, + "step": 30126 + }, + { + "epoch": 0.17917380340660385, + "grad_norm": 1.8058947324752808, + "learning_rate": 4.614314569233982e-05, + "loss": 4.7826, + "step": 30127 + }, + { + "epoch": 0.17917975068988487, + "grad_norm": 2.241539478302002, + "learning_rate": 4.6142896436613735e-05, + "loss": 4.6077, + "step": 30128 + }, + { + "epoch": 0.17918569797316586, + "grad_norm": 2.598933696746826, + "learning_rate": 4.614264717350688e-05, + "loss": 4.4264, + "step": 30129 + }, + { + "epoch": 0.17919164525644685, + "grad_norm": 2.471510887145996, + "learning_rate": 4.614239790301935e-05, + "loss": 4.2513, + "step": 30130 + }, + { + "epoch": 0.17919759253972786, + "grad_norm": 2.7215542793273926, + "learning_rate": 4.6142148625151235e-05, + "loss": 4.3288, + "step": 30131 + }, + { + "epoch": 0.17920353982300885, + "grad_norm": 1.9755866527557373, + "learning_rate": 4.61418993399026e-05, + "loss": 4.5703, + "step": 30132 + }, + { + "epoch": 0.17920948710628984, + "grad_norm": 2.450087070465088, + "learning_rate": 4.614165004727356e-05, + "loss": 4.4569, + "step": 30133 + }, + { + "epoch": 0.17921543438957085, + "grad_norm": 1.7952730655670166, + "learning_rate": 4.614140074726419e-05, + "loss": 5.1308, + "step": 30134 + }, + { + "epoch": 0.17922138167285184, + "grad_norm": 1.4159260988235474, + "learning_rate": 4.614115143987456e-05, + "loss": 5.2613, + "step": 30135 + }, + { + "epoch": 0.17922732895613283, + "grad_norm": 1.546238899230957, + "learning_rate": 4.614090212510478e-05, + "loss": 5.2523, + "step": 30136 + }, + { + "epoch": 0.17923327623941385, + "grad_norm": 1.439784288406372, + "learning_rate": 4.614065280295493e-05, + "loss": 5.3594, + "step": 30137 + }, + { + "epoch": 0.17923922352269483, + "grad_norm": 1.421764612197876, + "learning_rate": 4.6140403473425096e-05, + "loss": 5.1889, + "step": 30138 + }, + { + "epoch": 0.17924517080597582, + "grad_norm": 1.5206106901168823, + "learning_rate": 4.614015413651537e-05, + "loss": 5.1252, + "step": 30139 + }, + { + "epoch": 0.17925111808925684, + "grad_norm": 1.8457632064819336, + "learning_rate": 4.613990479222582e-05, + "loss": 5.2534, + "step": 30140 + }, + { + "epoch": 0.17925706537253783, + "grad_norm": 1.5591540336608887, + "learning_rate": 4.613965544055656e-05, + "loss": 5.5879, + "step": 30141 + }, + { + "epoch": 0.17926301265581882, + "grad_norm": 1.6546518802642822, + "learning_rate": 4.613940608150766e-05, + "loss": 5.4731, + "step": 30142 + }, + { + "epoch": 0.17926895993909983, + "grad_norm": 1.7547178268432617, + "learning_rate": 4.613915671507922e-05, + "loss": 5.2478, + "step": 30143 + }, + { + "epoch": 0.17927490722238082, + "grad_norm": 1.7758798599243164, + "learning_rate": 4.613890734127131e-05, + "loss": 5.2217, + "step": 30144 + }, + { + "epoch": 0.1792808545056618, + "grad_norm": 1.7525664567947388, + "learning_rate": 4.613865796008403e-05, + "loss": 5.1914, + "step": 30145 + }, + { + "epoch": 0.17928680178894282, + "grad_norm": 1.907631754875183, + "learning_rate": 4.6138408571517464e-05, + "loss": 5.4735, + "step": 30146 + }, + { + "epoch": 0.1792927490722238, + "grad_norm": 1.658576250076294, + "learning_rate": 4.6138159175571694e-05, + "loss": 4.9081, + "step": 30147 + }, + { + "epoch": 0.1792986963555048, + "grad_norm": 2.537595272064209, + "learning_rate": 4.613790977224681e-05, + "loss": 4.0906, + "step": 30148 + }, + { + "epoch": 0.17930464363878582, + "grad_norm": 2.0535919666290283, + "learning_rate": 4.613766036154291e-05, + "loss": 4.5155, + "step": 30149 + }, + { + "epoch": 0.1793105909220668, + "grad_norm": 1.588181734085083, + "learning_rate": 4.6137410943460056e-05, + "loss": 5.1572, + "step": 30150 + }, + { + "epoch": 0.1793165382053478, + "grad_norm": 1.74554443359375, + "learning_rate": 4.613716151799836e-05, + "loss": 4.9274, + "step": 30151 + }, + { + "epoch": 0.1793224854886288, + "grad_norm": 1.638634204864502, + "learning_rate": 4.61369120851579e-05, + "loss": 5.3069, + "step": 30152 + }, + { + "epoch": 0.1793284327719098, + "grad_norm": 1.5783028602600098, + "learning_rate": 4.613666264493876e-05, + "loss": 4.9436, + "step": 30153 + }, + { + "epoch": 0.17933438005519078, + "grad_norm": 1.7508025169372559, + "learning_rate": 4.613641319734103e-05, + "loss": 5.5884, + "step": 30154 + }, + { + "epoch": 0.17934032733847177, + "grad_norm": 1.8591163158416748, + "learning_rate": 4.6136163742364794e-05, + "loss": 5.4054, + "step": 30155 + }, + { + "epoch": 0.1793462746217528, + "grad_norm": 1.6123576164245605, + "learning_rate": 4.6135914280010144e-05, + "loss": 5.1458, + "step": 30156 + }, + { + "epoch": 0.17935222190503378, + "grad_norm": 3.3494856357574463, + "learning_rate": 4.613566481027716e-05, + "loss": 5.457, + "step": 30157 + }, + { + "epoch": 0.17935816918831476, + "grad_norm": 1.7815282344818115, + "learning_rate": 4.613541533316594e-05, + "loss": 5.619, + "step": 30158 + }, + { + "epoch": 0.17936411647159578, + "grad_norm": 1.8669323921203613, + "learning_rate": 4.6135165848676567e-05, + "loss": 5.2181, + "step": 30159 + }, + { + "epoch": 0.17937006375487677, + "grad_norm": 2.775512218475342, + "learning_rate": 4.613491635680912e-05, + "loss": 5.274, + "step": 30160 + }, + { + "epoch": 0.17937601103815776, + "grad_norm": 1.3478049039840698, + "learning_rate": 4.613466685756369e-05, + "loss": 5.6895, + "step": 30161 + }, + { + "epoch": 0.17938195832143877, + "grad_norm": 1.3616020679473877, + "learning_rate": 4.6134417350940376e-05, + "loss": 5.4167, + "step": 30162 + }, + { + "epoch": 0.17938790560471976, + "grad_norm": 1.6133387088775635, + "learning_rate": 4.613416783693925e-05, + "loss": 5.4037, + "step": 30163 + }, + { + "epoch": 0.17939385288800075, + "grad_norm": 1.5833585262298584, + "learning_rate": 4.61339183155604e-05, + "loss": 5.5905, + "step": 30164 + }, + { + "epoch": 0.17939980017128176, + "grad_norm": 1.5497944355010986, + "learning_rate": 4.613366878680392e-05, + "loss": 5.5072, + "step": 30165 + }, + { + "epoch": 0.17940574745456275, + "grad_norm": 1.4450465440750122, + "learning_rate": 4.6133419250669893e-05, + "loss": 5.391, + "step": 30166 + }, + { + "epoch": 0.17941169473784374, + "grad_norm": 1.4759451150894165, + "learning_rate": 4.6133169707158415e-05, + "loss": 5.4819, + "step": 30167 + }, + { + "epoch": 0.17941764202112476, + "grad_norm": 1.576032280921936, + "learning_rate": 4.613292015626956e-05, + "loss": 5.0684, + "step": 30168 + }, + { + "epoch": 0.17942358930440575, + "grad_norm": 1.3601480722427368, + "learning_rate": 4.613267059800342e-05, + "loss": 5.3811, + "step": 30169 + }, + { + "epoch": 0.17942953658768673, + "grad_norm": 1.4551454782485962, + "learning_rate": 4.6132421032360084e-05, + "loss": 5.3851, + "step": 30170 + }, + { + "epoch": 0.17943548387096775, + "grad_norm": 1.425933837890625, + "learning_rate": 4.613217145933964e-05, + "loss": 5.2831, + "step": 30171 + }, + { + "epoch": 0.17944143115424874, + "grad_norm": 1.53054678440094, + "learning_rate": 4.613192187894218e-05, + "loss": 5.3126, + "step": 30172 + }, + { + "epoch": 0.17944737843752973, + "grad_norm": 1.5513275861740112, + "learning_rate": 4.613167229116777e-05, + "loss": 5.1566, + "step": 30173 + }, + { + "epoch": 0.17945332572081074, + "grad_norm": 1.659415364265442, + "learning_rate": 4.613142269601652e-05, + "loss": 5.2562, + "step": 30174 + }, + { + "epoch": 0.17945927300409173, + "grad_norm": 2.1108832359313965, + "learning_rate": 4.6131173093488506e-05, + "loss": 5.2514, + "step": 30175 + }, + { + "epoch": 0.17946522028737272, + "grad_norm": 2.187035083770752, + "learning_rate": 4.613092348358382e-05, + "loss": 5.0864, + "step": 30176 + }, + { + "epoch": 0.17947116757065373, + "grad_norm": 1.9420459270477295, + "learning_rate": 4.613067386630254e-05, + "loss": 5.1925, + "step": 30177 + }, + { + "epoch": 0.17947711485393472, + "grad_norm": 2.1403605937957764, + "learning_rate": 4.6130424241644765e-05, + "loss": 5.104, + "step": 30178 + }, + { + "epoch": 0.1794830621372157, + "grad_norm": 1.567936897277832, + "learning_rate": 4.6130174609610584e-05, + "loss": 5.2187, + "step": 30179 + }, + { + "epoch": 0.17948900942049673, + "grad_norm": 1.7955834865570068, + "learning_rate": 4.612992497020007e-05, + "loss": 4.9966, + "step": 30180 + }, + { + "epoch": 0.1794949567037777, + "grad_norm": 1.5525354146957397, + "learning_rate": 4.612967532341332e-05, + "loss": 5.4877, + "step": 30181 + }, + { + "epoch": 0.1795009039870587, + "grad_norm": 2.041837692260742, + "learning_rate": 4.6129425669250416e-05, + "loss": 4.1008, + "step": 30182 + }, + { + "epoch": 0.17950685127033972, + "grad_norm": 1.7052921056747437, + "learning_rate": 4.612917600771145e-05, + "loss": 5.0677, + "step": 30183 + }, + { + "epoch": 0.1795127985536207, + "grad_norm": 1.8973312377929688, + "learning_rate": 4.6128926338796505e-05, + "loss": 5.0398, + "step": 30184 + }, + { + "epoch": 0.1795187458369017, + "grad_norm": 1.696648120880127, + "learning_rate": 4.612867666250567e-05, + "loss": 4.9444, + "step": 30185 + }, + { + "epoch": 0.1795246931201827, + "grad_norm": 1.3013113737106323, + "learning_rate": 4.6128426978839034e-05, + "loss": 5.2848, + "step": 30186 + }, + { + "epoch": 0.1795306404034637, + "grad_norm": 1.6199315786361694, + "learning_rate": 4.612817728779668e-05, + "loss": 5.3178, + "step": 30187 + }, + { + "epoch": 0.1795365876867447, + "grad_norm": 1.5402096509933472, + "learning_rate": 4.612792758937871e-05, + "loss": 5.3269, + "step": 30188 + }, + { + "epoch": 0.1795425349700257, + "grad_norm": 1.4913876056671143, + "learning_rate": 4.612767788358518e-05, + "loss": 5.1784, + "step": 30189 + }, + { + "epoch": 0.1795484822533067, + "grad_norm": 1.6965476274490356, + "learning_rate": 4.6127428170416203e-05, + "loss": 4.9591, + "step": 30190 + }, + { + "epoch": 0.17955442953658768, + "grad_norm": 1.71049964427948, + "learning_rate": 4.612717844987186e-05, + "loss": 5.6287, + "step": 30191 + }, + { + "epoch": 0.1795603768198687, + "grad_norm": 1.6330054998397827, + "learning_rate": 4.612692872195224e-05, + "loss": 4.9837, + "step": 30192 + }, + { + "epoch": 0.17956632410314968, + "grad_norm": 1.6912428140640259, + "learning_rate": 4.6126678986657424e-05, + "loss": 5.2847, + "step": 30193 + }, + { + "epoch": 0.17957227138643067, + "grad_norm": 1.4812456369400024, + "learning_rate": 4.61264292439875e-05, + "loss": 5.1037, + "step": 30194 + }, + { + "epoch": 0.1795782186697117, + "grad_norm": 1.8161237239837646, + "learning_rate": 4.612617949394257e-05, + "loss": 5.141, + "step": 30195 + }, + { + "epoch": 0.17958416595299267, + "grad_norm": 1.7827249765396118, + "learning_rate": 4.61259297365227e-05, + "loss": 5.184, + "step": 30196 + }, + { + "epoch": 0.17959011323627366, + "grad_norm": 2.4642884731292725, + "learning_rate": 4.612567997172798e-05, + "loss": 3.7194, + "step": 30197 + }, + { + "epoch": 0.17959606051955468, + "grad_norm": 1.680345892906189, + "learning_rate": 4.6125430199558515e-05, + "loss": 4.8509, + "step": 30198 + }, + { + "epoch": 0.17960200780283567, + "grad_norm": 1.510986566543579, + "learning_rate": 4.612518042001437e-05, + "loss": 5.4374, + "step": 30199 + }, + { + "epoch": 0.17960795508611666, + "grad_norm": 1.333565592765808, + "learning_rate": 4.612493063309565e-05, + "loss": 5.5605, + "step": 30200 + }, + { + "epoch": 0.17961390236939767, + "grad_norm": 1.5686737298965454, + "learning_rate": 4.612468083880244e-05, + "loss": 5.5275, + "step": 30201 + }, + { + "epoch": 0.17961984965267866, + "grad_norm": 1.4697351455688477, + "learning_rate": 4.6124431037134805e-05, + "loss": 5.2846, + "step": 30202 + }, + { + "epoch": 0.17962579693595965, + "grad_norm": 1.553658127784729, + "learning_rate": 4.612418122809286e-05, + "loss": 5.339, + "step": 30203 + }, + { + "epoch": 0.17963174421924066, + "grad_norm": 1.8520125150680542, + "learning_rate": 4.612393141167669e-05, + "loss": 4.5566, + "step": 30204 + }, + { + "epoch": 0.17963769150252165, + "grad_norm": 1.694443702697754, + "learning_rate": 4.6123681587886356e-05, + "loss": 5.094, + "step": 30205 + }, + { + "epoch": 0.17964363878580264, + "grad_norm": 1.6626062393188477, + "learning_rate": 4.612343175672198e-05, + "loss": 5.0872, + "step": 30206 + }, + { + "epoch": 0.17964958606908366, + "grad_norm": 1.8352187871932983, + "learning_rate": 4.612318191818362e-05, + "loss": 4.0188, + "step": 30207 + }, + { + "epoch": 0.17965553335236464, + "grad_norm": 1.8127634525299072, + "learning_rate": 4.6122932072271385e-05, + "loss": 4.3894, + "step": 30208 + }, + { + "epoch": 0.17966148063564563, + "grad_norm": 1.7093063592910767, + "learning_rate": 4.612268221898535e-05, + "loss": 4.9074, + "step": 30209 + }, + { + "epoch": 0.17966742791892665, + "grad_norm": 1.5482558012008667, + "learning_rate": 4.61224323583256e-05, + "loss": 4.9126, + "step": 30210 + }, + { + "epoch": 0.17967337520220764, + "grad_norm": 1.6215821504592896, + "learning_rate": 4.612218249029223e-05, + "loss": 4.9809, + "step": 30211 + }, + { + "epoch": 0.17967932248548862, + "grad_norm": 1.4449799060821533, + "learning_rate": 4.6121932614885324e-05, + "loss": 4.6292, + "step": 30212 + }, + { + "epoch": 0.17968526976876964, + "grad_norm": 1.6439566612243652, + "learning_rate": 4.612168273210496e-05, + "loss": 4.7304, + "step": 30213 + }, + { + "epoch": 0.17969121705205063, + "grad_norm": 1.566293716430664, + "learning_rate": 4.6121432841951254e-05, + "loss": 4.7359, + "step": 30214 + }, + { + "epoch": 0.17969716433533162, + "grad_norm": 1.3864619731903076, + "learning_rate": 4.612118294442426e-05, + "loss": 4.737, + "step": 30215 + }, + { + "epoch": 0.1797031116186126, + "grad_norm": 1.5013184547424316, + "learning_rate": 4.6120933039524087e-05, + "loss": 4.2283, + "step": 30216 + }, + { + "epoch": 0.17970905890189362, + "grad_norm": 1.6304489374160767, + "learning_rate": 4.612068312725081e-05, + "loss": 4.6443, + "step": 30217 + }, + { + "epoch": 0.1797150061851746, + "grad_norm": 1.512584924697876, + "learning_rate": 4.612043320760452e-05, + "loss": 4.6994, + "step": 30218 + }, + { + "epoch": 0.1797209534684556, + "grad_norm": 1.726282000541687, + "learning_rate": 4.612018328058531e-05, + "loss": 4.7797, + "step": 30219 + }, + { + "epoch": 0.1797269007517366, + "grad_norm": 1.9200310707092285, + "learning_rate": 4.611993334619326e-05, + "loss": 4.3433, + "step": 30220 + }, + { + "epoch": 0.1797328480350176, + "grad_norm": 1.6189771890640259, + "learning_rate": 4.611968340442845e-05, + "loss": 4.8367, + "step": 30221 + }, + { + "epoch": 0.1797387953182986, + "grad_norm": 1.6399370431900024, + "learning_rate": 4.6119433455290985e-05, + "loss": 5.169, + "step": 30222 + }, + { + "epoch": 0.1797447426015796, + "grad_norm": 1.8152503967285156, + "learning_rate": 4.611918349878095e-05, + "loss": 5.4301, + "step": 30223 + }, + { + "epoch": 0.1797506898848606, + "grad_norm": 1.5096112489700317, + "learning_rate": 4.611893353489841e-05, + "loss": 5.3012, + "step": 30224 + }, + { + "epoch": 0.17975663716814158, + "grad_norm": 1.5811582803726196, + "learning_rate": 4.611868356364348e-05, + "loss": 4.65, + "step": 30225 + }, + { + "epoch": 0.1797625844514226, + "grad_norm": 1.6519943475723267, + "learning_rate": 4.611843358501624e-05, + "loss": 5.1057, + "step": 30226 + }, + { + "epoch": 0.17976853173470358, + "grad_norm": 1.5644969940185547, + "learning_rate": 4.611818359901676e-05, + "loss": 5.3185, + "step": 30227 + }, + { + "epoch": 0.17977447901798457, + "grad_norm": 1.344948410987854, + "learning_rate": 4.611793360564515e-05, + "loss": 5.2657, + "step": 30228 + }, + { + "epoch": 0.1797804263012656, + "grad_norm": 1.5945618152618408, + "learning_rate": 4.6117683604901485e-05, + "loss": 5.197, + "step": 30229 + }, + { + "epoch": 0.17978637358454658, + "grad_norm": 1.41254460811615, + "learning_rate": 4.6117433596785855e-05, + "loss": 5.1255, + "step": 30230 + }, + { + "epoch": 0.17979232086782757, + "grad_norm": 1.7176563739776611, + "learning_rate": 4.611718358129835e-05, + "loss": 4.9022, + "step": 30231 + }, + { + "epoch": 0.17979826815110858, + "grad_norm": 1.7248926162719727, + "learning_rate": 4.611693355843905e-05, + "loss": 4.9377, + "step": 30232 + }, + { + "epoch": 0.17980421543438957, + "grad_norm": 1.551627516746521, + "learning_rate": 4.611668352820805e-05, + "loss": 5.2027, + "step": 30233 + }, + { + "epoch": 0.17981016271767056, + "grad_norm": 1.5485448837280273, + "learning_rate": 4.6116433490605435e-05, + "loss": 5.2818, + "step": 30234 + }, + { + "epoch": 0.17981611000095157, + "grad_norm": 1.5185739994049072, + "learning_rate": 4.611618344563129e-05, + "loss": 5.2565, + "step": 30235 + }, + { + "epoch": 0.17982205728423256, + "grad_norm": 1.3627973794937134, + "learning_rate": 4.61159333932857e-05, + "loss": 5.227, + "step": 30236 + }, + { + "epoch": 0.17982800456751355, + "grad_norm": 1.1721487045288086, + "learning_rate": 4.611568333356876e-05, + "loss": 5.244, + "step": 30237 + }, + { + "epoch": 0.17983395185079457, + "grad_norm": 1.4845436811447144, + "learning_rate": 4.611543326648055e-05, + "loss": 5.0118, + "step": 30238 + }, + { + "epoch": 0.17983989913407555, + "grad_norm": 1.733625888824463, + "learning_rate": 4.611518319202116e-05, + "loss": 5.2007, + "step": 30239 + }, + { + "epoch": 0.17984584641735654, + "grad_norm": 1.570659875869751, + "learning_rate": 4.611493311019068e-05, + "loss": 5.1015, + "step": 30240 + }, + { + "epoch": 0.17985179370063756, + "grad_norm": 1.5972294807434082, + "learning_rate": 4.611468302098919e-05, + "loss": 5.0667, + "step": 30241 + }, + { + "epoch": 0.17985774098391855, + "grad_norm": 1.5389827489852905, + "learning_rate": 4.611443292441678e-05, + "loss": 5.1393, + "step": 30242 + }, + { + "epoch": 0.17986368826719953, + "grad_norm": 1.778263807296753, + "learning_rate": 4.611418282047355e-05, + "loss": 4.6927, + "step": 30243 + }, + { + "epoch": 0.17986963555048055, + "grad_norm": 1.642376184463501, + "learning_rate": 4.611393270915958e-05, + "loss": 4.9779, + "step": 30244 + }, + { + "epoch": 0.17987558283376154, + "grad_norm": 1.7385129928588867, + "learning_rate": 4.611368259047494e-05, + "loss": 5.0129, + "step": 30245 + }, + { + "epoch": 0.17988153011704253, + "grad_norm": 1.5934865474700928, + "learning_rate": 4.6113432464419734e-05, + "loss": 5.151, + "step": 30246 + }, + { + "epoch": 0.17988747740032354, + "grad_norm": 1.6236854791641235, + "learning_rate": 4.611318233099406e-05, + "loss": 4.915, + "step": 30247 + }, + { + "epoch": 0.17989342468360453, + "grad_norm": 1.553943157196045, + "learning_rate": 4.611293219019798e-05, + "loss": 4.9386, + "step": 30248 + }, + { + "epoch": 0.17989937196688552, + "grad_norm": 1.597655177116394, + "learning_rate": 4.61126820420316e-05, + "loss": 5.0225, + "step": 30249 + }, + { + "epoch": 0.17990531925016653, + "grad_norm": 1.4023799896240234, + "learning_rate": 4.6112431886495e-05, + "loss": 4.8045, + "step": 30250 + }, + { + "epoch": 0.17991126653344752, + "grad_norm": 1.4906047582626343, + "learning_rate": 4.611218172358826e-05, + "loss": 5.0927, + "step": 30251 + }, + { + "epoch": 0.1799172138167285, + "grad_norm": 1.5440434217453003, + "learning_rate": 4.6111931553311486e-05, + "loss": 4.8847, + "step": 30252 + }, + { + "epoch": 0.17992316110000953, + "grad_norm": 1.5937246084213257, + "learning_rate": 4.611168137566475e-05, + "loss": 4.8006, + "step": 30253 + }, + { + "epoch": 0.17992910838329051, + "grad_norm": 1.913120150566101, + "learning_rate": 4.611143119064814e-05, + "loss": 4.3196, + "step": 30254 + }, + { + "epoch": 0.1799350556665715, + "grad_norm": 2.267242908477783, + "learning_rate": 4.6111180998261754e-05, + "loss": 3.9084, + "step": 30255 + }, + { + "epoch": 0.17994100294985252, + "grad_norm": 1.9298279285430908, + "learning_rate": 4.611093079850567e-05, + "loss": 4.8176, + "step": 30256 + }, + { + "epoch": 0.1799469502331335, + "grad_norm": 2.0990922451019287, + "learning_rate": 4.6110680591379977e-05, + "loss": 3.7105, + "step": 30257 + }, + { + "epoch": 0.1799528975164145, + "grad_norm": 2.2702863216400146, + "learning_rate": 4.611043037688477e-05, + "loss": 3.67, + "step": 30258 + }, + { + "epoch": 0.1799588447996955, + "grad_norm": 1.7797553539276123, + "learning_rate": 4.6110180155020124e-05, + "loss": 4.2347, + "step": 30259 + }, + { + "epoch": 0.1799647920829765, + "grad_norm": 1.827901005744934, + "learning_rate": 4.610992992578613e-05, + "loss": 4.8346, + "step": 30260 + }, + { + "epoch": 0.1799707393662575, + "grad_norm": 1.8111793994903564, + "learning_rate": 4.610967968918288e-05, + "loss": 4.429, + "step": 30261 + }, + { + "epoch": 0.1799766866495385, + "grad_norm": 1.7809714078903198, + "learning_rate": 4.610942944521046e-05, + "loss": 4.8362, + "step": 30262 + }, + { + "epoch": 0.1799826339328195, + "grad_norm": 1.7556761503219604, + "learning_rate": 4.610917919386895e-05, + "loss": 4.5426, + "step": 30263 + }, + { + "epoch": 0.17998858121610048, + "grad_norm": 2.094663381576538, + "learning_rate": 4.6108928935158457e-05, + "loss": 3.9912, + "step": 30264 + }, + { + "epoch": 0.1799945284993815, + "grad_norm": 2.4183871746063232, + "learning_rate": 4.610867866907905e-05, + "loss": 3.5367, + "step": 30265 + }, + { + "epoch": 0.18000047578266248, + "grad_norm": 1.9817161560058594, + "learning_rate": 4.610842839563082e-05, + "loss": 4.1249, + "step": 30266 + }, + { + "epoch": 0.18000642306594347, + "grad_norm": 1.8227890729904175, + "learning_rate": 4.610817811481385e-05, + "loss": 4.729, + "step": 30267 + }, + { + "epoch": 0.1800123703492245, + "grad_norm": 1.8719122409820557, + "learning_rate": 4.610792782662824e-05, + "loss": 4.6823, + "step": 30268 + }, + { + "epoch": 0.18001831763250548, + "grad_norm": 1.8727600574493408, + "learning_rate": 4.6107677531074075e-05, + "loss": 4.2555, + "step": 30269 + }, + { + "epoch": 0.18002426491578646, + "grad_norm": 1.526989221572876, + "learning_rate": 4.610742722815143e-05, + "loss": 4.9572, + "step": 30270 + }, + { + "epoch": 0.18003021219906748, + "grad_norm": 1.7702364921569824, + "learning_rate": 4.61071769178604e-05, + "loss": 4.6608, + "step": 30271 + }, + { + "epoch": 0.18003615948234847, + "grad_norm": 1.7519408464431763, + "learning_rate": 4.610692660020107e-05, + "loss": 4.6615, + "step": 30272 + }, + { + "epoch": 0.18004210676562946, + "grad_norm": 1.6772125959396362, + "learning_rate": 4.610667627517354e-05, + "loss": 4.5468, + "step": 30273 + }, + { + "epoch": 0.18004805404891044, + "grad_norm": 1.8781254291534424, + "learning_rate": 4.610642594277788e-05, + "loss": 4.4068, + "step": 30274 + }, + { + "epoch": 0.18005400133219146, + "grad_norm": 1.6861200332641602, + "learning_rate": 4.610617560301419e-05, + "loss": 4.5556, + "step": 30275 + }, + { + "epoch": 0.18005994861547245, + "grad_norm": 1.7441620826721191, + "learning_rate": 4.6105925255882545e-05, + "loss": 4.7047, + "step": 30276 + }, + { + "epoch": 0.18006589589875344, + "grad_norm": 1.5301376581192017, + "learning_rate": 4.6105674901383044e-05, + "loss": 4.7173, + "step": 30277 + }, + { + "epoch": 0.18007184318203445, + "grad_norm": 1.9823702573776245, + "learning_rate": 4.6105424539515765e-05, + "loss": 4.6475, + "step": 30278 + }, + { + "epoch": 0.18007779046531544, + "grad_norm": 1.7281779050827026, + "learning_rate": 4.6105174170280805e-05, + "loss": 4.5832, + "step": 30279 + }, + { + "epoch": 0.18008373774859643, + "grad_norm": 1.5739697217941284, + "learning_rate": 4.610492379367824e-05, + "loss": 4.6732, + "step": 30280 + }, + { + "epoch": 0.18008968503187744, + "grad_norm": 1.771346092224121, + "learning_rate": 4.6104673409708175e-05, + "loss": 4.4008, + "step": 30281 + }, + { + "epoch": 0.18009563231515843, + "grad_norm": 2.198194980621338, + "learning_rate": 4.610442301837068e-05, + "loss": 4.2249, + "step": 30282 + }, + { + "epoch": 0.18010157959843942, + "grad_norm": 1.7576837539672852, + "learning_rate": 4.610417261966585e-05, + "loss": 4.9569, + "step": 30283 + }, + { + "epoch": 0.18010752688172044, + "grad_norm": 1.849458932876587, + "learning_rate": 4.6103922213593775e-05, + "loss": 4.2704, + "step": 30284 + }, + { + "epoch": 0.18011347416500142, + "grad_norm": 1.8416085243225098, + "learning_rate": 4.610367180015454e-05, + "loss": 4.2699, + "step": 30285 + }, + { + "epoch": 0.1801194214482824, + "grad_norm": 1.8305091857910156, + "learning_rate": 4.610342137934822e-05, + "loss": 4.2922, + "step": 30286 + }, + { + "epoch": 0.18012536873156343, + "grad_norm": 2.0292394161224365, + "learning_rate": 4.6103170951174924e-05, + "loss": 4.1851, + "step": 30287 + }, + { + "epoch": 0.18013131601484442, + "grad_norm": 1.7172123193740845, + "learning_rate": 4.610292051563473e-05, + "loss": 4.4749, + "step": 30288 + }, + { + "epoch": 0.1801372632981254, + "grad_norm": 1.787654995918274, + "learning_rate": 4.610267007272772e-05, + "loss": 4.143, + "step": 30289 + }, + { + "epoch": 0.18014321058140642, + "grad_norm": 1.9985861778259277, + "learning_rate": 4.6102419622453985e-05, + "loss": 4.6417, + "step": 30290 + }, + { + "epoch": 0.1801491578646874, + "grad_norm": 1.7196992635726929, + "learning_rate": 4.610216916481361e-05, + "loss": 5.1097, + "step": 30291 + }, + { + "epoch": 0.1801551051479684, + "grad_norm": 1.5344418287277222, + "learning_rate": 4.610191869980669e-05, + "loss": 5.011, + "step": 30292 + }, + { + "epoch": 0.1801610524312494, + "grad_norm": 2.262801170349121, + "learning_rate": 4.610166822743331e-05, + "loss": 4.302, + "step": 30293 + }, + { + "epoch": 0.1801669997145304, + "grad_norm": 1.6699048280715942, + "learning_rate": 4.610141774769355e-05, + "loss": 4.6451, + "step": 30294 + }, + { + "epoch": 0.1801729469978114, + "grad_norm": 1.636252999305725, + "learning_rate": 4.6101167260587506e-05, + "loss": 4.5226, + "step": 30295 + }, + { + "epoch": 0.1801788942810924, + "grad_norm": 1.6654448509216309, + "learning_rate": 4.610091676611527e-05, + "loss": 4.8778, + "step": 30296 + }, + { + "epoch": 0.1801848415643734, + "grad_norm": 1.832134485244751, + "learning_rate": 4.610066626427691e-05, + "loss": 4.8301, + "step": 30297 + }, + { + "epoch": 0.18019078884765438, + "grad_norm": 1.5756455659866333, + "learning_rate": 4.6100415755072536e-05, + "loss": 4.8757, + "step": 30298 + }, + { + "epoch": 0.1801967361309354, + "grad_norm": 1.5991398096084595, + "learning_rate": 4.610016523850222e-05, + "loss": 4.8261, + "step": 30299 + }, + { + "epoch": 0.18020268341421639, + "grad_norm": 1.5322027206420898, + "learning_rate": 4.609991471456605e-05, + "loss": 5.0081, + "step": 30300 + }, + { + "epoch": 0.18020863069749737, + "grad_norm": 1.6513683795928955, + "learning_rate": 4.6099664183264126e-05, + "loss": 4.8251, + "step": 30301 + }, + { + "epoch": 0.1802145779807784, + "grad_norm": 2.100013494491577, + "learning_rate": 4.609941364459652e-05, + "loss": 4.2063, + "step": 30302 + }, + { + "epoch": 0.18022052526405938, + "grad_norm": 1.8772211074829102, + "learning_rate": 4.609916309856333e-05, + "loss": 5.0742, + "step": 30303 + }, + { + "epoch": 0.18022647254734037, + "grad_norm": 1.5628682374954224, + "learning_rate": 4.609891254516464e-05, + "loss": 5.0105, + "step": 30304 + }, + { + "epoch": 0.18023241983062138, + "grad_norm": 1.633851170539856, + "learning_rate": 4.6098661984400535e-05, + "loss": 4.8846, + "step": 30305 + }, + { + "epoch": 0.18023836711390237, + "grad_norm": 1.6528682708740234, + "learning_rate": 4.609841141627111e-05, + "loss": 4.9063, + "step": 30306 + }, + { + "epoch": 0.18024431439718336, + "grad_norm": 1.699247121810913, + "learning_rate": 4.609816084077645e-05, + "loss": 4.5751, + "step": 30307 + }, + { + "epoch": 0.18025026168046437, + "grad_norm": 1.6774038076400757, + "learning_rate": 4.609791025791663e-05, + "loss": 4.5651, + "step": 30308 + }, + { + "epoch": 0.18025620896374536, + "grad_norm": 1.695169448852539, + "learning_rate": 4.609765966769175e-05, + "loss": 4.5995, + "step": 30309 + }, + { + "epoch": 0.18026215624702635, + "grad_norm": 1.851489543914795, + "learning_rate": 4.6097409070101905e-05, + "loss": 4.6826, + "step": 30310 + }, + { + "epoch": 0.18026810353030737, + "grad_norm": 1.683112382888794, + "learning_rate": 4.609715846514716e-05, + "loss": 4.3293, + "step": 30311 + }, + { + "epoch": 0.18027405081358835, + "grad_norm": 1.5318275690078735, + "learning_rate": 4.609690785282762e-05, + "loss": 5.6662, + "step": 30312 + }, + { + "epoch": 0.18027999809686934, + "grad_norm": 2.2105138301849365, + "learning_rate": 4.609665723314337e-05, + "loss": 3.9675, + "step": 30313 + }, + { + "epoch": 0.18028594538015036, + "grad_norm": 1.7841753959655762, + "learning_rate": 4.609640660609449e-05, + "loss": 4.4832, + "step": 30314 + }, + { + "epoch": 0.18029189266343135, + "grad_norm": 1.7051490545272827, + "learning_rate": 4.6096155971681073e-05, + "loss": 4.3786, + "step": 30315 + }, + { + "epoch": 0.18029783994671233, + "grad_norm": 1.798112392425537, + "learning_rate": 4.609590532990321e-05, + "loss": 4.4915, + "step": 30316 + }, + { + "epoch": 0.18030378722999335, + "grad_norm": 1.8255062103271484, + "learning_rate": 4.6095654680760983e-05, + "loss": 4.6701, + "step": 30317 + }, + { + "epoch": 0.18030973451327434, + "grad_norm": 2.376105308532715, + "learning_rate": 4.609540402425448e-05, + "loss": 4.5314, + "step": 30318 + }, + { + "epoch": 0.18031568179655533, + "grad_norm": 1.6199541091918945, + "learning_rate": 4.609515336038379e-05, + "loss": 4.8641, + "step": 30319 + }, + { + "epoch": 0.18032162907983634, + "grad_norm": 1.8655678033828735, + "learning_rate": 4.6094902689149e-05, + "loss": 4.9883, + "step": 30320 + }, + { + "epoch": 0.18032757636311733, + "grad_norm": 1.6049344539642334, + "learning_rate": 4.6094652010550195e-05, + "loss": 5.0508, + "step": 30321 + }, + { + "epoch": 0.18033352364639832, + "grad_norm": 1.4725605249404907, + "learning_rate": 4.6094401324587464e-05, + "loss": 4.9306, + "step": 30322 + }, + { + "epoch": 0.18033947092967934, + "grad_norm": 1.4839946031570435, + "learning_rate": 4.60941506312609e-05, + "loss": 4.85, + "step": 30323 + }, + { + "epoch": 0.18034541821296032, + "grad_norm": 1.54611074924469, + "learning_rate": 4.609389993057058e-05, + "loss": 4.9655, + "step": 30324 + }, + { + "epoch": 0.1803513654962413, + "grad_norm": 1.612251877784729, + "learning_rate": 4.609364922251661e-05, + "loss": 4.4827, + "step": 30325 + }, + { + "epoch": 0.18035731277952233, + "grad_norm": 1.3921014070510864, + "learning_rate": 4.609339850709905e-05, + "loss": 4.609, + "step": 30326 + }, + { + "epoch": 0.18036326006280332, + "grad_norm": 1.7824617624282837, + "learning_rate": 4.6093147784318014e-05, + "loss": 4.7485, + "step": 30327 + }, + { + "epoch": 0.1803692073460843, + "grad_norm": 1.5730568170547485, + "learning_rate": 4.609289705417357e-05, + "loss": 5.1987, + "step": 30328 + }, + { + "epoch": 0.18037515462936532, + "grad_norm": 1.445325493812561, + "learning_rate": 4.6092646316665814e-05, + "loss": 5.0233, + "step": 30329 + }, + { + "epoch": 0.1803811019126463, + "grad_norm": 1.4553011655807495, + "learning_rate": 4.609239557179484e-05, + "loss": 5.1624, + "step": 30330 + }, + { + "epoch": 0.1803870491959273, + "grad_norm": 1.7723554372787476, + "learning_rate": 4.609214481956072e-05, + "loss": 5.3372, + "step": 30331 + }, + { + "epoch": 0.18039299647920828, + "grad_norm": 1.5231170654296875, + "learning_rate": 4.609189405996356e-05, + "loss": 5.3817, + "step": 30332 + }, + { + "epoch": 0.1803989437624893, + "grad_norm": 1.5292000770568848, + "learning_rate": 4.609164329300343e-05, + "loss": 5.1899, + "step": 30333 + }, + { + "epoch": 0.1804048910457703, + "grad_norm": 2.4883272647857666, + "learning_rate": 4.6091392518680424e-05, + "loss": 4.5457, + "step": 30334 + }, + { + "epoch": 0.18041083832905128, + "grad_norm": 2.357412815093994, + "learning_rate": 4.6091141736994635e-05, + "loss": 4.4771, + "step": 30335 + }, + { + "epoch": 0.1804167856123323, + "grad_norm": 1.6708316802978516, + "learning_rate": 4.6090890947946144e-05, + "loss": 4.8245, + "step": 30336 + }, + { + "epoch": 0.18042273289561328, + "grad_norm": 1.7885435819625854, + "learning_rate": 4.6090640151535046e-05, + "loss": 4.7263, + "step": 30337 + }, + { + "epoch": 0.18042868017889427, + "grad_norm": 2.1166250705718994, + "learning_rate": 4.609038934776142e-05, + "loss": 4.8725, + "step": 30338 + }, + { + "epoch": 0.18043462746217528, + "grad_norm": 1.6104192733764648, + "learning_rate": 4.609013853662536e-05, + "loss": 5.0208, + "step": 30339 + }, + { + "epoch": 0.18044057474545627, + "grad_norm": 1.663496494293213, + "learning_rate": 4.6089887718126945e-05, + "loss": 5.0706, + "step": 30340 + }, + { + "epoch": 0.18044652202873726, + "grad_norm": 1.571781873703003, + "learning_rate": 4.608963689226627e-05, + "loss": 5.0953, + "step": 30341 + }, + { + "epoch": 0.18045246931201828, + "grad_norm": 1.6184124946594238, + "learning_rate": 4.6089386059043415e-05, + "loss": 5.0428, + "step": 30342 + }, + { + "epoch": 0.18045841659529926, + "grad_norm": 2.4237656593322754, + "learning_rate": 4.608913521845848e-05, + "loss": 3.7821, + "step": 30343 + }, + { + "epoch": 0.18046436387858025, + "grad_norm": 2.287548065185547, + "learning_rate": 4.6088884370511545e-05, + "loss": 3.7935, + "step": 30344 + }, + { + "epoch": 0.18047031116186127, + "grad_norm": 2.1035749912261963, + "learning_rate": 4.60886335152027e-05, + "loss": 3.6729, + "step": 30345 + }, + { + "epoch": 0.18047625844514226, + "grad_norm": 1.9365202188491821, + "learning_rate": 4.608838265253203e-05, + "loss": 3.5706, + "step": 30346 + }, + { + "epoch": 0.18048220572842325, + "grad_norm": 1.8482760190963745, + "learning_rate": 4.608813178249962e-05, + "loss": 3.7941, + "step": 30347 + }, + { + "epoch": 0.18048815301170426, + "grad_norm": 1.879911184310913, + "learning_rate": 4.608788090510557e-05, + "loss": 4.0947, + "step": 30348 + }, + { + "epoch": 0.18049410029498525, + "grad_norm": 1.9760171175003052, + "learning_rate": 4.608763002034995e-05, + "loss": 3.6721, + "step": 30349 + }, + { + "epoch": 0.18050004757826624, + "grad_norm": 1.85044264793396, + "learning_rate": 4.608737912823286e-05, + "loss": 3.6058, + "step": 30350 + }, + { + "epoch": 0.18050599486154725, + "grad_norm": 1.7919642925262451, + "learning_rate": 4.6087128228754384e-05, + "loss": 3.5611, + "step": 30351 + }, + { + "epoch": 0.18051194214482824, + "grad_norm": 1.933648943901062, + "learning_rate": 4.60868773219146e-05, + "loss": 3.5883, + "step": 30352 + }, + { + "epoch": 0.18051788942810923, + "grad_norm": 1.9025899171829224, + "learning_rate": 4.6086626407713615e-05, + "loss": 3.6201, + "step": 30353 + }, + { + "epoch": 0.18052383671139025, + "grad_norm": 1.9761525392532349, + "learning_rate": 4.608637548615151e-05, + "loss": 3.6038, + "step": 30354 + }, + { + "epoch": 0.18052978399467123, + "grad_norm": 2.008164644241333, + "learning_rate": 4.608612455722836e-05, + "loss": 3.6495, + "step": 30355 + }, + { + "epoch": 0.18053573127795222, + "grad_norm": 1.7661700248718262, + "learning_rate": 4.6085873620944266e-05, + "loss": 3.6006, + "step": 30356 + }, + { + "epoch": 0.18054167856123324, + "grad_norm": 1.872231364250183, + "learning_rate": 4.608562267729931e-05, + "loss": 3.6929, + "step": 30357 + }, + { + "epoch": 0.18054762584451423, + "grad_norm": 1.8716074228286743, + "learning_rate": 4.608537172629358e-05, + "loss": 3.4804, + "step": 30358 + }, + { + "epoch": 0.1805535731277952, + "grad_norm": 1.6453325748443604, + "learning_rate": 4.608512076792717e-05, + "loss": 4.3521, + "step": 30359 + }, + { + "epoch": 0.18055952041107623, + "grad_norm": 1.9353103637695312, + "learning_rate": 4.6084869802200156e-05, + "loss": 3.5408, + "step": 30360 + }, + { + "epoch": 0.18056546769435722, + "grad_norm": 1.854251503944397, + "learning_rate": 4.6084618829112636e-05, + "loss": 3.5502, + "step": 30361 + }, + { + "epoch": 0.1805714149776382, + "grad_norm": 1.8924806118011475, + "learning_rate": 4.608436784866469e-05, + "loss": 3.4984, + "step": 30362 + }, + { + "epoch": 0.18057736226091922, + "grad_norm": 1.876546859741211, + "learning_rate": 4.608411686085641e-05, + "loss": 3.5422, + "step": 30363 + }, + { + "epoch": 0.1805833095442002, + "grad_norm": 1.81404709815979, + "learning_rate": 4.608386586568788e-05, + "loss": 3.528, + "step": 30364 + }, + { + "epoch": 0.1805892568274812, + "grad_norm": 1.6718660593032837, + "learning_rate": 4.60836148631592e-05, + "loss": 4.2733, + "step": 30365 + }, + { + "epoch": 0.18059520411076221, + "grad_norm": 1.8086154460906982, + "learning_rate": 4.6083363853270436e-05, + "loss": 4.2946, + "step": 30366 + }, + { + "epoch": 0.1806011513940432, + "grad_norm": 1.661757230758667, + "learning_rate": 4.6083112836021694e-05, + "loss": 4.6596, + "step": 30367 + }, + { + "epoch": 0.1806070986773242, + "grad_norm": 1.8891844749450684, + "learning_rate": 4.6082861811413056e-05, + "loss": 4.173, + "step": 30368 + }, + { + "epoch": 0.1806130459606052, + "grad_norm": 2.1718995571136475, + "learning_rate": 4.60826107794446e-05, + "loss": 3.6396, + "step": 30369 + }, + { + "epoch": 0.1806189932438862, + "grad_norm": 1.6074626445770264, + "learning_rate": 4.608235974011643e-05, + "loss": 4.2138, + "step": 30370 + }, + { + "epoch": 0.18062494052716718, + "grad_norm": 2.053957223892212, + "learning_rate": 4.608210869342863e-05, + "loss": 3.6579, + "step": 30371 + }, + { + "epoch": 0.1806308878104482, + "grad_norm": 2.0929627418518066, + "learning_rate": 4.6081857639381274e-05, + "loss": 3.7675, + "step": 30372 + }, + { + "epoch": 0.1806368350937292, + "grad_norm": 1.8131572008132935, + "learning_rate": 4.608160657797447e-05, + "loss": 4.482, + "step": 30373 + }, + { + "epoch": 0.18064278237701017, + "grad_norm": 1.8105684518814087, + "learning_rate": 4.608135550920829e-05, + "loss": 4.2737, + "step": 30374 + }, + { + "epoch": 0.1806487296602912, + "grad_norm": 1.7839126586914062, + "learning_rate": 4.608110443308282e-05, + "loss": 4.1166, + "step": 30375 + }, + { + "epoch": 0.18065467694357218, + "grad_norm": 1.7233171463012695, + "learning_rate": 4.6080853349598164e-05, + "loss": 4.1941, + "step": 30376 + }, + { + "epoch": 0.18066062422685317, + "grad_norm": 2.1062052249908447, + "learning_rate": 4.608060225875439e-05, + "loss": 4.5294, + "step": 30377 + }, + { + "epoch": 0.18066657151013418, + "grad_norm": 1.744558572769165, + "learning_rate": 4.6080351160551605e-05, + "loss": 5.3094, + "step": 30378 + }, + { + "epoch": 0.18067251879341517, + "grad_norm": 1.5789061784744263, + "learning_rate": 4.608010005498988e-05, + "loss": 4.7674, + "step": 30379 + }, + { + "epoch": 0.18067846607669616, + "grad_norm": 2.0195188522338867, + "learning_rate": 4.6079848942069316e-05, + "loss": 4.5897, + "step": 30380 + }, + { + "epoch": 0.18068441335997718, + "grad_norm": 1.8995375633239746, + "learning_rate": 4.6079597821789993e-05, + "loss": 3.55, + "step": 30381 + }, + { + "epoch": 0.18069036064325816, + "grad_norm": 1.9370126724243164, + "learning_rate": 4.6079346694152e-05, + "loss": 3.681, + "step": 30382 + }, + { + "epoch": 0.18069630792653915, + "grad_norm": 1.6433509588241577, + "learning_rate": 4.607909555915542e-05, + "loss": 4.6211, + "step": 30383 + }, + { + "epoch": 0.18070225520982017, + "grad_norm": 1.9012796878814697, + "learning_rate": 4.607884441680035e-05, + "loss": 4.5669, + "step": 30384 + }, + { + "epoch": 0.18070820249310116, + "grad_norm": 1.8061003684997559, + "learning_rate": 4.607859326708687e-05, + "loss": 4.4649, + "step": 30385 + }, + { + "epoch": 0.18071414977638214, + "grad_norm": 1.7555569410324097, + "learning_rate": 4.607834211001508e-05, + "loss": 4.4836, + "step": 30386 + }, + { + "epoch": 0.18072009705966316, + "grad_norm": 1.9138058423995972, + "learning_rate": 4.607809094558505e-05, + "loss": 4.5009, + "step": 30387 + }, + { + "epoch": 0.18072604434294415, + "grad_norm": 2.0391855239868164, + "learning_rate": 4.6077839773796874e-05, + "loss": 4.4596, + "step": 30388 + }, + { + "epoch": 0.18073199162622514, + "grad_norm": 2.037545680999756, + "learning_rate": 4.607758859465065e-05, + "loss": 4.5291, + "step": 30389 + }, + { + "epoch": 0.18073793890950612, + "grad_norm": 2.7652394771575928, + "learning_rate": 4.607733740814645e-05, + "loss": 4.2606, + "step": 30390 + }, + { + "epoch": 0.18074388619278714, + "grad_norm": 2.835252285003662, + "learning_rate": 4.607708621428438e-05, + "loss": 4.2674, + "step": 30391 + }, + { + "epoch": 0.18074983347606813, + "grad_norm": 2.889340400695801, + "learning_rate": 4.607683501306451e-05, + "loss": 4.3982, + "step": 30392 + }, + { + "epoch": 0.18075578075934912, + "grad_norm": 1.8587162494659424, + "learning_rate": 4.607658380448693e-05, + "loss": 4.1949, + "step": 30393 + }, + { + "epoch": 0.18076172804263013, + "grad_norm": 2.183932304382324, + "learning_rate": 4.607633258855174e-05, + "loss": 4.4338, + "step": 30394 + }, + { + "epoch": 0.18076767532591112, + "grad_norm": 1.8604317903518677, + "learning_rate": 4.607608136525902e-05, + "loss": 5.2043, + "step": 30395 + }, + { + "epoch": 0.1807736226091921, + "grad_norm": 1.7363629341125488, + "learning_rate": 4.607583013460885e-05, + "loss": 5.1606, + "step": 30396 + }, + { + "epoch": 0.18077956989247312, + "grad_norm": 1.6214736700057983, + "learning_rate": 4.607557889660133e-05, + "loss": 5.2732, + "step": 30397 + }, + { + "epoch": 0.1807855171757541, + "grad_norm": 1.7445697784423828, + "learning_rate": 4.607532765123654e-05, + "loss": 4.7826, + "step": 30398 + }, + { + "epoch": 0.1807914644590351, + "grad_norm": 2.053269147872925, + "learning_rate": 4.607507639851458e-05, + "loss": 4.5327, + "step": 30399 + }, + { + "epoch": 0.18079741174231612, + "grad_norm": 1.63230299949646, + "learning_rate": 4.607482513843552e-05, + "loss": 4.989, + "step": 30400 + }, + { + "epoch": 0.1808033590255971, + "grad_norm": 1.586403489112854, + "learning_rate": 4.607457387099946e-05, + "loss": 5.3342, + "step": 30401 + }, + { + "epoch": 0.1808093063088781, + "grad_norm": 1.43230140209198, + "learning_rate": 4.607432259620648e-05, + "loss": 5.377, + "step": 30402 + }, + { + "epoch": 0.1808152535921591, + "grad_norm": 2.190584182739258, + "learning_rate": 4.6074071314056676e-05, + "loss": 4.9366, + "step": 30403 + }, + { + "epoch": 0.1808212008754401, + "grad_norm": 1.6194654703140259, + "learning_rate": 4.607382002455013e-05, + "loss": 5.2639, + "step": 30404 + }, + { + "epoch": 0.18082714815872108, + "grad_norm": 1.615243911743164, + "learning_rate": 4.607356872768693e-05, + "loss": 4.6323, + "step": 30405 + }, + { + "epoch": 0.1808330954420021, + "grad_norm": 1.5417380332946777, + "learning_rate": 4.607331742346717e-05, + "loss": 4.8193, + "step": 30406 + }, + { + "epoch": 0.1808390427252831, + "grad_norm": 1.5013401508331299, + "learning_rate": 4.607306611189093e-05, + "loss": 5.1733, + "step": 30407 + }, + { + "epoch": 0.18084499000856408, + "grad_norm": 1.2872532606124878, + "learning_rate": 4.60728147929583e-05, + "loss": 5.1983, + "step": 30408 + }, + { + "epoch": 0.1808509372918451, + "grad_norm": 1.4880503416061401, + "learning_rate": 4.607256346666936e-05, + "loss": 5.4417, + "step": 30409 + }, + { + "epoch": 0.18085688457512608, + "grad_norm": 1.2395708560943604, + "learning_rate": 4.607231213302422e-05, + "loss": 5.5189, + "step": 30410 + }, + { + "epoch": 0.18086283185840707, + "grad_norm": 1.7053332328796387, + "learning_rate": 4.607206079202294e-05, + "loss": 5.3116, + "step": 30411 + }, + { + "epoch": 0.18086877914168809, + "grad_norm": 1.5006909370422363, + "learning_rate": 4.607180944366563e-05, + "loss": 5.6907, + "step": 30412 + }, + { + "epoch": 0.18087472642496907, + "grad_norm": 1.489794373512268, + "learning_rate": 4.6071558087952364e-05, + "loss": 5.5739, + "step": 30413 + }, + { + "epoch": 0.18088067370825006, + "grad_norm": 1.5303220748901367, + "learning_rate": 4.607130672488324e-05, + "loss": 5.3727, + "step": 30414 + }, + { + "epoch": 0.18088662099153108, + "grad_norm": 2.531562566757202, + "learning_rate": 4.6071055354458335e-05, + "loss": 4.2266, + "step": 30415 + }, + { + "epoch": 0.18089256827481207, + "grad_norm": 1.5819337368011475, + "learning_rate": 4.6070803976677744e-05, + "loss": 5.01, + "step": 30416 + }, + { + "epoch": 0.18089851555809305, + "grad_norm": 1.4588855504989624, + "learning_rate": 4.607055259154156e-05, + "loss": 5.0615, + "step": 30417 + }, + { + "epoch": 0.18090446284137407, + "grad_norm": 1.7806695699691772, + "learning_rate": 4.607030119904986e-05, + "loss": 5.1481, + "step": 30418 + }, + { + "epoch": 0.18091041012465506, + "grad_norm": 1.37575364112854, + "learning_rate": 4.607004979920273e-05, + "loss": 5.0087, + "step": 30419 + }, + { + "epoch": 0.18091635740793605, + "grad_norm": 1.6504050493240356, + "learning_rate": 4.606979839200027e-05, + "loss": 5.0311, + "step": 30420 + }, + { + "epoch": 0.18092230469121706, + "grad_norm": 1.484144687652588, + "learning_rate": 4.6069546977442556e-05, + "loss": 5.3201, + "step": 30421 + }, + { + "epoch": 0.18092825197449805, + "grad_norm": 1.762091040611267, + "learning_rate": 4.606929555552968e-05, + "loss": 5.1807, + "step": 30422 + }, + { + "epoch": 0.18093419925777904, + "grad_norm": 1.8154287338256836, + "learning_rate": 4.606904412626174e-05, + "loss": 4.8606, + "step": 30423 + }, + { + "epoch": 0.18094014654106005, + "grad_norm": 1.7479325532913208, + "learning_rate": 4.606879268963881e-05, + "loss": 4.8235, + "step": 30424 + }, + { + "epoch": 0.18094609382434104, + "grad_norm": 1.44249427318573, + "learning_rate": 4.6068541245660974e-05, + "loss": 4.7681, + "step": 30425 + }, + { + "epoch": 0.18095204110762203, + "grad_norm": 1.3895748853683472, + "learning_rate": 4.606828979432833e-05, + "loss": 5.1613, + "step": 30426 + }, + { + "epoch": 0.18095798839090305, + "grad_norm": 1.5282186269760132, + "learning_rate": 4.606803833564097e-05, + "loss": 4.8431, + "step": 30427 + }, + { + "epoch": 0.18096393567418403, + "grad_norm": 2.7380192279815674, + "learning_rate": 4.606778686959897e-05, + "loss": 4.7426, + "step": 30428 + }, + { + "epoch": 0.18096988295746502, + "grad_norm": 2.365036725997925, + "learning_rate": 4.6067535396202434e-05, + "loss": 4.9487, + "step": 30429 + }, + { + "epoch": 0.18097583024074604, + "grad_norm": 1.7427470684051514, + "learning_rate": 4.606728391545143e-05, + "loss": 4.8868, + "step": 30430 + }, + { + "epoch": 0.18098177752402703, + "grad_norm": 1.6613335609436035, + "learning_rate": 4.606703242734606e-05, + "loss": 5.0911, + "step": 30431 + }, + { + "epoch": 0.18098772480730801, + "grad_norm": 1.71418297290802, + "learning_rate": 4.60667809318864e-05, + "loss": 4.9135, + "step": 30432 + }, + { + "epoch": 0.18099367209058903, + "grad_norm": 1.4050582647323608, + "learning_rate": 4.6066529429072545e-05, + "loss": 5.1009, + "step": 30433 + }, + { + "epoch": 0.18099961937387002, + "grad_norm": 1.2690151929855347, + "learning_rate": 4.606627791890458e-05, + "loss": 5.5164, + "step": 30434 + }, + { + "epoch": 0.181005566657151, + "grad_norm": 1.6794445514678955, + "learning_rate": 4.60660264013826e-05, + "loss": 5.159, + "step": 30435 + }, + { + "epoch": 0.18101151394043202, + "grad_norm": 1.845813512802124, + "learning_rate": 4.606577487650669e-05, + "loss": 4.6055, + "step": 30436 + }, + { + "epoch": 0.181017461223713, + "grad_norm": 1.6325689554214478, + "learning_rate": 4.6065523344276925e-05, + "loss": 4.7565, + "step": 30437 + }, + { + "epoch": 0.181023408506994, + "grad_norm": 1.64036226272583, + "learning_rate": 4.6065271804693424e-05, + "loss": 5.1105, + "step": 30438 + }, + { + "epoch": 0.18102935579027502, + "grad_norm": 1.5065094232559204, + "learning_rate": 4.6065020257756234e-05, + "loss": 4.7116, + "step": 30439 + }, + { + "epoch": 0.181035303073556, + "grad_norm": 1.8012547492980957, + "learning_rate": 4.6064768703465476e-05, + "loss": 4.3993, + "step": 30440 + }, + { + "epoch": 0.181041250356837, + "grad_norm": 1.5189584493637085, + "learning_rate": 4.606451714182122e-05, + "loss": 4.6601, + "step": 30441 + }, + { + "epoch": 0.181047197640118, + "grad_norm": 1.7323181629180908, + "learning_rate": 4.606426557282356e-05, + "loss": 5.5846, + "step": 30442 + }, + { + "epoch": 0.181053144923399, + "grad_norm": 1.5709025859832764, + "learning_rate": 4.606401399647258e-05, + "loss": 5.1547, + "step": 30443 + }, + { + "epoch": 0.18105909220667998, + "grad_norm": 1.6060830354690552, + "learning_rate": 4.6063762412768365e-05, + "loss": 4.867, + "step": 30444 + }, + { + "epoch": 0.181065039489961, + "grad_norm": 1.4921566247940063, + "learning_rate": 4.606351082171102e-05, + "loss": 5.0458, + "step": 30445 + }, + { + "epoch": 0.181070986773242, + "grad_norm": 1.9008151292800903, + "learning_rate": 4.606325922330062e-05, + "loss": 4.9894, + "step": 30446 + }, + { + "epoch": 0.18107693405652298, + "grad_norm": 2.0366036891937256, + "learning_rate": 4.606300761753724e-05, + "loss": 4.6917, + "step": 30447 + }, + { + "epoch": 0.18108288133980396, + "grad_norm": 1.8549975156784058, + "learning_rate": 4.606275600442099e-05, + "loss": 4.9342, + "step": 30448 + }, + { + "epoch": 0.18108882862308498, + "grad_norm": 1.7794413566589355, + "learning_rate": 4.606250438395196e-05, + "loss": 4.6526, + "step": 30449 + }, + { + "epoch": 0.18109477590636597, + "grad_norm": 1.7541767358779907, + "learning_rate": 4.606225275613021e-05, + "loss": 4.7991, + "step": 30450 + }, + { + "epoch": 0.18110072318964696, + "grad_norm": 2.040306329727173, + "learning_rate": 4.6062001120955854e-05, + "loss": 4.1135, + "step": 30451 + }, + { + "epoch": 0.18110667047292797, + "grad_norm": 2.444293737411499, + "learning_rate": 4.606174947842897e-05, + "loss": 3.4574, + "step": 30452 + }, + { + "epoch": 0.18111261775620896, + "grad_norm": 2.5346062183380127, + "learning_rate": 4.606149782854964e-05, + "loss": 3.2278, + "step": 30453 + }, + { + "epoch": 0.18111856503948995, + "grad_norm": 2.1727371215820312, + "learning_rate": 4.6061246171317975e-05, + "loss": 3.6005, + "step": 30454 + }, + { + "epoch": 0.18112451232277096, + "grad_norm": 1.6244183778762817, + "learning_rate": 4.6060994506734034e-05, + "loss": 4.8594, + "step": 30455 + }, + { + "epoch": 0.18113045960605195, + "grad_norm": 1.6611864566802979, + "learning_rate": 4.606074283479792e-05, + "loss": 4.6606, + "step": 30456 + }, + { + "epoch": 0.18113640688933294, + "grad_norm": 1.9803105592727661, + "learning_rate": 4.606049115550972e-05, + "loss": 4.7584, + "step": 30457 + }, + { + "epoch": 0.18114235417261396, + "grad_norm": 2.047974109649658, + "learning_rate": 4.6060239468869514e-05, + "loss": 4.0147, + "step": 30458 + }, + { + "epoch": 0.18114830145589494, + "grad_norm": 2.57551908493042, + "learning_rate": 4.60599877748774e-05, + "loss": 3.3263, + "step": 30459 + }, + { + "epoch": 0.18115424873917593, + "grad_norm": 2.1633079051971436, + "learning_rate": 4.6059736073533465e-05, + "loss": 3.2757, + "step": 30460 + }, + { + "epoch": 0.18116019602245695, + "grad_norm": 3.3115196228027344, + "learning_rate": 4.605948436483779e-05, + "loss": 3.2996, + "step": 30461 + }, + { + "epoch": 0.18116614330573794, + "grad_norm": 2.717261791229248, + "learning_rate": 4.6059232648790465e-05, + "loss": 3.1929, + "step": 30462 + }, + { + "epoch": 0.18117209058901892, + "grad_norm": 2.1867258548736572, + "learning_rate": 4.6058980925391585e-05, + "loss": 3.3655, + "step": 30463 + }, + { + "epoch": 0.18117803787229994, + "grad_norm": 2.306809186935425, + "learning_rate": 4.6058729194641225e-05, + "loss": 2.9844, + "step": 30464 + }, + { + "epoch": 0.18118398515558093, + "grad_norm": 2.939728260040283, + "learning_rate": 4.6058477456539486e-05, + "loss": 3.158, + "step": 30465 + }, + { + "epoch": 0.18118993243886192, + "grad_norm": 2.996995687484741, + "learning_rate": 4.605822571108646e-05, + "loss": 3.141, + "step": 30466 + }, + { + "epoch": 0.18119587972214293, + "grad_norm": 2.5442357063293457, + "learning_rate": 4.6057973958282205e-05, + "loss": 3.2498, + "step": 30467 + }, + { + "epoch": 0.18120182700542392, + "grad_norm": 2.3496897220611572, + "learning_rate": 4.605772219812684e-05, + "loss": 3.537, + "step": 30468 + }, + { + "epoch": 0.1812077742887049, + "grad_norm": 1.6112096309661865, + "learning_rate": 4.605747043062044e-05, + "loss": 4.8052, + "step": 30469 + }, + { + "epoch": 0.18121372157198593, + "grad_norm": 2.8755533695220947, + "learning_rate": 4.605721865576309e-05, + "loss": 3.8164, + "step": 30470 + }, + { + "epoch": 0.1812196688552669, + "grad_norm": 2.406846046447754, + "learning_rate": 4.605696687355489e-05, + "loss": 3.4058, + "step": 30471 + }, + { + "epoch": 0.1812256161385479, + "grad_norm": 3.146632671356201, + "learning_rate": 4.605671508399592e-05, + "loss": 3.5037, + "step": 30472 + }, + { + "epoch": 0.18123156342182892, + "grad_norm": 2.710477828979492, + "learning_rate": 4.605646328708626e-05, + "loss": 3.1083, + "step": 30473 + }, + { + "epoch": 0.1812375107051099, + "grad_norm": 2.2567665576934814, + "learning_rate": 4.6056211482826e-05, + "loss": 3.2056, + "step": 30474 + }, + { + "epoch": 0.1812434579883909, + "grad_norm": 2.9403610229492188, + "learning_rate": 4.6055959671215256e-05, + "loss": 3.5021, + "step": 30475 + }, + { + "epoch": 0.1812494052716719, + "grad_norm": 2.386746406555176, + "learning_rate": 4.6055707852254085e-05, + "loss": 3.3324, + "step": 30476 + }, + { + "epoch": 0.1812553525549529, + "grad_norm": 1.872837781906128, + "learning_rate": 4.605545602594258e-05, + "loss": 4.0415, + "step": 30477 + }, + { + "epoch": 0.18126129983823389, + "grad_norm": 2.302643060684204, + "learning_rate": 4.605520419228084e-05, + "loss": 3.5739, + "step": 30478 + }, + { + "epoch": 0.1812672471215149, + "grad_norm": 1.8837559223175049, + "learning_rate": 4.6054952351268935e-05, + "loss": 3.8909, + "step": 30479 + }, + { + "epoch": 0.1812731944047959, + "grad_norm": 1.8574949502944946, + "learning_rate": 4.605470050290697e-05, + "loss": 4.6073, + "step": 30480 + }, + { + "epoch": 0.18127914168807688, + "grad_norm": 3.745434522628784, + "learning_rate": 4.605444864719503e-05, + "loss": 4.9296, + "step": 30481 + }, + { + "epoch": 0.1812850889713579, + "grad_norm": 2.209376573562622, + "learning_rate": 4.6054196784133195e-05, + "loss": 5.1083, + "step": 30482 + }, + { + "epoch": 0.18129103625463888, + "grad_norm": 1.746163249015808, + "learning_rate": 4.6053944913721555e-05, + "loss": 4.2706, + "step": 30483 + }, + { + "epoch": 0.18129698353791987, + "grad_norm": 2.2691433429718018, + "learning_rate": 4.6053693035960204e-05, + "loss": 3.8251, + "step": 30484 + }, + { + "epoch": 0.18130293082120089, + "grad_norm": 1.9895451068878174, + "learning_rate": 4.605344115084923e-05, + "loss": 3.8413, + "step": 30485 + }, + { + "epoch": 0.18130887810448187, + "grad_norm": 2.2342569828033447, + "learning_rate": 4.6053189258388706e-05, + "loss": 4.4328, + "step": 30486 + }, + { + "epoch": 0.18131482538776286, + "grad_norm": 1.7602850198745728, + "learning_rate": 4.605293735857874e-05, + "loss": 4.779, + "step": 30487 + }, + { + "epoch": 0.18132077267104388, + "grad_norm": 1.689023494720459, + "learning_rate": 4.6052685451419405e-05, + "loss": 4.7603, + "step": 30488 + }, + { + "epoch": 0.18132671995432487, + "grad_norm": 1.6477890014648438, + "learning_rate": 4.6052433536910804e-05, + "loss": 4.9194, + "step": 30489 + }, + { + "epoch": 0.18133266723760585, + "grad_norm": 1.879791021347046, + "learning_rate": 4.605218161505301e-05, + "loss": 4.9174, + "step": 30490 + }, + { + "epoch": 0.18133861452088687, + "grad_norm": 2.530984878540039, + "learning_rate": 4.605192968584612e-05, + "loss": 3.6623, + "step": 30491 + }, + { + "epoch": 0.18134456180416786, + "grad_norm": 2.555924415588379, + "learning_rate": 4.605167774929022e-05, + "loss": 3.5684, + "step": 30492 + }, + { + "epoch": 0.18135050908744885, + "grad_norm": 2.00748872756958, + "learning_rate": 4.6051425805385394e-05, + "loss": 4.3182, + "step": 30493 + }, + { + "epoch": 0.18135645637072986, + "grad_norm": 1.7455837726593018, + "learning_rate": 4.605117385413174e-05, + "loss": 5.2199, + "step": 30494 + }, + { + "epoch": 0.18136240365401085, + "grad_norm": 1.7002990245819092, + "learning_rate": 4.605092189552932e-05, + "loss": 4.8912, + "step": 30495 + }, + { + "epoch": 0.18136835093729184, + "grad_norm": 1.830411434173584, + "learning_rate": 4.605066992957825e-05, + "loss": 4.5212, + "step": 30496 + }, + { + "epoch": 0.18137429822057285, + "grad_norm": 1.7505379915237427, + "learning_rate": 4.605041795627861e-05, + "loss": 4.5038, + "step": 30497 + }, + { + "epoch": 0.18138024550385384, + "grad_norm": 1.3816022872924805, + "learning_rate": 4.605016597563049e-05, + "loss": 5.1461, + "step": 30498 + }, + { + "epoch": 0.18138619278713483, + "grad_norm": 1.1977434158325195, + "learning_rate": 4.6049913987633976e-05, + "loss": 5.2844, + "step": 30499 + }, + { + "epoch": 0.18139214007041585, + "grad_norm": 1.4711052179336548, + "learning_rate": 4.604966199228915e-05, + "loss": 5.1301, + "step": 30500 + }, + { + "epoch": 0.18139808735369684, + "grad_norm": 1.316135048866272, + "learning_rate": 4.6049409989596105e-05, + "loss": 5.2839, + "step": 30501 + }, + { + "epoch": 0.18140403463697782, + "grad_norm": 1.491049885749817, + "learning_rate": 4.6049157979554926e-05, + "loss": 5.3503, + "step": 30502 + }, + { + "epoch": 0.18140998192025884, + "grad_norm": 1.5653736591339111, + "learning_rate": 4.60489059621657e-05, + "loss": 5.2905, + "step": 30503 + }, + { + "epoch": 0.18141592920353983, + "grad_norm": 1.5193443298339844, + "learning_rate": 4.6048653937428523e-05, + "loss": 5.4668, + "step": 30504 + }, + { + "epoch": 0.18142187648682082, + "grad_norm": 1.5355736017227173, + "learning_rate": 4.604840190534349e-05, + "loss": 5.2235, + "step": 30505 + }, + { + "epoch": 0.1814278237701018, + "grad_norm": 1.6808356046676636, + "learning_rate": 4.604814986591066e-05, + "loss": 5.1193, + "step": 30506 + }, + { + "epoch": 0.18143377105338282, + "grad_norm": 1.5504355430603027, + "learning_rate": 4.6047897819130146e-05, + "loss": 5.4469, + "step": 30507 + }, + { + "epoch": 0.1814397183366638, + "grad_norm": 1.394782304763794, + "learning_rate": 4.604764576500202e-05, + "loss": 5.1401, + "step": 30508 + }, + { + "epoch": 0.1814456656199448, + "grad_norm": 1.9043993949890137, + "learning_rate": 4.6047393703526386e-05, + "loss": 4.1807, + "step": 30509 + }, + { + "epoch": 0.1814516129032258, + "grad_norm": 1.5536892414093018, + "learning_rate": 4.604714163470333e-05, + "loss": 4.8233, + "step": 30510 + }, + { + "epoch": 0.1814575601865068, + "grad_norm": 1.5314890146255493, + "learning_rate": 4.604688955853293e-05, + "loss": 4.8918, + "step": 30511 + }, + { + "epoch": 0.1814635074697878, + "grad_norm": 1.5154199600219727, + "learning_rate": 4.604663747501527e-05, + "loss": 5.2876, + "step": 30512 + }, + { + "epoch": 0.1814694547530688, + "grad_norm": 1.3783801794052124, + "learning_rate": 4.604638538415046e-05, + "loss": 5.437, + "step": 30513 + }, + { + "epoch": 0.1814754020363498, + "grad_norm": 1.850745677947998, + "learning_rate": 4.6046133285938567e-05, + "loss": 4.9178, + "step": 30514 + }, + { + "epoch": 0.18148134931963078, + "grad_norm": 1.5241893529891968, + "learning_rate": 4.604588118037968e-05, + "loss": 5.1389, + "step": 30515 + }, + { + "epoch": 0.1814872966029118, + "grad_norm": 1.4288957118988037, + "learning_rate": 4.60456290674739e-05, + "loss": 5.0731, + "step": 30516 + }, + { + "epoch": 0.18149324388619278, + "grad_norm": 1.7770181894302368, + "learning_rate": 4.604537694722131e-05, + "loss": 4.9011, + "step": 30517 + }, + { + "epoch": 0.18149919116947377, + "grad_norm": 1.6263269186019897, + "learning_rate": 4.6045124819621995e-05, + "loss": 4.5729, + "step": 30518 + }, + { + "epoch": 0.1815051384527548, + "grad_norm": 1.7641338109970093, + "learning_rate": 4.6044872684676044e-05, + "loss": 4.9114, + "step": 30519 + }, + { + "epoch": 0.18151108573603578, + "grad_norm": 2.29036283493042, + "learning_rate": 4.6044620542383546e-05, + "loss": 3.1598, + "step": 30520 + }, + { + "epoch": 0.18151703301931676, + "grad_norm": 3.0936734676361084, + "learning_rate": 4.604436839274459e-05, + "loss": 2.9494, + "step": 30521 + }, + { + "epoch": 0.18152298030259778, + "grad_norm": 2.300161838531494, + "learning_rate": 4.604411623575925e-05, + "loss": 3.103, + "step": 30522 + }, + { + "epoch": 0.18152892758587877, + "grad_norm": 2.440436601638794, + "learning_rate": 4.604386407142764e-05, + "loss": 2.7361, + "step": 30523 + }, + { + "epoch": 0.18153487486915976, + "grad_norm": 2.3842546939849854, + "learning_rate": 4.604361189974983e-05, + "loss": 3.3269, + "step": 30524 + }, + { + "epoch": 0.18154082215244077, + "grad_norm": 2.316323757171631, + "learning_rate": 4.6043359720725916e-05, + "loss": 2.4623, + "step": 30525 + }, + { + "epoch": 0.18154676943572176, + "grad_norm": 2.311478853225708, + "learning_rate": 4.604310753435598e-05, + "loss": 3.3375, + "step": 30526 + }, + { + "epoch": 0.18155271671900275, + "grad_norm": 2.571591854095459, + "learning_rate": 4.604285534064011e-05, + "loss": 3.1362, + "step": 30527 + }, + { + "epoch": 0.18155866400228377, + "grad_norm": 2.753108263015747, + "learning_rate": 4.60426031395784e-05, + "loss": 2.7714, + "step": 30528 + }, + { + "epoch": 0.18156461128556475, + "grad_norm": 2.680237054824829, + "learning_rate": 4.604235093117093e-05, + "loss": 2.7293, + "step": 30529 + }, + { + "epoch": 0.18157055856884574, + "grad_norm": 2.6374194622039795, + "learning_rate": 4.6042098715417795e-05, + "loss": 3.2162, + "step": 30530 + }, + { + "epoch": 0.18157650585212676, + "grad_norm": 2.288968563079834, + "learning_rate": 4.6041846492319086e-05, + "loss": 2.9725, + "step": 30531 + }, + { + "epoch": 0.18158245313540775, + "grad_norm": 2.3108694553375244, + "learning_rate": 4.604159426187488e-05, + "loss": 3.4549, + "step": 30532 + }, + { + "epoch": 0.18158840041868873, + "grad_norm": 2.3923144340515137, + "learning_rate": 4.604134202408528e-05, + "loss": 3.4278, + "step": 30533 + }, + { + "epoch": 0.18159434770196975, + "grad_norm": 2.669036626815796, + "learning_rate": 4.6041089778950355e-05, + "loss": 3.7157, + "step": 30534 + }, + { + "epoch": 0.18160029498525074, + "grad_norm": 2.038989782333374, + "learning_rate": 4.60408375264702e-05, + "loss": 4.6171, + "step": 30535 + }, + { + "epoch": 0.18160624226853173, + "grad_norm": 1.9777814149856567, + "learning_rate": 4.604058526664491e-05, + "loss": 5.1166, + "step": 30536 + }, + { + "epoch": 0.18161218955181274, + "grad_norm": 2.14339280128479, + "learning_rate": 4.604033299947457e-05, + "loss": 4.0872, + "step": 30537 + }, + { + "epoch": 0.18161813683509373, + "grad_norm": 2.5352818965911865, + "learning_rate": 4.604008072495927e-05, + "loss": 3.5657, + "step": 30538 + }, + { + "epoch": 0.18162408411837472, + "grad_norm": 2.4932284355163574, + "learning_rate": 4.603982844309909e-05, + "loss": 3.4923, + "step": 30539 + }, + { + "epoch": 0.18163003140165573, + "grad_norm": 2.817173719406128, + "learning_rate": 4.603957615389413e-05, + "loss": 3.612, + "step": 30540 + }, + { + "epoch": 0.18163597868493672, + "grad_norm": 2.3959133625030518, + "learning_rate": 4.603932385734446e-05, + "loss": 3.4037, + "step": 30541 + }, + { + "epoch": 0.1816419259682177, + "grad_norm": 2.288473129272461, + "learning_rate": 4.6039071553450194e-05, + "loss": 3.1999, + "step": 30542 + }, + { + "epoch": 0.18164787325149873, + "grad_norm": 2.2291407585144043, + "learning_rate": 4.60388192422114e-05, + "loss": 3.2781, + "step": 30543 + }, + { + "epoch": 0.18165382053477971, + "grad_norm": 2.4226462841033936, + "learning_rate": 4.603856692362817e-05, + "loss": 3.3288, + "step": 30544 + }, + { + "epoch": 0.1816597678180607, + "grad_norm": 2.264042377471924, + "learning_rate": 4.6038314597700594e-05, + "loss": 3.4528, + "step": 30545 + }, + { + "epoch": 0.18166571510134172, + "grad_norm": 2.625178813934326, + "learning_rate": 4.6038062264428756e-05, + "loss": 3.2663, + "step": 30546 + }, + { + "epoch": 0.1816716623846227, + "grad_norm": 2.498853921890259, + "learning_rate": 4.603780992381275e-05, + "loss": 3.3232, + "step": 30547 + }, + { + "epoch": 0.1816776096679037, + "grad_norm": 2.1288323402404785, + "learning_rate": 4.603755757585266e-05, + "loss": 3.3592, + "step": 30548 + }, + { + "epoch": 0.1816835569511847, + "grad_norm": 2.363189697265625, + "learning_rate": 4.603730522054858e-05, + "loss": 3.3607, + "step": 30549 + }, + { + "epoch": 0.1816895042344657, + "grad_norm": 2.465437889099121, + "learning_rate": 4.60370528579006e-05, + "loss": 3.2047, + "step": 30550 + }, + { + "epoch": 0.1816954515177467, + "grad_norm": 2.6008546352386475, + "learning_rate": 4.603680048790879e-05, + "loss": 3.3468, + "step": 30551 + }, + { + "epoch": 0.1817013988010277, + "grad_norm": 2.6666195392608643, + "learning_rate": 4.603654811057325e-05, + "loss": 3.7408, + "step": 30552 + }, + { + "epoch": 0.1817073460843087, + "grad_norm": 2.3587095737457275, + "learning_rate": 4.603629572589408e-05, + "loss": 3.5229, + "step": 30553 + }, + { + "epoch": 0.18171329336758968, + "grad_norm": 2.3080029487609863, + "learning_rate": 4.603604333387135e-05, + "loss": 3.0769, + "step": 30554 + }, + { + "epoch": 0.1817192406508707, + "grad_norm": 2.7178757190704346, + "learning_rate": 4.603579093450515e-05, + "loss": 3.2384, + "step": 30555 + }, + { + "epoch": 0.18172518793415168, + "grad_norm": 2.6380956172943115, + "learning_rate": 4.603553852779559e-05, + "loss": 3.0647, + "step": 30556 + }, + { + "epoch": 0.18173113521743267, + "grad_norm": 2.6807405948638916, + "learning_rate": 4.603528611374272e-05, + "loss": 3.5373, + "step": 30557 + }, + { + "epoch": 0.1817370825007137, + "grad_norm": 2.2781288623809814, + "learning_rate": 4.603503369234666e-05, + "loss": 3.18, + "step": 30558 + }, + { + "epoch": 0.18174302978399468, + "grad_norm": 2.6194839477539062, + "learning_rate": 4.6034781263607485e-05, + "loss": 3.2369, + "step": 30559 + }, + { + "epoch": 0.18174897706727566, + "grad_norm": 2.236381769180298, + "learning_rate": 4.603452882752528e-05, + "loss": 3.6477, + "step": 30560 + }, + { + "epoch": 0.18175492435055668, + "grad_norm": 3.2307355403900146, + "learning_rate": 4.603427638410014e-05, + "loss": 2.8661, + "step": 30561 + }, + { + "epoch": 0.18176087163383767, + "grad_norm": 3.1829538345336914, + "learning_rate": 4.603402393333216e-05, + "loss": 3.5902, + "step": 30562 + }, + { + "epoch": 0.18176681891711866, + "grad_norm": 3.2353084087371826, + "learning_rate": 4.603377147522141e-05, + "loss": 3.4499, + "step": 30563 + }, + { + "epoch": 0.18177276620039964, + "grad_norm": 2.7337300777435303, + "learning_rate": 4.6033519009767995e-05, + "loss": 3.0508, + "step": 30564 + }, + { + "epoch": 0.18177871348368066, + "grad_norm": 2.4610583782196045, + "learning_rate": 4.603326653697199e-05, + "loss": 3.372, + "step": 30565 + }, + { + "epoch": 0.18178466076696165, + "grad_norm": 1.5927339792251587, + "learning_rate": 4.603301405683349e-05, + "loss": 5.1742, + "step": 30566 + }, + { + "epoch": 0.18179060805024264, + "grad_norm": 2.8343615531921387, + "learning_rate": 4.6032761569352587e-05, + "loss": 2.8788, + "step": 30567 + }, + { + "epoch": 0.18179655533352365, + "grad_norm": 2.8158621788024902, + "learning_rate": 4.603250907452936e-05, + "loss": 2.9255, + "step": 30568 + }, + { + "epoch": 0.18180250261680464, + "grad_norm": 2.777045488357544, + "learning_rate": 4.60322565723639e-05, + "loss": 3.2428, + "step": 30569 + }, + { + "epoch": 0.18180844990008563, + "grad_norm": 2.668269157409668, + "learning_rate": 4.60320040628563e-05, + "loss": 3.0804, + "step": 30570 + }, + { + "epoch": 0.18181439718336664, + "grad_norm": 2.453457832336426, + "learning_rate": 4.603175154600664e-05, + "loss": 3.0223, + "step": 30571 + }, + { + "epoch": 0.18182034446664763, + "grad_norm": 1.9281212091445923, + "learning_rate": 4.6031499021815014e-05, + "loss": 3.4469, + "step": 30572 + }, + { + "epoch": 0.18182629174992862, + "grad_norm": 1.8291780948638916, + "learning_rate": 4.603124649028152e-05, + "loss": 4.7501, + "step": 30573 + }, + { + "epoch": 0.18183223903320964, + "grad_norm": 1.518445372581482, + "learning_rate": 4.603099395140622e-05, + "loss": 4.9895, + "step": 30574 + }, + { + "epoch": 0.18183818631649062, + "grad_norm": 1.562727928161621, + "learning_rate": 4.603074140518923e-05, + "loss": 5.0106, + "step": 30575 + }, + { + "epoch": 0.1818441335997716, + "grad_norm": 2.01888370513916, + "learning_rate": 4.6030488851630615e-05, + "loss": 4.8952, + "step": 30576 + }, + { + "epoch": 0.18185008088305263, + "grad_norm": 1.2194279432296753, + "learning_rate": 4.6030236290730476e-05, + "loss": 5.1772, + "step": 30577 + }, + { + "epoch": 0.18185602816633362, + "grad_norm": 2.1817402839660645, + "learning_rate": 4.60299837224889e-05, + "loss": 3.7245, + "step": 30578 + }, + { + "epoch": 0.1818619754496146, + "grad_norm": 1.5736979246139526, + "learning_rate": 4.6029731146905975e-05, + "loss": 5.4276, + "step": 30579 + }, + { + "epoch": 0.18186792273289562, + "grad_norm": 1.9954670667648315, + "learning_rate": 4.602947856398179e-05, + "loss": 4.3673, + "step": 30580 + }, + { + "epoch": 0.1818738700161766, + "grad_norm": 1.5366657972335815, + "learning_rate": 4.6029225973716426e-05, + "loss": 4.8274, + "step": 30581 + }, + { + "epoch": 0.1818798172994576, + "grad_norm": 1.5931968688964844, + "learning_rate": 4.602897337610998e-05, + "loss": 5.2114, + "step": 30582 + }, + { + "epoch": 0.1818857645827386, + "grad_norm": 1.6159030199050903, + "learning_rate": 4.6028720771162536e-05, + "loss": 4.5858, + "step": 30583 + }, + { + "epoch": 0.1818917118660196, + "grad_norm": 1.531935214996338, + "learning_rate": 4.602846815887418e-05, + "loss": 5.4222, + "step": 30584 + }, + { + "epoch": 0.1818976591493006, + "grad_norm": 1.7498992681503296, + "learning_rate": 4.6028215539245015e-05, + "loss": 5.406, + "step": 30585 + }, + { + "epoch": 0.1819036064325816, + "grad_norm": 1.5374906063079834, + "learning_rate": 4.60279629122751e-05, + "loss": 5.2197, + "step": 30586 + }, + { + "epoch": 0.1819095537158626, + "grad_norm": 1.4167890548706055, + "learning_rate": 4.6027710277964555e-05, + "loss": 5.5045, + "step": 30587 + }, + { + "epoch": 0.18191550099914358, + "grad_norm": 1.7180233001708984, + "learning_rate": 4.6027457636313446e-05, + "loss": 5.1006, + "step": 30588 + }, + { + "epoch": 0.1819214482824246, + "grad_norm": 1.6115717887878418, + "learning_rate": 4.602720498732187e-05, + "loss": 4.985, + "step": 30589 + }, + { + "epoch": 0.18192739556570559, + "grad_norm": 2.0676872730255127, + "learning_rate": 4.602695233098991e-05, + "loss": 4.838, + "step": 30590 + }, + { + "epoch": 0.18193334284898657, + "grad_norm": 1.924194574356079, + "learning_rate": 4.6026699667317663e-05, + "loss": 4.8063, + "step": 30591 + }, + { + "epoch": 0.1819392901322676, + "grad_norm": 2.717851400375366, + "learning_rate": 4.602644699630521e-05, + "loss": 4.4838, + "step": 30592 + }, + { + "epoch": 0.18194523741554858, + "grad_norm": 1.5828056335449219, + "learning_rate": 4.602619431795264e-05, + "loss": 5.6776, + "step": 30593 + }, + { + "epoch": 0.18195118469882957, + "grad_norm": 2.8755221366882324, + "learning_rate": 4.602594163226005e-05, + "loss": 5.0374, + "step": 30594 + }, + { + "epoch": 0.18195713198211058, + "grad_norm": 1.8692079782485962, + "learning_rate": 4.602568893922752e-05, + "loss": 4.5605, + "step": 30595 + }, + { + "epoch": 0.18196307926539157, + "grad_norm": 1.3632681369781494, + "learning_rate": 4.602543623885513e-05, + "loss": 5.179, + "step": 30596 + }, + { + "epoch": 0.18196902654867256, + "grad_norm": 1.5239547491073608, + "learning_rate": 4.602518353114298e-05, + "loss": 5.0395, + "step": 30597 + }, + { + "epoch": 0.18197497383195357, + "grad_norm": 1.5662137269973755, + "learning_rate": 4.602493081609116e-05, + "loss": 5.0049, + "step": 30598 + }, + { + "epoch": 0.18198092111523456, + "grad_norm": 1.5579825639724731, + "learning_rate": 4.602467809369976e-05, + "loss": 5.1001, + "step": 30599 + }, + { + "epoch": 0.18198686839851555, + "grad_norm": 1.6686931848526, + "learning_rate": 4.6024425363968846e-05, + "loss": 5.1319, + "step": 30600 + }, + { + "epoch": 0.18199281568179657, + "grad_norm": 1.5801063776016235, + "learning_rate": 4.602417262689853e-05, + "loss": 4.971, + "step": 30601 + }, + { + "epoch": 0.18199876296507755, + "grad_norm": 1.508872389793396, + "learning_rate": 4.6023919882488896e-05, + "loss": 4.9048, + "step": 30602 + }, + { + "epoch": 0.18200471024835854, + "grad_norm": 1.772307276725769, + "learning_rate": 4.602366713074003e-05, + "loss": 4.811, + "step": 30603 + }, + { + "epoch": 0.18201065753163956, + "grad_norm": 1.7669419050216675, + "learning_rate": 4.602341437165202e-05, + "loss": 5.1116, + "step": 30604 + }, + { + "epoch": 0.18201660481492055, + "grad_norm": 1.6603509187698364, + "learning_rate": 4.602316160522494e-05, + "loss": 5.2131, + "step": 30605 + }, + { + "epoch": 0.18202255209820153, + "grad_norm": 1.71107816696167, + "learning_rate": 4.60229088314589e-05, + "loss": 4.8201, + "step": 30606 + }, + { + "epoch": 0.18202849938148255, + "grad_norm": 1.6192432641983032, + "learning_rate": 4.602265605035398e-05, + "loss": 5.3336, + "step": 30607 + }, + { + "epoch": 0.18203444666476354, + "grad_norm": 1.3941278457641602, + "learning_rate": 4.602240326191027e-05, + "loss": 5.2017, + "step": 30608 + }, + { + "epoch": 0.18204039394804453, + "grad_norm": 1.7096537351608276, + "learning_rate": 4.602215046612785e-05, + "loss": 4.6411, + "step": 30609 + }, + { + "epoch": 0.18204634123132554, + "grad_norm": 1.819649338722229, + "learning_rate": 4.6021897663006826e-05, + "loss": 4.5691, + "step": 30610 + }, + { + "epoch": 0.18205228851460653, + "grad_norm": 1.976924180984497, + "learning_rate": 4.602164485254726e-05, + "loss": 4.6638, + "step": 30611 + }, + { + "epoch": 0.18205823579788752, + "grad_norm": 1.6236119270324707, + "learning_rate": 4.602139203474927e-05, + "loss": 4.2149, + "step": 30612 + }, + { + "epoch": 0.18206418308116853, + "grad_norm": 1.688239336013794, + "learning_rate": 4.602113920961292e-05, + "loss": 4.2815, + "step": 30613 + }, + { + "epoch": 0.18207013036444952, + "grad_norm": 1.854436993598938, + "learning_rate": 4.60208863771383e-05, + "loss": 4.3709, + "step": 30614 + }, + { + "epoch": 0.1820760776477305, + "grad_norm": 1.7107741832733154, + "learning_rate": 4.6020633537325516e-05, + "loss": 4.6272, + "step": 30615 + }, + { + "epoch": 0.18208202493101153, + "grad_norm": 1.785346508026123, + "learning_rate": 4.6020380690174645e-05, + "loss": 5.2134, + "step": 30616 + }, + { + "epoch": 0.18208797221429252, + "grad_norm": 1.5961878299713135, + "learning_rate": 4.602012783568578e-05, + "loss": 4.8977, + "step": 30617 + }, + { + "epoch": 0.1820939194975735, + "grad_norm": 1.711595892906189, + "learning_rate": 4.6019874973859e-05, + "loss": 3.9303, + "step": 30618 + }, + { + "epoch": 0.18209986678085452, + "grad_norm": 1.432024598121643, + "learning_rate": 4.6019622104694406e-05, + "loss": 4.1765, + "step": 30619 + }, + { + "epoch": 0.1821058140641355, + "grad_norm": 1.624489188194275, + "learning_rate": 4.601936922819207e-05, + "loss": 4.8633, + "step": 30620 + }, + { + "epoch": 0.1821117613474165, + "grad_norm": 1.4783191680908203, + "learning_rate": 4.6019116344352095e-05, + "loss": 4.8363, + "step": 30621 + }, + { + "epoch": 0.1821177086306975, + "grad_norm": 1.564587950706482, + "learning_rate": 4.601886345317456e-05, + "loss": 4.8749, + "step": 30622 + }, + { + "epoch": 0.1821236559139785, + "grad_norm": 1.7457023859024048, + "learning_rate": 4.601861055465956e-05, + "loss": 4.724, + "step": 30623 + }, + { + "epoch": 0.1821296031972595, + "grad_norm": 1.6358530521392822, + "learning_rate": 4.6018357648807175e-05, + "loss": 4.8055, + "step": 30624 + }, + { + "epoch": 0.18213555048054048, + "grad_norm": 1.67806875705719, + "learning_rate": 4.601810473561751e-05, + "loss": 4.7913, + "step": 30625 + }, + { + "epoch": 0.1821414977638215, + "grad_norm": 1.7741279602050781, + "learning_rate": 4.601785181509063e-05, + "loss": 4.056, + "step": 30626 + }, + { + "epoch": 0.18214744504710248, + "grad_norm": 1.6061371564865112, + "learning_rate": 4.601759888722663e-05, + "loss": 4.0635, + "step": 30627 + }, + { + "epoch": 0.18215339233038347, + "grad_norm": 1.329079508781433, + "learning_rate": 4.6017345952025616e-05, + "loss": 5.2524, + "step": 30628 + }, + { + "epoch": 0.18215933961366448, + "grad_norm": 1.6871402263641357, + "learning_rate": 4.601709300948767e-05, + "loss": 4.5095, + "step": 30629 + }, + { + "epoch": 0.18216528689694547, + "grad_norm": 1.7423584461212158, + "learning_rate": 4.6016840059612856e-05, + "loss": 4.2299, + "step": 30630 + }, + { + "epoch": 0.18217123418022646, + "grad_norm": 1.7102059125900269, + "learning_rate": 4.601658710240129e-05, + "loss": 4.3972, + "step": 30631 + }, + { + "epoch": 0.18217718146350748, + "grad_norm": 1.7572731971740723, + "learning_rate": 4.601633413785305e-05, + "loss": 5.0521, + "step": 30632 + }, + { + "epoch": 0.18218312874678846, + "grad_norm": 1.632642388343811, + "learning_rate": 4.6016081165968215e-05, + "loss": 4.9921, + "step": 30633 + }, + { + "epoch": 0.18218907603006945, + "grad_norm": 1.7840354442596436, + "learning_rate": 4.6015828186746896e-05, + "loss": 4.9606, + "step": 30634 + }, + { + "epoch": 0.18219502331335047, + "grad_norm": 1.7963460683822632, + "learning_rate": 4.601557520018917e-05, + "loss": 4.633, + "step": 30635 + }, + { + "epoch": 0.18220097059663146, + "grad_norm": 1.5674350261688232, + "learning_rate": 4.601532220629511e-05, + "loss": 5.1909, + "step": 30636 + }, + { + "epoch": 0.18220691787991244, + "grad_norm": 1.7398990392684937, + "learning_rate": 4.6015069205064835e-05, + "loss": 5.0319, + "step": 30637 + }, + { + "epoch": 0.18221286516319346, + "grad_norm": 1.474489450454712, + "learning_rate": 4.60148161964984e-05, + "loss": 5.0893, + "step": 30638 + }, + { + "epoch": 0.18221881244647445, + "grad_norm": 1.4791532754898071, + "learning_rate": 4.601456318059592e-05, + "loss": 4.6663, + "step": 30639 + }, + { + "epoch": 0.18222475972975544, + "grad_norm": 1.7666285037994385, + "learning_rate": 4.601431015735747e-05, + "loss": 4.297, + "step": 30640 + }, + { + "epoch": 0.18223070701303645, + "grad_norm": 1.7343413829803467, + "learning_rate": 4.601405712678314e-05, + "loss": 4.6023, + "step": 30641 + }, + { + "epoch": 0.18223665429631744, + "grad_norm": 1.87008798122406, + "learning_rate": 4.601380408887302e-05, + "loss": 5.0135, + "step": 30642 + }, + { + "epoch": 0.18224260157959843, + "grad_norm": 1.5589100122451782, + "learning_rate": 4.60135510436272e-05, + "loss": 4.973, + "step": 30643 + }, + { + "epoch": 0.18224854886287944, + "grad_norm": 1.6267797946929932, + "learning_rate": 4.601329799104577e-05, + "loss": 4.9089, + "step": 30644 + }, + { + "epoch": 0.18225449614616043, + "grad_norm": 1.3924577236175537, + "learning_rate": 4.601304493112881e-05, + "loss": 4.8534, + "step": 30645 + }, + { + "epoch": 0.18226044342944142, + "grad_norm": 1.6482142210006714, + "learning_rate": 4.601279186387642e-05, + "loss": 4.8919, + "step": 30646 + }, + { + "epoch": 0.18226639071272244, + "grad_norm": 1.5615832805633545, + "learning_rate": 4.6012538789288676e-05, + "loss": 4.9114, + "step": 30647 + }, + { + "epoch": 0.18227233799600343, + "grad_norm": 1.4806512594223022, + "learning_rate": 4.601228570736566e-05, + "loss": 4.8957, + "step": 30648 + }, + { + "epoch": 0.1822782852792844, + "grad_norm": 1.3537266254425049, + "learning_rate": 4.6012032618107494e-05, + "loss": 4.8277, + "step": 30649 + }, + { + "epoch": 0.18228423256256543, + "grad_norm": 1.5608755350112915, + "learning_rate": 4.601177952151423e-05, + "loss": 4.9707, + "step": 30650 + }, + { + "epoch": 0.18229017984584642, + "grad_norm": 1.6153634786605835, + "learning_rate": 4.601152641758597e-05, + "loss": 5.1415, + "step": 30651 + }, + { + "epoch": 0.1822961271291274, + "grad_norm": 1.7191613912582397, + "learning_rate": 4.601127330632281e-05, + "loss": 5.1045, + "step": 30652 + }, + { + "epoch": 0.18230207441240842, + "grad_norm": 1.668485164642334, + "learning_rate": 4.601102018772483e-05, + "loss": 5.1807, + "step": 30653 + }, + { + "epoch": 0.1823080216956894, + "grad_norm": 1.7589253187179565, + "learning_rate": 4.601076706179212e-05, + "loss": 4.9829, + "step": 30654 + }, + { + "epoch": 0.1823139689789704, + "grad_norm": 1.5183218717575073, + "learning_rate": 4.6010513928524766e-05, + "loss": 4.9434, + "step": 30655 + }, + { + "epoch": 0.1823199162622514, + "grad_norm": 1.5674960613250732, + "learning_rate": 4.601026078792287e-05, + "loss": 4.8959, + "step": 30656 + }, + { + "epoch": 0.1823258635455324, + "grad_norm": 1.8212403059005737, + "learning_rate": 4.60100076399865e-05, + "loss": 4.2204, + "step": 30657 + }, + { + "epoch": 0.1823318108288134, + "grad_norm": 1.7452092170715332, + "learning_rate": 4.600975448471575e-05, + "loss": 4.7962, + "step": 30658 + }, + { + "epoch": 0.1823377581120944, + "grad_norm": 1.5074201822280884, + "learning_rate": 4.6009501322110716e-05, + "loss": 5.202, + "step": 30659 + }, + { + "epoch": 0.1823437053953754, + "grad_norm": 1.5057684183120728, + "learning_rate": 4.600924815217147e-05, + "loss": 5.3574, + "step": 30660 + }, + { + "epoch": 0.18234965267865638, + "grad_norm": 1.5492216348648071, + "learning_rate": 4.600899497489813e-05, + "loss": 5.2738, + "step": 30661 + }, + { + "epoch": 0.1823555999619374, + "grad_norm": 1.65701425075531, + "learning_rate": 4.600874179029076e-05, + "loss": 5.2794, + "step": 30662 + }, + { + "epoch": 0.18236154724521839, + "grad_norm": 2.055145740509033, + "learning_rate": 4.600848859834945e-05, + "loss": 5.1107, + "step": 30663 + }, + { + "epoch": 0.18236749452849937, + "grad_norm": 2.13283634185791, + "learning_rate": 4.600823539907431e-05, + "loss": 4.5388, + "step": 30664 + }, + { + "epoch": 0.1823734418117804, + "grad_norm": 1.6410765647888184, + "learning_rate": 4.6007982192465396e-05, + "loss": 5.5528, + "step": 30665 + }, + { + "epoch": 0.18237938909506138, + "grad_norm": 1.7154231071472168, + "learning_rate": 4.600772897852282e-05, + "loss": 5.2923, + "step": 30666 + }, + { + "epoch": 0.18238533637834237, + "grad_norm": 1.7217670679092407, + "learning_rate": 4.6007475757246656e-05, + "loss": 5.0955, + "step": 30667 + }, + { + "epoch": 0.18239128366162338, + "grad_norm": 1.7309542894363403, + "learning_rate": 4.6007222528637005e-05, + "loss": 5.4714, + "step": 30668 + }, + { + "epoch": 0.18239723094490437, + "grad_norm": 2.1107077598571777, + "learning_rate": 4.6006969292693946e-05, + "loss": 4.1897, + "step": 30669 + }, + { + "epoch": 0.18240317822818536, + "grad_norm": 1.6427122354507446, + "learning_rate": 4.6006716049417574e-05, + "loss": 5.4529, + "step": 30670 + }, + { + "epoch": 0.18240912551146637, + "grad_norm": 1.698148488998413, + "learning_rate": 4.600646279880798e-05, + "loss": 4.844, + "step": 30671 + }, + { + "epoch": 0.18241507279474736, + "grad_norm": 2.373337984085083, + "learning_rate": 4.6006209540865236e-05, + "loss": 4.3026, + "step": 30672 + }, + { + "epoch": 0.18242102007802835, + "grad_norm": 2.3324615955352783, + "learning_rate": 4.6005956275589446e-05, + "loss": 4.1663, + "step": 30673 + }, + { + "epoch": 0.18242696736130937, + "grad_norm": 2.296529531478882, + "learning_rate": 4.6005703002980696e-05, + "loss": 4.3019, + "step": 30674 + }, + { + "epoch": 0.18243291464459035, + "grad_norm": 2.13657808303833, + "learning_rate": 4.6005449723039066e-05, + "loss": 3.9219, + "step": 30675 + }, + { + "epoch": 0.18243886192787134, + "grad_norm": 1.3434631824493408, + "learning_rate": 4.600519643576466e-05, + "loss": 5.5071, + "step": 30676 + }, + { + "epoch": 0.18244480921115236, + "grad_norm": 1.3897916078567505, + "learning_rate": 4.6004943141157554e-05, + "loss": 5.8751, + "step": 30677 + }, + { + "epoch": 0.18245075649443335, + "grad_norm": 1.660503625869751, + "learning_rate": 4.600468983921783e-05, + "loss": 5.3946, + "step": 30678 + }, + { + "epoch": 0.18245670377771434, + "grad_norm": 1.4913995265960693, + "learning_rate": 4.6004436529945596e-05, + "loss": 5.3409, + "step": 30679 + }, + { + "epoch": 0.18246265106099535, + "grad_norm": 1.590925693511963, + "learning_rate": 4.6004183213340924e-05, + "loss": 5.2041, + "step": 30680 + }, + { + "epoch": 0.18246859834427634, + "grad_norm": 1.5279881954193115, + "learning_rate": 4.6003929889403915e-05, + "loss": 4.8881, + "step": 30681 + }, + { + "epoch": 0.18247454562755733, + "grad_norm": 1.558207392692566, + "learning_rate": 4.600367655813464e-05, + "loss": 5.0951, + "step": 30682 + }, + { + "epoch": 0.18248049291083832, + "grad_norm": 1.4454327821731567, + "learning_rate": 4.600342321953322e-05, + "loss": 5.1351, + "step": 30683 + }, + { + "epoch": 0.18248644019411933, + "grad_norm": 1.6854497194290161, + "learning_rate": 4.60031698735997e-05, + "loss": 5.1165, + "step": 30684 + }, + { + "epoch": 0.18249238747740032, + "grad_norm": 2.277977466583252, + "learning_rate": 4.600291652033419e-05, + "loss": 5.2921, + "step": 30685 + }, + { + "epoch": 0.1824983347606813, + "grad_norm": 1.7434666156768799, + "learning_rate": 4.600266315973679e-05, + "loss": 5.1459, + "step": 30686 + }, + { + "epoch": 0.18250428204396232, + "grad_norm": 1.8711892366409302, + "learning_rate": 4.6002409791807575e-05, + "loss": 5.0815, + "step": 30687 + }, + { + "epoch": 0.1825102293272433, + "grad_norm": 1.7337292432785034, + "learning_rate": 4.600215641654664e-05, + "loss": 5.5504, + "step": 30688 + }, + { + "epoch": 0.1825161766105243, + "grad_norm": 1.599770188331604, + "learning_rate": 4.600190303395407e-05, + "loss": 5.4996, + "step": 30689 + }, + { + "epoch": 0.18252212389380532, + "grad_norm": 1.6465502977371216, + "learning_rate": 4.6001649644029945e-05, + "loss": 4.6497, + "step": 30690 + }, + { + "epoch": 0.1825280711770863, + "grad_norm": 1.3220854997634888, + "learning_rate": 4.600139624677436e-05, + "loss": 5.5304, + "step": 30691 + }, + { + "epoch": 0.1825340184603673, + "grad_norm": 1.4749271869659424, + "learning_rate": 4.600114284218741e-05, + "loss": 5.3472, + "step": 30692 + }, + { + "epoch": 0.1825399657436483, + "grad_norm": 1.3068197965621948, + "learning_rate": 4.6000889430269175e-05, + "loss": 5.4445, + "step": 30693 + }, + { + "epoch": 0.1825459130269293, + "grad_norm": 1.3629871606826782, + "learning_rate": 4.600063601101974e-05, + "loss": 5.3633, + "step": 30694 + }, + { + "epoch": 0.18255186031021028, + "grad_norm": 1.565169334411621, + "learning_rate": 4.600038258443921e-05, + "loss": 5.3485, + "step": 30695 + }, + { + "epoch": 0.1825578075934913, + "grad_norm": 1.4095406532287598, + "learning_rate": 4.600012915052766e-05, + "loss": 5.3252, + "step": 30696 + }, + { + "epoch": 0.1825637548767723, + "grad_norm": 1.3348292112350464, + "learning_rate": 4.599987570928518e-05, + "loss": 5.2613, + "step": 30697 + }, + { + "epoch": 0.18256970216005328, + "grad_norm": 1.436431646347046, + "learning_rate": 4.599962226071187e-05, + "loss": 5.2325, + "step": 30698 + }, + { + "epoch": 0.1825756494433343, + "grad_norm": 1.4782795906066895, + "learning_rate": 4.59993688048078e-05, + "loss": 5.2674, + "step": 30699 + }, + { + "epoch": 0.18258159672661528, + "grad_norm": 1.8673319816589355, + "learning_rate": 4.599911534157306e-05, + "loss": 4.9126, + "step": 30700 + }, + { + "epoch": 0.18258754400989627, + "grad_norm": 1.695785403251648, + "learning_rate": 4.599886187100776e-05, + "loss": 5.2728, + "step": 30701 + }, + { + "epoch": 0.18259349129317728, + "grad_norm": 1.6430630683898926, + "learning_rate": 4.599860839311197e-05, + "loss": 5.1289, + "step": 30702 + }, + { + "epoch": 0.18259943857645827, + "grad_norm": 1.4497203826904297, + "learning_rate": 4.599835490788578e-05, + "loss": 5.1118, + "step": 30703 + }, + { + "epoch": 0.18260538585973926, + "grad_norm": 1.6501460075378418, + "learning_rate": 4.599810141532929e-05, + "loss": 5.3177, + "step": 30704 + }, + { + "epoch": 0.18261133314302028, + "grad_norm": 1.6418206691741943, + "learning_rate": 4.5997847915442564e-05, + "loss": 5.3195, + "step": 30705 + }, + { + "epoch": 0.18261728042630127, + "grad_norm": 2.704659938812256, + "learning_rate": 4.599759440822572e-05, + "loss": 4.5861, + "step": 30706 + }, + { + "epoch": 0.18262322770958225, + "grad_norm": 2.0303256511688232, + "learning_rate": 4.5997340893678825e-05, + "loss": 4.7766, + "step": 30707 + }, + { + "epoch": 0.18262917499286327, + "grad_norm": 1.7311177253723145, + "learning_rate": 4.599708737180198e-05, + "loss": 5.2158, + "step": 30708 + }, + { + "epoch": 0.18263512227614426, + "grad_norm": 1.7286479473114014, + "learning_rate": 4.5996833842595264e-05, + "loss": 5.4676, + "step": 30709 + }, + { + "epoch": 0.18264106955942525, + "grad_norm": 1.5010279417037964, + "learning_rate": 4.599658030605877e-05, + "loss": 5.3087, + "step": 30710 + }, + { + "epoch": 0.18264701684270626, + "grad_norm": 1.9836444854736328, + "learning_rate": 4.5996326762192585e-05, + "loss": 5.1255, + "step": 30711 + }, + { + "epoch": 0.18265296412598725, + "grad_norm": 1.8559132814407349, + "learning_rate": 4.599607321099681e-05, + "loss": 5.5439, + "step": 30712 + }, + { + "epoch": 0.18265891140926824, + "grad_norm": 2.781334400177002, + "learning_rate": 4.5995819652471515e-05, + "loss": 4.236, + "step": 30713 + }, + { + "epoch": 0.18266485869254925, + "grad_norm": 1.7039425373077393, + "learning_rate": 4.59955660866168e-05, + "loss": 5.2539, + "step": 30714 + }, + { + "epoch": 0.18267080597583024, + "grad_norm": 1.7565476894378662, + "learning_rate": 4.5995312513432744e-05, + "loss": 5.3456, + "step": 30715 + }, + { + "epoch": 0.18267675325911123, + "grad_norm": 1.8682184219360352, + "learning_rate": 4.599505893291945e-05, + "loss": 5.1608, + "step": 30716 + }, + { + "epoch": 0.18268270054239225, + "grad_norm": 1.6879570484161377, + "learning_rate": 4.599480534507699e-05, + "loss": 5.2518, + "step": 30717 + }, + { + "epoch": 0.18268864782567323, + "grad_norm": 1.6643418073654175, + "learning_rate": 4.599455174990546e-05, + "loss": 5.2634, + "step": 30718 + }, + { + "epoch": 0.18269459510895422, + "grad_norm": 1.6813920736312866, + "learning_rate": 4.599429814740496e-05, + "loss": 5.1519, + "step": 30719 + }, + { + "epoch": 0.18270054239223524, + "grad_norm": 1.4897735118865967, + "learning_rate": 4.599404453757555e-05, + "loss": 5.0287, + "step": 30720 + }, + { + "epoch": 0.18270648967551623, + "grad_norm": 1.6526249647140503, + "learning_rate": 4.599379092041735e-05, + "loss": 5.2445, + "step": 30721 + }, + { + "epoch": 0.18271243695879721, + "grad_norm": 1.635257363319397, + "learning_rate": 4.599353729593043e-05, + "loss": 4.9676, + "step": 30722 + }, + { + "epoch": 0.18271838424207823, + "grad_norm": 2.6660733222961426, + "learning_rate": 4.599328366411488e-05, + "loss": 3.8513, + "step": 30723 + }, + { + "epoch": 0.18272433152535922, + "grad_norm": 2.4976534843444824, + "learning_rate": 4.5993030024970796e-05, + "loss": 3.8516, + "step": 30724 + }, + { + "epoch": 0.1827302788086402, + "grad_norm": 1.936405062675476, + "learning_rate": 4.599277637849826e-05, + "loss": 4.5016, + "step": 30725 + }, + { + "epoch": 0.18273622609192122, + "grad_norm": 1.5889533758163452, + "learning_rate": 4.5992522724697365e-05, + "loss": 5.095, + "step": 30726 + }, + { + "epoch": 0.1827421733752022, + "grad_norm": 1.8294072151184082, + "learning_rate": 4.5992269063568195e-05, + "loss": 5.1108, + "step": 30727 + }, + { + "epoch": 0.1827481206584832, + "grad_norm": 1.8671683073043823, + "learning_rate": 4.5992015395110835e-05, + "loss": 4.9816, + "step": 30728 + }, + { + "epoch": 0.18275406794176421, + "grad_norm": 1.6619024276733398, + "learning_rate": 4.5991761719325386e-05, + "loss": 5.0146, + "step": 30729 + }, + { + "epoch": 0.1827600152250452, + "grad_norm": 1.7837094068527222, + "learning_rate": 4.5991508036211936e-05, + "loss": 5.2135, + "step": 30730 + }, + { + "epoch": 0.1827659625083262, + "grad_norm": 2.0837316513061523, + "learning_rate": 4.5991254345770554e-05, + "loss": 5.4597, + "step": 30731 + }, + { + "epoch": 0.1827719097916072, + "grad_norm": 1.580824851989746, + "learning_rate": 4.5991000648001354e-05, + "loss": 5.0718, + "step": 30732 + }, + { + "epoch": 0.1827778570748882, + "grad_norm": 1.8121145963668823, + "learning_rate": 4.5990746942904404e-05, + "loss": 4.8452, + "step": 30733 + }, + { + "epoch": 0.18278380435816918, + "grad_norm": 1.6485167741775513, + "learning_rate": 4.59904932304798e-05, + "loss": 5.696, + "step": 30734 + }, + { + "epoch": 0.1827897516414502, + "grad_norm": 1.5621600151062012, + "learning_rate": 4.599023951072764e-05, + "loss": 5.5879, + "step": 30735 + }, + { + "epoch": 0.1827956989247312, + "grad_norm": 1.4888461828231812, + "learning_rate": 4.5989985783648006e-05, + "loss": 5.6557, + "step": 30736 + }, + { + "epoch": 0.18280164620801218, + "grad_norm": 1.422515869140625, + "learning_rate": 4.598973204924097e-05, + "loss": 5.3813, + "step": 30737 + }, + { + "epoch": 0.1828075934912932, + "grad_norm": 2.142186403274536, + "learning_rate": 4.598947830750665e-05, + "loss": 4.405, + "step": 30738 + }, + { + "epoch": 0.18281354077457418, + "grad_norm": 1.8209202289581299, + "learning_rate": 4.598922455844511e-05, + "loss": 4.4518, + "step": 30739 + }, + { + "epoch": 0.18281948805785517, + "grad_norm": 1.6258145570755005, + "learning_rate": 4.5988970802056454e-05, + "loss": 5.0344, + "step": 30740 + }, + { + "epoch": 0.18282543534113616, + "grad_norm": 1.7348908185958862, + "learning_rate": 4.5988717038340766e-05, + "loss": 4.9899, + "step": 30741 + }, + { + "epoch": 0.18283138262441717, + "grad_norm": 1.7576826810836792, + "learning_rate": 4.5988463267298134e-05, + "loss": 5.0383, + "step": 30742 + }, + { + "epoch": 0.18283732990769816, + "grad_norm": 1.9962698221206665, + "learning_rate": 4.598820948892864e-05, + "loss": 5.2593, + "step": 30743 + }, + { + "epoch": 0.18284327719097915, + "grad_norm": 2.0773308277130127, + "learning_rate": 4.5987955703232385e-05, + "loss": 5.3262, + "step": 30744 + }, + { + "epoch": 0.18284922447426016, + "grad_norm": 1.3248738050460815, + "learning_rate": 4.5987701910209445e-05, + "loss": 5.1114, + "step": 30745 + }, + { + "epoch": 0.18285517175754115, + "grad_norm": 1.578334927558899, + "learning_rate": 4.598744810985992e-05, + "loss": 4.863, + "step": 30746 + }, + { + "epoch": 0.18286111904082214, + "grad_norm": 1.6194567680358887, + "learning_rate": 4.59871943021839e-05, + "loss": 5.6935, + "step": 30747 + }, + { + "epoch": 0.18286706632410316, + "grad_norm": 1.4933133125305176, + "learning_rate": 4.5986940487181457e-05, + "loss": 5.7171, + "step": 30748 + }, + { + "epoch": 0.18287301360738414, + "grad_norm": 1.9716706275939941, + "learning_rate": 4.5986686664852694e-05, + "loss": 4.8064, + "step": 30749 + }, + { + "epoch": 0.18287896089066513, + "grad_norm": 1.518475890159607, + "learning_rate": 4.598643283519769e-05, + "loss": 4.675, + "step": 30750 + }, + { + "epoch": 0.18288490817394615, + "grad_norm": 1.9535077810287476, + "learning_rate": 4.598617899821655e-05, + "loss": 4.7127, + "step": 30751 + }, + { + "epoch": 0.18289085545722714, + "grad_norm": 1.7789416313171387, + "learning_rate": 4.598592515390934e-05, + "loss": 4.9423, + "step": 30752 + }, + { + "epoch": 0.18289680274050812, + "grad_norm": 1.5687211751937866, + "learning_rate": 4.5985671302276165e-05, + "loss": 5.1696, + "step": 30753 + }, + { + "epoch": 0.18290275002378914, + "grad_norm": 1.5808442831039429, + "learning_rate": 4.598541744331711e-05, + "loss": 4.9582, + "step": 30754 + }, + { + "epoch": 0.18290869730707013, + "grad_norm": 1.7823104858398438, + "learning_rate": 4.5985163577032264e-05, + "loss": 4.8837, + "step": 30755 + }, + { + "epoch": 0.18291464459035112, + "grad_norm": 1.5199090242385864, + "learning_rate": 4.598490970342172e-05, + "loss": 5.6375, + "step": 30756 + }, + { + "epoch": 0.18292059187363213, + "grad_norm": 1.3824554681777954, + "learning_rate": 4.598465582248555e-05, + "loss": 5.2014, + "step": 30757 + }, + { + "epoch": 0.18292653915691312, + "grad_norm": 1.5041509866714478, + "learning_rate": 4.598440193422386e-05, + "loss": 5.7263, + "step": 30758 + }, + { + "epoch": 0.1829324864401941, + "grad_norm": 1.5195960998535156, + "learning_rate": 4.598414803863673e-05, + "loss": 5.672, + "step": 30759 + }, + { + "epoch": 0.18293843372347512, + "grad_norm": 1.3529129028320312, + "learning_rate": 4.5983894135724245e-05, + "loss": 5.4614, + "step": 30760 + }, + { + "epoch": 0.1829443810067561, + "grad_norm": 1.6261144876480103, + "learning_rate": 4.59836402254865e-05, + "loss": 5.2684, + "step": 30761 + }, + { + "epoch": 0.1829503282900371, + "grad_norm": 1.7360271215438843, + "learning_rate": 4.5983386307923594e-05, + "loss": 5.2797, + "step": 30762 + }, + { + "epoch": 0.18295627557331812, + "grad_norm": 1.720116376876831, + "learning_rate": 4.59831323830356e-05, + "loss": 5.0761, + "step": 30763 + }, + { + "epoch": 0.1829622228565991, + "grad_norm": 1.8357081413269043, + "learning_rate": 4.598287845082261e-05, + "loss": 5.0053, + "step": 30764 + }, + { + "epoch": 0.1829681701398801, + "grad_norm": 2.3169431686401367, + "learning_rate": 4.598262451128471e-05, + "loss": 4.3751, + "step": 30765 + }, + { + "epoch": 0.1829741174231611, + "grad_norm": 2.5170505046844482, + "learning_rate": 4.5982370564422e-05, + "loss": 4.6384, + "step": 30766 + }, + { + "epoch": 0.1829800647064421, + "grad_norm": 1.6446155309677124, + "learning_rate": 4.598211661023455e-05, + "loss": 5.4014, + "step": 30767 + }, + { + "epoch": 0.18298601198972309, + "grad_norm": 1.8167470693588257, + "learning_rate": 4.598186264872247e-05, + "loss": 5.2924, + "step": 30768 + }, + { + "epoch": 0.1829919592730041, + "grad_norm": 1.7999087572097778, + "learning_rate": 4.598160867988584e-05, + "loss": 5.1272, + "step": 30769 + }, + { + "epoch": 0.1829979065562851, + "grad_norm": 1.9385194778442383, + "learning_rate": 4.598135470372473e-05, + "loss": 5.2449, + "step": 30770 + }, + { + "epoch": 0.18300385383956608, + "grad_norm": 1.6628237962722778, + "learning_rate": 4.598110072023927e-05, + "loss": 5.1795, + "step": 30771 + }, + { + "epoch": 0.1830098011228471, + "grad_norm": 1.775499701499939, + "learning_rate": 4.598084672942951e-05, + "loss": 4.2424, + "step": 30772 + }, + { + "epoch": 0.18301574840612808, + "grad_norm": 1.7905422449111938, + "learning_rate": 4.5980592731295554e-05, + "loss": 4.1706, + "step": 30773 + }, + { + "epoch": 0.18302169568940907, + "grad_norm": 2.221928834915161, + "learning_rate": 4.598033872583749e-05, + "loss": 4.5438, + "step": 30774 + }, + { + "epoch": 0.18302764297269009, + "grad_norm": 1.609844446182251, + "learning_rate": 4.59800847130554e-05, + "loss": 4.8598, + "step": 30775 + }, + { + "epoch": 0.18303359025597107, + "grad_norm": 1.642585277557373, + "learning_rate": 4.59798306929494e-05, + "loss": 5.3954, + "step": 30776 + }, + { + "epoch": 0.18303953753925206, + "grad_norm": 1.5810272693634033, + "learning_rate": 4.5979576665519543e-05, + "loss": 5.6698, + "step": 30777 + }, + { + "epoch": 0.18304548482253308, + "grad_norm": 1.5354760885238647, + "learning_rate": 4.597932263076593e-05, + "loss": 5.193, + "step": 30778 + }, + { + "epoch": 0.18305143210581407, + "grad_norm": 1.9338527917861938, + "learning_rate": 4.597906858868866e-05, + "loss": 4.9313, + "step": 30779 + }, + { + "epoch": 0.18305737938909505, + "grad_norm": 1.46987783908844, + "learning_rate": 4.5978814539287804e-05, + "loss": 5.164, + "step": 30780 + }, + { + "epoch": 0.18306332667237607, + "grad_norm": 1.794464349746704, + "learning_rate": 4.597856048256348e-05, + "loss": 5.2455, + "step": 30781 + }, + { + "epoch": 0.18306927395565706, + "grad_norm": 2.2967662811279297, + "learning_rate": 4.5978306418515736e-05, + "loss": 3.8119, + "step": 30782 + }, + { + "epoch": 0.18307522123893805, + "grad_norm": 3.0278241634368896, + "learning_rate": 4.59780523471447e-05, + "loss": 2.8503, + "step": 30783 + }, + { + "epoch": 0.18308116852221906, + "grad_norm": 2.2508223056793213, + "learning_rate": 4.597779826845043e-05, + "loss": 3.6492, + "step": 30784 + }, + { + "epoch": 0.18308711580550005, + "grad_norm": 1.6087052822113037, + "learning_rate": 4.597754418243303e-05, + "loss": 5.4596, + "step": 30785 + }, + { + "epoch": 0.18309306308878104, + "grad_norm": 1.630355954170227, + "learning_rate": 4.597729008909258e-05, + "loss": 5.2701, + "step": 30786 + }, + { + "epoch": 0.18309901037206205, + "grad_norm": 2.1090071201324463, + "learning_rate": 4.597703598842919e-05, + "loss": 4.9463, + "step": 30787 + }, + { + "epoch": 0.18310495765534304, + "grad_norm": 1.7964558601379395, + "learning_rate": 4.597678188044292e-05, + "loss": 5.1681, + "step": 30788 + }, + { + "epoch": 0.18311090493862403, + "grad_norm": 1.802701473236084, + "learning_rate": 4.5976527765133884e-05, + "loss": 5.6211, + "step": 30789 + }, + { + "epoch": 0.18311685222190505, + "grad_norm": 2.110750675201416, + "learning_rate": 4.5976273642502146e-05, + "loss": 5.3476, + "step": 30790 + }, + { + "epoch": 0.18312279950518603, + "grad_norm": 1.558624029159546, + "learning_rate": 4.5976019512547816e-05, + "loss": 5.9624, + "step": 30791 + }, + { + "epoch": 0.18312874678846702, + "grad_norm": 2.025865316390991, + "learning_rate": 4.597576537527097e-05, + "loss": 4.7095, + "step": 30792 + }, + { + "epoch": 0.18313469407174804, + "grad_norm": 1.986502766609192, + "learning_rate": 4.59755112306717e-05, + "loss": 5.0208, + "step": 30793 + }, + { + "epoch": 0.18314064135502903, + "grad_norm": 2.3034214973449707, + "learning_rate": 4.59752570787501e-05, + "loss": 5.3155, + "step": 30794 + }, + { + "epoch": 0.18314658863831002, + "grad_norm": 2.048161029815674, + "learning_rate": 4.597500291950626e-05, + "loss": 5.3074, + "step": 30795 + }, + { + "epoch": 0.18315253592159103, + "grad_norm": 1.9678623676300049, + "learning_rate": 4.5974748752940255e-05, + "loss": 5.6205, + "step": 30796 + }, + { + "epoch": 0.18315848320487202, + "grad_norm": 1.9089009761810303, + "learning_rate": 4.597449457905218e-05, + "loss": 5.1992, + "step": 30797 + }, + { + "epoch": 0.183164430488153, + "grad_norm": 1.6243164539337158, + "learning_rate": 4.5974240397842126e-05, + "loss": 5.8933, + "step": 30798 + }, + { + "epoch": 0.183170377771434, + "grad_norm": 1.676802396774292, + "learning_rate": 4.597398620931019e-05, + "loss": 5.2581, + "step": 30799 + }, + { + "epoch": 0.183176325054715, + "grad_norm": 1.8412030935287476, + "learning_rate": 4.5973732013456444e-05, + "loss": 5.3714, + "step": 30800 + }, + { + "epoch": 0.183182272337996, + "grad_norm": 1.725168228149414, + "learning_rate": 4.597347781028099e-05, + "loss": 5.365, + "step": 30801 + }, + { + "epoch": 0.183188219621277, + "grad_norm": 1.681129813194275, + "learning_rate": 4.5973223599783906e-05, + "loss": 5.6418, + "step": 30802 + }, + { + "epoch": 0.183194166904558, + "grad_norm": 2.0006189346313477, + "learning_rate": 4.597296938196529e-05, + "loss": 4.1994, + "step": 30803 + }, + { + "epoch": 0.183200114187839, + "grad_norm": 1.8607888221740723, + "learning_rate": 4.5972715156825225e-05, + "loss": 5.2584, + "step": 30804 + }, + { + "epoch": 0.18320606147111998, + "grad_norm": 1.9822429418563843, + "learning_rate": 4.59724609243638e-05, + "loss": 5.282, + "step": 30805 + }, + { + "epoch": 0.183212008754401, + "grad_norm": 1.6500173807144165, + "learning_rate": 4.597220668458111e-05, + "loss": 5.1869, + "step": 30806 + }, + { + "epoch": 0.18321795603768198, + "grad_norm": 1.6790781021118164, + "learning_rate": 4.597195243747724e-05, + "loss": 5.3556, + "step": 30807 + }, + { + "epoch": 0.18322390332096297, + "grad_norm": 2.0036866664886475, + "learning_rate": 4.597169818305228e-05, + "loss": 5.1406, + "step": 30808 + }, + { + "epoch": 0.183229850604244, + "grad_norm": 1.4782299995422363, + "learning_rate": 4.5971443921306315e-05, + "loss": 5.3893, + "step": 30809 + }, + { + "epoch": 0.18323579788752498, + "grad_norm": 2.174090623855591, + "learning_rate": 4.597118965223942e-05, + "loss": 5.0501, + "step": 30810 + }, + { + "epoch": 0.18324174517080596, + "grad_norm": 2.0031697750091553, + "learning_rate": 4.597093537585172e-05, + "loss": 5.1521, + "step": 30811 + }, + { + "epoch": 0.18324769245408698, + "grad_norm": 2.0772757530212402, + "learning_rate": 4.597068109214328e-05, + "loss": 4.1726, + "step": 30812 + }, + { + "epoch": 0.18325363973736797, + "grad_norm": 2.2878589630126953, + "learning_rate": 4.597042680111418e-05, + "loss": 4.0209, + "step": 30813 + }, + { + "epoch": 0.18325958702064896, + "grad_norm": 1.8026955127716064, + "learning_rate": 4.597017250276453e-05, + "loss": 5.3708, + "step": 30814 + }, + { + "epoch": 0.18326553430392997, + "grad_norm": 2.1650643348693848, + "learning_rate": 4.596991819709441e-05, + "loss": 3.9999, + "step": 30815 + }, + { + "epoch": 0.18327148158721096, + "grad_norm": 2.541799306869507, + "learning_rate": 4.59696638841039e-05, + "loss": 4.6602, + "step": 30816 + }, + { + "epoch": 0.18327742887049195, + "grad_norm": 1.9072203636169434, + "learning_rate": 4.596940956379311e-05, + "loss": 5.3711, + "step": 30817 + }, + { + "epoch": 0.18328337615377296, + "grad_norm": 1.8470267057418823, + "learning_rate": 4.596915523616211e-05, + "loss": 5.4715, + "step": 30818 + }, + { + "epoch": 0.18328932343705395, + "grad_norm": 1.887373924255371, + "learning_rate": 4.596890090121099e-05, + "loss": 5.9223, + "step": 30819 + }, + { + "epoch": 0.18329527072033494, + "grad_norm": 1.7427541017532349, + "learning_rate": 4.596864655893984e-05, + "loss": 5.8105, + "step": 30820 + }, + { + "epoch": 0.18330121800361596, + "grad_norm": 1.5923210382461548, + "learning_rate": 4.5968392209348763e-05, + "loss": 5.1934, + "step": 30821 + }, + { + "epoch": 0.18330716528689694, + "grad_norm": 2.4530539512634277, + "learning_rate": 4.596813785243783e-05, + "loss": 4.6757, + "step": 30822 + }, + { + "epoch": 0.18331311257017793, + "grad_norm": 2.533837080001831, + "learning_rate": 4.596788348820714e-05, + "loss": 4.1553, + "step": 30823 + }, + { + "epoch": 0.18331905985345895, + "grad_norm": 2.394258737564087, + "learning_rate": 4.596762911665678e-05, + "loss": 3.9019, + "step": 30824 + }, + { + "epoch": 0.18332500713673994, + "grad_norm": 1.8879469633102417, + "learning_rate": 4.596737473778684e-05, + "loss": 4.7484, + "step": 30825 + }, + { + "epoch": 0.18333095442002093, + "grad_norm": 1.896796464920044, + "learning_rate": 4.59671203515974e-05, + "loss": 5.5434, + "step": 30826 + }, + { + "epoch": 0.18333690170330194, + "grad_norm": 1.7430917024612427, + "learning_rate": 4.5966865958088555e-05, + "loss": 5.4315, + "step": 30827 + }, + { + "epoch": 0.18334284898658293, + "grad_norm": 2.2284209728240967, + "learning_rate": 4.59666115572604e-05, + "loss": 3.7282, + "step": 30828 + }, + { + "epoch": 0.18334879626986392, + "grad_norm": 2.362053155899048, + "learning_rate": 4.5966357149113005e-05, + "loss": 3.999, + "step": 30829 + }, + { + "epoch": 0.18335474355314493, + "grad_norm": 2.5124330520629883, + "learning_rate": 4.596610273364648e-05, + "loss": 3.9441, + "step": 30830 + }, + { + "epoch": 0.18336069083642592, + "grad_norm": 2.0157835483551025, + "learning_rate": 4.5965848310860906e-05, + "loss": 4.6031, + "step": 30831 + }, + { + "epoch": 0.1833666381197069, + "grad_norm": 2.036010503768921, + "learning_rate": 4.5965593880756365e-05, + "loss": 5.4114, + "step": 30832 + }, + { + "epoch": 0.18337258540298793, + "grad_norm": 1.6221730709075928, + "learning_rate": 4.596533944333296e-05, + "loss": 5.1669, + "step": 30833 + }, + { + "epoch": 0.1833785326862689, + "grad_norm": 1.5751827955245972, + "learning_rate": 4.5965084998590765e-05, + "loss": 5.0151, + "step": 30834 + }, + { + "epoch": 0.1833844799695499, + "grad_norm": 1.7404930591583252, + "learning_rate": 4.596483054652988e-05, + "loss": 5.0466, + "step": 30835 + }, + { + "epoch": 0.18339042725283092, + "grad_norm": 2.216836452484131, + "learning_rate": 4.5964576087150384e-05, + "loss": 3.9343, + "step": 30836 + }, + { + "epoch": 0.1833963745361119, + "grad_norm": 2.5696306228637695, + "learning_rate": 4.596432162045238e-05, + "loss": 3.9757, + "step": 30837 + }, + { + "epoch": 0.1834023218193929, + "grad_norm": 2.1181252002716064, + "learning_rate": 4.596406714643594e-05, + "loss": 3.6056, + "step": 30838 + }, + { + "epoch": 0.1834082691026739, + "grad_norm": 1.6865168809890747, + "learning_rate": 4.596381266510116e-05, + "loss": 4.7002, + "step": 30839 + }, + { + "epoch": 0.1834142163859549, + "grad_norm": 1.8423880338668823, + "learning_rate": 4.596355817644813e-05, + "loss": 5.0851, + "step": 30840 + }, + { + "epoch": 0.18342016366923589, + "grad_norm": 2.2296884059906006, + "learning_rate": 4.5963303680476945e-05, + "loss": 4.5105, + "step": 30841 + }, + { + "epoch": 0.1834261109525169, + "grad_norm": 2.051112413406372, + "learning_rate": 4.596304917718768e-05, + "loss": 4.465, + "step": 30842 + }, + { + "epoch": 0.1834320582357979, + "grad_norm": 1.638643741607666, + "learning_rate": 4.5962794666580435e-05, + "loss": 4.8162, + "step": 30843 + }, + { + "epoch": 0.18343800551907888, + "grad_norm": 1.5052911043167114, + "learning_rate": 4.5962540148655294e-05, + "loss": 5.7588, + "step": 30844 + }, + { + "epoch": 0.1834439528023599, + "grad_norm": 1.9093655347824097, + "learning_rate": 4.596228562341235e-05, + "loss": 4.9756, + "step": 30845 + }, + { + "epoch": 0.18344990008564088, + "grad_norm": 1.8002632856369019, + "learning_rate": 4.596203109085168e-05, + "loss": 5.4573, + "step": 30846 + }, + { + "epoch": 0.18345584736892187, + "grad_norm": 1.6063766479492188, + "learning_rate": 4.596177655097339e-05, + "loss": 5.1171, + "step": 30847 + }, + { + "epoch": 0.1834617946522029, + "grad_norm": 1.9913804531097412, + "learning_rate": 4.5961522003777554e-05, + "loss": 4.6128, + "step": 30848 + }, + { + "epoch": 0.18346774193548387, + "grad_norm": 1.7059962749481201, + "learning_rate": 4.5961267449264276e-05, + "loss": 5.1847, + "step": 30849 + }, + { + "epoch": 0.18347368921876486, + "grad_norm": 1.802331566810608, + "learning_rate": 4.596101288743362e-05, + "loss": 4.8961, + "step": 30850 + }, + { + "epoch": 0.18347963650204588, + "grad_norm": 3.5751075744628906, + "learning_rate": 4.596075831828571e-05, + "loss": 4.4167, + "step": 30851 + }, + { + "epoch": 0.18348558378532687, + "grad_norm": 3.360201597213745, + "learning_rate": 4.59605037418206e-05, + "loss": 4.7809, + "step": 30852 + }, + { + "epoch": 0.18349153106860785, + "grad_norm": 1.7143275737762451, + "learning_rate": 4.5960249158038406e-05, + "loss": 4.853, + "step": 30853 + }, + { + "epoch": 0.18349747835188887, + "grad_norm": 1.6688681840896606, + "learning_rate": 4.59599945669392e-05, + "loss": 5.0618, + "step": 30854 + }, + { + "epoch": 0.18350342563516986, + "grad_norm": 1.5650609731674194, + "learning_rate": 4.595973996852308e-05, + "loss": 4.8802, + "step": 30855 + }, + { + "epoch": 0.18350937291845085, + "grad_norm": 2.0990846157073975, + "learning_rate": 4.595948536279013e-05, + "loss": 4.4201, + "step": 30856 + }, + { + "epoch": 0.18351532020173184, + "grad_norm": 2.3723249435424805, + "learning_rate": 4.595923074974044e-05, + "loss": 4.2342, + "step": 30857 + }, + { + "epoch": 0.18352126748501285, + "grad_norm": 2.24969744682312, + "learning_rate": 4.59589761293741e-05, + "loss": 4.0662, + "step": 30858 + }, + { + "epoch": 0.18352721476829384, + "grad_norm": 2.0236549377441406, + "learning_rate": 4.595872150169119e-05, + "loss": 5.124, + "step": 30859 + }, + { + "epoch": 0.18353316205157483, + "grad_norm": 2.5715887546539307, + "learning_rate": 4.595846686669182e-05, + "loss": 4.1854, + "step": 30860 + }, + { + "epoch": 0.18353910933485584, + "grad_norm": 2.2042219638824463, + "learning_rate": 4.595821222437606e-05, + "loss": 4.55, + "step": 30861 + }, + { + "epoch": 0.18354505661813683, + "grad_norm": 1.5966359376907349, + "learning_rate": 4.5957957574744007e-05, + "loss": 4.9982, + "step": 30862 + }, + { + "epoch": 0.18355100390141782, + "grad_norm": 1.5397683382034302, + "learning_rate": 4.595770291779574e-05, + "loss": 4.568, + "step": 30863 + }, + { + "epoch": 0.18355695118469884, + "grad_norm": 2.3468825817108154, + "learning_rate": 4.595744825353136e-05, + "loss": 3.9617, + "step": 30864 + }, + { + "epoch": 0.18356289846797982, + "grad_norm": 2.3146417140960693, + "learning_rate": 4.595719358195095e-05, + "loss": 4.0914, + "step": 30865 + }, + { + "epoch": 0.1835688457512608, + "grad_norm": 2.2103490829467773, + "learning_rate": 4.59569389030546e-05, + "loss": 4.1443, + "step": 30866 + }, + { + "epoch": 0.18357479303454183, + "grad_norm": 2.2794134616851807, + "learning_rate": 4.59566842168424e-05, + "loss": 4.1926, + "step": 30867 + }, + { + "epoch": 0.18358074031782282, + "grad_norm": 2.3235437870025635, + "learning_rate": 4.595642952331444e-05, + "loss": 4.0462, + "step": 30868 + }, + { + "epoch": 0.1835866876011038, + "grad_norm": 2.440493583679199, + "learning_rate": 4.595617482247081e-05, + "loss": 4.0408, + "step": 30869 + }, + { + "epoch": 0.18359263488438482, + "grad_norm": 2.231560230255127, + "learning_rate": 4.595592011431159e-05, + "loss": 4.095, + "step": 30870 + }, + { + "epoch": 0.1835985821676658, + "grad_norm": 1.8984894752502441, + "learning_rate": 4.5955665398836877e-05, + "loss": 5.1887, + "step": 30871 + }, + { + "epoch": 0.1836045294509468, + "grad_norm": 1.725150465965271, + "learning_rate": 4.5955410676046754e-05, + "loss": 5.0515, + "step": 30872 + }, + { + "epoch": 0.1836104767342278, + "grad_norm": 1.5244455337524414, + "learning_rate": 4.595515594594132e-05, + "loss": 5.0655, + "step": 30873 + }, + { + "epoch": 0.1836164240175088, + "grad_norm": 1.5998716354370117, + "learning_rate": 4.595490120852065e-05, + "loss": 5.3198, + "step": 30874 + }, + { + "epoch": 0.1836223713007898, + "grad_norm": 1.787981390953064, + "learning_rate": 4.595464646378485e-05, + "loss": 4.6043, + "step": 30875 + }, + { + "epoch": 0.1836283185840708, + "grad_norm": 1.4464097023010254, + "learning_rate": 4.595439171173399e-05, + "loss": 4.7063, + "step": 30876 + }, + { + "epoch": 0.1836342658673518, + "grad_norm": 2.4086809158325195, + "learning_rate": 4.5954136952368175e-05, + "loss": 4.1193, + "step": 30877 + }, + { + "epoch": 0.18364021315063278, + "grad_norm": 2.57763671875, + "learning_rate": 4.595388218568748e-05, + "loss": 4.1104, + "step": 30878 + }, + { + "epoch": 0.1836461604339138, + "grad_norm": 2.3610222339630127, + "learning_rate": 4.5953627411692016e-05, + "loss": 3.9965, + "step": 30879 + }, + { + "epoch": 0.18365210771719478, + "grad_norm": 1.8578461408615112, + "learning_rate": 4.5953372630381845e-05, + "loss": 4.6334, + "step": 30880 + }, + { + "epoch": 0.18365805500047577, + "grad_norm": 1.5059680938720703, + "learning_rate": 4.595311784175706e-05, + "loss": 4.4804, + "step": 30881 + }, + { + "epoch": 0.1836640022837568, + "grad_norm": 1.833595871925354, + "learning_rate": 4.595286304581777e-05, + "loss": 4.563, + "step": 30882 + }, + { + "epoch": 0.18366994956703778, + "grad_norm": 1.8078968524932861, + "learning_rate": 4.595260824256405e-05, + "loss": 4.9626, + "step": 30883 + }, + { + "epoch": 0.18367589685031877, + "grad_norm": 1.5788074731826782, + "learning_rate": 4.5952353431996e-05, + "loss": 5.3483, + "step": 30884 + }, + { + "epoch": 0.18368184413359978, + "grad_norm": 1.642112135887146, + "learning_rate": 4.5952098614113684e-05, + "loss": 5.2537, + "step": 30885 + }, + { + "epoch": 0.18368779141688077, + "grad_norm": 1.4819180965423584, + "learning_rate": 4.595184378891722e-05, + "loss": 4.5967, + "step": 30886 + }, + { + "epoch": 0.18369373870016176, + "grad_norm": 1.5278507471084595, + "learning_rate": 4.5951588956406676e-05, + "loss": 4.6367, + "step": 30887 + }, + { + "epoch": 0.18369968598344277, + "grad_norm": 1.7402983903884888, + "learning_rate": 4.595133411658215e-05, + "loss": 4.7334, + "step": 30888 + }, + { + "epoch": 0.18370563326672376, + "grad_norm": 1.892587423324585, + "learning_rate": 4.595107926944373e-05, + "loss": 4.7473, + "step": 30889 + }, + { + "epoch": 0.18371158055000475, + "grad_norm": 1.741618275642395, + "learning_rate": 4.59508244149915e-05, + "loss": 4.5484, + "step": 30890 + }, + { + "epoch": 0.18371752783328577, + "grad_norm": 1.8447742462158203, + "learning_rate": 4.5950569553225565e-05, + "loss": 4.853, + "step": 30891 + }, + { + "epoch": 0.18372347511656675, + "grad_norm": 1.8637365102767944, + "learning_rate": 4.595031468414599e-05, + "loss": 5.2374, + "step": 30892 + }, + { + "epoch": 0.18372942239984774, + "grad_norm": 1.9203366041183472, + "learning_rate": 4.5950059807752886e-05, + "loss": 4.9026, + "step": 30893 + }, + { + "epoch": 0.18373536968312876, + "grad_norm": 1.5132418870925903, + "learning_rate": 4.5949804924046324e-05, + "loss": 4.7941, + "step": 30894 + }, + { + "epoch": 0.18374131696640975, + "grad_norm": 1.567147135734558, + "learning_rate": 4.594955003302641e-05, + "loss": 4.6679, + "step": 30895 + }, + { + "epoch": 0.18374726424969073, + "grad_norm": 1.6055753231048584, + "learning_rate": 4.594929513469322e-05, + "loss": 4.6216, + "step": 30896 + }, + { + "epoch": 0.18375321153297175, + "grad_norm": 1.609041690826416, + "learning_rate": 4.594904022904685e-05, + "loss": 4.6356, + "step": 30897 + }, + { + "epoch": 0.18375915881625274, + "grad_norm": 1.7323532104492188, + "learning_rate": 4.594878531608738e-05, + "loss": 5.333, + "step": 30898 + }, + { + "epoch": 0.18376510609953373, + "grad_norm": 1.7134934663772583, + "learning_rate": 4.5948530395814916e-05, + "loss": 5.3289, + "step": 30899 + }, + { + "epoch": 0.18377105338281474, + "grad_norm": 1.6868717670440674, + "learning_rate": 4.594827546822953e-05, + "loss": 4.7537, + "step": 30900 + }, + { + "epoch": 0.18377700066609573, + "grad_norm": 1.6590864658355713, + "learning_rate": 4.594802053333132e-05, + "loss": 5.8669, + "step": 30901 + }, + { + "epoch": 0.18378294794937672, + "grad_norm": 1.964417576789856, + "learning_rate": 4.594776559112037e-05, + "loss": 5.4957, + "step": 30902 + }, + { + "epoch": 0.18378889523265773, + "grad_norm": 1.68085777759552, + "learning_rate": 4.5947510641596775e-05, + "loss": 5.1391, + "step": 30903 + }, + { + "epoch": 0.18379484251593872, + "grad_norm": 1.7038891315460205, + "learning_rate": 4.5947255684760615e-05, + "loss": 5.1364, + "step": 30904 + }, + { + "epoch": 0.1838007897992197, + "grad_norm": 1.7355235815048218, + "learning_rate": 4.5947000720611985e-05, + "loss": 4.9449, + "step": 30905 + }, + { + "epoch": 0.18380673708250073, + "grad_norm": 1.458635926246643, + "learning_rate": 4.594674574915098e-05, + "loss": 5.0392, + "step": 30906 + }, + { + "epoch": 0.18381268436578171, + "grad_norm": 1.7265875339508057, + "learning_rate": 4.594649077037768e-05, + "loss": 4.9802, + "step": 30907 + }, + { + "epoch": 0.1838186316490627, + "grad_norm": 1.5100198984146118, + "learning_rate": 4.594623578429217e-05, + "loss": 5.0036, + "step": 30908 + }, + { + "epoch": 0.18382457893234372, + "grad_norm": 1.6836403608322144, + "learning_rate": 4.5945980790894553e-05, + "loss": 4.5476, + "step": 30909 + }, + { + "epoch": 0.1838305262156247, + "grad_norm": 1.6595370769500732, + "learning_rate": 4.5945725790184905e-05, + "loss": 4.9626, + "step": 30910 + }, + { + "epoch": 0.1838364734989057, + "grad_norm": 1.6304545402526855, + "learning_rate": 4.594547078216332e-05, + "loss": 5.1261, + "step": 30911 + }, + { + "epoch": 0.1838424207821867, + "grad_norm": 1.6057839393615723, + "learning_rate": 4.5945215766829894e-05, + "loss": 5.2167, + "step": 30912 + }, + { + "epoch": 0.1838483680654677, + "grad_norm": 1.5401513576507568, + "learning_rate": 4.594496074418471e-05, + "loss": 4.8433, + "step": 30913 + }, + { + "epoch": 0.1838543153487487, + "grad_norm": 1.6510026454925537, + "learning_rate": 4.594470571422785e-05, + "loss": 4.407, + "step": 30914 + }, + { + "epoch": 0.18386026263202968, + "grad_norm": 1.5904121398925781, + "learning_rate": 4.5944450676959414e-05, + "loss": 4.7868, + "step": 30915 + }, + { + "epoch": 0.1838662099153107, + "grad_norm": 1.5439600944519043, + "learning_rate": 4.594419563237949e-05, + "loss": 4.9075, + "step": 30916 + }, + { + "epoch": 0.18387215719859168, + "grad_norm": 1.6869488954544067, + "learning_rate": 4.5943940580488154e-05, + "loss": 4.7118, + "step": 30917 + }, + { + "epoch": 0.18387810448187267, + "grad_norm": 1.858880639076233, + "learning_rate": 4.594368552128551e-05, + "loss": 5.226, + "step": 30918 + }, + { + "epoch": 0.18388405176515368, + "grad_norm": 1.7510879039764404, + "learning_rate": 4.5943430454771644e-05, + "loss": 4.8886, + "step": 30919 + }, + { + "epoch": 0.18388999904843467, + "grad_norm": 1.6084439754486084, + "learning_rate": 4.594317538094664e-05, + "loss": 4.7247, + "step": 30920 + }, + { + "epoch": 0.18389594633171566, + "grad_norm": 1.7126952409744263, + "learning_rate": 4.594292029981059e-05, + "loss": 5.2381, + "step": 30921 + }, + { + "epoch": 0.18390189361499668, + "grad_norm": 1.8401120901107788, + "learning_rate": 4.594266521136358e-05, + "loss": 5.2361, + "step": 30922 + }, + { + "epoch": 0.18390784089827766, + "grad_norm": 1.7398508787155151, + "learning_rate": 4.59424101156057e-05, + "loss": 5.016, + "step": 30923 + }, + { + "epoch": 0.18391378818155865, + "grad_norm": 1.9287174940109253, + "learning_rate": 4.5942155012537056e-05, + "loss": 4.8992, + "step": 30924 + }, + { + "epoch": 0.18391973546483967, + "grad_norm": 1.8512134552001953, + "learning_rate": 4.5941899902157715e-05, + "loss": 4.815, + "step": 30925 + }, + { + "epoch": 0.18392568274812066, + "grad_norm": 1.500188946723938, + "learning_rate": 4.594164478446776e-05, + "loss": 4.9531, + "step": 30926 + }, + { + "epoch": 0.18393163003140164, + "grad_norm": 1.597621202468872, + "learning_rate": 4.594138965946731e-05, + "loss": 4.984, + "step": 30927 + }, + { + "epoch": 0.18393757731468266, + "grad_norm": 2.3577587604522705, + "learning_rate": 4.594113452715643e-05, + "loss": 4.5873, + "step": 30928 + }, + { + "epoch": 0.18394352459796365, + "grad_norm": 1.807442545890808, + "learning_rate": 4.594087938753522e-05, + "loss": 4.2157, + "step": 30929 + }, + { + "epoch": 0.18394947188124464, + "grad_norm": 1.7667385339736938, + "learning_rate": 4.594062424060376e-05, + "loss": 4.7323, + "step": 30930 + }, + { + "epoch": 0.18395541916452565, + "grad_norm": 1.7243330478668213, + "learning_rate": 4.5940369086362144e-05, + "loss": 5.2673, + "step": 30931 + }, + { + "epoch": 0.18396136644780664, + "grad_norm": 1.6076741218566895, + "learning_rate": 4.594011392481047e-05, + "loss": 5.2537, + "step": 30932 + }, + { + "epoch": 0.18396731373108763, + "grad_norm": 1.8104612827301025, + "learning_rate": 4.5939858755948806e-05, + "loss": 5.2573, + "step": 30933 + }, + { + "epoch": 0.18397326101436864, + "grad_norm": 1.4915204048156738, + "learning_rate": 4.5939603579777266e-05, + "loss": 5.1661, + "step": 30934 + }, + { + "epoch": 0.18397920829764963, + "grad_norm": 1.6471868753433228, + "learning_rate": 4.593934839629592e-05, + "loss": 4.8264, + "step": 30935 + }, + { + "epoch": 0.18398515558093062, + "grad_norm": 1.6875669956207275, + "learning_rate": 4.593909320550486e-05, + "loss": 5.0788, + "step": 30936 + }, + { + "epoch": 0.18399110286421164, + "grad_norm": 1.9455054998397827, + "learning_rate": 4.5938838007404185e-05, + "loss": 5.022, + "step": 30937 + }, + { + "epoch": 0.18399705014749262, + "grad_norm": 2.0597965717315674, + "learning_rate": 4.593858280199398e-05, + "loss": 4.6885, + "step": 30938 + }, + { + "epoch": 0.1840029974307736, + "grad_norm": 1.8781501054763794, + "learning_rate": 4.5938327589274324e-05, + "loss": 5.0725, + "step": 30939 + }, + { + "epoch": 0.18400894471405463, + "grad_norm": 1.7399587631225586, + "learning_rate": 4.593807236924532e-05, + "loss": 5.0705, + "step": 30940 + }, + { + "epoch": 0.18401489199733562, + "grad_norm": 1.5905550718307495, + "learning_rate": 4.5937817141907054e-05, + "loss": 5.269, + "step": 30941 + }, + { + "epoch": 0.1840208392806166, + "grad_norm": 1.5723954439163208, + "learning_rate": 4.5937561907259604e-05, + "loss": 5.1356, + "step": 30942 + }, + { + "epoch": 0.18402678656389762, + "grad_norm": 1.725982904434204, + "learning_rate": 4.593730666530307e-05, + "loss": 4.9754, + "step": 30943 + }, + { + "epoch": 0.1840327338471786, + "grad_norm": 1.5784368515014648, + "learning_rate": 4.593705141603755e-05, + "loss": 4.8637, + "step": 30944 + }, + { + "epoch": 0.1840386811304596, + "grad_norm": 1.2270019054412842, + "learning_rate": 4.5936796159463106e-05, + "loss": 4.4398, + "step": 30945 + }, + { + "epoch": 0.1840446284137406, + "grad_norm": 1.6701734066009521, + "learning_rate": 4.593654089557985e-05, + "loss": 4.6544, + "step": 30946 + }, + { + "epoch": 0.1840505756970216, + "grad_norm": 1.6493332386016846, + "learning_rate": 4.5936285624387865e-05, + "loss": 4.9398, + "step": 30947 + }, + { + "epoch": 0.1840565229803026, + "grad_norm": 1.6047924757003784, + "learning_rate": 4.5936030345887236e-05, + "loss": 4.6506, + "step": 30948 + }, + { + "epoch": 0.1840624702635836, + "grad_norm": 1.6082524061203003, + "learning_rate": 4.5935775060078055e-05, + "loss": 4.8463, + "step": 30949 + }, + { + "epoch": 0.1840684175468646, + "grad_norm": 1.603140115737915, + "learning_rate": 4.593551976696041e-05, + "loss": 4.9072, + "step": 30950 + }, + { + "epoch": 0.18407436483014558, + "grad_norm": 1.6736758947372437, + "learning_rate": 4.593526446653439e-05, + "loss": 4.8175, + "step": 30951 + }, + { + "epoch": 0.1840803121134266, + "grad_norm": 2.159503221511841, + "learning_rate": 4.593500915880009e-05, + "loss": 5.2822, + "step": 30952 + }, + { + "epoch": 0.18408625939670759, + "grad_norm": 2.116179943084717, + "learning_rate": 4.59347538437576e-05, + "loss": 4.9114, + "step": 30953 + }, + { + "epoch": 0.18409220667998857, + "grad_norm": 2.1627538204193115, + "learning_rate": 4.5934498521407e-05, + "loss": 4.6353, + "step": 30954 + }, + { + "epoch": 0.1840981539632696, + "grad_norm": 1.7306194305419922, + "learning_rate": 4.593424319174838e-05, + "loss": 5.0884, + "step": 30955 + }, + { + "epoch": 0.18410410124655058, + "grad_norm": 1.7881605625152588, + "learning_rate": 4.5933987854781824e-05, + "loss": 5.4829, + "step": 30956 + }, + { + "epoch": 0.18411004852983157, + "grad_norm": 1.6097657680511475, + "learning_rate": 4.5933732510507446e-05, + "loss": 5.4447, + "step": 30957 + }, + { + "epoch": 0.18411599581311258, + "grad_norm": 1.4753258228302002, + "learning_rate": 4.59334771589253e-05, + "loss": 5.4069, + "step": 30958 + }, + { + "epoch": 0.18412194309639357, + "grad_norm": 1.4360363483428955, + "learning_rate": 4.593322180003551e-05, + "loss": 5.3144, + "step": 30959 + }, + { + "epoch": 0.18412789037967456, + "grad_norm": 1.5445841550827026, + "learning_rate": 4.593296643383814e-05, + "loss": 5.3294, + "step": 30960 + }, + { + "epoch": 0.18413383766295557, + "grad_norm": 1.8465672731399536, + "learning_rate": 4.593271106033329e-05, + "loss": 5.2602, + "step": 30961 + }, + { + "epoch": 0.18413978494623656, + "grad_norm": 1.7009365558624268, + "learning_rate": 4.5932455679521046e-05, + "loss": 5.2779, + "step": 30962 + }, + { + "epoch": 0.18414573222951755, + "grad_norm": 1.5198291540145874, + "learning_rate": 4.593220029140149e-05, + "loss": 5.1775, + "step": 30963 + }, + { + "epoch": 0.18415167951279857, + "grad_norm": 1.5233417749404907, + "learning_rate": 4.5931944895974735e-05, + "loss": 5.1338, + "step": 30964 + }, + { + "epoch": 0.18415762679607955, + "grad_norm": 1.3948924541473389, + "learning_rate": 4.593168949324084e-05, + "loss": 5.2121, + "step": 30965 + }, + { + "epoch": 0.18416357407936054, + "grad_norm": 1.596511721611023, + "learning_rate": 4.593143408319992e-05, + "loss": 5.1374, + "step": 30966 + }, + { + "epoch": 0.18416952136264156, + "grad_norm": 1.8476365804672241, + "learning_rate": 4.593117866585205e-05, + "loss": 5.0453, + "step": 30967 + }, + { + "epoch": 0.18417546864592255, + "grad_norm": 1.9178073406219482, + "learning_rate": 4.5930923241197315e-05, + "loss": 5.1195, + "step": 30968 + }, + { + "epoch": 0.18418141592920353, + "grad_norm": 1.8207836151123047, + "learning_rate": 4.593066780923582e-05, + "loss": 4.8808, + "step": 30969 + }, + { + "epoch": 0.18418736321248455, + "grad_norm": 1.556929588317871, + "learning_rate": 4.5930412369967636e-05, + "loss": 5.042, + "step": 30970 + }, + { + "epoch": 0.18419331049576554, + "grad_norm": 1.5927326679229736, + "learning_rate": 4.593015692339286e-05, + "loss": 4.9574, + "step": 30971 + }, + { + "epoch": 0.18419925777904653, + "grad_norm": 1.686204195022583, + "learning_rate": 4.5929901469511594e-05, + "loss": 5.1615, + "step": 30972 + }, + { + "epoch": 0.18420520506232752, + "grad_norm": 1.8560882806777954, + "learning_rate": 4.5929646008323915e-05, + "loss": 5.4144, + "step": 30973 + }, + { + "epoch": 0.18421115234560853, + "grad_norm": 1.9906892776489258, + "learning_rate": 4.59293905398299e-05, + "loss": 5.5249, + "step": 30974 + }, + { + "epoch": 0.18421709962888952, + "grad_norm": 1.8656678199768066, + "learning_rate": 4.592913506402966e-05, + "loss": 5.4574, + "step": 30975 + }, + { + "epoch": 0.1842230469121705, + "grad_norm": 1.5969977378845215, + "learning_rate": 4.592887958092327e-05, + "loss": 5.3052, + "step": 30976 + }, + { + "epoch": 0.18422899419545152, + "grad_norm": 1.8761509656906128, + "learning_rate": 4.592862409051083e-05, + "loss": 5.4617, + "step": 30977 + }, + { + "epoch": 0.1842349414787325, + "grad_norm": 1.7512613534927368, + "learning_rate": 4.592836859279243e-05, + "loss": 5.1404, + "step": 30978 + }, + { + "epoch": 0.1842408887620135, + "grad_norm": 1.9314844608306885, + "learning_rate": 4.592811308776814e-05, + "loss": 5.1451, + "step": 30979 + }, + { + "epoch": 0.18424683604529452, + "grad_norm": 1.7287604808807373, + "learning_rate": 4.592785757543806e-05, + "loss": 4.9971, + "step": 30980 + }, + { + "epoch": 0.1842527833285755, + "grad_norm": 1.5554300546646118, + "learning_rate": 4.592760205580229e-05, + "loss": 4.9128, + "step": 30981 + }, + { + "epoch": 0.1842587306118565, + "grad_norm": 1.5447009801864624, + "learning_rate": 4.5927346528860907e-05, + "loss": 5.0247, + "step": 30982 + }, + { + "epoch": 0.1842646778951375, + "grad_norm": 1.4151129722595215, + "learning_rate": 4.592709099461401e-05, + "loss": 4.9106, + "step": 30983 + }, + { + "epoch": 0.1842706251784185, + "grad_norm": 1.4430291652679443, + "learning_rate": 4.5926835453061665e-05, + "loss": 5.0316, + "step": 30984 + }, + { + "epoch": 0.18427657246169948, + "grad_norm": 2.097165584564209, + "learning_rate": 4.592657990420399e-05, + "loss": 5.14, + "step": 30985 + }, + { + "epoch": 0.1842825197449805, + "grad_norm": 1.9558128118515015, + "learning_rate": 4.592632434804107e-05, + "loss": 4.6043, + "step": 30986 + }, + { + "epoch": 0.1842884670282615, + "grad_norm": 1.8616024255752563, + "learning_rate": 4.5926068784572975e-05, + "loss": 4.8654, + "step": 30987 + }, + { + "epoch": 0.18429441431154248, + "grad_norm": 2.043250560760498, + "learning_rate": 4.5925813213799805e-05, + "loss": 5.0763, + "step": 30988 + }, + { + "epoch": 0.1843003615948235, + "grad_norm": 1.9793142080307007, + "learning_rate": 4.5925557635721654e-05, + "loss": 4.9104, + "step": 30989 + }, + { + "epoch": 0.18430630887810448, + "grad_norm": 1.7368297576904297, + "learning_rate": 4.59253020503386e-05, + "loss": 5.0578, + "step": 30990 + }, + { + "epoch": 0.18431225616138547, + "grad_norm": 2.311291456222534, + "learning_rate": 4.592504645765075e-05, + "loss": 4.7787, + "step": 30991 + }, + { + "epoch": 0.18431820344466648, + "grad_norm": 1.9127613306045532, + "learning_rate": 4.592479085765818e-05, + "loss": 4.7311, + "step": 30992 + }, + { + "epoch": 0.18432415072794747, + "grad_norm": 2.0677103996276855, + "learning_rate": 4.592453525036098e-05, + "loss": 5.073, + "step": 30993 + }, + { + "epoch": 0.18433009801122846, + "grad_norm": 1.6885477304458618, + "learning_rate": 4.592427963575924e-05, + "loss": 4.7878, + "step": 30994 + }, + { + "epoch": 0.18433604529450948, + "grad_norm": 1.7439665794372559, + "learning_rate": 4.592402401385305e-05, + "loss": 4.882, + "step": 30995 + }, + { + "epoch": 0.18434199257779046, + "grad_norm": 2.02858567237854, + "learning_rate": 4.5923768384642494e-05, + "loss": 4.8182, + "step": 30996 + }, + { + "epoch": 0.18434793986107145, + "grad_norm": 2.1561737060546875, + "learning_rate": 4.5923512748127676e-05, + "loss": 4.8795, + "step": 30997 + }, + { + "epoch": 0.18435388714435247, + "grad_norm": 2.319322347640991, + "learning_rate": 4.592325710430867e-05, + "loss": 4.725, + "step": 30998 + }, + { + "epoch": 0.18435983442763346, + "grad_norm": 2.0449020862579346, + "learning_rate": 4.5923001453185575e-05, + "loss": 4.746, + "step": 30999 + }, + { + "epoch": 0.18436578171091444, + "grad_norm": 2.0369932651519775, + "learning_rate": 4.5922745794758475e-05, + "loss": 4.4575, + "step": 31000 + }, + { + "epoch": 0.18437172899419546, + "grad_norm": 2.1663169860839844, + "learning_rate": 4.5922490129027464e-05, + "loss": 4.7254, + "step": 31001 + }, + { + "epoch": 0.18437767627747645, + "grad_norm": 1.6700929403305054, + "learning_rate": 4.5922234455992617e-05, + "loss": 4.6762, + "step": 31002 + }, + { + "epoch": 0.18438362356075744, + "grad_norm": 2.359294891357422, + "learning_rate": 4.592197877565404e-05, + "loss": 4.67, + "step": 31003 + }, + { + "epoch": 0.18438957084403845, + "grad_norm": 1.9069437980651855, + "learning_rate": 4.5921723088011826e-05, + "loss": 4.9545, + "step": 31004 + }, + { + "epoch": 0.18439551812731944, + "grad_norm": 2.373521327972412, + "learning_rate": 4.592146739306604e-05, + "loss": 3.948, + "step": 31005 + }, + { + "epoch": 0.18440146541060043, + "grad_norm": 2.227628469467163, + "learning_rate": 4.592121169081679e-05, + "loss": 4.1342, + "step": 31006 + }, + { + "epoch": 0.18440741269388145, + "grad_norm": 2.1248085498809814, + "learning_rate": 4.592095598126417e-05, + "loss": 4.3805, + "step": 31007 + }, + { + "epoch": 0.18441335997716243, + "grad_norm": 2.362063407897949, + "learning_rate": 4.592070026440825e-05, + "loss": 4.6606, + "step": 31008 + }, + { + "epoch": 0.18441930726044342, + "grad_norm": 2.0881500244140625, + "learning_rate": 4.5920444540249135e-05, + "loss": 4.7613, + "step": 31009 + }, + { + "epoch": 0.18442525454372444, + "grad_norm": 2.026759147644043, + "learning_rate": 4.5920188808786904e-05, + "loss": 4.5697, + "step": 31010 + }, + { + "epoch": 0.18443120182700543, + "grad_norm": 2.4088351726531982, + "learning_rate": 4.5919933070021657e-05, + "loss": 4.1511, + "step": 31011 + }, + { + "epoch": 0.1844371491102864, + "grad_norm": 2.3477118015289307, + "learning_rate": 4.5919677323953474e-05, + "loss": 4.2753, + "step": 31012 + }, + { + "epoch": 0.18444309639356743, + "grad_norm": 2.198819875717163, + "learning_rate": 4.591942157058245e-05, + "loss": 4.4163, + "step": 31013 + }, + { + "epoch": 0.18444904367684842, + "grad_norm": 2.212641477584839, + "learning_rate": 4.591916580990867e-05, + "loss": 4.6979, + "step": 31014 + }, + { + "epoch": 0.1844549909601294, + "grad_norm": 1.924052119255066, + "learning_rate": 4.591891004193223e-05, + "loss": 4.7703, + "step": 31015 + }, + { + "epoch": 0.18446093824341042, + "grad_norm": 2.4676082134246826, + "learning_rate": 4.591865426665321e-05, + "loss": 4.6545, + "step": 31016 + }, + { + "epoch": 0.1844668855266914, + "grad_norm": 2.432497262954712, + "learning_rate": 4.59183984840717e-05, + "loss": 4.671, + "step": 31017 + }, + { + "epoch": 0.1844728328099724, + "grad_norm": 2.691105842590332, + "learning_rate": 4.59181426941878e-05, + "loss": 4.4464, + "step": 31018 + }, + { + "epoch": 0.1844787800932534, + "grad_norm": 2.5249433517456055, + "learning_rate": 4.591788689700159e-05, + "loss": 4.2623, + "step": 31019 + }, + { + "epoch": 0.1844847273765344, + "grad_norm": 2.6374852657318115, + "learning_rate": 4.5917631092513156e-05, + "loss": 3.4994, + "step": 31020 + }, + { + "epoch": 0.1844906746598154, + "grad_norm": 2.6089253425598145, + "learning_rate": 4.591737528072261e-05, + "loss": 3.8228, + "step": 31021 + }, + { + "epoch": 0.1844966219430964, + "grad_norm": 2.5166683197021484, + "learning_rate": 4.591711946163001e-05, + "loss": 3.4982, + "step": 31022 + }, + { + "epoch": 0.1845025692263774, + "grad_norm": 2.3516764640808105, + "learning_rate": 4.591686363523546e-05, + "loss": 3.5029, + "step": 31023 + }, + { + "epoch": 0.18450851650965838, + "grad_norm": 2.5474250316619873, + "learning_rate": 4.591660780153906e-05, + "loss": 3.5499, + "step": 31024 + }, + { + "epoch": 0.1845144637929394, + "grad_norm": 1.7902573347091675, + "learning_rate": 4.591635196054088e-05, + "loss": 4.6401, + "step": 31025 + }, + { + "epoch": 0.1845204110762204, + "grad_norm": 2.301729679107666, + "learning_rate": 4.5916096112241015e-05, + "loss": 5.2124, + "step": 31026 + }, + { + "epoch": 0.18452635835950137, + "grad_norm": 1.9211527109146118, + "learning_rate": 4.591584025663956e-05, + "loss": 5.107, + "step": 31027 + }, + { + "epoch": 0.1845323056427824, + "grad_norm": 2.245776653289795, + "learning_rate": 4.59155843937366e-05, + "loss": 4.1589, + "step": 31028 + }, + { + "epoch": 0.18453825292606338, + "grad_norm": 2.7997524738311768, + "learning_rate": 4.591532852353223e-05, + "loss": 3.6491, + "step": 31029 + }, + { + "epoch": 0.18454420020934437, + "grad_norm": 2.8077120780944824, + "learning_rate": 4.591507264602653e-05, + "loss": 3.3127, + "step": 31030 + }, + { + "epoch": 0.18455014749262535, + "grad_norm": 1.4262480735778809, + "learning_rate": 4.591481676121959e-05, + "loss": 4.9064, + "step": 31031 + }, + { + "epoch": 0.18455609477590637, + "grad_norm": 1.6911439895629883, + "learning_rate": 4.591456086911152e-05, + "loss": 5.1697, + "step": 31032 + }, + { + "epoch": 0.18456204205918736, + "grad_norm": 1.591536045074463, + "learning_rate": 4.591430496970238e-05, + "loss": 5.3011, + "step": 31033 + }, + { + "epoch": 0.18456798934246835, + "grad_norm": 1.1955918073654175, + "learning_rate": 4.591404906299227e-05, + "loss": 5.1465, + "step": 31034 + }, + { + "epoch": 0.18457393662574936, + "grad_norm": 1.6647759675979614, + "learning_rate": 4.5913793148981286e-05, + "loss": 5.0915, + "step": 31035 + }, + { + "epoch": 0.18457988390903035, + "grad_norm": 1.7477984428405762, + "learning_rate": 4.5913537227669515e-05, + "loss": 4.7211, + "step": 31036 + }, + { + "epoch": 0.18458583119231134, + "grad_norm": 1.973645567893982, + "learning_rate": 4.5913281299057045e-05, + "loss": 5.6001, + "step": 31037 + }, + { + "epoch": 0.18459177847559236, + "grad_norm": 1.3707242012023926, + "learning_rate": 4.591302536314396e-05, + "loss": 5.3164, + "step": 31038 + }, + { + "epoch": 0.18459772575887334, + "grad_norm": 1.71157968044281, + "learning_rate": 4.591276941993036e-05, + "loss": 5.2662, + "step": 31039 + }, + { + "epoch": 0.18460367304215433, + "grad_norm": 1.3975422382354736, + "learning_rate": 4.5912513469416315e-05, + "loss": 4.753, + "step": 31040 + }, + { + "epoch": 0.18460962032543535, + "grad_norm": 2.232591152191162, + "learning_rate": 4.5912257511601944e-05, + "loss": 4.7482, + "step": 31041 + }, + { + "epoch": 0.18461556760871634, + "grad_norm": 1.5958985090255737, + "learning_rate": 4.591200154648731e-05, + "loss": 5.0753, + "step": 31042 + }, + { + "epoch": 0.18462151489199732, + "grad_norm": 1.4874860048294067, + "learning_rate": 4.591174557407252e-05, + "loss": 4.9495, + "step": 31043 + }, + { + "epoch": 0.18462746217527834, + "grad_norm": 1.8329155445098877, + "learning_rate": 4.591148959435765e-05, + "loss": 5.3401, + "step": 31044 + }, + { + "epoch": 0.18463340945855933, + "grad_norm": 1.6365807056427002, + "learning_rate": 4.591123360734279e-05, + "loss": 4.8406, + "step": 31045 + }, + { + "epoch": 0.18463935674184032, + "grad_norm": 1.5671586990356445, + "learning_rate": 4.591097761302804e-05, + "loss": 4.9428, + "step": 31046 + }, + { + "epoch": 0.18464530402512133, + "grad_norm": 1.838995099067688, + "learning_rate": 4.5910721611413486e-05, + "loss": 5.2237, + "step": 31047 + }, + { + "epoch": 0.18465125130840232, + "grad_norm": 1.947945475578308, + "learning_rate": 4.5910465602499216e-05, + "loss": 4.4973, + "step": 31048 + }, + { + "epoch": 0.1846571985916833, + "grad_norm": 2.2322356700897217, + "learning_rate": 4.591020958628531e-05, + "loss": 4.3619, + "step": 31049 + }, + { + "epoch": 0.18466314587496432, + "grad_norm": 2.417125940322876, + "learning_rate": 4.590995356277187e-05, + "loss": 4.2789, + "step": 31050 + }, + { + "epoch": 0.1846690931582453, + "grad_norm": 2.1802711486816406, + "learning_rate": 4.590969753195898e-05, + "loss": 4.0677, + "step": 31051 + }, + { + "epoch": 0.1846750404415263, + "grad_norm": 2.1682262420654297, + "learning_rate": 4.590944149384674e-05, + "loss": 3.8951, + "step": 31052 + }, + { + "epoch": 0.18468098772480732, + "grad_norm": 2.156933546066284, + "learning_rate": 4.5909185448435224e-05, + "loss": 4.1987, + "step": 31053 + }, + { + "epoch": 0.1846869350080883, + "grad_norm": 1.6152640581130981, + "learning_rate": 4.5908929395724526e-05, + "loss": 5.3105, + "step": 31054 + }, + { + "epoch": 0.1846928822913693, + "grad_norm": 2.371634006500244, + "learning_rate": 4.5908673335714735e-05, + "loss": 3.9825, + "step": 31055 + }, + { + "epoch": 0.1846988295746503, + "grad_norm": 2.6450035572052, + "learning_rate": 4.5908417268405946e-05, + "loss": 3.9852, + "step": 31056 + }, + { + "epoch": 0.1847047768579313, + "grad_norm": 2.498091459274292, + "learning_rate": 4.590816119379825e-05, + "loss": 4.1961, + "step": 31057 + }, + { + "epoch": 0.18471072414121228, + "grad_norm": 2.2890594005584717, + "learning_rate": 4.590790511189172e-05, + "loss": 4.2483, + "step": 31058 + }, + { + "epoch": 0.1847166714244933, + "grad_norm": 1.9878109693527222, + "learning_rate": 4.590764902268646e-05, + "loss": 4.2698, + "step": 31059 + }, + { + "epoch": 0.1847226187077743, + "grad_norm": 1.937960147857666, + "learning_rate": 4.590739292618256e-05, + "loss": 4.9191, + "step": 31060 + }, + { + "epoch": 0.18472856599105528, + "grad_norm": 2.69293212890625, + "learning_rate": 4.590713682238009e-05, + "loss": 4.3818, + "step": 31061 + }, + { + "epoch": 0.1847345132743363, + "grad_norm": 1.8170857429504395, + "learning_rate": 4.590688071127917e-05, + "loss": 4.8619, + "step": 31062 + }, + { + "epoch": 0.18474046055761728, + "grad_norm": 2.600891590118408, + "learning_rate": 4.590662459287987e-05, + "loss": 5.3781, + "step": 31063 + }, + { + "epoch": 0.18474640784089827, + "grad_norm": 1.780999779701233, + "learning_rate": 4.590636846718228e-05, + "loss": 5.1513, + "step": 31064 + }, + { + "epoch": 0.18475235512417928, + "grad_norm": 1.7034980058670044, + "learning_rate": 4.59061123341865e-05, + "loss": 5.2488, + "step": 31065 + }, + { + "epoch": 0.18475830240746027, + "grad_norm": 1.7700848579406738, + "learning_rate": 4.590585619389261e-05, + "loss": 4.9997, + "step": 31066 + }, + { + "epoch": 0.18476424969074126, + "grad_norm": 1.8884303569793701, + "learning_rate": 4.5905600046300694e-05, + "loss": 4.8398, + "step": 31067 + }, + { + "epoch": 0.18477019697402228, + "grad_norm": 1.7098636627197266, + "learning_rate": 4.590534389141086e-05, + "loss": 4.7416, + "step": 31068 + }, + { + "epoch": 0.18477614425730327, + "grad_norm": 1.5094579458236694, + "learning_rate": 4.590508772922318e-05, + "loss": 4.8252, + "step": 31069 + }, + { + "epoch": 0.18478209154058425, + "grad_norm": 1.5103203058242798, + "learning_rate": 4.590483155973775e-05, + "loss": 4.9903, + "step": 31070 + }, + { + "epoch": 0.18478803882386527, + "grad_norm": 1.6473743915557861, + "learning_rate": 4.590457538295466e-05, + "loss": 4.5804, + "step": 31071 + }, + { + "epoch": 0.18479398610714626, + "grad_norm": 2.5655574798583984, + "learning_rate": 4.5904319198874e-05, + "loss": 4.8887, + "step": 31072 + }, + { + "epoch": 0.18479993339042725, + "grad_norm": 2.067401647567749, + "learning_rate": 4.5904063007495854e-05, + "loss": 4.4422, + "step": 31073 + }, + { + "epoch": 0.18480588067370826, + "grad_norm": 1.9005351066589355, + "learning_rate": 4.590380680882032e-05, + "loss": 4.3074, + "step": 31074 + }, + { + "epoch": 0.18481182795698925, + "grad_norm": 1.704513669013977, + "learning_rate": 4.590355060284748e-05, + "loss": 4.6102, + "step": 31075 + }, + { + "epoch": 0.18481777524027024, + "grad_norm": 1.7560302019119263, + "learning_rate": 4.590329438957743e-05, + "loss": 4.6725, + "step": 31076 + }, + { + "epoch": 0.18482372252355125, + "grad_norm": 1.44539213180542, + "learning_rate": 4.5903038169010254e-05, + "loss": 4.8119, + "step": 31077 + }, + { + "epoch": 0.18482966980683224, + "grad_norm": 1.451361894607544, + "learning_rate": 4.5902781941146045e-05, + "loss": 5.1253, + "step": 31078 + }, + { + "epoch": 0.18483561709011323, + "grad_norm": 1.8367254734039307, + "learning_rate": 4.590252570598489e-05, + "loss": 5.2783, + "step": 31079 + }, + { + "epoch": 0.18484156437339425, + "grad_norm": 1.64362370967865, + "learning_rate": 4.590226946352688e-05, + "loss": 4.8848, + "step": 31080 + }, + { + "epoch": 0.18484751165667523, + "grad_norm": 1.3705360889434814, + "learning_rate": 4.590201321377209e-05, + "loss": 5.0658, + "step": 31081 + }, + { + "epoch": 0.18485345893995622, + "grad_norm": 1.3959661722183228, + "learning_rate": 4.5901756956720645e-05, + "loss": 5.2573, + "step": 31082 + }, + { + "epoch": 0.18485940622323724, + "grad_norm": 2.0968472957611084, + "learning_rate": 4.59015006923726e-05, + "loss": 4.5473, + "step": 31083 + }, + { + "epoch": 0.18486535350651823, + "grad_norm": 1.7659695148468018, + "learning_rate": 4.5901244420728055e-05, + "loss": 4.6719, + "step": 31084 + }, + { + "epoch": 0.18487130078979921, + "grad_norm": 2.2793681621551514, + "learning_rate": 4.59009881417871e-05, + "loss": 4.4078, + "step": 31085 + }, + { + "epoch": 0.18487724807308023, + "grad_norm": 1.7163949012756348, + "learning_rate": 4.590073185554984e-05, + "loss": 4.9258, + "step": 31086 + }, + { + "epoch": 0.18488319535636122, + "grad_norm": 2.0032429695129395, + "learning_rate": 4.5900475562016346e-05, + "loss": 5.2906, + "step": 31087 + }, + { + "epoch": 0.1848891426396422, + "grad_norm": 1.6730388402938843, + "learning_rate": 4.5900219261186706e-05, + "loss": 4.7542, + "step": 31088 + }, + { + "epoch": 0.18489508992292322, + "grad_norm": 2.3741564750671387, + "learning_rate": 4.5899962953061015e-05, + "loss": 4.3728, + "step": 31089 + }, + { + "epoch": 0.1849010372062042, + "grad_norm": 1.950449824333191, + "learning_rate": 4.589970663763937e-05, + "loss": 4.7416, + "step": 31090 + }, + { + "epoch": 0.1849069844894852, + "grad_norm": 1.9121187925338745, + "learning_rate": 4.589945031492185e-05, + "loss": 4.6129, + "step": 31091 + }, + { + "epoch": 0.1849129317727662, + "grad_norm": 2.0481319427490234, + "learning_rate": 4.589919398490854e-05, + "loss": 4.4413, + "step": 31092 + }, + { + "epoch": 0.1849188790560472, + "grad_norm": 2.135030508041382, + "learning_rate": 4.589893764759955e-05, + "loss": 4.3973, + "step": 31093 + }, + { + "epoch": 0.1849248263393282, + "grad_norm": 1.7354028224945068, + "learning_rate": 4.589868130299495e-05, + "loss": 4.8435, + "step": 31094 + }, + { + "epoch": 0.18493077362260918, + "grad_norm": 1.616546630859375, + "learning_rate": 4.5898424951094834e-05, + "loss": 4.889, + "step": 31095 + }, + { + "epoch": 0.1849367209058902, + "grad_norm": 1.718268871307373, + "learning_rate": 4.5898168591899305e-05, + "loss": 5.0764, + "step": 31096 + }, + { + "epoch": 0.18494266818917118, + "grad_norm": 1.5846326351165771, + "learning_rate": 4.589791222540842e-05, + "loss": 5.0193, + "step": 31097 + }, + { + "epoch": 0.18494861547245217, + "grad_norm": 1.6834520101547241, + "learning_rate": 4.589765585162231e-05, + "loss": 4.9781, + "step": 31098 + }, + { + "epoch": 0.1849545627557332, + "grad_norm": 2.0722103118896484, + "learning_rate": 4.5897399470541035e-05, + "loss": 4.499, + "step": 31099 + }, + { + "epoch": 0.18496051003901418, + "grad_norm": 2.0412447452545166, + "learning_rate": 4.589714308216469e-05, + "loss": 4.6282, + "step": 31100 + }, + { + "epoch": 0.18496645732229516, + "grad_norm": 1.5334446430206299, + "learning_rate": 4.589688668649338e-05, + "loss": 4.8552, + "step": 31101 + }, + { + "epoch": 0.18497240460557618, + "grad_norm": 1.6716012954711914, + "learning_rate": 4.589663028352718e-05, + "loss": 4.6907, + "step": 31102 + }, + { + "epoch": 0.18497835188885717, + "grad_norm": 1.5221296548843384, + "learning_rate": 4.589637387326618e-05, + "loss": 4.8665, + "step": 31103 + }, + { + "epoch": 0.18498429917213816, + "grad_norm": 1.4777991771697998, + "learning_rate": 4.589611745571047e-05, + "loss": 4.6618, + "step": 31104 + }, + { + "epoch": 0.18499024645541917, + "grad_norm": 1.5103845596313477, + "learning_rate": 4.589586103086014e-05, + "loss": 4.8099, + "step": 31105 + }, + { + "epoch": 0.18499619373870016, + "grad_norm": 1.7128773927688599, + "learning_rate": 4.5895604598715284e-05, + "loss": 4.5107, + "step": 31106 + }, + { + "epoch": 0.18500214102198115, + "grad_norm": 1.7347596883773804, + "learning_rate": 4.5895348159275986e-05, + "loss": 5.2684, + "step": 31107 + }, + { + "epoch": 0.18500808830526216, + "grad_norm": 1.5564923286437988, + "learning_rate": 4.5895091712542346e-05, + "loss": 4.8463, + "step": 31108 + }, + { + "epoch": 0.18501403558854315, + "grad_norm": 1.6237825155258179, + "learning_rate": 4.589483525851444e-05, + "loss": 4.7388, + "step": 31109 + }, + { + "epoch": 0.18501998287182414, + "grad_norm": 2.2543084621429443, + "learning_rate": 4.5894578797192355e-05, + "loss": 4.5901, + "step": 31110 + }, + { + "epoch": 0.18502593015510516, + "grad_norm": 2.227154016494751, + "learning_rate": 4.58943223285762e-05, + "loss": 4.1078, + "step": 31111 + }, + { + "epoch": 0.18503187743838614, + "grad_norm": 2.371508836746216, + "learning_rate": 4.5894065852666044e-05, + "loss": 3.3593, + "step": 31112 + }, + { + "epoch": 0.18503782472166713, + "grad_norm": 2.228203773498535, + "learning_rate": 4.589380936946199e-05, + "loss": 3.3639, + "step": 31113 + }, + { + "epoch": 0.18504377200494815, + "grad_norm": 1.926233172416687, + "learning_rate": 4.589355287896412e-05, + "loss": 3.5136, + "step": 31114 + }, + { + "epoch": 0.18504971928822914, + "grad_norm": 1.9671204090118408, + "learning_rate": 4.5893296381172537e-05, + "loss": 3.8437, + "step": 31115 + }, + { + "epoch": 0.18505566657151012, + "grad_norm": 1.9354090690612793, + "learning_rate": 4.5893039876087305e-05, + "loss": 3.5759, + "step": 31116 + }, + { + "epoch": 0.18506161385479114, + "grad_norm": 1.5726033449172974, + "learning_rate": 4.589278336370854e-05, + "loss": 4.5015, + "step": 31117 + }, + { + "epoch": 0.18506756113807213, + "grad_norm": 1.7983962297439575, + "learning_rate": 4.5892526844036307e-05, + "loss": 4.7782, + "step": 31118 + }, + { + "epoch": 0.18507350842135312, + "grad_norm": 2.0265634059906006, + "learning_rate": 4.589227031707072e-05, + "loss": 3.7139, + "step": 31119 + }, + { + "epoch": 0.18507945570463413, + "grad_norm": 2.074643135070801, + "learning_rate": 4.589201378281186e-05, + "loss": 3.7737, + "step": 31120 + }, + { + "epoch": 0.18508540298791512, + "grad_norm": 1.91277277469635, + "learning_rate": 4.58917572412598e-05, + "loss": 4.5868, + "step": 31121 + }, + { + "epoch": 0.1850913502711961, + "grad_norm": 1.944737195968628, + "learning_rate": 4.589150069241466e-05, + "loss": 4.1658, + "step": 31122 + }, + { + "epoch": 0.18509729755447712, + "grad_norm": 2.1314923763275146, + "learning_rate": 4.58912441362765e-05, + "loss": 3.8389, + "step": 31123 + }, + { + "epoch": 0.1851032448377581, + "grad_norm": 1.9352933168411255, + "learning_rate": 4.589098757284543e-05, + "loss": 4.6027, + "step": 31124 + }, + { + "epoch": 0.1851091921210391, + "grad_norm": 1.8150495290756226, + "learning_rate": 4.589073100212153e-05, + "loss": 4.6796, + "step": 31125 + }, + { + "epoch": 0.18511513940432012, + "grad_norm": 1.7410165071487427, + "learning_rate": 4.589047442410489e-05, + "loss": 4.8486, + "step": 31126 + }, + { + "epoch": 0.1851210866876011, + "grad_norm": 2.197824716567993, + "learning_rate": 4.58902178387956e-05, + "loss": 4.6977, + "step": 31127 + }, + { + "epoch": 0.1851270339708821, + "grad_norm": 1.9299874305725098, + "learning_rate": 4.588996124619376e-05, + "loss": 4.4558, + "step": 31128 + }, + { + "epoch": 0.1851329812541631, + "grad_norm": 1.6607778072357178, + "learning_rate": 4.5889704646299433e-05, + "loss": 4.562, + "step": 31129 + }, + { + "epoch": 0.1851389285374441, + "grad_norm": 1.7494784593582153, + "learning_rate": 4.588944803911274e-05, + "loss": 4.9075, + "step": 31130 + }, + { + "epoch": 0.18514487582072509, + "grad_norm": 1.6292402744293213, + "learning_rate": 4.588919142463376e-05, + "loss": 5.0776, + "step": 31131 + }, + { + "epoch": 0.1851508231040061, + "grad_norm": 1.9825034141540527, + "learning_rate": 4.588893480286257e-05, + "loss": 4.3945, + "step": 31132 + }, + { + "epoch": 0.1851567703872871, + "grad_norm": 1.7921351194381714, + "learning_rate": 4.588867817379927e-05, + "loss": 4.8753, + "step": 31133 + }, + { + "epoch": 0.18516271767056808, + "grad_norm": 1.920423984527588, + "learning_rate": 4.588842153744395e-05, + "loss": 4.3311, + "step": 31134 + }, + { + "epoch": 0.1851686649538491, + "grad_norm": 2.3672003746032715, + "learning_rate": 4.5888164893796695e-05, + "loss": 4.1878, + "step": 31135 + }, + { + "epoch": 0.18517461223713008, + "grad_norm": 1.6721351146697998, + "learning_rate": 4.5887908242857594e-05, + "loss": 5.3044, + "step": 31136 + }, + { + "epoch": 0.18518055952041107, + "grad_norm": 2.2272095680236816, + "learning_rate": 4.5887651584626745e-05, + "loss": 4.1318, + "step": 31137 + }, + { + "epoch": 0.18518650680369209, + "grad_norm": 2.2360355854034424, + "learning_rate": 4.588739491910424e-05, + "loss": 4.0698, + "step": 31138 + }, + { + "epoch": 0.18519245408697307, + "grad_norm": 1.863351583480835, + "learning_rate": 4.588713824629015e-05, + "loss": 4.3417, + "step": 31139 + }, + { + "epoch": 0.18519840137025406, + "grad_norm": 1.5672686100006104, + "learning_rate": 4.588688156618458e-05, + "loss": 4.9516, + "step": 31140 + }, + { + "epoch": 0.18520434865353508, + "grad_norm": 1.7040348052978516, + "learning_rate": 4.5886624878787624e-05, + "loss": 4.8062, + "step": 31141 + }, + { + "epoch": 0.18521029593681607, + "grad_norm": 1.4039283990859985, + "learning_rate": 4.5886368184099346e-05, + "loss": 5.0631, + "step": 31142 + }, + { + "epoch": 0.18521624322009705, + "grad_norm": 1.474048376083374, + "learning_rate": 4.588611148211986e-05, + "loss": 4.9985, + "step": 31143 + }, + { + "epoch": 0.18522219050337807, + "grad_norm": 1.634386420249939, + "learning_rate": 4.5885854772849254e-05, + "loss": 4.9347, + "step": 31144 + }, + { + "epoch": 0.18522813778665906, + "grad_norm": 1.8768565654754639, + "learning_rate": 4.5885598056287604e-05, + "loss": 4.9722, + "step": 31145 + }, + { + "epoch": 0.18523408506994005, + "grad_norm": 2.405940532684326, + "learning_rate": 4.588534133243501e-05, + "loss": 4.2056, + "step": 31146 + }, + { + "epoch": 0.18524003235322106, + "grad_norm": 1.7994506359100342, + "learning_rate": 4.588508460129156e-05, + "loss": 4.8714, + "step": 31147 + }, + { + "epoch": 0.18524597963650205, + "grad_norm": 1.5454603433609009, + "learning_rate": 4.5884827862857344e-05, + "loss": 5.1527, + "step": 31148 + }, + { + "epoch": 0.18525192691978304, + "grad_norm": 1.4534333944320679, + "learning_rate": 4.5884571117132444e-05, + "loss": 5.0984, + "step": 31149 + }, + { + "epoch": 0.18525787420306403, + "grad_norm": 1.6229337453842163, + "learning_rate": 4.588431436411696e-05, + "loss": 4.3216, + "step": 31150 + }, + { + "epoch": 0.18526382148634504, + "grad_norm": 1.905275583267212, + "learning_rate": 4.588405760381098e-05, + "loss": 4.3979, + "step": 31151 + }, + { + "epoch": 0.18526976876962603, + "grad_norm": 1.85005521774292, + "learning_rate": 4.58838008362146e-05, + "loss": 5.1768, + "step": 31152 + }, + { + "epoch": 0.18527571605290702, + "grad_norm": 1.5412628650665283, + "learning_rate": 4.5883544061327885e-05, + "loss": 5.0367, + "step": 31153 + }, + { + "epoch": 0.18528166333618803, + "grad_norm": 1.4088354110717773, + "learning_rate": 4.588328727915094e-05, + "loss": 5.0446, + "step": 31154 + }, + { + "epoch": 0.18528761061946902, + "grad_norm": 1.4099864959716797, + "learning_rate": 4.5883030489683865e-05, + "loss": 4.8956, + "step": 31155 + }, + { + "epoch": 0.18529355790275, + "grad_norm": 1.5859589576721191, + "learning_rate": 4.588277369292674e-05, + "loss": 5.1765, + "step": 31156 + }, + { + "epoch": 0.18529950518603103, + "grad_norm": 1.9431182146072388, + "learning_rate": 4.588251688887965e-05, + "loss": 4.2741, + "step": 31157 + }, + { + "epoch": 0.18530545246931202, + "grad_norm": 2.457024335861206, + "learning_rate": 4.5882260077542685e-05, + "loss": 4.4239, + "step": 31158 + }, + { + "epoch": 0.185311399752593, + "grad_norm": 2.1999270915985107, + "learning_rate": 4.588200325891595e-05, + "loss": 4.8285, + "step": 31159 + }, + { + "epoch": 0.18531734703587402, + "grad_norm": 2.221158504486084, + "learning_rate": 4.588174643299952e-05, + "loss": 4.6192, + "step": 31160 + }, + { + "epoch": 0.185323294319155, + "grad_norm": 2.167083501815796, + "learning_rate": 4.5881489599793484e-05, + "loss": 4.6133, + "step": 31161 + }, + { + "epoch": 0.185329241602436, + "grad_norm": 2.050466775894165, + "learning_rate": 4.588123275929793e-05, + "loss": 4.3539, + "step": 31162 + }, + { + "epoch": 0.185335188885717, + "grad_norm": 1.8196213245391846, + "learning_rate": 4.588097591151296e-05, + "loss": 4.8754, + "step": 31163 + }, + { + "epoch": 0.185341136168998, + "grad_norm": 2.024564504623413, + "learning_rate": 4.588071905643866e-05, + "loss": 4.8754, + "step": 31164 + }, + { + "epoch": 0.185347083452279, + "grad_norm": 2.1911628246307373, + "learning_rate": 4.5880462194075114e-05, + "loss": 4.7296, + "step": 31165 + }, + { + "epoch": 0.18535303073556, + "grad_norm": 2.6719770431518555, + "learning_rate": 4.588020532442241e-05, + "loss": 4.6449, + "step": 31166 + }, + { + "epoch": 0.185358978018841, + "grad_norm": 1.9585731029510498, + "learning_rate": 4.587994844748065e-05, + "loss": 4.5527, + "step": 31167 + }, + { + "epoch": 0.18536492530212198, + "grad_norm": 1.9000869989395142, + "learning_rate": 4.587969156324991e-05, + "loss": 4.7364, + "step": 31168 + }, + { + "epoch": 0.185370872585403, + "grad_norm": 2.080929756164551, + "learning_rate": 4.587943467173029e-05, + "loss": 4.797, + "step": 31169 + }, + { + "epoch": 0.18537681986868398, + "grad_norm": 2.2646484375, + "learning_rate": 4.5879177772921864e-05, + "loss": 4.6755, + "step": 31170 + }, + { + "epoch": 0.18538276715196497, + "grad_norm": 2.2647855281829834, + "learning_rate": 4.5878920866824746e-05, + "loss": 4.7376, + "step": 31171 + }, + { + "epoch": 0.185388714435246, + "grad_norm": 2.094724655151367, + "learning_rate": 4.5878663953439005e-05, + "loss": 4.4832, + "step": 31172 + }, + { + "epoch": 0.18539466171852698, + "grad_norm": 2.17482852935791, + "learning_rate": 4.587840703276474e-05, + "loss": 4.5812, + "step": 31173 + }, + { + "epoch": 0.18540060900180796, + "grad_norm": 2.3196496963500977, + "learning_rate": 4.5878150104802045e-05, + "loss": 4.5377, + "step": 31174 + }, + { + "epoch": 0.18540655628508898, + "grad_norm": 1.9016317129135132, + "learning_rate": 4.5877893169550996e-05, + "loss": 4.4074, + "step": 31175 + }, + { + "epoch": 0.18541250356836997, + "grad_norm": 2.1237874031066895, + "learning_rate": 4.587763622701169e-05, + "loss": 4.6557, + "step": 31176 + }, + { + "epoch": 0.18541845085165096, + "grad_norm": 1.9775478839874268, + "learning_rate": 4.587737927718422e-05, + "loss": 4.7775, + "step": 31177 + }, + { + "epoch": 0.18542439813493197, + "grad_norm": 1.7758903503417969, + "learning_rate": 4.587712232006868e-05, + "loss": 5.1214, + "step": 31178 + }, + { + "epoch": 0.18543034541821296, + "grad_norm": 2.0964064598083496, + "learning_rate": 4.5876865355665135e-05, + "loss": 4.7776, + "step": 31179 + }, + { + "epoch": 0.18543629270149395, + "grad_norm": 2.201028347015381, + "learning_rate": 4.58766083839737e-05, + "loss": 4.4993, + "step": 31180 + }, + { + "epoch": 0.18544223998477496, + "grad_norm": 1.6263900995254517, + "learning_rate": 4.587635140499446e-05, + "loss": 4.8201, + "step": 31181 + }, + { + "epoch": 0.18544818726805595, + "grad_norm": 1.5977891683578491, + "learning_rate": 4.58760944187275e-05, + "loss": 4.8996, + "step": 31182 + }, + { + "epoch": 0.18545413455133694, + "grad_norm": 1.4332998991012573, + "learning_rate": 4.5875837425172904e-05, + "loss": 4.9172, + "step": 31183 + }, + { + "epoch": 0.18546008183461796, + "grad_norm": 1.5299646854400635, + "learning_rate": 4.5875580424330774e-05, + "loss": 4.7755, + "step": 31184 + }, + { + "epoch": 0.18546602911789895, + "grad_norm": 1.75115966796875, + "learning_rate": 4.58753234162012e-05, + "loss": 5.1949, + "step": 31185 + }, + { + "epoch": 0.18547197640117993, + "grad_norm": 1.7606922388076782, + "learning_rate": 4.587506640078426e-05, + "loss": 4.5256, + "step": 31186 + }, + { + "epoch": 0.18547792368446095, + "grad_norm": 1.8649322986602783, + "learning_rate": 4.5874809378080055e-05, + "loss": 4.4196, + "step": 31187 + }, + { + "epoch": 0.18548387096774194, + "grad_norm": 1.56643807888031, + "learning_rate": 4.587455234808867e-05, + "loss": 4.7597, + "step": 31188 + }, + { + "epoch": 0.18548981825102293, + "grad_norm": 1.4705426692962646, + "learning_rate": 4.587429531081019e-05, + "loss": 4.7919, + "step": 31189 + }, + { + "epoch": 0.18549576553430394, + "grad_norm": 1.472716212272644, + "learning_rate": 4.587403826624471e-05, + "loss": 4.89, + "step": 31190 + }, + { + "epoch": 0.18550171281758493, + "grad_norm": 1.6982768774032593, + "learning_rate": 4.5873781214392315e-05, + "loss": 5.0726, + "step": 31191 + }, + { + "epoch": 0.18550766010086592, + "grad_norm": 1.543344497680664, + "learning_rate": 4.58735241552531e-05, + "loss": 4.9335, + "step": 31192 + }, + { + "epoch": 0.18551360738414693, + "grad_norm": 1.5692951679229736, + "learning_rate": 4.587326708882716e-05, + "loss": 5.0962, + "step": 31193 + }, + { + "epoch": 0.18551955466742792, + "grad_norm": 1.42678701877594, + "learning_rate": 4.587301001511457e-05, + "loss": 4.9869, + "step": 31194 + }, + { + "epoch": 0.1855255019507089, + "grad_norm": 1.5326842069625854, + "learning_rate": 4.5872752934115437e-05, + "loss": 4.9169, + "step": 31195 + }, + { + "epoch": 0.18553144923398993, + "grad_norm": 1.6546173095703125, + "learning_rate": 4.587249584582983e-05, + "loss": 4.8579, + "step": 31196 + }, + { + "epoch": 0.1855373965172709, + "grad_norm": 1.5519356727600098, + "learning_rate": 4.587223875025786e-05, + "loss": 4.8597, + "step": 31197 + }, + { + "epoch": 0.1855433438005519, + "grad_norm": 1.6477192640304565, + "learning_rate": 4.5871981647399606e-05, + "loss": 4.9838, + "step": 31198 + }, + { + "epoch": 0.18554929108383292, + "grad_norm": 1.55699622631073, + "learning_rate": 4.587172453725516e-05, + "loss": 4.6201, + "step": 31199 + }, + { + "epoch": 0.1855552383671139, + "grad_norm": 1.8529999256134033, + "learning_rate": 4.58714674198246e-05, + "loss": 4.8909, + "step": 31200 + }, + { + "epoch": 0.1855611856503949, + "grad_norm": 1.6725835800170898, + "learning_rate": 4.587121029510804e-05, + "loss": 4.9577, + "step": 31201 + }, + { + "epoch": 0.1855671329336759, + "grad_norm": 1.6824015378952026, + "learning_rate": 4.5870953163105545e-05, + "loss": 5.1628, + "step": 31202 + }, + { + "epoch": 0.1855730802169569, + "grad_norm": 1.4448360204696655, + "learning_rate": 4.587069602381722e-05, + "loss": 5.0237, + "step": 31203 + }, + { + "epoch": 0.1855790275002379, + "grad_norm": 1.6337754726409912, + "learning_rate": 4.5870438877243154e-05, + "loss": 4.9626, + "step": 31204 + }, + { + "epoch": 0.1855849747835189, + "grad_norm": 1.80125093460083, + "learning_rate": 4.587018172338343e-05, + "loss": 5.3551, + "step": 31205 + }, + { + "epoch": 0.1855909220667999, + "grad_norm": 1.9253333806991577, + "learning_rate": 4.586992456223814e-05, + "loss": 5.2549, + "step": 31206 + }, + { + "epoch": 0.18559686935008088, + "grad_norm": 1.6633672714233398, + "learning_rate": 4.586966739380738e-05, + "loss": 4.8286, + "step": 31207 + }, + { + "epoch": 0.18560281663336187, + "grad_norm": 1.5835380554199219, + "learning_rate": 4.586941021809124e-05, + "loss": 5.2259, + "step": 31208 + }, + { + "epoch": 0.18560876391664288, + "grad_norm": 1.4492098093032837, + "learning_rate": 4.586915303508979e-05, + "loss": 4.9461, + "step": 31209 + }, + { + "epoch": 0.18561471119992387, + "grad_norm": 1.6649349927902222, + "learning_rate": 4.586889584480314e-05, + "loss": 5.0827, + "step": 31210 + }, + { + "epoch": 0.18562065848320486, + "grad_norm": 1.904850959777832, + "learning_rate": 4.5868638647231374e-05, + "loss": 5.1674, + "step": 31211 + }, + { + "epoch": 0.18562660576648587, + "grad_norm": 1.5173715353012085, + "learning_rate": 4.586838144237458e-05, + "loss": 5.1684, + "step": 31212 + }, + { + "epoch": 0.18563255304976686, + "grad_norm": 1.5624539852142334, + "learning_rate": 4.586812423023285e-05, + "loss": 5.1, + "step": 31213 + }, + { + "epoch": 0.18563850033304785, + "grad_norm": 1.4277743101119995, + "learning_rate": 4.5867867010806275e-05, + "loss": 5.1583, + "step": 31214 + }, + { + "epoch": 0.18564444761632887, + "grad_norm": 1.5859686136245728, + "learning_rate": 4.586760978409494e-05, + "loss": 5.1848, + "step": 31215 + }, + { + "epoch": 0.18565039489960986, + "grad_norm": 1.5678226947784424, + "learning_rate": 4.586735255009895e-05, + "loss": 4.919, + "step": 31216 + }, + { + "epoch": 0.18565634218289084, + "grad_norm": 1.8046095371246338, + "learning_rate": 4.586709530881837e-05, + "loss": 4.741, + "step": 31217 + }, + { + "epoch": 0.18566228946617186, + "grad_norm": 1.8384325504302979, + "learning_rate": 4.586683806025331e-05, + "loss": 4.6733, + "step": 31218 + }, + { + "epoch": 0.18566823674945285, + "grad_norm": 1.7575494050979614, + "learning_rate": 4.586658080440385e-05, + "loss": 4.5463, + "step": 31219 + }, + { + "epoch": 0.18567418403273384, + "grad_norm": 1.5560238361358643, + "learning_rate": 4.586632354127009e-05, + "loss": 4.8716, + "step": 31220 + }, + { + "epoch": 0.18568013131601485, + "grad_norm": 1.6690888404846191, + "learning_rate": 4.586606627085209e-05, + "loss": 5.1506, + "step": 31221 + }, + { + "epoch": 0.18568607859929584, + "grad_norm": 1.8201206922531128, + "learning_rate": 4.5865808993149985e-05, + "loss": 4.9691, + "step": 31222 + }, + { + "epoch": 0.18569202588257683, + "grad_norm": 1.4912810325622559, + "learning_rate": 4.586555170816383e-05, + "loss": 4.707, + "step": 31223 + }, + { + "epoch": 0.18569797316585784, + "grad_norm": 1.8407074213027954, + "learning_rate": 4.5865294415893726e-05, + "loss": 5.3221, + "step": 31224 + }, + { + "epoch": 0.18570392044913883, + "grad_norm": 1.699225664138794, + "learning_rate": 4.586503711633976e-05, + "loss": 4.8717, + "step": 31225 + }, + { + "epoch": 0.18570986773241982, + "grad_norm": 1.76210618019104, + "learning_rate": 4.586477980950203e-05, + "loss": 4.7315, + "step": 31226 + }, + { + "epoch": 0.18571581501570084, + "grad_norm": 1.6527961492538452, + "learning_rate": 4.586452249538063e-05, + "loss": 4.9227, + "step": 31227 + }, + { + "epoch": 0.18572176229898182, + "grad_norm": 1.576004981994629, + "learning_rate": 4.586426517397563e-05, + "loss": 4.8288, + "step": 31228 + }, + { + "epoch": 0.1857277095822628, + "grad_norm": 1.8966927528381348, + "learning_rate": 4.586400784528714e-05, + "loss": 4.6546, + "step": 31229 + }, + { + "epoch": 0.18573365686554383, + "grad_norm": 1.7888445854187012, + "learning_rate": 4.586375050931523e-05, + "loss": 5.3152, + "step": 31230 + }, + { + "epoch": 0.18573960414882482, + "grad_norm": 1.656417965888977, + "learning_rate": 4.586349316606e-05, + "loss": 4.8675, + "step": 31231 + }, + { + "epoch": 0.1857455514321058, + "grad_norm": 1.5206754207611084, + "learning_rate": 4.586323581552155e-05, + "loss": 4.8806, + "step": 31232 + }, + { + "epoch": 0.18575149871538682, + "grad_norm": 1.7659846544265747, + "learning_rate": 4.586297845769995e-05, + "loss": 4.7916, + "step": 31233 + }, + { + "epoch": 0.1857574459986678, + "grad_norm": 1.6599786281585693, + "learning_rate": 4.5862721092595305e-05, + "loss": 4.9745, + "step": 31234 + }, + { + "epoch": 0.1857633932819488, + "grad_norm": 1.5032174587249756, + "learning_rate": 4.5862463720207696e-05, + "loss": 5.0588, + "step": 31235 + }, + { + "epoch": 0.1857693405652298, + "grad_norm": 1.4480514526367188, + "learning_rate": 4.5862206340537215e-05, + "loss": 5.3229, + "step": 31236 + }, + { + "epoch": 0.1857752878485108, + "grad_norm": 2.4662117958068848, + "learning_rate": 4.5861948953583966e-05, + "loss": 4.3971, + "step": 31237 + }, + { + "epoch": 0.1857812351317918, + "grad_norm": 1.864040493965149, + "learning_rate": 4.5861691559348016e-05, + "loss": 4.5105, + "step": 31238 + }, + { + "epoch": 0.1857871824150728, + "grad_norm": 1.6042394638061523, + "learning_rate": 4.5861434157829466e-05, + "loss": 4.9541, + "step": 31239 + }, + { + "epoch": 0.1857931296983538, + "grad_norm": 1.6892380714416504, + "learning_rate": 4.58611767490284e-05, + "loss": 4.8085, + "step": 31240 + }, + { + "epoch": 0.18579907698163478, + "grad_norm": 1.625783920288086, + "learning_rate": 4.586091933294492e-05, + "loss": 4.5476, + "step": 31241 + }, + { + "epoch": 0.1858050242649158, + "grad_norm": 1.3952319622039795, + "learning_rate": 4.5860661909579106e-05, + "loss": 4.493, + "step": 31242 + }, + { + "epoch": 0.18581097154819678, + "grad_norm": 1.6816487312316895, + "learning_rate": 4.586040447893105e-05, + "loss": 4.562, + "step": 31243 + }, + { + "epoch": 0.18581691883147777, + "grad_norm": 1.593903660774231, + "learning_rate": 4.586014704100085e-05, + "loss": 4.4116, + "step": 31244 + }, + { + "epoch": 0.1858228661147588, + "grad_norm": 1.8696283102035522, + "learning_rate": 4.5859889595788577e-05, + "loss": 5.0991, + "step": 31245 + }, + { + "epoch": 0.18582881339803978, + "grad_norm": 1.7519524097442627, + "learning_rate": 4.585963214329434e-05, + "loss": 5.0826, + "step": 31246 + }, + { + "epoch": 0.18583476068132077, + "grad_norm": 1.9816068410873413, + "learning_rate": 4.5859374683518216e-05, + "loss": 5.0646, + "step": 31247 + }, + { + "epoch": 0.18584070796460178, + "grad_norm": 2.6658241748809814, + "learning_rate": 4.58591172164603e-05, + "loss": 4.0542, + "step": 31248 + }, + { + "epoch": 0.18584665524788277, + "grad_norm": 1.6509416103363037, + "learning_rate": 4.585885974212068e-05, + "loss": 5.1681, + "step": 31249 + }, + { + "epoch": 0.18585260253116376, + "grad_norm": 1.7301459312438965, + "learning_rate": 4.585860226049945e-05, + "loss": 4.6656, + "step": 31250 + }, + { + "epoch": 0.18585854981444477, + "grad_norm": 1.643879771232605, + "learning_rate": 4.58583447715967e-05, + "loss": 5.1228, + "step": 31251 + }, + { + "epoch": 0.18586449709772576, + "grad_norm": 1.7651242017745972, + "learning_rate": 4.585808727541252e-05, + "loss": 4.5896, + "step": 31252 + }, + { + "epoch": 0.18587044438100675, + "grad_norm": 1.7697621583938599, + "learning_rate": 4.585782977194699e-05, + "loss": 4.6722, + "step": 31253 + }, + { + "epoch": 0.18587639166428777, + "grad_norm": 1.751751184463501, + "learning_rate": 4.585757226120021e-05, + "loss": 5.1881, + "step": 31254 + }, + { + "epoch": 0.18588233894756875, + "grad_norm": 1.4857293367385864, + "learning_rate": 4.5857314743172265e-05, + "loss": 5.3738, + "step": 31255 + }, + { + "epoch": 0.18588828623084974, + "grad_norm": 1.6109179258346558, + "learning_rate": 4.5857057217863245e-05, + "loss": 5.0916, + "step": 31256 + }, + { + "epoch": 0.18589423351413076, + "grad_norm": 1.9718564748764038, + "learning_rate": 4.5856799685273244e-05, + "loss": 4.8674, + "step": 31257 + }, + { + "epoch": 0.18590018079741175, + "grad_norm": 2.1158852577209473, + "learning_rate": 4.585654214540235e-05, + "loss": 5.0665, + "step": 31258 + }, + { + "epoch": 0.18590612808069273, + "grad_norm": 1.5725470781326294, + "learning_rate": 4.5856284598250656e-05, + "loss": 5.3185, + "step": 31259 + }, + { + "epoch": 0.18591207536397375, + "grad_norm": 2.0950751304626465, + "learning_rate": 4.585602704381825e-05, + "loss": 4.3702, + "step": 31260 + }, + { + "epoch": 0.18591802264725474, + "grad_norm": 1.6896560192108154, + "learning_rate": 4.585576948210522e-05, + "loss": 4.3898, + "step": 31261 + }, + { + "epoch": 0.18592396993053573, + "grad_norm": 1.580668568611145, + "learning_rate": 4.5855511913111646e-05, + "loss": 4.9532, + "step": 31262 + }, + { + "epoch": 0.18592991721381674, + "grad_norm": 1.6650530099868774, + "learning_rate": 4.5855254336837636e-05, + "loss": 4.8532, + "step": 31263 + }, + { + "epoch": 0.18593586449709773, + "grad_norm": 2.025588274002075, + "learning_rate": 4.5854996753283276e-05, + "loss": 5.184, + "step": 31264 + }, + { + "epoch": 0.18594181178037872, + "grad_norm": 2.844881534576416, + "learning_rate": 4.585473916244865e-05, + "loss": 3.9266, + "step": 31265 + }, + { + "epoch": 0.1859477590636597, + "grad_norm": 2.53692626953125, + "learning_rate": 4.585448156433384e-05, + "loss": 4.2536, + "step": 31266 + }, + { + "epoch": 0.18595370634694072, + "grad_norm": 1.666998028755188, + "learning_rate": 4.585422395893896e-05, + "loss": 4.8051, + "step": 31267 + }, + { + "epoch": 0.1859596536302217, + "grad_norm": 1.4353892803192139, + "learning_rate": 4.585396634626408e-05, + "loss": 4.6797, + "step": 31268 + }, + { + "epoch": 0.1859656009135027, + "grad_norm": 1.8029720783233643, + "learning_rate": 4.5853708726309285e-05, + "loss": 4.3908, + "step": 31269 + }, + { + "epoch": 0.18597154819678371, + "grad_norm": 1.8102213144302368, + "learning_rate": 4.585345109907469e-05, + "loss": 4.6718, + "step": 31270 + }, + { + "epoch": 0.1859774954800647, + "grad_norm": 1.8388559818267822, + "learning_rate": 4.585319346456036e-05, + "loss": 4.8911, + "step": 31271 + }, + { + "epoch": 0.1859834427633457, + "grad_norm": 1.69950270652771, + "learning_rate": 4.585293582276641e-05, + "loss": 5.6486, + "step": 31272 + }, + { + "epoch": 0.1859893900466267, + "grad_norm": 1.5373098850250244, + "learning_rate": 4.585267817369291e-05, + "loss": 4.9994, + "step": 31273 + }, + { + "epoch": 0.1859953373299077, + "grad_norm": 1.6866233348846436, + "learning_rate": 4.5852420517339945e-05, + "loss": 4.9179, + "step": 31274 + }, + { + "epoch": 0.18600128461318868, + "grad_norm": 1.7572931051254272, + "learning_rate": 4.5852162853707625e-05, + "loss": 5.2829, + "step": 31275 + }, + { + "epoch": 0.1860072318964697, + "grad_norm": 1.6226259469985962, + "learning_rate": 4.5851905182796026e-05, + "loss": 5.0821, + "step": 31276 + }, + { + "epoch": 0.1860131791797507, + "grad_norm": 1.8315626382827759, + "learning_rate": 4.585164750460525e-05, + "loss": 4.9505, + "step": 31277 + }, + { + "epoch": 0.18601912646303168, + "grad_norm": 1.7805089950561523, + "learning_rate": 4.585138981913537e-05, + "loss": 5.2987, + "step": 31278 + }, + { + "epoch": 0.1860250737463127, + "grad_norm": 1.6392905712127686, + "learning_rate": 4.58511321263865e-05, + "loss": 4.5575, + "step": 31279 + }, + { + "epoch": 0.18603102102959368, + "grad_norm": 2.2516140937805176, + "learning_rate": 4.5850874426358703e-05, + "loss": 4.4565, + "step": 31280 + }, + { + "epoch": 0.18603696831287467, + "grad_norm": 2.3298892974853516, + "learning_rate": 4.585061671905208e-05, + "loss": 4.0626, + "step": 31281 + }, + { + "epoch": 0.18604291559615568, + "grad_norm": 1.3898862600326538, + "learning_rate": 4.585035900446673e-05, + "loss": 5.026, + "step": 31282 + }, + { + "epoch": 0.18604886287943667, + "grad_norm": 1.517095923423767, + "learning_rate": 4.5850101282602723e-05, + "loss": 5.0808, + "step": 31283 + }, + { + "epoch": 0.18605481016271766, + "grad_norm": 1.505425214767456, + "learning_rate": 4.584984355346017e-05, + "loss": 5.0229, + "step": 31284 + }, + { + "epoch": 0.18606075744599868, + "grad_norm": 2.051816701889038, + "learning_rate": 4.584958581703915e-05, + "loss": 4.5713, + "step": 31285 + }, + { + "epoch": 0.18606670472927966, + "grad_norm": 1.7285747528076172, + "learning_rate": 4.5849328073339756e-05, + "loss": 5.0913, + "step": 31286 + }, + { + "epoch": 0.18607265201256065, + "grad_norm": 1.5341402292251587, + "learning_rate": 4.584907032236208e-05, + "loss": 5.0847, + "step": 31287 + }, + { + "epoch": 0.18607859929584167, + "grad_norm": 1.6782925128936768, + "learning_rate": 4.58488125641062e-05, + "loss": 5.1384, + "step": 31288 + }, + { + "epoch": 0.18608454657912266, + "grad_norm": 1.3116849660873413, + "learning_rate": 4.584855479857222e-05, + "loss": 5.1293, + "step": 31289 + }, + { + "epoch": 0.18609049386240364, + "grad_norm": 1.7799185514450073, + "learning_rate": 4.584829702576022e-05, + "loss": 4.9044, + "step": 31290 + }, + { + "epoch": 0.18609644114568466, + "grad_norm": 2.572935104370117, + "learning_rate": 4.5848039245670304e-05, + "loss": 4.1723, + "step": 31291 + }, + { + "epoch": 0.18610238842896565, + "grad_norm": 1.9144190549850464, + "learning_rate": 4.584778145830255e-05, + "loss": 4.5369, + "step": 31292 + }, + { + "epoch": 0.18610833571224664, + "grad_norm": 2.1058690547943115, + "learning_rate": 4.584752366365706e-05, + "loss": 4.2558, + "step": 31293 + }, + { + "epoch": 0.18611428299552765, + "grad_norm": 2.1572718620300293, + "learning_rate": 4.58472658617339e-05, + "loss": 3.7545, + "step": 31294 + }, + { + "epoch": 0.18612023027880864, + "grad_norm": 2.5771355628967285, + "learning_rate": 4.584700805253317e-05, + "loss": 4.04, + "step": 31295 + }, + { + "epoch": 0.18612617756208963, + "grad_norm": 1.5100488662719727, + "learning_rate": 4.584675023605498e-05, + "loss": 4.5827, + "step": 31296 + }, + { + "epoch": 0.18613212484537064, + "grad_norm": 2.33481764793396, + "learning_rate": 4.58464924122994e-05, + "loss": 4.2408, + "step": 31297 + }, + { + "epoch": 0.18613807212865163, + "grad_norm": 2.062885284423828, + "learning_rate": 4.584623458126652e-05, + "loss": 4.196, + "step": 31298 + }, + { + "epoch": 0.18614401941193262, + "grad_norm": 2.3652517795562744, + "learning_rate": 4.584597674295644e-05, + "loss": 3.8771, + "step": 31299 + }, + { + "epoch": 0.18614996669521364, + "grad_norm": 2.5485894680023193, + "learning_rate": 4.584571889736925e-05, + "loss": 4.203, + "step": 31300 + }, + { + "epoch": 0.18615591397849462, + "grad_norm": 2.2650632858276367, + "learning_rate": 4.584546104450502e-05, + "loss": 4.1996, + "step": 31301 + }, + { + "epoch": 0.1861618612617756, + "grad_norm": 2.359175205230713, + "learning_rate": 4.584520318436387e-05, + "loss": 4.029, + "step": 31302 + }, + { + "epoch": 0.18616780854505663, + "grad_norm": 1.8140226602554321, + "learning_rate": 4.5844945316945867e-05, + "loss": 5.2865, + "step": 31303 + }, + { + "epoch": 0.18617375582833762, + "grad_norm": 1.4269286394119263, + "learning_rate": 4.584468744225111e-05, + "loss": 5.4931, + "step": 31304 + }, + { + "epoch": 0.1861797031116186, + "grad_norm": 1.6179120540618896, + "learning_rate": 4.584442956027969e-05, + "loss": 5.9247, + "step": 31305 + }, + { + "epoch": 0.18618565039489962, + "grad_norm": 1.5601685047149658, + "learning_rate": 4.5844171671031696e-05, + "loss": 5.6042, + "step": 31306 + }, + { + "epoch": 0.1861915976781806, + "grad_norm": 1.9319846630096436, + "learning_rate": 4.584391377450721e-05, + "loss": 4.4306, + "step": 31307 + }, + { + "epoch": 0.1861975449614616, + "grad_norm": 1.6437833309173584, + "learning_rate": 4.584365587070634e-05, + "loss": 4.9218, + "step": 31308 + }, + { + "epoch": 0.1862034922447426, + "grad_norm": 1.5281999111175537, + "learning_rate": 4.584339795962915e-05, + "loss": 4.7631, + "step": 31309 + }, + { + "epoch": 0.1862094395280236, + "grad_norm": 1.864122986793518, + "learning_rate": 4.584314004127576e-05, + "loss": 4.5204, + "step": 31310 + }, + { + "epoch": 0.1862153868113046, + "grad_norm": 1.773083209991455, + "learning_rate": 4.5842882115646234e-05, + "loss": 4.8342, + "step": 31311 + }, + { + "epoch": 0.1862213340945856, + "grad_norm": 1.4922150373458862, + "learning_rate": 4.5842624182740676e-05, + "loss": 5.2839, + "step": 31312 + }, + { + "epoch": 0.1862272813778666, + "grad_norm": 2.119924545288086, + "learning_rate": 4.584236624255918e-05, + "loss": 4.1028, + "step": 31313 + }, + { + "epoch": 0.18623322866114758, + "grad_norm": 2.1595730781555176, + "learning_rate": 4.584210829510183e-05, + "loss": 3.5932, + "step": 31314 + }, + { + "epoch": 0.1862391759444286, + "grad_norm": 2.704542875289917, + "learning_rate": 4.5841850340368706e-05, + "loss": 3.715, + "step": 31315 + }, + { + "epoch": 0.18624512322770959, + "grad_norm": 2.3942182064056396, + "learning_rate": 4.584159237835991e-05, + "loss": 3.5308, + "step": 31316 + }, + { + "epoch": 0.18625107051099057, + "grad_norm": 2.023380994796753, + "learning_rate": 4.584133440907553e-05, + "loss": 3.3817, + "step": 31317 + }, + { + "epoch": 0.1862570177942716, + "grad_norm": 2.1290016174316406, + "learning_rate": 4.5841076432515655e-05, + "loss": 3.6297, + "step": 31318 + }, + { + "epoch": 0.18626296507755258, + "grad_norm": 2.054892063140869, + "learning_rate": 4.584081844868038e-05, + "loss": 5.0177, + "step": 31319 + }, + { + "epoch": 0.18626891236083357, + "grad_norm": 2.030423879623413, + "learning_rate": 4.584056045756979e-05, + "loss": 4.1709, + "step": 31320 + }, + { + "epoch": 0.18627485964411458, + "grad_norm": 2.3559436798095703, + "learning_rate": 4.584030245918397e-05, + "loss": 4.2042, + "step": 31321 + }, + { + "epoch": 0.18628080692739557, + "grad_norm": 2.2861175537109375, + "learning_rate": 4.584004445352302e-05, + "loss": 5.2536, + "step": 31322 + }, + { + "epoch": 0.18628675421067656, + "grad_norm": 2.001182794570923, + "learning_rate": 4.5839786440587016e-05, + "loss": 5.2153, + "step": 31323 + }, + { + "epoch": 0.18629270149395755, + "grad_norm": 2.0066721439361572, + "learning_rate": 4.583952842037608e-05, + "loss": 5.3088, + "step": 31324 + }, + { + "epoch": 0.18629864877723856, + "grad_norm": 2.068047285079956, + "learning_rate": 4.583927039289026e-05, + "loss": 4.7806, + "step": 31325 + }, + { + "epoch": 0.18630459606051955, + "grad_norm": 2.7590277194976807, + "learning_rate": 4.5839012358129676e-05, + "loss": 2.992, + "step": 31326 + }, + { + "epoch": 0.18631054334380054, + "grad_norm": 2.9526596069335938, + "learning_rate": 4.58387543160944e-05, + "loss": 3.2839, + "step": 31327 + }, + { + "epoch": 0.18631649062708155, + "grad_norm": 1.7671618461608887, + "learning_rate": 4.5838496266784534e-05, + "loss": 4.8593, + "step": 31328 + }, + { + "epoch": 0.18632243791036254, + "grad_norm": 1.5757657289505005, + "learning_rate": 4.5838238210200165e-05, + "loss": 5.1869, + "step": 31329 + }, + { + "epoch": 0.18632838519364353, + "grad_norm": 1.4810543060302734, + "learning_rate": 4.5837980146341383e-05, + "loss": 4.7026, + "step": 31330 + }, + { + "epoch": 0.18633433247692455, + "grad_norm": 1.845725655555725, + "learning_rate": 4.5837722075208276e-05, + "loss": 4.3001, + "step": 31331 + }, + { + "epoch": 0.18634027976020553, + "grad_norm": 1.7754443883895874, + "learning_rate": 4.583746399680093e-05, + "loss": 5.4637, + "step": 31332 + }, + { + "epoch": 0.18634622704348652, + "grad_norm": 1.6854273080825806, + "learning_rate": 4.583720591111945e-05, + "loss": 5.3277, + "step": 31333 + }, + { + "epoch": 0.18635217432676754, + "grad_norm": 1.672729253768921, + "learning_rate": 4.583694781816392e-05, + "loss": 5.3735, + "step": 31334 + }, + { + "epoch": 0.18635812161004853, + "grad_norm": 1.7456068992614746, + "learning_rate": 4.583668971793442e-05, + "loss": 5.2757, + "step": 31335 + }, + { + "epoch": 0.18636406889332952, + "grad_norm": 2.863888740539551, + "learning_rate": 4.583643161043104e-05, + "loss": 3.96, + "step": 31336 + }, + { + "epoch": 0.18637001617661053, + "grad_norm": 2.82841157913208, + "learning_rate": 4.583617349565389e-05, + "loss": 3.8763, + "step": 31337 + }, + { + "epoch": 0.18637596345989152, + "grad_norm": 2.5859992504119873, + "learning_rate": 4.583591537360304e-05, + "loss": 3.6348, + "step": 31338 + }, + { + "epoch": 0.1863819107431725, + "grad_norm": 2.1093318462371826, + "learning_rate": 4.5835657244278584e-05, + "loss": 4.5981, + "step": 31339 + }, + { + "epoch": 0.18638785802645352, + "grad_norm": 1.672054409980774, + "learning_rate": 4.5835399107680624e-05, + "loss": 5.0322, + "step": 31340 + }, + { + "epoch": 0.1863938053097345, + "grad_norm": 1.9632993936538696, + "learning_rate": 4.583514096380924e-05, + "loss": 5.2158, + "step": 31341 + }, + { + "epoch": 0.1863997525930155, + "grad_norm": 1.7678093910217285, + "learning_rate": 4.583488281266451e-05, + "loss": 5.1891, + "step": 31342 + }, + { + "epoch": 0.18640569987629652, + "grad_norm": 1.7519903182983398, + "learning_rate": 4.583462465424656e-05, + "loss": 4.935, + "step": 31343 + }, + { + "epoch": 0.1864116471595775, + "grad_norm": 2.023782968521118, + "learning_rate": 4.5834366488555434e-05, + "loss": 5.2837, + "step": 31344 + }, + { + "epoch": 0.1864175944428585, + "grad_norm": 1.8081834316253662, + "learning_rate": 4.583410831559126e-05, + "loss": 4.9925, + "step": 31345 + }, + { + "epoch": 0.1864235417261395, + "grad_norm": 1.6684492826461792, + "learning_rate": 4.5833850135354115e-05, + "loss": 4.4966, + "step": 31346 + }, + { + "epoch": 0.1864294890094205, + "grad_norm": 2.077711582183838, + "learning_rate": 4.583359194784409e-05, + "loss": 3.9967, + "step": 31347 + }, + { + "epoch": 0.18643543629270148, + "grad_norm": 1.727041482925415, + "learning_rate": 4.5833333753061266e-05, + "loss": 5.0418, + "step": 31348 + }, + { + "epoch": 0.1864413835759825, + "grad_norm": 1.7768146991729736, + "learning_rate": 4.5833075551005745e-05, + "loss": 5.151, + "step": 31349 + }, + { + "epoch": 0.1864473308592635, + "grad_norm": 1.6089451313018799, + "learning_rate": 4.5832817341677606e-05, + "loss": 5.4846, + "step": 31350 + }, + { + "epoch": 0.18645327814254448, + "grad_norm": 1.5748451948165894, + "learning_rate": 4.583255912507695e-05, + "loss": 5.2612, + "step": 31351 + }, + { + "epoch": 0.1864592254258255, + "grad_norm": 1.5321335792541504, + "learning_rate": 4.583230090120386e-05, + "loss": 5.3703, + "step": 31352 + }, + { + "epoch": 0.18646517270910648, + "grad_norm": 1.5108387470245361, + "learning_rate": 4.5832042670058436e-05, + "loss": 5.7321, + "step": 31353 + }, + { + "epoch": 0.18647111999238747, + "grad_norm": 1.5854402780532837, + "learning_rate": 4.583178443164075e-05, + "loss": 4.8038, + "step": 31354 + }, + { + "epoch": 0.18647706727566848, + "grad_norm": 1.736132025718689, + "learning_rate": 4.583152618595092e-05, + "loss": 4.9117, + "step": 31355 + }, + { + "epoch": 0.18648301455894947, + "grad_norm": 1.6473597288131714, + "learning_rate": 4.583126793298901e-05, + "loss": 5.1955, + "step": 31356 + }, + { + "epoch": 0.18648896184223046, + "grad_norm": 1.777772307395935, + "learning_rate": 4.583100967275512e-05, + "loss": 4.8981, + "step": 31357 + }, + { + "epoch": 0.18649490912551148, + "grad_norm": 1.8098564147949219, + "learning_rate": 4.583075140524934e-05, + "loss": 4.7925, + "step": 31358 + }, + { + "epoch": 0.18650085640879246, + "grad_norm": 1.8321475982666016, + "learning_rate": 4.583049313047175e-05, + "loss": 5.1179, + "step": 31359 + }, + { + "epoch": 0.18650680369207345, + "grad_norm": 1.6555041074752808, + "learning_rate": 4.583023484842246e-05, + "loss": 5.3435, + "step": 31360 + }, + { + "epoch": 0.18651275097535447, + "grad_norm": 1.7198667526245117, + "learning_rate": 4.5829976559101553e-05, + "loss": 4.7813, + "step": 31361 + }, + { + "epoch": 0.18651869825863546, + "grad_norm": 2.30778169631958, + "learning_rate": 4.582971826250911e-05, + "loss": 3.006, + "step": 31362 + }, + { + "epoch": 0.18652464554191645, + "grad_norm": 2.258409261703491, + "learning_rate": 4.582945995864523e-05, + "loss": 2.8892, + "step": 31363 + }, + { + "epoch": 0.18653059282519746, + "grad_norm": 2.3513214588165283, + "learning_rate": 4.582920164751e-05, + "loss": 2.8185, + "step": 31364 + }, + { + "epoch": 0.18653654010847845, + "grad_norm": 2.1013023853302, + "learning_rate": 4.5828943329103513e-05, + "loss": 3.0731, + "step": 31365 + }, + { + "epoch": 0.18654248739175944, + "grad_norm": 1.7189773321151733, + "learning_rate": 4.582868500342586e-05, + "loss": 5.1003, + "step": 31366 + }, + { + "epoch": 0.18654843467504045, + "grad_norm": 2.006357192993164, + "learning_rate": 4.582842667047712e-05, + "loss": 5.4696, + "step": 31367 + }, + { + "epoch": 0.18655438195832144, + "grad_norm": 1.5163938999176025, + "learning_rate": 4.5828168330257396e-05, + "loss": 5.427, + "step": 31368 + }, + { + "epoch": 0.18656032924160243, + "grad_norm": 1.5231959819793701, + "learning_rate": 4.582790998276678e-05, + "loss": 5.3535, + "step": 31369 + }, + { + "epoch": 0.18656627652488345, + "grad_norm": 1.6396427154541016, + "learning_rate": 4.582765162800534e-05, + "loss": 5.0343, + "step": 31370 + }, + { + "epoch": 0.18657222380816443, + "grad_norm": 1.7178908586502075, + "learning_rate": 4.582739326597319e-05, + "loss": 4.675, + "step": 31371 + }, + { + "epoch": 0.18657817109144542, + "grad_norm": 2.420158863067627, + "learning_rate": 4.582713489667042e-05, + "loss": 4.0231, + "step": 31372 + }, + { + "epoch": 0.18658411837472644, + "grad_norm": 1.9106335639953613, + "learning_rate": 4.582687652009711e-05, + "loss": 4.6815, + "step": 31373 + }, + { + "epoch": 0.18659006565800743, + "grad_norm": 1.3372851610183716, + "learning_rate": 4.582661813625334e-05, + "loss": 5.1749, + "step": 31374 + }, + { + "epoch": 0.1865960129412884, + "grad_norm": 1.7023464441299438, + "learning_rate": 4.582635974513923e-05, + "loss": 4.9542, + "step": 31375 + }, + { + "epoch": 0.18660196022456943, + "grad_norm": 2.3644206523895264, + "learning_rate": 4.582610134675483e-05, + "loss": 4.3933, + "step": 31376 + }, + { + "epoch": 0.18660790750785042, + "grad_norm": 1.7321727275848389, + "learning_rate": 4.582584294110027e-05, + "loss": 4.6583, + "step": 31377 + }, + { + "epoch": 0.1866138547911314, + "grad_norm": 1.7842439413070679, + "learning_rate": 4.582558452817563e-05, + "loss": 4.5918, + "step": 31378 + }, + { + "epoch": 0.18661980207441242, + "grad_norm": 1.9122416973114014, + "learning_rate": 4.582532610798098e-05, + "loss": 4.187, + "step": 31379 + }, + { + "epoch": 0.1866257493576934, + "grad_norm": 1.8635472059249878, + "learning_rate": 4.5825067680516427e-05, + "loss": 4.4158, + "step": 31380 + }, + { + "epoch": 0.1866316966409744, + "grad_norm": 1.8706049919128418, + "learning_rate": 4.5824809245782066e-05, + "loss": 4.037, + "step": 31381 + }, + { + "epoch": 0.1866376439242554, + "grad_norm": 1.762373447418213, + "learning_rate": 4.582455080377797e-05, + "loss": 3.9966, + "step": 31382 + }, + { + "epoch": 0.1866435912075364, + "grad_norm": 1.6706191301345825, + "learning_rate": 4.582429235450424e-05, + "loss": 4.2182, + "step": 31383 + }, + { + "epoch": 0.1866495384908174, + "grad_norm": 1.55520498752594, + "learning_rate": 4.582403389796096e-05, + "loss": 5.1072, + "step": 31384 + }, + { + "epoch": 0.18665548577409838, + "grad_norm": 1.5530856847763062, + "learning_rate": 4.582377543414823e-05, + "loss": 5.0972, + "step": 31385 + }, + { + "epoch": 0.1866614330573794, + "grad_norm": 1.747187614440918, + "learning_rate": 4.582351696306614e-05, + "loss": 4.9334, + "step": 31386 + }, + { + "epoch": 0.18666738034066038, + "grad_norm": 1.696406602859497, + "learning_rate": 4.582325848471477e-05, + "loss": 5.6964, + "step": 31387 + }, + { + "epoch": 0.18667332762394137, + "grad_norm": 1.426660418510437, + "learning_rate": 4.5822999999094215e-05, + "loss": 5.3495, + "step": 31388 + }, + { + "epoch": 0.1866792749072224, + "grad_norm": 1.656969428062439, + "learning_rate": 4.582274150620457e-05, + "loss": 4.5178, + "step": 31389 + }, + { + "epoch": 0.18668522219050337, + "grad_norm": 1.558522343635559, + "learning_rate": 4.5822483006045915e-05, + "loss": 4.5923, + "step": 31390 + }, + { + "epoch": 0.18669116947378436, + "grad_norm": 1.76998770236969, + "learning_rate": 4.582222449861835e-05, + "loss": 4.517, + "step": 31391 + }, + { + "epoch": 0.18669711675706538, + "grad_norm": 1.4918303489685059, + "learning_rate": 4.582196598392196e-05, + "loss": 5.4223, + "step": 31392 + }, + { + "epoch": 0.18670306404034637, + "grad_norm": 1.9973161220550537, + "learning_rate": 4.5821707461956836e-05, + "loss": 5.229, + "step": 31393 + }, + { + "epoch": 0.18670901132362736, + "grad_norm": 1.789795994758606, + "learning_rate": 4.582144893272307e-05, + "loss": 5.6042, + "step": 31394 + }, + { + "epoch": 0.18671495860690837, + "grad_norm": 1.5900517702102661, + "learning_rate": 4.5821190396220756e-05, + "loss": 4.8256, + "step": 31395 + }, + { + "epoch": 0.18672090589018936, + "grad_norm": 1.594332218170166, + "learning_rate": 4.582093185244997e-05, + "loss": 5.0181, + "step": 31396 + }, + { + "epoch": 0.18672685317347035, + "grad_norm": 1.881818413734436, + "learning_rate": 4.582067330141082e-05, + "loss": 5.3832, + "step": 31397 + }, + { + "epoch": 0.18673280045675136, + "grad_norm": 2.042795419692993, + "learning_rate": 4.582041474310339e-05, + "loss": 5.0048, + "step": 31398 + }, + { + "epoch": 0.18673874774003235, + "grad_norm": 1.8554868698120117, + "learning_rate": 4.5820156177527764e-05, + "loss": 4.7971, + "step": 31399 + }, + { + "epoch": 0.18674469502331334, + "grad_norm": 1.6183528900146484, + "learning_rate": 4.581989760468404e-05, + "loss": 4.9781, + "step": 31400 + }, + { + "epoch": 0.18675064230659436, + "grad_norm": 2.160238265991211, + "learning_rate": 4.5819639024572295e-05, + "loss": 5.0855, + "step": 31401 + }, + { + "epoch": 0.18675658958987534, + "grad_norm": 2.1129162311553955, + "learning_rate": 4.5819380437192636e-05, + "loss": 5.111, + "step": 31402 + }, + { + "epoch": 0.18676253687315633, + "grad_norm": 1.541813850402832, + "learning_rate": 4.5819121842545144e-05, + "loss": 5.1907, + "step": 31403 + }, + { + "epoch": 0.18676848415643735, + "grad_norm": 1.655600905418396, + "learning_rate": 4.581886324062992e-05, + "loss": 5.6293, + "step": 31404 + }, + { + "epoch": 0.18677443143971834, + "grad_norm": 1.5326381921768188, + "learning_rate": 4.581860463144703e-05, + "loss": 4.9882, + "step": 31405 + }, + { + "epoch": 0.18678037872299932, + "grad_norm": 2.5064444541931152, + "learning_rate": 4.58183460149966e-05, + "loss": 4.3907, + "step": 31406 + }, + { + "epoch": 0.18678632600628034, + "grad_norm": 2.4211840629577637, + "learning_rate": 4.581808739127868e-05, + "loss": 4.6788, + "step": 31407 + }, + { + "epoch": 0.18679227328956133, + "grad_norm": 1.835132122039795, + "learning_rate": 4.581782876029339e-05, + "loss": 4.4737, + "step": 31408 + }, + { + "epoch": 0.18679822057284232, + "grad_norm": 1.724884033203125, + "learning_rate": 4.581757012204082e-05, + "loss": 4.2805, + "step": 31409 + }, + { + "epoch": 0.18680416785612333, + "grad_norm": 1.43998384475708, + "learning_rate": 4.581731147652104e-05, + "loss": 4.9872, + "step": 31410 + }, + { + "epoch": 0.18681011513940432, + "grad_norm": 1.7539047002792358, + "learning_rate": 4.5817052823734155e-05, + "loss": 5.1531, + "step": 31411 + }, + { + "epoch": 0.1868160624226853, + "grad_norm": 1.7996374368667603, + "learning_rate": 4.5816794163680255e-05, + "loss": 4.5348, + "step": 31412 + }, + { + "epoch": 0.18682200970596632, + "grad_norm": 1.9007580280303955, + "learning_rate": 4.5816535496359416e-05, + "loss": 4.5503, + "step": 31413 + }, + { + "epoch": 0.1868279569892473, + "grad_norm": 2.9723873138427734, + "learning_rate": 4.581627682177175e-05, + "loss": 3.6093, + "step": 31414 + }, + { + "epoch": 0.1868339042725283, + "grad_norm": 1.840366244316101, + "learning_rate": 4.581601813991734e-05, + "loss": 4.5359, + "step": 31415 + }, + { + "epoch": 0.18683985155580932, + "grad_norm": 1.7800344228744507, + "learning_rate": 4.5815759450796265e-05, + "loss": 4.7916, + "step": 31416 + }, + { + "epoch": 0.1868457988390903, + "grad_norm": 2.508409261703491, + "learning_rate": 4.581550075440862e-05, + "loss": 3.9651, + "step": 31417 + }, + { + "epoch": 0.1868517461223713, + "grad_norm": 1.4773229360580444, + "learning_rate": 4.581524205075451e-05, + "loss": 5.1962, + "step": 31418 + }, + { + "epoch": 0.1868576934056523, + "grad_norm": 1.7282037734985352, + "learning_rate": 4.5814983339834004e-05, + "loss": 5.0627, + "step": 31419 + }, + { + "epoch": 0.1868636406889333, + "grad_norm": 1.5566262006759644, + "learning_rate": 4.581472462164721e-05, + "loss": 5.0318, + "step": 31420 + }, + { + "epoch": 0.18686958797221428, + "grad_norm": 1.586804986000061, + "learning_rate": 4.581446589619421e-05, + "loss": 5.3587, + "step": 31421 + }, + { + "epoch": 0.1868755352554953, + "grad_norm": 1.626639723777771, + "learning_rate": 4.5814207163475094e-05, + "loss": 5.1839, + "step": 31422 + }, + { + "epoch": 0.1868814825387763, + "grad_norm": 1.9931199550628662, + "learning_rate": 4.581394842348995e-05, + "loss": 4.5328, + "step": 31423 + }, + { + "epoch": 0.18688742982205728, + "grad_norm": 1.5360701084136963, + "learning_rate": 4.581368967623887e-05, + "loss": 6.0491, + "step": 31424 + }, + { + "epoch": 0.1868933771053383, + "grad_norm": 1.7270042896270752, + "learning_rate": 4.5813430921721954e-05, + "loss": 5.4057, + "step": 31425 + }, + { + "epoch": 0.18689932438861928, + "grad_norm": 1.620786190032959, + "learning_rate": 4.5813172159939276e-05, + "loss": 5.1965, + "step": 31426 + }, + { + "epoch": 0.18690527167190027, + "grad_norm": 1.6832870244979858, + "learning_rate": 4.5812913390890945e-05, + "loss": 5.1923, + "step": 31427 + }, + { + "epoch": 0.18691121895518129, + "grad_norm": 1.7056113481521606, + "learning_rate": 4.581265461457703e-05, + "loss": 5.0523, + "step": 31428 + }, + { + "epoch": 0.18691716623846227, + "grad_norm": 1.7429434061050415, + "learning_rate": 4.581239583099763e-05, + "loss": 5.1345, + "step": 31429 + }, + { + "epoch": 0.18692311352174326, + "grad_norm": 1.6870777606964111, + "learning_rate": 4.5812137040152854e-05, + "loss": 5.3135, + "step": 31430 + }, + { + "epoch": 0.18692906080502428, + "grad_norm": 1.7804944515228271, + "learning_rate": 4.581187824204277e-05, + "loss": 5.3752, + "step": 31431 + }, + { + "epoch": 0.18693500808830527, + "grad_norm": 1.5267258882522583, + "learning_rate": 4.5811619436667465e-05, + "loss": 5.5806, + "step": 31432 + }, + { + "epoch": 0.18694095537158625, + "grad_norm": 1.6377745866775513, + "learning_rate": 4.5811360624027045e-05, + "loss": 5.3912, + "step": 31433 + }, + { + "epoch": 0.18694690265486727, + "grad_norm": 1.8628687858581543, + "learning_rate": 4.581110180412159e-05, + "loss": 4.087, + "step": 31434 + }, + { + "epoch": 0.18695284993814826, + "grad_norm": 1.439253568649292, + "learning_rate": 4.58108429769512e-05, + "loss": 5.171, + "step": 31435 + }, + { + "epoch": 0.18695879722142925, + "grad_norm": 1.7017579078674316, + "learning_rate": 4.581058414251596e-05, + "loss": 4.8104, + "step": 31436 + }, + { + "epoch": 0.18696474450471026, + "grad_norm": 1.866621971130371, + "learning_rate": 4.581032530081596e-05, + "loss": 4.979, + "step": 31437 + }, + { + "epoch": 0.18697069178799125, + "grad_norm": 1.5694007873535156, + "learning_rate": 4.581006645185129e-05, + "loss": 5.031, + "step": 31438 + }, + { + "epoch": 0.18697663907127224, + "grad_norm": 1.5056393146514893, + "learning_rate": 4.580980759562203e-05, + "loss": 5.082, + "step": 31439 + }, + { + "epoch": 0.18698258635455323, + "grad_norm": 1.5853091478347778, + "learning_rate": 4.580954873212829e-05, + "loss": 5.0652, + "step": 31440 + }, + { + "epoch": 0.18698853363783424, + "grad_norm": 1.423098087310791, + "learning_rate": 4.580928986137015e-05, + "loss": 5.2198, + "step": 31441 + }, + { + "epoch": 0.18699448092111523, + "grad_norm": 1.8297144174575806, + "learning_rate": 4.580903098334771e-05, + "loss": 4.8045, + "step": 31442 + }, + { + "epoch": 0.18700042820439622, + "grad_norm": 1.4703069925308228, + "learning_rate": 4.580877209806105e-05, + "loss": 4.9772, + "step": 31443 + }, + { + "epoch": 0.18700637548767723, + "grad_norm": 1.6311166286468506, + "learning_rate": 4.580851320551025e-05, + "loss": 5.0265, + "step": 31444 + }, + { + "epoch": 0.18701232277095822, + "grad_norm": 1.5908745527267456, + "learning_rate": 4.5808254305695425e-05, + "loss": 5.6455, + "step": 31445 + }, + { + "epoch": 0.1870182700542392, + "grad_norm": 1.6188886165618896, + "learning_rate": 4.580799539861665e-05, + "loss": 4.9907, + "step": 31446 + }, + { + "epoch": 0.18702421733752023, + "grad_norm": 1.6662514209747314, + "learning_rate": 4.580773648427402e-05, + "loss": 4.599, + "step": 31447 + }, + { + "epoch": 0.18703016462080121, + "grad_norm": 1.7355191707611084, + "learning_rate": 4.5807477562667624e-05, + "loss": 4.721, + "step": 31448 + }, + { + "epoch": 0.1870361119040822, + "grad_norm": 1.6992077827453613, + "learning_rate": 4.580721863379755e-05, + "loss": 4.6429, + "step": 31449 + }, + { + "epoch": 0.18704205918736322, + "grad_norm": 1.8001128435134888, + "learning_rate": 4.580695969766389e-05, + "loss": 4.6414, + "step": 31450 + }, + { + "epoch": 0.1870480064706442, + "grad_norm": 1.691829800605774, + "learning_rate": 4.580670075426674e-05, + "loss": 4.6086, + "step": 31451 + }, + { + "epoch": 0.1870539537539252, + "grad_norm": 1.8028392791748047, + "learning_rate": 4.580644180360618e-05, + "loss": 4.8074, + "step": 31452 + }, + { + "epoch": 0.1870599010372062, + "grad_norm": 1.355403184890747, + "learning_rate": 4.580618284568231e-05, + "loss": 5.077, + "step": 31453 + }, + { + "epoch": 0.1870658483204872, + "grad_norm": 1.6251015663146973, + "learning_rate": 4.580592388049522e-05, + "loss": 4.7268, + "step": 31454 + }, + { + "epoch": 0.1870717956037682, + "grad_norm": 1.8957926034927368, + "learning_rate": 4.580566490804499e-05, + "loss": 4.5649, + "step": 31455 + }, + { + "epoch": 0.1870777428870492, + "grad_norm": 1.628433346748352, + "learning_rate": 4.5805405928331726e-05, + "loss": 4.5964, + "step": 31456 + }, + { + "epoch": 0.1870836901703302, + "grad_norm": 1.7020845413208008, + "learning_rate": 4.58051469413555e-05, + "loss": 4.5698, + "step": 31457 + }, + { + "epoch": 0.18708963745361118, + "grad_norm": 1.6829500198364258, + "learning_rate": 4.580488794711641e-05, + "loss": 4.619, + "step": 31458 + }, + { + "epoch": 0.1870955847368922, + "grad_norm": 1.7393929958343506, + "learning_rate": 4.580462894561456e-05, + "loss": 4.4903, + "step": 31459 + }, + { + "epoch": 0.18710153202017318, + "grad_norm": 1.6554701328277588, + "learning_rate": 4.5804369936850024e-05, + "loss": 5.2823, + "step": 31460 + }, + { + "epoch": 0.18710747930345417, + "grad_norm": 1.4598510265350342, + "learning_rate": 4.58041109208229e-05, + "loss": 5.2572, + "step": 31461 + }, + { + "epoch": 0.1871134265867352, + "grad_norm": 1.5052999258041382, + "learning_rate": 4.5803851897533265e-05, + "loss": 5.1571, + "step": 31462 + }, + { + "epoch": 0.18711937387001618, + "grad_norm": 1.4165245294570923, + "learning_rate": 4.580359286698123e-05, + "loss": 5.0514, + "step": 31463 + }, + { + "epoch": 0.18712532115329716, + "grad_norm": 1.668857455253601, + "learning_rate": 4.5803333829166874e-05, + "loss": 4.7183, + "step": 31464 + }, + { + "epoch": 0.18713126843657818, + "grad_norm": 1.7835750579833984, + "learning_rate": 4.580307478409029e-05, + "loss": 4.1674, + "step": 31465 + }, + { + "epoch": 0.18713721571985917, + "grad_norm": 1.8612866401672363, + "learning_rate": 4.580281573175157e-05, + "loss": 4.245, + "step": 31466 + }, + { + "epoch": 0.18714316300314016, + "grad_norm": 2.1322779655456543, + "learning_rate": 4.58025566721508e-05, + "loss": 4.191, + "step": 31467 + }, + { + "epoch": 0.18714911028642117, + "grad_norm": 1.4032418727874756, + "learning_rate": 4.580229760528807e-05, + "loss": 4.7888, + "step": 31468 + }, + { + "epoch": 0.18715505756970216, + "grad_norm": 1.4955732822418213, + "learning_rate": 4.580203853116347e-05, + "loss": 5.0653, + "step": 31469 + }, + { + "epoch": 0.18716100485298315, + "grad_norm": 1.857201099395752, + "learning_rate": 4.580177944977709e-05, + "loss": 4.9189, + "step": 31470 + }, + { + "epoch": 0.18716695213626416, + "grad_norm": 1.4744160175323486, + "learning_rate": 4.5801520361129034e-05, + "loss": 3.9242, + "step": 31471 + }, + { + "epoch": 0.18717289941954515, + "grad_norm": 1.6050392389297485, + "learning_rate": 4.580126126521938e-05, + "loss": 4.7737, + "step": 31472 + }, + { + "epoch": 0.18717884670282614, + "grad_norm": 1.4203214645385742, + "learning_rate": 4.580100216204822e-05, + "loss": 4.7792, + "step": 31473 + }, + { + "epoch": 0.18718479398610716, + "grad_norm": 1.7042044401168823, + "learning_rate": 4.580074305161565e-05, + "loss": 4.5548, + "step": 31474 + }, + { + "epoch": 0.18719074126938814, + "grad_norm": 1.8733965158462524, + "learning_rate": 4.5800483933921746e-05, + "loss": 4.289, + "step": 31475 + }, + { + "epoch": 0.18719668855266913, + "grad_norm": 1.8629066944122314, + "learning_rate": 4.580022480896661e-05, + "loss": 4.2435, + "step": 31476 + }, + { + "epoch": 0.18720263583595015, + "grad_norm": 1.7233967781066895, + "learning_rate": 4.5799965676750336e-05, + "loss": 4.4444, + "step": 31477 + }, + { + "epoch": 0.18720858311923114, + "grad_norm": 1.6446317434310913, + "learning_rate": 4.5799706537273e-05, + "loss": 4.754, + "step": 31478 + }, + { + "epoch": 0.18721453040251212, + "grad_norm": 1.7049897909164429, + "learning_rate": 4.5799447390534714e-05, + "loss": 4.5082, + "step": 31479 + }, + { + "epoch": 0.18722047768579314, + "grad_norm": 1.6299967765808105, + "learning_rate": 4.579918823653554e-05, + "loss": 4.5914, + "step": 31480 + }, + { + "epoch": 0.18722642496907413, + "grad_norm": 1.862816333770752, + "learning_rate": 4.579892907527559e-05, + "loss": 4.4565, + "step": 31481 + }, + { + "epoch": 0.18723237225235512, + "grad_norm": 1.6829630136489868, + "learning_rate": 4.579866990675495e-05, + "loss": 3.9664, + "step": 31482 + }, + { + "epoch": 0.18723831953563613, + "grad_norm": 1.7739498615264893, + "learning_rate": 4.579841073097372e-05, + "loss": 4.5638, + "step": 31483 + }, + { + "epoch": 0.18724426681891712, + "grad_norm": 1.7989349365234375, + "learning_rate": 4.5798151547931963e-05, + "loss": 4.6418, + "step": 31484 + }, + { + "epoch": 0.1872502141021981, + "grad_norm": 1.6883355379104614, + "learning_rate": 4.5797892357629794e-05, + "loss": 4.6899, + "step": 31485 + }, + { + "epoch": 0.18725616138547913, + "grad_norm": 1.5071123838424683, + "learning_rate": 4.57976331600673e-05, + "loss": 4.605, + "step": 31486 + }, + { + "epoch": 0.1872621086687601, + "grad_norm": 1.6472139358520508, + "learning_rate": 4.579737395524456e-05, + "loss": 4.4949, + "step": 31487 + }, + { + "epoch": 0.1872680559520411, + "grad_norm": 2.729337215423584, + "learning_rate": 4.579711474316167e-05, + "loss": 4.4027, + "step": 31488 + }, + { + "epoch": 0.18727400323532212, + "grad_norm": 1.8999816179275513, + "learning_rate": 4.5796855523818726e-05, + "loss": 4.7577, + "step": 31489 + }, + { + "epoch": 0.1872799505186031, + "grad_norm": 1.6633950471878052, + "learning_rate": 4.5796596297215815e-05, + "loss": 4.3385, + "step": 31490 + }, + { + "epoch": 0.1872858978018841, + "grad_norm": 1.6885244846343994, + "learning_rate": 4.579633706335303e-05, + "loss": 4.4684, + "step": 31491 + }, + { + "epoch": 0.1872918450851651, + "grad_norm": 1.56419837474823, + "learning_rate": 4.579607782223045e-05, + "loss": 4.5609, + "step": 31492 + }, + { + "epoch": 0.1872977923684461, + "grad_norm": 1.6976735591888428, + "learning_rate": 4.579581857384818e-05, + "loss": 4.3122, + "step": 31493 + }, + { + "epoch": 0.18730373965172709, + "grad_norm": 2.019990921020508, + "learning_rate": 4.5795559318206304e-05, + "loss": 4.0644, + "step": 31494 + }, + { + "epoch": 0.1873096869350081, + "grad_norm": 2.4111409187316895, + "learning_rate": 4.5795300055304914e-05, + "loss": 3.8046, + "step": 31495 + }, + { + "epoch": 0.1873156342182891, + "grad_norm": 1.6888504028320312, + "learning_rate": 4.57950407851441e-05, + "loss": 3.7976, + "step": 31496 + }, + { + "epoch": 0.18732158150157008, + "grad_norm": 2.261028528213501, + "learning_rate": 4.579478150772395e-05, + "loss": 3.9696, + "step": 31497 + }, + { + "epoch": 0.18732752878485107, + "grad_norm": 2.104658365249634, + "learning_rate": 4.5794522223044555e-05, + "loss": 3.879, + "step": 31498 + }, + { + "epoch": 0.18733347606813208, + "grad_norm": 2.300837755203247, + "learning_rate": 4.5794262931106015e-05, + "loss": 4.1062, + "step": 31499 + }, + { + "epoch": 0.18733942335141307, + "grad_norm": 2.2843008041381836, + "learning_rate": 4.57940036319084e-05, + "loss": 3.7473, + "step": 31500 + }, + { + "epoch": 0.18734537063469406, + "grad_norm": 2.924936294555664, + "learning_rate": 4.5793744325451826e-05, + "loss": 3.7478, + "step": 31501 + }, + { + "epoch": 0.18735131791797507, + "grad_norm": 2.4981048107147217, + "learning_rate": 4.579348501173636e-05, + "loss": 3.7812, + "step": 31502 + }, + { + "epoch": 0.18735726520125606, + "grad_norm": 2.363129138946533, + "learning_rate": 4.5793225690762106e-05, + "loss": 3.7943, + "step": 31503 + }, + { + "epoch": 0.18736321248453705, + "grad_norm": 2.4851186275482178, + "learning_rate": 4.579296636252915e-05, + "loss": 3.8708, + "step": 31504 + }, + { + "epoch": 0.18736915976781807, + "grad_norm": 2.625079870223999, + "learning_rate": 4.5792707027037595e-05, + "loss": 3.769, + "step": 31505 + }, + { + "epoch": 0.18737510705109905, + "grad_norm": 1.9397916793823242, + "learning_rate": 4.579244768428751e-05, + "loss": 4.3074, + "step": 31506 + }, + { + "epoch": 0.18738105433438004, + "grad_norm": 2.270460605621338, + "learning_rate": 4.5792188334279004e-05, + "loss": 3.8198, + "step": 31507 + }, + { + "epoch": 0.18738700161766106, + "grad_norm": 2.187398910522461, + "learning_rate": 4.579192897701215e-05, + "loss": 3.7374, + "step": 31508 + }, + { + "epoch": 0.18739294890094205, + "grad_norm": 2.3796896934509277, + "learning_rate": 4.579166961248706e-05, + "loss": 3.9178, + "step": 31509 + }, + { + "epoch": 0.18739889618422303, + "grad_norm": 2.440819501876831, + "learning_rate": 4.579141024070381e-05, + "loss": 3.6605, + "step": 31510 + }, + { + "epoch": 0.18740484346750405, + "grad_norm": 2.090683698654175, + "learning_rate": 4.579115086166249e-05, + "loss": 4.0199, + "step": 31511 + }, + { + "epoch": 0.18741079075078504, + "grad_norm": 1.8660192489624023, + "learning_rate": 4.5790891475363195e-05, + "loss": 5.4397, + "step": 31512 + }, + { + "epoch": 0.18741673803406603, + "grad_norm": 1.8933132886886597, + "learning_rate": 4.579063208180601e-05, + "loss": 5.161, + "step": 31513 + }, + { + "epoch": 0.18742268531734704, + "grad_norm": 1.442830204963684, + "learning_rate": 4.5790372680991035e-05, + "loss": 4.9392, + "step": 31514 + }, + { + "epoch": 0.18742863260062803, + "grad_norm": 1.606457233428955, + "learning_rate": 4.5790113272918355e-05, + "loss": 5.1507, + "step": 31515 + }, + { + "epoch": 0.18743457988390902, + "grad_norm": 1.7178606986999512, + "learning_rate": 4.578985385758806e-05, + "loss": 5.1888, + "step": 31516 + }, + { + "epoch": 0.18744052716719004, + "grad_norm": 1.7797423601150513, + "learning_rate": 4.578959443500025e-05, + "loss": 5.1161, + "step": 31517 + }, + { + "epoch": 0.18744647445047102, + "grad_norm": 1.7583237886428833, + "learning_rate": 4.5789335005154996e-05, + "loss": 4.9044, + "step": 31518 + }, + { + "epoch": 0.187452421733752, + "grad_norm": 1.9187301397323608, + "learning_rate": 4.578907556805241e-05, + "loss": 4.8383, + "step": 31519 + }, + { + "epoch": 0.18745836901703303, + "grad_norm": 1.3928438425064087, + "learning_rate": 4.578881612369256e-05, + "loss": 4.6952, + "step": 31520 + }, + { + "epoch": 0.18746431630031402, + "grad_norm": 1.5495777130126953, + "learning_rate": 4.578855667207556e-05, + "loss": 5.093, + "step": 31521 + }, + { + "epoch": 0.187470263583595, + "grad_norm": 2.0939781665802, + "learning_rate": 4.578829721320148e-05, + "loss": 4.4353, + "step": 31522 + }, + { + "epoch": 0.18747621086687602, + "grad_norm": 2.6413023471832275, + "learning_rate": 4.578803774707043e-05, + "loss": 3.7471, + "step": 31523 + }, + { + "epoch": 0.187482158150157, + "grad_norm": 2.237964630126953, + "learning_rate": 4.578777827368249e-05, + "loss": 4.1189, + "step": 31524 + }, + { + "epoch": 0.187488105433438, + "grad_norm": 1.77215576171875, + "learning_rate": 4.5787518793037745e-05, + "loss": 4.5919, + "step": 31525 + }, + { + "epoch": 0.187494052716719, + "grad_norm": 1.7483875751495361, + "learning_rate": 4.5787259305136297e-05, + "loss": 4.7209, + "step": 31526 + }, + { + "epoch": 0.1875, + "grad_norm": 1.7072293758392334, + "learning_rate": 4.578699980997823e-05, + "loss": 4.4651, + "step": 31527 + }, + { + "epoch": 0.187505947283281, + "grad_norm": 1.7075767517089844, + "learning_rate": 4.5786740307563636e-05, + "loss": 4.3471, + "step": 31528 + }, + { + "epoch": 0.187511894566562, + "grad_norm": 2.496588945388794, + "learning_rate": 4.578648079789261e-05, + "loss": 3.6709, + "step": 31529 + }, + { + "epoch": 0.187517841849843, + "grad_norm": 2.438305139541626, + "learning_rate": 4.578622128096522e-05, + "loss": 3.8271, + "step": 31530 + }, + { + "epoch": 0.18752378913312398, + "grad_norm": 2.574528455734253, + "learning_rate": 4.578596175678159e-05, + "loss": 3.7591, + "step": 31531 + }, + { + "epoch": 0.187529736416405, + "grad_norm": 2.3681464195251465, + "learning_rate": 4.5785702225341796e-05, + "loss": 4.214, + "step": 31532 + }, + { + "epoch": 0.18753568369968598, + "grad_norm": 1.5918017625808716, + "learning_rate": 4.578544268664593e-05, + "loss": 5.2159, + "step": 31533 + }, + { + "epoch": 0.18754163098296697, + "grad_norm": 1.9178626537322998, + "learning_rate": 4.5785183140694073e-05, + "loss": 3.9341, + "step": 31534 + }, + { + "epoch": 0.187547578266248, + "grad_norm": 2.1391525268554688, + "learning_rate": 4.578492358748633e-05, + "loss": 3.1456, + "step": 31535 + }, + { + "epoch": 0.18755352554952898, + "grad_norm": 2.421508312225342, + "learning_rate": 4.578466402702278e-05, + "loss": 3.1124, + "step": 31536 + }, + { + "epoch": 0.18755947283280996, + "grad_norm": 2.379535675048828, + "learning_rate": 4.578440445930352e-05, + "loss": 3.2543, + "step": 31537 + }, + { + "epoch": 0.18756542011609098, + "grad_norm": 2.236633777618408, + "learning_rate": 4.578414488432864e-05, + "loss": 3.3216, + "step": 31538 + }, + { + "epoch": 0.18757136739937197, + "grad_norm": 2.082542657852173, + "learning_rate": 4.578388530209823e-05, + "loss": 3.1493, + "step": 31539 + }, + { + "epoch": 0.18757731468265296, + "grad_norm": 2.2979769706726074, + "learning_rate": 4.5783625712612384e-05, + "loss": 3.2585, + "step": 31540 + }, + { + "epoch": 0.18758326196593397, + "grad_norm": 2.1978182792663574, + "learning_rate": 4.5783366115871186e-05, + "loss": 3.2713, + "step": 31541 + }, + { + "epoch": 0.18758920924921496, + "grad_norm": 2.097055435180664, + "learning_rate": 4.578310651187473e-05, + "loss": 3.3176, + "step": 31542 + }, + { + "epoch": 0.18759515653249595, + "grad_norm": 2.2990310192108154, + "learning_rate": 4.57828469006231e-05, + "loss": 3.1615, + "step": 31543 + }, + { + "epoch": 0.18760110381577697, + "grad_norm": 2.353107213973999, + "learning_rate": 4.5782587282116394e-05, + "loss": 3.0828, + "step": 31544 + }, + { + "epoch": 0.18760705109905795, + "grad_norm": 2.156449794769287, + "learning_rate": 4.578232765635471e-05, + "loss": 3.7385, + "step": 31545 + }, + { + "epoch": 0.18761299838233894, + "grad_norm": 1.8776116371154785, + "learning_rate": 4.578206802333812e-05, + "loss": 5.1393, + "step": 31546 + }, + { + "epoch": 0.18761894566561996, + "grad_norm": 1.8295111656188965, + "learning_rate": 4.578180838306674e-05, + "loss": 4.6989, + "step": 31547 + }, + { + "epoch": 0.18762489294890095, + "grad_norm": 1.707702398300171, + "learning_rate": 4.578154873554063e-05, + "loss": 4.6461, + "step": 31548 + }, + { + "epoch": 0.18763084023218193, + "grad_norm": 2.7304489612579346, + "learning_rate": 4.57812890807599e-05, + "loss": 3.7014, + "step": 31549 + }, + { + "epoch": 0.18763678751546295, + "grad_norm": 3.1167895793914795, + "learning_rate": 4.578102941872464e-05, + "loss": 3.9208, + "step": 31550 + }, + { + "epoch": 0.18764273479874394, + "grad_norm": 2.5492351055145264, + "learning_rate": 4.578076974943494e-05, + "loss": 3.769, + "step": 31551 + }, + { + "epoch": 0.18764868208202493, + "grad_norm": 1.8772006034851074, + "learning_rate": 4.578051007289088e-05, + "loss": 4.4789, + "step": 31552 + }, + { + "epoch": 0.18765462936530594, + "grad_norm": 1.7834813594818115, + "learning_rate": 4.578025038909256e-05, + "loss": 4.9667, + "step": 31553 + }, + { + "epoch": 0.18766057664858693, + "grad_norm": 1.9036569595336914, + "learning_rate": 4.5779990698040074e-05, + "loss": 4.4362, + "step": 31554 + }, + { + "epoch": 0.18766652393186792, + "grad_norm": 1.837803602218628, + "learning_rate": 4.5779730999733506e-05, + "loss": 4.7798, + "step": 31555 + }, + { + "epoch": 0.18767247121514893, + "grad_norm": 1.6703819036483765, + "learning_rate": 4.577947129417295e-05, + "loss": 4.8608, + "step": 31556 + }, + { + "epoch": 0.18767841849842992, + "grad_norm": 1.7654380798339844, + "learning_rate": 4.577921158135849e-05, + "loss": 4.6977, + "step": 31557 + }, + { + "epoch": 0.1876843657817109, + "grad_norm": 1.843579649925232, + "learning_rate": 4.577895186129022e-05, + "loss": 4.2761, + "step": 31558 + }, + { + "epoch": 0.1876903130649919, + "grad_norm": 1.7880736589431763, + "learning_rate": 4.577869213396824e-05, + "loss": 4.7802, + "step": 31559 + }, + { + "epoch": 0.18769626034827291, + "grad_norm": 1.5163524150848389, + "learning_rate": 4.577843239939263e-05, + "loss": 4.9608, + "step": 31560 + }, + { + "epoch": 0.1877022076315539, + "grad_norm": 1.6260676383972168, + "learning_rate": 4.5778172657563486e-05, + "loss": 4.8441, + "step": 31561 + }, + { + "epoch": 0.1877081549148349, + "grad_norm": 2.001150369644165, + "learning_rate": 4.57779129084809e-05, + "loss": 4.1881, + "step": 31562 + }, + { + "epoch": 0.1877141021981159, + "grad_norm": 1.6918448209762573, + "learning_rate": 4.577765315214495e-05, + "loss": 4.309, + "step": 31563 + }, + { + "epoch": 0.1877200494813969, + "grad_norm": 1.5819053649902344, + "learning_rate": 4.5777393388555745e-05, + "loss": 4.7125, + "step": 31564 + }, + { + "epoch": 0.18772599676467788, + "grad_norm": 1.521506428718567, + "learning_rate": 4.5777133617713355e-05, + "loss": 4.4762, + "step": 31565 + }, + { + "epoch": 0.1877319440479589, + "grad_norm": 1.608293056488037, + "learning_rate": 4.57768738396179e-05, + "loss": 4.8308, + "step": 31566 + }, + { + "epoch": 0.1877378913312399, + "grad_norm": 1.7008312940597534, + "learning_rate": 4.577661405426943e-05, + "loss": 4.7827, + "step": 31567 + }, + { + "epoch": 0.18774383861452087, + "grad_norm": 1.6263885498046875, + "learning_rate": 4.577635426166807e-05, + "loss": 5.102, + "step": 31568 + }, + { + "epoch": 0.1877497858978019, + "grad_norm": 1.7362202405929565, + "learning_rate": 4.5776094461813903e-05, + "loss": 5.0606, + "step": 31569 + }, + { + "epoch": 0.18775573318108288, + "grad_norm": 1.699578881263733, + "learning_rate": 4.577583465470702e-05, + "loss": 4.5649, + "step": 31570 + }, + { + "epoch": 0.18776168046436387, + "grad_norm": 1.5926166772842407, + "learning_rate": 4.5775574840347504e-05, + "loss": 4.9645, + "step": 31571 + }, + { + "epoch": 0.18776762774764488, + "grad_norm": 1.5831513404846191, + "learning_rate": 4.5775315018735443e-05, + "loss": 5.0697, + "step": 31572 + }, + { + "epoch": 0.18777357503092587, + "grad_norm": 1.4057412147521973, + "learning_rate": 4.5775055189870945e-05, + "loss": 4.9488, + "step": 31573 + }, + { + "epoch": 0.18777952231420686, + "grad_norm": 1.5728765726089478, + "learning_rate": 4.5774795353754075e-05, + "loss": 4.6663, + "step": 31574 + }, + { + "epoch": 0.18778546959748788, + "grad_norm": 1.5813493728637695, + "learning_rate": 4.577453551038495e-05, + "loss": 5.5153, + "step": 31575 + }, + { + "epoch": 0.18779141688076886, + "grad_norm": 1.6821653842926025, + "learning_rate": 4.5774275659763644e-05, + "loss": 5.2125, + "step": 31576 + }, + { + "epoch": 0.18779736416404985, + "grad_norm": 1.6527361869812012, + "learning_rate": 4.577401580189025e-05, + "loss": 5.0845, + "step": 31577 + }, + { + "epoch": 0.18780331144733087, + "grad_norm": 1.7719552516937256, + "learning_rate": 4.5773755936764876e-05, + "loss": 5.0366, + "step": 31578 + }, + { + "epoch": 0.18780925873061186, + "grad_norm": 1.7301576137542725, + "learning_rate": 4.5773496064387576e-05, + "loss": 5.7365, + "step": 31579 + }, + { + "epoch": 0.18781520601389284, + "grad_norm": 1.64248788356781, + "learning_rate": 4.577323618475848e-05, + "loss": 5.1362, + "step": 31580 + }, + { + "epoch": 0.18782115329717386, + "grad_norm": 1.539428472518921, + "learning_rate": 4.5772976297877653e-05, + "loss": 5.2903, + "step": 31581 + }, + { + "epoch": 0.18782710058045485, + "grad_norm": 1.7478768825531006, + "learning_rate": 4.577271640374521e-05, + "loss": 4.9548, + "step": 31582 + }, + { + "epoch": 0.18783304786373584, + "grad_norm": 1.6046321392059326, + "learning_rate": 4.5772456502361216e-05, + "loss": 5.1547, + "step": 31583 + }, + { + "epoch": 0.18783899514701685, + "grad_norm": 1.613788366317749, + "learning_rate": 4.577219659372577e-05, + "loss": 4.7554, + "step": 31584 + }, + { + "epoch": 0.18784494243029784, + "grad_norm": 1.7057472467422485, + "learning_rate": 4.577193667783897e-05, + "loss": 4.8775, + "step": 31585 + }, + { + "epoch": 0.18785088971357883, + "grad_norm": 1.5329315662384033, + "learning_rate": 4.5771676754700896e-05, + "loss": 4.7219, + "step": 31586 + }, + { + "epoch": 0.18785683699685984, + "grad_norm": 1.5598114728927612, + "learning_rate": 4.577141682431164e-05, + "loss": 4.9717, + "step": 31587 + }, + { + "epoch": 0.18786278428014083, + "grad_norm": 1.6115435361862183, + "learning_rate": 4.577115688667131e-05, + "loss": 4.7923, + "step": 31588 + }, + { + "epoch": 0.18786873156342182, + "grad_norm": 1.2374604940414429, + "learning_rate": 4.5770896941779974e-05, + "loss": 5.4604, + "step": 31589 + }, + { + "epoch": 0.18787467884670284, + "grad_norm": 1.4944182634353638, + "learning_rate": 4.577063698963774e-05, + "loss": 5.6087, + "step": 31590 + }, + { + "epoch": 0.18788062612998382, + "grad_norm": 1.454232931137085, + "learning_rate": 4.577037703024468e-05, + "loss": 4.9809, + "step": 31591 + }, + { + "epoch": 0.1878865734132648, + "grad_norm": 1.7529237270355225, + "learning_rate": 4.57701170636009e-05, + "loss": 4.9926, + "step": 31592 + }, + { + "epoch": 0.18789252069654583, + "grad_norm": 1.7798666954040527, + "learning_rate": 4.5769857089706494e-05, + "loss": 4.7364, + "step": 31593 + }, + { + "epoch": 0.18789846797982682, + "grad_norm": 1.6372876167297363, + "learning_rate": 4.576959710856154e-05, + "loss": 4.7, + "step": 31594 + }, + { + "epoch": 0.1879044152631078, + "grad_norm": 1.503820776939392, + "learning_rate": 4.5769337120166135e-05, + "loss": 4.7779, + "step": 31595 + }, + { + "epoch": 0.18791036254638882, + "grad_norm": 1.51885187625885, + "learning_rate": 4.576907712452037e-05, + "loss": 5.1936, + "step": 31596 + }, + { + "epoch": 0.1879163098296698, + "grad_norm": 1.5635126829147339, + "learning_rate": 4.576881712162434e-05, + "loss": 5.4207, + "step": 31597 + }, + { + "epoch": 0.1879222571129508, + "grad_norm": 1.565337061882019, + "learning_rate": 4.576855711147812e-05, + "loss": 5.4425, + "step": 31598 + }, + { + "epoch": 0.1879282043962318, + "grad_norm": 1.7001174688339233, + "learning_rate": 4.576829709408181e-05, + "loss": 5.1692, + "step": 31599 + }, + { + "epoch": 0.1879341516795128, + "grad_norm": 1.720685362815857, + "learning_rate": 4.576803706943551e-05, + "loss": 5.3135, + "step": 31600 + }, + { + "epoch": 0.1879400989627938, + "grad_norm": 1.5667119026184082, + "learning_rate": 4.5767777037539304e-05, + "loss": 5.3522, + "step": 31601 + }, + { + "epoch": 0.1879460462460748, + "grad_norm": 1.7021211385726929, + "learning_rate": 4.576751699839328e-05, + "loss": 5.0113, + "step": 31602 + }, + { + "epoch": 0.1879519935293558, + "grad_norm": 1.6862629652023315, + "learning_rate": 4.5767256951997525e-05, + "loss": 5.2257, + "step": 31603 + }, + { + "epoch": 0.18795794081263678, + "grad_norm": 1.5623557567596436, + "learning_rate": 4.5766996898352146e-05, + "loss": 5.1346, + "step": 31604 + }, + { + "epoch": 0.1879638880959178, + "grad_norm": 1.6088786125183105, + "learning_rate": 4.576673683745721e-05, + "loss": 5.2304, + "step": 31605 + }, + { + "epoch": 0.18796983537919879, + "grad_norm": 1.5381817817687988, + "learning_rate": 4.5766476769312827e-05, + "loss": 5.399, + "step": 31606 + }, + { + "epoch": 0.18797578266247977, + "grad_norm": 1.4870381355285645, + "learning_rate": 4.576621669391908e-05, + "loss": 5.2942, + "step": 31607 + }, + { + "epoch": 0.1879817299457608, + "grad_norm": 1.8326987028121948, + "learning_rate": 4.576595661127606e-05, + "loss": 4.5692, + "step": 31608 + }, + { + "epoch": 0.18798767722904178, + "grad_norm": 1.8177613019943237, + "learning_rate": 4.5765696521383863e-05, + "loss": 4.6066, + "step": 31609 + }, + { + "epoch": 0.18799362451232277, + "grad_norm": 1.6168222427368164, + "learning_rate": 4.576543642424257e-05, + "loss": 5.113, + "step": 31610 + }, + { + "epoch": 0.18799957179560378, + "grad_norm": 1.8792698383331299, + "learning_rate": 4.5765176319852287e-05, + "loss": 4.9994, + "step": 31611 + }, + { + "epoch": 0.18800551907888477, + "grad_norm": 1.4694404602050781, + "learning_rate": 4.576491620821309e-05, + "loss": 4.9587, + "step": 31612 + }, + { + "epoch": 0.18801146636216576, + "grad_norm": 1.4442496299743652, + "learning_rate": 4.576465608932508e-05, + "loss": 5.0864, + "step": 31613 + }, + { + "epoch": 0.18801741364544677, + "grad_norm": 1.923790693283081, + "learning_rate": 4.5764395963188335e-05, + "loss": 4.4928, + "step": 31614 + }, + { + "epoch": 0.18802336092872776, + "grad_norm": 1.8033101558685303, + "learning_rate": 4.5764135829802956e-05, + "loss": 4.5554, + "step": 31615 + }, + { + "epoch": 0.18802930821200875, + "grad_norm": 1.7350363731384277, + "learning_rate": 4.5763875689169034e-05, + "loss": 4.4954, + "step": 31616 + }, + { + "epoch": 0.18803525549528974, + "grad_norm": 1.75509774684906, + "learning_rate": 4.576361554128665e-05, + "loss": 4.3791, + "step": 31617 + }, + { + "epoch": 0.18804120277857075, + "grad_norm": 1.8107062578201294, + "learning_rate": 4.576335538615592e-05, + "loss": 4.0603, + "step": 31618 + }, + { + "epoch": 0.18804715006185174, + "grad_norm": 1.824713110923767, + "learning_rate": 4.57630952237769e-05, + "loss": 4.2995, + "step": 31619 + }, + { + "epoch": 0.18805309734513273, + "grad_norm": 2.6946823596954346, + "learning_rate": 4.57628350541497e-05, + "loss": 4.1015, + "step": 31620 + }, + { + "epoch": 0.18805904462841375, + "grad_norm": 1.6974413394927979, + "learning_rate": 4.576257487727442e-05, + "loss": 4.603, + "step": 31621 + }, + { + "epoch": 0.18806499191169473, + "grad_norm": 2.0421180725097656, + "learning_rate": 4.576231469315113e-05, + "loss": 4.3945, + "step": 31622 + }, + { + "epoch": 0.18807093919497572, + "grad_norm": 1.8003754615783691, + "learning_rate": 4.5762054501779934e-05, + "loss": 4.5459, + "step": 31623 + }, + { + "epoch": 0.18807688647825674, + "grad_norm": 1.7390872240066528, + "learning_rate": 4.576179430316092e-05, + "loss": 4.4821, + "step": 31624 + }, + { + "epoch": 0.18808283376153773, + "grad_norm": 1.8832662105560303, + "learning_rate": 4.5761534097294174e-05, + "loss": 3.8606, + "step": 31625 + }, + { + "epoch": 0.18808878104481871, + "grad_norm": 1.6978578567504883, + "learning_rate": 4.576127388417979e-05, + "loss": 5.0896, + "step": 31626 + }, + { + "epoch": 0.18809472832809973, + "grad_norm": 2.140113592147827, + "learning_rate": 4.5761013663817864e-05, + "loss": 5.2355, + "step": 31627 + }, + { + "epoch": 0.18810067561138072, + "grad_norm": 1.6502524614334106, + "learning_rate": 4.576075343620848e-05, + "loss": 5.5907, + "step": 31628 + }, + { + "epoch": 0.1881066228946617, + "grad_norm": 1.6842014789581299, + "learning_rate": 4.576049320135174e-05, + "loss": 5.1909, + "step": 31629 + }, + { + "epoch": 0.18811257017794272, + "grad_norm": 1.5731878280639648, + "learning_rate": 4.576023295924772e-05, + "loss": 5.2126, + "step": 31630 + }, + { + "epoch": 0.1881185174612237, + "grad_norm": 1.822248101234436, + "learning_rate": 4.5759972709896516e-05, + "loss": 4.8566, + "step": 31631 + }, + { + "epoch": 0.1881244647445047, + "grad_norm": 1.8849093914031982, + "learning_rate": 4.575971245329822e-05, + "loss": 5.0437, + "step": 31632 + }, + { + "epoch": 0.18813041202778572, + "grad_norm": 1.7385406494140625, + "learning_rate": 4.575945218945292e-05, + "loss": 5.1574, + "step": 31633 + }, + { + "epoch": 0.1881363593110667, + "grad_norm": 1.8704962730407715, + "learning_rate": 4.5759191918360713e-05, + "loss": 5.4756, + "step": 31634 + }, + { + "epoch": 0.1881423065943477, + "grad_norm": 1.8415088653564453, + "learning_rate": 4.5758931640021684e-05, + "loss": 5.31, + "step": 31635 + }, + { + "epoch": 0.1881482538776287, + "grad_norm": 1.817290186882019, + "learning_rate": 4.5758671354435936e-05, + "loss": 5.2309, + "step": 31636 + }, + { + "epoch": 0.1881542011609097, + "grad_norm": 1.9851620197296143, + "learning_rate": 4.575841106160354e-05, + "loss": 5.1805, + "step": 31637 + }, + { + "epoch": 0.18816014844419068, + "grad_norm": 2.085020065307617, + "learning_rate": 4.57581507615246e-05, + "loss": 3.9923, + "step": 31638 + }, + { + "epoch": 0.1881660957274717, + "grad_norm": 1.8631166219711304, + "learning_rate": 4.57578904541992e-05, + "loss": 4.1787, + "step": 31639 + }, + { + "epoch": 0.1881720430107527, + "grad_norm": 2.2452220916748047, + "learning_rate": 4.5757630139627445e-05, + "loss": 3.9551, + "step": 31640 + }, + { + "epoch": 0.18817799029403368, + "grad_norm": 1.7852009534835815, + "learning_rate": 4.5757369817809415e-05, + "loss": 4.3387, + "step": 31641 + }, + { + "epoch": 0.1881839375773147, + "grad_norm": 1.7815812826156616, + "learning_rate": 4.5757109488745194e-05, + "loss": 4.3556, + "step": 31642 + }, + { + "epoch": 0.18818988486059568, + "grad_norm": 1.7845134735107422, + "learning_rate": 4.5756849152434884e-05, + "loss": 4.0154, + "step": 31643 + }, + { + "epoch": 0.18819583214387667, + "grad_norm": 2.093745231628418, + "learning_rate": 4.5756588808878574e-05, + "loss": 4.2242, + "step": 31644 + }, + { + "epoch": 0.18820177942715768, + "grad_norm": 1.9645696878433228, + "learning_rate": 4.575632845807635e-05, + "loss": 3.8064, + "step": 31645 + }, + { + "epoch": 0.18820772671043867, + "grad_norm": 2.1012284755706787, + "learning_rate": 4.57560681000283e-05, + "loss": 3.9011, + "step": 31646 + }, + { + "epoch": 0.18821367399371966, + "grad_norm": 1.9608296155929565, + "learning_rate": 4.575580773473454e-05, + "loss": 4.008, + "step": 31647 + }, + { + "epoch": 0.18821962127700068, + "grad_norm": 1.7520424127578735, + "learning_rate": 4.5755547362195125e-05, + "loss": 4.2574, + "step": 31648 + }, + { + "epoch": 0.18822556856028166, + "grad_norm": 1.8842599391937256, + "learning_rate": 4.5755286982410165e-05, + "loss": 4.1908, + "step": 31649 + }, + { + "epoch": 0.18823151584356265, + "grad_norm": 1.8884096145629883, + "learning_rate": 4.575502659537976e-05, + "loss": 4.2132, + "step": 31650 + }, + { + "epoch": 0.18823746312684367, + "grad_norm": 1.7970027923583984, + "learning_rate": 4.575476620110398e-05, + "loss": 4.2381, + "step": 31651 + }, + { + "epoch": 0.18824341041012466, + "grad_norm": 1.8529993295669556, + "learning_rate": 4.5754505799582925e-05, + "loss": 4.1563, + "step": 31652 + }, + { + "epoch": 0.18824935769340564, + "grad_norm": 1.8202285766601562, + "learning_rate": 4.5754245390816685e-05, + "loss": 3.8115, + "step": 31653 + }, + { + "epoch": 0.18825530497668666, + "grad_norm": 1.821083426475525, + "learning_rate": 4.575398497480536e-05, + "loss": 4.3038, + "step": 31654 + }, + { + "epoch": 0.18826125225996765, + "grad_norm": 2.2761406898498535, + "learning_rate": 4.575372455154903e-05, + "loss": 3.286, + "step": 31655 + }, + { + "epoch": 0.18826719954324864, + "grad_norm": 2.224435806274414, + "learning_rate": 4.575346412104779e-05, + "loss": 3.3841, + "step": 31656 + }, + { + "epoch": 0.18827314682652965, + "grad_norm": 2.656628370285034, + "learning_rate": 4.5753203683301725e-05, + "loss": 3.4385, + "step": 31657 + }, + { + "epoch": 0.18827909410981064, + "grad_norm": 2.2864227294921875, + "learning_rate": 4.5752943238310935e-05, + "loss": 3.5027, + "step": 31658 + }, + { + "epoch": 0.18828504139309163, + "grad_norm": 2.571734666824341, + "learning_rate": 4.575268278607551e-05, + "loss": 3.4458, + "step": 31659 + }, + { + "epoch": 0.18829098867637264, + "grad_norm": 2.2151083946228027, + "learning_rate": 4.5752422326595534e-05, + "loss": 3.9343, + "step": 31660 + }, + { + "epoch": 0.18829693595965363, + "grad_norm": 1.8273411989212036, + "learning_rate": 4.57521618598711e-05, + "loss": 4.1698, + "step": 31661 + }, + { + "epoch": 0.18830288324293462, + "grad_norm": 1.4451392889022827, + "learning_rate": 4.57519013859023e-05, + "loss": 5.1803, + "step": 31662 + }, + { + "epoch": 0.18830883052621564, + "grad_norm": 1.5774602890014648, + "learning_rate": 4.5751640904689233e-05, + "loss": 5.5158, + "step": 31663 + }, + { + "epoch": 0.18831477780949663, + "grad_norm": 1.76852548122406, + "learning_rate": 4.575138041623197e-05, + "loss": 4.6102, + "step": 31664 + }, + { + "epoch": 0.1883207250927776, + "grad_norm": 2.1750409603118896, + "learning_rate": 4.575111992053063e-05, + "loss": 4.2259, + "step": 31665 + }, + { + "epoch": 0.18832667237605863, + "grad_norm": 2.2930684089660645, + "learning_rate": 4.575085941758528e-05, + "loss": 3.645, + "step": 31666 + }, + { + "epoch": 0.18833261965933962, + "grad_norm": 1.6000158786773682, + "learning_rate": 4.5750598907396015e-05, + "loss": 4.7112, + "step": 31667 + }, + { + "epoch": 0.1883385669426206, + "grad_norm": 2.21150279045105, + "learning_rate": 4.5750338389962936e-05, + "loss": 2.8276, + "step": 31668 + }, + { + "epoch": 0.18834451422590162, + "grad_norm": 2.080242156982422, + "learning_rate": 4.575007786528613e-05, + "loss": 3.6421, + "step": 31669 + }, + { + "epoch": 0.1883504615091826, + "grad_norm": 1.7053500413894653, + "learning_rate": 4.5749817333365687e-05, + "loss": 5.3526, + "step": 31670 + }, + { + "epoch": 0.1883564087924636, + "grad_norm": 1.4372013807296753, + "learning_rate": 4.574955679420169e-05, + "loss": 5.0641, + "step": 31671 + }, + { + "epoch": 0.1883623560757446, + "grad_norm": 1.6831438541412354, + "learning_rate": 4.5749296247794246e-05, + "loss": 4.7807, + "step": 31672 + }, + { + "epoch": 0.1883683033590256, + "grad_norm": 1.7787952423095703, + "learning_rate": 4.574903569414343e-05, + "loss": 4.1886, + "step": 31673 + }, + { + "epoch": 0.1883742506423066, + "grad_norm": 2.1964874267578125, + "learning_rate": 4.5748775133249345e-05, + "loss": 3.8119, + "step": 31674 + }, + { + "epoch": 0.18838019792558758, + "grad_norm": 1.8583804368972778, + "learning_rate": 4.5748514565112074e-05, + "loss": 4.4153, + "step": 31675 + }, + { + "epoch": 0.1883861452088686, + "grad_norm": 1.8326549530029297, + "learning_rate": 4.574825398973171e-05, + "loss": 4.7196, + "step": 31676 + }, + { + "epoch": 0.18839209249214958, + "grad_norm": 1.685388207435608, + "learning_rate": 4.5747993407108345e-05, + "loss": 4.4115, + "step": 31677 + }, + { + "epoch": 0.18839803977543057, + "grad_norm": 1.5775798559188843, + "learning_rate": 4.574773281724207e-05, + "loss": 4.7152, + "step": 31678 + }, + { + "epoch": 0.1884039870587116, + "grad_norm": 1.4056192636489868, + "learning_rate": 4.574747222013298e-05, + "loss": 5.0494, + "step": 31679 + }, + { + "epoch": 0.18840993434199257, + "grad_norm": 1.5998051166534424, + "learning_rate": 4.574721161578115e-05, + "loss": 5.4125, + "step": 31680 + }, + { + "epoch": 0.18841588162527356, + "grad_norm": 1.418294072151184, + "learning_rate": 4.57469510041867e-05, + "loss": 5.2475, + "step": 31681 + }, + { + "epoch": 0.18842182890855458, + "grad_norm": 2.817990779876709, + "learning_rate": 4.574669038534969e-05, + "loss": 3.9644, + "step": 31682 + }, + { + "epoch": 0.18842777619183557, + "grad_norm": 1.8277714252471924, + "learning_rate": 4.574642975927023e-05, + "loss": 4.7339, + "step": 31683 + }, + { + "epoch": 0.18843372347511655, + "grad_norm": 1.7349371910095215, + "learning_rate": 4.5746169125948406e-05, + "loss": 5.0213, + "step": 31684 + }, + { + "epoch": 0.18843967075839757, + "grad_norm": 1.8414616584777832, + "learning_rate": 4.574590848538431e-05, + "loss": 4.6055, + "step": 31685 + }, + { + "epoch": 0.18844561804167856, + "grad_norm": 1.864438533782959, + "learning_rate": 4.574564783757803e-05, + "loss": 4.5234, + "step": 31686 + }, + { + "epoch": 0.18845156532495955, + "grad_norm": 1.8597543239593506, + "learning_rate": 4.574538718252966e-05, + "loss": 4.6606, + "step": 31687 + }, + { + "epoch": 0.18845751260824056, + "grad_norm": 1.7549642324447632, + "learning_rate": 4.574512652023929e-05, + "loss": 5.1434, + "step": 31688 + }, + { + "epoch": 0.18846345989152155, + "grad_norm": 1.574147343635559, + "learning_rate": 4.574486585070701e-05, + "loss": 5.2176, + "step": 31689 + }, + { + "epoch": 0.18846940717480254, + "grad_norm": 1.7602109909057617, + "learning_rate": 4.5744605173932906e-05, + "loss": 5.2822, + "step": 31690 + }, + { + "epoch": 0.18847535445808355, + "grad_norm": 1.6231430768966675, + "learning_rate": 4.574434448991708e-05, + "loss": 5.3009, + "step": 31691 + }, + { + "epoch": 0.18848130174136454, + "grad_norm": 1.9236938953399658, + "learning_rate": 4.5744083798659615e-05, + "loss": 5.1129, + "step": 31692 + }, + { + "epoch": 0.18848724902464553, + "grad_norm": 1.755083680152893, + "learning_rate": 4.574382310016061e-05, + "loss": 4.6636, + "step": 31693 + }, + { + "epoch": 0.18849319630792655, + "grad_norm": 1.7704771757125854, + "learning_rate": 4.574356239442015e-05, + "loss": 4.7429, + "step": 31694 + }, + { + "epoch": 0.18849914359120754, + "grad_norm": 2.079738140106201, + "learning_rate": 4.574330168143831e-05, + "loss": 4.1216, + "step": 31695 + }, + { + "epoch": 0.18850509087448852, + "grad_norm": 1.823591947555542, + "learning_rate": 4.574304096121522e-05, + "loss": 4.3193, + "step": 31696 + }, + { + "epoch": 0.18851103815776954, + "grad_norm": 1.7429176568984985, + "learning_rate": 4.5742780233750936e-05, + "loss": 4.9425, + "step": 31697 + }, + { + "epoch": 0.18851698544105053, + "grad_norm": 1.4497638940811157, + "learning_rate": 4.5742519499045565e-05, + "loss": 4.9634, + "step": 31698 + }, + { + "epoch": 0.18852293272433152, + "grad_norm": 1.698063850402832, + "learning_rate": 4.57422587570992e-05, + "loss": 4.8983, + "step": 31699 + }, + { + "epoch": 0.18852888000761253, + "grad_norm": 1.638048768043518, + "learning_rate": 4.574199800791192e-05, + "loss": 4.5512, + "step": 31700 + }, + { + "epoch": 0.18853482729089352, + "grad_norm": 1.8207498788833618, + "learning_rate": 4.574173725148383e-05, + "loss": 4.646, + "step": 31701 + }, + { + "epoch": 0.1885407745741745, + "grad_norm": 1.7710716724395752, + "learning_rate": 4.5741476487815006e-05, + "loss": 4.5777, + "step": 31702 + }, + { + "epoch": 0.18854672185745552, + "grad_norm": 2.0382273197174072, + "learning_rate": 4.574121571690555e-05, + "loss": 3.9293, + "step": 31703 + }, + { + "epoch": 0.1885526691407365, + "grad_norm": 1.8165003061294556, + "learning_rate": 4.574095493875555e-05, + "loss": 4.3543, + "step": 31704 + }, + { + "epoch": 0.1885586164240175, + "grad_norm": 1.7196195125579834, + "learning_rate": 4.57406941533651e-05, + "loss": 4.1694, + "step": 31705 + }, + { + "epoch": 0.18856456370729852, + "grad_norm": 1.9387542009353638, + "learning_rate": 4.574043336073428e-05, + "loss": 4.1091, + "step": 31706 + }, + { + "epoch": 0.1885705109905795, + "grad_norm": 1.605260968208313, + "learning_rate": 4.5740172560863194e-05, + "loss": 4.957, + "step": 31707 + }, + { + "epoch": 0.1885764582738605, + "grad_norm": 1.6566966772079468, + "learning_rate": 4.573991175375193e-05, + "loss": 4.9981, + "step": 31708 + }, + { + "epoch": 0.1885824055571415, + "grad_norm": 1.6529970169067383, + "learning_rate": 4.573965093940058e-05, + "loss": 4.9042, + "step": 31709 + }, + { + "epoch": 0.1885883528404225, + "grad_norm": 1.7275558710098267, + "learning_rate": 4.573939011780922e-05, + "loss": 4.4827, + "step": 31710 + }, + { + "epoch": 0.18859430012370348, + "grad_norm": 1.8472176790237427, + "learning_rate": 4.573912928897796e-05, + "loss": 3.9163, + "step": 31711 + }, + { + "epoch": 0.1886002474069845, + "grad_norm": 2.4503021240234375, + "learning_rate": 4.5738868452906886e-05, + "loss": 3.4869, + "step": 31712 + }, + { + "epoch": 0.1886061946902655, + "grad_norm": 2.6473751068115234, + "learning_rate": 4.573860760959608e-05, + "loss": 2.8014, + "step": 31713 + }, + { + "epoch": 0.18861214197354648, + "grad_norm": 2.5864624977111816, + "learning_rate": 4.5738346759045646e-05, + "loss": 3.4543, + "step": 31714 + }, + { + "epoch": 0.1886180892568275, + "grad_norm": 2.3731234073638916, + "learning_rate": 4.5738085901255674e-05, + "loss": 3.2747, + "step": 31715 + }, + { + "epoch": 0.18862403654010848, + "grad_norm": 2.6904380321502686, + "learning_rate": 4.573782503622625e-05, + "loss": 3.3082, + "step": 31716 + }, + { + "epoch": 0.18862998382338947, + "grad_norm": 2.2932288646698, + "learning_rate": 4.573756416395746e-05, + "loss": 3.5169, + "step": 31717 + }, + { + "epoch": 0.18863593110667048, + "grad_norm": 2.7179884910583496, + "learning_rate": 4.573730328444939e-05, + "loss": 3.6862, + "step": 31718 + }, + { + "epoch": 0.18864187838995147, + "grad_norm": 3.153721332550049, + "learning_rate": 4.573704239770216e-05, + "loss": 4.091, + "step": 31719 + }, + { + "epoch": 0.18864782567323246, + "grad_norm": 2.8296713829040527, + "learning_rate": 4.5736781503715844e-05, + "loss": 3.9948, + "step": 31720 + }, + { + "epoch": 0.18865377295651348, + "grad_norm": 2.557539701461792, + "learning_rate": 4.573652060249052e-05, + "loss": 3.7615, + "step": 31721 + }, + { + "epoch": 0.18865972023979447, + "grad_norm": 2.5534744262695312, + "learning_rate": 4.57362596940263e-05, + "loss": 3.1447, + "step": 31722 + }, + { + "epoch": 0.18866566752307545, + "grad_norm": 2.5435099601745605, + "learning_rate": 4.573599877832325e-05, + "loss": 3.0014, + "step": 31723 + }, + { + "epoch": 0.18867161480635647, + "grad_norm": 2.939772605895996, + "learning_rate": 4.573573785538149e-05, + "loss": 4.174, + "step": 31724 + }, + { + "epoch": 0.18867756208963746, + "grad_norm": 2.6355509757995605, + "learning_rate": 4.5735476925201095e-05, + "loss": 3.5901, + "step": 31725 + }, + { + "epoch": 0.18868350937291845, + "grad_norm": 2.7327325344085693, + "learning_rate": 4.5735215987782166e-05, + "loss": 3.6472, + "step": 31726 + }, + { + "epoch": 0.18868945665619946, + "grad_norm": 2.610990285873413, + "learning_rate": 4.5734955043124784e-05, + "loss": 3.8589, + "step": 31727 + }, + { + "epoch": 0.18869540393948045, + "grad_norm": 2.165526866912842, + "learning_rate": 4.573469409122904e-05, + "loss": 3.7147, + "step": 31728 + }, + { + "epoch": 0.18870135122276144, + "grad_norm": 2.7130625247955322, + "learning_rate": 4.573443313209503e-05, + "loss": 3.5475, + "step": 31729 + }, + { + "epoch": 0.18870729850604245, + "grad_norm": 2.849789619445801, + "learning_rate": 4.5734172165722847e-05, + "loss": 2.8693, + "step": 31730 + }, + { + "epoch": 0.18871324578932344, + "grad_norm": 2.405048370361328, + "learning_rate": 4.5733911192112564e-05, + "loss": 2.8687, + "step": 31731 + }, + { + "epoch": 0.18871919307260443, + "grad_norm": 2.5103397369384766, + "learning_rate": 4.5733650211264304e-05, + "loss": 3.4796, + "step": 31732 + }, + { + "epoch": 0.18872514035588542, + "grad_norm": 2.737912654876709, + "learning_rate": 4.573338922317813e-05, + "loss": 3.7991, + "step": 31733 + }, + { + "epoch": 0.18873108763916643, + "grad_norm": 1.9937493801116943, + "learning_rate": 4.573312822785415e-05, + "loss": 4.6986, + "step": 31734 + }, + { + "epoch": 0.18873703492244742, + "grad_norm": 1.9701282978057861, + "learning_rate": 4.5732867225292455e-05, + "loss": 4.6174, + "step": 31735 + }, + { + "epoch": 0.1887429822057284, + "grad_norm": 1.8944740295410156, + "learning_rate": 4.5732606215493116e-05, + "loss": 4.9427, + "step": 31736 + }, + { + "epoch": 0.18874892948900943, + "grad_norm": 1.601288080215454, + "learning_rate": 4.573234519845625e-05, + "loss": 5.2254, + "step": 31737 + }, + { + "epoch": 0.18875487677229041, + "grad_norm": 1.7480894327163696, + "learning_rate": 4.5732084174181936e-05, + "loss": 4.9654, + "step": 31738 + }, + { + "epoch": 0.1887608240555714, + "grad_norm": 2.1990926265716553, + "learning_rate": 4.5731823142670256e-05, + "loss": 4.3537, + "step": 31739 + }, + { + "epoch": 0.18876677133885242, + "grad_norm": 1.7892099618911743, + "learning_rate": 4.573156210392132e-05, + "loss": 4.9544, + "step": 31740 + }, + { + "epoch": 0.1887727186221334, + "grad_norm": 1.6811445951461792, + "learning_rate": 4.57313010579352e-05, + "loss": 4.9715, + "step": 31741 + }, + { + "epoch": 0.1887786659054144, + "grad_norm": 1.6930960416793823, + "learning_rate": 4.5731040004712006e-05, + "loss": 4.8801, + "step": 31742 + }, + { + "epoch": 0.1887846131886954, + "grad_norm": 1.424654245376587, + "learning_rate": 4.573077894425182e-05, + "loss": 5.0213, + "step": 31743 + }, + { + "epoch": 0.1887905604719764, + "grad_norm": 2.3140928745269775, + "learning_rate": 4.573051787655472e-05, + "loss": 4.1046, + "step": 31744 + }, + { + "epoch": 0.1887965077552574, + "grad_norm": 1.9428435564041138, + "learning_rate": 4.573025680162083e-05, + "loss": 4.7393, + "step": 31745 + }, + { + "epoch": 0.1888024550385384, + "grad_norm": 1.5271058082580566, + "learning_rate": 4.572999571945021e-05, + "loss": 4.9096, + "step": 31746 + }, + { + "epoch": 0.1888084023218194, + "grad_norm": 1.8436189889907837, + "learning_rate": 4.5729734630042964e-05, + "loss": 4.5118, + "step": 31747 + }, + { + "epoch": 0.18881434960510038, + "grad_norm": 1.3995059728622437, + "learning_rate": 4.572947353339918e-05, + "loss": 5.3253, + "step": 31748 + }, + { + "epoch": 0.1888202968883814, + "grad_norm": 1.628862738609314, + "learning_rate": 4.572921242951896e-05, + "loss": 5.0558, + "step": 31749 + }, + { + "epoch": 0.18882624417166238, + "grad_norm": 1.7975375652313232, + "learning_rate": 4.572895131840238e-05, + "loss": 4.8898, + "step": 31750 + }, + { + "epoch": 0.18883219145494337, + "grad_norm": 1.7084927558898926, + "learning_rate": 4.572869020004953e-05, + "loss": 5.0099, + "step": 31751 + }, + { + "epoch": 0.1888381387382244, + "grad_norm": 1.518763542175293, + "learning_rate": 4.572842907446052e-05, + "loss": 4.8393, + "step": 31752 + }, + { + "epoch": 0.18884408602150538, + "grad_norm": 1.4125078916549683, + "learning_rate": 4.5728167941635425e-05, + "loss": 4.6985, + "step": 31753 + }, + { + "epoch": 0.18885003330478636, + "grad_norm": 1.5411655902862549, + "learning_rate": 4.572790680157434e-05, + "loss": 4.6634, + "step": 31754 + }, + { + "epoch": 0.18885598058806738, + "grad_norm": 1.5578504800796509, + "learning_rate": 4.572764565427736e-05, + "loss": 4.6396, + "step": 31755 + }, + { + "epoch": 0.18886192787134837, + "grad_norm": 1.4977939128875732, + "learning_rate": 4.572738449974457e-05, + "loss": 4.742, + "step": 31756 + }, + { + "epoch": 0.18886787515462936, + "grad_norm": 1.4126876592636108, + "learning_rate": 4.572712333797606e-05, + "loss": 5.0874, + "step": 31757 + }, + { + "epoch": 0.18887382243791037, + "grad_norm": 1.4195787906646729, + "learning_rate": 4.572686216897194e-05, + "loss": 4.663, + "step": 31758 + }, + { + "epoch": 0.18887976972119136, + "grad_norm": 1.4082183837890625, + "learning_rate": 4.5726600992732274e-05, + "loss": 4.7646, + "step": 31759 + }, + { + "epoch": 0.18888571700447235, + "grad_norm": 1.3189529180526733, + "learning_rate": 4.572633980925717e-05, + "loss": 4.6805, + "step": 31760 + }, + { + "epoch": 0.18889166428775336, + "grad_norm": 1.589459776878357, + "learning_rate": 4.5726078618546707e-05, + "loss": 4.7832, + "step": 31761 + }, + { + "epoch": 0.18889761157103435, + "grad_norm": 1.7522832155227661, + "learning_rate": 4.5725817420600995e-05, + "loss": 4.384, + "step": 31762 + }, + { + "epoch": 0.18890355885431534, + "grad_norm": 1.5586985349655151, + "learning_rate": 4.5725556215420104e-05, + "loss": 4.9273, + "step": 31763 + }, + { + "epoch": 0.18890950613759636, + "grad_norm": 1.8645926713943481, + "learning_rate": 4.572529500300415e-05, + "loss": 4.8043, + "step": 31764 + }, + { + "epoch": 0.18891545342087734, + "grad_norm": 1.742484450340271, + "learning_rate": 4.57250337833532e-05, + "loss": 4.8432, + "step": 31765 + }, + { + "epoch": 0.18892140070415833, + "grad_norm": 1.6944609880447388, + "learning_rate": 4.572477255646736e-05, + "loss": 4.8162, + "step": 31766 + }, + { + "epoch": 0.18892734798743935, + "grad_norm": 1.5811859369277954, + "learning_rate": 4.5724511322346705e-05, + "loss": 4.6349, + "step": 31767 + }, + { + "epoch": 0.18893329527072034, + "grad_norm": 1.6676167249679565, + "learning_rate": 4.572425008099135e-05, + "loss": 4.7742, + "step": 31768 + }, + { + "epoch": 0.18893924255400132, + "grad_norm": 2.0189199447631836, + "learning_rate": 4.5723988832401366e-05, + "loss": 4.5048, + "step": 31769 + }, + { + "epoch": 0.18894518983728234, + "grad_norm": 1.9740796089172363, + "learning_rate": 4.572372757657685e-05, + "loss": 4.7859, + "step": 31770 + }, + { + "epoch": 0.18895113712056333, + "grad_norm": 1.780246615409851, + "learning_rate": 4.57234663135179e-05, + "loss": 4.5585, + "step": 31771 + }, + { + "epoch": 0.18895708440384432, + "grad_norm": 1.744525671005249, + "learning_rate": 4.5723205043224596e-05, + "loss": 4.6835, + "step": 31772 + }, + { + "epoch": 0.18896303168712533, + "grad_norm": 2.0885379314422607, + "learning_rate": 4.572294376569704e-05, + "loss": 4.1156, + "step": 31773 + }, + { + "epoch": 0.18896897897040632, + "grad_norm": 1.7957887649536133, + "learning_rate": 4.572268248093532e-05, + "loss": 4.7995, + "step": 31774 + }, + { + "epoch": 0.1889749262536873, + "grad_norm": 1.7398995161056519, + "learning_rate": 4.572242118893953e-05, + "loss": 4.649, + "step": 31775 + }, + { + "epoch": 0.18898087353696832, + "grad_norm": 1.6801024675369263, + "learning_rate": 4.572215988970974e-05, + "loss": 4.6727, + "step": 31776 + }, + { + "epoch": 0.1889868208202493, + "grad_norm": 1.7167788743972778, + "learning_rate": 4.572189858324607e-05, + "loss": 4.8166, + "step": 31777 + }, + { + "epoch": 0.1889927681035303, + "grad_norm": 1.853050947189331, + "learning_rate": 4.57216372695486e-05, + "loss": 4.6311, + "step": 31778 + }, + { + "epoch": 0.18899871538681132, + "grad_norm": 1.5725040435791016, + "learning_rate": 4.5721375948617416e-05, + "loss": 4.6623, + "step": 31779 + }, + { + "epoch": 0.1890046626700923, + "grad_norm": 1.5537104606628418, + "learning_rate": 4.572111462045261e-05, + "loss": 4.3422, + "step": 31780 + }, + { + "epoch": 0.1890106099533733, + "grad_norm": 1.5853313207626343, + "learning_rate": 4.572085328505429e-05, + "loss": 4.6766, + "step": 31781 + }, + { + "epoch": 0.1890165572366543, + "grad_norm": 1.4046406745910645, + "learning_rate": 4.5720591942422524e-05, + "loss": 4.5923, + "step": 31782 + }, + { + "epoch": 0.1890225045199353, + "grad_norm": 1.8137469291687012, + "learning_rate": 4.5720330592557415e-05, + "loss": 4.4214, + "step": 31783 + }, + { + "epoch": 0.18902845180321629, + "grad_norm": 1.552449107170105, + "learning_rate": 4.572006923545906e-05, + "loss": 4.5052, + "step": 31784 + }, + { + "epoch": 0.1890343990864973, + "grad_norm": 1.7049251794815063, + "learning_rate": 4.571980787112753e-05, + "loss": 4.4893, + "step": 31785 + }, + { + "epoch": 0.1890403463697783, + "grad_norm": 1.8483717441558838, + "learning_rate": 4.5719546499562937e-05, + "loss": 4.7381, + "step": 31786 + }, + { + "epoch": 0.18904629365305928, + "grad_norm": 1.7575819492340088, + "learning_rate": 4.5719285120765363e-05, + "loss": 4.8062, + "step": 31787 + }, + { + "epoch": 0.1890522409363403, + "grad_norm": 1.5546499490737915, + "learning_rate": 4.57190237347349e-05, + "loss": 4.6929, + "step": 31788 + }, + { + "epoch": 0.18905818821962128, + "grad_norm": 1.3272528648376465, + "learning_rate": 4.5718762341471645e-05, + "loss": 4.7948, + "step": 31789 + }, + { + "epoch": 0.18906413550290227, + "grad_norm": 1.6771869659423828, + "learning_rate": 4.571850094097568e-05, + "loss": 5.0822, + "step": 31790 + }, + { + "epoch": 0.18907008278618326, + "grad_norm": 1.6481125354766846, + "learning_rate": 4.57182395332471e-05, + "loss": 4.4177, + "step": 31791 + }, + { + "epoch": 0.18907603006946427, + "grad_norm": 1.8274805545806885, + "learning_rate": 4.5717978118286e-05, + "loss": 4.4042, + "step": 31792 + }, + { + "epoch": 0.18908197735274526, + "grad_norm": 1.6022708415985107, + "learning_rate": 4.5717716696092464e-05, + "loss": 4.3987, + "step": 31793 + }, + { + "epoch": 0.18908792463602625, + "grad_norm": 1.7151497602462769, + "learning_rate": 4.5717455266666586e-05, + "loss": 4.3557, + "step": 31794 + }, + { + "epoch": 0.18909387191930727, + "grad_norm": 1.4646251201629639, + "learning_rate": 4.5717193830008457e-05, + "loss": 4.5261, + "step": 31795 + }, + { + "epoch": 0.18909981920258825, + "grad_norm": 1.6726088523864746, + "learning_rate": 4.571693238611817e-05, + "loss": 4.6804, + "step": 31796 + }, + { + "epoch": 0.18910576648586924, + "grad_norm": 2.105010509490967, + "learning_rate": 4.571667093499583e-05, + "loss": 4.0689, + "step": 31797 + }, + { + "epoch": 0.18911171376915026, + "grad_norm": 1.9176527261734009, + "learning_rate": 4.57164094766415e-05, + "loss": 4.3012, + "step": 31798 + }, + { + "epoch": 0.18911766105243125, + "grad_norm": 1.653403639793396, + "learning_rate": 4.571614801105528e-05, + "loss": 4.7231, + "step": 31799 + }, + { + "epoch": 0.18912360833571223, + "grad_norm": 2.0761914253234863, + "learning_rate": 4.571588653823728e-05, + "loss": 3.5717, + "step": 31800 + }, + { + "epoch": 0.18912955561899325, + "grad_norm": 1.7735234498977661, + "learning_rate": 4.5715625058187574e-05, + "loss": 4.4324, + "step": 31801 + }, + { + "epoch": 0.18913550290227424, + "grad_norm": 1.6627693176269531, + "learning_rate": 4.5715363570906256e-05, + "loss": 4.5788, + "step": 31802 + }, + { + "epoch": 0.18914145018555523, + "grad_norm": 1.888426661491394, + "learning_rate": 4.5715102076393425e-05, + "loss": 4.8467, + "step": 31803 + }, + { + "epoch": 0.18914739746883624, + "grad_norm": 1.6043262481689453, + "learning_rate": 4.5714840574649155e-05, + "loss": 4.974, + "step": 31804 + }, + { + "epoch": 0.18915334475211723, + "grad_norm": 1.7923390865325928, + "learning_rate": 4.5714579065673555e-05, + "loss": 4.9429, + "step": 31805 + }, + { + "epoch": 0.18915929203539822, + "grad_norm": 2.3900370597839355, + "learning_rate": 4.571431754946671e-05, + "loss": 3.9334, + "step": 31806 + }, + { + "epoch": 0.18916523931867923, + "grad_norm": 1.6400319337844849, + "learning_rate": 4.571405602602871e-05, + "loss": 4.6774, + "step": 31807 + }, + { + "epoch": 0.18917118660196022, + "grad_norm": 2.724771738052368, + "learning_rate": 4.571379449535964e-05, + "loss": 3.6422, + "step": 31808 + }, + { + "epoch": 0.1891771338852412, + "grad_norm": 1.6361116170883179, + "learning_rate": 4.5713532957459605e-05, + "loss": 4.7115, + "step": 31809 + }, + { + "epoch": 0.18918308116852223, + "grad_norm": 1.7161823511123657, + "learning_rate": 4.571327141232869e-05, + "loss": 4.6349, + "step": 31810 + }, + { + "epoch": 0.18918902845180322, + "grad_norm": 1.8056199550628662, + "learning_rate": 4.571300985996698e-05, + "loss": 4.3451, + "step": 31811 + }, + { + "epoch": 0.1891949757350842, + "grad_norm": 1.9487394094467163, + "learning_rate": 4.571274830037458e-05, + "loss": 4.3959, + "step": 31812 + }, + { + "epoch": 0.18920092301836522, + "grad_norm": 1.9206527471542358, + "learning_rate": 4.5712486733551574e-05, + "loss": 4.5261, + "step": 31813 + }, + { + "epoch": 0.1892068703016462, + "grad_norm": 1.872023344039917, + "learning_rate": 4.5712225159498046e-05, + "loss": 4.7062, + "step": 31814 + }, + { + "epoch": 0.1892128175849272, + "grad_norm": 2.086467981338501, + "learning_rate": 4.57119635782141e-05, + "loss": 3.7319, + "step": 31815 + }, + { + "epoch": 0.1892187648682082, + "grad_norm": 1.942416787147522, + "learning_rate": 4.571170198969982e-05, + "loss": 4.3991, + "step": 31816 + }, + { + "epoch": 0.1892247121514892, + "grad_norm": 1.8786695003509521, + "learning_rate": 4.5711440393955295e-05, + "loss": 4.9689, + "step": 31817 + }, + { + "epoch": 0.1892306594347702, + "grad_norm": 1.9197100400924683, + "learning_rate": 4.571117879098063e-05, + "loss": 5.0809, + "step": 31818 + }, + { + "epoch": 0.1892366067180512, + "grad_norm": 1.9586657285690308, + "learning_rate": 4.571091718077589e-05, + "loss": 4.5672, + "step": 31819 + }, + { + "epoch": 0.1892425540013322, + "grad_norm": 1.8381383419036865, + "learning_rate": 4.5710655563341196e-05, + "loss": 4.2682, + "step": 31820 + }, + { + "epoch": 0.18924850128461318, + "grad_norm": 1.8966319561004639, + "learning_rate": 4.571039393867662e-05, + "loss": 3.4864, + "step": 31821 + }, + { + "epoch": 0.1892544485678942, + "grad_norm": 1.8893778324127197, + "learning_rate": 4.571013230678226e-05, + "loss": 3.6155, + "step": 31822 + }, + { + "epoch": 0.18926039585117518, + "grad_norm": 1.9687188863754272, + "learning_rate": 4.570987066765821e-05, + "loss": 3.6131, + "step": 31823 + }, + { + "epoch": 0.18926634313445617, + "grad_norm": 1.932376742362976, + "learning_rate": 4.570960902130456e-05, + "loss": 3.6234, + "step": 31824 + }, + { + "epoch": 0.1892722904177372, + "grad_norm": 1.8500068187713623, + "learning_rate": 4.570934736772139e-05, + "loss": 3.5913, + "step": 31825 + }, + { + "epoch": 0.18927823770101818, + "grad_norm": 1.765598177909851, + "learning_rate": 4.570908570690881e-05, + "loss": 3.4619, + "step": 31826 + }, + { + "epoch": 0.18928418498429916, + "grad_norm": 1.8413002490997314, + "learning_rate": 4.57088240388669e-05, + "loss": 3.596, + "step": 31827 + }, + { + "epoch": 0.18929013226758018, + "grad_norm": 1.9876320362091064, + "learning_rate": 4.570856236359575e-05, + "loss": 3.4154, + "step": 31828 + }, + { + "epoch": 0.18929607955086117, + "grad_norm": 1.9374988079071045, + "learning_rate": 4.570830068109546e-05, + "loss": 3.5511, + "step": 31829 + }, + { + "epoch": 0.18930202683414216, + "grad_norm": 1.971796989440918, + "learning_rate": 4.570803899136611e-05, + "loss": 3.4346, + "step": 31830 + }, + { + "epoch": 0.18930797411742317, + "grad_norm": 1.9629862308502197, + "learning_rate": 4.57077772944078e-05, + "loss": 3.7305, + "step": 31831 + }, + { + "epoch": 0.18931392140070416, + "grad_norm": 2.155545234680176, + "learning_rate": 4.5707515590220625e-05, + "loss": 3.9258, + "step": 31832 + }, + { + "epoch": 0.18931986868398515, + "grad_norm": 2.084571123123169, + "learning_rate": 4.5707253878804665e-05, + "loss": 3.5942, + "step": 31833 + }, + { + "epoch": 0.18932581596726616, + "grad_norm": 2.053971529006958, + "learning_rate": 4.570699216016001e-05, + "loss": 3.5606, + "step": 31834 + }, + { + "epoch": 0.18933176325054715, + "grad_norm": 1.9901275634765625, + "learning_rate": 4.570673043428677e-05, + "loss": 3.4579, + "step": 31835 + }, + { + "epoch": 0.18933771053382814, + "grad_norm": 1.846103549003601, + "learning_rate": 4.570646870118502e-05, + "loss": 3.319, + "step": 31836 + }, + { + "epoch": 0.18934365781710916, + "grad_norm": 2.0345115661621094, + "learning_rate": 4.570620696085486e-05, + "loss": 3.4513, + "step": 31837 + }, + { + "epoch": 0.18934960510039014, + "grad_norm": 1.8338862657546997, + "learning_rate": 4.570594521329636e-05, + "loss": 3.5135, + "step": 31838 + }, + { + "epoch": 0.18935555238367113, + "grad_norm": 1.9724763631820679, + "learning_rate": 4.5705683458509646e-05, + "loss": 3.4845, + "step": 31839 + }, + { + "epoch": 0.18936149966695215, + "grad_norm": 1.9579484462738037, + "learning_rate": 4.570542169649479e-05, + "loss": 3.4505, + "step": 31840 + }, + { + "epoch": 0.18936744695023314, + "grad_norm": 1.915587067604065, + "learning_rate": 4.570515992725187e-05, + "loss": 3.4607, + "step": 31841 + }, + { + "epoch": 0.18937339423351413, + "grad_norm": 2.1304988861083984, + "learning_rate": 4.5704898150781004e-05, + "loss": 3.4474, + "step": 31842 + }, + { + "epoch": 0.18937934151679514, + "grad_norm": 1.8973450660705566, + "learning_rate": 4.5704636367082275e-05, + "loss": 3.9997, + "step": 31843 + }, + { + "epoch": 0.18938528880007613, + "grad_norm": 2.0216281414031982, + "learning_rate": 4.570437457615577e-05, + "loss": 3.4405, + "step": 31844 + }, + { + "epoch": 0.18939123608335712, + "grad_norm": 1.8983052968978882, + "learning_rate": 4.5704112778001586e-05, + "loss": 3.5817, + "step": 31845 + }, + { + "epoch": 0.18939718336663813, + "grad_norm": 1.9334758520126343, + "learning_rate": 4.5703850972619796e-05, + "loss": 3.6004, + "step": 31846 + }, + { + "epoch": 0.18940313064991912, + "grad_norm": 1.9281392097473145, + "learning_rate": 4.570358916001052e-05, + "loss": 3.5149, + "step": 31847 + }, + { + "epoch": 0.1894090779332001, + "grad_norm": 2.114772081375122, + "learning_rate": 4.5703327340173826e-05, + "loss": 3.4635, + "step": 31848 + }, + { + "epoch": 0.1894150252164811, + "grad_norm": 1.9354569911956787, + "learning_rate": 4.5703065513109815e-05, + "loss": 3.4728, + "step": 31849 + }, + { + "epoch": 0.1894209724997621, + "grad_norm": 1.9464221000671387, + "learning_rate": 4.570280367881859e-05, + "loss": 3.364, + "step": 31850 + }, + { + "epoch": 0.1894269197830431, + "grad_norm": 1.9195743799209595, + "learning_rate": 4.570254183730021e-05, + "loss": 3.4428, + "step": 31851 + }, + { + "epoch": 0.1894328670663241, + "grad_norm": 1.9214719533920288, + "learning_rate": 4.57022799885548e-05, + "loss": 3.4926, + "step": 31852 + }, + { + "epoch": 0.1894388143496051, + "grad_norm": 1.9174745082855225, + "learning_rate": 4.5702018132582435e-05, + "loss": 3.4202, + "step": 31853 + }, + { + "epoch": 0.1894447616328861, + "grad_norm": 1.8605272769927979, + "learning_rate": 4.57017562693832e-05, + "loss": 3.7625, + "step": 31854 + }, + { + "epoch": 0.18945070891616708, + "grad_norm": 1.8724991083145142, + "learning_rate": 4.5701494398957214e-05, + "loss": 3.9864, + "step": 31855 + }, + { + "epoch": 0.1894566561994481, + "grad_norm": 1.8957018852233887, + "learning_rate": 4.570123252130454e-05, + "loss": 3.4135, + "step": 31856 + }, + { + "epoch": 0.1894626034827291, + "grad_norm": 1.8492218255996704, + "learning_rate": 4.570097063642528e-05, + "loss": 3.4631, + "step": 31857 + }, + { + "epoch": 0.18946855076601007, + "grad_norm": 1.955808401107788, + "learning_rate": 4.570070874431952e-05, + "loss": 3.3076, + "step": 31858 + }, + { + "epoch": 0.1894744980492911, + "grad_norm": 1.8996524810791016, + "learning_rate": 4.570044684498737e-05, + "loss": 3.3948, + "step": 31859 + }, + { + "epoch": 0.18948044533257208, + "grad_norm": 1.89797842502594, + "learning_rate": 4.570018493842889e-05, + "loss": 3.4224, + "step": 31860 + }, + { + "epoch": 0.18948639261585307, + "grad_norm": 1.8745187520980835, + "learning_rate": 4.5699923024644195e-05, + "loss": 3.4154, + "step": 31861 + }, + { + "epoch": 0.18949233989913408, + "grad_norm": 1.9138267040252686, + "learning_rate": 4.569966110363338e-05, + "loss": 3.2955, + "step": 31862 + }, + { + "epoch": 0.18949828718241507, + "grad_norm": 1.970042109489441, + "learning_rate": 4.569939917539652e-05, + "loss": 3.328, + "step": 31863 + }, + { + "epoch": 0.18950423446569606, + "grad_norm": 1.8819622993469238, + "learning_rate": 4.5699137239933716e-05, + "loss": 3.358, + "step": 31864 + }, + { + "epoch": 0.18951018174897707, + "grad_norm": 1.7060779333114624, + "learning_rate": 4.569887529724506e-05, + "loss": 3.3375, + "step": 31865 + }, + { + "epoch": 0.18951612903225806, + "grad_norm": 1.6891839504241943, + "learning_rate": 4.569861334733063e-05, + "loss": 5.0216, + "step": 31866 + }, + { + "epoch": 0.18952207631553905, + "grad_norm": 1.8553059101104736, + "learning_rate": 4.569835139019054e-05, + "loss": 3.5624, + "step": 31867 + }, + { + "epoch": 0.18952802359882007, + "grad_norm": 2.997297763824463, + "learning_rate": 4.569808942582486e-05, + "loss": 4.0015, + "step": 31868 + }, + { + "epoch": 0.18953397088210105, + "grad_norm": 1.9399126768112183, + "learning_rate": 4.56978274542337e-05, + "loss": 3.3589, + "step": 31869 + }, + { + "epoch": 0.18953991816538204, + "grad_norm": 2.066025733947754, + "learning_rate": 4.5697565475417135e-05, + "loss": 3.3023, + "step": 31870 + }, + { + "epoch": 0.18954586544866306, + "grad_norm": 1.9673593044281006, + "learning_rate": 4.5697303489375266e-05, + "loss": 3.1509, + "step": 31871 + }, + { + "epoch": 0.18955181273194405, + "grad_norm": 1.9587528705596924, + "learning_rate": 4.569704149610818e-05, + "loss": 3.4391, + "step": 31872 + }, + { + "epoch": 0.18955776001522504, + "grad_norm": 1.9434003829956055, + "learning_rate": 4.5696779495615974e-05, + "loss": 3.4094, + "step": 31873 + }, + { + "epoch": 0.18956370729850605, + "grad_norm": 2.1255874633789062, + "learning_rate": 4.5696517487898735e-05, + "loss": 3.4675, + "step": 31874 + }, + { + "epoch": 0.18956965458178704, + "grad_norm": 1.951338291168213, + "learning_rate": 4.569625547295655e-05, + "loss": 3.4062, + "step": 31875 + }, + { + "epoch": 0.18957560186506803, + "grad_norm": 1.9367462396621704, + "learning_rate": 4.5695993450789523e-05, + "loss": 3.4532, + "step": 31876 + }, + { + "epoch": 0.18958154914834904, + "grad_norm": 1.9697223901748657, + "learning_rate": 4.5695731421397734e-05, + "loss": 3.3318, + "step": 31877 + }, + { + "epoch": 0.18958749643163003, + "grad_norm": 1.6190401315689087, + "learning_rate": 4.569546938478129e-05, + "loss": 4.3346, + "step": 31878 + }, + { + "epoch": 0.18959344371491102, + "grad_norm": 1.9056912660598755, + "learning_rate": 4.569520734094026e-05, + "loss": 4.6924, + "step": 31879 + }, + { + "epoch": 0.18959939099819204, + "grad_norm": 1.7069121599197388, + "learning_rate": 4.5694945289874744e-05, + "loss": 4.7868, + "step": 31880 + }, + { + "epoch": 0.18960533828147302, + "grad_norm": 1.788473129272461, + "learning_rate": 4.569468323158485e-05, + "loss": 4.4976, + "step": 31881 + }, + { + "epoch": 0.189611285564754, + "grad_norm": 1.6094763278961182, + "learning_rate": 4.569442116607065e-05, + "loss": 4.6732, + "step": 31882 + }, + { + "epoch": 0.18961723284803503, + "grad_norm": 1.8496800661087036, + "learning_rate": 4.569415909333223e-05, + "loss": 4.0664, + "step": 31883 + }, + { + "epoch": 0.18962318013131602, + "grad_norm": 1.9682886600494385, + "learning_rate": 4.5693897013369715e-05, + "loss": 3.4463, + "step": 31884 + }, + { + "epoch": 0.189629127414597, + "grad_norm": 1.6034213304519653, + "learning_rate": 4.569363492618316e-05, + "loss": 4.6424, + "step": 31885 + }, + { + "epoch": 0.18963507469787802, + "grad_norm": 1.7703704833984375, + "learning_rate": 4.5693372831772675e-05, + "loss": 4.5325, + "step": 31886 + }, + { + "epoch": 0.189641021981159, + "grad_norm": 1.7447285652160645, + "learning_rate": 4.569311073013834e-05, + "loss": 4.5242, + "step": 31887 + }, + { + "epoch": 0.18964696926444, + "grad_norm": 1.6660053730010986, + "learning_rate": 4.569284862128026e-05, + "loss": 4.6989, + "step": 31888 + }, + { + "epoch": 0.189652916547721, + "grad_norm": 1.5886887311935425, + "learning_rate": 4.569258650519852e-05, + "loss": 4.6059, + "step": 31889 + }, + { + "epoch": 0.189658863831002, + "grad_norm": 1.530544638633728, + "learning_rate": 4.569232438189321e-05, + "loss": 5.1136, + "step": 31890 + }, + { + "epoch": 0.189664811114283, + "grad_norm": 1.814598560333252, + "learning_rate": 4.569206225136442e-05, + "loss": 4.7673, + "step": 31891 + }, + { + "epoch": 0.189670758397564, + "grad_norm": 1.8687660694122314, + "learning_rate": 4.569180011361225e-05, + "loss": 4.6851, + "step": 31892 + }, + { + "epoch": 0.189676705680845, + "grad_norm": 1.673263669013977, + "learning_rate": 4.569153796863679e-05, + "loss": 4.0894, + "step": 31893 + }, + { + "epoch": 0.18968265296412598, + "grad_norm": 1.7278350591659546, + "learning_rate": 4.569127581643812e-05, + "loss": 4.4977, + "step": 31894 + }, + { + "epoch": 0.189688600247407, + "grad_norm": 1.7827249765396118, + "learning_rate": 4.569101365701635e-05, + "loss": 4.4619, + "step": 31895 + }, + { + "epoch": 0.18969454753068798, + "grad_norm": 1.8192304372787476, + "learning_rate": 4.569075149037155e-05, + "loss": 4.5414, + "step": 31896 + }, + { + "epoch": 0.18970049481396897, + "grad_norm": 1.8950804471969604, + "learning_rate": 4.5690489316503824e-05, + "loss": 4.97, + "step": 31897 + }, + { + "epoch": 0.18970644209725, + "grad_norm": 2.004835605621338, + "learning_rate": 4.5690227135413266e-05, + "loss": 5.0177, + "step": 31898 + }, + { + "epoch": 0.18971238938053098, + "grad_norm": 1.5404337644577026, + "learning_rate": 4.568996494709996e-05, + "loss": 4.8197, + "step": 31899 + }, + { + "epoch": 0.18971833666381197, + "grad_norm": 2.201564073562622, + "learning_rate": 4.5689702751564e-05, + "loss": 4.5455, + "step": 31900 + }, + { + "epoch": 0.18972428394709298, + "grad_norm": 2.286740303039551, + "learning_rate": 4.568944054880549e-05, + "loss": 4.4012, + "step": 31901 + }, + { + "epoch": 0.18973023123037397, + "grad_norm": 1.5117316246032715, + "learning_rate": 4.56891783388245e-05, + "loss": 4.9668, + "step": 31902 + }, + { + "epoch": 0.18973617851365496, + "grad_norm": 1.5675127506256104, + "learning_rate": 4.568891612162113e-05, + "loss": 4.1084, + "step": 31903 + }, + { + "epoch": 0.18974212579693597, + "grad_norm": 1.7535734176635742, + "learning_rate": 4.568865389719548e-05, + "loss": 4.7085, + "step": 31904 + }, + { + "epoch": 0.18974807308021696, + "grad_norm": 1.822056531906128, + "learning_rate": 4.5688391665547624e-05, + "loss": 5.0422, + "step": 31905 + }, + { + "epoch": 0.18975402036349795, + "grad_norm": 2.162489414215088, + "learning_rate": 4.5688129426677675e-05, + "loss": 4.6578, + "step": 31906 + }, + { + "epoch": 0.18975996764677894, + "grad_norm": 1.6846884489059448, + "learning_rate": 4.5687867180585706e-05, + "loss": 4.7004, + "step": 31907 + }, + { + "epoch": 0.18976591493005995, + "grad_norm": 1.652793526649475, + "learning_rate": 4.568760492727182e-05, + "loss": 5.4678, + "step": 31908 + }, + { + "epoch": 0.18977186221334094, + "grad_norm": 1.8100409507751465, + "learning_rate": 4.568734266673611e-05, + "loss": 5.2258, + "step": 31909 + }, + { + "epoch": 0.18977780949662193, + "grad_norm": 1.7055999040603638, + "learning_rate": 4.568708039897865e-05, + "loss": 4.78, + "step": 31910 + }, + { + "epoch": 0.18978375677990295, + "grad_norm": 1.980807900428772, + "learning_rate": 4.568681812399955e-05, + "loss": 5.2426, + "step": 31911 + }, + { + "epoch": 0.18978970406318393, + "grad_norm": 1.5969680547714233, + "learning_rate": 4.56865558417989e-05, + "loss": 4.1172, + "step": 31912 + }, + { + "epoch": 0.18979565134646492, + "grad_norm": 1.5067203044891357, + "learning_rate": 4.5686293552376786e-05, + "loss": 5.0804, + "step": 31913 + }, + { + "epoch": 0.18980159862974594, + "grad_norm": 1.7787573337554932, + "learning_rate": 4.5686031255733295e-05, + "loss": 5.5088, + "step": 31914 + }, + { + "epoch": 0.18980754591302693, + "grad_norm": 1.7157970666885376, + "learning_rate": 4.568576895186853e-05, + "loss": 5.0429, + "step": 31915 + }, + { + "epoch": 0.18981349319630791, + "grad_norm": 1.6739206314086914, + "learning_rate": 4.568550664078257e-05, + "loss": 5.2735, + "step": 31916 + }, + { + "epoch": 0.18981944047958893, + "grad_norm": 1.33918297290802, + "learning_rate": 4.568524432247552e-05, + "loss": 5.3635, + "step": 31917 + }, + { + "epoch": 0.18982538776286992, + "grad_norm": 1.5649034976959229, + "learning_rate": 4.568498199694746e-05, + "loss": 5.4058, + "step": 31918 + }, + { + "epoch": 0.1898313350461509, + "grad_norm": 1.9228683710098267, + "learning_rate": 4.568471966419849e-05, + "loss": 4.801, + "step": 31919 + }, + { + "epoch": 0.18983728232943192, + "grad_norm": 2.49468731880188, + "learning_rate": 4.56844573242287e-05, + "loss": 4.0647, + "step": 31920 + }, + { + "epoch": 0.1898432296127129, + "grad_norm": 1.6524834632873535, + "learning_rate": 4.5684194977038175e-05, + "loss": 4.5686, + "step": 31921 + }, + { + "epoch": 0.1898491768959939, + "grad_norm": 2.149550676345825, + "learning_rate": 4.568393262262701e-05, + "loss": 4.6678, + "step": 31922 + }, + { + "epoch": 0.18985512417927491, + "grad_norm": 1.6779396533966064, + "learning_rate": 4.56836702609953e-05, + "loss": 5.0776, + "step": 31923 + }, + { + "epoch": 0.1898610714625559, + "grad_norm": 1.5314429998397827, + "learning_rate": 4.568340789214314e-05, + "loss": 4.7417, + "step": 31924 + }, + { + "epoch": 0.1898670187458369, + "grad_norm": 2.9489622116088867, + "learning_rate": 4.568314551607061e-05, + "loss": 3.0551, + "step": 31925 + }, + { + "epoch": 0.1898729660291179, + "grad_norm": 2.9115781784057617, + "learning_rate": 4.568288313277781e-05, + "loss": 3.5898, + "step": 31926 + }, + { + "epoch": 0.1898789133123989, + "grad_norm": 2.426448345184326, + "learning_rate": 4.568262074226483e-05, + "loss": 3.763, + "step": 31927 + }, + { + "epoch": 0.18988486059567988, + "grad_norm": 2.463843822479248, + "learning_rate": 4.568235834453176e-05, + "loss": 3.6311, + "step": 31928 + }, + { + "epoch": 0.1898908078789609, + "grad_norm": 2.4178626537323, + "learning_rate": 4.568209593957869e-05, + "loss": 4.0005, + "step": 31929 + }, + { + "epoch": 0.1898967551622419, + "grad_norm": 2.1303043365478516, + "learning_rate": 4.568183352740571e-05, + "loss": 4.7775, + "step": 31930 + }, + { + "epoch": 0.18990270244552288, + "grad_norm": 2.039669990539551, + "learning_rate": 4.568157110801293e-05, + "loss": 4.4208, + "step": 31931 + }, + { + "epoch": 0.1899086497288039, + "grad_norm": 2.6738369464874268, + "learning_rate": 4.568130868140041e-05, + "loss": 3.5947, + "step": 31932 + }, + { + "epoch": 0.18991459701208488, + "grad_norm": 2.8675057888031006, + "learning_rate": 4.5681046247568273e-05, + "loss": 2.9846, + "step": 31933 + }, + { + "epoch": 0.18992054429536587, + "grad_norm": 2.8975415229797363, + "learning_rate": 4.5680783806516595e-05, + "loss": 2.8397, + "step": 31934 + }, + { + "epoch": 0.18992649157864688, + "grad_norm": 2.770543098449707, + "learning_rate": 4.568052135824545e-05, + "loss": 2.5696, + "step": 31935 + }, + { + "epoch": 0.18993243886192787, + "grad_norm": 2.7730634212493896, + "learning_rate": 4.568025890275497e-05, + "loss": 2.6406, + "step": 31936 + }, + { + "epoch": 0.18993838614520886, + "grad_norm": 1.8441970348358154, + "learning_rate": 4.5679996440045216e-05, + "loss": 4.7971, + "step": 31937 + }, + { + "epoch": 0.18994433342848988, + "grad_norm": 2.130506992340088, + "learning_rate": 4.5679733970116293e-05, + "loss": 4.6538, + "step": 31938 + }, + { + "epoch": 0.18995028071177086, + "grad_norm": 3.2489874362945557, + "learning_rate": 4.5679471492968286e-05, + "loss": 3.7206, + "step": 31939 + }, + { + "epoch": 0.18995622799505185, + "grad_norm": 3.3406145572662354, + "learning_rate": 4.567920900860129e-05, + "loss": 3.457, + "step": 31940 + }, + { + "epoch": 0.18996217527833287, + "grad_norm": 3.1997690200805664, + "learning_rate": 4.56789465170154e-05, + "loss": 3.4264, + "step": 31941 + }, + { + "epoch": 0.18996812256161386, + "grad_norm": 3.3533401489257812, + "learning_rate": 4.5678684018210697e-05, + "loss": 3.4097, + "step": 31942 + }, + { + "epoch": 0.18997406984489484, + "grad_norm": 1.8355157375335693, + "learning_rate": 4.5678421512187274e-05, + "loss": 4.9528, + "step": 31943 + }, + { + "epoch": 0.18998001712817586, + "grad_norm": 1.5663495063781738, + "learning_rate": 4.567815899894524e-05, + "loss": 5.2737, + "step": 31944 + }, + { + "epoch": 0.18998596441145685, + "grad_norm": 1.7897334098815918, + "learning_rate": 4.567789647848467e-05, + "loss": 5.2766, + "step": 31945 + }, + { + "epoch": 0.18999191169473784, + "grad_norm": 1.6036760807037354, + "learning_rate": 4.567763395080565e-05, + "loss": 5.1449, + "step": 31946 + }, + { + "epoch": 0.18999785897801885, + "grad_norm": 1.47257661819458, + "learning_rate": 4.56773714159083e-05, + "loss": 5.6391, + "step": 31947 + }, + { + "epoch": 0.19000380626129984, + "grad_norm": 1.5196605920791626, + "learning_rate": 4.567710887379268e-05, + "loss": 5.3161, + "step": 31948 + }, + { + "epoch": 0.19000975354458083, + "grad_norm": 1.9018585681915283, + "learning_rate": 4.56768463244589e-05, + "loss": 4.9531, + "step": 31949 + }, + { + "epoch": 0.19001570082786184, + "grad_norm": 1.56498122215271, + "learning_rate": 4.567658376790704e-05, + "loss": 5.0693, + "step": 31950 + }, + { + "epoch": 0.19002164811114283, + "grad_norm": 1.5698566436767578, + "learning_rate": 4.567632120413721e-05, + "loss": 5.177, + "step": 31951 + }, + { + "epoch": 0.19002759539442382, + "grad_norm": 1.456125259399414, + "learning_rate": 4.5676058633149484e-05, + "loss": 5.145, + "step": 31952 + }, + { + "epoch": 0.19003354267770484, + "grad_norm": 1.3860251903533936, + "learning_rate": 4.5675796054943954e-05, + "loss": 5.24, + "step": 31953 + }, + { + "epoch": 0.19003948996098582, + "grad_norm": 1.4301811456680298, + "learning_rate": 4.567553346952073e-05, + "loss": 5.0656, + "step": 31954 + }, + { + "epoch": 0.1900454372442668, + "grad_norm": 1.8247642517089844, + "learning_rate": 4.567527087687988e-05, + "loss": 4.4608, + "step": 31955 + }, + { + "epoch": 0.19005138452754783, + "grad_norm": 1.5283252000808716, + "learning_rate": 4.567500827702151e-05, + "loss": 4.7095, + "step": 31956 + }, + { + "epoch": 0.19005733181082882, + "grad_norm": 1.5899708271026611, + "learning_rate": 4.567474566994571e-05, + "loss": 5.0809, + "step": 31957 + }, + { + "epoch": 0.1900632790941098, + "grad_norm": 1.4520339965820312, + "learning_rate": 4.567448305565256e-05, + "loss": 5.4298, + "step": 31958 + }, + { + "epoch": 0.19006922637739082, + "grad_norm": 1.658177375793457, + "learning_rate": 4.5674220434142175e-05, + "loss": 5.3595, + "step": 31959 + }, + { + "epoch": 0.1900751736606718, + "grad_norm": 1.4237635135650635, + "learning_rate": 4.5673957805414626e-05, + "loss": 5.2803, + "step": 31960 + }, + { + "epoch": 0.1900811209439528, + "grad_norm": 1.6651546955108643, + "learning_rate": 4.567369516947001e-05, + "loss": 5.2991, + "step": 31961 + }, + { + "epoch": 0.1900870682272338, + "grad_norm": 1.48691987991333, + "learning_rate": 4.5673432526308424e-05, + "loss": 5.4731, + "step": 31962 + }, + { + "epoch": 0.1900930155105148, + "grad_norm": 1.541694164276123, + "learning_rate": 4.5673169875929954e-05, + "loss": 5.7939, + "step": 31963 + }, + { + "epoch": 0.1900989627937958, + "grad_norm": 1.5470638275146484, + "learning_rate": 4.56729072183347e-05, + "loss": 5.5046, + "step": 31964 + }, + { + "epoch": 0.19010491007707678, + "grad_norm": 1.4966381788253784, + "learning_rate": 4.567264455352275e-05, + "loss": 5.5842, + "step": 31965 + }, + { + "epoch": 0.1901108573603578, + "grad_norm": 1.698122262954712, + "learning_rate": 4.5672381881494186e-05, + "loss": 5.3864, + "step": 31966 + }, + { + "epoch": 0.19011680464363878, + "grad_norm": 1.5268694162368774, + "learning_rate": 4.5672119202249104e-05, + "loss": 5.6954, + "step": 31967 + }, + { + "epoch": 0.19012275192691977, + "grad_norm": 1.555295705795288, + "learning_rate": 4.5671856515787606e-05, + "loss": 5.5636, + "step": 31968 + }, + { + "epoch": 0.19012869921020079, + "grad_norm": 2.07952618598938, + "learning_rate": 4.567159382210977e-05, + "loss": 4.9687, + "step": 31969 + }, + { + "epoch": 0.19013464649348177, + "grad_norm": 1.6233636140823364, + "learning_rate": 4.5671331121215696e-05, + "loss": 5.1942, + "step": 31970 + }, + { + "epoch": 0.19014059377676276, + "grad_norm": 1.7472115755081177, + "learning_rate": 4.567106841310548e-05, + "loss": 4.914, + "step": 31971 + }, + { + "epoch": 0.19014654106004378, + "grad_norm": 1.6313844919204712, + "learning_rate": 4.56708056977792e-05, + "loss": 5.0832, + "step": 31972 + }, + { + "epoch": 0.19015248834332477, + "grad_norm": 1.5175881385803223, + "learning_rate": 4.5670542975236965e-05, + "loss": 5.6577, + "step": 31973 + }, + { + "epoch": 0.19015843562660575, + "grad_norm": 1.3097161054611206, + "learning_rate": 4.567028024547885e-05, + "loss": 5.5634, + "step": 31974 + }, + { + "epoch": 0.19016438290988677, + "grad_norm": 1.4037551879882812, + "learning_rate": 4.567001750850495e-05, + "loss": 5.5816, + "step": 31975 + }, + { + "epoch": 0.19017033019316776, + "grad_norm": 1.6129430532455444, + "learning_rate": 4.5669754764315364e-05, + "loss": 5.3667, + "step": 31976 + }, + { + "epoch": 0.19017627747644875, + "grad_norm": 1.309594988822937, + "learning_rate": 4.566949201291018e-05, + "loss": 5.4406, + "step": 31977 + }, + { + "epoch": 0.19018222475972976, + "grad_norm": 1.4947516918182373, + "learning_rate": 4.5669229254289495e-05, + "loss": 4.7619, + "step": 31978 + }, + { + "epoch": 0.19018817204301075, + "grad_norm": 1.9797264337539673, + "learning_rate": 4.5668966488453394e-05, + "loss": 4.7724, + "step": 31979 + }, + { + "epoch": 0.19019411932629174, + "grad_norm": 2.4875199794769287, + "learning_rate": 4.566870371540196e-05, + "loss": 4.6923, + "step": 31980 + }, + { + "epoch": 0.19020006660957275, + "grad_norm": 1.5810712575912476, + "learning_rate": 4.5668440935135305e-05, + "loss": 5.4779, + "step": 31981 + }, + { + "epoch": 0.19020601389285374, + "grad_norm": 1.3546984195709229, + "learning_rate": 4.566817814765351e-05, + "loss": 5.5403, + "step": 31982 + }, + { + "epoch": 0.19021196117613473, + "grad_norm": 1.6855329275131226, + "learning_rate": 4.566791535295666e-05, + "loss": 5.2699, + "step": 31983 + }, + { + "epoch": 0.19021790845941575, + "grad_norm": 1.7854288816452026, + "learning_rate": 4.5667652551044865e-05, + "loss": 5.1219, + "step": 31984 + }, + { + "epoch": 0.19022385574269673, + "grad_norm": 1.589922547340393, + "learning_rate": 4.56673897419182e-05, + "loss": 5.0681, + "step": 31985 + }, + { + "epoch": 0.19022980302597772, + "grad_norm": 1.7833002805709839, + "learning_rate": 4.566712692557677e-05, + "loss": 5.0301, + "step": 31986 + }, + { + "epoch": 0.19023575030925874, + "grad_norm": 1.6957120895385742, + "learning_rate": 4.566686410202064e-05, + "loss": 5.1839, + "step": 31987 + }, + { + "epoch": 0.19024169759253973, + "grad_norm": 1.4636527299880981, + "learning_rate": 4.5666601271249945e-05, + "loss": 4.9968, + "step": 31988 + }, + { + "epoch": 0.19024764487582072, + "grad_norm": 2.6251659393310547, + "learning_rate": 4.566633843326474e-05, + "loss": 4.1913, + "step": 31989 + }, + { + "epoch": 0.19025359215910173, + "grad_norm": 3.022430896759033, + "learning_rate": 4.566607558806513e-05, + "loss": 3.7352, + "step": 31990 + }, + { + "epoch": 0.19025953944238272, + "grad_norm": 2.3903746604919434, + "learning_rate": 4.566581273565122e-05, + "loss": 3.9359, + "step": 31991 + }, + { + "epoch": 0.1902654867256637, + "grad_norm": 1.598952054977417, + "learning_rate": 4.5665549876023076e-05, + "loss": 4.674, + "step": 31992 + }, + { + "epoch": 0.19027143400894472, + "grad_norm": 2.1441328525543213, + "learning_rate": 4.5665287009180796e-05, + "loss": 3.9013, + "step": 31993 + }, + { + "epoch": 0.1902773812922257, + "grad_norm": 1.7473663091659546, + "learning_rate": 4.566502413512449e-05, + "loss": 4.7093, + "step": 31994 + }, + { + "epoch": 0.1902833285755067, + "grad_norm": 1.4611793756484985, + "learning_rate": 4.5664761253854226e-05, + "loss": 5.2113, + "step": 31995 + }, + { + "epoch": 0.19028927585878772, + "grad_norm": 1.7185208797454834, + "learning_rate": 4.566449836537012e-05, + "loss": 4.6621, + "step": 31996 + }, + { + "epoch": 0.1902952231420687, + "grad_norm": 2.710164785385132, + "learning_rate": 4.5664235469672246e-05, + "loss": 3.0121, + "step": 31997 + }, + { + "epoch": 0.1903011704253497, + "grad_norm": 1.7560441493988037, + "learning_rate": 4.5663972566760694e-05, + "loss": 5.158, + "step": 31998 + }, + { + "epoch": 0.1903071177086307, + "grad_norm": 2.3180789947509766, + "learning_rate": 4.566370965663557e-05, + "loss": 3.8676, + "step": 31999 + }, + { + "epoch": 0.1903130649919117, + "grad_norm": 2.8105721473693848, + "learning_rate": 4.5663446739296956e-05, + "loss": 3.9046, + "step": 32000 + }, + { + "epoch": 0.19031901227519268, + "grad_norm": 1.9129337072372437, + "learning_rate": 4.5663183814744946e-05, + "loss": 4.9668, + "step": 32001 + }, + { + "epoch": 0.1903249595584737, + "grad_norm": 2.512235403060913, + "learning_rate": 4.566292088297964e-05, + "loss": 5.2666, + "step": 32002 + }, + { + "epoch": 0.1903309068417547, + "grad_norm": 2.6467642784118652, + "learning_rate": 4.566265794400111e-05, + "loss": 5.1184, + "step": 32003 + }, + { + "epoch": 0.19033685412503568, + "grad_norm": 2.0429317951202393, + "learning_rate": 4.566239499780946e-05, + "loss": 4.6186, + "step": 32004 + }, + { + "epoch": 0.1903428014083167, + "grad_norm": 2.026602029800415, + "learning_rate": 4.566213204440479e-05, + "loss": 2.941, + "step": 32005 + }, + { + "epoch": 0.19034874869159768, + "grad_norm": 2.3440706729888916, + "learning_rate": 4.5661869083787184e-05, + "loss": 2.9276, + "step": 32006 + }, + { + "epoch": 0.19035469597487867, + "grad_norm": 1.5349546670913696, + "learning_rate": 4.566160611595673e-05, + "loss": 5.0741, + "step": 32007 + }, + { + "epoch": 0.19036064325815968, + "grad_norm": 1.4628055095672607, + "learning_rate": 4.566134314091352e-05, + "loss": 5.2766, + "step": 32008 + }, + { + "epoch": 0.19036659054144067, + "grad_norm": 1.768277645111084, + "learning_rate": 4.566108015865765e-05, + "loss": 5.1867, + "step": 32009 + }, + { + "epoch": 0.19037253782472166, + "grad_norm": 1.2792719602584839, + "learning_rate": 4.566081716918921e-05, + "loss": 5.1267, + "step": 32010 + }, + { + "epoch": 0.19037848510800268, + "grad_norm": 1.5036875009536743, + "learning_rate": 4.56605541725083e-05, + "loss": 5.2554, + "step": 32011 + }, + { + "epoch": 0.19038443239128366, + "grad_norm": 1.3368226289749146, + "learning_rate": 4.566029116861499e-05, + "loss": 5.2237, + "step": 32012 + }, + { + "epoch": 0.19039037967456465, + "grad_norm": 1.3333531618118286, + "learning_rate": 4.566002815750939e-05, + "loss": 5.2719, + "step": 32013 + }, + { + "epoch": 0.19039632695784567, + "grad_norm": 1.5420610904693604, + "learning_rate": 4.565976513919159e-05, + "loss": 5.2795, + "step": 32014 + }, + { + "epoch": 0.19040227424112666, + "grad_norm": 1.6228092908859253, + "learning_rate": 4.565950211366168e-05, + "loss": 5.6646, + "step": 32015 + }, + { + "epoch": 0.19040822152440764, + "grad_norm": 1.671846866607666, + "learning_rate": 4.5659239080919746e-05, + "loss": 5.5739, + "step": 32016 + }, + { + "epoch": 0.19041416880768866, + "grad_norm": 1.377333641052246, + "learning_rate": 4.5658976040965893e-05, + "loss": 5.5736, + "step": 32017 + }, + { + "epoch": 0.19042011609096965, + "grad_norm": 1.5366264581680298, + "learning_rate": 4.56587129938002e-05, + "loss": 5.182, + "step": 32018 + }, + { + "epoch": 0.19042606337425064, + "grad_norm": 1.6668857336044312, + "learning_rate": 4.5658449939422765e-05, + "loss": 4.9602, + "step": 32019 + }, + { + "epoch": 0.19043201065753165, + "grad_norm": 1.5666422843933105, + "learning_rate": 4.565818687783368e-05, + "loss": 5.1586, + "step": 32020 + }, + { + "epoch": 0.19043795794081264, + "grad_norm": 1.2899186611175537, + "learning_rate": 4.5657923809033035e-05, + "loss": 5.1735, + "step": 32021 + }, + { + "epoch": 0.19044390522409363, + "grad_norm": 1.7876954078674316, + "learning_rate": 4.565766073302092e-05, + "loss": 5.697, + "step": 32022 + }, + { + "epoch": 0.19044985250737465, + "grad_norm": 1.9547079801559448, + "learning_rate": 4.565739764979743e-05, + "loss": 5.6536, + "step": 32023 + }, + { + "epoch": 0.19045579979065563, + "grad_norm": 2.118058204650879, + "learning_rate": 4.5657134559362655e-05, + "loss": 4.2402, + "step": 32024 + }, + { + "epoch": 0.19046174707393662, + "grad_norm": 1.5718128681182861, + "learning_rate": 4.565687146171669e-05, + "loss": 5.0162, + "step": 32025 + }, + { + "epoch": 0.1904676943572176, + "grad_norm": 1.668532371520996, + "learning_rate": 4.5656608356859624e-05, + "loss": 4.8103, + "step": 32026 + }, + { + "epoch": 0.19047364164049863, + "grad_norm": 1.6259323358535767, + "learning_rate": 4.5656345244791554e-05, + "loss": 5.1993, + "step": 32027 + }, + { + "epoch": 0.1904795889237796, + "grad_norm": 1.5034635066986084, + "learning_rate": 4.565608212551256e-05, + "loss": 5.0077, + "step": 32028 + }, + { + "epoch": 0.1904855362070606, + "grad_norm": 1.434511423110962, + "learning_rate": 4.565581899902274e-05, + "loss": 5.1024, + "step": 32029 + }, + { + "epoch": 0.19049148349034162, + "grad_norm": 1.8334324359893799, + "learning_rate": 4.565555586532219e-05, + "loss": 5.266, + "step": 32030 + }, + { + "epoch": 0.1904974307736226, + "grad_norm": 1.622114896774292, + "learning_rate": 4.5655292724411004e-05, + "loss": 5.066, + "step": 32031 + }, + { + "epoch": 0.1905033780569036, + "grad_norm": 1.6296263933181763, + "learning_rate": 4.565502957628926e-05, + "loss": 4.9455, + "step": 32032 + }, + { + "epoch": 0.1905093253401846, + "grad_norm": 1.504682183265686, + "learning_rate": 4.565476642095706e-05, + "loss": 4.9339, + "step": 32033 + }, + { + "epoch": 0.1905152726234656, + "grad_norm": 1.6852915287017822, + "learning_rate": 4.56545032584145e-05, + "loss": 5.3419, + "step": 32034 + }, + { + "epoch": 0.1905212199067466, + "grad_norm": 1.54212486743927, + "learning_rate": 4.565424008866166e-05, + "loss": 4.9588, + "step": 32035 + }, + { + "epoch": 0.1905271671900276, + "grad_norm": 1.5721091032028198, + "learning_rate": 4.565397691169865e-05, + "loss": 4.9884, + "step": 32036 + }, + { + "epoch": 0.1905331144733086, + "grad_norm": 1.6846574544906616, + "learning_rate": 4.565371372752554e-05, + "loss": 5.0185, + "step": 32037 + }, + { + "epoch": 0.19053906175658958, + "grad_norm": 1.4917422533035278, + "learning_rate": 4.565345053614243e-05, + "loss": 5.0957, + "step": 32038 + }, + { + "epoch": 0.1905450090398706, + "grad_norm": 1.6241521835327148, + "learning_rate": 4.565318733754942e-05, + "loss": 4.9592, + "step": 32039 + }, + { + "epoch": 0.19055095632315158, + "grad_norm": 1.6633590459823608, + "learning_rate": 4.565292413174659e-05, + "loss": 5.1149, + "step": 32040 + }, + { + "epoch": 0.19055690360643257, + "grad_norm": 1.4443227052688599, + "learning_rate": 4.565266091873404e-05, + "loss": 5.3016, + "step": 32041 + }, + { + "epoch": 0.1905628508897136, + "grad_norm": 1.7696523666381836, + "learning_rate": 4.565239769851186e-05, + "loss": 5.3079, + "step": 32042 + }, + { + "epoch": 0.19056879817299457, + "grad_norm": 1.5043975114822388, + "learning_rate": 4.565213447108014e-05, + "loss": 5.2658, + "step": 32043 + }, + { + "epoch": 0.19057474545627556, + "grad_norm": 1.4827871322631836, + "learning_rate": 4.565187123643898e-05, + "loss": 5.2462, + "step": 32044 + }, + { + "epoch": 0.19058069273955658, + "grad_norm": 1.2935054302215576, + "learning_rate": 4.565160799458845e-05, + "loss": 5.2094, + "step": 32045 + }, + { + "epoch": 0.19058664002283757, + "grad_norm": 1.7372486591339111, + "learning_rate": 4.565134474552867e-05, + "loss": 4.6756, + "step": 32046 + }, + { + "epoch": 0.19059258730611855, + "grad_norm": 1.8113619089126587, + "learning_rate": 4.565108148925972e-05, + "loss": 4.3151, + "step": 32047 + }, + { + "epoch": 0.19059853458939957, + "grad_norm": 2.1872177124023438, + "learning_rate": 4.565081822578168e-05, + "loss": 4.2009, + "step": 32048 + }, + { + "epoch": 0.19060448187268056, + "grad_norm": 2.375410556793213, + "learning_rate": 4.565055495509466e-05, + "loss": 4.2222, + "step": 32049 + }, + { + "epoch": 0.19061042915596155, + "grad_norm": 2.441967010498047, + "learning_rate": 4.565029167719874e-05, + "loss": 4.0724, + "step": 32050 + }, + { + "epoch": 0.19061637643924256, + "grad_norm": 1.6841249465942383, + "learning_rate": 4.5650028392094026e-05, + "loss": 4.8674, + "step": 32051 + }, + { + "epoch": 0.19062232372252355, + "grad_norm": 1.6137785911560059, + "learning_rate": 4.564976509978059e-05, + "loss": 5.457, + "step": 32052 + }, + { + "epoch": 0.19062827100580454, + "grad_norm": 1.6453704833984375, + "learning_rate": 4.564950180025854e-05, + "loss": 5.3921, + "step": 32053 + }, + { + "epoch": 0.19063421828908556, + "grad_norm": 1.6670149564743042, + "learning_rate": 4.564923849352796e-05, + "loss": 4.8336, + "step": 32054 + }, + { + "epoch": 0.19064016557236654, + "grad_norm": 1.7052969932556152, + "learning_rate": 4.564897517958895e-05, + "loss": 5.4213, + "step": 32055 + }, + { + "epoch": 0.19064611285564753, + "grad_norm": 1.4670642614364624, + "learning_rate": 4.564871185844159e-05, + "loss": 5.3312, + "step": 32056 + }, + { + "epoch": 0.19065206013892855, + "grad_norm": 1.2755639553070068, + "learning_rate": 4.564844853008598e-05, + "loss": 5.3718, + "step": 32057 + }, + { + "epoch": 0.19065800742220954, + "grad_norm": 1.7414531707763672, + "learning_rate": 4.564818519452221e-05, + "loss": 5.2937, + "step": 32058 + }, + { + "epoch": 0.19066395470549052, + "grad_norm": 1.677453637123108, + "learning_rate": 4.564792185175037e-05, + "loss": 5.3738, + "step": 32059 + }, + { + "epoch": 0.19066990198877154, + "grad_norm": 1.7875051498413086, + "learning_rate": 4.564765850177056e-05, + "loss": 5.082, + "step": 32060 + }, + { + "epoch": 0.19067584927205253, + "grad_norm": 1.8616997003555298, + "learning_rate": 4.564739514458286e-05, + "loss": 4.6828, + "step": 32061 + }, + { + "epoch": 0.19068179655533352, + "grad_norm": 1.7642920017242432, + "learning_rate": 4.564713178018737e-05, + "loss": 4.5539, + "step": 32062 + }, + { + "epoch": 0.19068774383861453, + "grad_norm": 1.6018472909927368, + "learning_rate": 4.5646868408584175e-05, + "loss": 5.2847, + "step": 32063 + }, + { + "epoch": 0.19069369112189552, + "grad_norm": 1.6408270597457886, + "learning_rate": 4.564660502977337e-05, + "loss": 5.3356, + "step": 32064 + }, + { + "epoch": 0.1906996384051765, + "grad_norm": 1.9505417346954346, + "learning_rate": 4.564634164375505e-05, + "loss": 4.6428, + "step": 32065 + }, + { + "epoch": 0.19070558568845752, + "grad_norm": 2.5392873287200928, + "learning_rate": 4.564607825052931e-05, + "loss": 3.5372, + "step": 32066 + }, + { + "epoch": 0.1907115329717385, + "grad_norm": 3.086822509765625, + "learning_rate": 4.564581485009623e-05, + "loss": 4.4435, + "step": 32067 + }, + { + "epoch": 0.1907174802550195, + "grad_norm": 2.6055800914764404, + "learning_rate": 4.564555144245592e-05, + "loss": 4.5866, + "step": 32068 + }, + { + "epoch": 0.19072342753830052, + "grad_norm": 1.843562126159668, + "learning_rate": 4.5645288027608465e-05, + "loss": 5.2971, + "step": 32069 + }, + { + "epoch": 0.1907293748215815, + "grad_norm": 1.7707334756851196, + "learning_rate": 4.5645024605553944e-05, + "loss": 5.6213, + "step": 32070 + }, + { + "epoch": 0.1907353221048625, + "grad_norm": 1.7816311120986938, + "learning_rate": 4.564476117629245e-05, + "loss": 5.2668, + "step": 32071 + }, + { + "epoch": 0.1907412693881435, + "grad_norm": 1.5548468828201294, + "learning_rate": 4.56444977398241e-05, + "loss": 5.2673, + "step": 32072 + }, + { + "epoch": 0.1907472166714245, + "grad_norm": 1.581838607788086, + "learning_rate": 4.5644234296148955e-05, + "loss": 5.0574, + "step": 32073 + }, + { + "epoch": 0.19075316395470548, + "grad_norm": 1.596366047859192, + "learning_rate": 4.564397084526714e-05, + "loss": 5.0789, + "step": 32074 + }, + { + "epoch": 0.1907591112379865, + "grad_norm": 1.4851096868515015, + "learning_rate": 4.564370738717871e-05, + "loss": 5.5059, + "step": 32075 + }, + { + "epoch": 0.1907650585212675, + "grad_norm": 1.7206519842147827, + "learning_rate": 4.564344392188378e-05, + "loss": 5.1603, + "step": 32076 + }, + { + "epoch": 0.19077100580454848, + "grad_norm": 1.7164605855941772, + "learning_rate": 4.5643180449382436e-05, + "loss": 5.2737, + "step": 32077 + }, + { + "epoch": 0.1907769530878295, + "grad_norm": 1.7038495540618896, + "learning_rate": 4.564291696967477e-05, + "loss": 5.3388, + "step": 32078 + }, + { + "epoch": 0.19078290037111048, + "grad_norm": 1.5821107625961304, + "learning_rate": 4.564265348276088e-05, + "loss": 5.3052, + "step": 32079 + }, + { + "epoch": 0.19078884765439147, + "grad_norm": 1.6088097095489502, + "learning_rate": 4.564238998864085e-05, + "loss": 5.373, + "step": 32080 + }, + { + "epoch": 0.19079479493767248, + "grad_norm": 1.8086316585540771, + "learning_rate": 4.564212648731477e-05, + "loss": 5.2465, + "step": 32081 + }, + { + "epoch": 0.19080074222095347, + "grad_norm": 1.8377625942230225, + "learning_rate": 4.5641862978782746e-05, + "loss": 5.4968, + "step": 32082 + }, + { + "epoch": 0.19080668950423446, + "grad_norm": 1.7471630573272705, + "learning_rate": 4.564159946304486e-05, + "loss": 5.2868, + "step": 32083 + }, + { + "epoch": 0.19081263678751545, + "grad_norm": 1.4050308465957642, + "learning_rate": 4.5641335940101196e-05, + "loss": 5.3825, + "step": 32084 + }, + { + "epoch": 0.19081858407079647, + "grad_norm": 1.4709471464157104, + "learning_rate": 4.564107240995187e-05, + "loss": 5.273, + "step": 32085 + }, + { + "epoch": 0.19082453135407745, + "grad_norm": 1.5411866903305054, + "learning_rate": 4.5640808872596944e-05, + "loss": 5.3013, + "step": 32086 + }, + { + "epoch": 0.19083047863735844, + "grad_norm": 1.7834669351577759, + "learning_rate": 4.5640545328036536e-05, + "loss": 4.7055, + "step": 32087 + }, + { + "epoch": 0.19083642592063946, + "grad_norm": 1.9448041915893555, + "learning_rate": 4.564028177627072e-05, + "loss": 4.9035, + "step": 32088 + }, + { + "epoch": 0.19084237320392045, + "grad_norm": 1.5890318155288696, + "learning_rate": 4.5640018217299593e-05, + "loss": 5.2144, + "step": 32089 + }, + { + "epoch": 0.19084832048720143, + "grad_norm": 1.7338842153549194, + "learning_rate": 4.563975465112325e-05, + "loss": 5.435, + "step": 32090 + }, + { + "epoch": 0.19085426777048245, + "grad_norm": 1.6438137292861938, + "learning_rate": 4.5639491077741786e-05, + "loss": 5.39, + "step": 32091 + }, + { + "epoch": 0.19086021505376344, + "grad_norm": 1.5553499460220337, + "learning_rate": 4.563922749715529e-05, + "loss": 5.4373, + "step": 32092 + }, + { + "epoch": 0.19086616233704443, + "grad_norm": 2.2094454765319824, + "learning_rate": 4.5638963909363845e-05, + "loss": 3.9242, + "step": 32093 + }, + { + "epoch": 0.19087210962032544, + "grad_norm": 2.6177477836608887, + "learning_rate": 4.563870031436756e-05, + "loss": 3.6645, + "step": 32094 + }, + { + "epoch": 0.19087805690360643, + "grad_norm": 2.5494978427886963, + "learning_rate": 4.563843671216651e-05, + "loss": 3.6596, + "step": 32095 + }, + { + "epoch": 0.19088400418688742, + "grad_norm": 2.483102798461914, + "learning_rate": 4.56381731027608e-05, + "loss": 3.7689, + "step": 32096 + }, + { + "epoch": 0.19088995147016843, + "grad_norm": 2.2079670429229736, + "learning_rate": 4.563790948615052e-05, + "loss": 3.599, + "step": 32097 + }, + { + "epoch": 0.19089589875344942, + "grad_norm": 2.243823766708374, + "learning_rate": 4.563764586233575e-05, + "loss": 4.1006, + "step": 32098 + }, + { + "epoch": 0.1909018460367304, + "grad_norm": 2.53912091255188, + "learning_rate": 4.5637382231316595e-05, + "loss": 3.9895, + "step": 32099 + }, + { + "epoch": 0.19090779332001143, + "grad_norm": 3.3844449520111084, + "learning_rate": 4.563711859309314e-05, + "loss": 3.0281, + "step": 32100 + }, + { + "epoch": 0.19091374060329241, + "grad_norm": 3.253937244415283, + "learning_rate": 4.563685494766549e-05, + "loss": 2.2503, + "step": 32101 + }, + { + "epoch": 0.1909196878865734, + "grad_norm": 2.649468183517456, + "learning_rate": 4.5636591295033723e-05, + "loss": 2.2933, + "step": 32102 + }, + { + "epoch": 0.19092563516985442, + "grad_norm": 2.899502992630005, + "learning_rate": 4.563632763519793e-05, + "loss": 3.5769, + "step": 32103 + }, + { + "epoch": 0.1909315824531354, + "grad_norm": 2.6918673515319824, + "learning_rate": 4.563606396815821e-05, + "loss": 3.7763, + "step": 32104 + }, + { + "epoch": 0.1909375297364164, + "grad_norm": 2.239382028579712, + "learning_rate": 4.5635800293914654e-05, + "loss": 5.3445, + "step": 32105 + }, + { + "epoch": 0.1909434770196974, + "grad_norm": 3.2034034729003906, + "learning_rate": 4.563553661246736e-05, + "loss": 3.0084, + "step": 32106 + }, + { + "epoch": 0.1909494243029784, + "grad_norm": 3.6095571517944336, + "learning_rate": 4.5635272923816406e-05, + "loss": 3.1999, + "step": 32107 + }, + { + "epoch": 0.1909553715862594, + "grad_norm": 2.625148057937622, + "learning_rate": 4.5635009227961886e-05, + "loss": 2.4227, + "step": 32108 + }, + { + "epoch": 0.1909613188695404, + "grad_norm": 2.8703715801239014, + "learning_rate": 4.563474552490391e-05, + "loss": 2.4958, + "step": 32109 + }, + { + "epoch": 0.1909672661528214, + "grad_norm": 2.1197023391723633, + "learning_rate": 4.563448181464255e-05, + "loss": 4.5549, + "step": 32110 + }, + { + "epoch": 0.19097321343610238, + "grad_norm": 2.1457529067993164, + "learning_rate": 4.563421809717791e-05, + "loss": 5.0002, + "step": 32111 + }, + { + "epoch": 0.1909791607193834, + "grad_norm": 1.7680915594100952, + "learning_rate": 4.563395437251007e-05, + "loss": 5.9225, + "step": 32112 + }, + { + "epoch": 0.19098510800266438, + "grad_norm": 1.8019880056381226, + "learning_rate": 4.5633690640639135e-05, + "loss": 5.5489, + "step": 32113 + }, + { + "epoch": 0.19099105528594537, + "grad_norm": 2.032569408416748, + "learning_rate": 4.563342690156519e-05, + "loss": 4.7542, + "step": 32114 + }, + { + "epoch": 0.1909970025692264, + "grad_norm": 2.230560541152954, + "learning_rate": 4.563316315528834e-05, + "loss": 4.009, + "step": 32115 + }, + { + "epoch": 0.19100294985250738, + "grad_norm": 1.82839834690094, + "learning_rate": 4.563289940180865e-05, + "loss": 4.3989, + "step": 32116 + }, + { + "epoch": 0.19100889713578836, + "grad_norm": 2.581059694290161, + "learning_rate": 4.5632635641126234e-05, + "loss": 3.6806, + "step": 32117 + }, + { + "epoch": 0.19101484441906938, + "grad_norm": 3.031672716140747, + "learning_rate": 4.563237187324118e-05, + "loss": 3.1234, + "step": 32118 + }, + { + "epoch": 0.19102079170235037, + "grad_norm": 2.618824005126953, + "learning_rate": 4.5632108098153576e-05, + "loss": 3.2917, + "step": 32119 + }, + { + "epoch": 0.19102673898563136, + "grad_norm": 2.6035311222076416, + "learning_rate": 4.563184431586351e-05, + "loss": 3.8774, + "step": 32120 + }, + { + "epoch": 0.19103268626891237, + "grad_norm": 2.398284673690796, + "learning_rate": 4.56315805263711e-05, + "loss": 3.6489, + "step": 32121 + }, + { + "epoch": 0.19103863355219336, + "grad_norm": 1.7356419563293457, + "learning_rate": 4.56313167296764e-05, + "loss": 4.4262, + "step": 32122 + }, + { + "epoch": 0.19104458083547435, + "grad_norm": 2.066359281539917, + "learning_rate": 4.5631052925779526e-05, + "loss": 5.0187, + "step": 32123 + }, + { + "epoch": 0.19105052811875536, + "grad_norm": 1.9970104694366455, + "learning_rate": 4.563078911468056e-05, + "loss": 5.2749, + "step": 32124 + }, + { + "epoch": 0.19105647540203635, + "grad_norm": 1.5883231163024902, + "learning_rate": 4.5630525296379604e-05, + "loss": 5.1499, + "step": 32125 + }, + { + "epoch": 0.19106242268531734, + "grad_norm": 1.4975897073745728, + "learning_rate": 4.5630261470876745e-05, + "loss": 5.2505, + "step": 32126 + }, + { + "epoch": 0.19106836996859836, + "grad_norm": 1.7211848497390747, + "learning_rate": 4.562999763817207e-05, + "loss": 5.2076, + "step": 32127 + }, + { + "epoch": 0.19107431725187934, + "grad_norm": 2.7472379207611084, + "learning_rate": 4.562973379826568e-05, + "loss": 3.2447, + "step": 32128 + }, + { + "epoch": 0.19108026453516033, + "grad_norm": 1.8915821313858032, + "learning_rate": 4.5629469951157667e-05, + "loss": 4.7631, + "step": 32129 + }, + { + "epoch": 0.19108621181844135, + "grad_norm": 2.09493350982666, + "learning_rate": 4.562920609684812e-05, + "loss": 5.094, + "step": 32130 + }, + { + "epoch": 0.19109215910172234, + "grad_norm": 2.0612680912017822, + "learning_rate": 4.562894223533712e-05, + "loss": 4.9167, + "step": 32131 + }, + { + "epoch": 0.19109810638500332, + "grad_norm": 2.0020735263824463, + "learning_rate": 4.562867836662478e-05, + "loss": 4.8969, + "step": 32132 + }, + { + "epoch": 0.19110405366828434, + "grad_norm": 1.935276985168457, + "learning_rate": 4.562841449071117e-05, + "loss": 5.1707, + "step": 32133 + }, + { + "epoch": 0.19111000095156533, + "grad_norm": 1.9530506134033203, + "learning_rate": 4.56281506075964e-05, + "loss": 4.84, + "step": 32134 + }, + { + "epoch": 0.19111594823484632, + "grad_norm": 1.9572010040283203, + "learning_rate": 4.5627886717280557e-05, + "loss": 4.5848, + "step": 32135 + }, + { + "epoch": 0.19112189551812733, + "grad_norm": 1.5390974283218384, + "learning_rate": 4.562762281976373e-05, + "loss": 4.8283, + "step": 32136 + }, + { + "epoch": 0.19112784280140832, + "grad_norm": 1.5366077423095703, + "learning_rate": 4.5627358915046015e-05, + "loss": 4.7778, + "step": 32137 + }, + { + "epoch": 0.1911337900846893, + "grad_norm": 2.3752942085266113, + "learning_rate": 4.56270950031275e-05, + "loss": 4.7363, + "step": 32138 + }, + { + "epoch": 0.19113973736797032, + "grad_norm": 2.1061747074127197, + "learning_rate": 4.562683108400828e-05, + "loss": 4.9934, + "step": 32139 + }, + { + "epoch": 0.1911456846512513, + "grad_norm": 1.8647900819778442, + "learning_rate": 4.562656715768844e-05, + "loss": 4.6105, + "step": 32140 + }, + { + "epoch": 0.1911516319345323, + "grad_norm": 3.472999334335327, + "learning_rate": 4.5626303224168085e-05, + "loss": 3.5257, + "step": 32141 + }, + { + "epoch": 0.1911575792178133, + "grad_norm": 3.206305503845215, + "learning_rate": 4.562603928344731e-05, + "loss": 3.2121, + "step": 32142 + }, + { + "epoch": 0.1911635265010943, + "grad_norm": 2.549485683441162, + "learning_rate": 4.562577533552618e-05, + "loss": 4.0823, + "step": 32143 + }, + { + "epoch": 0.1911694737843753, + "grad_norm": 2.314005136489868, + "learning_rate": 4.5625511380404816e-05, + "loss": 4.4944, + "step": 32144 + }, + { + "epoch": 0.19117542106765628, + "grad_norm": 1.9643429517745972, + "learning_rate": 4.562524741808329e-05, + "loss": 4.4158, + "step": 32145 + }, + { + "epoch": 0.1911813683509373, + "grad_norm": 3.2631421089172363, + "learning_rate": 4.562498344856172e-05, + "loss": 2.6561, + "step": 32146 + }, + { + "epoch": 0.19118731563421829, + "grad_norm": 2.985316514968872, + "learning_rate": 4.5624719471840166e-05, + "loss": 2.5877, + "step": 32147 + }, + { + "epoch": 0.19119326291749927, + "grad_norm": 2.864422559738159, + "learning_rate": 4.562445548791874e-05, + "loss": 3.0388, + "step": 32148 + }, + { + "epoch": 0.1911992102007803, + "grad_norm": 3.315479040145874, + "learning_rate": 4.562419149679753e-05, + "loss": 2.1148, + "step": 32149 + }, + { + "epoch": 0.19120515748406128, + "grad_norm": 2.9864017963409424, + "learning_rate": 4.562392749847663e-05, + "loss": 2.0678, + "step": 32150 + }, + { + "epoch": 0.19121110476734227, + "grad_norm": 2.3070502281188965, + "learning_rate": 4.562366349295613e-05, + "loss": 3.8237, + "step": 32151 + }, + { + "epoch": 0.19121705205062328, + "grad_norm": 3.103196620941162, + "learning_rate": 4.562339948023611e-05, + "loss": 2.2358, + "step": 32152 + }, + { + "epoch": 0.19122299933390427, + "grad_norm": 3.050037384033203, + "learning_rate": 4.562313546031669e-05, + "loss": 2.3102, + "step": 32153 + }, + { + "epoch": 0.19122894661718526, + "grad_norm": 3.0590415000915527, + "learning_rate": 4.562287143319794e-05, + "loss": 2.6283, + "step": 32154 + }, + { + "epoch": 0.19123489390046627, + "grad_norm": 3.2240819931030273, + "learning_rate": 4.5622607398879956e-05, + "loss": 1.8508, + "step": 32155 + }, + { + "epoch": 0.19124084118374726, + "grad_norm": 3.0105197429656982, + "learning_rate": 4.562234335736284e-05, + "loss": 2.0124, + "step": 32156 + }, + { + "epoch": 0.19124678846702825, + "grad_norm": 2.1753182411193848, + "learning_rate": 4.562207930864667e-05, + "loss": 4.0, + "step": 32157 + }, + { + "epoch": 0.19125273575030927, + "grad_norm": 1.8794794082641602, + "learning_rate": 4.562181525273155e-05, + "loss": 4.8347, + "step": 32158 + }, + { + "epoch": 0.19125868303359025, + "grad_norm": 1.8856089115142822, + "learning_rate": 4.5621551189617564e-05, + "loss": 4.8426, + "step": 32159 + }, + { + "epoch": 0.19126463031687124, + "grad_norm": 1.557360291481018, + "learning_rate": 4.562128711930481e-05, + "loss": 5.9689, + "step": 32160 + }, + { + "epoch": 0.19127057760015226, + "grad_norm": 1.6035491228103638, + "learning_rate": 4.562102304179338e-05, + "loss": 5.5338, + "step": 32161 + }, + { + "epoch": 0.19127652488343325, + "grad_norm": 1.6492342948913574, + "learning_rate": 4.562075895708335e-05, + "loss": 5.1785, + "step": 32162 + }, + { + "epoch": 0.19128247216671423, + "grad_norm": 1.8426982164382935, + "learning_rate": 4.5620494865174846e-05, + "loss": 4.9595, + "step": 32163 + }, + { + "epoch": 0.19128841944999525, + "grad_norm": 1.4188711643218994, + "learning_rate": 4.562023076606793e-05, + "loss": 5.0459, + "step": 32164 + }, + { + "epoch": 0.19129436673327624, + "grad_norm": 1.7565912008285522, + "learning_rate": 4.5619966659762705e-05, + "loss": 4.7912, + "step": 32165 + }, + { + "epoch": 0.19130031401655723, + "grad_norm": 1.7340164184570312, + "learning_rate": 4.561970254625926e-05, + "loss": 5.7673, + "step": 32166 + }, + { + "epoch": 0.19130626129983824, + "grad_norm": 1.6327145099639893, + "learning_rate": 4.561943842555769e-05, + "loss": 5.7126, + "step": 32167 + }, + { + "epoch": 0.19131220858311923, + "grad_norm": 1.762123703956604, + "learning_rate": 4.56191742976581e-05, + "loss": 4.8158, + "step": 32168 + }, + { + "epoch": 0.19131815586640022, + "grad_norm": 1.8846794366836548, + "learning_rate": 4.561891016256055e-05, + "loss": 4.9619, + "step": 32169 + }, + { + "epoch": 0.19132410314968123, + "grad_norm": 1.7014399766921997, + "learning_rate": 4.5618646020265165e-05, + "loss": 5.3071, + "step": 32170 + }, + { + "epoch": 0.19133005043296222, + "grad_norm": 1.6020660400390625, + "learning_rate": 4.561838187077202e-05, + "loss": 5.1499, + "step": 32171 + }, + { + "epoch": 0.1913359977162432, + "grad_norm": 1.6537081003189087, + "learning_rate": 4.561811771408121e-05, + "loss": 5.0837, + "step": 32172 + }, + { + "epoch": 0.19134194499952423, + "grad_norm": 1.7917143106460571, + "learning_rate": 4.5617853550192826e-05, + "loss": 4.9645, + "step": 32173 + }, + { + "epoch": 0.19134789228280522, + "grad_norm": 1.5786539316177368, + "learning_rate": 4.561758937910696e-05, + "loss": 5.1049, + "step": 32174 + }, + { + "epoch": 0.1913538395660862, + "grad_norm": 1.4922531843185425, + "learning_rate": 4.5617325200823715e-05, + "loss": 5.8555, + "step": 32175 + }, + { + "epoch": 0.19135978684936722, + "grad_norm": 1.5561562776565552, + "learning_rate": 4.561706101534317e-05, + "loss": 5.4443, + "step": 32176 + }, + { + "epoch": 0.1913657341326482, + "grad_norm": 1.5478456020355225, + "learning_rate": 4.5616796822665425e-05, + "loss": 5.1066, + "step": 32177 + }, + { + "epoch": 0.1913716814159292, + "grad_norm": 1.5855305194854736, + "learning_rate": 4.561653262279057e-05, + "loss": 4.898, + "step": 32178 + }, + { + "epoch": 0.1913776286992102, + "grad_norm": 1.540865182876587, + "learning_rate": 4.5616268415718686e-05, + "loss": 5.6622, + "step": 32179 + }, + { + "epoch": 0.1913835759824912, + "grad_norm": 1.6557115316390991, + "learning_rate": 4.561600420144989e-05, + "loss": 5.6751, + "step": 32180 + }, + { + "epoch": 0.1913895232657722, + "grad_norm": 1.9517576694488525, + "learning_rate": 4.561573997998425e-05, + "loss": 5.058, + "step": 32181 + }, + { + "epoch": 0.1913954705490532, + "grad_norm": 1.6348973512649536, + "learning_rate": 4.561547575132188e-05, + "loss": 4.8167, + "step": 32182 + }, + { + "epoch": 0.1914014178323342, + "grad_norm": 1.6906663179397583, + "learning_rate": 4.561521151546284e-05, + "loss": 4.9711, + "step": 32183 + }, + { + "epoch": 0.19140736511561518, + "grad_norm": 2.0812265872955322, + "learning_rate": 4.561494727240726e-05, + "loss": 5.4894, + "step": 32184 + }, + { + "epoch": 0.1914133123988962, + "grad_norm": 1.509619116783142, + "learning_rate": 4.561468302215521e-05, + "loss": 5.646, + "step": 32185 + }, + { + "epoch": 0.19141925968217718, + "grad_norm": 1.4971113204956055, + "learning_rate": 4.561441876470679e-05, + "loss": 5.3408, + "step": 32186 + }, + { + "epoch": 0.19142520696545817, + "grad_norm": 1.7571582794189453, + "learning_rate": 4.5614154500062084e-05, + "loss": 4.5655, + "step": 32187 + }, + { + "epoch": 0.1914311542487392, + "grad_norm": 1.7431119680404663, + "learning_rate": 4.561389022822119e-05, + "loss": 4.6405, + "step": 32188 + }, + { + "epoch": 0.19143710153202018, + "grad_norm": 1.694812297821045, + "learning_rate": 4.56136259491842e-05, + "loss": 4.6362, + "step": 32189 + }, + { + "epoch": 0.19144304881530116, + "grad_norm": 1.6445132493972778, + "learning_rate": 4.5613361662951206e-05, + "loss": 4.5245, + "step": 32190 + }, + { + "epoch": 0.19144899609858218, + "grad_norm": 1.6429940462112427, + "learning_rate": 4.561309736952231e-05, + "loss": 4.3761, + "step": 32191 + }, + { + "epoch": 0.19145494338186317, + "grad_norm": 1.5726126432418823, + "learning_rate": 4.561283306889759e-05, + "loss": 4.4205, + "step": 32192 + }, + { + "epoch": 0.19146089066514416, + "grad_norm": 1.6214704513549805, + "learning_rate": 4.561256876107713e-05, + "loss": 3.8221, + "step": 32193 + }, + { + "epoch": 0.19146683794842517, + "grad_norm": 1.7851347923278809, + "learning_rate": 4.561230444606105e-05, + "loss": 3.9998, + "step": 32194 + }, + { + "epoch": 0.19147278523170616, + "grad_norm": 1.6264827251434326, + "learning_rate": 4.561204012384942e-05, + "loss": 3.8884, + "step": 32195 + }, + { + "epoch": 0.19147873251498715, + "grad_norm": 1.7741085290908813, + "learning_rate": 4.5611775794442346e-05, + "loss": 4.003, + "step": 32196 + }, + { + "epoch": 0.19148467979826816, + "grad_norm": 1.7139368057250977, + "learning_rate": 4.561151145783991e-05, + "loss": 4.7756, + "step": 32197 + }, + { + "epoch": 0.19149062708154915, + "grad_norm": 2.100501775741577, + "learning_rate": 4.56112471140422e-05, + "loss": 4.8871, + "step": 32198 + }, + { + "epoch": 0.19149657436483014, + "grad_norm": 2.702474355697632, + "learning_rate": 4.561098276304933e-05, + "loss": 4.7146, + "step": 32199 + }, + { + "epoch": 0.19150252164811113, + "grad_norm": 2.8174052238464355, + "learning_rate": 4.5610718404861375e-05, + "loss": 4.7004, + "step": 32200 + }, + { + "epoch": 0.19150846893139215, + "grad_norm": 2.7826762199401855, + "learning_rate": 4.561045403947843e-05, + "loss": 4.4944, + "step": 32201 + }, + { + "epoch": 0.19151441621467313, + "grad_norm": 2.7929837703704834, + "learning_rate": 4.5610189666900585e-05, + "loss": 4.5329, + "step": 32202 + }, + { + "epoch": 0.19152036349795412, + "grad_norm": 1.817031741142273, + "learning_rate": 4.560992528712794e-05, + "loss": 4.3808, + "step": 32203 + }, + { + "epoch": 0.19152631078123514, + "grad_norm": 1.8460679054260254, + "learning_rate": 4.5609660900160584e-05, + "loss": 5.214, + "step": 32204 + }, + { + "epoch": 0.19153225806451613, + "grad_norm": 1.5890216827392578, + "learning_rate": 4.5609396505998604e-05, + "loss": 5.8593, + "step": 32205 + }, + { + "epoch": 0.1915382053477971, + "grad_norm": 1.7008686065673828, + "learning_rate": 4.56091321046421e-05, + "loss": 5.0469, + "step": 32206 + }, + { + "epoch": 0.19154415263107813, + "grad_norm": 1.6503829956054688, + "learning_rate": 4.560886769609116e-05, + "loss": 4.8837, + "step": 32207 + }, + { + "epoch": 0.19155009991435912, + "grad_norm": 2.075839042663574, + "learning_rate": 4.560860328034588e-05, + "loss": 4.7162, + "step": 32208 + }, + { + "epoch": 0.1915560471976401, + "grad_norm": 1.648373007774353, + "learning_rate": 4.560833885740635e-05, + "loss": 4.7624, + "step": 32209 + }, + { + "epoch": 0.19156199448092112, + "grad_norm": 1.5314377546310425, + "learning_rate": 4.560807442727265e-05, + "loss": 4.5067, + "step": 32210 + }, + { + "epoch": 0.1915679417642021, + "grad_norm": 1.555922269821167, + "learning_rate": 4.5607809989944894e-05, + "loss": 4.6025, + "step": 32211 + }, + { + "epoch": 0.1915738890474831, + "grad_norm": 1.4894582033157349, + "learning_rate": 4.5607545545423166e-05, + "loss": 5.1213, + "step": 32212 + }, + { + "epoch": 0.19157983633076411, + "grad_norm": 1.6521800756454468, + "learning_rate": 4.5607281093707554e-05, + "loss": 5.2245, + "step": 32213 + }, + { + "epoch": 0.1915857836140451, + "grad_norm": 1.6810579299926758, + "learning_rate": 4.560701663479815e-05, + "loss": 5.1212, + "step": 32214 + }, + { + "epoch": 0.1915917308973261, + "grad_norm": 1.622002124786377, + "learning_rate": 4.5606752168695055e-05, + "loss": 4.6669, + "step": 32215 + }, + { + "epoch": 0.1915976781806071, + "grad_norm": 1.5422900915145874, + "learning_rate": 4.560648769539835e-05, + "loss": 5.0407, + "step": 32216 + }, + { + "epoch": 0.1916036254638881, + "grad_norm": 1.907077431678772, + "learning_rate": 4.560622321490814e-05, + "loss": 4.6231, + "step": 32217 + }, + { + "epoch": 0.19160957274716908, + "grad_norm": 1.7939534187316895, + "learning_rate": 4.560595872722451e-05, + "loss": 5.0616, + "step": 32218 + }, + { + "epoch": 0.1916155200304501, + "grad_norm": 1.643505573272705, + "learning_rate": 4.560569423234755e-05, + "loss": 5.1006, + "step": 32219 + }, + { + "epoch": 0.1916214673137311, + "grad_norm": 1.5651081800460815, + "learning_rate": 4.560542973027735e-05, + "loss": 4.9603, + "step": 32220 + }, + { + "epoch": 0.19162741459701207, + "grad_norm": 1.8621745109558105, + "learning_rate": 4.560516522101401e-05, + "loss": 4.4891, + "step": 32221 + }, + { + "epoch": 0.1916333618802931, + "grad_norm": 1.6403456926345825, + "learning_rate": 4.560490070455762e-05, + "loss": 4.7257, + "step": 32222 + }, + { + "epoch": 0.19163930916357408, + "grad_norm": 1.6413493156433105, + "learning_rate": 4.560463618090828e-05, + "loss": 4.707, + "step": 32223 + }, + { + "epoch": 0.19164525644685507, + "grad_norm": 1.6639118194580078, + "learning_rate": 4.560437165006606e-05, + "loss": 4.8485, + "step": 32224 + }, + { + "epoch": 0.19165120373013608, + "grad_norm": 1.8064452409744263, + "learning_rate": 4.560410711203108e-05, + "loss": 4.568, + "step": 32225 + }, + { + "epoch": 0.19165715101341707, + "grad_norm": 1.610862135887146, + "learning_rate": 4.560384256680341e-05, + "loss": 4.509, + "step": 32226 + }, + { + "epoch": 0.19166309829669806, + "grad_norm": 1.6460599899291992, + "learning_rate": 4.560357801438315e-05, + "loss": 4.5947, + "step": 32227 + }, + { + "epoch": 0.19166904557997907, + "grad_norm": 1.7876906394958496, + "learning_rate": 4.5603313454770404e-05, + "loss": 5.1632, + "step": 32228 + }, + { + "epoch": 0.19167499286326006, + "grad_norm": 1.642273187637329, + "learning_rate": 4.560304888796525e-05, + "loss": 5.3845, + "step": 32229 + }, + { + "epoch": 0.19168094014654105, + "grad_norm": 1.5736563205718994, + "learning_rate": 4.5602784313967784e-05, + "loss": 4.8355, + "step": 32230 + }, + { + "epoch": 0.19168688742982207, + "grad_norm": 1.5724636316299438, + "learning_rate": 4.56025197327781e-05, + "loss": 4.6927, + "step": 32231 + }, + { + "epoch": 0.19169283471310306, + "grad_norm": 1.7795807123184204, + "learning_rate": 4.560225514439628e-05, + "loss": 4.9267, + "step": 32232 + }, + { + "epoch": 0.19169878199638404, + "grad_norm": 1.7334511280059814, + "learning_rate": 4.5601990548822436e-05, + "loss": 4.9678, + "step": 32233 + }, + { + "epoch": 0.19170472927966506, + "grad_norm": 1.7917529344558716, + "learning_rate": 4.560172594605665e-05, + "loss": 4.4641, + "step": 32234 + }, + { + "epoch": 0.19171067656294605, + "grad_norm": 1.7616432905197144, + "learning_rate": 4.560146133609901e-05, + "loss": 4.2135, + "step": 32235 + }, + { + "epoch": 0.19171662384622704, + "grad_norm": 1.7462329864501953, + "learning_rate": 4.5601196718949614e-05, + "loss": 4.2736, + "step": 32236 + }, + { + "epoch": 0.19172257112950805, + "grad_norm": 1.7398426532745361, + "learning_rate": 4.560093209460855e-05, + "loss": 4.5267, + "step": 32237 + }, + { + "epoch": 0.19172851841278904, + "grad_norm": 2.057189464569092, + "learning_rate": 4.5600667463075916e-05, + "loss": 4.4011, + "step": 32238 + }, + { + "epoch": 0.19173446569607003, + "grad_norm": 1.757520318031311, + "learning_rate": 4.560040282435181e-05, + "loss": 4.883, + "step": 32239 + }, + { + "epoch": 0.19174041297935104, + "grad_norm": 1.9065808057785034, + "learning_rate": 4.56001381784363e-05, + "loss": 5.2427, + "step": 32240 + }, + { + "epoch": 0.19174636026263203, + "grad_norm": 1.8721452951431274, + "learning_rate": 4.5599873525329505e-05, + "loss": 4.6069, + "step": 32241 + }, + { + "epoch": 0.19175230754591302, + "grad_norm": 1.7365145683288574, + "learning_rate": 4.559960886503151e-05, + "loss": 4.7842, + "step": 32242 + }, + { + "epoch": 0.19175825482919404, + "grad_norm": 1.9017609357833862, + "learning_rate": 4.559934419754239e-05, + "loss": 4.9601, + "step": 32243 + }, + { + "epoch": 0.19176420211247502, + "grad_norm": 1.5734943151474, + "learning_rate": 4.559907952286226e-05, + "loss": 4.9961, + "step": 32244 + }, + { + "epoch": 0.191770149395756, + "grad_norm": 1.8231675624847412, + "learning_rate": 4.559881484099121e-05, + "loss": 4.8581, + "step": 32245 + }, + { + "epoch": 0.19177609667903703, + "grad_norm": 1.657576560974121, + "learning_rate": 4.559855015192933e-05, + "loss": 4.739, + "step": 32246 + }, + { + "epoch": 0.19178204396231802, + "grad_norm": 1.801846981048584, + "learning_rate": 4.5598285455676694e-05, + "loss": 4.9883, + "step": 32247 + }, + { + "epoch": 0.191787991245599, + "grad_norm": 1.6971237659454346, + "learning_rate": 4.5598020752233414e-05, + "loss": 5.0313, + "step": 32248 + }, + { + "epoch": 0.19179393852888002, + "grad_norm": 1.7070435285568237, + "learning_rate": 4.559775604159958e-05, + "loss": 5.1385, + "step": 32249 + }, + { + "epoch": 0.191799885812161, + "grad_norm": 1.8786835670471191, + "learning_rate": 4.559749132377529e-05, + "loss": 4.5535, + "step": 32250 + }, + { + "epoch": 0.191805833095442, + "grad_norm": 2.4562580585479736, + "learning_rate": 4.559722659876061e-05, + "loss": 4.3224, + "step": 32251 + }, + { + "epoch": 0.191811780378723, + "grad_norm": 2.2160046100616455, + "learning_rate": 4.5596961866555665e-05, + "loss": 4.1118, + "step": 32252 + }, + { + "epoch": 0.191817727662004, + "grad_norm": 1.9994086027145386, + "learning_rate": 4.559669712716053e-05, + "loss": 4.2414, + "step": 32253 + }, + { + "epoch": 0.191823674945285, + "grad_norm": 1.820043921470642, + "learning_rate": 4.55964323805753e-05, + "loss": 4.2737, + "step": 32254 + }, + { + "epoch": 0.191829622228566, + "grad_norm": 1.5855984687805176, + "learning_rate": 4.559616762680008e-05, + "loss": 5.3039, + "step": 32255 + }, + { + "epoch": 0.191835569511847, + "grad_norm": 2.3250484466552734, + "learning_rate": 4.5595902865834924e-05, + "loss": 4.2797, + "step": 32256 + }, + { + "epoch": 0.19184151679512798, + "grad_norm": 2.333883285522461, + "learning_rate": 4.559563809767997e-05, + "loss": 3.8071, + "step": 32257 + }, + { + "epoch": 0.19184746407840897, + "grad_norm": 2.718810796737671, + "learning_rate": 4.559537332233529e-05, + "loss": 3.7749, + "step": 32258 + }, + { + "epoch": 0.19185341136168998, + "grad_norm": 2.366673469543457, + "learning_rate": 4.559510853980098e-05, + "loss": 3.7677, + "step": 32259 + }, + { + "epoch": 0.19185935864497097, + "grad_norm": 2.129708766937256, + "learning_rate": 4.559484375007713e-05, + "loss": 4.2157, + "step": 32260 + }, + { + "epoch": 0.19186530592825196, + "grad_norm": 1.6533547639846802, + "learning_rate": 4.559457895316382e-05, + "loss": 5.3304, + "step": 32261 + }, + { + "epoch": 0.19187125321153298, + "grad_norm": 1.7954961061477661, + "learning_rate": 4.5594314149061166e-05, + "loss": 5.5018, + "step": 32262 + }, + { + "epoch": 0.19187720049481397, + "grad_norm": 1.6768550872802734, + "learning_rate": 4.559404933776925e-05, + "loss": 5.1384, + "step": 32263 + }, + { + "epoch": 0.19188314777809495, + "grad_norm": 1.719743013381958, + "learning_rate": 4.5593784519288165e-05, + "loss": 4.7757, + "step": 32264 + }, + { + "epoch": 0.19188909506137597, + "grad_norm": 1.8016587495803833, + "learning_rate": 4.5593519693618e-05, + "loss": 4.6744, + "step": 32265 + }, + { + "epoch": 0.19189504234465696, + "grad_norm": 2.5645673274993896, + "learning_rate": 4.559325486075885e-05, + "loss": 3.7642, + "step": 32266 + }, + { + "epoch": 0.19190098962793795, + "grad_norm": 1.8583413362503052, + "learning_rate": 4.559299002071081e-05, + "loss": 4.0051, + "step": 32267 + }, + { + "epoch": 0.19190693691121896, + "grad_norm": 1.8633100986480713, + "learning_rate": 4.5592725173473964e-05, + "loss": 4.9233, + "step": 32268 + }, + { + "epoch": 0.19191288419449995, + "grad_norm": 1.5865737199783325, + "learning_rate": 4.5592460319048415e-05, + "loss": 4.9211, + "step": 32269 + }, + { + "epoch": 0.19191883147778094, + "grad_norm": 1.6763908863067627, + "learning_rate": 4.559219545743425e-05, + "loss": 5.0468, + "step": 32270 + }, + { + "epoch": 0.19192477876106195, + "grad_norm": 1.7344523668289185, + "learning_rate": 4.559193058863156e-05, + "loss": 4.9828, + "step": 32271 + }, + { + "epoch": 0.19193072604434294, + "grad_norm": 1.5607928037643433, + "learning_rate": 4.559166571264045e-05, + "loss": 5.0621, + "step": 32272 + }, + { + "epoch": 0.19193667332762393, + "grad_norm": 1.6489591598510742, + "learning_rate": 4.559140082946099e-05, + "loss": 5.075, + "step": 32273 + }, + { + "epoch": 0.19194262061090495, + "grad_norm": 2.018446207046509, + "learning_rate": 4.5591135939093286e-05, + "loss": 4.6623, + "step": 32274 + }, + { + "epoch": 0.19194856789418593, + "grad_norm": 1.7344367504119873, + "learning_rate": 4.559087104153743e-05, + "loss": 4.7845, + "step": 32275 + }, + { + "epoch": 0.19195451517746692, + "grad_norm": 1.705333948135376, + "learning_rate": 4.5590606136793524e-05, + "loss": 4.7642, + "step": 32276 + }, + { + "epoch": 0.19196046246074794, + "grad_norm": 1.784136176109314, + "learning_rate": 4.5590341224861635e-05, + "loss": 4.7684, + "step": 32277 + }, + { + "epoch": 0.19196640974402893, + "grad_norm": 2.4740941524505615, + "learning_rate": 4.559007630574188e-05, + "loss": 4.5288, + "step": 32278 + }, + { + "epoch": 0.19197235702730991, + "grad_norm": 1.569096326828003, + "learning_rate": 4.558981137943434e-05, + "loss": 4.3353, + "step": 32279 + }, + { + "epoch": 0.19197830431059093, + "grad_norm": 1.7482889890670776, + "learning_rate": 4.558954644593911e-05, + "loss": 4.7269, + "step": 32280 + }, + { + "epoch": 0.19198425159387192, + "grad_norm": 1.8372119665145874, + "learning_rate": 4.558928150525628e-05, + "loss": 5.0292, + "step": 32281 + }, + { + "epoch": 0.1919901988771529, + "grad_norm": 1.5978213548660278, + "learning_rate": 4.558901655738594e-05, + "loss": 5.2528, + "step": 32282 + }, + { + "epoch": 0.19199614616043392, + "grad_norm": 1.761826515197754, + "learning_rate": 4.55887516023282e-05, + "loss": 5.1056, + "step": 32283 + }, + { + "epoch": 0.1920020934437149, + "grad_norm": 1.9713767766952515, + "learning_rate": 4.5588486640083134e-05, + "loss": 4.6393, + "step": 32284 + }, + { + "epoch": 0.1920080407269959, + "grad_norm": 2.8644440174102783, + "learning_rate": 4.558822167065084e-05, + "loss": 3.3245, + "step": 32285 + }, + { + "epoch": 0.19201398801027691, + "grad_norm": 2.9571681022644043, + "learning_rate": 4.558795669403141e-05, + "loss": 3.1213, + "step": 32286 + }, + { + "epoch": 0.1920199352935579, + "grad_norm": 2.428436279296875, + "learning_rate": 4.5587691710224935e-05, + "loss": 4.4437, + "step": 32287 + }, + { + "epoch": 0.1920258825768389, + "grad_norm": 2.6146740913391113, + "learning_rate": 4.5587426719231506e-05, + "loss": 3.8897, + "step": 32288 + }, + { + "epoch": 0.1920318298601199, + "grad_norm": 1.8975647687911987, + "learning_rate": 4.5587161721051226e-05, + "loss": 4.214, + "step": 32289 + }, + { + "epoch": 0.1920377771434009, + "grad_norm": 2.0018997192382812, + "learning_rate": 4.558689671568418e-05, + "loss": 4.4845, + "step": 32290 + }, + { + "epoch": 0.19204372442668188, + "grad_norm": 1.9768357276916504, + "learning_rate": 4.558663170313046e-05, + "loss": 3.9529, + "step": 32291 + }, + { + "epoch": 0.1920496717099629, + "grad_norm": 2.3069944381713867, + "learning_rate": 4.558636668339016e-05, + "loss": 4.0115, + "step": 32292 + }, + { + "epoch": 0.1920556189932439, + "grad_norm": 3.4548919200897217, + "learning_rate": 4.5586101656463365e-05, + "loss": 4.1852, + "step": 32293 + }, + { + "epoch": 0.19206156627652488, + "grad_norm": 2.8961174488067627, + "learning_rate": 4.558583662235018e-05, + "loss": 4.1678, + "step": 32294 + }, + { + "epoch": 0.1920675135598059, + "grad_norm": 2.465935468673706, + "learning_rate": 4.558557158105069e-05, + "loss": 4.5003, + "step": 32295 + }, + { + "epoch": 0.19207346084308688, + "grad_norm": 1.8684260845184326, + "learning_rate": 4.5585306532564995e-05, + "loss": 5.1349, + "step": 32296 + }, + { + "epoch": 0.19207940812636787, + "grad_norm": 1.5592044591903687, + "learning_rate": 4.558504147689317e-05, + "loss": 4.959, + "step": 32297 + }, + { + "epoch": 0.19208535540964888, + "grad_norm": 1.5255820751190186, + "learning_rate": 4.5584776414035334e-05, + "loss": 5.3422, + "step": 32298 + }, + { + "epoch": 0.19209130269292987, + "grad_norm": 1.5954620838165283, + "learning_rate": 4.5584511343991566e-05, + "loss": 5.5565, + "step": 32299 + }, + { + "epoch": 0.19209724997621086, + "grad_norm": 1.7753039598464966, + "learning_rate": 4.5584246266761957e-05, + "loss": 4.8505, + "step": 32300 + }, + { + "epoch": 0.19210319725949188, + "grad_norm": 1.7820825576782227, + "learning_rate": 4.5583981182346594e-05, + "loss": 4.8029, + "step": 32301 + }, + { + "epoch": 0.19210914454277286, + "grad_norm": 2.3863165378570557, + "learning_rate": 4.558371609074557e-05, + "loss": 4.3447, + "step": 32302 + }, + { + "epoch": 0.19211509182605385, + "grad_norm": 1.9576959609985352, + "learning_rate": 4.558345099195899e-05, + "loss": 4.5883, + "step": 32303 + }, + { + "epoch": 0.19212103910933487, + "grad_norm": 3.7398250102996826, + "learning_rate": 4.558318588598694e-05, + "loss": 4.473, + "step": 32304 + }, + { + "epoch": 0.19212698639261586, + "grad_norm": 1.9002548456192017, + "learning_rate": 4.5582920772829515e-05, + "loss": 4.4629, + "step": 32305 + }, + { + "epoch": 0.19213293367589684, + "grad_norm": 1.493038535118103, + "learning_rate": 4.55826556524868e-05, + "loss": 5.047, + "step": 32306 + }, + { + "epoch": 0.19213888095917786, + "grad_norm": 1.5841251611709595, + "learning_rate": 4.558239052495889e-05, + "loss": 5.2142, + "step": 32307 + }, + { + "epoch": 0.19214482824245885, + "grad_norm": 1.573392629623413, + "learning_rate": 4.558212539024589e-05, + "loss": 5.0779, + "step": 32308 + }, + { + "epoch": 0.19215077552573984, + "grad_norm": 1.6538444757461548, + "learning_rate": 4.558186024834788e-05, + "loss": 5.0981, + "step": 32309 + }, + { + "epoch": 0.19215672280902085, + "grad_norm": 1.6234486103057861, + "learning_rate": 4.5581595099264954e-05, + "loss": 4.4117, + "step": 32310 + }, + { + "epoch": 0.19216267009230184, + "grad_norm": 1.9895765781402588, + "learning_rate": 4.55813299429972e-05, + "loss": 4.9125, + "step": 32311 + }, + { + "epoch": 0.19216861737558283, + "grad_norm": 1.8990195989608765, + "learning_rate": 4.558106477954473e-05, + "loss": 4.4664, + "step": 32312 + }, + { + "epoch": 0.19217456465886384, + "grad_norm": 2.297137498855591, + "learning_rate": 4.558079960890761e-05, + "loss": 4.1712, + "step": 32313 + }, + { + "epoch": 0.19218051194214483, + "grad_norm": 1.7623494863510132, + "learning_rate": 4.5580534431085955e-05, + "loss": 4.6109, + "step": 32314 + }, + { + "epoch": 0.19218645922542582, + "grad_norm": 1.7693278789520264, + "learning_rate": 4.558026924607984e-05, + "loss": 4.2927, + "step": 32315 + }, + { + "epoch": 0.1921924065087068, + "grad_norm": 1.873820424079895, + "learning_rate": 4.5580004053889366e-05, + "loss": 4.2318, + "step": 32316 + }, + { + "epoch": 0.19219835379198782, + "grad_norm": 2.195477247238159, + "learning_rate": 4.557973885451463e-05, + "loss": 4.0132, + "step": 32317 + }, + { + "epoch": 0.1922043010752688, + "grad_norm": 1.898896336555481, + "learning_rate": 4.5579473647955714e-05, + "loss": 4.2662, + "step": 32318 + }, + { + "epoch": 0.1922102483585498, + "grad_norm": 1.6225823163986206, + "learning_rate": 4.5579208434212725e-05, + "loss": 4.5132, + "step": 32319 + }, + { + "epoch": 0.19221619564183082, + "grad_norm": 1.6939207315444946, + "learning_rate": 4.557894321328574e-05, + "loss": 4.268, + "step": 32320 + }, + { + "epoch": 0.1922221429251118, + "grad_norm": 2.0803871154785156, + "learning_rate": 4.5578677985174854e-05, + "loss": 3.8686, + "step": 32321 + }, + { + "epoch": 0.1922280902083928, + "grad_norm": 2.1536972522735596, + "learning_rate": 4.5578412749880176e-05, + "loss": 4.0257, + "step": 32322 + }, + { + "epoch": 0.1922340374916738, + "grad_norm": 2.5257532596588135, + "learning_rate": 4.5578147507401784e-05, + "loss": 4.1924, + "step": 32323 + }, + { + "epoch": 0.1922399847749548, + "grad_norm": 2.1187551021575928, + "learning_rate": 4.5577882257739766e-05, + "loss": 4.4954, + "step": 32324 + }, + { + "epoch": 0.19224593205823579, + "grad_norm": 1.5388280153274536, + "learning_rate": 4.5577617000894225e-05, + "loss": 4.3945, + "step": 32325 + }, + { + "epoch": 0.1922518793415168, + "grad_norm": 1.9172996282577515, + "learning_rate": 4.5577351736865255e-05, + "loss": 3.8684, + "step": 32326 + }, + { + "epoch": 0.1922578266247978, + "grad_norm": 1.9113341569900513, + "learning_rate": 4.5577086465652944e-05, + "loss": 4.1434, + "step": 32327 + }, + { + "epoch": 0.19226377390807878, + "grad_norm": 2.022688865661621, + "learning_rate": 4.557682118725738e-05, + "loss": 3.8283, + "step": 32328 + }, + { + "epoch": 0.1922697211913598, + "grad_norm": 1.9376680850982666, + "learning_rate": 4.5576555901678665e-05, + "loss": 3.8662, + "step": 32329 + }, + { + "epoch": 0.19227566847464078, + "grad_norm": 1.8243870735168457, + "learning_rate": 4.557629060891688e-05, + "loss": 3.8863, + "step": 32330 + }, + { + "epoch": 0.19228161575792177, + "grad_norm": 2.059737205505371, + "learning_rate": 4.557602530897213e-05, + "loss": 4.0976, + "step": 32331 + }, + { + "epoch": 0.19228756304120279, + "grad_norm": 2.3299999237060547, + "learning_rate": 4.5575760001844494e-05, + "loss": 4.1038, + "step": 32332 + }, + { + "epoch": 0.19229351032448377, + "grad_norm": 2.2769482135772705, + "learning_rate": 4.557549468753408e-05, + "loss": 4.1553, + "step": 32333 + }, + { + "epoch": 0.19229945760776476, + "grad_norm": 1.806193470954895, + "learning_rate": 4.5575229366040975e-05, + "loss": 5.0664, + "step": 32334 + }, + { + "epoch": 0.19230540489104578, + "grad_norm": 1.7117464542388916, + "learning_rate": 4.557496403736527e-05, + "loss": 4.9323, + "step": 32335 + }, + { + "epoch": 0.19231135217432677, + "grad_norm": 1.8777096271514893, + "learning_rate": 4.557469870150706e-05, + "loss": 4.1108, + "step": 32336 + }, + { + "epoch": 0.19231729945760775, + "grad_norm": 2.475425958633423, + "learning_rate": 4.557443335846643e-05, + "loss": 3.5492, + "step": 32337 + }, + { + "epoch": 0.19232324674088877, + "grad_norm": 1.8026037216186523, + "learning_rate": 4.5574168008243474e-05, + "loss": 4.6844, + "step": 32338 + }, + { + "epoch": 0.19232919402416976, + "grad_norm": 1.6337968111038208, + "learning_rate": 4.557390265083829e-05, + "loss": 5.6839, + "step": 32339 + }, + { + "epoch": 0.19233514130745075, + "grad_norm": 1.5935887098312378, + "learning_rate": 4.557363728625098e-05, + "loss": 4.8699, + "step": 32340 + }, + { + "epoch": 0.19234108859073176, + "grad_norm": 1.7847586870193481, + "learning_rate": 4.557337191448161e-05, + "loss": 5.2093, + "step": 32341 + }, + { + "epoch": 0.19234703587401275, + "grad_norm": 1.8020373582839966, + "learning_rate": 4.5573106535530295e-05, + "loss": 4.7497, + "step": 32342 + }, + { + "epoch": 0.19235298315729374, + "grad_norm": 1.8254616260528564, + "learning_rate": 4.557284114939713e-05, + "loss": 4.8339, + "step": 32343 + }, + { + "epoch": 0.19235893044057475, + "grad_norm": 1.8874919414520264, + "learning_rate": 4.5572575756082184e-05, + "loss": 4.9181, + "step": 32344 + }, + { + "epoch": 0.19236487772385574, + "grad_norm": 2.2326526641845703, + "learning_rate": 4.5572310355585574e-05, + "loss": 4.7761, + "step": 32345 + }, + { + "epoch": 0.19237082500713673, + "grad_norm": 1.892404556274414, + "learning_rate": 4.557204494790738e-05, + "loss": 4.6584, + "step": 32346 + }, + { + "epoch": 0.19237677229041775, + "grad_norm": 1.560922622680664, + "learning_rate": 4.55717795330477e-05, + "loss": 5.5746, + "step": 32347 + }, + { + "epoch": 0.19238271957369873, + "grad_norm": 1.8915635347366333, + "learning_rate": 4.5571514111006616e-05, + "loss": 5.3499, + "step": 32348 + }, + { + "epoch": 0.19238866685697972, + "grad_norm": 2.5687575340270996, + "learning_rate": 4.557124868178424e-05, + "loss": 3.8322, + "step": 32349 + }, + { + "epoch": 0.19239461414026074, + "grad_norm": 2.195842742919922, + "learning_rate": 4.557098324538065e-05, + "loss": 3.7871, + "step": 32350 + }, + { + "epoch": 0.19240056142354173, + "grad_norm": 2.0727920532226562, + "learning_rate": 4.557071780179594e-05, + "loss": 4.1199, + "step": 32351 + }, + { + "epoch": 0.19240650870682272, + "grad_norm": 1.653414011001587, + "learning_rate": 4.557045235103021e-05, + "loss": 4.7838, + "step": 32352 + }, + { + "epoch": 0.19241245599010373, + "grad_norm": 2.0768418312072754, + "learning_rate": 4.557018689308354e-05, + "loss": 4.5005, + "step": 32353 + }, + { + "epoch": 0.19241840327338472, + "grad_norm": 2.001793622970581, + "learning_rate": 4.5569921427956034e-05, + "loss": 4.1306, + "step": 32354 + }, + { + "epoch": 0.1924243505566657, + "grad_norm": 2.1622116565704346, + "learning_rate": 4.556965595564778e-05, + "loss": 3.8781, + "step": 32355 + }, + { + "epoch": 0.19243029783994672, + "grad_norm": 2.073871374130249, + "learning_rate": 4.556939047615888e-05, + "loss": 3.8632, + "step": 32356 + }, + { + "epoch": 0.1924362451232277, + "grad_norm": 2.1704652309417725, + "learning_rate": 4.5569124989489404e-05, + "loss": 3.7858, + "step": 32357 + }, + { + "epoch": 0.1924421924065087, + "grad_norm": 1.822009801864624, + "learning_rate": 4.556885949563947e-05, + "loss": 4.1138, + "step": 32358 + }, + { + "epoch": 0.19244813968978972, + "grad_norm": 1.777799367904663, + "learning_rate": 4.556859399460916e-05, + "loss": 4.6384, + "step": 32359 + }, + { + "epoch": 0.1924540869730707, + "grad_norm": 1.926173448562622, + "learning_rate": 4.556832848639855e-05, + "loss": 5.0295, + "step": 32360 + }, + { + "epoch": 0.1924600342563517, + "grad_norm": 1.8721709251403809, + "learning_rate": 4.5568062971007764e-05, + "loss": 5.0287, + "step": 32361 + }, + { + "epoch": 0.1924659815396327, + "grad_norm": 1.7049319744110107, + "learning_rate": 4.556779744843688e-05, + "loss": 4.3578, + "step": 32362 + }, + { + "epoch": 0.1924719288229137, + "grad_norm": 1.873555302619934, + "learning_rate": 4.5567531918685984e-05, + "loss": 3.983, + "step": 32363 + }, + { + "epoch": 0.19247787610619468, + "grad_norm": 2.7111735343933105, + "learning_rate": 4.556726638175518e-05, + "loss": 4.0963, + "step": 32364 + }, + { + "epoch": 0.1924838233894757, + "grad_norm": 2.063129425048828, + "learning_rate": 4.5567000837644555e-05, + "loss": 3.8396, + "step": 32365 + }, + { + "epoch": 0.1924897706727567, + "grad_norm": 2.247694969177246, + "learning_rate": 4.55667352863542e-05, + "loss": 3.8202, + "step": 32366 + }, + { + "epoch": 0.19249571795603768, + "grad_norm": 2.430349349975586, + "learning_rate": 4.556646972788421e-05, + "loss": 3.9607, + "step": 32367 + }, + { + "epoch": 0.1925016652393187, + "grad_norm": 2.3638129234313965, + "learning_rate": 4.556620416223468e-05, + "loss": 4.2398, + "step": 32368 + }, + { + "epoch": 0.19250761252259968, + "grad_norm": 2.057927370071411, + "learning_rate": 4.55659385894057e-05, + "loss": 4.3298, + "step": 32369 + }, + { + "epoch": 0.19251355980588067, + "grad_norm": 1.7141249179840088, + "learning_rate": 4.5565673009397366e-05, + "loss": 4.7427, + "step": 32370 + }, + { + "epoch": 0.19251950708916168, + "grad_norm": 1.7085816860198975, + "learning_rate": 4.556540742220976e-05, + "loss": 4.9883, + "step": 32371 + }, + { + "epoch": 0.19252545437244267, + "grad_norm": 1.463494896888733, + "learning_rate": 4.5565141827842996e-05, + "loss": 4.7302, + "step": 32372 + }, + { + "epoch": 0.19253140165572366, + "grad_norm": 1.647187352180481, + "learning_rate": 4.556487622629714e-05, + "loss": 4.7999, + "step": 32373 + }, + { + "epoch": 0.19253734893900465, + "grad_norm": 1.526756763458252, + "learning_rate": 4.55646106175723e-05, + "loss": 4.8183, + "step": 32374 + }, + { + "epoch": 0.19254329622228566, + "grad_norm": 1.2896729707717896, + "learning_rate": 4.556434500166858e-05, + "loss": 4.6084, + "step": 32375 + }, + { + "epoch": 0.19254924350556665, + "grad_norm": 1.6381428241729736, + "learning_rate": 4.556407937858605e-05, + "loss": 5.186, + "step": 32376 + }, + { + "epoch": 0.19255519078884764, + "grad_norm": 3.1183688640594482, + "learning_rate": 4.5563813748324804e-05, + "loss": 4.0471, + "step": 32377 + }, + { + "epoch": 0.19256113807212866, + "grad_norm": 2.0422890186309814, + "learning_rate": 4.556354811088496e-05, + "loss": 4.7993, + "step": 32378 + }, + { + "epoch": 0.19256708535540965, + "grad_norm": 1.9046860933303833, + "learning_rate": 4.5563282466266574e-05, + "loss": 4.5938, + "step": 32379 + }, + { + "epoch": 0.19257303263869063, + "grad_norm": 1.9312288761138916, + "learning_rate": 4.5563016814469776e-05, + "loss": 4.4985, + "step": 32380 + }, + { + "epoch": 0.19257897992197165, + "grad_norm": 1.828894853591919, + "learning_rate": 4.556275115549464e-05, + "loss": 4.3507, + "step": 32381 + }, + { + "epoch": 0.19258492720525264, + "grad_norm": 1.8356082439422607, + "learning_rate": 4.5562485489341256e-05, + "loss": 4.8413, + "step": 32382 + }, + { + "epoch": 0.19259087448853363, + "grad_norm": 1.6310971975326538, + "learning_rate": 4.5562219816009716e-05, + "loss": 4.9384, + "step": 32383 + }, + { + "epoch": 0.19259682177181464, + "grad_norm": 1.6916502714157104, + "learning_rate": 4.556195413550012e-05, + "loss": 4.7824, + "step": 32384 + }, + { + "epoch": 0.19260276905509563, + "grad_norm": 1.468487024307251, + "learning_rate": 4.556168844781256e-05, + "loss": 4.635, + "step": 32385 + }, + { + "epoch": 0.19260871633837662, + "grad_norm": 1.5585215091705322, + "learning_rate": 4.5561422752947124e-05, + "loss": 4.6277, + "step": 32386 + }, + { + "epoch": 0.19261466362165763, + "grad_norm": 1.7868255376815796, + "learning_rate": 4.556115705090391e-05, + "loss": 4.7244, + "step": 32387 + }, + { + "epoch": 0.19262061090493862, + "grad_norm": 1.7397072315216064, + "learning_rate": 4.556089134168301e-05, + "loss": 4.4101, + "step": 32388 + }, + { + "epoch": 0.1926265581882196, + "grad_norm": 2.364893674850464, + "learning_rate": 4.556062562528452e-05, + "loss": 3.7639, + "step": 32389 + }, + { + "epoch": 0.19263250547150063, + "grad_norm": 2.690023899078369, + "learning_rate": 4.5560359901708524e-05, + "loss": 3.6474, + "step": 32390 + }, + { + "epoch": 0.19263845275478161, + "grad_norm": 2.4105823040008545, + "learning_rate": 4.5560094170955116e-05, + "loss": 3.5294, + "step": 32391 + }, + { + "epoch": 0.1926444000380626, + "grad_norm": 2.0659773349761963, + "learning_rate": 4.5559828433024385e-05, + "loss": 3.7319, + "step": 32392 + }, + { + "epoch": 0.19265034732134362, + "grad_norm": 2.075104236602783, + "learning_rate": 4.5559562687916445e-05, + "loss": 5.1514, + "step": 32393 + }, + { + "epoch": 0.1926562946046246, + "grad_norm": 1.8932725191116333, + "learning_rate": 4.5559296935631365e-05, + "loss": 4.7867, + "step": 32394 + }, + { + "epoch": 0.1926622418879056, + "grad_norm": 2.2901084423065186, + "learning_rate": 4.5559031176169246e-05, + "loss": 3.2693, + "step": 32395 + }, + { + "epoch": 0.1926681891711866, + "grad_norm": 2.2299394607543945, + "learning_rate": 4.555876540953019e-05, + "loss": 3.5186, + "step": 32396 + }, + { + "epoch": 0.1926741364544676, + "grad_norm": 2.254751443862915, + "learning_rate": 4.555849963571427e-05, + "loss": 3.4051, + "step": 32397 + }, + { + "epoch": 0.1926800837377486, + "grad_norm": 1.9714645147323608, + "learning_rate": 4.55582338547216e-05, + "loss": 3.3963, + "step": 32398 + }, + { + "epoch": 0.1926860310210296, + "grad_norm": 2.350437641143799, + "learning_rate": 4.555796806655226e-05, + "loss": 3.3453, + "step": 32399 + }, + { + "epoch": 0.1926919783043106, + "grad_norm": 2.113746166229248, + "learning_rate": 4.555770227120634e-05, + "loss": 3.5508, + "step": 32400 + }, + { + "epoch": 0.19269792558759158, + "grad_norm": 2.558175563812256, + "learning_rate": 4.555743646868395e-05, + "loss": 3.4324, + "step": 32401 + }, + { + "epoch": 0.1927038728708726, + "grad_norm": 2.097472667694092, + "learning_rate": 4.555717065898516e-05, + "loss": 3.4113, + "step": 32402 + }, + { + "epoch": 0.19270982015415358, + "grad_norm": 2.507054567337036, + "learning_rate": 4.555690484211008e-05, + "loss": 4.0971, + "step": 32403 + }, + { + "epoch": 0.19271576743743457, + "grad_norm": 1.6816004514694214, + "learning_rate": 4.5556639018058793e-05, + "loss": 5.3368, + "step": 32404 + }, + { + "epoch": 0.1927217147207156, + "grad_norm": 1.6590732336044312, + "learning_rate": 4.55563731868314e-05, + "loss": 4.9217, + "step": 32405 + }, + { + "epoch": 0.19272766200399657, + "grad_norm": 1.6414915323257446, + "learning_rate": 4.555610734842799e-05, + "loss": 4.0729, + "step": 32406 + }, + { + "epoch": 0.19273360928727756, + "grad_norm": 1.5531092882156372, + "learning_rate": 4.555584150284865e-05, + "loss": 3.8463, + "step": 32407 + }, + { + "epoch": 0.19273955657055858, + "grad_norm": 1.6675087213516235, + "learning_rate": 4.5555575650093484e-05, + "loss": 3.7309, + "step": 32408 + }, + { + "epoch": 0.19274550385383957, + "grad_norm": 1.5836836099624634, + "learning_rate": 4.555530979016257e-05, + "loss": 4.0221, + "step": 32409 + }, + { + "epoch": 0.19275145113712056, + "grad_norm": 2.2653143405914307, + "learning_rate": 4.5555043923056015e-05, + "loss": 4.4883, + "step": 32410 + }, + { + "epoch": 0.19275739842040157, + "grad_norm": 2.1392593383789062, + "learning_rate": 4.555477804877392e-05, + "loss": 4.3895, + "step": 32411 + }, + { + "epoch": 0.19276334570368256, + "grad_norm": 1.947454571723938, + "learning_rate": 4.555451216731634e-05, + "loss": 4.4879, + "step": 32412 + }, + { + "epoch": 0.19276929298696355, + "grad_norm": 1.9660381078720093, + "learning_rate": 4.555424627868341e-05, + "loss": 4.399, + "step": 32413 + }, + { + "epoch": 0.19277524027024456, + "grad_norm": 1.8891009092330933, + "learning_rate": 4.555398038287519e-05, + "loss": 3.8344, + "step": 32414 + }, + { + "epoch": 0.19278118755352555, + "grad_norm": 1.7115179300308228, + "learning_rate": 4.5553714479891804e-05, + "loss": 3.8118, + "step": 32415 + }, + { + "epoch": 0.19278713483680654, + "grad_norm": 2.0297632217407227, + "learning_rate": 4.555344856973332e-05, + "loss": 3.9253, + "step": 32416 + }, + { + "epoch": 0.19279308212008756, + "grad_norm": 1.6160376071929932, + "learning_rate": 4.555318265239984e-05, + "loss": 4.1249, + "step": 32417 + }, + { + "epoch": 0.19279902940336854, + "grad_norm": 1.6909234523773193, + "learning_rate": 4.555291672789146e-05, + "loss": 5.1648, + "step": 32418 + }, + { + "epoch": 0.19280497668664953, + "grad_norm": 1.8374849557876587, + "learning_rate": 4.5552650796208265e-05, + "loss": 5.3176, + "step": 32419 + }, + { + "epoch": 0.19281092396993055, + "grad_norm": 1.8304452896118164, + "learning_rate": 4.555238485735035e-05, + "loss": 4.7696, + "step": 32420 + }, + { + "epoch": 0.19281687125321154, + "grad_norm": 1.974797010421753, + "learning_rate": 4.555211891131782e-05, + "loss": 4.2615, + "step": 32421 + }, + { + "epoch": 0.19282281853649252, + "grad_norm": 1.8388688564300537, + "learning_rate": 4.555185295811075e-05, + "loss": 4.5941, + "step": 32422 + }, + { + "epoch": 0.19282876581977354, + "grad_norm": 1.852777361869812, + "learning_rate": 4.555158699772924e-05, + "loss": 4.379, + "step": 32423 + }, + { + "epoch": 0.19283471310305453, + "grad_norm": 1.903781771659851, + "learning_rate": 4.5551321030173376e-05, + "loss": 5.0642, + "step": 32424 + }, + { + "epoch": 0.19284066038633552, + "grad_norm": 1.830812692642212, + "learning_rate": 4.555105505544327e-05, + "loss": 5.2191, + "step": 32425 + }, + { + "epoch": 0.19284660766961653, + "grad_norm": 1.7071088552474976, + "learning_rate": 4.5550789073539e-05, + "loss": 4.6368, + "step": 32426 + }, + { + "epoch": 0.19285255495289752, + "grad_norm": 1.6677404642105103, + "learning_rate": 4.5550523084460664e-05, + "loss": 4.9055, + "step": 32427 + }, + { + "epoch": 0.1928585022361785, + "grad_norm": 1.7404626607894897, + "learning_rate": 4.555025708820835e-05, + "loss": 4.9614, + "step": 32428 + }, + { + "epoch": 0.19286444951945952, + "grad_norm": 1.6599600315093994, + "learning_rate": 4.554999108478215e-05, + "loss": 4.8015, + "step": 32429 + }, + { + "epoch": 0.1928703968027405, + "grad_norm": 2.5620381832122803, + "learning_rate": 4.554972507418217e-05, + "loss": 3.6728, + "step": 32430 + }, + { + "epoch": 0.1928763440860215, + "grad_norm": 2.435203790664673, + "learning_rate": 4.554945905640848e-05, + "loss": 4.3076, + "step": 32431 + }, + { + "epoch": 0.1928822913693025, + "grad_norm": 2.521820068359375, + "learning_rate": 4.55491930314612e-05, + "loss": 3.5146, + "step": 32432 + }, + { + "epoch": 0.1928882386525835, + "grad_norm": 2.866119861602783, + "learning_rate": 4.55489269993404e-05, + "loss": 2.8378, + "step": 32433 + }, + { + "epoch": 0.1928941859358645, + "grad_norm": 2.237283945083618, + "learning_rate": 4.554866096004619e-05, + "loss": 3.974, + "step": 32434 + }, + { + "epoch": 0.19290013321914548, + "grad_norm": 1.55573308467865, + "learning_rate": 4.5548394913578643e-05, + "loss": 5.4577, + "step": 32435 + }, + { + "epoch": 0.1929060805024265, + "grad_norm": 1.571730613708496, + "learning_rate": 4.554812885993787e-05, + "loss": 4.9712, + "step": 32436 + }, + { + "epoch": 0.19291202778570748, + "grad_norm": 2.1712872982025146, + "learning_rate": 4.554786279912395e-05, + "loss": 4.862, + "step": 32437 + }, + { + "epoch": 0.19291797506898847, + "grad_norm": 2.0782408714294434, + "learning_rate": 4.5547596731137e-05, + "loss": 4.4303, + "step": 32438 + }, + { + "epoch": 0.1929239223522695, + "grad_norm": 1.590576171875, + "learning_rate": 4.554733065597708e-05, + "loss": 4.885, + "step": 32439 + }, + { + "epoch": 0.19292986963555048, + "grad_norm": 1.8684148788452148, + "learning_rate": 4.5547064573644306e-05, + "loss": 5.0804, + "step": 32440 + }, + { + "epoch": 0.19293581691883147, + "grad_norm": 2.731905937194824, + "learning_rate": 4.554679848413876e-05, + "loss": 3.6933, + "step": 32441 + }, + { + "epoch": 0.19294176420211248, + "grad_norm": 3.1184794902801514, + "learning_rate": 4.554653238746055e-05, + "loss": 3.3802, + "step": 32442 + }, + { + "epoch": 0.19294771148539347, + "grad_norm": 1.865108847618103, + "learning_rate": 4.554626628360975e-05, + "loss": 4.882, + "step": 32443 + }, + { + "epoch": 0.19295365876867446, + "grad_norm": 2.3064188957214355, + "learning_rate": 4.554600017258646e-05, + "loss": 4.858, + "step": 32444 + }, + { + "epoch": 0.19295960605195547, + "grad_norm": 2.0467426776885986, + "learning_rate": 4.554573405439078e-05, + "loss": 4.5301, + "step": 32445 + }, + { + "epoch": 0.19296555333523646, + "grad_norm": 1.878140926361084, + "learning_rate": 4.554546792902279e-05, + "loss": 4.7595, + "step": 32446 + }, + { + "epoch": 0.19297150061851745, + "grad_norm": 1.7915738821029663, + "learning_rate": 4.554520179648259e-05, + "loss": 4.4789, + "step": 32447 + }, + { + "epoch": 0.19297744790179847, + "grad_norm": 1.7169902324676514, + "learning_rate": 4.554493565677027e-05, + "loss": 5.3695, + "step": 32448 + }, + { + "epoch": 0.19298339518507945, + "grad_norm": 1.7827154397964478, + "learning_rate": 4.554466950988593e-05, + "loss": 5.4014, + "step": 32449 + }, + { + "epoch": 0.19298934246836044, + "grad_norm": 2.4304897785186768, + "learning_rate": 4.5544403355829656e-05, + "loss": 3.4442, + "step": 32450 + }, + { + "epoch": 0.19299528975164146, + "grad_norm": 2.8224079608917236, + "learning_rate": 4.554413719460154e-05, + "loss": 3.2425, + "step": 32451 + }, + { + "epoch": 0.19300123703492245, + "grad_norm": 2.2338883876800537, + "learning_rate": 4.554387102620169e-05, + "loss": 3.3087, + "step": 32452 + }, + { + "epoch": 0.19300718431820343, + "grad_norm": 1.79100501537323, + "learning_rate": 4.5543604850630174e-05, + "loss": 5.0432, + "step": 32453 + }, + { + "epoch": 0.19301313160148445, + "grad_norm": 1.68960440158844, + "learning_rate": 4.5543338667887104e-05, + "loss": 5.031, + "step": 32454 + }, + { + "epoch": 0.19301907888476544, + "grad_norm": 2.2218265533447266, + "learning_rate": 4.554307247797256e-05, + "loss": 3.4563, + "step": 32455 + }, + { + "epoch": 0.19302502616804643, + "grad_norm": 2.5037896633148193, + "learning_rate": 4.554280628088665e-05, + "loss": 3.4917, + "step": 32456 + }, + { + "epoch": 0.19303097345132744, + "grad_norm": 2.1465871334075928, + "learning_rate": 4.554254007662946e-05, + "loss": 3.3443, + "step": 32457 + }, + { + "epoch": 0.19303692073460843, + "grad_norm": 2.032118320465088, + "learning_rate": 4.554227386520107e-05, + "loss": 3.5841, + "step": 32458 + }, + { + "epoch": 0.19304286801788942, + "grad_norm": 2.566612482070923, + "learning_rate": 4.554200764660159e-05, + "loss": 4.4753, + "step": 32459 + }, + { + "epoch": 0.19304881530117043, + "grad_norm": 3.184678077697754, + "learning_rate": 4.5541741420831105e-05, + "loss": 3.56, + "step": 32460 + }, + { + "epoch": 0.19305476258445142, + "grad_norm": 2.433135986328125, + "learning_rate": 4.554147518788972e-05, + "loss": 3.6974, + "step": 32461 + }, + { + "epoch": 0.1930607098677324, + "grad_norm": 2.502509355545044, + "learning_rate": 4.554120894777751e-05, + "loss": 3.5679, + "step": 32462 + }, + { + "epoch": 0.19306665715101343, + "grad_norm": 2.334136724472046, + "learning_rate": 4.5540942700494585e-05, + "loss": 3.6318, + "step": 32463 + }, + { + "epoch": 0.19307260443429441, + "grad_norm": 2.52958083152771, + "learning_rate": 4.554067644604102e-05, + "loss": 3.372, + "step": 32464 + }, + { + "epoch": 0.1930785517175754, + "grad_norm": 2.6455636024475098, + "learning_rate": 4.554041018441692e-05, + "loss": 3.5264, + "step": 32465 + }, + { + "epoch": 0.19308449900085642, + "grad_norm": 3.023738145828247, + "learning_rate": 4.554014391562237e-05, + "loss": 3.2593, + "step": 32466 + }, + { + "epoch": 0.1930904462841374, + "grad_norm": 2.817189931869507, + "learning_rate": 4.553987763965747e-05, + "loss": 3.199, + "step": 32467 + }, + { + "epoch": 0.1930963935674184, + "grad_norm": 2.676410675048828, + "learning_rate": 4.553961135652232e-05, + "loss": 3.5504, + "step": 32468 + }, + { + "epoch": 0.1931023408506994, + "grad_norm": 2.2987060546875, + "learning_rate": 4.553934506621699e-05, + "loss": 3.4431, + "step": 32469 + }, + { + "epoch": 0.1931082881339804, + "grad_norm": 2.421534538269043, + "learning_rate": 4.5539078768741596e-05, + "loss": 3.6071, + "step": 32470 + }, + { + "epoch": 0.1931142354172614, + "grad_norm": 2.2620744705200195, + "learning_rate": 4.553881246409622e-05, + "loss": 3.5397, + "step": 32471 + }, + { + "epoch": 0.1931201827005424, + "grad_norm": 1.4946821928024292, + "learning_rate": 4.5538546152280956e-05, + "loss": 5.0372, + "step": 32472 + }, + { + "epoch": 0.1931261299838234, + "grad_norm": 2.9421093463897705, + "learning_rate": 4.55382798332959e-05, + "loss": 4.3724, + "step": 32473 + }, + { + "epoch": 0.19313207726710438, + "grad_norm": 2.7655880451202393, + "learning_rate": 4.553801350714114e-05, + "loss": 4.1008, + "step": 32474 + }, + { + "epoch": 0.1931380245503854, + "grad_norm": 2.118710994720459, + "learning_rate": 4.553774717381677e-05, + "loss": 3.538, + "step": 32475 + }, + { + "epoch": 0.19314397183366638, + "grad_norm": 2.1854286193847656, + "learning_rate": 4.5537480833322886e-05, + "loss": 3.5067, + "step": 32476 + }, + { + "epoch": 0.19314991911694737, + "grad_norm": 2.528470039367676, + "learning_rate": 4.553721448565959e-05, + "loss": 3.4925, + "step": 32477 + }, + { + "epoch": 0.1931558664002284, + "grad_norm": 2.235788583755493, + "learning_rate": 4.553694813082695e-05, + "loss": 3.492, + "step": 32478 + }, + { + "epoch": 0.19316181368350938, + "grad_norm": 2.3836355209350586, + "learning_rate": 4.5536681768825076e-05, + "loss": 3.2728, + "step": 32479 + }, + { + "epoch": 0.19316776096679036, + "grad_norm": 2.189574956893921, + "learning_rate": 4.5536415399654066e-05, + "loss": 3.3595, + "step": 32480 + }, + { + "epoch": 0.19317370825007138, + "grad_norm": 2.3354239463806152, + "learning_rate": 4.5536149023314e-05, + "loss": 3.5092, + "step": 32481 + }, + { + "epoch": 0.19317965553335237, + "grad_norm": 2.3847224712371826, + "learning_rate": 4.553588263980498e-05, + "loss": 3.4803, + "step": 32482 + }, + { + "epoch": 0.19318560281663336, + "grad_norm": 2.399078845977783, + "learning_rate": 4.553561624912709e-05, + "loss": 3.5816, + "step": 32483 + }, + { + "epoch": 0.19319155009991437, + "grad_norm": 2.423222064971924, + "learning_rate": 4.553534985128043e-05, + "loss": 3.5522, + "step": 32484 + }, + { + "epoch": 0.19319749738319536, + "grad_norm": 2.08549427986145, + "learning_rate": 4.553508344626509e-05, + "loss": 3.5339, + "step": 32485 + }, + { + "epoch": 0.19320344466647635, + "grad_norm": 1.9425020217895508, + "learning_rate": 4.553481703408118e-05, + "loss": 3.6995, + "step": 32486 + }, + { + "epoch": 0.19320939194975736, + "grad_norm": 1.8637107610702515, + "learning_rate": 4.553455061472876e-05, + "loss": 5.0999, + "step": 32487 + }, + { + "epoch": 0.19321533923303835, + "grad_norm": 1.5940055847167969, + "learning_rate": 4.553428418820794e-05, + "loss": 5.0079, + "step": 32488 + }, + { + "epoch": 0.19322128651631934, + "grad_norm": 1.6175649166107178, + "learning_rate": 4.553401775451882e-05, + "loss": 5.2676, + "step": 32489 + }, + { + "epoch": 0.19322723379960036, + "grad_norm": 1.4351513385772705, + "learning_rate": 4.553375131366149e-05, + "loss": 5.1476, + "step": 32490 + }, + { + "epoch": 0.19323318108288134, + "grad_norm": 1.4783397912979126, + "learning_rate": 4.553348486563603e-05, + "loss": 5.2757, + "step": 32491 + }, + { + "epoch": 0.19323912836616233, + "grad_norm": 1.5456229448318481, + "learning_rate": 4.5533218410442556e-05, + "loss": 5.1947, + "step": 32492 + }, + { + "epoch": 0.19324507564944332, + "grad_norm": 1.7031913995742798, + "learning_rate": 4.553295194808114e-05, + "loss": 4.8797, + "step": 32493 + }, + { + "epoch": 0.19325102293272434, + "grad_norm": 2.224454879760742, + "learning_rate": 4.553268547855188e-05, + "loss": 4.7078, + "step": 32494 + }, + { + "epoch": 0.19325697021600532, + "grad_norm": 2.446502685546875, + "learning_rate": 4.553241900185488e-05, + "loss": 4.085, + "step": 32495 + }, + { + "epoch": 0.1932629174992863, + "grad_norm": 1.8770337104797363, + "learning_rate": 4.553215251799021e-05, + "loss": 5.3268, + "step": 32496 + }, + { + "epoch": 0.19326886478256733, + "grad_norm": 2.499891996383667, + "learning_rate": 4.5531886026957994e-05, + "loss": 4.6427, + "step": 32497 + }, + { + "epoch": 0.19327481206584832, + "grad_norm": 2.7995948791503906, + "learning_rate": 4.553161952875829e-05, + "loss": 3.4704, + "step": 32498 + }, + { + "epoch": 0.1932807593491293, + "grad_norm": 2.4758827686309814, + "learning_rate": 4.553135302339123e-05, + "loss": 3.6428, + "step": 32499 + }, + { + "epoch": 0.19328670663241032, + "grad_norm": 1.7038213014602661, + "learning_rate": 4.553108651085688e-05, + "loss": 5.3407, + "step": 32500 + }, + { + "epoch": 0.1932926539156913, + "grad_norm": 1.8451898097991943, + "learning_rate": 4.5530819991155325e-05, + "loss": 5.1482, + "step": 32501 + }, + { + "epoch": 0.1932986011989723, + "grad_norm": 3.1001696586608887, + "learning_rate": 4.553055346428669e-05, + "loss": 3.8696, + "step": 32502 + }, + { + "epoch": 0.1933045484822533, + "grad_norm": 3.6699612140655518, + "learning_rate": 4.553028693025105e-05, + "loss": 4.1102, + "step": 32503 + }, + { + "epoch": 0.1933104957655343, + "grad_norm": 2.5202810764312744, + "learning_rate": 4.553002038904849e-05, + "loss": 4.1498, + "step": 32504 + }, + { + "epoch": 0.1933164430488153, + "grad_norm": 1.6970324516296387, + "learning_rate": 4.552975384067912e-05, + "loss": 4.9378, + "step": 32505 + }, + { + "epoch": 0.1933223903320963, + "grad_norm": 1.8077476024627686, + "learning_rate": 4.5529487285143026e-05, + "loss": 5.3876, + "step": 32506 + }, + { + "epoch": 0.1933283376153773, + "grad_norm": 1.61594820022583, + "learning_rate": 4.552922072244029e-05, + "loss": 5.316, + "step": 32507 + }, + { + "epoch": 0.19333428489865828, + "grad_norm": 1.7117811441421509, + "learning_rate": 4.552895415257102e-05, + "loss": 5.1787, + "step": 32508 + }, + { + "epoch": 0.1933402321819393, + "grad_norm": 1.8115290403366089, + "learning_rate": 4.5528687575535314e-05, + "loss": 5.347, + "step": 32509 + }, + { + "epoch": 0.19334617946522029, + "grad_norm": 1.6437400579452515, + "learning_rate": 4.552842099133324e-05, + "loss": 5.4069, + "step": 32510 + }, + { + "epoch": 0.19335212674850127, + "grad_norm": 1.8343757390975952, + "learning_rate": 4.5528154399964915e-05, + "loss": 4.6641, + "step": 32511 + }, + { + "epoch": 0.1933580740317823, + "grad_norm": 1.716610312461853, + "learning_rate": 4.552788780143042e-05, + "loss": 5.2652, + "step": 32512 + }, + { + "epoch": 0.19336402131506328, + "grad_norm": 1.73993980884552, + "learning_rate": 4.552762119572985e-05, + "loss": 5.1183, + "step": 32513 + }, + { + "epoch": 0.19336996859834427, + "grad_norm": 1.729629635810852, + "learning_rate": 4.55273545828633e-05, + "loss": 5.1342, + "step": 32514 + }, + { + "epoch": 0.19337591588162528, + "grad_norm": 1.6299128532409668, + "learning_rate": 4.552708796283087e-05, + "loss": 5.1742, + "step": 32515 + }, + { + "epoch": 0.19338186316490627, + "grad_norm": 2.0269429683685303, + "learning_rate": 4.552682133563264e-05, + "loss": 4.0218, + "step": 32516 + }, + { + "epoch": 0.19338781044818726, + "grad_norm": 2.795447826385498, + "learning_rate": 4.552655470126871e-05, + "loss": 3.5319, + "step": 32517 + }, + { + "epoch": 0.19339375773146827, + "grad_norm": 2.5553972721099854, + "learning_rate": 4.552628805973917e-05, + "loss": 3.6563, + "step": 32518 + }, + { + "epoch": 0.19339970501474926, + "grad_norm": 2.5591487884521484, + "learning_rate": 4.552602141104412e-05, + "loss": 3.5933, + "step": 32519 + }, + { + "epoch": 0.19340565229803025, + "grad_norm": 2.5582263469696045, + "learning_rate": 4.552575475518364e-05, + "loss": 3.325, + "step": 32520 + }, + { + "epoch": 0.19341159958131127, + "grad_norm": 1.9097342491149902, + "learning_rate": 4.552548809215784e-05, + "loss": 3.7953, + "step": 32521 + }, + { + "epoch": 0.19341754686459225, + "grad_norm": 1.9214484691619873, + "learning_rate": 4.552522142196679e-05, + "loss": 4.5439, + "step": 32522 + }, + { + "epoch": 0.19342349414787324, + "grad_norm": 2.408139944076538, + "learning_rate": 4.5524954744610614e-05, + "loss": 3.5626, + "step": 32523 + }, + { + "epoch": 0.19342944143115426, + "grad_norm": 2.6337690353393555, + "learning_rate": 4.552468806008938e-05, + "loss": 3.5864, + "step": 32524 + }, + { + "epoch": 0.19343538871443525, + "grad_norm": 2.5147154331207275, + "learning_rate": 4.552442136840319e-05, + "loss": 3.669, + "step": 32525 + }, + { + "epoch": 0.19344133599771623, + "grad_norm": 2.5548198223114014, + "learning_rate": 4.5524154669552136e-05, + "loss": 3.502, + "step": 32526 + }, + { + "epoch": 0.19344728328099725, + "grad_norm": 2.8102235794067383, + "learning_rate": 4.5523887963536316e-05, + "loss": 3.6769, + "step": 32527 + }, + { + "epoch": 0.19345323056427824, + "grad_norm": 2.849118947982788, + "learning_rate": 4.552362125035581e-05, + "loss": 3.3322, + "step": 32528 + }, + { + "epoch": 0.19345917784755923, + "grad_norm": 3.095203161239624, + "learning_rate": 4.552335453001073e-05, + "loss": 2.9217, + "step": 32529 + }, + { + "epoch": 0.19346512513084024, + "grad_norm": 2.7572739124298096, + "learning_rate": 4.5523087802501155e-05, + "loss": 3.0508, + "step": 32530 + }, + { + "epoch": 0.19347107241412123, + "grad_norm": 2.872610330581665, + "learning_rate": 4.5522821067827174e-05, + "loss": 3.501, + "step": 32531 + }, + { + "epoch": 0.19347701969740222, + "grad_norm": 2.6060242652893066, + "learning_rate": 4.5522554325988894e-05, + "loss": 3.0258, + "step": 32532 + }, + { + "epoch": 0.19348296698068324, + "grad_norm": 2.3665926456451416, + "learning_rate": 4.552228757698641e-05, + "loss": 3.551, + "step": 32533 + }, + { + "epoch": 0.19348891426396422, + "grad_norm": 2.1231276988983154, + "learning_rate": 4.55220208208198e-05, + "loss": 5.2629, + "step": 32534 + }, + { + "epoch": 0.1934948615472452, + "grad_norm": 1.810520052909851, + "learning_rate": 4.5521754057489166e-05, + "loss": 5.1596, + "step": 32535 + }, + { + "epoch": 0.19350080883052623, + "grad_norm": 2.078847885131836, + "learning_rate": 4.55214872869946e-05, + "loss": 5.1359, + "step": 32536 + }, + { + "epoch": 0.19350675611380722, + "grad_norm": 1.823213815689087, + "learning_rate": 4.5521220509336194e-05, + "loss": 5.3621, + "step": 32537 + }, + { + "epoch": 0.1935127033970882, + "grad_norm": 1.9431284666061401, + "learning_rate": 4.5520953724514034e-05, + "loss": 5.1437, + "step": 32538 + }, + { + "epoch": 0.19351865068036922, + "grad_norm": 2.144991636276245, + "learning_rate": 4.552068693252823e-05, + "loss": 4.718, + "step": 32539 + }, + { + "epoch": 0.1935245979636502, + "grad_norm": 1.8919559717178345, + "learning_rate": 4.552042013337887e-05, + "loss": 5.3912, + "step": 32540 + }, + { + "epoch": 0.1935305452469312, + "grad_norm": 2.217273473739624, + "learning_rate": 4.552015332706604e-05, + "loss": 5.0869, + "step": 32541 + }, + { + "epoch": 0.1935364925302122, + "grad_norm": 1.923957109451294, + "learning_rate": 4.5519886513589835e-05, + "loss": 4.5278, + "step": 32542 + }, + { + "epoch": 0.1935424398134932, + "grad_norm": 1.3886886835098267, + "learning_rate": 4.551961969295035e-05, + "loss": 5.2535, + "step": 32543 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 1.6783514022827148, + "learning_rate": 4.551935286514768e-05, + "loss": 5.23, + "step": 32544 + }, + { + "epoch": 0.1935543343800552, + "grad_norm": 2.014293670654297, + "learning_rate": 4.551908603018191e-05, + "loss": 4.3551, + "step": 32545 + }, + { + "epoch": 0.1935602816633362, + "grad_norm": 2.4212424755096436, + "learning_rate": 4.551881918805314e-05, + "loss": 4.1387, + "step": 32546 + }, + { + "epoch": 0.19356622894661718, + "grad_norm": 1.541458010673523, + "learning_rate": 4.5518552338761466e-05, + "loss": 5.1392, + "step": 32547 + }, + { + "epoch": 0.1935721762298982, + "grad_norm": 1.645020604133606, + "learning_rate": 4.551828548230698e-05, + "loss": 4.9932, + "step": 32548 + }, + { + "epoch": 0.19357812351317918, + "grad_norm": 1.833045244216919, + "learning_rate": 4.551801861868977e-05, + "loss": 4.5334, + "step": 32549 + }, + { + "epoch": 0.19358407079646017, + "grad_norm": 1.7046024799346924, + "learning_rate": 4.5517751747909925e-05, + "loss": 4.718, + "step": 32550 + }, + { + "epoch": 0.19359001807974116, + "grad_norm": 1.642063856124878, + "learning_rate": 4.551748486996755e-05, + "loss": 5.3216, + "step": 32551 + }, + { + "epoch": 0.19359596536302218, + "grad_norm": 1.8512293100357056, + "learning_rate": 4.5517217984862736e-05, + "loss": 4.2428, + "step": 32552 + }, + { + "epoch": 0.19360191264630316, + "grad_norm": 1.4418857097625732, + "learning_rate": 4.551695109259557e-05, + "loss": 5.1654, + "step": 32553 + }, + { + "epoch": 0.19360785992958415, + "grad_norm": 1.3343303203582764, + "learning_rate": 4.551668419316615e-05, + "loss": 4.236, + "step": 32554 + }, + { + "epoch": 0.19361380721286517, + "grad_norm": 1.4303019046783447, + "learning_rate": 4.5516417286574567e-05, + "loss": 5.2193, + "step": 32555 + }, + { + "epoch": 0.19361975449614616, + "grad_norm": 2.1168198585510254, + "learning_rate": 4.551615037282091e-05, + "loss": 4.3826, + "step": 32556 + }, + { + "epoch": 0.19362570177942715, + "grad_norm": 1.8565425872802734, + "learning_rate": 4.551588345190528e-05, + "loss": 4.4285, + "step": 32557 + }, + { + "epoch": 0.19363164906270816, + "grad_norm": 1.4676978588104248, + "learning_rate": 4.551561652382777e-05, + "loss": 5.1846, + "step": 32558 + }, + { + "epoch": 0.19363759634598915, + "grad_norm": 1.3559863567352295, + "learning_rate": 4.5515349588588465e-05, + "loss": 5.4113, + "step": 32559 + }, + { + "epoch": 0.19364354362927014, + "grad_norm": 1.5853360891342163, + "learning_rate": 4.551508264618747e-05, + "loss": 5.0185, + "step": 32560 + }, + { + "epoch": 0.19364949091255115, + "grad_norm": 1.5268198251724243, + "learning_rate": 4.551481569662487e-05, + "loss": 5.5332, + "step": 32561 + }, + { + "epoch": 0.19365543819583214, + "grad_norm": 1.873355507850647, + "learning_rate": 4.551454873990075e-05, + "loss": 4.9289, + "step": 32562 + }, + { + "epoch": 0.19366138547911313, + "grad_norm": 1.559546709060669, + "learning_rate": 4.5514281776015225e-05, + "loss": 4.7161, + "step": 32563 + }, + { + "epoch": 0.19366733276239415, + "grad_norm": 1.5937471389770508, + "learning_rate": 4.551401480496837e-05, + "loss": 5.2034, + "step": 32564 + }, + { + "epoch": 0.19367328004567513, + "grad_norm": 1.3408461809158325, + "learning_rate": 4.551374782676029e-05, + "loss": 5.4465, + "step": 32565 + }, + { + "epoch": 0.19367922732895612, + "grad_norm": 1.5724444389343262, + "learning_rate": 4.551348084139107e-05, + "loss": 4.877, + "step": 32566 + }, + { + "epoch": 0.19368517461223714, + "grad_norm": 1.6047838926315308, + "learning_rate": 4.55132138488608e-05, + "loss": 4.8731, + "step": 32567 + }, + { + "epoch": 0.19369112189551813, + "grad_norm": 1.4955263137817383, + "learning_rate": 4.5512946849169586e-05, + "loss": 5.1981, + "step": 32568 + }, + { + "epoch": 0.19369706917879911, + "grad_norm": 1.8634296655654907, + "learning_rate": 4.5512679842317506e-05, + "loss": 4.9656, + "step": 32569 + }, + { + "epoch": 0.19370301646208013, + "grad_norm": 1.7974209785461426, + "learning_rate": 4.551241282830467e-05, + "loss": 5.0912, + "step": 32570 + }, + { + "epoch": 0.19370896374536112, + "grad_norm": 1.4037576913833618, + "learning_rate": 4.551214580713116e-05, + "loss": 5.0925, + "step": 32571 + }, + { + "epoch": 0.1937149110286421, + "grad_norm": 1.3789407014846802, + "learning_rate": 4.551187877879707e-05, + "loss": 4.9709, + "step": 32572 + }, + { + "epoch": 0.19372085831192312, + "grad_norm": 1.8228954076766968, + "learning_rate": 4.5511611743302504e-05, + "loss": 5.0125, + "step": 32573 + }, + { + "epoch": 0.1937268055952041, + "grad_norm": 1.6477984189987183, + "learning_rate": 4.5511344700647534e-05, + "loss": 4.6782, + "step": 32574 + }, + { + "epoch": 0.1937327528784851, + "grad_norm": 1.5533115863800049, + "learning_rate": 4.5511077650832275e-05, + "loss": 5.1811, + "step": 32575 + }, + { + "epoch": 0.19373870016176611, + "grad_norm": 1.4982826709747314, + "learning_rate": 4.5510810593856804e-05, + "loss": 5.4532, + "step": 32576 + }, + { + "epoch": 0.1937446474450471, + "grad_norm": 1.2856029272079468, + "learning_rate": 4.551054352972122e-05, + "loss": 5.3471, + "step": 32577 + }, + { + "epoch": 0.1937505947283281, + "grad_norm": 1.7265102863311768, + "learning_rate": 4.551027645842562e-05, + "loss": 4.7828, + "step": 32578 + }, + { + "epoch": 0.1937565420116091, + "grad_norm": 1.6499298810958862, + "learning_rate": 4.55100093799701e-05, + "loss": 5.0823, + "step": 32579 + }, + { + "epoch": 0.1937624892948901, + "grad_norm": 1.6047910451889038, + "learning_rate": 4.550974229435474e-05, + "loss": 4.8312, + "step": 32580 + }, + { + "epoch": 0.19376843657817108, + "grad_norm": 1.4857003688812256, + "learning_rate": 4.5509475201579645e-05, + "loss": 5.0103, + "step": 32581 + }, + { + "epoch": 0.1937743838614521, + "grad_norm": 1.4241074323654175, + "learning_rate": 4.55092081016449e-05, + "loss": 4.994, + "step": 32582 + }, + { + "epoch": 0.1937803311447331, + "grad_norm": 1.4176710844039917, + "learning_rate": 4.55089409945506e-05, + "loss": 5.0398, + "step": 32583 + }, + { + "epoch": 0.19378627842801407, + "grad_norm": 1.2380996942520142, + "learning_rate": 4.5508673880296846e-05, + "loss": 5.2739, + "step": 32584 + }, + { + "epoch": 0.1937922257112951, + "grad_norm": 1.4685777425765991, + "learning_rate": 4.550840675888373e-05, + "loss": 5.5471, + "step": 32585 + }, + { + "epoch": 0.19379817299457608, + "grad_norm": 2.9389889240264893, + "learning_rate": 4.5508139630311333e-05, + "loss": 4.1122, + "step": 32586 + }, + { + "epoch": 0.19380412027785707, + "grad_norm": 2.7261459827423096, + "learning_rate": 4.550787249457976e-05, + "loss": 4.0543, + "step": 32587 + }, + { + "epoch": 0.19381006756113808, + "grad_norm": 2.227731704711914, + "learning_rate": 4.55076053516891e-05, + "loss": 3.5322, + "step": 32588 + }, + { + "epoch": 0.19381601484441907, + "grad_norm": 2.80881929397583, + "learning_rate": 4.550733820163945e-05, + "loss": 3.7871, + "step": 32589 + }, + { + "epoch": 0.19382196212770006, + "grad_norm": 1.8524302244186401, + "learning_rate": 4.5507071044430894e-05, + "loss": 5.2918, + "step": 32590 + }, + { + "epoch": 0.19382790941098108, + "grad_norm": 1.3771488666534424, + "learning_rate": 4.550680388006353e-05, + "loss": 5.3438, + "step": 32591 + }, + { + "epoch": 0.19383385669426206, + "grad_norm": 2.4697203636169434, + "learning_rate": 4.5506536708537464e-05, + "loss": 4.6906, + "step": 32592 + }, + { + "epoch": 0.19383980397754305, + "grad_norm": 1.4540528059005737, + "learning_rate": 4.550626952985276e-05, + "loss": 5.2692, + "step": 32593 + }, + { + "epoch": 0.19384575126082407, + "grad_norm": 1.4477177858352661, + "learning_rate": 4.550600234400954e-05, + "loss": 5.2064, + "step": 32594 + }, + { + "epoch": 0.19385169854410506, + "grad_norm": 1.533417820930481, + "learning_rate": 4.550573515100789e-05, + "loss": 5.235, + "step": 32595 + }, + { + "epoch": 0.19385764582738604, + "grad_norm": 2.2805964946746826, + "learning_rate": 4.5505467950847904e-05, + "loss": 3.8858, + "step": 32596 + }, + { + "epoch": 0.19386359311066706, + "grad_norm": 2.02608323097229, + "learning_rate": 4.550520074352966e-05, + "loss": 4.1653, + "step": 32597 + }, + { + "epoch": 0.19386954039394805, + "grad_norm": 1.5796022415161133, + "learning_rate": 4.5504933529053264e-05, + "loss": 5.434, + "step": 32598 + }, + { + "epoch": 0.19387548767722904, + "grad_norm": 1.2967963218688965, + "learning_rate": 4.550466630741881e-05, + "loss": 5.5094, + "step": 32599 + }, + { + "epoch": 0.19388143496051005, + "grad_norm": 1.4472898244857788, + "learning_rate": 4.5504399078626384e-05, + "loss": 5.5817, + "step": 32600 + }, + { + "epoch": 0.19388738224379104, + "grad_norm": 1.9111692905426025, + "learning_rate": 4.550413184267609e-05, + "loss": 5.2645, + "step": 32601 + }, + { + "epoch": 0.19389332952707203, + "grad_norm": 2.3285624980926514, + "learning_rate": 4.5503864599568014e-05, + "loss": 4.6221, + "step": 32602 + }, + { + "epoch": 0.19389927681035304, + "grad_norm": 1.628674864768982, + "learning_rate": 4.550359734930225e-05, + "loss": 4.8921, + "step": 32603 + }, + { + "epoch": 0.19390522409363403, + "grad_norm": 1.6365528106689453, + "learning_rate": 4.550333009187889e-05, + "loss": 4.9367, + "step": 32604 + }, + { + "epoch": 0.19391117137691502, + "grad_norm": 1.8951784372329712, + "learning_rate": 4.5503062827298026e-05, + "loss": 4.7174, + "step": 32605 + }, + { + "epoch": 0.19391711866019604, + "grad_norm": 1.7012661695480347, + "learning_rate": 4.550279555555976e-05, + "loss": 5.0791, + "step": 32606 + }, + { + "epoch": 0.19392306594347702, + "grad_norm": 1.5482909679412842, + "learning_rate": 4.550252827666418e-05, + "loss": 5.3443, + "step": 32607 + }, + { + "epoch": 0.193929013226758, + "grad_norm": 1.7100435495376587, + "learning_rate": 4.5502260990611385e-05, + "loss": 5.3665, + "step": 32608 + }, + { + "epoch": 0.193934960510039, + "grad_norm": 1.7301504611968994, + "learning_rate": 4.550199369740146e-05, + "loss": 5.2521, + "step": 32609 + }, + { + "epoch": 0.19394090779332002, + "grad_norm": 1.4652512073516846, + "learning_rate": 4.550172639703449e-05, + "loss": 5.3211, + "step": 32610 + }, + { + "epoch": 0.193946855076601, + "grad_norm": 1.6663676500320435, + "learning_rate": 4.550145908951059e-05, + "loss": 4.8544, + "step": 32611 + }, + { + "epoch": 0.193952802359882, + "grad_norm": 2.0611562728881836, + "learning_rate": 4.550119177482983e-05, + "loss": 4.3234, + "step": 32612 + }, + { + "epoch": 0.193958749643163, + "grad_norm": 1.612246036529541, + "learning_rate": 4.5500924452992335e-05, + "loss": 5.1136, + "step": 32613 + }, + { + "epoch": 0.193964696926444, + "grad_norm": 1.622652530670166, + "learning_rate": 4.550065712399816e-05, + "loss": 4.9411, + "step": 32614 + }, + { + "epoch": 0.19397064420972498, + "grad_norm": 1.7339041233062744, + "learning_rate": 4.5500389787847434e-05, + "loss": 4.7983, + "step": 32615 + }, + { + "epoch": 0.193976591493006, + "grad_norm": 1.4528504610061646, + "learning_rate": 4.550012244454022e-05, + "loss": 5.2277, + "step": 32616 + }, + { + "epoch": 0.193982538776287, + "grad_norm": 1.3635590076446533, + "learning_rate": 4.5499855094076634e-05, + "loss": 5.1081, + "step": 32617 + }, + { + "epoch": 0.19398848605956798, + "grad_norm": 1.7279419898986816, + "learning_rate": 4.549958773645676e-05, + "loss": 5.1649, + "step": 32618 + }, + { + "epoch": 0.193994433342849, + "grad_norm": 1.3046659231185913, + "learning_rate": 4.549932037168069e-05, + "loss": 5.1288, + "step": 32619 + }, + { + "epoch": 0.19400038062612998, + "grad_norm": 1.3744393587112427, + "learning_rate": 4.549905299974852e-05, + "loss": 5.1108, + "step": 32620 + }, + { + "epoch": 0.19400632790941097, + "grad_norm": 1.4627171754837036, + "learning_rate": 4.5498785620660334e-05, + "loss": 5.1959, + "step": 32621 + }, + { + "epoch": 0.19401227519269199, + "grad_norm": 1.3577818870544434, + "learning_rate": 4.549851823441624e-05, + "loss": 5.3846, + "step": 32622 + }, + { + "epoch": 0.19401822247597297, + "grad_norm": 1.5022718906402588, + "learning_rate": 4.5498250841016324e-05, + "loss": 5.1238, + "step": 32623 + }, + { + "epoch": 0.19402416975925396, + "grad_norm": 1.5609856843948364, + "learning_rate": 4.549798344046068e-05, + "loss": 5.0529, + "step": 32624 + }, + { + "epoch": 0.19403011704253498, + "grad_norm": 1.523977279663086, + "learning_rate": 4.5497716032749404e-05, + "loss": 5.1331, + "step": 32625 + }, + { + "epoch": 0.19403606432581597, + "grad_norm": 1.4886420965194702, + "learning_rate": 4.549744861788259e-05, + "loss": 5.1282, + "step": 32626 + }, + { + "epoch": 0.19404201160909695, + "grad_norm": 2.039761781692505, + "learning_rate": 4.549718119586032e-05, + "loss": 3.9558, + "step": 32627 + }, + { + "epoch": 0.19404795889237797, + "grad_norm": 3.320199728012085, + "learning_rate": 4.54969137666827e-05, + "loss": 2.5416, + "step": 32628 + }, + { + "epoch": 0.19405390617565896, + "grad_norm": 2.6014201641082764, + "learning_rate": 4.549664633034982e-05, + "loss": 3.4481, + "step": 32629 + }, + { + "epoch": 0.19405985345893995, + "grad_norm": 2.4465911388397217, + "learning_rate": 4.549637888686177e-05, + "loss": 2.8785, + "step": 32630 + }, + { + "epoch": 0.19406580074222096, + "grad_norm": 2.668752670288086, + "learning_rate": 4.549611143621865e-05, + "loss": 3.2297, + "step": 32631 + }, + { + "epoch": 0.19407174802550195, + "grad_norm": 2.3197052478790283, + "learning_rate": 4.549584397842055e-05, + "loss": 3.6405, + "step": 32632 + }, + { + "epoch": 0.19407769530878294, + "grad_norm": 1.5895888805389404, + "learning_rate": 4.549557651346756e-05, + "loss": 4.9316, + "step": 32633 + }, + { + "epoch": 0.19408364259206395, + "grad_norm": 1.7346808910369873, + "learning_rate": 4.5495309041359774e-05, + "loss": 4.9319, + "step": 32634 + }, + { + "epoch": 0.19408958987534494, + "grad_norm": 1.7234487533569336, + "learning_rate": 4.549504156209729e-05, + "loss": 5.0863, + "step": 32635 + }, + { + "epoch": 0.19409553715862593, + "grad_norm": 2.0025033950805664, + "learning_rate": 4.5494774075680204e-05, + "loss": 5.3009, + "step": 32636 + }, + { + "epoch": 0.19410148444190695, + "grad_norm": 1.7096216678619385, + "learning_rate": 4.549450658210859e-05, + "loss": 5.3195, + "step": 32637 + }, + { + "epoch": 0.19410743172518793, + "grad_norm": 2.219808578491211, + "learning_rate": 4.549423908138257e-05, + "loss": 4.5839, + "step": 32638 + }, + { + "epoch": 0.19411337900846892, + "grad_norm": 1.8379898071289062, + "learning_rate": 4.549397157350221e-05, + "loss": 4.5265, + "step": 32639 + }, + { + "epoch": 0.19411932629174994, + "grad_norm": 2.1760711669921875, + "learning_rate": 4.549370405846762e-05, + "loss": 4.161, + "step": 32640 + }, + { + "epoch": 0.19412527357503093, + "grad_norm": 1.9466861486434937, + "learning_rate": 4.54934365362789e-05, + "loss": 4.6235, + "step": 32641 + }, + { + "epoch": 0.19413122085831191, + "grad_norm": 1.8308939933776855, + "learning_rate": 4.549316900693612e-05, + "loss": 4.6028, + "step": 32642 + }, + { + "epoch": 0.19413716814159293, + "grad_norm": 1.7800029516220093, + "learning_rate": 4.54929014704394e-05, + "loss": 4.721, + "step": 32643 + }, + { + "epoch": 0.19414311542487392, + "grad_norm": 2.0616424083709717, + "learning_rate": 4.5492633926788806e-05, + "loss": 4.4319, + "step": 32644 + }, + { + "epoch": 0.1941490627081549, + "grad_norm": 1.8670353889465332, + "learning_rate": 4.549236637598445e-05, + "loss": 4.2249, + "step": 32645 + }, + { + "epoch": 0.19415500999143592, + "grad_norm": 1.6615244150161743, + "learning_rate": 4.5492098818026425e-05, + "loss": 4.1405, + "step": 32646 + }, + { + "epoch": 0.1941609572747169, + "grad_norm": 1.727453589439392, + "learning_rate": 4.549183125291481e-05, + "loss": 4.2608, + "step": 32647 + }, + { + "epoch": 0.1941669045579979, + "grad_norm": 1.7687768936157227, + "learning_rate": 4.549156368064972e-05, + "loss": 4.2649, + "step": 32648 + }, + { + "epoch": 0.19417285184127892, + "grad_norm": 1.9458198547363281, + "learning_rate": 4.549129610123123e-05, + "loss": 4.6472, + "step": 32649 + }, + { + "epoch": 0.1941787991245599, + "grad_norm": 1.8845311403274536, + "learning_rate": 4.549102851465944e-05, + "loss": 4.5324, + "step": 32650 + }, + { + "epoch": 0.1941847464078409, + "grad_norm": 1.973351240158081, + "learning_rate": 4.549076092093445e-05, + "loss": 4.0662, + "step": 32651 + }, + { + "epoch": 0.1941906936911219, + "grad_norm": 1.5977756977081299, + "learning_rate": 4.549049332005634e-05, + "loss": 4.4065, + "step": 32652 + }, + { + "epoch": 0.1941966409744029, + "grad_norm": 1.5310672521591187, + "learning_rate": 4.5490225712025215e-05, + "loss": 4.3943, + "step": 32653 + }, + { + "epoch": 0.19420258825768388, + "grad_norm": 1.4160810708999634, + "learning_rate": 4.548995809684116e-05, + "loss": 4.4643, + "step": 32654 + }, + { + "epoch": 0.1942085355409649, + "grad_norm": 1.9193739891052246, + "learning_rate": 4.548969047450428e-05, + "loss": 4.5345, + "step": 32655 + }, + { + "epoch": 0.1942144828242459, + "grad_norm": 1.8352816104888916, + "learning_rate": 4.548942284501465e-05, + "loss": 4.6301, + "step": 32656 + }, + { + "epoch": 0.19422043010752688, + "grad_norm": 1.87077796459198, + "learning_rate": 4.5489155208372384e-05, + "loss": 4.4388, + "step": 32657 + }, + { + "epoch": 0.1942263773908079, + "grad_norm": 1.719300389289856, + "learning_rate": 4.5488887564577555e-05, + "loss": 4.5482, + "step": 32658 + }, + { + "epoch": 0.19423232467408888, + "grad_norm": 1.7464433908462524, + "learning_rate": 4.548861991363028e-05, + "loss": 4.5615, + "step": 32659 + }, + { + "epoch": 0.19423827195736987, + "grad_norm": 2.0196592807769775, + "learning_rate": 4.548835225553063e-05, + "loss": 4.7149, + "step": 32660 + }, + { + "epoch": 0.19424421924065088, + "grad_norm": 1.725612998008728, + "learning_rate": 4.548808459027871e-05, + "loss": 4.4756, + "step": 32661 + }, + { + "epoch": 0.19425016652393187, + "grad_norm": 2.12505841255188, + "learning_rate": 4.548781691787461e-05, + "loss": 4.4415, + "step": 32662 + }, + { + "epoch": 0.19425611380721286, + "grad_norm": 1.8461819887161255, + "learning_rate": 4.548754923831843e-05, + "loss": 4.7161, + "step": 32663 + }, + { + "epoch": 0.19426206109049388, + "grad_norm": 1.680954933166504, + "learning_rate": 4.548728155161025e-05, + "loss": 4.3207, + "step": 32664 + }, + { + "epoch": 0.19426800837377486, + "grad_norm": 1.8344814777374268, + "learning_rate": 4.548701385775018e-05, + "loss": 4.0626, + "step": 32665 + }, + { + "epoch": 0.19427395565705585, + "grad_norm": 1.6902893781661987, + "learning_rate": 4.54867461567383e-05, + "loss": 4.7755, + "step": 32666 + }, + { + "epoch": 0.19427990294033684, + "grad_norm": 1.5608021020889282, + "learning_rate": 4.548647844857471e-05, + "loss": 4.8721, + "step": 32667 + }, + { + "epoch": 0.19428585022361786, + "grad_norm": 1.6685339212417603, + "learning_rate": 4.54862107332595e-05, + "loss": 4.6452, + "step": 32668 + }, + { + "epoch": 0.19429179750689884, + "grad_norm": 1.8371236324310303, + "learning_rate": 4.548594301079277e-05, + "loss": 4.4041, + "step": 32669 + }, + { + "epoch": 0.19429774479017983, + "grad_norm": 1.7567802667617798, + "learning_rate": 4.548567528117461e-05, + "loss": 4.4144, + "step": 32670 + }, + { + "epoch": 0.19430369207346085, + "grad_norm": 1.7036006450653076, + "learning_rate": 4.54854075444051e-05, + "loss": 4.5322, + "step": 32671 + }, + { + "epoch": 0.19430963935674184, + "grad_norm": 2.345913887023926, + "learning_rate": 4.5485139800484356e-05, + "loss": 4.0522, + "step": 32672 + }, + { + "epoch": 0.19431558664002282, + "grad_norm": 2.7532145977020264, + "learning_rate": 4.5484872049412465e-05, + "loss": 3.836, + "step": 32673 + }, + { + "epoch": 0.19432153392330384, + "grad_norm": 2.086850166320801, + "learning_rate": 4.548460429118951e-05, + "loss": 4.1181, + "step": 32674 + }, + { + "epoch": 0.19432748120658483, + "grad_norm": 2.241453170776367, + "learning_rate": 4.5484336525815596e-05, + "loss": 3.7434, + "step": 32675 + }, + { + "epoch": 0.19433342848986582, + "grad_norm": 2.2940404415130615, + "learning_rate": 4.548406875329081e-05, + "loss": 3.6265, + "step": 32676 + }, + { + "epoch": 0.19433937577314683, + "grad_norm": 2.1785483360290527, + "learning_rate": 4.5483800973615245e-05, + "loss": 3.4999, + "step": 32677 + }, + { + "epoch": 0.19434532305642782, + "grad_norm": 1.7296667098999023, + "learning_rate": 4.5483533186788996e-05, + "loss": 4.0844, + "step": 32678 + }, + { + "epoch": 0.1943512703397088, + "grad_norm": 1.4519742727279663, + "learning_rate": 4.5483265392812156e-05, + "loss": 4.9145, + "step": 32679 + }, + { + "epoch": 0.19435721762298983, + "grad_norm": 1.702333927154541, + "learning_rate": 4.5482997591684826e-05, + "loss": 4.7128, + "step": 32680 + }, + { + "epoch": 0.1943631649062708, + "grad_norm": 1.63100266456604, + "learning_rate": 4.548272978340709e-05, + "loss": 4.5922, + "step": 32681 + }, + { + "epoch": 0.1943691121895518, + "grad_norm": 1.5324856042861938, + "learning_rate": 4.548246196797904e-05, + "loss": 5.0184, + "step": 32682 + }, + { + "epoch": 0.19437505947283282, + "grad_norm": 1.7499281167984009, + "learning_rate": 4.548219414540078e-05, + "loss": 4.9284, + "step": 32683 + }, + { + "epoch": 0.1943810067561138, + "grad_norm": 1.6011301279067993, + "learning_rate": 4.5481926315672395e-05, + "loss": 4.8825, + "step": 32684 + }, + { + "epoch": 0.1943869540393948, + "grad_norm": 1.8732880353927612, + "learning_rate": 4.5481658478793986e-05, + "loss": 4.7278, + "step": 32685 + }, + { + "epoch": 0.1943929013226758, + "grad_norm": 1.6948355436325073, + "learning_rate": 4.548139063476564e-05, + "loss": 4.647, + "step": 32686 + }, + { + "epoch": 0.1943988486059568, + "grad_norm": 1.7804508209228516, + "learning_rate": 4.548112278358745e-05, + "loss": 4.6336, + "step": 32687 + }, + { + "epoch": 0.19440479588923779, + "grad_norm": 2.1667730808258057, + "learning_rate": 4.548085492525951e-05, + "loss": 4.7747, + "step": 32688 + }, + { + "epoch": 0.1944107431725188, + "grad_norm": 1.5616405010223389, + "learning_rate": 4.5480587059781916e-05, + "loss": 4.8905, + "step": 32689 + }, + { + "epoch": 0.1944166904557998, + "grad_norm": 1.6781290769577026, + "learning_rate": 4.548031918715476e-05, + "loss": 4.5202, + "step": 32690 + }, + { + "epoch": 0.19442263773908078, + "grad_norm": 1.858265995979309, + "learning_rate": 4.548005130737814e-05, + "loss": 4.8571, + "step": 32691 + }, + { + "epoch": 0.1944285850223618, + "grad_norm": 1.4587072134017944, + "learning_rate": 4.5479783420452144e-05, + "loss": 4.8802, + "step": 32692 + }, + { + "epoch": 0.19443453230564278, + "grad_norm": 1.6977627277374268, + "learning_rate": 4.5479515526376866e-05, + "loss": 4.4907, + "step": 32693 + }, + { + "epoch": 0.19444047958892377, + "grad_norm": 1.749733805656433, + "learning_rate": 4.54792476251524e-05, + "loss": 4.4612, + "step": 32694 + }, + { + "epoch": 0.1944464268722048, + "grad_norm": 1.9689366817474365, + "learning_rate": 4.547897971677885e-05, + "loss": 4.222, + "step": 32695 + }, + { + "epoch": 0.19445237415548577, + "grad_norm": 2.6120550632476807, + "learning_rate": 4.547871180125628e-05, + "loss": 3.7294, + "step": 32696 + }, + { + "epoch": 0.19445832143876676, + "grad_norm": 2.5318052768707275, + "learning_rate": 4.547844387858482e-05, + "loss": 3.9223, + "step": 32697 + }, + { + "epoch": 0.19446426872204778, + "grad_norm": 2.4473683834075928, + "learning_rate": 4.547817594876454e-05, + "loss": 3.801, + "step": 32698 + }, + { + "epoch": 0.19447021600532877, + "grad_norm": 1.6112592220306396, + "learning_rate": 4.5477908011795546e-05, + "loss": 4.8024, + "step": 32699 + }, + { + "epoch": 0.19447616328860975, + "grad_norm": 1.7253385782241821, + "learning_rate": 4.5477640067677915e-05, + "loss": 4.6842, + "step": 32700 + }, + { + "epoch": 0.19448211057189077, + "grad_norm": 2.232095718383789, + "learning_rate": 4.547737211641176e-05, + "loss": 3.5904, + "step": 32701 + }, + { + "epoch": 0.19448805785517176, + "grad_norm": 2.517429828643799, + "learning_rate": 4.547710415799716e-05, + "loss": 3.5036, + "step": 32702 + }, + { + "epoch": 0.19449400513845275, + "grad_norm": 2.414701461791992, + "learning_rate": 4.547683619243423e-05, + "loss": 3.6109, + "step": 32703 + }, + { + "epoch": 0.19449995242173376, + "grad_norm": 2.3504600524902344, + "learning_rate": 4.5476568219723027e-05, + "loss": 3.9853, + "step": 32704 + }, + { + "epoch": 0.19450589970501475, + "grad_norm": 2.6596758365631104, + "learning_rate": 4.547630023986368e-05, + "loss": 3.7625, + "step": 32705 + }, + { + "epoch": 0.19451184698829574, + "grad_norm": 2.6922054290771484, + "learning_rate": 4.547603225285626e-05, + "loss": 4.1548, + "step": 32706 + }, + { + "epoch": 0.19451779427157675, + "grad_norm": 2.4801747798919678, + "learning_rate": 4.547576425870087e-05, + "loss": 4.0531, + "step": 32707 + }, + { + "epoch": 0.19452374155485774, + "grad_norm": 2.2931368350982666, + "learning_rate": 4.547549625739761e-05, + "loss": 3.9223, + "step": 32708 + }, + { + "epoch": 0.19452968883813873, + "grad_norm": 2.383759021759033, + "learning_rate": 4.547522824894655e-05, + "loss": 3.6651, + "step": 32709 + }, + { + "epoch": 0.19453563612141975, + "grad_norm": 2.0957138538360596, + "learning_rate": 4.547496023334782e-05, + "loss": 3.3919, + "step": 32710 + }, + { + "epoch": 0.19454158340470074, + "grad_norm": 2.0263047218322754, + "learning_rate": 4.547469221060148e-05, + "loss": 3.8709, + "step": 32711 + }, + { + "epoch": 0.19454753068798172, + "grad_norm": 1.7334496974945068, + "learning_rate": 4.5474424180707634e-05, + "loss": 3.9556, + "step": 32712 + }, + { + "epoch": 0.19455347797126274, + "grad_norm": 1.9237737655639648, + "learning_rate": 4.547415614366639e-05, + "loss": 4.3447, + "step": 32713 + }, + { + "epoch": 0.19455942525454373, + "grad_norm": 1.7965775728225708, + "learning_rate": 4.547388809947782e-05, + "loss": 4.3011, + "step": 32714 + }, + { + "epoch": 0.19456537253782472, + "grad_norm": 2.085796356201172, + "learning_rate": 4.547362004814203e-05, + "loss": 4.1599, + "step": 32715 + }, + { + "epoch": 0.19457131982110573, + "grad_norm": 2.460947275161743, + "learning_rate": 4.547335198965911e-05, + "loss": 3.1574, + "step": 32716 + }, + { + "epoch": 0.19457726710438672, + "grad_norm": 2.9936110973358154, + "learning_rate": 4.547308392402915e-05, + "loss": 3.1313, + "step": 32717 + }, + { + "epoch": 0.1945832143876677, + "grad_norm": 1.802701473236084, + "learning_rate": 4.547281585125225e-05, + "loss": 4.513, + "step": 32718 + }, + { + "epoch": 0.19458916167094872, + "grad_norm": 1.5326614379882812, + "learning_rate": 4.547254777132851e-05, + "loss": 4.6847, + "step": 32719 + }, + { + "epoch": 0.1945951089542297, + "grad_norm": 1.3601535558700562, + "learning_rate": 4.547227968425801e-05, + "loss": 4.8147, + "step": 32720 + }, + { + "epoch": 0.1946010562375107, + "grad_norm": 1.5872572660446167, + "learning_rate": 4.5472011590040845e-05, + "loss": 4.6806, + "step": 32721 + }, + { + "epoch": 0.19460700352079172, + "grad_norm": 1.700873851776123, + "learning_rate": 4.547174348867712e-05, + "loss": 5.0069, + "step": 32722 + }, + { + "epoch": 0.1946129508040727, + "grad_norm": 2.1521737575531006, + "learning_rate": 4.547147538016691e-05, + "loss": 3.8698, + "step": 32723 + }, + { + "epoch": 0.1946188980873537, + "grad_norm": 1.4435259103775024, + "learning_rate": 4.547120726451033e-05, + "loss": 4.5179, + "step": 32724 + }, + { + "epoch": 0.19462484537063468, + "grad_norm": 1.4912521839141846, + "learning_rate": 4.547093914170746e-05, + "loss": 4.9731, + "step": 32725 + }, + { + "epoch": 0.1946307926539157, + "grad_norm": 1.36370050907135, + "learning_rate": 4.5470671011758395e-05, + "loss": 4.6799, + "step": 32726 + }, + { + "epoch": 0.19463673993719668, + "grad_norm": 1.2558645009994507, + "learning_rate": 4.5470402874663226e-05, + "loss": 4.6845, + "step": 32727 + }, + { + "epoch": 0.19464268722047767, + "grad_norm": 1.3222334384918213, + "learning_rate": 4.5470134730422053e-05, + "loss": 4.7477, + "step": 32728 + }, + { + "epoch": 0.1946486345037587, + "grad_norm": 1.6657606363296509, + "learning_rate": 4.546986657903497e-05, + "loss": 4.5929, + "step": 32729 + }, + { + "epoch": 0.19465458178703968, + "grad_norm": 1.6633927822113037, + "learning_rate": 4.546959842050207e-05, + "loss": 4.9297, + "step": 32730 + }, + { + "epoch": 0.19466052907032066, + "grad_norm": 1.549243450164795, + "learning_rate": 4.546933025482344e-05, + "loss": 5.1384, + "step": 32731 + }, + { + "epoch": 0.19466647635360168, + "grad_norm": 1.4809843301773071, + "learning_rate": 4.5469062081999184e-05, + "loss": 5.1748, + "step": 32732 + }, + { + "epoch": 0.19467242363688267, + "grad_norm": 1.3342254161834717, + "learning_rate": 4.546879390202938e-05, + "loss": 5.1949, + "step": 32733 + }, + { + "epoch": 0.19467837092016366, + "grad_norm": 1.558632254600525, + "learning_rate": 4.5468525714914146e-05, + "loss": 4.7441, + "step": 32734 + }, + { + "epoch": 0.19468431820344467, + "grad_norm": 1.9341686964035034, + "learning_rate": 4.546825752065355e-05, + "loss": 4.5898, + "step": 32735 + }, + { + "epoch": 0.19469026548672566, + "grad_norm": 2.236103057861328, + "learning_rate": 4.54679893192477e-05, + "loss": 3.8901, + "step": 32736 + }, + { + "epoch": 0.19469621277000665, + "grad_norm": 1.7344380617141724, + "learning_rate": 4.5467721110696685e-05, + "loss": 5.2491, + "step": 32737 + }, + { + "epoch": 0.19470216005328767, + "grad_norm": 1.894675850868225, + "learning_rate": 4.5467452895000606e-05, + "loss": 4.9341, + "step": 32738 + }, + { + "epoch": 0.19470810733656865, + "grad_norm": 1.3538182973861694, + "learning_rate": 4.5467184672159546e-05, + "loss": 5.1136, + "step": 32739 + }, + { + "epoch": 0.19471405461984964, + "grad_norm": 1.481584072113037, + "learning_rate": 4.54669164421736e-05, + "loss": 4.7057, + "step": 32740 + }, + { + "epoch": 0.19472000190313066, + "grad_norm": 1.7495735883712769, + "learning_rate": 4.546664820504287e-05, + "loss": 4.5717, + "step": 32741 + }, + { + "epoch": 0.19472594918641165, + "grad_norm": 1.5889508724212646, + "learning_rate": 4.546637996076744e-05, + "loss": 4.6641, + "step": 32742 + }, + { + "epoch": 0.19473189646969263, + "grad_norm": 1.6097511053085327, + "learning_rate": 4.5466111709347415e-05, + "loss": 4.8375, + "step": 32743 + }, + { + "epoch": 0.19473784375297365, + "grad_norm": 1.790899634361267, + "learning_rate": 4.5465843450782876e-05, + "loss": 4.8971, + "step": 32744 + }, + { + "epoch": 0.19474379103625464, + "grad_norm": 1.564828872680664, + "learning_rate": 4.546557518507392e-05, + "loss": 4.8392, + "step": 32745 + }, + { + "epoch": 0.19474973831953563, + "grad_norm": 1.536988615989685, + "learning_rate": 4.546530691222065e-05, + "loss": 5.1304, + "step": 32746 + }, + { + "epoch": 0.19475568560281664, + "grad_norm": 1.7013825178146362, + "learning_rate": 4.546503863222315e-05, + "loss": 4.5505, + "step": 32747 + }, + { + "epoch": 0.19476163288609763, + "grad_norm": 1.6183735132217407, + "learning_rate": 4.546477034508152e-05, + "loss": 4.6205, + "step": 32748 + }, + { + "epoch": 0.19476758016937862, + "grad_norm": 1.8244844675064087, + "learning_rate": 4.5464502050795844e-05, + "loss": 4.5857, + "step": 32749 + }, + { + "epoch": 0.19477352745265963, + "grad_norm": 2.38612961769104, + "learning_rate": 4.546423374936623e-05, + "loss": 4.3756, + "step": 32750 + }, + { + "epoch": 0.19477947473594062, + "grad_norm": 2.186896324157715, + "learning_rate": 4.5463965440792755e-05, + "loss": 4.578, + "step": 32751 + }, + { + "epoch": 0.1947854220192216, + "grad_norm": 2.066823959350586, + "learning_rate": 4.546369712507552e-05, + "loss": 4.198, + "step": 32752 + }, + { + "epoch": 0.19479136930250263, + "grad_norm": 2.2022926807403564, + "learning_rate": 4.5463428802214624e-05, + "loss": 3.6525, + "step": 32753 + }, + { + "epoch": 0.19479731658578361, + "grad_norm": 1.8750653266906738, + "learning_rate": 4.546316047221016e-05, + "loss": 4.2666, + "step": 32754 + }, + { + "epoch": 0.1948032638690646, + "grad_norm": 2.1228365898132324, + "learning_rate": 4.5462892135062215e-05, + "loss": 3.4313, + "step": 32755 + }, + { + "epoch": 0.19480921115234562, + "grad_norm": 2.177910804748535, + "learning_rate": 4.546262379077089e-05, + "loss": 3.4433, + "step": 32756 + }, + { + "epoch": 0.1948151584356266, + "grad_norm": 2.1423957347869873, + "learning_rate": 4.546235543933626e-05, + "loss": 3.4016, + "step": 32757 + }, + { + "epoch": 0.1948211057189076, + "grad_norm": 1.9580178260803223, + "learning_rate": 4.546208708075844e-05, + "loss": 3.36, + "step": 32758 + }, + { + "epoch": 0.1948270530021886, + "grad_norm": 2.11665940284729, + "learning_rate": 4.546181871503752e-05, + "loss": 3.7196, + "step": 32759 + }, + { + "epoch": 0.1948330002854696, + "grad_norm": 2.0595879554748535, + "learning_rate": 4.54615503421736e-05, + "loss": 3.6423, + "step": 32760 + }, + { + "epoch": 0.1948389475687506, + "grad_norm": 2.0790436267852783, + "learning_rate": 4.546128196216675e-05, + "loss": 3.3795, + "step": 32761 + }, + { + "epoch": 0.1948448948520316, + "grad_norm": 2.1012542247772217, + "learning_rate": 4.546101357501708e-05, + "loss": 3.3949, + "step": 32762 + }, + { + "epoch": 0.1948508421353126, + "grad_norm": 2.005147933959961, + "learning_rate": 4.5460745180724684e-05, + "loss": 3.34, + "step": 32763 + }, + { + "epoch": 0.19485678941859358, + "grad_norm": 2.040877342224121, + "learning_rate": 4.546047677928965e-05, + "loss": 3.471, + "step": 32764 + }, + { + "epoch": 0.1948627367018746, + "grad_norm": 1.92231285572052, + "learning_rate": 4.5460208370712085e-05, + "loss": 3.6301, + "step": 32765 + }, + { + "epoch": 0.19486868398515558, + "grad_norm": 2.008256435394287, + "learning_rate": 4.545993995499206e-05, + "loss": 3.5078, + "step": 32766 + }, + { + "epoch": 0.19487463126843657, + "grad_norm": 1.9159399271011353, + "learning_rate": 4.545967153212969e-05, + "loss": 3.2598, + "step": 32767 + }, + { + "epoch": 0.1948805785517176, + "grad_norm": 2.008863687515259, + "learning_rate": 4.545940310212505e-05, + "loss": 3.2273, + "step": 32768 + }, + { + "epoch": 0.19488652583499858, + "grad_norm": 2.072593927383423, + "learning_rate": 4.545913466497825e-05, + "loss": 3.245, + "step": 32769 + }, + { + "epoch": 0.19489247311827956, + "grad_norm": 2.1123046875, + "learning_rate": 4.5458866220689386e-05, + "loss": 3.337, + "step": 32770 + }, + { + "epoch": 0.19489842040156058, + "grad_norm": 2.036161422729492, + "learning_rate": 4.5458597769258535e-05, + "loss": 3.4031, + "step": 32771 + }, + { + "epoch": 0.19490436768484157, + "grad_norm": 1.7956360578536987, + "learning_rate": 4.54583293106858e-05, + "loss": 3.3922, + "step": 32772 + }, + { + "epoch": 0.19491031496812256, + "grad_norm": 1.9955687522888184, + "learning_rate": 4.545806084497127e-05, + "loss": 3.398, + "step": 32773 + }, + { + "epoch": 0.19491626225140357, + "grad_norm": 1.9657707214355469, + "learning_rate": 4.545779237211504e-05, + "loss": 3.3049, + "step": 32774 + }, + { + "epoch": 0.19492220953468456, + "grad_norm": 2.042170286178589, + "learning_rate": 4.545752389211722e-05, + "loss": 3.3357, + "step": 32775 + }, + { + "epoch": 0.19492815681796555, + "grad_norm": 1.8873474597930908, + "learning_rate": 4.545725540497787e-05, + "loss": 3.3893, + "step": 32776 + }, + { + "epoch": 0.19493410410124656, + "grad_norm": 2.1129064559936523, + "learning_rate": 4.545698691069712e-05, + "loss": 4.1575, + "step": 32777 + }, + { + "epoch": 0.19494005138452755, + "grad_norm": 1.821212887763977, + "learning_rate": 4.545671840927504e-05, + "loss": 4.0723, + "step": 32778 + }, + { + "epoch": 0.19494599866780854, + "grad_norm": 1.8211898803710938, + "learning_rate": 4.545644990071174e-05, + "loss": 3.9323, + "step": 32779 + }, + { + "epoch": 0.19495194595108956, + "grad_norm": 2.6873621940612793, + "learning_rate": 4.545618138500729e-05, + "loss": 4.0404, + "step": 32780 + }, + { + "epoch": 0.19495789323437054, + "grad_norm": 1.9011731147766113, + "learning_rate": 4.54559128621618e-05, + "loss": 4.6881, + "step": 32781 + }, + { + "epoch": 0.19496384051765153, + "grad_norm": 2.0312952995300293, + "learning_rate": 4.5455644332175374e-05, + "loss": 3.2722, + "step": 32782 + }, + { + "epoch": 0.19496978780093252, + "grad_norm": 1.7892794609069824, + "learning_rate": 4.5455375795048086e-05, + "loss": 3.9904, + "step": 32783 + }, + { + "epoch": 0.19497573508421354, + "grad_norm": 2.0266246795654297, + "learning_rate": 4.545510725078004e-05, + "loss": 4.0798, + "step": 32784 + }, + { + "epoch": 0.19498168236749452, + "grad_norm": 1.9282901287078857, + "learning_rate": 4.545483869937133e-05, + "loss": 3.7749, + "step": 32785 + }, + { + "epoch": 0.1949876296507755, + "grad_norm": 1.7957758903503418, + "learning_rate": 4.545457014082204e-05, + "loss": 3.5671, + "step": 32786 + }, + { + "epoch": 0.19499357693405653, + "grad_norm": 2.084536075592041, + "learning_rate": 4.545430157513227e-05, + "loss": 3.3633, + "step": 32787 + }, + { + "epoch": 0.19499952421733752, + "grad_norm": 1.8941612243652344, + "learning_rate": 4.545403300230212e-05, + "loss": 4.1718, + "step": 32788 + }, + { + "epoch": 0.1950054715006185, + "grad_norm": 2.0162341594696045, + "learning_rate": 4.545376442233168e-05, + "loss": 4.8271, + "step": 32789 + }, + { + "epoch": 0.19501141878389952, + "grad_norm": 2.3421337604522705, + "learning_rate": 4.545349583522104e-05, + "loss": 4.3666, + "step": 32790 + }, + { + "epoch": 0.1950173660671805, + "grad_norm": 2.9880783557891846, + "learning_rate": 4.54532272409703e-05, + "loss": 3.6112, + "step": 32791 + }, + { + "epoch": 0.1950233133504615, + "grad_norm": 2.165861129760742, + "learning_rate": 4.545295863957955e-05, + "loss": 3.9271, + "step": 32792 + }, + { + "epoch": 0.1950292606337425, + "grad_norm": 1.7366812229156494, + "learning_rate": 4.545269003104887e-05, + "loss": 4.4979, + "step": 32793 + }, + { + "epoch": 0.1950352079170235, + "grad_norm": 1.8814879655838013, + "learning_rate": 4.5452421415378374e-05, + "loss": 4.5148, + "step": 32794 + }, + { + "epoch": 0.1950411552003045, + "grad_norm": 1.8868308067321777, + "learning_rate": 4.545215279256815e-05, + "loss": 4.9256, + "step": 32795 + }, + { + "epoch": 0.1950471024835855, + "grad_norm": 2.2767741680145264, + "learning_rate": 4.54518841626183e-05, + "loss": 4.7479, + "step": 32796 + }, + { + "epoch": 0.1950530497668665, + "grad_norm": 1.6899062395095825, + "learning_rate": 4.54516155255289e-05, + "loss": 4.6015, + "step": 32797 + }, + { + "epoch": 0.19505899705014748, + "grad_norm": 2.2540504932403564, + "learning_rate": 4.545134688130005e-05, + "loss": 3.93, + "step": 32798 + }, + { + "epoch": 0.1950649443334285, + "grad_norm": 1.7807284593582153, + "learning_rate": 4.5451078229931846e-05, + "loss": 4.8511, + "step": 32799 + }, + { + "epoch": 0.19507089161670949, + "grad_norm": 1.7292965650558472, + "learning_rate": 4.5450809571424384e-05, + "loss": 5.0537, + "step": 32800 + }, + { + "epoch": 0.19507683889999047, + "grad_norm": 1.7457902431488037, + "learning_rate": 4.545054090577776e-05, + "loss": 4.8452, + "step": 32801 + }, + { + "epoch": 0.1950827861832715, + "grad_norm": 1.799851417541504, + "learning_rate": 4.5450272232992056e-05, + "loss": 4.5091, + "step": 32802 + }, + { + "epoch": 0.19508873346655248, + "grad_norm": 1.6106951236724854, + "learning_rate": 4.545000355306738e-05, + "loss": 5.3681, + "step": 32803 + }, + { + "epoch": 0.19509468074983347, + "grad_norm": 1.6085110902786255, + "learning_rate": 4.5449734866003815e-05, + "loss": 5.3412, + "step": 32804 + }, + { + "epoch": 0.19510062803311448, + "grad_norm": 1.7096600532531738, + "learning_rate": 4.5449466171801456e-05, + "loss": 5.1475, + "step": 32805 + }, + { + "epoch": 0.19510657531639547, + "grad_norm": 1.7066271305084229, + "learning_rate": 4.5449197470460405e-05, + "loss": 4.9972, + "step": 32806 + }, + { + "epoch": 0.19511252259967646, + "grad_norm": 1.731095552444458, + "learning_rate": 4.544892876198075e-05, + "loss": 5.1551, + "step": 32807 + }, + { + "epoch": 0.19511846988295747, + "grad_norm": 1.9543027877807617, + "learning_rate": 4.544866004636259e-05, + "loss": 4.6056, + "step": 32808 + }, + { + "epoch": 0.19512441716623846, + "grad_norm": 1.8629963397979736, + "learning_rate": 4.5448391323606e-05, + "loss": 4.3998, + "step": 32809 + }, + { + "epoch": 0.19513036444951945, + "grad_norm": 2.1942734718322754, + "learning_rate": 4.54481225937111e-05, + "loss": 3.8038, + "step": 32810 + }, + { + "epoch": 0.19513631173280047, + "grad_norm": 2.3515050411224365, + "learning_rate": 4.544785385667797e-05, + "loss": 4.2252, + "step": 32811 + }, + { + "epoch": 0.19514225901608145, + "grad_norm": 1.9163727760314941, + "learning_rate": 4.54475851125067e-05, + "loss": 4.5602, + "step": 32812 + }, + { + "epoch": 0.19514820629936244, + "grad_norm": 1.7677435874938965, + "learning_rate": 4.544731636119739e-05, + "loss": 4.4722, + "step": 32813 + }, + { + "epoch": 0.19515415358264346, + "grad_norm": 2.2984094619750977, + "learning_rate": 4.544704760275014e-05, + "loss": 4.7956, + "step": 32814 + }, + { + "epoch": 0.19516010086592445, + "grad_norm": 1.6422353982925415, + "learning_rate": 4.544677883716503e-05, + "loss": 4.5415, + "step": 32815 + }, + { + "epoch": 0.19516604814920543, + "grad_norm": 1.931949496269226, + "learning_rate": 4.544651006444216e-05, + "loss": 4.22, + "step": 32816 + }, + { + "epoch": 0.19517199543248645, + "grad_norm": 1.7817195653915405, + "learning_rate": 4.5446241284581634e-05, + "loss": 4.501, + "step": 32817 + }, + { + "epoch": 0.19517794271576744, + "grad_norm": 1.7133018970489502, + "learning_rate": 4.544597249758353e-05, + "loss": 5.3803, + "step": 32818 + }, + { + "epoch": 0.19518388999904843, + "grad_norm": 1.8452601432800293, + "learning_rate": 4.544570370344795e-05, + "loss": 4.819, + "step": 32819 + }, + { + "epoch": 0.19518983728232944, + "grad_norm": 1.8337496519088745, + "learning_rate": 4.544543490217498e-05, + "loss": 4.4296, + "step": 32820 + }, + { + "epoch": 0.19519578456561043, + "grad_norm": 1.8516271114349365, + "learning_rate": 4.544516609376472e-05, + "loss": 4.4751, + "step": 32821 + }, + { + "epoch": 0.19520173184889142, + "grad_norm": 1.8621165752410889, + "learning_rate": 4.5444897278217276e-05, + "loss": 4.5967, + "step": 32822 + }, + { + "epoch": 0.19520767913217243, + "grad_norm": 1.7688180208206177, + "learning_rate": 4.5444628455532725e-05, + "loss": 4.4673, + "step": 32823 + }, + { + "epoch": 0.19521362641545342, + "grad_norm": 2.06526517868042, + "learning_rate": 4.5444359625711156e-05, + "loss": 4.4147, + "step": 32824 + }, + { + "epoch": 0.1952195736987344, + "grad_norm": 1.914939045906067, + "learning_rate": 4.544409078875268e-05, + "loss": 4.4997, + "step": 32825 + }, + { + "epoch": 0.19522552098201543, + "grad_norm": 1.7460870742797852, + "learning_rate": 4.544382194465738e-05, + "loss": 4.4229, + "step": 32826 + }, + { + "epoch": 0.19523146826529642, + "grad_norm": 1.801352620124817, + "learning_rate": 4.5443553093425353e-05, + "loss": 4.4512, + "step": 32827 + }, + { + "epoch": 0.1952374155485774, + "grad_norm": 1.7984275817871094, + "learning_rate": 4.544328423505669e-05, + "loss": 4.4493, + "step": 32828 + }, + { + "epoch": 0.19524336283185842, + "grad_norm": 1.717602014541626, + "learning_rate": 4.54430153695515e-05, + "loss": 4.2443, + "step": 32829 + }, + { + "epoch": 0.1952493101151394, + "grad_norm": 1.5940791368484497, + "learning_rate": 4.5442746496909856e-05, + "loss": 5.0274, + "step": 32830 + }, + { + "epoch": 0.1952552573984204, + "grad_norm": 1.8372310400009155, + "learning_rate": 4.544247761713186e-05, + "loss": 4.3161, + "step": 32831 + }, + { + "epoch": 0.1952612046817014, + "grad_norm": 1.54202401638031, + "learning_rate": 4.544220873021761e-05, + "loss": 4.8721, + "step": 32832 + }, + { + "epoch": 0.1952671519649824, + "grad_norm": 1.751818299293518, + "learning_rate": 4.544193983616719e-05, + "loss": 4.7725, + "step": 32833 + }, + { + "epoch": 0.1952730992482634, + "grad_norm": 1.7168902158737183, + "learning_rate": 4.5441670934980704e-05, + "loss": 4.583, + "step": 32834 + }, + { + "epoch": 0.1952790465315444, + "grad_norm": 1.6986567974090576, + "learning_rate": 4.544140202665824e-05, + "loss": 4.6351, + "step": 32835 + }, + { + "epoch": 0.1952849938148254, + "grad_norm": 1.9313454627990723, + "learning_rate": 4.54411331111999e-05, + "loss": 4.5758, + "step": 32836 + }, + { + "epoch": 0.19529094109810638, + "grad_norm": 1.725821614265442, + "learning_rate": 4.544086418860576e-05, + "loss": 4.569, + "step": 32837 + }, + { + "epoch": 0.1952968883813874, + "grad_norm": 1.6717588901519775, + "learning_rate": 4.5440595258875935e-05, + "loss": 4.3477, + "step": 32838 + }, + { + "epoch": 0.19530283566466838, + "grad_norm": 1.7990453243255615, + "learning_rate": 4.544032632201051e-05, + "loss": 4.063, + "step": 32839 + }, + { + "epoch": 0.19530878294794937, + "grad_norm": 1.9088106155395508, + "learning_rate": 4.544005737800957e-05, + "loss": 3.9041, + "step": 32840 + }, + { + "epoch": 0.19531473023123036, + "grad_norm": 1.69024658203125, + "learning_rate": 4.543978842687322e-05, + "loss": 4.6119, + "step": 32841 + }, + { + "epoch": 0.19532067751451138, + "grad_norm": 1.9073139429092407, + "learning_rate": 4.5439519468601553e-05, + "loss": 5.0721, + "step": 32842 + }, + { + "epoch": 0.19532662479779236, + "grad_norm": 1.8873580694198608, + "learning_rate": 4.543925050319466e-05, + "loss": 4.6936, + "step": 32843 + }, + { + "epoch": 0.19533257208107335, + "grad_norm": 1.8995583057403564, + "learning_rate": 4.543898153065264e-05, + "loss": 4.6549, + "step": 32844 + }, + { + "epoch": 0.19533851936435437, + "grad_norm": 1.682868480682373, + "learning_rate": 4.543871255097558e-05, + "loss": 4.7243, + "step": 32845 + }, + { + "epoch": 0.19534446664763536, + "grad_norm": 1.9877535104751587, + "learning_rate": 4.543844356416358e-05, + "loss": 4.8988, + "step": 32846 + }, + { + "epoch": 0.19535041393091634, + "grad_norm": 1.8145191669464111, + "learning_rate": 4.5438174570216716e-05, + "loss": 4.4799, + "step": 32847 + }, + { + "epoch": 0.19535636121419736, + "grad_norm": 2.011967182159424, + "learning_rate": 4.5437905569135106e-05, + "loss": 4.7395, + "step": 32848 + }, + { + "epoch": 0.19536230849747835, + "grad_norm": 2.4764623641967773, + "learning_rate": 4.5437636560918837e-05, + "loss": 4.6929, + "step": 32849 + }, + { + "epoch": 0.19536825578075934, + "grad_norm": 2.0704572200775146, + "learning_rate": 4.5437367545568e-05, + "loss": 4.748, + "step": 32850 + }, + { + "epoch": 0.19537420306404035, + "grad_norm": 2.423081159591675, + "learning_rate": 4.543709852308269e-05, + "loss": 4.4915, + "step": 32851 + }, + { + "epoch": 0.19538015034732134, + "grad_norm": 1.9316084384918213, + "learning_rate": 4.5436829493462995e-05, + "loss": 4.647, + "step": 32852 + }, + { + "epoch": 0.19538609763060233, + "grad_norm": 1.9612782001495361, + "learning_rate": 4.543656045670902e-05, + "loss": 4.5981, + "step": 32853 + }, + { + "epoch": 0.19539204491388334, + "grad_norm": 1.8731095790863037, + "learning_rate": 4.543629141282085e-05, + "loss": 4.7771, + "step": 32854 + }, + { + "epoch": 0.19539799219716433, + "grad_norm": 2.0212156772613525, + "learning_rate": 4.543602236179858e-05, + "loss": 4.825, + "step": 32855 + }, + { + "epoch": 0.19540393948044532, + "grad_norm": 1.7706294059753418, + "learning_rate": 4.5435753303642305e-05, + "loss": 4.5824, + "step": 32856 + }, + { + "epoch": 0.19540988676372634, + "grad_norm": 1.7553569078445435, + "learning_rate": 4.543548423835212e-05, + "loss": 4.5136, + "step": 32857 + }, + { + "epoch": 0.19541583404700733, + "grad_norm": 1.8563607931137085, + "learning_rate": 4.5435215165928126e-05, + "loss": 4.6181, + "step": 32858 + }, + { + "epoch": 0.1954217813302883, + "grad_norm": 1.7440742254257202, + "learning_rate": 4.54349460863704e-05, + "loss": 4.7325, + "step": 32859 + }, + { + "epoch": 0.19542772861356933, + "grad_norm": 1.9677025079727173, + "learning_rate": 4.543467699967905e-05, + "loss": 4.4658, + "step": 32860 + }, + { + "epoch": 0.19543367589685032, + "grad_norm": 1.8595266342163086, + "learning_rate": 4.543440790585417e-05, + "loss": 4.89, + "step": 32861 + }, + { + "epoch": 0.1954396231801313, + "grad_norm": 1.582334280014038, + "learning_rate": 4.543413880489584e-05, + "loss": 5.5308, + "step": 32862 + }, + { + "epoch": 0.19544557046341232, + "grad_norm": 1.445690631866455, + "learning_rate": 4.543386969680417e-05, + "loss": 5.403, + "step": 32863 + }, + { + "epoch": 0.1954515177466933, + "grad_norm": 2.2700390815734863, + "learning_rate": 4.543360058157924e-05, + "loss": 4.45, + "step": 32864 + }, + { + "epoch": 0.1954574650299743, + "grad_norm": 1.6211910247802734, + "learning_rate": 4.543333145922116e-05, + "loss": 5.1182, + "step": 32865 + }, + { + "epoch": 0.1954634123132553, + "grad_norm": 1.6810754537582397, + "learning_rate": 4.543306232973001e-05, + "loss": 4.3004, + "step": 32866 + }, + { + "epoch": 0.1954693595965363, + "grad_norm": 1.7132052183151245, + "learning_rate": 4.5432793193105884e-05, + "loss": 4.1847, + "step": 32867 + }, + { + "epoch": 0.1954753068798173, + "grad_norm": 1.7639859914779663, + "learning_rate": 4.543252404934889e-05, + "loss": 3.8669, + "step": 32868 + }, + { + "epoch": 0.1954812541630983, + "grad_norm": 1.8150482177734375, + "learning_rate": 4.543225489845911e-05, + "loss": 4.6428, + "step": 32869 + }, + { + "epoch": 0.1954872014463793, + "grad_norm": 1.6536823511123657, + "learning_rate": 4.5431985740436636e-05, + "loss": 5.0678, + "step": 32870 + }, + { + "epoch": 0.19549314872966028, + "grad_norm": 1.6762903928756714, + "learning_rate": 4.5431716575281574e-05, + "loss": 4.7561, + "step": 32871 + }, + { + "epoch": 0.1954990960129413, + "grad_norm": 1.7050037384033203, + "learning_rate": 4.543144740299401e-05, + "loss": 4.5412, + "step": 32872 + }, + { + "epoch": 0.1955050432962223, + "grad_norm": 1.747406244277954, + "learning_rate": 4.5431178223574034e-05, + "loss": 4.3635, + "step": 32873 + }, + { + "epoch": 0.19551099057950327, + "grad_norm": 1.8921083211898804, + "learning_rate": 4.5430909037021754e-05, + "loss": 4.2937, + "step": 32874 + }, + { + "epoch": 0.1955169378627843, + "grad_norm": 1.8949475288391113, + "learning_rate": 4.543063984333724e-05, + "loss": 4.3948, + "step": 32875 + }, + { + "epoch": 0.19552288514606528, + "grad_norm": 1.673620343208313, + "learning_rate": 4.543037064252061e-05, + "loss": 4.3489, + "step": 32876 + }, + { + "epoch": 0.19552883242934627, + "grad_norm": 1.3811769485473633, + "learning_rate": 4.543010143457195e-05, + "loss": 5.1095, + "step": 32877 + }, + { + "epoch": 0.19553477971262728, + "grad_norm": 1.7001514434814453, + "learning_rate": 4.5429832219491354e-05, + "loss": 4.3213, + "step": 32878 + }, + { + "epoch": 0.19554072699590827, + "grad_norm": 1.6770343780517578, + "learning_rate": 4.5429562997278905e-05, + "loss": 4.2158, + "step": 32879 + }, + { + "epoch": 0.19554667427918926, + "grad_norm": 1.7170015573501587, + "learning_rate": 4.542929376793472e-05, + "loss": 4.2278, + "step": 32880 + }, + { + "epoch": 0.19555262156247027, + "grad_norm": 1.381479263305664, + "learning_rate": 4.542902453145886e-05, + "loss": 4.084, + "step": 32881 + }, + { + "epoch": 0.19555856884575126, + "grad_norm": 1.6345101594924927, + "learning_rate": 4.542875528785145e-05, + "loss": 4.9881, + "step": 32882 + }, + { + "epoch": 0.19556451612903225, + "grad_norm": 1.6286730766296387, + "learning_rate": 4.542848603711258e-05, + "loss": 4.9177, + "step": 32883 + }, + { + "epoch": 0.19557046341231327, + "grad_norm": 1.800990104675293, + "learning_rate": 4.5428216779242336e-05, + "loss": 4.7879, + "step": 32884 + }, + { + "epoch": 0.19557641069559425, + "grad_norm": 1.6580811738967896, + "learning_rate": 4.54279475142408e-05, + "loss": 4.924, + "step": 32885 + }, + { + "epoch": 0.19558235797887524, + "grad_norm": 1.6891523599624634, + "learning_rate": 4.542767824210808e-05, + "loss": 5.1019, + "step": 32886 + }, + { + "epoch": 0.19558830526215626, + "grad_norm": 1.9298063516616821, + "learning_rate": 4.542740896284428e-05, + "loss": 3.242, + "step": 32887 + }, + { + "epoch": 0.19559425254543725, + "grad_norm": 1.5368505716323853, + "learning_rate": 4.542713967644948e-05, + "loss": 4.5356, + "step": 32888 + }, + { + "epoch": 0.19560019982871824, + "grad_norm": 1.5489712953567505, + "learning_rate": 4.542687038292377e-05, + "loss": 5.2564, + "step": 32889 + }, + { + "epoch": 0.19560614711199925, + "grad_norm": 1.367336392402649, + "learning_rate": 4.5426601082267254e-05, + "loss": 5.2064, + "step": 32890 + }, + { + "epoch": 0.19561209439528024, + "grad_norm": 1.6218714714050293, + "learning_rate": 4.542633177448002e-05, + "loss": 5.1743, + "step": 32891 + }, + { + "epoch": 0.19561804167856123, + "grad_norm": 1.8278539180755615, + "learning_rate": 4.542606245956217e-05, + "loss": 5.2703, + "step": 32892 + }, + { + "epoch": 0.19562398896184224, + "grad_norm": 1.579430341720581, + "learning_rate": 4.542579313751379e-05, + "loss": 5.3688, + "step": 32893 + }, + { + "epoch": 0.19562993624512323, + "grad_norm": 1.5411518812179565, + "learning_rate": 4.542552380833498e-05, + "loss": 4.9046, + "step": 32894 + }, + { + "epoch": 0.19563588352840422, + "grad_norm": 1.7552369832992554, + "learning_rate": 4.542525447202582e-05, + "loss": 4.7059, + "step": 32895 + }, + { + "epoch": 0.19564183081168524, + "grad_norm": 1.7786237001419067, + "learning_rate": 4.5424985128586424e-05, + "loss": 4.6587, + "step": 32896 + }, + { + "epoch": 0.19564777809496622, + "grad_norm": 1.7300502061843872, + "learning_rate": 4.542471577801687e-05, + "loss": 4.8136, + "step": 32897 + }, + { + "epoch": 0.1956537253782472, + "grad_norm": 1.612258791923523, + "learning_rate": 4.542444642031727e-05, + "loss": 4.8126, + "step": 32898 + }, + { + "epoch": 0.19565967266152823, + "grad_norm": 1.6730974912643433, + "learning_rate": 4.5424177055487693e-05, + "loss": 4.6028, + "step": 32899 + }, + { + "epoch": 0.19566561994480922, + "grad_norm": 1.7678196430206299, + "learning_rate": 4.5423907683528256e-05, + "loss": 4.8532, + "step": 32900 + }, + { + "epoch": 0.1956715672280902, + "grad_norm": 1.8062410354614258, + "learning_rate": 4.5423638304439044e-05, + "loss": 4.8629, + "step": 32901 + }, + { + "epoch": 0.1956775145113712, + "grad_norm": 1.6573866605758667, + "learning_rate": 4.5423368918220146e-05, + "loss": 4.6673, + "step": 32902 + }, + { + "epoch": 0.1956834617946522, + "grad_norm": 1.6542189121246338, + "learning_rate": 4.5423099524871656e-05, + "loss": 4.6684, + "step": 32903 + }, + { + "epoch": 0.1956894090779332, + "grad_norm": 1.6946748495101929, + "learning_rate": 4.542283012439369e-05, + "loss": 4.7429, + "step": 32904 + }, + { + "epoch": 0.19569535636121418, + "grad_norm": 1.6120235919952393, + "learning_rate": 4.542256071678631e-05, + "loss": 4.917, + "step": 32905 + }, + { + "epoch": 0.1957013036444952, + "grad_norm": 1.635718822479248, + "learning_rate": 4.5422291302049634e-05, + "loss": 4.9375, + "step": 32906 + }, + { + "epoch": 0.1957072509277762, + "grad_norm": 1.5894275903701782, + "learning_rate": 4.542202188018374e-05, + "loss": 4.8344, + "step": 32907 + }, + { + "epoch": 0.19571319821105718, + "grad_norm": 1.6269679069519043, + "learning_rate": 4.542175245118873e-05, + "loss": 5.1075, + "step": 32908 + }, + { + "epoch": 0.1957191454943382, + "grad_norm": 1.5488533973693848, + "learning_rate": 4.5421483015064703e-05, + "loss": 4.7309, + "step": 32909 + }, + { + "epoch": 0.19572509277761918, + "grad_norm": 1.6821287870407104, + "learning_rate": 4.5421213571811736e-05, + "loss": 5.0394, + "step": 32910 + }, + { + "epoch": 0.19573104006090017, + "grad_norm": 1.798214316368103, + "learning_rate": 4.542094412142995e-05, + "loss": 4.68, + "step": 32911 + }, + { + "epoch": 0.19573698734418118, + "grad_norm": 1.5563793182373047, + "learning_rate": 4.542067466391942e-05, + "loss": 5.062, + "step": 32912 + }, + { + "epoch": 0.19574293462746217, + "grad_norm": 1.9282946586608887, + "learning_rate": 4.542040519928024e-05, + "loss": 4.676, + "step": 32913 + }, + { + "epoch": 0.19574888191074316, + "grad_norm": 1.7567148208618164, + "learning_rate": 4.5420135727512504e-05, + "loss": 4.8652, + "step": 32914 + }, + { + "epoch": 0.19575482919402418, + "grad_norm": 1.8503345251083374, + "learning_rate": 4.54198662486163e-05, + "loss": 4.6167, + "step": 32915 + }, + { + "epoch": 0.19576077647730517, + "grad_norm": 1.3916583061218262, + "learning_rate": 4.5419596762591755e-05, + "loss": 4.9467, + "step": 32916 + }, + { + "epoch": 0.19576672376058615, + "grad_norm": 1.5608447790145874, + "learning_rate": 4.5419327269438925e-05, + "loss": 4.9949, + "step": 32917 + }, + { + "epoch": 0.19577267104386717, + "grad_norm": 1.7178374528884888, + "learning_rate": 4.5419057769157927e-05, + "loss": 5.0352, + "step": 32918 + }, + { + "epoch": 0.19577861832714816, + "grad_norm": 2.536865234375, + "learning_rate": 4.5418788261748834e-05, + "loss": 4.6149, + "step": 32919 + }, + { + "epoch": 0.19578456561042915, + "grad_norm": 2.1935441493988037, + "learning_rate": 4.541851874721176e-05, + "loss": 4.6066, + "step": 32920 + }, + { + "epoch": 0.19579051289371016, + "grad_norm": 1.8435254096984863, + "learning_rate": 4.5418249225546794e-05, + "loss": 4.7086, + "step": 32921 + }, + { + "epoch": 0.19579646017699115, + "grad_norm": 2.078380584716797, + "learning_rate": 4.541797969675403e-05, + "loss": 4.0338, + "step": 32922 + }, + { + "epoch": 0.19580240746027214, + "grad_norm": 1.5899152755737305, + "learning_rate": 4.541771016083356e-05, + "loss": 4.9363, + "step": 32923 + }, + { + "epoch": 0.19580835474355315, + "grad_norm": 1.5216234922409058, + "learning_rate": 4.541744061778547e-05, + "loss": 4.9327, + "step": 32924 + }, + { + "epoch": 0.19581430202683414, + "grad_norm": 1.815258502960205, + "learning_rate": 4.541717106760987e-05, + "loss": 4.8025, + "step": 32925 + }, + { + "epoch": 0.19582024931011513, + "grad_norm": 1.6201529502868652, + "learning_rate": 4.541690151030684e-05, + "loss": 4.6954, + "step": 32926 + }, + { + "epoch": 0.19582619659339615, + "grad_norm": 2.1462132930755615, + "learning_rate": 4.5416631945876494e-05, + "loss": 4.0121, + "step": 32927 + }, + { + "epoch": 0.19583214387667713, + "grad_norm": 1.832979679107666, + "learning_rate": 4.54163623743189e-05, + "loss": 4.1431, + "step": 32928 + }, + { + "epoch": 0.19583809115995812, + "grad_norm": 1.5806697607040405, + "learning_rate": 4.5416092795634167e-05, + "loss": 5.179, + "step": 32929 + }, + { + "epoch": 0.19584403844323914, + "grad_norm": 1.6073065996170044, + "learning_rate": 4.5415823209822397e-05, + "loss": 4.9575, + "step": 32930 + }, + { + "epoch": 0.19584998572652013, + "grad_norm": 1.9261529445648193, + "learning_rate": 4.541555361688366e-05, + "loss": 4.5455, + "step": 32931 + }, + { + "epoch": 0.19585593300980111, + "grad_norm": 1.7330681085586548, + "learning_rate": 4.541528401681807e-05, + "loss": 4.6912, + "step": 32932 + }, + { + "epoch": 0.19586188029308213, + "grad_norm": 1.682050347328186, + "learning_rate": 4.541501440962572e-05, + "loss": 4.7136, + "step": 32933 + }, + { + "epoch": 0.19586782757636312, + "grad_norm": 1.949378252029419, + "learning_rate": 4.541474479530669e-05, + "loss": 4.7249, + "step": 32934 + }, + { + "epoch": 0.1958737748596441, + "grad_norm": 1.828692078590393, + "learning_rate": 4.541447517386109e-05, + "loss": 4.1328, + "step": 32935 + }, + { + "epoch": 0.19587972214292512, + "grad_norm": 1.5934466123580933, + "learning_rate": 4.5414205545289e-05, + "loss": 4.1072, + "step": 32936 + }, + { + "epoch": 0.1958856694262061, + "grad_norm": 1.539414882659912, + "learning_rate": 4.541393590959053e-05, + "loss": 4.9876, + "step": 32937 + }, + { + "epoch": 0.1958916167094871, + "grad_norm": 1.6584326028823853, + "learning_rate": 4.5413666266765765e-05, + "loss": 5.1275, + "step": 32938 + }, + { + "epoch": 0.19589756399276811, + "grad_norm": 1.6987926959991455, + "learning_rate": 4.5413396616814797e-05, + "loss": 4.6132, + "step": 32939 + }, + { + "epoch": 0.1959035112760491, + "grad_norm": 1.741507649421692, + "learning_rate": 4.5413126959737727e-05, + "loss": 4.8676, + "step": 32940 + }, + { + "epoch": 0.1959094585593301, + "grad_norm": 1.660169005393982, + "learning_rate": 4.5412857295534636e-05, + "loss": 4.5619, + "step": 32941 + }, + { + "epoch": 0.1959154058426111, + "grad_norm": 1.785941481590271, + "learning_rate": 4.541258762420564e-05, + "loss": 4.6198, + "step": 32942 + }, + { + "epoch": 0.1959213531258921, + "grad_norm": 1.7086410522460938, + "learning_rate": 4.5412317945750814e-05, + "loss": 4.4215, + "step": 32943 + }, + { + "epoch": 0.19592730040917308, + "grad_norm": 1.8102631568908691, + "learning_rate": 4.541204826017026e-05, + "loss": 4.5202, + "step": 32944 + }, + { + "epoch": 0.1959332476924541, + "grad_norm": 1.8132258653640747, + "learning_rate": 4.541177856746407e-05, + "loss": 4.5079, + "step": 32945 + }, + { + "epoch": 0.1959391949757351, + "grad_norm": 2.1485495567321777, + "learning_rate": 4.541150886763234e-05, + "loss": 4.3483, + "step": 32946 + }, + { + "epoch": 0.19594514225901608, + "grad_norm": 1.4712555408477783, + "learning_rate": 4.541123916067516e-05, + "loss": 4.2898, + "step": 32947 + }, + { + "epoch": 0.1959510895422971, + "grad_norm": 1.8281893730163574, + "learning_rate": 4.541096944659263e-05, + "loss": 4.2012, + "step": 32948 + }, + { + "epoch": 0.19595703682557808, + "grad_norm": 1.8990435600280762, + "learning_rate": 4.541069972538484e-05, + "loss": 3.783, + "step": 32949 + }, + { + "epoch": 0.19596298410885907, + "grad_norm": 1.5741428136825562, + "learning_rate": 4.541042999705189e-05, + "loss": 5.6173, + "step": 32950 + }, + { + "epoch": 0.19596893139214008, + "grad_norm": 1.9983577728271484, + "learning_rate": 4.541016026159387e-05, + "loss": 5.3004, + "step": 32951 + }, + { + "epoch": 0.19597487867542107, + "grad_norm": 1.514825701713562, + "learning_rate": 4.5409890519010866e-05, + "loss": 4.927, + "step": 32952 + }, + { + "epoch": 0.19598082595870206, + "grad_norm": 1.4992481470108032, + "learning_rate": 4.5409620769302985e-05, + "loss": 5.0936, + "step": 32953 + }, + { + "epoch": 0.19598677324198308, + "grad_norm": 1.5728949308395386, + "learning_rate": 4.5409351012470316e-05, + "loss": 5.3866, + "step": 32954 + }, + { + "epoch": 0.19599272052526406, + "grad_norm": 1.4812259674072266, + "learning_rate": 4.5409081248512955e-05, + "loss": 5.3022, + "step": 32955 + }, + { + "epoch": 0.19599866780854505, + "grad_norm": 1.670962929725647, + "learning_rate": 4.540881147743099e-05, + "loss": 4.9182, + "step": 32956 + }, + { + "epoch": 0.19600461509182607, + "grad_norm": 2.0521981716156006, + "learning_rate": 4.540854169922453e-05, + "loss": 3.4115, + "step": 32957 + }, + { + "epoch": 0.19601056237510706, + "grad_norm": 1.4718791246414185, + "learning_rate": 4.5408271913893646e-05, + "loss": 4.712, + "step": 32958 + }, + { + "epoch": 0.19601650965838804, + "grad_norm": 1.6191232204437256, + "learning_rate": 4.540800212143845e-05, + "loss": 5.2245, + "step": 32959 + }, + { + "epoch": 0.19602245694166903, + "grad_norm": 1.2227195501327515, + "learning_rate": 4.540773232185903e-05, + "loss": 5.4206, + "step": 32960 + }, + { + "epoch": 0.19602840422495005, + "grad_norm": 1.417944073677063, + "learning_rate": 4.540746251515549e-05, + "loss": 5.2218, + "step": 32961 + }, + { + "epoch": 0.19603435150823104, + "grad_norm": 1.7467671632766724, + "learning_rate": 4.5407192701327904e-05, + "loss": 3.9432, + "step": 32962 + }, + { + "epoch": 0.19604029879151202, + "grad_norm": 1.7392356395721436, + "learning_rate": 4.5406922880376386e-05, + "loss": 4.4476, + "step": 32963 + }, + { + "epoch": 0.19604624607479304, + "grad_norm": 1.9860972166061401, + "learning_rate": 4.5406653052301017e-05, + "loss": 4.663, + "step": 32964 + }, + { + "epoch": 0.19605219335807403, + "grad_norm": 2.1099915504455566, + "learning_rate": 4.54063832171019e-05, + "loss": 5.0918, + "step": 32965 + }, + { + "epoch": 0.19605814064135502, + "grad_norm": 1.7715723514556885, + "learning_rate": 4.540611337477913e-05, + "loss": 5.0995, + "step": 32966 + }, + { + "epoch": 0.19606408792463603, + "grad_norm": 1.490571141242981, + "learning_rate": 4.5405843525332784e-05, + "loss": 5.2124, + "step": 32967 + }, + { + "epoch": 0.19607003520791702, + "grad_norm": 1.6321748495101929, + "learning_rate": 4.5405573668762975e-05, + "loss": 5.3355, + "step": 32968 + }, + { + "epoch": 0.196075982491198, + "grad_norm": 1.6419252157211304, + "learning_rate": 4.540530380506979e-05, + "loss": 5.004, + "step": 32969 + }, + { + "epoch": 0.19608192977447902, + "grad_norm": 1.8405059576034546, + "learning_rate": 4.5405033934253326e-05, + "loss": 4.7141, + "step": 32970 + }, + { + "epoch": 0.19608787705776, + "grad_norm": 2.000082492828369, + "learning_rate": 4.5404764056313675e-05, + "loss": 4.2939, + "step": 32971 + }, + { + "epoch": 0.196093824341041, + "grad_norm": 2.117877960205078, + "learning_rate": 4.540449417125093e-05, + "loss": 4.3797, + "step": 32972 + }, + { + "epoch": 0.19609977162432202, + "grad_norm": 2.1574883460998535, + "learning_rate": 4.540422427906519e-05, + "loss": 4.1668, + "step": 32973 + }, + { + "epoch": 0.196105718907603, + "grad_norm": 2.0455899238586426, + "learning_rate": 4.5403954379756544e-05, + "loss": 4.5018, + "step": 32974 + }, + { + "epoch": 0.196111666190884, + "grad_norm": 1.5043025016784668, + "learning_rate": 4.540368447332509e-05, + "loss": 5.0477, + "step": 32975 + }, + { + "epoch": 0.196117613474165, + "grad_norm": 2.334475040435791, + "learning_rate": 4.5403414559770917e-05, + "loss": 4.673, + "step": 32976 + }, + { + "epoch": 0.196123560757446, + "grad_norm": 2.1780107021331787, + "learning_rate": 4.540314463909413e-05, + "loss": 4.7737, + "step": 32977 + }, + { + "epoch": 0.19612950804072699, + "grad_norm": 1.887604832649231, + "learning_rate": 4.540287471129481e-05, + "loss": 4.8789, + "step": 32978 + }, + { + "epoch": 0.196135455324008, + "grad_norm": 1.7331857681274414, + "learning_rate": 4.540260477637306e-05, + "loss": 5.079, + "step": 32979 + }, + { + "epoch": 0.196141402607289, + "grad_norm": 1.5630770921707153, + "learning_rate": 4.540233483432896e-05, + "loss": 5.1134, + "step": 32980 + }, + { + "epoch": 0.19614734989056998, + "grad_norm": 2.662470817565918, + "learning_rate": 4.540206488516263e-05, + "loss": 4.0535, + "step": 32981 + }, + { + "epoch": 0.196153297173851, + "grad_norm": 2.1369266510009766, + "learning_rate": 4.5401794928874145e-05, + "loss": 4.3121, + "step": 32982 + }, + { + "epoch": 0.19615924445713198, + "grad_norm": 2.7305498123168945, + "learning_rate": 4.5401524965463604e-05, + "loss": 3.7048, + "step": 32983 + }, + { + "epoch": 0.19616519174041297, + "grad_norm": 1.8726544380187988, + "learning_rate": 4.540125499493111e-05, + "loss": 4.3633, + "step": 32984 + }, + { + "epoch": 0.19617113902369399, + "grad_norm": 1.4531916379928589, + "learning_rate": 4.5400985017276735e-05, + "loss": 4.9491, + "step": 32985 + }, + { + "epoch": 0.19617708630697497, + "grad_norm": 2.7652368545532227, + "learning_rate": 4.5400715032500595e-05, + "loss": 4.4811, + "step": 32986 + }, + { + "epoch": 0.19618303359025596, + "grad_norm": 2.069976568222046, + "learning_rate": 4.540044504060277e-05, + "loss": 4.6528, + "step": 32987 + }, + { + "epoch": 0.19618898087353698, + "grad_norm": 2.0444564819335938, + "learning_rate": 4.540017504158337e-05, + "loss": 4.6799, + "step": 32988 + }, + { + "epoch": 0.19619492815681797, + "grad_norm": 1.6998240947723389, + "learning_rate": 4.5399905035442467e-05, + "loss": 4.8338, + "step": 32989 + }, + { + "epoch": 0.19620087544009895, + "grad_norm": 1.84773588180542, + "learning_rate": 4.5399635022180175e-05, + "loss": 4.7789, + "step": 32990 + }, + { + "epoch": 0.19620682272337997, + "grad_norm": 1.6940088272094727, + "learning_rate": 4.5399365001796586e-05, + "loss": 4.9627, + "step": 32991 + }, + { + "epoch": 0.19621277000666096, + "grad_norm": 1.6246799230575562, + "learning_rate": 4.539909497429178e-05, + "loss": 4.996, + "step": 32992 + }, + { + "epoch": 0.19621871728994195, + "grad_norm": 1.7621272802352905, + "learning_rate": 4.539882493966587e-05, + "loss": 4.9238, + "step": 32993 + }, + { + "epoch": 0.19622466457322296, + "grad_norm": 1.5167536735534668, + "learning_rate": 4.539855489791893e-05, + "loss": 5.2917, + "step": 32994 + }, + { + "epoch": 0.19623061185650395, + "grad_norm": 1.728780746459961, + "learning_rate": 4.539828484905107e-05, + "loss": 5.515, + "step": 32995 + }, + { + "epoch": 0.19623655913978494, + "grad_norm": 1.7221986055374146, + "learning_rate": 4.5398014793062386e-05, + "loss": 4.7529, + "step": 32996 + }, + { + "epoch": 0.19624250642306595, + "grad_norm": 2.0246353149414062, + "learning_rate": 4.5397744729952964e-05, + "loss": 4.4317, + "step": 32997 + }, + { + "epoch": 0.19624845370634694, + "grad_norm": 1.7018826007843018, + "learning_rate": 4.5397474659722896e-05, + "loss": 4.7282, + "step": 32998 + }, + { + "epoch": 0.19625440098962793, + "grad_norm": 1.7294108867645264, + "learning_rate": 4.5397204582372276e-05, + "loss": 4.4915, + "step": 32999 + }, + { + "epoch": 0.19626034827290895, + "grad_norm": 1.8619226217269897, + "learning_rate": 4.539693449790121e-05, + "loss": 4.3318, + "step": 33000 + }, + { + "epoch": 0.19626629555618993, + "grad_norm": 1.7333225011825562, + "learning_rate": 4.5396664406309785e-05, + "loss": 4.371, + "step": 33001 + }, + { + "epoch": 0.19627224283947092, + "grad_norm": 1.843048095703125, + "learning_rate": 4.539639430759809e-05, + "loss": 4.4083, + "step": 33002 + }, + { + "epoch": 0.19627819012275194, + "grad_norm": 1.607068419456482, + "learning_rate": 4.5396124201766226e-05, + "loss": 4.3408, + "step": 33003 + }, + { + "epoch": 0.19628413740603293, + "grad_norm": 1.7347930669784546, + "learning_rate": 4.539585408881429e-05, + "loss": 4.0526, + "step": 33004 + }, + { + "epoch": 0.19629008468931392, + "grad_norm": 1.7406977415084839, + "learning_rate": 4.539558396874237e-05, + "loss": 4.1061, + "step": 33005 + }, + { + "epoch": 0.19629603197259493, + "grad_norm": 1.804682731628418, + "learning_rate": 4.5395313841550555e-05, + "loss": 4.4277, + "step": 33006 + }, + { + "epoch": 0.19630197925587592, + "grad_norm": 1.443662405014038, + "learning_rate": 4.5395043707238954e-05, + "loss": 4.5178, + "step": 33007 + }, + { + "epoch": 0.1963079265391569, + "grad_norm": 1.7877321243286133, + "learning_rate": 4.5394773565807655e-05, + "loss": 4.5201, + "step": 33008 + }, + { + "epoch": 0.19631387382243792, + "grad_norm": 2.0785722732543945, + "learning_rate": 4.539450341725675e-05, + "loss": 4.3904, + "step": 33009 + }, + { + "epoch": 0.1963198211057189, + "grad_norm": 1.7074304819107056, + "learning_rate": 4.5394233261586336e-05, + "loss": 4.8033, + "step": 33010 + }, + { + "epoch": 0.1963257683889999, + "grad_norm": 1.7941499948501587, + "learning_rate": 4.53939630987965e-05, + "loss": 5.0883, + "step": 33011 + }, + { + "epoch": 0.19633171567228092, + "grad_norm": 1.5039217472076416, + "learning_rate": 4.539369292888734e-05, + "loss": 4.5738, + "step": 33012 + }, + { + "epoch": 0.1963376629555619, + "grad_norm": 1.7019708156585693, + "learning_rate": 4.539342275185896e-05, + "loss": 4.2565, + "step": 33013 + }, + { + "epoch": 0.1963436102388429, + "grad_norm": 1.853834867477417, + "learning_rate": 4.539315256771145e-05, + "loss": 4.0714, + "step": 33014 + }, + { + "epoch": 0.1963495575221239, + "grad_norm": 1.663608193397522, + "learning_rate": 4.5392882376444896e-05, + "loss": 4.3966, + "step": 33015 + }, + { + "epoch": 0.1963555048054049, + "grad_norm": 1.6027350425720215, + "learning_rate": 4.539261217805939e-05, + "loss": 4.6439, + "step": 33016 + }, + { + "epoch": 0.19636145208868588, + "grad_norm": 1.6448129415512085, + "learning_rate": 4.539234197255505e-05, + "loss": 4.8542, + "step": 33017 + }, + { + "epoch": 0.19636739937196687, + "grad_norm": 1.5828901529312134, + "learning_rate": 4.539207175993194e-05, + "loss": 4.8388, + "step": 33018 + }, + { + "epoch": 0.1963733466552479, + "grad_norm": 1.5006245374679565, + "learning_rate": 4.5391801540190184e-05, + "loss": 4.8398, + "step": 33019 + }, + { + "epoch": 0.19637929393852888, + "grad_norm": 1.584307312965393, + "learning_rate": 4.5391531313329846e-05, + "loss": 4.6774, + "step": 33020 + }, + { + "epoch": 0.19638524122180986, + "grad_norm": 1.5445975065231323, + "learning_rate": 4.5391261079351036e-05, + "loss": 4.9719, + "step": 33021 + }, + { + "epoch": 0.19639118850509088, + "grad_norm": 1.3577830791473389, + "learning_rate": 4.5390990838253856e-05, + "loss": 4.9825, + "step": 33022 + }, + { + "epoch": 0.19639713578837187, + "grad_norm": 1.3781458139419556, + "learning_rate": 4.539072059003838e-05, + "loss": 4.8167, + "step": 33023 + }, + { + "epoch": 0.19640308307165286, + "grad_norm": 1.602210521697998, + "learning_rate": 4.5390450334704725e-05, + "loss": 4.8044, + "step": 33024 + }, + { + "epoch": 0.19640903035493387, + "grad_norm": 1.4063019752502441, + "learning_rate": 4.539018007225298e-05, + "loss": 4.8966, + "step": 33025 + }, + { + "epoch": 0.19641497763821486, + "grad_norm": 1.4408751726150513, + "learning_rate": 4.538990980268322e-05, + "loss": 5.0514, + "step": 33026 + }, + { + "epoch": 0.19642092492149585, + "grad_norm": 1.548294186592102, + "learning_rate": 4.538963952599555e-05, + "loss": 4.3517, + "step": 33027 + }, + { + "epoch": 0.19642687220477686, + "grad_norm": 1.462956428527832, + "learning_rate": 4.5389369242190075e-05, + "loss": 4.6278, + "step": 33028 + }, + { + "epoch": 0.19643281948805785, + "grad_norm": 1.5403681993484497, + "learning_rate": 4.538909895126689e-05, + "loss": 5.233, + "step": 33029 + }, + { + "epoch": 0.19643876677133884, + "grad_norm": 1.4483433961868286, + "learning_rate": 4.538882865322607e-05, + "loss": 4.8854, + "step": 33030 + }, + { + "epoch": 0.19644471405461986, + "grad_norm": 1.6093387603759766, + "learning_rate": 4.5388558348067725e-05, + "loss": 4.9251, + "step": 33031 + }, + { + "epoch": 0.19645066133790084, + "grad_norm": 1.6804461479187012, + "learning_rate": 4.5388288035791934e-05, + "loss": 4.5588, + "step": 33032 + }, + { + "epoch": 0.19645660862118183, + "grad_norm": 1.799657940864563, + "learning_rate": 4.5388017716398816e-05, + "loss": 4.4804, + "step": 33033 + }, + { + "epoch": 0.19646255590446285, + "grad_norm": 1.4390314817428589, + "learning_rate": 4.538774738988845e-05, + "loss": 4.7733, + "step": 33034 + }, + { + "epoch": 0.19646850318774384, + "grad_norm": 1.8508771657943726, + "learning_rate": 4.538747705626093e-05, + "loss": 4.6182, + "step": 33035 + }, + { + "epoch": 0.19647445047102483, + "grad_norm": 1.6584879159927368, + "learning_rate": 4.538720671551635e-05, + "loss": 4.4793, + "step": 33036 + }, + { + "epoch": 0.19648039775430584, + "grad_norm": 1.6483509540557861, + "learning_rate": 4.538693636765481e-05, + "loss": 4.5021, + "step": 33037 + }, + { + "epoch": 0.19648634503758683, + "grad_norm": 1.6273133754730225, + "learning_rate": 4.53866660126764e-05, + "loss": 4.615, + "step": 33038 + }, + { + "epoch": 0.19649229232086782, + "grad_norm": 1.80341637134552, + "learning_rate": 4.5386395650581215e-05, + "loss": 4.9965, + "step": 33039 + }, + { + "epoch": 0.19649823960414883, + "grad_norm": 1.5780657529830933, + "learning_rate": 4.538612528136935e-05, + "loss": 5.5826, + "step": 33040 + }, + { + "epoch": 0.19650418688742982, + "grad_norm": 1.6564321517944336, + "learning_rate": 4.53858549050409e-05, + "loss": 4.9127, + "step": 33041 + }, + { + "epoch": 0.1965101341707108, + "grad_norm": 1.5042874813079834, + "learning_rate": 4.538558452159596e-05, + "loss": 5.0219, + "step": 33042 + }, + { + "epoch": 0.19651608145399183, + "grad_norm": 1.621291160583496, + "learning_rate": 4.538531413103462e-05, + "loss": 4.8946, + "step": 33043 + }, + { + "epoch": 0.1965220287372728, + "grad_norm": 1.7965176105499268, + "learning_rate": 4.5385043733356976e-05, + "loss": 4.3394, + "step": 33044 + }, + { + "epoch": 0.1965279760205538, + "grad_norm": 1.7505266666412354, + "learning_rate": 4.5384773328563124e-05, + "loss": 4.6966, + "step": 33045 + }, + { + "epoch": 0.19653392330383482, + "grad_norm": 1.4543168544769287, + "learning_rate": 4.538450291665316e-05, + "loss": 5.189, + "step": 33046 + }, + { + "epoch": 0.1965398705871158, + "grad_norm": 1.7490246295928955, + "learning_rate": 4.538423249762718e-05, + "loss": 4.5155, + "step": 33047 + }, + { + "epoch": 0.1965458178703968, + "grad_norm": 1.5133061408996582, + "learning_rate": 4.538396207148528e-05, + "loss": 4.8084, + "step": 33048 + }, + { + "epoch": 0.1965517651536778, + "grad_norm": 1.908988356590271, + "learning_rate": 4.5383691638227534e-05, + "loss": 5.0611, + "step": 33049 + }, + { + "epoch": 0.1965577124369588, + "grad_norm": 1.6494390964508057, + "learning_rate": 4.5383421197854056e-05, + "loss": 5.0496, + "step": 33050 + }, + { + "epoch": 0.1965636597202398, + "grad_norm": 1.5613998174667358, + "learning_rate": 4.5383150750364946e-05, + "loss": 4.6471, + "step": 33051 + }, + { + "epoch": 0.1965696070035208, + "grad_norm": 1.5566452741622925, + "learning_rate": 4.5382880295760284e-05, + "loss": 4.8781, + "step": 33052 + }, + { + "epoch": 0.1965755542868018, + "grad_norm": 1.4820610284805298, + "learning_rate": 4.5382609834040166e-05, + "loss": 4.9933, + "step": 33053 + }, + { + "epoch": 0.19658150157008278, + "grad_norm": 1.6967642307281494, + "learning_rate": 4.5382339365204694e-05, + "loss": 4.981, + "step": 33054 + }, + { + "epoch": 0.1965874488533638, + "grad_norm": 1.6705995798110962, + "learning_rate": 4.538206888925395e-05, + "loss": 4.8135, + "step": 33055 + }, + { + "epoch": 0.19659339613664478, + "grad_norm": 1.6412502527236938, + "learning_rate": 4.5381798406188044e-05, + "loss": 4.3751, + "step": 33056 + }, + { + "epoch": 0.19659934341992577, + "grad_norm": 1.8060193061828613, + "learning_rate": 4.5381527916007063e-05, + "loss": 4.6845, + "step": 33057 + }, + { + "epoch": 0.1966052907032068, + "grad_norm": 1.8145633935928345, + "learning_rate": 4.5381257418711094e-05, + "loss": 4.9956, + "step": 33058 + }, + { + "epoch": 0.19661123798648777, + "grad_norm": 1.7470539808273315, + "learning_rate": 4.538098691430024e-05, + "loss": 4.583, + "step": 33059 + }, + { + "epoch": 0.19661718526976876, + "grad_norm": 1.6165781021118164, + "learning_rate": 4.5380716402774596e-05, + "loss": 4.4499, + "step": 33060 + }, + { + "epoch": 0.19662313255304978, + "grad_norm": 1.5280836820602417, + "learning_rate": 4.538044588413426e-05, + "loss": 4.5728, + "step": 33061 + }, + { + "epoch": 0.19662907983633077, + "grad_norm": 1.750088095664978, + "learning_rate": 4.5380175358379316e-05, + "loss": 4.9698, + "step": 33062 + }, + { + "epoch": 0.19663502711961175, + "grad_norm": 2.1302971839904785, + "learning_rate": 4.537990482550986e-05, + "loss": 4.3426, + "step": 33063 + }, + { + "epoch": 0.19664097440289277, + "grad_norm": 2.5557992458343506, + "learning_rate": 4.5379634285526e-05, + "loss": 4.2919, + "step": 33064 + }, + { + "epoch": 0.19664692168617376, + "grad_norm": 1.9230780601501465, + "learning_rate": 4.5379363738427806e-05, + "loss": 4.4673, + "step": 33065 + }, + { + "epoch": 0.19665286896945475, + "grad_norm": 1.7957717180252075, + "learning_rate": 4.537909318421539e-05, + "loss": 4.0586, + "step": 33066 + }, + { + "epoch": 0.19665881625273576, + "grad_norm": 1.8782682418823242, + "learning_rate": 4.537882262288885e-05, + "loss": 4.449, + "step": 33067 + }, + { + "epoch": 0.19666476353601675, + "grad_norm": 1.7372145652770996, + "learning_rate": 4.5378552054448276e-05, + "loss": 4.8808, + "step": 33068 + }, + { + "epoch": 0.19667071081929774, + "grad_norm": 2.0615148544311523, + "learning_rate": 4.537828147889376e-05, + "loss": 3.8952, + "step": 33069 + }, + { + "epoch": 0.19667665810257876, + "grad_norm": 1.7238409519195557, + "learning_rate": 4.537801089622539e-05, + "loss": 4.489, + "step": 33070 + }, + { + "epoch": 0.19668260538585974, + "grad_norm": 2.1890852451324463, + "learning_rate": 4.537774030644326e-05, + "loss": 4.208, + "step": 33071 + }, + { + "epoch": 0.19668855266914073, + "grad_norm": 2.126760482788086, + "learning_rate": 4.5377469709547485e-05, + "loss": 4.3385, + "step": 33072 + }, + { + "epoch": 0.19669449995242175, + "grad_norm": 1.8360297679901123, + "learning_rate": 4.537719910553814e-05, + "loss": 4.2634, + "step": 33073 + }, + { + "epoch": 0.19670044723570274, + "grad_norm": 1.8070091009140015, + "learning_rate": 4.5376928494415326e-05, + "loss": 4.7106, + "step": 33074 + }, + { + "epoch": 0.19670639451898372, + "grad_norm": 1.9259190559387207, + "learning_rate": 4.537665787617913e-05, + "loss": 4.3962, + "step": 33075 + }, + { + "epoch": 0.1967123418022647, + "grad_norm": 1.8697553873062134, + "learning_rate": 4.5376387250829664e-05, + "loss": 4.4294, + "step": 33076 + }, + { + "epoch": 0.19671828908554573, + "grad_norm": 2.0083229541778564, + "learning_rate": 4.537611661836701e-05, + "loss": 4.3672, + "step": 33077 + }, + { + "epoch": 0.19672423636882672, + "grad_norm": 1.8586071729660034, + "learning_rate": 4.537584597879126e-05, + "loss": 4.2949, + "step": 33078 + }, + { + "epoch": 0.1967301836521077, + "grad_norm": 2.0329997539520264, + "learning_rate": 4.5375575332102514e-05, + "loss": 4.0786, + "step": 33079 + }, + { + "epoch": 0.19673613093538872, + "grad_norm": 1.8664171695709229, + "learning_rate": 4.537530467830087e-05, + "loss": 4.8517, + "step": 33080 + }, + { + "epoch": 0.1967420782186697, + "grad_norm": 1.5570780038833618, + "learning_rate": 4.5375034017386406e-05, + "loss": 4.6582, + "step": 33081 + }, + { + "epoch": 0.1967480255019507, + "grad_norm": 1.5720075368881226, + "learning_rate": 4.537476334935924e-05, + "loss": 4.7118, + "step": 33082 + }, + { + "epoch": 0.1967539727852317, + "grad_norm": 1.888211965560913, + "learning_rate": 4.537449267421945e-05, + "loss": 4.4994, + "step": 33083 + }, + { + "epoch": 0.1967599200685127, + "grad_norm": 1.670282006263733, + "learning_rate": 4.5374221991967136e-05, + "loss": 5.0491, + "step": 33084 + }, + { + "epoch": 0.1967658673517937, + "grad_norm": 1.7795008420944214, + "learning_rate": 4.5373951302602394e-05, + "loss": 4.6723, + "step": 33085 + }, + { + "epoch": 0.1967718146350747, + "grad_norm": 1.8241984844207764, + "learning_rate": 4.537368060612531e-05, + "loss": 4.6965, + "step": 33086 + }, + { + "epoch": 0.1967777619183557, + "grad_norm": 1.6808873414993286, + "learning_rate": 4.537340990253599e-05, + "loss": 4.6631, + "step": 33087 + }, + { + "epoch": 0.19678370920163668, + "grad_norm": 2.037264585494995, + "learning_rate": 4.537313919183451e-05, + "loss": 4.2909, + "step": 33088 + }, + { + "epoch": 0.1967896564849177, + "grad_norm": 1.729772925376892, + "learning_rate": 4.5372868474020996e-05, + "loss": 4.4949, + "step": 33089 + }, + { + "epoch": 0.19679560376819868, + "grad_norm": 1.4116592407226562, + "learning_rate": 4.537259774909551e-05, + "loss": 5.1077, + "step": 33090 + }, + { + "epoch": 0.19680155105147967, + "grad_norm": 1.5582292079925537, + "learning_rate": 4.537232701705817e-05, + "loss": 4.9148, + "step": 33091 + }, + { + "epoch": 0.1968074983347607, + "grad_norm": 1.4327534437179565, + "learning_rate": 4.5372056277909055e-05, + "loss": 5.0867, + "step": 33092 + }, + { + "epoch": 0.19681344561804168, + "grad_norm": 1.9894887208938599, + "learning_rate": 4.537178553164827e-05, + "loss": 4.3336, + "step": 33093 + }, + { + "epoch": 0.19681939290132267, + "grad_norm": 1.8432674407958984, + "learning_rate": 4.5371514778275904e-05, + "loss": 4.325, + "step": 33094 + }, + { + "epoch": 0.19682534018460368, + "grad_norm": 2.4664008617401123, + "learning_rate": 4.537124401779206e-05, + "loss": 4.3697, + "step": 33095 + }, + { + "epoch": 0.19683128746788467, + "grad_norm": 1.7858588695526123, + "learning_rate": 4.537097325019681e-05, + "loss": 4.5536, + "step": 33096 + }, + { + "epoch": 0.19683723475116566, + "grad_norm": 1.5062922239303589, + "learning_rate": 4.537070247549028e-05, + "loss": 4.7737, + "step": 33097 + }, + { + "epoch": 0.19684318203444667, + "grad_norm": 1.8330934047698975, + "learning_rate": 4.537043169367253e-05, + "loss": 4.3142, + "step": 33098 + }, + { + "epoch": 0.19684912931772766, + "grad_norm": 1.6842762231826782, + "learning_rate": 4.5370160904743686e-05, + "loss": 4.1331, + "step": 33099 + }, + { + "epoch": 0.19685507660100865, + "grad_norm": 1.6639212369918823, + "learning_rate": 4.5369890108703824e-05, + "loss": 4.3134, + "step": 33100 + }, + { + "epoch": 0.19686102388428967, + "grad_norm": 1.7178279161453247, + "learning_rate": 4.5369619305553047e-05, + "loss": 4.2592, + "step": 33101 + }, + { + "epoch": 0.19686697116757065, + "grad_norm": 1.693440318107605, + "learning_rate": 4.536934849529144e-05, + "loss": 4.3392, + "step": 33102 + }, + { + "epoch": 0.19687291845085164, + "grad_norm": 1.7481168508529663, + "learning_rate": 4.5369077677919116e-05, + "loss": 5.2228, + "step": 33103 + }, + { + "epoch": 0.19687886573413266, + "grad_norm": 1.4601521492004395, + "learning_rate": 4.5368806853436145e-05, + "loss": 4.7914, + "step": 33104 + }, + { + "epoch": 0.19688481301741365, + "grad_norm": 1.6039336919784546, + "learning_rate": 4.536853602184264e-05, + "loss": 4.3547, + "step": 33105 + }, + { + "epoch": 0.19689076030069463, + "grad_norm": 1.9059422016143799, + "learning_rate": 4.536826518313869e-05, + "loss": 4.1615, + "step": 33106 + }, + { + "epoch": 0.19689670758397565, + "grad_norm": 1.8276565074920654, + "learning_rate": 4.536799433732438e-05, + "loss": 4.3688, + "step": 33107 + }, + { + "epoch": 0.19690265486725664, + "grad_norm": 1.251856803894043, + "learning_rate": 4.5367723484399825e-05, + "loss": 5.0791, + "step": 33108 + }, + { + "epoch": 0.19690860215053763, + "grad_norm": 1.649273157119751, + "learning_rate": 4.5367452624365107e-05, + "loss": 4.6438, + "step": 33109 + }, + { + "epoch": 0.19691454943381864, + "grad_norm": 1.8959378004074097, + "learning_rate": 4.5367181757220326e-05, + "loss": 4.7787, + "step": 33110 + }, + { + "epoch": 0.19692049671709963, + "grad_norm": 1.8937031030654907, + "learning_rate": 4.536691088296556e-05, + "loss": 5.0304, + "step": 33111 + }, + { + "epoch": 0.19692644400038062, + "grad_norm": 1.6562620401382446, + "learning_rate": 4.5366640001600916e-05, + "loss": 4.7406, + "step": 33112 + }, + { + "epoch": 0.19693239128366163, + "grad_norm": 1.491281270980835, + "learning_rate": 4.53663691131265e-05, + "loss": 5.3676, + "step": 33113 + }, + { + "epoch": 0.19693833856694262, + "grad_norm": 1.5142914056777954, + "learning_rate": 4.536609821754239e-05, + "loss": 4.8378, + "step": 33114 + }, + { + "epoch": 0.1969442858502236, + "grad_norm": 1.6782684326171875, + "learning_rate": 4.536582731484868e-05, + "loss": 5.4148, + "step": 33115 + }, + { + "epoch": 0.19695023313350463, + "grad_norm": 1.8408838510513306, + "learning_rate": 4.5365556405045475e-05, + "loss": 5.2331, + "step": 33116 + }, + { + "epoch": 0.19695618041678561, + "grad_norm": 1.7965582609176636, + "learning_rate": 4.536528548813286e-05, + "loss": 4.6984, + "step": 33117 + }, + { + "epoch": 0.1969621277000666, + "grad_norm": 1.7825274467468262, + "learning_rate": 4.536501456411094e-05, + "loss": 4.9418, + "step": 33118 + }, + { + "epoch": 0.19696807498334762, + "grad_norm": 1.957222819328308, + "learning_rate": 4.53647436329798e-05, + "loss": 5.0835, + "step": 33119 + }, + { + "epoch": 0.1969740222666286, + "grad_norm": 1.7899425029754639, + "learning_rate": 4.536447269473954e-05, + "loss": 5.1203, + "step": 33120 + }, + { + "epoch": 0.1969799695499096, + "grad_norm": 1.5552259683609009, + "learning_rate": 4.536420174939025e-05, + "loss": 4.8925, + "step": 33121 + }, + { + "epoch": 0.1969859168331906, + "grad_norm": 1.6164780855178833, + "learning_rate": 4.5363930796932036e-05, + "loss": 4.9743, + "step": 33122 + }, + { + "epoch": 0.1969918641164716, + "grad_norm": 1.7106695175170898, + "learning_rate": 4.536365983736498e-05, + "loss": 4.5289, + "step": 33123 + }, + { + "epoch": 0.1969978113997526, + "grad_norm": 1.6085342168807983, + "learning_rate": 4.5363388870689175e-05, + "loss": 4.9196, + "step": 33124 + }, + { + "epoch": 0.1970037586830336, + "grad_norm": 1.8197940587997437, + "learning_rate": 4.536311789690473e-05, + "loss": 4.1818, + "step": 33125 + }, + { + "epoch": 0.1970097059663146, + "grad_norm": 1.6849818229675293, + "learning_rate": 4.5362846916011724e-05, + "loss": 4.6461, + "step": 33126 + }, + { + "epoch": 0.19701565324959558, + "grad_norm": 1.6120171546936035, + "learning_rate": 4.536257592801026e-05, + "loss": 4.8924, + "step": 33127 + }, + { + "epoch": 0.1970216005328766, + "grad_norm": 1.5898586511611938, + "learning_rate": 4.536230493290043e-05, + "loss": 4.7168, + "step": 33128 + }, + { + "epoch": 0.19702754781615758, + "grad_norm": 1.620339274406433, + "learning_rate": 4.536203393068234e-05, + "loss": 4.9929, + "step": 33129 + }, + { + "epoch": 0.19703349509943857, + "grad_norm": 1.6156890392303467, + "learning_rate": 4.536176292135606e-05, + "loss": 4.7432, + "step": 33130 + }, + { + "epoch": 0.1970394423827196, + "grad_norm": 1.7055577039718628, + "learning_rate": 4.5361491904921704e-05, + "loss": 4.7621, + "step": 33131 + }, + { + "epoch": 0.19704538966600058, + "grad_norm": 1.474246621131897, + "learning_rate": 4.5361220881379364e-05, + "loss": 4.5933, + "step": 33132 + }, + { + "epoch": 0.19705133694928156, + "grad_norm": 1.624182105064392, + "learning_rate": 4.536094985072914e-05, + "loss": 4.518, + "step": 33133 + }, + { + "epoch": 0.19705728423256255, + "grad_norm": 1.8042110204696655, + "learning_rate": 4.5360678812971104e-05, + "loss": 4.9295, + "step": 33134 + }, + { + "epoch": 0.19706323151584357, + "grad_norm": 1.6601067781448364, + "learning_rate": 4.536040776810537e-05, + "loss": 4.9089, + "step": 33135 + }, + { + "epoch": 0.19706917879912456, + "grad_norm": 1.9153242111206055, + "learning_rate": 4.5360136716132026e-05, + "loss": 4.9126, + "step": 33136 + }, + { + "epoch": 0.19707512608240554, + "grad_norm": 1.4137238264083862, + "learning_rate": 4.535986565705118e-05, + "loss": 4.9162, + "step": 33137 + }, + { + "epoch": 0.19708107336568656, + "grad_norm": 1.6325432062149048, + "learning_rate": 4.5359594590862905e-05, + "loss": 4.5288, + "step": 33138 + }, + { + "epoch": 0.19708702064896755, + "grad_norm": 1.8361495733261108, + "learning_rate": 4.5359323517567306e-05, + "loss": 4.3896, + "step": 33139 + }, + { + "epoch": 0.19709296793224854, + "grad_norm": 1.5172821283340454, + "learning_rate": 4.535905243716448e-05, + "loss": 4.8043, + "step": 33140 + }, + { + "epoch": 0.19709891521552955, + "grad_norm": 1.8739675283432007, + "learning_rate": 4.535878134965452e-05, + "loss": 5.7305, + "step": 33141 + }, + { + "epoch": 0.19710486249881054, + "grad_norm": 2.015746831893921, + "learning_rate": 4.535851025503752e-05, + "loss": 5.5957, + "step": 33142 + }, + { + "epoch": 0.19711080978209153, + "grad_norm": 1.5228544473648071, + "learning_rate": 4.535823915331357e-05, + "loss": 5.1121, + "step": 33143 + }, + { + "epoch": 0.19711675706537254, + "grad_norm": 1.8229737281799316, + "learning_rate": 4.5357968044482776e-05, + "loss": 5.3085, + "step": 33144 + }, + { + "epoch": 0.19712270434865353, + "grad_norm": 1.5784951448440552, + "learning_rate": 4.5357696928545215e-05, + "loss": 5.8142, + "step": 33145 + }, + { + "epoch": 0.19712865163193452, + "grad_norm": 1.6534473896026611, + "learning_rate": 4.5357425805501e-05, + "loss": 5.3424, + "step": 33146 + }, + { + "epoch": 0.19713459891521554, + "grad_norm": 2.2714569568634033, + "learning_rate": 4.535715467535022e-05, + "loss": 4.3029, + "step": 33147 + }, + { + "epoch": 0.19714054619849652, + "grad_norm": 1.9718842506408691, + "learning_rate": 4.5356883538092967e-05, + "loss": 4.2328, + "step": 33148 + }, + { + "epoch": 0.1971464934817775, + "grad_norm": 1.6277927160263062, + "learning_rate": 4.5356612393729325e-05, + "loss": 4.4578, + "step": 33149 + }, + { + "epoch": 0.19715244076505853, + "grad_norm": 1.7372039556503296, + "learning_rate": 4.5356341242259416e-05, + "loss": 4.8696, + "step": 33150 + }, + { + "epoch": 0.19715838804833952, + "grad_norm": 1.6973861455917358, + "learning_rate": 4.5356070083683313e-05, + "loss": 4.7767, + "step": 33151 + }, + { + "epoch": 0.1971643353316205, + "grad_norm": 2.262956142425537, + "learning_rate": 4.5355798918001106e-05, + "loss": 3.7523, + "step": 33152 + }, + { + "epoch": 0.19717028261490152, + "grad_norm": 2.558028221130371, + "learning_rate": 4.535552774521291e-05, + "loss": 3.7098, + "step": 33153 + }, + { + "epoch": 0.1971762298981825, + "grad_norm": 2.026247024536133, + "learning_rate": 4.5355256565318804e-05, + "loss": 4.1682, + "step": 33154 + }, + { + "epoch": 0.1971821771814635, + "grad_norm": 1.616791844367981, + "learning_rate": 4.535498537831889e-05, + "loss": 5.0644, + "step": 33155 + }, + { + "epoch": 0.1971881244647445, + "grad_norm": 2.0363311767578125, + "learning_rate": 4.5354714184213274e-05, + "loss": 5.1429, + "step": 33156 + }, + { + "epoch": 0.1971940717480255, + "grad_norm": 1.687772274017334, + "learning_rate": 4.535444298300202e-05, + "loss": 4.8497, + "step": 33157 + }, + { + "epoch": 0.1972000190313065, + "grad_norm": 2.0011701583862305, + "learning_rate": 4.535417177468525e-05, + "loss": 4.5291, + "step": 33158 + }, + { + "epoch": 0.1972059663145875, + "grad_norm": 1.8161364793777466, + "learning_rate": 4.5353900559263044e-05, + "loss": 5.1733, + "step": 33159 + }, + { + "epoch": 0.1972119135978685, + "grad_norm": 1.8265936374664307, + "learning_rate": 4.535362933673551e-05, + "loss": 5.214, + "step": 33160 + }, + { + "epoch": 0.19721786088114948, + "grad_norm": 1.8060880899429321, + "learning_rate": 4.535335810710272e-05, + "loss": 5.1948, + "step": 33161 + }, + { + "epoch": 0.1972238081644305, + "grad_norm": 1.8454203605651855, + "learning_rate": 4.5353086870364795e-05, + "loss": 5.2124, + "step": 33162 + }, + { + "epoch": 0.19722975544771149, + "grad_norm": 1.4462891817092896, + "learning_rate": 4.535281562652181e-05, + "loss": 5.2756, + "step": 33163 + }, + { + "epoch": 0.19723570273099247, + "grad_norm": 1.7189714908599854, + "learning_rate": 4.535254437557387e-05, + "loss": 4.7875, + "step": 33164 + }, + { + "epoch": 0.1972416500142735, + "grad_norm": 1.6477726697921753, + "learning_rate": 4.5352273117521074e-05, + "loss": 4.7295, + "step": 33165 + }, + { + "epoch": 0.19724759729755448, + "grad_norm": 1.5731415748596191, + "learning_rate": 4.535200185236349e-05, + "loss": 5.168, + "step": 33166 + }, + { + "epoch": 0.19725354458083547, + "grad_norm": 1.8423577547073364, + "learning_rate": 4.535173058010125e-05, + "loss": 5.1467, + "step": 33167 + }, + { + "epoch": 0.19725949186411648, + "grad_norm": 1.6812883615493774, + "learning_rate": 4.5351459300734436e-05, + "loss": 5.6017, + "step": 33168 + }, + { + "epoch": 0.19726543914739747, + "grad_norm": 1.5396465063095093, + "learning_rate": 4.535118801426312e-05, + "loss": 5.4747, + "step": 33169 + }, + { + "epoch": 0.19727138643067846, + "grad_norm": 2.3084144592285156, + "learning_rate": 4.535091672068743e-05, + "loss": 4.1229, + "step": 33170 + }, + { + "epoch": 0.19727733371395947, + "grad_norm": 2.0174407958984375, + "learning_rate": 4.535064542000743e-05, + "loss": 5.4288, + "step": 33171 + }, + { + "epoch": 0.19728328099724046, + "grad_norm": 1.5771641731262207, + "learning_rate": 4.535037411222324e-05, + "loss": 5.3271, + "step": 33172 + }, + { + "epoch": 0.19728922828052145, + "grad_norm": 1.7815163135528564, + "learning_rate": 4.5350102797334946e-05, + "loss": 5.7822, + "step": 33173 + }, + { + "epoch": 0.19729517556380247, + "grad_norm": 2.0947186946868896, + "learning_rate": 4.534983147534264e-05, + "loss": 4.737, + "step": 33174 + }, + { + "epoch": 0.19730112284708345, + "grad_norm": 1.4813156127929688, + "learning_rate": 4.534956014624642e-05, + "loss": 4.7643, + "step": 33175 + }, + { + "epoch": 0.19730707013036444, + "grad_norm": 1.9580156803131104, + "learning_rate": 4.534928881004637e-05, + "loss": 4.922, + "step": 33176 + }, + { + "epoch": 0.19731301741364546, + "grad_norm": 1.7537955045700073, + "learning_rate": 4.53490174667426e-05, + "loss": 5.154, + "step": 33177 + }, + { + "epoch": 0.19731896469692645, + "grad_norm": 1.444298267364502, + "learning_rate": 4.534874611633519e-05, + "loss": 5.0872, + "step": 33178 + }, + { + "epoch": 0.19732491198020743, + "grad_norm": 1.4737969636917114, + "learning_rate": 4.5348474758824254e-05, + "loss": 4.9913, + "step": 33179 + }, + { + "epoch": 0.19733085926348845, + "grad_norm": 1.9675285816192627, + "learning_rate": 4.5348203394209874e-05, + "loss": 4.9247, + "step": 33180 + }, + { + "epoch": 0.19733680654676944, + "grad_norm": 1.5464117527008057, + "learning_rate": 4.534793202249215e-05, + "loss": 5.0685, + "step": 33181 + }, + { + "epoch": 0.19734275383005043, + "grad_norm": 1.5645267963409424, + "learning_rate": 4.5347660643671155e-05, + "loss": 5.2382, + "step": 33182 + }, + { + "epoch": 0.19734870111333144, + "grad_norm": 1.5690304040908813, + "learning_rate": 4.534738925774702e-05, + "loss": 5.3408, + "step": 33183 + }, + { + "epoch": 0.19735464839661243, + "grad_norm": 1.8297967910766602, + "learning_rate": 4.534711786471981e-05, + "loss": 5.5102, + "step": 33184 + }, + { + "epoch": 0.19736059567989342, + "grad_norm": 2.669644594192505, + "learning_rate": 4.534684646458964e-05, + "loss": 5.0929, + "step": 33185 + }, + { + "epoch": 0.19736654296317444, + "grad_norm": 2.3194940090179443, + "learning_rate": 4.53465750573566e-05, + "loss": 5.1063, + "step": 33186 + }, + { + "epoch": 0.19737249024645542, + "grad_norm": 1.6241377592086792, + "learning_rate": 4.534630364302077e-05, + "loss": 4.8875, + "step": 33187 + }, + { + "epoch": 0.1973784375297364, + "grad_norm": 1.7001700401306152, + "learning_rate": 4.534603222158226e-05, + "loss": 5.2486, + "step": 33188 + }, + { + "epoch": 0.19738438481301743, + "grad_norm": 1.9493141174316406, + "learning_rate": 4.5345760793041156e-05, + "loss": 4.9473, + "step": 33189 + }, + { + "epoch": 0.19739033209629842, + "grad_norm": 1.5743041038513184, + "learning_rate": 4.534548935739756e-05, + "loss": 4.6756, + "step": 33190 + }, + { + "epoch": 0.1973962793795794, + "grad_norm": 1.6579638719558716, + "learning_rate": 4.5345217914651575e-05, + "loss": 5.2041, + "step": 33191 + }, + { + "epoch": 0.1974022266628604, + "grad_norm": 1.9961597919464111, + "learning_rate": 4.5344946464803264e-05, + "loss": 3.488, + "step": 33192 + }, + { + "epoch": 0.1974081739461414, + "grad_norm": 2.9726107120513916, + "learning_rate": 4.534467500785275e-05, + "loss": 3.443, + "step": 33193 + }, + { + "epoch": 0.1974141212294224, + "grad_norm": 2.6377058029174805, + "learning_rate": 4.5344403543800127e-05, + "loss": 3.6516, + "step": 33194 + }, + { + "epoch": 0.19742006851270338, + "grad_norm": 2.2696948051452637, + "learning_rate": 4.534413207264548e-05, + "loss": 5.2825, + "step": 33195 + }, + { + "epoch": 0.1974260157959844, + "grad_norm": 3.0145084857940674, + "learning_rate": 4.5343860594388906e-05, + "loss": 4.9292, + "step": 33196 + }, + { + "epoch": 0.1974319630792654, + "grad_norm": 1.9451453685760498, + "learning_rate": 4.53435891090305e-05, + "loss": 4.6508, + "step": 33197 + }, + { + "epoch": 0.19743791036254638, + "grad_norm": 1.8160196542739868, + "learning_rate": 4.5343317616570356e-05, + "loss": 4.9695, + "step": 33198 + }, + { + "epoch": 0.1974438576458274, + "grad_norm": 1.522060513496399, + "learning_rate": 4.534304611700858e-05, + "loss": 5.113, + "step": 33199 + }, + { + "epoch": 0.19744980492910838, + "grad_norm": 1.8439273834228516, + "learning_rate": 4.534277461034524e-05, + "loss": 5.3172, + "step": 33200 + }, + { + "epoch": 0.19745575221238937, + "grad_norm": 1.535640001296997, + "learning_rate": 4.5342503096580455e-05, + "loss": 5.1527, + "step": 33201 + }, + { + "epoch": 0.19746169949567038, + "grad_norm": 1.8293157815933228, + "learning_rate": 4.5342231575714314e-05, + "loss": 4.6291, + "step": 33202 + }, + { + "epoch": 0.19746764677895137, + "grad_norm": 1.7495836019515991, + "learning_rate": 4.534196004774691e-05, + "loss": 4.3015, + "step": 33203 + }, + { + "epoch": 0.19747359406223236, + "grad_norm": 1.6512584686279297, + "learning_rate": 4.534168851267834e-05, + "loss": 4.379, + "step": 33204 + }, + { + "epoch": 0.19747954134551338, + "grad_norm": 1.8381824493408203, + "learning_rate": 4.534141697050869e-05, + "loss": 4.3344, + "step": 33205 + }, + { + "epoch": 0.19748548862879436, + "grad_norm": 1.7134857177734375, + "learning_rate": 4.5341145421238065e-05, + "loss": 4.2597, + "step": 33206 + }, + { + "epoch": 0.19749143591207535, + "grad_norm": 1.6309324502944946, + "learning_rate": 4.534087386486656e-05, + "loss": 4.4719, + "step": 33207 + }, + { + "epoch": 0.19749738319535637, + "grad_norm": 1.8172357082366943, + "learning_rate": 4.534060230139426e-05, + "loss": 4.1611, + "step": 33208 + }, + { + "epoch": 0.19750333047863736, + "grad_norm": 1.8709135055541992, + "learning_rate": 4.5340330730821266e-05, + "loss": 4.0607, + "step": 33209 + }, + { + "epoch": 0.19750927776191834, + "grad_norm": 1.7528961896896362, + "learning_rate": 4.534005915314768e-05, + "loss": 4.1092, + "step": 33210 + }, + { + "epoch": 0.19751522504519936, + "grad_norm": 1.7320555448532104, + "learning_rate": 4.533978756837358e-05, + "loss": 4.0203, + "step": 33211 + }, + { + "epoch": 0.19752117232848035, + "grad_norm": 1.8663568496704102, + "learning_rate": 4.533951597649908e-05, + "loss": 4.0895, + "step": 33212 + }, + { + "epoch": 0.19752711961176134, + "grad_norm": 1.8251363039016724, + "learning_rate": 4.533924437752426e-05, + "loss": 4.1971, + "step": 33213 + }, + { + "epoch": 0.19753306689504235, + "grad_norm": 1.9213861227035522, + "learning_rate": 4.533897277144922e-05, + "loss": 5.2216, + "step": 33214 + }, + { + "epoch": 0.19753901417832334, + "grad_norm": 1.9868927001953125, + "learning_rate": 4.533870115827405e-05, + "loss": 5.0257, + "step": 33215 + }, + { + "epoch": 0.19754496146160433, + "grad_norm": 1.8457907438278198, + "learning_rate": 4.5338429537998864e-05, + "loss": 4.9828, + "step": 33216 + }, + { + "epoch": 0.19755090874488535, + "grad_norm": 1.5566126108169556, + "learning_rate": 4.5338157910623724e-05, + "loss": 5.0219, + "step": 33217 + }, + { + "epoch": 0.19755685602816633, + "grad_norm": 1.585070252418518, + "learning_rate": 4.533788627614875e-05, + "loss": 4.8923, + "step": 33218 + }, + { + "epoch": 0.19756280331144732, + "grad_norm": 1.589237928390503, + "learning_rate": 4.5337614634574034e-05, + "loss": 4.973, + "step": 33219 + }, + { + "epoch": 0.19756875059472834, + "grad_norm": 1.6661782264709473, + "learning_rate": 4.533734298589967e-05, + "loss": 5.3401, + "step": 33220 + }, + { + "epoch": 0.19757469787800933, + "grad_norm": 1.6391420364379883, + "learning_rate": 4.533707133012574e-05, + "loss": 5.4286, + "step": 33221 + }, + { + "epoch": 0.1975806451612903, + "grad_norm": 1.8194524049758911, + "learning_rate": 4.533679966725235e-05, + "loss": 5.2341, + "step": 33222 + }, + { + "epoch": 0.19758659244457133, + "grad_norm": 1.606191635131836, + "learning_rate": 4.53365279972796e-05, + "loss": 5.0454, + "step": 33223 + }, + { + "epoch": 0.19759253972785232, + "grad_norm": 1.6205066442489624, + "learning_rate": 4.533625632020757e-05, + "loss": 4.772, + "step": 33224 + }, + { + "epoch": 0.1975984870111333, + "grad_norm": 1.5267688035964966, + "learning_rate": 4.533598463603637e-05, + "loss": 5.0322, + "step": 33225 + }, + { + "epoch": 0.19760443429441432, + "grad_norm": 1.5950186252593994, + "learning_rate": 4.533571294476608e-05, + "loss": 5.1335, + "step": 33226 + }, + { + "epoch": 0.1976103815776953, + "grad_norm": 1.5251303911209106, + "learning_rate": 4.5335441246396814e-05, + "loss": 5.2293, + "step": 33227 + }, + { + "epoch": 0.1976163288609763, + "grad_norm": 1.4677468538284302, + "learning_rate": 4.533516954092865e-05, + "loss": 5.0307, + "step": 33228 + }, + { + "epoch": 0.19762227614425731, + "grad_norm": 1.7119927406311035, + "learning_rate": 4.533489782836169e-05, + "loss": 5.0289, + "step": 33229 + }, + { + "epoch": 0.1976282234275383, + "grad_norm": 1.698115587234497, + "learning_rate": 4.533462610869602e-05, + "loss": 4.7599, + "step": 33230 + }, + { + "epoch": 0.1976341707108193, + "grad_norm": 1.7038428783416748, + "learning_rate": 4.5334354381931755e-05, + "loss": 5.3396, + "step": 33231 + }, + { + "epoch": 0.1976401179941003, + "grad_norm": 2.631863594055176, + "learning_rate": 4.533408264806897e-05, + "loss": 4.3216, + "step": 33232 + }, + { + "epoch": 0.1976460652773813, + "grad_norm": 1.6633570194244385, + "learning_rate": 4.533381090710776e-05, + "loss": 5.5176, + "step": 33233 + }, + { + "epoch": 0.19765201256066228, + "grad_norm": 1.5169641971588135, + "learning_rate": 4.533353915904823e-05, + "loss": 5.5365, + "step": 33234 + }, + { + "epoch": 0.1976579598439433, + "grad_norm": 1.8276102542877197, + "learning_rate": 4.533326740389048e-05, + "loss": 5.1131, + "step": 33235 + }, + { + "epoch": 0.1976639071272243, + "grad_norm": 1.5195908546447754, + "learning_rate": 4.533299564163459e-05, + "loss": 5.0087, + "step": 33236 + }, + { + "epoch": 0.19766985441050527, + "grad_norm": 1.7778794765472412, + "learning_rate": 4.533272387228066e-05, + "loss": 4.638, + "step": 33237 + }, + { + "epoch": 0.1976758016937863, + "grad_norm": 1.6670910120010376, + "learning_rate": 4.533245209582879e-05, + "loss": 5.7267, + "step": 33238 + }, + { + "epoch": 0.19768174897706728, + "grad_norm": 1.620630145072937, + "learning_rate": 4.533218031227907e-05, + "loss": 5.6041, + "step": 33239 + }, + { + "epoch": 0.19768769626034827, + "grad_norm": 1.9409407377243042, + "learning_rate": 4.5331908521631594e-05, + "loss": 5.3162, + "step": 33240 + }, + { + "epoch": 0.19769364354362928, + "grad_norm": 1.670377492904663, + "learning_rate": 4.533163672388646e-05, + "loss": 4.9583, + "step": 33241 + }, + { + "epoch": 0.19769959082691027, + "grad_norm": 1.6396856307983398, + "learning_rate": 4.5331364919043764e-05, + "loss": 5.4872, + "step": 33242 + }, + { + "epoch": 0.19770553811019126, + "grad_norm": 1.6345281600952148, + "learning_rate": 4.5331093107103594e-05, + "loss": 5.1595, + "step": 33243 + }, + { + "epoch": 0.19771148539347227, + "grad_norm": 1.724963903427124, + "learning_rate": 4.533082128806605e-05, + "loss": 4.845, + "step": 33244 + }, + { + "epoch": 0.19771743267675326, + "grad_norm": 1.480117917060852, + "learning_rate": 4.5330549461931223e-05, + "loss": 5.2803, + "step": 33245 + }, + { + "epoch": 0.19772337996003425, + "grad_norm": 1.884554386138916, + "learning_rate": 4.533027762869923e-05, + "loss": 5.307, + "step": 33246 + }, + { + "epoch": 0.19772932724331527, + "grad_norm": 1.9123550653457642, + "learning_rate": 4.5330005788370124e-05, + "loss": 4.961, + "step": 33247 + }, + { + "epoch": 0.19773527452659626, + "grad_norm": 1.6469107866287231, + "learning_rate": 4.532973394094403e-05, + "loss": 4.7209, + "step": 33248 + }, + { + "epoch": 0.19774122180987724, + "grad_norm": 1.5761797428131104, + "learning_rate": 4.532946208642104e-05, + "loss": 5.3125, + "step": 33249 + }, + { + "epoch": 0.19774716909315823, + "grad_norm": 1.590433120727539, + "learning_rate": 4.532919022480124e-05, + "loss": 5.0676, + "step": 33250 + }, + { + "epoch": 0.19775311637643925, + "grad_norm": 1.5116517543792725, + "learning_rate": 4.532891835608474e-05, + "loss": 5.055, + "step": 33251 + }, + { + "epoch": 0.19775906365972024, + "grad_norm": 1.588983178138733, + "learning_rate": 4.5328646480271616e-05, + "loss": 5.0159, + "step": 33252 + }, + { + "epoch": 0.19776501094300122, + "grad_norm": 1.7448828220367432, + "learning_rate": 4.532837459736197e-05, + "loss": 5.2739, + "step": 33253 + }, + { + "epoch": 0.19777095822628224, + "grad_norm": 1.6734801530838013, + "learning_rate": 4.53281027073559e-05, + "loss": 5.0657, + "step": 33254 + }, + { + "epoch": 0.19777690550956323, + "grad_norm": 1.8620131015777588, + "learning_rate": 4.53278308102535e-05, + "loss": 5.2255, + "step": 33255 + }, + { + "epoch": 0.19778285279284422, + "grad_norm": 1.940711259841919, + "learning_rate": 4.532755890605487e-05, + "loss": 5.1548, + "step": 33256 + }, + { + "epoch": 0.19778880007612523, + "grad_norm": 1.5663199424743652, + "learning_rate": 4.532728699476009e-05, + "loss": 5.0682, + "step": 33257 + }, + { + "epoch": 0.19779474735940622, + "grad_norm": 1.5207146406173706, + "learning_rate": 4.532701507636927e-05, + "loss": 5.418, + "step": 33258 + }, + { + "epoch": 0.1978006946426872, + "grad_norm": 1.4254070520401, + "learning_rate": 4.53267431508825e-05, + "loss": 5.4406, + "step": 33259 + }, + { + "epoch": 0.19780664192596822, + "grad_norm": 1.7687779664993286, + "learning_rate": 4.5326471218299873e-05, + "loss": 4.6602, + "step": 33260 + }, + { + "epoch": 0.1978125892092492, + "grad_norm": 1.4113342761993408, + "learning_rate": 4.532619927862148e-05, + "loss": 4.7432, + "step": 33261 + }, + { + "epoch": 0.1978185364925302, + "grad_norm": 1.816176176071167, + "learning_rate": 4.5325927331847424e-05, + "loss": 5.2539, + "step": 33262 + }, + { + "epoch": 0.19782448377581122, + "grad_norm": 1.5218030214309692, + "learning_rate": 4.5325655377977796e-05, + "loss": 4.8937, + "step": 33263 + }, + { + "epoch": 0.1978304310590922, + "grad_norm": 1.8468036651611328, + "learning_rate": 4.532538341701269e-05, + "loss": 5.0516, + "step": 33264 + }, + { + "epoch": 0.1978363783423732, + "grad_norm": 1.8353204727172852, + "learning_rate": 4.532511144895221e-05, + "loss": 4.9425, + "step": 33265 + }, + { + "epoch": 0.1978423256256542, + "grad_norm": 1.6375226974487305, + "learning_rate": 4.532483947379644e-05, + "loss": 4.7085, + "step": 33266 + }, + { + "epoch": 0.1978482729089352, + "grad_norm": 1.9428247213363647, + "learning_rate": 4.532456749154548e-05, + "loss": 4.844, + "step": 33267 + }, + { + "epoch": 0.19785422019221618, + "grad_norm": 3.1580700874328613, + "learning_rate": 4.532429550219941e-05, + "loss": 2.9092, + "step": 33268 + }, + { + "epoch": 0.1978601674754972, + "grad_norm": 1.4939215183258057, + "learning_rate": 4.532402350575835e-05, + "loss": 5.4607, + "step": 33269 + }, + { + "epoch": 0.1978661147587782, + "grad_norm": 1.400402307510376, + "learning_rate": 4.532375150222239e-05, + "loss": 5.3641, + "step": 33270 + }, + { + "epoch": 0.19787206204205918, + "grad_norm": 1.6665794849395752, + "learning_rate": 4.532347949159161e-05, + "loss": 4.5916, + "step": 33271 + }, + { + "epoch": 0.1978780093253402, + "grad_norm": 1.910585641860962, + "learning_rate": 4.532320747386612e-05, + "loss": 5.1528, + "step": 33272 + }, + { + "epoch": 0.19788395660862118, + "grad_norm": 1.7386438846588135, + "learning_rate": 4.5322935449045994e-05, + "loss": 4.8547, + "step": 33273 + }, + { + "epoch": 0.19788990389190217, + "grad_norm": 1.7548339366912842, + "learning_rate": 4.532266341713135e-05, + "loss": 4.9357, + "step": 33274 + }, + { + "epoch": 0.19789585117518319, + "grad_norm": 1.6126796007156372, + "learning_rate": 4.532239137812228e-05, + "loss": 4.8952, + "step": 33275 + }, + { + "epoch": 0.19790179845846417, + "grad_norm": 1.598954677581787, + "learning_rate": 4.5322119332018866e-05, + "loss": 4.6828, + "step": 33276 + }, + { + "epoch": 0.19790774574174516, + "grad_norm": 1.8405553102493286, + "learning_rate": 4.532184727882121e-05, + "loss": 4.5404, + "step": 33277 + }, + { + "epoch": 0.19791369302502618, + "grad_norm": 2.304363965988159, + "learning_rate": 4.5321575218529406e-05, + "loss": 4.3031, + "step": 33278 + }, + { + "epoch": 0.19791964030830717, + "grad_norm": 2.450482130050659, + "learning_rate": 4.532130315114355e-05, + "loss": 4.1944, + "step": 33279 + }, + { + "epoch": 0.19792558759158815, + "grad_norm": 2.3713395595550537, + "learning_rate": 4.532103107666374e-05, + "loss": 4.3168, + "step": 33280 + }, + { + "epoch": 0.19793153487486917, + "grad_norm": 2.1909902095794678, + "learning_rate": 4.5320758995090064e-05, + "loss": 4.4707, + "step": 33281 + }, + { + "epoch": 0.19793748215815016, + "grad_norm": 1.8460273742675781, + "learning_rate": 4.5320486906422624e-05, + "loss": 4.7456, + "step": 33282 + }, + { + "epoch": 0.19794342944143115, + "grad_norm": 1.8075324296951294, + "learning_rate": 4.5320214810661514e-05, + "loss": 4.5133, + "step": 33283 + }, + { + "epoch": 0.19794937672471216, + "grad_norm": 1.9076029062271118, + "learning_rate": 4.531994270780683e-05, + "loss": 4.7133, + "step": 33284 + }, + { + "epoch": 0.19795532400799315, + "grad_norm": 1.7290363311767578, + "learning_rate": 4.531967059785865e-05, + "loss": 4.7903, + "step": 33285 + }, + { + "epoch": 0.19796127129127414, + "grad_norm": 2.2225213050842285, + "learning_rate": 4.53193984808171e-05, + "loss": 4.896, + "step": 33286 + }, + { + "epoch": 0.19796721857455515, + "grad_norm": 1.7413650751113892, + "learning_rate": 4.531912635668224e-05, + "loss": 5.7902, + "step": 33287 + }, + { + "epoch": 0.19797316585783614, + "grad_norm": 1.6677063703536987, + "learning_rate": 4.53188542254542e-05, + "loss": 5.6381, + "step": 33288 + }, + { + "epoch": 0.19797911314111713, + "grad_norm": 1.64964759349823, + "learning_rate": 4.531858208713305e-05, + "loss": 4.8681, + "step": 33289 + }, + { + "epoch": 0.19798506042439815, + "grad_norm": 1.789642333984375, + "learning_rate": 4.531830994171889e-05, + "loss": 4.8042, + "step": 33290 + }, + { + "epoch": 0.19799100770767913, + "grad_norm": 1.920061707496643, + "learning_rate": 4.531803778921182e-05, + "loss": 4.7461, + "step": 33291 + }, + { + "epoch": 0.19799695499096012, + "grad_norm": 1.8320075273513794, + "learning_rate": 4.531776562961194e-05, + "loss": 4.6238, + "step": 33292 + }, + { + "epoch": 0.19800290227424114, + "grad_norm": 1.7324212789535522, + "learning_rate": 4.531749346291933e-05, + "loss": 4.5368, + "step": 33293 + }, + { + "epoch": 0.19800884955752213, + "grad_norm": 2.327019453048706, + "learning_rate": 4.531722128913409e-05, + "loss": 4.1915, + "step": 33294 + }, + { + "epoch": 0.19801479684080311, + "grad_norm": 2.1580569744110107, + "learning_rate": 4.531694910825632e-05, + "loss": 4.2753, + "step": 33295 + }, + { + "epoch": 0.19802074412408413, + "grad_norm": 1.9125664234161377, + "learning_rate": 4.5316676920286125e-05, + "loss": 4.9737, + "step": 33296 + }, + { + "epoch": 0.19802669140736512, + "grad_norm": 2.3731091022491455, + "learning_rate": 4.5316404725223575e-05, + "loss": 5.0, + "step": 33297 + }, + { + "epoch": 0.1980326386906461, + "grad_norm": 2.2052502632141113, + "learning_rate": 4.531613252306879e-05, + "loss": 4.9842, + "step": 33298 + }, + { + "epoch": 0.19803858597392712, + "grad_norm": 1.8605939149856567, + "learning_rate": 4.5315860313821846e-05, + "loss": 4.9534, + "step": 33299 + }, + { + "epoch": 0.1980445332572081, + "grad_norm": 1.9243404865264893, + "learning_rate": 4.531558809748284e-05, + "loss": 4.7275, + "step": 33300 + }, + { + "epoch": 0.1980504805404891, + "grad_norm": 1.8417762517929077, + "learning_rate": 4.531531587405188e-05, + "loss": 4.5768, + "step": 33301 + }, + { + "epoch": 0.19805642782377011, + "grad_norm": 2.7929775714874268, + "learning_rate": 4.531504364352904e-05, + "loss": 3.6382, + "step": 33302 + }, + { + "epoch": 0.1980623751070511, + "grad_norm": 2.665148973464966, + "learning_rate": 4.531477140591444e-05, + "loss": 3.6138, + "step": 33303 + }, + { + "epoch": 0.1980683223903321, + "grad_norm": 2.0774621963500977, + "learning_rate": 4.531449916120816e-05, + "loss": 3.5553, + "step": 33304 + }, + { + "epoch": 0.1980742696736131, + "grad_norm": 1.8317457437515259, + "learning_rate": 4.53142269094103e-05, + "loss": 4.8087, + "step": 33305 + }, + { + "epoch": 0.1980802169568941, + "grad_norm": 1.7544660568237305, + "learning_rate": 4.531395465052095e-05, + "loss": 6.1719, + "step": 33306 + }, + { + "epoch": 0.19808616424017508, + "grad_norm": 3.286212205886841, + "learning_rate": 4.5313682384540216e-05, + "loss": 3.6332, + "step": 33307 + }, + { + "epoch": 0.19809211152345607, + "grad_norm": 3.265216112136841, + "learning_rate": 4.531341011146818e-05, + "loss": 2.1208, + "step": 33308 + }, + { + "epoch": 0.1980980588067371, + "grad_norm": 2.458509683609009, + "learning_rate": 4.531313783130494e-05, + "loss": 3.4689, + "step": 33309 + }, + { + "epoch": 0.19810400609001808, + "grad_norm": 2.342417001724243, + "learning_rate": 4.53128655440506e-05, + "loss": 3.4864, + "step": 33310 + }, + { + "epoch": 0.19810995337329906, + "grad_norm": 2.6172118186950684, + "learning_rate": 4.5312593249705236e-05, + "loss": 3.5505, + "step": 33311 + }, + { + "epoch": 0.19811590065658008, + "grad_norm": 2.6422629356384277, + "learning_rate": 4.5312320948268974e-05, + "loss": 3.7501, + "step": 33312 + }, + { + "epoch": 0.19812184793986107, + "grad_norm": 2.1356923580169678, + "learning_rate": 4.5312048639741875e-05, + "loss": 4.1028, + "step": 33313 + }, + { + "epoch": 0.19812779522314206, + "grad_norm": 1.9619426727294922, + "learning_rate": 4.531177632412406e-05, + "loss": 4.461, + "step": 33314 + }, + { + "epoch": 0.19813374250642307, + "grad_norm": 2.336240768432617, + "learning_rate": 4.531150400141561e-05, + "loss": 3.2982, + "step": 33315 + }, + { + "epoch": 0.19813968978970406, + "grad_norm": 2.5709304809570312, + "learning_rate": 4.531123167161662e-05, + "loss": 3.0417, + "step": 33316 + }, + { + "epoch": 0.19814563707298505, + "grad_norm": 3.9337923526763916, + "learning_rate": 4.531095933472719e-05, + "loss": 2.95, + "step": 33317 + }, + { + "epoch": 0.19815158435626606, + "grad_norm": 2.6982581615448, + "learning_rate": 4.5310686990747416e-05, + "loss": 3.5422, + "step": 33318 + }, + { + "epoch": 0.19815753163954705, + "grad_norm": 2.1642324924468994, + "learning_rate": 4.531041463967738e-05, + "loss": 4.9477, + "step": 33319 + }, + { + "epoch": 0.19816347892282804, + "grad_norm": 1.937697410583496, + "learning_rate": 4.53101422815172e-05, + "loss": 4.7786, + "step": 33320 + }, + { + "epoch": 0.19816942620610906, + "grad_norm": 1.599066138267517, + "learning_rate": 4.530986991626696e-05, + "loss": 5.2083, + "step": 33321 + }, + { + "epoch": 0.19817537348939004, + "grad_norm": 1.5987446308135986, + "learning_rate": 4.530959754392675e-05, + "loss": 5.1838, + "step": 33322 + }, + { + "epoch": 0.19818132077267103, + "grad_norm": 1.5494792461395264, + "learning_rate": 4.530932516449668e-05, + "loss": 5.5557, + "step": 33323 + }, + { + "epoch": 0.19818726805595205, + "grad_norm": 1.662477731704712, + "learning_rate": 4.530905277797682e-05, + "loss": 5.4674, + "step": 33324 + }, + { + "epoch": 0.19819321533923304, + "grad_norm": 1.4203627109527588, + "learning_rate": 4.530878038436729e-05, + "loss": 5.7035, + "step": 33325 + }, + { + "epoch": 0.19819916262251402, + "grad_norm": 1.727128267288208, + "learning_rate": 4.5308507983668165e-05, + "loss": 5.0072, + "step": 33326 + }, + { + "epoch": 0.19820510990579504, + "grad_norm": 1.7568631172180176, + "learning_rate": 4.530823557587955e-05, + "loss": 4.7131, + "step": 33327 + }, + { + "epoch": 0.19821105718907603, + "grad_norm": 1.8544484376907349, + "learning_rate": 4.530796316100155e-05, + "loss": 4.6808, + "step": 33328 + }, + { + "epoch": 0.19821700447235702, + "grad_norm": 1.6898458003997803, + "learning_rate": 4.530769073903424e-05, + "loss": 4.8085, + "step": 33329 + }, + { + "epoch": 0.19822295175563803, + "grad_norm": 2.1594486236572266, + "learning_rate": 4.530741830997773e-05, + "loss": 5.1586, + "step": 33330 + }, + { + "epoch": 0.19822889903891902, + "grad_norm": 1.6536179780960083, + "learning_rate": 4.5307145873832116e-05, + "loss": 5.4879, + "step": 33331 + }, + { + "epoch": 0.1982348463222, + "grad_norm": 1.6635406017303467, + "learning_rate": 4.530687343059748e-05, + "loss": 5.5663, + "step": 33332 + }, + { + "epoch": 0.19824079360548102, + "grad_norm": 1.500622272491455, + "learning_rate": 4.530660098027392e-05, + "loss": 5.3621, + "step": 33333 + }, + { + "epoch": 0.198246740888762, + "grad_norm": 1.6053495407104492, + "learning_rate": 4.530632852286154e-05, + "loss": 4.6813, + "step": 33334 + }, + { + "epoch": 0.198252688172043, + "grad_norm": 2.006056308746338, + "learning_rate": 4.5306056058360424e-05, + "loss": 3.9102, + "step": 33335 + }, + { + "epoch": 0.19825863545532402, + "grad_norm": 1.6927076578140259, + "learning_rate": 4.5305783586770686e-05, + "loss": 5.4046, + "step": 33336 + }, + { + "epoch": 0.198264582738605, + "grad_norm": 1.7682117223739624, + "learning_rate": 4.53055111080924e-05, + "loss": 5.3556, + "step": 33337 + }, + { + "epoch": 0.198270530021886, + "grad_norm": 1.7829780578613281, + "learning_rate": 4.5305238622325676e-05, + "loss": 5.2791, + "step": 33338 + }, + { + "epoch": 0.198276477305167, + "grad_norm": 1.6257526874542236, + "learning_rate": 4.53049661294706e-05, + "loss": 5.2301, + "step": 33339 + }, + { + "epoch": 0.198282424588448, + "grad_norm": 1.6963531970977783, + "learning_rate": 4.530469362952727e-05, + "loss": 5.1795, + "step": 33340 + }, + { + "epoch": 0.19828837187172899, + "grad_norm": 1.9438083171844482, + "learning_rate": 4.5304421122495774e-05, + "loss": 5.0384, + "step": 33341 + }, + { + "epoch": 0.19829431915501, + "grad_norm": 1.8972619771957397, + "learning_rate": 4.530414860837623e-05, + "loss": 5.6679, + "step": 33342 + }, + { + "epoch": 0.198300266438291, + "grad_norm": 1.9090536832809448, + "learning_rate": 4.530387608716871e-05, + "loss": 5.5994, + "step": 33343 + }, + { + "epoch": 0.19830621372157198, + "grad_norm": 1.7110793590545654, + "learning_rate": 4.530360355887331e-05, + "loss": 5.4753, + "step": 33344 + }, + { + "epoch": 0.198312161004853, + "grad_norm": 1.8114757537841797, + "learning_rate": 4.5303331023490136e-05, + "loss": 5.456, + "step": 33345 + }, + { + "epoch": 0.19831810828813398, + "grad_norm": 1.671255111694336, + "learning_rate": 4.530305848101928e-05, + "loss": 5.4312, + "step": 33346 + }, + { + "epoch": 0.19832405557141497, + "grad_norm": 3.440305471420288, + "learning_rate": 4.5302785931460836e-05, + "loss": 2.4397, + "step": 33347 + }, + { + "epoch": 0.19833000285469599, + "grad_norm": 3.13948655128479, + "learning_rate": 4.53025133748149e-05, + "loss": 2.306, + "step": 33348 + }, + { + "epoch": 0.19833595013797697, + "grad_norm": 2.2596566677093506, + "learning_rate": 4.5302240811081566e-05, + "loss": 3.5614, + "step": 33349 + }, + { + "epoch": 0.19834189742125796, + "grad_norm": 1.682121753692627, + "learning_rate": 4.530196824026093e-05, + "loss": 4.9683, + "step": 33350 + }, + { + "epoch": 0.19834784470453898, + "grad_norm": 1.9128234386444092, + "learning_rate": 4.530169566235308e-05, + "loss": 5.4938, + "step": 33351 + }, + { + "epoch": 0.19835379198781997, + "grad_norm": 2.0970449447631836, + "learning_rate": 4.530142307735813e-05, + "loss": 5.2809, + "step": 33352 + }, + { + "epoch": 0.19835973927110095, + "grad_norm": 1.815956711769104, + "learning_rate": 4.5301150485276156e-05, + "loss": 4.8106, + "step": 33353 + }, + { + "epoch": 0.19836568655438197, + "grad_norm": 2.473682403564453, + "learning_rate": 4.5300877886107264e-05, + "loss": 3.9659, + "step": 33354 + }, + { + "epoch": 0.19837163383766296, + "grad_norm": 4.671222686767578, + "learning_rate": 4.530060527985154e-05, + "loss": 2.8541, + "step": 33355 + }, + { + "epoch": 0.19837758112094395, + "grad_norm": 1.7974921464920044, + "learning_rate": 4.530033266650908e-05, + "loss": 5.2783, + "step": 33356 + }, + { + "epoch": 0.19838352840422496, + "grad_norm": 4.036770820617676, + "learning_rate": 4.5300060046079996e-05, + "loss": 3.7766, + "step": 33357 + }, + { + "epoch": 0.19838947568750595, + "grad_norm": 3.522930860519409, + "learning_rate": 4.529978741856436e-05, + "loss": 2.3974, + "step": 33358 + }, + { + "epoch": 0.19839542297078694, + "grad_norm": 2.413550615310669, + "learning_rate": 4.5299514783962285e-05, + "loss": 3.1974, + "step": 33359 + }, + { + "epoch": 0.19840137025406795, + "grad_norm": 1.75148606300354, + "learning_rate": 4.529924214227386e-05, + "loss": 4.9708, + "step": 33360 + }, + { + "epoch": 0.19840731753734894, + "grad_norm": 1.5809080600738525, + "learning_rate": 4.5298969493499165e-05, + "loss": 4.8973, + "step": 33361 + }, + { + "epoch": 0.19841326482062993, + "grad_norm": 1.7478617429733276, + "learning_rate": 4.5298696837638325e-05, + "loss": 5.2721, + "step": 33362 + }, + { + "epoch": 0.19841921210391095, + "grad_norm": 1.6357113122940063, + "learning_rate": 4.5298424174691417e-05, + "loss": 5.1701, + "step": 33363 + }, + { + "epoch": 0.19842515938719194, + "grad_norm": 1.5457570552825928, + "learning_rate": 4.5298151504658536e-05, + "loss": 5.1177, + "step": 33364 + }, + { + "epoch": 0.19843110667047292, + "grad_norm": 2.2305829524993896, + "learning_rate": 4.5297878827539784e-05, + "loss": 4.1489, + "step": 33365 + }, + { + "epoch": 0.19843705395375394, + "grad_norm": 3.119000196456909, + "learning_rate": 4.529760614333525e-05, + "loss": 3.8102, + "step": 33366 + }, + { + "epoch": 0.19844300123703493, + "grad_norm": 2.1986236572265625, + "learning_rate": 4.5297333452045025e-05, + "loss": 4.9004, + "step": 33367 + }, + { + "epoch": 0.19844894852031592, + "grad_norm": 1.6912589073181152, + "learning_rate": 4.5297060753669216e-05, + "loss": 5.6423, + "step": 33368 + }, + { + "epoch": 0.1984548958035969, + "grad_norm": 1.681021809577942, + "learning_rate": 4.5296788048207915e-05, + "loss": 4.3601, + "step": 33369 + }, + { + "epoch": 0.19846084308687792, + "grad_norm": 1.6064156293869019, + "learning_rate": 4.529651533566122e-05, + "loss": 4.565, + "step": 33370 + }, + { + "epoch": 0.1984667903701589, + "grad_norm": 1.5751850605010986, + "learning_rate": 4.5296242616029204e-05, + "loss": 5.108, + "step": 33371 + }, + { + "epoch": 0.1984727376534399, + "grad_norm": 1.5508745908737183, + "learning_rate": 4.5295969889312e-05, + "loss": 5.8937, + "step": 33372 + }, + { + "epoch": 0.1984786849367209, + "grad_norm": 1.5728036165237427, + "learning_rate": 4.5295697155509665e-05, + "loss": 5.6068, + "step": 33373 + }, + { + "epoch": 0.1984846322200019, + "grad_norm": 1.8891894817352295, + "learning_rate": 4.5295424414622315e-05, + "loss": 4.8044, + "step": 33374 + }, + { + "epoch": 0.1984905795032829, + "grad_norm": 2.7243154048919678, + "learning_rate": 4.529515166665005e-05, + "loss": 3.5559, + "step": 33375 + }, + { + "epoch": 0.1984965267865639, + "grad_norm": 2.6664438247680664, + "learning_rate": 4.529487891159295e-05, + "loss": 3.6558, + "step": 33376 + }, + { + "epoch": 0.1985024740698449, + "grad_norm": 1.6460233926773071, + "learning_rate": 4.5294606149451125e-05, + "loss": 4.8593, + "step": 33377 + }, + { + "epoch": 0.19850842135312588, + "grad_norm": 1.710748553276062, + "learning_rate": 4.5294333380224655e-05, + "loss": 5.2335, + "step": 33378 + }, + { + "epoch": 0.1985143686364069, + "grad_norm": 1.633082628250122, + "learning_rate": 4.529406060391365e-05, + "loss": 5.0631, + "step": 33379 + }, + { + "epoch": 0.19852031591968788, + "grad_norm": 1.5868422985076904, + "learning_rate": 4.529378782051819e-05, + "loss": 4.4778, + "step": 33380 + }, + { + "epoch": 0.19852626320296887, + "grad_norm": 1.3042185306549072, + "learning_rate": 4.529351503003838e-05, + "loss": 4.5756, + "step": 33381 + }, + { + "epoch": 0.1985322104862499, + "grad_norm": 1.6254199743270874, + "learning_rate": 4.529324223247432e-05, + "loss": 5.177, + "step": 33382 + }, + { + "epoch": 0.19853815776953088, + "grad_norm": 1.570239782333374, + "learning_rate": 4.529296942782609e-05, + "loss": 5.0367, + "step": 33383 + }, + { + "epoch": 0.19854410505281186, + "grad_norm": 1.8509953022003174, + "learning_rate": 4.52926966160938e-05, + "loss": 4.5187, + "step": 33384 + }, + { + "epoch": 0.19855005233609288, + "grad_norm": 1.6336568593978882, + "learning_rate": 4.529242379727754e-05, + "loss": 5.3582, + "step": 33385 + }, + { + "epoch": 0.19855599961937387, + "grad_norm": 1.587899923324585, + "learning_rate": 4.52921509713774e-05, + "loss": 4.8127, + "step": 33386 + }, + { + "epoch": 0.19856194690265486, + "grad_norm": 1.5488510131835938, + "learning_rate": 4.529187813839349e-05, + "loss": 5.5138, + "step": 33387 + }, + { + "epoch": 0.19856789418593587, + "grad_norm": 1.808288335800171, + "learning_rate": 4.5291605298325884e-05, + "loss": 4.5717, + "step": 33388 + }, + { + "epoch": 0.19857384146921686, + "grad_norm": 1.478675365447998, + "learning_rate": 4.5291332451174687e-05, + "loss": 4.53, + "step": 33389 + }, + { + "epoch": 0.19857978875249785, + "grad_norm": 1.9420822858810425, + "learning_rate": 4.5291059596940004e-05, + "loss": 4.5866, + "step": 33390 + }, + { + "epoch": 0.19858573603577886, + "grad_norm": 2.0074143409729004, + "learning_rate": 4.5290786735621916e-05, + "loss": 4.9823, + "step": 33391 + }, + { + "epoch": 0.19859168331905985, + "grad_norm": 1.8389657735824585, + "learning_rate": 4.529051386722053e-05, + "loss": 5.1662, + "step": 33392 + }, + { + "epoch": 0.19859763060234084, + "grad_norm": 1.6590776443481445, + "learning_rate": 4.5290240991735934e-05, + "loss": 5.5359, + "step": 33393 + }, + { + "epoch": 0.19860357788562186, + "grad_norm": 1.7295751571655273, + "learning_rate": 4.5289968109168216e-05, + "loss": 3.9299, + "step": 33394 + }, + { + "epoch": 0.19860952516890285, + "grad_norm": 1.7071540355682373, + "learning_rate": 4.5289695219517486e-05, + "loss": 3.8308, + "step": 33395 + }, + { + "epoch": 0.19861547245218383, + "grad_norm": 1.7689669132232666, + "learning_rate": 4.528942232278383e-05, + "loss": 3.9113, + "step": 33396 + }, + { + "epoch": 0.19862141973546485, + "grad_norm": 1.9830238819122314, + "learning_rate": 4.5289149418967345e-05, + "loss": 4.1391, + "step": 33397 + }, + { + "epoch": 0.19862736701874584, + "grad_norm": 2.3440747261047363, + "learning_rate": 4.5288876508068136e-05, + "loss": 3.2765, + "step": 33398 + }, + { + "epoch": 0.19863331430202683, + "grad_norm": 1.9929230213165283, + "learning_rate": 4.528860359008629e-05, + "loss": 4.2321, + "step": 33399 + }, + { + "epoch": 0.19863926158530784, + "grad_norm": 1.9815763235092163, + "learning_rate": 4.528833066502189e-05, + "loss": 4.7819, + "step": 33400 + }, + { + "epoch": 0.19864520886858883, + "grad_norm": 1.6043485403060913, + "learning_rate": 4.528805773287506e-05, + "loss": 5.1746, + "step": 33401 + }, + { + "epoch": 0.19865115615186982, + "grad_norm": 1.8365287780761719, + "learning_rate": 4.528778479364586e-05, + "loss": 4.8251, + "step": 33402 + }, + { + "epoch": 0.19865710343515083, + "grad_norm": 1.966765284538269, + "learning_rate": 4.5287511847334416e-05, + "loss": 5.223, + "step": 33403 + }, + { + "epoch": 0.19866305071843182, + "grad_norm": 1.7002321481704712, + "learning_rate": 4.528723889394081e-05, + "loss": 4.4082, + "step": 33404 + }, + { + "epoch": 0.1986689980017128, + "grad_norm": 2.144162893295288, + "learning_rate": 4.528696593346513e-05, + "loss": 4.077, + "step": 33405 + }, + { + "epoch": 0.19867494528499383, + "grad_norm": 1.9121687412261963, + "learning_rate": 4.528669296590749e-05, + "loss": 4.2574, + "step": 33406 + }, + { + "epoch": 0.19868089256827481, + "grad_norm": 1.817332148551941, + "learning_rate": 4.5286419991267966e-05, + "loss": 4.4668, + "step": 33407 + }, + { + "epoch": 0.1986868398515558, + "grad_norm": 2.071458578109741, + "learning_rate": 4.528614700954667e-05, + "loss": 5.1306, + "step": 33408 + }, + { + "epoch": 0.19869278713483682, + "grad_norm": 1.7303532361984253, + "learning_rate": 4.528587402074369e-05, + "loss": 4.905, + "step": 33409 + }, + { + "epoch": 0.1986987344181178, + "grad_norm": 1.7372905015945435, + "learning_rate": 4.528560102485912e-05, + "loss": 5.1694, + "step": 33410 + }, + { + "epoch": 0.1987046817013988, + "grad_norm": 1.7425367832183838, + "learning_rate": 4.528532802189306e-05, + "loss": 3.9595, + "step": 33411 + }, + { + "epoch": 0.1987106289846798, + "grad_norm": 1.6367287635803223, + "learning_rate": 4.528505501184559e-05, + "loss": 3.797, + "step": 33412 + }, + { + "epoch": 0.1987165762679608, + "grad_norm": 1.4426088333129883, + "learning_rate": 4.5284781994716826e-05, + "loss": 4.0362, + "step": 33413 + }, + { + "epoch": 0.1987225235512418, + "grad_norm": 1.4077881574630737, + "learning_rate": 4.528450897050685e-05, + "loss": 4.0144, + "step": 33414 + }, + { + "epoch": 0.1987284708345228, + "grad_norm": 1.437828540802002, + "learning_rate": 4.5284235939215765e-05, + "loss": 4.0895, + "step": 33415 + }, + { + "epoch": 0.1987344181178038, + "grad_norm": 1.3500796556472778, + "learning_rate": 4.5283962900843654e-05, + "loss": 4.1622, + "step": 33416 + }, + { + "epoch": 0.19874036540108478, + "grad_norm": 2.1117026805877686, + "learning_rate": 4.528368985539063e-05, + "loss": 5.0076, + "step": 33417 + }, + { + "epoch": 0.1987463126843658, + "grad_norm": 1.337552547454834, + "learning_rate": 4.528341680285678e-05, + "loss": 5.1327, + "step": 33418 + }, + { + "epoch": 0.19875225996764678, + "grad_norm": 2.3471126556396484, + "learning_rate": 4.5283143743242197e-05, + "loss": 3.321, + "step": 33419 + }, + { + "epoch": 0.19875820725092777, + "grad_norm": 2.268986940383911, + "learning_rate": 4.528287067654697e-05, + "loss": 3.1999, + "step": 33420 + }, + { + "epoch": 0.1987641545342088, + "grad_norm": 1.8402795791625977, + "learning_rate": 4.5282597602771215e-05, + "loss": 4.2179, + "step": 33421 + }, + { + "epoch": 0.19877010181748977, + "grad_norm": 1.7201100587844849, + "learning_rate": 4.528232452191501e-05, + "loss": 4.1152, + "step": 33422 + }, + { + "epoch": 0.19877604910077076, + "grad_norm": 1.4088517427444458, + "learning_rate": 4.528205143397846e-05, + "loss": 4.6504, + "step": 33423 + }, + { + "epoch": 0.19878199638405178, + "grad_norm": 1.7721384763717651, + "learning_rate": 4.5281778338961644e-05, + "loss": 5.1871, + "step": 33424 + }, + { + "epoch": 0.19878794366733277, + "grad_norm": 2.0416159629821777, + "learning_rate": 4.528150523686468e-05, + "loss": 4.2345, + "step": 33425 + }, + { + "epoch": 0.19879389095061376, + "grad_norm": 1.943342924118042, + "learning_rate": 4.528123212768764e-05, + "loss": 5.1177, + "step": 33426 + }, + { + "epoch": 0.19879983823389474, + "grad_norm": 1.8057464361190796, + "learning_rate": 4.528095901143063e-05, + "loss": 4.8638, + "step": 33427 + }, + { + "epoch": 0.19880578551717576, + "grad_norm": 1.9637550115585327, + "learning_rate": 4.5280685888093764e-05, + "loss": 4.5566, + "step": 33428 + }, + { + "epoch": 0.19881173280045675, + "grad_norm": 1.7107211351394653, + "learning_rate": 4.5280412757677104e-05, + "loss": 5.3038, + "step": 33429 + }, + { + "epoch": 0.19881768008373774, + "grad_norm": 1.9364093542099, + "learning_rate": 4.5280139620180773e-05, + "loss": 5.1847, + "step": 33430 + }, + { + "epoch": 0.19882362736701875, + "grad_norm": 1.9583579301834106, + "learning_rate": 4.5279866475604846e-05, + "loss": 5.1695, + "step": 33431 + }, + { + "epoch": 0.19882957465029974, + "grad_norm": 1.931999683380127, + "learning_rate": 4.527959332394943e-05, + "loss": 5.2474, + "step": 33432 + }, + { + "epoch": 0.19883552193358073, + "grad_norm": 1.7884893417358398, + "learning_rate": 4.5279320165214623e-05, + "loss": 5.1989, + "step": 33433 + }, + { + "epoch": 0.19884146921686174, + "grad_norm": 1.706418752670288, + "learning_rate": 4.527904699940051e-05, + "loss": 5.076, + "step": 33434 + }, + { + "epoch": 0.19884741650014273, + "grad_norm": 1.7451330423355103, + "learning_rate": 4.5278773826507195e-05, + "loss": 5.1754, + "step": 33435 + }, + { + "epoch": 0.19885336378342372, + "grad_norm": 1.9312299489974976, + "learning_rate": 4.5278500646534764e-05, + "loss": 4.4978, + "step": 33436 + }, + { + "epoch": 0.19885931106670474, + "grad_norm": 2.42375111579895, + "learning_rate": 4.527822745948332e-05, + "loss": 4.2296, + "step": 33437 + }, + { + "epoch": 0.19886525834998572, + "grad_norm": 1.8249690532684326, + "learning_rate": 4.5277954265352956e-05, + "loss": 4.9476, + "step": 33438 + }, + { + "epoch": 0.1988712056332667, + "grad_norm": 1.886839509010315, + "learning_rate": 4.527768106414377e-05, + "loss": 5.082, + "step": 33439 + }, + { + "epoch": 0.19887715291654773, + "grad_norm": 1.6707491874694824, + "learning_rate": 4.527740785585585e-05, + "loss": 4.887, + "step": 33440 + }, + { + "epoch": 0.19888310019982872, + "grad_norm": 1.8287665843963623, + "learning_rate": 4.5277134640489296e-05, + "loss": 5.2154, + "step": 33441 + }, + { + "epoch": 0.1988890474831097, + "grad_norm": 1.7216829061508179, + "learning_rate": 4.5276861418044214e-05, + "loss": 5.4044, + "step": 33442 + }, + { + "epoch": 0.19889499476639072, + "grad_norm": 1.6618791818618774, + "learning_rate": 4.527658818852068e-05, + "loss": 5.4637, + "step": 33443 + }, + { + "epoch": 0.1989009420496717, + "grad_norm": 1.4226763248443604, + "learning_rate": 4.52763149519188e-05, + "loss": 5.2645, + "step": 33444 + }, + { + "epoch": 0.1989068893329527, + "grad_norm": 1.634399175643921, + "learning_rate": 4.527604170823867e-05, + "loss": 4.8305, + "step": 33445 + }, + { + "epoch": 0.1989128366162337, + "grad_norm": 1.6638933420181274, + "learning_rate": 4.5275768457480384e-05, + "loss": 4.9628, + "step": 33446 + }, + { + "epoch": 0.1989187838995147, + "grad_norm": 1.7389144897460938, + "learning_rate": 4.5275495199644036e-05, + "loss": 5.0198, + "step": 33447 + }, + { + "epoch": 0.1989247311827957, + "grad_norm": 1.6871912479400635, + "learning_rate": 4.527522193472972e-05, + "loss": 4.4683, + "step": 33448 + }, + { + "epoch": 0.1989306784660767, + "grad_norm": 1.8285382986068726, + "learning_rate": 4.527494866273753e-05, + "loss": 5.0369, + "step": 33449 + }, + { + "epoch": 0.1989366257493577, + "grad_norm": 1.6555229425430298, + "learning_rate": 4.527467538366758e-05, + "loss": 5.3217, + "step": 33450 + }, + { + "epoch": 0.19894257303263868, + "grad_norm": 2.1087262630462646, + "learning_rate": 4.5274402097519933e-05, + "loss": 4.3458, + "step": 33451 + }, + { + "epoch": 0.1989485203159197, + "grad_norm": 1.8274654150009155, + "learning_rate": 4.5274128804294705e-05, + "loss": 4.9379, + "step": 33452 + }, + { + "epoch": 0.19895446759920069, + "grad_norm": 2.4635887145996094, + "learning_rate": 4.5273855503991994e-05, + "loss": 4.4695, + "step": 33453 + }, + { + "epoch": 0.19896041488248167, + "grad_norm": 1.6194392442703247, + "learning_rate": 4.527358219661189e-05, + "loss": 4.739, + "step": 33454 + }, + { + "epoch": 0.1989663621657627, + "grad_norm": 1.819419026374817, + "learning_rate": 4.527330888215448e-05, + "loss": 4.7175, + "step": 33455 + }, + { + "epoch": 0.19897230944904368, + "grad_norm": 1.6347033977508545, + "learning_rate": 4.527303556061987e-05, + "loss": 4.9085, + "step": 33456 + }, + { + "epoch": 0.19897825673232467, + "grad_norm": 1.6178079843521118, + "learning_rate": 4.527276223200816e-05, + "loss": 5.0249, + "step": 33457 + }, + { + "epoch": 0.19898420401560568, + "grad_norm": 2.922417163848877, + "learning_rate": 4.5272488896319434e-05, + "loss": 4.4398, + "step": 33458 + }, + { + "epoch": 0.19899015129888667, + "grad_norm": 2.004303455352783, + "learning_rate": 4.527221555355379e-05, + "loss": 4.417, + "step": 33459 + }, + { + "epoch": 0.19899609858216766, + "grad_norm": 1.8364394903182983, + "learning_rate": 4.5271942203711326e-05, + "loss": 4.9475, + "step": 33460 + }, + { + "epoch": 0.19900204586544867, + "grad_norm": 1.5880411863327026, + "learning_rate": 4.5271668846792134e-05, + "loss": 5.062, + "step": 33461 + }, + { + "epoch": 0.19900799314872966, + "grad_norm": 1.5956658124923706, + "learning_rate": 4.5271395482796306e-05, + "loss": 5.0605, + "step": 33462 + }, + { + "epoch": 0.19901394043201065, + "grad_norm": 1.5776112079620361, + "learning_rate": 4.527112211172396e-05, + "loss": 4.8766, + "step": 33463 + }, + { + "epoch": 0.19901988771529167, + "grad_norm": 1.6271411180496216, + "learning_rate": 4.5270848733575166e-05, + "loss": 4.997, + "step": 33464 + }, + { + "epoch": 0.19902583499857265, + "grad_norm": 1.4883582592010498, + "learning_rate": 4.527057534835002e-05, + "loss": 4.9727, + "step": 33465 + }, + { + "epoch": 0.19903178228185364, + "grad_norm": 1.74699866771698, + "learning_rate": 4.527030195604864e-05, + "loss": 4.5348, + "step": 33466 + }, + { + "epoch": 0.19903772956513466, + "grad_norm": 1.9776692390441895, + "learning_rate": 4.52700285566711e-05, + "loss": 4.5182, + "step": 33467 + }, + { + "epoch": 0.19904367684841565, + "grad_norm": 1.4299882650375366, + "learning_rate": 4.52697551502175e-05, + "loss": 5.0135, + "step": 33468 + }, + { + "epoch": 0.19904962413169663, + "grad_norm": 2.07140851020813, + "learning_rate": 4.5269481736687945e-05, + "loss": 4.2703, + "step": 33469 + }, + { + "epoch": 0.19905557141497765, + "grad_norm": 2.40633225440979, + "learning_rate": 4.526920831608252e-05, + "loss": 3.8694, + "step": 33470 + }, + { + "epoch": 0.19906151869825864, + "grad_norm": 1.616706132888794, + "learning_rate": 4.526893488840132e-05, + "loss": 5.2851, + "step": 33471 + }, + { + "epoch": 0.19906746598153963, + "grad_norm": 2.0044960975646973, + "learning_rate": 4.526866145364445e-05, + "loss": 5.5549, + "step": 33472 + }, + { + "epoch": 0.19907341326482064, + "grad_norm": 1.6400177478790283, + "learning_rate": 4.5268388011812e-05, + "loss": 4.9827, + "step": 33473 + }, + { + "epoch": 0.19907936054810163, + "grad_norm": 1.522547960281372, + "learning_rate": 4.526811456290406e-05, + "loss": 4.961, + "step": 33474 + }, + { + "epoch": 0.19908530783138262, + "grad_norm": 2.100389242172241, + "learning_rate": 4.5267841106920724e-05, + "loss": 4.6502, + "step": 33475 + }, + { + "epoch": 0.19909125511466363, + "grad_norm": 1.7050951719284058, + "learning_rate": 4.526756764386211e-05, + "loss": 4.9722, + "step": 33476 + }, + { + "epoch": 0.19909720239794462, + "grad_norm": 2.1410129070281982, + "learning_rate": 4.526729417372829e-05, + "loss": 5.023, + "step": 33477 + }, + { + "epoch": 0.1991031496812256, + "grad_norm": 1.8448638916015625, + "learning_rate": 4.526702069651937e-05, + "loss": 5.2159, + "step": 33478 + }, + { + "epoch": 0.19910909696450663, + "grad_norm": 2.0991101264953613, + "learning_rate": 4.526674721223544e-05, + "loss": 4.1942, + "step": 33479 + }, + { + "epoch": 0.19911504424778761, + "grad_norm": 1.3801043033599854, + "learning_rate": 4.526647372087659e-05, + "loss": 5.1376, + "step": 33480 + }, + { + "epoch": 0.1991209915310686, + "grad_norm": 1.584425926208496, + "learning_rate": 4.526620022244293e-05, + "loss": 4.5196, + "step": 33481 + }, + { + "epoch": 0.19912693881434962, + "grad_norm": 1.665459394454956, + "learning_rate": 4.5265926716934556e-05, + "loss": 5.1194, + "step": 33482 + }, + { + "epoch": 0.1991328860976306, + "grad_norm": 1.5680651664733887, + "learning_rate": 4.526565320435155e-05, + "loss": 4.9814, + "step": 33483 + }, + { + "epoch": 0.1991388333809116, + "grad_norm": 1.9074794054031372, + "learning_rate": 4.526537968469401e-05, + "loss": 5.2678, + "step": 33484 + }, + { + "epoch": 0.19914478066419258, + "grad_norm": 1.5251576900482178, + "learning_rate": 4.526510615796205e-05, + "loss": 5.0521, + "step": 33485 + }, + { + "epoch": 0.1991507279474736, + "grad_norm": 1.5786724090576172, + "learning_rate": 4.526483262415573e-05, + "loss": 4.8732, + "step": 33486 + }, + { + "epoch": 0.1991566752307546, + "grad_norm": 1.6850212812423706, + "learning_rate": 4.5264559083275185e-05, + "loss": 4.8264, + "step": 33487 + }, + { + "epoch": 0.19916262251403558, + "grad_norm": 1.9387089014053345, + "learning_rate": 4.526428553532048e-05, + "loss": 4.6695, + "step": 33488 + }, + { + "epoch": 0.1991685697973166, + "grad_norm": 1.6213630437850952, + "learning_rate": 4.5264011980291724e-05, + "loss": 4.8366, + "step": 33489 + }, + { + "epoch": 0.19917451708059758, + "grad_norm": 1.4320731163024902, + "learning_rate": 4.526373841818901e-05, + "loss": 5.0784, + "step": 33490 + }, + { + "epoch": 0.19918046436387857, + "grad_norm": 1.5601176023483276, + "learning_rate": 4.5263464849012436e-05, + "loss": 4.8712, + "step": 33491 + }, + { + "epoch": 0.19918641164715958, + "grad_norm": 1.610245943069458, + "learning_rate": 4.52631912727621e-05, + "loss": 5.0412, + "step": 33492 + }, + { + "epoch": 0.19919235893044057, + "grad_norm": 1.4566705226898193, + "learning_rate": 4.5262917689438086e-05, + "loss": 4.8381, + "step": 33493 + }, + { + "epoch": 0.19919830621372156, + "grad_norm": 2.0661633014678955, + "learning_rate": 4.52626440990405e-05, + "loss": 4.2362, + "step": 33494 + }, + { + "epoch": 0.19920425349700258, + "grad_norm": 2.077457904815674, + "learning_rate": 4.526237050156944e-05, + "loss": 4.9774, + "step": 33495 + }, + { + "epoch": 0.19921020078028356, + "grad_norm": 1.723219394683838, + "learning_rate": 4.5262096897024985e-05, + "loss": 5.2097, + "step": 33496 + }, + { + "epoch": 0.19921614806356455, + "grad_norm": 1.7461673021316528, + "learning_rate": 4.526182328540725e-05, + "loss": 4.9274, + "step": 33497 + }, + { + "epoch": 0.19922209534684557, + "grad_norm": 2.2677931785583496, + "learning_rate": 4.526154966671632e-05, + "loss": 4.0952, + "step": 33498 + }, + { + "epoch": 0.19922804263012656, + "grad_norm": 3.0971813201904297, + "learning_rate": 4.526127604095229e-05, + "loss": 3.9733, + "step": 33499 + }, + { + "epoch": 0.19923398991340754, + "grad_norm": 3.0695557594299316, + "learning_rate": 4.526100240811526e-05, + "loss": 3.4307, + "step": 33500 + }, + { + "epoch": 0.19923993719668856, + "grad_norm": 2.502638101577759, + "learning_rate": 4.526072876820532e-05, + "loss": 4.0434, + "step": 33501 + }, + { + "epoch": 0.19924588447996955, + "grad_norm": 1.444030523300171, + "learning_rate": 4.5260455121222566e-05, + "loss": 5.0315, + "step": 33502 + }, + { + "epoch": 0.19925183176325054, + "grad_norm": 1.5067824125289917, + "learning_rate": 4.526018146716711e-05, + "loss": 4.7649, + "step": 33503 + }, + { + "epoch": 0.19925777904653155, + "grad_norm": 1.5262528657913208, + "learning_rate": 4.525990780603903e-05, + "loss": 4.9649, + "step": 33504 + }, + { + "epoch": 0.19926372632981254, + "grad_norm": 1.6207854747772217, + "learning_rate": 4.525963413783841e-05, + "loss": 4.7577, + "step": 33505 + }, + { + "epoch": 0.19926967361309353, + "grad_norm": 2.1585114002227783, + "learning_rate": 4.5259360462565377e-05, + "loss": 4.3194, + "step": 33506 + }, + { + "epoch": 0.19927562089637454, + "grad_norm": 3.0893638134002686, + "learning_rate": 4.525908678022001e-05, + "loss": 3.4338, + "step": 33507 + }, + { + "epoch": 0.19928156817965553, + "grad_norm": 3.0618252754211426, + "learning_rate": 4.5258813090802396e-05, + "loss": 3.6044, + "step": 33508 + }, + { + "epoch": 0.19928751546293652, + "grad_norm": 3.0148963928222656, + "learning_rate": 4.525853939431264e-05, + "loss": 3.6999, + "step": 33509 + }, + { + "epoch": 0.19929346274621754, + "grad_norm": 1.7465107440948486, + "learning_rate": 4.5258265690750846e-05, + "loss": 4.6342, + "step": 33510 + }, + { + "epoch": 0.19929941002949852, + "grad_norm": 1.6526566743850708, + "learning_rate": 4.52579919801171e-05, + "loss": 5.1376, + "step": 33511 + }, + { + "epoch": 0.1993053573127795, + "grad_norm": 1.583158254623413, + "learning_rate": 4.525771826241149e-05, + "loss": 4.8617, + "step": 33512 + }, + { + "epoch": 0.19931130459606053, + "grad_norm": 1.6602866649627686, + "learning_rate": 4.5257444537634124e-05, + "loss": 4.5414, + "step": 33513 + }, + { + "epoch": 0.19931725187934152, + "grad_norm": 1.583927035331726, + "learning_rate": 4.5257170805785095e-05, + "loss": 4.8343, + "step": 33514 + }, + { + "epoch": 0.1993231991626225, + "grad_norm": 1.6319681406021118, + "learning_rate": 4.52568970668645e-05, + "loss": 5.2782, + "step": 33515 + }, + { + "epoch": 0.19932914644590352, + "grad_norm": 1.7109445333480835, + "learning_rate": 4.5256623320872424e-05, + "loss": 4.9891, + "step": 33516 + }, + { + "epoch": 0.1993350937291845, + "grad_norm": 1.7144900560379028, + "learning_rate": 4.525634956780897e-05, + "loss": 4.9999, + "step": 33517 + }, + { + "epoch": 0.1993410410124655, + "grad_norm": 1.9427156448364258, + "learning_rate": 4.5256075807674233e-05, + "loss": 5.1138, + "step": 33518 + }, + { + "epoch": 0.1993469882957465, + "grad_norm": 1.6421605348587036, + "learning_rate": 4.525580204046832e-05, + "loss": 4.9218, + "step": 33519 + }, + { + "epoch": 0.1993529355790275, + "grad_norm": 1.7899574041366577, + "learning_rate": 4.52555282661913e-05, + "loss": 4.5285, + "step": 33520 + }, + { + "epoch": 0.1993588828623085, + "grad_norm": 1.706308364868164, + "learning_rate": 4.52552544848433e-05, + "loss": 5.1926, + "step": 33521 + }, + { + "epoch": 0.1993648301455895, + "grad_norm": 2.0579419136047363, + "learning_rate": 4.5254980696424396e-05, + "loss": 4.8708, + "step": 33522 + }, + { + "epoch": 0.1993707774288705, + "grad_norm": 2.4866833686828613, + "learning_rate": 4.5254706900934684e-05, + "loss": 4.33, + "step": 33523 + }, + { + "epoch": 0.19937672471215148, + "grad_norm": 1.4279406070709229, + "learning_rate": 4.525443309837426e-05, + "loss": 4.7774, + "step": 33524 + }, + { + "epoch": 0.1993826719954325, + "grad_norm": 1.9905481338500977, + "learning_rate": 4.525415928874324e-05, + "loss": 5.0473, + "step": 33525 + }, + { + "epoch": 0.19938861927871349, + "grad_norm": 1.6799120903015137, + "learning_rate": 4.525388547204168e-05, + "loss": 4.8557, + "step": 33526 + }, + { + "epoch": 0.19939456656199447, + "grad_norm": 1.8065446615219116, + "learning_rate": 4.525361164826971e-05, + "loss": 5.3703, + "step": 33527 + }, + { + "epoch": 0.1994005138452755, + "grad_norm": 1.5986427068710327, + "learning_rate": 4.525333781742741e-05, + "loss": 5.0066, + "step": 33528 + }, + { + "epoch": 0.19940646112855648, + "grad_norm": 2.090648889541626, + "learning_rate": 4.525306397951488e-05, + "loss": 4.8344, + "step": 33529 + }, + { + "epoch": 0.19941240841183747, + "grad_norm": 1.7685662508010864, + "learning_rate": 4.525279013453221e-05, + "loss": 4.5956, + "step": 33530 + }, + { + "epoch": 0.19941835569511848, + "grad_norm": 1.6398029327392578, + "learning_rate": 4.525251628247951e-05, + "loss": 5.3404, + "step": 33531 + }, + { + "epoch": 0.19942430297839947, + "grad_norm": 1.805405616760254, + "learning_rate": 4.525224242335685e-05, + "loss": 5.2919, + "step": 33532 + }, + { + "epoch": 0.19943025026168046, + "grad_norm": 1.791210651397705, + "learning_rate": 4.525196855716435e-05, + "loss": 5.3577, + "step": 33533 + }, + { + "epoch": 0.19943619754496147, + "grad_norm": 1.7393286228179932, + "learning_rate": 4.52516946839021e-05, + "loss": 5.2954, + "step": 33534 + }, + { + "epoch": 0.19944214482824246, + "grad_norm": 1.9773000478744507, + "learning_rate": 4.525142080357019e-05, + "loss": 4.7362, + "step": 33535 + }, + { + "epoch": 0.19944809211152345, + "grad_norm": 1.4539952278137207, + "learning_rate": 4.5251146916168715e-05, + "loss": 4.9983, + "step": 33536 + }, + { + "epoch": 0.19945403939480447, + "grad_norm": 1.7288161516189575, + "learning_rate": 4.525087302169778e-05, + "loss": 5.1685, + "step": 33537 + }, + { + "epoch": 0.19945998667808545, + "grad_norm": 1.477931261062622, + "learning_rate": 4.525059912015748e-05, + "loss": 4.9073, + "step": 33538 + }, + { + "epoch": 0.19946593396136644, + "grad_norm": 2.294431209564209, + "learning_rate": 4.525032521154789e-05, + "loss": 4.3584, + "step": 33539 + }, + { + "epoch": 0.19947188124464746, + "grad_norm": 1.388110876083374, + "learning_rate": 4.525005129586913e-05, + "loss": 5.2412, + "step": 33540 + }, + { + "epoch": 0.19947782852792845, + "grad_norm": 1.660605788230896, + "learning_rate": 4.5249777373121285e-05, + "loss": 4.7119, + "step": 33541 + }, + { + "epoch": 0.19948377581120944, + "grad_norm": 1.37186861038208, + "learning_rate": 4.524950344330445e-05, + "loss": 5.1161, + "step": 33542 + }, + { + "epoch": 0.19948972309449042, + "grad_norm": 2.0066730976104736, + "learning_rate": 4.5249229506418725e-05, + "loss": 4.9816, + "step": 33543 + }, + { + "epoch": 0.19949567037777144, + "grad_norm": 1.7703311443328857, + "learning_rate": 4.52489555624642e-05, + "loss": 5.0812, + "step": 33544 + }, + { + "epoch": 0.19950161766105243, + "grad_norm": 1.6476131677627563, + "learning_rate": 4.524868161144098e-05, + "loss": 5.1316, + "step": 33545 + }, + { + "epoch": 0.19950756494433342, + "grad_norm": 1.8000843524932861, + "learning_rate": 4.524840765334915e-05, + "loss": 5.1469, + "step": 33546 + }, + { + "epoch": 0.19951351222761443, + "grad_norm": 2.017563581466675, + "learning_rate": 4.524813368818881e-05, + "loss": 5.2712, + "step": 33547 + }, + { + "epoch": 0.19951945951089542, + "grad_norm": 1.9950426816940308, + "learning_rate": 4.524785971596006e-05, + "loss": 5.0172, + "step": 33548 + }, + { + "epoch": 0.1995254067941764, + "grad_norm": 2.131312370300293, + "learning_rate": 4.5247585736662985e-05, + "loss": 4.4589, + "step": 33549 + }, + { + "epoch": 0.19953135407745742, + "grad_norm": 2.1414794921875, + "learning_rate": 4.524731175029769e-05, + "loss": 4.1757, + "step": 33550 + }, + { + "epoch": 0.1995373013607384, + "grad_norm": 1.6311516761779785, + "learning_rate": 4.524703775686426e-05, + "loss": 4.7597, + "step": 33551 + }, + { + "epoch": 0.1995432486440194, + "grad_norm": 1.5711687803268433, + "learning_rate": 4.524676375636281e-05, + "loss": 3.9699, + "step": 33552 + }, + { + "epoch": 0.19954919592730042, + "grad_norm": 2.298886299133301, + "learning_rate": 4.524648974879342e-05, + "loss": 3.5056, + "step": 33553 + }, + { + "epoch": 0.1995551432105814, + "grad_norm": 1.629654049873352, + "learning_rate": 4.5246215734156186e-05, + "loss": 5.0122, + "step": 33554 + }, + { + "epoch": 0.1995610904938624, + "grad_norm": 1.8879503011703491, + "learning_rate": 4.5245941712451215e-05, + "loss": 4.5958, + "step": 33555 + }, + { + "epoch": 0.1995670377771434, + "grad_norm": 1.9116814136505127, + "learning_rate": 4.524566768367859e-05, + "loss": 4.9318, + "step": 33556 + }, + { + "epoch": 0.1995729850604244, + "grad_norm": 1.3296679258346558, + "learning_rate": 4.524539364783841e-05, + "loss": 4.2284, + "step": 33557 + }, + { + "epoch": 0.19957893234370538, + "grad_norm": 1.4925459623336792, + "learning_rate": 4.5245119604930775e-05, + "loss": 5.0751, + "step": 33558 + }, + { + "epoch": 0.1995848796269864, + "grad_norm": 1.524156093597412, + "learning_rate": 4.5244845554955774e-05, + "loss": 3.9814, + "step": 33559 + }, + { + "epoch": 0.1995908269102674, + "grad_norm": 1.7568142414093018, + "learning_rate": 4.524457149791351e-05, + "loss": 3.994, + "step": 33560 + }, + { + "epoch": 0.19959677419354838, + "grad_norm": 1.3893617391586304, + "learning_rate": 4.524429743380407e-05, + "loss": 3.9492, + "step": 33561 + }, + { + "epoch": 0.1996027214768294, + "grad_norm": 1.5644930601119995, + "learning_rate": 4.524402336262756e-05, + "loss": 4.4825, + "step": 33562 + }, + { + "epoch": 0.19960866876011038, + "grad_norm": 1.7970536947250366, + "learning_rate": 4.524374928438407e-05, + "loss": 3.8519, + "step": 33563 + }, + { + "epoch": 0.19961461604339137, + "grad_norm": 1.5144481658935547, + "learning_rate": 4.52434751990737e-05, + "loss": 4.4697, + "step": 33564 + }, + { + "epoch": 0.19962056332667238, + "grad_norm": 1.679702639579773, + "learning_rate": 4.524320110669654e-05, + "loss": 4.8946, + "step": 33565 + }, + { + "epoch": 0.19962651060995337, + "grad_norm": 1.7595206499099731, + "learning_rate": 4.524292700725268e-05, + "loss": 5.0111, + "step": 33566 + }, + { + "epoch": 0.19963245789323436, + "grad_norm": 1.3525060415267944, + "learning_rate": 4.524265290074223e-05, + "loss": 4.9942, + "step": 33567 + }, + { + "epoch": 0.19963840517651538, + "grad_norm": 1.6003968715667725, + "learning_rate": 4.524237878716529e-05, + "loss": 4.4323, + "step": 33568 + }, + { + "epoch": 0.19964435245979636, + "grad_norm": 1.654555082321167, + "learning_rate": 4.524210466652192e-05, + "loss": 3.804, + "step": 33569 + }, + { + "epoch": 0.19965029974307735, + "grad_norm": 1.7716010808944702, + "learning_rate": 4.524183053881226e-05, + "loss": 3.8744, + "step": 33570 + }, + { + "epoch": 0.19965624702635837, + "grad_norm": 1.7306915521621704, + "learning_rate": 4.524155640403638e-05, + "loss": 3.8362, + "step": 33571 + }, + { + "epoch": 0.19966219430963936, + "grad_norm": 1.5759642124176025, + "learning_rate": 4.524128226219438e-05, + "loss": 3.8123, + "step": 33572 + }, + { + "epoch": 0.19966814159292035, + "grad_norm": 1.6143770217895508, + "learning_rate": 4.524100811328636e-05, + "loss": 4.1127, + "step": 33573 + }, + { + "epoch": 0.19967408887620136, + "grad_norm": 1.6612343788146973, + "learning_rate": 4.524073395731241e-05, + "loss": 5.2288, + "step": 33574 + }, + { + "epoch": 0.19968003615948235, + "grad_norm": 2.272780418395996, + "learning_rate": 4.524045979427263e-05, + "loss": 3.6138, + "step": 33575 + }, + { + "epoch": 0.19968598344276334, + "grad_norm": 1.8799057006835938, + "learning_rate": 4.524018562416712e-05, + "loss": 3.7476, + "step": 33576 + }, + { + "epoch": 0.19969193072604435, + "grad_norm": 1.7091578245162964, + "learning_rate": 4.5239911446995966e-05, + "loss": 3.7152, + "step": 33577 + }, + { + "epoch": 0.19969787800932534, + "grad_norm": 1.7033981084823608, + "learning_rate": 4.523963726275926e-05, + "loss": 4.178, + "step": 33578 + }, + { + "epoch": 0.19970382529260633, + "grad_norm": 1.5857266187667847, + "learning_rate": 4.523936307145712e-05, + "loss": 4.2662, + "step": 33579 + }, + { + "epoch": 0.19970977257588735, + "grad_norm": 1.5587173700332642, + "learning_rate": 4.523908887308962e-05, + "loss": 4.7286, + "step": 33580 + }, + { + "epoch": 0.19971571985916833, + "grad_norm": 1.6295536756515503, + "learning_rate": 4.523881466765686e-05, + "loss": 4.7793, + "step": 33581 + }, + { + "epoch": 0.19972166714244932, + "grad_norm": 1.4184001684188843, + "learning_rate": 4.523854045515895e-05, + "loss": 4.4593, + "step": 33582 + }, + { + "epoch": 0.19972761442573034, + "grad_norm": 1.3835517168045044, + "learning_rate": 4.5238266235595964e-05, + "loss": 5.4586, + "step": 33583 + }, + { + "epoch": 0.19973356170901133, + "grad_norm": 2.0789854526519775, + "learning_rate": 4.523799200896801e-05, + "loss": 4.8789, + "step": 33584 + }, + { + "epoch": 0.19973950899229231, + "grad_norm": 1.995231032371521, + "learning_rate": 4.5237717775275184e-05, + "loss": 5.0695, + "step": 33585 + }, + { + "epoch": 0.19974545627557333, + "grad_norm": 1.5125774145126343, + "learning_rate": 4.523744353451758e-05, + "loss": 5.1086, + "step": 33586 + }, + { + "epoch": 0.19975140355885432, + "grad_norm": 1.6523572206497192, + "learning_rate": 4.523716928669529e-05, + "loss": 4.886, + "step": 33587 + }, + { + "epoch": 0.1997573508421353, + "grad_norm": 1.6928048133850098, + "learning_rate": 4.5236895031808425e-05, + "loss": 5.1758, + "step": 33588 + }, + { + "epoch": 0.19976329812541632, + "grad_norm": 1.6727235317230225, + "learning_rate": 4.523662076985706e-05, + "loss": 5.1094, + "step": 33589 + }, + { + "epoch": 0.1997692454086973, + "grad_norm": 1.4500248432159424, + "learning_rate": 4.5236346500841297e-05, + "loss": 4.9389, + "step": 33590 + }, + { + "epoch": 0.1997751926919783, + "grad_norm": 1.9560678005218506, + "learning_rate": 4.523607222476124e-05, + "loss": 4.6893, + "step": 33591 + }, + { + "epoch": 0.19978113997525931, + "grad_norm": 2.10848331451416, + "learning_rate": 4.523579794161697e-05, + "loss": 4.0417, + "step": 33592 + }, + { + "epoch": 0.1997870872585403, + "grad_norm": 2.400477647781372, + "learning_rate": 4.523552365140861e-05, + "loss": 3.408, + "step": 33593 + }, + { + "epoch": 0.1997930345418213, + "grad_norm": 1.886122226715088, + "learning_rate": 4.523524935413622e-05, + "loss": 4.6157, + "step": 33594 + }, + { + "epoch": 0.1997989818251023, + "grad_norm": 1.5088223218917847, + "learning_rate": 4.523497504979992e-05, + "loss": 5.0846, + "step": 33595 + }, + { + "epoch": 0.1998049291083833, + "grad_norm": 1.4798957109451294, + "learning_rate": 4.52347007383998e-05, + "loss": 5.0978, + "step": 33596 + }, + { + "epoch": 0.19981087639166428, + "grad_norm": 1.7828933000564575, + "learning_rate": 4.523442641993596e-05, + "loss": 4.4761, + "step": 33597 + }, + { + "epoch": 0.1998168236749453, + "grad_norm": 2.1810219287872314, + "learning_rate": 4.523415209440848e-05, + "loss": 3.5858, + "step": 33598 + }, + { + "epoch": 0.1998227709582263, + "grad_norm": 2.2807984352111816, + "learning_rate": 4.523387776181747e-05, + "loss": 3.5799, + "step": 33599 + }, + { + "epoch": 0.19982871824150727, + "grad_norm": 2.3635599613189697, + "learning_rate": 4.523360342216303e-05, + "loss": 3.6036, + "step": 33600 + }, + { + "epoch": 0.19983466552478826, + "grad_norm": 2.358201503753662, + "learning_rate": 4.5233329075445244e-05, + "loss": 3.5597, + "step": 33601 + }, + { + "epoch": 0.19984061280806928, + "grad_norm": 2.496837854385376, + "learning_rate": 4.523305472166421e-05, + "loss": 3.4199, + "step": 33602 + }, + { + "epoch": 0.19984656009135027, + "grad_norm": 2.0924534797668457, + "learning_rate": 4.523278036082003e-05, + "loss": 4.3533, + "step": 33603 + }, + { + "epoch": 0.19985250737463126, + "grad_norm": 1.5738506317138672, + "learning_rate": 4.523250599291279e-05, + "loss": 5.2645, + "step": 33604 + }, + { + "epoch": 0.19985845465791227, + "grad_norm": 1.8330590724945068, + "learning_rate": 4.523223161794259e-05, + "loss": 4.6044, + "step": 33605 + }, + { + "epoch": 0.19986440194119326, + "grad_norm": 2.3316526412963867, + "learning_rate": 4.523195723590953e-05, + "loss": 3.661, + "step": 33606 + }, + { + "epoch": 0.19987034922447425, + "grad_norm": 1.7145735025405884, + "learning_rate": 4.52316828468137e-05, + "loss": 4.5833, + "step": 33607 + }, + { + "epoch": 0.19987629650775526, + "grad_norm": 2.236112117767334, + "learning_rate": 4.5231408450655196e-05, + "loss": 5.1055, + "step": 33608 + }, + { + "epoch": 0.19988224379103625, + "grad_norm": 2.227168321609497, + "learning_rate": 4.5231134047434124e-05, + "loss": 4.0876, + "step": 33609 + }, + { + "epoch": 0.19988819107431724, + "grad_norm": 1.6515976190567017, + "learning_rate": 4.523085963715057e-05, + "loss": 4.897, + "step": 33610 + }, + { + "epoch": 0.19989413835759826, + "grad_norm": 1.844726800918579, + "learning_rate": 4.5230585219804636e-05, + "loss": 4.9802, + "step": 33611 + }, + { + "epoch": 0.19990008564087924, + "grad_norm": 1.967348575592041, + "learning_rate": 4.52303107953964e-05, + "loss": 5.0002, + "step": 33612 + }, + { + "epoch": 0.19990603292416023, + "grad_norm": 1.6869394779205322, + "learning_rate": 4.523003636392599e-05, + "loss": 4.5466, + "step": 33613 + }, + { + "epoch": 0.19991198020744125, + "grad_norm": 1.9090338945388794, + "learning_rate": 4.522976192539347e-05, + "loss": 4.4996, + "step": 33614 + }, + { + "epoch": 0.19991792749072224, + "grad_norm": 1.6536940336227417, + "learning_rate": 4.522948747979895e-05, + "loss": 4.7394, + "step": 33615 + }, + { + "epoch": 0.19992387477400322, + "grad_norm": 1.6711348295211792, + "learning_rate": 4.5229213027142526e-05, + "loss": 4.2212, + "step": 33616 + }, + { + "epoch": 0.19992982205728424, + "grad_norm": 1.4655362367630005, + "learning_rate": 4.5228938567424295e-05, + "loss": 4.163, + "step": 33617 + }, + { + "epoch": 0.19993576934056523, + "grad_norm": 1.509748935699463, + "learning_rate": 4.522866410064435e-05, + "loss": 4.6835, + "step": 33618 + }, + { + "epoch": 0.19994171662384622, + "grad_norm": 1.8132991790771484, + "learning_rate": 4.5228389626802794e-05, + "loss": 5.5276, + "step": 33619 + }, + { + "epoch": 0.19994766390712723, + "grad_norm": 2.3421835899353027, + "learning_rate": 4.5228115145899707e-05, + "loss": 3.8201, + "step": 33620 + }, + { + "epoch": 0.19995361119040822, + "grad_norm": 1.4546209573745728, + "learning_rate": 4.52278406579352e-05, + "loss": 5.174, + "step": 33621 + }, + { + "epoch": 0.1999595584736892, + "grad_norm": 1.5802754163742065, + "learning_rate": 4.522756616290935e-05, + "loss": 5.3047, + "step": 33622 + }, + { + "epoch": 0.19996550575697022, + "grad_norm": 1.6700994968414307, + "learning_rate": 4.5227291660822276e-05, + "loss": 5.362, + "step": 33623 + }, + { + "epoch": 0.1999714530402512, + "grad_norm": 1.743464469909668, + "learning_rate": 4.522701715167407e-05, + "loss": 5.1647, + "step": 33624 + }, + { + "epoch": 0.1999774003235322, + "grad_norm": 1.8635927438735962, + "learning_rate": 4.5226742635464805e-05, + "loss": 5.0099, + "step": 33625 + }, + { + "epoch": 0.19998334760681322, + "grad_norm": 1.5073845386505127, + "learning_rate": 4.522646811219461e-05, + "loss": 5.0938, + "step": 33626 + }, + { + "epoch": 0.1999892948900942, + "grad_norm": 1.8857444524765015, + "learning_rate": 4.522619358186355e-05, + "loss": 5.1196, + "step": 33627 + }, + { + "epoch": 0.1999952421733752, + "grad_norm": 1.7090919017791748, + "learning_rate": 4.5225919044471746e-05, + "loss": 5.3662, + "step": 33628 + }, + { + "epoch": 0.2000011894566562, + "grad_norm": 1.6622498035430908, + "learning_rate": 4.522564450001927e-05, + "loss": 5.3546, + "step": 33629 + }, + { + "epoch": 0.2000071367399372, + "grad_norm": 1.5253161191940308, + "learning_rate": 4.522536994850624e-05, + "loss": 5.3692, + "step": 33630 + }, + { + "epoch": 0.20001308402321819, + "grad_norm": 1.6020673513412476, + "learning_rate": 4.522509538993274e-05, + "loss": 5.278, + "step": 33631 + }, + { + "epoch": 0.2000190313064992, + "grad_norm": 1.7955602407455444, + "learning_rate": 4.522482082429887e-05, + "loss": 4.7938, + "step": 33632 + }, + { + "epoch": 0.2000249785897802, + "grad_norm": 1.694838047027588, + "learning_rate": 4.522454625160472e-05, + "loss": 4.5588, + "step": 33633 + }, + { + "epoch": 0.20003092587306118, + "grad_norm": 1.6719664335250854, + "learning_rate": 4.522427167185039e-05, + "loss": 4.6892, + "step": 33634 + }, + { + "epoch": 0.2000368731563422, + "grad_norm": 1.7728748321533203, + "learning_rate": 4.522399708503599e-05, + "loss": 4.9853, + "step": 33635 + }, + { + "epoch": 0.20004282043962318, + "grad_norm": 2.797647476196289, + "learning_rate": 4.522372249116158e-05, + "loss": 4.4462, + "step": 33636 + }, + { + "epoch": 0.20004876772290417, + "grad_norm": 2.5635032653808594, + "learning_rate": 4.522344789022729e-05, + "loss": 4.1496, + "step": 33637 + }, + { + "epoch": 0.20005471500618519, + "grad_norm": 2.256369113922119, + "learning_rate": 4.52231732822332e-05, + "loss": 4.0803, + "step": 33638 + }, + { + "epoch": 0.20006066228946617, + "grad_norm": 3.085843086242676, + "learning_rate": 4.5222898667179404e-05, + "loss": 2.9132, + "step": 33639 + }, + { + "epoch": 0.20006660957274716, + "grad_norm": 1.646597146987915, + "learning_rate": 4.522262404506601e-05, + "loss": 4.2852, + "step": 33640 + }, + { + "epoch": 0.20007255685602818, + "grad_norm": 2.579864740371704, + "learning_rate": 4.5222349415893106e-05, + "loss": 3.5672, + "step": 33641 + }, + { + "epoch": 0.20007850413930917, + "grad_norm": 2.537965774536133, + "learning_rate": 4.5222074779660784e-05, + "loss": 3.3253, + "step": 33642 + }, + { + "epoch": 0.20008445142259015, + "grad_norm": 2.1766700744628906, + "learning_rate": 4.5221800136369155e-05, + "loss": 3.8213, + "step": 33643 + }, + { + "epoch": 0.20009039870587117, + "grad_norm": 1.603519320487976, + "learning_rate": 4.522152548601829e-05, + "loss": 4.7899, + "step": 33644 + }, + { + "epoch": 0.20009634598915216, + "grad_norm": 2.1622631549835205, + "learning_rate": 4.522125082860831e-05, + "loss": 3.0835, + "step": 33645 + }, + { + "epoch": 0.20010229327243315, + "grad_norm": 1.785031795501709, + "learning_rate": 4.522097616413929e-05, + "loss": 4.1423, + "step": 33646 + }, + { + "epoch": 0.20010824055571416, + "grad_norm": 2.3329782485961914, + "learning_rate": 4.522070149261135e-05, + "loss": 3.4989, + "step": 33647 + }, + { + "epoch": 0.20011418783899515, + "grad_norm": 2.6644299030303955, + "learning_rate": 4.5220426814024564e-05, + "loss": 2.7116, + "step": 33648 + }, + { + "epoch": 0.20012013512227614, + "grad_norm": 2.071437358856201, + "learning_rate": 4.522015212837904e-05, + "loss": 4.3735, + "step": 33649 + }, + { + "epoch": 0.20012608240555715, + "grad_norm": 1.4981132745742798, + "learning_rate": 4.521987743567487e-05, + "loss": 5.1696, + "step": 33650 + }, + { + "epoch": 0.20013202968883814, + "grad_norm": 1.7726006507873535, + "learning_rate": 4.521960273591215e-05, + "loss": 4.9392, + "step": 33651 + }, + { + "epoch": 0.20013797697211913, + "grad_norm": 1.9665300846099854, + "learning_rate": 4.5219328029090966e-05, + "loss": 3.5765, + "step": 33652 + }, + { + "epoch": 0.20014392425540015, + "grad_norm": 2.3966944217681885, + "learning_rate": 4.521905331521143e-05, + "loss": 3.5962, + "step": 33653 + }, + { + "epoch": 0.20014987153868113, + "grad_norm": 2.8166298866271973, + "learning_rate": 4.521877859427363e-05, + "loss": 3.5926, + "step": 33654 + }, + { + "epoch": 0.20015581882196212, + "grad_norm": 1.7879718542099, + "learning_rate": 4.521850386627767e-05, + "loss": 4.6034, + "step": 33655 + }, + { + "epoch": 0.20016176610524314, + "grad_norm": 2.0207948684692383, + "learning_rate": 4.521822913122363e-05, + "loss": 5.2371, + "step": 33656 + }, + { + "epoch": 0.20016771338852413, + "grad_norm": 1.6166136264801025, + "learning_rate": 4.5217954389111615e-05, + "loss": 5.6755, + "step": 33657 + }, + { + "epoch": 0.20017366067180511, + "grad_norm": 1.5825445652008057, + "learning_rate": 4.521767963994173e-05, + "loss": 5.5416, + "step": 33658 + }, + { + "epoch": 0.2001796079550861, + "grad_norm": 2.376970052719116, + "learning_rate": 4.521740488371406e-05, + "loss": 4.8429, + "step": 33659 + }, + { + "epoch": 0.20018555523836712, + "grad_norm": 1.9127243757247925, + "learning_rate": 4.52171301204287e-05, + "loss": 5.299, + "step": 33660 + }, + { + "epoch": 0.2001915025216481, + "grad_norm": 2.695713758468628, + "learning_rate": 4.5216855350085745e-05, + "loss": 3.9133, + "step": 33661 + }, + { + "epoch": 0.2001974498049291, + "grad_norm": 1.516388177871704, + "learning_rate": 4.521658057268529e-05, + "loss": 5.4228, + "step": 33662 + }, + { + "epoch": 0.2002033970882101, + "grad_norm": 2.076374053955078, + "learning_rate": 4.521630578822745e-05, + "loss": 4.7299, + "step": 33663 + }, + { + "epoch": 0.2002093443714911, + "grad_norm": 1.7333403825759888, + "learning_rate": 4.52160309967123e-05, + "loss": 5.1839, + "step": 33664 + }, + { + "epoch": 0.2002152916547721, + "grad_norm": 1.616132378578186, + "learning_rate": 4.521575619813995e-05, + "loss": 5.1235, + "step": 33665 + }, + { + "epoch": 0.2002212389380531, + "grad_norm": 1.7438740730285645, + "learning_rate": 4.5215481392510476e-05, + "loss": 5.2177, + "step": 33666 + }, + { + "epoch": 0.2002271862213341, + "grad_norm": 1.4537467956542969, + "learning_rate": 4.521520657982399e-05, + "loss": 4.8443, + "step": 33667 + }, + { + "epoch": 0.20023313350461508, + "grad_norm": 2.2082064151763916, + "learning_rate": 4.521493176008059e-05, + "loss": 4.3767, + "step": 33668 + }, + { + "epoch": 0.2002390807878961, + "grad_norm": 2.066798210144043, + "learning_rate": 4.521465693328036e-05, + "loss": 3.9378, + "step": 33669 + }, + { + "epoch": 0.20024502807117708, + "grad_norm": 2.1056056022644043, + "learning_rate": 4.52143820994234e-05, + "loss": 4.3377, + "step": 33670 + }, + { + "epoch": 0.20025097535445807, + "grad_norm": 2.173313617706299, + "learning_rate": 4.521410725850981e-05, + "loss": 3.8741, + "step": 33671 + }, + { + "epoch": 0.2002569226377391, + "grad_norm": 2.4892916679382324, + "learning_rate": 4.521383241053969e-05, + "loss": 3.9224, + "step": 33672 + }, + { + "epoch": 0.20026286992102008, + "grad_norm": 1.7403076887130737, + "learning_rate": 4.521355755551313e-05, + "loss": 4.6479, + "step": 33673 + }, + { + "epoch": 0.20026881720430106, + "grad_norm": 2.1816036701202393, + "learning_rate": 4.521328269343022e-05, + "loss": 4.3331, + "step": 33674 + }, + { + "epoch": 0.20027476448758208, + "grad_norm": 1.727345585823059, + "learning_rate": 4.521300782429106e-05, + "loss": 4.7984, + "step": 33675 + }, + { + "epoch": 0.20028071177086307, + "grad_norm": 1.813586711883545, + "learning_rate": 4.521273294809575e-05, + "loss": 5.1053, + "step": 33676 + }, + { + "epoch": 0.20028665905414406, + "grad_norm": 1.5746510028839111, + "learning_rate": 4.521245806484439e-05, + "loss": 5.08, + "step": 33677 + }, + { + "epoch": 0.20029260633742507, + "grad_norm": 1.7232789993286133, + "learning_rate": 4.521218317453706e-05, + "loss": 5.1837, + "step": 33678 + }, + { + "epoch": 0.20029855362070606, + "grad_norm": 1.592498540878296, + "learning_rate": 4.521190827717387e-05, + "loss": 4.7453, + "step": 33679 + }, + { + "epoch": 0.20030450090398705, + "grad_norm": 2.30441951751709, + "learning_rate": 4.521163337275492e-05, + "loss": 3.9628, + "step": 33680 + }, + { + "epoch": 0.20031044818726806, + "grad_norm": 1.50408935546875, + "learning_rate": 4.521135846128028e-05, + "loss": 4.7726, + "step": 33681 + }, + { + "epoch": 0.20031639547054905, + "grad_norm": 1.6377472877502441, + "learning_rate": 4.5211083542750074e-05, + "loss": 4.7695, + "step": 33682 + }, + { + "epoch": 0.20032234275383004, + "grad_norm": 1.6713389158248901, + "learning_rate": 4.521080861716439e-05, + "loss": 4.8269, + "step": 33683 + }, + { + "epoch": 0.20032829003711106, + "grad_norm": 1.6516128778457642, + "learning_rate": 4.5210533684523314e-05, + "loss": 4.347, + "step": 33684 + }, + { + "epoch": 0.20033423732039204, + "grad_norm": 2.528104782104492, + "learning_rate": 4.521025874482696e-05, + "loss": 3.0607, + "step": 33685 + }, + { + "epoch": 0.20034018460367303, + "grad_norm": 2.153841495513916, + "learning_rate": 4.520998379807541e-05, + "loss": 3.5991, + "step": 33686 + }, + { + "epoch": 0.20034613188695405, + "grad_norm": 2.264549970626831, + "learning_rate": 4.520970884426876e-05, + "loss": 3.3218, + "step": 33687 + }, + { + "epoch": 0.20035207917023504, + "grad_norm": 2.517428159713745, + "learning_rate": 4.52094338834071e-05, + "loss": 2.7056, + "step": 33688 + }, + { + "epoch": 0.20035802645351602, + "grad_norm": 2.2318918704986572, + "learning_rate": 4.520915891549055e-05, + "loss": 4.1438, + "step": 33689 + }, + { + "epoch": 0.20036397373679704, + "grad_norm": 1.3869786262512207, + "learning_rate": 4.520888394051919e-05, + "loss": 4.7448, + "step": 33690 + }, + { + "epoch": 0.20036992102007803, + "grad_norm": 1.7762783765792847, + "learning_rate": 4.520860895849311e-05, + "loss": 5.587, + "step": 33691 + }, + { + "epoch": 0.20037586830335902, + "grad_norm": 1.5028401613235474, + "learning_rate": 4.520833396941242e-05, + "loss": 5.6461, + "step": 33692 + }, + { + "epoch": 0.20038181558664003, + "grad_norm": 2.091181516647339, + "learning_rate": 4.5208058973277215e-05, + "loss": 4.3468, + "step": 33693 + }, + { + "epoch": 0.20038776286992102, + "grad_norm": 2.1915535926818848, + "learning_rate": 4.520778397008757e-05, + "loss": 3.909, + "step": 33694 + }, + { + "epoch": 0.200393710153202, + "grad_norm": 2.2266931533813477, + "learning_rate": 4.5207508959843606e-05, + "loss": 3.6981, + "step": 33695 + }, + { + "epoch": 0.20039965743648303, + "grad_norm": 1.649043083190918, + "learning_rate": 4.5207233942545406e-05, + "loss": 5.3721, + "step": 33696 + }, + { + "epoch": 0.200405604719764, + "grad_norm": 1.6526726484298706, + "learning_rate": 4.520695891819307e-05, + "loss": 5.0437, + "step": 33697 + }, + { + "epoch": 0.200411552003045, + "grad_norm": 1.662593960762024, + "learning_rate": 4.520668388678669e-05, + "loss": 5.0767, + "step": 33698 + }, + { + "epoch": 0.20041749928632602, + "grad_norm": 2.0036306381225586, + "learning_rate": 4.520640884832638e-05, + "loss": 4.9235, + "step": 33699 + }, + { + "epoch": 0.200423446569607, + "grad_norm": 1.6705793142318726, + "learning_rate": 4.52061338028122e-05, + "loss": 4.9874, + "step": 33700 + }, + { + "epoch": 0.200429393852888, + "grad_norm": 1.6362453699111938, + "learning_rate": 4.520585875024429e-05, + "loss": 4.6005, + "step": 33701 + }, + { + "epoch": 0.200435341136169, + "grad_norm": 1.509127140045166, + "learning_rate": 4.52055836906227e-05, + "loss": 5.0991, + "step": 33702 + }, + { + "epoch": 0.20044128841945, + "grad_norm": 1.5291036367416382, + "learning_rate": 4.520530862394757e-05, + "loss": 5.0673, + "step": 33703 + }, + { + "epoch": 0.20044723570273099, + "grad_norm": 1.4072394371032715, + "learning_rate": 4.5205033550218964e-05, + "loss": 5.2334, + "step": 33704 + }, + { + "epoch": 0.200453182986012, + "grad_norm": 1.7063164710998535, + "learning_rate": 4.520475846943699e-05, + "loss": 5.3813, + "step": 33705 + }, + { + "epoch": 0.200459130269293, + "grad_norm": 1.6799110174179077, + "learning_rate": 4.520448338160175e-05, + "loss": 5.3719, + "step": 33706 + }, + { + "epoch": 0.20046507755257398, + "grad_norm": 1.409774899482727, + "learning_rate": 4.5204208286713326e-05, + "loss": 5.1838, + "step": 33707 + }, + { + "epoch": 0.200471024835855, + "grad_norm": 1.6743974685668945, + "learning_rate": 4.520393318477183e-05, + "loss": 5.1994, + "step": 33708 + }, + { + "epoch": 0.20047697211913598, + "grad_norm": 1.5790249109268188, + "learning_rate": 4.5203658075777344e-05, + "loss": 5.5178, + "step": 33709 + }, + { + "epoch": 0.20048291940241697, + "grad_norm": 1.3018198013305664, + "learning_rate": 4.520338295972997e-05, + "loss": 5.3647, + "step": 33710 + }, + { + "epoch": 0.200488866685698, + "grad_norm": 1.6319355964660645, + "learning_rate": 4.52031078366298e-05, + "loss": 5.499, + "step": 33711 + }, + { + "epoch": 0.20049481396897897, + "grad_norm": 1.6148849725723267, + "learning_rate": 4.520283270647694e-05, + "loss": 5.728, + "step": 33712 + }, + { + "epoch": 0.20050076125225996, + "grad_norm": 1.6749992370605469, + "learning_rate": 4.520255756927147e-05, + "loss": 4.9337, + "step": 33713 + }, + { + "epoch": 0.20050670853554098, + "grad_norm": 1.8952507972717285, + "learning_rate": 4.520228242501351e-05, + "loss": 5.1761, + "step": 33714 + }, + { + "epoch": 0.20051265581882197, + "grad_norm": 1.6296254396438599, + "learning_rate": 4.520200727370314e-05, + "loss": 5.0946, + "step": 33715 + }, + { + "epoch": 0.20051860310210295, + "grad_norm": 1.622511386871338, + "learning_rate": 4.520173211534045e-05, + "loss": 4.9613, + "step": 33716 + }, + { + "epoch": 0.20052455038538394, + "grad_norm": 1.5678802728652954, + "learning_rate": 4.5201456949925547e-05, + "loss": 4.8578, + "step": 33717 + }, + { + "epoch": 0.20053049766866496, + "grad_norm": 1.661635160446167, + "learning_rate": 4.5201181777458526e-05, + "loss": 5.3303, + "step": 33718 + }, + { + "epoch": 0.20053644495194595, + "grad_norm": 1.4430382251739502, + "learning_rate": 4.520090659793948e-05, + "loss": 4.9673, + "step": 33719 + }, + { + "epoch": 0.20054239223522694, + "grad_norm": 1.8783633708953857, + "learning_rate": 4.520063141136851e-05, + "loss": 4.6469, + "step": 33720 + }, + { + "epoch": 0.20054833951850795, + "grad_norm": 1.6063121557235718, + "learning_rate": 4.5200356217745704e-05, + "loss": 4.9283, + "step": 33721 + }, + { + "epoch": 0.20055428680178894, + "grad_norm": 1.7810618877410889, + "learning_rate": 4.520008101707116e-05, + "loss": 4.6507, + "step": 33722 + }, + { + "epoch": 0.20056023408506993, + "grad_norm": 1.9146829843521118, + "learning_rate": 4.519980580934498e-05, + "loss": 4.8663, + "step": 33723 + }, + { + "epoch": 0.20056618136835094, + "grad_norm": 1.5050143003463745, + "learning_rate": 4.519953059456726e-05, + "loss": 4.8159, + "step": 33724 + }, + { + "epoch": 0.20057212865163193, + "grad_norm": 1.4203321933746338, + "learning_rate": 4.519925537273808e-05, + "loss": 5.2271, + "step": 33725 + }, + { + "epoch": 0.20057807593491292, + "grad_norm": 1.7080183029174805, + "learning_rate": 4.519898014385756e-05, + "loss": 4.6244, + "step": 33726 + }, + { + "epoch": 0.20058402321819394, + "grad_norm": 2.466174840927124, + "learning_rate": 4.519870490792578e-05, + "loss": 4.3746, + "step": 33727 + }, + { + "epoch": 0.20058997050147492, + "grad_norm": 1.9741504192352295, + "learning_rate": 4.519842966494284e-05, + "loss": 3.5025, + "step": 33728 + }, + { + "epoch": 0.2005959177847559, + "grad_norm": 1.5923235416412354, + "learning_rate": 4.519815441490884e-05, + "loss": 5.4725, + "step": 33729 + }, + { + "epoch": 0.20060186506803693, + "grad_norm": 1.650692343711853, + "learning_rate": 4.5197879157823874e-05, + "loss": 4.8989, + "step": 33730 + }, + { + "epoch": 0.20060781235131792, + "grad_norm": 2.597038745880127, + "learning_rate": 4.5197603893688034e-05, + "loss": 3.7543, + "step": 33731 + }, + { + "epoch": 0.2006137596345989, + "grad_norm": 2.3229899406433105, + "learning_rate": 4.5197328622501425e-05, + "loss": 3.8646, + "step": 33732 + }, + { + "epoch": 0.20061970691787992, + "grad_norm": 1.6960362195968628, + "learning_rate": 4.519705334426413e-05, + "loss": 5.4444, + "step": 33733 + }, + { + "epoch": 0.2006256542011609, + "grad_norm": 1.6503461599349976, + "learning_rate": 4.5196778058976255e-05, + "loss": 4.9644, + "step": 33734 + }, + { + "epoch": 0.2006316014844419, + "grad_norm": 1.5266268253326416, + "learning_rate": 4.519650276663789e-05, + "loss": 4.9012, + "step": 33735 + }, + { + "epoch": 0.2006375487677229, + "grad_norm": 1.8135932683944702, + "learning_rate": 4.5196227467249144e-05, + "loss": 4.6224, + "step": 33736 + }, + { + "epoch": 0.2006434960510039, + "grad_norm": 1.9666510820388794, + "learning_rate": 4.5195952160810094e-05, + "loss": 4.8198, + "step": 33737 + }, + { + "epoch": 0.2006494433342849, + "grad_norm": 2.169323444366455, + "learning_rate": 4.5195676847320856e-05, + "loss": 4.5872, + "step": 33738 + }, + { + "epoch": 0.2006553906175659, + "grad_norm": 2.2922489643096924, + "learning_rate": 4.5195401526781506e-05, + "loss": 3.3659, + "step": 33739 + }, + { + "epoch": 0.2006613379008469, + "grad_norm": 1.6436244249343872, + "learning_rate": 4.519512619919215e-05, + "loss": 4.824, + "step": 33740 + }, + { + "epoch": 0.20066728518412788, + "grad_norm": 2.283162832260132, + "learning_rate": 4.519485086455289e-05, + "loss": 3.0829, + "step": 33741 + }, + { + "epoch": 0.2006732324674089, + "grad_norm": 1.8069710731506348, + "learning_rate": 4.519457552286381e-05, + "loss": 5.1824, + "step": 33742 + }, + { + "epoch": 0.20067917975068988, + "grad_norm": 1.619968056678772, + "learning_rate": 4.519430017412502e-05, + "loss": 4.9325, + "step": 33743 + }, + { + "epoch": 0.20068512703397087, + "grad_norm": 2.0284674167633057, + "learning_rate": 4.51940248183366e-05, + "loss": 4.6265, + "step": 33744 + }, + { + "epoch": 0.2006910743172519, + "grad_norm": 2.3306424617767334, + "learning_rate": 4.5193749455498664e-05, + "loss": 4.7731, + "step": 33745 + }, + { + "epoch": 0.20069702160053288, + "grad_norm": 2.6167304515838623, + "learning_rate": 4.519347408561129e-05, + "loss": 5.0508, + "step": 33746 + }, + { + "epoch": 0.20070296888381386, + "grad_norm": 1.625686526298523, + "learning_rate": 4.519319870867459e-05, + "loss": 4.6101, + "step": 33747 + }, + { + "epoch": 0.20070891616709488, + "grad_norm": 1.5814995765686035, + "learning_rate": 4.519292332468865e-05, + "loss": 5.195, + "step": 33748 + }, + { + "epoch": 0.20071486345037587, + "grad_norm": 1.4932879209518433, + "learning_rate": 4.5192647933653566e-05, + "loss": 5.2172, + "step": 33749 + }, + { + "epoch": 0.20072081073365686, + "grad_norm": 1.6655844449996948, + "learning_rate": 4.519237253556944e-05, + "loss": 5.0021, + "step": 33750 + }, + { + "epoch": 0.20072675801693787, + "grad_norm": 2.265012264251709, + "learning_rate": 4.519209713043636e-05, + "loss": 4.1171, + "step": 33751 + }, + { + "epoch": 0.20073270530021886, + "grad_norm": 2.524155378341675, + "learning_rate": 4.5191821718254436e-05, + "loss": 3.5122, + "step": 33752 + }, + { + "epoch": 0.20073865258349985, + "grad_norm": 1.620287537574768, + "learning_rate": 4.5191546299023754e-05, + "loss": 4.9341, + "step": 33753 + }, + { + "epoch": 0.20074459986678087, + "grad_norm": 1.6102601289749146, + "learning_rate": 4.519127087274441e-05, + "loss": 5.3899, + "step": 33754 + }, + { + "epoch": 0.20075054715006185, + "grad_norm": 1.5587860345840454, + "learning_rate": 4.51909954394165e-05, + "loss": 4.8153, + "step": 33755 + }, + { + "epoch": 0.20075649443334284, + "grad_norm": 1.8633415699005127, + "learning_rate": 4.5190719999040124e-05, + "loss": 5.0412, + "step": 33756 + }, + { + "epoch": 0.20076244171662386, + "grad_norm": 1.7614189386367798, + "learning_rate": 4.519044455161538e-05, + "loss": 5.299, + "step": 33757 + }, + { + "epoch": 0.20076838899990485, + "grad_norm": 1.6365293264389038, + "learning_rate": 4.5190169097142355e-05, + "loss": 5.2779, + "step": 33758 + }, + { + "epoch": 0.20077433628318583, + "grad_norm": 1.696018099784851, + "learning_rate": 4.518989363562115e-05, + "loss": 4.6489, + "step": 33759 + }, + { + "epoch": 0.20078028356646685, + "grad_norm": 1.7082701921463013, + "learning_rate": 4.5189618167051866e-05, + "loss": 4.9396, + "step": 33760 + }, + { + "epoch": 0.20078623084974784, + "grad_norm": 1.533921241760254, + "learning_rate": 4.518934269143459e-05, + "loss": 5.2822, + "step": 33761 + }, + { + "epoch": 0.20079217813302883, + "grad_norm": 1.9116073846817017, + "learning_rate": 4.518906720876943e-05, + "loss": 5.6835, + "step": 33762 + }, + { + "epoch": 0.20079812541630984, + "grad_norm": 1.6028169393539429, + "learning_rate": 4.5188791719056466e-05, + "loss": 5.6127, + "step": 33763 + }, + { + "epoch": 0.20080407269959083, + "grad_norm": 1.6425648927688599, + "learning_rate": 4.5188516222295814e-05, + "loss": 5.5184, + "step": 33764 + }, + { + "epoch": 0.20081001998287182, + "grad_norm": 1.7046092748641968, + "learning_rate": 4.518824071848755e-05, + "loss": 5.1118, + "step": 33765 + }, + { + "epoch": 0.20081596726615283, + "grad_norm": 1.6231269836425781, + "learning_rate": 4.518796520763179e-05, + "loss": 5.0246, + "step": 33766 + }, + { + "epoch": 0.20082191454943382, + "grad_norm": 1.6386373043060303, + "learning_rate": 4.5187689689728606e-05, + "loss": 5.5888, + "step": 33767 + }, + { + "epoch": 0.2008278618327148, + "grad_norm": 1.582251787185669, + "learning_rate": 4.518741416477812e-05, + "loss": 5.1171, + "step": 33768 + }, + { + "epoch": 0.20083380911599583, + "grad_norm": 1.5769929885864258, + "learning_rate": 4.518713863278041e-05, + "loss": 5.1322, + "step": 33769 + }, + { + "epoch": 0.20083975639927681, + "grad_norm": 1.6422269344329834, + "learning_rate": 4.5186863093735585e-05, + "loss": 4.652, + "step": 33770 + }, + { + "epoch": 0.2008457036825578, + "grad_norm": 1.5146641731262207, + "learning_rate": 4.518658754764373e-05, + "loss": 4.3487, + "step": 33771 + }, + { + "epoch": 0.20085165096583882, + "grad_norm": 1.463438868522644, + "learning_rate": 4.518631199450494e-05, + "loss": 4.2242, + "step": 33772 + }, + { + "epoch": 0.2008575982491198, + "grad_norm": 1.4330111742019653, + "learning_rate": 4.5186036434319324e-05, + "loss": 4.0898, + "step": 33773 + }, + { + "epoch": 0.2008635455324008, + "grad_norm": 1.4045552015304565, + "learning_rate": 4.5185760867086975e-05, + "loss": 4.346, + "step": 33774 + }, + { + "epoch": 0.20086949281568178, + "grad_norm": 1.5829514265060425, + "learning_rate": 4.5185485292807975e-05, + "loss": 4.8288, + "step": 33775 + }, + { + "epoch": 0.2008754400989628, + "grad_norm": 1.363128423690796, + "learning_rate": 4.518520971148244e-05, + "loss": 4.2016, + "step": 33776 + }, + { + "epoch": 0.2008813873822438, + "grad_norm": 1.6097347736358643, + "learning_rate": 4.518493412311045e-05, + "loss": 3.9491, + "step": 33777 + }, + { + "epoch": 0.20088733466552477, + "grad_norm": 1.5099202394485474, + "learning_rate": 4.5184658527692114e-05, + "loss": 3.7755, + "step": 33778 + }, + { + "epoch": 0.2008932819488058, + "grad_norm": 1.602229118347168, + "learning_rate": 4.518438292522752e-05, + "loss": 3.9349, + "step": 33779 + }, + { + "epoch": 0.20089922923208678, + "grad_norm": 1.5963069200515747, + "learning_rate": 4.5184107315716765e-05, + "loss": 4.1626, + "step": 33780 + }, + { + "epoch": 0.20090517651536777, + "grad_norm": 1.5034286975860596, + "learning_rate": 4.518383169915995e-05, + "loss": 4.6347, + "step": 33781 + }, + { + "epoch": 0.20091112379864878, + "grad_norm": 1.4610581398010254, + "learning_rate": 4.518355607555717e-05, + "loss": 4.2356, + "step": 33782 + }, + { + "epoch": 0.20091707108192977, + "grad_norm": 1.468599557876587, + "learning_rate": 4.5183280444908504e-05, + "loss": 3.7972, + "step": 33783 + }, + { + "epoch": 0.20092301836521076, + "grad_norm": 1.265889286994934, + "learning_rate": 4.518300480721408e-05, + "loss": 3.9609, + "step": 33784 + }, + { + "epoch": 0.20092896564849178, + "grad_norm": 1.615130066871643, + "learning_rate": 4.5182729162473967e-05, + "loss": 3.8964, + "step": 33785 + }, + { + "epoch": 0.20093491293177276, + "grad_norm": 1.606234073638916, + "learning_rate": 4.518245351068828e-05, + "loss": 3.6912, + "step": 33786 + }, + { + "epoch": 0.20094086021505375, + "grad_norm": 1.4742984771728516, + "learning_rate": 4.51821778518571e-05, + "loss": 3.7547, + "step": 33787 + }, + { + "epoch": 0.20094680749833477, + "grad_norm": 1.3932676315307617, + "learning_rate": 4.518190218598054e-05, + "loss": 3.922, + "step": 33788 + }, + { + "epoch": 0.20095275478161576, + "grad_norm": 1.5401780605316162, + "learning_rate": 4.518162651305867e-05, + "loss": 3.8633, + "step": 33789 + }, + { + "epoch": 0.20095870206489674, + "grad_norm": 1.9733563661575317, + "learning_rate": 4.5181350833091616e-05, + "loss": 4.2922, + "step": 33790 + }, + { + "epoch": 0.20096464934817776, + "grad_norm": 1.675879955291748, + "learning_rate": 4.5181075146079456e-05, + "loss": 3.6927, + "step": 33791 + }, + { + "epoch": 0.20097059663145875, + "grad_norm": 1.6591668128967285, + "learning_rate": 4.5180799452022294e-05, + "loss": 3.7511, + "step": 33792 + }, + { + "epoch": 0.20097654391473974, + "grad_norm": 1.493525505065918, + "learning_rate": 4.518052375092022e-05, + "loss": 3.9572, + "step": 33793 + }, + { + "epoch": 0.20098249119802075, + "grad_norm": 1.5096441507339478, + "learning_rate": 4.5180248042773344e-05, + "loss": 4.4723, + "step": 33794 + }, + { + "epoch": 0.20098843848130174, + "grad_norm": 1.6198865175247192, + "learning_rate": 4.517997232758174e-05, + "loss": 3.968, + "step": 33795 + }, + { + "epoch": 0.20099438576458273, + "grad_norm": 1.4703052043914795, + "learning_rate": 4.517969660534552e-05, + "loss": 3.962, + "step": 33796 + }, + { + "epoch": 0.20100033304786374, + "grad_norm": 1.6844958066940308, + "learning_rate": 4.5179420876064776e-05, + "loss": 3.9622, + "step": 33797 + }, + { + "epoch": 0.20100628033114473, + "grad_norm": 1.5714399814605713, + "learning_rate": 4.5179145139739605e-05, + "loss": 3.6723, + "step": 33798 + }, + { + "epoch": 0.20101222761442572, + "grad_norm": 1.3336405754089355, + "learning_rate": 4.51788693963701e-05, + "loss": 4.0349, + "step": 33799 + }, + { + "epoch": 0.20101817489770674, + "grad_norm": 2.3248112201690674, + "learning_rate": 4.517859364595637e-05, + "loss": 4.4507, + "step": 33800 + }, + { + "epoch": 0.20102412218098772, + "grad_norm": 1.7180213928222656, + "learning_rate": 4.517831788849849e-05, + "loss": 4.7463, + "step": 33801 + }, + { + "epoch": 0.2010300694642687, + "grad_norm": 1.627234697341919, + "learning_rate": 4.5178042123996565e-05, + "loss": 4.4101, + "step": 33802 + }, + { + "epoch": 0.20103601674754973, + "grad_norm": 1.6691185235977173, + "learning_rate": 4.517776635245071e-05, + "loss": 4.6427, + "step": 33803 + }, + { + "epoch": 0.20104196403083072, + "grad_norm": 1.580978512763977, + "learning_rate": 4.517749057386099e-05, + "loss": 3.8459, + "step": 33804 + }, + { + "epoch": 0.2010479113141117, + "grad_norm": 1.5489826202392578, + "learning_rate": 4.5177214788227526e-05, + "loss": 4.1169, + "step": 33805 + }, + { + "epoch": 0.20105385859739272, + "grad_norm": 2.0057342052459717, + "learning_rate": 4.51769389955504e-05, + "loss": 4.6964, + "step": 33806 + }, + { + "epoch": 0.2010598058806737, + "grad_norm": 1.6826112270355225, + "learning_rate": 4.517666319582972e-05, + "loss": 5.1589, + "step": 33807 + }, + { + "epoch": 0.2010657531639547, + "grad_norm": 1.729201316833496, + "learning_rate": 4.5176387389065564e-05, + "loss": 4.9901, + "step": 33808 + }, + { + "epoch": 0.2010717004472357, + "grad_norm": 1.7442471981048584, + "learning_rate": 4.517611157525805e-05, + "loss": 4.9073, + "step": 33809 + }, + { + "epoch": 0.2010776477305167, + "grad_norm": 1.923149585723877, + "learning_rate": 4.5175835754407256e-05, + "loss": 4.8148, + "step": 33810 + }, + { + "epoch": 0.2010835950137977, + "grad_norm": 2.2062087059020996, + "learning_rate": 4.517555992651329e-05, + "loss": 4.0303, + "step": 33811 + }, + { + "epoch": 0.2010895422970787, + "grad_norm": 1.5704069137573242, + "learning_rate": 4.517528409157624e-05, + "loss": 5.2119, + "step": 33812 + }, + { + "epoch": 0.2010954895803597, + "grad_norm": 1.6825261116027832, + "learning_rate": 4.517500824959621e-05, + "loss": 4.8707, + "step": 33813 + }, + { + "epoch": 0.20110143686364068, + "grad_norm": 1.5696799755096436, + "learning_rate": 4.517473240057329e-05, + "loss": 4.7079, + "step": 33814 + }, + { + "epoch": 0.2011073841469217, + "grad_norm": 1.6693792343139648, + "learning_rate": 4.5174456544507594e-05, + "loss": 5.0667, + "step": 33815 + }, + { + "epoch": 0.20111333143020269, + "grad_norm": 1.5435715913772583, + "learning_rate": 4.517418068139919e-05, + "loss": 5.1521, + "step": 33816 + }, + { + "epoch": 0.20111927871348367, + "grad_norm": 1.5700812339782715, + "learning_rate": 4.517390481124819e-05, + "loss": 5.1064, + "step": 33817 + }, + { + "epoch": 0.2011252259967647, + "grad_norm": 1.550162434577942, + "learning_rate": 4.5173628934054694e-05, + "loss": 4.284, + "step": 33818 + }, + { + "epoch": 0.20113117328004568, + "grad_norm": 1.7881672382354736, + "learning_rate": 4.517335304981878e-05, + "loss": 4.2866, + "step": 33819 + }, + { + "epoch": 0.20113712056332667, + "grad_norm": 1.7079659700393677, + "learning_rate": 4.5173077158540566e-05, + "loss": 4.5039, + "step": 33820 + }, + { + "epoch": 0.20114306784660768, + "grad_norm": 1.5491669178009033, + "learning_rate": 4.517280126022014e-05, + "loss": 4.671, + "step": 33821 + }, + { + "epoch": 0.20114901512988867, + "grad_norm": 1.63919997215271, + "learning_rate": 4.517252535485759e-05, + "loss": 4.7127, + "step": 33822 + }, + { + "epoch": 0.20115496241316966, + "grad_norm": 1.8322843313217163, + "learning_rate": 4.517224944245303e-05, + "loss": 4.6952, + "step": 33823 + }, + { + "epoch": 0.20116090969645067, + "grad_norm": 1.7782399654388428, + "learning_rate": 4.517197352300654e-05, + "loss": 4.6892, + "step": 33824 + }, + { + "epoch": 0.20116685697973166, + "grad_norm": 1.7981961965560913, + "learning_rate": 4.517169759651823e-05, + "loss": 4.5741, + "step": 33825 + }, + { + "epoch": 0.20117280426301265, + "grad_norm": 1.8265764713287354, + "learning_rate": 4.5171421662988175e-05, + "loss": 4.4527, + "step": 33826 + }, + { + "epoch": 0.20117875154629367, + "grad_norm": 1.6261963844299316, + "learning_rate": 4.517114572241649e-05, + "loss": 4.4656, + "step": 33827 + }, + { + "epoch": 0.20118469882957465, + "grad_norm": 1.478434681892395, + "learning_rate": 4.517086977480327e-05, + "loss": 4.674, + "step": 33828 + }, + { + "epoch": 0.20119064611285564, + "grad_norm": 2.420952796936035, + "learning_rate": 4.517059382014861e-05, + "loss": 3.497, + "step": 33829 + }, + { + "epoch": 0.20119659339613666, + "grad_norm": 1.835784912109375, + "learning_rate": 4.51703178584526e-05, + "loss": 4.1869, + "step": 33830 + }, + { + "epoch": 0.20120254067941765, + "grad_norm": 1.6024458408355713, + "learning_rate": 4.517004188971534e-05, + "loss": 4.1086, + "step": 33831 + }, + { + "epoch": 0.20120848796269863, + "grad_norm": 2.3725204467773438, + "learning_rate": 4.516976591393692e-05, + "loss": 3.6407, + "step": 33832 + }, + { + "epoch": 0.20121443524597965, + "grad_norm": 2.743121862411499, + "learning_rate": 4.516948993111746e-05, + "loss": 4.2811, + "step": 33833 + }, + { + "epoch": 0.20122038252926064, + "grad_norm": 1.8155949115753174, + "learning_rate": 4.5169213941257024e-05, + "loss": 4.5768, + "step": 33834 + }, + { + "epoch": 0.20122632981254163, + "grad_norm": 1.7074800729751587, + "learning_rate": 4.516893794435574e-05, + "loss": 4.6348, + "step": 33835 + }, + { + "epoch": 0.20123227709582261, + "grad_norm": 1.7050331830978394, + "learning_rate": 4.516866194041367e-05, + "loss": 4.7784, + "step": 33836 + }, + { + "epoch": 0.20123822437910363, + "grad_norm": 1.6249829530715942, + "learning_rate": 4.516838592943094e-05, + "loss": 4.4591, + "step": 33837 + }, + { + "epoch": 0.20124417166238462, + "grad_norm": 1.6271724700927734, + "learning_rate": 4.516810991140763e-05, + "loss": 4.4689, + "step": 33838 + }, + { + "epoch": 0.2012501189456656, + "grad_norm": 1.787264108657837, + "learning_rate": 4.516783388634385e-05, + "loss": 4.3448, + "step": 33839 + }, + { + "epoch": 0.20125606622894662, + "grad_norm": 1.6502000093460083, + "learning_rate": 4.516755785423967e-05, + "loss": 4.761, + "step": 33840 + }, + { + "epoch": 0.2012620135122276, + "grad_norm": 1.768717885017395, + "learning_rate": 4.5167281815095216e-05, + "loss": 4.6362, + "step": 33841 + }, + { + "epoch": 0.2012679607955086, + "grad_norm": 1.5358744859695435, + "learning_rate": 4.5167005768910573e-05, + "loss": 4.6384, + "step": 33842 + }, + { + "epoch": 0.20127390807878962, + "grad_norm": 1.7373604774475098, + "learning_rate": 4.5166729715685833e-05, + "loss": 4.4974, + "step": 33843 + }, + { + "epoch": 0.2012798553620706, + "grad_norm": 1.7411723136901855, + "learning_rate": 4.51664536554211e-05, + "loss": 4.6346, + "step": 33844 + }, + { + "epoch": 0.2012858026453516, + "grad_norm": 1.4830048084259033, + "learning_rate": 4.516617758811647e-05, + "loss": 4.4465, + "step": 33845 + }, + { + "epoch": 0.2012917499286326, + "grad_norm": 1.7425602674484253, + "learning_rate": 4.5165901513772025e-05, + "loss": 4.3604, + "step": 33846 + }, + { + "epoch": 0.2012976972119136, + "grad_norm": 1.4752614498138428, + "learning_rate": 4.516562543238787e-05, + "loss": 4.4717, + "step": 33847 + }, + { + "epoch": 0.20130364449519458, + "grad_norm": 1.6776503324508667, + "learning_rate": 4.5165349343964115e-05, + "loss": 4.4002, + "step": 33848 + }, + { + "epoch": 0.2013095917784756, + "grad_norm": 2.209038734436035, + "learning_rate": 4.516507324850084e-05, + "loss": 4.381, + "step": 33849 + }, + { + "epoch": 0.2013155390617566, + "grad_norm": 2.257248878479004, + "learning_rate": 4.516479714599814e-05, + "loss": 4.6487, + "step": 33850 + }, + { + "epoch": 0.20132148634503758, + "grad_norm": 1.6058926582336426, + "learning_rate": 4.516452103645613e-05, + "loss": 4.7832, + "step": 33851 + }, + { + "epoch": 0.2013274336283186, + "grad_norm": 2.744135856628418, + "learning_rate": 4.5164244919874885e-05, + "loss": 4.0109, + "step": 33852 + }, + { + "epoch": 0.20133338091159958, + "grad_norm": 1.5897787809371948, + "learning_rate": 4.516396879625451e-05, + "loss": 4.5663, + "step": 33853 + }, + { + "epoch": 0.20133932819488057, + "grad_norm": 1.8678447008132935, + "learning_rate": 4.516369266559511e-05, + "loss": 4.2331, + "step": 33854 + }, + { + "epoch": 0.20134527547816158, + "grad_norm": 2.034632921218872, + "learning_rate": 4.516341652789676e-05, + "loss": 4.3551, + "step": 33855 + }, + { + "epoch": 0.20135122276144257, + "grad_norm": 1.9875417947769165, + "learning_rate": 4.5163140383159586e-05, + "loss": 4.203, + "step": 33856 + }, + { + "epoch": 0.20135717004472356, + "grad_norm": 1.689079999923706, + "learning_rate": 4.516286423138366e-05, + "loss": 4.3866, + "step": 33857 + }, + { + "epoch": 0.20136311732800458, + "grad_norm": 1.6041475534439087, + "learning_rate": 4.516258807256908e-05, + "loss": 4.6978, + "step": 33858 + }, + { + "epoch": 0.20136906461128556, + "grad_norm": 1.5705976486206055, + "learning_rate": 4.516231190671596e-05, + "loss": 4.62, + "step": 33859 + }, + { + "epoch": 0.20137501189456655, + "grad_norm": 1.814264178276062, + "learning_rate": 4.516203573382438e-05, + "loss": 4.3952, + "step": 33860 + }, + { + "epoch": 0.20138095917784757, + "grad_norm": 1.797286868095398, + "learning_rate": 4.516175955389445e-05, + "loss": 4.2057, + "step": 33861 + }, + { + "epoch": 0.20138690646112856, + "grad_norm": 1.5275870561599731, + "learning_rate": 4.516148336692624e-05, + "loss": 4.4812, + "step": 33862 + }, + { + "epoch": 0.20139285374440954, + "grad_norm": 1.4183309078216553, + "learning_rate": 4.5161207172919875e-05, + "loss": 4.6353, + "step": 33863 + }, + { + "epoch": 0.20139880102769056, + "grad_norm": 2.1223907470703125, + "learning_rate": 4.516093097187544e-05, + "loss": 4.6249, + "step": 33864 + }, + { + "epoch": 0.20140474831097155, + "grad_norm": 2.0036821365356445, + "learning_rate": 4.516065476379303e-05, + "loss": 4.9462, + "step": 33865 + }, + { + "epoch": 0.20141069559425254, + "grad_norm": 1.7337446212768555, + "learning_rate": 4.516037854867275e-05, + "loss": 4.7494, + "step": 33866 + }, + { + "epoch": 0.20141664287753355, + "grad_norm": 1.7076916694641113, + "learning_rate": 4.516010232651469e-05, + "loss": 4.0989, + "step": 33867 + }, + { + "epoch": 0.20142259016081454, + "grad_norm": 1.617569088935852, + "learning_rate": 4.5159826097318934e-05, + "loss": 4.2642, + "step": 33868 + }, + { + "epoch": 0.20142853744409553, + "grad_norm": 1.8856641054153442, + "learning_rate": 4.5159549861085604e-05, + "loss": 3.8425, + "step": 33869 + }, + { + "epoch": 0.20143448472737654, + "grad_norm": 2.1982221603393555, + "learning_rate": 4.515927361781478e-05, + "loss": 3.4169, + "step": 33870 + }, + { + "epoch": 0.20144043201065753, + "grad_norm": 2.361307144165039, + "learning_rate": 4.515899736750656e-05, + "loss": 3.1205, + "step": 33871 + }, + { + "epoch": 0.20144637929393852, + "grad_norm": 1.8608986139297485, + "learning_rate": 4.515872111016104e-05, + "loss": 4.3047, + "step": 33872 + }, + { + "epoch": 0.20145232657721954, + "grad_norm": 1.6542391777038574, + "learning_rate": 4.515844484577833e-05, + "loss": 4.7464, + "step": 33873 + }, + { + "epoch": 0.20145827386050053, + "grad_norm": 1.8147705793380737, + "learning_rate": 4.51581685743585e-05, + "loss": 4.2698, + "step": 33874 + }, + { + "epoch": 0.2014642211437815, + "grad_norm": 2.070788621902466, + "learning_rate": 4.515789229590167e-05, + "loss": 4.0678, + "step": 33875 + }, + { + "epoch": 0.20147016842706253, + "grad_norm": 1.9720908403396606, + "learning_rate": 4.5157616010407934e-05, + "loss": 4.0804, + "step": 33876 + }, + { + "epoch": 0.20147611571034352, + "grad_norm": 2.033067464828491, + "learning_rate": 4.5157339717877366e-05, + "loss": 3.8606, + "step": 33877 + }, + { + "epoch": 0.2014820629936245, + "grad_norm": 2.8475182056427, + "learning_rate": 4.5157063418310095e-05, + "loss": 2.9459, + "step": 33878 + }, + { + "epoch": 0.20148801027690552, + "grad_norm": 2.976738929748535, + "learning_rate": 4.5156787111706196e-05, + "loss": 2.9561, + "step": 33879 + }, + { + "epoch": 0.2014939575601865, + "grad_norm": 2.4976749420166016, + "learning_rate": 4.5156510798065764e-05, + "loss": 3.0772, + "step": 33880 + }, + { + "epoch": 0.2014999048434675, + "grad_norm": 1.491884469985962, + "learning_rate": 4.5156234477388914e-05, + "loss": 5.0386, + "step": 33881 + }, + { + "epoch": 0.2015058521267485, + "grad_norm": 1.7481471300125122, + "learning_rate": 4.515595814967573e-05, + "loss": 5.1319, + "step": 33882 + }, + { + "epoch": 0.2015117994100295, + "grad_norm": 1.8939447402954102, + "learning_rate": 4.51556818149263e-05, + "loss": 5.3305, + "step": 33883 + }, + { + "epoch": 0.2015177466933105, + "grad_norm": 1.7944999933242798, + "learning_rate": 4.515540547314073e-05, + "loss": 5.2071, + "step": 33884 + }, + { + "epoch": 0.2015236939765915, + "grad_norm": 1.799474835395813, + "learning_rate": 4.515512912431912e-05, + "loss": 5.1697, + "step": 33885 + }, + { + "epoch": 0.2015296412598725, + "grad_norm": 1.777791976928711, + "learning_rate": 4.515485276846157e-05, + "loss": 4.4441, + "step": 33886 + }, + { + "epoch": 0.20153558854315348, + "grad_norm": 1.6787590980529785, + "learning_rate": 4.5154576405568154e-05, + "loss": 4.5804, + "step": 33887 + }, + { + "epoch": 0.2015415358264345, + "grad_norm": 1.602138876914978, + "learning_rate": 4.515430003563899e-05, + "loss": 4.9907, + "step": 33888 + }, + { + "epoch": 0.2015474831097155, + "grad_norm": 1.4458924531936646, + "learning_rate": 4.515402365867417e-05, + "loss": 5.0, + "step": 33889 + }, + { + "epoch": 0.20155343039299647, + "grad_norm": 1.3784939050674438, + "learning_rate": 4.515374727467379e-05, + "loss": 5.036, + "step": 33890 + }, + { + "epoch": 0.2015593776762775, + "grad_norm": 1.384204626083374, + "learning_rate": 4.515347088363794e-05, + "loss": 4.9079, + "step": 33891 + }, + { + "epoch": 0.20156532495955848, + "grad_norm": 1.595136046409607, + "learning_rate": 4.515319448556673e-05, + "loss": 4.5787, + "step": 33892 + }, + { + "epoch": 0.20157127224283947, + "grad_norm": 1.3380727767944336, + "learning_rate": 4.515291808046024e-05, + "loss": 5.0094, + "step": 33893 + }, + { + "epoch": 0.20157721952612045, + "grad_norm": 1.488208293914795, + "learning_rate": 4.515264166831858e-05, + "loss": 5.0324, + "step": 33894 + }, + { + "epoch": 0.20158316680940147, + "grad_norm": 1.4779205322265625, + "learning_rate": 4.5152365249141835e-05, + "loss": 4.8467, + "step": 33895 + }, + { + "epoch": 0.20158911409268246, + "grad_norm": 1.5820229053497314, + "learning_rate": 4.515208882293011e-05, + "loss": 4.9763, + "step": 33896 + }, + { + "epoch": 0.20159506137596345, + "grad_norm": 1.2227067947387695, + "learning_rate": 4.51518123896835e-05, + "loss": 5.0146, + "step": 33897 + }, + { + "epoch": 0.20160100865924446, + "grad_norm": 1.7960015535354614, + "learning_rate": 4.51515359494021e-05, + "loss": 4.598, + "step": 33898 + }, + { + "epoch": 0.20160695594252545, + "grad_norm": 2.1942708492279053, + "learning_rate": 4.515125950208601e-05, + "loss": 3.9657, + "step": 33899 + }, + { + "epoch": 0.20161290322580644, + "grad_norm": 2.034914016723633, + "learning_rate": 4.5150983047735316e-05, + "loss": 4.5544, + "step": 33900 + }, + { + "epoch": 0.20161885050908745, + "grad_norm": 2.0738654136657715, + "learning_rate": 4.515070658635013e-05, + "loss": 3.9512, + "step": 33901 + }, + { + "epoch": 0.20162479779236844, + "grad_norm": 2.08801007270813, + "learning_rate": 4.515043011793053e-05, + "loss": 3.5748, + "step": 33902 + }, + { + "epoch": 0.20163074507564943, + "grad_norm": 2.220031261444092, + "learning_rate": 4.515015364247664e-05, + "loss": 3.6819, + "step": 33903 + }, + { + "epoch": 0.20163669235893045, + "grad_norm": 2.0764245986938477, + "learning_rate": 4.514987715998853e-05, + "loss": 4.2446, + "step": 33904 + }, + { + "epoch": 0.20164263964221144, + "grad_norm": 2.2536237239837646, + "learning_rate": 4.51496006704663e-05, + "loss": 3.6282, + "step": 33905 + }, + { + "epoch": 0.20164858692549242, + "grad_norm": 1.65569269657135, + "learning_rate": 4.514932417391006e-05, + "loss": 4.477, + "step": 33906 + }, + { + "epoch": 0.20165453420877344, + "grad_norm": 1.6719849109649658, + "learning_rate": 4.51490476703199e-05, + "loss": 4.447, + "step": 33907 + }, + { + "epoch": 0.20166048149205443, + "grad_norm": 1.5780644416809082, + "learning_rate": 4.514877115969591e-05, + "loss": 4.8047, + "step": 33908 + }, + { + "epoch": 0.20166642877533542, + "grad_norm": 1.6983767747879028, + "learning_rate": 4.5148494642038194e-05, + "loss": 4.4235, + "step": 33909 + }, + { + "epoch": 0.20167237605861643, + "grad_norm": 1.9663766622543335, + "learning_rate": 4.514821811734685e-05, + "loss": 4.1565, + "step": 33910 + }, + { + "epoch": 0.20167832334189742, + "grad_norm": 2.1460719108581543, + "learning_rate": 4.5147941585621965e-05, + "loss": 3.8833, + "step": 33911 + }, + { + "epoch": 0.2016842706251784, + "grad_norm": 1.7094260454177856, + "learning_rate": 4.5147665046863655e-05, + "loss": 4.3554, + "step": 33912 + }, + { + "epoch": 0.20169021790845942, + "grad_norm": 2.377586603164673, + "learning_rate": 4.5147388501071984e-05, + "loss": 3.4381, + "step": 33913 + }, + { + "epoch": 0.2016961651917404, + "grad_norm": 1.6335028409957886, + "learning_rate": 4.514711194824708e-05, + "loss": 4.8601, + "step": 33914 + }, + { + "epoch": 0.2017021124750214, + "grad_norm": 2.024763584136963, + "learning_rate": 4.514683538838903e-05, + "loss": 4.5755, + "step": 33915 + }, + { + "epoch": 0.20170805975830242, + "grad_norm": 1.655968427658081, + "learning_rate": 4.514655882149792e-05, + "loss": 4.7452, + "step": 33916 + }, + { + "epoch": 0.2017140070415834, + "grad_norm": 1.5172895193099976, + "learning_rate": 4.5146282247573855e-05, + "loss": 5.0361, + "step": 33917 + }, + { + "epoch": 0.2017199543248644, + "grad_norm": 1.302919626235962, + "learning_rate": 4.514600566661693e-05, + "loss": 5.2368, + "step": 33918 + }, + { + "epoch": 0.2017259016081454, + "grad_norm": 1.4833548069000244, + "learning_rate": 4.514572907862725e-05, + "loss": 5.1284, + "step": 33919 + }, + { + "epoch": 0.2017318488914264, + "grad_norm": 1.5283784866333008, + "learning_rate": 4.514545248360491e-05, + "loss": 5.1214, + "step": 33920 + }, + { + "epoch": 0.20173779617470738, + "grad_norm": 1.643585443496704, + "learning_rate": 4.514517588154998e-05, + "loss": 4.8686, + "step": 33921 + }, + { + "epoch": 0.2017437434579884, + "grad_norm": 1.5718209743499756, + "learning_rate": 4.5144899272462594e-05, + "loss": 4.824, + "step": 33922 + }, + { + "epoch": 0.2017496907412694, + "grad_norm": 1.4388155937194824, + "learning_rate": 4.514462265634283e-05, + "loss": 5.2063, + "step": 33923 + }, + { + "epoch": 0.20175563802455038, + "grad_norm": 1.2471232414245605, + "learning_rate": 4.5144346033190776e-05, + "loss": 5.1597, + "step": 33924 + }, + { + "epoch": 0.2017615853078314, + "grad_norm": 1.626516342163086, + "learning_rate": 4.514406940300655e-05, + "loss": 5.136, + "step": 33925 + }, + { + "epoch": 0.20176753259111238, + "grad_norm": 1.1768821477890015, + "learning_rate": 4.514379276579023e-05, + "loss": 5.1374, + "step": 33926 + }, + { + "epoch": 0.20177347987439337, + "grad_norm": 1.5507917404174805, + "learning_rate": 4.5143516121541926e-05, + "loss": 4.7123, + "step": 33927 + }, + { + "epoch": 0.20177942715767438, + "grad_norm": 1.611994981765747, + "learning_rate": 4.514323947026172e-05, + "loss": 3.8981, + "step": 33928 + }, + { + "epoch": 0.20178537444095537, + "grad_norm": 1.2168185710906982, + "learning_rate": 4.5142962811949724e-05, + "loss": 5.0275, + "step": 33929 + }, + { + "epoch": 0.20179132172423636, + "grad_norm": 1.4680912494659424, + "learning_rate": 4.514268614660603e-05, + "loss": 5.1313, + "step": 33930 + }, + { + "epoch": 0.20179726900751738, + "grad_norm": 1.660117506980896, + "learning_rate": 4.514240947423073e-05, + "loss": 5.1246, + "step": 33931 + }, + { + "epoch": 0.20180321629079837, + "grad_norm": 1.4809633493423462, + "learning_rate": 4.514213279482392e-05, + "loss": 5.2159, + "step": 33932 + }, + { + "epoch": 0.20180916357407935, + "grad_norm": 1.6122835874557495, + "learning_rate": 4.51418561083857e-05, + "loss": 4.674, + "step": 33933 + }, + { + "epoch": 0.20181511085736037, + "grad_norm": 1.5591886043548584, + "learning_rate": 4.5141579414916166e-05, + "loss": 5.2786, + "step": 33934 + }, + { + "epoch": 0.20182105814064136, + "grad_norm": 1.4378422498703003, + "learning_rate": 4.514130271441541e-05, + "loss": 5.3939, + "step": 33935 + }, + { + "epoch": 0.20182700542392235, + "grad_norm": 1.9341799020767212, + "learning_rate": 4.5141026006883543e-05, + "loss": 4.2788, + "step": 33936 + }, + { + "epoch": 0.20183295270720336, + "grad_norm": 1.7629951238632202, + "learning_rate": 4.514074929232065e-05, + "loss": 4.6655, + "step": 33937 + }, + { + "epoch": 0.20183889999048435, + "grad_norm": 2.02024245262146, + "learning_rate": 4.514047257072683e-05, + "loss": 4.1873, + "step": 33938 + }, + { + "epoch": 0.20184484727376534, + "grad_norm": 1.8670521974563599, + "learning_rate": 4.514019584210217e-05, + "loss": 4.762, + "step": 33939 + }, + { + "epoch": 0.20185079455704635, + "grad_norm": 1.35395085811615, + "learning_rate": 4.5139919106446796e-05, + "loss": 4.8601, + "step": 33940 + }, + { + "epoch": 0.20185674184032734, + "grad_norm": 1.3640669584274292, + "learning_rate": 4.5139642363760765e-05, + "loss": 4.6912, + "step": 33941 + }, + { + "epoch": 0.20186268912360833, + "grad_norm": 1.4075101613998413, + "learning_rate": 4.51393656140442e-05, + "loss": 5.0414, + "step": 33942 + }, + { + "epoch": 0.20186863640688935, + "grad_norm": 1.4277760982513428, + "learning_rate": 4.513908885729719e-05, + "loss": 5.2387, + "step": 33943 + }, + { + "epoch": 0.20187458369017033, + "grad_norm": 1.665337085723877, + "learning_rate": 4.5138812093519825e-05, + "loss": 4.6897, + "step": 33944 + }, + { + "epoch": 0.20188053097345132, + "grad_norm": 1.6986275911331177, + "learning_rate": 4.513853532271222e-05, + "loss": 5.1868, + "step": 33945 + }, + { + "epoch": 0.20188647825673234, + "grad_norm": 1.6409507989883423, + "learning_rate": 4.5138258544874455e-05, + "loss": 5.0873, + "step": 33946 + }, + { + "epoch": 0.20189242554001333, + "grad_norm": 1.5691696405410767, + "learning_rate": 4.513798176000663e-05, + "loss": 5.1351, + "step": 33947 + }, + { + "epoch": 0.20189837282329431, + "grad_norm": 1.490713119506836, + "learning_rate": 4.513770496810885e-05, + "loss": 5.1177, + "step": 33948 + }, + { + "epoch": 0.20190432010657533, + "grad_norm": 1.505738377571106, + "learning_rate": 4.51374281691812e-05, + "loss": 5.1999, + "step": 33949 + }, + { + "epoch": 0.20191026738985632, + "grad_norm": 1.6345856189727783, + "learning_rate": 4.5137151363223786e-05, + "loss": 5.1542, + "step": 33950 + }, + { + "epoch": 0.2019162146731373, + "grad_norm": 1.6463525295257568, + "learning_rate": 4.5136874550236696e-05, + "loss": 5.443, + "step": 33951 + }, + { + "epoch": 0.2019221619564183, + "grad_norm": 1.616943359375, + "learning_rate": 4.513659773022003e-05, + "loss": 5.5123, + "step": 33952 + }, + { + "epoch": 0.2019281092396993, + "grad_norm": 1.485422134399414, + "learning_rate": 4.513632090317389e-05, + "loss": 5.2979, + "step": 33953 + }, + { + "epoch": 0.2019340565229803, + "grad_norm": 1.629473328590393, + "learning_rate": 4.513604406909837e-05, + "loss": 5.1169, + "step": 33954 + }, + { + "epoch": 0.2019400038062613, + "grad_norm": 1.6643434762954712, + "learning_rate": 4.513576722799357e-05, + "loss": 4.8612, + "step": 33955 + }, + { + "epoch": 0.2019459510895423, + "grad_norm": 1.694492220878601, + "learning_rate": 4.513549037985957e-05, + "loss": 4.7354, + "step": 33956 + }, + { + "epoch": 0.2019518983728233, + "grad_norm": 1.9222434759140015, + "learning_rate": 4.513521352469648e-05, + "loss": 5.2706, + "step": 33957 + }, + { + "epoch": 0.20195784565610428, + "grad_norm": 1.6370993852615356, + "learning_rate": 4.513493666250439e-05, + "loss": 5.1291, + "step": 33958 + }, + { + "epoch": 0.2019637929393853, + "grad_norm": 1.7546459436416626, + "learning_rate": 4.5134659793283416e-05, + "loss": 5.1042, + "step": 33959 + }, + { + "epoch": 0.20196974022266628, + "grad_norm": 1.6431562900543213, + "learning_rate": 4.513438291703364e-05, + "loss": 4.8202, + "step": 33960 + }, + { + "epoch": 0.20197568750594727, + "grad_norm": 1.6383068561553955, + "learning_rate": 4.513410603375514e-05, + "loss": 5.1127, + "step": 33961 + }, + { + "epoch": 0.2019816347892283, + "grad_norm": 1.5812822580337524, + "learning_rate": 4.513382914344805e-05, + "loss": 5.1321, + "step": 33962 + }, + { + "epoch": 0.20198758207250928, + "grad_norm": 1.462621808052063, + "learning_rate": 4.513355224611244e-05, + "loss": 5.1102, + "step": 33963 + }, + { + "epoch": 0.20199352935579026, + "grad_norm": 1.5409513711929321, + "learning_rate": 4.5133275341748414e-05, + "loss": 5.4329, + "step": 33964 + }, + { + "epoch": 0.20199947663907128, + "grad_norm": 1.2433700561523438, + "learning_rate": 4.513299843035608e-05, + "loss": 4.8886, + "step": 33965 + }, + { + "epoch": 0.20200542392235227, + "grad_norm": 1.6082065105438232, + "learning_rate": 4.513272151193552e-05, + "loss": 4.9496, + "step": 33966 + }, + { + "epoch": 0.20201137120563326, + "grad_norm": 1.6117057800292969, + "learning_rate": 4.513244458648682e-05, + "loss": 4.6895, + "step": 33967 + }, + { + "epoch": 0.20201731848891427, + "grad_norm": 1.5260170698165894, + "learning_rate": 4.513216765401011e-05, + "loss": 4.9767, + "step": 33968 + }, + { + "epoch": 0.20202326577219526, + "grad_norm": 1.6406491994857788, + "learning_rate": 4.513189071450546e-05, + "loss": 5.0693, + "step": 33969 + }, + { + "epoch": 0.20202921305547625, + "grad_norm": 1.4740065336227417, + "learning_rate": 4.5131613767972975e-05, + "loss": 5.2069, + "step": 33970 + }, + { + "epoch": 0.20203516033875726, + "grad_norm": 1.6721255779266357, + "learning_rate": 4.513133681441276e-05, + "loss": 5.0749, + "step": 33971 + }, + { + "epoch": 0.20204110762203825, + "grad_norm": 1.666450023651123, + "learning_rate": 4.513105985382489e-05, + "loss": 5.1417, + "step": 33972 + }, + { + "epoch": 0.20204705490531924, + "grad_norm": 1.6091387271881104, + "learning_rate": 4.5130782886209484e-05, + "loss": 5.0461, + "step": 33973 + }, + { + "epoch": 0.20205300218860026, + "grad_norm": 1.525931715965271, + "learning_rate": 4.5130505911566624e-05, + "loss": 5.3005, + "step": 33974 + }, + { + "epoch": 0.20205894947188124, + "grad_norm": 1.5139743089675903, + "learning_rate": 4.513022892989641e-05, + "loss": 5.2355, + "step": 33975 + }, + { + "epoch": 0.20206489675516223, + "grad_norm": 1.7949497699737549, + "learning_rate": 4.512995194119896e-05, + "loss": 5.0951, + "step": 33976 + }, + { + "epoch": 0.20207084403844325, + "grad_norm": 1.5045291185379028, + "learning_rate": 4.512967494547433e-05, + "loss": 5.158, + "step": 33977 + }, + { + "epoch": 0.20207679132172424, + "grad_norm": 1.7383949756622314, + "learning_rate": 4.512939794272265e-05, + "loss": 5.1696, + "step": 33978 + }, + { + "epoch": 0.20208273860500522, + "grad_norm": 1.7070204019546509, + "learning_rate": 4.5129120932944005e-05, + "loss": 4.844, + "step": 33979 + }, + { + "epoch": 0.20208868588828624, + "grad_norm": 1.4247560501098633, + "learning_rate": 4.512884391613849e-05, + "loss": 5.0324, + "step": 33980 + }, + { + "epoch": 0.20209463317156723, + "grad_norm": 1.5811928510665894, + "learning_rate": 4.5128566892306195e-05, + "loss": 4.8644, + "step": 33981 + }, + { + "epoch": 0.20210058045484822, + "grad_norm": 1.5155131816864014, + "learning_rate": 4.5128289861447235e-05, + "loss": 5.2251, + "step": 33982 + }, + { + "epoch": 0.20210652773812923, + "grad_norm": 1.441920518875122, + "learning_rate": 4.5128012823561697e-05, + "loss": 5.0595, + "step": 33983 + }, + { + "epoch": 0.20211247502141022, + "grad_norm": 1.5248456001281738, + "learning_rate": 4.5127735778649674e-05, + "loss": 4.9004, + "step": 33984 + }, + { + "epoch": 0.2021184223046912, + "grad_norm": 1.614963173866272, + "learning_rate": 4.512745872671126e-05, + "loss": 4.7622, + "step": 33985 + }, + { + "epoch": 0.20212436958797222, + "grad_norm": 1.4812332391738892, + "learning_rate": 4.512718166774657e-05, + "loss": 4.6066, + "step": 33986 + }, + { + "epoch": 0.2021303168712532, + "grad_norm": 1.3561605215072632, + "learning_rate": 4.512690460175568e-05, + "loss": 4.5035, + "step": 33987 + }, + { + "epoch": 0.2021362641545342, + "grad_norm": 1.1321245431900024, + "learning_rate": 4.5126627528738704e-05, + "loss": 4.1359, + "step": 33988 + }, + { + "epoch": 0.20214221143781522, + "grad_norm": 1.7284629344940186, + "learning_rate": 4.512635044869573e-05, + "loss": 4.7861, + "step": 33989 + }, + { + "epoch": 0.2021481587210962, + "grad_norm": 1.4472488164901733, + "learning_rate": 4.512607336162685e-05, + "loss": 5.0742, + "step": 33990 + }, + { + "epoch": 0.2021541060043772, + "grad_norm": 1.425902009010315, + "learning_rate": 4.512579626753216e-05, + "loss": 4.73, + "step": 33991 + }, + { + "epoch": 0.2021600532876582, + "grad_norm": 1.4404271841049194, + "learning_rate": 4.512551916641178e-05, + "loss": 5.0756, + "step": 33992 + }, + { + "epoch": 0.2021660005709392, + "grad_norm": 1.6837798357009888, + "learning_rate": 4.512524205826577e-05, + "loss": 4.7235, + "step": 33993 + }, + { + "epoch": 0.20217194785422019, + "grad_norm": 1.9286775588989258, + "learning_rate": 4.512496494309426e-05, + "loss": 4.9271, + "step": 33994 + }, + { + "epoch": 0.2021778951375012, + "grad_norm": 2.1817314624786377, + "learning_rate": 4.512468782089733e-05, + "loss": 4.5035, + "step": 33995 + }, + { + "epoch": 0.2021838424207822, + "grad_norm": 1.429819107055664, + "learning_rate": 4.512441069167507e-05, + "loss": 4.6988, + "step": 33996 + }, + { + "epoch": 0.20218978970406318, + "grad_norm": 1.3980942964553833, + "learning_rate": 4.512413355542759e-05, + "loss": 5.0089, + "step": 33997 + }, + { + "epoch": 0.2021957369873442, + "grad_norm": 1.4934065341949463, + "learning_rate": 4.512385641215499e-05, + "loss": 5.0263, + "step": 33998 + }, + { + "epoch": 0.20220168427062518, + "grad_norm": 1.3305639028549194, + "learning_rate": 4.5123579261857354e-05, + "loss": 5.1148, + "step": 33999 + }, + { + "epoch": 0.20220763155390617, + "grad_norm": 1.524097204208374, + "learning_rate": 4.512330210453479e-05, + "loss": 4.9961, + "step": 34000 + }, + { + "epoch": 0.20221357883718719, + "grad_norm": 1.5130045413970947, + "learning_rate": 4.512302494018738e-05, + "loss": 5.0517, + "step": 34001 + }, + { + "epoch": 0.20221952612046817, + "grad_norm": 1.4187722206115723, + "learning_rate": 4.512274776881523e-05, + "loss": 5.2811, + "step": 34002 + }, + { + "epoch": 0.20222547340374916, + "grad_norm": 1.3560248613357544, + "learning_rate": 4.5122470590418446e-05, + "loss": 5.1782, + "step": 34003 + }, + { + "epoch": 0.20223142068703018, + "grad_norm": 1.6151503324508667, + "learning_rate": 4.5122193404997115e-05, + "loss": 5.0186, + "step": 34004 + }, + { + "epoch": 0.20223736797031117, + "grad_norm": 1.6382167339324951, + "learning_rate": 4.512191621255133e-05, + "loss": 4.7976, + "step": 34005 + }, + { + "epoch": 0.20224331525359215, + "grad_norm": 1.8903952836990356, + "learning_rate": 4.512163901308118e-05, + "loss": 4.5246, + "step": 34006 + }, + { + "epoch": 0.20224926253687317, + "grad_norm": 1.540955662727356, + "learning_rate": 4.512136180658679e-05, + "loss": 4.7971, + "step": 34007 + }, + { + "epoch": 0.20225520982015416, + "grad_norm": 1.3648852109909058, + "learning_rate": 4.512108459306824e-05, + "loss": 4.9859, + "step": 34008 + }, + { + "epoch": 0.20226115710343515, + "grad_norm": 2.3196678161621094, + "learning_rate": 4.512080737252562e-05, + "loss": 4.4534, + "step": 34009 + }, + { + "epoch": 0.20226710438671613, + "grad_norm": 2.2545480728149414, + "learning_rate": 4.512053014495904e-05, + "loss": 4.256, + "step": 34010 + }, + { + "epoch": 0.20227305166999715, + "grad_norm": 1.7504942417144775, + "learning_rate": 4.512025291036859e-05, + "loss": 5.193, + "step": 34011 + }, + { + "epoch": 0.20227899895327814, + "grad_norm": 1.8206931352615356, + "learning_rate": 4.5119975668754365e-05, + "loss": 4.8621, + "step": 34012 + }, + { + "epoch": 0.20228494623655913, + "grad_norm": 1.5588812828063965, + "learning_rate": 4.5119698420116465e-05, + "loss": 4.8035, + "step": 34013 + }, + { + "epoch": 0.20229089351984014, + "grad_norm": 2.13454532623291, + "learning_rate": 4.511942116445499e-05, + "loss": 4.1108, + "step": 34014 + }, + { + "epoch": 0.20229684080312113, + "grad_norm": 2.353149890899658, + "learning_rate": 4.511914390177002e-05, + "loss": 2.8856, + "step": 34015 + }, + { + "epoch": 0.20230278808640212, + "grad_norm": 1.998806357383728, + "learning_rate": 4.511886663206168e-05, + "loss": 4.9761, + "step": 34016 + }, + { + "epoch": 0.20230873536968313, + "grad_norm": 2.1776490211486816, + "learning_rate": 4.5118589355330045e-05, + "loss": 3.3337, + "step": 34017 + }, + { + "epoch": 0.20231468265296412, + "grad_norm": 1.7580403089523315, + "learning_rate": 4.5118312071575217e-05, + "loss": 4.3142, + "step": 34018 + }, + { + "epoch": 0.2023206299362451, + "grad_norm": 1.6570219993591309, + "learning_rate": 4.51180347807973e-05, + "loss": 4.3844, + "step": 34019 + }, + { + "epoch": 0.20232657721952613, + "grad_norm": 1.995206356048584, + "learning_rate": 4.511775748299638e-05, + "loss": 4.8618, + "step": 34020 + }, + { + "epoch": 0.20233252450280712, + "grad_norm": 3.040178060531616, + "learning_rate": 4.5117480178172555e-05, + "loss": 3.7443, + "step": 34021 + }, + { + "epoch": 0.2023384717860881, + "grad_norm": 1.5867849588394165, + "learning_rate": 4.511720286632593e-05, + "loss": 4.8468, + "step": 34022 + }, + { + "epoch": 0.20234441906936912, + "grad_norm": 1.6994186639785767, + "learning_rate": 4.51169255474566e-05, + "loss": 4.8807, + "step": 34023 + }, + { + "epoch": 0.2023503663526501, + "grad_norm": 1.6643023490905762, + "learning_rate": 4.511664822156465e-05, + "loss": 4.9895, + "step": 34024 + }, + { + "epoch": 0.2023563136359311, + "grad_norm": 2.3625648021698, + "learning_rate": 4.5116370888650195e-05, + "loss": 3.5997, + "step": 34025 + }, + { + "epoch": 0.2023622609192121, + "grad_norm": 2.570551633834839, + "learning_rate": 4.5116093548713324e-05, + "loss": 3.1305, + "step": 34026 + }, + { + "epoch": 0.2023682082024931, + "grad_norm": 2.3419370651245117, + "learning_rate": 4.5115816201754123e-05, + "loss": 3.6007, + "step": 34027 + }, + { + "epoch": 0.2023741554857741, + "grad_norm": 1.8358023166656494, + "learning_rate": 4.511553884777271e-05, + "loss": 4.8102, + "step": 34028 + }, + { + "epoch": 0.2023801027690551, + "grad_norm": 1.8780097961425781, + "learning_rate": 4.511526148676916e-05, + "loss": 4.7336, + "step": 34029 + }, + { + "epoch": 0.2023860500523361, + "grad_norm": 1.93792724609375, + "learning_rate": 4.5114984118743584e-05, + "loss": 4.3768, + "step": 34030 + }, + { + "epoch": 0.20239199733561708, + "grad_norm": 3.4534430503845215, + "learning_rate": 4.511470674369608e-05, + "loss": 3.6955, + "step": 34031 + }, + { + "epoch": 0.2023979446188981, + "grad_norm": 2.6207618713378906, + "learning_rate": 4.511442936162673e-05, + "loss": 3.5511, + "step": 34032 + }, + { + "epoch": 0.20240389190217908, + "grad_norm": 1.6200617551803589, + "learning_rate": 4.5114151972535646e-05, + "loss": 4.5561, + "step": 34033 + }, + { + "epoch": 0.20240983918546007, + "grad_norm": 1.6427030563354492, + "learning_rate": 4.511387457642292e-05, + "loss": 4.9699, + "step": 34034 + }, + { + "epoch": 0.2024157864687411, + "grad_norm": 2.553480863571167, + "learning_rate": 4.511359717328865e-05, + "loss": 3.473, + "step": 34035 + }, + { + "epoch": 0.20242173375202208, + "grad_norm": 2.211226224899292, + "learning_rate": 4.5113319763132924e-05, + "loss": 3.6738, + "step": 34036 + }, + { + "epoch": 0.20242768103530306, + "grad_norm": 2.54076886177063, + "learning_rate": 4.511304234595585e-05, + "loss": 3.5138, + "step": 34037 + }, + { + "epoch": 0.20243362831858408, + "grad_norm": 1.4781157970428467, + "learning_rate": 4.5112764921757524e-05, + "loss": 4.9329, + "step": 34038 + }, + { + "epoch": 0.20243957560186507, + "grad_norm": 2.3486785888671875, + "learning_rate": 4.5112487490538033e-05, + "loss": 3.5019, + "step": 34039 + }, + { + "epoch": 0.20244552288514606, + "grad_norm": 2.678544282913208, + "learning_rate": 4.511221005229748e-05, + "loss": 3.5641, + "step": 34040 + }, + { + "epoch": 0.20245147016842707, + "grad_norm": 2.3444156646728516, + "learning_rate": 4.5111932607035965e-05, + "loss": 3.2577, + "step": 34041 + }, + { + "epoch": 0.20245741745170806, + "grad_norm": 2.382840633392334, + "learning_rate": 4.5111655154753584e-05, + "loss": 3.2974, + "step": 34042 + }, + { + "epoch": 0.20246336473498905, + "grad_norm": 2.189680814743042, + "learning_rate": 4.511137769545043e-05, + "loss": 3.7787, + "step": 34043 + }, + { + "epoch": 0.20246931201827006, + "grad_norm": 2.6685993671417236, + "learning_rate": 4.511110022912661e-05, + "loss": 3.0316, + "step": 34044 + }, + { + "epoch": 0.20247525930155105, + "grad_norm": 2.4069671630859375, + "learning_rate": 4.51108227557822e-05, + "loss": 3.2931, + "step": 34045 + }, + { + "epoch": 0.20248120658483204, + "grad_norm": 2.5283761024475098, + "learning_rate": 4.5110545275417314e-05, + "loss": 3.5309, + "step": 34046 + }, + { + "epoch": 0.20248715386811306, + "grad_norm": 2.511444330215454, + "learning_rate": 4.5110267788032044e-05, + "loss": 3.1936, + "step": 34047 + }, + { + "epoch": 0.20249310115139404, + "grad_norm": 2.352766275405884, + "learning_rate": 4.510999029362649e-05, + "loss": 2.993, + "step": 34048 + }, + { + "epoch": 0.20249904843467503, + "grad_norm": 1.8153971433639526, + "learning_rate": 4.510971279220074e-05, + "loss": 4.5221, + "step": 34049 + }, + { + "epoch": 0.20250499571795605, + "grad_norm": 2.084735155105591, + "learning_rate": 4.510943528375491e-05, + "loss": 4.4174, + "step": 34050 + }, + { + "epoch": 0.20251094300123704, + "grad_norm": 1.5497907400131226, + "learning_rate": 4.510915776828907e-05, + "loss": 4.3793, + "step": 34051 + }, + { + "epoch": 0.20251689028451803, + "grad_norm": 2.8055882453918457, + "learning_rate": 4.510888024580333e-05, + "loss": 2.9198, + "step": 34052 + }, + { + "epoch": 0.20252283756779904, + "grad_norm": 2.464205265045166, + "learning_rate": 4.5108602716297805e-05, + "loss": 2.8499, + "step": 34053 + }, + { + "epoch": 0.20252878485108003, + "grad_norm": 2.158693313598633, + "learning_rate": 4.5108325179772556e-05, + "loss": 4.354, + "step": 34054 + }, + { + "epoch": 0.20253473213436102, + "grad_norm": 1.828605055809021, + "learning_rate": 4.5108047636227715e-05, + "loss": 4.848, + "step": 34055 + }, + { + "epoch": 0.20254067941764203, + "grad_norm": 1.7504563331604004, + "learning_rate": 4.510777008566335e-05, + "loss": 4.8546, + "step": 34056 + }, + { + "epoch": 0.20254662670092302, + "grad_norm": 1.7161656618118286, + "learning_rate": 4.510749252807957e-05, + "loss": 4.8179, + "step": 34057 + }, + { + "epoch": 0.202552573984204, + "grad_norm": 1.610592007637024, + "learning_rate": 4.5107214963476476e-05, + "loss": 4.8187, + "step": 34058 + }, + { + "epoch": 0.20255852126748503, + "grad_norm": 1.55141282081604, + "learning_rate": 4.5106937391854167e-05, + "loss": 4.9095, + "step": 34059 + }, + { + "epoch": 0.202564468550766, + "grad_norm": 1.8562514781951904, + "learning_rate": 4.5106659813212725e-05, + "loss": 4.6624, + "step": 34060 + }, + { + "epoch": 0.202570415834047, + "grad_norm": 2.3251969814300537, + "learning_rate": 4.510638222755226e-05, + "loss": 4.1922, + "step": 34061 + }, + { + "epoch": 0.20257636311732802, + "grad_norm": 1.9926371574401855, + "learning_rate": 4.510610463487286e-05, + "loss": 4.1985, + "step": 34062 + }, + { + "epoch": 0.202582310400609, + "grad_norm": 1.8390743732452393, + "learning_rate": 4.5105827035174634e-05, + "loss": 4.7285, + "step": 34063 + }, + { + "epoch": 0.20258825768389, + "grad_norm": 1.6606966257095337, + "learning_rate": 4.510554942845766e-05, + "loss": 4.9654, + "step": 34064 + }, + { + "epoch": 0.202594204967171, + "grad_norm": 1.6574113368988037, + "learning_rate": 4.510527181472205e-05, + "loss": 5.1899, + "step": 34065 + }, + { + "epoch": 0.202600152250452, + "grad_norm": 2.2811429500579834, + "learning_rate": 4.510499419396791e-05, + "loss": 3.9173, + "step": 34066 + }, + { + "epoch": 0.202606099533733, + "grad_norm": 1.7340202331542969, + "learning_rate": 4.510471656619531e-05, + "loss": 4.7533, + "step": 34067 + }, + { + "epoch": 0.20261204681701397, + "grad_norm": 1.6960166692733765, + "learning_rate": 4.5104438931404366e-05, + "loss": 4.6543, + "step": 34068 + }, + { + "epoch": 0.202617994100295, + "grad_norm": 1.5477968454360962, + "learning_rate": 4.510416128959517e-05, + "loss": 4.913, + "step": 34069 + }, + { + "epoch": 0.20262394138357598, + "grad_norm": 1.810110330581665, + "learning_rate": 4.510388364076782e-05, + "loss": 4.7998, + "step": 34070 + }, + { + "epoch": 0.20262988866685697, + "grad_norm": 2.4154820442199707, + "learning_rate": 4.5103605984922416e-05, + "loss": 3.3427, + "step": 34071 + }, + { + "epoch": 0.20263583595013798, + "grad_norm": 1.464949369430542, + "learning_rate": 4.5103328322059046e-05, + "loss": 4.9067, + "step": 34072 + }, + { + "epoch": 0.20264178323341897, + "grad_norm": 1.944841742515564, + "learning_rate": 4.510305065217781e-05, + "loss": 4.2708, + "step": 34073 + }, + { + "epoch": 0.20264773051669996, + "grad_norm": 1.911776065826416, + "learning_rate": 4.5102772975278805e-05, + "loss": 4.2951, + "step": 34074 + }, + { + "epoch": 0.20265367779998097, + "grad_norm": 2.0011467933654785, + "learning_rate": 4.510249529136213e-05, + "loss": 4.2628, + "step": 34075 + }, + { + "epoch": 0.20265962508326196, + "grad_norm": 1.8548624515533447, + "learning_rate": 4.5102217600427887e-05, + "loss": 3.9783, + "step": 34076 + }, + { + "epoch": 0.20266557236654295, + "grad_norm": 1.7101125717163086, + "learning_rate": 4.510193990247616e-05, + "loss": 4.4064, + "step": 34077 + }, + { + "epoch": 0.20267151964982397, + "grad_norm": 1.4838210344314575, + "learning_rate": 4.510166219750707e-05, + "loss": 5.2525, + "step": 34078 + }, + { + "epoch": 0.20267746693310495, + "grad_norm": 1.4394530057907104, + "learning_rate": 4.510138448552068e-05, + "loss": 5.1155, + "step": 34079 + }, + { + "epoch": 0.20268341421638594, + "grad_norm": 1.5585321187973022, + "learning_rate": 4.510110676651711e-05, + "loss": 5.1436, + "step": 34080 + }, + { + "epoch": 0.20268936149966696, + "grad_norm": 1.3252408504486084, + "learning_rate": 4.510082904049645e-05, + "loss": 5.1799, + "step": 34081 + }, + { + "epoch": 0.20269530878294795, + "grad_norm": 1.5365374088287354, + "learning_rate": 4.51005513074588e-05, + "loss": 4.9753, + "step": 34082 + }, + { + "epoch": 0.20270125606622894, + "grad_norm": 1.5917723178863525, + "learning_rate": 4.510027356740426e-05, + "loss": 4.4666, + "step": 34083 + }, + { + "epoch": 0.20270720334950995, + "grad_norm": 1.5753759145736694, + "learning_rate": 4.509999582033292e-05, + "loss": 5.0185, + "step": 34084 + }, + { + "epoch": 0.20271315063279094, + "grad_norm": 1.7368061542510986, + "learning_rate": 4.5099718066244875e-05, + "loss": 4.8307, + "step": 34085 + }, + { + "epoch": 0.20271909791607193, + "grad_norm": 1.7237951755523682, + "learning_rate": 4.509944030514023e-05, + "loss": 4.736, + "step": 34086 + }, + { + "epoch": 0.20272504519935294, + "grad_norm": 1.337406873703003, + "learning_rate": 4.509916253701907e-05, + "loss": 5.0079, + "step": 34087 + }, + { + "epoch": 0.20273099248263393, + "grad_norm": 1.7189267873764038, + "learning_rate": 4.509888476188151e-05, + "loss": 4.861, + "step": 34088 + }, + { + "epoch": 0.20273693976591492, + "grad_norm": 1.5400909185409546, + "learning_rate": 4.509860697972763e-05, + "loss": 5.025, + "step": 34089 + }, + { + "epoch": 0.20274288704919594, + "grad_norm": 1.5735805034637451, + "learning_rate": 4.509832919055754e-05, + "loss": 5.1001, + "step": 34090 + }, + { + "epoch": 0.20274883433247692, + "grad_norm": 1.5908180475234985, + "learning_rate": 4.5098051394371324e-05, + "loss": 4.0066, + "step": 34091 + }, + { + "epoch": 0.2027547816157579, + "grad_norm": 1.4632060527801514, + "learning_rate": 4.509777359116909e-05, + "loss": 4.9066, + "step": 34092 + }, + { + "epoch": 0.20276072889903893, + "grad_norm": 1.6393321752548218, + "learning_rate": 4.5097495780950926e-05, + "loss": 4.9735, + "step": 34093 + }, + { + "epoch": 0.20276667618231992, + "grad_norm": 1.5407154560089111, + "learning_rate": 4.5097217963716946e-05, + "loss": 5.1568, + "step": 34094 + }, + { + "epoch": 0.2027726234656009, + "grad_norm": 1.3990727663040161, + "learning_rate": 4.509694013946723e-05, + "loss": 5.1463, + "step": 34095 + }, + { + "epoch": 0.20277857074888192, + "grad_norm": 1.9776240587234497, + "learning_rate": 4.509666230820187e-05, + "loss": 4.3307, + "step": 34096 + }, + { + "epoch": 0.2027845180321629, + "grad_norm": 1.737297534942627, + "learning_rate": 4.509638446992098e-05, + "loss": 4.6655, + "step": 34097 + }, + { + "epoch": 0.2027904653154439, + "grad_norm": 1.946708083152771, + "learning_rate": 4.5096106624624647e-05, + "loss": 4.4116, + "step": 34098 + }, + { + "epoch": 0.2027964125987249, + "grad_norm": 1.475724697113037, + "learning_rate": 4.509582877231298e-05, + "loss": 5.2965, + "step": 34099 + }, + { + "epoch": 0.2028023598820059, + "grad_norm": 2.5185513496398926, + "learning_rate": 4.509555091298605e-05, + "loss": 3.333, + "step": 34100 + }, + { + "epoch": 0.2028083071652869, + "grad_norm": 1.4091417789459229, + "learning_rate": 4.5095273046643985e-05, + "loss": 4.9255, + "step": 34101 + }, + { + "epoch": 0.2028142544485679, + "grad_norm": 1.1531707048416138, + "learning_rate": 4.509499517328686e-05, + "loss": 4.8433, + "step": 34102 + }, + { + "epoch": 0.2028202017318489, + "grad_norm": 1.0453072786331177, + "learning_rate": 4.509471729291479e-05, + "loss": 4.6551, + "step": 34103 + }, + { + "epoch": 0.20282614901512988, + "grad_norm": 0.9254010319709778, + "learning_rate": 4.509443940552785e-05, + "loss": 4.619, + "step": 34104 + }, + { + "epoch": 0.2028320962984109, + "grad_norm": 1.067936897277832, + "learning_rate": 4.5094161511126155e-05, + "loss": 4.2688, + "step": 34105 + }, + { + "epoch": 0.20283804358169188, + "grad_norm": 1.2932766675949097, + "learning_rate": 4.50938836097098e-05, + "loss": 4.6168, + "step": 34106 + }, + { + "epoch": 0.20284399086497287, + "grad_norm": 1.522346019744873, + "learning_rate": 4.509360570127887e-05, + "loss": 4.9451, + "step": 34107 + }, + { + "epoch": 0.2028499381482539, + "grad_norm": 1.7008284330368042, + "learning_rate": 4.509332778583347e-05, + "loss": 4.9488, + "step": 34108 + }, + { + "epoch": 0.20285588543153488, + "grad_norm": 1.7703099250793457, + "learning_rate": 4.50930498633737e-05, + "loss": 4.9046, + "step": 34109 + }, + { + "epoch": 0.20286183271481587, + "grad_norm": 1.7272570133209229, + "learning_rate": 4.509277193389965e-05, + "loss": 4.8267, + "step": 34110 + }, + { + "epoch": 0.20286777999809688, + "grad_norm": 1.2832982540130615, + "learning_rate": 4.5092493997411426e-05, + "loss": 4.5058, + "step": 34111 + }, + { + "epoch": 0.20287372728137787, + "grad_norm": 1.063335657119751, + "learning_rate": 4.509221605390912e-05, + "loss": 4.3458, + "step": 34112 + }, + { + "epoch": 0.20287967456465886, + "grad_norm": 0.9658304452896118, + "learning_rate": 4.509193810339283e-05, + "loss": 4.4834, + "step": 34113 + }, + { + "epoch": 0.20288562184793987, + "grad_norm": 1.211989164352417, + "learning_rate": 4.509166014586265e-05, + "loss": 4.3731, + "step": 34114 + }, + { + "epoch": 0.20289156913122086, + "grad_norm": 1.7385507822036743, + "learning_rate": 4.5091382181318675e-05, + "loss": 5.0037, + "step": 34115 + }, + { + "epoch": 0.20289751641450185, + "grad_norm": 1.5110931396484375, + "learning_rate": 4.5091104209761005e-05, + "loss": 5.127, + "step": 34116 + }, + { + "epoch": 0.20290346369778287, + "grad_norm": 1.644289255142212, + "learning_rate": 4.5090826231189745e-05, + "loss": 5.2034, + "step": 34117 + }, + { + "epoch": 0.20290941098106385, + "grad_norm": 1.3904880285263062, + "learning_rate": 4.509054824560498e-05, + "loss": 5.1311, + "step": 34118 + }, + { + "epoch": 0.20291535826434484, + "grad_norm": 1.6756666898727417, + "learning_rate": 4.509027025300682e-05, + "loss": 4.8252, + "step": 34119 + }, + { + "epoch": 0.20292130554762586, + "grad_norm": 1.3861212730407715, + "learning_rate": 4.508999225339534e-05, + "loss": 4.7141, + "step": 34120 + }, + { + "epoch": 0.20292725283090685, + "grad_norm": 1.4065701961517334, + "learning_rate": 4.5089714246770663e-05, + "loss": 4.5625, + "step": 34121 + }, + { + "epoch": 0.20293320011418783, + "grad_norm": 1.336972951889038, + "learning_rate": 4.508943623313288e-05, + "loss": 4.4038, + "step": 34122 + }, + { + "epoch": 0.20293914739746885, + "grad_norm": 1.7632920742034912, + "learning_rate": 4.5089158212482064e-05, + "loss": 5.1773, + "step": 34123 + }, + { + "epoch": 0.20294509468074984, + "grad_norm": 1.5751595497131348, + "learning_rate": 4.508888018481834e-05, + "loss": 4.7093, + "step": 34124 + }, + { + "epoch": 0.20295104196403083, + "grad_norm": 1.4306808710098267, + "learning_rate": 4.5088602150141793e-05, + "loss": 4.8948, + "step": 34125 + }, + { + "epoch": 0.20295698924731181, + "grad_norm": 1.533740758895874, + "learning_rate": 4.5088324108452525e-05, + "loss": 4.8152, + "step": 34126 + }, + { + "epoch": 0.20296293653059283, + "grad_norm": 1.5290772914886475, + "learning_rate": 4.508804605975063e-05, + "loss": 4.6585, + "step": 34127 + }, + { + "epoch": 0.20296888381387382, + "grad_norm": 1.3709888458251953, + "learning_rate": 4.508776800403621e-05, + "loss": 4.7503, + "step": 34128 + }, + { + "epoch": 0.2029748310971548, + "grad_norm": 1.2883923053741455, + "learning_rate": 4.5087489941309356e-05, + "loss": 4.4883, + "step": 34129 + }, + { + "epoch": 0.20298077838043582, + "grad_norm": 1.5060383081436157, + "learning_rate": 4.5087211871570165e-05, + "loss": 4.7866, + "step": 34130 + }, + { + "epoch": 0.2029867256637168, + "grad_norm": 1.5895962715148926, + "learning_rate": 4.5086933794818733e-05, + "loss": 4.8881, + "step": 34131 + }, + { + "epoch": 0.2029926729469978, + "grad_norm": 1.570587396621704, + "learning_rate": 4.5086655711055164e-05, + "loss": 4.7775, + "step": 34132 + }, + { + "epoch": 0.20299862023027881, + "grad_norm": 1.7003437280654907, + "learning_rate": 4.508637762027955e-05, + "loss": 4.9595, + "step": 34133 + }, + { + "epoch": 0.2030045675135598, + "grad_norm": 1.3333162069320679, + "learning_rate": 4.508609952249199e-05, + "loss": 5.1303, + "step": 34134 + }, + { + "epoch": 0.2030105147968408, + "grad_norm": 1.6453673839569092, + "learning_rate": 4.508582141769258e-05, + "loss": 4.1194, + "step": 34135 + }, + { + "epoch": 0.2030164620801218, + "grad_norm": 2.4057064056396484, + "learning_rate": 4.508554330588142e-05, + "loss": 4.0858, + "step": 34136 + }, + { + "epoch": 0.2030224093634028, + "grad_norm": 2.333036184310913, + "learning_rate": 4.508526518705859e-05, + "loss": 4.5886, + "step": 34137 + }, + { + "epoch": 0.20302835664668378, + "grad_norm": 1.5182788372039795, + "learning_rate": 4.5084987061224216e-05, + "loss": 5.316, + "step": 34138 + }, + { + "epoch": 0.2030343039299648, + "grad_norm": 1.2949062585830688, + "learning_rate": 4.5084708928378374e-05, + "loss": 5.1341, + "step": 34139 + }, + { + "epoch": 0.2030402512132458, + "grad_norm": 2.1052892208099365, + "learning_rate": 4.508443078852117e-05, + "loss": 4.5668, + "step": 34140 + }, + { + "epoch": 0.20304619849652678, + "grad_norm": 2.886911153793335, + "learning_rate": 4.50841526416527e-05, + "loss": 3.6236, + "step": 34141 + }, + { + "epoch": 0.2030521457798078, + "grad_norm": 1.5125616788864136, + "learning_rate": 4.5083874487773056e-05, + "loss": 4.6975, + "step": 34142 + }, + { + "epoch": 0.20305809306308878, + "grad_norm": 1.399048089981079, + "learning_rate": 4.5083596326882346e-05, + "loss": 4.2171, + "step": 34143 + }, + { + "epoch": 0.20306404034636977, + "grad_norm": 1.4590729475021362, + "learning_rate": 4.5083318158980656e-05, + "loss": 4.8204, + "step": 34144 + }, + { + "epoch": 0.20306998762965078, + "grad_norm": 1.7433021068572998, + "learning_rate": 4.508303998406809e-05, + "loss": 4.7279, + "step": 34145 + }, + { + "epoch": 0.20307593491293177, + "grad_norm": 1.47339928150177, + "learning_rate": 4.5082761802144736e-05, + "loss": 4.9708, + "step": 34146 + }, + { + "epoch": 0.20308188219621276, + "grad_norm": 2.5525825023651123, + "learning_rate": 4.5082483613210696e-05, + "loss": 4.5048, + "step": 34147 + }, + { + "epoch": 0.20308782947949378, + "grad_norm": 1.897265911102295, + "learning_rate": 4.5082205417266076e-05, + "loss": 4.8667, + "step": 34148 + }, + { + "epoch": 0.20309377676277476, + "grad_norm": 1.443208932876587, + "learning_rate": 4.508192721431096e-05, + "loss": 5.4088, + "step": 34149 + }, + { + "epoch": 0.20309972404605575, + "grad_norm": 2.650792121887207, + "learning_rate": 4.508164900434545e-05, + "loss": 4.1196, + "step": 34150 + }, + { + "epoch": 0.20310567132933677, + "grad_norm": 2.9030683040618896, + "learning_rate": 4.508137078736965e-05, + "loss": 3.1477, + "step": 34151 + }, + { + "epoch": 0.20311161861261776, + "grad_norm": 1.8367629051208496, + "learning_rate": 4.5081092563383645e-05, + "loss": 4.0951, + "step": 34152 + }, + { + "epoch": 0.20311756589589874, + "grad_norm": 1.3655685186386108, + "learning_rate": 4.508081433238754e-05, + "loss": 4.4232, + "step": 34153 + }, + { + "epoch": 0.20312351317917976, + "grad_norm": 1.5286078453063965, + "learning_rate": 4.5080536094381434e-05, + "loss": 4.3104, + "step": 34154 + }, + { + "epoch": 0.20312946046246075, + "grad_norm": 1.593637228012085, + "learning_rate": 4.508025784936542e-05, + "loss": 4.1811, + "step": 34155 + }, + { + "epoch": 0.20313540774574174, + "grad_norm": 1.498099446296692, + "learning_rate": 4.5079979597339586e-05, + "loss": 4.204, + "step": 34156 + }, + { + "epoch": 0.20314135502902275, + "grad_norm": 1.6303921937942505, + "learning_rate": 4.507970133830405e-05, + "loss": 4.3768, + "step": 34157 + }, + { + "epoch": 0.20314730231230374, + "grad_norm": 1.4380861520767212, + "learning_rate": 4.507942307225889e-05, + "loss": 4.4061, + "step": 34158 + }, + { + "epoch": 0.20315324959558473, + "grad_norm": 1.672142744064331, + "learning_rate": 4.5079144799204216e-05, + "loss": 4.8228, + "step": 34159 + }, + { + "epoch": 0.20315919687886574, + "grad_norm": 1.6014958620071411, + "learning_rate": 4.507886651914012e-05, + "loss": 4.7106, + "step": 34160 + }, + { + "epoch": 0.20316514416214673, + "grad_norm": 1.5370984077453613, + "learning_rate": 4.507858823206669e-05, + "loss": 4.8702, + "step": 34161 + }, + { + "epoch": 0.20317109144542772, + "grad_norm": 2.199638605117798, + "learning_rate": 4.507830993798404e-05, + "loss": 3.1011, + "step": 34162 + }, + { + "epoch": 0.20317703872870874, + "grad_norm": 1.298632025718689, + "learning_rate": 4.507803163689226e-05, + "loss": 5.0342, + "step": 34163 + }, + { + "epoch": 0.20318298601198972, + "grad_norm": 1.423470377922058, + "learning_rate": 4.5077753328791446e-05, + "loss": 4.6831, + "step": 34164 + }, + { + "epoch": 0.2031889332952707, + "grad_norm": 1.4942458868026733, + "learning_rate": 4.507747501368169e-05, + "loss": 4.7348, + "step": 34165 + }, + { + "epoch": 0.20319488057855173, + "grad_norm": 1.4068806171417236, + "learning_rate": 4.5077196691563104e-05, + "loss": 4.7561, + "step": 34166 + }, + { + "epoch": 0.20320082786183272, + "grad_norm": 1.4947446584701538, + "learning_rate": 4.5076918362435774e-05, + "loss": 4.6205, + "step": 34167 + }, + { + "epoch": 0.2032067751451137, + "grad_norm": 1.5509511232376099, + "learning_rate": 4.5076640026299794e-05, + "loss": 4.7343, + "step": 34168 + }, + { + "epoch": 0.20321272242839472, + "grad_norm": 1.7500367164611816, + "learning_rate": 4.5076361683155275e-05, + "loss": 4.948, + "step": 34169 + }, + { + "epoch": 0.2032186697116757, + "grad_norm": 1.6232200860977173, + "learning_rate": 4.5076083333002296e-05, + "loss": 5.1769, + "step": 34170 + }, + { + "epoch": 0.2032246169949567, + "grad_norm": 1.635056734085083, + "learning_rate": 4.507580497584097e-05, + "loss": 4.5965, + "step": 34171 + }, + { + "epoch": 0.2032305642782377, + "grad_norm": 1.6716241836547852, + "learning_rate": 4.507552661167138e-05, + "loss": 4.5625, + "step": 34172 + }, + { + "epoch": 0.2032365115615187, + "grad_norm": 1.4650036096572876, + "learning_rate": 4.5075248240493636e-05, + "loss": 4.8342, + "step": 34173 + }, + { + "epoch": 0.2032424588447997, + "grad_norm": 1.595201015472412, + "learning_rate": 4.507496986230784e-05, + "loss": 4.8685, + "step": 34174 + }, + { + "epoch": 0.2032484061280807, + "grad_norm": 1.3592157363891602, + "learning_rate": 4.507469147711406e-05, + "loss": 4.6101, + "step": 34175 + }, + { + "epoch": 0.2032543534113617, + "grad_norm": 1.5765128135681152, + "learning_rate": 4.507441308491242e-05, + "loss": 4.5686, + "step": 34176 + }, + { + "epoch": 0.20326030069464268, + "grad_norm": 1.5563489198684692, + "learning_rate": 4.5074134685703016e-05, + "loss": 5.0364, + "step": 34177 + }, + { + "epoch": 0.2032662479779237, + "grad_norm": 1.671233057975769, + "learning_rate": 4.5073856279485936e-05, + "loss": 4.8082, + "step": 34178 + }, + { + "epoch": 0.20327219526120469, + "grad_norm": 1.494294285774231, + "learning_rate": 4.5073577866261285e-05, + "loss": 5.3636, + "step": 34179 + }, + { + "epoch": 0.20327814254448567, + "grad_norm": 1.52043879032135, + "learning_rate": 4.507329944602915e-05, + "loss": 5.1198, + "step": 34180 + }, + { + "epoch": 0.2032840898277667, + "grad_norm": 2.0095272064208984, + "learning_rate": 4.5073021018789635e-05, + "loss": 4.2995, + "step": 34181 + }, + { + "epoch": 0.20329003711104768, + "grad_norm": 1.7347562313079834, + "learning_rate": 4.507274258454283e-05, + "loss": 4.8881, + "step": 34182 + }, + { + "epoch": 0.20329598439432867, + "grad_norm": 1.5348436832427979, + "learning_rate": 4.5072464143288844e-05, + "loss": 4.6506, + "step": 34183 + }, + { + "epoch": 0.20330193167760965, + "grad_norm": 1.8851455450057983, + "learning_rate": 4.5072185695027766e-05, + "loss": 4.7133, + "step": 34184 + }, + { + "epoch": 0.20330787896089067, + "grad_norm": 1.6985150575637817, + "learning_rate": 4.50719072397597e-05, + "loss": 4.9648, + "step": 34185 + }, + { + "epoch": 0.20331382624417166, + "grad_norm": 2.302384853363037, + "learning_rate": 4.507162877748473e-05, + "loss": 4.061, + "step": 34186 + }, + { + "epoch": 0.20331977352745265, + "grad_norm": 2.0493087768554688, + "learning_rate": 4.507135030820297e-05, + "loss": 4.2728, + "step": 34187 + }, + { + "epoch": 0.20332572081073366, + "grad_norm": 1.9146785736083984, + "learning_rate": 4.5071071831914504e-05, + "loss": 4.6218, + "step": 34188 + }, + { + "epoch": 0.20333166809401465, + "grad_norm": 1.750434160232544, + "learning_rate": 4.507079334861943e-05, + "loss": 4.7743, + "step": 34189 + }, + { + "epoch": 0.20333761537729564, + "grad_norm": 1.74863600730896, + "learning_rate": 4.507051485831786e-05, + "loss": 5.0927, + "step": 34190 + }, + { + "epoch": 0.20334356266057665, + "grad_norm": 1.523288369178772, + "learning_rate": 4.507023636100988e-05, + "loss": 4.9635, + "step": 34191 + }, + { + "epoch": 0.20334950994385764, + "grad_norm": 1.5992393493652344, + "learning_rate": 4.506995785669558e-05, + "loss": 4.9328, + "step": 34192 + }, + { + "epoch": 0.20335545722713863, + "grad_norm": 1.365012764930725, + "learning_rate": 4.5069679345375064e-05, + "loss": 4.9001, + "step": 34193 + }, + { + "epoch": 0.20336140451041965, + "grad_norm": 2.0055863857269287, + "learning_rate": 4.506940082704844e-05, + "loss": 4.8008, + "step": 34194 + }, + { + "epoch": 0.20336735179370063, + "grad_norm": 2.1375856399536133, + "learning_rate": 4.506912230171579e-05, + "loss": 4.1867, + "step": 34195 + }, + { + "epoch": 0.20337329907698162, + "grad_norm": 2.8420791625976562, + "learning_rate": 4.506884376937721e-05, + "loss": 4.1056, + "step": 34196 + }, + { + "epoch": 0.20337924636026264, + "grad_norm": 2.5468852519989014, + "learning_rate": 4.506856523003282e-05, + "loss": 4.1181, + "step": 34197 + }, + { + "epoch": 0.20338519364354363, + "grad_norm": 1.7750333547592163, + "learning_rate": 4.506828668368269e-05, + "loss": 5.1556, + "step": 34198 + }, + { + "epoch": 0.20339114092682462, + "grad_norm": 1.6695621013641357, + "learning_rate": 4.506800813032693e-05, + "loss": 5.8686, + "step": 34199 + }, + { + "epoch": 0.20339708821010563, + "grad_norm": 1.627415418624878, + "learning_rate": 4.506772956996563e-05, + "loss": 5.079, + "step": 34200 + }, + { + "epoch": 0.20340303549338662, + "grad_norm": 1.6603140830993652, + "learning_rate": 4.50674510025989e-05, + "loss": 4.8437, + "step": 34201 + }, + { + "epoch": 0.2034089827766676, + "grad_norm": 1.7012217044830322, + "learning_rate": 4.5067172428226835e-05, + "loss": 5.4832, + "step": 34202 + }, + { + "epoch": 0.20341493005994862, + "grad_norm": 1.4697450399398804, + "learning_rate": 4.506689384684952e-05, + "loss": 5.5992, + "step": 34203 + }, + { + "epoch": 0.2034208773432296, + "grad_norm": 1.4704852104187012, + "learning_rate": 4.506661525846706e-05, + "loss": 5.1104, + "step": 34204 + }, + { + "epoch": 0.2034268246265106, + "grad_norm": 1.6582788228988647, + "learning_rate": 4.5066336663079554e-05, + "loss": 4.9394, + "step": 34205 + }, + { + "epoch": 0.20343277190979162, + "grad_norm": 1.7526438236236572, + "learning_rate": 4.50660580606871e-05, + "loss": 5.2653, + "step": 34206 + }, + { + "epoch": 0.2034387191930726, + "grad_norm": 1.5664905309677124, + "learning_rate": 4.506577945128978e-05, + "loss": 5.5802, + "step": 34207 + }, + { + "epoch": 0.2034446664763536, + "grad_norm": 1.4898643493652344, + "learning_rate": 4.506550083488772e-05, + "loss": 5.606, + "step": 34208 + }, + { + "epoch": 0.2034506137596346, + "grad_norm": 1.4939732551574707, + "learning_rate": 4.5065222211480996e-05, + "loss": 5.4784, + "step": 34209 + }, + { + "epoch": 0.2034565610429156, + "grad_norm": 1.7237370014190674, + "learning_rate": 4.5064943581069705e-05, + "loss": 5.0776, + "step": 34210 + }, + { + "epoch": 0.20346250832619658, + "grad_norm": 1.7304513454437256, + "learning_rate": 4.506466494365395e-05, + "loss": 4.688, + "step": 34211 + }, + { + "epoch": 0.2034684556094776, + "grad_norm": 1.7174402475357056, + "learning_rate": 4.5064386299233826e-05, + "loss": 4.977, + "step": 34212 + }, + { + "epoch": 0.2034744028927586, + "grad_norm": 1.7457705736160278, + "learning_rate": 4.5064107647809436e-05, + "loss": 5.553, + "step": 34213 + }, + { + "epoch": 0.20348035017603958, + "grad_norm": 1.737239956855774, + "learning_rate": 4.5063828989380876e-05, + "loss": 5.4613, + "step": 34214 + }, + { + "epoch": 0.2034862974593206, + "grad_norm": 1.6310393810272217, + "learning_rate": 4.506355032394824e-05, + "loss": 5.51, + "step": 34215 + }, + { + "epoch": 0.20349224474260158, + "grad_norm": 1.660376787185669, + "learning_rate": 4.506327165151162e-05, + "loss": 5.5371, + "step": 34216 + }, + { + "epoch": 0.20349819202588257, + "grad_norm": 1.5626025199890137, + "learning_rate": 4.506299297207113e-05, + "loss": 5.4861, + "step": 34217 + }, + { + "epoch": 0.20350413930916358, + "grad_norm": 1.654665470123291, + "learning_rate": 4.506271428562685e-05, + "loss": 4.5907, + "step": 34218 + }, + { + "epoch": 0.20351008659244457, + "grad_norm": 1.474399447441101, + "learning_rate": 4.506243559217887e-05, + "loss": 5.1529, + "step": 34219 + }, + { + "epoch": 0.20351603387572556, + "grad_norm": 1.4964390993118286, + "learning_rate": 4.506215689172733e-05, + "loss": 5.1153, + "step": 34220 + }, + { + "epoch": 0.20352198115900658, + "grad_norm": 1.9598966836929321, + "learning_rate": 4.506187818427228e-05, + "loss": 4.1903, + "step": 34221 + }, + { + "epoch": 0.20352792844228756, + "grad_norm": 1.8703410625457764, + "learning_rate": 4.506159946981383e-05, + "loss": 4.0715, + "step": 34222 + }, + { + "epoch": 0.20353387572556855, + "grad_norm": 1.718729019165039, + "learning_rate": 4.50613207483521e-05, + "loss": 4.9715, + "step": 34223 + }, + { + "epoch": 0.20353982300884957, + "grad_norm": 1.7516825199127197, + "learning_rate": 4.506104201988716e-05, + "loss": 4.2259, + "step": 34224 + }, + { + "epoch": 0.20354577029213056, + "grad_norm": 1.7814204692840576, + "learning_rate": 4.5060763284419114e-05, + "loss": 5.0681, + "step": 34225 + }, + { + "epoch": 0.20355171757541154, + "grad_norm": 2.5831916332244873, + "learning_rate": 4.506048454194807e-05, + "loss": 4.0293, + "step": 34226 + }, + { + "epoch": 0.20355766485869256, + "grad_norm": 1.8654228448867798, + "learning_rate": 4.506020579247412e-05, + "loss": 4.5101, + "step": 34227 + }, + { + "epoch": 0.20356361214197355, + "grad_norm": 1.6361619234085083, + "learning_rate": 4.5059927035997354e-05, + "loss": 5.4405, + "step": 34228 + }, + { + "epoch": 0.20356955942525454, + "grad_norm": 1.5046677589416504, + "learning_rate": 4.505964827251787e-05, + "loss": 4.8682, + "step": 34229 + }, + { + "epoch": 0.20357550670853555, + "grad_norm": 1.6146504878997803, + "learning_rate": 4.505936950203578e-05, + "loss": 4.345, + "step": 34230 + }, + { + "epoch": 0.20358145399181654, + "grad_norm": 1.7138882875442505, + "learning_rate": 4.5059090724551166e-05, + "loss": 5.3332, + "step": 34231 + }, + { + "epoch": 0.20358740127509753, + "grad_norm": 1.7118430137634277, + "learning_rate": 4.505881194006413e-05, + "loss": 4.8239, + "step": 34232 + }, + { + "epoch": 0.20359334855837855, + "grad_norm": 1.4503207206726074, + "learning_rate": 4.505853314857477e-05, + "loss": 4.7619, + "step": 34233 + }, + { + "epoch": 0.20359929584165953, + "grad_norm": 1.5013114213943481, + "learning_rate": 4.5058254350083185e-05, + "loss": 5.1288, + "step": 34234 + }, + { + "epoch": 0.20360524312494052, + "grad_norm": 1.5356587171554565, + "learning_rate": 4.505797554458947e-05, + "loss": 5.051, + "step": 34235 + }, + { + "epoch": 0.20361119040822154, + "grad_norm": 1.7051646709442139, + "learning_rate": 4.5057696732093724e-05, + "loss": 4.7767, + "step": 34236 + }, + { + "epoch": 0.20361713769150253, + "grad_norm": 2.3335628509521484, + "learning_rate": 4.505741791259605e-05, + "loss": 4.9362, + "step": 34237 + }, + { + "epoch": 0.2036230849747835, + "grad_norm": 1.9061404466629028, + "learning_rate": 4.505713908609653e-05, + "loss": 4.9848, + "step": 34238 + }, + { + "epoch": 0.20362903225806453, + "grad_norm": 1.7989264726638794, + "learning_rate": 4.505686025259527e-05, + "loss": 4.9515, + "step": 34239 + }, + { + "epoch": 0.20363497954134552, + "grad_norm": 2.2343575954437256, + "learning_rate": 4.505658141209237e-05, + "loss": 4.6851, + "step": 34240 + }, + { + "epoch": 0.2036409268246265, + "grad_norm": 1.568610668182373, + "learning_rate": 4.5056302564587924e-05, + "loss": 5.0318, + "step": 34241 + }, + { + "epoch": 0.2036468741079075, + "grad_norm": 1.4426900148391724, + "learning_rate": 4.505602371008203e-05, + "loss": 5.2873, + "step": 34242 + }, + { + "epoch": 0.2036528213911885, + "grad_norm": 1.5718400478363037, + "learning_rate": 4.505574484857478e-05, + "loss": 5.181, + "step": 34243 + }, + { + "epoch": 0.2036587686744695, + "grad_norm": 1.4871337413787842, + "learning_rate": 4.505546598006628e-05, + "loss": 5.275, + "step": 34244 + }, + { + "epoch": 0.2036647159577505, + "grad_norm": 1.4343833923339844, + "learning_rate": 4.505518710455663e-05, + "loss": 5.0945, + "step": 34245 + }, + { + "epoch": 0.2036706632410315, + "grad_norm": 1.514282464981079, + "learning_rate": 4.5054908222045916e-05, + "loss": 5.0487, + "step": 34246 + }, + { + "epoch": 0.2036766105243125, + "grad_norm": 1.5430774688720703, + "learning_rate": 4.5054629332534246e-05, + "loss": 5.0369, + "step": 34247 + }, + { + "epoch": 0.20368255780759348, + "grad_norm": 1.7679804563522339, + "learning_rate": 4.5054350436021706e-05, + "loss": 5.5694, + "step": 34248 + }, + { + "epoch": 0.2036885050908745, + "grad_norm": 1.6152211427688599, + "learning_rate": 4.5054071532508404e-05, + "loss": 5.6343, + "step": 34249 + }, + { + "epoch": 0.20369445237415548, + "grad_norm": 1.6899724006652832, + "learning_rate": 4.505379262199442e-05, + "loss": 5.3575, + "step": 34250 + }, + { + "epoch": 0.20370039965743647, + "grad_norm": 1.8305284976959229, + "learning_rate": 4.505351370447988e-05, + "loss": 5.3694, + "step": 34251 + }, + { + "epoch": 0.2037063469407175, + "grad_norm": 1.396485686302185, + "learning_rate": 4.505323477996486e-05, + "loss": 5.0583, + "step": 34252 + }, + { + "epoch": 0.20371229422399847, + "grad_norm": 1.662598967552185, + "learning_rate": 4.5052955848449465e-05, + "loss": 5.1148, + "step": 34253 + }, + { + "epoch": 0.20371824150727946, + "grad_norm": 1.6300253868103027, + "learning_rate": 4.505267690993378e-05, + "loss": 5.5773, + "step": 34254 + }, + { + "epoch": 0.20372418879056048, + "grad_norm": 1.7322368621826172, + "learning_rate": 4.5052397964417925e-05, + "loss": 5.0738, + "step": 34255 + }, + { + "epoch": 0.20373013607384147, + "grad_norm": 1.4914497137069702, + "learning_rate": 4.5052119011901986e-05, + "loss": 5.1192, + "step": 34256 + }, + { + "epoch": 0.20373608335712245, + "grad_norm": 1.4011354446411133, + "learning_rate": 4.5051840052386044e-05, + "loss": 5.1526, + "step": 34257 + }, + { + "epoch": 0.20374203064040347, + "grad_norm": 1.4619200229644775, + "learning_rate": 4.505156108587022e-05, + "loss": 4.9555, + "step": 34258 + }, + { + "epoch": 0.20374797792368446, + "grad_norm": 1.4376040697097778, + "learning_rate": 4.505128211235461e-05, + "loss": 4.9896, + "step": 34259 + }, + { + "epoch": 0.20375392520696545, + "grad_norm": 1.5649752616882324, + "learning_rate": 4.50510031318393e-05, + "loss": 4.9176, + "step": 34260 + }, + { + "epoch": 0.20375987249024646, + "grad_norm": 1.7832107543945312, + "learning_rate": 4.505072414432439e-05, + "loss": 4.696, + "step": 34261 + }, + { + "epoch": 0.20376581977352745, + "grad_norm": 1.9836961030960083, + "learning_rate": 4.505044514980998e-05, + "loss": 4.6756, + "step": 34262 + }, + { + "epoch": 0.20377176705680844, + "grad_norm": 2.0420374870300293, + "learning_rate": 4.505016614829617e-05, + "loss": 4.991, + "step": 34263 + }, + { + "epoch": 0.20377771434008946, + "grad_norm": 1.666096806526184, + "learning_rate": 4.504988713978305e-05, + "loss": 5.0536, + "step": 34264 + }, + { + "epoch": 0.20378366162337044, + "grad_norm": 1.5408387184143066, + "learning_rate": 4.504960812427072e-05, + "loss": 4.9794, + "step": 34265 + }, + { + "epoch": 0.20378960890665143, + "grad_norm": 2.0508735179901123, + "learning_rate": 4.504932910175929e-05, + "loss": 3.8745, + "step": 34266 + }, + { + "epoch": 0.20379555618993245, + "grad_norm": 2.475095272064209, + "learning_rate": 4.504905007224883e-05, + "loss": 3.7332, + "step": 34267 + }, + { + "epoch": 0.20380150347321344, + "grad_norm": 1.6301664113998413, + "learning_rate": 4.5048771035739466e-05, + "loss": 4.8958, + "step": 34268 + }, + { + "epoch": 0.20380745075649442, + "grad_norm": 1.7478148937225342, + "learning_rate": 4.504849199223128e-05, + "loss": 5.4425, + "step": 34269 + }, + { + "epoch": 0.20381339803977544, + "grad_norm": 1.4645951986312866, + "learning_rate": 4.504821294172438e-05, + "loss": 5.0187, + "step": 34270 + }, + { + "epoch": 0.20381934532305643, + "grad_norm": 1.511397123336792, + "learning_rate": 4.504793388421884e-05, + "loss": 5.0823, + "step": 34271 + }, + { + "epoch": 0.20382529260633742, + "grad_norm": 1.5013232231140137, + "learning_rate": 4.504765481971478e-05, + "loss": 4.9601, + "step": 34272 + }, + { + "epoch": 0.20383123988961843, + "grad_norm": 1.3130029439926147, + "learning_rate": 4.504737574821229e-05, + "loss": 4.9463, + "step": 34273 + }, + { + "epoch": 0.20383718717289942, + "grad_norm": 1.2741039991378784, + "learning_rate": 4.504709666971147e-05, + "loss": 4.9866, + "step": 34274 + }, + { + "epoch": 0.2038431344561804, + "grad_norm": 1.6717923879623413, + "learning_rate": 4.504681758421242e-05, + "loss": 4.8065, + "step": 34275 + }, + { + "epoch": 0.20384908173946142, + "grad_norm": 2.5650248527526855, + "learning_rate": 4.504653849171523e-05, + "loss": 3.2156, + "step": 34276 + }, + { + "epoch": 0.2038550290227424, + "grad_norm": 2.1047005653381348, + "learning_rate": 4.5046259392220006e-05, + "loss": 4.2701, + "step": 34277 + }, + { + "epoch": 0.2038609763060234, + "grad_norm": 1.4460844993591309, + "learning_rate": 4.504598028572683e-05, + "loss": 4.1266, + "step": 34278 + }, + { + "epoch": 0.20386692358930442, + "grad_norm": 1.375220537185669, + "learning_rate": 4.504570117223581e-05, + "loss": 4.6078, + "step": 34279 + }, + { + "epoch": 0.2038728708725854, + "grad_norm": 1.5132031440734863, + "learning_rate": 4.5045422051747046e-05, + "loss": 4.9891, + "step": 34280 + }, + { + "epoch": 0.2038788181558664, + "grad_norm": 1.6141597032546997, + "learning_rate": 4.5045142924260636e-05, + "loss": 4.9529, + "step": 34281 + }, + { + "epoch": 0.2038847654391474, + "grad_norm": 2.2230634689331055, + "learning_rate": 4.504486378977667e-05, + "loss": 4.6287, + "step": 34282 + }, + { + "epoch": 0.2038907127224284, + "grad_norm": 2.391753673553467, + "learning_rate": 4.504458464829525e-05, + "loss": 4.429, + "step": 34283 + }, + { + "epoch": 0.20389666000570938, + "grad_norm": 2.246250867843628, + "learning_rate": 4.504430549981647e-05, + "loss": 4.1301, + "step": 34284 + }, + { + "epoch": 0.2039026072889904, + "grad_norm": 2.15234375, + "learning_rate": 4.504402634434043e-05, + "loss": 3.9196, + "step": 34285 + }, + { + "epoch": 0.2039085545722714, + "grad_norm": 1.6975746154785156, + "learning_rate": 4.504374718186723e-05, + "loss": 5.5317, + "step": 34286 + }, + { + "epoch": 0.20391450185555238, + "grad_norm": 2.212271213531494, + "learning_rate": 4.504346801239696e-05, + "loss": 4.0733, + "step": 34287 + }, + { + "epoch": 0.2039204491388334, + "grad_norm": 1.9130991697311401, + "learning_rate": 4.504318883592973e-05, + "loss": 4.8775, + "step": 34288 + }, + { + "epoch": 0.20392639642211438, + "grad_norm": 1.4307633638381958, + "learning_rate": 4.5042909652465624e-05, + "loss": 4.4828, + "step": 34289 + }, + { + "epoch": 0.20393234370539537, + "grad_norm": 1.3600475788116455, + "learning_rate": 4.504263046200475e-05, + "loss": 4.2668, + "step": 34290 + }, + { + "epoch": 0.20393829098867639, + "grad_norm": 2.528594493865967, + "learning_rate": 4.50423512645472e-05, + "loss": 3.4961, + "step": 34291 + }, + { + "epoch": 0.20394423827195737, + "grad_norm": 2.440265655517578, + "learning_rate": 4.504207206009307e-05, + "loss": 3.3638, + "step": 34292 + }, + { + "epoch": 0.20395018555523836, + "grad_norm": 2.088148832321167, + "learning_rate": 4.5041792848642463e-05, + "loss": 4.19, + "step": 34293 + }, + { + "epoch": 0.20395613283851938, + "grad_norm": 1.9497579336166382, + "learning_rate": 4.504151363019546e-05, + "loss": 4.4115, + "step": 34294 + }, + { + "epoch": 0.20396208012180037, + "grad_norm": 2.358234405517578, + "learning_rate": 4.5041234404752185e-05, + "loss": 4.0831, + "step": 34295 + }, + { + "epoch": 0.20396802740508135, + "grad_norm": 2.400301456451416, + "learning_rate": 4.504095517231273e-05, + "loss": 3.8786, + "step": 34296 + }, + { + "epoch": 0.20397397468836237, + "grad_norm": 2.4365954399108887, + "learning_rate": 4.504067593287717e-05, + "loss": 4.0625, + "step": 34297 + }, + { + "epoch": 0.20397992197164336, + "grad_norm": 1.4779819250106812, + "learning_rate": 4.5040396686445616e-05, + "loss": 5.7232, + "step": 34298 + }, + { + "epoch": 0.20398586925492435, + "grad_norm": 1.4149293899536133, + "learning_rate": 4.504011743301817e-05, + "loss": 5.8752, + "step": 34299 + }, + { + "epoch": 0.20399181653820536, + "grad_norm": 1.4655671119689941, + "learning_rate": 4.5039838172594936e-05, + "loss": 5.1771, + "step": 34300 + }, + { + "epoch": 0.20399776382148635, + "grad_norm": 1.5849196910858154, + "learning_rate": 4.503955890517599e-05, + "loss": 5.6272, + "step": 34301 + }, + { + "epoch": 0.20400371110476734, + "grad_norm": 1.4127448797225952, + "learning_rate": 4.5039279630761445e-05, + "loss": 5.5198, + "step": 34302 + }, + { + "epoch": 0.20400965838804833, + "grad_norm": 2.142515182495117, + "learning_rate": 4.503900034935139e-05, + "loss": 4.8905, + "step": 34303 + }, + { + "epoch": 0.20401560567132934, + "grad_norm": 1.9965078830718994, + "learning_rate": 4.5038721060945935e-05, + "loss": 4.3118, + "step": 34304 + }, + { + "epoch": 0.20402155295461033, + "grad_norm": 1.8355085849761963, + "learning_rate": 4.5038441765545164e-05, + "loss": 4.5507, + "step": 34305 + }, + { + "epoch": 0.20402750023789132, + "grad_norm": 1.819510817527771, + "learning_rate": 4.503816246314918e-05, + "loss": 4.7781, + "step": 34306 + }, + { + "epoch": 0.20403344752117233, + "grad_norm": 1.8253278732299805, + "learning_rate": 4.503788315375809e-05, + "loss": 4.7338, + "step": 34307 + }, + { + "epoch": 0.20403939480445332, + "grad_norm": 1.6907480955123901, + "learning_rate": 4.5037603837371966e-05, + "loss": 4.7709, + "step": 34308 + }, + { + "epoch": 0.2040453420877343, + "grad_norm": 1.4460554122924805, + "learning_rate": 4.503732451399093e-05, + "loss": 5.3781, + "step": 34309 + }, + { + "epoch": 0.20405128937101533, + "grad_norm": 1.8433641195297241, + "learning_rate": 4.503704518361507e-05, + "loss": 5.5734, + "step": 34310 + }, + { + "epoch": 0.20405723665429631, + "grad_norm": 1.6964929103851318, + "learning_rate": 4.503676584624449e-05, + "loss": 5.3381, + "step": 34311 + }, + { + "epoch": 0.2040631839375773, + "grad_norm": 1.965718388557434, + "learning_rate": 4.503648650187927e-05, + "loss": 5.0361, + "step": 34312 + }, + { + "epoch": 0.20406913122085832, + "grad_norm": 1.9891307353973389, + "learning_rate": 4.503620715051953e-05, + "loss": 5.0337, + "step": 34313 + }, + { + "epoch": 0.2040750785041393, + "grad_norm": 1.799054741859436, + "learning_rate": 4.503592779216536e-05, + "loss": 4.4287, + "step": 34314 + }, + { + "epoch": 0.2040810257874203, + "grad_norm": 1.7559428215026855, + "learning_rate": 4.503564842681684e-05, + "loss": 4.4849, + "step": 34315 + }, + { + "epoch": 0.2040869730707013, + "grad_norm": 1.7106789350509644, + "learning_rate": 4.503536905447409e-05, + "loss": 4.5868, + "step": 34316 + }, + { + "epoch": 0.2040929203539823, + "grad_norm": 1.664260745048523, + "learning_rate": 4.50350896751372e-05, + "loss": 5.0689, + "step": 34317 + }, + { + "epoch": 0.2040988676372633, + "grad_norm": 1.566235065460205, + "learning_rate": 4.503481028880627e-05, + "loss": 5.37, + "step": 34318 + }, + { + "epoch": 0.2041048149205443, + "grad_norm": 1.839880108833313, + "learning_rate": 4.503453089548139e-05, + "loss": 5.3544, + "step": 34319 + }, + { + "epoch": 0.2041107622038253, + "grad_norm": 1.5123977661132812, + "learning_rate": 4.5034251495162663e-05, + "loss": 5.0838, + "step": 34320 + }, + { + "epoch": 0.20411670948710628, + "grad_norm": 1.642776608467102, + "learning_rate": 4.5033972087850184e-05, + "loss": 4.7068, + "step": 34321 + }, + { + "epoch": 0.2041226567703873, + "grad_norm": 1.6237605810165405, + "learning_rate": 4.503369267354406e-05, + "loss": 5.2005, + "step": 34322 + }, + { + "epoch": 0.20412860405366828, + "grad_norm": 2.285550355911255, + "learning_rate": 4.503341325224437e-05, + "loss": 4.728, + "step": 34323 + }, + { + "epoch": 0.20413455133694927, + "grad_norm": 3.8627207279205322, + "learning_rate": 4.5033133823951236e-05, + "loss": 3.6855, + "step": 34324 + }, + { + "epoch": 0.2041404986202303, + "grad_norm": 3.054490566253662, + "learning_rate": 4.503285438866473e-05, + "loss": 3.8219, + "step": 34325 + }, + { + "epoch": 0.20414644590351128, + "grad_norm": 2.7683627605438232, + "learning_rate": 4.503257494638497e-05, + "loss": 4.096, + "step": 34326 + }, + { + "epoch": 0.20415239318679226, + "grad_norm": 2.6042439937591553, + "learning_rate": 4.5032295497112035e-05, + "loss": 3.442, + "step": 34327 + }, + { + "epoch": 0.20415834047007328, + "grad_norm": 2.1823248863220215, + "learning_rate": 4.5032016040846045e-05, + "loss": 4.541, + "step": 34328 + }, + { + "epoch": 0.20416428775335427, + "grad_norm": 2.897273540496826, + "learning_rate": 4.5031736577587074e-05, + "loss": 4.0124, + "step": 34329 + }, + { + "epoch": 0.20417023503663526, + "grad_norm": 1.747259259223938, + "learning_rate": 4.503145710733524e-05, + "loss": 4.8034, + "step": 34330 + }, + { + "epoch": 0.20417618231991627, + "grad_norm": 1.7826976776123047, + "learning_rate": 4.5031177630090625e-05, + "loss": 5.2103, + "step": 34331 + }, + { + "epoch": 0.20418212960319726, + "grad_norm": 1.7653011083602905, + "learning_rate": 4.503089814585333e-05, + "loss": 5.1752, + "step": 34332 + }, + { + "epoch": 0.20418807688647825, + "grad_norm": 1.8423312902450562, + "learning_rate": 4.503061865462346e-05, + "loss": 5.1152, + "step": 34333 + }, + { + "epoch": 0.20419402416975926, + "grad_norm": 1.7056430578231812, + "learning_rate": 4.503033915640111e-05, + "loss": 5.1135, + "step": 34334 + }, + { + "epoch": 0.20419997145304025, + "grad_norm": 1.9776579141616821, + "learning_rate": 4.5030059651186376e-05, + "loss": 4.9217, + "step": 34335 + }, + { + "epoch": 0.20420591873632124, + "grad_norm": 1.475510835647583, + "learning_rate": 4.502978013897935e-05, + "loss": 4.7542, + "step": 34336 + }, + { + "epoch": 0.20421186601960226, + "grad_norm": 1.5682835578918457, + "learning_rate": 4.502950061978014e-05, + "loss": 5.1662, + "step": 34337 + }, + { + "epoch": 0.20421781330288324, + "grad_norm": 2.6880135536193848, + "learning_rate": 4.5029221093588836e-05, + "loss": 3.9135, + "step": 34338 + }, + { + "epoch": 0.20422376058616423, + "grad_norm": 2.1446547508239746, + "learning_rate": 4.502894156040553e-05, + "loss": 3.9488, + "step": 34339 + }, + { + "epoch": 0.20422970786944525, + "grad_norm": 1.6459128856658936, + "learning_rate": 4.502866202023034e-05, + "loss": 5.1773, + "step": 34340 + }, + { + "epoch": 0.20423565515272624, + "grad_norm": 1.842704176902771, + "learning_rate": 4.502838247306335e-05, + "loss": 5.0428, + "step": 34341 + }, + { + "epoch": 0.20424160243600722, + "grad_norm": 1.9853084087371826, + "learning_rate": 4.5028102918904644e-05, + "loss": 4.87, + "step": 34342 + }, + { + "epoch": 0.20424754971928824, + "grad_norm": 1.943145990371704, + "learning_rate": 4.502782335775435e-05, + "loss": 4.8169, + "step": 34343 + }, + { + "epoch": 0.20425349700256923, + "grad_norm": 1.7412112951278687, + "learning_rate": 4.502754378961255e-05, + "loss": 4.8134, + "step": 34344 + }, + { + "epoch": 0.20425944428585022, + "grad_norm": 1.7240549325942993, + "learning_rate": 4.502726421447933e-05, + "loss": 5.0885, + "step": 34345 + }, + { + "epoch": 0.20426539156913123, + "grad_norm": 1.4919542074203491, + "learning_rate": 4.502698463235481e-05, + "loss": 5.2307, + "step": 34346 + }, + { + "epoch": 0.20427133885241222, + "grad_norm": 1.32732093334198, + "learning_rate": 4.502670504323907e-05, + "loss": 5.1218, + "step": 34347 + }, + { + "epoch": 0.2042772861356932, + "grad_norm": 1.7667738199234009, + "learning_rate": 4.5026425447132214e-05, + "loss": 4.0633, + "step": 34348 + }, + { + "epoch": 0.20428323341897422, + "grad_norm": 1.4684362411499023, + "learning_rate": 4.502614584403434e-05, + "loss": 5.2084, + "step": 34349 + }, + { + "epoch": 0.2042891807022552, + "grad_norm": 1.3652414083480835, + "learning_rate": 4.5025866233945546e-05, + "loss": 5.2852, + "step": 34350 + }, + { + "epoch": 0.2042951279855362, + "grad_norm": 1.6385377645492554, + "learning_rate": 4.5025586616865926e-05, + "loss": 5.103, + "step": 34351 + }, + { + "epoch": 0.20430107526881722, + "grad_norm": 1.8744497299194336, + "learning_rate": 4.502530699279559e-05, + "loss": 4.42, + "step": 34352 + }, + { + "epoch": 0.2043070225520982, + "grad_norm": 1.8791557550430298, + "learning_rate": 4.502502736173462e-05, + "loss": 4.3255, + "step": 34353 + }, + { + "epoch": 0.2043129698353792, + "grad_norm": 1.8308615684509277, + "learning_rate": 4.502474772368312e-05, + "loss": 4.3123, + "step": 34354 + }, + { + "epoch": 0.2043189171186602, + "grad_norm": 1.897897481918335, + "learning_rate": 4.502446807864118e-05, + "loss": 4.5253, + "step": 34355 + }, + { + "epoch": 0.2043248644019412, + "grad_norm": 1.822041392326355, + "learning_rate": 4.502418842660892e-05, + "loss": 5.4352, + "step": 34356 + }, + { + "epoch": 0.20433081168522219, + "grad_norm": 1.7441822290420532, + "learning_rate": 4.5023908767586416e-05, + "loss": 5.2117, + "step": 34357 + }, + { + "epoch": 0.2043367589685032, + "grad_norm": 1.9075031280517578, + "learning_rate": 4.502362910157377e-05, + "loss": 5.1509, + "step": 34358 + }, + { + "epoch": 0.2043427062517842, + "grad_norm": 1.7022250890731812, + "learning_rate": 4.502334942857108e-05, + "loss": 5.2759, + "step": 34359 + }, + { + "epoch": 0.20434865353506518, + "grad_norm": 1.8610200881958008, + "learning_rate": 4.502306974857845e-05, + "loss": 5.0043, + "step": 34360 + }, + { + "epoch": 0.20435460081834617, + "grad_norm": 1.7256522178649902, + "learning_rate": 4.5022790061595976e-05, + "loss": 5.1169, + "step": 34361 + }, + { + "epoch": 0.20436054810162718, + "grad_norm": 1.9288054704666138, + "learning_rate": 4.502251036762375e-05, + "loss": 5.0786, + "step": 34362 + }, + { + "epoch": 0.20436649538490817, + "grad_norm": 1.950032353401184, + "learning_rate": 4.502223066666187e-05, + "loss": 4.6047, + "step": 34363 + }, + { + "epoch": 0.20437244266818916, + "grad_norm": 1.7432233095169067, + "learning_rate": 4.502195095871044e-05, + "loss": 4.7526, + "step": 34364 + }, + { + "epoch": 0.20437838995147017, + "grad_norm": 2.0420267581939697, + "learning_rate": 4.502167124376955e-05, + "loss": 4.995, + "step": 34365 + }, + { + "epoch": 0.20438433723475116, + "grad_norm": 1.6214263439178467, + "learning_rate": 4.5021391521839304e-05, + "loss": 4.8635, + "step": 34366 + }, + { + "epoch": 0.20439028451803215, + "grad_norm": 1.4966545104980469, + "learning_rate": 4.50211117929198e-05, + "loss": 5.0103, + "step": 34367 + }, + { + "epoch": 0.20439623180131317, + "grad_norm": 1.501697063446045, + "learning_rate": 4.5020832057011127e-05, + "loss": 5.2083, + "step": 34368 + }, + { + "epoch": 0.20440217908459415, + "grad_norm": 1.6379048824310303, + "learning_rate": 4.502055231411339e-05, + "loss": 4.6216, + "step": 34369 + }, + { + "epoch": 0.20440812636787514, + "grad_norm": 1.9041169881820679, + "learning_rate": 4.502027256422668e-05, + "loss": 4.3796, + "step": 34370 + }, + { + "epoch": 0.20441407365115616, + "grad_norm": 1.7477339506149292, + "learning_rate": 4.501999280735111e-05, + "loss": 4.6934, + "step": 34371 + }, + { + "epoch": 0.20442002093443715, + "grad_norm": 1.5829856395721436, + "learning_rate": 4.501971304348676e-05, + "loss": 5.2587, + "step": 34372 + }, + { + "epoch": 0.20442596821771813, + "grad_norm": 1.445803165435791, + "learning_rate": 4.501943327263374e-05, + "loss": 5.1847, + "step": 34373 + }, + { + "epoch": 0.20443191550099915, + "grad_norm": 1.5141373872756958, + "learning_rate": 4.5019153494792144e-05, + "loss": 5.3348, + "step": 34374 + }, + { + "epoch": 0.20443786278428014, + "grad_norm": 1.970505714416504, + "learning_rate": 4.501887370996206e-05, + "loss": 3.957, + "step": 34375 + }, + { + "epoch": 0.20444381006756113, + "grad_norm": 1.7028234004974365, + "learning_rate": 4.5018593918143596e-05, + "loss": 4.4819, + "step": 34376 + }, + { + "epoch": 0.20444975735084214, + "grad_norm": 1.9567445516586304, + "learning_rate": 4.501831411933685e-05, + "loss": 4.3555, + "step": 34377 + }, + { + "epoch": 0.20445570463412313, + "grad_norm": 1.6176704168319702, + "learning_rate": 4.5018034313541925e-05, + "loss": 4.7144, + "step": 34378 + }, + { + "epoch": 0.20446165191740412, + "grad_norm": 1.7398934364318848, + "learning_rate": 4.50177545007589e-05, + "loss": 4.2907, + "step": 34379 + }, + { + "epoch": 0.20446759920068514, + "grad_norm": 1.7774847745895386, + "learning_rate": 4.501747468098789e-05, + "loss": 4.3437, + "step": 34380 + }, + { + "epoch": 0.20447354648396612, + "grad_norm": 1.7353404760360718, + "learning_rate": 4.5017194854228984e-05, + "loss": 4.796, + "step": 34381 + }, + { + "epoch": 0.2044794937672471, + "grad_norm": 1.7452480792999268, + "learning_rate": 4.501691502048227e-05, + "loss": 4.655, + "step": 34382 + }, + { + "epoch": 0.20448544105052813, + "grad_norm": 1.8226715326309204, + "learning_rate": 4.501663517974788e-05, + "loss": 3.9734, + "step": 34383 + }, + { + "epoch": 0.20449138833380912, + "grad_norm": 1.74259352684021, + "learning_rate": 4.501635533202587e-05, + "loss": 4.319, + "step": 34384 + }, + { + "epoch": 0.2044973356170901, + "grad_norm": 1.9349931478500366, + "learning_rate": 4.501607547731637e-05, + "loss": 4.29, + "step": 34385 + }, + { + "epoch": 0.20450328290037112, + "grad_norm": 1.7214958667755127, + "learning_rate": 4.501579561561946e-05, + "loss": 4.3099, + "step": 34386 + }, + { + "epoch": 0.2045092301836521, + "grad_norm": 1.6682885885238647, + "learning_rate": 4.5015515746935246e-05, + "loss": 4.2436, + "step": 34387 + }, + { + "epoch": 0.2045151774669331, + "grad_norm": 1.7586250305175781, + "learning_rate": 4.5015235871263813e-05, + "loss": 4.4776, + "step": 34388 + }, + { + "epoch": 0.2045211247502141, + "grad_norm": 2.00175142288208, + "learning_rate": 4.501495598860528e-05, + "loss": 3.932, + "step": 34389 + }, + { + "epoch": 0.2045270720334951, + "grad_norm": 2.066532611846924, + "learning_rate": 4.501467609895973e-05, + "loss": 4.3484, + "step": 34390 + }, + { + "epoch": 0.2045330193167761, + "grad_norm": 2.031602621078491, + "learning_rate": 4.501439620232726e-05, + "loss": 4.4164, + "step": 34391 + }, + { + "epoch": 0.2045389666000571, + "grad_norm": 1.8264626264572144, + "learning_rate": 4.5014116298707975e-05, + "loss": 4.6198, + "step": 34392 + }, + { + "epoch": 0.2045449138833381, + "grad_norm": 1.8924089670181274, + "learning_rate": 4.501383638810196e-05, + "loss": 4.5491, + "step": 34393 + }, + { + "epoch": 0.20455086116661908, + "grad_norm": 1.8219579458236694, + "learning_rate": 4.501355647050933e-05, + "loss": 4.3047, + "step": 34394 + }, + { + "epoch": 0.2045568084499001, + "grad_norm": 1.65507173538208, + "learning_rate": 4.501327654593017e-05, + "loss": 4.3469, + "step": 34395 + }, + { + "epoch": 0.20456275573318108, + "grad_norm": 1.838722586631775, + "learning_rate": 4.5012996614364584e-05, + "loss": 4.4339, + "step": 34396 + }, + { + "epoch": 0.20456870301646207, + "grad_norm": 1.591939926147461, + "learning_rate": 4.5012716675812664e-05, + "loss": 4.3333, + "step": 34397 + }, + { + "epoch": 0.2045746502997431, + "grad_norm": 1.6647378206253052, + "learning_rate": 4.501243673027452e-05, + "loss": 4.4846, + "step": 34398 + }, + { + "epoch": 0.20458059758302408, + "grad_norm": 1.6199638843536377, + "learning_rate": 4.501215677775023e-05, + "loss": 4.5612, + "step": 34399 + }, + { + "epoch": 0.20458654486630506, + "grad_norm": 1.6180731058120728, + "learning_rate": 4.501187681823991e-05, + "loss": 4.5946, + "step": 34400 + }, + { + "epoch": 0.20459249214958608, + "grad_norm": 1.7323843240737915, + "learning_rate": 4.501159685174365e-05, + "loss": 4.7182, + "step": 34401 + }, + { + "epoch": 0.20459843943286707, + "grad_norm": 1.6931322813034058, + "learning_rate": 4.5011316878261545e-05, + "loss": 4.621, + "step": 34402 + }, + { + "epoch": 0.20460438671614806, + "grad_norm": 1.8126243352890015, + "learning_rate": 4.50110368977937e-05, + "loss": 4.3956, + "step": 34403 + }, + { + "epoch": 0.20461033399942907, + "grad_norm": 1.9750713109970093, + "learning_rate": 4.50107569103402e-05, + "loss": 4.3139, + "step": 34404 + }, + { + "epoch": 0.20461628128271006, + "grad_norm": 1.7892979383468628, + "learning_rate": 4.5010476915901164e-05, + "loss": 4.8248, + "step": 34405 + }, + { + "epoch": 0.20462222856599105, + "grad_norm": 1.7640190124511719, + "learning_rate": 4.501019691447667e-05, + "loss": 4.6024, + "step": 34406 + }, + { + "epoch": 0.20462817584927206, + "grad_norm": 1.7281399965286255, + "learning_rate": 4.500991690606682e-05, + "loss": 4.5826, + "step": 34407 + }, + { + "epoch": 0.20463412313255305, + "grad_norm": 1.7226693630218506, + "learning_rate": 4.500963689067173e-05, + "loss": 4.4828, + "step": 34408 + }, + { + "epoch": 0.20464007041583404, + "grad_norm": 1.9858465194702148, + "learning_rate": 4.5009356868291464e-05, + "loss": 4.4441, + "step": 34409 + }, + { + "epoch": 0.20464601769911506, + "grad_norm": 2.349806547164917, + "learning_rate": 4.500907683892615e-05, + "loss": 3.6723, + "step": 34410 + }, + { + "epoch": 0.20465196498239605, + "grad_norm": 2.7743988037109375, + "learning_rate": 4.500879680257587e-05, + "loss": 3.2242, + "step": 34411 + }, + { + "epoch": 0.20465791226567703, + "grad_norm": 3.5897703170776367, + "learning_rate": 4.5008516759240725e-05, + "loss": 3.8626, + "step": 34412 + }, + { + "epoch": 0.20466385954895805, + "grad_norm": 3.0878939628601074, + "learning_rate": 4.5008236708920816e-05, + "loss": 4.1804, + "step": 34413 + }, + { + "epoch": 0.20466980683223904, + "grad_norm": 2.4697554111480713, + "learning_rate": 4.500795665161623e-05, + "loss": 3.9456, + "step": 34414 + }, + { + "epoch": 0.20467575411552003, + "grad_norm": 1.8446409702301025, + "learning_rate": 4.500767658732708e-05, + "loss": 4.5432, + "step": 34415 + }, + { + "epoch": 0.20468170139880104, + "grad_norm": 2.3806347846984863, + "learning_rate": 4.5007396516053455e-05, + "loss": 4.9125, + "step": 34416 + }, + { + "epoch": 0.20468764868208203, + "grad_norm": 2.268231153488159, + "learning_rate": 4.500711643779546e-05, + "loss": 4.2447, + "step": 34417 + }, + { + "epoch": 0.20469359596536302, + "grad_norm": 2.3919286727905273, + "learning_rate": 4.500683635255318e-05, + "loss": 3.3776, + "step": 34418 + }, + { + "epoch": 0.204699543248644, + "grad_norm": 2.2718050479888916, + "learning_rate": 4.500655626032673e-05, + "loss": 3.3755, + "step": 34419 + }, + { + "epoch": 0.20470549053192502, + "grad_norm": 1.688220739364624, + "learning_rate": 4.500627616111619e-05, + "loss": 4.782, + "step": 34420 + }, + { + "epoch": 0.204711437815206, + "grad_norm": 1.78768789768219, + "learning_rate": 4.500599605492166e-05, + "loss": 4.8989, + "step": 34421 + }, + { + "epoch": 0.204717385098487, + "grad_norm": 2.1640124320983887, + "learning_rate": 4.5005715941743255e-05, + "loss": 4.5965, + "step": 34422 + }, + { + "epoch": 0.20472333238176801, + "grad_norm": 1.9868354797363281, + "learning_rate": 4.5005435821581054e-05, + "loss": 4.6511, + "step": 34423 + }, + { + "epoch": 0.204729279665049, + "grad_norm": 2.1577179431915283, + "learning_rate": 4.500515569443516e-05, + "loss": 4.6006, + "step": 34424 + }, + { + "epoch": 0.20473522694833, + "grad_norm": 2.440462589263916, + "learning_rate": 4.500487556030568e-05, + "loss": 4.4556, + "step": 34425 + }, + { + "epoch": 0.204741174231611, + "grad_norm": 2.115889072418213, + "learning_rate": 4.500459541919271e-05, + "loss": 4.6232, + "step": 34426 + }, + { + "epoch": 0.204747121514892, + "grad_norm": 2.1383614540100098, + "learning_rate": 4.500431527109633e-05, + "loss": 4.709, + "step": 34427 + }, + { + "epoch": 0.20475306879817298, + "grad_norm": 2.339848041534424, + "learning_rate": 4.500403511601665e-05, + "loss": 4.6746, + "step": 34428 + }, + { + "epoch": 0.204759016081454, + "grad_norm": 2.115816831588745, + "learning_rate": 4.500375495395378e-05, + "loss": 4.7467, + "step": 34429 + }, + { + "epoch": 0.204764963364735, + "grad_norm": 2.461951494216919, + "learning_rate": 4.5003474784907796e-05, + "loss": 4.6304, + "step": 34430 + }, + { + "epoch": 0.20477091064801597, + "grad_norm": 2.1366448402404785, + "learning_rate": 4.5003194608878806e-05, + "loss": 4.7499, + "step": 34431 + }, + { + "epoch": 0.204776857931297, + "grad_norm": 1.7789580821990967, + "learning_rate": 4.5002914425866916e-05, + "loss": 4.491, + "step": 34432 + }, + { + "epoch": 0.20478280521457798, + "grad_norm": 1.984432578086853, + "learning_rate": 4.50026342358722e-05, + "loss": 4.4704, + "step": 34433 + }, + { + "epoch": 0.20478875249785897, + "grad_norm": 2.3361284732818604, + "learning_rate": 4.500235403889479e-05, + "loss": 4.5737, + "step": 34434 + }, + { + "epoch": 0.20479469978113998, + "grad_norm": 2.1250100135803223, + "learning_rate": 4.500207383493475e-05, + "loss": 4.3715, + "step": 34435 + }, + { + "epoch": 0.20480064706442097, + "grad_norm": 1.9171262979507446, + "learning_rate": 4.500179362399219e-05, + "loss": 4.5543, + "step": 34436 + }, + { + "epoch": 0.20480659434770196, + "grad_norm": 1.9183216094970703, + "learning_rate": 4.5001513406067224e-05, + "loss": 4.6798, + "step": 34437 + }, + { + "epoch": 0.20481254163098297, + "grad_norm": 2.0602195262908936, + "learning_rate": 4.500123318115993e-05, + "loss": 4.2771, + "step": 34438 + }, + { + "epoch": 0.20481848891426396, + "grad_norm": 1.8789846897125244, + "learning_rate": 4.5000952949270414e-05, + "loss": 3.779, + "step": 34439 + }, + { + "epoch": 0.20482443619754495, + "grad_norm": 1.7185890674591064, + "learning_rate": 4.5000672710398775e-05, + "loss": 3.7439, + "step": 34440 + }, + { + "epoch": 0.20483038348082597, + "grad_norm": 1.758641004562378, + "learning_rate": 4.50003924645451e-05, + "loss": 3.8585, + "step": 34441 + }, + { + "epoch": 0.20483633076410696, + "grad_norm": 1.8966751098632812, + "learning_rate": 4.50001122117095e-05, + "loss": 4.1861, + "step": 34442 + }, + { + "epoch": 0.20484227804738794, + "grad_norm": 2.1946234703063965, + "learning_rate": 4.4999831951892056e-05, + "loss": 4.2264, + "step": 34443 + }, + { + "epoch": 0.20484822533066896, + "grad_norm": 1.9412094354629517, + "learning_rate": 4.4999551685092893e-05, + "loss": 4.5887, + "step": 34444 + }, + { + "epoch": 0.20485417261394995, + "grad_norm": 1.4762779474258423, + "learning_rate": 4.4999271411312086e-05, + "loss": 4.7418, + "step": 34445 + }, + { + "epoch": 0.20486011989723094, + "grad_norm": 1.520828127861023, + "learning_rate": 4.499899113054974e-05, + "loss": 4.6643, + "step": 34446 + }, + { + "epoch": 0.20486606718051195, + "grad_norm": 1.7773675918579102, + "learning_rate": 4.4998710842805955e-05, + "loss": 4.8711, + "step": 34447 + }, + { + "epoch": 0.20487201446379294, + "grad_norm": 1.8981387615203857, + "learning_rate": 4.499843054808082e-05, + "loss": 4.4935, + "step": 34448 + }, + { + "epoch": 0.20487796174707393, + "grad_norm": 2.329200029373169, + "learning_rate": 4.499815024637445e-05, + "loss": 4.5589, + "step": 34449 + }, + { + "epoch": 0.20488390903035494, + "grad_norm": 3.6857268810272217, + "learning_rate": 4.4997869937686926e-05, + "loss": 3.6962, + "step": 34450 + }, + { + "epoch": 0.20488985631363593, + "grad_norm": 2.840702533721924, + "learning_rate": 4.4997589622018354e-05, + "loss": 4.3877, + "step": 34451 + }, + { + "epoch": 0.20489580359691692, + "grad_norm": 2.9141292572021484, + "learning_rate": 4.499730929936883e-05, + "loss": 4.1279, + "step": 34452 + }, + { + "epoch": 0.20490175088019794, + "grad_norm": 2.270629405975342, + "learning_rate": 4.4997028969738456e-05, + "loss": 3.9257, + "step": 34453 + }, + { + "epoch": 0.20490769816347892, + "grad_norm": 1.5963208675384521, + "learning_rate": 4.499674863312732e-05, + "loss": 4.7587, + "step": 34454 + }, + { + "epoch": 0.2049136454467599, + "grad_norm": 1.8655664920806885, + "learning_rate": 4.499646828953552e-05, + "loss": 5.1353, + "step": 34455 + }, + { + "epoch": 0.20491959273004093, + "grad_norm": 1.582878589630127, + "learning_rate": 4.499618793896317e-05, + "loss": 5.1033, + "step": 34456 + }, + { + "epoch": 0.20492554001332192, + "grad_norm": 1.8107730150222778, + "learning_rate": 4.499590758141035e-05, + "loss": 4.9176, + "step": 34457 + }, + { + "epoch": 0.2049314872966029, + "grad_norm": 1.7597856521606445, + "learning_rate": 4.499562721687717e-05, + "loss": 5.1321, + "step": 34458 + }, + { + "epoch": 0.20493743457988392, + "grad_norm": 2.249377489089966, + "learning_rate": 4.4995346845363715e-05, + "loss": 3.7656, + "step": 34459 + }, + { + "epoch": 0.2049433818631649, + "grad_norm": 2.7382209300994873, + "learning_rate": 4.4995066466870106e-05, + "loss": 3.8454, + "step": 34460 + }, + { + "epoch": 0.2049493291464459, + "grad_norm": 1.8879845142364502, + "learning_rate": 4.499478608139641e-05, + "loss": 5.0165, + "step": 34461 + }, + { + "epoch": 0.2049552764297269, + "grad_norm": 1.58770751953125, + "learning_rate": 4.4994505688942757e-05, + "loss": 5.0374, + "step": 34462 + }, + { + "epoch": 0.2049612237130079, + "grad_norm": 1.4101433753967285, + "learning_rate": 4.499422528950922e-05, + "loss": 4.8346, + "step": 34463 + }, + { + "epoch": 0.2049671709962889, + "grad_norm": 1.5214909315109253, + "learning_rate": 4.49939448830959e-05, + "loss": 5.0864, + "step": 34464 + }, + { + "epoch": 0.2049731182795699, + "grad_norm": 1.444889783859253, + "learning_rate": 4.4993664469702914e-05, + "loss": 5.2856, + "step": 34465 + }, + { + "epoch": 0.2049790655628509, + "grad_norm": 1.7581872940063477, + "learning_rate": 4.4993384049330336e-05, + "loss": 4.9138, + "step": 34466 + }, + { + "epoch": 0.20498501284613188, + "grad_norm": 1.5535180568695068, + "learning_rate": 4.4993103621978275e-05, + "loss": 4.6956, + "step": 34467 + }, + { + "epoch": 0.2049909601294129, + "grad_norm": 1.653976559638977, + "learning_rate": 4.4992823187646824e-05, + "loss": 4.6417, + "step": 34468 + }, + { + "epoch": 0.20499690741269389, + "grad_norm": 1.23905611038208, + "learning_rate": 4.499254274633609e-05, + "loss": 4.7466, + "step": 34469 + }, + { + "epoch": 0.20500285469597487, + "grad_norm": 1.6507742404937744, + "learning_rate": 4.499226229804617e-05, + "loss": 4.6249, + "step": 34470 + }, + { + "epoch": 0.2050088019792559, + "grad_norm": 1.6019675731658936, + "learning_rate": 4.4991981842777156e-05, + "loss": 4.7097, + "step": 34471 + }, + { + "epoch": 0.20501474926253688, + "grad_norm": 2.959336519241333, + "learning_rate": 4.4991701380529146e-05, + "loss": 3.2528, + "step": 34472 + }, + { + "epoch": 0.20502069654581787, + "grad_norm": 1.8020458221435547, + "learning_rate": 4.499142091130224e-05, + "loss": 4.6609, + "step": 34473 + }, + { + "epoch": 0.20502664382909888, + "grad_norm": 1.6824336051940918, + "learning_rate": 4.499114043509653e-05, + "loss": 4.5646, + "step": 34474 + }, + { + "epoch": 0.20503259111237987, + "grad_norm": 1.5131100416183472, + "learning_rate": 4.4990859951912124e-05, + "loss": 4.6802, + "step": 34475 + }, + { + "epoch": 0.20503853839566086, + "grad_norm": 1.7208993434906006, + "learning_rate": 4.499057946174911e-05, + "loss": 4.7112, + "step": 34476 + }, + { + "epoch": 0.20504448567894185, + "grad_norm": 1.701457142829895, + "learning_rate": 4.49902989646076e-05, + "loss": 4.6948, + "step": 34477 + }, + { + "epoch": 0.20505043296222286, + "grad_norm": 1.5838974714279175, + "learning_rate": 4.4990018460487683e-05, + "loss": 4.6722, + "step": 34478 + }, + { + "epoch": 0.20505638024550385, + "grad_norm": 1.7028863430023193, + "learning_rate": 4.4989737949389454e-05, + "loss": 4.4673, + "step": 34479 + }, + { + "epoch": 0.20506232752878484, + "grad_norm": 1.5195289850234985, + "learning_rate": 4.498945743131302e-05, + "loss": 4.5802, + "step": 34480 + }, + { + "epoch": 0.20506827481206585, + "grad_norm": 1.2915374040603638, + "learning_rate": 4.498917690625847e-05, + "loss": 4.4341, + "step": 34481 + }, + { + "epoch": 0.20507422209534684, + "grad_norm": 1.3090250492095947, + "learning_rate": 4.49888963742259e-05, + "loss": 4.7476, + "step": 34482 + }, + { + "epoch": 0.20508016937862783, + "grad_norm": 1.5086677074432373, + "learning_rate": 4.498861583521541e-05, + "loss": 4.5027, + "step": 34483 + }, + { + "epoch": 0.20508611666190885, + "grad_norm": 1.279787540435791, + "learning_rate": 4.4988335289227104e-05, + "loss": 4.6363, + "step": 34484 + }, + { + "epoch": 0.20509206394518983, + "grad_norm": 1.5406779050827026, + "learning_rate": 4.498805473626107e-05, + "loss": 4.834, + "step": 34485 + }, + { + "epoch": 0.20509801122847082, + "grad_norm": 1.5772720575332642, + "learning_rate": 4.4987774176317435e-05, + "loss": 4.5374, + "step": 34486 + }, + { + "epoch": 0.20510395851175184, + "grad_norm": 1.5194119215011597, + "learning_rate": 4.498749360939626e-05, + "loss": 4.5295, + "step": 34487 + }, + { + "epoch": 0.20510990579503283, + "grad_norm": 1.4598057270050049, + "learning_rate": 4.498721303549766e-05, + "loss": 4.45, + "step": 34488 + }, + { + "epoch": 0.20511585307831381, + "grad_norm": 1.4096565246582031, + "learning_rate": 4.498693245462172e-05, + "loss": 4.5477, + "step": 34489 + }, + { + "epoch": 0.20512180036159483, + "grad_norm": 1.7675260305404663, + "learning_rate": 4.498665186676856e-05, + "loss": 4.599, + "step": 34490 + }, + { + "epoch": 0.20512774764487582, + "grad_norm": 1.5095406770706177, + "learning_rate": 4.498637127193826e-05, + "loss": 4.9472, + "step": 34491 + }, + { + "epoch": 0.2051336949281568, + "grad_norm": 1.389299750328064, + "learning_rate": 4.498609067013093e-05, + "loss": 4.625, + "step": 34492 + }, + { + "epoch": 0.20513964221143782, + "grad_norm": 1.573198676109314, + "learning_rate": 4.498581006134666e-05, + "loss": 4.5716, + "step": 34493 + }, + { + "epoch": 0.2051455894947188, + "grad_norm": 1.655664324760437, + "learning_rate": 4.498552944558554e-05, + "loss": 4.6848, + "step": 34494 + }, + { + "epoch": 0.2051515367779998, + "grad_norm": 1.6053812503814697, + "learning_rate": 4.498524882284769e-05, + "loss": 4.6676, + "step": 34495 + }, + { + "epoch": 0.20515748406128081, + "grad_norm": 1.6616476774215698, + "learning_rate": 4.49849681931332e-05, + "loss": 5.0148, + "step": 34496 + }, + { + "epoch": 0.2051634313445618, + "grad_norm": 2.0143654346466064, + "learning_rate": 4.4984687556442155e-05, + "loss": 4.4989, + "step": 34497 + }, + { + "epoch": 0.2051693786278428, + "grad_norm": 1.833949089050293, + "learning_rate": 4.498440691277467e-05, + "loss": 4.3259, + "step": 34498 + }, + { + "epoch": 0.2051753259111238, + "grad_norm": 1.5047730207443237, + "learning_rate": 4.4984126262130825e-05, + "loss": 4.6325, + "step": 34499 + }, + { + "epoch": 0.2051812731944048, + "grad_norm": 1.6812539100646973, + "learning_rate": 4.4983845604510736e-05, + "loss": 4.3358, + "step": 34500 + }, + { + "epoch": 0.20518722047768578, + "grad_norm": 1.4969497919082642, + "learning_rate": 4.498356493991449e-05, + "loss": 4.7152, + "step": 34501 + }, + { + "epoch": 0.2051931677609668, + "grad_norm": 1.9464809894561768, + "learning_rate": 4.498328426834218e-05, + "loss": 4.7395, + "step": 34502 + }, + { + "epoch": 0.2051991150442478, + "grad_norm": 1.6289222240447998, + "learning_rate": 4.498300358979393e-05, + "loss": 4.9731, + "step": 34503 + }, + { + "epoch": 0.20520506232752878, + "grad_norm": 1.6166993379592896, + "learning_rate": 4.49827229042698e-05, + "loss": 5.0576, + "step": 34504 + }, + { + "epoch": 0.2052110096108098, + "grad_norm": 1.5953607559204102, + "learning_rate": 4.4982442211769926e-05, + "loss": 5.2747, + "step": 34505 + }, + { + "epoch": 0.20521695689409078, + "grad_norm": 1.8515969514846802, + "learning_rate": 4.4982161512294374e-05, + "loss": 5.2296, + "step": 34506 + }, + { + "epoch": 0.20522290417737177, + "grad_norm": 1.594019889831543, + "learning_rate": 4.498188080584326e-05, + "loss": 5.1967, + "step": 34507 + }, + { + "epoch": 0.20522885146065278, + "grad_norm": 1.6203807592391968, + "learning_rate": 4.498160009241668e-05, + "loss": 4.9829, + "step": 34508 + }, + { + "epoch": 0.20523479874393377, + "grad_norm": 1.5720133781433105, + "learning_rate": 4.498131937201473e-05, + "loss": 4.8464, + "step": 34509 + }, + { + "epoch": 0.20524074602721476, + "grad_norm": 1.3742187023162842, + "learning_rate": 4.498103864463751e-05, + "loss": 4.8129, + "step": 34510 + }, + { + "epoch": 0.20524669331049578, + "grad_norm": 1.432989478111267, + "learning_rate": 4.498075791028511e-05, + "loss": 4.7693, + "step": 34511 + }, + { + "epoch": 0.20525264059377676, + "grad_norm": 1.66667640209198, + "learning_rate": 4.498047716895763e-05, + "loss": 4.9305, + "step": 34512 + }, + { + "epoch": 0.20525858787705775, + "grad_norm": 1.795190691947937, + "learning_rate": 4.4980196420655185e-05, + "loss": 4.6456, + "step": 34513 + }, + { + "epoch": 0.20526453516033877, + "grad_norm": 2.04221510887146, + "learning_rate": 4.4979915665377855e-05, + "loss": 5.2033, + "step": 34514 + }, + { + "epoch": 0.20527048244361976, + "grad_norm": 2.103423595428467, + "learning_rate": 4.497963490312574e-05, + "loss": 5.1652, + "step": 34515 + }, + { + "epoch": 0.20527642972690074, + "grad_norm": 1.7535483837127686, + "learning_rate": 4.497935413389894e-05, + "loss": 4.9614, + "step": 34516 + }, + { + "epoch": 0.20528237701018176, + "grad_norm": 1.7065552473068237, + "learning_rate": 4.497907335769756e-05, + "loss": 4.8857, + "step": 34517 + }, + { + "epoch": 0.20528832429346275, + "grad_norm": 1.8153327703475952, + "learning_rate": 4.497879257452168e-05, + "loss": 4.9539, + "step": 34518 + }, + { + "epoch": 0.20529427157674374, + "grad_norm": 3.195288896560669, + "learning_rate": 4.497851178437142e-05, + "loss": 2.5544, + "step": 34519 + }, + { + "epoch": 0.20530021886002475, + "grad_norm": 2.209017276763916, + "learning_rate": 4.497823098724687e-05, + "loss": 3.4065, + "step": 34520 + }, + { + "epoch": 0.20530616614330574, + "grad_norm": 1.9605618715286255, + "learning_rate": 4.497795018314812e-05, + "loss": 4.4923, + "step": 34521 + }, + { + "epoch": 0.20531211342658673, + "grad_norm": 1.7516566514968872, + "learning_rate": 4.497766937207528e-05, + "loss": 4.3444, + "step": 34522 + }, + { + "epoch": 0.20531806070986774, + "grad_norm": 1.7743666172027588, + "learning_rate": 4.497738855402843e-05, + "loss": 4.8758, + "step": 34523 + }, + { + "epoch": 0.20532400799314873, + "grad_norm": 1.9033406972885132, + "learning_rate": 4.497710772900769e-05, + "loss": 4.8746, + "step": 34524 + }, + { + "epoch": 0.20532995527642972, + "grad_norm": 1.7820247411727905, + "learning_rate": 4.497682689701315e-05, + "loss": 4.4355, + "step": 34525 + }, + { + "epoch": 0.20533590255971074, + "grad_norm": 1.929594874382019, + "learning_rate": 4.49765460580449e-05, + "loss": 4.4736, + "step": 34526 + }, + { + "epoch": 0.20534184984299172, + "grad_norm": 1.8177223205566406, + "learning_rate": 4.497626521210304e-05, + "loss": 4.3102, + "step": 34527 + }, + { + "epoch": 0.2053477971262727, + "grad_norm": 1.7658437490463257, + "learning_rate": 4.497598435918768e-05, + "loss": 4.3177, + "step": 34528 + }, + { + "epoch": 0.20535374440955373, + "grad_norm": 2.5987002849578857, + "learning_rate": 4.497570349929891e-05, + "loss": 3.2716, + "step": 34529 + }, + { + "epoch": 0.20535969169283472, + "grad_norm": 2.0609161853790283, + "learning_rate": 4.497542263243683e-05, + "loss": 3.2153, + "step": 34530 + }, + { + "epoch": 0.2053656389761157, + "grad_norm": 1.6488370895385742, + "learning_rate": 4.497514175860153e-05, + "loss": 4.782, + "step": 34531 + }, + { + "epoch": 0.20537158625939672, + "grad_norm": 1.972416877746582, + "learning_rate": 4.497486087779312e-05, + "loss": 5.144, + "step": 34532 + }, + { + "epoch": 0.2053775335426777, + "grad_norm": 1.8627492189407349, + "learning_rate": 4.497457999001169e-05, + "loss": 5.2683, + "step": 34533 + }, + { + "epoch": 0.2053834808259587, + "grad_norm": 2.3711135387420654, + "learning_rate": 4.497429909525733e-05, + "loss": 3.4938, + "step": 34534 + }, + { + "epoch": 0.20538942810923969, + "grad_norm": 1.9512462615966797, + "learning_rate": 4.4974018193530165e-05, + "loss": 4.6567, + "step": 34535 + }, + { + "epoch": 0.2053953753925207, + "grad_norm": 2.878932476043701, + "learning_rate": 4.497373728483027e-05, + "loss": 3.7388, + "step": 34536 + }, + { + "epoch": 0.2054013226758017, + "grad_norm": 1.8661671876907349, + "learning_rate": 4.497345636915775e-05, + "loss": 4.3109, + "step": 34537 + }, + { + "epoch": 0.20540726995908268, + "grad_norm": 1.4568744897842407, + "learning_rate": 4.49731754465127e-05, + "loss": 4.6756, + "step": 34538 + }, + { + "epoch": 0.2054132172423637, + "grad_norm": 1.2677067518234253, + "learning_rate": 4.497289451689522e-05, + "loss": 4.5082, + "step": 34539 + }, + { + "epoch": 0.20541916452564468, + "grad_norm": 1.466506004333496, + "learning_rate": 4.497261358030542e-05, + "loss": 4.5668, + "step": 34540 + }, + { + "epoch": 0.20542511180892567, + "grad_norm": 1.3032081127166748, + "learning_rate": 4.497233263674338e-05, + "loss": 4.5761, + "step": 34541 + }, + { + "epoch": 0.20543105909220669, + "grad_norm": 1.3104262351989746, + "learning_rate": 4.49720516862092e-05, + "loss": 4.555, + "step": 34542 + }, + { + "epoch": 0.20543700637548767, + "grad_norm": 1.2286089658737183, + "learning_rate": 4.497177072870299e-05, + "loss": 4.5533, + "step": 34543 + }, + { + "epoch": 0.20544295365876866, + "grad_norm": 1.5584557056427002, + "learning_rate": 4.4971489764224836e-05, + "loss": 4.4975, + "step": 34544 + }, + { + "epoch": 0.20544890094204968, + "grad_norm": 1.4209648370742798, + "learning_rate": 4.497120879277485e-05, + "loss": 4.5484, + "step": 34545 + }, + { + "epoch": 0.20545484822533067, + "grad_norm": 1.4486362934112549, + "learning_rate": 4.4970927814353116e-05, + "loss": 4.5861, + "step": 34546 + }, + { + "epoch": 0.20546079550861165, + "grad_norm": 1.3003538846969604, + "learning_rate": 4.4970646828959736e-05, + "loss": 4.6317, + "step": 34547 + }, + { + "epoch": 0.20546674279189267, + "grad_norm": 1.624047040939331, + "learning_rate": 4.497036583659481e-05, + "loss": 4.8953, + "step": 34548 + }, + { + "epoch": 0.20547269007517366, + "grad_norm": 1.9646494388580322, + "learning_rate": 4.497008483725844e-05, + "loss": 4.8024, + "step": 34549 + }, + { + "epoch": 0.20547863735845465, + "grad_norm": 2.3605401515960693, + "learning_rate": 4.496980383095071e-05, + "loss": 5.2422, + "step": 34550 + }, + { + "epoch": 0.20548458464173566, + "grad_norm": 2.1505236625671387, + "learning_rate": 4.496952281767174e-05, + "loss": 4.0803, + "step": 34551 + }, + { + "epoch": 0.20549053192501665, + "grad_norm": 1.591495156288147, + "learning_rate": 4.49692417974216e-05, + "loss": 4.2671, + "step": 34552 + }, + { + "epoch": 0.20549647920829764, + "grad_norm": 1.3606783151626587, + "learning_rate": 4.496896077020042e-05, + "loss": 4.4897, + "step": 34553 + }, + { + "epoch": 0.20550242649157865, + "grad_norm": 2.085550308227539, + "learning_rate": 4.496867973600827e-05, + "loss": 5.0765, + "step": 34554 + }, + { + "epoch": 0.20550837377485964, + "grad_norm": 2.3612008094787598, + "learning_rate": 4.496839869484527e-05, + "loss": 5.0725, + "step": 34555 + }, + { + "epoch": 0.20551432105814063, + "grad_norm": 2.228905439376831, + "learning_rate": 4.4968117646711506e-05, + "loss": 5.139, + "step": 34556 + }, + { + "epoch": 0.20552026834142165, + "grad_norm": 1.8952064514160156, + "learning_rate": 4.496783659160707e-05, + "loss": 5.0891, + "step": 34557 + }, + { + "epoch": 0.20552621562470264, + "grad_norm": 1.8927900791168213, + "learning_rate": 4.496755552953208e-05, + "loss": 4.9674, + "step": 34558 + }, + { + "epoch": 0.20553216290798362, + "grad_norm": 1.776397466659546, + "learning_rate": 4.496727446048662e-05, + "loss": 4.9904, + "step": 34559 + }, + { + "epoch": 0.20553811019126464, + "grad_norm": 1.8404251337051392, + "learning_rate": 4.496699338447078e-05, + "loss": 4.92, + "step": 34560 + }, + { + "epoch": 0.20554405747454563, + "grad_norm": 1.8732088804244995, + "learning_rate": 4.4966712301484685e-05, + "loss": 5.0232, + "step": 34561 + }, + { + "epoch": 0.20555000475782662, + "grad_norm": 1.7764228582382202, + "learning_rate": 4.49664312115284e-05, + "loss": 4.9615, + "step": 34562 + }, + { + "epoch": 0.20555595204110763, + "grad_norm": 1.8939924240112305, + "learning_rate": 4.496615011460206e-05, + "loss": 4.9276, + "step": 34563 + }, + { + "epoch": 0.20556189932438862, + "grad_norm": 1.7696598768234253, + "learning_rate": 4.496586901070573e-05, + "loss": 4.8614, + "step": 34564 + }, + { + "epoch": 0.2055678466076696, + "grad_norm": 1.7702951431274414, + "learning_rate": 4.496558789983952e-05, + "loss": 4.8973, + "step": 34565 + }, + { + "epoch": 0.20557379389095062, + "grad_norm": 1.6990092992782593, + "learning_rate": 4.4965306782003535e-05, + "loss": 4.8359, + "step": 34566 + }, + { + "epoch": 0.2055797411742316, + "grad_norm": 1.6071676015853882, + "learning_rate": 4.496502565719787e-05, + "loss": 4.9214, + "step": 34567 + }, + { + "epoch": 0.2055856884575126, + "grad_norm": 1.7637958526611328, + "learning_rate": 4.4964744525422615e-05, + "loss": 4.8521, + "step": 34568 + }, + { + "epoch": 0.20559163574079362, + "grad_norm": 1.728930115699768, + "learning_rate": 4.496446338667788e-05, + "loss": 4.7967, + "step": 34569 + }, + { + "epoch": 0.2055975830240746, + "grad_norm": 1.597816824913025, + "learning_rate": 4.4964182240963746e-05, + "loss": 4.8614, + "step": 34570 + }, + { + "epoch": 0.2056035303073556, + "grad_norm": 1.9850479364395142, + "learning_rate": 4.496390108828033e-05, + "loss": 4.4582, + "step": 34571 + }, + { + "epoch": 0.2056094775906366, + "grad_norm": 2.1062679290771484, + "learning_rate": 4.496361992862772e-05, + "loss": 5.2632, + "step": 34572 + }, + { + "epoch": 0.2056154248739176, + "grad_norm": 1.8323718309402466, + "learning_rate": 4.496333876200602e-05, + "loss": 5.1667, + "step": 34573 + }, + { + "epoch": 0.20562137215719858, + "grad_norm": 1.64948308467865, + "learning_rate": 4.496305758841532e-05, + "loss": 5.0581, + "step": 34574 + }, + { + "epoch": 0.2056273194404796, + "grad_norm": 1.5759135484695435, + "learning_rate": 4.496277640785573e-05, + "loss": 5.0197, + "step": 34575 + }, + { + "epoch": 0.2056332667237606, + "grad_norm": 1.7933998107910156, + "learning_rate": 4.496249522032734e-05, + "loss": 4.5411, + "step": 34576 + }, + { + "epoch": 0.20563921400704158, + "grad_norm": 2.153806447982788, + "learning_rate": 4.496221402583024e-05, + "loss": 4.3102, + "step": 34577 + }, + { + "epoch": 0.2056451612903226, + "grad_norm": 2.2217137813568115, + "learning_rate": 4.496193282436455e-05, + "loss": 3.6145, + "step": 34578 + }, + { + "epoch": 0.20565110857360358, + "grad_norm": 2.1557652950286865, + "learning_rate": 4.496165161593035e-05, + "loss": 4.4684, + "step": 34579 + }, + { + "epoch": 0.20565705585688457, + "grad_norm": 1.7254935503005981, + "learning_rate": 4.496137040052773e-05, + "loss": 5.1948, + "step": 34580 + }, + { + "epoch": 0.20566300314016558, + "grad_norm": 1.663953185081482, + "learning_rate": 4.496108917815682e-05, + "loss": 5.0254, + "step": 34581 + }, + { + "epoch": 0.20566895042344657, + "grad_norm": 1.463458776473999, + "learning_rate": 4.4960807948817695e-05, + "loss": 4.8313, + "step": 34582 + }, + { + "epoch": 0.20567489770672756, + "grad_norm": 2.062659740447998, + "learning_rate": 4.4960526712510456e-05, + "loss": 4.5435, + "step": 34583 + }, + { + "epoch": 0.20568084499000858, + "grad_norm": 2.5706655979156494, + "learning_rate": 4.4960245469235206e-05, + "loss": 3.9599, + "step": 34584 + }, + { + "epoch": 0.20568679227328956, + "grad_norm": 1.9307024478912354, + "learning_rate": 4.495996421899204e-05, + "loss": 4.274, + "step": 34585 + }, + { + "epoch": 0.20569273955657055, + "grad_norm": 1.5417910814285278, + "learning_rate": 4.4959682961781056e-05, + "loss": 4.8377, + "step": 34586 + }, + { + "epoch": 0.20569868683985157, + "grad_norm": 1.7265223264694214, + "learning_rate": 4.4959401697602354e-05, + "loss": 4.7724, + "step": 34587 + }, + { + "epoch": 0.20570463412313256, + "grad_norm": 1.8895970582962036, + "learning_rate": 4.4959120426456026e-05, + "loss": 4.2129, + "step": 34588 + }, + { + "epoch": 0.20571058140641355, + "grad_norm": 1.9905476570129395, + "learning_rate": 4.495883914834218e-05, + "loss": 4.8612, + "step": 34589 + }, + { + "epoch": 0.20571652868969456, + "grad_norm": 1.8988146781921387, + "learning_rate": 4.495855786326091e-05, + "loss": 4.8655, + "step": 34590 + }, + { + "epoch": 0.20572247597297555, + "grad_norm": 1.9545416831970215, + "learning_rate": 4.495827657121231e-05, + "loss": 4.724, + "step": 34591 + }, + { + "epoch": 0.20572842325625654, + "grad_norm": 1.9015052318572998, + "learning_rate": 4.495799527219648e-05, + "loss": 4.8043, + "step": 34592 + }, + { + "epoch": 0.20573437053953753, + "grad_norm": 1.8014155626296997, + "learning_rate": 4.495771396621353e-05, + "loss": 4.6953, + "step": 34593 + }, + { + "epoch": 0.20574031782281854, + "grad_norm": 1.8659149408340454, + "learning_rate": 4.495743265326354e-05, + "loss": 4.5499, + "step": 34594 + }, + { + "epoch": 0.20574626510609953, + "grad_norm": 1.817817211151123, + "learning_rate": 4.495715133334662e-05, + "loss": 4.8158, + "step": 34595 + }, + { + "epoch": 0.20575221238938052, + "grad_norm": 2.685556411743164, + "learning_rate": 4.495687000646286e-05, + "loss": 3.3173, + "step": 34596 + }, + { + "epoch": 0.20575815967266153, + "grad_norm": 1.909069299697876, + "learning_rate": 4.495658867261237e-05, + "loss": 4.7473, + "step": 34597 + }, + { + "epoch": 0.20576410695594252, + "grad_norm": 1.9138617515563965, + "learning_rate": 4.495630733179524e-05, + "loss": 4.7425, + "step": 34598 + }, + { + "epoch": 0.2057700542392235, + "grad_norm": 1.8828593492507935, + "learning_rate": 4.495602598401156e-05, + "loss": 4.6867, + "step": 34599 + }, + { + "epoch": 0.20577600152250453, + "grad_norm": 1.8820278644561768, + "learning_rate": 4.4955744629261455e-05, + "loss": 4.6925, + "step": 34600 + }, + { + "epoch": 0.20578194880578551, + "grad_norm": 1.833011269569397, + "learning_rate": 4.495546326754499e-05, + "loss": 4.6976, + "step": 34601 + }, + { + "epoch": 0.2057878960890665, + "grad_norm": 1.8647735118865967, + "learning_rate": 4.4955181898862284e-05, + "loss": 4.7463, + "step": 34602 + }, + { + "epoch": 0.20579384337234752, + "grad_norm": 1.8595027923583984, + "learning_rate": 4.495490052321343e-05, + "loss": 4.8303, + "step": 34603 + }, + { + "epoch": 0.2057997906556285, + "grad_norm": 1.90599524974823, + "learning_rate": 4.495461914059853e-05, + "loss": 4.5668, + "step": 34604 + }, + { + "epoch": 0.2058057379389095, + "grad_norm": 1.754970908164978, + "learning_rate": 4.4954337751017675e-05, + "loss": 4.4404, + "step": 34605 + }, + { + "epoch": 0.2058116852221905, + "grad_norm": 1.9905904531478882, + "learning_rate": 4.4954056354470966e-05, + "loss": 4.2951, + "step": 34606 + }, + { + "epoch": 0.2058176325054715, + "grad_norm": 1.8667941093444824, + "learning_rate": 4.49537749509585e-05, + "loss": 4.5369, + "step": 34607 + }, + { + "epoch": 0.2058235797887525, + "grad_norm": 1.883657455444336, + "learning_rate": 4.4953493540480384e-05, + "loss": 4.751, + "step": 34608 + }, + { + "epoch": 0.2058295270720335, + "grad_norm": 1.8843159675598145, + "learning_rate": 4.49532121230367e-05, + "loss": 4.7321, + "step": 34609 + }, + { + "epoch": 0.2058354743553145, + "grad_norm": 1.9638304710388184, + "learning_rate": 4.495293069862756e-05, + "loss": 4.7074, + "step": 34610 + }, + { + "epoch": 0.20584142163859548, + "grad_norm": 1.865575909614563, + "learning_rate": 4.4952649267253065e-05, + "loss": 4.7712, + "step": 34611 + }, + { + "epoch": 0.2058473689218765, + "grad_norm": 1.781821608543396, + "learning_rate": 4.49523678289133e-05, + "loss": 5.0408, + "step": 34612 + }, + { + "epoch": 0.20585331620515748, + "grad_norm": 1.908469796180725, + "learning_rate": 4.495208638360837e-05, + "loss": 5.2425, + "step": 34613 + }, + { + "epoch": 0.20585926348843847, + "grad_norm": 1.6282795667648315, + "learning_rate": 4.495180493133837e-05, + "loss": 4.9834, + "step": 34614 + }, + { + "epoch": 0.2058652107717195, + "grad_norm": 1.9788955450057983, + "learning_rate": 4.495152347210341e-05, + "loss": 4.3966, + "step": 34615 + }, + { + "epoch": 0.20587115805500047, + "grad_norm": 1.7629202604293823, + "learning_rate": 4.495124200590357e-05, + "loss": 4.8765, + "step": 34616 + }, + { + "epoch": 0.20587710533828146, + "grad_norm": 1.6397231817245483, + "learning_rate": 4.495096053273896e-05, + "loss": 4.8986, + "step": 34617 + }, + { + "epoch": 0.20588305262156248, + "grad_norm": 2.045825481414795, + "learning_rate": 4.4950679052609676e-05, + "loss": 4.2573, + "step": 34618 + }, + { + "epoch": 0.20588899990484347, + "grad_norm": 2.2558441162109375, + "learning_rate": 4.495039756551582e-05, + "loss": 4.7439, + "step": 34619 + }, + { + "epoch": 0.20589494718812446, + "grad_norm": 1.8919719457626343, + "learning_rate": 4.495011607145748e-05, + "loss": 4.2334, + "step": 34620 + }, + { + "epoch": 0.20590089447140547, + "grad_norm": 1.94011652469635, + "learning_rate": 4.4949834570434766e-05, + "loss": 4.5599, + "step": 34621 + }, + { + "epoch": 0.20590684175468646, + "grad_norm": 1.7179418802261353, + "learning_rate": 4.494955306244777e-05, + "loss": 4.3644, + "step": 34622 + }, + { + "epoch": 0.20591278903796745, + "grad_norm": 1.7644224166870117, + "learning_rate": 4.494927154749659e-05, + "loss": 4.5259, + "step": 34623 + }, + { + "epoch": 0.20591873632124846, + "grad_norm": 1.7699881792068481, + "learning_rate": 4.494899002558133e-05, + "loss": 4.3367, + "step": 34624 + }, + { + "epoch": 0.20592468360452945, + "grad_norm": 1.8351449966430664, + "learning_rate": 4.494870849670207e-05, + "loss": 4.3265, + "step": 34625 + }, + { + "epoch": 0.20593063088781044, + "grad_norm": 1.7717596292495728, + "learning_rate": 4.494842696085894e-05, + "loss": 4.6112, + "step": 34626 + }, + { + "epoch": 0.20593657817109146, + "grad_norm": 1.813578486442566, + "learning_rate": 4.494814541805201e-05, + "loss": 4.2206, + "step": 34627 + }, + { + "epoch": 0.20594252545437244, + "grad_norm": 1.9783309698104858, + "learning_rate": 4.494786386828139e-05, + "loss": 4.3013, + "step": 34628 + }, + { + "epoch": 0.20594847273765343, + "grad_norm": 2.3097033500671387, + "learning_rate": 4.4947582311547175e-05, + "loss": 2.6195, + "step": 34629 + }, + { + "epoch": 0.20595442002093445, + "grad_norm": 2.7069907188415527, + "learning_rate": 4.4947300747849464e-05, + "loss": 2.0729, + "step": 34630 + }, + { + "epoch": 0.20596036730421544, + "grad_norm": 3.0851471424102783, + "learning_rate": 4.494701917718836e-05, + "loss": 3.1709, + "step": 34631 + }, + { + "epoch": 0.20596631458749642, + "grad_norm": 2.7397348880767822, + "learning_rate": 4.494673759956396e-05, + "loss": 2.7993, + "step": 34632 + }, + { + "epoch": 0.20597226187077744, + "grad_norm": 1.8076826333999634, + "learning_rate": 4.4946456014976356e-05, + "loss": 3.9575, + "step": 34633 + }, + { + "epoch": 0.20597820915405843, + "grad_norm": 1.8170053958892822, + "learning_rate": 4.494617442342565e-05, + "loss": 4.4885, + "step": 34634 + }, + { + "epoch": 0.20598415643733942, + "grad_norm": 2.324014186859131, + "learning_rate": 4.494589282491194e-05, + "loss": 1.5578, + "step": 34635 + }, + { + "epoch": 0.20599010372062043, + "grad_norm": 3.2078003883361816, + "learning_rate": 4.4945611219435326e-05, + "loss": 2.1456, + "step": 34636 + }, + { + "epoch": 0.20599605100390142, + "grad_norm": 2.683720827102661, + "learning_rate": 4.49453296069959e-05, + "loss": 2.3518, + "step": 34637 + }, + { + "epoch": 0.2060019982871824, + "grad_norm": 2.6722137928009033, + "learning_rate": 4.494504798759378e-05, + "loss": 2.8547, + "step": 34638 + }, + { + "epoch": 0.20600794557046342, + "grad_norm": 2.827223539352417, + "learning_rate": 4.494476636122904e-05, + "loss": 2.045, + "step": 34639 + }, + { + "epoch": 0.2060138928537444, + "grad_norm": 1.9061033725738525, + "learning_rate": 4.494448472790179e-05, + "loss": 4.0358, + "step": 34640 + }, + { + "epoch": 0.2060198401370254, + "grad_norm": 1.6916943788528442, + "learning_rate": 4.4944203087612125e-05, + "loss": 4.9196, + "step": 34641 + }, + { + "epoch": 0.20602578742030642, + "grad_norm": 1.6783428192138672, + "learning_rate": 4.494392144036014e-05, + "loss": 5.0467, + "step": 34642 + }, + { + "epoch": 0.2060317347035874, + "grad_norm": 2.8061981201171875, + "learning_rate": 4.494363978614594e-05, + "loss": 3.4047, + "step": 34643 + }, + { + "epoch": 0.2060376819868684, + "grad_norm": 2.3837738037109375, + "learning_rate": 4.4943358124969634e-05, + "loss": 3.3825, + "step": 34644 + }, + { + "epoch": 0.2060436292701494, + "grad_norm": 1.9662126302719116, + "learning_rate": 4.49430764568313e-05, + "loss": 5.0622, + "step": 34645 + }, + { + "epoch": 0.2060495765534304, + "grad_norm": 1.7634485960006714, + "learning_rate": 4.494279478173103e-05, + "loss": 5.2412, + "step": 34646 + }, + { + "epoch": 0.20605552383671139, + "grad_norm": 1.5668162107467651, + "learning_rate": 4.494251309966896e-05, + "loss": 5.1638, + "step": 34647 + }, + { + "epoch": 0.2060614711199924, + "grad_norm": 1.904152512550354, + "learning_rate": 4.4942231410645154e-05, + "loss": 4.5813, + "step": 34648 + }, + { + "epoch": 0.2060674184032734, + "grad_norm": 1.5555822849273682, + "learning_rate": 4.4941949714659714e-05, + "loss": 4.9484, + "step": 34649 + }, + { + "epoch": 0.20607336568655438, + "grad_norm": 1.6825793981552124, + "learning_rate": 4.4941668011712754e-05, + "loss": 4.9136, + "step": 34650 + }, + { + "epoch": 0.20607931296983537, + "grad_norm": 1.7734719514846802, + "learning_rate": 4.494138630180437e-05, + "loss": 4.5992, + "step": 34651 + }, + { + "epoch": 0.20608526025311638, + "grad_norm": 1.7022820711135864, + "learning_rate": 4.494110458493464e-05, + "loss": 4.5084, + "step": 34652 + }, + { + "epoch": 0.20609120753639737, + "grad_norm": 1.6979751586914062, + "learning_rate": 4.4940822861103684e-05, + "loss": 5.0131, + "step": 34653 + }, + { + "epoch": 0.20609715481967836, + "grad_norm": 2.1307291984558105, + "learning_rate": 4.494054113031159e-05, + "loss": 4.6225, + "step": 34654 + }, + { + "epoch": 0.20610310210295937, + "grad_norm": 1.5449354648590088, + "learning_rate": 4.494025939255846e-05, + "loss": 4.8424, + "step": 34655 + }, + { + "epoch": 0.20610904938624036, + "grad_norm": 1.623788833618164, + "learning_rate": 4.493997764784439e-05, + "loss": 4.9259, + "step": 34656 + }, + { + "epoch": 0.20611499666952135, + "grad_norm": 1.5342544317245483, + "learning_rate": 4.4939695896169483e-05, + "loss": 4.7405, + "step": 34657 + }, + { + "epoch": 0.20612094395280237, + "grad_norm": 2.769473075866699, + "learning_rate": 4.493941413753383e-05, + "loss": 3.4611, + "step": 34658 + }, + { + "epoch": 0.20612689123608335, + "grad_norm": 1.6174215078353882, + "learning_rate": 4.493913237193753e-05, + "loss": 4.5891, + "step": 34659 + }, + { + "epoch": 0.20613283851936434, + "grad_norm": 1.6780261993408203, + "learning_rate": 4.49388505993807e-05, + "loss": 4.5571, + "step": 34660 + }, + { + "epoch": 0.20613878580264536, + "grad_norm": 1.503455638885498, + "learning_rate": 4.493856881986341e-05, + "loss": 4.7737, + "step": 34661 + }, + { + "epoch": 0.20614473308592635, + "grad_norm": 1.4152953624725342, + "learning_rate": 4.493828703338577e-05, + "loss": 4.2979, + "step": 34662 + }, + { + "epoch": 0.20615068036920733, + "grad_norm": 1.6800228357315063, + "learning_rate": 4.4938005239947896e-05, + "loss": 4.4647, + "step": 34663 + }, + { + "epoch": 0.20615662765248835, + "grad_norm": 1.8049077987670898, + "learning_rate": 4.4937723439549857e-05, + "loss": 4.7413, + "step": 34664 + }, + { + "epoch": 0.20616257493576934, + "grad_norm": 1.651145577430725, + "learning_rate": 4.493744163219177e-05, + "loss": 5.3694, + "step": 34665 + }, + { + "epoch": 0.20616852221905033, + "grad_norm": 1.7472602128982544, + "learning_rate": 4.493715981787372e-05, + "loss": 4.83, + "step": 34666 + }, + { + "epoch": 0.20617446950233134, + "grad_norm": 1.6628087759017944, + "learning_rate": 4.493687799659583e-05, + "loss": 4.6301, + "step": 34667 + }, + { + "epoch": 0.20618041678561233, + "grad_norm": 1.5734481811523438, + "learning_rate": 4.493659616835816e-05, + "loss": 4.577, + "step": 34668 + }, + { + "epoch": 0.20618636406889332, + "grad_norm": 1.6595523357391357, + "learning_rate": 4.4936314333160844e-05, + "loss": 4.5683, + "step": 34669 + }, + { + "epoch": 0.20619231135217433, + "grad_norm": 1.8388851881027222, + "learning_rate": 4.4936032491003965e-05, + "loss": 4.9936, + "step": 34670 + }, + { + "epoch": 0.20619825863545532, + "grad_norm": 2.1026453971862793, + "learning_rate": 4.493575064188762e-05, + "loss": 5.5576, + "step": 34671 + }, + { + "epoch": 0.2062042059187363, + "grad_norm": 1.7481777667999268, + "learning_rate": 4.4935468785811916e-05, + "loss": 5.2993, + "step": 34672 + }, + { + "epoch": 0.20621015320201733, + "grad_norm": 1.7570959329605103, + "learning_rate": 4.493518692277694e-05, + "loss": 5.2391, + "step": 34673 + }, + { + "epoch": 0.20621610048529831, + "grad_norm": 1.5971003770828247, + "learning_rate": 4.49349050527828e-05, + "loss": 5.2646, + "step": 34674 + }, + { + "epoch": 0.2062220477685793, + "grad_norm": 1.7364708185195923, + "learning_rate": 4.493462317582959e-05, + "loss": 4.7422, + "step": 34675 + }, + { + "epoch": 0.20622799505186032, + "grad_norm": 3.195284128189087, + "learning_rate": 4.493434129191741e-05, + "loss": 3.3892, + "step": 34676 + }, + { + "epoch": 0.2062339423351413, + "grad_norm": 2.5080318450927734, + "learning_rate": 4.493405940104636e-05, + "loss": 4.6867, + "step": 34677 + }, + { + "epoch": 0.2062398896184223, + "grad_norm": 1.9916131496429443, + "learning_rate": 4.4933777503216525e-05, + "loss": 4.8544, + "step": 34678 + }, + { + "epoch": 0.2062458369017033, + "grad_norm": 3.0195977687835693, + "learning_rate": 4.4933495598428024e-05, + "loss": 2.5483, + "step": 34679 + }, + { + "epoch": 0.2062517841849843, + "grad_norm": 2.745537042617798, + "learning_rate": 4.4933213686680944e-05, + "loss": 3.6633, + "step": 34680 + }, + { + "epoch": 0.2062577314682653, + "grad_norm": 2.856989860534668, + "learning_rate": 4.493293176797538e-05, + "loss": 4.8289, + "step": 34681 + }, + { + "epoch": 0.2062636787515463, + "grad_norm": 4.457959175109863, + "learning_rate": 4.4932649842311435e-05, + "loss": 4.8733, + "step": 34682 + }, + { + "epoch": 0.2062696260348273, + "grad_norm": 2.740654230117798, + "learning_rate": 4.493236790968921e-05, + "loss": 4.4471, + "step": 34683 + }, + { + "epoch": 0.20627557331810828, + "grad_norm": 1.9697023630142212, + "learning_rate": 4.493208597010881e-05, + "loss": 4.4508, + "step": 34684 + }, + { + "epoch": 0.2062815206013893, + "grad_norm": 2.9418060779571533, + "learning_rate": 4.493180402357031e-05, + "loss": 4.0835, + "step": 34685 + }, + { + "epoch": 0.20628746788467028, + "grad_norm": 2.65594744682312, + "learning_rate": 4.493152207007383e-05, + "loss": 3.7277, + "step": 34686 + }, + { + "epoch": 0.20629341516795127, + "grad_norm": 2.2085533142089844, + "learning_rate": 4.4931240109619464e-05, + "loss": 3.4425, + "step": 34687 + }, + { + "epoch": 0.2062993624512323, + "grad_norm": 2.824538230895996, + "learning_rate": 4.493095814220731e-05, + "loss": 4.2863, + "step": 34688 + }, + { + "epoch": 0.20630530973451328, + "grad_norm": 1.7613264322280884, + "learning_rate": 4.493067616783746e-05, + "loss": 4.3662, + "step": 34689 + }, + { + "epoch": 0.20631125701779426, + "grad_norm": 2.478560209274292, + "learning_rate": 4.493039418651002e-05, + "loss": 4.4783, + "step": 34690 + }, + { + "epoch": 0.20631720430107528, + "grad_norm": 2.1893646717071533, + "learning_rate": 4.493011219822508e-05, + "loss": 4.6626, + "step": 34691 + }, + { + "epoch": 0.20632315158435627, + "grad_norm": 2.2104086875915527, + "learning_rate": 4.4929830202982745e-05, + "loss": 4.2291, + "step": 34692 + }, + { + "epoch": 0.20632909886763726, + "grad_norm": 2.0803580284118652, + "learning_rate": 4.492954820078312e-05, + "loss": 3.3411, + "step": 34693 + }, + { + "epoch": 0.20633504615091827, + "grad_norm": 2.582167387008667, + "learning_rate": 4.492926619162629e-05, + "loss": 3.3111, + "step": 34694 + }, + { + "epoch": 0.20634099343419926, + "grad_norm": 2.3166258335113525, + "learning_rate": 4.4928984175512354e-05, + "loss": 3.4107, + "step": 34695 + }, + { + "epoch": 0.20634694071748025, + "grad_norm": 2.4472901821136475, + "learning_rate": 4.492870215244142e-05, + "loss": 3.6275, + "step": 34696 + }, + { + "epoch": 0.20635288800076126, + "grad_norm": 2.252182960510254, + "learning_rate": 4.4928420122413584e-05, + "loss": 3.8503, + "step": 34697 + }, + { + "epoch": 0.20635883528404225, + "grad_norm": 1.9681614637374878, + "learning_rate": 4.4928138085428936e-05, + "loss": 3.9637, + "step": 34698 + }, + { + "epoch": 0.20636478256732324, + "grad_norm": 2.356943130493164, + "learning_rate": 4.4927856041487586e-05, + "loss": 3.6805, + "step": 34699 + }, + { + "epoch": 0.20637072985060426, + "grad_norm": 2.424372673034668, + "learning_rate": 4.4927573990589636e-05, + "loss": 3.3821, + "step": 34700 + }, + { + "epoch": 0.20637667713388524, + "grad_norm": 2.569279432296753, + "learning_rate": 4.492729193273516e-05, + "loss": 3.494, + "step": 34701 + }, + { + "epoch": 0.20638262441716623, + "grad_norm": 2.154430627822876, + "learning_rate": 4.492700986792427e-05, + "loss": 3.4451, + "step": 34702 + }, + { + "epoch": 0.20638857170044725, + "grad_norm": 2.0662691593170166, + "learning_rate": 4.4926727796157084e-05, + "loss": 3.5951, + "step": 34703 + }, + { + "epoch": 0.20639451898372824, + "grad_norm": 1.9252958297729492, + "learning_rate": 4.4926445717433674e-05, + "loss": 4.6183, + "step": 34704 + }, + { + "epoch": 0.20640046626700922, + "grad_norm": 1.816887378692627, + "learning_rate": 4.4926163631754146e-05, + "loss": 5.0774, + "step": 34705 + }, + { + "epoch": 0.20640641355029024, + "grad_norm": 1.710744023323059, + "learning_rate": 4.4925881539118606e-05, + "loss": 5.0954, + "step": 34706 + }, + { + "epoch": 0.20641236083357123, + "grad_norm": 1.6318283081054688, + "learning_rate": 4.4925599439527144e-05, + "loss": 4.4587, + "step": 34707 + }, + { + "epoch": 0.20641830811685222, + "grad_norm": 1.6551474332809448, + "learning_rate": 4.4925317332979854e-05, + "loss": 4.2257, + "step": 34708 + }, + { + "epoch": 0.2064242554001332, + "grad_norm": 2.1529228687286377, + "learning_rate": 4.492503521947685e-05, + "loss": 4.0616, + "step": 34709 + }, + { + "epoch": 0.20643020268341422, + "grad_norm": 1.7376173734664917, + "learning_rate": 4.4924753099018225e-05, + "loss": 4.9182, + "step": 34710 + }, + { + "epoch": 0.2064361499666952, + "grad_norm": 1.8715314865112305, + "learning_rate": 4.492447097160407e-05, + "loss": 4.7435, + "step": 34711 + }, + { + "epoch": 0.2064420972499762, + "grad_norm": 1.711469292640686, + "learning_rate": 4.4924188837234483e-05, + "loss": 4.5403, + "step": 34712 + }, + { + "epoch": 0.2064480445332572, + "grad_norm": 1.5449049472808838, + "learning_rate": 4.492390669590957e-05, + "loss": 4.4102, + "step": 34713 + }, + { + "epoch": 0.2064539918165382, + "grad_norm": 1.852977991104126, + "learning_rate": 4.492362454762943e-05, + "loss": 4.394, + "step": 34714 + }, + { + "epoch": 0.2064599390998192, + "grad_norm": 1.880318284034729, + "learning_rate": 4.492334239239416e-05, + "loss": 4.3825, + "step": 34715 + }, + { + "epoch": 0.2064658863831002, + "grad_norm": 1.7306921482086182, + "learning_rate": 4.492306023020385e-05, + "loss": 4.2197, + "step": 34716 + }, + { + "epoch": 0.2064718336663812, + "grad_norm": 1.637911081314087, + "learning_rate": 4.492277806105861e-05, + "loss": 4.3934, + "step": 34717 + }, + { + "epoch": 0.20647778094966218, + "grad_norm": 1.708601713180542, + "learning_rate": 4.492249588495854e-05, + "loss": 4.3054, + "step": 34718 + }, + { + "epoch": 0.2064837282329432, + "grad_norm": 1.9779586791992188, + "learning_rate": 4.492221370190373e-05, + "loss": 4.2863, + "step": 34719 + }, + { + "epoch": 0.20648967551622419, + "grad_norm": 2.324136734008789, + "learning_rate": 4.492193151189427e-05, + "loss": 3.4542, + "step": 34720 + }, + { + "epoch": 0.20649562279950517, + "grad_norm": 2.028463125228882, + "learning_rate": 4.492164931493028e-05, + "loss": 4.1182, + "step": 34721 + }, + { + "epoch": 0.2065015700827862, + "grad_norm": 2.0588998794555664, + "learning_rate": 4.492136711101185e-05, + "loss": 4.8401, + "step": 34722 + }, + { + "epoch": 0.20650751736606718, + "grad_norm": 1.6144108772277832, + "learning_rate": 4.492108490013906e-05, + "loss": 4.666, + "step": 34723 + }, + { + "epoch": 0.20651346464934817, + "grad_norm": 2.0475502014160156, + "learning_rate": 4.4920802682312047e-05, + "loss": 4.8169, + "step": 34724 + }, + { + "epoch": 0.20651941193262918, + "grad_norm": 2.2128946781158447, + "learning_rate": 4.492052045753088e-05, + "loss": 4.6387, + "step": 34725 + }, + { + "epoch": 0.20652535921591017, + "grad_norm": 1.4781862497329712, + "learning_rate": 4.4920238225795654e-05, + "loss": 4.9999, + "step": 34726 + }, + { + "epoch": 0.20653130649919116, + "grad_norm": 1.5465887784957886, + "learning_rate": 4.491995598710649e-05, + "loss": 4.9641, + "step": 34727 + }, + { + "epoch": 0.20653725378247217, + "grad_norm": 1.8786133527755737, + "learning_rate": 4.491967374146347e-05, + "loss": 4.4062, + "step": 34728 + }, + { + "epoch": 0.20654320106575316, + "grad_norm": 1.8625175952911377, + "learning_rate": 4.49193914888667e-05, + "loss": 4.7282, + "step": 34729 + }, + { + "epoch": 0.20654914834903415, + "grad_norm": 1.958048939704895, + "learning_rate": 4.4919109229316274e-05, + "loss": 4.8336, + "step": 34730 + }, + { + "epoch": 0.20655509563231517, + "grad_norm": 1.724219799041748, + "learning_rate": 4.49188269628123e-05, + "loss": 4.3932, + "step": 34731 + }, + { + "epoch": 0.20656104291559615, + "grad_norm": 1.767488718032837, + "learning_rate": 4.491854468935486e-05, + "loss": 4.8139, + "step": 34732 + }, + { + "epoch": 0.20656699019887714, + "grad_norm": 1.734523892402649, + "learning_rate": 4.491826240894407e-05, + "loss": 4.5022, + "step": 34733 + }, + { + "epoch": 0.20657293748215816, + "grad_norm": 1.702898383140564, + "learning_rate": 4.491798012158002e-05, + "loss": 5.2369, + "step": 34734 + }, + { + "epoch": 0.20657888476543915, + "grad_norm": 1.6671706438064575, + "learning_rate": 4.4917697827262795e-05, + "loss": 5.3677, + "step": 34735 + }, + { + "epoch": 0.20658483204872014, + "grad_norm": 1.6979637145996094, + "learning_rate": 4.4917415525992524e-05, + "loss": 5.3411, + "step": 34736 + }, + { + "epoch": 0.20659077933200115, + "grad_norm": 1.7467466592788696, + "learning_rate": 4.491713321776928e-05, + "loss": 4.5008, + "step": 34737 + }, + { + "epoch": 0.20659672661528214, + "grad_norm": 1.5151604413986206, + "learning_rate": 4.491685090259318e-05, + "loss": 5.1486, + "step": 34738 + }, + { + "epoch": 0.20660267389856313, + "grad_norm": 1.8055251836776733, + "learning_rate": 4.49165685804643e-05, + "loss": 4.9527, + "step": 34739 + }, + { + "epoch": 0.20660862118184414, + "grad_norm": 1.7542595863342285, + "learning_rate": 4.4916286251382754e-05, + "loss": 4.9497, + "step": 34740 + }, + { + "epoch": 0.20661456846512513, + "grad_norm": 1.7868531942367554, + "learning_rate": 4.4916003915348645e-05, + "loss": 4.8182, + "step": 34741 + }, + { + "epoch": 0.20662051574840612, + "grad_norm": 1.7146828174591064, + "learning_rate": 4.491572157236206e-05, + "loss": 4.4512, + "step": 34742 + }, + { + "epoch": 0.20662646303168714, + "grad_norm": 1.6494626998901367, + "learning_rate": 4.491543922242311e-05, + "loss": 4.4872, + "step": 34743 + }, + { + "epoch": 0.20663241031496812, + "grad_norm": 1.9803482294082642, + "learning_rate": 4.4915156865531875e-05, + "loss": 4.8061, + "step": 34744 + }, + { + "epoch": 0.2066383575982491, + "grad_norm": 2.0528030395507812, + "learning_rate": 4.4914874501688475e-05, + "loss": 4.4076, + "step": 34745 + }, + { + "epoch": 0.20664430488153013, + "grad_norm": 1.5636694431304932, + "learning_rate": 4.491459213089299e-05, + "loss": 5.0246, + "step": 34746 + }, + { + "epoch": 0.20665025216481112, + "grad_norm": 2.5548834800720215, + "learning_rate": 4.4914309753145534e-05, + "loss": 3.7054, + "step": 34747 + }, + { + "epoch": 0.2066561994480921, + "grad_norm": 2.4566895961761475, + "learning_rate": 4.491402736844619e-05, + "loss": 3.5679, + "step": 34748 + }, + { + "epoch": 0.20666214673137312, + "grad_norm": 1.9277645349502563, + "learning_rate": 4.491374497679507e-05, + "loss": 4.3468, + "step": 34749 + }, + { + "epoch": 0.2066680940146541, + "grad_norm": 1.8425731658935547, + "learning_rate": 4.4913462578192265e-05, + "loss": 4.792, + "step": 34750 + }, + { + "epoch": 0.2066740412979351, + "grad_norm": 1.7215994596481323, + "learning_rate": 4.491318017263788e-05, + "loss": 5.2611, + "step": 34751 + }, + { + "epoch": 0.2066799885812161, + "grad_norm": 1.879885196685791, + "learning_rate": 4.491289776013201e-05, + "loss": 5.0435, + "step": 34752 + }, + { + "epoch": 0.2066859358644971, + "grad_norm": 2.316704511642456, + "learning_rate": 4.491261534067475e-05, + "loss": 4.434, + "step": 34753 + }, + { + "epoch": 0.2066918831477781, + "grad_norm": 2.6675474643707275, + "learning_rate": 4.4912332914266195e-05, + "loss": 4.4805, + "step": 34754 + }, + { + "epoch": 0.2066978304310591, + "grad_norm": 2.7434020042419434, + "learning_rate": 4.4912050480906455e-05, + "loss": 3.8732, + "step": 34755 + }, + { + "epoch": 0.2067037777143401, + "grad_norm": 2.0465853214263916, + "learning_rate": 4.4911768040595624e-05, + "loss": 3.4234, + "step": 34756 + }, + { + "epoch": 0.20670972499762108, + "grad_norm": 2.282705307006836, + "learning_rate": 4.4911485593333804e-05, + "loss": 3.4257, + "step": 34757 + }, + { + "epoch": 0.2067156722809021, + "grad_norm": 2.1085431575775146, + "learning_rate": 4.491120313912109e-05, + "loss": 3.1277, + "step": 34758 + }, + { + "epoch": 0.20672161956418308, + "grad_norm": 2.307992935180664, + "learning_rate": 4.491092067795758e-05, + "loss": 2.9563, + "step": 34759 + }, + { + "epoch": 0.20672756684746407, + "grad_norm": 1.7869884967803955, + "learning_rate": 4.491063820984337e-05, + "loss": 3.4671, + "step": 34760 + }, + { + "epoch": 0.2067335141307451, + "grad_norm": 1.573107361793518, + "learning_rate": 4.4910355734778564e-05, + "loss": 4.6225, + "step": 34761 + }, + { + "epoch": 0.20673946141402608, + "grad_norm": 1.8124967813491821, + "learning_rate": 4.491007325276326e-05, + "loss": 4.8808, + "step": 34762 + }, + { + "epoch": 0.20674540869730706, + "grad_norm": 2.266270875930786, + "learning_rate": 4.4909790763797555e-05, + "loss": 4.1334, + "step": 34763 + }, + { + "epoch": 0.20675135598058808, + "grad_norm": 2.0331921577453613, + "learning_rate": 4.4909508267881545e-05, + "loss": 4.9498, + "step": 34764 + }, + { + "epoch": 0.20675730326386907, + "grad_norm": 1.7160965204238892, + "learning_rate": 4.4909225765015325e-05, + "loss": 4.5748, + "step": 34765 + }, + { + "epoch": 0.20676325054715006, + "grad_norm": 1.5300441980361938, + "learning_rate": 4.490894325519901e-05, + "loss": 4.2806, + "step": 34766 + }, + { + "epoch": 0.20676919783043107, + "grad_norm": 2.390836477279663, + "learning_rate": 4.490866073843269e-05, + "loss": 3.4529, + "step": 34767 + }, + { + "epoch": 0.20677514511371206, + "grad_norm": 1.91972017288208, + "learning_rate": 4.4908378214716454e-05, + "loss": 4.4385, + "step": 34768 + }, + { + "epoch": 0.20678109239699305, + "grad_norm": 1.836112141609192, + "learning_rate": 4.4908095684050416e-05, + "loss": 4.6575, + "step": 34769 + }, + { + "epoch": 0.20678703968027404, + "grad_norm": 1.7108503580093384, + "learning_rate": 4.490781314643466e-05, + "loss": 4.4053, + "step": 34770 + }, + { + "epoch": 0.20679298696355505, + "grad_norm": 1.6383551359176636, + "learning_rate": 4.49075306018693e-05, + "loss": 4.5103, + "step": 34771 + }, + { + "epoch": 0.20679893424683604, + "grad_norm": 1.7861992120742798, + "learning_rate": 4.490724805035442e-05, + "loss": 4.5834, + "step": 34772 + }, + { + "epoch": 0.20680488153011703, + "grad_norm": 1.6550997495651245, + "learning_rate": 4.490696549189014e-05, + "loss": 4.4976, + "step": 34773 + }, + { + "epoch": 0.20681082881339805, + "grad_norm": 1.8998942375183105, + "learning_rate": 4.4906682926476525e-05, + "loss": 4.5288, + "step": 34774 + }, + { + "epoch": 0.20681677609667903, + "grad_norm": 1.920011281967163, + "learning_rate": 4.4906400354113705e-05, + "loss": 4.4106, + "step": 34775 + }, + { + "epoch": 0.20682272337996002, + "grad_norm": 1.5240533351898193, + "learning_rate": 4.490611777480176e-05, + "loss": 4.868, + "step": 34776 + }, + { + "epoch": 0.20682867066324104, + "grad_norm": 1.8516569137573242, + "learning_rate": 4.49058351885408e-05, + "loss": 4.6862, + "step": 34777 + }, + { + "epoch": 0.20683461794652203, + "grad_norm": 1.8184990882873535, + "learning_rate": 4.4905552595330915e-05, + "loss": 4.5043, + "step": 34778 + }, + { + "epoch": 0.20684056522980301, + "grad_norm": 1.9880046844482422, + "learning_rate": 4.490526999517221e-05, + "loss": 4.2611, + "step": 34779 + }, + { + "epoch": 0.20684651251308403, + "grad_norm": 2.5457332134246826, + "learning_rate": 4.490498738806478e-05, + "loss": 4.0233, + "step": 34780 + }, + { + "epoch": 0.20685245979636502, + "grad_norm": 2.4234964847564697, + "learning_rate": 4.490470477400872e-05, + "loss": 3.9144, + "step": 34781 + }, + { + "epoch": 0.206858407079646, + "grad_norm": 2.0977954864501953, + "learning_rate": 4.490442215300413e-05, + "loss": 3.8256, + "step": 34782 + }, + { + "epoch": 0.20686435436292702, + "grad_norm": 2.3387715816497803, + "learning_rate": 4.490413952505113e-05, + "loss": 3.7419, + "step": 34783 + }, + { + "epoch": 0.206870301646208, + "grad_norm": 1.8677074909210205, + "learning_rate": 4.490385689014978e-05, + "loss": 5.0754, + "step": 34784 + }, + { + "epoch": 0.206876248929489, + "grad_norm": 1.5382182598114014, + "learning_rate": 4.490357424830021e-05, + "loss": 5.3484, + "step": 34785 + }, + { + "epoch": 0.20688219621277001, + "grad_norm": 1.6211512088775635, + "learning_rate": 4.4903291599502506e-05, + "loss": 5.2409, + "step": 34786 + }, + { + "epoch": 0.206888143496051, + "grad_norm": 1.8651448488235474, + "learning_rate": 4.4903008943756766e-05, + "loss": 4.7752, + "step": 34787 + }, + { + "epoch": 0.206894090779332, + "grad_norm": 1.579422950744629, + "learning_rate": 4.490272628106309e-05, + "loss": 4.899, + "step": 34788 + }, + { + "epoch": 0.206900038062613, + "grad_norm": 1.7237675189971924, + "learning_rate": 4.490244361142159e-05, + "loss": 4.9186, + "step": 34789 + }, + { + "epoch": 0.206905985345894, + "grad_norm": 2.424854040145874, + "learning_rate": 4.490216093483234e-05, + "loss": 3.9027, + "step": 34790 + }, + { + "epoch": 0.20691193262917498, + "grad_norm": 1.656636357307434, + "learning_rate": 4.490187825129546e-05, + "loss": 4.4577, + "step": 34791 + }, + { + "epoch": 0.206917879912456, + "grad_norm": 2.7975332736968994, + "learning_rate": 4.490159556081103e-05, + "loss": 4.2677, + "step": 34792 + }, + { + "epoch": 0.206923827195737, + "grad_norm": 2.6634609699249268, + "learning_rate": 4.490131286337916e-05, + "loss": 3.5967, + "step": 34793 + }, + { + "epoch": 0.20692977447901797, + "grad_norm": 2.820051431655884, + "learning_rate": 4.4901030158999954e-05, + "loss": 3.791, + "step": 34794 + }, + { + "epoch": 0.206935721762299, + "grad_norm": 1.9154092073440552, + "learning_rate": 4.4900747447673505e-05, + "loss": 4.2114, + "step": 34795 + }, + { + "epoch": 0.20694166904557998, + "grad_norm": 1.6924352645874023, + "learning_rate": 4.490046472939991e-05, + "loss": 5.4559, + "step": 34796 + }, + { + "epoch": 0.20694761632886097, + "grad_norm": 2.0808238983154297, + "learning_rate": 4.490018200417926e-05, + "loss": 4.0044, + "step": 34797 + }, + { + "epoch": 0.20695356361214198, + "grad_norm": 3.8569533824920654, + "learning_rate": 4.489989927201167e-05, + "loss": 3.6529, + "step": 34798 + }, + { + "epoch": 0.20695951089542297, + "grad_norm": 2.5783863067626953, + "learning_rate": 4.489961653289723e-05, + "loss": 3.3469, + "step": 34799 + }, + { + "epoch": 0.20696545817870396, + "grad_norm": 2.322880268096924, + "learning_rate": 4.4899333786836026e-05, + "loss": 3.5252, + "step": 34800 + }, + { + "epoch": 0.20697140546198498, + "grad_norm": 1.4952900409698486, + "learning_rate": 4.489905103382819e-05, + "loss": 4.9481, + "step": 34801 + }, + { + "epoch": 0.20697735274526596, + "grad_norm": 1.5042228698730469, + "learning_rate": 4.4898768273873796e-05, + "loss": 5.0064, + "step": 34802 + }, + { + "epoch": 0.20698330002854695, + "grad_norm": 2.2395477294921875, + "learning_rate": 4.4898485506972945e-05, + "loss": 4.0019, + "step": 34803 + }, + { + "epoch": 0.20698924731182797, + "grad_norm": 2.6849710941314697, + "learning_rate": 4.489820273312573e-05, + "loss": 3.6374, + "step": 34804 + }, + { + "epoch": 0.20699519459510896, + "grad_norm": 2.534201145172119, + "learning_rate": 4.489791995233227e-05, + "loss": 3.7995, + "step": 34805 + }, + { + "epoch": 0.20700114187838994, + "grad_norm": 2.291923761367798, + "learning_rate": 4.489763716459264e-05, + "loss": 3.876, + "step": 34806 + }, + { + "epoch": 0.20700708916167096, + "grad_norm": 2.2157461643218994, + "learning_rate": 4.489735436990696e-05, + "loss": 4.0497, + "step": 34807 + }, + { + "epoch": 0.20701303644495195, + "grad_norm": 2.394935369491577, + "learning_rate": 4.489707156827532e-05, + "loss": 3.4041, + "step": 34808 + }, + { + "epoch": 0.20701898372823294, + "grad_norm": 2.634643077850342, + "learning_rate": 4.4896788759697813e-05, + "loss": 3.4985, + "step": 34809 + }, + { + "epoch": 0.20702493101151395, + "grad_norm": 2.609468460083008, + "learning_rate": 4.489650594417454e-05, + "loss": 3.2843, + "step": 34810 + }, + { + "epoch": 0.20703087829479494, + "grad_norm": 2.5767226219177246, + "learning_rate": 4.4896223121705606e-05, + "loss": 3.3413, + "step": 34811 + }, + { + "epoch": 0.20703682557807593, + "grad_norm": 2.39313006401062, + "learning_rate": 4.489594029229111e-05, + "loss": 3.358, + "step": 34812 + }, + { + "epoch": 0.20704277286135694, + "grad_norm": 2.763227701187134, + "learning_rate": 4.489565745593114e-05, + "loss": 3.6319, + "step": 34813 + }, + { + "epoch": 0.20704872014463793, + "grad_norm": 1.9068472385406494, + "learning_rate": 4.489537461262581e-05, + "loss": 4.35, + "step": 34814 + }, + { + "epoch": 0.20705466742791892, + "grad_norm": 2.774386405944824, + "learning_rate": 4.48950917623752e-05, + "loss": 4.5764, + "step": 34815 + }, + { + "epoch": 0.20706061471119994, + "grad_norm": 2.7725729942321777, + "learning_rate": 4.4894808905179426e-05, + "loss": 4.8665, + "step": 34816 + }, + { + "epoch": 0.20706656199448092, + "grad_norm": 1.7243051528930664, + "learning_rate": 4.4894526041038577e-05, + "loss": 4.5846, + "step": 34817 + }, + { + "epoch": 0.2070725092777619, + "grad_norm": 2.355294704437256, + "learning_rate": 4.4894243169952755e-05, + "loss": 3.4419, + "step": 34818 + }, + { + "epoch": 0.20707845656104293, + "grad_norm": 2.7653069496154785, + "learning_rate": 4.489396029192206e-05, + "loss": 3.8239, + "step": 34819 + }, + { + "epoch": 0.20708440384432392, + "grad_norm": 2.699720621109009, + "learning_rate": 4.489367740694659e-05, + "loss": 4.3421, + "step": 34820 + }, + { + "epoch": 0.2070903511276049, + "grad_norm": 2.5409398078918457, + "learning_rate": 4.489339451502644e-05, + "loss": 4.4411, + "step": 34821 + }, + { + "epoch": 0.20709629841088592, + "grad_norm": 2.486370801925659, + "learning_rate": 4.489311161616171e-05, + "loss": 4.1267, + "step": 34822 + }, + { + "epoch": 0.2071022456941669, + "grad_norm": 1.9662883281707764, + "learning_rate": 4.48928287103525e-05, + "loss": 4.0986, + "step": 34823 + }, + { + "epoch": 0.2071081929774479, + "grad_norm": 1.8960779905319214, + "learning_rate": 4.489254579759891e-05, + "loss": 4.8072, + "step": 34824 + }, + { + "epoch": 0.2071141402607289, + "grad_norm": 1.8817890882492065, + "learning_rate": 4.4892262877901044e-05, + "loss": 4.5285, + "step": 34825 + }, + { + "epoch": 0.2071200875440099, + "grad_norm": 2.148820400238037, + "learning_rate": 4.489197995125899e-05, + "loss": 4.5258, + "step": 34826 + }, + { + "epoch": 0.2071260348272909, + "grad_norm": 2.0745046138763428, + "learning_rate": 4.489169701767285e-05, + "loss": 4.6216, + "step": 34827 + }, + { + "epoch": 0.20713198211057188, + "grad_norm": 1.9720550775527954, + "learning_rate": 4.4891414077142726e-05, + "loss": 4.3668, + "step": 34828 + }, + { + "epoch": 0.2071379293938529, + "grad_norm": 2.2304906845092773, + "learning_rate": 4.489113112966871e-05, + "loss": 4.5013, + "step": 34829 + }, + { + "epoch": 0.20714387667713388, + "grad_norm": 2.174670934677124, + "learning_rate": 4.489084817525091e-05, + "loss": 4.5277, + "step": 34830 + }, + { + "epoch": 0.20714982396041487, + "grad_norm": 2.458003044128418, + "learning_rate": 4.489056521388942e-05, + "loss": 4.4647, + "step": 34831 + }, + { + "epoch": 0.20715577124369589, + "grad_norm": 2.281400203704834, + "learning_rate": 4.489028224558434e-05, + "loss": 4.5083, + "step": 34832 + }, + { + "epoch": 0.20716171852697687, + "grad_norm": 2.4862747192382812, + "learning_rate": 4.4889999270335765e-05, + "loss": 4.7163, + "step": 34833 + }, + { + "epoch": 0.20716766581025786, + "grad_norm": 2.276209592819214, + "learning_rate": 4.48897162881438e-05, + "loss": 4.8147, + "step": 34834 + }, + { + "epoch": 0.20717361309353888, + "grad_norm": 2.0201053619384766, + "learning_rate": 4.488943329900854e-05, + "loss": 4.5599, + "step": 34835 + }, + { + "epoch": 0.20717956037681987, + "grad_norm": 2.284170389175415, + "learning_rate": 4.4889150302930085e-05, + "loss": 4.5729, + "step": 34836 + }, + { + "epoch": 0.20718550766010085, + "grad_norm": 1.691230297088623, + "learning_rate": 4.488886729990853e-05, + "loss": 4.3631, + "step": 34837 + }, + { + "epoch": 0.20719145494338187, + "grad_norm": 2.024777412414551, + "learning_rate": 4.488858428994398e-05, + "loss": 4.3278, + "step": 34838 + }, + { + "epoch": 0.20719740222666286, + "grad_norm": 2.0853986740112305, + "learning_rate": 4.488830127303653e-05, + "loss": 4.533, + "step": 34839 + }, + { + "epoch": 0.20720334950994385, + "grad_norm": 2.2168142795562744, + "learning_rate": 4.488801824918627e-05, + "loss": 4.5766, + "step": 34840 + }, + { + "epoch": 0.20720929679322486, + "grad_norm": 2.369561195373535, + "learning_rate": 4.488773521839332e-05, + "loss": 4.4252, + "step": 34841 + }, + { + "epoch": 0.20721524407650585, + "grad_norm": 1.9899331331253052, + "learning_rate": 4.4887452180657764e-05, + "loss": 4.5878, + "step": 34842 + }, + { + "epoch": 0.20722119135978684, + "grad_norm": 1.9128245115280151, + "learning_rate": 4.48871691359797e-05, + "loss": 4.5696, + "step": 34843 + }, + { + "epoch": 0.20722713864306785, + "grad_norm": 1.8677480220794678, + "learning_rate": 4.488688608435924e-05, + "loss": 4.4652, + "step": 34844 + }, + { + "epoch": 0.20723308592634884, + "grad_norm": 2.1576309204101562, + "learning_rate": 4.488660302579647e-05, + "loss": 4.2685, + "step": 34845 + }, + { + "epoch": 0.20723903320962983, + "grad_norm": 1.9897032976150513, + "learning_rate": 4.488631996029149e-05, + "loss": 4.3295, + "step": 34846 + }, + { + "epoch": 0.20724498049291085, + "grad_norm": 1.9403741359710693, + "learning_rate": 4.488603688784439e-05, + "loss": 4.4789, + "step": 34847 + }, + { + "epoch": 0.20725092777619183, + "grad_norm": 2.024747610092163, + "learning_rate": 4.48857538084553e-05, + "loss": 4.4787, + "step": 34848 + }, + { + "epoch": 0.20725687505947282, + "grad_norm": 1.6366159915924072, + "learning_rate": 4.488547072212429e-05, + "loss": 4.6898, + "step": 34849 + }, + { + "epoch": 0.20726282234275384, + "grad_norm": 2.3541810512542725, + "learning_rate": 4.488518762885147e-05, + "loss": 3.8412, + "step": 34850 + }, + { + "epoch": 0.20726876962603483, + "grad_norm": 2.3278443813323975, + "learning_rate": 4.4884904528636934e-05, + "loss": 4.4837, + "step": 34851 + }, + { + "epoch": 0.20727471690931581, + "grad_norm": 2.1795244216918945, + "learning_rate": 4.488462142148078e-05, + "loss": 4.4418, + "step": 34852 + }, + { + "epoch": 0.20728066419259683, + "grad_norm": 2.204218626022339, + "learning_rate": 4.488433830738312e-05, + "loss": 4.4315, + "step": 34853 + }, + { + "epoch": 0.20728661147587782, + "grad_norm": 2.254626750946045, + "learning_rate": 4.488405518634403e-05, + "loss": 4.6286, + "step": 34854 + }, + { + "epoch": 0.2072925587591588, + "grad_norm": 2.035433530807495, + "learning_rate": 4.4883772058363635e-05, + "loss": 4.6472, + "step": 34855 + }, + { + "epoch": 0.20729850604243982, + "grad_norm": 1.5613362789154053, + "learning_rate": 4.4883488923442006e-05, + "loss": 5.4295, + "step": 34856 + }, + { + "epoch": 0.2073044533257208, + "grad_norm": 2.2521488666534424, + "learning_rate": 4.488320578157927e-05, + "loss": 4.5495, + "step": 34857 + }, + { + "epoch": 0.2073104006090018, + "grad_norm": 2.360024929046631, + "learning_rate": 4.4882922632775506e-05, + "loss": 4.0133, + "step": 34858 + }, + { + "epoch": 0.20731634789228282, + "grad_norm": 2.220082998275757, + "learning_rate": 4.488263947703082e-05, + "loss": 4.4248, + "step": 34859 + }, + { + "epoch": 0.2073222951755638, + "grad_norm": 2.175050735473633, + "learning_rate": 4.4882356314345306e-05, + "loss": 4.6056, + "step": 34860 + }, + { + "epoch": 0.2073282424588448, + "grad_norm": 2.203740358352661, + "learning_rate": 4.488207314471907e-05, + "loss": 4.8203, + "step": 34861 + }, + { + "epoch": 0.2073341897421258, + "grad_norm": 1.7499996423721313, + "learning_rate": 4.488178996815221e-05, + "loss": 4.5105, + "step": 34862 + }, + { + "epoch": 0.2073401370254068, + "grad_norm": 1.7292070388793945, + "learning_rate": 4.488150678464482e-05, + "loss": 4.2345, + "step": 34863 + }, + { + "epoch": 0.20734608430868778, + "grad_norm": 1.886146903038025, + "learning_rate": 4.488122359419701e-05, + "loss": 4.2784, + "step": 34864 + }, + { + "epoch": 0.2073520315919688, + "grad_norm": 1.5068321228027344, + "learning_rate": 4.4880940396808856e-05, + "loss": 5.0306, + "step": 34865 + }, + { + "epoch": 0.2073579788752498, + "grad_norm": 1.4977796077728271, + "learning_rate": 4.488065719248048e-05, + "loss": 4.4446, + "step": 34866 + }, + { + "epoch": 0.20736392615853078, + "grad_norm": 1.4082682132720947, + "learning_rate": 4.488037398121197e-05, + "loss": 4.9294, + "step": 34867 + }, + { + "epoch": 0.2073698734418118, + "grad_norm": 1.662846565246582, + "learning_rate": 4.488009076300343e-05, + "loss": 5.0529, + "step": 34868 + }, + { + "epoch": 0.20737582072509278, + "grad_norm": 1.5319976806640625, + "learning_rate": 4.487980753785495e-05, + "loss": 5.0529, + "step": 34869 + }, + { + "epoch": 0.20738176800837377, + "grad_norm": 1.417098879814148, + "learning_rate": 4.487952430576664e-05, + "loss": 4.9239, + "step": 34870 + }, + { + "epoch": 0.20738771529165478, + "grad_norm": 1.549066424369812, + "learning_rate": 4.487924106673859e-05, + "loss": 5.0236, + "step": 34871 + }, + { + "epoch": 0.20739366257493577, + "grad_norm": 1.6567977666854858, + "learning_rate": 4.48789578207709e-05, + "loss": 4.5281, + "step": 34872 + }, + { + "epoch": 0.20739960985821676, + "grad_norm": 1.597029209136963, + "learning_rate": 4.487867456786367e-05, + "loss": 4.7681, + "step": 34873 + }, + { + "epoch": 0.20740555714149778, + "grad_norm": 1.635974645614624, + "learning_rate": 4.487839130801701e-05, + "loss": 4.8318, + "step": 34874 + }, + { + "epoch": 0.20741150442477876, + "grad_norm": 1.685660481452942, + "learning_rate": 4.4878108041231e-05, + "loss": 4.9574, + "step": 34875 + }, + { + "epoch": 0.20741745170805975, + "grad_norm": 1.482374668121338, + "learning_rate": 4.487782476750575e-05, + "loss": 4.8813, + "step": 34876 + }, + { + "epoch": 0.20742339899134077, + "grad_norm": 1.5559237003326416, + "learning_rate": 4.487754148684136e-05, + "loss": 4.9915, + "step": 34877 + }, + { + "epoch": 0.20742934627462176, + "grad_norm": 1.431868314743042, + "learning_rate": 4.487725819923792e-05, + "loss": 4.4923, + "step": 34878 + }, + { + "epoch": 0.20743529355790274, + "grad_norm": 1.550361156463623, + "learning_rate": 4.4876974904695535e-05, + "loss": 4.9491, + "step": 34879 + }, + { + "epoch": 0.20744124084118376, + "grad_norm": 1.580848217010498, + "learning_rate": 4.487669160321431e-05, + "loss": 4.7541, + "step": 34880 + }, + { + "epoch": 0.20744718812446475, + "grad_norm": 1.7145969867706299, + "learning_rate": 4.487640829479433e-05, + "loss": 5.0173, + "step": 34881 + }, + { + "epoch": 0.20745313540774574, + "grad_norm": 1.5619465112686157, + "learning_rate": 4.4876124979435704e-05, + "loss": 4.7407, + "step": 34882 + }, + { + "epoch": 0.20745908269102675, + "grad_norm": 1.6751627922058105, + "learning_rate": 4.487584165713853e-05, + "loss": 4.4732, + "step": 34883 + }, + { + "epoch": 0.20746502997430774, + "grad_norm": 1.599061131477356, + "learning_rate": 4.4875558327902906e-05, + "loss": 4.6993, + "step": 34884 + }, + { + "epoch": 0.20747097725758873, + "grad_norm": 1.4041860103607178, + "learning_rate": 4.4875274991728925e-05, + "loss": 4.6427, + "step": 34885 + }, + { + "epoch": 0.20747692454086972, + "grad_norm": 1.539746642112732, + "learning_rate": 4.4874991648616694e-05, + "loss": 4.781, + "step": 34886 + }, + { + "epoch": 0.20748287182415073, + "grad_norm": 1.5050103664398193, + "learning_rate": 4.487470829856631e-05, + "loss": 4.7492, + "step": 34887 + }, + { + "epoch": 0.20748881910743172, + "grad_norm": 1.6688284873962402, + "learning_rate": 4.487442494157786e-05, + "loss": 4.3758, + "step": 34888 + }, + { + "epoch": 0.2074947663907127, + "grad_norm": 1.5119291543960571, + "learning_rate": 4.4874141577651476e-05, + "loss": 4.9482, + "step": 34889 + }, + { + "epoch": 0.20750071367399373, + "grad_norm": 1.5905176401138306, + "learning_rate": 4.487385820678722e-05, + "loss": 5.2721, + "step": 34890 + }, + { + "epoch": 0.2075066609572747, + "grad_norm": 1.6275291442871094, + "learning_rate": 4.48735748289852e-05, + "loss": 4.5295, + "step": 34891 + }, + { + "epoch": 0.2075126082405557, + "grad_norm": 1.5032380819320679, + "learning_rate": 4.487329144424552e-05, + "loss": 4.8084, + "step": 34892 + }, + { + "epoch": 0.20751855552383672, + "grad_norm": 1.4824553728103638, + "learning_rate": 4.48730080525683e-05, + "loss": 4.772, + "step": 34893 + }, + { + "epoch": 0.2075245028071177, + "grad_norm": 1.5292681455612183, + "learning_rate": 4.48727246539536e-05, + "loss": 4.6488, + "step": 34894 + }, + { + "epoch": 0.2075304500903987, + "grad_norm": 1.4371155500411987, + "learning_rate": 4.487244124840154e-05, + "loss": 4.9293, + "step": 34895 + }, + { + "epoch": 0.2075363973736797, + "grad_norm": 1.7150744199752808, + "learning_rate": 4.487215783591222e-05, + "loss": 4.8491, + "step": 34896 + }, + { + "epoch": 0.2075423446569607, + "grad_norm": 1.7894489765167236, + "learning_rate": 4.487187441648573e-05, + "loss": 4.2539, + "step": 34897 + }, + { + "epoch": 0.20754829194024169, + "grad_norm": 2.7374889850616455, + "learning_rate": 4.4871590990122174e-05, + "loss": 3.205, + "step": 34898 + }, + { + "epoch": 0.2075542392235227, + "grad_norm": 3.2529096603393555, + "learning_rate": 4.487130755682165e-05, + "loss": 2.6124, + "step": 34899 + }, + { + "epoch": 0.2075601865068037, + "grad_norm": 1.6190886497497559, + "learning_rate": 4.487102411658426e-05, + "loss": 4.8742, + "step": 34900 + }, + { + "epoch": 0.20756613379008468, + "grad_norm": 1.7997056245803833, + "learning_rate": 4.4870740669410104e-05, + "loss": 4.9073, + "step": 34901 + }, + { + "epoch": 0.2075720810733657, + "grad_norm": 1.6690300703048706, + "learning_rate": 4.4870457215299274e-05, + "loss": 4.5691, + "step": 34902 + }, + { + "epoch": 0.20757802835664668, + "grad_norm": 1.5387898683547974, + "learning_rate": 4.4870173754251874e-05, + "loss": 4.5575, + "step": 34903 + }, + { + "epoch": 0.20758397563992767, + "grad_norm": 1.6400445699691772, + "learning_rate": 4.4869890286268006e-05, + "loss": 4.7597, + "step": 34904 + }, + { + "epoch": 0.2075899229232087, + "grad_norm": 1.5093486309051514, + "learning_rate": 4.4869606811347766e-05, + "loss": 5.3104, + "step": 34905 + }, + { + "epoch": 0.20759587020648967, + "grad_norm": 1.6980483531951904, + "learning_rate": 4.486932332949124e-05, + "loss": 4.5805, + "step": 34906 + }, + { + "epoch": 0.20760181748977066, + "grad_norm": 1.4920854568481445, + "learning_rate": 4.4869039840698544e-05, + "loss": 5.2039, + "step": 34907 + }, + { + "epoch": 0.20760776477305168, + "grad_norm": 1.7514317035675049, + "learning_rate": 4.486875634496977e-05, + "loss": 4.5796, + "step": 34908 + }, + { + "epoch": 0.20761371205633267, + "grad_norm": 1.739810585975647, + "learning_rate": 4.486847284230502e-05, + "loss": 4.6184, + "step": 34909 + }, + { + "epoch": 0.20761965933961365, + "grad_norm": 1.4580451250076294, + "learning_rate": 4.486818933270439e-05, + "loss": 4.8263, + "step": 34910 + }, + { + "epoch": 0.20762560662289467, + "grad_norm": 1.5463128089904785, + "learning_rate": 4.4867905816167986e-05, + "loss": 4.5738, + "step": 34911 + }, + { + "epoch": 0.20763155390617566, + "grad_norm": 1.5785971879959106, + "learning_rate": 4.48676222926959e-05, + "loss": 4.7776, + "step": 34912 + }, + { + "epoch": 0.20763750118945665, + "grad_norm": 2.756115674972534, + "learning_rate": 4.486733876228823e-05, + "loss": 4.1323, + "step": 34913 + }, + { + "epoch": 0.20764344847273766, + "grad_norm": 2.10459041595459, + "learning_rate": 4.4867055224945076e-05, + "loss": 4.9213, + "step": 34914 + }, + { + "epoch": 0.20764939575601865, + "grad_norm": 2.2832987308502197, + "learning_rate": 4.4866771680666545e-05, + "loss": 4.8089, + "step": 34915 + }, + { + "epoch": 0.20765534303929964, + "grad_norm": 1.6514110565185547, + "learning_rate": 4.4866488129452725e-05, + "loss": 5.107, + "step": 34916 + }, + { + "epoch": 0.20766129032258066, + "grad_norm": 1.623475193977356, + "learning_rate": 4.486620457130371e-05, + "loss": 4.46, + "step": 34917 + }, + { + "epoch": 0.20766723760586164, + "grad_norm": 1.4691333770751953, + "learning_rate": 4.486592100621961e-05, + "loss": 4.7703, + "step": 34918 + }, + { + "epoch": 0.20767318488914263, + "grad_norm": 1.6401385068893433, + "learning_rate": 4.4865637434200535e-05, + "loss": 4.7759, + "step": 34919 + }, + { + "epoch": 0.20767913217242365, + "grad_norm": 1.4848181009292603, + "learning_rate": 4.486535385524656e-05, + "loss": 5.0626, + "step": 34920 + }, + { + "epoch": 0.20768507945570464, + "grad_norm": 1.5136383771896362, + "learning_rate": 4.48650702693578e-05, + "loss": 5.0347, + "step": 34921 + }, + { + "epoch": 0.20769102673898562, + "grad_norm": 1.4435847997665405, + "learning_rate": 4.4864786676534356e-05, + "loss": 5.1182, + "step": 34922 + }, + { + "epoch": 0.20769697402226664, + "grad_norm": 1.5047261714935303, + "learning_rate": 4.486450307677631e-05, + "loss": 5.6022, + "step": 34923 + }, + { + "epoch": 0.20770292130554763, + "grad_norm": 1.491112232208252, + "learning_rate": 4.486421947008378e-05, + "loss": 5.312, + "step": 34924 + }, + { + "epoch": 0.20770886858882862, + "grad_norm": 1.816465139389038, + "learning_rate": 4.4863935856456844e-05, + "loss": 5.0285, + "step": 34925 + }, + { + "epoch": 0.20771481587210963, + "grad_norm": 1.6388663053512573, + "learning_rate": 4.4863652235895624e-05, + "loss": 4.9933, + "step": 34926 + }, + { + "epoch": 0.20772076315539062, + "grad_norm": 1.7427036762237549, + "learning_rate": 4.4863368608400206e-05, + "loss": 4.7291, + "step": 34927 + }, + { + "epoch": 0.2077267104386716, + "grad_norm": 1.8048992156982422, + "learning_rate": 4.4863084973970684e-05, + "loss": 4.4709, + "step": 34928 + }, + { + "epoch": 0.20773265772195262, + "grad_norm": 1.848111629486084, + "learning_rate": 4.4862801332607175e-05, + "loss": 5.4746, + "step": 34929 + }, + { + "epoch": 0.2077386050052336, + "grad_norm": 1.9766002893447876, + "learning_rate": 4.486251768430976e-05, + "loss": 4.6229, + "step": 34930 + }, + { + "epoch": 0.2077445522885146, + "grad_norm": 2.060670852661133, + "learning_rate": 4.4862234029078545e-05, + "loss": 5.1253, + "step": 34931 + }, + { + "epoch": 0.20775049957179562, + "grad_norm": 1.8634072542190552, + "learning_rate": 4.486195036691363e-05, + "loss": 4.9958, + "step": 34932 + }, + { + "epoch": 0.2077564468550766, + "grad_norm": 1.8860241174697876, + "learning_rate": 4.4861666697815115e-05, + "loss": 5.2148, + "step": 34933 + }, + { + "epoch": 0.2077623941383576, + "grad_norm": 1.6811100244522095, + "learning_rate": 4.4861383021783096e-05, + "loss": 4.9268, + "step": 34934 + }, + { + "epoch": 0.2077683414216386, + "grad_norm": 2.6467933654785156, + "learning_rate": 4.486109933881767e-05, + "loss": 4.2397, + "step": 34935 + }, + { + "epoch": 0.2077742887049196, + "grad_norm": 2.065701484680176, + "learning_rate": 4.486081564891895e-05, + "loss": 4.3548, + "step": 34936 + }, + { + "epoch": 0.20778023598820058, + "grad_norm": 1.5673136711120605, + "learning_rate": 4.4860531952087015e-05, + "loss": 5.1111, + "step": 34937 + }, + { + "epoch": 0.2077861832714816, + "grad_norm": 1.7255089282989502, + "learning_rate": 4.486024824832198e-05, + "loss": 5.2419, + "step": 34938 + }, + { + "epoch": 0.2077921305547626, + "grad_norm": 1.5966598987579346, + "learning_rate": 4.485996453762393e-05, + "loss": 5.3367, + "step": 34939 + }, + { + "epoch": 0.20779807783804358, + "grad_norm": 1.687177062034607, + "learning_rate": 4.485968081999298e-05, + "loss": 4.8, + "step": 34940 + }, + { + "epoch": 0.2078040251213246, + "grad_norm": 1.4911394119262695, + "learning_rate": 4.485939709542921e-05, + "loss": 4.6558, + "step": 34941 + }, + { + "epoch": 0.20780997240460558, + "grad_norm": 1.705232858657837, + "learning_rate": 4.4859113363932735e-05, + "loss": 4.5196, + "step": 34942 + }, + { + "epoch": 0.20781591968788657, + "grad_norm": 2.290523052215576, + "learning_rate": 4.4858829625503654e-05, + "loss": 4.8916, + "step": 34943 + }, + { + "epoch": 0.20782186697116756, + "grad_norm": 1.3084735870361328, + "learning_rate": 4.4858545880142056e-05, + "loss": 4.8922, + "step": 34944 + }, + { + "epoch": 0.20782781425444857, + "grad_norm": 1.6424446105957031, + "learning_rate": 4.485826212784805e-05, + "loss": 4.4804, + "step": 34945 + }, + { + "epoch": 0.20783376153772956, + "grad_norm": 1.5474185943603516, + "learning_rate": 4.485797836862172e-05, + "loss": 4.7975, + "step": 34946 + }, + { + "epoch": 0.20783970882101055, + "grad_norm": 1.5839738845825195, + "learning_rate": 4.485769460246319e-05, + "loss": 4.6328, + "step": 34947 + }, + { + "epoch": 0.20784565610429157, + "grad_norm": 1.7342357635498047, + "learning_rate": 4.485741082937252e-05, + "loss": 4.2369, + "step": 34948 + }, + { + "epoch": 0.20785160338757255, + "grad_norm": 1.6092901229858398, + "learning_rate": 4.4857127049349856e-05, + "loss": 4.4599, + "step": 34949 + }, + { + "epoch": 0.20785755067085354, + "grad_norm": 1.6004117727279663, + "learning_rate": 4.485684326239527e-05, + "loss": 5.1366, + "step": 34950 + }, + { + "epoch": 0.20786349795413456, + "grad_norm": 1.75442373752594, + "learning_rate": 4.485655946850885e-05, + "loss": 4.1551, + "step": 34951 + }, + { + "epoch": 0.20786944523741555, + "grad_norm": 1.4843125343322754, + "learning_rate": 4.485627566769073e-05, + "loss": 5.0695, + "step": 34952 + }, + { + "epoch": 0.20787539252069653, + "grad_norm": 1.3438167572021484, + "learning_rate": 4.485599185994097e-05, + "loss": 4.9293, + "step": 34953 + }, + { + "epoch": 0.20788133980397755, + "grad_norm": 1.9467015266418457, + "learning_rate": 4.48557080452597e-05, + "loss": 4.5889, + "step": 34954 + }, + { + "epoch": 0.20788728708725854, + "grad_norm": 1.8029512166976929, + "learning_rate": 4.4855424223647005e-05, + "loss": 4.9904, + "step": 34955 + }, + { + "epoch": 0.20789323437053953, + "grad_norm": 1.4334784746170044, + "learning_rate": 4.485514039510299e-05, + "loss": 4.3459, + "step": 34956 + }, + { + "epoch": 0.20789918165382054, + "grad_norm": 1.6164671182632446, + "learning_rate": 4.4854856559627746e-05, + "loss": 5.3017, + "step": 34957 + }, + { + "epoch": 0.20790512893710153, + "grad_norm": 1.6465972661972046, + "learning_rate": 4.485457271722138e-05, + "loss": 4.353, + "step": 34958 + }, + { + "epoch": 0.20791107622038252, + "grad_norm": 1.5343241691589355, + "learning_rate": 4.485428886788399e-05, + "loss": 5.112, + "step": 34959 + }, + { + "epoch": 0.20791702350366353, + "grad_norm": 1.5733743906021118, + "learning_rate": 4.485400501161566e-05, + "loss": 5.0707, + "step": 34960 + }, + { + "epoch": 0.20792297078694452, + "grad_norm": 1.1540693044662476, + "learning_rate": 4.4853721148416515e-05, + "loss": 5.0026, + "step": 34961 + }, + { + "epoch": 0.2079289180702255, + "grad_norm": 1.4453891515731812, + "learning_rate": 4.485343727828664e-05, + "loss": 4.2512, + "step": 34962 + }, + { + "epoch": 0.20793486535350653, + "grad_norm": 1.5326753854751587, + "learning_rate": 4.4853153401226135e-05, + "loss": 4.804, + "step": 34963 + }, + { + "epoch": 0.20794081263678751, + "grad_norm": 1.5941990613937378, + "learning_rate": 4.48528695172351e-05, + "loss": 5.1824, + "step": 34964 + }, + { + "epoch": 0.2079467599200685, + "grad_norm": 1.634143590927124, + "learning_rate": 4.485258562631363e-05, + "loss": 4.6857, + "step": 34965 + }, + { + "epoch": 0.20795270720334952, + "grad_norm": 1.5137478113174438, + "learning_rate": 4.485230172846182e-05, + "loss": 5.3183, + "step": 34966 + }, + { + "epoch": 0.2079586544866305, + "grad_norm": 1.673877477645874, + "learning_rate": 4.4852017823679785e-05, + "loss": 4.7819, + "step": 34967 + }, + { + "epoch": 0.2079646017699115, + "grad_norm": 1.770723581314087, + "learning_rate": 4.485173391196761e-05, + "loss": 4.0521, + "step": 34968 + }, + { + "epoch": 0.2079705490531925, + "grad_norm": 1.598290205001831, + "learning_rate": 4.485144999332541e-05, + "loss": 5.344, + "step": 34969 + }, + { + "epoch": 0.2079764963364735, + "grad_norm": 1.3610836267471313, + "learning_rate": 4.4851166067753266e-05, + "loss": 5.0096, + "step": 34970 + }, + { + "epoch": 0.2079824436197545, + "grad_norm": 1.4452751874923706, + "learning_rate": 4.485088213525129e-05, + "loss": 4.2761, + "step": 34971 + }, + { + "epoch": 0.2079883909030355, + "grad_norm": 1.6192525625228882, + "learning_rate": 4.485059819581957e-05, + "loss": 5.2101, + "step": 34972 + }, + { + "epoch": 0.2079943381863165, + "grad_norm": 1.6380634307861328, + "learning_rate": 4.4850314249458215e-05, + "loss": 5.1043, + "step": 34973 + }, + { + "epoch": 0.20800028546959748, + "grad_norm": 1.61093008518219, + "learning_rate": 4.485003029616732e-05, + "loss": 5.0412, + "step": 34974 + }, + { + "epoch": 0.2080062327528785, + "grad_norm": 2.0800046920776367, + "learning_rate": 4.4849746335946986e-05, + "loss": 4.2104, + "step": 34975 + }, + { + "epoch": 0.20801218003615948, + "grad_norm": 1.797250747680664, + "learning_rate": 4.484946236879731e-05, + "loss": 4.6008, + "step": 34976 + }, + { + "epoch": 0.20801812731944047, + "grad_norm": 1.5170632600784302, + "learning_rate": 4.4849178394718394e-05, + "loss": 5.3258, + "step": 34977 + }, + { + "epoch": 0.2080240746027215, + "grad_norm": 1.777762532234192, + "learning_rate": 4.4848894413710326e-05, + "loss": 4.4505, + "step": 34978 + }, + { + "epoch": 0.20803002188600248, + "grad_norm": 1.9124006032943726, + "learning_rate": 4.4848610425773224e-05, + "loss": 4.5747, + "step": 34979 + }, + { + "epoch": 0.20803596916928346, + "grad_norm": 1.5491348505020142, + "learning_rate": 4.4848326430907175e-05, + "loss": 5.6545, + "step": 34980 + }, + { + "epoch": 0.20804191645256448, + "grad_norm": 1.9779603481292725, + "learning_rate": 4.484804242911228e-05, + "loss": 4.5576, + "step": 34981 + }, + { + "epoch": 0.20804786373584547, + "grad_norm": 1.4585378170013428, + "learning_rate": 4.484775842038863e-05, + "loss": 4.989, + "step": 34982 + }, + { + "epoch": 0.20805381101912646, + "grad_norm": 1.6832143068313599, + "learning_rate": 4.4847474404736346e-05, + "loss": 4.9199, + "step": 34983 + }, + { + "epoch": 0.20805975830240747, + "grad_norm": 1.6539632081985474, + "learning_rate": 4.48471903821555e-05, + "loss": 4.3962, + "step": 34984 + }, + { + "epoch": 0.20806570558568846, + "grad_norm": 1.4862840175628662, + "learning_rate": 4.484690635264622e-05, + "loss": 4.9105, + "step": 34985 + }, + { + "epoch": 0.20807165286896945, + "grad_norm": 1.5097556114196777, + "learning_rate": 4.484662231620857e-05, + "loss": 4.8813, + "step": 34986 + }, + { + "epoch": 0.20807760015225046, + "grad_norm": 1.749756932258606, + "learning_rate": 4.484633827284269e-05, + "loss": 4.871, + "step": 34987 + }, + { + "epoch": 0.20808354743553145, + "grad_norm": 1.7925949096679688, + "learning_rate": 4.484605422254865e-05, + "loss": 4.4457, + "step": 34988 + }, + { + "epoch": 0.20808949471881244, + "grad_norm": 2.0869626998901367, + "learning_rate": 4.4845770165326555e-05, + "loss": 4.1676, + "step": 34989 + }, + { + "epoch": 0.20809544200209346, + "grad_norm": 1.7017414569854736, + "learning_rate": 4.484548610117651e-05, + "loss": 4.6058, + "step": 34990 + }, + { + "epoch": 0.20810138928537444, + "grad_norm": 1.696089506149292, + "learning_rate": 4.484520203009861e-05, + "loss": 4.4662, + "step": 34991 + }, + { + "epoch": 0.20810733656865543, + "grad_norm": 1.7537122964859009, + "learning_rate": 4.484491795209296e-05, + "loss": 4.4703, + "step": 34992 + }, + { + "epoch": 0.20811328385193645, + "grad_norm": 1.5926291942596436, + "learning_rate": 4.484463386715965e-05, + "loss": 4.7058, + "step": 34993 + }, + { + "epoch": 0.20811923113521744, + "grad_norm": 1.554070234298706, + "learning_rate": 4.484434977529878e-05, + "loss": 4.4741, + "step": 34994 + }, + { + "epoch": 0.20812517841849842, + "grad_norm": 1.7016302347183228, + "learning_rate": 4.484406567651045e-05, + "loss": 4.3942, + "step": 34995 + }, + { + "epoch": 0.20813112570177944, + "grad_norm": 2.093773603439331, + "learning_rate": 4.484378157079477e-05, + "loss": 3.8448, + "step": 34996 + }, + { + "epoch": 0.20813707298506043, + "grad_norm": 1.7667289972305298, + "learning_rate": 4.484349745815183e-05, + "loss": 4.0643, + "step": 34997 + }, + { + "epoch": 0.20814302026834142, + "grad_norm": 1.6492136716842651, + "learning_rate": 4.484321333858172e-05, + "loss": 4.2844, + "step": 34998 + }, + { + "epoch": 0.20814896755162243, + "grad_norm": 1.8489280939102173, + "learning_rate": 4.484292921208456e-05, + "loss": 4.2982, + "step": 34999 + }, + { + "epoch": 0.20815491483490342, + "grad_norm": 1.7901874780654907, + "learning_rate": 4.484264507866043e-05, + "loss": 4.3482, + "step": 35000 + }, + { + "epoch": 0.2081608621181844, + "grad_norm": 3.0331995487213135, + "learning_rate": 4.484236093830945e-05, + "loss": 4.3448, + "step": 35001 + }, + { + "epoch": 0.2081668094014654, + "grad_norm": 3.182864189147949, + "learning_rate": 4.484207679103169e-05, + "loss": 4.604, + "step": 35002 + }, + { + "epoch": 0.2081727566847464, + "grad_norm": 2.4753639698028564, + "learning_rate": 4.484179263682729e-05, + "loss": 4.4595, + "step": 35003 + }, + { + "epoch": 0.2081787039680274, + "grad_norm": 1.9478391408920288, + "learning_rate": 4.48415084756963e-05, + "loss": 4.1304, + "step": 35004 + }, + { + "epoch": 0.2081846512513084, + "grad_norm": 1.8722947835922241, + "learning_rate": 4.4841224307638856e-05, + "loss": 4.4465, + "step": 35005 + }, + { + "epoch": 0.2081905985345894, + "grad_norm": 1.7963893413543701, + "learning_rate": 4.4840940132655045e-05, + "loss": 4.4874, + "step": 35006 + }, + { + "epoch": 0.2081965458178704, + "grad_norm": 2.4044625759124756, + "learning_rate": 4.4840655950744965e-05, + "loss": 4.3432, + "step": 35007 + }, + { + "epoch": 0.20820249310115138, + "grad_norm": 2.240295648574829, + "learning_rate": 4.484037176190872e-05, + "loss": 4.4625, + "step": 35008 + }, + { + "epoch": 0.2082084403844324, + "grad_norm": 2.3064870834350586, + "learning_rate": 4.48400875661464e-05, + "loss": 4.3748, + "step": 35009 + }, + { + "epoch": 0.20821438766771339, + "grad_norm": 2.2277655601501465, + "learning_rate": 4.483980336345812e-05, + "loss": 4.2621, + "step": 35010 + }, + { + "epoch": 0.20822033495099437, + "grad_norm": 2.3768885135650635, + "learning_rate": 4.483951915384396e-05, + "loss": 4.3309, + "step": 35011 + }, + { + "epoch": 0.2082262822342754, + "grad_norm": 2.423457384109497, + "learning_rate": 4.4839234937304036e-05, + "loss": 4.4512, + "step": 35012 + }, + { + "epoch": 0.20823222951755638, + "grad_norm": 2.166076421737671, + "learning_rate": 4.483895071383843e-05, + "loss": 4.3357, + "step": 35013 + }, + { + "epoch": 0.20823817680083737, + "grad_norm": 2.4752864837646484, + "learning_rate": 4.483866648344727e-05, + "loss": 4.3325, + "step": 35014 + }, + { + "epoch": 0.20824412408411838, + "grad_norm": 2.4272568225860596, + "learning_rate": 4.483838224613062e-05, + "loss": 4.3429, + "step": 35015 + }, + { + "epoch": 0.20825007136739937, + "grad_norm": 2.4228246212005615, + "learning_rate": 4.48380980018886e-05, + "loss": 4.2242, + "step": 35016 + }, + { + "epoch": 0.20825601865068036, + "grad_norm": 2.2205100059509277, + "learning_rate": 4.4837813750721305e-05, + "loss": 4.3772, + "step": 35017 + }, + { + "epoch": 0.20826196593396137, + "grad_norm": 2.3196351528167725, + "learning_rate": 4.483752949262884e-05, + "loss": 4.1687, + "step": 35018 + }, + { + "epoch": 0.20826791321724236, + "grad_norm": 2.2172744274139404, + "learning_rate": 4.483724522761129e-05, + "loss": 4.2437, + "step": 35019 + }, + { + "epoch": 0.20827386050052335, + "grad_norm": 1.8818265199661255, + "learning_rate": 4.4836960955668773e-05, + "loss": 4.1817, + "step": 35020 + }, + { + "epoch": 0.20827980778380437, + "grad_norm": 2.141326904296875, + "learning_rate": 4.483667667680137e-05, + "loss": 4.2353, + "step": 35021 + }, + { + "epoch": 0.20828575506708535, + "grad_norm": 2.064363956451416, + "learning_rate": 4.483639239100919e-05, + "loss": 4.0952, + "step": 35022 + }, + { + "epoch": 0.20829170235036634, + "grad_norm": 1.8391005992889404, + "learning_rate": 4.483610809829232e-05, + "loss": 4.3724, + "step": 35023 + }, + { + "epoch": 0.20829764963364736, + "grad_norm": 2.879714250564575, + "learning_rate": 4.4835823798650884e-05, + "loss": 3.7298, + "step": 35024 + }, + { + "epoch": 0.20830359691692835, + "grad_norm": 2.702657699584961, + "learning_rate": 4.483553949208496e-05, + "loss": 3.5413, + "step": 35025 + }, + { + "epoch": 0.20830954420020933, + "grad_norm": 2.232855796813965, + "learning_rate": 4.483525517859466e-05, + "loss": 4.5523, + "step": 35026 + }, + { + "epoch": 0.20831549148349035, + "grad_norm": 2.239912748336792, + "learning_rate": 4.483497085818007e-05, + "loss": 4.139, + "step": 35027 + }, + { + "epoch": 0.20832143876677134, + "grad_norm": 2.0883944034576416, + "learning_rate": 4.4834686530841296e-05, + "loss": 4.1629, + "step": 35028 + }, + { + "epoch": 0.20832738605005233, + "grad_norm": 1.9795372486114502, + "learning_rate": 4.483440219657845e-05, + "loss": 4.485, + "step": 35029 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 1.6449062824249268, + "learning_rate": 4.48341178553916e-05, + "loss": 4.457, + "step": 35030 + }, + { + "epoch": 0.20833928061661433, + "grad_norm": 2.1899871826171875, + "learning_rate": 4.4833833507280884e-05, + "loss": 4.7317, + "step": 35031 + }, + { + "epoch": 0.20834522789989532, + "grad_norm": 1.852655291557312, + "learning_rate": 4.483354915224637e-05, + "loss": 4.8662, + "step": 35032 + }, + { + "epoch": 0.20835117518317633, + "grad_norm": 1.612601399421692, + "learning_rate": 4.4833264790288175e-05, + "loss": 4.4567, + "step": 35033 + }, + { + "epoch": 0.20835712246645732, + "grad_norm": 1.7933584451675415, + "learning_rate": 4.483298042140639e-05, + "loss": 4.8182, + "step": 35034 + }, + { + "epoch": 0.2083630697497383, + "grad_norm": 1.7479445934295654, + "learning_rate": 4.483269604560111e-05, + "loss": 4.571, + "step": 35035 + }, + { + "epoch": 0.20836901703301933, + "grad_norm": 1.9602153301239014, + "learning_rate": 4.4832411662872445e-05, + "loss": 4.4672, + "step": 35036 + }, + { + "epoch": 0.20837496431630032, + "grad_norm": 2.1408305168151855, + "learning_rate": 4.4832127273220494e-05, + "loss": 4.8864, + "step": 35037 + }, + { + "epoch": 0.2083809115995813, + "grad_norm": 2.346951961517334, + "learning_rate": 4.483184287664535e-05, + "loss": 4.5457, + "step": 35038 + }, + { + "epoch": 0.20838685888286232, + "grad_norm": 2.623779535293579, + "learning_rate": 4.483155847314712e-05, + "loss": 4.4477, + "step": 35039 + }, + { + "epoch": 0.2083928061661433, + "grad_norm": 2.211894989013672, + "learning_rate": 4.483127406272588e-05, + "loss": 3.9478, + "step": 35040 + }, + { + "epoch": 0.2083987534494243, + "grad_norm": 2.5383923053741455, + "learning_rate": 4.483098964538176e-05, + "loss": 3.6751, + "step": 35041 + }, + { + "epoch": 0.2084047007327053, + "grad_norm": 1.7223814725875854, + "learning_rate": 4.4830705221114845e-05, + "loss": 4.9108, + "step": 35042 + }, + { + "epoch": 0.2084106480159863, + "grad_norm": 1.842361330986023, + "learning_rate": 4.483042078992524e-05, + "loss": 5.1871, + "step": 35043 + }, + { + "epoch": 0.2084165952992673, + "grad_norm": 1.7734646797180176, + "learning_rate": 4.4830136351813035e-05, + "loss": 4.9155, + "step": 35044 + }, + { + "epoch": 0.2084225425825483, + "grad_norm": 1.9333977699279785, + "learning_rate": 4.482985190677834e-05, + "loss": 4.6412, + "step": 35045 + }, + { + "epoch": 0.2084284898658293, + "grad_norm": 1.7834563255310059, + "learning_rate": 4.4829567454821244e-05, + "loss": 4.7411, + "step": 35046 + }, + { + "epoch": 0.20843443714911028, + "grad_norm": 1.4860880374908447, + "learning_rate": 4.4829282995941846e-05, + "loss": 4.899, + "step": 35047 + }, + { + "epoch": 0.2084403844323913, + "grad_norm": 3.1144561767578125, + "learning_rate": 4.482899853014025e-05, + "loss": 3.0548, + "step": 35048 + }, + { + "epoch": 0.20844633171567228, + "grad_norm": 2.9812357425689697, + "learning_rate": 4.482871405741657e-05, + "loss": 2.8871, + "step": 35049 + }, + { + "epoch": 0.20845227899895327, + "grad_norm": 2.895320177078247, + "learning_rate": 4.4828429577770876e-05, + "loss": 2.1233, + "step": 35050 + }, + { + "epoch": 0.2084582262822343, + "grad_norm": 2.7188355922698975, + "learning_rate": 4.482814509120329e-05, + "loss": 2.3401, + "step": 35051 + }, + { + "epoch": 0.20846417356551528, + "grad_norm": 2.81449818611145, + "learning_rate": 4.48278605977139e-05, + "loss": 2.8756, + "step": 35052 + }, + { + "epoch": 0.20847012084879626, + "grad_norm": 2.326000452041626, + "learning_rate": 4.482757609730281e-05, + "loss": 3.6878, + "step": 35053 + }, + { + "epoch": 0.20847606813207728, + "grad_norm": 2.1207025051116943, + "learning_rate": 4.482729158997011e-05, + "loss": 4.064, + "step": 35054 + }, + { + "epoch": 0.20848201541535827, + "grad_norm": 1.9843616485595703, + "learning_rate": 4.482700707571592e-05, + "loss": 4.6437, + "step": 35055 + }, + { + "epoch": 0.20848796269863926, + "grad_norm": 1.9951170682907104, + "learning_rate": 4.4826722554540316e-05, + "loss": 4.7161, + "step": 35056 + }, + { + "epoch": 0.20849390998192027, + "grad_norm": 1.8895988464355469, + "learning_rate": 4.482643802644342e-05, + "loss": 4.8795, + "step": 35057 + }, + { + "epoch": 0.20849985726520126, + "grad_norm": 1.9991610050201416, + "learning_rate": 4.4826153491425307e-05, + "loss": 4.45, + "step": 35058 + }, + { + "epoch": 0.20850580454848225, + "grad_norm": 3.778761386871338, + "learning_rate": 4.4825868949486095e-05, + "loss": 3.1729, + "step": 35059 + }, + { + "epoch": 0.20851175183176324, + "grad_norm": 3.2395191192626953, + "learning_rate": 4.482558440062587e-05, + "loss": 3.8024, + "step": 35060 + }, + { + "epoch": 0.20851769911504425, + "grad_norm": 3.4706618785858154, + "learning_rate": 4.4825299844844746e-05, + "loss": 3.5835, + "step": 35061 + }, + { + "epoch": 0.20852364639832524, + "grad_norm": 3.5210063457489014, + "learning_rate": 4.482501528214282e-05, + "loss": 2.6244, + "step": 35062 + }, + { + "epoch": 0.20852959368160623, + "grad_norm": 3.2918505668640137, + "learning_rate": 4.482473071252018e-05, + "loss": 2.6852, + "step": 35063 + }, + { + "epoch": 0.20853554096488724, + "grad_norm": 2.6687605381011963, + "learning_rate": 4.4824446135976926e-05, + "loss": 1.4744, + "step": 35064 + }, + { + "epoch": 0.20854148824816823, + "grad_norm": 3.270942211151123, + "learning_rate": 4.4824161552513164e-05, + "loss": 2.2422, + "step": 35065 + }, + { + "epoch": 0.20854743553144922, + "grad_norm": 3.1928775310516357, + "learning_rate": 4.4823876962128994e-05, + "loss": 2.8081, + "step": 35066 + }, + { + "epoch": 0.20855338281473024, + "grad_norm": 3.082271099090576, + "learning_rate": 4.482359236482452e-05, + "loss": 2.5842, + "step": 35067 + }, + { + "epoch": 0.20855933009801123, + "grad_norm": 2.944580316543579, + "learning_rate": 4.482330776059983e-05, + "loss": 2.9251, + "step": 35068 + }, + { + "epoch": 0.2085652773812922, + "grad_norm": 3.051842451095581, + "learning_rate": 4.4823023149455024e-05, + "loss": 2.4925, + "step": 35069 + }, + { + "epoch": 0.20857122466457323, + "grad_norm": 2.8280301094055176, + "learning_rate": 4.482273853139021e-05, + "loss": 2.982, + "step": 35070 + }, + { + "epoch": 0.20857717194785422, + "grad_norm": 1.844641089439392, + "learning_rate": 4.482245390640548e-05, + "loss": 5.0901, + "step": 35071 + }, + { + "epoch": 0.2085831192311352, + "grad_norm": 1.7683004140853882, + "learning_rate": 4.4822169274500936e-05, + "loss": 5.0017, + "step": 35072 + }, + { + "epoch": 0.20858906651441622, + "grad_norm": 1.6485508680343628, + "learning_rate": 4.482188463567668e-05, + "loss": 4.7149, + "step": 35073 + }, + { + "epoch": 0.2085950137976972, + "grad_norm": 1.6834670305252075, + "learning_rate": 4.482159998993281e-05, + "loss": 4.898, + "step": 35074 + }, + { + "epoch": 0.2086009610809782, + "grad_norm": 1.6922709941864014, + "learning_rate": 4.482131533726942e-05, + "loss": 4.647, + "step": 35075 + }, + { + "epoch": 0.2086069083642592, + "grad_norm": 2.076922655105591, + "learning_rate": 4.482103067768662e-05, + "loss": 4.0892, + "step": 35076 + }, + { + "epoch": 0.2086128556475402, + "grad_norm": 2.6704981327056885, + "learning_rate": 4.4820746011184496e-05, + "loss": 4.3682, + "step": 35077 + }, + { + "epoch": 0.2086188029308212, + "grad_norm": 2.4579737186431885, + "learning_rate": 4.482046133776316e-05, + "loss": 4.1968, + "step": 35078 + }, + { + "epoch": 0.2086247502141022, + "grad_norm": 2.7950711250305176, + "learning_rate": 4.4820176657422693e-05, + "loss": 4.2779, + "step": 35079 + }, + { + "epoch": 0.2086306974973832, + "grad_norm": 2.204728364944458, + "learning_rate": 4.4819891970163216e-05, + "loss": 4.3001, + "step": 35080 + }, + { + "epoch": 0.20863664478066418, + "grad_norm": 1.8583356142044067, + "learning_rate": 4.4819607275984835e-05, + "loss": 4.3775, + "step": 35081 + }, + { + "epoch": 0.2086425920639452, + "grad_norm": 1.8849300146102905, + "learning_rate": 4.481932257488761e-05, + "loss": 4.6873, + "step": 35082 + }, + { + "epoch": 0.2086485393472262, + "grad_norm": 1.940974235534668, + "learning_rate": 4.481903786687167e-05, + "loss": 4.2494, + "step": 35083 + }, + { + "epoch": 0.20865448663050717, + "grad_norm": 2.0488009452819824, + "learning_rate": 4.481875315193712e-05, + "loss": 4.1133, + "step": 35084 + }, + { + "epoch": 0.2086604339137882, + "grad_norm": 1.903907060623169, + "learning_rate": 4.481846843008404e-05, + "loss": 4.3153, + "step": 35085 + }, + { + "epoch": 0.20866638119706918, + "grad_norm": 2.271176815032959, + "learning_rate": 4.4818183701312534e-05, + "loss": 4.1729, + "step": 35086 + }, + { + "epoch": 0.20867232848035017, + "grad_norm": 1.9775025844573975, + "learning_rate": 4.481789896562271e-05, + "loss": 4.4771, + "step": 35087 + }, + { + "epoch": 0.20867827576363118, + "grad_norm": 2.0481247901916504, + "learning_rate": 4.481761422301466e-05, + "loss": 4.3511, + "step": 35088 + }, + { + "epoch": 0.20868422304691217, + "grad_norm": 1.8747283220291138, + "learning_rate": 4.481732947348849e-05, + "loss": 4.4928, + "step": 35089 + }, + { + "epoch": 0.20869017033019316, + "grad_norm": 1.950616478919983, + "learning_rate": 4.4817044717044285e-05, + "loss": 4.3065, + "step": 35090 + }, + { + "epoch": 0.20869611761347417, + "grad_norm": 2.182492256164551, + "learning_rate": 4.481675995368216e-05, + "loss": 5.4017, + "step": 35091 + }, + { + "epoch": 0.20870206489675516, + "grad_norm": 1.7582393884658813, + "learning_rate": 4.4816475183402215e-05, + "loss": 3.7494, + "step": 35092 + }, + { + "epoch": 0.20870801218003615, + "grad_norm": 1.568738579750061, + "learning_rate": 4.481619040620454e-05, + "loss": 5.4418, + "step": 35093 + }, + { + "epoch": 0.20871395946331717, + "grad_norm": 2.5985677242279053, + "learning_rate": 4.4815905622089226e-05, + "loss": 4.8025, + "step": 35094 + }, + { + "epoch": 0.20871990674659816, + "grad_norm": 3.3421452045440674, + "learning_rate": 4.48156208310564e-05, + "loss": 4.5642, + "step": 35095 + }, + { + "epoch": 0.20872585402987914, + "grad_norm": 3.004498243331909, + "learning_rate": 4.4815336033106137e-05, + "loss": 4.0177, + "step": 35096 + }, + { + "epoch": 0.20873180131316016, + "grad_norm": 3.208674192428589, + "learning_rate": 4.481505122823855e-05, + "loss": 3.7116, + "step": 35097 + }, + { + "epoch": 0.20873774859644115, + "grad_norm": 2.824521064758301, + "learning_rate": 4.481476641645373e-05, + "loss": 4.1009, + "step": 35098 + }, + { + "epoch": 0.20874369587972214, + "grad_norm": 2.6789655685424805, + "learning_rate": 4.481448159775178e-05, + "loss": 3.7663, + "step": 35099 + }, + { + "epoch": 0.20874964316300315, + "grad_norm": 2.2127678394317627, + "learning_rate": 4.4814196772132796e-05, + "loss": 4.2904, + "step": 35100 + }, + { + "epoch": 0.20875559044628414, + "grad_norm": 2.5038135051727295, + "learning_rate": 4.481391193959689e-05, + "loss": 4.494, + "step": 35101 + }, + { + "epoch": 0.20876153772956513, + "grad_norm": 2.7562382221221924, + "learning_rate": 4.481362710014414e-05, + "loss": 4.677, + "step": 35102 + }, + { + "epoch": 0.20876748501284614, + "grad_norm": 2.437014579772949, + "learning_rate": 4.481334225377466e-05, + "loss": 4.451, + "step": 35103 + }, + { + "epoch": 0.20877343229612713, + "grad_norm": 2.1886918544769287, + "learning_rate": 4.481305740048856e-05, + "loss": 4.5499, + "step": 35104 + }, + { + "epoch": 0.20877937957940812, + "grad_norm": 2.4093780517578125, + "learning_rate": 4.481277254028591e-05, + "loss": 4.4421, + "step": 35105 + }, + { + "epoch": 0.20878532686268914, + "grad_norm": 2.2635338306427, + "learning_rate": 4.4812487673166834e-05, + "loss": 4.2136, + "step": 35106 + }, + { + "epoch": 0.20879127414597012, + "grad_norm": 2.370861291885376, + "learning_rate": 4.481220279913142e-05, + "loss": 3.9111, + "step": 35107 + }, + { + "epoch": 0.2087972214292511, + "grad_norm": 2.5577683448791504, + "learning_rate": 4.481191791817977e-05, + "loss": 3.6726, + "step": 35108 + }, + { + "epoch": 0.20880316871253213, + "grad_norm": 2.295682430267334, + "learning_rate": 4.481163303031199e-05, + "loss": 4.3153, + "step": 35109 + }, + { + "epoch": 0.20880911599581312, + "grad_norm": 2.155688762664795, + "learning_rate": 4.4811348135528165e-05, + "loss": 4.3882, + "step": 35110 + }, + { + "epoch": 0.2088150632790941, + "grad_norm": 1.9954904317855835, + "learning_rate": 4.481106323382841e-05, + "loss": 4.3073, + "step": 35111 + }, + { + "epoch": 0.20882101056237512, + "grad_norm": 2.2071473598480225, + "learning_rate": 4.481077832521282e-05, + "loss": 4.0741, + "step": 35112 + }, + { + "epoch": 0.2088269578456561, + "grad_norm": 2.506493330001831, + "learning_rate": 4.4810493409681486e-05, + "loss": 3.9014, + "step": 35113 + }, + { + "epoch": 0.2088329051289371, + "grad_norm": 2.4370062351226807, + "learning_rate": 4.4810208487234515e-05, + "loss": 4.2152, + "step": 35114 + }, + { + "epoch": 0.2088388524122181, + "grad_norm": 2.3963093757629395, + "learning_rate": 4.4809923557872e-05, + "loss": 3.9049, + "step": 35115 + }, + { + "epoch": 0.2088447996954991, + "grad_norm": 2.477271556854248, + "learning_rate": 4.4809638621594054e-05, + "loss": 3.8888, + "step": 35116 + }, + { + "epoch": 0.2088507469787801, + "grad_norm": 2.3511276245117188, + "learning_rate": 4.480935367840076e-05, + "loss": 4.0679, + "step": 35117 + }, + { + "epoch": 0.20885669426206108, + "grad_norm": 2.7005770206451416, + "learning_rate": 4.480906872829223e-05, + "loss": 4.3311, + "step": 35118 + }, + { + "epoch": 0.2088626415453421, + "grad_norm": 2.63441801071167, + "learning_rate": 4.480878377126856e-05, + "loss": 4.4622, + "step": 35119 + }, + { + "epoch": 0.20886858882862308, + "grad_norm": 2.249758005142212, + "learning_rate": 4.480849880732985e-05, + "loss": 4.2341, + "step": 35120 + }, + { + "epoch": 0.20887453611190407, + "grad_norm": 2.5190210342407227, + "learning_rate": 4.480821383647619e-05, + "loss": 4.0408, + "step": 35121 + }, + { + "epoch": 0.20888048339518508, + "grad_norm": 2.223970890045166, + "learning_rate": 4.4807928858707696e-05, + "loss": 4.2521, + "step": 35122 + }, + { + "epoch": 0.20888643067846607, + "grad_norm": 2.336270570755005, + "learning_rate": 4.480764387402445e-05, + "loss": 4.2149, + "step": 35123 + }, + { + "epoch": 0.20889237796174706, + "grad_norm": 2.7396438121795654, + "learning_rate": 4.4807358882426564e-05, + "loss": 4.0704, + "step": 35124 + }, + { + "epoch": 0.20889832524502808, + "grad_norm": 2.5378661155700684, + "learning_rate": 4.480707388391413e-05, + "loss": 3.8361, + "step": 35125 + }, + { + "epoch": 0.20890427252830907, + "grad_norm": 2.328519582748413, + "learning_rate": 4.480678887848726e-05, + "loss": 4.2595, + "step": 35126 + }, + { + "epoch": 0.20891021981159005, + "grad_norm": 2.6499722003936768, + "learning_rate": 4.4806503866146036e-05, + "loss": 4.1182, + "step": 35127 + }, + { + "epoch": 0.20891616709487107, + "grad_norm": 2.24397349357605, + "learning_rate": 4.480621884689057e-05, + "loss": 4.1824, + "step": 35128 + }, + { + "epoch": 0.20892211437815206, + "grad_norm": 2.074115514755249, + "learning_rate": 4.480593382072096e-05, + "loss": 4.2477, + "step": 35129 + }, + { + "epoch": 0.20892806166143305, + "grad_norm": 2.1369383335113525, + "learning_rate": 4.48056487876373e-05, + "loss": 4.138, + "step": 35130 + }, + { + "epoch": 0.20893400894471406, + "grad_norm": 2.288029193878174, + "learning_rate": 4.48053637476397e-05, + "loss": 4.2829, + "step": 35131 + }, + { + "epoch": 0.20893995622799505, + "grad_norm": 2.116546869277954, + "learning_rate": 4.4805078700728235e-05, + "loss": 3.9769, + "step": 35132 + }, + { + "epoch": 0.20894590351127604, + "grad_norm": 2.4647371768951416, + "learning_rate": 4.480479364690303e-05, + "loss": 3.9284, + "step": 35133 + }, + { + "epoch": 0.20895185079455705, + "grad_norm": 2.188466787338257, + "learning_rate": 4.4804508586164184e-05, + "loss": 4.2483, + "step": 35134 + }, + { + "epoch": 0.20895779807783804, + "grad_norm": 2.6960582733154297, + "learning_rate": 4.480422351851178e-05, + "loss": 4.5028, + "step": 35135 + }, + { + "epoch": 0.20896374536111903, + "grad_norm": 2.6402602195739746, + "learning_rate": 4.480393844394592e-05, + "loss": 4.4322, + "step": 35136 + }, + { + "epoch": 0.20896969264440005, + "grad_norm": 2.3040831089019775, + "learning_rate": 4.480365336246673e-05, + "loss": 4.6079, + "step": 35137 + }, + { + "epoch": 0.20897563992768103, + "grad_norm": 2.240013360977173, + "learning_rate": 4.480336827407427e-05, + "loss": 4.4865, + "step": 35138 + }, + { + "epoch": 0.20898158721096202, + "grad_norm": 2.362314462661743, + "learning_rate": 4.4803083178768667e-05, + "loss": 4.0822, + "step": 35139 + }, + { + "epoch": 0.20898753449424304, + "grad_norm": 2.440065622329712, + "learning_rate": 4.480279807655001e-05, + "loss": 4.6024, + "step": 35140 + }, + { + "epoch": 0.20899348177752403, + "grad_norm": 2.482828378677368, + "learning_rate": 4.480251296741841e-05, + "loss": 4.4708, + "step": 35141 + }, + { + "epoch": 0.20899942906080501, + "grad_norm": 2.3551008701324463, + "learning_rate": 4.480222785137395e-05, + "loss": 4.4908, + "step": 35142 + }, + { + "epoch": 0.20900537634408603, + "grad_norm": 2.2475407123565674, + "learning_rate": 4.4801942728416734e-05, + "loss": 4.5739, + "step": 35143 + }, + { + "epoch": 0.20901132362736702, + "grad_norm": 2.2544219493865967, + "learning_rate": 4.4801657598546865e-05, + "loss": 4.6347, + "step": 35144 + }, + { + "epoch": 0.209017270910648, + "grad_norm": 1.9965347051620483, + "learning_rate": 4.480137246176445e-05, + "loss": 4.5584, + "step": 35145 + }, + { + "epoch": 0.20902321819392902, + "grad_norm": 2.303065061569214, + "learning_rate": 4.480108731806957e-05, + "loss": 4.5769, + "step": 35146 + }, + { + "epoch": 0.20902916547721, + "grad_norm": 2.274526596069336, + "learning_rate": 4.480080216746234e-05, + "loss": 4.5248, + "step": 35147 + }, + { + "epoch": 0.209035112760491, + "grad_norm": 2.3997044563293457, + "learning_rate": 4.480051700994286e-05, + "loss": 4.713, + "step": 35148 + }, + { + "epoch": 0.20904106004377201, + "grad_norm": 2.116888999938965, + "learning_rate": 4.480023184551121e-05, + "loss": 4.4275, + "step": 35149 + }, + { + "epoch": 0.209047007327053, + "grad_norm": 2.7580020427703857, + "learning_rate": 4.479994667416751e-05, + "loss": 4.0885, + "step": 35150 + }, + { + "epoch": 0.209052954610334, + "grad_norm": 2.0336437225341797, + "learning_rate": 4.479966149591186e-05, + "loss": 4.1945, + "step": 35151 + }, + { + "epoch": 0.209058901893615, + "grad_norm": 2.166522979736328, + "learning_rate": 4.479937631074435e-05, + "loss": 4.3606, + "step": 35152 + }, + { + "epoch": 0.209064849176896, + "grad_norm": 2.0222151279449463, + "learning_rate": 4.479909111866507e-05, + "loss": 4.2598, + "step": 35153 + }, + { + "epoch": 0.20907079646017698, + "grad_norm": 2.1556873321533203, + "learning_rate": 4.479880591967415e-05, + "loss": 4.2919, + "step": 35154 + }, + { + "epoch": 0.209076743743458, + "grad_norm": 1.9412826299667358, + "learning_rate": 4.4798520713771655e-05, + "loss": 4.3147, + "step": 35155 + }, + { + "epoch": 0.209082691026739, + "grad_norm": 2.263427972793579, + "learning_rate": 4.479823550095771e-05, + "loss": 4.2697, + "step": 35156 + }, + { + "epoch": 0.20908863831001998, + "grad_norm": 2.105473756790161, + "learning_rate": 4.4797950281232405e-05, + "loss": 4.2734, + "step": 35157 + }, + { + "epoch": 0.209094585593301, + "grad_norm": 2.31563138961792, + "learning_rate": 4.479766505459584e-05, + "loss": 4.3769, + "step": 35158 + }, + { + "epoch": 0.20910053287658198, + "grad_norm": 2.2249670028686523, + "learning_rate": 4.479737982104811e-05, + "loss": 4.358, + "step": 35159 + }, + { + "epoch": 0.20910648015986297, + "grad_norm": 1.9306457042694092, + "learning_rate": 4.479709458058933e-05, + "loss": 4.5871, + "step": 35160 + }, + { + "epoch": 0.20911242744314398, + "grad_norm": 2.4048049449920654, + "learning_rate": 4.479680933321958e-05, + "loss": 4.164, + "step": 35161 + }, + { + "epoch": 0.20911837472642497, + "grad_norm": 2.171954393386841, + "learning_rate": 4.4796524078938974e-05, + "loss": 4.3285, + "step": 35162 + }, + { + "epoch": 0.20912432200970596, + "grad_norm": 2.1672539710998535, + "learning_rate": 4.47962388177476e-05, + "loss": 3.9887, + "step": 35163 + }, + { + "epoch": 0.20913026929298698, + "grad_norm": 2.4803264141082764, + "learning_rate": 4.479595354964556e-05, + "loss": 4.2461, + "step": 35164 + }, + { + "epoch": 0.20913621657626796, + "grad_norm": 2.385725975036621, + "learning_rate": 4.4795668274632965e-05, + "loss": 4.3727, + "step": 35165 + }, + { + "epoch": 0.20914216385954895, + "grad_norm": 2.17445707321167, + "learning_rate": 4.4795382992709914e-05, + "loss": 4.386, + "step": 35166 + }, + { + "epoch": 0.20914811114282997, + "grad_norm": 2.45085072517395, + "learning_rate": 4.4795097703876484e-05, + "loss": 4.3429, + "step": 35167 + }, + { + "epoch": 0.20915405842611096, + "grad_norm": 2.0739786624908447, + "learning_rate": 4.4794812408132796e-05, + "loss": 4.2034, + "step": 35168 + }, + { + "epoch": 0.20916000570939194, + "grad_norm": 2.2545764446258545, + "learning_rate": 4.4794527105478946e-05, + "loss": 4.3227, + "step": 35169 + }, + { + "epoch": 0.20916595299267296, + "grad_norm": 2.1697545051574707, + "learning_rate": 4.479424179591503e-05, + "loss": 4.4496, + "step": 35170 + }, + { + "epoch": 0.20917190027595395, + "grad_norm": 2.493567943572998, + "learning_rate": 4.4793956479441144e-05, + "loss": 4.2328, + "step": 35171 + }, + { + "epoch": 0.20917784755923494, + "grad_norm": 2.3742611408233643, + "learning_rate": 4.4793671156057396e-05, + "loss": 4.3438, + "step": 35172 + }, + { + "epoch": 0.20918379484251595, + "grad_norm": 2.3343236446380615, + "learning_rate": 4.4793385825763885e-05, + "loss": 4.1543, + "step": 35173 + }, + { + "epoch": 0.20918974212579694, + "grad_norm": 2.553321599960327, + "learning_rate": 4.47931004885607e-05, + "loss": 4.1855, + "step": 35174 + }, + { + "epoch": 0.20919568940907793, + "grad_norm": 2.2950751781463623, + "learning_rate": 4.4792815144447954e-05, + "loss": 4.2327, + "step": 35175 + }, + { + "epoch": 0.20920163669235892, + "grad_norm": 2.211557388305664, + "learning_rate": 4.4792529793425744e-05, + "loss": 4.2977, + "step": 35176 + }, + { + "epoch": 0.20920758397563993, + "grad_norm": 2.1329169273376465, + "learning_rate": 4.479224443549416e-05, + "loss": 4.0576, + "step": 35177 + }, + { + "epoch": 0.20921353125892092, + "grad_norm": 2.223177194595337, + "learning_rate": 4.4791959070653304e-05, + "loss": 4.2397, + "step": 35178 + }, + { + "epoch": 0.2092194785422019, + "grad_norm": 2.0334205627441406, + "learning_rate": 4.479167369890328e-05, + "loss": 4.3574, + "step": 35179 + }, + { + "epoch": 0.20922542582548292, + "grad_norm": 2.3178441524505615, + "learning_rate": 4.47913883202442e-05, + "loss": 4.1552, + "step": 35180 + }, + { + "epoch": 0.2092313731087639, + "grad_norm": 2.5519886016845703, + "learning_rate": 4.479110293467614e-05, + "loss": 4.0306, + "step": 35181 + }, + { + "epoch": 0.2092373203920449, + "grad_norm": 2.1643712520599365, + "learning_rate": 4.479081754219922e-05, + "loss": 4.2537, + "step": 35182 + }, + { + "epoch": 0.20924326767532592, + "grad_norm": 2.2243382930755615, + "learning_rate": 4.479053214281352e-05, + "loss": 4.271, + "step": 35183 + }, + { + "epoch": 0.2092492149586069, + "grad_norm": 2.1648247241973877, + "learning_rate": 4.4790246736519145e-05, + "loss": 4.1709, + "step": 35184 + }, + { + "epoch": 0.2092551622418879, + "grad_norm": 2.4812254905700684, + "learning_rate": 4.478996132331621e-05, + "loss": 3.6349, + "step": 35185 + }, + { + "epoch": 0.2092611095251689, + "grad_norm": 1.966122031211853, + "learning_rate": 4.4789675903204805e-05, + "loss": 4.0373, + "step": 35186 + }, + { + "epoch": 0.2092670568084499, + "grad_norm": 2.027426242828369, + "learning_rate": 4.4789390476185024e-05, + "loss": 4.005, + "step": 35187 + }, + { + "epoch": 0.20927300409173089, + "grad_norm": 2.40824818611145, + "learning_rate": 4.478910504225697e-05, + "loss": 3.7404, + "step": 35188 + }, + { + "epoch": 0.2092789513750119, + "grad_norm": 2.3484485149383545, + "learning_rate": 4.478881960142075e-05, + "loss": 4.4463, + "step": 35189 + }, + { + "epoch": 0.2092848986582929, + "grad_norm": 2.3539352416992188, + "learning_rate": 4.4788534153676455e-05, + "loss": 4.2863, + "step": 35190 + }, + { + "epoch": 0.20929084594157388, + "grad_norm": 2.364746570587158, + "learning_rate": 4.478824869902418e-05, + "loss": 4.1668, + "step": 35191 + }, + { + "epoch": 0.2092967932248549, + "grad_norm": 1.9087117910385132, + "learning_rate": 4.478796323746404e-05, + "loss": 4.5418, + "step": 35192 + }, + { + "epoch": 0.20930274050813588, + "grad_norm": 1.5683953762054443, + "learning_rate": 4.478767776899612e-05, + "loss": 4.593, + "step": 35193 + }, + { + "epoch": 0.20930868779141687, + "grad_norm": 1.957962155342102, + "learning_rate": 4.478739229362053e-05, + "loss": 4.4794, + "step": 35194 + }, + { + "epoch": 0.20931463507469789, + "grad_norm": 2.0151965618133545, + "learning_rate": 4.478710681133737e-05, + "loss": 4.4208, + "step": 35195 + }, + { + "epoch": 0.20932058235797887, + "grad_norm": 2.0356502532958984, + "learning_rate": 4.4786821322146735e-05, + "loss": 4.3885, + "step": 35196 + }, + { + "epoch": 0.20932652964125986, + "grad_norm": 1.8693149089813232, + "learning_rate": 4.4786535826048714e-05, + "loss": 4.1047, + "step": 35197 + }, + { + "epoch": 0.20933247692454088, + "grad_norm": 1.9223167896270752, + "learning_rate": 4.478625032304343e-05, + "loss": 4.5433, + "step": 35198 + }, + { + "epoch": 0.20933842420782187, + "grad_norm": 1.6214507818222046, + "learning_rate": 4.4785964813130964e-05, + "loss": 4.5297, + "step": 35199 + }, + { + "epoch": 0.20934437149110285, + "grad_norm": 1.4718947410583496, + "learning_rate": 4.478567929631142e-05, + "loss": 4.2412, + "step": 35200 + }, + { + "epoch": 0.20935031877438387, + "grad_norm": 1.8548481464385986, + "learning_rate": 4.478539377258491e-05, + "loss": 4.67, + "step": 35201 + }, + { + "epoch": 0.20935626605766486, + "grad_norm": 1.70490300655365, + "learning_rate": 4.478510824195151e-05, + "loss": 4.484, + "step": 35202 + }, + { + "epoch": 0.20936221334094585, + "grad_norm": 1.9421411752700806, + "learning_rate": 4.478482270441135e-05, + "loss": 4.3252, + "step": 35203 + }, + { + "epoch": 0.20936816062422686, + "grad_norm": 1.811452865600586, + "learning_rate": 4.47845371599645e-05, + "loss": 4.462, + "step": 35204 + }, + { + "epoch": 0.20937410790750785, + "grad_norm": 1.5066571235656738, + "learning_rate": 4.478425160861107e-05, + "loss": 4.8129, + "step": 35205 + }, + { + "epoch": 0.20938005519078884, + "grad_norm": 1.7884474992752075, + "learning_rate": 4.478396605035117e-05, + "loss": 4.5078, + "step": 35206 + }, + { + "epoch": 0.20938600247406985, + "grad_norm": 1.5454497337341309, + "learning_rate": 4.4783680485184885e-05, + "loss": 4.3778, + "step": 35207 + }, + { + "epoch": 0.20939194975735084, + "grad_norm": 1.5749461650848389, + "learning_rate": 4.478339491311233e-05, + "loss": 4.6328, + "step": 35208 + }, + { + "epoch": 0.20939789704063183, + "grad_norm": 1.6245211362838745, + "learning_rate": 4.478310933413359e-05, + "loss": 4.7798, + "step": 35209 + }, + { + "epoch": 0.20940384432391285, + "grad_norm": 1.6137746572494507, + "learning_rate": 4.478282374824878e-05, + "loss": 4.2092, + "step": 35210 + }, + { + "epoch": 0.20940979160719383, + "grad_norm": 1.6076363325119019, + "learning_rate": 4.478253815545798e-05, + "loss": 4.0179, + "step": 35211 + }, + { + "epoch": 0.20941573889047482, + "grad_norm": 2.2339091300964355, + "learning_rate": 4.4782252555761304e-05, + "loss": 4.5163, + "step": 35212 + }, + { + "epoch": 0.20942168617375584, + "grad_norm": 2.1667139530181885, + "learning_rate": 4.478196694915885e-05, + "loss": 4.623, + "step": 35213 + }, + { + "epoch": 0.20942763345703683, + "grad_norm": 1.8673685789108276, + "learning_rate": 4.478168133565071e-05, + "loss": 4.5131, + "step": 35214 + }, + { + "epoch": 0.20943358074031782, + "grad_norm": 1.7350704669952393, + "learning_rate": 4.4781395715237e-05, + "loss": 4.6335, + "step": 35215 + }, + { + "epoch": 0.20943952802359883, + "grad_norm": 2.038442611694336, + "learning_rate": 4.4781110087917796e-05, + "loss": 4.071, + "step": 35216 + }, + { + "epoch": 0.20944547530687982, + "grad_norm": 1.670720100402832, + "learning_rate": 4.478082445369322e-05, + "loss": 4.8153, + "step": 35217 + }, + { + "epoch": 0.2094514225901608, + "grad_norm": 1.5883606672286987, + "learning_rate": 4.4780538812563355e-05, + "loss": 4.6545, + "step": 35218 + }, + { + "epoch": 0.20945736987344182, + "grad_norm": 1.758099913597107, + "learning_rate": 4.478025316452832e-05, + "loss": 5.1267, + "step": 35219 + }, + { + "epoch": 0.2094633171567228, + "grad_norm": 1.836985468864441, + "learning_rate": 4.477996750958819e-05, + "loss": 4.9632, + "step": 35220 + }, + { + "epoch": 0.2094692644400038, + "grad_norm": 1.5703904628753662, + "learning_rate": 4.477968184774308e-05, + "loss": 5.1253, + "step": 35221 + }, + { + "epoch": 0.20947521172328482, + "grad_norm": 1.717297911643982, + "learning_rate": 4.4779396178993094e-05, + "loss": 4.1864, + "step": 35222 + }, + { + "epoch": 0.2094811590065658, + "grad_norm": 1.8805084228515625, + "learning_rate": 4.4779110503338325e-05, + "loss": 4.2417, + "step": 35223 + }, + { + "epoch": 0.2094871062898468, + "grad_norm": 2.0293681621551514, + "learning_rate": 4.477882482077887e-05, + "loss": 4.1028, + "step": 35224 + }, + { + "epoch": 0.2094930535731278, + "grad_norm": 1.868170142173767, + "learning_rate": 4.477853913131483e-05, + "loss": 3.8102, + "step": 35225 + }, + { + "epoch": 0.2094990008564088, + "grad_norm": 1.7893959283828735, + "learning_rate": 4.4778253434946305e-05, + "loss": 4.4876, + "step": 35226 + }, + { + "epoch": 0.20950494813968978, + "grad_norm": 1.837123990058899, + "learning_rate": 4.47779677316734e-05, + "loss": 4.3673, + "step": 35227 + }, + { + "epoch": 0.2095108954229708, + "grad_norm": 2.12733793258667, + "learning_rate": 4.477768202149621e-05, + "loss": 4.2554, + "step": 35228 + }, + { + "epoch": 0.2095168427062518, + "grad_norm": 2.033757209777832, + "learning_rate": 4.477739630441484e-05, + "loss": 4.4083, + "step": 35229 + }, + { + "epoch": 0.20952278998953278, + "grad_norm": 2.0243184566497803, + "learning_rate": 4.477711058042938e-05, + "loss": 4.5907, + "step": 35230 + }, + { + "epoch": 0.2095287372728138, + "grad_norm": 2.0109145641326904, + "learning_rate": 4.4776824849539935e-05, + "loss": 4.1614, + "step": 35231 + }, + { + "epoch": 0.20953468455609478, + "grad_norm": 1.9397257566452026, + "learning_rate": 4.4776539111746604e-05, + "loss": 4.2821, + "step": 35232 + }, + { + "epoch": 0.20954063183937577, + "grad_norm": 1.6508504152297974, + "learning_rate": 4.4776253367049495e-05, + "loss": 4.3756, + "step": 35233 + }, + { + "epoch": 0.20954657912265678, + "grad_norm": 2.013890266418457, + "learning_rate": 4.477596761544869e-05, + "loss": 3.8501, + "step": 35234 + }, + { + "epoch": 0.20955252640593777, + "grad_norm": 1.892042875289917, + "learning_rate": 4.47756818569443e-05, + "loss": 4.1742, + "step": 35235 + }, + { + "epoch": 0.20955847368921876, + "grad_norm": 1.5260576009750366, + "learning_rate": 4.4775396091536425e-05, + "loss": 4.7992, + "step": 35236 + }, + { + "epoch": 0.20956442097249975, + "grad_norm": 1.4659627676010132, + "learning_rate": 4.477511031922517e-05, + "loss": 5.1104, + "step": 35237 + }, + { + "epoch": 0.20957036825578076, + "grad_norm": 1.4066425561904907, + "learning_rate": 4.4774824540010625e-05, + "loss": 5.0196, + "step": 35238 + }, + { + "epoch": 0.20957631553906175, + "grad_norm": 2.000969409942627, + "learning_rate": 4.477453875389289e-05, + "loss": 4.4638, + "step": 35239 + }, + { + "epoch": 0.20958226282234274, + "grad_norm": 1.9423243999481201, + "learning_rate": 4.4774252960872066e-05, + "loss": 4.5488, + "step": 35240 + }, + { + "epoch": 0.20958821010562376, + "grad_norm": 1.9737645387649536, + "learning_rate": 4.477396716094826e-05, + "loss": 4.5551, + "step": 35241 + }, + { + "epoch": 0.20959415738890474, + "grad_norm": 2.2152135372161865, + "learning_rate": 4.477368135412157e-05, + "loss": 4.1769, + "step": 35242 + }, + { + "epoch": 0.20960010467218573, + "grad_norm": 1.604505181312561, + "learning_rate": 4.4773395540392086e-05, + "loss": 4.2283, + "step": 35243 + }, + { + "epoch": 0.20960605195546675, + "grad_norm": 1.5856635570526123, + "learning_rate": 4.477310971975991e-05, + "loss": 4.4282, + "step": 35244 + }, + { + "epoch": 0.20961199923874774, + "grad_norm": 1.7525242567062378, + "learning_rate": 4.4772823892225146e-05, + "loss": 4.4311, + "step": 35245 + }, + { + "epoch": 0.20961794652202873, + "grad_norm": 1.9557976722717285, + "learning_rate": 4.4772538057787904e-05, + "loss": 4.4978, + "step": 35246 + }, + { + "epoch": 0.20962389380530974, + "grad_norm": 1.9850143194198608, + "learning_rate": 4.4772252216448265e-05, + "loss": 4.1881, + "step": 35247 + }, + { + "epoch": 0.20962984108859073, + "grad_norm": 2.2965569496154785, + "learning_rate": 4.4771966368206345e-05, + "loss": 4.4171, + "step": 35248 + }, + { + "epoch": 0.20963578837187172, + "grad_norm": 2.1682398319244385, + "learning_rate": 4.4771680513062224e-05, + "loss": 4.2601, + "step": 35249 + }, + { + "epoch": 0.20964173565515273, + "grad_norm": 2.0759825706481934, + "learning_rate": 4.477139465101602e-05, + "loss": 4.5485, + "step": 35250 + }, + { + "epoch": 0.20964768293843372, + "grad_norm": 1.9251832962036133, + "learning_rate": 4.477110878206783e-05, + "loss": 4.4246, + "step": 35251 + }, + { + "epoch": 0.2096536302217147, + "grad_norm": 1.963021993637085, + "learning_rate": 4.477082290621774e-05, + "loss": 4.5177, + "step": 35252 + }, + { + "epoch": 0.20965957750499573, + "grad_norm": 1.748746633529663, + "learning_rate": 4.4770537023465864e-05, + "loss": 4.7079, + "step": 35253 + }, + { + "epoch": 0.2096655247882767, + "grad_norm": 1.9013080596923828, + "learning_rate": 4.47702511338123e-05, + "loss": 4.6707, + "step": 35254 + }, + { + "epoch": 0.2096714720715577, + "grad_norm": 1.957573413848877, + "learning_rate": 4.476996523725715e-05, + "loss": 4.2977, + "step": 35255 + }, + { + "epoch": 0.20967741935483872, + "grad_norm": 1.7000887393951416, + "learning_rate": 4.476967933380051e-05, + "loss": 4.4874, + "step": 35256 + }, + { + "epoch": 0.2096833666381197, + "grad_norm": 1.8152967691421509, + "learning_rate": 4.476939342344246e-05, + "loss": 4.216, + "step": 35257 + }, + { + "epoch": 0.2096893139214007, + "grad_norm": 1.932236671447754, + "learning_rate": 4.476910750618314e-05, + "loss": 4.4371, + "step": 35258 + }, + { + "epoch": 0.2096952612046817, + "grad_norm": 2.0386545658111572, + "learning_rate": 4.4768821582022625e-05, + "loss": 4.4598, + "step": 35259 + }, + { + "epoch": 0.2097012084879627, + "grad_norm": 1.8710325956344604, + "learning_rate": 4.4768535650961014e-05, + "loss": 4.2673, + "step": 35260 + }, + { + "epoch": 0.2097071557712437, + "grad_norm": 1.9699058532714844, + "learning_rate": 4.476824971299841e-05, + "loss": 4.1835, + "step": 35261 + }, + { + "epoch": 0.2097131030545247, + "grad_norm": 1.6277472972869873, + "learning_rate": 4.476796376813492e-05, + "loss": 5.0931, + "step": 35262 + }, + { + "epoch": 0.2097190503378057, + "grad_norm": 1.709804892539978, + "learning_rate": 4.476767781637064e-05, + "loss": 5.1094, + "step": 35263 + }, + { + "epoch": 0.20972499762108668, + "grad_norm": 1.692352056503296, + "learning_rate": 4.4767391857705654e-05, + "loss": 5.0337, + "step": 35264 + }, + { + "epoch": 0.2097309449043677, + "grad_norm": 1.619791030883789, + "learning_rate": 4.476710589214009e-05, + "loss": 5.044, + "step": 35265 + }, + { + "epoch": 0.20973689218764868, + "grad_norm": 1.5870261192321777, + "learning_rate": 4.4766819919674024e-05, + "loss": 4.9422, + "step": 35266 + }, + { + "epoch": 0.20974283947092967, + "grad_norm": 1.756438136100769, + "learning_rate": 4.4766533940307574e-05, + "loss": 4.9927, + "step": 35267 + }, + { + "epoch": 0.2097487867542107, + "grad_norm": 2.050565004348755, + "learning_rate": 4.476624795404082e-05, + "loss": 5.0727, + "step": 35268 + }, + { + "epoch": 0.20975473403749167, + "grad_norm": 1.7845942974090576, + "learning_rate": 4.476596196087388e-05, + "loss": 4.9263, + "step": 35269 + }, + { + "epoch": 0.20976068132077266, + "grad_norm": 1.4715898036956787, + "learning_rate": 4.476567596080685e-05, + "loss": 5.2048, + "step": 35270 + }, + { + "epoch": 0.20976662860405368, + "grad_norm": 1.4592971801757812, + "learning_rate": 4.4765389953839824e-05, + "loss": 5.5331, + "step": 35271 + }, + { + "epoch": 0.20977257588733467, + "grad_norm": 1.4662095308303833, + "learning_rate": 4.476510393997291e-05, + "loss": 5.398, + "step": 35272 + }, + { + "epoch": 0.20977852317061566, + "grad_norm": 1.626869559288025, + "learning_rate": 4.4764817919206194e-05, + "loss": 4.9234, + "step": 35273 + }, + { + "epoch": 0.20978447045389667, + "grad_norm": 1.792540431022644, + "learning_rate": 4.4764531891539785e-05, + "loss": 5.1534, + "step": 35274 + }, + { + "epoch": 0.20979041773717766, + "grad_norm": 1.702528715133667, + "learning_rate": 4.4764245856973784e-05, + "loss": 5.1792, + "step": 35275 + }, + { + "epoch": 0.20979636502045865, + "grad_norm": 1.5575789213180542, + "learning_rate": 4.476395981550829e-05, + "loss": 5.3819, + "step": 35276 + }, + { + "epoch": 0.20980231230373966, + "grad_norm": 1.7874699831008911, + "learning_rate": 4.47636737671434e-05, + "loss": 5.4195, + "step": 35277 + }, + { + "epoch": 0.20980825958702065, + "grad_norm": 1.5927484035491943, + "learning_rate": 4.4763387711879215e-05, + "loss": 4.7251, + "step": 35278 + }, + { + "epoch": 0.20981420687030164, + "grad_norm": 1.572074294090271, + "learning_rate": 4.476310164971584e-05, + "loss": 5.6226, + "step": 35279 + }, + { + "epoch": 0.20982015415358266, + "grad_norm": 1.5262272357940674, + "learning_rate": 4.476281558065336e-05, + "loss": 5.2816, + "step": 35280 + }, + { + "epoch": 0.20982610143686364, + "grad_norm": 1.8157652616500854, + "learning_rate": 4.47625295046919e-05, + "loss": 5.198, + "step": 35281 + }, + { + "epoch": 0.20983204872014463, + "grad_norm": 1.550521969795227, + "learning_rate": 4.4762243421831536e-05, + "loss": 4.96, + "step": 35282 + }, + { + "epoch": 0.20983799600342565, + "grad_norm": 1.6990987062454224, + "learning_rate": 4.476195733207238e-05, + "loss": 4.6078, + "step": 35283 + }, + { + "epoch": 0.20984394328670664, + "grad_norm": 1.61250901222229, + "learning_rate": 4.476167123541453e-05, + "loss": 4.6525, + "step": 35284 + }, + { + "epoch": 0.20984989056998762, + "grad_norm": 1.630925178527832, + "learning_rate": 4.476138513185808e-05, + "loss": 5.1194, + "step": 35285 + }, + { + "epoch": 0.20985583785326864, + "grad_norm": 1.3319804668426514, + "learning_rate": 4.476109902140313e-05, + "loss": 5.4476, + "step": 35286 + }, + { + "epoch": 0.20986178513654963, + "grad_norm": 1.4637563228607178, + "learning_rate": 4.47608129040498e-05, + "loss": 5.4099, + "step": 35287 + }, + { + "epoch": 0.20986773241983062, + "grad_norm": 1.3934221267700195, + "learning_rate": 4.476052677979816e-05, + "loss": 5.3829, + "step": 35288 + }, + { + "epoch": 0.20987367970311163, + "grad_norm": 1.43252432346344, + "learning_rate": 4.476024064864833e-05, + "loss": 5.3713, + "step": 35289 + }, + { + "epoch": 0.20987962698639262, + "grad_norm": 1.6679284572601318, + "learning_rate": 4.4759954510600404e-05, + "loss": 5.382, + "step": 35290 + }, + { + "epoch": 0.2098855742696736, + "grad_norm": 1.7102776765823364, + "learning_rate": 4.4759668365654484e-05, + "loss": 5.5006, + "step": 35291 + }, + { + "epoch": 0.20989152155295462, + "grad_norm": 1.8174772262573242, + "learning_rate": 4.4759382213810667e-05, + "loss": 5.2365, + "step": 35292 + }, + { + "epoch": 0.2098974688362356, + "grad_norm": 1.3059958219528198, + "learning_rate": 4.475909605506905e-05, + "loss": 5.5279, + "step": 35293 + }, + { + "epoch": 0.2099034161195166, + "grad_norm": 1.3822412490844727, + "learning_rate": 4.4758809889429745e-05, + "loss": 4.8987, + "step": 35294 + }, + { + "epoch": 0.2099093634027976, + "grad_norm": 1.7537777423858643, + "learning_rate": 4.475852371689284e-05, + "loss": 5.4948, + "step": 35295 + }, + { + "epoch": 0.2099153106860786, + "grad_norm": 1.6818406581878662, + "learning_rate": 4.475823753745843e-05, + "loss": 5.2707, + "step": 35296 + }, + { + "epoch": 0.2099212579693596, + "grad_norm": 1.5473475456237793, + "learning_rate": 4.475795135112663e-05, + "loss": 5.246, + "step": 35297 + }, + { + "epoch": 0.20992720525264058, + "grad_norm": 1.9564220905303955, + "learning_rate": 4.475766515789753e-05, + "loss": 5.0882, + "step": 35298 + }, + { + "epoch": 0.2099331525359216, + "grad_norm": 1.9098862409591675, + "learning_rate": 4.475737895777124e-05, + "loss": 4.8618, + "step": 35299 + }, + { + "epoch": 0.20993909981920258, + "grad_norm": 1.313395380973816, + "learning_rate": 4.4757092750747856e-05, + "loss": 5.102, + "step": 35300 + }, + { + "epoch": 0.20994504710248357, + "grad_norm": 1.600813865661621, + "learning_rate": 4.475680653682747e-05, + "loss": 4.5547, + "step": 35301 + }, + { + "epoch": 0.2099509943857646, + "grad_norm": 1.5113312005996704, + "learning_rate": 4.4756520316010183e-05, + "loss": 5.0377, + "step": 35302 + }, + { + "epoch": 0.20995694166904558, + "grad_norm": 1.413179874420166, + "learning_rate": 4.47562340882961e-05, + "loss": 4.1832, + "step": 35303 + }, + { + "epoch": 0.20996288895232657, + "grad_norm": 1.613791584968567, + "learning_rate": 4.475594785368532e-05, + "loss": 4.2055, + "step": 35304 + }, + { + "epoch": 0.20996883623560758, + "grad_norm": 1.739326000213623, + "learning_rate": 4.475566161217795e-05, + "loss": 4.7967, + "step": 35305 + }, + { + "epoch": 0.20997478351888857, + "grad_norm": 1.3964680433273315, + "learning_rate": 4.4755375363774074e-05, + "loss": 4.814, + "step": 35306 + }, + { + "epoch": 0.20998073080216956, + "grad_norm": 1.4224399328231812, + "learning_rate": 4.47550891084738e-05, + "loss": 4.8103, + "step": 35307 + }, + { + "epoch": 0.20998667808545057, + "grad_norm": 1.8177698850631714, + "learning_rate": 4.475480284627723e-05, + "loss": 4.7559, + "step": 35308 + }, + { + "epoch": 0.20999262536873156, + "grad_norm": 2.1109840869903564, + "learning_rate": 4.475451657718447e-05, + "loss": 4.196, + "step": 35309 + }, + { + "epoch": 0.20999857265201255, + "grad_norm": 2.44920015335083, + "learning_rate": 4.47542303011956e-05, + "loss": 4.4954, + "step": 35310 + }, + { + "epoch": 0.21000451993529357, + "grad_norm": 1.8589818477630615, + "learning_rate": 4.4753944018310744e-05, + "loss": 4.7965, + "step": 35311 + }, + { + "epoch": 0.21001046721857455, + "grad_norm": 1.8963855504989624, + "learning_rate": 4.475365772852998e-05, + "loss": 5.0571, + "step": 35312 + }, + { + "epoch": 0.21001641450185554, + "grad_norm": 1.857127070426941, + "learning_rate": 4.475337143185343e-05, + "loss": 5.4925, + "step": 35313 + }, + { + "epoch": 0.21002236178513656, + "grad_norm": 1.6797605752944946, + "learning_rate": 4.475308512828117e-05, + "loss": 5.0388, + "step": 35314 + }, + { + "epoch": 0.21002830906841755, + "grad_norm": 1.6497384309768677, + "learning_rate": 4.4752798817813317e-05, + "loss": 5.1263, + "step": 35315 + }, + { + "epoch": 0.21003425635169853, + "grad_norm": 1.8872557878494263, + "learning_rate": 4.475251250044997e-05, + "loss": 4.5841, + "step": 35316 + }, + { + "epoch": 0.21004020363497955, + "grad_norm": 1.843013048171997, + "learning_rate": 4.475222617619122e-05, + "loss": 4.9794, + "step": 35317 + }, + { + "epoch": 0.21004615091826054, + "grad_norm": 1.5779093503952026, + "learning_rate": 4.475193984503717e-05, + "loss": 5.1123, + "step": 35318 + }, + { + "epoch": 0.21005209820154153, + "grad_norm": 1.647068738937378, + "learning_rate": 4.475165350698793e-05, + "loss": 5.0647, + "step": 35319 + }, + { + "epoch": 0.21005804548482254, + "grad_norm": 1.9030619859695435, + "learning_rate": 4.475136716204359e-05, + "loss": 4.638, + "step": 35320 + }, + { + "epoch": 0.21006399276810353, + "grad_norm": 1.9144114255905151, + "learning_rate": 4.4751080810204244e-05, + "loss": 4.8637, + "step": 35321 + }, + { + "epoch": 0.21006994005138452, + "grad_norm": 2.0055301189422607, + "learning_rate": 4.475079445147e-05, + "loss": 4.7652, + "step": 35322 + }, + { + "epoch": 0.21007588733466553, + "grad_norm": 2.0406198501586914, + "learning_rate": 4.475050808584097e-05, + "loss": 4.5856, + "step": 35323 + }, + { + "epoch": 0.21008183461794652, + "grad_norm": 1.781522274017334, + "learning_rate": 4.475022171331723e-05, + "loss": 5.1992, + "step": 35324 + }, + { + "epoch": 0.2100877819012275, + "grad_norm": 1.5984790325164795, + "learning_rate": 4.4749935333898896e-05, + "loss": 5.1308, + "step": 35325 + }, + { + "epoch": 0.21009372918450853, + "grad_norm": 1.5800871849060059, + "learning_rate": 4.4749648947586065e-05, + "loss": 5.1936, + "step": 35326 + }, + { + "epoch": 0.21009967646778951, + "grad_norm": 1.6657015085220337, + "learning_rate": 4.4749362554378834e-05, + "loss": 5.1147, + "step": 35327 + }, + { + "epoch": 0.2101056237510705, + "grad_norm": 1.6115208864212036, + "learning_rate": 4.47490761542773e-05, + "loss": 5.0284, + "step": 35328 + }, + { + "epoch": 0.21011157103435152, + "grad_norm": 1.7555902004241943, + "learning_rate": 4.4748789747281574e-05, + "loss": 5.0457, + "step": 35329 + }, + { + "epoch": 0.2101175183176325, + "grad_norm": 1.597569227218628, + "learning_rate": 4.474850333339174e-05, + "loss": 4.39, + "step": 35330 + }, + { + "epoch": 0.2101234656009135, + "grad_norm": 1.4470419883728027, + "learning_rate": 4.4748216912607924e-05, + "loss": 4.7876, + "step": 35331 + }, + { + "epoch": 0.2101294128841945, + "grad_norm": 1.2878272533416748, + "learning_rate": 4.47479304849302e-05, + "loss": 5.1986, + "step": 35332 + }, + { + "epoch": 0.2101353601674755, + "grad_norm": 1.3939337730407715, + "learning_rate": 4.4747644050358676e-05, + "loss": 4.9916, + "step": 35333 + }, + { + "epoch": 0.2101413074507565, + "grad_norm": 1.404179573059082, + "learning_rate": 4.4747357608893456e-05, + "loss": 4.8303, + "step": 35334 + }, + { + "epoch": 0.2101472547340375, + "grad_norm": 1.5711162090301514, + "learning_rate": 4.4747071160534635e-05, + "loss": 4.7675, + "step": 35335 + }, + { + "epoch": 0.2101532020173185, + "grad_norm": 1.5045690536499023, + "learning_rate": 4.474678470528232e-05, + "loss": 4.7171, + "step": 35336 + }, + { + "epoch": 0.21015914930059948, + "grad_norm": 1.6560767889022827, + "learning_rate": 4.47464982431366e-05, + "loss": 4.7026, + "step": 35337 + }, + { + "epoch": 0.2101650965838805, + "grad_norm": 1.5439120531082153, + "learning_rate": 4.474621177409759e-05, + "loss": 4.4902, + "step": 35338 + }, + { + "epoch": 0.21017104386716148, + "grad_norm": 1.718030571937561, + "learning_rate": 4.474592529816538e-05, + "loss": 4.6819, + "step": 35339 + }, + { + "epoch": 0.21017699115044247, + "grad_norm": 1.6997952461242676, + "learning_rate": 4.474563881534006e-05, + "loss": 4.5594, + "step": 35340 + }, + { + "epoch": 0.2101829384337235, + "grad_norm": 1.8977982997894287, + "learning_rate": 4.474535232562176e-05, + "loss": 5.0617, + "step": 35341 + }, + { + "epoch": 0.21018888571700448, + "grad_norm": 1.6481338739395142, + "learning_rate": 4.474506582901054e-05, + "loss": 4.8686, + "step": 35342 + }, + { + "epoch": 0.21019483300028546, + "grad_norm": 1.7469749450683594, + "learning_rate": 4.474477932550654e-05, + "loss": 5.0062, + "step": 35343 + }, + { + "epoch": 0.21020078028356648, + "grad_norm": 1.6420084238052368, + "learning_rate": 4.474449281510984e-05, + "loss": 5.0616, + "step": 35344 + }, + { + "epoch": 0.21020672756684747, + "grad_norm": 1.5826870203018188, + "learning_rate": 4.474420629782053e-05, + "loss": 4.7822, + "step": 35345 + }, + { + "epoch": 0.21021267485012846, + "grad_norm": 1.6704856157302856, + "learning_rate": 4.4743919773638724e-05, + "loss": 4.6443, + "step": 35346 + }, + { + "epoch": 0.21021862213340947, + "grad_norm": 1.573956847190857, + "learning_rate": 4.474363324256453e-05, + "loss": 4.5345, + "step": 35347 + }, + { + "epoch": 0.21022456941669046, + "grad_norm": 1.736695408821106, + "learning_rate": 4.474334670459802e-05, + "loss": 4.7278, + "step": 35348 + }, + { + "epoch": 0.21023051669997145, + "grad_norm": 1.8054871559143066, + "learning_rate": 4.474306015973933e-05, + "loss": 4.8814, + "step": 35349 + }, + { + "epoch": 0.21023646398325246, + "grad_norm": 1.9570103883743286, + "learning_rate": 4.474277360798853e-05, + "loss": 4.4891, + "step": 35350 + }, + { + "epoch": 0.21024241126653345, + "grad_norm": 1.7210701704025269, + "learning_rate": 4.474248704934574e-05, + "loss": 5.5031, + "step": 35351 + }, + { + "epoch": 0.21024835854981444, + "grad_norm": 1.6097183227539062, + "learning_rate": 4.474220048381105e-05, + "loss": 5.2063, + "step": 35352 + }, + { + "epoch": 0.21025430583309543, + "grad_norm": 1.2283453941345215, + "learning_rate": 4.4741913911384556e-05, + "loss": 5.4689, + "step": 35353 + }, + { + "epoch": 0.21026025311637644, + "grad_norm": 1.5359746217727661, + "learning_rate": 4.4741627332066364e-05, + "loss": 5.0208, + "step": 35354 + }, + { + "epoch": 0.21026620039965743, + "grad_norm": 1.7506155967712402, + "learning_rate": 4.474134074585658e-05, + "loss": 4.8214, + "step": 35355 + }, + { + "epoch": 0.21027214768293842, + "grad_norm": 1.6390902996063232, + "learning_rate": 4.474105415275529e-05, + "loss": 4.8685, + "step": 35356 + }, + { + "epoch": 0.21027809496621944, + "grad_norm": 1.7952314615249634, + "learning_rate": 4.474076755276261e-05, + "loss": 4.671, + "step": 35357 + }, + { + "epoch": 0.21028404224950042, + "grad_norm": 1.6228652000427246, + "learning_rate": 4.4740480945878624e-05, + "loss": 5.0547, + "step": 35358 + }, + { + "epoch": 0.2102899895327814, + "grad_norm": 1.460041880607605, + "learning_rate": 4.4740194332103444e-05, + "loss": 5.0881, + "step": 35359 + }, + { + "epoch": 0.21029593681606243, + "grad_norm": 1.5461219549179077, + "learning_rate": 4.4739907711437176e-05, + "loss": 4.7493, + "step": 35360 + }, + { + "epoch": 0.21030188409934342, + "grad_norm": 1.6176092624664307, + "learning_rate": 4.4739621083879896e-05, + "loss": 4.797, + "step": 35361 + }, + { + "epoch": 0.2103078313826244, + "grad_norm": 1.599272608757019, + "learning_rate": 4.4739334449431725e-05, + "loss": 4.8203, + "step": 35362 + }, + { + "epoch": 0.21031377866590542, + "grad_norm": 1.6934388875961304, + "learning_rate": 4.473904780809276e-05, + "loss": 4.8852, + "step": 35363 + }, + { + "epoch": 0.2103197259491864, + "grad_norm": 1.7300370931625366, + "learning_rate": 4.473876115986308e-05, + "loss": 4.8187, + "step": 35364 + }, + { + "epoch": 0.2103256732324674, + "grad_norm": 1.8484524488449097, + "learning_rate": 4.473847450474282e-05, + "loss": 4.8067, + "step": 35365 + }, + { + "epoch": 0.2103316205157484, + "grad_norm": 1.6827832460403442, + "learning_rate": 4.473818784273206e-05, + "loss": 4.8757, + "step": 35366 + }, + { + "epoch": 0.2103375677990294, + "grad_norm": 2.2498104572296143, + "learning_rate": 4.47379011738309e-05, + "loss": 3.5606, + "step": 35367 + }, + { + "epoch": 0.2103435150823104, + "grad_norm": 2.074948787689209, + "learning_rate": 4.473761449803944e-05, + "loss": 4.1696, + "step": 35368 + }, + { + "epoch": 0.2103494623655914, + "grad_norm": 1.5519061088562012, + "learning_rate": 4.473732781535778e-05, + "loss": 4.9684, + "step": 35369 + }, + { + "epoch": 0.2103554096488724, + "grad_norm": 1.718672752380371, + "learning_rate": 4.473704112578603e-05, + "loss": 5.1141, + "step": 35370 + }, + { + "epoch": 0.21036135693215338, + "grad_norm": 2.0321526527404785, + "learning_rate": 4.4736754429324276e-05, + "loss": 5.4429, + "step": 35371 + }, + { + "epoch": 0.2103673042154344, + "grad_norm": 1.5147876739501953, + "learning_rate": 4.473646772597263e-05, + "loss": 5.0556, + "step": 35372 + }, + { + "epoch": 0.21037325149871539, + "grad_norm": 1.7505944967269897, + "learning_rate": 4.4736181015731186e-05, + "loss": 4.3422, + "step": 35373 + }, + { + "epoch": 0.21037919878199637, + "grad_norm": 1.6300780773162842, + "learning_rate": 4.473589429860004e-05, + "loss": 4.8597, + "step": 35374 + }, + { + "epoch": 0.2103851460652774, + "grad_norm": 1.5492527484893799, + "learning_rate": 4.473560757457931e-05, + "loss": 4.9378, + "step": 35375 + }, + { + "epoch": 0.21039109334855838, + "grad_norm": 1.7544493675231934, + "learning_rate": 4.4735320843669074e-05, + "loss": 4.8018, + "step": 35376 + }, + { + "epoch": 0.21039704063183937, + "grad_norm": 1.588906168937683, + "learning_rate": 4.4735034105869446e-05, + "loss": 4.9619, + "step": 35377 + }, + { + "epoch": 0.21040298791512038, + "grad_norm": 1.677171230316162, + "learning_rate": 4.473474736118052e-05, + "loss": 4.8541, + "step": 35378 + }, + { + "epoch": 0.21040893519840137, + "grad_norm": 1.5742454528808594, + "learning_rate": 4.473446060960239e-05, + "loss": 4.8147, + "step": 35379 + }, + { + "epoch": 0.21041488248168236, + "grad_norm": 1.5566039085388184, + "learning_rate": 4.473417385113518e-05, + "loss": 4.7967, + "step": 35380 + }, + { + "epoch": 0.21042082976496337, + "grad_norm": 1.377108097076416, + "learning_rate": 4.473388708577896e-05, + "loss": 4.7101, + "step": 35381 + }, + { + "epoch": 0.21042677704824436, + "grad_norm": 1.8263981342315674, + "learning_rate": 4.473360031353384e-05, + "loss": 4.6103, + "step": 35382 + }, + { + "epoch": 0.21043272433152535, + "grad_norm": 1.547569990158081, + "learning_rate": 4.4733313534399934e-05, + "loss": 4.7909, + "step": 35383 + }, + { + "epoch": 0.21043867161480637, + "grad_norm": 1.49032461643219, + "learning_rate": 4.473302674837733e-05, + "loss": 4.7885, + "step": 35384 + }, + { + "epoch": 0.21044461889808735, + "grad_norm": 1.6592745780944824, + "learning_rate": 4.473273995546613e-05, + "loss": 4.8221, + "step": 35385 + }, + { + "epoch": 0.21045056618136834, + "grad_norm": 1.3907108306884766, + "learning_rate": 4.473245315566644e-05, + "loss": 4.8192, + "step": 35386 + }, + { + "epoch": 0.21045651346464936, + "grad_norm": 1.4064911603927612, + "learning_rate": 4.4732166348978345e-05, + "loss": 4.6388, + "step": 35387 + }, + { + "epoch": 0.21046246074793035, + "grad_norm": 1.567564845085144, + "learning_rate": 4.473187953540196e-05, + "loss": 4.7417, + "step": 35388 + }, + { + "epoch": 0.21046840803121133, + "grad_norm": 1.6142017841339111, + "learning_rate": 4.4731592714937375e-05, + "loss": 4.8069, + "step": 35389 + }, + { + "epoch": 0.21047435531449235, + "grad_norm": 1.7662934064865112, + "learning_rate": 4.4731305887584694e-05, + "loss": 4.7536, + "step": 35390 + }, + { + "epoch": 0.21048030259777334, + "grad_norm": 1.4565373659133911, + "learning_rate": 4.473101905334403e-05, + "loss": 4.6508, + "step": 35391 + }, + { + "epoch": 0.21048624988105433, + "grad_norm": 1.4265527725219727, + "learning_rate": 4.473073221221546e-05, + "loss": 4.7285, + "step": 35392 + }, + { + "epoch": 0.21049219716433534, + "grad_norm": 1.4948612451553345, + "learning_rate": 4.47304453641991e-05, + "loss": 4.6359, + "step": 35393 + }, + { + "epoch": 0.21049814444761633, + "grad_norm": 1.4874624013900757, + "learning_rate": 4.473015850929504e-05, + "loss": 4.6045, + "step": 35394 + }, + { + "epoch": 0.21050409173089732, + "grad_norm": 1.5556808710098267, + "learning_rate": 4.4729871647503394e-05, + "loss": 4.6039, + "step": 35395 + }, + { + "epoch": 0.21051003901417834, + "grad_norm": 1.529366374015808, + "learning_rate": 4.4729584778824246e-05, + "loss": 4.6944, + "step": 35396 + }, + { + "epoch": 0.21051598629745932, + "grad_norm": 1.514224648475647, + "learning_rate": 4.4729297903257704e-05, + "loss": 4.6967, + "step": 35397 + }, + { + "epoch": 0.2105219335807403, + "grad_norm": 1.601287841796875, + "learning_rate": 4.4729011020803876e-05, + "loss": 4.8392, + "step": 35398 + }, + { + "epoch": 0.21052788086402133, + "grad_norm": 1.6562103033065796, + "learning_rate": 4.4728724131462844e-05, + "loss": 4.5464, + "step": 35399 + }, + { + "epoch": 0.21053382814730232, + "grad_norm": 1.7306544780731201, + "learning_rate": 4.472843723523472e-05, + "loss": 4.7439, + "step": 35400 + }, + { + "epoch": 0.2105397754305833, + "grad_norm": 2.2626922130584717, + "learning_rate": 4.472815033211961e-05, + "loss": 3.9945, + "step": 35401 + }, + { + "epoch": 0.21054572271386432, + "grad_norm": 1.5204706192016602, + "learning_rate": 4.4727863422117597e-05, + "loss": 5.2211, + "step": 35402 + }, + { + "epoch": 0.2105516699971453, + "grad_norm": 2.4926559925079346, + "learning_rate": 4.4727576505228796e-05, + "loss": 4.4308, + "step": 35403 + }, + { + "epoch": 0.2105576172804263, + "grad_norm": 2.4240057468414307, + "learning_rate": 4.47272895814533e-05, + "loss": 4.5483, + "step": 35404 + }, + { + "epoch": 0.2105635645637073, + "grad_norm": 2.342590570449829, + "learning_rate": 4.4727002650791215e-05, + "loss": 4.4737, + "step": 35405 + }, + { + "epoch": 0.2105695118469883, + "grad_norm": 2.181455373764038, + "learning_rate": 4.472671571324264e-05, + "loss": 4.3658, + "step": 35406 + }, + { + "epoch": 0.2105754591302693, + "grad_norm": 1.8659454584121704, + "learning_rate": 4.472642876880766e-05, + "loss": 4.3592, + "step": 35407 + }, + { + "epoch": 0.2105814064135503, + "grad_norm": 2.0775270462036133, + "learning_rate": 4.47261418174864e-05, + "loss": 4.5404, + "step": 35408 + }, + { + "epoch": 0.2105873536968313, + "grad_norm": 2.4261813163757324, + "learning_rate": 4.4725854859278935e-05, + "loss": 4.5228, + "step": 35409 + }, + { + "epoch": 0.21059330098011228, + "grad_norm": 1.8874660730361938, + "learning_rate": 4.472556789418539e-05, + "loss": 4.68, + "step": 35410 + }, + { + "epoch": 0.21059924826339327, + "grad_norm": 1.7671655416488647, + "learning_rate": 4.472528092220585e-05, + "loss": 4.9307, + "step": 35411 + }, + { + "epoch": 0.21060519554667428, + "grad_norm": 2.424664258956909, + "learning_rate": 4.472499394334041e-05, + "loss": 3.6838, + "step": 35412 + }, + { + "epoch": 0.21061114282995527, + "grad_norm": 2.5734341144561768, + "learning_rate": 4.472470695758919e-05, + "loss": 3.6457, + "step": 35413 + }, + { + "epoch": 0.21061709011323626, + "grad_norm": 2.7775492668151855, + "learning_rate": 4.4724419964952267e-05, + "loss": 3.6932, + "step": 35414 + }, + { + "epoch": 0.21062303739651728, + "grad_norm": 2.4683339595794678, + "learning_rate": 4.4724132965429764e-05, + "loss": 3.3679, + "step": 35415 + }, + { + "epoch": 0.21062898467979826, + "grad_norm": 2.478834629058838, + "learning_rate": 4.472384595902176e-05, + "loss": 3.5933, + "step": 35416 + }, + { + "epoch": 0.21063493196307925, + "grad_norm": 2.095451831817627, + "learning_rate": 4.472355894572837e-05, + "loss": 4.1527, + "step": 35417 + }, + { + "epoch": 0.21064087924636027, + "grad_norm": 1.720664381980896, + "learning_rate": 4.472327192554969e-05, + "loss": 5.2839, + "step": 35418 + }, + { + "epoch": 0.21064682652964126, + "grad_norm": 1.7128255367279053, + "learning_rate": 4.472298489848582e-05, + "loss": 5.2786, + "step": 35419 + }, + { + "epoch": 0.21065277381292224, + "grad_norm": 2.405431032180786, + "learning_rate": 4.472269786453686e-05, + "loss": 4.0237, + "step": 35420 + }, + { + "epoch": 0.21065872109620326, + "grad_norm": 2.2845146656036377, + "learning_rate": 4.472241082370291e-05, + "loss": 4.6355, + "step": 35421 + }, + { + "epoch": 0.21066466837948425, + "grad_norm": 1.5968048572540283, + "learning_rate": 4.4722123775984074e-05, + "loss": 5.0121, + "step": 35422 + }, + { + "epoch": 0.21067061566276524, + "grad_norm": 1.874489188194275, + "learning_rate": 4.472183672138044e-05, + "loss": 4.9313, + "step": 35423 + }, + { + "epoch": 0.21067656294604625, + "grad_norm": 1.8483302593231201, + "learning_rate": 4.472154965989211e-05, + "loss": 4.9481, + "step": 35424 + }, + { + "epoch": 0.21068251022932724, + "grad_norm": 1.7580935955047607, + "learning_rate": 4.47212625915192e-05, + "loss": 5.0041, + "step": 35425 + }, + { + "epoch": 0.21068845751260823, + "grad_norm": 2.090477705001831, + "learning_rate": 4.472097551626181e-05, + "loss": 4.5731, + "step": 35426 + }, + { + "epoch": 0.21069440479588925, + "grad_norm": 1.7280174493789673, + "learning_rate": 4.472068843412002e-05, + "loss": 4.7637, + "step": 35427 + }, + { + "epoch": 0.21070035207917023, + "grad_norm": 1.6075327396392822, + "learning_rate": 4.4720401345093944e-05, + "loss": 5.6445, + "step": 35428 + }, + { + "epoch": 0.21070629936245122, + "grad_norm": 1.6671342849731445, + "learning_rate": 4.472011424918367e-05, + "loss": 5.1241, + "step": 35429 + }, + { + "epoch": 0.21071224664573224, + "grad_norm": 1.8082268238067627, + "learning_rate": 4.471982714638933e-05, + "loss": 4.5997, + "step": 35430 + }, + { + "epoch": 0.21071819392901323, + "grad_norm": 1.6478972434997559, + "learning_rate": 4.4719540036710984e-05, + "loss": 4.8913, + "step": 35431 + }, + { + "epoch": 0.2107241412122942, + "grad_norm": 1.7590205669403076, + "learning_rate": 4.4719252920148756e-05, + "loss": 5.0692, + "step": 35432 + }, + { + "epoch": 0.21073008849557523, + "grad_norm": 1.618296504020691, + "learning_rate": 4.471896579670274e-05, + "loss": 5.036, + "step": 35433 + }, + { + "epoch": 0.21073603577885622, + "grad_norm": 1.7069618701934814, + "learning_rate": 4.471867866637304e-05, + "loss": 4.7965, + "step": 35434 + }, + { + "epoch": 0.2107419830621372, + "grad_norm": 1.7504613399505615, + "learning_rate": 4.471839152915975e-05, + "loss": 4.7929, + "step": 35435 + }, + { + "epoch": 0.21074793034541822, + "grad_norm": 2.9193410873413086, + "learning_rate": 4.471810438506297e-05, + "loss": 3.469, + "step": 35436 + }, + { + "epoch": 0.2107538776286992, + "grad_norm": 3.223361015319824, + "learning_rate": 4.471781723408281e-05, + "loss": 3.3836, + "step": 35437 + }, + { + "epoch": 0.2107598249119802, + "grad_norm": 2.4114229679107666, + "learning_rate": 4.471753007621936e-05, + "loss": 3.9834, + "step": 35438 + }, + { + "epoch": 0.21076577219526121, + "grad_norm": 1.8739566802978516, + "learning_rate": 4.471724291147272e-05, + "loss": 4.1878, + "step": 35439 + }, + { + "epoch": 0.2107717194785422, + "grad_norm": 1.6680757999420166, + "learning_rate": 4.4716955739843004e-05, + "loss": 4.4736, + "step": 35440 + }, + { + "epoch": 0.2107776667618232, + "grad_norm": 1.5248615741729736, + "learning_rate": 4.47166685613303e-05, + "loss": 5.1447, + "step": 35441 + }, + { + "epoch": 0.2107836140451042, + "grad_norm": 1.8997430801391602, + "learning_rate": 4.47163813759347e-05, + "loss": 5.1902, + "step": 35442 + }, + { + "epoch": 0.2107895613283852, + "grad_norm": 1.822595477104187, + "learning_rate": 4.471609418365632e-05, + "loss": 5.119, + "step": 35443 + }, + { + "epoch": 0.21079550861166618, + "grad_norm": 1.5157179832458496, + "learning_rate": 4.471580698449526e-05, + "loss": 5.9801, + "step": 35444 + }, + { + "epoch": 0.2108014558949472, + "grad_norm": 1.7875192165374756, + "learning_rate": 4.471551977845162e-05, + "loss": 5.1399, + "step": 35445 + }, + { + "epoch": 0.2108074031782282, + "grad_norm": 1.8765265941619873, + "learning_rate": 4.471523256552549e-05, + "loss": 5.2492, + "step": 35446 + }, + { + "epoch": 0.21081335046150917, + "grad_norm": 1.514116644859314, + "learning_rate": 4.4714945345716976e-05, + "loss": 5.1928, + "step": 35447 + }, + { + "epoch": 0.2108192977447902, + "grad_norm": 1.7573895454406738, + "learning_rate": 4.471465811902617e-05, + "loss": 5.1439, + "step": 35448 + }, + { + "epoch": 0.21082524502807118, + "grad_norm": 1.641224980354309, + "learning_rate": 4.471437088545319e-05, + "loss": 4.8997, + "step": 35449 + }, + { + "epoch": 0.21083119231135217, + "grad_norm": 4.0163116455078125, + "learning_rate": 4.4714083644998126e-05, + "loss": 3.5777, + "step": 35450 + }, + { + "epoch": 0.21083713959463318, + "grad_norm": 3.2716312408447266, + "learning_rate": 4.471379639766108e-05, + "loss": 3.2344, + "step": 35451 + }, + { + "epoch": 0.21084308687791417, + "grad_norm": 2.3019282817840576, + "learning_rate": 4.4713509143442146e-05, + "loss": 4.5853, + "step": 35452 + }, + { + "epoch": 0.21084903416119516, + "grad_norm": 1.856331467628479, + "learning_rate": 4.4713221882341436e-05, + "loss": 4.6279, + "step": 35453 + }, + { + "epoch": 0.21085498144447617, + "grad_norm": 2.480015277862549, + "learning_rate": 4.471293461435904e-05, + "loss": 3.1665, + "step": 35454 + }, + { + "epoch": 0.21086092872775716, + "grad_norm": 2.5631415843963623, + "learning_rate": 4.471264733949506e-05, + "loss": 3.739, + "step": 35455 + }, + { + "epoch": 0.21086687601103815, + "grad_norm": 2.8379833698272705, + "learning_rate": 4.47123600577496e-05, + "loss": 2.5041, + "step": 35456 + }, + { + "epoch": 0.21087282329431917, + "grad_norm": 2.9057741165161133, + "learning_rate": 4.471207276912276e-05, + "loss": 4.1469, + "step": 35457 + }, + { + "epoch": 0.21087877057760016, + "grad_norm": 1.9106336832046509, + "learning_rate": 4.4711785473614644e-05, + "loss": 5.3251, + "step": 35458 + }, + { + "epoch": 0.21088471786088114, + "grad_norm": 1.7996464967727661, + "learning_rate": 4.471149817122534e-05, + "loss": 5.5621, + "step": 35459 + }, + { + "epoch": 0.21089066514416216, + "grad_norm": 2.341482162475586, + "learning_rate": 4.471121086195496e-05, + "loss": 4.8165, + "step": 35460 + }, + { + "epoch": 0.21089661242744315, + "grad_norm": 2.984218120574951, + "learning_rate": 4.47109235458036e-05, + "loss": 2.3554, + "step": 35461 + }, + { + "epoch": 0.21090255971072414, + "grad_norm": 3.3478970527648926, + "learning_rate": 4.471063622277135e-05, + "loss": 2.5034, + "step": 35462 + }, + { + "epoch": 0.21090850699400515, + "grad_norm": 2.904313325881958, + "learning_rate": 4.4710348892858333e-05, + "loss": 3.3472, + "step": 35463 + }, + { + "epoch": 0.21091445427728614, + "grad_norm": 1.8072670698165894, + "learning_rate": 4.471006155606463e-05, + "loss": 4.8444, + "step": 35464 + }, + { + "epoch": 0.21092040156056713, + "grad_norm": 3.2260665893554688, + "learning_rate": 4.470977421239035e-05, + "loss": 3.1718, + "step": 35465 + }, + { + "epoch": 0.21092634884384814, + "grad_norm": 1.743060827255249, + "learning_rate": 4.470948686183559e-05, + "loss": 5.0387, + "step": 35466 + }, + { + "epoch": 0.21093229612712913, + "grad_norm": 1.634989857673645, + "learning_rate": 4.4709199504400456e-05, + "loss": 4.993, + "step": 35467 + }, + { + "epoch": 0.21093824341041012, + "grad_norm": 1.6594475507736206, + "learning_rate": 4.470891214008505e-05, + "loss": 4.5849, + "step": 35468 + }, + { + "epoch": 0.2109441906936911, + "grad_norm": 1.6074466705322266, + "learning_rate": 4.470862476888945e-05, + "loss": 4.996, + "step": 35469 + }, + { + "epoch": 0.21095013797697212, + "grad_norm": 1.9464056491851807, + "learning_rate": 4.470833739081378e-05, + "loss": 4.5604, + "step": 35470 + }, + { + "epoch": 0.2109560852602531, + "grad_norm": 1.9392175674438477, + "learning_rate": 4.470805000585814e-05, + "loss": 4.8605, + "step": 35471 + }, + { + "epoch": 0.2109620325435341, + "grad_norm": 1.7574516534805298, + "learning_rate": 4.470776261402262e-05, + "loss": 4.8513, + "step": 35472 + }, + { + "epoch": 0.21096797982681512, + "grad_norm": 1.8646680116653442, + "learning_rate": 4.4707475215307315e-05, + "loss": 4.8125, + "step": 35473 + }, + { + "epoch": 0.2109739271100961, + "grad_norm": 1.7200084924697876, + "learning_rate": 4.4707187809712346e-05, + "loss": 4.7822, + "step": 35474 + }, + { + "epoch": 0.2109798743933771, + "grad_norm": 1.8245753049850464, + "learning_rate": 4.4706900397237795e-05, + "loss": 4.4128, + "step": 35475 + }, + { + "epoch": 0.2109858216766581, + "grad_norm": 1.5685904026031494, + "learning_rate": 4.4706612977883765e-05, + "loss": 5.1459, + "step": 35476 + }, + { + "epoch": 0.2109917689599391, + "grad_norm": 1.5470824241638184, + "learning_rate": 4.4706325551650364e-05, + "loss": 4.6915, + "step": 35477 + }, + { + "epoch": 0.21099771624322008, + "grad_norm": 1.3199049234390259, + "learning_rate": 4.470603811853769e-05, + "loss": 4.6157, + "step": 35478 + }, + { + "epoch": 0.2110036635265011, + "grad_norm": 1.665404200553894, + "learning_rate": 4.470575067854584e-05, + "loss": 4.8574, + "step": 35479 + }, + { + "epoch": 0.2110096108097821, + "grad_norm": 1.8050642013549805, + "learning_rate": 4.4705463231674915e-05, + "loss": 4.7309, + "step": 35480 + }, + { + "epoch": 0.21101555809306308, + "grad_norm": 1.8453636169433594, + "learning_rate": 4.4705175777925025e-05, + "loss": 4.6349, + "step": 35481 + }, + { + "epoch": 0.2110215053763441, + "grad_norm": 1.7850289344787598, + "learning_rate": 4.470488831729625e-05, + "loss": 4.6913, + "step": 35482 + }, + { + "epoch": 0.21102745265962508, + "grad_norm": 1.808980107307434, + "learning_rate": 4.4704600849788703e-05, + "loss": 4.6751, + "step": 35483 + }, + { + "epoch": 0.21103339994290607, + "grad_norm": 1.6603264808654785, + "learning_rate": 4.470431337540249e-05, + "loss": 4.5178, + "step": 35484 + }, + { + "epoch": 0.21103934722618709, + "grad_norm": 1.672696590423584, + "learning_rate": 4.47040258941377e-05, + "loss": 5.1297, + "step": 35485 + }, + { + "epoch": 0.21104529450946807, + "grad_norm": 1.8498941659927368, + "learning_rate": 4.4703738405994446e-05, + "loss": 4.4831, + "step": 35486 + }, + { + "epoch": 0.21105124179274906, + "grad_norm": 2.02712345123291, + "learning_rate": 4.470345091097281e-05, + "loss": 5.1251, + "step": 35487 + }, + { + "epoch": 0.21105718907603008, + "grad_norm": 1.5441256761550903, + "learning_rate": 4.470316340907291e-05, + "loss": 5.1557, + "step": 35488 + }, + { + "epoch": 0.21106313635931107, + "grad_norm": 1.5917513370513916, + "learning_rate": 4.470287590029483e-05, + "loss": 4.7467, + "step": 35489 + }, + { + "epoch": 0.21106908364259205, + "grad_norm": 1.693744421005249, + "learning_rate": 4.4702588384638686e-05, + "loss": 5.2251, + "step": 35490 + }, + { + "epoch": 0.21107503092587307, + "grad_norm": 1.5168530941009521, + "learning_rate": 4.470230086210457e-05, + "loss": 5.1613, + "step": 35491 + }, + { + "epoch": 0.21108097820915406, + "grad_norm": 1.5303258895874023, + "learning_rate": 4.4702013332692596e-05, + "loss": 5.0635, + "step": 35492 + }, + { + "epoch": 0.21108692549243505, + "grad_norm": 1.5826016664505005, + "learning_rate": 4.470172579640284e-05, + "loss": 4.6436, + "step": 35493 + }, + { + "epoch": 0.21109287277571606, + "grad_norm": 1.8279229402542114, + "learning_rate": 4.470143825323542e-05, + "loss": 4.772, + "step": 35494 + }, + { + "epoch": 0.21109882005899705, + "grad_norm": 1.7597702741622925, + "learning_rate": 4.4701150703190423e-05, + "loss": 5.0296, + "step": 35495 + }, + { + "epoch": 0.21110476734227804, + "grad_norm": 1.5488911867141724, + "learning_rate": 4.470086314626797e-05, + "loss": 4.9258, + "step": 35496 + }, + { + "epoch": 0.21111071462555905, + "grad_norm": 1.6315878629684448, + "learning_rate": 4.470057558246814e-05, + "loss": 4.5243, + "step": 35497 + }, + { + "epoch": 0.21111666190884004, + "grad_norm": 2.2954094409942627, + "learning_rate": 4.470028801179105e-05, + "loss": 4.2517, + "step": 35498 + }, + { + "epoch": 0.21112260919212103, + "grad_norm": 2.7295327186584473, + "learning_rate": 4.470000043423679e-05, + "loss": 4.2955, + "step": 35499 + }, + { + "epoch": 0.21112855647540205, + "grad_norm": 2.555826425552368, + "learning_rate": 4.469971284980546e-05, + "loss": 4.7908, + "step": 35500 + }, + { + "epoch": 0.21113450375868303, + "grad_norm": 1.5622174739837646, + "learning_rate": 4.4699425258497165e-05, + "loss": 5.0972, + "step": 35501 + }, + { + "epoch": 0.21114045104196402, + "grad_norm": 1.6253665685653687, + "learning_rate": 4.469913766031201e-05, + "loss": 4.5942, + "step": 35502 + }, + { + "epoch": 0.21114639832524504, + "grad_norm": 1.5531213283538818, + "learning_rate": 4.469885005525008e-05, + "loss": 4.9864, + "step": 35503 + }, + { + "epoch": 0.21115234560852603, + "grad_norm": 1.544110655784607, + "learning_rate": 4.4698562443311487e-05, + "loss": 4.7724, + "step": 35504 + }, + { + "epoch": 0.21115829289180701, + "grad_norm": 1.592146396636963, + "learning_rate": 4.4698274824496335e-05, + "loss": 4.9874, + "step": 35505 + }, + { + "epoch": 0.21116424017508803, + "grad_norm": 1.7738112211227417, + "learning_rate": 4.4697987198804713e-05, + "loss": 5.1586, + "step": 35506 + }, + { + "epoch": 0.21117018745836902, + "grad_norm": 1.7359950542449951, + "learning_rate": 4.4697699566236736e-05, + "loss": 4.6817, + "step": 35507 + }, + { + "epoch": 0.21117613474165, + "grad_norm": 1.5513485670089722, + "learning_rate": 4.469741192679249e-05, + "loss": 5.0546, + "step": 35508 + }, + { + "epoch": 0.21118208202493102, + "grad_norm": 1.6848827600479126, + "learning_rate": 4.469712428047208e-05, + "loss": 4.8273, + "step": 35509 + }, + { + "epoch": 0.211188029308212, + "grad_norm": 1.9015002250671387, + "learning_rate": 4.469683662727561e-05, + "loss": 4.5408, + "step": 35510 + }, + { + "epoch": 0.211193976591493, + "grad_norm": 1.6639163494110107, + "learning_rate": 4.469654896720317e-05, + "loss": 5.4344, + "step": 35511 + }, + { + "epoch": 0.21119992387477401, + "grad_norm": 1.6011817455291748, + "learning_rate": 4.469626130025488e-05, + "loss": 4.8481, + "step": 35512 + }, + { + "epoch": 0.211205871158055, + "grad_norm": 1.4600160121917725, + "learning_rate": 4.469597362643082e-05, + "loss": 4.9088, + "step": 35513 + }, + { + "epoch": 0.211211818441336, + "grad_norm": 1.4780898094177246, + "learning_rate": 4.46956859457311e-05, + "loss": 5.0465, + "step": 35514 + }, + { + "epoch": 0.211217765724617, + "grad_norm": 1.4310654401779175, + "learning_rate": 4.469539825815582e-05, + "loss": 4.8735, + "step": 35515 + }, + { + "epoch": 0.211223713007898, + "grad_norm": 1.7487471103668213, + "learning_rate": 4.469511056370508e-05, + "loss": 5.0721, + "step": 35516 + }, + { + "epoch": 0.21122966029117898, + "grad_norm": 1.745934009552002, + "learning_rate": 4.469482286237898e-05, + "loss": 5.0724, + "step": 35517 + }, + { + "epoch": 0.21123560757446, + "grad_norm": 1.542649745941162, + "learning_rate": 4.469453515417763e-05, + "loss": 5.294, + "step": 35518 + }, + { + "epoch": 0.211241554857741, + "grad_norm": 1.6778455972671509, + "learning_rate": 4.469424743910111e-05, + "loss": 4.7619, + "step": 35519 + }, + { + "epoch": 0.21124750214102198, + "grad_norm": 1.6462548971176147, + "learning_rate": 4.4693959717149536e-05, + "loss": 4.7533, + "step": 35520 + }, + { + "epoch": 0.211253449424303, + "grad_norm": 1.833646297454834, + "learning_rate": 4.4693671988323006e-05, + "loss": 4.3012, + "step": 35521 + }, + { + "epoch": 0.21125939670758398, + "grad_norm": 1.5945441722869873, + "learning_rate": 4.469338425262162e-05, + "loss": 4.9917, + "step": 35522 + }, + { + "epoch": 0.21126534399086497, + "grad_norm": 2.1458442211151123, + "learning_rate": 4.469309651004547e-05, + "loss": 4.2472, + "step": 35523 + }, + { + "epoch": 0.21127129127414598, + "grad_norm": 2.0150773525238037, + "learning_rate": 4.469280876059467e-05, + "loss": 4.101, + "step": 35524 + }, + { + "epoch": 0.21127723855742697, + "grad_norm": 1.8790959119796753, + "learning_rate": 4.469252100426931e-05, + "loss": 4.1637, + "step": 35525 + }, + { + "epoch": 0.21128318584070796, + "grad_norm": 1.9778228998184204, + "learning_rate": 4.469223324106949e-05, + "loss": 4.2768, + "step": 35526 + }, + { + "epoch": 0.21128913312398895, + "grad_norm": 2.055441379547119, + "learning_rate": 4.469194547099532e-05, + "loss": 4.1131, + "step": 35527 + }, + { + "epoch": 0.21129508040726996, + "grad_norm": 1.8175396919250488, + "learning_rate": 4.46916576940469e-05, + "loss": 4.5419, + "step": 35528 + }, + { + "epoch": 0.21130102769055095, + "grad_norm": 2.1261353492736816, + "learning_rate": 4.4691369910224315e-05, + "loss": 4.2994, + "step": 35529 + }, + { + "epoch": 0.21130697497383194, + "grad_norm": 1.6003457307815552, + "learning_rate": 4.4691082119527686e-05, + "loss": 4.2859, + "step": 35530 + }, + { + "epoch": 0.21131292225711296, + "grad_norm": 2.1611742973327637, + "learning_rate": 4.4690794321957094e-05, + "loss": 4.0373, + "step": 35531 + }, + { + "epoch": 0.21131886954039394, + "grad_norm": 1.887533187866211, + "learning_rate": 4.469050651751266e-05, + "loss": 4.3426, + "step": 35532 + }, + { + "epoch": 0.21132481682367493, + "grad_norm": 1.844598650932312, + "learning_rate": 4.469021870619447e-05, + "loss": 4.3078, + "step": 35533 + }, + { + "epoch": 0.21133076410695595, + "grad_norm": 1.7349529266357422, + "learning_rate": 4.4689930888002626e-05, + "loss": 4.4804, + "step": 35534 + }, + { + "epoch": 0.21133671139023694, + "grad_norm": 1.7875169515609741, + "learning_rate": 4.468964306293723e-05, + "loss": 4.1911, + "step": 35535 + }, + { + "epoch": 0.21134265867351792, + "grad_norm": 2.0172529220581055, + "learning_rate": 4.468935523099838e-05, + "loss": 4.3372, + "step": 35536 + }, + { + "epoch": 0.21134860595679894, + "grad_norm": 1.7885106801986694, + "learning_rate": 4.468906739218619e-05, + "loss": 4.429, + "step": 35537 + }, + { + "epoch": 0.21135455324007993, + "grad_norm": 1.7366465330123901, + "learning_rate": 4.468877954650074e-05, + "loss": 4.1948, + "step": 35538 + }, + { + "epoch": 0.21136050052336092, + "grad_norm": 1.6380743980407715, + "learning_rate": 4.468849169394215e-05, + "loss": 4.2948, + "step": 35539 + }, + { + "epoch": 0.21136644780664193, + "grad_norm": 1.6585488319396973, + "learning_rate": 4.46882038345105e-05, + "loss": 4.4046, + "step": 35540 + }, + { + "epoch": 0.21137239508992292, + "grad_norm": 1.6397299766540527, + "learning_rate": 4.468791596820591e-05, + "loss": 4.3406, + "step": 35541 + }, + { + "epoch": 0.2113783423732039, + "grad_norm": 1.8481812477111816, + "learning_rate": 4.468762809502847e-05, + "loss": 4.4447, + "step": 35542 + }, + { + "epoch": 0.21138428965648492, + "grad_norm": 1.6186330318450928, + "learning_rate": 4.468734021497828e-05, + "loss": 4.4204, + "step": 35543 + }, + { + "epoch": 0.2113902369397659, + "grad_norm": 1.7163970470428467, + "learning_rate": 4.4687052328055444e-05, + "loss": 4.1462, + "step": 35544 + }, + { + "epoch": 0.2113961842230469, + "grad_norm": 1.6585257053375244, + "learning_rate": 4.468676443426006e-05, + "loss": 4.4792, + "step": 35545 + }, + { + "epoch": 0.21140213150632792, + "grad_norm": 1.6501747369766235, + "learning_rate": 4.468647653359223e-05, + "loss": 4.3497, + "step": 35546 + }, + { + "epoch": 0.2114080787896089, + "grad_norm": 1.636633038520813, + "learning_rate": 4.468618862605205e-05, + "loss": 4.3438, + "step": 35547 + }, + { + "epoch": 0.2114140260728899, + "grad_norm": 1.6481387615203857, + "learning_rate": 4.468590071163964e-05, + "loss": 4.6326, + "step": 35548 + }, + { + "epoch": 0.2114199733561709, + "grad_norm": 1.4565008878707886, + "learning_rate": 4.468561279035508e-05, + "loss": 4.4197, + "step": 35549 + }, + { + "epoch": 0.2114259206394519, + "grad_norm": 1.7687804698944092, + "learning_rate": 4.4685324862198465e-05, + "loss": 4.0757, + "step": 35550 + }, + { + "epoch": 0.21143186792273289, + "grad_norm": 1.5998481512069702, + "learning_rate": 4.468503692716991e-05, + "loss": 4.2909, + "step": 35551 + }, + { + "epoch": 0.2114378152060139, + "grad_norm": 1.8555128574371338, + "learning_rate": 4.468474898526952e-05, + "loss": 4.6799, + "step": 35552 + }, + { + "epoch": 0.2114437624892949, + "grad_norm": 1.8145393133163452, + "learning_rate": 4.4684461036497385e-05, + "loss": 4.8618, + "step": 35553 + }, + { + "epoch": 0.21144970977257588, + "grad_norm": 1.6641209125518799, + "learning_rate": 4.4684173080853606e-05, + "loss": 4.9345, + "step": 35554 + }, + { + "epoch": 0.2114556570558569, + "grad_norm": 1.7130677700042725, + "learning_rate": 4.468388511833828e-05, + "loss": 4.4058, + "step": 35555 + }, + { + "epoch": 0.21146160433913788, + "grad_norm": 1.589738368988037, + "learning_rate": 4.4683597148951515e-05, + "loss": 4.4582, + "step": 35556 + }, + { + "epoch": 0.21146755162241887, + "grad_norm": 1.6870765686035156, + "learning_rate": 4.468330917269342e-05, + "loss": 4.2733, + "step": 35557 + }, + { + "epoch": 0.21147349890569989, + "grad_norm": 1.8612738847732544, + "learning_rate": 4.468302118956408e-05, + "loss": 4.4758, + "step": 35558 + }, + { + "epoch": 0.21147944618898087, + "grad_norm": 1.7577272653579712, + "learning_rate": 4.46827331995636e-05, + "loss": 4.2751, + "step": 35559 + }, + { + "epoch": 0.21148539347226186, + "grad_norm": 2.018022298812866, + "learning_rate": 4.468244520269208e-05, + "loss": 4.0733, + "step": 35560 + }, + { + "epoch": 0.21149134075554288, + "grad_norm": 2.8763539791107178, + "learning_rate": 4.4682157198949614e-05, + "loss": 3.7829, + "step": 35561 + }, + { + "epoch": 0.21149728803882387, + "grad_norm": 2.9004275798797607, + "learning_rate": 4.4681869188336324e-05, + "loss": 3.8345, + "step": 35562 + }, + { + "epoch": 0.21150323532210485, + "grad_norm": 2.5729360580444336, + "learning_rate": 4.4681581170852296e-05, + "loss": 3.088, + "step": 35563 + }, + { + "epoch": 0.21150918260538587, + "grad_norm": 1.752673864364624, + "learning_rate": 4.468129314649762e-05, + "loss": 4.4127, + "step": 35564 + }, + { + "epoch": 0.21151512988866686, + "grad_norm": 1.3503072261810303, + "learning_rate": 4.468100511527241e-05, + "loss": 5.1179, + "step": 35565 + }, + { + "epoch": 0.21152107717194785, + "grad_norm": 1.5340571403503418, + "learning_rate": 4.468071707717677e-05, + "loss": 5.0112, + "step": 35566 + }, + { + "epoch": 0.21152702445522886, + "grad_norm": 2.2375614643096924, + "learning_rate": 4.46804290322108e-05, + "loss": 3.5679, + "step": 35567 + }, + { + "epoch": 0.21153297173850985, + "grad_norm": 2.288891077041626, + "learning_rate": 4.4680140980374584e-05, + "loss": 3.3486, + "step": 35568 + }, + { + "epoch": 0.21153891902179084, + "grad_norm": 2.604292154312134, + "learning_rate": 4.467985292166824e-05, + "loss": 3.5723, + "step": 35569 + }, + { + "epoch": 0.21154486630507185, + "grad_norm": 2.132558822631836, + "learning_rate": 4.467956485609186e-05, + "loss": 3.6876, + "step": 35570 + }, + { + "epoch": 0.21155081358835284, + "grad_norm": 1.9742835760116577, + "learning_rate": 4.4679276783645554e-05, + "loss": 3.5008, + "step": 35571 + }, + { + "epoch": 0.21155676087163383, + "grad_norm": 2.3548946380615234, + "learning_rate": 4.467898870432941e-05, + "loss": 4.3595, + "step": 35572 + }, + { + "epoch": 0.21156270815491485, + "grad_norm": 2.1973586082458496, + "learning_rate": 4.467870061814353e-05, + "loss": 4.5111, + "step": 35573 + }, + { + "epoch": 0.21156865543819584, + "grad_norm": 2.1064834594726562, + "learning_rate": 4.4678412525088025e-05, + "loss": 3.7621, + "step": 35574 + }, + { + "epoch": 0.21157460272147682, + "grad_norm": 2.0585405826568604, + "learning_rate": 4.467812442516299e-05, + "loss": 3.9767, + "step": 35575 + }, + { + "epoch": 0.21158055000475784, + "grad_norm": 2.481163501739502, + "learning_rate": 4.467783631836853e-05, + "loss": 4.3745, + "step": 35576 + }, + { + "epoch": 0.21158649728803883, + "grad_norm": 2.713836193084717, + "learning_rate": 4.4677548204704734e-05, + "loss": 4.2229, + "step": 35577 + }, + { + "epoch": 0.21159244457131982, + "grad_norm": 2.270063877105713, + "learning_rate": 4.46772600841717e-05, + "loss": 4.5278, + "step": 35578 + }, + { + "epoch": 0.21159839185460083, + "grad_norm": 2.3832831382751465, + "learning_rate": 4.4676971956769555e-05, + "loss": 4.1892, + "step": 35579 + }, + { + "epoch": 0.21160433913788182, + "grad_norm": 2.1676185131073, + "learning_rate": 4.467668382249837e-05, + "loss": 4.2975, + "step": 35580 + }, + { + "epoch": 0.2116102864211628, + "grad_norm": 2.134890556335449, + "learning_rate": 4.467639568135826e-05, + "loss": 4.3765, + "step": 35581 + }, + { + "epoch": 0.21161623370444382, + "grad_norm": 2.3156919479370117, + "learning_rate": 4.4676107533349335e-05, + "loss": 4.087, + "step": 35582 + }, + { + "epoch": 0.2116221809877248, + "grad_norm": 2.341125965118408, + "learning_rate": 4.467581937847167e-05, + "loss": 4.1806, + "step": 35583 + }, + { + "epoch": 0.2116281282710058, + "grad_norm": 2.721090793609619, + "learning_rate": 4.467553121672539e-05, + "loss": 4.2418, + "step": 35584 + }, + { + "epoch": 0.2116340755542868, + "grad_norm": 2.1176726818084717, + "learning_rate": 4.467524304811058e-05, + "loss": 4.0049, + "step": 35585 + }, + { + "epoch": 0.2116400228375678, + "grad_norm": 2.553966760635376, + "learning_rate": 4.4674954872627345e-05, + "loss": 4.095, + "step": 35586 + }, + { + "epoch": 0.2116459701208488, + "grad_norm": 2.35528826713562, + "learning_rate": 4.467466669027579e-05, + "loss": 4.2416, + "step": 35587 + }, + { + "epoch": 0.21165191740412978, + "grad_norm": 1.906132459640503, + "learning_rate": 4.467437850105601e-05, + "loss": 4.9423, + "step": 35588 + }, + { + "epoch": 0.2116578646874108, + "grad_norm": 2.400595188140869, + "learning_rate": 4.4674090304968106e-05, + "loss": 4.0596, + "step": 35589 + }, + { + "epoch": 0.21166381197069178, + "grad_norm": 2.12864089012146, + "learning_rate": 4.467380210201218e-05, + "loss": 4.1793, + "step": 35590 + }, + { + "epoch": 0.21166975925397277, + "grad_norm": 2.3407888412475586, + "learning_rate": 4.4673513892188335e-05, + "loss": 3.7511, + "step": 35591 + }, + { + "epoch": 0.2116757065372538, + "grad_norm": 2.0563061237335205, + "learning_rate": 4.467322567549667e-05, + "loss": 4.2335, + "step": 35592 + }, + { + "epoch": 0.21168165382053478, + "grad_norm": 1.9491883516311646, + "learning_rate": 4.467293745193729e-05, + "loss": 4.1301, + "step": 35593 + }, + { + "epoch": 0.21168760110381576, + "grad_norm": 2.4588730335235596, + "learning_rate": 4.467264922151028e-05, + "loss": 4.0841, + "step": 35594 + }, + { + "epoch": 0.21169354838709678, + "grad_norm": 1.9393937587738037, + "learning_rate": 4.467236098421576e-05, + "loss": 4.2677, + "step": 35595 + }, + { + "epoch": 0.21169949567037777, + "grad_norm": 2.00981068611145, + "learning_rate": 4.4672072740053816e-05, + "loss": 4.1659, + "step": 35596 + }, + { + "epoch": 0.21170544295365876, + "grad_norm": 1.8913508653640747, + "learning_rate": 4.467178448902456e-05, + "loss": 3.9792, + "step": 35597 + }, + { + "epoch": 0.21171139023693977, + "grad_norm": 2.168665647506714, + "learning_rate": 4.467149623112809e-05, + "loss": 3.6157, + "step": 35598 + }, + { + "epoch": 0.21171733752022076, + "grad_norm": 2.5305583477020264, + "learning_rate": 4.467120796636449e-05, + "loss": 4.0156, + "step": 35599 + }, + { + "epoch": 0.21172328480350175, + "grad_norm": 2.593087911605835, + "learning_rate": 4.467091969473389e-05, + "loss": 3.9066, + "step": 35600 + }, + { + "epoch": 0.21172923208678276, + "grad_norm": 1.93959641456604, + "learning_rate": 4.4670631416236365e-05, + "loss": 4.0216, + "step": 35601 + }, + { + "epoch": 0.21173517937006375, + "grad_norm": 4.1372246742248535, + "learning_rate": 4.467034313087203e-05, + "loss": 3.5619, + "step": 35602 + }, + { + "epoch": 0.21174112665334474, + "grad_norm": 3.2538001537323, + "learning_rate": 4.4670054838640984e-05, + "loss": 3.2081, + "step": 35603 + }, + { + "epoch": 0.21174707393662576, + "grad_norm": 4.0510640144348145, + "learning_rate": 4.466976653954332e-05, + "loss": 2.6592, + "step": 35604 + }, + { + "epoch": 0.21175302121990675, + "grad_norm": 3.2859723567962646, + "learning_rate": 4.4669478233579143e-05, + "loss": 2.4745, + "step": 35605 + }, + { + "epoch": 0.21175896850318773, + "grad_norm": 2.0423004627227783, + "learning_rate": 4.466918992074856e-05, + "loss": 4.7596, + "step": 35606 + }, + { + "epoch": 0.21176491578646875, + "grad_norm": 4.604837417602539, + "learning_rate": 4.4668901601051663e-05, + "loss": 4.6869, + "step": 35607 + }, + { + "epoch": 0.21177086306974974, + "grad_norm": 4.67194938659668, + "learning_rate": 4.466861327448856e-05, + "loss": 4.217, + "step": 35608 + }, + { + "epoch": 0.21177681035303073, + "grad_norm": 3.2619986534118652, + "learning_rate": 4.466832494105934e-05, + "loss": 2.6688, + "step": 35609 + }, + { + "epoch": 0.21178275763631174, + "grad_norm": 3.189119577407837, + "learning_rate": 4.466803660076411e-05, + "loss": 3.1766, + "step": 35610 + }, + { + "epoch": 0.21178870491959273, + "grad_norm": 3.00148344039917, + "learning_rate": 4.4667748253602976e-05, + "loss": 2.9854, + "step": 35611 + }, + { + "epoch": 0.21179465220287372, + "grad_norm": 3.20414662361145, + "learning_rate": 4.4667459899576034e-05, + "loss": 3.4441, + "step": 35612 + }, + { + "epoch": 0.21180059948615473, + "grad_norm": 2.8852174282073975, + "learning_rate": 4.466717153868338e-05, + "loss": 3.3384, + "step": 35613 + }, + { + "epoch": 0.21180654676943572, + "grad_norm": 3.3265509605407715, + "learning_rate": 4.466688317092513e-05, + "loss": 4.3809, + "step": 35614 + }, + { + "epoch": 0.2118124940527167, + "grad_norm": 1.7819219827651978, + "learning_rate": 4.4666594796301366e-05, + "loss": 5.4982, + "step": 35615 + }, + { + "epoch": 0.21181844133599773, + "grad_norm": 2.841721296310425, + "learning_rate": 4.46663064148122e-05, + "loss": 2.1118, + "step": 35616 + }, + { + "epoch": 0.21182438861927871, + "grad_norm": 2.0219855308532715, + "learning_rate": 4.466601802645773e-05, + "loss": 4.4131, + "step": 35617 + }, + { + "epoch": 0.2118303359025597, + "grad_norm": 1.6084177494049072, + "learning_rate": 4.466572963123805e-05, + "loss": 5.1337, + "step": 35618 + }, + { + "epoch": 0.21183628318584072, + "grad_norm": 1.998936414718628, + "learning_rate": 4.4665441229153285e-05, + "loss": 4.7807, + "step": 35619 + }, + { + "epoch": 0.2118422304691217, + "grad_norm": 2.4785871505737305, + "learning_rate": 4.46651528202035e-05, + "loss": 5.2531, + "step": 35620 + }, + { + "epoch": 0.2118481777524027, + "grad_norm": 1.9801669120788574, + "learning_rate": 4.466486440438882e-05, + "loss": 5.3602, + "step": 35621 + }, + { + "epoch": 0.2118541250356837, + "grad_norm": 1.637373924255371, + "learning_rate": 4.4664575981709333e-05, + "loss": 5.2314, + "step": 35622 + }, + { + "epoch": 0.2118600723189647, + "grad_norm": 1.5949249267578125, + "learning_rate": 4.466428755216515e-05, + "loss": 5.2677, + "step": 35623 + }, + { + "epoch": 0.2118660196022457, + "grad_norm": 1.5839226245880127, + "learning_rate": 4.466399911575637e-05, + "loss": 5.2145, + "step": 35624 + }, + { + "epoch": 0.2118719668855267, + "grad_norm": 1.6766635179519653, + "learning_rate": 4.4663710672483084e-05, + "loss": 4.9226, + "step": 35625 + }, + { + "epoch": 0.2118779141688077, + "grad_norm": 1.5780537128448486, + "learning_rate": 4.466342222234541e-05, + "loss": 5.1708, + "step": 35626 + }, + { + "epoch": 0.21188386145208868, + "grad_norm": 1.5924153327941895, + "learning_rate": 4.4663133765343436e-05, + "loss": 5.1272, + "step": 35627 + }, + { + "epoch": 0.2118898087353697, + "grad_norm": 1.7102172374725342, + "learning_rate": 4.466284530147725e-05, + "loss": 5.0985, + "step": 35628 + }, + { + "epoch": 0.21189575601865068, + "grad_norm": 1.7256853580474854, + "learning_rate": 4.4662556830746985e-05, + "loss": 5.2144, + "step": 35629 + }, + { + "epoch": 0.21190170330193167, + "grad_norm": 1.5665667057037354, + "learning_rate": 4.466226835315272e-05, + "loss": 5.2794, + "step": 35630 + }, + { + "epoch": 0.2119076505852127, + "grad_norm": 1.538317322731018, + "learning_rate": 4.466197986869456e-05, + "loss": 5.0972, + "step": 35631 + }, + { + "epoch": 0.21191359786849367, + "grad_norm": 1.5625393390655518, + "learning_rate": 4.466169137737261e-05, + "loss": 4.7858, + "step": 35632 + }, + { + "epoch": 0.21191954515177466, + "grad_norm": 1.8737174272537231, + "learning_rate": 4.466140287918695e-05, + "loss": 4.6701, + "step": 35633 + }, + { + "epoch": 0.21192549243505568, + "grad_norm": 1.9811254739761353, + "learning_rate": 4.4661114374137716e-05, + "loss": 4.2716, + "step": 35634 + }, + { + "epoch": 0.21193143971833667, + "grad_norm": 1.6436641216278076, + "learning_rate": 4.4660825862224984e-05, + "loss": 4.6396, + "step": 35635 + }, + { + "epoch": 0.21193738700161766, + "grad_norm": 1.5466450452804565, + "learning_rate": 4.466053734344886e-05, + "loss": 4.5979, + "step": 35636 + }, + { + "epoch": 0.21194333428489867, + "grad_norm": 1.7120367288589478, + "learning_rate": 4.4660248817809444e-05, + "loss": 4.8051, + "step": 35637 + }, + { + "epoch": 0.21194928156817966, + "grad_norm": 1.540959119796753, + "learning_rate": 4.4659960285306846e-05, + "loss": 4.9112, + "step": 35638 + }, + { + "epoch": 0.21195522885146065, + "grad_norm": 1.5579898357391357, + "learning_rate": 4.4659671745941147e-05, + "loss": 4.7432, + "step": 35639 + }, + { + "epoch": 0.21196117613474166, + "grad_norm": 1.3252559900283813, + "learning_rate": 4.465938319971247e-05, + "loss": 4.5407, + "step": 35640 + }, + { + "epoch": 0.21196712341802265, + "grad_norm": 1.4672505855560303, + "learning_rate": 4.4659094646620904e-05, + "loss": 4.4972, + "step": 35641 + }, + { + "epoch": 0.21197307070130364, + "grad_norm": 1.603417158126831, + "learning_rate": 4.4658806086666544e-05, + "loss": 4.6884, + "step": 35642 + }, + { + "epoch": 0.21197901798458466, + "grad_norm": 1.8993263244628906, + "learning_rate": 4.465851751984951e-05, + "loss": 4.8931, + "step": 35643 + }, + { + "epoch": 0.21198496526786564, + "grad_norm": 1.5901163816452026, + "learning_rate": 4.4658228946169875e-05, + "loss": 4.6111, + "step": 35644 + }, + { + "epoch": 0.21199091255114663, + "grad_norm": 1.392621397972107, + "learning_rate": 4.465794036562776e-05, + "loss": 4.4493, + "step": 35645 + }, + { + "epoch": 0.21199685983442762, + "grad_norm": 1.6505818367004395, + "learning_rate": 4.465765177822327e-05, + "loss": 4.931, + "step": 35646 + }, + { + "epoch": 0.21200280711770864, + "grad_norm": 2.01570463180542, + "learning_rate": 4.465736318395649e-05, + "loss": 4.3823, + "step": 35647 + }, + { + "epoch": 0.21200875440098962, + "grad_norm": 2.1474528312683105, + "learning_rate": 4.465707458282753e-05, + "loss": 4.3524, + "step": 35648 + }, + { + "epoch": 0.2120147016842706, + "grad_norm": 1.5785243511199951, + "learning_rate": 4.465678597483649e-05, + "loss": 4.287, + "step": 35649 + }, + { + "epoch": 0.21202064896755163, + "grad_norm": 1.863834023475647, + "learning_rate": 4.465649735998346e-05, + "loss": 5.2325, + "step": 35650 + }, + { + "epoch": 0.21202659625083262, + "grad_norm": 1.8547208309173584, + "learning_rate": 4.465620873826856e-05, + "loss": 5.1475, + "step": 35651 + }, + { + "epoch": 0.2120325435341136, + "grad_norm": 1.5947805643081665, + "learning_rate": 4.465592010969187e-05, + "loss": 5.3976, + "step": 35652 + }, + { + "epoch": 0.21203849081739462, + "grad_norm": 1.9001067876815796, + "learning_rate": 4.4655631474253515e-05, + "loss": 5.492, + "step": 35653 + }, + { + "epoch": 0.2120444381006756, + "grad_norm": 1.6406006813049316, + "learning_rate": 4.465534283195357e-05, + "loss": 4.5243, + "step": 35654 + }, + { + "epoch": 0.2120503853839566, + "grad_norm": 1.9014918804168701, + "learning_rate": 4.4655054182792156e-05, + "loss": 4.794, + "step": 35655 + }, + { + "epoch": 0.2120563326672376, + "grad_norm": 1.647063970565796, + "learning_rate": 4.4654765526769365e-05, + "loss": 4.9163, + "step": 35656 + }, + { + "epoch": 0.2120622799505186, + "grad_norm": 1.5540443658828735, + "learning_rate": 4.4654476863885296e-05, + "loss": 5.0264, + "step": 35657 + }, + { + "epoch": 0.2120682272337996, + "grad_norm": 1.7547403573989868, + "learning_rate": 4.465418819414005e-05, + "loss": 4.8722, + "step": 35658 + }, + { + "epoch": 0.2120741745170806, + "grad_norm": 1.6932998895645142, + "learning_rate": 4.4653899517533736e-05, + "loss": 4.9104, + "step": 35659 + }, + { + "epoch": 0.2120801218003616, + "grad_norm": 1.6615930795669556, + "learning_rate": 4.465361083406645e-05, + "loss": 4.9179, + "step": 35660 + }, + { + "epoch": 0.21208606908364258, + "grad_norm": 2.116122007369995, + "learning_rate": 4.465332214373828e-05, + "loss": 4.0842, + "step": 35661 + }, + { + "epoch": 0.2120920163669236, + "grad_norm": 1.4886269569396973, + "learning_rate": 4.465303344654935e-05, + "loss": 4.8633, + "step": 35662 + }, + { + "epoch": 0.21209796365020459, + "grad_norm": 1.5376653671264648, + "learning_rate": 4.4652744742499744e-05, + "loss": 4.9968, + "step": 35663 + }, + { + "epoch": 0.21210391093348557, + "grad_norm": 1.5422334671020508, + "learning_rate": 4.4652456031589565e-05, + "loss": 4.6542, + "step": 35664 + }, + { + "epoch": 0.2121098582167666, + "grad_norm": 1.8304499387741089, + "learning_rate": 4.465216731381891e-05, + "loss": 5.2773, + "step": 35665 + }, + { + "epoch": 0.21211580550004758, + "grad_norm": 1.8360862731933594, + "learning_rate": 4.4651878589187904e-05, + "loss": 5.1749, + "step": 35666 + }, + { + "epoch": 0.21212175278332857, + "grad_norm": 2.1891777515411377, + "learning_rate": 4.465158985769662e-05, + "loss": 4.5996, + "step": 35667 + }, + { + "epoch": 0.21212770006660958, + "grad_norm": 1.8620492219924927, + "learning_rate": 4.4651301119345174e-05, + "loss": 5.1286, + "step": 35668 + }, + { + "epoch": 0.21213364734989057, + "grad_norm": 1.7725592851638794, + "learning_rate": 4.465101237413366e-05, + "loss": 3.8595, + "step": 35669 + }, + { + "epoch": 0.21213959463317156, + "grad_norm": 3.5651681423187256, + "learning_rate": 4.4650723622062174e-05, + "loss": 2.1436, + "step": 35670 + }, + { + "epoch": 0.21214554191645257, + "grad_norm": 2.6675519943237305, + "learning_rate": 4.465043486313083e-05, + "loss": 2.6111, + "step": 35671 + }, + { + "epoch": 0.21215148919973356, + "grad_norm": 2.8939945697784424, + "learning_rate": 4.4650146097339726e-05, + "loss": 1.951, + "step": 35672 + }, + { + "epoch": 0.21215743648301455, + "grad_norm": 2.7901999950408936, + "learning_rate": 4.464985732468895e-05, + "loss": 2.5639, + "step": 35673 + }, + { + "epoch": 0.21216338376629557, + "grad_norm": 2.7896947860717773, + "learning_rate": 4.464956854517862e-05, + "loss": 2.4701, + "step": 35674 + }, + { + "epoch": 0.21216933104957655, + "grad_norm": 3.1296167373657227, + "learning_rate": 4.464927975880882e-05, + "loss": 2.8541, + "step": 35675 + }, + { + "epoch": 0.21217527833285754, + "grad_norm": 2.6969821453094482, + "learning_rate": 4.4648990965579665e-05, + "loss": 2.5398, + "step": 35676 + }, + { + "epoch": 0.21218122561613856, + "grad_norm": 2.5808277130126953, + "learning_rate": 4.4648702165491255e-05, + "loss": 2.0513, + "step": 35677 + }, + { + "epoch": 0.21218717289941955, + "grad_norm": 2.433685064315796, + "learning_rate": 4.464841335854367e-05, + "loss": 4.4278, + "step": 35678 + }, + { + "epoch": 0.21219312018270053, + "grad_norm": 2.1320486068725586, + "learning_rate": 4.464812454473705e-05, + "loss": 4.12, + "step": 35679 + }, + { + "epoch": 0.21219906746598155, + "grad_norm": 2.456299304962158, + "learning_rate": 4.464783572407145e-05, + "loss": 4.5267, + "step": 35680 + }, + { + "epoch": 0.21220501474926254, + "grad_norm": 2.1469194889068604, + "learning_rate": 4.464754689654701e-05, + "loss": 4.3901, + "step": 35681 + }, + { + "epoch": 0.21221096203254353, + "grad_norm": 2.1196210384368896, + "learning_rate": 4.46472580621638e-05, + "loss": 4.4883, + "step": 35682 + }, + { + "epoch": 0.21221690931582454, + "grad_norm": 1.8412578105926514, + "learning_rate": 4.464696922092195e-05, + "loss": 4.3989, + "step": 35683 + }, + { + "epoch": 0.21222285659910553, + "grad_norm": 1.8631144762039185, + "learning_rate": 4.464668037282154e-05, + "loss": 4.3183, + "step": 35684 + }, + { + "epoch": 0.21222880388238652, + "grad_norm": 2.0931034088134766, + "learning_rate": 4.464639151786267e-05, + "loss": 4.4251, + "step": 35685 + }, + { + "epoch": 0.21223475116566753, + "grad_norm": 2.132053852081299, + "learning_rate": 4.464610265604546e-05, + "loss": 4.1585, + "step": 35686 + }, + { + "epoch": 0.21224069844894852, + "grad_norm": 2.145237445831299, + "learning_rate": 4.464581378736999e-05, + "loss": 4.5089, + "step": 35687 + }, + { + "epoch": 0.2122466457322295, + "grad_norm": 1.8298094272613525, + "learning_rate": 4.464552491183637e-05, + "loss": 4.3018, + "step": 35688 + }, + { + "epoch": 0.21225259301551053, + "grad_norm": 1.7321758270263672, + "learning_rate": 4.4645236029444704e-05, + "loss": 4.3424, + "step": 35689 + }, + { + "epoch": 0.21225854029879151, + "grad_norm": 1.5509285926818848, + "learning_rate": 4.464494714019508e-05, + "loss": 4.7001, + "step": 35690 + }, + { + "epoch": 0.2122644875820725, + "grad_norm": 1.5042033195495605, + "learning_rate": 4.464465824408762e-05, + "loss": 4.7668, + "step": 35691 + }, + { + "epoch": 0.21227043486535352, + "grad_norm": 1.405381202697754, + "learning_rate": 4.4644369341122405e-05, + "loss": 4.5569, + "step": 35692 + }, + { + "epoch": 0.2122763821486345, + "grad_norm": 2.495974540710449, + "learning_rate": 4.464408043129955e-05, + "loss": 3.6424, + "step": 35693 + }, + { + "epoch": 0.2122823294319155, + "grad_norm": 1.930151104927063, + "learning_rate": 4.4643791514619146e-05, + "loss": 4.7363, + "step": 35694 + }, + { + "epoch": 0.2122882767151965, + "grad_norm": 2.1979784965515137, + "learning_rate": 4.46435025910813e-05, + "loss": 4.4973, + "step": 35695 + }, + { + "epoch": 0.2122942239984775, + "grad_norm": 1.9017161130905151, + "learning_rate": 4.46432136606861e-05, + "loss": 5.0348, + "step": 35696 + }, + { + "epoch": 0.2123001712817585, + "grad_norm": 2.4398694038391113, + "learning_rate": 4.464292472343367e-05, + "loss": 5.0757, + "step": 35697 + }, + { + "epoch": 0.2123061185650395, + "grad_norm": 2.061084508895874, + "learning_rate": 4.464263577932409e-05, + "loss": 4.9802, + "step": 35698 + }, + { + "epoch": 0.2123120658483205, + "grad_norm": 2.277392864227295, + "learning_rate": 4.4642346828357474e-05, + "loss": 4.7231, + "step": 35699 + }, + { + "epoch": 0.21231801313160148, + "grad_norm": 2.2129130363464355, + "learning_rate": 4.464205787053391e-05, + "loss": 4.7154, + "step": 35700 + }, + { + "epoch": 0.2123239604148825, + "grad_norm": 1.9063429832458496, + "learning_rate": 4.4641768905853506e-05, + "loss": 5.1813, + "step": 35701 + }, + { + "epoch": 0.21232990769816348, + "grad_norm": 1.896718978881836, + "learning_rate": 4.464147993431638e-05, + "loss": 4.9855, + "step": 35702 + }, + { + "epoch": 0.21233585498144447, + "grad_norm": 1.7391164302825928, + "learning_rate": 4.46411909559226e-05, + "loss": 4.9448, + "step": 35703 + }, + { + "epoch": 0.21234180226472546, + "grad_norm": 1.8338813781738281, + "learning_rate": 4.464090197067229e-05, + "loss": 4.9424, + "step": 35704 + }, + { + "epoch": 0.21234774954800648, + "grad_norm": 1.8616620302200317, + "learning_rate": 4.4640612978565536e-05, + "loss": 4.9748, + "step": 35705 + }, + { + "epoch": 0.21235369683128746, + "grad_norm": 1.8451703786849976, + "learning_rate": 4.4640323979602456e-05, + "loss": 4.8245, + "step": 35706 + }, + { + "epoch": 0.21235964411456845, + "grad_norm": 1.9476757049560547, + "learning_rate": 4.464003497378314e-05, + "loss": 4.4859, + "step": 35707 + }, + { + "epoch": 0.21236559139784947, + "grad_norm": 1.9685393571853638, + "learning_rate": 4.463974596110769e-05, + "loss": 4.8838, + "step": 35708 + }, + { + "epoch": 0.21237153868113046, + "grad_norm": 1.8450342416763306, + "learning_rate": 4.463945694157621e-05, + "loss": 4.8105, + "step": 35709 + }, + { + "epoch": 0.21237748596441144, + "grad_norm": 1.8277257680892944, + "learning_rate": 4.463916791518879e-05, + "loss": 4.7422, + "step": 35710 + }, + { + "epoch": 0.21238343324769246, + "grad_norm": 2.5418896675109863, + "learning_rate": 4.463887888194555e-05, + "loss": 3.8398, + "step": 35711 + }, + { + "epoch": 0.21238938053097345, + "grad_norm": 2.198882818222046, + "learning_rate": 4.4638589841846564e-05, + "loss": 4.2856, + "step": 35712 + }, + { + "epoch": 0.21239532781425444, + "grad_norm": 1.7862573862075806, + "learning_rate": 4.463830079489196e-05, + "loss": 4.6743, + "step": 35713 + }, + { + "epoch": 0.21240127509753545, + "grad_norm": 2.3604986667633057, + "learning_rate": 4.463801174108183e-05, + "loss": 3.2922, + "step": 35714 + }, + { + "epoch": 0.21240722238081644, + "grad_norm": 2.7802772521972656, + "learning_rate": 4.463772268041627e-05, + "loss": 3.8924, + "step": 35715 + }, + { + "epoch": 0.21241316966409743, + "grad_norm": 2.6005308628082275, + "learning_rate": 4.46374336128954e-05, + "loss": 3.9048, + "step": 35716 + }, + { + "epoch": 0.21241911694737844, + "grad_norm": 2.9707534313201904, + "learning_rate": 4.463714453851928e-05, + "loss": 4.0335, + "step": 35717 + }, + { + "epoch": 0.21242506423065943, + "grad_norm": 2.741023302078247, + "learning_rate": 4.4636855457288046e-05, + "loss": 3.7947, + "step": 35718 + }, + { + "epoch": 0.21243101151394042, + "grad_norm": 3.775846242904663, + "learning_rate": 4.463656636920179e-05, + "loss": 3.8971, + "step": 35719 + }, + { + "epoch": 0.21243695879722144, + "grad_norm": 2.2663304805755615, + "learning_rate": 4.463627727426061e-05, + "loss": 3.6525, + "step": 35720 + }, + { + "epoch": 0.21244290608050242, + "grad_norm": 2.3076207637786865, + "learning_rate": 4.463598817246461e-05, + "loss": 3.502, + "step": 35721 + }, + { + "epoch": 0.2124488533637834, + "grad_norm": 2.273998260498047, + "learning_rate": 4.4635699063813884e-05, + "loss": 3.5203, + "step": 35722 + }, + { + "epoch": 0.21245480064706443, + "grad_norm": 2.7163095474243164, + "learning_rate": 4.463540994830855e-05, + "loss": 3.9886, + "step": 35723 + }, + { + "epoch": 0.21246074793034542, + "grad_norm": 2.482473850250244, + "learning_rate": 4.463512082594868e-05, + "loss": 3.552, + "step": 35724 + }, + { + "epoch": 0.2124666952136264, + "grad_norm": 1.8834370374679565, + "learning_rate": 4.4634831696734404e-05, + "loss": 4.1334, + "step": 35725 + }, + { + "epoch": 0.21247264249690742, + "grad_norm": 2.005268096923828, + "learning_rate": 4.463454256066581e-05, + "loss": 5.1996, + "step": 35726 + }, + { + "epoch": 0.2124785897801884, + "grad_norm": 1.4959584474563599, + "learning_rate": 4.4634253417743e-05, + "loss": 4.7173, + "step": 35727 + }, + { + "epoch": 0.2124845370634694, + "grad_norm": 1.490785002708435, + "learning_rate": 4.463396426796608e-05, + "loss": 4.6401, + "step": 35728 + }, + { + "epoch": 0.2124904843467504, + "grad_norm": 1.722306251525879, + "learning_rate": 4.463367511133513e-05, + "loss": 4.9166, + "step": 35729 + }, + { + "epoch": 0.2124964316300314, + "grad_norm": 1.8705493211746216, + "learning_rate": 4.463338594785028e-05, + "loss": 4.3841, + "step": 35730 + }, + { + "epoch": 0.2125023789133124, + "grad_norm": 1.6293779611587524, + "learning_rate": 4.4633096777511614e-05, + "loss": 4.7904, + "step": 35731 + }, + { + "epoch": 0.2125083261965934, + "grad_norm": 1.474142074584961, + "learning_rate": 4.4632807600319236e-05, + "loss": 4.7202, + "step": 35732 + }, + { + "epoch": 0.2125142734798744, + "grad_norm": 1.8260791301727295, + "learning_rate": 4.463251841627325e-05, + "loss": 4.7537, + "step": 35733 + }, + { + "epoch": 0.21252022076315538, + "grad_norm": 2.335918664932251, + "learning_rate": 4.463222922537376e-05, + "loss": 4.5175, + "step": 35734 + }, + { + "epoch": 0.2125261680464364, + "grad_norm": 2.1824939250946045, + "learning_rate": 4.463194002762084e-05, + "loss": 4.8903, + "step": 35735 + }, + { + "epoch": 0.21253211532971739, + "grad_norm": 2.128995656967163, + "learning_rate": 4.4631650823014635e-05, + "loss": 4.8174, + "step": 35736 + }, + { + "epoch": 0.21253806261299837, + "grad_norm": 2.001495122909546, + "learning_rate": 4.4631361611555214e-05, + "loss": 4.7711, + "step": 35737 + }, + { + "epoch": 0.2125440098962794, + "grad_norm": 1.9211745262145996, + "learning_rate": 4.463107239324269e-05, + "loss": 4.6105, + "step": 35738 + }, + { + "epoch": 0.21254995717956038, + "grad_norm": 1.8648548126220703, + "learning_rate": 4.463078316807716e-05, + "loss": 4.6635, + "step": 35739 + }, + { + "epoch": 0.21255590446284137, + "grad_norm": 1.7322161197662354, + "learning_rate": 4.4630493936058726e-05, + "loss": 4.7696, + "step": 35740 + }, + { + "epoch": 0.21256185174612238, + "grad_norm": 1.7490285634994507, + "learning_rate": 4.4630204697187495e-05, + "loss": 4.5781, + "step": 35741 + }, + { + "epoch": 0.21256779902940337, + "grad_norm": 2.229279041290283, + "learning_rate": 4.462991545146355e-05, + "loss": 4.9855, + "step": 35742 + }, + { + "epoch": 0.21257374631268436, + "grad_norm": 1.8897117376327515, + "learning_rate": 4.462962619888702e-05, + "loss": 4.7951, + "step": 35743 + }, + { + "epoch": 0.21257969359596537, + "grad_norm": 1.908650279045105, + "learning_rate": 4.4629336939457986e-05, + "loss": 5.3459, + "step": 35744 + }, + { + "epoch": 0.21258564087924636, + "grad_norm": 1.9401918649673462, + "learning_rate": 4.462904767317655e-05, + "loss": 5.2774, + "step": 35745 + }, + { + "epoch": 0.21259158816252735, + "grad_norm": 1.8013694286346436, + "learning_rate": 4.462875840004281e-05, + "loss": 5.1422, + "step": 35746 + }, + { + "epoch": 0.21259753544580837, + "grad_norm": 1.7482459545135498, + "learning_rate": 4.462846912005688e-05, + "loss": 4.9011, + "step": 35747 + }, + { + "epoch": 0.21260348272908935, + "grad_norm": 2.0190155506134033, + "learning_rate": 4.462817983321885e-05, + "loss": 5.1047, + "step": 35748 + }, + { + "epoch": 0.21260943001237034, + "grad_norm": 1.6946347951889038, + "learning_rate": 4.4627890539528836e-05, + "loss": 4.5423, + "step": 35749 + }, + { + "epoch": 0.21261537729565136, + "grad_norm": 1.8933213949203491, + "learning_rate": 4.462760123898692e-05, + "loss": 4.7896, + "step": 35750 + }, + { + "epoch": 0.21262132457893235, + "grad_norm": 1.4609590768814087, + "learning_rate": 4.462731193159321e-05, + "loss": 4.6521, + "step": 35751 + }, + { + "epoch": 0.21262727186221334, + "grad_norm": 1.5144892930984497, + "learning_rate": 4.462702261734781e-05, + "loss": 4.6135, + "step": 35752 + }, + { + "epoch": 0.21263321914549435, + "grad_norm": 1.5038193464279175, + "learning_rate": 4.4626733296250825e-05, + "loss": 4.642, + "step": 35753 + }, + { + "epoch": 0.21263916642877534, + "grad_norm": 1.3743622303009033, + "learning_rate": 4.4626443968302344e-05, + "loss": 4.5229, + "step": 35754 + }, + { + "epoch": 0.21264511371205633, + "grad_norm": 1.39356529712677, + "learning_rate": 4.462615463350247e-05, + "loss": 4.5139, + "step": 35755 + }, + { + "epoch": 0.21265106099533734, + "grad_norm": 1.4335349798202515, + "learning_rate": 4.462586529185132e-05, + "loss": 4.5719, + "step": 35756 + }, + { + "epoch": 0.21265700827861833, + "grad_norm": 1.6316946744918823, + "learning_rate": 4.4625575943348976e-05, + "loss": 4.6748, + "step": 35757 + }, + { + "epoch": 0.21266295556189932, + "grad_norm": 1.7913111448287964, + "learning_rate": 4.462528658799554e-05, + "loss": 4.4836, + "step": 35758 + }, + { + "epoch": 0.21266890284518034, + "grad_norm": 1.8246521949768066, + "learning_rate": 4.462499722579113e-05, + "loss": 5.1016, + "step": 35759 + }, + { + "epoch": 0.21267485012846132, + "grad_norm": 1.8918733596801758, + "learning_rate": 4.462470785673583e-05, + "loss": 5.2359, + "step": 35760 + }, + { + "epoch": 0.2126807974117423, + "grad_norm": 1.6759446859359741, + "learning_rate": 4.4624418480829754e-05, + "loss": 5.0853, + "step": 35761 + }, + { + "epoch": 0.2126867446950233, + "grad_norm": 2.1670455932617188, + "learning_rate": 4.462412909807299e-05, + "loss": 4.4643, + "step": 35762 + }, + { + "epoch": 0.21269269197830432, + "grad_norm": 1.9109561443328857, + "learning_rate": 4.4623839708465646e-05, + "loss": 4.695, + "step": 35763 + }, + { + "epoch": 0.2126986392615853, + "grad_norm": 2.0584371089935303, + "learning_rate": 4.462355031200782e-05, + "loss": 4.5412, + "step": 35764 + }, + { + "epoch": 0.2127045865448663, + "grad_norm": 2.116912364959717, + "learning_rate": 4.462326090869963e-05, + "loss": 4.4798, + "step": 35765 + }, + { + "epoch": 0.2127105338281473, + "grad_norm": 1.6834105253219604, + "learning_rate": 4.4622971498541147e-05, + "loss": 4.1607, + "step": 35766 + }, + { + "epoch": 0.2127164811114283, + "grad_norm": 1.9433541297912598, + "learning_rate": 4.4622682081532484e-05, + "loss": 4.2535, + "step": 35767 + }, + { + "epoch": 0.21272242839470928, + "grad_norm": 1.8354408740997314, + "learning_rate": 4.462239265767376e-05, + "loss": 4.1845, + "step": 35768 + }, + { + "epoch": 0.2127283756779903, + "grad_norm": 1.801477074623108, + "learning_rate": 4.462210322696505e-05, + "loss": 4.1377, + "step": 35769 + }, + { + "epoch": 0.2127343229612713, + "grad_norm": 1.6852128505706787, + "learning_rate": 4.462181378940647e-05, + "loss": 4.0879, + "step": 35770 + }, + { + "epoch": 0.21274027024455228, + "grad_norm": 1.8251643180847168, + "learning_rate": 4.4621524344998124e-05, + "loss": 4.1113, + "step": 35771 + }, + { + "epoch": 0.2127462175278333, + "grad_norm": 2.3179166316986084, + "learning_rate": 4.462123489374009e-05, + "loss": 3.7295, + "step": 35772 + }, + { + "epoch": 0.21275216481111428, + "grad_norm": 3.6453943252563477, + "learning_rate": 4.46209454356325e-05, + "loss": 2.9047, + "step": 35773 + }, + { + "epoch": 0.21275811209439527, + "grad_norm": 1.855807900428772, + "learning_rate": 4.462065597067544e-05, + "loss": 4.4095, + "step": 35774 + }, + { + "epoch": 0.21276405937767628, + "grad_norm": 2.9473495483398438, + "learning_rate": 4.4620366498869e-05, + "loss": 2.6528, + "step": 35775 + }, + { + "epoch": 0.21277000666095727, + "grad_norm": 2.007720947265625, + "learning_rate": 4.462007702021331e-05, + "loss": 4.7348, + "step": 35776 + }, + { + "epoch": 0.21277595394423826, + "grad_norm": 2.2951998710632324, + "learning_rate": 4.461978753470845e-05, + "loss": 4.5622, + "step": 35777 + }, + { + "epoch": 0.21278190122751928, + "grad_norm": 1.5362045764923096, + "learning_rate": 4.461949804235451e-05, + "loss": 4.9606, + "step": 35778 + }, + { + "epoch": 0.21278784851080026, + "grad_norm": 1.6383750438690186, + "learning_rate": 4.461920854315162e-05, + "loss": 4.6241, + "step": 35779 + }, + { + "epoch": 0.21279379579408125, + "grad_norm": 2.050675630569458, + "learning_rate": 4.461891903709986e-05, + "loss": 5.3119, + "step": 35780 + }, + { + "epoch": 0.21279974307736227, + "grad_norm": 1.8561534881591797, + "learning_rate": 4.461862952419934e-05, + "loss": 4.8164, + "step": 35781 + }, + { + "epoch": 0.21280569036064326, + "grad_norm": 1.5282032489776611, + "learning_rate": 4.4618340004450164e-05, + "loss": 4.9202, + "step": 35782 + }, + { + "epoch": 0.21281163764392425, + "grad_norm": 1.5900583267211914, + "learning_rate": 4.4618050477852426e-05, + "loss": 4.9648, + "step": 35783 + }, + { + "epoch": 0.21281758492720526, + "grad_norm": 1.5663514137268066, + "learning_rate": 4.461776094440623e-05, + "loss": 4.7691, + "step": 35784 + }, + { + "epoch": 0.21282353221048625, + "grad_norm": 1.5911515951156616, + "learning_rate": 4.4617471404111665e-05, + "loss": 4.8285, + "step": 35785 + }, + { + "epoch": 0.21282947949376724, + "grad_norm": 1.5293818712234497, + "learning_rate": 4.461718185696886e-05, + "loss": 4.7698, + "step": 35786 + }, + { + "epoch": 0.21283542677704825, + "grad_norm": 1.4674770832061768, + "learning_rate": 4.4616892302977886e-05, + "loss": 4.7565, + "step": 35787 + }, + { + "epoch": 0.21284137406032924, + "grad_norm": 1.7607558965682983, + "learning_rate": 4.461660274213887e-05, + "loss": 4.7961, + "step": 35788 + }, + { + "epoch": 0.21284732134361023, + "grad_norm": 1.4412648677825928, + "learning_rate": 4.461631317445189e-05, + "loss": 4.8314, + "step": 35789 + }, + { + "epoch": 0.21285326862689125, + "grad_norm": 1.4143060445785522, + "learning_rate": 4.461602359991706e-05, + "loss": 4.8198, + "step": 35790 + }, + { + "epoch": 0.21285921591017223, + "grad_norm": 1.4745891094207764, + "learning_rate": 4.4615734018534484e-05, + "loss": 4.8315, + "step": 35791 + }, + { + "epoch": 0.21286516319345322, + "grad_norm": 1.4099732637405396, + "learning_rate": 4.461544443030426e-05, + "loss": 4.8005, + "step": 35792 + }, + { + "epoch": 0.21287111047673424, + "grad_norm": 1.5773065090179443, + "learning_rate": 4.4615154835226474e-05, + "loss": 4.7124, + "step": 35793 + }, + { + "epoch": 0.21287705776001523, + "grad_norm": 1.6054891347885132, + "learning_rate": 4.461486523330125e-05, + "loss": 5.3169, + "step": 35794 + }, + { + "epoch": 0.21288300504329621, + "grad_norm": 1.9800649881362915, + "learning_rate": 4.461457562452868e-05, + "loss": 4.5722, + "step": 35795 + }, + { + "epoch": 0.21288895232657723, + "grad_norm": 1.9030554294586182, + "learning_rate": 4.4614286008908854e-05, + "loss": 4.5221, + "step": 35796 + }, + { + "epoch": 0.21289489960985822, + "grad_norm": 1.8483407497406006, + "learning_rate": 4.4613996386441895e-05, + "loss": 4.7036, + "step": 35797 + }, + { + "epoch": 0.2129008468931392, + "grad_norm": 1.7602765560150146, + "learning_rate": 4.461370675712788e-05, + "loss": 4.5447, + "step": 35798 + }, + { + "epoch": 0.21290679417642022, + "grad_norm": 1.4833706617355347, + "learning_rate": 4.461341712096694e-05, + "loss": 4.6214, + "step": 35799 + }, + { + "epoch": 0.2129127414597012, + "grad_norm": 1.4221755266189575, + "learning_rate": 4.4613127477959146e-05, + "loss": 4.7195, + "step": 35800 + }, + { + "epoch": 0.2129186887429822, + "grad_norm": 1.5532176494598389, + "learning_rate": 4.4612837828104616e-05, + "loss": 4.466, + "step": 35801 + }, + { + "epoch": 0.21292463602626321, + "grad_norm": 1.4992002248764038, + "learning_rate": 4.4612548171403444e-05, + "loss": 4.454, + "step": 35802 + }, + { + "epoch": 0.2129305833095442, + "grad_norm": 1.8023090362548828, + "learning_rate": 4.461225850785574e-05, + "loss": 4.542, + "step": 35803 + }, + { + "epoch": 0.2129365305928252, + "grad_norm": 1.6397573947906494, + "learning_rate": 4.4611968837461595e-05, + "loss": 4.552, + "step": 35804 + }, + { + "epoch": 0.2129424778761062, + "grad_norm": 1.6525506973266602, + "learning_rate": 4.461167916022111e-05, + "loss": 4.5412, + "step": 35805 + }, + { + "epoch": 0.2129484251593872, + "grad_norm": 1.623678207397461, + "learning_rate": 4.46113894761344e-05, + "loss": 4.5214, + "step": 35806 + }, + { + "epoch": 0.21295437244266818, + "grad_norm": 1.4363800287246704, + "learning_rate": 4.461109978520155e-05, + "loss": 4.8244, + "step": 35807 + }, + { + "epoch": 0.2129603197259492, + "grad_norm": 1.6746747493743896, + "learning_rate": 4.461081008742267e-05, + "loss": 4.6021, + "step": 35808 + }, + { + "epoch": 0.2129662670092302, + "grad_norm": 1.4615259170532227, + "learning_rate": 4.4610520382797856e-05, + "loss": 4.5655, + "step": 35809 + }, + { + "epoch": 0.21297221429251117, + "grad_norm": 1.4905229806900024, + "learning_rate": 4.4610230671327215e-05, + "loss": 4.6959, + "step": 35810 + }, + { + "epoch": 0.2129781615757922, + "grad_norm": 2.2128641605377197, + "learning_rate": 4.460994095301084e-05, + "loss": 4.5274, + "step": 35811 + }, + { + "epoch": 0.21298410885907318, + "grad_norm": 1.7724326848983765, + "learning_rate": 4.460965122784885e-05, + "loss": 5.253, + "step": 35812 + }, + { + "epoch": 0.21299005614235417, + "grad_norm": 1.8824642896652222, + "learning_rate": 4.460936149584132e-05, + "loss": 5.3778, + "step": 35813 + }, + { + "epoch": 0.21299600342563518, + "grad_norm": 1.788230538368225, + "learning_rate": 4.460907175698837e-05, + "loss": 5.2686, + "step": 35814 + }, + { + "epoch": 0.21300195070891617, + "grad_norm": 1.6524558067321777, + "learning_rate": 4.46087820112901e-05, + "loss": 5.2969, + "step": 35815 + }, + { + "epoch": 0.21300789799219716, + "grad_norm": 1.5646259784698486, + "learning_rate": 4.460849225874659e-05, + "loss": 5.1971, + "step": 35816 + }, + { + "epoch": 0.21301384527547818, + "grad_norm": 2.1101200580596924, + "learning_rate": 4.460820249935798e-05, + "loss": 4.8999, + "step": 35817 + }, + { + "epoch": 0.21301979255875916, + "grad_norm": 1.7931146621704102, + "learning_rate": 4.460791273312433e-05, + "loss": 5.7204, + "step": 35818 + }, + { + "epoch": 0.21302573984204015, + "grad_norm": 1.9258630275726318, + "learning_rate": 4.460762296004577e-05, + "loss": 5.4044, + "step": 35819 + }, + { + "epoch": 0.21303168712532114, + "grad_norm": 3.792379140853882, + "learning_rate": 4.460733318012239e-05, + "loss": 2.9617, + "step": 35820 + }, + { + "epoch": 0.21303763440860216, + "grad_norm": 3.076469659805298, + "learning_rate": 4.46070433933543e-05, + "loss": 4.0408, + "step": 35821 + }, + { + "epoch": 0.21304358169188314, + "grad_norm": 3.011936902999878, + "learning_rate": 4.460675359974158e-05, + "loss": 2.7702, + "step": 35822 + }, + { + "epoch": 0.21304952897516413, + "grad_norm": 3.1061980724334717, + "learning_rate": 4.460646379928435e-05, + "loss": 2.0251, + "step": 35823 + }, + { + "epoch": 0.21305547625844515, + "grad_norm": 2.8176026344299316, + "learning_rate": 4.460617399198271e-05, + "loss": 3.023, + "step": 35824 + }, + { + "epoch": 0.21306142354172614, + "grad_norm": 3.274871826171875, + "learning_rate": 4.460588417783675e-05, + "loss": 3.5582, + "step": 35825 + }, + { + "epoch": 0.21306737082500712, + "grad_norm": 2.003629684448242, + "learning_rate": 4.4605594356846594e-05, + "loss": 3.5854, + "step": 35826 + }, + { + "epoch": 0.21307331810828814, + "grad_norm": 1.5609272718429565, + "learning_rate": 4.460530452901231e-05, + "loss": 4.5643, + "step": 35827 + }, + { + "epoch": 0.21307926539156913, + "grad_norm": 1.874121904373169, + "learning_rate": 4.4605014694334024e-05, + "loss": 4.8483, + "step": 35828 + }, + { + "epoch": 0.21308521267485012, + "grad_norm": 1.758209228515625, + "learning_rate": 4.460472485281183e-05, + "loss": 4.9385, + "step": 35829 + }, + { + "epoch": 0.21309115995813113, + "grad_norm": 1.755028486251831, + "learning_rate": 4.4604435004445824e-05, + "loss": 5.0858, + "step": 35830 + }, + { + "epoch": 0.21309710724141212, + "grad_norm": 1.6154873371124268, + "learning_rate": 4.460414514923612e-05, + "loss": 4.2717, + "step": 35831 + }, + { + "epoch": 0.2131030545246931, + "grad_norm": 1.7695956230163574, + "learning_rate": 4.4603855287182806e-05, + "loss": 3.6908, + "step": 35832 + }, + { + "epoch": 0.21310900180797412, + "grad_norm": 1.7642066478729248, + "learning_rate": 4.4603565418285996e-05, + "loss": 3.6801, + "step": 35833 + }, + { + "epoch": 0.2131149490912551, + "grad_norm": 1.4951072931289673, + "learning_rate": 4.460327554254578e-05, + "loss": 3.7037, + "step": 35834 + }, + { + "epoch": 0.2131208963745361, + "grad_norm": 1.7861125469207764, + "learning_rate": 4.460298565996226e-05, + "loss": 3.7424, + "step": 35835 + }, + { + "epoch": 0.21312684365781712, + "grad_norm": 2.4425766468048096, + "learning_rate": 4.4602695770535544e-05, + "loss": 4.6082, + "step": 35836 + }, + { + "epoch": 0.2131327909410981, + "grad_norm": 1.7068989276885986, + "learning_rate": 4.460240587426572e-05, + "loss": 3.7878, + "step": 35837 + }, + { + "epoch": 0.2131387382243791, + "grad_norm": 1.6994092464447021, + "learning_rate": 4.4602115971152905e-05, + "loss": 3.781, + "step": 35838 + }, + { + "epoch": 0.2131446855076601, + "grad_norm": 1.6545926332473755, + "learning_rate": 4.46018260611972e-05, + "loss": 3.7755, + "step": 35839 + }, + { + "epoch": 0.2131506327909411, + "grad_norm": 1.654785394668579, + "learning_rate": 4.4601536144398695e-05, + "loss": 3.6985, + "step": 35840 + }, + { + "epoch": 0.21315658007422209, + "grad_norm": 2.636845588684082, + "learning_rate": 4.460124622075749e-05, + "loss": 4.2517, + "step": 35841 + }, + { + "epoch": 0.2131625273575031, + "grad_norm": 2.595813751220703, + "learning_rate": 4.46009562902737e-05, + "loss": 4.2488, + "step": 35842 + }, + { + "epoch": 0.2131684746407841, + "grad_norm": 2.3771018981933594, + "learning_rate": 4.4600666352947416e-05, + "loss": 4.1553, + "step": 35843 + }, + { + "epoch": 0.21317442192406508, + "grad_norm": 2.2217776775360107, + "learning_rate": 4.4600376408778746e-05, + "loss": 4.1849, + "step": 35844 + }, + { + "epoch": 0.2131803692073461, + "grad_norm": 2.8838157653808594, + "learning_rate": 4.4600086457767784e-05, + "loss": 4.0867, + "step": 35845 + }, + { + "epoch": 0.21318631649062708, + "grad_norm": 2.105971574783325, + "learning_rate": 4.459979649991464e-05, + "loss": 4.3621, + "step": 35846 + }, + { + "epoch": 0.21319226377390807, + "grad_norm": 2.231476306915283, + "learning_rate": 4.45995065352194e-05, + "loss": 4.2008, + "step": 35847 + }, + { + "epoch": 0.21319821105718909, + "grad_norm": 2.104140281677246, + "learning_rate": 4.459921656368218e-05, + "loss": 3.914, + "step": 35848 + }, + { + "epoch": 0.21320415834047007, + "grad_norm": 2.1466448307037354, + "learning_rate": 4.459892658530307e-05, + "loss": 4.1908, + "step": 35849 + }, + { + "epoch": 0.21321010562375106, + "grad_norm": 2.4501988887786865, + "learning_rate": 4.459863660008218e-05, + "loss": 4.3681, + "step": 35850 + }, + { + "epoch": 0.21321605290703208, + "grad_norm": 2.4947612285614014, + "learning_rate": 4.459834660801961e-05, + "loss": 4.2168, + "step": 35851 + }, + { + "epoch": 0.21322200019031307, + "grad_norm": 2.237306594848633, + "learning_rate": 4.459805660911546e-05, + "loss": 4.2742, + "step": 35852 + }, + { + "epoch": 0.21322794747359405, + "grad_norm": 2.4983670711517334, + "learning_rate": 4.4597766603369834e-05, + "loss": 4.204, + "step": 35853 + }, + { + "epoch": 0.21323389475687507, + "grad_norm": 2.3211803436279297, + "learning_rate": 4.459747659078283e-05, + "loss": 4.1786, + "step": 35854 + }, + { + "epoch": 0.21323984204015606, + "grad_norm": 2.4706544876098633, + "learning_rate": 4.4597186571354544e-05, + "loss": 4.3427, + "step": 35855 + }, + { + "epoch": 0.21324578932343705, + "grad_norm": 2.552676200866699, + "learning_rate": 4.4596896545085084e-05, + "loss": 4.3238, + "step": 35856 + }, + { + "epoch": 0.21325173660671806, + "grad_norm": 2.366426467895508, + "learning_rate": 4.459660651197455e-05, + "loss": 4.4553, + "step": 35857 + }, + { + "epoch": 0.21325768388999905, + "grad_norm": 2.8086371421813965, + "learning_rate": 4.4596316472023044e-05, + "loss": 4.2748, + "step": 35858 + }, + { + "epoch": 0.21326363117328004, + "grad_norm": 2.2683427333831787, + "learning_rate": 4.459602642523067e-05, + "loss": 4.1705, + "step": 35859 + }, + { + "epoch": 0.21326957845656105, + "grad_norm": 2.0883960723876953, + "learning_rate": 4.459573637159752e-05, + "loss": 4.2445, + "step": 35860 + }, + { + "epoch": 0.21327552573984204, + "grad_norm": 2.2819952964782715, + "learning_rate": 4.45954463111237e-05, + "loss": 4.4289, + "step": 35861 + }, + { + "epoch": 0.21328147302312303, + "grad_norm": 2.1826071739196777, + "learning_rate": 4.459515624380932e-05, + "loss": 4.1867, + "step": 35862 + }, + { + "epoch": 0.21328742030640405, + "grad_norm": 2.458500623703003, + "learning_rate": 4.459486616965447e-05, + "loss": 4.2723, + "step": 35863 + }, + { + "epoch": 0.21329336758968503, + "grad_norm": 2.137686252593994, + "learning_rate": 4.459457608865925e-05, + "loss": 4.2249, + "step": 35864 + }, + { + "epoch": 0.21329931487296602, + "grad_norm": 1.9973599910736084, + "learning_rate": 4.459428600082377e-05, + "loss": 4.2212, + "step": 35865 + }, + { + "epoch": 0.21330526215624704, + "grad_norm": 2.1852917671203613, + "learning_rate": 4.459399590614813e-05, + "loss": 4.1206, + "step": 35866 + }, + { + "epoch": 0.21331120943952803, + "grad_norm": 2.2127127647399902, + "learning_rate": 4.459370580463242e-05, + "loss": 4.2551, + "step": 35867 + }, + { + "epoch": 0.21331715672280901, + "grad_norm": 2.2800424098968506, + "learning_rate": 4.459341569627675e-05, + "loss": 4.142, + "step": 35868 + }, + { + "epoch": 0.21332310400609003, + "grad_norm": 2.3102056980133057, + "learning_rate": 4.459312558108123e-05, + "loss": 4.1007, + "step": 35869 + }, + { + "epoch": 0.21332905128937102, + "grad_norm": 2.5117461681365967, + "learning_rate": 4.459283545904595e-05, + "loss": 3.9965, + "step": 35870 + }, + { + "epoch": 0.213334998572652, + "grad_norm": 2.34240460395813, + "learning_rate": 4.459254533017101e-05, + "loss": 4.0532, + "step": 35871 + }, + { + "epoch": 0.21334094585593302, + "grad_norm": 2.803379774093628, + "learning_rate": 4.459225519445652e-05, + "loss": 3.9698, + "step": 35872 + }, + { + "epoch": 0.213346893139214, + "grad_norm": 2.3026621341705322, + "learning_rate": 4.4591965051902574e-05, + "loss": 4.1482, + "step": 35873 + }, + { + "epoch": 0.213352840422495, + "grad_norm": 1.9966895580291748, + "learning_rate": 4.459167490250927e-05, + "loss": 4.3117, + "step": 35874 + }, + { + "epoch": 0.21335878770577602, + "grad_norm": 3.08508563041687, + "learning_rate": 4.459138474627672e-05, + "loss": 3.4137, + "step": 35875 + }, + { + "epoch": 0.213364734989057, + "grad_norm": 3.1647846698760986, + "learning_rate": 4.459109458320502e-05, + "loss": 3.6346, + "step": 35876 + }, + { + "epoch": 0.213370682272338, + "grad_norm": 2.066763162612915, + "learning_rate": 4.459080441329426e-05, + "loss": 4.5963, + "step": 35877 + }, + { + "epoch": 0.21337662955561898, + "grad_norm": 1.815376877784729, + "learning_rate": 4.4590514236544567e-05, + "loss": 4.8914, + "step": 35878 + }, + { + "epoch": 0.2133825768389, + "grad_norm": 2.9240071773529053, + "learning_rate": 4.459022405295602e-05, + "loss": 3.5932, + "step": 35879 + }, + { + "epoch": 0.21338852412218098, + "grad_norm": 2.881493330001831, + "learning_rate": 4.458993386252874e-05, + "loss": 3.2032, + "step": 35880 + }, + { + "epoch": 0.21339447140546197, + "grad_norm": 2.6276941299438477, + "learning_rate": 4.45896436652628e-05, + "loss": 3.2093, + "step": 35881 + }, + { + "epoch": 0.213400418688743, + "grad_norm": 2.8940045833587646, + "learning_rate": 4.4589353461158335e-05, + "loss": 4.2197, + "step": 35882 + }, + { + "epoch": 0.21340636597202398, + "grad_norm": 2.8076045513153076, + "learning_rate": 4.458906325021541e-05, + "loss": 3.1704, + "step": 35883 + }, + { + "epoch": 0.21341231325530496, + "grad_norm": 2.643134117126465, + "learning_rate": 4.458877303243416e-05, + "loss": 3.3753, + "step": 35884 + }, + { + "epoch": 0.21341826053858598, + "grad_norm": 2.9000542163848877, + "learning_rate": 4.458848280781467e-05, + "loss": 3.6017, + "step": 35885 + }, + { + "epoch": 0.21342420782186697, + "grad_norm": 2.967768907546997, + "learning_rate": 4.4588192576357036e-05, + "loss": 3.2405, + "step": 35886 + }, + { + "epoch": 0.21343015510514796, + "grad_norm": 2.1402149200439453, + "learning_rate": 4.458790233806137e-05, + "loss": 4.0389, + "step": 35887 + }, + { + "epoch": 0.21343610238842897, + "grad_norm": 1.6989480257034302, + "learning_rate": 4.4587612092927774e-05, + "loss": 4.1777, + "step": 35888 + }, + { + "epoch": 0.21344204967170996, + "grad_norm": 1.930235743522644, + "learning_rate": 4.4587321840956336e-05, + "loss": 4.243, + "step": 35889 + }, + { + "epoch": 0.21344799695499095, + "grad_norm": 1.681248426437378, + "learning_rate": 4.4587031582147174e-05, + "loss": 4.3933, + "step": 35890 + }, + { + "epoch": 0.21345394423827196, + "grad_norm": 1.8064907789230347, + "learning_rate": 4.458674131650038e-05, + "loss": 5.1875, + "step": 35891 + }, + { + "epoch": 0.21345989152155295, + "grad_norm": 1.755428671836853, + "learning_rate": 4.458645104401605e-05, + "loss": 4.5157, + "step": 35892 + }, + { + "epoch": 0.21346583880483394, + "grad_norm": 1.8016186952590942, + "learning_rate": 4.45861607646943e-05, + "loss": 4.1718, + "step": 35893 + }, + { + "epoch": 0.21347178608811496, + "grad_norm": 1.8302110433578491, + "learning_rate": 4.458587047853522e-05, + "loss": 4.2413, + "step": 35894 + }, + { + "epoch": 0.21347773337139594, + "grad_norm": 1.8271868228912354, + "learning_rate": 4.458558018553892e-05, + "loss": 4.2252, + "step": 35895 + }, + { + "epoch": 0.21348368065467693, + "grad_norm": 1.774984359741211, + "learning_rate": 4.4585289885705495e-05, + "loss": 4.1691, + "step": 35896 + }, + { + "epoch": 0.21348962793795795, + "grad_norm": 1.678552269935608, + "learning_rate": 4.458499957903505e-05, + "loss": 4.4492, + "step": 35897 + }, + { + "epoch": 0.21349557522123894, + "grad_norm": 1.810869812965393, + "learning_rate": 4.458470926552767e-05, + "loss": 4.302, + "step": 35898 + }, + { + "epoch": 0.21350152250451992, + "grad_norm": 1.6837462186813354, + "learning_rate": 4.458441894518348e-05, + "loss": 4.2768, + "step": 35899 + }, + { + "epoch": 0.21350746978780094, + "grad_norm": 1.5244439840316772, + "learning_rate": 4.458412861800257e-05, + "loss": 4.4128, + "step": 35900 + }, + { + "epoch": 0.21351341707108193, + "grad_norm": 1.6847096681594849, + "learning_rate": 4.4583838283985043e-05, + "loss": 4.0461, + "step": 35901 + }, + { + "epoch": 0.21351936435436292, + "grad_norm": 1.9673925638198853, + "learning_rate": 4.458354794313101e-05, + "loss": 4.11, + "step": 35902 + }, + { + "epoch": 0.21352531163764393, + "grad_norm": 1.8580288887023926, + "learning_rate": 4.458325759544055e-05, + "loss": 4.294, + "step": 35903 + }, + { + "epoch": 0.21353125892092492, + "grad_norm": 1.7768099308013916, + "learning_rate": 4.458296724091379e-05, + "loss": 4.358, + "step": 35904 + }, + { + "epoch": 0.2135372062042059, + "grad_norm": 1.7895678281784058, + "learning_rate": 4.45826768795508e-05, + "loss": 4.3601, + "step": 35905 + }, + { + "epoch": 0.21354315348748693, + "grad_norm": 1.8596075773239136, + "learning_rate": 4.4582386511351714e-05, + "loss": 4.8139, + "step": 35906 + }, + { + "epoch": 0.2135491007707679, + "grad_norm": 1.6231931447982788, + "learning_rate": 4.4582096136316614e-05, + "loss": 5.1909, + "step": 35907 + }, + { + "epoch": 0.2135550480540489, + "grad_norm": 1.7343584299087524, + "learning_rate": 4.458180575444561e-05, + "loss": 4.6373, + "step": 35908 + }, + { + "epoch": 0.21356099533732992, + "grad_norm": 1.5508018732070923, + "learning_rate": 4.45815153657388e-05, + "loss": 4.7082, + "step": 35909 + }, + { + "epoch": 0.2135669426206109, + "grad_norm": 1.599245548248291, + "learning_rate": 4.458122497019628e-05, + "loss": 4.8541, + "step": 35910 + }, + { + "epoch": 0.2135728899038919, + "grad_norm": 1.624755859375, + "learning_rate": 4.4580934567818164e-05, + "loss": 5.0132, + "step": 35911 + }, + { + "epoch": 0.2135788371871729, + "grad_norm": 1.705117106437683, + "learning_rate": 4.458064415860454e-05, + "loss": 5.1333, + "step": 35912 + }, + { + "epoch": 0.2135847844704539, + "grad_norm": 1.5134979486465454, + "learning_rate": 4.4580353742555515e-05, + "loss": 4.9791, + "step": 35913 + }, + { + "epoch": 0.21359073175373489, + "grad_norm": 1.5711792707443237, + "learning_rate": 4.458006331967119e-05, + "loss": 4.8364, + "step": 35914 + }, + { + "epoch": 0.2135966790370159, + "grad_norm": 1.481801986694336, + "learning_rate": 4.457977288995168e-05, + "loss": 4.897, + "step": 35915 + }, + { + "epoch": 0.2136026263202969, + "grad_norm": 1.4712084531784058, + "learning_rate": 4.457948245339706e-05, + "loss": 4.7349, + "step": 35916 + }, + { + "epoch": 0.21360857360357788, + "grad_norm": 1.7659448385238647, + "learning_rate": 4.457919201000745e-05, + "loss": 5.0048, + "step": 35917 + }, + { + "epoch": 0.2136145208868589, + "grad_norm": 1.469093918800354, + "learning_rate": 4.4578901559782944e-05, + "loss": 5.0344, + "step": 35918 + }, + { + "epoch": 0.21362046817013988, + "grad_norm": 1.451316237449646, + "learning_rate": 4.457861110272365e-05, + "loss": 4.7897, + "step": 35919 + }, + { + "epoch": 0.21362641545342087, + "grad_norm": 1.4311710596084595, + "learning_rate": 4.4578320638829655e-05, + "loss": 4.8051, + "step": 35920 + }, + { + "epoch": 0.2136323627367019, + "grad_norm": 1.1662811040878296, + "learning_rate": 4.457803016810108e-05, + "loss": 5.0022, + "step": 35921 + }, + { + "epoch": 0.21363831001998287, + "grad_norm": 1.424172282218933, + "learning_rate": 4.457773969053801e-05, + "loss": 4.921, + "step": 35922 + }, + { + "epoch": 0.21364425730326386, + "grad_norm": 1.396257758140564, + "learning_rate": 4.4577449206140564e-05, + "loss": 5.1584, + "step": 35923 + }, + { + "epoch": 0.21365020458654488, + "grad_norm": 1.4995923042297363, + "learning_rate": 4.457715871490882e-05, + "loss": 5.0498, + "step": 35924 + }, + { + "epoch": 0.21365615186982587, + "grad_norm": 1.8978110551834106, + "learning_rate": 4.4576868216842904e-05, + "loss": 4.8733, + "step": 35925 + }, + { + "epoch": 0.21366209915310685, + "grad_norm": 1.7578836679458618, + "learning_rate": 4.45765777119429e-05, + "loss": 4.9622, + "step": 35926 + }, + { + "epoch": 0.21366804643638787, + "grad_norm": 1.5519132614135742, + "learning_rate": 4.457628720020891e-05, + "loss": 4.8019, + "step": 35927 + }, + { + "epoch": 0.21367399371966886, + "grad_norm": 2.0452818870544434, + "learning_rate": 4.4575996681641054e-05, + "loss": 4.9459, + "step": 35928 + }, + { + "epoch": 0.21367994100294985, + "grad_norm": 1.705398678779602, + "learning_rate": 4.4575706156239405e-05, + "loss": 5.2987, + "step": 35929 + }, + { + "epoch": 0.21368588828623086, + "grad_norm": 1.7086260318756104, + "learning_rate": 4.457541562400409e-05, + "loss": 4.7203, + "step": 35930 + }, + { + "epoch": 0.21369183556951185, + "grad_norm": 1.47868812084198, + "learning_rate": 4.4575125084935186e-05, + "loss": 5.1416, + "step": 35931 + }, + { + "epoch": 0.21369778285279284, + "grad_norm": 1.5206907987594604, + "learning_rate": 4.4574834539032826e-05, + "loss": 5.2395, + "step": 35932 + }, + { + "epoch": 0.21370373013607386, + "grad_norm": 1.540887713432312, + "learning_rate": 4.457454398629708e-05, + "loss": 5.071, + "step": 35933 + }, + { + "epoch": 0.21370967741935484, + "grad_norm": 1.546628713607788, + "learning_rate": 4.4574253426728066e-05, + "loss": 5.1932, + "step": 35934 + }, + { + "epoch": 0.21371562470263583, + "grad_norm": 1.783042550086975, + "learning_rate": 4.457396286032589e-05, + "loss": 5.0621, + "step": 35935 + }, + { + "epoch": 0.21372157198591682, + "grad_norm": 1.8897498846054077, + "learning_rate": 4.4573672287090637e-05, + "loss": 4.7175, + "step": 35936 + }, + { + "epoch": 0.21372751926919784, + "grad_norm": 1.9781421422958374, + "learning_rate": 4.457338170702242e-05, + "loss": 4.7669, + "step": 35937 + }, + { + "epoch": 0.21373346655247882, + "grad_norm": 1.9311988353729248, + "learning_rate": 4.457309112012134e-05, + "loss": 5.278, + "step": 35938 + }, + { + "epoch": 0.2137394138357598, + "grad_norm": 1.77422034740448, + "learning_rate": 4.4572800526387495e-05, + "loss": 4.733, + "step": 35939 + }, + { + "epoch": 0.21374536111904083, + "grad_norm": 1.7609598636627197, + "learning_rate": 4.457250992582098e-05, + "loss": 4.5193, + "step": 35940 + }, + { + "epoch": 0.21375130840232182, + "grad_norm": 1.7665215730667114, + "learning_rate": 4.4572219318421916e-05, + "loss": 4.4299, + "step": 35941 + }, + { + "epoch": 0.2137572556856028, + "grad_norm": 1.8990488052368164, + "learning_rate": 4.4571928704190384e-05, + "loss": 4.395, + "step": 35942 + }, + { + "epoch": 0.21376320296888382, + "grad_norm": 1.751051425933838, + "learning_rate": 4.45716380831265e-05, + "loss": 4.5123, + "step": 35943 + }, + { + "epoch": 0.2137691502521648, + "grad_norm": 2.0244052410125732, + "learning_rate": 4.4571347455230356e-05, + "loss": 5.1076, + "step": 35944 + }, + { + "epoch": 0.2137750975354458, + "grad_norm": 1.783740520477295, + "learning_rate": 4.4571056820502056e-05, + "loss": 5.2825, + "step": 35945 + }, + { + "epoch": 0.2137810448187268, + "grad_norm": 1.5837445259094238, + "learning_rate": 4.4570766178941704e-05, + "loss": 5.2201, + "step": 35946 + }, + { + "epoch": 0.2137869921020078, + "grad_norm": 1.6283888816833496, + "learning_rate": 4.4570475530549394e-05, + "loss": 5.2683, + "step": 35947 + }, + { + "epoch": 0.2137929393852888, + "grad_norm": 1.8020168542861938, + "learning_rate": 4.4570184875325235e-05, + "loss": 5.0123, + "step": 35948 + }, + { + "epoch": 0.2137988866685698, + "grad_norm": 1.8496737480163574, + "learning_rate": 4.4569894213269335e-05, + "loss": 4.9256, + "step": 35949 + }, + { + "epoch": 0.2138048339518508, + "grad_norm": 1.6990101337432861, + "learning_rate": 4.456960354438178e-05, + "loss": 4.5923, + "step": 35950 + }, + { + "epoch": 0.21381078123513178, + "grad_norm": 2.2053756713867188, + "learning_rate": 4.4569312868662686e-05, + "loss": 4.8017, + "step": 35951 + }, + { + "epoch": 0.2138167285184128, + "grad_norm": 1.7671394348144531, + "learning_rate": 4.456902218611214e-05, + "loss": 4.7598, + "step": 35952 + }, + { + "epoch": 0.21382267580169378, + "grad_norm": 1.6970982551574707, + "learning_rate": 4.456873149673025e-05, + "loss": 4.7862, + "step": 35953 + }, + { + "epoch": 0.21382862308497477, + "grad_norm": 1.9007402658462524, + "learning_rate": 4.456844080051712e-05, + "loss": 4.7001, + "step": 35954 + }, + { + "epoch": 0.2138345703682558, + "grad_norm": 1.4777690172195435, + "learning_rate": 4.4568150097472846e-05, + "loss": 4.6834, + "step": 35955 + }, + { + "epoch": 0.21384051765153678, + "grad_norm": 1.5025019645690918, + "learning_rate": 4.4567859387597545e-05, + "loss": 4.7556, + "step": 35956 + }, + { + "epoch": 0.21384646493481776, + "grad_norm": 1.7506788969039917, + "learning_rate": 4.456756867089129e-05, + "loss": 4.7992, + "step": 35957 + }, + { + "epoch": 0.21385241221809878, + "grad_norm": 1.5796469449996948, + "learning_rate": 4.4567277947354215e-05, + "loss": 4.6379, + "step": 35958 + }, + { + "epoch": 0.21385835950137977, + "grad_norm": 1.9620283842086792, + "learning_rate": 4.456698721698639e-05, + "loss": 4.6841, + "step": 35959 + }, + { + "epoch": 0.21386430678466076, + "grad_norm": 2.463955879211426, + "learning_rate": 4.456669647978794e-05, + "loss": 4.2983, + "step": 35960 + }, + { + "epoch": 0.21387025406794177, + "grad_norm": 1.619341492652893, + "learning_rate": 4.456640573575896e-05, + "loss": 4.6181, + "step": 35961 + }, + { + "epoch": 0.21387620135122276, + "grad_norm": 1.8905354738235474, + "learning_rate": 4.456611498489955e-05, + "loss": 5.2374, + "step": 35962 + }, + { + "epoch": 0.21388214863450375, + "grad_norm": 1.745563268661499, + "learning_rate": 4.456582422720981e-05, + "loss": 5.1386, + "step": 35963 + }, + { + "epoch": 0.21388809591778477, + "grad_norm": 2.177422046661377, + "learning_rate": 4.456553346268983e-05, + "loss": 4.2874, + "step": 35964 + }, + { + "epoch": 0.21389404320106575, + "grad_norm": 2.8467772006988525, + "learning_rate": 4.4565242691339746e-05, + "loss": 3.5243, + "step": 35965 + }, + { + "epoch": 0.21389999048434674, + "grad_norm": 2.371044635772705, + "learning_rate": 4.456495191315963e-05, + "loss": 3.4414, + "step": 35966 + }, + { + "epoch": 0.21390593776762776, + "grad_norm": 2.501070737838745, + "learning_rate": 4.456466112814959e-05, + "loss": 3.2322, + "step": 35967 + }, + { + "epoch": 0.21391188505090875, + "grad_norm": 2.228771448135376, + "learning_rate": 4.456437033630972e-05, + "loss": 3.1681, + "step": 35968 + }, + { + "epoch": 0.21391783233418973, + "grad_norm": 2.327960729598999, + "learning_rate": 4.456407953764015e-05, + "loss": 3.8851, + "step": 35969 + }, + { + "epoch": 0.21392377961747075, + "grad_norm": 1.7042254209518433, + "learning_rate": 4.456378873214094e-05, + "loss": 4.8426, + "step": 35970 + }, + { + "epoch": 0.21392972690075174, + "grad_norm": 3.039768695831299, + "learning_rate": 4.456349791981223e-05, + "loss": 3.2656, + "step": 35971 + }, + { + "epoch": 0.21393567418403273, + "grad_norm": 4.14467191696167, + "learning_rate": 4.45632071006541e-05, + "loss": 2.8295, + "step": 35972 + }, + { + "epoch": 0.21394162146731374, + "grad_norm": 3.6734323501586914, + "learning_rate": 4.456291627466665e-05, + "loss": 2.6344, + "step": 35973 + }, + { + "epoch": 0.21394756875059473, + "grad_norm": 3.536844491958618, + "learning_rate": 4.4562625441849994e-05, + "loss": 2.5706, + "step": 35974 + }, + { + "epoch": 0.21395351603387572, + "grad_norm": 2.9864909648895264, + "learning_rate": 4.456233460220423e-05, + "loss": 2.7415, + "step": 35975 + }, + { + "epoch": 0.21395946331715673, + "grad_norm": 3.0675835609436035, + "learning_rate": 4.456204375572945e-05, + "loss": 3.1035, + "step": 35976 + }, + { + "epoch": 0.21396541060043772, + "grad_norm": 3.9746904373168945, + "learning_rate": 4.456175290242577e-05, + "loss": 3.0007, + "step": 35977 + }, + { + "epoch": 0.2139713578837187, + "grad_norm": 5.928574562072754, + "learning_rate": 4.456146204229328e-05, + "loss": 2.6625, + "step": 35978 + }, + { + "epoch": 0.21397730516699973, + "grad_norm": 4.5435309410095215, + "learning_rate": 4.45611711753321e-05, + "loss": 2.2242, + "step": 35979 + }, + { + "epoch": 0.21398325245028071, + "grad_norm": 4.220280647277832, + "learning_rate": 4.4560880301542293e-05, + "loss": 2.2737, + "step": 35980 + }, + { + "epoch": 0.2139891997335617, + "grad_norm": 4.68203592300415, + "learning_rate": 4.4560589420923995e-05, + "loss": 2.8215, + "step": 35981 + }, + { + "epoch": 0.21399514701684272, + "grad_norm": 4.116830348968506, + "learning_rate": 4.4560298533477304e-05, + "loss": 2.6675, + "step": 35982 + }, + { + "epoch": 0.2140010943001237, + "grad_norm": 3.708685874938965, + "learning_rate": 4.456000763920231e-05, + "loss": 2.6958, + "step": 35983 + }, + { + "epoch": 0.2140070415834047, + "grad_norm": 3.6551620960235596, + "learning_rate": 4.455971673809912e-05, + "loss": 2.498, + "step": 35984 + }, + { + "epoch": 0.2140129888666857, + "grad_norm": 3.9570088386535645, + "learning_rate": 4.455942583016783e-05, + "loss": 2.1696, + "step": 35985 + }, + { + "epoch": 0.2140189361499667, + "grad_norm": 3.8783881664276123, + "learning_rate": 4.455913491540855e-05, + "loss": 3.3353, + "step": 35986 + }, + { + "epoch": 0.2140248834332477, + "grad_norm": 3.2755517959594727, + "learning_rate": 4.4558843993821374e-05, + "loss": 3.4415, + "step": 35987 + }, + { + "epoch": 0.2140308307165287, + "grad_norm": 3.13720965385437, + "learning_rate": 4.4558553065406416e-05, + "loss": 2.4986, + "step": 35988 + }, + { + "epoch": 0.2140367779998097, + "grad_norm": 3.374295473098755, + "learning_rate": 4.455826213016376e-05, + "loss": 1.6225, + "step": 35989 + }, + { + "epoch": 0.21404272528309068, + "grad_norm": 2.5789332389831543, + "learning_rate": 4.455797118809353e-05, + "loss": 1.8221, + "step": 35990 + }, + { + "epoch": 0.2140486725663717, + "grad_norm": 1.7907307147979736, + "learning_rate": 4.45576802391958e-05, + "loss": 5.1448, + "step": 35991 + }, + { + "epoch": 0.21405461984965268, + "grad_norm": 1.7972261905670166, + "learning_rate": 4.4557389283470694e-05, + "loss": 5.1784, + "step": 35992 + }, + { + "epoch": 0.21406056713293367, + "grad_norm": 1.7984882593154907, + "learning_rate": 4.455709832091831e-05, + "loss": 5.1594, + "step": 35993 + }, + { + "epoch": 0.21406651441621466, + "grad_norm": 1.9921157360076904, + "learning_rate": 4.455680735153873e-05, + "loss": 5.0676, + "step": 35994 + }, + { + "epoch": 0.21407246169949568, + "grad_norm": 2.3399744033813477, + "learning_rate": 4.455651637533208e-05, + "loss": 4.9908, + "step": 35995 + }, + { + "epoch": 0.21407840898277666, + "grad_norm": 2.1457231044769287, + "learning_rate": 4.455622539229846e-05, + "loss": 4.7422, + "step": 35996 + }, + { + "epoch": 0.21408435626605765, + "grad_norm": 2.49147629737854, + "learning_rate": 4.455593440243795e-05, + "loss": 4.384, + "step": 35997 + }, + { + "epoch": 0.21409030354933867, + "grad_norm": 3.087649345397949, + "learning_rate": 4.455564340575067e-05, + "loss": 4.1671, + "step": 35998 + }, + { + "epoch": 0.21409625083261966, + "grad_norm": 2.1485769748687744, + "learning_rate": 4.4555352402236715e-05, + "loss": 4.7317, + "step": 35999 + }, + { + "epoch": 0.21410219811590064, + "grad_norm": 1.8602612018585205, + "learning_rate": 4.455506139189619e-05, + "loss": 4.6042, + "step": 36000 + }, + { + "epoch": 0.21410814539918166, + "grad_norm": 2.006908655166626, + "learning_rate": 4.45547703747292e-05, + "loss": 4.5201, + "step": 36001 + }, + { + "epoch": 0.21411409268246265, + "grad_norm": 1.8241304159164429, + "learning_rate": 4.4554479350735836e-05, + "loss": 4.5176, + "step": 36002 + }, + { + "epoch": 0.21412003996574364, + "grad_norm": 1.69816255569458, + "learning_rate": 4.455418831991621e-05, + "loss": 4.4631, + "step": 36003 + }, + { + "epoch": 0.21412598724902465, + "grad_norm": 1.821662425994873, + "learning_rate": 4.4553897282270415e-05, + "loss": 4.3294, + "step": 36004 + }, + { + "epoch": 0.21413193453230564, + "grad_norm": 1.6003782749176025, + "learning_rate": 4.455360623779855e-05, + "loss": 4.9333, + "step": 36005 + }, + { + "epoch": 0.21413788181558663, + "grad_norm": 1.675066351890564, + "learning_rate": 4.455331518650073e-05, + "loss": 5.4692, + "step": 36006 + }, + { + "epoch": 0.21414382909886764, + "grad_norm": 1.792047142982483, + "learning_rate": 4.455302412837705e-05, + "loss": 4.8869, + "step": 36007 + }, + { + "epoch": 0.21414977638214863, + "grad_norm": 2.5633020401000977, + "learning_rate": 4.455273306342762e-05, + "loss": 3.7999, + "step": 36008 + }, + { + "epoch": 0.21415572366542962, + "grad_norm": 2.494217872619629, + "learning_rate": 4.455244199165252e-05, + "loss": 3.7959, + "step": 36009 + }, + { + "epoch": 0.21416167094871064, + "grad_norm": 2.7078194618225098, + "learning_rate": 4.4552150913051874e-05, + "loss": 4.072, + "step": 36010 + }, + { + "epoch": 0.21416761823199162, + "grad_norm": 2.8123793601989746, + "learning_rate": 4.4551859827625766e-05, + "loss": 3.8776, + "step": 36011 + }, + { + "epoch": 0.2141735655152726, + "grad_norm": 2.320986270904541, + "learning_rate": 4.455156873537431e-05, + "loss": 3.9132, + "step": 36012 + }, + { + "epoch": 0.21417951279855363, + "grad_norm": 2.0167579650878906, + "learning_rate": 4.455127763629759e-05, + "loss": 4.1274, + "step": 36013 + }, + { + "epoch": 0.21418546008183462, + "grad_norm": 1.4378185272216797, + "learning_rate": 4.4550986530395744e-05, + "loss": 5.4604, + "step": 36014 + }, + { + "epoch": 0.2141914073651156, + "grad_norm": 1.6383719444274902, + "learning_rate": 4.4550695417668836e-05, + "loss": 4.9049, + "step": 36015 + }, + { + "epoch": 0.21419735464839662, + "grad_norm": 2.096296548843384, + "learning_rate": 4.455040429811699e-05, + "loss": 4.8674, + "step": 36016 + }, + { + "epoch": 0.2142033019316776, + "grad_norm": 3.1170642375946045, + "learning_rate": 4.4550113171740295e-05, + "loss": 3.7861, + "step": 36017 + }, + { + "epoch": 0.2142092492149586, + "grad_norm": 2.940704822540283, + "learning_rate": 4.454982203853886e-05, + "loss": 3.4928, + "step": 36018 + }, + { + "epoch": 0.2142151964982396, + "grad_norm": 2.2609074115753174, + "learning_rate": 4.4549530898512784e-05, + "loss": 4.6711, + "step": 36019 + }, + { + "epoch": 0.2142211437815206, + "grad_norm": 1.8117674589157104, + "learning_rate": 4.454923975166216e-05, + "loss": 4.8602, + "step": 36020 + }, + { + "epoch": 0.2142270910648016, + "grad_norm": 2.5625758171081543, + "learning_rate": 4.454894859798711e-05, + "loss": 4.5147, + "step": 36021 + }, + { + "epoch": 0.2142330383480826, + "grad_norm": 2.0803611278533936, + "learning_rate": 4.454865743748772e-05, + "loss": 4.1966, + "step": 36022 + }, + { + "epoch": 0.2142389856313636, + "grad_norm": 2.294630527496338, + "learning_rate": 4.454836627016409e-05, + "loss": 4.3761, + "step": 36023 + }, + { + "epoch": 0.21424493291464458, + "grad_norm": 2.7524173259735107, + "learning_rate": 4.454807509601633e-05, + "loss": 4.1234, + "step": 36024 + }, + { + "epoch": 0.2142508801979256, + "grad_norm": 2.5124104022979736, + "learning_rate": 4.454778391504454e-05, + "loss": 4.0255, + "step": 36025 + }, + { + "epoch": 0.21425682748120659, + "grad_norm": 2.565599203109741, + "learning_rate": 4.4547492727248826e-05, + "loss": 4.0547, + "step": 36026 + }, + { + "epoch": 0.21426277476448757, + "grad_norm": 2.376383066177368, + "learning_rate": 4.454720153262928e-05, + "loss": 4.0174, + "step": 36027 + }, + { + "epoch": 0.2142687220477686, + "grad_norm": 2.1378703117370605, + "learning_rate": 4.454691033118601e-05, + "loss": 3.9006, + "step": 36028 + }, + { + "epoch": 0.21427466933104958, + "grad_norm": 2.2482197284698486, + "learning_rate": 4.454661912291911e-05, + "loss": 3.9213, + "step": 36029 + }, + { + "epoch": 0.21428061661433057, + "grad_norm": 2.2354705333709717, + "learning_rate": 4.454632790782869e-05, + "loss": 3.9075, + "step": 36030 + }, + { + "epoch": 0.21428656389761158, + "grad_norm": 2.235800266265869, + "learning_rate": 4.454603668591485e-05, + "loss": 3.8096, + "step": 36031 + }, + { + "epoch": 0.21429251118089257, + "grad_norm": 2.11818528175354, + "learning_rate": 4.454574545717769e-05, + "loss": 4.4192, + "step": 36032 + }, + { + "epoch": 0.21429845846417356, + "grad_norm": 1.4688904285430908, + "learning_rate": 4.454545422161731e-05, + "loss": 5.1884, + "step": 36033 + }, + { + "epoch": 0.21430440574745457, + "grad_norm": 1.7121012210845947, + "learning_rate": 4.4545162979233815e-05, + "loss": 4.9336, + "step": 36034 + }, + { + "epoch": 0.21431035303073556, + "grad_norm": 1.6486331224441528, + "learning_rate": 4.454487173002731e-05, + "loss": 5.097, + "step": 36035 + }, + { + "epoch": 0.21431630031401655, + "grad_norm": 1.6203352212905884, + "learning_rate": 4.454458047399789e-05, + "loss": 4.97, + "step": 36036 + }, + { + "epoch": 0.21432224759729757, + "grad_norm": 2.226794719696045, + "learning_rate": 4.454428921114565e-05, + "loss": 3.9059, + "step": 36037 + }, + { + "epoch": 0.21432819488057855, + "grad_norm": 2.4930450916290283, + "learning_rate": 4.4543997941470715e-05, + "loss": 2.5628, + "step": 36038 + }, + { + "epoch": 0.21433414216385954, + "grad_norm": 2.5009028911590576, + "learning_rate": 4.4543706664973164e-05, + "loss": 3.21, + "step": 36039 + }, + { + "epoch": 0.21434008944714056, + "grad_norm": 2.357839822769165, + "learning_rate": 4.454341538165311e-05, + "loss": 2.7942, + "step": 36040 + }, + { + "epoch": 0.21434603673042155, + "grad_norm": 2.6789627075195312, + "learning_rate": 4.4543124091510644e-05, + "loss": 2.74, + "step": 36041 + }, + { + "epoch": 0.21435198401370253, + "grad_norm": 2.7236430644989014, + "learning_rate": 4.4542832794545884e-05, + "loss": 2.952, + "step": 36042 + }, + { + "epoch": 0.21435793129698355, + "grad_norm": 2.7634379863739014, + "learning_rate": 4.4542541490758924e-05, + "loss": 2.9004, + "step": 36043 + }, + { + "epoch": 0.21436387858026454, + "grad_norm": 2.463062047958374, + "learning_rate": 4.454225018014986e-05, + "loss": 2.7224, + "step": 36044 + }, + { + "epoch": 0.21436982586354553, + "grad_norm": 2.313150405883789, + "learning_rate": 4.45419588627188e-05, + "loss": 2.6462, + "step": 36045 + }, + { + "epoch": 0.21437577314682654, + "grad_norm": 2.3792331218719482, + "learning_rate": 4.4541667538465835e-05, + "loss": 2.682, + "step": 36046 + }, + { + "epoch": 0.21438172043010753, + "grad_norm": 2.3990492820739746, + "learning_rate": 4.454137620739109e-05, + "loss": 2.8759, + "step": 36047 + }, + { + "epoch": 0.21438766771338852, + "grad_norm": 2.345261335372925, + "learning_rate": 4.4541084869494644e-05, + "loss": 3.1526, + "step": 36048 + }, + { + "epoch": 0.21439361499666953, + "grad_norm": 2.0746614933013916, + "learning_rate": 4.454079352477661e-05, + "loss": 5.2981, + "step": 36049 + }, + { + "epoch": 0.21439956227995052, + "grad_norm": 1.9888861179351807, + "learning_rate": 4.4540502173237086e-05, + "loss": 5.1718, + "step": 36050 + }, + { + "epoch": 0.2144055095632315, + "grad_norm": 1.7000229358673096, + "learning_rate": 4.454021081487618e-05, + "loss": 5.0606, + "step": 36051 + }, + { + "epoch": 0.2144114568465125, + "grad_norm": 1.5641193389892578, + "learning_rate": 4.453991944969398e-05, + "loss": 5.4505, + "step": 36052 + }, + { + "epoch": 0.21441740412979352, + "grad_norm": 1.5807493925094604, + "learning_rate": 4.45396280776906e-05, + "loss": 5.0804, + "step": 36053 + }, + { + "epoch": 0.2144233514130745, + "grad_norm": 1.5401573181152344, + "learning_rate": 4.4539336698866143e-05, + "loss": 5.2455, + "step": 36054 + }, + { + "epoch": 0.2144292986963555, + "grad_norm": 1.6141964197158813, + "learning_rate": 4.453904531322069e-05, + "loss": 4.8019, + "step": 36055 + }, + { + "epoch": 0.2144352459796365, + "grad_norm": 1.8339582681655884, + "learning_rate": 4.453875392075437e-05, + "loss": 4.8166, + "step": 36056 + }, + { + "epoch": 0.2144411932629175, + "grad_norm": 1.4061498641967773, + "learning_rate": 4.453846252146727e-05, + "loss": 5.1891, + "step": 36057 + }, + { + "epoch": 0.21444714054619848, + "grad_norm": 1.4515758752822876, + "learning_rate": 4.45381711153595e-05, + "loss": 5.1881, + "step": 36058 + }, + { + "epoch": 0.2144530878294795, + "grad_norm": 1.8617783784866333, + "learning_rate": 4.453787970243115e-05, + "loss": 4.2562, + "step": 36059 + }, + { + "epoch": 0.2144590351127605, + "grad_norm": 1.7045130729675293, + "learning_rate": 4.453758828268233e-05, + "loss": 4.7226, + "step": 36060 + }, + { + "epoch": 0.21446498239604148, + "grad_norm": 1.6791839599609375, + "learning_rate": 4.4537296856113134e-05, + "loss": 5.6172, + "step": 36061 + }, + { + "epoch": 0.2144709296793225, + "grad_norm": 1.605233073234558, + "learning_rate": 4.4537005422723676e-05, + "loss": 5.4356, + "step": 36062 + }, + { + "epoch": 0.21447687696260348, + "grad_norm": 1.3953781127929688, + "learning_rate": 4.453671398251406e-05, + "loss": 5.3499, + "step": 36063 + }, + { + "epoch": 0.21448282424588447, + "grad_norm": 1.6507956981658936, + "learning_rate": 4.453642253548436e-05, + "loss": 5.4016, + "step": 36064 + }, + { + "epoch": 0.21448877152916548, + "grad_norm": 1.700950264930725, + "learning_rate": 4.4536131081634705e-05, + "loss": 5.0299, + "step": 36065 + }, + { + "epoch": 0.21449471881244647, + "grad_norm": 1.6992217302322388, + "learning_rate": 4.4535839620965195e-05, + "loss": 5.1387, + "step": 36066 + }, + { + "epoch": 0.21450066609572746, + "grad_norm": 1.5594868659973145, + "learning_rate": 4.453554815347592e-05, + "loss": 5.0754, + "step": 36067 + }, + { + "epoch": 0.21450661337900848, + "grad_norm": 1.28568434715271, + "learning_rate": 4.453525667916698e-05, + "loss": 5.1151, + "step": 36068 + }, + { + "epoch": 0.21451256066228946, + "grad_norm": 1.5007739067077637, + "learning_rate": 4.453496519803849e-05, + "loss": 5.1162, + "step": 36069 + }, + { + "epoch": 0.21451850794557045, + "grad_norm": 1.3101109266281128, + "learning_rate": 4.453467371009055e-05, + "loss": 5.0668, + "step": 36070 + }, + { + "epoch": 0.21452445522885147, + "grad_norm": 1.4987660646438599, + "learning_rate": 4.453438221532325e-05, + "loss": 5.0398, + "step": 36071 + }, + { + "epoch": 0.21453040251213246, + "grad_norm": 1.7213561534881592, + "learning_rate": 4.45340907137367e-05, + "loss": 5.0728, + "step": 36072 + }, + { + "epoch": 0.21453634979541344, + "grad_norm": 1.9010142087936401, + "learning_rate": 4.4533799205331006e-05, + "loss": 4.9607, + "step": 36073 + }, + { + "epoch": 0.21454229707869446, + "grad_norm": 1.9126904010772705, + "learning_rate": 4.453350769010626e-05, + "loss": 4.6572, + "step": 36074 + }, + { + "epoch": 0.21454824436197545, + "grad_norm": 2.3189423084259033, + "learning_rate": 4.453321616806257e-05, + "loss": 4.2961, + "step": 36075 + }, + { + "epoch": 0.21455419164525644, + "grad_norm": 2.047769784927368, + "learning_rate": 4.453292463920004e-05, + "loss": 5.2367, + "step": 36076 + }, + { + "epoch": 0.21456013892853745, + "grad_norm": 1.8531605005264282, + "learning_rate": 4.453263310351876e-05, + "loss": 5.0521, + "step": 36077 + }, + { + "epoch": 0.21456608621181844, + "grad_norm": 1.3894751071929932, + "learning_rate": 4.453234156101884e-05, + "loss": 5.238, + "step": 36078 + }, + { + "epoch": 0.21457203349509943, + "grad_norm": 1.6113433837890625, + "learning_rate": 4.453205001170039e-05, + "loss": 4.9134, + "step": 36079 + }, + { + "epoch": 0.21457798077838044, + "grad_norm": 1.8081282377243042, + "learning_rate": 4.4531758455563495e-05, + "loss": 4.9779, + "step": 36080 + }, + { + "epoch": 0.21458392806166143, + "grad_norm": 2.274998188018799, + "learning_rate": 4.4531466892608266e-05, + "loss": 4.3828, + "step": 36081 + }, + { + "epoch": 0.21458987534494242, + "grad_norm": 1.9097249507904053, + "learning_rate": 4.453117532283481e-05, + "loss": 4.5971, + "step": 36082 + }, + { + "epoch": 0.21459582262822344, + "grad_norm": 2.342449903488159, + "learning_rate": 4.4530883746243214e-05, + "loss": 4.5699, + "step": 36083 + }, + { + "epoch": 0.21460176991150443, + "grad_norm": 1.9449174404144287, + "learning_rate": 4.453059216283358e-05, + "loss": 4.0389, + "step": 36084 + }, + { + "epoch": 0.2146077171947854, + "grad_norm": 1.6288878917694092, + "learning_rate": 4.453030057260604e-05, + "loss": 5.2517, + "step": 36085 + }, + { + "epoch": 0.21461366447806643, + "grad_norm": 1.4354645013809204, + "learning_rate": 4.453000897556066e-05, + "loss": 5.1874, + "step": 36086 + }, + { + "epoch": 0.21461961176134742, + "grad_norm": 1.6599136590957642, + "learning_rate": 4.452971737169756e-05, + "loss": 5.1117, + "step": 36087 + }, + { + "epoch": 0.2146255590446284, + "grad_norm": 1.649203896522522, + "learning_rate": 4.4529425761016835e-05, + "loss": 5.2197, + "step": 36088 + }, + { + "epoch": 0.21463150632790942, + "grad_norm": 2.038905143737793, + "learning_rate": 4.452913414351859e-05, + "loss": 5.0925, + "step": 36089 + }, + { + "epoch": 0.2146374536111904, + "grad_norm": 2.200108289718628, + "learning_rate": 4.452884251920293e-05, + "loss": 5.0225, + "step": 36090 + }, + { + "epoch": 0.2146434008944714, + "grad_norm": 1.9698771238327026, + "learning_rate": 4.452855088806995e-05, + "loss": 4.7777, + "step": 36091 + }, + { + "epoch": 0.2146493481777524, + "grad_norm": 1.687897801399231, + "learning_rate": 4.4528259250119756e-05, + "loss": 5.0942, + "step": 36092 + }, + { + "epoch": 0.2146552954610334, + "grad_norm": 1.311324954032898, + "learning_rate": 4.4527967605352446e-05, + "loss": 5.2022, + "step": 36093 + }, + { + "epoch": 0.2146612427443144, + "grad_norm": 1.464908480644226, + "learning_rate": 4.452767595376812e-05, + "loss": 5.1902, + "step": 36094 + }, + { + "epoch": 0.2146671900275954, + "grad_norm": 1.609305500984192, + "learning_rate": 4.4527384295366893e-05, + "loss": 4.7921, + "step": 36095 + }, + { + "epoch": 0.2146731373108764, + "grad_norm": 1.8921763896942139, + "learning_rate": 4.4527092630148854e-05, + "loss": 4.1943, + "step": 36096 + }, + { + "epoch": 0.21467908459415738, + "grad_norm": 1.436725378036499, + "learning_rate": 4.452680095811411e-05, + "loss": 4.7097, + "step": 36097 + }, + { + "epoch": 0.2146850318774384, + "grad_norm": 1.8407703638076782, + "learning_rate": 4.4526509279262764e-05, + "loss": 4.3972, + "step": 36098 + }, + { + "epoch": 0.2146909791607194, + "grad_norm": 1.5586193799972534, + "learning_rate": 4.45262175935949e-05, + "loss": 4.9978, + "step": 36099 + }, + { + "epoch": 0.21469692644400037, + "grad_norm": 1.5589431524276733, + "learning_rate": 4.4525925901110656e-05, + "loss": 5.113, + "step": 36100 + }, + { + "epoch": 0.2147028737272814, + "grad_norm": 1.633216142654419, + "learning_rate": 4.45256342018101e-05, + "loss": 4.0823, + "step": 36101 + }, + { + "epoch": 0.21470882101056238, + "grad_norm": 1.530907392501831, + "learning_rate": 4.452534249569335e-05, + "loss": 4.3349, + "step": 36102 + }, + { + "epoch": 0.21471476829384337, + "grad_norm": 1.7243280410766602, + "learning_rate": 4.45250507827605e-05, + "loss": 5.4634, + "step": 36103 + }, + { + "epoch": 0.21472071557712438, + "grad_norm": 1.7470494508743286, + "learning_rate": 4.452475906301167e-05, + "loss": 5.3959, + "step": 36104 + }, + { + "epoch": 0.21472666286040537, + "grad_norm": 1.5940369367599487, + "learning_rate": 4.452446733644694e-05, + "loss": 5.3744, + "step": 36105 + }, + { + "epoch": 0.21473261014368636, + "grad_norm": 1.5966911315917969, + "learning_rate": 4.452417560306642e-05, + "loss": 5.3308, + "step": 36106 + }, + { + "epoch": 0.21473855742696737, + "grad_norm": 1.665995478630066, + "learning_rate": 4.452388386287021e-05, + "loss": 5.5209, + "step": 36107 + }, + { + "epoch": 0.21474450471024836, + "grad_norm": 1.476784110069275, + "learning_rate": 4.452359211585841e-05, + "loss": 5.3837, + "step": 36108 + }, + { + "epoch": 0.21475045199352935, + "grad_norm": 1.5506277084350586, + "learning_rate": 4.452330036203114e-05, + "loss": 5.4229, + "step": 36109 + }, + { + "epoch": 0.21475639927681037, + "grad_norm": 1.3492239713668823, + "learning_rate": 4.4523008601388475e-05, + "loss": 5.4111, + "step": 36110 + }, + { + "epoch": 0.21476234656009136, + "grad_norm": 1.4536131620407104, + "learning_rate": 4.452271683393053e-05, + "loss": 5.4376, + "step": 36111 + }, + { + "epoch": 0.21476829384337234, + "grad_norm": 1.5479294061660767, + "learning_rate": 4.452242505965741e-05, + "loss": 4.9195, + "step": 36112 + }, + { + "epoch": 0.21477424112665333, + "grad_norm": 1.7930655479431152, + "learning_rate": 4.452213327856922e-05, + "loss": 4.9069, + "step": 36113 + }, + { + "epoch": 0.21478018840993435, + "grad_norm": 2.004514455795288, + "learning_rate": 4.452184149066605e-05, + "loss": 4.7419, + "step": 36114 + }, + { + "epoch": 0.21478613569321534, + "grad_norm": 1.8205454349517822, + "learning_rate": 4.4521549695948004e-05, + "loss": 5.0988, + "step": 36115 + }, + { + "epoch": 0.21479208297649632, + "grad_norm": 1.8684512376785278, + "learning_rate": 4.4521257894415183e-05, + "loss": 5.1889, + "step": 36116 + }, + { + "epoch": 0.21479803025977734, + "grad_norm": 1.5708959102630615, + "learning_rate": 4.45209660860677e-05, + "loss": 5.1908, + "step": 36117 + }, + { + "epoch": 0.21480397754305833, + "grad_norm": 1.7478984594345093, + "learning_rate": 4.452067427090565e-05, + "loss": 5.4033, + "step": 36118 + }, + { + "epoch": 0.21480992482633932, + "grad_norm": 1.8025048971176147, + "learning_rate": 4.452038244892913e-05, + "loss": 5.4858, + "step": 36119 + }, + { + "epoch": 0.21481587210962033, + "grad_norm": 1.5002171993255615, + "learning_rate": 4.4520090620138245e-05, + "loss": 5.6049, + "step": 36120 + }, + { + "epoch": 0.21482181939290132, + "grad_norm": 1.793312668800354, + "learning_rate": 4.45197987845331e-05, + "loss": 5.0971, + "step": 36121 + }, + { + "epoch": 0.2148277666761823, + "grad_norm": 1.6286466121673584, + "learning_rate": 4.45195069421138e-05, + "loss": 5.0552, + "step": 36122 + }, + { + "epoch": 0.21483371395946332, + "grad_norm": 2.28002667427063, + "learning_rate": 4.451921509288043e-05, + "loss": 4.136, + "step": 36123 + }, + { + "epoch": 0.2148396612427443, + "grad_norm": 1.5719590187072754, + "learning_rate": 4.451892323683311e-05, + "loss": 5.3065, + "step": 36124 + }, + { + "epoch": 0.2148456085260253, + "grad_norm": 1.5892250537872314, + "learning_rate": 4.451863137397193e-05, + "loss": 5.4593, + "step": 36125 + }, + { + "epoch": 0.21485155580930632, + "grad_norm": 1.6752700805664062, + "learning_rate": 4.4518339504297013e-05, + "loss": 5.2985, + "step": 36126 + }, + { + "epoch": 0.2148575030925873, + "grad_norm": 1.8093560934066772, + "learning_rate": 4.451804762780843e-05, + "loss": 4.9665, + "step": 36127 + }, + { + "epoch": 0.2148634503758683, + "grad_norm": 1.5342146158218384, + "learning_rate": 4.4517755744506303e-05, + "loss": 4.9299, + "step": 36128 + }, + { + "epoch": 0.2148693976591493, + "grad_norm": 1.5814716815948486, + "learning_rate": 4.451746385439074e-05, + "loss": 5.0069, + "step": 36129 + }, + { + "epoch": 0.2148753449424303, + "grad_norm": 1.4868812561035156, + "learning_rate": 4.4517171957461814e-05, + "loss": 4.9328, + "step": 36130 + }, + { + "epoch": 0.21488129222571128, + "grad_norm": 1.4403107166290283, + "learning_rate": 4.4516880053719655e-05, + "loss": 5.1021, + "step": 36131 + }, + { + "epoch": 0.2148872395089923, + "grad_norm": 2.2056379318237305, + "learning_rate": 4.451658814316435e-05, + "loss": 4.652, + "step": 36132 + }, + { + "epoch": 0.2148931867922733, + "grad_norm": 1.6643704175949097, + "learning_rate": 4.451629622579601e-05, + "loss": 5.6042, + "step": 36133 + }, + { + "epoch": 0.21489913407555428, + "grad_norm": 1.6085230112075806, + "learning_rate": 4.4516004301614734e-05, + "loss": 5.2156, + "step": 36134 + }, + { + "epoch": 0.2149050813588353, + "grad_norm": 1.741129755973816, + "learning_rate": 4.451571237062062e-05, + "loss": 4.7964, + "step": 36135 + }, + { + "epoch": 0.21491102864211628, + "grad_norm": 1.5676339864730835, + "learning_rate": 4.451542043281377e-05, + "loss": 5.1581, + "step": 36136 + }, + { + "epoch": 0.21491697592539727, + "grad_norm": 1.5741878747940063, + "learning_rate": 4.451512848819429e-05, + "loss": 5.1789, + "step": 36137 + }, + { + "epoch": 0.21492292320867828, + "grad_norm": 1.6025103330612183, + "learning_rate": 4.4514836536762286e-05, + "loss": 4.9795, + "step": 36138 + }, + { + "epoch": 0.21492887049195927, + "grad_norm": 1.680410385131836, + "learning_rate": 4.451454457851785e-05, + "loss": 5.0603, + "step": 36139 + }, + { + "epoch": 0.21493481777524026, + "grad_norm": 2.5844266414642334, + "learning_rate": 4.451425261346108e-05, + "loss": 3.9414, + "step": 36140 + }, + { + "epoch": 0.21494076505852128, + "grad_norm": 1.7749565839767456, + "learning_rate": 4.45139606415921e-05, + "loss": 4.5331, + "step": 36141 + }, + { + "epoch": 0.21494671234180227, + "grad_norm": 1.769710898399353, + "learning_rate": 4.451366866291098e-05, + "loss": 4.7675, + "step": 36142 + }, + { + "epoch": 0.21495265962508325, + "grad_norm": 1.5556137561798096, + "learning_rate": 4.451337667741785e-05, + "loss": 4.9267, + "step": 36143 + }, + { + "epoch": 0.21495860690836427, + "grad_norm": 1.4366059303283691, + "learning_rate": 4.45130846851128e-05, + "loss": 5.5483, + "step": 36144 + }, + { + "epoch": 0.21496455419164526, + "grad_norm": 1.356587529182434, + "learning_rate": 4.451279268599594e-05, + "loss": 5.5553, + "step": 36145 + }, + { + "epoch": 0.21497050147492625, + "grad_norm": 1.7132307291030884, + "learning_rate": 4.451250068006736e-05, + "loss": 4.4766, + "step": 36146 + }, + { + "epoch": 0.21497644875820726, + "grad_norm": 1.420796513557434, + "learning_rate": 4.4512208667327175e-05, + "loss": 5.422, + "step": 36147 + }, + { + "epoch": 0.21498239604148825, + "grad_norm": 1.9435526132583618, + "learning_rate": 4.4511916647775474e-05, + "loss": 4.3675, + "step": 36148 + }, + { + "epoch": 0.21498834332476924, + "grad_norm": 2.1586434841156006, + "learning_rate": 4.451162462141236e-05, + "loss": 4.3742, + "step": 36149 + }, + { + "epoch": 0.21499429060805025, + "grad_norm": 1.7578691244125366, + "learning_rate": 4.451133258823795e-05, + "loss": 4.7244, + "step": 36150 + }, + { + "epoch": 0.21500023789133124, + "grad_norm": 1.5594449043273926, + "learning_rate": 4.4511040548252325e-05, + "loss": 4.8837, + "step": 36151 + }, + { + "epoch": 0.21500618517461223, + "grad_norm": 1.5725610256195068, + "learning_rate": 4.45107485014556e-05, + "loss": 4.272, + "step": 36152 + }, + { + "epoch": 0.21501213245789325, + "grad_norm": 1.5220437049865723, + "learning_rate": 4.451045644784788e-05, + "loss": 4.3584, + "step": 36153 + }, + { + "epoch": 0.21501807974117423, + "grad_norm": 1.3363945484161377, + "learning_rate": 4.451016438742925e-05, + "loss": 5.0875, + "step": 36154 + }, + { + "epoch": 0.21502402702445522, + "grad_norm": 1.5395842790603638, + "learning_rate": 4.450987232019984e-05, + "loss": 4.9718, + "step": 36155 + }, + { + "epoch": 0.21502997430773624, + "grad_norm": 1.7091704607009888, + "learning_rate": 4.450958024615972e-05, + "loss": 4.882, + "step": 36156 + }, + { + "epoch": 0.21503592159101723, + "grad_norm": 2.3344812393188477, + "learning_rate": 4.4509288165309015e-05, + "loss": 4.2766, + "step": 36157 + }, + { + "epoch": 0.21504186887429821, + "grad_norm": 1.9190376996994019, + "learning_rate": 4.450899607764782e-05, + "loss": 4.2321, + "step": 36158 + }, + { + "epoch": 0.21504781615757923, + "grad_norm": 1.8463904857635498, + "learning_rate": 4.450870398317623e-05, + "loss": 4.4709, + "step": 36159 + }, + { + "epoch": 0.21505376344086022, + "grad_norm": 1.4867302179336548, + "learning_rate": 4.4508411881894356e-05, + "loss": 4.5178, + "step": 36160 + }, + { + "epoch": 0.2150597107241412, + "grad_norm": 1.4961345195770264, + "learning_rate": 4.45081197738023e-05, + "loss": 4.6105, + "step": 36161 + }, + { + "epoch": 0.21506565800742222, + "grad_norm": 1.3281563520431519, + "learning_rate": 4.450782765890016e-05, + "loss": 4.6637, + "step": 36162 + }, + { + "epoch": 0.2150716052907032, + "grad_norm": 1.4934107065200806, + "learning_rate": 4.450753553718803e-05, + "loss": 4.4112, + "step": 36163 + }, + { + "epoch": 0.2150775525739842, + "grad_norm": 1.3608429431915283, + "learning_rate": 4.450724340866603e-05, + "loss": 4.8015, + "step": 36164 + }, + { + "epoch": 0.21508349985726521, + "grad_norm": 1.5349289178848267, + "learning_rate": 4.450695127333425e-05, + "loss": 4.7622, + "step": 36165 + }, + { + "epoch": 0.2150894471405462, + "grad_norm": 1.5071897506713867, + "learning_rate": 4.450665913119279e-05, + "loss": 4.4918, + "step": 36166 + }, + { + "epoch": 0.2150953944238272, + "grad_norm": 1.6646260023117065, + "learning_rate": 4.4506366982241766e-05, + "loss": 4.625, + "step": 36167 + }, + { + "epoch": 0.2151013417071082, + "grad_norm": 1.5301088094711304, + "learning_rate": 4.450607482648127e-05, + "loss": 4.4959, + "step": 36168 + }, + { + "epoch": 0.2151072889903892, + "grad_norm": 1.5578876733779907, + "learning_rate": 4.45057826639114e-05, + "loss": 4.5457, + "step": 36169 + }, + { + "epoch": 0.21511323627367018, + "grad_norm": 1.5869579315185547, + "learning_rate": 4.450549049453227e-05, + "loss": 4.6881, + "step": 36170 + }, + { + "epoch": 0.21511918355695117, + "grad_norm": 1.5819709300994873, + "learning_rate": 4.450519831834397e-05, + "loss": 4.5569, + "step": 36171 + }, + { + "epoch": 0.2151251308402322, + "grad_norm": 1.6285146474838257, + "learning_rate": 4.45049061353466e-05, + "loss": 4.2395, + "step": 36172 + }, + { + "epoch": 0.21513107812351318, + "grad_norm": 1.5318942070007324, + "learning_rate": 4.450461394554027e-05, + "loss": 4.314, + "step": 36173 + }, + { + "epoch": 0.21513702540679416, + "grad_norm": 1.5344517230987549, + "learning_rate": 4.450432174892509e-05, + "loss": 4.2087, + "step": 36174 + }, + { + "epoch": 0.21514297269007518, + "grad_norm": 1.4494154453277588, + "learning_rate": 4.450402954550115e-05, + "loss": 4.3346, + "step": 36175 + }, + { + "epoch": 0.21514891997335617, + "grad_norm": 1.575466513633728, + "learning_rate": 4.450373733526855e-05, + "loss": 4.6291, + "step": 36176 + }, + { + "epoch": 0.21515486725663716, + "grad_norm": 1.6240023374557495, + "learning_rate": 4.4503445118227396e-05, + "loss": 4.6026, + "step": 36177 + }, + { + "epoch": 0.21516081453991817, + "grad_norm": 1.355594515800476, + "learning_rate": 4.45031528943778e-05, + "loss": 4.2619, + "step": 36178 + }, + { + "epoch": 0.21516676182319916, + "grad_norm": 1.5286892652511597, + "learning_rate": 4.4502860663719846e-05, + "loss": 4.2924, + "step": 36179 + }, + { + "epoch": 0.21517270910648015, + "grad_norm": 1.399736762046814, + "learning_rate": 4.4502568426253646e-05, + "loss": 4.3781, + "step": 36180 + }, + { + "epoch": 0.21517865638976116, + "grad_norm": 1.585708498954773, + "learning_rate": 4.4502276181979306e-05, + "loss": 5.5202, + "step": 36181 + }, + { + "epoch": 0.21518460367304215, + "grad_norm": 1.4541770219802856, + "learning_rate": 4.4501983930896916e-05, + "loss": 5.2694, + "step": 36182 + }, + { + "epoch": 0.21519055095632314, + "grad_norm": 1.3780940771102905, + "learning_rate": 4.4501691673006596e-05, + "loss": 5.1998, + "step": 36183 + }, + { + "epoch": 0.21519649823960416, + "grad_norm": 1.5186759233474731, + "learning_rate": 4.450139940830843e-05, + "loss": 4.7699, + "step": 36184 + }, + { + "epoch": 0.21520244552288514, + "grad_norm": 1.7143422365188599, + "learning_rate": 4.450110713680252e-05, + "loss": 5.135, + "step": 36185 + }, + { + "epoch": 0.21520839280616613, + "grad_norm": 1.5604811906814575, + "learning_rate": 4.4500814858488984e-05, + "loss": 5.2164, + "step": 36186 + }, + { + "epoch": 0.21521434008944715, + "grad_norm": 1.1931864023208618, + "learning_rate": 4.450052257336792e-05, + "loss": 5.1427, + "step": 36187 + }, + { + "epoch": 0.21522028737272814, + "grad_norm": 1.7609212398529053, + "learning_rate": 4.4500230281439416e-05, + "loss": 4.6423, + "step": 36188 + }, + { + "epoch": 0.21522623465600912, + "grad_norm": 1.8664861917495728, + "learning_rate": 4.4499937982703575e-05, + "loss": 4.6855, + "step": 36189 + }, + { + "epoch": 0.21523218193929014, + "grad_norm": 1.6264115571975708, + "learning_rate": 4.449964567716052e-05, + "loss": 4.7096, + "step": 36190 + }, + { + "epoch": 0.21523812922257113, + "grad_norm": 1.592661738395691, + "learning_rate": 4.4499353364810335e-05, + "loss": 4.612, + "step": 36191 + }, + { + "epoch": 0.21524407650585212, + "grad_norm": 1.600019097328186, + "learning_rate": 4.449906104565313e-05, + "loss": 4.835, + "step": 36192 + }, + { + "epoch": 0.21525002378913313, + "grad_norm": 1.672973394393921, + "learning_rate": 4.449876871968901e-05, + "loss": 4.8086, + "step": 36193 + }, + { + "epoch": 0.21525597107241412, + "grad_norm": 1.5232312679290771, + "learning_rate": 4.4498476386918066e-05, + "loss": 5.0278, + "step": 36194 + }, + { + "epoch": 0.2152619183556951, + "grad_norm": 1.459900975227356, + "learning_rate": 4.4498184047340404e-05, + "loss": 4.8873, + "step": 36195 + }, + { + "epoch": 0.21526786563897612, + "grad_norm": 1.6688652038574219, + "learning_rate": 4.449789170095612e-05, + "loss": 5.1418, + "step": 36196 + }, + { + "epoch": 0.2152738129222571, + "grad_norm": 1.9095449447631836, + "learning_rate": 4.449759934776533e-05, + "loss": 4.1144, + "step": 36197 + }, + { + "epoch": 0.2152797602055381, + "grad_norm": 1.5863621234893799, + "learning_rate": 4.449730698776814e-05, + "loss": 4.6239, + "step": 36198 + }, + { + "epoch": 0.21528570748881912, + "grad_norm": 1.4681096076965332, + "learning_rate": 4.449701462096463e-05, + "loss": 4.9615, + "step": 36199 + }, + { + "epoch": 0.2152916547721001, + "grad_norm": 1.204664707183838, + "learning_rate": 4.4496722247354916e-05, + "loss": 5.1881, + "step": 36200 + }, + { + "epoch": 0.2152976020553811, + "grad_norm": 1.3521478176116943, + "learning_rate": 4.449642986693909e-05, + "loss": 4.891, + "step": 36201 + }, + { + "epoch": 0.2153035493386621, + "grad_norm": 1.3406485319137573, + "learning_rate": 4.4496137479717276e-05, + "loss": 4.7794, + "step": 36202 + }, + { + "epoch": 0.2153094966219431, + "grad_norm": 1.1206368207931519, + "learning_rate": 4.449584508568955e-05, + "loss": 4.4378, + "step": 36203 + }, + { + "epoch": 0.21531544390522409, + "grad_norm": 1.1885775327682495, + "learning_rate": 4.449555268485603e-05, + "loss": 4.6944, + "step": 36204 + }, + { + "epoch": 0.2153213911885051, + "grad_norm": 1.4947532415390015, + "learning_rate": 4.449526027721682e-05, + "loss": 4.3875, + "step": 36205 + }, + { + "epoch": 0.2153273384717861, + "grad_norm": 1.6429933309555054, + "learning_rate": 4.449496786277201e-05, + "loss": 4.1325, + "step": 36206 + }, + { + "epoch": 0.21533328575506708, + "grad_norm": 1.6001614332199097, + "learning_rate": 4.449467544152171e-05, + "loss": 4.9123, + "step": 36207 + }, + { + "epoch": 0.2153392330383481, + "grad_norm": 2.3580222129821777, + "learning_rate": 4.449438301346602e-05, + "loss": 4.5931, + "step": 36208 + }, + { + "epoch": 0.21534518032162908, + "grad_norm": 1.3925138711929321, + "learning_rate": 4.449409057860504e-05, + "loss": 5.0114, + "step": 36209 + }, + { + "epoch": 0.21535112760491007, + "grad_norm": 1.1549257040023804, + "learning_rate": 4.449379813693888e-05, + "loss": 4.8184, + "step": 36210 + }, + { + "epoch": 0.21535707488819109, + "grad_norm": 1.3624850511550903, + "learning_rate": 4.449350568846763e-05, + "loss": 4.6685, + "step": 36211 + }, + { + "epoch": 0.21536302217147207, + "grad_norm": 1.8626717329025269, + "learning_rate": 4.44932132331914e-05, + "loss": 4.2869, + "step": 36212 + }, + { + "epoch": 0.21536896945475306, + "grad_norm": 2.0599145889282227, + "learning_rate": 4.449292077111029e-05, + "loss": 4.0511, + "step": 36213 + }, + { + "epoch": 0.21537491673803408, + "grad_norm": 1.567862629890442, + "learning_rate": 4.44926283022244e-05, + "loss": 4.8801, + "step": 36214 + }, + { + "epoch": 0.21538086402131507, + "grad_norm": 1.5523948669433594, + "learning_rate": 4.4492335826533845e-05, + "loss": 4.7761, + "step": 36215 + }, + { + "epoch": 0.21538681130459605, + "grad_norm": 1.818768858909607, + "learning_rate": 4.449204334403871e-05, + "loss": 4.3282, + "step": 36216 + }, + { + "epoch": 0.21539275858787707, + "grad_norm": 1.7761987447738647, + "learning_rate": 4.449175085473911e-05, + "loss": 4.636, + "step": 36217 + }, + { + "epoch": 0.21539870587115806, + "grad_norm": 1.20720636844635, + "learning_rate": 4.449145835863513e-05, + "loss": 4.7549, + "step": 36218 + }, + { + "epoch": 0.21540465315443905, + "grad_norm": 1.3905788660049438, + "learning_rate": 4.449116585572689e-05, + "loss": 4.8965, + "step": 36219 + }, + { + "epoch": 0.21541060043772006, + "grad_norm": 1.7040412425994873, + "learning_rate": 4.449087334601448e-05, + "loss": 4.8117, + "step": 36220 + }, + { + "epoch": 0.21541654772100105, + "grad_norm": 1.7664754390716553, + "learning_rate": 4.449058082949802e-05, + "loss": 5.2555, + "step": 36221 + }, + { + "epoch": 0.21542249500428204, + "grad_norm": 1.8236404657363892, + "learning_rate": 4.449028830617759e-05, + "loss": 5.212, + "step": 36222 + }, + { + "epoch": 0.21542844228756305, + "grad_norm": 1.6441102027893066, + "learning_rate": 4.44899957760533e-05, + "loss": 5.3216, + "step": 36223 + }, + { + "epoch": 0.21543438957084404, + "grad_norm": 1.6687595844268799, + "learning_rate": 4.4489703239125265e-05, + "loss": 4.9983, + "step": 36224 + }, + { + "epoch": 0.21544033685412503, + "grad_norm": 1.7582825422286987, + "learning_rate": 4.448941069539356e-05, + "loss": 4.4715, + "step": 36225 + }, + { + "epoch": 0.21544628413740605, + "grad_norm": 1.653625726699829, + "learning_rate": 4.4489118144858324e-05, + "loss": 4.5712, + "step": 36226 + }, + { + "epoch": 0.21545223142068703, + "grad_norm": 1.70182466506958, + "learning_rate": 4.448882558751962e-05, + "loss": 4.4521, + "step": 36227 + }, + { + "epoch": 0.21545817870396802, + "grad_norm": 1.3696340322494507, + "learning_rate": 4.448853302337758e-05, + "loss": 5.0353, + "step": 36228 + }, + { + "epoch": 0.215464125987249, + "grad_norm": 1.3940473794937134, + "learning_rate": 4.448824045243228e-05, + "loss": 5.0052, + "step": 36229 + }, + { + "epoch": 0.21547007327053003, + "grad_norm": 1.5669533014297485, + "learning_rate": 4.448794787468384e-05, + "loss": 4.8972, + "step": 36230 + }, + { + "epoch": 0.21547602055381102, + "grad_norm": 1.5695853233337402, + "learning_rate": 4.448765529013237e-05, + "loss": 4.9789, + "step": 36231 + }, + { + "epoch": 0.215481967837092, + "grad_norm": 1.5337707996368408, + "learning_rate": 4.4487362698777956e-05, + "loss": 4.5533, + "step": 36232 + }, + { + "epoch": 0.21548791512037302, + "grad_norm": 1.8555629253387451, + "learning_rate": 4.4487070100620705e-05, + "loss": 4.2116, + "step": 36233 + }, + { + "epoch": 0.215493862403654, + "grad_norm": 1.8653151988983154, + "learning_rate": 4.4486777495660715e-05, + "loss": 4.6188, + "step": 36234 + }, + { + "epoch": 0.215499809686935, + "grad_norm": 1.7039722204208374, + "learning_rate": 4.4486484883898096e-05, + "loss": 4.9626, + "step": 36235 + }, + { + "epoch": 0.215505756970216, + "grad_norm": 1.7743360996246338, + "learning_rate": 4.448619226533295e-05, + "loss": 4.7605, + "step": 36236 + }, + { + "epoch": 0.215511704253497, + "grad_norm": 1.6669758558273315, + "learning_rate": 4.4485899639965366e-05, + "loss": 4.6322, + "step": 36237 + }, + { + "epoch": 0.215517651536778, + "grad_norm": 1.4342900514602661, + "learning_rate": 4.448560700779546e-05, + "loss": 5.2187, + "step": 36238 + }, + { + "epoch": 0.215523598820059, + "grad_norm": 1.6044615507125854, + "learning_rate": 4.448531436882333e-05, + "loss": 4.8211, + "step": 36239 + }, + { + "epoch": 0.21552954610334, + "grad_norm": 1.783548355102539, + "learning_rate": 4.448502172304908e-05, + "loss": 5.0975, + "step": 36240 + }, + { + "epoch": 0.21553549338662098, + "grad_norm": 1.6044108867645264, + "learning_rate": 4.4484729070472806e-05, + "loss": 4.8102, + "step": 36241 + }, + { + "epoch": 0.215541440669902, + "grad_norm": 1.4885926246643066, + "learning_rate": 4.448443641109462e-05, + "loss": 5.0961, + "step": 36242 + }, + { + "epoch": 0.21554738795318298, + "grad_norm": 1.358034610748291, + "learning_rate": 4.448414374491462e-05, + "loss": 5.0771, + "step": 36243 + }, + { + "epoch": 0.21555333523646397, + "grad_norm": 1.6421840190887451, + "learning_rate": 4.44838510719329e-05, + "loss": 4.769, + "step": 36244 + }, + { + "epoch": 0.215559282519745, + "grad_norm": 1.5576809644699097, + "learning_rate": 4.448355839214958e-05, + "loss": 4.9235, + "step": 36245 + }, + { + "epoch": 0.21556522980302598, + "grad_norm": 1.6670345067977905, + "learning_rate": 4.4483265705564736e-05, + "loss": 4.4313, + "step": 36246 + }, + { + "epoch": 0.21557117708630696, + "grad_norm": 1.5631513595581055, + "learning_rate": 4.44829730121785e-05, + "loss": 4.7763, + "step": 36247 + }, + { + "epoch": 0.21557712436958798, + "grad_norm": 1.5368024110794067, + "learning_rate": 4.4482680311990944e-05, + "loss": 4.8561, + "step": 36248 + }, + { + "epoch": 0.21558307165286897, + "grad_norm": 1.5789357423782349, + "learning_rate": 4.44823876050022e-05, + "loss": 4.9685, + "step": 36249 + }, + { + "epoch": 0.21558901893614996, + "grad_norm": 1.771773338317871, + "learning_rate": 4.4482094891212345e-05, + "loss": 4.7449, + "step": 36250 + }, + { + "epoch": 0.21559496621943097, + "grad_norm": 1.5449539422988892, + "learning_rate": 4.4481802170621496e-05, + "loss": 5.161, + "step": 36251 + }, + { + "epoch": 0.21560091350271196, + "grad_norm": 1.5021382570266724, + "learning_rate": 4.448150944322975e-05, + "loss": 5.3135, + "step": 36252 + }, + { + "epoch": 0.21560686078599295, + "grad_norm": 1.404275894165039, + "learning_rate": 4.448121670903721e-05, + "loss": 5.0408, + "step": 36253 + }, + { + "epoch": 0.21561280806927396, + "grad_norm": 1.5224852561950684, + "learning_rate": 4.448092396804398e-05, + "loss": 4.8239, + "step": 36254 + }, + { + "epoch": 0.21561875535255495, + "grad_norm": 1.8339958190917969, + "learning_rate": 4.4480631220250156e-05, + "loss": 4.8889, + "step": 36255 + }, + { + "epoch": 0.21562470263583594, + "grad_norm": 1.7479435205459595, + "learning_rate": 4.448033846565586e-05, + "loss": 4.9801, + "step": 36256 + }, + { + "epoch": 0.21563064991911696, + "grad_norm": 1.8542855978012085, + "learning_rate": 4.4480045704261164e-05, + "loss": 3.7599, + "step": 36257 + }, + { + "epoch": 0.21563659720239794, + "grad_norm": 1.655678629875183, + "learning_rate": 4.447975293606619e-05, + "loss": 4.702, + "step": 36258 + }, + { + "epoch": 0.21564254448567893, + "grad_norm": 1.811626672744751, + "learning_rate": 4.447946016107104e-05, + "loss": 4.8734, + "step": 36259 + }, + { + "epoch": 0.21564849176895995, + "grad_norm": 1.7445614337921143, + "learning_rate": 4.4479167379275796e-05, + "loss": 4.6599, + "step": 36260 + }, + { + "epoch": 0.21565443905224094, + "grad_norm": 2.8761303424835205, + "learning_rate": 4.447887459068059e-05, + "loss": 3.4399, + "step": 36261 + }, + { + "epoch": 0.21566038633552193, + "grad_norm": 1.9224152565002441, + "learning_rate": 4.4478581795285515e-05, + "loss": 4.2702, + "step": 36262 + }, + { + "epoch": 0.21566633361880294, + "grad_norm": 1.9684844017028809, + "learning_rate": 4.447828899309066e-05, + "loss": 4.2957, + "step": 36263 + }, + { + "epoch": 0.21567228090208393, + "grad_norm": 1.8181499242782593, + "learning_rate": 4.4477996184096125e-05, + "loss": 4.7013, + "step": 36264 + }, + { + "epoch": 0.21567822818536492, + "grad_norm": 1.7259873151779175, + "learning_rate": 4.447770336830204e-05, + "loss": 4.5011, + "step": 36265 + }, + { + "epoch": 0.21568417546864593, + "grad_norm": 1.6804701089859009, + "learning_rate": 4.447741054570849e-05, + "loss": 4.3317, + "step": 36266 + }, + { + "epoch": 0.21569012275192692, + "grad_norm": 1.349643588066101, + "learning_rate": 4.4477117716315565e-05, + "loss": 5.0237, + "step": 36267 + }, + { + "epoch": 0.2156960700352079, + "grad_norm": 1.9172464609146118, + "learning_rate": 4.447682488012338e-05, + "loss": 4.6157, + "step": 36268 + }, + { + "epoch": 0.21570201731848893, + "grad_norm": 1.5372661352157593, + "learning_rate": 4.447653203713205e-05, + "loss": 4.966, + "step": 36269 + }, + { + "epoch": 0.2157079646017699, + "grad_norm": 2.078467845916748, + "learning_rate": 4.447623918734165e-05, + "loss": 4.3186, + "step": 36270 + }, + { + "epoch": 0.2157139118850509, + "grad_norm": 2.3255176544189453, + "learning_rate": 4.44759463307523e-05, + "loss": 2.9728, + "step": 36271 + }, + { + "epoch": 0.21571985916833192, + "grad_norm": 1.7872892618179321, + "learning_rate": 4.4475653467364106e-05, + "loss": 4.2326, + "step": 36272 + }, + { + "epoch": 0.2157258064516129, + "grad_norm": 1.6925581693649292, + "learning_rate": 4.447536059717715e-05, + "loss": 4.6084, + "step": 36273 + }, + { + "epoch": 0.2157317537348939, + "grad_norm": 1.6806141138076782, + "learning_rate": 4.447506772019155e-05, + "loss": 4.7579, + "step": 36274 + }, + { + "epoch": 0.2157377010181749, + "grad_norm": 2.586641788482666, + "learning_rate": 4.447477483640742e-05, + "loss": 3.3903, + "step": 36275 + }, + { + "epoch": 0.2157436483014559, + "grad_norm": 2.487593173980713, + "learning_rate": 4.447448194582483e-05, + "loss": 4.3451, + "step": 36276 + }, + { + "epoch": 0.2157495955847369, + "grad_norm": 1.8467118740081787, + "learning_rate": 4.4474189048443907e-05, + "loss": 4.7606, + "step": 36277 + }, + { + "epoch": 0.2157555428680179, + "grad_norm": 1.3377431631088257, + "learning_rate": 4.447389614426475e-05, + "loss": 4.9565, + "step": 36278 + }, + { + "epoch": 0.2157614901512989, + "grad_norm": 1.932654857635498, + "learning_rate": 4.4473603233287445e-05, + "loss": 4.5205, + "step": 36279 + }, + { + "epoch": 0.21576743743457988, + "grad_norm": 1.7796809673309326, + "learning_rate": 4.4473310315512116e-05, + "loss": 4.3455, + "step": 36280 + }, + { + "epoch": 0.2157733847178609, + "grad_norm": 1.90752112865448, + "learning_rate": 4.4473017390938854e-05, + "loss": 4.3992, + "step": 36281 + }, + { + "epoch": 0.21577933200114188, + "grad_norm": 1.6692253351211548, + "learning_rate": 4.447272445956776e-05, + "loss": 4.6441, + "step": 36282 + }, + { + "epoch": 0.21578527928442287, + "grad_norm": 2.3043060302734375, + "learning_rate": 4.447243152139894e-05, + "loss": 3.1886, + "step": 36283 + }, + { + "epoch": 0.2157912265677039, + "grad_norm": 1.9240410327911377, + "learning_rate": 4.44721385764325e-05, + "loss": 4.2297, + "step": 36284 + }, + { + "epoch": 0.21579717385098487, + "grad_norm": 2.9011518955230713, + "learning_rate": 4.447184562466853e-05, + "loss": 2.8793, + "step": 36285 + }, + { + "epoch": 0.21580312113426586, + "grad_norm": 1.9503211975097656, + "learning_rate": 4.447155266610714e-05, + "loss": 4.9535, + "step": 36286 + }, + { + "epoch": 0.21580906841754685, + "grad_norm": 1.957065463066101, + "learning_rate": 4.4471259700748436e-05, + "loss": 4.0936, + "step": 36287 + }, + { + "epoch": 0.21581501570082787, + "grad_norm": 2.4622087478637695, + "learning_rate": 4.4470966728592515e-05, + "loss": 3.0493, + "step": 36288 + }, + { + "epoch": 0.21582096298410886, + "grad_norm": 2.5405967235565186, + "learning_rate": 4.447067374963948e-05, + "loss": 3.3334, + "step": 36289 + }, + { + "epoch": 0.21582691026738984, + "grad_norm": 2.6690690517425537, + "learning_rate": 4.447038076388944e-05, + "loss": 3.5309, + "step": 36290 + }, + { + "epoch": 0.21583285755067086, + "grad_norm": 2.7902510166168213, + "learning_rate": 4.447008777134248e-05, + "loss": 3.5418, + "step": 36291 + }, + { + "epoch": 0.21583880483395185, + "grad_norm": 2.6572537422180176, + "learning_rate": 4.4469794771998726e-05, + "loss": 3.4522, + "step": 36292 + }, + { + "epoch": 0.21584475211723284, + "grad_norm": 2.5804686546325684, + "learning_rate": 4.446950176585826e-05, + "loss": 3.5556, + "step": 36293 + }, + { + "epoch": 0.21585069940051385, + "grad_norm": 2.6215248107910156, + "learning_rate": 4.4469208752921196e-05, + "loss": 3.6112, + "step": 36294 + }, + { + "epoch": 0.21585664668379484, + "grad_norm": 2.5278232097625732, + "learning_rate": 4.4468915733187624e-05, + "loss": 3.4373, + "step": 36295 + }, + { + "epoch": 0.21586259396707583, + "grad_norm": 2.2777929306030273, + "learning_rate": 4.446862270665766e-05, + "loss": 3.0397, + "step": 36296 + }, + { + "epoch": 0.21586854125035684, + "grad_norm": 2.003936529159546, + "learning_rate": 4.446832967333141e-05, + "loss": 3.7177, + "step": 36297 + }, + { + "epoch": 0.21587448853363783, + "grad_norm": 2.218179941177368, + "learning_rate": 4.446803663320895e-05, + "loss": 4.398, + "step": 36298 + }, + { + "epoch": 0.21588043581691882, + "grad_norm": 2.1191961765289307, + "learning_rate": 4.4467743586290414e-05, + "loss": 4.3687, + "step": 36299 + }, + { + "epoch": 0.21588638310019984, + "grad_norm": 2.0627639293670654, + "learning_rate": 4.446745053257588e-05, + "loss": 4.4121, + "step": 36300 + }, + { + "epoch": 0.21589233038348082, + "grad_norm": 2.177537441253662, + "learning_rate": 4.446715747206547e-05, + "loss": 4.7772, + "step": 36301 + }, + { + "epoch": 0.2158982776667618, + "grad_norm": 2.4316155910491943, + "learning_rate": 4.446686440475927e-05, + "loss": 3.606, + "step": 36302 + }, + { + "epoch": 0.21590422495004283, + "grad_norm": 1.6192671060562134, + "learning_rate": 4.446657133065739e-05, + "loss": 4.9919, + "step": 36303 + }, + { + "epoch": 0.21591017223332382, + "grad_norm": 1.7824963331222534, + "learning_rate": 4.446627824975993e-05, + "loss": 5.085, + "step": 36304 + }, + { + "epoch": 0.2159161195166048, + "grad_norm": 2.347855806350708, + "learning_rate": 4.446598516206699e-05, + "loss": 3.9342, + "step": 36305 + }, + { + "epoch": 0.21592206679988582, + "grad_norm": 2.2459559440612793, + "learning_rate": 4.446569206757868e-05, + "loss": 3.7066, + "step": 36306 + }, + { + "epoch": 0.2159280140831668, + "grad_norm": 1.8832706212997437, + "learning_rate": 4.44653989662951e-05, + "loss": 4.3872, + "step": 36307 + }, + { + "epoch": 0.2159339613664478, + "grad_norm": 1.6729106903076172, + "learning_rate": 4.4465105858216346e-05, + "loss": 4.5845, + "step": 36308 + }, + { + "epoch": 0.2159399086497288, + "grad_norm": 1.494909644126892, + "learning_rate": 4.446481274334253e-05, + "loss": 5.0, + "step": 36309 + }, + { + "epoch": 0.2159458559330098, + "grad_norm": 1.655707597732544, + "learning_rate": 4.446451962167375e-05, + "loss": 5.1941, + "step": 36310 + }, + { + "epoch": 0.2159518032162908, + "grad_norm": 2.25812029838562, + "learning_rate": 4.4464226493210105e-05, + "loss": 4.5174, + "step": 36311 + }, + { + "epoch": 0.2159577504995718, + "grad_norm": 1.9949771165847778, + "learning_rate": 4.4463933357951695e-05, + "loss": 4.6311, + "step": 36312 + }, + { + "epoch": 0.2159636977828528, + "grad_norm": 1.69150710105896, + "learning_rate": 4.446364021589863e-05, + "loss": 4.9013, + "step": 36313 + }, + { + "epoch": 0.21596964506613378, + "grad_norm": 2.227994680404663, + "learning_rate": 4.4463347067051006e-05, + "loss": 4.3162, + "step": 36314 + }, + { + "epoch": 0.2159755923494148, + "grad_norm": 3.0076286792755127, + "learning_rate": 4.446305391140894e-05, + "loss": 4.0106, + "step": 36315 + }, + { + "epoch": 0.21598153963269578, + "grad_norm": 2.24741268157959, + "learning_rate": 4.4462760748972507e-05, + "loss": 3.759, + "step": 36316 + }, + { + "epoch": 0.21598748691597677, + "grad_norm": 1.5488991737365723, + "learning_rate": 4.4462467579741834e-05, + "loss": 4.5564, + "step": 36317 + }, + { + "epoch": 0.2159934341992578, + "grad_norm": 1.7913551330566406, + "learning_rate": 4.4462174403717016e-05, + "loss": 4.8823, + "step": 36318 + }, + { + "epoch": 0.21599938148253878, + "grad_norm": 2.324786901473999, + "learning_rate": 4.446188122089815e-05, + "loss": 4.1834, + "step": 36319 + }, + { + "epoch": 0.21600532876581977, + "grad_norm": 1.3889487981796265, + "learning_rate": 4.446158803128534e-05, + "loss": 5.1393, + "step": 36320 + }, + { + "epoch": 0.21601127604910078, + "grad_norm": 1.303863525390625, + "learning_rate": 4.44612948348787e-05, + "loss": 5.0815, + "step": 36321 + }, + { + "epoch": 0.21601722333238177, + "grad_norm": 1.2250717878341675, + "learning_rate": 4.446100163167831e-05, + "loss": 5.4439, + "step": 36322 + }, + { + "epoch": 0.21602317061566276, + "grad_norm": 1.3837891817092896, + "learning_rate": 4.4460708421684295e-05, + "loss": 5.0406, + "step": 36323 + }, + { + "epoch": 0.21602911789894377, + "grad_norm": 1.6228313446044922, + "learning_rate": 4.446041520489675e-05, + "loss": 5.0236, + "step": 36324 + }, + { + "epoch": 0.21603506518222476, + "grad_norm": 1.707972764968872, + "learning_rate": 4.446012198131577e-05, + "loss": 4.6587, + "step": 36325 + }, + { + "epoch": 0.21604101246550575, + "grad_norm": 1.5421570539474487, + "learning_rate": 4.4459828750941465e-05, + "loss": 4.8926, + "step": 36326 + }, + { + "epoch": 0.21604695974878677, + "grad_norm": 1.5230952501296997, + "learning_rate": 4.445953551377393e-05, + "loss": 5.042, + "step": 36327 + }, + { + "epoch": 0.21605290703206775, + "grad_norm": 1.3272488117218018, + "learning_rate": 4.445924226981327e-05, + "loss": 5.1375, + "step": 36328 + }, + { + "epoch": 0.21605885431534874, + "grad_norm": 1.1550372838974, + "learning_rate": 4.4458949019059606e-05, + "loss": 5.2326, + "step": 36329 + }, + { + "epoch": 0.21606480159862976, + "grad_norm": 1.3413779735565186, + "learning_rate": 4.445865576151301e-05, + "loss": 4.9879, + "step": 36330 + }, + { + "epoch": 0.21607074888191075, + "grad_norm": 1.5402988195419312, + "learning_rate": 4.44583624971736e-05, + "loss": 5.1719, + "step": 36331 + }, + { + "epoch": 0.21607669616519173, + "grad_norm": 2.013479471206665, + "learning_rate": 4.445806922604148e-05, + "loss": 4.5107, + "step": 36332 + }, + { + "epoch": 0.21608264344847275, + "grad_norm": 1.3441870212554932, + "learning_rate": 4.445777594811674e-05, + "loss": 4.706, + "step": 36333 + }, + { + "epoch": 0.21608859073175374, + "grad_norm": 1.5314089059829712, + "learning_rate": 4.44574826633995e-05, + "loss": 4.737, + "step": 36334 + }, + { + "epoch": 0.21609453801503473, + "grad_norm": 1.3800076246261597, + "learning_rate": 4.445718937188985e-05, + "loss": 4.9501, + "step": 36335 + }, + { + "epoch": 0.21610048529831574, + "grad_norm": 1.5042531490325928, + "learning_rate": 4.4456896073587905e-05, + "loss": 4.8638, + "step": 36336 + }, + { + "epoch": 0.21610643258159673, + "grad_norm": 1.4311203956604004, + "learning_rate": 4.445660276849375e-05, + "loss": 4.8713, + "step": 36337 + }, + { + "epoch": 0.21611237986487772, + "grad_norm": 1.5277742147445679, + "learning_rate": 4.44563094566075e-05, + "loss": 4.9963, + "step": 36338 + }, + { + "epoch": 0.21611832714815873, + "grad_norm": 1.7784839868545532, + "learning_rate": 4.4456016137929246e-05, + "loss": 5.0083, + "step": 36339 + }, + { + "epoch": 0.21612427443143972, + "grad_norm": 1.3861591815948486, + "learning_rate": 4.4455722812459104e-05, + "loss": 4.8264, + "step": 36340 + }, + { + "epoch": 0.2161302217147207, + "grad_norm": 1.4573569297790527, + "learning_rate": 4.445542948019717e-05, + "loss": 4.8561, + "step": 36341 + }, + { + "epoch": 0.21613616899800173, + "grad_norm": 1.3556313514709473, + "learning_rate": 4.445513614114355e-05, + "loss": 4.8997, + "step": 36342 + }, + { + "epoch": 0.21614211628128271, + "grad_norm": 1.5516074895858765, + "learning_rate": 4.445484279529834e-05, + "loss": 4.8283, + "step": 36343 + }, + { + "epoch": 0.2161480635645637, + "grad_norm": 1.4483047723770142, + "learning_rate": 4.445454944266164e-05, + "loss": 4.9132, + "step": 36344 + }, + { + "epoch": 0.2161540108478447, + "grad_norm": 1.6741615533828735, + "learning_rate": 4.4454256083233556e-05, + "loss": 4.6512, + "step": 36345 + }, + { + "epoch": 0.2161599581311257, + "grad_norm": 1.598311424255371, + "learning_rate": 4.445396271701421e-05, + "loss": 4.9215, + "step": 36346 + }, + { + "epoch": 0.2161659054144067, + "grad_norm": 1.5425868034362793, + "learning_rate": 4.445366934400367e-05, + "loss": 4.8261, + "step": 36347 + }, + { + "epoch": 0.21617185269768768, + "grad_norm": 1.6026711463928223, + "learning_rate": 4.445337596420206e-05, + "loss": 4.8442, + "step": 36348 + }, + { + "epoch": 0.2161777999809687, + "grad_norm": 1.524340271949768, + "learning_rate": 4.4453082577609474e-05, + "loss": 4.9961, + "step": 36349 + }, + { + "epoch": 0.2161837472642497, + "grad_norm": 1.5798773765563965, + "learning_rate": 4.445278918422602e-05, + "loss": 4.9096, + "step": 36350 + }, + { + "epoch": 0.21618969454753068, + "grad_norm": 1.4556652307510376, + "learning_rate": 4.44524957840518e-05, + "loss": 4.8589, + "step": 36351 + }, + { + "epoch": 0.2161956418308117, + "grad_norm": 1.522506833076477, + "learning_rate": 4.445220237708692e-05, + "loss": 4.8721, + "step": 36352 + }, + { + "epoch": 0.21620158911409268, + "grad_norm": 1.347317099571228, + "learning_rate": 4.445190896333147e-05, + "loss": 4.9497, + "step": 36353 + }, + { + "epoch": 0.21620753639737367, + "grad_norm": 1.5334205627441406, + "learning_rate": 4.445161554278556e-05, + "loss": 4.9534, + "step": 36354 + }, + { + "epoch": 0.21621348368065468, + "grad_norm": 1.5388821363449097, + "learning_rate": 4.445132211544929e-05, + "loss": 4.8753, + "step": 36355 + }, + { + "epoch": 0.21621943096393567, + "grad_norm": 1.5709154605865479, + "learning_rate": 4.4451028681322764e-05, + "loss": 4.8397, + "step": 36356 + }, + { + "epoch": 0.21622537824721666, + "grad_norm": 1.835668683052063, + "learning_rate": 4.445073524040609e-05, + "loss": 5.2878, + "step": 36357 + }, + { + "epoch": 0.21623132553049768, + "grad_norm": 1.3644315004348755, + "learning_rate": 4.445044179269936e-05, + "loss": 4.6898, + "step": 36358 + }, + { + "epoch": 0.21623727281377866, + "grad_norm": 2.2211451530456543, + "learning_rate": 4.445014833820269e-05, + "loss": 4.1424, + "step": 36359 + }, + { + "epoch": 0.21624322009705965, + "grad_norm": 1.6837176084518433, + "learning_rate": 4.444985487691617e-05, + "loss": 5.2938, + "step": 36360 + }, + { + "epoch": 0.21624916738034067, + "grad_norm": 1.6217468976974487, + "learning_rate": 4.44495614088399e-05, + "loss": 4.6109, + "step": 36361 + }, + { + "epoch": 0.21625511466362166, + "grad_norm": 1.295923113822937, + "learning_rate": 4.4449267933974e-05, + "loss": 4.9983, + "step": 36362 + }, + { + "epoch": 0.21626106194690264, + "grad_norm": 1.8383874893188477, + "learning_rate": 4.444897445231855e-05, + "loss": 4.2286, + "step": 36363 + }, + { + "epoch": 0.21626700923018366, + "grad_norm": 1.6389504671096802, + "learning_rate": 4.4448680963873674e-05, + "loss": 5.3462, + "step": 36364 + }, + { + "epoch": 0.21627295651346465, + "grad_norm": 1.5260887145996094, + "learning_rate": 4.444838746863946e-05, + "loss": 4.7547, + "step": 36365 + }, + { + "epoch": 0.21627890379674564, + "grad_norm": 1.573678970336914, + "learning_rate": 4.4448093966616015e-05, + "loss": 4.4435, + "step": 36366 + }, + { + "epoch": 0.21628485108002665, + "grad_norm": 1.3941434621810913, + "learning_rate": 4.4447800457803444e-05, + "loss": 4.4243, + "step": 36367 + }, + { + "epoch": 0.21629079836330764, + "grad_norm": 1.664817214012146, + "learning_rate": 4.444750694220184e-05, + "loss": 4.4748, + "step": 36368 + }, + { + "epoch": 0.21629674564658863, + "grad_norm": 1.423172116279602, + "learning_rate": 4.444721341981132e-05, + "loss": 4.5332, + "step": 36369 + }, + { + "epoch": 0.21630269292986964, + "grad_norm": 1.7631560564041138, + "learning_rate": 4.444691989063198e-05, + "loss": 5.0456, + "step": 36370 + }, + { + "epoch": 0.21630864021315063, + "grad_norm": 1.3937678337097168, + "learning_rate": 4.444662635466391e-05, + "loss": 5.1116, + "step": 36371 + }, + { + "epoch": 0.21631458749643162, + "grad_norm": 1.5468742847442627, + "learning_rate": 4.444633281190723e-05, + "loss": 5.0551, + "step": 36372 + }, + { + "epoch": 0.21632053477971264, + "grad_norm": 1.6004170179367065, + "learning_rate": 4.444603926236204e-05, + "loss": 4.7766, + "step": 36373 + }, + { + "epoch": 0.21632648206299362, + "grad_norm": 1.4662137031555176, + "learning_rate": 4.444574570602843e-05, + "loss": 4.9473, + "step": 36374 + }, + { + "epoch": 0.2163324293462746, + "grad_norm": 1.4400924444198608, + "learning_rate": 4.4445452142906515e-05, + "loss": 4.9529, + "step": 36375 + }, + { + "epoch": 0.21633837662955563, + "grad_norm": 1.3921599388122559, + "learning_rate": 4.44451585729964e-05, + "loss": 5.5826, + "step": 36376 + }, + { + "epoch": 0.21634432391283662, + "grad_norm": 1.650146722793579, + "learning_rate": 4.444486499629818e-05, + "loss": 4.3338, + "step": 36377 + }, + { + "epoch": 0.2163502711961176, + "grad_norm": 1.5027433633804321, + "learning_rate": 4.4444571412811954e-05, + "loss": 4.5485, + "step": 36378 + }, + { + "epoch": 0.21635621847939862, + "grad_norm": 1.3315762281417847, + "learning_rate": 4.4444277822537826e-05, + "loss": 5.277, + "step": 36379 + }, + { + "epoch": 0.2163621657626796, + "grad_norm": 1.5802031755447388, + "learning_rate": 4.44439842254759e-05, + "loss": 4.9875, + "step": 36380 + }, + { + "epoch": 0.2163681130459606, + "grad_norm": 1.4244681596755981, + "learning_rate": 4.444369062162629e-05, + "loss": 5.0073, + "step": 36381 + }, + { + "epoch": 0.2163740603292416, + "grad_norm": 1.5206032991409302, + "learning_rate": 4.444339701098909e-05, + "loss": 4.8693, + "step": 36382 + }, + { + "epoch": 0.2163800076125226, + "grad_norm": 1.3556402921676636, + "learning_rate": 4.444310339356439e-05, + "loss": 4.8651, + "step": 36383 + }, + { + "epoch": 0.2163859548958036, + "grad_norm": 1.7892037630081177, + "learning_rate": 4.44428097693523e-05, + "loss": 4.5375, + "step": 36384 + }, + { + "epoch": 0.2163919021790846, + "grad_norm": 1.5106563568115234, + "learning_rate": 4.444251613835294e-05, + "loss": 5.0685, + "step": 36385 + }, + { + "epoch": 0.2163978494623656, + "grad_norm": 1.341135859489441, + "learning_rate": 4.444222250056639e-05, + "loss": 5.1778, + "step": 36386 + }, + { + "epoch": 0.21640379674564658, + "grad_norm": 1.385373592376709, + "learning_rate": 4.444192885599276e-05, + "loss": 4.8729, + "step": 36387 + }, + { + "epoch": 0.2164097440289276, + "grad_norm": 1.454485535621643, + "learning_rate": 4.4441635204632156e-05, + "loss": 5.0157, + "step": 36388 + }, + { + "epoch": 0.21641569131220859, + "grad_norm": 1.7790766954421997, + "learning_rate": 4.444134154648468e-05, + "loss": 4.4941, + "step": 36389 + }, + { + "epoch": 0.21642163859548957, + "grad_norm": 1.628504991531372, + "learning_rate": 4.444104788155043e-05, + "loss": 4.8383, + "step": 36390 + }, + { + "epoch": 0.2164275858787706, + "grad_norm": 1.4350956678390503, + "learning_rate": 4.444075420982951e-05, + "loss": 5.241, + "step": 36391 + }, + { + "epoch": 0.21643353316205158, + "grad_norm": 1.3836671113967896, + "learning_rate": 4.444046053132202e-05, + "loss": 4.9542, + "step": 36392 + }, + { + "epoch": 0.21643948044533257, + "grad_norm": 1.3507336378097534, + "learning_rate": 4.4440166846028084e-05, + "loss": 5.1188, + "step": 36393 + }, + { + "epoch": 0.21644542772861358, + "grad_norm": 1.6130249500274658, + "learning_rate": 4.443987315394778e-05, + "loss": 4.6271, + "step": 36394 + }, + { + "epoch": 0.21645137501189457, + "grad_norm": 1.2183295488357544, + "learning_rate": 4.443957945508121e-05, + "loss": 5.0471, + "step": 36395 + }, + { + "epoch": 0.21645732229517556, + "grad_norm": 1.1758854389190674, + "learning_rate": 4.443928574942848e-05, + "loss": 4.9255, + "step": 36396 + }, + { + "epoch": 0.21646326957845657, + "grad_norm": 1.2792357206344604, + "learning_rate": 4.44389920369897e-05, + "loss": 4.9221, + "step": 36397 + }, + { + "epoch": 0.21646921686173756, + "grad_norm": 1.5022220611572266, + "learning_rate": 4.443869831776497e-05, + "loss": 5.1525, + "step": 36398 + }, + { + "epoch": 0.21647516414501855, + "grad_norm": 1.5304787158966064, + "learning_rate": 4.443840459175439e-05, + "loss": 5.3113, + "step": 36399 + }, + { + "epoch": 0.21648111142829957, + "grad_norm": 1.3236007690429688, + "learning_rate": 4.443811085895807e-05, + "loss": 5.0398, + "step": 36400 + }, + { + "epoch": 0.21648705871158055, + "grad_norm": 1.2853519916534424, + "learning_rate": 4.44378171193761e-05, + "loss": 5.2451, + "step": 36401 + }, + { + "epoch": 0.21649300599486154, + "grad_norm": 1.410645842552185, + "learning_rate": 4.443752337300859e-05, + "loss": 4.9105, + "step": 36402 + }, + { + "epoch": 0.21649895327814253, + "grad_norm": 2.0519766807556152, + "learning_rate": 4.443722961985564e-05, + "loss": 3.9625, + "step": 36403 + }, + { + "epoch": 0.21650490056142355, + "grad_norm": 1.5357091426849365, + "learning_rate": 4.443693585991736e-05, + "loss": 4.7008, + "step": 36404 + }, + { + "epoch": 0.21651084784470453, + "grad_norm": 1.5789777040481567, + "learning_rate": 4.443664209319383e-05, + "loss": 5.0572, + "step": 36405 + }, + { + "epoch": 0.21651679512798552, + "grad_norm": 1.5537595748901367, + "learning_rate": 4.443634831968519e-05, + "loss": 4.5723, + "step": 36406 + }, + { + "epoch": 0.21652274241126654, + "grad_norm": 1.5900410413742065, + "learning_rate": 4.4436054539391516e-05, + "loss": 4.9849, + "step": 36407 + }, + { + "epoch": 0.21652868969454753, + "grad_norm": 1.1238914728164673, + "learning_rate": 4.443576075231291e-05, + "loss": 4.66, + "step": 36408 + }, + { + "epoch": 0.21653463697782852, + "grad_norm": 1.427838921546936, + "learning_rate": 4.4435466958449485e-05, + "loss": 4.7631, + "step": 36409 + }, + { + "epoch": 0.21654058426110953, + "grad_norm": 1.6186624765396118, + "learning_rate": 4.4435173157801334e-05, + "loss": 5.1209, + "step": 36410 + }, + { + "epoch": 0.21654653154439052, + "grad_norm": 1.5863722562789917, + "learning_rate": 4.443487935036857e-05, + "loss": 5.0902, + "step": 36411 + }, + { + "epoch": 0.2165524788276715, + "grad_norm": 1.7387241125106812, + "learning_rate": 4.443458553615129e-05, + "loss": 5.1191, + "step": 36412 + }, + { + "epoch": 0.21655842611095252, + "grad_norm": 1.605112910270691, + "learning_rate": 4.4434291715149603e-05, + "loss": 4.8986, + "step": 36413 + }, + { + "epoch": 0.2165643733942335, + "grad_norm": 1.6139943599700928, + "learning_rate": 4.4433997887363595e-05, + "loss": 4.9041, + "step": 36414 + }, + { + "epoch": 0.2165703206775145, + "grad_norm": 1.6797585487365723, + "learning_rate": 4.443370405279338e-05, + "loss": 4.8667, + "step": 36415 + }, + { + "epoch": 0.21657626796079552, + "grad_norm": 1.4826325178146362, + "learning_rate": 4.443341021143906e-05, + "loss": 4.9097, + "step": 36416 + }, + { + "epoch": 0.2165822152440765, + "grad_norm": 1.4120008945465088, + "learning_rate": 4.443311636330074e-05, + "loss": 4.842, + "step": 36417 + }, + { + "epoch": 0.2165881625273575, + "grad_norm": 1.6395269632339478, + "learning_rate": 4.443282250837852e-05, + "loss": 4.798, + "step": 36418 + }, + { + "epoch": 0.2165941098106385, + "grad_norm": 1.6432803869247437, + "learning_rate": 4.44325286466725e-05, + "loss": 4.9683, + "step": 36419 + }, + { + "epoch": 0.2166000570939195, + "grad_norm": 1.3104444742202759, + "learning_rate": 4.443223477818279e-05, + "loss": 5.0913, + "step": 36420 + }, + { + "epoch": 0.21660600437720048, + "grad_norm": 1.674682855606079, + "learning_rate": 4.443194090290949e-05, + "loss": 3.7327, + "step": 36421 + }, + { + "epoch": 0.2166119516604815, + "grad_norm": 1.6492141485214233, + "learning_rate": 4.443164702085269e-05, + "loss": 4.5528, + "step": 36422 + }, + { + "epoch": 0.2166178989437625, + "grad_norm": 1.6470035314559937, + "learning_rate": 4.443135313201251e-05, + "loss": 5.1476, + "step": 36423 + }, + { + "epoch": 0.21662384622704348, + "grad_norm": 1.5849100351333618, + "learning_rate": 4.443105923638904e-05, + "loss": 4.9662, + "step": 36424 + }, + { + "epoch": 0.2166297935103245, + "grad_norm": 1.444566249847412, + "learning_rate": 4.443076533398239e-05, + "loss": 4.9241, + "step": 36425 + }, + { + "epoch": 0.21663574079360548, + "grad_norm": 1.5952868461608887, + "learning_rate": 4.443047142479266e-05, + "loss": 4.766, + "step": 36426 + }, + { + "epoch": 0.21664168807688647, + "grad_norm": 1.3955894708633423, + "learning_rate": 4.443017750881996e-05, + "loss": 5.1851, + "step": 36427 + }, + { + "epoch": 0.21664763536016748, + "grad_norm": 1.377500057220459, + "learning_rate": 4.442988358606438e-05, + "loss": 4.8027, + "step": 36428 + }, + { + "epoch": 0.21665358264344847, + "grad_norm": 1.313023328781128, + "learning_rate": 4.4429589656526024e-05, + "loss": 4.825, + "step": 36429 + }, + { + "epoch": 0.21665952992672946, + "grad_norm": 1.479194164276123, + "learning_rate": 4.442929572020501e-05, + "loss": 4.9135, + "step": 36430 + }, + { + "epoch": 0.21666547721001048, + "grad_norm": 1.3001906871795654, + "learning_rate": 4.442900177710142e-05, + "loss": 4.9644, + "step": 36431 + }, + { + "epoch": 0.21667142449329146, + "grad_norm": 1.6930853128433228, + "learning_rate": 4.4428707827215374e-05, + "loss": 5.0808, + "step": 36432 + }, + { + "epoch": 0.21667737177657245, + "grad_norm": 1.3844190835952759, + "learning_rate": 4.442841387054696e-05, + "loss": 4.7619, + "step": 36433 + }, + { + "epoch": 0.21668331905985347, + "grad_norm": 1.3809784650802612, + "learning_rate": 4.4428119907096285e-05, + "loss": 4.7743, + "step": 36434 + }, + { + "epoch": 0.21668926634313446, + "grad_norm": 1.5848809480667114, + "learning_rate": 4.4427825936863465e-05, + "loss": 5.2092, + "step": 36435 + }, + { + "epoch": 0.21669521362641544, + "grad_norm": 1.2051990032196045, + "learning_rate": 4.442753195984859e-05, + "loss": 5.3504, + "step": 36436 + }, + { + "epoch": 0.21670116090969646, + "grad_norm": 1.4225530624389648, + "learning_rate": 4.4427237976051754e-05, + "loss": 5.5421, + "step": 36437 + }, + { + "epoch": 0.21670710819297745, + "grad_norm": 1.548554539680481, + "learning_rate": 4.442694398547308e-05, + "loss": 5.0913, + "step": 36438 + }, + { + "epoch": 0.21671305547625844, + "grad_norm": 1.8550792932510376, + "learning_rate": 4.4426649988112654e-05, + "loss": 5.1924, + "step": 36439 + }, + { + "epoch": 0.21671900275953945, + "grad_norm": 1.9623850584030151, + "learning_rate": 4.442635598397059e-05, + "loss": 4.5524, + "step": 36440 + }, + { + "epoch": 0.21672495004282044, + "grad_norm": 1.6146697998046875, + "learning_rate": 4.442606197304698e-05, + "loss": 5.2329, + "step": 36441 + }, + { + "epoch": 0.21673089732610143, + "grad_norm": 3.1403307914733887, + "learning_rate": 4.442576795534193e-05, + "loss": 3.5127, + "step": 36442 + }, + { + "epoch": 0.21673684460938245, + "grad_norm": 1.3766248226165771, + "learning_rate": 4.4425473930855554e-05, + "loss": 4.8523, + "step": 36443 + }, + { + "epoch": 0.21674279189266343, + "grad_norm": 1.4641730785369873, + "learning_rate": 4.4425179899587945e-05, + "loss": 4.8117, + "step": 36444 + }, + { + "epoch": 0.21674873917594442, + "grad_norm": 1.4823542833328247, + "learning_rate": 4.442488586153921e-05, + "loss": 4.7413, + "step": 36445 + }, + { + "epoch": 0.21675468645922544, + "grad_norm": 1.401440143585205, + "learning_rate": 4.4424591816709436e-05, + "loss": 4.7065, + "step": 36446 + }, + { + "epoch": 0.21676063374250643, + "grad_norm": 1.3161439895629883, + "learning_rate": 4.4424297765098745e-05, + "loss": 4.9648, + "step": 36447 + }, + { + "epoch": 0.2167665810257874, + "grad_norm": 1.7631399631500244, + "learning_rate": 4.442400370670723e-05, + "loss": 4.9201, + "step": 36448 + }, + { + "epoch": 0.21677252830906843, + "grad_norm": 1.556435227394104, + "learning_rate": 4.4423709641535e-05, + "loss": 5.1891, + "step": 36449 + }, + { + "epoch": 0.21677847559234942, + "grad_norm": 1.5537375211715698, + "learning_rate": 4.442341556958215e-05, + "loss": 5.1704, + "step": 36450 + }, + { + "epoch": 0.2167844228756304, + "grad_norm": 1.9972381591796875, + "learning_rate": 4.4423121490848785e-05, + "loss": 4.9716, + "step": 36451 + }, + { + "epoch": 0.21679037015891142, + "grad_norm": 1.3904248476028442, + "learning_rate": 4.442282740533501e-05, + "loss": 5.1136, + "step": 36452 + }, + { + "epoch": 0.2167963174421924, + "grad_norm": 1.5099358558654785, + "learning_rate": 4.442253331304093e-05, + "loss": 4.9462, + "step": 36453 + }, + { + "epoch": 0.2168022647254734, + "grad_norm": 1.4884952306747437, + "learning_rate": 4.4422239213966645e-05, + "loss": 4.9833, + "step": 36454 + }, + { + "epoch": 0.21680821200875441, + "grad_norm": 1.6606906652450562, + "learning_rate": 4.442194510811225e-05, + "loss": 4.9137, + "step": 36455 + }, + { + "epoch": 0.2168141592920354, + "grad_norm": 2.3339645862579346, + "learning_rate": 4.442165099547786e-05, + "loss": 4.6693, + "step": 36456 + }, + { + "epoch": 0.2168201065753164, + "grad_norm": 1.3238904476165771, + "learning_rate": 4.4421356876063566e-05, + "loss": 4.9108, + "step": 36457 + }, + { + "epoch": 0.2168260538585974, + "grad_norm": 1.689540982246399, + "learning_rate": 4.442106274986949e-05, + "loss": 5.0264, + "step": 36458 + }, + { + "epoch": 0.2168320011418784, + "grad_norm": 1.574047327041626, + "learning_rate": 4.4420768616895714e-05, + "loss": 5.0985, + "step": 36459 + }, + { + "epoch": 0.21683794842515938, + "grad_norm": 1.422987699508667, + "learning_rate": 4.442047447714234e-05, + "loss": 4.98, + "step": 36460 + }, + { + "epoch": 0.21684389570844037, + "grad_norm": 1.5349971055984497, + "learning_rate": 4.442018033060949e-05, + "loss": 4.9211, + "step": 36461 + }, + { + "epoch": 0.2168498429917214, + "grad_norm": 1.6502734422683716, + "learning_rate": 4.441988617729726e-05, + "loss": 5.2832, + "step": 36462 + }, + { + "epoch": 0.21685579027500237, + "grad_norm": 1.5576223134994507, + "learning_rate": 4.4419592017205735e-05, + "loss": 4.7322, + "step": 36463 + }, + { + "epoch": 0.21686173755828336, + "grad_norm": 1.554739236831665, + "learning_rate": 4.4419297850335036e-05, + "loss": 4.9784, + "step": 36464 + }, + { + "epoch": 0.21686768484156438, + "grad_norm": 1.572361946105957, + "learning_rate": 4.441900367668526e-05, + "loss": 4.631, + "step": 36465 + }, + { + "epoch": 0.21687363212484537, + "grad_norm": 1.6870968341827393, + "learning_rate": 4.441870949625652e-05, + "loss": 4.6154, + "step": 36466 + }, + { + "epoch": 0.21687957940812636, + "grad_norm": 1.7353061437606812, + "learning_rate": 4.441841530904889e-05, + "loss": 5.0098, + "step": 36467 + }, + { + "epoch": 0.21688552669140737, + "grad_norm": 1.622704267501831, + "learning_rate": 4.4418121115062506e-05, + "loss": 5.1513, + "step": 36468 + }, + { + "epoch": 0.21689147397468836, + "grad_norm": 1.66656494140625, + "learning_rate": 4.441782691429746e-05, + "loss": 5.1693, + "step": 36469 + }, + { + "epoch": 0.21689742125796935, + "grad_norm": 1.3424537181854248, + "learning_rate": 4.441753270675384e-05, + "loss": 5.17, + "step": 36470 + }, + { + "epoch": 0.21690336854125036, + "grad_norm": 1.405543327331543, + "learning_rate": 4.441723849243177e-05, + "loss": 5.0782, + "step": 36471 + }, + { + "epoch": 0.21690931582453135, + "grad_norm": 1.5939109325408936, + "learning_rate": 4.441694427133133e-05, + "loss": 4.4947, + "step": 36472 + }, + { + "epoch": 0.21691526310781234, + "grad_norm": 1.6071003675460815, + "learning_rate": 4.441665004345265e-05, + "loss": 4.7007, + "step": 36473 + }, + { + "epoch": 0.21692121039109336, + "grad_norm": 1.5466821193695068, + "learning_rate": 4.441635580879581e-05, + "loss": 4.6536, + "step": 36474 + }, + { + "epoch": 0.21692715767437434, + "grad_norm": 1.6554591655731201, + "learning_rate": 4.441606156736092e-05, + "loss": 4.4594, + "step": 36475 + }, + { + "epoch": 0.21693310495765533, + "grad_norm": 1.5430635213851929, + "learning_rate": 4.441576731914808e-05, + "loss": 4.6767, + "step": 36476 + }, + { + "epoch": 0.21693905224093635, + "grad_norm": 1.5925291776657104, + "learning_rate": 4.441547306415741e-05, + "loss": 4.8824, + "step": 36477 + }, + { + "epoch": 0.21694499952421734, + "grad_norm": 1.6011813879013062, + "learning_rate": 4.4415178802388986e-05, + "loss": 4.8297, + "step": 36478 + }, + { + "epoch": 0.21695094680749832, + "grad_norm": 1.3895270824432373, + "learning_rate": 4.4414884533842925e-05, + "loss": 5.1119, + "step": 36479 + }, + { + "epoch": 0.21695689409077934, + "grad_norm": 1.9162174463272095, + "learning_rate": 4.4414590258519334e-05, + "loss": 4.4094, + "step": 36480 + }, + { + "epoch": 0.21696284137406033, + "grad_norm": 2.446828603744507, + "learning_rate": 4.4414295976418306e-05, + "loss": 4.2579, + "step": 36481 + }, + { + "epoch": 0.21696878865734132, + "grad_norm": 1.7227983474731445, + "learning_rate": 4.441400168753995e-05, + "loss": 4.8391, + "step": 36482 + }, + { + "epoch": 0.21697473594062233, + "grad_norm": 1.6229579448699951, + "learning_rate": 4.4413707391884364e-05, + "loss": 4.8293, + "step": 36483 + }, + { + "epoch": 0.21698068322390332, + "grad_norm": 1.9420546293258667, + "learning_rate": 4.441341308945165e-05, + "loss": 3.5347, + "step": 36484 + }, + { + "epoch": 0.2169866305071843, + "grad_norm": 1.9547382593154907, + "learning_rate": 4.4413118780241925e-05, + "loss": 4.3297, + "step": 36485 + }, + { + "epoch": 0.21699257779046532, + "grad_norm": 1.977729082107544, + "learning_rate": 4.441282446425528e-05, + "loss": 4.1395, + "step": 36486 + }, + { + "epoch": 0.2169985250737463, + "grad_norm": 1.5033422708511353, + "learning_rate": 4.441253014149181e-05, + "loss": 5.0993, + "step": 36487 + }, + { + "epoch": 0.2170044723570273, + "grad_norm": 1.4688711166381836, + "learning_rate": 4.441223581195163e-05, + "loss": 5.0349, + "step": 36488 + }, + { + "epoch": 0.21701041964030832, + "grad_norm": 2.3826143741607666, + "learning_rate": 4.4411941475634844e-05, + "loss": 4.3747, + "step": 36489 + }, + { + "epoch": 0.2170163669235893, + "grad_norm": 2.2079734802246094, + "learning_rate": 4.441164713254154e-05, + "loss": 3.9939, + "step": 36490 + }, + { + "epoch": 0.2170223142068703, + "grad_norm": 3.0292141437530518, + "learning_rate": 4.4411352782671835e-05, + "loss": 3.793, + "step": 36491 + }, + { + "epoch": 0.2170282614901513, + "grad_norm": 2.8700766563415527, + "learning_rate": 4.441105842602583e-05, + "loss": 3.8592, + "step": 36492 + }, + { + "epoch": 0.2170342087734323, + "grad_norm": 2.866060972213745, + "learning_rate": 4.4410764062603616e-05, + "loss": 4.7633, + "step": 36493 + }, + { + "epoch": 0.21704015605671328, + "grad_norm": 1.6045869588851929, + "learning_rate": 4.4410469692405314e-05, + "loss": 4.7084, + "step": 36494 + }, + { + "epoch": 0.2170461033399943, + "grad_norm": 1.9329087734222412, + "learning_rate": 4.441017531543101e-05, + "loss": 4.041, + "step": 36495 + }, + { + "epoch": 0.2170520506232753, + "grad_norm": 1.7166354656219482, + "learning_rate": 4.440988093168083e-05, + "loss": 4.5453, + "step": 36496 + }, + { + "epoch": 0.21705799790655628, + "grad_norm": 1.5979840755462646, + "learning_rate": 4.4409586541154846e-05, + "loss": 4.461, + "step": 36497 + }, + { + "epoch": 0.2170639451898373, + "grad_norm": 1.6630820035934448, + "learning_rate": 4.4409292143853184e-05, + "loss": 4.6191, + "step": 36498 + }, + { + "epoch": 0.21706989247311828, + "grad_norm": 2.3144614696502686, + "learning_rate": 4.440899773977593e-05, + "loss": 3.9898, + "step": 36499 + }, + { + "epoch": 0.21707583975639927, + "grad_norm": 1.5565422773361206, + "learning_rate": 4.4408703328923204e-05, + "loss": 4.4901, + "step": 36500 + }, + { + "epoch": 0.21708178703968029, + "grad_norm": 1.6009669303894043, + "learning_rate": 4.4408408911295096e-05, + "loss": 4.2009, + "step": 36501 + }, + { + "epoch": 0.21708773432296127, + "grad_norm": 1.558400273323059, + "learning_rate": 4.4408114486891713e-05, + "loss": 4.5287, + "step": 36502 + }, + { + "epoch": 0.21709368160624226, + "grad_norm": 1.758833885192871, + "learning_rate": 4.440782005571316e-05, + "loss": 4.5359, + "step": 36503 + }, + { + "epoch": 0.21709962888952328, + "grad_norm": 1.6637458801269531, + "learning_rate": 4.440752561775953e-05, + "loss": 4.4195, + "step": 36504 + }, + { + "epoch": 0.21710557617280427, + "grad_norm": 1.729332685470581, + "learning_rate": 4.440723117303094e-05, + "loss": 4.8236, + "step": 36505 + }, + { + "epoch": 0.21711152345608525, + "grad_norm": 1.6611237525939941, + "learning_rate": 4.440693672152749e-05, + "loss": 4.7444, + "step": 36506 + }, + { + "epoch": 0.21711747073936627, + "grad_norm": 1.6037817001342773, + "learning_rate": 4.440664226324927e-05, + "loss": 4.9678, + "step": 36507 + }, + { + "epoch": 0.21712341802264726, + "grad_norm": 1.5919970273971558, + "learning_rate": 4.44063477981964e-05, + "loss": 4.618, + "step": 36508 + }, + { + "epoch": 0.21712936530592825, + "grad_norm": 1.7502778768539429, + "learning_rate": 4.440605332636897e-05, + "loss": 4.6642, + "step": 36509 + }, + { + "epoch": 0.21713531258920926, + "grad_norm": 2.1507225036621094, + "learning_rate": 4.440575884776709e-05, + "loss": 4.5249, + "step": 36510 + }, + { + "epoch": 0.21714125987249025, + "grad_norm": 1.4958648681640625, + "learning_rate": 4.440546436239085e-05, + "loss": 4.6402, + "step": 36511 + }, + { + "epoch": 0.21714720715577124, + "grad_norm": 2.740826368331909, + "learning_rate": 4.440516987024037e-05, + "loss": 2.2294, + "step": 36512 + }, + { + "epoch": 0.21715315443905225, + "grad_norm": 3.044275999069214, + "learning_rate": 4.440487537131575e-05, + "loss": 1.9464, + "step": 36513 + }, + { + "epoch": 0.21715910172233324, + "grad_norm": 2.7739100456237793, + "learning_rate": 4.4404580865617084e-05, + "loss": 1.6097, + "step": 36514 + }, + { + "epoch": 0.21716504900561423, + "grad_norm": 2.5852739810943604, + "learning_rate": 4.4404286353144474e-05, + "loss": 1.306, + "step": 36515 + }, + { + "epoch": 0.21717099628889525, + "grad_norm": 2.5203051567077637, + "learning_rate": 4.440399183389804e-05, + "loss": 1.4329, + "step": 36516 + }, + { + "epoch": 0.21717694357217623, + "grad_norm": 2.085726022720337, + "learning_rate": 4.440369730787787e-05, + "loss": 4.2591, + "step": 36517 + }, + { + "epoch": 0.21718289085545722, + "grad_norm": 1.939746379852295, + "learning_rate": 4.4403402775084066e-05, + "loss": 4.5084, + "step": 36518 + }, + { + "epoch": 0.2171888381387382, + "grad_norm": 1.800667405128479, + "learning_rate": 4.4403108235516735e-05, + "loss": 4.5063, + "step": 36519 + }, + { + "epoch": 0.21719478542201923, + "grad_norm": 2.342440128326416, + "learning_rate": 4.440281368917598e-05, + "loss": 3.8184, + "step": 36520 + }, + { + "epoch": 0.21720073270530021, + "grad_norm": 2.439656972885132, + "learning_rate": 4.4402519136061897e-05, + "loss": 2.9155, + "step": 36521 + }, + { + "epoch": 0.2172066799885812, + "grad_norm": 2.338834047317505, + "learning_rate": 4.44022245761746e-05, + "loss": 2.8786, + "step": 36522 + }, + { + "epoch": 0.21721262727186222, + "grad_norm": 3.4408295154571533, + "learning_rate": 4.4401930009514194e-05, + "loss": 2.6139, + "step": 36523 + }, + { + "epoch": 0.2172185745551432, + "grad_norm": 1.776883602142334, + "learning_rate": 4.440163543608077e-05, + "loss": 4.7785, + "step": 36524 + }, + { + "epoch": 0.2172245218384242, + "grad_norm": 1.5086807012557983, + "learning_rate": 4.440134085587443e-05, + "loss": 4.5959, + "step": 36525 + }, + { + "epoch": 0.2172304691217052, + "grad_norm": 2.188570499420166, + "learning_rate": 4.440104626889529e-05, + "loss": 4.1522, + "step": 36526 + }, + { + "epoch": 0.2172364164049862, + "grad_norm": 2.371351718902588, + "learning_rate": 4.4400751675143436e-05, + "loss": 5.1309, + "step": 36527 + }, + { + "epoch": 0.2172423636882672, + "grad_norm": 2.145080804824829, + "learning_rate": 4.4400457074618987e-05, + "loss": 3.662, + "step": 36528 + }, + { + "epoch": 0.2172483109715482, + "grad_norm": 1.631962776184082, + "learning_rate": 4.4400162467322034e-05, + "loss": 4.1333, + "step": 36529 + }, + { + "epoch": 0.2172542582548292, + "grad_norm": 2.0072202682495117, + "learning_rate": 4.439986785325269e-05, + "loss": 4.9659, + "step": 36530 + }, + { + "epoch": 0.21726020553811018, + "grad_norm": 1.7635982036590576, + "learning_rate": 4.439957323241105e-05, + "loss": 4.8684, + "step": 36531 + }, + { + "epoch": 0.2172661528213912, + "grad_norm": 1.6722582578659058, + "learning_rate": 4.4399278604797225e-05, + "loss": 4.7648, + "step": 36532 + }, + { + "epoch": 0.21727210010467218, + "grad_norm": 1.4029744863510132, + "learning_rate": 4.43989839704113e-05, + "loss": 4.8315, + "step": 36533 + }, + { + "epoch": 0.21727804738795317, + "grad_norm": 1.951603651046753, + "learning_rate": 4.43986893292534e-05, + "loss": 5.0957, + "step": 36534 + }, + { + "epoch": 0.2172839946712342, + "grad_norm": 1.844542145729065, + "learning_rate": 4.439839468132362e-05, + "loss": 4.767, + "step": 36535 + }, + { + "epoch": 0.21728994195451518, + "grad_norm": 1.7336167097091675, + "learning_rate": 4.4398100026622053e-05, + "loss": 4.3439, + "step": 36536 + }, + { + "epoch": 0.21729588923779616, + "grad_norm": 2.220607280731201, + "learning_rate": 4.439780536514881e-05, + "loss": 4.6749, + "step": 36537 + }, + { + "epoch": 0.21730183652107718, + "grad_norm": 1.6565178632736206, + "learning_rate": 4.4397510696904e-05, + "loss": 4.5169, + "step": 36538 + }, + { + "epoch": 0.21730778380435817, + "grad_norm": 1.898890495300293, + "learning_rate": 4.439721602188771e-05, + "loss": 4.5224, + "step": 36539 + }, + { + "epoch": 0.21731373108763916, + "grad_norm": 1.4901927709579468, + "learning_rate": 4.439692134010006e-05, + "loss": 4.4407, + "step": 36540 + }, + { + "epoch": 0.21731967837092017, + "grad_norm": 1.7129950523376465, + "learning_rate": 4.4396626651541144e-05, + "loss": 4.7336, + "step": 36541 + }, + { + "epoch": 0.21732562565420116, + "grad_norm": 1.7661831378936768, + "learning_rate": 4.439633195621107e-05, + "loss": 4.7136, + "step": 36542 + }, + { + "epoch": 0.21733157293748215, + "grad_norm": 1.171346664428711, + "learning_rate": 4.4396037254109926e-05, + "loss": 5.3083, + "step": 36543 + }, + { + "epoch": 0.21733752022076316, + "grad_norm": 1.7209404706954956, + "learning_rate": 4.439574254523783e-05, + "loss": 5.1993, + "step": 36544 + }, + { + "epoch": 0.21734346750404415, + "grad_norm": 1.5561963319778442, + "learning_rate": 4.4395447829594884e-05, + "loss": 4.9057, + "step": 36545 + }, + { + "epoch": 0.21734941478732514, + "grad_norm": 2.2409908771514893, + "learning_rate": 4.439515310718119e-05, + "loss": 3.9832, + "step": 36546 + }, + { + "epoch": 0.21735536207060616, + "grad_norm": 1.7077785730361938, + "learning_rate": 4.4394858377996844e-05, + "loss": 4.7167, + "step": 36547 + }, + { + "epoch": 0.21736130935388714, + "grad_norm": 2.373032569885254, + "learning_rate": 4.439456364204195e-05, + "loss": 3.1113, + "step": 36548 + }, + { + "epoch": 0.21736725663716813, + "grad_norm": 2.4456026554107666, + "learning_rate": 4.439426889931662e-05, + "loss": 2.9454, + "step": 36549 + }, + { + "epoch": 0.21737320392044915, + "grad_norm": 2.256770133972168, + "learning_rate": 4.439397414982095e-05, + "loss": 2.8073, + "step": 36550 + }, + { + "epoch": 0.21737915120373014, + "grad_norm": 2.642658233642578, + "learning_rate": 4.4393679393555045e-05, + "loss": 2.8812, + "step": 36551 + }, + { + "epoch": 0.21738509848701112, + "grad_norm": 2.680724859237671, + "learning_rate": 4.439338463051901e-05, + "loss": 3.0972, + "step": 36552 + }, + { + "epoch": 0.21739104577029214, + "grad_norm": 1.975277304649353, + "learning_rate": 4.439308986071293e-05, + "loss": 3.9978, + "step": 36553 + }, + { + "epoch": 0.21739699305357313, + "grad_norm": 1.7531930208206177, + "learning_rate": 4.4392795084136934e-05, + "loss": 4.764, + "step": 36554 + }, + { + "epoch": 0.21740294033685412, + "grad_norm": 1.7737468481063843, + "learning_rate": 4.4392500300791116e-05, + "loss": 4.5489, + "step": 36555 + }, + { + "epoch": 0.21740888762013513, + "grad_norm": 1.6888933181762695, + "learning_rate": 4.439220551067557e-05, + "loss": 4.4714, + "step": 36556 + }, + { + "epoch": 0.21741483490341612, + "grad_norm": 2.4339828491210938, + "learning_rate": 4.439191071379041e-05, + "loss": 3.4193, + "step": 36557 + }, + { + "epoch": 0.2174207821866971, + "grad_norm": 2.4790961742401123, + "learning_rate": 4.4391615910135734e-05, + "loss": 3.3395, + "step": 36558 + }, + { + "epoch": 0.21742672946997812, + "grad_norm": 4.457415580749512, + "learning_rate": 4.439132109971164e-05, + "loss": 3.7631, + "step": 36559 + }, + { + "epoch": 0.2174326767532591, + "grad_norm": 4.010805130004883, + "learning_rate": 4.4391026282518245e-05, + "loss": 4.032, + "step": 36560 + }, + { + "epoch": 0.2174386240365401, + "grad_norm": 1.6782433986663818, + "learning_rate": 4.439073145855563e-05, + "loss": 3.9182, + "step": 36561 + }, + { + "epoch": 0.21744457131982112, + "grad_norm": 1.3384885787963867, + "learning_rate": 4.4390436627823924e-05, + "loss": 4.0683, + "step": 36562 + }, + { + "epoch": 0.2174505186031021, + "grad_norm": 1.5583269596099854, + "learning_rate": 4.439014179032321e-05, + "loss": 3.9236, + "step": 36563 + }, + { + "epoch": 0.2174564658863831, + "grad_norm": 1.5254034996032715, + "learning_rate": 4.4389846946053605e-05, + "loss": 3.8808, + "step": 36564 + }, + { + "epoch": 0.2174624131696641, + "grad_norm": 1.4916062355041504, + "learning_rate": 4.43895520950152e-05, + "loss": 4.0083, + "step": 36565 + }, + { + "epoch": 0.2174683604529451, + "grad_norm": 1.4041520357131958, + "learning_rate": 4.4389257237208095e-05, + "loss": 3.9174, + "step": 36566 + }, + { + "epoch": 0.21747430773622609, + "grad_norm": 1.4670138359069824, + "learning_rate": 4.438896237263241e-05, + "loss": 3.8039, + "step": 36567 + }, + { + "epoch": 0.2174802550195071, + "grad_norm": 1.4441214799880981, + "learning_rate": 4.4388667501288234e-05, + "loss": 3.8671, + "step": 36568 + }, + { + "epoch": 0.2174862023027881, + "grad_norm": 1.3265955448150635, + "learning_rate": 4.438837262317568e-05, + "loss": 3.8891, + "step": 36569 + }, + { + "epoch": 0.21749214958606908, + "grad_norm": 1.4868719577789307, + "learning_rate": 4.438807773829484e-05, + "loss": 3.8434, + "step": 36570 + }, + { + "epoch": 0.2174980968693501, + "grad_norm": 1.3975825309753418, + "learning_rate": 4.4387782846645826e-05, + "loss": 3.8614, + "step": 36571 + }, + { + "epoch": 0.21750404415263108, + "grad_norm": 1.457074522972107, + "learning_rate": 4.438748794822873e-05, + "loss": 4.0471, + "step": 36572 + }, + { + "epoch": 0.21750999143591207, + "grad_norm": 1.4638450145721436, + "learning_rate": 4.4387193043043675e-05, + "loss": 4.049, + "step": 36573 + }, + { + "epoch": 0.21751593871919309, + "grad_norm": 1.5106879472732544, + "learning_rate": 4.4386898131090744e-05, + "loss": 4.0879, + "step": 36574 + }, + { + "epoch": 0.21752188600247407, + "grad_norm": 1.399688720703125, + "learning_rate": 4.438660321237004e-05, + "loss": 3.9919, + "step": 36575 + }, + { + "epoch": 0.21752783328575506, + "grad_norm": 1.3647470474243164, + "learning_rate": 4.4386308286881685e-05, + "loss": 4.1388, + "step": 36576 + }, + { + "epoch": 0.21753378056903608, + "grad_norm": 1.491940975189209, + "learning_rate": 4.438601335462577e-05, + "loss": 3.9717, + "step": 36577 + }, + { + "epoch": 0.21753972785231707, + "grad_norm": 1.417490839958191, + "learning_rate": 4.438571841560239e-05, + "loss": 4.0151, + "step": 36578 + }, + { + "epoch": 0.21754567513559805, + "grad_norm": 1.3765039443969727, + "learning_rate": 4.438542346981166e-05, + "loss": 4.2077, + "step": 36579 + }, + { + "epoch": 0.21755162241887904, + "grad_norm": 3.1067752838134766, + "learning_rate": 4.438512851725368e-05, + "loss": 4.5656, + "step": 36580 + }, + { + "epoch": 0.21755756970216006, + "grad_norm": 1.3396860361099243, + "learning_rate": 4.4384833557928553e-05, + "loss": 3.9936, + "step": 36581 + }, + { + "epoch": 0.21756351698544105, + "grad_norm": 1.3664778470993042, + "learning_rate": 4.438453859183637e-05, + "loss": 3.9645, + "step": 36582 + }, + { + "epoch": 0.21756946426872203, + "grad_norm": 1.8627538681030273, + "learning_rate": 4.438424361897725e-05, + "loss": 4.3642, + "step": 36583 + }, + { + "epoch": 0.21757541155200305, + "grad_norm": 1.9578425884246826, + "learning_rate": 4.43839486393513e-05, + "loss": 3.8721, + "step": 36584 + }, + { + "epoch": 0.21758135883528404, + "grad_norm": 2.2273995876312256, + "learning_rate": 4.438365365295861e-05, + "loss": 3.5968, + "step": 36585 + }, + { + "epoch": 0.21758730611856503, + "grad_norm": 1.7115248441696167, + "learning_rate": 4.438335865979928e-05, + "loss": 5.0829, + "step": 36586 + }, + { + "epoch": 0.21759325340184604, + "grad_norm": 1.6734553575515747, + "learning_rate": 4.4383063659873426e-05, + "loss": 5.0234, + "step": 36587 + }, + { + "epoch": 0.21759920068512703, + "grad_norm": 1.8772227764129639, + "learning_rate": 4.438276865318114e-05, + "loss": 5.0525, + "step": 36588 + }, + { + "epoch": 0.21760514796840802, + "grad_norm": 2.0212247371673584, + "learning_rate": 4.438247363972253e-05, + "loss": 3.7799, + "step": 36589 + }, + { + "epoch": 0.21761109525168904, + "grad_norm": 2.225306987762451, + "learning_rate": 4.43821786194977e-05, + "loss": 3.6704, + "step": 36590 + }, + { + "epoch": 0.21761704253497002, + "grad_norm": 2.1250686645507812, + "learning_rate": 4.438188359250676e-05, + "loss": 3.5848, + "step": 36591 + }, + { + "epoch": 0.217622989818251, + "grad_norm": 2.0238659381866455, + "learning_rate": 4.43815885587498e-05, + "loss": 3.5389, + "step": 36592 + }, + { + "epoch": 0.21762893710153203, + "grad_norm": 2.118901014328003, + "learning_rate": 4.438129351822692e-05, + "loss": 3.6547, + "step": 36593 + }, + { + "epoch": 0.21763488438481302, + "grad_norm": 1.7090996503829956, + "learning_rate": 4.438099847093824e-05, + "loss": 4.2832, + "step": 36594 + }, + { + "epoch": 0.217640831668094, + "grad_norm": 2.008913993835449, + "learning_rate": 4.438070341688385e-05, + "loss": 3.5492, + "step": 36595 + }, + { + "epoch": 0.21764677895137502, + "grad_norm": 1.9323056936264038, + "learning_rate": 4.438040835606385e-05, + "loss": 3.769, + "step": 36596 + }, + { + "epoch": 0.217652726234656, + "grad_norm": 2.1152114868164062, + "learning_rate": 4.438011328847836e-05, + "loss": 3.5528, + "step": 36597 + }, + { + "epoch": 0.217658673517937, + "grad_norm": 2.274790048599243, + "learning_rate": 4.437981821412746e-05, + "loss": 3.5154, + "step": 36598 + }, + { + "epoch": 0.217664620801218, + "grad_norm": 1.9242825508117676, + "learning_rate": 4.437952313301128e-05, + "loss": 3.5931, + "step": 36599 + }, + { + "epoch": 0.217670568084499, + "grad_norm": 1.7859545946121216, + "learning_rate": 4.437922804512991e-05, + "loss": 4.3912, + "step": 36600 + }, + { + "epoch": 0.21767651536778, + "grad_norm": 2.2939436435699463, + "learning_rate": 4.437893295048344e-05, + "loss": 3.3681, + "step": 36601 + }, + { + "epoch": 0.217682462651061, + "grad_norm": 2.1420838832855225, + "learning_rate": 4.437863784907199e-05, + "loss": 3.576, + "step": 36602 + }, + { + "epoch": 0.217688409934342, + "grad_norm": 2.0002729892730713, + "learning_rate": 4.4378342740895656e-05, + "loss": 3.4117, + "step": 36603 + }, + { + "epoch": 0.21769435721762298, + "grad_norm": 2.1192610263824463, + "learning_rate": 4.4378047625954546e-05, + "loss": 3.3726, + "step": 36604 + }, + { + "epoch": 0.217700304500904, + "grad_norm": 2.092013359069824, + "learning_rate": 4.437775250424876e-05, + "loss": 3.4378, + "step": 36605 + }, + { + "epoch": 0.21770625178418498, + "grad_norm": 1.9066373109817505, + "learning_rate": 4.43774573757784e-05, + "loss": 4.4583, + "step": 36606 + }, + { + "epoch": 0.21771219906746597, + "grad_norm": 1.9746664762496948, + "learning_rate": 4.437716224054357e-05, + "loss": 4.5363, + "step": 36607 + }, + { + "epoch": 0.217718146350747, + "grad_norm": 2.223649263381958, + "learning_rate": 4.4376867098544364e-05, + "loss": 4.7908, + "step": 36608 + }, + { + "epoch": 0.21772409363402798, + "grad_norm": 2.136047840118408, + "learning_rate": 4.43765719497809e-05, + "loss": 4.792, + "step": 36609 + }, + { + "epoch": 0.21773004091730896, + "grad_norm": 1.7172813415527344, + "learning_rate": 4.437627679425328e-05, + "loss": 4.7565, + "step": 36610 + }, + { + "epoch": 0.21773598820058998, + "grad_norm": 2.0952224731445312, + "learning_rate": 4.43759816319616e-05, + "loss": 5.0642, + "step": 36611 + }, + { + "epoch": 0.21774193548387097, + "grad_norm": 1.7799891233444214, + "learning_rate": 4.437568646290596e-05, + "loss": 5.0625, + "step": 36612 + }, + { + "epoch": 0.21774788276715196, + "grad_norm": 1.6467608213424683, + "learning_rate": 4.437539128708647e-05, + "loss": 5.0044, + "step": 36613 + }, + { + "epoch": 0.21775383005043297, + "grad_norm": 1.7772294282913208, + "learning_rate": 4.4375096104503236e-05, + "loss": 4.9322, + "step": 36614 + }, + { + "epoch": 0.21775977733371396, + "grad_norm": 1.634451985359192, + "learning_rate": 4.437480091515635e-05, + "loss": 4.7343, + "step": 36615 + }, + { + "epoch": 0.21776572461699495, + "grad_norm": 1.9060132503509521, + "learning_rate": 4.4374505719045924e-05, + "loss": 5.1219, + "step": 36616 + }, + { + "epoch": 0.21777167190027596, + "grad_norm": 1.6871609687805176, + "learning_rate": 4.437421051617205e-05, + "loss": 4.9647, + "step": 36617 + }, + { + "epoch": 0.21777761918355695, + "grad_norm": 1.503361463546753, + "learning_rate": 4.4373915306534854e-05, + "loss": 5.1347, + "step": 36618 + }, + { + "epoch": 0.21778356646683794, + "grad_norm": 1.5180853605270386, + "learning_rate": 4.4373620090134415e-05, + "loss": 5.1036, + "step": 36619 + }, + { + "epoch": 0.21778951375011896, + "grad_norm": 1.4979952573776245, + "learning_rate": 4.437332486697085e-05, + "loss": 4.7413, + "step": 36620 + }, + { + "epoch": 0.21779546103339995, + "grad_norm": 1.615602731704712, + "learning_rate": 4.437302963704425e-05, + "loss": 5.0448, + "step": 36621 + }, + { + "epoch": 0.21780140831668093, + "grad_norm": 1.4078423976898193, + "learning_rate": 4.437273440035473e-05, + "loss": 5.3924, + "step": 36622 + }, + { + "epoch": 0.21780735559996195, + "grad_norm": 1.4002037048339844, + "learning_rate": 4.437243915690239e-05, + "loss": 5.1455, + "step": 36623 + }, + { + "epoch": 0.21781330288324294, + "grad_norm": 1.3948068618774414, + "learning_rate": 4.4372143906687336e-05, + "loss": 5.186, + "step": 36624 + }, + { + "epoch": 0.21781925016652393, + "grad_norm": 1.7832770347595215, + "learning_rate": 4.4371848649709655e-05, + "loss": 5.2705, + "step": 36625 + }, + { + "epoch": 0.21782519744980494, + "grad_norm": 1.6136474609375, + "learning_rate": 4.437155338596948e-05, + "loss": 5.3217, + "step": 36626 + }, + { + "epoch": 0.21783114473308593, + "grad_norm": 1.701969027519226, + "learning_rate": 4.437125811546687e-05, + "loss": 5.3073, + "step": 36627 + }, + { + "epoch": 0.21783709201636692, + "grad_norm": 1.6547741889953613, + "learning_rate": 4.437096283820198e-05, + "loss": 5.3173, + "step": 36628 + }, + { + "epoch": 0.21784303929964793, + "grad_norm": 1.7442517280578613, + "learning_rate": 4.437066755417487e-05, + "loss": 5.6794, + "step": 36629 + }, + { + "epoch": 0.21784898658292892, + "grad_norm": 1.4823875427246094, + "learning_rate": 4.437037226338566e-05, + "loss": 5.6783, + "step": 36630 + }, + { + "epoch": 0.2178549338662099, + "grad_norm": 1.6044561862945557, + "learning_rate": 4.437007696583446e-05, + "loss": 4.1659, + "step": 36631 + }, + { + "epoch": 0.21786088114949093, + "grad_norm": 1.591406226158142, + "learning_rate": 4.436978166152137e-05, + "loss": 5.0407, + "step": 36632 + }, + { + "epoch": 0.21786682843277191, + "grad_norm": 1.8867915868759155, + "learning_rate": 4.436948635044648e-05, + "loss": 5.2259, + "step": 36633 + }, + { + "epoch": 0.2178727757160529, + "grad_norm": 1.458233118057251, + "learning_rate": 4.4369191032609905e-05, + "loss": 5.1809, + "step": 36634 + }, + { + "epoch": 0.21787872299933392, + "grad_norm": 1.5315808057785034, + "learning_rate": 4.436889570801175e-05, + "loss": 5.5395, + "step": 36635 + }, + { + "epoch": 0.2178846702826149, + "grad_norm": 1.4725675582885742, + "learning_rate": 4.436860037665211e-05, + "loss": 4.6842, + "step": 36636 + }, + { + "epoch": 0.2178906175658959, + "grad_norm": 1.55316162109375, + "learning_rate": 4.4368305038531095e-05, + "loss": 4.8684, + "step": 36637 + }, + { + "epoch": 0.21789656484917688, + "grad_norm": 1.693979024887085, + "learning_rate": 4.43680096936488e-05, + "loss": 5.1906, + "step": 36638 + }, + { + "epoch": 0.2179025121324579, + "grad_norm": 1.5667452812194824, + "learning_rate": 4.436771434200534e-05, + "loss": 4.9815, + "step": 36639 + }, + { + "epoch": 0.2179084594157389, + "grad_norm": 1.8694840669631958, + "learning_rate": 4.436741898360081e-05, + "loss": 4.6853, + "step": 36640 + }, + { + "epoch": 0.21791440669901987, + "grad_norm": 2.4778950214385986, + "learning_rate": 4.43671236184353e-05, + "loss": 5.0781, + "step": 36641 + }, + { + "epoch": 0.2179203539823009, + "grad_norm": 2.2444334030151367, + "learning_rate": 4.436682824650894e-05, + "loss": 4.4174, + "step": 36642 + }, + { + "epoch": 0.21792630126558188, + "grad_norm": 2.237233877182007, + "learning_rate": 4.4366532867821816e-05, + "loss": 5.1541, + "step": 36643 + }, + { + "epoch": 0.21793224854886287, + "grad_norm": 1.7043414115905762, + "learning_rate": 4.436623748237404e-05, + "loss": 5.0169, + "step": 36644 + }, + { + "epoch": 0.21793819583214388, + "grad_norm": 2.007829427719116, + "learning_rate": 4.4365942090165705e-05, + "loss": 4.0678, + "step": 36645 + }, + { + "epoch": 0.21794414311542487, + "grad_norm": 3.07391357421875, + "learning_rate": 4.4365646691196923e-05, + "loss": 2.7182, + "step": 36646 + }, + { + "epoch": 0.21795009039870586, + "grad_norm": 1.5059764385223389, + "learning_rate": 4.4365351285467796e-05, + "loss": 5.1282, + "step": 36647 + }, + { + "epoch": 0.21795603768198687, + "grad_norm": 2.457430839538574, + "learning_rate": 4.4365055872978424e-05, + "loss": 3.8752, + "step": 36648 + }, + { + "epoch": 0.21796198496526786, + "grad_norm": 2.5790088176727295, + "learning_rate": 4.43647604537289e-05, + "loss": 3.3587, + "step": 36649 + }, + { + "epoch": 0.21796793224854885, + "grad_norm": 2.324841260910034, + "learning_rate": 4.4364465027719346e-05, + "loss": 3.1681, + "step": 36650 + }, + { + "epoch": 0.21797387953182987, + "grad_norm": 2.64815092086792, + "learning_rate": 4.436416959494987e-05, + "loss": 2.97, + "step": 36651 + }, + { + "epoch": 0.21797982681511086, + "grad_norm": 1.9580894708633423, + "learning_rate": 4.436387415542055e-05, + "loss": 3.4777, + "step": 36652 + }, + { + "epoch": 0.21798577409839184, + "grad_norm": 2.4816932678222656, + "learning_rate": 4.43635787091315e-05, + "loss": 4.0422, + "step": 36653 + }, + { + "epoch": 0.21799172138167286, + "grad_norm": 2.4435534477233887, + "learning_rate": 4.436328325608283e-05, + "loss": 4.4932, + "step": 36654 + }, + { + "epoch": 0.21799766866495385, + "grad_norm": 1.6799771785736084, + "learning_rate": 4.436298779627463e-05, + "loss": 5.1482, + "step": 36655 + }, + { + "epoch": 0.21800361594823484, + "grad_norm": 1.5511815547943115, + "learning_rate": 4.436269232970702e-05, + "loss": 5.0648, + "step": 36656 + }, + { + "epoch": 0.21800956323151585, + "grad_norm": 1.6648553609848022, + "learning_rate": 4.436239685638008e-05, + "loss": 4.9462, + "step": 36657 + }, + { + "epoch": 0.21801551051479684, + "grad_norm": 1.543513298034668, + "learning_rate": 4.436210137629394e-05, + "loss": 4.8705, + "step": 36658 + }, + { + "epoch": 0.21802145779807783, + "grad_norm": 1.799896478652954, + "learning_rate": 4.436180588944869e-05, + "loss": 4.5315, + "step": 36659 + }, + { + "epoch": 0.21802740508135884, + "grad_norm": 1.4603382349014282, + "learning_rate": 4.4361510395844435e-05, + "loss": 5.2586, + "step": 36660 + }, + { + "epoch": 0.21803335236463983, + "grad_norm": 1.6055859327316284, + "learning_rate": 4.4361214895481277e-05, + "loss": 5.2098, + "step": 36661 + }, + { + "epoch": 0.21803929964792082, + "grad_norm": 1.6412562131881714, + "learning_rate": 4.436091938835931e-05, + "loss": 4.6264, + "step": 36662 + }, + { + "epoch": 0.21804524693120184, + "grad_norm": 1.660805106163025, + "learning_rate": 4.436062387447866e-05, + "loss": 4.0578, + "step": 36663 + }, + { + "epoch": 0.21805119421448282, + "grad_norm": 2.2433223724365234, + "learning_rate": 4.43603283538394e-05, + "loss": 4.6199, + "step": 36664 + }, + { + "epoch": 0.2180571414977638, + "grad_norm": 1.8339132070541382, + "learning_rate": 4.4360032826441654e-05, + "loss": 4.2583, + "step": 36665 + }, + { + "epoch": 0.21806308878104483, + "grad_norm": 1.7962074279785156, + "learning_rate": 4.435973729228553e-05, + "loss": 4.4523, + "step": 36666 + }, + { + "epoch": 0.21806903606432582, + "grad_norm": 1.5702697038650513, + "learning_rate": 4.435944175137111e-05, + "loss": 4.7028, + "step": 36667 + }, + { + "epoch": 0.2180749833476068, + "grad_norm": 1.853299617767334, + "learning_rate": 4.435914620369852e-05, + "loss": 4.4249, + "step": 36668 + }, + { + "epoch": 0.21808093063088782, + "grad_norm": 1.8155862092971802, + "learning_rate": 4.435885064926785e-05, + "loss": 5.0144, + "step": 36669 + }, + { + "epoch": 0.2180868779141688, + "grad_norm": 1.5878140926361084, + "learning_rate": 4.435855508807919e-05, + "loss": 4.848, + "step": 36670 + }, + { + "epoch": 0.2180928251974498, + "grad_norm": 1.7106904983520508, + "learning_rate": 4.435825952013267e-05, + "loss": 4.9348, + "step": 36671 + }, + { + "epoch": 0.2180987724807308, + "grad_norm": 1.9532265663146973, + "learning_rate": 4.435796394542839e-05, + "loss": 5.4063, + "step": 36672 + }, + { + "epoch": 0.2181047197640118, + "grad_norm": 1.745326042175293, + "learning_rate": 4.435766836396643e-05, + "loss": 5.1079, + "step": 36673 + }, + { + "epoch": 0.2181106670472928, + "grad_norm": 1.9781886339187622, + "learning_rate": 4.435737277574692e-05, + "loss": 4.7416, + "step": 36674 + }, + { + "epoch": 0.2181166143305738, + "grad_norm": 1.6559377908706665, + "learning_rate": 4.435707718076994e-05, + "loss": 4.7594, + "step": 36675 + }, + { + "epoch": 0.2181225616138548, + "grad_norm": 1.8455668687820435, + "learning_rate": 4.435678157903561e-05, + "loss": 5.3747, + "step": 36676 + }, + { + "epoch": 0.21812850889713578, + "grad_norm": 1.821624994277954, + "learning_rate": 4.4356485970544026e-05, + "loss": 5.559, + "step": 36677 + }, + { + "epoch": 0.2181344561804168, + "grad_norm": 1.5654447078704834, + "learning_rate": 4.4356190355295295e-05, + "loss": 4.9508, + "step": 36678 + }, + { + "epoch": 0.21814040346369779, + "grad_norm": 1.6591967344284058, + "learning_rate": 4.435589473328952e-05, + "loss": 4.6972, + "step": 36679 + }, + { + "epoch": 0.21814635074697877, + "grad_norm": 1.548817753791809, + "learning_rate": 4.43555991045268e-05, + "loss": 4.7788, + "step": 36680 + }, + { + "epoch": 0.2181522980302598, + "grad_norm": 1.8624293804168701, + "learning_rate": 4.435530346900724e-05, + "loss": 5.0808, + "step": 36681 + }, + { + "epoch": 0.21815824531354078, + "grad_norm": 1.5332111120224, + "learning_rate": 4.435500782673094e-05, + "loss": 4.9267, + "step": 36682 + }, + { + "epoch": 0.21816419259682177, + "grad_norm": 1.3629741668701172, + "learning_rate": 4.435471217769801e-05, + "loss": 4.7387, + "step": 36683 + }, + { + "epoch": 0.21817013988010278, + "grad_norm": 1.6867749691009521, + "learning_rate": 4.4354416521908546e-05, + "loss": 4.9142, + "step": 36684 + }, + { + "epoch": 0.21817608716338377, + "grad_norm": 1.4487711191177368, + "learning_rate": 4.435412085936266e-05, + "loss": 5.7823, + "step": 36685 + }, + { + "epoch": 0.21818203444666476, + "grad_norm": 1.579579472541809, + "learning_rate": 4.435382519006045e-05, + "loss": 5.6165, + "step": 36686 + }, + { + "epoch": 0.21818798172994577, + "grad_norm": 1.6927639245986938, + "learning_rate": 4.435352951400202e-05, + "loss": 4.7974, + "step": 36687 + }, + { + "epoch": 0.21819392901322676, + "grad_norm": 1.4306942224502563, + "learning_rate": 4.4353233831187466e-05, + "loss": 5.0413, + "step": 36688 + }, + { + "epoch": 0.21819987629650775, + "grad_norm": 1.5357601642608643, + "learning_rate": 4.4352938141616906e-05, + "loss": 4.9543, + "step": 36689 + }, + { + "epoch": 0.21820582357978877, + "grad_norm": 1.5910861492156982, + "learning_rate": 4.435264244529044e-05, + "loss": 5.3783, + "step": 36690 + }, + { + "epoch": 0.21821177086306975, + "grad_norm": 1.5916094779968262, + "learning_rate": 4.435234674220816e-05, + "loss": 5.7495, + "step": 36691 + }, + { + "epoch": 0.21821771814635074, + "grad_norm": 1.7037458419799805, + "learning_rate": 4.4352051032370175e-05, + "loss": 4.8695, + "step": 36692 + }, + { + "epoch": 0.21822366542963176, + "grad_norm": 1.7072062492370605, + "learning_rate": 4.435175531577659e-05, + "loss": 4.7444, + "step": 36693 + }, + { + "epoch": 0.21822961271291275, + "grad_norm": 1.3686712980270386, + "learning_rate": 4.43514595924275e-05, + "loss": 4.9637, + "step": 36694 + }, + { + "epoch": 0.21823555999619373, + "grad_norm": 1.5178147554397583, + "learning_rate": 4.435116386232302e-05, + "loss": 5.0143, + "step": 36695 + }, + { + "epoch": 0.21824150727947472, + "grad_norm": 1.7472846508026123, + "learning_rate": 4.4350868125463254e-05, + "loss": 5.0185, + "step": 36696 + }, + { + "epoch": 0.21824745456275574, + "grad_norm": 1.6126089096069336, + "learning_rate": 4.43505723818483e-05, + "loss": 5.2427, + "step": 36697 + }, + { + "epoch": 0.21825340184603673, + "grad_norm": 1.8243975639343262, + "learning_rate": 4.435027663147825e-05, + "loss": 4.7768, + "step": 36698 + }, + { + "epoch": 0.21825934912931771, + "grad_norm": 1.7908824682235718, + "learning_rate": 4.434998087435323e-05, + "loss": 5.1686, + "step": 36699 + }, + { + "epoch": 0.21826529641259873, + "grad_norm": 1.4360270500183105, + "learning_rate": 4.434968511047333e-05, + "loss": 4.9374, + "step": 36700 + }, + { + "epoch": 0.21827124369587972, + "grad_norm": 2.2321109771728516, + "learning_rate": 4.434938933983864e-05, + "loss": 4.9715, + "step": 36701 + }, + { + "epoch": 0.2182771909791607, + "grad_norm": 1.8325153589248657, + "learning_rate": 4.43490935624493e-05, + "loss": 4.8856, + "step": 36702 + }, + { + "epoch": 0.21828313826244172, + "grad_norm": 1.6531904935836792, + "learning_rate": 4.434879777830538e-05, + "loss": 4.5969, + "step": 36703 + }, + { + "epoch": 0.2182890855457227, + "grad_norm": 2.921745777130127, + "learning_rate": 4.434850198740699e-05, + "loss": 4.6352, + "step": 36704 + }, + { + "epoch": 0.2182950328290037, + "grad_norm": 1.4636532068252563, + "learning_rate": 4.434820618975425e-05, + "loss": 5.6227, + "step": 36705 + }, + { + "epoch": 0.21830098011228471, + "grad_norm": 1.5883086919784546, + "learning_rate": 4.434791038534724e-05, + "loss": 5.5488, + "step": 36706 + }, + { + "epoch": 0.2183069273955657, + "grad_norm": 1.7048338651657104, + "learning_rate": 4.434761457418608e-05, + "loss": 4.8811, + "step": 36707 + }, + { + "epoch": 0.2183128746788467, + "grad_norm": 3.260709524154663, + "learning_rate": 4.4347318756270864e-05, + "loss": 3.5482, + "step": 36708 + }, + { + "epoch": 0.2183188219621277, + "grad_norm": 3.1779022216796875, + "learning_rate": 4.43470229316017e-05, + "loss": 3.4502, + "step": 36709 + }, + { + "epoch": 0.2183247692454087, + "grad_norm": 2.1265110969543457, + "learning_rate": 4.4346727100178696e-05, + "loss": 4.5825, + "step": 36710 + }, + { + "epoch": 0.21833071652868968, + "grad_norm": 1.619808316230774, + "learning_rate": 4.434643126200194e-05, + "loss": 5.3028, + "step": 36711 + }, + { + "epoch": 0.2183366638119707, + "grad_norm": 3.3339853286743164, + "learning_rate": 4.434613541707156e-05, + "loss": 3.2459, + "step": 36712 + }, + { + "epoch": 0.2183426110952517, + "grad_norm": 3.406125068664551, + "learning_rate": 4.4345839565387626e-05, + "loss": 2.936, + "step": 36713 + }, + { + "epoch": 0.21834855837853268, + "grad_norm": 2.45475697517395, + "learning_rate": 4.4345543706950274e-05, + "loss": 3.0101, + "step": 36714 + }, + { + "epoch": 0.2183545056618137, + "grad_norm": 2.70192551612854, + "learning_rate": 4.434524784175958e-05, + "loss": 3.2301, + "step": 36715 + }, + { + "epoch": 0.21836045294509468, + "grad_norm": 2.4062118530273438, + "learning_rate": 4.434495196981567e-05, + "loss": 2.8497, + "step": 36716 + }, + { + "epoch": 0.21836640022837567, + "grad_norm": 2.3117856979370117, + "learning_rate": 4.434465609111863e-05, + "loss": 2.8549, + "step": 36717 + }, + { + "epoch": 0.21837234751165668, + "grad_norm": 2.4423859119415283, + "learning_rate": 4.434436020566857e-05, + "loss": 2.849, + "step": 36718 + }, + { + "epoch": 0.21837829479493767, + "grad_norm": 2.377615213394165, + "learning_rate": 4.43440643134656e-05, + "loss": 2.6986, + "step": 36719 + }, + { + "epoch": 0.21838424207821866, + "grad_norm": 2.4111328125, + "learning_rate": 4.434376841450981e-05, + "loss": 2.7673, + "step": 36720 + }, + { + "epoch": 0.21839018936149968, + "grad_norm": 2.4199485778808594, + "learning_rate": 4.434347250880132e-05, + "loss": 3.4974, + "step": 36721 + }, + { + "epoch": 0.21839613664478066, + "grad_norm": 2.2651684284210205, + "learning_rate": 4.434317659634022e-05, + "loss": 3.073, + "step": 36722 + }, + { + "epoch": 0.21840208392806165, + "grad_norm": 2.3775131702423096, + "learning_rate": 4.4342880677126606e-05, + "loss": 3.1214, + "step": 36723 + }, + { + "epoch": 0.21840803121134267, + "grad_norm": 2.3316352367401123, + "learning_rate": 4.434258475116061e-05, + "loss": 3.1617, + "step": 36724 + }, + { + "epoch": 0.21841397849462366, + "grad_norm": 2.312774181365967, + "learning_rate": 4.43422888184423e-05, + "loss": 2.7227, + "step": 36725 + }, + { + "epoch": 0.21841992577790464, + "grad_norm": 2.354297161102295, + "learning_rate": 4.434199287897181e-05, + "loss": 3.3812, + "step": 36726 + }, + { + "epoch": 0.21842587306118566, + "grad_norm": 1.8631865978240967, + "learning_rate": 4.4341696932749224e-05, + "loss": 3.9094, + "step": 36727 + }, + { + "epoch": 0.21843182034446665, + "grad_norm": 2.2943942546844482, + "learning_rate": 4.434140097977465e-05, + "loss": 2.5553, + "step": 36728 + }, + { + "epoch": 0.21843776762774764, + "grad_norm": 2.2607760429382324, + "learning_rate": 4.43411050200482e-05, + "loss": 3.1659, + "step": 36729 + }, + { + "epoch": 0.21844371491102865, + "grad_norm": 1.8395904302597046, + "learning_rate": 4.434080905356997e-05, + "loss": 4.2747, + "step": 36730 + }, + { + "epoch": 0.21844966219430964, + "grad_norm": 2.3018381595611572, + "learning_rate": 4.4340513080340054e-05, + "loss": 3.8753, + "step": 36731 + }, + { + "epoch": 0.21845560947759063, + "grad_norm": 2.3045477867126465, + "learning_rate": 4.434021710035857e-05, + "loss": 3.9674, + "step": 36732 + }, + { + "epoch": 0.21846155676087164, + "grad_norm": 1.9292182922363281, + "learning_rate": 4.433992111362562e-05, + "loss": 4.1323, + "step": 36733 + }, + { + "epoch": 0.21846750404415263, + "grad_norm": 1.6731642484664917, + "learning_rate": 4.433962512014129e-05, + "loss": 4.743, + "step": 36734 + }, + { + "epoch": 0.21847345132743362, + "grad_norm": 2.1067628860473633, + "learning_rate": 4.433932911990571e-05, + "loss": 4.3337, + "step": 36735 + }, + { + "epoch": 0.21847939861071464, + "grad_norm": 3.1354286670684814, + "learning_rate": 4.4339033112918966e-05, + "loss": 4.1579, + "step": 36736 + }, + { + "epoch": 0.21848534589399562, + "grad_norm": 2.441835403442383, + "learning_rate": 4.433873709918116e-05, + "loss": 4.0169, + "step": 36737 + }, + { + "epoch": 0.2184912931772766, + "grad_norm": 2.3072402477264404, + "learning_rate": 4.433844107869241e-05, + "loss": 3.9989, + "step": 36738 + }, + { + "epoch": 0.21849724046055763, + "grad_norm": 2.370055913925171, + "learning_rate": 4.4338145051452804e-05, + "loss": 3.8085, + "step": 36739 + }, + { + "epoch": 0.21850318774383862, + "grad_norm": 1.6106902360916138, + "learning_rate": 4.433784901746245e-05, + "loss": 5.171, + "step": 36740 + }, + { + "epoch": 0.2185091350271196, + "grad_norm": 1.7052632570266724, + "learning_rate": 4.4337552976721466e-05, + "loss": 5.4133, + "step": 36741 + }, + { + "epoch": 0.21851508231040062, + "grad_norm": 1.4769154787063599, + "learning_rate": 4.4337256929229925e-05, + "loss": 5.3987, + "step": 36742 + }, + { + "epoch": 0.2185210295936816, + "grad_norm": 1.929015874862671, + "learning_rate": 4.433696087498795e-05, + "loss": 4.5028, + "step": 36743 + }, + { + "epoch": 0.2185269768769626, + "grad_norm": 3.284090042114258, + "learning_rate": 4.4336664813995654e-05, + "loss": 3.7967, + "step": 36744 + }, + { + "epoch": 0.2185329241602436, + "grad_norm": 1.7083535194396973, + "learning_rate": 4.433636874625312e-05, + "loss": 5.4123, + "step": 36745 + }, + { + "epoch": 0.2185388714435246, + "grad_norm": 1.9117016792297363, + "learning_rate": 4.433607267176045e-05, + "loss": 4.685, + "step": 36746 + }, + { + "epoch": 0.2185448187268056, + "grad_norm": 1.509709119796753, + "learning_rate": 4.433577659051777e-05, + "loss": 4.7325, + "step": 36747 + }, + { + "epoch": 0.2185507660100866, + "grad_norm": 1.774267554283142, + "learning_rate": 4.4335480502525174e-05, + "loss": 4.6373, + "step": 36748 + }, + { + "epoch": 0.2185567132933676, + "grad_norm": 1.41673743724823, + "learning_rate": 4.433518440778275e-05, + "loss": 4.6033, + "step": 36749 + }, + { + "epoch": 0.21856266057664858, + "grad_norm": 1.881105661392212, + "learning_rate": 4.433488830629061e-05, + "loss": 4.5465, + "step": 36750 + }, + { + "epoch": 0.2185686078599296, + "grad_norm": 2.3924362659454346, + "learning_rate": 4.433459219804887e-05, + "loss": 4.5526, + "step": 36751 + }, + { + "epoch": 0.21857455514321059, + "grad_norm": 1.4470226764678955, + "learning_rate": 4.433429608305763e-05, + "loss": 5.0721, + "step": 36752 + }, + { + "epoch": 0.21858050242649157, + "grad_norm": 1.72877836227417, + "learning_rate": 4.4333999961316974e-05, + "loss": 5.0966, + "step": 36753 + }, + { + "epoch": 0.21858644970977256, + "grad_norm": 1.8247528076171875, + "learning_rate": 4.4333703832827026e-05, + "loss": 4.9581, + "step": 36754 + }, + { + "epoch": 0.21859239699305358, + "grad_norm": 1.702294945716858, + "learning_rate": 4.433340769758787e-05, + "loss": 4.7263, + "step": 36755 + }, + { + "epoch": 0.21859834427633457, + "grad_norm": 1.863827109336853, + "learning_rate": 4.433311155559963e-05, + "loss": 4.6154, + "step": 36756 + }, + { + "epoch": 0.21860429155961555, + "grad_norm": 1.580384373664856, + "learning_rate": 4.43328154068624e-05, + "loss": 4.5494, + "step": 36757 + }, + { + "epoch": 0.21861023884289657, + "grad_norm": 1.8330835103988647, + "learning_rate": 4.433251925137628e-05, + "loss": 4.8742, + "step": 36758 + }, + { + "epoch": 0.21861618612617756, + "grad_norm": 1.6677405834197998, + "learning_rate": 4.433222308914138e-05, + "loss": 5.2166, + "step": 36759 + }, + { + "epoch": 0.21862213340945855, + "grad_norm": 2.5527493953704834, + "learning_rate": 4.433192692015781e-05, + "loss": 4.1831, + "step": 36760 + }, + { + "epoch": 0.21862808069273956, + "grad_norm": 1.9266340732574463, + "learning_rate": 4.433163074442564e-05, + "loss": 4.869, + "step": 36761 + }, + { + "epoch": 0.21863402797602055, + "grad_norm": 1.7809799909591675, + "learning_rate": 4.433133456194502e-05, + "loss": 4.6188, + "step": 36762 + }, + { + "epoch": 0.21863997525930154, + "grad_norm": 1.6126807928085327, + "learning_rate": 4.433103837271603e-05, + "loss": 4.7458, + "step": 36763 + }, + { + "epoch": 0.21864592254258255, + "grad_norm": 1.7361348867416382, + "learning_rate": 4.433074217673876e-05, + "loss": 5.255, + "step": 36764 + }, + { + "epoch": 0.21865186982586354, + "grad_norm": 2.3801043033599854, + "learning_rate": 4.433044597401333e-05, + "loss": 4.1598, + "step": 36765 + }, + { + "epoch": 0.21865781710914453, + "grad_norm": 2.6629974842071533, + "learning_rate": 4.433014976453985e-05, + "loss": 4.5282, + "step": 36766 + }, + { + "epoch": 0.21866376439242555, + "grad_norm": 2.1977686882019043, + "learning_rate": 4.432985354831841e-05, + "loss": 3.5004, + "step": 36767 + }, + { + "epoch": 0.21866971167570654, + "grad_norm": 2.285147190093994, + "learning_rate": 4.432955732534912e-05, + "loss": 3.5626, + "step": 36768 + }, + { + "epoch": 0.21867565895898752, + "grad_norm": 1.8421472311019897, + "learning_rate": 4.432926109563208e-05, + "loss": 5.2143, + "step": 36769 + }, + { + "epoch": 0.21868160624226854, + "grad_norm": 1.6054788827896118, + "learning_rate": 4.4328964859167396e-05, + "loss": 5.2803, + "step": 36770 + }, + { + "epoch": 0.21868755352554953, + "grad_norm": 2.29986834526062, + "learning_rate": 4.432866861595517e-05, + "loss": 3.5438, + "step": 36771 + }, + { + "epoch": 0.21869350080883052, + "grad_norm": 2.4712657928466797, + "learning_rate": 4.43283723659955e-05, + "loss": 2.8448, + "step": 36772 + }, + { + "epoch": 0.21869944809211153, + "grad_norm": 2.8239145278930664, + "learning_rate": 4.43280761092885e-05, + "loss": 2.1695, + "step": 36773 + }, + { + "epoch": 0.21870539537539252, + "grad_norm": 2.5487799644470215, + "learning_rate": 4.432777984583427e-05, + "loss": 2.4765, + "step": 36774 + }, + { + "epoch": 0.2187113426586735, + "grad_norm": 2.319502353668213, + "learning_rate": 4.432748357563291e-05, + "loss": 3.483, + "step": 36775 + }, + { + "epoch": 0.21871728994195452, + "grad_norm": 2.333451747894287, + "learning_rate": 4.432718729868453e-05, + "loss": 3.1763, + "step": 36776 + }, + { + "epoch": 0.2187232372252355, + "grad_norm": 2.6187281608581543, + "learning_rate": 4.4326891014989216e-05, + "loss": 1.9022, + "step": 36777 + }, + { + "epoch": 0.2187291845085165, + "grad_norm": 2.4931774139404297, + "learning_rate": 4.432659472454709e-05, + "loss": 1.7533, + "step": 36778 + }, + { + "epoch": 0.21873513179179752, + "grad_norm": 2.9427437782287598, + "learning_rate": 4.4326298427358246e-05, + "loss": 1.5369, + "step": 36779 + }, + { + "epoch": 0.2187410790750785, + "grad_norm": 2.4648377895355225, + "learning_rate": 4.43260021234228e-05, + "loss": 1.7569, + "step": 36780 + }, + { + "epoch": 0.2187470263583595, + "grad_norm": 2.9976446628570557, + "learning_rate": 4.432570581274084e-05, + "loss": 1.672, + "step": 36781 + }, + { + "epoch": 0.2187529736416405, + "grad_norm": 3.176912307739258, + "learning_rate": 4.4325409495312476e-05, + "loss": 1.8063, + "step": 36782 + }, + { + "epoch": 0.2187589209249215, + "grad_norm": 2.667128801345825, + "learning_rate": 4.432511317113781e-05, + "loss": 2.672, + "step": 36783 + }, + { + "epoch": 0.21876486820820248, + "grad_norm": 2.843445301055908, + "learning_rate": 4.432481684021695e-05, + "loss": 3.2058, + "step": 36784 + }, + { + "epoch": 0.2187708154914835, + "grad_norm": 2.820849657058716, + "learning_rate": 4.432452050255e-05, + "loss": 3.3282, + "step": 36785 + }, + { + "epoch": 0.2187767627747645, + "grad_norm": 2.657222032546997, + "learning_rate": 4.432422415813705e-05, + "loss": 3.3505, + "step": 36786 + }, + { + "epoch": 0.21878271005804548, + "grad_norm": 2.6856789588928223, + "learning_rate": 4.4323927806978214e-05, + "loss": 3.3864, + "step": 36787 + }, + { + "epoch": 0.2187886573413265, + "grad_norm": 2.786029815673828, + "learning_rate": 4.432363144907361e-05, + "loss": 3.4048, + "step": 36788 + }, + { + "epoch": 0.21879460462460748, + "grad_norm": 2.5305304527282715, + "learning_rate": 4.4323335084423305e-05, + "loss": 3.2925, + "step": 36789 + }, + { + "epoch": 0.21880055190788847, + "grad_norm": 2.462794065475464, + "learning_rate": 4.432303871302743e-05, + "loss": 3.2506, + "step": 36790 + }, + { + "epoch": 0.21880649919116948, + "grad_norm": 2.3902087211608887, + "learning_rate": 4.4322742334886094e-05, + "loss": 3.5342, + "step": 36791 + }, + { + "epoch": 0.21881244647445047, + "grad_norm": 2.16796875, + "learning_rate": 4.432244594999937e-05, + "loss": 4.4603, + "step": 36792 + }, + { + "epoch": 0.21881839375773146, + "grad_norm": 1.8874982595443726, + "learning_rate": 4.432214955836739e-05, + "loss": 4.3329, + "step": 36793 + }, + { + "epoch": 0.21882434104101248, + "grad_norm": 2.055091381072998, + "learning_rate": 4.4321853159990244e-05, + "loss": 4.5713, + "step": 36794 + }, + { + "epoch": 0.21883028832429346, + "grad_norm": 2.253117322921753, + "learning_rate": 4.432155675486804e-05, + "loss": 4.151, + "step": 36795 + }, + { + "epoch": 0.21883623560757445, + "grad_norm": 2.199066638946533, + "learning_rate": 4.432126034300088e-05, + "loss": 4.5785, + "step": 36796 + }, + { + "epoch": 0.21884218289085547, + "grad_norm": 1.666224718093872, + "learning_rate": 4.432096392438887e-05, + "loss": 4.5748, + "step": 36797 + }, + { + "epoch": 0.21884813017413646, + "grad_norm": 2.1748523712158203, + "learning_rate": 4.432066749903211e-05, + "loss": 4.0945, + "step": 36798 + }, + { + "epoch": 0.21885407745741745, + "grad_norm": 2.570986747741699, + "learning_rate": 4.43203710669307e-05, + "loss": 3.7925, + "step": 36799 + }, + { + "epoch": 0.21886002474069846, + "grad_norm": 2.303675651550293, + "learning_rate": 4.4320074628084754e-05, + "loss": 3.7916, + "step": 36800 + }, + { + "epoch": 0.21886597202397945, + "grad_norm": 2.1665382385253906, + "learning_rate": 4.431977818249436e-05, + "loss": 3.9575, + "step": 36801 + }, + { + "epoch": 0.21887191930726044, + "grad_norm": 2.1685996055603027, + "learning_rate": 4.431948173015964e-05, + "loss": 3.8452, + "step": 36802 + }, + { + "epoch": 0.21887786659054145, + "grad_norm": 2.4096124172210693, + "learning_rate": 4.431918527108069e-05, + "loss": 3.8829, + "step": 36803 + }, + { + "epoch": 0.21888381387382244, + "grad_norm": 2.1310126781463623, + "learning_rate": 4.431888880525761e-05, + "loss": 3.5221, + "step": 36804 + }, + { + "epoch": 0.21888976115710343, + "grad_norm": 2.318202018737793, + "learning_rate": 4.4318592332690504e-05, + "loss": 3.7006, + "step": 36805 + }, + { + "epoch": 0.21889570844038445, + "grad_norm": 2.4116504192352295, + "learning_rate": 4.431829585337948e-05, + "loss": 3.4329, + "step": 36806 + }, + { + "epoch": 0.21890165572366543, + "grad_norm": 2.2227671146392822, + "learning_rate": 4.4317999367324635e-05, + "loss": 3.4883, + "step": 36807 + }, + { + "epoch": 0.21890760300694642, + "grad_norm": 2.3181447982788086, + "learning_rate": 4.431770287452608e-05, + "loss": 3.4717, + "step": 36808 + }, + { + "epoch": 0.21891355029022744, + "grad_norm": 1.950046420097351, + "learning_rate": 4.4317406374983905e-05, + "loss": 4.4156, + "step": 36809 + }, + { + "epoch": 0.21891949757350843, + "grad_norm": 3.2731096744537354, + "learning_rate": 4.431710986869823e-05, + "loss": 3.3211, + "step": 36810 + }, + { + "epoch": 0.21892544485678941, + "grad_norm": 2.419877767562866, + "learning_rate": 4.431681335566915e-05, + "loss": 3.4889, + "step": 36811 + }, + { + "epoch": 0.2189313921400704, + "grad_norm": 2.390082359313965, + "learning_rate": 4.4316516835896773e-05, + "loss": 3.4228, + "step": 36812 + }, + { + "epoch": 0.21893733942335142, + "grad_norm": 2.5118305683135986, + "learning_rate": 4.43162203093812e-05, + "loss": 3.8365, + "step": 36813 + }, + { + "epoch": 0.2189432867066324, + "grad_norm": 2.055748701095581, + "learning_rate": 4.4315923776122524e-05, + "loss": 5.0624, + "step": 36814 + }, + { + "epoch": 0.2189492339899134, + "grad_norm": 2.4033682346343994, + "learning_rate": 4.431562723612087e-05, + "loss": 5.4134, + "step": 36815 + }, + { + "epoch": 0.2189551812731944, + "grad_norm": 1.550174593925476, + "learning_rate": 4.4315330689376325e-05, + "loss": 5.0781, + "step": 36816 + }, + { + "epoch": 0.2189611285564754, + "grad_norm": 1.4097830057144165, + "learning_rate": 4.4315034135889e-05, + "loss": 5.2124, + "step": 36817 + }, + { + "epoch": 0.2189670758397564, + "grad_norm": 1.503827691078186, + "learning_rate": 4.431473757565899e-05, + "loss": 5.3189, + "step": 36818 + }, + { + "epoch": 0.2189730231230374, + "grad_norm": 1.8725134134292603, + "learning_rate": 4.4314441008686414e-05, + "loss": 4.9859, + "step": 36819 + }, + { + "epoch": 0.2189789704063184, + "grad_norm": 1.5328760147094727, + "learning_rate": 4.431414443497136e-05, + "loss": 5.2791, + "step": 36820 + }, + { + "epoch": 0.21898491768959938, + "grad_norm": 1.5473660230636597, + "learning_rate": 4.431384785451395e-05, + "loss": 5.4368, + "step": 36821 + }, + { + "epoch": 0.2189908649728804, + "grad_norm": 1.6382627487182617, + "learning_rate": 4.4313551267314255e-05, + "loss": 5.4049, + "step": 36822 + }, + { + "epoch": 0.21899681225616138, + "grad_norm": 1.6156213283538818, + "learning_rate": 4.4313254673372405e-05, + "loss": 5.4171, + "step": 36823 + }, + { + "epoch": 0.21900275953944237, + "grad_norm": 1.3826895952224731, + "learning_rate": 4.4312958072688504e-05, + "loss": 5.2223, + "step": 36824 + }, + { + "epoch": 0.2190087068227234, + "grad_norm": 5.227144241333008, + "learning_rate": 4.431266146526265e-05, + "loss": 3.7088, + "step": 36825 + }, + { + "epoch": 0.21901465410600438, + "grad_norm": 3.769723415374756, + "learning_rate": 4.431236485109493e-05, + "loss": 4.6714, + "step": 36826 + }, + { + "epoch": 0.21902060138928536, + "grad_norm": 4.917707443237305, + "learning_rate": 4.431206823018548e-05, + "loss": 4.1719, + "step": 36827 + }, + { + "epoch": 0.21902654867256638, + "grad_norm": 2.7149879932403564, + "learning_rate": 4.431177160253438e-05, + "loss": 4.5361, + "step": 36828 + }, + { + "epoch": 0.21903249595584737, + "grad_norm": 4.330033779144287, + "learning_rate": 4.4311474968141745e-05, + "loss": 2.9403, + "step": 36829 + }, + { + "epoch": 0.21903844323912836, + "grad_norm": 4.110903263092041, + "learning_rate": 4.4311178327007664e-05, + "loss": 3.2446, + "step": 36830 + }, + { + "epoch": 0.21904439052240937, + "grad_norm": 3.4869606494903564, + "learning_rate": 4.431088167913225e-05, + "loss": 2.7637, + "step": 36831 + }, + { + "epoch": 0.21905033780569036, + "grad_norm": 3.579864263534546, + "learning_rate": 4.4310585024515615e-05, + "loss": 2.8165, + "step": 36832 + }, + { + "epoch": 0.21905628508897135, + "grad_norm": 1.7594797611236572, + "learning_rate": 4.431028836315786e-05, + "loss": 5.0774, + "step": 36833 + }, + { + "epoch": 0.21906223237225236, + "grad_norm": 1.7493889331817627, + "learning_rate": 4.430999169505907e-05, + "loss": 5.3393, + "step": 36834 + }, + { + "epoch": 0.21906817965553335, + "grad_norm": 1.4827722311019897, + "learning_rate": 4.430969502021937e-05, + "loss": 5.7201, + "step": 36835 + }, + { + "epoch": 0.21907412693881434, + "grad_norm": 1.6048434972763062, + "learning_rate": 4.430939833863884e-05, + "loss": 4.9929, + "step": 36836 + }, + { + "epoch": 0.21908007422209536, + "grad_norm": 3.3946571350097656, + "learning_rate": 4.430910165031761e-05, + "loss": 2.6245, + "step": 36837 + }, + { + "epoch": 0.21908602150537634, + "grad_norm": 1.9543274641036987, + "learning_rate": 4.4308804955255775e-05, + "loss": 4.876, + "step": 36838 + }, + { + "epoch": 0.21909196878865733, + "grad_norm": 1.7576123476028442, + "learning_rate": 4.4308508253453426e-05, + "loss": 4.8688, + "step": 36839 + }, + { + "epoch": 0.21909791607193835, + "grad_norm": 1.5310838222503662, + "learning_rate": 4.430821154491069e-05, + "loss": 4.8038, + "step": 36840 + }, + { + "epoch": 0.21910386335521934, + "grad_norm": 1.8425617218017578, + "learning_rate": 4.430791482962765e-05, + "loss": 4.4143, + "step": 36841 + }, + { + "epoch": 0.21910981063850032, + "grad_norm": 1.6769896745681763, + "learning_rate": 4.430761810760441e-05, + "loss": 4.7782, + "step": 36842 + }, + { + "epoch": 0.21911575792178134, + "grad_norm": 2.571417808532715, + "learning_rate": 4.430732137884109e-05, + "loss": 3.6217, + "step": 36843 + }, + { + "epoch": 0.21912170520506233, + "grad_norm": 1.8328107595443726, + "learning_rate": 4.430702464333777e-05, + "loss": 4.4206, + "step": 36844 + }, + { + "epoch": 0.21912765248834332, + "grad_norm": 1.6589614152908325, + "learning_rate": 4.430672790109458e-05, + "loss": 4.7387, + "step": 36845 + }, + { + "epoch": 0.21913359977162433, + "grad_norm": 2.539179801940918, + "learning_rate": 4.4306431152111604e-05, + "loss": 3.394, + "step": 36846 + }, + { + "epoch": 0.21913954705490532, + "grad_norm": 2.6338255405426025, + "learning_rate": 4.430613439638896e-05, + "loss": 3.477, + "step": 36847 + }, + { + "epoch": 0.2191454943381863, + "grad_norm": 2.6075761318206787, + "learning_rate": 4.430583763392674e-05, + "loss": 3.4526, + "step": 36848 + }, + { + "epoch": 0.21915144162146732, + "grad_norm": 2.717876672744751, + "learning_rate": 4.430554086472505e-05, + "loss": 3.5442, + "step": 36849 + }, + { + "epoch": 0.2191573889047483, + "grad_norm": 1.6439937353134155, + "learning_rate": 4.430524408878399e-05, + "loss": 4.4737, + "step": 36850 + }, + { + "epoch": 0.2191633361880293, + "grad_norm": 2.434872627258301, + "learning_rate": 4.430494730610368e-05, + "loss": 3.5731, + "step": 36851 + }, + { + "epoch": 0.21916928347131032, + "grad_norm": 2.7367117404937744, + "learning_rate": 4.43046505166842e-05, + "loss": 3.5388, + "step": 36852 + }, + { + "epoch": 0.2191752307545913, + "grad_norm": 2.5835742950439453, + "learning_rate": 4.430435372052568e-05, + "loss": 3.8811, + "step": 36853 + }, + { + "epoch": 0.2191811780378723, + "grad_norm": 1.5144959688186646, + "learning_rate": 4.43040569176282e-05, + "loss": 5.0131, + "step": 36854 + }, + { + "epoch": 0.2191871253211533, + "grad_norm": 1.4497205018997192, + "learning_rate": 4.4303760107991874e-05, + "loss": 5.0003, + "step": 36855 + }, + { + "epoch": 0.2191930726044343, + "grad_norm": 1.6756725311279297, + "learning_rate": 4.430346329161681e-05, + "loss": 5.0879, + "step": 36856 + }, + { + "epoch": 0.21919901988771529, + "grad_norm": 1.5844732522964478, + "learning_rate": 4.43031664685031e-05, + "loss": 5.0556, + "step": 36857 + }, + { + "epoch": 0.2192049671709963, + "grad_norm": 1.3791886568069458, + "learning_rate": 4.4302869638650856e-05, + "loss": 4.6868, + "step": 36858 + }, + { + "epoch": 0.2192109144542773, + "grad_norm": 1.4763437509536743, + "learning_rate": 4.430257280206018e-05, + "loss": 4.3887, + "step": 36859 + }, + { + "epoch": 0.21921686173755828, + "grad_norm": 1.5181909799575806, + "learning_rate": 4.430227595873118e-05, + "loss": 4.6743, + "step": 36860 + }, + { + "epoch": 0.2192228090208393, + "grad_norm": 1.728208065032959, + "learning_rate": 4.430197910866395e-05, + "loss": 4.1249, + "step": 36861 + }, + { + "epoch": 0.21922875630412028, + "grad_norm": 1.6396219730377197, + "learning_rate": 4.430168225185859e-05, + "loss": 4.8028, + "step": 36862 + }, + { + "epoch": 0.21923470358740127, + "grad_norm": 1.736222505569458, + "learning_rate": 4.430138538831523e-05, + "loss": 4.9611, + "step": 36863 + }, + { + "epoch": 0.21924065087068229, + "grad_norm": 1.7069100141525269, + "learning_rate": 4.430108851803394e-05, + "loss": 5.2723, + "step": 36864 + }, + { + "epoch": 0.21924659815396327, + "grad_norm": 1.4066749811172485, + "learning_rate": 4.430079164101485e-05, + "loss": 4.8875, + "step": 36865 + }, + { + "epoch": 0.21925254543724426, + "grad_norm": 2.0646684169769287, + "learning_rate": 4.430049475725805e-05, + "loss": 4.4612, + "step": 36866 + }, + { + "epoch": 0.21925849272052528, + "grad_norm": 1.9822113513946533, + "learning_rate": 4.430019786676365e-05, + "loss": 4.3854, + "step": 36867 + }, + { + "epoch": 0.21926444000380627, + "grad_norm": 2.068380355834961, + "learning_rate": 4.429990096953174e-05, + "loss": 3.472, + "step": 36868 + }, + { + "epoch": 0.21927038728708725, + "grad_norm": 2.1574547290802, + "learning_rate": 4.429960406556244e-05, + "loss": 3.3961, + "step": 36869 + }, + { + "epoch": 0.21927633457036824, + "grad_norm": 2.2769057750701904, + "learning_rate": 4.4299307154855855e-05, + "loss": 3.4244, + "step": 36870 + }, + { + "epoch": 0.21928228185364926, + "grad_norm": 2.169564962387085, + "learning_rate": 4.429901023741207e-05, + "loss": 3.3977, + "step": 36871 + }, + { + "epoch": 0.21928822913693025, + "grad_norm": 1.7176889181137085, + "learning_rate": 4.42987133132312e-05, + "loss": 4.2225, + "step": 36872 + }, + { + "epoch": 0.21929417642021123, + "grad_norm": 1.5642355680465698, + "learning_rate": 4.4298416382313355e-05, + "loss": 4.8421, + "step": 36873 + }, + { + "epoch": 0.21930012370349225, + "grad_norm": 2.2092230319976807, + "learning_rate": 4.4298119444658633e-05, + "loss": 3.436, + "step": 36874 + }, + { + "epoch": 0.21930607098677324, + "grad_norm": 1.5236022472381592, + "learning_rate": 4.4297822500267127e-05, + "loss": 3.7891, + "step": 36875 + }, + { + "epoch": 0.21931201827005423, + "grad_norm": 1.6356561183929443, + "learning_rate": 4.4297525549138963e-05, + "loss": 4.4118, + "step": 36876 + }, + { + "epoch": 0.21931796555333524, + "grad_norm": 1.5913872718811035, + "learning_rate": 4.4297228591274225e-05, + "loss": 4.0968, + "step": 36877 + }, + { + "epoch": 0.21932391283661623, + "grad_norm": 1.8036432266235352, + "learning_rate": 4.429693162667302e-05, + "loss": 4.9122, + "step": 36878 + }, + { + "epoch": 0.21932986011989722, + "grad_norm": 1.545316457748413, + "learning_rate": 4.4296634655335464e-05, + "loss": 4.8599, + "step": 36879 + }, + { + "epoch": 0.21933580740317823, + "grad_norm": 1.493503451347351, + "learning_rate": 4.429633767726165e-05, + "loss": 4.0066, + "step": 36880 + }, + { + "epoch": 0.21934175468645922, + "grad_norm": 1.5063025951385498, + "learning_rate": 4.429604069245168e-05, + "loss": 3.7343, + "step": 36881 + }, + { + "epoch": 0.2193477019697402, + "grad_norm": 1.4364261627197266, + "learning_rate": 4.429574370090567e-05, + "loss": 4.0285, + "step": 36882 + }, + { + "epoch": 0.21935364925302123, + "grad_norm": 1.497621774673462, + "learning_rate": 4.429544670262371e-05, + "loss": 4.2485, + "step": 36883 + }, + { + "epoch": 0.21935959653630221, + "grad_norm": 1.4784116744995117, + "learning_rate": 4.429514969760591e-05, + "loss": 4.0498, + "step": 36884 + }, + { + "epoch": 0.2193655438195832, + "grad_norm": 1.4145901203155518, + "learning_rate": 4.4294852685852366e-05, + "loss": 3.9571, + "step": 36885 + }, + { + "epoch": 0.21937149110286422, + "grad_norm": 1.4398488998413086, + "learning_rate": 4.429455566736319e-05, + "loss": 4.0692, + "step": 36886 + }, + { + "epoch": 0.2193774383861452, + "grad_norm": 1.5177552700042725, + "learning_rate": 4.4294258642138495e-05, + "loss": 4.0562, + "step": 36887 + }, + { + "epoch": 0.2193833856694262, + "grad_norm": 1.36580491065979, + "learning_rate": 4.429396161017836e-05, + "loss": 4.0386, + "step": 36888 + }, + { + "epoch": 0.2193893329527072, + "grad_norm": 1.3633042573928833, + "learning_rate": 4.429366457148291e-05, + "loss": 3.8754, + "step": 36889 + }, + { + "epoch": 0.2193952802359882, + "grad_norm": 2.18786883354187, + "learning_rate": 4.4293367526052246e-05, + "loss": 3.5764, + "step": 36890 + }, + { + "epoch": 0.2194012275192692, + "grad_norm": 1.3502684831619263, + "learning_rate": 4.4293070473886456e-05, + "loss": 3.9478, + "step": 36891 + }, + { + "epoch": 0.2194071748025502, + "grad_norm": 1.330976128578186, + "learning_rate": 4.4292773414985656e-05, + "loss": 4.0364, + "step": 36892 + }, + { + "epoch": 0.2194131220858312, + "grad_norm": 1.3216843605041504, + "learning_rate": 4.4292476349349955e-05, + "loss": 3.9309, + "step": 36893 + }, + { + "epoch": 0.21941906936911218, + "grad_norm": 1.4405933618545532, + "learning_rate": 4.429217927697944e-05, + "loss": 3.6253, + "step": 36894 + }, + { + "epoch": 0.2194250166523932, + "grad_norm": 2.24751877784729, + "learning_rate": 4.4291882197874234e-05, + "loss": 3.5166, + "step": 36895 + }, + { + "epoch": 0.21943096393567418, + "grad_norm": 1.3963483572006226, + "learning_rate": 4.4291585112034426e-05, + "loss": 4.0243, + "step": 36896 + }, + { + "epoch": 0.21943691121895517, + "grad_norm": 2.135277509689331, + "learning_rate": 4.429128801946012e-05, + "loss": 3.5312, + "step": 36897 + }, + { + "epoch": 0.2194428585022362, + "grad_norm": 2.387871026992798, + "learning_rate": 4.429099092015144e-05, + "loss": 3.4618, + "step": 36898 + }, + { + "epoch": 0.21944880578551718, + "grad_norm": 2.3422837257385254, + "learning_rate": 4.4290693814108465e-05, + "loss": 3.5615, + "step": 36899 + }, + { + "epoch": 0.21945475306879816, + "grad_norm": 2.3634982109069824, + "learning_rate": 4.429039670133131e-05, + "loss": 3.3284, + "step": 36900 + }, + { + "epoch": 0.21946070035207918, + "grad_norm": 2.0091755390167236, + "learning_rate": 4.429009958182007e-05, + "loss": 4.2969, + "step": 36901 + }, + { + "epoch": 0.21946664763536017, + "grad_norm": 1.939206600189209, + "learning_rate": 4.428980245557486e-05, + "loss": 5.2382, + "step": 36902 + }, + { + "epoch": 0.21947259491864116, + "grad_norm": 1.8103601932525635, + "learning_rate": 4.428950532259578e-05, + "loss": 5.3107, + "step": 36903 + }, + { + "epoch": 0.21947854220192217, + "grad_norm": 1.8780220746994019, + "learning_rate": 4.4289208182882936e-05, + "loss": 4.026, + "step": 36904 + }, + { + "epoch": 0.21948448948520316, + "grad_norm": 2.2127411365509033, + "learning_rate": 4.428891103643642e-05, + "loss": 3.2846, + "step": 36905 + }, + { + "epoch": 0.21949043676848415, + "grad_norm": 1.7850244045257568, + "learning_rate": 4.4288613883256356e-05, + "loss": 4.898, + "step": 36906 + }, + { + "epoch": 0.21949638405176516, + "grad_norm": 1.9830641746520996, + "learning_rate": 4.4288316723342824e-05, + "loss": 4.7774, + "step": 36907 + }, + { + "epoch": 0.21950233133504615, + "grad_norm": 1.9393038749694824, + "learning_rate": 4.428801955669595e-05, + "loss": 5.2107, + "step": 36908 + }, + { + "epoch": 0.21950827861832714, + "grad_norm": 1.9476639032363892, + "learning_rate": 4.428772238331582e-05, + "loss": 5.0619, + "step": 36909 + }, + { + "epoch": 0.21951422590160816, + "grad_norm": 2.0159640312194824, + "learning_rate": 4.428742520320255e-05, + "loss": 5.0202, + "step": 36910 + }, + { + "epoch": 0.21952017318488914, + "grad_norm": 1.6942940950393677, + "learning_rate": 4.428712801635624e-05, + "loss": 4.934, + "step": 36911 + }, + { + "epoch": 0.21952612046817013, + "grad_norm": 2.129357099533081, + "learning_rate": 4.428683082277699e-05, + "loss": 4.5209, + "step": 36912 + }, + { + "epoch": 0.21953206775145115, + "grad_norm": 1.7726565599441528, + "learning_rate": 4.428653362246491e-05, + "loss": 5.0013, + "step": 36913 + }, + { + "epoch": 0.21953801503473214, + "grad_norm": 1.8441473245620728, + "learning_rate": 4.4286236415420094e-05, + "loss": 4.4107, + "step": 36914 + }, + { + "epoch": 0.21954396231801313, + "grad_norm": 2.1780929565429688, + "learning_rate": 4.428593920164266e-05, + "loss": 4.7304, + "step": 36915 + }, + { + "epoch": 0.21954990960129414, + "grad_norm": 1.6376374959945679, + "learning_rate": 4.42856419811327e-05, + "loss": 5.1897, + "step": 36916 + }, + { + "epoch": 0.21955585688457513, + "grad_norm": 1.9564027786254883, + "learning_rate": 4.4285344753890326e-05, + "loss": 4.7024, + "step": 36917 + }, + { + "epoch": 0.21956180416785612, + "grad_norm": 1.499194860458374, + "learning_rate": 4.428504751991562e-05, + "loss": 5.0983, + "step": 36918 + }, + { + "epoch": 0.21956775145113713, + "grad_norm": 1.8426028490066528, + "learning_rate": 4.428475027920873e-05, + "loss": 5.1254, + "step": 36919 + }, + { + "epoch": 0.21957369873441812, + "grad_norm": 1.5901544094085693, + "learning_rate": 4.428445303176971e-05, + "loss": 5.1756, + "step": 36920 + }, + { + "epoch": 0.2195796460176991, + "grad_norm": 1.766211748123169, + "learning_rate": 4.4284155777598704e-05, + "loss": 5.256, + "step": 36921 + }, + { + "epoch": 0.21958559330098013, + "grad_norm": 1.6671624183654785, + "learning_rate": 4.4283858516695786e-05, + "loss": 5.2203, + "step": 36922 + }, + { + "epoch": 0.2195915405842611, + "grad_norm": 1.5648390054702759, + "learning_rate": 4.428356124906108e-05, + "loss": 5.0774, + "step": 36923 + }, + { + "epoch": 0.2195974878675421, + "grad_norm": 1.8170748949050903, + "learning_rate": 4.428326397469468e-05, + "loss": 5.1123, + "step": 36924 + }, + { + "epoch": 0.21960343515082312, + "grad_norm": 1.5695691108703613, + "learning_rate": 4.4282966693596686e-05, + "loss": 5.0659, + "step": 36925 + }, + { + "epoch": 0.2196093824341041, + "grad_norm": 1.7228821516036987, + "learning_rate": 4.428266940576721e-05, + "loss": 4.9237, + "step": 36926 + }, + { + "epoch": 0.2196153297173851, + "grad_norm": 1.7535570859909058, + "learning_rate": 4.428237211120636e-05, + "loss": 5.0222, + "step": 36927 + }, + { + "epoch": 0.21962127700066608, + "grad_norm": 1.8773938417434692, + "learning_rate": 4.428207480991422e-05, + "loss": 4.942, + "step": 36928 + }, + { + "epoch": 0.2196272242839471, + "grad_norm": 1.834860920906067, + "learning_rate": 4.428177750189092e-05, + "loss": 4.9838, + "step": 36929 + }, + { + "epoch": 0.21963317156722809, + "grad_norm": 1.5257384777069092, + "learning_rate": 4.4281480187136546e-05, + "loss": 5.0234, + "step": 36930 + }, + { + "epoch": 0.21963911885050907, + "grad_norm": 1.86570143699646, + "learning_rate": 4.42811828656512e-05, + "loss": 4.7741, + "step": 36931 + }, + { + "epoch": 0.2196450661337901, + "grad_norm": 1.9325884580612183, + "learning_rate": 4.4280885537435e-05, + "loss": 5.0504, + "step": 36932 + }, + { + "epoch": 0.21965101341707108, + "grad_norm": 1.9406647682189941, + "learning_rate": 4.428058820248804e-05, + "loss": 4.919, + "step": 36933 + }, + { + "epoch": 0.21965696070035207, + "grad_norm": 2.314631223678589, + "learning_rate": 4.428029086081043e-05, + "loss": 4.321, + "step": 36934 + }, + { + "epoch": 0.21966290798363308, + "grad_norm": 3.108458995819092, + "learning_rate": 4.4279993512402265e-05, + "loss": 4.5806, + "step": 36935 + }, + { + "epoch": 0.21966885526691407, + "grad_norm": 2.192230463027954, + "learning_rate": 4.427969615726366e-05, + "loss": 4.6003, + "step": 36936 + }, + { + "epoch": 0.21967480255019506, + "grad_norm": 1.8830350637435913, + "learning_rate": 4.42793987953947e-05, + "loss": 4.2452, + "step": 36937 + }, + { + "epoch": 0.21968074983347607, + "grad_norm": 1.664759874343872, + "learning_rate": 4.427910142679551e-05, + "loss": 4.7462, + "step": 36938 + }, + { + "epoch": 0.21968669711675706, + "grad_norm": 1.638677716255188, + "learning_rate": 4.427880405146618e-05, + "loss": 4.7584, + "step": 36939 + }, + { + "epoch": 0.21969264440003805, + "grad_norm": 1.658952236175537, + "learning_rate": 4.427850666940683e-05, + "loss": 5.0226, + "step": 36940 + }, + { + "epoch": 0.21969859168331907, + "grad_norm": 1.8079904317855835, + "learning_rate": 4.427820928061754e-05, + "loss": 4.8951, + "step": 36941 + }, + { + "epoch": 0.21970453896660005, + "grad_norm": 1.5243951082229614, + "learning_rate": 4.427791188509843e-05, + "loss": 4.4953, + "step": 36942 + }, + { + "epoch": 0.21971048624988104, + "grad_norm": 1.5899109840393066, + "learning_rate": 4.42776144828496e-05, + "loss": 4.5787, + "step": 36943 + }, + { + "epoch": 0.21971643353316206, + "grad_norm": 1.787782073020935, + "learning_rate": 4.4277317073871156e-05, + "loss": 4.7603, + "step": 36944 + }, + { + "epoch": 0.21972238081644305, + "grad_norm": 2.6901321411132812, + "learning_rate": 4.4277019658163196e-05, + "loss": 4.2848, + "step": 36945 + }, + { + "epoch": 0.21972832809972404, + "grad_norm": 2.159574508666992, + "learning_rate": 4.427672223572583e-05, + "loss": 4.4136, + "step": 36946 + }, + { + "epoch": 0.21973427538300505, + "grad_norm": 1.6421222686767578, + "learning_rate": 4.427642480655916e-05, + "loss": 4.9688, + "step": 36947 + }, + { + "epoch": 0.21974022266628604, + "grad_norm": 1.9344775676727295, + "learning_rate": 4.4276127370663286e-05, + "loss": 4.4278, + "step": 36948 + }, + { + "epoch": 0.21974616994956703, + "grad_norm": 2.361130714416504, + "learning_rate": 4.427582992803831e-05, + "loss": 4.095, + "step": 36949 + }, + { + "epoch": 0.21975211723284804, + "grad_norm": 3.1432952880859375, + "learning_rate": 4.4275532478684354e-05, + "loss": 3.8301, + "step": 36950 + }, + { + "epoch": 0.21975806451612903, + "grad_norm": 2.9474925994873047, + "learning_rate": 4.4275235022601504e-05, + "loss": 3.3343, + "step": 36951 + }, + { + "epoch": 0.21976401179941002, + "grad_norm": 2.163156032562256, + "learning_rate": 4.427493755978987e-05, + "loss": 3.549, + "step": 36952 + }, + { + "epoch": 0.21976995908269104, + "grad_norm": 2.9386045932769775, + "learning_rate": 4.427464009024955e-05, + "loss": 3.5566, + "step": 36953 + }, + { + "epoch": 0.21977590636597202, + "grad_norm": 2.1227951049804688, + "learning_rate": 4.427434261398066e-05, + "loss": 4.3692, + "step": 36954 + }, + { + "epoch": 0.219781853649253, + "grad_norm": 2.999837875366211, + "learning_rate": 4.427404513098329e-05, + "loss": 3.6306, + "step": 36955 + }, + { + "epoch": 0.21978780093253403, + "grad_norm": 2.0129096508026123, + "learning_rate": 4.4273747641257546e-05, + "loss": 4.3312, + "step": 36956 + }, + { + "epoch": 0.21979374821581502, + "grad_norm": 1.743152141571045, + "learning_rate": 4.427345014480354e-05, + "loss": 5.2778, + "step": 36957 + }, + { + "epoch": 0.219799695499096, + "grad_norm": 1.6184766292572021, + "learning_rate": 4.4273152641621376e-05, + "loss": 4.9119, + "step": 36958 + }, + { + "epoch": 0.21980564278237702, + "grad_norm": 1.468083381652832, + "learning_rate": 4.427285513171115e-05, + "loss": 4.7192, + "step": 36959 + }, + { + "epoch": 0.219811590065658, + "grad_norm": 1.6739065647125244, + "learning_rate": 4.427255761507297e-05, + "loss": 5.0034, + "step": 36960 + }, + { + "epoch": 0.219817537348939, + "grad_norm": 1.8841058015823364, + "learning_rate": 4.427226009170693e-05, + "loss": 4.2297, + "step": 36961 + }, + { + "epoch": 0.21982348463222, + "grad_norm": 1.512008786201477, + "learning_rate": 4.4271962561613156e-05, + "loss": 4.9728, + "step": 36962 + }, + { + "epoch": 0.219829431915501, + "grad_norm": 1.4150924682617188, + "learning_rate": 4.427166502479173e-05, + "loss": 4.9096, + "step": 36963 + }, + { + "epoch": 0.219835379198782, + "grad_norm": 1.4463622570037842, + "learning_rate": 4.427136748124277e-05, + "loss": 4.8228, + "step": 36964 + }, + { + "epoch": 0.219841326482063, + "grad_norm": 1.7630513906478882, + "learning_rate": 4.427106993096638e-05, + "loss": 4.4025, + "step": 36965 + }, + { + "epoch": 0.219847273765344, + "grad_norm": 1.438467025756836, + "learning_rate": 4.427077237396265e-05, + "loss": 4.9638, + "step": 36966 + }, + { + "epoch": 0.21985322104862498, + "grad_norm": 1.6267468929290771, + "learning_rate": 4.42704748102317e-05, + "loss": 4.562, + "step": 36967 + }, + { + "epoch": 0.219859168331906, + "grad_norm": 1.7281938791275024, + "learning_rate": 4.427017723977361e-05, + "loss": 4.663, + "step": 36968 + }, + { + "epoch": 0.21986511561518698, + "grad_norm": 1.7493743896484375, + "learning_rate": 4.426987966258852e-05, + "loss": 4.9956, + "step": 36969 + }, + { + "epoch": 0.21987106289846797, + "grad_norm": 2.017735719680786, + "learning_rate": 4.4269582078676504e-05, + "loss": 4.6434, + "step": 36970 + }, + { + "epoch": 0.219877010181749, + "grad_norm": 1.9060436487197876, + "learning_rate": 4.426928448803768e-05, + "loss": 5.2025, + "step": 36971 + }, + { + "epoch": 0.21988295746502998, + "grad_norm": 1.8023102283477783, + "learning_rate": 4.426898689067214e-05, + "loss": 5.2327, + "step": 36972 + }, + { + "epoch": 0.21988890474831096, + "grad_norm": 1.7578123807907104, + "learning_rate": 4.426868928658e-05, + "loss": 4.853, + "step": 36973 + }, + { + "epoch": 0.21989485203159198, + "grad_norm": 1.8165631294250488, + "learning_rate": 4.426839167576137e-05, + "loss": 4.7873, + "step": 36974 + }, + { + "epoch": 0.21990079931487297, + "grad_norm": 2.621605396270752, + "learning_rate": 4.4268094058216325e-05, + "loss": 3.4732, + "step": 36975 + }, + { + "epoch": 0.21990674659815396, + "grad_norm": 2.3086864948272705, + "learning_rate": 4.4267796433945e-05, + "loss": 3.6192, + "step": 36976 + }, + { + "epoch": 0.21991269388143497, + "grad_norm": 2.730029582977295, + "learning_rate": 4.426749880294748e-05, + "loss": 3.824, + "step": 36977 + }, + { + "epoch": 0.21991864116471596, + "grad_norm": 2.5434679985046387, + "learning_rate": 4.4267201165223885e-05, + "loss": 4.0274, + "step": 36978 + }, + { + "epoch": 0.21992458844799695, + "grad_norm": 1.646130919456482, + "learning_rate": 4.42669035207743e-05, + "loss": 5.3633, + "step": 36979 + }, + { + "epoch": 0.21993053573127797, + "grad_norm": 2.096497058868408, + "learning_rate": 4.426660586959884e-05, + "loss": 5.1765, + "step": 36980 + }, + { + "epoch": 0.21993648301455895, + "grad_norm": 1.6388026475906372, + "learning_rate": 4.4266308211697605e-05, + "loss": 5.0053, + "step": 36981 + }, + { + "epoch": 0.21994243029783994, + "grad_norm": 1.3540456295013428, + "learning_rate": 4.42660105470707e-05, + "loss": 4.9745, + "step": 36982 + }, + { + "epoch": 0.21994837758112096, + "grad_norm": 1.547318696975708, + "learning_rate": 4.426571287571824e-05, + "loss": 4.7473, + "step": 36983 + }, + { + "epoch": 0.21995432486440195, + "grad_norm": 1.4012900590896606, + "learning_rate": 4.426541519764031e-05, + "loss": 4.8873, + "step": 36984 + }, + { + "epoch": 0.21996027214768293, + "grad_norm": 1.3376727104187012, + "learning_rate": 4.4265117512837023e-05, + "loss": 4.8311, + "step": 36985 + }, + { + "epoch": 0.21996621943096392, + "grad_norm": 1.3277151584625244, + "learning_rate": 4.4264819821308484e-05, + "loss": 4.6289, + "step": 36986 + }, + { + "epoch": 0.21997216671424494, + "grad_norm": 1.7237157821655273, + "learning_rate": 4.4264522123054795e-05, + "loss": 4.7096, + "step": 36987 + }, + { + "epoch": 0.21997811399752593, + "grad_norm": 2.6192526817321777, + "learning_rate": 4.4264224418076063e-05, + "loss": 3.8533, + "step": 36988 + }, + { + "epoch": 0.21998406128080691, + "grad_norm": 2.7556326389312744, + "learning_rate": 4.4263926706372385e-05, + "loss": 3.1077, + "step": 36989 + }, + { + "epoch": 0.21999000856408793, + "grad_norm": 2.582603931427002, + "learning_rate": 4.4263628987943875e-05, + "loss": 3.3059, + "step": 36990 + }, + { + "epoch": 0.21999595584736892, + "grad_norm": 2.8123531341552734, + "learning_rate": 4.426333126279062e-05, + "loss": 3.2447, + "step": 36991 + }, + { + "epoch": 0.2200019031306499, + "grad_norm": 2.7251224517822266, + "learning_rate": 4.4263033530912746e-05, + "loss": 3.6572, + "step": 36992 + }, + { + "epoch": 0.22000785041393092, + "grad_norm": 2.870852470397949, + "learning_rate": 4.426273579231034e-05, + "loss": 3.5223, + "step": 36993 + }, + { + "epoch": 0.2200137976972119, + "grad_norm": 2.64644718170166, + "learning_rate": 4.426243804698351e-05, + "loss": 3.1218, + "step": 36994 + }, + { + "epoch": 0.2200197449804929, + "grad_norm": 2.7472634315490723, + "learning_rate": 4.426214029493237e-05, + "loss": 3.3861, + "step": 36995 + }, + { + "epoch": 0.22002569226377391, + "grad_norm": 2.274350643157959, + "learning_rate": 4.4261842536157014e-05, + "loss": 3.8112, + "step": 36996 + }, + { + "epoch": 0.2200316395470549, + "grad_norm": 1.6150776147842407, + "learning_rate": 4.426154477065755e-05, + "loss": 4.7959, + "step": 36997 + }, + { + "epoch": 0.2200375868303359, + "grad_norm": 1.7377833127975464, + "learning_rate": 4.426124699843407e-05, + "loss": 4.5754, + "step": 36998 + }, + { + "epoch": 0.2200435341136169, + "grad_norm": 1.8310593366622925, + "learning_rate": 4.426094921948669e-05, + "loss": 4.8721, + "step": 36999 + }, + { + "epoch": 0.2200494813968979, + "grad_norm": 1.8585983514785767, + "learning_rate": 4.426065143381552e-05, + "loss": 4.8439, + "step": 37000 + }, + { + "epoch": 0.22005542868017888, + "grad_norm": 1.8049155473709106, + "learning_rate": 4.426035364142065e-05, + "loss": 4.2048, + "step": 37001 + }, + { + "epoch": 0.2200613759634599, + "grad_norm": 2.056905746459961, + "learning_rate": 4.426005584230219e-05, + "loss": 4.16, + "step": 37002 + }, + { + "epoch": 0.2200673232467409, + "grad_norm": 1.85906982421875, + "learning_rate": 4.425975803646024e-05, + "loss": 4.9968, + "step": 37003 + }, + { + "epoch": 0.22007327053002188, + "grad_norm": 1.816170573234558, + "learning_rate": 4.4259460223894914e-05, + "loss": 4.9686, + "step": 37004 + }, + { + "epoch": 0.2200792178133029, + "grad_norm": 1.6196742057800293, + "learning_rate": 4.42591624046063e-05, + "loss": 4.9311, + "step": 37005 + }, + { + "epoch": 0.22008516509658388, + "grad_norm": 1.6003400087356567, + "learning_rate": 4.4258864578594524e-05, + "loss": 4.8954, + "step": 37006 + }, + { + "epoch": 0.22009111237986487, + "grad_norm": 1.6523009538650513, + "learning_rate": 4.425856674585967e-05, + "loss": 4.9859, + "step": 37007 + }, + { + "epoch": 0.22009705966314588, + "grad_norm": 1.8884902000427246, + "learning_rate": 4.425826890640185e-05, + "loss": 4.8798, + "step": 37008 + }, + { + "epoch": 0.22010300694642687, + "grad_norm": 1.7629953622817993, + "learning_rate": 4.425797106022117e-05, + "loss": 4.7541, + "step": 37009 + }, + { + "epoch": 0.22010895422970786, + "grad_norm": 1.7024192810058594, + "learning_rate": 4.425767320731773e-05, + "loss": 4.45, + "step": 37010 + }, + { + "epoch": 0.22011490151298888, + "grad_norm": 1.8037065267562866, + "learning_rate": 4.4257375347691635e-05, + "loss": 5.3712, + "step": 37011 + }, + { + "epoch": 0.22012084879626986, + "grad_norm": 1.6815311908721924, + "learning_rate": 4.4257077481342976e-05, + "loss": 5.1085, + "step": 37012 + }, + { + "epoch": 0.22012679607955085, + "grad_norm": 1.4821339845657349, + "learning_rate": 4.425677960827189e-05, + "loss": 5.1132, + "step": 37013 + }, + { + "epoch": 0.22013274336283187, + "grad_norm": 1.645802617073059, + "learning_rate": 4.425648172847845e-05, + "loss": 5.274, + "step": 37014 + }, + { + "epoch": 0.22013869064611286, + "grad_norm": 1.838371753692627, + "learning_rate": 4.4256183841962776e-05, + "loss": 4.7118, + "step": 37015 + }, + { + "epoch": 0.22014463792939384, + "grad_norm": 1.8690693378448486, + "learning_rate": 4.425588594872497e-05, + "loss": 5.0579, + "step": 37016 + }, + { + "epoch": 0.22015058521267486, + "grad_norm": 1.6576164960861206, + "learning_rate": 4.425558804876513e-05, + "loss": 4.6789, + "step": 37017 + }, + { + "epoch": 0.22015653249595585, + "grad_norm": 1.6403518915176392, + "learning_rate": 4.425529014208336e-05, + "loss": 4.2318, + "step": 37018 + }, + { + "epoch": 0.22016247977923684, + "grad_norm": 2.0635440349578857, + "learning_rate": 4.4254992228679774e-05, + "loss": 4.8183, + "step": 37019 + }, + { + "epoch": 0.22016842706251785, + "grad_norm": 1.6289507150650024, + "learning_rate": 4.425469430855446e-05, + "loss": 4.8729, + "step": 37020 + }, + { + "epoch": 0.22017437434579884, + "grad_norm": 1.8985059261322021, + "learning_rate": 4.4254396381707534e-05, + "loss": 5.0734, + "step": 37021 + }, + { + "epoch": 0.22018032162907983, + "grad_norm": 1.793545126914978, + "learning_rate": 4.4254098448139106e-05, + "loss": 4.9311, + "step": 37022 + }, + { + "epoch": 0.22018626891236084, + "grad_norm": 1.7476612329483032, + "learning_rate": 4.4253800507849256e-05, + "loss": 4.9602, + "step": 37023 + }, + { + "epoch": 0.22019221619564183, + "grad_norm": 1.7672104835510254, + "learning_rate": 4.425350256083811e-05, + "loss": 5.0557, + "step": 37024 + }, + { + "epoch": 0.22019816347892282, + "grad_norm": 1.7356926202774048, + "learning_rate": 4.425320460710577e-05, + "loss": 4.9895, + "step": 37025 + }, + { + "epoch": 0.22020411076220384, + "grad_norm": 1.6702399253845215, + "learning_rate": 4.425290664665233e-05, + "loss": 5.5446, + "step": 37026 + }, + { + "epoch": 0.22021005804548482, + "grad_norm": 1.6615018844604492, + "learning_rate": 4.42526086794779e-05, + "loss": 4.8189, + "step": 37027 + }, + { + "epoch": 0.2202160053287658, + "grad_norm": 1.7490131855010986, + "learning_rate": 4.425231070558259e-05, + "loss": 4.3846, + "step": 37028 + }, + { + "epoch": 0.22022195261204683, + "grad_norm": 1.4904981851577759, + "learning_rate": 4.425201272496648e-05, + "loss": 4.9885, + "step": 37029 + }, + { + "epoch": 0.22022789989532782, + "grad_norm": 1.7829434871673584, + "learning_rate": 4.425171473762971e-05, + "loss": 4.6263, + "step": 37030 + }, + { + "epoch": 0.2202338471786088, + "grad_norm": 2.5793685913085938, + "learning_rate": 4.4251416743572364e-05, + "loss": 4.0243, + "step": 37031 + }, + { + "epoch": 0.22023979446188982, + "grad_norm": 1.8999863862991333, + "learning_rate": 4.4251118742794535e-05, + "loss": 5.3582, + "step": 37032 + }, + { + "epoch": 0.2202457417451708, + "grad_norm": 1.8360003232955933, + "learning_rate": 4.4250820735296347e-05, + "loss": 5.1397, + "step": 37033 + }, + { + "epoch": 0.2202516890284518, + "grad_norm": 1.742761492729187, + "learning_rate": 4.4250522721077894e-05, + "loss": 5.149, + "step": 37034 + }, + { + "epoch": 0.2202576363117328, + "grad_norm": 1.606214165687561, + "learning_rate": 4.425022470013928e-05, + "loss": 5.0941, + "step": 37035 + }, + { + "epoch": 0.2202635835950138, + "grad_norm": 2.075307607650757, + "learning_rate": 4.4249926672480615e-05, + "loss": 3.9427, + "step": 37036 + }, + { + "epoch": 0.2202695308782948, + "grad_norm": 3.00763201713562, + "learning_rate": 4.4249628638102005e-05, + "loss": 3.4657, + "step": 37037 + }, + { + "epoch": 0.2202754781615758, + "grad_norm": 2.541367292404175, + "learning_rate": 4.424933059700354e-05, + "loss": 3.3816, + "step": 37038 + }, + { + "epoch": 0.2202814254448568, + "grad_norm": 2.394335985183716, + "learning_rate": 4.4249032549185335e-05, + "loss": 3.0965, + "step": 37039 + }, + { + "epoch": 0.22028737272813778, + "grad_norm": 2.1029319763183594, + "learning_rate": 4.424873449464749e-05, + "loss": 3.0838, + "step": 37040 + }, + { + "epoch": 0.2202933200114188, + "grad_norm": 2.285747766494751, + "learning_rate": 4.424843643339011e-05, + "loss": 3.4436, + "step": 37041 + }, + { + "epoch": 0.22029926729469979, + "grad_norm": 2.01776123046875, + "learning_rate": 4.4248138365413305e-05, + "loss": 3.355, + "step": 37042 + }, + { + "epoch": 0.22030521457798077, + "grad_norm": 2.2943174839019775, + "learning_rate": 4.424784029071717e-05, + "loss": 3.1579, + "step": 37043 + }, + { + "epoch": 0.2203111618612618, + "grad_norm": 2.418867826461792, + "learning_rate": 4.4247542209301815e-05, + "loss": 3.0624, + "step": 37044 + }, + { + "epoch": 0.22031710914454278, + "grad_norm": 1.8208024501800537, + "learning_rate": 4.424724412116734e-05, + "loss": 4.4339, + "step": 37045 + }, + { + "epoch": 0.22032305642782377, + "grad_norm": 1.88398277759552, + "learning_rate": 4.424694602631385e-05, + "loss": 4.0576, + "step": 37046 + }, + { + "epoch": 0.22032900371110475, + "grad_norm": 1.6278939247131348, + "learning_rate": 4.424664792474145e-05, + "loss": 3.7719, + "step": 37047 + }, + { + "epoch": 0.22033495099438577, + "grad_norm": 1.5962262153625488, + "learning_rate": 4.424634981645025e-05, + "loss": 3.8542, + "step": 37048 + }, + { + "epoch": 0.22034089827766676, + "grad_norm": 1.5399160385131836, + "learning_rate": 4.424605170144034e-05, + "loss": 3.7271, + "step": 37049 + }, + { + "epoch": 0.22034684556094775, + "grad_norm": 1.6625111103057861, + "learning_rate": 4.4245753579711837e-05, + "loss": 3.7294, + "step": 37050 + }, + { + "epoch": 0.22035279284422876, + "grad_norm": 1.5418323278427124, + "learning_rate": 4.424545545126484e-05, + "loss": 3.7352, + "step": 37051 + }, + { + "epoch": 0.22035874012750975, + "grad_norm": 1.566845417022705, + "learning_rate": 4.424515731609945e-05, + "loss": 3.681, + "step": 37052 + }, + { + "epoch": 0.22036468741079074, + "grad_norm": 1.4394952058792114, + "learning_rate": 4.424485917421578e-05, + "loss": 3.8459, + "step": 37053 + }, + { + "epoch": 0.22037063469407175, + "grad_norm": 1.5573005676269531, + "learning_rate": 4.4244561025613924e-05, + "loss": 3.6516, + "step": 37054 + }, + { + "epoch": 0.22037658197735274, + "grad_norm": 1.4430670738220215, + "learning_rate": 4.424426287029399e-05, + "loss": 3.8846, + "step": 37055 + }, + { + "epoch": 0.22038252926063373, + "grad_norm": 1.496464490890503, + "learning_rate": 4.4243964708256086e-05, + "loss": 3.6196, + "step": 37056 + }, + { + "epoch": 0.22038847654391475, + "grad_norm": 1.4447180032730103, + "learning_rate": 4.4243666539500314e-05, + "loss": 3.6695, + "step": 37057 + }, + { + "epoch": 0.22039442382719573, + "grad_norm": 1.4226678609848022, + "learning_rate": 4.424336836402677e-05, + "loss": 3.9564, + "step": 37058 + }, + { + "epoch": 0.22040037111047672, + "grad_norm": 1.4498428106307983, + "learning_rate": 4.4243070181835566e-05, + "loss": 3.8076, + "step": 37059 + }, + { + "epoch": 0.22040631839375774, + "grad_norm": 1.4278359413146973, + "learning_rate": 4.4242771992926813e-05, + "loss": 3.7534, + "step": 37060 + }, + { + "epoch": 0.22041226567703873, + "grad_norm": 1.357118010520935, + "learning_rate": 4.42424737973006e-05, + "loss": 3.6754, + "step": 37061 + }, + { + "epoch": 0.22041821296031971, + "grad_norm": 1.3253631591796875, + "learning_rate": 4.424217559495704e-05, + "loss": 3.6916, + "step": 37062 + }, + { + "epoch": 0.22042416024360073, + "grad_norm": 1.3829140663146973, + "learning_rate": 4.4241877385896235e-05, + "loss": 3.8588, + "step": 37063 + }, + { + "epoch": 0.22043010752688172, + "grad_norm": 1.528243899345398, + "learning_rate": 4.424157917011829e-05, + "loss": 4.7923, + "step": 37064 + }, + { + "epoch": 0.2204360548101627, + "grad_norm": 1.7748266458511353, + "learning_rate": 4.424128094762331e-05, + "loss": 4.7928, + "step": 37065 + }, + { + "epoch": 0.22044200209344372, + "grad_norm": 1.7042765617370605, + "learning_rate": 4.42409827184114e-05, + "loss": 5.0612, + "step": 37066 + }, + { + "epoch": 0.2204479493767247, + "grad_norm": 1.7830193042755127, + "learning_rate": 4.424068448248265e-05, + "loss": 5.2614, + "step": 37067 + }, + { + "epoch": 0.2204538966600057, + "grad_norm": 1.6546859741210938, + "learning_rate": 4.424038623983718e-05, + "loss": 4.7816, + "step": 37068 + }, + { + "epoch": 0.22045984394328672, + "grad_norm": 1.5960372686386108, + "learning_rate": 4.42400879904751e-05, + "loss": 3.7849, + "step": 37069 + }, + { + "epoch": 0.2204657912265677, + "grad_norm": 1.5768903493881226, + "learning_rate": 4.42397897343965e-05, + "loss": 3.7381, + "step": 37070 + }, + { + "epoch": 0.2204717385098487, + "grad_norm": 1.6076819896697998, + "learning_rate": 4.423949147160148e-05, + "loss": 3.787, + "step": 37071 + }, + { + "epoch": 0.2204776857931297, + "grad_norm": 1.5903054475784302, + "learning_rate": 4.4239193202090165e-05, + "loss": 3.8237, + "step": 37072 + }, + { + "epoch": 0.2204836330764107, + "grad_norm": 1.6422667503356934, + "learning_rate": 4.423889492586264e-05, + "loss": 3.727, + "step": 37073 + }, + { + "epoch": 0.22048958035969168, + "grad_norm": 1.488077163696289, + "learning_rate": 4.423859664291901e-05, + "loss": 3.5784, + "step": 37074 + }, + { + "epoch": 0.2204955276429727, + "grad_norm": 1.885972499847412, + "learning_rate": 4.423829835325939e-05, + "loss": 4.8389, + "step": 37075 + }, + { + "epoch": 0.2205014749262537, + "grad_norm": 1.730106234550476, + "learning_rate": 4.423800005688388e-05, + "loss": 5.2459, + "step": 37076 + }, + { + "epoch": 0.22050742220953468, + "grad_norm": 1.8794136047363281, + "learning_rate": 4.4237701753792585e-05, + "loss": 5.1238, + "step": 37077 + }, + { + "epoch": 0.2205133694928157, + "grad_norm": 1.6428134441375732, + "learning_rate": 4.42374034439856e-05, + "loss": 4.8715, + "step": 37078 + }, + { + "epoch": 0.22051931677609668, + "grad_norm": 2.0203256607055664, + "learning_rate": 4.423710512746304e-05, + "loss": 3.9266, + "step": 37079 + }, + { + "epoch": 0.22052526405937767, + "grad_norm": 2.743778705596924, + "learning_rate": 4.4236806804225006e-05, + "loss": 3.575, + "step": 37080 + }, + { + "epoch": 0.22053121134265868, + "grad_norm": 2.646286725997925, + "learning_rate": 4.42365084742716e-05, + "loss": 3.2283, + "step": 37081 + }, + { + "epoch": 0.22053715862593967, + "grad_norm": 2.472491502761841, + "learning_rate": 4.423621013760293e-05, + "loss": 3.5303, + "step": 37082 + }, + { + "epoch": 0.22054310590922066, + "grad_norm": 2.5591135025024414, + "learning_rate": 4.423591179421909e-05, + "loss": 3.4337, + "step": 37083 + }, + { + "epoch": 0.22054905319250168, + "grad_norm": 2.592282772064209, + "learning_rate": 4.423561344412021e-05, + "loss": 2.904, + "step": 37084 + }, + { + "epoch": 0.22055500047578266, + "grad_norm": 2.4524197578430176, + "learning_rate": 4.423531508730635e-05, + "loss": 3.3817, + "step": 37085 + }, + { + "epoch": 0.22056094775906365, + "grad_norm": 2.398085594177246, + "learning_rate": 4.4235016723777656e-05, + "loss": 3.2741, + "step": 37086 + }, + { + "epoch": 0.22056689504234467, + "grad_norm": 2.337100028991699, + "learning_rate": 4.423471835353422e-05, + "loss": 3.5931, + "step": 37087 + }, + { + "epoch": 0.22057284232562566, + "grad_norm": 2.589341402053833, + "learning_rate": 4.4234419976576137e-05, + "loss": 3.4136, + "step": 37088 + }, + { + "epoch": 0.22057878960890664, + "grad_norm": 2.466911792755127, + "learning_rate": 4.4234121592903515e-05, + "loss": 3.5329, + "step": 37089 + }, + { + "epoch": 0.22058473689218766, + "grad_norm": 1.5576742887496948, + "learning_rate": 4.423382320251646e-05, + "loss": 4.388, + "step": 37090 + }, + { + "epoch": 0.22059068417546865, + "grad_norm": 1.3827784061431885, + "learning_rate": 4.423352480541508e-05, + "loss": 4.9481, + "step": 37091 + }, + { + "epoch": 0.22059663145874964, + "grad_norm": 1.4498347043991089, + "learning_rate": 4.423322640159947e-05, + "loss": 5.3971, + "step": 37092 + }, + { + "epoch": 0.22060257874203065, + "grad_norm": 1.6802235841751099, + "learning_rate": 4.423292799106974e-05, + "loss": 5.2876, + "step": 37093 + }, + { + "epoch": 0.22060852602531164, + "grad_norm": 1.9586135149002075, + "learning_rate": 4.4232629573826e-05, + "loss": 4.9878, + "step": 37094 + }, + { + "epoch": 0.22061447330859263, + "grad_norm": 1.6873750686645508, + "learning_rate": 4.423233114986834e-05, + "loss": 4.9648, + "step": 37095 + }, + { + "epoch": 0.22062042059187364, + "grad_norm": 1.708585500717163, + "learning_rate": 4.423203271919688e-05, + "loss": 4.9923, + "step": 37096 + }, + { + "epoch": 0.22062636787515463, + "grad_norm": 1.4877114295959473, + "learning_rate": 4.423173428181171e-05, + "loss": 5.2307, + "step": 37097 + }, + { + "epoch": 0.22063231515843562, + "grad_norm": 1.485723614692688, + "learning_rate": 4.423143583771294e-05, + "loss": 4.2279, + "step": 37098 + }, + { + "epoch": 0.22063826244171664, + "grad_norm": 1.5309786796569824, + "learning_rate": 4.423113738690068e-05, + "loss": 4.9294, + "step": 37099 + }, + { + "epoch": 0.22064420972499763, + "grad_norm": 1.7999223470687866, + "learning_rate": 4.4230838929375027e-05, + "loss": 4.7005, + "step": 37100 + }, + { + "epoch": 0.2206501570082786, + "grad_norm": 1.5340321063995361, + "learning_rate": 4.423054046513608e-05, + "loss": 4.7574, + "step": 37101 + }, + { + "epoch": 0.22065610429155963, + "grad_norm": 1.641364574432373, + "learning_rate": 4.423024199418396e-05, + "loss": 5.6716, + "step": 37102 + }, + { + "epoch": 0.22066205157484062, + "grad_norm": 1.973738431930542, + "learning_rate": 4.422994351651875e-05, + "loss": 5.1761, + "step": 37103 + }, + { + "epoch": 0.2206679988581216, + "grad_norm": 2.4407904148101807, + "learning_rate": 4.4229645032140574e-05, + "loss": 3.4594, + "step": 37104 + }, + { + "epoch": 0.2206739461414026, + "grad_norm": 2.4867618083953857, + "learning_rate": 4.4229346541049525e-05, + "loss": 3.3473, + "step": 37105 + }, + { + "epoch": 0.2206798934246836, + "grad_norm": 2.196946144104004, + "learning_rate": 4.422904804324571e-05, + "loss": 3.3534, + "step": 37106 + }, + { + "epoch": 0.2206858407079646, + "grad_norm": 1.8784695863723755, + "learning_rate": 4.422874953872923e-05, + "loss": 4.8781, + "step": 37107 + }, + { + "epoch": 0.22069178799124559, + "grad_norm": 1.9972692728042603, + "learning_rate": 4.4228451027500196e-05, + "loss": 4.244, + "step": 37108 + }, + { + "epoch": 0.2206977352745266, + "grad_norm": 1.573676586151123, + "learning_rate": 4.4228152509558704e-05, + "loss": 4.7457, + "step": 37109 + }, + { + "epoch": 0.2207036825578076, + "grad_norm": 1.4594526290893555, + "learning_rate": 4.422785398490487e-05, + "loss": 4.6177, + "step": 37110 + }, + { + "epoch": 0.22070962984108858, + "grad_norm": 1.6894400119781494, + "learning_rate": 4.4227555453538784e-05, + "loss": 4.9041, + "step": 37111 + }, + { + "epoch": 0.2207155771243696, + "grad_norm": 1.7728346586227417, + "learning_rate": 4.422725691546056e-05, + "loss": 4.9142, + "step": 37112 + }, + { + "epoch": 0.22072152440765058, + "grad_norm": 1.8219943046569824, + "learning_rate": 4.42269583706703e-05, + "loss": 5.0506, + "step": 37113 + }, + { + "epoch": 0.22072747169093157, + "grad_norm": 1.455910086631775, + "learning_rate": 4.422665981916811e-05, + "loss": 5.5078, + "step": 37114 + }, + { + "epoch": 0.2207334189742126, + "grad_norm": 1.5510808229446411, + "learning_rate": 4.422636126095409e-05, + "loss": 4.6677, + "step": 37115 + }, + { + "epoch": 0.22073936625749357, + "grad_norm": 2.251783609390259, + "learning_rate": 4.4226062696028334e-05, + "loss": 3.674, + "step": 37116 + }, + { + "epoch": 0.22074531354077456, + "grad_norm": 2.697303295135498, + "learning_rate": 4.422576412439098e-05, + "loss": 2.9135, + "step": 37117 + }, + { + "epoch": 0.22075126082405558, + "grad_norm": 2.8013083934783936, + "learning_rate": 4.4225465546042096e-05, + "loss": 2.7409, + "step": 37118 + }, + { + "epoch": 0.22075720810733657, + "grad_norm": 2.6208369731903076, + "learning_rate": 4.42251669609818e-05, + "loss": 3.1612, + "step": 37119 + }, + { + "epoch": 0.22076315539061755, + "grad_norm": 2.6360249519348145, + "learning_rate": 4.42248683692102e-05, + "loss": 3.3315, + "step": 37120 + }, + { + "epoch": 0.22076910267389857, + "grad_norm": 2.7215638160705566, + "learning_rate": 4.42245697707274e-05, + "loss": 3.0673, + "step": 37121 + }, + { + "epoch": 0.22077504995717956, + "grad_norm": 2.5598907470703125, + "learning_rate": 4.422427116553349e-05, + "loss": 2.9353, + "step": 37122 + }, + { + "epoch": 0.22078099724046055, + "grad_norm": 2.8053741455078125, + "learning_rate": 4.42239725536286e-05, + "loss": 3.1066, + "step": 37123 + }, + { + "epoch": 0.22078694452374156, + "grad_norm": 2.600266933441162, + "learning_rate": 4.422367393501281e-05, + "loss": 2.7168, + "step": 37124 + }, + { + "epoch": 0.22079289180702255, + "grad_norm": 2.486725091934204, + "learning_rate": 4.422337530968624e-05, + "loss": 2.6892, + "step": 37125 + }, + { + "epoch": 0.22079883909030354, + "grad_norm": 2.9207983016967773, + "learning_rate": 4.422307667764899e-05, + "loss": 2.3465, + "step": 37126 + }, + { + "epoch": 0.22080478637358456, + "grad_norm": 2.5730931758880615, + "learning_rate": 4.422277803890116e-05, + "loss": 2.3872, + "step": 37127 + }, + { + "epoch": 0.22081073365686554, + "grad_norm": 2.5988657474517822, + "learning_rate": 4.422247939344285e-05, + "loss": 3.0179, + "step": 37128 + }, + { + "epoch": 0.22081668094014653, + "grad_norm": 2.7020468711853027, + "learning_rate": 4.422218074127418e-05, + "loss": 2.1756, + "step": 37129 + }, + { + "epoch": 0.22082262822342755, + "grad_norm": 2.697157144546509, + "learning_rate": 4.422188208239524e-05, + "loss": 2.1083, + "step": 37130 + }, + { + "epoch": 0.22082857550670854, + "grad_norm": 2.241924285888672, + "learning_rate": 4.422158341680614e-05, + "loss": 3.8524, + "step": 37131 + }, + { + "epoch": 0.22083452278998952, + "grad_norm": 1.8899613618850708, + "learning_rate": 4.422128474450699e-05, + "loss": 4.1308, + "step": 37132 + }, + { + "epoch": 0.22084047007327054, + "grad_norm": 1.6082144975662231, + "learning_rate": 4.4220986065497884e-05, + "loss": 5.5055, + "step": 37133 + }, + { + "epoch": 0.22084641735655153, + "grad_norm": 1.8562133312225342, + "learning_rate": 4.4220687379778924e-05, + "loss": 5.2945, + "step": 37134 + }, + { + "epoch": 0.22085236463983252, + "grad_norm": 1.9147547483444214, + "learning_rate": 4.422038868735022e-05, + "loss": 5.5291, + "step": 37135 + }, + { + "epoch": 0.22085831192311353, + "grad_norm": 1.6603139638900757, + "learning_rate": 4.422008998821189e-05, + "loss": 5.4428, + "step": 37136 + }, + { + "epoch": 0.22086425920639452, + "grad_norm": 1.6215204000473022, + "learning_rate": 4.421979128236401e-05, + "loss": 5.343, + "step": 37137 + }, + { + "epoch": 0.2208702064896755, + "grad_norm": 1.7748491764068604, + "learning_rate": 4.421949256980671e-05, + "loss": 5.1708, + "step": 37138 + }, + { + "epoch": 0.22087615377295652, + "grad_norm": 1.7499853372573853, + "learning_rate": 4.421919385054008e-05, + "loss": 5.3233, + "step": 37139 + }, + { + "epoch": 0.2208821010562375, + "grad_norm": 1.7295137643814087, + "learning_rate": 4.421889512456423e-05, + "loss": 5.3818, + "step": 37140 + }, + { + "epoch": 0.2208880483395185, + "grad_norm": 1.611734390258789, + "learning_rate": 4.4218596391879264e-05, + "loss": 5.1509, + "step": 37141 + }, + { + "epoch": 0.22089399562279952, + "grad_norm": 1.661490797996521, + "learning_rate": 4.421829765248528e-05, + "loss": 5.1012, + "step": 37142 + }, + { + "epoch": 0.2208999429060805, + "grad_norm": 1.471693992614746, + "learning_rate": 4.4217998906382395e-05, + "loss": 5.2618, + "step": 37143 + }, + { + "epoch": 0.2209058901893615, + "grad_norm": 1.8665975332260132, + "learning_rate": 4.4217700153570694e-05, + "loss": 5.0267, + "step": 37144 + }, + { + "epoch": 0.2209118374726425, + "grad_norm": 1.7666631937026978, + "learning_rate": 4.42174013940503e-05, + "loss": 5.1182, + "step": 37145 + }, + { + "epoch": 0.2209177847559235, + "grad_norm": 1.6956653594970703, + "learning_rate": 4.421710262782131e-05, + "loss": 5.1901, + "step": 37146 + }, + { + "epoch": 0.22092373203920448, + "grad_norm": 1.8353193998336792, + "learning_rate": 4.4216803854883826e-05, + "loss": 5.2775, + "step": 37147 + }, + { + "epoch": 0.2209296793224855, + "grad_norm": 1.8415271043777466, + "learning_rate": 4.4216505075237945e-05, + "loss": 5.1925, + "step": 37148 + }, + { + "epoch": 0.2209356266057665, + "grad_norm": 1.7486032247543335, + "learning_rate": 4.4216206288883794e-05, + "loss": 5.1695, + "step": 37149 + }, + { + "epoch": 0.22094157388904748, + "grad_norm": 1.6249213218688965, + "learning_rate": 4.421590749582146e-05, + "loss": 5.3091, + "step": 37150 + }, + { + "epoch": 0.2209475211723285, + "grad_norm": 1.5573538541793823, + "learning_rate": 4.4215608696051045e-05, + "loss": 5.2265, + "step": 37151 + }, + { + "epoch": 0.22095346845560948, + "grad_norm": 1.4847667217254639, + "learning_rate": 4.421530988957267e-05, + "loss": 5.2365, + "step": 37152 + }, + { + "epoch": 0.22095941573889047, + "grad_norm": 1.5954338312149048, + "learning_rate": 4.421501107638643e-05, + "loss": 5.0707, + "step": 37153 + }, + { + "epoch": 0.22096536302217148, + "grad_norm": 1.6025676727294922, + "learning_rate": 4.421471225649242e-05, + "loss": 5.1383, + "step": 37154 + }, + { + "epoch": 0.22097131030545247, + "grad_norm": 2.082498550415039, + "learning_rate": 4.421441342989075e-05, + "loss": 4.5216, + "step": 37155 + }, + { + "epoch": 0.22097725758873346, + "grad_norm": 2.626246690750122, + "learning_rate": 4.421411459658153e-05, + "loss": 3.4209, + "step": 37156 + }, + { + "epoch": 0.22098320487201448, + "grad_norm": 3.0174107551574707, + "learning_rate": 4.421381575656486e-05, + "loss": 3.3856, + "step": 37157 + }, + { + "epoch": 0.22098915215529547, + "grad_norm": 2.355088472366333, + "learning_rate": 4.421351690984084e-05, + "loss": 3.1049, + "step": 37158 + }, + { + "epoch": 0.22099509943857645, + "grad_norm": 2.104196310043335, + "learning_rate": 4.42132180564096e-05, + "loss": 3.2537, + "step": 37159 + }, + { + "epoch": 0.22100104672185747, + "grad_norm": 2.216770648956299, + "learning_rate": 4.4212919196271205e-05, + "loss": 3.1694, + "step": 37160 + }, + { + "epoch": 0.22100699400513846, + "grad_norm": 2.238762617111206, + "learning_rate": 4.421262032942579e-05, + "loss": 3.128, + "step": 37161 + }, + { + "epoch": 0.22101294128841945, + "grad_norm": 3.443631410598755, + "learning_rate": 4.421232145587344e-05, + "loss": 2.4831, + "step": 37162 + }, + { + "epoch": 0.22101888857170043, + "grad_norm": 3.6234066486358643, + "learning_rate": 4.421202257561427e-05, + "loss": 2.0318, + "step": 37163 + }, + { + "epoch": 0.22102483585498145, + "grad_norm": 3.405298948287964, + "learning_rate": 4.421172368864838e-05, + "loss": 2.1004, + "step": 37164 + }, + { + "epoch": 0.22103078313826244, + "grad_norm": 3.220759391784668, + "learning_rate": 4.4211424794975875e-05, + "loss": 2.2621, + "step": 37165 + }, + { + "epoch": 0.22103673042154343, + "grad_norm": 3.005004644393921, + "learning_rate": 4.4211125894596865e-05, + "loss": 2.1246, + "step": 37166 + }, + { + "epoch": 0.22104267770482444, + "grad_norm": 3.2884764671325684, + "learning_rate": 4.421082698751144e-05, + "loss": 2.0425, + "step": 37167 + }, + { + "epoch": 0.22104862498810543, + "grad_norm": 2.9863510131835938, + "learning_rate": 4.4210528073719727e-05, + "loss": 1.8929, + "step": 37168 + }, + { + "epoch": 0.22105457227138642, + "grad_norm": 2.6555914878845215, + "learning_rate": 4.421022915322181e-05, + "loss": 1.4331, + "step": 37169 + }, + { + "epoch": 0.22106051955466743, + "grad_norm": 2.6842329502105713, + "learning_rate": 4.4209930226017807e-05, + "loss": 1.6554, + "step": 37170 + }, + { + "epoch": 0.22106646683794842, + "grad_norm": 2.9295549392700195, + "learning_rate": 4.42096312921078e-05, + "loss": 3.4228, + "step": 37171 + }, + { + "epoch": 0.2210724141212294, + "grad_norm": 2.749258279800415, + "learning_rate": 4.420933235149192e-05, + "loss": 3.4164, + "step": 37172 + }, + { + "epoch": 0.22107836140451043, + "grad_norm": 2.6917507648468018, + "learning_rate": 4.420903340417026e-05, + "loss": 3.5214, + "step": 37173 + }, + { + "epoch": 0.22108430868779141, + "grad_norm": 2.447829484939575, + "learning_rate": 4.420873445014292e-05, + "loss": 3.428, + "step": 37174 + }, + { + "epoch": 0.2210902559710724, + "grad_norm": 1.9641824960708618, + "learning_rate": 4.420843548941002e-05, + "loss": 3.3165, + "step": 37175 + }, + { + "epoch": 0.22109620325435342, + "grad_norm": 2.1110525131225586, + "learning_rate": 4.4208136521971646e-05, + "loss": 3.434, + "step": 37176 + }, + { + "epoch": 0.2211021505376344, + "grad_norm": 1.9869229793548584, + "learning_rate": 4.4207837547827905e-05, + "loss": 4.4427, + "step": 37177 + }, + { + "epoch": 0.2211080978209154, + "grad_norm": 1.9617522954940796, + "learning_rate": 4.4207538566978915e-05, + "loss": 5.4059, + "step": 37178 + }, + { + "epoch": 0.2211140451041964, + "grad_norm": 2.4324228763580322, + "learning_rate": 4.420723957942477e-05, + "loss": 5.1003, + "step": 37179 + }, + { + "epoch": 0.2211199923874774, + "grad_norm": 1.8642364740371704, + "learning_rate": 4.420694058516557e-05, + "loss": 4.9225, + "step": 37180 + }, + { + "epoch": 0.2211259396707584, + "grad_norm": 1.636181116104126, + "learning_rate": 4.420664158420143e-05, + "loss": 4.8789, + "step": 37181 + }, + { + "epoch": 0.2211318869540394, + "grad_norm": 1.830492377281189, + "learning_rate": 4.420634257653245e-05, + "loss": 4.6706, + "step": 37182 + }, + { + "epoch": 0.2211378342373204, + "grad_norm": 1.7127333879470825, + "learning_rate": 4.420604356215874e-05, + "loss": 4.5773, + "step": 37183 + }, + { + "epoch": 0.22114378152060138, + "grad_norm": 1.882017731666565, + "learning_rate": 4.4205744541080394e-05, + "loss": 4.6778, + "step": 37184 + }, + { + "epoch": 0.2211497288038824, + "grad_norm": 1.7298130989074707, + "learning_rate": 4.420544551329752e-05, + "loss": 4.5224, + "step": 37185 + }, + { + "epoch": 0.22115567608716338, + "grad_norm": 1.6471002101898193, + "learning_rate": 4.420514647881022e-05, + "loss": 4.4796, + "step": 37186 + }, + { + "epoch": 0.22116162337044437, + "grad_norm": 1.6308108568191528, + "learning_rate": 4.420484743761861e-05, + "loss": 4.4928, + "step": 37187 + }, + { + "epoch": 0.2211675706537254, + "grad_norm": 1.8822065591812134, + "learning_rate": 4.420454838972278e-05, + "loss": 4.4417, + "step": 37188 + }, + { + "epoch": 0.22117351793700638, + "grad_norm": 1.8454277515411377, + "learning_rate": 4.420424933512284e-05, + "loss": 4.3258, + "step": 37189 + }, + { + "epoch": 0.22117946522028736, + "grad_norm": 1.7358025312423706, + "learning_rate": 4.42039502738189e-05, + "loss": 4.9944, + "step": 37190 + }, + { + "epoch": 0.22118541250356838, + "grad_norm": 1.7481547594070435, + "learning_rate": 4.420365120581106e-05, + "loss": 4.9499, + "step": 37191 + }, + { + "epoch": 0.22119135978684937, + "grad_norm": 2.411710500717163, + "learning_rate": 4.4203352131099416e-05, + "loss": 4.6692, + "step": 37192 + }, + { + "epoch": 0.22119730707013036, + "grad_norm": 1.3139026165008545, + "learning_rate": 4.420305304968408e-05, + "loss": 5.2405, + "step": 37193 + }, + { + "epoch": 0.22120325435341137, + "grad_norm": 2.4839398860931396, + "learning_rate": 4.4202753961565166e-05, + "loss": 4.192, + "step": 37194 + }, + { + "epoch": 0.22120920163669236, + "grad_norm": 2.9722938537597656, + "learning_rate": 4.4202454866742763e-05, + "loss": 3.9029, + "step": 37195 + }, + { + "epoch": 0.22121514891997335, + "grad_norm": 2.6020498275756836, + "learning_rate": 4.4202155765216976e-05, + "loss": 3.9087, + "step": 37196 + }, + { + "epoch": 0.22122109620325436, + "grad_norm": 2.354983329772949, + "learning_rate": 4.4201856656987926e-05, + "loss": 3.8118, + "step": 37197 + }, + { + "epoch": 0.22122704348653535, + "grad_norm": 2.4077634811401367, + "learning_rate": 4.42015575420557e-05, + "loss": 3.9617, + "step": 37198 + }, + { + "epoch": 0.22123299076981634, + "grad_norm": 2.160682201385498, + "learning_rate": 4.420125842042041e-05, + "loss": 3.8235, + "step": 37199 + }, + { + "epoch": 0.22123893805309736, + "grad_norm": 2.4441070556640625, + "learning_rate": 4.420095929208217e-05, + "loss": 4.5287, + "step": 37200 + }, + { + "epoch": 0.22124488533637834, + "grad_norm": 1.7624927759170532, + "learning_rate": 4.420066015704105e-05, + "loss": 5.0133, + "step": 37201 + }, + { + "epoch": 0.22125083261965933, + "grad_norm": 1.8188538551330566, + "learning_rate": 4.4200361015297196e-05, + "loss": 5.1234, + "step": 37202 + }, + { + "epoch": 0.22125677990294035, + "grad_norm": 1.8965922594070435, + "learning_rate": 4.420006186685069e-05, + "loss": 4.9296, + "step": 37203 + }, + { + "epoch": 0.22126272718622134, + "grad_norm": 1.711780309677124, + "learning_rate": 4.4199762711701646e-05, + "loss": 4.7628, + "step": 37204 + }, + { + "epoch": 0.22126867446950232, + "grad_norm": 2.455254077911377, + "learning_rate": 4.419946354985015e-05, + "loss": 3.7198, + "step": 37205 + }, + { + "epoch": 0.22127462175278334, + "grad_norm": 2.212334156036377, + "learning_rate": 4.419916438129632e-05, + "loss": 3.9509, + "step": 37206 + }, + { + "epoch": 0.22128056903606433, + "grad_norm": 2.515057325363159, + "learning_rate": 4.4198865206040275e-05, + "loss": 4.1821, + "step": 37207 + }, + { + "epoch": 0.22128651631934532, + "grad_norm": 2.1646876335144043, + "learning_rate": 4.4198566024082096e-05, + "loss": 3.7462, + "step": 37208 + }, + { + "epoch": 0.22129246360262633, + "grad_norm": 2.334415912628174, + "learning_rate": 4.4198266835421894e-05, + "loss": 3.8333, + "step": 37209 + }, + { + "epoch": 0.22129841088590732, + "grad_norm": 3.074552536010742, + "learning_rate": 4.419796764005978e-05, + "loss": 4.0844, + "step": 37210 + }, + { + "epoch": 0.2213043581691883, + "grad_norm": 1.8268340826034546, + "learning_rate": 4.419766843799585e-05, + "loss": 4.7415, + "step": 37211 + }, + { + "epoch": 0.22131030545246932, + "grad_norm": 2.2268385887145996, + "learning_rate": 4.419736922923021e-05, + "loss": 4.114, + "step": 37212 + }, + { + "epoch": 0.2213162527357503, + "grad_norm": 1.7792022228240967, + "learning_rate": 4.419707001376297e-05, + "loss": 4.9278, + "step": 37213 + }, + { + "epoch": 0.2213222000190313, + "grad_norm": 3.0060274600982666, + "learning_rate": 4.4196770791594236e-05, + "loss": 2.7797, + "step": 37214 + }, + { + "epoch": 0.22132814730231232, + "grad_norm": 2.738672971725464, + "learning_rate": 4.41964715627241e-05, + "loss": 3.0921, + "step": 37215 + }, + { + "epoch": 0.2213340945855933, + "grad_norm": 1.313151240348816, + "learning_rate": 4.419617232715267e-05, + "loss": 5.1684, + "step": 37216 + }, + { + "epoch": 0.2213400418688743, + "grad_norm": 1.3445955514907837, + "learning_rate": 4.419587308488007e-05, + "loss": 5.3816, + "step": 37217 + }, + { + "epoch": 0.2213459891521553, + "grad_norm": 1.6494323015213013, + "learning_rate": 4.419557383590638e-05, + "loss": 4.7054, + "step": 37218 + }, + { + "epoch": 0.2213519364354363, + "grad_norm": 1.4395712614059448, + "learning_rate": 4.419527458023171e-05, + "loss": 4.5004, + "step": 37219 + }, + { + "epoch": 0.22135788371871729, + "grad_norm": 1.4600639343261719, + "learning_rate": 4.419497531785617e-05, + "loss": 4.6431, + "step": 37220 + }, + { + "epoch": 0.22136383100199827, + "grad_norm": 1.544190764427185, + "learning_rate": 4.419467604877987e-05, + "loss": 4.933, + "step": 37221 + }, + { + "epoch": 0.2213697782852793, + "grad_norm": 1.767937421798706, + "learning_rate": 4.41943767730029e-05, + "loss": 4.8382, + "step": 37222 + }, + { + "epoch": 0.22137572556856028, + "grad_norm": 1.9524779319763184, + "learning_rate": 4.4194077490525373e-05, + "loss": 4.7597, + "step": 37223 + }, + { + "epoch": 0.22138167285184127, + "grad_norm": 1.614169955253601, + "learning_rate": 4.419377820134739e-05, + "loss": 4.9252, + "step": 37224 + }, + { + "epoch": 0.22138762013512228, + "grad_norm": 1.4424415826797485, + "learning_rate": 4.419347890546907e-05, + "loss": 5.3474, + "step": 37225 + }, + { + "epoch": 0.22139356741840327, + "grad_norm": 1.42082941532135, + "learning_rate": 4.419317960289049e-05, + "loss": 5.084, + "step": 37226 + }, + { + "epoch": 0.22139951470168426, + "grad_norm": 1.8457419872283936, + "learning_rate": 4.4192880293611774e-05, + "loss": 5.062, + "step": 37227 + }, + { + "epoch": 0.22140546198496527, + "grad_norm": 1.5780494213104248, + "learning_rate": 4.419258097763301e-05, + "loss": 4.9313, + "step": 37228 + }, + { + "epoch": 0.22141140926824626, + "grad_norm": 1.3056610822677612, + "learning_rate": 4.419228165495433e-05, + "loss": 4.8709, + "step": 37229 + }, + { + "epoch": 0.22141735655152725, + "grad_norm": 1.1669378280639648, + "learning_rate": 4.419198232557582e-05, + "loss": 4.871, + "step": 37230 + }, + { + "epoch": 0.22142330383480827, + "grad_norm": 1.4716078042984009, + "learning_rate": 4.4191682989497584e-05, + "loss": 4.9545, + "step": 37231 + }, + { + "epoch": 0.22142925111808925, + "grad_norm": 2.2933619022369385, + "learning_rate": 4.419138364671973e-05, + "loss": 4.8166, + "step": 37232 + }, + { + "epoch": 0.22143519840137024, + "grad_norm": 1.8404078483581543, + "learning_rate": 4.419108429724236e-05, + "loss": 5.0332, + "step": 37233 + }, + { + "epoch": 0.22144114568465126, + "grad_norm": 1.7566367387771606, + "learning_rate": 4.419078494106559e-05, + "loss": 5.1599, + "step": 37234 + }, + { + "epoch": 0.22144709296793225, + "grad_norm": 1.758940577507019, + "learning_rate": 4.419048557818951e-05, + "loss": 4.9106, + "step": 37235 + }, + { + "epoch": 0.22145304025121323, + "grad_norm": 1.363818645477295, + "learning_rate": 4.4190186208614224e-05, + "loss": 5.6113, + "step": 37236 + }, + { + "epoch": 0.22145898753449425, + "grad_norm": 1.3969568014144897, + "learning_rate": 4.4189886832339857e-05, + "loss": 5.5177, + "step": 37237 + }, + { + "epoch": 0.22146493481777524, + "grad_norm": 1.6910165548324585, + "learning_rate": 4.418958744936648e-05, + "loss": 4.9269, + "step": 37238 + }, + { + "epoch": 0.22147088210105623, + "grad_norm": 1.5654246807098389, + "learning_rate": 4.418928805969423e-05, + "loss": 5.0792, + "step": 37239 + }, + { + "epoch": 0.22147682938433724, + "grad_norm": 1.5806190967559814, + "learning_rate": 4.41889886633232e-05, + "loss": 4.8614, + "step": 37240 + }, + { + "epoch": 0.22148277666761823, + "grad_norm": 2.206296443939209, + "learning_rate": 4.418868926025347e-05, + "loss": 4.1954, + "step": 37241 + }, + { + "epoch": 0.22148872395089922, + "grad_norm": 1.6986488103866577, + "learning_rate": 4.418838985048519e-05, + "loss": 5.2841, + "step": 37242 + }, + { + "epoch": 0.22149467123418023, + "grad_norm": 1.815557599067688, + "learning_rate": 4.418809043401843e-05, + "loss": 5.0666, + "step": 37243 + }, + { + "epoch": 0.22150061851746122, + "grad_norm": 1.6166787147521973, + "learning_rate": 4.418779101085331e-05, + "loss": 5.0519, + "step": 37244 + }, + { + "epoch": 0.2215065658007422, + "grad_norm": 1.575146198272705, + "learning_rate": 4.4187491580989926e-05, + "loss": 4.9857, + "step": 37245 + }, + { + "epoch": 0.22151251308402323, + "grad_norm": 1.4937185049057007, + "learning_rate": 4.4187192144428395e-05, + "loss": 5.3216, + "step": 37246 + }, + { + "epoch": 0.22151846036730422, + "grad_norm": 1.5050480365753174, + "learning_rate": 4.4186892701168805e-05, + "loss": 5.5195, + "step": 37247 + }, + { + "epoch": 0.2215244076505852, + "grad_norm": 1.4928349256515503, + "learning_rate": 4.418659325121128e-05, + "loss": 5.1624, + "step": 37248 + }, + { + "epoch": 0.22153035493386622, + "grad_norm": 1.5524920225143433, + "learning_rate": 4.4186293794555904e-05, + "loss": 4.8779, + "step": 37249 + }, + { + "epoch": 0.2215363022171472, + "grad_norm": 2.0788793563842773, + "learning_rate": 4.4185994331202795e-05, + "loss": 5.0675, + "step": 37250 + }, + { + "epoch": 0.2215422495004282, + "grad_norm": 1.9269020557403564, + "learning_rate": 4.418569486115205e-05, + "loss": 4.9713, + "step": 37251 + }, + { + "epoch": 0.2215481967837092, + "grad_norm": 1.3561869859695435, + "learning_rate": 4.4185395384403784e-05, + "loss": 4.9614, + "step": 37252 + }, + { + "epoch": 0.2215541440669902, + "grad_norm": 1.6398110389709473, + "learning_rate": 4.4185095900958085e-05, + "loss": 5.0054, + "step": 37253 + }, + { + "epoch": 0.2215600913502712, + "grad_norm": 1.5096663236618042, + "learning_rate": 4.418479641081507e-05, + "loss": 5.2334, + "step": 37254 + }, + { + "epoch": 0.2215660386335522, + "grad_norm": 1.9203683137893677, + "learning_rate": 4.418449691397485e-05, + "loss": 4.7206, + "step": 37255 + }, + { + "epoch": 0.2215719859168332, + "grad_norm": 1.8168144226074219, + "learning_rate": 4.4184197410437514e-05, + "loss": 4.6005, + "step": 37256 + }, + { + "epoch": 0.22157793320011418, + "grad_norm": 1.7423299551010132, + "learning_rate": 4.4183897900203164e-05, + "loss": 4.8605, + "step": 37257 + }, + { + "epoch": 0.2215838804833952, + "grad_norm": 1.7743721008300781, + "learning_rate": 4.418359838327193e-05, + "loss": 4.8841, + "step": 37258 + }, + { + "epoch": 0.22158982776667618, + "grad_norm": 1.8115425109863281, + "learning_rate": 4.418329885964389e-05, + "loss": 4.7159, + "step": 37259 + }, + { + "epoch": 0.22159577504995717, + "grad_norm": 1.4087785482406616, + "learning_rate": 4.418299932931916e-05, + "loss": 4.8462, + "step": 37260 + }, + { + "epoch": 0.2216017223332382, + "grad_norm": 1.7165182828903198, + "learning_rate": 4.4182699792297844e-05, + "loss": 4.66, + "step": 37261 + }, + { + "epoch": 0.22160766961651918, + "grad_norm": 1.7734102010726929, + "learning_rate": 4.418240024858004e-05, + "loss": 5.1589, + "step": 37262 + }, + { + "epoch": 0.22161361689980016, + "grad_norm": 1.6220389604568481, + "learning_rate": 4.418210069816586e-05, + "loss": 5.4969, + "step": 37263 + }, + { + "epoch": 0.22161956418308118, + "grad_norm": 1.607691764831543, + "learning_rate": 4.4181801141055415e-05, + "loss": 5.2169, + "step": 37264 + }, + { + "epoch": 0.22162551146636217, + "grad_norm": 2.094848871231079, + "learning_rate": 4.418150157724879e-05, + "loss": 5.2492, + "step": 37265 + }, + { + "epoch": 0.22163145874964316, + "grad_norm": 1.8658332824707031, + "learning_rate": 4.418120200674611e-05, + "loss": 5.299, + "step": 37266 + }, + { + "epoch": 0.22163740603292417, + "grad_norm": 1.4364315271377563, + "learning_rate": 4.418090242954748e-05, + "loss": 5.0549, + "step": 37267 + }, + { + "epoch": 0.22164335331620516, + "grad_norm": 1.4865174293518066, + "learning_rate": 4.4180602845652975e-05, + "loss": 4.9178, + "step": 37268 + }, + { + "epoch": 0.22164930059948615, + "grad_norm": 1.584671974182129, + "learning_rate": 4.4180303255062724e-05, + "loss": 5.0584, + "step": 37269 + }, + { + "epoch": 0.22165524788276716, + "grad_norm": 1.6680519580841064, + "learning_rate": 4.4180003657776834e-05, + "loss": 5.4479, + "step": 37270 + }, + { + "epoch": 0.22166119516604815, + "grad_norm": 2.0023248195648193, + "learning_rate": 4.41797040537954e-05, + "loss": 3.9446, + "step": 37271 + }, + { + "epoch": 0.22166714244932914, + "grad_norm": 2.2941033840179443, + "learning_rate": 4.4179404443118534e-05, + "loss": 2.83, + "step": 37272 + }, + { + "epoch": 0.22167308973261016, + "grad_norm": 2.8651883602142334, + "learning_rate": 4.4179104825746335e-05, + "loss": 3.4193, + "step": 37273 + }, + { + "epoch": 0.22167903701589114, + "grad_norm": 2.1556551456451416, + "learning_rate": 4.4178805201678895e-05, + "loss": 4.4954, + "step": 37274 + }, + { + "epoch": 0.22168498429917213, + "grad_norm": 2.3173985481262207, + "learning_rate": 4.417850557091635e-05, + "loss": 5.131, + "step": 37275 + }, + { + "epoch": 0.22169093158245315, + "grad_norm": 1.8110771179199219, + "learning_rate": 4.417820593345878e-05, + "loss": 5.0599, + "step": 37276 + }, + { + "epoch": 0.22169687886573414, + "grad_norm": 1.6023890972137451, + "learning_rate": 4.417790628930629e-05, + "loss": 4.9864, + "step": 37277 + }, + { + "epoch": 0.22170282614901513, + "grad_norm": 1.5635809898376465, + "learning_rate": 4.4177606638459004e-05, + "loss": 5.6305, + "step": 37278 + }, + { + "epoch": 0.2217087734322961, + "grad_norm": 1.7081363201141357, + "learning_rate": 4.4177306980917e-05, + "loss": 4.4753, + "step": 37279 + }, + { + "epoch": 0.22171472071557713, + "grad_norm": 1.7905879020690918, + "learning_rate": 4.4177007316680404e-05, + "loss": 4.9362, + "step": 37280 + }, + { + "epoch": 0.22172066799885812, + "grad_norm": 1.5954350233078003, + "learning_rate": 4.4176707645749316e-05, + "loss": 5.0016, + "step": 37281 + }, + { + "epoch": 0.2217266152821391, + "grad_norm": 1.5420632362365723, + "learning_rate": 4.4176407968123834e-05, + "loss": 5.1209, + "step": 37282 + }, + { + "epoch": 0.22173256256542012, + "grad_norm": 1.4477598667144775, + "learning_rate": 4.417610828380406e-05, + "loss": 5.4703, + "step": 37283 + }, + { + "epoch": 0.2217385098487011, + "grad_norm": 1.7971065044403076, + "learning_rate": 4.417580859279011e-05, + "loss": 4.1857, + "step": 37284 + }, + { + "epoch": 0.2217444571319821, + "grad_norm": 1.6404802799224854, + "learning_rate": 4.417550889508208e-05, + "loss": 4.6683, + "step": 37285 + }, + { + "epoch": 0.2217504044152631, + "grad_norm": 1.6057367324829102, + "learning_rate": 4.417520919068009e-05, + "loss": 4.7164, + "step": 37286 + }, + { + "epoch": 0.2217563516985441, + "grad_norm": 1.6254706382751465, + "learning_rate": 4.4174909479584214e-05, + "loss": 4.9378, + "step": 37287 + }, + { + "epoch": 0.2217622989818251, + "grad_norm": 1.501516342163086, + "learning_rate": 4.417460976179459e-05, + "loss": 5.6384, + "step": 37288 + }, + { + "epoch": 0.2217682462651061, + "grad_norm": 1.5623992681503296, + "learning_rate": 4.417431003731131e-05, + "loss": 5.4863, + "step": 37289 + }, + { + "epoch": 0.2217741935483871, + "grad_norm": 1.533334493637085, + "learning_rate": 4.417401030613446e-05, + "loss": 5.4763, + "step": 37290 + }, + { + "epoch": 0.22178014083166808, + "grad_norm": 1.5613082647323608, + "learning_rate": 4.417371056826417e-05, + "loss": 5.5083, + "step": 37291 + }, + { + "epoch": 0.2217860881149491, + "grad_norm": 1.5319432020187378, + "learning_rate": 4.417341082370054e-05, + "loss": 5.4524, + "step": 37292 + }, + { + "epoch": 0.2217920353982301, + "grad_norm": 1.9295907020568848, + "learning_rate": 4.417311107244366e-05, + "loss": 5.2326, + "step": 37293 + }, + { + "epoch": 0.22179798268151107, + "grad_norm": 1.6446950435638428, + "learning_rate": 4.417281131449366e-05, + "loss": 5.2152, + "step": 37294 + }, + { + "epoch": 0.2218039299647921, + "grad_norm": 1.6639310121536255, + "learning_rate": 4.417251154985062e-05, + "loss": 5.2117, + "step": 37295 + }, + { + "epoch": 0.22180987724807308, + "grad_norm": 1.6263519525527954, + "learning_rate": 4.417221177851466e-05, + "loss": 5.2871, + "step": 37296 + }, + { + "epoch": 0.22181582453135407, + "grad_norm": 1.3505241870880127, + "learning_rate": 4.4171912000485874e-05, + "loss": 5.4459, + "step": 37297 + }, + { + "epoch": 0.22182177181463508, + "grad_norm": 1.5780766010284424, + "learning_rate": 4.4171612215764366e-05, + "loss": 5.4587, + "step": 37298 + }, + { + "epoch": 0.22182771909791607, + "grad_norm": 1.377548336982727, + "learning_rate": 4.4171312424350253e-05, + "loss": 5.0177, + "step": 37299 + }, + { + "epoch": 0.22183366638119706, + "grad_norm": 1.283535361289978, + "learning_rate": 4.417101262624363e-05, + "loss": 4.9686, + "step": 37300 + }, + { + "epoch": 0.22183961366447807, + "grad_norm": 1.591565489768982, + "learning_rate": 4.4170712821444604e-05, + "loss": 4.9452, + "step": 37301 + }, + { + "epoch": 0.22184556094775906, + "grad_norm": 1.7594454288482666, + "learning_rate": 4.417041300995329e-05, + "loss": 4.8756, + "step": 37302 + }, + { + "epoch": 0.22185150823104005, + "grad_norm": 1.743808388710022, + "learning_rate": 4.417011319176977e-05, + "loss": 4.8011, + "step": 37303 + }, + { + "epoch": 0.22185745551432107, + "grad_norm": 1.5689365863800049, + "learning_rate": 4.416981336689417e-05, + "loss": 4.4118, + "step": 37304 + }, + { + "epoch": 0.22186340279760206, + "grad_norm": 2.2633965015411377, + "learning_rate": 4.4169513535326585e-05, + "loss": 4.8006, + "step": 37305 + }, + { + "epoch": 0.22186935008088304, + "grad_norm": 2.0904433727264404, + "learning_rate": 4.416921369706712e-05, + "loss": 3.6457, + "step": 37306 + }, + { + "epoch": 0.22187529736416406, + "grad_norm": 2.4290525913238525, + "learning_rate": 4.4168913852115876e-05, + "loss": 3.5459, + "step": 37307 + }, + { + "epoch": 0.22188124464744505, + "grad_norm": 2.113612413406372, + "learning_rate": 4.416861400047297e-05, + "loss": 3.6094, + "step": 37308 + }, + { + "epoch": 0.22188719193072604, + "grad_norm": 1.9198821783065796, + "learning_rate": 4.416831414213849e-05, + "loss": 3.6966, + "step": 37309 + }, + { + "epoch": 0.22189313921400705, + "grad_norm": 2.143109083175659, + "learning_rate": 4.4168014277112554e-05, + "loss": 3.36, + "step": 37310 + }, + { + "epoch": 0.22189908649728804, + "grad_norm": 2.0741262435913086, + "learning_rate": 4.4167714405395267e-05, + "loss": 3.8931, + "step": 37311 + }, + { + "epoch": 0.22190503378056903, + "grad_norm": 1.7945109605789185, + "learning_rate": 4.416741452698673e-05, + "loss": 5.0497, + "step": 37312 + }, + { + "epoch": 0.22191098106385004, + "grad_norm": 1.7045809030532837, + "learning_rate": 4.4167114641887033e-05, + "loss": 5.1705, + "step": 37313 + }, + { + "epoch": 0.22191692834713103, + "grad_norm": 2.1909990310668945, + "learning_rate": 4.4166814750096305e-05, + "loss": 4.0491, + "step": 37314 + }, + { + "epoch": 0.22192287563041202, + "grad_norm": 1.9659631252288818, + "learning_rate": 4.416651485161464e-05, + "loss": 4.8632, + "step": 37315 + }, + { + "epoch": 0.22192882291369304, + "grad_norm": 2.046928644180298, + "learning_rate": 4.416621494644214e-05, + "loss": 4.8176, + "step": 37316 + }, + { + "epoch": 0.22193477019697402, + "grad_norm": 1.9225927591323853, + "learning_rate": 4.416591503457891e-05, + "loss": 5.1519, + "step": 37317 + }, + { + "epoch": 0.221940717480255, + "grad_norm": 1.9699875116348267, + "learning_rate": 4.416561511602506e-05, + "loss": 4.6214, + "step": 37318 + }, + { + "epoch": 0.22194666476353603, + "grad_norm": 1.7100906372070312, + "learning_rate": 4.416531519078069e-05, + "loss": 4.6123, + "step": 37319 + }, + { + "epoch": 0.22195261204681702, + "grad_norm": 2.0231447219848633, + "learning_rate": 4.416501525884591e-05, + "loss": 4.7937, + "step": 37320 + }, + { + "epoch": 0.221958559330098, + "grad_norm": 1.9513578414916992, + "learning_rate": 4.4164715320220814e-05, + "loss": 5.5448, + "step": 37321 + }, + { + "epoch": 0.22196450661337902, + "grad_norm": 1.7066813707351685, + "learning_rate": 4.416441537490552e-05, + "loss": 5.2223, + "step": 37322 + }, + { + "epoch": 0.22197045389666, + "grad_norm": 2.141442060470581, + "learning_rate": 4.416411542290013e-05, + "loss": 4.6248, + "step": 37323 + }, + { + "epoch": 0.221976401179941, + "grad_norm": 2.92130970954895, + "learning_rate": 4.416381546420474e-05, + "loss": 3.9375, + "step": 37324 + }, + { + "epoch": 0.221982348463222, + "grad_norm": 1.8970509767532349, + "learning_rate": 4.4163515498819464e-05, + "loss": 4.4692, + "step": 37325 + }, + { + "epoch": 0.221988295746503, + "grad_norm": 1.6135637760162354, + "learning_rate": 4.41632155267444e-05, + "loss": 4.2313, + "step": 37326 + }, + { + "epoch": 0.221994243029784, + "grad_norm": 1.5715364217758179, + "learning_rate": 4.4162915547979655e-05, + "loss": 4.7554, + "step": 37327 + }, + { + "epoch": 0.222000190313065, + "grad_norm": 2.162321090698242, + "learning_rate": 4.416261556252533e-05, + "loss": 4.9361, + "step": 37328 + }, + { + "epoch": 0.222006137596346, + "grad_norm": 1.5135966539382935, + "learning_rate": 4.416231557038154e-05, + "loss": 5.1464, + "step": 37329 + }, + { + "epoch": 0.22201208487962698, + "grad_norm": 1.588383436203003, + "learning_rate": 4.416201557154838e-05, + "loss": 5.0772, + "step": 37330 + }, + { + "epoch": 0.222018032162908, + "grad_norm": 1.5293753147125244, + "learning_rate": 4.416171556602596e-05, + "loss": 4.5423, + "step": 37331 + }, + { + "epoch": 0.22202397944618898, + "grad_norm": 1.4758036136627197, + "learning_rate": 4.416141555381439e-05, + "loss": 5.0102, + "step": 37332 + }, + { + "epoch": 0.22202992672946997, + "grad_norm": 1.5266300439834595, + "learning_rate": 4.4161115534913755e-05, + "loss": 4.7217, + "step": 37333 + }, + { + "epoch": 0.222035874012751, + "grad_norm": 1.528539776802063, + "learning_rate": 4.4160815509324184e-05, + "loss": 4.7979, + "step": 37334 + }, + { + "epoch": 0.22204182129603198, + "grad_norm": 1.58788001537323, + "learning_rate": 4.4160515477045764e-05, + "loss": 4.4164, + "step": 37335 + }, + { + "epoch": 0.22204776857931297, + "grad_norm": 1.5381730794906616, + "learning_rate": 4.416021543807861e-05, + "loss": 4.9237, + "step": 37336 + }, + { + "epoch": 0.22205371586259395, + "grad_norm": 1.800580620765686, + "learning_rate": 4.4159915392422814e-05, + "loss": 4.7554, + "step": 37337 + }, + { + "epoch": 0.22205966314587497, + "grad_norm": 1.5559518337249756, + "learning_rate": 4.4159615340078495e-05, + "loss": 4.8953, + "step": 37338 + }, + { + "epoch": 0.22206561042915596, + "grad_norm": 1.968245029449463, + "learning_rate": 4.415931528104575e-05, + "loss": 4.4203, + "step": 37339 + }, + { + "epoch": 0.22207155771243695, + "grad_norm": 1.6635748147964478, + "learning_rate": 4.4159015215324696e-05, + "loss": 4.3885, + "step": 37340 + }, + { + "epoch": 0.22207750499571796, + "grad_norm": 1.7728335857391357, + "learning_rate": 4.415871514291542e-05, + "loss": 3.8601, + "step": 37341 + }, + { + "epoch": 0.22208345227899895, + "grad_norm": 1.3408424854278564, + "learning_rate": 4.4158415063818025e-05, + "loss": 3.8574, + "step": 37342 + }, + { + "epoch": 0.22208939956227994, + "grad_norm": 1.610424280166626, + "learning_rate": 4.415811497803264e-05, + "loss": 4.7664, + "step": 37343 + }, + { + "epoch": 0.22209534684556095, + "grad_norm": 1.6959972381591797, + "learning_rate": 4.415781488555935e-05, + "loss": 4.9161, + "step": 37344 + }, + { + "epoch": 0.22210129412884194, + "grad_norm": 1.5437208414077759, + "learning_rate": 4.415751478639826e-05, + "loss": 4.9545, + "step": 37345 + }, + { + "epoch": 0.22210724141212293, + "grad_norm": 1.6301335096359253, + "learning_rate": 4.4157214680549485e-05, + "loss": 4.4827, + "step": 37346 + }, + { + "epoch": 0.22211318869540395, + "grad_norm": 1.9365746974945068, + "learning_rate": 4.415691456801313e-05, + "loss": 4.0574, + "step": 37347 + }, + { + "epoch": 0.22211913597868493, + "grad_norm": 1.971279501914978, + "learning_rate": 4.415661444878928e-05, + "loss": 3.8612, + "step": 37348 + }, + { + "epoch": 0.22212508326196592, + "grad_norm": 1.8018229007720947, + "learning_rate": 4.4156314322878064e-05, + "loss": 4.1403, + "step": 37349 + }, + { + "epoch": 0.22213103054524694, + "grad_norm": 1.7496007680892944, + "learning_rate": 4.4156014190279576e-05, + "loss": 4.2662, + "step": 37350 + }, + { + "epoch": 0.22213697782852793, + "grad_norm": 2.0904650688171387, + "learning_rate": 4.415571405099391e-05, + "loss": 4.4694, + "step": 37351 + }, + { + "epoch": 0.22214292511180891, + "grad_norm": 1.7494895458221436, + "learning_rate": 4.41554139050212e-05, + "loss": 4.4199, + "step": 37352 + }, + { + "epoch": 0.22214887239508993, + "grad_norm": 1.7014201879501343, + "learning_rate": 4.415511375236152e-05, + "loss": 4.5209, + "step": 37353 + }, + { + "epoch": 0.22215481967837092, + "grad_norm": 1.5451538562774658, + "learning_rate": 4.415481359301499e-05, + "loss": 4.2152, + "step": 37354 + }, + { + "epoch": 0.2221607669616519, + "grad_norm": 1.3573757410049438, + "learning_rate": 4.4154513426981714e-05, + "loss": 3.6105, + "step": 37355 + }, + { + "epoch": 0.22216671424493292, + "grad_norm": 1.498342752456665, + "learning_rate": 4.41542132542618e-05, + "loss": 3.9755, + "step": 37356 + }, + { + "epoch": 0.2221726615282139, + "grad_norm": 1.7153942584991455, + "learning_rate": 4.4153913074855344e-05, + "loss": 4.0452, + "step": 37357 + }, + { + "epoch": 0.2221786088114949, + "grad_norm": 1.6392310857772827, + "learning_rate": 4.4153612888762455e-05, + "loss": 4.6287, + "step": 37358 + }, + { + "epoch": 0.22218455609477591, + "grad_norm": 1.8928215503692627, + "learning_rate": 4.415331269598324e-05, + "loss": 4.587, + "step": 37359 + }, + { + "epoch": 0.2221905033780569, + "grad_norm": 1.5934067964553833, + "learning_rate": 4.415301249651779e-05, + "loss": 4.626, + "step": 37360 + }, + { + "epoch": 0.2221964506613379, + "grad_norm": 1.2099053859710693, + "learning_rate": 4.415271229036623e-05, + "loss": 4.7737, + "step": 37361 + }, + { + "epoch": 0.2222023979446189, + "grad_norm": 1.5044233798980713, + "learning_rate": 4.415241207752866e-05, + "loss": 4.94, + "step": 37362 + }, + { + "epoch": 0.2222083452278999, + "grad_norm": 1.8237147331237793, + "learning_rate": 4.415211185800517e-05, + "loss": 4.2119, + "step": 37363 + }, + { + "epoch": 0.22221429251118088, + "grad_norm": 1.3939549922943115, + "learning_rate": 4.415181163179589e-05, + "loss": 5.1684, + "step": 37364 + }, + { + "epoch": 0.2222202397944619, + "grad_norm": 1.8115434646606445, + "learning_rate": 4.41515113989009e-05, + "loss": 4.5456, + "step": 37365 + }, + { + "epoch": 0.2222261870777429, + "grad_norm": 1.6453301906585693, + "learning_rate": 4.415121115932031e-05, + "loss": 4.6794, + "step": 37366 + }, + { + "epoch": 0.22223213436102388, + "grad_norm": 1.6238987445831299, + "learning_rate": 4.4150910913054244e-05, + "loss": 4.7563, + "step": 37367 + }, + { + "epoch": 0.2222380816443049, + "grad_norm": 1.9033849239349365, + "learning_rate": 4.415061066010279e-05, + "loss": 4.5263, + "step": 37368 + }, + { + "epoch": 0.22224402892758588, + "grad_norm": 1.6006360054016113, + "learning_rate": 4.415031040046605e-05, + "loss": 4.7523, + "step": 37369 + }, + { + "epoch": 0.22224997621086687, + "grad_norm": 1.5096614360809326, + "learning_rate": 4.415001013414414e-05, + "loss": 4.872, + "step": 37370 + }, + { + "epoch": 0.22225592349414788, + "grad_norm": 1.7860287427902222, + "learning_rate": 4.414970986113716e-05, + "loss": 4.4742, + "step": 37371 + }, + { + "epoch": 0.22226187077742887, + "grad_norm": 1.6604384183883667, + "learning_rate": 4.41494095814452e-05, + "loss": 4.7356, + "step": 37372 + }, + { + "epoch": 0.22226781806070986, + "grad_norm": 1.7952136993408203, + "learning_rate": 4.414910929506839e-05, + "loss": 4.5291, + "step": 37373 + }, + { + "epoch": 0.22227376534399088, + "grad_norm": 1.5343014001846313, + "learning_rate": 4.414880900200682e-05, + "loss": 4.6148, + "step": 37374 + }, + { + "epoch": 0.22227971262727186, + "grad_norm": 1.661389946937561, + "learning_rate": 4.4148508702260605e-05, + "loss": 4.071, + "step": 37375 + }, + { + "epoch": 0.22228565991055285, + "grad_norm": 1.4415347576141357, + "learning_rate": 4.414820839582984e-05, + "loss": 4.4066, + "step": 37376 + }, + { + "epoch": 0.22229160719383387, + "grad_norm": 1.6499462127685547, + "learning_rate": 4.414790808271464e-05, + "loss": 4.3395, + "step": 37377 + }, + { + "epoch": 0.22229755447711486, + "grad_norm": 1.5493072271347046, + "learning_rate": 4.414760776291509e-05, + "loss": 4.3965, + "step": 37378 + }, + { + "epoch": 0.22230350176039584, + "grad_norm": 1.5924429893493652, + "learning_rate": 4.4147307436431316e-05, + "loss": 4.4357, + "step": 37379 + }, + { + "epoch": 0.22230944904367686, + "grad_norm": 1.7015823125839233, + "learning_rate": 4.4147007103263415e-05, + "loss": 4.2155, + "step": 37380 + }, + { + "epoch": 0.22231539632695785, + "grad_norm": 1.7009806632995605, + "learning_rate": 4.414670676341149e-05, + "loss": 4.6103, + "step": 37381 + }, + { + "epoch": 0.22232134361023884, + "grad_norm": 1.572592854499817, + "learning_rate": 4.414640641687564e-05, + "loss": 4.6888, + "step": 37382 + }, + { + "epoch": 0.22232729089351985, + "grad_norm": 1.8123164176940918, + "learning_rate": 4.414610606365599e-05, + "loss": 4.3521, + "step": 37383 + }, + { + "epoch": 0.22233323817680084, + "grad_norm": 1.926174521446228, + "learning_rate": 4.414580570375262e-05, + "loss": 4.2927, + "step": 37384 + }, + { + "epoch": 0.22233918546008183, + "grad_norm": 1.7663146257400513, + "learning_rate": 4.414550533716566e-05, + "loss": 4.4666, + "step": 37385 + }, + { + "epoch": 0.22234513274336284, + "grad_norm": 2.544118881225586, + "learning_rate": 4.414520496389519e-05, + "loss": 2.8958, + "step": 37386 + }, + { + "epoch": 0.22235108002664383, + "grad_norm": 1.5476171970367432, + "learning_rate": 4.414490458394134e-05, + "loss": 4.5871, + "step": 37387 + }, + { + "epoch": 0.22235702730992482, + "grad_norm": 1.7039881944656372, + "learning_rate": 4.414460419730419e-05, + "loss": 4.8665, + "step": 37388 + }, + { + "epoch": 0.22236297459320584, + "grad_norm": 1.6667733192443848, + "learning_rate": 4.414430380398386e-05, + "loss": 4.4097, + "step": 37389 + }, + { + "epoch": 0.22236892187648682, + "grad_norm": 1.839328408241272, + "learning_rate": 4.414400340398045e-05, + "loss": 4.3882, + "step": 37390 + }, + { + "epoch": 0.2223748691597678, + "grad_norm": 1.97493314743042, + "learning_rate": 4.4143702997294066e-05, + "loss": 4.1716, + "step": 37391 + }, + { + "epoch": 0.22238081644304883, + "grad_norm": 1.4891178607940674, + "learning_rate": 4.414340258392482e-05, + "loss": 4.6106, + "step": 37392 + }, + { + "epoch": 0.22238676372632982, + "grad_norm": 1.6419004201889038, + "learning_rate": 4.414310216387281e-05, + "loss": 4.6838, + "step": 37393 + }, + { + "epoch": 0.2223927110096108, + "grad_norm": 1.5355687141418457, + "learning_rate": 4.414280173713813e-05, + "loss": 4.5691, + "step": 37394 + }, + { + "epoch": 0.2223986582928918, + "grad_norm": 1.6541396379470825, + "learning_rate": 4.4142501303720904e-05, + "loss": 4.7063, + "step": 37395 + }, + { + "epoch": 0.2224046055761728, + "grad_norm": 1.459181308746338, + "learning_rate": 4.4142200863621226e-05, + "loss": 4.9176, + "step": 37396 + }, + { + "epoch": 0.2224105528594538, + "grad_norm": 1.7777023315429688, + "learning_rate": 4.4141900416839196e-05, + "loss": 4.4156, + "step": 37397 + }, + { + "epoch": 0.22241650014273479, + "grad_norm": 2.0304362773895264, + "learning_rate": 4.4141599963374944e-05, + "loss": 4.2872, + "step": 37398 + }, + { + "epoch": 0.2224224474260158, + "grad_norm": 1.6349958181381226, + "learning_rate": 4.414129950322854e-05, + "loss": 4.6948, + "step": 37399 + }, + { + "epoch": 0.2224283947092968, + "grad_norm": 2.002707004547119, + "learning_rate": 4.4140999036400116e-05, + "loss": 4.0864, + "step": 37400 + }, + { + "epoch": 0.22243434199257778, + "grad_norm": 1.1629236936569214, + "learning_rate": 4.4140698562889765e-05, + "loss": 5.0643, + "step": 37401 + }, + { + "epoch": 0.2224402892758588, + "grad_norm": 1.2741730213165283, + "learning_rate": 4.414039808269759e-05, + "loss": 4.8496, + "step": 37402 + }, + { + "epoch": 0.22244623655913978, + "grad_norm": 1.4503839015960693, + "learning_rate": 4.41400975958237e-05, + "loss": 5.0413, + "step": 37403 + }, + { + "epoch": 0.22245218384242077, + "grad_norm": 1.4399816989898682, + "learning_rate": 4.413979710226821e-05, + "loss": 5.1163, + "step": 37404 + }, + { + "epoch": 0.22245813112570179, + "grad_norm": 1.431563138961792, + "learning_rate": 4.4139496602031204e-05, + "loss": 4.866, + "step": 37405 + }, + { + "epoch": 0.22246407840898277, + "grad_norm": 1.7766673564910889, + "learning_rate": 4.41391960951128e-05, + "loss": 5.1027, + "step": 37406 + }, + { + "epoch": 0.22247002569226376, + "grad_norm": 1.546772837638855, + "learning_rate": 4.41388955815131e-05, + "loss": 4.9627, + "step": 37407 + }, + { + "epoch": 0.22247597297554478, + "grad_norm": 1.5983178615570068, + "learning_rate": 4.413859506123221e-05, + "loss": 5.0733, + "step": 37408 + }, + { + "epoch": 0.22248192025882577, + "grad_norm": 1.9304602146148682, + "learning_rate": 4.4138294534270234e-05, + "loss": 4.9825, + "step": 37409 + }, + { + "epoch": 0.22248786754210675, + "grad_norm": 1.861481785774231, + "learning_rate": 4.413799400062728e-05, + "loss": 4.7403, + "step": 37410 + }, + { + "epoch": 0.22249381482538777, + "grad_norm": 1.5608370304107666, + "learning_rate": 4.413769346030345e-05, + "loss": 5.0014, + "step": 37411 + }, + { + "epoch": 0.22249976210866876, + "grad_norm": 1.7520523071289062, + "learning_rate": 4.413739291329884e-05, + "loss": 4.8831, + "step": 37412 + }, + { + "epoch": 0.22250570939194975, + "grad_norm": 1.58255136013031, + "learning_rate": 4.413709235961358e-05, + "loss": 4.8975, + "step": 37413 + }, + { + "epoch": 0.22251165667523076, + "grad_norm": 1.6198471784591675, + "learning_rate": 4.413679179924774e-05, + "loss": 4.9793, + "step": 37414 + }, + { + "epoch": 0.22251760395851175, + "grad_norm": 1.5712491273880005, + "learning_rate": 4.4136491232201454e-05, + "loss": 5.107, + "step": 37415 + }, + { + "epoch": 0.22252355124179274, + "grad_norm": 1.3748947381973267, + "learning_rate": 4.413619065847482e-05, + "loss": 5.0109, + "step": 37416 + }, + { + "epoch": 0.22252949852507375, + "grad_norm": 1.292171597480774, + "learning_rate": 4.4135890078067935e-05, + "loss": 4.7851, + "step": 37417 + }, + { + "epoch": 0.22253544580835474, + "grad_norm": 1.2264519929885864, + "learning_rate": 4.413558949098091e-05, + "loss": 4.7857, + "step": 37418 + }, + { + "epoch": 0.22254139309163573, + "grad_norm": 1.3294142484664917, + "learning_rate": 4.413528889721385e-05, + "loss": 4.6828, + "step": 37419 + }, + { + "epoch": 0.22254734037491675, + "grad_norm": 1.415412425994873, + "learning_rate": 4.413498829676685e-05, + "loss": 4.7359, + "step": 37420 + }, + { + "epoch": 0.22255328765819773, + "grad_norm": 1.5666321516036987, + "learning_rate": 4.4134687689640016e-05, + "loss": 4.8958, + "step": 37421 + }, + { + "epoch": 0.22255923494147872, + "grad_norm": 1.5865098237991333, + "learning_rate": 4.4134387075833484e-05, + "loss": 4.7827, + "step": 37422 + }, + { + "epoch": 0.22256518222475974, + "grad_norm": 1.4719741344451904, + "learning_rate": 4.4134086455347325e-05, + "loss": 4.673, + "step": 37423 + }, + { + "epoch": 0.22257112950804073, + "grad_norm": 1.574626088142395, + "learning_rate": 4.413378582818165e-05, + "loss": 4.6705, + "step": 37424 + }, + { + "epoch": 0.22257707679132172, + "grad_norm": 1.4533343315124512, + "learning_rate": 4.413348519433657e-05, + "loss": 4.5065, + "step": 37425 + }, + { + "epoch": 0.22258302407460273, + "grad_norm": 1.488586187362671, + "learning_rate": 4.413318455381219e-05, + "loss": 4.5226, + "step": 37426 + }, + { + "epoch": 0.22258897135788372, + "grad_norm": 1.7579782009124756, + "learning_rate": 4.4132883906608616e-05, + "loss": 4.6338, + "step": 37427 + }, + { + "epoch": 0.2225949186411647, + "grad_norm": 1.8849931955337524, + "learning_rate": 4.413258325272594e-05, + "loss": 4.6397, + "step": 37428 + }, + { + "epoch": 0.22260086592444572, + "grad_norm": 1.6533501148223877, + "learning_rate": 4.4132282592164286e-05, + "loss": 5.01, + "step": 37429 + }, + { + "epoch": 0.2226068132077267, + "grad_norm": 1.648901104927063, + "learning_rate": 4.4131981924923744e-05, + "loss": 4.9204, + "step": 37430 + }, + { + "epoch": 0.2226127604910077, + "grad_norm": 1.7030214071273804, + "learning_rate": 4.413168125100443e-05, + "loss": 4.839, + "step": 37431 + }, + { + "epoch": 0.22261870777428872, + "grad_norm": 1.5959028005599976, + "learning_rate": 4.413138057040644e-05, + "loss": 4.8709, + "step": 37432 + }, + { + "epoch": 0.2226246550575697, + "grad_norm": 1.9714707136154175, + "learning_rate": 4.413107988312988e-05, + "loss": 4.7504, + "step": 37433 + }, + { + "epoch": 0.2226306023408507, + "grad_norm": 1.6847248077392578, + "learning_rate": 4.4130779189174865e-05, + "loss": 4.8802, + "step": 37434 + }, + { + "epoch": 0.2226365496241317, + "grad_norm": 1.743908405303955, + "learning_rate": 4.4130478488541486e-05, + "loss": 4.8833, + "step": 37435 + }, + { + "epoch": 0.2226424969074127, + "grad_norm": 1.3226217031478882, + "learning_rate": 4.4130177781229855e-05, + "loss": 4.9731, + "step": 37436 + }, + { + "epoch": 0.22264844419069368, + "grad_norm": 1.6143287420272827, + "learning_rate": 4.412987706724008e-05, + "loss": 4.8707, + "step": 37437 + }, + { + "epoch": 0.2226543914739747, + "grad_norm": 1.3847980499267578, + "learning_rate": 4.4129576346572264e-05, + "loss": 5.3281, + "step": 37438 + }, + { + "epoch": 0.2226603387572557, + "grad_norm": 1.5923258066177368, + "learning_rate": 4.412927561922651e-05, + "loss": 4.7546, + "step": 37439 + }, + { + "epoch": 0.22266628604053668, + "grad_norm": 1.4530616998672485, + "learning_rate": 4.4128974885202914e-05, + "loss": 4.5942, + "step": 37440 + }, + { + "epoch": 0.2226722333238177, + "grad_norm": 1.5023390054702759, + "learning_rate": 4.4128674144501604e-05, + "loss": 4.7337, + "step": 37441 + }, + { + "epoch": 0.22267818060709868, + "grad_norm": 2.181118965148926, + "learning_rate": 4.4128373397122665e-05, + "loss": 3.9208, + "step": 37442 + }, + { + "epoch": 0.22268412789037967, + "grad_norm": 1.8951972723007202, + "learning_rate": 4.412807264306621e-05, + "loss": 3.7684, + "step": 37443 + }, + { + "epoch": 0.22269007517366068, + "grad_norm": 1.570377230644226, + "learning_rate": 4.412777188233234e-05, + "loss": 5.1016, + "step": 37444 + }, + { + "epoch": 0.22269602245694167, + "grad_norm": 1.399253487586975, + "learning_rate": 4.412747111492116e-05, + "loss": 5.2181, + "step": 37445 + }, + { + "epoch": 0.22270196974022266, + "grad_norm": 1.4472614526748657, + "learning_rate": 4.412717034083279e-05, + "loss": 4.9678, + "step": 37446 + }, + { + "epoch": 0.22270791702350368, + "grad_norm": 1.7032182216644287, + "learning_rate": 4.412686956006731e-05, + "loss": 5.055, + "step": 37447 + }, + { + "epoch": 0.22271386430678466, + "grad_norm": 2.5398552417755127, + "learning_rate": 4.412656877262484e-05, + "loss": 4.0731, + "step": 37448 + }, + { + "epoch": 0.22271981159006565, + "grad_norm": 1.3599528074264526, + "learning_rate": 4.4126267978505486e-05, + "loss": 5.2592, + "step": 37449 + }, + { + "epoch": 0.22272575887334667, + "grad_norm": 1.395141839981079, + "learning_rate": 4.412596717770935e-05, + "loss": 5.4062, + "step": 37450 + }, + { + "epoch": 0.22273170615662766, + "grad_norm": 1.623476505279541, + "learning_rate": 4.4125666370236526e-05, + "loss": 4.8599, + "step": 37451 + }, + { + "epoch": 0.22273765343990864, + "grad_norm": 1.533883810043335, + "learning_rate": 4.412536555608714e-05, + "loss": 4.8718, + "step": 37452 + }, + { + "epoch": 0.22274360072318963, + "grad_norm": 1.4520567655563354, + "learning_rate": 4.412506473526128e-05, + "loss": 5.0113, + "step": 37453 + }, + { + "epoch": 0.22274954800647065, + "grad_norm": 1.4977203607559204, + "learning_rate": 4.4124763907759064e-05, + "loss": 4.7799, + "step": 37454 + }, + { + "epoch": 0.22275549528975164, + "grad_norm": 2.2048726081848145, + "learning_rate": 4.412446307358059e-05, + "loss": 4.2383, + "step": 37455 + }, + { + "epoch": 0.22276144257303263, + "grad_norm": 1.8190462589263916, + "learning_rate": 4.4124162232725964e-05, + "loss": 4.8038, + "step": 37456 + }, + { + "epoch": 0.22276738985631364, + "grad_norm": 1.6494126319885254, + "learning_rate": 4.4123861385195286e-05, + "loss": 4.8618, + "step": 37457 + }, + { + "epoch": 0.22277333713959463, + "grad_norm": 1.6867988109588623, + "learning_rate": 4.412356053098866e-05, + "loss": 4.924, + "step": 37458 + }, + { + "epoch": 0.22277928442287562, + "grad_norm": 2.111293077468872, + "learning_rate": 4.412325967010621e-05, + "loss": 3.7083, + "step": 37459 + }, + { + "epoch": 0.22278523170615663, + "grad_norm": 1.785895586013794, + "learning_rate": 4.412295880254802e-05, + "loss": 4.6681, + "step": 37460 + }, + { + "epoch": 0.22279117898943762, + "grad_norm": 2.56091570854187, + "learning_rate": 4.41226579283142e-05, + "loss": 3.0985, + "step": 37461 + }, + { + "epoch": 0.2227971262727186, + "grad_norm": 1.961890459060669, + "learning_rate": 4.412235704740487e-05, + "loss": 4.2298, + "step": 37462 + }, + { + "epoch": 0.22280307355599963, + "grad_norm": 1.4484755992889404, + "learning_rate": 4.4122056159820116e-05, + "loss": 4.8922, + "step": 37463 + }, + { + "epoch": 0.2228090208392806, + "grad_norm": 1.5370919704437256, + "learning_rate": 4.412175526556004e-05, + "loss": 4.8014, + "step": 37464 + }, + { + "epoch": 0.2228149681225616, + "grad_norm": 1.1378029584884644, + "learning_rate": 4.412145436462477e-05, + "loss": 5.0578, + "step": 37465 + }, + { + "epoch": 0.22282091540584262, + "grad_norm": 1.4581009149551392, + "learning_rate": 4.412115345701439e-05, + "loss": 5.0006, + "step": 37466 + }, + { + "epoch": 0.2228268626891236, + "grad_norm": 1.5039770603179932, + "learning_rate": 4.412085254272902e-05, + "loss": 4.8977, + "step": 37467 + }, + { + "epoch": 0.2228328099724046, + "grad_norm": 2.168529510498047, + "learning_rate": 4.412055162176875e-05, + "loss": 3.9599, + "step": 37468 + }, + { + "epoch": 0.2228387572556856, + "grad_norm": 2.1273956298828125, + "learning_rate": 4.41202506941337e-05, + "loss": 4.1872, + "step": 37469 + }, + { + "epoch": 0.2228447045389666, + "grad_norm": 2.2555415630340576, + "learning_rate": 4.4119949759823965e-05, + "loss": 3.5768, + "step": 37470 + }, + { + "epoch": 0.2228506518222476, + "grad_norm": 1.4979069232940674, + "learning_rate": 4.411964881883965e-05, + "loss": 5.0065, + "step": 37471 + }, + { + "epoch": 0.2228565991055286, + "grad_norm": 1.27516508102417, + "learning_rate": 4.4119347871180865e-05, + "loss": 5.1013, + "step": 37472 + }, + { + "epoch": 0.2228625463888096, + "grad_norm": 1.738444209098816, + "learning_rate": 4.4119046916847715e-05, + "loss": 4.526, + "step": 37473 + }, + { + "epoch": 0.22286849367209058, + "grad_norm": 1.953614592552185, + "learning_rate": 4.4118745955840304e-05, + "loss": 5.1593, + "step": 37474 + }, + { + "epoch": 0.2228744409553716, + "grad_norm": 1.550534725189209, + "learning_rate": 4.411844498815873e-05, + "loss": 5.1234, + "step": 37475 + }, + { + "epoch": 0.22288038823865258, + "grad_norm": 1.301795244216919, + "learning_rate": 4.411814401380311e-05, + "loss": 4.9212, + "step": 37476 + }, + { + "epoch": 0.22288633552193357, + "grad_norm": 1.4100189208984375, + "learning_rate": 4.4117843032773545e-05, + "loss": 4.8568, + "step": 37477 + }, + { + "epoch": 0.2228922828052146, + "grad_norm": 1.6080713272094727, + "learning_rate": 4.4117542045070136e-05, + "loss": 4.8908, + "step": 37478 + }, + { + "epoch": 0.22289823008849557, + "grad_norm": 1.619407296180725, + "learning_rate": 4.411724105069299e-05, + "loss": 5.0473, + "step": 37479 + }, + { + "epoch": 0.22290417737177656, + "grad_norm": 2.0852749347686768, + "learning_rate": 4.411694004964221e-05, + "loss": 4.4932, + "step": 37480 + }, + { + "epoch": 0.22291012465505758, + "grad_norm": 1.6893035173416138, + "learning_rate": 4.411663904191791e-05, + "loss": 4.5006, + "step": 37481 + }, + { + "epoch": 0.22291607193833857, + "grad_norm": 1.794718861579895, + "learning_rate": 4.411633802752019e-05, + "loss": 4.4382, + "step": 37482 + }, + { + "epoch": 0.22292201922161956, + "grad_norm": 1.9049642086029053, + "learning_rate": 4.411603700644914e-05, + "loss": 4.2267, + "step": 37483 + }, + { + "epoch": 0.22292796650490057, + "grad_norm": 1.7459529638290405, + "learning_rate": 4.4115735978704894e-05, + "loss": 4.3071, + "step": 37484 + }, + { + "epoch": 0.22293391378818156, + "grad_norm": 2.4059667587280273, + "learning_rate": 4.4115434944287536e-05, + "loss": 3.3393, + "step": 37485 + }, + { + "epoch": 0.22293986107146255, + "grad_norm": 1.8413442373275757, + "learning_rate": 4.411513390319718e-05, + "loss": 4.3703, + "step": 37486 + }, + { + "epoch": 0.22294580835474356, + "grad_norm": 1.9206432104110718, + "learning_rate": 4.4114832855433916e-05, + "loss": 4.3611, + "step": 37487 + }, + { + "epoch": 0.22295175563802455, + "grad_norm": 1.8674482107162476, + "learning_rate": 4.4114531800997876e-05, + "loss": 3.9913, + "step": 37488 + }, + { + "epoch": 0.22295770292130554, + "grad_norm": 1.7336639165878296, + "learning_rate": 4.411423073988915e-05, + "loss": 4.26, + "step": 37489 + }, + { + "epoch": 0.22296365020458656, + "grad_norm": 2.2799072265625, + "learning_rate": 4.4113929672107834e-05, + "loss": 3.5076, + "step": 37490 + }, + { + "epoch": 0.22296959748786754, + "grad_norm": 1.8522865772247314, + "learning_rate": 4.411362859765405e-05, + "loss": 3.9865, + "step": 37491 + }, + { + "epoch": 0.22297554477114853, + "grad_norm": 2.023106098175049, + "learning_rate": 4.411332751652789e-05, + "loss": 4.3329, + "step": 37492 + }, + { + "epoch": 0.22298149205442955, + "grad_norm": 2.047266721725464, + "learning_rate": 4.4113026428729474e-05, + "loss": 3.833, + "step": 37493 + }, + { + "epoch": 0.22298743933771054, + "grad_norm": 3.233015775680542, + "learning_rate": 4.411272533425889e-05, + "loss": 2.4502, + "step": 37494 + }, + { + "epoch": 0.22299338662099152, + "grad_norm": 2.6674745082855225, + "learning_rate": 4.4112424233116254e-05, + "loss": 2.4631, + "step": 37495 + }, + { + "epoch": 0.22299933390427254, + "grad_norm": 2.8339457511901855, + "learning_rate": 4.411212312530167e-05, + "loss": 2.3195, + "step": 37496 + }, + { + "epoch": 0.22300528118755353, + "grad_norm": 2.9914252758026123, + "learning_rate": 4.411182201081524e-05, + "loss": 1.8622, + "step": 37497 + }, + { + "epoch": 0.22301122847083452, + "grad_norm": 2.7792508602142334, + "learning_rate": 4.411152088965706e-05, + "loss": 2.671, + "step": 37498 + }, + { + "epoch": 0.22301717575411553, + "grad_norm": 3.0347492694854736, + "learning_rate": 4.411121976182726e-05, + "loss": 1.9535, + "step": 37499 + }, + { + "epoch": 0.22302312303739652, + "grad_norm": 4.229783535003662, + "learning_rate": 4.4110918627325924e-05, + "loss": 2.3643, + "step": 37500 + }, + { + "epoch": 0.2230290703206775, + "grad_norm": 4.008993625640869, + "learning_rate": 4.411061748615317e-05, + "loss": 2.4496, + "step": 37501 + }, + { + "epoch": 0.22303501760395852, + "grad_norm": 2.544724702835083, + "learning_rate": 4.4110316338309086e-05, + "loss": 3.348, + "step": 37502 + }, + { + "epoch": 0.2230409648872395, + "grad_norm": 2.404447078704834, + "learning_rate": 4.4110015183793794e-05, + "loss": 3.4087, + "step": 37503 + }, + { + "epoch": 0.2230469121705205, + "grad_norm": 1.6754000186920166, + "learning_rate": 4.41097140226074e-05, + "loss": 5.1169, + "step": 37504 + }, + { + "epoch": 0.22305285945380152, + "grad_norm": 1.438940167427063, + "learning_rate": 4.4109412854749994e-05, + "loss": 5.1149, + "step": 37505 + }, + { + "epoch": 0.2230588067370825, + "grad_norm": 1.4823182821273804, + "learning_rate": 4.4109111680221685e-05, + "loss": 5.3555, + "step": 37506 + }, + { + "epoch": 0.2230647540203635, + "grad_norm": 1.63999342918396, + "learning_rate": 4.410881049902259e-05, + "loss": 5.1622, + "step": 37507 + }, + { + "epoch": 0.2230707013036445, + "grad_norm": 1.972383737564087, + "learning_rate": 4.410850931115281e-05, + "loss": 5.1285, + "step": 37508 + }, + { + "epoch": 0.2230766485869255, + "grad_norm": 1.6305196285247803, + "learning_rate": 4.4108208116612436e-05, + "loss": 5.0795, + "step": 37509 + }, + { + "epoch": 0.22308259587020648, + "grad_norm": 2.873812675476074, + "learning_rate": 4.41079069154016e-05, + "loss": 2.7287, + "step": 37510 + }, + { + "epoch": 0.2230885431534875, + "grad_norm": 2.0002241134643555, + "learning_rate": 4.410760570752037e-05, + "loss": 4.9091, + "step": 37511 + }, + { + "epoch": 0.2230944904367685, + "grad_norm": 1.6034547090530396, + "learning_rate": 4.410730449296889e-05, + "loss": 4.7845, + "step": 37512 + }, + { + "epoch": 0.22310043772004948, + "grad_norm": 1.55063796043396, + "learning_rate": 4.4107003271747236e-05, + "loss": 5.4072, + "step": 37513 + }, + { + "epoch": 0.22310638500333047, + "grad_norm": 1.4179991483688354, + "learning_rate": 4.410670204385553e-05, + "loss": 4.5219, + "step": 37514 + }, + { + "epoch": 0.22311233228661148, + "grad_norm": 1.62294340133667, + "learning_rate": 4.410640080929388e-05, + "loss": 4.7374, + "step": 37515 + }, + { + "epoch": 0.22311827956989247, + "grad_norm": 1.6114813089370728, + "learning_rate": 4.4106099568062367e-05, + "loss": 4.7302, + "step": 37516 + }, + { + "epoch": 0.22312422685317346, + "grad_norm": 1.6104267835617065, + "learning_rate": 4.4105798320161115e-05, + "loss": 4.7967, + "step": 37517 + }, + { + "epoch": 0.22313017413645447, + "grad_norm": 1.6183431148529053, + "learning_rate": 4.410549706559023e-05, + "loss": 4.5677, + "step": 37518 + }, + { + "epoch": 0.22313612141973546, + "grad_norm": 1.3311508893966675, + "learning_rate": 4.410519580434982e-05, + "loss": 4.5629, + "step": 37519 + }, + { + "epoch": 0.22314206870301645, + "grad_norm": 1.5924433469772339, + "learning_rate": 4.4104894536439974e-05, + "loss": 4.4574, + "step": 37520 + }, + { + "epoch": 0.22314801598629747, + "grad_norm": 2.075273275375366, + "learning_rate": 4.410459326186081e-05, + "loss": 3.9978, + "step": 37521 + }, + { + "epoch": 0.22315396326957845, + "grad_norm": 1.531011700630188, + "learning_rate": 4.410429198061243e-05, + "loss": 4.3236, + "step": 37522 + }, + { + "epoch": 0.22315991055285944, + "grad_norm": 1.6738545894622803, + "learning_rate": 4.410399069269494e-05, + "loss": 4.0581, + "step": 37523 + }, + { + "epoch": 0.22316585783614046, + "grad_norm": 1.5245554447174072, + "learning_rate": 4.410368939810844e-05, + "loss": 3.9841, + "step": 37524 + }, + { + "epoch": 0.22317180511942145, + "grad_norm": 1.6631666421890259, + "learning_rate": 4.4103388096853036e-05, + "loss": 4.336, + "step": 37525 + }, + { + "epoch": 0.22317775240270243, + "grad_norm": 1.869472622871399, + "learning_rate": 4.410308678892885e-05, + "loss": 4.8101, + "step": 37526 + }, + { + "epoch": 0.22318369968598345, + "grad_norm": 1.6422040462493896, + "learning_rate": 4.410278547433596e-05, + "loss": 4.8477, + "step": 37527 + }, + { + "epoch": 0.22318964696926444, + "grad_norm": 1.7507972717285156, + "learning_rate": 4.4102484153074496e-05, + "loss": 4.8798, + "step": 37528 + }, + { + "epoch": 0.22319559425254543, + "grad_norm": 1.578799843788147, + "learning_rate": 4.410218282514454e-05, + "loss": 4.6632, + "step": 37529 + }, + { + "epoch": 0.22320154153582644, + "grad_norm": 1.505683422088623, + "learning_rate": 4.410188149054623e-05, + "loss": 4.9293, + "step": 37530 + }, + { + "epoch": 0.22320748881910743, + "grad_norm": 2.2628307342529297, + "learning_rate": 4.410158014927963e-05, + "loss": 4.4323, + "step": 37531 + }, + { + "epoch": 0.22321343610238842, + "grad_norm": 1.7342091798782349, + "learning_rate": 4.4101278801344875e-05, + "loss": 3.8814, + "step": 37532 + }, + { + "epoch": 0.22321938338566943, + "grad_norm": 1.491938829421997, + "learning_rate": 4.4100977446742057e-05, + "loss": 5.0043, + "step": 37533 + }, + { + "epoch": 0.22322533066895042, + "grad_norm": 1.63412606716156, + "learning_rate": 4.4100676085471286e-05, + "loss": 4.9129, + "step": 37534 + }, + { + "epoch": 0.2232312779522314, + "grad_norm": 2.06516695022583, + "learning_rate": 4.4100374717532666e-05, + "loss": 4.1505, + "step": 37535 + }, + { + "epoch": 0.22323722523551243, + "grad_norm": 1.8857531547546387, + "learning_rate": 4.4100073342926304e-05, + "loss": 3.4167, + "step": 37536 + }, + { + "epoch": 0.22324317251879341, + "grad_norm": 1.8068853616714478, + "learning_rate": 4.409977196165231e-05, + "loss": 4.5034, + "step": 37537 + }, + { + "epoch": 0.2232491198020744, + "grad_norm": 1.6763041019439697, + "learning_rate": 4.4099470573710775e-05, + "loss": 4.7625, + "step": 37538 + }, + { + "epoch": 0.22325506708535542, + "grad_norm": 1.791761040687561, + "learning_rate": 4.409916917910181e-05, + "loss": 4.538, + "step": 37539 + }, + { + "epoch": 0.2232610143686364, + "grad_norm": 1.8424322605133057, + "learning_rate": 4.4098867777825526e-05, + "loss": 3.8683, + "step": 37540 + }, + { + "epoch": 0.2232669616519174, + "grad_norm": 2.100870370864868, + "learning_rate": 4.409856636988203e-05, + "loss": 4.2261, + "step": 37541 + }, + { + "epoch": 0.2232729089351984, + "grad_norm": 1.9232033491134644, + "learning_rate": 4.409826495527142e-05, + "loss": 4.8151, + "step": 37542 + }, + { + "epoch": 0.2232788562184794, + "grad_norm": 1.5741419792175293, + "learning_rate": 4.409796353399379e-05, + "loss": 4.9854, + "step": 37543 + }, + { + "epoch": 0.2232848035017604, + "grad_norm": 3.201462507247925, + "learning_rate": 4.4097662106049276e-05, + "loss": 2.4049, + "step": 37544 + }, + { + "epoch": 0.2232907507850414, + "grad_norm": 1.882828950881958, + "learning_rate": 4.4097360671437955e-05, + "loss": 3.4542, + "step": 37545 + }, + { + "epoch": 0.2232966980683224, + "grad_norm": 2.2759206295013428, + "learning_rate": 4.409705923015994e-05, + "loss": 3.3867, + "step": 37546 + }, + { + "epoch": 0.22330264535160338, + "grad_norm": 1.891266107559204, + "learning_rate": 4.409675778221535e-05, + "loss": 3.7851, + "step": 37547 + }, + { + "epoch": 0.2233085926348844, + "grad_norm": 1.5764224529266357, + "learning_rate": 4.409645632760427e-05, + "loss": 4.6758, + "step": 37548 + }, + { + "epoch": 0.22331453991816538, + "grad_norm": 1.5599266290664673, + "learning_rate": 4.409615486632681e-05, + "loss": 4.9474, + "step": 37549 + }, + { + "epoch": 0.22332048720144637, + "grad_norm": 1.725459098815918, + "learning_rate": 4.409585339838309e-05, + "loss": 5.1261, + "step": 37550 + }, + { + "epoch": 0.2233264344847274, + "grad_norm": 1.3846008777618408, + "learning_rate": 4.40955519237732e-05, + "loss": 5.7283, + "step": 37551 + }, + { + "epoch": 0.22333238176800838, + "grad_norm": 1.507738471031189, + "learning_rate": 4.409525044249726e-05, + "loss": 5.6287, + "step": 37552 + }, + { + "epoch": 0.22333832905128936, + "grad_norm": 1.7530328035354614, + "learning_rate": 4.409494895455535e-05, + "loss": 4.9708, + "step": 37553 + }, + { + "epoch": 0.22334427633457038, + "grad_norm": 1.8431001901626587, + "learning_rate": 4.40946474599476e-05, + "loss": 5.145, + "step": 37554 + }, + { + "epoch": 0.22335022361785137, + "grad_norm": 1.5956116914749146, + "learning_rate": 4.409434595867411e-05, + "loss": 5.0481, + "step": 37555 + }, + { + "epoch": 0.22335617090113236, + "grad_norm": 1.5909093618392944, + "learning_rate": 4.409404445073497e-05, + "loss": 4.9317, + "step": 37556 + }, + { + "epoch": 0.22336211818441337, + "grad_norm": 1.5663658380508423, + "learning_rate": 4.40937429361303e-05, + "loss": 4.6503, + "step": 37557 + }, + { + "epoch": 0.22336806546769436, + "grad_norm": 1.413888692855835, + "learning_rate": 4.40934414148602e-05, + "loss": 4.9422, + "step": 37558 + }, + { + "epoch": 0.22337401275097535, + "grad_norm": 1.8675824403762817, + "learning_rate": 4.4093139886924784e-05, + "loss": 5.2428, + "step": 37559 + }, + { + "epoch": 0.22337996003425636, + "grad_norm": 1.8924356698989868, + "learning_rate": 4.409283835232415e-05, + "loss": 4.7687, + "step": 37560 + }, + { + "epoch": 0.22338590731753735, + "grad_norm": 1.582779049873352, + "learning_rate": 4.409253681105839e-05, + "loss": 4.9511, + "step": 37561 + }, + { + "epoch": 0.22339185460081834, + "grad_norm": 1.267486572265625, + "learning_rate": 4.4092235263127634e-05, + "loss": 5.1539, + "step": 37562 + }, + { + "epoch": 0.22339780188409936, + "grad_norm": 1.5250635147094727, + "learning_rate": 4.4091933708531975e-05, + "loss": 5.1331, + "step": 37563 + }, + { + "epoch": 0.22340374916738034, + "grad_norm": 1.4498111009597778, + "learning_rate": 4.409163214727152e-05, + "loss": 5.0517, + "step": 37564 + }, + { + "epoch": 0.22340969645066133, + "grad_norm": 1.1852492094039917, + "learning_rate": 4.409133057934637e-05, + "loss": 5.0065, + "step": 37565 + }, + { + "epoch": 0.22341564373394235, + "grad_norm": 1.3684885501861572, + "learning_rate": 4.409102900475663e-05, + "loss": 5.7824, + "step": 37566 + }, + { + "epoch": 0.22342159101722334, + "grad_norm": 1.485929250717163, + "learning_rate": 4.409072742350242e-05, + "loss": 5.1218, + "step": 37567 + }, + { + "epoch": 0.22342753830050432, + "grad_norm": 1.6653156280517578, + "learning_rate": 4.409042583558383e-05, + "loss": 5.1565, + "step": 37568 + }, + { + "epoch": 0.22343348558378534, + "grad_norm": 1.5432231426239014, + "learning_rate": 4.4090124241000964e-05, + "loss": 5.1212, + "step": 37569 + }, + { + "epoch": 0.22343943286706633, + "grad_norm": 1.2738758325576782, + "learning_rate": 4.408982263975394e-05, + "loss": 4.9139, + "step": 37570 + }, + { + "epoch": 0.22344538015034732, + "grad_norm": 1.5660128593444824, + "learning_rate": 4.408952103184285e-05, + "loss": 5.1038, + "step": 37571 + }, + { + "epoch": 0.2234513274336283, + "grad_norm": 1.8557360172271729, + "learning_rate": 4.408921941726781e-05, + "loss": 3.7227, + "step": 37572 + }, + { + "epoch": 0.22345727471690932, + "grad_norm": 1.9540084600448608, + "learning_rate": 4.408891779602892e-05, + "loss": 4.9764, + "step": 37573 + }, + { + "epoch": 0.2234632220001903, + "grad_norm": 2.0382845401763916, + "learning_rate": 4.408861616812628e-05, + "loss": 3.7373, + "step": 37574 + }, + { + "epoch": 0.2234691692834713, + "grad_norm": 1.766674518585205, + "learning_rate": 4.4088314533560014e-05, + "loss": 3.8062, + "step": 37575 + }, + { + "epoch": 0.2234751165667523, + "grad_norm": 1.572192668914795, + "learning_rate": 4.4088012892330204e-05, + "loss": 3.8451, + "step": 37576 + }, + { + "epoch": 0.2234810638500333, + "grad_norm": 1.5495448112487793, + "learning_rate": 4.408771124443697e-05, + "loss": 4.217, + "step": 37577 + }, + { + "epoch": 0.2234870111333143, + "grad_norm": 1.5044076442718506, + "learning_rate": 4.408740958988041e-05, + "loss": 4.4927, + "step": 37578 + }, + { + "epoch": 0.2234929584165953, + "grad_norm": 1.9189188480377197, + "learning_rate": 4.408710792866064e-05, + "loss": 4.3213, + "step": 37579 + }, + { + "epoch": 0.2234989056998763, + "grad_norm": 1.7820667028427124, + "learning_rate": 4.4086806260777744e-05, + "loss": 4.7006, + "step": 37580 + }, + { + "epoch": 0.22350485298315728, + "grad_norm": 1.445273518562317, + "learning_rate": 4.408650458623186e-05, + "loss": 5.3059, + "step": 37581 + }, + { + "epoch": 0.2235108002664383, + "grad_norm": 1.649367094039917, + "learning_rate": 4.408620290502306e-05, + "loss": 5.2157, + "step": 37582 + }, + { + "epoch": 0.22351674754971929, + "grad_norm": 1.475437879562378, + "learning_rate": 4.408590121715147e-05, + "loss": 5.1782, + "step": 37583 + }, + { + "epoch": 0.22352269483300027, + "grad_norm": 1.7894258499145508, + "learning_rate": 4.408559952261718e-05, + "loss": 4.3475, + "step": 37584 + }, + { + "epoch": 0.2235286421162813, + "grad_norm": 1.75069260597229, + "learning_rate": 4.408529782142031e-05, + "loss": 4.5146, + "step": 37585 + }, + { + "epoch": 0.22353458939956228, + "grad_norm": 1.7912527322769165, + "learning_rate": 4.4084996113560967e-05, + "loss": 5.0784, + "step": 37586 + }, + { + "epoch": 0.22354053668284327, + "grad_norm": 1.85751211643219, + "learning_rate": 4.408469439903924e-05, + "loss": 4.4988, + "step": 37587 + }, + { + "epoch": 0.22354648396612428, + "grad_norm": 1.9540380239486694, + "learning_rate": 4.4084392677855245e-05, + "loss": 4.9008, + "step": 37588 + }, + { + "epoch": 0.22355243124940527, + "grad_norm": 1.750579595565796, + "learning_rate": 4.4084090950009094e-05, + "loss": 4.7877, + "step": 37589 + }, + { + "epoch": 0.22355837853268626, + "grad_norm": 1.8129644393920898, + "learning_rate": 4.408378921550088e-05, + "loss": 4.4876, + "step": 37590 + }, + { + "epoch": 0.22356432581596727, + "grad_norm": 1.6545523405075073, + "learning_rate": 4.40834874743307e-05, + "loss": 4.9386, + "step": 37591 + }, + { + "epoch": 0.22357027309924826, + "grad_norm": 1.6962411403656006, + "learning_rate": 4.4083185726498686e-05, + "loss": 4.8634, + "step": 37592 + }, + { + "epoch": 0.22357622038252925, + "grad_norm": 1.7945470809936523, + "learning_rate": 4.408288397200492e-05, + "loss": 4.8463, + "step": 37593 + }, + { + "epoch": 0.22358216766581027, + "grad_norm": 2.5073330402374268, + "learning_rate": 4.408258221084952e-05, + "loss": 3.1183, + "step": 37594 + }, + { + "epoch": 0.22358811494909125, + "grad_norm": 1.7957638502120972, + "learning_rate": 4.408228044303259e-05, + "loss": 4.1243, + "step": 37595 + }, + { + "epoch": 0.22359406223237224, + "grad_norm": 1.9337985515594482, + "learning_rate": 4.408197866855424e-05, + "loss": 4.6066, + "step": 37596 + }, + { + "epoch": 0.22360000951565326, + "grad_norm": 1.7588727474212646, + "learning_rate": 4.4081676887414555e-05, + "loss": 4.7197, + "step": 37597 + }, + { + "epoch": 0.22360595679893425, + "grad_norm": 1.607006311416626, + "learning_rate": 4.4081375099613656e-05, + "loss": 5.0925, + "step": 37598 + }, + { + "epoch": 0.22361190408221523, + "grad_norm": 1.4255245923995972, + "learning_rate": 4.408107330515165e-05, + "loss": 5.1365, + "step": 37599 + }, + { + "epoch": 0.22361785136549625, + "grad_norm": 1.2731602191925049, + "learning_rate": 4.4080771504028636e-05, + "loss": 5.3427, + "step": 37600 + }, + { + "epoch": 0.22362379864877724, + "grad_norm": 1.595390796661377, + "learning_rate": 4.408046969624472e-05, + "loss": 4.7735, + "step": 37601 + }, + { + "epoch": 0.22362974593205823, + "grad_norm": 1.7148345708847046, + "learning_rate": 4.408016788180002e-05, + "loss": 4.4927, + "step": 37602 + }, + { + "epoch": 0.22363569321533924, + "grad_norm": 1.810319423675537, + "learning_rate": 4.407986606069462e-05, + "loss": 4.799, + "step": 37603 + }, + { + "epoch": 0.22364164049862023, + "grad_norm": 2.190213680267334, + "learning_rate": 4.407956423292863e-05, + "loss": 4.0821, + "step": 37604 + }, + { + "epoch": 0.22364758778190122, + "grad_norm": 1.6705280542373657, + "learning_rate": 4.4079262398502174e-05, + "loss": 5.2685, + "step": 37605 + }, + { + "epoch": 0.22365353506518224, + "grad_norm": 1.8062423467636108, + "learning_rate": 4.407896055741534e-05, + "loss": 4.3002, + "step": 37606 + }, + { + "epoch": 0.22365948234846322, + "grad_norm": 1.9707059860229492, + "learning_rate": 4.407865870966824e-05, + "loss": 5.299, + "step": 37607 + }, + { + "epoch": 0.2236654296317442, + "grad_norm": 1.9427824020385742, + "learning_rate": 4.407835685526097e-05, + "loss": 4.2061, + "step": 37608 + }, + { + "epoch": 0.22367137691502523, + "grad_norm": 2.1432387828826904, + "learning_rate": 4.4078054994193654e-05, + "loss": 3.4683, + "step": 37609 + }, + { + "epoch": 0.22367732419830622, + "grad_norm": 2.56776762008667, + "learning_rate": 4.4077753126466374e-05, + "loss": 4.3432, + "step": 37610 + }, + { + "epoch": 0.2236832714815872, + "grad_norm": 2.1716179847717285, + "learning_rate": 4.407745125207926e-05, + "loss": 3.9791, + "step": 37611 + }, + { + "epoch": 0.22368921876486822, + "grad_norm": 1.9081957340240479, + "learning_rate": 4.4077149371032394e-05, + "loss": 4.2926, + "step": 37612 + }, + { + "epoch": 0.2236951660481492, + "grad_norm": 2.1505470275878906, + "learning_rate": 4.4076847483325903e-05, + "loss": 4.0981, + "step": 37613 + }, + { + "epoch": 0.2237011133314302, + "grad_norm": 2.2152764797210693, + "learning_rate": 4.407654558895987e-05, + "loss": 3.5981, + "step": 37614 + }, + { + "epoch": 0.2237070606147112, + "grad_norm": 2.2406694889068604, + "learning_rate": 4.407624368793442e-05, + "loss": 4.1125, + "step": 37615 + }, + { + "epoch": 0.2237130078979922, + "grad_norm": 1.7292073965072632, + "learning_rate": 4.4075941780249646e-05, + "loss": 4.4267, + "step": 37616 + }, + { + "epoch": 0.2237189551812732, + "grad_norm": 2.047788381576538, + "learning_rate": 4.407563986590566e-05, + "loss": 4.4591, + "step": 37617 + }, + { + "epoch": 0.2237249024645542, + "grad_norm": 2.3711559772491455, + "learning_rate": 4.407533794490256e-05, + "loss": 3.2147, + "step": 37618 + }, + { + "epoch": 0.2237308497478352, + "grad_norm": 2.232598304748535, + "learning_rate": 4.407503601724047e-05, + "loss": 3.1629, + "step": 37619 + }, + { + "epoch": 0.22373679703111618, + "grad_norm": 2.0596656799316406, + "learning_rate": 4.407473408291946e-05, + "loss": 3.2901, + "step": 37620 + }, + { + "epoch": 0.2237427443143972, + "grad_norm": 1.884080171585083, + "learning_rate": 4.407443214193968e-05, + "loss": 3.4791, + "step": 37621 + }, + { + "epoch": 0.22374869159767818, + "grad_norm": 1.9116895198822021, + "learning_rate": 4.40741301943012e-05, + "loss": 3.4078, + "step": 37622 + }, + { + "epoch": 0.22375463888095917, + "grad_norm": 1.9203144311904907, + "learning_rate": 4.4073828240004144e-05, + "loss": 3.2653, + "step": 37623 + }, + { + "epoch": 0.2237605861642402, + "grad_norm": 1.8317451477050781, + "learning_rate": 4.4073526279048616e-05, + "loss": 3.2768, + "step": 37624 + }, + { + "epoch": 0.22376653344752118, + "grad_norm": 1.9589619636535645, + "learning_rate": 4.4073224311434705e-05, + "loss": 3.3164, + "step": 37625 + }, + { + "epoch": 0.22377248073080216, + "grad_norm": 1.97921621799469, + "learning_rate": 4.407292233716254e-05, + "loss": 3.2702, + "step": 37626 + }, + { + "epoch": 0.22377842801408318, + "grad_norm": 1.758956789970398, + "learning_rate": 4.4072620356232205e-05, + "loss": 3.3953, + "step": 37627 + }, + { + "epoch": 0.22378437529736417, + "grad_norm": 1.907297968864441, + "learning_rate": 4.407231836864382e-05, + "loss": 3.2948, + "step": 37628 + }, + { + "epoch": 0.22379032258064516, + "grad_norm": 1.9227485656738281, + "learning_rate": 4.4072016374397485e-05, + "loss": 3.2674, + "step": 37629 + }, + { + "epoch": 0.22379626986392614, + "grad_norm": 2.019357442855835, + "learning_rate": 4.40717143734933e-05, + "loss": 3.349, + "step": 37630 + }, + { + "epoch": 0.22380221714720716, + "grad_norm": 2.0891315937042236, + "learning_rate": 4.4071412365931385e-05, + "loss": 3.1821, + "step": 37631 + }, + { + "epoch": 0.22380816443048815, + "grad_norm": 2.0140082836151123, + "learning_rate": 4.407111035171184e-05, + "loss": 3.3252, + "step": 37632 + }, + { + "epoch": 0.22381411171376914, + "grad_norm": 1.9410052299499512, + "learning_rate": 4.407080833083476e-05, + "loss": 3.3448, + "step": 37633 + }, + { + "epoch": 0.22382005899705015, + "grad_norm": 1.7290452718734741, + "learning_rate": 4.4070506303300266e-05, + "loss": 4.56, + "step": 37634 + }, + { + "epoch": 0.22382600628033114, + "grad_norm": 1.9766449928283691, + "learning_rate": 4.407020426910844e-05, + "loss": 3.263, + "step": 37635 + }, + { + "epoch": 0.22383195356361213, + "grad_norm": 1.8865668773651123, + "learning_rate": 4.4069902228259416e-05, + "loss": 3.4893, + "step": 37636 + }, + { + "epoch": 0.22383790084689315, + "grad_norm": 1.8951842784881592, + "learning_rate": 4.4069600180753276e-05, + "loss": 3.3569, + "step": 37637 + }, + { + "epoch": 0.22384384813017413, + "grad_norm": 1.8201243877410889, + "learning_rate": 4.406929812659014e-05, + "loss": 3.1592, + "step": 37638 + }, + { + "epoch": 0.22384979541345512, + "grad_norm": 1.917259931564331, + "learning_rate": 4.406899606577012e-05, + "loss": 3.2789, + "step": 37639 + }, + { + "epoch": 0.22385574269673614, + "grad_norm": 1.907003402709961, + "learning_rate": 4.40686939982933e-05, + "loss": 3.4995, + "step": 37640 + }, + { + "epoch": 0.22386168998001713, + "grad_norm": 1.937944769859314, + "learning_rate": 4.406839192415979e-05, + "loss": 3.8192, + "step": 37641 + }, + { + "epoch": 0.2238676372632981, + "grad_norm": 2.027071237564087, + "learning_rate": 4.406808984336971e-05, + "loss": 4.5019, + "step": 37642 + }, + { + "epoch": 0.22387358454657913, + "grad_norm": 2.0646047592163086, + "learning_rate": 4.406778775592316e-05, + "loss": 4.6154, + "step": 37643 + }, + { + "epoch": 0.22387953182986012, + "grad_norm": 2.3060433864593506, + "learning_rate": 4.406748566182023e-05, + "loss": 4.4522, + "step": 37644 + }, + { + "epoch": 0.2238854791131411, + "grad_norm": 2.676363706588745, + "learning_rate": 4.406718356106105e-05, + "loss": 4.5443, + "step": 37645 + }, + { + "epoch": 0.22389142639642212, + "grad_norm": 2.4247331619262695, + "learning_rate": 4.406688145364571e-05, + "loss": 4.4311, + "step": 37646 + }, + { + "epoch": 0.2238973736797031, + "grad_norm": 2.202650785446167, + "learning_rate": 4.406657933957431e-05, + "loss": 4.1083, + "step": 37647 + }, + { + "epoch": 0.2239033209629841, + "grad_norm": 2.003657579421997, + "learning_rate": 4.406627721884697e-05, + "loss": 4.5928, + "step": 37648 + }, + { + "epoch": 0.22390926824626511, + "grad_norm": 1.8342602252960205, + "learning_rate": 4.4065975091463796e-05, + "loss": 4.4013, + "step": 37649 + }, + { + "epoch": 0.2239152155295461, + "grad_norm": 2.3087284564971924, + "learning_rate": 4.406567295742488e-05, + "loss": 3.9403, + "step": 37650 + }, + { + "epoch": 0.2239211628128271, + "grad_norm": 2.0790791511535645, + "learning_rate": 4.406537081673034e-05, + "loss": 4.5265, + "step": 37651 + }, + { + "epoch": 0.2239271100961081, + "grad_norm": 1.8618361949920654, + "learning_rate": 4.4065068669380274e-05, + "loss": 4.0404, + "step": 37652 + }, + { + "epoch": 0.2239330573793891, + "grad_norm": 2.4421863555908203, + "learning_rate": 4.406476651537478e-05, + "loss": 4.1454, + "step": 37653 + }, + { + "epoch": 0.22393900466267008, + "grad_norm": 2.2863211631774902, + "learning_rate": 4.4064464354713986e-05, + "loss": 4.0519, + "step": 37654 + }, + { + "epoch": 0.2239449519459511, + "grad_norm": 2.191511392593384, + "learning_rate": 4.406416218739798e-05, + "loss": 4.0931, + "step": 37655 + }, + { + "epoch": 0.2239508992292321, + "grad_norm": 2.0519556999206543, + "learning_rate": 4.406386001342687e-05, + "loss": 4.5592, + "step": 37656 + }, + { + "epoch": 0.22395684651251307, + "grad_norm": 2.1067867279052734, + "learning_rate": 4.406355783280076e-05, + "loss": 3.9904, + "step": 37657 + }, + { + "epoch": 0.2239627937957941, + "grad_norm": 2.265929937362671, + "learning_rate": 4.406325564551977e-05, + "loss": 4.0379, + "step": 37658 + }, + { + "epoch": 0.22396874107907508, + "grad_norm": 2.3162310123443604, + "learning_rate": 4.406295345158399e-05, + "loss": 4.1419, + "step": 37659 + }, + { + "epoch": 0.22397468836235607, + "grad_norm": 2.2604496479034424, + "learning_rate": 4.406265125099353e-05, + "loss": 4.0614, + "step": 37660 + }, + { + "epoch": 0.22398063564563708, + "grad_norm": 2.4090497493743896, + "learning_rate": 4.406234904374849e-05, + "loss": 4.0914, + "step": 37661 + }, + { + "epoch": 0.22398658292891807, + "grad_norm": 2.472126007080078, + "learning_rate": 4.406204682984898e-05, + "loss": 3.7101, + "step": 37662 + }, + { + "epoch": 0.22399253021219906, + "grad_norm": 2.605400323867798, + "learning_rate": 4.406174460929511e-05, + "loss": 4.0506, + "step": 37663 + }, + { + "epoch": 0.22399847749548008, + "grad_norm": 2.2989494800567627, + "learning_rate": 4.406144238208698e-05, + "loss": 4.1113, + "step": 37664 + }, + { + "epoch": 0.22400442477876106, + "grad_norm": 2.052351713180542, + "learning_rate": 4.406114014822471e-05, + "loss": 4.2539, + "step": 37665 + }, + { + "epoch": 0.22401037206204205, + "grad_norm": 2.4834420680999756, + "learning_rate": 4.4060837907708375e-05, + "loss": 4.5214, + "step": 37666 + }, + { + "epoch": 0.22401631934532307, + "grad_norm": 2.3077211380004883, + "learning_rate": 4.406053566053811e-05, + "loss": 4.3625, + "step": 37667 + }, + { + "epoch": 0.22402226662860406, + "grad_norm": 2.109318971633911, + "learning_rate": 4.4060233406714e-05, + "loss": 3.9841, + "step": 37668 + }, + { + "epoch": 0.22402821391188504, + "grad_norm": 2.35363507270813, + "learning_rate": 4.4059931146236165e-05, + "loss": 4.0543, + "step": 37669 + }, + { + "epoch": 0.22403416119516606, + "grad_norm": 2.142488718032837, + "learning_rate": 4.405962887910471e-05, + "loss": 4.4115, + "step": 37670 + }, + { + "epoch": 0.22404010847844705, + "grad_norm": 1.740767240524292, + "learning_rate": 4.405932660531973e-05, + "loss": 4.2541, + "step": 37671 + }, + { + "epoch": 0.22404605576172804, + "grad_norm": 2.355954885482788, + "learning_rate": 4.4059024324881335e-05, + "loss": 4.145, + "step": 37672 + }, + { + "epoch": 0.22405200304500905, + "grad_norm": 2.334618091583252, + "learning_rate": 4.4058722037789635e-05, + "loss": 3.9459, + "step": 37673 + }, + { + "epoch": 0.22405795032829004, + "grad_norm": 2.15889048576355, + "learning_rate": 4.405841974404473e-05, + "loss": 3.9556, + "step": 37674 + }, + { + "epoch": 0.22406389761157103, + "grad_norm": 2.0851681232452393, + "learning_rate": 4.4058117443646724e-05, + "loss": 3.7941, + "step": 37675 + }, + { + "epoch": 0.22406984489485204, + "grad_norm": 2.1203784942626953, + "learning_rate": 4.405781513659572e-05, + "loss": 3.4324, + "step": 37676 + }, + { + "epoch": 0.22407579217813303, + "grad_norm": 2.550434112548828, + "learning_rate": 4.405751282289185e-05, + "loss": 4.2083, + "step": 37677 + }, + { + "epoch": 0.22408173946141402, + "grad_norm": 2.1875874996185303, + "learning_rate": 4.4057210502535184e-05, + "loss": 4.2803, + "step": 37678 + }, + { + "epoch": 0.22408768674469504, + "grad_norm": 1.5319013595581055, + "learning_rate": 4.4056908175525844e-05, + "loss": 5.1222, + "step": 37679 + }, + { + "epoch": 0.22409363402797602, + "grad_norm": 1.5734094381332397, + "learning_rate": 4.405660584186394e-05, + "loss": 4.7175, + "step": 37680 + }, + { + "epoch": 0.224099581311257, + "grad_norm": 2.2757771015167236, + "learning_rate": 4.405630350154957e-05, + "loss": 4.0468, + "step": 37681 + }, + { + "epoch": 0.22410552859453803, + "grad_norm": 1.579903483390808, + "learning_rate": 4.405600115458284e-05, + "loss": 5.1165, + "step": 37682 + }, + { + "epoch": 0.22411147587781902, + "grad_norm": 1.4931329488754272, + "learning_rate": 4.4055698800963855e-05, + "loss": 5.1076, + "step": 37683 + }, + { + "epoch": 0.2241174231611, + "grad_norm": 1.4989633560180664, + "learning_rate": 4.4055396440692724e-05, + "loss": 5.1507, + "step": 37684 + }, + { + "epoch": 0.22412337044438102, + "grad_norm": 1.249399185180664, + "learning_rate": 4.405509407376955e-05, + "loss": 5.0259, + "step": 37685 + }, + { + "epoch": 0.224129317727662, + "grad_norm": 1.4521582126617432, + "learning_rate": 4.4054791700194445e-05, + "loss": 4.5992, + "step": 37686 + }, + { + "epoch": 0.224135265010943, + "grad_norm": 1.8319655656814575, + "learning_rate": 4.405448931996751e-05, + "loss": 4.1182, + "step": 37687 + }, + { + "epoch": 0.22414121229422398, + "grad_norm": 1.6663216352462769, + "learning_rate": 4.4054186933088836e-05, + "loss": 4.8555, + "step": 37688 + }, + { + "epoch": 0.224147159577505, + "grad_norm": 1.56107759475708, + "learning_rate": 4.4053884539558556e-05, + "loss": 5.1188, + "step": 37689 + }, + { + "epoch": 0.224153106860786, + "grad_norm": 1.94844388961792, + "learning_rate": 4.4053582139376756e-05, + "loss": 5.0103, + "step": 37690 + }, + { + "epoch": 0.22415905414406698, + "grad_norm": 2.3126590251922607, + "learning_rate": 4.4053279732543546e-05, + "loss": 5.3361, + "step": 37691 + }, + { + "epoch": 0.224165001427348, + "grad_norm": 2.051386833190918, + "learning_rate": 4.405297731905903e-05, + "loss": 4.8889, + "step": 37692 + }, + { + "epoch": 0.22417094871062898, + "grad_norm": 2.7010273933410645, + "learning_rate": 4.405267489892333e-05, + "loss": 3.7776, + "step": 37693 + }, + { + "epoch": 0.22417689599390997, + "grad_norm": 2.7000935077667236, + "learning_rate": 4.4052372472136526e-05, + "loss": 3.6693, + "step": 37694 + }, + { + "epoch": 0.22418284327719099, + "grad_norm": 2.795950174331665, + "learning_rate": 4.405207003869874e-05, + "loss": 4.1973, + "step": 37695 + }, + { + "epoch": 0.22418879056047197, + "grad_norm": 1.8645349740982056, + "learning_rate": 4.405176759861007e-05, + "loss": 4.5066, + "step": 37696 + }, + { + "epoch": 0.22419473784375296, + "grad_norm": 1.8882391452789307, + "learning_rate": 4.405146515187063e-05, + "loss": 4.4669, + "step": 37697 + }, + { + "epoch": 0.22420068512703398, + "grad_norm": 1.7081935405731201, + "learning_rate": 4.405116269848051e-05, + "loss": 4.4768, + "step": 37698 + }, + { + "epoch": 0.22420663241031497, + "grad_norm": 2.3097574710845947, + "learning_rate": 4.405086023843984e-05, + "loss": 4.5242, + "step": 37699 + }, + { + "epoch": 0.22421257969359595, + "grad_norm": 1.777329921722412, + "learning_rate": 4.40505577717487e-05, + "loss": 4.4065, + "step": 37700 + }, + { + "epoch": 0.22421852697687697, + "grad_norm": 1.9584839344024658, + "learning_rate": 4.405025529840721e-05, + "loss": 4.3524, + "step": 37701 + }, + { + "epoch": 0.22422447426015796, + "grad_norm": 2.2504661083221436, + "learning_rate": 4.4049952818415474e-05, + "loss": 4.4163, + "step": 37702 + }, + { + "epoch": 0.22423042154343895, + "grad_norm": 2.2781872749328613, + "learning_rate": 4.404965033177359e-05, + "loss": 4.8339, + "step": 37703 + }, + { + "epoch": 0.22423636882671996, + "grad_norm": 1.6593425273895264, + "learning_rate": 4.404934783848169e-05, + "loss": 5.1117, + "step": 37704 + }, + { + "epoch": 0.22424231611000095, + "grad_norm": 1.8437799215316772, + "learning_rate": 4.404904533853984e-05, + "loss": 4.5249, + "step": 37705 + }, + { + "epoch": 0.22424826339328194, + "grad_norm": 1.8311305046081543, + "learning_rate": 4.404874283194818e-05, + "loss": 4.3728, + "step": 37706 + }, + { + "epoch": 0.22425421067656295, + "grad_norm": 1.7205126285552979, + "learning_rate": 4.4048440318706784e-05, + "loss": 4.3997, + "step": 37707 + }, + { + "epoch": 0.22426015795984394, + "grad_norm": 1.6154394149780273, + "learning_rate": 4.4048137798815784e-05, + "loss": 4.7922, + "step": 37708 + }, + { + "epoch": 0.22426610524312493, + "grad_norm": 1.695842981338501, + "learning_rate": 4.4047835272275276e-05, + "loss": 4.3493, + "step": 37709 + }, + { + "epoch": 0.22427205252640595, + "grad_norm": 1.6647610664367676, + "learning_rate": 4.404753273908536e-05, + "loss": 4.216, + "step": 37710 + }, + { + "epoch": 0.22427799980968693, + "grad_norm": 1.771431803703308, + "learning_rate": 4.404723019924615e-05, + "loss": 3.4842, + "step": 37711 + }, + { + "epoch": 0.22428394709296792, + "grad_norm": 1.923303484916687, + "learning_rate": 4.4046927652757756e-05, + "loss": 3.6944, + "step": 37712 + }, + { + "epoch": 0.22428989437624894, + "grad_norm": 1.8749626874923706, + "learning_rate": 4.4046625099620264e-05, + "loss": 3.9616, + "step": 37713 + }, + { + "epoch": 0.22429584165952993, + "grad_norm": 2.2243165969848633, + "learning_rate": 4.4046322539833795e-05, + "loss": 3.1859, + "step": 37714 + }, + { + "epoch": 0.22430178894281091, + "grad_norm": 1.9024165868759155, + "learning_rate": 4.4046019973398455e-05, + "loss": 4.1793, + "step": 37715 + }, + { + "epoch": 0.22430773622609193, + "grad_norm": 1.6997089385986328, + "learning_rate": 4.404571740031435e-05, + "loss": 4.0071, + "step": 37716 + }, + { + "epoch": 0.22431368350937292, + "grad_norm": 1.8075357675552368, + "learning_rate": 4.4045414820581574e-05, + "loss": 3.8646, + "step": 37717 + }, + { + "epoch": 0.2243196307926539, + "grad_norm": 1.9021435976028442, + "learning_rate": 4.404511223420024e-05, + "loss": 3.8813, + "step": 37718 + }, + { + "epoch": 0.22432557807593492, + "grad_norm": 1.7350317239761353, + "learning_rate": 4.404480964117046e-05, + "loss": 4.0871, + "step": 37719 + }, + { + "epoch": 0.2243315253592159, + "grad_norm": 2.2858777046203613, + "learning_rate": 4.4044507041492337e-05, + "loss": 3.489, + "step": 37720 + }, + { + "epoch": 0.2243374726424969, + "grad_norm": 2.2867650985717773, + "learning_rate": 4.404420443516596e-05, + "loss": 2.6582, + "step": 37721 + }, + { + "epoch": 0.22434341992577791, + "grad_norm": 1.6754149198532104, + "learning_rate": 4.404390182219146e-05, + "loss": 4.8954, + "step": 37722 + }, + { + "epoch": 0.2243493672090589, + "grad_norm": 3.31197190284729, + "learning_rate": 4.404359920256892e-05, + "loss": 1.4784, + "step": 37723 + }, + { + "epoch": 0.2243553144923399, + "grad_norm": 2.2913246154785156, + "learning_rate": 4.4043296576298464e-05, + "loss": 3.6803, + "step": 37724 + }, + { + "epoch": 0.2243612617756209, + "grad_norm": 1.5901116132736206, + "learning_rate": 4.404299394338019e-05, + "loss": 4.5641, + "step": 37725 + }, + { + "epoch": 0.2243672090589019, + "grad_norm": 1.5679881572723389, + "learning_rate": 4.4042691303814204e-05, + "loss": 4.6357, + "step": 37726 + }, + { + "epoch": 0.22437315634218288, + "grad_norm": 1.945236325263977, + "learning_rate": 4.4042388657600606e-05, + "loss": 3.9994, + "step": 37727 + }, + { + "epoch": 0.2243791036254639, + "grad_norm": 1.9577616453170776, + "learning_rate": 4.404208600473951e-05, + "loss": 3.7444, + "step": 37728 + }, + { + "epoch": 0.2243850509087449, + "grad_norm": 1.7644386291503906, + "learning_rate": 4.404178334523102e-05, + "loss": 3.6799, + "step": 37729 + }, + { + "epoch": 0.22439099819202588, + "grad_norm": 1.9280447959899902, + "learning_rate": 4.4041480679075245e-05, + "loss": 4.2304, + "step": 37730 + }, + { + "epoch": 0.2243969454753069, + "grad_norm": 2.3529813289642334, + "learning_rate": 4.404117800627228e-05, + "loss": 4.1772, + "step": 37731 + }, + { + "epoch": 0.22440289275858788, + "grad_norm": 2.0188229084014893, + "learning_rate": 4.404087532682223e-05, + "loss": 3.7716, + "step": 37732 + }, + { + "epoch": 0.22440884004186887, + "grad_norm": 2.0751125812530518, + "learning_rate": 4.4040572640725215e-05, + "loss": 3.5754, + "step": 37733 + }, + { + "epoch": 0.22441478732514988, + "grad_norm": 2.1539642810821533, + "learning_rate": 4.4040269947981325e-05, + "loss": 3.1711, + "step": 37734 + }, + { + "epoch": 0.22442073460843087, + "grad_norm": 2.405971050262451, + "learning_rate": 4.403996724859069e-05, + "loss": 3.9476, + "step": 37735 + }, + { + "epoch": 0.22442668189171186, + "grad_norm": 2.26133131980896, + "learning_rate": 4.403966454255338e-05, + "loss": 3.2927, + "step": 37736 + }, + { + "epoch": 0.22443262917499288, + "grad_norm": 2.0596282482147217, + "learning_rate": 4.403936182986953e-05, + "loss": 4.1827, + "step": 37737 + }, + { + "epoch": 0.22443857645827386, + "grad_norm": 2.0279719829559326, + "learning_rate": 4.403905911053924e-05, + "loss": 4.2153, + "step": 37738 + }, + { + "epoch": 0.22444452374155485, + "grad_norm": 2.963252067565918, + "learning_rate": 4.4038756384562596e-05, + "loss": 1.2135, + "step": 37739 + }, + { + "epoch": 0.22445047102483587, + "grad_norm": 3.4959542751312256, + "learning_rate": 4.403845365193974e-05, + "loss": 1.5318, + "step": 37740 + }, + { + "epoch": 0.22445641830811686, + "grad_norm": 3.365992784500122, + "learning_rate": 4.403815091267074e-05, + "loss": 1.4584, + "step": 37741 + }, + { + "epoch": 0.22446236559139784, + "grad_norm": 4.033193588256836, + "learning_rate": 4.403784816675572e-05, + "loss": 1.7688, + "step": 37742 + }, + { + "epoch": 0.22446831287467886, + "grad_norm": 3.2633559703826904, + "learning_rate": 4.4037545414194784e-05, + "loss": 1.3533, + "step": 37743 + }, + { + "epoch": 0.22447426015795985, + "grad_norm": 4.7632598876953125, + "learning_rate": 4.403724265498804e-05, + "loss": 1.6567, + "step": 37744 + }, + { + "epoch": 0.22448020744124084, + "grad_norm": 1.7756397724151611, + "learning_rate": 4.40369398891356e-05, + "loss": 4.1304, + "step": 37745 + }, + { + "epoch": 0.22448615472452182, + "grad_norm": 2.0562822818756104, + "learning_rate": 4.403663711663755e-05, + "loss": 4.2739, + "step": 37746 + }, + { + "epoch": 0.22449210200780284, + "grad_norm": 1.9821317195892334, + "learning_rate": 4.4036334337494007e-05, + "loss": 4.1306, + "step": 37747 + }, + { + "epoch": 0.22449804929108383, + "grad_norm": 1.877347469329834, + "learning_rate": 4.403603155170508e-05, + "loss": 4.5766, + "step": 37748 + }, + { + "epoch": 0.22450399657436482, + "grad_norm": 1.9222016334533691, + "learning_rate": 4.403572875927087e-05, + "loss": 4.3211, + "step": 37749 + }, + { + "epoch": 0.22450994385764583, + "grad_norm": 2.12162709236145, + "learning_rate": 4.403542596019148e-05, + "loss": 4.6083, + "step": 37750 + }, + { + "epoch": 0.22451589114092682, + "grad_norm": 1.482627272605896, + "learning_rate": 4.403512315446702e-05, + "loss": 4.6339, + "step": 37751 + }, + { + "epoch": 0.2245218384242078, + "grad_norm": 1.8798069953918457, + "learning_rate": 4.40348203420976e-05, + "loss": 4.365, + "step": 37752 + }, + { + "epoch": 0.22452778570748883, + "grad_norm": 1.9546221494674683, + "learning_rate": 4.403451752308332e-05, + "loss": 4.4222, + "step": 37753 + }, + { + "epoch": 0.2245337329907698, + "grad_norm": 2.216580629348755, + "learning_rate": 4.403421469742428e-05, + "loss": 3.6172, + "step": 37754 + }, + { + "epoch": 0.2245396802740508, + "grad_norm": 1.872523307800293, + "learning_rate": 4.4033911865120606e-05, + "loss": 4.1351, + "step": 37755 + }, + { + "epoch": 0.22454562755733182, + "grad_norm": 1.892671823501587, + "learning_rate": 4.403360902617238e-05, + "loss": 4.4622, + "step": 37756 + }, + { + "epoch": 0.2245515748406128, + "grad_norm": 1.8072375059127808, + "learning_rate": 4.4033306180579713e-05, + "loss": 4.1739, + "step": 37757 + }, + { + "epoch": 0.2245575221238938, + "grad_norm": 1.6658414602279663, + "learning_rate": 4.4033003328342725e-05, + "loss": 4.4121, + "step": 37758 + }, + { + "epoch": 0.2245634694071748, + "grad_norm": 1.9384973049163818, + "learning_rate": 4.403270046946151e-05, + "loss": 4.3525, + "step": 37759 + }, + { + "epoch": 0.2245694166904558, + "grad_norm": 1.798912763595581, + "learning_rate": 4.403239760393617e-05, + "loss": 4.0399, + "step": 37760 + }, + { + "epoch": 0.22457536397373679, + "grad_norm": 2.1004838943481445, + "learning_rate": 4.4032094731766825e-05, + "loss": 4.6759, + "step": 37761 + }, + { + "epoch": 0.2245813112570178, + "grad_norm": 2.2671115398406982, + "learning_rate": 4.403179185295357e-05, + "loss": 4.5474, + "step": 37762 + }, + { + "epoch": 0.2245872585402988, + "grad_norm": 2.3091773986816406, + "learning_rate": 4.403148896749651e-05, + "loss": 4.0547, + "step": 37763 + }, + { + "epoch": 0.22459320582357978, + "grad_norm": 1.733040452003479, + "learning_rate": 4.403118607539576e-05, + "loss": 4.3636, + "step": 37764 + }, + { + "epoch": 0.2245991531068608, + "grad_norm": 1.7878620624542236, + "learning_rate": 4.403088317665142e-05, + "loss": 4.1853, + "step": 37765 + }, + { + "epoch": 0.22460510039014178, + "grad_norm": 1.7317149639129639, + "learning_rate": 4.4030580271263586e-05, + "loss": 4.5654, + "step": 37766 + }, + { + "epoch": 0.22461104767342277, + "grad_norm": 1.7622241973876953, + "learning_rate": 4.403027735923237e-05, + "loss": 4.4526, + "step": 37767 + }, + { + "epoch": 0.22461699495670379, + "grad_norm": 1.70356023311615, + "learning_rate": 4.4029974440557895e-05, + "loss": 4.328, + "step": 37768 + }, + { + "epoch": 0.22462294223998477, + "grad_norm": 1.8185654878616333, + "learning_rate": 4.4029671515240245e-05, + "loss": 4.2525, + "step": 37769 + }, + { + "epoch": 0.22462888952326576, + "grad_norm": 1.8527708053588867, + "learning_rate": 4.402936858327953e-05, + "loss": 4.125, + "step": 37770 + }, + { + "epoch": 0.22463483680654678, + "grad_norm": 1.809019923210144, + "learning_rate": 4.402906564467587e-05, + "loss": 4.0711, + "step": 37771 + }, + { + "epoch": 0.22464078408982777, + "grad_norm": 1.8672319650650024, + "learning_rate": 4.402876269942935e-05, + "loss": 3.9145, + "step": 37772 + }, + { + "epoch": 0.22464673137310875, + "grad_norm": 2.5570287704467773, + "learning_rate": 4.4028459747540086e-05, + "loss": 4.1233, + "step": 37773 + }, + { + "epoch": 0.22465267865638977, + "grad_norm": 1.6708779335021973, + "learning_rate": 4.402815678900819e-05, + "loss": 4.4108, + "step": 37774 + }, + { + "epoch": 0.22465862593967076, + "grad_norm": 2.625601053237915, + "learning_rate": 4.4027853823833755e-05, + "loss": 4.6374, + "step": 37775 + }, + { + "epoch": 0.22466457322295175, + "grad_norm": 2.5341086387634277, + "learning_rate": 4.40275508520169e-05, + "loss": 4.4189, + "step": 37776 + }, + { + "epoch": 0.22467052050623276, + "grad_norm": 2.4302070140838623, + "learning_rate": 4.402724787355771e-05, + "loss": 4.4121, + "step": 37777 + }, + { + "epoch": 0.22467646778951375, + "grad_norm": 2.4907209873199463, + "learning_rate": 4.402694488845631e-05, + "loss": 4.3042, + "step": 37778 + }, + { + "epoch": 0.22468241507279474, + "grad_norm": 2.352330446243286, + "learning_rate": 4.402664189671281e-05, + "loss": 4.4724, + "step": 37779 + }, + { + "epoch": 0.22468836235607575, + "grad_norm": 2.715082883834839, + "learning_rate": 4.40263388983273e-05, + "loss": 4.3558, + "step": 37780 + }, + { + "epoch": 0.22469430963935674, + "grad_norm": 1.915778398513794, + "learning_rate": 4.402603589329989e-05, + "loss": 4.186, + "step": 37781 + }, + { + "epoch": 0.22470025692263773, + "grad_norm": 1.6563055515289307, + "learning_rate": 4.402573288163069e-05, + "loss": 4.5796, + "step": 37782 + }, + { + "epoch": 0.22470620420591875, + "grad_norm": 1.877414584159851, + "learning_rate": 4.40254298633198e-05, + "loss": 5.0202, + "step": 37783 + }, + { + "epoch": 0.22471215148919974, + "grad_norm": 1.7423501014709473, + "learning_rate": 4.402512683836732e-05, + "loss": 5.1002, + "step": 37784 + }, + { + "epoch": 0.22471809877248072, + "grad_norm": 1.7512094974517822, + "learning_rate": 4.402482380677338e-05, + "loss": 4.8946, + "step": 37785 + }, + { + "epoch": 0.22472404605576174, + "grad_norm": 1.822348713874817, + "learning_rate": 4.402452076853807e-05, + "loss": 4.2904, + "step": 37786 + }, + { + "epoch": 0.22472999333904273, + "grad_norm": 2.1487886905670166, + "learning_rate": 4.4024217723661485e-05, + "loss": 3.6705, + "step": 37787 + }, + { + "epoch": 0.22473594062232372, + "grad_norm": 1.9224172830581665, + "learning_rate": 4.402391467214375e-05, + "loss": 3.8513, + "step": 37788 + }, + { + "epoch": 0.22474188790560473, + "grad_norm": 2.0339977741241455, + "learning_rate": 4.4023611613984964e-05, + "loss": 4.1311, + "step": 37789 + }, + { + "epoch": 0.22474783518888572, + "grad_norm": 1.9256433248519897, + "learning_rate": 4.402330854918523e-05, + "loss": 4.3357, + "step": 37790 + }, + { + "epoch": 0.2247537824721667, + "grad_norm": 1.8157620429992676, + "learning_rate": 4.402300547774465e-05, + "loss": 4.2782, + "step": 37791 + }, + { + "epoch": 0.22475972975544772, + "grad_norm": 2.068574905395508, + "learning_rate": 4.402270239966334e-05, + "loss": 4.2032, + "step": 37792 + }, + { + "epoch": 0.2247656770387287, + "grad_norm": 1.9213577508926392, + "learning_rate": 4.40223993149414e-05, + "loss": 4.3021, + "step": 37793 + }, + { + "epoch": 0.2247716243220097, + "grad_norm": 2.1965863704681396, + "learning_rate": 4.402209622357894e-05, + "loss": 4.3002, + "step": 37794 + }, + { + "epoch": 0.22477757160529072, + "grad_norm": 1.570603847503662, + "learning_rate": 4.402179312557606e-05, + "loss": 4.4322, + "step": 37795 + }, + { + "epoch": 0.2247835188885717, + "grad_norm": 1.6555372476577759, + "learning_rate": 4.402149002093288e-05, + "loss": 4.3265, + "step": 37796 + }, + { + "epoch": 0.2247894661718527, + "grad_norm": 1.740679144859314, + "learning_rate": 4.402118690964948e-05, + "loss": 4.2542, + "step": 37797 + }, + { + "epoch": 0.2247954134551337, + "grad_norm": 1.4893536567687988, + "learning_rate": 4.402088379172598e-05, + "loss": 5.4392, + "step": 37798 + }, + { + "epoch": 0.2248013607384147, + "grad_norm": 1.5444672107696533, + "learning_rate": 4.4020580667162494e-05, + "loss": 5.3137, + "step": 37799 + }, + { + "epoch": 0.22480730802169568, + "grad_norm": 1.5143893957138062, + "learning_rate": 4.402027753595911e-05, + "loss": 5.2554, + "step": 37800 + }, + { + "epoch": 0.2248132553049767, + "grad_norm": 1.4042882919311523, + "learning_rate": 4.401997439811595e-05, + "loss": 5.2739, + "step": 37801 + }, + { + "epoch": 0.2248192025882577, + "grad_norm": 1.6029880046844482, + "learning_rate": 4.401967125363311e-05, + "loss": 5.2905, + "step": 37802 + }, + { + "epoch": 0.22482514987153868, + "grad_norm": 1.688639760017395, + "learning_rate": 4.4019368102510705e-05, + "loss": 4.5715, + "step": 37803 + }, + { + "epoch": 0.22483109715481966, + "grad_norm": 1.2697008848190308, + "learning_rate": 4.401906494474883e-05, + "loss": 5.3288, + "step": 37804 + }, + { + "epoch": 0.22483704443810068, + "grad_norm": 1.3962997198104858, + "learning_rate": 4.401876178034761e-05, + "loss": 5.2057, + "step": 37805 + }, + { + "epoch": 0.22484299172138167, + "grad_norm": 1.2445080280303955, + "learning_rate": 4.4018458609307124e-05, + "loss": 5.1935, + "step": 37806 + }, + { + "epoch": 0.22484893900466266, + "grad_norm": 1.4291088581085205, + "learning_rate": 4.401815543162749e-05, + "loss": 5.162, + "step": 37807 + }, + { + "epoch": 0.22485488628794367, + "grad_norm": 1.387148380279541, + "learning_rate": 4.401785224730881e-05, + "loss": 5.1254, + "step": 37808 + }, + { + "epoch": 0.22486083357122466, + "grad_norm": 1.575493574142456, + "learning_rate": 4.40175490563512e-05, + "loss": 4.5117, + "step": 37809 + }, + { + "epoch": 0.22486678085450565, + "grad_norm": 1.5773591995239258, + "learning_rate": 4.4017245858754764e-05, + "loss": 4.9031, + "step": 37810 + }, + { + "epoch": 0.22487272813778666, + "grad_norm": 1.6985411643981934, + "learning_rate": 4.40169426545196e-05, + "loss": 5.3724, + "step": 37811 + }, + { + "epoch": 0.22487867542106765, + "grad_norm": 1.6448116302490234, + "learning_rate": 4.401663944364581e-05, + "loss": 5.2448, + "step": 37812 + }, + { + "epoch": 0.22488462270434864, + "grad_norm": 1.6919282674789429, + "learning_rate": 4.4016336226133524e-05, + "loss": 5.5246, + "step": 37813 + }, + { + "epoch": 0.22489056998762966, + "grad_norm": 1.7464653253555298, + "learning_rate": 4.4016033001982827e-05, + "loss": 4.8954, + "step": 37814 + }, + { + "epoch": 0.22489651727091065, + "grad_norm": 1.6324712038040161, + "learning_rate": 4.401572977119382e-05, + "loss": 4.5895, + "step": 37815 + }, + { + "epoch": 0.22490246455419163, + "grad_norm": 1.9962471723556519, + "learning_rate": 4.4015426533766624e-05, + "loss": 3.9179, + "step": 37816 + }, + { + "epoch": 0.22490841183747265, + "grad_norm": 2.127303123474121, + "learning_rate": 4.401512328970134e-05, + "loss": 3.9693, + "step": 37817 + }, + { + "epoch": 0.22491435912075364, + "grad_norm": 1.8885246515274048, + "learning_rate": 4.401482003899807e-05, + "loss": 3.9877, + "step": 37818 + }, + { + "epoch": 0.22492030640403463, + "grad_norm": 1.7301980257034302, + "learning_rate": 4.4014516781656926e-05, + "loss": 4.4553, + "step": 37819 + }, + { + "epoch": 0.22492625368731564, + "grad_norm": 2.191305160522461, + "learning_rate": 4.401421351767801e-05, + "loss": 3.3981, + "step": 37820 + }, + { + "epoch": 0.22493220097059663, + "grad_norm": 2.289350986480713, + "learning_rate": 4.401391024706142e-05, + "loss": 3.7283, + "step": 37821 + }, + { + "epoch": 0.22493814825387762, + "grad_norm": 3.6579511165618896, + "learning_rate": 4.401360696980729e-05, + "loss": 1.4967, + "step": 37822 + }, + { + "epoch": 0.22494409553715863, + "grad_norm": 3.803406238555908, + "learning_rate": 4.401330368591568e-05, + "loss": 2.8816, + "step": 37823 + }, + { + "epoch": 0.22495004282043962, + "grad_norm": 3.4346766471862793, + "learning_rate": 4.401300039538675e-05, + "loss": 3.2975, + "step": 37824 + }, + { + "epoch": 0.2249559901037206, + "grad_norm": 3.123645305633545, + "learning_rate": 4.4012697098220556e-05, + "loss": 2.6538, + "step": 37825 + }, + { + "epoch": 0.22496193738700163, + "grad_norm": 1.8864881992340088, + "learning_rate": 4.401239379441724e-05, + "loss": 4.2615, + "step": 37826 + }, + { + "epoch": 0.22496788467028261, + "grad_norm": 2.685556173324585, + "learning_rate": 4.401209048397688e-05, + "loss": 3.6858, + "step": 37827 + }, + { + "epoch": 0.2249738319535636, + "grad_norm": 3.324258327484131, + "learning_rate": 4.401178716689961e-05, + "loss": 2.6127, + "step": 37828 + }, + { + "epoch": 0.22497977923684462, + "grad_norm": 3.6253082752227783, + "learning_rate": 4.401148384318551e-05, + "loss": 2.171, + "step": 37829 + }, + { + "epoch": 0.2249857265201256, + "grad_norm": 3.318803071975708, + "learning_rate": 4.4011180512834704e-05, + "loss": 2.4737, + "step": 37830 + }, + { + "epoch": 0.2249916738034066, + "grad_norm": 3.84256911277771, + "learning_rate": 4.401087717584729e-05, + "loss": 3.2811, + "step": 37831 + }, + { + "epoch": 0.2249976210866876, + "grad_norm": 2.7051305770874023, + "learning_rate": 4.401057383222338e-05, + "loss": 3.1816, + "step": 37832 + }, + { + "epoch": 0.2250035683699686, + "grad_norm": 2.243999719619751, + "learning_rate": 4.401027048196307e-05, + "loss": 3.9582, + "step": 37833 + }, + { + "epoch": 0.2250095156532496, + "grad_norm": 2.9261839389801025, + "learning_rate": 4.4009967125066465e-05, + "loss": 1.8198, + "step": 37834 + }, + { + "epoch": 0.2250154629365306, + "grad_norm": 2.6644344329833984, + "learning_rate": 4.4009663761533684e-05, + "loss": 1.0773, + "step": 37835 + }, + { + "epoch": 0.2250214102198116, + "grad_norm": 2.9495484828948975, + "learning_rate": 4.400936039136483e-05, + "loss": 1.5906, + "step": 37836 + }, + { + "epoch": 0.22502735750309258, + "grad_norm": 3.0252106189727783, + "learning_rate": 4.4009057014559996e-05, + "loss": 1.721, + "step": 37837 + }, + { + "epoch": 0.2250333047863736, + "grad_norm": 3.145016670227051, + "learning_rate": 4.4008753631119305e-05, + "loss": 1.8387, + "step": 37838 + }, + { + "epoch": 0.22503925206965458, + "grad_norm": 2.9610512256622314, + "learning_rate": 4.400845024104284e-05, + "loss": 2.0492, + "step": 37839 + }, + { + "epoch": 0.22504519935293557, + "grad_norm": 2.962660789489746, + "learning_rate": 4.400814684433073e-05, + "loss": 1.168, + "step": 37840 + }, + { + "epoch": 0.2250511466362166, + "grad_norm": 3.3425393104553223, + "learning_rate": 4.400784344098308e-05, + "loss": 1.576, + "step": 37841 + }, + { + "epoch": 0.22505709391949758, + "grad_norm": 3.3425233364105225, + "learning_rate": 4.400754003099998e-05, + "loss": 1.6533, + "step": 37842 + }, + { + "epoch": 0.22506304120277856, + "grad_norm": 3.656737804412842, + "learning_rate": 4.4007236614381545e-05, + "loss": 1.3451, + "step": 37843 + }, + { + "epoch": 0.22506898848605958, + "grad_norm": 2.0164568424224854, + "learning_rate": 4.400693319112788e-05, + "loss": 3.6496, + "step": 37844 + }, + { + "epoch": 0.22507493576934057, + "grad_norm": 3.6576480865478516, + "learning_rate": 4.400662976123909e-05, + "loss": 1.5672, + "step": 37845 + }, + { + "epoch": 0.22508088305262156, + "grad_norm": 3.4933855533599854, + "learning_rate": 4.400632632471529e-05, + "loss": 1.3534, + "step": 37846 + }, + { + "epoch": 0.22508683033590257, + "grad_norm": 4.428929805755615, + "learning_rate": 4.400602288155657e-05, + "loss": 2.669, + "step": 37847 + }, + { + "epoch": 0.22509277761918356, + "grad_norm": 3.86712908744812, + "learning_rate": 4.400571943176304e-05, + "loss": 2.175, + "step": 37848 + }, + { + "epoch": 0.22509872490246455, + "grad_norm": 3.457106113433838, + "learning_rate": 4.400541597533482e-05, + "loss": 1.996, + "step": 37849 + }, + { + "epoch": 0.22510467218574556, + "grad_norm": 3.4103970527648926, + "learning_rate": 4.4005112512272e-05, + "loss": 1.8448, + "step": 37850 + }, + { + "epoch": 0.22511061946902655, + "grad_norm": 4.167159557342529, + "learning_rate": 4.400480904257469e-05, + "loss": 2.3319, + "step": 37851 + }, + { + "epoch": 0.22511656675230754, + "grad_norm": 3.852609395980835, + "learning_rate": 4.4004505566243e-05, + "loss": 2.1329, + "step": 37852 + }, + { + "epoch": 0.22512251403558856, + "grad_norm": 3.090017795562744, + "learning_rate": 4.4004202083277034e-05, + "loss": 1.6722, + "step": 37853 + }, + { + "epoch": 0.22512846131886954, + "grad_norm": 2.2390940189361572, + "learning_rate": 4.40038985936769e-05, + "loss": 0.7566, + "step": 37854 + }, + { + "epoch": 0.22513440860215053, + "grad_norm": 2.376133680343628, + "learning_rate": 4.40035950974427e-05, + "loss": 2.1932, + "step": 37855 + }, + { + "epoch": 0.22514035588543155, + "grad_norm": 3.3748831748962402, + "learning_rate": 4.400329159457453e-05, + "loss": 3.3727, + "step": 37856 + }, + { + "epoch": 0.22514630316871254, + "grad_norm": 4.395442962646484, + "learning_rate": 4.400298808507252e-05, + "loss": 3.0625, + "step": 37857 + }, + { + "epoch": 0.22515225045199352, + "grad_norm": 3.7643630504608154, + "learning_rate": 4.400268456893676e-05, + "loss": 2.2564, + "step": 37858 + }, + { + "epoch": 0.22515819773527454, + "grad_norm": 4.105278491973877, + "learning_rate": 4.400238104616736e-05, + "loss": 2.267, + "step": 37859 + }, + { + "epoch": 0.22516414501855553, + "grad_norm": 3.456455945968628, + "learning_rate": 4.400207751676442e-05, + "loss": 1.8341, + "step": 37860 + }, + { + "epoch": 0.22517009230183652, + "grad_norm": 3.196443796157837, + "learning_rate": 4.4001773980728054e-05, + "loss": 2.3685, + "step": 37861 + }, + { + "epoch": 0.2251760395851175, + "grad_norm": 3.148589611053467, + "learning_rate": 4.400147043805837e-05, + "loss": 1.5025, + "step": 37862 + }, + { + "epoch": 0.22518198686839852, + "grad_norm": 3.0841641426086426, + "learning_rate": 4.400116688875546e-05, + "loss": 1.4879, + "step": 37863 + }, + { + "epoch": 0.2251879341516795, + "grad_norm": 3.2474453449249268, + "learning_rate": 4.4000863332819445e-05, + "loss": 1.2444, + "step": 37864 + }, + { + "epoch": 0.2251938814349605, + "grad_norm": 3.0410406589508057, + "learning_rate": 4.400055977025043e-05, + "loss": 1.9955, + "step": 37865 + }, + { + "epoch": 0.2251998287182415, + "grad_norm": 3.0968708992004395, + "learning_rate": 4.400025620104851e-05, + "loss": 2.2284, + "step": 37866 + }, + { + "epoch": 0.2252057760015225, + "grad_norm": 2.2545621395111084, + "learning_rate": 4.399995262521379e-05, + "loss": 3.7539, + "step": 37867 + }, + { + "epoch": 0.2252117232848035, + "grad_norm": 3.4662656784057617, + "learning_rate": 4.399964904274639e-05, + "loss": 2.8159, + "step": 37868 + }, + { + "epoch": 0.2252176705680845, + "grad_norm": 3.1281726360321045, + "learning_rate": 4.3999345453646405e-05, + "loss": 1.7505, + "step": 37869 + }, + { + "epoch": 0.2252236178513655, + "grad_norm": 2.7577738761901855, + "learning_rate": 4.399904185791395e-05, + "loss": 1.6282, + "step": 37870 + }, + { + "epoch": 0.22522956513464648, + "grad_norm": 2.8311750888824463, + "learning_rate": 4.399873825554912e-05, + "loss": 1.6367, + "step": 37871 + }, + { + "epoch": 0.2252355124179275, + "grad_norm": 2.771963596343994, + "learning_rate": 4.3998434646552034e-05, + "loss": 2.1686, + "step": 37872 + }, + { + "epoch": 0.22524145970120849, + "grad_norm": 1.8751553297042847, + "learning_rate": 4.399813103092279e-05, + "loss": 5.0479, + "step": 37873 + }, + { + "epoch": 0.22524740698448947, + "grad_norm": 2.1598060131073, + "learning_rate": 4.399782740866148e-05, + "loss": 4.2219, + "step": 37874 + }, + { + "epoch": 0.2252533542677705, + "grad_norm": 3.289924383163452, + "learning_rate": 4.399752377976825e-05, + "loss": 2.0413, + "step": 37875 + }, + { + "epoch": 0.22525930155105148, + "grad_norm": 3.788972854614258, + "learning_rate": 4.399722014424316e-05, + "loss": 1.941, + "step": 37876 + }, + { + "epoch": 0.22526524883433247, + "grad_norm": 3.5833699703216553, + "learning_rate": 4.3996916502086344e-05, + "loss": 1.8684, + "step": 37877 + }, + { + "epoch": 0.22527119611761348, + "grad_norm": 2.6145071983337402, + "learning_rate": 4.3996612853297906e-05, + "loss": 3.2526, + "step": 37878 + }, + { + "epoch": 0.22527714340089447, + "grad_norm": 1.8506813049316406, + "learning_rate": 4.399630919787794e-05, + "loss": 4.7183, + "step": 37879 + }, + { + "epoch": 0.22528309068417546, + "grad_norm": 1.6560643911361694, + "learning_rate": 4.3996005535826555e-05, + "loss": 5.2118, + "step": 37880 + }, + { + "epoch": 0.22528903796745647, + "grad_norm": 1.5506187677383423, + "learning_rate": 4.3995701867143867e-05, + "loss": 5.2523, + "step": 37881 + }, + { + "epoch": 0.22529498525073746, + "grad_norm": 1.529112458229065, + "learning_rate": 4.3995398191829974e-05, + "loss": 4.7377, + "step": 37882 + }, + { + "epoch": 0.22530093253401845, + "grad_norm": 1.6341055631637573, + "learning_rate": 4.3995094509884986e-05, + "loss": 5.492, + "step": 37883 + }, + { + "epoch": 0.22530687981729947, + "grad_norm": 1.5386645793914795, + "learning_rate": 4.3994790821309004e-05, + "loss": 5.343, + "step": 37884 + }, + { + "epoch": 0.22531282710058045, + "grad_norm": 1.8904048204421997, + "learning_rate": 4.3994487126102137e-05, + "loss": 4.376, + "step": 37885 + }, + { + "epoch": 0.22531877438386144, + "grad_norm": 1.6142657995224, + "learning_rate": 4.39941834242645e-05, + "loss": 5.1632, + "step": 37886 + }, + { + "epoch": 0.22532472166714246, + "grad_norm": 1.5371025800704956, + "learning_rate": 4.399387971579618e-05, + "loss": 5.0741, + "step": 37887 + }, + { + "epoch": 0.22533066895042345, + "grad_norm": 1.6636183261871338, + "learning_rate": 4.399357600069729e-05, + "loss": 5.0503, + "step": 37888 + }, + { + "epoch": 0.22533661623370443, + "grad_norm": 1.4598065614700317, + "learning_rate": 4.3993272278967944e-05, + "loss": 4.9753, + "step": 37889 + }, + { + "epoch": 0.22534256351698545, + "grad_norm": 1.5426924228668213, + "learning_rate": 4.399296855060824e-05, + "loss": 5.1221, + "step": 37890 + }, + { + "epoch": 0.22534851080026644, + "grad_norm": 1.2707856893539429, + "learning_rate": 4.399266481561829e-05, + "loss": 4.883, + "step": 37891 + }, + { + "epoch": 0.22535445808354743, + "grad_norm": 1.5364930629730225, + "learning_rate": 4.39923610739982e-05, + "loss": 4.9607, + "step": 37892 + }, + { + "epoch": 0.22536040536682844, + "grad_norm": 1.5102486610412598, + "learning_rate": 4.3992057325748066e-05, + "loss": 4.9234, + "step": 37893 + }, + { + "epoch": 0.22536635265010943, + "grad_norm": 1.3505035638809204, + "learning_rate": 4.3991753570868e-05, + "loss": 4.9517, + "step": 37894 + }, + { + "epoch": 0.22537229993339042, + "grad_norm": 1.6128617525100708, + "learning_rate": 4.3991449809358115e-05, + "loss": 4.9662, + "step": 37895 + }, + { + "epoch": 0.22537824721667143, + "grad_norm": 1.4386210441589355, + "learning_rate": 4.399114604121851e-05, + "loss": 4.7074, + "step": 37896 + }, + { + "epoch": 0.22538419449995242, + "grad_norm": 1.473679780960083, + "learning_rate": 4.399084226644929e-05, + "loss": 4.4737, + "step": 37897 + }, + { + "epoch": 0.2253901417832334, + "grad_norm": 1.3751991987228394, + "learning_rate": 4.399053848505057e-05, + "loss": 4.9829, + "step": 37898 + }, + { + "epoch": 0.22539608906651443, + "grad_norm": 1.4539976119995117, + "learning_rate": 4.3990234697022434e-05, + "loss": 4.9506, + "step": 37899 + }, + { + "epoch": 0.22540203634979541, + "grad_norm": 1.3100522756576538, + "learning_rate": 4.3989930902365015e-05, + "loss": 4.9638, + "step": 37900 + }, + { + "epoch": 0.2254079836330764, + "grad_norm": 1.3466485738754272, + "learning_rate": 4.3989627101078404e-05, + "loss": 4.8384, + "step": 37901 + }, + { + "epoch": 0.22541393091635742, + "grad_norm": 1.5398622751235962, + "learning_rate": 4.398932329316271e-05, + "loss": 4.9108, + "step": 37902 + }, + { + "epoch": 0.2254198781996384, + "grad_norm": 1.3253413438796997, + "learning_rate": 4.398901947861804e-05, + "loss": 4.929, + "step": 37903 + }, + { + "epoch": 0.2254258254829194, + "grad_norm": 1.4795877933502197, + "learning_rate": 4.39887156574445e-05, + "loss": 4.9455, + "step": 37904 + }, + { + "epoch": 0.2254317727662004, + "grad_norm": 1.6711117029190063, + "learning_rate": 4.398841182964219e-05, + "loss": 4.4684, + "step": 37905 + }, + { + "epoch": 0.2254377200494814, + "grad_norm": 1.282360553741455, + "learning_rate": 4.398810799521124e-05, + "loss": 4.9634, + "step": 37906 + }, + { + "epoch": 0.2254436673327624, + "grad_norm": 1.8916454315185547, + "learning_rate": 4.3987804154151726e-05, + "loss": 5.3391, + "step": 37907 + }, + { + "epoch": 0.2254496146160434, + "grad_norm": 1.8332359790802002, + "learning_rate": 4.3987500306463756e-05, + "loss": 4.7668, + "step": 37908 + }, + { + "epoch": 0.2254555618993244, + "grad_norm": 1.565968632698059, + "learning_rate": 4.398719645214746e-05, + "loss": 4.9305, + "step": 37909 + }, + { + "epoch": 0.22546150918260538, + "grad_norm": 1.66316819190979, + "learning_rate": 4.3986892591202924e-05, + "loss": 5.8043, + "step": 37910 + }, + { + "epoch": 0.2254674564658864, + "grad_norm": 1.5474905967712402, + "learning_rate": 4.398658872363026e-05, + "loss": 5.4856, + "step": 37911 + }, + { + "epoch": 0.22547340374916738, + "grad_norm": 1.5161161422729492, + "learning_rate": 4.398628484942957e-05, + "loss": 5.4491, + "step": 37912 + }, + { + "epoch": 0.22547935103244837, + "grad_norm": 1.4109158515930176, + "learning_rate": 4.398598096860097e-05, + "loss": 5.5486, + "step": 37913 + }, + { + "epoch": 0.2254852983157294, + "grad_norm": 2.6624763011932373, + "learning_rate": 4.398567708114456e-05, + "loss": 4.3903, + "step": 37914 + }, + { + "epoch": 0.22549124559901038, + "grad_norm": 2.456807851791382, + "learning_rate": 4.3985373187060444e-05, + "loss": 4.9181, + "step": 37915 + }, + { + "epoch": 0.22549719288229136, + "grad_norm": 1.541363000869751, + "learning_rate": 4.398506928634873e-05, + "loss": 5.6465, + "step": 37916 + }, + { + "epoch": 0.22550314016557238, + "grad_norm": 2.3970277309417725, + "learning_rate": 4.398476537900953e-05, + "loss": 3.8429, + "step": 37917 + }, + { + "epoch": 0.22550908744885337, + "grad_norm": 1.6470190286636353, + "learning_rate": 4.3984461465042933e-05, + "loss": 5.011, + "step": 37918 + }, + { + "epoch": 0.22551503473213436, + "grad_norm": 1.6763241291046143, + "learning_rate": 4.3984157544449076e-05, + "loss": 5.3823, + "step": 37919 + }, + { + "epoch": 0.22552098201541534, + "grad_norm": 1.6084785461425781, + "learning_rate": 4.398385361722803e-05, + "loss": 5.6275, + "step": 37920 + }, + { + "epoch": 0.22552692929869636, + "grad_norm": 1.4290287494659424, + "learning_rate": 4.398354968337992e-05, + "loss": 5.453, + "step": 37921 + }, + { + "epoch": 0.22553287658197735, + "grad_norm": 1.7292128801345825, + "learning_rate": 4.398324574290485e-05, + "loss": 4.4189, + "step": 37922 + }, + { + "epoch": 0.22553882386525834, + "grad_norm": 1.5806313753128052, + "learning_rate": 4.3982941795802925e-05, + "loss": 4.7038, + "step": 37923 + }, + { + "epoch": 0.22554477114853935, + "grad_norm": 1.7047100067138672, + "learning_rate": 4.398263784207425e-05, + "loss": 5.1124, + "step": 37924 + }, + { + "epoch": 0.22555071843182034, + "grad_norm": 1.7951979637145996, + "learning_rate": 4.398233388171893e-05, + "loss": 5.0583, + "step": 37925 + }, + { + "epoch": 0.22555666571510133, + "grad_norm": 1.7077093124389648, + "learning_rate": 4.3982029914737076e-05, + "loss": 5.0506, + "step": 37926 + }, + { + "epoch": 0.22556261299838234, + "grad_norm": 1.767303228378296, + "learning_rate": 4.3981725941128795e-05, + "loss": 5.3591, + "step": 37927 + }, + { + "epoch": 0.22556856028166333, + "grad_norm": 1.5521531105041504, + "learning_rate": 4.398142196089419e-05, + "loss": 5.2546, + "step": 37928 + }, + { + "epoch": 0.22557450756494432, + "grad_norm": 1.7760071754455566, + "learning_rate": 4.398111797403336e-05, + "loss": 4.2981, + "step": 37929 + }, + { + "epoch": 0.22558045484822534, + "grad_norm": 2.609588146209717, + "learning_rate": 4.398081398054642e-05, + "loss": 3.4733, + "step": 37930 + }, + { + "epoch": 0.22558640213150633, + "grad_norm": 2.9639456272125244, + "learning_rate": 4.3980509980433475e-05, + "loss": 2.1202, + "step": 37931 + }, + { + "epoch": 0.2255923494147873, + "grad_norm": 2.8901517391204834, + "learning_rate": 4.398020597369463e-05, + "loss": 2.3376, + "step": 37932 + }, + { + "epoch": 0.22559829669806833, + "grad_norm": 2.5380642414093018, + "learning_rate": 4.397990196032999e-05, + "loss": 2.2578, + "step": 37933 + }, + { + "epoch": 0.22560424398134932, + "grad_norm": 2.1316301822662354, + "learning_rate": 4.3979597940339664e-05, + "loss": 3.9766, + "step": 37934 + }, + { + "epoch": 0.2256101912646303, + "grad_norm": 1.7142231464385986, + "learning_rate": 4.3979293913723756e-05, + "loss": 4.3819, + "step": 37935 + }, + { + "epoch": 0.22561613854791132, + "grad_norm": 2.933591365814209, + "learning_rate": 4.397898988048238e-05, + "loss": 2.63, + "step": 37936 + }, + { + "epoch": 0.2256220858311923, + "grad_norm": 3.323190927505493, + "learning_rate": 4.397868584061562e-05, + "loss": 2.1169, + "step": 37937 + }, + { + "epoch": 0.2256280331144733, + "grad_norm": 2.8052923679351807, + "learning_rate": 4.3978381794123604e-05, + "loss": 1.8126, + "step": 37938 + }, + { + "epoch": 0.2256339803977543, + "grad_norm": 2.9329421520233154, + "learning_rate": 4.397807774100643e-05, + "loss": 2.0345, + "step": 37939 + }, + { + "epoch": 0.2256399276810353, + "grad_norm": 2.487644910812378, + "learning_rate": 4.3977773681264206e-05, + "loss": 2.0859, + "step": 37940 + }, + { + "epoch": 0.2256458749643163, + "grad_norm": 2.733116388320923, + "learning_rate": 4.3977469614897035e-05, + "loss": 3.8859, + "step": 37941 + }, + { + "epoch": 0.2256518222475973, + "grad_norm": 1.7498195171356201, + "learning_rate": 4.397716554190503e-05, + "loss": 4.8557, + "step": 37942 + }, + { + "epoch": 0.2256577695308783, + "grad_norm": 1.721623420715332, + "learning_rate": 4.397686146228829e-05, + "loss": 4.3423, + "step": 37943 + }, + { + "epoch": 0.22566371681415928, + "grad_norm": 1.6414244174957275, + "learning_rate": 4.397655737604692e-05, + "loss": 4.7495, + "step": 37944 + }, + { + "epoch": 0.2256696640974403, + "grad_norm": 1.493431806564331, + "learning_rate": 4.397625328318104e-05, + "loss": 4.8656, + "step": 37945 + }, + { + "epoch": 0.22567561138072129, + "grad_norm": 1.7861814498901367, + "learning_rate": 4.397594918369074e-05, + "loss": 4.9698, + "step": 37946 + }, + { + "epoch": 0.22568155866400227, + "grad_norm": 1.5873513221740723, + "learning_rate": 4.397564507757613e-05, + "loss": 4.822, + "step": 37947 + }, + { + "epoch": 0.2256875059472833, + "grad_norm": 1.4830878973007202, + "learning_rate": 4.397534096483732e-05, + "loss": 5.19, + "step": 37948 + }, + { + "epoch": 0.22569345323056428, + "grad_norm": 1.7204992771148682, + "learning_rate": 4.397503684547442e-05, + "loss": 5.1106, + "step": 37949 + }, + { + "epoch": 0.22569940051384527, + "grad_norm": 1.515528917312622, + "learning_rate": 4.3974732719487524e-05, + "loss": 5.2533, + "step": 37950 + }, + { + "epoch": 0.22570534779712628, + "grad_norm": 1.438448190689087, + "learning_rate": 4.3974428586876746e-05, + "loss": 5.2196, + "step": 37951 + }, + { + "epoch": 0.22571129508040727, + "grad_norm": 1.884359359741211, + "learning_rate": 4.397412444764219e-05, + "loss": 4.695, + "step": 37952 + }, + { + "epoch": 0.22571724236368826, + "grad_norm": 1.5510938167572021, + "learning_rate": 4.3973820301783965e-05, + "loss": 5.4799, + "step": 37953 + }, + { + "epoch": 0.22572318964696927, + "grad_norm": 1.3831311464309692, + "learning_rate": 4.397351614930217e-05, + "loss": 5.3733, + "step": 37954 + }, + { + "epoch": 0.22572913693025026, + "grad_norm": 1.7165378332138062, + "learning_rate": 4.397321199019693e-05, + "loss": 5.0023, + "step": 37955 + }, + { + "epoch": 0.22573508421353125, + "grad_norm": 1.287805199623108, + "learning_rate": 4.397290782446832e-05, + "loss": 5.0996, + "step": 37956 + }, + { + "epoch": 0.22574103149681227, + "grad_norm": 1.1919018030166626, + "learning_rate": 4.397260365211648e-05, + "loss": 5.4756, + "step": 37957 + }, + { + "epoch": 0.22574697878009325, + "grad_norm": 1.3540643453598022, + "learning_rate": 4.3972299473141494e-05, + "loss": 5.4296, + "step": 37958 + }, + { + "epoch": 0.22575292606337424, + "grad_norm": 1.3627208471298218, + "learning_rate": 4.397199528754348e-05, + "loss": 5.4029, + "step": 37959 + }, + { + "epoch": 0.22575887334665526, + "grad_norm": 1.599832534790039, + "learning_rate": 4.3971691095322526e-05, + "loss": 4.4081, + "step": 37960 + }, + { + "epoch": 0.22576482062993625, + "grad_norm": 1.4021176099777222, + "learning_rate": 4.397138689647876e-05, + "loss": 5.1567, + "step": 37961 + }, + { + "epoch": 0.22577076791321724, + "grad_norm": 1.4261466264724731, + "learning_rate": 4.3971082691012264e-05, + "loss": 5.1062, + "step": 37962 + }, + { + "epoch": 0.22577671519649825, + "grad_norm": 1.5316903591156006, + "learning_rate": 4.397077847892318e-05, + "loss": 5.1274, + "step": 37963 + }, + { + "epoch": 0.22578266247977924, + "grad_norm": 1.535687804222107, + "learning_rate": 4.397047426021158e-05, + "loss": 5.1904, + "step": 37964 + }, + { + "epoch": 0.22578860976306023, + "grad_norm": 1.502808928489685, + "learning_rate": 4.397017003487759e-05, + "loss": 5.0989, + "step": 37965 + }, + { + "epoch": 0.22579455704634124, + "grad_norm": 1.6442323923110962, + "learning_rate": 4.3969865802921306e-05, + "loss": 4.6965, + "step": 37966 + }, + { + "epoch": 0.22580050432962223, + "grad_norm": 1.5118539333343506, + "learning_rate": 4.3969561564342836e-05, + "loss": 5.1223, + "step": 37967 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 1.9736100435256958, + "learning_rate": 4.396925731914229e-05, + "loss": 5.608, + "step": 37968 + }, + { + "epoch": 0.22581239889618424, + "grad_norm": 1.6377054452896118, + "learning_rate": 4.3968953067319777e-05, + "loss": 5.5701, + "step": 37969 + }, + { + "epoch": 0.22581834617946522, + "grad_norm": 1.588903784751892, + "learning_rate": 4.396864880887539e-05, + "loss": 4.9223, + "step": 37970 + }, + { + "epoch": 0.2258242934627462, + "grad_norm": 1.8022443056106567, + "learning_rate": 4.396834454380925e-05, + "loss": 4.6535, + "step": 37971 + }, + { + "epoch": 0.22583024074602723, + "grad_norm": 1.7296810150146484, + "learning_rate": 4.3968040272121456e-05, + "loss": 4.9426, + "step": 37972 + }, + { + "epoch": 0.22583618802930822, + "grad_norm": 1.7298763990402222, + "learning_rate": 4.3967735993812114e-05, + "loss": 5.2988, + "step": 37973 + }, + { + "epoch": 0.2258421353125892, + "grad_norm": 1.6843442916870117, + "learning_rate": 4.396743170888133e-05, + "loss": 5.0539, + "step": 37974 + }, + { + "epoch": 0.22584808259587022, + "grad_norm": 1.5394823551177979, + "learning_rate": 4.396712741732921e-05, + "loss": 5.0311, + "step": 37975 + }, + { + "epoch": 0.2258540298791512, + "grad_norm": 1.322674036026001, + "learning_rate": 4.396682311915586e-05, + "loss": 4.8644, + "step": 37976 + }, + { + "epoch": 0.2258599771624322, + "grad_norm": 1.8961514234542847, + "learning_rate": 4.39665188143614e-05, + "loss": 4.2571, + "step": 37977 + }, + { + "epoch": 0.2258659244457132, + "grad_norm": 1.3595390319824219, + "learning_rate": 4.3966214502945915e-05, + "loss": 4.7363, + "step": 37978 + }, + { + "epoch": 0.2258718717289942, + "grad_norm": 1.3047689199447632, + "learning_rate": 4.396591018490953e-05, + "loss": 4.8923, + "step": 37979 + }, + { + "epoch": 0.2258778190122752, + "grad_norm": 1.4182853698730469, + "learning_rate": 4.396560586025233e-05, + "loss": 4.883, + "step": 37980 + }, + { + "epoch": 0.22588376629555618, + "grad_norm": 1.3140445947647095, + "learning_rate": 4.3965301528974434e-05, + "loss": 4.7423, + "step": 37981 + }, + { + "epoch": 0.2258897135788372, + "grad_norm": 1.4918787479400635, + "learning_rate": 4.396499719107595e-05, + "loss": 4.6526, + "step": 37982 + }, + { + "epoch": 0.22589566086211818, + "grad_norm": 1.5447934865951538, + "learning_rate": 4.396469284655699e-05, + "loss": 4.7521, + "step": 37983 + }, + { + "epoch": 0.22590160814539917, + "grad_norm": 1.2894394397735596, + "learning_rate": 4.396438849541764e-05, + "loss": 4.8453, + "step": 37984 + }, + { + "epoch": 0.22590755542868018, + "grad_norm": 1.7194790840148926, + "learning_rate": 4.396408413765802e-05, + "loss": 4.7401, + "step": 37985 + }, + { + "epoch": 0.22591350271196117, + "grad_norm": 1.5272841453552246, + "learning_rate": 4.3963779773278234e-05, + "loss": 4.652, + "step": 37986 + }, + { + "epoch": 0.22591944999524216, + "grad_norm": 1.9356179237365723, + "learning_rate": 4.3963475402278395e-05, + "loss": 4.2807, + "step": 37987 + }, + { + "epoch": 0.22592539727852318, + "grad_norm": 1.6426055431365967, + "learning_rate": 4.3963171024658586e-05, + "loss": 4.6845, + "step": 37988 + }, + { + "epoch": 0.22593134456180416, + "grad_norm": 1.666062593460083, + "learning_rate": 4.396286664041895e-05, + "loss": 4.8995, + "step": 37989 + }, + { + "epoch": 0.22593729184508515, + "grad_norm": 1.625488042831421, + "learning_rate": 4.3962562249559556e-05, + "loss": 4.9241, + "step": 37990 + }, + { + "epoch": 0.22594323912836617, + "grad_norm": 1.6422685384750366, + "learning_rate": 4.3962257852080545e-05, + "loss": 4.7108, + "step": 37991 + }, + { + "epoch": 0.22594918641164716, + "grad_norm": 1.582587718963623, + "learning_rate": 4.3961953447982e-05, + "loss": 4.7298, + "step": 37992 + }, + { + "epoch": 0.22595513369492815, + "grad_norm": 1.6675734519958496, + "learning_rate": 4.3961649037264025e-05, + "loss": 4.6544, + "step": 37993 + }, + { + "epoch": 0.22596108097820916, + "grad_norm": 1.6701778173446655, + "learning_rate": 4.3961344619926733e-05, + "loss": 4.1497, + "step": 37994 + }, + { + "epoch": 0.22596702826149015, + "grad_norm": 1.668684959411621, + "learning_rate": 4.396104019597024e-05, + "loss": 4.7531, + "step": 37995 + }, + { + "epoch": 0.22597297554477114, + "grad_norm": 1.6830800771713257, + "learning_rate": 4.396073576539465e-05, + "loss": 4.8988, + "step": 37996 + }, + { + "epoch": 0.22597892282805215, + "grad_norm": 1.5785146951675415, + "learning_rate": 4.3960431328200044e-05, + "loss": 4.868, + "step": 37997 + }, + { + "epoch": 0.22598487011133314, + "grad_norm": 1.5874582529067993, + "learning_rate": 4.396012688438656e-05, + "loss": 4.9942, + "step": 37998 + }, + { + "epoch": 0.22599081739461413, + "grad_norm": 1.4960954189300537, + "learning_rate": 4.395982243395429e-05, + "loss": 4.9711, + "step": 37999 + }, + { + "epoch": 0.22599676467789515, + "grad_norm": 1.6689503192901611, + "learning_rate": 4.3959517976903344e-05, + "loss": 4.6986, + "step": 38000 + }, + { + "epoch": 0.22600271196117613, + "grad_norm": 1.583571195602417, + "learning_rate": 4.3959213513233824e-05, + "loss": 4.9802, + "step": 38001 + }, + { + "epoch": 0.22600865924445712, + "grad_norm": 1.3584336042404175, + "learning_rate": 4.3958909042945826e-05, + "loss": 4.7263, + "step": 38002 + }, + { + "epoch": 0.22601460652773814, + "grad_norm": 1.5296635627746582, + "learning_rate": 4.3958604566039485e-05, + "loss": 4.7501, + "step": 38003 + }, + { + "epoch": 0.22602055381101913, + "grad_norm": 1.541918158531189, + "learning_rate": 4.395830008251489e-05, + "loss": 4.7631, + "step": 38004 + }, + { + "epoch": 0.22602650109430011, + "grad_norm": 1.5141816139221191, + "learning_rate": 4.395799559237214e-05, + "loss": 4.5206, + "step": 38005 + }, + { + "epoch": 0.22603244837758113, + "grad_norm": 2.4596874713897705, + "learning_rate": 4.395769109561136e-05, + "loss": 4.5737, + "step": 38006 + }, + { + "epoch": 0.22603839566086212, + "grad_norm": 1.8154200315475464, + "learning_rate": 4.395738659223264e-05, + "loss": 4.3293, + "step": 38007 + }, + { + "epoch": 0.2260443429441431, + "grad_norm": 1.490979790687561, + "learning_rate": 4.395708208223609e-05, + "loss": 4.7954, + "step": 38008 + }, + { + "epoch": 0.22605029022742412, + "grad_norm": 1.4490966796875, + "learning_rate": 4.395677756562182e-05, + "loss": 4.8062, + "step": 38009 + }, + { + "epoch": 0.2260562375107051, + "grad_norm": 1.597187876701355, + "learning_rate": 4.395647304238993e-05, + "loss": 4.7911, + "step": 38010 + }, + { + "epoch": 0.2260621847939861, + "grad_norm": 1.6224030256271362, + "learning_rate": 4.395616851254054e-05, + "loss": 4.8604, + "step": 38011 + }, + { + "epoch": 0.22606813207726711, + "grad_norm": 1.626530408859253, + "learning_rate": 4.3955863976073744e-05, + "loss": 4.7964, + "step": 38012 + }, + { + "epoch": 0.2260740793605481, + "grad_norm": 1.4275909662246704, + "learning_rate": 4.3955559432989654e-05, + "loss": 4.5712, + "step": 38013 + }, + { + "epoch": 0.2260800266438291, + "grad_norm": 1.3585506677627563, + "learning_rate": 4.3955254883288366e-05, + "loss": 4.5507, + "step": 38014 + }, + { + "epoch": 0.2260859739271101, + "grad_norm": 1.594292402267456, + "learning_rate": 4.395495032697e-05, + "loss": 4.664, + "step": 38015 + }, + { + "epoch": 0.2260919212103911, + "grad_norm": 1.5161908864974976, + "learning_rate": 4.3954645764034666e-05, + "loss": 4.6313, + "step": 38016 + }, + { + "epoch": 0.22609786849367208, + "grad_norm": 1.3704510927200317, + "learning_rate": 4.395434119448244e-05, + "loss": 4.6362, + "step": 38017 + }, + { + "epoch": 0.2261038157769531, + "grad_norm": 1.5801879167556763, + "learning_rate": 4.395403661831346e-05, + "loss": 4.5719, + "step": 38018 + }, + { + "epoch": 0.2261097630602341, + "grad_norm": 1.8262200355529785, + "learning_rate": 4.395373203552783e-05, + "loss": 4.9934, + "step": 38019 + }, + { + "epoch": 0.22611571034351508, + "grad_norm": 1.606871485710144, + "learning_rate": 4.395342744612564e-05, + "loss": 5.4006, + "step": 38020 + }, + { + "epoch": 0.2261216576267961, + "grad_norm": 1.888592004776001, + "learning_rate": 4.395312285010701e-05, + "loss": 5.181, + "step": 38021 + }, + { + "epoch": 0.22612760491007708, + "grad_norm": 1.703467845916748, + "learning_rate": 4.3952818247472025e-05, + "loss": 4.9956, + "step": 38022 + }, + { + "epoch": 0.22613355219335807, + "grad_norm": 1.8185619115829468, + "learning_rate": 4.3952513638220825e-05, + "loss": 4.193, + "step": 38023 + }, + { + "epoch": 0.22613949947663908, + "grad_norm": 1.9167721271514893, + "learning_rate": 4.395220902235349e-05, + "loss": 4.0476, + "step": 38024 + }, + { + "epoch": 0.22614544675992007, + "grad_norm": 1.4851292371749878, + "learning_rate": 4.395190439987014e-05, + "loss": 4.0727, + "step": 38025 + }, + { + "epoch": 0.22615139404320106, + "grad_norm": 2.399094581604004, + "learning_rate": 4.395159977077087e-05, + "loss": 4.1274, + "step": 38026 + }, + { + "epoch": 0.22615734132648208, + "grad_norm": 1.73356032371521, + "learning_rate": 4.395129513505579e-05, + "loss": 3.9943, + "step": 38027 + }, + { + "epoch": 0.22616328860976306, + "grad_norm": 1.5265543460845947, + "learning_rate": 4.395099049272501e-05, + "loss": 5.212, + "step": 38028 + }, + { + "epoch": 0.22616923589304405, + "grad_norm": 1.3660756349563599, + "learning_rate": 4.395068584377864e-05, + "loss": 5.3658, + "step": 38029 + }, + { + "epoch": 0.22617518317632507, + "grad_norm": 1.8043396472930908, + "learning_rate": 4.3950381188216786e-05, + "loss": 4.5968, + "step": 38030 + }, + { + "epoch": 0.22618113045960606, + "grad_norm": 1.3216902017593384, + "learning_rate": 4.395007652603954e-05, + "loss": 5.4593, + "step": 38031 + }, + { + "epoch": 0.22618707774288704, + "grad_norm": 1.3674670457839966, + "learning_rate": 4.394977185724701e-05, + "loss": 5.3019, + "step": 38032 + }, + { + "epoch": 0.22619302502616806, + "grad_norm": 1.3079349994659424, + "learning_rate": 4.3949467181839325e-05, + "loss": 5.3165, + "step": 38033 + }, + { + "epoch": 0.22619897230944905, + "grad_norm": 1.3201943635940552, + "learning_rate": 4.394916249981658e-05, + "loss": 5.1214, + "step": 38034 + }, + { + "epoch": 0.22620491959273004, + "grad_norm": 1.5240484476089478, + "learning_rate": 4.394885781117887e-05, + "loss": 4.6877, + "step": 38035 + }, + { + "epoch": 0.22621086687601105, + "grad_norm": 1.624353289604187, + "learning_rate": 4.3948553115926304e-05, + "loss": 4.9886, + "step": 38036 + }, + { + "epoch": 0.22621681415929204, + "grad_norm": 1.5491869449615479, + "learning_rate": 4.3948248414059004e-05, + "loss": 5.4594, + "step": 38037 + }, + { + "epoch": 0.22622276144257303, + "grad_norm": 1.5223517417907715, + "learning_rate": 4.394794370557706e-05, + "loss": 5.3304, + "step": 38038 + }, + { + "epoch": 0.22622870872585402, + "grad_norm": 1.1592520475387573, + "learning_rate": 4.394763899048059e-05, + "loss": 4.9766, + "step": 38039 + }, + { + "epoch": 0.22623465600913503, + "grad_norm": 1.6094484329223633, + "learning_rate": 4.394733426876969e-05, + "loss": 4.6776, + "step": 38040 + }, + { + "epoch": 0.22624060329241602, + "grad_norm": 1.5425121784210205, + "learning_rate": 4.394702954044447e-05, + "loss": 5.4675, + "step": 38041 + }, + { + "epoch": 0.226246550575697, + "grad_norm": 1.905106544494629, + "learning_rate": 4.3946724805505046e-05, + "loss": 4.2846, + "step": 38042 + }, + { + "epoch": 0.22625249785897802, + "grad_norm": 1.6905035972595215, + "learning_rate": 4.394642006395151e-05, + "loss": 5.0805, + "step": 38043 + }, + { + "epoch": 0.226258445142259, + "grad_norm": 1.8033732175827026, + "learning_rate": 4.3946115315783976e-05, + "loss": 4.765, + "step": 38044 + }, + { + "epoch": 0.22626439242554, + "grad_norm": 1.8025847673416138, + "learning_rate": 4.394581056100255e-05, + "loss": 5.1845, + "step": 38045 + }, + { + "epoch": 0.22627033970882102, + "grad_norm": 1.5593371391296387, + "learning_rate": 4.394550579960734e-05, + "loss": 4.9863, + "step": 38046 + }, + { + "epoch": 0.226276286992102, + "grad_norm": 1.5229204893112183, + "learning_rate": 4.394520103159844e-05, + "loss": 4.9358, + "step": 38047 + }, + { + "epoch": 0.226282234275383, + "grad_norm": 2.367879629135132, + "learning_rate": 4.3944896256975975e-05, + "loss": 3.7138, + "step": 38048 + }, + { + "epoch": 0.226288181558664, + "grad_norm": 2.867363452911377, + "learning_rate": 4.394459147574004e-05, + "loss": 2.8982, + "step": 38049 + }, + { + "epoch": 0.226294128841945, + "grad_norm": 2.827266216278076, + "learning_rate": 4.394428668789074e-05, + "loss": 2.8304, + "step": 38050 + }, + { + "epoch": 0.22630007612522599, + "grad_norm": 2.5809528827667236, + "learning_rate": 4.394398189342819e-05, + "loss": 3.289, + "step": 38051 + }, + { + "epoch": 0.226306023408507, + "grad_norm": 2.0330867767333984, + "learning_rate": 4.3943677092352485e-05, + "loss": 4.059, + "step": 38052 + }, + { + "epoch": 0.226311970691788, + "grad_norm": 2.52701997756958, + "learning_rate": 4.3943372284663745e-05, + "loss": 3.8987, + "step": 38053 + }, + { + "epoch": 0.22631791797506898, + "grad_norm": 2.8097949028015137, + "learning_rate": 4.3943067470362064e-05, + "loss": 3.6962, + "step": 38054 + }, + { + "epoch": 0.22632386525835, + "grad_norm": 2.9416728019714355, + "learning_rate": 4.394276264944757e-05, + "loss": 3.5676, + "step": 38055 + }, + { + "epoch": 0.22632981254163098, + "grad_norm": 2.539630174636841, + "learning_rate": 4.394245782192033e-05, + "loss": 3.1762, + "step": 38056 + }, + { + "epoch": 0.22633575982491197, + "grad_norm": 2.8900463581085205, + "learning_rate": 4.3942152987780485e-05, + "loss": 3.4844, + "step": 38057 + }, + { + "epoch": 0.22634170710819299, + "grad_norm": 2.119063377380371, + "learning_rate": 4.394184814702813e-05, + "loss": 4.0965, + "step": 38058 + }, + { + "epoch": 0.22634765439147397, + "grad_norm": 2.483552932739258, + "learning_rate": 4.394154329966337e-05, + "loss": 3.4461, + "step": 38059 + }, + { + "epoch": 0.22635360167475496, + "grad_norm": 2.3346595764160156, + "learning_rate": 4.394123844568632e-05, + "loss": 3.4765, + "step": 38060 + }, + { + "epoch": 0.22635954895803598, + "grad_norm": 2.0471270084381104, + "learning_rate": 4.394093358509706e-05, + "loss": 4.2491, + "step": 38061 + }, + { + "epoch": 0.22636549624131697, + "grad_norm": 1.5037319660186768, + "learning_rate": 4.3940628717895735e-05, + "loss": 5.1251, + "step": 38062 + }, + { + "epoch": 0.22637144352459795, + "grad_norm": 1.6269645690917969, + "learning_rate": 4.3940323844082426e-05, + "loss": 5.2124, + "step": 38063 + }, + { + "epoch": 0.22637739080787897, + "grad_norm": 2.6097071170806885, + "learning_rate": 4.3940018963657246e-05, + "loss": 3.2288, + "step": 38064 + }, + { + "epoch": 0.22638333809115996, + "grad_norm": 1.9301677942276, + "learning_rate": 4.39397140766203e-05, + "loss": 4.0461, + "step": 38065 + }, + { + "epoch": 0.22638928537444095, + "grad_norm": 1.5494807958602905, + "learning_rate": 4.39394091829717e-05, + "loss": 4.7948, + "step": 38066 + }, + { + "epoch": 0.22639523265772196, + "grad_norm": 1.5757453441619873, + "learning_rate": 4.393910428271154e-05, + "loss": 4.7356, + "step": 38067 + }, + { + "epoch": 0.22640117994100295, + "grad_norm": 1.5312561988830566, + "learning_rate": 4.393879937583994e-05, + "loss": 4.5272, + "step": 38068 + }, + { + "epoch": 0.22640712722428394, + "grad_norm": 1.6051007509231567, + "learning_rate": 4.3938494462356996e-05, + "loss": 4.8203, + "step": 38069 + }, + { + "epoch": 0.22641307450756495, + "grad_norm": 1.6052272319793701, + "learning_rate": 4.3938189542262824e-05, + "loss": 4.7911, + "step": 38070 + }, + { + "epoch": 0.22641902179084594, + "grad_norm": 1.5340666770935059, + "learning_rate": 4.3937884615557526e-05, + "loss": 5.0973, + "step": 38071 + }, + { + "epoch": 0.22642496907412693, + "grad_norm": 1.798746109008789, + "learning_rate": 4.3937579682241204e-05, + "loss": 5.0053, + "step": 38072 + }, + { + "epoch": 0.22643091635740795, + "grad_norm": 1.636568307876587, + "learning_rate": 4.393727474231397e-05, + "loss": 5.0263, + "step": 38073 + }, + { + "epoch": 0.22643686364068893, + "grad_norm": 1.5352871417999268, + "learning_rate": 4.393696979577593e-05, + "loss": 4.991, + "step": 38074 + }, + { + "epoch": 0.22644281092396992, + "grad_norm": 1.6464602947235107, + "learning_rate": 4.3936664842627194e-05, + "loss": 4.8768, + "step": 38075 + }, + { + "epoch": 0.22644875820725094, + "grad_norm": 1.8451437950134277, + "learning_rate": 4.393635988286786e-05, + "loss": 4.2802, + "step": 38076 + }, + { + "epoch": 0.22645470549053193, + "grad_norm": 1.7521929740905762, + "learning_rate": 4.393605491649804e-05, + "loss": 5.1212, + "step": 38077 + }, + { + "epoch": 0.22646065277381291, + "grad_norm": 1.8951425552368164, + "learning_rate": 4.3935749943517834e-05, + "loss": 5.2989, + "step": 38078 + }, + { + "epoch": 0.22646660005709393, + "grad_norm": 1.7104054689407349, + "learning_rate": 4.393544496392735e-05, + "loss": 4.3321, + "step": 38079 + }, + { + "epoch": 0.22647254734037492, + "grad_norm": 1.6039187908172607, + "learning_rate": 4.39351399777267e-05, + "loss": 5.1924, + "step": 38080 + }, + { + "epoch": 0.2264784946236559, + "grad_norm": 1.9055510759353638, + "learning_rate": 4.3934834984916004e-05, + "loss": 4.526, + "step": 38081 + }, + { + "epoch": 0.22648444190693692, + "grad_norm": 1.8152254819869995, + "learning_rate": 4.393452998549534e-05, + "loss": 4.6742, + "step": 38082 + }, + { + "epoch": 0.2264903891902179, + "grad_norm": 2.2788617610931396, + "learning_rate": 4.393422497946482e-05, + "loss": 3.7998, + "step": 38083 + }, + { + "epoch": 0.2264963364734989, + "grad_norm": 2.635610342025757, + "learning_rate": 4.393391996682456e-05, + "loss": 2.8398, + "step": 38084 + }, + { + "epoch": 0.22650228375677992, + "grad_norm": 2.5307788848876953, + "learning_rate": 4.393361494757468e-05, + "loss": 2.9038, + "step": 38085 + }, + { + "epoch": 0.2265082310400609, + "grad_norm": 2.791041612625122, + "learning_rate": 4.393330992171526e-05, + "loss": 2.8516, + "step": 38086 + }, + { + "epoch": 0.2265141783233419, + "grad_norm": 1.9251587390899658, + "learning_rate": 4.393300488924642e-05, + "loss": 3.5744, + "step": 38087 + }, + { + "epoch": 0.2265201256066229, + "grad_norm": 2.3512256145477295, + "learning_rate": 4.3932699850168254e-05, + "loss": 3.1597, + "step": 38088 + }, + { + "epoch": 0.2265260728899039, + "grad_norm": 2.621535539627075, + "learning_rate": 4.3932394804480895e-05, + "loss": 2.7866, + "step": 38089 + }, + { + "epoch": 0.22653202017318488, + "grad_norm": 2.6089329719543457, + "learning_rate": 4.3932089752184416e-05, + "loss": 2.9403, + "step": 38090 + }, + { + "epoch": 0.2265379674564659, + "grad_norm": 3.0819029808044434, + "learning_rate": 4.393178469327895e-05, + "loss": 2.8429, + "step": 38091 + }, + { + "epoch": 0.2265439147397469, + "grad_norm": 2.8655409812927246, + "learning_rate": 4.3931479627764585e-05, + "loss": 3.2155, + "step": 38092 + }, + { + "epoch": 0.22654986202302788, + "grad_norm": 2.843642473220825, + "learning_rate": 4.393117455564145e-05, + "loss": 2.8215, + "step": 38093 + }, + { + "epoch": 0.2265558093063089, + "grad_norm": 2.7354695796966553, + "learning_rate": 4.393086947690963e-05, + "loss": 3.461, + "step": 38094 + }, + { + "epoch": 0.22656175658958988, + "grad_norm": 2.8338818550109863, + "learning_rate": 4.393056439156923e-05, + "loss": 2.9972, + "step": 38095 + }, + { + "epoch": 0.22656770387287087, + "grad_norm": 2.944443464279175, + "learning_rate": 4.393025929962037e-05, + "loss": 2.1083, + "step": 38096 + }, + { + "epoch": 0.22657365115615186, + "grad_norm": 1.9833786487579346, + "learning_rate": 4.392995420106316e-05, + "loss": 2.2152, + "step": 38097 + }, + { + "epoch": 0.22657959843943287, + "grad_norm": 3.4136886596679688, + "learning_rate": 4.39296490958977e-05, + "loss": 2.3506, + "step": 38098 + }, + { + "epoch": 0.22658554572271386, + "grad_norm": 3.2970211505889893, + "learning_rate": 4.392934398412408e-05, + "loss": 2.144, + "step": 38099 + }, + { + "epoch": 0.22659149300599485, + "grad_norm": 2.7194559574127197, + "learning_rate": 4.392903886574243e-05, + "loss": 1.1954, + "step": 38100 + }, + { + "epoch": 0.22659744028927586, + "grad_norm": 2.454838991165161, + "learning_rate": 4.392873374075286e-05, + "loss": 0.8838, + "step": 38101 + }, + { + "epoch": 0.22660338757255685, + "grad_norm": 4.048086166381836, + "learning_rate": 4.392842860915545e-05, + "loss": 3.6415, + "step": 38102 + }, + { + "epoch": 0.22660933485583784, + "grad_norm": 2.8631486892700195, + "learning_rate": 4.392812347095032e-05, + "loss": 2.3429, + "step": 38103 + }, + { + "epoch": 0.22661528213911886, + "grad_norm": 3.0390071868896484, + "learning_rate": 4.392781832613758e-05, + "loss": 3.7672, + "step": 38104 + }, + { + "epoch": 0.22662122942239984, + "grad_norm": 3.5983376502990723, + "learning_rate": 4.392751317471734e-05, + "loss": 3.4215, + "step": 38105 + }, + { + "epoch": 0.22662717670568083, + "grad_norm": 2.0240063667297363, + "learning_rate": 4.3927208016689704e-05, + "loss": 4.8848, + "step": 38106 + }, + { + "epoch": 0.22663312398896185, + "grad_norm": 1.9214985370635986, + "learning_rate": 4.392690285205476e-05, + "loss": 5.3785, + "step": 38107 + }, + { + "epoch": 0.22663907127224284, + "grad_norm": 1.5077999830245972, + "learning_rate": 4.3926597680812644e-05, + "loss": 5.3017, + "step": 38108 + }, + { + "epoch": 0.22664501855552383, + "grad_norm": 1.9332648515701294, + "learning_rate": 4.392629250296344e-05, + "loss": 4.5846, + "step": 38109 + }, + { + "epoch": 0.22665096583880484, + "grad_norm": 1.925647258758545, + "learning_rate": 4.392598731850727e-05, + "loss": 4.1814, + "step": 38110 + }, + { + "epoch": 0.22665691312208583, + "grad_norm": 1.6901360750198364, + "learning_rate": 4.392568212744422e-05, + "loss": 5.4319, + "step": 38111 + }, + { + "epoch": 0.22666286040536682, + "grad_norm": 1.7555872201919556, + "learning_rate": 4.392537692977442e-05, + "loss": 5.2164, + "step": 38112 + }, + { + "epoch": 0.22666880768864783, + "grad_norm": 2.7938854694366455, + "learning_rate": 4.392507172549797e-05, + "loss": 3.4674, + "step": 38113 + }, + { + "epoch": 0.22667475497192882, + "grad_norm": 2.125744342803955, + "learning_rate": 4.3924766514614966e-05, + "loss": 3.5049, + "step": 38114 + }, + { + "epoch": 0.2266807022552098, + "grad_norm": 4.356492042541504, + "learning_rate": 4.392446129712552e-05, + "loss": 1.4819, + "step": 38115 + }, + { + "epoch": 0.22668664953849083, + "grad_norm": 2.790475845336914, + "learning_rate": 4.3924156073029746e-05, + "loss": 2.8582, + "step": 38116 + }, + { + "epoch": 0.2266925968217718, + "grad_norm": 1.6480002403259277, + "learning_rate": 4.392385084232774e-05, + "loss": 4.897, + "step": 38117 + }, + { + "epoch": 0.2266985441050528, + "grad_norm": 1.56727135181427, + "learning_rate": 4.392354560501962e-05, + "loss": 4.431, + "step": 38118 + }, + { + "epoch": 0.22670449138833382, + "grad_norm": 2.931684970855713, + "learning_rate": 4.3923240361105476e-05, + "loss": 1.9593, + "step": 38119 + }, + { + "epoch": 0.2267104386716148, + "grad_norm": 2.1170263290405273, + "learning_rate": 4.3922935110585425e-05, + "loss": 4.5195, + "step": 38120 + }, + { + "epoch": 0.2267163859548958, + "grad_norm": 2.30966854095459, + "learning_rate": 4.392262985345959e-05, + "loss": 4.8419, + "step": 38121 + }, + { + "epoch": 0.2267223332381768, + "grad_norm": 1.6240100860595703, + "learning_rate": 4.3922324589728045e-05, + "loss": 4.95, + "step": 38122 + }, + { + "epoch": 0.2267282805214578, + "grad_norm": 2.074064254760742, + "learning_rate": 4.392201931939091e-05, + "loss": 4.658, + "step": 38123 + }, + { + "epoch": 0.22673422780473879, + "grad_norm": 2.826097011566162, + "learning_rate": 4.3921714042448306e-05, + "loss": 4.3729, + "step": 38124 + }, + { + "epoch": 0.2267401750880198, + "grad_norm": 2.2940187454223633, + "learning_rate": 4.392140875890032e-05, + "loss": 4.7278, + "step": 38125 + }, + { + "epoch": 0.2267461223713008, + "grad_norm": 2.0838935375213623, + "learning_rate": 4.392110346874706e-05, + "loss": 4.8679, + "step": 38126 + }, + { + "epoch": 0.22675206965458178, + "grad_norm": 2.258232593536377, + "learning_rate": 4.392079817198864e-05, + "loss": 4.3999, + "step": 38127 + }, + { + "epoch": 0.2267580169378628, + "grad_norm": 1.7443938255310059, + "learning_rate": 4.392049286862517e-05, + "loss": 4.5488, + "step": 38128 + }, + { + "epoch": 0.22676396422114378, + "grad_norm": 1.6828581094741821, + "learning_rate": 4.392018755865674e-05, + "loss": 4.7467, + "step": 38129 + }, + { + "epoch": 0.22676991150442477, + "grad_norm": 2.271589994430542, + "learning_rate": 4.391988224208349e-05, + "loss": 4.4422, + "step": 38130 + }, + { + "epoch": 0.2267758587877058, + "grad_norm": 3.1619069576263428, + "learning_rate": 4.3919576918905495e-05, + "loss": 4.1281, + "step": 38131 + }, + { + "epoch": 0.22678180607098677, + "grad_norm": 2.716963529586792, + "learning_rate": 4.391927158912286e-05, + "loss": 4.0466, + "step": 38132 + }, + { + "epoch": 0.22678775335426776, + "grad_norm": 2.644996166229248, + "learning_rate": 4.391896625273572e-05, + "loss": 4.0363, + "step": 38133 + }, + { + "epoch": 0.22679370063754878, + "grad_norm": 2.2756507396698, + "learning_rate": 4.391866090974415e-05, + "loss": 4.0494, + "step": 38134 + }, + { + "epoch": 0.22679964792082977, + "grad_norm": 2.7238388061523438, + "learning_rate": 4.3918355560148275e-05, + "loss": 3.9859, + "step": 38135 + }, + { + "epoch": 0.22680559520411075, + "grad_norm": 2.129974842071533, + "learning_rate": 4.39180502039482e-05, + "loss": 3.8574, + "step": 38136 + }, + { + "epoch": 0.22681154248739177, + "grad_norm": 2.485243558883667, + "learning_rate": 4.3917744841144024e-05, + "loss": 3.6808, + "step": 38137 + }, + { + "epoch": 0.22681748977067276, + "grad_norm": 2.380950927734375, + "learning_rate": 4.3917439471735865e-05, + "loss": 3.6885, + "step": 38138 + }, + { + "epoch": 0.22682343705395375, + "grad_norm": 2.5191190242767334, + "learning_rate": 4.391713409572382e-05, + "loss": 3.3835, + "step": 38139 + }, + { + "epoch": 0.22682938433723476, + "grad_norm": 2.4364421367645264, + "learning_rate": 4.3916828713108e-05, + "loss": 3.6194, + "step": 38140 + }, + { + "epoch": 0.22683533162051575, + "grad_norm": 2.982856512069702, + "learning_rate": 4.391652332388851e-05, + "loss": 3.4352, + "step": 38141 + }, + { + "epoch": 0.22684127890379674, + "grad_norm": 3.9172885417938232, + "learning_rate": 4.391621792806546e-05, + "loss": 2.2642, + "step": 38142 + }, + { + "epoch": 0.22684722618707776, + "grad_norm": 2.8479931354522705, + "learning_rate": 4.3915912525638944e-05, + "loss": 3.3206, + "step": 38143 + }, + { + "epoch": 0.22685317347035874, + "grad_norm": 1.826278567314148, + "learning_rate": 4.391560711660909e-05, + "loss": 4.1807, + "step": 38144 + }, + { + "epoch": 0.22685912075363973, + "grad_norm": 2.9397122859954834, + "learning_rate": 4.3915301700975985e-05, + "loss": 3.0936, + "step": 38145 + }, + { + "epoch": 0.22686506803692075, + "grad_norm": 3.786853313446045, + "learning_rate": 4.391499627873975e-05, + "loss": 2.7473, + "step": 38146 + }, + { + "epoch": 0.22687101532020174, + "grad_norm": 3.610485792160034, + "learning_rate": 4.391469084990049e-05, + "loss": 2.7369, + "step": 38147 + }, + { + "epoch": 0.22687696260348272, + "grad_norm": 3.4012134075164795, + "learning_rate": 4.391438541445829e-05, + "loss": 2.8883, + "step": 38148 + }, + { + "epoch": 0.22688290988676374, + "grad_norm": 2.813325881958008, + "learning_rate": 4.3914079972413294e-05, + "loss": 4.0143, + "step": 38149 + }, + { + "epoch": 0.22688885717004473, + "grad_norm": 1.4918662309646606, + "learning_rate": 4.391377452376557e-05, + "loss": 5.3134, + "step": 38150 + }, + { + "epoch": 0.22689480445332572, + "grad_norm": 2.268364906311035, + "learning_rate": 4.391346906851524e-05, + "loss": 4.001, + "step": 38151 + }, + { + "epoch": 0.22690075173660673, + "grad_norm": 2.87863826751709, + "learning_rate": 4.3913163606662436e-05, + "loss": 2.2376, + "step": 38152 + }, + { + "epoch": 0.22690669901988772, + "grad_norm": 2.252005100250244, + "learning_rate": 4.3912858138207225e-05, + "loss": 2.9713, + "step": 38153 + }, + { + "epoch": 0.2269126463031687, + "grad_norm": 1.636132836341858, + "learning_rate": 4.391255266314973e-05, + "loss": 4.9184, + "step": 38154 + }, + { + "epoch": 0.2269185935864497, + "grad_norm": 1.5946236848831177, + "learning_rate": 4.391224718149007e-05, + "loss": 4.8575, + "step": 38155 + }, + { + "epoch": 0.2269245408697307, + "grad_norm": 1.6881130933761597, + "learning_rate": 4.391194169322833e-05, + "loss": 5.0459, + "step": 38156 + }, + { + "epoch": 0.2269304881530117, + "grad_norm": 1.6183825731277466, + "learning_rate": 4.3911636198364627e-05, + "loss": 4.854, + "step": 38157 + }, + { + "epoch": 0.2269364354362927, + "grad_norm": 1.6183366775512695, + "learning_rate": 4.391133069689907e-05, + "loss": 4.4129, + "step": 38158 + }, + { + "epoch": 0.2269423827195737, + "grad_norm": 1.561302900314331, + "learning_rate": 4.391102518883177e-05, + "loss": 5.1313, + "step": 38159 + }, + { + "epoch": 0.2269483300028547, + "grad_norm": 1.5048106908798218, + "learning_rate": 4.391071967416282e-05, + "loss": 4.7604, + "step": 38160 + }, + { + "epoch": 0.22695427728613568, + "grad_norm": 1.5101484060287476, + "learning_rate": 4.3910414152892324e-05, + "loss": 5.1585, + "step": 38161 + }, + { + "epoch": 0.2269602245694167, + "grad_norm": 1.4586681127548218, + "learning_rate": 4.391010862502041e-05, + "loss": 4.821, + "step": 38162 + }, + { + "epoch": 0.22696617185269768, + "grad_norm": 1.507530927658081, + "learning_rate": 4.3909803090547164e-05, + "loss": 4.8342, + "step": 38163 + }, + { + "epoch": 0.22697211913597867, + "grad_norm": 1.4657213687896729, + "learning_rate": 4.390949754947271e-05, + "loss": 5.2703, + "step": 38164 + }, + { + "epoch": 0.2269780664192597, + "grad_norm": 1.2653342485427856, + "learning_rate": 4.390919200179714e-05, + "loss": 5.6871, + "step": 38165 + }, + { + "epoch": 0.22698401370254068, + "grad_norm": 1.371896505355835, + "learning_rate": 4.3908886447520565e-05, + "loss": 5.7635, + "step": 38166 + }, + { + "epoch": 0.22698996098582166, + "grad_norm": 1.38970947265625, + "learning_rate": 4.390858088664309e-05, + "loss": 5.5381, + "step": 38167 + }, + { + "epoch": 0.22699590826910268, + "grad_norm": 1.649964451789856, + "learning_rate": 4.390827531916484e-05, + "loss": 4.0239, + "step": 38168 + }, + { + "epoch": 0.22700185555238367, + "grad_norm": 1.8974918127059937, + "learning_rate": 4.390796974508589e-05, + "loss": 4.1992, + "step": 38169 + }, + { + "epoch": 0.22700780283566466, + "grad_norm": 1.6561975479125977, + "learning_rate": 4.390766416440637e-05, + "loss": 5.049, + "step": 38170 + }, + { + "epoch": 0.22701375011894567, + "grad_norm": 1.6487138271331787, + "learning_rate": 4.390735857712638e-05, + "loss": 4.1484, + "step": 38171 + }, + { + "epoch": 0.22701969740222666, + "grad_norm": 1.9046709537506104, + "learning_rate": 4.3907052983246024e-05, + "loss": 4.2686, + "step": 38172 + }, + { + "epoch": 0.22702564468550765, + "grad_norm": 1.9332301616668701, + "learning_rate": 4.390674738276542e-05, + "loss": 4.2826, + "step": 38173 + }, + { + "epoch": 0.22703159196878867, + "grad_norm": 1.6239649057388306, + "learning_rate": 4.3906441775684654e-05, + "loss": 3.9571, + "step": 38174 + }, + { + "epoch": 0.22703753925206965, + "grad_norm": 1.541169285774231, + "learning_rate": 4.3906136162003855e-05, + "loss": 4.5711, + "step": 38175 + }, + { + "epoch": 0.22704348653535064, + "grad_norm": 1.7647597789764404, + "learning_rate": 4.390583054172311e-05, + "loss": 4.8254, + "step": 38176 + }, + { + "epoch": 0.22704943381863166, + "grad_norm": 1.7445447444915771, + "learning_rate": 4.3905524914842536e-05, + "loss": 4.2946, + "step": 38177 + }, + { + "epoch": 0.22705538110191265, + "grad_norm": 1.5402419567108154, + "learning_rate": 4.390521928136224e-05, + "loss": 3.6745, + "step": 38178 + }, + { + "epoch": 0.22706132838519363, + "grad_norm": 1.652811884880066, + "learning_rate": 4.390491364128233e-05, + "loss": 3.652, + "step": 38179 + }, + { + "epoch": 0.22706727566847465, + "grad_norm": 1.7374279499053955, + "learning_rate": 4.390460799460291e-05, + "loss": 3.859, + "step": 38180 + }, + { + "epoch": 0.22707322295175564, + "grad_norm": 1.6328239440917969, + "learning_rate": 4.390430234132408e-05, + "loss": 3.5833, + "step": 38181 + }, + { + "epoch": 0.22707917023503663, + "grad_norm": 1.8972036838531494, + "learning_rate": 4.390399668144596e-05, + "loss": 3.6967, + "step": 38182 + }, + { + "epoch": 0.22708511751831764, + "grad_norm": 1.620894432067871, + "learning_rate": 4.390369101496865e-05, + "loss": 3.6897, + "step": 38183 + }, + { + "epoch": 0.22709106480159863, + "grad_norm": 1.7540589570999146, + "learning_rate": 4.3903385341892255e-05, + "loss": 3.7719, + "step": 38184 + }, + { + "epoch": 0.22709701208487962, + "grad_norm": 1.9118608236312866, + "learning_rate": 4.390307966221688e-05, + "loss": 4.484, + "step": 38185 + }, + { + "epoch": 0.22710295936816063, + "grad_norm": 1.9014246463775635, + "learning_rate": 4.390277397594264e-05, + "loss": 4.0257, + "step": 38186 + }, + { + "epoch": 0.22710890665144162, + "grad_norm": 2.3183906078338623, + "learning_rate": 4.3902468283069644e-05, + "loss": 3.8989, + "step": 38187 + }, + { + "epoch": 0.2271148539347226, + "grad_norm": 2.24916934967041, + "learning_rate": 4.390216258359798e-05, + "loss": 4.0533, + "step": 38188 + }, + { + "epoch": 0.22712080121800363, + "grad_norm": 2.352621555328369, + "learning_rate": 4.390185687752777e-05, + "loss": 3.9287, + "step": 38189 + }, + { + "epoch": 0.22712674850128461, + "grad_norm": 2.341827154159546, + "learning_rate": 4.390155116485911e-05, + "loss": 4.0099, + "step": 38190 + }, + { + "epoch": 0.2271326957845656, + "grad_norm": 1.7769436836242676, + "learning_rate": 4.3901245445592126e-05, + "loss": 4.5457, + "step": 38191 + }, + { + "epoch": 0.22713864306784662, + "grad_norm": 1.550155520439148, + "learning_rate": 4.390093971972691e-05, + "loss": 4.5249, + "step": 38192 + }, + { + "epoch": 0.2271445903511276, + "grad_norm": 1.677330493927002, + "learning_rate": 4.390063398726356e-05, + "loss": 5.328, + "step": 38193 + }, + { + "epoch": 0.2271505376344086, + "grad_norm": 1.6579426527023315, + "learning_rate": 4.39003282482022e-05, + "loss": 5.6862, + "step": 38194 + }, + { + "epoch": 0.2271564849176896, + "grad_norm": 1.6280534267425537, + "learning_rate": 4.3900022502542937e-05, + "loss": 4.9508, + "step": 38195 + }, + { + "epoch": 0.2271624322009706, + "grad_norm": 1.763191819190979, + "learning_rate": 4.389971675028587e-05, + "loss": 4.379, + "step": 38196 + }, + { + "epoch": 0.2271683794842516, + "grad_norm": 1.6606146097183228, + "learning_rate": 4.38994109914311e-05, + "loss": 4.5277, + "step": 38197 + }, + { + "epoch": 0.2271743267675326, + "grad_norm": 1.5619972944259644, + "learning_rate": 4.3899105225978756e-05, + "loss": 4.8777, + "step": 38198 + }, + { + "epoch": 0.2271802740508136, + "grad_norm": 1.8204611539840698, + "learning_rate": 4.389879945392891e-05, + "loss": 4.5978, + "step": 38199 + }, + { + "epoch": 0.22718622133409458, + "grad_norm": 1.9288609027862549, + "learning_rate": 4.38984936752817e-05, + "loss": 4.2724, + "step": 38200 + }, + { + "epoch": 0.2271921686173756, + "grad_norm": 1.7813549041748047, + "learning_rate": 4.389818789003722e-05, + "loss": 4.5287, + "step": 38201 + }, + { + "epoch": 0.22719811590065658, + "grad_norm": 1.5712015628814697, + "learning_rate": 4.389788209819558e-05, + "loss": 4.9682, + "step": 38202 + }, + { + "epoch": 0.22720406318393757, + "grad_norm": 1.4615222215652466, + "learning_rate": 4.3897576299756875e-05, + "loss": 4.6088, + "step": 38203 + }, + { + "epoch": 0.2272100104672186, + "grad_norm": 1.755202293395996, + "learning_rate": 4.389727049472123e-05, + "loss": 4.2747, + "step": 38204 + }, + { + "epoch": 0.22721595775049958, + "grad_norm": 2.1010637283325195, + "learning_rate": 4.3896964683088736e-05, + "loss": 4.1787, + "step": 38205 + }, + { + "epoch": 0.22722190503378056, + "grad_norm": 1.8035705089569092, + "learning_rate": 4.3896658864859516e-05, + "loss": 4.1858, + "step": 38206 + }, + { + "epoch": 0.22722785231706158, + "grad_norm": 1.9293487071990967, + "learning_rate": 4.3896353040033664e-05, + "loss": 4.3255, + "step": 38207 + }, + { + "epoch": 0.22723379960034257, + "grad_norm": 2.041259527206421, + "learning_rate": 4.389604720861128e-05, + "loss": 4.223, + "step": 38208 + }, + { + "epoch": 0.22723974688362356, + "grad_norm": 1.932900309562683, + "learning_rate": 4.389574137059249e-05, + "loss": 4.4073, + "step": 38209 + }, + { + "epoch": 0.22724569416690457, + "grad_norm": 1.8900656700134277, + "learning_rate": 4.38954355259774e-05, + "loss": 4.6547, + "step": 38210 + }, + { + "epoch": 0.22725164145018556, + "grad_norm": 1.7553825378417969, + "learning_rate": 4.38951296747661e-05, + "loss": 4.5234, + "step": 38211 + }, + { + "epoch": 0.22725758873346655, + "grad_norm": 1.724029779434204, + "learning_rate": 4.3894823816958705e-05, + "loss": 4.1834, + "step": 38212 + }, + { + "epoch": 0.22726353601674754, + "grad_norm": 1.871026873588562, + "learning_rate": 4.389451795255533e-05, + "loss": 4.1964, + "step": 38213 + }, + { + "epoch": 0.22726948330002855, + "grad_norm": 1.8918932676315308, + "learning_rate": 4.389421208155606e-05, + "loss": 4.2043, + "step": 38214 + }, + { + "epoch": 0.22727543058330954, + "grad_norm": 1.5714900493621826, + "learning_rate": 4.389390620396102e-05, + "loss": 4.5157, + "step": 38215 + }, + { + "epoch": 0.22728137786659053, + "grad_norm": 1.7289087772369385, + "learning_rate": 4.389360031977032e-05, + "loss": 4.8163, + "step": 38216 + }, + { + "epoch": 0.22728732514987154, + "grad_norm": 1.7521272897720337, + "learning_rate": 4.389329442898406e-05, + "loss": 4.6082, + "step": 38217 + }, + { + "epoch": 0.22729327243315253, + "grad_norm": 1.6807644367218018, + "learning_rate": 4.389298853160234e-05, + "loss": 4.7829, + "step": 38218 + }, + { + "epoch": 0.22729921971643352, + "grad_norm": 1.5811291933059692, + "learning_rate": 4.389268262762527e-05, + "loss": 4.2259, + "step": 38219 + }, + { + "epoch": 0.22730516699971454, + "grad_norm": 1.6493738889694214, + "learning_rate": 4.3892376717052964e-05, + "loss": 4.7039, + "step": 38220 + }, + { + "epoch": 0.22731111428299552, + "grad_norm": 1.739897608757019, + "learning_rate": 4.389207079988552e-05, + "loss": 4.2059, + "step": 38221 + }, + { + "epoch": 0.2273170615662765, + "grad_norm": 1.8707149028778076, + "learning_rate": 4.389176487612306e-05, + "loss": 4.0741, + "step": 38222 + }, + { + "epoch": 0.22732300884955753, + "grad_norm": 1.7619922161102295, + "learning_rate": 4.389145894576567e-05, + "loss": 4.796, + "step": 38223 + }, + { + "epoch": 0.22732895613283852, + "grad_norm": 1.7651190757751465, + "learning_rate": 4.389115300881347e-05, + "loss": 4.5949, + "step": 38224 + }, + { + "epoch": 0.2273349034161195, + "grad_norm": 1.6352847814559937, + "learning_rate": 4.3890847065266564e-05, + "loss": 4.8217, + "step": 38225 + }, + { + "epoch": 0.22734085069940052, + "grad_norm": 2.0873641967773438, + "learning_rate": 4.389054111512506e-05, + "loss": 4.0475, + "step": 38226 + }, + { + "epoch": 0.2273467979826815, + "grad_norm": 2.021920919418335, + "learning_rate": 4.389023515838906e-05, + "loss": 4.086, + "step": 38227 + }, + { + "epoch": 0.2273527452659625, + "grad_norm": 2.1906380653381348, + "learning_rate": 4.388992919505868e-05, + "loss": 3.4877, + "step": 38228 + }, + { + "epoch": 0.2273586925492435, + "grad_norm": 2.515861749649048, + "learning_rate": 4.3889623225134016e-05, + "loss": 3.6833, + "step": 38229 + }, + { + "epoch": 0.2273646398325245, + "grad_norm": 2.3089938163757324, + "learning_rate": 4.388931724861518e-05, + "loss": 3.4421, + "step": 38230 + }, + { + "epoch": 0.2273705871158055, + "grad_norm": 2.258147716522217, + "learning_rate": 4.388901126550228e-05, + "loss": 3.6366, + "step": 38231 + }, + { + "epoch": 0.2273765343990865, + "grad_norm": 1.763493299484253, + "learning_rate": 4.388870527579542e-05, + "loss": 4.0498, + "step": 38232 + }, + { + "epoch": 0.2273824816823675, + "grad_norm": 2.237896680831909, + "learning_rate": 4.3888399279494705e-05, + "loss": 3.534, + "step": 38233 + }, + { + "epoch": 0.22738842896564848, + "grad_norm": 2.1409060955047607, + "learning_rate": 4.3888093276600254e-05, + "loss": 3.4942, + "step": 38234 + }, + { + "epoch": 0.2273943762489295, + "grad_norm": 2.0923609733581543, + "learning_rate": 4.388778726711216e-05, + "loss": 3.5556, + "step": 38235 + }, + { + "epoch": 0.22740032353221049, + "grad_norm": 2.1950254440307617, + "learning_rate": 4.3887481251030524e-05, + "loss": 3.5715, + "step": 38236 + }, + { + "epoch": 0.22740627081549147, + "grad_norm": 2.0914371013641357, + "learning_rate": 4.388717522835548e-05, + "loss": 3.6515, + "step": 38237 + }, + { + "epoch": 0.2274122180987725, + "grad_norm": 2.1977272033691406, + "learning_rate": 4.388686919908711e-05, + "loss": 3.7544, + "step": 38238 + }, + { + "epoch": 0.22741816538205348, + "grad_norm": 2.4383692741394043, + "learning_rate": 4.388656316322553e-05, + "loss": 3.569, + "step": 38239 + }, + { + "epoch": 0.22742411266533447, + "grad_norm": 2.297513484954834, + "learning_rate": 4.388625712077085e-05, + "loss": 3.6434, + "step": 38240 + }, + { + "epoch": 0.22743005994861548, + "grad_norm": 2.3670594692230225, + "learning_rate": 4.388595107172317e-05, + "loss": 3.7064, + "step": 38241 + }, + { + "epoch": 0.22743600723189647, + "grad_norm": 1.7966325283050537, + "learning_rate": 4.3885645016082596e-05, + "loss": 4.2793, + "step": 38242 + }, + { + "epoch": 0.22744195451517746, + "grad_norm": 1.7755250930786133, + "learning_rate": 4.3885338953849245e-05, + "loss": 5.1079, + "step": 38243 + }, + { + "epoch": 0.22744790179845847, + "grad_norm": 1.92861008644104, + "learning_rate": 4.388503288502321e-05, + "loss": 3.8551, + "step": 38244 + }, + { + "epoch": 0.22745384908173946, + "grad_norm": 1.834679365158081, + "learning_rate": 4.38847268096046e-05, + "loss": 4.9659, + "step": 38245 + }, + { + "epoch": 0.22745979636502045, + "grad_norm": 2.094355821609497, + "learning_rate": 4.388442072759355e-05, + "loss": 4.5049, + "step": 38246 + }, + { + "epoch": 0.22746574364830147, + "grad_norm": 1.6657506227493286, + "learning_rate": 4.388411463899012e-05, + "loss": 4.6536, + "step": 38247 + }, + { + "epoch": 0.22747169093158245, + "grad_norm": 2.0281357765197754, + "learning_rate": 4.388380854379445e-05, + "loss": 4.9256, + "step": 38248 + }, + { + "epoch": 0.22747763821486344, + "grad_norm": 1.6166311502456665, + "learning_rate": 4.388350244200664e-05, + "loss": 4.7357, + "step": 38249 + }, + { + "epoch": 0.22748358549814446, + "grad_norm": 1.9163943529129028, + "learning_rate": 4.38831963336268e-05, + "loss": 4.3306, + "step": 38250 + }, + { + "epoch": 0.22748953278142545, + "grad_norm": 1.8184092044830322, + "learning_rate": 4.3882890218655015e-05, + "loss": 4.4076, + "step": 38251 + }, + { + "epoch": 0.22749548006470643, + "grad_norm": 1.776785135269165, + "learning_rate": 4.388258409709142e-05, + "loss": 5.2, + "step": 38252 + }, + { + "epoch": 0.22750142734798745, + "grad_norm": 1.422700047492981, + "learning_rate": 4.38822779689361e-05, + "loss": 5.3973, + "step": 38253 + }, + { + "epoch": 0.22750737463126844, + "grad_norm": 1.5290848016738892, + "learning_rate": 4.388197183418918e-05, + "loss": 4.6603, + "step": 38254 + }, + { + "epoch": 0.22751332191454943, + "grad_norm": 2.919466972351074, + "learning_rate": 4.388166569285076e-05, + "loss": 4.2282, + "step": 38255 + }, + { + "epoch": 0.22751926919783044, + "grad_norm": 1.4154354333877563, + "learning_rate": 4.3881359544920945e-05, + "loss": 4.9397, + "step": 38256 + }, + { + "epoch": 0.22752521648111143, + "grad_norm": 1.3635774850845337, + "learning_rate": 4.388105339039984e-05, + "loss": 5.5165, + "step": 38257 + }, + { + "epoch": 0.22753116376439242, + "grad_norm": 1.314709186553955, + "learning_rate": 4.388074722928755e-05, + "loss": 5.652, + "step": 38258 + }, + { + "epoch": 0.22753711104767343, + "grad_norm": 1.4117316007614136, + "learning_rate": 4.3880441061584194e-05, + "loss": 5.5875, + "step": 38259 + }, + { + "epoch": 0.22754305833095442, + "grad_norm": 1.605210304260254, + "learning_rate": 4.388013488728986e-05, + "loss": 4.96, + "step": 38260 + }, + { + "epoch": 0.2275490056142354, + "grad_norm": 1.8368546962738037, + "learning_rate": 4.387982870640468e-05, + "loss": 4.722, + "step": 38261 + }, + { + "epoch": 0.22755495289751643, + "grad_norm": 1.5915447473526, + "learning_rate": 4.387952251892874e-05, + "loss": 4.7962, + "step": 38262 + }, + { + "epoch": 0.22756090018079742, + "grad_norm": 1.6231772899627686, + "learning_rate": 4.387921632486215e-05, + "loss": 4.1642, + "step": 38263 + }, + { + "epoch": 0.2275668474640784, + "grad_norm": 1.6547075510025024, + "learning_rate": 4.3878910124205034e-05, + "loss": 4.8041, + "step": 38264 + }, + { + "epoch": 0.22757279474735942, + "grad_norm": 2.497774600982666, + "learning_rate": 4.3878603916957474e-05, + "loss": 4.0578, + "step": 38265 + }, + { + "epoch": 0.2275787420306404, + "grad_norm": 5.403087139129639, + "learning_rate": 4.3878297703119584e-05, + "loss": 2.4185, + "step": 38266 + }, + { + "epoch": 0.2275846893139214, + "grad_norm": 2.073349952697754, + "learning_rate": 4.387799148269148e-05, + "loss": 3.2784, + "step": 38267 + }, + { + "epoch": 0.2275906365972024, + "grad_norm": 2.289799451828003, + "learning_rate": 4.387768525567327e-05, + "loss": 4.2342, + "step": 38268 + }, + { + "epoch": 0.2275965838804834, + "grad_norm": 2.4968581199645996, + "learning_rate": 4.3877379022065056e-05, + "loss": 4.2145, + "step": 38269 + }, + { + "epoch": 0.2276025311637644, + "grad_norm": 2.31097674369812, + "learning_rate": 4.387707278186693e-05, + "loss": 4.2879, + "step": 38270 + }, + { + "epoch": 0.22760847844704538, + "grad_norm": 3.108427047729492, + "learning_rate": 4.387676653507903e-05, + "loss": 2.3614, + "step": 38271 + }, + { + "epoch": 0.2276144257303264, + "grad_norm": 2.122161388397217, + "learning_rate": 4.387646028170144e-05, + "loss": 3.6171, + "step": 38272 + }, + { + "epoch": 0.22762037301360738, + "grad_norm": 1.9830756187438965, + "learning_rate": 4.387615402173427e-05, + "loss": 3.752, + "step": 38273 + }, + { + "epoch": 0.22762632029688837, + "grad_norm": 2.001715660095215, + "learning_rate": 4.387584775517763e-05, + "loss": 3.7583, + "step": 38274 + }, + { + "epoch": 0.22763226758016938, + "grad_norm": 2.0779619216918945, + "learning_rate": 4.3875541482031626e-05, + "loss": 3.8147, + "step": 38275 + }, + { + "epoch": 0.22763821486345037, + "grad_norm": 1.901292324066162, + "learning_rate": 4.387523520229637e-05, + "loss": 3.9703, + "step": 38276 + }, + { + "epoch": 0.22764416214673136, + "grad_norm": 1.7687041759490967, + "learning_rate": 4.3874928915971955e-05, + "loss": 5.009, + "step": 38277 + }, + { + "epoch": 0.22765010943001238, + "grad_norm": 1.8333301544189453, + "learning_rate": 4.3874622623058505e-05, + "loss": 3.7311, + "step": 38278 + }, + { + "epoch": 0.22765605671329336, + "grad_norm": 2.1521081924438477, + "learning_rate": 4.387431632355612e-05, + "loss": 3.5839, + "step": 38279 + }, + { + "epoch": 0.22766200399657435, + "grad_norm": 1.766525387763977, + "learning_rate": 4.38740100174649e-05, + "loss": 3.9625, + "step": 38280 + }, + { + "epoch": 0.22766795127985537, + "grad_norm": 1.5381993055343628, + "learning_rate": 4.3873703704784966e-05, + "loss": 4.7761, + "step": 38281 + }, + { + "epoch": 0.22767389856313636, + "grad_norm": 1.351083517074585, + "learning_rate": 4.387339738551641e-05, + "loss": 5.2671, + "step": 38282 + }, + { + "epoch": 0.22767984584641734, + "grad_norm": 1.9041895866394043, + "learning_rate": 4.387309105965935e-05, + "loss": 4.7621, + "step": 38283 + }, + { + "epoch": 0.22768579312969836, + "grad_norm": 1.7635126113891602, + "learning_rate": 4.387278472721389e-05, + "loss": 4.5413, + "step": 38284 + }, + { + "epoch": 0.22769174041297935, + "grad_norm": 2.1425886154174805, + "learning_rate": 4.3872478388180126e-05, + "loss": 3.5661, + "step": 38285 + }, + { + "epoch": 0.22769768769626034, + "grad_norm": 2.4249305725097656, + "learning_rate": 4.387217204255819e-05, + "loss": 3.2622, + "step": 38286 + }, + { + "epoch": 0.22770363497954135, + "grad_norm": 2.1183717250823975, + "learning_rate": 4.387186569034816e-05, + "loss": 3.4106, + "step": 38287 + }, + { + "epoch": 0.22770958226282234, + "grad_norm": 2.123342990875244, + "learning_rate": 4.3871559331550166e-05, + "loss": 3.7708, + "step": 38288 + }, + { + "epoch": 0.22771552954610333, + "grad_norm": 1.565640926361084, + "learning_rate": 4.38712529661643e-05, + "loss": 4.6837, + "step": 38289 + }, + { + "epoch": 0.22772147682938434, + "grad_norm": 1.516226887702942, + "learning_rate": 4.387094659419068e-05, + "loss": 4.8717, + "step": 38290 + }, + { + "epoch": 0.22772742411266533, + "grad_norm": 1.859891653060913, + "learning_rate": 4.38706402156294e-05, + "loss": 4.4244, + "step": 38291 + }, + { + "epoch": 0.22773337139594632, + "grad_norm": 1.8023730516433716, + "learning_rate": 4.387033383048058e-05, + "loss": 4.3214, + "step": 38292 + }, + { + "epoch": 0.22773931867922734, + "grad_norm": 1.324349045753479, + "learning_rate": 4.387002743874432e-05, + "loss": 4.8791, + "step": 38293 + }, + { + "epoch": 0.22774526596250833, + "grad_norm": 1.505937933921814, + "learning_rate": 4.386972104042073e-05, + "loss": 4.4301, + "step": 38294 + }, + { + "epoch": 0.2277512132457893, + "grad_norm": 1.7081881761550903, + "learning_rate": 4.386941463550992e-05, + "loss": 3.9549, + "step": 38295 + }, + { + "epoch": 0.22775716052907033, + "grad_norm": 1.7600693702697754, + "learning_rate": 4.3869108224011976e-05, + "loss": 3.9081, + "step": 38296 + }, + { + "epoch": 0.22776310781235132, + "grad_norm": 1.8432573080062866, + "learning_rate": 4.386880180592703e-05, + "loss": 3.823, + "step": 38297 + }, + { + "epoch": 0.2277690550956323, + "grad_norm": 1.917293667793274, + "learning_rate": 4.386849538125519e-05, + "loss": 3.6595, + "step": 38298 + }, + { + "epoch": 0.22777500237891332, + "grad_norm": 1.771728754043579, + "learning_rate": 4.386818894999654e-05, + "loss": 3.7419, + "step": 38299 + }, + { + "epoch": 0.2277809496621943, + "grad_norm": 1.9334973096847534, + "learning_rate": 4.3867882512151205e-05, + "loss": 3.6426, + "step": 38300 + }, + { + "epoch": 0.2277868969454753, + "grad_norm": 1.738030195236206, + "learning_rate": 4.386757606771929e-05, + "loss": 3.8732, + "step": 38301 + }, + { + "epoch": 0.2277928442287563, + "grad_norm": 1.9789965152740479, + "learning_rate": 4.38672696167009e-05, + "loss": 3.982, + "step": 38302 + }, + { + "epoch": 0.2277987915120373, + "grad_norm": 1.9081391096115112, + "learning_rate": 4.386696315909613e-05, + "loss": 3.9094, + "step": 38303 + }, + { + "epoch": 0.2278047387953183, + "grad_norm": 2.4843461513519287, + "learning_rate": 4.386665669490511e-05, + "loss": 3.8854, + "step": 38304 + }, + { + "epoch": 0.2278106860785993, + "grad_norm": 1.8012548685073853, + "learning_rate": 4.386635022412793e-05, + "loss": 3.9795, + "step": 38305 + }, + { + "epoch": 0.2278166333618803, + "grad_norm": 2.046868324279785, + "learning_rate": 4.38660437467647e-05, + "loss": 3.9579, + "step": 38306 + }, + { + "epoch": 0.22782258064516128, + "grad_norm": 1.9416875839233398, + "learning_rate": 4.386573726281553e-05, + "loss": 3.8472, + "step": 38307 + }, + { + "epoch": 0.2278285279284423, + "grad_norm": 1.9911282062530518, + "learning_rate": 4.386543077228053e-05, + "loss": 3.7991, + "step": 38308 + }, + { + "epoch": 0.2278344752117233, + "grad_norm": 1.767100214958191, + "learning_rate": 4.3865124275159805e-05, + "loss": 3.9515, + "step": 38309 + }, + { + "epoch": 0.22784042249500427, + "grad_norm": 1.7764270305633545, + "learning_rate": 4.386481777145345e-05, + "loss": 3.8708, + "step": 38310 + }, + { + "epoch": 0.2278463697782853, + "grad_norm": 2.0385255813598633, + "learning_rate": 4.386451126116159e-05, + "loss": 3.8093, + "step": 38311 + }, + { + "epoch": 0.22785231706156628, + "grad_norm": 4.206143379211426, + "learning_rate": 4.3864204744284324e-05, + "loss": 2.8644, + "step": 38312 + }, + { + "epoch": 0.22785826434484727, + "grad_norm": 4.548903465270996, + "learning_rate": 4.3863898220821753e-05, + "loss": 1.5476, + "step": 38313 + }, + { + "epoch": 0.22786421162812828, + "grad_norm": 5.352540969848633, + "learning_rate": 4.3863591690773996e-05, + "loss": 1.7165, + "step": 38314 + }, + { + "epoch": 0.22787015891140927, + "grad_norm": 4.13737154006958, + "learning_rate": 4.3863285154141155e-05, + "loss": 1.6609, + "step": 38315 + }, + { + "epoch": 0.22787610619469026, + "grad_norm": 2.086291790008545, + "learning_rate": 4.3862978610923336e-05, + "loss": 4.7187, + "step": 38316 + }, + { + "epoch": 0.22788205347797127, + "grad_norm": 3.0734682083129883, + "learning_rate": 4.3862672061120637e-05, + "loss": 1.2077, + "step": 38317 + }, + { + "epoch": 0.22788800076125226, + "grad_norm": 3.8052728176116943, + "learning_rate": 4.3862365504733184e-05, + "loss": 1.4626, + "step": 38318 + }, + { + "epoch": 0.22789394804453325, + "grad_norm": 3.8481943607330322, + "learning_rate": 4.3862058941761066e-05, + "loss": 1.015, + "step": 38319 + }, + { + "epoch": 0.22789989532781427, + "grad_norm": 3.951799154281616, + "learning_rate": 4.38617523722044e-05, + "loss": 1.2529, + "step": 38320 + }, + { + "epoch": 0.22790584261109526, + "grad_norm": 3.9880523681640625, + "learning_rate": 4.386144579606329e-05, + "loss": 1.4974, + "step": 38321 + }, + { + "epoch": 0.22791178989437624, + "grad_norm": 3.225381851196289, + "learning_rate": 4.386113921333785e-05, + "loss": 1.3402, + "step": 38322 + }, + { + "epoch": 0.22791773717765726, + "grad_norm": 3.4442830085754395, + "learning_rate": 4.386083262402818e-05, + "loss": 1.7391, + "step": 38323 + }, + { + "epoch": 0.22792368446093825, + "grad_norm": 3.1190624237060547, + "learning_rate": 4.3860526028134385e-05, + "loss": 1.328, + "step": 38324 + }, + { + "epoch": 0.22792963174421924, + "grad_norm": 4.021055698394775, + "learning_rate": 4.3860219425656576e-05, + "loss": 2.5143, + "step": 38325 + }, + { + "epoch": 0.22793557902750025, + "grad_norm": 3.794240951538086, + "learning_rate": 4.385991281659486e-05, + "loss": 2.1306, + "step": 38326 + }, + { + "epoch": 0.22794152631078124, + "grad_norm": 3.4017739295959473, + "learning_rate": 4.385960620094934e-05, + "loss": 1.7635, + "step": 38327 + }, + { + "epoch": 0.22794747359406223, + "grad_norm": 3.104942560195923, + "learning_rate": 4.3859299578720136e-05, + "loss": 1.3225, + "step": 38328 + }, + { + "epoch": 0.22795342087734322, + "grad_norm": 1.854576826095581, + "learning_rate": 4.385899294990734e-05, + "loss": 3.7617, + "step": 38329 + }, + { + "epoch": 0.22795936816062423, + "grad_norm": 1.875930666923523, + "learning_rate": 4.385868631451107e-05, + "loss": 4.7611, + "step": 38330 + }, + { + "epoch": 0.22796531544390522, + "grad_norm": 1.7999051809310913, + "learning_rate": 4.385837967253142e-05, + "loss": 4.874, + "step": 38331 + }, + { + "epoch": 0.2279712627271862, + "grad_norm": 1.803697109222412, + "learning_rate": 4.3858073023968504e-05, + "loss": 4.9271, + "step": 38332 + }, + { + "epoch": 0.22797721001046722, + "grad_norm": 2.002218008041382, + "learning_rate": 4.385776636882243e-05, + "loss": 4.8422, + "step": 38333 + }, + { + "epoch": 0.2279831572937482, + "grad_norm": 1.7723742723464966, + "learning_rate": 4.38574597070933e-05, + "loss": 5.1484, + "step": 38334 + }, + { + "epoch": 0.2279891045770292, + "grad_norm": 1.6561287641525269, + "learning_rate": 4.3857153038781236e-05, + "loss": 5.1151, + "step": 38335 + }, + { + "epoch": 0.22799505186031022, + "grad_norm": 1.9022784233093262, + "learning_rate": 4.3856846363886326e-05, + "loss": 5.3096, + "step": 38336 + }, + { + "epoch": 0.2280009991435912, + "grad_norm": 1.7410986423492432, + "learning_rate": 4.3856539682408695e-05, + "loss": 4.9187, + "step": 38337 + }, + { + "epoch": 0.2280069464268722, + "grad_norm": 1.9505523443222046, + "learning_rate": 4.385623299434843e-05, + "loss": 4.1309, + "step": 38338 + }, + { + "epoch": 0.2280128937101532, + "grad_norm": 2.1584455966949463, + "learning_rate": 4.385592629970566e-05, + "loss": 4.5914, + "step": 38339 + }, + { + "epoch": 0.2280188409934342, + "grad_norm": 1.7182157039642334, + "learning_rate": 4.3855619598480466e-05, + "loss": 4.7145, + "step": 38340 + }, + { + "epoch": 0.22802478827671518, + "grad_norm": 1.7989410161972046, + "learning_rate": 4.3855312890672985e-05, + "loss": 4.6119, + "step": 38341 + }, + { + "epoch": 0.2280307355599962, + "grad_norm": 1.574983835220337, + "learning_rate": 4.385500617628331e-05, + "loss": 4.6344, + "step": 38342 + }, + { + "epoch": 0.2280366828432772, + "grad_norm": 1.7368310689926147, + "learning_rate": 4.385469945531153e-05, + "loss": 4.2213, + "step": 38343 + }, + { + "epoch": 0.22804263012655818, + "grad_norm": 1.7998409271240234, + "learning_rate": 4.385439272775779e-05, + "loss": 4.5357, + "step": 38344 + }, + { + "epoch": 0.2280485774098392, + "grad_norm": 1.6426572799682617, + "learning_rate": 4.385408599362216e-05, + "loss": 4.8873, + "step": 38345 + }, + { + "epoch": 0.22805452469312018, + "grad_norm": 1.6232562065124512, + "learning_rate": 4.385377925290477e-05, + "loss": 4.8186, + "step": 38346 + }, + { + "epoch": 0.22806047197640117, + "grad_norm": 1.5165815353393555, + "learning_rate": 4.385347250560572e-05, + "loss": 4.9799, + "step": 38347 + }, + { + "epoch": 0.22806641925968218, + "grad_norm": 1.58846116065979, + "learning_rate": 4.385316575172512e-05, + "loss": 4.6993, + "step": 38348 + }, + { + "epoch": 0.22807236654296317, + "grad_norm": 2.4419918060302734, + "learning_rate": 4.385285899126307e-05, + "loss": 3.9719, + "step": 38349 + }, + { + "epoch": 0.22807831382624416, + "grad_norm": 2.2855114936828613, + "learning_rate": 4.385255222421968e-05, + "loss": 3.4413, + "step": 38350 + }, + { + "epoch": 0.22808426110952518, + "grad_norm": 2.426321268081665, + "learning_rate": 4.385224545059506e-05, + "loss": 3.5723, + "step": 38351 + }, + { + "epoch": 0.22809020839280617, + "grad_norm": 2.4065475463867188, + "learning_rate": 4.3851938670389325e-05, + "loss": 3.8233, + "step": 38352 + }, + { + "epoch": 0.22809615567608715, + "grad_norm": 1.5618879795074463, + "learning_rate": 4.385163188360256e-05, + "loss": 5.2795, + "step": 38353 + }, + { + "epoch": 0.22810210295936817, + "grad_norm": 2.3342907428741455, + "learning_rate": 4.3851325090234894e-05, + "loss": 3.4984, + "step": 38354 + }, + { + "epoch": 0.22810805024264916, + "grad_norm": 2.280735731124878, + "learning_rate": 4.3851018290286424e-05, + "loss": 3.9731, + "step": 38355 + }, + { + "epoch": 0.22811399752593015, + "grad_norm": 2.4038596153259277, + "learning_rate": 4.3850711483757254e-05, + "loss": 3.4351, + "step": 38356 + }, + { + "epoch": 0.22811994480921116, + "grad_norm": 2.344742774963379, + "learning_rate": 4.3850404670647505e-05, + "loss": 3.4398, + "step": 38357 + }, + { + "epoch": 0.22812589209249215, + "grad_norm": 2.090485095977783, + "learning_rate": 4.385009785095726e-05, + "loss": 3.5157, + "step": 38358 + }, + { + "epoch": 0.22813183937577314, + "grad_norm": 1.652655005455017, + "learning_rate": 4.384979102468666e-05, + "loss": 4.728, + "step": 38359 + }, + { + "epoch": 0.22813778665905415, + "grad_norm": 2.0470733642578125, + "learning_rate": 4.3849484191835775e-05, + "loss": 4.3696, + "step": 38360 + }, + { + "epoch": 0.22814373394233514, + "grad_norm": 1.6509379148483276, + "learning_rate": 4.3849177352404735e-05, + "loss": 5.0528, + "step": 38361 + }, + { + "epoch": 0.22814968122561613, + "grad_norm": 1.8258634805679321, + "learning_rate": 4.3848870506393644e-05, + "loss": 4.995, + "step": 38362 + }, + { + "epoch": 0.22815562850889715, + "grad_norm": 1.6283469200134277, + "learning_rate": 4.384856365380261e-05, + "loss": 4.9513, + "step": 38363 + }, + { + "epoch": 0.22816157579217813, + "grad_norm": 1.6566721200942993, + "learning_rate": 4.3848256794631726e-05, + "loss": 4.6471, + "step": 38364 + }, + { + "epoch": 0.22816752307545912, + "grad_norm": 1.37803316116333, + "learning_rate": 4.3847949928881116e-05, + "loss": 4.7292, + "step": 38365 + }, + { + "epoch": 0.22817347035874014, + "grad_norm": 1.688583254814148, + "learning_rate": 4.384764305655089e-05, + "loss": 4.9555, + "step": 38366 + }, + { + "epoch": 0.22817941764202113, + "grad_norm": 1.525038480758667, + "learning_rate": 4.384733617764113e-05, + "loss": 4.4668, + "step": 38367 + }, + { + "epoch": 0.22818536492530211, + "grad_norm": 1.6260348558425903, + "learning_rate": 4.384702929215198e-05, + "loss": 4.3515, + "step": 38368 + }, + { + "epoch": 0.22819131220858313, + "grad_norm": 2.6498796939849854, + "learning_rate": 4.3846722400083505e-05, + "loss": 4.6671, + "step": 38369 + }, + { + "epoch": 0.22819725949186412, + "grad_norm": 1.8406044244766235, + "learning_rate": 4.384641550143584e-05, + "loss": 4.8844, + "step": 38370 + }, + { + "epoch": 0.2282032067751451, + "grad_norm": 1.9134811162948608, + "learning_rate": 4.38461085962091e-05, + "loss": 3.916, + "step": 38371 + }, + { + "epoch": 0.22820915405842612, + "grad_norm": 2.211760997772217, + "learning_rate": 4.384580168440337e-05, + "loss": 4.1463, + "step": 38372 + }, + { + "epoch": 0.2282151013417071, + "grad_norm": 1.8475337028503418, + "learning_rate": 4.384549476601876e-05, + "loss": 4.6829, + "step": 38373 + }, + { + "epoch": 0.2282210486249881, + "grad_norm": 1.9539520740509033, + "learning_rate": 4.384518784105539e-05, + "loss": 4.3289, + "step": 38374 + }, + { + "epoch": 0.22822699590826911, + "grad_norm": 1.8555973768234253, + "learning_rate": 4.384488090951335e-05, + "loss": 4.3754, + "step": 38375 + }, + { + "epoch": 0.2282329431915501, + "grad_norm": 1.7652846574783325, + "learning_rate": 4.384457397139277e-05, + "loss": 4.7864, + "step": 38376 + }, + { + "epoch": 0.2282388904748311, + "grad_norm": 1.609988808631897, + "learning_rate": 4.3844267026693734e-05, + "loss": 4.378, + "step": 38377 + }, + { + "epoch": 0.2282448377581121, + "grad_norm": 1.6453213691711426, + "learning_rate": 4.3843960075416364e-05, + "loss": 4.3171, + "step": 38378 + }, + { + "epoch": 0.2282507850413931, + "grad_norm": 1.6950875520706177, + "learning_rate": 4.384365311756076e-05, + "loss": 4.2858, + "step": 38379 + }, + { + "epoch": 0.22825673232467408, + "grad_norm": 1.496107816696167, + "learning_rate": 4.384334615312703e-05, + "loss": 4.5498, + "step": 38380 + }, + { + "epoch": 0.2282626796079551, + "grad_norm": 1.898187279701233, + "learning_rate": 4.384303918211529e-05, + "loss": 4.3318, + "step": 38381 + }, + { + "epoch": 0.2282686268912361, + "grad_norm": 1.9541302919387817, + "learning_rate": 4.3842732204525626e-05, + "loss": 4.3783, + "step": 38382 + }, + { + "epoch": 0.22827457417451708, + "grad_norm": 1.8452259302139282, + "learning_rate": 4.384242522035817e-05, + "loss": 4.1647, + "step": 38383 + }, + { + "epoch": 0.2282805214577981, + "grad_norm": 1.7965500354766846, + "learning_rate": 4.3842118229613015e-05, + "loss": 4.2006, + "step": 38384 + }, + { + "epoch": 0.22828646874107908, + "grad_norm": 1.7789390087127686, + "learning_rate": 4.384181123229027e-05, + "loss": 4.3562, + "step": 38385 + }, + { + "epoch": 0.22829241602436007, + "grad_norm": 1.790845274925232, + "learning_rate": 4.384150422839005e-05, + "loss": 4.236, + "step": 38386 + }, + { + "epoch": 0.22829836330764108, + "grad_norm": 2.035334825515747, + "learning_rate": 4.384119721791244e-05, + "loss": 4.2813, + "step": 38387 + }, + { + "epoch": 0.22830431059092207, + "grad_norm": 1.4204626083374023, + "learning_rate": 4.3840890200857575e-05, + "loss": 4.3353, + "step": 38388 + }, + { + "epoch": 0.22831025787420306, + "grad_norm": 1.9478641748428345, + "learning_rate": 4.3840583177225546e-05, + "loss": 4.2326, + "step": 38389 + }, + { + "epoch": 0.22831620515748405, + "grad_norm": 1.750410556793213, + "learning_rate": 4.384027614701647e-05, + "loss": 4.1916, + "step": 38390 + }, + { + "epoch": 0.22832215244076506, + "grad_norm": 2.1178817749023438, + "learning_rate": 4.3839969110230444e-05, + "loss": 4.0682, + "step": 38391 + }, + { + "epoch": 0.22832809972404605, + "grad_norm": 1.7404820919036865, + "learning_rate": 4.3839662066867575e-05, + "loss": 3.9515, + "step": 38392 + }, + { + "epoch": 0.22833404700732704, + "grad_norm": 1.667843222618103, + "learning_rate": 4.383935501692798e-05, + "loss": 4.7055, + "step": 38393 + }, + { + "epoch": 0.22833999429060806, + "grad_norm": 1.9728176593780518, + "learning_rate": 4.383904796041176e-05, + "loss": 5.0967, + "step": 38394 + }, + { + "epoch": 0.22834594157388904, + "grad_norm": 2.2353971004486084, + "learning_rate": 4.383874089731902e-05, + "loss": 4.6252, + "step": 38395 + }, + { + "epoch": 0.22835188885717003, + "grad_norm": 2.1327226161956787, + "learning_rate": 4.383843382764988e-05, + "loss": 4.4956, + "step": 38396 + }, + { + "epoch": 0.22835783614045105, + "grad_norm": 1.6246529817581177, + "learning_rate": 4.3838126751404426e-05, + "loss": 4.8295, + "step": 38397 + }, + { + "epoch": 0.22836378342373204, + "grad_norm": 1.6082868576049805, + "learning_rate": 4.383781966858278e-05, + "loss": 4.799, + "step": 38398 + }, + { + "epoch": 0.22836973070701302, + "grad_norm": 1.6632347106933594, + "learning_rate": 4.383751257918505e-05, + "loss": 4.465, + "step": 38399 + }, + { + "epoch": 0.22837567799029404, + "grad_norm": 1.6943600177764893, + "learning_rate": 4.383720548321133e-05, + "loss": 4.5799, + "step": 38400 + }, + { + "epoch": 0.22838162527357503, + "grad_norm": 1.6693763732910156, + "learning_rate": 4.3836898380661744e-05, + "loss": 4.3767, + "step": 38401 + }, + { + "epoch": 0.22838757255685602, + "grad_norm": 1.7295639514923096, + "learning_rate": 4.383659127153639e-05, + "loss": 4.3434, + "step": 38402 + }, + { + "epoch": 0.22839351984013703, + "grad_norm": 1.5898070335388184, + "learning_rate": 4.3836284155835375e-05, + "loss": 4.6257, + "step": 38403 + }, + { + "epoch": 0.22839946712341802, + "grad_norm": 1.638995885848999, + "learning_rate": 4.3835977033558804e-05, + "loss": 4.3646, + "step": 38404 + }, + { + "epoch": 0.228405414406699, + "grad_norm": 1.7454984188079834, + "learning_rate": 4.38356699047068e-05, + "loss": 4.4236, + "step": 38405 + }, + { + "epoch": 0.22841136168998002, + "grad_norm": 1.4987882375717163, + "learning_rate": 4.383536276927945e-05, + "loss": 4.9411, + "step": 38406 + }, + { + "epoch": 0.228417308973261, + "grad_norm": 1.7034831047058105, + "learning_rate": 4.383505562727687e-05, + "loss": 4.5394, + "step": 38407 + }, + { + "epoch": 0.228423256256542, + "grad_norm": 2.3442165851593018, + "learning_rate": 4.383474847869916e-05, + "loss": 3.9469, + "step": 38408 + }, + { + "epoch": 0.22842920353982302, + "grad_norm": 2.1856658458709717, + "learning_rate": 4.383444132354645e-05, + "loss": 3.8203, + "step": 38409 + }, + { + "epoch": 0.228435150823104, + "grad_norm": 2.810123920440674, + "learning_rate": 4.383413416181882e-05, + "loss": 4.2282, + "step": 38410 + }, + { + "epoch": 0.228441098106385, + "grad_norm": 2.2498672008514404, + "learning_rate": 4.3833826993516384e-05, + "loss": 3.9329, + "step": 38411 + }, + { + "epoch": 0.228447045389666, + "grad_norm": 2.1014022827148438, + "learning_rate": 4.383351981863926e-05, + "loss": 4.0302, + "step": 38412 + }, + { + "epoch": 0.228452992672947, + "grad_norm": 2.276716709136963, + "learning_rate": 4.3833212637187547e-05, + "loss": 4.0824, + "step": 38413 + }, + { + "epoch": 0.22845893995622799, + "grad_norm": 2.0157651901245117, + "learning_rate": 4.383290544916136e-05, + "loss": 4.3826, + "step": 38414 + }, + { + "epoch": 0.228464887239509, + "grad_norm": 1.7391401529312134, + "learning_rate": 4.3832598254560796e-05, + "loss": 4.5819, + "step": 38415 + }, + { + "epoch": 0.22847083452279, + "grad_norm": 1.7038103342056274, + "learning_rate": 4.3832291053385965e-05, + "loss": 3.7208, + "step": 38416 + }, + { + "epoch": 0.22847678180607098, + "grad_norm": 1.7914259433746338, + "learning_rate": 4.383198384563698e-05, + "loss": 3.755, + "step": 38417 + }, + { + "epoch": 0.228482729089352, + "grad_norm": 1.7458772659301758, + "learning_rate": 4.383167663131394e-05, + "loss": 3.7681, + "step": 38418 + }, + { + "epoch": 0.22848867637263298, + "grad_norm": 1.546152949333191, + "learning_rate": 4.383136941041696e-05, + "loss": 3.5506, + "step": 38419 + }, + { + "epoch": 0.22849462365591397, + "grad_norm": 1.6328538656234741, + "learning_rate": 4.383106218294614e-05, + "loss": 3.7039, + "step": 38420 + }, + { + "epoch": 0.22850057093919499, + "grad_norm": 1.8553338050842285, + "learning_rate": 4.383075494890159e-05, + "loss": 4.023, + "step": 38421 + }, + { + "epoch": 0.22850651822247597, + "grad_norm": 2.007420301437378, + "learning_rate": 4.3830447708283416e-05, + "loss": 4.2306, + "step": 38422 + }, + { + "epoch": 0.22851246550575696, + "grad_norm": 1.7648934125900269, + "learning_rate": 4.383014046109173e-05, + "loss": 4.0914, + "step": 38423 + }, + { + "epoch": 0.22851841278903798, + "grad_norm": 1.8605449199676514, + "learning_rate": 4.3829833207326644e-05, + "loss": 4.1892, + "step": 38424 + }, + { + "epoch": 0.22852436007231897, + "grad_norm": 1.7090728282928467, + "learning_rate": 4.382952594698825e-05, + "loss": 4.3818, + "step": 38425 + }, + { + "epoch": 0.22853030735559995, + "grad_norm": 1.8204621076583862, + "learning_rate": 4.382921868007666e-05, + "loss": 4.1313, + "step": 38426 + }, + { + "epoch": 0.22853625463888097, + "grad_norm": 2.0081419944763184, + "learning_rate": 4.382891140659199e-05, + "loss": 4.0119, + "step": 38427 + }, + { + "epoch": 0.22854220192216196, + "grad_norm": 2.096193790435791, + "learning_rate": 4.382860412653434e-05, + "loss": 4.4627, + "step": 38428 + }, + { + "epoch": 0.22854814920544295, + "grad_norm": 2.220578908920288, + "learning_rate": 4.3828296839903814e-05, + "loss": 4.1242, + "step": 38429 + }, + { + "epoch": 0.22855409648872396, + "grad_norm": 1.925628662109375, + "learning_rate": 4.3827989546700534e-05, + "loss": 4.0027, + "step": 38430 + }, + { + "epoch": 0.22856004377200495, + "grad_norm": 1.6829743385314941, + "learning_rate": 4.382768224692459e-05, + "loss": 5.0908, + "step": 38431 + }, + { + "epoch": 0.22856599105528594, + "grad_norm": 1.726646065711975, + "learning_rate": 4.38273749405761e-05, + "loss": 5.75, + "step": 38432 + }, + { + "epoch": 0.22857193833856695, + "grad_norm": 1.7120121717453003, + "learning_rate": 4.3827067627655164e-05, + "loss": 5.6595, + "step": 38433 + }, + { + "epoch": 0.22857788562184794, + "grad_norm": 1.7972990274429321, + "learning_rate": 4.38267603081619e-05, + "loss": 4.8448, + "step": 38434 + }, + { + "epoch": 0.22858383290512893, + "grad_norm": 2.042712450027466, + "learning_rate": 4.38264529820964e-05, + "loss": 4.8918, + "step": 38435 + }, + { + "epoch": 0.22858978018840995, + "grad_norm": 1.766764760017395, + "learning_rate": 4.3826145649458786e-05, + "loss": 4.9263, + "step": 38436 + }, + { + "epoch": 0.22859572747169093, + "grad_norm": 1.5604772567749023, + "learning_rate": 4.3825838310249155e-05, + "loss": 5.3605, + "step": 38437 + }, + { + "epoch": 0.22860167475497192, + "grad_norm": 1.8508424758911133, + "learning_rate": 4.3825530964467626e-05, + "loss": 5.2559, + "step": 38438 + }, + { + "epoch": 0.22860762203825294, + "grad_norm": 1.8071962594985962, + "learning_rate": 4.382522361211429e-05, + "loss": 5.1945, + "step": 38439 + }, + { + "epoch": 0.22861356932153393, + "grad_norm": 1.7534548044204712, + "learning_rate": 4.3824916253189266e-05, + "loss": 4.7507, + "step": 38440 + }, + { + "epoch": 0.22861951660481492, + "grad_norm": 1.6226413249969482, + "learning_rate": 4.3824608887692666e-05, + "loss": 5.1489, + "step": 38441 + }, + { + "epoch": 0.22862546388809593, + "grad_norm": 1.7093009948730469, + "learning_rate": 4.382430151562458e-05, + "loss": 5.6064, + "step": 38442 + }, + { + "epoch": 0.22863141117137692, + "grad_norm": 1.6506770849227905, + "learning_rate": 4.3823994136985126e-05, + "loss": 5.335, + "step": 38443 + }, + { + "epoch": 0.2286373584546579, + "grad_norm": 1.7119227647781372, + "learning_rate": 4.382368675177441e-05, + "loss": 4.4665, + "step": 38444 + }, + { + "epoch": 0.22864330573793892, + "grad_norm": 1.745902419090271, + "learning_rate": 4.382337935999254e-05, + "loss": 4.9238, + "step": 38445 + }, + { + "epoch": 0.2286492530212199, + "grad_norm": 1.7766257524490356, + "learning_rate": 4.382307196163962e-05, + "loss": 5.1864, + "step": 38446 + }, + { + "epoch": 0.2286552003045009, + "grad_norm": 1.6583263874053955, + "learning_rate": 4.382276455671577e-05, + "loss": 5.117, + "step": 38447 + }, + { + "epoch": 0.2286611475877819, + "grad_norm": 1.6775164604187012, + "learning_rate": 4.382245714522107e-05, + "loss": 4.8822, + "step": 38448 + }, + { + "epoch": 0.2286670948710629, + "grad_norm": 1.9329807758331299, + "learning_rate": 4.3822149727155656e-05, + "loss": 4.7758, + "step": 38449 + }, + { + "epoch": 0.2286730421543439, + "grad_norm": 1.7038854360580444, + "learning_rate": 4.382184230251963e-05, + "loss": 4.8293, + "step": 38450 + }, + { + "epoch": 0.22867898943762488, + "grad_norm": 1.7462607622146606, + "learning_rate": 4.3821534871313086e-05, + "loss": 4.6184, + "step": 38451 + }, + { + "epoch": 0.2286849367209059, + "grad_norm": 1.8756234645843506, + "learning_rate": 4.3821227433536135e-05, + "loss": 5.0585, + "step": 38452 + }, + { + "epoch": 0.22869088400418688, + "grad_norm": 1.8120574951171875, + "learning_rate": 4.382091998918889e-05, + "loss": 4.9257, + "step": 38453 + }, + { + "epoch": 0.22869683128746787, + "grad_norm": 1.696677327156067, + "learning_rate": 4.3820612538271465e-05, + "loss": 4.8921, + "step": 38454 + }, + { + "epoch": 0.2287027785707489, + "grad_norm": 1.588819980621338, + "learning_rate": 4.382030508078395e-05, + "loss": 5.5639, + "step": 38455 + }, + { + "epoch": 0.22870872585402988, + "grad_norm": 2.115058660507202, + "learning_rate": 4.381999761672646e-05, + "loss": 4.3399, + "step": 38456 + }, + { + "epoch": 0.22871467313731086, + "grad_norm": 3.0161054134368896, + "learning_rate": 4.3819690146099105e-05, + "loss": 2.4075, + "step": 38457 + }, + { + "epoch": 0.22872062042059188, + "grad_norm": 3.0567798614501953, + "learning_rate": 4.3819382668902e-05, + "loss": 2.6078, + "step": 38458 + }, + { + "epoch": 0.22872656770387287, + "grad_norm": 3.0818064212799072, + "learning_rate": 4.3819075185135226e-05, + "loss": 2.2381, + "step": 38459 + }, + { + "epoch": 0.22873251498715386, + "grad_norm": 2.7542152404785156, + "learning_rate": 4.381876769479892e-05, + "loss": 2.6451, + "step": 38460 + }, + { + "epoch": 0.22873846227043487, + "grad_norm": 2.485642671585083, + "learning_rate": 4.381846019789317e-05, + "loss": 3.3389, + "step": 38461 + }, + { + "epoch": 0.22874440955371586, + "grad_norm": 2.538452625274658, + "learning_rate": 4.381815269441809e-05, + "loss": 4.7806, + "step": 38462 + }, + { + "epoch": 0.22875035683699685, + "grad_norm": 2.311481475830078, + "learning_rate": 4.381784518437379e-05, + "loss": 4.7911, + "step": 38463 + }, + { + "epoch": 0.22875630412027786, + "grad_norm": 2.447681188583374, + "learning_rate": 4.3817537667760375e-05, + "loss": 3.5121, + "step": 38464 + }, + { + "epoch": 0.22876225140355885, + "grad_norm": 2.6825411319732666, + "learning_rate": 4.381723014457795e-05, + "loss": 3.9006, + "step": 38465 + }, + { + "epoch": 0.22876819868683984, + "grad_norm": 2.31955885887146, + "learning_rate": 4.381692261482663e-05, + "loss": 4.0621, + "step": 38466 + }, + { + "epoch": 0.22877414597012086, + "grad_norm": 2.573988199234009, + "learning_rate": 4.3816615078506514e-05, + "loss": 4.0674, + "step": 38467 + }, + { + "epoch": 0.22878009325340184, + "grad_norm": 2.437645673751831, + "learning_rate": 4.381630753561771e-05, + "loss": 3.8, + "step": 38468 + }, + { + "epoch": 0.22878604053668283, + "grad_norm": 1.8713231086730957, + "learning_rate": 4.381599998616033e-05, + "loss": 4.5789, + "step": 38469 + }, + { + "epoch": 0.22879198781996385, + "grad_norm": 1.6233890056610107, + "learning_rate": 4.381569243013448e-05, + "loss": 4.5076, + "step": 38470 + }, + { + "epoch": 0.22879793510324484, + "grad_norm": 1.7696006298065186, + "learning_rate": 4.3815384867540255e-05, + "loss": 4.2695, + "step": 38471 + }, + { + "epoch": 0.22880388238652583, + "grad_norm": 1.7355118989944458, + "learning_rate": 4.381507729837778e-05, + "loss": 4.5847, + "step": 38472 + }, + { + "epoch": 0.22880982966980684, + "grad_norm": 1.544387936592102, + "learning_rate": 4.381476972264716e-05, + "loss": 4.9015, + "step": 38473 + }, + { + "epoch": 0.22881577695308783, + "grad_norm": 1.5550681352615356, + "learning_rate": 4.38144621403485e-05, + "loss": 4.6908, + "step": 38474 + }, + { + "epoch": 0.22882172423636882, + "grad_norm": 1.700878620147705, + "learning_rate": 4.38141545514819e-05, + "loss": 4.7589, + "step": 38475 + }, + { + "epoch": 0.22882767151964983, + "grad_norm": 2.258042812347412, + "learning_rate": 4.381384695604748e-05, + "loss": 4.6368, + "step": 38476 + }, + { + "epoch": 0.22883361880293082, + "grad_norm": 2.191384792327881, + "learning_rate": 4.381353935404533e-05, + "loss": 4.666, + "step": 38477 + }, + { + "epoch": 0.2288395660862118, + "grad_norm": 2.2790069580078125, + "learning_rate": 4.381323174547557e-05, + "loss": 4.7982, + "step": 38478 + }, + { + "epoch": 0.22884551336949283, + "grad_norm": 2.4165420532226562, + "learning_rate": 4.381292413033831e-05, + "loss": 4.8295, + "step": 38479 + }, + { + "epoch": 0.2288514606527738, + "grad_norm": 2.2812304496765137, + "learning_rate": 4.3812616508633654e-05, + "loss": 4.6471, + "step": 38480 + }, + { + "epoch": 0.2288574079360548, + "grad_norm": 2.199652671813965, + "learning_rate": 4.38123088803617e-05, + "loss": 4.7415, + "step": 38481 + }, + { + "epoch": 0.22886335521933582, + "grad_norm": 2.2370831966400146, + "learning_rate": 4.381200124552257e-05, + "loss": 4.6689, + "step": 38482 + }, + { + "epoch": 0.2288693025026168, + "grad_norm": 2.2006328105926514, + "learning_rate": 4.381169360411636e-05, + "loss": 4.7032, + "step": 38483 + }, + { + "epoch": 0.2288752497858978, + "grad_norm": 2.2597954273223877, + "learning_rate": 4.381138595614318e-05, + "loss": 4.6799, + "step": 38484 + }, + { + "epoch": 0.2288811970691788, + "grad_norm": 1.9300129413604736, + "learning_rate": 4.381107830160315e-05, + "loss": 4.8595, + "step": 38485 + }, + { + "epoch": 0.2288871443524598, + "grad_norm": 1.6042248010635376, + "learning_rate": 4.381077064049636e-05, + "loss": 4.8479, + "step": 38486 + }, + { + "epoch": 0.2288930916357408, + "grad_norm": 1.685781478881836, + "learning_rate": 4.381046297282292e-05, + "loss": 4.9784, + "step": 38487 + }, + { + "epoch": 0.2288990389190218, + "grad_norm": 2.3685178756713867, + "learning_rate": 4.3810155298582956e-05, + "loss": 2.661, + "step": 38488 + }, + { + "epoch": 0.2289049862023028, + "grad_norm": 2.790250301361084, + "learning_rate": 4.3809847617776545e-05, + "loss": 1.3592, + "step": 38489 + }, + { + "epoch": 0.22891093348558378, + "grad_norm": 2.53934645652771, + "learning_rate": 4.380953993040382e-05, + "loss": 1.2262, + "step": 38490 + }, + { + "epoch": 0.2289168807688648, + "grad_norm": 2.5149827003479004, + "learning_rate": 4.3809232236464875e-05, + "loss": 1.2036, + "step": 38491 + }, + { + "epoch": 0.22892282805214578, + "grad_norm": 2.73903489112854, + "learning_rate": 4.3808924535959825e-05, + "loss": 1.3444, + "step": 38492 + }, + { + "epoch": 0.22892877533542677, + "grad_norm": 2.8913257122039795, + "learning_rate": 4.380861682888877e-05, + "loss": 1.6929, + "step": 38493 + }, + { + "epoch": 0.2289347226187078, + "grad_norm": 2.765486717224121, + "learning_rate": 4.380830911525182e-05, + "loss": 1.1883, + "step": 38494 + }, + { + "epoch": 0.22894066990198877, + "grad_norm": 3.039799928665161, + "learning_rate": 4.380800139504909e-05, + "loss": 1.2903, + "step": 38495 + }, + { + "epoch": 0.22894661718526976, + "grad_norm": 2.9481117725372314, + "learning_rate": 4.3807693668280676e-05, + "loss": 1.3223, + "step": 38496 + }, + { + "epoch": 0.22895256446855078, + "grad_norm": 2.532102346420288, + "learning_rate": 4.3807385934946696e-05, + "loss": 0.8985, + "step": 38497 + }, + { + "epoch": 0.22895851175183177, + "grad_norm": 3.0393545627593994, + "learning_rate": 4.380707819504725e-05, + "loss": 1.4591, + "step": 38498 + }, + { + "epoch": 0.22896445903511276, + "grad_norm": 3.1826932430267334, + "learning_rate": 4.3806770448582435e-05, + "loss": 1.106, + "step": 38499 + }, + { + "epoch": 0.22897040631839377, + "grad_norm": 3.0488319396972656, + "learning_rate": 4.3806462695552385e-05, + "loss": 1.0806, + "step": 38500 + }, + { + "epoch": 0.22897635360167476, + "grad_norm": 2.639357566833496, + "learning_rate": 4.380615493595719e-05, + "loss": 1.4651, + "step": 38501 + }, + { + "epoch": 0.22898230088495575, + "grad_norm": 1.8026386499404907, + "learning_rate": 4.380584716979696e-05, + "loss": 5.0124, + "step": 38502 + }, + { + "epoch": 0.22898824816823676, + "grad_norm": 2.951977014541626, + "learning_rate": 4.3805539397071806e-05, + "loss": 3.8272, + "step": 38503 + }, + { + "epoch": 0.22899419545151775, + "grad_norm": 2.755704402923584, + "learning_rate": 4.3805231617781823e-05, + "loss": 3.3427, + "step": 38504 + }, + { + "epoch": 0.22900014273479874, + "grad_norm": 2.2187037467956543, + "learning_rate": 4.380492383192714e-05, + "loss": 3.7645, + "step": 38505 + }, + { + "epoch": 0.22900609001807973, + "grad_norm": 1.8150204420089722, + "learning_rate": 4.380461603950784e-05, + "loss": 5.5314, + "step": 38506 + }, + { + "epoch": 0.22901203730136074, + "grad_norm": 2.308591604232788, + "learning_rate": 4.380430824052405e-05, + "loss": 3.7683, + "step": 38507 + }, + { + "epoch": 0.22901798458464173, + "grad_norm": 2.0516018867492676, + "learning_rate": 4.380400043497587e-05, + "loss": 3.6923, + "step": 38508 + }, + { + "epoch": 0.22902393186792272, + "grad_norm": 1.8416396379470825, + "learning_rate": 4.380369262286341e-05, + "loss": 4.9084, + "step": 38509 + }, + { + "epoch": 0.22902987915120374, + "grad_norm": 1.7685813903808594, + "learning_rate": 4.380338480418677e-05, + "loss": 4.8024, + "step": 38510 + }, + { + "epoch": 0.22903582643448472, + "grad_norm": 2.066200017929077, + "learning_rate": 4.3803076978946066e-05, + "loss": 3.3572, + "step": 38511 + }, + { + "epoch": 0.2290417737177657, + "grad_norm": 6.249925136566162, + "learning_rate": 4.3802769147141395e-05, + "loss": 2.0614, + "step": 38512 + }, + { + "epoch": 0.22904772100104673, + "grad_norm": 3.487518310546875, + "learning_rate": 4.380246130877288e-05, + "loss": 1.6695, + "step": 38513 + }, + { + "epoch": 0.22905366828432772, + "grad_norm": 2.93048095703125, + "learning_rate": 4.3802153463840626e-05, + "loss": 1.3208, + "step": 38514 + }, + { + "epoch": 0.2290596155676087, + "grad_norm": 2.9948956966400146, + "learning_rate": 4.3801845612344716e-05, + "loss": 1.256, + "step": 38515 + }, + { + "epoch": 0.22906556285088972, + "grad_norm": 2.7932517528533936, + "learning_rate": 4.380153775428529e-05, + "loss": 2.3507, + "step": 38516 + }, + { + "epoch": 0.2290715101341707, + "grad_norm": 2.4867453575134277, + "learning_rate": 4.380122988966244e-05, + "loss": 2.8348, + "step": 38517 + }, + { + "epoch": 0.2290774574174517, + "grad_norm": 2.4838972091674805, + "learning_rate": 4.380092201847627e-05, + "loss": 2.7133, + "step": 38518 + }, + { + "epoch": 0.2290834047007327, + "grad_norm": 2.3084802627563477, + "learning_rate": 4.3800614140726894e-05, + "loss": 2.9998, + "step": 38519 + }, + { + "epoch": 0.2290893519840137, + "grad_norm": 2.5051803588867188, + "learning_rate": 4.3800306256414415e-05, + "loss": 2.8024, + "step": 38520 + }, + { + "epoch": 0.2290952992672947, + "grad_norm": 2.5238940715789795, + "learning_rate": 4.3799998365538954e-05, + "loss": 2.722, + "step": 38521 + }, + { + "epoch": 0.2291012465505757, + "grad_norm": 2.2784957885742188, + "learning_rate": 4.3799690468100594e-05, + "loss": 3.4847, + "step": 38522 + }, + { + "epoch": 0.2291071938338567, + "grad_norm": 1.9627010822296143, + "learning_rate": 4.3799382564099464e-05, + "loss": 5.611, + "step": 38523 + }, + { + "epoch": 0.22911314111713768, + "grad_norm": 6.111119747161865, + "learning_rate": 4.3799074653535664e-05, + "loss": 3.582, + "step": 38524 + }, + { + "epoch": 0.2291190884004187, + "grad_norm": 7.286647319793701, + "learning_rate": 4.3798766736409304e-05, + "loss": 3.8205, + "step": 38525 + }, + { + "epoch": 0.22912503568369968, + "grad_norm": 4.92535400390625, + "learning_rate": 4.379845881272048e-05, + "loss": 3.4899, + "step": 38526 + }, + { + "epoch": 0.22913098296698067, + "grad_norm": 4.405367851257324, + "learning_rate": 4.3798150882469316e-05, + "loss": 3.1935, + "step": 38527 + }, + { + "epoch": 0.2291369302502617, + "grad_norm": 2.9681317806243896, + "learning_rate": 4.379784294565591e-05, + "loss": 3.5363, + "step": 38528 + }, + { + "epoch": 0.22914287753354268, + "grad_norm": 2.0128326416015625, + "learning_rate": 4.3797535002280366e-05, + "loss": 5.0034, + "step": 38529 + }, + { + "epoch": 0.22914882481682367, + "grad_norm": 1.7648944854736328, + "learning_rate": 4.37972270523428e-05, + "loss": 5.3278, + "step": 38530 + }, + { + "epoch": 0.22915477210010468, + "grad_norm": 1.7553101778030396, + "learning_rate": 4.379691909584332e-05, + "loss": 5.2114, + "step": 38531 + }, + { + "epoch": 0.22916071938338567, + "grad_norm": 1.5683118104934692, + "learning_rate": 4.379661113278203e-05, + "loss": 5.0773, + "step": 38532 + }, + { + "epoch": 0.22916666666666666, + "grad_norm": 1.6441692113876343, + "learning_rate": 4.3796303163159025e-05, + "loss": 5.1614, + "step": 38533 + }, + { + "epoch": 0.22917261394994767, + "grad_norm": 1.4447158575057983, + "learning_rate": 4.379599518697444e-05, + "loss": 5.0567, + "step": 38534 + }, + { + "epoch": 0.22917856123322866, + "grad_norm": 1.6365786790847778, + "learning_rate": 4.379568720422836e-05, + "loss": 5.0077, + "step": 38535 + }, + { + "epoch": 0.22918450851650965, + "grad_norm": 1.7679840326309204, + "learning_rate": 4.3795379214920895e-05, + "loss": 5.3002, + "step": 38536 + }, + { + "epoch": 0.22919045579979067, + "grad_norm": 1.586530327796936, + "learning_rate": 4.379507121905217e-05, + "loss": 5.0253, + "step": 38537 + }, + { + "epoch": 0.22919640308307165, + "grad_norm": 1.5012983083724976, + "learning_rate": 4.379476321662227e-05, + "loss": 5.1246, + "step": 38538 + }, + { + "epoch": 0.22920235036635264, + "grad_norm": 1.649339199066162, + "learning_rate": 4.3794455207631315e-05, + "loss": 5.0695, + "step": 38539 + }, + { + "epoch": 0.22920829764963366, + "grad_norm": 1.5892829895019531, + "learning_rate": 4.379414719207941e-05, + "loss": 4.7051, + "step": 38540 + }, + { + "epoch": 0.22921424493291465, + "grad_norm": 1.3112465143203735, + "learning_rate": 4.379383916996666e-05, + "loss": 5.3511, + "step": 38541 + }, + { + "epoch": 0.22922019221619563, + "grad_norm": 1.2667590379714966, + "learning_rate": 4.3793531141293185e-05, + "loss": 5.3804, + "step": 38542 + }, + { + "epoch": 0.22922613949947665, + "grad_norm": 2.032801389694214, + "learning_rate": 4.3793223106059064e-05, + "loss": 5.1612, + "step": 38543 + }, + { + "epoch": 0.22923208678275764, + "grad_norm": 1.8813502788543701, + "learning_rate": 4.379291506426444e-05, + "loss": 5.1347, + "step": 38544 + }, + { + "epoch": 0.22923803406603863, + "grad_norm": 1.8237574100494385, + "learning_rate": 4.379260701590939e-05, + "loss": 4.978, + "step": 38545 + }, + { + "epoch": 0.22924398134931964, + "grad_norm": 1.621106743812561, + "learning_rate": 4.3792298960994046e-05, + "loss": 4.9011, + "step": 38546 + }, + { + "epoch": 0.22924992863260063, + "grad_norm": 1.3890458345413208, + "learning_rate": 4.3791990899518506e-05, + "loss": 4.8795, + "step": 38547 + }, + { + "epoch": 0.22925587591588162, + "grad_norm": 1.5003517866134644, + "learning_rate": 4.379168283148287e-05, + "loss": 4.8412, + "step": 38548 + }, + { + "epoch": 0.22926182319916263, + "grad_norm": 1.7542293071746826, + "learning_rate": 4.379137475688725e-05, + "loss": 4.6803, + "step": 38549 + }, + { + "epoch": 0.22926777048244362, + "grad_norm": 1.7646663188934326, + "learning_rate": 4.379106667573176e-05, + "loss": 5.1501, + "step": 38550 + }, + { + "epoch": 0.2292737177657246, + "grad_norm": 2.451752185821533, + "learning_rate": 4.37907585880165e-05, + "loss": 4.2773, + "step": 38551 + }, + { + "epoch": 0.22927966504900563, + "grad_norm": 2.2721564769744873, + "learning_rate": 4.379045049374158e-05, + "loss": 4.1409, + "step": 38552 + }, + { + "epoch": 0.22928561233228661, + "grad_norm": 2.1969313621520996, + "learning_rate": 4.379014239290711e-05, + "loss": 4.303, + "step": 38553 + }, + { + "epoch": 0.2292915596155676, + "grad_norm": 2.17642879486084, + "learning_rate": 4.3789834285513195e-05, + "loss": 4.8686, + "step": 38554 + }, + { + "epoch": 0.22929750689884862, + "grad_norm": 1.5282912254333496, + "learning_rate": 4.378952617155994e-05, + "loss": 5.4506, + "step": 38555 + }, + { + "epoch": 0.2293034541821296, + "grad_norm": 1.6670398712158203, + "learning_rate": 4.378921805104746e-05, + "loss": 5.2819, + "step": 38556 + }, + { + "epoch": 0.2293094014654106, + "grad_norm": 1.834071159362793, + "learning_rate": 4.378890992397585e-05, + "loss": 4.9508, + "step": 38557 + }, + { + "epoch": 0.2293153487486916, + "grad_norm": 2.6882967948913574, + "learning_rate": 4.378860179034524e-05, + "loss": 4.546, + "step": 38558 + }, + { + "epoch": 0.2293212960319726, + "grad_norm": 2.468848705291748, + "learning_rate": 4.378829365015571e-05, + "loss": 4.6806, + "step": 38559 + }, + { + "epoch": 0.2293272433152536, + "grad_norm": 1.7329350709915161, + "learning_rate": 4.378798550340739e-05, + "loss": 4.4552, + "step": 38560 + }, + { + "epoch": 0.2293331905985346, + "grad_norm": 1.4378310441970825, + "learning_rate": 4.378767735010037e-05, + "loss": 5.2996, + "step": 38561 + }, + { + "epoch": 0.2293391378818156, + "grad_norm": 1.4221470355987549, + "learning_rate": 4.3787369190234766e-05, + "loss": 5.2644, + "step": 38562 + }, + { + "epoch": 0.22934508516509658, + "grad_norm": 1.4187413454055786, + "learning_rate": 4.378706102381069e-05, + "loss": 5.3766, + "step": 38563 + }, + { + "epoch": 0.22935103244837757, + "grad_norm": 2.0169026851654053, + "learning_rate": 4.378675285082825e-05, + "loss": 4.565, + "step": 38564 + }, + { + "epoch": 0.22935697973165858, + "grad_norm": 1.4599847793579102, + "learning_rate": 4.378644467128754e-05, + "loss": 5.1849, + "step": 38565 + }, + { + "epoch": 0.22936292701493957, + "grad_norm": 1.2419488430023193, + "learning_rate": 4.378613648518868e-05, + "loss": 4.9225, + "step": 38566 + }, + { + "epoch": 0.22936887429822056, + "grad_norm": 1.7379957437515259, + "learning_rate": 4.378582829253177e-05, + "loss": 3.9957, + "step": 38567 + }, + { + "epoch": 0.22937482158150158, + "grad_norm": 1.5496753454208374, + "learning_rate": 4.3785520093316926e-05, + "loss": 3.9657, + "step": 38568 + }, + { + "epoch": 0.22938076886478256, + "grad_norm": 1.5953773260116577, + "learning_rate": 4.3785211887544255e-05, + "loss": 4.1013, + "step": 38569 + }, + { + "epoch": 0.22938671614806355, + "grad_norm": 2.0474889278411865, + "learning_rate": 4.378490367521385e-05, + "loss": 3.9464, + "step": 38570 + }, + { + "epoch": 0.22939266343134457, + "grad_norm": 2.4065568447113037, + "learning_rate": 4.378459545632584e-05, + "loss": 3.8675, + "step": 38571 + }, + { + "epoch": 0.22939861071462556, + "grad_norm": 1.6409612894058228, + "learning_rate": 4.378428723088031e-05, + "loss": 5.0453, + "step": 38572 + }, + { + "epoch": 0.22940455799790654, + "grad_norm": 1.7439429759979248, + "learning_rate": 4.3783978998877385e-05, + "loss": 4.9018, + "step": 38573 + }, + { + "epoch": 0.22941050528118756, + "grad_norm": 1.6023813486099243, + "learning_rate": 4.3783670760317174e-05, + "loss": 5.1573, + "step": 38574 + }, + { + "epoch": 0.22941645256446855, + "grad_norm": 1.7142831087112427, + "learning_rate": 4.378336251519977e-05, + "loss": 5.0621, + "step": 38575 + }, + { + "epoch": 0.22942239984774954, + "grad_norm": 1.5479685068130493, + "learning_rate": 4.3783054263525284e-05, + "loss": 4.871, + "step": 38576 + }, + { + "epoch": 0.22942834713103055, + "grad_norm": 1.2989557981491089, + "learning_rate": 4.3782746005293837e-05, + "loss": 4.9569, + "step": 38577 + }, + { + "epoch": 0.22943429441431154, + "grad_norm": 1.3901426792144775, + "learning_rate": 4.378243774050552e-05, + "loss": 5.0305, + "step": 38578 + }, + { + "epoch": 0.22944024169759253, + "grad_norm": 1.2557393312454224, + "learning_rate": 4.378212946916045e-05, + "loss": 5.0187, + "step": 38579 + }, + { + "epoch": 0.22944618898087354, + "grad_norm": 1.7015252113342285, + "learning_rate": 4.378182119125874e-05, + "loss": 4.7364, + "step": 38580 + }, + { + "epoch": 0.22945213626415453, + "grad_norm": 1.5928224325180054, + "learning_rate": 4.378151290680048e-05, + "loss": 4.7833, + "step": 38581 + }, + { + "epoch": 0.22945808354743552, + "grad_norm": 1.5460565090179443, + "learning_rate": 4.3781204615785795e-05, + "loss": 4.8068, + "step": 38582 + }, + { + "epoch": 0.22946403083071654, + "grad_norm": 1.4052801132202148, + "learning_rate": 4.378089631821478e-05, + "loss": 4.8026, + "step": 38583 + }, + { + "epoch": 0.22946997811399752, + "grad_norm": 1.4528011083602905, + "learning_rate": 4.378058801408755e-05, + "loss": 4.8313, + "step": 38584 + }, + { + "epoch": 0.2294759253972785, + "grad_norm": 1.4408035278320312, + "learning_rate": 4.3780279703404214e-05, + "loss": 4.8099, + "step": 38585 + }, + { + "epoch": 0.22948187268055953, + "grad_norm": 1.5796035528182983, + "learning_rate": 4.3779971386164874e-05, + "loss": 4.9001, + "step": 38586 + }, + { + "epoch": 0.22948781996384052, + "grad_norm": 1.4678764343261719, + "learning_rate": 4.377966306236965e-05, + "loss": 4.7479, + "step": 38587 + }, + { + "epoch": 0.2294937672471215, + "grad_norm": 1.553661584854126, + "learning_rate": 4.377935473201862e-05, + "loss": 4.7, + "step": 38588 + }, + { + "epoch": 0.22949971453040252, + "grad_norm": 1.5192108154296875, + "learning_rate": 4.377904639511192e-05, + "loss": 4.7572, + "step": 38589 + }, + { + "epoch": 0.2295056618136835, + "grad_norm": 1.4428250789642334, + "learning_rate": 4.3778738051649656e-05, + "loss": 4.6175, + "step": 38590 + }, + { + "epoch": 0.2295116090969645, + "grad_norm": 1.589508056640625, + "learning_rate": 4.3778429701631915e-05, + "loss": 4.6856, + "step": 38591 + }, + { + "epoch": 0.2295175563802455, + "grad_norm": 1.4473097324371338, + "learning_rate": 4.377812134505883e-05, + "loss": 4.7122, + "step": 38592 + }, + { + "epoch": 0.2295235036635265, + "grad_norm": 1.5862704515457153, + "learning_rate": 4.377781298193049e-05, + "loss": 4.6001, + "step": 38593 + }, + { + "epoch": 0.2295294509468075, + "grad_norm": 1.5695568323135376, + "learning_rate": 4.377750461224702e-05, + "loss": 4.5452, + "step": 38594 + }, + { + "epoch": 0.2295353982300885, + "grad_norm": 1.5204521417617798, + "learning_rate": 4.3777196236008506e-05, + "loss": 4.7466, + "step": 38595 + }, + { + "epoch": 0.2295413455133695, + "grad_norm": 1.4180539846420288, + "learning_rate": 4.377688785321507e-05, + "loss": 4.583, + "step": 38596 + }, + { + "epoch": 0.22954729279665048, + "grad_norm": 1.6071792840957642, + "learning_rate": 4.377657946386682e-05, + "loss": 5.0753, + "step": 38597 + }, + { + "epoch": 0.2295532400799315, + "grad_norm": 1.9293522834777832, + "learning_rate": 4.3776271067963854e-05, + "loss": 5.2675, + "step": 38598 + }, + { + "epoch": 0.22955918736321249, + "grad_norm": 1.8490512371063232, + "learning_rate": 4.377596266550629e-05, + "loss": 5.1265, + "step": 38599 + }, + { + "epoch": 0.22956513464649347, + "grad_norm": 1.7723667621612549, + "learning_rate": 4.3775654256494223e-05, + "loss": 4.8487, + "step": 38600 + }, + { + "epoch": 0.2295710819297745, + "grad_norm": 1.6557354927062988, + "learning_rate": 4.377534584092778e-05, + "loss": 4.986, + "step": 38601 + }, + { + "epoch": 0.22957702921305548, + "grad_norm": 1.9539614915847778, + "learning_rate": 4.377503741880705e-05, + "loss": 3.9877, + "step": 38602 + }, + { + "epoch": 0.22958297649633647, + "grad_norm": 1.7153127193450928, + "learning_rate": 4.377472899013216e-05, + "loss": 4.1697, + "step": 38603 + }, + { + "epoch": 0.22958892377961748, + "grad_norm": 1.461155652999878, + "learning_rate": 4.37744205549032e-05, + "loss": 4.7448, + "step": 38604 + }, + { + "epoch": 0.22959487106289847, + "grad_norm": 1.764017105102539, + "learning_rate": 4.377411211312027e-05, + "loss": 4.9356, + "step": 38605 + }, + { + "epoch": 0.22960081834617946, + "grad_norm": 1.632539987564087, + "learning_rate": 4.377380366478351e-05, + "loss": 5.008, + "step": 38606 + }, + { + "epoch": 0.22960676562946047, + "grad_norm": 1.527861475944519, + "learning_rate": 4.3773495209893e-05, + "loss": 5.057, + "step": 38607 + }, + { + "epoch": 0.22961271291274146, + "grad_norm": 1.5240938663482666, + "learning_rate": 4.377318674844886e-05, + "loss": 4.9921, + "step": 38608 + }, + { + "epoch": 0.22961866019602245, + "grad_norm": 1.7693698406219482, + "learning_rate": 4.377287828045119e-05, + "loss": 4.7684, + "step": 38609 + }, + { + "epoch": 0.22962460747930347, + "grad_norm": 1.8743075132369995, + "learning_rate": 4.377256980590011e-05, + "loss": 4.9417, + "step": 38610 + }, + { + "epoch": 0.22963055476258445, + "grad_norm": 1.7317231893539429, + "learning_rate": 4.377226132479571e-05, + "loss": 5.4715, + "step": 38611 + }, + { + "epoch": 0.22963650204586544, + "grad_norm": 1.7278715372085571, + "learning_rate": 4.377195283713812e-05, + "loss": 5.4851, + "step": 38612 + }, + { + "epoch": 0.22964244932914646, + "grad_norm": 1.6612467765808105, + "learning_rate": 4.377164434292743e-05, + "loss": 5.6315, + "step": 38613 + }, + { + "epoch": 0.22964839661242745, + "grad_norm": 1.584639549255371, + "learning_rate": 4.377133584216375e-05, + "loss": 5.4698, + "step": 38614 + }, + { + "epoch": 0.22965434389570843, + "grad_norm": 1.4986923933029175, + "learning_rate": 4.377102733484719e-05, + "loss": 4.8665, + "step": 38615 + }, + { + "epoch": 0.22966029117898945, + "grad_norm": 1.7153973579406738, + "learning_rate": 4.377071882097786e-05, + "loss": 4.8084, + "step": 38616 + }, + { + "epoch": 0.22966623846227044, + "grad_norm": 1.4490904808044434, + "learning_rate": 4.377041030055586e-05, + "loss": 5.0178, + "step": 38617 + }, + { + "epoch": 0.22967218574555143, + "grad_norm": 1.4377447366714478, + "learning_rate": 4.377010177358132e-05, + "loss": 5.4579, + "step": 38618 + }, + { + "epoch": 0.22967813302883244, + "grad_norm": 1.5415947437286377, + "learning_rate": 4.376979324005432e-05, + "loss": 5.3875, + "step": 38619 + }, + { + "epoch": 0.22968408031211343, + "grad_norm": 1.3691684007644653, + "learning_rate": 4.3769484699974974e-05, + "loss": 5.4255, + "step": 38620 + }, + { + "epoch": 0.22969002759539442, + "grad_norm": 1.26857328414917, + "learning_rate": 4.3769176153343404e-05, + "loss": 5.4, + "step": 38621 + }, + { + "epoch": 0.2296959748786754, + "grad_norm": 1.210349440574646, + "learning_rate": 4.376886760015971e-05, + "loss": 5.4278, + "step": 38622 + }, + { + "epoch": 0.22970192216195642, + "grad_norm": 1.2842780351638794, + "learning_rate": 4.376855904042399e-05, + "loss": 5.3804, + "step": 38623 + }, + { + "epoch": 0.2297078694452374, + "grad_norm": 1.165520429611206, + "learning_rate": 4.3768250474136364e-05, + "loss": 5.3628, + "step": 38624 + }, + { + "epoch": 0.2297138167285184, + "grad_norm": 1.4088956117630005, + "learning_rate": 4.376794190129694e-05, + "loss": 5.3132, + "step": 38625 + }, + { + "epoch": 0.22971976401179942, + "grad_norm": 1.6567013263702393, + "learning_rate": 4.376763332190581e-05, + "loss": 5.3518, + "step": 38626 + }, + { + "epoch": 0.2297257112950804, + "grad_norm": 1.6979491710662842, + "learning_rate": 4.37673247359631e-05, + "loss": 5.1141, + "step": 38627 + }, + { + "epoch": 0.2297316585783614, + "grad_norm": 1.5642043352127075, + "learning_rate": 4.3767016143468916e-05, + "loss": 5.0352, + "step": 38628 + }, + { + "epoch": 0.2297376058616424, + "grad_norm": 2.544403076171875, + "learning_rate": 4.3766707544423354e-05, + "loss": 4.7447, + "step": 38629 + }, + { + "epoch": 0.2297435531449234, + "grad_norm": 4.404366493225098, + "learning_rate": 4.376639893882653e-05, + "loss": 3.9793, + "step": 38630 + }, + { + "epoch": 0.22974950042820438, + "grad_norm": 2.5350682735443115, + "learning_rate": 4.3766090326678545e-05, + "loss": 4.7711, + "step": 38631 + }, + { + "epoch": 0.2297554477114854, + "grad_norm": 2.2797350883483887, + "learning_rate": 4.376578170797952e-05, + "loss": 4.9254, + "step": 38632 + }, + { + "epoch": 0.2297613949947664, + "grad_norm": 1.9921247959136963, + "learning_rate": 4.3765473082729544e-05, + "loss": 4.9098, + "step": 38633 + }, + { + "epoch": 0.22976734227804738, + "grad_norm": 2.9996330738067627, + "learning_rate": 4.3765164450928744e-05, + "loss": 2.6739, + "step": 38634 + }, + { + "epoch": 0.2297732895613284, + "grad_norm": 2.8430988788604736, + "learning_rate": 4.376485581257722e-05, + "loss": 2.8912, + "step": 38635 + }, + { + "epoch": 0.22977923684460938, + "grad_norm": 1.6286189556121826, + "learning_rate": 4.376454716767508e-05, + "loss": 5.0859, + "step": 38636 + }, + { + "epoch": 0.22978518412789037, + "grad_norm": 2.431854248046875, + "learning_rate": 4.376423851622242e-05, + "loss": 3.4044, + "step": 38637 + }, + { + "epoch": 0.22979113141117138, + "grad_norm": 2.651479482650757, + "learning_rate": 4.376392985821937e-05, + "loss": 3.5497, + "step": 38638 + }, + { + "epoch": 0.22979707869445237, + "grad_norm": 2.285003185272217, + "learning_rate": 4.376362119366601e-05, + "loss": 3.5145, + "step": 38639 + }, + { + "epoch": 0.22980302597773336, + "grad_norm": 2.3178820610046387, + "learning_rate": 4.3763312522562483e-05, + "loss": 3.3635, + "step": 38640 + }, + { + "epoch": 0.22980897326101438, + "grad_norm": 1.6898589134216309, + "learning_rate": 4.376300384490887e-05, + "loss": 4.376, + "step": 38641 + }, + { + "epoch": 0.22981492054429536, + "grad_norm": 1.586006999015808, + "learning_rate": 4.376269516070528e-05, + "loss": 5.0728, + "step": 38642 + }, + { + "epoch": 0.22982086782757635, + "grad_norm": 1.542847752571106, + "learning_rate": 4.376238646995183e-05, + "loss": 5.0111, + "step": 38643 + }, + { + "epoch": 0.22982681511085737, + "grad_norm": 1.7499191761016846, + "learning_rate": 4.376207777264863e-05, + "loss": 5.2223, + "step": 38644 + }, + { + "epoch": 0.22983276239413836, + "grad_norm": 1.7189160585403442, + "learning_rate": 4.376176906879578e-05, + "loss": 4.733, + "step": 38645 + }, + { + "epoch": 0.22983870967741934, + "grad_norm": 1.8861480951309204, + "learning_rate": 4.3761460358393384e-05, + "loss": 4.5083, + "step": 38646 + }, + { + "epoch": 0.22984465696070036, + "grad_norm": 2.286637783050537, + "learning_rate": 4.376115164144157e-05, + "loss": 5.2219, + "step": 38647 + }, + { + "epoch": 0.22985060424398135, + "grad_norm": 1.623051404953003, + "learning_rate": 4.376084291794042e-05, + "loss": 5.4874, + "step": 38648 + }, + { + "epoch": 0.22985655152726234, + "grad_norm": 1.6066075563430786, + "learning_rate": 4.3760534187890056e-05, + "loss": 5.3106, + "step": 38649 + }, + { + "epoch": 0.22986249881054335, + "grad_norm": 1.554235816001892, + "learning_rate": 4.376022545129058e-05, + "loss": 5.118, + "step": 38650 + }, + { + "epoch": 0.22986844609382434, + "grad_norm": 1.5623208284378052, + "learning_rate": 4.375991670814211e-05, + "loss": 5.1645, + "step": 38651 + }, + { + "epoch": 0.22987439337710533, + "grad_norm": 1.68990159034729, + "learning_rate": 4.375960795844474e-05, + "loss": 5.1204, + "step": 38652 + }, + { + "epoch": 0.22988034066038635, + "grad_norm": 2.6499814987182617, + "learning_rate": 4.3759299202198586e-05, + "loss": 3.8581, + "step": 38653 + }, + { + "epoch": 0.22988628794366733, + "grad_norm": 1.704318642616272, + "learning_rate": 4.375899043940376e-05, + "loss": 4.6711, + "step": 38654 + }, + { + "epoch": 0.22989223522694832, + "grad_norm": 1.5703479051589966, + "learning_rate": 4.375868167006036e-05, + "loss": 4.7781, + "step": 38655 + }, + { + "epoch": 0.22989818251022934, + "grad_norm": 1.4788894653320312, + "learning_rate": 4.3758372894168496e-05, + "loss": 5.6499, + "step": 38656 + }, + { + "epoch": 0.22990412979351033, + "grad_norm": 1.5116046667099, + "learning_rate": 4.3758064111728286e-05, + "loss": 5.6359, + "step": 38657 + }, + { + "epoch": 0.2299100770767913, + "grad_norm": 1.4817161560058594, + "learning_rate": 4.375775532273982e-05, + "loss": 5.4063, + "step": 38658 + }, + { + "epoch": 0.22991602436007233, + "grad_norm": 1.7036468982696533, + "learning_rate": 4.375744652720323e-05, + "loss": 5.4815, + "step": 38659 + }, + { + "epoch": 0.22992197164335332, + "grad_norm": 1.6927287578582764, + "learning_rate": 4.375713772511859e-05, + "loss": 4.5907, + "step": 38660 + }, + { + "epoch": 0.2299279189266343, + "grad_norm": 1.5966923236846924, + "learning_rate": 4.3756828916486035e-05, + "loss": 4.4788, + "step": 38661 + }, + { + "epoch": 0.22993386620991532, + "grad_norm": 1.7133103609085083, + "learning_rate": 4.375652010130567e-05, + "loss": 5.1275, + "step": 38662 + }, + { + "epoch": 0.2299398134931963, + "grad_norm": 1.5442966222763062, + "learning_rate": 4.37562112795776e-05, + "loss": 5.0381, + "step": 38663 + }, + { + "epoch": 0.2299457607764773, + "grad_norm": 1.5008933544158936, + "learning_rate": 4.3755902451301924e-05, + "loss": 5.1612, + "step": 38664 + }, + { + "epoch": 0.22995170805975831, + "grad_norm": 1.7190736532211304, + "learning_rate": 4.375559361647875e-05, + "loss": 5.0969, + "step": 38665 + }, + { + "epoch": 0.2299576553430393, + "grad_norm": 1.5287115573883057, + "learning_rate": 4.3755284775108195e-05, + "loss": 5.1347, + "step": 38666 + }, + { + "epoch": 0.2299636026263203, + "grad_norm": 1.6138951778411865, + "learning_rate": 4.375497592719037e-05, + "loss": 5.0052, + "step": 38667 + }, + { + "epoch": 0.2299695499096013, + "grad_norm": 2.543833017349243, + "learning_rate": 4.3754667072725374e-05, + "loss": 3.6834, + "step": 38668 + }, + { + "epoch": 0.2299754971928823, + "grad_norm": 3.930359363555908, + "learning_rate": 4.375435821171332e-05, + "loss": 1.4384, + "step": 38669 + }, + { + "epoch": 0.22998144447616328, + "grad_norm": 3.1169984340667725, + "learning_rate": 4.375404934415431e-05, + "loss": 1.8578, + "step": 38670 + }, + { + "epoch": 0.2299873917594443, + "grad_norm": 4.075973033905029, + "learning_rate": 4.3753740470048456e-05, + "loss": 1.2247, + "step": 38671 + }, + { + "epoch": 0.2299933390427253, + "grad_norm": 2.6648190021514893, + "learning_rate": 4.375343158939587e-05, + "loss": 1.079, + "step": 38672 + }, + { + "epoch": 0.22999928632600627, + "grad_norm": 2.488210916519165, + "learning_rate": 4.375312270219665e-05, + "loss": 1.0368, + "step": 38673 + }, + { + "epoch": 0.2300052336092873, + "grad_norm": 3.6227705478668213, + "learning_rate": 4.3752813808450896e-05, + "loss": 1.4294, + "step": 38674 + }, + { + "epoch": 0.23001118089256828, + "grad_norm": 4.386226654052734, + "learning_rate": 4.3752504908158744e-05, + "loss": 2.0833, + "step": 38675 + }, + { + "epoch": 0.23001712817584927, + "grad_norm": 3.764488697052002, + "learning_rate": 4.375219600132029e-05, + "loss": 1.6208, + "step": 38676 + }, + { + "epoch": 0.23002307545913028, + "grad_norm": 2.4590156078338623, + "learning_rate": 4.375188708793563e-05, + "loss": 0.9148, + "step": 38677 + }, + { + "epoch": 0.23002902274241127, + "grad_norm": 4.069763660430908, + "learning_rate": 4.3751578168004875e-05, + "loss": 2.3122, + "step": 38678 + }, + { + "epoch": 0.23003497002569226, + "grad_norm": 5.092566967010498, + "learning_rate": 4.375126924152815e-05, + "loss": 1.8317, + "step": 38679 + }, + { + "epoch": 0.23004091730897325, + "grad_norm": 3.4340686798095703, + "learning_rate": 4.375096030850554e-05, + "loss": 1.6576, + "step": 38680 + }, + { + "epoch": 0.23004686459225426, + "grad_norm": 2.129098653793335, + "learning_rate": 4.375065136893717e-05, + "loss": 4.0079, + "step": 38681 + }, + { + "epoch": 0.23005281187553525, + "grad_norm": 2.533949851989746, + "learning_rate": 4.3750342422823135e-05, + "loss": 4.1513, + "step": 38682 + }, + { + "epoch": 0.23005875915881624, + "grad_norm": 1.7268353700637817, + "learning_rate": 4.375003347016356e-05, + "loss": 4.2263, + "step": 38683 + }, + { + "epoch": 0.23006470644209726, + "grad_norm": 1.9843735694885254, + "learning_rate": 4.3749724510958535e-05, + "loss": 4.9357, + "step": 38684 + }, + { + "epoch": 0.23007065372537824, + "grad_norm": 1.983786702156067, + "learning_rate": 4.374941554520817e-05, + "loss": 4.569, + "step": 38685 + }, + { + "epoch": 0.23007660100865923, + "grad_norm": 1.839876651763916, + "learning_rate": 4.374910657291258e-05, + "loss": 4.8968, + "step": 38686 + }, + { + "epoch": 0.23008254829194025, + "grad_norm": 1.591556429862976, + "learning_rate": 4.374879759407188e-05, + "loss": 4.809, + "step": 38687 + }, + { + "epoch": 0.23008849557522124, + "grad_norm": 1.8136730194091797, + "learning_rate": 4.374848860868615e-05, + "loss": 5.2378, + "step": 38688 + }, + { + "epoch": 0.23009444285850222, + "grad_norm": 1.4922617673873901, + "learning_rate": 4.374817961675553e-05, + "loss": 5.4901, + "step": 38689 + }, + { + "epoch": 0.23010039014178324, + "grad_norm": 1.5398420095443726, + "learning_rate": 4.374787061828012e-05, + "loss": 5.277, + "step": 38690 + }, + { + "epoch": 0.23010633742506423, + "grad_norm": 1.469018578529358, + "learning_rate": 4.3747561613260013e-05, + "loss": 5.2663, + "step": 38691 + }, + { + "epoch": 0.23011228470834522, + "grad_norm": 1.3555761575698853, + "learning_rate": 4.374725260169533e-05, + "loss": 5.3116, + "step": 38692 + }, + { + "epoch": 0.23011823199162623, + "grad_norm": 1.5147504806518555, + "learning_rate": 4.3746943583586175e-05, + "loss": 5.262, + "step": 38693 + }, + { + "epoch": 0.23012417927490722, + "grad_norm": 1.3536839485168457, + "learning_rate": 4.3746634558932646e-05, + "loss": 5.1616, + "step": 38694 + }, + { + "epoch": 0.2301301265581882, + "grad_norm": 1.3796242475509644, + "learning_rate": 4.374632552773487e-05, + "loss": 5.1077, + "step": 38695 + }, + { + "epoch": 0.23013607384146922, + "grad_norm": 1.4209028482437134, + "learning_rate": 4.374601648999295e-05, + "loss": 5.2542, + "step": 38696 + }, + { + "epoch": 0.2301420211247502, + "grad_norm": 1.4143218994140625, + "learning_rate": 4.374570744570697e-05, + "loss": 5.026, + "step": 38697 + }, + { + "epoch": 0.2301479684080312, + "grad_norm": 1.1739543676376343, + "learning_rate": 4.3745398394877074e-05, + "loss": 5.0517, + "step": 38698 + }, + { + "epoch": 0.23015391569131222, + "grad_norm": 1.769179105758667, + "learning_rate": 4.374508933750335e-05, + "loss": 5.3818, + "step": 38699 + }, + { + "epoch": 0.2301598629745932, + "grad_norm": 1.3909661769866943, + "learning_rate": 4.37447802735859e-05, + "loss": 4.9604, + "step": 38700 + }, + { + "epoch": 0.2301658102578742, + "grad_norm": 1.6927801370620728, + "learning_rate": 4.374447120312486e-05, + "loss": 4.9461, + "step": 38701 + }, + { + "epoch": 0.2301717575411552, + "grad_norm": 1.735437273979187, + "learning_rate": 4.37441621261203e-05, + "loss": 4.9774, + "step": 38702 + }, + { + "epoch": 0.2301777048244362, + "grad_norm": 1.732106328010559, + "learning_rate": 4.3743853042572355e-05, + "loss": 4.7804, + "step": 38703 + }, + { + "epoch": 0.23018365210771718, + "grad_norm": 1.686942219734192, + "learning_rate": 4.3743543952481126e-05, + "loss": 4.8964, + "step": 38704 + }, + { + "epoch": 0.2301895993909982, + "grad_norm": 1.6482768058776855, + "learning_rate": 4.3743234855846716e-05, + "loss": 4.8313, + "step": 38705 + }, + { + "epoch": 0.2301955466742792, + "grad_norm": 1.6313527822494507, + "learning_rate": 4.374292575266924e-05, + "loss": 4.9976, + "step": 38706 + }, + { + "epoch": 0.23020149395756018, + "grad_norm": 1.6183964014053345, + "learning_rate": 4.3742616642948796e-05, + "loss": 5.4597, + "step": 38707 + }, + { + "epoch": 0.2302074412408412, + "grad_norm": 1.8126240968704224, + "learning_rate": 4.37423075266855e-05, + "loss": 5.1256, + "step": 38708 + }, + { + "epoch": 0.23021338852412218, + "grad_norm": 1.8139927387237549, + "learning_rate": 4.374199840387946e-05, + "loss": 5.239, + "step": 38709 + }, + { + "epoch": 0.23021933580740317, + "grad_norm": 1.6314213275909424, + "learning_rate": 4.374168927453078e-05, + "loss": 5.0193, + "step": 38710 + }, + { + "epoch": 0.23022528309068419, + "grad_norm": 1.8324249982833862, + "learning_rate": 4.3741380138639574e-05, + "loss": 4.9343, + "step": 38711 + }, + { + "epoch": 0.23023123037396517, + "grad_norm": 1.4922306537628174, + "learning_rate": 4.3741070996205944e-05, + "loss": 4.9846, + "step": 38712 + }, + { + "epoch": 0.23023717765724616, + "grad_norm": 1.712268352508545, + "learning_rate": 4.374076184723e-05, + "loss": 4.5509, + "step": 38713 + }, + { + "epoch": 0.23024312494052718, + "grad_norm": 1.872574806213379, + "learning_rate": 4.374045269171185e-05, + "loss": 4.6417, + "step": 38714 + }, + { + "epoch": 0.23024907222380817, + "grad_norm": 1.7834235429763794, + "learning_rate": 4.37401435296516e-05, + "loss": 4.4271, + "step": 38715 + }, + { + "epoch": 0.23025501950708915, + "grad_norm": 2.986769437789917, + "learning_rate": 4.3739834361049356e-05, + "loss": 1.9502, + "step": 38716 + }, + { + "epoch": 0.23026096679037017, + "grad_norm": 1.4724233150482178, + "learning_rate": 4.373952518590524e-05, + "loss": 4.7217, + "step": 38717 + }, + { + "epoch": 0.23026691407365116, + "grad_norm": 1.3835045099258423, + "learning_rate": 4.373921600421934e-05, + "loss": 4.5464, + "step": 38718 + }, + { + "epoch": 0.23027286135693215, + "grad_norm": 1.334065318107605, + "learning_rate": 4.373890681599178e-05, + "loss": 4.553, + "step": 38719 + }, + { + "epoch": 0.23027880864021316, + "grad_norm": 1.5014736652374268, + "learning_rate": 4.373859762122266e-05, + "loss": 4.638, + "step": 38720 + }, + { + "epoch": 0.23028475592349415, + "grad_norm": 1.645545244216919, + "learning_rate": 4.373828841991208e-05, + "loss": 4.4559, + "step": 38721 + }, + { + "epoch": 0.23029070320677514, + "grad_norm": 1.8344944715499878, + "learning_rate": 4.373797921206016e-05, + "loss": 4.0921, + "step": 38722 + }, + { + "epoch": 0.23029665049005615, + "grad_norm": 1.7468883991241455, + "learning_rate": 4.373766999766701e-05, + "loss": 4.4421, + "step": 38723 + }, + { + "epoch": 0.23030259777333714, + "grad_norm": 1.9306972026824951, + "learning_rate": 4.3737360776732726e-05, + "loss": 4.0814, + "step": 38724 + }, + { + "epoch": 0.23030854505661813, + "grad_norm": 1.600232481956482, + "learning_rate": 4.3737051549257435e-05, + "loss": 4.1041, + "step": 38725 + }, + { + "epoch": 0.23031449233989915, + "grad_norm": 1.8883020877838135, + "learning_rate": 4.373674231524123e-05, + "loss": 3.8982, + "step": 38726 + }, + { + "epoch": 0.23032043962318013, + "grad_norm": 1.8079781532287598, + "learning_rate": 4.373643307468421e-05, + "loss": 3.8882, + "step": 38727 + }, + { + "epoch": 0.23032638690646112, + "grad_norm": 1.4945000410079956, + "learning_rate": 4.3736123827586506e-05, + "loss": 4.7151, + "step": 38728 + }, + { + "epoch": 0.23033233418974214, + "grad_norm": 2.077974319458008, + "learning_rate": 4.373581457394821e-05, + "loss": 4.167, + "step": 38729 + }, + { + "epoch": 0.23033828147302313, + "grad_norm": 1.9432013034820557, + "learning_rate": 4.3735505313769446e-05, + "loss": 4.0149, + "step": 38730 + }, + { + "epoch": 0.23034422875630411, + "grad_norm": 2.184173107147217, + "learning_rate": 4.37351960470503e-05, + "loss": 4.3668, + "step": 38731 + }, + { + "epoch": 0.23035017603958513, + "grad_norm": 3.3158299922943115, + "learning_rate": 4.3734886773790885e-05, + "loss": 3.8251, + "step": 38732 + }, + { + "epoch": 0.23035612332286612, + "grad_norm": 2.032789468765259, + "learning_rate": 4.3734577493991326e-05, + "loss": 3.38, + "step": 38733 + }, + { + "epoch": 0.2303620706061471, + "grad_norm": 2.079367160797119, + "learning_rate": 4.3734268207651704e-05, + "loss": 3.5598, + "step": 38734 + }, + { + "epoch": 0.23036801788942812, + "grad_norm": 2.6133997440338135, + "learning_rate": 4.373395891477216e-05, + "loss": 3.5329, + "step": 38735 + }, + { + "epoch": 0.2303739651727091, + "grad_norm": 1.6688917875289917, + "learning_rate": 4.373364961535278e-05, + "loss": 4.4199, + "step": 38736 + }, + { + "epoch": 0.2303799124559901, + "grad_norm": 1.7220234870910645, + "learning_rate": 4.373334030939367e-05, + "loss": 4.3154, + "step": 38737 + }, + { + "epoch": 0.2303858597392711, + "grad_norm": 1.7266229391098022, + "learning_rate": 4.3733030996894954e-05, + "loss": 4.5222, + "step": 38738 + }, + { + "epoch": 0.2303918070225521, + "grad_norm": 1.653295636177063, + "learning_rate": 4.373272167785672e-05, + "loss": 4.4132, + "step": 38739 + }, + { + "epoch": 0.2303977543058331, + "grad_norm": 1.6252208948135376, + "learning_rate": 4.373241235227909e-05, + "loss": 4.4224, + "step": 38740 + }, + { + "epoch": 0.23040370158911408, + "grad_norm": 1.8031059503555298, + "learning_rate": 4.373210302016217e-05, + "loss": 4.3607, + "step": 38741 + }, + { + "epoch": 0.2304096488723951, + "grad_norm": 1.5618962049484253, + "learning_rate": 4.3731793681506075e-05, + "loss": 4.3514, + "step": 38742 + }, + { + "epoch": 0.23041559615567608, + "grad_norm": 1.519148588180542, + "learning_rate": 4.373148433631089e-05, + "loss": 4.7573, + "step": 38743 + }, + { + "epoch": 0.23042154343895707, + "grad_norm": 1.2487481832504272, + "learning_rate": 4.373117498457675e-05, + "loss": 4.6323, + "step": 38744 + }, + { + "epoch": 0.2304274907222381, + "grad_norm": 1.4358158111572266, + "learning_rate": 4.373086562630374e-05, + "loss": 4.5298, + "step": 38745 + }, + { + "epoch": 0.23043343800551908, + "grad_norm": 1.5527933835983276, + "learning_rate": 4.373055626149198e-05, + "loss": 4.5041, + "step": 38746 + }, + { + "epoch": 0.23043938528880006, + "grad_norm": 1.6646860837936401, + "learning_rate": 4.373024689014158e-05, + "loss": 4.5401, + "step": 38747 + }, + { + "epoch": 0.23044533257208108, + "grad_norm": 1.8804651498794556, + "learning_rate": 4.372993751225264e-05, + "loss": 4.3742, + "step": 38748 + }, + { + "epoch": 0.23045127985536207, + "grad_norm": 1.8429540395736694, + "learning_rate": 4.372962812782527e-05, + "loss": 4.2555, + "step": 38749 + }, + { + "epoch": 0.23045722713864306, + "grad_norm": 1.473212480545044, + "learning_rate": 4.372931873685959e-05, + "loss": 5.2383, + "step": 38750 + }, + { + "epoch": 0.23046317442192407, + "grad_norm": 1.712003231048584, + "learning_rate": 4.372900933935569e-05, + "loss": 4.3601, + "step": 38751 + }, + { + "epoch": 0.23046912170520506, + "grad_norm": 1.7229880094528198, + "learning_rate": 4.3728699935313687e-05, + "loss": 4.5068, + "step": 38752 + }, + { + "epoch": 0.23047506898848605, + "grad_norm": 1.782601237297058, + "learning_rate": 4.37283905247337e-05, + "loss": 4.1018, + "step": 38753 + }, + { + "epoch": 0.23048101627176706, + "grad_norm": 2.0935075283050537, + "learning_rate": 4.3728081107615814e-05, + "loss": 3.8254, + "step": 38754 + }, + { + "epoch": 0.23048696355504805, + "grad_norm": 1.812392234802246, + "learning_rate": 4.372777168396015e-05, + "loss": 4.1183, + "step": 38755 + }, + { + "epoch": 0.23049291083832904, + "grad_norm": 1.8439925909042358, + "learning_rate": 4.3727462253766816e-05, + "loss": 4.3258, + "step": 38756 + }, + { + "epoch": 0.23049885812161006, + "grad_norm": 1.7139822244644165, + "learning_rate": 4.372715281703592e-05, + "loss": 4.1782, + "step": 38757 + }, + { + "epoch": 0.23050480540489104, + "grad_norm": 1.7018375396728516, + "learning_rate": 4.372684337376756e-05, + "loss": 4.3934, + "step": 38758 + }, + { + "epoch": 0.23051075268817203, + "grad_norm": 1.8785852193832397, + "learning_rate": 4.3726533923961854e-05, + "loss": 4.0242, + "step": 38759 + }, + { + "epoch": 0.23051669997145305, + "grad_norm": 1.8708945512771606, + "learning_rate": 4.372622446761891e-05, + "loss": 4.2316, + "step": 38760 + }, + { + "epoch": 0.23052264725473404, + "grad_norm": 1.5988284349441528, + "learning_rate": 4.3725915004738846e-05, + "loss": 4.2701, + "step": 38761 + }, + { + "epoch": 0.23052859453801502, + "grad_norm": 1.6898179054260254, + "learning_rate": 4.372560553532175e-05, + "loss": 4.1239, + "step": 38762 + }, + { + "epoch": 0.23053454182129604, + "grad_norm": 2.0045530796051025, + "learning_rate": 4.3725296059367735e-05, + "loss": 3.9543, + "step": 38763 + }, + { + "epoch": 0.23054048910457703, + "grad_norm": 2.071551561355591, + "learning_rate": 4.372498657687691e-05, + "loss": 4.0462, + "step": 38764 + }, + { + "epoch": 0.23054643638785802, + "grad_norm": 2.0475335121154785, + "learning_rate": 4.3724677087849394e-05, + "loss": 4.2556, + "step": 38765 + }, + { + "epoch": 0.23055238367113903, + "grad_norm": 1.7819331884384155, + "learning_rate": 4.372436759228529e-05, + "loss": 4.4156, + "step": 38766 + }, + { + "epoch": 0.23055833095442002, + "grad_norm": 1.7513604164123535, + "learning_rate": 4.372405809018469e-05, + "loss": 4.6457, + "step": 38767 + }, + { + "epoch": 0.230564278237701, + "grad_norm": 1.6513689756393433, + "learning_rate": 4.372374858154773e-05, + "loss": 4.8633, + "step": 38768 + }, + { + "epoch": 0.23057022552098203, + "grad_norm": 1.607474684715271, + "learning_rate": 4.372343906637449e-05, + "loss": 4.9432, + "step": 38769 + }, + { + "epoch": 0.230576172804263, + "grad_norm": 1.624382734298706, + "learning_rate": 4.3723129544665097e-05, + "loss": 5.0585, + "step": 38770 + }, + { + "epoch": 0.230582120087544, + "grad_norm": 1.963037133216858, + "learning_rate": 4.3722820016419655e-05, + "loss": 4.6664, + "step": 38771 + }, + { + "epoch": 0.23058806737082502, + "grad_norm": 2.7608580589294434, + "learning_rate": 4.372251048163827e-05, + "loss": 4.6565, + "step": 38772 + }, + { + "epoch": 0.230594014654106, + "grad_norm": 2.3045732975006104, + "learning_rate": 4.372220094032104e-05, + "loss": 4.3961, + "step": 38773 + }, + { + "epoch": 0.230599961937387, + "grad_norm": 1.717413067817688, + "learning_rate": 4.372189139246809e-05, + "loss": 4.6134, + "step": 38774 + }, + { + "epoch": 0.230605909220668, + "grad_norm": 1.5653536319732666, + "learning_rate": 4.372158183807952e-05, + "loss": 4.6429, + "step": 38775 + }, + { + "epoch": 0.230611856503949, + "grad_norm": 1.5364784002304077, + "learning_rate": 4.372127227715544e-05, + "loss": 4.8575, + "step": 38776 + }, + { + "epoch": 0.23061780378722999, + "grad_norm": 1.5625269412994385, + "learning_rate": 4.372096270969595e-05, + "loss": 4.7371, + "step": 38777 + }, + { + "epoch": 0.230623751070511, + "grad_norm": 1.5205355882644653, + "learning_rate": 4.3720653135701185e-05, + "loss": 4.6553, + "step": 38778 + }, + { + "epoch": 0.230629698353792, + "grad_norm": 1.3826833963394165, + "learning_rate": 4.372034355517122e-05, + "loss": 4.5907, + "step": 38779 + }, + { + "epoch": 0.23063564563707298, + "grad_norm": 1.336030125617981, + "learning_rate": 4.3720033968106175e-05, + "loss": 4.4755, + "step": 38780 + }, + { + "epoch": 0.230641592920354, + "grad_norm": 1.2729898691177368, + "learning_rate": 4.371972437450616e-05, + "loss": 4.4421, + "step": 38781 + }, + { + "epoch": 0.23064754020363498, + "grad_norm": 1.8722045421600342, + "learning_rate": 4.371941477437128e-05, + "loss": 4.3859, + "step": 38782 + }, + { + "epoch": 0.23065348748691597, + "grad_norm": 1.5908498764038086, + "learning_rate": 4.3719105167701654e-05, + "loss": 4.4019, + "step": 38783 + }, + { + "epoch": 0.23065943477019699, + "grad_norm": 2.3267743587493896, + "learning_rate": 4.3718795554497385e-05, + "loss": 3.559, + "step": 38784 + }, + { + "epoch": 0.23066538205347797, + "grad_norm": 2.3056678771972656, + "learning_rate": 4.371848593475856e-05, + "loss": 3.5686, + "step": 38785 + }, + { + "epoch": 0.23067132933675896, + "grad_norm": 1.8726112842559814, + "learning_rate": 4.371817630848532e-05, + "loss": 5.0275, + "step": 38786 + }, + { + "epoch": 0.23067727662003998, + "grad_norm": 1.9857810735702515, + "learning_rate": 4.371786667567775e-05, + "loss": 4.9584, + "step": 38787 + }, + { + "epoch": 0.23068322390332097, + "grad_norm": 1.8844271898269653, + "learning_rate": 4.371755703633598e-05, + "loss": 4.0207, + "step": 38788 + }, + { + "epoch": 0.23068917118660195, + "grad_norm": 2.8062679767608643, + "learning_rate": 4.3717247390460095e-05, + "loss": 4.2182, + "step": 38789 + }, + { + "epoch": 0.23069511846988297, + "grad_norm": 2.765418529510498, + "learning_rate": 4.37169377380502e-05, + "loss": 4.2685, + "step": 38790 + }, + { + "epoch": 0.23070106575316396, + "grad_norm": 1.712620735168457, + "learning_rate": 4.371662807910643e-05, + "loss": 4.6177, + "step": 38791 + }, + { + "epoch": 0.23070701303644495, + "grad_norm": 1.7626475095748901, + "learning_rate": 4.371631841362888e-05, + "loss": 4.8108, + "step": 38792 + }, + { + "epoch": 0.23071296031972596, + "grad_norm": 1.7972310781478882, + "learning_rate": 4.371600874161765e-05, + "loss": 4.5635, + "step": 38793 + }, + { + "epoch": 0.23071890760300695, + "grad_norm": 2.1518964767456055, + "learning_rate": 4.3715699063072854e-05, + "loss": 5.0052, + "step": 38794 + }, + { + "epoch": 0.23072485488628794, + "grad_norm": 2.6726882457733154, + "learning_rate": 4.371538937799461e-05, + "loss": 3.2855, + "step": 38795 + }, + { + "epoch": 0.23073080216956893, + "grad_norm": 2.3370792865753174, + "learning_rate": 4.371507968638301e-05, + "loss": 3.358, + "step": 38796 + }, + { + "epoch": 0.23073674945284994, + "grad_norm": 1.9931254386901855, + "learning_rate": 4.371476998823817e-05, + "loss": 4.6237, + "step": 38797 + }, + { + "epoch": 0.23074269673613093, + "grad_norm": 1.9586458206176758, + "learning_rate": 4.371446028356019e-05, + "loss": 4.5883, + "step": 38798 + }, + { + "epoch": 0.23074864401941192, + "grad_norm": 2.060645341873169, + "learning_rate": 4.3714150572349194e-05, + "loss": 4.637, + "step": 38799 + }, + { + "epoch": 0.23075459130269294, + "grad_norm": 1.6580359935760498, + "learning_rate": 4.3713840854605284e-05, + "loss": 4.4867, + "step": 38800 + }, + { + "epoch": 0.23076053858597392, + "grad_norm": 1.6574506759643555, + "learning_rate": 4.3713531130328554e-05, + "loss": 4.5072, + "step": 38801 + }, + { + "epoch": 0.2307664858692549, + "grad_norm": 1.7564977407455444, + "learning_rate": 4.371322139951913e-05, + "loss": 4.4847, + "step": 38802 + }, + { + "epoch": 0.23077243315253593, + "grad_norm": 1.671775221824646, + "learning_rate": 4.371291166217712e-05, + "loss": 4.4913, + "step": 38803 + }, + { + "epoch": 0.23077838043581692, + "grad_norm": 1.7091946601867676, + "learning_rate": 4.371260191830261e-05, + "loss": 4.4429, + "step": 38804 + }, + { + "epoch": 0.2307843277190979, + "grad_norm": 1.5660812854766846, + "learning_rate": 4.371229216789574e-05, + "loss": 4.3089, + "step": 38805 + }, + { + "epoch": 0.23079027500237892, + "grad_norm": 1.6085116863250732, + "learning_rate": 4.3711982410956596e-05, + "loss": 4.4015, + "step": 38806 + }, + { + "epoch": 0.2307962222856599, + "grad_norm": 1.703705072402954, + "learning_rate": 4.3711672647485294e-05, + "loss": 4.1894, + "step": 38807 + }, + { + "epoch": 0.2308021695689409, + "grad_norm": 1.681896686553955, + "learning_rate": 4.371136287748193e-05, + "loss": 4.3658, + "step": 38808 + }, + { + "epoch": 0.2308081168522219, + "grad_norm": 1.5659677982330322, + "learning_rate": 4.371105310094664e-05, + "loss": 4.3422, + "step": 38809 + }, + { + "epoch": 0.2308140641355029, + "grad_norm": 1.5186078548431396, + "learning_rate": 4.3710743317879504e-05, + "loss": 4.0806, + "step": 38810 + }, + { + "epoch": 0.2308200114187839, + "grad_norm": 1.5934349298477173, + "learning_rate": 4.3710433528280644e-05, + "loss": 4.2978, + "step": 38811 + }, + { + "epoch": 0.2308259587020649, + "grad_norm": 1.951122522354126, + "learning_rate": 4.371012373215016e-05, + "loss": 5.2274, + "step": 38812 + }, + { + "epoch": 0.2308319059853459, + "grad_norm": 1.8681738376617432, + "learning_rate": 4.370981392948817e-05, + "loss": 4.4869, + "step": 38813 + }, + { + "epoch": 0.23083785326862688, + "grad_norm": 1.753472924232483, + "learning_rate": 4.3709504120294775e-05, + "loss": 4.3416, + "step": 38814 + }, + { + "epoch": 0.2308438005519079, + "grad_norm": 1.6463114023208618, + "learning_rate": 4.370919430457009e-05, + "loss": 4.3267, + "step": 38815 + }, + { + "epoch": 0.23084974783518888, + "grad_norm": 1.634188175201416, + "learning_rate": 4.3708884482314215e-05, + "loss": 4.2345, + "step": 38816 + }, + { + "epoch": 0.23085569511846987, + "grad_norm": 1.6019244194030762, + "learning_rate": 4.370857465352726e-05, + "loss": 4.249, + "step": 38817 + }, + { + "epoch": 0.2308616424017509, + "grad_norm": 1.555641531944275, + "learning_rate": 4.3708264818209335e-05, + "loss": 4.3064, + "step": 38818 + }, + { + "epoch": 0.23086758968503188, + "grad_norm": 1.6986194849014282, + "learning_rate": 4.3707954976360556e-05, + "loss": 4.2284, + "step": 38819 + }, + { + "epoch": 0.23087353696831286, + "grad_norm": 1.656322956085205, + "learning_rate": 4.370764512798101e-05, + "loss": 4.351, + "step": 38820 + }, + { + "epoch": 0.23087948425159388, + "grad_norm": 1.7241042852401733, + "learning_rate": 4.370733527307083e-05, + "loss": 4.3169, + "step": 38821 + }, + { + "epoch": 0.23088543153487487, + "grad_norm": 1.7275463342666626, + "learning_rate": 4.370702541163011e-05, + "loss": 4.4002, + "step": 38822 + }, + { + "epoch": 0.23089137881815586, + "grad_norm": 1.549770712852478, + "learning_rate": 4.3706715543658957e-05, + "loss": 4.0978, + "step": 38823 + }, + { + "epoch": 0.23089732610143687, + "grad_norm": 1.6060540676116943, + "learning_rate": 4.370640566915748e-05, + "loss": 4.435, + "step": 38824 + }, + { + "epoch": 0.23090327338471786, + "grad_norm": 1.6758986711502075, + "learning_rate": 4.3706095788125795e-05, + "loss": 4.4059, + "step": 38825 + }, + { + "epoch": 0.23090922066799885, + "grad_norm": 1.7185044288635254, + "learning_rate": 4.3705785900564e-05, + "loss": 4.3024, + "step": 38826 + }, + { + "epoch": 0.23091516795127986, + "grad_norm": 1.5720844268798828, + "learning_rate": 4.370547600647222e-05, + "loss": 4.4142, + "step": 38827 + }, + { + "epoch": 0.23092111523456085, + "grad_norm": 1.5926580429077148, + "learning_rate": 4.370516610585054e-05, + "loss": 4.3386, + "step": 38828 + }, + { + "epoch": 0.23092706251784184, + "grad_norm": 1.4387127161026, + "learning_rate": 4.3704856198699085e-05, + "loss": 4.2481, + "step": 38829 + }, + { + "epoch": 0.23093300980112286, + "grad_norm": 1.7872234582901, + "learning_rate": 4.3704546285017954e-05, + "loss": 4.1025, + "step": 38830 + }, + { + "epoch": 0.23093895708440385, + "grad_norm": 1.7201859951019287, + "learning_rate": 4.3704236364807264e-05, + "loss": 4.1159, + "step": 38831 + }, + { + "epoch": 0.23094490436768483, + "grad_norm": 1.7127646207809448, + "learning_rate": 4.370392643806712e-05, + "loss": 4.2173, + "step": 38832 + }, + { + "epoch": 0.23095085165096585, + "grad_norm": 1.982006549835205, + "learning_rate": 4.370361650479763e-05, + "loss": 4.1575, + "step": 38833 + }, + { + "epoch": 0.23095679893424684, + "grad_norm": 1.5777769088745117, + "learning_rate": 4.370330656499889e-05, + "loss": 4.0942, + "step": 38834 + }, + { + "epoch": 0.23096274621752783, + "grad_norm": 2.2118375301361084, + "learning_rate": 4.370299661867103e-05, + "loss": 3.961, + "step": 38835 + }, + { + "epoch": 0.23096869350080884, + "grad_norm": 2.4417662620544434, + "learning_rate": 4.3702686665814144e-05, + "loss": 3.5483, + "step": 38836 + }, + { + "epoch": 0.23097464078408983, + "grad_norm": 2.306452751159668, + "learning_rate": 4.3702376706428335e-05, + "loss": 4.0667, + "step": 38837 + }, + { + "epoch": 0.23098058806737082, + "grad_norm": 2.5894603729248047, + "learning_rate": 4.3702066740513726e-05, + "loss": 3.7923, + "step": 38838 + }, + { + "epoch": 0.23098653535065183, + "grad_norm": 2.077296257019043, + "learning_rate": 4.370175676807042e-05, + "loss": 4.1861, + "step": 38839 + }, + { + "epoch": 0.23099248263393282, + "grad_norm": 2.2528553009033203, + "learning_rate": 4.3701446789098523e-05, + "loss": 4.1747, + "step": 38840 + }, + { + "epoch": 0.2309984299172138, + "grad_norm": 1.9800342321395874, + "learning_rate": 4.370113680359814e-05, + "loss": 4.1947, + "step": 38841 + }, + { + "epoch": 0.23100437720049483, + "grad_norm": 1.7648961544036865, + "learning_rate": 4.370082681156939e-05, + "loss": 4.5052, + "step": 38842 + }, + { + "epoch": 0.23101032448377581, + "grad_norm": 1.8483437299728394, + "learning_rate": 4.3700516813012374e-05, + "loss": 4.157, + "step": 38843 + }, + { + "epoch": 0.2310162717670568, + "grad_norm": 2.423189163208008, + "learning_rate": 4.370020680792719e-05, + "loss": 3.8596, + "step": 38844 + }, + { + "epoch": 0.23102221905033782, + "grad_norm": 2.211770534515381, + "learning_rate": 4.369989679631397e-05, + "loss": 3.6305, + "step": 38845 + }, + { + "epoch": 0.2310281663336188, + "grad_norm": 1.9826966524124146, + "learning_rate": 4.36995867781728e-05, + "loss": 4.0012, + "step": 38846 + }, + { + "epoch": 0.2310341136168998, + "grad_norm": 1.9183090925216675, + "learning_rate": 4.3699276753503804e-05, + "loss": 4.1553, + "step": 38847 + }, + { + "epoch": 0.2310400609001808, + "grad_norm": 1.7809723615646362, + "learning_rate": 4.3698966722307085e-05, + "loss": 4.8814, + "step": 38848 + }, + { + "epoch": 0.2310460081834618, + "grad_norm": 1.6132829189300537, + "learning_rate": 4.369865668458274e-05, + "loss": 5.2379, + "step": 38849 + }, + { + "epoch": 0.2310519554667428, + "grad_norm": 1.4619427919387817, + "learning_rate": 4.36983466403309e-05, + "loss": 5.0469, + "step": 38850 + }, + { + "epoch": 0.2310579027500238, + "grad_norm": 1.6037229299545288, + "learning_rate": 4.369803658955165e-05, + "loss": 4.6583, + "step": 38851 + }, + { + "epoch": 0.2310638500333048, + "grad_norm": 1.3536498546600342, + "learning_rate": 4.369772653224512e-05, + "loss": 5.2745, + "step": 38852 + }, + { + "epoch": 0.23106979731658578, + "grad_norm": 1.6615324020385742, + "learning_rate": 4.36974164684114e-05, + "loss": 4.6271, + "step": 38853 + }, + { + "epoch": 0.2310757445998668, + "grad_norm": 1.6488821506500244, + "learning_rate": 4.36971063980506e-05, + "loss": 4.7926, + "step": 38854 + }, + { + "epoch": 0.23108169188314778, + "grad_norm": 1.3780089616775513, + "learning_rate": 4.3696796321162836e-05, + "loss": 4.5641, + "step": 38855 + }, + { + "epoch": 0.23108763916642877, + "grad_norm": 1.5264968872070312, + "learning_rate": 4.3696486237748215e-05, + "loss": 4.7264, + "step": 38856 + }, + { + "epoch": 0.23109358644970976, + "grad_norm": 1.828169822692871, + "learning_rate": 4.369617614780685e-05, + "loss": 4.441, + "step": 38857 + }, + { + "epoch": 0.23109953373299078, + "grad_norm": 1.3571844100952148, + "learning_rate": 4.369586605133883e-05, + "loss": 4.861, + "step": 38858 + }, + { + "epoch": 0.23110548101627176, + "grad_norm": 1.5678229331970215, + "learning_rate": 4.369555594834429e-05, + "loss": 4.6285, + "step": 38859 + }, + { + "epoch": 0.23111142829955275, + "grad_norm": 1.6185591220855713, + "learning_rate": 4.369524583882332e-05, + "loss": 4.7623, + "step": 38860 + }, + { + "epoch": 0.23111737558283377, + "grad_norm": 1.7087242603302002, + "learning_rate": 4.369493572277603e-05, + "loss": 4.7057, + "step": 38861 + }, + { + "epoch": 0.23112332286611476, + "grad_norm": 1.356367588043213, + "learning_rate": 4.3694625600202534e-05, + "loss": 4.8264, + "step": 38862 + }, + { + "epoch": 0.23112927014939574, + "grad_norm": 1.3614306449890137, + "learning_rate": 4.3694315471102934e-05, + "loss": 4.5589, + "step": 38863 + }, + { + "epoch": 0.23113521743267676, + "grad_norm": 2.0256147384643555, + "learning_rate": 4.369400533547734e-05, + "loss": 4.1103, + "step": 38864 + }, + { + "epoch": 0.23114116471595775, + "grad_norm": 1.8039603233337402, + "learning_rate": 4.369369519332586e-05, + "loss": 4.522, + "step": 38865 + }, + { + "epoch": 0.23114711199923874, + "grad_norm": 1.923120141029358, + "learning_rate": 4.3693385044648614e-05, + "loss": 4.6754, + "step": 38866 + }, + { + "epoch": 0.23115305928251975, + "grad_norm": 1.618260145187378, + "learning_rate": 4.3693074889445695e-05, + "loss": 4.7447, + "step": 38867 + }, + { + "epoch": 0.23115900656580074, + "grad_norm": 1.4669636487960815, + "learning_rate": 4.3692764727717214e-05, + "loss": 5.0623, + "step": 38868 + }, + { + "epoch": 0.23116495384908173, + "grad_norm": 1.5794733762741089, + "learning_rate": 4.3692454559463286e-05, + "loss": 4.8538, + "step": 38869 + }, + { + "epoch": 0.23117090113236274, + "grad_norm": 1.8218353986740112, + "learning_rate": 4.369214438468402e-05, + "loss": 4.4285, + "step": 38870 + }, + { + "epoch": 0.23117684841564373, + "grad_norm": 1.5657826662063599, + "learning_rate": 4.369183420337951e-05, + "loss": 4.7327, + "step": 38871 + }, + { + "epoch": 0.23118279569892472, + "grad_norm": 1.5812371969223022, + "learning_rate": 4.369152401554988e-05, + "loss": 4.7874, + "step": 38872 + }, + { + "epoch": 0.23118874298220574, + "grad_norm": 1.9417638778686523, + "learning_rate": 4.369121382119523e-05, + "loss": 4.9785, + "step": 38873 + }, + { + "epoch": 0.23119469026548672, + "grad_norm": 1.6311239004135132, + "learning_rate": 4.369090362031567e-05, + "loss": 4.5811, + "step": 38874 + }, + { + "epoch": 0.2312006375487677, + "grad_norm": 1.3812321424484253, + "learning_rate": 4.369059341291131e-05, + "loss": 5.1751, + "step": 38875 + }, + { + "epoch": 0.23120658483204873, + "grad_norm": 2.401395082473755, + "learning_rate": 4.3690283198982253e-05, + "loss": 4.0534, + "step": 38876 + }, + { + "epoch": 0.23121253211532972, + "grad_norm": 1.577271819114685, + "learning_rate": 4.368997297852861e-05, + "loss": 5.0016, + "step": 38877 + }, + { + "epoch": 0.2312184793986107, + "grad_norm": 1.571954607963562, + "learning_rate": 4.36896627515505e-05, + "loss": 5.1936, + "step": 38878 + }, + { + "epoch": 0.23122442668189172, + "grad_norm": 1.5858561992645264, + "learning_rate": 4.368935251804801e-05, + "loss": 4.3015, + "step": 38879 + }, + { + "epoch": 0.2312303739651727, + "grad_norm": 1.5386252403259277, + "learning_rate": 4.368904227802127e-05, + "loss": 4.541, + "step": 38880 + }, + { + "epoch": 0.2312363212484537, + "grad_norm": 1.5563592910766602, + "learning_rate": 4.368873203147037e-05, + "loss": 4.676, + "step": 38881 + }, + { + "epoch": 0.2312422685317347, + "grad_norm": 1.441646933555603, + "learning_rate": 4.368842177839544e-05, + "loss": 4.9754, + "step": 38882 + }, + { + "epoch": 0.2312482158150157, + "grad_norm": 1.9202433824539185, + "learning_rate": 4.3688111518796556e-05, + "loss": 4.3441, + "step": 38883 + }, + { + "epoch": 0.2312541630982967, + "grad_norm": 1.5717604160308838, + "learning_rate": 4.368780125267387e-05, + "loss": 4.4186, + "step": 38884 + }, + { + "epoch": 0.2312601103815777, + "grad_norm": 1.633315920829773, + "learning_rate": 4.3687490980027444e-05, + "loss": 4.6497, + "step": 38885 + }, + { + "epoch": 0.2312660576648587, + "grad_norm": 1.545074462890625, + "learning_rate": 4.368718070085741e-05, + "loss": 4.8076, + "step": 38886 + }, + { + "epoch": 0.23127200494813968, + "grad_norm": 2.005859851837158, + "learning_rate": 4.368687041516388e-05, + "loss": 4.2126, + "step": 38887 + }, + { + "epoch": 0.2312779522314207, + "grad_norm": 1.443214774131775, + "learning_rate": 4.368656012294696e-05, + "loss": 4.8105, + "step": 38888 + }, + { + "epoch": 0.23128389951470169, + "grad_norm": 1.7497129440307617, + "learning_rate": 4.368624982420675e-05, + "loss": 4.5052, + "step": 38889 + }, + { + "epoch": 0.23128984679798267, + "grad_norm": 1.630719780921936, + "learning_rate": 4.368593951894336e-05, + "loss": 4.8466, + "step": 38890 + }, + { + "epoch": 0.2312957940812637, + "grad_norm": 1.871222734451294, + "learning_rate": 4.368562920715692e-05, + "loss": 4.8048, + "step": 38891 + }, + { + "epoch": 0.23130174136454468, + "grad_norm": 1.5791672468185425, + "learning_rate": 4.36853188888475e-05, + "loss": 4.7656, + "step": 38892 + }, + { + "epoch": 0.23130768864782567, + "grad_norm": 1.7593334913253784, + "learning_rate": 4.368500856401523e-05, + "loss": 4.2775, + "step": 38893 + }, + { + "epoch": 0.23131363593110668, + "grad_norm": 1.6288632154464722, + "learning_rate": 4.3684698232660225e-05, + "loss": 5.005, + "step": 38894 + }, + { + "epoch": 0.23131958321438767, + "grad_norm": 1.7398391962051392, + "learning_rate": 4.368438789478258e-05, + "loss": 4.4744, + "step": 38895 + }, + { + "epoch": 0.23132553049766866, + "grad_norm": 1.9362190961837769, + "learning_rate": 4.3684077550382407e-05, + "loss": 4.3996, + "step": 38896 + }, + { + "epoch": 0.23133147778094967, + "grad_norm": 1.8506320714950562, + "learning_rate": 4.3683767199459826e-05, + "loss": 5.2249, + "step": 38897 + }, + { + "epoch": 0.23133742506423066, + "grad_norm": 1.5073530673980713, + "learning_rate": 4.3683456842014916e-05, + "loss": 5.1084, + "step": 38898 + }, + { + "epoch": 0.23134337234751165, + "grad_norm": 1.642914056777954, + "learning_rate": 4.368314647804782e-05, + "loss": 5.0734, + "step": 38899 + }, + { + "epoch": 0.23134931963079267, + "grad_norm": 1.613958477973938, + "learning_rate": 4.368283610755862e-05, + "loss": 4.9509, + "step": 38900 + }, + { + "epoch": 0.23135526691407365, + "grad_norm": 1.6028896570205688, + "learning_rate": 4.368252573054744e-05, + "loss": 4.7896, + "step": 38901 + }, + { + "epoch": 0.23136121419735464, + "grad_norm": 1.6952170133590698, + "learning_rate": 4.368221534701439e-05, + "loss": 4.7555, + "step": 38902 + }, + { + "epoch": 0.23136716148063566, + "grad_norm": 1.5203158855438232, + "learning_rate": 4.3681904956959565e-05, + "loss": 4.7926, + "step": 38903 + }, + { + "epoch": 0.23137310876391665, + "grad_norm": 1.4692882299423218, + "learning_rate": 4.3681594560383075e-05, + "loss": 4.7705, + "step": 38904 + }, + { + "epoch": 0.23137905604719763, + "grad_norm": 1.6336463689804077, + "learning_rate": 4.368128415728504e-05, + "loss": 4.8079, + "step": 38905 + }, + { + "epoch": 0.23138500333047865, + "grad_norm": 1.5728296041488647, + "learning_rate": 4.368097374766556e-05, + "loss": 4.8961, + "step": 38906 + }, + { + "epoch": 0.23139095061375964, + "grad_norm": 1.7239409685134888, + "learning_rate": 4.368066333152474e-05, + "loss": 4.4861, + "step": 38907 + }, + { + "epoch": 0.23139689789704063, + "grad_norm": 1.3485053777694702, + "learning_rate": 4.3680352908862705e-05, + "loss": 4.8358, + "step": 38908 + }, + { + "epoch": 0.23140284518032164, + "grad_norm": 1.6450964212417603, + "learning_rate": 4.3680042479679546e-05, + "loss": 4.623, + "step": 38909 + }, + { + "epoch": 0.23140879246360263, + "grad_norm": 1.6974247694015503, + "learning_rate": 4.367973204397537e-05, + "loss": 4.3432, + "step": 38910 + }, + { + "epoch": 0.23141473974688362, + "grad_norm": 1.710485577583313, + "learning_rate": 4.36794216017503e-05, + "loss": 4.4643, + "step": 38911 + }, + { + "epoch": 0.23142068703016463, + "grad_norm": 1.679517388343811, + "learning_rate": 4.367911115300444e-05, + "loss": 4.6119, + "step": 38912 + }, + { + "epoch": 0.23142663431344562, + "grad_norm": 2.93080997467041, + "learning_rate": 4.367880069773789e-05, + "loss": 3.169, + "step": 38913 + }, + { + "epoch": 0.2314325815967266, + "grad_norm": 2.643122911453247, + "learning_rate": 4.367849023595076e-05, + "loss": 2.9466, + "step": 38914 + }, + { + "epoch": 0.2314385288800076, + "grad_norm": 2.2763819694519043, + "learning_rate": 4.367817976764317e-05, + "loss": 3.5133, + "step": 38915 + }, + { + "epoch": 0.23144447616328861, + "grad_norm": 2.1890201568603516, + "learning_rate": 4.367786929281522e-05, + "loss": 4.2592, + "step": 38916 + }, + { + "epoch": 0.2314504234465696, + "grad_norm": 1.3603123426437378, + "learning_rate": 4.367755881146701e-05, + "loss": 4.8718, + "step": 38917 + }, + { + "epoch": 0.2314563707298506, + "grad_norm": 1.6598271131515503, + "learning_rate": 4.367724832359867e-05, + "loss": 4.5127, + "step": 38918 + }, + { + "epoch": 0.2314623180131316, + "grad_norm": 1.445361614227295, + "learning_rate": 4.367693782921029e-05, + "loss": 4.9478, + "step": 38919 + }, + { + "epoch": 0.2314682652964126, + "grad_norm": 1.606594443321228, + "learning_rate": 4.3676627328301976e-05, + "loss": 5.0595, + "step": 38920 + }, + { + "epoch": 0.23147421257969358, + "grad_norm": 1.5703539848327637, + "learning_rate": 4.367631682087385e-05, + "loss": 5.0988, + "step": 38921 + }, + { + "epoch": 0.2314801598629746, + "grad_norm": 1.5487500429153442, + "learning_rate": 4.3676006306926024e-05, + "loss": 4.929, + "step": 38922 + }, + { + "epoch": 0.2314861071462556, + "grad_norm": 1.4153128862380981, + "learning_rate": 4.3675695786458584e-05, + "loss": 4.9945, + "step": 38923 + }, + { + "epoch": 0.23149205442953658, + "grad_norm": 1.2444169521331787, + "learning_rate": 4.3675385259471655e-05, + "loss": 4.9692, + "step": 38924 + }, + { + "epoch": 0.2314980017128176, + "grad_norm": 1.5902601480484009, + "learning_rate": 4.367507472596535e-05, + "loss": 5.0088, + "step": 38925 + }, + { + "epoch": 0.23150394899609858, + "grad_norm": 1.5130057334899902, + "learning_rate": 4.3674764185939763e-05, + "loss": 5.0361, + "step": 38926 + }, + { + "epoch": 0.23150989627937957, + "grad_norm": 1.6979068517684937, + "learning_rate": 4.3674453639395005e-05, + "loss": 4.8099, + "step": 38927 + }, + { + "epoch": 0.23151584356266058, + "grad_norm": 1.6237205266952515, + "learning_rate": 4.36741430863312e-05, + "loss": 4.8471, + "step": 38928 + }, + { + "epoch": 0.23152179084594157, + "grad_norm": 1.7361104488372803, + "learning_rate": 4.3673832526748434e-05, + "loss": 4.4522, + "step": 38929 + }, + { + "epoch": 0.23152773812922256, + "grad_norm": 1.6142919063568115, + "learning_rate": 4.3673521960646824e-05, + "loss": 4.8079, + "step": 38930 + }, + { + "epoch": 0.23153368541250358, + "grad_norm": 1.5747629404067993, + "learning_rate": 4.367321138802649e-05, + "loss": 4.9752, + "step": 38931 + }, + { + "epoch": 0.23153963269578456, + "grad_norm": 1.5985512733459473, + "learning_rate": 4.3672900808887516e-05, + "loss": 4.9448, + "step": 38932 + }, + { + "epoch": 0.23154557997906555, + "grad_norm": 1.5115282535552979, + "learning_rate": 4.3672590223230036e-05, + "loss": 4.7868, + "step": 38933 + }, + { + "epoch": 0.23155152726234657, + "grad_norm": 1.294195532798767, + "learning_rate": 4.367227963105415e-05, + "loss": 4.9308, + "step": 38934 + }, + { + "epoch": 0.23155747454562756, + "grad_norm": 1.7012540102005005, + "learning_rate": 4.367196903235996e-05, + "loss": 4.4029, + "step": 38935 + }, + { + "epoch": 0.23156342182890854, + "grad_norm": 1.611894130706787, + "learning_rate": 4.3671658427147584e-05, + "loss": 5.2145, + "step": 38936 + }, + { + "epoch": 0.23156936911218956, + "grad_norm": 1.9200711250305176, + "learning_rate": 4.3671347815417116e-05, + "loss": 4.2015, + "step": 38937 + }, + { + "epoch": 0.23157531639547055, + "grad_norm": 1.3902099132537842, + "learning_rate": 4.367103719716868e-05, + "loss": 4.6965, + "step": 38938 + }, + { + "epoch": 0.23158126367875154, + "grad_norm": 1.5783464908599854, + "learning_rate": 4.3670726572402375e-05, + "loss": 4.7771, + "step": 38939 + }, + { + "epoch": 0.23158721096203255, + "grad_norm": 1.519589900970459, + "learning_rate": 4.367041594111831e-05, + "loss": 4.434, + "step": 38940 + }, + { + "epoch": 0.23159315824531354, + "grad_norm": 1.6310521364212036, + "learning_rate": 4.36701053033166e-05, + "loss": 4.4936, + "step": 38941 + }, + { + "epoch": 0.23159910552859453, + "grad_norm": 1.7462193965911865, + "learning_rate": 4.366979465899734e-05, + "loss": 4.6498, + "step": 38942 + }, + { + "epoch": 0.23160505281187554, + "grad_norm": 1.9061944484710693, + "learning_rate": 4.366948400816066e-05, + "loss": 4.3522, + "step": 38943 + }, + { + "epoch": 0.23161100009515653, + "grad_norm": 1.6201283931732178, + "learning_rate": 4.3669173350806655e-05, + "loss": 4.2227, + "step": 38944 + }, + { + "epoch": 0.23161694737843752, + "grad_norm": 1.670607566833496, + "learning_rate": 4.366886268693543e-05, + "loss": 4.815, + "step": 38945 + }, + { + "epoch": 0.23162289466171854, + "grad_norm": 1.6773320436477661, + "learning_rate": 4.36685520165471e-05, + "loss": 4.6114, + "step": 38946 + }, + { + "epoch": 0.23162884194499953, + "grad_norm": 1.6075963973999023, + "learning_rate": 4.366824133964177e-05, + "loss": 4.8761, + "step": 38947 + }, + { + "epoch": 0.2316347892282805, + "grad_norm": 1.6519663333892822, + "learning_rate": 4.366793065621955e-05, + "loss": 4.9682, + "step": 38948 + }, + { + "epoch": 0.23164073651156153, + "grad_norm": 1.391345500946045, + "learning_rate": 4.366761996628054e-05, + "loss": 4.986, + "step": 38949 + }, + { + "epoch": 0.23164668379484252, + "grad_norm": 1.5131144523620605, + "learning_rate": 4.366730926982487e-05, + "loss": 4.9342, + "step": 38950 + }, + { + "epoch": 0.2316526310781235, + "grad_norm": 1.61806058883667, + "learning_rate": 4.366699856685263e-05, + "loss": 4.9785, + "step": 38951 + }, + { + "epoch": 0.23165857836140452, + "grad_norm": 1.5832924842834473, + "learning_rate": 4.366668785736393e-05, + "loss": 5.0118, + "step": 38952 + }, + { + "epoch": 0.2316645256446855, + "grad_norm": 1.4623991250991821, + "learning_rate": 4.3666377141358885e-05, + "loss": 4.9817, + "step": 38953 + }, + { + "epoch": 0.2316704729279665, + "grad_norm": 1.470866322517395, + "learning_rate": 4.3666066418837605e-05, + "loss": 4.9762, + "step": 38954 + }, + { + "epoch": 0.2316764202112475, + "grad_norm": 1.5453153848648071, + "learning_rate": 4.3665755689800195e-05, + "loss": 4.6325, + "step": 38955 + }, + { + "epoch": 0.2316823674945285, + "grad_norm": 1.753753900527954, + "learning_rate": 4.366544495424675e-05, + "loss": 4.62, + "step": 38956 + }, + { + "epoch": 0.2316883147778095, + "grad_norm": 1.5852446556091309, + "learning_rate": 4.36651342121774e-05, + "loss": 4.4564, + "step": 38957 + }, + { + "epoch": 0.2316942620610905, + "grad_norm": 1.5405995845794678, + "learning_rate": 4.3664823463592244e-05, + "loss": 4.4978, + "step": 38958 + }, + { + "epoch": 0.2317002093443715, + "grad_norm": 1.674048662185669, + "learning_rate": 4.366451270849139e-05, + "loss": 4.4858, + "step": 38959 + }, + { + "epoch": 0.23170615662765248, + "grad_norm": 1.6053109169006348, + "learning_rate": 4.366420194687495e-05, + "loss": 4.4447, + "step": 38960 + }, + { + "epoch": 0.2317121039109335, + "grad_norm": 1.3716999292373657, + "learning_rate": 4.366389117874302e-05, + "loss": 4.8299, + "step": 38961 + }, + { + "epoch": 0.23171805119421449, + "grad_norm": 1.6440811157226562, + "learning_rate": 4.366358040409573e-05, + "loss": 4.6378, + "step": 38962 + }, + { + "epoch": 0.23172399847749547, + "grad_norm": 1.8842734098434448, + "learning_rate": 4.366326962293317e-05, + "loss": 4.3705, + "step": 38963 + }, + { + "epoch": 0.2317299457607765, + "grad_norm": 1.5826550722122192, + "learning_rate": 4.3662958835255466e-05, + "loss": 4.4738, + "step": 38964 + }, + { + "epoch": 0.23173589304405748, + "grad_norm": 1.4358820915222168, + "learning_rate": 4.36626480410627e-05, + "loss": 4.9931, + "step": 38965 + }, + { + "epoch": 0.23174184032733847, + "grad_norm": 1.7191013097763062, + "learning_rate": 4.3662337240355e-05, + "loss": 4.9418, + "step": 38966 + }, + { + "epoch": 0.23174778761061948, + "grad_norm": 1.8441758155822754, + "learning_rate": 4.366202643313249e-05, + "loss": 3.6016, + "step": 38967 + }, + { + "epoch": 0.23175373489390047, + "grad_norm": 1.8906590938568115, + "learning_rate": 4.366171561939524e-05, + "loss": 4.0659, + "step": 38968 + }, + { + "epoch": 0.23175968217718146, + "grad_norm": 1.9503329992294312, + "learning_rate": 4.366140479914338e-05, + "loss": 4.4378, + "step": 38969 + }, + { + "epoch": 0.23176562946046247, + "grad_norm": 1.487377643585205, + "learning_rate": 4.366109397237702e-05, + "loss": 5.0203, + "step": 38970 + }, + { + "epoch": 0.23177157674374346, + "grad_norm": 1.49003005027771, + "learning_rate": 4.366078313909626e-05, + "loss": 5.0222, + "step": 38971 + }, + { + "epoch": 0.23177752402702445, + "grad_norm": 1.5293556451797485, + "learning_rate": 4.3660472299301216e-05, + "loss": 4.9737, + "step": 38972 + }, + { + "epoch": 0.23178347131030544, + "grad_norm": 1.6720876693725586, + "learning_rate": 4.3660161452992e-05, + "loss": 4.9417, + "step": 38973 + }, + { + "epoch": 0.23178941859358645, + "grad_norm": 1.5357182025909424, + "learning_rate": 4.3659850600168713e-05, + "loss": 4.95, + "step": 38974 + }, + { + "epoch": 0.23179536587686744, + "grad_norm": 1.4877994060516357, + "learning_rate": 4.365953974083146e-05, + "loss": 4.8377, + "step": 38975 + }, + { + "epoch": 0.23180131316014843, + "grad_norm": 1.477420687675476, + "learning_rate": 4.365922887498035e-05, + "loss": 4.8592, + "step": 38976 + }, + { + "epoch": 0.23180726044342945, + "grad_norm": 1.5136134624481201, + "learning_rate": 4.365891800261551e-05, + "loss": 5.1658, + "step": 38977 + }, + { + "epoch": 0.23181320772671044, + "grad_norm": 1.9418365955352783, + "learning_rate": 4.365860712373702e-05, + "loss": 4.8201, + "step": 38978 + }, + { + "epoch": 0.23181915500999142, + "grad_norm": 1.5137678384780884, + "learning_rate": 4.3658296238345006e-05, + "loss": 5.4408, + "step": 38979 + }, + { + "epoch": 0.23182510229327244, + "grad_norm": 1.629871129989624, + "learning_rate": 4.3657985346439586e-05, + "loss": 4.6527, + "step": 38980 + }, + { + "epoch": 0.23183104957655343, + "grad_norm": 1.397018313407898, + "learning_rate": 4.365767444802085e-05, + "loss": 4.8008, + "step": 38981 + }, + { + "epoch": 0.23183699685983442, + "grad_norm": 1.5167710781097412, + "learning_rate": 4.36573635430889e-05, + "loss": 4.8846, + "step": 38982 + }, + { + "epoch": 0.23184294414311543, + "grad_norm": 1.3749078512191772, + "learning_rate": 4.365705263164387e-05, + "loss": 4.9298, + "step": 38983 + }, + { + "epoch": 0.23184889142639642, + "grad_norm": 1.4250109195709229, + "learning_rate": 4.3656741713685855e-05, + "loss": 4.8517, + "step": 38984 + }, + { + "epoch": 0.2318548387096774, + "grad_norm": 1.4567049741744995, + "learning_rate": 4.365643078921496e-05, + "loss": 4.8569, + "step": 38985 + }, + { + "epoch": 0.23186078599295842, + "grad_norm": 1.3662344217300415, + "learning_rate": 4.36561198582313e-05, + "loss": 4.992, + "step": 38986 + }, + { + "epoch": 0.2318667332762394, + "grad_norm": 1.3737530708312988, + "learning_rate": 4.3655808920734976e-05, + "loss": 4.7912, + "step": 38987 + }, + { + "epoch": 0.2318726805595204, + "grad_norm": 1.371505618095398, + "learning_rate": 4.365549797672611e-05, + "loss": 4.7928, + "step": 38988 + }, + { + "epoch": 0.23187862784280142, + "grad_norm": 1.371051549911499, + "learning_rate": 4.36551870262048e-05, + "loss": 5.1194, + "step": 38989 + }, + { + "epoch": 0.2318845751260824, + "grad_norm": 1.409606695175171, + "learning_rate": 4.365487606917116e-05, + "loss": 5.6715, + "step": 38990 + }, + { + "epoch": 0.2318905224093634, + "grad_norm": 1.4108288288116455, + "learning_rate": 4.365456510562529e-05, + "loss": 5.7418, + "step": 38991 + }, + { + "epoch": 0.2318964696926444, + "grad_norm": 1.3956706523895264, + "learning_rate": 4.36542541355673e-05, + "loss": 5.3216, + "step": 38992 + }, + { + "epoch": 0.2319024169759254, + "grad_norm": 1.6888619661331177, + "learning_rate": 4.365394315899731e-05, + "loss": 4.2571, + "step": 38993 + }, + { + "epoch": 0.23190836425920638, + "grad_norm": 1.6837373971939087, + "learning_rate": 4.365363217591542e-05, + "loss": 4.5193, + "step": 38994 + }, + { + "epoch": 0.2319143115424874, + "grad_norm": 1.6545789241790771, + "learning_rate": 4.365332118632174e-05, + "loss": 4.265, + "step": 38995 + }, + { + "epoch": 0.2319202588257684, + "grad_norm": 1.6533746719360352, + "learning_rate": 4.365301019021638e-05, + "loss": 4.1409, + "step": 38996 + }, + { + "epoch": 0.23192620610904938, + "grad_norm": 1.5293077230453491, + "learning_rate": 4.365269918759944e-05, + "loss": 4.2099, + "step": 38997 + }, + { + "epoch": 0.2319321533923304, + "grad_norm": 1.5665626525878906, + "learning_rate": 4.365238817847104e-05, + "loss": 5.0483, + "step": 38998 + }, + { + "epoch": 0.23193810067561138, + "grad_norm": 1.9797513484954834, + "learning_rate": 4.365207716283128e-05, + "loss": 4.2643, + "step": 38999 + }, + { + "epoch": 0.23194404795889237, + "grad_norm": 1.7830750942230225, + "learning_rate": 4.365176614068028e-05, + "loss": 4.0168, + "step": 39000 + }, + { + "epoch": 0.23194999524217338, + "grad_norm": 1.7557772397994995, + "learning_rate": 4.365145511201813e-05, + "loss": 4.0527, + "step": 39001 + }, + { + "epoch": 0.23195594252545437, + "grad_norm": 1.6213778257369995, + "learning_rate": 4.3651144076844963e-05, + "loss": 4.2034, + "step": 39002 + }, + { + "epoch": 0.23196188980873536, + "grad_norm": 1.683821439743042, + "learning_rate": 4.365083303516087e-05, + "loss": 3.9285, + "step": 39003 + }, + { + "epoch": 0.23196783709201638, + "grad_norm": 1.7961101531982422, + "learning_rate": 4.3650521986965964e-05, + "loss": 4.4137, + "step": 39004 + }, + { + "epoch": 0.23197378437529736, + "grad_norm": 1.6424694061279297, + "learning_rate": 4.365021093226035e-05, + "loss": 3.97, + "step": 39005 + }, + { + "epoch": 0.23197973165857835, + "grad_norm": 1.7446128129959106, + "learning_rate": 4.3649899871044143e-05, + "loss": 4.1543, + "step": 39006 + }, + { + "epoch": 0.23198567894185937, + "grad_norm": 1.7776148319244385, + "learning_rate": 4.3649588803317445e-05, + "loss": 4.4003, + "step": 39007 + }, + { + "epoch": 0.23199162622514036, + "grad_norm": 1.6425302028656006, + "learning_rate": 4.3649277729080376e-05, + "loss": 4.2777, + "step": 39008 + }, + { + "epoch": 0.23199757350842135, + "grad_norm": 1.7701246738433838, + "learning_rate": 4.364896664833302e-05, + "loss": 3.98, + "step": 39009 + }, + { + "epoch": 0.23200352079170236, + "grad_norm": 1.7276018857955933, + "learning_rate": 4.364865556107552e-05, + "loss": 3.9601, + "step": 39010 + }, + { + "epoch": 0.23200946807498335, + "grad_norm": 1.7356696128845215, + "learning_rate": 4.364834446730796e-05, + "loss": 4.1327, + "step": 39011 + }, + { + "epoch": 0.23201541535826434, + "grad_norm": 1.790169358253479, + "learning_rate": 4.364803336703046e-05, + "loss": 4.2021, + "step": 39012 + }, + { + "epoch": 0.23202136264154535, + "grad_norm": 2.118116617202759, + "learning_rate": 4.364772226024312e-05, + "loss": 3.3244, + "step": 39013 + }, + { + "epoch": 0.23202730992482634, + "grad_norm": 1.806924819946289, + "learning_rate": 4.364741114694605e-05, + "loss": 3.9069, + "step": 39014 + }, + { + "epoch": 0.23203325720810733, + "grad_norm": 1.7346820831298828, + "learning_rate": 4.364710002713937e-05, + "loss": 3.8036, + "step": 39015 + }, + { + "epoch": 0.23203920449138835, + "grad_norm": 1.5005122423171997, + "learning_rate": 4.364678890082317e-05, + "loss": 4.5573, + "step": 39016 + }, + { + "epoch": 0.23204515177466933, + "grad_norm": 1.723801612854004, + "learning_rate": 4.364647776799757e-05, + "loss": 4.2554, + "step": 39017 + }, + { + "epoch": 0.23205109905795032, + "grad_norm": 1.719353199005127, + "learning_rate": 4.3646166628662686e-05, + "loss": 4.1124, + "step": 39018 + }, + { + "epoch": 0.23205704634123134, + "grad_norm": 1.8241521120071411, + "learning_rate": 4.364585548281861e-05, + "loss": 3.8887, + "step": 39019 + }, + { + "epoch": 0.23206299362451233, + "grad_norm": 1.7841458320617676, + "learning_rate": 4.364554433046546e-05, + "loss": 4.0111, + "step": 39020 + }, + { + "epoch": 0.23206894090779331, + "grad_norm": 1.7524330615997314, + "learning_rate": 4.364523317160335e-05, + "loss": 4.2447, + "step": 39021 + }, + { + "epoch": 0.23207488819107433, + "grad_norm": 1.8006513118743896, + "learning_rate": 4.3644922006232366e-05, + "loss": 3.9111, + "step": 39022 + }, + { + "epoch": 0.23208083547435532, + "grad_norm": 1.8408151865005493, + "learning_rate": 4.3644610834352654e-05, + "loss": 3.7837, + "step": 39023 + }, + { + "epoch": 0.2320867827576363, + "grad_norm": 1.7600802183151245, + "learning_rate": 4.3644299655964285e-05, + "loss": 3.8076, + "step": 39024 + }, + { + "epoch": 0.23209273004091732, + "grad_norm": 1.5894376039505005, + "learning_rate": 4.364398847106739e-05, + "loss": 3.9888, + "step": 39025 + }, + { + "epoch": 0.2320986773241983, + "grad_norm": 1.9288008213043213, + "learning_rate": 4.3643677279662063e-05, + "loss": 3.7298, + "step": 39026 + }, + { + "epoch": 0.2321046246074793, + "grad_norm": 1.9717549085617065, + "learning_rate": 4.364336608174843e-05, + "loss": 3.7533, + "step": 39027 + }, + { + "epoch": 0.23211057189076031, + "grad_norm": 1.8057029247283936, + "learning_rate": 4.364305487732659e-05, + "loss": 3.6243, + "step": 39028 + }, + { + "epoch": 0.2321165191740413, + "grad_norm": 1.6260126829147339, + "learning_rate": 4.3642743666396645e-05, + "loss": 4.2258, + "step": 39029 + }, + { + "epoch": 0.2321224664573223, + "grad_norm": 1.8007012605667114, + "learning_rate": 4.3642432448958716e-05, + "loss": 3.975, + "step": 39030 + }, + { + "epoch": 0.23212841374060328, + "grad_norm": 1.9593114852905273, + "learning_rate": 4.364212122501291e-05, + "loss": 3.7982, + "step": 39031 + }, + { + "epoch": 0.2321343610238843, + "grad_norm": 1.8035818338394165, + "learning_rate": 4.3641809994559325e-05, + "loss": 3.8347, + "step": 39032 + }, + { + "epoch": 0.23214030830716528, + "grad_norm": 1.7887778282165527, + "learning_rate": 4.364149875759808e-05, + "loss": 4.0232, + "step": 39033 + }, + { + "epoch": 0.23214625559044627, + "grad_norm": 2.112762451171875, + "learning_rate": 4.3641187514129276e-05, + "loss": 3.4156, + "step": 39034 + }, + { + "epoch": 0.2321522028737273, + "grad_norm": 1.6227257251739502, + "learning_rate": 4.364087626415304e-05, + "loss": 3.6872, + "step": 39035 + }, + { + "epoch": 0.23215815015700828, + "grad_norm": 1.7840327024459839, + "learning_rate": 4.364056500766945e-05, + "loss": 3.9829, + "step": 39036 + }, + { + "epoch": 0.23216409744028926, + "grad_norm": 1.7417759895324707, + "learning_rate": 4.364025374467864e-05, + "loss": 3.9742, + "step": 39037 + }, + { + "epoch": 0.23217004472357028, + "grad_norm": 1.6508119106292725, + "learning_rate": 4.36399424751807e-05, + "loss": 3.9325, + "step": 39038 + }, + { + "epoch": 0.23217599200685127, + "grad_norm": 1.8196989297866821, + "learning_rate": 4.3639631199175765e-05, + "loss": 4.0765, + "step": 39039 + }, + { + "epoch": 0.23218193929013226, + "grad_norm": 1.6848537921905518, + "learning_rate": 4.363931991666392e-05, + "loss": 3.926, + "step": 39040 + }, + { + "epoch": 0.23218788657341327, + "grad_norm": 1.689846158027649, + "learning_rate": 4.3639008627645283e-05, + "loss": 4.0929, + "step": 39041 + }, + { + "epoch": 0.23219383385669426, + "grad_norm": 1.9357181787490845, + "learning_rate": 4.3638697332119956e-05, + "loss": 3.4246, + "step": 39042 + }, + { + "epoch": 0.23219978113997525, + "grad_norm": 1.8697538375854492, + "learning_rate": 4.3638386030088054e-05, + "loss": 3.7608, + "step": 39043 + }, + { + "epoch": 0.23220572842325626, + "grad_norm": 2.1151657104492188, + "learning_rate": 4.3638074721549685e-05, + "loss": 3.2884, + "step": 39044 + }, + { + "epoch": 0.23221167570653725, + "grad_norm": 1.979285717010498, + "learning_rate": 4.363776340650495e-05, + "loss": 3.4265, + "step": 39045 + }, + { + "epoch": 0.23221762298981824, + "grad_norm": 1.954432487487793, + "learning_rate": 4.3637452084953975e-05, + "loss": 3.5171, + "step": 39046 + }, + { + "epoch": 0.23222357027309926, + "grad_norm": 2.206760883331299, + "learning_rate": 4.3637140756896856e-05, + "loss": 3.2805, + "step": 39047 + }, + { + "epoch": 0.23222951755638024, + "grad_norm": 1.9962438344955444, + "learning_rate": 4.363682942233369e-05, + "loss": 3.5797, + "step": 39048 + }, + { + "epoch": 0.23223546483966123, + "grad_norm": 1.9898980855941772, + "learning_rate": 4.3636518081264616e-05, + "loss": 3.1749, + "step": 39049 + }, + { + "epoch": 0.23224141212294225, + "grad_norm": 2.0162951946258545, + "learning_rate": 4.3636206733689724e-05, + "loss": 2.8138, + "step": 39050 + }, + { + "epoch": 0.23224735940622324, + "grad_norm": 2.061389923095703, + "learning_rate": 4.363589537960912e-05, + "loss": 2.8714, + "step": 39051 + }, + { + "epoch": 0.23225330668950422, + "grad_norm": 1.993212342262268, + "learning_rate": 4.363558401902292e-05, + "loss": 2.9435, + "step": 39052 + }, + { + "epoch": 0.23225925397278524, + "grad_norm": 2.071394443511963, + "learning_rate": 4.363527265193122e-05, + "loss": 2.893, + "step": 39053 + }, + { + "epoch": 0.23226520125606623, + "grad_norm": 2.181269407272339, + "learning_rate": 4.363496127833415e-05, + "loss": 2.8135, + "step": 39054 + }, + { + "epoch": 0.23227114853934722, + "grad_norm": 2.2280220985412598, + "learning_rate": 4.3634649898231804e-05, + "loss": 3.0349, + "step": 39055 + }, + { + "epoch": 0.23227709582262823, + "grad_norm": 2.2977817058563232, + "learning_rate": 4.36343385116243e-05, + "loss": 3.0387, + "step": 39056 + }, + { + "epoch": 0.23228304310590922, + "grad_norm": 2.1697254180908203, + "learning_rate": 4.363402711851173e-05, + "loss": 3.1483, + "step": 39057 + }, + { + "epoch": 0.2322889903891902, + "grad_norm": 2.1386520862579346, + "learning_rate": 4.3633715718894226e-05, + "loss": 3.0296, + "step": 39058 + }, + { + "epoch": 0.23229493767247122, + "grad_norm": 2.0731868743896484, + "learning_rate": 4.3633404312771875e-05, + "loss": 3.0588, + "step": 39059 + }, + { + "epoch": 0.2323008849557522, + "grad_norm": 1.5297818183898926, + "learning_rate": 4.36330929001448e-05, + "loss": 4.8685, + "step": 39060 + }, + { + "epoch": 0.2323068322390332, + "grad_norm": 2.4762682914733887, + "learning_rate": 4.3632781481013105e-05, + "loss": 3.4948, + "step": 39061 + }, + { + "epoch": 0.23231277952231422, + "grad_norm": 2.357487201690674, + "learning_rate": 4.36324700553769e-05, + "loss": 3.7714, + "step": 39062 + }, + { + "epoch": 0.2323187268055952, + "grad_norm": 1.713942527770996, + "learning_rate": 4.363215862323628e-05, + "loss": 4.0288, + "step": 39063 + }, + { + "epoch": 0.2323246740888762, + "grad_norm": 2.204071283340454, + "learning_rate": 4.3631847184591376e-05, + "loss": 3.2584, + "step": 39064 + }, + { + "epoch": 0.2323306213721572, + "grad_norm": 1.647165060043335, + "learning_rate": 4.363153573944229e-05, + "loss": 4.762, + "step": 39065 + }, + { + "epoch": 0.2323365686554382, + "grad_norm": 2.2899770736694336, + "learning_rate": 4.3631224287789116e-05, + "loss": 4.5968, + "step": 39066 + }, + { + "epoch": 0.23234251593871919, + "grad_norm": 2.3352129459381104, + "learning_rate": 4.3630912829631986e-05, + "loss": 4.6158, + "step": 39067 + }, + { + "epoch": 0.2323484632220002, + "grad_norm": 2.2160227298736572, + "learning_rate": 4.363060136497099e-05, + "loss": 4.5646, + "step": 39068 + }, + { + "epoch": 0.2323544105052812, + "grad_norm": 1.5986429452896118, + "learning_rate": 4.363028989380625e-05, + "loss": 5.1399, + "step": 39069 + }, + { + "epoch": 0.23236035778856218, + "grad_norm": 1.4894126653671265, + "learning_rate": 4.362997841613786e-05, + "loss": 4.8201, + "step": 39070 + }, + { + "epoch": 0.2323663050718432, + "grad_norm": 2.388699531555176, + "learning_rate": 4.362966693196594e-05, + "loss": 4.1083, + "step": 39071 + }, + { + "epoch": 0.23237225235512418, + "grad_norm": 2.0085203647613525, + "learning_rate": 4.3629355441290596e-05, + "loss": 4.317, + "step": 39072 + }, + { + "epoch": 0.23237819963840517, + "grad_norm": 2.012711763381958, + "learning_rate": 4.362904394411194e-05, + "loss": 4.4285, + "step": 39073 + }, + { + "epoch": 0.23238414692168619, + "grad_norm": 2.2238574028015137, + "learning_rate": 4.362873244043007e-05, + "loss": 4.5044, + "step": 39074 + }, + { + "epoch": 0.23239009420496717, + "grad_norm": 2.2293858528137207, + "learning_rate": 4.3628420930245103e-05, + "loss": 4.5223, + "step": 39075 + }, + { + "epoch": 0.23239604148824816, + "grad_norm": 1.8992894887924194, + "learning_rate": 4.362810941355715e-05, + "loss": 4.9021, + "step": 39076 + }, + { + "epoch": 0.23240198877152918, + "grad_norm": 1.51563560962677, + "learning_rate": 4.362779789036632e-05, + "loss": 4.5958, + "step": 39077 + }, + { + "epoch": 0.23240793605481017, + "grad_norm": 2.2260918617248535, + "learning_rate": 4.362748636067272e-05, + "loss": 4.3224, + "step": 39078 + }, + { + "epoch": 0.23241388333809115, + "grad_norm": 1.7403556108474731, + "learning_rate": 4.362717482447645e-05, + "loss": 4.2088, + "step": 39079 + }, + { + "epoch": 0.23241983062137217, + "grad_norm": 1.914040207862854, + "learning_rate": 4.362686328177762e-05, + "loss": 4.3207, + "step": 39080 + }, + { + "epoch": 0.23242577790465316, + "grad_norm": 1.7242257595062256, + "learning_rate": 4.3626551732576346e-05, + "loss": 4.5082, + "step": 39081 + }, + { + "epoch": 0.23243172518793415, + "grad_norm": 1.698724627494812, + "learning_rate": 4.3626240176872746e-05, + "loss": 4.6776, + "step": 39082 + }, + { + "epoch": 0.23243767247121516, + "grad_norm": 1.7210109233856201, + "learning_rate": 4.362592861466691e-05, + "loss": 4.3961, + "step": 39083 + }, + { + "epoch": 0.23244361975449615, + "grad_norm": 2.1987760066986084, + "learning_rate": 4.362561704595896e-05, + "loss": 4.1219, + "step": 39084 + }, + { + "epoch": 0.23244956703777714, + "grad_norm": 1.7698177099227905, + "learning_rate": 4.3625305470749e-05, + "loss": 4.5081, + "step": 39085 + }, + { + "epoch": 0.23245551432105815, + "grad_norm": 1.9433329105377197, + "learning_rate": 4.362499388903713e-05, + "loss": 4.6824, + "step": 39086 + }, + { + "epoch": 0.23246146160433914, + "grad_norm": 1.6914910078048706, + "learning_rate": 4.3624682300823473e-05, + "loss": 4.572, + "step": 39087 + }, + { + "epoch": 0.23246740888762013, + "grad_norm": 1.7068865299224854, + "learning_rate": 4.362437070610813e-05, + "loss": 4.4017, + "step": 39088 + }, + { + "epoch": 0.23247335617090112, + "grad_norm": 1.7159522771835327, + "learning_rate": 4.3624059104891216e-05, + "loss": 4.6159, + "step": 39089 + }, + { + "epoch": 0.23247930345418213, + "grad_norm": 1.7849717140197754, + "learning_rate": 4.362374749717283e-05, + "loss": 4.8789, + "step": 39090 + }, + { + "epoch": 0.23248525073746312, + "grad_norm": 2.447394847869873, + "learning_rate": 4.362343588295309e-05, + "loss": 3.2631, + "step": 39091 + }, + { + "epoch": 0.2324911980207441, + "grad_norm": 2.2207345962524414, + "learning_rate": 4.36231242622321e-05, + "loss": 3.4677, + "step": 39092 + }, + { + "epoch": 0.23249714530402513, + "grad_norm": 2.4080615043640137, + "learning_rate": 4.3622812635009967e-05, + "loss": 3.3182, + "step": 39093 + }, + { + "epoch": 0.23250309258730611, + "grad_norm": 2.1918601989746094, + "learning_rate": 4.3622501001286806e-05, + "loss": 3.3297, + "step": 39094 + }, + { + "epoch": 0.2325090398705871, + "grad_norm": 2.7159063816070557, + "learning_rate": 4.362218936106272e-05, + "loss": 3.4462, + "step": 39095 + }, + { + "epoch": 0.23251498715386812, + "grad_norm": 2.3878097534179688, + "learning_rate": 4.362187771433782e-05, + "loss": 3.2406, + "step": 39096 + }, + { + "epoch": 0.2325209344371491, + "grad_norm": 2.8980376720428467, + "learning_rate": 4.362156606111222e-05, + "loss": 3.4155, + "step": 39097 + }, + { + "epoch": 0.2325268817204301, + "grad_norm": 2.0443594455718994, + "learning_rate": 4.362125440138601e-05, + "loss": 3.4641, + "step": 39098 + }, + { + "epoch": 0.2325328290037111, + "grad_norm": 2.1212189197540283, + "learning_rate": 4.362094273515933e-05, + "loss": 4.1194, + "step": 39099 + }, + { + "epoch": 0.2325387762869921, + "grad_norm": 2.186098575592041, + "learning_rate": 4.362063106243226e-05, + "loss": 4.1914, + "step": 39100 + }, + { + "epoch": 0.2325447235702731, + "grad_norm": 1.6000093221664429, + "learning_rate": 4.362031938320492e-05, + "loss": 4.987, + "step": 39101 + }, + { + "epoch": 0.2325506708535541, + "grad_norm": 1.4070879220962524, + "learning_rate": 4.362000769747743e-05, + "loss": 5.2052, + "step": 39102 + }, + { + "epoch": 0.2325566181368351, + "grad_norm": 1.739212989807129, + "learning_rate": 4.361969600524988e-05, + "loss": 4.398, + "step": 39103 + }, + { + "epoch": 0.23256256542011608, + "grad_norm": 2.334226369857788, + "learning_rate": 4.361938430652238e-05, + "loss": 3.6885, + "step": 39104 + }, + { + "epoch": 0.2325685127033971, + "grad_norm": 1.7967642545700073, + "learning_rate": 4.361907260129505e-05, + "loss": 4.3779, + "step": 39105 + }, + { + "epoch": 0.23257445998667808, + "grad_norm": 1.9032526016235352, + "learning_rate": 4.3618760889568e-05, + "loss": 4.4896, + "step": 39106 + }, + { + "epoch": 0.23258040726995907, + "grad_norm": 1.9198237657546997, + "learning_rate": 4.3618449171341324e-05, + "loss": 4.3165, + "step": 39107 + }, + { + "epoch": 0.2325863545532401, + "grad_norm": 2.0512235164642334, + "learning_rate": 4.3618137446615146e-05, + "loss": 4.025, + "step": 39108 + }, + { + "epoch": 0.23259230183652108, + "grad_norm": 1.6357065439224243, + "learning_rate": 4.361782571538957e-05, + "loss": 4.2988, + "step": 39109 + }, + { + "epoch": 0.23259824911980206, + "grad_norm": 2.0023303031921387, + "learning_rate": 4.3617513977664695e-05, + "loss": 3.7465, + "step": 39110 + }, + { + "epoch": 0.23260419640308308, + "grad_norm": 2.0384459495544434, + "learning_rate": 4.3617202233440646e-05, + "loss": 3.9628, + "step": 39111 + }, + { + "epoch": 0.23261014368636407, + "grad_norm": 2.0617830753326416, + "learning_rate": 4.361689048271752e-05, + "loss": 4.2029, + "step": 39112 + }, + { + "epoch": 0.23261609096964506, + "grad_norm": 2.023827314376831, + "learning_rate": 4.361657872549544e-05, + "loss": 4.1771, + "step": 39113 + }, + { + "epoch": 0.23262203825292607, + "grad_norm": 1.7333523035049438, + "learning_rate": 4.361626696177449e-05, + "loss": 4.4811, + "step": 39114 + }, + { + "epoch": 0.23262798553620706, + "grad_norm": 1.7693727016448975, + "learning_rate": 4.36159551915548e-05, + "loss": 4.4168, + "step": 39115 + }, + { + "epoch": 0.23263393281948805, + "grad_norm": 1.8410542011260986, + "learning_rate": 4.3615643414836474e-05, + "loss": 4.3063, + "step": 39116 + }, + { + "epoch": 0.23263988010276906, + "grad_norm": 1.760138750076294, + "learning_rate": 4.3615331631619615e-05, + "loss": 4.4036, + "step": 39117 + }, + { + "epoch": 0.23264582738605005, + "grad_norm": 1.694985270500183, + "learning_rate": 4.361501984190434e-05, + "loss": 4.4535, + "step": 39118 + }, + { + "epoch": 0.23265177466933104, + "grad_norm": 1.8033300638198853, + "learning_rate": 4.361470804569075e-05, + "loss": 4.5579, + "step": 39119 + }, + { + "epoch": 0.23265772195261206, + "grad_norm": 1.8068331480026245, + "learning_rate": 4.361439624297896e-05, + "loss": 4.1694, + "step": 39120 + }, + { + "epoch": 0.23266366923589304, + "grad_norm": 1.616873025894165, + "learning_rate": 4.361408443376908e-05, + "loss": 4.4546, + "step": 39121 + }, + { + "epoch": 0.23266961651917403, + "grad_norm": 1.3989241123199463, + "learning_rate": 4.361377261806121e-05, + "loss": 5.0763, + "step": 39122 + }, + { + "epoch": 0.23267556380245505, + "grad_norm": 1.6721818447113037, + "learning_rate": 4.361346079585546e-05, + "loss": 4.9419, + "step": 39123 + }, + { + "epoch": 0.23268151108573604, + "grad_norm": 1.5129653215408325, + "learning_rate": 4.361314896715195e-05, + "loss": 5.3604, + "step": 39124 + }, + { + "epoch": 0.23268745836901703, + "grad_norm": 1.467267394065857, + "learning_rate": 4.3612837131950784e-05, + "loss": 5.3583, + "step": 39125 + }, + { + "epoch": 0.23269340565229804, + "grad_norm": 1.2865101099014282, + "learning_rate": 4.3612525290252066e-05, + "loss": 5.3188, + "step": 39126 + }, + { + "epoch": 0.23269935293557903, + "grad_norm": 1.2326202392578125, + "learning_rate": 4.36122134420559e-05, + "loss": 5.2643, + "step": 39127 + }, + { + "epoch": 0.23270530021886002, + "grad_norm": 1.490971565246582, + "learning_rate": 4.361190158736242e-05, + "loss": 5.0676, + "step": 39128 + }, + { + "epoch": 0.23271124750214103, + "grad_norm": 1.7493031024932861, + "learning_rate": 4.361158972617171e-05, + "loss": 4.5299, + "step": 39129 + }, + { + "epoch": 0.23271719478542202, + "grad_norm": 1.8924753665924072, + "learning_rate": 4.361127785848388e-05, + "loss": 4.4365, + "step": 39130 + }, + { + "epoch": 0.232723142068703, + "grad_norm": 1.9791909456253052, + "learning_rate": 4.3610965984299045e-05, + "loss": 3.9715, + "step": 39131 + }, + { + "epoch": 0.23272908935198403, + "grad_norm": 2.780217170715332, + "learning_rate": 4.3610654103617323e-05, + "loss": 3.503, + "step": 39132 + }, + { + "epoch": 0.232735036635265, + "grad_norm": 1.5088809728622437, + "learning_rate": 4.361034221643881e-05, + "loss": 5.2044, + "step": 39133 + }, + { + "epoch": 0.232740983918546, + "grad_norm": 1.5319719314575195, + "learning_rate": 4.361003032276362e-05, + "loss": 5.2335, + "step": 39134 + }, + { + "epoch": 0.23274693120182702, + "grad_norm": 1.5246729850769043, + "learning_rate": 4.360971842259186e-05, + "loss": 5.1811, + "step": 39135 + }, + { + "epoch": 0.232752878485108, + "grad_norm": 1.5429359674453735, + "learning_rate": 4.3609406515923634e-05, + "loss": 5.5938, + "step": 39136 + }, + { + "epoch": 0.232758825768389, + "grad_norm": 1.4963136911392212, + "learning_rate": 4.360909460275906e-05, + "loss": 5.6445, + "step": 39137 + }, + { + "epoch": 0.23276477305167, + "grad_norm": 1.2514957189559937, + "learning_rate": 4.360878268309825e-05, + "loss": 5.3839, + "step": 39138 + }, + { + "epoch": 0.232770720334951, + "grad_norm": 1.644110918045044, + "learning_rate": 4.36084707569413e-05, + "loss": 4.8794, + "step": 39139 + }, + { + "epoch": 0.23277666761823199, + "grad_norm": 1.8288604021072388, + "learning_rate": 4.360815882428832e-05, + "loss": 4.5039, + "step": 39140 + }, + { + "epoch": 0.232782614901513, + "grad_norm": 2.1350958347320557, + "learning_rate": 4.3607846885139434e-05, + "loss": 3.9829, + "step": 39141 + }, + { + "epoch": 0.232788562184794, + "grad_norm": 2.164173126220703, + "learning_rate": 4.360753493949473e-05, + "loss": 4.2228, + "step": 39142 + }, + { + "epoch": 0.23279450946807498, + "grad_norm": 2.1720077991485596, + "learning_rate": 4.3607222987354335e-05, + "loss": 4.222, + "step": 39143 + }, + { + "epoch": 0.232800456751356, + "grad_norm": 2.344161033630371, + "learning_rate": 4.360691102871835e-05, + "loss": 4.1798, + "step": 39144 + }, + { + "epoch": 0.23280640403463698, + "grad_norm": 2.174213409423828, + "learning_rate": 4.3606599063586886e-05, + "loss": 4.1144, + "step": 39145 + }, + { + "epoch": 0.23281235131791797, + "grad_norm": 2.1762256622314453, + "learning_rate": 4.360628709196005e-05, + "loss": 4.3624, + "step": 39146 + }, + { + "epoch": 0.23281829860119896, + "grad_norm": 1.9964789152145386, + "learning_rate": 4.360597511383795e-05, + "loss": 4.3162, + "step": 39147 + }, + { + "epoch": 0.23282424588447997, + "grad_norm": 2.0995092391967773, + "learning_rate": 4.36056631292207e-05, + "loss": 4.3076, + "step": 39148 + }, + { + "epoch": 0.23283019316776096, + "grad_norm": 1.65229070186615, + "learning_rate": 4.36053511381084e-05, + "loss": 5.1219, + "step": 39149 + }, + { + "epoch": 0.23283614045104195, + "grad_norm": 1.7693278789520264, + "learning_rate": 4.360503914050116e-05, + "loss": 5.0181, + "step": 39150 + }, + { + "epoch": 0.23284208773432297, + "grad_norm": 1.9694136381149292, + "learning_rate": 4.3604727136399105e-05, + "loss": 4.4619, + "step": 39151 + }, + { + "epoch": 0.23284803501760395, + "grad_norm": 2.2055177688598633, + "learning_rate": 4.360441512580232e-05, + "loss": 4.2537, + "step": 39152 + }, + { + "epoch": 0.23285398230088494, + "grad_norm": 1.8748949766159058, + "learning_rate": 4.360410310871094e-05, + "loss": 4.4831, + "step": 39153 + }, + { + "epoch": 0.23285992958416596, + "grad_norm": 1.6037344932556152, + "learning_rate": 4.360379108512504e-05, + "loss": 5.0677, + "step": 39154 + }, + { + "epoch": 0.23286587686744695, + "grad_norm": 1.627509355545044, + "learning_rate": 4.3603479055044774e-05, + "loss": 4.7995, + "step": 39155 + }, + { + "epoch": 0.23287182415072794, + "grad_norm": 1.7337157726287842, + "learning_rate": 4.36031670184702e-05, + "loss": 5.0422, + "step": 39156 + }, + { + "epoch": 0.23287777143400895, + "grad_norm": 1.6469885110855103, + "learning_rate": 4.360285497540148e-05, + "loss": 4.7927, + "step": 39157 + }, + { + "epoch": 0.23288371871728994, + "grad_norm": 1.5350927114486694, + "learning_rate": 4.360254292583867e-05, + "loss": 4.7906, + "step": 39158 + }, + { + "epoch": 0.23288966600057093, + "grad_norm": 1.4045746326446533, + "learning_rate": 4.360223086978191e-05, + "loss": 4.6738, + "step": 39159 + }, + { + "epoch": 0.23289561328385194, + "grad_norm": 1.599743127822876, + "learning_rate": 4.360191880723131e-05, + "loss": 5.1313, + "step": 39160 + }, + { + "epoch": 0.23290156056713293, + "grad_norm": 1.453123927116394, + "learning_rate": 4.360160673818697e-05, + "loss": 5.1246, + "step": 39161 + }, + { + "epoch": 0.23290750785041392, + "grad_norm": 1.555281639099121, + "learning_rate": 4.360129466264901e-05, + "loss": 5.1001, + "step": 39162 + }, + { + "epoch": 0.23291345513369494, + "grad_norm": 1.3762109279632568, + "learning_rate": 4.360098258061752e-05, + "loss": 5.156, + "step": 39163 + }, + { + "epoch": 0.23291940241697592, + "grad_norm": 1.4758714437484741, + "learning_rate": 4.360067049209262e-05, + "loss": 5.0616, + "step": 39164 + }, + { + "epoch": 0.2329253497002569, + "grad_norm": 1.4332698583602905, + "learning_rate": 4.360035839707442e-05, + "loss": 5.0706, + "step": 39165 + }, + { + "epoch": 0.23293129698353793, + "grad_norm": 1.5097830295562744, + "learning_rate": 4.360004629556302e-05, + "loss": 4.9428, + "step": 39166 + }, + { + "epoch": 0.23293724426681892, + "grad_norm": 1.6742161512374878, + "learning_rate": 4.359973418755855e-05, + "loss": 4.8498, + "step": 39167 + }, + { + "epoch": 0.2329431915500999, + "grad_norm": 1.6985595226287842, + "learning_rate": 4.359942207306109e-05, + "loss": 4.7057, + "step": 39168 + }, + { + "epoch": 0.23294913883338092, + "grad_norm": 1.560068964958191, + "learning_rate": 4.359910995207078e-05, + "loss": 4.9408, + "step": 39169 + }, + { + "epoch": 0.2329550861166619, + "grad_norm": 1.4312219619750977, + "learning_rate": 4.35987978245877e-05, + "loss": 5.0214, + "step": 39170 + }, + { + "epoch": 0.2329610333999429, + "grad_norm": 1.3766241073608398, + "learning_rate": 4.359848569061198e-05, + "loss": 4.8688, + "step": 39171 + }, + { + "epoch": 0.2329669806832239, + "grad_norm": 1.4122978448867798, + "learning_rate": 4.359817355014371e-05, + "loss": 5.0583, + "step": 39172 + }, + { + "epoch": 0.2329729279665049, + "grad_norm": 1.368056297302246, + "learning_rate": 4.359786140318302e-05, + "loss": 4.9295, + "step": 39173 + }, + { + "epoch": 0.2329788752497859, + "grad_norm": 1.5163987874984741, + "learning_rate": 4.3597549249730003e-05, + "loss": 4.9142, + "step": 39174 + }, + { + "epoch": 0.2329848225330669, + "grad_norm": 1.3942031860351562, + "learning_rate": 4.359723708978478e-05, + "loss": 4.9853, + "step": 39175 + }, + { + "epoch": 0.2329907698163479, + "grad_norm": 1.6139392852783203, + "learning_rate": 4.3596924923347446e-05, + "loss": 5.0879, + "step": 39176 + }, + { + "epoch": 0.23299671709962888, + "grad_norm": 1.4891060590744019, + "learning_rate": 4.359661275041812e-05, + "loss": 5.4327, + "step": 39177 + }, + { + "epoch": 0.2330026643829099, + "grad_norm": 1.6758902072906494, + "learning_rate": 4.3596300570996905e-05, + "loss": 4.9565, + "step": 39178 + }, + { + "epoch": 0.23300861166619088, + "grad_norm": 1.9034432172775269, + "learning_rate": 4.359598838508392e-05, + "loss": 3.8357, + "step": 39179 + }, + { + "epoch": 0.23301455894947187, + "grad_norm": 1.4876021146774292, + "learning_rate": 4.359567619267927e-05, + "loss": 5.1076, + "step": 39180 + }, + { + "epoch": 0.2330205062327529, + "grad_norm": 1.623687744140625, + "learning_rate": 4.359536399378306e-05, + "loss": 4.9354, + "step": 39181 + }, + { + "epoch": 0.23302645351603388, + "grad_norm": 1.6403672695159912, + "learning_rate": 4.359505178839539e-05, + "loss": 4.8971, + "step": 39182 + }, + { + "epoch": 0.23303240079931486, + "grad_norm": 1.4088045358657837, + "learning_rate": 4.359473957651639e-05, + "loss": 4.696, + "step": 39183 + }, + { + "epoch": 0.23303834808259588, + "grad_norm": 1.4575159549713135, + "learning_rate": 4.3594427358146155e-05, + "loss": 5.0652, + "step": 39184 + }, + { + "epoch": 0.23304429536587687, + "grad_norm": 1.4742602109909058, + "learning_rate": 4.35941151332848e-05, + "loss": 4.663, + "step": 39185 + }, + { + "epoch": 0.23305024264915786, + "grad_norm": 1.45940101146698, + "learning_rate": 4.3593802901932434e-05, + "loss": 4.8016, + "step": 39186 + }, + { + "epoch": 0.23305618993243887, + "grad_norm": 1.4725275039672852, + "learning_rate": 4.359349066408915e-05, + "loss": 5.4615, + "step": 39187 + }, + { + "epoch": 0.23306213721571986, + "grad_norm": 1.3877811431884766, + "learning_rate": 4.3593178419755086e-05, + "loss": 5.362, + "step": 39188 + }, + { + "epoch": 0.23306808449900085, + "grad_norm": 1.5631160736083984, + "learning_rate": 4.359286616893033e-05, + "loss": 4.6152, + "step": 39189 + }, + { + "epoch": 0.23307403178228187, + "grad_norm": 1.55159592628479, + "learning_rate": 4.3592553911615e-05, + "loss": 5.1536, + "step": 39190 + }, + { + "epoch": 0.23307997906556285, + "grad_norm": 1.3627732992172241, + "learning_rate": 4.35922416478092e-05, + "loss": 5.3487, + "step": 39191 + }, + { + "epoch": 0.23308592634884384, + "grad_norm": 1.2899950742721558, + "learning_rate": 4.359192937751303e-05, + "loss": 5.4265, + "step": 39192 + }, + { + "epoch": 0.23309187363212486, + "grad_norm": 1.3154022693634033, + "learning_rate": 4.359161710072662e-05, + "loss": 5.2734, + "step": 39193 + }, + { + "epoch": 0.23309782091540585, + "grad_norm": 1.618874430656433, + "learning_rate": 4.3591304817450066e-05, + "loss": 5.0206, + "step": 39194 + }, + { + "epoch": 0.23310376819868683, + "grad_norm": 1.5921950340270996, + "learning_rate": 4.359099252768348e-05, + "loss": 4.6433, + "step": 39195 + }, + { + "epoch": 0.23310971548196785, + "grad_norm": 1.7619107961654663, + "learning_rate": 4.359068023142697e-05, + "loss": 4.5171, + "step": 39196 + }, + { + "epoch": 0.23311566276524884, + "grad_norm": 1.6703497171401978, + "learning_rate": 4.359036792868064e-05, + "loss": 4.5936, + "step": 39197 + }, + { + "epoch": 0.23312161004852983, + "grad_norm": 1.5407284498214722, + "learning_rate": 4.359005561944461e-05, + "loss": 4.7157, + "step": 39198 + }, + { + "epoch": 0.23312755733181084, + "grad_norm": 1.6638575792312622, + "learning_rate": 4.358974330371899e-05, + "loss": 4.3966, + "step": 39199 + }, + { + "epoch": 0.23313350461509183, + "grad_norm": 1.5696804523468018, + "learning_rate": 4.3589430981503875e-05, + "loss": 4.6406, + "step": 39200 + }, + { + "epoch": 0.23313945189837282, + "grad_norm": 1.6106598377227783, + "learning_rate": 4.358911865279939e-05, + "loss": 4.9825, + "step": 39201 + }, + { + "epoch": 0.23314539918165383, + "grad_norm": 1.5994102954864502, + "learning_rate": 4.3588806317605624e-05, + "loss": 5.1257, + "step": 39202 + }, + { + "epoch": 0.23315134646493482, + "grad_norm": 1.5106816291809082, + "learning_rate": 4.3588493975922704e-05, + "loss": 4.9696, + "step": 39203 + }, + { + "epoch": 0.2331572937482158, + "grad_norm": 1.8663996458053589, + "learning_rate": 4.358818162775073e-05, + "loss": 4.9723, + "step": 39204 + }, + { + "epoch": 0.2331632410314968, + "grad_norm": 1.5792741775512695, + "learning_rate": 4.3587869273089824e-05, + "loss": 5.6912, + "step": 39205 + }, + { + "epoch": 0.23316918831477781, + "grad_norm": 2.0187623500823975, + "learning_rate": 4.358755691194007e-05, + "loss": 4.0808, + "step": 39206 + }, + { + "epoch": 0.2331751355980588, + "grad_norm": 1.7841250896453857, + "learning_rate": 4.35872445443016e-05, + "loss": 4.8503, + "step": 39207 + }, + { + "epoch": 0.2331810828813398, + "grad_norm": 1.5512248277664185, + "learning_rate": 4.358693217017451e-05, + "loss": 4.7434, + "step": 39208 + }, + { + "epoch": 0.2331870301646208, + "grad_norm": 1.6511754989624023, + "learning_rate": 4.358661978955892e-05, + "loss": 4.7894, + "step": 39209 + }, + { + "epoch": 0.2331929774479018, + "grad_norm": 1.4584643840789795, + "learning_rate": 4.358630740245493e-05, + "loss": 4.4343, + "step": 39210 + }, + { + "epoch": 0.23319892473118278, + "grad_norm": 1.4316320419311523, + "learning_rate": 4.3585995008862654e-05, + "loss": 5.2399, + "step": 39211 + }, + { + "epoch": 0.2332048720144638, + "grad_norm": 1.9914770126342773, + "learning_rate": 4.35856826087822e-05, + "loss": 4.037, + "step": 39212 + }, + { + "epoch": 0.2332108192977448, + "grad_norm": 2.455686569213867, + "learning_rate": 4.358537020221367e-05, + "loss": 3.4472, + "step": 39213 + }, + { + "epoch": 0.23321676658102578, + "grad_norm": 2.050672769546509, + "learning_rate": 4.358505778915718e-05, + "loss": 3.5811, + "step": 39214 + }, + { + "epoch": 0.2332227138643068, + "grad_norm": 1.9716814756393433, + "learning_rate": 4.358474536961285e-05, + "loss": 4.2366, + "step": 39215 + }, + { + "epoch": 0.23322866114758778, + "grad_norm": 1.5395363569259644, + "learning_rate": 4.358443294358077e-05, + "loss": 4.7607, + "step": 39216 + }, + { + "epoch": 0.23323460843086877, + "grad_norm": 1.3587455749511719, + "learning_rate": 4.3584120511061045e-05, + "loss": 4.88, + "step": 39217 + }, + { + "epoch": 0.23324055571414978, + "grad_norm": 1.9777147769927979, + "learning_rate": 4.358380807205381e-05, + "loss": 4.0743, + "step": 39218 + }, + { + "epoch": 0.23324650299743077, + "grad_norm": 2.6593072414398193, + "learning_rate": 4.3583495626559155e-05, + "loss": 3.4124, + "step": 39219 + }, + { + "epoch": 0.23325245028071176, + "grad_norm": 2.5443968772888184, + "learning_rate": 4.358318317457719e-05, + "loss": 3.6349, + "step": 39220 + }, + { + "epoch": 0.23325839756399278, + "grad_norm": 1.8876529932022095, + "learning_rate": 4.358287071610804e-05, + "loss": 4.0327, + "step": 39221 + }, + { + "epoch": 0.23326434484727376, + "grad_norm": 2.037179708480835, + "learning_rate": 4.358255825115179e-05, + "loss": 3.7585, + "step": 39222 + }, + { + "epoch": 0.23327029213055475, + "grad_norm": 2.7959325313568115, + "learning_rate": 4.358224577970857e-05, + "loss": 3.5884, + "step": 39223 + }, + { + "epoch": 0.23327623941383577, + "grad_norm": 1.7984586954116821, + "learning_rate": 4.3581933301778466e-05, + "loss": 4.3233, + "step": 39224 + }, + { + "epoch": 0.23328218669711676, + "grad_norm": 1.8435425758361816, + "learning_rate": 4.358162081736161e-05, + "loss": 3.8402, + "step": 39225 + }, + { + "epoch": 0.23328813398039774, + "grad_norm": 1.887158989906311, + "learning_rate": 4.35813083264581e-05, + "loss": 4.4948, + "step": 39226 + }, + { + "epoch": 0.23329408126367876, + "grad_norm": 1.758460283279419, + "learning_rate": 4.358099582906806e-05, + "loss": 4.501, + "step": 39227 + }, + { + "epoch": 0.23330002854695975, + "grad_norm": 1.8516051769256592, + "learning_rate": 4.3580683325191576e-05, + "loss": 4.2182, + "step": 39228 + }, + { + "epoch": 0.23330597583024074, + "grad_norm": 1.7806837558746338, + "learning_rate": 4.3580370814828766e-05, + "loss": 4.108, + "step": 39229 + }, + { + "epoch": 0.23331192311352175, + "grad_norm": 1.9674243927001953, + "learning_rate": 4.358005829797974e-05, + "loss": 3.8963, + "step": 39230 + }, + { + "epoch": 0.23331787039680274, + "grad_norm": 1.6735156774520874, + "learning_rate": 4.357974577464461e-05, + "loss": 4.2549, + "step": 39231 + }, + { + "epoch": 0.23332381768008373, + "grad_norm": 1.656535029411316, + "learning_rate": 4.357943324482348e-05, + "loss": 4.5354, + "step": 39232 + }, + { + "epoch": 0.23332976496336474, + "grad_norm": 1.5299835205078125, + "learning_rate": 4.357912070851646e-05, + "loss": 4.4048, + "step": 39233 + }, + { + "epoch": 0.23333571224664573, + "grad_norm": 1.5637716054916382, + "learning_rate": 4.357880816572366e-05, + "loss": 4.3426, + "step": 39234 + }, + { + "epoch": 0.23334165952992672, + "grad_norm": 1.5826610326766968, + "learning_rate": 4.3578495616445205e-05, + "loss": 4.0884, + "step": 39235 + }, + { + "epoch": 0.23334760681320774, + "grad_norm": 1.7511299848556519, + "learning_rate": 4.357818306068117e-05, + "loss": 3.9539, + "step": 39236 + }, + { + "epoch": 0.23335355409648872, + "grad_norm": 2.037144422531128, + "learning_rate": 4.357787049843169e-05, + "loss": 3.8867, + "step": 39237 + }, + { + "epoch": 0.2333595013797697, + "grad_norm": 1.641832947731018, + "learning_rate": 4.3577557929696875e-05, + "loss": 4.6045, + "step": 39238 + }, + { + "epoch": 0.23336544866305073, + "grad_norm": 1.5627217292785645, + "learning_rate": 4.357724535447682e-05, + "loss": 5.0244, + "step": 39239 + }, + { + "epoch": 0.23337139594633172, + "grad_norm": 1.6410858631134033, + "learning_rate": 4.357693277277164e-05, + "loss": 5.1096, + "step": 39240 + }, + { + "epoch": 0.2333773432296127, + "grad_norm": 1.3919554948806763, + "learning_rate": 4.357662018458145e-05, + "loss": 5.0005, + "step": 39241 + }, + { + "epoch": 0.23338329051289372, + "grad_norm": 1.5992203950881958, + "learning_rate": 4.357630758990634e-05, + "loss": 4.9081, + "step": 39242 + }, + { + "epoch": 0.2333892377961747, + "grad_norm": 1.3488589525222778, + "learning_rate": 4.3575994988746446e-05, + "loss": 4.9424, + "step": 39243 + }, + { + "epoch": 0.2333951850794557, + "grad_norm": 1.512978196144104, + "learning_rate": 4.357568238110186e-05, + "loss": 4.9572, + "step": 39244 + }, + { + "epoch": 0.2334011323627367, + "grad_norm": 1.5400868654251099, + "learning_rate": 4.35753697669727e-05, + "loss": 4.9061, + "step": 39245 + }, + { + "epoch": 0.2334070796460177, + "grad_norm": 1.6540729999542236, + "learning_rate": 4.3575057146359065e-05, + "loss": 5.0591, + "step": 39246 + }, + { + "epoch": 0.2334130269292987, + "grad_norm": 1.5842341184616089, + "learning_rate": 4.357474451926107e-05, + "loss": 4.7289, + "step": 39247 + }, + { + "epoch": 0.2334189742125797, + "grad_norm": 1.6024073362350464, + "learning_rate": 4.3574431885678824e-05, + "loss": 4.7724, + "step": 39248 + }, + { + "epoch": 0.2334249214958607, + "grad_norm": 1.4475951194763184, + "learning_rate": 4.3574119245612435e-05, + "loss": 4.9537, + "step": 39249 + }, + { + "epoch": 0.23343086877914168, + "grad_norm": 1.5023313760757446, + "learning_rate": 4.3573806599062015e-05, + "loss": 4.7418, + "step": 39250 + }, + { + "epoch": 0.2334368160624227, + "grad_norm": 1.4264357089996338, + "learning_rate": 4.357349394602767e-05, + "loss": 4.8347, + "step": 39251 + }, + { + "epoch": 0.23344276334570369, + "grad_norm": 1.4076826572418213, + "learning_rate": 4.357318128650951e-05, + "loss": 4.875, + "step": 39252 + }, + { + "epoch": 0.23344871062898467, + "grad_norm": 1.416060447692871, + "learning_rate": 4.357286862050764e-05, + "loss": 4.856, + "step": 39253 + }, + { + "epoch": 0.2334546579122657, + "grad_norm": 1.4814610481262207, + "learning_rate": 4.357255594802218e-05, + "loss": 4.8171, + "step": 39254 + }, + { + "epoch": 0.23346060519554668, + "grad_norm": 1.5342730283737183, + "learning_rate": 4.357224326905323e-05, + "loss": 4.8825, + "step": 39255 + }, + { + "epoch": 0.23346655247882767, + "grad_norm": 1.477780818939209, + "learning_rate": 4.357193058360091e-05, + "loss": 4.7599, + "step": 39256 + }, + { + "epoch": 0.23347249976210868, + "grad_norm": 1.4768816232681274, + "learning_rate": 4.35716178916653e-05, + "loss": 4.8948, + "step": 39257 + }, + { + "epoch": 0.23347844704538967, + "grad_norm": 1.5315289497375488, + "learning_rate": 4.357130519324655e-05, + "loss": 4.808, + "step": 39258 + }, + { + "epoch": 0.23348439432867066, + "grad_norm": 1.530362606048584, + "learning_rate": 4.357099248834474e-05, + "loss": 4.7999, + "step": 39259 + }, + { + "epoch": 0.23349034161195167, + "grad_norm": 1.496216058731079, + "learning_rate": 4.3570679776959994e-05, + "loss": 4.8051, + "step": 39260 + }, + { + "epoch": 0.23349628889523266, + "grad_norm": 1.5579525232315063, + "learning_rate": 4.357036705909241e-05, + "loss": 4.7486, + "step": 39261 + }, + { + "epoch": 0.23350223617851365, + "grad_norm": 1.4911562204360962, + "learning_rate": 4.357005433474211e-05, + "loss": 4.7698, + "step": 39262 + }, + { + "epoch": 0.23350818346179464, + "grad_norm": 1.5691003799438477, + "learning_rate": 4.3569741603909185e-05, + "loss": 4.8152, + "step": 39263 + }, + { + "epoch": 0.23351413074507565, + "grad_norm": 1.4888197183609009, + "learning_rate": 4.3569428866593764e-05, + "loss": 4.8731, + "step": 39264 + }, + { + "epoch": 0.23352007802835664, + "grad_norm": 1.3809503316879272, + "learning_rate": 4.356911612279594e-05, + "loss": 4.8566, + "step": 39265 + }, + { + "epoch": 0.23352602531163763, + "grad_norm": 1.520364761352539, + "learning_rate": 4.356880337251584e-05, + "loss": 4.919, + "step": 39266 + }, + { + "epoch": 0.23353197259491865, + "grad_norm": 1.36786949634552, + "learning_rate": 4.3568490615753556e-05, + "loss": 4.9626, + "step": 39267 + }, + { + "epoch": 0.23353791987819963, + "grad_norm": 1.5701583623886108, + "learning_rate": 4.35681778525092e-05, + "loss": 4.989, + "step": 39268 + }, + { + "epoch": 0.23354386716148062, + "grad_norm": 1.272887945175171, + "learning_rate": 4.3567865082782886e-05, + "loss": 4.9161, + "step": 39269 + }, + { + "epoch": 0.23354981444476164, + "grad_norm": 1.4108949899673462, + "learning_rate": 4.3567552306574734e-05, + "loss": 4.9134, + "step": 39270 + }, + { + "epoch": 0.23355576172804263, + "grad_norm": 1.3697681427001953, + "learning_rate": 4.356723952388483e-05, + "loss": 4.8394, + "step": 39271 + }, + { + "epoch": 0.23356170901132361, + "grad_norm": 1.5496501922607422, + "learning_rate": 4.356692673471329e-05, + "loss": 4.7908, + "step": 39272 + }, + { + "epoch": 0.23356765629460463, + "grad_norm": 1.1896272897720337, + "learning_rate": 4.3566613939060244e-05, + "loss": 5.0675, + "step": 39273 + }, + { + "epoch": 0.23357360357788562, + "grad_norm": 1.4039702415466309, + "learning_rate": 4.356630113692577e-05, + "loss": 4.8312, + "step": 39274 + }, + { + "epoch": 0.2335795508611666, + "grad_norm": 1.480469822883606, + "learning_rate": 4.3565988328309994e-05, + "loss": 4.8627, + "step": 39275 + }, + { + "epoch": 0.23358549814444762, + "grad_norm": 1.5712021589279175, + "learning_rate": 4.356567551321303e-05, + "loss": 4.9239, + "step": 39276 + }, + { + "epoch": 0.2335914454277286, + "grad_norm": 1.4389845132827759, + "learning_rate": 4.356536269163497e-05, + "loss": 4.8025, + "step": 39277 + }, + { + "epoch": 0.2335973927110096, + "grad_norm": 1.3848538398742676, + "learning_rate": 4.356504986357595e-05, + "loss": 4.7643, + "step": 39278 + }, + { + "epoch": 0.23360333999429062, + "grad_norm": 1.4079018831253052, + "learning_rate": 4.356473702903605e-05, + "loss": 4.8716, + "step": 39279 + }, + { + "epoch": 0.2336092872775716, + "grad_norm": 1.4892137050628662, + "learning_rate": 4.356442418801539e-05, + "loss": 4.9984, + "step": 39280 + }, + { + "epoch": 0.2336152345608526, + "grad_norm": 1.305431604385376, + "learning_rate": 4.356411134051409e-05, + "loss": 4.8958, + "step": 39281 + }, + { + "epoch": 0.2336211818441336, + "grad_norm": 1.395667552947998, + "learning_rate": 4.356379848653225e-05, + "loss": 4.7811, + "step": 39282 + }, + { + "epoch": 0.2336271291274146, + "grad_norm": 1.485810399055481, + "learning_rate": 4.356348562606998e-05, + "loss": 4.7185, + "step": 39283 + }, + { + "epoch": 0.23363307641069558, + "grad_norm": 1.3166253566741943, + "learning_rate": 4.356317275912739e-05, + "loss": 4.6181, + "step": 39284 + }, + { + "epoch": 0.2336390236939766, + "grad_norm": 1.387439489364624, + "learning_rate": 4.356285988570458e-05, + "loss": 4.6781, + "step": 39285 + }, + { + "epoch": 0.2336449709772576, + "grad_norm": 1.4118428230285645, + "learning_rate": 4.356254700580167e-05, + "loss": 4.767, + "step": 39286 + }, + { + "epoch": 0.23365091826053858, + "grad_norm": 1.4941892623901367, + "learning_rate": 4.356223411941877e-05, + "loss": 4.7696, + "step": 39287 + }, + { + "epoch": 0.2336568655438196, + "grad_norm": 1.5000381469726562, + "learning_rate": 4.356192122655599e-05, + "loss": 4.7611, + "step": 39288 + }, + { + "epoch": 0.23366281282710058, + "grad_norm": 1.4702129364013672, + "learning_rate": 4.3561608327213424e-05, + "loss": 4.6394, + "step": 39289 + }, + { + "epoch": 0.23366876011038157, + "grad_norm": 1.28829824924469, + "learning_rate": 4.35612954213912e-05, + "loss": 4.7623, + "step": 39290 + }, + { + "epoch": 0.23367470739366258, + "grad_norm": 1.360546350479126, + "learning_rate": 4.3560982509089416e-05, + "loss": 4.8757, + "step": 39291 + }, + { + "epoch": 0.23368065467694357, + "grad_norm": 1.375982403755188, + "learning_rate": 4.3560669590308184e-05, + "loss": 4.8164, + "step": 39292 + }, + { + "epoch": 0.23368660196022456, + "grad_norm": 1.4468094110488892, + "learning_rate": 4.356035666504762e-05, + "loss": 4.8452, + "step": 39293 + }, + { + "epoch": 0.23369254924350558, + "grad_norm": 1.5260746479034424, + "learning_rate": 4.356004373330782e-05, + "loss": 4.6426, + "step": 39294 + }, + { + "epoch": 0.23369849652678656, + "grad_norm": 1.4539531469345093, + "learning_rate": 4.355973079508891e-05, + "loss": 4.4602, + "step": 39295 + }, + { + "epoch": 0.23370444381006755, + "grad_norm": 2.001392126083374, + "learning_rate": 4.3559417850390974e-05, + "loss": 4.1524, + "step": 39296 + }, + { + "epoch": 0.23371039109334857, + "grad_norm": 2.0820767879486084, + "learning_rate": 4.355910489921415e-05, + "loss": 4.1542, + "step": 39297 + }, + { + "epoch": 0.23371633837662956, + "grad_norm": 2.4654488563537598, + "learning_rate": 4.355879194155853e-05, + "loss": 3.8269, + "step": 39298 + }, + { + "epoch": 0.23372228565991054, + "grad_norm": 2.231116533279419, + "learning_rate": 4.355847897742423e-05, + "loss": 3.6304, + "step": 39299 + }, + { + "epoch": 0.23372823294319156, + "grad_norm": 2.2285022735595703, + "learning_rate": 4.355816600681135e-05, + "loss": 4.0732, + "step": 39300 + }, + { + "epoch": 0.23373418022647255, + "grad_norm": 1.592862606048584, + "learning_rate": 4.355785302972001e-05, + "loss": 4.5091, + "step": 39301 + }, + { + "epoch": 0.23374012750975354, + "grad_norm": 1.5630459785461426, + "learning_rate": 4.355754004615032e-05, + "loss": 4.8329, + "step": 39302 + }, + { + "epoch": 0.23374607479303455, + "grad_norm": 1.876397967338562, + "learning_rate": 4.355722705610238e-05, + "loss": 4.5441, + "step": 39303 + }, + { + "epoch": 0.23375202207631554, + "grad_norm": 1.4842675924301147, + "learning_rate": 4.35569140595763e-05, + "loss": 4.6346, + "step": 39304 + }, + { + "epoch": 0.23375796935959653, + "grad_norm": 1.9447697401046753, + "learning_rate": 4.35566010565722e-05, + "loss": 4.4493, + "step": 39305 + }, + { + "epoch": 0.23376391664287755, + "grad_norm": 1.618638277053833, + "learning_rate": 4.355628804709019e-05, + "loss": 4.2402, + "step": 39306 + }, + { + "epoch": 0.23376986392615853, + "grad_norm": 1.5623420476913452, + "learning_rate": 4.355597503113035e-05, + "loss": 4.2127, + "step": 39307 + }, + { + "epoch": 0.23377581120943952, + "grad_norm": 1.6455943584442139, + "learning_rate": 4.3555662008692824e-05, + "loss": 4.9097, + "step": 39308 + }, + { + "epoch": 0.23378175849272054, + "grad_norm": 1.5201420783996582, + "learning_rate": 4.355534897977771e-05, + "loss": 3.9863, + "step": 39309 + }, + { + "epoch": 0.23378770577600153, + "grad_norm": 1.621320128440857, + "learning_rate": 4.355503594438511e-05, + "loss": 4.1855, + "step": 39310 + }, + { + "epoch": 0.2337936530592825, + "grad_norm": 1.6072403192520142, + "learning_rate": 4.355472290251514e-05, + "loss": 4.1234, + "step": 39311 + }, + { + "epoch": 0.23379960034256353, + "grad_norm": 1.7549769878387451, + "learning_rate": 4.355440985416791e-05, + "loss": 3.9104, + "step": 39312 + }, + { + "epoch": 0.23380554762584452, + "grad_norm": 1.7547187805175781, + "learning_rate": 4.355409679934352e-05, + "loss": 5.3788, + "step": 39313 + }, + { + "epoch": 0.2338114949091255, + "grad_norm": 1.6373289823532104, + "learning_rate": 4.35537837380421e-05, + "loss": 4.2316, + "step": 39314 + }, + { + "epoch": 0.23381744219240652, + "grad_norm": 1.8216767311096191, + "learning_rate": 4.355347067026374e-05, + "loss": 3.9499, + "step": 39315 + }, + { + "epoch": 0.2338233894756875, + "grad_norm": 1.811559796333313, + "learning_rate": 4.3553157596008544e-05, + "loss": 3.6429, + "step": 39316 + }, + { + "epoch": 0.2338293367589685, + "grad_norm": 1.8670121431350708, + "learning_rate": 4.3552844515276645e-05, + "loss": 3.5439, + "step": 39317 + }, + { + "epoch": 0.2338352840422495, + "grad_norm": 1.8059818744659424, + "learning_rate": 4.3552531428068136e-05, + "loss": 3.6974, + "step": 39318 + }, + { + "epoch": 0.2338412313255305, + "grad_norm": 1.666537880897522, + "learning_rate": 4.3552218334383124e-05, + "loss": 3.8927, + "step": 39319 + }, + { + "epoch": 0.2338471786088115, + "grad_norm": 1.8872041702270508, + "learning_rate": 4.355190523422173e-05, + "loss": 3.6902, + "step": 39320 + }, + { + "epoch": 0.2338531258920925, + "grad_norm": 1.8386341333389282, + "learning_rate": 4.355159212758406e-05, + "loss": 3.7288, + "step": 39321 + }, + { + "epoch": 0.2338590731753735, + "grad_norm": 1.7837934494018555, + "learning_rate": 4.355127901447022e-05, + "loss": 3.8012, + "step": 39322 + }, + { + "epoch": 0.23386502045865448, + "grad_norm": 1.7940096855163574, + "learning_rate": 4.3550965894880315e-05, + "loss": 3.5241, + "step": 39323 + }, + { + "epoch": 0.23387096774193547, + "grad_norm": 1.997152328491211, + "learning_rate": 4.355065276881447e-05, + "loss": 3.6414, + "step": 39324 + }, + { + "epoch": 0.2338769150252165, + "grad_norm": 1.6963967084884644, + "learning_rate": 4.3550339636272775e-05, + "loss": 3.6445, + "step": 39325 + }, + { + "epoch": 0.23388286230849747, + "grad_norm": 2.0039589405059814, + "learning_rate": 4.3550026497255346e-05, + "loss": 3.7224, + "step": 39326 + }, + { + "epoch": 0.23388880959177846, + "grad_norm": 1.8233304023742676, + "learning_rate": 4.35497133517623e-05, + "loss": 3.6538, + "step": 39327 + }, + { + "epoch": 0.23389475687505948, + "grad_norm": 1.8335343599319458, + "learning_rate": 4.354940019979374e-05, + "loss": 3.614, + "step": 39328 + }, + { + "epoch": 0.23390070415834047, + "grad_norm": 1.7205346822738647, + "learning_rate": 4.3549087041349774e-05, + "loss": 3.7804, + "step": 39329 + }, + { + "epoch": 0.23390665144162145, + "grad_norm": 1.8699969053268433, + "learning_rate": 4.354877387643052e-05, + "loss": 3.7193, + "step": 39330 + }, + { + "epoch": 0.23391259872490247, + "grad_norm": 1.853049397468567, + "learning_rate": 4.354846070503607e-05, + "loss": 3.7105, + "step": 39331 + }, + { + "epoch": 0.23391854600818346, + "grad_norm": 1.8523236513137817, + "learning_rate": 4.354814752716655e-05, + "loss": 3.6334, + "step": 39332 + }, + { + "epoch": 0.23392449329146445, + "grad_norm": 1.8499833345413208, + "learning_rate": 4.354783434282206e-05, + "loss": 3.8866, + "step": 39333 + }, + { + "epoch": 0.23393044057474546, + "grad_norm": 1.9343851804733276, + "learning_rate": 4.354752115200271e-05, + "loss": 3.6149, + "step": 39334 + }, + { + "epoch": 0.23393638785802645, + "grad_norm": 2.0238258838653564, + "learning_rate": 4.3547207954708624e-05, + "loss": 3.7493, + "step": 39335 + }, + { + "epoch": 0.23394233514130744, + "grad_norm": 1.860219120979309, + "learning_rate": 4.354689475093989e-05, + "loss": 3.6736, + "step": 39336 + }, + { + "epoch": 0.23394828242458846, + "grad_norm": 1.8583927154541016, + "learning_rate": 4.3546581540696626e-05, + "loss": 3.7352, + "step": 39337 + }, + { + "epoch": 0.23395422970786944, + "grad_norm": 1.753945231437683, + "learning_rate": 4.354626832397895e-05, + "loss": 3.5608, + "step": 39338 + }, + { + "epoch": 0.23396017699115043, + "grad_norm": 1.8373562097549438, + "learning_rate": 4.3545955100786956e-05, + "loss": 3.5984, + "step": 39339 + }, + { + "epoch": 0.23396612427443145, + "grad_norm": 1.7085545063018799, + "learning_rate": 4.354564187112076e-05, + "loss": 3.5978, + "step": 39340 + }, + { + "epoch": 0.23397207155771244, + "grad_norm": 1.6872609853744507, + "learning_rate": 4.354532863498047e-05, + "loss": 3.6985, + "step": 39341 + }, + { + "epoch": 0.23397801884099342, + "grad_norm": 1.8018085956573486, + "learning_rate": 4.3545015392366205e-05, + "loss": 3.5865, + "step": 39342 + }, + { + "epoch": 0.23398396612427444, + "grad_norm": 1.8761341571807861, + "learning_rate": 4.354470214327807e-05, + "loss": 3.6066, + "step": 39343 + }, + { + "epoch": 0.23398991340755543, + "grad_norm": 1.9360228776931763, + "learning_rate": 4.3544388887716167e-05, + "loss": 3.5594, + "step": 39344 + }, + { + "epoch": 0.23399586069083642, + "grad_norm": 1.6296970844268799, + "learning_rate": 4.3544075625680606e-05, + "loss": 3.4926, + "step": 39345 + }, + { + "epoch": 0.23400180797411743, + "grad_norm": 1.6063154935836792, + "learning_rate": 4.3543762357171504e-05, + "loss": 3.5129, + "step": 39346 + }, + { + "epoch": 0.23400775525739842, + "grad_norm": 1.8837140798568726, + "learning_rate": 4.354344908218896e-05, + "loss": 3.6908, + "step": 39347 + }, + { + "epoch": 0.2340137025406794, + "grad_norm": 1.7227901220321655, + "learning_rate": 4.354313580073309e-05, + "loss": 3.3831, + "step": 39348 + }, + { + "epoch": 0.23401964982396042, + "grad_norm": 1.6574761867523193, + "learning_rate": 4.3542822512804005e-05, + "loss": 3.5345, + "step": 39349 + }, + { + "epoch": 0.2340255971072414, + "grad_norm": 1.5883370637893677, + "learning_rate": 4.354250921840182e-05, + "loss": 4.9003, + "step": 39350 + }, + { + "epoch": 0.2340315443905224, + "grad_norm": 1.8118661642074585, + "learning_rate": 4.3542195917526625e-05, + "loss": 4.7851, + "step": 39351 + }, + { + "epoch": 0.23403749167380342, + "grad_norm": 1.4651440382003784, + "learning_rate": 4.354188261017855e-05, + "loss": 4.9024, + "step": 39352 + }, + { + "epoch": 0.2340434389570844, + "grad_norm": 1.6591970920562744, + "learning_rate": 4.3541569296357684e-05, + "loss": 4.523, + "step": 39353 + }, + { + "epoch": 0.2340493862403654, + "grad_norm": 1.9845739603042603, + "learning_rate": 4.354125597606415e-05, + "loss": 4.6265, + "step": 39354 + }, + { + "epoch": 0.2340553335236464, + "grad_norm": 1.547351360321045, + "learning_rate": 4.354094264929807e-05, + "loss": 4.7699, + "step": 39355 + }, + { + "epoch": 0.2340612808069274, + "grad_norm": 1.6275198459625244, + "learning_rate": 4.354062931605952e-05, + "loss": 4.8717, + "step": 39356 + }, + { + "epoch": 0.23406722809020838, + "grad_norm": 1.460517168045044, + "learning_rate": 4.354031597634864e-05, + "loss": 4.6653, + "step": 39357 + }, + { + "epoch": 0.2340731753734894, + "grad_norm": 1.3596324920654297, + "learning_rate": 4.354000263016552e-05, + "loss": 4.8387, + "step": 39358 + }, + { + "epoch": 0.2340791226567704, + "grad_norm": 1.639746904373169, + "learning_rate": 4.353968927751029e-05, + "loss": 4.5672, + "step": 39359 + }, + { + "epoch": 0.23408506994005138, + "grad_norm": 1.4255754947662354, + "learning_rate": 4.353937591838303e-05, + "loss": 4.6677, + "step": 39360 + }, + { + "epoch": 0.2340910172233324, + "grad_norm": 1.8433109521865845, + "learning_rate": 4.353906255278387e-05, + "loss": 4.4957, + "step": 39361 + }, + { + "epoch": 0.23409696450661338, + "grad_norm": 3.0114545822143555, + "learning_rate": 4.353874918071292e-05, + "loss": 4.411, + "step": 39362 + }, + { + "epoch": 0.23410291178989437, + "grad_norm": 2.885272264480591, + "learning_rate": 4.353843580217028e-05, + "loss": 4.1936, + "step": 39363 + }, + { + "epoch": 0.23410885907317538, + "grad_norm": 2.6946053504943848, + "learning_rate": 4.3538122417156065e-05, + "loss": 4.0882, + "step": 39364 + }, + { + "epoch": 0.23411480635645637, + "grad_norm": 1.7744818925857544, + "learning_rate": 4.3537809025670384e-05, + "loss": 5.003, + "step": 39365 + }, + { + "epoch": 0.23412075363973736, + "grad_norm": 1.6615489721298218, + "learning_rate": 4.353749562771334e-05, + "loss": 5.0962, + "step": 39366 + }, + { + "epoch": 0.23412670092301838, + "grad_norm": 1.8033353090286255, + "learning_rate": 4.353718222328506e-05, + "loss": 4.6759, + "step": 39367 + }, + { + "epoch": 0.23413264820629937, + "grad_norm": 1.5590002536773682, + "learning_rate": 4.3536868812385626e-05, + "loss": 4.7335, + "step": 39368 + }, + { + "epoch": 0.23413859548958035, + "grad_norm": 1.649537444114685, + "learning_rate": 4.3536555395015166e-05, + "loss": 4.7863, + "step": 39369 + }, + { + "epoch": 0.23414454277286137, + "grad_norm": 1.8537696599960327, + "learning_rate": 4.353624197117379e-05, + "loss": 4.89, + "step": 39370 + }, + { + "epoch": 0.23415049005614236, + "grad_norm": 1.4917176961898804, + "learning_rate": 4.35359285408616e-05, + "loss": 5.4885, + "step": 39371 + }, + { + "epoch": 0.23415643733942335, + "grad_norm": 1.3445696830749512, + "learning_rate": 4.353561510407872e-05, + "loss": 5.0474, + "step": 39372 + }, + { + "epoch": 0.23416238462270436, + "grad_norm": 1.8747609853744507, + "learning_rate": 4.3535301660825235e-05, + "loss": 4.9231, + "step": 39373 + }, + { + "epoch": 0.23416833190598535, + "grad_norm": 1.5974386930465698, + "learning_rate": 4.353498821110127e-05, + "loss": 4.4127, + "step": 39374 + }, + { + "epoch": 0.23417427918926634, + "grad_norm": 1.4220181703567505, + "learning_rate": 4.353467475490694e-05, + "loss": 4.4635, + "step": 39375 + }, + { + "epoch": 0.23418022647254735, + "grad_norm": 1.5561754703521729, + "learning_rate": 4.353436129224233e-05, + "loss": 4.7118, + "step": 39376 + }, + { + "epoch": 0.23418617375582834, + "grad_norm": 1.7198326587677002, + "learning_rate": 4.353404782310758e-05, + "loss": 4.9206, + "step": 39377 + }, + { + "epoch": 0.23419212103910933, + "grad_norm": 1.6239429712295532, + "learning_rate": 4.353373434750279e-05, + "loss": 4.9794, + "step": 39378 + }, + { + "epoch": 0.23419806832239035, + "grad_norm": 1.6422456502914429, + "learning_rate": 4.353342086542806e-05, + "loss": 5.5298, + "step": 39379 + }, + { + "epoch": 0.23420401560567133, + "grad_norm": 1.455367922782898, + "learning_rate": 4.353310737688349e-05, + "loss": 5.1242, + "step": 39380 + }, + { + "epoch": 0.23420996288895232, + "grad_norm": 1.3635815382003784, + "learning_rate": 4.3532793881869215e-05, + "loss": 5.1464, + "step": 39381 + }, + { + "epoch": 0.2342159101722333, + "grad_norm": 1.425964593887329, + "learning_rate": 4.3532480380385335e-05, + "loss": 5.1384, + "step": 39382 + }, + { + "epoch": 0.23422185745551433, + "grad_norm": 1.7653659582138062, + "learning_rate": 4.353216687243196e-05, + "loss": 5.2893, + "step": 39383 + }, + { + "epoch": 0.23422780473879531, + "grad_norm": 1.5686538219451904, + "learning_rate": 4.353185335800919e-05, + "loss": 4.9431, + "step": 39384 + }, + { + "epoch": 0.2342337520220763, + "grad_norm": 1.7350513935089111, + "learning_rate": 4.353153983711714e-05, + "loss": 5.0707, + "step": 39385 + }, + { + "epoch": 0.23423969930535732, + "grad_norm": 1.449817419052124, + "learning_rate": 4.353122630975593e-05, + "loss": 5.1242, + "step": 39386 + }, + { + "epoch": 0.2342456465886383, + "grad_norm": 1.4347769021987915, + "learning_rate": 4.3530912775925656e-05, + "loss": 4.9507, + "step": 39387 + }, + { + "epoch": 0.2342515938719193, + "grad_norm": 1.5073580741882324, + "learning_rate": 4.353059923562643e-05, + "loss": 4.9784, + "step": 39388 + }, + { + "epoch": 0.2342575411552003, + "grad_norm": 1.5459281206130981, + "learning_rate": 4.3530285688858365e-05, + "loss": 5.2701, + "step": 39389 + }, + { + "epoch": 0.2342634884384813, + "grad_norm": 1.6322553157806396, + "learning_rate": 4.3529972135621564e-05, + "loss": 5.5123, + "step": 39390 + }, + { + "epoch": 0.2342694357217623, + "grad_norm": 1.8771675825119019, + "learning_rate": 4.352965857591614e-05, + "loss": 4.513, + "step": 39391 + }, + { + "epoch": 0.2342753830050433, + "grad_norm": 1.77946138381958, + "learning_rate": 4.352934500974222e-05, + "loss": 4.8122, + "step": 39392 + }, + { + "epoch": 0.2342813302883243, + "grad_norm": 1.5952132940292358, + "learning_rate": 4.352903143709988e-05, + "loss": 4.9388, + "step": 39393 + }, + { + "epoch": 0.23428727757160528, + "grad_norm": 1.7433884143829346, + "learning_rate": 4.352871785798925e-05, + "loss": 4.7188, + "step": 39394 + }, + { + "epoch": 0.2342932248548863, + "grad_norm": 1.8550909757614136, + "learning_rate": 4.3528404272410443e-05, + "loss": 4.8925, + "step": 39395 + }, + { + "epoch": 0.23429917213816728, + "grad_norm": 1.7110971212387085, + "learning_rate": 4.3528090680363555e-05, + "loss": 4.9685, + "step": 39396 + }, + { + "epoch": 0.23430511942144827, + "grad_norm": 1.8549284934997559, + "learning_rate": 4.352777708184871e-05, + "loss": 4.728, + "step": 39397 + }, + { + "epoch": 0.2343110667047293, + "grad_norm": 1.521715521812439, + "learning_rate": 4.3527463476865995e-05, + "loss": 4.41, + "step": 39398 + }, + { + "epoch": 0.23431701398801028, + "grad_norm": 1.7038367986679077, + "learning_rate": 4.352714986541555e-05, + "loss": 4.301, + "step": 39399 + }, + { + "epoch": 0.23432296127129126, + "grad_norm": 1.6658827066421509, + "learning_rate": 4.352683624749746e-05, + "loss": 5.3613, + "step": 39400 + }, + { + "epoch": 0.23432890855457228, + "grad_norm": 1.850461483001709, + "learning_rate": 4.3526522623111843e-05, + "loss": 5.0851, + "step": 39401 + }, + { + "epoch": 0.23433485583785327, + "grad_norm": 1.600386142730713, + "learning_rate": 4.352620899225881e-05, + "loss": 4.8299, + "step": 39402 + }, + { + "epoch": 0.23434080312113426, + "grad_norm": 1.6664706468582153, + "learning_rate": 4.352589535493846e-05, + "loss": 5.2305, + "step": 39403 + }, + { + "epoch": 0.23434675040441527, + "grad_norm": 1.576553225517273, + "learning_rate": 4.352558171115092e-05, + "loss": 5.5279, + "step": 39404 + }, + { + "epoch": 0.23435269768769626, + "grad_norm": 1.6637064218521118, + "learning_rate": 4.3525268060896296e-05, + "loss": 5.5165, + "step": 39405 + }, + { + "epoch": 0.23435864497097725, + "grad_norm": 1.6022828817367554, + "learning_rate": 4.352495440417468e-05, + "loss": 4.6025, + "step": 39406 + }, + { + "epoch": 0.23436459225425826, + "grad_norm": 2.1038920879364014, + "learning_rate": 4.352464074098621e-05, + "loss": 4.2571, + "step": 39407 + }, + { + "epoch": 0.23437053953753925, + "grad_norm": 1.9733129739761353, + "learning_rate": 4.352432707133096e-05, + "loss": 5.0075, + "step": 39408 + }, + { + "epoch": 0.23437648682082024, + "grad_norm": 3.435161590576172, + "learning_rate": 4.3524013395209074e-05, + "loss": 4.0373, + "step": 39409 + }, + { + "epoch": 0.23438243410410126, + "grad_norm": 3.5970253944396973, + "learning_rate": 4.352369971262064e-05, + "loss": 3.7217, + "step": 39410 + }, + { + "epoch": 0.23438838138738224, + "grad_norm": 3.314128875732422, + "learning_rate": 4.352338602356577e-05, + "loss": 3.5539, + "step": 39411 + }, + { + "epoch": 0.23439432867066323, + "grad_norm": 1.9025384187698364, + "learning_rate": 4.352307232804459e-05, + "loss": 4.6692, + "step": 39412 + }, + { + "epoch": 0.23440027595394425, + "grad_norm": 1.7677268981933594, + "learning_rate": 4.3522758626057184e-05, + "loss": 5.0206, + "step": 39413 + }, + { + "epoch": 0.23440622323722524, + "grad_norm": 1.8457081317901611, + "learning_rate": 4.3522444917603676e-05, + "loss": 4.7544, + "step": 39414 + }, + { + "epoch": 0.23441217052050622, + "grad_norm": 2.1236960887908936, + "learning_rate": 4.352213120268418e-05, + "loss": 3.549, + "step": 39415 + }, + { + "epoch": 0.23441811780378724, + "grad_norm": 2.5540764331817627, + "learning_rate": 4.35218174812988e-05, + "loss": 3.3407, + "step": 39416 + }, + { + "epoch": 0.23442406508706823, + "grad_norm": 2.1707522869110107, + "learning_rate": 4.352150375344763e-05, + "loss": 3.3798, + "step": 39417 + }, + { + "epoch": 0.23443001237034922, + "grad_norm": 1.5410895347595215, + "learning_rate": 4.352119001913081e-05, + "loss": 3.7162, + "step": 39418 + }, + { + "epoch": 0.23443595965363023, + "grad_norm": 1.5335618257522583, + "learning_rate": 4.352087627834843e-05, + "loss": 4.6438, + "step": 39419 + }, + { + "epoch": 0.23444190693691122, + "grad_norm": 1.642561435699463, + "learning_rate": 4.352056253110061e-05, + "loss": 4.8075, + "step": 39420 + }, + { + "epoch": 0.2344478542201922, + "grad_norm": 1.8755661249160767, + "learning_rate": 4.352024877738744e-05, + "loss": 4.8355, + "step": 39421 + }, + { + "epoch": 0.23445380150347322, + "grad_norm": 1.9932796955108643, + "learning_rate": 4.3519935017209045e-05, + "loss": 4.8492, + "step": 39422 + }, + { + "epoch": 0.2344597487867542, + "grad_norm": 1.5432254076004028, + "learning_rate": 4.351962125056553e-05, + "loss": 4.8187, + "step": 39423 + }, + { + "epoch": 0.2344656960700352, + "grad_norm": 1.849226951599121, + "learning_rate": 4.351930747745702e-05, + "loss": 4.3849, + "step": 39424 + }, + { + "epoch": 0.23447164335331622, + "grad_norm": 1.8147178888320923, + "learning_rate": 4.35189936978836e-05, + "loss": 4.4224, + "step": 39425 + }, + { + "epoch": 0.2344775906365972, + "grad_norm": 1.742662787437439, + "learning_rate": 4.35186799118454e-05, + "loss": 4.2055, + "step": 39426 + }, + { + "epoch": 0.2344835379198782, + "grad_norm": 1.741877794265747, + "learning_rate": 4.3518366119342504e-05, + "loss": 4.2056, + "step": 39427 + }, + { + "epoch": 0.2344894852031592, + "grad_norm": 1.7838786840438843, + "learning_rate": 4.3518052320375055e-05, + "loss": 5.4981, + "step": 39428 + }, + { + "epoch": 0.2344954324864402, + "grad_norm": 1.5131925344467163, + "learning_rate": 4.351773851494313e-05, + "loss": 5.1125, + "step": 39429 + }, + { + "epoch": 0.23450137976972119, + "grad_norm": 1.7058484554290771, + "learning_rate": 4.3517424703046864e-05, + "loss": 4.7788, + "step": 39430 + }, + { + "epoch": 0.2345073270530022, + "grad_norm": 1.871307134628296, + "learning_rate": 4.351711088468635e-05, + "loss": 4.5781, + "step": 39431 + }, + { + "epoch": 0.2345132743362832, + "grad_norm": 1.476925253868103, + "learning_rate": 4.351679705986171e-05, + "loss": 4.7101, + "step": 39432 + }, + { + "epoch": 0.23451922161956418, + "grad_norm": 1.629453182220459, + "learning_rate": 4.351648322857304e-05, + "loss": 4.4409, + "step": 39433 + }, + { + "epoch": 0.2345251689028452, + "grad_norm": 1.6458929777145386, + "learning_rate": 4.351616939082047e-05, + "loss": 4.6916, + "step": 39434 + }, + { + "epoch": 0.23453111618612618, + "grad_norm": 1.5822285413742065, + "learning_rate": 4.351585554660409e-05, + "loss": 4.6374, + "step": 39435 + }, + { + "epoch": 0.23453706346940717, + "grad_norm": 1.6070079803466797, + "learning_rate": 4.351554169592401e-05, + "loss": 5.3531, + "step": 39436 + }, + { + "epoch": 0.23454301075268819, + "grad_norm": 1.527276635169983, + "learning_rate": 4.351522783878035e-05, + "loss": 5.5012, + "step": 39437 + }, + { + "epoch": 0.23454895803596917, + "grad_norm": 1.8088719844818115, + "learning_rate": 4.3514913975173224e-05, + "loss": 4.6404, + "step": 39438 + }, + { + "epoch": 0.23455490531925016, + "grad_norm": 1.815889596939087, + "learning_rate": 4.351460010510272e-05, + "loss": 4.4818, + "step": 39439 + }, + { + "epoch": 0.23456085260253115, + "grad_norm": 1.7771681547164917, + "learning_rate": 4.351428622856897e-05, + "loss": 4.3622, + "step": 39440 + }, + { + "epoch": 0.23456679988581217, + "grad_norm": 1.737944483757019, + "learning_rate": 4.351397234557206e-05, + "loss": 4.4178, + "step": 39441 + }, + { + "epoch": 0.23457274716909315, + "grad_norm": 1.5377593040466309, + "learning_rate": 4.3513658456112126e-05, + "loss": 4.4063, + "step": 39442 + }, + { + "epoch": 0.23457869445237414, + "grad_norm": 1.757392168045044, + "learning_rate": 4.351334456018926e-05, + "loss": 4.4072, + "step": 39443 + }, + { + "epoch": 0.23458464173565516, + "grad_norm": 1.8958923816680908, + "learning_rate": 4.3513030657803575e-05, + "loss": 4.4878, + "step": 39444 + }, + { + "epoch": 0.23459058901893615, + "grad_norm": 1.893446683883667, + "learning_rate": 4.351271674895518e-05, + "loss": 4.2177, + "step": 39445 + }, + { + "epoch": 0.23459653630221713, + "grad_norm": 1.8133530616760254, + "learning_rate": 4.35124028336442e-05, + "loss": 4.31, + "step": 39446 + }, + { + "epoch": 0.23460248358549815, + "grad_norm": 1.6795728206634521, + "learning_rate": 4.3512088911870724e-05, + "loss": 4.4789, + "step": 39447 + }, + { + "epoch": 0.23460843086877914, + "grad_norm": 1.7759149074554443, + "learning_rate": 4.351177498363487e-05, + "loss": 4.8038, + "step": 39448 + }, + { + "epoch": 0.23461437815206013, + "grad_norm": 1.7960399389266968, + "learning_rate": 4.3511461048936744e-05, + "loss": 4.9761, + "step": 39449 + }, + { + "epoch": 0.23462032543534114, + "grad_norm": 1.7421083450317383, + "learning_rate": 4.3511147107776465e-05, + "loss": 4.2165, + "step": 39450 + }, + { + "epoch": 0.23462627271862213, + "grad_norm": 1.9104562997817993, + "learning_rate": 4.3510833160154124e-05, + "loss": 4.3507, + "step": 39451 + }, + { + "epoch": 0.23463222000190312, + "grad_norm": 1.6309458017349243, + "learning_rate": 4.351051920606985e-05, + "loss": 5.0622, + "step": 39452 + }, + { + "epoch": 0.23463816728518413, + "grad_norm": 1.7124251127243042, + "learning_rate": 4.3510205245523744e-05, + "loss": 4.9691, + "step": 39453 + }, + { + "epoch": 0.23464411456846512, + "grad_norm": 1.589788556098938, + "learning_rate": 4.3509891278515916e-05, + "loss": 5.0264, + "step": 39454 + }, + { + "epoch": 0.2346500618517461, + "grad_norm": 1.7935584783554077, + "learning_rate": 4.350957730504648e-05, + "loss": 4.9153, + "step": 39455 + }, + { + "epoch": 0.23465600913502713, + "grad_norm": 1.4762649536132812, + "learning_rate": 4.350926332511554e-05, + "loss": 4.9941, + "step": 39456 + }, + { + "epoch": 0.23466195641830812, + "grad_norm": 1.5399127006530762, + "learning_rate": 4.35089493387232e-05, + "loss": 4.8539, + "step": 39457 + }, + { + "epoch": 0.2346679037015891, + "grad_norm": 1.581641674041748, + "learning_rate": 4.350863534586958e-05, + "loss": 4.8709, + "step": 39458 + }, + { + "epoch": 0.23467385098487012, + "grad_norm": 1.7906439304351807, + "learning_rate": 4.350832134655479e-05, + "loss": 4.6924, + "step": 39459 + }, + { + "epoch": 0.2346797982681511, + "grad_norm": 1.7961326837539673, + "learning_rate": 4.3508007340778936e-05, + "loss": 4.7806, + "step": 39460 + }, + { + "epoch": 0.2346857455514321, + "grad_norm": 1.387488842010498, + "learning_rate": 4.350769332854212e-05, + "loss": 5.3746, + "step": 39461 + }, + { + "epoch": 0.2346916928347131, + "grad_norm": 1.3593922853469849, + "learning_rate": 4.3507379309844475e-05, + "loss": 5.0035, + "step": 39462 + }, + { + "epoch": 0.2346976401179941, + "grad_norm": 1.7202558517456055, + "learning_rate": 4.3507065284686086e-05, + "loss": 5.0444, + "step": 39463 + }, + { + "epoch": 0.2347035874012751, + "grad_norm": 1.5007917881011963, + "learning_rate": 4.3506751253067066e-05, + "loss": 4.7126, + "step": 39464 + }, + { + "epoch": 0.2347095346845561, + "grad_norm": 1.666621208190918, + "learning_rate": 4.350643721498754e-05, + "loss": 4.4873, + "step": 39465 + }, + { + "epoch": 0.2347154819678371, + "grad_norm": 2.67529296875, + "learning_rate": 4.350612317044761e-05, + "loss": 4.3927, + "step": 39466 + }, + { + "epoch": 0.23472142925111808, + "grad_norm": 2.530919313430786, + "learning_rate": 4.350580911944737e-05, + "loss": 3.8997, + "step": 39467 + }, + { + "epoch": 0.2347273765343991, + "grad_norm": 2.31793212890625, + "learning_rate": 4.3505495061986954e-05, + "loss": 4.007, + "step": 39468 + }, + { + "epoch": 0.23473332381768008, + "grad_norm": 1.7136186361312866, + "learning_rate": 4.350518099806646e-05, + "loss": 5.1124, + "step": 39469 + }, + { + "epoch": 0.23473927110096107, + "grad_norm": 1.7256274223327637, + "learning_rate": 4.3504866927686e-05, + "loss": 3.8912, + "step": 39470 + }, + { + "epoch": 0.2347452183842421, + "grad_norm": 1.7337493896484375, + "learning_rate": 4.350455285084568e-05, + "loss": 4.2857, + "step": 39471 + }, + { + "epoch": 0.23475116566752308, + "grad_norm": 1.6132410764694214, + "learning_rate": 4.350423876754561e-05, + "loss": 4.9954, + "step": 39472 + }, + { + "epoch": 0.23475711295080406, + "grad_norm": 1.4260412454605103, + "learning_rate": 4.35039246777859e-05, + "loss": 5.1, + "step": 39473 + }, + { + "epoch": 0.23476306023408508, + "grad_norm": 1.6036821603775024, + "learning_rate": 4.3503610581566664e-05, + "loss": 5.2813, + "step": 39474 + }, + { + "epoch": 0.23476900751736607, + "grad_norm": 1.385947823524475, + "learning_rate": 4.350329647888801e-05, + "loss": 4.882, + "step": 39475 + }, + { + "epoch": 0.23477495480064706, + "grad_norm": 1.7176967859268188, + "learning_rate": 4.3502982369750045e-05, + "loss": 4.4356, + "step": 39476 + }, + { + "epoch": 0.23478090208392807, + "grad_norm": 1.7131975889205933, + "learning_rate": 4.350266825415288e-05, + "loss": 4.4235, + "step": 39477 + }, + { + "epoch": 0.23478684936720906, + "grad_norm": 1.6809394359588623, + "learning_rate": 4.3502354132096624e-05, + "loss": 4.7656, + "step": 39478 + }, + { + "epoch": 0.23479279665049005, + "grad_norm": 1.5170011520385742, + "learning_rate": 4.3502040003581385e-05, + "loss": 5.1865, + "step": 39479 + }, + { + "epoch": 0.23479874393377106, + "grad_norm": 1.4915732145309448, + "learning_rate": 4.350172586860728e-05, + "loss": 5.6408, + "step": 39480 + }, + { + "epoch": 0.23480469121705205, + "grad_norm": 1.4026539325714111, + "learning_rate": 4.3501411727174404e-05, + "loss": 5.6604, + "step": 39481 + }, + { + "epoch": 0.23481063850033304, + "grad_norm": 1.4482790231704712, + "learning_rate": 4.350109757928289e-05, + "loss": 4.9783, + "step": 39482 + }, + { + "epoch": 0.23481658578361406, + "grad_norm": 1.2592450380325317, + "learning_rate": 4.350078342493282e-05, + "loss": 4.7902, + "step": 39483 + }, + { + "epoch": 0.23482253306689505, + "grad_norm": 1.7065273523330688, + "learning_rate": 4.350046926412433e-05, + "loss": 4.4604, + "step": 39484 + }, + { + "epoch": 0.23482848035017603, + "grad_norm": 2.8723208904266357, + "learning_rate": 4.350015509685752e-05, + "loss": 3.297, + "step": 39485 + }, + { + "epoch": 0.23483442763345705, + "grad_norm": 2.372694253921509, + "learning_rate": 4.3499840923132484e-05, + "loss": 3.8019, + "step": 39486 + }, + { + "epoch": 0.23484037491673804, + "grad_norm": 1.394163727760315, + "learning_rate": 4.3499526742949346e-05, + "loss": 5.1203, + "step": 39487 + }, + { + "epoch": 0.23484632220001903, + "grad_norm": 1.5372754335403442, + "learning_rate": 4.349921255630822e-05, + "loss": 4.6551, + "step": 39488 + }, + { + "epoch": 0.23485226948330004, + "grad_norm": 1.8942519426345825, + "learning_rate": 4.349889836320921e-05, + "loss": 3.5038, + "step": 39489 + }, + { + "epoch": 0.23485821676658103, + "grad_norm": 2.2233126163482666, + "learning_rate": 4.3498584163652424e-05, + "loss": 3.0863, + "step": 39490 + }, + { + "epoch": 0.23486416404986202, + "grad_norm": 2.37809419631958, + "learning_rate": 4.349826995763797e-05, + "loss": 3.0556, + "step": 39491 + }, + { + "epoch": 0.23487011133314303, + "grad_norm": 2.368769645690918, + "learning_rate": 4.3497955745165966e-05, + "loss": 2.8985, + "step": 39492 + }, + { + "epoch": 0.23487605861642402, + "grad_norm": 1.9883050918579102, + "learning_rate": 4.349764152623652e-05, + "loss": 3.425, + "step": 39493 + }, + { + "epoch": 0.234882005899705, + "grad_norm": 1.586185097694397, + "learning_rate": 4.349732730084973e-05, + "loss": 5.1936, + "step": 39494 + }, + { + "epoch": 0.23488795318298603, + "grad_norm": 2.2893435955047607, + "learning_rate": 4.3497013069005724e-05, + "loss": 2.8021, + "step": 39495 + }, + { + "epoch": 0.234893900466267, + "grad_norm": 2.5505330562591553, + "learning_rate": 4.349669883070459e-05, + "loss": 2.9212, + "step": 39496 + }, + { + "epoch": 0.234899847749548, + "grad_norm": 2.5210187435150146, + "learning_rate": 4.3496384585946455e-05, + "loss": 2.9037, + "step": 39497 + }, + { + "epoch": 0.234905795032829, + "grad_norm": 2.4007105827331543, + "learning_rate": 4.3496070334731425e-05, + "loss": 2.7009, + "step": 39498 + }, + { + "epoch": 0.23491174231611, + "grad_norm": 2.381126642227173, + "learning_rate": 4.34957560770596e-05, + "loss": 2.5355, + "step": 39499 + }, + { + "epoch": 0.234917689599391, + "grad_norm": 2.500800848007202, + "learning_rate": 4.349544181293111e-05, + "loss": 2.6994, + "step": 39500 + }, + { + "epoch": 0.23492363688267198, + "grad_norm": 2.550818681716919, + "learning_rate": 4.3495127542346045e-05, + "loss": 2.7628, + "step": 39501 + }, + { + "epoch": 0.234929584165953, + "grad_norm": 2.438868522644043, + "learning_rate": 4.349481326530453e-05, + "loss": 2.7773, + "step": 39502 + }, + { + "epoch": 0.234935531449234, + "grad_norm": 2.3863131999969482, + "learning_rate": 4.349449898180665e-05, + "loss": 2.565, + "step": 39503 + }, + { + "epoch": 0.23494147873251497, + "grad_norm": 2.595072031021118, + "learning_rate": 4.3494184691852544e-05, + "loss": 2.6882, + "step": 39504 + }, + { + "epoch": 0.234947426015796, + "grad_norm": 2.2971251010894775, + "learning_rate": 4.349387039544231e-05, + "loss": 2.635, + "step": 39505 + }, + { + "epoch": 0.23495337329907698, + "grad_norm": 2.3225739002227783, + "learning_rate": 4.349355609257605e-05, + "loss": 3.3878, + "step": 39506 + }, + { + "epoch": 0.23495932058235797, + "grad_norm": 2.412611722946167, + "learning_rate": 4.349324178325389e-05, + "loss": 3.0068, + "step": 39507 + }, + { + "epoch": 0.23496526786563898, + "grad_norm": 1.950431227684021, + "learning_rate": 4.3492927467475924e-05, + "loss": 3.9725, + "step": 39508 + }, + { + "epoch": 0.23497121514891997, + "grad_norm": 1.5830776691436768, + "learning_rate": 4.3492613145242264e-05, + "loss": 5.1513, + "step": 39509 + }, + { + "epoch": 0.23497716243220096, + "grad_norm": 1.5284711122512817, + "learning_rate": 4.3492298816553034e-05, + "loss": 5.1141, + "step": 39510 + }, + { + "epoch": 0.23498310971548197, + "grad_norm": 2.3426167964935303, + "learning_rate": 4.349198448140833e-05, + "loss": 2.8968, + "step": 39511 + }, + { + "epoch": 0.23498905699876296, + "grad_norm": 1.8177212476730347, + "learning_rate": 4.3491670139808263e-05, + "loss": 5.201, + "step": 39512 + }, + { + "epoch": 0.23499500428204395, + "grad_norm": 1.8724275827407837, + "learning_rate": 4.349135579175294e-05, + "loss": 5.1527, + "step": 39513 + }, + { + "epoch": 0.23500095156532497, + "grad_norm": 1.576641321182251, + "learning_rate": 4.3491041437242486e-05, + "loss": 5.0082, + "step": 39514 + }, + { + "epoch": 0.23500689884860596, + "grad_norm": 1.6473275423049927, + "learning_rate": 4.349072707627699e-05, + "loss": 5.0796, + "step": 39515 + }, + { + "epoch": 0.23501284613188694, + "grad_norm": 1.5497204065322876, + "learning_rate": 4.3490412708856584e-05, + "loss": 4.8808, + "step": 39516 + }, + { + "epoch": 0.23501879341516796, + "grad_norm": 1.9269403219223022, + "learning_rate": 4.349009833498136e-05, + "loss": 5.0166, + "step": 39517 + }, + { + "epoch": 0.23502474069844895, + "grad_norm": 1.795958161354065, + "learning_rate": 4.348978395465143e-05, + "loss": 4.9874, + "step": 39518 + }, + { + "epoch": 0.23503068798172994, + "grad_norm": 1.7254250049591064, + "learning_rate": 4.348946956786691e-05, + "loss": 4.793, + "step": 39519 + }, + { + "epoch": 0.23503663526501095, + "grad_norm": 1.8075964450836182, + "learning_rate": 4.348915517462791e-05, + "loss": 4.9552, + "step": 39520 + }, + { + "epoch": 0.23504258254829194, + "grad_norm": 1.6607565879821777, + "learning_rate": 4.348884077493454e-05, + "loss": 4.9673, + "step": 39521 + }, + { + "epoch": 0.23504852983157293, + "grad_norm": 2.0513784885406494, + "learning_rate": 4.3488526368786897e-05, + "loss": 4.9962, + "step": 39522 + }, + { + "epoch": 0.23505447711485394, + "grad_norm": 1.448609709739685, + "learning_rate": 4.348821195618511e-05, + "loss": 5.2576, + "step": 39523 + }, + { + "epoch": 0.23506042439813493, + "grad_norm": 1.7218849658966064, + "learning_rate": 4.3487897537129266e-05, + "loss": 5.0945, + "step": 39524 + }, + { + "epoch": 0.23506637168141592, + "grad_norm": 1.8158671855926514, + "learning_rate": 4.34875831116195e-05, + "loss": 5.1053, + "step": 39525 + }, + { + "epoch": 0.23507231896469694, + "grad_norm": 1.6465450525283813, + "learning_rate": 4.348726867965591e-05, + "loss": 4.9159, + "step": 39526 + }, + { + "epoch": 0.23507826624797792, + "grad_norm": 1.6938047409057617, + "learning_rate": 4.3486954241238595e-05, + "loss": 5.0443, + "step": 39527 + }, + { + "epoch": 0.2350842135312589, + "grad_norm": 1.5726323127746582, + "learning_rate": 4.348663979636768e-05, + "loss": 4.7019, + "step": 39528 + }, + { + "epoch": 0.23509016081453993, + "grad_norm": 1.506906509399414, + "learning_rate": 4.3486325345043275e-05, + "loss": 5.5914, + "step": 39529 + }, + { + "epoch": 0.23509610809782092, + "grad_norm": 1.6261730194091797, + "learning_rate": 4.3486010887265485e-05, + "loss": 5.9064, + "step": 39530 + }, + { + "epoch": 0.2351020553811019, + "grad_norm": 1.477356195449829, + "learning_rate": 4.3485696423034415e-05, + "loss": 5.748, + "step": 39531 + }, + { + "epoch": 0.23510800266438292, + "grad_norm": 1.7038002014160156, + "learning_rate": 4.348538195235018e-05, + "loss": 5.2618, + "step": 39532 + }, + { + "epoch": 0.2351139499476639, + "grad_norm": 1.9998583793640137, + "learning_rate": 4.348506747521289e-05, + "loss": 4.4769, + "step": 39533 + }, + { + "epoch": 0.2351198972309449, + "grad_norm": 1.7400646209716797, + "learning_rate": 4.348475299162266e-05, + "loss": 4.6152, + "step": 39534 + }, + { + "epoch": 0.2351258445142259, + "grad_norm": 1.4792178869247437, + "learning_rate": 4.348443850157958e-05, + "loss": 4.9018, + "step": 39535 + }, + { + "epoch": 0.2351317917975069, + "grad_norm": 1.8028171062469482, + "learning_rate": 4.348412400508378e-05, + "loss": 4.9801, + "step": 39536 + }, + { + "epoch": 0.2351377390807879, + "grad_norm": 1.7561883926391602, + "learning_rate": 4.3483809502135365e-05, + "loss": 4.9372, + "step": 39537 + }, + { + "epoch": 0.2351436863640689, + "grad_norm": 2.0931129455566406, + "learning_rate": 4.348349499273444e-05, + "loss": 4.8422, + "step": 39538 + }, + { + "epoch": 0.2351496336473499, + "grad_norm": 2.22172474861145, + "learning_rate": 4.3483180476881124e-05, + "loss": 3.7755, + "step": 39539 + }, + { + "epoch": 0.23515558093063088, + "grad_norm": 2.2393369674682617, + "learning_rate": 4.348286595457552e-05, + "loss": 3.6435, + "step": 39540 + }, + { + "epoch": 0.2351615282139119, + "grad_norm": 2.1963789463043213, + "learning_rate": 4.3482551425817735e-05, + "loss": 4.0413, + "step": 39541 + }, + { + "epoch": 0.23516747549719288, + "grad_norm": 1.6470162868499756, + "learning_rate": 4.348223689060788e-05, + "loss": 5.1944, + "step": 39542 + }, + { + "epoch": 0.23517342278047387, + "grad_norm": 1.440127968788147, + "learning_rate": 4.3481922348946066e-05, + "loss": 5.0694, + "step": 39543 + }, + { + "epoch": 0.2351793700637549, + "grad_norm": 1.7956537008285522, + "learning_rate": 4.3481607800832416e-05, + "loss": 5.466, + "step": 39544 + }, + { + "epoch": 0.23518531734703588, + "grad_norm": 1.5988211631774902, + "learning_rate": 4.348129324626702e-05, + "loss": 5.3832, + "step": 39545 + }, + { + "epoch": 0.23519126463031687, + "grad_norm": 1.7652308940887451, + "learning_rate": 4.348097868524999e-05, + "loss": 5.1408, + "step": 39546 + }, + { + "epoch": 0.23519721191359788, + "grad_norm": 1.96556556224823, + "learning_rate": 4.348066411778144e-05, + "loss": 5.1057, + "step": 39547 + }, + { + "epoch": 0.23520315919687887, + "grad_norm": 1.6262363195419312, + "learning_rate": 4.348034954386149e-05, + "loss": 5.1536, + "step": 39548 + }, + { + "epoch": 0.23520910648015986, + "grad_norm": 1.9983383417129517, + "learning_rate": 4.3480034963490244e-05, + "loss": 4.4115, + "step": 39549 + }, + { + "epoch": 0.23521505376344087, + "grad_norm": 1.8249067068099976, + "learning_rate": 4.34797203766678e-05, + "loss": 4.7155, + "step": 39550 + }, + { + "epoch": 0.23522100104672186, + "grad_norm": 1.874074935913086, + "learning_rate": 4.347940578339428e-05, + "loss": 5.1159, + "step": 39551 + }, + { + "epoch": 0.23522694833000285, + "grad_norm": 1.861391305923462, + "learning_rate": 4.347909118366978e-05, + "loss": 4.6431, + "step": 39552 + }, + { + "epoch": 0.23523289561328387, + "grad_norm": 2.319910764694214, + "learning_rate": 4.347877657749444e-05, + "loss": 3.926, + "step": 39553 + }, + { + "epoch": 0.23523884289656485, + "grad_norm": 2.0125155448913574, + "learning_rate": 4.3478461964868336e-05, + "loss": 4.3153, + "step": 39554 + }, + { + "epoch": 0.23524479017984584, + "grad_norm": 2.236830234527588, + "learning_rate": 4.34781473457916e-05, + "loss": 5.1541, + "step": 39555 + }, + { + "epoch": 0.23525073746312683, + "grad_norm": 1.7119864225387573, + "learning_rate": 4.347783272026432e-05, + "loss": 5.0258, + "step": 39556 + }, + { + "epoch": 0.23525668474640785, + "grad_norm": 1.8801114559173584, + "learning_rate": 4.347751808828664e-05, + "loss": 5.0044, + "step": 39557 + }, + { + "epoch": 0.23526263202968883, + "grad_norm": 1.584053874015808, + "learning_rate": 4.347720344985863e-05, + "loss": 5.1926, + "step": 39558 + }, + { + "epoch": 0.23526857931296982, + "grad_norm": 2.160999059677124, + "learning_rate": 4.347688880498043e-05, + "loss": 4.1634, + "step": 39559 + }, + { + "epoch": 0.23527452659625084, + "grad_norm": 1.8408249616622925, + "learning_rate": 4.347657415365214e-05, + "loss": 4.5682, + "step": 39560 + }, + { + "epoch": 0.23528047387953183, + "grad_norm": 2.1002492904663086, + "learning_rate": 4.347625949587386e-05, + "loss": 5.0053, + "step": 39561 + }, + { + "epoch": 0.23528642116281281, + "grad_norm": 1.8429839611053467, + "learning_rate": 4.3475944831645715e-05, + "loss": 4.7472, + "step": 39562 + }, + { + "epoch": 0.23529236844609383, + "grad_norm": 1.9861998558044434, + "learning_rate": 4.3475630160967807e-05, + "loss": 3.7446, + "step": 39563 + }, + { + "epoch": 0.23529831572937482, + "grad_norm": 1.7070415019989014, + "learning_rate": 4.347531548384025e-05, + "loss": 4.8468, + "step": 39564 + }, + { + "epoch": 0.2353042630126558, + "grad_norm": 1.8060168027877808, + "learning_rate": 4.347500080026314e-05, + "loss": 5.0668, + "step": 39565 + }, + { + "epoch": 0.23531021029593682, + "grad_norm": 1.8883711099624634, + "learning_rate": 4.3474686110236616e-05, + "loss": 4.8557, + "step": 39566 + }, + { + "epoch": 0.2353161575792178, + "grad_norm": 2.1879305839538574, + "learning_rate": 4.347437141376076e-05, + "loss": 3.8515, + "step": 39567 + }, + { + "epoch": 0.2353221048624988, + "grad_norm": 1.619714379310608, + "learning_rate": 4.347405671083569e-05, + "loss": 5.1808, + "step": 39568 + }, + { + "epoch": 0.23532805214577981, + "grad_norm": 1.4702348709106445, + "learning_rate": 4.347374200146152e-05, + "loss": 5.0461, + "step": 39569 + }, + { + "epoch": 0.2353339994290608, + "grad_norm": 2.2921080589294434, + "learning_rate": 4.347342728563836e-05, + "loss": 4.0864, + "step": 39570 + }, + { + "epoch": 0.2353399467123418, + "grad_norm": 1.6588904857635498, + "learning_rate": 4.347311256336631e-05, + "loss": 5.0127, + "step": 39571 + }, + { + "epoch": 0.2353458939956228, + "grad_norm": 1.6477138996124268, + "learning_rate": 4.34727978346455e-05, + "loss": 4.7629, + "step": 39572 + }, + { + "epoch": 0.2353518412789038, + "grad_norm": 1.5137696266174316, + "learning_rate": 4.347248309947601e-05, + "loss": 4.9401, + "step": 39573 + }, + { + "epoch": 0.23535778856218478, + "grad_norm": 1.4825645685195923, + "learning_rate": 4.347216835785798e-05, + "loss": 5.9218, + "step": 39574 + }, + { + "epoch": 0.2353637358454658, + "grad_norm": 2.803783655166626, + "learning_rate": 4.3471853609791505e-05, + "loss": 2.9973, + "step": 39575 + }, + { + "epoch": 0.2353696831287468, + "grad_norm": 2.7221133708953857, + "learning_rate": 4.3471538855276695e-05, + "loss": 3.0297, + "step": 39576 + }, + { + "epoch": 0.23537563041202778, + "grad_norm": 2.5941596031188965, + "learning_rate": 4.347122409431367e-05, + "loss": 3.7102, + "step": 39577 + }, + { + "epoch": 0.2353815776953088, + "grad_norm": 2.1006662845611572, + "learning_rate": 4.347090932690252e-05, + "loss": 4.0978, + "step": 39578 + }, + { + "epoch": 0.23538752497858978, + "grad_norm": 2.5507373809814453, + "learning_rate": 4.347059455304337e-05, + "loss": 3.0659, + "step": 39579 + }, + { + "epoch": 0.23539347226187077, + "grad_norm": 2.7012178897857666, + "learning_rate": 4.3470279772736325e-05, + "loss": 2.9277, + "step": 39580 + }, + { + "epoch": 0.23539941954515178, + "grad_norm": 2.677558422088623, + "learning_rate": 4.3469964985981496e-05, + "loss": 2.7663, + "step": 39581 + }, + { + "epoch": 0.23540536682843277, + "grad_norm": 3.0192692279815674, + "learning_rate": 4.346965019277899e-05, + "loss": 2.9613, + "step": 39582 + }, + { + "epoch": 0.23541131411171376, + "grad_norm": 2.792264461517334, + "learning_rate": 4.346933539312893e-05, + "loss": 3.248, + "step": 39583 + }, + { + "epoch": 0.23541726139499478, + "grad_norm": 2.1806282997131348, + "learning_rate": 4.346902058703141e-05, + "loss": 3.8647, + "step": 39584 + }, + { + "epoch": 0.23542320867827576, + "grad_norm": 2.3454999923706055, + "learning_rate": 4.346870577448655e-05, + "loss": 3.9804, + "step": 39585 + }, + { + "epoch": 0.23542915596155675, + "grad_norm": 2.145684003829956, + "learning_rate": 4.346839095549445e-05, + "loss": 4.7823, + "step": 39586 + }, + { + "epoch": 0.23543510324483777, + "grad_norm": 2.0232717990875244, + "learning_rate": 4.346807613005523e-05, + "loss": 4.9594, + "step": 39587 + }, + { + "epoch": 0.23544105052811876, + "grad_norm": 2.0676941871643066, + "learning_rate": 4.3467761298168985e-05, + "loss": 4.2215, + "step": 39588 + }, + { + "epoch": 0.23544699781139974, + "grad_norm": 2.122058391571045, + "learning_rate": 4.346744645983584e-05, + "loss": 3.8307, + "step": 39589 + }, + { + "epoch": 0.23545294509468076, + "grad_norm": 1.8523790836334229, + "learning_rate": 4.3467131615055906e-05, + "loss": 4.9136, + "step": 39590 + }, + { + "epoch": 0.23545889237796175, + "grad_norm": 1.6085901260375977, + "learning_rate": 4.346681676382928e-05, + "loss": 5.096, + "step": 39591 + }, + { + "epoch": 0.23546483966124274, + "grad_norm": 1.5861530303955078, + "learning_rate": 4.346650190615609e-05, + "loss": 4.8874, + "step": 39592 + }, + { + "epoch": 0.23547078694452375, + "grad_norm": 2.204979181289673, + "learning_rate": 4.346618704203642e-05, + "loss": 3.8377, + "step": 39593 + }, + { + "epoch": 0.23547673422780474, + "grad_norm": 2.2924487590789795, + "learning_rate": 4.3465872171470406e-05, + "loss": 3.8936, + "step": 39594 + }, + { + "epoch": 0.23548268151108573, + "grad_norm": 2.0946760177612305, + "learning_rate": 4.346555729445815e-05, + "loss": 3.7599, + "step": 39595 + }, + { + "epoch": 0.23548862879436674, + "grad_norm": 1.7681735754013062, + "learning_rate": 4.346524241099975e-05, + "loss": 5.0385, + "step": 39596 + }, + { + "epoch": 0.23549457607764773, + "grad_norm": 1.4723299741744995, + "learning_rate": 4.3464927521095325e-05, + "loss": 5.105, + "step": 39597 + }, + { + "epoch": 0.23550052336092872, + "grad_norm": 1.6298372745513916, + "learning_rate": 4.346461262474499e-05, + "loss": 5.3164, + "step": 39598 + }, + { + "epoch": 0.23550647064420974, + "grad_norm": 1.8214343786239624, + "learning_rate": 4.346429772194884e-05, + "loss": 4.1696, + "step": 39599 + }, + { + "epoch": 0.23551241792749072, + "grad_norm": 1.6282157897949219, + "learning_rate": 4.3463982812707005e-05, + "loss": 5.2517, + "step": 39600 + }, + { + "epoch": 0.2355183652107717, + "grad_norm": 1.5538759231567383, + "learning_rate": 4.346366789701958e-05, + "loss": 4.8288, + "step": 39601 + }, + { + "epoch": 0.23552431249405273, + "grad_norm": 1.349043846130371, + "learning_rate": 4.346335297488668e-05, + "loss": 5.6924, + "step": 39602 + }, + { + "epoch": 0.23553025977733372, + "grad_norm": 1.605543851852417, + "learning_rate": 4.3463038046308414e-05, + "loss": 4.5651, + "step": 39603 + }, + { + "epoch": 0.2355362070606147, + "grad_norm": 1.511016607284546, + "learning_rate": 4.3462723111284885e-05, + "loss": 5.1926, + "step": 39604 + }, + { + "epoch": 0.23554215434389572, + "grad_norm": 1.6200916767120361, + "learning_rate": 4.346240816981622e-05, + "loss": 5.1843, + "step": 39605 + }, + { + "epoch": 0.2355481016271767, + "grad_norm": 1.439429521560669, + "learning_rate": 4.346209322190252e-05, + "loss": 4.5373, + "step": 39606 + }, + { + "epoch": 0.2355540489104577, + "grad_norm": 1.8972511291503906, + "learning_rate": 4.3461778267543885e-05, + "loss": 4.389, + "step": 39607 + }, + { + "epoch": 0.2355599961937387, + "grad_norm": 1.8233554363250732, + "learning_rate": 4.346146330674044e-05, + "loss": 4.141, + "step": 39608 + }, + { + "epoch": 0.2355659434770197, + "grad_norm": 1.9290480613708496, + "learning_rate": 4.3461148339492283e-05, + "loss": 4.7057, + "step": 39609 + }, + { + "epoch": 0.2355718907603007, + "grad_norm": 1.6103575229644775, + "learning_rate": 4.346083336579953e-05, + "loss": 5.2774, + "step": 39610 + }, + { + "epoch": 0.2355778380435817, + "grad_norm": 1.5069211721420288, + "learning_rate": 4.3460518385662305e-05, + "loss": 5.0212, + "step": 39611 + }, + { + "epoch": 0.2355837853268627, + "grad_norm": 1.6505247354507446, + "learning_rate": 4.3460203399080694e-05, + "loss": 4.371, + "step": 39612 + }, + { + "epoch": 0.23558973261014368, + "grad_norm": 1.5396970510482788, + "learning_rate": 4.3459888406054816e-05, + "loss": 4.3237, + "step": 39613 + }, + { + "epoch": 0.23559567989342467, + "grad_norm": 1.6712186336517334, + "learning_rate": 4.3459573406584786e-05, + "loss": 4.3366, + "step": 39614 + }, + { + "epoch": 0.23560162717670569, + "grad_norm": 1.9782822132110596, + "learning_rate": 4.3459258400670705e-05, + "loss": 4.6864, + "step": 39615 + }, + { + "epoch": 0.23560757445998667, + "grad_norm": 2.00964617729187, + "learning_rate": 4.345894338831269e-05, + "loss": 4.5942, + "step": 39616 + }, + { + "epoch": 0.23561352174326766, + "grad_norm": 1.8180450201034546, + "learning_rate": 4.3458628369510846e-05, + "loss": 4.5192, + "step": 39617 + }, + { + "epoch": 0.23561946902654868, + "grad_norm": 1.7940545082092285, + "learning_rate": 4.345831334426529e-05, + "loss": 4.8344, + "step": 39618 + }, + { + "epoch": 0.23562541630982967, + "grad_norm": 1.7346389293670654, + "learning_rate": 4.345799831257612e-05, + "loss": 4.4739, + "step": 39619 + }, + { + "epoch": 0.23563136359311065, + "grad_norm": 1.5963873863220215, + "learning_rate": 4.345768327444346e-05, + "loss": 5.6441, + "step": 39620 + }, + { + "epoch": 0.23563731087639167, + "grad_norm": 1.9023709297180176, + "learning_rate": 4.345736822986741e-05, + "loss": 4.774, + "step": 39621 + }, + { + "epoch": 0.23564325815967266, + "grad_norm": 1.6928259134292603, + "learning_rate": 4.3457053178848085e-05, + "loss": 4.508, + "step": 39622 + }, + { + "epoch": 0.23564920544295365, + "grad_norm": 2.6728034019470215, + "learning_rate": 4.345673812138559e-05, + "loss": 4.2878, + "step": 39623 + }, + { + "epoch": 0.23565515272623466, + "grad_norm": 3.916083574295044, + "learning_rate": 4.3456423057480045e-05, + "loss": 3.7933, + "step": 39624 + }, + { + "epoch": 0.23566110000951565, + "grad_norm": 4.121766567230225, + "learning_rate": 4.3456107987131544e-05, + "loss": 3.7085, + "step": 39625 + }, + { + "epoch": 0.23566704729279664, + "grad_norm": 3.7750494480133057, + "learning_rate": 4.345579291034021e-05, + "loss": 3.3967, + "step": 39626 + }, + { + "epoch": 0.23567299457607765, + "grad_norm": 3.047283887863159, + "learning_rate": 4.345547782710616e-05, + "loss": 3.7059, + "step": 39627 + }, + { + "epoch": 0.23567894185935864, + "grad_norm": 2.731980562210083, + "learning_rate": 4.3455162737429486e-05, + "loss": 3.4824, + "step": 39628 + }, + { + "epoch": 0.23568488914263963, + "grad_norm": 3.2570066452026367, + "learning_rate": 4.3454847641310306e-05, + "loss": 3.7479, + "step": 39629 + }, + { + "epoch": 0.23569083642592065, + "grad_norm": 3.029534339904785, + "learning_rate": 4.345453253874873e-05, + "loss": 3.4877, + "step": 39630 + }, + { + "epoch": 0.23569678370920163, + "grad_norm": 2.8466241359710693, + "learning_rate": 4.345421742974486e-05, + "loss": 3.3035, + "step": 39631 + }, + { + "epoch": 0.23570273099248262, + "grad_norm": 2.638864517211914, + "learning_rate": 4.345390231429882e-05, + "loss": 3.285, + "step": 39632 + }, + { + "epoch": 0.23570867827576364, + "grad_norm": 2.597590446472168, + "learning_rate": 4.345358719241071e-05, + "loss": 3.3196, + "step": 39633 + }, + { + "epoch": 0.23571462555904463, + "grad_norm": 2.5003716945648193, + "learning_rate": 4.345327206408064e-05, + "loss": 3.2357, + "step": 39634 + }, + { + "epoch": 0.23572057284232562, + "grad_norm": 2.7217302322387695, + "learning_rate": 4.345295692930873e-05, + "loss": 3.2867, + "step": 39635 + }, + { + "epoch": 0.23572652012560663, + "grad_norm": 2.5010008811950684, + "learning_rate": 4.3452641788095084e-05, + "loss": 3.1874, + "step": 39636 + }, + { + "epoch": 0.23573246740888762, + "grad_norm": 2.312774658203125, + "learning_rate": 4.345232664043981e-05, + "loss": 3.4379, + "step": 39637 + }, + { + "epoch": 0.2357384146921686, + "grad_norm": 1.739977478981018, + "learning_rate": 4.3452011486343015e-05, + "loss": 4.4238, + "step": 39638 + }, + { + "epoch": 0.23574436197544962, + "grad_norm": 1.7827277183532715, + "learning_rate": 4.3451696325804825e-05, + "loss": 4.8541, + "step": 39639 + }, + { + "epoch": 0.2357503092587306, + "grad_norm": 1.7119590044021606, + "learning_rate": 4.345138115882533e-05, + "loss": 4.7584, + "step": 39640 + }, + { + "epoch": 0.2357562565420116, + "grad_norm": 1.6340469121932983, + "learning_rate": 4.345106598540465e-05, + "loss": 4.7786, + "step": 39641 + }, + { + "epoch": 0.23576220382529262, + "grad_norm": 1.9640412330627441, + "learning_rate": 4.345075080554289e-05, + "loss": 4.8616, + "step": 39642 + }, + { + "epoch": 0.2357681511085736, + "grad_norm": 1.6392191648483276, + "learning_rate": 4.345043561924016e-05, + "loss": 4.5769, + "step": 39643 + }, + { + "epoch": 0.2357740983918546, + "grad_norm": 1.7787988185882568, + "learning_rate": 4.345012042649658e-05, + "loss": 4.5608, + "step": 39644 + }, + { + "epoch": 0.2357800456751356, + "grad_norm": 1.6990470886230469, + "learning_rate": 4.3449805227312254e-05, + "loss": 4.7891, + "step": 39645 + }, + { + "epoch": 0.2357859929584166, + "grad_norm": 1.9699727296829224, + "learning_rate": 4.3449490021687287e-05, + "loss": 4.1808, + "step": 39646 + }, + { + "epoch": 0.23579194024169758, + "grad_norm": 1.6719646453857422, + "learning_rate": 4.3449174809621805e-05, + "loss": 4.6185, + "step": 39647 + }, + { + "epoch": 0.2357978875249786, + "grad_norm": 2.851654052734375, + "learning_rate": 4.34488595911159e-05, + "loss": 4.1725, + "step": 39648 + }, + { + "epoch": 0.2358038348082596, + "grad_norm": 1.8505662679672241, + "learning_rate": 4.344854436616968e-05, + "loss": 4.053, + "step": 39649 + }, + { + "epoch": 0.23580978209154058, + "grad_norm": 1.769942045211792, + "learning_rate": 4.344822913478328e-05, + "loss": 4.2291, + "step": 39650 + }, + { + "epoch": 0.2358157293748216, + "grad_norm": 1.834368109703064, + "learning_rate": 4.344791389695678e-05, + "loss": 5.0813, + "step": 39651 + }, + { + "epoch": 0.23582167665810258, + "grad_norm": 1.6249654293060303, + "learning_rate": 4.344759865269031e-05, + "loss": 5.1234, + "step": 39652 + }, + { + "epoch": 0.23582762394138357, + "grad_norm": 1.7159777879714966, + "learning_rate": 4.3447283401983975e-05, + "loss": 4.9201, + "step": 39653 + }, + { + "epoch": 0.23583357122466458, + "grad_norm": 1.4406479597091675, + "learning_rate": 4.344696814483788e-05, + "loss": 4.9575, + "step": 39654 + }, + { + "epoch": 0.23583951850794557, + "grad_norm": 1.507131814956665, + "learning_rate": 4.3446652881252144e-05, + "loss": 5.0084, + "step": 39655 + }, + { + "epoch": 0.23584546579122656, + "grad_norm": 1.706028699874878, + "learning_rate": 4.344633761122687e-05, + "loss": 4.5784, + "step": 39656 + }, + { + "epoch": 0.23585141307450758, + "grad_norm": 1.5523202419281006, + "learning_rate": 4.344602233476217e-05, + "loss": 4.5634, + "step": 39657 + }, + { + "epoch": 0.23585736035778856, + "grad_norm": 1.5952317714691162, + "learning_rate": 4.344570705185815e-05, + "loss": 4.7964, + "step": 39658 + }, + { + "epoch": 0.23586330764106955, + "grad_norm": 1.7145177125930786, + "learning_rate": 4.3445391762514934e-05, + "loss": 4.8856, + "step": 39659 + }, + { + "epoch": 0.23586925492435057, + "grad_norm": 1.5456407070159912, + "learning_rate": 4.344507646673261e-05, + "loss": 4.9571, + "step": 39660 + }, + { + "epoch": 0.23587520220763156, + "grad_norm": 1.6312416791915894, + "learning_rate": 4.34447611645113e-05, + "loss": 4.6212, + "step": 39661 + }, + { + "epoch": 0.23588114949091255, + "grad_norm": 1.6123836040496826, + "learning_rate": 4.3444445855851126e-05, + "loss": 4.7611, + "step": 39662 + }, + { + "epoch": 0.23588709677419356, + "grad_norm": 1.5431879758834839, + "learning_rate": 4.3444130540752185e-05, + "loss": 4.5731, + "step": 39663 + }, + { + "epoch": 0.23589304405747455, + "grad_norm": 1.378326654434204, + "learning_rate": 4.344381521921458e-05, + "loss": 4.4747, + "step": 39664 + }, + { + "epoch": 0.23589899134075554, + "grad_norm": 1.6408112049102783, + "learning_rate": 4.344349989123844e-05, + "loss": 4.9272, + "step": 39665 + }, + { + "epoch": 0.23590493862403655, + "grad_norm": 1.6013070344924927, + "learning_rate": 4.3443184556823854e-05, + "loss": 4.3682, + "step": 39666 + }, + { + "epoch": 0.23591088590731754, + "grad_norm": 1.744828462600708, + "learning_rate": 4.344286921597095e-05, + "loss": 4.1018, + "step": 39667 + }, + { + "epoch": 0.23591683319059853, + "grad_norm": 1.688822627067566, + "learning_rate": 4.344255386867983e-05, + "loss": 4.6118, + "step": 39668 + }, + { + "epoch": 0.23592278047387955, + "grad_norm": 1.7823905944824219, + "learning_rate": 4.34422385149506e-05, + "loss": 4.6959, + "step": 39669 + }, + { + "epoch": 0.23592872775716053, + "grad_norm": 1.740500807762146, + "learning_rate": 4.344192315478338e-05, + "loss": 4.8013, + "step": 39670 + }, + { + "epoch": 0.23593467504044152, + "grad_norm": 1.7970569133758545, + "learning_rate": 4.344160778817827e-05, + "loss": 4.8883, + "step": 39671 + }, + { + "epoch": 0.2359406223237225, + "grad_norm": 1.6192377805709839, + "learning_rate": 4.344129241513539e-05, + "loss": 4.8943, + "step": 39672 + }, + { + "epoch": 0.23594656960700353, + "grad_norm": 1.698466420173645, + "learning_rate": 4.344097703565485e-05, + "loss": 4.5755, + "step": 39673 + }, + { + "epoch": 0.2359525168902845, + "grad_norm": 1.6837269067764282, + "learning_rate": 4.344066164973675e-05, + "loss": 4.7297, + "step": 39674 + }, + { + "epoch": 0.2359584641735655, + "grad_norm": 1.6579086780548096, + "learning_rate": 4.3440346257381204e-05, + "loss": 4.9919, + "step": 39675 + }, + { + "epoch": 0.23596441145684652, + "grad_norm": 1.6303355693817139, + "learning_rate": 4.344003085858833e-05, + "loss": 5.1976, + "step": 39676 + }, + { + "epoch": 0.2359703587401275, + "grad_norm": 2.267923355102539, + "learning_rate": 4.343971545335822e-05, + "loss": 4.204, + "step": 39677 + }, + { + "epoch": 0.2359763060234085, + "grad_norm": 1.9635370969772339, + "learning_rate": 4.3439400041691005e-05, + "loss": 5.0505, + "step": 39678 + }, + { + "epoch": 0.2359822533066895, + "grad_norm": 1.9441829919815063, + "learning_rate": 4.343908462358679e-05, + "loss": 4.5925, + "step": 39679 + }, + { + "epoch": 0.2359882005899705, + "grad_norm": 1.6971355676651, + "learning_rate": 4.343876919904567e-05, + "loss": 4.8558, + "step": 39680 + }, + { + "epoch": 0.2359941478732515, + "grad_norm": 1.5483894348144531, + "learning_rate": 4.343845376806777e-05, + "loss": 4.6838, + "step": 39681 + }, + { + "epoch": 0.2360000951565325, + "grad_norm": 1.6236921548843384, + "learning_rate": 4.34381383306532e-05, + "loss": 5.047, + "step": 39682 + }, + { + "epoch": 0.2360060424398135, + "grad_norm": 1.7564657926559448, + "learning_rate": 4.343782288680206e-05, + "loss": 4.6391, + "step": 39683 + }, + { + "epoch": 0.23601198972309448, + "grad_norm": 1.7207711935043335, + "learning_rate": 4.343750743651448e-05, + "loss": 5.0933, + "step": 39684 + }, + { + "epoch": 0.2360179370063755, + "grad_norm": 1.5402401685714722, + "learning_rate": 4.343719197979054e-05, + "loss": 5.0135, + "step": 39685 + }, + { + "epoch": 0.23602388428965648, + "grad_norm": 1.732639193534851, + "learning_rate": 4.343687651663038e-05, + "loss": 4.9194, + "step": 39686 + }, + { + "epoch": 0.23602983157293747, + "grad_norm": 1.5156683921813965, + "learning_rate": 4.343656104703408e-05, + "loss": 4.9513, + "step": 39687 + }, + { + "epoch": 0.2360357788562185, + "grad_norm": 1.4682366847991943, + "learning_rate": 4.343624557100178e-05, + "loss": 4.959, + "step": 39688 + }, + { + "epoch": 0.23604172613949947, + "grad_norm": 1.4715865850448608, + "learning_rate": 4.3435930088533575e-05, + "loss": 4.9556, + "step": 39689 + }, + { + "epoch": 0.23604767342278046, + "grad_norm": 1.731086015701294, + "learning_rate": 4.343561459962958e-05, + "loss": 4.8839, + "step": 39690 + }, + { + "epoch": 0.23605362070606148, + "grad_norm": 1.482099175453186, + "learning_rate": 4.34352991042899e-05, + "loss": 4.9354, + "step": 39691 + }, + { + "epoch": 0.23605956798934247, + "grad_norm": 1.880521535873413, + "learning_rate": 4.343498360251465e-05, + "loss": 4.9111, + "step": 39692 + }, + { + "epoch": 0.23606551527262346, + "grad_norm": 1.752835988998413, + "learning_rate": 4.343466809430393e-05, + "loss": 4.9135, + "step": 39693 + }, + { + "epoch": 0.23607146255590447, + "grad_norm": 1.6565130949020386, + "learning_rate": 4.3434352579657864e-05, + "loss": 4.8368, + "step": 39694 + }, + { + "epoch": 0.23607740983918546, + "grad_norm": 1.6498968601226807, + "learning_rate": 4.3434037058576556e-05, + "loss": 4.9898, + "step": 39695 + }, + { + "epoch": 0.23608335712246645, + "grad_norm": 1.846377968788147, + "learning_rate": 4.3433721531060115e-05, + "loss": 4.7886, + "step": 39696 + }, + { + "epoch": 0.23608930440574746, + "grad_norm": 1.7037628889083862, + "learning_rate": 4.343340599710865e-05, + "loss": 4.9677, + "step": 39697 + }, + { + "epoch": 0.23609525168902845, + "grad_norm": 1.7007973194122314, + "learning_rate": 4.3433090456722275e-05, + "loss": 4.8799, + "step": 39698 + }, + { + "epoch": 0.23610119897230944, + "grad_norm": 1.7689772844314575, + "learning_rate": 4.34327749099011e-05, + "loss": 4.8756, + "step": 39699 + }, + { + "epoch": 0.23610714625559046, + "grad_norm": 1.5716655254364014, + "learning_rate": 4.343245935664523e-05, + "loss": 4.886, + "step": 39700 + }, + { + "epoch": 0.23611309353887144, + "grad_norm": 1.6320520639419556, + "learning_rate": 4.3432143796954785e-05, + "loss": 4.9558, + "step": 39701 + }, + { + "epoch": 0.23611904082215243, + "grad_norm": 1.616171956062317, + "learning_rate": 4.3431828230829864e-05, + "loss": 4.8599, + "step": 39702 + }, + { + "epoch": 0.23612498810543345, + "grad_norm": 1.6095540523529053, + "learning_rate": 4.343151265827058e-05, + "loss": 4.752, + "step": 39703 + }, + { + "epoch": 0.23613093538871444, + "grad_norm": 1.4963219165802002, + "learning_rate": 4.343119707927705e-05, + "loss": 4.7252, + "step": 39704 + }, + { + "epoch": 0.23613688267199542, + "grad_norm": 1.5957183837890625, + "learning_rate": 4.343088149384938e-05, + "loss": 4.5582, + "step": 39705 + }, + { + "epoch": 0.23614282995527644, + "grad_norm": 1.5361696481704712, + "learning_rate": 4.3430565901987674e-05, + "loss": 4.7854, + "step": 39706 + }, + { + "epoch": 0.23614877723855743, + "grad_norm": 1.5576621294021606, + "learning_rate": 4.3430250303692055e-05, + "loss": 4.7568, + "step": 39707 + }, + { + "epoch": 0.23615472452183842, + "grad_norm": 1.5435431003570557, + "learning_rate": 4.3429934698962625e-05, + "loss": 4.5362, + "step": 39708 + }, + { + "epoch": 0.23616067180511943, + "grad_norm": 1.5170698165893555, + "learning_rate": 4.342961908779949e-05, + "loss": 4.6373, + "step": 39709 + }, + { + "epoch": 0.23616661908840042, + "grad_norm": 1.588207721710205, + "learning_rate": 4.342930347020277e-05, + "loss": 4.7936, + "step": 39710 + }, + { + "epoch": 0.2361725663716814, + "grad_norm": 1.751194715499878, + "learning_rate": 4.342898784617257e-05, + "loss": 4.5122, + "step": 39711 + }, + { + "epoch": 0.23617851365496242, + "grad_norm": 1.8367148637771606, + "learning_rate": 4.3428672215709e-05, + "loss": 4.8803, + "step": 39712 + }, + { + "epoch": 0.2361844609382434, + "grad_norm": 1.6473870277404785, + "learning_rate": 4.342835657881217e-05, + "loss": 4.8684, + "step": 39713 + }, + { + "epoch": 0.2361904082215244, + "grad_norm": 1.5764669179916382, + "learning_rate": 4.3428040935482195e-05, + "loss": 4.8413, + "step": 39714 + }, + { + "epoch": 0.23619635550480542, + "grad_norm": 1.715018391609192, + "learning_rate": 4.342772528571918e-05, + "loss": 4.7669, + "step": 39715 + }, + { + "epoch": 0.2362023027880864, + "grad_norm": 1.5597290992736816, + "learning_rate": 4.342740962952323e-05, + "loss": 4.6682, + "step": 39716 + }, + { + "epoch": 0.2362082500713674, + "grad_norm": 1.5768378973007202, + "learning_rate": 4.342709396689447e-05, + "loss": 4.8165, + "step": 39717 + }, + { + "epoch": 0.2362141973546484, + "grad_norm": 1.661075234413147, + "learning_rate": 4.3426778297832995e-05, + "loss": 4.5506, + "step": 39718 + }, + { + "epoch": 0.2362201446379294, + "grad_norm": 1.584989070892334, + "learning_rate": 4.342646262233892e-05, + "loss": 4.8426, + "step": 39719 + }, + { + "epoch": 0.23622609192121038, + "grad_norm": 1.5978823900222778, + "learning_rate": 4.342614694041237e-05, + "loss": 4.7612, + "step": 39720 + }, + { + "epoch": 0.2362320392044914, + "grad_norm": 1.4043251276016235, + "learning_rate": 4.3425831252053436e-05, + "loss": 4.6123, + "step": 39721 + }, + { + "epoch": 0.2362379864877724, + "grad_norm": 1.534112572669983, + "learning_rate": 4.342551555726223e-05, + "loss": 5.0231, + "step": 39722 + }, + { + "epoch": 0.23624393377105338, + "grad_norm": 1.4414738416671753, + "learning_rate": 4.342519985603887e-05, + "loss": 4.7686, + "step": 39723 + }, + { + "epoch": 0.2362498810543344, + "grad_norm": 1.5299564599990845, + "learning_rate": 4.342488414838346e-05, + "loss": 4.7504, + "step": 39724 + }, + { + "epoch": 0.23625582833761538, + "grad_norm": 1.8014576435089111, + "learning_rate": 4.342456843429612e-05, + "loss": 4.5203, + "step": 39725 + }, + { + "epoch": 0.23626177562089637, + "grad_norm": 1.3567272424697876, + "learning_rate": 4.342425271377695e-05, + "loss": 4.638, + "step": 39726 + }, + { + "epoch": 0.23626772290417739, + "grad_norm": 1.448305606842041, + "learning_rate": 4.342393698682607e-05, + "loss": 4.5339, + "step": 39727 + }, + { + "epoch": 0.23627367018745837, + "grad_norm": 1.2754170894622803, + "learning_rate": 4.3423621253443574e-05, + "loss": 4.5913, + "step": 39728 + }, + { + "epoch": 0.23627961747073936, + "grad_norm": 1.2582231760025024, + "learning_rate": 4.342330551362959e-05, + "loss": 4.5565, + "step": 39729 + }, + { + "epoch": 0.23628556475402035, + "grad_norm": 1.4124832153320312, + "learning_rate": 4.3422989767384214e-05, + "loss": 4.6425, + "step": 39730 + }, + { + "epoch": 0.23629151203730137, + "grad_norm": 1.6686687469482422, + "learning_rate": 4.3422674014707564e-05, + "loss": 4.7609, + "step": 39731 + }, + { + "epoch": 0.23629745932058235, + "grad_norm": 1.5445233583450317, + "learning_rate": 4.342235825559975e-05, + "loss": 4.5896, + "step": 39732 + }, + { + "epoch": 0.23630340660386334, + "grad_norm": 1.6400084495544434, + "learning_rate": 4.3422042490060887e-05, + "loss": 4.5642, + "step": 39733 + }, + { + "epoch": 0.23630935388714436, + "grad_norm": 1.735404133796692, + "learning_rate": 4.342172671809107e-05, + "loss": 4.5955, + "step": 39734 + }, + { + "epoch": 0.23631530117042535, + "grad_norm": 1.5258411169052124, + "learning_rate": 4.342141093969042e-05, + "loss": 4.5038, + "step": 39735 + }, + { + "epoch": 0.23632124845370633, + "grad_norm": 1.6231786012649536, + "learning_rate": 4.3421095154859046e-05, + "loss": 4.8466, + "step": 39736 + }, + { + "epoch": 0.23632719573698735, + "grad_norm": 1.498099446296692, + "learning_rate": 4.342077936359706e-05, + "loss": 4.7716, + "step": 39737 + }, + { + "epoch": 0.23633314302026834, + "grad_norm": 1.6102790832519531, + "learning_rate": 4.3420463565904576e-05, + "loss": 4.5928, + "step": 39738 + }, + { + "epoch": 0.23633909030354933, + "grad_norm": 1.6059402227401733, + "learning_rate": 4.342014776178169e-05, + "loss": 4.6719, + "step": 39739 + }, + { + "epoch": 0.23634503758683034, + "grad_norm": 1.6634807586669922, + "learning_rate": 4.341983195122853e-05, + "loss": 4.3684, + "step": 39740 + }, + { + "epoch": 0.23635098487011133, + "grad_norm": 1.5562560558319092, + "learning_rate": 4.3419516134245185e-05, + "loss": 4.3445, + "step": 39741 + }, + { + "epoch": 0.23635693215339232, + "grad_norm": 1.7897953987121582, + "learning_rate": 4.3419200310831784e-05, + "loss": 4.829, + "step": 39742 + }, + { + "epoch": 0.23636287943667333, + "grad_norm": 1.5983352661132812, + "learning_rate": 4.341888448098843e-05, + "loss": 5.426, + "step": 39743 + }, + { + "epoch": 0.23636882671995432, + "grad_norm": 1.7689006328582764, + "learning_rate": 4.341856864471523e-05, + "loss": 5.0936, + "step": 39744 + }, + { + "epoch": 0.2363747740032353, + "grad_norm": 1.8115812540054321, + "learning_rate": 4.34182528020123e-05, + "loss": 4.7046, + "step": 39745 + }, + { + "epoch": 0.23638072128651633, + "grad_norm": 1.6517319679260254, + "learning_rate": 4.3417936952879745e-05, + "loss": 4.9471, + "step": 39746 + }, + { + "epoch": 0.23638666856979731, + "grad_norm": 1.9833317995071411, + "learning_rate": 4.341762109731768e-05, + "loss": 4.259, + "step": 39747 + }, + { + "epoch": 0.2363926158530783, + "grad_norm": 1.6605212688446045, + "learning_rate": 4.341730523532622e-05, + "loss": 4.6667, + "step": 39748 + }, + { + "epoch": 0.23639856313635932, + "grad_norm": 1.8738734722137451, + "learning_rate": 4.341698936690547e-05, + "loss": 4.5505, + "step": 39749 + }, + { + "epoch": 0.2364045104196403, + "grad_norm": 1.7416300773620605, + "learning_rate": 4.3416673492055534e-05, + "loss": 4.5346, + "step": 39750 + }, + { + "epoch": 0.2364104577029213, + "grad_norm": 1.7728749513626099, + "learning_rate": 4.341635761077653e-05, + "loss": 5.125, + "step": 39751 + }, + { + "epoch": 0.2364164049862023, + "grad_norm": 1.8593645095825195, + "learning_rate": 4.3416041723068555e-05, + "loss": 4.9069, + "step": 39752 + }, + { + "epoch": 0.2364223522694833, + "grad_norm": 1.8321951627731323, + "learning_rate": 4.341572582893174e-05, + "loss": 4.4018, + "step": 39753 + }, + { + "epoch": 0.2364282995527643, + "grad_norm": 2.6567561626434326, + "learning_rate": 4.341540992836619e-05, + "loss": 4.1624, + "step": 39754 + }, + { + "epoch": 0.2364342468360453, + "grad_norm": 2.788621425628662, + "learning_rate": 4.3415094021372004e-05, + "loss": 3.9062, + "step": 39755 + }, + { + "epoch": 0.2364401941193263, + "grad_norm": 3.0069475173950195, + "learning_rate": 4.34147781079493e-05, + "loss": 3.9894, + "step": 39756 + }, + { + "epoch": 0.23644614140260728, + "grad_norm": 2.6209957599639893, + "learning_rate": 4.3414462188098186e-05, + "loss": 3.9002, + "step": 39757 + }, + { + "epoch": 0.2364520886858883, + "grad_norm": 2.197650194168091, + "learning_rate": 4.341414626181878e-05, + "loss": 3.8847, + "step": 39758 + }, + { + "epoch": 0.23645803596916928, + "grad_norm": 2.5086872577667236, + "learning_rate": 4.341383032911117e-05, + "loss": 3.7872, + "step": 39759 + }, + { + "epoch": 0.23646398325245027, + "grad_norm": 2.1555652618408203, + "learning_rate": 4.34135143899755e-05, + "loss": 3.4424, + "step": 39760 + }, + { + "epoch": 0.2364699305357313, + "grad_norm": 2.588146924972534, + "learning_rate": 4.341319844441185e-05, + "loss": 3.5797, + "step": 39761 + }, + { + "epoch": 0.23647587781901228, + "grad_norm": 2.394775867462158, + "learning_rate": 4.341288249242035e-05, + "loss": 4.1704, + "step": 39762 + }, + { + "epoch": 0.23648182510229326, + "grad_norm": 2.6768877506256104, + "learning_rate": 4.34125665340011e-05, + "loss": 3.926, + "step": 39763 + }, + { + "epoch": 0.23648777238557428, + "grad_norm": 2.8283958435058594, + "learning_rate": 4.341225056915421e-05, + "loss": 4.2181, + "step": 39764 + }, + { + "epoch": 0.23649371966885527, + "grad_norm": 2.7577545642852783, + "learning_rate": 4.34119345978798e-05, + "loss": 4.0358, + "step": 39765 + }, + { + "epoch": 0.23649966695213626, + "grad_norm": 2.6630911827087402, + "learning_rate": 4.341161862017797e-05, + "loss": 3.9368, + "step": 39766 + }, + { + "epoch": 0.23650561423541727, + "grad_norm": 3.082396984100342, + "learning_rate": 4.341130263604883e-05, + "loss": 3.5809, + "step": 39767 + }, + { + "epoch": 0.23651156151869826, + "grad_norm": 2.6369762420654297, + "learning_rate": 4.34109866454925e-05, + "loss": 3.657, + "step": 39768 + }, + { + "epoch": 0.23651750880197925, + "grad_norm": 2.8537046909332275, + "learning_rate": 4.341067064850909e-05, + "loss": 3.8611, + "step": 39769 + }, + { + "epoch": 0.23652345608526026, + "grad_norm": 2.628800392150879, + "learning_rate": 4.34103546450987e-05, + "loss": 3.6899, + "step": 39770 + }, + { + "epoch": 0.23652940336854125, + "grad_norm": 2.121014356613159, + "learning_rate": 4.341003863526144e-05, + "loss": 3.8184, + "step": 39771 + }, + { + "epoch": 0.23653535065182224, + "grad_norm": 2.651318073272705, + "learning_rate": 4.340972261899743e-05, + "loss": 3.9339, + "step": 39772 + }, + { + "epoch": 0.23654129793510326, + "grad_norm": 2.6411614418029785, + "learning_rate": 4.340940659630678e-05, + "loss": 3.9456, + "step": 39773 + }, + { + "epoch": 0.23654724521838424, + "grad_norm": 2.308238983154297, + "learning_rate": 4.340909056718959e-05, + "loss": 3.9969, + "step": 39774 + }, + { + "epoch": 0.23655319250166523, + "grad_norm": 3.0689187049865723, + "learning_rate": 4.340877453164599e-05, + "loss": 3.7694, + "step": 39775 + }, + { + "epoch": 0.23655913978494625, + "grad_norm": 2.6818394660949707, + "learning_rate": 4.3408458489676064e-05, + "loss": 3.487, + "step": 39776 + }, + { + "epoch": 0.23656508706822724, + "grad_norm": 2.5647828578948975, + "learning_rate": 4.340814244127993e-05, + "loss": 4.0624, + "step": 39777 + }, + { + "epoch": 0.23657103435150822, + "grad_norm": 2.7362442016601562, + "learning_rate": 4.340782638645772e-05, + "loss": 3.9097, + "step": 39778 + }, + { + "epoch": 0.23657698163478924, + "grad_norm": 2.7237389087677, + "learning_rate": 4.340751032520952e-05, + "loss": 3.6611, + "step": 39779 + }, + { + "epoch": 0.23658292891807023, + "grad_norm": 2.5460264682769775, + "learning_rate": 4.3407194257535445e-05, + "loss": 3.8765, + "step": 39780 + }, + { + "epoch": 0.23658887620135122, + "grad_norm": 2.6930301189422607, + "learning_rate": 4.340687818343561e-05, + "loss": 3.5759, + "step": 39781 + }, + { + "epoch": 0.23659482348463223, + "grad_norm": 2.4678304195404053, + "learning_rate": 4.340656210291013e-05, + "loss": 3.8256, + "step": 39782 + }, + { + "epoch": 0.23660077076791322, + "grad_norm": 2.215928316116333, + "learning_rate": 4.34062460159591e-05, + "loss": 3.8329, + "step": 39783 + }, + { + "epoch": 0.2366067180511942, + "grad_norm": 2.689182758331299, + "learning_rate": 4.3405929922582645e-05, + "loss": 3.5107, + "step": 39784 + }, + { + "epoch": 0.23661266533447523, + "grad_norm": 2.366183042526245, + "learning_rate": 4.3405613822780875e-05, + "loss": 3.6994, + "step": 39785 + }, + { + "epoch": 0.2366186126177562, + "grad_norm": 2.7422029972076416, + "learning_rate": 4.3405297716553886e-05, + "loss": 3.8842, + "step": 39786 + }, + { + "epoch": 0.2366245599010372, + "grad_norm": 2.643587589263916, + "learning_rate": 4.34049816039018e-05, + "loss": 3.8722, + "step": 39787 + }, + { + "epoch": 0.23663050718431822, + "grad_norm": 2.4537999629974365, + "learning_rate": 4.340466548482473e-05, + "loss": 3.9502, + "step": 39788 + }, + { + "epoch": 0.2366364544675992, + "grad_norm": 2.574368953704834, + "learning_rate": 4.340434935932277e-05, + "loss": 3.6153, + "step": 39789 + }, + { + "epoch": 0.2366424017508802, + "grad_norm": 2.779705762863159, + "learning_rate": 4.3404033227396054e-05, + "loss": 3.8353, + "step": 39790 + }, + { + "epoch": 0.23664834903416118, + "grad_norm": 2.8180508613586426, + "learning_rate": 4.340371708904468e-05, + "loss": 3.3763, + "step": 39791 + }, + { + "epoch": 0.2366542963174422, + "grad_norm": 2.660182476043701, + "learning_rate": 4.340340094426876e-05, + "loss": 3.6168, + "step": 39792 + }, + { + "epoch": 0.23666024360072319, + "grad_norm": 2.663090229034424, + "learning_rate": 4.340308479306839e-05, + "loss": 3.7262, + "step": 39793 + }, + { + "epoch": 0.23666619088400417, + "grad_norm": 2.3928894996643066, + "learning_rate": 4.34027686354437e-05, + "loss": 3.8692, + "step": 39794 + }, + { + "epoch": 0.2366721381672852, + "grad_norm": 2.654780149459839, + "learning_rate": 4.340245247139479e-05, + "loss": 3.7075, + "step": 39795 + }, + { + "epoch": 0.23667808545056618, + "grad_norm": 2.345092296600342, + "learning_rate": 4.340213630092178e-05, + "loss": 3.7682, + "step": 39796 + }, + { + "epoch": 0.23668403273384717, + "grad_norm": 2.4375827312469482, + "learning_rate": 4.3401820124024776e-05, + "loss": 3.6996, + "step": 39797 + }, + { + "epoch": 0.23668998001712818, + "grad_norm": 2.407268762588501, + "learning_rate": 4.340150394070388e-05, + "loss": 3.8321, + "step": 39798 + }, + { + "epoch": 0.23669592730040917, + "grad_norm": 2.3417975902557373, + "learning_rate": 4.3401187750959216e-05, + "loss": 4.0033, + "step": 39799 + }, + { + "epoch": 0.23670187458369016, + "grad_norm": 2.420501708984375, + "learning_rate": 4.340087155479089e-05, + "loss": 3.739, + "step": 39800 + }, + { + "epoch": 0.23670782186697117, + "grad_norm": 1.9763052463531494, + "learning_rate": 4.3400555352199e-05, + "loss": 4.7612, + "step": 39801 + }, + { + "epoch": 0.23671376915025216, + "grad_norm": 1.9385654926300049, + "learning_rate": 4.340023914318367e-05, + "loss": 4.9685, + "step": 39802 + }, + { + "epoch": 0.23671971643353315, + "grad_norm": 2.449619770050049, + "learning_rate": 4.339992292774501e-05, + "loss": 3.6381, + "step": 39803 + }, + { + "epoch": 0.23672566371681417, + "grad_norm": 1.925249457359314, + "learning_rate": 4.339960670588312e-05, + "loss": 5.0143, + "step": 39804 + }, + { + "epoch": 0.23673161100009515, + "grad_norm": 2.3256402015686035, + "learning_rate": 4.339929047759812e-05, + "loss": 3.7777, + "step": 39805 + }, + { + "epoch": 0.23673755828337614, + "grad_norm": 2.0616559982299805, + "learning_rate": 4.3398974242890124e-05, + "loss": 4.2865, + "step": 39806 + }, + { + "epoch": 0.23674350556665716, + "grad_norm": 2.7752761840820312, + "learning_rate": 4.339865800175923e-05, + "loss": 3.3378, + "step": 39807 + }, + { + "epoch": 0.23674945284993815, + "grad_norm": 2.9076433181762695, + "learning_rate": 4.339834175420555e-05, + "loss": 3.3096, + "step": 39808 + }, + { + "epoch": 0.23675540013321913, + "grad_norm": 2.4606168270111084, + "learning_rate": 4.339802550022921e-05, + "loss": 3.26, + "step": 39809 + }, + { + "epoch": 0.23676134741650015, + "grad_norm": 2.631068468093872, + "learning_rate": 4.3397709239830295e-05, + "loss": 3.31, + "step": 39810 + }, + { + "epoch": 0.23676729469978114, + "grad_norm": 2.1262693405151367, + "learning_rate": 4.339739297300894e-05, + "loss": 4.0044, + "step": 39811 + }, + { + "epoch": 0.23677324198306213, + "grad_norm": 1.440590262413025, + "learning_rate": 4.339707669976525e-05, + "loss": 4.6205, + "step": 39812 + }, + { + "epoch": 0.23677918926634314, + "grad_norm": 1.6795618534088135, + "learning_rate": 4.339676042009933e-05, + "loss": 4.7781, + "step": 39813 + }, + { + "epoch": 0.23678513654962413, + "grad_norm": 1.5972740650177002, + "learning_rate": 4.3396444134011275e-05, + "loss": 4.8949, + "step": 39814 + }, + { + "epoch": 0.23679108383290512, + "grad_norm": 1.659780502319336, + "learning_rate": 4.339612784150122e-05, + "loss": 4.9178, + "step": 39815 + }, + { + "epoch": 0.23679703111618614, + "grad_norm": 1.523654818534851, + "learning_rate": 4.339581154256928e-05, + "loss": 4.9818, + "step": 39816 + }, + { + "epoch": 0.23680297839946712, + "grad_norm": 1.7680178880691528, + "learning_rate": 4.3395495237215535e-05, + "loss": 4.7116, + "step": 39817 + }, + { + "epoch": 0.2368089256827481, + "grad_norm": 2.39174222946167, + "learning_rate": 4.339517892544012e-05, + "loss": 3.4239, + "step": 39818 + }, + { + "epoch": 0.23681487296602913, + "grad_norm": 2.550715923309326, + "learning_rate": 4.3394862607243134e-05, + "loss": 3.31, + "step": 39819 + }, + { + "epoch": 0.23682082024931012, + "grad_norm": 2.13712215423584, + "learning_rate": 4.339454628262469e-05, + "loss": 3.4468, + "step": 39820 + }, + { + "epoch": 0.2368267675325911, + "grad_norm": 1.8183554410934448, + "learning_rate": 4.3394229951584905e-05, + "loss": 4.6941, + "step": 39821 + }, + { + "epoch": 0.23683271481587212, + "grad_norm": 2.161360740661621, + "learning_rate": 4.3393913614123885e-05, + "loss": 3.2488, + "step": 39822 + }, + { + "epoch": 0.2368386620991531, + "grad_norm": 2.250622510910034, + "learning_rate": 4.339359727024174e-05, + "loss": 2.9575, + "step": 39823 + }, + { + "epoch": 0.2368446093824341, + "grad_norm": 1.722509503364563, + "learning_rate": 4.339328091993857e-05, + "loss": 4.599, + "step": 39824 + }, + { + "epoch": 0.2368505566657151, + "grad_norm": 1.7138597965240479, + "learning_rate": 4.339296456321451e-05, + "loss": 4.8187, + "step": 39825 + }, + { + "epoch": 0.2368565039489961, + "grad_norm": 2.0628795623779297, + "learning_rate": 4.339264820006965e-05, + "loss": 4.4325, + "step": 39826 + }, + { + "epoch": 0.2368624512322771, + "grad_norm": 2.7999677658081055, + "learning_rate": 4.339233183050411e-05, + "loss": 3.9543, + "step": 39827 + }, + { + "epoch": 0.2368683985155581, + "grad_norm": 2.9181196689605713, + "learning_rate": 4.339201545451799e-05, + "loss": 4.0597, + "step": 39828 + }, + { + "epoch": 0.2368743457988391, + "grad_norm": 1.9542845487594604, + "learning_rate": 4.3391699072111415e-05, + "loss": 4.9241, + "step": 39829 + }, + { + "epoch": 0.23688029308212008, + "grad_norm": 2.2268269062042236, + "learning_rate": 4.339138268328448e-05, + "loss": 4.1254, + "step": 39830 + }, + { + "epoch": 0.2368862403654011, + "grad_norm": 2.4981327056884766, + "learning_rate": 4.339106628803731e-05, + "loss": 3.7606, + "step": 39831 + }, + { + "epoch": 0.23689218764868208, + "grad_norm": 2.370508909225464, + "learning_rate": 4.339074988637001e-05, + "loss": 3.6986, + "step": 39832 + }, + { + "epoch": 0.23689813493196307, + "grad_norm": 2.0736162662506104, + "learning_rate": 4.339043347828268e-05, + "loss": 4.1281, + "step": 39833 + }, + { + "epoch": 0.2369040822152441, + "grad_norm": 1.6644777059555054, + "learning_rate": 4.339011706377545e-05, + "loss": 4.5578, + "step": 39834 + }, + { + "epoch": 0.23691002949852508, + "grad_norm": 1.5516061782836914, + "learning_rate": 4.338980064284841e-05, + "loss": 4.5777, + "step": 39835 + }, + { + "epoch": 0.23691597678180606, + "grad_norm": 2.0236034393310547, + "learning_rate": 4.338948421550169e-05, + "loss": 4.5546, + "step": 39836 + }, + { + "epoch": 0.23692192406508708, + "grad_norm": 1.72527277469635, + "learning_rate": 4.3389167781735385e-05, + "loss": 4.6173, + "step": 39837 + }, + { + "epoch": 0.23692787134836807, + "grad_norm": 1.8590136766433716, + "learning_rate": 4.338885134154961e-05, + "loss": 4.3471, + "step": 39838 + }, + { + "epoch": 0.23693381863164906, + "grad_norm": 1.7647113800048828, + "learning_rate": 4.3388534894944484e-05, + "loss": 3.9139, + "step": 39839 + }, + { + "epoch": 0.23693976591493007, + "grad_norm": 2.5895087718963623, + "learning_rate": 4.33882184419201e-05, + "loss": 1.952, + "step": 39840 + }, + { + "epoch": 0.23694571319821106, + "grad_norm": 1.8349545001983643, + "learning_rate": 4.3387901982476586e-05, + "loss": 4.2027, + "step": 39841 + }, + { + "epoch": 0.23695166048149205, + "grad_norm": 2.599876880645752, + "learning_rate": 4.338758551661405e-05, + "loss": 1.963, + "step": 39842 + }, + { + "epoch": 0.23695760776477306, + "grad_norm": 2.0699830055236816, + "learning_rate": 4.338726904433259e-05, + "loss": 2.6252, + "step": 39843 + }, + { + "epoch": 0.23696355504805405, + "grad_norm": 2.20517635345459, + "learning_rate": 4.338695256563233e-05, + "loss": 1.1781, + "step": 39844 + }, + { + "epoch": 0.23696950233133504, + "grad_norm": 2.3173837661743164, + "learning_rate": 4.338663608051337e-05, + "loss": 0.9283, + "step": 39845 + }, + { + "epoch": 0.23697544961461606, + "grad_norm": 2.2324037551879883, + "learning_rate": 4.3386319588975836e-05, + "loss": 1.5728, + "step": 39846 + }, + { + "epoch": 0.23698139689789705, + "grad_norm": 1.6736335754394531, + "learning_rate": 4.338600309101981e-05, + "loss": 4.7163, + "step": 39847 + }, + { + "epoch": 0.23698734418117803, + "grad_norm": 1.7723100185394287, + "learning_rate": 4.338568658664543e-05, + "loss": 4.3372, + "step": 39848 + }, + { + "epoch": 0.23699329146445902, + "grad_norm": 2.2402212619781494, + "learning_rate": 4.33853700758528e-05, + "loss": 5.344, + "step": 39849 + }, + { + "epoch": 0.23699923874774004, + "grad_norm": 1.8501451015472412, + "learning_rate": 4.338505355864202e-05, + "loss": 5.0796, + "step": 39850 + }, + { + "epoch": 0.23700518603102103, + "grad_norm": 2.0170962810516357, + "learning_rate": 4.338473703501321e-05, + "loss": 4.6974, + "step": 39851 + }, + { + "epoch": 0.237011133314302, + "grad_norm": 1.8159914016723633, + "learning_rate": 4.338442050496648e-05, + "loss": 4.7729, + "step": 39852 + }, + { + "epoch": 0.23701708059758303, + "grad_norm": 1.6395304203033447, + "learning_rate": 4.338410396850194e-05, + "loss": 4.4962, + "step": 39853 + }, + { + "epoch": 0.23702302788086402, + "grad_norm": 1.4548068046569824, + "learning_rate": 4.338378742561969e-05, + "loss": 4.5226, + "step": 39854 + }, + { + "epoch": 0.237028975164145, + "grad_norm": 1.401894450187683, + "learning_rate": 4.3383470876319864e-05, + "loss": 4.4299, + "step": 39855 + }, + { + "epoch": 0.23703492244742602, + "grad_norm": 1.4954034090042114, + "learning_rate": 4.3383154320602556e-05, + "loss": 4.4422, + "step": 39856 + }, + { + "epoch": 0.237040869730707, + "grad_norm": 1.4662593603134155, + "learning_rate": 4.338283775846786e-05, + "loss": 4.4502, + "step": 39857 + }, + { + "epoch": 0.237046817013988, + "grad_norm": 1.4960722923278809, + "learning_rate": 4.3382521189915924e-05, + "loss": 4.2204, + "step": 39858 + }, + { + "epoch": 0.23705276429726901, + "grad_norm": 1.5249221324920654, + "learning_rate": 4.3382204614946845e-05, + "loss": 4.326, + "step": 39859 + }, + { + "epoch": 0.23705871158055, + "grad_norm": 1.4498658180236816, + "learning_rate": 4.338188803356071e-05, + "loss": 4.3848, + "step": 39860 + }, + { + "epoch": 0.237064658863831, + "grad_norm": 1.4516419172286987, + "learning_rate": 4.338157144575766e-05, + "loss": 4.3933, + "step": 39861 + }, + { + "epoch": 0.237070606147112, + "grad_norm": 1.4814791679382324, + "learning_rate": 4.338125485153779e-05, + "loss": 4.2899, + "step": 39862 + }, + { + "epoch": 0.237076553430393, + "grad_norm": 1.4463040828704834, + "learning_rate": 4.3380938250901216e-05, + "loss": 4.3085, + "step": 39863 + }, + { + "epoch": 0.23708250071367398, + "grad_norm": 1.3938040733337402, + "learning_rate": 4.338062164384804e-05, + "loss": 4.3688, + "step": 39864 + }, + { + "epoch": 0.237088447996955, + "grad_norm": 1.382372260093689, + "learning_rate": 4.3380305030378385e-05, + "loss": 4.2662, + "step": 39865 + }, + { + "epoch": 0.237094395280236, + "grad_norm": 1.8836345672607422, + "learning_rate": 4.337998841049235e-05, + "loss": 4.5557, + "step": 39866 + }, + { + "epoch": 0.23710034256351697, + "grad_norm": 1.430780053138733, + "learning_rate": 4.3379671784190056e-05, + "loss": 4.3896, + "step": 39867 + }, + { + "epoch": 0.237106289846798, + "grad_norm": 1.4985473155975342, + "learning_rate": 4.3379355151471606e-05, + "loss": 4.3321, + "step": 39868 + }, + { + "epoch": 0.23711223713007898, + "grad_norm": 1.4790635108947754, + "learning_rate": 4.337903851233711e-05, + "loss": 4.4599, + "step": 39869 + }, + { + "epoch": 0.23711818441335997, + "grad_norm": 1.7420741319656372, + "learning_rate": 4.337872186678669e-05, + "loss": 4.5288, + "step": 39870 + }, + { + "epoch": 0.23712413169664098, + "grad_norm": 2.139042615890503, + "learning_rate": 4.337840521482044e-05, + "loss": 4.4184, + "step": 39871 + }, + { + "epoch": 0.23713007897992197, + "grad_norm": 2.679811954498291, + "learning_rate": 4.337808855643848e-05, + "loss": 4.491, + "step": 39872 + }, + { + "epoch": 0.23713602626320296, + "grad_norm": 2.33467173576355, + "learning_rate": 4.3377771891640925e-05, + "loss": 4.4055, + "step": 39873 + }, + { + "epoch": 0.23714197354648398, + "grad_norm": 2.19638991355896, + "learning_rate": 4.3377455220427876e-05, + "loss": 4.4909, + "step": 39874 + }, + { + "epoch": 0.23714792082976496, + "grad_norm": 2.1641952991485596, + "learning_rate": 4.337713854279945e-05, + "loss": 4.3292, + "step": 39875 + }, + { + "epoch": 0.23715386811304595, + "grad_norm": 2.2148971557617188, + "learning_rate": 4.3376821858755746e-05, + "loss": 4.2625, + "step": 39876 + }, + { + "epoch": 0.23715981539632697, + "grad_norm": 2.1260080337524414, + "learning_rate": 4.337650516829689e-05, + "loss": 4.3795, + "step": 39877 + }, + { + "epoch": 0.23716576267960796, + "grad_norm": 1.9064221382141113, + "learning_rate": 4.3376188471422984e-05, + "loss": 4.28, + "step": 39878 + }, + { + "epoch": 0.23717170996288894, + "grad_norm": 1.8643522262573242, + "learning_rate": 4.337587176813414e-05, + "loss": 4.2244, + "step": 39879 + }, + { + "epoch": 0.23717765724616996, + "grad_norm": 2.170990467071533, + "learning_rate": 4.337555505843047e-05, + "loss": 4.2965, + "step": 39880 + }, + { + "epoch": 0.23718360452945095, + "grad_norm": 1.8632001876831055, + "learning_rate": 4.3375238342312084e-05, + "loss": 4.3296, + "step": 39881 + }, + { + "epoch": 0.23718955181273194, + "grad_norm": 1.8718262910842896, + "learning_rate": 4.33749216197791e-05, + "loss": 4.2258, + "step": 39882 + }, + { + "epoch": 0.23719549909601295, + "grad_norm": 1.9377762079238892, + "learning_rate": 4.3374604890831605e-05, + "loss": 4.1339, + "step": 39883 + }, + { + "epoch": 0.23720144637929394, + "grad_norm": 1.8045750856399536, + "learning_rate": 4.3374288155469736e-05, + "loss": 4.1913, + "step": 39884 + }, + { + "epoch": 0.23720739366257493, + "grad_norm": 2.4247703552246094, + "learning_rate": 4.3373971413693584e-05, + "loss": 4.4062, + "step": 39885 + }, + { + "epoch": 0.23721334094585594, + "grad_norm": 2.441964864730835, + "learning_rate": 4.337365466550328e-05, + "loss": 4.3062, + "step": 39886 + }, + { + "epoch": 0.23721928822913693, + "grad_norm": 2.0665531158447266, + "learning_rate": 4.3373337910898914e-05, + "loss": 4.1877, + "step": 39887 + }, + { + "epoch": 0.23722523551241792, + "grad_norm": 1.751538872718811, + "learning_rate": 4.3373021149880614e-05, + "loss": 4.0803, + "step": 39888 + }, + { + "epoch": 0.23723118279569894, + "grad_norm": 3.0823750495910645, + "learning_rate": 4.337270438244847e-05, + "loss": 3.9962, + "step": 39889 + }, + { + "epoch": 0.23723713007897992, + "grad_norm": 1.7145901918411255, + "learning_rate": 4.337238760860261e-05, + "loss": 4.9209, + "step": 39890 + }, + { + "epoch": 0.2372430773622609, + "grad_norm": 1.8586928844451904, + "learning_rate": 4.337207082834315e-05, + "loss": 4.7399, + "step": 39891 + }, + { + "epoch": 0.23724902464554193, + "grad_norm": 1.9576743841171265, + "learning_rate": 4.337175404167018e-05, + "loss": 4.8797, + "step": 39892 + }, + { + "epoch": 0.23725497192882292, + "grad_norm": 1.6683032512664795, + "learning_rate": 4.337143724858381e-05, + "loss": 5.1349, + "step": 39893 + }, + { + "epoch": 0.2372609192121039, + "grad_norm": 1.7969902753829956, + "learning_rate": 4.337112044908418e-05, + "loss": 5.1658, + "step": 39894 + }, + { + "epoch": 0.23726686649538492, + "grad_norm": 2.2213234901428223, + "learning_rate": 4.337080364317137e-05, + "loss": 4.9434, + "step": 39895 + }, + { + "epoch": 0.2372728137786659, + "grad_norm": 2.1538355350494385, + "learning_rate": 4.3370486830845507e-05, + "loss": 4.5511, + "step": 39896 + }, + { + "epoch": 0.2372787610619469, + "grad_norm": 2.237603187561035, + "learning_rate": 4.3370170012106694e-05, + "loss": 3.7551, + "step": 39897 + }, + { + "epoch": 0.2372847083452279, + "grad_norm": 3.0955090522766113, + "learning_rate": 4.336985318695505e-05, + "loss": 2.503, + "step": 39898 + }, + { + "epoch": 0.2372906556285089, + "grad_norm": 1.9793435335159302, + "learning_rate": 4.3369536355390675e-05, + "loss": 4.4538, + "step": 39899 + }, + { + "epoch": 0.2372966029117899, + "grad_norm": 2.1285853385925293, + "learning_rate": 4.3369219517413684e-05, + "loss": 4.3584, + "step": 39900 + }, + { + "epoch": 0.2373025501950709, + "grad_norm": 1.7009873390197754, + "learning_rate": 4.3368902673024194e-05, + "loss": 4.6289, + "step": 39901 + }, + { + "epoch": 0.2373084974783519, + "grad_norm": 1.7879126071929932, + "learning_rate": 4.3368585822222304e-05, + "loss": 4.2106, + "step": 39902 + }, + { + "epoch": 0.23731444476163288, + "grad_norm": 1.7139616012573242, + "learning_rate": 4.336826896500814e-05, + "loss": 4.5923, + "step": 39903 + }, + { + "epoch": 0.2373203920449139, + "grad_norm": 1.5922623872756958, + "learning_rate": 4.336795210138179e-05, + "loss": 4.4593, + "step": 39904 + }, + { + "epoch": 0.23732633932819489, + "grad_norm": 1.6713234186172485, + "learning_rate": 4.3367635231343384e-05, + "loss": 4.2166, + "step": 39905 + }, + { + "epoch": 0.23733228661147587, + "grad_norm": 1.633577585220337, + "learning_rate": 4.3367318354893025e-05, + "loss": 4.4896, + "step": 39906 + }, + { + "epoch": 0.23733823389475686, + "grad_norm": 1.6591612100601196, + "learning_rate": 4.3367001472030824e-05, + "loss": 4.173, + "step": 39907 + }, + { + "epoch": 0.23734418117803788, + "grad_norm": 1.8667633533477783, + "learning_rate": 4.3366684582756895e-05, + "loss": 4.2637, + "step": 39908 + }, + { + "epoch": 0.23735012846131887, + "grad_norm": 1.6186610460281372, + "learning_rate": 4.3366367687071346e-05, + "loss": 4.3127, + "step": 39909 + }, + { + "epoch": 0.23735607574459985, + "grad_norm": 1.8370599746704102, + "learning_rate": 4.336605078497429e-05, + "loss": 4.4424, + "step": 39910 + }, + { + "epoch": 0.23736202302788087, + "grad_norm": 1.812067985534668, + "learning_rate": 4.336573387646583e-05, + "loss": 5.2419, + "step": 39911 + }, + { + "epoch": 0.23736797031116186, + "grad_norm": 2.028104066848755, + "learning_rate": 4.336541696154608e-05, + "loss": 4.8321, + "step": 39912 + }, + { + "epoch": 0.23737391759444285, + "grad_norm": 2.516324996948242, + "learning_rate": 4.336510004021516e-05, + "loss": 3.8673, + "step": 39913 + }, + { + "epoch": 0.23737986487772386, + "grad_norm": 2.5107903480529785, + "learning_rate": 4.336478311247317e-05, + "loss": 3.7475, + "step": 39914 + }, + { + "epoch": 0.23738581216100485, + "grad_norm": 2.4185755252838135, + "learning_rate": 4.336446617832023e-05, + "loss": 3.7617, + "step": 39915 + }, + { + "epoch": 0.23739175944428584, + "grad_norm": 1.93293297290802, + "learning_rate": 4.336414923775644e-05, + "loss": 4.3185, + "step": 39916 + }, + { + "epoch": 0.23739770672756685, + "grad_norm": 1.7484050989151, + "learning_rate": 4.336383229078191e-05, + "loss": 5.1168, + "step": 39917 + }, + { + "epoch": 0.23740365401084784, + "grad_norm": 1.5135313272476196, + "learning_rate": 4.336351533739676e-05, + "loss": 4.9183, + "step": 39918 + }, + { + "epoch": 0.23740960129412883, + "grad_norm": 1.8860149383544922, + "learning_rate": 4.33631983776011e-05, + "loss": 5.0206, + "step": 39919 + }, + { + "epoch": 0.23741554857740985, + "grad_norm": 1.641844391822815, + "learning_rate": 4.336288141139503e-05, + "loss": 4.6539, + "step": 39920 + }, + { + "epoch": 0.23742149586069083, + "grad_norm": 1.7509504556655884, + "learning_rate": 4.336256443877867e-05, + "loss": 4.6623, + "step": 39921 + }, + { + "epoch": 0.23742744314397182, + "grad_norm": 1.7655612230300903, + "learning_rate": 4.3362247459752135e-05, + "loss": 5.5298, + "step": 39922 + }, + { + "epoch": 0.23743339042725284, + "grad_norm": 2.3678815364837646, + "learning_rate": 4.3361930474315524e-05, + "loss": 3.7642, + "step": 39923 + }, + { + "epoch": 0.23743933771053383, + "grad_norm": 3.0474207401275635, + "learning_rate": 4.3361613482468954e-05, + "loss": 3.3554, + "step": 39924 + }, + { + "epoch": 0.23744528499381481, + "grad_norm": 3.052656412124634, + "learning_rate": 4.3361296484212534e-05, + "loss": 3.2537, + "step": 39925 + }, + { + "epoch": 0.23745123227709583, + "grad_norm": 3.0903141498565674, + "learning_rate": 4.336097947954637e-05, + "loss": 3.2136, + "step": 39926 + }, + { + "epoch": 0.23745717956037682, + "grad_norm": 2.6233386993408203, + "learning_rate": 4.336066246847058e-05, + "loss": 3.3688, + "step": 39927 + }, + { + "epoch": 0.2374631268436578, + "grad_norm": 3.0395944118499756, + "learning_rate": 4.336034545098528e-05, + "loss": 3.4725, + "step": 39928 + }, + { + "epoch": 0.23746907412693882, + "grad_norm": 2.7053802013397217, + "learning_rate": 4.336002842709057e-05, + "loss": 3.299, + "step": 39929 + }, + { + "epoch": 0.2374750214102198, + "grad_norm": 2.8455517292022705, + "learning_rate": 4.3359711396786554e-05, + "loss": 3.2357, + "step": 39930 + }, + { + "epoch": 0.2374809686935008, + "grad_norm": 2.790203332901001, + "learning_rate": 4.335939436007336e-05, + "loss": 3.0324, + "step": 39931 + }, + { + "epoch": 0.23748691597678181, + "grad_norm": 2.6323273181915283, + "learning_rate": 4.3359077316951096e-05, + "loss": 3.3338, + "step": 39932 + }, + { + "epoch": 0.2374928632600628, + "grad_norm": 1.6055479049682617, + "learning_rate": 4.335876026741986e-05, + "loss": 4.5655, + "step": 39933 + }, + { + "epoch": 0.2374988105433438, + "grad_norm": 2.0111827850341797, + "learning_rate": 4.335844321147978e-05, + "loss": 5.0737, + "step": 39934 + }, + { + "epoch": 0.2375047578266248, + "grad_norm": 1.6341081857681274, + "learning_rate": 4.3358126149130944e-05, + "loss": 5.0941, + "step": 39935 + }, + { + "epoch": 0.2375107051099058, + "grad_norm": 1.9143885374069214, + "learning_rate": 4.3357809080373484e-05, + "loss": 4.1743, + "step": 39936 + }, + { + "epoch": 0.23751665239318678, + "grad_norm": 1.6839019060134888, + "learning_rate": 4.33574920052075e-05, + "loss": 4.6477, + "step": 39937 + }, + { + "epoch": 0.2375225996764678, + "grad_norm": 1.8571311235427856, + "learning_rate": 4.335717492363311e-05, + "loss": 4.8121, + "step": 39938 + }, + { + "epoch": 0.2375285469597488, + "grad_norm": 1.5011353492736816, + "learning_rate": 4.335685783565041e-05, + "loss": 4.7521, + "step": 39939 + }, + { + "epoch": 0.23753449424302978, + "grad_norm": 2.686401844024658, + "learning_rate": 4.335654074125953e-05, + "loss": 3.4226, + "step": 39940 + }, + { + "epoch": 0.2375404415263108, + "grad_norm": 3.0526058673858643, + "learning_rate": 4.335622364046057e-05, + "loss": 3.235, + "step": 39941 + }, + { + "epoch": 0.23754638880959178, + "grad_norm": 3.0678353309631348, + "learning_rate": 4.3355906533253636e-05, + "loss": 3.3255, + "step": 39942 + }, + { + "epoch": 0.23755233609287277, + "grad_norm": 2.445336103439331, + "learning_rate": 4.335558941963885e-05, + "loss": 3.5664, + "step": 39943 + }, + { + "epoch": 0.23755828337615378, + "grad_norm": 2.646639823913574, + "learning_rate": 4.3355272299616314e-05, + "loss": 3.4817, + "step": 39944 + }, + { + "epoch": 0.23756423065943477, + "grad_norm": 2.4064605236053467, + "learning_rate": 4.335495517318614e-05, + "loss": 3.326, + "step": 39945 + }, + { + "epoch": 0.23757017794271576, + "grad_norm": 2.666252613067627, + "learning_rate": 4.335463804034845e-05, + "loss": 2.9059, + "step": 39946 + }, + { + "epoch": 0.23757612522599678, + "grad_norm": 2.5794105529785156, + "learning_rate": 4.3354320901103344e-05, + "loss": 3.6414, + "step": 39947 + }, + { + "epoch": 0.23758207250927776, + "grad_norm": 2.137204885482788, + "learning_rate": 4.3354003755450925e-05, + "loss": 4.2312, + "step": 39948 + }, + { + "epoch": 0.23758801979255875, + "grad_norm": 2.2329344749450684, + "learning_rate": 4.335368660339132e-05, + "loss": 4.108, + "step": 39949 + }, + { + "epoch": 0.23759396707583977, + "grad_norm": 2.0138213634490967, + "learning_rate": 4.335336944492463e-05, + "loss": 4.0529, + "step": 39950 + }, + { + "epoch": 0.23759991435912076, + "grad_norm": 2.558507204055786, + "learning_rate": 4.335305228005097e-05, + "loss": 3.784, + "step": 39951 + }, + { + "epoch": 0.23760586164240174, + "grad_norm": 2.3928165435791016, + "learning_rate": 4.335273510877045e-05, + "loss": 3.8229, + "step": 39952 + }, + { + "epoch": 0.23761180892568276, + "grad_norm": 2.2186508178710938, + "learning_rate": 4.335241793108318e-05, + "loss": 3.7454, + "step": 39953 + }, + { + "epoch": 0.23761775620896375, + "grad_norm": 2.49245285987854, + "learning_rate": 4.3352100746989264e-05, + "loss": 3.5622, + "step": 39954 + }, + { + "epoch": 0.23762370349224474, + "grad_norm": 2.2493436336517334, + "learning_rate": 4.335178355648882e-05, + "loss": 4.0303, + "step": 39955 + }, + { + "epoch": 0.23762965077552575, + "grad_norm": 2.332967519760132, + "learning_rate": 4.335146635958197e-05, + "loss": 3.5922, + "step": 39956 + }, + { + "epoch": 0.23763559805880674, + "grad_norm": 2.505335569381714, + "learning_rate": 4.33511491562688e-05, + "loss": 3.7643, + "step": 39957 + }, + { + "epoch": 0.23764154534208773, + "grad_norm": 2.923208713531494, + "learning_rate": 4.335083194654944e-05, + "loss": 3.5873, + "step": 39958 + }, + { + "epoch": 0.23764749262536874, + "grad_norm": 2.361135244369507, + "learning_rate": 4.3350514730424e-05, + "loss": 3.877, + "step": 39959 + }, + { + "epoch": 0.23765343990864973, + "grad_norm": 2.3764545917510986, + "learning_rate": 4.335019750789257e-05, + "loss": 3.8729, + "step": 39960 + }, + { + "epoch": 0.23765938719193072, + "grad_norm": 2.5335628986358643, + "learning_rate": 4.334988027895528e-05, + "loss": 3.7511, + "step": 39961 + }, + { + "epoch": 0.23766533447521174, + "grad_norm": 2.3174009323120117, + "learning_rate": 4.334956304361224e-05, + "loss": 3.575, + "step": 39962 + }, + { + "epoch": 0.23767128175849273, + "grad_norm": 2.284850597381592, + "learning_rate": 4.334924580186356e-05, + "loss": 3.6594, + "step": 39963 + }, + { + "epoch": 0.2376772290417737, + "grad_norm": 2.640793561935425, + "learning_rate": 4.3348928553709345e-05, + "loss": 3.7082, + "step": 39964 + }, + { + "epoch": 0.2376831763250547, + "grad_norm": 2.5589759349823, + "learning_rate": 4.33486112991497e-05, + "loss": 3.461, + "step": 39965 + }, + { + "epoch": 0.23768912360833572, + "grad_norm": 2.692124605178833, + "learning_rate": 4.334829403818476e-05, + "loss": 3.6977, + "step": 39966 + }, + { + "epoch": 0.2376950708916167, + "grad_norm": 2.029341220855713, + "learning_rate": 4.3347976770814605e-05, + "loss": 4.5998, + "step": 39967 + }, + { + "epoch": 0.2377010181748977, + "grad_norm": 2.0593783855438232, + "learning_rate": 4.3347659497039373e-05, + "loss": 4.7179, + "step": 39968 + }, + { + "epoch": 0.2377069654581787, + "grad_norm": 1.934889793395996, + "learning_rate": 4.3347342216859156e-05, + "loss": 4.5182, + "step": 39969 + }, + { + "epoch": 0.2377129127414597, + "grad_norm": 1.9339655637741089, + "learning_rate": 4.334702493027407e-05, + "loss": 4.9809, + "step": 39970 + }, + { + "epoch": 0.23771886002474069, + "grad_norm": 1.7704025506973267, + "learning_rate": 4.3346707637284234e-05, + "loss": 4.657, + "step": 39971 + }, + { + "epoch": 0.2377248073080217, + "grad_norm": 1.9846539497375488, + "learning_rate": 4.3346390337889745e-05, + "loss": 4.9341, + "step": 39972 + }, + { + "epoch": 0.2377307545913027, + "grad_norm": 1.8515028953552246, + "learning_rate": 4.3346073032090725e-05, + "loss": 4.6859, + "step": 39973 + }, + { + "epoch": 0.23773670187458368, + "grad_norm": 1.6280958652496338, + "learning_rate": 4.334575571988728e-05, + "loss": 5.1042, + "step": 39974 + }, + { + "epoch": 0.2377426491578647, + "grad_norm": 2.0795865058898926, + "learning_rate": 4.334543840127952e-05, + "loss": 4.9098, + "step": 39975 + }, + { + "epoch": 0.23774859644114568, + "grad_norm": 1.8528962135314941, + "learning_rate": 4.334512107626756e-05, + "loss": 4.892, + "step": 39976 + }, + { + "epoch": 0.23775454372442667, + "grad_norm": 1.7945277690887451, + "learning_rate": 4.33448037448515e-05, + "loss": 4.9786, + "step": 39977 + }, + { + "epoch": 0.23776049100770769, + "grad_norm": 1.6035569906234741, + "learning_rate": 4.334448640703147e-05, + "loss": 4.7732, + "step": 39978 + }, + { + "epoch": 0.23776643829098867, + "grad_norm": 1.4391299486160278, + "learning_rate": 4.334416906280756e-05, + "loss": 4.9093, + "step": 39979 + }, + { + "epoch": 0.23777238557426966, + "grad_norm": 1.7167659997940063, + "learning_rate": 4.33438517121799e-05, + "loss": 4.9274, + "step": 39980 + }, + { + "epoch": 0.23777833285755068, + "grad_norm": 1.7619572877883911, + "learning_rate": 4.334353435514857e-05, + "loss": 4.4422, + "step": 39981 + }, + { + "epoch": 0.23778428014083167, + "grad_norm": 1.413558840751648, + "learning_rate": 4.334321699171372e-05, + "loss": 4.3499, + "step": 39982 + }, + { + "epoch": 0.23779022742411265, + "grad_norm": 1.6296491622924805, + "learning_rate": 4.334289962187544e-05, + "loss": 4.6277, + "step": 39983 + }, + { + "epoch": 0.23779617470739367, + "grad_norm": 1.4150809049606323, + "learning_rate": 4.334258224563384e-05, + "loss": 4.5372, + "step": 39984 + }, + { + "epoch": 0.23780212199067466, + "grad_norm": 1.6175013780593872, + "learning_rate": 4.334226486298904e-05, + "loss": 4.9062, + "step": 39985 + }, + { + "epoch": 0.23780806927395565, + "grad_norm": 1.5687006711959839, + "learning_rate": 4.334194747394114e-05, + "loss": 4.9876, + "step": 39986 + }, + { + "epoch": 0.23781401655723666, + "grad_norm": 1.5041331052780151, + "learning_rate": 4.3341630078490254e-05, + "loss": 4.8733, + "step": 39987 + }, + { + "epoch": 0.23781996384051765, + "grad_norm": 1.4065840244293213, + "learning_rate": 4.334131267663649e-05, + "loss": 5.0474, + "step": 39988 + }, + { + "epoch": 0.23782591112379864, + "grad_norm": 1.4845675230026245, + "learning_rate": 4.334099526837997e-05, + "loss": 5.2594, + "step": 39989 + }, + { + "epoch": 0.23783185840707965, + "grad_norm": 1.597825050354004, + "learning_rate": 4.33406778537208e-05, + "loss": 4.6358, + "step": 39990 + }, + { + "epoch": 0.23783780569036064, + "grad_norm": 1.440100073814392, + "learning_rate": 4.334036043265909e-05, + "loss": 4.3406, + "step": 39991 + }, + { + "epoch": 0.23784375297364163, + "grad_norm": 1.7690058946609497, + "learning_rate": 4.334004300519494e-05, + "loss": 5.0523, + "step": 39992 + }, + { + "epoch": 0.23784970025692265, + "grad_norm": 1.6725897789001465, + "learning_rate": 4.333972557132848e-05, + "loss": 4.3085, + "step": 39993 + }, + { + "epoch": 0.23785564754020364, + "grad_norm": 1.5345070362091064, + "learning_rate": 4.333940813105981e-05, + "loss": 4.8832, + "step": 39994 + }, + { + "epoch": 0.23786159482348462, + "grad_norm": 1.42098069190979, + "learning_rate": 4.333909068438904e-05, + "loss": 4.3638, + "step": 39995 + }, + { + "epoch": 0.23786754210676564, + "grad_norm": 1.3835852146148682, + "learning_rate": 4.333877323131628e-05, + "loss": 4.4911, + "step": 39996 + }, + { + "epoch": 0.23787348939004663, + "grad_norm": 1.519081950187683, + "learning_rate": 4.3338455771841645e-05, + "loss": 4.4199, + "step": 39997 + }, + { + "epoch": 0.23787943667332762, + "grad_norm": 1.619184970855713, + "learning_rate": 4.333813830596525e-05, + "loss": 4.9006, + "step": 39998 + }, + { + "epoch": 0.23788538395660863, + "grad_norm": 1.7782379388809204, + "learning_rate": 4.333782083368719e-05, + "loss": 4.9271, + "step": 39999 + }, + { + "epoch": 0.23789133123988962, + "grad_norm": 1.3998247385025024, + "learning_rate": 4.33375033550076e-05, + "loss": 4.9854, + "step": 40000 + } + ], + "logging_steps": 1, + "max_steps": 168144, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.6062643204102554e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-50000/config.json b/checkpoint-50000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-50000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-50000/generation_config.json b/checkpoint-50000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-50000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-50000/model.safetensors.index.json b/checkpoint-50000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-50000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-50000/rng_state_0.pth b/checkpoint-50000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-50000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-50000/rng_state_1.pth b/checkpoint-50000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-50000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-50000/rng_state_2.pth b/checkpoint-50000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-50000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-50000/rng_state_3.pth b/checkpoint-50000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-50000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-50000/rng_state_4.pth b/checkpoint-50000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-50000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-50000/rng_state_5.pth b/checkpoint-50000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-50000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-50000/rng_state_6.pth b/checkpoint-50000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-50000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-50000/rng_state_7.pth b/checkpoint-50000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-50000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-50000/scheduler.pt b/checkpoint-50000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbe83f753cd1ffc229e44e000d42a03f8bc16106 --- /dev/null +++ b/checkpoint-50000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d6e96653afa7ade591a769601d4bc58e8ea01142410840c9f14ca7fd6dda2d8 +size 1064 diff --git a/checkpoint-50000/trainer_state.json b/checkpoint-50000/trainer_state.json new file mode 100644 index 0000000000000000000000000000000000000000..c67e07670187f4ef336374134e19357dbe44419b --- /dev/null +++ b/checkpoint-50000/trainer_state.json @@ -0,0 +1,350034 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 0.29736416404986205, + "eval_steps": 500, + "global_step": 50000, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 5.947283280997241e-06, + "grad_norm": 179.1047821044922, + "learning_rate": 5e-05, + "loss": 14.5158, + "step": 1 + }, + { + "epoch": 1.1894566561994482e-05, + "grad_norm": 40.39401626586914, + "learning_rate": 4.999999999563638e-05, + "loss": 14.152, + "step": 2 + }, + { + "epoch": 1.7841849842991722e-05, + "grad_norm": 137.05079650878906, + "learning_rate": 4.999999998254552e-05, + "loss": 14.6334, + "step": 3 + }, + { + "epoch": 2.3789133123988963e-05, + "grad_norm": 23.315088272094727, + "learning_rate": 4.9999999960727415e-05, + "loss": 12.6458, + "step": 4 + }, + { + "epoch": 2.97364164049862e-05, + "grad_norm": 7.943603992462158, + "learning_rate": 4.9999999930182065e-05, + "loss": 11.8435, + "step": 5 + }, + { + "epoch": 3.5683699685983445e-05, + "grad_norm": 6.374181270599365, + "learning_rate": 4.999999989090948e-05, + "loss": 11.4544, + "step": 6 + }, + { + "epoch": 4.1630982966980686e-05, + "grad_norm": 8.948652267456055, + "learning_rate": 4.999999984290965e-05, + "loss": 11.3516, + "step": 7 + }, + { + "epoch": 4.7578266247977927e-05, + "grad_norm": 3.2318713665008545, + "learning_rate": 4.999999978618258e-05, + "loss": 11.1021, + "step": 8 + }, + { + "epoch": 5.352554952897517e-05, + "grad_norm": 5.6542534828186035, + "learning_rate": 4.9999999720728266e-05, + "loss": 11.0132, + "step": 9 + }, + { + "epoch": 5.94728328099724e-05, + "grad_norm": 3.623577356338501, + "learning_rate": 4.999999964654671e-05, + "loss": 10.8896, + "step": 10 + }, + { + "epoch": 6.542011609096965e-05, + "grad_norm": 3.3209445476531982, + "learning_rate": 4.9999999563637915e-05, + "loss": 10.7339, + "step": 11 + }, + { + "epoch": 7.136739937196689e-05, + "grad_norm": 3.4527082443237305, + "learning_rate": 4.999999947200188e-05, + "loss": 10.5472, + "step": 12 + }, + { + "epoch": 7.731468265296413e-05, + "grad_norm": 3.784444570541382, + "learning_rate": 4.99999993716386e-05, + "loss": 10.4353, + "step": 13 + }, + { + "epoch": 8.326196593396137e-05, + "grad_norm": 4.304569244384766, + "learning_rate": 4.999999926254808e-05, + "loss": 10.4652, + "step": 14 + }, + { + "epoch": 8.920924921495861e-05, + "grad_norm": 3.5867838859558105, + "learning_rate": 4.999999914473032e-05, + "loss": 10.5746, + "step": 15 + }, + { + "epoch": 9.515653249595585e-05, + "grad_norm": 6.1308207511901855, + "learning_rate": 4.9999999018185316e-05, + "loss": 10.4129, + "step": 16 + }, + { + "epoch": 0.0001011038157769531, + "grad_norm": 3.4687230587005615, + "learning_rate": 4.999999888291307e-05, + "loss": 10.2246, + "step": 17 + }, + { + "epoch": 0.00010705109905795033, + "grad_norm": 4.041895866394043, + "learning_rate": 4.9999998738913586e-05, + "loss": 10.0852, + "step": 18 + }, + { + "epoch": 0.00011299838233894758, + "grad_norm": 4.437602519989014, + "learning_rate": 4.999999858618686e-05, + "loss": 9.8841, + "step": 19 + }, + { + "epoch": 0.0001189456656199448, + "grad_norm": 3.9608142375946045, + "learning_rate": 4.9999998424732884e-05, + "loss": 10.0537, + "step": 20 + }, + { + "epoch": 0.00012489294890094204, + "grad_norm": 3.799363613128662, + "learning_rate": 4.999999825455168e-05, + "loss": 9.8487, + "step": 21 + }, + { + "epoch": 0.0001308402321819393, + "grad_norm": 3.626058340072632, + "learning_rate": 4.999999807564323e-05, + "loss": 9.8048, + "step": 22 + }, + { + "epoch": 0.00013678751546293653, + "grad_norm": 4.21406364440918, + "learning_rate": 4.999999788800754e-05, + "loss": 9.6091, + "step": 23 + }, + { + "epoch": 0.00014273479874393378, + "grad_norm": 5.26548957824707, + "learning_rate": 4.9999997691644605e-05, + "loss": 9.3935, + "step": 24 + }, + { + "epoch": 0.000148682082024931, + "grad_norm": 6.5113396644592285, + "learning_rate": 4.999999748655443e-05, + "loss": 9.2602, + "step": 25 + }, + { + "epoch": 0.00015462936530592826, + "grad_norm": 4.6141133308410645, + "learning_rate": 4.9999997272737014e-05, + "loss": 9.1492, + "step": 26 + }, + { + "epoch": 0.0001605766485869255, + "grad_norm": 4.645262241363525, + "learning_rate": 4.999999705019236e-05, + "loss": 9.2238, + "step": 27 + }, + { + "epoch": 0.00016652393186792274, + "grad_norm": 4.599213123321533, + "learning_rate": 4.9999996818920464e-05, + "loss": 9.1673, + "step": 28 + }, + { + "epoch": 0.00017247121514891997, + "grad_norm": 4.820634365081787, + "learning_rate": 4.999999657892133e-05, + "loss": 9.0044, + "step": 29 + }, + { + "epoch": 0.00017841849842991722, + "grad_norm": 4.57854700088501, + "learning_rate": 4.9999996330194956e-05, + "loss": 8.8746, + "step": 30 + }, + { + "epoch": 0.00018436578171091445, + "grad_norm": 4.567880153656006, + "learning_rate": 4.999999607274133e-05, + "loss": 8.7224, + "step": 31 + }, + { + "epoch": 0.0001903130649919117, + "grad_norm": 4.545701503753662, + "learning_rate": 4.9999995806560475e-05, + "loss": 8.6979, + "step": 32 + }, + { + "epoch": 0.00019626034827290893, + "grad_norm": 4.098274230957031, + "learning_rate": 4.9999995531652374e-05, + "loss": 8.5787, + "step": 33 + }, + { + "epoch": 0.0002022076315539062, + "grad_norm": 4.341195106506348, + "learning_rate": 4.999999524801704e-05, + "loss": 8.4452, + "step": 34 + }, + { + "epoch": 0.00020815491483490341, + "grad_norm": 4.651747703552246, + "learning_rate": 4.999999495565446e-05, + "loss": 8.4383, + "step": 35 + }, + { + "epoch": 0.00021410219811590067, + "grad_norm": 4.187220573425293, + "learning_rate": 4.999999465456464e-05, + "loss": 8.2441, + "step": 36 + }, + { + "epoch": 0.0002200494813968979, + "grad_norm": 4.094058990478516, + "learning_rate": 4.999999434474758e-05, + "loss": 8.2784, + "step": 37 + }, + { + "epoch": 0.00022599676467789515, + "grad_norm": 4.6094794273376465, + "learning_rate": 4.999999402620329e-05, + "loss": 8.3893, + "step": 38 + }, + { + "epoch": 0.00023194404795889238, + "grad_norm": 5.391327381134033, + "learning_rate": 4.999999369893175e-05, + "loss": 8.6491, + "step": 39 + }, + { + "epoch": 0.0002378913312398896, + "grad_norm": 5.03748893737793, + "learning_rate": 4.9999993362932974e-05, + "loss": 8.5279, + "step": 40 + }, + { + "epoch": 0.00024383861452088686, + "grad_norm": 5.306002616882324, + "learning_rate": 4.9999993018206956e-05, + "loss": 9.9965, + "step": 41 + }, + { + "epoch": 0.0002497858978018841, + "grad_norm": 5.5374274253845215, + "learning_rate": 4.99999926647537e-05, + "loss": 10.5594, + "step": 42 + }, + { + "epoch": 0.00025573318108288134, + "grad_norm": 3.8107693195343018, + "learning_rate": 4.999999230257321e-05, + "loss": 10.5451, + "step": 43 + }, + { + "epoch": 0.0002616804643638786, + "grad_norm": 3.922286033630371, + "learning_rate": 4.999999193166547e-05, + "loss": 10.4123, + "step": 44 + }, + { + "epoch": 0.0002676277476448758, + "grad_norm": 3.2090535163879395, + "learning_rate": 4.99999915520305e-05, + "loss": 10.0646, + "step": 45 + }, + { + "epoch": 0.00027357503092587305, + "grad_norm": 3.153404474258423, + "learning_rate": 4.9999991163668285e-05, + "loss": 10.237, + "step": 46 + }, + { + "epoch": 0.0002795223142068703, + "grad_norm": 4.83523416519165, + "learning_rate": 4.999999076657884e-05, + "loss": 8.9392, + "step": 47 + }, + { + "epoch": 0.00028546959748786756, + "grad_norm": 3.954632043838501, + "learning_rate": 4.999999036076215e-05, + "loss": 8.8562, + "step": 48 + }, + { + "epoch": 0.00029141688076886476, + "grad_norm": 4.452631950378418, + "learning_rate": 4.999998994621822e-05, + "loss": 9.8819, + "step": 49 + }, + { + "epoch": 0.000297364164049862, + "grad_norm": 4.71603536605835, + "learning_rate": 4.9999989522947055e-05, + "loss": 9.8503, + "step": 50 + }, + { + "epoch": 0.00030331144733085927, + "grad_norm": 3.8008105754852295, + "learning_rate": 4.999998909094865e-05, + "loss": 9.8072, + "step": 51 + }, + { + "epoch": 0.0003092587306118565, + "grad_norm": 3.9906716346740723, + "learning_rate": 4.999998865022301e-05, + "loss": 9.168, + "step": 52 + }, + { + "epoch": 0.0003152060138928537, + "grad_norm": 3.9425785541534424, + "learning_rate": 4.999998820077013e-05, + "loss": 9.8441, + "step": 53 + }, + { + "epoch": 0.000321153297173851, + "grad_norm": 3.6698031425476074, + "learning_rate": 4.999998774259002e-05, + "loss": 10.036, + "step": 54 + }, + { + "epoch": 0.00032710058045484823, + "grad_norm": 3.3027005195617676, + "learning_rate": 4.999998727568266e-05, + "loss": 9.8701, + "step": 55 + }, + { + "epoch": 0.0003330478637358455, + "grad_norm": 3.312570333480835, + "learning_rate": 4.999998680004807e-05, + "loss": 9.3354, + "step": 56 + }, + { + "epoch": 0.0003389951470168427, + "grad_norm": 3.323969602584839, + "learning_rate": 4.999998631568624e-05, + "loss": 9.2899, + "step": 57 + }, + { + "epoch": 0.00034494243029783994, + "grad_norm": 3.1319313049316406, + "learning_rate": 4.999998582259717e-05, + "loss": 9.1033, + "step": 58 + }, + { + "epoch": 0.0003508897135788372, + "grad_norm": 3.655060291290283, + "learning_rate": 4.999998532078087e-05, + "loss": 9.1574, + "step": 59 + }, + { + "epoch": 0.00035683699685983445, + "grad_norm": 3.2051918506622314, + "learning_rate": 4.999998481023733e-05, + "loss": 9.564, + "step": 60 + }, + { + "epoch": 0.00036278428014083165, + "grad_norm": 3.223015308380127, + "learning_rate": 4.999998429096656e-05, + "loss": 9.46, + "step": 61 + }, + { + "epoch": 0.0003687315634218289, + "grad_norm": 4.121186256408691, + "learning_rate": 4.999998376296855e-05, + "loss": 8.4136, + "step": 62 + }, + { + "epoch": 0.00037467884670282616, + "grad_norm": 3.5580086708068848, + "learning_rate": 4.9999983226243296e-05, + "loss": 9.3504, + "step": 63 + }, + { + "epoch": 0.0003806261299838234, + "grad_norm": 3.664219379425049, + "learning_rate": 4.999998268079081e-05, + "loss": 9.2889, + "step": 64 + }, + { + "epoch": 0.0003865734132648206, + "grad_norm": 2.955582618713379, + "learning_rate": 4.99999821266111e-05, + "loss": 8.9193, + "step": 65 + }, + { + "epoch": 0.00039252069654581787, + "grad_norm": 3.0592539310455322, + "learning_rate": 4.9999981563704144e-05, + "loss": 9.6739, + "step": 66 + }, + { + "epoch": 0.0003984679798268151, + "grad_norm": 3.32024884223938, + "learning_rate": 4.999998099206995e-05, + "loss": 9.3648, + "step": 67 + }, + { + "epoch": 0.0004044152631078124, + "grad_norm": 3.2716033458709717, + "learning_rate": 4.9999980411708524e-05, + "loss": 9.3652, + "step": 68 + }, + { + "epoch": 0.0004103625463888096, + "grad_norm": 3.1926631927490234, + "learning_rate": 4.999997982261987e-05, + "loss": 9.2924, + "step": 69 + }, + { + "epoch": 0.00041630982966980683, + "grad_norm": 3.589841604232788, + "learning_rate": 4.999997922480397e-05, + "loss": 9.2185, + "step": 70 + }, + { + "epoch": 0.0004222571129508041, + "grad_norm": 2.902132034301758, + "learning_rate": 4.999997861826084e-05, + "loss": 9.1047, + "step": 71 + }, + { + "epoch": 0.00042820439623180134, + "grad_norm": 3.2352359294891357, + "learning_rate": 4.999997800299048e-05, + "loss": 9.0309, + "step": 72 + }, + { + "epoch": 0.00043415167951279854, + "grad_norm": 2.683664560317993, + "learning_rate": 4.9999977378992884e-05, + "loss": 8.9977, + "step": 73 + }, + { + "epoch": 0.0004400989627937958, + "grad_norm": 3.0073423385620117, + "learning_rate": 4.9999976746268055e-05, + "loss": 9.0967, + "step": 74 + }, + { + "epoch": 0.00044604624607479305, + "grad_norm": 3.364819288253784, + "learning_rate": 4.9999976104815994e-05, + "loss": 8.9401, + "step": 75 + }, + { + "epoch": 0.0004519935293557903, + "grad_norm": 3.478936195373535, + "learning_rate": 4.9999975454636695e-05, + "loss": 8.8173, + "step": 76 + }, + { + "epoch": 0.0004579408126367875, + "grad_norm": 3.059669017791748, + "learning_rate": 4.9999974795730165e-05, + "loss": 9.2588, + "step": 77 + }, + { + "epoch": 0.00046388809591778476, + "grad_norm": 3.1980936527252197, + "learning_rate": 4.999997412809639e-05, + "loss": 9.3374, + "step": 78 + }, + { + "epoch": 0.000469835379198782, + "grad_norm": 2.859935998916626, + "learning_rate": 4.9999973451735405e-05, + "loss": 8.8996, + "step": 79 + }, + { + "epoch": 0.0004757826624797792, + "grad_norm": 3.6268489360809326, + "learning_rate": 4.9999972766647175e-05, + "loss": 8.7878, + "step": 80 + }, + { + "epoch": 0.00048172994576077647, + "grad_norm": 3.0187010765075684, + "learning_rate": 4.9999972072831714e-05, + "loss": 8.9177, + "step": 81 + }, + { + "epoch": 0.0004876772290417737, + "grad_norm": 3.304633378982544, + "learning_rate": 4.9999971370289014e-05, + "loss": 8.8098, + "step": 82 + }, + { + "epoch": 0.0004936245123227709, + "grad_norm": 3.678696870803833, + "learning_rate": 4.999997065901909e-05, + "loss": 8.9408, + "step": 83 + }, + { + "epoch": 0.0004995717956037682, + "grad_norm": 3.485488176345825, + "learning_rate": 4.9999969939021936e-05, + "loss": 8.7374, + "step": 84 + }, + { + "epoch": 0.0005055190788847654, + "grad_norm": 3.276916265487671, + "learning_rate": 4.999996921029755e-05, + "loss": 8.7177, + "step": 85 + }, + { + "epoch": 0.0005114663621657627, + "grad_norm": 3.060227632522583, + "learning_rate": 4.9999968472845926e-05, + "loss": 8.9673, + "step": 86 + }, + { + "epoch": 0.0005174136454467599, + "grad_norm": 3.359055995941162, + "learning_rate": 4.999996772666708e-05, + "loss": 8.8029, + "step": 87 + }, + { + "epoch": 0.0005233609287277572, + "grad_norm": 3.8916943073272705, + "learning_rate": 4.9999966971761004e-05, + "loss": 8.8363, + "step": 88 + }, + { + "epoch": 0.0005293082120087544, + "grad_norm": 3.825075387954712, + "learning_rate": 4.9999966208127694e-05, + "loss": 8.5683, + "step": 89 + }, + { + "epoch": 0.0005352554952897516, + "grad_norm": 3.475759267807007, + "learning_rate": 4.999996543576715e-05, + "loss": 8.5723, + "step": 90 + }, + { + "epoch": 0.0005412027785707488, + "grad_norm": 3.609776020050049, + "learning_rate": 4.9999964654679385e-05, + "loss": 8.6123, + "step": 91 + }, + { + "epoch": 0.0005471500618517461, + "grad_norm": 3.3749685287475586, + "learning_rate": 4.999996386486439e-05, + "loss": 8.4887, + "step": 92 + }, + { + "epoch": 0.0005530973451327434, + "grad_norm": 3.3853306770324707, + "learning_rate": 4.999996306632215e-05, + "loss": 8.56, + "step": 93 + }, + { + "epoch": 0.0005590446284137406, + "grad_norm": 3.9347422122955322, + "learning_rate": 4.99999622590527e-05, + "loss": 8.5053, + "step": 94 + }, + { + "epoch": 0.0005649919116947379, + "grad_norm": 3.6037611961364746, + "learning_rate": 4.999996144305601e-05, + "loss": 8.3367, + "step": 95 + }, + { + "epoch": 0.0005709391949757351, + "grad_norm": 3.4608941078186035, + "learning_rate": 4.99999606183321e-05, + "loss": 8.0674, + "step": 96 + }, + { + "epoch": 0.0005768864782567324, + "grad_norm": 3.4882898330688477, + "learning_rate": 4.999995978488096e-05, + "loss": 8.1728, + "step": 97 + }, + { + "epoch": 0.0005828337615377295, + "grad_norm": 3.6789562702178955, + "learning_rate": 4.999995894270258e-05, + "loss": 7.9535, + "step": 98 + }, + { + "epoch": 0.0005887810448187268, + "grad_norm": 3.57328200340271, + "learning_rate": 4.9999958091796986e-05, + "loss": 8.2048, + "step": 99 + }, + { + "epoch": 0.000594728328099724, + "grad_norm": 3.803468942642212, + "learning_rate": 4.999995723216416e-05, + "loss": 7.8073, + "step": 100 + }, + { + "epoch": 0.0006006756113807213, + "grad_norm": 3.8187785148620605, + "learning_rate": 4.9999956363804116e-05, + "loss": 7.6325, + "step": 101 + }, + { + "epoch": 0.0006066228946617185, + "grad_norm": 3.8681981563568115, + "learning_rate": 4.999995548671684e-05, + "loss": 7.7104, + "step": 102 + }, + { + "epoch": 0.0006125701779427158, + "grad_norm": 3.869074583053589, + "learning_rate": 4.9999954600902334e-05, + "loss": 7.8445, + "step": 103 + }, + { + "epoch": 0.000618517461223713, + "grad_norm": 3.852057695388794, + "learning_rate": 4.99999537063606e-05, + "loss": 7.872, + "step": 104 + }, + { + "epoch": 0.0006244647445047103, + "grad_norm": 4.784586429595947, + "learning_rate": 4.9999952803091654e-05, + "loss": 9.2218, + "step": 105 + }, + { + "epoch": 0.0006304120277857074, + "grad_norm": 4.296675682067871, + "learning_rate": 4.9999951891095474e-05, + "loss": 9.0957, + "step": 106 + }, + { + "epoch": 0.0006363593110667047, + "grad_norm": 3.9155995845794678, + "learning_rate": 4.999995097037207e-05, + "loss": 8.9829, + "step": 107 + }, + { + "epoch": 0.000642306594347702, + "grad_norm": 3.8967478275299072, + "learning_rate": 4.999995004092144e-05, + "loss": 8.2017, + "step": 108 + }, + { + "epoch": 0.0006482538776286992, + "grad_norm": 5.238500595092773, + "learning_rate": 4.999994910274358e-05, + "loss": 7.7976, + "step": 109 + }, + { + "epoch": 0.0006542011609096965, + "grad_norm": 3.7043144702911377, + "learning_rate": 4.9999948155838504e-05, + "loss": 8.3116, + "step": 110 + }, + { + "epoch": 0.0006601484441906937, + "grad_norm": 2.9745211601257324, + "learning_rate": 4.99999472002062e-05, + "loss": 8.69, + "step": 111 + }, + { + "epoch": 0.000666095727471691, + "grad_norm": 3.172652006149292, + "learning_rate": 4.999994623584668e-05, + "loss": 8.6244, + "step": 112 + }, + { + "epoch": 0.0006720430107526882, + "grad_norm": 3.224888801574707, + "learning_rate": 4.999994526275993e-05, + "loss": 8.6823, + "step": 113 + }, + { + "epoch": 0.0006779902940336854, + "grad_norm": 3.53104305267334, + "learning_rate": 4.9999944280945964e-05, + "loss": 8.495, + "step": 114 + }, + { + "epoch": 0.0006839375773146826, + "grad_norm": 3.013505697250366, + "learning_rate": 4.999994329040477e-05, + "loss": 8.4807, + "step": 115 + }, + { + "epoch": 0.0006898848605956799, + "grad_norm": 4.4741339683532715, + "learning_rate": 4.999994229113636e-05, + "loss": 8.94, + "step": 116 + }, + { + "epoch": 0.0006958321438766771, + "grad_norm": 4.78712272644043, + "learning_rate": 4.999994128314072e-05, + "loss": 8.9367, + "step": 117 + }, + { + "epoch": 0.0007017794271576744, + "grad_norm": 3.6983933448791504, + "learning_rate": 4.999994026641787e-05, + "loss": 8.7524, + "step": 118 + }, + { + "epoch": 0.0007077267104386716, + "grad_norm": 3.74997615814209, + "learning_rate": 4.9999939240967784e-05, + "loss": 8.3417, + "step": 119 + }, + { + "epoch": 0.0007136739937196689, + "grad_norm": 3.614593982696533, + "learning_rate": 4.999993820679049e-05, + "loss": 8.4848, + "step": 120 + }, + { + "epoch": 0.000719621277000666, + "grad_norm": 2.903045654296875, + "learning_rate": 4.999993716388597e-05, + "loss": 8.5519, + "step": 121 + }, + { + "epoch": 0.0007255685602816633, + "grad_norm": 3.402444839477539, + "learning_rate": 4.999993611225423e-05, + "loss": 8.2905, + "step": 122 + }, + { + "epoch": 0.0007315158435626606, + "grad_norm": 3.663893938064575, + "learning_rate": 4.9999935051895274e-05, + "loss": 8.4842, + "step": 123 + }, + { + "epoch": 0.0007374631268436578, + "grad_norm": 3.7535622119903564, + "learning_rate": 4.99999339828091e-05, + "loss": 8.4766, + "step": 124 + }, + { + "epoch": 0.0007434104101246551, + "grad_norm": 3.1285574436187744, + "learning_rate": 4.99999329049957e-05, + "loss": 8.3716, + "step": 125 + }, + { + "epoch": 0.0007493576934056523, + "grad_norm": 3.648869752883911, + "learning_rate": 4.9999931818455086e-05, + "loss": 8.3413, + "step": 126 + }, + { + "epoch": 0.0007553049766866496, + "grad_norm": 3.253399133682251, + "learning_rate": 4.9999930723187255e-05, + "loss": 8.0412, + "step": 127 + }, + { + "epoch": 0.0007612522599676468, + "grad_norm": 3.5694124698638916, + "learning_rate": 4.999992961919221e-05, + "loss": 8.0895, + "step": 128 + }, + { + "epoch": 0.000767199543248644, + "grad_norm": 4.106658458709717, + "learning_rate": 4.999992850646994e-05, + "loss": 8.3654, + "step": 129 + }, + { + "epoch": 0.0007731468265296412, + "grad_norm": 4.082829475402832, + "learning_rate": 4.9999927385020455e-05, + "loss": 8.2663, + "step": 130 + }, + { + "epoch": 0.0007790941098106385, + "grad_norm": 4.349386215209961, + "learning_rate": 4.9999926254843753e-05, + "loss": 8.2435, + "step": 131 + }, + { + "epoch": 0.0007850413930916357, + "grad_norm": 3.375697135925293, + "learning_rate": 4.999992511593984e-05, + "loss": 8.0827, + "step": 132 + }, + { + "epoch": 0.000790988676372633, + "grad_norm": 3.2566957473754883, + "learning_rate": 4.999992396830871e-05, + "loss": 8.4891, + "step": 133 + }, + { + "epoch": 0.0007969359596536302, + "grad_norm": 3.791579008102417, + "learning_rate": 4.999992281195036e-05, + "loss": 8.1567, + "step": 134 + }, + { + "epoch": 0.0008028832429346275, + "grad_norm": 3.8741838932037354, + "learning_rate": 4.99999216468648e-05, + "loss": 8.4033, + "step": 135 + }, + { + "epoch": 0.0008088305262156248, + "grad_norm": 4.229452133178711, + "learning_rate": 4.999992047305203e-05, + "loss": 8.3897, + "step": 136 + }, + { + "epoch": 0.0008147778094966219, + "grad_norm": 3.2732088565826416, + "learning_rate": 4.9999919290512034e-05, + "loss": 8.1758, + "step": 137 + }, + { + "epoch": 0.0008207250927776192, + "grad_norm": 3.2048966884613037, + "learning_rate": 4.9999918099244836e-05, + "loss": 8.1459, + "step": 138 + }, + { + "epoch": 0.0008266723760586164, + "grad_norm": 3.8639938831329346, + "learning_rate": 4.999991689925042e-05, + "loss": 7.9437, + "step": 139 + }, + { + "epoch": 0.0008326196593396137, + "grad_norm": 3.297252655029297, + "learning_rate": 4.9999915690528794e-05, + "loss": 8.1751, + "step": 140 + }, + { + "epoch": 0.0008385669426206109, + "grad_norm": 3.878218173980713, + "learning_rate": 4.999991447307995e-05, + "loss": 8.0572, + "step": 141 + }, + { + "epoch": 0.0008445142259016082, + "grad_norm": 3.6870739459991455, + "learning_rate": 4.9999913246903895e-05, + "loss": 8.0958, + "step": 142 + }, + { + "epoch": 0.0008504615091826054, + "grad_norm": 3.1817922592163086, + "learning_rate": 4.9999912012000636e-05, + "loss": 8.2683, + "step": 143 + }, + { + "epoch": 0.0008564087924636027, + "grad_norm": 3.4008772373199463, + "learning_rate": 4.999991076837016e-05, + "loss": 8.4171, + "step": 144 + }, + { + "epoch": 0.0008623560757445998, + "grad_norm": 3.002333641052246, + "learning_rate": 4.999990951601247e-05, + "loss": 8.1149, + "step": 145 + }, + { + "epoch": 0.0008683033590255971, + "grad_norm": 3.51910662651062, + "learning_rate": 4.999990825492757e-05, + "loss": 8.5284, + "step": 146 + }, + { + "epoch": 0.0008742506423065943, + "grad_norm": 2.978875160217285, + "learning_rate": 4.999990698511548e-05, + "loss": 8.4855, + "step": 147 + }, + { + "epoch": 0.0008801979255875916, + "grad_norm": 3.4708774089813232, + "learning_rate": 4.999990570657616e-05, + "loss": 8.333, + "step": 148 + }, + { + "epoch": 0.0008861452088685888, + "grad_norm": 2.994084596633911, + "learning_rate": 4.999990441930963e-05, + "loss": 8.3456, + "step": 149 + }, + { + "epoch": 0.0008920924921495861, + "grad_norm": 3.1295697689056396, + "learning_rate": 4.99999031233159e-05, + "loss": 8.2204, + "step": 150 + }, + { + "epoch": 0.0008980397754305833, + "grad_norm": 3.349720001220703, + "learning_rate": 4.9999901818594966e-05, + "loss": 8.2739, + "step": 151 + }, + { + "epoch": 0.0009039870587115806, + "grad_norm": 3.852964401245117, + "learning_rate": 4.999990050514681e-05, + "loss": 8.4225, + "step": 152 + }, + { + "epoch": 0.0009099343419925777, + "grad_norm": 3.92203950881958, + "learning_rate": 4.9999899182971456e-05, + "loss": 8.2882, + "step": 153 + }, + { + "epoch": 0.000915881625273575, + "grad_norm": 3.9960269927978516, + "learning_rate": 4.99998978520689e-05, + "loss": 8.2091, + "step": 154 + }, + { + "epoch": 0.0009218289085545723, + "grad_norm": 3.952327251434326, + "learning_rate": 4.999989651243913e-05, + "loss": 8.1726, + "step": 155 + }, + { + "epoch": 0.0009277761918355695, + "grad_norm": 3.9594647884368896, + "learning_rate": 4.9999895164082156e-05, + "loss": 8.0241, + "step": 156 + }, + { + "epoch": 0.0009337234751165668, + "grad_norm": 3.1129961013793945, + "learning_rate": 4.999989380699798e-05, + "loss": 8.14, + "step": 157 + }, + { + "epoch": 0.000939670758397564, + "grad_norm": 4.7737860679626465, + "learning_rate": 4.9999892441186604e-05, + "loss": 7.869, + "step": 158 + }, + { + "epoch": 0.0009456180416785613, + "grad_norm": 3.351327657699585, + "learning_rate": 4.9999891066648006e-05, + "loss": 8.1831, + "step": 159 + }, + { + "epoch": 0.0009515653249595584, + "grad_norm": 3.0245375633239746, + "learning_rate": 4.999988968338222e-05, + "loss": 8.3871, + "step": 160 + }, + { + "epoch": 0.0009575126082405557, + "grad_norm": 4.766855716705322, + "learning_rate": 4.999988829138923e-05, + "loss": 8.0078, + "step": 161 + }, + { + "epoch": 0.0009634598915215529, + "grad_norm": 3.975804090499878, + "learning_rate": 4.999988689066903e-05, + "loss": 7.6923, + "step": 162 + }, + { + "epoch": 0.0009694071748025502, + "grad_norm": 4.024605751037598, + "learning_rate": 4.999988548122163e-05, + "loss": 8.2986, + "step": 163 + }, + { + "epoch": 0.0009753544580835474, + "grad_norm": 4.230019569396973, + "learning_rate": 4.999988406304703e-05, + "loss": 8.2903, + "step": 164 + }, + { + "epoch": 0.0009813017413645446, + "grad_norm": 3.972825050354004, + "learning_rate": 4.9999882636145236e-05, + "loss": 8.3589, + "step": 165 + }, + { + "epoch": 0.0009872490246455418, + "grad_norm": 3.6381688117980957, + "learning_rate": 4.999988120051623e-05, + "loss": 8.2648, + "step": 166 + }, + { + "epoch": 0.000993196307926539, + "grad_norm": 4.203462600708008, + "learning_rate": 4.9999879756160025e-05, + "loss": 8.363, + "step": 167 + }, + { + "epoch": 0.0009991435912075363, + "grad_norm": 2.944103479385376, + "learning_rate": 4.9999878303076624e-05, + "loss": 7.9752, + "step": 168 + }, + { + "epoch": 0.0010050908744885336, + "grad_norm": 3.4115283489227295, + "learning_rate": 4.9999876841266025e-05, + "loss": 8.1044, + "step": 169 + }, + { + "epoch": 0.0010110381577695309, + "grad_norm": 4.185582160949707, + "learning_rate": 4.999987537072822e-05, + "loss": 8.0347, + "step": 170 + }, + { + "epoch": 0.0010169854410505281, + "grad_norm": 3.333649158477783, + "learning_rate": 4.999987389146323e-05, + "loss": 8.0545, + "step": 171 + }, + { + "epoch": 0.0010229327243315254, + "grad_norm": 3.7702765464782715, + "learning_rate": 4.999987240347103e-05, + "loss": 7.8936, + "step": 172 + }, + { + "epoch": 0.0010288800076125226, + "grad_norm": 4.113167762756348, + "learning_rate": 4.9999870906751636e-05, + "loss": 7.9447, + "step": 173 + }, + { + "epoch": 0.0010348272908935199, + "grad_norm": 3.370821714401245, + "learning_rate": 4.999986940130505e-05, + "loss": 7.9745, + "step": 174 + }, + { + "epoch": 0.0010407745741745171, + "grad_norm": 3.552391767501831, + "learning_rate": 4.999986788713126e-05, + "loss": 7.8882, + "step": 175 + }, + { + "epoch": 0.0010467218574555144, + "grad_norm": 3.3497536182403564, + "learning_rate": 4.999986636423028e-05, + "loss": 7.8601, + "step": 176 + }, + { + "epoch": 0.0010526691407365116, + "grad_norm": 3.256685733795166, + "learning_rate": 4.9999864832602105e-05, + "loss": 7.8341, + "step": 177 + }, + { + "epoch": 0.001058616424017509, + "grad_norm": 3.028108835220337, + "learning_rate": 4.999986329224674e-05, + "loss": 7.884, + "step": 178 + }, + { + "epoch": 0.0010645637072985061, + "grad_norm": 2.9583778381347656, + "learning_rate": 4.9999861743164165e-05, + "loss": 7.7875, + "step": 179 + }, + { + "epoch": 0.0010705109905795032, + "grad_norm": 3.109215497970581, + "learning_rate": 4.999986018535441e-05, + "loss": 8.4081, + "step": 180 + }, + { + "epoch": 0.0010764582738605004, + "grad_norm": 3.8907759189605713, + "learning_rate": 4.999985861881746e-05, + "loss": 8.0971, + "step": 181 + }, + { + "epoch": 0.0010824055571414977, + "grad_norm": 4.20400857925415, + "learning_rate": 4.9999857043553314e-05, + "loss": 7.9077, + "step": 182 + }, + { + "epoch": 0.001088352840422495, + "grad_norm": 3.580486297607422, + "learning_rate": 4.999985545956198e-05, + "loss": 7.8935, + "step": 183 + }, + { + "epoch": 0.0010943001237034922, + "grad_norm": 3.3833847045898438, + "learning_rate": 4.999985386684345e-05, + "loss": 7.9956, + "step": 184 + }, + { + "epoch": 0.0011002474069844895, + "grad_norm": 2.8848624229431152, + "learning_rate": 4.9999852265397734e-05, + "loss": 8.0718, + "step": 185 + }, + { + "epoch": 0.0011061946902654867, + "grad_norm": 3.8933818340301514, + "learning_rate": 4.999985065522483e-05, + "loss": 8.0517, + "step": 186 + }, + { + "epoch": 0.001112141973546484, + "grad_norm": 3.6559605598449707, + "learning_rate": 4.999984903632473e-05, + "loss": 8.3664, + "step": 187 + }, + { + "epoch": 0.0011180892568274812, + "grad_norm": 3.4633536338806152, + "learning_rate": 4.999984740869744e-05, + "loss": 8.3481, + "step": 188 + }, + { + "epoch": 0.0011240365401084785, + "grad_norm": 3.483020305633545, + "learning_rate": 4.999984577234297e-05, + "loss": 8.3407, + "step": 189 + }, + { + "epoch": 0.0011299838233894757, + "grad_norm": 2.772434711456299, + "learning_rate": 4.999984412726131e-05, + "loss": 8.4524, + "step": 190 + }, + { + "epoch": 0.001135931106670473, + "grad_norm": 3.3341007232666016, + "learning_rate": 4.999984247345246e-05, + "loss": 8.1063, + "step": 191 + }, + { + "epoch": 0.0011418783899514702, + "grad_norm": 3.0063467025756836, + "learning_rate": 4.999984081091642e-05, + "loss": 8.0077, + "step": 192 + }, + { + "epoch": 0.0011478256732324675, + "grad_norm": 2.9670779705047607, + "learning_rate": 4.99998391396532e-05, + "loss": 8.2338, + "step": 193 + }, + { + "epoch": 0.0011537729565134647, + "grad_norm": 3.024505138397217, + "learning_rate": 4.999983745966279e-05, + "loss": 8.1794, + "step": 194 + }, + { + "epoch": 0.0011597202397944618, + "grad_norm": 2.834131956100464, + "learning_rate": 4.9999835770945195e-05, + "loss": 8.2078, + "step": 195 + }, + { + "epoch": 0.001165667523075459, + "grad_norm": 3.555525064468384, + "learning_rate": 4.999983407350042e-05, + "loss": 8.0838, + "step": 196 + }, + { + "epoch": 0.0011716148063564563, + "grad_norm": 3.5013587474823, + "learning_rate": 4.999983236732846e-05, + "loss": 8.092, + "step": 197 + }, + { + "epoch": 0.0011775620896374535, + "grad_norm": 3.3721518516540527, + "learning_rate": 4.9999830652429314e-05, + "loss": 8.1137, + "step": 198 + }, + { + "epoch": 0.0011835093729184508, + "grad_norm": 3.364952564239502, + "learning_rate": 4.9999828928802986e-05, + "loss": 8.1197, + "step": 199 + }, + { + "epoch": 0.001189456656199448, + "grad_norm": 3.691249132156372, + "learning_rate": 4.999982719644948e-05, + "loss": 8.0922, + "step": 200 + }, + { + "epoch": 0.0011954039394804453, + "grad_norm": 6.919185161590576, + "learning_rate": 4.9999825455368785e-05, + "loss": 7.9215, + "step": 201 + }, + { + "epoch": 0.0012013512227614426, + "grad_norm": 3.3332598209381104, + "learning_rate": 4.999982370556091e-05, + "loss": 7.7605, + "step": 202 + }, + { + "epoch": 0.0012072985060424398, + "grad_norm": 2.842517375946045, + "learning_rate": 4.999982194702586e-05, + "loss": 8.0527, + "step": 203 + }, + { + "epoch": 0.001213245789323437, + "grad_norm": 3.086371660232544, + "learning_rate": 4.999982017976364e-05, + "loss": 8.2637, + "step": 204 + }, + { + "epoch": 0.0012191930726044343, + "grad_norm": 3.0870208740234375, + "learning_rate": 4.999981840377422e-05, + "loss": 8.3538, + "step": 205 + }, + { + "epoch": 0.0012251403558854316, + "grad_norm": 3.1244094371795654, + "learning_rate": 4.9999816619057633e-05, + "loss": 8.4604, + "step": 206 + }, + { + "epoch": 0.0012310876391664288, + "grad_norm": 2.7808034420013428, + "learning_rate": 4.999981482561387e-05, + "loss": 8.3227, + "step": 207 + }, + { + "epoch": 0.001237034922447426, + "grad_norm": 2.791182518005371, + "learning_rate": 4.999981302344292e-05, + "loss": 8.1481, + "step": 208 + }, + { + "epoch": 0.0012429822057284233, + "grad_norm": 3.045971632003784, + "learning_rate": 4.99998112125448e-05, + "loss": 7.7842, + "step": 209 + }, + { + "epoch": 0.0012489294890094206, + "grad_norm": 3.2548067569732666, + "learning_rate": 4.99998093929195e-05, + "loss": 7.9935, + "step": 210 + }, + { + "epoch": 0.0012548767722904176, + "grad_norm": 3.5448713302612305, + "learning_rate": 4.999980756456704e-05, + "loss": 8.0323, + "step": 211 + }, + { + "epoch": 0.0012608240555714149, + "grad_norm": 3.717900514602661, + "learning_rate": 4.9999805727487395e-05, + "loss": 8.0532, + "step": 212 + }, + { + "epoch": 0.0012667713388524121, + "grad_norm": 3.2943921089172363, + "learning_rate": 4.9999803881680576e-05, + "loss": 8.0326, + "step": 213 + }, + { + "epoch": 0.0012727186221334094, + "grad_norm": 3.4586269855499268, + "learning_rate": 4.999980202714658e-05, + "loss": 7.8765, + "step": 214 + }, + { + "epoch": 0.0012786659054144067, + "grad_norm": 3.1898810863494873, + "learning_rate": 4.9999800163885414e-05, + "loss": 7.8859, + "step": 215 + }, + { + "epoch": 0.001284613188695404, + "grad_norm": 2.977229595184326, + "learning_rate": 4.9999798291897084e-05, + "loss": 7.8841, + "step": 216 + }, + { + "epoch": 0.0012905604719764012, + "grad_norm": 3.368680000305176, + "learning_rate": 4.999979641118157e-05, + "loss": 7.8055, + "step": 217 + }, + { + "epoch": 0.0012965077552573984, + "grad_norm": 4.295344352722168, + "learning_rate": 4.9999794521738894e-05, + "loss": 7.6456, + "step": 218 + }, + { + "epoch": 0.0013024550385383957, + "grad_norm": 3.985480546951294, + "learning_rate": 4.999979262356904e-05, + "loss": 7.6987, + "step": 219 + }, + { + "epoch": 0.001308402321819393, + "grad_norm": 3.8719842433929443, + "learning_rate": 4.999979071667202e-05, + "loss": 7.6994, + "step": 220 + }, + { + "epoch": 0.0013143496051003902, + "grad_norm": 4.699835300445557, + "learning_rate": 4.999978880104784e-05, + "loss": 8.1815, + "step": 221 + }, + { + "epoch": 0.0013202968883813874, + "grad_norm": 3.9221127033233643, + "learning_rate": 4.9999786876696485e-05, + "loss": 7.8765, + "step": 222 + }, + { + "epoch": 0.0013262441716623847, + "grad_norm": 4.4223504066467285, + "learning_rate": 4.9999784943617964e-05, + "loss": 7.7244, + "step": 223 + }, + { + "epoch": 0.001332191454943382, + "grad_norm": 3.4598348140716553, + "learning_rate": 4.999978300181227e-05, + "loss": 7.7072, + "step": 224 + }, + { + "epoch": 0.0013381387382243792, + "grad_norm": 3.536752223968506, + "learning_rate": 4.999978105127941e-05, + "loss": 7.6337, + "step": 225 + }, + { + "epoch": 0.0013440860215053765, + "grad_norm": 3.6432204246520996, + "learning_rate": 4.99997790920194e-05, + "loss": 7.8078, + "step": 226 + }, + { + "epoch": 0.0013500333047863735, + "grad_norm": 4.8305768966674805, + "learning_rate": 4.999977712403221e-05, + "loss": 7.9003, + "step": 227 + }, + { + "epoch": 0.0013559805880673707, + "grad_norm": 3.773876428604126, + "learning_rate": 4.999977514731786e-05, + "loss": 8.0513, + "step": 228 + }, + { + "epoch": 0.001361927871348368, + "grad_norm": 4.465645790100098, + "learning_rate": 4.999977316187635e-05, + "loss": 7.9847, + "step": 229 + }, + { + "epoch": 0.0013678751546293653, + "grad_norm": 3.9466493129730225, + "learning_rate": 4.9999771167707674e-05, + "loss": 7.9902, + "step": 230 + }, + { + "epoch": 0.0013738224379103625, + "grad_norm": 4.432138919830322, + "learning_rate": 4.9999769164811846e-05, + "loss": 7.8929, + "step": 231 + }, + { + "epoch": 0.0013797697211913598, + "grad_norm": 3.5211949348449707, + "learning_rate": 4.999976715318885e-05, + "loss": 8.1838, + "step": 232 + }, + { + "epoch": 0.001385717004472357, + "grad_norm": 3.0819287300109863, + "learning_rate": 4.9999765132838686e-05, + "loss": 8.2823, + "step": 233 + }, + { + "epoch": 0.0013916642877533543, + "grad_norm": 3.436112880706787, + "learning_rate": 4.9999763103761374e-05, + "loss": 7.7796, + "step": 234 + }, + { + "epoch": 0.0013976115710343515, + "grad_norm": 3.6699061393737793, + "learning_rate": 4.99997610659569e-05, + "loss": 7.5792, + "step": 235 + }, + { + "epoch": 0.0014035588543153488, + "grad_norm": 3.814182758331299, + "learning_rate": 4.999975901942526e-05, + "loss": 7.5631, + "step": 236 + }, + { + "epoch": 0.001409506137596346, + "grad_norm": 3.84110164642334, + "learning_rate": 4.9999756964166465e-05, + "loss": 7.4244, + "step": 237 + }, + { + "epoch": 0.0014154534208773433, + "grad_norm": 3.278045415878296, + "learning_rate": 4.999975490018052e-05, + "loss": 7.9049, + "step": 238 + }, + { + "epoch": 0.0014214007041583405, + "grad_norm": 3.5502712726593018, + "learning_rate": 4.999975282746742e-05, + "loss": 8.0021, + "step": 239 + }, + { + "epoch": 0.0014273479874393378, + "grad_norm": 2.7919108867645264, + "learning_rate": 4.9999750746027153e-05, + "loss": 8.2854, + "step": 240 + }, + { + "epoch": 0.001433295270720335, + "grad_norm": 3.1689581871032715, + "learning_rate": 4.999974865585973e-05, + "loss": 8.3177, + "step": 241 + }, + { + "epoch": 0.001439242554001332, + "grad_norm": 2.728679656982422, + "learning_rate": 4.999974655696517e-05, + "loss": 8.3181, + "step": 242 + }, + { + "epoch": 0.0014451898372823293, + "grad_norm": 3.5175108909606934, + "learning_rate": 4.9999744449343445e-05, + "loss": 8.03, + "step": 243 + }, + { + "epoch": 0.0014511371205633266, + "grad_norm": 3.714219808578491, + "learning_rate": 4.999974233299457e-05, + "loss": 8.0824, + "step": 244 + }, + { + "epoch": 0.0014570844038443239, + "grad_norm": 3.42090106010437, + "learning_rate": 4.9999740207918546e-05, + "loss": 8.0455, + "step": 245 + }, + { + "epoch": 0.001463031687125321, + "grad_norm": 3.035047769546509, + "learning_rate": 4.999973807411537e-05, + "loss": 8.0117, + "step": 246 + }, + { + "epoch": 0.0014689789704063184, + "grad_norm": 3.4878122806549072, + "learning_rate": 4.9999735931585034e-05, + "loss": 8.1368, + "step": 247 + }, + { + "epoch": 0.0014749262536873156, + "grad_norm": 3.648115873336792, + "learning_rate": 4.999973378032756e-05, + "loss": 7.9987, + "step": 248 + }, + { + "epoch": 0.0014808735369683129, + "grad_norm": 3.171255588531494, + "learning_rate": 4.9999731620342936e-05, + "loss": 7.9733, + "step": 249 + }, + { + "epoch": 0.0014868208202493101, + "grad_norm": 3.157804250717163, + "learning_rate": 4.999972945163116e-05, + "loss": 7.8511, + "step": 250 + }, + { + "epoch": 0.0014927681035303074, + "grad_norm": 3.4346978664398193, + "learning_rate": 4.999972727419224e-05, + "loss": 7.9075, + "step": 251 + }, + { + "epoch": 0.0014987153868113046, + "grad_norm": 3.281135082244873, + "learning_rate": 4.9999725088026175e-05, + "loss": 7.876, + "step": 252 + }, + { + "epoch": 0.0015046626700923019, + "grad_norm": 3.1481714248657227, + "learning_rate": 4.9999722893132954e-05, + "loss": 8.1458, + "step": 253 + }, + { + "epoch": 0.0015106099533732991, + "grad_norm": 2.821460247039795, + "learning_rate": 4.99997206895126e-05, + "loss": 7.9141, + "step": 254 + }, + { + "epoch": 0.0015165572366542964, + "grad_norm": 2.887997627258301, + "learning_rate": 4.999971847716509e-05, + "loss": 8.2246, + "step": 255 + }, + { + "epoch": 0.0015225045199352936, + "grad_norm": 2.8097078800201416, + "learning_rate": 4.999971625609044e-05, + "loss": 7.8576, + "step": 256 + }, + { + "epoch": 0.001528451803216291, + "grad_norm": 2.9272890090942383, + "learning_rate": 4.999971402628866e-05, + "loss": 7.6856, + "step": 257 + }, + { + "epoch": 0.001534399086497288, + "grad_norm": 3.487027168273926, + "learning_rate": 4.999971178775973e-05, + "loss": 7.8179, + "step": 258 + }, + { + "epoch": 0.0015403463697782852, + "grad_norm": 3.575681209564209, + "learning_rate": 4.9999709540503656e-05, + "loss": 7.8115, + "step": 259 + }, + { + "epoch": 0.0015462936530592824, + "grad_norm": 3.457756757736206, + "learning_rate": 4.9999707284520435e-05, + "loss": 7.7985, + "step": 260 + }, + { + "epoch": 0.0015522409363402797, + "grad_norm": 3.732728958129883, + "learning_rate": 4.999970501981009e-05, + "loss": 7.8369, + "step": 261 + }, + { + "epoch": 0.001558188219621277, + "grad_norm": 4.1466898918151855, + "learning_rate": 4.99997027463726e-05, + "loss": 8.2435, + "step": 262 + }, + { + "epoch": 0.0015641355029022742, + "grad_norm": 4.028534889221191, + "learning_rate": 4.9999700464207965e-05, + "loss": 8.2338, + "step": 263 + }, + { + "epoch": 0.0015700827861832715, + "grad_norm": 3.7445273399353027, + "learning_rate": 4.99996981733162e-05, + "loss": 8.1182, + "step": 264 + }, + { + "epoch": 0.0015760300694642687, + "grad_norm": 3.455228567123413, + "learning_rate": 4.99996958736973e-05, + "loss": 8.1932, + "step": 265 + }, + { + "epoch": 0.001581977352745266, + "grad_norm": 3.1530332565307617, + "learning_rate": 4.9999693565351256e-05, + "loss": 7.8304, + "step": 266 + }, + { + "epoch": 0.0015879246360262632, + "grad_norm": 3.113161325454712, + "learning_rate": 4.999969124827809e-05, + "loss": 7.6625, + "step": 267 + }, + { + "epoch": 0.0015938719193072605, + "grad_norm": 3.621076822280884, + "learning_rate": 4.999968892247778e-05, + "loss": 8.0983, + "step": 268 + }, + { + "epoch": 0.0015998192025882577, + "grad_norm": 3.533395767211914, + "learning_rate": 4.9999686587950346e-05, + "loss": 7.9564, + "step": 269 + }, + { + "epoch": 0.001605766485869255, + "grad_norm": 3.6486849784851074, + "learning_rate": 4.999968424469577e-05, + "loss": 7.9864, + "step": 270 + }, + { + "epoch": 0.0016117137691502522, + "grad_norm": 3.223167657852173, + "learning_rate": 4.999968189271407e-05, + "loss": 7.8516, + "step": 271 + }, + { + "epoch": 0.0016176610524312495, + "grad_norm": 3.282062530517578, + "learning_rate": 4.999967953200523e-05, + "loss": 7.9247, + "step": 272 + }, + { + "epoch": 0.0016236083357122465, + "grad_norm": 2.8589930534362793, + "learning_rate": 4.999967716256927e-05, + "loss": 7.8871, + "step": 273 + }, + { + "epoch": 0.0016295556189932438, + "grad_norm": 3.136882781982422, + "learning_rate": 4.9999674784406174e-05, + "loss": 7.8793, + "step": 274 + }, + { + "epoch": 0.001635502902274241, + "grad_norm": 3.9103915691375732, + "learning_rate": 4.999967239751595e-05, + "loss": 7.9005, + "step": 275 + }, + { + "epoch": 0.0016414501855552383, + "grad_norm": 4.40267276763916, + "learning_rate": 4.99996700018986e-05, + "loss": 7.9247, + "step": 276 + }, + { + "epoch": 0.0016473974688362356, + "grad_norm": 3.6620242595672607, + "learning_rate": 4.9999667597554136e-05, + "loss": 8.0719, + "step": 277 + }, + { + "epoch": 0.0016533447521172328, + "grad_norm": 3.1278858184814453, + "learning_rate": 4.999966518448253e-05, + "loss": 8.0822, + "step": 278 + }, + { + "epoch": 0.00165929203539823, + "grad_norm": 3.321831464767456, + "learning_rate": 4.9999662762683805e-05, + "loss": 8.1266, + "step": 279 + }, + { + "epoch": 0.0016652393186792273, + "grad_norm": 3.4116811752319336, + "learning_rate": 4.999966033215795e-05, + "loss": 8.2159, + "step": 280 + }, + { + "epoch": 0.0016711866019602246, + "grad_norm": 3.58381724357605, + "learning_rate": 4.999965789290498e-05, + "loss": 8.0275, + "step": 281 + }, + { + "epoch": 0.0016771338852412218, + "grad_norm": 3.0357518196105957, + "learning_rate": 4.9999655444924884e-05, + "loss": 8.1171, + "step": 282 + }, + { + "epoch": 0.001683081168522219, + "grad_norm": 3.237764596939087, + "learning_rate": 4.999965298821767e-05, + "loss": 7.822, + "step": 283 + }, + { + "epoch": 0.0016890284518032163, + "grad_norm": 3.0861873626708984, + "learning_rate": 4.999965052278334e-05, + "loss": 7.7991, + "step": 284 + }, + { + "epoch": 0.0016949757350842136, + "grad_norm": 2.8045542240142822, + "learning_rate": 4.999964804862187e-05, + "loss": 7.9659, + "step": 285 + }, + { + "epoch": 0.0017009230183652108, + "grad_norm": 3.1282641887664795, + "learning_rate": 4.9999645565733297e-05, + "loss": 7.8354, + "step": 286 + }, + { + "epoch": 0.001706870301646208, + "grad_norm": 2.980001211166382, + "learning_rate": 4.999964307411761e-05, + "loss": 7.806, + "step": 287 + }, + { + "epoch": 0.0017128175849272054, + "grad_norm": 3.114238977432251, + "learning_rate": 4.99996405737748e-05, + "loss": 7.6173, + "step": 288 + }, + { + "epoch": 0.0017187648682082024, + "grad_norm": 2.6732640266418457, + "learning_rate": 4.9999638064704866e-05, + "loss": 7.5944, + "step": 289 + }, + { + "epoch": 0.0017247121514891996, + "grad_norm": 3.2139906883239746, + "learning_rate": 4.999963554690783e-05, + "loss": 7.5738, + "step": 290 + }, + { + "epoch": 0.001730659434770197, + "grad_norm": 3.0964555740356445, + "learning_rate": 4.999963302038368e-05, + "loss": 7.4431, + "step": 291 + }, + { + "epoch": 0.0017366067180511942, + "grad_norm": 3.0611374378204346, + "learning_rate": 4.99996304851324e-05, + "loss": 7.3748, + "step": 292 + }, + { + "epoch": 0.0017425540013321914, + "grad_norm": 2.88114333152771, + "learning_rate": 4.999962794115402e-05, + "loss": 7.3554, + "step": 293 + }, + { + "epoch": 0.0017485012846131887, + "grad_norm": 2.895141363143921, + "learning_rate": 4.999962538844852e-05, + "loss": 7.2801, + "step": 294 + }, + { + "epoch": 0.001754448567894186, + "grad_norm": 3.0645008087158203, + "learning_rate": 4.9999622827015914e-05, + "loss": 7.1753, + "step": 295 + }, + { + "epoch": 0.0017603958511751832, + "grad_norm": 3.0750465393066406, + "learning_rate": 4.99996202568562e-05, + "loss": 7.1905, + "step": 296 + }, + { + "epoch": 0.0017663431344561804, + "grad_norm": 3.1322436332702637, + "learning_rate": 4.9999617677969374e-05, + "loss": 7.0851, + "step": 297 + }, + { + "epoch": 0.0017722904177371777, + "grad_norm": 3.8287153244018555, + "learning_rate": 4.999961509035544e-05, + "loss": 7.0842, + "step": 298 + }, + { + "epoch": 0.001778237701018175, + "grad_norm": 2.874312162399292, + "learning_rate": 4.9999612494014403e-05, + "loss": 6.9588, + "step": 299 + }, + { + "epoch": 0.0017841849842991722, + "grad_norm": 2.916250705718994, + "learning_rate": 4.999960988894625e-05, + "loss": 7.1342, + "step": 300 + }, + { + "epoch": 0.0017901322675801694, + "grad_norm": 2.71624755859375, + "learning_rate": 4.9999607275151e-05, + "loss": 7.0418, + "step": 301 + }, + { + "epoch": 0.0017960795508611667, + "grad_norm": 2.655630350112915, + "learning_rate": 4.999960465262864e-05, + "loss": 6.937, + "step": 302 + }, + { + "epoch": 0.001802026834142164, + "grad_norm": 2.8819122314453125, + "learning_rate": 4.999960202137918e-05, + "loss": 7.0116, + "step": 303 + }, + { + "epoch": 0.0018079741174231612, + "grad_norm": 2.909701108932495, + "learning_rate": 4.999959938140262e-05, + "loss": 6.9588, + "step": 304 + }, + { + "epoch": 0.0018139214007041582, + "grad_norm": 3.276395797729492, + "learning_rate": 4.999959673269895e-05, + "loss": 6.9066, + "step": 305 + }, + { + "epoch": 0.0018198686839851555, + "grad_norm": 2.8774867057800293, + "learning_rate": 4.9999594075268186e-05, + "loss": 7.0112, + "step": 306 + }, + { + "epoch": 0.0018258159672661528, + "grad_norm": 2.9667818546295166, + "learning_rate": 4.999959140911032e-05, + "loss": 7.1467, + "step": 307 + }, + { + "epoch": 0.00183176325054715, + "grad_norm": 6.6612958908081055, + "learning_rate": 4.999958873422536e-05, + "loss": 8.4457, + "step": 308 + }, + { + "epoch": 0.0018377105338281473, + "grad_norm": 4.234557628631592, + "learning_rate": 4.999958605061329e-05, + "loss": 8.904, + "step": 309 + }, + { + "epoch": 0.0018436578171091445, + "grad_norm": 4.049502372741699, + "learning_rate": 4.999958335827413e-05, + "loss": 7.5174, + "step": 310 + }, + { + "epoch": 0.0018496051003901418, + "grad_norm": 3.574474334716797, + "learning_rate": 4.999958065720787e-05, + "loss": 8.6537, + "step": 311 + }, + { + "epoch": 0.001855552383671139, + "grad_norm": 3.6154026985168457, + "learning_rate": 4.9999577947414515e-05, + "loss": 8.5833, + "step": 312 + }, + { + "epoch": 0.0018614996669521363, + "grad_norm": 2.9204158782958984, + "learning_rate": 4.999957522889407e-05, + "loss": 8.5486, + "step": 313 + }, + { + "epoch": 0.0018674469502331335, + "grad_norm": 3.095310688018799, + "learning_rate": 4.999957250164653e-05, + "loss": 8.3855, + "step": 314 + }, + { + "epoch": 0.0018733942335141308, + "grad_norm": 3.872267723083496, + "learning_rate": 4.999956976567189e-05, + "loss": 8.2715, + "step": 315 + }, + { + "epoch": 0.001879341516795128, + "grad_norm": 3.5560686588287354, + "learning_rate": 4.9999567020970175e-05, + "loss": 8.1571, + "step": 316 + }, + { + "epoch": 0.0018852888000761253, + "grad_norm": 2.6759164333343506, + "learning_rate": 4.9999564267541356e-05, + "loss": 8.4072, + "step": 317 + }, + { + "epoch": 0.0018912360833571226, + "grad_norm": 4.034712791442871, + "learning_rate": 4.999956150538545e-05, + "loss": 7.7622, + "step": 318 + }, + { + "epoch": 0.0018971833666381198, + "grad_norm": 3.8927831649780273, + "learning_rate": 4.999955873450246e-05, + "loss": 7.5012, + "step": 319 + }, + { + "epoch": 0.0019031306499191168, + "grad_norm": 3.4422812461853027, + "learning_rate": 4.999955595489237e-05, + "loss": 7.6894, + "step": 320 + }, + { + "epoch": 0.001909077933200114, + "grad_norm": 3.0367283821105957, + "learning_rate": 4.999955316655521e-05, + "loss": 7.8151, + "step": 321 + }, + { + "epoch": 0.0019150252164811114, + "grad_norm": 3.7553489208221436, + "learning_rate": 4.9999550369490955e-05, + "loss": 8.0462, + "step": 322 + }, + { + "epoch": 0.0019209724997621086, + "grad_norm": 3.432591438293457, + "learning_rate": 4.999954756369962e-05, + "loss": 7.8782, + "step": 323 + }, + { + "epoch": 0.0019269197830431059, + "grad_norm": 2.7325966358184814, + "learning_rate": 4.9999544749181196e-05, + "loss": 7.9045, + "step": 324 + }, + { + "epoch": 0.0019328670663241031, + "grad_norm": 4.31963586807251, + "learning_rate": 4.9999541925935686e-05, + "loss": 7.7791, + "step": 325 + }, + { + "epoch": 0.0019388143496051004, + "grad_norm": 2.840189218521118, + "learning_rate": 4.999953909396311e-05, + "loss": 7.8334, + "step": 326 + }, + { + "epoch": 0.0019447616328860976, + "grad_norm": 3.2388041019439697, + "learning_rate": 4.9999536253263434e-05, + "loss": 7.6756, + "step": 327 + }, + { + "epoch": 0.0019507089161670949, + "grad_norm": 3.6291563510894775, + "learning_rate": 4.999953340383669e-05, + "loss": 7.6511, + "step": 328 + }, + { + "epoch": 0.001956656199448092, + "grad_norm": 3.35703706741333, + "learning_rate": 4.999953054568287e-05, + "loss": 7.6382, + "step": 329 + }, + { + "epoch": 0.001962603482729089, + "grad_norm": 3.117281198501587, + "learning_rate": 4.999952767880196e-05, + "loss": 7.6233, + "step": 330 + }, + { + "epoch": 0.0019685507660100864, + "grad_norm": 2.8385257720947266, + "learning_rate": 4.999952480319398e-05, + "loss": 7.6594, + "step": 331 + }, + { + "epoch": 0.0019744980492910837, + "grad_norm": 2.5914418697357178, + "learning_rate": 4.999952191885893e-05, + "loss": 8.2647, + "step": 332 + }, + { + "epoch": 0.001980445332572081, + "grad_norm": 2.5847742557525635, + "learning_rate": 4.9999519025796795e-05, + "loss": 8.339, + "step": 333 + }, + { + "epoch": 0.001986392615853078, + "grad_norm": 2.7022132873535156, + "learning_rate": 4.999951612400759e-05, + "loss": 7.9114, + "step": 334 + }, + { + "epoch": 0.0019923398991340754, + "grad_norm": 3.0290884971618652, + "learning_rate": 4.999951321349131e-05, + "loss": 7.4531, + "step": 335 + }, + { + "epoch": 0.0019982871824150727, + "grad_norm": 2.8910324573516846, + "learning_rate": 4.999951029424796e-05, + "loss": 7.398, + "step": 336 + }, + { + "epoch": 0.00200423446569607, + "grad_norm": 2.8917605876922607, + "learning_rate": 4.9999507366277545e-05, + "loss": 7.48, + "step": 337 + }, + { + "epoch": 0.002010181748977067, + "grad_norm": 2.8957982063293457, + "learning_rate": 4.999950442958005e-05, + "loss": 7.8662, + "step": 338 + }, + { + "epoch": 0.0020161290322580645, + "grad_norm": 3.562232255935669, + "learning_rate": 4.9999501484155485e-05, + "loss": 7.8388, + "step": 339 + }, + { + "epoch": 0.0020220763155390617, + "grad_norm": 2.51676607131958, + "learning_rate": 4.9999498530003866e-05, + "loss": 8.2834, + "step": 340 + }, + { + "epoch": 0.002028023598820059, + "grad_norm": 2.326110363006592, + "learning_rate": 4.999949556712517e-05, + "loss": 8.2528, + "step": 341 + }, + { + "epoch": 0.0020339708821010562, + "grad_norm": 2.7621335983276367, + "learning_rate": 4.999949259551941e-05, + "loss": 7.9791, + "step": 342 + }, + { + "epoch": 0.0020399181653820535, + "grad_norm": 3.045431137084961, + "learning_rate": 4.999948961518659e-05, + "loss": 7.8575, + "step": 343 + }, + { + "epoch": 0.0020458654486630507, + "grad_norm": 3.1940131187438965, + "learning_rate": 4.9999486626126703e-05, + "loss": 7.8581, + "step": 344 + }, + { + "epoch": 0.002051812731944048, + "grad_norm": 2.964136838912964, + "learning_rate": 4.999948362833975e-05, + "loss": 7.9656, + "step": 345 + }, + { + "epoch": 0.0020577600152250452, + "grad_norm": 3.167573928833008, + "learning_rate": 4.999948062182574e-05, + "loss": 7.7448, + "step": 346 + }, + { + "epoch": 0.0020637072985060425, + "grad_norm": 3.062666177749634, + "learning_rate": 4.9999477606584666e-05, + "loss": 7.7655, + "step": 347 + }, + { + "epoch": 0.0020696545817870397, + "grad_norm": 3.1097402572631836, + "learning_rate": 4.999947458261653e-05, + "loss": 7.643, + "step": 348 + }, + { + "epoch": 0.002075601865068037, + "grad_norm": 3.1663928031921387, + "learning_rate": 4.999947154992135e-05, + "loss": 7.8348, + "step": 349 + }, + { + "epoch": 0.0020815491483490343, + "grad_norm": 2.8295886516571045, + "learning_rate": 4.99994685084991e-05, + "loss": 7.7752, + "step": 350 + }, + { + "epoch": 0.0020874964316300315, + "grad_norm": 2.7384233474731445, + "learning_rate": 4.99994654583498e-05, + "loss": 7.7644, + "step": 351 + }, + { + "epoch": 0.0020934437149110288, + "grad_norm": 2.6654486656188965, + "learning_rate": 4.999946239947344e-05, + "loss": 7.7489, + "step": 352 + }, + { + "epoch": 0.002099390998192026, + "grad_norm": 2.8949942588806152, + "learning_rate": 4.999945933187003e-05, + "loss": 7.7105, + "step": 353 + }, + { + "epoch": 0.0021053382814730233, + "grad_norm": 2.590036630630493, + "learning_rate": 4.999945625553957e-05, + "loss": 7.6821, + "step": 354 + }, + { + "epoch": 0.0021112855647540205, + "grad_norm": 3.4601457118988037, + "learning_rate": 4.999945317048205e-05, + "loss": 7.3552, + "step": 355 + }, + { + "epoch": 0.002117232848035018, + "grad_norm": 4.022705078125, + "learning_rate": 4.999945007669748e-05, + "loss": 7.0281, + "step": 356 + }, + { + "epoch": 0.002123180131316015, + "grad_norm": 3.249699592590332, + "learning_rate": 4.999944697418587e-05, + "loss": 7.9279, + "step": 357 + }, + { + "epoch": 0.0021291274145970123, + "grad_norm": 2.8424601554870605, + "learning_rate": 4.99994438629472e-05, + "loss": 8.1485, + "step": 358 + }, + { + "epoch": 0.002135074697878009, + "grad_norm": 3.0473172664642334, + "learning_rate": 4.9999440742981486e-05, + "loss": 8.0877, + "step": 359 + }, + { + "epoch": 0.0021410219811590064, + "grad_norm": 3.0614171028137207, + "learning_rate": 4.9999437614288726e-05, + "loss": 7.7817, + "step": 360 + }, + { + "epoch": 0.0021469692644400036, + "grad_norm": 3.309464931488037, + "learning_rate": 4.9999434476868925e-05, + "loss": 7.857, + "step": 361 + }, + { + "epoch": 0.002152916547721001, + "grad_norm": 3.031921148300171, + "learning_rate": 4.999943133072207e-05, + "loss": 7.6393, + "step": 362 + }, + { + "epoch": 0.002158863831001998, + "grad_norm": 3.3756978511810303, + "learning_rate": 4.999942817584818e-05, + "loss": 7.7422, + "step": 363 + }, + { + "epoch": 0.0021648111142829954, + "grad_norm": 3.53362774848938, + "learning_rate": 4.999942501224724e-05, + "loss": 7.9388, + "step": 364 + }, + { + "epoch": 0.0021707583975639926, + "grad_norm": 3.4082882404327393, + "learning_rate": 4.999942183991927e-05, + "loss": 7.3578, + "step": 365 + }, + { + "epoch": 0.00217670568084499, + "grad_norm": 4.035211086273193, + "learning_rate": 4.999941865886425e-05, + "loss": 7.7833, + "step": 366 + }, + { + "epoch": 0.002182652964125987, + "grad_norm": 3.0394630432128906, + "learning_rate": 4.99994154690822e-05, + "loss": 7.9392, + "step": 367 + }, + { + "epoch": 0.0021886002474069844, + "grad_norm": 3.088926076889038, + "learning_rate": 4.99994122705731e-05, + "loss": 7.8149, + "step": 368 + }, + { + "epoch": 0.0021945475306879817, + "grad_norm": 2.3173277378082275, + "learning_rate": 4.9999409063336976e-05, + "loss": 8.2211, + "step": 369 + }, + { + "epoch": 0.002200494813968979, + "grad_norm": 2.9960854053497314, + "learning_rate": 4.9999405847373815e-05, + "loss": 7.6764, + "step": 370 + }, + { + "epoch": 0.002206442097249976, + "grad_norm": 2.841848134994507, + "learning_rate": 4.999940262268361e-05, + "loss": 7.9418, + "step": 371 + }, + { + "epoch": 0.0022123893805309734, + "grad_norm": 3.748779058456421, + "learning_rate": 4.999939938926638e-05, + "loss": 7.7843, + "step": 372 + }, + { + "epoch": 0.0022183366638119707, + "grad_norm": 2.8345019817352295, + "learning_rate": 4.999939614712212e-05, + "loss": 7.592, + "step": 373 + }, + { + "epoch": 0.002224283947092968, + "grad_norm": 3.12503719329834, + "learning_rate": 4.9999392896250826e-05, + "loss": 7.9543, + "step": 374 + }, + { + "epoch": 0.002230231230373965, + "grad_norm": 2.7812912464141846, + "learning_rate": 4.99993896366525e-05, + "loss": 7.8738, + "step": 375 + }, + { + "epoch": 0.0022361785136549624, + "grad_norm": 2.9477410316467285, + "learning_rate": 4.9999386368327144e-05, + "loss": 7.7738, + "step": 376 + }, + { + "epoch": 0.0022421257969359597, + "grad_norm": 2.305204391479492, + "learning_rate": 4.999938309127477e-05, + "loss": 7.9123, + "step": 377 + }, + { + "epoch": 0.002248073080216957, + "grad_norm": 3.3839781284332275, + "learning_rate": 4.999937980549536e-05, + "loss": 7.8542, + "step": 378 + }, + { + "epoch": 0.002254020363497954, + "grad_norm": 3.6973462104797363, + "learning_rate": 4.9999376510988924e-05, + "loss": 7.6953, + "step": 379 + }, + { + "epoch": 0.0022599676467789515, + "grad_norm": 3.8176333904266357, + "learning_rate": 4.999937320775547e-05, + "loss": 7.6548, + "step": 380 + }, + { + "epoch": 0.0022659149300599487, + "grad_norm": 3.0237386226654053, + "learning_rate": 4.999936989579499e-05, + "loss": 7.7843, + "step": 381 + }, + { + "epoch": 0.002271862213340946, + "grad_norm": 2.699695348739624, + "learning_rate": 4.999936657510749e-05, + "loss": 7.8841, + "step": 382 + }, + { + "epoch": 0.0022778094966219432, + "grad_norm": 3.7468206882476807, + "learning_rate": 4.9999363245692965e-05, + "loss": 7.8069, + "step": 383 + }, + { + "epoch": 0.0022837567799029405, + "grad_norm": 3.1074821949005127, + "learning_rate": 4.999935990755142e-05, + "loss": 7.8392, + "step": 384 + }, + { + "epoch": 0.0022897040631839377, + "grad_norm": 2.420884609222412, + "learning_rate": 4.999935656068287e-05, + "loss": 7.9238, + "step": 385 + }, + { + "epoch": 0.002295651346464935, + "grad_norm": 3.1354825496673584, + "learning_rate": 4.9999353205087296e-05, + "loss": 7.9766, + "step": 386 + }, + { + "epoch": 0.0023015986297459322, + "grad_norm": 2.7911901473999023, + "learning_rate": 4.9999349840764695e-05, + "loss": 7.9118, + "step": 387 + }, + { + "epoch": 0.0023075459130269295, + "grad_norm": 2.59529447555542, + "learning_rate": 4.999934646771509e-05, + "loss": 7.8839, + "step": 388 + }, + { + "epoch": 0.0023134931963079267, + "grad_norm": 4.121276378631592, + "learning_rate": 4.999934308593848e-05, + "loss": 7.8406, + "step": 389 + }, + { + "epoch": 0.0023194404795889236, + "grad_norm": 2.9091265201568604, + "learning_rate": 4.999933969543485e-05, + "loss": 7.86, + "step": 390 + }, + { + "epoch": 0.002325387762869921, + "grad_norm": 3.0700483322143555, + "learning_rate": 4.9999336296204195e-05, + "loss": 7.8214, + "step": 391 + }, + { + "epoch": 0.002331335046150918, + "grad_norm": 3.3008790016174316, + "learning_rate": 4.999933288824654e-05, + "loss": 7.5863, + "step": 392 + }, + { + "epoch": 0.0023372823294319153, + "grad_norm": 3.1414108276367188, + "learning_rate": 4.999932947156188e-05, + "loss": 7.5815, + "step": 393 + }, + { + "epoch": 0.0023432296127129126, + "grad_norm": 2.6881701946258545, + "learning_rate": 4.999932604615021e-05, + "loss": 7.959, + "step": 394 + }, + { + "epoch": 0.00234917689599391, + "grad_norm": 2.45609712600708, + "learning_rate": 4.9999322612011534e-05, + "loss": 7.9668, + "step": 395 + }, + { + "epoch": 0.002355124179274907, + "grad_norm": 3.1126747131347656, + "learning_rate": 4.999931916914585e-05, + "loss": 7.774, + "step": 396 + }, + { + "epoch": 0.0023610714625559043, + "grad_norm": 2.806708574295044, + "learning_rate": 4.999931571755316e-05, + "loss": 7.6297, + "step": 397 + }, + { + "epoch": 0.0023670187458369016, + "grad_norm": 3.220013380050659, + "learning_rate": 4.999931225723348e-05, + "loss": 7.3856, + "step": 398 + }, + { + "epoch": 0.002372966029117899, + "grad_norm": 3.0159943103790283, + "learning_rate": 4.9999308788186786e-05, + "loss": 7.3822, + "step": 399 + }, + { + "epoch": 0.002378913312398896, + "grad_norm": 3.1066205501556396, + "learning_rate": 4.9999305310413094e-05, + "loss": 7.3905, + "step": 400 + }, + { + "epoch": 0.0023848605956798934, + "grad_norm": 2.8004367351531982, + "learning_rate": 4.99993018239124e-05, + "loss": 7.8548, + "step": 401 + }, + { + "epoch": 0.0023908078789608906, + "grad_norm": 3.004378318786621, + "learning_rate": 4.999929832868471e-05, + "loss": 7.7846, + "step": 402 + }, + { + "epoch": 0.002396755162241888, + "grad_norm": 3.42901349067688, + "learning_rate": 4.9999294824730025e-05, + "loss": 7.9188, + "step": 403 + }, + { + "epoch": 0.002402702445522885, + "grad_norm": 3.7258527278900146, + "learning_rate": 4.9999291312048343e-05, + "loss": 7.7302, + "step": 404 + }, + { + "epoch": 0.0024086497288038824, + "grad_norm": 4.215145111083984, + "learning_rate": 4.999928779063967e-05, + "loss": 7.6597, + "step": 405 + }, + { + "epoch": 0.0024145970120848796, + "grad_norm": 3.157273769378662, + "learning_rate": 4.9999284260504004e-05, + "loss": 7.7262, + "step": 406 + }, + { + "epoch": 0.002420544295365877, + "grad_norm": 2.9977381229400635, + "learning_rate": 4.999928072164135e-05, + "loss": 7.72, + "step": 407 + }, + { + "epoch": 0.002426491578646874, + "grad_norm": 2.791682720184326, + "learning_rate": 4.9999277174051696e-05, + "loss": 7.8022, + "step": 408 + }, + { + "epoch": 0.0024324388619278714, + "grad_norm": 3.4143035411834717, + "learning_rate": 4.999927361773506e-05, + "loss": 7.5116, + "step": 409 + }, + { + "epoch": 0.0024383861452088687, + "grad_norm": 3.3458821773529053, + "learning_rate": 4.9999270052691425e-05, + "loss": 7.4337, + "step": 410 + }, + { + "epoch": 0.002444333428489866, + "grad_norm": 3.3339595794677734, + "learning_rate": 4.999926647892081e-05, + "loss": 7.7345, + "step": 411 + }, + { + "epoch": 0.002450280711770863, + "grad_norm": 4.285780429840088, + "learning_rate": 4.999926289642321e-05, + "loss": 7.9388, + "step": 412 + }, + { + "epoch": 0.0024562279950518604, + "grad_norm": 3.9473414421081543, + "learning_rate": 4.9999259305198624e-05, + "loss": 7.6038, + "step": 413 + }, + { + "epoch": 0.0024621752783328577, + "grad_norm": 3.504227638244629, + "learning_rate": 4.999925570524706e-05, + "loss": 7.4818, + "step": 414 + }, + { + "epoch": 0.002468122561613855, + "grad_norm": 3.2182157039642334, + "learning_rate": 4.999925209656851e-05, + "loss": 7.3493, + "step": 415 + }, + { + "epoch": 0.002474069844894852, + "grad_norm": 3.1944262981414795, + "learning_rate": 4.999924847916297e-05, + "loss": 7.3646, + "step": 416 + }, + { + "epoch": 0.0024800171281758494, + "grad_norm": 2.957244634628296, + "learning_rate": 4.999924485303047e-05, + "loss": 7.4403, + "step": 417 + }, + { + "epoch": 0.0024859644114568467, + "grad_norm": 2.971285343170166, + "learning_rate": 4.999924121817098e-05, + "loss": 7.7266, + "step": 418 + }, + { + "epoch": 0.002491911694737844, + "grad_norm": 4.029009819030762, + "learning_rate": 4.999923757458451e-05, + "loss": 7.3919, + "step": 419 + }, + { + "epoch": 0.002497858978018841, + "grad_norm": 3.9034767150878906, + "learning_rate": 4.999923392227107e-05, + "loss": 7.2349, + "step": 420 + }, + { + "epoch": 0.002503806261299838, + "grad_norm": 3.23218035697937, + "learning_rate": 4.9999230261230656e-05, + "loss": 7.5146, + "step": 421 + }, + { + "epoch": 0.0025097535445808353, + "grad_norm": 3.193225622177124, + "learning_rate": 4.9999226591463265e-05, + "loss": 7.1699, + "step": 422 + }, + { + "epoch": 0.0025157008278618325, + "grad_norm": 2.9796435832977295, + "learning_rate": 4.999922291296891e-05, + "loss": 7.5719, + "step": 423 + }, + { + "epoch": 0.0025216481111428298, + "grad_norm": 2.6746885776519775, + "learning_rate": 4.999921922574758e-05, + "loss": 7.8086, + "step": 424 + }, + { + "epoch": 0.002527595394423827, + "grad_norm": 3.0622920989990234, + "learning_rate": 4.999921552979928e-05, + "loss": 7.3233, + "step": 425 + }, + { + "epoch": 0.0025335426777048243, + "grad_norm": 3.0908501148223877, + "learning_rate": 4.999921182512402e-05, + "loss": 7.2582, + "step": 426 + }, + { + "epoch": 0.0025394899609858215, + "grad_norm": 2.6913537979125977, + "learning_rate": 4.999920811172178e-05, + "loss": 7.6643, + "step": 427 + }, + { + "epoch": 0.002545437244266819, + "grad_norm": 2.7793848514556885, + "learning_rate": 4.999920438959258e-05, + "loss": 7.9445, + "step": 428 + }, + { + "epoch": 0.002551384527547816, + "grad_norm": 2.741617202758789, + "learning_rate": 4.999920065873642e-05, + "loss": 8.0755, + "step": 429 + }, + { + "epoch": 0.0025573318108288133, + "grad_norm": 2.7102227210998535, + "learning_rate": 4.999919691915329e-05, + "loss": 7.8908, + "step": 430 + }, + { + "epoch": 0.0025632790941098106, + "grad_norm": 2.687788248062134, + "learning_rate": 4.9999193170843206e-05, + "loss": 7.9025, + "step": 431 + }, + { + "epoch": 0.002569226377390808, + "grad_norm": 2.923664093017578, + "learning_rate": 4.999918941380616e-05, + "loss": 7.9331, + "step": 432 + }, + { + "epoch": 0.002575173660671805, + "grad_norm": 2.934735059738159, + "learning_rate": 4.999918564804215e-05, + "loss": 7.722, + "step": 433 + }, + { + "epoch": 0.0025811209439528023, + "grad_norm": 3.8156228065490723, + "learning_rate": 4.999918187355119e-05, + "loss": 7.9392, + "step": 434 + }, + { + "epoch": 0.0025870682272337996, + "grad_norm": 2.333798408508301, + "learning_rate": 4.999917809033327e-05, + "loss": 7.9093, + "step": 435 + }, + { + "epoch": 0.002593015510514797, + "grad_norm": 2.078932046890259, + "learning_rate": 4.99991742983884e-05, + "loss": 7.8484, + "step": 436 + }, + { + "epoch": 0.002598962793795794, + "grad_norm": 2.433375835418701, + "learning_rate": 4.999917049771657e-05, + "loss": 7.9124, + "step": 437 + }, + { + "epoch": 0.0026049100770767913, + "grad_norm": 3.1881024837493896, + "learning_rate": 4.999916668831779e-05, + "loss": 7.3966, + "step": 438 + }, + { + "epoch": 0.0026108573603577886, + "grad_norm": 2.4724855422973633, + "learning_rate": 4.9999162870192065e-05, + "loss": 7.535, + "step": 439 + }, + { + "epoch": 0.002616804643638786, + "grad_norm": 2.8757777214050293, + "learning_rate": 4.999915904333938e-05, + "loss": 7.6728, + "step": 440 + }, + { + "epoch": 0.002622751926919783, + "grad_norm": 3.5439565181732178, + "learning_rate": 4.999915520775975e-05, + "loss": 7.5308, + "step": 441 + }, + { + "epoch": 0.0026286992102007804, + "grad_norm": 2.8345577716827393, + "learning_rate": 4.999915136345318e-05, + "loss": 7.7083, + "step": 442 + }, + { + "epoch": 0.0026346464934817776, + "grad_norm": 3.0842509269714355, + "learning_rate": 4.999914751041965e-05, + "loss": 7.9281, + "step": 443 + }, + { + "epoch": 0.002640593776762775, + "grad_norm": 3.0017757415771484, + "learning_rate": 4.999914364865919e-05, + "loss": 7.4727, + "step": 444 + }, + { + "epoch": 0.002646541060043772, + "grad_norm": 2.637838125228882, + "learning_rate": 4.9999139778171785e-05, + "loss": 7.5284, + "step": 445 + }, + { + "epoch": 0.0026524883433247694, + "grad_norm": 2.7749550342559814, + "learning_rate": 4.999913589895743e-05, + "loss": 7.7006, + "step": 446 + }, + { + "epoch": 0.0026584356266057666, + "grad_norm": 3.1636059284210205, + "learning_rate": 4.9999132011016146e-05, + "loss": 7.6441, + "step": 447 + }, + { + "epoch": 0.002664382909886764, + "grad_norm": 2.623776435852051, + "learning_rate": 4.9999128114347913e-05, + "loss": 7.8027, + "step": 448 + }, + { + "epoch": 0.002670330193167761, + "grad_norm": 2.803612232208252, + "learning_rate": 4.9999124208952755e-05, + "loss": 7.553, + "step": 449 + }, + { + "epoch": 0.0026762774764487584, + "grad_norm": 3.3169047832489014, + "learning_rate": 4.9999120294830656e-05, + "loss": 8.0965, + "step": 450 + }, + { + "epoch": 0.0026822247597297556, + "grad_norm": 3.9928581714630127, + "learning_rate": 4.999911637198161e-05, + "loss": 7.8152, + "step": 451 + }, + { + "epoch": 0.002688172043010753, + "grad_norm": 2.8126320838928223, + "learning_rate": 4.9999112440405646e-05, + "loss": 7.4843, + "step": 452 + }, + { + "epoch": 0.0026941193262917497, + "grad_norm": 2.773427963256836, + "learning_rate": 4.999910850010275e-05, + "loss": 7.7074, + "step": 453 + }, + { + "epoch": 0.002700066609572747, + "grad_norm": 2.8877642154693604, + "learning_rate": 4.999910455107292e-05, + "loss": 7.7764, + "step": 454 + }, + { + "epoch": 0.0027060138928537442, + "grad_norm": 2.6323535442352295, + "learning_rate": 4.9999100593316155e-05, + "loss": 7.7336, + "step": 455 + }, + { + "epoch": 0.0027119611761347415, + "grad_norm": 2.939509153366089, + "learning_rate": 4.9999096626832465e-05, + "loss": 7.8184, + "step": 456 + }, + { + "epoch": 0.0027179084594157387, + "grad_norm": 2.6926229000091553, + "learning_rate": 4.9999092651621855e-05, + "loss": 7.5027, + "step": 457 + }, + { + "epoch": 0.002723855742696736, + "grad_norm": 2.889389991760254, + "learning_rate": 4.999908866768431e-05, + "loss": 7.1138, + "step": 458 + }, + { + "epoch": 0.0027298030259777332, + "grad_norm": 2.951796531677246, + "learning_rate": 4.999908467501985e-05, + "loss": 7.7549, + "step": 459 + }, + { + "epoch": 0.0027357503092587305, + "grad_norm": 2.9076783657073975, + "learning_rate": 4.999908067362847e-05, + "loss": 7.6577, + "step": 460 + }, + { + "epoch": 0.0027416975925397278, + "grad_norm": 3.010636806488037, + "learning_rate": 4.9999076663510155e-05, + "loss": 7.6467, + "step": 461 + }, + { + "epoch": 0.002747644875820725, + "grad_norm": 2.7591371536254883, + "learning_rate": 4.9999072644664935e-05, + "loss": 7.5825, + "step": 462 + }, + { + "epoch": 0.0027535921591017223, + "grad_norm": 2.503632068634033, + "learning_rate": 4.9999068617092795e-05, + "loss": 7.711, + "step": 463 + }, + { + "epoch": 0.0027595394423827195, + "grad_norm": 2.6518661975860596, + "learning_rate": 4.999906458079373e-05, + "loss": 7.557, + "step": 464 + }, + { + "epoch": 0.0027654867256637168, + "grad_norm": 2.6865615844726562, + "learning_rate": 4.9999060535767764e-05, + "loss": 7.5788, + "step": 465 + }, + { + "epoch": 0.002771434008944714, + "grad_norm": 2.715190887451172, + "learning_rate": 4.999905648201487e-05, + "loss": 7.517, + "step": 466 + }, + { + "epoch": 0.0027773812922257113, + "grad_norm": 3.1603381633758545, + "learning_rate": 4.999905241953506e-05, + "loss": 7.6176, + "step": 467 + }, + { + "epoch": 0.0027833285755067085, + "grad_norm": 3.1451528072357178, + "learning_rate": 4.999904834832836e-05, + "loss": 7.6051, + "step": 468 + }, + { + "epoch": 0.002789275858787706, + "grad_norm": 2.5310862064361572, + "learning_rate": 4.9999044268394736e-05, + "loss": 7.6075, + "step": 469 + }, + { + "epoch": 0.002795223142068703, + "grad_norm": 2.9285359382629395, + "learning_rate": 4.99990401797342e-05, + "loss": 7.5399, + "step": 470 + }, + { + "epoch": 0.0028011704253497003, + "grad_norm": 3.2180614471435547, + "learning_rate": 4.9999036082346766e-05, + "loss": 7.6952, + "step": 471 + }, + { + "epoch": 0.0028071177086306976, + "grad_norm": 4.041499614715576, + "learning_rate": 4.9999031976232426e-05, + "loss": 7.841, + "step": 472 + }, + { + "epoch": 0.002813064991911695, + "grad_norm": 3.233492612838745, + "learning_rate": 4.999902786139118e-05, + "loss": 7.5267, + "step": 473 + }, + { + "epoch": 0.002819012275192692, + "grad_norm": 2.7749760150909424, + "learning_rate": 4.9999023737823034e-05, + "loss": 7.3703, + "step": 474 + }, + { + "epoch": 0.0028249595584736893, + "grad_norm": 2.9886162281036377, + "learning_rate": 4.999901960552798e-05, + "loss": 7.4684, + "step": 475 + }, + { + "epoch": 0.0028309068417546866, + "grad_norm": 2.934190511703491, + "learning_rate": 4.999901546450604e-05, + "loss": 7.4432, + "step": 476 + }, + { + "epoch": 0.002836854125035684, + "grad_norm": 3.696247100830078, + "learning_rate": 4.9999011314757196e-05, + "loss": 7.4944, + "step": 477 + }, + { + "epoch": 0.002842801408316681, + "grad_norm": 3.6706700325012207, + "learning_rate": 4.9999007156281454e-05, + "loss": 7.3726, + "step": 478 + }, + { + "epoch": 0.0028487486915976783, + "grad_norm": 3.8638553619384766, + "learning_rate": 4.999900298907881e-05, + "loss": 7.072, + "step": 479 + }, + { + "epoch": 0.0028546959748786756, + "grad_norm": 4.307566165924072, + "learning_rate": 4.999899881314928e-05, + "loss": 6.9371, + "step": 480 + }, + { + "epoch": 0.002860643258159673, + "grad_norm": 3.337372064590454, + "learning_rate": 4.9998994628492854e-05, + "loss": 7.7299, + "step": 481 + }, + { + "epoch": 0.00286659054144067, + "grad_norm": 3.1284921169281006, + "learning_rate": 4.9998990435109535e-05, + "loss": 7.5629, + "step": 482 + }, + { + "epoch": 0.0028725378247216674, + "grad_norm": 3.06904935836792, + "learning_rate": 4.999898623299933e-05, + "loss": 7.5332, + "step": 483 + }, + { + "epoch": 0.002878485108002664, + "grad_norm": 2.985121011734009, + "learning_rate": 4.999898202216224e-05, + "loss": 7.5972, + "step": 484 + }, + { + "epoch": 0.0028844323912836614, + "grad_norm": 2.9188039302825928, + "learning_rate": 4.999897780259827e-05, + "loss": 7.6242, + "step": 485 + }, + { + "epoch": 0.0028903796745646587, + "grad_norm": 3.2263259887695312, + "learning_rate": 4.9998973574307406e-05, + "loss": 7.5746, + "step": 486 + }, + { + "epoch": 0.002896326957845656, + "grad_norm": 2.645188331604004, + "learning_rate": 4.999896933728966e-05, + "loss": 7.6122, + "step": 487 + }, + { + "epoch": 0.002902274241126653, + "grad_norm": 2.89583158493042, + "learning_rate": 4.9998965091545035e-05, + "loss": 7.6157, + "step": 488 + }, + { + "epoch": 0.0029082215244076504, + "grad_norm": 3.6182286739349365, + "learning_rate": 4.9998960837073524e-05, + "loss": 7.4056, + "step": 489 + }, + { + "epoch": 0.0029141688076886477, + "grad_norm": 3.377560615539551, + "learning_rate": 4.9998956573875135e-05, + "loss": 7.4408, + "step": 490 + }, + { + "epoch": 0.002920116090969645, + "grad_norm": 3.0581517219543457, + "learning_rate": 4.9998952301949874e-05, + "loss": 7.5776, + "step": 491 + }, + { + "epoch": 0.002926063374250642, + "grad_norm": 3.5199148654937744, + "learning_rate": 4.999894802129773e-05, + "loss": 7.4747, + "step": 492 + }, + { + "epoch": 0.0029320106575316395, + "grad_norm": 3.866055727005005, + "learning_rate": 4.9998943731918714e-05, + "loss": 7.5985, + "step": 493 + }, + { + "epoch": 0.0029379579408126367, + "grad_norm": 2.856255054473877, + "learning_rate": 4.999893943381283e-05, + "loss": 7.9698, + "step": 494 + }, + { + "epoch": 0.002943905224093634, + "grad_norm": 3.0758626461029053, + "learning_rate": 4.999893512698007e-05, + "loss": 7.6311, + "step": 495 + }, + { + "epoch": 0.0029498525073746312, + "grad_norm": 3.739844560623169, + "learning_rate": 4.999893081142044e-05, + "loss": 7.6829, + "step": 496 + }, + { + "epoch": 0.0029557997906556285, + "grad_norm": 4.025709629058838, + "learning_rate": 4.999892648713394e-05, + "loss": 7.2717, + "step": 497 + }, + { + "epoch": 0.0029617470739366257, + "grad_norm": 3.6604738235473633, + "learning_rate": 4.999892215412057e-05, + "loss": 7.2985, + "step": 498 + }, + { + "epoch": 0.002967694357217623, + "grad_norm": 3.230109930038452, + "learning_rate": 4.999891781238034e-05, + "loss": 8.1041, + "step": 499 + }, + { + "epoch": 0.0029736416404986202, + "grad_norm": 2.5046725273132324, + "learning_rate": 4.999891346191325e-05, + "loss": 8.0888, + "step": 500 + }, + { + "epoch": 0.0029795889237796175, + "grad_norm": 2.916459798812866, + "learning_rate": 4.999890910271929e-05, + "loss": 7.8675, + "step": 501 + }, + { + "epoch": 0.0029855362070606148, + "grad_norm": 2.7806055545806885, + "learning_rate": 4.999890473479848e-05, + "loss": 7.8903, + "step": 502 + }, + { + "epoch": 0.002991483490341612, + "grad_norm": 2.9877662658691406, + "learning_rate": 4.99989003581508e-05, + "loss": 7.473, + "step": 503 + }, + { + "epoch": 0.0029974307736226093, + "grad_norm": 3.1581692695617676, + "learning_rate": 4.999889597277626e-05, + "loss": 7.5654, + "step": 504 + }, + { + "epoch": 0.0030033780569036065, + "grad_norm": 3.102539539337158, + "learning_rate": 4.9998891578674866e-05, + "loss": 7.8865, + "step": 505 + }, + { + "epoch": 0.0030093253401846038, + "grad_norm": 3.0357863903045654, + "learning_rate": 4.999888717584662e-05, + "loss": 7.291, + "step": 506 + }, + { + "epoch": 0.003015272623465601, + "grad_norm": 2.604048252105713, + "learning_rate": 4.999888276429152e-05, + "loss": 7.4892, + "step": 507 + }, + { + "epoch": 0.0030212199067465983, + "grad_norm": 2.734354257583618, + "learning_rate": 4.999887834400957e-05, + "loss": 7.1182, + "step": 508 + }, + { + "epoch": 0.0030271671900275955, + "grad_norm": 2.5255348682403564, + "learning_rate": 4.9998873915000775e-05, + "loss": 7.449, + "step": 509 + }, + { + "epoch": 0.003033114473308593, + "grad_norm": 2.864072322845459, + "learning_rate": 4.999886947726512e-05, + "loss": 7.3213, + "step": 510 + }, + { + "epoch": 0.00303906175658959, + "grad_norm": 2.764187812805176, + "learning_rate": 4.999886503080262e-05, + "loss": 7.337, + "step": 511 + }, + { + "epoch": 0.0030450090398705873, + "grad_norm": 3.5725066661834717, + "learning_rate": 4.9998860575613285e-05, + "loss": 7.8398, + "step": 512 + }, + { + "epoch": 0.0030509563231515846, + "grad_norm": 3.8559648990631104, + "learning_rate": 4.9998856111697096e-05, + "loss": 7.395, + "step": 513 + }, + { + "epoch": 0.003056903606432582, + "grad_norm": 2.9047908782958984, + "learning_rate": 4.999885163905407e-05, + "loss": 7.7016, + "step": 514 + }, + { + "epoch": 0.0030628508897135786, + "grad_norm": 3.1485037803649902, + "learning_rate": 4.99988471576842e-05, + "loss": 6.9411, + "step": 515 + }, + { + "epoch": 0.003068798172994576, + "grad_norm": 3.2763617038726807, + "learning_rate": 4.999884266758749e-05, + "loss": 6.4778, + "step": 516 + }, + { + "epoch": 0.003074745456275573, + "grad_norm": 2.7609500885009766, + "learning_rate": 4.999883816876394e-05, + "loss": 7.0576, + "step": 517 + }, + { + "epoch": 0.0030806927395565704, + "grad_norm": 3.7407751083374023, + "learning_rate": 4.999883366121356e-05, + "loss": 7.7389, + "step": 518 + }, + { + "epoch": 0.0030866400228375676, + "grad_norm": 3.3356568813323975, + "learning_rate": 4.999882914493634e-05, + "loss": 7.7, + "step": 519 + }, + { + "epoch": 0.003092587306118565, + "grad_norm": 2.635594129562378, + "learning_rate": 4.999882461993229e-05, + "loss": 7.6103, + "step": 520 + }, + { + "epoch": 0.003098534589399562, + "grad_norm": 3.7604281902313232, + "learning_rate": 4.9998820086201406e-05, + "loss": 7.6814, + "step": 521 + }, + { + "epoch": 0.0031044818726805594, + "grad_norm": 3.6567211151123047, + "learning_rate": 4.99988155437437e-05, + "loss": 7.6729, + "step": 522 + }, + { + "epoch": 0.0031104291559615567, + "grad_norm": 3.605442523956299, + "learning_rate": 4.999881099255916e-05, + "loss": 7.7464, + "step": 523 + }, + { + "epoch": 0.003116376439242554, + "grad_norm": 3.015500783920288, + "learning_rate": 4.99988064326478e-05, + "loss": 7.5168, + "step": 524 + }, + { + "epoch": 0.003122323722523551, + "grad_norm": 2.9037563800811768, + "learning_rate": 4.9998801864009604e-05, + "loss": 7.7059, + "step": 525 + }, + { + "epoch": 0.0031282710058045484, + "grad_norm": 2.812509059906006, + "learning_rate": 4.999879728664458e-05, + "loss": 7.4178, + "step": 526 + }, + { + "epoch": 0.0031342182890855457, + "grad_norm": 3.340226888656616, + "learning_rate": 4.9998792700552746e-05, + "loss": 7.7872, + "step": 527 + }, + { + "epoch": 0.003140165572366543, + "grad_norm": 3.0951550006866455, + "learning_rate": 4.999878810573409e-05, + "loss": 8.0153, + "step": 528 + }, + { + "epoch": 0.00314611285564754, + "grad_norm": 3.1077651977539062, + "learning_rate": 4.9998783502188616e-05, + "loss": 7.7053, + "step": 529 + }, + { + "epoch": 0.0031520601389285374, + "grad_norm": 3.442451000213623, + "learning_rate": 4.999877888991632e-05, + "loss": 7.5149, + "step": 530 + }, + { + "epoch": 0.0031580074222095347, + "grad_norm": 3.7479207515716553, + "learning_rate": 4.9998774268917215e-05, + "loss": 7.3448, + "step": 531 + }, + { + "epoch": 0.003163954705490532, + "grad_norm": 2.660789966583252, + "learning_rate": 4.999876963919129e-05, + "loss": 7.8348, + "step": 532 + }, + { + "epoch": 0.003169901988771529, + "grad_norm": 2.6255943775177, + "learning_rate": 4.9998765000738556e-05, + "loss": 7.542, + "step": 533 + }, + { + "epoch": 0.0031758492720525265, + "grad_norm": 3.121521472930908, + "learning_rate": 4.9998760353559017e-05, + "loss": 7.46, + "step": 534 + }, + { + "epoch": 0.0031817965553335237, + "grad_norm": 2.958880662918091, + "learning_rate": 4.999875569765266e-05, + "loss": 7.5385, + "step": 535 + }, + { + "epoch": 0.003187743838614521, + "grad_norm": 3.4153661727905273, + "learning_rate": 4.99987510330195e-05, + "loss": 7.4989, + "step": 536 + }, + { + "epoch": 0.0031936911218955182, + "grad_norm": 3.0877597332000732, + "learning_rate": 4.999874635965953e-05, + "loss": 7.5512, + "step": 537 + }, + { + "epoch": 0.0031996384051765155, + "grad_norm": 3.109522581100464, + "learning_rate": 4.9998741677572756e-05, + "loss": 7.4679, + "step": 538 + }, + { + "epoch": 0.0032055856884575127, + "grad_norm": 3.4434239864349365, + "learning_rate": 4.999873698675919e-05, + "loss": 7.0599, + "step": 539 + }, + { + "epoch": 0.00321153297173851, + "grad_norm": 3.83335018157959, + "learning_rate": 4.999873228721882e-05, + "loss": 7.5355, + "step": 540 + }, + { + "epoch": 0.0032174802550195072, + "grad_norm": 3.0679752826690674, + "learning_rate": 4.999872757895164e-05, + "loss": 7.7231, + "step": 541 + }, + { + "epoch": 0.0032234275383005045, + "grad_norm": 3.272196054458618, + "learning_rate": 4.999872286195767e-05, + "loss": 7.6674, + "step": 542 + }, + { + "epoch": 0.0032293748215815017, + "grad_norm": 2.8453965187072754, + "learning_rate": 4.9998718136236897e-05, + "loss": 7.4451, + "step": 543 + }, + { + "epoch": 0.003235322104862499, + "grad_norm": 3.074399709701538, + "learning_rate": 4.999871340178934e-05, + "loss": 7.6011, + "step": 544 + }, + { + "epoch": 0.0032412693881434963, + "grad_norm": 3.173004150390625, + "learning_rate": 4.999870865861499e-05, + "loss": 7.5268, + "step": 545 + }, + { + "epoch": 0.003247216671424493, + "grad_norm": 2.820848226547241, + "learning_rate": 4.999870390671384e-05, + "loss": 7.9872, + "step": 546 + }, + { + "epoch": 0.0032531639547054903, + "grad_norm": 2.692702293395996, + "learning_rate": 4.9998699146085906e-05, + "loss": 7.4676, + "step": 547 + }, + { + "epoch": 0.0032591112379864876, + "grad_norm": 2.2766902446746826, + "learning_rate": 4.999869437673119e-05, + "loss": 7.3826, + "step": 548 + }, + { + "epoch": 0.003265058521267485, + "grad_norm": 2.1190011501312256, + "learning_rate": 4.9998689598649686e-05, + "loss": 7.4767, + "step": 549 + }, + { + "epoch": 0.003271005804548482, + "grad_norm": 2.687633514404297, + "learning_rate": 4.999868481184139e-05, + "loss": 7.9922, + "step": 550 + }, + { + "epoch": 0.0032769530878294794, + "grad_norm": 3.403298854827881, + "learning_rate": 4.999868001630632e-05, + "loss": 7.8035, + "step": 551 + }, + { + "epoch": 0.0032829003711104766, + "grad_norm": 3.074881076812744, + "learning_rate": 4.999867521204446e-05, + "loss": 7.7106, + "step": 552 + }, + { + "epoch": 0.003288847654391474, + "grad_norm": 3.28725004196167, + "learning_rate": 4.9998670399055827e-05, + "loss": 7.4661, + "step": 553 + }, + { + "epoch": 0.003294794937672471, + "grad_norm": 3.8624775409698486, + "learning_rate": 4.999866557734041e-05, + "loss": 7.7156, + "step": 554 + }, + { + "epoch": 0.0033007422209534684, + "grad_norm": 2.53586745262146, + "learning_rate": 4.999866074689823e-05, + "loss": 7.945, + "step": 555 + }, + { + "epoch": 0.0033066895042344656, + "grad_norm": 3.8261072635650635, + "learning_rate": 4.9998655907729265e-05, + "loss": 8.0446, + "step": 556 + }, + { + "epoch": 0.003312636787515463, + "grad_norm": 2.7173407077789307, + "learning_rate": 4.999865105983353e-05, + "loss": 7.8363, + "step": 557 + }, + { + "epoch": 0.00331858407079646, + "grad_norm": 4.68424654006958, + "learning_rate": 4.999864620321102e-05, + "loss": 7.667, + "step": 558 + }, + { + "epoch": 0.0033245313540774574, + "grad_norm": 2.8763632774353027, + "learning_rate": 4.999864133786175e-05, + "loss": 7.6133, + "step": 559 + }, + { + "epoch": 0.0033304786373584546, + "grad_norm": 3.0986382961273193, + "learning_rate": 4.9998636463785705e-05, + "loss": 7.6257, + "step": 560 + }, + { + "epoch": 0.003336425920639452, + "grad_norm": 2.6826348304748535, + "learning_rate": 4.9998631580982905e-05, + "loss": 7.5187, + "step": 561 + }, + { + "epoch": 0.003342373203920449, + "grad_norm": 2.2172515392303467, + "learning_rate": 4.9998626689453334e-05, + "loss": 7.961, + "step": 562 + }, + { + "epoch": 0.0033483204872014464, + "grad_norm": 2.6083858013153076, + "learning_rate": 4.9998621789197e-05, + "loss": 7.7887, + "step": 563 + }, + { + "epoch": 0.0033542677704824437, + "grad_norm": 3.6838009357452393, + "learning_rate": 4.99986168802139e-05, + "loss": 7.4945, + "step": 564 + }, + { + "epoch": 0.003360215053763441, + "grad_norm": 3.2091991901397705, + "learning_rate": 4.999861196250405e-05, + "loss": 7.4243, + "step": 565 + }, + { + "epoch": 0.003366162337044438, + "grad_norm": 3.142982244491577, + "learning_rate": 4.9998607036067434e-05, + "loss": 7.4684, + "step": 566 + }, + { + "epoch": 0.0033721096203254354, + "grad_norm": 3.7751007080078125, + "learning_rate": 4.9998602100904065e-05, + "loss": 7.3722, + "step": 567 + }, + { + "epoch": 0.0033780569036064327, + "grad_norm": 3.276843547821045, + "learning_rate": 4.9998597157013946e-05, + "loss": 7.4012, + "step": 568 + }, + { + "epoch": 0.00338400418688743, + "grad_norm": 2.840106725692749, + "learning_rate": 4.999859220439708e-05, + "loss": 7.4013, + "step": 569 + }, + { + "epoch": 0.003389951470168427, + "grad_norm": 2.7816810607910156, + "learning_rate": 4.999858724305346e-05, + "loss": 7.3136, + "step": 570 + }, + { + "epoch": 0.0033958987534494244, + "grad_norm": 4.523340225219727, + "learning_rate": 4.999858227298308e-05, + "loss": 7.0553, + "step": 571 + }, + { + "epoch": 0.0034018460367304217, + "grad_norm": 3.9653191566467285, + "learning_rate": 4.9998577294185964e-05, + "loss": 7.1907, + "step": 572 + }, + { + "epoch": 0.003407793320011419, + "grad_norm": 3.243089199066162, + "learning_rate": 4.999857230666211e-05, + "loss": 7.0749, + "step": 573 + }, + { + "epoch": 0.003413740603292416, + "grad_norm": 3.3622777462005615, + "learning_rate": 4.99985673104115e-05, + "loss": 7.0005, + "step": 574 + }, + { + "epoch": 0.0034196878865734135, + "grad_norm": 2.561732292175293, + "learning_rate": 4.9998562305434154e-05, + "loss": 7.271, + "step": 575 + }, + { + "epoch": 0.0034256351698544107, + "grad_norm": 3.1846745014190674, + "learning_rate": 4.999855729173006e-05, + "loss": 7.7333, + "step": 576 + }, + { + "epoch": 0.0034315824531354075, + "grad_norm": 3.0318918228149414, + "learning_rate": 4.999855226929924e-05, + "loss": 7.5535, + "step": 577 + }, + { + "epoch": 0.003437529736416405, + "grad_norm": 2.993086099624634, + "learning_rate": 4.999854723814168e-05, + "loss": 7.6272, + "step": 578 + }, + { + "epoch": 0.003443477019697402, + "grad_norm": 2.8511712551116943, + "learning_rate": 4.999854219825738e-05, + "loss": 7.6619, + "step": 579 + }, + { + "epoch": 0.0034494243029783993, + "grad_norm": 2.6181185245513916, + "learning_rate": 4.9998537149646355e-05, + "loss": 7.7452, + "step": 580 + }, + { + "epoch": 0.0034553715862593965, + "grad_norm": 2.9932363033294678, + "learning_rate": 4.9998532092308593e-05, + "loss": 7.7475, + "step": 581 + }, + { + "epoch": 0.003461318869540394, + "grad_norm": 3.541944742202759, + "learning_rate": 4.99985270262441e-05, + "loss": 7.5808, + "step": 582 + }, + { + "epoch": 0.003467266152821391, + "grad_norm": 2.780372381210327, + "learning_rate": 4.9998521951452895e-05, + "loss": 7.8167, + "step": 583 + }, + { + "epoch": 0.0034732134361023883, + "grad_norm": 2.9156363010406494, + "learning_rate": 4.9998516867934945e-05, + "loss": 7.74, + "step": 584 + }, + { + "epoch": 0.0034791607193833856, + "grad_norm": 3.9492485523223877, + "learning_rate": 4.9998511775690285e-05, + "loss": 7.1128, + "step": 585 + }, + { + "epoch": 0.003485108002664383, + "grad_norm": 2.8288252353668213, + "learning_rate": 4.9998506674718896e-05, + "loss": 7.4884, + "step": 586 + }, + { + "epoch": 0.00349105528594538, + "grad_norm": 2.8906798362731934, + "learning_rate": 4.999850156502078e-05, + "loss": 7.6378, + "step": 587 + }, + { + "epoch": 0.0034970025692263773, + "grad_norm": 2.8806405067443848, + "learning_rate": 4.9998496446595955e-05, + "loss": 7.4641, + "step": 588 + }, + { + "epoch": 0.0035029498525073746, + "grad_norm": 3.1794772148132324, + "learning_rate": 4.999849131944441e-05, + "loss": 7.1633, + "step": 589 + }, + { + "epoch": 0.003508897135788372, + "grad_norm": 2.886009454727173, + "learning_rate": 4.999848618356615e-05, + "loss": 7.1793, + "step": 590 + }, + { + "epoch": 0.003514844419069369, + "grad_norm": 2.76184344291687, + "learning_rate": 4.999848103896118e-05, + "loss": 7.1377, + "step": 591 + }, + { + "epoch": 0.0035207917023503663, + "grad_norm": 3.127793788909912, + "learning_rate": 4.999847588562949e-05, + "loss": 7.2793, + "step": 592 + }, + { + "epoch": 0.0035267389856313636, + "grad_norm": 3.7768073081970215, + "learning_rate": 4.99984707235711e-05, + "loss": 7.8203, + "step": 593 + }, + { + "epoch": 0.003532686268912361, + "grad_norm": 3.1750540733337402, + "learning_rate": 4.9998465552786e-05, + "loss": 7.7078, + "step": 594 + }, + { + "epoch": 0.003538633552193358, + "grad_norm": 2.8884522914886475, + "learning_rate": 4.999846037327419e-05, + "loss": 7.6864, + "step": 595 + }, + { + "epoch": 0.0035445808354743554, + "grad_norm": 2.783928394317627, + "learning_rate": 4.999845518503568e-05, + "loss": 7.7329, + "step": 596 + }, + { + "epoch": 0.0035505281187553526, + "grad_norm": 2.8093652725219727, + "learning_rate": 4.9998449988070465e-05, + "loss": 7.7157, + "step": 597 + }, + { + "epoch": 0.00355647540203635, + "grad_norm": 2.54380464553833, + "learning_rate": 4.999844478237855e-05, + "loss": 7.6353, + "step": 598 + }, + { + "epoch": 0.003562422685317347, + "grad_norm": 3.478878974914551, + "learning_rate": 4.999843956795993e-05, + "loss": 7.4221, + "step": 599 + }, + { + "epoch": 0.0035683699685983444, + "grad_norm": 3.882807493209839, + "learning_rate": 4.999843434481463e-05, + "loss": 7.4857, + "step": 600 + }, + { + "epoch": 0.0035743172518793416, + "grad_norm": 3.0975584983825684, + "learning_rate": 4.999842911294261e-05, + "loss": 7.5121, + "step": 601 + }, + { + "epoch": 0.003580264535160339, + "grad_norm": 3.1857712268829346, + "learning_rate": 4.999842387234391e-05, + "loss": 7.4469, + "step": 602 + }, + { + "epoch": 0.003586211818441336, + "grad_norm": 2.892927885055542, + "learning_rate": 4.999841862301853e-05, + "loss": 7.4047, + "step": 603 + }, + { + "epoch": 0.0035921591017223334, + "grad_norm": 4.186185359954834, + "learning_rate": 4.999841336496645e-05, + "loss": 7.5146, + "step": 604 + }, + { + "epoch": 0.0035981063850033307, + "grad_norm": 3.27422833442688, + "learning_rate": 4.9998408098187674e-05, + "loss": 7.3347, + "step": 605 + }, + { + "epoch": 0.003604053668284328, + "grad_norm": 4.817208290100098, + "learning_rate": 4.9998402822682225e-05, + "loss": 7.9883, + "step": 606 + }, + { + "epoch": 0.003610000951565325, + "grad_norm": 5.903015613555908, + "learning_rate": 4.999839753845008e-05, + "loss": 7.9043, + "step": 607 + }, + { + "epoch": 0.0036159482348463224, + "grad_norm": 4.720086574554443, + "learning_rate": 4.999839224549127e-05, + "loss": 7.8456, + "step": 608 + }, + { + "epoch": 0.0036218955181273192, + "grad_norm": 4.518443584442139, + "learning_rate": 4.9998386943805764e-05, + "loss": 7.3659, + "step": 609 + }, + { + "epoch": 0.0036278428014083165, + "grad_norm": 2.621833086013794, + "learning_rate": 4.999838163339358e-05, + "loss": 8.0512, + "step": 610 + }, + { + "epoch": 0.0036337900846893137, + "grad_norm": 4.015076160430908, + "learning_rate": 4.9998376314254726e-05, + "loss": 7.8581, + "step": 611 + }, + { + "epoch": 0.003639737367970311, + "grad_norm": 3.8145275115966797, + "learning_rate": 4.999837098638919e-05, + "loss": 7.4288, + "step": 612 + }, + { + "epoch": 0.0036456846512513083, + "grad_norm": 3.396488904953003, + "learning_rate": 4.9998365649796985e-05, + "loss": 7.7812, + "step": 613 + }, + { + "epoch": 0.0036516319345323055, + "grad_norm": 2.931187391281128, + "learning_rate": 4.999836030447811e-05, + "loss": 7.5898, + "step": 614 + }, + { + "epoch": 0.0036575792178133028, + "grad_norm": 2.6349267959594727, + "learning_rate": 4.999835495043257e-05, + "loss": 7.5345, + "step": 615 + }, + { + "epoch": 0.0036635265010943, + "grad_norm": 3.014085531234741, + "learning_rate": 4.999834958766035e-05, + "loss": 7.5985, + "step": 616 + }, + { + "epoch": 0.0036694737843752973, + "grad_norm": 2.971475124359131, + "learning_rate": 4.999834421616147e-05, + "loss": 7.589, + "step": 617 + }, + { + "epoch": 0.0036754210676562945, + "grad_norm": 3.867366075515747, + "learning_rate": 4.999833883593593e-05, + "loss": 7.4026, + "step": 618 + }, + { + "epoch": 0.0036813683509372918, + "grad_norm": 2.3917908668518066, + "learning_rate": 4.9998333446983734e-05, + "loss": 7.4361, + "step": 619 + }, + { + "epoch": 0.003687315634218289, + "grad_norm": 4.583080768585205, + "learning_rate": 4.999832804930487e-05, + "loss": 7.5525, + "step": 620 + }, + { + "epoch": 0.0036932629174992863, + "grad_norm": 2.6039721965789795, + "learning_rate": 4.999832264289934e-05, + "loss": 7.636, + "step": 621 + }, + { + "epoch": 0.0036992102007802835, + "grad_norm": 4.123409748077393, + "learning_rate": 4.9998317227767165e-05, + "loss": 7.7803, + "step": 622 + }, + { + "epoch": 0.003705157484061281, + "grad_norm": 4.220766544342041, + "learning_rate": 4.999831180390834e-05, + "loss": 7.8086, + "step": 623 + }, + { + "epoch": 0.003711104767342278, + "grad_norm": 3.0759594440460205, + "learning_rate": 4.999830637132285e-05, + "loss": 7.4815, + "step": 624 + }, + { + "epoch": 0.0037170520506232753, + "grad_norm": 2.7870442867279053, + "learning_rate": 4.999830093001071e-05, + "loss": 7.3925, + "step": 625 + }, + { + "epoch": 0.0037229993339042726, + "grad_norm": 2.5292582511901855, + "learning_rate": 4.999829547997193e-05, + "loss": 7.2049, + "step": 626 + }, + { + "epoch": 0.00372894661718527, + "grad_norm": 2.5836963653564453, + "learning_rate": 4.99982900212065e-05, + "loss": 7.2858, + "step": 627 + }, + { + "epoch": 0.003734893900466267, + "grad_norm": 2.6433279514312744, + "learning_rate": 4.9998284553714425e-05, + "loss": 7.5894, + "step": 628 + }, + { + "epoch": 0.0037408411837472643, + "grad_norm": 3.1093215942382812, + "learning_rate": 4.999827907749571e-05, + "loss": 7.2859, + "step": 629 + }, + { + "epoch": 0.0037467884670282616, + "grad_norm": 2.313305616378784, + "learning_rate": 4.9998273592550346e-05, + "loss": 7.6275, + "step": 630 + }, + { + "epoch": 0.003752735750309259, + "grad_norm": 3.7002785205841064, + "learning_rate": 4.9998268098878355e-05, + "loss": 7.7068, + "step": 631 + }, + { + "epoch": 0.003758683033590256, + "grad_norm": 3.090707778930664, + "learning_rate": 4.9998262596479715e-05, + "loss": 7.7304, + "step": 632 + }, + { + "epoch": 0.0037646303168712533, + "grad_norm": 2.425614833831787, + "learning_rate": 4.999825708535445e-05, + "loss": 7.927, + "step": 633 + }, + { + "epoch": 0.0037705776001522506, + "grad_norm": 2.1477420330047607, + "learning_rate": 4.999825156550254e-05, + "loss": 8.1082, + "step": 634 + }, + { + "epoch": 0.003776524883433248, + "grad_norm": 2.434638738632202, + "learning_rate": 4.999824603692401e-05, + "loss": 7.8808, + "step": 635 + }, + { + "epoch": 0.003782472166714245, + "grad_norm": 2.563283681869507, + "learning_rate": 4.999824049961884e-05, + "loss": 7.8515, + "step": 636 + }, + { + "epoch": 0.0037884194499952424, + "grad_norm": 2.6878623962402344, + "learning_rate": 4.9998234953587054e-05, + "loss": 7.6393, + "step": 637 + }, + { + "epoch": 0.0037943667332762396, + "grad_norm": 2.6270666122436523, + "learning_rate": 4.999822939882863e-05, + "loss": 7.8246, + "step": 638 + }, + { + "epoch": 0.003800314016557237, + "grad_norm": 3.300494909286499, + "learning_rate": 4.9998223835343596e-05, + "loss": 7.4991, + "step": 639 + }, + { + "epoch": 0.0038062612998382337, + "grad_norm": 2.726902723312378, + "learning_rate": 4.9998218263131925e-05, + "loss": 7.6663, + "step": 640 + }, + { + "epoch": 0.003812208583119231, + "grad_norm": 2.8147871494293213, + "learning_rate": 4.9998212682193645e-05, + "loss": 7.5272, + "step": 641 + }, + { + "epoch": 0.003818155866400228, + "grad_norm": 2.324422597885132, + "learning_rate": 4.9998207092528745e-05, + "loss": 7.6577, + "step": 642 + }, + { + "epoch": 0.0038241031496812255, + "grad_norm": 2.4525058269500732, + "learning_rate": 4.999820149413723e-05, + "loss": 7.6793, + "step": 643 + }, + { + "epoch": 0.0038300504329622227, + "grad_norm": 2.4011337757110596, + "learning_rate": 4.9998195887019094e-05, + "loss": 7.4869, + "step": 644 + }, + { + "epoch": 0.00383599771624322, + "grad_norm": 2.3403005599975586, + "learning_rate": 4.9998190271174364e-05, + "loss": 7.9552, + "step": 645 + }, + { + "epoch": 0.003841944999524217, + "grad_norm": 2.1421074867248535, + "learning_rate": 4.9998184646603005e-05, + "loss": 7.4021, + "step": 646 + }, + { + "epoch": 0.0038478922828052145, + "grad_norm": 2.4157450199127197, + "learning_rate": 4.9998179013305046e-05, + "loss": 7.6666, + "step": 647 + }, + { + "epoch": 0.0038538395660862117, + "grad_norm": 2.737692356109619, + "learning_rate": 4.999817337128048e-05, + "loss": 7.7441, + "step": 648 + }, + { + "epoch": 0.003859786849367209, + "grad_norm": 3.2240428924560547, + "learning_rate": 4.999816772052931e-05, + "loss": 7.5691, + "step": 649 + }, + { + "epoch": 0.0038657341326482062, + "grad_norm": 2.8538997173309326, + "learning_rate": 4.9998162061051534e-05, + "loss": 7.4994, + "step": 650 + }, + { + "epoch": 0.0038716814159292035, + "grad_norm": 2.6562373638153076, + "learning_rate": 4.9998156392847164e-05, + "loss": 7.5156, + "step": 651 + }, + { + "epoch": 0.0038776286992102007, + "grad_norm": 2.5513811111450195, + "learning_rate": 4.999815071591619e-05, + "loss": 7.6503, + "step": 652 + }, + { + "epoch": 0.003883575982491198, + "grad_norm": 2.4196572303771973, + "learning_rate": 4.999814503025863e-05, + "loss": 7.9868, + "step": 653 + }, + { + "epoch": 0.0038895232657721952, + "grad_norm": 3.0201921463012695, + "learning_rate": 4.999813933587447e-05, + "loss": 7.5405, + "step": 654 + }, + { + "epoch": 0.0038954705490531925, + "grad_norm": 2.352625846862793, + "learning_rate": 4.9998133632763714e-05, + "loss": 7.5461, + "step": 655 + }, + { + "epoch": 0.0039014178323341898, + "grad_norm": 2.5318710803985596, + "learning_rate": 4.999812792092637e-05, + "loss": 7.5596, + "step": 656 + }, + { + "epoch": 0.003907365115615187, + "grad_norm": 2.710785388946533, + "learning_rate": 4.9998122200362444e-05, + "loss": 7.4828, + "step": 657 + }, + { + "epoch": 0.003913312398896184, + "grad_norm": 2.7441353797912598, + "learning_rate": 4.999811647107192e-05, + "loss": 7.2496, + "step": 658 + }, + { + "epoch": 0.0039192596821771815, + "grad_norm": 2.4602885246276855, + "learning_rate": 4.9998110733054824e-05, + "loss": 7.6134, + "step": 659 + }, + { + "epoch": 0.003925206965458178, + "grad_norm": 2.6842973232269287, + "learning_rate": 4.999810498631114e-05, + "loss": 7.3544, + "step": 660 + }, + { + "epoch": 0.003931154248739176, + "grad_norm": 2.8062961101531982, + "learning_rate": 4.9998099230840875e-05, + "loss": 7.5162, + "step": 661 + }, + { + "epoch": 0.003937101532020173, + "grad_norm": 4.0753679275512695, + "learning_rate": 4.9998093466644036e-05, + "loss": 7.5241, + "step": 662 + }, + { + "epoch": 0.0039430488153011705, + "grad_norm": 3.0165748596191406, + "learning_rate": 4.999808769372061e-05, + "loss": 7.5313, + "step": 663 + }, + { + "epoch": 0.003948996098582167, + "grad_norm": 2.73825740814209, + "learning_rate": 4.9998081912070623e-05, + "loss": 7.4433, + "step": 664 + }, + { + "epoch": 0.003954943381863165, + "grad_norm": 2.6649749279022217, + "learning_rate": 4.9998076121694056e-05, + "loss": 7.4852, + "step": 665 + }, + { + "epoch": 0.003960890665144162, + "grad_norm": 2.609389066696167, + "learning_rate": 4.999807032259092e-05, + "loss": 7.4127, + "step": 666 + }, + { + "epoch": 0.0039668379484251596, + "grad_norm": 2.50502610206604, + "learning_rate": 4.999806451476122e-05, + "loss": 7.3113, + "step": 667 + }, + { + "epoch": 0.003972785231706156, + "grad_norm": 2.565142869949341, + "learning_rate": 4.999805869820495e-05, + "loss": 7.1875, + "step": 668 + }, + { + "epoch": 0.003978732514987154, + "grad_norm": 2.582742214202881, + "learning_rate": 4.9998052872922117e-05, + "loss": 7.3251, + "step": 669 + }, + { + "epoch": 0.003984679798268151, + "grad_norm": 2.718780279159546, + "learning_rate": 4.999804703891272e-05, + "loss": 7.3599, + "step": 670 + }, + { + "epoch": 0.003990627081549149, + "grad_norm": 2.5971410274505615, + "learning_rate": 4.999804119617677e-05, + "loss": 7.2304, + "step": 671 + }, + { + "epoch": 0.003996574364830145, + "grad_norm": 2.5905725955963135, + "learning_rate": 4.9998035344714255e-05, + "loss": 7.3664, + "step": 672 + }, + { + "epoch": 0.004002521648111143, + "grad_norm": 2.659102439880371, + "learning_rate": 4.999802948452519e-05, + "loss": 7.4296, + "step": 673 + }, + { + "epoch": 0.00400846893139214, + "grad_norm": 2.5933544635772705, + "learning_rate": 4.999802361560957e-05, + "loss": 7.4605, + "step": 674 + }, + { + "epoch": 0.004014416214673138, + "grad_norm": 3.3860044479370117, + "learning_rate": 4.999801773796739e-05, + "loss": 7.5159, + "step": 675 + }, + { + "epoch": 0.004020363497954134, + "grad_norm": 3.742635726928711, + "learning_rate": 4.9998011851598666e-05, + "loss": 7.4988, + "step": 676 + }, + { + "epoch": 0.004026310781235132, + "grad_norm": 3.5960240364074707, + "learning_rate": 4.999800595650339e-05, + "loss": 7.4607, + "step": 677 + }, + { + "epoch": 0.004032258064516129, + "grad_norm": 2.654444694519043, + "learning_rate": 4.9998000052681585e-05, + "loss": 7.2166, + "step": 678 + }, + { + "epoch": 0.004038205347797127, + "grad_norm": 2.4538326263427734, + "learning_rate": 4.999799414013322e-05, + "loss": 7.2334, + "step": 679 + }, + { + "epoch": 0.004044152631078123, + "grad_norm": 2.5899672508239746, + "learning_rate": 4.9997988218858316e-05, + "loss": 7.2754, + "step": 680 + }, + { + "epoch": 0.004050099914359121, + "grad_norm": 2.721224069595337, + "learning_rate": 4.999798228885687e-05, + "loss": 7.188, + "step": 681 + }, + { + "epoch": 0.004056047197640118, + "grad_norm": 6.5863189697265625, + "learning_rate": 4.9997976350128894e-05, + "loss": 7.369, + "step": 682 + }, + { + "epoch": 0.004061994480921116, + "grad_norm": 2.6562674045562744, + "learning_rate": 4.999797040267438e-05, + "loss": 7.176, + "step": 683 + }, + { + "epoch": 0.0040679417642021124, + "grad_norm": 2.503666877746582, + "learning_rate": 4.9997964446493326e-05, + "loss": 7.2765, + "step": 684 + }, + { + "epoch": 0.00407388904748311, + "grad_norm": 9.070426940917969, + "learning_rate": 4.9997958481585756e-05, + "loss": 7.5187, + "step": 685 + }, + { + "epoch": 0.004079836330764107, + "grad_norm": 2.7480480670928955, + "learning_rate": 4.9997952507951645e-05, + "loss": 7.5244, + "step": 686 + }, + { + "epoch": 0.004085783614045104, + "grad_norm": 3.8338348865509033, + "learning_rate": 4.999794652559101e-05, + "loss": 7.6672, + "step": 687 + }, + { + "epoch": 0.0040917308973261015, + "grad_norm": 3.1132454872131348, + "learning_rate": 4.999794053450385e-05, + "loss": 7.9594, + "step": 688 + }, + { + "epoch": 0.004097678180607098, + "grad_norm": 2.6279757022857666, + "learning_rate": 4.999793453469017e-05, + "loss": 7.4737, + "step": 689 + }, + { + "epoch": 0.004103625463888096, + "grad_norm": 3.440145492553711, + "learning_rate": 4.9997928526149966e-05, + "loss": 7.2968, + "step": 690 + }, + { + "epoch": 0.004109572747169093, + "grad_norm": 2.3300867080688477, + "learning_rate": 4.9997922508883244e-05, + "loss": 7.3693, + "step": 691 + }, + { + "epoch": 0.0041155200304500905, + "grad_norm": 2.9034078121185303, + "learning_rate": 4.999791648289001e-05, + "loss": 7.7227, + "step": 692 + }, + { + "epoch": 0.004121467313731087, + "grad_norm": 2.5685503482818604, + "learning_rate": 4.9997910448170254e-05, + "loss": 7.9706, + "step": 693 + }, + { + "epoch": 0.004127414597012085, + "grad_norm": 3.260779619216919, + "learning_rate": 4.9997904404723986e-05, + "loss": 7.7231, + "step": 694 + }, + { + "epoch": 0.004133361880293082, + "grad_norm": 2.668193817138672, + "learning_rate": 4.999789835255121e-05, + "loss": 7.7677, + "step": 695 + }, + { + "epoch": 0.0041393091635740795, + "grad_norm": 2.545276641845703, + "learning_rate": 4.999789229165193e-05, + "loss": 7.9297, + "step": 696 + }, + { + "epoch": 0.004145256446855076, + "grad_norm": 3.2137503623962402, + "learning_rate": 4.9997886222026146e-05, + "loss": 7.697, + "step": 697 + }, + { + "epoch": 0.004151203730136074, + "grad_norm": 2.7501730918884277, + "learning_rate": 4.999788014367385e-05, + "loss": 7.3686, + "step": 698 + }, + { + "epoch": 0.004157151013417071, + "grad_norm": 2.2456486225128174, + "learning_rate": 4.9997874056595055e-05, + "loss": 7.7238, + "step": 699 + }, + { + "epoch": 0.0041630982966980685, + "grad_norm": 2.3958070278167725, + "learning_rate": 4.9997867960789764e-05, + "loss": 7.8349, + "step": 700 + }, + { + "epoch": 0.004169045579979065, + "grad_norm": 2.509744644165039, + "learning_rate": 4.9997861856257974e-05, + "loss": 7.5884, + "step": 701 + }, + { + "epoch": 0.004174992863260063, + "grad_norm": 3.6095783710479736, + "learning_rate": 4.9997855742999684e-05, + "loss": 7.4726, + "step": 702 + }, + { + "epoch": 0.00418094014654106, + "grad_norm": 3.3515326976776123, + "learning_rate": 4.99978496210149e-05, + "loss": 7.5214, + "step": 703 + }, + { + "epoch": 0.0041868874298220575, + "grad_norm": 4.7553791999816895, + "learning_rate": 4.999784349030363e-05, + "loss": 7.4577, + "step": 704 + }, + { + "epoch": 0.004192834713103054, + "grad_norm": 5.959117412567139, + "learning_rate": 4.9997837350865874e-05, + "loss": 7.2559, + "step": 705 + }, + { + "epoch": 0.004198781996384052, + "grad_norm": 2.9650065898895264, + "learning_rate": 4.999783120270163e-05, + "loss": 7.3712, + "step": 706 + }, + { + "epoch": 0.004204729279665049, + "grad_norm": 3.4171416759490967, + "learning_rate": 4.9997825045810895e-05, + "loss": 7.5014, + "step": 707 + }, + { + "epoch": 0.0042106765629460466, + "grad_norm": 3.297393798828125, + "learning_rate": 4.9997818880193684e-05, + "loss": 7.4553, + "step": 708 + }, + { + "epoch": 0.004216623846227043, + "grad_norm": 3.193859338760376, + "learning_rate": 4.999781270584999e-05, + "loss": 7.3414, + "step": 709 + }, + { + "epoch": 0.004222571129508041, + "grad_norm": 2.5028324127197266, + "learning_rate": 4.999780652277982e-05, + "loss": 7.4615, + "step": 710 + }, + { + "epoch": 0.004228518412789038, + "grad_norm": 3.43390154838562, + "learning_rate": 4.999780033098317e-05, + "loss": 7.3801, + "step": 711 + }, + { + "epoch": 0.004234465696070036, + "grad_norm": 3.3093984127044678, + "learning_rate": 4.999779413046004e-05, + "loss": 7.2938, + "step": 712 + }, + { + "epoch": 0.004240412979351032, + "grad_norm": 2.6643831729888916, + "learning_rate": 4.999778792121046e-05, + "loss": 7.3916, + "step": 713 + }, + { + "epoch": 0.00424636026263203, + "grad_norm": 2.779407501220703, + "learning_rate": 4.999778170323439e-05, + "loss": 7.5783, + "step": 714 + }, + { + "epoch": 0.004252307545913027, + "grad_norm": 2.959345817565918, + "learning_rate": 4.999777547653186e-05, + "loss": 7.9854, + "step": 715 + }, + { + "epoch": 0.004258254829194025, + "grad_norm": 2.9909780025482178, + "learning_rate": 4.9997769241102866e-05, + "loss": 7.997, + "step": 716 + }, + { + "epoch": 0.004264202112475021, + "grad_norm": 3.081831932067871, + "learning_rate": 4.9997762996947405e-05, + "loss": 7.9393, + "step": 717 + }, + { + "epoch": 0.004270149395756018, + "grad_norm": 2.8901429176330566, + "learning_rate": 4.9997756744065485e-05, + "loss": 7.8152, + "step": 718 + }, + { + "epoch": 0.004276096679037016, + "grad_norm": 3.3065547943115234, + "learning_rate": 4.9997750482457106e-05, + "loss": 7.1176, + "step": 719 + }, + { + "epoch": 0.004282043962318013, + "grad_norm": 3.1083710193634033, + "learning_rate": 4.9997744212122276e-05, + "loss": 7.6215, + "step": 720 + }, + { + "epoch": 0.00428799124559901, + "grad_norm": 4.010551452636719, + "learning_rate": 4.9997737933060987e-05, + "loss": 7.7665, + "step": 721 + }, + { + "epoch": 0.004293938528880007, + "grad_norm": 3.9287984371185303, + "learning_rate": 4.9997731645273245e-05, + "loss": 7.7185, + "step": 722 + }, + { + "epoch": 0.004299885812161005, + "grad_norm": 2.7739338874816895, + "learning_rate": 4.999772534875905e-05, + "loss": 7.7226, + "step": 723 + }, + { + "epoch": 0.004305833095442002, + "grad_norm": 2.675567865371704, + "learning_rate": 4.9997719043518414e-05, + "loss": 7.686, + "step": 724 + }, + { + "epoch": 0.0043117803787229994, + "grad_norm": 3.8513898849487305, + "learning_rate": 4.999771272955133e-05, + "loss": 7.6584, + "step": 725 + }, + { + "epoch": 0.004317727662003996, + "grad_norm": 10.309504508972168, + "learning_rate": 4.99977064068578e-05, + "loss": 7.4006, + "step": 726 + }, + { + "epoch": 0.004323674945284994, + "grad_norm": 2.712939977645874, + "learning_rate": 4.9997700075437836e-05, + "loss": 7.6275, + "step": 727 + }, + { + "epoch": 0.004329622228565991, + "grad_norm": 2.7880115509033203, + "learning_rate": 4.999769373529143e-05, + "loss": 7.4154, + "step": 728 + }, + { + "epoch": 0.0043355695118469885, + "grad_norm": 3.2352819442749023, + "learning_rate": 4.999768738641859e-05, + "loss": 7.4827, + "step": 729 + }, + { + "epoch": 0.004341516795127985, + "grad_norm": 3.5176644325256348, + "learning_rate": 4.999768102881931e-05, + "loss": 7.4748, + "step": 730 + }, + { + "epoch": 0.004347464078408983, + "grad_norm": 2.996829032897949, + "learning_rate": 4.99976746624936e-05, + "loss": 7.445, + "step": 731 + }, + { + "epoch": 0.00435341136168998, + "grad_norm": 4.5892534255981445, + "learning_rate": 4.9997668287441454e-05, + "loss": 7.6464, + "step": 732 + }, + { + "epoch": 0.0043593586449709775, + "grad_norm": 3.689419984817505, + "learning_rate": 4.999766190366289e-05, + "loss": 7.4215, + "step": 733 + }, + { + "epoch": 0.004365305928251974, + "grad_norm": 2.9146885871887207, + "learning_rate": 4.9997655511157896e-05, + "loss": 7.4852, + "step": 734 + }, + { + "epoch": 0.004371253211532972, + "grad_norm": 3.8503024578094482, + "learning_rate": 4.9997649109926484e-05, + "loss": 7.4779, + "step": 735 + }, + { + "epoch": 0.004377200494813969, + "grad_norm": 3.929422616958618, + "learning_rate": 4.9997642699968646e-05, + "loss": 7.3526, + "step": 736 + }, + { + "epoch": 0.0043831477780949665, + "grad_norm": 3.3365838527679443, + "learning_rate": 4.999763628128439e-05, + "loss": 7.3895, + "step": 737 + }, + { + "epoch": 0.004389095061375963, + "grad_norm": 3.147660970687866, + "learning_rate": 4.999762985387372e-05, + "loss": 7.1885, + "step": 738 + }, + { + "epoch": 0.004395042344656961, + "grad_norm": 3.3230104446411133, + "learning_rate": 4.9997623417736626e-05, + "loss": 7.5839, + "step": 739 + }, + { + "epoch": 0.004400989627937958, + "grad_norm": 3.285144090652466, + "learning_rate": 4.999761697287313e-05, + "loss": 7.4859, + "step": 740 + }, + { + "epoch": 0.0044069369112189555, + "grad_norm": 3.3811442852020264, + "learning_rate": 4.9997610519283216e-05, + "loss": 7.4871, + "step": 741 + }, + { + "epoch": 0.004412884194499952, + "grad_norm": 2.9662907123565674, + "learning_rate": 4.9997604056966904e-05, + "loss": 7.2546, + "step": 742 + }, + { + "epoch": 0.00441883147778095, + "grad_norm": 3.1432855129241943, + "learning_rate": 4.999759758592418e-05, + "loss": 7.5273, + "step": 743 + }, + { + "epoch": 0.004424778761061947, + "grad_norm": 3.0559749603271484, + "learning_rate": 4.9997591106155054e-05, + "loss": 7.0754, + "step": 744 + }, + { + "epoch": 0.0044307260443429445, + "grad_norm": 2.6778409481048584, + "learning_rate": 4.999758461765953e-05, + "loss": 7.1723, + "step": 745 + }, + { + "epoch": 0.004436673327623941, + "grad_norm": 2.592228412628174, + "learning_rate": 4.9997578120437606e-05, + "loss": 7.2671, + "step": 746 + }, + { + "epoch": 0.004442620610904939, + "grad_norm": 2.5546112060546875, + "learning_rate": 4.999757161448928e-05, + "loss": 7.2571, + "step": 747 + }, + { + "epoch": 0.004448567894185936, + "grad_norm": 2.745755672454834, + "learning_rate": 4.999756509981457e-05, + "loss": 7.3895, + "step": 748 + }, + { + "epoch": 0.004454515177466933, + "grad_norm": 2.9785144329071045, + "learning_rate": 4.999755857641346e-05, + "loss": 7.2431, + "step": 749 + }, + { + "epoch": 0.00446046246074793, + "grad_norm": 2.918891191482544, + "learning_rate": 4.9997552044285965e-05, + "loss": 7.3805, + "step": 750 + }, + { + "epoch": 0.004466409744028927, + "grad_norm": 2.7858519554138184, + "learning_rate": 4.999754550343209e-05, + "loss": 7.5942, + "step": 751 + }, + { + "epoch": 0.004472357027309925, + "grad_norm": 2.7758638858795166, + "learning_rate": 4.999753895385181e-05, + "loss": 7.5896, + "step": 752 + }, + { + "epoch": 0.004478304310590922, + "grad_norm": 2.7125916481018066, + "learning_rate": 4.999753239554517e-05, + "loss": 7.4341, + "step": 753 + }, + { + "epoch": 0.004484251593871919, + "grad_norm": 4.241726875305176, + "learning_rate": 4.999752582851214e-05, + "loss": 7.0517, + "step": 754 + }, + { + "epoch": 0.004490198877152916, + "grad_norm": 2.9547781944274902, + "learning_rate": 4.999751925275272e-05, + "loss": 7.2616, + "step": 755 + }, + { + "epoch": 0.004496146160433914, + "grad_norm": 4.2594122886657715, + "learning_rate": 4.9997512668266945e-05, + "loss": 7.3069, + "step": 756 + }, + { + "epoch": 0.004502093443714911, + "grad_norm": 4.1758246421813965, + "learning_rate": 4.9997506075054776e-05, + "loss": 7.3417, + "step": 757 + }, + { + "epoch": 0.004508040726995908, + "grad_norm": 2.8398962020874023, + "learning_rate": 4.999749947311625e-05, + "loss": 7.107, + "step": 758 + }, + { + "epoch": 0.004513988010276905, + "grad_norm": 3.487478017807007, + "learning_rate": 4.9997492862451354e-05, + "loss": 7.0014, + "step": 759 + }, + { + "epoch": 0.004519935293557903, + "grad_norm": 2.883409261703491, + "learning_rate": 4.999748624306009e-05, + "loss": 7.4691, + "step": 760 + }, + { + "epoch": 0.0045258825768389, + "grad_norm": 3.0092155933380127, + "learning_rate": 4.999747961494246e-05, + "loss": 7.3771, + "step": 761 + }, + { + "epoch": 0.004531829860119897, + "grad_norm": 2.9571943283081055, + "learning_rate": 4.999747297809847e-05, + "loss": 7.4664, + "step": 762 + }, + { + "epoch": 0.004537777143400894, + "grad_norm": 2.7476816177368164, + "learning_rate": 4.999746633252812e-05, + "loss": 7.2943, + "step": 763 + }, + { + "epoch": 0.004543724426681892, + "grad_norm": 4.903059959411621, + "learning_rate": 4.9997459678231415e-05, + "loss": 7.3467, + "step": 764 + }, + { + "epoch": 0.004549671709962889, + "grad_norm": 3.8205373287200928, + "learning_rate": 4.999745301520835e-05, + "loss": 7.2807, + "step": 765 + }, + { + "epoch": 0.0045556189932438864, + "grad_norm": 2.6003127098083496, + "learning_rate": 4.9997446343458934e-05, + "loss": 7.2736, + "step": 766 + }, + { + "epoch": 0.004561566276524883, + "grad_norm": 3.288313627243042, + "learning_rate": 4.999743966298317e-05, + "loss": 7.3832, + "step": 767 + }, + { + "epoch": 0.004567513559805881, + "grad_norm": 3.4839234352111816, + "learning_rate": 4.999743297378106e-05, + "loss": 7.2932, + "step": 768 + }, + { + "epoch": 0.004573460843086878, + "grad_norm": 3.2667462825775146, + "learning_rate": 4.99974262758526e-05, + "loss": 7.4855, + "step": 769 + }, + { + "epoch": 0.0045794081263678755, + "grad_norm": 3.3637850284576416, + "learning_rate": 4.99974195691978e-05, + "loss": 7.4864, + "step": 770 + }, + { + "epoch": 0.004585355409648872, + "grad_norm": 4.691596508026123, + "learning_rate": 4.999741285381666e-05, + "loss": 7.4751, + "step": 771 + }, + { + "epoch": 0.00459130269292987, + "grad_norm": 3.8831942081451416, + "learning_rate": 4.999740612970918e-05, + "loss": 7.4554, + "step": 772 + }, + { + "epoch": 0.004597249976210867, + "grad_norm": 2.9129562377929688, + "learning_rate": 4.999739939687536e-05, + "loss": 7.7096, + "step": 773 + }, + { + "epoch": 0.0046031972594918645, + "grad_norm": 3.928882598876953, + "learning_rate": 4.9997392655315207e-05, + "loss": 7.6453, + "step": 774 + }, + { + "epoch": 0.004609144542772861, + "grad_norm": 4.19191312789917, + "learning_rate": 4.9997385905028726e-05, + "loss": 7.6038, + "step": 775 + }, + { + "epoch": 0.004615091826053859, + "grad_norm": 2.4585883617401123, + "learning_rate": 4.999737914601591e-05, + "loss": 7.5734, + "step": 776 + }, + { + "epoch": 0.004621039109334856, + "grad_norm": 3.500932455062866, + "learning_rate": 4.9997372378276776e-05, + "loss": 7.6535, + "step": 777 + }, + { + "epoch": 0.0046269863926158535, + "grad_norm": 3.1256210803985596, + "learning_rate": 4.9997365601811306e-05, + "loss": 7.4844, + "step": 778 + }, + { + "epoch": 0.00463293367589685, + "grad_norm": 2.083902597427368, + "learning_rate": 4.999735881661952e-05, + "loss": 7.646, + "step": 779 + }, + { + "epoch": 0.004638880959177847, + "grad_norm": 2.2990450859069824, + "learning_rate": 4.999735202270142e-05, + "loss": 7.5756, + "step": 780 + }, + { + "epoch": 0.004644828242458845, + "grad_norm": 2.782463550567627, + "learning_rate": 4.9997345220057004e-05, + "loss": 7.6191, + "step": 781 + }, + { + "epoch": 0.004650775525739842, + "grad_norm": 4.157378673553467, + "learning_rate": 4.9997338408686255e-05, + "loss": 7.5265, + "step": 782 + }, + { + "epoch": 0.004656722809020839, + "grad_norm": 2.850106716156006, + "learning_rate": 4.999733158858921e-05, + "loss": 7.4562, + "step": 783 + }, + { + "epoch": 0.004662670092301836, + "grad_norm": 2.8073840141296387, + "learning_rate": 4.999732475976585e-05, + "loss": 7.3913, + "step": 784 + }, + { + "epoch": 0.004668617375582834, + "grad_norm": 2.85048770904541, + "learning_rate": 4.999731792221618e-05, + "loss": 7.3945, + "step": 785 + }, + { + "epoch": 0.004674564658863831, + "grad_norm": 2.760990619659424, + "learning_rate": 4.999731107594021e-05, + "loss": 7.6088, + "step": 786 + }, + { + "epoch": 0.004680511942144828, + "grad_norm": 2.4395666122436523, + "learning_rate": 4.9997304220937933e-05, + "loss": 7.6996, + "step": 787 + }, + { + "epoch": 0.004686459225425825, + "grad_norm": 2.5826008319854736, + "learning_rate": 4.9997297357209354e-05, + "loss": 7.5888, + "step": 788 + }, + { + "epoch": 0.004692406508706823, + "grad_norm": 3.434957981109619, + "learning_rate": 4.999729048475448e-05, + "loss": 7.4659, + "step": 789 + }, + { + "epoch": 0.00469835379198782, + "grad_norm": 4.103111743927002, + "learning_rate": 4.9997283603573306e-05, + "loss": 7.6704, + "step": 790 + }, + { + "epoch": 0.004704301075268817, + "grad_norm": 3.7879343032836914, + "learning_rate": 4.999727671366584e-05, + "loss": 7.5387, + "step": 791 + }, + { + "epoch": 0.004710248358549814, + "grad_norm": 3.706599235534668, + "learning_rate": 4.999726981503209e-05, + "loss": 7.3413, + "step": 792 + }, + { + "epoch": 0.004716195641830812, + "grad_norm": 2.1999869346618652, + "learning_rate": 4.999726290767204e-05, + "loss": 7.1809, + "step": 793 + }, + { + "epoch": 0.004722142925111809, + "grad_norm": 2.8561251163482666, + "learning_rate": 4.999725599158571e-05, + "loss": 7.3496, + "step": 794 + }, + { + "epoch": 0.004728090208392806, + "grad_norm": 3.0696613788604736, + "learning_rate": 4.99972490667731e-05, + "loss": 7.542, + "step": 795 + }, + { + "epoch": 0.004734037491673803, + "grad_norm": 2.706404685974121, + "learning_rate": 4.99972421332342e-05, + "loss": 7.4233, + "step": 796 + }, + { + "epoch": 0.004739984774954801, + "grad_norm": 2.388360023498535, + "learning_rate": 4.9997235190969025e-05, + "loss": 7.5754, + "step": 797 + }, + { + "epoch": 0.004745932058235798, + "grad_norm": 2.3414177894592285, + "learning_rate": 4.999722823997758e-05, + "loss": 7.438, + "step": 798 + }, + { + "epoch": 0.004751879341516795, + "grad_norm": 2.46012544631958, + "learning_rate": 4.999722128025985e-05, + "loss": 6.9522, + "step": 799 + }, + { + "epoch": 0.004757826624797792, + "grad_norm": 2.5721335411071777, + "learning_rate": 4.9997214311815855e-05, + "loss": 6.9632, + "step": 800 + }, + { + "epoch": 0.00476377390807879, + "grad_norm": 2.4028279781341553, + "learning_rate": 4.999720733464559e-05, + "loss": 7.3834, + "step": 801 + }, + { + "epoch": 0.004769721191359787, + "grad_norm": 2.378971576690674, + "learning_rate": 4.9997200348749055e-05, + "loss": 7.7919, + "step": 802 + }, + { + "epoch": 0.004775668474640784, + "grad_norm": 2.1871516704559326, + "learning_rate": 4.999719335412626e-05, + "loss": 7.6832, + "step": 803 + }, + { + "epoch": 0.004781615757921781, + "grad_norm": 2.4183239936828613, + "learning_rate": 4.9997186350777206e-05, + "loss": 7.5013, + "step": 804 + }, + { + "epoch": 0.004787563041202779, + "grad_norm": 2.2322120666503906, + "learning_rate": 4.9997179338701884e-05, + "loss": 7.4224, + "step": 805 + }, + { + "epoch": 0.004793510324483776, + "grad_norm": 3.2633447647094727, + "learning_rate": 4.99971723179003e-05, + "loss": 7.1966, + "step": 806 + }, + { + "epoch": 0.004799457607764773, + "grad_norm": 3.1195995807647705, + "learning_rate": 4.999716528837247e-05, + "loss": 7.4057, + "step": 807 + }, + { + "epoch": 0.00480540489104577, + "grad_norm": 2.6904098987579346, + "learning_rate": 4.9997158250118395e-05, + "loss": 7.4585, + "step": 808 + }, + { + "epoch": 0.004811352174326768, + "grad_norm": 2.6955599784851074, + "learning_rate": 4.999715120313806e-05, + "loss": 7.6053, + "step": 809 + }, + { + "epoch": 0.004817299457607765, + "grad_norm": 3.569037675857544, + "learning_rate": 4.999714414743148e-05, + "loss": 7.5085, + "step": 810 + }, + { + "epoch": 0.004823246740888762, + "grad_norm": 3.5231528282165527, + "learning_rate": 4.9997137082998655e-05, + "loss": 7.4554, + "step": 811 + }, + { + "epoch": 0.004829194024169759, + "grad_norm": 2.7118120193481445, + "learning_rate": 4.999713000983959e-05, + "loss": 7.4323, + "step": 812 + }, + { + "epoch": 0.004835141307450756, + "grad_norm": 3.229548931121826, + "learning_rate": 4.9997122927954284e-05, + "loss": 7.3098, + "step": 813 + }, + { + "epoch": 0.004841088590731754, + "grad_norm": 2.4224696159362793, + "learning_rate": 4.999711583734273e-05, + "loss": 7.3488, + "step": 814 + }, + { + "epoch": 0.004847035874012751, + "grad_norm": 2.627565383911133, + "learning_rate": 4.999710873800496e-05, + "loss": 7.457, + "step": 815 + }, + { + "epoch": 0.004852983157293748, + "grad_norm": 2.5339515209198, + "learning_rate": 4.999710162994094e-05, + "loss": 7.6602, + "step": 816 + }, + { + "epoch": 0.004858930440574745, + "grad_norm": 2.663694143295288, + "learning_rate": 4.9997094513150706e-05, + "loss": 7.1064, + "step": 817 + }, + { + "epoch": 0.004864877723855743, + "grad_norm": 2.372504472732544, + "learning_rate": 4.9997087387634234e-05, + "loss": 7.341, + "step": 818 + }, + { + "epoch": 0.00487082500713674, + "grad_norm": 2.145191192626953, + "learning_rate": 4.999708025339154e-05, + "loss": 7.3216, + "step": 819 + }, + { + "epoch": 0.004876772290417737, + "grad_norm": 2.39685320854187, + "learning_rate": 4.9997073110422626e-05, + "loss": 7.3463, + "step": 820 + }, + { + "epoch": 0.004882719573698734, + "grad_norm": 2.2227275371551514, + "learning_rate": 4.999706595872749e-05, + "loss": 7.2517, + "step": 821 + }, + { + "epoch": 0.004888666856979732, + "grad_norm": 2.7770352363586426, + "learning_rate": 4.999705879830614e-05, + "loss": 7.3117, + "step": 822 + }, + { + "epoch": 0.004894614140260729, + "grad_norm": 2.448026180267334, + "learning_rate": 4.999705162915857e-05, + "loss": 6.9883, + "step": 823 + }, + { + "epoch": 0.004900561423541726, + "grad_norm": 2.2304437160491943, + "learning_rate": 4.999704445128479e-05, + "loss": 7.2644, + "step": 824 + }, + { + "epoch": 0.004906508706822723, + "grad_norm": 2.351707696914673, + "learning_rate": 4.9997037264684796e-05, + "loss": 7.1984, + "step": 825 + }, + { + "epoch": 0.004912455990103721, + "grad_norm": 2.7631921768188477, + "learning_rate": 4.99970300693586e-05, + "loss": 7.3774, + "step": 826 + }, + { + "epoch": 0.004918403273384718, + "grad_norm": 2.4636785984039307, + "learning_rate": 4.9997022865306195e-05, + "loss": 7.3778, + "step": 827 + }, + { + "epoch": 0.004924350556665715, + "grad_norm": 3.5510878562927246, + "learning_rate": 4.999701565252759e-05, + "loss": 7.166, + "step": 828 + }, + { + "epoch": 0.004930297839946712, + "grad_norm": 3.2581429481506348, + "learning_rate": 4.999700843102278e-05, + "loss": 7.286, + "step": 829 + }, + { + "epoch": 0.00493624512322771, + "grad_norm": 2.4304182529449463, + "learning_rate": 4.999700120079178e-05, + "loss": 7.5076, + "step": 830 + }, + { + "epoch": 0.004942192406508707, + "grad_norm": 2.428854465484619, + "learning_rate": 4.999699396183458e-05, + "loss": 7.405, + "step": 831 + }, + { + "epoch": 0.004948139689789704, + "grad_norm": 2.7680416107177734, + "learning_rate": 4.9996986714151195e-05, + "loss": 7.4944, + "step": 832 + }, + { + "epoch": 0.004954086973070701, + "grad_norm": 2.6787109375, + "learning_rate": 4.999697945774161e-05, + "loss": 7.5946, + "step": 833 + }, + { + "epoch": 0.004960034256351699, + "grad_norm": 2.6396615505218506, + "learning_rate": 4.9996972192605845e-05, + "loss": 7.5405, + "step": 834 + }, + { + "epoch": 0.004965981539632696, + "grad_norm": 2.89387583732605, + "learning_rate": 4.999696491874389e-05, + "loss": 7.3809, + "step": 835 + }, + { + "epoch": 0.004971928822913693, + "grad_norm": 2.332838535308838, + "learning_rate": 4.999695763615576e-05, + "loss": 7.3638, + "step": 836 + }, + { + "epoch": 0.00497787610619469, + "grad_norm": 2.2880585193634033, + "learning_rate": 4.9996950344841444e-05, + "loss": 7.3557, + "step": 837 + }, + { + "epoch": 0.004983823389475688, + "grad_norm": 2.7478256225585938, + "learning_rate": 4.999694304480096e-05, + "loss": 7.4, + "step": 838 + }, + { + "epoch": 0.004989770672756685, + "grad_norm": 3.4789531230926514, + "learning_rate": 4.999693573603429e-05, + "loss": 7.4438, + "step": 839 + }, + { + "epoch": 0.004995717956037682, + "grad_norm": 2.7377078533172607, + "learning_rate": 4.9996928418541455e-05, + "loss": 7.4074, + "step": 840 + }, + { + "epoch": 0.005001665239318679, + "grad_norm": 3.04420804977417, + "learning_rate": 4.9996921092322444e-05, + "loss": 7.3834, + "step": 841 + }, + { + "epoch": 0.005007612522599676, + "grad_norm": 2.759244203567505, + "learning_rate": 4.999691375737727e-05, + "loss": 7.4492, + "step": 842 + }, + { + "epoch": 0.005013559805880674, + "grad_norm": 2.5327556133270264, + "learning_rate": 4.9996906413705933e-05, + "loss": 7.4403, + "step": 843 + }, + { + "epoch": 0.0050195070891616705, + "grad_norm": 2.8170409202575684, + "learning_rate": 4.9996899061308434e-05, + "loss": 7.623, + "step": 844 + }, + { + "epoch": 0.005025454372442668, + "grad_norm": 3.8642547130584717, + "learning_rate": 4.9996891700184774e-05, + "loss": 7.6099, + "step": 845 + }, + { + "epoch": 0.005031401655723665, + "grad_norm": 4.704552173614502, + "learning_rate": 4.999688433033496e-05, + "loss": 7.6755, + "step": 846 + }, + { + "epoch": 0.005037348939004663, + "grad_norm": 4.128530979156494, + "learning_rate": 4.9996876951758986e-05, + "loss": 7.5246, + "step": 847 + }, + { + "epoch": 0.0050432962222856596, + "grad_norm": 2.233447551727295, + "learning_rate": 4.9996869564456865e-05, + "loss": 7.1139, + "step": 848 + }, + { + "epoch": 0.005049243505566657, + "grad_norm": 5.96085262298584, + "learning_rate": 4.999686216842859e-05, + "loss": 7.4114, + "step": 849 + }, + { + "epoch": 0.005055190788847654, + "grad_norm": 4.828244686126709, + "learning_rate": 4.9996854763674175e-05, + "loss": 7.6743, + "step": 850 + }, + { + "epoch": 0.005061138072128652, + "grad_norm": 3.0259342193603516, + "learning_rate": 4.999684735019362e-05, + "loss": 7.7537, + "step": 851 + }, + { + "epoch": 0.005067085355409649, + "grad_norm": 2.807244062423706, + "learning_rate": 4.999683992798692e-05, + "loss": 7.7744, + "step": 852 + }, + { + "epoch": 0.005073032638690646, + "grad_norm": 2.81384015083313, + "learning_rate": 4.999683249705408e-05, + "loss": 7.2922, + "step": 853 + }, + { + "epoch": 0.005078979921971643, + "grad_norm": 2.582836627960205, + "learning_rate": 4.9996825057395105e-05, + "loss": 7.3421, + "step": 854 + }, + { + "epoch": 0.005084927205252641, + "grad_norm": 2.190634250640869, + "learning_rate": 4.9996817609009996e-05, + "loss": 7.6249, + "step": 855 + }, + { + "epoch": 0.005090874488533638, + "grad_norm": 2.3322219848632812, + "learning_rate": 4.999681015189875e-05, + "loss": 7.4695, + "step": 856 + }, + { + "epoch": 0.005096821771814635, + "grad_norm": 2.5582947731018066, + "learning_rate": 4.9996802686061384e-05, + "loss": 7.2747, + "step": 857 + }, + { + "epoch": 0.005102769055095632, + "grad_norm": 3.192093849182129, + "learning_rate": 4.999679521149789e-05, + "loss": 7.504, + "step": 858 + }, + { + "epoch": 0.00510871633837663, + "grad_norm": 4.1585588455200195, + "learning_rate": 4.999678772820827e-05, + "loss": 7.5966, + "step": 859 + }, + { + "epoch": 0.005114663621657627, + "grad_norm": 5.052750587463379, + "learning_rate": 4.999678023619253e-05, + "loss": 7.3243, + "step": 860 + }, + { + "epoch": 0.005120610904938624, + "grad_norm": 2.395909070968628, + "learning_rate": 4.999677273545068e-05, + "loss": 7.4477, + "step": 861 + }, + { + "epoch": 0.005126558188219621, + "grad_norm": 2.487334966659546, + "learning_rate": 4.999676522598271e-05, + "loss": 7.591, + "step": 862 + }, + { + "epoch": 0.005132505471500619, + "grad_norm": 3.7094171047210693, + "learning_rate": 4.999675770778863e-05, + "loss": 7.5387, + "step": 863 + }, + { + "epoch": 0.005138452754781616, + "grad_norm": 4.468298435211182, + "learning_rate": 4.9996750180868435e-05, + "loss": 7.5754, + "step": 864 + }, + { + "epoch": 0.005144400038062613, + "grad_norm": 3.2769386768341064, + "learning_rate": 4.999674264522213e-05, + "loss": 7.459, + "step": 865 + }, + { + "epoch": 0.00515034732134361, + "grad_norm": 2.7162864208221436, + "learning_rate": 4.9996735100849726e-05, + "loss": 7.3473, + "step": 866 + }, + { + "epoch": 0.005156294604624608, + "grad_norm": 3.646401882171631, + "learning_rate": 4.999672754775122e-05, + "loss": 7.4446, + "step": 867 + }, + { + "epoch": 0.005162241887905605, + "grad_norm": 8.917684555053711, + "learning_rate": 4.999671998592662e-05, + "loss": 7.2016, + "step": 868 + }, + { + "epoch": 0.005168189171186602, + "grad_norm": 2.949993133544922, + "learning_rate": 4.999671241537591e-05, + "loss": 7.3081, + "step": 869 + }, + { + "epoch": 0.005174136454467599, + "grad_norm": 2.4531025886535645, + "learning_rate": 4.999670483609912e-05, + "loss": 7.402, + "step": 870 + }, + { + "epoch": 0.005180083737748597, + "grad_norm": 3.1903798580169678, + "learning_rate": 4.999669724809623e-05, + "loss": 7.2514, + "step": 871 + }, + { + "epoch": 0.005186031021029594, + "grad_norm": 3.461353302001953, + "learning_rate": 4.999668965136726e-05, + "loss": 7.1637, + "step": 872 + }, + { + "epoch": 0.005191978304310591, + "grad_norm": 2.623075246810913, + "learning_rate": 4.9996682045912194e-05, + "loss": 7.5482, + "step": 873 + }, + { + "epoch": 0.005197925587591588, + "grad_norm": 2.9072840213775635, + "learning_rate": 4.9996674431731044e-05, + "loss": 7.484, + "step": 874 + }, + { + "epoch": 0.005203872870872585, + "grad_norm": 3.0219666957855225, + "learning_rate": 4.999666680882382e-05, + "loss": 7.5223, + "step": 875 + }, + { + "epoch": 0.005209820154153583, + "grad_norm": 2.9892475605010986, + "learning_rate": 4.9996659177190514e-05, + "loss": 7.3843, + "step": 876 + }, + { + "epoch": 0.0052157674374345795, + "grad_norm": 2.6199591159820557, + "learning_rate": 4.9996651536831126e-05, + "loss": 7.2728, + "step": 877 + }, + { + "epoch": 0.005221714720715577, + "grad_norm": 2.6897647380828857, + "learning_rate": 4.999664388774567e-05, + "loss": 7.5323, + "step": 878 + }, + { + "epoch": 0.005227662003996574, + "grad_norm": 3.5945560932159424, + "learning_rate": 4.9996636229934155e-05, + "loss": 7.5001, + "step": 879 + }, + { + "epoch": 0.005233609287277572, + "grad_norm": 2.9064812660217285, + "learning_rate": 4.9996628563396563e-05, + "loss": 7.5463, + "step": 880 + }, + { + "epoch": 0.0052395565705585685, + "grad_norm": 3.6150660514831543, + "learning_rate": 4.999662088813291e-05, + "loss": 7.6596, + "step": 881 + }, + { + "epoch": 0.005245503853839566, + "grad_norm": 2.729684591293335, + "learning_rate": 4.99966132041432e-05, + "loss": 7.5342, + "step": 882 + }, + { + "epoch": 0.005251451137120563, + "grad_norm": 2.6782853603363037, + "learning_rate": 4.9996605511427416e-05, + "loss": 7.5837, + "step": 883 + }, + { + "epoch": 0.005257398420401561, + "grad_norm": 4.171568393707275, + "learning_rate": 4.9996597809985576e-05, + "loss": 7.3626, + "step": 884 + }, + { + "epoch": 0.0052633457036825575, + "grad_norm": 2.189725637435913, + "learning_rate": 4.999659009981769e-05, + "loss": 7.5431, + "step": 885 + }, + { + "epoch": 0.005269292986963555, + "grad_norm": 2.2473320960998535, + "learning_rate": 4.999658238092375e-05, + "loss": 7.4731, + "step": 886 + }, + { + "epoch": 0.005275240270244552, + "grad_norm": 3.4393012523651123, + "learning_rate": 4.999657465330376e-05, + "loss": 7.6839, + "step": 887 + }, + { + "epoch": 0.00528118755352555, + "grad_norm": 2.717742919921875, + "learning_rate": 4.9996566916957735e-05, + "loss": 7.6812, + "step": 888 + }, + { + "epoch": 0.0052871348368065466, + "grad_norm": 3.829698085784912, + "learning_rate": 4.9996559171885655e-05, + "loss": 7.4525, + "step": 889 + }, + { + "epoch": 0.005293082120087544, + "grad_norm": 2.764598846435547, + "learning_rate": 4.9996551418087536e-05, + "loss": 7.5379, + "step": 890 + }, + { + "epoch": 0.005299029403368541, + "grad_norm": 2.4230268001556396, + "learning_rate": 4.999654365556338e-05, + "loss": 7.454, + "step": 891 + }, + { + "epoch": 0.005304976686649539, + "grad_norm": 2.31870436668396, + "learning_rate": 4.999653588431319e-05, + "loss": 7.5306, + "step": 892 + }, + { + "epoch": 0.005310923969930536, + "grad_norm": 2.332259178161621, + "learning_rate": 4.999652810433697e-05, + "loss": 7.4008, + "step": 893 + }, + { + "epoch": 0.005316871253211533, + "grad_norm": 2.630568504333496, + "learning_rate": 4.999652031563471e-05, + "loss": 7.4046, + "step": 894 + }, + { + "epoch": 0.00532281853649253, + "grad_norm": 3.327211856842041, + "learning_rate": 4.999651251820643e-05, + "loss": 7.2901, + "step": 895 + }, + { + "epoch": 0.005328765819773528, + "grad_norm": 2.2383713722229004, + "learning_rate": 4.999650471205213e-05, + "loss": 7.5116, + "step": 896 + }, + { + "epoch": 0.005334713103054525, + "grad_norm": 2.972820997238159, + "learning_rate": 4.99964968971718e-05, + "loss": 7.4013, + "step": 897 + }, + { + "epoch": 0.005340660386335522, + "grad_norm": 2.7254672050476074, + "learning_rate": 4.999648907356545e-05, + "loss": 7.3174, + "step": 898 + }, + { + "epoch": 0.005346607669616519, + "grad_norm": 2.6943607330322266, + "learning_rate": 4.9996481241233096e-05, + "loss": 7.386, + "step": 899 + }, + { + "epoch": 0.005352554952897517, + "grad_norm": 2.9217519760131836, + "learning_rate": 4.999647340017473e-05, + "loss": 7.5398, + "step": 900 + }, + { + "epoch": 0.005358502236178514, + "grad_norm": 2.7950780391693115, + "learning_rate": 4.999646555039034e-05, + "loss": 7.6336, + "step": 901 + }, + { + "epoch": 0.005364449519459511, + "grad_norm": 2.763364553451538, + "learning_rate": 4.999645769187995e-05, + "loss": 7.5161, + "step": 902 + }, + { + "epoch": 0.005370396802740508, + "grad_norm": 2.3095102310180664, + "learning_rate": 4.999644982464355e-05, + "loss": 7.5859, + "step": 903 + }, + { + "epoch": 0.005376344086021506, + "grad_norm": 2.7287917137145996, + "learning_rate": 4.999644194868115e-05, + "loss": 7.3983, + "step": 904 + }, + { + "epoch": 0.005382291369302503, + "grad_norm": 2.6175942420959473, + "learning_rate": 4.999643406399275e-05, + "loss": 7.4278, + "step": 905 + }, + { + "epoch": 0.0053882386525834994, + "grad_norm": 2.3898375034332275, + "learning_rate": 4.999642617057835e-05, + "loss": 7.4537, + "step": 906 + }, + { + "epoch": 0.005394185935864497, + "grad_norm": 2.964381694793701, + "learning_rate": 4.999641826843796e-05, + "loss": 7.3258, + "step": 907 + }, + { + "epoch": 0.005400133219145494, + "grad_norm": 3.1146717071533203, + "learning_rate": 4.999641035757158e-05, + "loss": 7.5412, + "step": 908 + }, + { + "epoch": 0.005406080502426492, + "grad_norm": 3.4733238220214844, + "learning_rate": 4.999640243797921e-05, + "loss": 7.423, + "step": 909 + }, + { + "epoch": 0.0054120277857074885, + "grad_norm": 3.621044158935547, + "learning_rate": 4.999639450966085e-05, + "loss": 7.5885, + "step": 910 + }, + { + "epoch": 0.005417975068988486, + "grad_norm": 2.4800662994384766, + "learning_rate": 4.999638657261651e-05, + "loss": 7.5231, + "step": 911 + }, + { + "epoch": 0.005423922352269483, + "grad_norm": 3.3247363567352295, + "learning_rate": 4.999637862684619e-05, + "loss": 7.2367, + "step": 912 + }, + { + "epoch": 0.005429869635550481, + "grad_norm": 4.293686866760254, + "learning_rate": 4.999637067234989e-05, + "loss": 6.8423, + "step": 913 + }, + { + "epoch": 0.0054358169188314775, + "grad_norm": 2.6713979244232178, + "learning_rate": 4.999636270912762e-05, + "loss": 6.7962, + "step": 914 + }, + { + "epoch": 0.005441764202112475, + "grad_norm": 2.9386653900146484, + "learning_rate": 4.9996354737179376e-05, + "loss": 6.7582, + "step": 915 + }, + { + "epoch": 0.005447711485393472, + "grad_norm": 2.8030481338500977, + "learning_rate": 4.999634675650516e-05, + "loss": 6.6516, + "step": 916 + }, + { + "epoch": 0.00545365876867447, + "grad_norm": 2.7315666675567627, + "learning_rate": 4.9996338767104985e-05, + "loss": 6.6159, + "step": 917 + }, + { + "epoch": 0.0054596060519554665, + "grad_norm": 3.116098403930664, + "learning_rate": 4.999633076897884e-05, + "loss": 7.2121, + "step": 918 + }, + { + "epoch": 0.005465553335236464, + "grad_norm": 2.867687940597534, + "learning_rate": 4.999632276212673e-05, + "loss": 7.5124, + "step": 919 + }, + { + "epoch": 0.005471500618517461, + "grad_norm": 2.9864203929901123, + "learning_rate": 4.9996314746548676e-05, + "loss": 7.5168, + "step": 920 + }, + { + "epoch": 0.005477447901798459, + "grad_norm": 2.9083375930786133, + "learning_rate": 4.9996306722244656e-05, + "loss": 7.5027, + "step": 921 + }, + { + "epoch": 0.0054833951850794555, + "grad_norm": 2.5569801330566406, + "learning_rate": 4.9996298689214686e-05, + "loss": 7.2988, + "step": 922 + }, + { + "epoch": 0.005489342468360453, + "grad_norm": 3.7101242542266846, + "learning_rate": 4.9996290647458765e-05, + "loss": 7.33, + "step": 923 + }, + { + "epoch": 0.00549528975164145, + "grad_norm": 2.848881244659424, + "learning_rate": 4.99962825969769e-05, + "loss": 7.4534, + "step": 924 + }, + { + "epoch": 0.005501237034922448, + "grad_norm": 3.072282075881958, + "learning_rate": 4.999627453776909e-05, + "loss": 7.4398, + "step": 925 + }, + { + "epoch": 0.0055071843182034445, + "grad_norm": 2.8132996559143066, + "learning_rate": 4.999626646983534e-05, + "loss": 7.5617, + "step": 926 + }, + { + "epoch": 0.005513131601484442, + "grad_norm": 2.2710142135620117, + "learning_rate": 4.999625839317565e-05, + "loss": 7.5975, + "step": 927 + }, + { + "epoch": 0.005519078884765439, + "grad_norm": 2.745007276535034, + "learning_rate": 4.9996250307790026e-05, + "loss": 7.4599, + "step": 928 + }, + { + "epoch": 0.005525026168046437, + "grad_norm": 3.2031302452087402, + "learning_rate": 4.999624221367847e-05, + "loss": 7.3528, + "step": 929 + }, + { + "epoch": 0.0055309734513274336, + "grad_norm": 6.417830467224121, + "learning_rate": 4.999623411084098e-05, + "loss": 7.5118, + "step": 930 + }, + { + "epoch": 0.005536920734608431, + "grad_norm": 2.7960314750671387, + "learning_rate": 4.999622599927756e-05, + "loss": 6.5016, + "step": 931 + }, + { + "epoch": 0.005542868017889428, + "grad_norm": 2.959507703781128, + "learning_rate": 4.999621787898822e-05, + "loss": 7.6521, + "step": 932 + }, + { + "epoch": 0.005548815301170426, + "grad_norm": 3.328834056854248, + "learning_rate": 4.999620974997296e-05, + "loss": 7.6267, + "step": 933 + }, + { + "epoch": 0.005554762584451423, + "grad_norm": 2.5232200622558594, + "learning_rate": 4.9996201612231786e-05, + "loss": 7.471, + "step": 934 + }, + { + "epoch": 0.00556070986773242, + "grad_norm": 2.2766942977905273, + "learning_rate": 4.999619346576468e-05, + "loss": 7.4204, + "step": 935 + }, + { + "epoch": 0.005566657151013417, + "grad_norm": 2.584068536758423, + "learning_rate": 4.999618531057168e-05, + "loss": 7.4384, + "step": 936 + }, + { + "epoch": 0.005572604434294414, + "grad_norm": 3.004523277282715, + "learning_rate": 4.999617714665276e-05, + "loss": 7.5681, + "step": 937 + }, + { + "epoch": 0.005578551717575412, + "grad_norm": 4.102936267852783, + "learning_rate": 4.999616897400794e-05, + "loss": 7.4571, + "step": 938 + }, + { + "epoch": 0.005584499000856408, + "grad_norm": 2.745293378829956, + "learning_rate": 4.99961607926372e-05, + "loss": 7.588, + "step": 939 + }, + { + "epoch": 0.005590446284137406, + "grad_norm": 2.9720282554626465, + "learning_rate": 4.9996152602540576e-05, + "loss": 7.4761, + "step": 940 + }, + { + "epoch": 0.005596393567418403, + "grad_norm": 3.150047540664673, + "learning_rate": 4.999614440371805e-05, + "loss": 7.4525, + "step": 941 + }, + { + "epoch": 0.005602340850699401, + "grad_norm": 2.6735856533050537, + "learning_rate": 4.999613619616962e-05, + "loss": 7.2754, + "step": 942 + }, + { + "epoch": 0.005608288133980397, + "grad_norm": 2.6451661586761475, + "learning_rate": 4.9996127979895304e-05, + "loss": 7.5742, + "step": 943 + }, + { + "epoch": 0.005614235417261395, + "grad_norm": 2.7551536560058594, + "learning_rate": 4.9996119754895095e-05, + "loss": 7.4981, + "step": 944 + }, + { + "epoch": 0.005620182700542392, + "grad_norm": 2.7445640563964844, + "learning_rate": 4.9996111521168995e-05, + "loss": 7.4761, + "step": 945 + }, + { + "epoch": 0.00562612998382339, + "grad_norm": 2.537924289703369, + "learning_rate": 4.9996103278717013e-05, + "loss": 7.5483, + "step": 946 + }, + { + "epoch": 0.0056320772671043864, + "grad_norm": 3.503661632537842, + "learning_rate": 4.9996095027539156e-05, + "loss": 7.3074, + "step": 947 + }, + { + "epoch": 0.005638024550385384, + "grad_norm": 2.8088479042053223, + "learning_rate": 4.999608676763542e-05, + "loss": 7.5675, + "step": 948 + }, + { + "epoch": 0.005643971833666381, + "grad_norm": 2.6219863891601562, + "learning_rate": 4.99960784990058e-05, + "loss": 7.6037, + "step": 949 + }, + { + "epoch": 0.005649919116947379, + "grad_norm": 2.88737416267395, + "learning_rate": 4.999607022165031e-05, + "loss": 7.4815, + "step": 950 + }, + { + "epoch": 0.0056558664002283755, + "grad_norm": 2.455707550048828, + "learning_rate": 4.999606193556895e-05, + "loss": 7.553, + "step": 951 + }, + { + "epoch": 0.005661813683509373, + "grad_norm": 2.2502405643463135, + "learning_rate": 4.999605364076173e-05, + "loss": 7.387, + "step": 952 + }, + { + "epoch": 0.00566776096679037, + "grad_norm": 2.754972457885742, + "learning_rate": 4.9996045337228635e-05, + "loss": 7.3088, + "step": 953 + }, + { + "epoch": 0.005673708250071368, + "grad_norm": 3.111553192138672, + "learning_rate": 4.9996037024969686e-05, + "loss": 7.5063, + "step": 954 + }, + { + "epoch": 0.0056796555333523645, + "grad_norm": 2.4000720977783203, + "learning_rate": 4.9996028703984875e-05, + "loss": 7.5705, + "step": 955 + }, + { + "epoch": 0.005685602816633362, + "grad_norm": 2.495659351348877, + "learning_rate": 4.9996020374274215e-05, + "loss": 7.5421, + "step": 956 + }, + { + "epoch": 0.005691550099914359, + "grad_norm": 3.025509834289551, + "learning_rate": 4.99960120358377e-05, + "loss": 7.5406, + "step": 957 + }, + { + "epoch": 0.005697497383195357, + "grad_norm": 2.224342107772827, + "learning_rate": 4.999600368867533e-05, + "loss": 7.4323, + "step": 958 + }, + { + "epoch": 0.0057034446664763535, + "grad_norm": 2.661423683166504, + "learning_rate": 4.999599533278712e-05, + "loss": 7.565, + "step": 959 + }, + { + "epoch": 0.005709391949757351, + "grad_norm": 2.503293037414551, + "learning_rate": 4.999598696817307e-05, + "loss": 7.3552, + "step": 960 + }, + { + "epoch": 0.005715339233038348, + "grad_norm": 2.2878923416137695, + "learning_rate": 4.999597859483316e-05, + "loss": 7.4542, + "step": 961 + }, + { + "epoch": 0.005721286516319346, + "grad_norm": 2.759594678878784, + "learning_rate": 4.999597021276743e-05, + "loss": 7.2349, + "step": 962 + }, + { + "epoch": 0.0057272337996003425, + "grad_norm": 4.5453314781188965, + "learning_rate": 4.999596182197586e-05, + "loss": 7.4728, + "step": 963 + }, + { + "epoch": 0.00573318108288134, + "grad_norm": 2.4369568824768066, + "learning_rate": 4.999595342245846e-05, + "loss": 7.4396, + "step": 964 + }, + { + "epoch": 0.005739128366162337, + "grad_norm": 2.4081692695617676, + "learning_rate": 4.999594501421523e-05, + "loss": 7.536, + "step": 965 + }, + { + "epoch": 0.005745075649443335, + "grad_norm": 3.0494678020477295, + "learning_rate": 4.9995936597246176e-05, + "loss": 7.4061, + "step": 966 + }, + { + "epoch": 0.0057510229327243315, + "grad_norm": 3.3492188453674316, + "learning_rate": 4.999592817155129e-05, + "loss": 7.5419, + "step": 967 + }, + { + "epoch": 0.005756970216005328, + "grad_norm": 2.254714012145996, + "learning_rate": 4.999591973713059e-05, + "loss": 7.4568, + "step": 968 + }, + { + "epoch": 0.005762917499286326, + "grad_norm": 2.3336634635925293, + "learning_rate": 4.999591129398407e-05, + "loss": 7.4386, + "step": 969 + }, + { + "epoch": 0.005768864782567323, + "grad_norm": 2.545154094696045, + "learning_rate": 4.999590284211174e-05, + "loss": 7.226, + "step": 970 + }, + { + "epoch": 0.0057748120658483205, + "grad_norm": 2.891068458557129, + "learning_rate": 4.99958943815136e-05, + "loss": 7.4235, + "step": 971 + }, + { + "epoch": 0.005780759349129317, + "grad_norm": 3.0321712493896484, + "learning_rate": 4.999588591218964e-05, + "loss": 7.2918, + "step": 972 + }, + { + "epoch": 0.005786706632410315, + "grad_norm": 2.935490846633911, + "learning_rate": 4.9995877434139884e-05, + "loss": 7.4172, + "step": 973 + }, + { + "epoch": 0.005792653915691312, + "grad_norm": 3.0021424293518066, + "learning_rate": 4.9995868947364324e-05, + "loss": 7.521, + "step": 974 + }, + { + "epoch": 0.0057986011989723096, + "grad_norm": 2.2784783840179443, + "learning_rate": 4.9995860451862964e-05, + "loss": 7.5716, + "step": 975 + }, + { + "epoch": 0.005804548482253306, + "grad_norm": 2.9321484565734863, + "learning_rate": 4.999585194763581e-05, + "loss": 7.0965, + "step": 976 + }, + { + "epoch": 0.005810495765534304, + "grad_norm": 2.284874439239502, + "learning_rate": 4.999584343468285e-05, + "loss": 7.4376, + "step": 977 + }, + { + "epoch": 0.005816443048815301, + "grad_norm": 2.2066683769226074, + "learning_rate": 4.9995834913004115e-05, + "loss": 7.4478, + "step": 978 + }, + { + "epoch": 0.005822390332096299, + "grad_norm": 2.286323070526123, + "learning_rate": 4.999582638259959e-05, + "loss": 7.4139, + "step": 979 + }, + { + "epoch": 0.005828337615377295, + "grad_norm": 2.5052928924560547, + "learning_rate": 4.999581784346927e-05, + "loss": 7.4278, + "step": 980 + }, + { + "epoch": 0.005834284898658293, + "grad_norm": 2.273698091506958, + "learning_rate": 4.9995809295613175e-05, + "loss": 7.4019, + "step": 981 + }, + { + "epoch": 0.00584023218193929, + "grad_norm": 2.729466676712036, + "learning_rate": 4.999580073903129e-05, + "loss": 7.4716, + "step": 982 + }, + { + "epoch": 0.005846179465220288, + "grad_norm": 2.5776185989379883, + "learning_rate": 4.999579217372365e-05, + "loss": 7.4708, + "step": 983 + }, + { + "epoch": 0.005852126748501284, + "grad_norm": 2.4125893115997314, + "learning_rate": 4.9995783599690226e-05, + "loss": 7.4505, + "step": 984 + }, + { + "epoch": 0.005858074031782282, + "grad_norm": 2.975911855697632, + "learning_rate": 4.9995775016931035e-05, + "loss": 7.4095, + "step": 985 + }, + { + "epoch": 0.005864021315063279, + "grad_norm": 2.4155962467193604, + "learning_rate": 4.9995766425446076e-05, + "loss": 7.3084, + "step": 986 + }, + { + "epoch": 0.005869968598344277, + "grad_norm": 2.436950922012329, + "learning_rate": 4.999575782523535e-05, + "loss": 7.2782, + "step": 987 + }, + { + "epoch": 0.0058759158816252734, + "grad_norm": 2.2371575832366943, + "learning_rate": 4.999574921629887e-05, + "loss": 7.3879, + "step": 988 + }, + { + "epoch": 0.005881863164906271, + "grad_norm": 2.3079733848571777, + "learning_rate": 4.999574059863663e-05, + "loss": 7.5117, + "step": 989 + }, + { + "epoch": 0.005887810448187268, + "grad_norm": 2.4018514156341553, + "learning_rate": 4.9995731972248626e-05, + "loss": 7.4486, + "step": 990 + }, + { + "epoch": 0.005893757731468266, + "grad_norm": 2.3437294960021973, + "learning_rate": 4.9995723337134884e-05, + "loss": 7.461, + "step": 991 + }, + { + "epoch": 0.0058997050147492625, + "grad_norm": 3.15254545211792, + "learning_rate": 4.999571469329538e-05, + "loss": 7.014, + "step": 992 + }, + { + "epoch": 0.00590565229803026, + "grad_norm": 2.4809768199920654, + "learning_rate": 4.999570604073014e-05, + "loss": 7.4339, + "step": 993 + }, + { + "epoch": 0.005911599581311257, + "grad_norm": 3.4286630153656006, + "learning_rate": 4.9995697379439154e-05, + "loss": 7.3086, + "step": 994 + }, + { + "epoch": 0.005917546864592255, + "grad_norm": 3.9362127780914307, + "learning_rate": 4.999568870942243e-05, + "loss": 7.2635, + "step": 995 + }, + { + "epoch": 0.0059234941478732515, + "grad_norm": 2.6632091999053955, + "learning_rate": 4.9995680030679965e-05, + "loss": 7.2779, + "step": 996 + }, + { + "epoch": 0.005929441431154249, + "grad_norm": 5.218096733093262, + "learning_rate": 4.999567134321177e-05, + "loss": 7.4285, + "step": 997 + }, + { + "epoch": 0.005935388714435246, + "grad_norm": 3.441894769668579, + "learning_rate": 4.9995662647017835e-05, + "loss": 7.5576, + "step": 998 + }, + { + "epoch": 0.005941335997716243, + "grad_norm": 2.560178279876709, + "learning_rate": 4.9995653942098184e-05, + "loss": 7.5692, + "step": 999 + }, + { + "epoch": 0.0059472832809972405, + "grad_norm": 2.458313226699829, + "learning_rate": 4.999564522845281e-05, + "loss": 7.0495, + "step": 1000 + }, + { + "epoch": 0.005953230564278237, + "grad_norm": 2.539314031600952, + "learning_rate": 4.999563650608171e-05, + "loss": 7.1919, + "step": 1001 + }, + { + "epoch": 0.005959177847559235, + "grad_norm": 3.6134390830993652, + "learning_rate": 4.999562777498489e-05, + "loss": 7.0725, + "step": 1002 + }, + { + "epoch": 0.005965125130840232, + "grad_norm": 2.6582295894622803, + "learning_rate": 4.9995619035162355e-05, + "loss": 7.3008, + "step": 1003 + }, + { + "epoch": 0.0059710724141212295, + "grad_norm": 2.4968035221099854, + "learning_rate": 4.999561028661411e-05, + "loss": 7.2862, + "step": 1004 + }, + { + "epoch": 0.005977019697402226, + "grad_norm": 3.002840042114258, + "learning_rate": 4.999560152934015e-05, + "loss": 7.1721, + "step": 1005 + }, + { + "epoch": 0.005982966980683224, + "grad_norm": 3.4327914714813232, + "learning_rate": 4.999559276334049e-05, + "loss": 7.242, + "step": 1006 + }, + { + "epoch": 0.005988914263964221, + "grad_norm": 2.4082493782043457, + "learning_rate": 4.999558398861513e-05, + "loss": 7.1588, + "step": 1007 + }, + { + "epoch": 0.0059948615472452185, + "grad_norm": 2.39475417137146, + "learning_rate": 4.9995575205164056e-05, + "loss": 7.1713, + "step": 1008 + }, + { + "epoch": 0.006000808830526215, + "grad_norm": 2.946331024169922, + "learning_rate": 4.99955664129873e-05, + "loss": 7.1553, + "step": 1009 + }, + { + "epoch": 0.006006756113807213, + "grad_norm": 2.4334871768951416, + "learning_rate": 4.999555761208484e-05, + "loss": 7.1898, + "step": 1010 + }, + { + "epoch": 0.00601270339708821, + "grad_norm": 2.3159971237182617, + "learning_rate": 4.999554880245669e-05, + "loss": 7.0642, + "step": 1011 + }, + { + "epoch": 0.0060186506803692075, + "grad_norm": 2.9773905277252197, + "learning_rate": 4.9995539984102854e-05, + "loss": 7.3285, + "step": 1012 + }, + { + "epoch": 0.006024597963650204, + "grad_norm": 3.444267749786377, + "learning_rate": 4.999553115702334e-05, + "loss": 7.1263, + "step": 1013 + }, + { + "epoch": 0.006030545246931202, + "grad_norm": 2.6518173217773438, + "learning_rate": 4.9995522321218136e-05, + "loss": 7.3915, + "step": 1014 + }, + { + "epoch": 0.006036492530212199, + "grad_norm": 2.46230149269104, + "learning_rate": 4.9995513476687254e-05, + "loss": 7.1808, + "step": 1015 + }, + { + "epoch": 0.0060424398134931966, + "grad_norm": 2.2243192195892334, + "learning_rate": 4.99955046234307e-05, + "loss": 7.4262, + "step": 1016 + }, + { + "epoch": 0.006048387096774193, + "grad_norm": 3.0834670066833496, + "learning_rate": 4.999549576144847e-05, + "loss": 7.4028, + "step": 1017 + }, + { + "epoch": 0.006054334380055191, + "grad_norm": 3.2453930377960205, + "learning_rate": 4.9995486890740573e-05, + "loss": 7.5537, + "step": 1018 + }, + { + "epoch": 0.006060281663336188, + "grad_norm": 2.7142229080200195, + "learning_rate": 4.9995478011307015e-05, + "loss": 7.4131, + "step": 1019 + }, + { + "epoch": 0.006066228946617186, + "grad_norm": 2.9567463397979736, + "learning_rate": 4.9995469123147784e-05, + "loss": 7.5969, + "step": 1020 + }, + { + "epoch": 0.006072176229898182, + "grad_norm": 2.5698695182800293, + "learning_rate": 4.99954602262629e-05, + "loss": 7.2721, + "step": 1021 + }, + { + "epoch": 0.00607812351317918, + "grad_norm": 2.3958864212036133, + "learning_rate": 4.999545132065235e-05, + "loss": 7.3414, + "step": 1022 + }, + { + "epoch": 0.006084070796460177, + "grad_norm": 2.528024911880493, + "learning_rate": 4.9995442406316156e-05, + "loss": 7.2821, + "step": 1023 + }, + { + "epoch": 0.006090018079741175, + "grad_norm": 2.6904075145721436, + "learning_rate": 4.999543348325431e-05, + "loss": 7.3726, + "step": 1024 + }, + { + "epoch": 0.006095965363022171, + "grad_norm": 2.8618202209472656, + "learning_rate": 4.999542455146681e-05, + "loss": 7.4232, + "step": 1025 + }, + { + "epoch": 0.006101912646303169, + "grad_norm": 1.978455662727356, + "learning_rate": 4.999541561095367e-05, + "loss": 7.5949, + "step": 1026 + }, + { + "epoch": 0.006107859929584166, + "grad_norm": 2.882568836212158, + "learning_rate": 4.999540666171489e-05, + "loss": 7.4868, + "step": 1027 + }, + { + "epoch": 0.006113807212865164, + "grad_norm": 2.9586474895477295, + "learning_rate": 4.999539770375047e-05, + "loss": 7.1556, + "step": 1028 + }, + { + "epoch": 0.00611975449614616, + "grad_norm": 2.5675363540649414, + "learning_rate": 4.999538873706041e-05, + "loss": 7.3306, + "step": 1029 + }, + { + "epoch": 0.006125701779427157, + "grad_norm": 3.440857410430908, + "learning_rate": 4.999537976164472e-05, + "loss": 7.3654, + "step": 1030 + }, + { + "epoch": 0.006131649062708155, + "grad_norm": 3.7741217613220215, + "learning_rate": 4.999537077750341e-05, + "loss": 6.8088, + "step": 1031 + }, + { + "epoch": 0.006137596345989152, + "grad_norm": 3.801609754562378, + "learning_rate": 4.999536178463647e-05, + "loss": 6.989, + "step": 1032 + }, + { + "epoch": 0.0061435436292701495, + "grad_norm": 2.627225875854492, + "learning_rate": 4.9995352783043905e-05, + "loss": 7.4066, + "step": 1033 + }, + { + "epoch": 0.006149490912551146, + "grad_norm": 3.3529040813446045, + "learning_rate": 4.9995343772725725e-05, + "loss": 7.0403, + "step": 1034 + }, + { + "epoch": 0.006155438195832144, + "grad_norm": 3.248558521270752, + "learning_rate": 4.999533475368192e-05, + "loss": 7.2664, + "step": 1035 + }, + { + "epoch": 0.006161385479113141, + "grad_norm": 3.1260814666748047, + "learning_rate": 4.9995325725912515e-05, + "loss": 7.3257, + "step": 1036 + }, + { + "epoch": 0.0061673327623941385, + "grad_norm": 2.379659414291382, + "learning_rate": 4.999531668941748e-05, + "loss": 7.4448, + "step": 1037 + }, + { + "epoch": 0.006173280045675135, + "grad_norm": 2.8478498458862305, + "learning_rate": 4.999530764419685e-05, + "loss": 7.3892, + "step": 1038 + }, + { + "epoch": 0.006179227328956133, + "grad_norm": 4.104954719543457, + "learning_rate": 4.999529859025062e-05, + "loss": 7.5172, + "step": 1039 + }, + { + "epoch": 0.00618517461223713, + "grad_norm": 2.50160813331604, + "learning_rate": 4.999528952757879e-05, + "loss": 7.1894, + "step": 1040 + }, + { + "epoch": 0.0061911218955181275, + "grad_norm": 2.5545871257781982, + "learning_rate": 4.999528045618136e-05, + "loss": 7.3892, + "step": 1041 + }, + { + "epoch": 0.006197069178799124, + "grad_norm": 2.9980626106262207, + "learning_rate": 4.999527137605833e-05, + "loss": 7.3517, + "step": 1042 + }, + { + "epoch": 0.006203016462080122, + "grad_norm": 2.5920562744140625, + "learning_rate": 4.999526228720971e-05, + "loss": 7.1716, + "step": 1043 + }, + { + "epoch": 0.006208963745361119, + "grad_norm": 2.5224244594573975, + "learning_rate": 4.999525318963551e-05, + "loss": 7.1892, + "step": 1044 + }, + { + "epoch": 0.0062149110286421165, + "grad_norm": 2.7092106342315674, + "learning_rate": 4.999524408333572e-05, + "loss": 7.178, + "step": 1045 + }, + { + "epoch": 0.006220858311923113, + "grad_norm": 2.523320198059082, + "learning_rate": 4.999523496831035e-05, + "loss": 7.1486, + "step": 1046 + }, + { + "epoch": 0.006226805595204111, + "grad_norm": 2.4491217136383057, + "learning_rate": 4.99952258445594e-05, + "loss": 7.121, + "step": 1047 + }, + { + "epoch": 0.006232752878485108, + "grad_norm": 2.29109263420105, + "learning_rate": 4.9995216712082875e-05, + "loss": 7.4323, + "step": 1048 + }, + { + "epoch": 0.0062387001617661055, + "grad_norm": 2.5234057903289795, + "learning_rate": 4.9995207570880783e-05, + "loss": 7.1552, + "step": 1049 + }, + { + "epoch": 0.006244647445047102, + "grad_norm": 2.301316499710083, + "learning_rate": 4.9995198420953115e-05, + "loss": 7.3625, + "step": 1050 + }, + { + "epoch": 0.0062505947283281, + "grad_norm": 2.4358527660369873, + "learning_rate": 4.999518926229989e-05, + "loss": 7.2462, + "step": 1051 + }, + { + "epoch": 0.006256542011609097, + "grad_norm": 2.3915181159973145, + "learning_rate": 4.999518009492109e-05, + "loss": 7.173, + "step": 1052 + }, + { + "epoch": 0.0062624892948900945, + "grad_norm": 2.5529091358184814, + "learning_rate": 4.999517091881674e-05, + "loss": 7.2463, + "step": 1053 + }, + { + "epoch": 0.006268436578171091, + "grad_norm": 3.235435724258423, + "learning_rate": 4.999516173398683e-05, + "loss": 7.1149, + "step": 1054 + }, + { + "epoch": 0.006274383861452089, + "grad_norm": 2.692140817642212, + "learning_rate": 4.9995152540431375e-05, + "loss": 7.3554, + "step": 1055 + }, + { + "epoch": 0.006280331144733086, + "grad_norm": 2.910116195678711, + "learning_rate": 4.999514333815036e-05, + "loss": 7.4424, + "step": 1056 + }, + { + "epoch": 0.0062862784280140836, + "grad_norm": 2.897463798522949, + "learning_rate": 4.9995134127143804e-05, + "loss": 7.2345, + "step": 1057 + }, + { + "epoch": 0.00629222571129508, + "grad_norm": 2.5925514698028564, + "learning_rate": 4.999512490741171e-05, + "loss": 7.1539, + "step": 1058 + }, + { + "epoch": 0.006298172994576078, + "grad_norm": 2.693816900253296, + "learning_rate": 4.999511567895407e-05, + "loss": 7.0905, + "step": 1059 + }, + { + "epoch": 0.006304120277857075, + "grad_norm": 3.3717474937438965, + "learning_rate": 4.9995106441770896e-05, + "loss": 7.1407, + "step": 1060 + }, + { + "epoch": 0.006310067561138072, + "grad_norm": 2.6128973960876465, + "learning_rate": 4.999509719586218e-05, + "loss": 7.2748, + "step": 1061 + }, + { + "epoch": 0.006316014844419069, + "grad_norm": 2.24324369430542, + "learning_rate": 4.999508794122795e-05, + "loss": 7.2553, + "step": 1062 + }, + { + "epoch": 0.006321962127700066, + "grad_norm": 2.7593698501586914, + "learning_rate": 4.999507867786818e-05, + "loss": 7.1039, + "step": 1063 + }, + { + "epoch": 0.006327909410981064, + "grad_norm": 2.6210618019104004, + "learning_rate": 4.999506940578289e-05, + "loss": 7.0247, + "step": 1064 + }, + { + "epoch": 0.006333856694262061, + "grad_norm": 2.410187244415283, + "learning_rate": 4.9995060124972084e-05, + "loss": 7.3931, + "step": 1065 + }, + { + "epoch": 0.006339803977543058, + "grad_norm": 2.795302391052246, + "learning_rate": 4.999505083543575e-05, + "loss": 7.3168, + "step": 1066 + }, + { + "epoch": 0.006345751260824055, + "grad_norm": 2.3720662593841553, + "learning_rate": 4.999504153717391e-05, + "loss": 7.3719, + "step": 1067 + }, + { + "epoch": 0.006351698544105053, + "grad_norm": 2.721585988998413, + "learning_rate": 4.9995032230186556e-05, + "loss": 7.3847, + "step": 1068 + }, + { + "epoch": 0.00635764582738605, + "grad_norm": 2.967153549194336, + "learning_rate": 4.99950229144737e-05, + "loss": 7.3224, + "step": 1069 + }, + { + "epoch": 0.006363593110667047, + "grad_norm": 3.8144783973693848, + "learning_rate": 4.999501359003533e-05, + "loss": 7.0767, + "step": 1070 + }, + { + "epoch": 0.006369540393948044, + "grad_norm": 3.7694199085235596, + "learning_rate": 4.999500425687147e-05, + "loss": 7.4486, + "step": 1071 + }, + { + "epoch": 0.006375487677229042, + "grad_norm": 2.9668312072753906, + "learning_rate": 4.999499491498211e-05, + "loss": 7.3415, + "step": 1072 + }, + { + "epoch": 0.006381434960510039, + "grad_norm": 4.196050643920898, + "learning_rate": 4.999498556436725e-05, + "loss": 7.3784, + "step": 1073 + }, + { + "epoch": 0.0063873822437910364, + "grad_norm": 4.676602363586426, + "learning_rate": 4.99949762050269e-05, + "loss": 7.3773, + "step": 1074 + }, + { + "epoch": 0.006393329527072033, + "grad_norm": 2.8828656673431396, + "learning_rate": 4.999496683696107e-05, + "loss": 7.2359, + "step": 1075 + }, + { + "epoch": 0.006399276810353031, + "grad_norm": 2.7532308101654053, + "learning_rate": 4.9994957460169745e-05, + "loss": 7.356, + "step": 1076 + }, + { + "epoch": 0.006405224093634028, + "grad_norm": 5.535451412200928, + "learning_rate": 4.999494807465293e-05, + "loss": 7.261, + "step": 1077 + }, + { + "epoch": 0.0064111713769150255, + "grad_norm": 3.6439530849456787, + "learning_rate": 4.999493868041066e-05, + "loss": 7.4664, + "step": 1078 + }, + { + "epoch": 0.006417118660196022, + "grad_norm": 3.563948154449463, + "learning_rate": 4.99949292774429e-05, + "loss": 7.0427, + "step": 1079 + }, + { + "epoch": 0.00642306594347702, + "grad_norm": 3.6243784427642822, + "learning_rate": 4.9994919865749675e-05, + "loss": 7.3292, + "step": 1080 + }, + { + "epoch": 0.006429013226758017, + "grad_norm": 5.1197590827941895, + "learning_rate": 4.999491044533098e-05, + "loss": 7.3717, + "step": 1081 + }, + { + "epoch": 0.0064349605100390145, + "grad_norm": 4.3969902992248535, + "learning_rate": 4.999490101618682e-05, + "loss": 7.2875, + "step": 1082 + }, + { + "epoch": 0.006440907793320011, + "grad_norm": 2.6302945613861084, + "learning_rate": 4.999489157831719e-05, + "loss": 7.1958, + "step": 1083 + }, + { + "epoch": 0.006446855076601009, + "grad_norm": 3.782078504562378, + "learning_rate": 4.9994882131722116e-05, + "loss": 7.2951, + "step": 1084 + }, + { + "epoch": 0.006452802359882006, + "grad_norm": 3.432082414627075, + "learning_rate": 4.999487267640158e-05, + "loss": 7.0974, + "step": 1085 + }, + { + "epoch": 0.0064587496431630035, + "grad_norm": 3.364793300628662, + "learning_rate": 4.999486321235559e-05, + "loss": 7.0847, + "step": 1086 + }, + { + "epoch": 0.006464696926444, + "grad_norm": 2.7063019275665283, + "learning_rate": 4.999485373958416e-05, + "loss": 7.1421, + "step": 1087 + }, + { + "epoch": 0.006470644209724998, + "grad_norm": 3.0648648738861084, + "learning_rate": 4.999484425808727e-05, + "loss": 7.2723, + "step": 1088 + }, + { + "epoch": 0.006476591493005995, + "grad_norm": 3.3968300819396973, + "learning_rate": 4.999483476786495e-05, + "loss": 7.1438, + "step": 1089 + }, + { + "epoch": 0.0064825387762869925, + "grad_norm": 2.864647150039673, + "learning_rate": 4.999482526891719e-05, + "loss": 7.1512, + "step": 1090 + }, + { + "epoch": 0.006488486059567989, + "grad_norm": 2.577043056488037, + "learning_rate": 4.999481576124399e-05, + "loss": 6.8914, + "step": 1091 + }, + { + "epoch": 0.006494433342848986, + "grad_norm": 2.83754563331604, + "learning_rate": 4.999480624484536e-05, + "loss": 6.9999, + "step": 1092 + }, + { + "epoch": 0.006500380626129984, + "grad_norm": 3.5623857975006104, + "learning_rate": 4.999479671972131e-05, + "loss": 7.0567, + "step": 1093 + }, + { + "epoch": 0.006506327909410981, + "grad_norm": 2.35555362701416, + "learning_rate": 4.9994787185871814e-05, + "loss": 7.3075, + "step": 1094 + }, + { + "epoch": 0.006512275192691978, + "grad_norm": 3.8677117824554443, + "learning_rate": 4.9994777643296914e-05, + "loss": 7.3608, + "step": 1095 + }, + { + "epoch": 0.006518222475972975, + "grad_norm": 3.8163843154907227, + "learning_rate": 4.999476809199659e-05, + "loss": 7.4368, + "step": 1096 + }, + { + "epoch": 0.006524169759253973, + "grad_norm": 2.5424652099609375, + "learning_rate": 4.999475853197085e-05, + "loss": 7.4968, + "step": 1097 + }, + { + "epoch": 0.00653011704253497, + "grad_norm": 2.876898765563965, + "learning_rate": 4.99947489632197e-05, + "loss": 6.9948, + "step": 1098 + }, + { + "epoch": 0.006536064325815967, + "grad_norm": 3.3934860229492188, + "learning_rate": 4.999473938574314e-05, + "loss": 6.9588, + "step": 1099 + }, + { + "epoch": 0.006542011609096964, + "grad_norm": 2.1184024810791016, + "learning_rate": 4.9994729799541176e-05, + "loss": 7.1933, + "step": 1100 + }, + { + "epoch": 0.006547958892377962, + "grad_norm": 2.2882895469665527, + "learning_rate": 4.999472020461381e-05, + "loss": 7.0796, + "step": 1101 + }, + { + "epoch": 0.006553906175658959, + "grad_norm": 3.239429235458374, + "learning_rate": 4.9994710600961045e-05, + "loss": 6.9535, + "step": 1102 + }, + { + "epoch": 0.006559853458939956, + "grad_norm": 2.4653263092041016, + "learning_rate": 4.9994700988582884e-05, + "loss": 6.9316, + "step": 1103 + }, + { + "epoch": 0.006565800742220953, + "grad_norm": 2.511516571044922, + "learning_rate": 4.999469136747933e-05, + "loss": 6.9844, + "step": 1104 + }, + { + "epoch": 0.006571748025501951, + "grad_norm": 2.9725844860076904, + "learning_rate": 4.9994681737650384e-05, + "loss": 7.1955, + "step": 1105 + }, + { + "epoch": 0.006577695308782948, + "grad_norm": 3.04697585105896, + "learning_rate": 4.9994672099096066e-05, + "loss": 7.1044, + "step": 1106 + }, + { + "epoch": 0.006583642592063945, + "grad_norm": 3.395076274871826, + "learning_rate": 4.999466245181635e-05, + "loss": 7.1968, + "step": 1107 + }, + { + "epoch": 0.006589589875344942, + "grad_norm": 2.362884044647217, + "learning_rate": 4.999465279581127e-05, + "loss": 7.3114, + "step": 1108 + }, + { + "epoch": 0.00659553715862594, + "grad_norm": 2.730980396270752, + "learning_rate": 4.99946431310808e-05, + "loss": 7.1978, + "step": 1109 + }, + { + "epoch": 0.006601484441906937, + "grad_norm": 3.288687229156494, + "learning_rate": 4.9994633457624974e-05, + "loss": 7.4397, + "step": 1110 + }, + { + "epoch": 0.006607431725187934, + "grad_norm": 3.3060662746429443, + "learning_rate": 4.999462377544377e-05, + "loss": 7.1638, + "step": 1111 + }, + { + "epoch": 0.006613379008468931, + "grad_norm": 2.2697036266326904, + "learning_rate": 4.9994614084537204e-05, + "loss": 7.2654, + "step": 1112 + }, + { + "epoch": 0.006619326291749929, + "grad_norm": 2.330495595932007, + "learning_rate": 4.999460438490528e-05, + "loss": 7.2132, + "step": 1113 + }, + { + "epoch": 0.006625273575030926, + "grad_norm": 2.8239340782165527, + "learning_rate": 4.999459467654799e-05, + "loss": 7.3477, + "step": 1114 + }, + { + "epoch": 0.0066312208583119234, + "grad_norm": 2.591614246368408, + "learning_rate": 4.999458495946535e-05, + "loss": 7.0377, + "step": 1115 + }, + { + "epoch": 0.00663716814159292, + "grad_norm": 4.554818630218506, + "learning_rate": 4.999457523365736e-05, + "loss": 7.1266, + "step": 1116 + }, + { + "epoch": 0.006643115424873918, + "grad_norm": 2.21018123626709, + "learning_rate": 4.999456549912401e-05, + "loss": 7.1433, + "step": 1117 + }, + { + "epoch": 0.006649062708154915, + "grad_norm": 2.0298593044281006, + "learning_rate": 4.999455575586533e-05, + "loss": 7.257, + "step": 1118 + }, + { + "epoch": 0.0066550099914359125, + "grad_norm": 2.4532642364501953, + "learning_rate": 4.9994546003881305e-05, + "loss": 7.0618, + "step": 1119 + }, + { + "epoch": 0.006660957274716909, + "grad_norm": 2.428380012512207, + "learning_rate": 4.999453624317194e-05, + "loss": 7.2039, + "step": 1120 + }, + { + "epoch": 0.006666904557997907, + "grad_norm": 2.5572609901428223, + "learning_rate": 4.999452647373724e-05, + "loss": 7.0991, + "step": 1121 + }, + { + "epoch": 0.006672851841278904, + "grad_norm": 2.379640817642212, + "learning_rate": 4.999451669557721e-05, + "loss": 7.1424, + "step": 1122 + }, + { + "epoch": 0.006678799124559901, + "grad_norm": 2.5764007568359375, + "learning_rate": 4.999450690869185e-05, + "loss": 7.1218, + "step": 1123 + }, + { + "epoch": 0.006684746407840898, + "grad_norm": 2.6560606956481934, + "learning_rate": 4.999449711308117e-05, + "loss": 7.2994, + "step": 1124 + }, + { + "epoch": 0.006690693691121895, + "grad_norm": 2.4687581062316895, + "learning_rate": 4.999448730874518e-05, + "loss": 7.4169, + "step": 1125 + }, + { + "epoch": 0.006696640974402893, + "grad_norm": 2.8232173919677734, + "learning_rate": 4.999447749568386e-05, + "loss": 7.291, + "step": 1126 + }, + { + "epoch": 0.00670258825768389, + "grad_norm": 2.6960325241088867, + "learning_rate": 4.9994467673897224e-05, + "loss": 7.3162, + "step": 1127 + }, + { + "epoch": 0.006708535540964887, + "grad_norm": 2.222391366958618, + "learning_rate": 4.999445784338528e-05, + "loss": 7.221, + "step": 1128 + }, + { + "epoch": 0.006714482824245884, + "grad_norm": 2.334995985031128, + "learning_rate": 4.9994448004148024e-05, + "loss": 7.4813, + "step": 1129 + }, + { + "epoch": 0.006720430107526882, + "grad_norm": 2.653491497039795, + "learning_rate": 4.999443815618548e-05, + "loss": 7.3515, + "step": 1130 + }, + { + "epoch": 0.006726377390807879, + "grad_norm": 2.6943631172180176, + "learning_rate": 4.999442829949762e-05, + "loss": 7.2674, + "step": 1131 + }, + { + "epoch": 0.006732324674088876, + "grad_norm": 2.395573377609253, + "learning_rate": 4.999441843408447e-05, + "loss": 7.483, + "step": 1132 + }, + { + "epoch": 0.006738271957369873, + "grad_norm": 2.3801541328430176, + "learning_rate": 4.999440855994603e-05, + "loss": 7.3355, + "step": 1133 + }, + { + "epoch": 0.006744219240650871, + "grad_norm": 2.8566555976867676, + "learning_rate": 4.999439867708229e-05, + "loss": 6.8323, + "step": 1134 + }, + { + "epoch": 0.006750166523931868, + "grad_norm": 2.5987985134124756, + "learning_rate": 4.999438878549327e-05, + "loss": 6.957, + "step": 1135 + }, + { + "epoch": 0.006756113807212865, + "grad_norm": 2.4411563873291016, + "learning_rate": 4.9994378885178964e-05, + "loss": 6.9935, + "step": 1136 + }, + { + "epoch": 0.006762061090493862, + "grad_norm": 2.4227802753448486, + "learning_rate": 4.9994368976139386e-05, + "loss": 7.2856, + "step": 1137 + }, + { + "epoch": 0.00676800837377486, + "grad_norm": 2.55317759513855, + "learning_rate": 4.999435905837453e-05, + "loss": 7.1741, + "step": 1138 + }, + { + "epoch": 0.006773955657055857, + "grad_norm": 2.3329968452453613, + "learning_rate": 4.9994349131884396e-05, + "loss": 7.2007, + "step": 1139 + }, + { + "epoch": 0.006779902940336854, + "grad_norm": 2.538499593734741, + "learning_rate": 4.999433919666899e-05, + "loss": 7.1755, + "step": 1140 + }, + { + "epoch": 0.006785850223617851, + "grad_norm": 2.3580374717712402, + "learning_rate": 4.999432925272833e-05, + "loss": 7.2249, + "step": 1141 + }, + { + "epoch": 0.006791797506898849, + "grad_norm": 2.2783255577087402, + "learning_rate": 4.99943193000624e-05, + "loss": 7.3627, + "step": 1142 + }, + { + "epoch": 0.006797744790179846, + "grad_norm": 3.0798208713531494, + "learning_rate": 4.999430933867122e-05, + "loss": 7.2718, + "step": 1143 + }, + { + "epoch": 0.006803692073460843, + "grad_norm": 2.703232526779175, + "learning_rate": 4.9994299368554776e-05, + "loss": 7.116, + "step": 1144 + }, + { + "epoch": 0.00680963935674184, + "grad_norm": 2.480327606201172, + "learning_rate": 4.9994289389713076e-05, + "loss": 6.9743, + "step": 1145 + }, + { + "epoch": 0.006815586640022838, + "grad_norm": 2.2707130908966064, + "learning_rate": 4.9994279402146137e-05, + "loss": 6.9919, + "step": 1146 + }, + { + "epoch": 0.006821533923303835, + "grad_norm": 2.0424580574035645, + "learning_rate": 4.999426940585396e-05, + "loss": 7.0366, + "step": 1147 + }, + { + "epoch": 0.006827481206584832, + "grad_norm": 1.9720054864883423, + "learning_rate": 4.999425940083653e-05, + "loss": 6.8622, + "step": 1148 + }, + { + "epoch": 0.006833428489865829, + "grad_norm": 2.7109742164611816, + "learning_rate": 4.9994249387093864e-05, + "loss": 7.5375, + "step": 1149 + }, + { + "epoch": 0.006839375773146827, + "grad_norm": 2.267328977584839, + "learning_rate": 4.999423936462596e-05, + "loss": 7.5606, + "step": 1150 + }, + { + "epoch": 0.006845323056427824, + "grad_norm": 2.958360433578491, + "learning_rate": 4.999422933343283e-05, + "loss": 7.3503, + "step": 1151 + }, + { + "epoch": 0.006851270339708821, + "grad_norm": 2.2681283950805664, + "learning_rate": 4.9994219293514475e-05, + "loss": 6.9278, + "step": 1152 + }, + { + "epoch": 0.006857217622989818, + "grad_norm": 2.4755337238311768, + "learning_rate": 4.999420924487089e-05, + "loss": 7.1385, + "step": 1153 + }, + { + "epoch": 0.006863164906270815, + "grad_norm": 2.283277988433838, + "learning_rate": 4.999419918750209e-05, + "loss": 6.9287, + "step": 1154 + }, + { + "epoch": 0.006869112189551813, + "grad_norm": 2.3692893981933594, + "learning_rate": 4.999418912140808e-05, + "loss": 7.0648, + "step": 1155 + }, + { + "epoch": 0.00687505947283281, + "grad_norm": 2.2676453590393066, + "learning_rate": 4.999417904658884e-05, + "loss": 6.9754, + "step": 1156 + }, + { + "epoch": 0.006881006756113807, + "grad_norm": 2.4106669425964355, + "learning_rate": 4.9994168963044405e-05, + "loss": 7.033, + "step": 1157 + }, + { + "epoch": 0.006886954039394804, + "grad_norm": 2.947758913040161, + "learning_rate": 4.9994158870774754e-05, + "loss": 7.0821, + "step": 1158 + }, + { + "epoch": 0.006892901322675802, + "grad_norm": 2.5338058471679688, + "learning_rate": 4.9994148769779905e-05, + "loss": 6.9426, + "step": 1159 + }, + { + "epoch": 0.006898848605956799, + "grad_norm": 2.4848148822784424, + "learning_rate": 4.999413866005985e-05, + "loss": 7.2488, + "step": 1160 + }, + { + "epoch": 0.006904795889237796, + "grad_norm": 2.444077730178833, + "learning_rate": 4.999412854161461e-05, + "loss": 6.871, + "step": 1161 + }, + { + "epoch": 0.006910743172518793, + "grad_norm": 2.376962661743164, + "learning_rate": 4.9994118414444174e-05, + "loss": 7.0258, + "step": 1162 + }, + { + "epoch": 0.006916690455799791, + "grad_norm": 3.502023458480835, + "learning_rate": 4.9994108278548545e-05, + "loss": 7.4869, + "step": 1163 + }, + { + "epoch": 0.006922637739080788, + "grad_norm": 3.117741584777832, + "learning_rate": 4.999409813392774e-05, + "loss": 7.4437, + "step": 1164 + }, + { + "epoch": 0.006928585022361785, + "grad_norm": 3.805560827255249, + "learning_rate": 4.999408798058175e-05, + "loss": 7.3796, + "step": 1165 + }, + { + "epoch": 0.006934532305642782, + "grad_norm": 3.67065167427063, + "learning_rate": 4.9994077818510576e-05, + "loss": 7.2304, + "step": 1166 + }, + { + "epoch": 0.00694047958892378, + "grad_norm": 2.5749545097351074, + "learning_rate": 4.9994067647714236e-05, + "loss": 7.0943, + "step": 1167 + }, + { + "epoch": 0.006946426872204777, + "grad_norm": 2.561405897140503, + "learning_rate": 4.9994057468192724e-05, + "loss": 6.9496, + "step": 1168 + }, + { + "epoch": 0.006952374155485774, + "grad_norm": 2.477344512939453, + "learning_rate": 4.999404727994604e-05, + "loss": 7.3494, + "step": 1169 + }, + { + "epoch": 0.006958321438766771, + "grad_norm": 2.897580146789551, + "learning_rate": 4.999403708297419e-05, + "loss": 7.6081, + "step": 1170 + }, + { + "epoch": 0.006964268722047769, + "grad_norm": 3.899249792098999, + "learning_rate": 4.999402687727719e-05, + "loss": 7.4448, + "step": 1171 + }, + { + "epoch": 0.006970216005328766, + "grad_norm": 3.0791561603546143, + "learning_rate": 4.9994016662855025e-05, + "loss": 7.1616, + "step": 1172 + }, + { + "epoch": 0.006976163288609763, + "grad_norm": 2.8212931156158447, + "learning_rate": 4.999400643970771e-05, + "loss": 7.1824, + "step": 1173 + }, + { + "epoch": 0.00698211057189076, + "grad_norm": 4.33271598815918, + "learning_rate": 4.9993996207835246e-05, + "loss": 7.2432, + "step": 1174 + }, + { + "epoch": 0.006988057855171758, + "grad_norm": 2.985125780105591, + "learning_rate": 4.999398596723764e-05, + "loss": 7.6521, + "step": 1175 + }, + { + "epoch": 0.006994005138452755, + "grad_norm": 3.1069905757904053, + "learning_rate": 4.9993975717914885e-05, + "loss": 7.0071, + "step": 1176 + }, + { + "epoch": 0.006999952421733752, + "grad_norm": 2.915214776992798, + "learning_rate": 4.9993965459866995e-05, + "loss": 7.6192, + "step": 1177 + }, + { + "epoch": 0.007005899705014749, + "grad_norm": 5.314033031463623, + "learning_rate": 4.999395519309397e-05, + "loss": 6.9447, + "step": 1178 + }, + { + "epoch": 0.007011846988295747, + "grad_norm": 2.2723114490509033, + "learning_rate": 4.999394491759581e-05, + "loss": 7.1228, + "step": 1179 + }, + { + "epoch": 0.007017794271576744, + "grad_norm": 2.936365842819214, + "learning_rate": 4.999393463337253e-05, + "loss": 7.136, + "step": 1180 + }, + { + "epoch": 0.007023741554857741, + "grad_norm": 2.864250898361206, + "learning_rate": 4.9993924340424115e-05, + "loss": 7.026, + "step": 1181 + }, + { + "epoch": 0.007029688838138738, + "grad_norm": 3.299370050430298, + "learning_rate": 4.9993914038750586e-05, + "loss": 7.1114, + "step": 1182 + }, + { + "epoch": 0.007035636121419736, + "grad_norm": 3.0609943866729736, + "learning_rate": 4.999390372835193e-05, + "loss": 7.3052, + "step": 1183 + }, + { + "epoch": 0.007041583404700733, + "grad_norm": 3.54488468170166, + "learning_rate": 4.9993893409228176e-05, + "loss": 7.4845, + "step": 1184 + }, + { + "epoch": 0.0070475306879817295, + "grad_norm": 2.5196385383605957, + "learning_rate": 4.99938830813793e-05, + "loss": 7.312, + "step": 1185 + }, + { + "epoch": 0.007053477971262727, + "grad_norm": 3.570802927017212, + "learning_rate": 4.9993872744805326e-05, + "loss": 7.0038, + "step": 1186 + }, + { + "epoch": 0.007059425254543724, + "grad_norm": 2.631058931350708, + "learning_rate": 4.999386239950624e-05, + "loss": 7.5574, + "step": 1187 + }, + { + "epoch": 0.007065372537824722, + "grad_norm": 3.027251958847046, + "learning_rate": 4.999385204548206e-05, + "loss": 6.9837, + "step": 1188 + }, + { + "epoch": 0.0070713198211057185, + "grad_norm": 3.00128173828125, + "learning_rate": 4.999384168273279e-05, + "loss": 7.4479, + "step": 1189 + }, + { + "epoch": 0.007077267104386716, + "grad_norm": 2.127028226852417, + "learning_rate": 4.999383131125842e-05, + "loss": 7.3609, + "step": 1190 + }, + { + "epoch": 0.007083214387667713, + "grad_norm": 2.375511646270752, + "learning_rate": 4.9993820931058965e-05, + "loss": 7.3695, + "step": 1191 + }, + { + "epoch": 0.007089161670948711, + "grad_norm": 2.527743101119995, + "learning_rate": 4.999381054213442e-05, + "loss": 7.1478, + "step": 1192 + }, + { + "epoch": 0.0070951089542297075, + "grad_norm": 2.1600632667541504, + "learning_rate": 4.99938001444848e-05, + "loss": 7.7111, + "step": 1193 + }, + { + "epoch": 0.007101056237510705, + "grad_norm": 2.3242850303649902, + "learning_rate": 4.99937897381101e-05, + "loss": 7.6751, + "step": 1194 + }, + { + "epoch": 0.007107003520791702, + "grad_norm": 3.4553158283233643, + "learning_rate": 4.9993779323010334e-05, + "loss": 7.775, + "step": 1195 + }, + { + "epoch": 0.0071129508040727, + "grad_norm": 2.4339516162872314, + "learning_rate": 4.999376889918549e-05, + "loss": 7.099, + "step": 1196 + }, + { + "epoch": 0.0071188980873536966, + "grad_norm": 2.531851291656494, + "learning_rate": 4.9993758466635574e-05, + "loss": 7.5222, + "step": 1197 + }, + { + "epoch": 0.007124845370634694, + "grad_norm": 2.6549220085144043, + "learning_rate": 4.999374802536061e-05, + "loss": 7.4917, + "step": 1198 + }, + { + "epoch": 0.007130792653915691, + "grad_norm": 2.9149320125579834, + "learning_rate": 4.999373757536058e-05, + "loss": 7.0438, + "step": 1199 + }, + { + "epoch": 0.007136739937196689, + "grad_norm": 3.0234971046447754, + "learning_rate": 4.999372711663549e-05, + "loss": 7.6838, + "step": 1200 + }, + { + "epoch": 0.007142687220477686, + "grad_norm": 2.4006800651550293, + "learning_rate": 4.999371664918535e-05, + "loss": 7.6607, + "step": 1201 + }, + { + "epoch": 0.007148634503758683, + "grad_norm": 2.6191699504852295, + "learning_rate": 4.9993706173010164e-05, + "loss": 7.4727, + "step": 1202 + }, + { + "epoch": 0.00715458178703968, + "grad_norm": 3.040844440460205, + "learning_rate": 4.999369568810993e-05, + "loss": 7.1459, + "step": 1203 + }, + { + "epoch": 0.007160529070320678, + "grad_norm": 2.8474466800689697, + "learning_rate": 4.9993685194484654e-05, + "loss": 7.4615, + "step": 1204 + }, + { + "epoch": 0.007166476353601675, + "grad_norm": 1.928662657737732, + "learning_rate": 4.999367469213435e-05, + "loss": 7.4259, + "step": 1205 + }, + { + "epoch": 0.007172423636882672, + "grad_norm": 2.369540214538574, + "learning_rate": 4.999366418105901e-05, + "loss": 6.9342, + "step": 1206 + }, + { + "epoch": 0.007178370920163669, + "grad_norm": 4.003239154815674, + "learning_rate": 4.999365366125863e-05, + "loss": 7.3289, + "step": 1207 + }, + { + "epoch": 0.007184318203444667, + "grad_norm": 4.491976261138916, + "learning_rate": 4.9993643132733234e-05, + "loss": 7.3479, + "step": 1208 + }, + { + "epoch": 0.007190265486725664, + "grad_norm": 2.3678557872772217, + "learning_rate": 4.9993632595482806e-05, + "loss": 7.3091, + "step": 1209 + }, + { + "epoch": 0.007196212770006661, + "grad_norm": 2.9310050010681152, + "learning_rate": 4.999362204950737e-05, + "loss": 7.1996, + "step": 1210 + }, + { + "epoch": 0.007202160053287658, + "grad_norm": 3.6861345767974854, + "learning_rate": 4.999361149480691e-05, + "loss": 7.43, + "step": 1211 + }, + { + "epoch": 0.007208107336568656, + "grad_norm": 2.657515287399292, + "learning_rate": 4.9993600931381446e-05, + "loss": 6.9888, + "step": 1212 + }, + { + "epoch": 0.007214054619849653, + "grad_norm": 2.8346996307373047, + "learning_rate": 4.999359035923097e-05, + "loss": 7.0366, + "step": 1213 + }, + { + "epoch": 0.00722000190313065, + "grad_norm": 3.494162082672119, + "learning_rate": 4.9993579778355487e-05, + "loss": 7.499, + "step": 1214 + }, + { + "epoch": 0.007225949186411647, + "grad_norm": 2.9848556518554688, + "learning_rate": 4.999356918875501e-05, + "loss": 7.2064, + "step": 1215 + }, + { + "epoch": 0.007231896469692645, + "grad_norm": 2.391390562057495, + "learning_rate": 4.999355859042953e-05, + "loss": 7.2752, + "step": 1216 + }, + { + "epoch": 0.007237843752973642, + "grad_norm": 2.872891902923584, + "learning_rate": 4.9993547983379065e-05, + "loss": 6.9865, + "step": 1217 + }, + { + "epoch": 0.0072437910362546385, + "grad_norm": 2.760213613510132, + "learning_rate": 4.99935373676036e-05, + "loss": 7.0211, + "step": 1218 + }, + { + "epoch": 0.007249738319535636, + "grad_norm": 2.8857531547546387, + "learning_rate": 4.9993526743103156e-05, + "loss": 6.9162, + "step": 1219 + }, + { + "epoch": 0.007255685602816633, + "grad_norm": 3.150836229324341, + "learning_rate": 4.999351610987772e-05, + "loss": 7.2929, + "step": 1220 + }, + { + "epoch": 0.007261632886097631, + "grad_norm": 2.2004289627075195, + "learning_rate": 4.999350546792732e-05, + "loss": 7.4729, + "step": 1221 + }, + { + "epoch": 0.0072675801693786275, + "grad_norm": 2.5004026889801025, + "learning_rate": 4.999349481725194e-05, + "loss": 7.5235, + "step": 1222 + }, + { + "epoch": 0.007273527452659625, + "grad_norm": 2.8355395793914795, + "learning_rate": 4.999348415785159e-05, + "loss": 7.3535, + "step": 1223 + }, + { + "epoch": 0.007279474735940622, + "grad_norm": 2.559330701828003, + "learning_rate": 4.9993473489726276e-05, + "loss": 6.9634, + "step": 1224 + }, + { + "epoch": 0.00728542201922162, + "grad_norm": 2.3559181690216064, + "learning_rate": 4.999346281287599e-05, + "loss": 6.9246, + "step": 1225 + }, + { + "epoch": 0.0072913693025026165, + "grad_norm": 2.3852717876434326, + "learning_rate": 4.999345212730075e-05, + "loss": 6.6417, + "step": 1226 + }, + { + "epoch": 0.007297316585783614, + "grad_norm": 2.2604117393493652, + "learning_rate": 4.999344143300055e-05, + "loss": 7.4182, + "step": 1227 + }, + { + "epoch": 0.007303263869064611, + "grad_norm": 2.57983660697937, + "learning_rate": 4.9993430729975396e-05, + "loss": 7.4841, + "step": 1228 + }, + { + "epoch": 0.007309211152345609, + "grad_norm": 2.653935670852661, + "learning_rate": 4.99934200182253e-05, + "loss": 7.5477, + "step": 1229 + }, + { + "epoch": 0.0073151584356266055, + "grad_norm": 2.0740158557891846, + "learning_rate": 4.999340929775026e-05, + "loss": 7.4359, + "step": 1230 + }, + { + "epoch": 0.007321105718907603, + "grad_norm": 2.62064528465271, + "learning_rate": 4.9993398568550275e-05, + "loss": 7.1817, + "step": 1231 + }, + { + "epoch": 0.0073270530021886, + "grad_norm": 2.318244457244873, + "learning_rate": 4.999338783062536e-05, + "loss": 7.1663, + "step": 1232 + }, + { + "epoch": 0.007333000285469598, + "grad_norm": 3.0533225536346436, + "learning_rate": 4.99933770839755e-05, + "loss": 7.3051, + "step": 1233 + }, + { + "epoch": 0.0073389475687505945, + "grad_norm": 4.821422100067139, + "learning_rate": 4.999336632860072e-05, + "loss": 7.3435, + "step": 1234 + }, + { + "epoch": 0.007344894852031592, + "grad_norm": 2.680873155593872, + "learning_rate": 4.999335556450101e-05, + "loss": 7.3447, + "step": 1235 + }, + { + "epoch": 0.007350842135312589, + "grad_norm": 3.287454605102539, + "learning_rate": 4.999334479167638e-05, + "loss": 7.1957, + "step": 1236 + }, + { + "epoch": 0.007356789418593587, + "grad_norm": 3.7452759742736816, + "learning_rate": 4.999333401012682e-05, + "loss": 7.2093, + "step": 1237 + }, + { + "epoch": 0.0073627367018745836, + "grad_norm": 3.363443374633789, + "learning_rate": 4.999332321985236e-05, + "loss": 7.297, + "step": 1238 + }, + { + "epoch": 0.007368683985155581, + "grad_norm": 3.070962905883789, + "learning_rate": 4.999331242085299e-05, + "loss": 7.0831, + "step": 1239 + }, + { + "epoch": 0.007374631268436578, + "grad_norm": 3.635183095932007, + "learning_rate": 4.9993301613128706e-05, + "loss": 7.3116, + "step": 1240 + }, + { + "epoch": 0.007380578551717576, + "grad_norm": 2.532179594039917, + "learning_rate": 4.9993290796679516e-05, + "loss": 7.5238, + "step": 1241 + }, + { + "epoch": 0.007386525834998573, + "grad_norm": 2.1147687435150146, + "learning_rate": 4.999327997150543e-05, + "loss": 7.2279, + "step": 1242 + }, + { + "epoch": 0.00739247311827957, + "grad_norm": 2.1221182346343994, + "learning_rate": 4.999326913760645e-05, + "loss": 7.6575, + "step": 1243 + }, + { + "epoch": 0.007398420401560567, + "grad_norm": 2.2920000553131104, + "learning_rate": 4.999325829498257e-05, + "loss": 7.5652, + "step": 1244 + }, + { + "epoch": 0.007404367684841565, + "grad_norm": 2.3444230556488037, + "learning_rate": 4.9993247443633814e-05, + "loss": 7.3992, + "step": 1245 + }, + { + "epoch": 0.007410314968122562, + "grad_norm": 2.2778663635253906, + "learning_rate": 4.9993236583560164e-05, + "loss": 7.1212, + "step": 1246 + }, + { + "epoch": 0.007416262251403559, + "grad_norm": 2.38369083404541, + "learning_rate": 4.999322571476164e-05, + "loss": 7.4605, + "step": 1247 + }, + { + "epoch": 0.007422209534684556, + "grad_norm": 3.578537702560425, + "learning_rate": 4.999321483723823e-05, + "loss": 7.1446, + "step": 1248 + }, + { + "epoch": 0.007428156817965553, + "grad_norm": 5.227176666259766, + "learning_rate": 4.9993203950989954e-05, + "loss": 7.2308, + "step": 1249 + }, + { + "epoch": 0.007434104101246551, + "grad_norm": 2.665844440460205, + "learning_rate": 4.9993193056016805e-05, + "loss": 7.102, + "step": 1250 + }, + { + "epoch": 0.007440051384527547, + "grad_norm": 4.462922096252441, + "learning_rate": 4.9993182152318796e-05, + "loss": 7.003, + "step": 1251 + }, + { + "epoch": 0.007445998667808545, + "grad_norm": 4.9459099769592285, + "learning_rate": 4.999317123989592e-05, + "loss": 7.1338, + "step": 1252 + }, + { + "epoch": 0.007451945951089542, + "grad_norm": 3.127427339553833, + "learning_rate": 4.9993160318748186e-05, + "loss": 7.045, + "step": 1253 + }, + { + "epoch": 0.00745789323437054, + "grad_norm": 3.03910231590271, + "learning_rate": 4.9993149388875606e-05, + "loss": 6.8523, + "step": 1254 + }, + { + "epoch": 0.0074638405176515365, + "grad_norm": 2.931033134460449, + "learning_rate": 4.9993138450278166e-05, + "loss": 7.3065, + "step": 1255 + }, + { + "epoch": 0.007469787800932534, + "grad_norm": 4.60735559463501, + "learning_rate": 4.999312750295588e-05, + "loss": 7.5384, + "step": 1256 + }, + { + "epoch": 0.007475735084213531, + "grad_norm": 3.0745065212249756, + "learning_rate": 4.9993116546908755e-05, + "loss": 7.6279, + "step": 1257 + }, + { + "epoch": 0.007481682367494529, + "grad_norm": 2.7158751487731934, + "learning_rate": 4.9993105582136804e-05, + "loss": 7.1885, + "step": 1258 + }, + { + "epoch": 0.0074876296507755255, + "grad_norm": 3.5049819946289062, + "learning_rate": 4.999309460864e-05, + "loss": 6.6833, + "step": 1259 + }, + { + "epoch": 0.007493576934056523, + "grad_norm": 3.229778289794922, + "learning_rate": 4.999308362641837e-05, + "loss": 6.784, + "step": 1260 + }, + { + "epoch": 0.00749952421733752, + "grad_norm": 2.7032854557037354, + "learning_rate": 4.999307263547191e-05, + "loss": 6.8003, + "step": 1261 + }, + { + "epoch": 0.007505471500618518, + "grad_norm": 5.892059326171875, + "learning_rate": 4.999306163580063e-05, + "loss": 7.2365, + "step": 1262 + }, + { + "epoch": 0.0075114187838995145, + "grad_norm": 5.8021135330200195, + "learning_rate": 4.999305062740453e-05, + "loss": 7.3822, + "step": 1263 + }, + { + "epoch": 0.007517366067180512, + "grad_norm": 5.1242899894714355, + "learning_rate": 4.9993039610283614e-05, + "loss": 7.2192, + "step": 1264 + }, + { + "epoch": 0.007523313350461509, + "grad_norm": 3.102980375289917, + "learning_rate": 4.9993028584437884e-05, + "loss": 7.4895, + "step": 1265 + }, + { + "epoch": 0.007529260633742507, + "grad_norm": 4.993838310241699, + "learning_rate": 4.999301754986735e-05, + "loss": 7.4771, + "step": 1266 + }, + { + "epoch": 0.0075352079170235035, + "grad_norm": 4.003589630126953, + "learning_rate": 4.999300650657201e-05, + "loss": 7.3591, + "step": 1267 + }, + { + "epoch": 0.007541155200304501, + "grad_norm": 3.6125710010528564, + "learning_rate": 4.999299545455187e-05, + "loss": 7.262, + "step": 1268 + }, + { + "epoch": 0.007547102483585498, + "grad_norm": 3.182196617126465, + "learning_rate": 4.999298439380693e-05, + "loss": 7.2689, + "step": 1269 + }, + { + "epoch": 0.007553049766866496, + "grad_norm": 2.428313732147217, + "learning_rate": 4.99929733243372e-05, + "loss": 7.2364, + "step": 1270 + }, + { + "epoch": 0.0075589970501474925, + "grad_norm": 2.673356771469116, + "learning_rate": 4.999296224614268e-05, + "loss": 7.2356, + "step": 1271 + }, + { + "epoch": 0.00756494433342849, + "grad_norm": 2.508026361465454, + "learning_rate": 4.9992951159223376e-05, + "loss": 7.1052, + "step": 1272 + }, + { + "epoch": 0.007570891616709487, + "grad_norm": 2.7501845359802246, + "learning_rate": 4.99929400635793e-05, + "loss": 7.5041, + "step": 1273 + }, + { + "epoch": 0.007576838899990485, + "grad_norm": 2.4604434967041016, + "learning_rate": 4.999292895921044e-05, + "loss": 7.5042, + "step": 1274 + }, + { + "epoch": 0.0075827861832714815, + "grad_norm": 2.4926865100860596, + "learning_rate": 4.99929178461168e-05, + "loss": 7.2104, + "step": 1275 + }, + { + "epoch": 0.007588733466552479, + "grad_norm": 2.631985664367676, + "learning_rate": 4.999290672429839e-05, + "loss": 6.8608, + "step": 1276 + }, + { + "epoch": 0.007594680749833476, + "grad_norm": 2.5684268474578857, + "learning_rate": 4.999289559375523e-05, + "loss": 7.1199, + "step": 1277 + }, + { + "epoch": 0.007600628033114474, + "grad_norm": 2.4312644004821777, + "learning_rate": 4.99928844544873e-05, + "loss": 7.1814, + "step": 1278 + }, + { + "epoch": 0.0076065753163954706, + "grad_norm": 2.794407367706299, + "learning_rate": 4.99928733064946e-05, + "loss": 7.2909, + "step": 1279 + }, + { + "epoch": 0.007612522599676467, + "grad_norm": 2.5903992652893066, + "learning_rate": 4.9992862149777166e-05, + "loss": 7.354, + "step": 1280 + }, + { + "epoch": 0.007618469882957465, + "grad_norm": 2.266364336013794, + "learning_rate": 4.999285098433497e-05, + "loss": 7.5697, + "step": 1281 + }, + { + "epoch": 0.007624417166238462, + "grad_norm": 3.1871070861816406, + "learning_rate": 4.999283981016803e-05, + "loss": 7.4393, + "step": 1282 + }, + { + "epoch": 0.00763036444951946, + "grad_norm": 2.137981653213501, + "learning_rate": 4.999282862727635e-05, + "loss": 7.3591, + "step": 1283 + }, + { + "epoch": 0.007636311732800456, + "grad_norm": 2.3166019916534424, + "learning_rate": 4.999281743565993e-05, + "loss": 7.4307, + "step": 1284 + }, + { + "epoch": 0.007642259016081454, + "grad_norm": 2.331110954284668, + "learning_rate": 4.999280623531878e-05, + "loss": 7.3214, + "step": 1285 + }, + { + "epoch": 0.007648206299362451, + "grad_norm": 2.7417728900909424, + "learning_rate": 4.999279502625289e-05, + "loss": 7.3593, + "step": 1286 + }, + { + "epoch": 0.007654153582643449, + "grad_norm": 3.089448928833008, + "learning_rate": 4.999278380846228e-05, + "loss": 7.3347, + "step": 1287 + }, + { + "epoch": 0.007660100865924445, + "grad_norm": 2.9446022510528564, + "learning_rate": 4.999277258194694e-05, + "loss": 7.3109, + "step": 1288 + }, + { + "epoch": 0.007666048149205443, + "grad_norm": 2.713355302810669, + "learning_rate": 4.9992761346706896e-05, + "loss": 7.2962, + "step": 1289 + }, + { + "epoch": 0.00767199543248644, + "grad_norm": 2.9480702877044678, + "learning_rate": 4.9992750102742125e-05, + "loss": 7.2081, + "step": 1290 + }, + { + "epoch": 0.007677942715767438, + "grad_norm": 2.737271785736084, + "learning_rate": 4.999273885005265e-05, + "loss": 7.2251, + "step": 1291 + }, + { + "epoch": 0.007683889999048434, + "grad_norm": 2.6954190731048584, + "learning_rate": 4.9992727588638466e-05, + "loss": 7.3437, + "step": 1292 + }, + { + "epoch": 0.007689837282329432, + "grad_norm": 3.0270752906799316, + "learning_rate": 4.999271631849958e-05, + "loss": 7.2516, + "step": 1293 + }, + { + "epoch": 0.007695784565610429, + "grad_norm": 2.824052333831787, + "learning_rate": 4.999270503963599e-05, + "loss": 7.2706, + "step": 1294 + }, + { + "epoch": 0.007701731848891427, + "grad_norm": 2.800713300704956, + "learning_rate": 4.999269375204771e-05, + "loss": 7.2497, + "step": 1295 + }, + { + "epoch": 0.0077076791321724234, + "grad_norm": 3.2510271072387695, + "learning_rate": 4.999268245573474e-05, + "loss": 7.025, + "step": 1296 + }, + { + "epoch": 0.007713626415453421, + "grad_norm": 3.095862627029419, + "learning_rate": 4.999267115069708e-05, + "loss": 7.1815, + "step": 1297 + }, + { + "epoch": 0.007719573698734418, + "grad_norm": 3.2238826751708984, + "learning_rate": 4.999265983693473e-05, + "loss": 7.2268, + "step": 1298 + }, + { + "epoch": 0.007725520982015416, + "grad_norm": 3.18687105178833, + "learning_rate": 4.999264851444771e-05, + "loss": 7.2076, + "step": 1299 + }, + { + "epoch": 0.0077314682652964125, + "grad_norm": 3.1385931968688965, + "learning_rate": 4.9992637183236016e-05, + "loss": 7.2323, + "step": 1300 + }, + { + "epoch": 0.00773741554857741, + "grad_norm": 2.3172361850738525, + "learning_rate": 4.999262584329964e-05, + "loss": 7.1225, + "step": 1301 + }, + { + "epoch": 0.007743362831858407, + "grad_norm": 3.3223013877868652, + "learning_rate": 4.99926144946386e-05, + "loss": 7.2108, + "step": 1302 + }, + { + "epoch": 0.007749310115139405, + "grad_norm": 3.197218894958496, + "learning_rate": 4.99926031372529e-05, + "loss": 7.5123, + "step": 1303 + }, + { + "epoch": 0.0077552573984204015, + "grad_norm": 2.8411800861358643, + "learning_rate": 4.999259177114254e-05, + "loss": 7.3047, + "step": 1304 + }, + { + "epoch": 0.007761204681701399, + "grad_norm": 2.7549736499786377, + "learning_rate": 4.9992580396307524e-05, + "loss": 7.3478, + "step": 1305 + }, + { + "epoch": 0.007767151964982396, + "grad_norm": 2.8829352855682373, + "learning_rate": 4.999256901274786e-05, + "loss": 7.1871, + "step": 1306 + }, + { + "epoch": 0.007773099248263394, + "grad_norm": 2.710076332092285, + "learning_rate": 4.999255762046354e-05, + "loss": 7.0891, + "step": 1307 + }, + { + "epoch": 0.0077790465315443905, + "grad_norm": 2.6598877906799316, + "learning_rate": 4.999254621945458e-05, + "loss": 7.6178, + "step": 1308 + }, + { + "epoch": 0.007784993814825388, + "grad_norm": 2.4012649059295654, + "learning_rate": 4.999253480972099e-05, + "loss": 7.5925, + "step": 1309 + }, + { + "epoch": 0.007790941098106385, + "grad_norm": 2.1501622200012207, + "learning_rate": 4.999252339126275e-05, + "loss": 7.6471, + "step": 1310 + }, + { + "epoch": 0.007796888381387382, + "grad_norm": 3.2150895595550537, + "learning_rate": 4.9992511964079886e-05, + "loss": 7.3995, + "step": 1311 + }, + { + "epoch": 0.0078028356646683795, + "grad_norm": 2.450465440750122, + "learning_rate": 4.9992500528172395e-05, + "loss": 7.219, + "step": 1312 + }, + { + "epoch": 0.007808782947949376, + "grad_norm": 2.714510679244995, + "learning_rate": 4.9992489083540274e-05, + "loss": 7.2023, + "step": 1313 + }, + { + "epoch": 0.007814730231230374, + "grad_norm": 2.660019636154175, + "learning_rate": 4.999247763018354e-05, + "loss": 6.8686, + "step": 1314 + }, + { + "epoch": 0.00782067751451137, + "grad_norm": 2.1031477451324463, + "learning_rate": 4.999246616810218e-05, + "loss": 7.305, + "step": 1315 + }, + { + "epoch": 0.007826624797792368, + "grad_norm": 3.0037856101989746, + "learning_rate": 4.999245469729622e-05, + "loss": 6.9788, + "step": 1316 + }, + { + "epoch": 0.007832572081073366, + "grad_norm": 3.1931207180023193, + "learning_rate": 4.999244321776565e-05, + "loss": 6.9312, + "step": 1317 + }, + { + "epoch": 0.007838519364354363, + "grad_norm": 2.7419891357421875, + "learning_rate": 4.999243172951047e-05, + "loss": 6.7732, + "step": 1318 + }, + { + "epoch": 0.00784446664763536, + "grad_norm": 2.772061824798584, + "learning_rate": 4.99924202325307e-05, + "loss": 6.9576, + "step": 1319 + }, + { + "epoch": 0.007850413930916357, + "grad_norm": 2.9300522804260254, + "learning_rate": 4.999240872682632e-05, + "loss": 6.8366, + "step": 1320 + }, + { + "epoch": 0.007856361214197355, + "grad_norm": 3.4697458744049072, + "learning_rate": 4.9992397212397365e-05, + "loss": 6.9234, + "step": 1321 + }, + { + "epoch": 0.007862308497478352, + "grad_norm": 3.044647693634033, + "learning_rate": 4.999238568924381e-05, + "loss": 6.8406, + "step": 1322 + }, + { + "epoch": 0.007868255780759349, + "grad_norm": 2.4429051876068115, + "learning_rate": 4.999237415736567e-05, + "loss": 6.9815, + "step": 1323 + }, + { + "epoch": 0.007874203064040346, + "grad_norm": 2.6193530559539795, + "learning_rate": 4.999236261676296e-05, + "loss": 7.3867, + "step": 1324 + }, + { + "epoch": 0.007880150347321344, + "grad_norm": 3.9543204307556152, + "learning_rate": 4.999235106743567e-05, + "loss": 7.2391, + "step": 1325 + }, + { + "epoch": 0.007886097630602341, + "grad_norm": 3.12777042388916, + "learning_rate": 4.9992339509383814e-05, + "loss": 7.0976, + "step": 1326 + }, + { + "epoch": 0.007892044913883338, + "grad_norm": 2.4543895721435547, + "learning_rate": 4.999232794260739e-05, + "loss": 7.1865, + "step": 1327 + }, + { + "epoch": 0.007897992197164335, + "grad_norm": 4.254832744598389, + "learning_rate": 4.999231636710639e-05, + "loss": 6.777, + "step": 1328 + }, + { + "epoch": 0.007903939480445333, + "grad_norm": 2.7835497856140137, + "learning_rate": 4.999230478288084e-05, + "loss": 6.8508, + "step": 1329 + }, + { + "epoch": 0.00790988676372633, + "grad_norm": 3.2724666595458984, + "learning_rate": 4.999229318993073e-05, + "loss": 6.7636, + "step": 1330 + }, + { + "epoch": 0.007915834047007327, + "grad_norm": 4.657248020172119, + "learning_rate": 4.9992281588256075e-05, + "loss": 7.3677, + "step": 1331 + }, + { + "epoch": 0.007921781330288324, + "grad_norm": 6.201416492462158, + "learning_rate": 4.999226997785686e-05, + "loss": 7.5804, + "step": 1332 + }, + { + "epoch": 0.007927728613569322, + "grad_norm": 4.955161094665527, + "learning_rate": 4.999225835873312e-05, + "loss": 7.1867, + "step": 1333 + }, + { + "epoch": 0.007933675896850319, + "grad_norm": 3.4105887413024902, + "learning_rate": 4.9992246730884826e-05, + "loss": 7.0948, + "step": 1334 + }, + { + "epoch": 0.007939623180131316, + "grad_norm": 2.514570951461792, + "learning_rate": 4.999223509431201e-05, + "loss": 6.9367, + "step": 1335 + }, + { + "epoch": 0.007945570463412313, + "grad_norm": 3.7689249515533447, + "learning_rate": 4.9992223449014654e-05, + "loss": 7.2209, + "step": 1336 + }, + { + "epoch": 0.007951517746693311, + "grad_norm": 4.997833728790283, + "learning_rate": 4.999221179499277e-05, + "loss": 7.3336, + "step": 1337 + }, + { + "epoch": 0.007957465029974308, + "grad_norm": 5.1314287185668945, + "learning_rate": 4.999220013224637e-05, + "loss": 6.933, + "step": 1338 + }, + { + "epoch": 0.007963412313255305, + "grad_norm": 3.708528518676758, + "learning_rate": 4.9992188460775447e-05, + "loss": 6.9598, + "step": 1339 + }, + { + "epoch": 0.007969359596536302, + "grad_norm": 3.029602289199829, + "learning_rate": 4.999217678058001e-05, + "loss": 7.3674, + "step": 1340 + }, + { + "epoch": 0.007975306879817299, + "grad_norm": 3.000312089920044, + "learning_rate": 4.999216509166006e-05, + "loss": 7.2705, + "step": 1341 + }, + { + "epoch": 0.007981254163098297, + "grad_norm": 4.852355480194092, + "learning_rate": 4.999215339401561e-05, + "loss": 7.1842, + "step": 1342 + }, + { + "epoch": 0.007987201446379294, + "grad_norm": 3.0430521965026855, + "learning_rate": 4.999214168764664e-05, + "loss": 7.5616, + "step": 1343 + }, + { + "epoch": 0.00799314872966029, + "grad_norm": 2.793760061264038, + "learning_rate": 4.999212997255319e-05, + "loss": 7.4867, + "step": 1344 + }, + { + "epoch": 0.007999096012941288, + "grad_norm": 3.516545295715332, + "learning_rate": 4.9992118248735245e-05, + "loss": 7.5857, + "step": 1345 + }, + { + "epoch": 0.008005043296222286, + "grad_norm": 4.272013187408447, + "learning_rate": 4.9992106516192796e-05, + "loss": 7.5686, + "step": 1346 + }, + { + "epoch": 0.008010990579503283, + "grad_norm": 3.176974058151245, + "learning_rate": 4.999209477492587e-05, + "loss": 7.1826, + "step": 1347 + }, + { + "epoch": 0.00801693786278428, + "grad_norm": 3.2615413665771484, + "learning_rate": 4.999208302493447e-05, + "loss": 7.3933, + "step": 1348 + }, + { + "epoch": 0.008022885146065277, + "grad_norm": 2.9548113346099854, + "learning_rate": 4.999207126621858e-05, + "loss": 7.339, + "step": 1349 + }, + { + "epoch": 0.008028832429346275, + "grad_norm": 3.445829153060913, + "learning_rate": 4.999205949877822e-05, + "loss": 7.4223, + "step": 1350 + }, + { + "epoch": 0.008034779712627272, + "grad_norm": 3.471991777420044, + "learning_rate": 4.999204772261338e-05, + "loss": 7.4192, + "step": 1351 + }, + { + "epoch": 0.008040726995908269, + "grad_norm": 3.1682589054107666, + "learning_rate": 4.999203593772409e-05, + "loss": 7.3433, + "step": 1352 + }, + { + "epoch": 0.008046674279189266, + "grad_norm": 4.693798065185547, + "learning_rate": 4.999202414411033e-05, + "loss": 7.1479, + "step": 1353 + }, + { + "epoch": 0.008052621562470264, + "grad_norm": 3.0599937438964844, + "learning_rate": 4.9992012341772114e-05, + "loss": 7.3137, + "step": 1354 + }, + { + "epoch": 0.008058568845751261, + "grad_norm": 2.9557557106018066, + "learning_rate": 4.999200053070945e-05, + "loss": 7.4466, + "step": 1355 + }, + { + "epoch": 0.008064516129032258, + "grad_norm": 2.5595791339874268, + "learning_rate": 4.999198871092233e-05, + "loss": 7.4716, + "step": 1356 + }, + { + "epoch": 0.008070463412313255, + "grad_norm": 2.919729709625244, + "learning_rate": 4.999197688241076e-05, + "loss": 7.0754, + "step": 1357 + }, + { + "epoch": 0.008076410695594253, + "grad_norm": 2.5880625247955322, + "learning_rate": 4.9991965045174763e-05, + "loss": 7.2794, + "step": 1358 + }, + { + "epoch": 0.00808235797887525, + "grad_norm": 2.9933066368103027, + "learning_rate": 4.999195319921432e-05, + "loss": 7.3547, + "step": 1359 + }, + { + "epoch": 0.008088305262156247, + "grad_norm": 5.097862243652344, + "learning_rate": 4.999194134452945e-05, + "loss": 7.1922, + "step": 1360 + }, + { + "epoch": 0.008094252545437244, + "grad_norm": 4.1795830726623535, + "learning_rate": 4.9991929481120146e-05, + "loss": 7.0437, + "step": 1361 + }, + { + "epoch": 0.008100199828718242, + "grad_norm": 3.292961835861206, + "learning_rate": 4.999191760898642e-05, + "loss": 6.8637, + "step": 1362 + }, + { + "epoch": 0.008106147111999239, + "grad_norm": 3.052610397338867, + "learning_rate": 4.999190572812828e-05, + "loss": 7.1675, + "step": 1363 + }, + { + "epoch": 0.008112094395280236, + "grad_norm": 2.975646734237671, + "learning_rate": 4.999189383854571e-05, + "loss": 7.1309, + "step": 1364 + }, + { + "epoch": 0.008118041678561233, + "grad_norm": 2.71195912361145, + "learning_rate": 4.999188194023874e-05, + "loss": 7.2247, + "step": 1365 + }, + { + "epoch": 0.008123988961842231, + "grad_norm": 2.751002311706543, + "learning_rate": 4.9991870033207354e-05, + "loss": 6.8553, + "step": 1366 + }, + { + "epoch": 0.008129936245123228, + "grad_norm": 3.4521234035491943, + "learning_rate": 4.999185811745157e-05, + "loss": 6.8373, + "step": 1367 + }, + { + "epoch": 0.008135883528404225, + "grad_norm": 3.054330348968506, + "learning_rate": 4.999184619297138e-05, + "loss": 6.6982, + "step": 1368 + }, + { + "epoch": 0.008141830811685222, + "grad_norm": 3.513794183731079, + "learning_rate": 4.99918342597668e-05, + "loss": 6.5567, + "step": 1369 + }, + { + "epoch": 0.00814777809496622, + "grad_norm": 3.681838274002075, + "learning_rate": 4.9991822317837836e-05, + "loss": 6.6335, + "step": 1370 + }, + { + "epoch": 0.008153725378247217, + "grad_norm": 4.144393444061279, + "learning_rate": 4.999181036718447e-05, + "loss": 6.5361, + "step": 1371 + }, + { + "epoch": 0.008159672661528214, + "grad_norm": 2.9771196842193604, + "learning_rate": 4.9991798407806736e-05, + "loss": 7.0085, + "step": 1372 + }, + { + "epoch": 0.00816561994480921, + "grad_norm": 3.114884376525879, + "learning_rate": 4.9991786439704615e-05, + "loss": 7.1498, + "step": 1373 + }, + { + "epoch": 0.008171567228090208, + "grad_norm": 2.76042103767395, + "learning_rate": 4.9991774462878115e-05, + "loss": 6.8462, + "step": 1374 + }, + { + "epoch": 0.008177514511371206, + "grad_norm": 3.257528066635132, + "learning_rate": 4.999176247732725e-05, + "loss": 6.4595, + "step": 1375 + }, + { + "epoch": 0.008183461794652203, + "grad_norm": 3.377774238586426, + "learning_rate": 4.999175048305202e-05, + "loss": 6.3131, + "step": 1376 + }, + { + "epoch": 0.0081894090779332, + "grad_norm": 3.029477834701538, + "learning_rate": 4.999173848005243e-05, + "loss": 6.7182, + "step": 1377 + }, + { + "epoch": 0.008195356361214197, + "grad_norm": 3.0353076457977295, + "learning_rate": 4.9991726468328476e-05, + "loss": 7.009, + "step": 1378 + }, + { + "epoch": 0.008201303644495195, + "grad_norm": 2.465014934539795, + "learning_rate": 4.999171444788017e-05, + "loss": 7.6277, + "step": 1379 + }, + { + "epoch": 0.008207250927776192, + "grad_norm": 3.025954484939575, + "learning_rate": 4.999170241870752e-05, + "loss": 7.2815, + "step": 1380 + }, + { + "epoch": 0.008213198211057189, + "grad_norm": 3.8414018154144287, + "learning_rate": 4.999169038081052e-05, + "loss": 7.2238, + "step": 1381 + }, + { + "epoch": 0.008219145494338186, + "grad_norm": 3.2927470207214355, + "learning_rate": 4.999167833418918e-05, + "loss": 7.1505, + "step": 1382 + }, + { + "epoch": 0.008225092777619184, + "grad_norm": 2.6132330894470215, + "learning_rate": 4.999166627884351e-05, + "loss": 7.2499, + "step": 1383 + }, + { + "epoch": 0.008231040060900181, + "grad_norm": 2.523366689682007, + "learning_rate": 4.9991654214773497e-05, + "loss": 6.9812, + "step": 1384 + }, + { + "epoch": 0.008236987344181178, + "grad_norm": 3.977471351623535, + "learning_rate": 4.9991642141979154e-05, + "loss": 7.3196, + "step": 1385 + }, + { + "epoch": 0.008242934627462175, + "grad_norm": 2.731952428817749, + "learning_rate": 4.99916300604605e-05, + "loss": 7.1014, + "step": 1386 + }, + { + "epoch": 0.008248881910743173, + "grad_norm": 2.6128756999969482, + "learning_rate": 4.999161797021752e-05, + "loss": 7.0235, + "step": 1387 + }, + { + "epoch": 0.00825482919402417, + "grad_norm": 2.263430595397949, + "learning_rate": 4.999160587125023e-05, + "loss": 7.0183, + "step": 1388 + }, + { + "epoch": 0.008260776477305167, + "grad_norm": 2.799994707107544, + "learning_rate": 4.9991593763558614e-05, + "loss": 6.9553, + "step": 1389 + }, + { + "epoch": 0.008266723760586164, + "grad_norm": 2.5443058013916016, + "learning_rate": 4.99915816471427e-05, + "loss": 7.2302, + "step": 1390 + }, + { + "epoch": 0.008272671043867162, + "grad_norm": 2.304185152053833, + "learning_rate": 4.999156952200248e-05, + "loss": 7.2589, + "step": 1391 + }, + { + "epoch": 0.008278618327148159, + "grad_norm": 2.1639649868011475, + "learning_rate": 4.999155738813797e-05, + "loss": 7.0067, + "step": 1392 + }, + { + "epoch": 0.008284565610429156, + "grad_norm": 2.276514768600464, + "learning_rate": 4.999154524554915e-05, + "loss": 7.2721, + "step": 1393 + }, + { + "epoch": 0.008290512893710153, + "grad_norm": 2.212200880050659, + "learning_rate": 4.9991533094236055e-05, + "loss": 7.1183, + "step": 1394 + }, + { + "epoch": 0.008296460176991151, + "grad_norm": 2.5289459228515625, + "learning_rate": 4.999152093419867e-05, + "loss": 7.0289, + "step": 1395 + }, + { + "epoch": 0.008302407460272148, + "grad_norm": 2.5915603637695312, + "learning_rate": 4.999150876543699e-05, + "loss": 6.7497, + "step": 1396 + }, + { + "epoch": 0.008308354743553145, + "grad_norm": 2.680513858795166, + "learning_rate": 4.999149658795105e-05, + "loss": 6.7139, + "step": 1397 + }, + { + "epoch": 0.008314302026834142, + "grad_norm": 2.65744948387146, + "learning_rate": 4.999148440174083e-05, + "loss": 6.6151, + "step": 1398 + }, + { + "epoch": 0.00832024931011514, + "grad_norm": 3.8028745651245117, + "learning_rate": 4.9991472206806334e-05, + "loss": 7.1992, + "step": 1399 + }, + { + "epoch": 0.008326196593396137, + "grad_norm": 2.8436119556427, + "learning_rate": 4.999146000314758e-05, + "loss": 7.165, + "step": 1400 + }, + { + "epoch": 0.008332143876677134, + "grad_norm": 2.6658496856689453, + "learning_rate": 4.999144779076457e-05, + "loss": 7.5945, + "step": 1401 + }, + { + "epoch": 0.00833809115995813, + "grad_norm": 2.909703016281128, + "learning_rate": 4.99914355696573e-05, + "loss": 7.6378, + "step": 1402 + }, + { + "epoch": 0.00834403844323913, + "grad_norm": 2.5827598571777344, + "learning_rate": 4.9991423339825776e-05, + "loss": 7.5441, + "step": 1403 + }, + { + "epoch": 0.008349985726520126, + "grad_norm": 3.0283706188201904, + "learning_rate": 4.999141110127e-05, + "loss": 7.1162, + "step": 1404 + }, + { + "epoch": 0.008355933009801123, + "grad_norm": 3.11690354347229, + "learning_rate": 4.999139885398999e-05, + "loss": 6.5123, + "step": 1405 + }, + { + "epoch": 0.00836188029308212, + "grad_norm": 2.6188690662384033, + "learning_rate": 4.999138659798574e-05, + "loss": 7.6384, + "step": 1406 + }, + { + "epoch": 0.008367827576363117, + "grad_norm": 3.4412481784820557, + "learning_rate": 4.999137433325725e-05, + "loss": 7.4067, + "step": 1407 + }, + { + "epoch": 0.008373774859644115, + "grad_norm": 3.1690893173217773, + "learning_rate": 4.999136205980454e-05, + "loss": 7.3937, + "step": 1408 + }, + { + "epoch": 0.008379722142925112, + "grad_norm": 2.1589877605438232, + "learning_rate": 4.999134977762759e-05, + "loss": 7.454, + "step": 1409 + }, + { + "epoch": 0.008385669426206109, + "grad_norm": 2.485901117324829, + "learning_rate": 4.999133748672642e-05, + "loss": 7.3421, + "step": 1410 + }, + { + "epoch": 0.008391616709487106, + "grad_norm": 2.543128252029419, + "learning_rate": 4.999132518710104e-05, + "loss": 7.3162, + "step": 1411 + }, + { + "epoch": 0.008397563992768104, + "grad_norm": 2.8048489093780518, + "learning_rate": 4.999131287875144e-05, + "loss": 7.297, + "step": 1412 + }, + { + "epoch": 0.008403511276049101, + "grad_norm": 3.0391035079956055, + "learning_rate": 4.9991300561677634e-05, + "loss": 7.2409, + "step": 1413 + }, + { + "epoch": 0.008409458559330098, + "grad_norm": 2.3196053504943848, + "learning_rate": 4.999128823587962e-05, + "loss": 7.1358, + "step": 1414 + }, + { + "epoch": 0.008415405842611095, + "grad_norm": 3.1876983642578125, + "learning_rate": 4.999127590135741e-05, + "loss": 7.1501, + "step": 1415 + }, + { + "epoch": 0.008421353125892093, + "grad_norm": 3.6832327842712402, + "learning_rate": 4.9991263558111e-05, + "loss": 7.181, + "step": 1416 + }, + { + "epoch": 0.00842730040917309, + "grad_norm": 3.7491936683654785, + "learning_rate": 4.99912512061404e-05, + "loss": 6.9669, + "step": 1417 + }, + { + "epoch": 0.008433247692454087, + "grad_norm": 3.1583478450775146, + "learning_rate": 4.9991238845445615e-05, + "loss": 7.2155, + "step": 1418 + }, + { + "epoch": 0.008439194975735084, + "grad_norm": 3.11611008644104, + "learning_rate": 4.999122647602664e-05, + "loss": 7.164, + "step": 1419 + }, + { + "epoch": 0.008445142259016082, + "grad_norm": 6.127118110656738, + "learning_rate": 4.9991214097883495e-05, + "loss": 7.232, + "step": 1420 + }, + { + "epoch": 0.008451089542297079, + "grad_norm": 4.736495494842529, + "learning_rate": 4.9991201711016166e-05, + "loss": 7.3685, + "step": 1421 + }, + { + "epoch": 0.008457036825578076, + "grad_norm": 2.9656684398651123, + "learning_rate": 4.999118931542467e-05, + "loss": 7.2658, + "step": 1422 + }, + { + "epoch": 0.008462984108859073, + "grad_norm": 2.5959243774414062, + "learning_rate": 4.999117691110901e-05, + "loss": 7.0908, + "step": 1423 + }, + { + "epoch": 0.008468931392140071, + "grad_norm": 4.546379089355469, + "learning_rate": 4.999116449806919e-05, + "loss": 7.1343, + "step": 1424 + }, + { + "epoch": 0.008474878675421068, + "grad_norm": 3.6856796741485596, + "learning_rate": 4.9991152076305206e-05, + "loss": 6.9205, + "step": 1425 + }, + { + "epoch": 0.008480825958702065, + "grad_norm": 3.293973922729492, + "learning_rate": 4.9991139645817075e-05, + "loss": 6.9954, + "step": 1426 + }, + { + "epoch": 0.008486773241983062, + "grad_norm": 3.2511162757873535, + "learning_rate": 4.999112720660479e-05, + "loss": 6.7661, + "step": 1427 + }, + { + "epoch": 0.00849272052526406, + "grad_norm": 3.990840196609497, + "learning_rate": 4.9991114758668364e-05, + "loss": 6.7402, + "step": 1428 + }, + { + "epoch": 0.008498667808545057, + "grad_norm": 3.306809186935425, + "learning_rate": 4.9991102302007804e-05, + "loss": 6.6801, + "step": 1429 + }, + { + "epoch": 0.008504615091826054, + "grad_norm": 5.208675384521484, + "learning_rate": 4.99910898366231e-05, + "loss": 7.0128, + "step": 1430 + }, + { + "epoch": 0.00851056237510705, + "grad_norm": 4.131346225738525, + "learning_rate": 4.9991077362514266e-05, + "loss": 7.0992, + "step": 1431 + }, + { + "epoch": 0.00851650965838805, + "grad_norm": 2.60927152633667, + "learning_rate": 4.99910648796813e-05, + "loss": 7.2731, + "step": 1432 + }, + { + "epoch": 0.008522456941669046, + "grad_norm": 5.654631614685059, + "learning_rate": 4.9991052388124224e-05, + "loss": 6.6105, + "step": 1433 + }, + { + "epoch": 0.008528404224950043, + "grad_norm": 6.108455657958984, + "learning_rate": 4.9991039887843025e-05, + "loss": 6.3548, + "step": 1434 + }, + { + "epoch": 0.00853435150823104, + "grad_norm": 3.758371591567993, + "learning_rate": 4.9991027378837705e-05, + "loss": 6.6171, + "step": 1435 + }, + { + "epoch": 0.008540298791512036, + "grad_norm": 2.1995320320129395, + "learning_rate": 4.9991014861108285e-05, + "loss": 6.5987, + "step": 1436 + }, + { + "epoch": 0.008546246074793035, + "grad_norm": 2.3778254985809326, + "learning_rate": 4.999100233465476e-05, + "loss": 6.8067, + "step": 1437 + }, + { + "epoch": 0.008552193358074032, + "grad_norm": 2.521928310394287, + "learning_rate": 4.999098979947713e-05, + "loss": 6.7756, + "step": 1438 + }, + { + "epoch": 0.008558140641355029, + "grad_norm": 2.109605073928833, + "learning_rate": 4.99909772555754e-05, + "loss": 6.7091, + "step": 1439 + }, + { + "epoch": 0.008564087924636025, + "grad_norm": 2.55838680267334, + "learning_rate": 4.9990964702949585e-05, + "loss": 6.8989, + "step": 1440 + }, + { + "epoch": 0.008570035207917024, + "grad_norm": 2.4499685764312744, + "learning_rate": 4.9990952141599675e-05, + "loss": 6.6241, + "step": 1441 + }, + { + "epoch": 0.00857598249119802, + "grad_norm": 2.265371322631836, + "learning_rate": 4.9990939571525685e-05, + "loss": 7.6681, + "step": 1442 + }, + { + "epoch": 0.008581929774479018, + "grad_norm": 2.4496965408325195, + "learning_rate": 4.999092699272762e-05, + "loss": 6.8177, + "step": 1443 + }, + { + "epoch": 0.008587877057760014, + "grad_norm": 2.5555005073547363, + "learning_rate": 4.999091440520548e-05, + "loss": 6.6402, + "step": 1444 + }, + { + "epoch": 0.008593824341041013, + "grad_norm": 2.042592763900757, + "learning_rate": 4.999090180895927e-05, + "loss": 6.6114, + "step": 1445 + }, + { + "epoch": 0.00859977162432201, + "grad_norm": 2.3100671768188477, + "learning_rate": 4.9990889203988986e-05, + "loss": 6.712, + "step": 1446 + }, + { + "epoch": 0.008605718907603007, + "grad_norm": 2.7600841522216797, + "learning_rate": 4.999087659029465e-05, + "loss": 6.6531, + "step": 1447 + }, + { + "epoch": 0.008611666190884004, + "grad_norm": 3.292684316635132, + "learning_rate": 4.999086396787625e-05, + "loss": 6.9896, + "step": 1448 + }, + { + "epoch": 0.008617613474165002, + "grad_norm": 2.7579386234283447, + "learning_rate": 4.999085133673381e-05, + "loss": 7.1559, + "step": 1449 + }, + { + "epoch": 0.008623560757445999, + "grad_norm": 2.7898707389831543, + "learning_rate": 4.999083869686731e-05, + "loss": 6.9861, + "step": 1450 + }, + { + "epoch": 0.008629508040726996, + "grad_norm": 3.439809799194336, + "learning_rate": 4.999082604827677e-05, + "loss": 6.759, + "step": 1451 + }, + { + "epoch": 0.008635455324007993, + "grad_norm": 2.924859046936035, + "learning_rate": 4.999081339096219e-05, + "loss": 6.5438, + "step": 1452 + }, + { + "epoch": 0.008641402607288991, + "grad_norm": 3.363886594772339, + "learning_rate": 4.999080072492358e-05, + "loss": 7.0477, + "step": 1453 + }, + { + "epoch": 0.008647349890569988, + "grad_norm": 2.924988031387329, + "learning_rate": 4.999078805016093e-05, + "loss": 6.9228, + "step": 1454 + }, + { + "epoch": 0.008653297173850985, + "grad_norm": 3.2283847332000732, + "learning_rate": 4.999077536667426e-05, + "loss": 6.8763, + "step": 1455 + }, + { + "epoch": 0.008659244457131982, + "grad_norm": 2.635744094848633, + "learning_rate": 4.999076267446357e-05, + "loss": 6.6438, + "step": 1456 + }, + { + "epoch": 0.00866519174041298, + "grad_norm": 2.829801559448242, + "learning_rate": 4.9990749973528864e-05, + "loss": 6.9466, + "step": 1457 + }, + { + "epoch": 0.008671139023693977, + "grad_norm": 3.3631057739257812, + "learning_rate": 4.999073726387014e-05, + "loss": 7.2652, + "step": 1458 + }, + { + "epoch": 0.008677086306974974, + "grad_norm": 3.9970719814300537, + "learning_rate": 4.999072454548741e-05, + "loss": 7.053, + "step": 1459 + }, + { + "epoch": 0.00868303359025597, + "grad_norm": 3.322787046432495, + "learning_rate": 4.9990711818380674e-05, + "loss": 7.0272, + "step": 1460 + }, + { + "epoch": 0.008688980873536969, + "grad_norm": 2.7370798587799072, + "learning_rate": 4.999069908254995e-05, + "loss": 6.8545, + "step": 1461 + }, + { + "epoch": 0.008694928156817966, + "grad_norm": 2.845191240310669, + "learning_rate": 4.999068633799522e-05, + "loss": 6.9393, + "step": 1462 + }, + { + "epoch": 0.008700875440098963, + "grad_norm": 3.064960241317749, + "learning_rate": 4.99906735847165e-05, + "loss": 6.7734, + "step": 1463 + }, + { + "epoch": 0.00870682272337996, + "grad_norm": 7.113090515136719, + "learning_rate": 4.99906608227138e-05, + "loss": 7.0532, + "step": 1464 + }, + { + "epoch": 0.008712770006660958, + "grad_norm": 5.90821647644043, + "learning_rate": 4.999064805198711e-05, + "loss": 7.1494, + "step": 1465 + }, + { + "epoch": 0.008718717289941955, + "grad_norm": 3.9366238117218018, + "learning_rate": 4.9990635272536454e-05, + "loss": 7.623, + "step": 1466 + }, + { + "epoch": 0.008724664573222952, + "grad_norm": 3.1239330768585205, + "learning_rate": 4.9990622484361814e-05, + "loss": 7.4938, + "step": 1467 + }, + { + "epoch": 0.008730611856503949, + "grad_norm": 2.6688928604125977, + "learning_rate": 4.9990609687463216e-05, + "loss": 7.3445, + "step": 1468 + }, + { + "epoch": 0.008736559139784945, + "grad_norm": 3.047154664993286, + "learning_rate": 4.9990596881840646e-05, + "loss": 7.158, + "step": 1469 + }, + { + "epoch": 0.008742506423065944, + "grad_norm": 2.5230467319488525, + "learning_rate": 4.999058406749412e-05, + "loss": 7.1368, + "step": 1470 + }, + { + "epoch": 0.00874845370634694, + "grad_norm": 2.729705333709717, + "learning_rate": 4.999057124442364e-05, + "loss": 7.0144, + "step": 1471 + }, + { + "epoch": 0.008754400989627938, + "grad_norm": 2.5796756744384766, + "learning_rate": 4.999055841262921e-05, + "loss": 7.2157, + "step": 1472 + }, + { + "epoch": 0.008760348272908934, + "grad_norm": 3.458691358566284, + "learning_rate": 4.999054557211084e-05, + "loss": 6.7631, + "step": 1473 + }, + { + "epoch": 0.008766295556189933, + "grad_norm": 2.7262747287750244, + "learning_rate": 4.999053272286851e-05, + "loss": 6.9784, + "step": 1474 + }, + { + "epoch": 0.00877224283947093, + "grad_norm": 2.6003808975219727, + "learning_rate": 4.9990519864902267e-05, + "loss": 7.1369, + "step": 1475 + }, + { + "epoch": 0.008778190122751927, + "grad_norm": 3.4032137393951416, + "learning_rate": 4.999050699821207e-05, + "loss": 6.9569, + "step": 1476 + }, + { + "epoch": 0.008784137406032923, + "grad_norm": 4.099828243255615, + "learning_rate": 4.9990494122797957e-05, + "loss": 6.9977, + "step": 1477 + }, + { + "epoch": 0.008790084689313922, + "grad_norm": 3.1837944984436035, + "learning_rate": 4.999048123865992e-05, + "loss": 7.1331, + "step": 1478 + }, + { + "epoch": 0.008796031972594919, + "grad_norm": 2.618847131729126, + "learning_rate": 4.999046834579796e-05, + "loss": 7.0043, + "step": 1479 + }, + { + "epoch": 0.008801979255875916, + "grad_norm": 3.0132501125335693, + "learning_rate": 4.999045544421209e-05, + "loss": 6.7836, + "step": 1480 + }, + { + "epoch": 0.008807926539156912, + "grad_norm": 2.4608371257781982, + "learning_rate": 4.999044253390231e-05, + "loss": 7.0721, + "step": 1481 + }, + { + "epoch": 0.008813873822437911, + "grad_norm": 3.280649423599243, + "learning_rate": 4.999042961486863e-05, + "loss": 7.959, + "step": 1482 + }, + { + "epoch": 0.008819821105718908, + "grad_norm": 2.7038395404815674, + "learning_rate": 4.999041668711104e-05, + "loss": 7.1256, + "step": 1483 + }, + { + "epoch": 0.008825768388999905, + "grad_norm": 2.1451892852783203, + "learning_rate": 4.9990403750629556e-05, + "loss": 7.2219, + "step": 1484 + }, + { + "epoch": 0.008831715672280901, + "grad_norm": 2.3731601238250732, + "learning_rate": 4.999039080542418e-05, + "loss": 7.2023, + "step": 1485 + }, + { + "epoch": 0.0088376629555619, + "grad_norm": 2.444089651107788, + "learning_rate": 4.999037785149492e-05, + "loss": 7.0988, + "step": 1486 + }, + { + "epoch": 0.008843610238842897, + "grad_norm": 2.644712448120117, + "learning_rate": 4.999036488884177e-05, + "loss": 7.1916, + "step": 1487 + }, + { + "epoch": 0.008849557522123894, + "grad_norm": 5.477145671844482, + "learning_rate": 4.999035191746475e-05, + "loss": 6.7256, + "step": 1488 + }, + { + "epoch": 0.00885550480540489, + "grad_norm": 2.2691709995269775, + "learning_rate": 4.999033893736386e-05, + "loss": 7.2505, + "step": 1489 + }, + { + "epoch": 0.008861452088685889, + "grad_norm": 2.5880343914031982, + "learning_rate": 4.999032594853909e-05, + "loss": 6.9549, + "step": 1490 + }, + { + "epoch": 0.008867399371966886, + "grad_norm": 2.2748520374298096, + "learning_rate": 4.999031295099046e-05, + "loss": 6.8269, + "step": 1491 + }, + { + "epoch": 0.008873346655247883, + "grad_norm": 2.262706995010376, + "learning_rate": 4.999029994471797e-05, + "loss": 6.8876, + "step": 1492 + }, + { + "epoch": 0.00887929393852888, + "grad_norm": 2.264256238937378, + "learning_rate": 4.999028692972162e-05, + "loss": 7.1545, + "step": 1493 + }, + { + "epoch": 0.008885241221809878, + "grad_norm": 2.489259719848633, + "learning_rate": 4.9990273906001424e-05, + "loss": 7.194, + "step": 1494 + }, + { + "epoch": 0.008891188505090875, + "grad_norm": 2.7545981407165527, + "learning_rate": 4.999026087355738e-05, + "loss": 7.0148, + "step": 1495 + }, + { + "epoch": 0.008897135788371872, + "grad_norm": 2.6869328022003174, + "learning_rate": 4.999024783238949e-05, + "loss": 7.2535, + "step": 1496 + }, + { + "epoch": 0.008903083071652869, + "grad_norm": 2.5216503143310547, + "learning_rate": 4.999023478249777e-05, + "loss": 6.4351, + "step": 1497 + }, + { + "epoch": 0.008909030354933865, + "grad_norm": 2.5090575218200684, + "learning_rate": 4.9990221723882216e-05, + "loss": 7.3068, + "step": 1498 + }, + { + "epoch": 0.008914977638214864, + "grad_norm": 2.5026490688323975, + "learning_rate": 4.999020865654283e-05, + "loss": 7.1274, + "step": 1499 + }, + { + "epoch": 0.00892092492149586, + "grad_norm": 2.8030898571014404, + "learning_rate": 4.999019558047963e-05, + "loss": 7.0016, + "step": 1500 + }, + { + "epoch": 0.008926872204776858, + "grad_norm": 2.533383846282959, + "learning_rate": 4.99901824956926e-05, + "loss": 6.8991, + "step": 1501 + }, + { + "epoch": 0.008932819488057854, + "grad_norm": 2.5584118366241455, + "learning_rate": 4.999016940218175e-05, + "loss": 6.9237, + "step": 1502 + }, + { + "epoch": 0.008938766771338853, + "grad_norm": 2.778592586517334, + "learning_rate": 4.99901562999471e-05, + "loss": 7.0941, + "step": 1503 + }, + { + "epoch": 0.00894471405461985, + "grad_norm": 4.023860931396484, + "learning_rate": 4.999014318898865e-05, + "loss": 6.5188, + "step": 1504 + }, + { + "epoch": 0.008950661337900847, + "grad_norm": 3.018118143081665, + "learning_rate": 4.999013006930639e-05, + "loss": 7.0557, + "step": 1505 + }, + { + "epoch": 0.008956608621181843, + "grad_norm": 2.802061080932617, + "learning_rate": 4.999011694090033e-05, + "loss": 7.2645, + "step": 1506 + }, + { + "epoch": 0.008962555904462842, + "grad_norm": 2.3782076835632324, + "learning_rate": 4.999010380377049e-05, + "loss": 7.3707, + "step": 1507 + }, + { + "epoch": 0.008968503187743839, + "grad_norm": 2.451878309249878, + "learning_rate": 4.999009065791686e-05, + "loss": 7.2783, + "step": 1508 + }, + { + "epoch": 0.008974450471024836, + "grad_norm": 3.85514235496521, + "learning_rate": 4.999007750333945e-05, + "loss": 6.3543, + "step": 1509 + }, + { + "epoch": 0.008980397754305832, + "grad_norm": 2.617177963256836, + "learning_rate": 4.999006434003825e-05, + "loss": 7.0175, + "step": 1510 + }, + { + "epoch": 0.008986345037586831, + "grad_norm": 2.6909587383270264, + "learning_rate": 4.999005116801329e-05, + "loss": 7.3282, + "step": 1511 + }, + { + "epoch": 0.008992292320867828, + "grad_norm": 2.332165241241455, + "learning_rate": 4.9990037987264546e-05, + "loss": 7.0993, + "step": 1512 + }, + { + "epoch": 0.008998239604148825, + "grad_norm": 2.5398497581481934, + "learning_rate": 4.9990024797792055e-05, + "loss": 7.2867, + "step": 1513 + }, + { + "epoch": 0.009004186887429821, + "grad_norm": 2.432264566421509, + "learning_rate": 4.9990011599595796e-05, + "loss": 7.1619, + "step": 1514 + }, + { + "epoch": 0.00901013417071082, + "grad_norm": 2.2937278747558594, + "learning_rate": 4.998999839267578e-05, + "loss": 7.1138, + "step": 1515 + }, + { + "epoch": 0.009016081453991817, + "grad_norm": 2.3305680751800537, + "learning_rate": 4.998998517703202e-05, + "loss": 7.0569, + "step": 1516 + }, + { + "epoch": 0.009022028737272814, + "grad_norm": 3.0785884857177734, + "learning_rate": 4.998997195266451e-05, + "loss": 7.0922, + "step": 1517 + }, + { + "epoch": 0.00902797602055381, + "grad_norm": 2.354283571243286, + "learning_rate": 4.998995871957326e-05, + "loss": 7.0024, + "step": 1518 + }, + { + "epoch": 0.009033923303834809, + "grad_norm": 2.488194465637207, + "learning_rate": 4.998994547775827e-05, + "loss": 7.0045, + "step": 1519 + }, + { + "epoch": 0.009039870587115806, + "grad_norm": 2.6196579933166504, + "learning_rate": 4.998993222721956e-05, + "loss": 6.9416, + "step": 1520 + }, + { + "epoch": 0.009045817870396803, + "grad_norm": 2.6524155139923096, + "learning_rate": 4.998991896795711e-05, + "loss": 6.9562, + "step": 1521 + }, + { + "epoch": 0.0090517651536778, + "grad_norm": 3.308661460876465, + "learning_rate": 4.998990569997094e-05, + "loss": 6.8602, + "step": 1522 + }, + { + "epoch": 0.009057712436958798, + "grad_norm": 2.7995994091033936, + "learning_rate": 4.9989892423261055e-05, + "loss": 7.7049, + "step": 1523 + }, + { + "epoch": 0.009063659720239795, + "grad_norm": 2.547189235687256, + "learning_rate": 4.9989879137827456e-05, + "loss": 7.0254, + "step": 1524 + }, + { + "epoch": 0.009069607003520792, + "grad_norm": 2.796393871307373, + "learning_rate": 4.998986584367015e-05, + "loss": 7.0124, + "step": 1525 + }, + { + "epoch": 0.009075554286801788, + "grad_norm": 2.9441823959350586, + "learning_rate": 4.9989852540789136e-05, + "loss": 7.0174, + "step": 1526 + }, + { + "epoch": 0.009081501570082787, + "grad_norm": 2.509150743484497, + "learning_rate": 4.998983922918443e-05, + "loss": 6.9405, + "step": 1527 + }, + { + "epoch": 0.009087448853363784, + "grad_norm": 2.3686184883117676, + "learning_rate": 4.998982590885603e-05, + "loss": 6.794, + "step": 1528 + }, + { + "epoch": 0.00909339613664478, + "grad_norm": 2.937530755996704, + "learning_rate": 4.998981257980393e-05, + "loss": 6.9716, + "step": 1529 + }, + { + "epoch": 0.009099343419925777, + "grad_norm": 2.493178606033325, + "learning_rate": 4.998979924202814e-05, + "loss": 6.5986, + "step": 1530 + }, + { + "epoch": 0.009105290703206774, + "grad_norm": 2.071356773376465, + "learning_rate": 4.9989785895528686e-05, + "loss": 6.536, + "step": 1531 + }, + { + "epoch": 0.009111237986487773, + "grad_norm": 1.9372920989990234, + "learning_rate": 4.998977254030554e-05, + "loss": 6.4036, + "step": 1532 + }, + { + "epoch": 0.00911718526976877, + "grad_norm": 2.3329098224639893, + "learning_rate": 4.998975917635873e-05, + "loss": 6.4861, + "step": 1533 + }, + { + "epoch": 0.009123132553049767, + "grad_norm": 2.9681191444396973, + "learning_rate": 4.998974580368826e-05, + "loss": 6.939, + "step": 1534 + }, + { + "epoch": 0.009129079836330763, + "grad_norm": 2.5993690490722656, + "learning_rate": 4.9989732422294125e-05, + "loss": 7.0809, + "step": 1535 + }, + { + "epoch": 0.009135027119611762, + "grad_norm": 2.827244997024536, + "learning_rate": 4.998971903217633e-05, + "loss": 7.597, + "step": 1536 + }, + { + "epoch": 0.009140974402892759, + "grad_norm": 2.712247848510742, + "learning_rate": 4.9989705633334884e-05, + "loss": 7.3695, + "step": 1537 + }, + { + "epoch": 0.009146921686173756, + "grad_norm": 1.7997468709945679, + "learning_rate": 4.998969222576978e-05, + "loss": 7.6497, + "step": 1538 + }, + { + "epoch": 0.009152868969454752, + "grad_norm": 2.234931230545044, + "learning_rate": 4.998967880948104e-05, + "loss": 7.1636, + "step": 1539 + }, + { + "epoch": 0.009158816252735751, + "grad_norm": 2.150766611099243, + "learning_rate": 4.9989665384468666e-05, + "loss": 6.8621, + "step": 1540 + }, + { + "epoch": 0.009164763536016748, + "grad_norm": 2.9628021717071533, + "learning_rate": 4.998965195073265e-05, + "loss": 6.5059, + "step": 1541 + }, + { + "epoch": 0.009170710819297745, + "grad_norm": 2.720155715942383, + "learning_rate": 4.998963850827301e-05, + "loss": 7.0129, + "step": 1542 + }, + { + "epoch": 0.009176658102578741, + "grad_norm": 2.994684934616089, + "learning_rate": 4.9989625057089744e-05, + "loss": 7.3621, + "step": 1543 + }, + { + "epoch": 0.00918260538585974, + "grad_norm": 2.5991618633270264, + "learning_rate": 4.998961159718286e-05, + "loss": 6.7278, + "step": 1544 + }, + { + "epoch": 0.009188552669140737, + "grad_norm": 2.406353712081909, + "learning_rate": 4.9989598128552355e-05, + "loss": 7.5987, + "step": 1545 + }, + { + "epoch": 0.009194499952421734, + "grad_norm": 3.1308467388153076, + "learning_rate": 4.998958465119824e-05, + "loss": 7.1947, + "step": 1546 + }, + { + "epoch": 0.00920044723570273, + "grad_norm": 2.5381908416748047, + "learning_rate": 4.998957116512053e-05, + "loss": 6.8415, + "step": 1547 + }, + { + "epoch": 0.009206394518983729, + "grad_norm": 2.666410446166992, + "learning_rate": 4.998955767031921e-05, + "loss": 6.9052, + "step": 1548 + }, + { + "epoch": 0.009212341802264726, + "grad_norm": 2.156036138534546, + "learning_rate": 4.9989544166794286e-05, + "loss": 7.6604, + "step": 1549 + }, + { + "epoch": 0.009218289085545723, + "grad_norm": 2.620114803314209, + "learning_rate": 4.998953065454578e-05, + "loss": 6.5475, + "step": 1550 + }, + { + "epoch": 0.00922423636882672, + "grad_norm": 3.2780802249908447, + "learning_rate": 4.9989517133573694e-05, + "loss": 7.0572, + "step": 1551 + }, + { + "epoch": 0.009230183652107718, + "grad_norm": 3.6108100414276123, + "learning_rate": 4.998950360387802e-05, + "loss": 7.0149, + "step": 1552 + }, + { + "epoch": 0.009236130935388715, + "grad_norm": 3.4336259365081787, + "learning_rate": 4.998949006545876e-05, + "loss": 7.2436, + "step": 1553 + }, + { + "epoch": 0.009242078218669712, + "grad_norm": 3.271630048751831, + "learning_rate": 4.9989476518315934e-05, + "loss": 7.3807, + "step": 1554 + }, + { + "epoch": 0.009248025501950708, + "grad_norm": 3.0718438625335693, + "learning_rate": 4.998946296244954e-05, + "loss": 7.2313, + "step": 1555 + }, + { + "epoch": 0.009253972785231707, + "grad_norm": 2.2010579109191895, + "learning_rate": 4.9989449397859575e-05, + "loss": 7.4269, + "step": 1556 + }, + { + "epoch": 0.009259920068512704, + "grad_norm": 2.9805495738983154, + "learning_rate": 4.998943582454607e-05, + "loss": 7.2107, + "step": 1557 + }, + { + "epoch": 0.0092658673517937, + "grad_norm": 2.8313159942626953, + "learning_rate": 4.9989422242508995e-05, + "loss": 7.0453, + "step": 1558 + }, + { + "epoch": 0.009271814635074697, + "grad_norm": 2.7660701274871826, + "learning_rate": 4.998940865174837e-05, + "loss": 7.2205, + "step": 1559 + }, + { + "epoch": 0.009277761918355694, + "grad_norm": 3.808122396469116, + "learning_rate": 4.998939505226421e-05, + "loss": 6.9966, + "step": 1560 + }, + { + "epoch": 0.009283709201636693, + "grad_norm": 3.188976526260376, + "learning_rate": 4.99893814440565e-05, + "loss": 7.0049, + "step": 1561 + }, + { + "epoch": 0.00928965648491769, + "grad_norm": 2.5491533279418945, + "learning_rate": 4.998936782712526e-05, + "loss": 7.0451, + "step": 1562 + }, + { + "epoch": 0.009295603768198686, + "grad_norm": 3.4607698917388916, + "learning_rate": 4.99893542014705e-05, + "loss": 7.0304, + "step": 1563 + }, + { + "epoch": 0.009301551051479683, + "grad_norm": 3.4761910438537598, + "learning_rate": 4.99893405670922e-05, + "loss": 6.9787, + "step": 1564 + }, + { + "epoch": 0.009307498334760682, + "grad_norm": 3.15938138961792, + "learning_rate": 4.998932692399039e-05, + "loss": 7.0203, + "step": 1565 + }, + { + "epoch": 0.009313445618041679, + "grad_norm": 2.600304126739502, + "learning_rate": 4.9989313272165064e-05, + "loss": 7.0782, + "step": 1566 + }, + { + "epoch": 0.009319392901322675, + "grad_norm": 2.54158616065979, + "learning_rate": 4.9989299611616216e-05, + "loss": 6.8354, + "step": 1567 + }, + { + "epoch": 0.009325340184603672, + "grad_norm": 3.4649429321289062, + "learning_rate": 4.9989285942343864e-05, + "loss": 6.8238, + "step": 1568 + }, + { + "epoch": 0.00933128746788467, + "grad_norm": 2.522388458251953, + "learning_rate": 4.998927226434802e-05, + "loss": 6.9544, + "step": 1569 + }, + { + "epoch": 0.009337234751165668, + "grad_norm": 4.074129581451416, + "learning_rate": 4.9989258577628675e-05, + "loss": 6.7229, + "step": 1570 + }, + { + "epoch": 0.009343182034446664, + "grad_norm": 3.395894765853882, + "learning_rate": 4.998924488218584e-05, + "loss": 7.1372, + "step": 1571 + }, + { + "epoch": 0.009349129317727661, + "grad_norm": 2.9850378036499023, + "learning_rate": 4.9989231178019516e-05, + "loss": 6.8966, + "step": 1572 + }, + { + "epoch": 0.00935507660100866, + "grad_norm": 3.1391544342041016, + "learning_rate": 4.9989217465129704e-05, + "loss": 6.6744, + "step": 1573 + }, + { + "epoch": 0.009361023884289657, + "grad_norm": 3.8727803230285645, + "learning_rate": 4.9989203743516414e-05, + "loss": 6.9359, + "step": 1574 + }, + { + "epoch": 0.009366971167570654, + "grad_norm": 3.466169595718384, + "learning_rate": 4.998919001317966e-05, + "loss": 6.979, + "step": 1575 + }, + { + "epoch": 0.00937291845085165, + "grad_norm": 3.3481826782226562, + "learning_rate": 4.998917627411943e-05, + "loss": 6.7749, + "step": 1576 + }, + { + "epoch": 0.009378865734132649, + "grad_norm": 2.425971031188965, + "learning_rate": 4.9989162526335745e-05, + "loss": 7.0127, + "step": 1577 + }, + { + "epoch": 0.009384813017413646, + "grad_norm": 2.8379313945770264, + "learning_rate": 4.9989148769828595e-05, + "loss": 6.5782, + "step": 1578 + }, + { + "epoch": 0.009390760300694643, + "grad_norm": 3.0456466674804688, + "learning_rate": 4.9989135004597994e-05, + "loss": 6.9832, + "step": 1579 + }, + { + "epoch": 0.00939670758397564, + "grad_norm": 2.690138101577759, + "learning_rate": 4.9989121230643944e-05, + "loss": 7.0079, + "step": 1580 + }, + { + "epoch": 0.009402654867256638, + "grad_norm": 3.683105945587158, + "learning_rate": 4.9989107447966444e-05, + "loss": 7.2734, + "step": 1581 + }, + { + "epoch": 0.009408602150537635, + "grad_norm": 2.3310985565185547, + "learning_rate": 4.9989093656565513e-05, + "loss": 7.2388, + "step": 1582 + }, + { + "epoch": 0.009414549433818632, + "grad_norm": 2.353322982788086, + "learning_rate": 4.998907985644115e-05, + "loss": 7.0612, + "step": 1583 + }, + { + "epoch": 0.009420496717099628, + "grad_norm": 2.8458571434020996, + "learning_rate": 4.9989066047593344e-05, + "loss": 7.3093, + "step": 1584 + }, + { + "epoch": 0.009426444000380627, + "grad_norm": 2.3322811126708984, + "learning_rate": 4.9989052230022125e-05, + "loss": 6.983, + "step": 1585 + }, + { + "epoch": 0.009432391283661624, + "grad_norm": 2.7431764602661133, + "learning_rate": 4.998903840372748e-05, + "loss": 6.9694, + "step": 1586 + }, + { + "epoch": 0.00943833856694262, + "grad_norm": 2.7704508304595947, + "learning_rate": 4.998902456870942e-05, + "loss": 6.7727, + "step": 1587 + }, + { + "epoch": 0.009444285850223617, + "grad_norm": 2.4920814037323, + "learning_rate": 4.998901072496796e-05, + "loss": 7.0612, + "step": 1588 + }, + { + "epoch": 0.009450233133504616, + "grad_norm": 2.5911498069763184, + "learning_rate": 4.998899687250308e-05, + "loss": 6.8774, + "step": 1589 + }, + { + "epoch": 0.009456180416785613, + "grad_norm": 2.7269680500030518, + "learning_rate": 4.998898301131481e-05, + "loss": 7.0782, + "step": 1590 + }, + { + "epoch": 0.00946212770006661, + "grad_norm": 2.9707436561584473, + "learning_rate": 4.998896914140314e-05, + "loss": 7.307, + "step": 1591 + }, + { + "epoch": 0.009468074983347606, + "grad_norm": 3.064683675765991, + "learning_rate": 4.998895526276808e-05, + "loss": 7.3708, + "step": 1592 + }, + { + "epoch": 0.009474022266628603, + "grad_norm": 2.4465317726135254, + "learning_rate": 4.998894137540963e-05, + "loss": 7.0085, + "step": 1593 + }, + { + "epoch": 0.009479969549909602, + "grad_norm": 3.3061211109161377, + "learning_rate": 4.99889274793278e-05, + "loss": 6.8353, + "step": 1594 + }, + { + "epoch": 0.009485916833190599, + "grad_norm": 3.283397912979126, + "learning_rate": 4.9988913574522594e-05, + "loss": 6.6848, + "step": 1595 + }, + { + "epoch": 0.009491864116471595, + "grad_norm": 2.770745277404785, + "learning_rate": 4.9988899660994014e-05, + "loss": 7.1742, + "step": 1596 + }, + { + "epoch": 0.009497811399752592, + "grad_norm": 2.7975432872772217, + "learning_rate": 4.998888573874207e-05, + "loss": 6.7329, + "step": 1597 + }, + { + "epoch": 0.00950375868303359, + "grad_norm": 2.545919418334961, + "learning_rate": 4.998887180776677e-05, + "loss": 6.7203, + "step": 1598 + }, + { + "epoch": 0.009509705966314588, + "grad_norm": 2.7961528301239014, + "learning_rate": 4.99888578680681e-05, + "loss": 7.384, + "step": 1599 + }, + { + "epoch": 0.009515653249595584, + "grad_norm": 2.570570230484009, + "learning_rate": 4.9988843919646096e-05, + "loss": 7.0246, + "step": 1600 + }, + { + "epoch": 0.009521600532876581, + "grad_norm": 2.5365843772888184, + "learning_rate": 4.9988829962500734e-05, + "loss": 6.8801, + "step": 1601 + }, + { + "epoch": 0.00952754781615758, + "grad_norm": 2.4713737964630127, + "learning_rate": 4.998881599663203e-05, + "loss": 7.1974, + "step": 1602 + }, + { + "epoch": 0.009533495099438577, + "grad_norm": 2.5286331176757812, + "learning_rate": 4.998880202203999e-05, + "loss": 7.26, + "step": 1603 + }, + { + "epoch": 0.009539442382719573, + "grad_norm": 2.2333719730377197, + "learning_rate": 4.998878803872461e-05, + "loss": 7.3254, + "step": 1604 + }, + { + "epoch": 0.00954538966600057, + "grad_norm": 2.544095277786255, + "learning_rate": 4.9988774046685915e-05, + "loss": 7.407, + "step": 1605 + }, + { + "epoch": 0.009551336949281569, + "grad_norm": 3.057140588760376, + "learning_rate": 4.9988760045923886e-05, + "loss": 6.5303, + "step": 1606 + }, + { + "epoch": 0.009557284232562566, + "grad_norm": 3.0190670490264893, + "learning_rate": 4.998874603643854e-05, + "loss": 6.3276, + "step": 1607 + }, + { + "epoch": 0.009563231515843562, + "grad_norm": 2.208249568939209, + "learning_rate": 4.998873201822989e-05, + "loss": 6.856, + "step": 1608 + }, + { + "epoch": 0.00956917879912456, + "grad_norm": 2.3519229888916016, + "learning_rate": 4.998871799129793e-05, + "loss": 6.9854, + "step": 1609 + }, + { + "epoch": 0.009575126082405558, + "grad_norm": 2.604816198348999, + "learning_rate": 4.9988703955642655e-05, + "loss": 7.3127, + "step": 1610 + }, + { + "epoch": 0.009581073365686555, + "grad_norm": 2.320030927658081, + "learning_rate": 4.9988689911264094e-05, + "loss": 7.216, + "step": 1611 + }, + { + "epoch": 0.009587020648967551, + "grad_norm": 2.8475282192230225, + "learning_rate": 4.998867585816224e-05, + "loss": 6.6743, + "step": 1612 + }, + { + "epoch": 0.009592967932248548, + "grad_norm": 2.518707036972046, + "learning_rate": 4.998866179633709e-05, + "loss": 7.0257, + "step": 1613 + }, + { + "epoch": 0.009598915215529547, + "grad_norm": 2.7348618507385254, + "learning_rate": 4.998864772578866e-05, + "loss": 7.1933, + "step": 1614 + }, + { + "epoch": 0.009604862498810544, + "grad_norm": 2.5701184272766113, + "learning_rate": 4.9988633646516946e-05, + "loss": 7.1071, + "step": 1615 + }, + { + "epoch": 0.00961080978209154, + "grad_norm": 2.916544198989868, + "learning_rate": 4.998861955852197e-05, + "loss": 7.1331, + "step": 1616 + }, + { + "epoch": 0.009616757065372537, + "grad_norm": 2.390934944152832, + "learning_rate": 4.998860546180371e-05, + "loss": 7.3252, + "step": 1617 + }, + { + "epoch": 0.009622704348653536, + "grad_norm": 2.6720097064971924, + "learning_rate": 4.998859135636219e-05, + "loss": 7.0105, + "step": 1618 + }, + { + "epoch": 0.009628651631934533, + "grad_norm": 2.3859329223632812, + "learning_rate": 4.998857724219742e-05, + "loss": 7.023, + "step": 1619 + }, + { + "epoch": 0.00963459891521553, + "grad_norm": 2.9713187217712402, + "learning_rate": 4.998856311930939e-05, + "loss": 7.0338, + "step": 1620 + }, + { + "epoch": 0.009640546198496526, + "grad_norm": 2.33858060836792, + "learning_rate": 4.998854898769811e-05, + "loss": 7.0103, + "step": 1621 + }, + { + "epoch": 0.009646493481777523, + "grad_norm": 2.8897042274475098, + "learning_rate": 4.9988534847363585e-05, + "loss": 7.1225, + "step": 1622 + }, + { + "epoch": 0.009652440765058522, + "grad_norm": 2.354513645172119, + "learning_rate": 4.9988520698305826e-05, + "loss": 6.9272, + "step": 1623 + }, + { + "epoch": 0.009658388048339519, + "grad_norm": 2.5571863651275635, + "learning_rate": 4.9988506540524826e-05, + "loss": 6.3418, + "step": 1624 + }, + { + "epoch": 0.009664335331620515, + "grad_norm": 2.342381238937378, + "learning_rate": 4.99884923740206e-05, + "loss": 6.4265, + "step": 1625 + }, + { + "epoch": 0.009670282614901512, + "grad_norm": 2.5594370365142822, + "learning_rate": 4.998847819879315e-05, + "loss": 6.9801, + "step": 1626 + }, + { + "epoch": 0.00967622989818251, + "grad_norm": 3.6932148933410645, + "learning_rate": 4.9988464014842476e-05, + "loss": 7.0231, + "step": 1627 + }, + { + "epoch": 0.009682177181463508, + "grad_norm": 2.713508367538452, + "learning_rate": 4.998844982216859e-05, + "loss": 6.9041, + "step": 1628 + }, + { + "epoch": 0.009688124464744504, + "grad_norm": 2.703103542327881, + "learning_rate": 4.99884356207715e-05, + "loss": 6.9272, + "step": 1629 + }, + { + "epoch": 0.009694071748025501, + "grad_norm": 3.228708267211914, + "learning_rate": 4.9988421410651197e-05, + "loss": 6.9242, + "step": 1630 + }, + { + "epoch": 0.0097000190313065, + "grad_norm": 3.3407063484191895, + "learning_rate": 4.9988407191807694e-05, + "loss": 6.8871, + "step": 1631 + }, + { + "epoch": 0.009705966314587497, + "grad_norm": 2.3833165168762207, + "learning_rate": 4.9988392964241005e-05, + "loss": 6.9667, + "step": 1632 + }, + { + "epoch": 0.009711913597868493, + "grad_norm": 3.607023239135742, + "learning_rate": 4.9988378727951123e-05, + "loss": 6.93, + "step": 1633 + }, + { + "epoch": 0.00971786088114949, + "grad_norm": 3.797107219696045, + "learning_rate": 4.9988364482938056e-05, + "loss": 6.8115, + "step": 1634 + }, + { + "epoch": 0.009723808164430489, + "grad_norm": 2.5586941242218018, + "learning_rate": 4.998835022920181e-05, + "loss": 6.7322, + "step": 1635 + }, + { + "epoch": 0.009729755447711486, + "grad_norm": 2.377680540084839, + "learning_rate": 4.9988335966742385e-05, + "loss": 6.7127, + "step": 1636 + }, + { + "epoch": 0.009735702730992482, + "grad_norm": 2.510584592819214, + "learning_rate": 4.998832169555979e-05, + "loss": 6.836, + "step": 1637 + }, + { + "epoch": 0.00974165001427348, + "grad_norm": 2.8817014694213867, + "learning_rate": 4.9988307415654025e-05, + "loss": 6.7812, + "step": 1638 + }, + { + "epoch": 0.009747597297554478, + "grad_norm": 2.878535509109497, + "learning_rate": 4.998829312702511e-05, + "loss": 6.7852, + "step": 1639 + }, + { + "epoch": 0.009753544580835475, + "grad_norm": 2.5870323181152344, + "learning_rate": 4.998827882967304e-05, + "loss": 6.8569, + "step": 1640 + }, + { + "epoch": 0.009759491864116471, + "grad_norm": 2.7275760173797607, + "learning_rate": 4.998826452359782e-05, + "loss": 6.8304, + "step": 1641 + }, + { + "epoch": 0.009765439147397468, + "grad_norm": 2.24550461769104, + "learning_rate": 4.998825020879945e-05, + "loss": 6.7609, + "step": 1642 + }, + { + "epoch": 0.009771386430678467, + "grad_norm": 2.2101621627807617, + "learning_rate": 4.9988235885277934e-05, + "loss": 6.7548, + "step": 1643 + }, + { + "epoch": 0.009777333713959464, + "grad_norm": 2.289870023727417, + "learning_rate": 4.9988221553033294e-05, + "loss": 6.8899, + "step": 1644 + }, + { + "epoch": 0.00978328099724046, + "grad_norm": 2.6337740421295166, + "learning_rate": 4.9988207212065516e-05, + "loss": 6.7605, + "step": 1645 + }, + { + "epoch": 0.009789228280521457, + "grad_norm": 2.442605972290039, + "learning_rate": 4.998819286237462e-05, + "loss": 6.6299, + "step": 1646 + }, + { + "epoch": 0.009795175563802456, + "grad_norm": 2.6570451259613037, + "learning_rate": 4.9988178503960606e-05, + "loss": 6.6933, + "step": 1647 + }, + { + "epoch": 0.009801122847083453, + "grad_norm": 2.597043752670288, + "learning_rate": 4.9988164136823467e-05, + "loss": 6.7667, + "step": 1648 + }, + { + "epoch": 0.00980707013036445, + "grad_norm": 3.2576608657836914, + "learning_rate": 4.998814976096323e-05, + "loss": 7.1774, + "step": 1649 + }, + { + "epoch": 0.009813017413645446, + "grad_norm": 3.110119342803955, + "learning_rate": 4.998813537637988e-05, + "loss": 7.2139, + "step": 1650 + }, + { + "epoch": 0.009818964696926445, + "grad_norm": 3.038086414337158, + "learning_rate": 4.998812098307343e-05, + "loss": 7.2752, + "step": 1651 + }, + { + "epoch": 0.009824911980207442, + "grad_norm": 2.965916872024536, + "learning_rate": 4.998810658104389e-05, + "loss": 7.1151, + "step": 1652 + }, + { + "epoch": 0.009830859263488438, + "grad_norm": 3.011476755142212, + "learning_rate": 4.998809217029126e-05, + "loss": 7.1335, + "step": 1653 + }, + { + "epoch": 0.009836806546769435, + "grad_norm": 3.8196349143981934, + "learning_rate": 4.9988077750815534e-05, + "loss": 7.0865, + "step": 1654 + }, + { + "epoch": 0.009842753830050432, + "grad_norm": 3.2577872276306152, + "learning_rate": 4.998806332261674e-05, + "loss": 7.4285, + "step": 1655 + }, + { + "epoch": 0.00984870111333143, + "grad_norm": 2.847039222717285, + "learning_rate": 4.998804888569487e-05, + "loss": 7.3251, + "step": 1656 + }, + { + "epoch": 0.009854648396612428, + "grad_norm": 3.4066355228424072, + "learning_rate": 4.998803444004992e-05, + "loss": 7.3137, + "step": 1657 + }, + { + "epoch": 0.009860595679893424, + "grad_norm": 3.6774044036865234, + "learning_rate": 4.998801998568192e-05, + "loss": 7.0772, + "step": 1658 + }, + { + "epoch": 0.009866542963174421, + "grad_norm": 3.1404600143432617, + "learning_rate": 4.998800552259085e-05, + "loss": 7.1143, + "step": 1659 + }, + { + "epoch": 0.00987249024645542, + "grad_norm": 3.6337625980377197, + "learning_rate": 4.998799105077674e-05, + "loss": 7.1296, + "step": 1660 + }, + { + "epoch": 0.009878437529736417, + "grad_norm": 4.551114082336426, + "learning_rate": 4.9987976570239566e-05, + "loss": 7.1343, + "step": 1661 + }, + { + "epoch": 0.009884384813017413, + "grad_norm": 3.2305374145507812, + "learning_rate": 4.998796208097935e-05, + "loss": 7.0852, + "step": 1662 + }, + { + "epoch": 0.00989033209629841, + "grad_norm": 2.5174615383148193, + "learning_rate": 4.99879475829961e-05, + "loss": 7.2315, + "step": 1663 + }, + { + "epoch": 0.009896279379579409, + "grad_norm": 3.623525381088257, + "learning_rate": 4.9987933076289804e-05, + "loss": 7.4222, + "step": 1664 + }, + { + "epoch": 0.009902226662860406, + "grad_norm": 4.217465877532959, + "learning_rate": 4.998791856086049e-05, + "loss": 7.4003, + "step": 1665 + }, + { + "epoch": 0.009908173946141402, + "grad_norm": 2.42301344871521, + "learning_rate": 4.998790403670815e-05, + "loss": 7.3295, + "step": 1666 + }, + { + "epoch": 0.0099141212294224, + "grad_norm": 2.3003029823303223, + "learning_rate": 4.998788950383279e-05, + "loss": 7.2072, + "step": 1667 + }, + { + "epoch": 0.009920068512703398, + "grad_norm": 3.3792307376861572, + "learning_rate": 4.9987874962234414e-05, + "loss": 7.2882, + "step": 1668 + }, + { + "epoch": 0.009926015795984395, + "grad_norm": 3.42130184173584, + "learning_rate": 4.998786041191303e-05, + "loss": 7.1231, + "step": 1669 + }, + { + "epoch": 0.009931963079265391, + "grad_norm": 3.496676445007324, + "learning_rate": 4.9987845852868644e-05, + "loss": 7.2535, + "step": 1670 + }, + { + "epoch": 0.009937910362546388, + "grad_norm": 2.695780038833618, + "learning_rate": 4.9987831285101255e-05, + "loss": 7.3784, + "step": 1671 + }, + { + "epoch": 0.009943857645827387, + "grad_norm": 2.2745561599731445, + "learning_rate": 4.998781670861088e-05, + "loss": 7.1184, + "step": 1672 + }, + { + "epoch": 0.009949804929108384, + "grad_norm": 3.8487844467163086, + "learning_rate": 4.99878021233975e-05, + "loss": 7.277, + "step": 1673 + }, + { + "epoch": 0.00995575221238938, + "grad_norm": 2.6628305912017822, + "learning_rate": 4.998778752946115e-05, + "loss": 6.8204, + "step": 1674 + }, + { + "epoch": 0.009961699495670377, + "grad_norm": 3.6330301761627197, + "learning_rate": 4.998777292680182e-05, + "loss": 7.3003, + "step": 1675 + }, + { + "epoch": 0.009967646778951376, + "grad_norm": 2.644237995147705, + "learning_rate": 4.998775831541952e-05, + "loss": 7.1492, + "step": 1676 + }, + { + "epoch": 0.009973594062232373, + "grad_norm": 2.895193099975586, + "learning_rate": 4.998774369531424e-05, + "loss": 7.3986, + "step": 1677 + }, + { + "epoch": 0.00997954134551337, + "grad_norm": 3.2180328369140625, + "learning_rate": 4.998772906648601e-05, + "loss": 7.1085, + "step": 1678 + }, + { + "epoch": 0.009985488628794366, + "grad_norm": 3.5874838829040527, + "learning_rate": 4.9987714428934815e-05, + "loss": 6.9554, + "step": 1679 + }, + { + "epoch": 0.009991435912075365, + "grad_norm": 2.419516086578369, + "learning_rate": 4.9987699782660666e-05, + "loss": 6.6222, + "step": 1680 + }, + { + "epoch": 0.009997383195356362, + "grad_norm": 2.715808153152466, + "learning_rate": 4.9987685127663574e-05, + "loss": 6.8417, + "step": 1681 + }, + { + "epoch": 0.010003330478637358, + "grad_norm": 2.2847111225128174, + "learning_rate": 4.9987670463943534e-05, + "loss": 7.1649, + "step": 1682 + }, + { + "epoch": 0.010009277761918355, + "grad_norm": 2.402684450149536, + "learning_rate": 4.998765579150056e-05, + "loss": 7.6113, + "step": 1683 + }, + { + "epoch": 0.010015225045199352, + "grad_norm": 2.54388689994812, + "learning_rate": 4.998764111033465e-05, + "loss": 7.1261, + "step": 1684 + }, + { + "epoch": 0.01002117232848035, + "grad_norm": 2.8077542781829834, + "learning_rate": 4.9987626420445823e-05, + "loss": 7.1349, + "step": 1685 + }, + { + "epoch": 0.010027119611761347, + "grad_norm": 2.228707790374756, + "learning_rate": 4.9987611721834063e-05, + "loss": 7.1123, + "step": 1686 + }, + { + "epoch": 0.010033066895042344, + "grad_norm": 2.648607015609741, + "learning_rate": 4.998759701449939e-05, + "loss": 7.0263, + "step": 1687 + }, + { + "epoch": 0.010039014178323341, + "grad_norm": 3.0278162956237793, + "learning_rate": 4.99875822984418e-05, + "loss": 6.6463, + "step": 1688 + }, + { + "epoch": 0.01004496146160434, + "grad_norm": 3.1550052165985107, + "learning_rate": 4.998756757366131e-05, + "loss": 6.8773, + "step": 1689 + }, + { + "epoch": 0.010050908744885336, + "grad_norm": 3.3911843299865723, + "learning_rate": 4.998755284015792e-05, + "loss": 7.5045, + "step": 1690 + }, + { + "epoch": 0.010056856028166333, + "grad_norm": 2.668861150741577, + "learning_rate": 4.998753809793162e-05, + "loss": 7.5545, + "step": 1691 + }, + { + "epoch": 0.01006280331144733, + "grad_norm": 2.182792901992798, + "learning_rate": 4.998752334698244e-05, + "loss": 7.2315, + "step": 1692 + }, + { + "epoch": 0.010068750594728329, + "grad_norm": 2.981476068496704, + "learning_rate": 4.998750858731037e-05, + "loss": 7.3455, + "step": 1693 + }, + { + "epoch": 0.010074697878009325, + "grad_norm": 3.1855525970458984, + "learning_rate": 4.998749381891542e-05, + "loss": 7.3408, + "step": 1694 + }, + { + "epoch": 0.010080645161290322, + "grad_norm": 2.5677361488342285, + "learning_rate": 4.998747904179759e-05, + "loss": 6.7591, + "step": 1695 + }, + { + "epoch": 0.010086592444571319, + "grad_norm": 2.7397539615631104, + "learning_rate": 4.9987464255956894e-05, + "loss": 7.3976, + "step": 1696 + }, + { + "epoch": 0.010092539727852318, + "grad_norm": 2.1141586303710938, + "learning_rate": 4.998744946139333e-05, + "loss": 7.4287, + "step": 1697 + }, + { + "epoch": 0.010098487011133314, + "grad_norm": 2.1999096870422363, + "learning_rate": 4.998743465810691e-05, + "loss": 7.4804, + "step": 1698 + }, + { + "epoch": 0.010104434294414311, + "grad_norm": 2.4150960445404053, + "learning_rate": 4.9987419846097634e-05, + "loss": 7.1743, + "step": 1699 + }, + { + "epoch": 0.010110381577695308, + "grad_norm": 2.564270496368408, + "learning_rate": 4.998740502536551e-05, + "loss": 7.262, + "step": 1700 + }, + { + "epoch": 0.010116328860976307, + "grad_norm": 3.045964241027832, + "learning_rate": 4.9987390195910536e-05, + "loss": 7.0778, + "step": 1701 + }, + { + "epoch": 0.010122276144257304, + "grad_norm": 3.2720210552215576, + "learning_rate": 4.998737535773272e-05, + "loss": 7.2188, + "step": 1702 + }, + { + "epoch": 0.0101282234275383, + "grad_norm": 2.54496693611145, + "learning_rate": 4.998736051083207e-05, + "loss": 6.9985, + "step": 1703 + }, + { + "epoch": 0.010134170710819297, + "grad_norm": 3.6252541542053223, + "learning_rate": 4.998734565520859e-05, + "loss": 7.3502, + "step": 1704 + }, + { + "epoch": 0.010140117994100296, + "grad_norm": 3.468963146209717, + "learning_rate": 4.99873307908623e-05, + "loss": 6.9642, + "step": 1705 + }, + { + "epoch": 0.010146065277381293, + "grad_norm": 2.8778045177459717, + "learning_rate": 4.9987315917793174e-05, + "loss": 6.8675, + "step": 1706 + }, + { + "epoch": 0.01015201256066229, + "grad_norm": 2.4492053985595703, + "learning_rate": 4.9987301036001236e-05, + "loss": 7.3484, + "step": 1707 + }, + { + "epoch": 0.010157959843943286, + "grad_norm": 2.5170838832855225, + "learning_rate": 4.99872861454865e-05, + "loss": 7.6004, + "step": 1708 + }, + { + "epoch": 0.010163907127224285, + "grad_norm": 2.3539648056030273, + "learning_rate": 4.998727124624895e-05, + "loss": 7.3304, + "step": 1709 + }, + { + "epoch": 0.010169854410505282, + "grad_norm": 2.6097705364227295, + "learning_rate": 4.998725633828861e-05, + "loss": 7.3227, + "step": 1710 + }, + { + "epoch": 0.010175801693786278, + "grad_norm": 2.5909392833709717, + "learning_rate": 4.9987241421605466e-05, + "loss": 7.3797, + "step": 1711 + }, + { + "epoch": 0.010181748977067275, + "grad_norm": 3.143157958984375, + "learning_rate": 4.998722649619954e-05, + "loss": 7.1236, + "step": 1712 + }, + { + "epoch": 0.010187696260348274, + "grad_norm": 2.0621843338012695, + "learning_rate": 4.9987211562070835e-05, + "loss": 7.5322, + "step": 1713 + }, + { + "epoch": 0.01019364354362927, + "grad_norm": 1.7781084775924683, + "learning_rate": 4.9987196619219354e-05, + "loss": 7.428, + "step": 1714 + }, + { + "epoch": 0.010199590826910267, + "grad_norm": 2.3108980655670166, + "learning_rate": 4.9987181667645094e-05, + "loss": 7.3814, + "step": 1715 + }, + { + "epoch": 0.010205538110191264, + "grad_norm": 2.5184621810913086, + "learning_rate": 4.998716670734807e-05, + "loss": 7.374, + "step": 1716 + }, + { + "epoch": 0.010211485393472261, + "grad_norm": 1.9185826778411865, + "learning_rate": 4.9987151738328284e-05, + "loss": 7.3352, + "step": 1717 + }, + { + "epoch": 0.01021743267675326, + "grad_norm": 2.794224262237549, + "learning_rate": 4.998713676058574e-05, + "loss": 7.0293, + "step": 1718 + }, + { + "epoch": 0.010223379960034256, + "grad_norm": 3.601804733276367, + "learning_rate": 4.998712177412045e-05, + "loss": 7.0277, + "step": 1719 + }, + { + "epoch": 0.010229327243315253, + "grad_norm": 3.3258707523345947, + "learning_rate": 4.998710677893241e-05, + "loss": 6.9478, + "step": 1720 + }, + { + "epoch": 0.01023527452659625, + "grad_norm": 3.147439956665039, + "learning_rate": 4.9987091775021625e-05, + "loss": 6.7295, + "step": 1721 + }, + { + "epoch": 0.010241221809877249, + "grad_norm": 2.7821006774902344, + "learning_rate": 4.998707676238811e-05, + "loss": 6.7587, + "step": 1722 + }, + { + "epoch": 0.010247169093158245, + "grad_norm": 2.580597400665283, + "learning_rate": 4.998706174103186e-05, + "loss": 6.9091, + "step": 1723 + }, + { + "epoch": 0.010253116376439242, + "grad_norm": 2.5501208305358887, + "learning_rate": 4.998704671095289e-05, + "loss": 7.3262, + "step": 1724 + }, + { + "epoch": 0.010259063659720239, + "grad_norm": 2.5460124015808105, + "learning_rate": 4.99870316721512e-05, + "loss": 7.278, + "step": 1725 + }, + { + "epoch": 0.010265010943001238, + "grad_norm": 2.0253796577453613, + "learning_rate": 4.998701662462679e-05, + "loss": 7.1757, + "step": 1726 + }, + { + "epoch": 0.010270958226282234, + "grad_norm": 2.3127388954162598, + "learning_rate": 4.998700156837968e-05, + "loss": 7.1057, + "step": 1727 + }, + { + "epoch": 0.010276905509563231, + "grad_norm": 2.931878089904785, + "learning_rate": 4.998698650340986e-05, + "loss": 6.9993, + "step": 1728 + }, + { + "epoch": 0.010282852792844228, + "grad_norm": 3.239272356033325, + "learning_rate": 4.998697142971734e-05, + "loss": 6.7754, + "step": 1729 + }, + { + "epoch": 0.010288800076125227, + "grad_norm": 2.388212203979492, + "learning_rate": 4.998695634730213e-05, + "loss": 7.2794, + "step": 1730 + }, + { + "epoch": 0.010294747359406223, + "grad_norm": 2.7766799926757812, + "learning_rate": 4.998694125616423e-05, + "loss": 7.4636, + "step": 1731 + }, + { + "epoch": 0.01030069464268722, + "grad_norm": 2.543757915496826, + "learning_rate": 4.9986926156303646e-05, + "loss": 6.8801, + "step": 1732 + }, + { + "epoch": 0.010306641925968217, + "grad_norm": 1.8907097578048706, + "learning_rate": 4.9986911047720384e-05, + "loss": 7.0353, + "step": 1733 + }, + { + "epoch": 0.010312589209249216, + "grad_norm": 1.9585598707199097, + "learning_rate": 4.9986895930414444e-05, + "loss": 7.0469, + "step": 1734 + }, + { + "epoch": 0.010318536492530212, + "grad_norm": 2.5191497802734375, + "learning_rate": 4.998688080438585e-05, + "loss": 7.1469, + "step": 1735 + }, + { + "epoch": 0.01032448377581121, + "grad_norm": 3.5709545612335205, + "learning_rate": 4.998686566963459e-05, + "loss": 7.0499, + "step": 1736 + }, + { + "epoch": 0.010330431059092206, + "grad_norm": 2.3778624534606934, + "learning_rate": 4.998685052616067e-05, + "loss": 7.5897, + "step": 1737 + }, + { + "epoch": 0.010336378342373205, + "grad_norm": 2.0795674324035645, + "learning_rate": 4.9986835373964094e-05, + "loss": 6.8778, + "step": 1738 + }, + { + "epoch": 0.010342325625654201, + "grad_norm": 2.7674901485443115, + "learning_rate": 4.9986820213044875e-05, + "loss": 6.4428, + "step": 1739 + }, + { + "epoch": 0.010348272908935198, + "grad_norm": 2.7203595638275146, + "learning_rate": 4.998680504340302e-05, + "loss": 7.4668, + "step": 1740 + }, + { + "epoch": 0.010354220192216195, + "grad_norm": 2.840240955352783, + "learning_rate": 4.998678986503853e-05, + "loss": 7.2219, + "step": 1741 + }, + { + "epoch": 0.010360167475497194, + "grad_norm": 2.7803452014923096, + "learning_rate": 4.9986774677951404e-05, + "loss": 6.5674, + "step": 1742 + }, + { + "epoch": 0.01036611475877819, + "grad_norm": 2.467574119567871, + "learning_rate": 4.998675948214165e-05, + "loss": 6.9621, + "step": 1743 + }, + { + "epoch": 0.010372062042059187, + "grad_norm": 2.1437904834747314, + "learning_rate": 4.998674427760929e-05, + "loss": 7.1564, + "step": 1744 + }, + { + "epoch": 0.010378009325340184, + "grad_norm": 2.504685163497925, + "learning_rate": 4.9986729064354304e-05, + "loss": 6.8836, + "step": 1745 + }, + { + "epoch": 0.010383956608621183, + "grad_norm": 2.401296615600586, + "learning_rate": 4.998671384237671e-05, + "loss": 7.2906, + "step": 1746 + }, + { + "epoch": 0.01038990389190218, + "grad_norm": 2.233701705932617, + "learning_rate": 4.9986698611676516e-05, + "loss": 6.6854, + "step": 1747 + }, + { + "epoch": 0.010395851175183176, + "grad_norm": 2.9597983360290527, + "learning_rate": 4.998668337225373e-05, + "loss": 6.8859, + "step": 1748 + }, + { + "epoch": 0.010401798458464173, + "grad_norm": 3.2164804935455322, + "learning_rate": 4.998666812410834e-05, + "loss": 6.8255, + "step": 1749 + }, + { + "epoch": 0.01040774574174517, + "grad_norm": 3.010002374649048, + "learning_rate": 4.9986652867240364e-05, + "loss": 6.7092, + "step": 1750 + }, + { + "epoch": 0.010413693025026169, + "grad_norm": 2.8442068099975586, + "learning_rate": 4.998663760164981e-05, + "loss": 6.7231, + "step": 1751 + }, + { + "epoch": 0.010419640308307165, + "grad_norm": 3.127922773361206, + "learning_rate": 4.9986622327336676e-05, + "loss": 6.6072, + "step": 1752 + }, + { + "epoch": 0.010425587591588162, + "grad_norm": 2.7306833267211914, + "learning_rate": 4.998660704430097e-05, + "loss": 6.696, + "step": 1753 + }, + { + "epoch": 0.010431534874869159, + "grad_norm": 2.9005799293518066, + "learning_rate": 4.99865917525427e-05, + "loss": 6.6598, + "step": 1754 + }, + { + "epoch": 0.010437482158150158, + "grad_norm": 3.17934513092041, + "learning_rate": 4.9986576452061865e-05, + "loss": 6.5887, + "step": 1755 + }, + { + "epoch": 0.010443429441431154, + "grad_norm": 2.9390244483947754, + "learning_rate": 4.9986561142858476e-05, + "loss": 6.5375, + "step": 1756 + }, + { + "epoch": 0.010449376724712151, + "grad_norm": 2.5547196865081787, + "learning_rate": 4.998654582493254e-05, + "loss": 6.7484, + "step": 1757 + }, + { + "epoch": 0.010455324007993148, + "grad_norm": 2.9969568252563477, + "learning_rate": 4.9986530498284054e-05, + "loss": 6.6496, + "step": 1758 + }, + { + "epoch": 0.010461271291274147, + "grad_norm": 2.843932867050171, + "learning_rate": 4.998651516291303e-05, + "loss": 6.5713, + "step": 1759 + }, + { + "epoch": 0.010467218574555143, + "grad_norm": 2.9114811420440674, + "learning_rate": 4.9986499818819476e-05, + "loss": 7.5248, + "step": 1760 + }, + { + "epoch": 0.01047316585783614, + "grad_norm": 3.0292229652404785, + "learning_rate": 4.998648446600339e-05, + "loss": 7.2346, + "step": 1761 + }, + { + "epoch": 0.010479113141117137, + "grad_norm": 2.553088426589966, + "learning_rate": 4.998646910446478e-05, + "loss": 7.1531, + "step": 1762 + }, + { + "epoch": 0.010485060424398136, + "grad_norm": 2.9838356971740723, + "learning_rate": 4.998645373420365e-05, + "loss": 6.6561, + "step": 1763 + }, + { + "epoch": 0.010491007707679132, + "grad_norm": 2.8948864936828613, + "learning_rate": 4.9986438355220014e-05, + "loss": 6.463, + "step": 1764 + }, + { + "epoch": 0.01049695499096013, + "grad_norm": 2.805084228515625, + "learning_rate": 4.9986422967513856e-05, + "loss": 6.701, + "step": 1765 + }, + { + "epoch": 0.010502902274241126, + "grad_norm": 2.748077869415283, + "learning_rate": 4.998640757108522e-05, + "loss": 7.3223, + "step": 1766 + }, + { + "epoch": 0.010508849557522125, + "grad_norm": 3.0048258304595947, + "learning_rate": 4.998639216593406e-05, + "loss": 7.2582, + "step": 1767 + }, + { + "epoch": 0.010514796840803121, + "grad_norm": 2.538522958755493, + "learning_rate": 4.998637675206043e-05, + "loss": 7.1208, + "step": 1768 + }, + { + "epoch": 0.010520744124084118, + "grad_norm": 2.2091188430786133, + "learning_rate": 4.99863613294643e-05, + "loss": 7.0577, + "step": 1769 + }, + { + "epoch": 0.010526691407365115, + "grad_norm": 2.8454909324645996, + "learning_rate": 4.998634589814569e-05, + "loss": 7.1296, + "step": 1770 + }, + { + "epoch": 0.010532638690646114, + "grad_norm": 3.4139351844787598, + "learning_rate": 4.998633045810461e-05, + "loss": 6.9565, + "step": 1771 + }, + { + "epoch": 0.01053858597392711, + "grad_norm": 2.3192107677459717, + "learning_rate": 4.9986315009341066e-05, + "loss": 6.6027, + "step": 1772 + }, + { + "epoch": 0.010544533257208107, + "grad_norm": 2.309290647506714, + "learning_rate": 4.998629955185505e-05, + "loss": 7.0417, + "step": 1773 + }, + { + "epoch": 0.010550480540489104, + "grad_norm": 3.2046520709991455, + "learning_rate": 4.998628408564657e-05, + "loss": 7.0368, + "step": 1774 + }, + { + "epoch": 0.010556427823770103, + "grad_norm": 2.459064483642578, + "learning_rate": 4.9986268610715646e-05, + "loss": 7.2726, + "step": 1775 + }, + { + "epoch": 0.0105623751070511, + "grad_norm": 2.602522134780884, + "learning_rate": 4.998625312706227e-05, + "loss": 7.3377, + "step": 1776 + }, + { + "epoch": 0.010568322390332096, + "grad_norm": 3.9599175453186035, + "learning_rate": 4.998623763468645e-05, + "loss": 6.9146, + "step": 1777 + }, + { + "epoch": 0.010574269673613093, + "grad_norm": 3.312527894973755, + "learning_rate": 4.99862221335882e-05, + "loss": 6.7457, + "step": 1778 + }, + { + "epoch": 0.01058021695689409, + "grad_norm": 2.5287606716156006, + "learning_rate": 4.9986206623767506e-05, + "loss": 7.2651, + "step": 1779 + }, + { + "epoch": 0.010586164240175088, + "grad_norm": 2.4065616130828857, + "learning_rate": 4.99861911052244e-05, + "loss": 7.1135, + "step": 1780 + }, + { + "epoch": 0.010592111523456085, + "grad_norm": 2.321385383605957, + "learning_rate": 4.998617557795886e-05, + "loss": 7.1985, + "step": 1781 + }, + { + "epoch": 0.010598058806737082, + "grad_norm": 2.118995189666748, + "learning_rate": 4.9986160041970906e-05, + "loss": 7.2832, + "step": 1782 + }, + { + "epoch": 0.010604006090018079, + "grad_norm": 2.2536606788635254, + "learning_rate": 4.9986144497260544e-05, + "loss": 7.191, + "step": 1783 + }, + { + "epoch": 0.010609953373299078, + "grad_norm": 2.2956738471984863, + "learning_rate": 4.998612894382778e-05, + "loss": 7.0496, + "step": 1784 + }, + { + "epoch": 0.010615900656580074, + "grad_norm": 2.4258289337158203, + "learning_rate": 4.9986113381672614e-05, + "loss": 7.2767, + "step": 1785 + }, + { + "epoch": 0.010621847939861071, + "grad_norm": 2.4731507301330566, + "learning_rate": 4.998609781079505e-05, + "loss": 6.8805, + "step": 1786 + }, + { + "epoch": 0.010627795223142068, + "grad_norm": 2.3245391845703125, + "learning_rate": 4.9986082231195105e-05, + "loss": 6.8921, + "step": 1787 + }, + { + "epoch": 0.010633742506423067, + "grad_norm": 2.6239898204803467, + "learning_rate": 4.998606664287278e-05, + "loss": 6.9353, + "step": 1788 + }, + { + "epoch": 0.010639689789704063, + "grad_norm": 2.186162233352661, + "learning_rate": 4.9986051045828065e-05, + "loss": 6.8466, + "step": 1789 + }, + { + "epoch": 0.01064563707298506, + "grad_norm": 2.2362232208251953, + "learning_rate": 4.998603544006098e-05, + "loss": 6.82, + "step": 1790 + }, + { + "epoch": 0.010651584356266057, + "grad_norm": 2.2302427291870117, + "learning_rate": 4.998601982557153e-05, + "loss": 6.7034, + "step": 1791 + }, + { + "epoch": 0.010657531639547056, + "grad_norm": 2.0393195152282715, + "learning_rate": 4.998600420235972e-05, + "loss": 6.6646, + "step": 1792 + }, + { + "epoch": 0.010663478922828052, + "grad_norm": 1.976536512374878, + "learning_rate": 4.9985988570425556e-05, + "loss": 6.4994, + "step": 1793 + }, + { + "epoch": 0.01066942620610905, + "grad_norm": 2.4167046546936035, + "learning_rate": 4.998597292976904e-05, + "loss": 6.7849, + "step": 1794 + }, + { + "epoch": 0.010675373489390046, + "grad_norm": 2.3077776432037354, + "learning_rate": 4.998595728039018e-05, + "loss": 6.8356, + "step": 1795 + }, + { + "epoch": 0.010681320772671045, + "grad_norm": 2.5263309478759766, + "learning_rate": 4.998594162228898e-05, + "loss": 6.6351, + "step": 1796 + }, + { + "epoch": 0.010687268055952041, + "grad_norm": 2.153365135192871, + "learning_rate": 4.9985925955465443e-05, + "loss": 6.7911, + "step": 1797 + }, + { + "epoch": 0.010693215339233038, + "grad_norm": 3.3034393787384033, + "learning_rate": 4.998591027991958e-05, + "loss": 6.7589, + "step": 1798 + }, + { + "epoch": 0.010699162622514035, + "grad_norm": 2.2177388668060303, + "learning_rate": 4.998589459565139e-05, + "loss": 6.571, + "step": 1799 + }, + { + "epoch": 0.010705109905795034, + "grad_norm": 2.3165230751037598, + "learning_rate": 4.9985878902660886e-05, + "loss": 6.9124, + "step": 1800 + }, + { + "epoch": 0.01071105718907603, + "grad_norm": 2.270045757293701, + "learning_rate": 4.998586320094807e-05, + "loss": 6.4442, + "step": 1801 + }, + { + "epoch": 0.010717004472357027, + "grad_norm": 2.1198744773864746, + "learning_rate": 4.9985847490512945e-05, + "loss": 6.555, + "step": 1802 + }, + { + "epoch": 0.010722951755638024, + "grad_norm": 2.5428359508514404, + "learning_rate": 4.998583177135552e-05, + "loss": 6.8991, + "step": 1803 + }, + { + "epoch": 0.010728899038919023, + "grad_norm": 1.983817219734192, + "learning_rate": 4.99858160434758e-05, + "loss": 6.6428, + "step": 1804 + }, + { + "epoch": 0.01073484632220002, + "grad_norm": 2.2749712467193604, + "learning_rate": 4.998580030687379e-05, + "loss": 6.7294, + "step": 1805 + }, + { + "epoch": 0.010740793605481016, + "grad_norm": 1.914762258529663, + "learning_rate": 4.998578456154949e-05, + "loss": 7.0395, + "step": 1806 + }, + { + "epoch": 0.010746740888762013, + "grad_norm": 1.6850765943527222, + "learning_rate": 4.998576880750292e-05, + "loss": 6.862, + "step": 1807 + }, + { + "epoch": 0.010752688172043012, + "grad_norm": 2.2930233478546143, + "learning_rate": 4.9985753044734076e-05, + "loss": 6.8213, + "step": 1808 + }, + { + "epoch": 0.010758635455324008, + "grad_norm": 2.193464756011963, + "learning_rate": 4.998573727324295e-05, + "loss": 6.9303, + "step": 1809 + }, + { + "epoch": 0.010764582738605005, + "grad_norm": 2.2451658248901367, + "learning_rate": 4.9985721493029576e-05, + "loss": 6.8061, + "step": 1810 + }, + { + "epoch": 0.010770530021886002, + "grad_norm": 2.164214849472046, + "learning_rate": 4.998570570409394e-05, + "loss": 6.6485, + "step": 1811 + }, + { + "epoch": 0.010776477305166999, + "grad_norm": 2.3530375957489014, + "learning_rate": 4.9985689906436054e-05, + "loss": 6.6826, + "step": 1812 + }, + { + "epoch": 0.010782424588447997, + "grad_norm": 3.007641553878784, + "learning_rate": 4.998567410005591e-05, + "loss": 6.0781, + "step": 1813 + }, + { + "epoch": 0.010788371871728994, + "grad_norm": 2.500411033630371, + "learning_rate": 4.998565828495354e-05, + "loss": 7.0544, + "step": 1814 + }, + { + "epoch": 0.010794319155009991, + "grad_norm": 2.329221725463867, + "learning_rate": 4.998564246112893e-05, + "loss": 7.2505, + "step": 1815 + }, + { + "epoch": 0.010800266438290988, + "grad_norm": 2.05120849609375, + "learning_rate": 4.998562662858209e-05, + "loss": 7.3094, + "step": 1816 + }, + { + "epoch": 0.010806213721571986, + "grad_norm": 1.83049738407135, + "learning_rate": 4.9985610787313023e-05, + "loss": 6.7752, + "step": 1817 + }, + { + "epoch": 0.010812161004852983, + "grad_norm": 2.2754576206207275, + "learning_rate": 4.998559493732174e-05, + "loss": 6.9396, + "step": 1818 + }, + { + "epoch": 0.01081810828813398, + "grad_norm": 2.104849338531494, + "learning_rate": 4.998557907860825e-05, + "loss": 7.2624, + "step": 1819 + }, + { + "epoch": 0.010824055571414977, + "grad_norm": 3.152069568634033, + "learning_rate": 4.998556321117254e-05, + "loss": 6.6763, + "step": 1820 + }, + { + "epoch": 0.010830002854695975, + "grad_norm": 3.4046475887298584, + "learning_rate": 4.9985547335014636e-05, + "loss": 6.7145, + "step": 1821 + }, + { + "epoch": 0.010835950137976972, + "grad_norm": 1.9208084344863892, + "learning_rate": 4.9985531450134534e-05, + "loss": 6.8985, + "step": 1822 + }, + { + "epoch": 0.010841897421257969, + "grad_norm": 2.4949824810028076, + "learning_rate": 4.998551555653224e-05, + "loss": 6.8196, + "step": 1823 + }, + { + "epoch": 0.010847844704538966, + "grad_norm": 2.613175392150879, + "learning_rate": 4.998549965420776e-05, + "loss": 6.7918, + "step": 1824 + }, + { + "epoch": 0.010853791987819965, + "grad_norm": 2.3322529792785645, + "learning_rate": 4.9985483743161105e-05, + "loss": 6.6133, + "step": 1825 + }, + { + "epoch": 0.010859739271100961, + "grad_norm": 3.116680860519409, + "learning_rate": 4.998546782339227e-05, + "loss": 7.4026, + "step": 1826 + }, + { + "epoch": 0.010865686554381958, + "grad_norm": 2.673938274383545, + "learning_rate": 4.998545189490127e-05, + "loss": 6.9181, + "step": 1827 + }, + { + "epoch": 0.010871633837662955, + "grad_norm": 2.135727643966675, + "learning_rate": 4.998543595768811e-05, + "loss": 6.9514, + "step": 1828 + }, + { + "epoch": 0.010877581120943954, + "grad_norm": 2.241696357727051, + "learning_rate": 4.9985420011752784e-05, + "loss": 7.126, + "step": 1829 + }, + { + "epoch": 0.01088352840422495, + "grad_norm": 2.316342830657959, + "learning_rate": 4.9985404057095315e-05, + "loss": 6.9752, + "step": 1830 + }, + { + "epoch": 0.010889475687505947, + "grad_norm": 2.591611623764038, + "learning_rate": 4.998538809371569e-05, + "loss": 6.8721, + "step": 1831 + }, + { + "epoch": 0.010895422970786944, + "grad_norm": 2.2846317291259766, + "learning_rate": 4.9985372121613935e-05, + "loss": 6.9468, + "step": 1832 + }, + { + "epoch": 0.010901370254067943, + "grad_norm": 2.0799343585968018, + "learning_rate": 4.998535614079004e-05, + "loss": 7.0839, + "step": 1833 + }, + { + "epoch": 0.01090731753734894, + "grad_norm": 2.1908833980560303, + "learning_rate": 4.998534015124401e-05, + "loss": 6.7228, + "step": 1834 + }, + { + "epoch": 0.010913264820629936, + "grad_norm": 2.329401969909668, + "learning_rate": 4.998532415297587e-05, + "loss": 6.715, + "step": 1835 + }, + { + "epoch": 0.010919212103910933, + "grad_norm": 1.9492794275283813, + "learning_rate": 4.998530814598559e-05, + "loss": 6.6762, + "step": 1836 + }, + { + "epoch": 0.010925159387191932, + "grad_norm": 1.9564979076385498, + "learning_rate": 4.998529213027321e-05, + "loss": 6.8545, + "step": 1837 + }, + { + "epoch": 0.010931106670472928, + "grad_norm": 1.8424931764602661, + "learning_rate": 4.998527610583872e-05, + "loss": 6.8505, + "step": 1838 + }, + { + "epoch": 0.010937053953753925, + "grad_norm": 1.9743967056274414, + "learning_rate": 4.998526007268213e-05, + "loss": 6.8413, + "step": 1839 + }, + { + "epoch": 0.010943001237034922, + "grad_norm": 2.31296968460083, + "learning_rate": 4.998524403080345e-05, + "loss": 6.7327, + "step": 1840 + }, + { + "epoch": 0.010948948520315919, + "grad_norm": 2.049689292907715, + "learning_rate": 4.9985227980202665e-05, + "loss": 7.0029, + "step": 1841 + }, + { + "epoch": 0.010954895803596917, + "grad_norm": 2.1640658378601074, + "learning_rate": 4.99852119208798e-05, + "loss": 7.0749, + "step": 1842 + }, + { + "epoch": 0.010960843086877914, + "grad_norm": 1.8896230459213257, + "learning_rate": 4.998519585283486e-05, + "loss": 6.7249, + "step": 1843 + }, + { + "epoch": 0.010966790370158911, + "grad_norm": 2.4835314750671387, + "learning_rate": 4.998517977606785e-05, + "loss": 6.5605, + "step": 1844 + }, + { + "epoch": 0.010972737653439908, + "grad_norm": 2.2472622394561768, + "learning_rate": 4.998516369057876e-05, + "loss": 6.8291, + "step": 1845 + }, + { + "epoch": 0.010978684936720906, + "grad_norm": 2.499096155166626, + "learning_rate": 4.998514759636762e-05, + "loss": 6.6921, + "step": 1846 + }, + { + "epoch": 0.010984632220001903, + "grad_norm": 2.296786308288574, + "learning_rate": 4.998513149343442e-05, + "loss": 7.0475, + "step": 1847 + }, + { + "epoch": 0.0109905795032829, + "grad_norm": 2.2896368503570557, + "learning_rate": 4.998511538177916e-05, + "loss": 6.775, + "step": 1848 + }, + { + "epoch": 0.010996526786563897, + "grad_norm": 2.025575637817383, + "learning_rate": 4.998509926140186e-05, + "loss": 6.9538, + "step": 1849 + }, + { + "epoch": 0.011002474069844895, + "grad_norm": 2.23502779006958, + "learning_rate": 4.9985083132302525e-05, + "loss": 7.0595, + "step": 1850 + }, + { + "epoch": 0.011008421353125892, + "grad_norm": 2.7158777713775635, + "learning_rate": 4.998506699448115e-05, + "loss": 7.0086, + "step": 1851 + }, + { + "epoch": 0.011014368636406889, + "grad_norm": 2.2707183361053467, + "learning_rate": 4.998505084793775e-05, + "loss": 6.6396, + "step": 1852 + }, + { + "epoch": 0.011020315919687886, + "grad_norm": 3.196085214614868, + "learning_rate": 4.998503469267232e-05, + "loss": 6.6026, + "step": 1853 + }, + { + "epoch": 0.011026263202968884, + "grad_norm": 2.4472603797912598, + "learning_rate": 4.9985018528684876e-05, + "loss": 7.1332, + "step": 1854 + }, + { + "epoch": 0.011032210486249881, + "grad_norm": 2.7070915699005127, + "learning_rate": 4.998500235597542e-05, + "loss": 6.9669, + "step": 1855 + }, + { + "epoch": 0.011038157769530878, + "grad_norm": 2.127729654312134, + "learning_rate": 4.998498617454396e-05, + "loss": 6.9589, + "step": 1856 + }, + { + "epoch": 0.011044105052811875, + "grad_norm": 2.2897160053253174, + "learning_rate": 4.99849699843905e-05, + "loss": 7.0402, + "step": 1857 + }, + { + "epoch": 0.011050052336092873, + "grad_norm": 1.888961672782898, + "learning_rate": 4.998495378551504e-05, + "loss": 6.9406, + "step": 1858 + }, + { + "epoch": 0.01105599961937387, + "grad_norm": 1.9889254570007324, + "learning_rate": 4.9984937577917594e-05, + "loss": 6.8392, + "step": 1859 + }, + { + "epoch": 0.011061946902654867, + "grad_norm": 3.042891025543213, + "learning_rate": 4.998492136159817e-05, + "loss": 6.7743, + "step": 1860 + }, + { + "epoch": 0.011067894185935864, + "grad_norm": 2.423988103866577, + "learning_rate": 4.998490513655676e-05, + "loss": 6.9802, + "step": 1861 + }, + { + "epoch": 0.011073841469216862, + "grad_norm": 2.6415674686431885, + "learning_rate": 4.998488890279338e-05, + "loss": 6.7104, + "step": 1862 + }, + { + "epoch": 0.01107978875249786, + "grad_norm": 2.686969518661499, + "learning_rate": 4.998487266030804e-05, + "loss": 7.0539, + "step": 1863 + }, + { + "epoch": 0.011085736035778856, + "grad_norm": 2.6695480346679688, + "learning_rate": 4.998485640910072e-05, + "loss": 6.9812, + "step": 1864 + }, + { + "epoch": 0.011091683319059853, + "grad_norm": 2.6251392364501953, + "learning_rate": 4.9984840149171466e-05, + "loss": 6.9954, + "step": 1865 + }, + { + "epoch": 0.011097630602340851, + "grad_norm": 2.487593650817871, + "learning_rate": 4.998482388052025e-05, + "loss": 7.0847, + "step": 1866 + }, + { + "epoch": 0.011103577885621848, + "grad_norm": 2.3249282836914062, + "learning_rate": 4.998480760314709e-05, + "loss": 6.9936, + "step": 1867 + }, + { + "epoch": 0.011109525168902845, + "grad_norm": 2.170452833175659, + "learning_rate": 4.9984791317052e-05, + "loss": 6.9155, + "step": 1868 + }, + { + "epoch": 0.011115472452183842, + "grad_norm": 3.331779718399048, + "learning_rate": 4.9984775022234975e-05, + "loss": 6.9128, + "step": 1869 + }, + { + "epoch": 0.01112141973546484, + "grad_norm": 2.7665064334869385, + "learning_rate": 4.9984758718696026e-05, + "loss": 6.9002, + "step": 1870 + }, + { + "epoch": 0.011127367018745837, + "grad_norm": 2.2872116565704346, + "learning_rate": 4.998474240643515e-05, + "loss": 6.9058, + "step": 1871 + }, + { + "epoch": 0.011133314302026834, + "grad_norm": 2.2125210762023926, + "learning_rate": 4.998472608545236e-05, + "loss": 6.932, + "step": 1872 + }, + { + "epoch": 0.011139261585307831, + "grad_norm": 2.1135666370391846, + "learning_rate": 4.998470975574766e-05, + "loss": 7.0018, + "step": 1873 + }, + { + "epoch": 0.011145208868588828, + "grad_norm": 2.0649492740631104, + "learning_rate": 4.998469341732105e-05, + "loss": 7.0132, + "step": 1874 + }, + { + "epoch": 0.011151156151869826, + "grad_norm": 4.0558576583862305, + "learning_rate": 4.9984677070172546e-05, + "loss": 6.8826, + "step": 1875 + }, + { + "epoch": 0.011157103435150823, + "grad_norm": 2.5675904750823975, + "learning_rate": 4.998466071430216e-05, + "loss": 7.0314, + "step": 1876 + }, + { + "epoch": 0.01116305071843182, + "grad_norm": 2.9773342609405518, + "learning_rate": 4.998464434970987e-05, + "loss": 6.8608, + "step": 1877 + }, + { + "epoch": 0.011168998001712817, + "grad_norm": 2.804995059967041, + "learning_rate": 4.9984627976395705e-05, + "loss": 6.6857, + "step": 1878 + }, + { + "epoch": 0.011174945284993815, + "grad_norm": 3.758509874343872, + "learning_rate": 4.9984611594359664e-05, + "loss": 6.9995, + "step": 1879 + }, + { + "epoch": 0.011180892568274812, + "grad_norm": 2.583061933517456, + "learning_rate": 4.998459520360176e-05, + "loss": 6.5844, + "step": 1880 + }, + { + "epoch": 0.011186839851555809, + "grad_norm": 2.357642889022827, + "learning_rate": 4.998457880412198e-05, + "loss": 6.6435, + "step": 1881 + }, + { + "epoch": 0.011192787134836806, + "grad_norm": 2.181558609008789, + "learning_rate": 4.9984562395920356e-05, + "loss": 7.045, + "step": 1882 + }, + { + "epoch": 0.011198734418117804, + "grad_norm": 2.4768264293670654, + "learning_rate": 4.998454597899688e-05, + "loss": 7.2053, + "step": 1883 + }, + { + "epoch": 0.011204681701398801, + "grad_norm": 2.4422380924224854, + "learning_rate": 4.998452955335154e-05, + "loss": 6.8038, + "step": 1884 + }, + { + "epoch": 0.011210628984679798, + "grad_norm": 3.3173701763153076, + "learning_rate": 4.998451311898437e-05, + "loss": 6.8619, + "step": 1885 + }, + { + "epoch": 0.011216576267960795, + "grad_norm": 2.4492833614349365, + "learning_rate": 4.9984496675895366e-05, + "loss": 6.6681, + "step": 1886 + }, + { + "epoch": 0.011222523551241793, + "grad_norm": 3.065016031265259, + "learning_rate": 4.998448022408453e-05, + "loss": 6.7439, + "step": 1887 + }, + { + "epoch": 0.01122847083452279, + "grad_norm": 3.327730655670166, + "learning_rate": 4.998446376355187e-05, + "loss": 6.735, + "step": 1888 + }, + { + "epoch": 0.011234418117803787, + "grad_norm": 3.428292751312256, + "learning_rate": 4.998444729429739e-05, + "loss": 6.5277, + "step": 1889 + }, + { + "epoch": 0.011240365401084784, + "grad_norm": 2.4982972145080566, + "learning_rate": 4.9984430816321095e-05, + "loss": 6.8228, + "step": 1890 + }, + { + "epoch": 0.011246312684365782, + "grad_norm": 2.568232297897339, + "learning_rate": 4.9984414329623e-05, + "loss": 7.0772, + "step": 1891 + }, + { + "epoch": 0.01125225996764678, + "grad_norm": 2.534109115600586, + "learning_rate": 4.99843978342031e-05, + "loss": 7.0259, + "step": 1892 + }, + { + "epoch": 0.011258207250927776, + "grad_norm": 2.6394994258880615, + "learning_rate": 4.998438133006141e-05, + "loss": 6.8692, + "step": 1893 + }, + { + "epoch": 0.011264154534208773, + "grad_norm": 2.4049339294433594, + "learning_rate": 4.998436481719792e-05, + "loss": 6.8653, + "step": 1894 + }, + { + "epoch": 0.011270101817489771, + "grad_norm": 2.661191701889038, + "learning_rate": 4.998434829561266e-05, + "loss": 6.628, + "step": 1895 + }, + { + "epoch": 0.011276049100770768, + "grad_norm": 2.395829916000366, + "learning_rate": 4.998433176530561e-05, + "loss": 6.9876, + "step": 1896 + }, + { + "epoch": 0.011281996384051765, + "grad_norm": 2.547858715057373, + "learning_rate": 4.99843152262768e-05, + "loss": 7.3832, + "step": 1897 + }, + { + "epoch": 0.011287943667332762, + "grad_norm": 2.364246368408203, + "learning_rate": 4.998429867852621e-05, + "loss": 7.3771, + "step": 1898 + }, + { + "epoch": 0.01129389095061376, + "grad_norm": 2.3385260105133057, + "learning_rate": 4.998428212205387e-05, + "loss": 6.971, + "step": 1899 + }, + { + "epoch": 0.011299838233894757, + "grad_norm": 2.253760576248169, + "learning_rate": 4.998426555685977e-05, + "loss": 7.0588, + "step": 1900 + }, + { + "epoch": 0.011305785517175754, + "grad_norm": 2.4103500843048096, + "learning_rate": 4.998424898294392e-05, + "loss": 6.8731, + "step": 1901 + }, + { + "epoch": 0.011311732800456751, + "grad_norm": 2.4819014072418213, + "learning_rate": 4.998423240030633e-05, + "loss": 6.9502, + "step": 1902 + }, + { + "epoch": 0.011317680083737748, + "grad_norm": 2.503901243209839, + "learning_rate": 4.998421580894701e-05, + "loss": 7.017, + "step": 1903 + }, + { + "epoch": 0.011323627367018746, + "grad_norm": 2.2224137783050537, + "learning_rate": 4.9984199208865943e-05, + "loss": 7.1938, + "step": 1904 + }, + { + "epoch": 0.011329574650299743, + "grad_norm": 2.1291286945343018, + "learning_rate": 4.998418260006316e-05, + "loss": 7.1152, + "step": 1905 + }, + { + "epoch": 0.01133552193358074, + "grad_norm": 2.4611241817474365, + "learning_rate": 4.9984165982538655e-05, + "loss": 7.0316, + "step": 1906 + }, + { + "epoch": 0.011341469216861737, + "grad_norm": 2.329432487487793, + "learning_rate": 4.998414935629243e-05, + "loss": 7.0032, + "step": 1907 + }, + { + "epoch": 0.011347416500142735, + "grad_norm": 2.0618371963500977, + "learning_rate": 4.9984132721324505e-05, + "loss": 7.2566, + "step": 1908 + }, + { + "epoch": 0.011353363783423732, + "grad_norm": 2.063511371612549, + "learning_rate": 4.998411607763487e-05, + "loss": 7.0144, + "step": 1909 + }, + { + "epoch": 0.011359311066704729, + "grad_norm": 2.188871145248413, + "learning_rate": 4.998409942522355e-05, + "loss": 6.9652, + "step": 1910 + }, + { + "epoch": 0.011365258349985726, + "grad_norm": 2.499746322631836, + "learning_rate": 4.998408276409053e-05, + "loss": 6.9173, + "step": 1911 + }, + { + "epoch": 0.011371205633266724, + "grad_norm": 2.2809276580810547, + "learning_rate": 4.9984066094235826e-05, + "loss": 6.9202, + "step": 1912 + }, + { + "epoch": 0.011377152916547721, + "grad_norm": 1.7967042922973633, + "learning_rate": 4.998404941565944e-05, + "loss": 7.0652, + "step": 1913 + }, + { + "epoch": 0.011383100199828718, + "grad_norm": 2.339747667312622, + "learning_rate": 4.9984032728361384e-05, + "loss": 6.943, + "step": 1914 + }, + { + "epoch": 0.011389047483109715, + "grad_norm": 2.65795636177063, + "learning_rate": 4.998401603234166e-05, + "loss": 6.7197, + "step": 1915 + }, + { + "epoch": 0.011394994766390713, + "grad_norm": 2.181105852127075, + "learning_rate": 4.998399932760027e-05, + "loss": 6.7358, + "step": 1916 + }, + { + "epoch": 0.01140094204967171, + "grad_norm": 2.4130990505218506, + "learning_rate": 4.998398261413723e-05, + "loss": 6.8653, + "step": 1917 + }, + { + "epoch": 0.011406889332952707, + "grad_norm": 2.23822021484375, + "learning_rate": 4.998396589195254e-05, + "loss": 7.2125, + "step": 1918 + }, + { + "epoch": 0.011412836616233704, + "grad_norm": 2.176309823989868, + "learning_rate": 4.9983949161046207e-05, + "loss": 7.1077, + "step": 1919 + }, + { + "epoch": 0.011418783899514702, + "grad_norm": 2.2468202114105225, + "learning_rate": 4.9983932421418226e-05, + "loss": 7.1411, + "step": 1920 + }, + { + "epoch": 0.0114247311827957, + "grad_norm": 2.0748138427734375, + "learning_rate": 4.998391567306862e-05, + "loss": 7.0605, + "step": 1921 + }, + { + "epoch": 0.011430678466076696, + "grad_norm": 2.93007230758667, + "learning_rate": 4.998389891599738e-05, + "loss": 6.5832, + "step": 1922 + }, + { + "epoch": 0.011436625749357693, + "grad_norm": 2.125582218170166, + "learning_rate": 4.9983882150204534e-05, + "loss": 7.0761, + "step": 1923 + }, + { + "epoch": 0.011442573032638691, + "grad_norm": 2.3291571140289307, + "learning_rate": 4.998386537569005e-05, + "loss": 6.8781, + "step": 1924 + }, + { + "epoch": 0.011448520315919688, + "grad_norm": 2.8930649757385254, + "learning_rate": 4.9983848592453975e-05, + "loss": 7.1694, + "step": 1925 + }, + { + "epoch": 0.011454467599200685, + "grad_norm": 2.8450441360473633, + "learning_rate": 4.998383180049629e-05, + "loss": 7.1474, + "step": 1926 + }, + { + "epoch": 0.011460414882481682, + "grad_norm": 2.5900778770446777, + "learning_rate": 4.9983814999817016e-05, + "loss": 7.0423, + "step": 1927 + }, + { + "epoch": 0.01146636216576268, + "grad_norm": 2.289428949356079, + "learning_rate": 4.998379819041614e-05, + "loss": 6.9777, + "step": 1928 + }, + { + "epoch": 0.011472309449043677, + "grad_norm": 2.609384059906006, + "learning_rate": 4.998378137229368e-05, + "loss": 7.0488, + "step": 1929 + }, + { + "epoch": 0.011478256732324674, + "grad_norm": 2.1039459705352783, + "learning_rate": 4.998376454544964e-05, + "loss": 6.9308, + "step": 1930 + }, + { + "epoch": 0.01148420401560567, + "grad_norm": 2.1776134967803955, + "learning_rate": 4.9983747709884024e-05, + "loss": 6.9951, + "step": 1931 + }, + { + "epoch": 0.01149015129888667, + "grad_norm": 2.3150827884674072, + "learning_rate": 4.998373086559684e-05, + "loss": 6.9165, + "step": 1932 + }, + { + "epoch": 0.011496098582167666, + "grad_norm": 2.308370590209961, + "learning_rate": 4.99837140125881e-05, + "loss": 7.0155, + "step": 1933 + }, + { + "epoch": 0.011502045865448663, + "grad_norm": 2.234208106994629, + "learning_rate": 4.99836971508578e-05, + "loss": 6.9901, + "step": 1934 + }, + { + "epoch": 0.01150799314872966, + "grad_norm": 2.2340307235717773, + "learning_rate": 4.9983680280405953e-05, + "loss": 7.004, + "step": 1935 + }, + { + "epoch": 0.011513940432010657, + "grad_norm": 2.9458208084106445, + "learning_rate": 4.998366340123256e-05, + "loss": 7.3797, + "step": 1936 + }, + { + "epoch": 0.011519887715291655, + "grad_norm": 2.8516271114349365, + "learning_rate": 4.998364651333762e-05, + "loss": 7.3503, + "step": 1937 + }, + { + "epoch": 0.011525834998572652, + "grad_norm": 1.974025845527649, + "learning_rate": 4.998362961672116e-05, + "loss": 7.21, + "step": 1938 + }, + { + "epoch": 0.011531782281853649, + "grad_norm": 2.110117197036743, + "learning_rate": 4.998361271138317e-05, + "loss": 6.9494, + "step": 1939 + }, + { + "epoch": 0.011537729565134646, + "grad_norm": 2.2003207206726074, + "learning_rate": 4.9983595797323646e-05, + "loss": 6.8858, + "step": 1940 + }, + { + "epoch": 0.011543676848415644, + "grad_norm": 2.200982093811035, + "learning_rate": 4.998357887454262e-05, + "loss": 6.9512, + "step": 1941 + }, + { + "epoch": 0.011549624131696641, + "grad_norm": 2.303903102874756, + "learning_rate": 4.998356194304008e-05, + "loss": 7.2823, + "step": 1942 + }, + { + "epoch": 0.011555571414977638, + "grad_norm": 2.1376724243164062, + "learning_rate": 4.9983545002816035e-05, + "loss": 7.0321, + "step": 1943 + }, + { + "epoch": 0.011561518698258635, + "grad_norm": 2.3128151893615723, + "learning_rate": 4.99835280538705e-05, + "loss": 6.9714, + "step": 1944 + }, + { + "epoch": 0.011567465981539633, + "grad_norm": 2.359212636947632, + "learning_rate": 4.9983511096203465e-05, + "loss": 7.0496, + "step": 1945 + }, + { + "epoch": 0.01157341326482063, + "grad_norm": 2.346946954727173, + "learning_rate": 4.9983494129814945e-05, + "loss": 6.9865, + "step": 1946 + }, + { + "epoch": 0.011579360548101627, + "grad_norm": 2.447598934173584, + "learning_rate": 4.998347715470495e-05, + "loss": 6.9609, + "step": 1947 + }, + { + "epoch": 0.011585307831382624, + "grad_norm": 2.355300188064575, + "learning_rate": 4.998346017087348e-05, + "loss": 7.03, + "step": 1948 + }, + { + "epoch": 0.011591255114663622, + "grad_norm": 2.3207437992095947, + "learning_rate": 4.9983443178320545e-05, + "loss": 6.8181, + "step": 1949 + }, + { + "epoch": 0.011597202397944619, + "grad_norm": 2.359839677810669, + "learning_rate": 4.998342617704615e-05, + "loss": 6.8828, + "step": 1950 + }, + { + "epoch": 0.011603149681225616, + "grad_norm": 2.264890432357788, + "learning_rate": 4.9983409167050284e-05, + "loss": 7.3467, + "step": 1951 + }, + { + "epoch": 0.011609096964506613, + "grad_norm": 2.2720789909362793, + "learning_rate": 4.998339214833298e-05, + "loss": 7.3912, + "step": 1952 + }, + { + "epoch": 0.011615044247787611, + "grad_norm": 2.414433240890503, + "learning_rate": 4.9983375120894226e-05, + "loss": 7.1505, + "step": 1953 + }, + { + "epoch": 0.011620991531068608, + "grad_norm": 2.095290422439575, + "learning_rate": 4.998335808473404e-05, + "loss": 7.1642, + "step": 1954 + }, + { + "epoch": 0.011626938814349605, + "grad_norm": 2.118901252746582, + "learning_rate": 4.998334103985242e-05, + "loss": 7.0528, + "step": 1955 + }, + { + "epoch": 0.011632886097630602, + "grad_norm": 2.4361472129821777, + "learning_rate": 4.998332398624937e-05, + "loss": 7.3064, + "step": 1956 + }, + { + "epoch": 0.0116388333809116, + "grad_norm": 2.0978667736053467, + "learning_rate": 4.99833069239249e-05, + "loss": 7.0041, + "step": 1957 + }, + { + "epoch": 0.011644780664192597, + "grad_norm": 3.156329393386841, + "learning_rate": 4.998328985287902e-05, + "loss": 6.9169, + "step": 1958 + }, + { + "epoch": 0.011650727947473594, + "grad_norm": 2.311004400253296, + "learning_rate": 4.9983272773111735e-05, + "loss": 7.1128, + "step": 1959 + }, + { + "epoch": 0.01165667523075459, + "grad_norm": 2.406993865966797, + "learning_rate": 4.9983255684623036e-05, + "loss": 7.1403, + "step": 1960 + }, + { + "epoch": 0.01166262251403559, + "grad_norm": 2.0262861251831055, + "learning_rate": 4.998323858741295e-05, + "loss": 7.1014, + "step": 1961 + }, + { + "epoch": 0.011668569797316586, + "grad_norm": 2.369420051574707, + "learning_rate": 4.998322148148147e-05, + "loss": 7.1422, + "step": 1962 + }, + { + "epoch": 0.011674517080597583, + "grad_norm": 2.156019687652588, + "learning_rate": 4.998320436682861e-05, + "loss": 6.8405, + "step": 1963 + }, + { + "epoch": 0.01168046436387858, + "grad_norm": 2.35737681388855, + "learning_rate": 4.998318724345436e-05, + "loss": 6.8004, + "step": 1964 + }, + { + "epoch": 0.011686411647159577, + "grad_norm": 2.443676233291626, + "learning_rate": 4.998317011135875e-05, + "loss": 7.1959, + "step": 1965 + }, + { + "epoch": 0.011692358930440575, + "grad_norm": 2.1023004055023193, + "learning_rate": 4.998315297054177e-05, + "loss": 7.0684, + "step": 1966 + }, + { + "epoch": 0.011698306213721572, + "grad_norm": 2.5166187286376953, + "learning_rate": 4.998313582100342e-05, + "loss": 6.5876, + "step": 1967 + }, + { + "epoch": 0.011704253497002569, + "grad_norm": 2.1868557929992676, + "learning_rate": 4.9983118662743726e-05, + "loss": 6.6097, + "step": 1968 + }, + { + "epoch": 0.011710200780283566, + "grad_norm": 2.196786880493164, + "learning_rate": 4.998310149576269e-05, + "loss": 6.9798, + "step": 1969 + }, + { + "epoch": 0.011716148063564564, + "grad_norm": 2.361915111541748, + "learning_rate": 4.998308432006029e-05, + "loss": 6.8441, + "step": 1970 + }, + { + "epoch": 0.011722095346845561, + "grad_norm": 2.3234047889709473, + "learning_rate": 4.998306713563657e-05, + "loss": 6.9481, + "step": 1971 + }, + { + "epoch": 0.011728042630126558, + "grad_norm": 2.4995763301849365, + "learning_rate": 4.9983049942491514e-05, + "loss": 6.9903, + "step": 1972 + }, + { + "epoch": 0.011733989913407555, + "grad_norm": 2.21274995803833, + "learning_rate": 4.998303274062514e-05, + "loss": 7.1484, + "step": 1973 + }, + { + "epoch": 0.011739937196688553, + "grad_norm": 2.4777519702911377, + "learning_rate": 4.998301553003743e-05, + "loss": 7.144, + "step": 1974 + }, + { + "epoch": 0.01174588447996955, + "grad_norm": 2.089796304702759, + "learning_rate": 4.9982998310728426e-05, + "loss": 6.6765, + "step": 1975 + }, + { + "epoch": 0.011751831763250547, + "grad_norm": 3.012753963470459, + "learning_rate": 4.998298108269811e-05, + "loss": 6.8501, + "step": 1976 + }, + { + "epoch": 0.011757779046531544, + "grad_norm": 2.5427911281585693, + "learning_rate": 4.9982963845946486e-05, + "loss": 7.0171, + "step": 1977 + }, + { + "epoch": 0.011763726329812542, + "grad_norm": 2.8591670989990234, + "learning_rate": 4.998294660047358e-05, + "loss": 6.9881, + "step": 1978 + }, + { + "epoch": 0.011769673613093539, + "grad_norm": 2.952085256576538, + "learning_rate": 4.998292934627937e-05, + "loss": 6.9459, + "step": 1979 + }, + { + "epoch": 0.011775620896374536, + "grad_norm": 2.451958656311035, + "learning_rate": 4.998291208336388e-05, + "loss": 6.9515, + "step": 1980 + }, + { + "epoch": 0.011781568179655533, + "grad_norm": 2.448319435119629, + "learning_rate": 4.998289481172713e-05, + "loss": 6.8618, + "step": 1981 + }, + { + "epoch": 0.011787515462936531, + "grad_norm": 3.1797080039978027, + "learning_rate": 4.99828775313691e-05, + "loss": 6.7528, + "step": 1982 + }, + { + "epoch": 0.011793462746217528, + "grad_norm": 2.841120719909668, + "learning_rate": 4.99828602422898e-05, + "loss": 6.8, + "step": 1983 + }, + { + "epoch": 0.011799410029498525, + "grad_norm": 3.128098726272583, + "learning_rate": 4.998284294448925e-05, + "loss": 6.7574, + "step": 1984 + }, + { + "epoch": 0.011805357312779522, + "grad_norm": 2.7724568843841553, + "learning_rate": 4.998282563796744e-05, + "loss": 6.6119, + "step": 1985 + }, + { + "epoch": 0.01181130459606052, + "grad_norm": 2.8025269508361816, + "learning_rate": 4.998280832272439e-05, + "loss": 6.4676, + "step": 1986 + }, + { + "epoch": 0.011817251879341517, + "grad_norm": 2.5756618976593018, + "learning_rate": 4.99827909987601e-05, + "loss": 6.5421, + "step": 1987 + }, + { + "epoch": 0.011823199162622514, + "grad_norm": 2.9116249084472656, + "learning_rate": 4.998277366607457e-05, + "loss": 6.5446, + "step": 1988 + }, + { + "epoch": 0.01182914644590351, + "grad_norm": 2.571019411087036, + "learning_rate": 4.9982756324667815e-05, + "loss": 6.7898, + "step": 1989 + }, + { + "epoch": 0.01183509372918451, + "grad_norm": 2.818885326385498, + "learning_rate": 4.998273897453984e-05, + "loss": 6.6604, + "step": 1990 + }, + { + "epoch": 0.011841041012465506, + "grad_norm": 2.8561007976531982, + "learning_rate": 4.998272161569064e-05, + "loss": 6.5473, + "step": 1991 + }, + { + "epoch": 0.011846988295746503, + "grad_norm": 2.5539605617523193, + "learning_rate": 4.998270424812024e-05, + "loss": 6.5492, + "step": 1992 + }, + { + "epoch": 0.0118529355790275, + "grad_norm": 2.3242900371551514, + "learning_rate": 4.998268687182863e-05, + "loss": 6.4577, + "step": 1993 + }, + { + "epoch": 0.011858882862308498, + "grad_norm": 2.874807596206665, + "learning_rate": 4.998266948681582e-05, + "loss": 6.6071, + "step": 1994 + }, + { + "epoch": 0.011864830145589495, + "grad_norm": 2.9014296531677246, + "learning_rate": 4.9982652093081827e-05, + "loss": 7.2221, + "step": 1995 + }, + { + "epoch": 0.011870777428870492, + "grad_norm": 2.5874252319335938, + "learning_rate": 4.998263469062665e-05, + "loss": 6.593, + "step": 1996 + }, + { + "epoch": 0.011876724712151489, + "grad_norm": 2.4252052307128906, + "learning_rate": 4.998261727945028e-05, + "loss": 7.0138, + "step": 1997 + }, + { + "epoch": 0.011882671995432486, + "grad_norm": 2.3569211959838867, + "learning_rate": 4.998259985955275e-05, + "loss": 6.8743, + "step": 1998 + }, + { + "epoch": 0.011888619278713484, + "grad_norm": 2.560659408569336, + "learning_rate": 4.9982582430934045e-05, + "loss": 6.8926, + "step": 1999 + }, + { + "epoch": 0.011894566561994481, + "grad_norm": 2.0855636596679688, + "learning_rate": 4.9982564993594184e-05, + "loss": 7.1691, + "step": 2000 + }, + { + "epoch": 0.011900513845275478, + "grad_norm": 2.024829387664795, + "learning_rate": 4.998254754753316e-05, + "loss": 7.1797, + "step": 2001 + }, + { + "epoch": 0.011906461128556475, + "grad_norm": 2.093733549118042, + "learning_rate": 4.998253009275099e-05, + "loss": 6.9706, + "step": 2002 + }, + { + "epoch": 0.011912408411837473, + "grad_norm": 1.9211688041687012, + "learning_rate": 4.998251262924768e-05, + "loss": 7.018, + "step": 2003 + }, + { + "epoch": 0.01191835569511847, + "grad_norm": 2.3146321773529053, + "learning_rate": 4.998249515702323e-05, + "loss": 6.9384, + "step": 2004 + }, + { + "epoch": 0.011924302978399467, + "grad_norm": 2.346309185028076, + "learning_rate": 4.998247767607765e-05, + "loss": 6.5674, + "step": 2005 + }, + { + "epoch": 0.011930250261680464, + "grad_norm": 2.39471697807312, + "learning_rate": 4.998246018641094e-05, + "loss": 6.769, + "step": 2006 + }, + { + "epoch": 0.011936197544961462, + "grad_norm": 2.1689298152923584, + "learning_rate": 4.998244268802312e-05, + "loss": 7.0945, + "step": 2007 + }, + { + "epoch": 0.011942144828242459, + "grad_norm": 2.4209859371185303, + "learning_rate": 4.998242518091418e-05, + "loss": 6.98, + "step": 2008 + }, + { + "epoch": 0.011948092111523456, + "grad_norm": 2.6378684043884277, + "learning_rate": 4.998240766508414e-05, + "loss": 6.6833, + "step": 2009 + }, + { + "epoch": 0.011954039394804453, + "grad_norm": 2.2804839611053467, + "learning_rate": 4.9982390140532995e-05, + "loss": 6.7129, + "step": 2010 + }, + { + "epoch": 0.011959986678085451, + "grad_norm": 2.1788251399993896, + "learning_rate": 4.998237260726075e-05, + "loss": 7.0175, + "step": 2011 + }, + { + "epoch": 0.011965933961366448, + "grad_norm": 1.8988546133041382, + "learning_rate": 4.998235506526743e-05, + "loss": 7.0857, + "step": 2012 + }, + { + "epoch": 0.011971881244647445, + "grad_norm": 2.560107469558716, + "learning_rate": 4.9982337514553026e-05, + "loss": 7.0771, + "step": 2013 + }, + { + "epoch": 0.011977828527928442, + "grad_norm": 2.1771798133850098, + "learning_rate": 4.998231995511754e-05, + "loss": 7.071, + "step": 2014 + }, + { + "epoch": 0.01198377581120944, + "grad_norm": 1.9619860649108887, + "learning_rate": 4.998230238696098e-05, + "loss": 6.9109, + "step": 2015 + }, + { + "epoch": 0.011989723094490437, + "grad_norm": 2.16719126701355, + "learning_rate": 4.998228481008337e-05, + "loss": 6.903, + "step": 2016 + }, + { + "epoch": 0.011995670377771434, + "grad_norm": 2.4643077850341797, + "learning_rate": 4.998226722448469e-05, + "loss": 6.5301, + "step": 2017 + }, + { + "epoch": 0.01200161766105243, + "grad_norm": 2.5153393745422363, + "learning_rate": 4.9982249630164965e-05, + "loss": 7.107, + "step": 2018 + }, + { + "epoch": 0.01200756494433343, + "grad_norm": 2.6180920600891113, + "learning_rate": 4.998223202712419e-05, + "loss": 6.9905, + "step": 2019 + }, + { + "epoch": 0.012013512227614426, + "grad_norm": 2.333186149597168, + "learning_rate": 4.998221441536238e-05, + "loss": 7.074, + "step": 2020 + }, + { + "epoch": 0.012019459510895423, + "grad_norm": 2.138176918029785, + "learning_rate": 4.998219679487953e-05, + "loss": 7.0211, + "step": 2021 + }, + { + "epoch": 0.01202540679417642, + "grad_norm": 2.9845499992370605, + "learning_rate": 4.998217916567567e-05, + "loss": 6.7341, + "step": 2022 + }, + { + "epoch": 0.012031354077457418, + "grad_norm": 3.1216208934783936, + "learning_rate": 4.998216152775077e-05, + "loss": 7.1569, + "step": 2023 + }, + { + "epoch": 0.012037301360738415, + "grad_norm": 2.4693727493286133, + "learning_rate": 4.998214388110487e-05, + "loss": 6.6427, + "step": 2024 + }, + { + "epoch": 0.012043248644019412, + "grad_norm": 2.784562349319458, + "learning_rate": 4.9982126225737955e-05, + "loss": 6.6898, + "step": 2025 + }, + { + "epoch": 0.012049195927300409, + "grad_norm": 3.0549166202545166, + "learning_rate": 4.9982108561650036e-05, + "loss": 6.6004, + "step": 2026 + }, + { + "epoch": 0.012055143210581406, + "grad_norm": 2.565505266189575, + "learning_rate": 4.998209088884113e-05, + "loss": 6.5981, + "step": 2027 + }, + { + "epoch": 0.012061090493862404, + "grad_norm": 2.862548828125, + "learning_rate": 4.998207320731122e-05, + "loss": 6.4329, + "step": 2028 + }, + { + "epoch": 0.012067037777143401, + "grad_norm": 2.835280179977417, + "learning_rate": 4.998205551706033e-05, + "loss": 6.6854, + "step": 2029 + }, + { + "epoch": 0.012072985060424398, + "grad_norm": 2.4550364017486572, + "learning_rate": 4.9982037818088474e-05, + "loss": 6.7115, + "step": 2030 + }, + { + "epoch": 0.012078932343705395, + "grad_norm": 2.9977426528930664, + "learning_rate": 4.998202011039564e-05, + "loss": 6.341, + "step": 2031 + }, + { + "epoch": 0.012084879626986393, + "grad_norm": 2.258370876312256, + "learning_rate": 4.998200239398184e-05, + "loss": 6.7094, + "step": 2032 + }, + { + "epoch": 0.01209082691026739, + "grad_norm": 2.4484050273895264, + "learning_rate": 4.9981984668847085e-05, + "loss": 7.1115, + "step": 2033 + }, + { + "epoch": 0.012096774193548387, + "grad_norm": 2.4668514728546143, + "learning_rate": 4.9981966934991366e-05, + "loss": 6.9411, + "step": 2034 + }, + { + "epoch": 0.012102721476829384, + "grad_norm": 2.218479871749878, + "learning_rate": 4.998194919241471e-05, + "loss": 6.7175, + "step": 2035 + }, + { + "epoch": 0.012108668760110382, + "grad_norm": 2.201815366744995, + "learning_rate": 4.9981931441117115e-05, + "loss": 6.8684, + "step": 2036 + }, + { + "epoch": 0.012114616043391379, + "grad_norm": 2.4610331058502197, + "learning_rate": 4.998191368109858e-05, + "loss": 6.7214, + "step": 2037 + }, + { + "epoch": 0.012120563326672376, + "grad_norm": 2.7274906635284424, + "learning_rate": 4.998189591235912e-05, + "loss": 6.7611, + "step": 2038 + }, + { + "epoch": 0.012126510609953373, + "grad_norm": 2.7716658115386963, + "learning_rate": 4.9981878134898735e-05, + "loss": 6.7679, + "step": 2039 + }, + { + "epoch": 0.012132457893234371, + "grad_norm": 3.3206236362457275, + "learning_rate": 4.9981860348717434e-05, + "loss": 6.6283, + "step": 2040 + }, + { + "epoch": 0.012138405176515368, + "grad_norm": 2.511906862258911, + "learning_rate": 4.9981842553815225e-05, + "loss": 6.9537, + "step": 2041 + }, + { + "epoch": 0.012144352459796365, + "grad_norm": 2.7797024250030518, + "learning_rate": 4.998182475019212e-05, + "loss": 7.0488, + "step": 2042 + }, + { + "epoch": 0.012150299743077362, + "grad_norm": 3.523092031478882, + "learning_rate": 4.998180693784811e-05, + "loss": 6.9249, + "step": 2043 + }, + { + "epoch": 0.01215624702635836, + "grad_norm": 3.1001851558685303, + "learning_rate": 4.998178911678322e-05, + "loss": 7.0998, + "step": 2044 + }, + { + "epoch": 0.012162194309639357, + "grad_norm": 2.5291028022766113, + "learning_rate": 4.998177128699743e-05, + "loss": 6.8381, + "step": 2045 + }, + { + "epoch": 0.012168141592920354, + "grad_norm": 3.308398723602295, + "learning_rate": 4.998175344849077e-05, + "loss": 6.6849, + "step": 2046 + }, + { + "epoch": 0.01217408887620135, + "grad_norm": 3.4255475997924805, + "learning_rate": 4.998173560126323e-05, + "loss": 6.7816, + "step": 2047 + }, + { + "epoch": 0.01218003615948235, + "grad_norm": 3.4510817527770996, + "learning_rate": 4.998171774531484e-05, + "loss": 6.7961, + "step": 2048 + }, + { + "epoch": 0.012185983442763346, + "grad_norm": 3.15468168258667, + "learning_rate": 4.998169988064558e-05, + "loss": 6.9409, + "step": 2049 + }, + { + "epoch": 0.012191930726044343, + "grad_norm": 2.5568132400512695, + "learning_rate": 4.998168200725547e-05, + "loss": 6.8573, + "step": 2050 + }, + { + "epoch": 0.01219787800932534, + "grad_norm": 1.9745045900344849, + "learning_rate": 4.9981664125144515e-05, + "loss": 6.7948, + "step": 2051 + }, + { + "epoch": 0.012203825292606338, + "grad_norm": 2.2304463386535645, + "learning_rate": 4.9981646234312714e-05, + "loss": 6.6896, + "step": 2052 + }, + { + "epoch": 0.012209772575887335, + "grad_norm": 2.4391567707061768, + "learning_rate": 4.998162833476008e-05, + "loss": 6.7129, + "step": 2053 + }, + { + "epoch": 0.012215719859168332, + "grad_norm": 3.243905544281006, + "learning_rate": 4.9981610426486615e-05, + "loss": 7.0744, + "step": 2054 + }, + { + "epoch": 0.012221667142449329, + "grad_norm": 3.2596933841705322, + "learning_rate": 4.998159250949233e-05, + "loss": 6.9361, + "step": 2055 + }, + { + "epoch": 0.012227614425730327, + "grad_norm": 2.554436445236206, + "learning_rate": 4.998157458377723e-05, + "loss": 6.9354, + "step": 2056 + }, + { + "epoch": 0.012233561709011324, + "grad_norm": 2.3636975288391113, + "learning_rate": 4.998155664934132e-05, + "loss": 6.849, + "step": 2057 + }, + { + "epoch": 0.01223950899229232, + "grad_norm": 2.224684953689575, + "learning_rate": 4.99815387061846e-05, + "loss": 6.7011, + "step": 2058 + }, + { + "epoch": 0.012245456275573318, + "grad_norm": 2.6892964839935303, + "learning_rate": 4.9981520754307096e-05, + "loss": 6.753, + "step": 2059 + }, + { + "epoch": 0.012251403558854315, + "grad_norm": 2.7645084857940674, + "learning_rate": 4.9981502793708796e-05, + "loss": 6.5437, + "step": 2060 + }, + { + "epoch": 0.012257350842135313, + "grad_norm": 2.1315746307373047, + "learning_rate": 4.9981484824389716e-05, + "loss": 6.8843, + "step": 2061 + }, + { + "epoch": 0.01226329812541631, + "grad_norm": 2.6275408267974854, + "learning_rate": 4.998146684634984e-05, + "loss": 6.7275, + "step": 2062 + }, + { + "epoch": 0.012269245408697307, + "grad_norm": 2.530688762664795, + "learning_rate": 4.998144885958921e-05, + "loss": 6.6089, + "step": 2063 + }, + { + "epoch": 0.012275192691978304, + "grad_norm": 2.0959835052490234, + "learning_rate": 4.998143086410781e-05, + "loss": 6.7425, + "step": 2064 + }, + { + "epoch": 0.012281139975259302, + "grad_norm": 2.887242078781128, + "learning_rate": 4.998141285990565e-05, + "loss": 6.6867, + "step": 2065 + }, + { + "epoch": 0.012287087258540299, + "grad_norm": 2.430122137069702, + "learning_rate": 4.9981394846982734e-05, + "loss": 6.6636, + "step": 2066 + }, + { + "epoch": 0.012293034541821296, + "grad_norm": 2.269162654876709, + "learning_rate": 4.998137682533907e-05, + "loss": 7.1165, + "step": 2067 + }, + { + "epoch": 0.012298981825102293, + "grad_norm": 2.6741089820861816, + "learning_rate": 4.998135879497467e-05, + "loss": 6.6678, + "step": 2068 + }, + { + "epoch": 0.012304929108383291, + "grad_norm": 2.3362507820129395, + "learning_rate": 4.998134075588953e-05, + "loss": 7.0103, + "step": 2069 + }, + { + "epoch": 0.012310876391664288, + "grad_norm": 2.310638189315796, + "learning_rate": 4.9981322708083666e-05, + "loss": 6.9235, + "step": 2070 + }, + { + "epoch": 0.012316823674945285, + "grad_norm": 2.161853790283203, + "learning_rate": 4.998130465155708e-05, + "loss": 6.9392, + "step": 2071 + }, + { + "epoch": 0.012322770958226282, + "grad_norm": 2.2609059810638428, + "learning_rate": 4.9981286586309786e-05, + "loss": 6.888, + "step": 2072 + }, + { + "epoch": 0.01232871824150728, + "grad_norm": 2.6072967052459717, + "learning_rate": 4.998126851234177e-05, + "loss": 6.7739, + "step": 2073 + }, + { + "epoch": 0.012334665524788277, + "grad_norm": 3.092834711074829, + "learning_rate": 4.9981250429653056e-05, + "loss": 6.5529, + "step": 2074 + }, + { + "epoch": 0.012340612808069274, + "grad_norm": 2.303149461746216, + "learning_rate": 4.998123233824366e-05, + "loss": 6.618, + "step": 2075 + }, + { + "epoch": 0.01234656009135027, + "grad_norm": 2.888063907623291, + "learning_rate": 4.998121423811355e-05, + "loss": 6.9224, + "step": 2076 + }, + { + "epoch": 0.012352507374631269, + "grad_norm": 2.990727424621582, + "learning_rate": 4.998119612926277e-05, + "loss": 6.94, + "step": 2077 + }, + { + "epoch": 0.012358454657912266, + "grad_norm": 3.016002893447876, + "learning_rate": 4.998117801169131e-05, + "loss": 6.6231, + "step": 2078 + }, + { + "epoch": 0.012364401941193263, + "grad_norm": 2.057124614715576, + "learning_rate": 4.998115988539918e-05, + "loss": 6.803, + "step": 2079 + }, + { + "epoch": 0.01237034922447426, + "grad_norm": 2.371136426925659, + "learning_rate": 4.998114175038639e-05, + "loss": 6.8244, + "step": 2080 + }, + { + "epoch": 0.012376296507755258, + "grad_norm": 2.804365873336792, + "learning_rate": 4.998112360665292e-05, + "loss": 6.8787, + "step": 2081 + }, + { + "epoch": 0.012382243791036255, + "grad_norm": 3.4987633228302, + "learning_rate": 4.998110545419882e-05, + "loss": 6.6946, + "step": 2082 + }, + { + "epoch": 0.012388191074317252, + "grad_norm": 2.950608968734741, + "learning_rate": 4.998108729302407e-05, + "loss": 6.7915, + "step": 2083 + }, + { + "epoch": 0.012394138357598249, + "grad_norm": 2.4327776432037354, + "learning_rate": 4.998106912312868e-05, + "loss": 6.727, + "step": 2084 + }, + { + "epoch": 0.012400085640879247, + "grad_norm": 2.46014142036438, + "learning_rate": 4.998105094451265e-05, + "loss": 6.6797, + "step": 2085 + }, + { + "epoch": 0.012406032924160244, + "grad_norm": 2.947566270828247, + "learning_rate": 4.9981032757175995e-05, + "loss": 6.6401, + "step": 2086 + }, + { + "epoch": 0.01241198020744124, + "grad_norm": 2.5999064445495605, + "learning_rate": 4.9981014561118724e-05, + "loss": 6.58, + "step": 2087 + }, + { + "epoch": 0.012417927490722238, + "grad_norm": 2.9761807918548584, + "learning_rate": 4.9980996356340836e-05, + "loss": 6.8538, + "step": 2088 + }, + { + "epoch": 0.012423874774003236, + "grad_norm": 2.690925121307373, + "learning_rate": 4.9980978142842336e-05, + "loss": 6.9087, + "step": 2089 + }, + { + "epoch": 0.012429822057284233, + "grad_norm": 2.218524217605591, + "learning_rate": 4.998095992062325e-05, + "loss": 6.7221, + "step": 2090 + }, + { + "epoch": 0.01243576934056523, + "grad_norm": 2.630094051361084, + "learning_rate": 4.998094168968355e-05, + "loss": 6.7346, + "step": 2091 + }, + { + "epoch": 0.012441716623846227, + "grad_norm": 2.7839179039001465, + "learning_rate": 4.9980923450023276e-05, + "loss": 6.8668, + "step": 2092 + }, + { + "epoch": 0.012447663907127223, + "grad_norm": 2.422914743423462, + "learning_rate": 4.9980905201642415e-05, + "loss": 6.7953, + "step": 2093 + }, + { + "epoch": 0.012453611190408222, + "grad_norm": 2.525883674621582, + "learning_rate": 4.998088694454097e-05, + "loss": 6.6322, + "step": 2094 + }, + { + "epoch": 0.012459558473689219, + "grad_norm": 2.515536308288574, + "learning_rate": 4.998086867871896e-05, + "loss": 7.4297, + "step": 2095 + }, + { + "epoch": 0.012465505756970216, + "grad_norm": 2.689542055130005, + "learning_rate": 4.998085040417639e-05, + "loss": 7.4316, + "step": 2096 + }, + { + "epoch": 0.012471453040251212, + "grad_norm": 2.4374492168426514, + "learning_rate": 4.998083212091327e-05, + "loss": 6.8035, + "step": 2097 + }, + { + "epoch": 0.012477400323532211, + "grad_norm": 2.284153699874878, + "learning_rate": 4.998081382892959e-05, + "loss": 6.6644, + "step": 2098 + }, + { + "epoch": 0.012483347606813208, + "grad_norm": 2.113539218902588, + "learning_rate": 4.9980795528225366e-05, + "loss": 6.5201, + "step": 2099 + }, + { + "epoch": 0.012489294890094205, + "grad_norm": 2.2590157985687256, + "learning_rate": 4.998077721880061e-05, + "loss": 6.8074, + "step": 2100 + }, + { + "epoch": 0.012495242173375202, + "grad_norm": 2.077986717224121, + "learning_rate": 4.9980758900655316e-05, + "loss": 6.6986, + "step": 2101 + }, + { + "epoch": 0.0125011894566562, + "grad_norm": 2.495882987976074, + "learning_rate": 4.99807405737895e-05, + "loss": 6.6949, + "step": 2102 + }, + { + "epoch": 0.012507136739937197, + "grad_norm": 2.224621295928955, + "learning_rate": 4.998072223820317e-05, + "loss": 6.5723, + "step": 2103 + }, + { + "epoch": 0.012513084023218194, + "grad_norm": 2.515867233276367, + "learning_rate": 4.998070389389632e-05, + "loss": 6.4327, + "step": 2104 + }, + { + "epoch": 0.01251903130649919, + "grad_norm": 2.3134326934814453, + "learning_rate": 4.998068554086897e-05, + "loss": 6.2818, + "step": 2105 + }, + { + "epoch": 0.012524978589780189, + "grad_norm": 2.7688093185424805, + "learning_rate": 4.998066717912112e-05, + "loss": 6.4585, + "step": 2106 + }, + { + "epoch": 0.012530925873061186, + "grad_norm": 3.211790084838867, + "learning_rate": 4.998064880865277e-05, + "loss": 6.5227, + "step": 2107 + }, + { + "epoch": 0.012536873156342183, + "grad_norm": 2.9701578617095947, + "learning_rate": 4.998063042946395e-05, + "loss": 6.5674, + "step": 2108 + }, + { + "epoch": 0.01254282043962318, + "grad_norm": 2.1295664310455322, + "learning_rate": 4.998061204155463e-05, + "loss": 6.5697, + "step": 2109 + }, + { + "epoch": 0.012548767722904178, + "grad_norm": 2.841683864593506, + "learning_rate": 4.998059364492485e-05, + "loss": 6.453, + "step": 2110 + }, + { + "epoch": 0.012554715006185175, + "grad_norm": 2.481001615524292, + "learning_rate": 4.99805752395746e-05, + "loss": 6.555, + "step": 2111 + }, + { + "epoch": 0.012560662289466172, + "grad_norm": 2.357745885848999, + "learning_rate": 4.998055682550389e-05, + "loss": 6.7916, + "step": 2112 + }, + { + "epoch": 0.012566609572747169, + "grad_norm": 2.349417209625244, + "learning_rate": 4.9980538402712725e-05, + "loss": 6.7257, + "step": 2113 + }, + { + "epoch": 0.012572556856028167, + "grad_norm": 2.846930742263794, + "learning_rate": 4.998051997120111e-05, + "loss": 6.7095, + "step": 2114 + }, + { + "epoch": 0.012578504139309164, + "grad_norm": 2.362506628036499, + "learning_rate": 4.998050153096906e-05, + "loss": 6.675, + "step": 2115 + }, + { + "epoch": 0.01258445142259016, + "grad_norm": 2.3275344371795654, + "learning_rate": 4.998048308201656e-05, + "loss": 6.9031, + "step": 2116 + }, + { + "epoch": 0.012590398705871158, + "grad_norm": 2.194359540939331, + "learning_rate": 4.9980464624343644e-05, + "loss": 6.8258, + "step": 2117 + }, + { + "epoch": 0.012596345989152156, + "grad_norm": 2.3926312923431396, + "learning_rate": 4.99804461579503e-05, + "loss": 6.7136, + "step": 2118 + }, + { + "epoch": 0.012602293272433153, + "grad_norm": 2.7430222034454346, + "learning_rate": 4.9980427682836546e-05, + "loss": 6.5475, + "step": 2119 + }, + { + "epoch": 0.01260824055571415, + "grad_norm": 2.1563844680786133, + "learning_rate": 4.998040919900237e-05, + "loss": 6.7105, + "step": 2120 + }, + { + "epoch": 0.012614187838995147, + "grad_norm": 2.1061437129974365, + "learning_rate": 4.998039070644781e-05, + "loss": 6.6411, + "step": 2121 + }, + { + "epoch": 0.012620135122276143, + "grad_norm": 2.6192378997802734, + "learning_rate": 4.9980372205172844e-05, + "loss": 6.6831, + "step": 2122 + }, + { + "epoch": 0.012626082405557142, + "grad_norm": 2.794616222381592, + "learning_rate": 4.9980353695177495e-05, + "loss": 6.8128, + "step": 2123 + }, + { + "epoch": 0.012632029688838139, + "grad_norm": 2.3656489849090576, + "learning_rate": 4.998033517646176e-05, + "loss": 6.8109, + "step": 2124 + }, + { + "epoch": 0.012637976972119136, + "grad_norm": 2.658433437347412, + "learning_rate": 4.998031664902564e-05, + "loss": 6.7979, + "step": 2125 + }, + { + "epoch": 0.012643924255400132, + "grad_norm": 2.889954090118408, + "learning_rate": 4.9980298112869154e-05, + "loss": 6.6745, + "step": 2126 + }, + { + "epoch": 0.012649871538681131, + "grad_norm": 2.469790458679199, + "learning_rate": 4.9980279567992304e-05, + "loss": 6.7056, + "step": 2127 + }, + { + "epoch": 0.012655818821962128, + "grad_norm": 2.4310262203216553, + "learning_rate": 4.9980261014395094e-05, + "loss": 6.8809, + "step": 2128 + }, + { + "epoch": 0.012661766105243125, + "grad_norm": 2.772359609603882, + "learning_rate": 4.998024245207754e-05, + "loss": 7.0383, + "step": 2129 + }, + { + "epoch": 0.012667713388524121, + "grad_norm": 2.292144775390625, + "learning_rate": 4.9980223881039635e-05, + "loss": 6.9062, + "step": 2130 + }, + { + "epoch": 0.01267366067180512, + "grad_norm": 2.590363025665283, + "learning_rate": 4.998020530128139e-05, + "loss": 6.5803, + "step": 2131 + }, + { + "epoch": 0.012679607955086117, + "grad_norm": 2.78432035446167, + "learning_rate": 4.9980186712802824e-05, + "loss": 6.788, + "step": 2132 + }, + { + "epoch": 0.012685555238367114, + "grad_norm": 2.6188290119171143, + "learning_rate": 4.998016811560392e-05, + "loss": 6.5827, + "step": 2133 + }, + { + "epoch": 0.01269150252164811, + "grad_norm": 2.868215560913086, + "learning_rate": 4.99801495096847e-05, + "loss": 6.5845, + "step": 2134 + }, + { + "epoch": 0.012697449804929109, + "grad_norm": 2.4738945960998535, + "learning_rate": 4.998013089504518e-05, + "loss": 6.5019, + "step": 2135 + }, + { + "epoch": 0.012703397088210106, + "grad_norm": 2.5315287113189697, + "learning_rate": 4.998011227168534e-05, + "loss": 6.6765, + "step": 2136 + }, + { + "epoch": 0.012709344371491103, + "grad_norm": 2.7871086597442627, + "learning_rate": 4.998009363960521e-05, + "loss": 6.64, + "step": 2137 + }, + { + "epoch": 0.0127152916547721, + "grad_norm": 2.267502784729004, + "learning_rate": 4.998007499880479e-05, + "loss": 6.8665, + "step": 2138 + }, + { + "epoch": 0.012721238938053098, + "grad_norm": 2.5014212131500244, + "learning_rate": 4.998005634928408e-05, + "loss": 6.6757, + "step": 2139 + }, + { + "epoch": 0.012727186221334095, + "grad_norm": 2.3600070476531982, + "learning_rate": 4.998003769104308e-05, + "loss": 6.5425, + "step": 2140 + }, + { + "epoch": 0.012733133504615092, + "grad_norm": 2.32123064994812, + "learning_rate": 4.998001902408182e-05, + "loss": 6.5192, + "step": 2141 + }, + { + "epoch": 0.012739080787896088, + "grad_norm": 2.5059258937835693, + "learning_rate": 4.998000034840029e-05, + "loss": 6.6315, + "step": 2142 + }, + { + "epoch": 0.012745028071177087, + "grad_norm": 2.2143092155456543, + "learning_rate": 4.99799816639985e-05, + "loss": 6.6058, + "step": 2143 + }, + { + "epoch": 0.012750975354458084, + "grad_norm": 2.3660342693328857, + "learning_rate": 4.997996297087645e-05, + "loss": 6.554, + "step": 2144 + }, + { + "epoch": 0.01275692263773908, + "grad_norm": 2.4286036491394043, + "learning_rate": 4.9979944269034164e-05, + "loss": 6.4857, + "step": 2145 + }, + { + "epoch": 0.012762869921020078, + "grad_norm": 2.4002180099487305, + "learning_rate": 4.997992555847163e-05, + "loss": 6.5083, + "step": 2146 + }, + { + "epoch": 0.012768817204301076, + "grad_norm": 2.418942451477051, + "learning_rate": 4.997990683918886e-05, + "loss": 6.5471, + "step": 2147 + }, + { + "epoch": 0.012774764487582073, + "grad_norm": 2.535654067993164, + "learning_rate": 4.997988811118587e-05, + "loss": 6.5999, + "step": 2148 + }, + { + "epoch": 0.01278071177086307, + "grad_norm": 2.581505298614502, + "learning_rate": 4.9979869374462655e-05, + "loss": 6.2525, + "step": 2149 + }, + { + "epoch": 0.012786659054144067, + "grad_norm": 2.681297779083252, + "learning_rate": 4.997985062901923e-05, + "loss": 6.1463, + "step": 2150 + }, + { + "epoch": 0.012792606337425065, + "grad_norm": 2.3542990684509277, + "learning_rate": 4.997983187485559e-05, + "loss": 6.433, + "step": 2151 + }, + { + "epoch": 0.012798553620706062, + "grad_norm": 2.2994048595428467, + "learning_rate": 4.997981311197175e-05, + "loss": 6.5952, + "step": 2152 + }, + { + "epoch": 0.012804500903987059, + "grad_norm": 2.4703454971313477, + "learning_rate": 4.9979794340367724e-05, + "loss": 6.5581, + "step": 2153 + }, + { + "epoch": 0.012810448187268056, + "grad_norm": 2.511383533477783, + "learning_rate": 4.9979775560043504e-05, + "loss": 6.577, + "step": 2154 + }, + { + "epoch": 0.012816395470549052, + "grad_norm": 2.3300156593322754, + "learning_rate": 4.99797567709991e-05, + "loss": 6.4349, + "step": 2155 + }, + { + "epoch": 0.012822342753830051, + "grad_norm": 2.523878574371338, + "learning_rate": 4.997973797323452e-05, + "loss": 6.5044, + "step": 2156 + }, + { + "epoch": 0.012828290037111048, + "grad_norm": 2.4185073375701904, + "learning_rate": 4.9979719166749776e-05, + "loss": 6.537, + "step": 2157 + }, + { + "epoch": 0.012834237320392045, + "grad_norm": 2.324090003967285, + "learning_rate": 4.997970035154487e-05, + "loss": 6.803, + "step": 2158 + }, + { + "epoch": 0.012840184603673041, + "grad_norm": 2.468872547149658, + "learning_rate": 4.9979681527619804e-05, + "loss": 7.0837, + "step": 2159 + }, + { + "epoch": 0.01284613188695404, + "grad_norm": 2.1467936038970947, + "learning_rate": 4.99796626949746e-05, + "loss": 6.7373, + "step": 2160 + }, + { + "epoch": 0.012852079170235037, + "grad_norm": 2.3208062648773193, + "learning_rate": 4.9979643853609246e-05, + "loss": 6.5483, + "step": 2161 + }, + { + "epoch": 0.012858026453516034, + "grad_norm": 2.2797584533691406, + "learning_rate": 4.997962500352376e-05, + "loss": 6.5857, + "step": 2162 + }, + { + "epoch": 0.01286397373679703, + "grad_norm": 2.3447721004486084, + "learning_rate": 4.9979606144718135e-05, + "loss": 6.8511, + "step": 2163 + }, + { + "epoch": 0.012869921020078029, + "grad_norm": 2.6456334590911865, + "learning_rate": 4.9979587277192395e-05, + "loss": 6.9457, + "step": 2164 + }, + { + "epoch": 0.012875868303359026, + "grad_norm": 3.2567737102508545, + "learning_rate": 4.997956840094654e-05, + "loss": 6.6405, + "step": 2165 + }, + { + "epoch": 0.012881815586640023, + "grad_norm": 2.847371816635132, + "learning_rate": 4.9979549515980574e-05, + "loss": 6.751, + "step": 2166 + }, + { + "epoch": 0.01288776286992102, + "grad_norm": 2.999779462814331, + "learning_rate": 4.99795306222945e-05, + "loss": 6.7437, + "step": 2167 + }, + { + "epoch": 0.012893710153202018, + "grad_norm": 2.3793458938598633, + "learning_rate": 4.9979511719888336e-05, + "loss": 6.6864, + "step": 2168 + }, + { + "epoch": 0.012899657436483015, + "grad_norm": 2.284724473953247, + "learning_rate": 4.9979492808762084e-05, + "loss": 6.4237, + "step": 2169 + }, + { + "epoch": 0.012905604719764012, + "grad_norm": 2.560758352279663, + "learning_rate": 4.997947388891575e-05, + "loss": 6.5964, + "step": 2170 + }, + { + "epoch": 0.012911552003045008, + "grad_norm": 2.7461421489715576, + "learning_rate": 4.997945496034934e-05, + "loss": 6.5354, + "step": 2171 + }, + { + "epoch": 0.012917499286326007, + "grad_norm": 3.0868208408355713, + "learning_rate": 4.9979436023062854e-05, + "loss": 6.6445, + "step": 2172 + }, + { + "epoch": 0.012923446569607004, + "grad_norm": 2.565009593963623, + "learning_rate": 4.997941707705631e-05, + "loss": 6.6015, + "step": 2173 + }, + { + "epoch": 0.012929393852888, + "grad_norm": 2.9424686431884766, + "learning_rate": 4.997939812232971e-05, + "loss": 6.4887, + "step": 2174 + }, + { + "epoch": 0.012935341136168997, + "grad_norm": 3.0674476623535156, + "learning_rate": 4.997937915888305e-05, + "loss": 6.4728, + "step": 2175 + }, + { + "epoch": 0.012941288419449996, + "grad_norm": 3.040189266204834, + "learning_rate": 4.997936018671636e-05, + "loss": 6.3788, + "step": 2176 + }, + { + "epoch": 0.012947235702730993, + "grad_norm": 2.756211042404175, + "learning_rate": 4.9979341205829626e-05, + "loss": 6.4167, + "step": 2177 + }, + { + "epoch": 0.01295318298601199, + "grad_norm": 2.6333322525024414, + "learning_rate": 4.997932221622287e-05, + "loss": 6.6392, + "step": 2178 + }, + { + "epoch": 0.012959130269292986, + "grad_norm": 2.6951076984405518, + "learning_rate": 4.997930321789608e-05, + "loss": 6.3299, + "step": 2179 + }, + { + "epoch": 0.012965077552573985, + "grad_norm": 2.5388028621673584, + "learning_rate": 4.997928421084928e-05, + "loss": 6.2646, + "step": 2180 + }, + { + "epoch": 0.012971024835854982, + "grad_norm": 3.312171459197998, + "learning_rate": 4.997926519508247e-05, + "loss": 6.6331, + "step": 2181 + }, + { + "epoch": 0.012976972119135979, + "grad_norm": 3.437025547027588, + "learning_rate": 4.997924617059565e-05, + "loss": 5.5981, + "step": 2182 + }, + { + "epoch": 0.012982919402416975, + "grad_norm": 2.74035906791687, + "learning_rate": 4.997922713738884e-05, + "loss": 5.1641, + "step": 2183 + }, + { + "epoch": 0.012988866685697972, + "grad_norm": 2.618525505065918, + "learning_rate": 4.9979208095462036e-05, + "loss": 5.9978, + "step": 2184 + }, + { + "epoch": 0.012994813968978971, + "grad_norm": 2.633692502975464, + "learning_rate": 4.9979189044815254e-05, + "loss": 6.2812, + "step": 2185 + }, + { + "epoch": 0.013000761252259968, + "grad_norm": 2.087557792663574, + "learning_rate": 4.997916998544849e-05, + "loss": 6.2864, + "step": 2186 + }, + { + "epoch": 0.013006708535540965, + "grad_norm": 3.365112066268921, + "learning_rate": 4.997915091736176e-05, + "loss": 5.3517, + "step": 2187 + }, + { + "epoch": 0.013012655818821961, + "grad_norm": 2.7561593055725098, + "learning_rate": 4.997913184055506e-05, + "loss": 6.3667, + "step": 2188 + }, + { + "epoch": 0.01301860310210296, + "grad_norm": 2.630976676940918, + "learning_rate": 4.9979112755028415e-05, + "loss": 6.5858, + "step": 2189 + }, + { + "epoch": 0.013024550385383957, + "grad_norm": 2.56007981300354, + "learning_rate": 4.9979093660781805e-05, + "loss": 6.6862, + "step": 2190 + }, + { + "epoch": 0.013030497668664954, + "grad_norm": 2.509631633758545, + "learning_rate": 4.997907455781526e-05, + "loss": 6.4699, + "step": 2191 + }, + { + "epoch": 0.01303644495194595, + "grad_norm": 2.442028522491455, + "learning_rate": 4.997905544612878e-05, + "loss": 6.5755, + "step": 2192 + }, + { + "epoch": 0.013042392235226949, + "grad_norm": 2.561016321182251, + "learning_rate": 4.997903632572236e-05, + "loss": 6.4529, + "step": 2193 + }, + { + "epoch": 0.013048339518507946, + "grad_norm": 2.585753917694092, + "learning_rate": 4.9979017196596025e-05, + "loss": 6.188, + "step": 2194 + }, + { + "epoch": 0.013054286801788943, + "grad_norm": 2.3657655715942383, + "learning_rate": 4.997899805874977e-05, + "loss": 6.1414, + "step": 2195 + }, + { + "epoch": 0.01306023408506994, + "grad_norm": 2.818251609802246, + "learning_rate": 4.997897891218361e-05, + "loss": 6.5276, + "step": 2196 + }, + { + "epoch": 0.013066181368350938, + "grad_norm": 2.9687695503234863, + "learning_rate": 4.997895975689754e-05, + "loss": 6.131, + "step": 2197 + }, + { + "epoch": 0.013072128651631935, + "grad_norm": 2.8505353927612305, + "learning_rate": 4.997894059289157e-05, + "loss": 6.5269, + "step": 2198 + }, + { + "epoch": 0.013078075934912932, + "grad_norm": 2.331573486328125, + "learning_rate": 4.997892142016573e-05, + "loss": 6.1101, + "step": 2199 + }, + { + "epoch": 0.013084023218193928, + "grad_norm": 2.3241569995880127, + "learning_rate": 4.997890223871998e-05, + "loss": 6.5081, + "step": 2200 + }, + { + "epoch": 0.013089970501474927, + "grad_norm": 2.658834218978882, + "learning_rate": 4.997888304855437e-05, + "loss": 6.554, + "step": 2201 + }, + { + "epoch": 0.013095917784755924, + "grad_norm": 2.703911304473877, + "learning_rate": 4.997886384966889e-05, + "loss": 6.337, + "step": 2202 + }, + { + "epoch": 0.01310186506803692, + "grad_norm": 3.020775318145752, + "learning_rate": 4.997884464206354e-05, + "loss": 6.4375, + "step": 2203 + }, + { + "epoch": 0.013107812351317917, + "grad_norm": 3.324218273162842, + "learning_rate": 4.9978825425738334e-05, + "loss": 6.4871, + "step": 2204 + }, + { + "epoch": 0.013113759634598916, + "grad_norm": 3.822019577026367, + "learning_rate": 4.9978806200693276e-05, + "loss": 6.6372, + "step": 2205 + }, + { + "epoch": 0.013119706917879913, + "grad_norm": 3.3639512062072754, + "learning_rate": 4.997878696692838e-05, + "loss": 6.1826, + "step": 2206 + }, + { + "epoch": 0.01312565420116091, + "grad_norm": 3.580603837966919, + "learning_rate": 4.997876772444365e-05, + "loss": 6.793, + "step": 2207 + }, + { + "epoch": 0.013131601484441906, + "grad_norm": 2.472733497619629, + "learning_rate": 4.9978748473239084e-05, + "loss": 6.9054, + "step": 2208 + }, + { + "epoch": 0.013137548767722905, + "grad_norm": 3.327461004257202, + "learning_rate": 4.99787292133147e-05, + "loss": 6.6735, + "step": 2209 + }, + { + "epoch": 0.013143496051003902, + "grad_norm": 3.493234157562256, + "learning_rate": 4.99787099446705e-05, + "loss": 6.9702, + "step": 2210 + }, + { + "epoch": 0.013149443334284899, + "grad_norm": 2.2516424655914307, + "learning_rate": 4.9978690667306483e-05, + "loss": 7.196, + "step": 2211 + }, + { + "epoch": 0.013155390617565895, + "grad_norm": 1.8846355676651, + "learning_rate": 4.9978671381222665e-05, + "loss": 7.0373, + "step": 2212 + }, + { + "epoch": 0.013161337900846894, + "grad_norm": 2.9334232807159424, + "learning_rate": 4.997865208641906e-05, + "loss": 6.2065, + "step": 2213 + }, + { + "epoch": 0.01316728518412789, + "grad_norm": 2.713006019592285, + "learning_rate": 4.997863278289565e-05, + "loss": 6.788, + "step": 2214 + }, + { + "epoch": 0.013173232467408888, + "grad_norm": 2.6246018409729004, + "learning_rate": 4.9978613470652466e-05, + "loss": 6.7979, + "step": 2215 + }, + { + "epoch": 0.013179179750689884, + "grad_norm": 2.2770373821258545, + "learning_rate": 4.997859414968951e-05, + "loss": 6.8307, + "step": 2216 + }, + { + "epoch": 0.013185127033970881, + "grad_norm": 2.6244993209838867, + "learning_rate": 4.997857482000679e-05, + "loss": 6.3176, + "step": 2217 + }, + { + "epoch": 0.01319107431725188, + "grad_norm": 3.4668054580688477, + "learning_rate": 4.997855548160429e-05, + "loss": 6.8962, + "step": 2218 + }, + { + "epoch": 0.013197021600532877, + "grad_norm": 2.711785078048706, + "learning_rate": 4.9978536134482047e-05, + "loss": 6.7111, + "step": 2219 + }, + { + "epoch": 0.013202968883813873, + "grad_norm": 2.6757078170776367, + "learning_rate": 4.997851677864005e-05, + "loss": 6.5501, + "step": 2220 + }, + { + "epoch": 0.01320891616709487, + "grad_norm": 2.150338888168335, + "learning_rate": 4.997849741407831e-05, + "loss": 6.43, + "step": 2221 + }, + { + "epoch": 0.013214863450375869, + "grad_norm": 3.115309953689575, + "learning_rate": 4.9978478040796836e-05, + "loss": 6.4074, + "step": 2222 + }, + { + "epoch": 0.013220810733656866, + "grad_norm": 2.8754189014434814, + "learning_rate": 4.997845865879564e-05, + "loss": 6.2663, + "step": 2223 + }, + { + "epoch": 0.013226758016937862, + "grad_norm": 2.6169707775115967, + "learning_rate": 4.9978439268074716e-05, + "loss": 6.5987, + "step": 2224 + }, + { + "epoch": 0.01323270530021886, + "grad_norm": 2.3814637660980225, + "learning_rate": 4.997841986863408e-05, + "loss": 6.8124, + "step": 2225 + }, + { + "epoch": 0.013238652583499858, + "grad_norm": 2.0276811122894287, + "learning_rate": 4.997840046047373e-05, + "loss": 6.6632, + "step": 2226 + }, + { + "epoch": 0.013244599866780855, + "grad_norm": 2.7943263053894043, + "learning_rate": 4.997838104359368e-05, + "loss": 6.5452, + "step": 2227 + }, + { + "epoch": 0.013250547150061852, + "grad_norm": 2.4058234691619873, + "learning_rate": 4.997836161799393e-05, + "loss": 6.4697, + "step": 2228 + }, + { + "epoch": 0.013256494433342848, + "grad_norm": 2.2487008571624756, + "learning_rate": 4.9978342183674504e-05, + "loss": 6.3361, + "step": 2229 + }, + { + "epoch": 0.013262441716623847, + "grad_norm": 2.3470170497894287, + "learning_rate": 4.997832274063539e-05, + "loss": 6.4024, + "step": 2230 + }, + { + "epoch": 0.013268388999904844, + "grad_norm": 2.589695692062378, + "learning_rate": 4.9978303288876606e-05, + "loss": 6.4184, + "step": 2231 + }, + { + "epoch": 0.01327433628318584, + "grad_norm": 2.691371440887451, + "learning_rate": 4.997828382839815e-05, + "loss": 6.4225, + "step": 2232 + }, + { + "epoch": 0.013280283566466837, + "grad_norm": 3.110410213470459, + "learning_rate": 4.997826435920003e-05, + "loss": 6.5307, + "step": 2233 + }, + { + "epoch": 0.013286230849747836, + "grad_norm": 2.688519239425659, + "learning_rate": 4.9978244881282266e-05, + "loss": 6.568, + "step": 2234 + }, + { + "epoch": 0.013292178133028833, + "grad_norm": 2.3346059322357178, + "learning_rate": 4.997822539464485e-05, + "loss": 6.8837, + "step": 2235 + }, + { + "epoch": 0.01329812541630983, + "grad_norm": 2.679826021194458, + "learning_rate": 4.997820589928779e-05, + "loss": 6.3961, + "step": 2236 + }, + { + "epoch": 0.013304072699590826, + "grad_norm": 2.388120412826538, + "learning_rate": 4.99781863952111e-05, + "loss": 6.4363, + "step": 2237 + }, + { + "epoch": 0.013310019982871825, + "grad_norm": 2.834341049194336, + "learning_rate": 4.997816688241478e-05, + "loss": 6.4855, + "step": 2238 + }, + { + "epoch": 0.013315967266152822, + "grad_norm": 2.8623831272125244, + "learning_rate": 4.997814736089885e-05, + "loss": 6.8607, + "step": 2239 + }, + { + "epoch": 0.013321914549433819, + "grad_norm": 3.001241683959961, + "learning_rate": 4.99781278306633e-05, + "loss": 6.9777, + "step": 2240 + }, + { + "epoch": 0.013327861832714815, + "grad_norm": 2.9721016883850098, + "learning_rate": 4.9978108291708135e-05, + "loss": 6.9821, + "step": 2241 + }, + { + "epoch": 0.013333809115995814, + "grad_norm": 2.798360824584961, + "learning_rate": 4.997808874403338e-05, + "loss": 7.0096, + "step": 2242 + }, + { + "epoch": 0.01333975639927681, + "grad_norm": 3.2242093086242676, + "learning_rate": 4.997806918763903e-05, + "loss": 6.9091, + "step": 2243 + }, + { + "epoch": 0.013345703682557808, + "grad_norm": 2.681920289993286, + "learning_rate": 4.99780496225251e-05, + "loss": 6.7769, + "step": 2244 + }, + { + "epoch": 0.013351650965838804, + "grad_norm": 3.199514865875244, + "learning_rate": 4.9978030048691584e-05, + "loss": 6.6202, + "step": 2245 + }, + { + "epoch": 0.013357598249119801, + "grad_norm": 2.89886474609375, + "learning_rate": 4.9978010466138496e-05, + "loss": 6.7075, + "step": 2246 + }, + { + "epoch": 0.0133635455324008, + "grad_norm": 2.7091262340545654, + "learning_rate": 4.997799087486584e-05, + "loss": 6.9129, + "step": 2247 + }, + { + "epoch": 0.013369492815681797, + "grad_norm": 2.2538888454437256, + "learning_rate": 4.997797127487364e-05, + "loss": 6.6412, + "step": 2248 + }, + { + "epoch": 0.013375440098962793, + "grad_norm": 2.668286085128784, + "learning_rate": 4.997795166616187e-05, + "loss": 6.8506, + "step": 2249 + }, + { + "epoch": 0.01338138738224379, + "grad_norm": 3.915975570678711, + "learning_rate": 4.997793204873057e-05, + "loss": 6.567, + "step": 2250 + }, + { + "epoch": 0.013387334665524789, + "grad_norm": 2.5549614429473877, + "learning_rate": 4.997791242257972e-05, + "loss": 6.7971, + "step": 2251 + }, + { + "epoch": 0.013393281948805786, + "grad_norm": 2.511810064315796, + "learning_rate": 4.997789278770935e-05, + "loss": 7.1949, + "step": 2252 + }, + { + "epoch": 0.013399229232086782, + "grad_norm": 2.026937484741211, + "learning_rate": 4.9977873144119445e-05, + "loss": 7.2067, + "step": 2253 + }, + { + "epoch": 0.01340517651536778, + "grad_norm": 3.6016058921813965, + "learning_rate": 4.997785349181002e-05, + "loss": 6.549, + "step": 2254 + }, + { + "epoch": 0.013411123798648778, + "grad_norm": 2.867418050765991, + "learning_rate": 4.9977833830781094e-05, + "loss": 6.5562, + "step": 2255 + }, + { + "epoch": 0.013417071081929775, + "grad_norm": 2.2168800830841064, + "learning_rate": 4.9977814161032665e-05, + "loss": 7.1798, + "step": 2256 + }, + { + "epoch": 0.013423018365210771, + "grad_norm": 2.728299856185913, + "learning_rate": 4.997779448256473e-05, + "loss": 6.9314, + "step": 2257 + }, + { + "epoch": 0.013428965648491768, + "grad_norm": 2.7336437702178955, + "learning_rate": 4.997777479537732e-05, + "loss": 7.0643, + "step": 2258 + }, + { + "epoch": 0.013434912931772767, + "grad_norm": 3.1546053886413574, + "learning_rate": 4.997775509947041e-05, + "loss": 6.8853, + "step": 2259 + }, + { + "epoch": 0.013440860215053764, + "grad_norm": 3.037036180496216, + "learning_rate": 4.997773539484404e-05, + "loss": 6.6892, + "step": 2260 + }, + { + "epoch": 0.01344680749833476, + "grad_norm": 2.8779382705688477, + "learning_rate": 4.997771568149818e-05, + "loss": 6.4991, + "step": 2261 + }, + { + "epoch": 0.013452754781615757, + "grad_norm": 3.1105282306671143, + "learning_rate": 4.997769595943288e-05, + "loss": 6.4253, + "step": 2262 + }, + { + "epoch": 0.013458702064896756, + "grad_norm": 4.604808330535889, + "learning_rate": 4.997767622864811e-05, + "loss": 6.504, + "step": 2263 + }, + { + "epoch": 0.013464649348177753, + "grad_norm": 4.345273017883301, + "learning_rate": 4.9977656489143896e-05, + "loss": 6.2, + "step": 2264 + }, + { + "epoch": 0.01347059663145875, + "grad_norm": 2.9744133949279785, + "learning_rate": 4.9977636740920243e-05, + "loss": 6.5458, + "step": 2265 + }, + { + "epoch": 0.013476543914739746, + "grad_norm": 3.3981447219848633, + "learning_rate": 4.9977616983977146e-05, + "loss": 6.9791, + "step": 2266 + }, + { + "epoch": 0.013482491198020745, + "grad_norm": 2.5855109691619873, + "learning_rate": 4.997759721831463e-05, + "loss": 6.7425, + "step": 2267 + }, + { + "epoch": 0.013488438481301742, + "grad_norm": 3.961195707321167, + "learning_rate": 4.997757744393269e-05, + "loss": 6.4042, + "step": 2268 + }, + { + "epoch": 0.013494385764582739, + "grad_norm": 3.8216230869293213, + "learning_rate": 4.997755766083133e-05, + "loss": 6.4962, + "step": 2269 + }, + { + "epoch": 0.013500333047863735, + "grad_norm": 3.077279567718506, + "learning_rate": 4.9977537869010574e-05, + "loss": 6.4298, + "step": 2270 + }, + { + "epoch": 0.013506280331144734, + "grad_norm": 2.56152081489563, + "learning_rate": 4.9977518068470406e-05, + "loss": 6.35, + "step": 2271 + }, + { + "epoch": 0.01351222761442573, + "grad_norm": 2.4069855213165283, + "learning_rate": 4.9977498259210854e-05, + "loss": 6.2923, + "step": 2272 + }, + { + "epoch": 0.013518174897706728, + "grad_norm": 2.9591124057769775, + "learning_rate": 4.9977478441231904e-05, + "loss": 6.2477, + "step": 2273 + }, + { + "epoch": 0.013524122180987724, + "grad_norm": 2.627110481262207, + "learning_rate": 4.997745861453359e-05, + "loss": 6.1012, + "step": 2274 + }, + { + "epoch": 0.013530069464268723, + "grad_norm": 2.3042867183685303, + "learning_rate": 4.997743877911589e-05, + "loss": 6.1155, + "step": 2275 + }, + { + "epoch": 0.01353601674754972, + "grad_norm": 2.709324359893799, + "learning_rate": 4.997741893497882e-05, + "loss": 6.0103, + "step": 2276 + }, + { + "epoch": 0.013541964030830717, + "grad_norm": 2.7087934017181396, + "learning_rate": 4.997739908212241e-05, + "loss": 6.0709, + "step": 2277 + }, + { + "epoch": 0.013547911314111713, + "grad_norm": 3.560149669647217, + "learning_rate": 4.997737922054664e-05, + "loss": 6.1775, + "step": 2278 + }, + { + "epoch": 0.01355385859739271, + "grad_norm": 4.623898506164551, + "learning_rate": 4.997735935025152e-05, + "loss": 6.1993, + "step": 2279 + }, + { + "epoch": 0.013559805880673709, + "grad_norm": 2.9960882663726807, + "learning_rate": 4.997733947123707e-05, + "loss": 6.4211, + "step": 2280 + }, + { + "epoch": 0.013565753163954706, + "grad_norm": 3.8918421268463135, + "learning_rate": 4.9977319583503276e-05, + "loss": 6.0194, + "step": 2281 + }, + { + "epoch": 0.013571700447235702, + "grad_norm": 3.4164741039276123, + "learning_rate": 4.997729968705017e-05, + "loss": 5.9824, + "step": 2282 + }, + { + "epoch": 0.0135776477305167, + "grad_norm": 2.4005794525146484, + "learning_rate": 4.997727978187774e-05, + "loss": 5.9727, + "step": 2283 + }, + { + "epoch": 0.013583595013797698, + "grad_norm": 2.4654550552368164, + "learning_rate": 4.9977259867986e-05, + "loss": 6.2681, + "step": 2284 + }, + { + "epoch": 0.013589542297078695, + "grad_norm": 3.193905830383301, + "learning_rate": 4.997723994537496e-05, + "loss": 6.4996, + "step": 2285 + }, + { + "epoch": 0.013595489580359691, + "grad_norm": 2.4845757484436035, + "learning_rate": 4.997722001404462e-05, + "loss": 7.0464, + "step": 2286 + }, + { + "epoch": 0.013601436863640688, + "grad_norm": 3.170182466506958, + "learning_rate": 4.9977200073995e-05, + "loss": 6.1071, + "step": 2287 + }, + { + "epoch": 0.013607384146921687, + "grad_norm": 2.2331149578094482, + "learning_rate": 4.997718012522609e-05, + "loss": 6.6823, + "step": 2288 + }, + { + "epoch": 0.013613331430202684, + "grad_norm": 2.4146671295166016, + "learning_rate": 4.9977160167737904e-05, + "loss": 6.4398, + "step": 2289 + }, + { + "epoch": 0.01361927871348368, + "grad_norm": 3.23956561088562, + "learning_rate": 4.9977140201530445e-05, + "loss": 6.9295, + "step": 2290 + }, + { + "epoch": 0.013625225996764677, + "grad_norm": 3.402979850769043, + "learning_rate": 4.997712022660374e-05, + "loss": 6.7116, + "step": 2291 + }, + { + "epoch": 0.013631173280045676, + "grad_norm": 3.241320848464966, + "learning_rate": 4.997710024295777e-05, + "loss": 6.8871, + "step": 2292 + }, + { + "epoch": 0.013637120563326673, + "grad_norm": 2.5378634929656982, + "learning_rate": 4.997708025059255e-05, + "loss": 6.9548, + "step": 2293 + }, + { + "epoch": 0.01364306784660767, + "grad_norm": 3.1968839168548584, + "learning_rate": 4.9977060249508087e-05, + "loss": 6.6388, + "step": 2294 + }, + { + "epoch": 0.013649015129888666, + "grad_norm": 2.6951656341552734, + "learning_rate": 4.99770402397044e-05, + "loss": 6.9654, + "step": 2295 + }, + { + "epoch": 0.013654962413169665, + "grad_norm": 2.4168484210968018, + "learning_rate": 4.997702022118147e-05, + "loss": 6.6666, + "step": 2296 + }, + { + "epoch": 0.013660909696450662, + "grad_norm": 3.1395177841186523, + "learning_rate": 4.997700019393934e-05, + "loss": 6.4957, + "step": 2297 + }, + { + "epoch": 0.013666856979731658, + "grad_norm": 3.1591687202453613, + "learning_rate": 4.9976980157977985e-05, + "loss": 6.4392, + "step": 2298 + }, + { + "epoch": 0.013672804263012655, + "grad_norm": 2.2415151596069336, + "learning_rate": 4.9976960113297436e-05, + "loss": 6.4543, + "step": 2299 + }, + { + "epoch": 0.013678751546293654, + "grad_norm": 3.9113616943359375, + "learning_rate": 4.997694005989767e-05, + "loss": 6.7088, + "step": 2300 + }, + { + "epoch": 0.01368469882957465, + "grad_norm": 4.218390941619873, + "learning_rate": 4.997691999777873e-05, + "loss": 6.7199, + "step": 2301 + }, + { + "epoch": 0.013690646112855647, + "grad_norm": 4.200760841369629, + "learning_rate": 4.997689992694059e-05, + "loss": 6.6343, + "step": 2302 + }, + { + "epoch": 0.013696593396136644, + "grad_norm": 3.7164547443389893, + "learning_rate": 4.997687984738328e-05, + "loss": 6.772, + "step": 2303 + }, + { + "epoch": 0.013702540679417643, + "grad_norm": 2.1898231506347656, + "learning_rate": 4.99768597591068e-05, + "loss": 6.6165, + "step": 2304 + }, + { + "epoch": 0.01370848796269864, + "grad_norm": 2.72632098197937, + "learning_rate": 4.9976839662111166e-05, + "loss": 6.6474, + "step": 2305 + }, + { + "epoch": 0.013714435245979636, + "grad_norm": 3.64900279045105, + "learning_rate": 4.997681955639636e-05, + "loss": 6.4322, + "step": 2306 + }, + { + "epoch": 0.013720382529260633, + "grad_norm": 3.978445053100586, + "learning_rate": 4.997679944196241e-05, + "loss": 6.5434, + "step": 2307 + }, + { + "epoch": 0.01372632981254163, + "grad_norm": 5.709702491760254, + "learning_rate": 4.997677931880931e-05, + "loss": 6.5234, + "step": 2308 + }, + { + "epoch": 0.013732277095822629, + "grad_norm": 3.0389838218688965, + "learning_rate": 4.997675918693708e-05, + "loss": 6.4163, + "step": 2309 + }, + { + "epoch": 0.013738224379103625, + "grad_norm": 2.695113182067871, + "learning_rate": 4.9976739046345725e-05, + "loss": 6.6956, + "step": 2310 + }, + { + "epoch": 0.013744171662384622, + "grad_norm": 2.9768142700195312, + "learning_rate": 4.997671889703525e-05, + "loss": 6.5315, + "step": 2311 + }, + { + "epoch": 0.01375011894566562, + "grad_norm": 3.750454902648926, + "learning_rate": 4.997669873900566e-05, + "loss": 6.5568, + "step": 2312 + }, + { + "epoch": 0.013756066228946618, + "grad_norm": 3.390232801437378, + "learning_rate": 4.9976678572256955e-05, + "loss": 6.4916, + "step": 2313 + }, + { + "epoch": 0.013762013512227615, + "grad_norm": 3.1487748622894287, + "learning_rate": 4.997665839678915e-05, + "loss": 6.6378, + "step": 2314 + }, + { + "epoch": 0.013767960795508611, + "grad_norm": 2.5654940605163574, + "learning_rate": 4.997663821260226e-05, + "loss": 6.5817, + "step": 2315 + }, + { + "epoch": 0.013773908078789608, + "grad_norm": 2.7092552185058594, + "learning_rate": 4.9976618019696275e-05, + "loss": 6.982, + "step": 2316 + }, + { + "epoch": 0.013779855362070607, + "grad_norm": 3.642826557159424, + "learning_rate": 4.9976597818071214e-05, + "loss": 6.7951, + "step": 2317 + }, + { + "epoch": 0.013785802645351604, + "grad_norm": 3.4288947582244873, + "learning_rate": 4.997657760772708e-05, + "loss": 6.4366, + "step": 2318 + }, + { + "epoch": 0.0137917499286326, + "grad_norm": 2.7620253562927246, + "learning_rate": 4.997655738866389e-05, + "loss": 6.6588, + "step": 2319 + }, + { + "epoch": 0.013797697211913597, + "grad_norm": 2.4266698360443115, + "learning_rate": 4.997653716088163e-05, + "loss": 6.697, + "step": 2320 + }, + { + "epoch": 0.013803644495194596, + "grad_norm": 2.289365768432617, + "learning_rate": 4.9976516924380325e-05, + "loss": 6.7583, + "step": 2321 + }, + { + "epoch": 0.013809591778475593, + "grad_norm": 2.4238948822021484, + "learning_rate": 4.9976496679159976e-05, + "loss": 6.7949, + "step": 2322 + }, + { + "epoch": 0.01381553906175659, + "grad_norm": 2.7347512245178223, + "learning_rate": 4.997647642522059e-05, + "loss": 6.5914, + "step": 2323 + }, + { + "epoch": 0.013821486345037586, + "grad_norm": 2.961089849472046, + "learning_rate": 4.997645616256217e-05, + "loss": 6.3513, + "step": 2324 + }, + { + "epoch": 0.013827433628318585, + "grad_norm": 2.437685251235962, + "learning_rate": 4.997643589118472e-05, + "loss": 6.4626, + "step": 2325 + }, + { + "epoch": 0.013833380911599582, + "grad_norm": 2.769731044769287, + "learning_rate": 4.9976415611088267e-05, + "loss": 6.2801, + "step": 2326 + }, + { + "epoch": 0.013839328194880578, + "grad_norm": 2.700697183609009, + "learning_rate": 4.9976395322272805e-05, + "loss": 6.1969, + "step": 2327 + }, + { + "epoch": 0.013845275478161575, + "grad_norm": 3.8049886226654053, + "learning_rate": 4.997637502473834e-05, + "loss": 6.769, + "step": 2328 + }, + { + "epoch": 0.013851222761442574, + "grad_norm": 3.748903512954712, + "learning_rate": 4.9976354718484875e-05, + "loss": 6.6486, + "step": 2329 + }, + { + "epoch": 0.01385717004472357, + "grad_norm": 3.7807834148406982, + "learning_rate": 4.9976334403512426e-05, + "loss": 6.6251, + "step": 2330 + }, + { + "epoch": 0.013863117328004567, + "grad_norm": 2.5358874797821045, + "learning_rate": 4.997631407982099e-05, + "loss": 6.4425, + "step": 2331 + }, + { + "epoch": 0.013869064611285564, + "grad_norm": 2.4619522094726562, + "learning_rate": 4.9976293747410596e-05, + "loss": 7.2166, + "step": 2332 + }, + { + "epoch": 0.013875011894566563, + "grad_norm": 2.740412473678589, + "learning_rate": 4.997627340628123e-05, + "loss": 6.8907, + "step": 2333 + }, + { + "epoch": 0.01388095917784756, + "grad_norm": 2.872852087020874, + "learning_rate": 4.9976253056432895e-05, + "loss": 6.6142, + "step": 2334 + }, + { + "epoch": 0.013886906461128556, + "grad_norm": 2.01629900932312, + "learning_rate": 4.997623269786562e-05, + "loss": 6.398, + "step": 2335 + }, + { + "epoch": 0.013892853744409553, + "grad_norm": 2.4405698776245117, + "learning_rate": 4.99762123305794e-05, + "loss": 6.9282, + "step": 2336 + }, + { + "epoch": 0.013898801027690552, + "grad_norm": 2.2520413398742676, + "learning_rate": 4.9976191954574235e-05, + "loss": 6.5565, + "step": 2337 + }, + { + "epoch": 0.013904748310971549, + "grad_norm": 2.314852476119995, + "learning_rate": 4.997617156985014e-05, + "loss": 6.3055, + "step": 2338 + }, + { + "epoch": 0.013910695594252545, + "grad_norm": 2.9049081802368164, + "learning_rate": 4.9976151176407124e-05, + "loss": 7.1806, + "step": 2339 + }, + { + "epoch": 0.013916642877533542, + "grad_norm": 2.7533769607543945, + "learning_rate": 4.9976130774245197e-05, + "loss": 7.0047, + "step": 2340 + }, + { + "epoch": 0.013922590160814539, + "grad_norm": 2.124826431274414, + "learning_rate": 4.997611036336435e-05, + "loss": 7.1897, + "step": 2341 + }, + { + "epoch": 0.013928537444095538, + "grad_norm": 2.5205366611480713, + "learning_rate": 4.997608994376461e-05, + "loss": 6.8592, + "step": 2342 + }, + { + "epoch": 0.013934484727376534, + "grad_norm": 2.8026719093322754, + "learning_rate": 4.9976069515445975e-05, + "loss": 6.6622, + "step": 2343 + }, + { + "epoch": 0.013940432010657531, + "grad_norm": 3.045438051223755, + "learning_rate": 4.997604907840845e-05, + "loss": 6.6176, + "step": 2344 + }, + { + "epoch": 0.013946379293938528, + "grad_norm": 2.820199489593506, + "learning_rate": 4.997602863265204e-05, + "loss": 6.4489, + "step": 2345 + }, + { + "epoch": 0.013952326577219527, + "grad_norm": 2.997990369796753, + "learning_rate": 4.997600817817676e-05, + "loss": 7.0989, + "step": 2346 + }, + { + "epoch": 0.013958273860500523, + "grad_norm": 3.316575050354004, + "learning_rate": 4.9975987714982606e-05, + "loss": 6.9042, + "step": 2347 + }, + { + "epoch": 0.01396422114378152, + "grad_norm": 2.3339803218841553, + "learning_rate": 4.99759672430696e-05, + "loss": 6.8831, + "step": 2348 + }, + { + "epoch": 0.013970168427062517, + "grad_norm": 2.510274648666382, + "learning_rate": 4.997594676243775e-05, + "loss": 7.1093, + "step": 2349 + }, + { + "epoch": 0.013976115710343516, + "grad_norm": 2.893909215927124, + "learning_rate": 4.997592627308705e-05, + "loss": 6.5477, + "step": 2350 + }, + { + "epoch": 0.013982062993624512, + "grad_norm": 3.6036674976348877, + "learning_rate": 4.9975905775017505e-05, + "loss": 6.3278, + "step": 2351 + }, + { + "epoch": 0.01398801027690551, + "grad_norm": 2.1260125637054443, + "learning_rate": 4.9975885268229127e-05, + "loss": 6.7883, + "step": 2352 + }, + { + "epoch": 0.013993957560186506, + "grad_norm": 2.328247308731079, + "learning_rate": 4.997586475272193e-05, + "loss": 6.4832, + "step": 2353 + }, + { + "epoch": 0.013999904843467505, + "grad_norm": 2.8075780868530273, + "learning_rate": 4.997584422849593e-05, + "loss": 6.9333, + "step": 2354 + }, + { + "epoch": 0.014005852126748502, + "grad_norm": 1.9339990615844727, + "learning_rate": 4.9975823695551106e-05, + "loss": 6.6856, + "step": 2355 + }, + { + "epoch": 0.014011799410029498, + "grad_norm": 2.842968225479126, + "learning_rate": 4.997580315388748e-05, + "loss": 6.48, + "step": 2356 + }, + { + "epoch": 0.014017746693310495, + "grad_norm": 1.8715558052062988, + "learning_rate": 4.997578260350506e-05, + "loss": 6.8702, + "step": 2357 + }, + { + "epoch": 0.014023693976591494, + "grad_norm": 2.4310202598571777, + "learning_rate": 4.9975762044403865e-05, + "loss": 7.0112, + "step": 2358 + }, + { + "epoch": 0.01402964125987249, + "grad_norm": 2.292121648788452, + "learning_rate": 4.997574147658387e-05, + "loss": 6.6505, + "step": 2359 + }, + { + "epoch": 0.014035588543153487, + "grad_norm": 2.374007225036621, + "learning_rate": 4.997572090004511e-05, + "loss": 6.7332, + "step": 2360 + }, + { + "epoch": 0.014041535826434484, + "grad_norm": 2.198131561279297, + "learning_rate": 4.997570031478759e-05, + "loss": 6.6358, + "step": 2361 + }, + { + "epoch": 0.014047483109715483, + "grad_norm": 2.3109302520751953, + "learning_rate": 4.997567972081131e-05, + "loss": 6.6194, + "step": 2362 + }, + { + "epoch": 0.01405343039299648, + "grad_norm": 2.49338698387146, + "learning_rate": 4.997565911811627e-05, + "loss": 6.5036, + "step": 2363 + }, + { + "epoch": 0.014059377676277476, + "grad_norm": 2.6462419033050537, + "learning_rate": 4.997563850670249e-05, + "loss": 6.4294, + "step": 2364 + }, + { + "epoch": 0.014065324959558473, + "grad_norm": 3.0072524547576904, + "learning_rate": 4.997561788656997e-05, + "loss": 6.8814, + "step": 2365 + }, + { + "epoch": 0.014071272242839472, + "grad_norm": 2.435209035873413, + "learning_rate": 4.997559725771872e-05, + "loss": 6.4684, + "step": 2366 + }, + { + "epoch": 0.014077219526120469, + "grad_norm": 2.8023672103881836, + "learning_rate": 4.997557662014875e-05, + "loss": 6.7922, + "step": 2367 + }, + { + "epoch": 0.014083166809401465, + "grad_norm": 2.6129658222198486, + "learning_rate": 4.9975555973860065e-05, + "loss": 6.4539, + "step": 2368 + }, + { + "epoch": 0.014089114092682462, + "grad_norm": 2.559117317199707, + "learning_rate": 4.997553531885267e-05, + "loss": 6.4713, + "step": 2369 + }, + { + "epoch": 0.014095061375963459, + "grad_norm": 2.4535956382751465, + "learning_rate": 4.9975514655126575e-05, + "loss": 6.963, + "step": 2370 + }, + { + "epoch": 0.014101008659244458, + "grad_norm": 2.3025150299072266, + "learning_rate": 4.997549398268178e-05, + "loss": 6.9299, + "step": 2371 + }, + { + "epoch": 0.014106955942525454, + "grad_norm": 2.834411382675171, + "learning_rate": 4.997547330151831e-05, + "loss": 6.299, + "step": 2372 + }, + { + "epoch": 0.014112903225806451, + "grad_norm": 2.8046083450317383, + "learning_rate": 4.997545261163615e-05, + "loss": 5.7691, + "step": 2373 + }, + { + "epoch": 0.014118850509087448, + "grad_norm": 2.663776159286499, + "learning_rate": 4.997543191303532e-05, + "loss": 5.969, + "step": 2374 + }, + { + "epoch": 0.014124797792368447, + "grad_norm": 2.725154161453247, + "learning_rate": 4.997541120571582e-05, + "loss": 5.7473, + "step": 2375 + }, + { + "epoch": 0.014130745075649443, + "grad_norm": 2.9021074771881104, + "learning_rate": 4.9975390489677663e-05, + "loss": 6.3177, + "step": 2376 + }, + { + "epoch": 0.01413669235893044, + "grad_norm": 2.4043307304382324, + "learning_rate": 4.9975369764920866e-05, + "loss": 6.358, + "step": 2377 + }, + { + "epoch": 0.014142639642211437, + "grad_norm": 2.4163010120391846, + "learning_rate": 4.997534903144542e-05, + "loss": 6.6807, + "step": 2378 + }, + { + "epoch": 0.014148586925492436, + "grad_norm": 3.0710666179656982, + "learning_rate": 4.9975328289251335e-05, + "loss": 6.2416, + "step": 2379 + }, + { + "epoch": 0.014154534208773432, + "grad_norm": 2.159627676010132, + "learning_rate": 4.997530753833862e-05, + "loss": 7.1434, + "step": 2380 + }, + { + "epoch": 0.01416048149205443, + "grad_norm": 2.308382034301758, + "learning_rate": 4.997528677870729e-05, + "loss": 7.1243, + "step": 2381 + }, + { + "epoch": 0.014166428775335426, + "grad_norm": 2.7461323738098145, + "learning_rate": 4.997526601035734e-05, + "loss": 6.3066, + "step": 2382 + }, + { + "epoch": 0.014172376058616425, + "grad_norm": 2.8835322856903076, + "learning_rate": 4.997524523328878e-05, + "loss": 6.28, + "step": 2383 + }, + { + "epoch": 0.014178323341897421, + "grad_norm": 2.5195534229278564, + "learning_rate": 4.997522444750162e-05, + "loss": 6.9561, + "step": 2384 + }, + { + "epoch": 0.014184270625178418, + "grad_norm": 3.1697885990142822, + "learning_rate": 4.997520365299587e-05, + "loss": 6.7432, + "step": 2385 + }, + { + "epoch": 0.014190217908459415, + "grad_norm": 3.6300339698791504, + "learning_rate": 4.997518284977154e-05, + "loss": 6.3676, + "step": 2386 + }, + { + "epoch": 0.014196165191740414, + "grad_norm": 3.261981964111328, + "learning_rate": 4.9975162037828625e-05, + "loss": 6.0991, + "step": 2387 + }, + { + "epoch": 0.01420211247502141, + "grad_norm": 3.6291120052337646, + "learning_rate": 4.9975141217167146e-05, + "loss": 6.1239, + "step": 2388 + }, + { + "epoch": 0.014208059758302407, + "grad_norm": 3.192958116531372, + "learning_rate": 4.997512038778709e-05, + "loss": 6.4455, + "step": 2389 + }, + { + "epoch": 0.014214007041583404, + "grad_norm": 2.8887948989868164, + "learning_rate": 4.997509954968849e-05, + "loss": 6.9441, + "step": 2390 + }, + { + "epoch": 0.014219954324864403, + "grad_norm": 2.3568248748779297, + "learning_rate": 4.9975078702871336e-05, + "loss": 7.0207, + "step": 2391 + }, + { + "epoch": 0.0142259016081454, + "grad_norm": 2.2629294395446777, + "learning_rate": 4.997505784733564e-05, + "loss": 6.9575, + "step": 2392 + }, + { + "epoch": 0.014231848891426396, + "grad_norm": 2.5458898544311523, + "learning_rate": 4.99750369830814e-05, + "loss": 6.8533, + "step": 2393 + }, + { + "epoch": 0.014237796174707393, + "grad_norm": 2.5125060081481934, + "learning_rate": 4.997501611010865e-05, + "loss": 6.8615, + "step": 2394 + }, + { + "epoch": 0.014243743457988392, + "grad_norm": 2.9903738498687744, + "learning_rate": 4.997499522841737e-05, + "loss": 6.6927, + "step": 2395 + }, + { + "epoch": 0.014249690741269389, + "grad_norm": 2.7536470890045166, + "learning_rate": 4.997497433800758e-05, + "loss": 6.6454, + "step": 2396 + }, + { + "epoch": 0.014255638024550385, + "grad_norm": 3.5041043758392334, + "learning_rate": 4.997495343887928e-05, + "loss": 6.485, + "step": 2397 + }, + { + "epoch": 0.014261585307831382, + "grad_norm": 3.8025100231170654, + "learning_rate": 4.997493253103249e-05, + "loss": 6.3731, + "step": 2398 + }, + { + "epoch": 0.01426753259111238, + "grad_norm": 3.2657718658447266, + "learning_rate": 4.99749116144672e-05, + "loss": 6.23, + "step": 2399 + }, + { + "epoch": 0.014273479874393378, + "grad_norm": 2.721632719039917, + "learning_rate": 4.997489068918343e-05, + "loss": 6.7292, + "step": 2400 + }, + { + "epoch": 0.014279427157674374, + "grad_norm": 2.3483569622039795, + "learning_rate": 4.9974869755181186e-05, + "loss": 6.4842, + "step": 2401 + }, + { + "epoch": 0.014285374440955371, + "grad_norm": 2.4931676387786865, + "learning_rate": 4.997484881246047e-05, + "loss": 7.0529, + "step": 2402 + }, + { + "epoch": 0.014291321724236368, + "grad_norm": 2.4944825172424316, + "learning_rate": 4.99748278610213e-05, + "loss": 7.0185, + "step": 2403 + }, + { + "epoch": 0.014297269007517367, + "grad_norm": 2.9124202728271484, + "learning_rate": 4.997480690086367e-05, + "loss": 6.9847, + "step": 2404 + }, + { + "epoch": 0.014303216290798363, + "grad_norm": 2.5802674293518066, + "learning_rate": 4.997478593198759e-05, + "loss": 7.0389, + "step": 2405 + }, + { + "epoch": 0.01430916357407936, + "grad_norm": 2.636709451675415, + "learning_rate": 4.9974764954393075e-05, + "loss": 6.7281, + "step": 2406 + }, + { + "epoch": 0.014315110857360357, + "grad_norm": 3.801760196685791, + "learning_rate": 4.997474396808012e-05, + "loss": 5.9962, + "step": 2407 + }, + { + "epoch": 0.014321058140641356, + "grad_norm": 3.7983996868133545, + "learning_rate": 4.997472297304875e-05, + "loss": 6.3821, + "step": 2408 + }, + { + "epoch": 0.014327005423922352, + "grad_norm": 2.863408088684082, + "learning_rate": 4.997470196929895e-05, + "loss": 6.2206, + "step": 2409 + }, + { + "epoch": 0.01433295270720335, + "grad_norm": 2.6187095642089844, + "learning_rate": 4.997468095683076e-05, + "loss": 6.2205, + "step": 2410 + }, + { + "epoch": 0.014338899990484346, + "grad_norm": 3.202986240386963, + "learning_rate": 4.997465993564414e-05, + "loss": 6.259, + "step": 2411 + }, + { + "epoch": 0.014344847273765345, + "grad_norm": 2.9131264686584473, + "learning_rate": 4.9974638905739146e-05, + "loss": 6.4159, + "step": 2412 + }, + { + "epoch": 0.014350794557046341, + "grad_norm": 2.384477376937866, + "learning_rate": 4.9974617867115754e-05, + "loss": 6.6669, + "step": 2413 + }, + { + "epoch": 0.014356741840327338, + "grad_norm": 2.448495626449585, + "learning_rate": 4.997459681977398e-05, + "loss": 6.5679, + "step": 2414 + }, + { + "epoch": 0.014362689123608335, + "grad_norm": 2.1945343017578125, + "learning_rate": 4.997457576371384e-05, + "loss": 6.3856, + "step": 2415 + }, + { + "epoch": 0.014368636406889334, + "grad_norm": 1.867848515510559, + "learning_rate": 4.997455469893533e-05, + "loss": 6.3127, + "step": 2416 + }, + { + "epoch": 0.01437458369017033, + "grad_norm": 2.560976266860962, + "learning_rate": 4.997453362543846e-05, + "loss": 6.4619, + "step": 2417 + }, + { + "epoch": 0.014380530973451327, + "grad_norm": 3.2440431118011475, + "learning_rate": 4.997451254322323e-05, + "loss": 6.399, + "step": 2418 + }, + { + "epoch": 0.014386478256732324, + "grad_norm": 3.0021307468414307, + "learning_rate": 4.9974491452289664e-05, + "loss": 6.174, + "step": 2419 + }, + { + "epoch": 0.014392425540013323, + "grad_norm": 2.6046524047851562, + "learning_rate": 4.997447035263776e-05, + "loss": 6.8284, + "step": 2420 + }, + { + "epoch": 0.01439837282329432, + "grad_norm": 3.1395344734191895, + "learning_rate": 4.997444924426753e-05, + "loss": 6.3395, + "step": 2421 + }, + { + "epoch": 0.014404320106575316, + "grad_norm": 3.056152582168579, + "learning_rate": 4.997442812717897e-05, + "loss": 6.3468, + "step": 2422 + }, + { + "epoch": 0.014410267389856313, + "grad_norm": 2.2532267570495605, + "learning_rate": 4.9974407001372105e-05, + "loss": 6.5187, + "step": 2423 + }, + { + "epoch": 0.014416214673137312, + "grad_norm": 2.0228383541107178, + "learning_rate": 4.997438586684693e-05, + "loss": 6.4452, + "step": 2424 + }, + { + "epoch": 0.014422161956418308, + "grad_norm": 3.2889909744262695, + "learning_rate": 4.997436472360345e-05, + "loss": 6.6466, + "step": 2425 + }, + { + "epoch": 0.014428109239699305, + "grad_norm": 2.957916498184204, + "learning_rate": 4.9974343571641677e-05, + "loss": 6.9617, + "step": 2426 + }, + { + "epoch": 0.014434056522980302, + "grad_norm": 2.7629241943359375, + "learning_rate": 4.997432241096162e-05, + "loss": 6.1687, + "step": 2427 + }, + { + "epoch": 0.0144400038062613, + "grad_norm": 2.849297285079956, + "learning_rate": 4.997430124156329e-05, + "loss": 6.4647, + "step": 2428 + }, + { + "epoch": 0.014445951089542297, + "grad_norm": 2.2432122230529785, + "learning_rate": 4.997428006344669e-05, + "loss": 7.1739, + "step": 2429 + }, + { + "epoch": 0.014451898372823294, + "grad_norm": 2.814807891845703, + "learning_rate": 4.997425887661181e-05, + "loss": 5.945, + "step": 2430 + }, + { + "epoch": 0.014457845656104291, + "grad_norm": 3.140153646469116, + "learning_rate": 4.997423768105869e-05, + "loss": 6.5948, + "step": 2431 + }, + { + "epoch": 0.01446379293938529, + "grad_norm": 2.5276620388031006, + "learning_rate": 4.997421647678732e-05, + "loss": 6.9813, + "step": 2432 + }, + { + "epoch": 0.014469740222666286, + "grad_norm": 2.462204694747925, + "learning_rate": 4.9974195263797705e-05, + "loss": 6.8987, + "step": 2433 + }, + { + "epoch": 0.014475687505947283, + "grad_norm": 3.117255210876465, + "learning_rate": 4.997417404208986e-05, + "loss": 5.883, + "step": 2434 + }, + { + "epoch": 0.01448163478922828, + "grad_norm": 2.6207518577575684, + "learning_rate": 4.997415281166379e-05, + "loss": 6.8065, + "step": 2435 + }, + { + "epoch": 0.014487582072509277, + "grad_norm": 2.996624231338501, + "learning_rate": 4.99741315725195e-05, + "loss": 6.5162, + "step": 2436 + }, + { + "epoch": 0.014493529355790276, + "grad_norm": 2.1946496963500977, + "learning_rate": 4.9974110324656996e-05, + "loss": 6.9521, + "step": 2437 + }, + { + "epoch": 0.014499476639071272, + "grad_norm": 2.273017406463623, + "learning_rate": 4.997408906807629e-05, + "loss": 7.0144, + "step": 2438 + }, + { + "epoch": 0.01450542392235227, + "grad_norm": 2.516509771347046, + "learning_rate": 4.997406780277739e-05, + "loss": 7.013, + "step": 2439 + }, + { + "epoch": 0.014511371205633266, + "grad_norm": 3.0296435356140137, + "learning_rate": 4.9974046528760296e-05, + "loss": 6.934, + "step": 2440 + }, + { + "epoch": 0.014517318488914265, + "grad_norm": 2.6135010719299316, + "learning_rate": 4.9974025246025024e-05, + "loss": 6.7151, + "step": 2441 + }, + { + "epoch": 0.014523265772195261, + "grad_norm": 2.6850788593292236, + "learning_rate": 4.997400395457158e-05, + "loss": 6.5223, + "step": 2442 + }, + { + "epoch": 0.014529213055476258, + "grad_norm": 3.0401692390441895, + "learning_rate": 4.9973982654399966e-05, + "loss": 7.2006, + "step": 2443 + }, + { + "epoch": 0.014535160338757255, + "grad_norm": 3.016805410385132, + "learning_rate": 4.997396134551019e-05, + "loss": 7.0633, + "step": 2444 + }, + { + "epoch": 0.014541107622038254, + "grad_norm": 3.107154130935669, + "learning_rate": 4.9973940027902264e-05, + "loss": 6.9096, + "step": 2445 + }, + { + "epoch": 0.01454705490531925, + "grad_norm": 2.720054864883423, + "learning_rate": 4.9973918701576196e-05, + "loss": 6.7061, + "step": 2446 + }, + { + "epoch": 0.014553002188600247, + "grad_norm": 2.386401414871216, + "learning_rate": 4.9973897366531984e-05, + "loss": 6.5877, + "step": 2447 + }, + { + "epoch": 0.014558949471881244, + "grad_norm": 2.488243579864502, + "learning_rate": 4.997387602276965e-05, + "loss": 6.7792, + "step": 2448 + }, + { + "epoch": 0.014564896755162243, + "grad_norm": 2.7504360675811768, + "learning_rate": 4.9973854670289196e-05, + "loss": 6.6164, + "step": 2449 + }, + { + "epoch": 0.01457084403844324, + "grad_norm": 3.001441240310669, + "learning_rate": 4.9973833309090626e-05, + "loss": 6.5933, + "step": 2450 + }, + { + "epoch": 0.014576791321724236, + "grad_norm": 2.6449999809265137, + "learning_rate": 4.997381193917394e-05, + "loss": 6.5323, + "step": 2451 + }, + { + "epoch": 0.014582738605005233, + "grad_norm": 2.81846022605896, + "learning_rate": 4.9973790560539156e-05, + "loss": 6.5146, + "step": 2452 + }, + { + "epoch": 0.014588685888286232, + "grad_norm": 2.662916421890259, + "learning_rate": 4.997376917318629e-05, + "loss": 6.161, + "step": 2453 + }, + { + "epoch": 0.014594633171567228, + "grad_norm": 2.689601421356201, + "learning_rate": 4.997374777711533e-05, + "loss": 6.2008, + "step": 2454 + }, + { + "epoch": 0.014600580454848225, + "grad_norm": 2.6690561771392822, + "learning_rate": 4.99737263723263e-05, + "loss": 6.4418, + "step": 2455 + }, + { + "epoch": 0.014606527738129222, + "grad_norm": 2.897270917892456, + "learning_rate": 4.997370495881919e-05, + "loss": 6.3968, + "step": 2456 + }, + { + "epoch": 0.01461247502141022, + "grad_norm": 2.9327831268310547, + "learning_rate": 4.997368353659402e-05, + "loss": 6.4665, + "step": 2457 + }, + { + "epoch": 0.014618422304691217, + "grad_norm": 2.658013343811035, + "learning_rate": 4.99736621056508e-05, + "loss": 6.399, + "step": 2458 + }, + { + "epoch": 0.014624369587972214, + "grad_norm": 2.6055238246917725, + "learning_rate": 4.997364066598953e-05, + "loss": 6.4679, + "step": 2459 + }, + { + "epoch": 0.014630316871253211, + "grad_norm": 3.0595951080322266, + "learning_rate": 4.997361921761022e-05, + "loss": 5.8797, + "step": 2460 + }, + { + "epoch": 0.01463626415453421, + "grad_norm": 2.994694471359253, + "learning_rate": 4.997359776051288e-05, + "loss": 5.704, + "step": 2461 + }, + { + "epoch": 0.014642211437815206, + "grad_norm": 2.78153657913208, + "learning_rate": 4.9973576294697514e-05, + "loss": 5.7289, + "step": 2462 + }, + { + "epoch": 0.014648158721096203, + "grad_norm": 2.5119385719299316, + "learning_rate": 4.997355482016414e-05, + "loss": 5.5494, + "step": 2463 + }, + { + "epoch": 0.0146541060043772, + "grad_norm": 2.7880990505218506, + "learning_rate": 4.997353333691274e-05, + "loss": 5.5905, + "step": 2464 + }, + { + "epoch": 0.014660053287658197, + "grad_norm": 2.827352523803711, + "learning_rate": 4.9973511844943346e-05, + "loss": 6.4429, + "step": 2465 + }, + { + "epoch": 0.014666000570939195, + "grad_norm": 2.4297358989715576, + "learning_rate": 4.997349034425595e-05, + "loss": 6.8647, + "step": 2466 + }, + { + "epoch": 0.014671947854220192, + "grad_norm": 2.649064064025879, + "learning_rate": 4.997346883485057e-05, + "loss": 6.5568, + "step": 2467 + }, + { + "epoch": 0.014677895137501189, + "grad_norm": 3.2215452194213867, + "learning_rate": 4.9973447316727215e-05, + "loss": 5.5684, + "step": 2468 + }, + { + "epoch": 0.014683842420782186, + "grad_norm": 2.8760056495666504, + "learning_rate": 4.9973425789885884e-05, + "loss": 5.6395, + "step": 2469 + }, + { + "epoch": 0.014689789704063184, + "grad_norm": 2.4002890586853027, + "learning_rate": 4.9973404254326585e-05, + "loss": 5.9525, + "step": 2470 + }, + { + "epoch": 0.014695736987344181, + "grad_norm": 2.32314395904541, + "learning_rate": 4.997338271004933e-05, + "loss": 6.9675, + "step": 2471 + }, + { + "epoch": 0.014701684270625178, + "grad_norm": 2.262680768966675, + "learning_rate": 4.997336115705413e-05, + "loss": 7.1361, + "step": 2472 + }, + { + "epoch": 0.014707631553906175, + "grad_norm": 2.2855215072631836, + "learning_rate": 4.997333959534098e-05, + "loss": 7.1141, + "step": 2473 + }, + { + "epoch": 0.014713578837187173, + "grad_norm": 2.5461738109588623, + "learning_rate": 4.99733180249099e-05, + "loss": 7.0492, + "step": 2474 + }, + { + "epoch": 0.01471952612046817, + "grad_norm": 2.455561399459839, + "learning_rate": 4.99732964457609e-05, + "loss": 6.9303, + "step": 2475 + }, + { + "epoch": 0.014725473403749167, + "grad_norm": 3.3767740726470947, + "learning_rate": 4.997327485789397e-05, + "loss": 6.8531, + "step": 2476 + }, + { + "epoch": 0.014731420687030164, + "grad_norm": 2.9320104122161865, + "learning_rate": 4.9973253261309125e-05, + "loss": 6.9258, + "step": 2477 + }, + { + "epoch": 0.014737367970311162, + "grad_norm": 2.380960464477539, + "learning_rate": 4.997323165600638e-05, + "loss": 6.8581, + "step": 2478 + }, + { + "epoch": 0.01474331525359216, + "grad_norm": 2.727154016494751, + "learning_rate": 4.997321004198574e-05, + "loss": 7.3814, + "step": 2479 + }, + { + "epoch": 0.014749262536873156, + "grad_norm": 2.8693020343780518, + "learning_rate": 4.997318841924721e-05, + "loss": 6.3793, + "step": 2480 + }, + { + "epoch": 0.014755209820154153, + "grad_norm": 2.941622734069824, + "learning_rate": 4.997316678779079e-05, + "loss": 7.3567, + "step": 2481 + }, + { + "epoch": 0.014761157103435152, + "grad_norm": 3.0310213565826416, + "learning_rate": 4.9973145147616505e-05, + "loss": 6.8832, + "step": 2482 + }, + { + "epoch": 0.014767104386716148, + "grad_norm": 1.9184696674346924, + "learning_rate": 4.9973123498724353e-05, + "loss": 6.7369, + "step": 2483 + }, + { + "epoch": 0.014773051669997145, + "grad_norm": 2.3090195655822754, + "learning_rate": 4.9973101841114335e-05, + "loss": 6.8927, + "step": 2484 + }, + { + "epoch": 0.014778998953278142, + "grad_norm": 2.2947685718536377, + "learning_rate": 4.997308017478647e-05, + "loss": 6.9441, + "step": 2485 + }, + { + "epoch": 0.01478494623655914, + "grad_norm": 2.363690137863159, + "learning_rate": 4.997305849974076e-05, + "loss": 6.9397, + "step": 2486 + }, + { + "epoch": 0.014790893519840137, + "grad_norm": 1.7546948194503784, + "learning_rate": 4.997303681597721e-05, + "loss": 6.7888, + "step": 2487 + }, + { + "epoch": 0.014796840803121134, + "grad_norm": 1.8824211359024048, + "learning_rate": 4.997301512349584e-05, + "loss": 6.6486, + "step": 2488 + }, + { + "epoch": 0.014802788086402131, + "grad_norm": 3.68865704536438, + "learning_rate": 4.9972993422296636e-05, + "loss": 7.0318, + "step": 2489 + }, + { + "epoch": 0.01480873536968313, + "grad_norm": 3.0788486003875732, + "learning_rate": 4.997297171237962e-05, + "loss": 6.814, + "step": 2490 + }, + { + "epoch": 0.014814682652964126, + "grad_norm": 2.6903607845306396, + "learning_rate": 4.997294999374481e-05, + "loss": 6.9752, + "step": 2491 + }, + { + "epoch": 0.014820629936245123, + "grad_norm": 2.6673712730407715, + "learning_rate": 4.9972928266392194e-05, + "loss": 6.9083, + "step": 2492 + }, + { + "epoch": 0.01482657721952612, + "grad_norm": 2.335632801055908, + "learning_rate": 4.9972906530321786e-05, + "loss": 7.027, + "step": 2493 + }, + { + "epoch": 0.014832524502807119, + "grad_norm": 3.2885966300964355, + "learning_rate": 4.997288478553359e-05, + "loss": 6.6551, + "step": 2494 + }, + { + "epoch": 0.014838471786088115, + "grad_norm": 2.7297918796539307, + "learning_rate": 4.997286303202762e-05, + "loss": 6.7345, + "step": 2495 + }, + { + "epoch": 0.014844419069369112, + "grad_norm": 2.640814781188965, + "learning_rate": 4.997284126980388e-05, + "loss": 6.743, + "step": 2496 + }, + { + "epoch": 0.014850366352650109, + "grad_norm": 2.699632167816162, + "learning_rate": 4.997281949886239e-05, + "loss": 6.4633, + "step": 2497 + }, + { + "epoch": 0.014856313635931106, + "grad_norm": 2.5185790061950684, + "learning_rate": 4.9972797719203135e-05, + "loss": 6.5496, + "step": 2498 + }, + { + "epoch": 0.014862260919212104, + "grad_norm": 2.659393548965454, + "learning_rate": 4.9972775930826144e-05, + "loss": 6.5066, + "step": 2499 + }, + { + "epoch": 0.014868208202493101, + "grad_norm": 2.160808563232422, + "learning_rate": 4.99727541337314e-05, + "loss": 6.9851, + "step": 2500 + }, + { + "epoch": 0.014874155485774098, + "grad_norm": 2.656506299972534, + "learning_rate": 4.997273232791894e-05, + "loss": 7.5696, + "step": 2501 + }, + { + "epoch": 0.014880102769055095, + "grad_norm": 2.490612506866455, + "learning_rate": 4.9972710513388754e-05, + "loss": 7.2623, + "step": 2502 + }, + { + "epoch": 0.014886050052336093, + "grad_norm": 2.1744866371154785, + "learning_rate": 4.997268869014085e-05, + "loss": 6.5208, + "step": 2503 + }, + { + "epoch": 0.01489199733561709, + "grad_norm": 2.8058252334594727, + "learning_rate": 4.9972666858175236e-05, + "loss": 6.1527, + "step": 2504 + }, + { + "epoch": 0.014897944618898087, + "grad_norm": 2.418827533721924, + "learning_rate": 4.997264501749193e-05, + "loss": 6.2244, + "step": 2505 + }, + { + "epoch": 0.014903891902179084, + "grad_norm": 2.499648332595825, + "learning_rate": 4.997262316809092e-05, + "loss": 6.8904, + "step": 2506 + }, + { + "epoch": 0.014909839185460082, + "grad_norm": 2.3598594665527344, + "learning_rate": 4.9972601309972235e-05, + "loss": 7.0794, + "step": 2507 + }, + { + "epoch": 0.01491578646874108, + "grad_norm": 2.2443082332611084, + "learning_rate": 4.997257944313587e-05, + "loss": 7.3078, + "step": 2508 + }, + { + "epoch": 0.014921733752022076, + "grad_norm": 2.407501459121704, + "learning_rate": 4.9972557567581835e-05, + "loss": 7.0677, + "step": 2509 + }, + { + "epoch": 0.014927681035303073, + "grad_norm": 2.060865640640259, + "learning_rate": 4.997253568331014e-05, + "loss": 6.7128, + "step": 2510 + }, + { + "epoch": 0.014933628318584071, + "grad_norm": 2.3876516819000244, + "learning_rate": 4.997251379032078e-05, + "loss": 6.7562, + "step": 2511 + }, + { + "epoch": 0.014939575601865068, + "grad_norm": 2.387176990509033, + "learning_rate": 4.997249188861379e-05, + "loss": 6.8237, + "step": 2512 + }, + { + "epoch": 0.014945522885146065, + "grad_norm": 2.7324886322021484, + "learning_rate": 4.997246997818915e-05, + "loss": 6.8963, + "step": 2513 + }, + { + "epoch": 0.014951470168427062, + "grad_norm": 2.3832128047943115, + "learning_rate": 4.997244805904689e-05, + "loss": 6.9467, + "step": 2514 + }, + { + "epoch": 0.01495741745170806, + "grad_norm": 1.8594162464141846, + "learning_rate": 4.9972426131187e-05, + "loss": 7.0712, + "step": 2515 + }, + { + "epoch": 0.014963364734989057, + "grad_norm": 2.322068691253662, + "learning_rate": 4.997240419460949e-05, + "loss": 6.8898, + "step": 2516 + }, + { + "epoch": 0.014969312018270054, + "grad_norm": 2.4850032329559326, + "learning_rate": 4.997238224931438e-05, + "loss": 6.5439, + "step": 2517 + }, + { + "epoch": 0.014975259301551051, + "grad_norm": 2.919579029083252, + "learning_rate": 4.997236029530166e-05, + "loss": 6.3987, + "step": 2518 + }, + { + "epoch": 0.01498120658483205, + "grad_norm": 2.651900053024292, + "learning_rate": 4.997233833257135e-05, + "loss": 6.2735, + "step": 2519 + }, + { + "epoch": 0.014987153868113046, + "grad_norm": 2.7912142276763916, + "learning_rate": 4.997231636112346e-05, + "loss": 6.9835, + "step": 2520 + }, + { + "epoch": 0.014993101151394043, + "grad_norm": 2.5735433101654053, + "learning_rate": 4.997229438095799e-05, + "loss": 7.1218, + "step": 2521 + }, + { + "epoch": 0.01499904843467504, + "grad_norm": 2.483186721801758, + "learning_rate": 4.997227239207494e-05, + "loss": 7.0343, + "step": 2522 + }, + { + "epoch": 0.015004995717956039, + "grad_norm": 2.9296681880950928, + "learning_rate": 4.997225039447434e-05, + "loss": 6.5455, + "step": 2523 + }, + { + "epoch": 0.015010943001237035, + "grad_norm": 2.5536422729492188, + "learning_rate": 4.997222838815618e-05, + "loss": 6.7173, + "step": 2524 + }, + { + "epoch": 0.015016890284518032, + "grad_norm": 6.365324020385742, + "learning_rate": 4.997220637312047e-05, + "loss": 6.0909, + "step": 2525 + }, + { + "epoch": 0.015022837567799029, + "grad_norm": 3.7258150577545166, + "learning_rate": 4.997218434936723e-05, + "loss": 5.9019, + "step": 2526 + }, + { + "epoch": 0.015028784851080026, + "grad_norm": 2.9021997451782227, + "learning_rate": 4.997216231689645e-05, + "loss": 5.8601, + "step": 2527 + }, + { + "epoch": 0.015034732134361024, + "grad_norm": 2.570988416671753, + "learning_rate": 4.997214027570815e-05, + "loss": 6.1513, + "step": 2528 + }, + { + "epoch": 0.015040679417642021, + "grad_norm": 3.013540029525757, + "learning_rate": 4.997211822580233e-05, + "loss": 6.6471, + "step": 2529 + }, + { + "epoch": 0.015046626700923018, + "grad_norm": 2.612210750579834, + "learning_rate": 4.997209616717901e-05, + "loss": 6.5523, + "step": 2530 + }, + { + "epoch": 0.015052573984204015, + "grad_norm": 2.93513822555542, + "learning_rate": 4.9972074099838186e-05, + "loss": 6.1845, + "step": 2531 + }, + { + "epoch": 0.015058521267485013, + "grad_norm": 3.569002389907837, + "learning_rate": 4.9972052023779865e-05, + "loss": 6.7383, + "step": 2532 + }, + { + "epoch": 0.01506446855076601, + "grad_norm": 2.560023784637451, + "learning_rate": 4.9972029939004064e-05, + "loss": 6.4978, + "step": 2533 + }, + { + "epoch": 0.015070415834047007, + "grad_norm": 2.304612398147583, + "learning_rate": 4.997200784551078e-05, + "loss": 6.3316, + "step": 2534 + }, + { + "epoch": 0.015076363117328004, + "grad_norm": 2.4442996978759766, + "learning_rate": 4.997198574330003e-05, + "loss": 6.4245, + "step": 2535 + }, + { + "epoch": 0.015082310400609002, + "grad_norm": 2.764831304550171, + "learning_rate": 4.997196363237181e-05, + "loss": 6.2251, + "step": 2536 + }, + { + "epoch": 0.01508825768389, + "grad_norm": 2.6534347534179688, + "learning_rate": 4.997194151272615e-05, + "loss": 6.6674, + "step": 2537 + }, + { + "epoch": 0.015094204967170996, + "grad_norm": 2.5901331901550293, + "learning_rate": 4.997191938436303e-05, + "loss": 6.5724, + "step": 2538 + }, + { + "epoch": 0.015100152250451993, + "grad_norm": 2.6827733516693115, + "learning_rate": 4.9971897247282474e-05, + "loss": 6.4774, + "step": 2539 + }, + { + "epoch": 0.015106099533732991, + "grad_norm": 2.087397813796997, + "learning_rate": 4.997187510148449e-05, + "loss": 6.5011, + "step": 2540 + }, + { + "epoch": 0.015112046817013988, + "grad_norm": 2.157935619354248, + "learning_rate": 4.9971852946969076e-05, + "loss": 6.3258, + "step": 2541 + }, + { + "epoch": 0.015117994100294985, + "grad_norm": 2.680481195449829, + "learning_rate": 4.997183078373625e-05, + "loss": 6.5631, + "step": 2542 + }, + { + "epoch": 0.015123941383575982, + "grad_norm": 2.897608995437622, + "learning_rate": 4.997180861178602e-05, + "loss": 6.7913, + "step": 2543 + }, + { + "epoch": 0.01512988866685698, + "grad_norm": 2.5714452266693115, + "learning_rate": 4.997178643111838e-05, + "loss": 6.767, + "step": 2544 + }, + { + "epoch": 0.015135835950137977, + "grad_norm": 2.096376419067383, + "learning_rate": 4.997176424173336e-05, + "loss": 6.7365, + "step": 2545 + }, + { + "epoch": 0.015141783233418974, + "grad_norm": 2.083101987838745, + "learning_rate": 4.9971742043630955e-05, + "loss": 6.4693, + "step": 2546 + }, + { + "epoch": 0.015147730516699971, + "grad_norm": 3.509512186050415, + "learning_rate": 4.997171983681116e-05, + "loss": 6.4068, + "step": 2547 + }, + { + "epoch": 0.01515367779998097, + "grad_norm": 3.055772304534912, + "learning_rate": 4.997169762127401e-05, + "loss": 6.3411, + "step": 2548 + }, + { + "epoch": 0.015159625083261966, + "grad_norm": 2.627429485321045, + "learning_rate": 4.997167539701949e-05, + "loss": 6.3788, + "step": 2549 + }, + { + "epoch": 0.015165572366542963, + "grad_norm": 2.408599853515625, + "learning_rate": 4.997165316404761e-05, + "loss": 6.2822, + "step": 2550 + }, + { + "epoch": 0.01517151964982396, + "grad_norm": 2.906006336212158, + "learning_rate": 4.997163092235839e-05, + "loss": 6.2615, + "step": 2551 + }, + { + "epoch": 0.015177466933104958, + "grad_norm": 2.4585347175598145, + "learning_rate": 4.997160867195183e-05, + "loss": 6.4076, + "step": 2552 + }, + { + "epoch": 0.015183414216385955, + "grad_norm": 2.495539665222168, + "learning_rate": 4.9971586412827944e-05, + "loss": 6.4893, + "step": 2553 + }, + { + "epoch": 0.015189361499666952, + "grad_norm": 2.719583034515381, + "learning_rate": 4.9971564144986734e-05, + "loss": 6.276, + "step": 2554 + }, + { + "epoch": 0.015195308782947949, + "grad_norm": 2.464207887649536, + "learning_rate": 4.9971541868428206e-05, + "loss": 6.2713, + "step": 2555 + }, + { + "epoch": 0.015201256066228947, + "grad_norm": 2.3604822158813477, + "learning_rate": 4.997151958315237e-05, + "loss": 6.2648, + "step": 2556 + }, + { + "epoch": 0.015207203349509944, + "grad_norm": 2.729820966720581, + "learning_rate": 4.997149728915924e-05, + "loss": 6.2985, + "step": 2557 + }, + { + "epoch": 0.015213150632790941, + "grad_norm": 2.565760612487793, + "learning_rate": 4.997147498644882e-05, + "loss": 6.401, + "step": 2558 + }, + { + "epoch": 0.015219097916071938, + "grad_norm": 3.091628074645996, + "learning_rate": 4.9971452675021104e-05, + "loss": 6.1774, + "step": 2559 + }, + { + "epoch": 0.015225045199352935, + "grad_norm": 2.452453851699829, + "learning_rate": 4.9971430354876125e-05, + "loss": 6.4669, + "step": 2560 + }, + { + "epoch": 0.015230992482633933, + "grad_norm": 2.4285218715667725, + "learning_rate": 4.997140802601387e-05, + "loss": 6.4086, + "step": 2561 + }, + { + "epoch": 0.01523693976591493, + "grad_norm": 2.094043254852295, + "learning_rate": 4.9971385688434356e-05, + "loss": 6.2502, + "step": 2562 + }, + { + "epoch": 0.015242887049195927, + "grad_norm": 2.5989573001861572, + "learning_rate": 4.9971363342137586e-05, + "loss": 6.2948, + "step": 2563 + }, + { + "epoch": 0.015248834332476924, + "grad_norm": 2.5372314453125, + "learning_rate": 4.9971340987123574e-05, + "loss": 6.5643, + "step": 2564 + }, + { + "epoch": 0.015254781615757922, + "grad_norm": 2.3666064739227295, + "learning_rate": 4.9971318623392325e-05, + "loss": 6.4807, + "step": 2565 + }, + { + "epoch": 0.01526072889903892, + "grad_norm": 2.3216497898101807, + "learning_rate": 4.997129625094385e-05, + "loss": 6.448, + "step": 2566 + }, + { + "epoch": 0.015266676182319916, + "grad_norm": 2.202665090560913, + "learning_rate": 4.9971273869778153e-05, + "loss": 6.3766, + "step": 2567 + }, + { + "epoch": 0.015272623465600913, + "grad_norm": 2.5678982734680176, + "learning_rate": 4.997125147989524e-05, + "loss": 6.0799, + "step": 2568 + }, + { + "epoch": 0.015278570748881911, + "grad_norm": 2.7904717922210693, + "learning_rate": 4.997122908129512e-05, + "loss": 6.3446, + "step": 2569 + }, + { + "epoch": 0.015284518032162908, + "grad_norm": 2.383120059967041, + "learning_rate": 4.99712066739778e-05, + "loss": 6.2398, + "step": 2570 + }, + { + "epoch": 0.015290465315443905, + "grad_norm": 2.4302077293395996, + "learning_rate": 4.9971184257943294e-05, + "loss": 6.2678, + "step": 2571 + }, + { + "epoch": 0.015296412598724902, + "grad_norm": 2.2923178672790527, + "learning_rate": 4.99711618331916e-05, + "loss": 6.4742, + "step": 2572 + }, + { + "epoch": 0.0153023598820059, + "grad_norm": 2.582810878753662, + "learning_rate": 4.9971139399722735e-05, + "loss": 6.4679, + "step": 2573 + }, + { + "epoch": 0.015308307165286897, + "grad_norm": 2.718228578567505, + "learning_rate": 4.997111695753671e-05, + "loss": 6.2475, + "step": 2574 + }, + { + "epoch": 0.015314254448567894, + "grad_norm": 2.4639811515808105, + "learning_rate": 4.997109450663352e-05, + "loss": 6.463, + "step": 2575 + }, + { + "epoch": 0.01532020173184889, + "grad_norm": 2.6998252868652344, + "learning_rate": 4.997107204701318e-05, + "loss": 6.2885, + "step": 2576 + }, + { + "epoch": 0.01532614901512989, + "grad_norm": 2.831291437149048, + "learning_rate": 4.997104957867569e-05, + "loss": 6.2056, + "step": 2577 + }, + { + "epoch": 0.015332096298410886, + "grad_norm": 2.9070980548858643, + "learning_rate": 4.997102710162107e-05, + "loss": 6.3247, + "step": 2578 + }, + { + "epoch": 0.015338043581691883, + "grad_norm": 2.2583134174346924, + "learning_rate": 4.997100461584933e-05, + "loss": 6.3241, + "step": 2579 + }, + { + "epoch": 0.01534399086497288, + "grad_norm": 2.1661887168884277, + "learning_rate": 4.997098212136045e-05, + "loss": 6.173, + "step": 2580 + }, + { + "epoch": 0.015349938148253878, + "grad_norm": 2.146256446838379, + "learning_rate": 4.997095961815448e-05, + "loss": 6.2267, + "step": 2581 + }, + { + "epoch": 0.015355885431534875, + "grad_norm": 2.5691211223602295, + "learning_rate": 4.997093710623139e-05, + "loss": 6.3302, + "step": 2582 + }, + { + "epoch": 0.015361832714815872, + "grad_norm": 2.5439505577087402, + "learning_rate": 4.997091458559121e-05, + "loss": 6.2111, + "step": 2583 + }, + { + "epoch": 0.015367779998096869, + "grad_norm": 2.451582670211792, + "learning_rate": 4.997089205623394e-05, + "loss": 6.2369, + "step": 2584 + }, + { + "epoch": 0.015373727281377867, + "grad_norm": 2.6275687217712402, + "learning_rate": 4.99708695181596e-05, + "loss": 6.1104, + "step": 2585 + }, + { + "epoch": 0.015379674564658864, + "grad_norm": 2.7068562507629395, + "learning_rate": 4.997084697136818e-05, + "loss": 6.1646, + "step": 2586 + }, + { + "epoch": 0.015385621847939861, + "grad_norm": 2.7819957733154297, + "learning_rate": 4.9970824415859694e-05, + "loss": 6.4203, + "step": 2587 + }, + { + "epoch": 0.015391569131220858, + "grad_norm": 2.7021708488464355, + "learning_rate": 4.9970801851634154e-05, + "loss": 6.1535, + "step": 2588 + }, + { + "epoch": 0.015397516414501855, + "grad_norm": 2.50740909576416, + "learning_rate": 4.997077927869156e-05, + "loss": 6.0139, + "step": 2589 + }, + { + "epoch": 0.015403463697782853, + "grad_norm": 2.5769078731536865, + "learning_rate": 4.997075669703193e-05, + "loss": 6.129, + "step": 2590 + }, + { + "epoch": 0.01540941098106385, + "grad_norm": 2.7379090785980225, + "learning_rate": 4.997073410665526e-05, + "loss": 6.4168, + "step": 2591 + }, + { + "epoch": 0.015415358264344847, + "grad_norm": 2.3530659675598145, + "learning_rate": 4.9970711507561565e-05, + "loss": 6.3114, + "step": 2592 + }, + { + "epoch": 0.015421305547625844, + "grad_norm": 2.6025893688201904, + "learning_rate": 4.997068889975086e-05, + "loss": 6.2506, + "step": 2593 + }, + { + "epoch": 0.015427252830906842, + "grad_norm": 2.311833143234253, + "learning_rate": 4.9970666283223145e-05, + "loss": 6.3372, + "step": 2594 + }, + { + "epoch": 0.015433200114187839, + "grad_norm": 2.339947462081909, + "learning_rate": 4.997064365797842e-05, + "loss": 6.2987, + "step": 2595 + }, + { + "epoch": 0.015439147397468836, + "grad_norm": 2.2132725715637207, + "learning_rate": 4.9970621024016714e-05, + "loss": 6.2473, + "step": 2596 + }, + { + "epoch": 0.015445094680749833, + "grad_norm": 2.7063987255096436, + "learning_rate": 4.9970598381338014e-05, + "loss": 6.1702, + "step": 2597 + }, + { + "epoch": 0.015451041964030831, + "grad_norm": 2.4952430725097656, + "learning_rate": 4.9970575729942335e-05, + "loss": 6.3301, + "step": 2598 + }, + { + "epoch": 0.015456989247311828, + "grad_norm": 2.7442502975463867, + "learning_rate": 4.997055306982969e-05, + "loss": 6.1922, + "step": 2599 + }, + { + "epoch": 0.015462936530592825, + "grad_norm": 2.860058069229126, + "learning_rate": 4.997053040100008e-05, + "loss": 6.0674, + "step": 2600 + }, + { + "epoch": 0.015468883813873822, + "grad_norm": 2.821620464324951, + "learning_rate": 4.997050772345352e-05, + "loss": 6.0445, + "step": 2601 + }, + { + "epoch": 0.01547483109715482, + "grad_norm": 2.369174003601074, + "learning_rate": 4.997048503719001e-05, + "loss": 5.8641, + "step": 2602 + }, + { + "epoch": 0.015480778380435817, + "grad_norm": 2.2836029529571533, + "learning_rate": 4.997046234220956e-05, + "loss": 5.7629, + "step": 2603 + }, + { + "epoch": 0.015486725663716814, + "grad_norm": 3.13094162940979, + "learning_rate": 4.997043963851218e-05, + "loss": 6.7871, + "step": 2604 + }, + { + "epoch": 0.01549267294699781, + "grad_norm": 2.884119749069214, + "learning_rate": 4.9970416926097885e-05, + "loss": 6.1079, + "step": 2605 + }, + { + "epoch": 0.01549862023027881, + "grad_norm": 3.0921716690063477, + "learning_rate": 4.997039420496666e-05, + "loss": 5.9221, + "step": 2606 + }, + { + "epoch": 0.015504567513559806, + "grad_norm": 2.6903741359710693, + "learning_rate": 4.997037147511855e-05, + "loss": 5.7377, + "step": 2607 + }, + { + "epoch": 0.015510514796840803, + "grad_norm": 2.177030086517334, + "learning_rate": 4.997034873655352e-05, + "loss": 5.7272, + "step": 2608 + }, + { + "epoch": 0.0155164620801218, + "grad_norm": 2.41406512260437, + "learning_rate": 4.997032598927162e-05, + "loss": 5.6456, + "step": 2609 + }, + { + "epoch": 0.015522409363402798, + "grad_norm": 2.6853182315826416, + "learning_rate": 4.997030323327282e-05, + "loss": 6.1634, + "step": 2610 + }, + { + "epoch": 0.015528356646683795, + "grad_norm": 2.734081983566284, + "learning_rate": 4.997028046855715e-05, + "loss": 6.1366, + "step": 2611 + }, + { + "epoch": 0.015534303929964792, + "grad_norm": 2.234046459197998, + "learning_rate": 4.997025769512461e-05, + "loss": 5.6773, + "step": 2612 + }, + { + "epoch": 0.015540251213245789, + "grad_norm": 2.467381715774536, + "learning_rate": 4.9970234912975226e-05, + "loss": 5.6409, + "step": 2613 + }, + { + "epoch": 0.015546198496526787, + "grad_norm": 2.4890551567077637, + "learning_rate": 4.997021212210897e-05, + "loss": 5.5961, + "step": 2614 + }, + { + "epoch": 0.015552145779807784, + "grad_norm": 2.254138708114624, + "learning_rate": 4.997018932252588e-05, + "loss": 5.6039, + "step": 2615 + }, + { + "epoch": 0.015558093063088781, + "grad_norm": 2.5773816108703613, + "learning_rate": 4.9970166514225955e-05, + "loss": 5.9935, + "step": 2616 + }, + { + "epoch": 0.015564040346369778, + "grad_norm": 2.308300733566284, + "learning_rate": 4.997014369720921e-05, + "loss": 5.8307, + "step": 2617 + }, + { + "epoch": 0.015569987629650776, + "grad_norm": 2.3276724815368652, + "learning_rate": 4.9970120871475634e-05, + "loss": 5.5819, + "step": 2618 + }, + { + "epoch": 0.015575934912931773, + "grad_norm": 2.7989203929901123, + "learning_rate": 4.997009803702526e-05, + "loss": 6.0816, + "step": 2619 + }, + { + "epoch": 0.01558188219621277, + "grad_norm": 2.5614469051361084, + "learning_rate": 4.997007519385807e-05, + "loss": 5.6677, + "step": 2620 + }, + { + "epoch": 0.015587829479493767, + "grad_norm": 2.4494402408599854, + "learning_rate": 4.9970052341974096e-05, + "loss": 5.7754, + "step": 2621 + }, + { + "epoch": 0.015593776762774764, + "grad_norm": 2.214578151702881, + "learning_rate": 4.997002948137333e-05, + "loss": 6.4244, + "step": 2622 + }, + { + "epoch": 0.015599724046055762, + "grad_norm": 2.8115196228027344, + "learning_rate": 4.9970006612055776e-05, + "loss": 5.9822, + "step": 2623 + }, + { + "epoch": 0.015605671329336759, + "grad_norm": 2.4020626544952393, + "learning_rate": 4.996998373402146e-05, + "loss": 6.0481, + "step": 2624 + }, + { + "epoch": 0.015611618612617756, + "grad_norm": 2.3936421871185303, + "learning_rate": 4.996996084727038e-05, + "loss": 6.0663, + "step": 2625 + }, + { + "epoch": 0.015617565895898753, + "grad_norm": 2.2710554599761963, + "learning_rate": 4.996993795180254e-05, + "loss": 6.0668, + "step": 2626 + }, + { + "epoch": 0.015623513179179751, + "grad_norm": 2.141789436340332, + "learning_rate": 4.9969915047617955e-05, + "loss": 6.2159, + "step": 2627 + }, + { + "epoch": 0.015629460462460748, + "grad_norm": 2.557889461517334, + "learning_rate": 4.9969892134716635e-05, + "loss": 6.262, + "step": 2628 + }, + { + "epoch": 0.015635407745741747, + "grad_norm": 2.3966641426086426, + "learning_rate": 4.9969869213098574e-05, + "loss": 6.0412, + "step": 2629 + }, + { + "epoch": 0.01564135502902274, + "grad_norm": 2.301426410675049, + "learning_rate": 4.99698462827638e-05, + "loss": 6.0798, + "step": 2630 + }, + { + "epoch": 0.01564730231230374, + "grad_norm": 2.4315614700317383, + "learning_rate": 4.996982334371231e-05, + "loss": 5.8736, + "step": 2631 + }, + { + "epoch": 0.015653249595584735, + "grad_norm": 2.5549440383911133, + "learning_rate": 4.9969800395944105e-05, + "loss": 5.7858, + "step": 2632 + }, + { + "epoch": 0.015659196878865734, + "grad_norm": 2.480375289916992, + "learning_rate": 4.99697774394592e-05, + "loss": 6.3261, + "step": 2633 + }, + { + "epoch": 0.015665144162146732, + "grad_norm": 2.42866849899292, + "learning_rate": 4.9969754474257614e-05, + "loss": 6.1729, + "step": 2634 + }, + { + "epoch": 0.015671091445427728, + "grad_norm": 2.32722544670105, + "learning_rate": 4.9969731500339335e-05, + "loss": 5.7746, + "step": 2635 + }, + { + "epoch": 0.015677038728708726, + "grad_norm": 2.6797266006469727, + "learning_rate": 4.996970851770438e-05, + "loss": 6.1657, + "step": 2636 + }, + { + "epoch": 0.015682986011989725, + "grad_norm": 2.87758731842041, + "learning_rate": 4.9969685526352775e-05, + "loss": 6.1475, + "step": 2637 + }, + { + "epoch": 0.01568893329527072, + "grad_norm": 2.898663282394409, + "learning_rate": 4.996966252628449e-05, + "loss": 6.2942, + "step": 2638 + }, + { + "epoch": 0.01569488057855172, + "grad_norm": 3.3087987899780273, + "learning_rate": 4.996963951749957e-05, + "loss": 5.9962, + "step": 2639 + }, + { + "epoch": 0.015700827861832713, + "grad_norm": 2.4418020248413086, + "learning_rate": 4.996961649999799e-05, + "loss": 6.1065, + "step": 2640 + }, + { + "epoch": 0.015706775145113712, + "grad_norm": 2.5839014053344727, + "learning_rate": 4.9969593473779786e-05, + "loss": 6.2303, + "step": 2641 + }, + { + "epoch": 0.01571272242839471, + "grad_norm": 2.683163642883301, + "learning_rate": 4.996957043884495e-05, + "loss": 5.7194, + "step": 2642 + }, + { + "epoch": 0.015718669711675706, + "grad_norm": 2.628574848175049, + "learning_rate": 4.99695473951935e-05, + "loss": 5.6239, + "step": 2643 + }, + { + "epoch": 0.015724616994956704, + "grad_norm": 3.0716800689697266, + "learning_rate": 4.9969524342825434e-05, + "loss": 6.1957, + "step": 2644 + }, + { + "epoch": 0.015730564278237703, + "grad_norm": 2.415626287460327, + "learning_rate": 4.996950128174077e-05, + "loss": 6.2953, + "step": 2645 + }, + { + "epoch": 0.015736511561518698, + "grad_norm": 2.6836612224578857, + "learning_rate": 4.996947821193951e-05, + "loss": 6.103, + "step": 2646 + }, + { + "epoch": 0.015742458844799696, + "grad_norm": 2.2673206329345703, + "learning_rate": 4.996945513342166e-05, + "loss": 6.2628, + "step": 2647 + }, + { + "epoch": 0.01574840612808069, + "grad_norm": 2.629955530166626, + "learning_rate": 4.996943204618724e-05, + "loss": 6.2444, + "step": 2648 + }, + { + "epoch": 0.01575435341136169, + "grad_norm": 2.6730127334594727, + "learning_rate": 4.996940895023623e-05, + "loss": 6.0595, + "step": 2649 + }, + { + "epoch": 0.01576030069464269, + "grad_norm": 2.607389450073242, + "learning_rate": 4.996938584556867e-05, + "loss": 6.0253, + "step": 2650 + }, + { + "epoch": 0.015766247977923684, + "grad_norm": 2.264345407485962, + "learning_rate": 4.996936273218456e-05, + "loss": 6.1011, + "step": 2651 + }, + { + "epoch": 0.015772195261204682, + "grad_norm": 2.218766450881958, + "learning_rate": 4.99693396100839e-05, + "loss": 6.0545, + "step": 2652 + }, + { + "epoch": 0.015778142544485677, + "grad_norm": 2.435213088989258, + "learning_rate": 4.99693164792667e-05, + "loss": 6.0679, + "step": 2653 + }, + { + "epoch": 0.015784089827766676, + "grad_norm": 2.2278120517730713, + "learning_rate": 4.996929333973297e-05, + "loss": 6.0864, + "step": 2654 + }, + { + "epoch": 0.015790037111047674, + "grad_norm": 1.983554482460022, + "learning_rate": 4.9969270191482715e-05, + "loss": 6.124, + "step": 2655 + }, + { + "epoch": 0.01579598439432867, + "grad_norm": 1.9382312297821045, + "learning_rate": 4.996924703451594e-05, + "loss": 6.392, + "step": 2656 + }, + { + "epoch": 0.015801931677609668, + "grad_norm": 2.8142831325531006, + "learning_rate": 4.9969223868832674e-05, + "loss": 6.017, + "step": 2657 + }, + { + "epoch": 0.015807878960890667, + "grad_norm": 2.3466787338256836, + "learning_rate": 4.9969200694432904e-05, + "loss": 5.9588, + "step": 2658 + }, + { + "epoch": 0.01581382624417166, + "grad_norm": 2.0172243118286133, + "learning_rate": 4.996917751131664e-05, + "loss": 5.9513, + "step": 2659 + }, + { + "epoch": 0.01581977352745266, + "grad_norm": 2.3778223991394043, + "learning_rate": 4.99691543194839e-05, + "loss": 6.2205, + "step": 2660 + }, + { + "epoch": 0.015825720810733655, + "grad_norm": 2.4351084232330322, + "learning_rate": 4.9969131118934675e-05, + "loss": 6.0916, + "step": 2661 + }, + { + "epoch": 0.015831668094014654, + "grad_norm": 2.22328519821167, + "learning_rate": 4.9969107909669e-05, + "loss": 6.5546, + "step": 2662 + }, + { + "epoch": 0.015837615377295652, + "grad_norm": 2.4626407623291016, + "learning_rate": 4.996908469168685e-05, + "loss": 6.522, + "step": 2663 + }, + { + "epoch": 0.015843562660576647, + "grad_norm": 2.1032283306121826, + "learning_rate": 4.9969061464988266e-05, + "loss": 6.3372, + "step": 2664 + }, + { + "epoch": 0.015849509943857646, + "grad_norm": 2.1436524391174316, + "learning_rate": 4.9969038229573236e-05, + "loss": 6.3792, + "step": 2665 + }, + { + "epoch": 0.015855457227138645, + "grad_norm": 2.42084002494812, + "learning_rate": 4.996901498544176e-05, + "loss": 6.701, + "step": 2666 + }, + { + "epoch": 0.01586140451041964, + "grad_norm": 2.854630947113037, + "learning_rate": 4.996899173259388e-05, + "loss": 6.3273, + "step": 2667 + }, + { + "epoch": 0.015867351793700638, + "grad_norm": 2.2480521202087402, + "learning_rate": 4.996896847102957e-05, + "loss": 6.4314, + "step": 2668 + }, + { + "epoch": 0.015873299076981633, + "grad_norm": 3.7074203491210938, + "learning_rate": 4.996894520074886e-05, + "loss": 5.9438, + "step": 2669 + }, + { + "epoch": 0.015879246360262632, + "grad_norm": 3.1037209033966064, + "learning_rate": 4.9968921921751735e-05, + "loss": 5.7915, + "step": 2670 + }, + { + "epoch": 0.01588519364354363, + "grad_norm": 2.8338170051574707, + "learning_rate": 4.996889863403823e-05, + "loss": 6.7765, + "step": 2671 + }, + { + "epoch": 0.015891140926824626, + "grad_norm": 2.6366934776306152, + "learning_rate": 4.996887533760833e-05, + "loss": 6.8019, + "step": 2672 + }, + { + "epoch": 0.015897088210105624, + "grad_norm": 2.3954126834869385, + "learning_rate": 4.996885203246207e-05, + "loss": 6.3946, + "step": 2673 + }, + { + "epoch": 0.015903035493386623, + "grad_norm": 2.5771238803863525, + "learning_rate": 4.996882871859943e-05, + "loss": 6.3767, + "step": 2674 + }, + { + "epoch": 0.015908982776667618, + "grad_norm": 3.8544304370880127, + "learning_rate": 4.9968805396020424e-05, + "loss": 7.0813, + "step": 2675 + }, + { + "epoch": 0.015914930059948616, + "grad_norm": 3.4221606254577637, + "learning_rate": 4.996878206472507e-05, + "loss": 6.4782, + "step": 2676 + }, + { + "epoch": 0.01592087734322961, + "grad_norm": 3.6425843238830566, + "learning_rate": 4.996875872471338e-05, + "loss": 5.8685, + "step": 2677 + }, + { + "epoch": 0.01592682462651061, + "grad_norm": 3.255345344543457, + "learning_rate": 4.996873537598535e-05, + "loss": 5.7099, + "step": 2678 + }, + { + "epoch": 0.01593277190979161, + "grad_norm": 2.5217175483703613, + "learning_rate": 4.9968712018540997e-05, + "loss": 5.8978, + "step": 2679 + }, + { + "epoch": 0.015938719193072604, + "grad_norm": 2.2415871620178223, + "learning_rate": 4.996868865238031e-05, + "loss": 6.8186, + "step": 2680 + }, + { + "epoch": 0.015944666476353602, + "grad_norm": 2.1412270069122314, + "learning_rate": 4.996866527750332e-05, + "loss": 6.8056, + "step": 2681 + }, + { + "epoch": 0.015950613759634597, + "grad_norm": 2.423093557357788, + "learning_rate": 4.996864189391004e-05, + "loss": 7.0769, + "step": 2682 + }, + { + "epoch": 0.015956561042915596, + "grad_norm": 2.2334039211273193, + "learning_rate": 4.9968618501600454e-05, + "loss": 6.9954, + "step": 2683 + }, + { + "epoch": 0.015962508326196594, + "grad_norm": 2.4311838150024414, + "learning_rate": 4.996859510057458e-05, + "loss": 6.8375, + "step": 2684 + }, + { + "epoch": 0.01596845560947759, + "grad_norm": 4.861137866973877, + "learning_rate": 4.996857169083242e-05, + "loss": 6.2628, + "step": 2685 + }, + { + "epoch": 0.015974402892758588, + "grad_norm": 3.064213991165161, + "learning_rate": 4.996854827237401e-05, + "loss": 6.4316, + "step": 2686 + }, + { + "epoch": 0.015980350176039586, + "grad_norm": 2.307011365890503, + "learning_rate": 4.996852484519932e-05, + "loss": 6.6212, + "step": 2687 + }, + { + "epoch": 0.01598629745932058, + "grad_norm": 2.5157034397125244, + "learning_rate": 4.9968501409308374e-05, + "loss": 7.153, + "step": 2688 + }, + { + "epoch": 0.01599224474260158, + "grad_norm": 2.4122424125671387, + "learning_rate": 4.996847796470119e-05, + "loss": 7.2244, + "step": 2689 + }, + { + "epoch": 0.015998192025882575, + "grad_norm": 2.305055618286133, + "learning_rate": 4.9968454511377773e-05, + "loss": 7.4751, + "step": 2690 + }, + { + "epoch": 0.016004139309163574, + "grad_norm": 3.068027973175049, + "learning_rate": 4.9968431049338116e-05, + "loss": 6.5709, + "step": 2691 + }, + { + "epoch": 0.016010086592444572, + "grad_norm": 2.09893798828125, + "learning_rate": 4.9968407578582246e-05, + "loss": 6.7212, + "step": 2692 + }, + { + "epoch": 0.016016033875725567, + "grad_norm": 2.3161933422088623, + "learning_rate": 4.9968384099110163e-05, + "loss": 6.6243, + "step": 2693 + }, + { + "epoch": 0.016021981159006566, + "grad_norm": 2.913304090499878, + "learning_rate": 4.9968360610921874e-05, + "loss": 6.1946, + "step": 2694 + }, + { + "epoch": 0.016027928442287565, + "grad_norm": 2.746368408203125, + "learning_rate": 4.9968337114017386e-05, + "loss": 6.3783, + "step": 2695 + }, + { + "epoch": 0.01603387572556856, + "grad_norm": 2.40331768989563, + "learning_rate": 4.9968313608396705e-05, + "loss": 6.9898, + "step": 2696 + }, + { + "epoch": 0.016039823008849558, + "grad_norm": 2.214869976043701, + "learning_rate": 4.9968290094059844e-05, + "loss": 6.4497, + "step": 2697 + }, + { + "epoch": 0.016045770292130553, + "grad_norm": 2.050436019897461, + "learning_rate": 4.996826657100682e-05, + "loss": 6.8897, + "step": 2698 + }, + { + "epoch": 0.016051717575411552, + "grad_norm": 2.294149398803711, + "learning_rate": 4.996824303923763e-05, + "loss": 6.5583, + "step": 2699 + }, + { + "epoch": 0.01605766485869255, + "grad_norm": 2.26918625831604, + "learning_rate": 4.996821949875228e-05, + "loss": 6.7411, + "step": 2700 + }, + { + "epoch": 0.016063612141973545, + "grad_norm": 2.1330158710479736, + "learning_rate": 4.9968195949550775e-05, + "loss": 6.8068, + "step": 2701 + }, + { + "epoch": 0.016069559425254544, + "grad_norm": 1.8605769872665405, + "learning_rate": 4.996817239163315e-05, + "loss": 6.4833, + "step": 2702 + }, + { + "epoch": 0.016075506708535543, + "grad_norm": 3.132803440093994, + "learning_rate": 4.996814882499938e-05, + "loss": 5.8281, + "step": 2703 + }, + { + "epoch": 0.016081453991816538, + "grad_norm": 3.1079390048980713, + "learning_rate": 4.996812524964949e-05, + "loss": 5.6894, + "step": 2704 + }, + { + "epoch": 0.016087401275097536, + "grad_norm": 2.2877023220062256, + "learning_rate": 4.996810166558349e-05, + "loss": 7.0128, + "step": 2705 + }, + { + "epoch": 0.01609334855837853, + "grad_norm": 2.415696859359741, + "learning_rate": 4.996807807280138e-05, + "loss": 6.8098, + "step": 2706 + }, + { + "epoch": 0.01609929584165953, + "grad_norm": 2.342111110687256, + "learning_rate": 4.996805447130317e-05, + "loss": 7.2452, + "step": 2707 + }, + { + "epoch": 0.01610524312494053, + "grad_norm": 2.6504852771759033, + "learning_rate": 4.996803086108887e-05, + "loss": 6.6731, + "step": 2708 + }, + { + "epoch": 0.016111190408221523, + "grad_norm": 2.6157166957855225, + "learning_rate": 4.996800724215849e-05, + "loss": 6.9377, + "step": 2709 + }, + { + "epoch": 0.016117137691502522, + "grad_norm": 2.6289443969726562, + "learning_rate": 4.9967983614512036e-05, + "loss": 6.639, + "step": 2710 + }, + { + "epoch": 0.01612308497478352, + "grad_norm": 2.966489791870117, + "learning_rate": 4.996795997814952e-05, + "loss": 6.3681, + "step": 2711 + }, + { + "epoch": 0.016129032258064516, + "grad_norm": 3.7333364486694336, + "learning_rate": 4.9967936333070944e-05, + "loss": 5.6015, + "step": 2712 + }, + { + "epoch": 0.016134979541345514, + "grad_norm": 2.942728281021118, + "learning_rate": 4.9967912679276316e-05, + "loss": 5.6548, + "step": 2713 + }, + { + "epoch": 0.01614092682462651, + "grad_norm": 2.394622802734375, + "learning_rate": 4.996788901676566e-05, + "loss": 6.5119, + "step": 2714 + }, + { + "epoch": 0.016146874107907508, + "grad_norm": 2.8388447761535645, + "learning_rate": 4.9967865345538963e-05, + "loss": 6.4424, + "step": 2715 + }, + { + "epoch": 0.016152821391188506, + "grad_norm": 2.7682905197143555, + "learning_rate": 4.9967841665596245e-05, + "loss": 6.4688, + "step": 2716 + }, + { + "epoch": 0.0161587686744695, + "grad_norm": 3.0281460285186768, + "learning_rate": 4.996781797693751e-05, + "loss": 6.52, + "step": 2717 + }, + { + "epoch": 0.0161647159577505, + "grad_norm": 2.9734318256378174, + "learning_rate": 4.996779427956276e-05, + "loss": 6.4307, + "step": 2718 + }, + { + "epoch": 0.016170663241031495, + "grad_norm": 2.7653586864471436, + "learning_rate": 4.996777057347202e-05, + "loss": 6.1783, + "step": 2719 + }, + { + "epoch": 0.016176610524312494, + "grad_norm": 2.9418516159057617, + "learning_rate": 4.996774685866529e-05, + "loss": 6.5466, + "step": 2720 + }, + { + "epoch": 0.016182557807593492, + "grad_norm": 2.789217233657837, + "learning_rate": 4.996772313514258e-05, + "loss": 6.9296, + "step": 2721 + }, + { + "epoch": 0.016188505090874487, + "grad_norm": 2.8092539310455322, + "learning_rate": 4.996769940290389e-05, + "loss": 6.6186, + "step": 2722 + }, + { + "epoch": 0.016194452374155486, + "grad_norm": 2.696572780609131, + "learning_rate": 4.996767566194923e-05, + "loss": 6.5361, + "step": 2723 + }, + { + "epoch": 0.016200399657436484, + "grad_norm": 2.5987300872802734, + "learning_rate": 4.996765191227862e-05, + "loss": 6.4029, + "step": 2724 + }, + { + "epoch": 0.01620634694071748, + "grad_norm": 2.083057165145874, + "learning_rate": 4.996762815389205e-05, + "loss": 6.4747, + "step": 2725 + }, + { + "epoch": 0.016212294223998478, + "grad_norm": 2.912338972091675, + "learning_rate": 4.9967604386789555e-05, + "loss": 6.8869, + "step": 2726 + }, + { + "epoch": 0.016218241507279473, + "grad_norm": 2.642224073410034, + "learning_rate": 4.9967580610971124e-05, + "loss": 6.6701, + "step": 2727 + }, + { + "epoch": 0.016224188790560472, + "grad_norm": 2.673652410507202, + "learning_rate": 4.996755682643676e-05, + "loss": 6.8624, + "step": 2728 + }, + { + "epoch": 0.01623013607384147, + "grad_norm": 2.5223872661590576, + "learning_rate": 4.996753303318648e-05, + "loss": 6.8247, + "step": 2729 + }, + { + "epoch": 0.016236083357122465, + "grad_norm": 2.252037525177002, + "learning_rate": 4.99675092312203e-05, + "loss": 6.7924, + "step": 2730 + }, + { + "epoch": 0.016242030640403464, + "grad_norm": 2.2854461669921875, + "learning_rate": 4.9967485420538216e-05, + "loss": 6.4761, + "step": 2731 + }, + { + "epoch": 0.016247977923684463, + "grad_norm": 2.426912546157837, + "learning_rate": 4.9967461601140244e-05, + "loss": 6.6028, + "step": 2732 + }, + { + "epoch": 0.016253925206965458, + "grad_norm": 2.7375681400299072, + "learning_rate": 4.9967437773026384e-05, + "loss": 6.5283, + "step": 2733 + }, + { + "epoch": 0.016259872490246456, + "grad_norm": 2.7669689655303955, + "learning_rate": 4.996741393619665e-05, + "loss": 6.4382, + "step": 2734 + }, + { + "epoch": 0.01626581977352745, + "grad_norm": 2.294597864151001, + "learning_rate": 4.996739009065105e-05, + "loss": 6.7479, + "step": 2735 + }, + { + "epoch": 0.01627176705680845, + "grad_norm": 2.4791014194488525, + "learning_rate": 4.996736623638959e-05, + "loss": 6.7043, + "step": 2736 + }, + { + "epoch": 0.01627771434008945, + "grad_norm": 2.4080021381378174, + "learning_rate": 4.9967342373412286e-05, + "loss": 6.6046, + "step": 2737 + }, + { + "epoch": 0.016283661623370443, + "grad_norm": 2.463109254837036, + "learning_rate": 4.996731850171914e-05, + "loss": 6.3895, + "step": 2738 + }, + { + "epoch": 0.016289608906651442, + "grad_norm": 2.665908098220825, + "learning_rate": 4.9967294621310155e-05, + "loss": 6.6482, + "step": 2739 + }, + { + "epoch": 0.01629555618993244, + "grad_norm": 2.399526357650757, + "learning_rate": 4.996727073218536e-05, + "loss": 6.7098, + "step": 2740 + }, + { + "epoch": 0.016301503473213436, + "grad_norm": 2.678091287612915, + "learning_rate": 4.996724683434473e-05, + "loss": 6.419, + "step": 2741 + }, + { + "epoch": 0.016307450756494434, + "grad_norm": 2.5573642253875732, + "learning_rate": 4.99672229277883e-05, + "loss": 6.4703, + "step": 2742 + }, + { + "epoch": 0.01631339803977543, + "grad_norm": 2.644097089767456, + "learning_rate": 4.996719901251607e-05, + "loss": 5.9854, + "step": 2743 + }, + { + "epoch": 0.016319345323056428, + "grad_norm": 2.6165592670440674, + "learning_rate": 4.996717508852805e-05, + "loss": 6.1776, + "step": 2744 + }, + { + "epoch": 0.016325292606337426, + "grad_norm": 2.175647020339966, + "learning_rate": 4.996715115582426e-05, + "loss": 6.5533, + "step": 2745 + }, + { + "epoch": 0.01633123988961842, + "grad_norm": 2.112217664718628, + "learning_rate": 4.996712721440467e-05, + "loss": 6.5572, + "step": 2746 + }, + { + "epoch": 0.01633718717289942, + "grad_norm": 2.165111541748047, + "learning_rate": 4.996710326426933e-05, + "loss": 6.2798, + "step": 2747 + }, + { + "epoch": 0.016343134456180415, + "grad_norm": 2.5812315940856934, + "learning_rate": 4.996707930541823e-05, + "loss": 6.0831, + "step": 2748 + }, + { + "epoch": 0.016349081739461414, + "grad_norm": 2.2306227684020996, + "learning_rate": 4.996705533785138e-05, + "loss": 6.5833, + "step": 2749 + }, + { + "epoch": 0.016355029022742412, + "grad_norm": 1.999974250793457, + "learning_rate": 4.996703136156878e-05, + "loss": 6.2461, + "step": 2750 + }, + { + "epoch": 0.016360976306023407, + "grad_norm": 2.0521416664123535, + "learning_rate": 4.996700737657046e-05, + "loss": 6.4606, + "step": 2751 + }, + { + "epoch": 0.016366923589304406, + "grad_norm": 1.8630053997039795, + "learning_rate": 4.996698338285642e-05, + "loss": 6.1375, + "step": 2752 + }, + { + "epoch": 0.016372870872585404, + "grad_norm": 1.7525913715362549, + "learning_rate": 4.9966959380426646e-05, + "loss": 6.1769, + "step": 2753 + }, + { + "epoch": 0.0163788181558664, + "grad_norm": 2.8151230812072754, + "learning_rate": 4.996693536928118e-05, + "loss": 5.9066, + "step": 2754 + }, + { + "epoch": 0.016384765439147398, + "grad_norm": 2.503230571746826, + "learning_rate": 4.9966911349420004e-05, + "loss": 6.3725, + "step": 2755 + }, + { + "epoch": 0.016390712722428393, + "grad_norm": 2.676284074783325, + "learning_rate": 4.996688732084314e-05, + "loss": 6.9086, + "step": 2756 + }, + { + "epoch": 0.01639666000570939, + "grad_norm": 2.3367252349853516, + "learning_rate": 4.99668632835506e-05, + "loss": 6.1323, + "step": 2757 + }, + { + "epoch": 0.01640260728899039, + "grad_norm": 3.3071084022521973, + "learning_rate": 4.996683923754237e-05, + "loss": 6.162, + "step": 2758 + }, + { + "epoch": 0.016408554572271385, + "grad_norm": 2.64388370513916, + "learning_rate": 4.9966815182818494e-05, + "loss": 6.171, + "step": 2759 + }, + { + "epoch": 0.016414501855552384, + "grad_norm": 2.2378199100494385, + "learning_rate": 4.996679111937895e-05, + "loss": 6.4466, + "step": 2760 + }, + { + "epoch": 0.016420449138833382, + "grad_norm": 2.5944395065307617, + "learning_rate": 4.996676704722376e-05, + "loss": 6.7034, + "step": 2761 + }, + { + "epoch": 0.016426396422114378, + "grad_norm": 2.768211841583252, + "learning_rate": 4.996674296635293e-05, + "loss": 6.7551, + "step": 2762 + }, + { + "epoch": 0.016432343705395376, + "grad_norm": 2.80188250541687, + "learning_rate": 4.9966718876766467e-05, + "loss": 6.8437, + "step": 2763 + }, + { + "epoch": 0.01643829098867637, + "grad_norm": 2.2422847747802734, + "learning_rate": 4.996669477846438e-05, + "loss": 6.5365, + "step": 2764 + }, + { + "epoch": 0.01644423827195737, + "grad_norm": 2.526724100112915, + "learning_rate": 4.996667067144668e-05, + "loss": 6.3735, + "step": 2765 + }, + { + "epoch": 0.01645018555523837, + "grad_norm": 3.2267372608184814, + "learning_rate": 4.996664655571337e-05, + "loss": 6.0508, + "step": 2766 + }, + { + "epoch": 0.016456132838519363, + "grad_norm": 3.393270969390869, + "learning_rate": 4.996662243126446e-05, + "loss": 6.5543, + "step": 2767 + }, + { + "epoch": 0.016462080121800362, + "grad_norm": 2.7712342739105225, + "learning_rate": 4.996659829809996e-05, + "loss": 6.5891, + "step": 2768 + }, + { + "epoch": 0.01646802740508136, + "grad_norm": 2.5687179565429688, + "learning_rate": 4.996657415621988e-05, + "loss": 6.464, + "step": 2769 + }, + { + "epoch": 0.016473974688362356, + "grad_norm": 3.059953451156616, + "learning_rate": 4.996655000562424e-05, + "loss": 6.4286, + "step": 2770 + }, + { + "epoch": 0.016479921971643354, + "grad_norm": 3.3729803562164307, + "learning_rate": 4.9966525846313015e-05, + "loss": 6.5937, + "step": 2771 + }, + { + "epoch": 0.01648586925492435, + "grad_norm": 2.907397985458374, + "learning_rate": 4.996650167828624e-05, + "loss": 6.2559, + "step": 2772 + }, + { + "epoch": 0.016491816538205348, + "grad_norm": 3.5011706352233887, + "learning_rate": 4.996647750154392e-05, + "loss": 5.7897, + "step": 2773 + }, + { + "epoch": 0.016497763821486346, + "grad_norm": 2.5495986938476562, + "learning_rate": 4.996645331608607e-05, + "loss": 6.688, + "step": 2774 + }, + { + "epoch": 0.01650371110476734, + "grad_norm": 2.486416816711426, + "learning_rate": 4.9966429121912675e-05, + "loss": 6.8169, + "step": 2775 + }, + { + "epoch": 0.01650965838804834, + "grad_norm": 2.272162437438965, + "learning_rate": 4.9966404919023755e-05, + "loss": 6.696, + "step": 2776 + }, + { + "epoch": 0.016515605671329335, + "grad_norm": 2.9408323764801025, + "learning_rate": 4.9966380707419334e-05, + "loss": 6.1711, + "step": 2777 + }, + { + "epoch": 0.016521552954610334, + "grad_norm": 3.361907958984375, + "learning_rate": 4.99663564870994e-05, + "loss": 5.6029, + "step": 2778 + }, + { + "epoch": 0.016527500237891332, + "grad_norm": 3.06835675239563, + "learning_rate": 4.996633225806397e-05, + "loss": 5.332, + "step": 2779 + }, + { + "epoch": 0.016533447521172327, + "grad_norm": 3.058638572692871, + "learning_rate": 4.9966308020313054e-05, + "loss": 6.3345, + "step": 2780 + }, + { + "epoch": 0.016539394804453326, + "grad_norm": 2.8265507221221924, + "learning_rate": 4.9966283773846654e-05, + "loss": 5.4231, + "step": 2781 + }, + { + "epoch": 0.016545342087734324, + "grad_norm": 3.128094434738159, + "learning_rate": 4.996625951866478e-05, + "loss": 5.4144, + "step": 2782 + }, + { + "epoch": 0.01655128937101532, + "grad_norm": 2.6830554008483887, + "learning_rate": 4.9966235254767445e-05, + "loss": 6.0084, + "step": 2783 + }, + { + "epoch": 0.016557236654296318, + "grad_norm": 2.7146122455596924, + "learning_rate": 4.996621098215466e-05, + "loss": 6.7104, + "step": 2784 + }, + { + "epoch": 0.016563183937577313, + "grad_norm": 3.518169403076172, + "learning_rate": 4.9966186700826425e-05, + "loss": 5.4509, + "step": 2785 + }, + { + "epoch": 0.01656913122085831, + "grad_norm": 2.7607035636901855, + "learning_rate": 4.9966162410782755e-05, + "loss": 6.2149, + "step": 2786 + }, + { + "epoch": 0.01657507850413931, + "grad_norm": 2.897862195968628, + "learning_rate": 4.996613811202365e-05, + "loss": 6.4713, + "step": 2787 + }, + { + "epoch": 0.016581025787420305, + "grad_norm": 2.6984574794769287, + "learning_rate": 4.9966113804549134e-05, + "loss": 6.2298, + "step": 2788 + }, + { + "epoch": 0.016586973070701304, + "grad_norm": 2.7281908988952637, + "learning_rate": 4.996608948835919e-05, + "loss": 6.0244, + "step": 2789 + }, + { + "epoch": 0.016592920353982302, + "grad_norm": 2.314769983291626, + "learning_rate": 4.996606516345386e-05, + "loss": 6.8523, + "step": 2790 + }, + { + "epoch": 0.016598867637263297, + "grad_norm": 2.887943744659424, + "learning_rate": 4.9966040829833115e-05, + "loss": 6.8407, + "step": 2791 + }, + { + "epoch": 0.016604814920544296, + "grad_norm": 3.4924309253692627, + "learning_rate": 4.9966016487497e-05, + "loss": 6.3646, + "step": 2792 + }, + { + "epoch": 0.01661076220382529, + "grad_norm": 2.3095340728759766, + "learning_rate": 4.9965992136445495e-05, + "loss": 6.407, + "step": 2793 + }, + { + "epoch": 0.01661670948710629, + "grad_norm": 3.771980047225952, + "learning_rate": 4.9965967776678627e-05, + "loss": 6.0596, + "step": 2794 + }, + { + "epoch": 0.016622656770387288, + "grad_norm": 3.452252149581909, + "learning_rate": 4.99659434081964e-05, + "loss": 6.1351, + "step": 2795 + }, + { + "epoch": 0.016628604053668283, + "grad_norm": 2.4391021728515625, + "learning_rate": 4.996591903099881e-05, + "loss": 6.3304, + "step": 2796 + }, + { + "epoch": 0.016634551336949282, + "grad_norm": 2.7057220935821533, + "learning_rate": 4.9965894645085885e-05, + "loss": 6.8328, + "step": 2797 + }, + { + "epoch": 0.01664049862023028, + "grad_norm": 2.392627716064453, + "learning_rate": 4.996587025045762e-05, + "loss": 6.8491, + "step": 2798 + }, + { + "epoch": 0.016646445903511276, + "grad_norm": 2.47928786277771, + "learning_rate": 4.9965845847114024e-05, + "loss": 6.6323, + "step": 2799 + }, + { + "epoch": 0.016652393186792274, + "grad_norm": 2.438870668411255, + "learning_rate": 4.9965821435055115e-05, + "loss": 6.3832, + "step": 2800 + }, + { + "epoch": 0.01665834047007327, + "grad_norm": 2.6875247955322266, + "learning_rate": 4.9965797014280895e-05, + "loss": 6.6994, + "step": 2801 + }, + { + "epoch": 0.016664287753354268, + "grad_norm": 2.71785044670105, + "learning_rate": 4.996577258479137e-05, + "loss": 6.2505, + "step": 2802 + }, + { + "epoch": 0.016670235036635266, + "grad_norm": 2.32853102684021, + "learning_rate": 4.996574814658655e-05, + "loss": 6.4409, + "step": 2803 + }, + { + "epoch": 0.01667618231991626, + "grad_norm": 2.271027088165283, + "learning_rate": 4.996572369966646e-05, + "loss": 6.4928, + "step": 2804 + }, + { + "epoch": 0.01668212960319726, + "grad_norm": 2.621448278427124, + "learning_rate": 4.996569924403108e-05, + "loss": 6.7248, + "step": 2805 + }, + { + "epoch": 0.01668807688647826, + "grad_norm": 3.621654748916626, + "learning_rate": 4.9965674779680435e-05, + "loss": 6.7268, + "step": 2806 + }, + { + "epoch": 0.016694024169759254, + "grad_norm": 2.2045094966888428, + "learning_rate": 4.9965650306614534e-05, + "loss": 6.6406, + "step": 2807 + }, + { + "epoch": 0.016699971453040252, + "grad_norm": 2.4885873794555664, + "learning_rate": 4.9965625824833376e-05, + "loss": 6.611, + "step": 2808 + }, + { + "epoch": 0.016705918736321247, + "grad_norm": 2.796971082687378, + "learning_rate": 4.996560133433697e-05, + "loss": 6.455, + "step": 2809 + }, + { + "epoch": 0.016711866019602246, + "grad_norm": 2.539395570755005, + "learning_rate": 4.996557683512535e-05, + "loss": 6.8169, + "step": 2810 + }, + { + "epoch": 0.016717813302883244, + "grad_norm": 2.322824239730835, + "learning_rate": 4.99655523271985e-05, + "loss": 6.3217, + "step": 2811 + }, + { + "epoch": 0.01672376058616424, + "grad_norm": 2.4404520988464355, + "learning_rate": 4.9965527810556424e-05, + "loss": 6.5026, + "step": 2812 + }, + { + "epoch": 0.016729707869445238, + "grad_norm": 2.287362575531006, + "learning_rate": 4.996550328519915e-05, + "loss": 6.9183, + "step": 2813 + }, + { + "epoch": 0.016735655152726233, + "grad_norm": 2.369877815246582, + "learning_rate": 4.996547875112667e-05, + "loss": 6.7488, + "step": 2814 + }, + { + "epoch": 0.01674160243600723, + "grad_norm": 2.323082685470581, + "learning_rate": 4.996545420833899e-05, + "loss": 6.6177, + "step": 2815 + }, + { + "epoch": 0.01674754971928823, + "grad_norm": 2.221214532852173, + "learning_rate": 4.9965429656836145e-05, + "loss": 6.6844, + "step": 2816 + }, + { + "epoch": 0.016753497002569225, + "grad_norm": 2.246819496154785, + "learning_rate": 4.9965405096618116e-05, + "loss": 6.5631, + "step": 2817 + }, + { + "epoch": 0.016759444285850224, + "grad_norm": 2.411806583404541, + "learning_rate": 4.996538052768493e-05, + "loss": 6.4037, + "step": 2818 + }, + { + "epoch": 0.016765391569131222, + "grad_norm": 1.941197395324707, + "learning_rate": 4.996535595003658e-05, + "loss": 6.5232, + "step": 2819 + }, + { + "epoch": 0.016771338852412217, + "grad_norm": 2.149991750717163, + "learning_rate": 4.996533136367309e-05, + "loss": 6.4166, + "step": 2820 + }, + { + "epoch": 0.016777286135693216, + "grad_norm": 2.5388433933258057, + "learning_rate": 4.9965306768594454e-05, + "loss": 6.5733, + "step": 2821 + }, + { + "epoch": 0.01678323341897421, + "grad_norm": 2.1857333183288574, + "learning_rate": 4.9965282164800694e-05, + "loss": 6.5558, + "step": 2822 + }, + { + "epoch": 0.01678918070225521, + "grad_norm": 2.1090164184570312, + "learning_rate": 4.9965257552291804e-05, + "loss": 6.6916, + "step": 2823 + }, + { + "epoch": 0.016795127985536208, + "grad_norm": 2.1102349758148193, + "learning_rate": 4.9965232931067806e-05, + "loss": 6.5852, + "step": 2824 + }, + { + "epoch": 0.016801075268817203, + "grad_norm": 2.384660005569458, + "learning_rate": 4.99652083011287e-05, + "loss": 6.5033, + "step": 2825 + }, + { + "epoch": 0.016807022552098202, + "grad_norm": 2.314896821975708, + "learning_rate": 4.9965183662474504e-05, + "loss": 6.4108, + "step": 2826 + }, + { + "epoch": 0.0168129698353792, + "grad_norm": 2.4358227252960205, + "learning_rate": 4.9965159015105215e-05, + "loss": 6.5309, + "step": 2827 + }, + { + "epoch": 0.016818917118660195, + "grad_norm": 2.179905652999878, + "learning_rate": 4.9965134359020844e-05, + "loss": 6.4593, + "step": 2828 + }, + { + "epoch": 0.016824864401941194, + "grad_norm": 2.2742464542388916, + "learning_rate": 4.99651096942214e-05, + "loss": 6.6654, + "step": 2829 + }, + { + "epoch": 0.01683081168522219, + "grad_norm": 2.211026668548584, + "learning_rate": 4.9965085020706906e-05, + "loss": 6.4527, + "step": 2830 + }, + { + "epoch": 0.016836758968503188, + "grad_norm": 2.552072763442993, + "learning_rate": 4.996506033847735e-05, + "loss": 6.5338, + "step": 2831 + }, + { + "epoch": 0.016842706251784186, + "grad_norm": 2.3208038806915283, + "learning_rate": 4.996503564753276e-05, + "loss": 6.473, + "step": 2832 + }, + { + "epoch": 0.01684865353506518, + "grad_norm": 2.3756048679351807, + "learning_rate": 4.996501094787312e-05, + "loss": 6.4223, + "step": 2833 + }, + { + "epoch": 0.01685460081834618, + "grad_norm": 2.386152982711792, + "learning_rate": 4.996498623949846e-05, + "loss": 6.317, + "step": 2834 + }, + { + "epoch": 0.01686054810162718, + "grad_norm": 2.144510507583618, + "learning_rate": 4.996496152240878e-05, + "loss": 6.4039, + "step": 2835 + }, + { + "epoch": 0.016866495384908173, + "grad_norm": 2.3362607955932617, + "learning_rate": 4.996493679660409e-05, + "loss": 6.5411, + "step": 2836 + }, + { + "epoch": 0.016872442668189172, + "grad_norm": 2.156428337097168, + "learning_rate": 4.9964912062084404e-05, + "loss": 6.3399, + "step": 2837 + }, + { + "epoch": 0.016878389951470167, + "grad_norm": 2.3429903984069824, + "learning_rate": 4.9964887318849715e-05, + "loss": 6.5159, + "step": 2838 + }, + { + "epoch": 0.016884337234751166, + "grad_norm": 2.1888442039489746, + "learning_rate": 4.9964862566900045e-05, + "loss": 6.3906, + "step": 2839 + }, + { + "epoch": 0.016890284518032164, + "grad_norm": 2.3973047733306885, + "learning_rate": 4.9964837806235396e-05, + "loss": 6.3452, + "step": 2840 + }, + { + "epoch": 0.01689623180131316, + "grad_norm": 2.232057809829712, + "learning_rate": 4.996481303685578e-05, + "loss": 6.5203, + "step": 2841 + }, + { + "epoch": 0.016902179084594158, + "grad_norm": 2.672342300415039, + "learning_rate": 4.996478825876122e-05, + "loss": 6.8615, + "step": 2842 + }, + { + "epoch": 0.016908126367875153, + "grad_norm": 2.603943347930908, + "learning_rate": 4.996476347195171e-05, + "loss": 7.1632, + "step": 2843 + }, + { + "epoch": 0.01691407365115615, + "grad_norm": 2.684616804122925, + "learning_rate": 4.9964738676427234e-05, + "loss": 6.5546, + "step": 2844 + }, + { + "epoch": 0.01692002093443715, + "grad_norm": 2.1103904247283936, + "learning_rate": 4.996471387218785e-05, + "loss": 6.4666, + "step": 2845 + }, + { + "epoch": 0.016925968217718145, + "grad_norm": 2.8278937339782715, + "learning_rate": 4.9964689059233525e-05, + "loss": 6.3685, + "step": 2846 + }, + { + "epoch": 0.016931915500999144, + "grad_norm": 3.2611489295959473, + "learning_rate": 4.9964664237564296e-05, + "loss": 6.5537, + "step": 2847 + }, + { + "epoch": 0.016937862784280142, + "grad_norm": 3.029353141784668, + "learning_rate": 4.9964639407180155e-05, + "loss": 6.6097, + "step": 2848 + }, + { + "epoch": 0.016943810067561137, + "grad_norm": 2.6735312938690186, + "learning_rate": 4.996461456808112e-05, + "loss": 6.5854, + "step": 2849 + }, + { + "epoch": 0.016949757350842136, + "grad_norm": 2.7619409561157227, + "learning_rate": 4.99645897202672e-05, + "loss": 6.5944, + "step": 2850 + }, + { + "epoch": 0.01695570463412313, + "grad_norm": 3.0398738384246826, + "learning_rate": 4.9964564863738396e-05, + "loss": 6.3804, + "step": 2851 + }, + { + "epoch": 0.01696165191740413, + "grad_norm": 3.5388784408569336, + "learning_rate": 4.996453999849472e-05, + "loss": 7.0993, + "step": 2852 + }, + { + "epoch": 0.016967599200685128, + "grad_norm": 2.3602113723754883, + "learning_rate": 4.9964515124536185e-05, + "loss": 6.4981, + "step": 2853 + }, + { + "epoch": 0.016973546483966123, + "grad_norm": 2.346632957458496, + "learning_rate": 4.996449024186278e-05, + "loss": 6.4892, + "step": 2854 + }, + { + "epoch": 0.016979493767247122, + "grad_norm": 2.9653544425964355, + "learning_rate": 4.996446535047454e-05, + "loss": 6.2772, + "step": 2855 + }, + { + "epoch": 0.01698544105052812, + "grad_norm": 3.1064538955688477, + "learning_rate": 4.996444045037147e-05, + "loss": 6.238, + "step": 2856 + }, + { + "epoch": 0.016991388333809115, + "grad_norm": 2.9617815017700195, + "learning_rate": 4.9964415541553564e-05, + "loss": 6.2991, + "step": 2857 + }, + { + "epoch": 0.016997335617090114, + "grad_norm": 2.5993905067443848, + "learning_rate": 4.996439062402084e-05, + "loss": 6.5482, + "step": 2858 + }, + { + "epoch": 0.01700328290037111, + "grad_norm": 2.5469226837158203, + "learning_rate": 4.996436569777331e-05, + "loss": 6.437, + "step": 2859 + }, + { + "epoch": 0.017009230183652108, + "grad_norm": 2.709184408187866, + "learning_rate": 4.9964340762810965e-05, + "loss": 6.1362, + "step": 2860 + }, + { + "epoch": 0.017015177466933106, + "grad_norm": 2.843942880630493, + "learning_rate": 4.9964315819133837e-05, + "loss": 6.2443, + "step": 2861 + }, + { + "epoch": 0.0170211247502141, + "grad_norm": 3.022735357284546, + "learning_rate": 4.9964290866741925e-05, + "loss": 6.3161, + "step": 2862 + }, + { + "epoch": 0.0170270720334951, + "grad_norm": 2.487271308898926, + "learning_rate": 4.996426590563523e-05, + "loss": 6.3352, + "step": 2863 + }, + { + "epoch": 0.0170330193167761, + "grad_norm": 2.624000072479248, + "learning_rate": 4.996424093581377e-05, + "loss": 6.3575, + "step": 2864 + }, + { + "epoch": 0.017038966600057093, + "grad_norm": 2.378368854522705, + "learning_rate": 4.996421595727756e-05, + "loss": 6.3284, + "step": 2865 + }, + { + "epoch": 0.017044913883338092, + "grad_norm": 2.6903984546661377, + "learning_rate": 4.996419097002659e-05, + "loss": 6.271, + "step": 2866 + }, + { + "epoch": 0.017050861166619087, + "grad_norm": 2.536391019821167, + "learning_rate": 4.9964165974060875e-05, + "loss": 6.1276, + "step": 2867 + }, + { + "epoch": 0.017056808449900086, + "grad_norm": 2.470395803451538, + "learning_rate": 4.9964140969380434e-05, + "loss": 6.1032, + "step": 2868 + }, + { + "epoch": 0.017062755733181084, + "grad_norm": 2.929818630218506, + "learning_rate": 4.996411595598528e-05, + "loss": 6.0994, + "step": 2869 + }, + { + "epoch": 0.01706870301646208, + "grad_norm": 2.548701763153076, + "learning_rate": 4.99640909338754e-05, + "loss": 6.2227, + "step": 2870 + }, + { + "epoch": 0.017074650299743078, + "grad_norm": 2.6044397354125977, + "learning_rate": 4.99640659030508e-05, + "loss": 6.0778, + "step": 2871 + }, + { + "epoch": 0.017080597583024073, + "grad_norm": 2.687392473220825, + "learning_rate": 4.996404086351153e-05, + "loss": 6.2975, + "step": 2872 + }, + { + "epoch": 0.01708654486630507, + "grad_norm": 2.740201711654663, + "learning_rate": 4.9964015815257556e-05, + "loss": 6.5955, + "step": 2873 + }, + { + "epoch": 0.01709249214958607, + "grad_norm": 2.605958938598633, + "learning_rate": 4.99639907582889e-05, + "loss": 6.2112, + "step": 2874 + }, + { + "epoch": 0.017098439432867065, + "grad_norm": 2.9691529273986816, + "learning_rate": 4.996396569260558e-05, + "loss": 6.1435, + "step": 2875 + }, + { + "epoch": 0.017104386716148064, + "grad_norm": 2.822201728820801, + "learning_rate": 4.9963940618207593e-05, + "loss": 6.1949, + "step": 2876 + }, + { + "epoch": 0.017110333999429062, + "grad_norm": 2.6231529712677, + "learning_rate": 4.996391553509495e-05, + "loss": 6.5082, + "step": 2877 + }, + { + "epoch": 0.017116281282710057, + "grad_norm": 2.6511785984039307, + "learning_rate": 4.9963890443267666e-05, + "loss": 6.4461, + "step": 2878 + }, + { + "epoch": 0.017122228565991056, + "grad_norm": 2.4790167808532715, + "learning_rate": 4.996386534272575e-05, + "loss": 6.4642, + "step": 2879 + }, + { + "epoch": 0.01712817584927205, + "grad_norm": 3.6982533931732178, + "learning_rate": 4.99638402334692e-05, + "loss": 6.2957, + "step": 2880 + }, + { + "epoch": 0.01713412313255305, + "grad_norm": 2.380385160446167, + "learning_rate": 4.996381511549804e-05, + "loss": 6.3174, + "step": 2881 + }, + { + "epoch": 0.017140070415834048, + "grad_norm": 2.425537347793579, + "learning_rate": 4.996378998881226e-05, + "loss": 6.2055, + "step": 2882 + }, + { + "epoch": 0.017146017699115043, + "grad_norm": 2.4667842388153076, + "learning_rate": 4.996376485341188e-05, + "loss": 6.245, + "step": 2883 + }, + { + "epoch": 0.01715196498239604, + "grad_norm": 2.6306424140930176, + "learning_rate": 4.996373970929691e-05, + "loss": 6.1162, + "step": 2884 + }, + { + "epoch": 0.01715791226567704, + "grad_norm": 4.439255714416504, + "learning_rate": 4.996371455646736e-05, + "loss": 5.9868, + "step": 2885 + }, + { + "epoch": 0.017163859548958035, + "grad_norm": 3.3248472213745117, + "learning_rate": 4.9963689394923224e-05, + "loss": 5.861, + "step": 2886 + }, + { + "epoch": 0.017169806832239034, + "grad_norm": 2.45271897315979, + "learning_rate": 4.996366422466453e-05, + "loss": 6.1588, + "step": 2887 + }, + { + "epoch": 0.01717575411552003, + "grad_norm": 3.1748130321502686, + "learning_rate": 4.996363904569128e-05, + "loss": 6.3607, + "step": 2888 + }, + { + "epoch": 0.017181701398801028, + "grad_norm": 3.300736427307129, + "learning_rate": 4.996361385800348e-05, + "loss": 6.0709, + "step": 2889 + }, + { + "epoch": 0.017187648682082026, + "grad_norm": 2.720550060272217, + "learning_rate": 4.9963588661601136e-05, + "loss": 6.0496, + "step": 2890 + }, + { + "epoch": 0.01719359596536302, + "grad_norm": 2.251845121383667, + "learning_rate": 4.9963563456484266e-05, + "loss": 6.0088, + "step": 2891 + }, + { + "epoch": 0.01719954324864402, + "grad_norm": 2.7863035202026367, + "learning_rate": 4.996353824265288e-05, + "loss": 5.9478, + "step": 2892 + }, + { + "epoch": 0.01720549053192502, + "grad_norm": 2.831744432449341, + "learning_rate": 4.996351302010697e-05, + "loss": 6.1629, + "step": 2893 + }, + { + "epoch": 0.017211437815206013, + "grad_norm": 4.583891868591309, + "learning_rate": 4.9963487788846556e-05, + "loss": 6.7936, + "step": 2894 + }, + { + "epoch": 0.017217385098487012, + "grad_norm": 2.4525468349456787, + "learning_rate": 4.996346254887165e-05, + "loss": 6.3188, + "step": 2895 + }, + { + "epoch": 0.017223332381768007, + "grad_norm": 3.0866281986236572, + "learning_rate": 4.9963437300182254e-05, + "loss": 6.0207, + "step": 2896 + }, + { + "epoch": 0.017229279665049006, + "grad_norm": 3.1188113689422607, + "learning_rate": 4.996341204277838e-05, + "loss": 5.9873, + "step": 2897 + }, + { + "epoch": 0.017235226948330004, + "grad_norm": 2.4119350910186768, + "learning_rate": 4.996338677666004e-05, + "loss": 5.8104, + "step": 2898 + }, + { + "epoch": 0.017241174231611, + "grad_norm": 1.9601647853851318, + "learning_rate": 4.996336150182724e-05, + "loss": 6.2166, + "step": 2899 + }, + { + "epoch": 0.017247121514891998, + "grad_norm": 3.428379535675049, + "learning_rate": 4.9963336218279986e-05, + "loss": 6.4284, + "step": 2900 + }, + { + "epoch": 0.017253068798172993, + "grad_norm": 2.629446506500244, + "learning_rate": 4.996331092601829e-05, + "loss": 6.4916, + "step": 2901 + }, + { + "epoch": 0.01725901608145399, + "grad_norm": 2.3860316276550293, + "learning_rate": 4.996328562504216e-05, + "loss": 6.5035, + "step": 2902 + }, + { + "epoch": 0.01726496336473499, + "grad_norm": 2.6754682064056396, + "learning_rate": 4.996326031535161e-05, + "loss": 6.6374, + "step": 2903 + }, + { + "epoch": 0.017270910648015985, + "grad_norm": 2.737901210784912, + "learning_rate": 4.9963234996946635e-05, + "loss": 6.5023, + "step": 2904 + }, + { + "epoch": 0.017276857931296984, + "grad_norm": 2.481691837310791, + "learning_rate": 4.996320966982726e-05, + "loss": 6.5211, + "step": 2905 + }, + { + "epoch": 0.017282805214577982, + "grad_norm": 3.3993568420410156, + "learning_rate": 4.996318433399348e-05, + "loss": 6.4239, + "step": 2906 + }, + { + "epoch": 0.017288752497858977, + "grad_norm": 3.9149057865142822, + "learning_rate": 4.9963158989445316e-05, + "loss": 6.3874, + "step": 2907 + }, + { + "epoch": 0.017294699781139976, + "grad_norm": 2.3808562755584717, + "learning_rate": 4.996313363618276e-05, + "loss": 6.2887, + "step": 2908 + }, + { + "epoch": 0.01730064706442097, + "grad_norm": 2.6186649799346924, + "learning_rate": 4.996310827420585e-05, + "loss": 6.2944, + "step": 2909 + }, + { + "epoch": 0.01730659434770197, + "grad_norm": 2.5251142978668213, + "learning_rate": 4.9963082903514554e-05, + "loss": 6.0944, + "step": 2910 + }, + { + "epoch": 0.017312541630982968, + "grad_norm": 2.8212270736694336, + "learning_rate": 4.9963057524108926e-05, + "loss": 6.6621, + "step": 2911 + }, + { + "epoch": 0.017318488914263963, + "grad_norm": 2.477485418319702, + "learning_rate": 4.996303213598894e-05, + "loss": 6.3941, + "step": 2912 + }, + { + "epoch": 0.01732443619754496, + "grad_norm": 3.6508305072784424, + "learning_rate": 4.996300673915462e-05, + "loss": 6.3234, + "step": 2913 + }, + { + "epoch": 0.01733038348082596, + "grad_norm": 2.1635468006134033, + "learning_rate": 4.996298133360598e-05, + "loss": 6.2877, + "step": 2914 + }, + { + "epoch": 0.017336330764106955, + "grad_norm": 3.431082010269165, + "learning_rate": 4.9962955919343004e-05, + "loss": 6.2627, + "step": 2915 + }, + { + "epoch": 0.017342278047387954, + "grad_norm": 3.272376775741577, + "learning_rate": 4.9962930496365736e-05, + "loss": 6.1458, + "step": 2916 + }, + { + "epoch": 0.01734822533066895, + "grad_norm": 3.5927000045776367, + "learning_rate": 4.996290506467415e-05, + "loss": 5.9828, + "step": 2917 + }, + { + "epoch": 0.017354172613949947, + "grad_norm": 3.569641351699829, + "learning_rate": 4.996287962426829e-05, + "loss": 6.5957, + "step": 2918 + }, + { + "epoch": 0.017360119897230946, + "grad_norm": 3.281855344772339, + "learning_rate": 4.9962854175148134e-05, + "loss": 6.3393, + "step": 2919 + }, + { + "epoch": 0.01736606718051194, + "grad_norm": 2.6009061336517334, + "learning_rate": 4.9962828717313706e-05, + "loss": 6.3537, + "step": 2920 + }, + { + "epoch": 0.01737201446379294, + "grad_norm": 3.964467763900757, + "learning_rate": 4.996280325076501e-05, + "loss": 6.0281, + "step": 2921 + }, + { + "epoch": 0.017377961747073938, + "grad_norm": 3.9164865016937256, + "learning_rate": 4.9962777775502064e-05, + "loss": 6.5255, + "step": 2922 + }, + { + "epoch": 0.017383909030354933, + "grad_norm": 2.349709987640381, + "learning_rate": 4.996275229152486e-05, + "loss": 6.2459, + "step": 2923 + }, + { + "epoch": 0.017389856313635932, + "grad_norm": 2.5735161304473877, + "learning_rate": 4.9962726798833425e-05, + "loss": 6.0463, + "step": 2924 + }, + { + "epoch": 0.017395803596916927, + "grad_norm": 2.228271961212158, + "learning_rate": 4.9962701297427764e-05, + "loss": 6.1147, + "step": 2925 + }, + { + "epoch": 0.017401750880197926, + "grad_norm": 2.4587175846099854, + "learning_rate": 4.9962675787307875e-05, + "loss": 7.0868, + "step": 2926 + }, + { + "epoch": 0.017407698163478924, + "grad_norm": 2.2712674140930176, + "learning_rate": 4.996265026847378e-05, + "loss": 6.175, + "step": 2927 + }, + { + "epoch": 0.01741364544675992, + "grad_norm": 3.0724384784698486, + "learning_rate": 4.996262474092547e-05, + "loss": 6.5354, + "step": 2928 + }, + { + "epoch": 0.017419592730040918, + "grad_norm": 4.872220039367676, + "learning_rate": 4.996259920466297e-05, + "loss": 6.1938, + "step": 2929 + }, + { + "epoch": 0.017425540013321916, + "grad_norm": 4.508706569671631, + "learning_rate": 4.996257365968629e-05, + "loss": 6.1813, + "step": 2930 + }, + { + "epoch": 0.01743148729660291, + "grad_norm": 3.0419485569000244, + "learning_rate": 4.996254810599543e-05, + "loss": 5.9529, + "step": 2931 + }, + { + "epoch": 0.01743743457988391, + "grad_norm": 2.8372066020965576, + "learning_rate": 4.996252254359041e-05, + "loss": 5.9422, + "step": 2932 + }, + { + "epoch": 0.017443381863164905, + "grad_norm": 4.554285526275635, + "learning_rate": 4.996249697247122e-05, + "loss": 6.9073, + "step": 2933 + }, + { + "epoch": 0.017449329146445904, + "grad_norm": 3.121094226837158, + "learning_rate": 4.996247139263788e-05, + "loss": 6.2827, + "step": 2934 + }, + { + "epoch": 0.017455276429726902, + "grad_norm": 3.936596632003784, + "learning_rate": 4.996244580409041e-05, + "loss": 6.7863, + "step": 2935 + }, + { + "epoch": 0.017461223713007897, + "grad_norm": 3.5771539211273193, + "learning_rate": 4.99624202068288e-05, + "loss": 7.0691, + "step": 2936 + }, + { + "epoch": 0.017467170996288896, + "grad_norm": 2.0674471855163574, + "learning_rate": 4.996239460085307e-05, + "loss": 6.9768, + "step": 2937 + }, + { + "epoch": 0.01747311827956989, + "grad_norm": 2.600167989730835, + "learning_rate": 4.996236898616322e-05, + "loss": 6.4235, + "step": 2938 + }, + { + "epoch": 0.01747906556285089, + "grad_norm": 2.9444847106933594, + "learning_rate": 4.9962343362759267e-05, + "loss": 6.7305, + "step": 2939 + }, + { + "epoch": 0.017485012846131888, + "grad_norm": 3.721101999282837, + "learning_rate": 4.996231773064122e-05, + "loss": 6.5147, + "step": 2940 + }, + { + "epoch": 0.017490960129412883, + "grad_norm": 5.715269565582275, + "learning_rate": 4.9962292089809086e-05, + "loss": 6.1433, + "step": 2941 + }, + { + "epoch": 0.01749690741269388, + "grad_norm": 4.245530128479004, + "learning_rate": 4.996226644026287e-05, + "loss": 6.2163, + "step": 2942 + }, + { + "epoch": 0.01750285469597488, + "grad_norm": 2.7717039585113525, + "learning_rate": 4.996224078200259e-05, + "loss": 5.877, + "step": 2943 + }, + { + "epoch": 0.017508801979255875, + "grad_norm": 3.4189441204071045, + "learning_rate": 4.9962215115028255e-05, + "loss": 5.9575, + "step": 2944 + }, + { + "epoch": 0.017514749262536874, + "grad_norm": 3.754513740539551, + "learning_rate": 4.996218943933986e-05, + "loss": 5.7512, + "step": 2945 + }, + { + "epoch": 0.01752069654581787, + "grad_norm": 3.4231228828430176, + "learning_rate": 4.9962163754937426e-05, + "loss": 6.4566, + "step": 2946 + }, + { + "epoch": 0.017526643829098867, + "grad_norm": 2.7481472492218018, + "learning_rate": 4.996213806182095e-05, + "loss": 6.1385, + "step": 2947 + }, + { + "epoch": 0.017532591112379866, + "grad_norm": 2.802342414855957, + "learning_rate": 4.996211235999046e-05, + "loss": 5.6656, + "step": 2948 + }, + { + "epoch": 0.01753853839566086, + "grad_norm": 2.60530686378479, + "learning_rate": 4.996208664944595e-05, + "loss": 5.7339, + "step": 2949 + }, + { + "epoch": 0.01754448567894186, + "grad_norm": 2.476100206375122, + "learning_rate": 4.996206093018744e-05, + "loss": 6.0447, + "step": 2950 + }, + { + "epoch": 0.017550432962222858, + "grad_norm": 2.3516924381256104, + "learning_rate": 4.9962035202214916e-05, + "loss": 6.2046, + "step": 2951 + }, + { + "epoch": 0.017556380245503853, + "grad_norm": 2.447519302368164, + "learning_rate": 4.996200946552842e-05, + "loss": 6.0279, + "step": 2952 + }, + { + "epoch": 0.017562327528784852, + "grad_norm": 2.679766893386841, + "learning_rate": 4.996198372012794e-05, + "loss": 5.9072, + "step": 2953 + }, + { + "epoch": 0.017568274812065847, + "grad_norm": 2.3413944244384766, + "learning_rate": 4.9961957966013486e-05, + "loss": 5.9214, + "step": 2954 + }, + { + "epoch": 0.017574222095346845, + "grad_norm": 2.273725986480713, + "learning_rate": 4.996193220318507e-05, + "loss": 6.2107, + "step": 2955 + }, + { + "epoch": 0.017580169378627844, + "grad_norm": 2.9424052238464355, + "learning_rate": 4.99619064316427e-05, + "loss": 5.8618, + "step": 2956 + }, + { + "epoch": 0.01758611666190884, + "grad_norm": 2.40987229347229, + "learning_rate": 4.9961880651386394e-05, + "loss": 6.1306, + "step": 2957 + }, + { + "epoch": 0.017592063945189838, + "grad_norm": 2.542084217071533, + "learning_rate": 4.9961854862416144e-05, + "loss": 6.2225, + "step": 2958 + }, + { + "epoch": 0.017598011228470836, + "grad_norm": 2.06935977935791, + "learning_rate": 4.996182906473198e-05, + "loss": 5.9899, + "step": 2959 + }, + { + "epoch": 0.01760395851175183, + "grad_norm": 2.1998584270477295, + "learning_rate": 4.99618032583339e-05, + "loss": 6.2268, + "step": 2960 + }, + { + "epoch": 0.01760990579503283, + "grad_norm": 2.5595617294311523, + "learning_rate": 4.99617774432219e-05, + "loss": 6.2856, + "step": 2961 + }, + { + "epoch": 0.017615853078313825, + "grad_norm": 2.9262382984161377, + "learning_rate": 4.9961751619396e-05, + "loss": 6.2747, + "step": 2962 + }, + { + "epoch": 0.017621800361594823, + "grad_norm": 2.3705809116363525, + "learning_rate": 4.996172578685622e-05, + "loss": 6.1376, + "step": 2963 + }, + { + "epoch": 0.017627747644875822, + "grad_norm": 2.20991849899292, + "learning_rate": 4.996169994560256e-05, + "loss": 6.0118, + "step": 2964 + }, + { + "epoch": 0.017633694928156817, + "grad_norm": 2.2801706790924072, + "learning_rate": 4.996167409563502e-05, + "loss": 6.0924, + "step": 2965 + }, + { + "epoch": 0.017639642211437816, + "grad_norm": 2.5618062019348145, + "learning_rate": 4.996164823695362e-05, + "loss": 6.0931, + "step": 2966 + }, + { + "epoch": 0.01764558949471881, + "grad_norm": 2.2933573722839355, + "learning_rate": 4.996162236955837e-05, + "loss": 6.1584, + "step": 2967 + }, + { + "epoch": 0.01765153677799981, + "grad_norm": 2.2387471199035645, + "learning_rate": 4.996159649344928e-05, + "loss": 6.1224, + "step": 2968 + }, + { + "epoch": 0.017657484061280808, + "grad_norm": 2.425929069519043, + "learning_rate": 4.9961570608626347e-05, + "loss": 6.2419, + "step": 2969 + }, + { + "epoch": 0.017663431344561803, + "grad_norm": 3.0279812812805176, + "learning_rate": 4.996154471508959e-05, + "loss": 6.0478, + "step": 2970 + }, + { + "epoch": 0.0176693786278428, + "grad_norm": 2.8950276374816895, + "learning_rate": 4.9961518812839015e-05, + "loss": 5.9663, + "step": 2971 + }, + { + "epoch": 0.0176753259111238, + "grad_norm": 2.9908859729766846, + "learning_rate": 4.996149290187463e-05, + "loss": 5.8101, + "step": 2972 + }, + { + "epoch": 0.017681273194404795, + "grad_norm": 2.900987148284912, + "learning_rate": 4.996146698219645e-05, + "loss": 6.133, + "step": 2973 + }, + { + "epoch": 0.017687220477685794, + "grad_norm": 3.3194754123687744, + "learning_rate": 4.996144105380447e-05, + "loss": 5.9763, + "step": 2974 + }, + { + "epoch": 0.01769316776096679, + "grad_norm": 2.4997923374176025, + "learning_rate": 4.996141511669872e-05, + "loss": 6.1062, + "step": 2975 + }, + { + "epoch": 0.017699115044247787, + "grad_norm": 2.3048369884490967, + "learning_rate": 4.996138917087919e-05, + "loss": 6.138, + "step": 2976 + }, + { + "epoch": 0.017705062327528786, + "grad_norm": 2.3391027450561523, + "learning_rate": 4.99613632163459e-05, + "loss": 6.0612, + "step": 2977 + }, + { + "epoch": 0.01771100961080978, + "grad_norm": 2.6164605617523193, + "learning_rate": 4.996133725309886e-05, + "loss": 6.0402, + "step": 2978 + }, + { + "epoch": 0.01771695689409078, + "grad_norm": 2.6534295082092285, + "learning_rate": 4.996131128113807e-05, + "loss": 5.9027, + "step": 2979 + }, + { + "epoch": 0.017722904177371778, + "grad_norm": 2.1807172298431396, + "learning_rate": 4.996128530046354e-05, + "loss": 5.7083, + "step": 2980 + }, + { + "epoch": 0.017728851460652773, + "grad_norm": 2.433762550354004, + "learning_rate": 4.9961259311075296e-05, + "loss": 6.1587, + "step": 2981 + }, + { + "epoch": 0.017734798743933772, + "grad_norm": 2.4656107425689697, + "learning_rate": 4.996123331297333e-05, + "loss": 5.9831, + "step": 2982 + }, + { + "epoch": 0.017740746027214767, + "grad_norm": 2.536060333251953, + "learning_rate": 4.996120730615765e-05, + "loss": 5.9083, + "step": 2983 + }, + { + "epoch": 0.017746693310495765, + "grad_norm": 2.2993409633636475, + "learning_rate": 4.996118129062828e-05, + "loss": 6.0156, + "step": 2984 + }, + { + "epoch": 0.017752640593776764, + "grad_norm": 2.0221481323242188, + "learning_rate": 4.996115526638521e-05, + "loss": 5.9836, + "step": 2985 + }, + { + "epoch": 0.01775858787705776, + "grad_norm": 2.401350498199463, + "learning_rate": 4.996112923342846e-05, + "loss": 5.8071, + "step": 2986 + }, + { + "epoch": 0.017764535160338758, + "grad_norm": 2.469214677810669, + "learning_rate": 4.996110319175804e-05, + "loss": 5.8784, + "step": 2987 + }, + { + "epoch": 0.017770482443619756, + "grad_norm": 2.454481601715088, + "learning_rate": 4.9961077141373955e-05, + "loss": 5.9168, + "step": 2988 + }, + { + "epoch": 0.01777642972690075, + "grad_norm": 2.3173487186431885, + "learning_rate": 4.996105108227621e-05, + "loss": 5.8797, + "step": 2989 + }, + { + "epoch": 0.01778237701018175, + "grad_norm": 2.1967554092407227, + "learning_rate": 4.996102501446483e-05, + "loss": 5.972, + "step": 2990 + }, + { + "epoch": 0.017788324293462745, + "grad_norm": 2.1263201236724854, + "learning_rate": 4.996099893793981e-05, + "loss": 5.9301, + "step": 2991 + }, + { + "epoch": 0.017794271576743743, + "grad_norm": 2.1959195137023926, + "learning_rate": 4.9960972852701165e-05, + "loss": 6.0422, + "step": 2992 + }, + { + "epoch": 0.017800218860024742, + "grad_norm": 2.3290374279022217, + "learning_rate": 4.99609467587489e-05, + "loss": 6.1926, + "step": 2993 + }, + { + "epoch": 0.017806166143305737, + "grad_norm": 2.3518059253692627, + "learning_rate": 4.996092065608303e-05, + "loss": 5.8583, + "step": 2994 + }, + { + "epoch": 0.017812113426586736, + "grad_norm": 2.4263339042663574, + "learning_rate": 4.996089454470355e-05, + "loss": 5.8149, + "step": 2995 + }, + { + "epoch": 0.01781806070986773, + "grad_norm": 2.0764389038085938, + "learning_rate": 4.99608684246105e-05, + "loss": 5.8782, + "step": 2996 + }, + { + "epoch": 0.01782400799314873, + "grad_norm": 2.086904764175415, + "learning_rate": 4.996084229580385e-05, + "loss": 5.7885, + "step": 2997 + }, + { + "epoch": 0.017829955276429728, + "grad_norm": 2.1907291412353516, + "learning_rate": 4.996081615828363e-05, + "loss": 5.9246, + "step": 2998 + }, + { + "epoch": 0.017835902559710723, + "grad_norm": 2.4596495628356934, + "learning_rate": 4.9960790012049854e-05, + "loss": 5.7786, + "step": 2999 + }, + { + "epoch": 0.01784184984299172, + "grad_norm": 2.0762453079223633, + "learning_rate": 4.996076385710252e-05, + "loss": 5.9901, + "step": 3000 + }, + { + "epoch": 0.01784779712627272, + "grad_norm": 2.068714141845703, + "learning_rate": 4.996073769344164e-05, + "loss": 5.9437, + "step": 3001 + }, + { + "epoch": 0.017853744409553715, + "grad_norm": 2.4760496616363525, + "learning_rate": 4.9960711521067226e-05, + "loss": 5.8633, + "step": 3002 + }, + { + "epoch": 0.017859691692834714, + "grad_norm": 2.395643949508667, + "learning_rate": 4.996068533997928e-05, + "loss": 5.8024, + "step": 3003 + }, + { + "epoch": 0.01786563897611571, + "grad_norm": 2.120586633682251, + "learning_rate": 4.996065915017783e-05, + "loss": 6.0712, + "step": 3004 + }, + { + "epoch": 0.017871586259396707, + "grad_norm": 2.384794235229492, + "learning_rate": 4.9960632951662866e-05, + "loss": 5.9089, + "step": 3005 + }, + { + "epoch": 0.017877533542677706, + "grad_norm": 2.24297833442688, + "learning_rate": 4.99606067444344e-05, + "loss": 6.0263, + "step": 3006 + }, + { + "epoch": 0.0178834808259587, + "grad_norm": 1.983299732208252, + "learning_rate": 4.996058052849245e-05, + "loss": 5.8706, + "step": 3007 + }, + { + "epoch": 0.0178894281092397, + "grad_norm": 2.2866950035095215, + "learning_rate": 4.996055430383701e-05, + "loss": 5.9031, + "step": 3008 + }, + { + "epoch": 0.017895375392520698, + "grad_norm": 2.3343560695648193, + "learning_rate": 4.996052807046811e-05, + "loss": 5.9155, + "step": 3009 + }, + { + "epoch": 0.017901322675801693, + "grad_norm": 2.079763650894165, + "learning_rate": 4.9960501828385734e-05, + "loss": 5.8102, + "step": 3010 + }, + { + "epoch": 0.01790726995908269, + "grad_norm": 2.0398895740509033, + "learning_rate": 4.996047557758991e-05, + "loss": 5.773, + "step": 3011 + }, + { + "epoch": 0.017913217242363687, + "grad_norm": 2.2478318214416504, + "learning_rate": 4.996044931808064e-05, + "loss": 5.8584, + "step": 3012 + }, + { + "epoch": 0.017919164525644685, + "grad_norm": 2.301398992538452, + "learning_rate": 4.996042304985794e-05, + "loss": 5.9053, + "step": 3013 + }, + { + "epoch": 0.017925111808925684, + "grad_norm": 2.0428216457366943, + "learning_rate": 4.996039677292181e-05, + "loss": 5.9571, + "step": 3014 + }, + { + "epoch": 0.01793105909220668, + "grad_norm": 2.049572467803955, + "learning_rate": 4.9960370487272266e-05, + "loss": 5.9464, + "step": 3015 + }, + { + "epoch": 0.017937006375487678, + "grad_norm": 2.1681618690490723, + "learning_rate": 4.996034419290931e-05, + "loss": 5.9969, + "step": 3016 + }, + { + "epoch": 0.017942953658768676, + "grad_norm": 2.3879425525665283, + "learning_rate": 4.996031788983296e-05, + "loss": 5.7962, + "step": 3017 + }, + { + "epoch": 0.01794890094204967, + "grad_norm": 2.232508420944214, + "learning_rate": 4.996029157804323e-05, + "loss": 5.8479, + "step": 3018 + }, + { + "epoch": 0.01795484822533067, + "grad_norm": 2.222257137298584, + "learning_rate": 4.9960265257540104e-05, + "loss": 5.952, + "step": 3019 + }, + { + "epoch": 0.017960795508611665, + "grad_norm": 2.213777542114258, + "learning_rate": 4.996023892832362e-05, + "loss": 5.9891, + "step": 3020 + }, + { + "epoch": 0.017966742791892663, + "grad_norm": 2.286097764968872, + "learning_rate": 4.996021259039377e-05, + "loss": 5.8995, + "step": 3021 + }, + { + "epoch": 0.017972690075173662, + "grad_norm": 2.1588432788848877, + "learning_rate": 4.996018624375056e-05, + "loss": 5.988, + "step": 3022 + }, + { + "epoch": 0.017978637358454657, + "grad_norm": 2.2468602657318115, + "learning_rate": 4.996015988839402e-05, + "loss": 5.9303, + "step": 3023 + }, + { + "epoch": 0.017984584641735656, + "grad_norm": 2.1732120513916016, + "learning_rate": 4.9960133524324135e-05, + "loss": 5.8696, + "step": 3024 + }, + { + "epoch": 0.01799053192501665, + "grad_norm": 2.2985105514526367, + "learning_rate": 4.996010715154093e-05, + "loss": 5.9251, + "step": 3025 + }, + { + "epoch": 0.01799647920829765, + "grad_norm": 2.1920788288116455, + "learning_rate": 4.996008077004441e-05, + "loss": 5.8023, + "step": 3026 + }, + { + "epoch": 0.018002426491578648, + "grad_norm": 1.9393725395202637, + "learning_rate": 4.996005437983458e-05, + "loss": 5.9576, + "step": 3027 + }, + { + "epoch": 0.018008373774859643, + "grad_norm": 2.115035057067871, + "learning_rate": 4.9960027980911455e-05, + "loss": 5.9105, + "step": 3028 + }, + { + "epoch": 0.01801432105814064, + "grad_norm": 2.143432855606079, + "learning_rate": 4.996000157327504e-05, + "loss": 5.9951, + "step": 3029 + }, + { + "epoch": 0.01802026834142164, + "grad_norm": 2.4353296756744385, + "learning_rate": 4.995997515692536e-05, + "loss": 5.9761, + "step": 3030 + }, + { + "epoch": 0.018026215624702635, + "grad_norm": 1.999054193496704, + "learning_rate": 4.995994873186239e-05, + "loss": 6.028, + "step": 3031 + }, + { + "epoch": 0.018032162907983634, + "grad_norm": 2.05645751953125, + "learning_rate": 4.995992229808617e-05, + "loss": 5.9778, + "step": 3032 + }, + { + "epoch": 0.01803811019126463, + "grad_norm": 1.948923110961914, + "learning_rate": 4.99598958555967e-05, + "loss": 5.8735, + "step": 3033 + }, + { + "epoch": 0.018044057474545627, + "grad_norm": 2.1208486557006836, + "learning_rate": 4.995986940439399e-05, + "loss": 5.7913, + "step": 3034 + }, + { + "epoch": 0.018050004757826626, + "grad_norm": 2.051079750061035, + "learning_rate": 4.995984294447804e-05, + "loss": 5.8097, + "step": 3035 + }, + { + "epoch": 0.01805595204110762, + "grad_norm": 2.021207571029663, + "learning_rate": 4.995981647584887e-05, + "loss": 5.8425, + "step": 3036 + }, + { + "epoch": 0.01806189932438862, + "grad_norm": 2.471315622329712, + "learning_rate": 4.995978999850649e-05, + "loss": 5.7735, + "step": 3037 + }, + { + "epoch": 0.018067846607669618, + "grad_norm": 2.604836940765381, + "learning_rate": 4.9959763512450896e-05, + "loss": 6.4525, + "step": 3038 + }, + { + "epoch": 0.018073793890950613, + "grad_norm": 2.375361919403076, + "learning_rate": 4.995973701768212e-05, + "loss": 5.8072, + "step": 3039 + }, + { + "epoch": 0.01807974117423161, + "grad_norm": 2.354280471801758, + "learning_rate": 4.995971051420014e-05, + "loss": 5.9434, + "step": 3040 + }, + { + "epoch": 0.018085688457512607, + "grad_norm": 2.7335755825042725, + "learning_rate": 4.9959684002005e-05, + "loss": 5.5899, + "step": 3041 + }, + { + "epoch": 0.018091635740793605, + "grad_norm": 2.244917869567871, + "learning_rate": 4.995965748109668e-05, + "loss": 5.799, + "step": 3042 + }, + { + "epoch": 0.018097583024074604, + "grad_norm": 2.2413697242736816, + "learning_rate": 4.995963095147521e-05, + "loss": 5.8635, + "step": 3043 + }, + { + "epoch": 0.0181035303073556, + "grad_norm": 2.122586488723755, + "learning_rate": 4.9959604413140584e-05, + "loss": 5.8098, + "step": 3044 + }, + { + "epoch": 0.018109477590636597, + "grad_norm": 2.407517910003662, + "learning_rate": 4.995957786609282e-05, + "loss": 6.0319, + "step": 3045 + }, + { + "epoch": 0.018115424873917596, + "grad_norm": 2.5628743171691895, + "learning_rate": 4.9959551310331934e-05, + "loss": 5.9561, + "step": 3046 + }, + { + "epoch": 0.01812137215719859, + "grad_norm": 2.335650682449341, + "learning_rate": 4.995952474585791e-05, + "loss": 6.1168, + "step": 3047 + }, + { + "epoch": 0.01812731944047959, + "grad_norm": 2.169771432876587, + "learning_rate": 4.995949817267078e-05, + "loss": 6.0555, + "step": 3048 + }, + { + "epoch": 0.018133266723760585, + "grad_norm": 2.2245211601257324, + "learning_rate": 4.995947159077056e-05, + "loss": 5.9084, + "step": 3049 + }, + { + "epoch": 0.018139214007041583, + "grad_norm": 2.2296931743621826, + "learning_rate": 4.995944500015723e-05, + "loss": 5.8878, + "step": 3050 + }, + { + "epoch": 0.018145161290322582, + "grad_norm": 2.2372493743896484, + "learning_rate": 4.995941840083082e-05, + "loss": 5.9521, + "step": 3051 + }, + { + "epoch": 0.018151108573603577, + "grad_norm": 2.1773006916046143, + "learning_rate": 4.995939179279134e-05, + "loss": 5.899, + "step": 3052 + }, + { + "epoch": 0.018157055856884576, + "grad_norm": 2.218245267868042, + "learning_rate": 4.995936517603879e-05, + "loss": 6.0311, + "step": 3053 + }, + { + "epoch": 0.018163003140165574, + "grad_norm": 2.2877273559570312, + "learning_rate": 4.995933855057318e-05, + "loss": 6.0052, + "step": 3054 + }, + { + "epoch": 0.01816895042344657, + "grad_norm": 2.225764751434326, + "learning_rate": 4.995931191639453e-05, + "loss": 6.0373, + "step": 3055 + }, + { + "epoch": 0.018174897706727568, + "grad_norm": 2.5069313049316406, + "learning_rate": 4.995928527350284e-05, + "loss": 5.8729, + "step": 3056 + }, + { + "epoch": 0.018180844990008563, + "grad_norm": 2.089759588241577, + "learning_rate": 4.995925862189812e-05, + "loss": 5.9462, + "step": 3057 + }, + { + "epoch": 0.01818679227328956, + "grad_norm": 2.0159049034118652, + "learning_rate": 4.9959231961580376e-05, + "loss": 5.9276, + "step": 3058 + }, + { + "epoch": 0.01819273955657056, + "grad_norm": 2.207636594772339, + "learning_rate": 4.995920529254963e-05, + "loss": 5.9921, + "step": 3059 + }, + { + "epoch": 0.018198686839851555, + "grad_norm": 2.380232810974121, + "learning_rate": 4.995917861480588e-05, + "loss": 5.9092, + "step": 3060 + }, + { + "epoch": 0.018204634123132554, + "grad_norm": 2.073237895965576, + "learning_rate": 4.9959151928349134e-05, + "loss": 5.8472, + "step": 3061 + }, + { + "epoch": 0.01821058140641355, + "grad_norm": 1.824062705039978, + "learning_rate": 4.995912523317942e-05, + "loss": 5.7958, + "step": 3062 + }, + { + "epoch": 0.018216528689694547, + "grad_norm": 2.3961215019226074, + "learning_rate": 4.995909852929672e-05, + "loss": 6.1388, + "step": 3063 + }, + { + "epoch": 0.018222475972975546, + "grad_norm": 2.8391239643096924, + "learning_rate": 4.9959071816701065e-05, + "loss": 5.7564, + "step": 3064 + }, + { + "epoch": 0.01822842325625654, + "grad_norm": 2.4684112071990967, + "learning_rate": 4.995904509539244e-05, + "loss": 5.8372, + "step": 3065 + }, + { + "epoch": 0.01823437053953754, + "grad_norm": 2.419983386993408, + "learning_rate": 4.995901836537089e-05, + "loss": 5.9332, + "step": 3066 + }, + { + "epoch": 0.018240317822818538, + "grad_norm": 2.500227928161621, + "learning_rate": 4.99589916266364e-05, + "loss": 6.0848, + "step": 3067 + }, + { + "epoch": 0.018246265106099533, + "grad_norm": 2.1683971881866455, + "learning_rate": 4.9958964879188976e-05, + "loss": 6.0911, + "step": 3068 + }, + { + "epoch": 0.01825221238938053, + "grad_norm": 2.2345223426818848, + "learning_rate": 4.995893812302864e-05, + "loss": 6.016, + "step": 3069 + }, + { + "epoch": 0.018258159672661527, + "grad_norm": 2.318321466445923, + "learning_rate": 4.995891135815539e-05, + "loss": 5.9622, + "step": 3070 + }, + { + "epoch": 0.018264106955942525, + "grad_norm": 2.294602155685425, + "learning_rate": 4.9958884584569255e-05, + "loss": 5.8908, + "step": 3071 + }, + { + "epoch": 0.018270054239223524, + "grad_norm": 2.5472419261932373, + "learning_rate": 4.995885780227022e-05, + "loss": 5.7906, + "step": 3072 + }, + { + "epoch": 0.01827600152250452, + "grad_norm": 2.319101095199585, + "learning_rate": 4.995883101125831e-05, + "loss": 6.3366, + "step": 3073 + }, + { + "epoch": 0.018281948805785517, + "grad_norm": 2.3564186096191406, + "learning_rate": 4.995880421153353e-05, + "loss": 5.9863, + "step": 3074 + }, + { + "epoch": 0.018287896089066516, + "grad_norm": 2.434756278991699, + "learning_rate": 4.995877740309589e-05, + "loss": 5.885, + "step": 3075 + }, + { + "epoch": 0.01829384337234751, + "grad_norm": 2.062861442565918, + "learning_rate": 4.99587505859454e-05, + "loss": 6.0813, + "step": 3076 + }, + { + "epoch": 0.01829979065562851, + "grad_norm": 2.127049684524536, + "learning_rate": 4.995872376008206e-05, + "loss": 6.1226, + "step": 3077 + }, + { + "epoch": 0.018305737938909505, + "grad_norm": 2.288405656814575, + "learning_rate": 4.995869692550589e-05, + "loss": 5.9625, + "step": 3078 + }, + { + "epoch": 0.018311685222190503, + "grad_norm": 2.2387006282806396, + "learning_rate": 4.9958670082216905e-05, + "loss": 5.9479, + "step": 3079 + }, + { + "epoch": 0.018317632505471502, + "grad_norm": 2.18864107131958, + "learning_rate": 4.9958643230215096e-05, + "loss": 5.9223, + "step": 3080 + }, + { + "epoch": 0.018323579788752497, + "grad_norm": 2.3457415103912354, + "learning_rate": 4.995861636950049e-05, + "loss": 5.7857, + "step": 3081 + }, + { + "epoch": 0.018329527072033495, + "grad_norm": 2.6946494579315186, + "learning_rate": 4.995858950007309e-05, + "loss": 5.5546, + "step": 3082 + }, + { + "epoch": 0.018335474355314494, + "grad_norm": 2.5135412216186523, + "learning_rate": 4.99585626219329e-05, + "loss": 5.5624, + "step": 3083 + }, + { + "epoch": 0.01834142163859549, + "grad_norm": 2.6617767810821533, + "learning_rate": 4.9958535735079934e-05, + "loss": 5.8789, + "step": 3084 + }, + { + "epoch": 0.018347368921876488, + "grad_norm": 2.099261522293091, + "learning_rate": 4.9958508839514196e-05, + "loss": 5.9365, + "step": 3085 + }, + { + "epoch": 0.018353316205157483, + "grad_norm": 2.5267064571380615, + "learning_rate": 4.9958481935235715e-05, + "loss": 6.0935, + "step": 3086 + }, + { + "epoch": 0.01835926348843848, + "grad_norm": 2.3353283405303955, + "learning_rate": 4.995845502224447e-05, + "loss": 5.909, + "step": 3087 + }, + { + "epoch": 0.01836521077171948, + "grad_norm": 2.396430492401123, + "learning_rate": 4.9958428100540496e-05, + "loss": 6.0272, + "step": 3088 + }, + { + "epoch": 0.018371158055000475, + "grad_norm": 2.095308303833008, + "learning_rate": 4.9958401170123784e-05, + "loss": 5.9791, + "step": 3089 + }, + { + "epoch": 0.018377105338281473, + "grad_norm": 2.7606077194213867, + "learning_rate": 4.9958374230994357e-05, + "loss": 5.9716, + "step": 3090 + }, + { + "epoch": 0.01838305262156247, + "grad_norm": 2.4490914344787598, + "learning_rate": 4.995834728315222e-05, + "loss": 5.8763, + "step": 3091 + }, + { + "epoch": 0.018388999904843467, + "grad_norm": 2.709092855453491, + "learning_rate": 4.9958320326597385e-05, + "loss": 5.74, + "step": 3092 + }, + { + "epoch": 0.018394947188124466, + "grad_norm": 2.8829305171966553, + "learning_rate": 4.9958293361329856e-05, + "loss": 5.8469, + "step": 3093 + }, + { + "epoch": 0.01840089447140546, + "grad_norm": 2.6500396728515625, + "learning_rate": 4.995826638734964e-05, + "loss": 5.8578, + "step": 3094 + }, + { + "epoch": 0.01840684175468646, + "grad_norm": 2.0665056705474854, + "learning_rate": 4.9958239404656755e-05, + "loss": 5.9662, + "step": 3095 + }, + { + "epoch": 0.018412789037967458, + "grad_norm": 2.3198931217193604, + "learning_rate": 4.9958212413251205e-05, + "loss": 6.0663, + "step": 3096 + }, + { + "epoch": 0.018418736321248453, + "grad_norm": 2.9056031703948975, + "learning_rate": 4.9958185413133e-05, + "loss": 5.8015, + "step": 3097 + }, + { + "epoch": 0.01842468360452945, + "grad_norm": 2.446164131164551, + "learning_rate": 4.995815840430216e-05, + "loss": 5.6878, + "step": 3098 + }, + { + "epoch": 0.018430630887810447, + "grad_norm": 2.797506093978882, + "learning_rate": 4.995813138675867e-05, + "loss": 5.7675, + "step": 3099 + }, + { + "epoch": 0.018436578171091445, + "grad_norm": 3.2914962768554688, + "learning_rate": 4.995810436050256e-05, + "loss": 6.3661, + "step": 3100 + }, + { + "epoch": 0.018442525454372444, + "grad_norm": 2.444363594055176, + "learning_rate": 4.995807732553384e-05, + "loss": 5.9251, + "step": 3101 + }, + { + "epoch": 0.01844847273765344, + "grad_norm": 2.526951551437378, + "learning_rate": 4.9958050281852505e-05, + "loss": 5.8202, + "step": 3102 + }, + { + "epoch": 0.018454420020934437, + "grad_norm": 2.2046117782592773, + "learning_rate": 4.995802322945857e-05, + "loss": 6.0572, + "step": 3103 + }, + { + "epoch": 0.018460367304215436, + "grad_norm": 2.5484018325805664, + "learning_rate": 4.9957996168352055e-05, + "loss": 6.1215, + "step": 3104 + }, + { + "epoch": 0.01846631458749643, + "grad_norm": 2.4785003662109375, + "learning_rate": 4.9957969098532965e-05, + "loss": 5.9524, + "step": 3105 + }, + { + "epoch": 0.01847226187077743, + "grad_norm": 2.9028711318969727, + "learning_rate": 4.9957942020001294e-05, + "loss": 6.1175, + "step": 3106 + }, + { + "epoch": 0.018478209154058425, + "grad_norm": 2.1766602993011475, + "learning_rate": 4.995791493275707e-05, + "loss": 5.9746, + "step": 3107 + }, + { + "epoch": 0.018484156437339423, + "grad_norm": 2.079423189163208, + "learning_rate": 4.995788783680029e-05, + "loss": 5.9463, + "step": 3108 + }, + { + "epoch": 0.018490103720620422, + "grad_norm": 2.285184144973755, + "learning_rate": 4.995786073213098e-05, + "loss": 5.5174, + "step": 3109 + }, + { + "epoch": 0.018496051003901417, + "grad_norm": 2.170018196105957, + "learning_rate": 4.9957833618749126e-05, + "loss": 5.7948, + "step": 3110 + }, + { + "epoch": 0.018501998287182415, + "grad_norm": 2.284517526626587, + "learning_rate": 4.9957806496654754e-05, + "loss": 5.9455, + "step": 3111 + }, + { + "epoch": 0.018507945570463414, + "grad_norm": 2.5539982318878174, + "learning_rate": 4.9957779365847876e-05, + "loss": 5.9791, + "step": 3112 + }, + { + "epoch": 0.01851389285374441, + "grad_norm": 2.1735522747039795, + "learning_rate": 4.995775222632849e-05, + "loss": 5.9549, + "step": 3113 + }, + { + "epoch": 0.018519840137025408, + "grad_norm": 2.2272653579711914, + "learning_rate": 4.995772507809662e-05, + "loss": 5.8618, + "step": 3114 + }, + { + "epoch": 0.018525787420306403, + "grad_norm": 1.9390417337417603, + "learning_rate": 4.995769792115225e-05, + "loss": 5.9617, + "step": 3115 + }, + { + "epoch": 0.0185317347035874, + "grad_norm": 2.6526312828063965, + "learning_rate": 4.9957670755495414e-05, + "loss": 5.9296, + "step": 3116 + }, + { + "epoch": 0.0185376819868684, + "grad_norm": 2.533996105194092, + "learning_rate": 4.995764358112611e-05, + "loss": 6.0045, + "step": 3117 + }, + { + "epoch": 0.018543629270149395, + "grad_norm": 2.183347225189209, + "learning_rate": 4.995761639804436e-05, + "loss": 5.9254, + "step": 3118 + }, + { + "epoch": 0.018549576553430393, + "grad_norm": 1.9411321878433228, + "learning_rate": 4.995758920625015e-05, + "loss": 5.9404, + "step": 3119 + }, + { + "epoch": 0.01855552383671139, + "grad_norm": 4.914453029632568, + "learning_rate": 4.9957562005743514e-05, + "loss": 5.8139, + "step": 3120 + }, + { + "epoch": 0.018561471119992387, + "grad_norm": 2.3052754402160645, + "learning_rate": 4.9957534796524444e-05, + "loss": 5.6525, + "step": 3121 + }, + { + "epoch": 0.018567418403273386, + "grad_norm": 2.424464464187622, + "learning_rate": 4.995750757859296e-05, + "loss": 5.9599, + "step": 3122 + }, + { + "epoch": 0.01857336568655438, + "grad_norm": 2.1392033100128174, + "learning_rate": 4.995748035194907e-05, + "loss": 5.9558, + "step": 3123 + }, + { + "epoch": 0.01857931296983538, + "grad_norm": 4.67656135559082, + "learning_rate": 4.995745311659278e-05, + "loss": 5.7606, + "step": 3124 + }, + { + "epoch": 0.018585260253116378, + "grad_norm": 2.0772082805633545, + "learning_rate": 4.99574258725241e-05, + "loss": 5.9328, + "step": 3125 + }, + { + "epoch": 0.018591207536397373, + "grad_norm": 2.0255486965179443, + "learning_rate": 4.995739861974303e-05, + "loss": 5.9395, + "step": 3126 + }, + { + "epoch": 0.01859715481967837, + "grad_norm": 2.3629064559936523, + "learning_rate": 4.995737135824961e-05, + "loss": 5.9663, + "step": 3127 + }, + { + "epoch": 0.018603102102959367, + "grad_norm": 1.9924237728118896, + "learning_rate": 4.9957344088043814e-05, + "loss": 5.8998, + "step": 3128 + }, + { + "epoch": 0.018609049386240365, + "grad_norm": 2.096774101257324, + "learning_rate": 4.9957316809125676e-05, + "loss": 5.7178, + "step": 3129 + }, + { + "epoch": 0.018614996669521364, + "grad_norm": 2.2288100719451904, + "learning_rate": 4.9957289521495194e-05, + "loss": 5.9096, + "step": 3130 + }, + { + "epoch": 0.01862094395280236, + "grad_norm": 2.456099033355713, + "learning_rate": 4.995726222515238e-05, + "loss": 5.7738, + "step": 3131 + }, + { + "epoch": 0.018626891236083357, + "grad_norm": 2.238218069076538, + "learning_rate": 4.995723492009724e-05, + "loss": 5.6929, + "step": 3132 + }, + { + "epoch": 0.018632838519364356, + "grad_norm": 1.8309845924377441, + "learning_rate": 4.9957207606329795e-05, + "loss": 5.9339, + "step": 3133 + }, + { + "epoch": 0.01863878580264535, + "grad_norm": 1.9269503355026245, + "learning_rate": 4.995718028385003e-05, + "loss": 5.9704, + "step": 3134 + }, + { + "epoch": 0.01864473308592635, + "grad_norm": 2.0929813385009766, + "learning_rate": 4.9957152952657995e-05, + "loss": 5.7598, + "step": 3135 + }, + { + "epoch": 0.018650680369207345, + "grad_norm": 2.2813265323638916, + "learning_rate": 4.995712561275366e-05, + "loss": 5.7986, + "step": 3136 + }, + { + "epoch": 0.018656627652488343, + "grad_norm": 2.1189653873443604, + "learning_rate": 4.995709826413705e-05, + "loss": 5.6603, + "step": 3137 + }, + { + "epoch": 0.01866257493576934, + "grad_norm": 2.1439480781555176, + "learning_rate": 4.9957070906808185e-05, + "loss": 5.6952, + "step": 3138 + }, + { + "epoch": 0.018668522219050337, + "grad_norm": 2.4345993995666504, + "learning_rate": 4.995704354076706e-05, + "loss": 5.7531, + "step": 3139 + }, + { + "epoch": 0.018674469502331335, + "grad_norm": 2.5551047325134277, + "learning_rate": 4.995701616601368e-05, + "loss": 5.544, + "step": 3140 + }, + { + "epoch": 0.018680416785612334, + "grad_norm": 2.333603620529175, + "learning_rate": 4.9956988782548075e-05, + "loss": 5.5732, + "step": 3141 + }, + { + "epoch": 0.01868636406889333, + "grad_norm": 2.2983827590942383, + "learning_rate": 4.995696139037024e-05, + "loss": 5.8779, + "step": 3142 + }, + { + "epoch": 0.018692311352174328, + "grad_norm": 2.7525672912597656, + "learning_rate": 4.995693398948018e-05, + "loss": 5.5998, + "step": 3143 + }, + { + "epoch": 0.018698258635455323, + "grad_norm": 2.3622052669525146, + "learning_rate": 4.995690657987793e-05, + "loss": 5.8851, + "step": 3144 + }, + { + "epoch": 0.01870420591873632, + "grad_norm": 2.4975669384002686, + "learning_rate": 4.995687916156346e-05, + "loss": 5.6388, + "step": 3145 + }, + { + "epoch": 0.01871015320201732, + "grad_norm": 2.5763049125671387, + "learning_rate": 4.9956851734536816e-05, + "loss": 5.4931, + "step": 3146 + }, + { + "epoch": 0.018716100485298315, + "grad_norm": 2.7156779766082764, + "learning_rate": 4.995682429879799e-05, + "loss": 5.8035, + "step": 3147 + }, + { + "epoch": 0.018722047768579313, + "grad_norm": 2.259134292602539, + "learning_rate": 4.995679685434699e-05, + "loss": 5.9519, + "step": 3148 + }, + { + "epoch": 0.018727995051860312, + "grad_norm": 2.544829845428467, + "learning_rate": 4.995676940118383e-05, + "loss": 5.7373, + "step": 3149 + }, + { + "epoch": 0.018733942335141307, + "grad_norm": 2.326660633087158, + "learning_rate": 4.995674193930853e-05, + "loss": 5.7719, + "step": 3150 + }, + { + "epoch": 0.018739889618422306, + "grad_norm": 2.25370192527771, + "learning_rate": 4.995671446872108e-05, + "loss": 5.813, + "step": 3151 + }, + { + "epoch": 0.0187458369017033, + "grad_norm": 2.1467692852020264, + "learning_rate": 4.99566869894215e-05, + "loss": 5.5836, + "step": 3152 + }, + { + "epoch": 0.0187517841849843, + "grad_norm": 2.30096697807312, + "learning_rate": 4.9956659501409796e-05, + "loss": 5.8249, + "step": 3153 + }, + { + "epoch": 0.018757731468265298, + "grad_norm": 2.3050386905670166, + "learning_rate": 4.9956632004685986e-05, + "loss": 5.6806, + "step": 3154 + }, + { + "epoch": 0.018763678751546293, + "grad_norm": 2.473008632659912, + "learning_rate": 4.995660449925007e-05, + "loss": 5.4512, + "step": 3155 + }, + { + "epoch": 0.01876962603482729, + "grad_norm": 2.0691702365875244, + "learning_rate": 4.995657698510206e-05, + "loss": 5.6582, + "step": 3156 + }, + { + "epoch": 0.018775573318108287, + "grad_norm": 2.332423686981201, + "learning_rate": 4.995654946224197e-05, + "loss": 5.6017, + "step": 3157 + }, + { + "epoch": 0.018781520601389285, + "grad_norm": 2.6423730850219727, + "learning_rate": 4.9956521930669806e-05, + "loss": 5.619, + "step": 3158 + }, + { + "epoch": 0.018787467884670284, + "grad_norm": 3.0884950160980225, + "learning_rate": 4.995649439038558e-05, + "loss": 5.7813, + "step": 3159 + }, + { + "epoch": 0.01879341516795128, + "grad_norm": 2.4923598766326904, + "learning_rate": 4.995646684138929e-05, + "loss": 5.8089, + "step": 3160 + }, + { + "epoch": 0.018799362451232277, + "grad_norm": 2.5505683422088623, + "learning_rate": 4.9956439283680965e-05, + "loss": 5.8171, + "step": 3161 + }, + { + "epoch": 0.018805309734513276, + "grad_norm": 2.7343056201934814, + "learning_rate": 4.99564117172606e-05, + "loss": 6.3472, + "step": 3162 + }, + { + "epoch": 0.01881125701779427, + "grad_norm": 2.9170796871185303, + "learning_rate": 4.995638414212821e-05, + "loss": 5.7478, + "step": 3163 + }, + { + "epoch": 0.01881720430107527, + "grad_norm": 2.392648696899414, + "learning_rate": 4.9956356558283815e-05, + "loss": 5.8105, + "step": 3164 + }, + { + "epoch": 0.018823151584356265, + "grad_norm": 2.532207727432251, + "learning_rate": 4.9956328965727394e-05, + "loss": 5.9285, + "step": 3165 + }, + { + "epoch": 0.018829098867637263, + "grad_norm": 2.6717050075531006, + "learning_rate": 4.995630136445899e-05, + "loss": 6.0344, + "step": 3166 + }, + { + "epoch": 0.01883504615091826, + "grad_norm": 2.1829564571380615, + "learning_rate": 4.99562737544786e-05, + "loss": 6.0078, + "step": 3167 + }, + { + "epoch": 0.018840993434199257, + "grad_norm": 2.2728323936462402, + "learning_rate": 4.995624613578622e-05, + "loss": 5.8211, + "step": 3168 + }, + { + "epoch": 0.018846940717480255, + "grad_norm": 2.046717882156372, + "learning_rate": 4.995621850838189e-05, + "loss": 5.9685, + "step": 3169 + }, + { + "epoch": 0.018852888000761254, + "grad_norm": 2.737494945526123, + "learning_rate": 4.995619087226559e-05, + "loss": 5.649, + "step": 3170 + }, + { + "epoch": 0.01885883528404225, + "grad_norm": 2.276503801345825, + "learning_rate": 4.9956163227437345e-05, + "loss": 5.8137, + "step": 3171 + }, + { + "epoch": 0.018864782567323247, + "grad_norm": 2.2799227237701416, + "learning_rate": 4.9956135573897155e-05, + "loss": 5.8277, + "step": 3172 + }, + { + "epoch": 0.018870729850604243, + "grad_norm": 2.131425619125366, + "learning_rate": 4.995610791164505e-05, + "loss": 5.8909, + "step": 3173 + }, + { + "epoch": 0.01887667713388524, + "grad_norm": 2.2295737266540527, + "learning_rate": 4.995608024068102e-05, + "loss": 5.8236, + "step": 3174 + }, + { + "epoch": 0.01888262441716624, + "grad_norm": 2.30082631111145, + "learning_rate": 4.9956052561005076e-05, + "loss": 5.7331, + "step": 3175 + }, + { + "epoch": 0.018888571700447235, + "grad_norm": 2.751847505569458, + "learning_rate": 4.9956024872617225e-05, + "loss": 5.8673, + "step": 3176 + }, + { + "epoch": 0.018894518983728233, + "grad_norm": 2.4597535133361816, + "learning_rate": 4.995599717551749e-05, + "loss": 5.7561, + "step": 3177 + }, + { + "epoch": 0.018900466267009232, + "grad_norm": 2.1418228149414062, + "learning_rate": 4.9955969469705874e-05, + "loss": 5.7112, + "step": 3178 + }, + { + "epoch": 0.018906413550290227, + "grad_norm": 2.0560619831085205, + "learning_rate": 4.9955941755182395e-05, + "loss": 5.7764, + "step": 3179 + }, + { + "epoch": 0.018912360833571226, + "grad_norm": 2.268781900405884, + "learning_rate": 4.9955914031947046e-05, + "loss": 5.7319, + "step": 3180 + }, + { + "epoch": 0.01891830811685222, + "grad_norm": 2.6272811889648438, + "learning_rate": 4.995588629999985e-05, + "loss": 6.0601, + "step": 3181 + }, + { + "epoch": 0.01892425540013322, + "grad_norm": 2.1991870403289795, + "learning_rate": 4.995585855934081e-05, + "loss": 5.602, + "step": 3182 + }, + { + "epoch": 0.018930202683414218, + "grad_norm": 2.0521514415740967, + "learning_rate": 4.995583080996994e-05, + "loss": 5.8075, + "step": 3183 + }, + { + "epoch": 0.018936149966695213, + "grad_norm": 2.153473138809204, + "learning_rate": 4.995580305188724e-05, + "loss": 5.8219, + "step": 3184 + }, + { + "epoch": 0.01894209724997621, + "grad_norm": 2.0663251876831055, + "learning_rate": 4.9955775285092735e-05, + "loss": 5.836, + "step": 3185 + }, + { + "epoch": 0.018948044533257206, + "grad_norm": 1.8808318376541138, + "learning_rate": 4.995574750958642e-05, + "loss": 5.7938, + "step": 3186 + }, + { + "epoch": 0.018953991816538205, + "grad_norm": 2.256012201309204, + "learning_rate": 4.995571972536831e-05, + "loss": 5.6404, + "step": 3187 + }, + { + "epoch": 0.018959939099819204, + "grad_norm": 2.29636287689209, + "learning_rate": 4.995569193243843e-05, + "loss": 5.7161, + "step": 3188 + }, + { + "epoch": 0.0189658863831002, + "grad_norm": 2.728804588317871, + "learning_rate": 4.995566413079676e-05, + "loss": 5.8165, + "step": 3189 + }, + { + "epoch": 0.018971833666381197, + "grad_norm": 2.3115599155426025, + "learning_rate": 4.995563632044333e-05, + "loss": 5.7004, + "step": 3190 + }, + { + "epoch": 0.018977780949662196, + "grad_norm": 2.1607725620269775, + "learning_rate": 4.995560850137815e-05, + "loss": 5.7788, + "step": 3191 + }, + { + "epoch": 0.01898372823294319, + "grad_norm": 2.322132110595703, + "learning_rate": 4.995558067360122e-05, + "loss": 5.5677, + "step": 3192 + }, + { + "epoch": 0.01898967551622419, + "grad_norm": 2.148022174835205, + "learning_rate": 4.995555283711256e-05, + "loss": 5.7708, + "step": 3193 + }, + { + "epoch": 0.018995622799505184, + "grad_norm": 2.339812994003296, + "learning_rate": 4.9955524991912165e-05, + "loss": 5.7945, + "step": 3194 + }, + { + "epoch": 0.019001570082786183, + "grad_norm": 1.9469980001449585, + "learning_rate": 4.995549713800006e-05, + "loss": 5.695, + "step": 3195 + }, + { + "epoch": 0.01900751736606718, + "grad_norm": 2.1744890213012695, + "learning_rate": 4.9955469275376254e-05, + "loss": 5.7544, + "step": 3196 + }, + { + "epoch": 0.019013464649348177, + "grad_norm": 2.175123691558838, + "learning_rate": 4.9955441404040745e-05, + "loss": 5.598, + "step": 3197 + }, + { + "epoch": 0.019019411932629175, + "grad_norm": 2.3011369705200195, + "learning_rate": 4.995541352399355e-05, + "loss": 5.7069, + "step": 3198 + }, + { + "epoch": 0.019025359215910174, + "grad_norm": 2.2227025032043457, + "learning_rate": 4.9955385635234675e-05, + "loss": 5.6854, + "step": 3199 + }, + { + "epoch": 0.01903130649919117, + "grad_norm": 2.5465073585510254, + "learning_rate": 4.995535773776414e-05, + "loss": 5.9085, + "step": 3200 + }, + { + "epoch": 0.019037253782472167, + "grad_norm": 2.936612844467163, + "learning_rate": 4.995532983158194e-05, + "loss": 6.0519, + "step": 3201 + }, + { + "epoch": 0.019043201065753163, + "grad_norm": 2.8298418521881104, + "learning_rate": 4.9955301916688094e-05, + "loss": 5.9473, + "step": 3202 + }, + { + "epoch": 0.01904914834903416, + "grad_norm": 2.2295944690704346, + "learning_rate": 4.9955273993082615e-05, + "loss": 5.9652, + "step": 3203 + }, + { + "epoch": 0.01905509563231516, + "grad_norm": 2.7771801948547363, + "learning_rate": 4.9955246060765505e-05, + "loss": 5.9291, + "step": 3204 + }, + { + "epoch": 0.019061042915596155, + "grad_norm": 3.0721678733825684, + "learning_rate": 4.9955218119736776e-05, + "loss": 6.2319, + "step": 3205 + }, + { + "epoch": 0.019066990198877153, + "grad_norm": 2.7866547107696533, + "learning_rate": 4.9955190169996434e-05, + "loss": 6.0412, + "step": 3206 + }, + { + "epoch": 0.019072937482158152, + "grad_norm": 2.287216901779175, + "learning_rate": 4.99551622115445e-05, + "loss": 5.6435, + "step": 3207 + }, + { + "epoch": 0.019078884765439147, + "grad_norm": 2.3618898391723633, + "learning_rate": 4.995513424438098e-05, + "loss": 5.7711, + "step": 3208 + }, + { + "epoch": 0.019084832048720145, + "grad_norm": 2.192997932434082, + "learning_rate": 4.995510626850587e-05, + "loss": 5.8351, + "step": 3209 + }, + { + "epoch": 0.01909077933200114, + "grad_norm": 2.252722978591919, + "learning_rate": 4.995507828391919e-05, + "loss": 5.5989, + "step": 3210 + }, + { + "epoch": 0.01909672661528214, + "grad_norm": 2.451167106628418, + "learning_rate": 4.995505029062095e-05, + "loss": 5.8533, + "step": 3211 + }, + { + "epoch": 0.019102673898563138, + "grad_norm": 2.1897904872894287, + "learning_rate": 4.995502228861116e-05, + "loss": 6.2807, + "step": 3212 + }, + { + "epoch": 0.019108621181844133, + "grad_norm": 2.196805715560913, + "learning_rate": 4.995499427788984e-05, + "loss": 5.9418, + "step": 3213 + }, + { + "epoch": 0.01911456846512513, + "grad_norm": 1.9791160821914673, + "learning_rate": 4.995496625845698e-05, + "loss": 5.9909, + "step": 3214 + }, + { + "epoch": 0.019120515748406126, + "grad_norm": 2.3592171669006348, + "learning_rate": 4.995493823031261e-05, + "loss": 5.807, + "step": 3215 + }, + { + "epoch": 0.019126463031687125, + "grad_norm": 2.8238747119903564, + "learning_rate": 4.9954910193456713e-05, + "loss": 5.7587, + "step": 3216 + }, + { + "epoch": 0.019132410314968123, + "grad_norm": 2.4695584774017334, + "learning_rate": 4.9954882147889326e-05, + "loss": 5.746, + "step": 3217 + }, + { + "epoch": 0.01913835759824912, + "grad_norm": 2.3983800411224365, + "learning_rate": 4.995485409361044e-05, + "loss": 5.9364, + "step": 3218 + }, + { + "epoch": 0.019144304881530117, + "grad_norm": 2.1279618740081787, + "learning_rate": 4.995482603062008e-05, + "loss": 5.9383, + "step": 3219 + }, + { + "epoch": 0.019150252164811116, + "grad_norm": 18.583581924438477, + "learning_rate": 4.9954797958918244e-05, + "loss": 5.8596, + "step": 3220 + }, + { + "epoch": 0.01915619944809211, + "grad_norm": 2.1420741081237793, + "learning_rate": 4.995476987850495e-05, + "loss": 5.9311, + "step": 3221 + }, + { + "epoch": 0.01916214673137311, + "grad_norm": 2.314380645751953, + "learning_rate": 4.99547417893802e-05, + "loss": 5.8229, + "step": 3222 + }, + { + "epoch": 0.019168094014654104, + "grad_norm": 2.3818936347961426, + "learning_rate": 4.9954713691544004e-05, + "loss": 6.1124, + "step": 3223 + }, + { + "epoch": 0.019174041297935103, + "grad_norm": 2.521789789199829, + "learning_rate": 4.9954685584996377e-05, + "loss": 5.8939, + "step": 3224 + }, + { + "epoch": 0.0191799885812161, + "grad_norm": 1.9583165645599365, + "learning_rate": 4.9954657469737334e-05, + "loss": 6.0005, + "step": 3225 + }, + { + "epoch": 0.019185935864497097, + "grad_norm": 2.349581241607666, + "learning_rate": 4.995462934576687e-05, + "loss": 5.8467, + "step": 3226 + }, + { + "epoch": 0.019191883147778095, + "grad_norm": 2.081836223602295, + "learning_rate": 4.9954601213085e-05, + "loss": 6.1001, + "step": 3227 + }, + { + "epoch": 0.019197830431059094, + "grad_norm": 2.3207972049713135, + "learning_rate": 4.995457307169175e-05, + "loss": 5.794, + "step": 3228 + }, + { + "epoch": 0.01920377771434009, + "grad_norm": 1.8516380786895752, + "learning_rate": 4.99545449215871e-05, + "loss": 5.785, + "step": 3229 + }, + { + "epoch": 0.019209724997621087, + "grad_norm": 2.3822309970855713, + "learning_rate": 4.995451676277109e-05, + "loss": 5.7861, + "step": 3230 + }, + { + "epoch": 0.019215672280902082, + "grad_norm": 2.857161283493042, + "learning_rate": 4.995448859524371e-05, + "loss": 5.8333, + "step": 3231 + }, + { + "epoch": 0.01922161956418308, + "grad_norm": 2.201551914215088, + "learning_rate": 4.9954460419004974e-05, + "loss": 5.8653, + "step": 3232 + }, + { + "epoch": 0.01922756684746408, + "grad_norm": 2.1707022190093994, + "learning_rate": 4.995443223405489e-05, + "loss": 5.772, + "step": 3233 + }, + { + "epoch": 0.019233514130745075, + "grad_norm": 2.1242458820343018, + "learning_rate": 4.995440404039348e-05, + "loss": 5.8806, + "step": 3234 + }, + { + "epoch": 0.019239461414026073, + "grad_norm": 2.106945514678955, + "learning_rate": 4.995437583802074e-05, + "loss": 5.6746, + "step": 3235 + }, + { + "epoch": 0.019245408697307072, + "grad_norm": 2.083181858062744, + "learning_rate": 4.995434762693669e-05, + "loss": 5.9332, + "step": 3236 + }, + { + "epoch": 0.019251355980588067, + "grad_norm": 2.1857783794403076, + "learning_rate": 4.995431940714134e-05, + "loss": 5.6663, + "step": 3237 + }, + { + "epoch": 0.019257303263869065, + "grad_norm": 2.031041145324707, + "learning_rate": 4.995429117863468e-05, + "loss": 5.6734, + "step": 3238 + }, + { + "epoch": 0.01926325054715006, + "grad_norm": 2.31980037689209, + "learning_rate": 4.995426294141674e-05, + "loss": 5.8851, + "step": 3239 + }, + { + "epoch": 0.01926919783043106, + "grad_norm": 2.102965831756592, + "learning_rate": 4.9954234695487535e-05, + "loss": 5.7092, + "step": 3240 + }, + { + "epoch": 0.019275145113712058, + "grad_norm": 2.031169891357422, + "learning_rate": 4.995420644084705e-05, + "loss": 5.9755, + "step": 3241 + }, + { + "epoch": 0.019281092396993053, + "grad_norm": 2.2460241317749023, + "learning_rate": 4.995417817749532e-05, + "loss": 5.8895, + "step": 3242 + }, + { + "epoch": 0.01928703968027405, + "grad_norm": 2.618539571762085, + "learning_rate": 4.9954149905432336e-05, + "loss": 5.6964, + "step": 3243 + }, + { + "epoch": 0.019292986963555046, + "grad_norm": 2.1615748405456543, + "learning_rate": 4.995412162465812e-05, + "loss": 5.7162, + "step": 3244 + }, + { + "epoch": 0.019298934246836045, + "grad_norm": 2.363663673400879, + "learning_rate": 4.995409333517268e-05, + "loss": 5.7957, + "step": 3245 + }, + { + "epoch": 0.019304881530117043, + "grad_norm": 2.131084680557251, + "learning_rate": 4.9954065036976025e-05, + "loss": 5.7925, + "step": 3246 + }, + { + "epoch": 0.01931082881339804, + "grad_norm": 2.4043118953704834, + "learning_rate": 4.9954036730068155e-05, + "loss": 5.7895, + "step": 3247 + }, + { + "epoch": 0.019316776096679037, + "grad_norm": 2.521756887435913, + "learning_rate": 4.995400841444909e-05, + "loss": 5.6279, + "step": 3248 + }, + { + "epoch": 0.019322723379960036, + "grad_norm": 2.1791021823883057, + "learning_rate": 4.9953980090118846e-05, + "loss": 5.717, + "step": 3249 + }, + { + "epoch": 0.01932867066324103, + "grad_norm": 2.6562376022338867, + "learning_rate": 4.995395175707742e-05, + "loss": 5.7407, + "step": 3250 + }, + { + "epoch": 0.01933461794652203, + "grad_norm": 2.4377942085266113, + "learning_rate": 4.995392341532483e-05, + "loss": 5.539, + "step": 3251 + }, + { + "epoch": 0.019340565229803024, + "grad_norm": 2.3716847896575928, + "learning_rate": 4.995389506486109e-05, + "loss": 5.7251, + "step": 3252 + }, + { + "epoch": 0.019346512513084023, + "grad_norm": 2.2509348392486572, + "learning_rate": 4.995386670568619e-05, + "loss": 5.8749, + "step": 3253 + }, + { + "epoch": 0.01935245979636502, + "grad_norm": 2.265608072280884, + "learning_rate": 4.995383833780016e-05, + "loss": 5.8236, + "step": 3254 + }, + { + "epoch": 0.019358407079646017, + "grad_norm": 1.972179651260376, + "learning_rate": 4.9953809961203e-05, + "loss": 5.9235, + "step": 3255 + }, + { + "epoch": 0.019364354362927015, + "grad_norm": 2.314030170440674, + "learning_rate": 4.9953781575894723e-05, + "loss": 5.7355, + "step": 3256 + }, + { + "epoch": 0.019370301646208014, + "grad_norm": 2.3061349391937256, + "learning_rate": 4.995375318187534e-05, + "loss": 5.7337, + "step": 3257 + }, + { + "epoch": 0.01937624892948901, + "grad_norm": 1.9106477499008179, + "learning_rate": 4.9953724779144864e-05, + "loss": 5.8342, + "step": 3258 + }, + { + "epoch": 0.019382196212770007, + "grad_norm": 2.313750982284546, + "learning_rate": 4.9953696367703296e-05, + "loss": 5.7981, + "step": 3259 + }, + { + "epoch": 0.019388143496051002, + "grad_norm": 2.4477834701538086, + "learning_rate": 4.9953667947550644e-05, + "loss": 5.8212, + "step": 3260 + }, + { + "epoch": 0.019394090779332, + "grad_norm": 2.072659730911255, + "learning_rate": 4.9953639518686936e-05, + "loss": 5.7335, + "step": 3261 + }, + { + "epoch": 0.019400038062613, + "grad_norm": 2.0848984718322754, + "learning_rate": 4.995361108111216e-05, + "loss": 5.7427, + "step": 3262 + }, + { + "epoch": 0.019405985345893995, + "grad_norm": 1.938265323638916, + "learning_rate": 4.9953582634826345e-05, + "loss": 5.7946, + "step": 3263 + }, + { + "epoch": 0.019411932629174993, + "grad_norm": 2.227194309234619, + "learning_rate": 4.995355417982949e-05, + "loss": 5.9095, + "step": 3264 + }, + { + "epoch": 0.01941787991245599, + "grad_norm": 2.3245849609375, + "learning_rate": 4.9953525716121604e-05, + "loss": 5.802, + "step": 3265 + }, + { + "epoch": 0.019423827195736987, + "grad_norm": 2.08950138092041, + "learning_rate": 4.9953497243702696e-05, + "loss": 5.9001, + "step": 3266 + }, + { + "epoch": 0.019429774479017985, + "grad_norm": 1.93153715133667, + "learning_rate": 4.9953468762572786e-05, + "loss": 5.9042, + "step": 3267 + }, + { + "epoch": 0.01943572176229898, + "grad_norm": 2.4099066257476807, + "learning_rate": 4.9953440272731874e-05, + "loss": 5.8181, + "step": 3268 + }, + { + "epoch": 0.01944166904557998, + "grad_norm": 2.078752279281616, + "learning_rate": 4.995341177417998e-05, + "loss": 5.8771, + "step": 3269 + }, + { + "epoch": 0.019447616328860978, + "grad_norm": 2.012592077255249, + "learning_rate": 4.9953383266917106e-05, + "loss": 5.8135, + "step": 3270 + }, + { + "epoch": 0.019453563612141973, + "grad_norm": 2.0364151000976562, + "learning_rate": 4.995335475094326e-05, + "loss": 5.8767, + "step": 3271 + }, + { + "epoch": 0.01945951089542297, + "grad_norm": 2.0447049140930176, + "learning_rate": 4.995332622625846e-05, + "loss": 5.8236, + "step": 3272 + }, + { + "epoch": 0.01946545817870397, + "grad_norm": 2.2354300022125244, + "learning_rate": 4.995329769286271e-05, + "loss": 5.7794, + "step": 3273 + }, + { + "epoch": 0.019471405461984965, + "grad_norm": 2.031331777572632, + "learning_rate": 4.995326915075602e-05, + "loss": 5.87, + "step": 3274 + }, + { + "epoch": 0.019477352745265963, + "grad_norm": 2.2116496562957764, + "learning_rate": 4.99532405999384e-05, + "loss": 5.885, + "step": 3275 + }, + { + "epoch": 0.01948330002854696, + "grad_norm": 1.9008034467697144, + "learning_rate": 4.995321204040987e-05, + "loss": 5.8646, + "step": 3276 + }, + { + "epoch": 0.019489247311827957, + "grad_norm": 2.1743087768554688, + "learning_rate": 4.995318347217042e-05, + "loss": 5.9742, + "step": 3277 + }, + { + "epoch": 0.019495194595108956, + "grad_norm": 2.09171724319458, + "learning_rate": 4.995315489522008e-05, + "loss": 5.882, + "step": 3278 + }, + { + "epoch": 0.01950114187838995, + "grad_norm": 1.816938042640686, + "learning_rate": 4.995312630955885e-05, + "loss": 5.9164, + "step": 3279 + }, + { + "epoch": 0.01950708916167095, + "grad_norm": 2.065207004547119, + "learning_rate": 4.995309771518674e-05, + "loss": 5.9273, + "step": 3280 + }, + { + "epoch": 0.019513036444951944, + "grad_norm": 2.1037240028381348, + "learning_rate": 4.9953069112103757e-05, + "loss": 5.863, + "step": 3281 + }, + { + "epoch": 0.019518983728232943, + "grad_norm": 2.011705160140991, + "learning_rate": 4.995304050030992e-05, + "loss": 5.712, + "step": 3282 + }, + { + "epoch": 0.01952493101151394, + "grad_norm": 2.2053868770599365, + "learning_rate": 4.995301187980523e-05, + "loss": 5.6988, + "step": 3283 + }, + { + "epoch": 0.019530878294794937, + "grad_norm": 2.0522396564483643, + "learning_rate": 4.995298325058971e-05, + "loss": 5.6831, + "step": 3284 + }, + { + "epoch": 0.019536825578075935, + "grad_norm": 1.9751875400543213, + "learning_rate": 4.995295461266336e-05, + "loss": 6.0187, + "step": 3285 + }, + { + "epoch": 0.019542772861356934, + "grad_norm": 2.79711651802063, + "learning_rate": 4.9952925966026185e-05, + "loss": 6.4995, + "step": 3286 + }, + { + "epoch": 0.01954872014463793, + "grad_norm": 2.1059019565582275, + "learning_rate": 4.9952897310678206e-05, + "loss": 5.9603, + "step": 3287 + }, + { + "epoch": 0.019554667427918927, + "grad_norm": 2.169428825378418, + "learning_rate": 4.995286864661942e-05, + "loss": 5.7973, + "step": 3288 + }, + { + "epoch": 0.019560614711199922, + "grad_norm": 2.165508985519409, + "learning_rate": 4.995283997384985e-05, + "loss": 5.9132, + "step": 3289 + }, + { + "epoch": 0.01956656199448092, + "grad_norm": 2.248450994491577, + "learning_rate": 4.9952811292369506e-05, + "loss": 5.8202, + "step": 3290 + }, + { + "epoch": 0.01957250927776192, + "grad_norm": 2.3068084716796875, + "learning_rate": 4.9952782602178394e-05, + "loss": 5.8223, + "step": 3291 + }, + { + "epoch": 0.019578456561042915, + "grad_norm": 2.0434954166412354, + "learning_rate": 4.9952753903276516e-05, + "loss": 5.6231, + "step": 3292 + }, + { + "epoch": 0.019584403844323913, + "grad_norm": 2.136564254760742, + "learning_rate": 4.9952725195663895e-05, + "loss": 5.9859, + "step": 3293 + }, + { + "epoch": 0.01959035112760491, + "grad_norm": 2.6265337467193604, + "learning_rate": 4.9952696479340535e-05, + "loss": 5.9126, + "step": 3294 + }, + { + "epoch": 0.019596298410885907, + "grad_norm": 2.442678928375244, + "learning_rate": 4.9952667754306445e-05, + "loss": 5.9361, + "step": 3295 + }, + { + "epoch": 0.019602245694166905, + "grad_norm": 2.0740134716033936, + "learning_rate": 4.9952639020561644e-05, + "loss": 5.913, + "step": 3296 + }, + { + "epoch": 0.0196081929774479, + "grad_norm": 2.4088518619537354, + "learning_rate": 4.995261027810612e-05, + "loss": 5.8297, + "step": 3297 + }, + { + "epoch": 0.0196141402607289, + "grad_norm": 2.1514804363250732, + "learning_rate": 4.995258152693991e-05, + "loss": 5.8256, + "step": 3298 + }, + { + "epoch": 0.019620087544009897, + "grad_norm": 2.921570062637329, + "learning_rate": 4.9952552767063e-05, + "loss": 6.0243, + "step": 3299 + }, + { + "epoch": 0.019626034827290893, + "grad_norm": 2.398749828338623, + "learning_rate": 4.995252399847542e-05, + "loss": 6.004, + "step": 3300 + }, + { + "epoch": 0.01963198211057189, + "grad_norm": 2.2024805545806885, + "learning_rate": 4.995249522117717e-05, + "loss": 5.9201, + "step": 3301 + }, + { + "epoch": 0.01963792939385289, + "grad_norm": 2.112269401550293, + "learning_rate": 4.9952466435168266e-05, + "loss": 5.8488, + "step": 3302 + }, + { + "epoch": 0.019643876677133885, + "grad_norm": 2.04632568359375, + "learning_rate": 4.99524376404487e-05, + "loss": 5.8054, + "step": 3303 + }, + { + "epoch": 0.019649823960414883, + "grad_norm": 2.6293606758117676, + "learning_rate": 4.995240883701851e-05, + "loss": 5.6799, + "step": 3304 + }, + { + "epoch": 0.01965577124369588, + "grad_norm": 2.5172793865203857, + "learning_rate": 4.995238002487769e-05, + "loss": 5.712, + "step": 3305 + }, + { + "epoch": 0.019661718526976877, + "grad_norm": 2.549194097518921, + "learning_rate": 4.995235120402625e-05, + "loss": 5.7208, + "step": 3306 + }, + { + "epoch": 0.019667665810257876, + "grad_norm": 2.2993295192718506, + "learning_rate": 4.99523223744642e-05, + "loss": 5.7952, + "step": 3307 + }, + { + "epoch": 0.01967361309353887, + "grad_norm": 2.1270902156829834, + "learning_rate": 4.9952293536191555e-05, + "loss": 5.6988, + "step": 3308 + }, + { + "epoch": 0.01967956037681987, + "grad_norm": 2.349858283996582, + "learning_rate": 4.9952264689208315e-05, + "loss": 5.623, + "step": 3309 + }, + { + "epoch": 0.019685507660100864, + "grad_norm": 2.1501529216766357, + "learning_rate": 4.9952235833514506e-05, + "loss": 5.6498, + "step": 3310 + }, + { + "epoch": 0.019691454943381863, + "grad_norm": 2.0577821731567383, + "learning_rate": 4.995220696911012e-05, + "loss": 5.6863, + "step": 3311 + }, + { + "epoch": 0.01969740222666286, + "grad_norm": 2.0787386894226074, + "learning_rate": 4.9952178095995185e-05, + "loss": 5.6314, + "step": 3312 + }, + { + "epoch": 0.019703349509943856, + "grad_norm": 2.4042680263519287, + "learning_rate": 4.99521492141697e-05, + "loss": 5.6152, + "step": 3313 + }, + { + "epoch": 0.019709296793224855, + "grad_norm": 2.444410800933838, + "learning_rate": 4.995212032363368e-05, + "loss": 5.5375, + "step": 3314 + }, + { + "epoch": 0.019715244076505854, + "grad_norm": 2.1678028106689453, + "learning_rate": 4.995209142438712e-05, + "loss": 5.6239, + "step": 3315 + }, + { + "epoch": 0.01972119135978685, + "grad_norm": 2.5436410903930664, + "learning_rate": 4.9952062516430054e-05, + "loss": 5.4234, + "step": 3316 + }, + { + "epoch": 0.019727138643067847, + "grad_norm": 2.454561471939087, + "learning_rate": 4.9952033599762484e-05, + "loss": 5.4198, + "step": 3317 + }, + { + "epoch": 0.019733085926348842, + "grad_norm": 2.388125419616699, + "learning_rate": 4.9952004674384413e-05, + "loss": 5.5073, + "step": 3318 + }, + { + "epoch": 0.01973903320962984, + "grad_norm": 2.1900579929351807, + "learning_rate": 4.995197574029585e-05, + "loss": 5.3463, + "step": 3319 + }, + { + "epoch": 0.01974498049291084, + "grad_norm": 2.5625739097595215, + "learning_rate": 4.995194679749681e-05, + "loss": 5.4291, + "step": 3320 + }, + { + "epoch": 0.019750927776191834, + "grad_norm": 2.52402400970459, + "learning_rate": 4.995191784598731e-05, + "loss": 5.3826, + "step": 3321 + }, + { + "epoch": 0.019756875059472833, + "grad_norm": 2.5888168811798096, + "learning_rate": 4.995188888576735e-05, + "loss": 5.381, + "step": 3322 + }, + { + "epoch": 0.01976282234275383, + "grad_norm": 2.637080669403076, + "learning_rate": 4.995185991683694e-05, + "loss": 5.3321, + "step": 3323 + }, + { + "epoch": 0.019768769626034827, + "grad_norm": 2.46553111076355, + "learning_rate": 4.9951830939196095e-05, + "loss": 5.3663, + "step": 3324 + }, + { + "epoch": 0.019774716909315825, + "grad_norm": 2.2397992610931396, + "learning_rate": 4.9951801952844826e-05, + "loss": 5.3237, + "step": 3325 + }, + { + "epoch": 0.01978066419259682, + "grad_norm": 2.3519208431243896, + "learning_rate": 4.9951772957783144e-05, + "loss": 5.4166, + "step": 3326 + }, + { + "epoch": 0.01978661147587782, + "grad_norm": 2.6235291957855225, + "learning_rate": 4.9951743954011056e-05, + "loss": 5.8094, + "step": 3327 + }, + { + "epoch": 0.019792558759158817, + "grad_norm": 2.162285327911377, + "learning_rate": 4.995171494152856e-05, + "loss": 5.6491, + "step": 3328 + }, + { + "epoch": 0.019798506042439813, + "grad_norm": 2.231853485107422, + "learning_rate": 4.995168592033569e-05, + "loss": 5.69, + "step": 3329 + }, + { + "epoch": 0.01980445332572081, + "grad_norm": 2.7305827140808105, + "learning_rate": 4.995165689043244e-05, + "loss": 5.5028, + "step": 3330 + }, + { + "epoch": 0.01981040060900181, + "grad_norm": 2.9917726516723633, + "learning_rate": 4.9951627851818824e-05, + "loss": 5.3227, + "step": 3331 + }, + { + "epoch": 0.019816347892282805, + "grad_norm": 3.0039985179901123, + "learning_rate": 4.995159880449486e-05, + "loss": 5.5965, + "step": 3332 + }, + { + "epoch": 0.019822295175563803, + "grad_norm": 3.081099510192871, + "learning_rate": 4.995156974846054e-05, + "loss": 5.6945, + "step": 3333 + }, + { + "epoch": 0.0198282424588448, + "grad_norm": 2.042445182800293, + "learning_rate": 4.995154068371589e-05, + "loss": 5.693, + "step": 3334 + }, + { + "epoch": 0.019834189742125797, + "grad_norm": 2.8875865936279297, + "learning_rate": 4.995151161026091e-05, + "loss": 5.5981, + "step": 3335 + }, + { + "epoch": 0.019840137025406795, + "grad_norm": 2.4203453063964844, + "learning_rate": 4.9951482528095615e-05, + "loss": 5.6269, + "step": 3336 + }, + { + "epoch": 0.01984608430868779, + "grad_norm": 2.332151174545288, + "learning_rate": 4.995145343722002e-05, + "loss": 5.6002, + "step": 3337 + }, + { + "epoch": 0.01985203159196879, + "grad_norm": 2.556549310684204, + "learning_rate": 4.995142433763413e-05, + "loss": 5.7715, + "step": 3338 + }, + { + "epoch": 0.019857978875249784, + "grad_norm": 2.453113079071045, + "learning_rate": 4.995139522933796e-05, + "loss": 5.8958, + "step": 3339 + }, + { + "epoch": 0.019863926158530783, + "grad_norm": 1.9842414855957031, + "learning_rate": 4.995136611233151e-05, + "loss": 5.9781, + "step": 3340 + }, + { + "epoch": 0.01986987344181178, + "grad_norm": 2.3725521564483643, + "learning_rate": 4.995133698661479e-05, + "loss": 5.9902, + "step": 3341 + }, + { + "epoch": 0.019875820725092776, + "grad_norm": 2.679001808166504, + "learning_rate": 4.9951307852187824e-05, + "loss": 5.9526, + "step": 3342 + }, + { + "epoch": 0.019881768008373775, + "grad_norm": 2.272595167160034, + "learning_rate": 4.995127870905061e-05, + "loss": 5.9685, + "step": 3343 + }, + { + "epoch": 0.019887715291654774, + "grad_norm": 2.0300357341766357, + "learning_rate": 4.995124955720317e-05, + "loss": 5.7702, + "step": 3344 + }, + { + "epoch": 0.01989366257493577, + "grad_norm": 2.5023481845855713, + "learning_rate": 4.9951220396645504e-05, + "loss": 5.6612, + "step": 3345 + }, + { + "epoch": 0.019899609858216767, + "grad_norm": 2.426457166671753, + "learning_rate": 4.995119122737762e-05, + "loss": 5.767, + "step": 3346 + }, + { + "epoch": 0.019905557141497762, + "grad_norm": 2.4919028282165527, + "learning_rate": 4.995116204939954e-05, + "loss": 6.0578, + "step": 3347 + }, + { + "epoch": 0.01991150442477876, + "grad_norm": 3.099792957305908, + "learning_rate": 4.995113286271126e-05, + "loss": 7.053, + "step": 3348 + }, + { + "epoch": 0.01991745170805976, + "grad_norm": 2.597169876098633, + "learning_rate": 4.9951103667312795e-05, + "loss": 5.8467, + "step": 3349 + }, + { + "epoch": 0.019923398991340754, + "grad_norm": 2.1132469177246094, + "learning_rate": 4.995107446320416e-05, + "loss": 5.7296, + "step": 3350 + }, + { + "epoch": 0.019929346274621753, + "grad_norm": 2.4141721725463867, + "learning_rate": 4.995104525038537e-05, + "loss": 5.8705, + "step": 3351 + }, + { + "epoch": 0.01993529355790275, + "grad_norm": 1.9012199640274048, + "learning_rate": 4.995101602885642e-05, + "loss": 5.8759, + "step": 3352 + }, + { + "epoch": 0.019941240841183747, + "grad_norm": 2.168673038482666, + "learning_rate": 4.9950986798617335e-05, + "loss": 5.8161, + "step": 3353 + }, + { + "epoch": 0.019947188124464745, + "grad_norm": 2.1579155921936035, + "learning_rate": 4.995095755966811e-05, + "loss": 5.8699, + "step": 3354 + }, + { + "epoch": 0.01995313540774574, + "grad_norm": 2.1460800170898438, + "learning_rate": 4.9950928312008774e-05, + "loss": 5.9144, + "step": 3355 + }, + { + "epoch": 0.01995908269102674, + "grad_norm": 2.402167558670044, + "learning_rate": 4.995089905563932e-05, + "loss": 5.8857, + "step": 3356 + }, + { + "epoch": 0.019965029974307737, + "grad_norm": 2.6381726264953613, + "learning_rate": 4.995086979055976e-05, + "loss": 6.0021, + "step": 3357 + }, + { + "epoch": 0.019970977257588732, + "grad_norm": 2.5577943325042725, + "learning_rate": 4.995084051677012e-05, + "loss": 5.9425, + "step": 3358 + }, + { + "epoch": 0.01997692454086973, + "grad_norm": 2.188215494155884, + "learning_rate": 4.995081123427039e-05, + "loss": 6.0656, + "step": 3359 + }, + { + "epoch": 0.01998287182415073, + "grad_norm": 1.8278366327285767, + "learning_rate": 4.9950781943060596e-05, + "loss": 5.8229, + "step": 3360 + }, + { + "epoch": 0.019988819107431725, + "grad_norm": 1.9054077863693237, + "learning_rate": 4.995075264314074e-05, + "loss": 5.8158, + "step": 3361 + }, + { + "epoch": 0.019994766390712723, + "grad_norm": 2.1255416870117188, + "learning_rate": 4.9950723334510826e-05, + "loss": 5.8816, + "step": 3362 + }, + { + "epoch": 0.02000071367399372, + "grad_norm": 2.026923656463623, + "learning_rate": 4.995069401717088e-05, + "loss": 5.7463, + "step": 3363 + }, + { + "epoch": 0.020006660957274717, + "grad_norm": 2.015178680419922, + "learning_rate": 4.9950664691120905e-05, + "loss": 5.6689, + "step": 3364 + }, + { + "epoch": 0.020012608240555715, + "grad_norm": 1.7729417085647583, + "learning_rate": 4.995063535636091e-05, + "loss": 5.701, + "step": 3365 + }, + { + "epoch": 0.02001855552383671, + "grad_norm": 1.9893600940704346, + "learning_rate": 4.9950606012890905e-05, + "loss": 5.7502, + "step": 3366 + }, + { + "epoch": 0.02002450280711771, + "grad_norm": 1.8950870037078857, + "learning_rate": 4.99505766607109e-05, + "loss": 5.6094, + "step": 3367 + }, + { + "epoch": 0.020030450090398704, + "grad_norm": 2.4140830039978027, + "learning_rate": 4.995054729982091e-05, + "loss": 5.8387, + "step": 3368 + }, + { + "epoch": 0.020036397373679703, + "grad_norm": 2.1887669563293457, + "learning_rate": 4.995051793022094e-05, + "loss": 5.7348, + "step": 3369 + }, + { + "epoch": 0.0200423446569607, + "grad_norm": 1.9632731676101685, + "learning_rate": 4.9950488551911e-05, + "loss": 5.5568, + "step": 3370 + }, + { + "epoch": 0.020048291940241696, + "grad_norm": 2.116834878921509, + "learning_rate": 4.995045916489111e-05, + "loss": 5.461, + "step": 3371 + }, + { + "epoch": 0.020054239223522695, + "grad_norm": 2.021256923675537, + "learning_rate": 4.9950429769161266e-05, + "loss": 5.6601, + "step": 3372 + }, + { + "epoch": 0.020060186506803693, + "grad_norm": 2.1648659706115723, + "learning_rate": 4.9950400364721486e-05, + "loss": 5.5364, + "step": 3373 + }, + { + "epoch": 0.02006613379008469, + "grad_norm": 2.043499231338501, + "learning_rate": 4.9950370951571775e-05, + "loss": 5.7273, + "step": 3374 + }, + { + "epoch": 0.020072081073365687, + "grad_norm": 2.296121597290039, + "learning_rate": 4.995034152971215e-05, + "loss": 5.8494, + "step": 3375 + }, + { + "epoch": 0.020078028356646682, + "grad_norm": 2.401031494140625, + "learning_rate": 4.995031209914261e-05, + "loss": 5.719, + "step": 3376 + }, + { + "epoch": 0.02008397563992768, + "grad_norm": 2.3130364418029785, + "learning_rate": 4.995028265986319e-05, + "loss": 5.7998, + "step": 3377 + }, + { + "epoch": 0.02008992292320868, + "grad_norm": 2.3820009231567383, + "learning_rate": 4.9950253211873874e-05, + "loss": 6.0632, + "step": 3378 + }, + { + "epoch": 0.020095870206489674, + "grad_norm": 2.1970956325531006, + "learning_rate": 4.995022375517469e-05, + "loss": 5.9776, + "step": 3379 + }, + { + "epoch": 0.020101817489770673, + "grad_norm": 1.912102460861206, + "learning_rate": 4.995019428976564e-05, + "loss": 5.7194, + "step": 3380 + }, + { + "epoch": 0.02010776477305167, + "grad_norm": 2.3187389373779297, + "learning_rate": 4.995016481564673e-05, + "loss": 6.0225, + "step": 3381 + }, + { + "epoch": 0.020113712056332667, + "grad_norm": 1.959000587463379, + "learning_rate": 4.995013533281797e-05, + "loss": 5.8453, + "step": 3382 + }, + { + "epoch": 0.020119659339613665, + "grad_norm": 2.0283286571502686, + "learning_rate": 4.995010584127938e-05, + "loss": 5.6837, + "step": 3383 + }, + { + "epoch": 0.02012560662289466, + "grad_norm": 2.410351037979126, + "learning_rate": 4.995007634103097e-05, + "loss": 5.8172, + "step": 3384 + }, + { + "epoch": 0.02013155390617566, + "grad_norm": 2.2864298820495605, + "learning_rate": 4.995004683207275e-05, + "loss": 5.8995, + "step": 3385 + }, + { + "epoch": 0.020137501189456657, + "grad_norm": 2.830883026123047, + "learning_rate": 4.995001731440472e-05, + "loss": 5.7273, + "step": 3386 + }, + { + "epoch": 0.020143448472737652, + "grad_norm": 2.486783981323242, + "learning_rate": 4.9949987788026896e-05, + "loss": 5.88, + "step": 3387 + }, + { + "epoch": 0.02014939575601865, + "grad_norm": 2.109975576400757, + "learning_rate": 4.994995825293929e-05, + "loss": 5.8618, + "step": 3388 + }, + { + "epoch": 0.02015534303929965, + "grad_norm": 2.249293327331543, + "learning_rate": 4.994992870914191e-05, + "loss": 5.8511, + "step": 3389 + }, + { + "epoch": 0.020161290322580645, + "grad_norm": 2.5433366298675537, + "learning_rate": 4.9949899156634774e-05, + "loss": 5.7375, + "step": 3390 + }, + { + "epoch": 0.020167237605861643, + "grad_norm": 2.7013652324676514, + "learning_rate": 4.9949869595417876e-05, + "loss": 5.8886, + "step": 3391 + }, + { + "epoch": 0.020173184889142638, + "grad_norm": 2.536972761154175, + "learning_rate": 4.994984002549124e-05, + "loss": 5.4203, + "step": 3392 + }, + { + "epoch": 0.020179132172423637, + "grad_norm": 2.596230983734131, + "learning_rate": 4.9949810446854876e-05, + "loss": 5.7882, + "step": 3393 + }, + { + "epoch": 0.020185079455704635, + "grad_norm": 2.6889936923980713, + "learning_rate": 4.9949780859508786e-05, + "loss": 5.6822, + "step": 3394 + }, + { + "epoch": 0.02019102673898563, + "grad_norm": 2.541027069091797, + "learning_rate": 4.994975126345299e-05, + "loss": 5.7394, + "step": 3395 + }, + { + "epoch": 0.02019697402226663, + "grad_norm": 2.2267251014709473, + "learning_rate": 4.9949721658687485e-05, + "loss": 5.7847, + "step": 3396 + }, + { + "epoch": 0.020202921305547628, + "grad_norm": 2.439689874649048, + "learning_rate": 4.994969204521231e-05, + "loss": 5.6222, + "step": 3397 + }, + { + "epoch": 0.020208868588828623, + "grad_norm": 2.9407742023468018, + "learning_rate": 4.9949662423027434e-05, + "loss": 5.6629, + "step": 3398 + }, + { + "epoch": 0.02021481587210962, + "grad_norm": 2.42802357673645, + "learning_rate": 4.9949632792132894e-05, + "loss": 5.3369, + "step": 3399 + }, + { + "epoch": 0.020220763155390616, + "grad_norm": 2.465508222579956, + "learning_rate": 4.99496031525287e-05, + "loss": 5.3365, + "step": 3400 + }, + { + "epoch": 0.020226710438671615, + "grad_norm": 2.408794403076172, + "learning_rate": 4.9949573504214854e-05, + "loss": 5.3156, + "step": 3401 + }, + { + "epoch": 0.020232657721952613, + "grad_norm": 2.229372978210449, + "learning_rate": 4.9949543847191374e-05, + "loss": 5.9194, + "step": 3402 + }, + { + "epoch": 0.02023860500523361, + "grad_norm": 4.567020416259766, + "learning_rate": 4.9949514181458254e-05, + "loss": 6.3379, + "step": 3403 + }, + { + "epoch": 0.020244552288514607, + "grad_norm": 3.9927520751953125, + "learning_rate": 4.9949484507015534e-05, + "loss": 6.3351, + "step": 3404 + }, + { + "epoch": 0.020250499571795602, + "grad_norm": 2.4830081462860107, + "learning_rate": 4.9949454823863195e-05, + "loss": 6.4046, + "step": 3405 + }, + { + "epoch": 0.0202564468550766, + "grad_norm": 2.282722234725952, + "learning_rate": 4.994942513200126e-05, + "loss": 6.5473, + "step": 3406 + }, + { + "epoch": 0.0202623941383576, + "grad_norm": 2.411367416381836, + "learning_rate": 4.994939543142973e-05, + "loss": 5.7898, + "step": 3407 + }, + { + "epoch": 0.020268341421638594, + "grad_norm": 3.2052342891693115, + "learning_rate": 4.994936572214864e-05, + "loss": 5.6695, + "step": 3408 + }, + { + "epoch": 0.020274288704919593, + "grad_norm": 4.142974853515625, + "learning_rate": 4.994933600415798e-05, + "loss": 6.2037, + "step": 3409 + }, + { + "epoch": 0.02028023598820059, + "grad_norm": 2.839066982269287, + "learning_rate": 4.994930627745776e-05, + "loss": 6.7308, + "step": 3410 + }, + { + "epoch": 0.020286183271481587, + "grad_norm": 3.3138885498046875, + "learning_rate": 4.9949276542048e-05, + "loss": 5.8873, + "step": 3411 + }, + { + "epoch": 0.020292130554762585, + "grad_norm": 2.6651928424835205, + "learning_rate": 4.9949246797928704e-05, + "loss": 6.6325, + "step": 3412 + }, + { + "epoch": 0.02029807783804358, + "grad_norm": 2.919436454772949, + "learning_rate": 4.994921704509988e-05, + "loss": 6.3239, + "step": 3413 + }, + { + "epoch": 0.02030402512132458, + "grad_norm": 2.6901097297668457, + "learning_rate": 4.994918728356155e-05, + "loss": 6.1712, + "step": 3414 + }, + { + "epoch": 0.020309972404605577, + "grad_norm": 2.573249340057373, + "learning_rate": 4.9949157513313704e-05, + "loss": 5.8194, + "step": 3415 + }, + { + "epoch": 0.020315919687886572, + "grad_norm": 3.0603950023651123, + "learning_rate": 4.994912773435637e-05, + "loss": 6.3881, + "step": 3416 + }, + { + "epoch": 0.02032186697116757, + "grad_norm": 3.1800057888031006, + "learning_rate": 4.994909794668956e-05, + "loss": 5.9486, + "step": 3417 + }, + { + "epoch": 0.02032781425444857, + "grad_norm": 2.537182092666626, + "learning_rate": 4.994906815031327e-05, + "loss": 6.5454, + "step": 3418 + }, + { + "epoch": 0.020333761537729565, + "grad_norm": 2.474705457687378, + "learning_rate": 4.9949038345227525e-05, + "loss": 6.5356, + "step": 3419 + }, + { + "epoch": 0.020339708821010563, + "grad_norm": 3.054689645767212, + "learning_rate": 4.994900853143232e-05, + "loss": 6.4526, + "step": 3420 + }, + { + "epoch": 0.020345656104291558, + "grad_norm": 2.587644100189209, + "learning_rate": 4.994897870892769e-05, + "loss": 6.2811, + "step": 3421 + }, + { + "epoch": 0.020351603387572557, + "grad_norm": 2.110041618347168, + "learning_rate": 4.994894887771361e-05, + "loss": 6.0428, + "step": 3422 + }, + { + "epoch": 0.020357550670853555, + "grad_norm": 2.4931492805480957, + "learning_rate": 4.9948919037790115e-05, + "loss": 6.3683, + "step": 3423 + }, + { + "epoch": 0.02036349795413455, + "grad_norm": 2.7169463634490967, + "learning_rate": 4.994888918915721e-05, + "loss": 6.5335, + "step": 3424 + }, + { + "epoch": 0.02036944523741555, + "grad_norm": 2.164363145828247, + "learning_rate": 4.994885933181491e-05, + "loss": 6.0409, + "step": 3425 + }, + { + "epoch": 0.020375392520696547, + "grad_norm": 2.480468273162842, + "learning_rate": 4.994882946576322e-05, + "loss": 5.8816, + "step": 3426 + }, + { + "epoch": 0.020381339803977543, + "grad_norm": 2.928361415863037, + "learning_rate": 4.994879959100215e-05, + "loss": 6.1706, + "step": 3427 + }, + { + "epoch": 0.02038728708725854, + "grad_norm": 2.1536660194396973, + "learning_rate": 4.994876970753171e-05, + "loss": 6.0559, + "step": 3428 + }, + { + "epoch": 0.020393234370539536, + "grad_norm": 2.6913530826568604, + "learning_rate": 4.994873981535192e-05, + "loss": 6.7411, + "step": 3429 + }, + { + "epoch": 0.020399181653820535, + "grad_norm": 2.647124767303467, + "learning_rate": 4.994870991446278e-05, + "loss": 6.5251, + "step": 3430 + }, + { + "epoch": 0.020405128937101533, + "grad_norm": 2.621612310409546, + "learning_rate": 4.994868000486429e-05, + "loss": 6.7029, + "step": 3431 + }, + { + "epoch": 0.02041107622038253, + "grad_norm": 2.1986844539642334, + "learning_rate": 4.994865008655649e-05, + "loss": 6.4561, + "step": 3432 + }, + { + "epoch": 0.020417023503663527, + "grad_norm": 2.706897735595703, + "learning_rate": 4.994862015953936e-05, + "loss": 6.3125, + "step": 3433 + }, + { + "epoch": 0.020422970786944522, + "grad_norm": 2.403346300125122, + "learning_rate": 4.994859022381294e-05, + "loss": 6.0808, + "step": 3434 + }, + { + "epoch": 0.02042891807022552, + "grad_norm": 2.367835521697998, + "learning_rate": 4.994856027937722e-05, + "loss": 6.2634, + "step": 3435 + }, + { + "epoch": 0.02043486535350652, + "grad_norm": 2.8564250469207764, + "learning_rate": 4.9948530326232205e-05, + "loss": 6.579, + "step": 3436 + }, + { + "epoch": 0.020440812636787514, + "grad_norm": 2.9472100734710693, + "learning_rate": 4.9948500364377925e-05, + "loss": 6.3873, + "step": 3437 + }, + { + "epoch": 0.020446759920068513, + "grad_norm": 2.3005917072296143, + "learning_rate": 4.994847039381438e-05, + "loss": 6.2316, + "step": 3438 + }, + { + "epoch": 0.02045270720334951, + "grad_norm": 2.0548787117004395, + "learning_rate": 4.9948440414541584e-05, + "loss": 6.5022, + "step": 3439 + }, + { + "epoch": 0.020458654486630506, + "grad_norm": 2.1332197189331055, + "learning_rate": 4.9948410426559536e-05, + "loss": 6.1486, + "step": 3440 + }, + { + "epoch": 0.020464601769911505, + "grad_norm": 2.112738847732544, + "learning_rate": 4.994838042986827e-05, + "loss": 5.9125, + "step": 3441 + }, + { + "epoch": 0.0204705490531925, + "grad_norm": 2.714627981185913, + "learning_rate": 4.9948350424467774e-05, + "loss": 6.1164, + "step": 3442 + }, + { + "epoch": 0.0204764963364735, + "grad_norm": 2.337571382522583, + "learning_rate": 4.994832041035806e-05, + "loss": 6.0567, + "step": 3443 + }, + { + "epoch": 0.020482443619754497, + "grad_norm": 2.354389190673828, + "learning_rate": 4.994829038753915e-05, + "loss": 5.5922, + "step": 3444 + }, + { + "epoch": 0.020488390903035492, + "grad_norm": 2.3885531425476074, + "learning_rate": 4.994826035601106e-05, + "loss": 6.4178, + "step": 3445 + }, + { + "epoch": 0.02049433818631649, + "grad_norm": 2.931328058242798, + "learning_rate": 4.994823031577378e-05, + "loss": 6.356, + "step": 3446 + }, + { + "epoch": 0.02050028546959749, + "grad_norm": 2.4858877658843994, + "learning_rate": 4.994820026682733e-05, + "loss": 6.0601, + "step": 3447 + }, + { + "epoch": 0.020506232752878484, + "grad_norm": 2.626811981201172, + "learning_rate": 4.9948170209171725e-05, + "loss": 6.4372, + "step": 3448 + }, + { + "epoch": 0.020512180036159483, + "grad_norm": 2.2917356491088867, + "learning_rate": 4.994814014280696e-05, + "loss": 5.9828, + "step": 3449 + }, + { + "epoch": 0.020518127319440478, + "grad_norm": 2.174531936645508, + "learning_rate": 4.9948110067733075e-05, + "loss": 6.3382, + "step": 3450 + }, + { + "epoch": 0.020524074602721477, + "grad_norm": 2.9880006313323975, + "learning_rate": 4.994807998395005e-05, + "loss": 6.7493, + "step": 3451 + }, + { + "epoch": 0.020530021886002475, + "grad_norm": 2.6577212810516357, + "learning_rate": 4.994804989145792e-05, + "loss": 6.853, + "step": 3452 + }, + { + "epoch": 0.02053596916928347, + "grad_norm": 2.8832437992095947, + "learning_rate": 4.994801979025667e-05, + "loss": 6.5829, + "step": 3453 + }, + { + "epoch": 0.02054191645256447, + "grad_norm": 2.473177194595337, + "learning_rate": 4.994798968034633e-05, + "loss": 6.2879, + "step": 3454 + }, + { + "epoch": 0.020547863735845467, + "grad_norm": 2.7484633922576904, + "learning_rate": 4.994795956172691e-05, + "loss": 6.2037, + "step": 3455 + }, + { + "epoch": 0.020553811019126463, + "grad_norm": 1.6647555828094482, + "learning_rate": 4.9947929434398403e-05, + "loss": 6.5639, + "step": 3456 + }, + { + "epoch": 0.02055975830240746, + "grad_norm": 3.71087908744812, + "learning_rate": 4.994789929836084e-05, + "loss": 6.8464, + "step": 3457 + }, + { + "epoch": 0.020565705585688456, + "grad_norm": 2.705892324447632, + "learning_rate": 4.994786915361422e-05, + "loss": 6.8316, + "step": 3458 + }, + { + "epoch": 0.020571652868969455, + "grad_norm": 2.3619437217712402, + "learning_rate": 4.994783900015856e-05, + "loss": 6.3441, + "step": 3459 + }, + { + "epoch": 0.020577600152250453, + "grad_norm": 2.490499258041382, + "learning_rate": 4.9947808837993864e-05, + "loss": 6.1467, + "step": 3460 + }, + { + "epoch": 0.02058354743553145, + "grad_norm": 2.546614170074463, + "learning_rate": 4.994777866712015e-05, + "loss": 5.6677, + "step": 3461 + }, + { + "epoch": 0.020589494718812447, + "grad_norm": 2.473695755004883, + "learning_rate": 4.994774848753741e-05, + "loss": 5.7815, + "step": 3462 + }, + { + "epoch": 0.020595442002093442, + "grad_norm": 2.0494625568389893, + "learning_rate": 4.994771829924569e-05, + "loss": 5.674, + "step": 3463 + }, + { + "epoch": 0.02060138928537444, + "grad_norm": 2.1504273414611816, + "learning_rate": 4.9947688102244964e-05, + "loss": 5.5299, + "step": 3464 + }, + { + "epoch": 0.02060733656865544, + "grad_norm": 2.908170700073242, + "learning_rate": 4.994765789653526e-05, + "loss": 5.8448, + "step": 3465 + }, + { + "epoch": 0.020613283851936434, + "grad_norm": 3.1434714794158936, + "learning_rate": 4.994762768211659e-05, + "loss": 5.8413, + "step": 3466 + }, + { + "epoch": 0.020619231135217433, + "grad_norm": 2.4688189029693604, + "learning_rate": 4.994759745898896e-05, + "loss": 5.6458, + "step": 3467 + }, + { + "epoch": 0.02062517841849843, + "grad_norm": 2.172083854675293, + "learning_rate": 4.994756722715238e-05, + "loss": 5.723, + "step": 3468 + }, + { + "epoch": 0.020631125701779426, + "grad_norm": 2.0702707767486572, + "learning_rate": 4.994753698660687e-05, + "loss": 5.6199, + "step": 3469 + }, + { + "epoch": 0.020637072985060425, + "grad_norm": 2.2142136096954346, + "learning_rate": 4.9947506737352425e-05, + "loss": 5.5476, + "step": 3470 + }, + { + "epoch": 0.02064302026834142, + "grad_norm": 2.156874179840088, + "learning_rate": 4.994747647938907e-05, + "loss": 5.4773, + "step": 3471 + }, + { + "epoch": 0.02064896755162242, + "grad_norm": 3.3683371543884277, + "learning_rate": 4.9947446212716795e-05, + "loss": 6.4804, + "step": 3472 + }, + { + "epoch": 0.020654914834903417, + "grad_norm": 2.2435977458953857, + "learning_rate": 4.9947415937335635e-05, + "loss": 6.0622, + "step": 3473 + }, + { + "epoch": 0.020660862118184412, + "grad_norm": 3.0824263095855713, + "learning_rate": 4.994738565324558e-05, + "loss": 6.8809, + "step": 3474 + }, + { + "epoch": 0.02066680940146541, + "grad_norm": 2.6978909969329834, + "learning_rate": 4.9947355360446664e-05, + "loss": 6.823, + "step": 3475 + }, + { + "epoch": 0.02067275668474641, + "grad_norm": 3.041680097579956, + "learning_rate": 4.9947325058938874e-05, + "loss": 6.4268, + "step": 3476 + }, + { + "epoch": 0.020678703968027404, + "grad_norm": 3.5326781272888184, + "learning_rate": 4.9947294748722237e-05, + "loss": 6.3516, + "step": 3477 + }, + { + "epoch": 0.020684651251308403, + "grad_norm": 2.7611732482910156, + "learning_rate": 4.994726442979675e-05, + "loss": 6.2206, + "step": 3478 + }, + { + "epoch": 0.020690598534589398, + "grad_norm": 3.8533458709716797, + "learning_rate": 4.994723410216244e-05, + "loss": 6.7907, + "step": 3479 + }, + { + "epoch": 0.020696545817870397, + "grad_norm": 2.8091351985931396, + "learning_rate": 4.99472037658193e-05, + "loss": 6.7468, + "step": 3480 + }, + { + "epoch": 0.020702493101151395, + "grad_norm": 2.4317073822021484, + "learning_rate": 4.994717342076736e-05, + "loss": 6.4682, + "step": 3481 + }, + { + "epoch": 0.02070844038443239, + "grad_norm": 2.5132029056549072, + "learning_rate": 4.994714306700661e-05, + "loss": 6.1966, + "step": 3482 + }, + { + "epoch": 0.02071438766771339, + "grad_norm": 2.8161535263061523, + "learning_rate": 4.994711270453707e-05, + "loss": 5.6045, + "step": 3483 + }, + { + "epoch": 0.020720334950994387, + "grad_norm": 2.654115915298462, + "learning_rate": 4.994708233335875e-05, + "loss": 5.8983, + "step": 3484 + }, + { + "epoch": 0.020726282234275382, + "grad_norm": 2.5971553325653076, + "learning_rate": 4.9947051953471664e-05, + "loss": 5.4422, + "step": 3485 + }, + { + "epoch": 0.02073222951755638, + "grad_norm": 2.5758557319641113, + "learning_rate": 4.9947021564875816e-05, + "loss": 5.5921, + "step": 3486 + }, + { + "epoch": 0.020738176800837376, + "grad_norm": 2.635345458984375, + "learning_rate": 4.994699116757122e-05, + "loss": 6.2316, + "step": 3487 + }, + { + "epoch": 0.020744124084118375, + "grad_norm": 2.573514938354492, + "learning_rate": 4.9946960761557896e-05, + "loss": 6.5069, + "step": 3488 + }, + { + "epoch": 0.020750071367399373, + "grad_norm": 2.587735176086426, + "learning_rate": 4.994693034683584e-05, + "loss": 5.9114, + "step": 3489 + }, + { + "epoch": 0.02075601865068037, + "grad_norm": 2.4980244636535645, + "learning_rate": 4.9946899923405075e-05, + "loss": 6.1805, + "step": 3490 + }, + { + "epoch": 0.020761965933961367, + "grad_norm": 2.614003896713257, + "learning_rate": 4.9946869491265594e-05, + "loss": 6.2294, + "step": 3491 + }, + { + "epoch": 0.020767913217242365, + "grad_norm": 3.3819997310638428, + "learning_rate": 4.994683905041743e-05, + "loss": 5.4716, + "step": 3492 + }, + { + "epoch": 0.02077386050052336, + "grad_norm": 3.168170213699341, + "learning_rate": 4.994680860086057e-05, + "loss": 5.4041, + "step": 3493 + }, + { + "epoch": 0.02077980778380436, + "grad_norm": 3.05253267288208, + "learning_rate": 4.994677814259504e-05, + "loss": 5.4958, + "step": 3494 + }, + { + "epoch": 0.020785755067085354, + "grad_norm": 2.8560431003570557, + "learning_rate": 4.994674767562085e-05, + "loss": 5.4153, + "step": 3495 + }, + { + "epoch": 0.020791702350366353, + "grad_norm": 2.790382146835327, + "learning_rate": 4.994671719993801e-05, + "loss": 6.3581, + "step": 3496 + }, + { + "epoch": 0.02079764963364735, + "grad_norm": 2.9860496520996094, + "learning_rate": 4.9946686715546535e-05, + "loss": 6.5779, + "step": 3497 + }, + { + "epoch": 0.020803596916928346, + "grad_norm": 2.744859457015991, + "learning_rate": 4.994665622244642e-05, + "loss": 6.5748, + "step": 3498 + }, + { + "epoch": 0.020809544200209345, + "grad_norm": 2.7951292991638184, + "learning_rate": 4.9946625720637683e-05, + "loss": 6.1954, + "step": 3499 + }, + { + "epoch": 0.02081549148349034, + "grad_norm": 3.2961854934692383, + "learning_rate": 4.994659521012034e-05, + "loss": 6.243, + "step": 3500 + }, + { + "epoch": 0.02082143876677134, + "grad_norm": 2.934246301651001, + "learning_rate": 4.99465646908944e-05, + "loss": 6.1307, + "step": 3501 + }, + { + "epoch": 0.020827386050052337, + "grad_norm": 3.9152729511260986, + "learning_rate": 4.994653416295987e-05, + "loss": 6.0167, + "step": 3502 + }, + { + "epoch": 0.020833333333333332, + "grad_norm": 4.510169506072998, + "learning_rate": 4.994650362631676e-05, + "loss": 6.533, + "step": 3503 + }, + { + "epoch": 0.02083928061661433, + "grad_norm": 3.415665864944458, + "learning_rate": 4.994647308096509e-05, + "loss": 6.4978, + "step": 3504 + }, + { + "epoch": 0.02084522789989533, + "grad_norm": 2.6515185832977295, + "learning_rate": 4.9946442526904856e-05, + "loss": 6.3859, + "step": 3505 + }, + { + "epoch": 0.020851175183176324, + "grad_norm": 2.8215248584747314, + "learning_rate": 4.994641196413609e-05, + "loss": 6.243, + "step": 3506 + }, + { + "epoch": 0.020857122466457323, + "grad_norm": 2.644529104232788, + "learning_rate": 4.9946381392658773e-05, + "loss": 6.2954, + "step": 3507 + }, + { + "epoch": 0.020863069749738318, + "grad_norm": 3.349699020385742, + "learning_rate": 4.994635081247294e-05, + "loss": 6.5617, + "step": 3508 + }, + { + "epoch": 0.020869017033019317, + "grad_norm": 3.3669090270996094, + "learning_rate": 4.9946320223578596e-05, + "loss": 6.6458, + "step": 3509 + }, + { + "epoch": 0.020874964316300315, + "grad_norm": 2.5562078952789307, + "learning_rate": 4.994628962597575e-05, + "loss": 5.5041, + "step": 3510 + }, + { + "epoch": 0.02088091159958131, + "grad_norm": 2.851809501647949, + "learning_rate": 4.994625901966441e-05, + "loss": 5.4607, + "step": 3511 + }, + { + "epoch": 0.02088685888286231, + "grad_norm": 3.2769458293914795, + "learning_rate": 4.994622840464458e-05, + "loss": 5.3115, + "step": 3512 + }, + { + "epoch": 0.020892806166143307, + "grad_norm": 2.5495102405548096, + "learning_rate": 4.994619778091629e-05, + "loss": 5.9997, + "step": 3513 + }, + { + "epoch": 0.020898753449424302, + "grad_norm": 2.609463930130005, + "learning_rate": 4.994616714847954e-05, + "loss": 6.562, + "step": 3514 + }, + { + "epoch": 0.0209047007327053, + "grad_norm": 2.5731685161590576, + "learning_rate": 4.994613650733433e-05, + "loss": 6.5341, + "step": 3515 + }, + { + "epoch": 0.020910648015986296, + "grad_norm": 2.481297254562378, + "learning_rate": 4.99461058574807e-05, + "loss": 6.5878, + "step": 3516 + }, + { + "epoch": 0.020916595299267295, + "grad_norm": 2.4096593856811523, + "learning_rate": 4.9946075198918624e-05, + "loss": 6.5054, + "step": 3517 + }, + { + "epoch": 0.020922542582548293, + "grad_norm": 2.4417459964752197, + "learning_rate": 4.994604453164814e-05, + "loss": 6.3292, + "step": 3518 + }, + { + "epoch": 0.020928489865829288, + "grad_norm": 2.7062435150146484, + "learning_rate": 4.994601385566925e-05, + "loss": 5.564, + "step": 3519 + }, + { + "epoch": 0.020934437149110287, + "grad_norm": 2.613614559173584, + "learning_rate": 4.9945983170981955e-05, + "loss": 5.3929, + "step": 3520 + }, + { + "epoch": 0.020940384432391285, + "grad_norm": 2.4933719635009766, + "learning_rate": 4.994595247758629e-05, + "loss": 6.1841, + "step": 3521 + }, + { + "epoch": 0.02094633171567228, + "grad_norm": 2.251507043838501, + "learning_rate": 4.994592177548224e-05, + "loss": 6.3109, + "step": 3522 + }, + { + "epoch": 0.02095227899895328, + "grad_norm": 2.3830223083496094, + "learning_rate": 4.994589106466983e-05, + "loss": 5.9421, + "step": 3523 + }, + { + "epoch": 0.020958226282234274, + "grad_norm": 2.2940196990966797, + "learning_rate": 4.994586034514906e-05, + "loss": 6.0858, + "step": 3524 + }, + { + "epoch": 0.020964173565515273, + "grad_norm": 2.916836977005005, + "learning_rate": 4.994582961691996e-05, + "loss": 5.166, + "step": 3525 + }, + { + "epoch": 0.02097012084879627, + "grad_norm": 2.7183029651641846, + "learning_rate": 4.994579887998252e-05, + "loss": 6.9732, + "step": 3526 + }, + { + "epoch": 0.020976068132077266, + "grad_norm": 2.70143985748291, + "learning_rate": 4.994576813433676e-05, + "loss": 5.917, + "step": 3527 + }, + { + "epoch": 0.020982015415358265, + "grad_norm": 2.7375986576080322, + "learning_rate": 4.994573737998269e-05, + "loss": 5.3025, + "step": 3528 + }, + { + "epoch": 0.02098796269863926, + "grad_norm": 2.656982183456421, + "learning_rate": 4.994570661692033e-05, + "loss": 5.2383, + "step": 3529 + }, + { + "epoch": 0.02099390998192026, + "grad_norm": 2.2119734287261963, + "learning_rate": 4.994567584514968e-05, + "loss": 6.0456, + "step": 3530 + }, + { + "epoch": 0.020999857265201257, + "grad_norm": 2.9191582202911377, + "learning_rate": 4.9945645064670737e-05, + "loss": 6.3808, + "step": 3531 + }, + { + "epoch": 0.021005804548482252, + "grad_norm": 3.124101400375366, + "learning_rate": 4.994561427548354e-05, + "loss": 5.3631, + "step": 3532 + }, + { + "epoch": 0.02101175183176325, + "grad_norm": 2.803938150405884, + "learning_rate": 4.994558347758808e-05, + "loss": 5.3172, + "step": 3533 + }, + { + "epoch": 0.02101769911504425, + "grad_norm": 2.6231577396392822, + "learning_rate": 4.994555267098438e-05, + "loss": 6.4466, + "step": 3534 + }, + { + "epoch": 0.021023646398325244, + "grad_norm": 2.735590696334839, + "learning_rate": 4.994552185567244e-05, + "loss": 5.3115, + "step": 3535 + }, + { + "epoch": 0.021029593681606243, + "grad_norm": 2.730459690093994, + "learning_rate": 4.994549103165228e-05, + "loss": 5.2311, + "step": 3536 + }, + { + "epoch": 0.021035540964887238, + "grad_norm": 2.1241424083709717, + "learning_rate": 4.994546019892391e-05, + "loss": 5.6599, + "step": 3537 + }, + { + "epoch": 0.021041488248168237, + "grad_norm": 2.607807159423828, + "learning_rate": 4.994542935748733e-05, + "loss": 6.1182, + "step": 3538 + }, + { + "epoch": 0.021047435531449235, + "grad_norm": 2.6896564960479736, + "learning_rate": 4.9945398507342567e-05, + "loss": 6.2827, + "step": 3539 + }, + { + "epoch": 0.02105338281473023, + "grad_norm": 2.9237961769104004, + "learning_rate": 4.994536764848962e-05, + "loss": 5.9629, + "step": 3540 + }, + { + "epoch": 0.02105933009801123, + "grad_norm": 2.7576143741607666, + "learning_rate": 4.99453367809285e-05, + "loss": 5.7612, + "step": 3541 + }, + { + "epoch": 0.021065277381292227, + "grad_norm": 3.1622097492218018, + "learning_rate": 4.9945305904659226e-05, + "loss": 6.0415, + "step": 3542 + }, + { + "epoch": 0.021071224664573222, + "grad_norm": 2.471127510070801, + "learning_rate": 4.994527501968179e-05, + "loss": 6.1264, + "step": 3543 + }, + { + "epoch": 0.02107717194785422, + "grad_norm": 2.797504425048828, + "learning_rate": 4.994524412599623e-05, + "loss": 6.3515, + "step": 3544 + }, + { + "epoch": 0.021083119231135216, + "grad_norm": 2.4932103157043457, + "learning_rate": 4.9945213223602535e-05, + "loss": 6.4327, + "step": 3545 + }, + { + "epoch": 0.021089066514416215, + "grad_norm": 2.5194599628448486, + "learning_rate": 4.9945182312500725e-05, + "loss": 6.4003, + "step": 3546 + }, + { + "epoch": 0.021095013797697213, + "grad_norm": 2.287858247756958, + "learning_rate": 4.9945151392690814e-05, + "loss": 6.3287, + "step": 3547 + }, + { + "epoch": 0.021100961080978208, + "grad_norm": 2.941619873046875, + "learning_rate": 4.994512046417281e-05, + "loss": 6.1364, + "step": 3548 + }, + { + "epoch": 0.021106908364259207, + "grad_norm": 3.1448967456817627, + "learning_rate": 4.994508952694672e-05, + "loss": 5.8638, + "step": 3549 + }, + { + "epoch": 0.021112855647540205, + "grad_norm": 2.869966983795166, + "learning_rate": 4.994505858101255e-05, + "loss": 6.0122, + "step": 3550 + }, + { + "epoch": 0.0211188029308212, + "grad_norm": 2.421264886856079, + "learning_rate": 4.9945027626370325e-05, + "loss": 6.1243, + "step": 3551 + }, + { + "epoch": 0.0211247502141022, + "grad_norm": 2.599456310272217, + "learning_rate": 4.9944996663020047e-05, + "loss": 5.9484, + "step": 3552 + }, + { + "epoch": 0.021130697497383194, + "grad_norm": 3.1029574871063232, + "learning_rate": 4.994496569096173e-05, + "loss": 5.9347, + "step": 3553 + }, + { + "epoch": 0.021136644780664193, + "grad_norm": 3.02494478225708, + "learning_rate": 4.994493471019538e-05, + "loss": 5.814, + "step": 3554 + }, + { + "epoch": 0.02114259206394519, + "grad_norm": 2.359682559967041, + "learning_rate": 4.994490372072101e-05, + "loss": 5.8533, + "step": 3555 + }, + { + "epoch": 0.021148539347226186, + "grad_norm": 2.7072582244873047, + "learning_rate": 4.994487272253864e-05, + "loss": 5.855, + "step": 3556 + }, + { + "epoch": 0.021154486630507185, + "grad_norm": 2.3102664947509766, + "learning_rate": 4.994484171564826e-05, + "loss": 5.6701, + "step": 3557 + }, + { + "epoch": 0.02116043391378818, + "grad_norm": 2.3804259300231934, + "learning_rate": 4.9944810700049906e-05, + "loss": 5.5096, + "step": 3558 + }, + { + "epoch": 0.02116638119706918, + "grad_norm": 2.463280439376831, + "learning_rate": 4.994477967574357e-05, + "loss": 5.5178, + "step": 3559 + }, + { + "epoch": 0.021172328480350177, + "grad_norm": 2.884152412414551, + "learning_rate": 4.9944748642729265e-05, + "loss": 6.1013, + "step": 3560 + }, + { + "epoch": 0.021178275763631172, + "grad_norm": 3.009460210800171, + "learning_rate": 4.9944717601007006e-05, + "loss": 6.2725, + "step": 3561 + }, + { + "epoch": 0.02118422304691217, + "grad_norm": 2.5930371284484863, + "learning_rate": 4.9944686550576814e-05, + "loss": 6.1138, + "step": 3562 + }, + { + "epoch": 0.02119017033019317, + "grad_norm": 2.8212878704071045, + "learning_rate": 4.9944655491438684e-05, + "loss": 5.6209, + "step": 3563 + }, + { + "epoch": 0.021196117613474164, + "grad_norm": 2.9814743995666504, + "learning_rate": 4.9944624423592634e-05, + "loss": 5.8912, + "step": 3564 + }, + { + "epoch": 0.021202064896755163, + "grad_norm": 3.1456093788146973, + "learning_rate": 4.994459334703867e-05, + "loss": 5.961, + "step": 3565 + }, + { + "epoch": 0.021208012180036158, + "grad_norm": 2.9300050735473633, + "learning_rate": 4.9944562261776805e-05, + "loss": 6.773, + "step": 3566 + }, + { + "epoch": 0.021213959463317156, + "grad_norm": 2.570685625076294, + "learning_rate": 4.994453116780705e-05, + "loss": 6.3575, + "step": 3567 + }, + { + "epoch": 0.021219906746598155, + "grad_norm": 2.7060914039611816, + "learning_rate": 4.994450006512943e-05, + "loss": 6.249, + "step": 3568 + }, + { + "epoch": 0.02122585402987915, + "grad_norm": 3.0027518272399902, + "learning_rate": 4.994446895374393e-05, + "loss": 5.8243, + "step": 3569 + }, + { + "epoch": 0.02123180131316015, + "grad_norm": 2.785888195037842, + "learning_rate": 4.994443783365058e-05, + "loss": 5.9836, + "step": 3570 + }, + { + "epoch": 0.021237748596441147, + "grad_norm": 2.5480010509490967, + "learning_rate": 4.994440670484938e-05, + "loss": 6.4237, + "step": 3571 + }, + { + "epoch": 0.021243695879722142, + "grad_norm": 2.687121629714966, + "learning_rate": 4.9944375567340345e-05, + "loss": 6.4497, + "step": 3572 + }, + { + "epoch": 0.02124964316300314, + "grad_norm": 2.6066362857818604, + "learning_rate": 4.994434442112349e-05, + "loss": 6.3853, + "step": 3573 + }, + { + "epoch": 0.021255590446284136, + "grad_norm": 2.880352020263672, + "learning_rate": 4.994431326619882e-05, + "loss": 6.382, + "step": 3574 + }, + { + "epoch": 0.021261537729565134, + "grad_norm": 3.0415213108062744, + "learning_rate": 4.9944282102566345e-05, + "loss": 6.4472, + "step": 3575 + }, + { + "epoch": 0.021267485012846133, + "grad_norm": 2.4917140007019043, + "learning_rate": 4.994425093022609e-05, + "loss": 6.2546, + "step": 3576 + }, + { + "epoch": 0.021273432296127128, + "grad_norm": 2.53648042678833, + "learning_rate": 4.9944219749178044e-05, + "loss": 6.37, + "step": 3577 + }, + { + "epoch": 0.021279379579408127, + "grad_norm": 2.796342134475708, + "learning_rate": 4.994418855942223e-05, + "loss": 6.1691, + "step": 3578 + }, + { + "epoch": 0.021285326862689125, + "grad_norm": 2.9148125648498535, + "learning_rate": 4.9944157360958656e-05, + "loss": 6.2552, + "step": 3579 + }, + { + "epoch": 0.02129127414597012, + "grad_norm": 3.0777838230133057, + "learning_rate": 4.994412615378734e-05, + "loss": 6.2359, + "step": 3580 + }, + { + "epoch": 0.02129722142925112, + "grad_norm": 2.5878093242645264, + "learning_rate": 4.994409493790828e-05, + "loss": 6.0746, + "step": 3581 + }, + { + "epoch": 0.021303168712532114, + "grad_norm": 3.2084906101226807, + "learning_rate": 4.99440637133215e-05, + "loss": 6.1357, + "step": 3582 + }, + { + "epoch": 0.021309115995813113, + "grad_norm": 3.7210965156555176, + "learning_rate": 4.9944032480027004e-05, + "loss": 6.5117, + "step": 3583 + }, + { + "epoch": 0.02131506327909411, + "grad_norm": 2.8332109451293945, + "learning_rate": 4.994400123802481e-05, + "loss": 6.0908, + "step": 3584 + }, + { + "epoch": 0.021321010562375106, + "grad_norm": 2.83854341506958, + "learning_rate": 4.994396998731491e-05, + "loss": 6.1522, + "step": 3585 + }, + { + "epoch": 0.021326957845656105, + "grad_norm": 2.5171611309051514, + "learning_rate": 4.9943938727897335e-05, + "loss": 6.2253, + "step": 3586 + }, + { + "epoch": 0.0213329051289371, + "grad_norm": 2.2111763954162598, + "learning_rate": 4.9943907459772086e-05, + "loss": 5.7673, + "step": 3587 + }, + { + "epoch": 0.0213388524122181, + "grad_norm": 2.5147926807403564, + "learning_rate": 4.994387618293918e-05, + "loss": 6.8327, + "step": 3588 + }, + { + "epoch": 0.021344799695499097, + "grad_norm": 2.969285488128662, + "learning_rate": 4.9943844897398626e-05, + "loss": 6.9995, + "step": 3589 + }, + { + "epoch": 0.021350746978780092, + "grad_norm": 4.00917911529541, + "learning_rate": 4.994381360315043e-05, + "loss": 6.6377, + "step": 3590 + }, + { + "epoch": 0.02135669426206109, + "grad_norm": 3.899319887161255, + "learning_rate": 4.994378230019461e-05, + "loss": 6.162, + "step": 3591 + }, + { + "epoch": 0.02136264154534209, + "grad_norm": 2.9522764682769775, + "learning_rate": 4.994375098853117e-05, + "loss": 6.4405, + "step": 3592 + }, + { + "epoch": 0.021368588828623084, + "grad_norm": 3.0569825172424316, + "learning_rate": 4.994371966816012e-05, + "loss": 6.2631, + "step": 3593 + }, + { + "epoch": 0.021374536111904083, + "grad_norm": 2.9470009803771973, + "learning_rate": 4.994368833908148e-05, + "loss": 6.4785, + "step": 3594 + }, + { + "epoch": 0.021380483395185078, + "grad_norm": 2.913940668106079, + "learning_rate": 4.994365700129525e-05, + "loss": 6.6566, + "step": 3595 + }, + { + "epoch": 0.021386430678466076, + "grad_norm": 2.6037404537200928, + "learning_rate": 4.9943625654801465e-05, + "loss": 6.2535, + "step": 3596 + }, + { + "epoch": 0.021392377961747075, + "grad_norm": 2.998276948928833, + "learning_rate": 4.99435942996001e-05, + "loss": 6.8851, + "step": 3597 + }, + { + "epoch": 0.02139832524502807, + "grad_norm": 2.2189996242523193, + "learning_rate": 4.994356293569119e-05, + "loss": 6.8707, + "step": 3598 + }, + { + "epoch": 0.02140427252830907, + "grad_norm": 2.4528486728668213, + "learning_rate": 4.994353156307474e-05, + "loss": 6.9166, + "step": 3599 + }, + { + "epoch": 0.021410219811590067, + "grad_norm": 3.0538241863250732, + "learning_rate": 4.994350018175076e-05, + "loss": 6.3258, + "step": 3600 + }, + { + "epoch": 0.021416167094871062, + "grad_norm": 3.789745569229126, + "learning_rate": 4.994346879171926e-05, + "loss": 6.1962, + "step": 3601 + }, + { + "epoch": 0.02142211437815206, + "grad_norm": 3.2789254188537598, + "learning_rate": 4.994343739298025e-05, + "loss": 6.2126, + "step": 3602 + }, + { + "epoch": 0.021428061661433056, + "grad_norm": 3.0887696743011475, + "learning_rate": 4.994340598553375e-05, + "loss": 6.2395, + "step": 3603 + }, + { + "epoch": 0.021434008944714054, + "grad_norm": 2.9189252853393555, + "learning_rate": 4.994337456937977e-05, + "loss": 6.193, + "step": 3604 + }, + { + "epoch": 0.021439956227995053, + "grad_norm": 2.8582170009613037, + "learning_rate": 4.9943343144518306e-05, + "loss": 6.1077, + "step": 3605 + }, + { + "epoch": 0.021445903511276048, + "grad_norm": 3.076979160308838, + "learning_rate": 4.994331171094938e-05, + "loss": 6.0474, + "step": 3606 + }, + { + "epoch": 0.021451850794557047, + "grad_norm": 3.482161045074463, + "learning_rate": 4.994328026867301e-05, + "loss": 6.0551, + "step": 3607 + }, + { + "epoch": 0.021457798077838045, + "grad_norm": 3.001046895980835, + "learning_rate": 4.994324881768919e-05, + "loss": 6.0393, + "step": 3608 + }, + { + "epoch": 0.02146374536111904, + "grad_norm": 2.8006365299224854, + "learning_rate": 4.994321735799794e-05, + "loss": 6.0042, + "step": 3609 + }, + { + "epoch": 0.02146969264440004, + "grad_norm": 3.10727858543396, + "learning_rate": 4.994318588959927e-05, + "loss": 5.8981, + "step": 3610 + }, + { + "epoch": 0.021475639927681034, + "grad_norm": 2.660557985305786, + "learning_rate": 4.9943154412493194e-05, + "loss": 6.0426, + "step": 3611 + }, + { + "epoch": 0.021481587210962032, + "grad_norm": 2.8504562377929688, + "learning_rate": 4.994312292667972e-05, + "loss": 6.9774, + "step": 3612 + }, + { + "epoch": 0.02148753449424303, + "grad_norm": 3.0076539516448975, + "learning_rate": 4.994309143215886e-05, + "loss": 6.3238, + "step": 3613 + }, + { + "epoch": 0.021493481777524026, + "grad_norm": 2.2966883182525635, + "learning_rate": 4.9943059928930626e-05, + "loss": 7.0015, + "step": 3614 + }, + { + "epoch": 0.021499429060805025, + "grad_norm": 2.5054080486297607, + "learning_rate": 4.994302841699502e-05, + "loss": 6.9226, + "step": 3615 + }, + { + "epoch": 0.021505376344086023, + "grad_norm": 2.856278657913208, + "learning_rate": 4.9942996896352066e-05, + "loss": 6.7836, + "step": 3616 + }, + { + "epoch": 0.02151132362736702, + "grad_norm": 2.4902377128601074, + "learning_rate": 4.994296536700177e-05, + "loss": 6.7077, + "step": 3617 + }, + { + "epoch": 0.021517270910648017, + "grad_norm": 2.477932929992676, + "learning_rate": 4.994293382894414e-05, + "loss": 6.8284, + "step": 3618 + }, + { + "epoch": 0.021523218193929012, + "grad_norm": 2.3034260272979736, + "learning_rate": 4.994290228217919e-05, + "loss": 6.8012, + "step": 3619 + }, + { + "epoch": 0.02152916547721001, + "grad_norm": 2.3850560188293457, + "learning_rate": 4.9942870726706934e-05, + "loss": 6.6208, + "step": 3620 + }, + { + "epoch": 0.02153511276049101, + "grad_norm": 2.4397644996643066, + "learning_rate": 4.994283916252738e-05, + "loss": 6.7522, + "step": 3621 + }, + { + "epoch": 0.021541060043772004, + "grad_norm": 2.400846242904663, + "learning_rate": 4.994280758964053e-05, + "loss": 6.7529, + "step": 3622 + }, + { + "epoch": 0.021547007327053003, + "grad_norm": 2.358290195465088, + "learning_rate": 4.994277600804641e-05, + "loss": 6.6812, + "step": 3623 + }, + { + "epoch": 0.021552954610333998, + "grad_norm": 2.7409300804138184, + "learning_rate": 4.994274441774503e-05, + "loss": 6.668, + "step": 3624 + }, + { + "epoch": 0.021558901893614996, + "grad_norm": 2.6890954971313477, + "learning_rate": 4.994271281873639e-05, + "loss": 6.5537, + "step": 3625 + }, + { + "epoch": 0.021564849176895995, + "grad_norm": 2.8959596157073975, + "learning_rate": 4.9942681211020505e-05, + "loss": 6.4492, + "step": 3626 + }, + { + "epoch": 0.02157079646017699, + "grad_norm": 2.4325244426727295, + "learning_rate": 4.994264959459738e-05, + "loss": 6.9819, + "step": 3627 + }, + { + "epoch": 0.02157674374345799, + "grad_norm": 2.92891263961792, + "learning_rate": 4.9942617969467045e-05, + "loss": 6.9266, + "step": 3628 + }, + { + "epoch": 0.021582691026738987, + "grad_norm": 2.4398467540740967, + "learning_rate": 4.994258633562951e-05, + "loss": 6.514, + "step": 3629 + }, + { + "epoch": 0.021588638310019982, + "grad_norm": 2.577467203140259, + "learning_rate": 4.9942554693084756e-05, + "loss": 6.7248, + "step": 3630 + }, + { + "epoch": 0.02159458559330098, + "grad_norm": 2.3682591915130615, + "learning_rate": 4.9942523041832824e-05, + "loss": 6.7798, + "step": 3631 + }, + { + "epoch": 0.021600532876581976, + "grad_norm": 2.1863434314727783, + "learning_rate": 4.9942491381873705e-05, + "loss": 6.6636, + "step": 3632 + }, + { + "epoch": 0.021606480159862974, + "grad_norm": 2.0172441005706787, + "learning_rate": 4.9942459713207426e-05, + "loss": 6.6772, + "step": 3633 + }, + { + "epoch": 0.021612427443143973, + "grad_norm": 1.8671952486038208, + "learning_rate": 4.9942428035834e-05, + "loss": 6.3648, + "step": 3634 + }, + { + "epoch": 0.021618374726424968, + "grad_norm": 3.226900815963745, + "learning_rate": 4.9942396349753416e-05, + "loss": 6.4127, + "step": 3635 + }, + { + "epoch": 0.021624322009705967, + "grad_norm": 2.7766973972320557, + "learning_rate": 4.994236465496571e-05, + "loss": 6.4476, + "step": 3636 + }, + { + "epoch": 0.021630269292986965, + "grad_norm": 2.157118082046509, + "learning_rate": 4.9942332951470875e-05, + "loss": 6.5876, + "step": 3637 + }, + { + "epoch": 0.02163621657626796, + "grad_norm": 2.3870396614074707, + "learning_rate": 4.994230123926893e-05, + "loss": 6.5861, + "step": 3638 + }, + { + "epoch": 0.02164216385954896, + "grad_norm": 2.8139939308166504, + "learning_rate": 4.994226951835989e-05, + "loss": 6.4845, + "step": 3639 + }, + { + "epoch": 0.021648111142829954, + "grad_norm": 2.856207847595215, + "learning_rate": 4.9942237788743764e-05, + "loss": 6.1514, + "step": 3640 + }, + { + "epoch": 0.021654058426110952, + "grad_norm": 3.523162603378296, + "learning_rate": 4.9942206050420545e-05, + "loss": 5.8114, + "step": 3641 + }, + { + "epoch": 0.02166000570939195, + "grad_norm": 2.746587038040161, + "learning_rate": 4.9942174303390274e-05, + "loss": 5.7397, + "step": 3642 + }, + { + "epoch": 0.021665952992672946, + "grad_norm": 2.902067184448242, + "learning_rate": 4.9942142547652946e-05, + "loss": 6.4353, + "step": 3643 + }, + { + "epoch": 0.021671900275953945, + "grad_norm": 2.981391191482544, + "learning_rate": 4.994211078320857e-05, + "loss": 6.2153, + "step": 3644 + }, + { + "epoch": 0.021677847559234943, + "grad_norm": 2.6004254817962646, + "learning_rate": 4.994207901005716e-05, + "loss": 6.2365, + "step": 3645 + }, + { + "epoch": 0.021683794842515938, + "grad_norm": 2.748678684234619, + "learning_rate": 4.994204722819873e-05, + "loss": 5.8126, + "step": 3646 + }, + { + "epoch": 0.021689742125796937, + "grad_norm": 2.675466299057007, + "learning_rate": 4.994201543763329e-05, + "loss": 6.3032, + "step": 3647 + }, + { + "epoch": 0.021695689409077932, + "grad_norm": 2.681823253631592, + "learning_rate": 4.9941983638360855e-05, + "loss": 6.2706, + "step": 3648 + }, + { + "epoch": 0.02170163669235893, + "grad_norm": 2.481586217880249, + "learning_rate": 4.994195183038142e-05, + "loss": 6.1792, + "step": 3649 + }, + { + "epoch": 0.02170758397563993, + "grad_norm": 2.3379831314086914, + "learning_rate": 4.9941920013695024e-05, + "loss": 6.2689, + "step": 3650 + }, + { + "epoch": 0.021713531258920924, + "grad_norm": 2.5885238647460938, + "learning_rate": 4.994188818830164e-05, + "loss": 6.3018, + "step": 3651 + }, + { + "epoch": 0.021719478542201923, + "grad_norm": 2.341939687728882, + "learning_rate": 4.994185635420131e-05, + "loss": 5.6178, + "step": 3652 + }, + { + "epoch": 0.021725425825482918, + "grad_norm": 2.4126031398773193, + "learning_rate": 4.9941824511394044e-05, + "loss": 5.4044, + "step": 3653 + }, + { + "epoch": 0.021731373108763916, + "grad_norm": 2.2289719581604004, + "learning_rate": 4.994179265987983e-05, + "loss": 5.4134, + "step": 3654 + }, + { + "epoch": 0.021737320392044915, + "grad_norm": 2.5151331424713135, + "learning_rate": 4.994176079965871e-05, + "loss": 5.3321, + "step": 3655 + }, + { + "epoch": 0.02174326767532591, + "grad_norm": 2.0761523246765137, + "learning_rate": 4.9941728930730665e-05, + "loss": 5.3363, + "step": 3656 + }, + { + "epoch": 0.02174921495860691, + "grad_norm": 2.272510051727295, + "learning_rate": 4.994169705309573e-05, + "loss": 6.0208, + "step": 3657 + }, + { + "epoch": 0.021755162241887907, + "grad_norm": 2.6145198345184326, + "learning_rate": 4.994166516675389e-05, + "loss": 6.299, + "step": 3658 + }, + { + "epoch": 0.021761109525168902, + "grad_norm": 2.978618621826172, + "learning_rate": 4.994163327170519e-05, + "loss": 5.1248, + "step": 3659 + }, + { + "epoch": 0.0217670568084499, + "grad_norm": 2.398813247680664, + "learning_rate": 4.994160136794962e-05, + "loss": 5.1217, + "step": 3660 + }, + { + "epoch": 0.021773004091730896, + "grad_norm": 2.1145291328430176, + "learning_rate": 4.994156945548719e-05, + "loss": 5.2676, + "step": 3661 + }, + { + "epoch": 0.021778951375011894, + "grad_norm": 2.045334577560425, + "learning_rate": 4.9941537534317915e-05, + "loss": 5.2088, + "step": 3662 + }, + { + "epoch": 0.021784898658292893, + "grad_norm": 2.0598506927490234, + "learning_rate": 4.9941505604441806e-05, + "loss": 5.363, + "step": 3663 + }, + { + "epoch": 0.021790845941573888, + "grad_norm": 2.189143657684326, + "learning_rate": 4.9941473665858884e-05, + "loss": 6.0592, + "step": 3664 + }, + { + "epoch": 0.021796793224854887, + "grad_norm": 6.8580780029296875, + "learning_rate": 4.994144171856915e-05, + "loss": 6.0323, + "step": 3665 + }, + { + "epoch": 0.021802740508135885, + "grad_norm": 2.0607001781463623, + "learning_rate": 4.994140976257261e-05, + "loss": 6.0883, + "step": 3666 + }, + { + "epoch": 0.02180868779141688, + "grad_norm": 2.1669631004333496, + "learning_rate": 4.9941377797869284e-05, + "loss": 6.0546, + "step": 3667 + }, + { + "epoch": 0.02181463507469788, + "grad_norm": 2.912822961807251, + "learning_rate": 4.994134582445917e-05, + "loss": 6.0285, + "step": 3668 + }, + { + "epoch": 0.021820582357978874, + "grad_norm": 2.3223111629486084, + "learning_rate": 4.994131384234231e-05, + "loss": 6.0948, + "step": 3669 + }, + { + "epoch": 0.021826529641259872, + "grad_norm": 2.067002296447754, + "learning_rate": 4.994128185151868e-05, + "loss": 6.2908, + "step": 3670 + }, + { + "epoch": 0.02183247692454087, + "grad_norm": 2.593642473220825, + "learning_rate": 4.9941249851988317e-05, + "loss": 6.2878, + "step": 3671 + }, + { + "epoch": 0.021838424207821866, + "grad_norm": 2.6345975399017334, + "learning_rate": 4.994121784375121e-05, + "loss": 6.0796, + "step": 3672 + }, + { + "epoch": 0.021844371491102865, + "grad_norm": 2.398861885070801, + "learning_rate": 4.994118582680739e-05, + "loss": 6.096, + "step": 3673 + }, + { + "epoch": 0.021850318774383863, + "grad_norm": 2.102933883666992, + "learning_rate": 4.994115380115686e-05, + "loss": 6.1347, + "step": 3674 + }, + { + "epoch": 0.021856266057664858, + "grad_norm": 2.43632435798645, + "learning_rate": 4.994112176679963e-05, + "loss": 6.074, + "step": 3675 + }, + { + "epoch": 0.021862213340945857, + "grad_norm": 2.304213523864746, + "learning_rate": 4.9941089723735706e-05, + "loss": 5.8897, + "step": 3676 + }, + { + "epoch": 0.021868160624226852, + "grad_norm": 2.6283092498779297, + "learning_rate": 4.9941057671965106e-05, + "loss": 5.9605, + "step": 3677 + }, + { + "epoch": 0.02187410790750785, + "grad_norm": 2.0781428813934326, + "learning_rate": 4.994102561148785e-05, + "loss": 6.0645, + "step": 3678 + }, + { + "epoch": 0.02188005519078885, + "grad_norm": 2.229210376739502, + "learning_rate": 4.994099354230393e-05, + "loss": 6.223, + "step": 3679 + }, + { + "epoch": 0.021886002474069844, + "grad_norm": 2.4410789012908936, + "learning_rate": 4.9940961464413374e-05, + "loss": 6.1115, + "step": 3680 + }, + { + "epoch": 0.021891949757350843, + "grad_norm": 2.99076771736145, + "learning_rate": 4.994092937781618e-05, + "loss": 5.9028, + "step": 3681 + }, + { + "epoch": 0.021897897040631838, + "grad_norm": 2.8403074741363525, + "learning_rate": 4.994089728251237e-05, + "loss": 5.7286, + "step": 3682 + }, + { + "epoch": 0.021903844323912836, + "grad_norm": 2.0928149223327637, + "learning_rate": 4.994086517850195e-05, + "loss": 5.849, + "step": 3683 + }, + { + "epoch": 0.021909791607193835, + "grad_norm": 2.320279836654663, + "learning_rate": 4.994083306578492e-05, + "loss": 5.6767, + "step": 3684 + }, + { + "epoch": 0.02191573889047483, + "grad_norm": 3.0701658725738525, + "learning_rate": 4.994080094436132e-05, + "loss": 5.9555, + "step": 3685 + }, + { + "epoch": 0.02192168617375583, + "grad_norm": 2.1042048931121826, + "learning_rate": 4.994076881423113e-05, + "loss": 5.7651, + "step": 3686 + }, + { + "epoch": 0.021927633457036827, + "grad_norm": 2.35819673538208, + "learning_rate": 4.9940736675394385e-05, + "loss": 6.0203, + "step": 3687 + }, + { + "epoch": 0.021933580740317822, + "grad_norm": 2.659224510192871, + "learning_rate": 4.994070452785108e-05, + "loss": 5.9935, + "step": 3688 + }, + { + "epoch": 0.02193952802359882, + "grad_norm": 2.4628207683563232, + "learning_rate": 4.994067237160124e-05, + "loss": 5.9135, + "step": 3689 + }, + { + "epoch": 0.021945475306879816, + "grad_norm": 3.7227911949157715, + "learning_rate": 4.9940640206644865e-05, + "loss": 5.8365, + "step": 3690 + }, + { + "epoch": 0.021951422590160814, + "grad_norm": 3.5226151943206787, + "learning_rate": 4.994060803298197e-05, + "loss": 5.7807, + "step": 3691 + }, + { + "epoch": 0.021957369873441813, + "grad_norm": 2.3665735721588135, + "learning_rate": 4.994057585061256e-05, + "loss": 5.9632, + "step": 3692 + }, + { + "epoch": 0.021963317156722808, + "grad_norm": 2.877263069152832, + "learning_rate": 4.9940543659536666e-05, + "loss": 5.6425, + "step": 3693 + }, + { + "epoch": 0.021969264440003806, + "grad_norm": 2.5431532859802246, + "learning_rate": 4.994051145975428e-05, + "loss": 5.6531, + "step": 3694 + }, + { + "epoch": 0.021975211723284805, + "grad_norm": 2.7033538818359375, + "learning_rate": 4.9940479251265415e-05, + "loss": 5.6907, + "step": 3695 + }, + { + "epoch": 0.0219811590065658, + "grad_norm": 3.6627206802368164, + "learning_rate": 4.9940447034070093e-05, + "loss": 5.9118, + "step": 3696 + }, + { + "epoch": 0.0219871062898468, + "grad_norm": 3.896959066390991, + "learning_rate": 4.994041480816831e-05, + "loss": 5.9926, + "step": 3697 + }, + { + "epoch": 0.021993053573127794, + "grad_norm": 3.37575626373291, + "learning_rate": 4.994038257356009e-05, + "loss": 5.9768, + "step": 3698 + }, + { + "epoch": 0.021999000856408792, + "grad_norm": 2.7694313526153564, + "learning_rate": 4.9940350330245444e-05, + "loss": 5.8486, + "step": 3699 + }, + { + "epoch": 0.02200494813968979, + "grad_norm": 2.3815293312072754, + "learning_rate": 4.9940318078224376e-05, + "loss": 6.0663, + "step": 3700 + }, + { + "epoch": 0.022010895422970786, + "grad_norm": 2.3171627521514893, + "learning_rate": 4.99402858174969e-05, + "loss": 5.8543, + "step": 3701 + }, + { + "epoch": 0.022016842706251784, + "grad_norm": 2.5090551376342773, + "learning_rate": 4.994025354806303e-05, + "loss": 5.7005, + "step": 3702 + }, + { + "epoch": 0.022022789989532783, + "grad_norm": 2.7024855613708496, + "learning_rate": 4.9940221269922774e-05, + "loss": 5.7375, + "step": 3703 + }, + { + "epoch": 0.022028737272813778, + "grad_norm": 2.7900679111480713, + "learning_rate": 4.994018898307614e-05, + "loss": 6.0094, + "step": 3704 + }, + { + "epoch": 0.022034684556094777, + "grad_norm": 2.3678438663482666, + "learning_rate": 4.994015668752315e-05, + "loss": 5.822, + "step": 3705 + }, + { + "epoch": 0.022040631839375772, + "grad_norm": 2.5406653881073, + "learning_rate": 4.9940124383263807e-05, + "loss": 5.8984, + "step": 3706 + }, + { + "epoch": 0.02204657912265677, + "grad_norm": 2.371800422668457, + "learning_rate": 4.994009207029813e-05, + "loss": 5.9821, + "step": 3707 + }, + { + "epoch": 0.02205252640593777, + "grad_norm": 2.004669666290283, + "learning_rate": 4.994005974862612e-05, + "loss": 5.8801, + "step": 3708 + }, + { + "epoch": 0.022058473689218764, + "grad_norm": 2.777472972869873, + "learning_rate": 4.9940027418247787e-05, + "loss": 5.8821, + "step": 3709 + }, + { + "epoch": 0.022064420972499763, + "grad_norm": 2.599883556365967, + "learning_rate": 4.9939995079163156e-05, + "loss": 5.8716, + "step": 3710 + }, + { + "epoch": 0.022070368255780758, + "grad_norm": 2.5891127586364746, + "learning_rate": 4.993996273137223e-05, + "loss": 5.7607, + "step": 3711 + }, + { + "epoch": 0.022076315539061756, + "grad_norm": 2.3737518787384033, + "learning_rate": 4.993993037487501e-05, + "loss": 5.7825, + "step": 3712 + }, + { + "epoch": 0.022082262822342755, + "grad_norm": 2.421785831451416, + "learning_rate": 4.9939898009671524e-05, + "loss": 5.7143, + "step": 3713 + }, + { + "epoch": 0.02208821010562375, + "grad_norm": 2.4267804622650146, + "learning_rate": 4.9939865635761785e-05, + "loss": 5.8031, + "step": 3714 + }, + { + "epoch": 0.02209415738890475, + "grad_norm": 2.390333414077759, + "learning_rate": 4.993983325314579e-05, + "loss": 5.7985, + "step": 3715 + }, + { + "epoch": 0.022100104672185747, + "grad_norm": 2.2265970706939697, + "learning_rate": 4.993980086182356e-05, + "loss": 5.6261, + "step": 3716 + }, + { + "epoch": 0.022106051955466742, + "grad_norm": 2.3872458934783936, + "learning_rate": 4.99397684617951e-05, + "loss": 5.8185, + "step": 3717 + }, + { + "epoch": 0.02211199923874774, + "grad_norm": 2.077075958251953, + "learning_rate": 4.9939736053060425e-05, + "loss": 5.6252, + "step": 3718 + }, + { + "epoch": 0.022117946522028736, + "grad_norm": 2.0642287731170654, + "learning_rate": 4.993970363561954e-05, + "loss": 5.8034, + "step": 3719 + }, + { + "epoch": 0.022123893805309734, + "grad_norm": 3.5353951454162598, + "learning_rate": 4.9939671209472474e-05, + "loss": 6.7808, + "step": 3720 + }, + { + "epoch": 0.022129841088590733, + "grad_norm": 2.910531520843506, + "learning_rate": 4.9939638774619216e-05, + "loss": 5.9323, + "step": 3721 + }, + { + "epoch": 0.022135788371871728, + "grad_norm": 2.7450106143951416, + "learning_rate": 4.9939606331059794e-05, + "loss": 5.9926, + "step": 3722 + }, + { + "epoch": 0.022141735655152726, + "grad_norm": 2.7628188133239746, + "learning_rate": 4.993957387879421e-05, + "loss": 5.9129, + "step": 3723 + }, + { + "epoch": 0.022147682938433725, + "grad_norm": 2.6644890308380127, + "learning_rate": 4.9939541417822485e-05, + "loss": 5.7038, + "step": 3724 + }, + { + "epoch": 0.02215363022171472, + "grad_norm": 2.143744707107544, + "learning_rate": 4.993950894814461e-05, + "loss": 5.5821, + "step": 3725 + }, + { + "epoch": 0.02215957750499572, + "grad_norm": 2.1691160202026367, + "learning_rate": 4.993947646976063e-05, + "loss": 5.5929, + "step": 3726 + }, + { + "epoch": 0.022165524788276714, + "grad_norm": 2.1479709148406982, + "learning_rate": 4.993944398267052e-05, + "loss": 5.6653, + "step": 3727 + }, + { + "epoch": 0.022171472071557712, + "grad_norm": 2.7749600410461426, + "learning_rate": 4.993941148687431e-05, + "loss": 5.5682, + "step": 3728 + }, + { + "epoch": 0.02217741935483871, + "grad_norm": 2.668672561645508, + "learning_rate": 4.993937898237201e-05, + "loss": 5.5968, + "step": 3729 + }, + { + "epoch": 0.022183366638119706, + "grad_norm": 2.3903374671936035, + "learning_rate": 4.993934646916364e-05, + "loss": 5.7541, + "step": 3730 + }, + { + "epoch": 0.022189313921400704, + "grad_norm": 1.8555344343185425, + "learning_rate": 4.993931394724919e-05, + "loss": 5.5449, + "step": 3731 + }, + { + "epoch": 0.022195261204681703, + "grad_norm": 2.1140637397766113, + "learning_rate": 4.993928141662869e-05, + "loss": 5.8201, + "step": 3732 + }, + { + "epoch": 0.022201208487962698, + "grad_norm": 2.221573829650879, + "learning_rate": 4.993924887730213e-05, + "loss": 5.7583, + "step": 3733 + }, + { + "epoch": 0.022207155771243697, + "grad_norm": 2.0801634788513184, + "learning_rate": 4.993921632926956e-05, + "loss": 5.7083, + "step": 3734 + }, + { + "epoch": 0.02221310305452469, + "grad_norm": 2.0167016983032227, + "learning_rate": 4.993918377253095e-05, + "loss": 5.7798, + "step": 3735 + }, + { + "epoch": 0.02221905033780569, + "grad_norm": 2.104529619216919, + "learning_rate": 4.993915120708634e-05, + "loss": 5.7346, + "step": 3736 + }, + { + "epoch": 0.02222499762108669, + "grad_norm": 2.0807201862335205, + "learning_rate": 4.993911863293572e-05, + "loss": 5.7663, + "step": 3737 + }, + { + "epoch": 0.022230944904367684, + "grad_norm": 1.9223891496658325, + "learning_rate": 4.9939086050079115e-05, + "loss": 5.648, + "step": 3738 + }, + { + "epoch": 0.022236892187648682, + "grad_norm": 2.3831584453582764, + "learning_rate": 4.9939053458516535e-05, + "loss": 5.7988, + "step": 3739 + }, + { + "epoch": 0.02224283947092968, + "grad_norm": 2.433318853378296, + "learning_rate": 4.993902085824799e-05, + "loss": 5.7794, + "step": 3740 + }, + { + "epoch": 0.022248786754210676, + "grad_norm": 2.2488365173339844, + "learning_rate": 4.993898824927348e-05, + "loss": 5.7332, + "step": 3741 + }, + { + "epoch": 0.022254734037491675, + "grad_norm": 2.2924392223358154, + "learning_rate": 4.993895563159303e-05, + "loss": 5.8977, + "step": 3742 + }, + { + "epoch": 0.02226068132077267, + "grad_norm": 2.1601176261901855, + "learning_rate": 4.9938923005206664e-05, + "loss": 5.8588, + "step": 3743 + }, + { + "epoch": 0.02226662860405367, + "grad_norm": 2.256439447402954, + "learning_rate": 4.993889037011436e-05, + "loss": 5.6111, + "step": 3744 + }, + { + "epoch": 0.022272575887334667, + "grad_norm": 2.184950828552246, + "learning_rate": 4.993885772631615e-05, + "loss": 5.7544, + "step": 3745 + }, + { + "epoch": 0.022278523170615662, + "grad_norm": 2.250422716140747, + "learning_rate": 4.993882507381205e-05, + "loss": 5.6534, + "step": 3746 + }, + { + "epoch": 0.02228447045389666, + "grad_norm": 2.473811626434326, + "learning_rate": 4.9938792412602056e-05, + "loss": 5.5699, + "step": 3747 + }, + { + "epoch": 0.022290417737177656, + "grad_norm": 2.2859978675842285, + "learning_rate": 4.993875974268619e-05, + "loss": 5.8712, + "step": 3748 + }, + { + "epoch": 0.022296365020458654, + "grad_norm": 2.4002318382263184, + "learning_rate": 4.993872706406446e-05, + "loss": 5.8121, + "step": 3749 + }, + { + "epoch": 0.022302312303739653, + "grad_norm": 2.2692153453826904, + "learning_rate": 4.9938694376736884e-05, + "loss": 5.5516, + "step": 3750 + }, + { + "epoch": 0.022308259587020648, + "grad_norm": 2.1874892711639404, + "learning_rate": 4.9938661680703456e-05, + "loss": 5.8264, + "step": 3751 + }, + { + "epoch": 0.022314206870301646, + "grad_norm": 2.3802871704101562, + "learning_rate": 4.993862897596421e-05, + "loss": 5.6523, + "step": 3752 + }, + { + "epoch": 0.022320154153582645, + "grad_norm": 2.514646530151367, + "learning_rate": 4.9938596262519145e-05, + "loss": 5.5193, + "step": 3753 + }, + { + "epoch": 0.02232610143686364, + "grad_norm": 2.3175413608551025, + "learning_rate": 4.993856354036827e-05, + "loss": 5.5372, + "step": 3754 + }, + { + "epoch": 0.02233204872014464, + "grad_norm": 2.2071855068206787, + "learning_rate": 4.9938530809511595e-05, + "loss": 5.5002, + "step": 3755 + }, + { + "epoch": 0.022337996003425634, + "grad_norm": 2.046440839767456, + "learning_rate": 4.9938498069949144e-05, + "loss": 5.585, + "step": 3756 + }, + { + "epoch": 0.022343943286706632, + "grad_norm": 2.3971145153045654, + "learning_rate": 4.9938465321680915e-05, + "loss": 5.7858, + "step": 3757 + }, + { + "epoch": 0.02234989056998763, + "grad_norm": 2.462597131729126, + "learning_rate": 4.9938432564706936e-05, + "loss": 5.5606, + "step": 3758 + }, + { + "epoch": 0.022355837853268626, + "grad_norm": 2.3134138584136963, + "learning_rate": 4.99383997990272e-05, + "loss": 5.4587, + "step": 3759 + }, + { + "epoch": 0.022361785136549624, + "grad_norm": 2.137929916381836, + "learning_rate": 4.993836702464173e-05, + "loss": 5.4768, + "step": 3760 + }, + { + "epoch": 0.022367732419830623, + "grad_norm": 2.647691011428833, + "learning_rate": 4.993833424155053e-05, + "loss": 5.7902, + "step": 3761 + }, + { + "epoch": 0.022373679703111618, + "grad_norm": 2.535640239715576, + "learning_rate": 4.993830144975361e-05, + "loss": 5.8263, + "step": 3762 + }, + { + "epoch": 0.022379626986392617, + "grad_norm": 2.422997236251831, + "learning_rate": 4.9938268649251e-05, + "loss": 5.7751, + "step": 3763 + }, + { + "epoch": 0.02238557426967361, + "grad_norm": 2.6906728744506836, + "learning_rate": 4.9938235840042694e-05, + "loss": 5.5974, + "step": 3764 + }, + { + "epoch": 0.02239152155295461, + "grad_norm": 2.0284483432769775, + "learning_rate": 4.99382030221287e-05, + "loss": 5.6816, + "step": 3765 + }, + { + "epoch": 0.02239746883623561, + "grad_norm": 2.6392064094543457, + "learning_rate": 4.9938170195509035e-05, + "loss": 5.9052, + "step": 3766 + }, + { + "epoch": 0.022403416119516604, + "grad_norm": 2.6770617961883545, + "learning_rate": 4.993813736018372e-05, + "loss": 5.9041, + "step": 3767 + }, + { + "epoch": 0.022409363402797602, + "grad_norm": 2.5972392559051514, + "learning_rate": 4.993810451615276e-05, + "loss": 5.7834, + "step": 3768 + }, + { + "epoch": 0.0224153106860786, + "grad_norm": 2.0095736980438232, + "learning_rate": 4.993807166341616e-05, + "loss": 5.6074, + "step": 3769 + }, + { + "epoch": 0.022421257969359596, + "grad_norm": 2.412578582763672, + "learning_rate": 4.9938038801973945e-05, + "loss": 5.742, + "step": 3770 + }, + { + "epoch": 0.022427205252640595, + "grad_norm": 2.1285388469696045, + "learning_rate": 4.993800593182612e-05, + "loss": 5.7665, + "step": 3771 + }, + { + "epoch": 0.02243315253592159, + "grad_norm": 2.091252326965332, + "learning_rate": 4.993797305297268e-05, + "loss": 5.7165, + "step": 3772 + }, + { + "epoch": 0.022439099819202588, + "grad_norm": 2.5366342067718506, + "learning_rate": 4.993794016541367e-05, + "loss": 6.259, + "step": 3773 + }, + { + "epoch": 0.022445047102483587, + "grad_norm": 2.2637953758239746, + "learning_rate": 4.9937907269149063e-05, + "loss": 6.2132, + "step": 3774 + }, + { + "epoch": 0.022450994385764582, + "grad_norm": 2.570979595184326, + "learning_rate": 4.99378743641789e-05, + "loss": 5.9656, + "step": 3775 + }, + { + "epoch": 0.02245694166904558, + "grad_norm": 2.0587873458862305, + "learning_rate": 4.993784145050319e-05, + "loss": 5.7096, + "step": 3776 + }, + { + "epoch": 0.022462888952326576, + "grad_norm": 2.396812677383423, + "learning_rate": 4.993780852812192e-05, + "loss": 5.7258, + "step": 3777 + }, + { + "epoch": 0.022468836235607574, + "grad_norm": 2.081541061401367, + "learning_rate": 4.993777559703513e-05, + "loss": 5.6777, + "step": 3778 + }, + { + "epoch": 0.022474783518888573, + "grad_norm": 2.5242559909820557, + "learning_rate": 4.993774265724281e-05, + "loss": 5.961, + "step": 3779 + }, + { + "epoch": 0.022480730802169568, + "grad_norm": 2.4249329566955566, + "learning_rate": 4.993770970874499e-05, + "loss": 6.0494, + "step": 3780 + }, + { + "epoch": 0.022486678085450566, + "grad_norm": 2.7482552528381348, + "learning_rate": 4.993767675154169e-05, + "loss": 5.7579, + "step": 3781 + }, + { + "epoch": 0.022492625368731565, + "grad_norm": 4.115204811096191, + "learning_rate": 4.993764378563288e-05, + "loss": 6.3891, + "step": 3782 + }, + { + "epoch": 0.02249857265201256, + "grad_norm": 2.51346755027771, + "learning_rate": 4.99376108110186e-05, + "loss": 5.7982, + "step": 3783 + }, + { + "epoch": 0.02250451993529356, + "grad_norm": 2.2737278938293457, + "learning_rate": 4.993757782769887e-05, + "loss": 5.7576, + "step": 3784 + }, + { + "epoch": 0.022510467218574554, + "grad_norm": 2.2068402767181396, + "learning_rate": 4.9937544835673674e-05, + "loss": 5.9801, + "step": 3785 + }, + { + "epoch": 0.022516414501855552, + "grad_norm": 1.8548356294631958, + "learning_rate": 4.993751183494305e-05, + "loss": 6.2054, + "step": 3786 + }, + { + "epoch": 0.02252236178513655, + "grad_norm": 2.3499045372009277, + "learning_rate": 4.993747882550699e-05, + "loss": 6.0694, + "step": 3787 + }, + { + "epoch": 0.022528309068417546, + "grad_norm": 2.2253386974334717, + "learning_rate": 4.993744580736552e-05, + "loss": 5.709, + "step": 3788 + }, + { + "epoch": 0.022534256351698544, + "grad_norm": 2.1136696338653564, + "learning_rate": 4.993741278051864e-05, + "loss": 5.9546, + "step": 3789 + }, + { + "epoch": 0.022540203634979543, + "grad_norm": 1.8777605295181274, + "learning_rate": 4.9937379744966375e-05, + "loss": 5.7587, + "step": 3790 + }, + { + "epoch": 0.022546150918260538, + "grad_norm": 2.527571201324463, + "learning_rate": 4.9937346700708723e-05, + "loss": 5.0992, + "step": 3791 + }, + { + "epoch": 0.022552098201541537, + "grad_norm": 2.515805244445801, + "learning_rate": 4.99373136477457e-05, + "loss": 4.9766, + "step": 3792 + }, + { + "epoch": 0.02255804548482253, + "grad_norm": 2.442979574203491, + "learning_rate": 4.9937280586077315e-05, + "loss": 5.0981, + "step": 3793 + }, + { + "epoch": 0.02256399276810353, + "grad_norm": 2.575383424758911, + "learning_rate": 4.993724751570359e-05, + "loss": 5.0809, + "step": 3794 + }, + { + "epoch": 0.02256994005138453, + "grad_norm": 2.0855023860931396, + "learning_rate": 4.9937214436624524e-05, + "loss": 5.5744, + "step": 3795 + }, + { + "epoch": 0.022575887334665524, + "grad_norm": 2.237565040588379, + "learning_rate": 4.993718134884013e-05, + "loss": 5.6796, + "step": 3796 + }, + { + "epoch": 0.022581834617946522, + "grad_norm": 2.5895159244537354, + "learning_rate": 4.993714825235044e-05, + "loss": 5.2068, + "step": 3797 + }, + { + "epoch": 0.02258778190122752, + "grad_norm": 2.1277096271514893, + "learning_rate": 4.993711514715544e-05, + "loss": 5.5588, + "step": 3798 + }, + { + "epoch": 0.022593729184508516, + "grad_norm": 2.7074246406555176, + "learning_rate": 4.993708203325515e-05, + "loss": 5.0104, + "step": 3799 + }, + { + "epoch": 0.022599676467789515, + "grad_norm": 2.114569664001465, + "learning_rate": 4.993704891064958e-05, + "loss": 5.0453, + "step": 3800 + }, + { + "epoch": 0.02260562375107051, + "grad_norm": 2.4222404956817627, + "learning_rate": 4.9937015779338746e-05, + "loss": 5.3799, + "step": 3801 + }, + { + "epoch": 0.022611571034351508, + "grad_norm": 2.238755941390991, + "learning_rate": 4.993698263932266e-05, + "loss": 5.0075, + "step": 3802 + }, + { + "epoch": 0.022617518317632507, + "grad_norm": 2.0748255252838135, + "learning_rate": 4.993694949060133e-05, + "loss": 5.0007, + "step": 3803 + }, + { + "epoch": 0.022623465600913502, + "grad_norm": 2.1528635025024414, + "learning_rate": 4.993691633317477e-05, + "loss": 5.1048, + "step": 3804 + }, + { + "epoch": 0.0226294128841945, + "grad_norm": 2.0237200260162354, + "learning_rate": 4.993688316704298e-05, + "loss": 5.1465, + "step": 3805 + }, + { + "epoch": 0.022635360167475495, + "grad_norm": 2.2698304653167725, + "learning_rate": 4.993684999220599e-05, + "loss": 4.9642, + "step": 3806 + }, + { + "epoch": 0.022641307450756494, + "grad_norm": 2.7863757610321045, + "learning_rate": 4.993681680866381e-05, + "loss": 5.6277, + "step": 3807 + }, + { + "epoch": 0.022647254734037493, + "grad_norm": 2.394087553024292, + "learning_rate": 4.9936783616416436e-05, + "loss": 6.0895, + "step": 3808 + }, + { + "epoch": 0.022653202017318488, + "grad_norm": 2.8036317825317383, + "learning_rate": 4.993675041546389e-05, + "loss": 6.2002, + "step": 3809 + }, + { + "epoch": 0.022659149300599486, + "grad_norm": 2.4970054626464844, + "learning_rate": 4.993671720580618e-05, + "loss": 5.5114, + "step": 3810 + }, + { + "epoch": 0.022665096583880485, + "grad_norm": 3.2434241771698, + "learning_rate": 4.993668398744332e-05, + "loss": 5.0366, + "step": 3811 + }, + { + "epoch": 0.02267104386716148, + "grad_norm": 2.707104206085205, + "learning_rate": 4.9936650760375326e-05, + "loss": 5.5132, + "step": 3812 + }, + { + "epoch": 0.02267699115044248, + "grad_norm": 2.540231466293335, + "learning_rate": 4.9936617524602204e-05, + "loss": 5.8026, + "step": 3813 + }, + { + "epoch": 0.022682938433723474, + "grad_norm": 2.8549184799194336, + "learning_rate": 4.993658428012397e-05, + "loss": 6.0854, + "step": 3814 + }, + { + "epoch": 0.022688885717004472, + "grad_norm": 2.5972952842712402, + "learning_rate": 4.993655102694062e-05, + "loss": 5.8055, + "step": 3815 + }, + { + "epoch": 0.02269483300028547, + "grad_norm": 3.1625113487243652, + "learning_rate": 4.9936517765052184e-05, + "loss": 5.9683, + "step": 3816 + }, + { + "epoch": 0.022700780283566466, + "grad_norm": 3.239820718765259, + "learning_rate": 4.993648449445867e-05, + "loss": 5.9725, + "step": 3817 + }, + { + "epoch": 0.022706727566847464, + "grad_norm": 2.9632809162139893, + "learning_rate": 4.993645121516008e-05, + "loss": 5.9767, + "step": 3818 + }, + { + "epoch": 0.022712674850128463, + "grad_norm": 2.7486021518707275, + "learning_rate": 4.9936417927156435e-05, + "loss": 6.3471, + "step": 3819 + }, + { + "epoch": 0.022718622133409458, + "grad_norm": 3.8044490814208984, + "learning_rate": 4.993638463044775e-05, + "loss": 6.1275, + "step": 3820 + }, + { + "epoch": 0.022724569416690456, + "grad_norm": 4.851193428039551, + "learning_rate": 4.9936351325034024e-05, + "loss": 5.6658, + "step": 3821 + }, + { + "epoch": 0.02273051669997145, + "grad_norm": 3.1302716732025146, + "learning_rate": 4.993631801091528e-05, + "loss": 5.5256, + "step": 3822 + }, + { + "epoch": 0.02273646398325245, + "grad_norm": 5.310885906219482, + "learning_rate": 4.9936284688091526e-05, + "loss": 5.4771, + "step": 3823 + }, + { + "epoch": 0.02274241126653345, + "grad_norm": 5.493198394775391, + "learning_rate": 4.9936251356562765e-05, + "loss": 6.0993, + "step": 3824 + }, + { + "epoch": 0.022748358549814444, + "grad_norm": 3.5346286296844482, + "learning_rate": 4.993621801632902e-05, + "loss": 6.6862, + "step": 3825 + }, + { + "epoch": 0.022754305833095442, + "grad_norm": 4.550736904144287, + "learning_rate": 4.9936184667390304e-05, + "loss": 6.5658, + "step": 3826 + }, + { + "epoch": 0.02276025311637644, + "grad_norm": 3.3957576751708984, + "learning_rate": 4.993615130974662e-05, + "loss": 6.0596, + "step": 3827 + }, + { + "epoch": 0.022766200399657436, + "grad_norm": 2.614089012145996, + "learning_rate": 4.993611794339798e-05, + "loss": 6.77, + "step": 3828 + }, + { + "epoch": 0.022772147682938434, + "grad_norm": 3.712106704711914, + "learning_rate": 4.99360845683444e-05, + "loss": 6.4084, + "step": 3829 + }, + { + "epoch": 0.02277809496621943, + "grad_norm": 3.7331995964050293, + "learning_rate": 4.99360511845859e-05, + "loss": 6.2627, + "step": 3830 + }, + { + "epoch": 0.022784042249500428, + "grad_norm": 3.8898067474365234, + "learning_rate": 4.993601779212247e-05, + "loss": 6.6476, + "step": 3831 + }, + { + "epoch": 0.022789989532781427, + "grad_norm": 2.829078435897827, + "learning_rate": 4.9935984390954136e-05, + "loss": 6.2307, + "step": 3832 + }, + { + "epoch": 0.022795936816062422, + "grad_norm": 3.467954635620117, + "learning_rate": 4.9935950981080906e-05, + "loss": 6.5283, + "step": 3833 + }, + { + "epoch": 0.02280188409934342, + "grad_norm": 2.317840099334717, + "learning_rate": 4.99359175625028e-05, + "loss": 6.4549, + "step": 3834 + }, + { + "epoch": 0.02280783138262442, + "grad_norm": 2.7261998653411865, + "learning_rate": 4.9935884135219825e-05, + "loss": 6.2049, + "step": 3835 + }, + { + "epoch": 0.022813778665905414, + "grad_norm": 2.623098373413086, + "learning_rate": 4.993585069923198e-05, + "loss": 6.3847, + "step": 3836 + }, + { + "epoch": 0.022819725949186413, + "grad_norm": 2.4825377464294434, + "learning_rate": 4.993581725453929e-05, + "loss": 6.3532, + "step": 3837 + }, + { + "epoch": 0.022825673232467408, + "grad_norm": 2.278151750564575, + "learning_rate": 4.993578380114176e-05, + "loss": 5.8885, + "step": 3838 + }, + { + "epoch": 0.022831620515748406, + "grad_norm": 2.045839548110962, + "learning_rate": 4.9935750339039425e-05, + "loss": 6.6852, + "step": 3839 + }, + { + "epoch": 0.022837567799029405, + "grad_norm": 2.4009597301483154, + "learning_rate": 4.993571686823226e-05, + "loss": 6.1676, + "step": 3840 + }, + { + "epoch": 0.0228435150823104, + "grad_norm": 2.759819507598877, + "learning_rate": 4.9935683388720296e-05, + "loss": 6.3913, + "step": 3841 + }, + { + "epoch": 0.0228494623655914, + "grad_norm": 2.798785924911499, + "learning_rate": 4.9935649900503546e-05, + "loss": 6.8169, + "step": 3842 + }, + { + "epoch": 0.022855409648872393, + "grad_norm": 2.389890432357788, + "learning_rate": 4.9935616403582015e-05, + "loss": 6.7506, + "step": 3843 + }, + { + "epoch": 0.022861356932153392, + "grad_norm": 2.882474184036255, + "learning_rate": 4.9935582897955715e-05, + "loss": 6.2458, + "step": 3844 + }, + { + "epoch": 0.02286730421543439, + "grad_norm": 2.2487478256225586, + "learning_rate": 4.993554938362467e-05, + "loss": 6.7296, + "step": 3845 + }, + { + "epoch": 0.022873251498715386, + "grad_norm": 1.9563521146774292, + "learning_rate": 4.993551586058888e-05, + "loss": 6.6878, + "step": 3846 + }, + { + "epoch": 0.022879198781996384, + "grad_norm": 7.555780410766602, + "learning_rate": 4.993548232884835e-05, + "loss": 6.3309, + "step": 3847 + }, + { + "epoch": 0.022885146065277383, + "grad_norm": 2.2573931217193604, + "learning_rate": 4.99354487884031e-05, + "loss": 6.3384, + "step": 3848 + }, + { + "epoch": 0.022891093348558378, + "grad_norm": 2.063267946243286, + "learning_rate": 4.993541523925316e-05, + "loss": 6.2342, + "step": 3849 + }, + { + "epoch": 0.022897040631839376, + "grad_norm": 2.1032445430755615, + "learning_rate": 4.9935381681398505e-05, + "loss": 6.5458, + "step": 3850 + }, + { + "epoch": 0.02290298791512037, + "grad_norm": 2.233400583267212, + "learning_rate": 4.9935348114839176e-05, + "loss": 6.46, + "step": 3851 + }, + { + "epoch": 0.02290893519840137, + "grad_norm": 2.069182872772217, + "learning_rate": 4.9935314539575174e-05, + "loss": 6.4829, + "step": 3852 + }, + { + "epoch": 0.02291488248168237, + "grad_norm": 1.9986059665679932, + "learning_rate": 4.993528095560651e-05, + "loss": 6.4651, + "step": 3853 + }, + { + "epoch": 0.022920829764963364, + "grad_norm": 2.0529284477233887, + "learning_rate": 4.99352473629332e-05, + "loss": 6.1151, + "step": 3854 + }, + { + "epoch": 0.022926777048244362, + "grad_norm": 1.9643630981445312, + "learning_rate": 4.993521376155525e-05, + "loss": 5.991, + "step": 3855 + }, + { + "epoch": 0.02293272433152536, + "grad_norm": 2.2183501720428467, + "learning_rate": 4.9935180151472674e-05, + "loss": 6.8568, + "step": 3856 + }, + { + "epoch": 0.022938671614806356, + "grad_norm": 2.2095682621002197, + "learning_rate": 4.993514653268548e-05, + "loss": 6.8145, + "step": 3857 + }, + { + "epoch": 0.022944618898087354, + "grad_norm": 2.194451332092285, + "learning_rate": 4.9935112905193694e-05, + "loss": 6.4781, + "step": 3858 + }, + { + "epoch": 0.02295056618136835, + "grad_norm": 2.2242066860198975, + "learning_rate": 4.9935079268997306e-05, + "loss": 6.0535, + "step": 3859 + }, + { + "epoch": 0.022956513464649348, + "grad_norm": 2.336190938949585, + "learning_rate": 4.9935045624096354e-05, + "loss": 6.2453, + "step": 3860 + }, + { + "epoch": 0.022962460747930347, + "grad_norm": 1.9997279644012451, + "learning_rate": 4.9935011970490824e-05, + "loss": 6.3852, + "step": 3861 + }, + { + "epoch": 0.02296840803121134, + "grad_norm": 2.9107778072357178, + "learning_rate": 4.993497830818074e-05, + "loss": 6.0891, + "step": 3862 + }, + { + "epoch": 0.02297435531449234, + "grad_norm": 2.1357171535491943, + "learning_rate": 4.993494463716612e-05, + "loss": 6.5111, + "step": 3863 + }, + { + "epoch": 0.02298030259777334, + "grad_norm": 2.0228497982025146, + "learning_rate": 4.9934910957446954e-05, + "loss": 6.6009, + "step": 3864 + }, + { + "epoch": 0.022986249881054334, + "grad_norm": 2.8057942390441895, + "learning_rate": 4.993487726902328e-05, + "loss": 6.414, + "step": 3865 + }, + { + "epoch": 0.022992197164335332, + "grad_norm": 3.0660998821258545, + "learning_rate": 4.99348435718951e-05, + "loss": 6.3673, + "step": 3866 + }, + { + "epoch": 0.022998144447616328, + "grad_norm": 2.2440497875213623, + "learning_rate": 4.9934809866062416e-05, + "loss": 6.1793, + "step": 3867 + }, + { + "epoch": 0.023004091730897326, + "grad_norm": 2.342358350753784, + "learning_rate": 4.993477615152525e-05, + "loss": 6.5279, + "step": 3868 + }, + { + "epoch": 0.023010039014178325, + "grad_norm": 1.9231956005096436, + "learning_rate": 4.993474242828361e-05, + "loss": 6.4975, + "step": 3869 + }, + { + "epoch": 0.02301598629745932, + "grad_norm": 2.503028631210327, + "learning_rate": 4.9934708696337516e-05, + "loss": 6.5261, + "step": 3870 + }, + { + "epoch": 0.02302193358074032, + "grad_norm": 2.2343928813934326, + "learning_rate": 4.993467495568697e-05, + "loss": 6.0525, + "step": 3871 + }, + { + "epoch": 0.023027880864021313, + "grad_norm": 2.851964235305786, + "learning_rate": 4.993464120633198e-05, + "loss": 6.1271, + "step": 3872 + }, + { + "epoch": 0.023033828147302312, + "grad_norm": 2.580017328262329, + "learning_rate": 4.993460744827257e-05, + "loss": 6.2018, + "step": 3873 + }, + { + "epoch": 0.02303977543058331, + "grad_norm": 2.227879047393799, + "learning_rate": 4.9934573681508744e-05, + "loss": 6.0177, + "step": 3874 + }, + { + "epoch": 0.023045722713864306, + "grad_norm": 2.696531295776367, + "learning_rate": 4.993453990604051e-05, + "loss": 6.627, + "step": 3875 + }, + { + "epoch": 0.023051669997145304, + "grad_norm": 2.3439393043518066, + "learning_rate": 4.99345061218679e-05, + "loss": 6.5388, + "step": 3876 + }, + { + "epoch": 0.023057617280426303, + "grad_norm": 2.5400748252868652, + "learning_rate": 4.99344723289909e-05, + "loss": 5.9162, + "step": 3877 + }, + { + "epoch": 0.023063564563707298, + "grad_norm": 2.658193588256836, + "learning_rate": 4.9934438527409535e-05, + "loss": 5.6645, + "step": 3878 + }, + { + "epoch": 0.023069511846988296, + "grad_norm": 2.3102848529815674, + "learning_rate": 4.9934404717123814e-05, + "loss": 5.9969, + "step": 3879 + }, + { + "epoch": 0.02307545913026929, + "grad_norm": 2.6107916831970215, + "learning_rate": 4.993437089813376e-05, + "loss": 6.1776, + "step": 3880 + }, + { + "epoch": 0.02308140641355029, + "grad_norm": 2.6275434494018555, + "learning_rate": 4.993433707043937e-05, + "loss": 6.2563, + "step": 3881 + }, + { + "epoch": 0.02308735369683129, + "grad_norm": 2.8595218658447266, + "learning_rate": 4.993430323404066e-05, + "loss": 5.9371, + "step": 3882 + }, + { + "epoch": 0.023093300980112284, + "grad_norm": 2.2947659492492676, + "learning_rate": 4.993426938893764e-05, + "loss": 5.7263, + "step": 3883 + }, + { + "epoch": 0.023099248263393282, + "grad_norm": 3.3769729137420654, + "learning_rate": 4.9934235535130326e-05, + "loss": 6.2706, + "step": 3884 + }, + { + "epoch": 0.02310519554667428, + "grad_norm": 2.792043447494507, + "learning_rate": 4.9934201672618716e-05, + "loss": 5.9264, + "step": 3885 + }, + { + "epoch": 0.023111142829955276, + "grad_norm": 2.592167615890503, + "learning_rate": 4.993416780140285e-05, + "loss": 6.4031, + "step": 3886 + }, + { + "epoch": 0.023117090113236274, + "grad_norm": 2.429898977279663, + "learning_rate": 4.9934133921482716e-05, + "loss": 6.4609, + "step": 3887 + }, + { + "epoch": 0.02312303739651727, + "grad_norm": 2.1771554946899414, + "learning_rate": 4.993410003285834e-05, + "loss": 6.2873, + "step": 3888 + }, + { + "epoch": 0.023128984679798268, + "grad_norm": 2.7799339294433594, + "learning_rate": 4.9934066135529724e-05, + "loss": 5.7405, + "step": 3889 + }, + { + "epoch": 0.023134931963079267, + "grad_norm": 2.626492977142334, + "learning_rate": 4.993403222949688e-05, + "loss": 5.783, + "step": 3890 + }, + { + "epoch": 0.02314087924636026, + "grad_norm": 2.837663412094116, + "learning_rate": 4.993399831475982e-05, + "loss": 5.8039, + "step": 3891 + }, + { + "epoch": 0.02314682652964126, + "grad_norm": 2.68230938911438, + "learning_rate": 4.9933964391318564e-05, + "loss": 5.6587, + "step": 3892 + }, + { + "epoch": 0.02315277381292226, + "grad_norm": 3.2064061164855957, + "learning_rate": 4.993393045917312e-05, + "loss": 5.9516, + "step": 3893 + }, + { + "epoch": 0.023158721096203254, + "grad_norm": 3.5179402828216553, + "learning_rate": 4.99338965183235e-05, + "loss": 5.7925, + "step": 3894 + }, + { + "epoch": 0.023164668379484252, + "grad_norm": 2.9261434078216553, + "learning_rate": 4.993386256876971e-05, + "loss": 5.8677, + "step": 3895 + }, + { + "epoch": 0.023170615662765248, + "grad_norm": 3.092033624649048, + "learning_rate": 4.9933828610511766e-05, + "loss": 5.6248, + "step": 3896 + }, + { + "epoch": 0.023176562946046246, + "grad_norm": 2.7650182247161865, + "learning_rate": 4.9933794643549683e-05, + "loss": 5.7371, + "step": 3897 + }, + { + "epoch": 0.023182510229327245, + "grad_norm": 2.402839422225952, + "learning_rate": 4.993376066788347e-05, + "loss": 5.4802, + "step": 3898 + }, + { + "epoch": 0.02318845751260824, + "grad_norm": 2.606062889099121, + "learning_rate": 4.993372668351314e-05, + "loss": 5.5766, + "step": 3899 + }, + { + "epoch": 0.023194404795889238, + "grad_norm": 2.2177329063415527, + "learning_rate": 4.99336926904387e-05, + "loss": 5.5744, + "step": 3900 + }, + { + "epoch": 0.023200352079170233, + "grad_norm": 2.6953063011169434, + "learning_rate": 4.9933658688660166e-05, + "loss": 5.6414, + "step": 3901 + }, + { + "epoch": 0.023206299362451232, + "grad_norm": 2.90512752532959, + "learning_rate": 4.993362467817755e-05, + "loss": 5.5445, + "step": 3902 + }, + { + "epoch": 0.02321224664573223, + "grad_norm": 3.724168062210083, + "learning_rate": 4.993359065899086e-05, + "loss": 5.7733, + "step": 3903 + }, + { + "epoch": 0.023218193929013226, + "grad_norm": 2.9355592727661133, + "learning_rate": 4.993355663110012e-05, + "loss": 5.579, + "step": 3904 + }, + { + "epoch": 0.023224141212294224, + "grad_norm": 2.7822163105010986, + "learning_rate": 4.993352259450532e-05, + "loss": 5.5105, + "step": 3905 + }, + { + "epoch": 0.023230088495575223, + "grad_norm": 3.672539710998535, + "learning_rate": 4.99334885492065e-05, + "loss": 6.3865, + "step": 3906 + }, + { + "epoch": 0.023236035778856218, + "grad_norm": 2.26755952835083, + "learning_rate": 4.993345449520364e-05, + "loss": 5.5472, + "step": 3907 + }, + { + "epoch": 0.023241983062137216, + "grad_norm": 2.8935770988464355, + "learning_rate": 4.993342043249678e-05, + "loss": 5.5948, + "step": 3908 + }, + { + "epoch": 0.02324793034541821, + "grad_norm": 3.077798366546631, + "learning_rate": 4.9933386361085924e-05, + "loss": 5.288, + "step": 3909 + }, + { + "epoch": 0.02325387762869921, + "grad_norm": 2.479198694229126, + "learning_rate": 4.993335228097107e-05, + "loss": 5.3743, + "step": 3910 + }, + { + "epoch": 0.02325982491198021, + "grad_norm": 2.429049015045166, + "learning_rate": 4.9933318192152244e-05, + "loss": 5.6709, + "step": 3911 + }, + { + "epoch": 0.023265772195261204, + "grad_norm": 2.4515016078948975, + "learning_rate": 4.993328409462945e-05, + "loss": 5.4946, + "step": 3912 + }, + { + "epoch": 0.023271719478542202, + "grad_norm": 2.3859386444091797, + "learning_rate": 4.993324998840271e-05, + "loss": 5.5947, + "step": 3913 + }, + { + "epoch": 0.0232776667618232, + "grad_norm": 2.746438503265381, + "learning_rate": 4.993321587347203e-05, + "loss": 5.6743, + "step": 3914 + }, + { + "epoch": 0.023283614045104196, + "grad_norm": 2.416118621826172, + "learning_rate": 4.993318174983742e-05, + "loss": 5.7073, + "step": 3915 + }, + { + "epoch": 0.023289561328385194, + "grad_norm": 2.3427727222442627, + "learning_rate": 4.99331476174989e-05, + "loss": 5.5933, + "step": 3916 + }, + { + "epoch": 0.02329550861166619, + "grad_norm": 2.2179009914398193, + "learning_rate": 4.993311347645647e-05, + "loss": 5.7726, + "step": 3917 + }, + { + "epoch": 0.023301455894947188, + "grad_norm": 2.732923984527588, + "learning_rate": 4.993307932671014e-05, + "loss": 5.5783, + "step": 3918 + }, + { + "epoch": 0.023307403178228187, + "grad_norm": 2.5090553760528564, + "learning_rate": 4.993304516825994e-05, + "loss": 5.6598, + "step": 3919 + }, + { + "epoch": 0.02331335046150918, + "grad_norm": 2.690276622772217, + "learning_rate": 4.993301100110587e-05, + "loss": 5.9688, + "step": 3920 + }, + { + "epoch": 0.02331929774479018, + "grad_norm": 2.559215784072876, + "learning_rate": 4.993297682524794e-05, + "loss": 6.3315, + "step": 3921 + }, + { + "epoch": 0.02332524502807118, + "grad_norm": 2.2800240516662598, + "learning_rate": 4.993294264068617e-05, + "loss": 6.2787, + "step": 3922 + }, + { + "epoch": 0.023331192311352174, + "grad_norm": 2.478898525238037, + "learning_rate": 4.993290844742057e-05, + "loss": 6.1145, + "step": 3923 + }, + { + "epoch": 0.023337139594633172, + "grad_norm": 2.4902184009552, + "learning_rate": 4.993287424545115e-05, + "loss": 6.0665, + "step": 3924 + }, + { + "epoch": 0.023343086877914167, + "grad_norm": 2.4157116413116455, + "learning_rate": 4.9932840034777906e-05, + "loss": 6.1697, + "step": 3925 + }, + { + "epoch": 0.023349034161195166, + "grad_norm": 2.340575933456421, + "learning_rate": 4.993280581540087e-05, + "loss": 6.1121, + "step": 3926 + }, + { + "epoch": 0.023354981444476165, + "grad_norm": 2.586881160736084, + "learning_rate": 4.993277158732006e-05, + "loss": 6.1792, + "step": 3927 + }, + { + "epoch": 0.02336092872775716, + "grad_norm": 2.448880910873413, + "learning_rate": 4.9932737350535476e-05, + "loss": 6.084, + "step": 3928 + }, + { + "epoch": 0.023366876011038158, + "grad_norm": 2.525082588195801, + "learning_rate": 4.993270310504712e-05, + "loss": 5.6726, + "step": 3929 + }, + { + "epoch": 0.023372823294319153, + "grad_norm": 2.310445547103882, + "learning_rate": 4.993266885085503e-05, + "loss": 5.9496, + "step": 3930 + }, + { + "epoch": 0.023378770577600152, + "grad_norm": 2.275416612625122, + "learning_rate": 4.993263458795918e-05, + "loss": 6.0042, + "step": 3931 + }, + { + "epoch": 0.02338471786088115, + "grad_norm": 2.481973648071289, + "learning_rate": 4.993260031635963e-05, + "loss": 5.6177, + "step": 3932 + }, + { + "epoch": 0.023390665144162145, + "grad_norm": 2.439544677734375, + "learning_rate": 4.993256603605635e-05, + "loss": 5.9745, + "step": 3933 + }, + { + "epoch": 0.023396612427443144, + "grad_norm": 2.1909360885620117, + "learning_rate": 4.993253174704937e-05, + "loss": 5.9966, + "step": 3934 + }, + { + "epoch": 0.023402559710724143, + "grad_norm": 2.1893911361694336, + "learning_rate": 4.993249744933871e-05, + "loss": 6.0643, + "step": 3935 + }, + { + "epoch": 0.023408506994005138, + "grad_norm": 3.2023842334747314, + "learning_rate": 4.993246314292437e-05, + "loss": 6.2284, + "step": 3936 + }, + { + "epoch": 0.023414454277286136, + "grad_norm": 2.980842113494873, + "learning_rate": 4.9932428827806356e-05, + "loss": 6.2359, + "step": 3937 + }, + { + "epoch": 0.02342040156056713, + "grad_norm": 2.6659433841705322, + "learning_rate": 4.99323945039847e-05, + "loss": 6.2901, + "step": 3938 + }, + { + "epoch": 0.02342634884384813, + "grad_norm": 2.2173492908477783, + "learning_rate": 4.993236017145939e-05, + "loss": 5.8157, + "step": 3939 + }, + { + "epoch": 0.02343229612712913, + "grad_norm": 2.592771530151367, + "learning_rate": 4.993232583023046e-05, + "loss": 5.7747, + "step": 3940 + }, + { + "epoch": 0.023438243410410124, + "grad_norm": 2.328951835632324, + "learning_rate": 4.9932291480297915e-05, + "loss": 5.7367, + "step": 3941 + }, + { + "epoch": 0.023444190693691122, + "grad_norm": 2.3135616779327393, + "learning_rate": 4.993225712166176e-05, + "loss": 6.0592, + "step": 3942 + }, + { + "epoch": 0.02345013797697212, + "grad_norm": 2.49661922454834, + "learning_rate": 4.993222275432201e-05, + "loss": 5.9737, + "step": 3943 + }, + { + "epoch": 0.023456085260253116, + "grad_norm": 2.6462106704711914, + "learning_rate": 4.9932188378278683e-05, + "loss": 5.7053, + "step": 3944 + }, + { + "epoch": 0.023462032543534114, + "grad_norm": 2.102663516998291, + "learning_rate": 4.993215399353178e-05, + "loss": 5.9006, + "step": 3945 + }, + { + "epoch": 0.02346797982681511, + "grad_norm": 2.474500894546509, + "learning_rate": 4.9932119600081326e-05, + "loss": 6.092, + "step": 3946 + }, + { + "epoch": 0.023473927110096108, + "grad_norm": 2.6023428440093994, + "learning_rate": 4.993208519792732e-05, + "loss": 5.9045, + "step": 3947 + }, + { + "epoch": 0.023479874393377106, + "grad_norm": 2.76432466506958, + "learning_rate": 4.99320507870698e-05, + "loss": 5.8178, + "step": 3948 + }, + { + "epoch": 0.0234858216766581, + "grad_norm": 2.250816822052002, + "learning_rate": 4.993201636750874e-05, + "loss": 5.9091, + "step": 3949 + }, + { + "epoch": 0.0234917689599391, + "grad_norm": 2.1984071731567383, + "learning_rate": 4.993198193924417e-05, + "loss": 5.8804, + "step": 3950 + }, + { + "epoch": 0.0234977162432201, + "grad_norm": 2.5217959880828857, + "learning_rate": 4.993194750227611e-05, + "loss": 5.9879, + "step": 3951 + }, + { + "epoch": 0.023503663526501094, + "grad_norm": 2.080110788345337, + "learning_rate": 4.993191305660456e-05, + "loss": 5.6352, + "step": 3952 + }, + { + "epoch": 0.023509610809782092, + "grad_norm": 2.637500286102295, + "learning_rate": 4.9931878602229545e-05, + "loss": 5.7924, + "step": 3953 + }, + { + "epoch": 0.023515558093063087, + "grad_norm": 2.660531759262085, + "learning_rate": 4.9931844139151056e-05, + "loss": 6.1936, + "step": 3954 + }, + { + "epoch": 0.023521505376344086, + "grad_norm": 2.423699378967285, + "learning_rate": 4.993180966736913e-05, + "loss": 5.8974, + "step": 3955 + }, + { + "epoch": 0.023527452659625085, + "grad_norm": 2.581876277923584, + "learning_rate": 4.993177518688375e-05, + "loss": 5.833, + "step": 3956 + }, + { + "epoch": 0.02353339994290608, + "grad_norm": 2.586538076400757, + "learning_rate": 4.9931740697694965e-05, + "loss": 5.9649, + "step": 3957 + }, + { + "epoch": 0.023539347226187078, + "grad_norm": 2.5123441219329834, + "learning_rate": 4.993170619980276e-05, + "loss": 6.1251, + "step": 3958 + }, + { + "epoch": 0.023545294509468077, + "grad_norm": 3.076904535293579, + "learning_rate": 4.993167169320715e-05, + "loss": 5.9559, + "step": 3959 + }, + { + "epoch": 0.023551241792749072, + "grad_norm": 2.572312593460083, + "learning_rate": 4.9931637177908153e-05, + "loss": 6.0291, + "step": 3960 + }, + { + "epoch": 0.02355718907603007, + "grad_norm": 1.9910492897033691, + "learning_rate": 4.9931602653905776e-05, + "loss": 5.8413, + "step": 3961 + }, + { + "epoch": 0.023563136359311065, + "grad_norm": 2.530710458755493, + "learning_rate": 4.993156812120004e-05, + "loss": 6.1217, + "step": 3962 + }, + { + "epoch": 0.023569083642592064, + "grad_norm": 2.3089046478271484, + "learning_rate": 4.993153357979095e-05, + "loss": 5.822, + "step": 3963 + }, + { + "epoch": 0.023575030925873063, + "grad_norm": 2.8980624675750732, + "learning_rate": 4.993149902967852e-05, + "loss": 6.3906, + "step": 3964 + }, + { + "epoch": 0.023580978209154058, + "grad_norm": 2.2176012992858887, + "learning_rate": 4.993146447086275e-05, + "loss": 5.9259, + "step": 3965 + }, + { + "epoch": 0.023586925492435056, + "grad_norm": 2.01096773147583, + "learning_rate": 4.993142990334367e-05, + "loss": 6.3141, + "step": 3966 + }, + { + "epoch": 0.02359287277571605, + "grad_norm": 3.4096288681030273, + "learning_rate": 4.993139532712129e-05, + "loss": 6.3165, + "step": 3967 + }, + { + "epoch": 0.02359882005899705, + "grad_norm": 2.20595645904541, + "learning_rate": 4.9931360742195623e-05, + "loss": 6.016, + "step": 3968 + }, + { + "epoch": 0.02360476734227805, + "grad_norm": 3.543301820755005, + "learning_rate": 4.993132614856666e-05, + "loss": 5.722, + "step": 3969 + }, + { + "epoch": 0.023610714625559043, + "grad_norm": 2.82092547416687, + "learning_rate": 4.993129154623444e-05, + "loss": 5.8217, + "step": 3970 + }, + { + "epoch": 0.023616661908840042, + "grad_norm": 2.4585440158843994, + "learning_rate": 4.9931256935198954e-05, + "loss": 6.3298, + "step": 3971 + }, + { + "epoch": 0.02362260919212104, + "grad_norm": 2.104340076446533, + "learning_rate": 4.993122231546024e-05, + "loss": 5.9174, + "step": 3972 + }, + { + "epoch": 0.023628556475402036, + "grad_norm": 2.5130183696746826, + "learning_rate": 4.993118768701828e-05, + "loss": 6.3075, + "step": 3973 + }, + { + "epoch": 0.023634503758683034, + "grad_norm": 2.4567196369171143, + "learning_rate": 4.99311530498731e-05, + "loss": 6.0088, + "step": 3974 + }, + { + "epoch": 0.02364045104196403, + "grad_norm": 2.5174858570098877, + "learning_rate": 4.993111840402471e-05, + "loss": 6.6739, + "step": 3975 + }, + { + "epoch": 0.023646398325245028, + "grad_norm": 2.0032241344451904, + "learning_rate": 4.9931083749473136e-05, + "loss": 5.7052, + "step": 3976 + }, + { + "epoch": 0.023652345608526026, + "grad_norm": 2.9536757469177246, + "learning_rate": 4.993104908621837e-05, + "loss": 5.415, + "step": 3977 + }, + { + "epoch": 0.02365829289180702, + "grad_norm": 2.6650888919830322, + "learning_rate": 4.9931014414260435e-05, + "loss": 5.4333, + "step": 3978 + }, + { + "epoch": 0.02366424017508802, + "grad_norm": 2.3574490547180176, + "learning_rate": 4.9930979733599334e-05, + "loss": 5.5802, + "step": 3979 + }, + { + "epoch": 0.02367018745836902, + "grad_norm": 2.855534791946411, + "learning_rate": 4.99309450442351e-05, + "loss": 5.5131, + "step": 3980 + }, + { + "epoch": 0.023676134741650014, + "grad_norm": 2.430943727493286, + "learning_rate": 4.993091034616772e-05, + "loss": 6.2497, + "step": 3981 + }, + { + "epoch": 0.023682082024931012, + "grad_norm": 2.1671106815338135, + "learning_rate": 4.993087563939722e-05, + "loss": 5.9994, + "step": 3982 + }, + { + "epoch": 0.023688029308212007, + "grad_norm": 2.3268723487854004, + "learning_rate": 4.9930840923923606e-05, + "loss": 5.4779, + "step": 3983 + }, + { + "epoch": 0.023693976591493006, + "grad_norm": 2.3953616619110107, + "learning_rate": 4.993080619974689e-05, + "loss": 5.4044, + "step": 3984 + }, + { + "epoch": 0.023699923874774004, + "grad_norm": 2.043724775314331, + "learning_rate": 4.993077146686709e-05, + "loss": 5.6252, + "step": 3985 + }, + { + "epoch": 0.023705871158055, + "grad_norm": 2.5629520416259766, + "learning_rate": 4.9930736725284224e-05, + "loss": 5.1765, + "step": 3986 + }, + { + "epoch": 0.023711818441335998, + "grad_norm": 2.2148349285125732, + "learning_rate": 4.993070197499828e-05, + "loss": 5.5452, + "step": 3987 + }, + { + "epoch": 0.023717765724616997, + "grad_norm": 2.3913650512695312, + "learning_rate": 4.9930667216009295e-05, + "loss": 6.0882, + "step": 3988 + }, + { + "epoch": 0.02372371300789799, + "grad_norm": 2.619607925415039, + "learning_rate": 4.993063244831727e-05, + "loss": 6.4482, + "step": 3989 + }, + { + "epoch": 0.02372966029117899, + "grad_norm": 2.0585055351257324, + "learning_rate": 4.993059767192222e-05, + "loss": 6.0467, + "step": 3990 + }, + { + "epoch": 0.023735607574459985, + "grad_norm": 2.3380227088928223, + "learning_rate": 4.993056288682416e-05, + "loss": 5.9382, + "step": 3991 + }, + { + "epoch": 0.023741554857740984, + "grad_norm": 2.7252683639526367, + "learning_rate": 4.9930528093023085e-05, + "loss": 6.0444, + "step": 3992 + }, + { + "epoch": 0.023747502141021982, + "grad_norm": 2.333296060562134, + "learning_rate": 4.993049329051903e-05, + "loss": 5.6614, + "step": 3993 + }, + { + "epoch": 0.023753449424302978, + "grad_norm": 2.3571507930755615, + "learning_rate": 4.9930458479312e-05, + "loss": 6.328, + "step": 3994 + }, + { + "epoch": 0.023759396707583976, + "grad_norm": 2.7106499671936035, + "learning_rate": 4.9930423659402005e-05, + "loss": 6.0347, + "step": 3995 + }, + { + "epoch": 0.02376534399086497, + "grad_norm": 3.000009298324585, + "learning_rate": 4.9930388830789043e-05, + "loss": 5.5511, + "step": 3996 + }, + { + "epoch": 0.02377129127414597, + "grad_norm": 2.787912130355835, + "learning_rate": 4.993035399347316e-05, + "loss": 5.2059, + "step": 3997 + }, + { + "epoch": 0.02377723855742697, + "grad_norm": 2.7351326942443848, + "learning_rate": 4.993031914745433e-05, + "loss": 5.2997, + "step": 3998 + }, + { + "epoch": 0.023783185840707963, + "grad_norm": 2.770566701889038, + "learning_rate": 4.993028429273259e-05, + "loss": 5.8871, + "step": 3999 + }, + { + "epoch": 0.023789133123988962, + "grad_norm": 2.9528706073760986, + "learning_rate": 4.993024942930794e-05, + "loss": 5.8177, + "step": 4000 + }, + { + "epoch": 0.02379508040726996, + "grad_norm": 2.543329954147339, + "learning_rate": 4.993021455718041e-05, + "loss": 5.6446, + "step": 4001 + }, + { + "epoch": 0.023801027690550956, + "grad_norm": 2.7284936904907227, + "learning_rate": 4.993017967634999e-05, + "loss": 5.8404, + "step": 4002 + }, + { + "epoch": 0.023806974973831954, + "grad_norm": 2.752187728881836, + "learning_rate": 4.99301447868167e-05, + "loss": 5.6959, + "step": 4003 + }, + { + "epoch": 0.02381292225711295, + "grad_norm": 2.86651611328125, + "learning_rate": 4.993010988858056e-05, + "loss": 5.6329, + "step": 4004 + }, + { + "epoch": 0.023818869540393948, + "grad_norm": 3.9363176822662354, + "learning_rate": 4.9930074981641574e-05, + "loss": 5.31, + "step": 4005 + }, + { + "epoch": 0.023824816823674946, + "grad_norm": 3.41188907623291, + "learning_rate": 4.9930040065999764e-05, + "loss": 5.9905, + "step": 4006 + }, + { + "epoch": 0.02383076410695594, + "grad_norm": 3.4761459827423096, + "learning_rate": 4.9930005141655125e-05, + "loss": 6.0575, + "step": 4007 + }, + { + "epoch": 0.02383671139023694, + "grad_norm": 3.1562440395355225, + "learning_rate": 4.992997020860768e-05, + "loss": 5.9915, + "step": 4008 + }, + { + "epoch": 0.02384265867351794, + "grad_norm": 2.884049415588379, + "learning_rate": 4.992993526685744e-05, + "loss": 5.8051, + "step": 4009 + }, + { + "epoch": 0.023848605956798934, + "grad_norm": 3.3188138008117676, + "learning_rate": 4.992990031640442e-05, + "loss": 5.9637, + "step": 4010 + }, + { + "epoch": 0.023854553240079932, + "grad_norm": 3.2048282623291016, + "learning_rate": 4.992986535724862e-05, + "loss": 6.631, + "step": 4011 + }, + { + "epoch": 0.023860500523360927, + "grad_norm": 2.80204701423645, + "learning_rate": 4.992983038939008e-05, + "loss": 6.0063, + "step": 4012 + }, + { + "epoch": 0.023866447806641926, + "grad_norm": 2.993398427963257, + "learning_rate": 4.992979541282877e-05, + "loss": 5.9778, + "step": 4013 + }, + { + "epoch": 0.023872395089922924, + "grad_norm": 2.7519168853759766, + "learning_rate": 4.9929760427564744e-05, + "loss": 6.4272, + "step": 4014 + }, + { + "epoch": 0.02387834237320392, + "grad_norm": 2.9606168270111084, + "learning_rate": 4.992972543359799e-05, + "loss": 5.5372, + "step": 4015 + }, + { + "epoch": 0.023884289656484918, + "grad_norm": 2.1724514961242676, + "learning_rate": 4.992969043092853e-05, + "loss": 6.3115, + "step": 4016 + }, + { + "epoch": 0.023890236939765917, + "grad_norm": 2.1742191314697266, + "learning_rate": 4.9929655419556365e-05, + "loss": 6.5097, + "step": 4017 + }, + { + "epoch": 0.02389618422304691, + "grad_norm": 1.9729878902435303, + "learning_rate": 4.9929620399481526e-05, + "loss": 6.7061, + "step": 4018 + }, + { + "epoch": 0.02390213150632791, + "grad_norm": 2.6273725032806396, + "learning_rate": 4.9929585370704e-05, + "loss": 6.2838, + "step": 4019 + }, + { + "epoch": 0.023908078789608905, + "grad_norm": 2.5495283603668213, + "learning_rate": 4.9929550333223826e-05, + "loss": 6.1175, + "step": 4020 + }, + { + "epoch": 0.023914026072889904, + "grad_norm": 2.50193452835083, + "learning_rate": 4.9929515287041e-05, + "loss": 5.7689, + "step": 4021 + }, + { + "epoch": 0.023919973356170902, + "grad_norm": 2.402991771697998, + "learning_rate": 4.992948023215553e-05, + "loss": 6.4222, + "step": 4022 + }, + { + "epoch": 0.023925920639451898, + "grad_norm": 2.1722981929779053, + "learning_rate": 4.9929445168567444e-05, + "loss": 6.2335, + "step": 4023 + }, + { + "epoch": 0.023931867922732896, + "grad_norm": 1.6895688772201538, + "learning_rate": 4.992941009627675e-05, + "loss": 6.163, + "step": 4024 + }, + { + "epoch": 0.02393781520601389, + "grad_norm": 1.9944639205932617, + "learning_rate": 4.992937501528345e-05, + "loss": 6.2622, + "step": 4025 + }, + { + "epoch": 0.02394376248929489, + "grad_norm": 2.6157150268554688, + "learning_rate": 4.9929339925587565e-05, + "loss": 6.4582, + "step": 4026 + }, + { + "epoch": 0.023949709772575888, + "grad_norm": 2.021772623062134, + "learning_rate": 4.992930482718911e-05, + "loss": 6.2921, + "step": 4027 + }, + { + "epoch": 0.023955657055856883, + "grad_norm": 2.465402603149414, + "learning_rate": 4.992926972008808e-05, + "loss": 6.6426, + "step": 4028 + }, + { + "epoch": 0.023961604339137882, + "grad_norm": 2.337763547897339, + "learning_rate": 4.99292346042845e-05, + "loss": 6.4988, + "step": 4029 + }, + { + "epoch": 0.02396755162241888, + "grad_norm": 2.400064706802368, + "learning_rate": 4.9929199479778394e-05, + "loss": 6.6666, + "step": 4030 + }, + { + "epoch": 0.023973498905699876, + "grad_norm": 2.4205784797668457, + "learning_rate": 4.9929164346569756e-05, + "loss": 5.8805, + "step": 4031 + }, + { + "epoch": 0.023979446188980874, + "grad_norm": 2.312434673309326, + "learning_rate": 4.9929129204658605e-05, + "loss": 6.5161, + "step": 4032 + }, + { + "epoch": 0.02398539347226187, + "grad_norm": 2.02748966217041, + "learning_rate": 4.9929094054044944e-05, + "loss": 6.1272, + "step": 4033 + }, + { + "epoch": 0.023991340755542868, + "grad_norm": 2.280242443084717, + "learning_rate": 4.992905889472881e-05, + "loss": 5.7217, + "step": 4034 + }, + { + "epoch": 0.023997288038823866, + "grad_norm": 2.3911778926849365, + "learning_rate": 4.992902372671019e-05, + "loss": 5.7441, + "step": 4035 + }, + { + "epoch": 0.02400323532210486, + "grad_norm": 2.1767921447753906, + "learning_rate": 4.99289885499891e-05, + "loss": 5.7212, + "step": 4036 + }, + { + "epoch": 0.02400918260538586, + "grad_norm": 2.3067142963409424, + "learning_rate": 4.992895336456557e-05, + "loss": 5.6689, + "step": 4037 + }, + { + "epoch": 0.02401512988866686, + "grad_norm": 2.1564273834228516, + "learning_rate": 4.992891817043959e-05, + "loss": 6.1445, + "step": 4038 + }, + { + "epoch": 0.024021077171947854, + "grad_norm": 2.4852945804595947, + "learning_rate": 4.9928882967611184e-05, + "loss": 6.1883, + "step": 4039 + }, + { + "epoch": 0.024027024455228852, + "grad_norm": 2.9280812740325928, + "learning_rate": 4.992884775608036e-05, + "loss": 6.097, + "step": 4040 + }, + { + "epoch": 0.024032971738509847, + "grad_norm": 2.3219356536865234, + "learning_rate": 4.992881253584714e-05, + "loss": 6.3163, + "step": 4041 + }, + { + "epoch": 0.024038919021790846, + "grad_norm": 2.672386884689331, + "learning_rate": 4.9928777306911525e-05, + "loss": 5.9615, + "step": 4042 + }, + { + "epoch": 0.024044866305071844, + "grad_norm": 2.5886473655700684, + "learning_rate": 4.992874206927353e-05, + "loss": 6.0114, + "step": 4043 + }, + { + "epoch": 0.02405081358835284, + "grad_norm": 2.991230010986328, + "learning_rate": 4.992870682293318e-05, + "loss": 5.6805, + "step": 4044 + }, + { + "epoch": 0.024056760871633838, + "grad_norm": 2.3270034790039062, + "learning_rate": 4.9928671567890464e-05, + "loss": 5.7503, + "step": 4045 + }, + { + "epoch": 0.024062708154914837, + "grad_norm": 2.591627359390259, + "learning_rate": 4.99286363041454e-05, + "loss": 5.5707, + "step": 4046 + }, + { + "epoch": 0.02406865543819583, + "grad_norm": 2.1936891078948975, + "learning_rate": 4.992860103169802e-05, + "loss": 5.6503, + "step": 4047 + }, + { + "epoch": 0.02407460272147683, + "grad_norm": 2.2928214073181152, + "learning_rate": 4.992856575054832e-05, + "loss": 5.6067, + "step": 4048 + }, + { + "epoch": 0.024080550004757825, + "grad_norm": 2.4503591060638428, + "learning_rate": 4.992853046069632e-05, + "loss": 6.0067, + "step": 4049 + }, + { + "epoch": 0.024086497288038824, + "grad_norm": 2.84260630607605, + "learning_rate": 4.992849516214202e-05, + "loss": 6.4533, + "step": 4050 + }, + { + "epoch": 0.024092444571319822, + "grad_norm": 2.7172651290893555, + "learning_rate": 4.992845985488543e-05, + "loss": 6.4901, + "step": 4051 + }, + { + "epoch": 0.024098391854600817, + "grad_norm": 2.2101316452026367, + "learning_rate": 4.992842453892659e-05, + "loss": 6.3481, + "step": 4052 + }, + { + "epoch": 0.024104339137881816, + "grad_norm": 2.488199234008789, + "learning_rate": 4.992838921426549e-05, + "loss": 6.4893, + "step": 4053 + }, + { + "epoch": 0.02411028642116281, + "grad_norm": 2.3767058849334717, + "learning_rate": 4.992835388090215e-05, + "loss": 5.9828, + "step": 4054 + }, + { + "epoch": 0.02411623370444381, + "grad_norm": 2.3979814052581787, + "learning_rate": 4.992831853883657e-05, + "loss": 5.7607, + "step": 4055 + }, + { + "epoch": 0.024122180987724808, + "grad_norm": 2.766644239425659, + "learning_rate": 4.992828318806877e-05, + "loss": 5.523, + "step": 4056 + }, + { + "epoch": 0.024128128271005803, + "grad_norm": 3.3954427242279053, + "learning_rate": 4.9928247828598775e-05, + "loss": 6.1247, + "step": 4057 + }, + { + "epoch": 0.024134075554286802, + "grad_norm": 3.5597097873687744, + "learning_rate": 4.9928212460426585e-05, + "loss": 6.0877, + "step": 4058 + }, + { + "epoch": 0.0241400228375678, + "grad_norm": 2.8089418411254883, + "learning_rate": 4.992817708355221e-05, + "loss": 5.324, + "step": 4059 + }, + { + "epoch": 0.024145970120848795, + "grad_norm": 2.6756842136383057, + "learning_rate": 4.992814169797566e-05, + "loss": 5.5516, + "step": 4060 + }, + { + "epoch": 0.024151917404129794, + "grad_norm": 2.1218929290771484, + "learning_rate": 4.992810630369696e-05, + "loss": 6.102, + "step": 4061 + }, + { + "epoch": 0.02415786468741079, + "grad_norm": 2.7189652919769287, + "learning_rate": 4.992807090071611e-05, + "loss": 6.4258, + "step": 4062 + }, + { + "epoch": 0.024163811970691788, + "grad_norm": 2.4340744018554688, + "learning_rate": 4.992803548903313e-05, + "loss": 5.8059, + "step": 4063 + }, + { + "epoch": 0.024169759253972786, + "grad_norm": 2.46604323387146, + "learning_rate": 4.992800006864804e-05, + "loss": 5.8963, + "step": 4064 + }, + { + "epoch": 0.02417570653725378, + "grad_norm": 2.1969218254089355, + "learning_rate": 4.9927964639560835e-05, + "loss": 5.7835, + "step": 4065 + }, + { + "epoch": 0.02418165382053478, + "grad_norm": 2.4529223442077637, + "learning_rate": 4.9927929201771535e-05, + "loss": 6.3405, + "step": 4066 + }, + { + "epoch": 0.02418760110381578, + "grad_norm": 2.145331859588623, + "learning_rate": 4.992789375528015e-05, + "loss": 6.14, + "step": 4067 + }, + { + "epoch": 0.024193548387096774, + "grad_norm": 2.212646961212158, + "learning_rate": 4.99278583000867e-05, + "loss": 5.8793, + "step": 4068 + }, + { + "epoch": 0.024199495670377772, + "grad_norm": 2.3249876499176025, + "learning_rate": 4.992782283619118e-05, + "loss": 5.8702, + "step": 4069 + }, + { + "epoch": 0.024205442953658767, + "grad_norm": 2.180964946746826, + "learning_rate": 4.9927787363593634e-05, + "loss": 6.216, + "step": 4070 + }, + { + "epoch": 0.024211390236939766, + "grad_norm": 2.5633153915405273, + "learning_rate": 4.992775188229405e-05, + "loss": 6.031, + "step": 4071 + }, + { + "epoch": 0.024217337520220764, + "grad_norm": 2.867342233657837, + "learning_rate": 4.992771639229244e-05, + "loss": 5.9853, + "step": 4072 + }, + { + "epoch": 0.02422328480350176, + "grad_norm": 2.111253023147583, + "learning_rate": 4.992768089358882e-05, + "loss": 5.8404, + "step": 4073 + }, + { + "epoch": 0.024229232086782758, + "grad_norm": 1.9325549602508545, + "learning_rate": 4.992764538618321e-05, + "loss": 6.0175, + "step": 4074 + }, + { + "epoch": 0.024235179370063756, + "grad_norm": 2.721740484237671, + "learning_rate": 4.992760987007561e-05, + "loss": 5.9274, + "step": 4075 + }, + { + "epoch": 0.02424112665334475, + "grad_norm": 3.5240588188171387, + "learning_rate": 4.992757434526604e-05, + "loss": 5.3593, + "step": 4076 + }, + { + "epoch": 0.02424707393662575, + "grad_norm": 2.744248867034912, + "learning_rate": 4.9927538811754516e-05, + "loss": 5.8938, + "step": 4077 + }, + { + "epoch": 0.024253021219906745, + "grad_norm": 2.545384645462036, + "learning_rate": 4.992750326954104e-05, + "loss": 6.2127, + "step": 4078 + }, + { + "epoch": 0.024258968503187744, + "grad_norm": 2.7550806999206543, + "learning_rate": 4.992746771862563e-05, + "loss": 6.0784, + "step": 4079 + }, + { + "epoch": 0.024264915786468742, + "grad_norm": 2.408040761947632, + "learning_rate": 4.9927432159008305e-05, + "loss": 5.5908, + "step": 4080 + }, + { + "epoch": 0.024270863069749737, + "grad_norm": 2.581378698348999, + "learning_rate": 4.9927396590689066e-05, + "loss": 5.4438, + "step": 4081 + }, + { + "epoch": 0.024276810353030736, + "grad_norm": 2.4320218563079834, + "learning_rate": 4.992736101366794e-05, + "loss": 5.6239, + "step": 4082 + }, + { + "epoch": 0.024282757636311735, + "grad_norm": 2.4725472927093506, + "learning_rate": 4.992732542794492e-05, + "loss": 6.237, + "step": 4083 + }, + { + "epoch": 0.02428870491959273, + "grad_norm": 2.3081839084625244, + "learning_rate": 4.992728983352003e-05, + "loss": 5.9917, + "step": 4084 + }, + { + "epoch": 0.024294652202873728, + "grad_norm": 1.9090701341629028, + "learning_rate": 4.9927254230393287e-05, + "loss": 5.9125, + "step": 4085 + }, + { + "epoch": 0.024300599486154723, + "grad_norm": 2.3943240642547607, + "learning_rate": 4.992721861856468e-05, + "loss": 5.3431, + "step": 4086 + }, + { + "epoch": 0.024306546769435722, + "grad_norm": 2.226968765258789, + "learning_rate": 4.992718299803425e-05, + "loss": 5.4328, + "step": 4087 + }, + { + "epoch": 0.02431249405271672, + "grad_norm": 2.238218307495117, + "learning_rate": 4.9927147368801994e-05, + "loss": 5.4877, + "step": 4088 + }, + { + "epoch": 0.024318441335997715, + "grad_norm": 2.216540575027466, + "learning_rate": 4.992711173086794e-05, + "loss": 5.4037, + "step": 4089 + }, + { + "epoch": 0.024324388619278714, + "grad_norm": 2.3136301040649414, + "learning_rate": 4.992707608423208e-05, + "loss": 5.4576, + "step": 4090 + }, + { + "epoch": 0.02433033590255971, + "grad_norm": 2.0434980392456055, + "learning_rate": 4.9927040428894436e-05, + "loss": 5.8044, + "step": 4091 + }, + { + "epoch": 0.024336283185840708, + "grad_norm": 2.7837064266204834, + "learning_rate": 4.992700476485502e-05, + "loss": 6.4183, + "step": 4092 + }, + { + "epoch": 0.024342230469121706, + "grad_norm": 2.580411195755005, + "learning_rate": 4.992696909211384e-05, + "loss": 5.4545, + "step": 4093 + }, + { + "epoch": 0.0243481777524027, + "grad_norm": 2.1215696334838867, + "learning_rate": 4.9926933410670916e-05, + "loss": 5.5629, + "step": 4094 + }, + { + "epoch": 0.0243541250356837, + "grad_norm": 1.9621074199676514, + "learning_rate": 4.992689772052626e-05, + "loss": 5.5248, + "step": 4095 + }, + { + "epoch": 0.0243600723189647, + "grad_norm": 2.1773006916046143, + "learning_rate": 4.992686202167988e-05, + "loss": 5.3285, + "step": 4096 + }, + { + "epoch": 0.024366019602245693, + "grad_norm": 1.9506359100341797, + "learning_rate": 4.992682631413179e-05, + "loss": 5.7989, + "step": 4097 + }, + { + "epoch": 0.024371966885526692, + "grad_norm": 1.9154741764068604, + "learning_rate": 4.9926790597882e-05, + "loss": 5.6029, + "step": 4098 + }, + { + "epoch": 0.024377914168807687, + "grad_norm": 2.2147481441497803, + "learning_rate": 4.9926754872930524e-05, + "loss": 5.5406, + "step": 4099 + }, + { + "epoch": 0.024383861452088686, + "grad_norm": 2.1268460750579834, + "learning_rate": 4.992671913927738e-05, + "loss": 5.6434, + "step": 4100 + }, + { + "epoch": 0.024389808735369684, + "grad_norm": 2.1212456226348877, + "learning_rate": 4.992668339692258e-05, + "loss": 5.6888, + "step": 4101 + }, + { + "epoch": 0.02439575601865068, + "grad_norm": 2.2292001247406006, + "learning_rate": 4.992664764586612e-05, + "loss": 5.3982, + "step": 4102 + }, + { + "epoch": 0.024401703301931678, + "grad_norm": 2.2713210582733154, + "learning_rate": 4.9926611886108035e-05, + "loss": 5.3521, + "step": 4103 + }, + { + "epoch": 0.024407650585212676, + "grad_norm": 2.273437738418579, + "learning_rate": 4.9926576117648314e-05, + "loss": 5.474, + "step": 4104 + }, + { + "epoch": 0.02441359786849367, + "grad_norm": 2.2879083156585693, + "learning_rate": 4.9926540340487e-05, + "loss": 5.4474, + "step": 4105 + }, + { + "epoch": 0.02441954515177467, + "grad_norm": 2.2517430782318115, + "learning_rate": 4.992650455462408e-05, + "loss": 5.5013, + "step": 4106 + }, + { + "epoch": 0.024425492435055665, + "grad_norm": 2.1391677856445312, + "learning_rate": 4.992646876005957e-05, + "loss": 5.3899, + "step": 4107 + }, + { + "epoch": 0.024431439718336664, + "grad_norm": 2.2989962100982666, + "learning_rate": 4.9926432956793494e-05, + "loss": 5.7995, + "step": 4108 + }, + { + "epoch": 0.024437387001617662, + "grad_norm": 2.550706386566162, + "learning_rate": 4.992639714482586e-05, + "loss": 5.6599, + "step": 4109 + }, + { + "epoch": 0.024443334284898657, + "grad_norm": 2.321398973464966, + "learning_rate": 4.992636132415667e-05, + "loss": 5.6852, + "step": 4110 + }, + { + "epoch": 0.024449281568179656, + "grad_norm": 2.300795555114746, + "learning_rate": 4.992632549478595e-05, + "loss": 5.7318, + "step": 4111 + }, + { + "epoch": 0.024455228851460654, + "grad_norm": 2.229156970977783, + "learning_rate": 4.992628965671371e-05, + "loss": 5.6617, + "step": 4112 + }, + { + "epoch": 0.02446117613474165, + "grad_norm": 2.253934144973755, + "learning_rate": 4.992625380993995e-05, + "loss": 5.5762, + "step": 4113 + }, + { + "epoch": 0.024467123418022648, + "grad_norm": 2.0932998657226562, + "learning_rate": 4.992621795446471e-05, + "loss": 5.568, + "step": 4114 + }, + { + "epoch": 0.024473070701303643, + "grad_norm": 2.5969886779785156, + "learning_rate": 4.9926182090287966e-05, + "loss": 5.6626, + "step": 4115 + }, + { + "epoch": 0.02447901798458464, + "grad_norm": 2.5260698795318604, + "learning_rate": 4.992614621740976e-05, + "loss": 5.6333, + "step": 4116 + }, + { + "epoch": 0.02448496526786564, + "grad_norm": 2.0017902851104736, + "learning_rate": 4.992611033583009e-05, + "loss": 5.793, + "step": 4117 + }, + { + "epoch": 0.024490912551146635, + "grad_norm": 2.1847705841064453, + "learning_rate": 4.992607444554898e-05, + "loss": 5.8348, + "step": 4118 + }, + { + "epoch": 0.024496859834427634, + "grad_norm": 2.141007900238037, + "learning_rate": 4.992603854656642e-05, + "loss": 5.7835, + "step": 4119 + }, + { + "epoch": 0.02450280711770863, + "grad_norm": 2.294605255126953, + "learning_rate": 4.992600263888245e-05, + "loss": 5.6615, + "step": 4120 + }, + { + "epoch": 0.024508754400989628, + "grad_norm": 2.433936357498169, + "learning_rate": 4.9925966722497064e-05, + "loss": 5.6479, + "step": 4121 + }, + { + "epoch": 0.024514701684270626, + "grad_norm": 2.1522979736328125, + "learning_rate": 4.992593079741028e-05, + "loss": 5.5761, + "step": 4122 + }, + { + "epoch": 0.02452064896755162, + "grad_norm": 2.141065835952759, + "learning_rate": 4.9925894863622114e-05, + "loss": 5.602, + "step": 4123 + }, + { + "epoch": 0.02452659625083262, + "grad_norm": 2.187838554382324, + "learning_rate": 4.9925858921132576e-05, + "loss": 5.6337, + "step": 4124 + }, + { + "epoch": 0.02453254353411362, + "grad_norm": 2.303027629852295, + "learning_rate": 4.992582296994167e-05, + "loss": 5.6126, + "step": 4125 + }, + { + "epoch": 0.024538490817394613, + "grad_norm": 1.9233589172363281, + "learning_rate": 4.992578701004943e-05, + "loss": 5.5852, + "step": 4126 + }, + { + "epoch": 0.024544438100675612, + "grad_norm": 2.0383386611938477, + "learning_rate": 4.992575104145585e-05, + "loss": 5.6477, + "step": 4127 + }, + { + "epoch": 0.024550385383956607, + "grad_norm": 2.2752933502197266, + "learning_rate": 4.9925715064160946e-05, + "loss": 5.6263, + "step": 4128 + }, + { + "epoch": 0.024556332667237606, + "grad_norm": 2.400083541870117, + "learning_rate": 4.9925679078164734e-05, + "loss": 5.5249, + "step": 4129 + }, + { + "epoch": 0.024562279950518604, + "grad_norm": 2.167365312576294, + "learning_rate": 4.992564308346722e-05, + "loss": 5.7299, + "step": 4130 + }, + { + "epoch": 0.0245682272337996, + "grad_norm": 1.9696096181869507, + "learning_rate": 4.9925607080068426e-05, + "loss": 5.7961, + "step": 4131 + }, + { + "epoch": 0.024574174517080598, + "grad_norm": 2.1817007064819336, + "learning_rate": 4.992557106796836e-05, + "loss": 5.7973, + "step": 4132 + }, + { + "epoch": 0.024580121800361596, + "grad_norm": 2.4329075813293457, + "learning_rate": 4.992553504716704e-05, + "loss": 6.2428, + "step": 4133 + }, + { + "epoch": 0.02458606908364259, + "grad_norm": 2.159193754196167, + "learning_rate": 4.9925499017664464e-05, + "loss": 5.5784, + "step": 4134 + }, + { + "epoch": 0.02459201636692359, + "grad_norm": 2.2614853382110596, + "learning_rate": 4.992546297946066e-05, + "loss": 5.7572, + "step": 4135 + }, + { + "epoch": 0.024597963650204585, + "grad_norm": 2.2874412536621094, + "learning_rate": 4.992542693255563e-05, + "loss": 5.5726, + "step": 4136 + }, + { + "epoch": 0.024603910933485584, + "grad_norm": 2.1634466648101807, + "learning_rate": 4.992539087694939e-05, + "loss": 5.5112, + "step": 4137 + }, + { + "epoch": 0.024609858216766582, + "grad_norm": 2.195528507232666, + "learning_rate": 4.9925354812641955e-05, + "loss": 5.6073, + "step": 4138 + }, + { + "epoch": 0.024615805500047577, + "grad_norm": 2.0328054428100586, + "learning_rate": 4.992531873963334e-05, + "loss": 5.5686, + "step": 4139 + }, + { + "epoch": 0.024621752783328576, + "grad_norm": 2.244218349456787, + "learning_rate": 4.992528265792355e-05, + "loss": 5.6871, + "step": 4140 + }, + { + "epoch": 0.024627700066609574, + "grad_norm": 2.081721544265747, + "learning_rate": 4.992524656751261e-05, + "loss": 5.5327, + "step": 4141 + }, + { + "epoch": 0.02463364734989057, + "grad_norm": 1.9305940866470337, + "learning_rate": 4.992521046840051e-05, + "loss": 5.5265, + "step": 4142 + }, + { + "epoch": 0.024639594633171568, + "grad_norm": 2.624286651611328, + "learning_rate": 4.992517436058728e-05, + "loss": 5.3881, + "step": 4143 + }, + { + "epoch": 0.024645541916452563, + "grad_norm": 2.204803705215454, + "learning_rate": 4.9925138244072935e-05, + "loss": 5.6686, + "step": 4144 + }, + { + "epoch": 0.02465148919973356, + "grad_norm": 2.4664852619171143, + "learning_rate": 4.992510211885748e-05, + "loss": 5.3152, + "step": 4145 + }, + { + "epoch": 0.02465743648301456, + "grad_norm": 2.3428542613983154, + "learning_rate": 4.992506598494093e-05, + "loss": 5.5875, + "step": 4146 + }, + { + "epoch": 0.024663383766295555, + "grad_norm": 2.1902847290039062, + "learning_rate": 4.992502984232329e-05, + "loss": 5.4826, + "step": 4147 + }, + { + "epoch": 0.024669331049576554, + "grad_norm": 2.0401039123535156, + "learning_rate": 4.992499369100459e-05, + "loss": 5.518, + "step": 4148 + }, + { + "epoch": 0.02467527833285755, + "grad_norm": 2.5250306129455566, + "learning_rate": 4.9924957530984825e-05, + "loss": 5.5744, + "step": 4149 + }, + { + "epoch": 0.024681225616138548, + "grad_norm": 1.9975959062576294, + "learning_rate": 4.9924921362264016e-05, + "loss": 5.6834, + "step": 4150 + }, + { + "epoch": 0.024687172899419546, + "grad_norm": 2.047011375427246, + "learning_rate": 4.992488518484217e-05, + "loss": 5.6703, + "step": 4151 + }, + { + "epoch": 0.02469312018270054, + "grad_norm": 2.142411470413208, + "learning_rate": 4.9924848998719314e-05, + "loss": 5.781, + "step": 4152 + }, + { + "epoch": 0.02469906746598154, + "grad_norm": 2.1012768745422363, + "learning_rate": 4.992481280389545e-05, + "loss": 5.618, + "step": 4153 + }, + { + "epoch": 0.024705014749262538, + "grad_norm": 2.4698173999786377, + "learning_rate": 4.9924776600370584e-05, + "loss": 6.4773, + "step": 4154 + }, + { + "epoch": 0.024710962032543533, + "grad_norm": 2.4975368976593018, + "learning_rate": 4.992474038814474e-05, + "loss": 5.2568, + "step": 4155 + }, + { + "epoch": 0.024716909315824532, + "grad_norm": 1.8329259157180786, + "learning_rate": 4.992470416721793e-05, + "loss": 5.775, + "step": 4156 + }, + { + "epoch": 0.024722856599105527, + "grad_norm": 1.9757754802703857, + "learning_rate": 4.992466793759015e-05, + "loss": 5.5408, + "step": 4157 + }, + { + "epoch": 0.024728803882386526, + "grad_norm": 1.8300005197525024, + "learning_rate": 4.9924631699261434e-05, + "loss": 5.5356, + "step": 4158 + }, + { + "epoch": 0.024734751165667524, + "grad_norm": 2.099102735519409, + "learning_rate": 4.992459545223179e-05, + "loss": 5.6811, + "step": 4159 + }, + { + "epoch": 0.02474069844894852, + "grad_norm": 2.000169277191162, + "learning_rate": 4.992455919650123e-05, + "loss": 5.511, + "step": 4160 + }, + { + "epoch": 0.024746645732229518, + "grad_norm": 2.0555150508880615, + "learning_rate": 4.992452293206976e-05, + "loss": 5.7553, + "step": 4161 + }, + { + "epoch": 0.024752593015510516, + "grad_norm": 2.0416486263275146, + "learning_rate": 4.99244866589374e-05, + "loss": 5.6965, + "step": 4162 + }, + { + "epoch": 0.02475854029879151, + "grad_norm": 2.0028059482574463, + "learning_rate": 4.9924450377104146e-05, + "loss": 5.7211, + "step": 4163 + }, + { + "epoch": 0.02476448758207251, + "grad_norm": 2.22377872467041, + "learning_rate": 4.992441408657004e-05, + "loss": 5.6384, + "step": 4164 + }, + { + "epoch": 0.024770434865353505, + "grad_norm": 2.038804531097412, + "learning_rate": 4.9924377787335064e-05, + "loss": 5.6351, + "step": 4165 + }, + { + "epoch": 0.024776382148634504, + "grad_norm": 2.357773542404175, + "learning_rate": 4.992434147939925e-05, + "loss": 5.2791, + "step": 4166 + }, + { + "epoch": 0.024782329431915502, + "grad_norm": 2.1949357986450195, + "learning_rate": 4.992430516276261e-05, + "loss": 5.7389, + "step": 4167 + }, + { + "epoch": 0.024788276715196497, + "grad_norm": 2.1015608310699463, + "learning_rate": 4.992426883742516e-05, + "loss": 5.632, + "step": 4168 + }, + { + "epoch": 0.024794223998477496, + "grad_norm": 2.166201591491699, + "learning_rate": 4.992423250338689e-05, + "loss": 5.5701, + "step": 4169 + }, + { + "epoch": 0.024800171281758494, + "grad_norm": 2.0805492401123047, + "learning_rate": 4.9924196160647836e-05, + "loss": 5.5955, + "step": 4170 + }, + { + "epoch": 0.02480611856503949, + "grad_norm": 1.803229570388794, + "learning_rate": 4.9924159809208e-05, + "loss": 5.6267, + "step": 4171 + }, + { + "epoch": 0.024812065848320488, + "grad_norm": 2.008639335632324, + "learning_rate": 4.9924123449067393e-05, + "loss": 5.6667, + "step": 4172 + }, + { + "epoch": 0.024818013131601483, + "grad_norm": 1.9843655824661255, + "learning_rate": 4.9924087080226044e-05, + "loss": 5.5981, + "step": 4173 + }, + { + "epoch": 0.02482396041488248, + "grad_norm": 2.10270357131958, + "learning_rate": 4.9924050702683946e-05, + "loss": 5.5293, + "step": 4174 + }, + { + "epoch": 0.02482990769816348, + "grad_norm": 2.315976142883301, + "learning_rate": 4.992401431644112e-05, + "loss": 5.6046, + "step": 4175 + }, + { + "epoch": 0.024835854981444475, + "grad_norm": 2.168473482131958, + "learning_rate": 4.992397792149758e-05, + "loss": 5.4271, + "step": 4176 + }, + { + "epoch": 0.024841802264725474, + "grad_norm": 2.1870200634002686, + "learning_rate": 4.9923941517853335e-05, + "loss": 5.6399, + "step": 4177 + }, + { + "epoch": 0.024847749548006472, + "grad_norm": 2.2944717407226562, + "learning_rate": 4.9923905105508394e-05, + "loss": 5.4483, + "step": 4178 + }, + { + "epoch": 0.024853696831287467, + "grad_norm": 2.1662731170654297, + "learning_rate": 4.9923868684462785e-05, + "loss": 5.6773, + "step": 4179 + }, + { + "epoch": 0.024859644114568466, + "grad_norm": 1.7448937892913818, + "learning_rate": 4.992383225471651e-05, + "loss": 5.6097, + "step": 4180 + }, + { + "epoch": 0.02486559139784946, + "grad_norm": 2.3577585220336914, + "learning_rate": 4.9923795816269576e-05, + "loss": 5.5003, + "step": 4181 + }, + { + "epoch": 0.02487153868113046, + "grad_norm": 2.4175360202789307, + "learning_rate": 4.9923759369122e-05, + "loss": 5.4925, + "step": 4182 + }, + { + "epoch": 0.024877485964411458, + "grad_norm": 2.199329137802124, + "learning_rate": 4.992372291327381e-05, + "loss": 5.6239, + "step": 4183 + }, + { + "epoch": 0.024883433247692453, + "grad_norm": 2.054450511932373, + "learning_rate": 4.9923686448724994e-05, + "loss": 5.59, + "step": 4184 + }, + { + "epoch": 0.024889380530973452, + "grad_norm": 2.0354533195495605, + "learning_rate": 4.9923649975475585e-05, + "loss": 5.6092, + "step": 4185 + }, + { + "epoch": 0.024895327814254447, + "grad_norm": 2.0409371852874756, + "learning_rate": 4.9923613493525576e-05, + "loss": 5.5009, + "step": 4186 + }, + { + "epoch": 0.024901275097535445, + "grad_norm": 2.3314719200134277, + "learning_rate": 4.992357700287501e-05, + "loss": 5.5077, + "step": 4187 + }, + { + "epoch": 0.024907222380816444, + "grad_norm": 2.050706386566162, + "learning_rate": 4.9923540503523865e-05, + "loss": 5.5857, + "step": 4188 + }, + { + "epoch": 0.02491316966409744, + "grad_norm": 2.3477721214294434, + "learning_rate": 4.992350399547218e-05, + "loss": 5.5119, + "step": 4189 + }, + { + "epoch": 0.024919116947378438, + "grad_norm": 2.365171194076538, + "learning_rate": 4.992346747871994e-05, + "loss": 5.583, + "step": 4190 + }, + { + "epoch": 0.024925064230659436, + "grad_norm": 1.9642738103866577, + "learning_rate": 4.992343095326719e-05, + "loss": 5.3527, + "step": 4191 + }, + { + "epoch": 0.02493101151394043, + "grad_norm": 2.25437593460083, + "learning_rate": 4.992339441911392e-05, + "loss": 5.4751, + "step": 4192 + }, + { + "epoch": 0.02493695879722143, + "grad_norm": 2.0476715564727783, + "learning_rate": 4.992335787626016e-05, + "loss": 5.5808, + "step": 4193 + }, + { + "epoch": 0.024942906080502425, + "grad_norm": 2.248382329940796, + "learning_rate": 4.992332132470591e-05, + "loss": 5.5771, + "step": 4194 + }, + { + "epoch": 0.024948853363783424, + "grad_norm": 2.279232978820801, + "learning_rate": 4.992328476445118e-05, + "loss": 5.3803, + "step": 4195 + }, + { + "epoch": 0.024954800647064422, + "grad_norm": 2.0171918869018555, + "learning_rate": 4.992324819549599e-05, + "loss": 5.662, + "step": 4196 + }, + { + "epoch": 0.024960747930345417, + "grad_norm": 2.14736008644104, + "learning_rate": 4.992321161784036e-05, + "loss": 5.6422, + "step": 4197 + }, + { + "epoch": 0.024966695213626416, + "grad_norm": 2.1694438457489014, + "learning_rate": 4.9923175031484284e-05, + "loss": 5.4377, + "step": 4198 + }, + { + "epoch": 0.024972642496907414, + "grad_norm": 1.9280356168746948, + "learning_rate": 4.9923138436427784e-05, + "loss": 5.5499, + "step": 4199 + }, + { + "epoch": 0.02497858978018841, + "grad_norm": 2.185974359512329, + "learning_rate": 4.992310183267088e-05, + "loss": 5.6404, + "step": 4200 + }, + { + "epoch": 0.024984537063469408, + "grad_norm": 2.102681875228882, + "learning_rate": 4.9923065220213585e-05, + "loss": 5.5888, + "step": 4201 + }, + { + "epoch": 0.024990484346750403, + "grad_norm": 2.07100772857666, + "learning_rate": 4.99230285990559e-05, + "loss": 5.6473, + "step": 4202 + }, + { + "epoch": 0.0249964316300314, + "grad_norm": 2.088634967803955, + "learning_rate": 4.992299196919784e-05, + "loss": 5.4993, + "step": 4203 + }, + { + "epoch": 0.0250023789133124, + "grad_norm": 2.2086873054504395, + "learning_rate": 4.992295533063942e-05, + "loss": 5.5797, + "step": 4204 + }, + { + "epoch": 0.025008326196593395, + "grad_norm": 2.250753164291382, + "learning_rate": 4.992291868338066e-05, + "loss": 5.5666, + "step": 4205 + }, + { + "epoch": 0.025014273479874394, + "grad_norm": 2.132636785507202, + "learning_rate": 4.992288202742156e-05, + "loss": 5.6715, + "step": 4206 + }, + { + "epoch": 0.025020220763155392, + "grad_norm": 2.8332200050354004, + "learning_rate": 4.992284536276214e-05, + "loss": 4.9687, + "step": 4207 + }, + { + "epoch": 0.025026168046436387, + "grad_norm": 2.345991849899292, + "learning_rate": 4.992280868940241e-05, + "loss": 5.2181, + "step": 4208 + }, + { + "epoch": 0.025032115329717386, + "grad_norm": 2.149568557739258, + "learning_rate": 4.992277200734239e-05, + "loss": 5.5336, + "step": 4209 + }, + { + "epoch": 0.02503806261299838, + "grad_norm": 2.031353235244751, + "learning_rate": 4.992273531658209e-05, + "loss": 5.5779, + "step": 4210 + }, + { + "epoch": 0.02504400989627938, + "grad_norm": 2.217374086380005, + "learning_rate": 4.9922698617121524e-05, + "loss": 5.782, + "step": 4211 + }, + { + "epoch": 0.025049957179560378, + "grad_norm": 2.3629000186920166, + "learning_rate": 4.992266190896069e-05, + "loss": 5.7916, + "step": 4212 + }, + { + "epoch": 0.025055904462841373, + "grad_norm": 2.2439091205596924, + "learning_rate": 4.9922625192099616e-05, + "loss": 5.8002, + "step": 4213 + }, + { + "epoch": 0.025061851746122372, + "grad_norm": 2.1707634925842285, + "learning_rate": 4.992258846653831e-05, + "loss": 6.5789, + "step": 4214 + }, + { + "epoch": 0.025067799029403367, + "grad_norm": 3.1655468940734863, + "learning_rate": 4.992255173227679e-05, + "loss": 6.3867, + "step": 4215 + }, + { + "epoch": 0.025073746312684365, + "grad_norm": 3.1309874057769775, + "learning_rate": 4.992251498931506e-05, + "loss": 6.2682, + "step": 4216 + }, + { + "epoch": 0.025079693595965364, + "grad_norm": 3.2077460289001465, + "learning_rate": 4.992247823765315e-05, + "loss": 5.8593, + "step": 4217 + }, + { + "epoch": 0.02508564087924636, + "grad_norm": 2.2944962978363037, + "learning_rate": 4.992244147729105e-05, + "loss": 5.7994, + "step": 4218 + }, + { + "epoch": 0.025091588162527358, + "grad_norm": 2.2380926609039307, + "learning_rate": 4.9922404708228776e-05, + "loss": 5.7606, + "step": 4219 + }, + { + "epoch": 0.025097535445808356, + "grad_norm": 2.601795196533203, + "learning_rate": 4.992236793046636e-05, + "loss": 5.7585, + "step": 4220 + }, + { + "epoch": 0.02510348272908935, + "grad_norm": 2.494765520095825, + "learning_rate": 4.99223311440038e-05, + "loss": 5.8102, + "step": 4221 + }, + { + "epoch": 0.02510943001237035, + "grad_norm": 2.4690544605255127, + "learning_rate": 4.992229434884111e-05, + "loss": 5.8682, + "step": 4222 + }, + { + "epoch": 0.025115377295651345, + "grad_norm": 2.1011085510253906, + "learning_rate": 4.99222575449783e-05, + "loss": 5.6982, + "step": 4223 + }, + { + "epoch": 0.025121324578932343, + "grad_norm": 2.2298128604888916, + "learning_rate": 4.992222073241539e-05, + "loss": 5.7606, + "step": 4224 + }, + { + "epoch": 0.025127271862213342, + "grad_norm": 1.93464994430542, + "learning_rate": 4.99221839111524e-05, + "loss": 5.7097, + "step": 4225 + }, + { + "epoch": 0.025133219145494337, + "grad_norm": 2.15191650390625, + "learning_rate": 4.9922147081189324e-05, + "loss": 5.5852, + "step": 4226 + }, + { + "epoch": 0.025139166428775336, + "grad_norm": 2.086954355239868, + "learning_rate": 4.992211024252619e-05, + "loss": 5.5871, + "step": 4227 + }, + { + "epoch": 0.025145113712056334, + "grad_norm": 2.212296724319458, + "learning_rate": 4.9922073395162995e-05, + "loss": 5.562, + "step": 4228 + }, + { + "epoch": 0.02515106099533733, + "grad_norm": 2.0786778926849365, + "learning_rate": 4.992203653909977e-05, + "loss": 5.6599, + "step": 4229 + }, + { + "epoch": 0.025157008278618328, + "grad_norm": 2.3243489265441895, + "learning_rate": 4.9921999674336514e-05, + "loss": 5.9791, + "step": 4230 + }, + { + "epoch": 0.025162955561899323, + "grad_norm": 2.1922898292541504, + "learning_rate": 4.9921962800873247e-05, + "loss": 5.7352, + "step": 4231 + }, + { + "epoch": 0.02516890284518032, + "grad_norm": 2.1154398918151855, + "learning_rate": 4.992192591870998e-05, + "loss": 5.6408, + "step": 4232 + }, + { + "epoch": 0.02517485012846132, + "grad_norm": 2.3520143032073975, + "learning_rate": 4.992188902784673e-05, + "loss": 5.6318, + "step": 4233 + }, + { + "epoch": 0.025180797411742315, + "grad_norm": 2.16597580909729, + "learning_rate": 4.99218521282835e-05, + "loss": 5.4978, + "step": 4234 + }, + { + "epoch": 0.025186744695023314, + "grad_norm": 2.2510032653808594, + "learning_rate": 4.992181522002032e-05, + "loss": 5.4863, + "step": 4235 + }, + { + "epoch": 0.025192691978304312, + "grad_norm": 1.9984945058822632, + "learning_rate": 4.9921778303057174e-05, + "loss": 5.7514, + "step": 4236 + }, + { + "epoch": 0.025198639261585307, + "grad_norm": 2.019435167312622, + "learning_rate": 4.9921741377394106e-05, + "loss": 5.6481, + "step": 4237 + }, + { + "epoch": 0.025204586544866306, + "grad_norm": 1.8546136617660522, + "learning_rate": 4.9921704443031114e-05, + "loss": 5.5907, + "step": 4238 + }, + { + "epoch": 0.0252105338281473, + "grad_norm": 2.012821912765503, + "learning_rate": 4.9921667499968214e-05, + "loss": 5.6942, + "step": 4239 + }, + { + "epoch": 0.0252164811114283, + "grad_norm": 2.215322971343994, + "learning_rate": 4.992163054820541e-05, + "loss": 5.6248, + "step": 4240 + }, + { + "epoch": 0.025222428394709298, + "grad_norm": 2.1009631156921387, + "learning_rate": 4.9921593587742726e-05, + "loss": 5.7769, + "step": 4241 + }, + { + "epoch": 0.025228375677990293, + "grad_norm": 2.280970335006714, + "learning_rate": 4.992155661858017e-05, + "loss": 5.4233, + "step": 4242 + }, + { + "epoch": 0.025234322961271292, + "grad_norm": 2.324589729309082, + "learning_rate": 4.992151964071776e-05, + "loss": 5.7138, + "step": 4243 + }, + { + "epoch": 0.025240270244552287, + "grad_norm": 2.01705002784729, + "learning_rate": 4.9921482654155506e-05, + "loss": 5.6946, + "step": 4244 + }, + { + "epoch": 0.025246217527833285, + "grad_norm": 2.0912036895751953, + "learning_rate": 4.9921445658893414e-05, + "loss": 5.8085, + "step": 4245 + }, + { + "epoch": 0.025252164811114284, + "grad_norm": 2.03450870513916, + "learning_rate": 4.99214086549315e-05, + "loss": 5.9129, + "step": 4246 + }, + { + "epoch": 0.02525811209439528, + "grad_norm": 2.1532092094421387, + "learning_rate": 4.9921371642269786e-05, + "loss": 5.708, + "step": 4247 + }, + { + "epoch": 0.025264059377676278, + "grad_norm": 2.2842540740966797, + "learning_rate": 4.992133462090828e-05, + "loss": 5.6693, + "step": 4248 + }, + { + "epoch": 0.025270006660957276, + "grad_norm": 2.0693325996398926, + "learning_rate": 4.9921297590846997e-05, + "loss": 5.7278, + "step": 4249 + }, + { + "epoch": 0.02527595394423827, + "grad_norm": 2.0139124393463135, + "learning_rate": 4.9921260552085934e-05, + "loss": 5.5897, + "step": 4250 + }, + { + "epoch": 0.02528190122751927, + "grad_norm": 2.4587321281433105, + "learning_rate": 4.9921223504625125e-05, + "loss": 5.6884, + "step": 4251 + }, + { + "epoch": 0.025287848510800265, + "grad_norm": 2.062640428543091, + "learning_rate": 4.992118644846457e-05, + "loss": 5.6189, + "step": 4252 + }, + { + "epoch": 0.025293795794081263, + "grad_norm": 1.9889299869537354, + "learning_rate": 4.992114938360429e-05, + "loss": 5.7326, + "step": 4253 + }, + { + "epoch": 0.025299743077362262, + "grad_norm": 2.001913547515869, + "learning_rate": 4.992111231004429e-05, + "loss": 5.6765, + "step": 4254 + }, + { + "epoch": 0.025305690360643257, + "grad_norm": 2.0345358848571777, + "learning_rate": 4.992107522778459e-05, + "loss": 5.5783, + "step": 4255 + }, + { + "epoch": 0.025311637643924256, + "grad_norm": 2.277817487716675, + "learning_rate": 4.9921038136825205e-05, + "loss": 5.6672, + "step": 4256 + }, + { + "epoch": 0.025317584927205254, + "grad_norm": 1.8992491960525513, + "learning_rate": 4.992100103716614e-05, + "loss": 5.532, + "step": 4257 + }, + { + "epoch": 0.02532353221048625, + "grad_norm": 2.202746629714966, + "learning_rate": 4.992096392880741e-05, + "loss": 5.697, + "step": 4258 + }, + { + "epoch": 0.025329479493767248, + "grad_norm": 2.020514488220215, + "learning_rate": 4.992092681174903e-05, + "loss": 5.9102, + "step": 4259 + }, + { + "epoch": 0.025335426777048243, + "grad_norm": 2.0697989463806152, + "learning_rate": 4.9920889685991e-05, + "loss": 5.5165, + "step": 4260 + }, + { + "epoch": 0.02534137406032924, + "grad_norm": 2.619258165359497, + "learning_rate": 4.992085255153336e-05, + "loss": 5.6577, + "step": 4261 + }, + { + "epoch": 0.02534732134361024, + "grad_norm": 2.1612637042999268, + "learning_rate": 4.99208154083761e-05, + "loss": 5.8193, + "step": 4262 + }, + { + "epoch": 0.025353268626891235, + "grad_norm": 1.9237465858459473, + "learning_rate": 4.9920778256519244e-05, + "loss": 5.6533, + "step": 4263 + }, + { + "epoch": 0.025359215910172234, + "grad_norm": 2.164339065551758, + "learning_rate": 4.99207410959628e-05, + "loss": 5.5566, + "step": 4264 + }, + { + "epoch": 0.025365163193453232, + "grad_norm": 2.0753626823425293, + "learning_rate": 4.992070392670678e-05, + "loss": 5.8444, + "step": 4265 + }, + { + "epoch": 0.025371110476734227, + "grad_norm": 1.977522850036621, + "learning_rate": 4.992066674875121e-05, + "loss": 5.6615, + "step": 4266 + }, + { + "epoch": 0.025377057760015226, + "grad_norm": 1.9911431074142456, + "learning_rate": 4.992062956209608e-05, + "loss": 5.6366, + "step": 4267 + }, + { + "epoch": 0.02538300504329622, + "grad_norm": 2.0334808826446533, + "learning_rate": 4.992059236674142e-05, + "loss": 5.8399, + "step": 4268 + }, + { + "epoch": 0.02538895232657722, + "grad_norm": 2.2869162559509277, + "learning_rate": 4.992055516268724e-05, + "loss": 5.7302, + "step": 4269 + }, + { + "epoch": 0.025394899609858218, + "grad_norm": 2.0845389366149902, + "learning_rate": 4.9920517949933556e-05, + "loss": 5.619, + "step": 4270 + }, + { + "epoch": 0.025400846893139213, + "grad_norm": 2.290881633758545, + "learning_rate": 4.9920480728480376e-05, + "loss": 5.5629, + "step": 4271 + }, + { + "epoch": 0.02540679417642021, + "grad_norm": 2.0897767543792725, + "learning_rate": 4.9920443498327706e-05, + "loss": 5.7009, + "step": 4272 + }, + { + "epoch": 0.025412741459701207, + "grad_norm": 1.8389668464660645, + "learning_rate": 4.9920406259475574e-05, + "loss": 5.6359, + "step": 4273 + }, + { + "epoch": 0.025418688742982205, + "grad_norm": 2.0262937545776367, + "learning_rate": 4.992036901192399e-05, + "loss": 5.6707, + "step": 4274 + }, + { + "epoch": 0.025424636026263204, + "grad_norm": 2.04280686378479, + "learning_rate": 4.992033175567295e-05, + "loss": 5.7917, + "step": 4275 + }, + { + "epoch": 0.0254305833095442, + "grad_norm": 2.0945205688476562, + "learning_rate": 4.992029449072249e-05, + "loss": 5.7208, + "step": 4276 + }, + { + "epoch": 0.025436530592825198, + "grad_norm": 1.9662036895751953, + "learning_rate": 4.992025721707261e-05, + "loss": 5.7141, + "step": 4277 + }, + { + "epoch": 0.025442477876106196, + "grad_norm": 2.582284450531006, + "learning_rate": 4.9920219934723316e-05, + "loss": 5.9514, + "step": 4278 + }, + { + "epoch": 0.02544842515938719, + "grad_norm": 1.9792051315307617, + "learning_rate": 4.992018264367464e-05, + "loss": 5.3867, + "step": 4279 + }, + { + "epoch": 0.02545437244266819, + "grad_norm": 2.0107717514038086, + "learning_rate": 4.992014534392658e-05, + "loss": 5.5985, + "step": 4280 + }, + { + "epoch": 0.025460319725949185, + "grad_norm": 2.2035727500915527, + "learning_rate": 4.9920108035479166e-05, + "loss": 5.6356, + "step": 4281 + }, + { + "epoch": 0.025466267009230183, + "grad_norm": 2.1973958015441895, + "learning_rate": 4.992007071833239e-05, + "loss": 5.3557, + "step": 4282 + }, + { + "epoch": 0.025472214292511182, + "grad_norm": 2.031371831893921, + "learning_rate": 4.9920033392486275e-05, + "loss": 5.484, + "step": 4283 + }, + { + "epoch": 0.025478161575792177, + "grad_norm": 1.9966185092926025, + "learning_rate": 4.991999605794084e-05, + "loss": 5.4137, + "step": 4284 + }, + { + "epoch": 0.025484108859073176, + "grad_norm": 1.699460506439209, + "learning_rate": 4.9919958714696085e-05, + "loss": 5.7099, + "step": 4285 + }, + { + "epoch": 0.025490056142354174, + "grad_norm": 2.270535945892334, + "learning_rate": 4.991992136275203e-05, + "loss": 5.6654, + "step": 4286 + }, + { + "epoch": 0.02549600342563517, + "grad_norm": 2.0636515617370605, + "learning_rate": 4.99198840021087e-05, + "loss": 5.6996, + "step": 4287 + }, + { + "epoch": 0.025501950708916168, + "grad_norm": 2.217365026473999, + "learning_rate": 4.991984663276608e-05, + "loss": 5.6148, + "step": 4288 + }, + { + "epoch": 0.025507897992197163, + "grad_norm": 2.182109832763672, + "learning_rate": 4.99198092547242e-05, + "loss": 5.6469, + "step": 4289 + }, + { + "epoch": 0.02551384527547816, + "grad_norm": 1.995924472808838, + "learning_rate": 4.9919771867983084e-05, + "loss": 5.7607, + "step": 4290 + }, + { + "epoch": 0.02551979255875916, + "grad_norm": 1.9308382272720337, + "learning_rate": 4.991973447254272e-05, + "loss": 5.7219, + "step": 4291 + }, + { + "epoch": 0.025525739842040155, + "grad_norm": 2.2675700187683105, + "learning_rate": 4.991969706840315e-05, + "loss": 5.7348, + "step": 4292 + }, + { + "epoch": 0.025531687125321154, + "grad_norm": 2.0441880226135254, + "learning_rate": 4.991965965556435e-05, + "loss": 5.5827, + "step": 4293 + }, + { + "epoch": 0.025537634408602152, + "grad_norm": 2.0111331939697266, + "learning_rate": 4.9919622234026376e-05, + "loss": 5.5355, + "step": 4294 + }, + { + "epoch": 0.025543581691883147, + "grad_norm": 2.214946985244751, + "learning_rate": 4.991958480378921e-05, + "loss": 5.5327, + "step": 4295 + }, + { + "epoch": 0.025549528975164146, + "grad_norm": 1.9673919677734375, + "learning_rate": 4.991954736485287e-05, + "loss": 5.5744, + "step": 4296 + }, + { + "epoch": 0.02555547625844514, + "grad_norm": 2.0662097930908203, + "learning_rate": 4.991950991721738e-05, + "loss": 5.5301, + "step": 4297 + }, + { + "epoch": 0.02556142354172614, + "grad_norm": 2.1912949085235596, + "learning_rate": 4.991947246088274e-05, + "loss": 5.6505, + "step": 4298 + }, + { + "epoch": 0.025567370825007138, + "grad_norm": 2.1073548793792725, + "learning_rate": 4.991943499584898e-05, + "loss": 5.7429, + "step": 4299 + }, + { + "epoch": 0.025573318108288133, + "grad_norm": 2.4015331268310547, + "learning_rate": 4.9919397522116096e-05, + "loss": 5.9959, + "step": 4300 + }, + { + "epoch": 0.02557926539156913, + "grad_norm": 2.5571470260620117, + "learning_rate": 4.99193600396841e-05, + "loss": 5.9058, + "step": 4301 + }, + { + "epoch": 0.02558521267485013, + "grad_norm": 2.148449182510376, + "learning_rate": 4.9919322548553026e-05, + "loss": 5.6298, + "step": 4302 + }, + { + "epoch": 0.025591159958131125, + "grad_norm": 2.3006222248077393, + "learning_rate": 4.991928504872287e-05, + "loss": 5.4854, + "step": 4303 + }, + { + "epoch": 0.025597107241412124, + "grad_norm": 2.2384679317474365, + "learning_rate": 4.9919247540193646e-05, + "loss": 5.7089, + "step": 4304 + }, + { + "epoch": 0.02560305452469312, + "grad_norm": 2.195736885070801, + "learning_rate": 4.9919210022965376e-05, + "loss": 5.986, + "step": 4305 + }, + { + "epoch": 0.025609001807974117, + "grad_norm": 2.3446342945098877, + "learning_rate": 4.991917249703806e-05, + "loss": 5.88, + "step": 4306 + }, + { + "epoch": 0.025614949091255116, + "grad_norm": 2.3800623416900635, + "learning_rate": 4.9919134962411724e-05, + "loss": 5.6897, + "step": 4307 + }, + { + "epoch": 0.02562089637453611, + "grad_norm": 1.8407396078109741, + "learning_rate": 4.991909741908637e-05, + "loss": 5.7359, + "step": 4308 + }, + { + "epoch": 0.02562684365781711, + "grad_norm": 2.3566956520080566, + "learning_rate": 4.9919059867062026e-05, + "loss": 5.5606, + "step": 4309 + }, + { + "epoch": 0.025632790941098105, + "grad_norm": 2.149317741394043, + "learning_rate": 4.991902230633869e-05, + "loss": 5.6966, + "step": 4310 + }, + { + "epoch": 0.025638738224379103, + "grad_norm": 2.3567728996276855, + "learning_rate": 4.991898473691638e-05, + "loss": 5.4694, + "step": 4311 + }, + { + "epoch": 0.025644685507660102, + "grad_norm": 1.9388068914413452, + "learning_rate": 4.9918947158795106e-05, + "loss": 5.5947, + "step": 4312 + }, + { + "epoch": 0.025650632790941097, + "grad_norm": 1.844419002532959, + "learning_rate": 4.9918909571974893e-05, + "loss": 5.6159, + "step": 4313 + }, + { + "epoch": 0.025656580074222095, + "grad_norm": 1.8664250373840332, + "learning_rate": 4.991887197645574e-05, + "loss": 5.7211, + "step": 4314 + }, + { + "epoch": 0.025662527357503094, + "grad_norm": 2.073004961013794, + "learning_rate": 4.991883437223767e-05, + "loss": 5.8873, + "step": 4315 + }, + { + "epoch": 0.02566847464078409, + "grad_norm": 2.316938877105713, + "learning_rate": 4.991879675932068e-05, + "loss": 5.4372, + "step": 4316 + }, + { + "epoch": 0.025674421924065088, + "grad_norm": 2.2646546363830566, + "learning_rate": 4.991875913770481e-05, + "loss": 5.5486, + "step": 4317 + }, + { + "epoch": 0.025680369207346083, + "grad_norm": 2.2417361736297607, + "learning_rate": 4.991872150739005e-05, + "loss": 5.2264, + "step": 4318 + }, + { + "epoch": 0.02568631649062708, + "grad_norm": 2.271566867828369, + "learning_rate": 4.9918683868376437e-05, + "loss": 5.1546, + "step": 4319 + }, + { + "epoch": 0.02569226377390808, + "grad_norm": 2.211650848388672, + "learning_rate": 4.9918646220663954e-05, + "loss": 5.382, + "step": 4320 + }, + { + "epoch": 0.025698211057189075, + "grad_norm": 2.3627288341522217, + "learning_rate": 4.991860856425263e-05, + "loss": 5.6099, + "step": 4321 + }, + { + "epoch": 0.025704158340470074, + "grad_norm": 2.3968141078948975, + "learning_rate": 4.991857089914249e-05, + "loss": 5.3689, + "step": 4322 + }, + { + "epoch": 0.025710105623751072, + "grad_norm": 2.3576786518096924, + "learning_rate": 4.991853322533352e-05, + "loss": 5.4441, + "step": 4323 + }, + { + "epoch": 0.025716052907032067, + "grad_norm": 2.0814530849456787, + "learning_rate": 4.991849554282575e-05, + "loss": 5.6137, + "step": 4324 + }, + { + "epoch": 0.025722000190313066, + "grad_norm": 2.103505849838257, + "learning_rate": 4.991845785161919e-05, + "loss": 5.5518, + "step": 4325 + }, + { + "epoch": 0.02572794747359406, + "grad_norm": 2.188350200653076, + "learning_rate": 4.991842015171386e-05, + "loss": 5.5958, + "step": 4326 + }, + { + "epoch": 0.02573389475687506, + "grad_norm": 2.124088764190674, + "learning_rate": 4.9918382443109766e-05, + "loss": 5.3851, + "step": 4327 + }, + { + "epoch": 0.025739842040156058, + "grad_norm": 2.181466579437256, + "learning_rate": 4.991834472580692e-05, + "loss": 5.4629, + "step": 4328 + }, + { + "epoch": 0.025745789323437053, + "grad_norm": 1.9634013175964355, + "learning_rate": 4.9918306999805344e-05, + "loss": 5.4768, + "step": 4329 + }, + { + "epoch": 0.02575173660671805, + "grad_norm": 2.2046115398406982, + "learning_rate": 4.991826926510503e-05, + "loss": 5.3977, + "step": 4330 + }, + { + "epoch": 0.02575768388999905, + "grad_norm": 1.8660465478897095, + "learning_rate": 4.9918231521706014e-05, + "loss": 5.4837, + "step": 4331 + }, + { + "epoch": 0.025763631173280045, + "grad_norm": 1.9825572967529297, + "learning_rate": 4.99181937696083e-05, + "loss": 5.5158, + "step": 4332 + }, + { + "epoch": 0.025769578456561044, + "grad_norm": 1.9114030599594116, + "learning_rate": 4.9918156008811906e-05, + "loss": 5.3291, + "step": 4333 + }, + { + "epoch": 0.02577552573984204, + "grad_norm": 2.008059024810791, + "learning_rate": 4.9918118239316835e-05, + "loss": 5.2993, + "step": 4334 + }, + { + "epoch": 0.025781473023123037, + "grad_norm": 2.0090153217315674, + "learning_rate": 4.991808046112311e-05, + "loss": 5.2951, + "step": 4335 + }, + { + "epoch": 0.025787420306404036, + "grad_norm": 2.013878345489502, + "learning_rate": 4.991804267423074e-05, + "loss": 5.3491, + "step": 4336 + }, + { + "epoch": 0.02579336758968503, + "grad_norm": 2.1889898777008057, + "learning_rate": 4.9918004878639734e-05, + "loss": 5.2744, + "step": 4337 + }, + { + "epoch": 0.02579931487296603, + "grad_norm": 1.9945006370544434, + "learning_rate": 4.991796707435012e-05, + "loss": 5.5176, + "step": 4338 + }, + { + "epoch": 0.025805262156247025, + "grad_norm": 2.1205811500549316, + "learning_rate": 4.9917929261361894e-05, + "loss": 5.6534, + "step": 4339 + }, + { + "epoch": 0.025811209439528023, + "grad_norm": 2.6607353687286377, + "learning_rate": 4.991789143967508e-05, + "loss": 6.343, + "step": 4340 + }, + { + "epoch": 0.025817156722809022, + "grad_norm": 2.241818904876709, + "learning_rate": 4.991785360928968e-05, + "loss": 5.6774, + "step": 4341 + }, + { + "epoch": 0.025823104006090017, + "grad_norm": 1.9817326068878174, + "learning_rate": 4.9917815770205723e-05, + "loss": 5.7686, + "step": 4342 + }, + { + "epoch": 0.025829051289371015, + "grad_norm": 2.323802947998047, + "learning_rate": 4.991777792242321e-05, + "loss": 5.9564, + "step": 4343 + }, + { + "epoch": 0.025834998572652014, + "grad_norm": 2.3318228721618652, + "learning_rate": 4.991774006594216e-05, + "loss": 5.9057, + "step": 4344 + }, + { + "epoch": 0.02584094585593301, + "grad_norm": 2.032776355743408, + "learning_rate": 4.991770220076258e-05, + "loss": 5.9753, + "step": 4345 + }, + { + "epoch": 0.025846893139214008, + "grad_norm": 2.116837739944458, + "learning_rate": 4.9917664326884495e-05, + "loss": 5.8458, + "step": 4346 + }, + { + "epoch": 0.025852840422495003, + "grad_norm": 2.312878370285034, + "learning_rate": 4.991762644430791e-05, + "loss": 5.5128, + "step": 4347 + }, + { + "epoch": 0.025858787705776, + "grad_norm": 2.3003859519958496, + "learning_rate": 4.991758855303283e-05, + "loss": 5.7192, + "step": 4348 + }, + { + "epoch": 0.025864734989057, + "grad_norm": 1.898258924484253, + "learning_rate": 4.9917550653059286e-05, + "loss": 5.6422, + "step": 4349 + }, + { + "epoch": 0.025870682272337995, + "grad_norm": 1.9477754831314087, + "learning_rate": 4.9917512744387276e-05, + "loss": 5.7885, + "step": 4350 + }, + { + "epoch": 0.025876629555618993, + "grad_norm": 2.479979991912842, + "learning_rate": 4.991747482701683e-05, + "loss": 5.4692, + "step": 4351 + }, + { + "epoch": 0.025882576838899992, + "grad_norm": 2.324336290359497, + "learning_rate": 4.991743690094794e-05, + "loss": 5.4186, + "step": 4352 + }, + { + "epoch": 0.025888524122180987, + "grad_norm": 2.076723337173462, + "learning_rate": 4.9917398966180625e-05, + "loss": 5.4363, + "step": 4353 + }, + { + "epoch": 0.025894471405461986, + "grad_norm": 1.9004534482955933, + "learning_rate": 4.991736102271492e-05, + "loss": 5.6451, + "step": 4354 + }, + { + "epoch": 0.02590041868874298, + "grad_norm": 1.8098558187484741, + "learning_rate": 4.991732307055082e-05, + "loss": 5.8666, + "step": 4355 + }, + { + "epoch": 0.02590636597202398, + "grad_norm": 2.1158571243286133, + "learning_rate": 4.991728510968833e-05, + "loss": 5.5421, + "step": 4356 + }, + { + "epoch": 0.025912313255304978, + "grad_norm": 2.1235690116882324, + "learning_rate": 4.991724714012748e-05, + "loss": 5.9947, + "step": 4357 + }, + { + "epoch": 0.025918260538585973, + "grad_norm": 2.1306662559509277, + "learning_rate": 4.9917209161868276e-05, + "loss": 5.4648, + "step": 4358 + }, + { + "epoch": 0.02592420782186697, + "grad_norm": 1.7927355766296387, + "learning_rate": 4.991717117491073e-05, + "loss": 5.4339, + "step": 4359 + }, + { + "epoch": 0.02593015510514797, + "grad_norm": 2.314069986343384, + "learning_rate": 4.991713317925485e-05, + "loss": 5.5534, + "step": 4360 + }, + { + "epoch": 0.025936102388428965, + "grad_norm": 2.2628493309020996, + "learning_rate": 4.9917095174900665e-05, + "loss": 5.5996, + "step": 4361 + }, + { + "epoch": 0.025942049671709964, + "grad_norm": 2.1669869422912598, + "learning_rate": 4.991705716184818e-05, + "loss": 5.704, + "step": 4362 + }, + { + "epoch": 0.02594799695499096, + "grad_norm": 2.2048137187957764, + "learning_rate": 4.99170191400974e-05, + "loss": 5.6576, + "step": 4363 + }, + { + "epoch": 0.025953944238271957, + "grad_norm": 2.172398328781128, + "learning_rate": 4.991698110964835e-05, + "loss": 5.7254, + "step": 4364 + }, + { + "epoch": 0.025959891521552956, + "grad_norm": 1.9689068794250488, + "learning_rate": 4.9916943070501047e-05, + "loss": 5.7303, + "step": 4365 + }, + { + "epoch": 0.02596583880483395, + "grad_norm": 1.7037044763565063, + "learning_rate": 4.991690502265549e-05, + "loss": 5.6542, + "step": 4366 + }, + { + "epoch": 0.02597178608811495, + "grad_norm": 1.7666655778884888, + "learning_rate": 4.9916866966111695e-05, + "loss": 5.7833, + "step": 4367 + }, + { + "epoch": 0.025977733371395945, + "grad_norm": 2.0178141593933105, + "learning_rate": 4.991682890086968e-05, + "loss": 5.7759, + "step": 4368 + }, + { + "epoch": 0.025983680654676943, + "grad_norm": 1.7989983558654785, + "learning_rate": 4.991679082692946e-05, + "loss": 5.8772, + "step": 4369 + }, + { + "epoch": 0.025989627937957942, + "grad_norm": 1.8004199266433716, + "learning_rate": 4.9916752744291054e-05, + "loss": 5.6145, + "step": 4370 + }, + { + "epoch": 0.025995575221238937, + "grad_norm": 1.837074637413025, + "learning_rate": 4.991671465295446e-05, + "loss": 5.4874, + "step": 4371 + }, + { + "epoch": 0.026001522504519935, + "grad_norm": 1.7436491250991821, + "learning_rate": 4.991667655291969e-05, + "loss": 5.7212, + "step": 4372 + }, + { + "epoch": 0.026007469787800934, + "grad_norm": 1.7802095413208008, + "learning_rate": 4.991663844418678e-05, + "loss": 5.7004, + "step": 4373 + }, + { + "epoch": 0.02601341707108193, + "grad_norm": 2.112487316131592, + "learning_rate": 4.991660032675572e-05, + "loss": 5.5579, + "step": 4374 + }, + { + "epoch": 0.026019364354362928, + "grad_norm": 2.0917413234710693, + "learning_rate": 4.9916562200626535e-05, + "loss": 5.7825, + "step": 4375 + }, + { + "epoch": 0.026025311637643923, + "grad_norm": 1.8323053121566772, + "learning_rate": 4.991652406579924e-05, + "loss": 5.7699, + "step": 4376 + }, + { + "epoch": 0.02603125892092492, + "grad_norm": 1.9480723142623901, + "learning_rate": 4.9916485922273835e-05, + "loss": 5.6591, + "step": 4377 + }, + { + "epoch": 0.02603720620420592, + "grad_norm": 2.000739812850952, + "learning_rate": 4.991644777005035e-05, + "loss": 5.8919, + "step": 4378 + }, + { + "epoch": 0.026043153487486915, + "grad_norm": 2.093573808670044, + "learning_rate": 4.991640960912879e-05, + "loss": 5.7357, + "step": 4379 + }, + { + "epoch": 0.026049100770767913, + "grad_norm": 1.932019591331482, + "learning_rate": 4.991637143950916e-05, + "loss": 5.7268, + "step": 4380 + }, + { + "epoch": 0.026055048054048912, + "grad_norm": 1.820102572441101, + "learning_rate": 4.991633326119149e-05, + "loss": 5.8733, + "step": 4381 + }, + { + "epoch": 0.026060995337329907, + "grad_norm": 1.9091769456863403, + "learning_rate": 4.991629507417578e-05, + "loss": 5.5532, + "step": 4382 + }, + { + "epoch": 0.026066942620610906, + "grad_norm": 2.0037779808044434, + "learning_rate": 4.991625687846205e-05, + "loss": 5.7841, + "step": 4383 + }, + { + "epoch": 0.0260728899038919, + "grad_norm": 1.7106568813323975, + "learning_rate": 4.991621867405032e-05, + "loss": 5.4486, + "step": 4384 + }, + { + "epoch": 0.0260788371871729, + "grad_norm": 1.7802643775939941, + "learning_rate": 4.9916180460940585e-05, + "loss": 5.7494, + "step": 4385 + }, + { + "epoch": 0.026084784470453898, + "grad_norm": 2.089503288269043, + "learning_rate": 4.991614223913288e-05, + "loss": 5.6044, + "step": 4386 + }, + { + "epoch": 0.026090731753734893, + "grad_norm": 2.3315577507019043, + "learning_rate": 4.99161040086272e-05, + "loss": 5.9552, + "step": 4387 + }, + { + "epoch": 0.02609667903701589, + "grad_norm": 2.1202025413513184, + "learning_rate": 4.9916065769423566e-05, + "loss": 5.778, + "step": 4388 + }, + { + "epoch": 0.02610262632029689, + "grad_norm": 2.3448777198791504, + "learning_rate": 4.991602752152199e-05, + "loss": 5.8014, + "step": 4389 + }, + { + "epoch": 0.026108573603577885, + "grad_norm": 2.1613330841064453, + "learning_rate": 4.9915989264922495e-05, + "loss": 5.731, + "step": 4390 + }, + { + "epoch": 0.026114520886858884, + "grad_norm": 2.0314743518829346, + "learning_rate": 4.991595099962507e-05, + "loss": 5.8181, + "step": 4391 + }, + { + "epoch": 0.02612046817013988, + "grad_norm": 2.053994655609131, + "learning_rate": 4.9915912725629755e-05, + "loss": 5.7264, + "step": 4392 + }, + { + "epoch": 0.026126415453420877, + "grad_norm": 1.8720483779907227, + "learning_rate": 4.991587444293655e-05, + "loss": 5.5229, + "step": 4393 + }, + { + "epoch": 0.026132362736701876, + "grad_norm": 1.8745067119598389, + "learning_rate": 4.991583615154547e-05, + "loss": 5.612, + "step": 4394 + }, + { + "epoch": 0.02613831001998287, + "grad_norm": 2.124157428741455, + "learning_rate": 4.9915797851456525e-05, + "loss": 5.7276, + "step": 4395 + }, + { + "epoch": 0.02614425730326387, + "grad_norm": 2.2587873935699463, + "learning_rate": 4.991575954266974e-05, + "loss": 5.7994, + "step": 4396 + }, + { + "epoch": 0.026150204586544865, + "grad_norm": 1.9030078649520874, + "learning_rate": 4.9915721225185116e-05, + "loss": 5.7491, + "step": 4397 + }, + { + "epoch": 0.026156151869825863, + "grad_norm": 2.2278738021850586, + "learning_rate": 4.991568289900267e-05, + "loss": 5.4701, + "step": 4398 + }, + { + "epoch": 0.02616209915310686, + "grad_norm": 2.190974473953247, + "learning_rate": 4.991564456412242e-05, + "loss": 5.6731, + "step": 4399 + }, + { + "epoch": 0.026168046436387857, + "grad_norm": 2.3491454124450684, + "learning_rate": 4.991560622054438e-05, + "loss": 5.4041, + "step": 4400 + }, + { + "epoch": 0.026173993719668855, + "grad_norm": 2.2767796516418457, + "learning_rate": 4.991556786826854e-05, + "loss": 5.9005, + "step": 4401 + }, + { + "epoch": 0.026179941002949854, + "grad_norm": 2.3645145893096924, + "learning_rate": 4.991552950729496e-05, + "loss": 6.3108, + "step": 4402 + }, + { + "epoch": 0.02618588828623085, + "grad_norm": 2.1715476512908936, + "learning_rate": 4.9915491137623605e-05, + "loss": 5.8186, + "step": 4403 + }, + { + "epoch": 0.026191835569511848, + "grad_norm": 2.195758581161499, + "learning_rate": 4.991545275925452e-05, + "loss": 5.692, + "step": 4404 + }, + { + "epoch": 0.026197782852792843, + "grad_norm": 2.1124489307403564, + "learning_rate": 4.9915414372187705e-05, + "loss": 5.6582, + "step": 4405 + }, + { + "epoch": 0.02620373013607384, + "grad_norm": 1.9873831272125244, + "learning_rate": 4.991537597642317e-05, + "loss": 5.6309, + "step": 4406 + }, + { + "epoch": 0.02620967741935484, + "grad_norm": 1.9675770998001099, + "learning_rate": 4.991533757196094e-05, + "loss": 5.7095, + "step": 4407 + }, + { + "epoch": 0.026215624702635835, + "grad_norm": 1.9072648286819458, + "learning_rate": 4.991529915880103e-05, + "loss": 5.6449, + "step": 4408 + }, + { + "epoch": 0.026221571985916833, + "grad_norm": 2.3060495853424072, + "learning_rate": 4.9915260736943435e-05, + "loss": 5.6712, + "step": 4409 + }, + { + "epoch": 0.026227519269197832, + "grad_norm": 2.4438107013702393, + "learning_rate": 4.991522230638819e-05, + "loss": 5.2384, + "step": 4410 + }, + { + "epoch": 0.026233466552478827, + "grad_norm": 1.8102613687515259, + "learning_rate": 4.991518386713529e-05, + "loss": 5.5508, + "step": 4411 + }, + { + "epoch": 0.026239413835759826, + "grad_norm": 2.0226693153381348, + "learning_rate": 4.991514541918476e-05, + "loss": 5.4049, + "step": 4412 + }, + { + "epoch": 0.02624536111904082, + "grad_norm": 2.261418104171753, + "learning_rate": 4.991510696253661e-05, + "loss": 5.3324, + "step": 4413 + }, + { + "epoch": 0.02625130840232182, + "grad_norm": 2.232844352722168, + "learning_rate": 4.9915068497190856e-05, + "loss": 5.2601, + "step": 4414 + }, + { + "epoch": 0.026257255685602818, + "grad_norm": 2.2306487560272217, + "learning_rate": 4.99150300231475e-05, + "loss": 5.3329, + "step": 4415 + }, + { + "epoch": 0.026263202968883813, + "grad_norm": 2.1368730068206787, + "learning_rate": 4.9914991540406574e-05, + "loss": 5.573, + "step": 4416 + }, + { + "epoch": 0.02626915025216481, + "grad_norm": 1.984078288078308, + "learning_rate": 4.991495304896808e-05, + "loss": 5.6518, + "step": 4417 + }, + { + "epoch": 0.02627509753544581, + "grad_norm": 2.0585875511169434, + "learning_rate": 4.9914914548832034e-05, + "loss": 5.7076, + "step": 4418 + }, + { + "epoch": 0.026281044818726805, + "grad_norm": 1.9880858659744263, + "learning_rate": 4.991487603999845e-05, + "loss": 5.6533, + "step": 4419 + }, + { + "epoch": 0.026286992102007804, + "grad_norm": 2.0475687980651855, + "learning_rate": 4.991483752246734e-05, + "loss": 5.6311, + "step": 4420 + }, + { + "epoch": 0.0262929393852888, + "grad_norm": 2.2796714305877686, + "learning_rate": 4.991479899623871e-05, + "loss": 5.364, + "step": 4421 + }, + { + "epoch": 0.026298886668569797, + "grad_norm": 1.8535730838775635, + "learning_rate": 4.991476046131259e-05, + "loss": 5.6153, + "step": 4422 + }, + { + "epoch": 0.026304833951850796, + "grad_norm": 1.97511887550354, + "learning_rate": 4.9914721917688976e-05, + "loss": 5.5682, + "step": 4423 + }, + { + "epoch": 0.02631078123513179, + "grad_norm": 1.9052705764770508, + "learning_rate": 4.99146833653679e-05, + "loss": 5.5609, + "step": 4424 + }, + { + "epoch": 0.02631672851841279, + "grad_norm": 1.9997434616088867, + "learning_rate": 4.9914644804349356e-05, + "loss": 5.6196, + "step": 4425 + }, + { + "epoch": 0.026322675801693788, + "grad_norm": 1.6116957664489746, + "learning_rate": 4.991460623463337e-05, + "loss": 5.5003, + "step": 4426 + }, + { + "epoch": 0.026328623084974783, + "grad_norm": 1.8156583309173584, + "learning_rate": 4.991456765621996e-05, + "loss": 5.5875, + "step": 4427 + }, + { + "epoch": 0.02633457036825578, + "grad_norm": 2.0364272594451904, + "learning_rate": 4.991452906910912e-05, + "loss": 5.6541, + "step": 4428 + }, + { + "epoch": 0.026340517651536777, + "grad_norm": 1.8430767059326172, + "learning_rate": 4.991449047330088e-05, + "loss": 5.5408, + "step": 4429 + }, + { + "epoch": 0.026346464934817775, + "grad_norm": 2.049476385116577, + "learning_rate": 4.991445186879525e-05, + "loss": 5.5644, + "step": 4430 + }, + { + "epoch": 0.026352412218098774, + "grad_norm": 1.9186240434646606, + "learning_rate": 4.991441325559224e-05, + "loss": 5.5977, + "step": 4431 + }, + { + "epoch": 0.02635835950137977, + "grad_norm": 1.80244779586792, + "learning_rate": 4.991437463369186e-05, + "loss": 5.5114, + "step": 4432 + }, + { + "epoch": 0.026364306784660767, + "grad_norm": 2.2580177783966064, + "learning_rate": 4.991433600309414e-05, + "loss": 5.4132, + "step": 4433 + }, + { + "epoch": 0.026370254067941763, + "grad_norm": 2.0970637798309326, + "learning_rate": 4.991429736379908e-05, + "loss": 5.6211, + "step": 4434 + }, + { + "epoch": 0.02637620135122276, + "grad_norm": 2.0690932273864746, + "learning_rate": 4.9914258715806696e-05, + "loss": 5.6511, + "step": 4435 + }, + { + "epoch": 0.02638214863450376, + "grad_norm": 2.063052177429199, + "learning_rate": 4.9914220059117e-05, + "loss": 5.5169, + "step": 4436 + }, + { + "epoch": 0.026388095917784755, + "grad_norm": 1.990708827972412, + "learning_rate": 4.991418139373001e-05, + "loss": 5.5018, + "step": 4437 + }, + { + "epoch": 0.026394043201065753, + "grad_norm": 2.1311633586883545, + "learning_rate": 4.9914142719645736e-05, + "loss": 5.4714, + "step": 4438 + }, + { + "epoch": 0.026399990484346752, + "grad_norm": 1.7688508033752441, + "learning_rate": 4.991410403686419e-05, + "loss": 5.5208, + "step": 4439 + }, + { + "epoch": 0.026405937767627747, + "grad_norm": 2.3486130237579346, + "learning_rate": 4.9914065345385383e-05, + "loss": 5.4524, + "step": 4440 + }, + { + "epoch": 0.026411885050908745, + "grad_norm": 2.0333707332611084, + "learning_rate": 4.9914026645209344e-05, + "loss": 5.6747, + "step": 4441 + }, + { + "epoch": 0.02641783233418974, + "grad_norm": 1.8731845617294312, + "learning_rate": 4.991398793633607e-05, + "loss": 5.6436, + "step": 4442 + }, + { + "epoch": 0.02642377961747074, + "grad_norm": 2.003361225128174, + "learning_rate": 4.991394921876558e-05, + "loss": 5.4628, + "step": 4443 + }, + { + "epoch": 0.026429726900751738, + "grad_norm": 2.1195411682128906, + "learning_rate": 4.991391049249789e-05, + "loss": 5.4096, + "step": 4444 + }, + { + "epoch": 0.026435674184032733, + "grad_norm": 1.857364535331726, + "learning_rate": 4.991387175753301e-05, + "loss": 5.3928, + "step": 4445 + }, + { + "epoch": 0.02644162146731373, + "grad_norm": 1.8932915925979614, + "learning_rate": 4.991383301387095e-05, + "loss": 5.4917, + "step": 4446 + }, + { + "epoch": 0.02644756875059473, + "grad_norm": 1.8743010759353638, + "learning_rate": 4.991379426151174e-05, + "loss": 5.6766, + "step": 4447 + }, + { + "epoch": 0.026453516033875725, + "grad_norm": 1.910796046257019, + "learning_rate": 4.991375550045537e-05, + "loss": 5.4347, + "step": 4448 + }, + { + "epoch": 0.026459463317156724, + "grad_norm": 1.7901744842529297, + "learning_rate": 4.991371673070187e-05, + "loss": 5.5339, + "step": 4449 + }, + { + "epoch": 0.02646541060043772, + "grad_norm": 1.86943519115448, + "learning_rate": 4.9913677952251244e-05, + "loss": 5.4867, + "step": 4450 + }, + { + "epoch": 0.026471357883718717, + "grad_norm": 1.8662208318710327, + "learning_rate": 4.991363916510352e-05, + "loss": 5.4992, + "step": 4451 + }, + { + "epoch": 0.026477305166999716, + "grad_norm": 1.7465355396270752, + "learning_rate": 4.99136003692587e-05, + "loss": 5.5243, + "step": 4452 + }, + { + "epoch": 0.02648325245028071, + "grad_norm": 1.9097687005996704, + "learning_rate": 4.9913561564716794e-05, + "loss": 5.5096, + "step": 4453 + }, + { + "epoch": 0.02648919973356171, + "grad_norm": 2.1472127437591553, + "learning_rate": 4.991352275147783e-05, + "loss": 5.4462, + "step": 4454 + }, + { + "epoch": 0.026495147016842708, + "grad_norm": 2.3966939449310303, + "learning_rate": 4.9913483929541806e-05, + "loss": 5.2938, + "step": 4455 + }, + { + "epoch": 0.026501094300123703, + "grad_norm": 2.1738977432250977, + "learning_rate": 4.991344509890874e-05, + "loss": 5.317, + "step": 4456 + }, + { + "epoch": 0.0265070415834047, + "grad_norm": 1.963944435119629, + "learning_rate": 4.9913406259578646e-05, + "loss": 5.3827, + "step": 4457 + }, + { + "epoch": 0.026512988866685697, + "grad_norm": 2.1755871772766113, + "learning_rate": 4.991336741155155e-05, + "loss": 5.2941, + "step": 4458 + }, + { + "epoch": 0.026518936149966695, + "grad_norm": 2.2461934089660645, + "learning_rate": 4.991332855482744e-05, + "loss": 5.3503, + "step": 4459 + }, + { + "epoch": 0.026524883433247694, + "grad_norm": 2.2270491123199463, + "learning_rate": 4.9913289689406355e-05, + "loss": 5.417, + "step": 4460 + }, + { + "epoch": 0.02653083071652869, + "grad_norm": 2.437074661254883, + "learning_rate": 4.991325081528829e-05, + "loss": 5.1938, + "step": 4461 + }, + { + "epoch": 0.026536777999809687, + "grad_norm": 2.159170150756836, + "learning_rate": 4.991321193247328e-05, + "loss": 5.2088, + "step": 4462 + }, + { + "epoch": 0.026542725283090682, + "grad_norm": 2.08797287940979, + "learning_rate": 4.9913173040961315e-05, + "loss": 5.1829, + "step": 4463 + }, + { + "epoch": 0.02654867256637168, + "grad_norm": 2.805191993713379, + "learning_rate": 4.991313414075242e-05, + "loss": 6.3049, + "step": 4464 + }, + { + "epoch": 0.02655461984965268, + "grad_norm": 2.3204843997955322, + "learning_rate": 4.991309523184661e-05, + "loss": 5.3831, + "step": 4465 + }, + { + "epoch": 0.026560567132933675, + "grad_norm": 2.217212200164795, + "learning_rate": 4.991305631424389e-05, + "loss": 5.4647, + "step": 4466 + }, + { + "epoch": 0.026566514416214673, + "grad_norm": 2.1094207763671875, + "learning_rate": 4.991301738794429e-05, + "loss": 5.5837, + "step": 4467 + }, + { + "epoch": 0.026572461699495672, + "grad_norm": 2.225660562515259, + "learning_rate": 4.99129784529478e-05, + "loss": 5.8316, + "step": 4468 + }, + { + "epoch": 0.026578408982776667, + "grad_norm": 2.361238956451416, + "learning_rate": 4.991293950925446e-05, + "loss": 5.8358, + "step": 4469 + }, + { + "epoch": 0.026584356266057665, + "grad_norm": 2.3268609046936035, + "learning_rate": 4.991290055686426e-05, + "loss": 5.732, + "step": 4470 + }, + { + "epoch": 0.02659030354933866, + "grad_norm": 2.1456172466278076, + "learning_rate": 4.9912861595777226e-05, + "loss": 5.9, + "step": 4471 + }, + { + "epoch": 0.02659625083261966, + "grad_norm": 2.114696979522705, + "learning_rate": 4.991282262599337e-05, + "loss": 5.4464, + "step": 4472 + }, + { + "epoch": 0.026602198115900658, + "grad_norm": 1.7981528043746948, + "learning_rate": 4.9912783647512705e-05, + "loss": 5.5053, + "step": 4473 + }, + { + "epoch": 0.026608145399181653, + "grad_norm": 1.9743404388427734, + "learning_rate": 4.9912744660335245e-05, + "loss": 5.5877, + "step": 4474 + }, + { + "epoch": 0.02661409268246265, + "grad_norm": 2.052358865737915, + "learning_rate": 4.991270566446101e-05, + "loss": 5.5891, + "step": 4475 + }, + { + "epoch": 0.02662003996574365, + "grad_norm": 2.1602041721343994, + "learning_rate": 4.991266665989e-05, + "loss": 5.581, + "step": 4476 + }, + { + "epoch": 0.026625987249024645, + "grad_norm": 2.241586685180664, + "learning_rate": 4.9912627646622236e-05, + "loss": 5.5375, + "step": 4477 + }, + { + "epoch": 0.026631934532305643, + "grad_norm": 1.7952601909637451, + "learning_rate": 4.991258862465773e-05, + "loss": 5.5273, + "step": 4478 + }, + { + "epoch": 0.02663788181558664, + "grad_norm": 1.9767752885818481, + "learning_rate": 4.991254959399649e-05, + "loss": 5.4476, + "step": 4479 + }, + { + "epoch": 0.026643829098867637, + "grad_norm": 1.7997682094573975, + "learning_rate": 4.991251055463855e-05, + "loss": 5.5666, + "step": 4480 + }, + { + "epoch": 0.026649776382148636, + "grad_norm": 2.3247575759887695, + "learning_rate": 4.9912471506583905e-05, + "loss": 5.5247, + "step": 4481 + }, + { + "epoch": 0.02665572366542963, + "grad_norm": 2.165900230407715, + "learning_rate": 4.991243244983257e-05, + "loss": 5.6807, + "step": 4482 + }, + { + "epoch": 0.02666167094871063, + "grad_norm": 2.598257303237915, + "learning_rate": 4.991239338438456e-05, + "loss": 5.6609, + "step": 4483 + }, + { + "epoch": 0.026667618231991628, + "grad_norm": 2.2752041816711426, + "learning_rate": 4.991235431023989e-05, + "loss": 5.5199, + "step": 4484 + }, + { + "epoch": 0.026673565515272623, + "grad_norm": 2.3482842445373535, + "learning_rate": 4.9912315227398586e-05, + "loss": 5.6438, + "step": 4485 + }, + { + "epoch": 0.02667951279855362, + "grad_norm": 2.034403085708618, + "learning_rate": 4.991227613586065e-05, + "loss": 5.6191, + "step": 4486 + }, + { + "epoch": 0.026685460081834617, + "grad_norm": 1.9002971649169922, + "learning_rate": 4.9912237035626085e-05, + "loss": 5.6627, + "step": 4487 + }, + { + "epoch": 0.026691407365115615, + "grad_norm": 2.0305564403533936, + "learning_rate": 4.9912197926694924e-05, + "loss": 5.7009, + "step": 4488 + }, + { + "epoch": 0.026697354648396614, + "grad_norm": 2.029777765274048, + "learning_rate": 4.991215880906717e-05, + "loss": 5.5201, + "step": 4489 + }, + { + "epoch": 0.02670330193167761, + "grad_norm": 1.8889492750167847, + "learning_rate": 4.991211968274283e-05, + "loss": 5.602, + "step": 4490 + }, + { + "epoch": 0.026709249214958607, + "grad_norm": 1.9616930484771729, + "learning_rate": 4.9912080547721934e-05, + "loss": 5.5352, + "step": 4491 + }, + { + "epoch": 0.026715196498239602, + "grad_norm": 2.449345827102661, + "learning_rate": 4.9912041404004485e-05, + "loss": 5.7103, + "step": 4492 + }, + { + "epoch": 0.0267211437815206, + "grad_norm": 2.5550389289855957, + "learning_rate": 4.991200225159051e-05, + "loss": 5.5593, + "step": 4493 + }, + { + "epoch": 0.0267270910648016, + "grad_norm": 2.2512362003326416, + "learning_rate": 4.9911963090479996e-05, + "loss": 5.6329, + "step": 4494 + }, + { + "epoch": 0.026733038348082595, + "grad_norm": 2.0346968173980713, + "learning_rate": 4.9911923920672984e-05, + "loss": 5.5966, + "step": 4495 + }, + { + "epoch": 0.026738985631363593, + "grad_norm": 2.013648271560669, + "learning_rate": 4.991188474216947e-05, + "loss": 5.6532, + "step": 4496 + }, + { + "epoch": 0.026744932914644592, + "grad_norm": 1.8361715078353882, + "learning_rate": 4.9911845554969484e-05, + "loss": 5.519, + "step": 4497 + }, + { + "epoch": 0.026750880197925587, + "grad_norm": 2.1487016677856445, + "learning_rate": 4.991180635907302e-05, + "loss": 5.436, + "step": 4498 + }, + { + "epoch": 0.026756827481206585, + "grad_norm": 2.277714967727661, + "learning_rate": 4.991176715448011e-05, + "loss": 5.3574, + "step": 4499 + }, + { + "epoch": 0.02676277476448758, + "grad_norm": 2.3313565254211426, + "learning_rate": 4.9911727941190755e-05, + "loss": 5.5408, + "step": 4500 + }, + { + "epoch": 0.02676872204776858, + "grad_norm": 2.105825662612915, + "learning_rate": 4.9911688719204975e-05, + "loss": 5.4801, + "step": 4501 + }, + { + "epoch": 0.026774669331049578, + "grad_norm": 2.122138261795044, + "learning_rate": 4.991164948852278e-05, + "loss": 5.4645, + "step": 4502 + }, + { + "epoch": 0.026780616614330573, + "grad_norm": 1.8742777109146118, + "learning_rate": 4.991161024914419e-05, + "loss": 5.5646, + "step": 4503 + }, + { + "epoch": 0.02678656389761157, + "grad_norm": 1.762276291847229, + "learning_rate": 4.991157100106921e-05, + "loss": 5.5672, + "step": 4504 + }, + { + "epoch": 0.02679251118089257, + "grad_norm": 1.9174740314483643, + "learning_rate": 4.9911531744297855e-05, + "loss": 5.4296, + "step": 4505 + }, + { + "epoch": 0.026798458464173565, + "grad_norm": 2.0585875511169434, + "learning_rate": 4.991149247883015e-05, + "loss": 5.5685, + "step": 4506 + }, + { + "epoch": 0.026804405747454563, + "grad_norm": 1.8675988912582397, + "learning_rate": 4.9911453204666094e-05, + "loss": 5.4757, + "step": 4507 + }, + { + "epoch": 0.02681035303073556, + "grad_norm": 2.3117783069610596, + "learning_rate": 4.99114139218057e-05, + "loss": 5.7057, + "step": 4508 + }, + { + "epoch": 0.026816300314016557, + "grad_norm": 2.5439465045928955, + "learning_rate": 4.9911374630249007e-05, + "loss": 5.7393, + "step": 4509 + }, + { + "epoch": 0.026822247597297556, + "grad_norm": 2.4611666202545166, + "learning_rate": 4.9911335329996e-05, + "loss": 5.7215, + "step": 4510 + }, + { + "epoch": 0.02682819488057855, + "grad_norm": 2.1540768146514893, + "learning_rate": 4.99112960210467e-05, + "loss": 5.7059, + "step": 4511 + }, + { + "epoch": 0.02683414216385955, + "grad_norm": 2.1183645725250244, + "learning_rate": 4.9911256703401134e-05, + "loss": 5.4454, + "step": 4512 + }, + { + "epoch": 0.026840089447140548, + "grad_norm": 2.1757540702819824, + "learning_rate": 4.9911217377059295e-05, + "loss": 5.6851, + "step": 4513 + }, + { + "epoch": 0.026846036730421543, + "grad_norm": 2.2770378589630127, + "learning_rate": 4.9911178042021214e-05, + "loss": 5.5957, + "step": 4514 + }, + { + "epoch": 0.02685198401370254, + "grad_norm": 2.320993185043335, + "learning_rate": 4.9911138698286895e-05, + "loss": 5.4674, + "step": 4515 + }, + { + "epoch": 0.026857931296983537, + "grad_norm": 2.2340428829193115, + "learning_rate": 4.991109934585636e-05, + "loss": 5.4514, + "step": 4516 + }, + { + "epoch": 0.026863878580264535, + "grad_norm": 2.1531431674957275, + "learning_rate": 4.991105998472962e-05, + "loss": 5.4386, + "step": 4517 + }, + { + "epoch": 0.026869825863545534, + "grad_norm": 2.1567044258117676, + "learning_rate": 4.991102061490667e-05, + "loss": 5.422, + "step": 4518 + }, + { + "epoch": 0.02687577314682653, + "grad_norm": 2.1181681156158447, + "learning_rate": 4.9910981236387554e-05, + "loss": 5.7214, + "step": 4519 + }, + { + "epoch": 0.026881720430107527, + "grad_norm": 2.3410873413085938, + "learning_rate": 4.9910941849172263e-05, + "loss": 5.8603, + "step": 4520 + }, + { + "epoch": 0.026887667713388526, + "grad_norm": 2.4943840503692627, + "learning_rate": 4.9910902453260824e-05, + "loss": 5.7084, + "step": 4521 + }, + { + "epoch": 0.02689361499666952, + "grad_norm": 2.1420044898986816, + "learning_rate": 4.991086304865325e-05, + "loss": 5.528, + "step": 4522 + }, + { + "epoch": 0.02689956227995052, + "grad_norm": 2.3257980346679688, + "learning_rate": 4.991082363534955e-05, + "loss": 5.6791, + "step": 4523 + }, + { + "epoch": 0.026905509563231515, + "grad_norm": 2.335049867630005, + "learning_rate": 4.991078421334974e-05, + "loss": 5.6184, + "step": 4524 + }, + { + "epoch": 0.026911456846512513, + "grad_norm": 3.7381551265716553, + "learning_rate": 4.9910744782653825e-05, + "loss": 5.954, + "step": 4525 + }, + { + "epoch": 0.02691740412979351, + "grad_norm": 3.1807587146759033, + "learning_rate": 4.991070534326183e-05, + "loss": 6.5662, + "step": 4526 + }, + { + "epoch": 0.026923351413074507, + "grad_norm": 2.378366708755493, + "learning_rate": 4.991066589517376e-05, + "loss": 6.2312, + "step": 4527 + }, + { + "epoch": 0.026929298696355505, + "grad_norm": 2.5797109603881836, + "learning_rate": 4.991062643838964e-05, + "loss": 5.9969, + "step": 4528 + }, + { + "epoch": 0.0269352459796365, + "grad_norm": 2.522815704345703, + "learning_rate": 4.991058697290948e-05, + "loss": 5.919, + "step": 4529 + }, + { + "epoch": 0.0269411932629175, + "grad_norm": 2.5215437412261963, + "learning_rate": 4.991054749873329e-05, + "loss": 5.8812, + "step": 4530 + }, + { + "epoch": 0.026947140546198498, + "grad_norm": 2.1608335971832275, + "learning_rate": 4.991050801586108e-05, + "loss": 5.8381, + "step": 4531 + }, + { + "epoch": 0.026953087829479493, + "grad_norm": 2.37752366065979, + "learning_rate": 4.991046852429288e-05, + "loss": 5.7612, + "step": 4532 + }, + { + "epoch": 0.02695903511276049, + "grad_norm": 2.117534875869751, + "learning_rate": 4.991042902402868e-05, + "loss": 5.6762, + "step": 4533 + }, + { + "epoch": 0.02696498239604149, + "grad_norm": 2.595797061920166, + "learning_rate": 4.991038951506851e-05, + "loss": 6.19, + "step": 4534 + }, + { + "epoch": 0.026970929679322485, + "grad_norm": 2.2216086387634277, + "learning_rate": 4.991034999741239e-05, + "loss": 6.1612, + "step": 4535 + }, + { + "epoch": 0.026976876962603483, + "grad_norm": 2.829735279083252, + "learning_rate": 4.991031047106032e-05, + "loss": 5.6955, + "step": 4536 + }, + { + "epoch": 0.02698282424588448, + "grad_norm": 2.5018115043640137, + "learning_rate": 4.991027093601231e-05, + "loss": 5.4966, + "step": 4537 + }, + { + "epoch": 0.026988771529165477, + "grad_norm": 2.334052085876465, + "learning_rate": 4.9910231392268385e-05, + "loss": 6.1603, + "step": 4538 + }, + { + "epoch": 0.026994718812446476, + "grad_norm": 2.497351884841919, + "learning_rate": 4.991019183982856e-05, + "loss": 6.0128, + "step": 4539 + }, + { + "epoch": 0.02700066609572747, + "grad_norm": 2.2976267337799072, + "learning_rate": 4.991015227869284e-05, + "loss": 5.6696, + "step": 4540 + }, + { + "epoch": 0.02700661337900847, + "grad_norm": 2.6851742267608643, + "learning_rate": 4.991011270886125e-05, + "loss": 5.7996, + "step": 4541 + }, + { + "epoch": 0.027012560662289468, + "grad_norm": 2.531029224395752, + "learning_rate": 4.991007313033379e-05, + "loss": 5.6671, + "step": 4542 + }, + { + "epoch": 0.027018507945570463, + "grad_norm": 2.195552110671997, + "learning_rate": 4.991003354311048e-05, + "loss": 6.3213, + "step": 4543 + }, + { + "epoch": 0.02702445522885146, + "grad_norm": 2.2973361015319824, + "learning_rate": 4.9909993947191336e-05, + "loss": 6.1523, + "step": 4544 + }, + { + "epoch": 0.027030402512132456, + "grad_norm": 2.4766385555267334, + "learning_rate": 4.990995434257637e-05, + "loss": 5.7894, + "step": 4545 + }, + { + "epoch": 0.027036349795413455, + "grad_norm": 2.486384630203247, + "learning_rate": 4.9909914729265606e-05, + "loss": 6.2814, + "step": 4546 + }, + { + "epoch": 0.027042297078694454, + "grad_norm": 2.5054233074188232, + "learning_rate": 4.9909875107259036e-05, + "loss": 6.2859, + "step": 4547 + }, + { + "epoch": 0.02704824436197545, + "grad_norm": 2.70576548576355, + "learning_rate": 4.990983547655669e-05, + "loss": 6.2424, + "step": 4548 + }, + { + "epoch": 0.027054191645256447, + "grad_norm": 3.0937716960906982, + "learning_rate": 4.990979583715858e-05, + "loss": 6.4392, + "step": 4549 + }, + { + "epoch": 0.027060138928537446, + "grad_norm": 2.6290581226348877, + "learning_rate": 4.9909756189064714e-05, + "loss": 6.3565, + "step": 4550 + }, + { + "epoch": 0.02706608621181844, + "grad_norm": 2.5180583000183105, + "learning_rate": 4.990971653227511e-05, + "loss": 6.1482, + "step": 4551 + }, + { + "epoch": 0.02707203349509944, + "grad_norm": 2.6096208095550537, + "learning_rate": 4.990967686678978e-05, + "loss": 5.7724, + "step": 4552 + }, + { + "epoch": 0.027077980778380435, + "grad_norm": 3.187276840209961, + "learning_rate": 4.990963719260874e-05, + "loss": 5.682, + "step": 4553 + }, + { + "epoch": 0.027083928061661433, + "grad_norm": 2.3522419929504395, + "learning_rate": 4.9909597509732006e-05, + "loss": 6.7045, + "step": 4554 + }, + { + "epoch": 0.02708987534494243, + "grad_norm": 2.6016366481781006, + "learning_rate": 4.990955781815959e-05, + "loss": 6.0653, + "step": 4555 + }, + { + "epoch": 0.027095822628223427, + "grad_norm": 2.5409183502197266, + "learning_rate": 4.99095181178915e-05, + "loss": 5.861, + "step": 4556 + }, + { + "epoch": 0.027101769911504425, + "grad_norm": 2.5297863483428955, + "learning_rate": 4.9909478408927754e-05, + "loss": 5.5301, + "step": 4557 + }, + { + "epoch": 0.02710771719478542, + "grad_norm": 2.4822275638580322, + "learning_rate": 4.990943869126837e-05, + "loss": 5.6919, + "step": 4558 + }, + { + "epoch": 0.02711366447806642, + "grad_norm": 2.3832650184631348, + "learning_rate": 4.9909398964913365e-05, + "loss": 5.9589, + "step": 4559 + }, + { + "epoch": 0.027119611761347417, + "grad_norm": 2.0038483142852783, + "learning_rate": 4.9909359229862734e-05, + "loss": 6.1847, + "step": 4560 + }, + { + "epoch": 0.027125559044628413, + "grad_norm": 2.3678700923919678, + "learning_rate": 4.990931948611651e-05, + "loss": 6.4794, + "step": 4561 + }, + { + "epoch": 0.02713150632790941, + "grad_norm": 2.7433204650878906, + "learning_rate": 4.990927973367469e-05, + "loss": 6.6997, + "step": 4562 + }, + { + "epoch": 0.02713745361119041, + "grad_norm": 3.5579798221588135, + "learning_rate": 4.990923997253731e-05, + "loss": 6.1809, + "step": 4563 + }, + { + "epoch": 0.027143400894471405, + "grad_norm": 3.254093647003174, + "learning_rate": 4.990920020270436e-05, + "loss": 6.1446, + "step": 4564 + }, + { + "epoch": 0.027149348177752403, + "grad_norm": 3.0661215782165527, + "learning_rate": 4.990916042417588e-05, + "loss": 6.6702, + "step": 4565 + }, + { + "epoch": 0.0271552954610334, + "grad_norm": 2.641291618347168, + "learning_rate": 4.9909120636951864e-05, + "loss": 6.4951, + "step": 4566 + }, + { + "epoch": 0.027161242744314397, + "grad_norm": 2.050675868988037, + "learning_rate": 4.990908084103233e-05, + "loss": 6.3365, + "step": 4567 + }, + { + "epoch": 0.027167190027595396, + "grad_norm": 2.081108331680298, + "learning_rate": 4.990904103641729e-05, + "loss": 6.1874, + "step": 4568 + }, + { + "epoch": 0.02717313731087639, + "grad_norm": 2.5833899974823, + "learning_rate": 4.9909001223106766e-05, + "loss": 6.0892, + "step": 4569 + }, + { + "epoch": 0.02717908459415739, + "grad_norm": 2.7387397289276123, + "learning_rate": 4.990896140110076e-05, + "loss": 6.1036, + "step": 4570 + }, + { + "epoch": 0.027185031877438388, + "grad_norm": 2.5665578842163086, + "learning_rate": 4.99089215703993e-05, + "loss": 5.9577, + "step": 4571 + }, + { + "epoch": 0.027190979160719383, + "grad_norm": 2.3825178146362305, + "learning_rate": 4.990888173100239e-05, + "loss": 5.9654, + "step": 4572 + }, + { + "epoch": 0.02719692644400038, + "grad_norm": 2.562509059906006, + "learning_rate": 4.990884188291005e-05, + "loss": 6.009, + "step": 4573 + }, + { + "epoch": 0.027202873727281376, + "grad_norm": 2.141941785812378, + "learning_rate": 4.9908802026122284e-05, + "loss": 5.8315, + "step": 4574 + }, + { + "epoch": 0.027208821010562375, + "grad_norm": 2.5348474979400635, + "learning_rate": 4.990876216063912e-05, + "loss": 6.3763, + "step": 4575 + }, + { + "epoch": 0.027214768293843374, + "grad_norm": 2.751520872116089, + "learning_rate": 4.990872228646056e-05, + "loss": 6.5684, + "step": 4576 + }, + { + "epoch": 0.02722071557712437, + "grad_norm": 4.626354694366455, + "learning_rate": 4.990868240358662e-05, + "loss": 6.115, + "step": 4577 + }, + { + "epoch": 0.027226662860405367, + "grad_norm": 2.648479700088501, + "learning_rate": 4.990864251201732e-05, + "loss": 6.0879, + "step": 4578 + }, + { + "epoch": 0.027232610143686366, + "grad_norm": 2.21056866645813, + "learning_rate": 4.990860261175268e-05, + "loss": 6.2923, + "step": 4579 + }, + { + "epoch": 0.02723855742696736, + "grad_norm": 2.3460421562194824, + "learning_rate": 4.9908562702792684e-05, + "loss": 6.4044, + "step": 4580 + }, + { + "epoch": 0.02724450471024836, + "grad_norm": 2.6087262630462646, + "learning_rate": 4.990852278513738e-05, + "loss": 6.5131, + "step": 4581 + }, + { + "epoch": 0.027250451993529354, + "grad_norm": 2.6969377994537354, + "learning_rate": 4.9908482858786765e-05, + "loss": 6.3483, + "step": 4582 + }, + { + "epoch": 0.027256399276810353, + "grad_norm": 2.64043927192688, + "learning_rate": 4.990844292374085e-05, + "loss": 5.8712, + "step": 4583 + }, + { + "epoch": 0.02726234656009135, + "grad_norm": 2.5738205909729004, + "learning_rate": 4.9908402979999654e-05, + "loss": 5.9165, + "step": 4584 + }, + { + "epoch": 0.027268293843372347, + "grad_norm": 2.2725625038146973, + "learning_rate": 4.99083630275632e-05, + "loss": 5.8454, + "step": 4585 + }, + { + "epoch": 0.027274241126653345, + "grad_norm": 2.5911824703216553, + "learning_rate": 4.9908323066431494e-05, + "loss": 5.6729, + "step": 4586 + }, + { + "epoch": 0.02728018840993434, + "grad_norm": 2.6691668033599854, + "learning_rate": 4.9908283096604546e-05, + "loss": 5.7726, + "step": 4587 + }, + { + "epoch": 0.02728613569321534, + "grad_norm": 2.6512796878814697, + "learning_rate": 4.990824311808238e-05, + "loss": 6.1295, + "step": 4588 + }, + { + "epoch": 0.027292082976496337, + "grad_norm": 2.816943645477295, + "learning_rate": 4.9908203130865e-05, + "loss": 5.5172, + "step": 4589 + }, + { + "epoch": 0.027298030259777332, + "grad_norm": 2.6252098083496094, + "learning_rate": 4.990816313495242e-05, + "loss": 5.5955, + "step": 4590 + }, + { + "epoch": 0.02730397754305833, + "grad_norm": 2.3711740970611572, + "learning_rate": 4.990812313034466e-05, + "loss": 5.3348, + "step": 4591 + }, + { + "epoch": 0.02730992482633933, + "grad_norm": 2.355436086654663, + "learning_rate": 4.990808311704173e-05, + "loss": 5.6171, + "step": 4592 + }, + { + "epoch": 0.027315872109620325, + "grad_norm": 2.3344695568084717, + "learning_rate": 4.990804309504365e-05, + "loss": 5.46, + "step": 4593 + }, + { + "epoch": 0.027321819392901323, + "grad_norm": 2.3890786170959473, + "learning_rate": 4.990800306435043e-05, + "loss": 5.5658, + "step": 4594 + }, + { + "epoch": 0.02732776667618232, + "grad_norm": 2.5606987476348877, + "learning_rate": 4.990796302496208e-05, + "loss": 5.4778, + "step": 4595 + }, + { + "epoch": 0.027333713959463317, + "grad_norm": 2.2443172931671143, + "learning_rate": 4.9907922976878616e-05, + "loss": 5.486, + "step": 4596 + }, + { + "epoch": 0.027339661242744315, + "grad_norm": 2.3428351879119873, + "learning_rate": 4.990788292010005e-05, + "loss": 5.3332, + "step": 4597 + }, + { + "epoch": 0.02734560852602531, + "grad_norm": 2.6336300373077393, + "learning_rate": 4.9907842854626406e-05, + "loss": 5.4606, + "step": 4598 + }, + { + "epoch": 0.02735155580930631, + "grad_norm": 2.3052382469177246, + "learning_rate": 4.990780278045769e-05, + "loss": 5.4028, + "step": 4599 + }, + { + "epoch": 0.027357503092587308, + "grad_norm": 2.4661340713500977, + "learning_rate": 4.990776269759392e-05, + "loss": 5.6011, + "step": 4600 + }, + { + "epoch": 0.027363450375868303, + "grad_norm": 2.400527238845825, + "learning_rate": 4.99077226060351e-05, + "loss": 5.5952, + "step": 4601 + }, + { + "epoch": 0.0273693976591493, + "grad_norm": 2.364900827407837, + "learning_rate": 4.9907682505781256e-05, + "loss": 5.2125, + "step": 4602 + }, + { + "epoch": 0.027375344942430296, + "grad_norm": 2.383680820465088, + "learning_rate": 4.99076423968324e-05, + "loss": 5.4253, + "step": 4603 + }, + { + "epoch": 0.027381292225711295, + "grad_norm": 2.681903839111328, + "learning_rate": 4.990760227918854e-05, + "loss": 5.3741, + "step": 4604 + }, + { + "epoch": 0.027387239508992293, + "grad_norm": 2.3454341888427734, + "learning_rate": 4.990756215284969e-05, + "loss": 5.3032, + "step": 4605 + }, + { + "epoch": 0.02739318679227329, + "grad_norm": 2.439807653427124, + "learning_rate": 4.990752201781587e-05, + "loss": 5.3368, + "step": 4606 + }, + { + "epoch": 0.027399134075554287, + "grad_norm": 2.938976764678955, + "learning_rate": 4.990748187408709e-05, + "loss": 6.1251, + "step": 4607 + }, + { + "epoch": 0.027405081358835286, + "grad_norm": 3.353973865509033, + "learning_rate": 4.990744172166337e-05, + "loss": 6.72, + "step": 4608 + }, + { + "epoch": 0.02741102864211628, + "grad_norm": 2.4661834239959717, + "learning_rate": 4.990740156054472e-05, + "loss": 5.7156, + "step": 4609 + }, + { + "epoch": 0.02741697592539728, + "grad_norm": 2.303976058959961, + "learning_rate": 4.990736139073116e-05, + "loss": 5.3493, + "step": 4610 + }, + { + "epoch": 0.027422923208678274, + "grad_norm": 2.4225149154663086, + "learning_rate": 4.990732121222268e-05, + "loss": 5.4831, + "step": 4611 + }, + { + "epoch": 0.027428870491959273, + "grad_norm": 2.5566627979278564, + "learning_rate": 4.990728102501932e-05, + "loss": 5.9159, + "step": 4612 + }, + { + "epoch": 0.02743481777524027, + "grad_norm": 2.64258074760437, + "learning_rate": 4.9907240829121085e-05, + "loss": 6.7137, + "step": 4613 + }, + { + "epoch": 0.027440765058521267, + "grad_norm": 2.967501640319824, + "learning_rate": 4.9907200624527986e-05, + "loss": 6.3333, + "step": 4614 + }, + { + "epoch": 0.027446712341802265, + "grad_norm": 2.6084952354431152, + "learning_rate": 4.990716041124005e-05, + "loss": 6.1201, + "step": 4615 + }, + { + "epoch": 0.02745265962508326, + "grad_norm": 3.0721616744995117, + "learning_rate": 4.990712018925727e-05, + "loss": 6.396, + "step": 4616 + }, + { + "epoch": 0.02745860690836426, + "grad_norm": 2.888263463973999, + "learning_rate": 4.990707995857968e-05, + "loss": 6.0773, + "step": 4617 + }, + { + "epoch": 0.027464554191645257, + "grad_norm": 2.7506093978881836, + "learning_rate": 4.990703971920728e-05, + "loss": 5.9909, + "step": 4618 + }, + { + "epoch": 0.027470501474926252, + "grad_norm": 2.8273298740386963, + "learning_rate": 4.99069994711401e-05, + "loss": 5.9591, + "step": 4619 + }, + { + "epoch": 0.02747644875820725, + "grad_norm": 2.451011896133423, + "learning_rate": 4.990695921437813e-05, + "loss": 6.1596, + "step": 4620 + }, + { + "epoch": 0.02748239604148825, + "grad_norm": 2.762265920639038, + "learning_rate": 4.990691894892141e-05, + "loss": 6.6233, + "step": 4621 + }, + { + "epoch": 0.027488343324769245, + "grad_norm": 2.4570846557617188, + "learning_rate": 4.990687867476994e-05, + "loss": 6.5025, + "step": 4622 + }, + { + "epoch": 0.027494290608050243, + "grad_norm": 3.108992576599121, + "learning_rate": 4.990683839192373e-05, + "loss": 5.921, + "step": 4623 + }, + { + "epoch": 0.02750023789133124, + "grad_norm": 2.887580156326294, + "learning_rate": 4.99067981003828e-05, + "loss": 5.9266, + "step": 4624 + }, + { + "epoch": 0.027506185174612237, + "grad_norm": 3.083556890487671, + "learning_rate": 4.990675780014718e-05, + "loss": 5.765, + "step": 4625 + }, + { + "epoch": 0.027512132457893235, + "grad_norm": 2.710231304168701, + "learning_rate": 4.990671749121685e-05, + "loss": 5.7674, + "step": 4626 + }, + { + "epoch": 0.02751807974117423, + "grad_norm": 2.738926410675049, + "learning_rate": 4.9906677173591845e-05, + "loss": 5.801, + "step": 4627 + }, + { + "epoch": 0.02752402702445523, + "grad_norm": 2.6737735271453857, + "learning_rate": 4.9906636847272176e-05, + "loss": 6.2581, + "step": 4628 + }, + { + "epoch": 0.027529974307736228, + "grad_norm": 2.623969554901123, + "learning_rate": 4.990659651225786e-05, + "loss": 5.5044, + "step": 4629 + }, + { + "epoch": 0.027535921591017223, + "grad_norm": 3.069460153579712, + "learning_rate": 4.990655616854891e-05, + "loss": 5.9639, + "step": 4630 + }, + { + "epoch": 0.02754186887429822, + "grad_norm": 2.6889147758483887, + "learning_rate": 4.990651581614534e-05, + "loss": 6.3032, + "step": 4631 + }, + { + "epoch": 0.027547816157579216, + "grad_norm": 3.5284838676452637, + "learning_rate": 4.990647545504716e-05, + "loss": 6.4104, + "step": 4632 + }, + { + "epoch": 0.027553763440860215, + "grad_norm": 2.326162338256836, + "learning_rate": 4.9906435085254384e-05, + "loss": 6.2593, + "step": 4633 + }, + { + "epoch": 0.027559710724141213, + "grad_norm": 1.946542739868164, + "learning_rate": 4.990639470676703e-05, + "loss": 6.1522, + "step": 4634 + }, + { + "epoch": 0.02756565800742221, + "grad_norm": 2.26143741607666, + "learning_rate": 4.990635431958511e-05, + "loss": 6.0189, + "step": 4635 + }, + { + "epoch": 0.027571605290703207, + "grad_norm": 2.8332626819610596, + "learning_rate": 4.990631392370865e-05, + "loss": 5.6226, + "step": 4636 + }, + { + "epoch": 0.027577552573984206, + "grad_norm": 3.919443130493164, + "learning_rate": 4.9906273519137636e-05, + "loss": 6.2147, + "step": 4637 + }, + { + "epoch": 0.0275834998572652, + "grad_norm": 2.4030275344848633, + "learning_rate": 4.9906233105872115e-05, + "loss": 5.6589, + "step": 4638 + }, + { + "epoch": 0.0275894471405462, + "grad_norm": 2.7806994915008545, + "learning_rate": 4.990619268391207e-05, + "loss": 5.4349, + "step": 4639 + }, + { + "epoch": 0.027595394423827194, + "grad_norm": 2.5759501457214355, + "learning_rate": 4.990615225325754e-05, + "loss": 6.1171, + "step": 4640 + }, + { + "epoch": 0.027601341707108193, + "grad_norm": 2.337517023086548, + "learning_rate": 4.990611181390853e-05, + "loss": 5.5514, + "step": 4641 + }, + { + "epoch": 0.02760728899038919, + "grad_norm": 2.6464250087738037, + "learning_rate": 4.990607136586505e-05, + "loss": 6.1852, + "step": 4642 + }, + { + "epoch": 0.027613236273670187, + "grad_norm": 2.030210256576538, + "learning_rate": 4.9906030909127125e-05, + "loss": 6.0919, + "step": 4643 + }, + { + "epoch": 0.027619183556951185, + "grad_norm": 2.4546520709991455, + "learning_rate": 4.990599044369475e-05, + "loss": 6.3018, + "step": 4644 + }, + { + "epoch": 0.027625130840232184, + "grad_norm": 2.508500337600708, + "learning_rate": 4.990594996956796e-05, + "loss": 5.7933, + "step": 4645 + }, + { + "epoch": 0.02763107812351318, + "grad_norm": 2.3363263607025146, + "learning_rate": 4.990590948674676e-05, + "loss": 6.4252, + "step": 4646 + }, + { + "epoch": 0.027637025406794177, + "grad_norm": 2.794673442840576, + "learning_rate": 4.990586899523116e-05, + "loss": 5.3554, + "step": 4647 + }, + { + "epoch": 0.027642972690075172, + "grad_norm": 2.5396835803985596, + "learning_rate": 4.990582849502118e-05, + "loss": 5.2352, + "step": 4648 + }, + { + "epoch": 0.02764891997335617, + "grad_norm": 2.6878976821899414, + "learning_rate": 4.990578798611684e-05, + "loss": 4.9262, + "step": 4649 + }, + { + "epoch": 0.02765486725663717, + "grad_norm": 2.2143187522888184, + "learning_rate": 4.9905747468518136e-05, + "loss": 6.0785, + "step": 4650 + }, + { + "epoch": 0.027660814539918165, + "grad_norm": 2.6812448501586914, + "learning_rate": 4.9905706942225094e-05, + "loss": 5.1692, + "step": 4651 + }, + { + "epoch": 0.027666761823199163, + "grad_norm": 2.5155227184295654, + "learning_rate": 4.9905666407237726e-05, + "loss": 5.0194, + "step": 4652 + }, + { + "epoch": 0.027672709106480158, + "grad_norm": 2.406834363937378, + "learning_rate": 4.9905625863556047e-05, + "loss": 5.1249, + "step": 4653 + }, + { + "epoch": 0.027678656389761157, + "grad_norm": 3.3666698932647705, + "learning_rate": 4.990558531118008e-05, + "loss": 5.9619, + "step": 4654 + }, + { + "epoch": 0.027684603673042155, + "grad_norm": 2.6557607650756836, + "learning_rate": 4.9905544750109826e-05, + "loss": 5.9118, + "step": 4655 + }, + { + "epoch": 0.02769055095632315, + "grad_norm": 2.60469651222229, + "learning_rate": 4.9905504180345304e-05, + "loss": 6.3746, + "step": 4656 + }, + { + "epoch": 0.02769649823960415, + "grad_norm": 2.5417349338531494, + "learning_rate": 4.9905463601886526e-05, + "loss": 5.6975, + "step": 4657 + }, + { + "epoch": 0.027702445522885148, + "grad_norm": 2.723829984664917, + "learning_rate": 4.990542301473351e-05, + "loss": 5.6189, + "step": 4658 + }, + { + "epoch": 0.027708392806166143, + "grad_norm": 3.0544204711914062, + "learning_rate": 4.990538241888627e-05, + "loss": 5.4999, + "step": 4659 + }, + { + "epoch": 0.02771434008944714, + "grad_norm": 3.0536513328552246, + "learning_rate": 4.990534181434481e-05, + "loss": 6.0636, + "step": 4660 + }, + { + "epoch": 0.027720287372728136, + "grad_norm": 3.0618786811828613, + "learning_rate": 4.990530120110916e-05, + "loss": 6.0856, + "step": 4661 + }, + { + "epoch": 0.027726234656009135, + "grad_norm": 2.6602306365966797, + "learning_rate": 4.9905260579179325e-05, + "loss": 5.8341, + "step": 4662 + }, + { + "epoch": 0.027732181939290133, + "grad_norm": 2.729137420654297, + "learning_rate": 4.990521994855532e-05, + "loss": 6.7052, + "step": 4663 + }, + { + "epoch": 0.02773812922257113, + "grad_norm": 3.0878489017486572, + "learning_rate": 4.990517930923716e-05, + "loss": 6.1308, + "step": 4664 + }, + { + "epoch": 0.027744076505852127, + "grad_norm": 2.524418354034424, + "learning_rate": 4.990513866122486e-05, + "loss": 6.2547, + "step": 4665 + }, + { + "epoch": 0.027750023789133126, + "grad_norm": 2.457075595855713, + "learning_rate": 4.990509800451844e-05, + "loss": 6.6615, + "step": 4666 + }, + { + "epoch": 0.02775597107241412, + "grad_norm": 2.474487543106079, + "learning_rate": 4.9905057339117894e-05, + "loss": 6.63, + "step": 4667 + }, + { + "epoch": 0.02776191835569512, + "grad_norm": 2.611098289489746, + "learning_rate": 4.9905016665023254e-05, + "loss": 5.8232, + "step": 4668 + }, + { + "epoch": 0.027767865638976114, + "grad_norm": 2.8012242317199707, + "learning_rate": 4.990497598223454e-05, + "loss": 5.8478, + "step": 4669 + }, + { + "epoch": 0.027773812922257113, + "grad_norm": 2.706725597381592, + "learning_rate": 4.990493529075174e-05, + "loss": 5.8585, + "step": 4670 + }, + { + "epoch": 0.02777976020553811, + "grad_norm": 2.490032196044922, + "learning_rate": 4.99048945905749e-05, + "loss": 6.2181, + "step": 4671 + }, + { + "epoch": 0.027785707488819106, + "grad_norm": 2.4735357761383057, + "learning_rate": 4.990485388170401e-05, + "loss": 6.2153, + "step": 4672 + }, + { + "epoch": 0.027791654772100105, + "grad_norm": 2.7573068141937256, + "learning_rate": 4.9904813164139094e-05, + "loss": 6.217, + "step": 4673 + }, + { + "epoch": 0.027797602055381104, + "grad_norm": 2.4663283824920654, + "learning_rate": 4.990477243788017e-05, + "loss": 6.4153, + "step": 4674 + }, + { + "epoch": 0.0278035493386621, + "grad_norm": 2.737656831741333, + "learning_rate": 4.9904731702927234e-05, + "loss": 6.5209, + "step": 4675 + }, + { + "epoch": 0.027809496621943097, + "grad_norm": 2.5112721920013428, + "learning_rate": 4.990469095928032e-05, + "loss": 5.979, + "step": 4676 + }, + { + "epoch": 0.027815443905224092, + "grad_norm": 2.6602795124053955, + "learning_rate": 4.990465020693944e-05, + "loss": 5.9206, + "step": 4677 + }, + { + "epoch": 0.02782139118850509, + "grad_norm": 2.460538625717163, + "learning_rate": 4.9904609445904606e-05, + "loss": 5.9855, + "step": 4678 + }, + { + "epoch": 0.02782733847178609, + "grad_norm": 2.750138998031616, + "learning_rate": 4.990456867617582e-05, + "loss": 5.8425, + "step": 4679 + }, + { + "epoch": 0.027833285755067085, + "grad_norm": 2.9843833446502686, + "learning_rate": 4.9904527897753114e-05, + "loss": 6.1385, + "step": 4680 + }, + { + "epoch": 0.027839233038348083, + "grad_norm": 2.586923360824585, + "learning_rate": 4.99044871106365e-05, + "loss": 5.6278, + "step": 4681 + }, + { + "epoch": 0.027845180321629078, + "grad_norm": 3.114211082458496, + "learning_rate": 4.990444631482597e-05, + "loss": 6.1259, + "step": 4682 + }, + { + "epoch": 0.027851127604910077, + "grad_norm": 2.3222453594207764, + "learning_rate": 4.990440551032157e-05, + "loss": 6.3048, + "step": 4683 + }, + { + "epoch": 0.027857074888191075, + "grad_norm": 2.15678334236145, + "learning_rate": 4.99043646971233e-05, + "loss": 5.9082, + "step": 4684 + }, + { + "epoch": 0.02786302217147207, + "grad_norm": 3.946350574493408, + "learning_rate": 4.990432387523116e-05, + "loss": 5.6907, + "step": 4685 + }, + { + "epoch": 0.02786896945475307, + "grad_norm": 2.9612419605255127, + "learning_rate": 4.9904283044645185e-05, + "loss": 5.3894, + "step": 4686 + }, + { + "epoch": 0.027874916738034067, + "grad_norm": 2.3602261543273926, + "learning_rate": 4.990424220536538e-05, + "loss": 6.0716, + "step": 4687 + }, + { + "epoch": 0.027880864021315063, + "grad_norm": 2.822300672531128, + "learning_rate": 4.990420135739177e-05, + "loss": 5.9788, + "step": 4688 + }, + { + "epoch": 0.02788681130459606, + "grad_norm": 2.766280174255371, + "learning_rate": 4.990416050072435e-05, + "loss": 5.9945, + "step": 4689 + }, + { + "epoch": 0.027892758587877056, + "grad_norm": 2.810359239578247, + "learning_rate": 4.990411963536315e-05, + "loss": 6.0598, + "step": 4690 + }, + { + "epoch": 0.027898705871158055, + "grad_norm": 2.510014295578003, + "learning_rate": 4.990407876130818e-05, + "loss": 6.1793, + "step": 4691 + }, + { + "epoch": 0.027904653154439053, + "grad_norm": 2.5394086837768555, + "learning_rate": 4.990403787855945e-05, + "loss": 6.1309, + "step": 4692 + }, + { + "epoch": 0.02791060043772005, + "grad_norm": 2.922084093093872, + "learning_rate": 4.990399698711698e-05, + "loss": 6.1956, + "step": 4693 + }, + { + "epoch": 0.027916547721001047, + "grad_norm": 3.6614181995391846, + "learning_rate": 4.9903956086980785e-05, + "loss": 6.535, + "step": 4694 + }, + { + "epoch": 0.027922495004282046, + "grad_norm": 3.3680684566497803, + "learning_rate": 4.990391517815087e-05, + "loss": 6.5729, + "step": 4695 + }, + { + "epoch": 0.02792844228756304, + "grad_norm": 2.522193431854248, + "learning_rate": 4.990387426062726e-05, + "loss": 5.9406, + "step": 4696 + }, + { + "epoch": 0.02793438957084404, + "grad_norm": 2.9665534496307373, + "learning_rate": 4.990383333440996e-05, + "loss": 6.0281, + "step": 4697 + }, + { + "epoch": 0.027940336854125034, + "grad_norm": 2.643218755722046, + "learning_rate": 4.9903792399498996e-05, + "loss": 5.8965, + "step": 4698 + }, + { + "epoch": 0.027946284137406033, + "grad_norm": 2.498765230178833, + "learning_rate": 4.990375145589436e-05, + "loss": 6.0975, + "step": 4699 + }, + { + "epoch": 0.02795223142068703, + "grad_norm": 4.380255699157715, + "learning_rate": 4.99037105035961e-05, + "loss": 6.6298, + "step": 4700 + }, + { + "epoch": 0.027958178703968026, + "grad_norm": 3.925454616546631, + "learning_rate": 4.990366954260421e-05, + "loss": 6.5742, + "step": 4701 + }, + { + "epoch": 0.027964125987249025, + "grad_norm": 2.5388591289520264, + "learning_rate": 4.99036285729187e-05, + "loss": 6.6102, + "step": 4702 + }, + { + "epoch": 0.027970073270530024, + "grad_norm": 2.6793510913848877, + "learning_rate": 4.9903587594539594e-05, + "loss": 6.4265, + "step": 4703 + }, + { + "epoch": 0.02797602055381102, + "grad_norm": 2.8652729988098145, + "learning_rate": 4.9903546607466903e-05, + "loss": 6.4567, + "step": 4704 + }, + { + "epoch": 0.027981967837092017, + "grad_norm": 2.936021089553833, + "learning_rate": 4.990350561170063e-05, + "loss": 6.404, + "step": 4705 + }, + { + "epoch": 0.027987915120373012, + "grad_norm": 3.256253719329834, + "learning_rate": 4.9903464607240816e-05, + "loss": 6.2291, + "step": 4706 + }, + { + "epoch": 0.02799386240365401, + "grad_norm": 2.8268187046051025, + "learning_rate": 4.990342359408745e-05, + "loss": 6.2582, + "step": 4707 + }, + { + "epoch": 0.02799980968693501, + "grad_norm": 2.5889041423797607, + "learning_rate": 4.9903382572240556e-05, + "loss": 6.3325, + "step": 4708 + }, + { + "epoch": 0.028005756970216004, + "grad_norm": 2.635388135910034, + "learning_rate": 4.9903341541700154e-05, + "loss": 6.1256, + "step": 4709 + }, + { + "epoch": 0.028011704253497003, + "grad_norm": 2.562976360321045, + "learning_rate": 4.990330050246625e-05, + "loss": 5.9333, + "step": 4710 + }, + { + "epoch": 0.028017651536777998, + "grad_norm": 3.488809585571289, + "learning_rate": 4.990325945453887e-05, + "loss": 6.3651, + "step": 4711 + }, + { + "epoch": 0.028023598820058997, + "grad_norm": 2.963324546813965, + "learning_rate": 4.9903218397918e-05, + "loss": 6.718, + "step": 4712 + }, + { + "epoch": 0.028029546103339995, + "grad_norm": 2.4070823192596436, + "learning_rate": 4.990317733260369e-05, + "loss": 6.2502, + "step": 4713 + }, + { + "epoch": 0.02803549338662099, + "grad_norm": 2.711190938949585, + "learning_rate": 4.9903136258595925e-05, + "loss": 6.0397, + "step": 4714 + }, + { + "epoch": 0.02804144066990199, + "grad_norm": 2.466150999069214, + "learning_rate": 4.9903095175894746e-05, + "loss": 5.9344, + "step": 4715 + }, + { + "epoch": 0.028047387953182987, + "grad_norm": 2.4558048248291016, + "learning_rate": 4.990305408450014e-05, + "loss": 6.1121, + "step": 4716 + }, + { + "epoch": 0.028053335236463982, + "grad_norm": 2.4023051261901855, + "learning_rate": 4.990301298441215e-05, + "loss": 6.0202, + "step": 4717 + }, + { + "epoch": 0.02805928251974498, + "grad_norm": 3.118098258972168, + "learning_rate": 4.9902971875630765e-05, + "loss": 6.5365, + "step": 4718 + }, + { + "epoch": 0.028065229803025976, + "grad_norm": 2.3716087341308594, + "learning_rate": 4.990293075815602e-05, + "loss": 6.1382, + "step": 4719 + }, + { + "epoch": 0.028071177086306975, + "grad_norm": 2.4663496017456055, + "learning_rate": 4.990288963198791e-05, + "loss": 5.9804, + "step": 4720 + }, + { + "epoch": 0.028077124369587973, + "grad_norm": 2.2623326778411865, + "learning_rate": 4.9902848497126466e-05, + "loss": 5.9666, + "step": 4721 + }, + { + "epoch": 0.02808307165286897, + "grad_norm": 2.4884161949157715, + "learning_rate": 4.990280735357168e-05, + "loss": 6.0203, + "step": 4722 + }, + { + "epoch": 0.028089018936149967, + "grad_norm": 2.6154520511627197, + "learning_rate": 4.990276620132359e-05, + "loss": 5.9191, + "step": 4723 + }, + { + "epoch": 0.028094966219430965, + "grad_norm": 2.692396879196167, + "learning_rate": 4.990272504038221e-05, + "loss": 6.5314, + "step": 4724 + }, + { + "epoch": 0.02810091350271196, + "grad_norm": 2.483306407928467, + "learning_rate": 4.990268387074754e-05, + "loss": 6.6522, + "step": 4725 + }, + { + "epoch": 0.02810686078599296, + "grad_norm": 3.2098593711853027, + "learning_rate": 4.99026426924196e-05, + "loss": 5.8712, + "step": 4726 + }, + { + "epoch": 0.028112808069273954, + "grad_norm": 2.7335867881774902, + "learning_rate": 4.99026015053984e-05, + "loss": 5.7678, + "step": 4727 + }, + { + "epoch": 0.028118755352554953, + "grad_norm": 2.7587473392486572, + "learning_rate": 4.990256030968396e-05, + "loss": 6.4233, + "step": 4728 + }, + { + "epoch": 0.02812470263583595, + "grad_norm": 2.7686030864715576, + "learning_rate": 4.99025191052763e-05, + "loss": 6.4572, + "step": 4729 + }, + { + "epoch": 0.028130649919116946, + "grad_norm": 2.755916118621826, + "learning_rate": 4.990247789217543e-05, + "loss": 5.9858, + "step": 4730 + }, + { + "epoch": 0.028136597202397945, + "grad_norm": 2.614316463470459, + "learning_rate": 4.990243667038135e-05, + "loss": 6.2315, + "step": 4731 + }, + { + "epoch": 0.028142544485678943, + "grad_norm": 2.0796027183532715, + "learning_rate": 4.990239543989409e-05, + "loss": 6.236, + "step": 4732 + }, + { + "epoch": 0.02814849176895994, + "grad_norm": 2.623412847518921, + "learning_rate": 4.9902354200713665e-05, + "loss": 6.3962, + "step": 4733 + }, + { + "epoch": 0.028154439052240937, + "grad_norm": 2.2746191024780273, + "learning_rate": 4.9902312952840086e-05, + "loss": 5.9101, + "step": 4734 + }, + { + "epoch": 0.028160386335521932, + "grad_norm": 2.102444887161255, + "learning_rate": 4.990227169627336e-05, + "loss": 6.4652, + "step": 4735 + }, + { + "epoch": 0.02816633361880293, + "grad_norm": 2.7720580101013184, + "learning_rate": 4.990223043101352e-05, + "loss": 5.8981, + "step": 4736 + }, + { + "epoch": 0.02817228090208393, + "grad_norm": 2.4479453563690186, + "learning_rate": 4.9902189157060564e-05, + "loss": 6.3554, + "step": 4737 + }, + { + "epoch": 0.028178228185364924, + "grad_norm": 2.7894740104675293, + "learning_rate": 4.990214787441451e-05, + "loss": 6.0017, + "step": 4738 + }, + { + "epoch": 0.028184175468645923, + "grad_norm": 2.869884490966797, + "learning_rate": 4.990210658307537e-05, + "loss": 5.9419, + "step": 4739 + }, + { + "epoch": 0.028190122751926918, + "grad_norm": 2.262723207473755, + "learning_rate": 4.990206528304316e-05, + "loss": 6.172, + "step": 4740 + }, + { + "epoch": 0.028196070035207917, + "grad_norm": 2.179358720779419, + "learning_rate": 4.99020239743179e-05, + "loss": 6.5204, + "step": 4741 + }, + { + "epoch": 0.028202017318488915, + "grad_norm": 2.085179328918457, + "learning_rate": 4.9901982656899606e-05, + "loss": 6.3972, + "step": 4742 + }, + { + "epoch": 0.02820796460176991, + "grad_norm": 1.657567024230957, + "learning_rate": 4.990194133078828e-05, + "loss": 6.4199, + "step": 4743 + }, + { + "epoch": 0.02821391188505091, + "grad_norm": 1.8054349422454834, + "learning_rate": 4.990189999598395e-05, + "loss": 6.3768, + "step": 4744 + }, + { + "epoch": 0.028219859168331907, + "grad_norm": 2.0365710258483887, + "learning_rate": 4.990185865248662e-05, + "loss": 6.3228, + "step": 4745 + }, + { + "epoch": 0.028225806451612902, + "grad_norm": 2.069211006164551, + "learning_rate": 4.9901817300296304e-05, + "loss": 5.9874, + "step": 4746 + }, + { + "epoch": 0.0282317537348939, + "grad_norm": 2.3339149951934814, + "learning_rate": 4.9901775939413026e-05, + "loss": 6.1526, + "step": 4747 + }, + { + "epoch": 0.028237701018174896, + "grad_norm": 2.0425326824188232, + "learning_rate": 4.99017345698368e-05, + "loss": 6.2157, + "step": 4748 + }, + { + "epoch": 0.028243648301455895, + "grad_norm": 2.1598799228668213, + "learning_rate": 4.9901693191567625e-05, + "loss": 6.2653, + "step": 4749 + }, + { + "epoch": 0.028249595584736893, + "grad_norm": 2.066566228866577, + "learning_rate": 4.990165180460553e-05, + "loss": 6.3788, + "step": 4750 + }, + { + "epoch": 0.02825554286801789, + "grad_norm": 2.2870383262634277, + "learning_rate": 4.9901610408950527e-05, + "loss": 6.2608, + "step": 4751 + }, + { + "epoch": 0.028261490151298887, + "grad_norm": 2.3180785179138184, + "learning_rate": 4.990156900460263e-05, + "loss": 6.3545, + "step": 4752 + }, + { + "epoch": 0.028267437434579885, + "grad_norm": 2.55261492729187, + "learning_rate": 4.990152759156185e-05, + "loss": 6.3888, + "step": 4753 + }, + { + "epoch": 0.02827338471786088, + "grad_norm": 2.087925910949707, + "learning_rate": 4.990148616982821e-05, + "loss": 6.3585, + "step": 4754 + }, + { + "epoch": 0.02827933200114188, + "grad_norm": 2.2446579933166504, + "learning_rate": 4.9901444739401714e-05, + "loss": 6.4655, + "step": 4755 + }, + { + "epoch": 0.028285279284422874, + "grad_norm": 2.2980077266693115, + "learning_rate": 4.990140330028238e-05, + "loss": 6.3776, + "step": 4756 + }, + { + "epoch": 0.028291226567703873, + "grad_norm": 2.0658226013183594, + "learning_rate": 4.9901361852470224e-05, + "loss": 6.0412, + "step": 4757 + }, + { + "epoch": 0.02829717385098487, + "grad_norm": 2.8402137756347656, + "learning_rate": 4.990132039596526e-05, + "loss": 6.0017, + "step": 4758 + }, + { + "epoch": 0.028303121134265866, + "grad_norm": 2.4620237350463867, + "learning_rate": 4.99012789307675e-05, + "loss": 5.9235, + "step": 4759 + }, + { + "epoch": 0.028309068417546865, + "grad_norm": 2.3318607807159424, + "learning_rate": 4.990123745687697e-05, + "loss": 6.2464, + "step": 4760 + }, + { + "epoch": 0.028315015700827863, + "grad_norm": 2.4998981952667236, + "learning_rate": 4.9901195974293666e-05, + "loss": 6.2731, + "step": 4761 + }, + { + "epoch": 0.02832096298410886, + "grad_norm": 2.4374287128448486, + "learning_rate": 4.9901154483017614e-05, + "loss": 6.362, + "step": 4762 + }, + { + "epoch": 0.028326910267389857, + "grad_norm": 2.6257424354553223, + "learning_rate": 4.990111298304882e-05, + "loss": 6.1456, + "step": 4763 + }, + { + "epoch": 0.028332857550670852, + "grad_norm": 2.74934458732605, + "learning_rate": 4.990107147438732e-05, + "loss": 6.0121, + "step": 4764 + }, + { + "epoch": 0.02833880483395185, + "grad_norm": 2.33137583732605, + "learning_rate": 4.9901029957033106e-05, + "loss": 6.0207, + "step": 4765 + }, + { + "epoch": 0.02834475211723285, + "grad_norm": 1.9006321430206299, + "learning_rate": 4.9900988430986196e-05, + "loss": 5.8946, + "step": 4766 + }, + { + "epoch": 0.028350699400513844, + "grad_norm": 1.9786534309387207, + "learning_rate": 4.990094689624661e-05, + "loss": 5.7782, + "step": 4767 + }, + { + "epoch": 0.028356646683794843, + "grad_norm": 2.1215951442718506, + "learning_rate": 4.9900905352814365e-05, + "loss": 5.8129, + "step": 4768 + }, + { + "epoch": 0.02836259396707584, + "grad_norm": 2.9569597244262695, + "learning_rate": 4.9900863800689465e-05, + "loss": 5.7882, + "step": 4769 + }, + { + "epoch": 0.028368541250356837, + "grad_norm": 2.720447540283203, + "learning_rate": 4.990082223987193e-05, + "loss": 5.9075, + "step": 4770 + }, + { + "epoch": 0.028374488533637835, + "grad_norm": 2.8727002143859863, + "learning_rate": 4.990078067036178e-05, + "loss": 6.1571, + "step": 4771 + }, + { + "epoch": 0.02838043581691883, + "grad_norm": 2.2992594242095947, + "learning_rate": 4.990073909215902e-05, + "loss": 6.0195, + "step": 4772 + }, + { + "epoch": 0.02838638310019983, + "grad_norm": 2.0323293209075928, + "learning_rate": 4.990069750526368e-05, + "loss": 5.8049, + "step": 4773 + }, + { + "epoch": 0.028392330383480827, + "grad_norm": 2.938795328140259, + "learning_rate": 4.9900655909675755e-05, + "loss": 6.9215, + "step": 4774 + }, + { + "epoch": 0.028398277666761822, + "grad_norm": 2.6333048343658447, + "learning_rate": 4.990061430539527e-05, + "loss": 5.868, + "step": 4775 + }, + { + "epoch": 0.02840422495004282, + "grad_norm": 2.8569674491882324, + "learning_rate": 4.990057269242223e-05, + "loss": 5.8782, + "step": 4776 + }, + { + "epoch": 0.028410172233323816, + "grad_norm": 2.62206768989563, + "learning_rate": 4.9900531070756666e-05, + "loss": 5.7751, + "step": 4777 + }, + { + "epoch": 0.028416119516604815, + "grad_norm": 2.2112414836883545, + "learning_rate": 4.990048944039858e-05, + "loss": 5.7985, + "step": 4778 + }, + { + "epoch": 0.028422066799885813, + "grad_norm": 2.1571342945098877, + "learning_rate": 4.990044780134799e-05, + "loss": 5.9089, + "step": 4779 + }, + { + "epoch": 0.028428014083166808, + "grad_norm": 2.4310410022735596, + "learning_rate": 4.9900406153604916e-05, + "loss": 5.6728, + "step": 4780 + }, + { + "epoch": 0.028433961366447807, + "grad_norm": 2.25822377204895, + "learning_rate": 4.990036449716937e-05, + "loss": 5.5808, + "step": 4781 + }, + { + "epoch": 0.028439908649728805, + "grad_norm": 2.3068299293518066, + "learning_rate": 4.990032283204136e-05, + "loss": 5.729, + "step": 4782 + }, + { + "epoch": 0.0284458559330098, + "grad_norm": 2.0582191944122314, + "learning_rate": 4.9900281158220905e-05, + "loss": 5.6877, + "step": 4783 + }, + { + "epoch": 0.0284518032162908, + "grad_norm": 2.572824239730835, + "learning_rate": 4.9900239475708015e-05, + "loss": 5.9522, + "step": 4784 + }, + { + "epoch": 0.028457750499571794, + "grad_norm": 2.299001693725586, + "learning_rate": 4.990019778450271e-05, + "loss": 5.7579, + "step": 4785 + }, + { + "epoch": 0.028463697782852793, + "grad_norm": 2.231381893157959, + "learning_rate": 4.990015608460501e-05, + "loss": 5.756, + "step": 4786 + }, + { + "epoch": 0.02846964506613379, + "grad_norm": 1.7982486486434937, + "learning_rate": 4.990011437601492e-05, + "loss": 5.8076, + "step": 4787 + }, + { + "epoch": 0.028475592349414786, + "grad_norm": 1.8788951635360718, + "learning_rate": 4.990007265873245e-05, + "loss": 5.8798, + "step": 4788 + }, + { + "epoch": 0.028481539632695785, + "grad_norm": 1.6190022230148315, + "learning_rate": 4.9900030932757623e-05, + "loss": 5.5695, + "step": 4789 + }, + { + "epoch": 0.028487486915976783, + "grad_norm": 1.9226019382476807, + "learning_rate": 4.9899989198090455e-05, + "loss": 5.671, + "step": 4790 + }, + { + "epoch": 0.02849343419925778, + "grad_norm": 1.7437139749526978, + "learning_rate": 4.989994745473097e-05, + "loss": 5.6728, + "step": 4791 + }, + { + "epoch": 0.028499381482538777, + "grad_norm": 1.624126672744751, + "learning_rate": 4.989990570267915e-05, + "loss": 5.6209, + "step": 4792 + }, + { + "epoch": 0.028505328765819772, + "grad_norm": 2.1894004344940186, + "learning_rate": 4.9899863941935046e-05, + "loss": 5.6669, + "step": 4793 + }, + { + "epoch": 0.02851127604910077, + "grad_norm": 2.2243428230285645, + "learning_rate": 4.9899822172498646e-05, + "loss": 5.4557, + "step": 4794 + }, + { + "epoch": 0.02851722333238177, + "grad_norm": 2.032611608505249, + "learning_rate": 4.989978039436998e-05, + "loss": 5.7883, + "step": 4795 + }, + { + "epoch": 0.028523170615662764, + "grad_norm": 1.8496538400650024, + "learning_rate": 4.989973860754906e-05, + "loss": 5.6329, + "step": 4796 + }, + { + "epoch": 0.028529117898943763, + "grad_norm": 1.7072707414627075, + "learning_rate": 4.989969681203589e-05, + "loss": 5.7242, + "step": 4797 + }, + { + "epoch": 0.02853506518222476, + "grad_norm": 1.7351912260055542, + "learning_rate": 4.9899655007830504e-05, + "loss": 5.648, + "step": 4798 + }, + { + "epoch": 0.028541012465505756, + "grad_norm": 2.514162302017212, + "learning_rate": 4.9899613194932904e-05, + "loss": 5.556, + "step": 4799 + }, + { + "epoch": 0.028546959748786755, + "grad_norm": 10.245063781738281, + "learning_rate": 4.98995713733431e-05, + "loss": 5.5922, + "step": 4800 + }, + { + "epoch": 0.02855290703206775, + "grad_norm": 2.012106418609619, + "learning_rate": 4.989952954306112e-05, + "loss": 5.5092, + "step": 4801 + }, + { + "epoch": 0.02855885431534875, + "grad_norm": 1.8654139041900635, + "learning_rate": 4.9899487704086966e-05, + "loss": 5.4164, + "step": 4802 + }, + { + "epoch": 0.028564801598629747, + "grad_norm": 1.778798222541809, + "learning_rate": 4.9899445856420656e-05, + "loss": 5.5537, + "step": 4803 + }, + { + "epoch": 0.028570748881910742, + "grad_norm": 2.205038547515869, + "learning_rate": 4.989940400006221e-05, + "loss": 5.9338, + "step": 4804 + }, + { + "epoch": 0.02857669616519174, + "grad_norm": 2.3908839225769043, + "learning_rate": 4.989936213501164e-05, + "loss": 5.8962, + "step": 4805 + }, + { + "epoch": 0.028582643448472736, + "grad_norm": 2.3438172340393066, + "learning_rate": 4.9899320261268966e-05, + "loss": 5.8133, + "step": 4806 + }, + { + "epoch": 0.028588590731753735, + "grad_norm": 2.4021737575531006, + "learning_rate": 4.989927837883419e-05, + "loss": 5.8366, + "step": 4807 + }, + { + "epoch": 0.028594538015034733, + "grad_norm": 1.9976004362106323, + "learning_rate": 4.989923648770734e-05, + "loss": 5.6976, + "step": 4808 + }, + { + "epoch": 0.028600485298315728, + "grad_norm": 2.2234697341918945, + "learning_rate": 4.989919458788841e-05, + "loss": 5.7871, + "step": 4809 + }, + { + "epoch": 0.028606432581596727, + "grad_norm": 2.203223705291748, + "learning_rate": 4.989915267937744e-05, + "loss": 5.5799, + "step": 4810 + }, + { + "epoch": 0.028612379864877725, + "grad_norm": 2.2155261039733887, + "learning_rate": 4.989911076217442e-05, + "loss": 5.6022, + "step": 4811 + }, + { + "epoch": 0.02861832714815872, + "grad_norm": 1.9379621744155884, + "learning_rate": 4.989906883627939e-05, + "loss": 5.8647, + "step": 4812 + }, + { + "epoch": 0.02862427443143972, + "grad_norm": 2.0589749813079834, + "learning_rate": 4.9899026901692345e-05, + "loss": 5.6048, + "step": 4813 + }, + { + "epoch": 0.028630221714720714, + "grad_norm": 2.3813774585723877, + "learning_rate": 4.9898984958413315e-05, + "loss": 5.6726, + "step": 4814 + }, + { + "epoch": 0.028636168998001713, + "grad_norm": 2.06425142288208, + "learning_rate": 4.98989430064423e-05, + "loss": 5.8505, + "step": 4815 + }, + { + "epoch": 0.02864211628128271, + "grad_norm": 2.199697494506836, + "learning_rate": 4.9898901045779326e-05, + "loss": 5.6114, + "step": 4816 + }, + { + "epoch": 0.028648063564563706, + "grad_norm": 2.136411428451538, + "learning_rate": 4.98988590764244e-05, + "loss": 5.3987, + "step": 4817 + }, + { + "epoch": 0.028654010847844705, + "grad_norm": 1.914929986000061, + "learning_rate": 4.9898817098377534e-05, + "loss": 5.702, + "step": 4818 + }, + { + "epoch": 0.028659958131125703, + "grad_norm": 2.316027879714966, + "learning_rate": 4.989877511163876e-05, + "loss": 5.5886, + "step": 4819 + }, + { + "epoch": 0.0286659054144067, + "grad_norm": 3.2775018215179443, + "learning_rate": 4.9898733116208076e-05, + "loss": 5.5337, + "step": 4820 + }, + { + "epoch": 0.028671852697687697, + "grad_norm": 2.16430926322937, + "learning_rate": 4.989869111208549e-05, + "loss": 5.7189, + "step": 4821 + }, + { + "epoch": 0.028677799980968692, + "grad_norm": 2.1936638355255127, + "learning_rate": 4.9898649099271046e-05, + "loss": 5.2942, + "step": 4822 + }, + { + "epoch": 0.02868374726424969, + "grad_norm": 2.262485980987549, + "learning_rate": 4.9898607077764736e-05, + "loss": 5.4284, + "step": 4823 + }, + { + "epoch": 0.02868969454753069, + "grad_norm": 1.7890170812606812, + "learning_rate": 4.989856504756657e-05, + "loss": 5.6021, + "step": 4824 + }, + { + "epoch": 0.028695641830811684, + "grad_norm": 1.747862696647644, + "learning_rate": 4.9898523008676585e-05, + "loss": 5.72, + "step": 4825 + }, + { + "epoch": 0.028701589114092683, + "grad_norm": 1.9750064611434937, + "learning_rate": 4.989848096109477e-05, + "loss": 5.8923, + "step": 4826 + }, + { + "epoch": 0.02870753639737368, + "grad_norm": 2.0249626636505127, + "learning_rate": 4.989843890482117e-05, + "loss": 5.4866, + "step": 4827 + }, + { + "epoch": 0.028713483680654676, + "grad_norm": 2.2737395763397217, + "learning_rate": 4.9898396839855765e-05, + "loss": 5.5498, + "step": 4828 + }, + { + "epoch": 0.028719430963935675, + "grad_norm": 2.2852187156677246, + "learning_rate": 4.98983547661986e-05, + "loss": 5.672, + "step": 4829 + }, + { + "epoch": 0.02872537824721667, + "grad_norm": 1.9441994428634644, + "learning_rate": 4.989831268384967e-05, + "loss": 5.4933, + "step": 4830 + }, + { + "epoch": 0.02873132553049767, + "grad_norm": 1.9561070203781128, + "learning_rate": 4.989827059280899e-05, + "loss": 5.7465, + "step": 4831 + }, + { + "epoch": 0.028737272813778667, + "grad_norm": 2.482849597930908, + "learning_rate": 4.9898228493076594e-05, + "loss": 5.4338, + "step": 4832 + }, + { + "epoch": 0.028743220097059662, + "grad_norm": 1.8582524061203003, + "learning_rate": 4.989818638465247e-05, + "loss": 5.5378, + "step": 4833 + }, + { + "epoch": 0.02874916738034066, + "grad_norm": 2.119783639907837, + "learning_rate": 4.9898144267536654e-05, + "loss": 5.6012, + "step": 4834 + }, + { + "epoch": 0.028755114663621656, + "grad_norm": 2.333965301513672, + "learning_rate": 4.989810214172915e-05, + "loss": 5.7376, + "step": 4835 + }, + { + "epoch": 0.028761061946902654, + "grad_norm": 2.600861072540283, + "learning_rate": 4.989806000722999e-05, + "loss": 6.2747, + "step": 4836 + }, + { + "epoch": 0.028767009230183653, + "grad_norm": 2.3250534534454346, + "learning_rate": 4.989801786403916e-05, + "loss": 5.5993, + "step": 4837 + }, + { + "epoch": 0.028772956513464648, + "grad_norm": 2.507377862930298, + "learning_rate": 4.9897975712156686e-05, + "loss": 5.3919, + "step": 4838 + }, + { + "epoch": 0.028778903796745647, + "grad_norm": 1.9882018566131592, + "learning_rate": 4.9897933551582596e-05, + "loss": 5.5939, + "step": 4839 + }, + { + "epoch": 0.028784851080026645, + "grad_norm": 2.235269784927368, + "learning_rate": 4.989789138231688e-05, + "loss": 5.4036, + "step": 4840 + }, + { + "epoch": 0.02879079836330764, + "grad_norm": 1.895071029663086, + "learning_rate": 4.989784920435959e-05, + "loss": 5.7259, + "step": 4841 + }, + { + "epoch": 0.02879674564658864, + "grad_norm": 2.0197908878326416, + "learning_rate": 4.989780701771071e-05, + "loss": 5.5114, + "step": 4842 + }, + { + "epoch": 0.028802692929869634, + "grad_norm": 1.9679557085037231, + "learning_rate": 4.989776482237025e-05, + "loss": 5.5798, + "step": 4843 + }, + { + "epoch": 0.028808640213150633, + "grad_norm": 1.980610728263855, + "learning_rate": 4.989772261833825e-05, + "loss": 5.5509, + "step": 4844 + }, + { + "epoch": 0.02881458749643163, + "grad_norm": 2.4565272331237793, + "learning_rate": 4.989768040561471e-05, + "loss": 5.4723, + "step": 4845 + }, + { + "epoch": 0.028820534779712626, + "grad_norm": 2.0567848682403564, + "learning_rate": 4.989763818419964e-05, + "loss": 5.546, + "step": 4846 + }, + { + "epoch": 0.028826482062993625, + "grad_norm": 2.0259108543395996, + "learning_rate": 4.989759595409307e-05, + "loss": 5.4138, + "step": 4847 + }, + { + "epoch": 0.028832429346274623, + "grad_norm": 1.9334442615509033, + "learning_rate": 4.9897553715295003e-05, + "loss": 5.7036, + "step": 4848 + }, + { + "epoch": 0.02883837662955562, + "grad_norm": 1.8335916996002197, + "learning_rate": 4.989751146780546e-05, + "loss": 5.6399, + "step": 4849 + }, + { + "epoch": 0.028844323912836617, + "grad_norm": 2.129821538925171, + "learning_rate": 4.989746921162445e-05, + "loss": 5.7108, + "step": 4850 + }, + { + "epoch": 0.028850271196117612, + "grad_norm": 2.4127001762390137, + "learning_rate": 4.9897426946751994e-05, + "loss": 5.3901, + "step": 4851 + }, + { + "epoch": 0.02885621847939861, + "grad_norm": 1.9506126642227173, + "learning_rate": 4.98973846731881e-05, + "loss": 5.7781, + "step": 4852 + }, + { + "epoch": 0.02886216576267961, + "grad_norm": 1.6746875047683716, + "learning_rate": 4.9897342390932786e-05, + "loss": 5.7408, + "step": 4853 + }, + { + "epoch": 0.028868113045960604, + "grad_norm": 1.95681893825531, + "learning_rate": 4.989730009998607e-05, + "loss": 5.7181, + "step": 4854 + }, + { + "epoch": 0.028874060329241603, + "grad_norm": 1.782030701637268, + "learning_rate": 4.9897257800347964e-05, + "loss": 5.5901, + "step": 4855 + }, + { + "epoch": 0.0288800076125226, + "grad_norm": 1.7590057849884033, + "learning_rate": 4.9897215492018476e-05, + "loss": 5.4566, + "step": 4856 + }, + { + "epoch": 0.028885954895803596, + "grad_norm": 2.4675025939941406, + "learning_rate": 4.989717317499764e-05, + "loss": 5.7738, + "step": 4857 + }, + { + "epoch": 0.028891902179084595, + "grad_norm": 2.221975326538086, + "learning_rate": 4.989713084928545e-05, + "loss": 5.591, + "step": 4858 + }, + { + "epoch": 0.02889784946236559, + "grad_norm": 2.21158504486084, + "learning_rate": 4.989708851488192e-05, + "loss": 5.7755, + "step": 4859 + }, + { + "epoch": 0.02890379674564659, + "grad_norm": 2.2253987789154053, + "learning_rate": 4.989704617178709e-05, + "loss": 5.8653, + "step": 4860 + }, + { + "epoch": 0.028909744028927587, + "grad_norm": 2.3298027515411377, + "learning_rate": 4.989700382000094e-05, + "loss": 5.3371, + "step": 4861 + }, + { + "epoch": 0.028915691312208582, + "grad_norm": 2.1918935775756836, + "learning_rate": 4.989696145952352e-05, + "loss": 5.4893, + "step": 4862 + }, + { + "epoch": 0.02892163859548958, + "grad_norm": 2.422117233276367, + "learning_rate": 4.989691909035482e-05, + "loss": 5.8775, + "step": 4863 + }, + { + "epoch": 0.02892758587877058, + "grad_norm": 2.4346981048583984, + "learning_rate": 4.989687671249487e-05, + "loss": 6.3671, + "step": 4864 + }, + { + "epoch": 0.028933533162051574, + "grad_norm": 2.094780921936035, + "learning_rate": 4.989683432594367e-05, + "loss": 5.7814, + "step": 4865 + }, + { + "epoch": 0.028939480445332573, + "grad_norm": 2.240318775177002, + "learning_rate": 4.9896791930701244e-05, + "loss": 5.6606, + "step": 4866 + }, + { + "epoch": 0.028945427728613568, + "grad_norm": 2.102381706237793, + "learning_rate": 4.989674952676761e-05, + "loss": 5.8477, + "step": 4867 + }, + { + "epoch": 0.028951375011894567, + "grad_norm": 2.2786238193511963, + "learning_rate": 4.989670711414277e-05, + "loss": 5.8786, + "step": 4868 + }, + { + "epoch": 0.028957322295175565, + "grad_norm": 2.079899549484253, + "learning_rate": 4.989666469282675e-05, + "loss": 6.2171, + "step": 4869 + }, + { + "epoch": 0.02896326957845656, + "grad_norm": 2.024061679840088, + "learning_rate": 4.989662226281956e-05, + "loss": 6.2889, + "step": 4870 + }, + { + "epoch": 0.02896921686173756, + "grad_norm": 2.1397578716278076, + "learning_rate": 4.989657982412122e-05, + "loss": 6.2477, + "step": 4871 + }, + { + "epoch": 0.028975164145018554, + "grad_norm": 2.1303393840789795, + "learning_rate": 4.989653737673174e-05, + "loss": 6.3005, + "step": 4872 + }, + { + "epoch": 0.028981111428299552, + "grad_norm": 2.4091451168060303, + "learning_rate": 4.989649492065114e-05, + "loss": 5.997, + "step": 4873 + }, + { + "epoch": 0.02898705871158055, + "grad_norm": 2.2236886024475098, + "learning_rate": 4.989645245587942e-05, + "loss": 5.7886, + "step": 4874 + }, + { + "epoch": 0.028993005994861546, + "grad_norm": 2.6160736083984375, + "learning_rate": 4.989640998241661e-05, + "loss": 6.1542, + "step": 4875 + }, + { + "epoch": 0.028998953278142545, + "grad_norm": 2.4163296222686768, + "learning_rate": 4.989636750026273e-05, + "loss": 6.392, + "step": 4876 + }, + { + "epoch": 0.029004900561423543, + "grad_norm": 2.079172372817993, + "learning_rate": 4.989632500941778e-05, + "loss": 6.2886, + "step": 4877 + }, + { + "epoch": 0.02901084784470454, + "grad_norm": 2.628694772720337, + "learning_rate": 4.989628250988178e-05, + "loss": 6.0359, + "step": 4878 + }, + { + "epoch": 0.029016795127985537, + "grad_norm": 2.2080392837524414, + "learning_rate": 4.989624000165474e-05, + "loss": 5.9916, + "step": 4879 + }, + { + "epoch": 0.029022742411266532, + "grad_norm": 2.4130380153656006, + "learning_rate": 4.9896197484736685e-05, + "loss": 6.3835, + "step": 4880 + }, + { + "epoch": 0.02902868969454753, + "grad_norm": 2.328511953353882, + "learning_rate": 4.989615495912762e-05, + "loss": 5.838, + "step": 4881 + }, + { + "epoch": 0.02903463697782853, + "grad_norm": 2.273345470428467, + "learning_rate": 4.989611242482757e-05, + "loss": 5.8764, + "step": 4882 + }, + { + "epoch": 0.029040584261109524, + "grad_norm": 2.1498537063598633, + "learning_rate": 4.9896069881836535e-05, + "loss": 6.1562, + "step": 4883 + }, + { + "epoch": 0.029046531544390523, + "grad_norm": 2.497267723083496, + "learning_rate": 4.989602733015455e-05, + "loss": 5.6708, + "step": 4884 + }, + { + "epoch": 0.02905247882767152, + "grad_norm": 2.232802152633667, + "learning_rate": 4.989598476978161e-05, + "loss": 5.6854, + "step": 4885 + }, + { + "epoch": 0.029058426110952516, + "grad_norm": 2.0582375526428223, + "learning_rate": 4.989594220071775e-05, + "loss": 6.5288, + "step": 4886 + }, + { + "epoch": 0.029064373394233515, + "grad_norm": 3.2556731700897217, + "learning_rate": 4.989589962296296e-05, + "loss": 5.9985, + "step": 4887 + }, + { + "epoch": 0.02907032067751451, + "grad_norm": 2.2807655334472656, + "learning_rate": 4.989585703651728e-05, + "loss": 6.1802, + "step": 4888 + }, + { + "epoch": 0.02907626796079551, + "grad_norm": 2.379136085510254, + "learning_rate": 4.989581444138071e-05, + "loss": 6.3531, + "step": 4889 + }, + { + "epoch": 0.029082215244076507, + "grad_norm": 2.9518685340881348, + "learning_rate": 4.989577183755327e-05, + "loss": 6.0689, + "step": 4890 + }, + { + "epoch": 0.029088162527357502, + "grad_norm": 2.823340654373169, + "learning_rate": 4.9895729225034973e-05, + "loss": 6.3405, + "step": 4891 + }, + { + "epoch": 0.0290941098106385, + "grad_norm": 2.4327731132507324, + "learning_rate": 4.989568660382583e-05, + "loss": 6.4928, + "step": 4892 + }, + { + "epoch": 0.0291000570939195, + "grad_norm": 2.0744240283966064, + "learning_rate": 4.9895643973925864e-05, + "loss": 6.2664, + "step": 4893 + }, + { + "epoch": 0.029106004377200494, + "grad_norm": 2.373710870742798, + "learning_rate": 4.9895601335335085e-05, + "loss": 5.9738, + "step": 4894 + }, + { + "epoch": 0.029111951660481493, + "grad_norm": 2.2934412956237793, + "learning_rate": 4.9895558688053505e-05, + "loss": 6.1353, + "step": 4895 + }, + { + "epoch": 0.029117898943762488, + "grad_norm": 2.4360926151275635, + "learning_rate": 4.989551603208114e-05, + "loss": 5.4768, + "step": 4896 + }, + { + "epoch": 0.029123846227043487, + "grad_norm": 2.8072469234466553, + "learning_rate": 4.989547336741802e-05, + "loss": 5.977, + "step": 4897 + }, + { + "epoch": 0.029129793510324485, + "grad_norm": 2.7759921550750732, + "learning_rate": 4.9895430694064135e-05, + "loss": 6.3918, + "step": 4898 + }, + { + "epoch": 0.02913574079360548, + "grad_norm": 2.4547574520111084, + "learning_rate": 4.989538801201953e-05, + "loss": 6.0461, + "step": 4899 + }, + { + "epoch": 0.02914168807688648, + "grad_norm": 2.6097168922424316, + "learning_rate": 4.9895345321284184e-05, + "loss": 5.88, + "step": 4900 + }, + { + "epoch": 0.029147635360167474, + "grad_norm": 2.8312575817108154, + "learning_rate": 4.989530262185814e-05, + "loss": 6.0314, + "step": 4901 + }, + { + "epoch": 0.029153582643448472, + "grad_norm": 2.928974151611328, + "learning_rate": 4.98952599137414e-05, + "loss": 6.3698, + "step": 4902 + }, + { + "epoch": 0.02915952992672947, + "grad_norm": 2.527578115463257, + "learning_rate": 4.989521719693398e-05, + "loss": 6.4301, + "step": 4903 + }, + { + "epoch": 0.029165477210010466, + "grad_norm": 2.392106771469116, + "learning_rate": 4.9895174471435904e-05, + "loss": 6.3515, + "step": 4904 + }, + { + "epoch": 0.029171424493291465, + "grad_norm": 1.9899437427520752, + "learning_rate": 4.989513173724717e-05, + "loss": 6.3265, + "step": 4905 + }, + { + "epoch": 0.029177371776572463, + "grad_norm": 2.057600736618042, + "learning_rate": 4.9895088994367806e-05, + "loss": 6.2402, + "step": 4906 + }, + { + "epoch": 0.029183319059853458, + "grad_norm": 2.8310391902923584, + "learning_rate": 4.989504624279783e-05, + "loss": 5.9056, + "step": 4907 + }, + { + "epoch": 0.029189266343134457, + "grad_norm": 2.904785394668579, + "learning_rate": 4.989500348253724e-05, + "loss": 5.8847, + "step": 4908 + }, + { + "epoch": 0.029195213626415452, + "grad_norm": 2.7728030681610107, + "learning_rate": 4.989496071358607e-05, + "loss": 5.8997, + "step": 4909 + }, + { + "epoch": 0.02920116090969645, + "grad_norm": 2.768862009048462, + "learning_rate": 4.989491793594432e-05, + "loss": 6.1267, + "step": 4910 + }, + { + "epoch": 0.02920710819297745, + "grad_norm": 2.4353668689727783, + "learning_rate": 4.989487514961201e-05, + "loss": 5.9087, + "step": 4911 + }, + { + "epoch": 0.029213055476258444, + "grad_norm": 2.5170469284057617, + "learning_rate": 4.9894832354589164e-05, + "loss": 6.0971, + "step": 4912 + }, + { + "epoch": 0.029219002759539443, + "grad_norm": 2.345998764038086, + "learning_rate": 4.9894789550875784e-05, + "loss": 6.2518, + "step": 4913 + }, + { + "epoch": 0.02922495004282044, + "grad_norm": 2.429123878479004, + "learning_rate": 4.98947467384719e-05, + "loss": 6.238, + "step": 4914 + }, + { + "epoch": 0.029230897326101436, + "grad_norm": 2.531514883041382, + "learning_rate": 4.9894703917377506e-05, + "loss": 6.0177, + "step": 4915 + }, + { + "epoch": 0.029236844609382435, + "grad_norm": 2.833874464035034, + "learning_rate": 4.9894661087592634e-05, + "loss": 6.2018, + "step": 4916 + }, + { + "epoch": 0.02924279189266343, + "grad_norm": 2.521381378173828, + "learning_rate": 4.9894618249117287e-05, + "loss": 6.1777, + "step": 4917 + }, + { + "epoch": 0.02924873917594443, + "grad_norm": 2.731703758239746, + "learning_rate": 4.989457540195149e-05, + "loss": 6.0237, + "step": 4918 + }, + { + "epoch": 0.029254686459225427, + "grad_norm": 2.918398141860962, + "learning_rate": 4.989453254609525e-05, + "loss": 6.5688, + "step": 4919 + }, + { + "epoch": 0.029260633742506422, + "grad_norm": 2.407552480697632, + "learning_rate": 4.989448968154859e-05, + "loss": 5.9751, + "step": 4920 + }, + { + "epoch": 0.02926658102578742, + "grad_norm": 2.575258731842041, + "learning_rate": 4.989444680831152e-05, + "loss": 5.7587, + "step": 4921 + }, + { + "epoch": 0.02927252830906842, + "grad_norm": 2.6550750732421875, + "learning_rate": 4.989440392638406e-05, + "loss": 6.6404, + "step": 4922 + }, + { + "epoch": 0.029278475592349414, + "grad_norm": 2.569438934326172, + "learning_rate": 4.989436103576621e-05, + "loss": 5.8615, + "step": 4923 + }, + { + "epoch": 0.029284422875630413, + "grad_norm": 2.4601991176605225, + "learning_rate": 4.989431813645801e-05, + "loss": 5.8969, + "step": 4924 + }, + { + "epoch": 0.029290370158911408, + "grad_norm": 3.579819917678833, + "learning_rate": 4.989427522845945e-05, + "loss": 5.8832, + "step": 4925 + }, + { + "epoch": 0.029296317442192406, + "grad_norm": 2.5762264728546143, + "learning_rate": 4.9894232311770556e-05, + "loss": 5.4841, + "step": 4926 + }, + { + "epoch": 0.029302264725473405, + "grad_norm": 3.352381706237793, + "learning_rate": 4.989418938639134e-05, + "loss": 5.8936, + "step": 4927 + }, + { + "epoch": 0.0293082120087544, + "grad_norm": 2.824322462081909, + "learning_rate": 4.9894146452321835e-05, + "loss": 5.8291, + "step": 4928 + }, + { + "epoch": 0.0293141592920354, + "grad_norm": 2.6431384086608887, + "learning_rate": 4.9894103509562026e-05, + "loss": 6.2519, + "step": 4929 + }, + { + "epoch": 0.029320106575316394, + "grad_norm": 3.0580949783325195, + "learning_rate": 4.989406055811195e-05, + "loss": 6.4141, + "step": 4930 + }, + { + "epoch": 0.029326053858597392, + "grad_norm": 2.757420778274536, + "learning_rate": 4.989401759797161e-05, + "loss": 6.1427, + "step": 4931 + }, + { + "epoch": 0.02933200114187839, + "grad_norm": 2.713111639022827, + "learning_rate": 4.989397462914103e-05, + "loss": 6.4107, + "step": 4932 + }, + { + "epoch": 0.029337948425159386, + "grad_norm": 2.7954351902008057, + "learning_rate": 4.9893931651620215e-05, + "loss": 5.7657, + "step": 4933 + }, + { + "epoch": 0.029343895708440385, + "grad_norm": 2.3637917041778564, + "learning_rate": 4.9893888665409196e-05, + "loss": 5.8209, + "step": 4934 + }, + { + "epoch": 0.029349842991721383, + "grad_norm": 2.938631296157837, + "learning_rate": 4.9893845670507964e-05, + "loss": 6.0502, + "step": 4935 + }, + { + "epoch": 0.029355790275002378, + "grad_norm": 2.8911824226379395, + "learning_rate": 4.989380266691655e-05, + "loss": 5.9736, + "step": 4936 + }, + { + "epoch": 0.029361737558283377, + "grad_norm": 2.9410245418548584, + "learning_rate": 4.989375965463498e-05, + "loss": 5.2824, + "step": 4937 + }, + { + "epoch": 0.029367684841564372, + "grad_norm": 2.4925217628479004, + "learning_rate": 4.9893716633663244e-05, + "loss": 5.5829, + "step": 4938 + }, + { + "epoch": 0.02937363212484537, + "grad_norm": 2.485349178314209, + "learning_rate": 4.9893673604001366e-05, + "loss": 5.8812, + "step": 4939 + }, + { + "epoch": 0.02937957940812637, + "grad_norm": 2.3950133323669434, + "learning_rate": 4.9893630565649376e-05, + "loss": 5.9314, + "step": 4940 + }, + { + "epoch": 0.029385526691407364, + "grad_norm": 2.28104829788208, + "learning_rate": 4.989358751860726e-05, + "loss": 6.1768, + "step": 4941 + }, + { + "epoch": 0.029391473974688363, + "grad_norm": 2.4479010105133057, + "learning_rate": 4.989354446287507e-05, + "loss": 6.1645, + "step": 4942 + }, + { + "epoch": 0.02939742125796936, + "grad_norm": 1.9465230703353882, + "learning_rate": 4.989350139845279e-05, + "loss": 5.7145, + "step": 4943 + }, + { + "epoch": 0.029403368541250356, + "grad_norm": 2.4120032787323, + "learning_rate": 4.989345832534045e-05, + "loss": 5.695, + "step": 4944 + }, + { + "epoch": 0.029409315824531355, + "grad_norm": 2.6345109939575195, + "learning_rate": 4.989341524353805e-05, + "loss": 5.4805, + "step": 4945 + }, + { + "epoch": 0.02941526310781235, + "grad_norm": 2.8750240802764893, + "learning_rate": 4.989337215304563e-05, + "loss": 5.0352, + "step": 4946 + }, + { + "epoch": 0.02942121039109335, + "grad_norm": 2.7220489978790283, + "learning_rate": 4.989332905386318e-05, + "loss": 5.1646, + "step": 4947 + }, + { + "epoch": 0.029427157674374347, + "grad_norm": 2.464871883392334, + "learning_rate": 4.9893285945990734e-05, + "loss": 4.9989, + "step": 4948 + }, + { + "epoch": 0.029433104957655342, + "grad_norm": 2.261049270629883, + "learning_rate": 4.989324282942829e-05, + "loss": 6.2217, + "step": 4949 + }, + { + "epoch": 0.02943905224093634, + "grad_norm": 2.224818468093872, + "learning_rate": 4.9893199704175876e-05, + "loss": 6.3964, + "step": 4950 + }, + { + "epoch": 0.02944499952421734, + "grad_norm": 2.366520643234253, + "learning_rate": 4.989315657023351e-05, + "loss": 6.3572, + "step": 4951 + }, + { + "epoch": 0.029450946807498334, + "grad_norm": 2.4811010360717773, + "learning_rate": 4.989311342760119e-05, + "loss": 5.7867, + "step": 4952 + }, + { + "epoch": 0.029456894090779333, + "grad_norm": 2.246730089187622, + "learning_rate": 4.989307027627895e-05, + "loss": 6.0865, + "step": 4953 + }, + { + "epoch": 0.029462841374060328, + "grad_norm": 2.297379493713379, + "learning_rate": 4.989302711626679e-05, + "loss": 5.9257, + "step": 4954 + }, + { + "epoch": 0.029468788657341326, + "grad_norm": 2.5890488624572754, + "learning_rate": 4.989298394756473e-05, + "loss": 5.7631, + "step": 4955 + }, + { + "epoch": 0.029474735940622325, + "grad_norm": 3.3777449131011963, + "learning_rate": 4.989294077017279e-05, + "loss": 5.4014, + "step": 4956 + }, + { + "epoch": 0.02948068322390332, + "grad_norm": 2.0395402908325195, + "learning_rate": 4.9892897584090986e-05, + "loss": 6.2429, + "step": 4957 + }, + { + "epoch": 0.02948663050718432, + "grad_norm": 2.0414693355560303, + "learning_rate": 4.989285438931932e-05, + "loss": 6.4685, + "step": 4958 + }, + { + "epoch": 0.029492577790465314, + "grad_norm": 2.2383265495300293, + "learning_rate": 4.989281118585783e-05, + "loss": 6.1651, + "step": 4959 + }, + { + "epoch": 0.029498525073746312, + "grad_norm": 2.559720754623413, + "learning_rate": 4.98927679737065e-05, + "loss": 6.3822, + "step": 4960 + }, + { + "epoch": 0.02950447235702731, + "grad_norm": 2.810699939727783, + "learning_rate": 4.989272475286537e-05, + "loss": 6.2076, + "step": 4961 + }, + { + "epoch": 0.029510419640308306, + "grad_norm": 2.9151525497436523, + "learning_rate": 4.989268152333445e-05, + "loss": 5.9892, + "step": 4962 + }, + { + "epoch": 0.029516366923589304, + "grad_norm": 2.295197010040283, + "learning_rate": 4.9892638285113744e-05, + "loss": 6.1392, + "step": 4963 + }, + { + "epoch": 0.029522314206870303, + "grad_norm": 2.271088123321533, + "learning_rate": 4.989259503820328e-05, + "loss": 6.6991, + "step": 4964 + }, + { + "epoch": 0.029528261490151298, + "grad_norm": 2.338074207305908, + "learning_rate": 4.9892551782603064e-05, + "loss": 5.9615, + "step": 4965 + }, + { + "epoch": 0.029534208773432297, + "grad_norm": 2.3510494232177734, + "learning_rate": 4.989250851831312e-05, + "loss": 5.8894, + "step": 4966 + }, + { + "epoch": 0.029540156056713292, + "grad_norm": 2.1170454025268555, + "learning_rate": 4.989246524533345e-05, + "loss": 5.6921, + "step": 4967 + }, + { + "epoch": 0.02954610333999429, + "grad_norm": 3.289508104324341, + "learning_rate": 4.989242196366409e-05, + "loss": 6.1689, + "step": 4968 + }, + { + "epoch": 0.02955205062327529, + "grad_norm": 2.068229913711548, + "learning_rate": 4.989237867330504e-05, + "loss": 6.3342, + "step": 4969 + }, + { + "epoch": 0.029557997906556284, + "grad_norm": 2.198928117752075, + "learning_rate": 4.9892335374256316e-05, + "loss": 6.5125, + "step": 4970 + }, + { + "epoch": 0.029563945189837283, + "grad_norm": 2.3634228706359863, + "learning_rate": 4.989229206651793e-05, + "loss": 5.8328, + "step": 4971 + }, + { + "epoch": 0.02956989247311828, + "grad_norm": 2.1632115840911865, + "learning_rate": 4.989224875008991e-05, + "loss": 6.0702, + "step": 4972 + }, + { + "epoch": 0.029575839756399276, + "grad_norm": 2.461888313293457, + "learning_rate": 4.989220542497226e-05, + "loss": 6.01, + "step": 4973 + }, + { + "epoch": 0.029581787039680275, + "grad_norm": 2.668333053588867, + "learning_rate": 4.9892162091164997e-05, + "loss": 6.0369, + "step": 4974 + }, + { + "epoch": 0.02958773432296127, + "grad_norm": 3.0210723876953125, + "learning_rate": 4.9892118748668135e-05, + "loss": 6.0652, + "step": 4975 + }, + { + "epoch": 0.02959368160624227, + "grad_norm": 2.937350034713745, + "learning_rate": 4.98920753974817e-05, + "loss": 6.0205, + "step": 4976 + }, + { + "epoch": 0.029599628889523267, + "grad_norm": 2.904499053955078, + "learning_rate": 4.9892032037605685e-05, + "loss": 5.9561, + "step": 4977 + }, + { + "epoch": 0.029605576172804262, + "grad_norm": 2.218867778778076, + "learning_rate": 4.989198866904013e-05, + "loss": 5.4173, + "step": 4978 + }, + { + "epoch": 0.02961152345608526, + "grad_norm": 3.009920835494995, + "learning_rate": 4.9891945291785034e-05, + "loss": 5.5577, + "step": 4979 + }, + { + "epoch": 0.02961747073936626, + "grad_norm": 2.731687545776367, + "learning_rate": 4.9891901905840424e-05, + "loss": 5.6591, + "step": 4980 + }, + { + "epoch": 0.029623418022647254, + "grad_norm": 2.244101047515869, + "learning_rate": 4.98918585112063e-05, + "loss": 6.1434, + "step": 4981 + }, + { + "epoch": 0.029629365305928253, + "grad_norm": 2.3366870880126953, + "learning_rate": 4.989181510788269e-05, + "loss": 6.0132, + "step": 4982 + }, + { + "epoch": 0.029635312589209248, + "grad_norm": 3.2757890224456787, + "learning_rate": 4.98917716958696e-05, + "loss": 5.7486, + "step": 4983 + }, + { + "epoch": 0.029641259872490246, + "grad_norm": 2.361041784286499, + "learning_rate": 4.989172827516705e-05, + "loss": 5.8192, + "step": 4984 + }, + { + "epoch": 0.029647207155771245, + "grad_norm": 3.3433775901794434, + "learning_rate": 4.9891684845775054e-05, + "loss": 5.8688, + "step": 4985 + }, + { + "epoch": 0.02965315443905224, + "grad_norm": 2.6427462100982666, + "learning_rate": 4.9891641407693635e-05, + "loss": 5.9459, + "step": 4986 + }, + { + "epoch": 0.02965910172233324, + "grad_norm": 3.0931055545806885, + "learning_rate": 4.9891597960922795e-05, + "loss": 6.4822, + "step": 4987 + }, + { + "epoch": 0.029665049005614237, + "grad_norm": 2.598477840423584, + "learning_rate": 4.989155450546256e-05, + "loss": 6.0362, + "step": 4988 + }, + { + "epoch": 0.029670996288895232, + "grad_norm": 2.460313081741333, + "learning_rate": 4.989151104131294e-05, + "loss": 5.6209, + "step": 4989 + }, + { + "epoch": 0.02967694357217623, + "grad_norm": 2.4712390899658203, + "learning_rate": 4.989146756847395e-05, + "loss": 6.3849, + "step": 4990 + }, + { + "epoch": 0.029682890855457226, + "grad_norm": 2.365860939025879, + "learning_rate": 4.98914240869456e-05, + "loss": 6.2791, + "step": 4991 + }, + { + "epoch": 0.029688838138738224, + "grad_norm": 2.6213366985321045, + "learning_rate": 4.9891380596727915e-05, + "loss": 6.2888, + "step": 4992 + }, + { + "epoch": 0.029694785422019223, + "grad_norm": 2.742213487625122, + "learning_rate": 4.989133709782091e-05, + "loss": 6.3522, + "step": 4993 + }, + { + "epoch": 0.029700732705300218, + "grad_norm": 2.2428665161132812, + "learning_rate": 4.9891293590224594e-05, + "loss": 6.6735, + "step": 4994 + }, + { + "epoch": 0.029706679988581217, + "grad_norm": 2.4242279529571533, + "learning_rate": 4.989125007393898e-05, + "loss": 6.2283, + "step": 4995 + }, + { + "epoch": 0.02971262727186221, + "grad_norm": 2.422177314758301, + "learning_rate": 4.989120654896409e-05, + "loss": 6.0273, + "step": 4996 + }, + { + "epoch": 0.02971857455514321, + "grad_norm": 2.4325926303863525, + "learning_rate": 4.989116301529994e-05, + "loss": 5.9504, + "step": 4997 + }, + { + "epoch": 0.02972452183842421, + "grad_norm": 2.42901873588562, + "learning_rate": 4.9891119472946544e-05, + "loss": 5.8156, + "step": 4998 + }, + { + "epoch": 0.029730469121705204, + "grad_norm": 2.4361307621002197, + "learning_rate": 4.989107592190391e-05, + "loss": 5.9025, + "step": 4999 + }, + { + "epoch": 0.029736416404986202, + "grad_norm": 2.9486470222473145, + "learning_rate": 4.9891032362172065e-05, + "loss": 6.3204, + "step": 5000 + }, + { + "epoch": 0.0297423636882672, + "grad_norm": 2.456681966781616, + "learning_rate": 4.989098879375101e-05, + "loss": 5.8203, + "step": 5001 + }, + { + "epoch": 0.029748310971548196, + "grad_norm": 2.5065391063690186, + "learning_rate": 4.9890945216640775e-05, + "loss": 6.452, + "step": 5002 + }, + { + "epoch": 0.029754258254829195, + "grad_norm": 2.386488199234009, + "learning_rate": 4.989090163084136e-05, + "loss": 5.9195, + "step": 5003 + }, + { + "epoch": 0.02976020553811019, + "grad_norm": 2.1387040615081787, + "learning_rate": 4.9890858036352796e-05, + "loss": 6.2127, + "step": 5004 + }, + { + "epoch": 0.02976615282139119, + "grad_norm": 2.518099784851074, + "learning_rate": 4.989081443317508e-05, + "loss": 6.1099, + "step": 5005 + }, + { + "epoch": 0.029772100104672187, + "grad_norm": 3.2108826637268066, + "learning_rate": 4.989077082130825e-05, + "loss": 5.9808, + "step": 5006 + }, + { + "epoch": 0.029778047387953182, + "grad_norm": 2.176065444946289, + "learning_rate": 4.9890727200752304e-05, + "loss": 6.0825, + "step": 5007 + }, + { + "epoch": 0.02978399467123418, + "grad_norm": 2.2961249351501465, + "learning_rate": 4.9890683571507265e-05, + "loss": 5.968, + "step": 5008 + }, + { + "epoch": 0.02978994195451518, + "grad_norm": 2.1954386234283447, + "learning_rate": 4.9890639933573144e-05, + "loss": 6.0799, + "step": 5009 + }, + { + "epoch": 0.029795889237796174, + "grad_norm": 2.256039619445801, + "learning_rate": 4.989059628694995e-05, + "loss": 5.9503, + "step": 5010 + }, + { + "epoch": 0.029801836521077173, + "grad_norm": 2.4350922107696533, + "learning_rate": 4.9890552631637715e-05, + "loss": 5.6741, + "step": 5011 + }, + { + "epoch": 0.029807783804358168, + "grad_norm": 2.68904447555542, + "learning_rate": 4.989050896763645e-05, + "loss": 5.5872, + "step": 5012 + }, + { + "epoch": 0.029813731087639166, + "grad_norm": 2.2877871990203857, + "learning_rate": 4.989046529494615e-05, + "loss": 6.1273, + "step": 5013 + }, + { + "epoch": 0.029819678370920165, + "grad_norm": 2.350348711013794, + "learning_rate": 4.989042161356686e-05, + "loss": 6.1113, + "step": 5014 + }, + { + "epoch": 0.02982562565420116, + "grad_norm": 2.295382499694824, + "learning_rate": 4.989037792349858e-05, + "loss": 6.036, + "step": 5015 + }, + { + "epoch": 0.02983157293748216, + "grad_norm": 2.317863941192627, + "learning_rate": 4.989033422474131e-05, + "loss": 5.961, + "step": 5016 + }, + { + "epoch": 0.029837520220763157, + "grad_norm": 2.286289930343628, + "learning_rate": 4.9890290517295095e-05, + "loss": 5.8163, + "step": 5017 + }, + { + "epoch": 0.029843467504044152, + "grad_norm": 2.246863842010498, + "learning_rate": 4.989024680115993e-05, + "loss": 5.9689, + "step": 5018 + }, + { + "epoch": 0.02984941478732515, + "grad_norm": 1.8732661008834839, + "learning_rate": 4.989020307633585e-05, + "loss": 5.9046, + "step": 5019 + }, + { + "epoch": 0.029855362070606146, + "grad_norm": 2.0211753845214844, + "learning_rate": 4.989015934282285e-05, + "loss": 5.95, + "step": 5020 + }, + { + "epoch": 0.029861309353887144, + "grad_norm": 2.014890193939209, + "learning_rate": 4.9890115600620946e-05, + "loss": 5.7312, + "step": 5021 + }, + { + "epoch": 0.029867256637168143, + "grad_norm": 2.2749524116516113, + "learning_rate": 4.989007184973017e-05, + "loss": 6.2573, + "step": 5022 + }, + { + "epoch": 0.029873203920449138, + "grad_norm": 2.080747604370117, + "learning_rate": 4.989002809015052e-05, + "loss": 5.7607, + "step": 5023 + }, + { + "epoch": 0.029879151203730137, + "grad_norm": 2.3403279781341553, + "learning_rate": 4.988998432188202e-05, + "loss": 5.7876, + "step": 5024 + }, + { + "epoch": 0.02988509848701113, + "grad_norm": 2.573802947998047, + "learning_rate": 4.988994054492468e-05, + "loss": 5.9036, + "step": 5025 + }, + { + "epoch": 0.02989104577029213, + "grad_norm": 2.267409324645996, + "learning_rate": 4.988989675927853e-05, + "loss": 5.7433, + "step": 5026 + }, + { + "epoch": 0.02989699305357313, + "grad_norm": 2.8241517543792725, + "learning_rate": 4.9889852964943566e-05, + "loss": 6.2338, + "step": 5027 + }, + { + "epoch": 0.029902940336854124, + "grad_norm": 2.338927745819092, + "learning_rate": 4.988980916191982e-05, + "loss": 6.0226, + "step": 5028 + }, + { + "epoch": 0.029908887620135122, + "grad_norm": 2.0798492431640625, + "learning_rate": 4.9889765350207285e-05, + "loss": 5.6919, + "step": 5029 + }, + { + "epoch": 0.02991483490341612, + "grad_norm": 2.3199923038482666, + "learning_rate": 4.9889721529806e-05, + "loss": 5.7533, + "step": 5030 + }, + { + "epoch": 0.029920782186697116, + "grad_norm": 2.1074399948120117, + "learning_rate": 4.988967770071596e-05, + "loss": 5.7486, + "step": 5031 + }, + { + "epoch": 0.029926729469978115, + "grad_norm": 2.2539381980895996, + "learning_rate": 4.9889633862937205e-05, + "loss": 5.6816, + "step": 5032 + }, + { + "epoch": 0.02993267675325911, + "grad_norm": 2.1393015384674072, + "learning_rate": 4.9889590016469726e-05, + "loss": 5.6635, + "step": 5033 + }, + { + "epoch": 0.029938624036540108, + "grad_norm": 2.6661975383758545, + "learning_rate": 4.988954616131355e-05, + "loss": 6.0218, + "step": 5034 + }, + { + "epoch": 0.029944571319821107, + "grad_norm": 2.6529600620269775, + "learning_rate": 4.988950229746869e-05, + "loss": 5.8847, + "step": 5035 + }, + { + "epoch": 0.029950518603102102, + "grad_norm": 2.510859966278076, + "learning_rate": 4.988945842493517e-05, + "loss": 5.7154, + "step": 5036 + }, + { + "epoch": 0.0299564658863831, + "grad_norm": 2.875394105911255, + "learning_rate": 4.9889414543712985e-05, + "loss": 5.6304, + "step": 5037 + }, + { + "epoch": 0.0299624131696641, + "grad_norm": 2.718808650970459, + "learning_rate": 4.988937065380217e-05, + "loss": 5.6562, + "step": 5038 + }, + { + "epoch": 0.029968360452945094, + "grad_norm": 2.702265501022339, + "learning_rate": 4.988932675520273e-05, + "loss": 5.6484, + "step": 5039 + }, + { + "epoch": 0.029974307736226093, + "grad_norm": 2.765209436416626, + "learning_rate": 4.988928284791469e-05, + "loss": 5.793, + "step": 5040 + }, + { + "epoch": 0.029980255019507088, + "grad_norm": 3.386352062225342, + "learning_rate": 4.9889238931938047e-05, + "loss": 5.5392, + "step": 5041 + }, + { + "epoch": 0.029986202302788086, + "grad_norm": 2.1632583141326904, + "learning_rate": 4.988919500727284e-05, + "loss": 5.8032, + "step": 5042 + }, + { + "epoch": 0.029992149586069085, + "grad_norm": 2.4121060371398926, + "learning_rate": 4.9889151073919064e-05, + "loss": 5.9793, + "step": 5043 + }, + { + "epoch": 0.02999809686935008, + "grad_norm": 2.2160584926605225, + "learning_rate": 4.988910713187674e-05, + "loss": 5.8802, + "step": 5044 + }, + { + "epoch": 0.03000404415263108, + "grad_norm": 3.120509386062622, + "learning_rate": 4.988906318114589e-05, + "loss": 5.5691, + "step": 5045 + }, + { + "epoch": 0.030009991435912077, + "grad_norm": 3.0660078525543213, + "learning_rate": 4.988901922172652e-05, + "loss": 5.3687, + "step": 5046 + }, + { + "epoch": 0.030015938719193072, + "grad_norm": 1.939757227897644, + "learning_rate": 4.988897525361867e-05, + "loss": 5.526, + "step": 5047 + }, + { + "epoch": 0.03002188600247407, + "grad_norm": 2.2970168590545654, + "learning_rate": 4.9888931276822315e-05, + "loss": 5.6334, + "step": 5048 + }, + { + "epoch": 0.030027833285755066, + "grad_norm": 2.162632942199707, + "learning_rate": 4.988888729133749e-05, + "loss": 5.8887, + "step": 5049 + }, + { + "epoch": 0.030033780569036064, + "grad_norm": 2.027017831802368, + "learning_rate": 4.9888843297164223e-05, + "loss": 5.9237, + "step": 5050 + }, + { + "epoch": 0.030039727852317063, + "grad_norm": 1.9226456880569458, + "learning_rate": 4.988879929430251e-05, + "loss": 5.6833, + "step": 5051 + }, + { + "epoch": 0.030045675135598058, + "grad_norm": 1.6490615606307983, + "learning_rate": 4.9888755282752384e-05, + "loss": 5.5738, + "step": 5052 + }, + { + "epoch": 0.030051622418879056, + "grad_norm": 2.456385850906372, + "learning_rate": 4.9888711262513846e-05, + "loss": 5.3771, + "step": 5053 + }, + { + "epoch": 0.03005756970216005, + "grad_norm": 2.480044364929199, + "learning_rate": 4.988866723358692e-05, + "loss": 5.2456, + "step": 5054 + }, + { + "epoch": 0.03006351698544105, + "grad_norm": 2.4033162593841553, + "learning_rate": 4.988862319597161e-05, + "loss": 5.1629, + "step": 5055 + }, + { + "epoch": 0.03006946426872205, + "grad_norm": 2.7228541374206543, + "learning_rate": 4.9888579149667935e-05, + "loss": 5.0195, + "step": 5056 + }, + { + "epoch": 0.030075411552003044, + "grad_norm": 2.4641635417938232, + "learning_rate": 4.9888535094675926e-05, + "loss": 5.3259, + "step": 5057 + }, + { + "epoch": 0.030081358835284042, + "grad_norm": 2.443666458129883, + "learning_rate": 4.9888491030995575e-05, + "loss": 5.4212, + "step": 5058 + }, + { + "epoch": 0.03008730611856504, + "grad_norm": 2.3267531394958496, + "learning_rate": 4.988844695862692e-05, + "loss": 5.6517, + "step": 5059 + }, + { + "epoch": 0.030093253401846036, + "grad_norm": 1.9090640544891357, + "learning_rate": 4.988840287756996e-05, + "loss": 5.7946, + "step": 5060 + }, + { + "epoch": 0.030099200685127035, + "grad_norm": 1.6169202327728271, + "learning_rate": 4.988835878782472e-05, + "loss": 5.7332, + "step": 5061 + }, + { + "epoch": 0.03010514796840803, + "grad_norm": 1.9369432926177979, + "learning_rate": 4.9888314689391205e-05, + "loss": 5.5954, + "step": 5062 + }, + { + "epoch": 0.030111095251689028, + "grad_norm": 2.0444133281707764, + "learning_rate": 4.9888270582269434e-05, + "loss": 5.5332, + "step": 5063 + }, + { + "epoch": 0.030117042534970027, + "grad_norm": 1.949061632156372, + "learning_rate": 4.988822646645943e-05, + "loss": 5.6064, + "step": 5064 + }, + { + "epoch": 0.030122989818251022, + "grad_norm": 1.5208648443222046, + "learning_rate": 4.988818234196121e-05, + "loss": 5.6615, + "step": 5065 + }, + { + "epoch": 0.03012893710153202, + "grad_norm": 1.8466709852218628, + "learning_rate": 4.988813820877477e-05, + "loss": 5.79, + "step": 5066 + }, + { + "epoch": 0.03013488438481302, + "grad_norm": 1.7094037532806396, + "learning_rate": 4.988809406690015e-05, + "loss": 5.8194, + "step": 5067 + }, + { + "epoch": 0.030140831668094014, + "grad_norm": 1.5698916912078857, + "learning_rate": 4.988804991633734e-05, + "loss": 5.5981, + "step": 5068 + }, + { + "epoch": 0.030146778951375013, + "grad_norm": 2.032156467437744, + "learning_rate": 4.988800575708638e-05, + "loss": 5.6729, + "step": 5069 + }, + { + "epoch": 0.030152726234656008, + "grad_norm": 1.9716484546661377, + "learning_rate": 4.988796158914727e-05, + "loss": 5.5227, + "step": 5070 + }, + { + "epoch": 0.030158673517937006, + "grad_norm": 1.8809682130813599, + "learning_rate": 4.988791741252002e-05, + "loss": 5.6231, + "step": 5071 + }, + { + "epoch": 0.030164620801218005, + "grad_norm": 1.8293371200561523, + "learning_rate": 4.9887873227204675e-05, + "loss": 5.5067, + "step": 5072 + }, + { + "epoch": 0.030170568084499, + "grad_norm": 2.225281000137329, + "learning_rate": 4.988782903320122e-05, + "loss": 5.3056, + "step": 5073 + }, + { + "epoch": 0.03017651536778, + "grad_norm": 2.0776474475860596, + "learning_rate": 4.988778483050968e-05, + "loss": 5.206, + "step": 5074 + }, + { + "epoch": 0.030182462651060997, + "grad_norm": 2.068323850631714, + "learning_rate": 4.9887740619130076e-05, + "loss": 5.5975, + "step": 5075 + }, + { + "epoch": 0.030188409934341992, + "grad_norm": 2.077782392501831, + "learning_rate": 4.988769639906241e-05, + "loss": 5.6967, + "step": 5076 + }, + { + "epoch": 0.03019435721762299, + "grad_norm": 1.9837195873260498, + "learning_rate": 4.988765217030672e-05, + "loss": 5.7834, + "step": 5077 + }, + { + "epoch": 0.030200304500903986, + "grad_norm": 1.9612236022949219, + "learning_rate": 4.9887607932863e-05, + "loss": 5.5472, + "step": 5078 + }, + { + "epoch": 0.030206251784184984, + "grad_norm": 2.022251605987549, + "learning_rate": 4.988756368673127e-05, + "loss": 5.704, + "step": 5079 + }, + { + "epoch": 0.030212199067465983, + "grad_norm": 2.02227783203125, + "learning_rate": 4.988751943191156e-05, + "loss": 5.4125, + "step": 5080 + }, + { + "epoch": 0.030218146350746978, + "grad_norm": 2.0527732372283936, + "learning_rate": 4.9887475168403856e-05, + "loss": 5.464, + "step": 5081 + }, + { + "epoch": 0.030224093634027976, + "grad_norm": 2.1465423107147217, + "learning_rate": 4.9887430896208205e-05, + "loss": 5.3415, + "step": 5082 + }, + { + "epoch": 0.03023004091730897, + "grad_norm": 1.9170550107955933, + "learning_rate": 4.9887386615324606e-05, + "loss": 5.5762, + "step": 5083 + }, + { + "epoch": 0.03023598820058997, + "grad_norm": 3.367650032043457, + "learning_rate": 4.988734232575307e-05, + "loss": 6.26, + "step": 5084 + }, + { + "epoch": 0.03024193548387097, + "grad_norm": 2.0784621238708496, + "learning_rate": 4.988729802749363e-05, + "loss": 5.5316, + "step": 5085 + }, + { + "epoch": 0.030247882767151964, + "grad_norm": 1.9531089067459106, + "learning_rate": 4.988725372054629e-05, + "loss": 5.5901, + "step": 5086 + }, + { + "epoch": 0.030253830050432962, + "grad_norm": 1.9677239656448364, + "learning_rate": 4.988720940491106e-05, + "loss": 5.4963, + "step": 5087 + }, + { + "epoch": 0.03025977733371396, + "grad_norm": 1.9835426807403564, + "learning_rate": 4.988716508058797e-05, + "loss": 5.6355, + "step": 5088 + }, + { + "epoch": 0.030265724616994956, + "grad_norm": 1.908250331878662, + "learning_rate": 4.988712074757703e-05, + "loss": 5.165, + "step": 5089 + }, + { + "epoch": 0.030271671900275954, + "grad_norm": 1.9852073192596436, + "learning_rate": 4.9887076405878246e-05, + "loss": 5.6623, + "step": 5090 + }, + { + "epoch": 0.03027761918355695, + "grad_norm": 1.9073505401611328, + "learning_rate": 4.988703205549164e-05, + "loss": 5.6685, + "step": 5091 + }, + { + "epoch": 0.030283566466837948, + "grad_norm": 1.744931697845459, + "learning_rate": 4.988698769641724e-05, + "loss": 5.4004, + "step": 5092 + }, + { + "epoch": 0.030289513750118947, + "grad_norm": 2.0623345375061035, + "learning_rate": 4.9886943328655034e-05, + "loss": 5.3846, + "step": 5093 + }, + { + "epoch": 0.030295461033399942, + "grad_norm": 1.647375226020813, + "learning_rate": 4.9886898952205064e-05, + "loss": 5.5823, + "step": 5094 + }, + { + "epoch": 0.03030140831668094, + "grad_norm": 2.2364108562469482, + "learning_rate": 4.9886854567067334e-05, + "loss": 5.5959, + "step": 5095 + }, + { + "epoch": 0.03030735559996194, + "grad_norm": 2.059187650680542, + "learning_rate": 4.988681017324185e-05, + "loss": 5.6043, + "step": 5096 + }, + { + "epoch": 0.030313302883242934, + "grad_norm": 1.8996437788009644, + "learning_rate": 4.988676577072865e-05, + "loss": 5.4366, + "step": 5097 + }, + { + "epoch": 0.030319250166523933, + "grad_norm": 2.0983266830444336, + "learning_rate": 4.988672135952773e-05, + "loss": 5.5568, + "step": 5098 + }, + { + "epoch": 0.030325197449804928, + "grad_norm": 2.065119743347168, + "learning_rate": 4.988667693963911e-05, + "loss": 5.4239, + "step": 5099 + }, + { + "epoch": 0.030331144733085926, + "grad_norm": 1.9394044876098633, + "learning_rate": 4.988663251106282e-05, + "loss": 5.573, + "step": 5100 + }, + { + "epoch": 0.030337092016366925, + "grad_norm": 2.225097417831421, + "learning_rate": 4.9886588073798855e-05, + "loss": 5.5877, + "step": 5101 + }, + { + "epoch": 0.03034303929964792, + "grad_norm": 2.185018539428711, + "learning_rate": 4.9886543627847236e-05, + "loss": 5.6884, + "step": 5102 + }, + { + "epoch": 0.03034898658292892, + "grad_norm": 1.9751871824264526, + "learning_rate": 4.988649917320799e-05, + "loss": 5.4836, + "step": 5103 + }, + { + "epoch": 0.030354933866209917, + "grad_norm": 1.8753101825714111, + "learning_rate": 4.988645470988113e-05, + "loss": 5.4049, + "step": 5104 + }, + { + "epoch": 0.030360881149490912, + "grad_norm": 2.12246036529541, + "learning_rate": 4.988641023786665e-05, + "loss": 5.5365, + "step": 5105 + }, + { + "epoch": 0.03036682843277191, + "grad_norm": 2.1078991889953613, + "learning_rate": 4.988636575716459e-05, + "loss": 5.5269, + "step": 5106 + }, + { + "epoch": 0.030372775716052906, + "grad_norm": 1.9127923250198364, + "learning_rate": 4.9886321267774946e-05, + "loss": 5.48, + "step": 5107 + }, + { + "epoch": 0.030378722999333904, + "grad_norm": 1.8971906900405884, + "learning_rate": 4.988627676969776e-05, + "loss": 5.5202, + "step": 5108 + }, + { + "epoch": 0.030384670282614903, + "grad_norm": 2.162097454071045, + "learning_rate": 4.9886232262933024e-05, + "loss": 5.5229, + "step": 5109 + }, + { + "epoch": 0.030390617565895898, + "grad_norm": 2.21211838722229, + "learning_rate": 4.988618774748076e-05, + "loss": 5.3648, + "step": 5110 + }, + { + "epoch": 0.030396564849176896, + "grad_norm": 1.8907619714736938, + "learning_rate": 4.988614322334099e-05, + "loss": 5.4338, + "step": 5111 + }, + { + "epoch": 0.030402512132457895, + "grad_norm": 2.0131993293762207, + "learning_rate": 4.9886098690513725e-05, + "loss": 5.4005, + "step": 5112 + }, + { + "epoch": 0.03040845941573889, + "grad_norm": 1.9474748373031616, + "learning_rate": 4.9886054148998975e-05, + "loss": 5.5544, + "step": 5113 + }, + { + "epoch": 0.03041440669901989, + "grad_norm": 1.9809894561767578, + "learning_rate": 4.988600959879676e-05, + "loss": 5.6204, + "step": 5114 + }, + { + "epoch": 0.030420353982300884, + "grad_norm": 2.1792514324188232, + "learning_rate": 4.9885965039907104e-05, + "loss": 5.5368, + "step": 5115 + }, + { + "epoch": 0.030426301265581882, + "grad_norm": 2.050903081893921, + "learning_rate": 4.9885920472330004e-05, + "loss": 5.4717, + "step": 5116 + }, + { + "epoch": 0.03043224854886288, + "grad_norm": 1.9938042163848877, + "learning_rate": 4.988587589606549e-05, + "loss": 5.5373, + "step": 5117 + }, + { + "epoch": 0.030438195832143876, + "grad_norm": 1.7375110387802124, + "learning_rate": 4.988583131111358e-05, + "loss": 5.5621, + "step": 5118 + }, + { + "epoch": 0.030444143115424874, + "grad_norm": 2.077605962753296, + "learning_rate": 4.988578671747428e-05, + "loss": 5.5451, + "step": 5119 + }, + { + "epoch": 0.03045009039870587, + "grad_norm": 2.071706771850586, + "learning_rate": 4.988574211514761e-05, + "loss": 5.327, + "step": 5120 + }, + { + "epoch": 0.030456037681986868, + "grad_norm": 1.8317911624908447, + "learning_rate": 4.9885697504133574e-05, + "loss": 5.4123, + "step": 5121 + }, + { + "epoch": 0.030461984965267867, + "grad_norm": 2.1231188774108887, + "learning_rate": 4.988565288443221e-05, + "loss": 5.3789, + "step": 5122 + }, + { + "epoch": 0.03046793224854886, + "grad_norm": 2.1298999786376953, + "learning_rate": 4.988560825604352e-05, + "loss": 5.4382, + "step": 5123 + }, + { + "epoch": 0.03047387953182986, + "grad_norm": 1.791053056716919, + "learning_rate": 4.9885563618967525e-05, + "loss": 5.3918, + "step": 5124 + }, + { + "epoch": 0.03047982681511086, + "grad_norm": 1.9610999822616577, + "learning_rate": 4.988551897320423e-05, + "loss": 5.3232, + "step": 5125 + }, + { + "epoch": 0.030485774098391854, + "grad_norm": 1.9926520586013794, + "learning_rate": 4.9885474318753654e-05, + "loss": 5.4316, + "step": 5126 + }, + { + "epoch": 0.030491721381672852, + "grad_norm": 1.8942431211471558, + "learning_rate": 4.988542965561582e-05, + "loss": 5.4055, + "step": 5127 + }, + { + "epoch": 0.030497668664953848, + "grad_norm": 1.7872856855392456, + "learning_rate": 4.988538498379074e-05, + "loss": 5.5117, + "step": 5128 + }, + { + "epoch": 0.030503615948234846, + "grad_norm": 2.040205478668213, + "learning_rate": 4.988534030327843e-05, + "loss": 5.4068, + "step": 5129 + }, + { + "epoch": 0.030509563231515845, + "grad_norm": 2.0108931064605713, + "learning_rate": 4.988529561407891e-05, + "loss": 5.3636, + "step": 5130 + }, + { + "epoch": 0.03051551051479684, + "grad_norm": 2.0339555740356445, + "learning_rate": 4.988525091619218e-05, + "loss": 5.2811, + "step": 5131 + }, + { + "epoch": 0.03052145779807784, + "grad_norm": 1.7631195783615112, + "learning_rate": 4.988520620961828e-05, + "loss": 5.3407, + "step": 5132 + }, + { + "epoch": 0.030527405081358837, + "grad_norm": 1.6906533241271973, + "learning_rate": 4.988516149435719e-05, + "loss": 5.3121, + "step": 5133 + }, + { + "epoch": 0.030533352364639832, + "grad_norm": 2.0753448009490967, + "learning_rate": 4.988511677040897e-05, + "loss": 5.4532, + "step": 5134 + }, + { + "epoch": 0.03053929964792083, + "grad_norm": 1.9836634397506714, + "learning_rate": 4.9885072037773595e-05, + "loss": 5.4345, + "step": 5135 + }, + { + "epoch": 0.030545246931201826, + "grad_norm": 1.8526780605316162, + "learning_rate": 4.988502729645111e-05, + "loss": 5.446, + "step": 5136 + }, + { + "epoch": 0.030551194214482824, + "grad_norm": 2.126626968383789, + "learning_rate": 4.988498254644152e-05, + "loss": 5.703, + "step": 5137 + }, + { + "epoch": 0.030557141497763823, + "grad_norm": 1.9711220264434814, + "learning_rate": 4.988493778774483e-05, + "loss": 5.5872, + "step": 5138 + }, + { + "epoch": 0.030563088781044818, + "grad_norm": 2.070727586746216, + "learning_rate": 4.988489302036107e-05, + "loss": 5.4407, + "step": 5139 + }, + { + "epoch": 0.030569036064325816, + "grad_norm": 2.1414859294891357, + "learning_rate": 4.988484824429025e-05, + "loss": 5.5291, + "step": 5140 + }, + { + "epoch": 0.030574983347606815, + "grad_norm": 2.01366925239563, + "learning_rate": 4.9884803459532384e-05, + "loss": 5.3561, + "step": 5141 + }, + { + "epoch": 0.03058093063088781, + "grad_norm": 1.851836085319519, + "learning_rate": 4.988475866608749e-05, + "loss": 5.679, + "step": 5142 + }, + { + "epoch": 0.03058687791416881, + "grad_norm": 1.6984909772872925, + "learning_rate": 4.988471386395559e-05, + "loss": 5.6075, + "step": 5143 + }, + { + "epoch": 0.030592825197449804, + "grad_norm": 1.9371756315231323, + "learning_rate": 4.9884669053136696e-05, + "loss": 5.7062, + "step": 5144 + }, + { + "epoch": 0.030598772480730802, + "grad_norm": 1.9286617040634155, + "learning_rate": 4.9884624233630815e-05, + "loss": 5.573, + "step": 5145 + }, + { + "epoch": 0.0306047197640118, + "grad_norm": 2.7633650302886963, + "learning_rate": 4.988457940543797e-05, + "loss": 6.2082, + "step": 5146 + }, + { + "epoch": 0.030610667047292796, + "grad_norm": 2.6948676109313965, + "learning_rate": 4.9884534568558173e-05, + "loss": 5.7475, + "step": 5147 + }, + { + "epoch": 0.030616614330573794, + "grad_norm": 2.1618316173553467, + "learning_rate": 4.988448972299145e-05, + "loss": 5.4049, + "step": 5148 + }, + { + "epoch": 0.03062256161385479, + "grad_norm": 2.417043685913086, + "learning_rate": 4.98844448687378e-05, + "loss": 5.3663, + "step": 5149 + }, + { + "epoch": 0.030628508897135788, + "grad_norm": 1.9748867750167847, + "learning_rate": 4.988440000579725e-05, + "loss": 5.1876, + "step": 5150 + }, + { + "epoch": 0.030634456180416787, + "grad_norm": 2.0534770488739014, + "learning_rate": 4.988435513416981e-05, + "loss": 5.4519, + "step": 5151 + }, + { + "epoch": 0.03064040346369778, + "grad_norm": 1.9772714376449585, + "learning_rate": 4.98843102538555e-05, + "loss": 5.5241, + "step": 5152 + }, + { + "epoch": 0.03064635074697878, + "grad_norm": 2.4160993099212646, + "learning_rate": 4.988426536485434e-05, + "loss": 5.6535, + "step": 5153 + }, + { + "epoch": 0.03065229803025978, + "grad_norm": 1.9931175708770752, + "learning_rate": 4.9884220467166345e-05, + "loss": 5.6693, + "step": 5154 + }, + { + "epoch": 0.030658245313540774, + "grad_norm": 1.9071956872940063, + "learning_rate": 4.9884175560791516e-05, + "loss": 5.5533, + "step": 5155 + }, + { + "epoch": 0.030664192596821772, + "grad_norm": 1.8562983274459839, + "learning_rate": 4.9884130645729876e-05, + "loss": 5.5621, + "step": 5156 + }, + { + "epoch": 0.030670139880102767, + "grad_norm": 2.087606430053711, + "learning_rate": 4.9884085721981446e-05, + "loss": 5.5256, + "step": 5157 + }, + { + "epoch": 0.030676087163383766, + "grad_norm": 2.3242955207824707, + "learning_rate": 4.988404078954624e-05, + "loss": 5.3906, + "step": 5158 + }, + { + "epoch": 0.030682034446664765, + "grad_norm": 2.221330404281616, + "learning_rate": 4.988399584842427e-05, + "loss": 5.5719, + "step": 5159 + }, + { + "epoch": 0.03068798172994576, + "grad_norm": 1.7819960117340088, + "learning_rate": 4.988395089861556e-05, + "loss": 5.5823, + "step": 5160 + }, + { + "epoch": 0.030693929013226758, + "grad_norm": 1.781802773475647, + "learning_rate": 4.988390594012011e-05, + "loss": 5.6087, + "step": 5161 + }, + { + "epoch": 0.030699876296507757, + "grad_norm": 2.0003581047058105, + "learning_rate": 4.988386097293796e-05, + "loss": 5.5695, + "step": 5162 + }, + { + "epoch": 0.030705823579788752, + "grad_norm": 1.9411736726760864, + "learning_rate": 4.98838159970691e-05, + "loss": 5.441, + "step": 5163 + }, + { + "epoch": 0.03071177086306975, + "grad_norm": 2.159541368484497, + "learning_rate": 4.9883771012513556e-05, + "loss": 5.6191, + "step": 5164 + }, + { + "epoch": 0.030717718146350746, + "grad_norm": 2.1045689582824707, + "learning_rate": 4.988372601927135e-05, + "loss": 5.3261, + "step": 5165 + }, + { + "epoch": 0.030723665429631744, + "grad_norm": 2.004770040512085, + "learning_rate": 4.988368101734249e-05, + "loss": 5.3392, + "step": 5166 + }, + { + "epoch": 0.030729612712912743, + "grad_norm": 2.1851232051849365, + "learning_rate": 4.9883636006726996e-05, + "loss": 5.3048, + "step": 5167 + }, + { + "epoch": 0.030735559996193738, + "grad_norm": 2.1333882808685303, + "learning_rate": 4.988359098742488e-05, + "loss": 5.336, + "step": 5168 + }, + { + "epoch": 0.030741507279474736, + "grad_norm": 2.1911604404449463, + "learning_rate": 4.9883545959436165e-05, + "loss": 5.757, + "step": 5169 + }, + { + "epoch": 0.030747454562755735, + "grad_norm": 2.0385994911193848, + "learning_rate": 4.988350092276085e-05, + "loss": 5.7889, + "step": 5170 + }, + { + "epoch": 0.03075340184603673, + "grad_norm": 2.2300381660461426, + "learning_rate": 4.988345587739897e-05, + "loss": 5.3812, + "step": 5171 + }, + { + "epoch": 0.03075934912931773, + "grad_norm": 2.4643938541412354, + "learning_rate": 4.988341082335053e-05, + "loss": 5.2503, + "step": 5172 + }, + { + "epoch": 0.030765296412598724, + "grad_norm": 2.0791194438934326, + "learning_rate": 4.988336576061555e-05, + "loss": 5.2958, + "step": 5173 + }, + { + "epoch": 0.030771243695879722, + "grad_norm": 2.1123111248016357, + "learning_rate": 4.988332068919405e-05, + "loss": 5.3656, + "step": 5174 + }, + { + "epoch": 0.03077719097916072, + "grad_norm": 2.199747323989868, + "learning_rate": 4.9883275609086026e-05, + "loss": 5.7015, + "step": 5175 + }, + { + "epoch": 0.030783138262441716, + "grad_norm": 2.0083510875701904, + "learning_rate": 4.988323052029151e-05, + "loss": 5.7068, + "step": 5176 + }, + { + "epoch": 0.030789085545722714, + "grad_norm": 2.1027777194976807, + "learning_rate": 4.988318542281053e-05, + "loss": 5.6986, + "step": 5177 + }, + { + "epoch": 0.03079503282900371, + "grad_norm": 1.8593190908432007, + "learning_rate": 4.9883140316643074e-05, + "loss": 5.7194, + "step": 5178 + }, + { + "epoch": 0.030800980112284708, + "grad_norm": 1.9712544679641724, + "learning_rate": 4.988309520178918e-05, + "loss": 5.6472, + "step": 5179 + }, + { + "epoch": 0.030806927395565707, + "grad_norm": 2.1114501953125, + "learning_rate": 4.9883050078248836e-05, + "loss": 5.6767, + "step": 5180 + }, + { + "epoch": 0.0308128746788467, + "grad_norm": 3.0505895614624023, + "learning_rate": 4.988300494602209e-05, + "loss": 5.3705, + "step": 5181 + }, + { + "epoch": 0.0308188219621277, + "grad_norm": 2.648364782333374, + "learning_rate": 4.988295980510895e-05, + "loss": 5.3072, + "step": 5182 + }, + { + "epoch": 0.0308247692454087, + "grad_norm": 2.2162837982177734, + "learning_rate": 4.9882914655509414e-05, + "loss": 5.3359, + "step": 5183 + }, + { + "epoch": 0.030830716528689694, + "grad_norm": 2.16666316986084, + "learning_rate": 4.988286949722352e-05, + "loss": 5.3446, + "step": 5184 + }, + { + "epoch": 0.030836663811970692, + "grad_norm": 2.951157569885254, + "learning_rate": 4.988282433025126e-05, + "loss": 5.7776, + "step": 5185 + }, + { + "epoch": 0.030842611095251687, + "grad_norm": 2.9967124462127686, + "learning_rate": 4.988277915459267e-05, + "loss": 5.6004, + "step": 5186 + }, + { + "epoch": 0.030848558378532686, + "grad_norm": 2.3998372554779053, + "learning_rate": 4.988273397024777e-05, + "loss": 5.3562, + "step": 5187 + }, + { + "epoch": 0.030854505661813685, + "grad_norm": 2.290592670440674, + "learning_rate": 4.9882688777216544e-05, + "loss": 5.3211, + "step": 5188 + }, + { + "epoch": 0.03086045294509468, + "grad_norm": 2.0349433422088623, + "learning_rate": 4.988264357549904e-05, + "loss": 5.2917, + "step": 5189 + }, + { + "epoch": 0.030866400228375678, + "grad_norm": 1.922006607055664, + "learning_rate": 4.988259836509526e-05, + "loss": 5.2297, + "step": 5190 + }, + { + "epoch": 0.030872347511656677, + "grad_norm": 1.9518259763717651, + "learning_rate": 4.9882553146005225e-05, + "loss": 5.2232, + "step": 5191 + }, + { + "epoch": 0.030878294794937672, + "grad_norm": 2.1054210662841797, + "learning_rate": 4.988250791822894e-05, + "loss": 5.3705, + "step": 5192 + }, + { + "epoch": 0.03088424207821867, + "grad_norm": 2.0954079627990723, + "learning_rate": 4.988246268176644e-05, + "loss": 5.2522, + "step": 5193 + }, + { + "epoch": 0.030890189361499665, + "grad_norm": 1.8628660440444946, + "learning_rate": 4.9882417436617724e-05, + "loss": 5.3856, + "step": 5194 + }, + { + "epoch": 0.030896136644780664, + "grad_norm": 2.2788021564483643, + "learning_rate": 4.988237218278281e-05, + "loss": 5.4399, + "step": 5195 + }, + { + "epoch": 0.030902083928061663, + "grad_norm": 1.981086015701294, + "learning_rate": 4.9882326920261717e-05, + "loss": 5.2853, + "step": 5196 + }, + { + "epoch": 0.030908031211342658, + "grad_norm": 1.9278241395950317, + "learning_rate": 4.988228164905446e-05, + "loss": 5.3997, + "step": 5197 + }, + { + "epoch": 0.030913978494623656, + "grad_norm": 1.842748999595642, + "learning_rate": 4.988223636916106e-05, + "loss": 5.3215, + "step": 5198 + }, + { + "epoch": 0.030919925777904655, + "grad_norm": 1.9974339008331299, + "learning_rate": 4.988219108058153e-05, + "loss": 5.4851, + "step": 5199 + }, + { + "epoch": 0.03092587306118565, + "grad_norm": 2.015939474105835, + "learning_rate": 4.988214578331588e-05, + "loss": 5.322, + "step": 5200 + }, + { + "epoch": 0.03093182034446665, + "grad_norm": 2.035209894180298, + "learning_rate": 4.9882100477364135e-05, + "loss": 5.3896, + "step": 5201 + }, + { + "epoch": 0.030937767627747643, + "grad_norm": 1.9803009033203125, + "learning_rate": 4.9882055162726296e-05, + "loss": 5.2624, + "step": 5202 + }, + { + "epoch": 0.030943714911028642, + "grad_norm": 1.9504352807998657, + "learning_rate": 4.98820098394024e-05, + "loss": 5.2333, + "step": 5203 + }, + { + "epoch": 0.03094966219430964, + "grad_norm": 1.850542664527893, + "learning_rate": 4.9881964507392443e-05, + "loss": 5.5632, + "step": 5204 + }, + { + "epoch": 0.030955609477590636, + "grad_norm": 1.8594067096710205, + "learning_rate": 4.9881919166696456e-05, + "loss": 5.3775, + "step": 5205 + }, + { + "epoch": 0.030961556760871634, + "grad_norm": 2.019274950027466, + "learning_rate": 4.988187381731444e-05, + "loss": 5.4565, + "step": 5206 + }, + { + "epoch": 0.030967504044152633, + "grad_norm": 1.7151249647140503, + "learning_rate": 4.988182845924643e-05, + "loss": 5.5984, + "step": 5207 + }, + { + "epoch": 0.030973451327433628, + "grad_norm": 2.5127339363098145, + "learning_rate": 4.988178309249242e-05, + "loss": 6.2724, + "step": 5208 + }, + { + "epoch": 0.030979398610714626, + "grad_norm": 1.869344711303711, + "learning_rate": 4.9881737717052436e-05, + "loss": 5.5408, + "step": 5209 + }, + { + "epoch": 0.03098534589399562, + "grad_norm": 2.035419225692749, + "learning_rate": 4.98816923329265e-05, + "loss": 5.4154, + "step": 5210 + }, + { + "epoch": 0.03099129317727662, + "grad_norm": 1.7084250450134277, + "learning_rate": 4.9881646940114624e-05, + "loss": 5.6327, + "step": 5211 + }, + { + "epoch": 0.03099724046055762, + "grad_norm": 2.1035211086273193, + "learning_rate": 4.9881601538616816e-05, + "loss": 5.5041, + "step": 5212 + }, + { + "epoch": 0.031003187743838614, + "grad_norm": 1.920366883277893, + "learning_rate": 4.9881556128433105e-05, + "loss": 5.5919, + "step": 5213 + }, + { + "epoch": 0.031009135027119612, + "grad_norm": 2.000555992126465, + "learning_rate": 4.988151070956349e-05, + "loss": 5.5078, + "step": 5214 + }, + { + "epoch": 0.031015082310400607, + "grad_norm": 1.9930146932601929, + "learning_rate": 4.9881465282008e-05, + "loss": 5.5002, + "step": 5215 + }, + { + "epoch": 0.031021029593681606, + "grad_norm": 2.163329839706421, + "learning_rate": 4.988141984576665e-05, + "loss": 5.3504, + "step": 5216 + }, + { + "epoch": 0.031026976876962604, + "grad_norm": 1.766228437423706, + "learning_rate": 4.988137440083946e-05, + "loss": 5.5304, + "step": 5217 + }, + { + "epoch": 0.0310329241602436, + "grad_norm": 2.1399648189544678, + "learning_rate": 4.988132894722644e-05, + "loss": 5.4757, + "step": 5218 + }, + { + "epoch": 0.031038871443524598, + "grad_norm": 2.2287001609802246, + "learning_rate": 4.988128348492759e-05, + "loss": 5.4902, + "step": 5219 + }, + { + "epoch": 0.031044818726805597, + "grad_norm": 2.095080852508545, + "learning_rate": 4.988123801394295e-05, + "loss": 5.3462, + "step": 5220 + }, + { + "epoch": 0.031050766010086592, + "grad_norm": 2.0873003005981445, + "learning_rate": 4.988119253427253e-05, + "loss": 5.2825, + "step": 5221 + }, + { + "epoch": 0.03105671329336759, + "grad_norm": 2.0918655395507812, + "learning_rate": 4.988114704591633e-05, + "loss": 5.2859, + "step": 5222 + }, + { + "epoch": 0.031062660576648585, + "grad_norm": 1.9637762308120728, + "learning_rate": 4.9881101548874384e-05, + "loss": 5.4687, + "step": 5223 + }, + { + "epoch": 0.031068607859929584, + "grad_norm": 2.046672821044922, + "learning_rate": 4.988105604314671e-05, + "loss": 5.5095, + "step": 5224 + }, + { + "epoch": 0.031074555143210583, + "grad_norm": 2.0264053344726562, + "learning_rate": 4.988101052873332e-05, + "loss": 5.4221, + "step": 5225 + }, + { + "epoch": 0.031080502426491578, + "grad_norm": 1.9367676973342896, + "learning_rate": 4.9880965005634216e-05, + "loss": 5.1881, + "step": 5226 + }, + { + "epoch": 0.031086449709772576, + "grad_norm": 2.0398001670837402, + "learning_rate": 4.9880919473849425e-05, + "loss": 5.4938, + "step": 5227 + }, + { + "epoch": 0.031092396993053575, + "grad_norm": 2.037411689758301, + "learning_rate": 4.988087393337896e-05, + "loss": 5.0893, + "step": 5228 + }, + { + "epoch": 0.03109834427633457, + "grad_norm": 2.1337075233459473, + "learning_rate": 4.988082838422285e-05, + "loss": 4.9822, + "step": 5229 + }, + { + "epoch": 0.03110429155961557, + "grad_norm": 1.9911794662475586, + "learning_rate": 4.988078282638109e-05, + "loss": 5.2472, + "step": 5230 + }, + { + "epoch": 0.031110238842896563, + "grad_norm": 2.1050829887390137, + "learning_rate": 4.98807372598537e-05, + "loss": 5.3478, + "step": 5231 + }, + { + "epoch": 0.031116186126177562, + "grad_norm": 1.9364343881607056, + "learning_rate": 4.988069168464071e-05, + "loss": 5.2551, + "step": 5232 + }, + { + "epoch": 0.03112213340945856, + "grad_norm": 1.9834885597229004, + "learning_rate": 4.988064610074213e-05, + "loss": 5.2147, + "step": 5233 + }, + { + "epoch": 0.031128080692739556, + "grad_norm": 2.0815906524658203, + "learning_rate": 4.9880600508157974e-05, + "loss": 5.1607, + "step": 5234 + }, + { + "epoch": 0.031134027976020554, + "grad_norm": 1.9558357000350952, + "learning_rate": 4.988055490688825e-05, + "loss": 5.4, + "step": 5235 + }, + { + "epoch": 0.031139975259301553, + "grad_norm": 1.9036076068878174, + "learning_rate": 4.9880509296932986e-05, + "loss": 5.4953, + "step": 5236 + }, + { + "epoch": 0.031145922542582548, + "grad_norm": 2.4709548950195312, + "learning_rate": 4.98804636782922e-05, + "loss": 5.2628, + "step": 5237 + }, + { + "epoch": 0.031151869825863546, + "grad_norm": 2.2380030155181885, + "learning_rate": 4.988041805096589e-05, + "loss": 5.2423, + "step": 5238 + }, + { + "epoch": 0.03115781710914454, + "grad_norm": 2.348639726638794, + "learning_rate": 4.988037241495409e-05, + "loss": 5.1966, + "step": 5239 + }, + { + "epoch": 0.03116376439242554, + "grad_norm": 1.9384468793869019, + "learning_rate": 4.9880326770256805e-05, + "loss": 5.47, + "step": 5240 + }, + { + "epoch": 0.03116971167570654, + "grad_norm": 2.2664244174957275, + "learning_rate": 4.988028111687406e-05, + "loss": 5.5511, + "step": 5241 + }, + { + "epoch": 0.031175658958987534, + "grad_norm": 2.1356422901153564, + "learning_rate": 4.988023545480586e-05, + "loss": 5.6462, + "step": 5242 + }, + { + "epoch": 0.031181606242268532, + "grad_norm": 2.240190267562866, + "learning_rate": 4.9880189784052226e-05, + "loss": 5.3494, + "step": 5243 + }, + { + "epoch": 0.031187553525549527, + "grad_norm": 1.8032485246658325, + "learning_rate": 4.988014410461318e-05, + "loss": 5.2305, + "step": 5244 + }, + { + "epoch": 0.031193500808830526, + "grad_norm": 2.177501678466797, + "learning_rate": 4.988009841648873e-05, + "loss": 5.1891, + "step": 5245 + }, + { + "epoch": 0.031199448092111524, + "grad_norm": 2.157317876815796, + "learning_rate": 4.988005271967889e-05, + "loss": 5.1038, + "step": 5246 + }, + { + "epoch": 0.03120539537539252, + "grad_norm": 1.9995821714401245, + "learning_rate": 4.988000701418369e-05, + "loss": 5.1098, + "step": 5247 + }, + { + "epoch": 0.031211342658673518, + "grad_norm": 2.201558828353882, + "learning_rate": 4.987996130000313e-05, + "loss": 5.0702, + "step": 5248 + }, + { + "epoch": 0.031217289941954517, + "grad_norm": 2.065645933151245, + "learning_rate": 4.987991557713724e-05, + "loss": 5.2012, + "step": 5249 + }, + { + "epoch": 0.03122323722523551, + "grad_norm": 1.908347487449646, + "learning_rate": 4.9879869845586024e-05, + "loss": 5.0913, + "step": 5250 + }, + { + "epoch": 0.03122918450851651, + "grad_norm": 1.913979411125183, + "learning_rate": 4.98798241053495e-05, + "loss": 5.0036, + "step": 5251 + }, + { + "epoch": 0.031235131791797505, + "grad_norm": 2.217616558074951, + "learning_rate": 4.9879778356427686e-05, + "loss": 5.0621, + "step": 5252 + }, + { + "epoch": 0.031241079075078504, + "grad_norm": 2.419713258743286, + "learning_rate": 4.9879732598820605e-05, + "loss": 5.1264, + "step": 5253 + }, + { + "epoch": 0.031247026358359502, + "grad_norm": 2.298295497894287, + "learning_rate": 4.987968683252826e-05, + "loss": 5.0576, + "step": 5254 + }, + { + "epoch": 0.0312529736416405, + "grad_norm": 2.120589256286621, + "learning_rate": 4.987964105755067e-05, + "loss": 5.175, + "step": 5255 + }, + { + "epoch": 0.031258920924921496, + "grad_norm": 2.3129806518554688, + "learning_rate": 4.987959527388787e-05, + "loss": 5.1827, + "step": 5256 + }, + { + "epoch": 0.03126486820820249, + "grad_norm": 2.251680612564087, + "learning_rate": 4.9879549481539846e-05, + "loss": 5.0473, + "step": 5257 + }, + { + "epoch": 0.03127081549148349, + "grad_norm": 2.101229429244995, + "learning_rate": 4.987950368050663e-05, + "loss": 5.0453, + "step": 5258 + }, + { + "epoch": 0.03127676277476449, + "grad_norm": 2.189565420150757, + "learning_rate": 4.987945787078824e-05, + "loss": 5.087, + "step": 5259 + }, + { + "epoch": 0.03128271005804548, + "grad_norm": 2.05485463142395, + "learning_rate": 4.9879412052384687e-05, + "loss": 5.0192, + "step": 5260 + }, + { + "epoch": 0.031288657341326485, + "grad_norm": 1.8166489601135254, + "learning_rate": 4.9879366225295994e-05, + "loss": 5.0456, + "step": 5261 + }, + { + "epoch": 0.03129460462460748, + "grad_norm": 2.1403279304504395, + "learning_rate": 4.9879320389522165e-05, + "loss": 4.9455, + "step": 5262 + }, + { + "epoch": 0.031300551907888476, + "grad_norm": 1.8833802938461304, + "learning_rate": 4.9879274545063226e-05, + "loss": 5.0891, + "step": 5263 + }, + { + "epoch": 0.03130649919116947, + "grad_norm": 2.000692367553711, + "learning_rate": 4.987922869191918e-05, + "loss": 5.1125, + "step": 5264 + }, + { + "epoch": 0.03131244647445047, + "grad_norm": 1.947544813156128, + "learning_rate": 4.9879182830090065e-05, + "loss": 4.9139, + "step": 5265 + }, + { + "epoch": 0.03131839375773147, + "grad_norm": 1.8827823400497437, + "learning_rate": 4.987913695957588e-05, + "loss": 5.0154, + "step": 5266 + }, + { + "epoch": 0.03132434104101246, + "grad_norm": 2.268115997314453, + "learning_rate": 4.987909108037664e-05, + "loss": 5.0379, + "step": 5267 + }, + { + "epoch": 0.031330288324293465, + "grad_norm": 1.85139000415802, + "learning_rate": 4.987904519249237e-05, + "loss": 4.9428, + "step": 5268 + }, + { + "epoch": 0.03133623560757446, + "grad_norm": 2.208338737487793, + "learning_rate": 4.987899929592308e-05, + "loss": 4.9366, + "step": 5269 + }, + { + "epoch": 0.031342182890855455, + "grad_norm": 3.5571236610412598, + "learning_rate": 4.987895339066879e-05, + "loss": 6.8471, + "step": 5270 + }, + { + "epoch": 0.03134813017413646, + "grad_norm": 2.000157594680786, + "learning_rate": 4.9878907476729516e-05, + "loss": 5.025, + "step": 5271 + }, + { + "epoch": 0.03135407745741745, + "grad_norm": 2.0588366985321045, + "learning_rate": 4.987886155410527e-05, + "loss": 4.8955, + "step": 5272 + }, + { + "epoch": 0.03136002474069845, + "grad_norm": 2.217839241027832, + "learning_rate": 4.9878815622796074e-05, + "loss": 4.9889, + "step": 5273 + }, + { + "epoch": 0.03136597202397945, + "grad_norm": 2.2453126907348633, + "learning_rate": 4.987876968280194e-05, + "loss": 5.3774, + "step": 5274 + }, + { + "epoch": 0.031371919307260444, + "grad_norm": 1.9839471578598022, + "learning_rate": 4.9878723734122876e-05, + "loss": 4.993, + "step": 5275 + }, + { + "epoch": 0.03137786659054144, + "grad_norm": 1.9534602165222168, + "learning_rate": 4.987867777675892e-05, + "loss": 4.9079, + "step": 5276 + }, + { + "epoch": 0.031383813873822435, + "grad_norm": 1.96163809299469, + "learning_rate": 4.9878631810710066e-05, + "loss": 4.9829, + "step": 5277 + }, + { + "epoch": 0.03138976115710344, + "grad_norm": 2.0814366340637207, + "learning_rate": 4.987858583597634e-05, + "loss": 4.8731, + "step": 5278 + }, + { + "epoch": 0.03139570844038443, + "grad_norm": 1.9846211671829224, + "learning_rate": 4.987853985255776e-05, + "loss": 4.9495, + "step": 5279 + }, + { + "epoch": 0.03140165572366543, + "grad_norm": 2.1237289905548096, + "learning_rate": 4.9878493860454335e-05, + "loss": 5.3887, + "step": 5280 + }, + { + "epoch": 0.03140760300694643, + "grad_norm": 2.1526784896850586, + "learning_rate": 4.9878447859666086e-05, + "loss": 5.3603, + "step": 5281 + }, + { + "epoch": 0.031413550290227424, + "grad_norm": 2.0563082695007324, + "learning_rate": 4.987840185019303e-05, + "loss": 5.4104, + "step": 5282 + }, + { + "epoch": 0.03141949757350842, + "grad_norm": 2.0586647987365723, + "learning_rate": 4.9878355832035175e-05, + "loss": 5.517, + "step": 5283 + }, + { + "epoch": 0.03142544485678942, + "grad_norm": 1.8817695379257202, + "learning_rate": 4.9878309805192546e-05, + "loss": 5.3616, + "step": 5284 + }, + { + "epoch": 0.031431392140070416, + "grad_norm": 2.0987086296081543, + "learning_rate": 4.987826376966516e-05, + "loss": 5.3237, + "step": 5285 + }, + { + "epoch": 0.03143733942335141, + "grad_norm": 2.3505301475524902, + "learning_rate": 4.987821772545302e-05, + "loss": 5.5165, + "step": 5286 + }, + { + "epoch": 0.03144328670663241, + "grad_norm": 2.1199939250946045, + "learning_rate": 4.987817167255616e-05, + "loss": 5.3029, + "step": 5287 + }, + { + "epoch": 0.03144923398991341, + "grad_norm": 1.7463518381118774, + "learning_rate": 4.987812561097458e-05, + "loss": 5.3589, + "step": 5288 + }, + { + "epoch": 0.0314551812731944, + "grad_norm": 1.9957356452941895, + "learning_rate": 4.987807954070831e-05, + "loss": 5.2459, + "step": 5289 + }, + { + "epoch": 0.031461128556475405, + "grad_norm": 1.7865337133407593, + "learning_rate": 4.987803346175736e-05, + "loss": 5.3041, + "step": 5290 + }, + { + "epoch": 0.0314670758397564, + "grad_norm": 1.82949960231781, + "learning_rate": 4.9877987374121744e-05, + "loss": 5.5761, + "step": 5291 + }, + { + "epoch": 0.031473023123037396, + "grad_norm": 1.974692940711975, + "learning_rate": 4.9877941277801475e-05, + "loss": 5.5033, + "step": 5292 + }, + { + "epoch": 0.03147897040631839, + "grad_norm": 2.1808922290802, + "learning_rate": 4.9877895172796577e-05, + "loss": 5.6739, + "step": 5293 + }, + { + "epoch": 0.03148491768959939, + "grad_norm": 2.7555716037750244, + "learning_rate": 4.987784905910706e-05, + "loss": 5.2489, + "step": 5294 + }, + { + "epoch": 0.03149086497288039, + "grad_norm": 2.475541353225708, + "learning_rate": 4.9877802936732955e-05, + "loss": 5.2304, + "step": 5295 + }, + { + "epoch": 0.03149681225616138, + "grad_norm": 1.945482611656189, + "learning_rate": 4.987775680567425e-05, + "loss": 5.4085, + "step": 5296 + }, + { + "epoch": 0.031502759539442385, + "grad_norm": 1.9879848957061768, + "learning_rate": 4.987771066593099e-05, + "loss": 5.5372, + "step": 5297 + }, + { + "epoch": 0.03150870682272338, + "grad_norm": 2.0529556274414062, + "learning_rate": 4.987766451750317e-05, + "loss": 5.578, + "step": 5298 + }, + { + "epoch": 0.031514654106004375, + "grad_norm": 1.7769572734832764, + "learning_rate": 4.9877618360390816e-05, + "loss": 5.5348, + "step": 5299 + }, + { + "epoch": 0.03152060138928538, + "grad_norm": 1.9111005067825317, + "learning_rate": 4.987757219459395e-05, + "loss": 5.4267, + "step": 5300 + }, + { + "epoch": 0.03152654867256637, + "grad_norm": 1.9047571420669556, + "learning_rate": 4.987752602011256e-05, + "loss": 5.433, + "step": 5301 + }, + { + "epoch": 0.03153249595584737, + "grad_norm": 1.9031875133514404, + "learning_rate": 4.98774798369467e-05, + "loss": 5.4929, + "step": 5302 + }, + { + "epoch": 0.03153844323912837, + "grad_norm": 1.858656883239746, + "learning_rate": 4.987743364509637e-05, + "loss": 5.3583, + "step": 5303 + }, + { + "epoch": 0.031544390522409364, + "grad_norm": 1.9254835844039917, + "learning_rate": 4.987738744456158e-05, + "loss": 5.4885, + "step": 5304 + }, + { + "epoch": 0.03155033780569036, + "grad_norm": 1.96173095703125, + "learning_rate": 4.987734123534235e-05, + "loss": 5.4869, + "step": 5305 + }, + { + "epoch": 0.031556285088971354, + "grad_norm": 1.7857433557510376, + "learning_rate": 4.98772950174387e-05, + "loss": 5.3845, + "step": 5306 + }, + { + "epoch": 0.031562232372252357, + "grad_norm": 1.9360556602478027, + "learning_rate": 4.9877248790850636e-05, + "loss": 5.3809, + "step": 5307 + }, + { + "epoch": 0.03156817965553335, + "grad_norm": 2.2044126987457275, + "learning_rate": 4.9877202555578197e-05, + "loss": 5.2413, + "step": 5308 + }, + { + "epoch": 0.03157412693881435, + "grad_norm": 1.8200992345809937, + "learning_rate": 4.9877156311621365e-05, + "loss": 5.6241, + "step": 5309 + }, + { + "epoch": 0.03158007422209535, + "grad_norm": 2.0771358013153076, + "learning_rate": 4.987711005898019e-05, + "loss": 5.6854, + "step": 5310 + }, + { + "epoch": 0.031586021505376344, + "grad_norm": 1.8330012559890747, + "learning_rate": 4.987706379765466e-05, + "loss": 5.712, + "step": 5311 + }, + { + "epoch": 0.03159196878865734, + "grad_norm": 1.941501498222351, + "learning_rate": 4.987701752764481e-05, + "loss": 5.4131, + "step": 5312 + }, + { + "epoch": 0.03159791607193834, + "grad_norm": 1.8688616752624512, + "learning_rate": 4.987697124895065e-05, + "loss": 5.3719, + "step": 5313 + }, + { + "epoch": 0.031603863355219336, + "grad_norm": 1.8723224401474, + "learning_rate": 4.98769249615722e-05, + "loss": 5.665, + "step": 5314 + }, + { + "epoch": 0.03160981063850033, + "grad_norm": 1.9460058212280273, + "learning_rate": 4.9876878665509474e-05, + "loss": 5.7048, + "step": 5315 + }, + { + "epoch": 0.03161575792178133, + "grad_norm": 1.9752602577209473, + "learning_rate": 4.987683236076248e-05, + "loss": 5.7098, + "step": 5316 + }, + { + "epoch": 0.03162170520506233, + "grad_norm": 1.8122695684432983, + "learning_rate": 4.9876786047331244e-05, + "loss": 5.2717, + "step": 5317 + }, + { + "epoch": 0.03162765248834332, + "grad_norm": 1.961983323097229, + "learning_rate": 4.9876739725215775e-05, + "loss": 5.5593, + "step": 5318 + }, + { + "epoch": 0.031633599771624325, + "grad_norm": 1.7362732887268066, + "learning_rate": 4.98766933944161e-05, + "loss": 5.5002, + "step": 5319 + }, + { + "epoch": 0.03163954705490532, + "grad_norm": 2.084033489227295, + "learning_rate": 4.9876647054932226e-05, + "loss": 5.5398, + "step": 5320 + }, + { + "epoch": 0.031645494338186315, + "grad_norm": 1.869452953338623, + "learning_rate": 4.9876600706764165e-05, + "loss": 5.5985, + "step": 5321 + }, + { + "epoch": 0.03165144162146731, + "grad_norm": 3.597667694091797, + "learning_rate": 4.9876554349911943e-05, + "loss": 5.4143, + "step": 5322 + }, + { + "epoch": 0.03165738890474831, + "grad_norm": 2.2364773750305176, + "learning_rate": 4.9876507984375574e-05, + "loss": 5.3756, + "step": 5323 + }, + { + "epoch": 0.03166333618802931, + "grad_norm": 2.0204551219940186, + "learning_rate": 4.987646161015508e-05, + "loss": 5.4964, + "step": 5324 + }, + { + "epoch": 0.0316692834713103, + "grad_norm": 1.7375823259353638, + "learning_rate": 4.987641522725046e-05, + "loss": 5.5249, + "step": 5325 + }, + { + "epoch": 0.031675230754591305, + "grad_norm": 1.661597728729248, + "learning_rate": 4.987636883566175e-05, + "loss": 5.4828, + "step": 5326 + }, + { + "epoch": 0.0316811780378723, + "grad_norm": 1.8612693548202515, + "learning_rate": 4.9876322435388944e-05, + "loss": 5.4711, + "step": 5327 + }, + { + "epoch": 0.031687125321153295, + "grad_norm": 1.8282328844070435, + "learning_rate": 4.987627602643208e-05, + "loss": 5.5234, + "step": 5328 + }, + { + "epoch": 0.0316930726044343, + "grad_norm": 1.951170802116394, + "learning_rate": 4.987622960879116e-05, + "loss": 5.4117, + "step": 5329 + }, + { + "epoch": 0.03169901988771529, + "grad_norm": 1.819174885749817, + "learning_rate": 4.9876183182466207e-05, + "loss": 5.3446, + "step": 5330 + }, + { + "epoch": 0.03170496717099629, + "grad_norm": 1.8710874319076538, + "learning_rate": 4.9876136747457245e-05, + "loss": 5.3755, + "step": 5331 + }, + { + "epoch": 0.03171091445427729, + "grad_norm": 2.1957387924194336, + "learning_rate": 4.9876090303764264e-05, + "loss": 6.3036, + "step": 5332 + }, + { + "epoch": 0.031716861737558284, + "grad_norm": 1.774741530418396, + "learning_rate": 4.987604385138731e-05, + "loss": 5.3822, + "step": 5333 + }, + { + "epoch": 0.03172280902083928, + "grad_norm": 1.793230414390564, + "learning_rate": 4.987599739032638e-05, + "loss": 5.4224, + "step": 5334 + }, + { + "epoch": 0.031728756304120274, + "grad_norm": 1.7986340522766113, + "learning_rate": 4.98759509205815e-05, + "loss": 5.3939, + "step": 5335 + }, + { + "epoch": 0.031734703587401276, + "grad_norm": 1.7775462865829468, + "learning_rate": 4.9875904442152675e-05, + "loss": 5.4356, + "step": 5336 + }, + { + "epoch": 0.03174065087068227, + "grad_norm": 1.882104516029358, + "learning_rate": 4.987585795503994e-05, + "loss": 5.2852, + "step": 5337 + }, + { + "epoch": 0.03174659815396327, + "grad_norm": 1.9842430353164673, + "learning_rate": 4.987581145924329e-05, + "loss": 5.4089, + "step": 5338 + }, + { + "epoch": 0.03175254543724427, + "grad_norm": 1.7098103761672974, + "learning_rate": 4.9875764954762754e-05, + "loss": 5.2442, + "step": 5339 + }, + { + "epoch": 0.031758492720525264, + "grad_norm": 1.8304857015609741, + "learning_rate": 4.9875718441598354e-05, + "loss": 5.5403, + "step": 5340 + }, + { + "epoch": 0.03176444000380626, + "grad_norm": 2.0763137340545654, + "learning_rate": 4.987567191975009e-05, + "loss": 5.8295, + "step": 5341 + }, + { + "epoch": 0.03177038728708726, + "grad_norm": 1.907271385192871, + "learning_rate": 4.9875625389217984e-05, + "loss": 5.6979, + "step": 5342 + }, + { + "epoch": 0.031776334570368256, + "grad_norm": 2.1263620853424072, + "learning_rate": 4.9875578850002056e-05, + "loss": 5.7713, + "step": 5343 + }, + { + "epoch": 0.03178228185364925, + "grad_norm": 2.038358211517334, + "learning_rate": 4.987553230210232e-05, + "loss": 6.0019, + "step": 5344 + }, + { + "epoch": 0.03178822913693025, + "grad_norm": 1.5671371221542358, + "learning_rate": 4.987548574551879e-05, + "loss": 5.9237, + "step": 5345 + }, + { + "epoch": 0.03179417642021125, + "grad_norm": 1.9159321784973145, + "learning_rate": 4.987543918025149e-05, + "loss": 6.0363, + "step": 5346 + }, + { + "epoch": 0.03180012370349224, + "grad_norm": 1.8012747764587402, + "learning_rate": 4.987539260630043e-05, + "loss": 5.901, + "step": 5347 + }, + { + "epoch": 0.031806070986773245, + "grad_norm": 2.154933214187622, + "learning_rate": 4.9875346023665625e-05, + "loss": 5.6379, + "step": 5348 + }, + { + "epoch": 0.03181201827005424, + "grad_norm": 2.191539764404297, + "learning_rate": 4.98752994323471e-05, + "loss": 5.5322, + "step": 5349 + }, + { + "epoch": 0.031817965553335235, + "grad_norm": 2.0007123947143555, + "learning_rate": 4.9875252832344856e-05, + "loss": 5.7398, + "step": 5350 + }, + { + "epoch": 0.03182391283661623, + "grad_norm": 1.7119163274765015, + "learning_rate": 4.9875206223658924e-05, + "loss": 5.8507, + "step": 5351 + }, + { + "epoch": 0.03182986011989723, + "grad_norm": 1.8882098197937012, + "learning_rate": 4.987515960628931e-05, + "loss": 5.8668, + "step": 5352 + }, + { + "epoch": 0.03183580740317823, + "grad_norm": 2.005493402481079, + "learning_rate": 4.987511298023604e-05, + "loss": 5.9672, + "step": 5353 + }, + { + "epoch": 0.03184175468645922, + "grad_norm": 1.858807921409607, + "learning_rate": 4.987506634549912e-05, + "loss": 5.9344, + "step": 5354 + }, + { + "epoch": 0.031847701969740225, + "grad_norm": 2.2698724269866943, + "learning_rate": 4.987501970207858e-05, + "loss": 5.6553, + "step": 5355 + }, + { + "epoch": 0.03185364925302122, + "grad_norm": 1.7690725326538086, + "learning_rate": 4.987497304997442e-05, + "loss": 5.6255, + "step": 5356 + }, + { + "epoch": 0.031859596536302215, + "grad_norm": 2.008002758026123, + "learning_rate": 4.987492638918667e-05, + "loss": 5.5578, + "step": 5357 + }, + { + "epoch": 0.03186554381958322, + "grad_norm": 1.6483304500579834, + "learning_rate": 4.987487971971533e-05, + "loss": 5.4786, + "step": 5358 + }, + { + "epoch": 0.03187149110286421, + "grad_norm": 1.9136204719543457, + "learning_rate": 4.987483304156044e-05, + "loss": 5.6043, + "step": 5359 + }, + { + "epoch": 0.03187743838614521, + "grad_norm": 1.9811625480651855, + "learning_rate": 4.987478635472199e-05, + "loss": 5.6172, + "step": 5360 + }, + { + "epoch": 0.03188338566942621, + "grad_norm": 2.012134075164795, + "learning_rate": 4.987473965920002e-05, + "loss": 5.6715, + "step": 5361 + }, + { + "epoch": 0.031889332952707204, + "grad_norm": 1.930550217628479, + "learning_rate": 4.987469295499453e-05, + "loss": 5.516, + "step": 5362 + }, + { + "epoch": 0.0318952802359882, + "grad_norm": 2.1190578937530518, + "learning_rate": 4.987464624210554e-05, + "loss": 5.5176, + "step": 5363 + }, + { + "epoch": 0.031901227519269194, + "grad_norm": 2.428710699081421, + "learning_rate": 4.987459952053307e-05, + "loss": 5.4088, + "step": 5364 + }, + { + "epoch": 0.031907174802550196, + "grad_norm": 1.8820819854736328, + "learning_rate": 4.987455279027713e-05, + "loss": 5.3753, + "step": 5365 + }, + { + "epoch": 0.03191312208583119, + "grad_norm": 1.6506859064102173, + "learning_rate": 4.987450605133775e-05, + "loss": 5.6018, + "step": 5366 + }, + { + "epoch": 0.03191906936911219, + "grad_norm": 2.060772657394409, + "learning_rate": 4.9874459303714925e-05, + "loss": 5.3587, + "step": 5367 + }, + { + "epoch": 0.03192501665239319, + "grad_norm": 2.3591532707214355, + "learning_rate": 4.9874412547408694e-05, + "loss": 5.7685, + "step": 5368 + }, + { + "epoch": 0.031930963935674184, + "grad_norm": 2.140322685241699, + "learning_rate": 4.987436578241906e-05, + "loss": 5.9015, + "step": 5369 + }, + { + "epoch": 0.03193691121895518, + "grad_norm": 2.2479233741760254, + "learning_rate": 4.987431900874604e-05, + "loss": 5.6079, + "step": 5370 + }, + { + "epoch": 0.03194285850223618, + "grad_norm": 2.0334317684173584, + "learning_rate": 4.987427222638965e-05, + "loss": 5.6364, + "step": 5371 + }, + { + "epoch": 0.031948805785517176, + "grad_norm": 2.0599231719970703, + "learning_rate": 4.987422543534991e-05, + "loss": 5.6578, + "step": 5372 + }, + { + "epoch": 0.03195475306879817, + "grad_norm": 2.237504720687866, + "learning_rate": 4.9874178635626836e-05, + "loss": 5.5784, + "step": 5373 + }, + { + "epoch": 0.03196070035207917, + "grad_norm": 2.013193130493164, + "learning_rate": 4.987413182722044e-05, + "loss": 5.4874, + "step": 5374 + }, + { + "epoch": 0.03196664763536017, + "grad_norm": 1.9806950092315674, + "learning_rate": 4.987408501013075e-05, + "loss": 5.41, + "step": 5375 + }, + { + "epoch": 0.03197259491864116, + "grad_norm": 1.7534204721450806, + "learning_rate": 4.9874038184357766e-05, + "loss": 5.4596, + "step": 5376 + }, + { + "epoch": 0.031978542201922165, + "grad_norm": 1.5722386837005615, + "learning_rate": 4.987399134990152e-05, + "loss": 5.508, + "step": 5377 + }, + { + "epoch": 0.03198448948520316, + "grad_norm": 7.868972301483154, + "learning_rate": 4.987394450676201e-05, + "loss": 5.1734, + "step": 5378 + }, + { + "epoch": 0.031990436768484155, + "grad_norm": 2.2103798389434814, + "learning_rate": 4.9873897654939274e-05, + "loss": 5.6766, + "step": 5379 + }, + { + "epoch": 0.03199638405176515, + "grad_norm": 1.9590017795562744, + "learning_rate": 4.9873850794433306e-05, + "loss": 5.7764, + "step": 5380 + }, + { + "epoch": 0.03200233133504615, + "grad_norm": 1.96006441116333, + "learning_rate": 4.9873803925244146e-05, + "loss": 5.7933, + "step": 5381 + }, + { + "epoch": 0.03200827861832715, + "grad_norm": 1.7377163171768188, + "learning_rate": 4.987375704737178e-05, + "loss": 5.692, + "step": 5382 + }, + { + "epoch": 0.03201422590160814, + "grad_norm": 2.0734782218933105, + "learning_rate": 4.9873710160816256e-05, + "loss": 5.5466, + "step": 5383 + }, + { + "epoch": 0.032020173184889145, + "grad_norm": 2.4700942039489746, + "learning_rate": 4.9873663265577574e-05, + "loss": 5.5837, + "step": 5384 + }, + { + "epoch": 0.03202612046817014, + "grad_norm": 2.067009925842285, + "learning_rate": 4.987361636165576e-05, + "loss": 5.4777, + "step": 5385 + }, + { + "epoch": 0.032032067751451135, + "grad_norm": 1.9585732221603394, + "learning_rate": 4.9873569449050815e-05, + "loss": 5.62, + "step": 5386 + }, + { + "epoch": 0.03203801503473214, + "grad_norm": 2.0210976600646973, + "learning_rate": 4.9873522527762766e-05, + "loss": 5.3554, + "step": 5387 + }, + { + "epoch": 0.03204396231801313, + "grad_norm": 2.0345299243927, + "learning_rate": 4.987347559779163e-05, + "loss": 5.3912, + "step": 5388 + }, + { + "epoch": 0.03204990960129413, + "grad_norm": 2.0960853099823, + "learning_rate": 4.987342865913742e-05, + "loss": 5.3497, + "step": 5389 + }, + { + "epoch": 0.03205585688457513, + "grad_norm": 2.0156044960021973, + "learning_rate": 4.987338171180015e-05, + "loss": 5.2769, + "step": 5390 + }, + { + "epoch": 0.032061804167856124, + "grad_norm": 2.0021722316741943, + "learning_rate": 4.987333475577984e-05, + "loss": 5.2338, + "step": 5391 + }, + { + "epoch": 0.03206775145113712, + "grad_norm": 1.8502025604248047, + "learning_rate": 4.987328779107651e-05, + "loss": 5.4231, + "step": 5392 + }, + { + "epoch": 0.03207369873441812, + "grad_norm": 2.0788064002990723, + "learning_rate": 4.987324081769016e-05, + "loss": 5.3989, + "step": 5393 + }, + { + "epoch": 0.032079646017699116, + "grad_norm": 5.172029495239258, + "learning_rate": 4.987319383562083e-05, + "loss": 6.5943, + "step": 5394 + }, + { + "epoch": 0.03208559330098011, + "grad_norm": 1.8732082843780518, + "learning_rate": 4.987314684486852e-05, + "loss": 5.3085, + "step": 5395 + }, + { + "epoch": 0.032091540584261107, + "grad_norm": 2.0511786937713623, + "learning_rate": 4.987309984543326e-05, + "loss": 5.1598, + "step": 5396 + }, + { + "epoch": 0.03209748786754211, + "grad_norm": 2.1821703910827637, + "learning_rate": 4.987305283731505e-05, + "loss": 5.3575, + "step": 5397 + }, + { + "epoch": 0.032103435150823104, + "grad_norm": 2.1190478801727295, + "learning_rate": 4.9873005820513906e-05, + "loss": 5.2371, + "step": 5398 + }, + { + "epoch": 0.0321093824341041, + "grad_norm": 2.1476964950561523, + "learning_rate": 4.987295879502987e-05, + "loss": 5.1378, + "step": 5399 + }, + { + "epoch": 0.0321153297173851, + "grad_norm": 2.3466129302978516, + "learning_rate": 4.987291176086293e-05, + "loss": 5.0642, + "step": 5400 + }, + { + "epoch": 0.032121277000666096, + "grad_norm": 2.267949104309082, + "learning_rate": 4.9872864718013115e-05, + "loss": 5.6835, + "step": 5401 + }, + { + "epoch": 0.03212722428394709, + "grad_norm": 3.1235604286193848, + "learning_rate": 4.987281766648044e-05, + "loss": 6.2094, + "step": 5402 + }, + { + "epoch": 0.03213317156722809, + "grad_norm": 2.494929790496826, + "learning_rate": 4.987277060626493e-05, + "loss": 6.2387, + "step": 5403 + }, + { + "epoch": 0.03213911885050909, + "grad_norm": 2.554422616958618, + "learning_rate": 4.987272353736658e-05, + "loss": 5.9655, + "step": 5404 + }, + { + "epoch": 0.03214506613379008, + "grad_norm": 3.688295841217041, + "learning_rate": 4.987267645978543e-05, + "loss": 6.3994, + "step": 5405 + }, + { + "epoch": 0.032151013417071085, + "grad_norm": 2.773847818374634, + "learning_rate": 4.987262937352147e-05, + "loss": 5.515, + "step": 5406 + }, + { + "epoch": 0.03215696070035208, + "grad_norm": 3.067812204360962, + "learning_rate": 4.987258227857475e-05, + "loss": 5.7388, + "step": 5407 + }, + { + "epoch": 0.032162907983633075, + "grad_norm": 3.0557258129119873, + "learning_rate": 4.987253517494525e-05, + "loss": 6.0334, + "step": 5408 + }, + { + "epoch": 0.03216885526691407, + "grad_norm": 2.2864489555358887, + "learning_rate": 4.9872488062633026e-05, + "loss": 6.2805, + "step": 5409 + }, + { + "epoch": 0.03217480255019507, + "grad_norm": 3.2848916053771973, + "learning_rate": 4.987244094163807e-05, + "loss": 6.4782, + "step": 5410 + }, + { + "epoch": 0.03218074983347607, + "grad_norm": 3.7147631645202637, + "learning_rate": 4.987239381196039e-05, + "loss": 6.6618, + "step": 5411 + }, + { + "epoch": 0.03218669711675706, + "grad_norm": 2.740705966949463, + "learning_rate": 4.9872346673600017e-05, + "loss": 6.0261, + "step": 5412 + }, + { + "epoch": 0.032192644400038065, + "grad_norm": 2.6408498287200928, + "learning_rate": 4.9872299526556965e-05, + "loss": 5.8645, + "step": 5413 + }, + { + "epoch": 0.03219859168331906, + "grad_norm": 2.8298256397247314, + "learning_rate": 4.987225237083125e-05, + "loss": 5.9263, + "step": 5414 + }, + { + "epoch": 0.032204538966600055, + "grad_norm": 2.9417197704315186, + "learning_rate": 4.987220520642289e-05, + "loss": 5.8018, + "step": 5415 + }, + { + "epoch": 0.03221048624988106, + "grad_norm": 3.2862906455993652, + "learning_rate": 4.9872158033331904e-05, + "loss": 5.8429, + "step": 5416 + }, + { + "epoch": 0.03221643353316205, + "grad_norm": 2.7724359035491943, + "learning_rate": 4.9872110851558306e-05, + "loss": 5.9504, + "step": 5417 + }, + { + "epoch": 0.03222238081644305, + "grad_norm": 2.2753829956054688, + "learning_rate": 4.9872063661102106e-05, + "loss": 5.6443, + "step": 5418 + }, + { + "epoch": 0.03222832809972405, + "grad_norm": 2.597649097442627, + "learning_rate": 4.987201646196332e-05, + "loss": 6.4441, + "step": 5419 + }, + { + "epoch": 0.032234275383005044, + "grad_norm": 2.7298800945281982, + "learning_rate": 4.987196925414198e-05, + "loss": 6.2988, + "step": 5420 + }, + { + "epoch": 0.03224022266628604, + "grad_norm": 3.2329537868499756, + "learning_rate": 4.987192203763809e-05, + "loss": 5.8743, + "step": 5421 + }, + { + "epoch": 0.03224616994956704, + "grad_norm": 3.033226251602173, + "learning_rate": 4.987187481245167e-05, + "loss": 5.4863, + "step": 5422 + }, + { + "epoch": 0.032252117232848036, + "grad_norm": 2.7728521823883057, + "learning_rate": 4.987182757858273e-05, + "loss": 5.5722, + "step": 5423 + }, + { + "epoch": 0.03225806451612903, + "grad_norm": 2.6083309650421143, + "learning_rate": 4.98717803360313e-05, + "loss": 6.5257, + "step": 5424 + }, + { + "epoch": 0.032264011799410026, + "grad_norm": 2.5422329902648926, + "learning_rate": 4.987173308479738e-05, + "loss": 6.5582, + "step": 5425 + }, + { + "epoch": 0.03226995908269103, + "grad_norm": 2.7634811401367188, + "learning_rate": 4.9871685824881e-05, + "loss": 6.0987, + "step": 5426 + }, + { + "epoch": 0.032275906365972024, + "grad_norm": 3.631476640701294, + "learning_rate": 4.987163855628217e-05, + "loss": 5.8506, + "step": 5427 + }, + { + "epoch": 0.03228185364925302, + "grad_norm": 2.9783661365509033, + "learning_rate": 4.9871591279000904e-05, + "loss": 5.9387, + "step": 5428 + }, + { + "epoch": 0.03228780093253402, + "grad_norm": 2.369645357131958, + "learning_rate": 4.9871543993037225e-05, + "loss": 5.8097, + "step": 5429 + }, + { + "epoch": 0.032293748215815016, + "grad_norm": 2.782055616378784, + "learning_rate": 4.9871496698391155e-05, + "loss": 5.5301, + "step": 5430 + }, + { + "epoch": 0.03229969549909601, + "grad_norm": 2.408205270767212, + "learning_rate": 4.98714493950627e-05, + "loss": 5.6514, + "step": 5431 + }, + { + "epoch": 0.03230564278237701, + "grad_norm": 2.0641589164733887, + "learning_rate": 4.987140208305187e-05, + "loss": 5.6168, + "step": 5432 + }, + { + "epoch": 0.03231159006565801, + "grad_norm": 2.109773874282837, + "learning_rate": 4.987135476235869e-05, + "loss": 5.6678, + "step": 5433 + }, + { + "epoch": 0.032317537348939, + "grad_norm": 2.9809730052948, + "learning_rate": 4.987130743298318e-05, + "loss": 6.0531, + "step": 5434 + }, + { + "epoch": 0.032323484632220005, + "grad_norm": 2.5728509426116943, + "learning_rate": 4.9871260094925365e-05, + "loss": 6.05, + "step": 5435 + }, + { + "epoch": 0.032329431915501, + "grad_norm": 2.477074146270752, + "learning_rate": 4.9871212748185236e-05, + "loss": 6.351, + "step": 5436 + }, + { + "epoch": 0.032335379198781995, + "grad_norm": 2.3485517501831055, + "learning_rate": 4.987116539276283e-05, + "loss": 6.3033, + "step": 5437 + }, + { + "epoch": 0.03234132648206299, + "grad_norm": 2.4214296340942383, + "learning_rate": 4.987111802865816e-05, + "loss": 6.1152, + "step": 5438 + }, + { + "epoch": 0.03234727376534399, + "grad_norm": 3.5628256797790527, + "learning_rate": 4.9871070655871234e-05, + "loss": 5.6502, + "step": 5439 + }, + { + "epoch": 0.03235322104862499, + "grad_norm": 3.190075159072876, + "learning_rate": 4.987102327440208e-05, + "loss": 5.4164, + "step": 5440 + }, + { + "epoch": 0.03235916833190598, + "grad_norm": 2.402754306793213, + "learning_rate": 4.9870975884250696e-05, + "loss": 5.7116, + "step": 5441 + }, + { + "epoch": 0.032365115615186985, + "grad_norm": 2.846653938293457, + "learning_rate": 4.987092848541712e-05, + "loss": 6.1456, + "step": 5442 + }, + { + "epoch": 0.03237106289846798, + "grad_norm": 2.6700549125671387, + "learning_rate": 4.987088107790136e-05, + "loss": 5.9777, + "step": 5443 + }, + { + "epoch": 0.032377010181748975, + "grad_norm": 2.8929460048675537, + "learning_rate": 4.987083366170343e-05, + "loss": 6.1459, + "step": 5444 + }, + { + "epoch": 0.03238295746502998, + "grad_norm": 2.524376153945923, + "learning_rate": 4.987078623682335e-05, + "loss": 6.4341, + "step": 5445 + }, + { + "epoch": 0.03238890474831097, + "grad_norm": 2.0901076793670654, + "learning_rate": 4.987073880326114e-05, + "loss": 6.3968, + "step": 5446 + }, + { + "epoch": 0.03239485203159197, + "grad_norm": 3.0033867359161377, + "learning_rate": 4.9870691361016805e-05, + "loss": 5.8656, + "step": 5447 + }, + { + "epoch": 0.03240079931487297, + "grad_norm": 2.7715492248535156, + "learning_rate": 4.987064391009038e-05, + "loss": 6.1634, + "step": 5448 + }, + { + "epoch": 0.032406746598153964, + "grad_norm": 2.6102347373962402, + "learning_rate": 4.9870596450481855e-05, + "loss": 6.2521, + "step": 5449 + }, + { + "epoch": 0.03241269388143496, + "grad_norm": 2.326253890991211, + "learning_rate": 4.9870548982191265e-05, + "loss": 6.2517, + "step": 5450 + }, + { + "epoch": 0.03241864116471596, + "grad_norm": 2.3012197017669678, + "learning_rate": 4.987050150521863e-05, + "loss": 6.2261, + "step": 5451 + }, + { + "epoch": 0.032424588447996956, + "grad_norm": 2.100337505340576, + "learning_rate": 4.987045401956396e-05, + "loss": 5.6291, + "step": 5452 + }, + { + "epoch": 0.03243053573127795, + "grad_norm": 3.094754219055176, + "learning_rate": 4.987040652522727e-05, + "loss": 5.897, + "step": 5453 + }, + { + "epoch": 0.032436483014558946, + "grad_norm": 2.7406179904937744, + "learning_rate": 4.987035902220857e-05, + "loss": 6.0083, + "step": 5454 + }, + { + "epoch": 0.03244243029783995, + "grad_norm": 2.4106287956237793, + "learning_rate": 4.9870311510507895e-05, + "loss": 5.8538, + "step": 5455 + }, + { + "epoch": 0.032448377581120944, + "grad_norm": 2.7335946559906006, + "learning_rate": 4.987026399012525e-05, + "loss": 5.9181, + "step": 5456 + }, + { + "epoch": 0.03245432486440194, + "grad_norm": 2.796175003051758, + "learning_rate": 4.987021646106064e-05, + "loss": 5.6461, + "step": 5457 + }, + { + "epoch": 0.03246027214768294, + "grad_norm": 3.086470127105713, + "learning_rate": 4.987016892331411e-05, + "loss": 5.6692, + "step": 5458 + }, + { + "epoch": 0.032466219430963936, + "grad_norm": 2.394465923309326, + "learning_rate": 4.9870121376885656e-05, + "loss": 6.3046, + "step": 5459 + }, + { + "epoch": 0.03247216671424493, + "grad_norm": 2.0745291709899902, + "learning_rate": 4.98700738217753e-05, + "loss": 6.0491, + "step": 5460 + }, + { + "epoch": 0.03247811399752593, + "grad_norm": 2.66359281539917, + "learning_rate": 4.987002625798305e-05, + "loss": 5.6468, + "step": 5461 + }, + { + "epoch": 0.03248406128080693, + "grad_norm": 2.392833948135376, + "learning_rate": 4.9869978685508936e-05, + "loss": 5.8421, + "step": 5462 + }, + { + "epoch": 0.03249000856408792, + "grad_norm": 2.671710252761841, + "learning_rate": 4.9869931104352975e-05, + "loss": 5.6892, + "step": 5463 + }, + { + "epoch": 0.032495955847368925, + "grad_norm": 2.7013144493103027, + "learning_rate": 4.986988351451517e-05, + "loss": 5.7911, + "step": 5464 + }, + { + "epoch": 0.03250190313064992, + "grad_norm": 1.926703929901123, + "learning_rate": 4.9869835915995555e-05, + "loss": 5.5492, + "step": 5465 + }, + { + "epoch": 0.032507850413930915, + "grad_norm": 2.5668530464172363, + "learning_rate": 4.986978830879413e-05, + "loss": 5.8949, + "step": 5466 + }, + { + "epoch": 0.03251379769721191, + "grad_norm": 2.555305004119873, + "learning_rate": 4.986974069291092e-05, + "loss": 5.7408, + "step": 5467 + }, + { + "epoch": 0.03251974498049291, + "grad_norm": 2.551226854324341, + "learning_rate": 4.986969306834594e-05, + "loss": 5.7738, + "step": 5468 + }, + { + "epoch": 0.03252569226377391, + "grad_norm": 2.3194847106933594, + "learning_rate": 4.986964543509921e-05, + "loss": 6.2837, + "step": 5469 + }, + { + "epoch": 0.0325316395470549, + "grad_norm": 1.9618690013885498, + "learning_rate": 4.986959779317074e-05, + "loss": 5.9236, + "step": 5470 + }, + { + "epoch": 0.032537586830335904, + "grad_norm": 2.351971387863159, + "learning_rate": 4.986955014256055e-05, + "loss": 5.591, + "step": 5471 + }, + { + "epoch": 0.0325435341136169, + "grad_norm": 2.3772034645080566, + "learning_rate": 4.986950248326866e-05, + "loss": 5.6785, + "step": 5472 + }, + { + "epoch": 0.032549481396897895, + "grad_norm": 2.5764195919036865, + "learning_rate": 4.9869454815295085e-05, + "loss": 5.525, + "step": 5473 + }, + { + "epoch": 0.0325554286801789, + "grad_norm": 2.231048107147217, + "learning_rate": 4.986940713863984e-05, + "loss": 5.6789, + "step": 5474 + }, + { + "epoch": 0.03256137596345989, + "grad_norm": 2.8053946495056152, + "learning_rate": 4.986935945330294e-05, + "loss": 5.6319, + "step": 5475 + }, + { + "epoch": 0.03256732324674089, + "grad_norm": 3.4610519409179688, + "learning_rate": 4.98693117592844e-05, + "loss": 5.9855, + "step": 5476 + }, + { + "epoch": 0.03257327053002189, + "grad_norm": 2.5019664764404297, + "learning_rate": 4.986926405658425e-05, + "loss": 5.9997, + "step": 5477 + }, + { + "epoch": 0.032579217813302884, + "grad_norm": 2.6583313941955566, + "learning_rate": 4.986921634520249e-05, + "loss": 6.3755, + "step": 5478 + }, + { + "epoch": 0.03258516509658388, + "grad_norm": 2.990699291229248, + "learning_rate": 4.986916862513914e-05, + "loss": 5.8932, + "step": 5479 + }, + { + "epoch": 0.03259111237986488, + "grad_norm": 3.282546043395996, + "learning_rate": 4.986912089639423e-05, + "loss": 5.5508, + "step": 5480 + }, + { + "epoch": 0.032597059663145876, + "grad_norm": 3.1012487411499023, + "learning_rate": 4.9869073158967755e-05, + "loss": 5.5567, + "step": 5481 + }, + { + "epoch": 0.03260300694642687, + "grad_norm": 2.141892433166504, + "learning_rate": 4.986902541285975e-05, + "loss": 5.6195, + "step": 5482 + }, + { + "epoch": 0.032608954229707866, + "grad_norm": 2.173670530319214, + "learning_rate": 4.986897765807023e-05, + "loss": 5.6913, + "step": 5483 + }, + { + "epoch": 0.03261490151298887, + "grad_norm": 2.4076435565948486, + "learning_rate": 4.98689298945992e-05, + "loss": 5.8324, + "step": 5484 + }, + { + "epoch": 0.03262084879626986, + "grad_norm": 2.8968818187713623, + "learning_rate": 4.986888212244668e-05, + "loss": 6.0086, + "step": 5485 + }, + { + "epoch": 0.03262679607955086, + "grad_norm": 2.2434191703796387, + "learning_rate": 4.9868834341612696e-05, + "loss": 5.9645, + "step": 5486 + }, + { + "epoch": 0.03263274336283186, + "grad_norm": 1.9683157205581665, + "learning_rate": 4.9868786552097255e-05, + "loss": 5.9173, + "step": 5487 + }, + { + "epoch": 0.032638690646112856, + "grad_norm": 2.369816303253174, + "learning_rate": 4.9868738753900384e-05, + "loss": 6.2728, + "step": 5488 + }, + { + "epoch": 0.03264463792939385, + "grad_norm": 2.1152775287628174, + "learning_rate": 4.986869094702209e-05, + "loss": 6.0474, + "step": 5489 + }, + { + "epoch": 0.03265058521267485, + "grad_norm": 2.3219857215881348, + "learning_rate": 4.9868643131462397e-05, + "loss": 5.7451, + "step": 5490 + }, + { + "epoch": 0.03265653249595585, + "grad_norm": 2.236046075820923, + "learning_rate": 4.986859530722131e-05, + "loss": 5.7775, + "step": 5491 + }, + { + "epoch": 0.03266247977923684, + "grad_norm": 2.3334364891052246, + "learning_rate": 4.986854747429886e-05, + "loss": 5.7429, + "step": 5492 + }, + { + "epoch": 0.032668427062517845, + "grad_norm": 2.5464704036712646, + "learning_rate": 4.986849963269505e-05, + "loss": 5.5781, + "step": 5493 + }, + { + "epoch": 0.03267437434579884, + "grad_norm": 2.104419469833374, + "learning_rate": 4.986845178240991e-05, + "loss": 5.6378, + "step": 5494 + }, + { + "epoch": 0.032680321629079835, + "grad_norm": 2.3115224838256836, + "learning_rate": 4.9868403923443444e-05, + "loss": 5.7617, + "step": 5495 + }, + { + "epoch": 0.03268626891236083, + "grad_norm": 2.3370540142059326, + "learning_rate": 4.9868356055795685e-05, + "loss": 6.1278, + "step": 5496 + }, + { + "epoch": 0.03269221619564183, + "grad_norm": 2.8618736267089844, + "learning_rate": 4.986830817946663e-05, + "loss": 6.0879, + "step": 5497 + }, + { + "epoch": 0.03269816347892283, + "grad_norm": 2.3229949474334717, + "learning_rate": 4.986826029445631e-05, + "loss": 6.0915, + "step": 5498 + }, + { + "epoch": 0.03270411076220382, + "grad_norm": 2.549914598464966, + "learning_rate": 4.986821240076473e-05, + "loss": 6.2375, + "step": 5499 + }, + { + "epoch": 0.032710058045484824, + "grad_norm": 2.595916271209717, + "learning_rate": 4.986816449839192e-05, + "loss": 6.095, + "step": 5500 + }, + { + "epoch": 0.03271600532876582, + "grad_norm": 2.4409420490264893, + "learning_rate": 4.98681165873379e-05, + "loss": 5.353, + "step": 5501 + }, + { + "epoch": 0.032721952612046815, + "grad_norm": 2.550156593322754, + "learning_rate": 4.986806866760266e-05, + "loss": 5.558, + "step": 5502 + }, + { + "epoch": 0.03272789989532782, + "grad_norm": 2.7811737060546875, + "learning_rate": 4.986802073918625e-05, + "loss": 5.7174, + "step": 5503 + }, + { + "epoch": 0.03273384717860881, + "grad_norm": 2.8430123329162598, + "learning_rate": 4.986797280208866e-05, + "loss": 5.5644, + "step": 5504 + }, + { + "epoch": 0.03273979446188981, + "grad_norm": 3.021040201187134, + "learning_rate": 4.986792485630992e-05, + "loss": 5.9451, + "step": 5505 + }, + { + "epoch": 0.03274574174517081, + "grad_norm": 2.69866681098938, + "learning_rate": 4.986787690185005e-05, + "loss": 5.9934, + "step": 5506 + }, + { + "epoch": 0.032751689028451804, + "grad_norm": 2.7202444076538086, + "learning_rate": 4.986782893870906e-05, + "loss": 6.1298, + "step": 5507 + }, + { + "epoch": 0.0327576363117328, + "grad_norm": 2.223405122756958, + "learning_rate": 4.986778096688696e-05, + "loss": 5.8968, + "step": 5508 + }, + { + "epoch": 0.0327635835950138, + "grad_norm": 2.5733680725097656, + "learning_rate": 4.986773298638378e-05, + "loss": 6.0928, + "step": 5509 + }, + { + "epoch": 0.032769530878294796, + "grad_norm": 2.584397554397583, + "learning_rate": 4.986768499719953e-05, + "loss": 5.7879, + "step": 5510 + }, + { + "epoch": 0.03277547816157579, + "grad_norm": 3.160489797592163, + "learning_rate": 4.986763699933423e-05, + "loss": 5.6413, + "step": 5511 + }, + { + "epoch": 0.032781425444856786, + "grad_norm": 2.8224406242370605, + "learning_rate": 4.9867588992787894e-05, + "loss": 6.1476, + "step": 5512 + }, + { + "epoch": 0.03278737272813779, + "grad_norm": 2.2565996646881104, + "learning_rate": 4.986754097756054e-05, + "loss": 6.208, + "step": 5513 + }, + { + "epoch": 0.03279332001141878, + "grad_norm": 2.5425479412078857, + "learning_rate": 4.9867492953652184e-05, + "loss": 5.934, + "step": 5514 + }, + { + "epoch": 0.03279926729469978, + "grad_norm": 2.6598689556121826, + "learning_rate": 4.986744492106284e-05, + "loss": 5.7433, + "step": 5515 + }, + { + "epoch": 0.03280521457798078, + "grad_norm": 2.419388771057129, + "learning_rate": 4.986739687979253e-05, + "loss": 5.378, + "step": 5516 + }, + { + "epoch": 0.032811161861261776, + "grad_norm": 2.72784161567688, + "learning_rate": 4.986734882984127e-05, + "loss": 5.4089, + "step": 5517 + }, + { + "epoch": 0.03281710914454277, + "grad_norm": 3.0592923164367676, + "learning_rate": 4.9867300771209075e-05, + "loss": 5.9573, + "step": 5518 + }, + { + "epoch": 0.03282305642782377, + "grad_norm": 2.7681832313537598, + "learning_rate": 4.9867252703895965e-05, + "loss": 5.5325, + "step": 5519 + }, + { + "epoch": 0.03282900371110477, + "grad_norm": 2.6752777099609375, + "learning_rate": 4.9867204627901946e-05, + "loss": 5.7543, + "step": 5520 + }, + { + "epoch": 0.03283495099438576, + "grad_norm": 2.481203317642212, + "learning_rate": 4.9867156543227046e-05, + "loss": 5.575, + "step": 5521 + }, + { + "epoch": 0.032840898277666765, + "grad_norm": 2.6403908729553223, + "learning_rate": 4.986710844987128e-05, + "loss": 5.4381, + "step": 5522 + }, + { + "epoch": 0.03284684556094776, + "grad_norm": 2.6146085262298584, + "learning_rate": 4.986706034783466e-05, + "loss": 5.8672, + "step": 5523 + }, + { + "epoch": 0.032852792844228755, + "grad_norm": 3.453666925430298, + "learning_rate": 4.986701223711722e-05, + "loss": 5.8353, + "step": 5524 + }, + { + "epoch": 0.03285874012750975, + "grad_norm": 2.511216640472412, + "learning_rate": 4.986696411771895e-05, + "loss": 5.9567, + "step": 5525 + }, + { + "epoch": 0.03286468741079075, + "grad_norm": 2.57395601272583, + "learning_rate": 4.986691598963988e-05, + "loss": 5.6396, + "step": 5526 + }, + { + "epoch": 0.03287063469407175, + "grad_norm": 2.778801441192627, + "learning_rate": 4.986686785288003e-05, + "loss": 6.0237, + "step": 5527 + }, + { + "epoch": 0.03287658197735274, + "grad_norm": 2.5216047763824463, + "learning_rate": 4.986681970743941e-05, + "loss": 6.1305, + "step": 5528 + }, + { + "epoch": 0.032882529260633744, + "grad_norm": 2.5105085372924805, + "learning_rate": 4.986677155331804e-05, + "loss": 6.4951, + "step": 5529 + }, + { + "epoch": 0.03288847654391474, + "grad_norm": 2.4105372428894043, + "learning_rate": 4.9866723390515946e-05, + "loss": 6.291, + "step": 5530 + }, + { + "epoch": 0.032894423827195735, + "grad_norm": 2.740095853805542, + "learning_rate": 4.9866675219033125e-05, + "loss": 5.762, + "step": 5531 + }, + { + "epoch": 0.03290037111047674, + "grad_norm": 2.327892541885376, + "learning_rate": 4.9866627038869605e-05, + "loss": 6.1023, + "step": 5532 + }, + { + "epoch": 0.03290631839375773, + "grad_norm": 2.71732497215271, + "learning_rate": 4.9866578850025414e-05, + "loss": 6.0739, + "step": 5533 + }, + { + "epoch": 0.03291226567703873, + "grad_norm": 2.1895039081573486, + "learning_rate": 4.9866530652500545e-05, + "loss": 5.801, + "step": 5534 + }, + { + "epoch": 0.03291821296031973, + "grad_norm": 2.39670729637146, + "learning_rate": 4.986648244629503e-05, + "loss": 6.0105, + "step": 5535 + }, + { + "epoch": 0.032924160243600724, + "grad_norm": 2.14630126953125, + "learning_rate": 4.986643423140889e-05, + "loss": 5.8457, + "step": 5536 + }, + { + "epoch": 0.03293010752688172, + "grad_norm": 2.111196994781494, + "learning_rate": 4.9866386007842125e-05, + "loss": 6.0804, + "step": 5537 + }, + { + "epoch": 0.03293605481016272, + "grad_norm": 2.8245434761047363, + "learning_rate": 4.986633777559476e-05, + "loss": 6.3152, + "step": 5538 + }, + { + "epoch": 0.032942002093443716, + "grad_norm": 2.3561060428619385, + "learning_rate": 4.9866289534666824e-05, + "loss": 6.286, + "step": 5539 + }, + { + "epoch": 0.03294794937672471, + "grad_norm": 3.21701979637146, + "learning_rate": 4.986624128505832e-05, + "loss": 5.9775, + "step": 5540 + }, + { + "epoch": 0.032953896660005706, + "grad_norm": 3.9414072036743164, + "learning_rate": 4.9866193026769265e-05, + "loss": 5.9413, + "step": 5541 + }, + { + "epoch": 0.03295984394328671, + "grad_norm": 2.7801051139831543, + "learning_rate": 4.986614475979968e-05, + "loss": 5.8642, + "step": 5542 + }, + { + "epoch": 0.0329657912265677, + "grad_norm": 2.7095935344696045, + "learning_rate": 4.986609648414958e-05, + "loss": 5.6952, + "step": 5543 + }, + { + "epoch": 0.0329717385098487, + "grad_norm": 2.5800812244415283, + "learning_rate": 4.986604819981898e-05, + "loss": 6.0285, + "step": 5544 + }, + { + "epoch": 0.0329776857931297, + "grad_norm": 2.6105730533599854, + "learning_rate": 4.9865999906807904e-05, + "loss": 5.6683, + "step": 5545 + }, + { + "epoch": 0.032983633076410696, + "grad_norm": 2.635570764541626, + "learning_rate": 4.9865951605116366e-05, + "loss": 5.9092, + "step": 5546 + }, + { + "epoch": 0.03298958035969169, + "grad_norm": 2.3708200454711914, + "learning_rate": 4.9865903294744373e-05, + "loss": 6.0034, + "step": 5547 + }, + { + "epoch": 0.03299552764297269, + "grad_norm": 2.437201499938965, + "learning_rate": 4.986585497569196e-05, + "loss": 6.2587, + "step": 5548 + }, + { + "epoch": 0.03300147492625369, + "grad_norm": 2.076016426086426, + "learning_rate": 4.9865806647959126e-05, + "loss": 6.358, + "step": 5549 + }, + { + "epoch": 0.03300742220953468, + "grad_norm": 1.8261257410049438, + "learning_rate": 4.98657583115459e-05, + "loss": 6.0431, + "step": 5550 + }, + { + "epoch": 0.033013369492815685, + "grad_norm": 2.8339858055114746, + "learning_rate": 4.98657099664523e-05, + "loss": 5.7956, + "step": 5551 + }, + { + "epoch": 0.03301931677609668, + "grad_norm": 2.7288596630096436, + "learning_rate": 4.986566161267833e-05, + "loss": 5.7092, + "step": 5552 + }, + { + "epoch": 0.033025264059377675, + "grad_norm": 2.7197329998016357, + "learning_rate": 4.986561325022402e-05, + "loss": 5.649, + "step": 5553 + }, + { + "epoch": 0.03303121134265867, + "grad_norm": 2.6161739826202393, + "learning_rate": 4.986556487908937e-05, + "loss": 5.6935, + "step": 5554 + }, + { + "epoch": 0.03303715862593967, + "grad_norm": 2.695068597793579, + "learning_rate": 4.986551649927441e-05, + "loss": 5.6901, + "step": 5555 + }, + { + "epoch": 0.03304310590922067, + "grad_norm": 3.0315186977386475, + "learning_rate": 4.986546811077917e-05, + "loss": 5.6317, + "step": 5556 + }, + { + "epoch": 0.03304905319250166, + "grad_norm": 2.3597543239593506, + "learning_rate": 4.986541971360364e-05, + "loss": 5.8129, + "step": 5557 + }, + { + "epoch": 0.033055000475782664, + "grad_norm": 2.8090550899505615, + "learning_rate": 4.986537130774785e-05, + "loss": 6.4427, + "step": 5558 + }, + { + "epoch": 0.03306094775906366, + "grad_norm": 3.4232771396636963, + "learning_rate": 4.986532289321182e-05, + "loss": 6.5737, + "step": 5559 + }, + { + "epoch": 0.033066895042344654, + "grad_norm": 2.1425294876098633, + "learning_rate": 4.986527446999556e-05, + "loss": 6.2395, + "step": 5560 + }, + { + "epoch": 0.033072842325625657, + "grad_norm": 2.5348880290985107, + "learning_rate": 4.986522603809909e-05, + "loss": 6.0425, + "step": 5561 + }, + { + "epoch": 0.03307878960890665, + "grad_norm": 3.0824179649353027, + "learning_rate": 4.986517759752242e-05, + "loss": 5.8785, + "step": 5562 + }, + { + "epoch": 0.03308473689218765, + "grad_norm": 2.297706365585327, + "learning_rate": 4.986512914826558e-05, + "loss": 5.8989, + "step": 5563 + }, + { + "epoch": 0.03309068417546865, + "grad_norm": 2.866257667541504, + "learning_rate": 4.986508069032858e-05, + "loss": 5.8905, + "step": 5564 + }, + { + "epoch": 0.033096631458749644, + "grad_norm": 2.2450008392333984, + "learning_rate": 4.9865032223711436e-05, + "loss": 6.3302, + "step": 5565 + }, + { + "epoch": 0.03310257874203064, + "grad_norm": 2.235558271408081, + "learning_rate": 4.9864983748414166e-05, + "loss": 6.4235, + "step": 5566 + }, + { + "epoch": 0.03310852602531164, + "grad_norm": 2.5197713375091553, + "learning_rate": 4.986493526443679e-05, + "loss": 6.3999, + "step": 5567 + }, + { + "epoch": 0.033114473308592636, + "grad_norm": 2.5716195106506348, + "learning_rate": 4.986488677177932e-05, + "loss": 6.0258, + "step": 5568 + }, + { + "epoch": 0.03312042059187363, + "grad_norm": 2.468663454055786, + "learning_rate": 4.986483827044177e-05, + "loss": 6.7553, + "step": 5569 + }, + { + "epoch": 0.033126367875154626, + "grad_norm": 2.4334170818328857, + "learning_rate": 4.986478976042417e-05, + "loss": 6.4722, + "step": 5570 + }, + { + "epoch": 0.03313231515843563, + "grad_norm": 2.234487533569336, + "learning_rate": 4.986474124172652e-05, + "loss": 5.7158, + "step": 5571 + }, + { + "epoch": 0.03313826244171662, + "grad_norm": 2.8017537593841553, + "learning_rate": 4.9864692714348857e-05, + "loss": 5.9552, + "step": 5572 + }, + { + "epoch": 0.03314420972499762, + "grad_norm": 3.171354055404663, + "learning_rate": 4.986464417829118e-05, + "loss": 6.027, + "step": 5573 + }, + { + "epoch": 0.03315015700827862, + "grad_norm": 2.890169620513916, + "learning_rate": 4.9864595633553516e-05, + "loss": 6.2768, + "step": 5574 + }, + { + "epoch": 0.033156104291559615, + "grad_norm": 3.010934829711914, + "learning_rate": 4.986454708013587e-05, + "loss": 6.4054, + "step": 5575 + }, + { + "epoch": 0.03316205157484061, + "grad_norm": 2.143833875656128, + "learning_rate": 4.9864498518038274e-05, + "loss": 6.3771, + "step": 5576 + }, + { + "epoch": 0.03316799885812161, + "grad_norm": 2.2067418098449707, + "learning_rate": 4.986444994726074e-05, + "loss": 6.0158, + "step": 5577 + }, + { + "epoch": 0.03317394614140261, + "grad_norm": 2.3396403789520264, + "learning_rate": 4.986440136780328e-05, + "loss": 6.4286, + "step": 5578 + }, + { + "epoch": 0.0331798934246836, + "grad_norm": 2.8305866718292236, + "learning_rate": 4.9864352779665915e-05, + "loss": 5.7804, + "step": 5579 + }, + { + "epoch": 0.033185840707964605, + "grad_norm": 2.748194456100464, + "learning_rate": 4.9864304182848664e-05, + "loss": 6.1711, + "step": 5580 + }, + { + "epoch": 0.0331917879912456, + "grad_norm": 2.329761505126953, + "learning_rate": 4.9864255577351534e-05, + "loss": 6.2722, + "step": 5581 + }, + { + "epoch": 0.033197735274526595, + "grad_norm": 2.4633524417877197, + "learning_rate": 4.986420696317457e-05, + "loss": 6.1349, + "step": 5582 + }, + { + "epoch": 0.03320368255780759, + "grad_norm": 1.8909802436828613, + "learning_rate": 4.986415834031775e-05, + "loss": 6.2181, + "step": 5583 + }, + { + "epoch": 0.03320962984108859, + "grad_norm": 2.1794517040252686, + "learning_rate": 4.9864109708781104e-05, + "loss": 6.2808, + "step": 5584 + }, + { + "epoch": 0.03321557712436959, + "grad_norm": 2.1766669750213623, + "learning_rate": 4.986406106856466e-05, + "loss": 6.3004, + "step": 5585 + }, + { + "epoch": 0.03322152440765058, + "grad_norm": 2.27526593208313, + "learning_rate": 4.986401241966844e-05, + "loss": 5.9225, + "step": 5586 + }, + { + "epoch": 0.033227471690931584, + "grad_norm": 3.2843096256256104, + "learning_rate": 4.986396376209244e-05, + "loss": 5.8364, + "step": 5587 + }, + { + "epoch": 0.03323341897421258, + "grad_norm": 2.509831666946411, + "learning_rate": 4.9863915095836685e-05, + "loss": 5.6958, + "step": 5588 + }, + { + "epoch": 0.033239366257493574, + "grad_norm": 2.5235815048217773, + "learning_rate": 4.98638664209012e-05, + "loss": 5.4937, + "step": 5589 + }, + { + "epoch": 0.033245313540774576, + "grad_norm": 2.918334484100342, + "learning_rate": 4.986381773728599e-05, + "loss": 5.8284, + "step": 5590 + }, + { + "epoch": 0.03325126082405557, + "grad_norm": 2.8091490268707275, + "learning_rate": 4.986376904499108e-05, + "loss": 5.8126, + "step": 5591 + }, + { + "epoch": 0.03325720810733657, + "grad_norm": 2.555173635482788, + "learning_rate": 4.986372034401649e-05, + "loss": 5.6393, + "step": 5592 + }, + { + "epoch": 0.03326315539061757, + "grad_norm": 2.6366164684295654, + "learning_rate": 4.986367163436223e-05, + "loss": 6.6675, + "step": 5593 + }, + { + "epoch": 0.033269102673898564, + "grad_norm": 2.5691051483154297, + "learning_rate": 4.9863622916028316e-05, + "loss": 6.5808, + "step": 5594 + }, + { + "epoch": 0.03327504995717956, + "grad_norm": 2.239384889602661, + "learning_rate": 4.986357418901477e-05, + "loss": 6.0191, + "step": 5595 + }, + { + "epoch": 0.03328099724046056, + "grad_norm": 2.3877806663513184, + "learning_rate": 4.9863525453321614e-05, + "loss": 5.7429, + "step": 5596 + }, + { + "epoch": 0.033286944523741556, + "grad_norm": 2.559633731842041, + "learning_rate": 4.9863476708948846e-05, + "loss": 5.4866, + "step": 5597 + }, + { + "epoch": 0.03329289180702255, + "grad_norm": 3.7681171894073486, + "learning_rate": 4.98634279558965e-05, + "loss": 5.6139, + "step": 5598 + }, + { + "epoch": 0.033298839090303546, + "grad_norm": 3.999264717102051, + "learning_rate": 4.9863379194164594e-05, + "loss": 5.6031, + "step": 5599 + }, + { + "epoch": 0.03330478637358455, + "grad_norm": 3.1031601428985596, + "learning_rate": 4.986333042375313e-05, + "loss": 5.5397, + "step": 5600 + }, + { + "epoch": 0.03331073365686554, + "grad_norm": 3.104998826980591, + "learning_rate": 4.986328164466214e-05, + "loss": 5.4274, + "step": 5601 + }, + { + "epoch": 0.03331668094014654, + "grad_norm": 2.9426207542419434, + "learning_rate": 4.986323285689163e-05, + "loss": 5.5859, + "step": 5602 + }, + { + "epoch": 0.03332262822342754, + "grad_norm": 2.6912827491760254, + "learning_rate": 4.986318406044163e-05, + "loss": 5.7375, + "step": 5603 + }, + { + "epoch": 0.033328575506708535, + "grad_norm": 4.394237041473389, + "learning_rate": 4.9863135255312145e-05, + "loss": 5.8246, + "step": 5604 + }, + { + "epoch": 0.03333452278998953, + "grad_norm": 2.812197685241699, + "learning_rate": 4.986308644150319e-05, + "loss": 5.6263, + "step": 5605 + }, + { + "epoch": 0.03334047007327053, + "grad_norm": 3.1969878673553467, + "learning_rate": 4.98630376190148e-05, + "loss": 5.4174, + "step": 5606 + }, + { + "epoch": 0.03334641735655153, + "grad_norm": 2.6018595695495605, + "learning_rate": 4.9862988787846975e-05, + "loss": 5.3917, + "step": 5607 + }, + { + "epoch": 0.03335236463983252, + "grad_norm": 2.5274007320404053, + "learning_rate": 4.986293994799974e-05, + "loss": 5.4252, + "step": 5608 + }, + { + "epoch": 0.033358311923113525, + "grad_norm": 2.57043194770813, + "learning_rate": 4.9862891099473105e-05, + "loss": 5.5321, + "step": 5609 + }, + { + "epoch": 0.03336425920639452, + "grad_norm": 3.4353785514831543, + "learning_rate": 4.986284224226709e-05, + "loss": 5.6599, + "step": 5610 + }, + { + "epoch": 0.033370206489675515, + "grad_norm": 3.308945894241333, + "learning_rate": 4.986279337638172e-05, + "loss": 5.8668, + "step": 5611 + }, + { + "epoch": 0.03337615377295652, + "grad_norm": 2.789703607559204, + "learning_rate": 4.9862744501817006e-05, + "loss": 5.8352, + "step": 5612 + }, + { + "epoch": 0.03338210105623751, + "grad_norm": 1.9887118339538574, + "learning_rate": 4.986269561857296e-05, + "loss": 5.7527, + "step": 5613 + }, + { + "epoch": 0.03338804833951851, + "grad_norm": 2.5447990894317627, + "learning_rate": 4.986264672664961e-05, + "loss": 5.5539, + "step": 5614 + }, + { + "epoch": 0.0333939956227995, + "grad_norm": 2.2903668880462646, + "learning_rate": 4.9862597826046965e-05, + "loss": 5.4555, + "step": 5615 + }, + { + "epoch": 0.033399942906080504, + "grad_norm": 3.1669414043426514, + "learning_rate": 4.986254891676504e-05, + "loss": 5.6852, + "step": 5616 + }, + { + "epoch": 0.0334058901893615, + "grad_norm": 3.7491395473480225, + "learning_rate": 4.986249999880386e-05, + "loss": 5.682, + "step": 5617 + }, + { + "epoch": 0.033411837472642494, + "grad_norm": 3.0548582077026367, + "learning_rate": 4.986245107216343e-05, + "loss": 5.7844, + "step": 5618 + }, + { + "epoch": 0.033417784755923496, + "grad_norm": 2.628957509994507, + "learning_rate": 4.986240213684378e-05, + "loss": 5.5646, + "step": 5619 + }, + { + "epoch": 0.03342373203920449, + "grad_norm": 2.050936460494995, + "learning_rate": 4.986235319284492e-05, + "loss": 5.7187, + "step": 5620 + }, + { + "epoch": 0.03342967932248549, + "grad_norm": 2.2839999198913574, + "learning_rate": 4.986230424016688e-05, + "loss": 5.6613, + "step": 5621 + }, + { + "epoch": 0.03343562660576649, + "grad_norm": 2.177778959274292, + "learning_rate": 4.986225527880966e-05, + "loss": 5.7205, + "step": 5622 + }, + { + "epoch": 0.033441573889047484, + "grad_norm": 2.1690266132354736, + "learning_rate": 4.9862206308773286e-05, + "loss": 5.4344, + "step": 5623 + }, + { + "epoch": 0.03344752117232848, + "grad_norm": 2.0134127140045166, + "learning_rate": 4.9862157330057766e-05, + "loss": 5.7872, + "step": 5624 + }, + { + "epoch": 0.03345346845560948, + "grad_norm": 2.0246710777282715, + "learning_rate": 4.986210834266313e-05, + "loss": 5.3291, + "step": 5625 + }, + { + "epoch": 0.033459415738890476, + "grad_norm": 2.020939350128174, + "learning_rate": 4.986205934658939e-05, + "loss": 5.3966, + "step": 5626 + }, + { + "epoch": 0.03346536302217147, + "grad_norm": 2.3261308670043945, + "learning_rate": 4.986201034183655e-05, + "loss": 5.4667, + "step": 5627 + }, + { + "epoch": 0.033471310305452466, + "grad_norm": 2.135641574859619, + "learning_rate": 4.9861961328404646e-05, + "loss": 5.4925, + "step": 5628 + }, + { + "epoch": 0.03347725758873347, + "grad_norm": 2.3122894763946533, + "learning_rate": 4.986191230629369e-05, + "loss": 5.6665, + "step": 5629 + }, + { + "epoch": 0.03348320487201446, + "grad_norm": 2.4461214542388916, + "learning_rate": 4.98618632755037e-05, + "loss": 5.8442, + "step": 5630 + }, + { + "epoch": 0.03348915215529546, + "grad_norm": 2.189009189605713, + "learning_rate": 4.9861814236034685e-05, + "loss": 5.5793, + "step": 5631 + }, + { + "epoch": 0.03349509943857646, + "grad_norm": 2.1961586475372314, + "learning_rate": 4.986176518788667e-05, + "loss": 5.5364, + "step": 5632 + }, + { + "epoch": 0.033501046721857455, + "grad_norm": 2.120177745819092, + "learning_rate": 4.986171613105967e-05, + "loss": 5.4042, + "step": 5633 + }, + { + "epoch": 0.03350699400513845, + "grad_norm": 1.9021252393722534, + "learning_rate": 4.9861667065553696e-05, + "loss": 5.2665, + "step": 5634 + }, + { + "epoch": 0.03351294128841945, + "grad_norm": 1.8944766521453857, + "learning_rate": 4.986161799136878e-05, + "loss": 5.3853, + "step": 5635 + }, + { + "epoch": 0.03351888857170045, + "grad_norm": 2.059847354888916, + "learning_rate": 4.9861568908504916e-05, + "loss": 5.3046, + "step": 5636 + }, + { + "epoch": 0.03352483585498144, + "grad_norm": 2.1350111961364746, + "learning_rate": 4.9861519816962155e-05, + "loss": 5.3684, + "step": 5637 + }, + { + "epoch": 0.033530783138262445, + "grad_norm": 2.0733792781829834, + "learning_rate": 4.986147071674048e-05, + "loss": 5.4581, + "step": 5638 + }, + { + "epoch": 0.03353673042154344, + "grad_norm": 2.0736827850341797, + "learning_rate": 4.986142160783993e-05, + "loss": 5.7019, + "step": 5639 + }, + { + "epoch": 0.033542677704824435, + "grad_norm": 2.1903107166290283, + "learning_rate": 4.986137249026051e-05, + "loss": 5.4353, + "step": 5640 + }, + { + "epoch": 0.03354862498810544, + "grad_norm": 2.2678940296173096, + "learning_rate": 4.9861323364002244e-05, + "loss": 5.4951, + "step": 5641 + }, + { + "epoch": 0.03355457227138643, + "grad_norm": 3.590702772140503, + "learning_rate": 4.9861274229065145e-05, + "loss": 6.1522, + "step": 5642 + }, + { + "epoch": 0.03356051955466743, + "grad_norm": 2.0955893993377686, + "learning_rate": 4.9861225085449224e-05, + "loss": 5.3544, + "step": 5643 + }, + { + "epoch": 0.03356646683794842, + "grad_norm": 1.9370301961898804, + "learning_rate": 4.986117593315452e-05, + "loss": 5.4732, + "step": 5644 + }, + { + "epoch": 0.033572414121229424, + "grad_norm": 2.141752243041992, + "learning_rate": 4.986112677218103e-05, + "loss": 5.5768, + "step": 5645 + }, + { + "epoch": 0.03357836140451042, + "grad_norm": 1.9236360788345337, + "learning_rate": 4.986107760252878e-05, + "loss": 5.7641, + "step": 5646 + }, + { + "epoch": 0.033584308687791414, + "grad_norm": 1.8353725671768188, + "learning_rate": 4.9861028424197785e-05, + "loss": 5.8011, + "step": 5647 + }, + { + "epoch": 0.033590255971072416, + "grad_norm": 2.0918078422546387, + "learning_rate": 4.9860979237188055e-05, + "loss": 5.6862, + "step": 5648 + }, + { + "epoch": 0.03359620325435341, + "grad_norm": 2.2244462966918945, + "learning_rate": 4.986093004149962e-05, + "loss": 5.472, + "step": 5649 + }, + { + "epoch": 0.033602150537634407, + "grad_norm": 2.1517422199249268, + "learning_rate": 4.9860880837132495e-05, + "loss": 5.3655, + "step": 5650 + }, + { + "epoch": 0.03360809782091541, + "grad_norm": 2.241863489151001, + "learning_rate": 4.986083162408669e-05, + "loss": 5.5385, + "step": 5651 + }, + { + "epoch": 0.033614045104196404, + "grad_norm": 2.458171844482422, + "learning_rate": 4.986078240236222e-05, + "loss": 5.5531, + "step": 5652 + }, + { + "epoch": 0.0336199923874774, + "grad_norm": 2.2601864337921143, + "learning_rate": 4.986073317195911e-05, + "loss": 5.9313, + "step": 5653 + }, + { + "epoch": 0.0336259396707584, + "grad_norm": 2.243647575378418, + "learning_rate": 4.986068393287738e-05, + "loss": 5.4064, + "step": 5654 + }, + { + "epoch": 0.033631886954039396, + "grad_norm": 2.283515453338623, + "learning_rate": 4.986063468511704e-05, + "loss": 5.295, + "step": 5655 + }, + { + "epoch": 0.03363783423732039, + "grad_norm": 2.701770305633545, + "learning_rate": 4.986058542867811e-05, + "loss": 5.8548, + "step": 5656 + }, + { + "epoch": 0.033643781520601386, + "grad_norm": 2.8186864852905273, + "learning_rate": 4.98605361635606e-05, + "loss": 5.378, + "step": 5657 + }, + { + "epoch": 0.03364972880388239, + "grad_norm": 2.6508500576019287, + "learning_rate": 4.9860486889764536e-05, + "loss": 5.469, + "step": 5658 + }, + { + "epoch": 0.03365567608716338, + "grad_norm": 2.3984878063201904, + "learning_rate": 4.986043760728994e-05, + "loss": 5.3978, + "step": 5659 + }, + { + "epoch": 0.03366162337044438, + "grad_norm": 3.64663028717041, + "learning_rate": 4.9860388316136814e-05, + "loss": 5.502, + "step": 5660 + }, + { + "epoch": 0.03366757065372538, + "grad_norm": 3.1112046241760254, + "learning_rate": 4.986033901630519e-05, + "loss": 5.7347, + "step": 5661 + }, + { + "epoch": 0.033673517937006375, + "grad_norm": 2.619877338409424, + "learning_rate": 4.9860289707795074e-05, + "loss": 6.2099, + "step": 5662 + }, + { + "epoch": 0.03367946522028737, + "grad_norm": 2.0318470001220703, + "learning_rate": 4.986024039060648e-05, + "loss": 6.246, + "step": 5663 + }, + { + "epoch": 0.03368541250356837, + "grad_norm": 2.1484673023223877, + "learning_rate": 4.986019106473945e-05, + "loss": 6.1689, + "step": 5664 + }, + { + "epoch": 0.03369135978684937, + "grad_norm": 2.6159844398498535, + "learning_rate": 4.9860141730193974e-05, + "loss": 5.8217, + "step": 5665 + }, + { + "epoch": 0.03369730707013036, + "grad_norm": 2.5019965171813965, + "learning_rate": 4.9860092386970084e-05, + "loss": 6.1138, + "step": 5666 + }, + { + "epoch": 0.033703254353411365, + "grad_norm": 2.962315797805786, + "learning_rate": 4.9860043035067785e-05, + "loss": 5.7057, + "step": 5667 + }, + { + "epoch": 0.03370920163669236, + "grad_norm": 2.455721139907837, + "learning_rate": 4.9859993674487106e-05, + "loss": 5.6203, + "step": 5668 + }, + { + "epoch": 0.033715148919973355, + "grad_norm": 2.432368278503418, + "learning_rate": 4.9859944305228066e-05, + "loss": 6.2337, + "step": 5669 + }, + { + "epoch": 0.03372109620325436, + "grad_norm": 2.3222782611846924, + "learning_rate": 4.985989492729067e-05, + "loss": 6.2845, + "step": 5670 + }, + { + "epoch": 0.03372704348653535, + "grad_norm": 2.107440948486328, + "learning_rate": 4.985984554067494e-05, + "loss": 6.2404, + "step": 5671 + }, + { + "epoch": 0.03373299076981635, + "grad_norm": 1.9450268745422363, + "learning_rate": 4.98597961453809e-05, + "loss": 6.1679, + "step": 5672 + }, + { + "epoch": 0.03373893805309734, + "grad_norm": 1.7591795921325684, + "learning_rate": 4.9859746741408554e-05, + "loss": 6.3425, + "step": 5673 + }, + { + "epoch": 0.033744885336378344, + "grad_norm": 2.009420871734619, + "learning_rate": 4.985969732875794e-05, + "loss": 6.3607, + "step": 5674 + }, + { + "epoch": 0.03375083261965934, + "grad_norm": 2.097215175628662, + "learning_rate": 4.9859647907429054e-05, + "loss": 6.2009, + "step": 5675 + }, + { + "epoch": 0.033756779902940334, + "grad_norm": 1.7670379877090454, + "learning_rate": 4.985959847742192e-05, + "loss": 5.935, + "step": 5676 + }, + { + "epoch": 0.033762727186221336, + "grad_norm": 2.052022695541382, + "learning_rate": 4.985954903873656e-05, + "loss": 5.4054, + "step": 5677 + }, + { + "epoch": 0.03376867446950233, + "grad_norm": 1.9225167036056519, + "learning_rate": 4.985949959137298e-05, + "loss": 5.6905, + "step": 5678 + }, + { + "epoch": 0.033774621752783326, + "grad_norm": 2.4080653190612793, + "learning_rate": 4.985945013533122e-05, + "loss": 6.5566, + "step": 5679 + }, + { + "epoch": 0.03378056903606433, + "grad_norm": 2.8340251445770264, + "learning_rate": 4.985940067061128e-05, + "loss": 6.3556, + "step": 5680 + }, + { + "epoch": 0.033786516319345324, + "grad_norm": 2.2872672080993652, + "learning_rate": 4.985935119721317e-05, + "loss": 6.1806, + "step": 5681 + }, + { + "epoch": 0.03379246360262632, + "grad_norm": 3.309203863143921, + "learning_rate": 4.985930171513692e-05, + "loss": 6.1766, + "step": 5682 + }, + { + "epoch": 0.03379841088590732, + "grad_norm": 2.936709403991699, + "learning_rate": 4.985925222438255e-05, + "loss": 5.907, + "step": 5683 + }, + { + "epoch": 0.033804358169188316, + "grad_norm": 2.3226964473724365, + "learning_rate": 4.985920272495007e-05, + "loss": 5.5734, + "step": 5684 + }, + { + "epoch": 0.03381030545246931, + "grad_norm": 2.3053154945373535, + "learning_rate": 4.98591532168395e-05, + "loss": 6.5688, + "step": 5685 + }, + { + "epoch": 0.033816252735750306, + "grad_norm": 2.2494077682495117, + "learning_rate": 4.985910370005086e-05, + "loss": 6.3539, + "step": 5686 + }, + { + "epoch": 0.03382220001903131, + "grad_norm": 1.9559924602508545, + "learning_rate": 4.9859054174584155e-05, + "loss": 6.2015, + "step": 5687 + }, + { + "epoch": 0.0338281473023123, + "grad_norm": 2.7915425300598145, + "learning_rate": 4.985900464043942e-05, + "loss": 5.7426, + "step": 5688 + }, + { + "epoch": 0.0338340945855933, + "grad_norm": 2.448496103286743, + "learning_rate": 4.985895509761665e-05, + "loss": 6.2697, + "step": 5689 + }, + { + "epoch": 0.0338400418688743, + "grad_norm": 1.7736696004867554, + "learning_rate": 4.9858905546115885e-05, + "loss": 6.5513, + "step": 5690 + }, + { + "epoch": 0.033845989152155295, + "grad_norm": 1.668285608291626, + "learning_rate": 4.9858855985937136e-05, + "loss": 6.0179, + "step": 5691 + }, + { + "epoch": 0.03385193643543629, + "grad_norm": 2.157799243927002, + "learning_rate": 4.985880641708042e-05, + "loss": 6.1863, + "step": 5692 + }, + { + "epoch": 0.03385788371871729, + "grad_norm": 2.2437758445739746, + "learning_rate": 4.985875683954574e-05, + "loss": 6.128, + "step": 5693 + }, + { + "epoch": 0.03386383100199829, + "grad_norm": 2.8323628902435303, + "learning_rate": 4.9858707253333124e-05, + "loss": 6.2746, + "step": 5694 + }, + { + "epoch": 0.03386977828527928, + "grad_norm": 2.270587205886841, + "learning_rate": 4.98586576584426e-05, + "loss": 6.1002, + "step": 5695 + }, + { + "epoch": 0.033875725568560285, + "grad_norm": 1.9165533781051636, + "learning_rate": 4.985860805487417e-05, + "loss": 5.7016, + "step": 5696 + }, + { + "epoch": 0.03388167285184128, + "grad_norm": 2.230407953262329, + "learning_rate": 4.985855844262786e-05, + "loss": 5.9649, + "step": 5697 + }, + { + "epoch": 0.033887620135122275, + "grad_norm": 2.5094211101531982, + "learning_rate": 4.985850882170368e-05, + "loss": 6.0184, + "step": 5698 + }, + { + "epoch": 0.03389356741840328, + "grad_norm": 2.6195943355560303, + "learning_rate": 4.9858459192101656e-05, + "loss": 5.8501, + "step": 5699 + }, + { + "epoch": 0.03389951470168427, + "grad_norm": 2.747486114501953, + "learning_rate": 4.9858409553821794e-05, + "loss": 5.7066, + "step": 5700 + }, + { + "epoch": 0.03390546198496527, + "grad_norm": 2.154109001159668, + "learning_rate": 4.985835990686413e-05, + "loss": 6.1072, + "step": 5701 + }, + { + "epoch": 0.03391140926824626, + "grad_norm": 2.4329216480255127, + "learning_rate": 4.9858310251228655e-05, + "loss": 5.9552, + "step": 5702 + }, + { + "epoch": 0.033917356551527264, + "grad_norm": 2.4760935306549072, + "learning_rate": 4.9858260586915405e-05, + "loss": 5.9023, + "step": 5703 + }, + { + "epoch": 0.03392330383480826, + "grad_norm": 2.400474786758423, + "learning_rate": 4.9858210913924397e-05, + "loss": 6.1688, + "step": 5704 + }, + { + "epoch": 0.033929251118089254, + "grad_norm": 2.402930498123169, + "learning_rate": 4.9858161232255644e-05, + "loss": 6.0776, + "step": 5705 + }, + { + "epoch": 0.033935198401370256, + "grad_norm": 2.0408313274383545, + "learning_rate": 4.985811154190916e-05, + "loss": 6.1841, + "step": 5706 + }, + { + "epoch": 0.03394114568465125, + "grad_norm": 1.889190912246704, + "learning_rate": 4.9858061842884976e-05, + "loss": 5.9689, + "step": 5707 + }, + { + "epoch": 0.033947092967932246, + "grad_norm": 2.2231624126434326, + "learning_rate": 4.9858012135183086e-05, + "loss": 6.0009, + "step": 5708 + }, + { + "epoch": 0.03395304025121325, + "grad_norm": 2.0229554176330566, + "learning_rate": 4.985796241880353e-05, + "loss": 6.3237, + "step": 5709 + }, + { + "epoch": 0.033958987534494244, + "grad_norm": 2.0570971965789795, + "learning_rate": 4.985791269374631e-05, + "loss": 6.3104, + "step": 5710 + }, + { + "epoch": 0.03396493481777524, + "grad_norm": 2.584663152694702, + "learning_rate": 4.9857862960011454e-05, + "loss": 5.8493, + "step": 5711 + }, + { + "epoch": 0.03397088210105624, + "grad_norm": 1.7870328426361084, + "learning_rate": 4.985781321759897e-05, + "loss": 6.2321, + "step": 5712 + }, + { + "epoch": 0.033976829384337236, + "grad_norm": 2.201756000518799, + "learning_rate": 4.9857763466508886e-05, + "loss": 6.1936, + "step": 5713 + }, + { + "epoch": 0.03398277666761823, + "grad_norm": 2.4489476680755615, + "learning_rate": 4.9857713706741216e-05, + "loss": 6.11, + "step": 5714 + }, + { + "epoch": 0.033988723950899226, + "grad_norm": 2.007643461227417, + "learning_rate": 4.9857663938295964e-05, + "loss": 6.288, + "step": 5715 + }, + { + "epoch": 0.03399467123418023, + "grad_norm": 1.8299764394760132, + "learning_rate": 4.9857614161173165e-05, + "loss": 6.0719, + "step": 5716 + }, + { + "epoch": 0.03400061851746122, + "grad_norm": 1.7619884014129639, + "learning_rate": 4.985756437537283e-05, + "loss": 6.1418, + "step": 5717 + }, + { + "epoch": 0.03400656580074222, + "grad_norm": 1.9445360898971558, + "learning_rate": 4.985751458089498e-05, + "loss": 6.1223, + "step": 5718 + }, + { + "epoch": 0.03401251308402322, + "grad_norm": 2.2320010662078857, + "learning_rate": 4.985746477773962e-05, + "loss": 5.5239, + "step": 5719 + }, + { + "epoch": 0.034018460367304215, + "grad_norm": 2.631765365600586, + "learning_rate": 4.985741496590678e-05, + "loss": 5.6348, + "step": 5720 + }, + { + "epoch": 0.03402440765058521, + "grad_norm": 2.4715576171875, + "learning_rate": 4.985736514539647e-05, + "loss": 5.9608, + "step": 5721 + }, + { + "epoch": 0.03403035493386621, + "grad_norm": 2.633188009262085, + "learning_rate": 4.985731531620871e-05, + "loss": 5.602, + "step": 5722 + }, + { + "epoch": 0.03403630221714721, + "grad_norm": 2.4303035736083984, + "learning_rate": 4.9857265478343526e-05, + "loss": 5.495, + "step": 5723 + }, + { + "epoch": 0.0340422495004282, + "grad_norm": 2.463447332382202, + "learning_rate": 4.985721563180092e-05, + "loss": 5.4633, + "step": 5724 + }, + { + "epoch": 0.034048196783709204, + "grad_norm": 2.349965810775757, + "learning_rate": 4.985716577658092e-05, + "loss": 6.0067, + "step": 5725 + }, + { + "epoch": 0.0340541440669902, + "grad_norm": 1.8741793632507324, + "learning_rate": 4.985711591268354e-05, + "loss": 5.8658, + "step": 5726 + }, + { + "epoch": 0.034060091350271195, + "grad_norm": 1.957612156867981, + "learning_rate": 4.98570660401088e-05, + "loss": 6.2016, + "step": 5727 + }, + { + "epoch": 0.0340660386335522, + "grad_norm": 2.4883556365966797, + "learning_rate": 4.985701615885671e-05, + "loss": 6.3056, + "step": 5728 + }, + { + "epoch": 0.03407198591683319, + "grad_norm": 2.6959800720214844, + "learning_rate": 4.98569662689273e-05, + "loss": 5.7267, + "step": 5729 + }, + { + "epoch": 0.03407793320011419, + "grad_norm": 2.579802989959717, + "learning_rate": 4.985691637032057e-05, + "loss": 5.2467, + "step": 5730 + }, + { + "epoch": 0.03408388048339518, + "grad_norm": 2.136262893676758, + "learning_rate": 4.985686646303656e-05, + "loss": 5.7071, + "step": 5731 + }, + { + "epoch": 0.034089827766676184, + "grad_norm": 2.1442244052886963, + "learning_rate": 4.985681654707526e-05, + "loss": 6.3961, + "step": 5732 + }, + { + "epoch": 0.03409577504995718, + "grad_norm": 2.164340019226074, + "learning_rate": 4.9856766622436714e-05, + "loss": 6.2455, + "step": 5733 + }, + { + "epoch": 0.034101722333238174, + "grad_norm": 2.199791193008423, + "learning_rate": 4.985671668912092e-05, + "loss": 5.8804, + "step": 5734 + }, + { + "epoch": 0.034107669616519176, + "grad_norm": 2.0359933376312256, + "learning_rate": 4.9856666747127905e-05, + "loss": 6.359, + "step": 5735 + }, + { + "epoch": 0.03411361689980017, + "grad_norm": 2.17069935798645, + "learning_rate": 4.985661679645769e-05, + "loss": 6.6736, + "step": 5736 + }, + { + "epoch": 0.034119564183081166, + "grad_norm": 1.9114634990692139, + "learning_rate": 4.9856566837110275e-05, + "loss": 5.9629, + "step": 5737 + }, + { + "epoch": 0.03412551146636217, + "grad_norm": 2.2872474193573, + "learning_rate": 4.9856516869085704e-05, + "loss": 5.5856, + "step": 5738 + }, + { + "epoch": 0.03413145874964316, + "grad_norm": 2.0800466537475586, + "learning_rate": 4.9856466892383965e-05, + "loss": 5.7732, + "step": 5739 + }, + { + "epoch": 0.03413740603292416, + "grad_norm": 2.37117338180542, + "learning_rate": 4.98564169070051e-05, + "loss": 5.667, + "step": 5740 + }, + { + "epoch": 0.03414335331620516, + "grad_norm": 2.0559768676757812, + "learning_rate": 4.985636691294911e-05, + "loss": 5.4874, + "step": 5741 + }, + { + "epoch": 0.034149300599486156, + "grad_norm": 2.0097250938415527, + "learning_rate": 4.9856316910216024e-05, + "loss": 5.5469, + "step": 5742 + }, + { + "epoch": 0.03415524788276715, + "grad_norm": 2.430954933166504, + "learning_rate": 4.985626689880586e-05, + "loss": 5.7635, + "step": 5743 + }, + { + "epoch": 0.034161195166048146, + "grad_norm": 2.1000874042510986, + "learning_rate": 4.985621687871862e-05, + "loss": 5.7102, + "step": 5744 + }, + { + "epoch": 0.03416714244932915, + "grad_norm": 2.2048611640930176, + "learning_rate": 4.9856166849954336e-05, + "loss": 5.8156, + "step": 5745 + }, + { + "epoch": 0.03417308973261014, + "grad_norm": 2.145538330078125, + "learning_rate": 4.985611681251302e-05, + "loss": 5.9101, + "step": 5746 + }, + { + "epoch": 0.03417903701589114, + "grad_norm": 2.86169695854187, + "learning_rate": 4.9856066766394685e-05, + "loss": 5.7358, + "step": 5747 + }, + { + "epoch": 0.03418498429917214, + "grad_norm": 2.0648229122161865, + "learning_rate": 4.985601671159936e-05, + "loss": 6.0529, + "step": 5748 + }, + { + "epoch": 0.034190931582453135, + "grad_norm": 2.191251039505005, + "learning_rate": 4.985596664812706e-05, + "loss": 6.1999, + "step": 5749 + }, + { + "epoch": 0.03419687886573413, + "grad_norm": 2.556640148162842, + "learning_rate": 4.985591657597779e-05, + "loss": 6.0671, + "step": 5750 + }, + { + "epoch": 0.03420282614901513, + "grad_norm": 2.1796281337738037, + "learning_rate": 4.985586649515158e-05, + "loss": 6.1537, + "step": 5751 + }, + { + "epoch": 0.03420877343229613, + "grad_norm": 2.1884169578552246, + "learning_rate": 4.985581640564845e-05, + "loss": 5.7667, + "step": 5752 + }, + { + "epoch": 0.03421472071557712, + "grad_norm": 2.3836331367492676, + "learning_rate": 4.9855766307468404e-05, + "loss": 5.6608, + "step": 5753 + }, + { + "epoch": 0.034220667998858124, + "grad_norm": 2.0464322566986084, + "learning_rate": 4.985571620061147e-05, + "loss": 5.5317, + "step": 5754 + }, + { + "epoch": 0.03422661528213912, + "grad_norm": 2.3275644779205322, + "learning_rate": 4.9855666085077654e-05, + "loss": 5.8611, + "step": 5755 + }, + { + "epoch": 0.034232562565420115, + "grad_norm": 2.7268338203430176, + "learning_rate": 4.9855615960867e-05, + "loss": 5.6323, + "step": 5756 + }, + { + "epoch": 0.03423850984870112, + "grad_norm": 2.578986406326294, + "learning_rate": 4.985556582797949e-05, + "loss": 5.6108, + "step": 5757 + }, + { + "epoch": 0.03424445713198211, + "grad_norm": 2.4127955436706543, + "learning_rate": 4.985551568641516e-05, + "loss": 5.7054, + "step": 5758 + }, + { + "epoch": 0.03425040441526311, + "grad_norm": 2.1954357624053955, + "learning_rate": 4.985546553617404e-05, + "loss": 6.194, + "step": 5759 + }, + { + "epoch": 0.0342563516985441, + "grad_norm": 2.43851900100708, + "learning_rate": 4.985541537725612e-05, + "loss": 5.9067, + "step": 5760 + }, + { + "epoch": 0.034262298981825104, + "grad_norm": 2.0910801887512207, + "learning_rate": 4.9855365209661445e-05, + "loss": 6.1017, + "step": 5761 + }, + { + "epoch": 0.0342682462651061, + "grad_norm": 1.9936187267303467, + "learning_rate": 4.985531503339e-05, + "loss": 6.1239, + "step": 5762 + }, + { + "epoch": 0.034274193548387094, + "grad_norm": 2.0663299560546875, + "learning_rate": 4.985526484844183e-05, + "loss": 6.0514, + "step": 5763 + }, + { + "epoch": 0.034280140831668096, + "grad_norm": 2.4357266426086426, + "learning_rate": 4.985521465481695e-05, + "loss": 5.3695, + "step": 5764 + }, + { + "epoch": 0.03428608811494909, + "grad_norm": 2.12214994430542, + "learning_rate": 4.985516445251537e-05, + "loss": 5.5531, + "step": 5765 + }, + { + "epoch": 0.034292035398230086, + "grad_norm": 2.731661319732666, + "learning_rate": 4.9855114241537105e-05, + "loss": 6.2403, + "step": 5766 + }, + { + "epoch": 0.03429798268151109, + "grad_norm": 2.0668931007385254, + "learning_rate": 4.985506402188217e-05, + "loss": 6.0873, + "step": 5767 + }, + { + "epoch": 0.03430392996479208, + "grad_norm": 2.3165833950042725, + "learning_rate": 4.98550137935506e-05, + "loss": 5.9365, + "step": 5768 + }, + { + "epoch": 0.03430987724807308, + "grad_norm": 1.8637720346450806, + "learning_rate": 4.98549635565424e-05, + "loss": 6.0837, + "step": 5769 + }, + { + "epoch": 0.03431582453135408, + "grad_norm": 2.1689205169677734, + "learning_rate": 4.985491331085758e-05, + "loss": 5.703, + "step": 5770 + }, + { + "epoch": 0.034321771814635076, + "grad_norm": 2.245283365249634, + "learning_rate": 4.985486305649618e-05, + "loss": 6.0134, + "step": 5771 + }, + { + "epoch": 0.03432771909791607, + "grad_norm": 2.2685303688049316, + "learning_rate": 4.98548127934582e-05, + "loss": 5.279, + "step": 5772 + }, + { + "epoch": 0.034333666381197066, + "grad_norm": 2.376253128051758, + "learning_rate": 4.985476252174365e-05, + "loss": 5.5812, + "step": 5773 + }, + { + "epoch": 0.03433961366447807, + "grad_norm": 2.2636559009552, + "learning_rate": 4.985471224135257e-05, + "loss": 5.6906, + "step": 5774 + }, + { + "epoch": 0.03434556094775906, + "grad_norm": 2.22103214263916, + "learning_rate": 4.9854661952284965e-05, + "loss": 6.2066, + "step": 5775 + }, + { + "epoch": 0.03435150823104006, + "grad_norm": 2.308610439300537, + "learning_rate": 4.985461165454085e-05, + "loss": 6.1582, + "step": 5776 + }, + { + "epoch": 0.03435745551432106, + "grad_norm": 1.9191935062408447, + "learning_rate": 4.985456134812026e-05, + "loss": 5.4587, + "step": 5777 + }, + { + "epoch": 0.034363402797602055, + "grad_norm": 2.3127100467681885, + "learning_rate": 4.9854511033023184e-05, + "loss": 5.3375, + "step": 5778 + }, + { + "epoch": 0.03436935008088305, + "grad_norm": 2.4817371368408203, + "learning_rate": 4.985446070924966e-05, + "loss": 5.4961, + "step": 5779 + }, + { + "epoch": 0.03437529736416405, + "grad_norm": 2.0995922088623047, + "learning_rate": 4.9854410376799695e-05, + "loss": 5.7676, + "step": 5780 + }, + { + "epoch": 0.03438124464744505, + "grad_norm": 2.261229991912842, + "learning_rate": 4.985436003567332e-05, + "loss": 5.4446, + "step": 5781 + }, + { + "epoch": 0.03438719193072604, + "grad_norm": 2.275536060333252, + "learning_rate": 4.985430968587055e-05, + "loss": 5.4297, + "step": 5782 + }, + { + "epoch": 0.034393139214007044, + "grad_norm": 2.3733773231506348, + "learning_rate": 4.985425932739138e-05, + "loss": 5.7658, + "step": 5783 + }, + { + "epoch": 0.03439908649728804, + "grad_norm": 2.201716184616089, + "learning_rate": 4.985420896023586e-05, + "loss": 5.5502, + "step": 5784 + }, + { + "epoch": 0.034405033780569035, + "grad_norm": 2.1012730598449707, + "learning_rate": 4.9854158584403985e-05, + "loss": 5.7199, + "step": 5785 + }, + { + "epoch": 0.03441098106385004, + "grad_norm": 2.065568685531616, + "learning_rate": 4.985410819989579e-05, + "loss": 6.1547, + "step": 5786 + }, + { + "epoch": 0.03441692834713103, + "grad_norm": 1.9217867851257324, + "learning_rate": 4.9854057806711275e-05, + "loss": 6.2556, + "step": 5787 + }, + { + "epoch": 0.03442287563041203, + "grad_norm": 2.028602123260498, + "learning_rate": 4.985400740485047e-05, + "loss": 5.9347, + "step": 5788 + }, + { + "epoch": 0.03442882291369302, + "grad_norm": 2.002855062484741, + "learning_rate": 4.9853956994313376e-05, + "loss": 5.3966, + "step": 5789 + }, + { + "epoch": 0.034434770196974024, + "grad_norm": 2.3740642070770264, + "learning_rate": 4.985390657510003e-05, + "loss": 5.7801, + "step": 5790 + }, + { + "epoch": 0.03444071748025502, + "grad_norm": 2.1149635314941406, + "learning_rate": 4.9853856147210444e-05, + "loss": 5.6504, + "step": 5791 + }, + { + "epoch": 0.034446664763536014, + "grad_norm": 2.3519630432128906, + "learning_rate": 4.985380571064463e-05, + "loss": 5.9172, + "step": 5792 + }, + { + "epoch": 0.034452612046817016, + "grad_norm": 2.38930082321167, + "learning_rate": 4.985375526540261e-05, + "loss": 5.6196, + "step": 5793 + }, + { + "epoch": 0.03445855933009801, + "grad_norm": 2.245596408843994, + "learning_rate": 4.98537048114844e-05, + "loss": 5.5034, + "step": 5794 + }, + { + "epoch": 0.034464506613379006, + "grad_norm": 2.272158622741699, + "learning_rate": 4.985365434889002e-05, + "loss": 5.5867, + "step": 5795 + }, + { + "epoch": 0.03447045389666001, + "grad_norm": 2.2090094089508057, + "learning_rate": 4.9853603877619485e-05, + "loss": 5.68, + "step": 5796 + }, + { + "epoch": 0.034476401179941, + "grad_norm": 2.0545220375061035, + "learning_rate": 4.985355339767281e-05, + "loss": 5.8382, + "step": 5797 + }, + { + "epoch": 0.034482348463222, + "grad_norm": 2.143134593963623, + "learning_rate": 4.985350290905003e-05, + "loss": 5.5753, + "step": 5798 + }, + { + "epoch": 0.034488295746503, + "grad_norm": 2.3938257694244385, + "learning_rate": 4.985345241175114e-05, + "loss": 5.7545, + "step": 5799 + }, + { + "epoch": 0.034494243029783996, + "grad_norm": 2.132998466491699, + "learning_rate": 4.985340190577616e-05, + "loss": 5.5477, + "step": 5800 + }, + { + "epoch": 0.03450019031306499, + "grad_norm": 3.141417980194092, + "learning_rate": 4.9853351391125126e-05, + "loss": 5.3509, + "step": 5801 + }, + { + "epoch": 0.034506137596345986, + "grad_norm": 2.4776933193206787, + "learning_rate": 4.9853300867798034e-05, + "loss": 6.1052, + "step": 5802 + }, + { + "epoch": 0.03451208487962699, + "grad_norm": 2.1782073974609375, + "learning_rate": 4.985325033579492e-05, + "loss": 5.9599, + "step": 5803 + }, + { + "epoch": 0.03451803216290798, + "grad_norm": 2.2631704807281494, + "learning_rate": 4.9853199795115794e-05, + "loss": 5.534, + "step": 5804 + }, + { + "epoch": 0.03452397944618898, + "grad_norm": 2.140612840652466, + "learning_rate": 4.985314924576066e-05, + "loss": 5.7479, + "step": 5805 + }, + { + "epoch": 0.03452992672946998, + "grad_norm": 2.726651668548584, + "learning_rate": 4.9853098687729563e-05, + "loss": 5.4639, + "step": 5806 + }, + { + "epoch": 0.034535874012750975, + "grad_norm": 1.852423071861267, + "learning_rate": 4.985304812102249e-05, + "loss": 5.4209, + "step": 5807 + }, + { + "epoch": 0.03454182129603197, + "grad_norm": 2.5236833095550537, + "learning_rate": 4.9852997545639485e-05, + "loss": 5.9653, + "step": 5808 + }, + { + "epoch": 0.03454776857931297, + "grad_norm": 2.2740652561187744, + "learning_rate": 4.985294696158056e-05, + "loss": 5.9457, + "step": 5809 + }, + { + "epoch": 0.03455371586259397, + "grad_norm": 2.931777000427246, + "learning_rate": 4.9852896368845715e-05, + "loss": 5.6709, + "step": 5810 + }, + { + "epoch": 0.03455966314587496, + "grad_norm": 2.6981759071350098, + "learning_rate": 4.9852845767434986e-05, + "loss": 5.1747, + "step": 5811 + }, + { + "epoch": 0.034565610429155964, + "grad_norm": 2.2675211429595947, + "learning_rate": 4.985279515734839e-05, + "loss": 5.2393, + "step": 5812 + }, + { + "epoch": 0.03457155771243696, + "grad_norm": 2.535473346710205, + "learning_rate": 4.985274453858594e-05, + "loss": 6.2184, + "step": 5813 + }, + { + "epoch": 0.034577504995717954, + "grad_norm": 2.8692495822906494, + "learning_rate": 4.985269391114765e-05, + "loss": 5.2557, + "step": 5814 + }, + { + "epoch": 0.034583452278998957, + "grad_norm": 2.908472776412964, + "learning_rate": 4.985264327503354e-05, + "loss": 5.1559, + "step": 5815 + }, + { + "epoch": 0.03458939956227995, + "grad_norm": 2.3630192279815674, + "learning_rate": 4.985259263024363e-05, + "loss": 5.3159, + "step": 5816 + }, + { + "epoch": 0.03459534684556095, + "grad_norm": 2.1287102699279785, + "learning_rate": 4.9852541976777933e-05, + "loss": 5.2069, + "step": 5817 + }, + { + "epoch": 0.03460129412884194, + "grad_norm": 2.751567840576172, + "learning_rate": 4.985249131463647e-05, + "loss": 5.6561, + "step": 5818 + }, + { + "epoch": 0.034607241412122944, + "grad_norm": 2.505608081817627, + "learning_rate": 4.985244064381927e-05, + "loss": 5.9708, + "step": 5819 + }, + { + "epoch": 0.03461318869540394, + "grad_norm": 2.351593255996704, + "learning_rate": 4.9852389964326337e-05, + "loss": 5.9046, + "step": 5820 + }, + { + "epoch": 0.034619135978684934, + "grad_norm": 2.3037939071655273, + "learning_rate": 4.985233927615769e-05, + "loss": 6.0069, + "step": 5821 + }, + { + "epoch": 0.034625083261965936, + "grad_norm": 2.2482705116271973, + "learning_rate": 4.985228857931334e-05, + "loss": 5.9492, + "step": 5822 + }, + { + "epoch": 0.03463103054524693, + "grad_norm": 2.23640513420105, + "learning_rate": 4.985223787379332e-05, + "loss": 5.6631, + "step": 5823 + }, + { + "epoch": 0.034636977828527926, + "grad_norm": 2.710275411605835, + "learning_rate": 4.985218715959764e-05, + "loss": 5.5961, + "step": 5824 + }, + { + "epoch": 0.03464292511180893, + "grad_norm": 2.7220160961151123, + "learning_rate": 4.9852136436726313e-05, + "loss": 5.6922, + "step": 5825 + }, + { + "epoch": 0.03464887239508992, + "grad_norm": 2.4542758464813232, + "learning_rate": 4.985208570517937e-05, + "loss": 5.4742, + "step": 5826 + }, + { + "epoch": 0.03465481967837092, + "grad_norm": 2.7492685317993164, + "learning_rate": 4.9852034964956816e-05, + "loss": 5.4598, + "step": 5827 + }, + { + "epoch": 0.03466076696165192, + "grad_norm": 2.757937431335449, + "learning_rate": 4.9851984216058677e-05, + "loss": 6.1865, + "step": 5828 + }, + { + "epoch": 0.034666714244932915, + "grad_norm": 2.835890531539917, + "learning_rate": 4.985193345848497e-05, + "loss": 5.3368, + "step": 5829 + }, + { + "epoch": 0.03467266152821391, + "grad_norm": 2.694884777069092, + "learning_rate": 4.98518826922357e-05, + "loss": 5.3654, + "step": 5830 + }, + { + "epoch": 0.03467860881149491, + "grad_norm": 2.443784236907959, + "learning_rate": 4.98518319173109e-05, + "loss": 5.7879, + "step": 5831 + }, + { + "epoch": 0.03468455609477591, + "grad_norm": 2.0198488235473633, + "learning_rate": 4.985178113371058e-05, + "loss": 5.766, + "step": 5832 + }, + { + "epoch": 0.0346905033780569, + "grad_norm": 2.8718788623809814, + "learning_rate": 4.985173034143476e-05, + "loss": 5.5506, + "step": 5833 + }, + { + "epoch": 0.0346964506613379, + "grad_norm": 2.4353652000427246, + "learning_rate": 4.9851679540483455e-05, + "loss": 5.7139, + "step": 5834 + }, + { + "epoch": 0.0347023979446189, + "grad_norm": 1.9376598596572876, + "learning_rate": 4.985162873085669e-05, + "loss": 6.2326, + "step": 5835 + }, + { + "epoch": 0.034708345227899895, + "grad_norm": 2.2225289344787598, + "learning_rate": 4.985157791255448e-05, + "loss": 5.5997, + "step": 5836 + }, + { + "epoch": 0.03471429251118089, + "grad_norm": 2.011493682861328, + "learning_rate": 4.985152708557684e-05, + "loss": 5.6882, + "step": 5837 + }, + { + "epoch": 0.03472023979446189, + "grad_norm": 1.8679020404815674, + "learning_rate": 4.985147624992378e-05, + "loss": 5.5427, + "step": 5838 + }, + { + "epoch": 0.03472618707774289, + "grad_norm": 1.9470884799957275, + "learning_rate": 4.9851425405595334e-05, + "loss": 5.5957, + "step": 5839 + }, + { + "epoch": 0.03473213436102388, + "grad_norm": 2.0765669345855713, + "learning_rate": 4.985137455259151e-05, + "loss": 5.4416, + "step": 5840 + }, + { + "epoch": 0.034738081644304884, + "grad_norm": 2.0521979331970215, + "learning_rate": 4.985132369091233e-05, + "loss": 5.4641, + "step": 5841 + }, + { + "epoch": 0.03474402892758588, + "grad_norm": 1.7439172267913818, + "learning_rate": 4.985127282055781e-05, + "loss": 5.1998, + "step": 5842 + }, + { + "epoch": 0.034749976210866874, + "grad_norm": 1.7347313165664673, + "learning_rate": 4.985122194152797e-05, + "loss": 5.2392, + "step": 5843 + }, + { + "epoch": 0.034755923494147876, + "grad_norm": 1.7362169027328491, + "learning_rate": 4.985117105382282e-05, + "loss": 5.1769, + "step": 5844 + }, + { + "epoch": 0.03476187077742887, + "grad_norm": 1.7468090057373047, + "learning_rate": 4.985112015744239e-05, + "loss": 5.3915, + "step": 5845 + }, + { + "epoch": 0.03476781806070987, + "grad_norm": 1.8685250282287598, + "learning_rate": 4.985106925238668e-05, + "loss": 5.6119, + "step": 5846 + }, + { + "epoch": 0.03477376534399086, + "grad_norm": 1.9595715999603271, + "learning_rate": 4.985101833865572e-05, + "loss": 5.5536, + "step": 5847 + }, + { + "epoch": 0.034779712627271864, + "grad_norm": 1.8454965353012085, + "learning_rate": 4.985096741624953e-05, + "loss": 5.8127, + "step": 5848 + }, + { + "epoch": 0.03478565991055286, + "grad_norm": 1.9182006120681763, + "learning_rate": 4.985091648516813e-05, + "loss": 5.8807, + "step": 5849 + }, + { + "epoch": 0.034791607193833854, + "grad_norm": 2.042923927307129, + "learning_rate": 4.9850865545411526e-05, + "loss": 5.9013, + "step": 5850 + }, + { + "epoch": 0.034797554477114856, + "grad_norm": 2.341055393218994, + "learning_rate": 4.985081459697974e-05, + "loss": 6.214, + "step": 5851 + }, + { + "epoch": 0.03480350176039585, + "grad_norm": 2.026190996170044, + "learning_rate": 4.985076363987279e-05, + "loss": 5.3693, + "step": 5852 + }, + { + "epoch": 0.034809449043676846, + "grad_norm": 2.045264482498169, + "learning_rate": 4.98507126740907e-05, + "loss": 5.6325, + "step": 5853 + }, + { + "epoch": 0.03481539632695785, + "grad_norm": 2.2710580825805664, + "learning_rate": 4.985066169963348e-05, + "loss": 5.8355, + "step": 5854 + }, + { + "epoch": 0.03482134361023884, + "grad_norm": 1.8813494443893433, + "learning_rate": 4.985061071650115e-05, + "loss": 5.5849, + "step": 5855 + }, + { + "epoch": 0.03482729089351984, + "grad_norm": 2.2177746295928955, + "learning_rate": 4.985055972469373e-05, + "loss": 5.5518, + "step": 5856 + }, + { + "epoch": 0.03483323817680084, + "grad_norm": 1.897653341293335, + "learning_rate": 4.9850508724211234e-05, + "loss": 5.6035, + "step": 5857 + }, + { + "epoch": 0.034839185460081835, + "grad_norm": 2.349821090698242, + "learning_rate": 4.985045771505369e-05, + "loss": 5.8181, + "step": 5858 + }, + { + "epoch": 0.03484513274336283, + "grad_norm": 1.900538682937622, + "learning_rate": 4.98504066972211e-05, + "loss": 5.2751, + "step": 5859 + }, + { + "epoch": 0.03485108002664383, + "grad_norm": 2.1902174949645996, + "learning_rate": 4.985035567071349e-05, + "loss": 5.2709, + "step": 5860 + }, + { + "epoch": 0.03485702730992483, + "grad_norm": 1.7833307981491089, + "learning_rate": 4.9850304635530884e-05, + "loss": 5.2104, + "step": 5861 + }, + { + "epoch": 0.03486297459320582, + "grad_norm": 2.017603874206543, + "learning_rate": 4.985025359167329e-05, + "loss": 5.2257, + "step": 5862 + }, + { + "epoch": 0.03486892187648682, + "grad_norm": 1.9828181266784668, + "learning_rate": 4.9850202539140724e-05, + "loss": 5.2303, + "step": 5863 + }, + { + "epoch": 0.03487486915976782, + "grad_norm": 2.0273706912994385, + "learning_rate": 4.9850151477933216e-05, + "loss": 5.1743, + "step": 5864 + }, + { + "epoch": 0.034880816443048815, + "grad_norm": 1.9634721279144287, + "learning_rate": 4.985010040805077e-05, + "loss": 5.1541, + "step": 5865 + }, + { + "epoch": 0.03488676372632981, + "grad_norm": 2.2766621112823486, + "learning_rate": 4.985004932949342e-05, + "loss": 5.1372, + "step": 5866 + }, + { + "epoch": 0.03489271100961081, + "grad_norm": 2.0768795013427734, + "learning_rate": 4.984999824226117e-05, + "loss": 5.2567, + "step": 5867 + }, + { + "epoch": 0.03489865829289181, + "grad_norm": 1.8665590286254883, + "learning_rate": 4.984994714635404e-05, + "loss": 5.1356, + "step": 5868 + }, + { + "epoch": 0.0349046055761728, + "grad_norm": 2.056450843811035, + "learning_rate": 4.984989604177205e-05, + "loss": 5.1667, + "step": 5869 + }, + { + "epoch": 0.034910552859453804, + "grad_norm": 2.1191976070404053, + "learning_rate": 4.984984492851522e-05, + "loss": 5.1898, + "step": 5870 + }, + { + "epoch": 0.0349165001427348, + "grad_norm": 2.049450397491455, + "learning_rate": 4.9849793806583566e-05, + "loss": 5.1568, + "step": 5871 + }, + { + "epoch": 0.034922447426015794, + "grad_norm": 1.79837167263031, + "learning_rate": 4.984974267597711e-05, + "loss": 5.1288, + "step": 5872 + }, + { + "epoch": 0.034928394709296796, + "grad_norm": 1.959088683128357, + "learning_rate": 4.984969153669585e-05, + "loss": 5.1063, + "step": 5873 + }, + { + "epoch": 0.03493434199257779, + "grad_norm": 1.9193873405456543, + "learning_rate": 4.9849640388739836e-05, + "loss": 5.1608, + "step": 5874 + }, + { + "epoch": 0.03494028927585879, + "grad_norm": 1.6684316396713257, + "learning_rate": 4.9849589232109065e-05, + "loss": 5.0926, + "step": 5875 + }, + { + "epoch": 0.03494623655913978, + "grad_norm": 1.8383700847625732, + "learning_rate": 4.984953806680356e-05, + "loss": 5.0474, + "step": 5876 + }, + { + "epoch": 0.034952183842420784, + "grad_norm": 2.233779191970825, + "learning_rate": 4.984948689282333e-05, + "loss": 5.5046, + "step": 5877 + }, + { + "epoch": 0.03495813112570178, + "grad_norm": 2.2267282009124756, + "learning_rate": 4.9849435710168415e-05, + "loss": 5.6235, + "step": 5878 + }, + { + "epoch": 0.034964078408982774, + "grad_norm": 1.7933586835861206, + "learning_rate": 4.9849384518838804e-05, + "loss": 5.0968, + "step": 5879 + }, + { + "epoch": 0.034970025692263776, + "grad_norm": 2.0050230026245117, + "learning_rate": 4.984933331883453e-05, + "loss": 4.9789, + "step": 5880 + }, + { + "epoch": 0.03497597297554477, + "grad_norm": 1.7422970533370972, + "learning_rate": 4.9849282110155627e-05, + "loss": 5.1556, + "step": 5881 + }, + { + "epoch": 0.034981920258825766, + "grad_norm": 2.1242151260375977, + "learning_rate": 4.984923089280209e-05, + "loss": 5.7039, + "step": 5882 + }, + { + "epoch": 0.03498786754210677, + "grad_norm": 1.8656666278839111, + "learning_rate": 4.9849179666773934e-05, + "loss": 5.7185, + "step": 5883 + }, + { + "epoch": 0.03499381482538776, + "grad_norm": 1.6954991817474365, + "learning_rate": 4.984912843207119e-05, + "loss": 5.5686, + "step": 5884 + }, + { + "epoch": 0.03499976210866876, + "grad_norm": 1.7692710161209106, + "learning_rate": 4.984907718869387e-05, + "loss": 5.4058, + "step": 5885 + }, + { + "epoch": 0.03500570939194976, + "grad_norm": 1.8496350049972534, + "learning_rate": 4.9849025936642004e-05, + "loss": 5.5037, + "step": 5886 + }, + { + "epoch": 0.035011656675230755, + "grad_norm": 2.0124640464782715, + "learning_rate": 4.984897467591559e-05, + "loss": 5.6146, + "step": 5887 + }, + { + "epoch": 0.03501760395851175, + "grad_norm": 2.5522549152374268, + "learning_rate": 4.984892340651466e-05, + "loss": 5.6403, + "step": 5888 + }, + { + "epoch": 0.03502355124179275, + "grad_norm": 2.2127344608306885, + "learning_rate": 4.9848872128439224e-05, + "loss": 5.6277, + "step": 5889 + }, + { + "epoch": 0.03502949852507375, + "grad_norm": 2.578322172164917, + "learning_rate": 4.9848820841689305e-05, + "loss": 5.849, + "step": 5890 + }, + { + "epoch": 0.03503544580835474, + "grad_norm": 1.8083957433700562, + "learning_rate": 4.9848769546264915e-05, + "loss": 5.4407, + "step": 5891 + }, + { + "epoch": 0.03504139309163574, + "grad_norm": 1.885387897491455, + "learning_rate": 4.984871824216609e-05, + "loss": 5.4486, + "step": 5892 + }, + { + "epoch": 0.03504734037491674, + "grad_norm": 1.9450737237930298, + "learning_rate": 4.9848666929392817e-05, + "loss": 5.4196, + "step": 5893 + }, + { + "epoch": 0.035053287658197735, + "grad_norm": 1.9072003364562988, + "learning_rate": 4.984861560794514e-05, + "loss": 5.6293, + "step": 5894 + }, + { + "epoch": 0.03505923494147873, + "grad_norm": 2.064192056655884, + "learning_rate": 4.984856427782307e-05, + "loss": 5.7105, + "step": 5895 + }, + { + "epoch": 0.03506518222475973, + "grad_norm": 2.0101802349090576, + "learning_rate": 4.984851293902663e-05, + "loss": 5.5623, + "step": 5896 + }, + { + "epoch": 0.03507112950804073, + "grad_norm": 1.9813642501831055, + "learning_rate": 4.984846159155581e-05, + "loss": 5.653, + "step": 5897 + }, + { + "epoch": 0.03507707679132172, + "grad_norm": 1.9213227033615112, + "learning_rate": 4.9848410235410666e-05, + "loss": 5.5194, + "step": 5898 + }, + { + "epoch": 0.035083024074602724, + "grad_norm": 1.803076982498169, + "learning_rate": 4.984835887059119e-05, + "loss": 5.4101, + "step": 5899 + }, + { + "epoch": 0.03508897135788372, + "grad_norm": 1.8419232368469238, + "learning_rate": 4.9848307497097414e-05, + "loss": 5.7329, + "step": 5900 + }, + { + "epoch": 0.035094918641164714, + "grad_norm": 1.9258531332015991, + "learning_rate": 4.984825611492935e-05, + "loss": 5.559, + "step": 5901 + }, + { + "epoch": 0.035100865924445716, + "grad_norm": 1.869529366493225, + "learning_rate": 4.984820472408701e-05, + "loss": 5.5682, + "step": 5902 + }, + { + "epoch": 0.03510681320772671, + "grad_norm": 1.753365159034729, + "learning_rate": 4.984815332457042e-05, + "loss": 5.6241, + "step": 5903 + }, + { + "epoch": 0.035112760491007707, + "grad_norm": 1.6581326723098755, + "learning_rate": 4.98481019163796e-05, + "loss": 5.4752, + "step": 5904 + }, + { + "epoch": 0.0351187077742887, + "grad_norm": 1.9120882749557495, + "learning_rate": 4.9848050499514565e-05, + "loss": 5.5678, + "step": 5905 + }, + { + "epoch": 0.035124655057569704, + "grad_norm": 1.9840329885482788, + "learning_rate": 4.984799907397533e-05, + "loss": 5.5369, + "step": 5906 + }, + { + "epoch": 0.0351306023408507, + "grad_norm": 1.7970712184906006, + "learning_rate": 4.9847947639761914e-05, + "loss": 5.5857, + "step": 5907 + }, + { + "epoch": 0.035136549624131694, + "grad_norm": 1.7219270467758179, + "learning_rate": 4.984789619687435e-05, + "loss": 5.609, + "step": 5908 + }, + { + "epoch": 0.035142496907412696, + "grad_norm": 1.8945105075836182, + "learning_rate": 4.984784474531262e-05, + "loss": 5.5893, + "step": 5909 + }, + { + "epoch": 0.03514844419069369, + "grad_norm": 1.8570127487182617, + "learning_rate": 4.984779328507678e-05, + "loss": 5.4556, + "step": 5910 + }, + { + "epoch": 0.035154391473974686, + "grad_norm": 1.9291017055511475, + "learning_rate": 4.984774181616683e-05, + "loss": 5.476, + "step": 5911 + }, + { + "epoch": 0.03516033875725569, + "grad_norm": 1.9138598442077637, + "learning_rate": 4.984769033858278e-05, + "loss": 5.6329, + "step": 5912 + }, + { + "epoch": 0.03516628604053668, + "grad_norm": 1.9484977722167969, + "learning_rate": 4.9847638852324665e-05, + "loss": 5.5305, + "step": 5913 + }, + { + "epoch": 0.03517223332381768, + "grad_norm": 1.7338584661483765, + "learning_rate": 4.984758735739249e-05, + "loss": 5.4842, + "step": 5914 + }, + { + "epoch": 0.03517818060709868, + "grad_norm": 1.8625437021255493, + "learning_rate": 4.984753585378629e-05, + "loss": 5.3696, + "step": 5915 + }, + { + "epoch": 0.035184127890379675, + "grad_norm": 1.798782229423523, + "learning_rate": 4.984748434150607e-05, + "loss": 5.5803, + "step": 5916 + }, + { + "epoch": 0.03519007517366067, + "grad_norm": 2.0596888065338135, + "learning_rate": 4.9847432820551845e-05, + "loss": 5.3274, + "step": 5917 + }, + { + "epoch": 0.03519602245694167, + "grad_norm": 2.0848498344421387, + "learning_rate": 4.984738129092364e-05, + "loss": 5.3334, + "step": 5918 + }, + { + "epoch": 0.03520196974022267, + "grad_norm": 2.000460386276245, + "learning_rate": 4.984732975262147e-05, + "loss": 5.4411, + "step": 5919 + }, + { + "epoch": 0.03520791702350366, + "grad_norm": 1.676957607269287, + "learning_rate": 4.9847278205645355e-05, + "loss": 5.47, + "step": 5920 + }, + { + "epoch": 0.03521386430678466, + "grad_norm": 1.911482334136963, + "learning_rate": 4.984722664999531e-05, + "loss": 5.5736, + "step": 5921 + }, + { + "epoch": 0.03521981159006566, + "grad_norm": 1.9573029279708862, + "learning_rate": 4.9847175085671356e-05, + "loss": 5.5509, + "step": 5922 + }, + { + "epoch": 0.035225758873346655, + "grad_norm": 1.8878334760665894, + "learning_rate": 4.984712351267351e-05, + "loss": 5.6437, + "step": 5923 + }, + { + "epoch": 0.03523170615662765, + "grad_norm": 1.9107712507247925, + "learning_rate": 4.984707193100179e-05, + "loss": 5.4471, + "step": 5924 + }, + { + "epoch": 0.03523765343990865, + "grad_norm": 1.7408612966537476, + "learning_rate": 4.9847020340656215e-05, + "loss": 5.3706, + "step": 5925 + }, + { + "epoch": 0.03524360072318965, + "grad_norm": 1.9594995975494385, + "learning_rate": 4.98469687416368e-05, + "loss": 5.4113, + "step": 5926 + }, + { + "epoch": 0.03524954800647064, + "grad_norm": 1.8772166967391968, + "learning_rate": 4.984691713394356e-05, + "loss": 5.368, + "step": 5927 + }, + { + "epoch": 0.035255495289751644, + "grad_norm": 2.1143953800201416, + "learning_rate": 4.9846865517576524e-05, + "loss": 5.3829, + "step": 5928 + }, + { + "epoch": 0.03526144257303264, + "grad_norm": 2.0923383235931396, + "learning_rate": 4.984681389253571e-05, + "loss": 5.9834, + "step": 5929 + }, + { + "epoch": 0.035267389856313634, + "grad_norm": 2.016749620437622, + "learning_rate": 4.984676225882112e-05, + "loss": 5.68, + "step": 5930 + }, + { + "epoch": 0.035273337139594636, + "grad_norm": 1.6040265560150146, + "learning_rate": 4.984671061643279e-05, + "loss": 5.7406, + "step": 5931 + }, + { + "epoch": 0.03527928442287563, + "grad_norm": 2.100774049758911, + "learning_rate": 4.984665896537072e-05, + "loss": 5.5545, + "step": 5932 + }, + { + "epoch": 0.035285231706156626, + "grad_norm": 2.008575439453125, + "learning_rate": 4.984660730563494e-05, + "loss": 5.3769, + "step": 5933 + }, + { + "epoch": 0.03529117898943762, + "grad_norm": 1.9622136354446411, + "learning_rate": 4.984655563722547e-05, + "loss": 5.5792, + "step": 5934 + }, + { + "epoch": 0.035297126272718624, + "grad_norm": 1.764647364616394, + "learning_rate": 4.9846503960142325e-05, + "loss": 5.6543, + "step": 5935 + }, + { + "epoch": 0.03530307355599962, + "grad_norm": 1.6166809797286987, + "learning_rate": 4.984645227438552e-05, + "loss": 5.7948, + "step": 5936 + }, + { + "epoch": 0.035309020839280614, + "grad_norm": 1.7368977069854736, + "learning_rate": 4.9846400579955074e-05, + "loss": 5.6288, + "step": 5937 + }, + { + "epoch": 0.035314968122561616, + "grad_norm": 1.649059772491455, + "learning_rate": 4.984634887685101e-05, + "loss": 5.8538, + "step": 5938 + }, + { + "epoch": 0.03532091540584261, + "grad_norm": 1.6092652082443237, + "learning_rate": 4.984629716507334e-05, + "loss": 5.7077, + "step": 5939 + }, + { + "epoch": 0.035326862689123606, + "grad_norm": 1.76821768283844, + "learning_rate": 4.984624544462209e-05, + "loss": 5.4206, + "step": 5940 + }, + { + "epoch": 0.03533280997240461, + "grad_norm": 1.5885004997253418, + "learning_rate": 4.984619371549727e-05, + "loss": 5.3997, + "step": 5941 + }, + { + "epoch": 0.0353387572556856, + "grad_norm": 1.6730574369430542, + "learning_rate": 4.984614197769889e-05, + "loss": 5.4952, + "step": 5942 + }, + { + "epoch": 0.0353447045389666, + "grad_norm": 1.9951595067977905, + "learning_rate": 4.984609023122699e-05, + "loss": 5.5658, + "step": 5943 + }, + { + "epoch": 0.0353506518222476, + "grad_norm": 1.8277794122695923, + "learning_rate": 4.984603847608157e-05, + "loss": 5.5313, + "step": 5944 + }, + { + "epoch": 0.035356599105528595, + "grad_norm": 1.5988150835037231, + "learning_rate": 4.984598671226266e-05, + "loss": 5.4661, + "step": 5945 + }, + { + "epoch": 0.03536254638880959, + "grad_norm": 1.8313721418380737, + "learning_rate": 4.9845934939770264e-05, + "loss": 5.3005, + "step": 5946 + }, + { + "epoch": 0.03536849367209059, + "grad_norm": 1.8441407680511475, + "learning_rate": 4.984588315860442e-05, + "loss": 5.4564, + "step": 5947 + }, + { + "epoch": 0.03537444095537159, + "grad_norm": 2.8165388107299805, + "learning_rate": 4.9845831368765126e-05, + "loss": 5.4582, + "step": 5948 + }, + { + "epoch": 0.03538038823865258, + "grad_norm": 1.8860023021697998, + "learning_rate": 4.9845779570252415e-05, + "loss": 5.4952, + "step": 5949 + }, + { + "epoch": 0.03538633552193358, + "grad_norm": 1.7752633094787598, + "learning_rate": 4.98457277630663e-05, + "loss": 5.4301, + "step": 5950 + }, + { + "epoch": 0.03539228280521458, + "grad_norm": 1.9038548469543457, + "learning_rate": 4.984567594720679e-05, + "loss": 5.2591, + "step": 5951 + }, + { + "epoch": 0.035398230088495575, + "grad_norm": 2.6449787616729736, + "learning_rate": 4.984562412267392e-05, + "loss": 5.9317, + "step": 5952 + }, + { + "epoch": 0.03540417737177657, + "grad_norm": 1.95949125289917, + "learning_rate": 4.98455722894677e-05, + "loss": 5.4686, + "step": 5953 + }, + { + "epoch": 0.03541012465505757, + "grad_norm": 2.0208640098571777, + "learning_rate": 4.984552044758814e-05, + "loss": 5.6361, + "step": 5954 + }, + { + "epoch": 0.03541607193833857, + "grad_norm": 2.2328197956085205, + "learning_rate": 4.9845468597035274e-05, + "loss": 5.455, + "step": 5955 + }, + { + "epoch": 0.03542201922161956, + "grad_norm": 2.115952968597412, + "learning_rate": 4.9845416737809105e-05, + "loss": 5.3275, + "step": 5956 + }, + { + "epoch": 0.035427966504900564, + "grad_norm": 2.023791790008545, + "learning_rate": 4.984536486990966e-05, + "loss": 5.3135, + "step": 5957 + }, + { + "epoch": 0.03543391378818156, + "grad_norm": 1.9721077680587769, + "learning_rate": 4.9845312993336945e-05, + "loss": 5.3429, + "step": 5958 + }, + { + "epoch": 0.035439861071462554, + "grad_norm": 2.047588586807251, + "learning_rate": 4.9845261108091e-05, + "loss": 5.4027, + "step": 5959 + }, + { + "epoch": 0.035445808354743556, + "grad_norm": 1.9019498825073242, + "learning_rate": 4.9845209214171826e-05, + "loss": 5.3867, + "step": 5960 + }, + { + "epoch": 0.03545175563802455, + "grad_norm": 1.9442843198776245, + "learning_rate": 4.984515731157945e-05, + "loss": 5.3189, + "step": 5961 + }, + { + "epoch": 0.035457702921305546, + "grad_norm": 2.051422357559204, + "learning_rate": 4.9845105400313885e-05, + "loss": 5.5713, + "step": 5962 + }, + { + "epoch": 0.03546365020458654, + "grad_norm": 1.811908483505249, + "learning_rate": 4.9845053480375145e-05, + "loss": 5.6221, + "step": 5963 + }, + { + "epoch": 0.035469597487867544, + "grad_norm": 2.017991542816162, + "learning_rate": 4.984500155176326e-05, + "loss": 5.2774, + "step": 5964 + }, + { + "epoch": 0.03547554477114854, + "grad_norm": 1.972644329071045, + "learning_rate": 4.9844949614478244e-05, + "loss": 5.3208, + "step": 5965 + }, + { + "epoch": 0.035481492054429534, + "grad_norm": 1.9937026500701904, + "learning_rate": 4.984489766852011e-05, + "loss": 5.455, + "step": 5966 + }, + { + "epoch": 0.035487439337710536, + "grad_norm": 1.7297019958496094, + "learning_rate": 4.984484571388887e-05, + "loss": 5.3829, + "step": 5967 + }, + { + "epoch": 0.03549338662099153, + "grad_norm": 1.6428204774856567, + "learning_rate": 4.984479375058456e-05, + "loss": 5.3638, + "step": 5968 + }, + { + "epoch": 0.035499333904272526, + "grad_norm": 1.9522719383239746, + "learning_rate": 4.9844741778607186e-05, + "loss": 5.3379, + "step": 5969 + }, + { + "epoch": 0.03550528118755353, + "grad_norm": 2.0280921459198, + "learning_rate": 4.984468979795677e-05, + "loss": 5.4366, + "step": 5970 + }, + { + "epoch": 0.03551122847083452, + "grad_norm": 2.0396251678466797, + "learning_rate": 4.9844637808633334e-05, + "loss": 5.5681, + "step": 5971 + }, + { + "epoch": 0.03551717575411552, + "grad_norm": 1.5256271362304688, + "learning_rate": 4.984458581063689e-05, + "loss": 5.602, + "step": 5972 + }, + { + "epoch": 0.03552312303739652, + "grad_norm": 1.8829892873764038, + "learning_rate": 4.984453380396745e-05, + "loss": 5.3851, + "step": 5973 + }, + { + "epoch": 0.035529070320677515, + "grad_norm": 2.047106981277466, + "learning_rate": 4.984448178862505e-05, + "loss": 5.3724, + "step": 5974 + }, + { + "epoch": 0.03553501760395851, + "grad_norm": 2.066572904586792, + "learning_rate": 4.984442976460969e-05, + "loss": 5.3352, + "step": 5975 + }, + { + "epoch": 0.03554096488723951, + "grad_norm": 1.9785430431365967, + "learning_rate": 4.98443777319214e-05, + "loss": 5.2641, + "step": 5976 + }, + { + "epoch": 0.03554691217052051, + "grad_norm": 1.8999443054199219, + "learning_rate": 4.98443256905602e-05, + "loss": 5.3402, + "step": 5977 + }, + { + "epoch": 0.0355528594538015, + "grad_norm": 1.8599263429641724, + "learning_rate": 4.98442736405261e-05, + "loss": 5.2612, + "step": 5978 + }, + { + "epoch": 0.0355588067370825, + "grad_norm": 1.7216875553131104, + "learning_rate": 4.984422158181911e-05, + "loss": 5.4041, + "step": 5979 + }, + { + "epoch": 0.0355647540203635, + "grad_norm": 2.0259687900543213, + "learning_rate": 4.984416951443926e-05, + "loss": 5.4895, + "step": 5980 + }, + { + "epoch": 0.035570701303644495, + "grad_norm": 1.705736756324768, + "learning_rate": 4.9844117438386583e-05, + "loss": 5.5845, + "step": 5981 + }, + { + "epoch": 0.03557664858692549, + "grad_norm": 1.9546462297439575, + "learning_rate": 4.9844065353661074e-05, + "loss": 5.6803, + "step": 5982 + }, + { + "epoch": 0.03558259587020649, + "grad_norm": 1.829689383506775, + "learning_rate": 4.984401326026275e-05, + "loss": 5.5816, + "step": 5983 + }, + { + "epoch": 0.03558854315348749, + "grad_norm": 1.6464663743972778, + "learning_rate": 4.984396115819164e-05, + "loss": 5.5738, + "step": 5984 + }, + { + "epoch": 0.03559449043676848, + "grad_norm": 1.7786076068878174, + "learning_rate": 4.984390904744777e-05, + "loss": 5.3667, + "step": 5985 + }, + { + "epoch": 0.035600437720049484, + "grad_norm": 2.210754871368408, + "learning_rate": 4.984385692803114e-05, + "loss": 5.5259, + "step": 5986 + }, + { + "epoch": 0.03560638500333048, + "grad_norm": 1.7361842393875122, + "learning_rate": 4.984380479994179e-05, + "loss": 5.6108, + "step": 5987 + }, + { + "epoch": 0.035612332286611474, + "grad_norm": 1.926477313041687, + "learning_rate": 4.9843752663179703e-05, + "loss": 5.593, + "step": 5988 + }, + { + "epoch": 0.035618279569892476, + "grad_norm": 1.6683733463287354, + "learning_rate": 4.984370051774493e-05, + "loss": 5.6305, + "step": 5989 + }, + { + "epoch": 0.03562422685317347, + "grad_norm": 1.790499210357666, + "learning_rate": 4.9843648363637475e-05, + "loss": 5.596, + "step": 5990 + }, + { + "epoch": 0.035630174136454466, + "grad_norm": 1.8355207443237305, + "learning_rate": 4.984359620085736e-05, + "loss": 5.5818, + "step": 5991 + }, + { + "epoch": 0.03563612141973546, + "grad_norm": 1.9352680444717407, + "learning_rate": 4.98435440294046e-05, + "loss": 5.187, + "step": 5992 + }, + { + "epoch": 0.03564206870301646, + "grad_norm": 2.063159465789795, + "learning_rate": 4.9843491849279225e-05, + "loss": 5.3245, + "step": 5993 + }, + { + "epoch": 0.03564801598629746, + "grad_norm": 1.6848958730697632, + "learning_rate": 4.984343966048123e-05, + "loss": 5.4454, + "step": 5994 + }, + { + "epoch": 0.035653963269578454, + "grad_norm": 2.1244423389434814, + "learning_rate": 4.9843387463010654e-05, + "loss": 5.5018, + "step": 5995 + }, + { + "epoch": 0.035659910552859456, + "grad_norm": 1.9100427627563477, + "learning_rate": 4.9843335256867505e-05, + "loss": 5.5597, + "step": 5996 + }, + { + "epoch": 0.03566585783614045, + "grad_norm": 1.9130252599716187, + "learning_rate": 4.984328304205181e-05, + "loss": 5.4538, + "step": 5997 + }, + { + "epoch": 0.035671805119421446, + "grad_norm": 1.6285213232040405, + "learning_rate": 4.984323081856358e-05, + "loss": 5.7361, + "step": 5998 + }, + { + "epoch": 0.03567775240270245, + "grad_norm": 1.6690980195999146, + "learning_rate": 4.984317858640283e-05, + "loss": 5.7537, + "step": 5999 + }, + { + "epoch": 0.03568369968598344, + "grad_norm": 1.5258572101593018, + "learning_rate": 4.984312634556959e-05, + "loss": 5.7419, + "step": 6000 + }, + { + "epoch": 0.03568964696926444, + "grad_norm": 1.9586881399154663, + "learning_rate": 4.984307409606386e-05, + "loss": 5.4449, + "step": 6001 + }, + { + "epoch": 0.03569559425254544, + "grad_norm": 2.1795685291290283, + "learning_rate": 4.9843021837885684e-05, + "loss": 5.3833, + "step": 6002 + }, + { + "epoch": 0.035701541535826435, + "grad_norm": 2.1241326332092285, + "learning_rate": 4.984296957103506e-05, + "loss": 5.3064, + "step": 6003 + }, + { + "epoch": 0.03570748881910743, + "grad_norm": 1.9621204137802124, + "learning_rate": 4.9842917295512004e-05, + "loss": 5.3002, + "step": 6004 + }, + { + "epoch": 0.03571343610238843, + "grad_norm": 2.041503429412842, + "learning_rate": 4.984286501131655e-05, + "loss": 5.2885, + "step": 6005 + }, + { + "epoch": 0.03571938338566943, + "grad_norm": 2.1099791526794434, + "learning_rate": 4.984281271844871e-05, + "loss": 5.3038, + "step": 6006 + }, + { + "epoch": 0.03572533066895042, + "grad_norm": 2.0209009647369385, + "learning_rate": 4.98427604169085e-05, + "loss": 5.8373, + "step": 6007 + }, + { + "epoch": 0.03573127795223142, + "grad_norm": 1.7534282207489014, + "learning_rate": 4.9842708106695934e-05, + "loss": 5.6522, + "step": 6008 + }, + { + "epoch": 0.03573722523551242, + "grad_norm": 2.3014237880706787, + "learning_rate": 4.984265578781104e-05, + "loss": 5.462, + "step": 6009 + }, + { + "epoch": 0.035743172518793415, + "grad_norm": 2.123767614364624, + "learning_rate": 4.984260346025382e-05, + "loss": 5.3901, + "step": 6010 + }, + { + "epoch": 0.03574911980207441, + "grad_norm": 2.4190175533294678, + "learning_rate": 4.9842551124024315e-05, + "loss": 5.1526, + "step": 6011 + }, + { + "epoch": 0.03575506708535541, + "grad_norm": 1.9972834587097168, + "learning_rate": 4.984249877912254e-05, + "loss": 5.2987, + "step": 6012 + }, + { + "epoch": 0.03576101436863641, + "grad_norm": 2.002969980239868, + "learning_rate": 4.9842446425548494e-05, + "loss": 5.5244, + "step": 6013 + }, + { + "epoch": 0.0357669616519174, + "grad_norm": 2.8208391666412354, + "learning_rate": 4.984239406330221e-05, + "loss": 5.834, + "step": 6014 + }, + { + "epoch": 0.035772908935198404, + "grad_norm": 2.409303665161133, + "learning_rate": 4.98423416923837e-05, + "loss": 5.1709, + "step": 6015 + }, + { + "epoch": 0.0357788562184794, + "grad_norm": 2.215888500213623, + "learning_rate": 4.984228931279298e-05, + "loss": 5.38, + "step": 6016 + }, + { + "epoch": 0.035784803501760394, + "grad_norm": 1.9130421876907349, + "learning_rate": 4.9842236924530086e-05, + "loss": 5.4551, + "step": 6017 + }, + { + "epoch": 0.035790750785041396, + "grad_norm": 1.8963314294815063, + "learning_rate": 4.9842184527595015e-05, + "loss": 5.3512, + "step": 6018 + }, + { + "epoch": 0.03579669806832239, + "grad_norm": 2.0085666179656982, + "learning_rate": 4.98421321219878e-05, + "loss": 5.3013, + "step": 6019 + }, + { + "epoch": 0.035802645351603386, + "grad_norm": 2.1059834957122803, + "learning_rate": 4.9842079707708446e-05, + "loss": 5.4052, + "step": 6020 + }, + { + "epoch": 0.03580859263488438, + "grad_norm": 1.965694785118103, + "learning_rate": 4.984202728475699e-05, + "loss": 5.5392, + "step": 6021 + }, + { + "epoch": 0.03581453991816538, + "grad_norm": 1.9495680332183838, + "learning_rate": 4.9841974853133425e-05, + "loss": 5.309, + "step": 6022 + }, + { + "epoch": 0.03582048720144638, + "grad_norm": 1.9762555360794067, + "learning_rate": 4.9841922412837795e-05, + "loss": 5.3979, + "step": 6023 + }, + { + "epoch": 0.035826434484727374, + "grad_norm": 1.7825839519500732, + "learning_rate": 4.98418699638701e-05, + "loss": 5.3502, + "step": 6024 + }, + { + "epoch": 0.035832381768008376, + "grad_norm": 1.9636192321777344, + "learning_rate": 4.984181750623037e-05, + "loss": 5.6341, + "step": 6025 + }, + { + "epoch": 0.03583832905128937, + "grad_norm": 1.833883285522461, + "learning_rate": 4.984176503991861e-05, + "loss": 5.5861, + "step": 6026 + }, + { + "epoch": 0.035844276334570366, + "grad_norm": 1.91568124294281, + "learning_rate": 4.984171256493485e-05, + "loss": 5.591, + "step": 6027 + }, + { + "epoch": 0.03585022361785137, + "grad_norm": 2.153472423553467, + "learning_rate": 4.9841660081279105e-05, + "loss": 5.3463, + "step": 6028 + }, + { + "epoch": 0.03585617090113236, + "grad_norm": 1.8164830207824707, + "learning_rate": 4.984160758895139e-05, + "loss": 5.4886, + "step": 6029 + }, + { + "epoch": 0.03586211818441336, + "grad_norm": 2.0216922760009766, + "learning_rate": 4.984155508795174e-05, + "loss": 5.5777, + "step": 6030 + }, + { + "epoch": 0.03586806546769436, + "grad_norm": 1.966779351234436, + "learning_rate": 4.984150257828014e-05, + "loss": 5.1867, + "step": 6031 + }, + { + "epoch": 0.035874012750975355, + "grad_norm": 2.091109275817871, + "learning_rate": 4.9841450059936645e-05, + "loss": 5.5302, + "step": 6032 + }, + { + "epoch": 0.03587996003425635, + "grad_norm": 1.8772802352905273, + "learning_rate": 4.984139753292125e-05, + "loss": 5.2904, + "step": 6033 + }, + { + "epoch": 0.03588590731753735, + "grad_norm": 2.049431800842285, + "learning_rate": 4.984134499723397e-05, + "loss": 5.293, + "step": 6034 + }, + { + "epoch": 0.03589185460081835, + "grad_norm": 2.0902609825134277, + "learning_rate": 4.984129245287485e-05, + "loss": 5.2689, + "step": 6035 + }, + { + "epoch": 0.03589780188409934, + "grad_norm": 1.91702139377594, + "learning_rate": 4.9841239899843886e-05, + "loss": 5.255, + "step": 6036 + }, + { + "epoch": 0.03590374916738034, + "grad_norm": 1.7073708772659302, + "learning_rate": 4.984118733814109e-05, + "loss": 5.3272, + "step": 6037 + }, + { + "epoch": 0.03590969645066134, + "grad_norm": 1.625712275505066, + "learning_rate": 4.9841134767766506e-05, + "loss": 5.5366, + "step": 6038 + }, + { + "epoch": 0.035915643733942335, + "grad_norm": 1.8465087413787842, + "learning_rate": 4.984108218872014e-05, + "loss": 5.3373, + "step": 6039 + }, + { + "epoch": 0.03592159101722333, + "grad_norm": 2.2392280101776123, + "learning_rate": 4.9841029601002e-05, + "loss": 5.5898, + "step": 6040 + }, + { + "epoch": 0.03592753830050433, + "grad_norm": 2.6571459770202637, + "learning_rate": 4.984097700461212e-05, + "loss": 5.963, + "step": 6041 + }, + { + "epoch": 0.03593348558378533, + "grad_norm": 2.7220845222473145, + "learning_rate": 4.98409243995505e-05, + "loss": 5.6997, + "step": 6042 + }, + { + "epoch": 0.03593943286706632, + "grad_norm": 2.430968999862671, + "learning_rate": 4.9840871785817185e-05, + "loss": 5.2949, + "step": 6043 + }, + { + "epoch": 0.035945380150347324, + "grad_norm": 2.3006606101989746, + "learning_rate": 4.984081916341217e-05, + "loss": 5.2045, + "step": 6044 + }, + { + "epoch": 0.03595132743362832, + "grad_norm": 2.2382659912109375, + "learning_rate": 4.984076653233548e-05, + "loss": 5.417, + "step": 6045 + }, + { + "epoch": 0.035957274716909314, + "grad_norm": 2.1896233558654785, + "learning_rate": 4.9840713892587146e-05, + "loss": 5.7215, + "step": 6046 + }, + { + "epoch": 0.035963222000190316, + "grad_norm": 1.8175956010818481, + "learning_rate": 4.9840661244167166e-05, + "loss": 5.569, + "step": 6047 + }, + { + "epoch": 0.03596916928347131, + "grad_norm": 2.066828727722168, + "learning_rate": 4.984060858707557e-05, + "loss": 5.6285, + "step": 6048 + }, + { + "epoch": 0.035975116566752306, + "grad_norm": 2.246291160583496, + "learning_rate": 4.984055592131237e-05, + "loss": 5.5583, + "step": 6049 + }, + { + "epoch": 0.0359810638500333, + "grad_norm": 2.2394871711730957, + "learning_rate": 4.984050324687759e-05, + "loss": 5.3917, + "step": 6050 + }, + { + "epoch": 0.0359870111333143, + "grad_norm": 2.5051162242889404, + "learning_rate": 4.984045056377125e-05, + "loss": 5.6955, + "step": 6051 + }, + { + "epoch": 0.0359929584165953, + "grad_norm": 2.1360414028167725, + "learning_rate": 4.984039787199336e-05, + "loss": 5.5451, + "step": 6052 + }, + { + "epoch": 0.035998905699876294, + "grad_norm": 2.0267562866210938, + "learning_rate": 4.984034517154395e-05, + "loss": 5.4559, + "step": 6053 + }, + { + "epoch": 0.036004852983157296, + "grad_norm": 1.7683112621307373, + "learning_rate": 4.984029246242303e-05, + "loss": 5.4663, + "step": 6054 + }, + { + "epoch": 0.03601080026643829, + "grad_norm": 2.0600638389587402, + "learning_rate": 4.9840239744630626e-05, + "loss": 5.5081, + "step": 6055 + }, + { + "epoch": 0.036016747549719286, + "grad_norm": 2.093698740005493, + "learning_rate": 4.984018701816674e-05, + "loss": 5.5435, + "step": 6056 + }, + { + "epoch": 0.03602269483300029, + "grad_norm": 2.217721462249756, + "learning_rate": 4.984013428303141e-05, + "loss": 5.7482, + "step": 6057 + }, + { + "epoch": 0.03602864211628128, + "grad_norm": 1.9680962562561035, + "learning_rate": 4.9840081539224636e-05, + "loss": 5.9722, + "step": 6058 + }, + { + "epoch": 0.03603458939956228, + "grad_norm": 1.8606425523757935, + "learning_rate": 4.9840028786746455e-05, + "loss": 5.8379, + "step": 6059 + }, + { + "epoch": 0.03604053668284328, + "grad_norm": 2.0129475593566895, + "learning_rate": 4.983997602559688e-05, + "loss": 5.7199, + "step": 6060 + }, + { + "epoch": 0.036046483966124275, + "grad_norm": 1.9370187520980835, + "learning_rate": 4.9839923255775917e-05, + "loss": 5.3563, + "step": 6061 + }, + { + "epoch": 0.03605243124940527, + "grad_norm": 1.775894284248352, + "learning_rate": 4.983987047728359e-05, + "loss": 5.5201, + "step": 6062 + }, + { + "epoch": 0.03605837853268627, + "grad_norm": 1.9943023920059204, + "learning_rate": 4.9839817690119934e-05, + "loss": 5.4034, + "step": 6063 + }, + { + "epoch": 0.03606432581596727, + "grad_norm": 1.9605768918991089, + "learning_rate": 4.983976489428494e-05, + "loss": 5.5314, + "step": 6064 + }, + { + "epoch": 0.03607027309924826, + "grad_norm": 1.7820254564285278, + "learning_rate": 4.983971208977866e-05, + "loss": 5.6131, + "step": 6065 + }, + { + "epoch": 0.03607622038252926, + "grad_norm": 2.010796070098877, + "learning_rate": 4.983965927660108e-05, + "loss": 5.5114, + "step": 6066 + }, + { + "epoch": 0.03608216766581026, + "grad_norm": 1.8461687564849854, + "learning_rate": 4.983960645475223e-05, + "loss": 5.4752, + "step": 6067 + }, + { + "epoch": 0.036088114949091255, + "grad_norm": 2.048119068145752, + "learning_rate": 4.983955362423214e-05, + "loss": 5.3325, + "step": 6068 + }, + { + "epoch": 0.03609406223237225, + "grad_norm": 2.021646499633789, + "learning_rate": 4.9839500785040804e-05, + "loss": 5.2238, + "step": 6069 + }, + { + "epoch": 0.03610000951565325, + "grad_norm": 1.9979503154754639, + "learning_rate": 4.9839447937178264e-05, + "loss": 5.4054, + "step": 6070 + }, + { + "epoch": 0.03610595679893425, + "grad_norm": 1.980776071548462, + "learning_rate": 4.983939508064453e-05, + "loss": 5.4094, + "step": 6071 + }, + { + "epoch": 0.03611190408221524, + "grad_norm": 1.8364293575286865, + "learning_rate": 4.9839342215439615e-05, + "loss": 5.4372, + "step": 6072 + }, + { + "epoch": 0.036117851365496244, + "grad_norm": 1.8870443105697632, + "learning_rate": 4.983928934156354e-05, + "loss": 5.4075, + "step": 6073 + }, + { + "epoch": 0.03612379864877724, + "grad_norm": 2.176180124282837, + "learning_rate": 4.9839236459016337e-05, + "loss": 5.4302, + "step": 6074 + }, + { + "epoch": 0.036129745932058234, + "grad_norm": 2.054960012435913, + "learning_rate": 4.983918356779801e-05, + "loss": 5.3796, + "step": 6075 + }, + { + "epoch": 0.036135693215339236, + "grad_norm": 2.2146401405334473, + "learning_rate": 4.9839130667908576e-05, + "loss": 5.651, + "step": 6076 + }, + { + "epoch": 0.03614164049862023, + "grad_norm": 1.908640742301941, + "learning_rate": 4.983907775934806e-05, + "loss": 5.3002, + "step": 6077 + }, + { + "epoch": 0.036147587781901226, + "grad_norm": 1.9364973306655884, + "learning_rate": 4.983902484211648e-05, + "loss": 5.2299, + "step": 6078 + }, + { + "epoch": 0.03615353506518223, + "grad_norm": 1.7405542135238647, + "learning_rate": 4.983897191621385e-05, + "loss": 5.268, + "step": 6079 + }, + { + "epoch": 0.03615948234846322, + "grad_norm": 2.0347912311553955, + "learning_rate": 4.9838918981640195e-05, + "loss": 5.4887, + "step": 6080 + }, + { + "epoch": 0.03616542963174422, + "grad_norm": 2.0755162239074707, + "learning_rate": 4.9838866038395524e-05, + "loss": 5.2208, + "step": 6081 + }, + { + "epoch": 0.03617137691502521, + "grad_norm": 1.9119634628295898, + "learning_rate": 4.9838813086479865e-05, + "loss": 5.2659, + "step": 6082 + }, + { + "epoch": 0.036177324198306215, + "grad_norm": 1.9172658920288086, + "learning_rate": 4.983876012589324e-05, + "loss": 5.4098, + "step": 6083 + }, + { + "epoch": 0.03618327148158721, + "grad_norm": 2.09004545211792, + "learning_rate": 4.983870715663565e-05, + "loss": 5.5866, + "step": 6084 + }, + { + "epoch": 0.036189218764868206, + "grad_norm": 2.0952436923980713, + "learning_rate": 4.983865417870712e-05, + "loss": 5.5288, + "step": 6085 + }, + { + "epoch": 0.03619516604814921, + "grad_norm": 1.8599412441253662, + "learning_rate": 4.9838601192107686e-05, + "loss": 5.7538, + "step": 6086 + }, + { + "epoch": 0.0362011133314302, + "grad_norm": 1.8318936824798584, + "learning_rate": 4.983854819683735e-05, + "loss": 5.9613, + "step": 6087 + }, + { + "epoch": 0.0362070606147112, + "grad_norm": 1.8312503099441528, + "learning_rate": 4.983849519289613e-05, + "loss": 5.2749, + "step": 6088 + }, + { + "epoch": 0.0362130078979922, + "grad_norm": 2.157576560974121, + "learning_rate": 4.983844218028405e-05, + "loss": 5.2826, + "step": 6089 + }, + { + "epoch": 0.036218955181273195, + "grad_norm": 2.1377198696136475, + "learning_rate": 4.983838915900112e-05, + "loss": 5.2843, + "step": 6090 + }, + { + "epoch": 0.03622490246455419, + "grad_norm": 2.0167126655578613, + "learning_rate": 4.983833612904737e-05, + "loss": 5.4713, + "step": 6091 + }, + { + "epoch": 0.03623084974783519, + "grad_norm": 1.748759388923645, + "learning_rate": 4.9838283090422814e-05, + "loss": 5.3685, + "step": 6092 + }, + { + "epoch": 0.03623679703111619, + "grad_norm": 2.0344316959381104, + "learning_rate": 4.983823004312747e-05, + "loss": 5.1093, + "step": 6093 + }, + { + "epoch": 0.03624274431439718, + "grad_norm": 1.9061161279678345, + "learning_rate": 4.9838176987161356e-05, + "loss": 5.2035, + "step": 6094 + }, + { + "epoch": 0.03624869159767818, + "grad_norm": 1.9090344905853271, + "learning_rate": 4.983812392252449e-05, + "loss": 5.3863, + "step": 6095 + }, + { + "epoch": 0.03625463888095918, + "grad_norm": 1.9536118507385254, + "learning_rate": 4.9838070849216894e-05, + "loss": 5.5349, + "step": 6096 + }, + { + "epoch": 0.036260586164240174, + "grad_norm": 1.89446222782135, + "learning_rate": 4.983801776723858e-05, + "loss": 5.7098, + "step": 6097 + }, + { + "epoch": 0.03626653344752117, + "grad_norm": 1.6403870582580566, + "learning_rate": 4.983796467658958e-05, + "loss": 5.6726, + "step": 6098 + }, + { + "epoch": 0.03627248073080217, + "grad_norm": 1.7792481184005737, + "learning_rate": 4.983791157726989e-05, + "loss": 5.6761, + "step": 6099 + }, + { + "epoch": 0.03627842801408317, + "grad_norm": 1.5190175771713257, + "learning_rate": 4.9837858469279554e-05, + "loss": 5.6576, + "step": 6100 + }, + { + "epoch": 0.03628437529736416, + "grad_norm": 1.9885895252227783, + "learning_rate": 4.983780535261857e-05, + "loss": 5.5944, + "step": 6101 + }, + { + "epoch": 0.036290322580645164, + "grad_norm": 1.771620750427246, + "learning_rate": 4.983775222728697e-05, + "loss": 5.7949, + "step": 6102 + }, + { + "epoch": 0.03629626986392616, + "grad_norm": 1.684471845626831, + "learning_rate": 4.9837699093284765e-05, + "loss": 5.5435, + "step": 6103 + }, + { + "epoch": 0.036302217147207154, + "grad_norm": 1.8454065322875977, + "learning_rate": 4.9837645950611966e-05, + "loss": 5.4526, + "step": 6104 + }, + { + "epoch": 0.036308164430488156, + "grad_norm": 1.6522735357284546, + "learning_rate": 4.983759279926862e-05, + "loss": 5.7302, + "step": 6105 + }, + { + "epoch": 0.03631411171376915, + "grad_norm": 1.8691065311431885, + "learning_rate": 4.9837539639254713e-05, + "loss": 5.6494, + "step": 6106 + }, + { + "epoch": 0.036320058997050146, + "grad_norm": 1.9420015811920166, + "learning_rate": 4.9837486470570286e-05, + "loss": 5.77, + "step": 6107 + }, + { + "epoch": 0.03632600628033115, + "grad_norm": 1.8399784564971924, + "learning_rate": 4.9837433293215344e-05, + "loss": 5.6669, + "step": 6108 + }, + { + "epoch": 0.03633195356361214, + "grad_norm": 1.799460530281067, + "learning_rate": 4.983738010718991e-05, + "loss": 5.5557, + "step": 6109 + }, + { + "epoch": 0.03633790084689314, + "grad_norm": 1.8826879262924194, + "learning_rate": 4.9837326912494e-05, + "loss": 5.4865, + "step": 6110 + }, + { + "epoch": 0.03634384813017413, + "grad_norm": 1.9582240581512451, + "learning_rate": 4.983727370912764e-05, + "loss": 5.5882, + "step": 6111 + }, + { + "epoch": 0.036349795413455135, + "grad_norm": 2.011892795562744, + "learning_rate": 4.9837220497090846e-05, + "loss": 5.4932, + "step": 6112 + }, + { + "epoch": 0.03635574269673613, + "grad_norm": 1.7751367092132568, + "learning_rate": 4.983716727638363e-05, + "loss": 5.4981, + "step": 6113 + }, + { + "epoch": 0.036361689980017126, + "grad_norm": 1.984121322631836, + "learning_rate": 4.983711404700603e-05, + "loss": 5.4801, + "step": 6114 + }, + { + "epoch": 0.03636763726329813, + "grad_norm": 1.9601882696151733, + "learning_rate": 4.983706080895804e-05, + "loss": 5.218, + "step": 6115 + }, + { + "epoch": 0.03637358454657912, + "grad_norm": 1.800227165222168, + "learning_rate": 4.9837007562239684e-05, + "loss": 5.5178, + "step": 6116 + }, + { + "epoch": 0.03637953182986012, + "grad_norm": 1.9257889986038208, + "learning_rate": 4.983695430685099e-05, + "loss": 5.6695, + "step": 6117 + }, + { + "epoch": 0.03638547911314112, + "grad_norm": 1.8011913299560547, + "learning_rate": 4.9836901042791976e-05, + "loss": 5.7478, + "step": 6118 + }, + { + "epoch": 0.036391426396422115, + "grad_norm": 1.8668690919876099, + "learning_rate": 4.983684777006264e-05, + "loss": 5.7027, + "step": 6119 + }, + { + "epoch": 0.03639737367970311, + "grad_norm": 1.898126244544983, + "learning_rate": 4.983679448866304e-05, + "loss": 5.5206, + "step": 6120 + }, + { + "epoch": 0.03640332096298411, + "grad_norm": 1.8264409303665161, + "learning_rate": 4.983674119859316e-05, + "loss": 5.4686, + "step": 6121 + }, + { + "epoch": 0.03640926824626511, + "grad_norm": 1.8090230226516724, + "learning_rate": 4.983668789985303e-05, + "loss": 5.4761, + "step": 6122 + }, + { + "epoch": 0.0364152155295461, + "grad_norm": 1.8193403482437134, + "learning_rate": 4.983663459244266e-05, + "loss": 5.3443, + "step": 6123 + }, + { + "epoch": 0.0364211628128271, + "grad_norm": 1.8199255466461182, + "learning_rate": 4.9836581276362095e-05, + "loss": 5.427, + "step": 6124 + }, + { + "epoch": 0.0364271100961081, + "grad_norm": 1.72145414352417, + "learning_rate": 4.9836527951611325e-05, + "loss": 5.4372, + "step": 6125 + }, + { + "epoch": 0.036433057379389094, + "grad_norm": 1.8164423704147339, + "learning_rate": 4.9836474618190386e-05, + "loss": 5.4702, + "step": 6126 + }, + { + "epoch": 0.03643900466267009, + "grad_norm": 1.897775650024414, + "learning_rate": 4.9836421276099287e-05, + "loss": 5.4259, + "step": 6127 + }, + { + "epoch": 0.03644495194595109, + "grad_norm": 1.851101279258728, + "learning_rate": 4.9836367925338046e-05, + "loss": 5.3837, + "step": 6128 + }, + { + "epoch": 0.03645089922923209, + "grad_norm": 1.749374508857727, + "learning_rate": 4.98363145659067e-05, + "loss": 5.3232, + "step": 6129 + }, + { + "epoch": 0.03645684651251308, + "grad_norm": 1.95986008644104, + "learning_rate": 4.9836261197805235e-05, + "loss": 5.2692, + "step": 6130 + }, + { + "epoch": 0.036462793795794084, + "grad_norm": 1.7947750091552734, + "learning_rate": 4.98362078210337e-05, + "loss": 5.409, + "step": 6131 + }, + { + "epoch": 0.03646874107907508, + "grad_norm": 2.119044303894043, + "learning_rate": 4.983615443559209e-05, + "loss": 5.5924, + "step": 6132 + }, + { + "epoch": 0.036474688362356074, + "grad_norm": 1.7285267114639282, + "learning_rate": 4.983610104148044e-05, + "loss": 5.6955, + "step": 6133 + }, + { + "epoch": 0.036480635645637076, + "grad_norm": 2.1711652278900146, + "learning_rate": 4.983604763869877e-05, + "loss": 5.1941, + "step": 6134 + }, + { + "epoch": 0.03648658292891807, + "grad_norm": 2.060039758682251, + "learning_rate": 4.983599422724709e-05, + "loss": 5.5131, + "step": 6135 + }, + { + "epoch": 0.036492530212199066, + "grad_norm": 1.6212393045425415, + "learning_rate": 4.9835940807125415e-05, + "loss": 5.4856, + "step": 6136 + }, + { + "epoch": 0.03649847749548007, + "grad_norm": 1.7602918148040771, + "learning_rate": 4.983588737833378e-05, + "loss": 5.4177, + "step": 6137 + }, + { + "epoch": 0.03650442477876106, + "grad_norm": 2.660930633544922, + "learning_rate": 4.983583394087218e-05, + "loss": 5.5879, + "step": 6138 + }, + { + "epoch": 0.03651037206204206, + "grad_norm": 2.3608336448669434, + "learning_rate": 4.9835780494740655e-05, + "loss": 5.3894, + "step": 6139 + }, + { + "epoch": 0.03651631934532305, + "grad_norm": 2.071632146835327, + "learning_rate": 4.983572703993922e-05, + "loss": 5.6185, + "step": 6140 + }, + { + "epoch": 0.036522266628604055, + "grad_norm": 1.7023842334747314, + "learning_rate": 4.983567357646788e-05, + "loss": 5.5648, + "step": 6141 + }, + { + "epoch": 0.03652821391188505, + "grad_norm": 2.2168798446655273, + "learning_rate": 4.983562010432667e-05, + "loss": 5.4578, + "step": 6142 + }, + { + "epoch": 0.036534161195166046, + "grad_norm": 2.0916104316711426, + "learning_rate": 4.98355666235156e-05, + "loss": 5.4977, + "step": 6143 + }, + { + "epoch": 0.03654010847844705, + "grad_norm": 1.7101606130599976, + "learning_rate": 4.9835513134034686e-05, + "loss": 5.4081, + "step": 6144 + }, + { + "epoch": 0.03654605576172804, + "grad_norm": 1.9058302640914917, + "learning_rate": 4.983545963588395e-05, + "loss": 5.2145, + "step": 6145 + }, + { + "epoch": 0.03655200304500904, + "grad_norm": 2.319023847579956, + "learning_rate": 4.9835406129063424e-05, + "loss": 5.3023, + "step": 6146 + }, + { + "epoch": 0.03655795032829004, + "grad_norm": 2.1135916709899902, + "learning_rate": 4.98353526135731e-05, + "loss": 5.4796, + "step": 6147 + }, + { + "epoch": 0.036563897611571035, + "grad_norm": 2.409088373184204, + "learning_rate": 4.983529908941302e-05, + "loss": 5.3124, + "step": 6148 + }, + { + "epoch": 0.03656984489485203, + "grad_norm": 1.8679871559143066, + "learning_rate": 4.9835245556583185e-05, + "loss": 5.3741, + "step": 6149 + }, + { + "epoch": 0.03657579217813303, + "grad_norm": 1.9335602521896362, + "learning_rate": 4.983519201508363e-05, + "loss": 5.3231, + "step": 6150 + }, + { + "epoch": 0.03658173946141403, + "grad_norm": 2.0352535247802734, + "learning_rate": 4.9835138464914366e-05, + "loss": 5.4643, + "step": 6151 + }, + { + "epoch": 0.03658768674469502, + "grad_norm": 2.4156594276428223, + "learning_rate": 4.983508490607541e-05, + "loss": 5.4092, + "step": 6152 + }, + { + "epoch": 0.03659363402797602, + "grad_norm": 2.1936473846435547, + "learning_rate": 4.983503133856678e-05, + "loss": 5.5093, + "step": 6153 + }, + { + "epoch": 0.03659958131125702, + "grad_norm": 1.6346958875656128, + "learning_rate": 4.98349777623885e-05, + "loss": 5.512, + "step": 6154 + }, + { + "epoch": 0.036605528594538014, + "grad_norm": 1.9810141324996948, + "learning_rate": 4.9834924177540584e-05, + "loss": 5.4981, + "step": 6155 + }, + { + "epoch": 0.03661147587781901, + "grad_norm": 2.1253950595855713, + "learning_rate": 4.9834870584023055e-05, + "loss": 5.4022, + "step": 6156 + }, + { + "epoch": 0.03661742316110001, + "grad_norm": 2.011754274368286, + "learning_rate": 4.9834816981835926e-05, + "loss": 5.6107, + "step": 6157 + }, + { + "epoch": 0.036623370444381007, + "grad_norm": 2.210934638977051, + "learning_rate": 4.983476337097922e-05, + "loss": 5.4348, + "step": 6158 + }, + { + "epoch": 0.036629317727662, + "grad_norm": 2.1351871490478516, + "learning_rate": 4.983470975145296e-05, + "loss": 5.2022, + "step": 6159 + }, + { + "epoch": 0.036635265010943004, + "grad_norm": 2.1564714908599854, + "learning_rate": 4.983465612325715e-05, + "loss": 5.3583, + "step": 6160 + }, + { + "epoch": 0.036641212294224, + "grad_norm": 1.9411755800247192, + "learning_rate": 4.983460248639182e-05, + "loss": 5.4643, + "step": 6161 + }, + { + "epoch": 0.036647159577504994, + "grad_norm": 2.129741907119751, + "learning_rate": 4.983454884085699e-05, + "loss": 5.3834, + "step": 6162 + }, + { + "epoch": 0.036653106860785996, + "grad_norm": 2.12172269821167, + "learning_rate": 4.983449518665268e-05, + "loss": 5.4418, + "step": 6163 + }, + { + "epoch": 0.03665905414406699, + "grad_norm": 2.097452163696289, + "learning_rate": 4.9834441523778893e-05, + "loss": 5.3741, + "step": 6164 + }, + { + "epoch": 0.036665001427347986, + "grad_norm": 2.0458765029907227, + "learning_rate": 4.983438785223567e-05, + "loss": 5.373, + "step": 6165 + }, + { + "epoch": 0.03667094871062899, + "grad_norm": 1.9431376457214355, + "learning_rate": 4.983433417202301e-05, + "loss": 5.4003, + "step": 6166 + }, + { + "epoch": 0.03667689599390998, + "grad_norm": 2.136819362640381, + "learning_rate": 4.983428048314095e-05, + "loss": 5.503, + "step": 6167 + }, + { + "epoch": 0.03668284327719098, + "grad_norm": 1.863153338432312, + "learning_rate": 4.983422678558949e-05, + "loss": 5.4357, + "step": 6168 + }, + { + "epoch": 0.03668879056047197, + "grad_norm": 1.9198437929153442, + "learning_rate": 4.9834173079368665e-05, + "loss": 5.4304, + "step": 6169 + }, + { + "epoch": 0.036694737843752975, + "grad_norm": 1.9080480337142944, + "learning_rate": 4.9834119364478484e-05, + "loss": 5.4329, + "step": 6170 + }, + { + "epoch": 0.03670068512703397, + "grad_norm": 1.9116952419281006, + "learning_rate": 4.983406564091897e-05, + "loss": 5.3248, + "step": 6171 + }, + { + "epoch": 0.036706632410314965, + "grad_norm": 2.007685661315918, + "learning_rate": 4.983401190869014e-05, + "loss": 5.3554, + "step": 6172 + }, + { + "epoch": 0.03671257969359597, + "grad_norm": 1.8134535551071167, + "learning_rate": 4.983395816779201e-05, + "loss": 5.2907, + "step": 6173 + }, + { + "epoch": 0.03671852697687696, + "grad_norm": 2.093061685562134, + "learning_rate": 4.9833904418224606e-05, + "loss": 5.4055, + "step": 6174 + }, + { + "epoch": 0.03672447426015796, + "grad_norm": 2.1263599395751953, + "learning_rate": 4.9833850659987934e-05, + "loss": 5.2758, + "step": 6175 + }, + { + "epoch": 0.03673042154343896, + "grad_norm": 1.9442895650863647, + "learning_rate": 4.983379689308203e-05, + "loss": 5.4183, + "step": 6176 + }, + { + "epoch": 0.036736368826719955, + "grad_norm": 1.9587830305099487, + "learning_rate": 4.98337431175069e-05, + "loss": 5.3624, + "step": 6177 + }, + { + "epoch": 0.03674231611000095, + "grad_norm": 1.9845789670944214, + "learning_rate": 4.9833689333262565e-05, + "loss": 5.3933, + "step": 6178 + }, + { + "epoch": 0.03674826339328195, + "grad_norm": 1.9748643636703491, + "learning_rate": 4.9833635540349055e-05, + "loss": 5.5221, + "step": 6179 + }, + { + "epoch": 0.03675421067656295, + "grad_norm": 1.8139559030532837, + "learning_rate": 4.983358173876638e-05, + "loss": 5.5524, + "step": 6180 + }, + { + "epoch": 0.03676015795984394, + "grad_norm": 1.93784499168396, + "learning_rate": 4.9833527928514546e-05, + "loss": 5.7145, + "step": 6181 + }, + { + "epoch": 0.03676610524312494, + "grad_norm": 1.9064222574234009, + "learning_rate": 4.9833474109593594e-05, + "loss": 5.5283, + "step": 6182 + }, + { + "epoch": 0.03677205252640594, + "grad_norm": 1.7044670581817627, + "learning_rate": 4.9833420282003524e-05, + "loss": 5.2877, + "step": 6183 + }, + { + "epoch": 0.036777999809686934, + "grad_norm": 1.8328427076339722, + "learning_rate": 4.983336644574437e-05, + "loss": 5.5019, + "step": 6184 + }, + { + "epoch": 0.03678394709296793, + "grad_norm": 1.600780725479126, + "learning_rate": 4.983331260081614e-05, + "loss": 5.5347, + "step": 6185 + }, + { + "epoch": 0.03678989437624893, + "grad_norm": 1.8333978652954102, + "learning_rate": 4.983325874721886e-05, + "loss": 5.5127, + "step": 6186 + }, + { + "epoch": 0.036795841659529926, + "grad_norm": 1.8825682401657104, + "learning_rate": 4.9833204884952546e-05, + "loss": 5.5338, + "step": 6187 + }, + { + "epoch": 0.03680178894281092, + "grad_norm": 1.6875951290130615, + "learning_rate": 4.983315101401721e-05, + "loss": 5.2465, + "step": 6188 + }, + { + "epoch": 0.036807736226091924, + "grad_norm": 1.6224017143249512, + "learning_rate": 4.983309713441289e-05, + "loss": 5.4741, + "step": 6189 + }, + { + "epoch": 0.03681368350937292, + "grad_norm": 1.991721272468567, + "learning_rate": 4.983304324613958e-05, + "loss": 5.4547, + "step": 6190 + }, + { + "epoch": 0.036819630792653914, + "grad_norm": 1.843961238861084, + "learning_rate": 4.983298934919732e-05, + "loss": 5.3262, + "step": 6191 + }, + { + "epoch": 0.036825578075934916, + "grad_norm": 1.8342533111572266, + "learning_rate": 4.983293544358612e-05, + "loss": 5.6808, + "step": 6192 + }, + { + "epoch": 0.03683152535921591, + "grad_norm": 1.8796159029006958, + "learning_rate": 4.983288152930599e-05, + "loss": 5.5454, + "step": 6193 + }, + { + "epoch": 0.036837472642496906, + "grad_norm": 1.9033316373825073, + "learning_rate": 4.983282760635696e-05, + "loss": 5.3566, + "step": 6194 + }, + { + "epoch": 0.03684341992577791, + "grad_norm": 1.915873408317566, + "learning_rate": 4.9832773674739054e-05, + "loss": 5.4555, + "step": 6195 + }, + { + "epoch": 0.0368493672090589, + "grad_norm": 1.8510993719100952, + "learning_rate": 4.983271973445228e-05, + "loss": 5.5042, + "step": 6196 + }, + { + "epoch": 0.0368553144923399, + "grad_norm": 1.7180782556533813, + "learning_rate": 4.983266578549666e-05, + "loss": 5.4671, + "step": 6197 + }, + { + "epoch": 0.03686126177562089, + "grad_norm": 1.7828874588012695, + "learning_rate": 4.983261182787221e-05, + "loss": 5.4943, + "step": 6198 + }, + { + "epoch": 0.036867209058901895, + "grad_norm": 1.5032141208648682, + "learning_rate": 4.983255786157895e-05, + "loss": 5.3881, + "step": 6199 + }, + { + "epoch": 0.03687315634218289, + "grad_norm": 2.530954599380493, + "learning_rate": 4.983250388661691e-05, + "loss": 5.4449, + "step": 6200 + }, + { + "epoch": 0.036879103625463885, + "grad_norm": 2.011044979095459, + "learning_rate": 4.983244990298609e-05, + "loss": 5.2722, + "step": 6201 + }, + { + "epoch": 0.03688505090874489, + "grad_norm": 2.2209532260894775, + "learning_rate": 4.9832395910686525e-05, + "loss": 5.0932, + "step": 6202 + }, + { + "epoch": 0.03689099819202588, + "grad_norm": 1.8695623874664307, + "learning_rate": 4.983234190971823e-05, + "loss": 5.2891, + "step": 6203 + }, + { + "epoch": 0.03689694547530688, + "grad_norm": 2.172349691390991, + "learning_rate": 4.983228790008121e-05, + "loss": 5.578, + "step": 6204 + }, + { + "epoch": 0.03690289275858788, + "grad_norm": 2.1099209785461426, + "learning_rate": 4.9832233881775505e-05, + "loss": 5.3708, + "step": 6205 + }, + { + "epoch": 0.036908840041868875, + "grad_norm": 2.16737961769104, + "learning_rate": 4.9832179854801116e-05, + "loss": 5.303, + "step": 6206 + }, + { + "epoch": 0.03691478732514987, + "grad_norm": 2.248220682144165, + "learning_rate": 4.983212581915807e-05, + "loss": 5.362, + "step": 6207 + }, + { + "epoch": 0.03692073460843087, + "grad_norm": 2.0701045989990234, + "learning_rate": 4.983207177484639e-05, + "loss": 5.4528, + "step": 6208 + }, + { + "epoch": 0.03692668189171187, + "grad_norm": 1.9989019632339478, + "learning_rate": 4.983201772186609e-05, + "loss": 5.786, + "step": 6209 + }, + { + "epoch": 0.03693262917499286, + "grad_norm": 1.9126088619232178, + "learning_rate": 4.983196366021719e-05, + "loss": 5.2312, + "step": 6210 + }, + { + "epoch": 0.03693857645827386, + "grad_norm": 2.1317548751831055, + "learning_rate": 4.9831909589899695e-05, + "loss": 5.3028, + "step": 6211 + }, + { + "epoch": 0.03694452374155486, + "grad_norm": 2.164898157119751, + "learning_rate": 4.983185551091365e-05, + "loss": 5.3186, + "step": 6212 + }, + { + "epoch": 0.036950471024835854, + "grad_norm": 2.1085855960845947, + "learning_rate": 4.983180142325906e-05, + "loss": 5.3026, + "step": 6213 + }, + { + "epoch": 0.03695641830811685, + "grad_norm": 1.8321222066879272, + "learning_rate": 4.983174732693594e-05, + "loss": 5.6632, + "step": 6214 + }, + { + "epoch": 0.03696236559139785, + "grad_norm": 2.0537941455841064, + "learning_rate": 4.983169322194432e-05, + "loss": 5.2269, + "step": 6215 + }, + { + "epoch": 0.036968312874678846, + "grad_norm": 1.9598063230514526, + "learning_rate": 4.98316391082842e-05, + "loss": 5.4974, + "step": 6216 + }, + { + "epoch": 0.03697426015795984, + "grad_norm": 2.3764376640319824, + "learning_rate": 4.983158498595563e-05, + "loss": 5.7715, + "step": 6217 + }, + { + "epoch": 0.036980207441240844, + "grad_norm": 1.8938835859298706, + "learning_rate": 4.9831530854958595e-05, + "loss": 5.5577, + "step": 6218 + }, + { + "epoch": 0.03698615472452184, + "grad_norm": 2.2023189067840576, + "learning_rate": 4.9831476715293134e-05, + "loss": 5.2596, + "step": 6219 + }, + { + "epoch": 0.036992102007802834, + "grad_norm": 1.9010800123214722, + "learning_rate": 4.9831422566959266e-05, + "loss": 5.3313, + "step": 6220 + }, + { + "epoch": 0.036998049291083836, + "grad_norm": 1.9679474830627441, + "learning_rate": 4.9831368409957e-05, + "loss": 5.2701, + "step": 6221 + }, + { + "epoch": 0.03700399657436483, + "grad_norm": 1.903558373451233, + "learning_rate": 4.983131424428635e-05, + "loss": 5.2821, + "step": 6222 + }, + { + "epoch": 0.037009943857645826, + "grad_norm": 1.976114273071289, + "learning_rate": 4.983126006994736e-05, + "loss": 5.374, + "step": 6223 + }, + { + "epoch": 0.03701589114092683, + "grad_norm": 2.9803311824798584, + "learning_rate": 4.983120588694003e-05, + "loss": 5.3576, + "step": 6224 + }, + { + "epoch": 0.03702183842420782, + "grad_norm": 1.5921218395233154, + "learning_rate": 4.983115169526438e-05, + "loss": 5.1654, + "step": 6225 + }, + { + "epoch": 0.03702778570748882, + "grad_norm": 1.7458349466323853, + "learning_rate": 4.983109749492043e-05, + "loss": 5.1038, + "step": 6226 + }, + { + "epoch": 0.03703373299076981, + "grad_norm": 1.9425132274627686, + "learning_rate": 4.983104328590821e-05, + "loss": 5.3815, + "step": 6227 + }, + { + "epoch": 0.037039680274050815, + "grad_norm": 1.9506715536117554, + "learning_rate": 4.983098906822772e-05, + "loss": 5.2215, + "step": 6228 + }, + { + "epoch": 0.03704562755733181, + "grad_norm": 1.8596410751342773, + "learning_rate": 4.983093484187899e-05, + "loss": 5.2058, + "step": 6229 + }, + { + "epoch": 0.037051574840612805, + "grad_norm": 1.720473289489746, + "learning_rate": 4.9830880606862043e-05, + "loss": 5.2701, + "step": 6230 + }, + { + "epoch": 0.03705752212389381, + "grad_norm": 1.7786411046981812, + "learning_rate": 4.983082636317688e-05, + "loss": 5.3216, + "step": 6231 + }, + { + "epoch": 0.0370634694071748, + "grad_norm": 3.6291537284851074, + "learning_rate": 4.983077211082354e-05, + "loss": 5.2282, + "step": 6232 + }, + { + "epoch": 0.0370694166904558, + "grad_norm": 1.7453030347824097, + "learning_rate": 4.983071784980203e-05, + "loss": 5.2667, + "step": 6233 + }, + { + "epoch": 0.0370753639737368, + "grad_norm": 1.7036694288253784, + "learning_rate": 4.983066358011238e-05, + "loss": 5.3023, + "step": 6234 + }, + { + "epoch": 0.037081311257017795, + "grad_norm": 1.7196505069732666, + "learning_rate": 4.9830609301754595e-05, + "loss": 5.2211, + "step": 6235 + }, + { + "epoch": 0.03708725854029879, + "grad_norm": 3.4630305767059326, + "learning_rate": 4.983055501472871e-05, + "loss": 5.6159, + "step": 6236 + }, + { + "epoch": 0.03709320582357979, + "grad_norm": 2.9739367961883545, + "learning_rate": 4.9830500719034726e-05, + "loss": 5.4477, + "step": 6237 + }, + { + "epoch": 0.03709915310686079, + "grad_norm": 2.760664463043213, + "learning_rate": 4.983044641467267e-05, + "loss": 5.0879, + "step": 6238 + }, + { + "epoch": 0.03710510039014178, + "grad_norm": 2.166203022003174, + "learning_rate": 4.9830392101642566e-05, + "loss": 5.5635, + "step": 6239 + }, + { + "epoch": 0.03711104767342278, + "grad_norm": 2.3798410892486572, + "learning_rate": 4.9830337779944425e-05, + "loss": 5.0676, + "step": 6240 + }, + { + "epoch": 0.03711699495670378, + "grad_norm": 2.3990557193756104, + "learning_rate": 4.983028344957827e-05, + "loss": 5.2788, + "step": 6241 + }, + { + "epoch": 0.037122942239984774, + "grad_norm": 2.487978458404541, + "learning_rate": 4.9830229110544124e-05, + "loss": 5.852, + "step": 6242 + }, + { + "epoch": 0.03712888952326577, + "grad_norm": 2.304749011993408, + "learning_rate": 4.9830174762842e-05, + "loss": 6.0886, + "step": 6243 + }, + { + "epoch": 0.03713483680654677, + "grad_norm": 2.169614791870117, + "learning_rate": 4.983012040647191e-05, + "loss": 6.1178, + "step": 6244 + }, + { + "epoch": 0.037140784089827766, + "grad_norm": 2.119131326675415, + "learning_rate": 4.98300660414339e-05, + "loss": 6.25, + "step": 6245 + }, + { + "epoch": 0.03714673137310876, + "grad_norm": 2.3797547817230225, + "learning_rate": 4.9830011667727964e-05, + "loss": 5.879, + "step": 6246 + }, + { + "epoch": 0.03715267865638976, + "grad_norm": 2.303718328475952, + "learning_rate": 4.982995728535411e-05, + "loss": 6.0015, + "step": 6247 + }, + { + "epoch": 0.03715862593967076, + "grad_norm": 2.867103099822998, + "learning_rate": 4.9829902894312396e-05, + "loss": 5.8726, + "step": 6248 + }, + { + "epoch": 0.037164573222951754, + "grad_norm": 2.4248557090759277, + "learning_rate": 4.9829848494602806e-05, + "loss": 5.6579, + "step": 6249 + }, + { + "epoch": 0.037170520506232756, + "grad_norm": 2.2622148990631104, + "learning_rate": 4.982979408622538e-05, + "loss": 5.7677, + "step": 6250 + }, + { + "epoch": 0.03717646778951375, + "grad_norm": 2.320502996444702, + "learning_rate": 4.9829739669180126e-05, + "loss": 5.7362, + "step": 6251 + }, + { + "epoch": 0.037182415072794746, + "grad_norm": 2.2096636295318604, + "learning_rate": 4.9829685243467065e-05, + "loss": 5.9069, + "step": 6252 + }, + { + "epoch": 0.03718836235607575, + "grad_norm": 2.620361089706421, + "learning_rate": 4.982963080908623e-05, + "loss": 5.9419, + "step": 6253 + }, + { + "epoch": 0.03719430963935674, + "grad_norm": 2.478158950805664, + "learning_rate": 4.982957636603761e-05, + "loss": 6.4776, + "step": 6254 + }, + { + "epoch": 0.03720025692263774, + "grad_norm": 2.5912528038024902, + "learning_rate": 4.982952191432125e-05, + "loss": 5.7176, + "step": 6255 + }, + { + "epoch": 0.03720620420591873, + "grad_norm": 2.57177734375, + "learning_rate": 4.982946745393716e-05, + "loss": 5.4271, + "step": 6256 + }, + { + "epoch": 0.037212151489199735, + "grad_norm": 2.424567699432373, + "learning_rate": 4.982941298488535e-05, + "loss": 5.82, + "step": 6257 + }, + { + "epoch": 0.03721809877248073, + "grad_norm": 2.477827548980713, + "learning_rate": 4.9829358507165856e-05, + "loss": 5.7961, + "step": 6258 + }, + { + "epoch": 0.037224046055761725, + "grad_norm": 2.0598270893096924, + "learning_rate": 4.982930402077869e-05, + "loss": 5.9264, + "step": 6259 + }, + { + "epoch": 0.03722999333904273, + "grad_norm": 2.0599095821380615, + "learning_rate": 4.9829249525723875e-05, + "loss": 6.0518, + "step": 6260 + }, + { + "epoch": 0.03723594062232372, + "grad_norm": 2.110170841217041, + "learning_rate": 4.982919502200142e-05, + "loss": 5.8631, + "step": 6261 + }, + { + "epoch": 0.03724188790560472, + "grad_norm": 2.333972930908203, + "learning_rate": 4.982914050961135e-05, + "loss": 5.5361, + "step": 6262 + }, + { + "epoch": 0.03724783518888572, + "grad_norm": 2.2322769165039062, + "learning_rate": 4.982908598855369e-05, + "loss": 5.8002, + "step": 6263 + }, + { + "epoch": 0.037253782472166715, + "grad_norm": 1.9915717840194702, + "learning_rate": 4.982903145882845e-05, + "loss": 5.7096, + "step": 6264 + }, + { + "epoch": 0.03725972975544771, + "grad_norm": 2.2031619548797607, + "learning_rate": 4.9828976920435645e-05, + "loss": 5.5716, + "step": 6265 + }, + { + "epoch": 0.03726567703872871, + "grad_norm": 2.9422314167022705, + "learning_rate": 4.9828922373375295e-05, + "loss": 5.929, + "step": 6266 + }, + { + "epoch": 0.03727162432200971, + "grad_norm": 3.264784336090088, + "learning_rate": 4.982886781764744e-05, + "loss": 5.9801, + "step": 6267 + }, + { + "epoch": 0.0372775716052907, + "grad_norm": 2.8314197063446045, + "learning_rate": 4.982881325325208e-05, + "loss": 6.0173, + "step": 6268 + }, + { + "epoch": 0.0372835188885717, + "grad_norm": 2.9550328254699707, + "learning_rate": 4.9828758680189234e-05, + "loss": 5.9838, + "step": 6269 + }, + { + "epoch": 0.0372894661718527, + "grad_norm": 2.6827526092529297, + "learning_rate": 4.9828704098458924e-05, + "loss": 6.0235, + "step": 6270 + }, + { + "epoch": 0.037295413455133694, + "grad_norm": 2.7174222469329834, + "learning_rate": 4.982864950806118e-05, + "loss": 5.8315, + "step": 6271 + }, + { + "epoch": 0.03730136073841469, + "grad_norm": 2.6177315711975098, + "learning_rate": 4.9828594908996e-05, + "loss": 5.8577, + "step": 6272 + }, + { + "epoch": 0.03730730802169569, + "grad_norm": 2.449669361114502, + "learning_rate": 4.982854030126342e-05, + "loss": 5.9591, + "step": 6273 + }, + { + "epoch": 0.037313255304976686, + "grad_norm": 2.5328989028930664, + "learning_rate": 4.9828485684863446e-05, + "loss": 5.7764, + "step": 6274 + }, + { + "epoch": 0.03731920258825768, + "grad_norm": 2.2581989765167236, + "learning_rate": 4.982843105979611e-05, + "loss": 5.9524, + "step": 6275 + }, + { + "epoch": 0.03732514987153868, + "grad_norm": 2.261212110519409, + "learning_rate": 4.982837642606142e-05, + "loss": 5.5814, + "step": 6276 + }, + { + "epoch": 0.03733109715481968, + "grad_norm": 2.2957348823547363, + "learning_rate": 4.98283217836594e-05, + "loss": 5.6967, + "step": 6277 + }, + { + "epoch": 0.037337044438100674, + "grad_norm": 2.814037322998047, + "learning_rate": 4.982826713259008e-05, + "loss": 5.8787, + "step": 6278 + }, + { + "epoch": 0.037342991721381676, + "grad_norm": 2.678133249282837, + "learning_rate": 4.9828212472853464e-05, + "loss": 5.94, + "step": 6279 + }, + { + "epoch": 0.03734893900466267, + "grad_norm": 2.2949652671813965, + "learning_rate": 4.982815780444957e-05, + "loss": 5.7263, + "step": 6280 + }, + { + "epoch": 0.037354886287943666, + "grad_norm": 2.4542131423950195, + "learning_rate": 4.982810312737842e-05, + "loss": 5.8317, + "step": 6281 + }, + { + "epoch": 0.03736083357122467, + "grad_norm": 2.7850544452667236, + "learning_rate": 4.982804844164005e-05, + "loss": 5.5631, + "step": 6282 + }, + { + "epoch": 0.03736678085450566, + "grad_norm": 2.6285061836242676, + "learning_rate": 4.9827993747234454e-05, + "loss": 5.6212, + "step": 6283 + }, + { + "epoch": 0.03737272813778666, + "grad_norm": 2.602590799331665, + "learning_rate": 4.9827939044161666e-05, + "loss": 5.5529, + "step": 6284 + }, + { + "epoch": 0.03737867542106765, + "grad_norm": 2.6196670532226562, + "learning_rate": 4.98278843324217e-05, + "loss": 5.6915, + "step": 6285 + }, + { + "epoch": 0.037384622704348655, + "grad_norm": 2.7072317600250244, + "learning_rate": 4.982782961201457e-05, + "loss": 5.7535, + "step": 6286 + }, + { + "epoch": 0.03739056998762965, + "grad_norm": 2.626033067703247, + "learning_rate": 4.982777488294031e-05, + "loss": 5.6053, + "step": 6287 + }, + { + "epoch": 0.037396517270910645, + "grad_norm": 1.8426648378372192, + "learning_rate": 4.982772014519892e-05, + "loss": 5.6167, + "step": 6288 + }, + { + "epoch": 0.03740246455419165, + "grad_norm": 2.5587830543518066, + "learning_rate": 4.9827665398790445e-05, + "loss": 5.6442, + "step": 6289 + }, + { + "epoch": 0.03740841183747264, + "grad_norm": 2.6163039207458496, + "learning_rate": 4.9827610643714877e-05, + "loss": 5.699, + "step": 6290 + }, + { + "epoch": 0.03741435912075364, + "grad_norm": 2.5752358436584473, + "learning_rate": 4.982755587997225e-05, + "loss": 5.666, + "step": 6291 + }, + { + "epoch": 0.03742030640403464, + "grad_norm": 2.6609575748443604, + "learning_rate": 4.982750110756258e-05, + "loss": 5.5634, + "step": 6292 + }, + { + "epoch": 0.037426253687315635, + "grad_norm": 2.724731683731079, + "learning_rate": 4.9827446326485884e-05, + "loss": 5.6259, + "step": 6293 + }, + { + "epoch": 0.03743220097059663, + "grad_norm": 2.5849807262420654, + "learning_rate": 4.9827391536742185e-05, + "loss": 5.6182, + "step": 6294 + }, + { + "epoch": 0.03743814825387763, + "grad_norm": 2.6737449169158936, + "learning_rate": 4.9827336738331496e-05, + "loss": 5.5426, + "step": 6295 + }, + { + "epoch": 0.03744409553715863, + "grad_norm": 2.5739669799804688, + "learning_rate": 4.9827281931253844e-05, + "loss": 5.6283, + "step": 6296 + }, + { + "epoch": 0.03745004282043962, + "grad_norm": 2.652730703353882, + "learning_rate": 4.982722711550924e-05, + "loss": 5.5241, + "step": 6297 + }, + { + "epoch": 0.037455990103720624, + "grad_norm": 2.7140653133392334, + "learning_rate": 4.982717229109772e-05, + "loss": 5.7052, + "step": 6298 + }, + { + "epoch": 0.03746193738700162, + "grad_norm": 2.1617860794067383, + "learning_rate": 4.982711745801928e-05, + "loss": 5.6224, + "step": 6299 + }, + { + "epoch": 0.037467884670282614, + "grad_norm": 2.1400585174560547, + "learning_rate": 4.982706261627395e-05, + "loss": 5.5753, + "step": 6300 + }, + { + "epoch": 0.03747383195356361, + "grad_norm": 2.4439101219177246, + "learning_rate": 4.9827007765861754e-05, + "loss": 5.6219, + "step": 6301 + }, + { + "epoch": 0.03747977923684461, + "grad_norm": 2.507141351699829, + "learning_rate": 4.9826952906782697e-05, + "loss": 5.6666, + "step": 6302 + }, + { + "epoch": 0.037485726520125606, + "grad_norm": 2.2664029598236084, + "learning_rate": 4.982689803903682e-05, + "loss": 5.7792, + "step": 6303 + }, + { + "epoch": 0.0374916738034066, + "grad_norm": 2.49678635597229, + "learning_rate": 4.982684316262411e-05, + "loss": 5.5899, + "step": 6304 + }, + { + "epoch": 0.0374976210866876, + "grad_norm": 2.244603395462036, + "learning_rate": 4.9826788277544625e-05, + "loss": 5.4624, + "step": 6305 + }, + { + "epoch": 0.0375035683699686, + "grad_norm": 2.144343376159668, + "learning_rate": 4.9826733383798366e-05, + "loss": 5.3428, + "step": 6306 + }, + { + "epoch": 0.037509515653249594, + "grad_norm": 1.7709565162658691, + "learning_rate": 4.982667848138534e-05, + "loss": 5.3596, + "step": 6307 + }, + { + "epoch": 0.037515462936530596, + "grad_norm": 2.0245232582092285, + "learning_rate": 4.9826623570305574e-05, + "loss": 5.4005, + "step": 6308 + }, + { + "epoch": 0.03752141021981159, + "grad_norm": 2.5346829891204834, + "learning_rate": 4.9826568650559095e-05, + "loss": 5.5089, + "step": 6309 + }, + { + "epoch": 0.037527357503092586, + "grad_norm": 2.638684034347534, + "learning_rate": 4.982651372214592e-05, + "loss": 5.6847, + "step": 6310 + }, + { + "epoch": 0.03753330478637359, + "grad_norm": 2.024423122406006, + "learning_rate": 4.982645878506606e-05, + "loss": 5.3633, + "step": 6311 + }, + { + "epoch": 0.03753925206965458, + "grad_norm": 1.983167290687561, + "learning_rate": 4.982640383931955e-05, + "loss": 5.2086, + "step": 6312 + }, + { + "epoch": 0.03754519935293558, + "grad_norm": 1.8388524055480957, + "learning_rate": 4.982634888490639e-05, + "loss": 5.1904, + "step": 6313 + }, + { + "epoch": 0.03755114663621657, + "grad_norm": 1.8280584812164307, + "learning_rate": 4.982629392182661e-05, + "loss": 5.3072, + "step": 6314 + }, + { + "epoch": 0.037557093919497575, + "grad_norm": 1.6278408765792847, + "learning_rate": 4.982623895008023e-05, + "loss": 5.3003, + "step": 6315 + }, + { + "epoch": 0.03756304120277857, + "grad_norm": 2.0519096851348877, + "learning_rate": 4.982618396966726e-05, + "loss": 5.3494, + "step": 6316 + }, + { + "epoch": 0.037568988486059565, + "grad_norm": 1.935744285583496, + "learning_rate": 4.982612898058773e-05, + "loss": 5.6993, + "step": 6317 + }, + { + "epoch": 0.03757493576934057, + "grad_norm": 1.882163166999817, + "learning_rate": 4.9826073982841656e-05, + "loss": 5.758, + "step": 6318 + }, + { + "epoch": 0.03758088305262156, + "grad_norm": 1.7747882604599, + "learning_rate": 4.982601897642906e-05, + "loss": 5.1501, + "step": 6319 + }, + { + "epoch": 0.03758683033590256, + "grad_norm": 2.044093370437622, + "learning_rate": 4.982596396134995e-05, + "loss": 5.2801, + "step": 6320 + }, + { + "epoch": 0.03759277761918356, + "grad_norm": 1.739441990852356, + "learning_rate": 4.9825908937604346e-05, + "loss": 5.1619, + "step": 6321 + }, + { + "epoch": 0.037598724902464555, + "grad_norm": 2.0353312492370605, + "learning_rate": 4.982585390519229e-05, + "loss": 5.6796, + "step": 6322 + }, + { + "epoch": 0.03760467218574555, + "grad_norm": 2.076667308807373, + "learning_rate": 4.9825798864113774e-05, + "loss": 6.2522, + "step": 6323 + }, + { + "epoch": 0.03761061946902655, + "grad_norm": 2.773676633834839, + "learning_rate": 4.982574381436883e-05, + "loss": 5.879, + "step": 6324 + }, + { + "epoch": 0.03761656675230755, + "grad_norm": 2.2013933658599854, + "learning_rate": 4.982568875595748e-05, + "loss": 6.0341, + "step": 6325 + }, + { + "epoch": 0.03762251403558854, + "grad_norm": 2.288806915283203, + "learning_rate": 4.9825633688879736e-05, + "loss": 6.219, + "step": 6326 + }, + { + "epoch": 0.037628461318869544, + "grad_norm": 2.874372720718384, + "learning_rate": 4.982557861313561e-05, + "loss": 5.7616, + "step": 6327 + }, + { + "epoch": 0.03763440860215054, + "grad_norm": 2.7471537590026855, + "learning_rate": 4.982552352872515e-05, + "loss": 5.7214, + "step": 6328 + }, + { + "epoch": 0.037640355885431534, + "grad_norm": 2.475513458251953, + "learning_rate": 4.982546843564834e-05, + "loss": 6.0039, + "step": 6329 + }, + { + "epoch": 0.03764630316871253, + "grad_norm": 2.5376412868499756, + "learning_rate": 4.982541333390523e-05, + "loss": 6.3042, + "step": 6330 + }, + { + "epoch": 0.03765225045199353, + "grad_norm": 2.599989414215088, + "learning_rate": 4.9825358223495814e-05, + "loss": 6.488, + "step": 6331 + }, + { + "epoch": 0.037658197735274526, + "grad_norm": 2.2657089233398438, + "learning_rate": 4.9825303104420115e-05, + "loss": 6.2743, + "step": 6332 + }, + { + "epoch": 0.03766414501855552, + "grad_norm": 2.303926467895508, + "learning_rate": 4.982524797667818e-05, + "loss": 6.3888, + "step": 6333 + }, + { + "epoch": 0.03767009230183652, + "grad_norm": 2.771775007247925, + "learning_rate": 4.982519284026999e-05, + "loss": 6.0911, + "step": 6334 + }, + { + "epoch": 0.03767603958511752, + "grad_norm": 2.492748260498047, + "learning_rate": 4.982513769519559e-05, + "loss": 5.9905, + "step": 6335 + }, + { + "epoch": 0.03768198686839851, + "grad_norm": 2.294985771179199, + "learning_rate": 4.982508254145498e-05, + "loss": 6.4574, + "step": 6336 + }, + { + "epoch": 0.037687934151679515, + "grad_norm": 2.6514554023742676, + "learning_rate": 4.9825027379048205e-05, + "loss": 6.1541, + "step": 6337 + }, + { + "epoch": 0.03769388143496051, + "grad_norm": 2.0114963054656982, + "learning_rate": 4.982497220797526e-05, + "loss": 6.0602, + "step": 6338 + }, + { + "epoch": 0.037699828718241506, + "grad_norm": 2.6345295906066895, + "learning_rate": 4.982491702823618e-05, + "loss": 6.024, + "step": 6339 + }, + { + "epoch": 0.03770577600152251, + "grad_norm": 2.619980573654175, + "learning_rate": 4.982486183983097e-05, + "loss": 6.0642, + "step": 6340 + }, + { + "epoch": 0.0377117232848035, + "grad_norm": 2.491279125213623, + "learning_rate": 4.9824806642759664e-05, + "loss": 5.8517, + "step": 6341 + }, + { + "epoch": 0.0377176705680845, + "grad_norm": 2.5161385536193848, + "learning_rate": 4.982475143702227e-05, + "loss": 5.7467, + "step": 6342 + }, + { + "epoch": 0.03772361785136549, + "grad_norm": 2.3237602710723877, + "learning_rate": 4.982469622261882e-05, + "loss": 5.801, + "step": 6343 + }, + { + "epoch": 0.037729565134646495, + "grad_norm": 2.21382999420166, + "learning_rate": 4.9824640999549314e-05, + "loss": 5.968, + "step": 6344 + }, + { + "epoch": 0.03773551241792749, + "grad_norm": 2.1770498752593994, + "learning_rate": 4.9824585767813794e-05, + "loss": 6.2998, + "step": 6345 + }, + { + "epoch": 0.037741459701208485, + "grad_norm": 2.321563720703125, + "learning_rate": 4.982453052741225e-05, + "loss": 5.631, + "step": 6346 + }, + { + "epoch": 0.03774740698448949, + "grad_norm": 3.2769439220428467, + "learning_rate": 4.982447527834473e-05, + "loss": 5.4845, + "step": 6347 + }, + { + "epoch": 0.03775335426777048, + "grad_norm": 2.954331874847412, + "learning_rate": 4.9824420020611244e-05, + "loss": 5.2, + "step": 6348 + }, + { + "epoch": 0.03775930155105148, + "grad_norm": 2.735182523727417, + "learning_rate": 4.98243647542118e-05, + "loss": 5.1907, + "step": 6349 + }, + { + "epoch": 0.03776524883433248, + "grad_norm": 2.872142791748047, + "learning_rate": 4.982430947914644e-05, + "loss": 5.5159, + "step": 6350 + }, + { + "epoch": 0.037771196117613474, + "grad_norm": 3.14219331741333, + "learning_rate": 4.982425419541517e-05, + "loss": 5.0843, + "step": 6351 + }, + { + "epoch": 0.03777714340089447, + "grad_norm": 2.2689874172210693, + "learning_rate": 4.9824198903018e-05, + "loss": 6.0446, + "step": 6352 + }, + { + "epoch": 0.03778309068417547, + "grad_norm": 2.3468856811523438, + "learning_rate": 4.982414360195496e-05, + "loss": 5.952, + "step": 6353 + }, + { + "epoch": 0.03778903796745647, + "grad_norm": 2.944509983062744, + "learning_rate": 4.9824088292226065e-05, + "loss": 5.4918, + "step": 6354 + }, + { + "epoch": 0.03779498525073746, + "grad_norm": 2.8139286041259766, + "learning_rate": 4.982403297383135e-05, + "loss": 5.3296, + "step": 6355 + }, + { + "epoch": 0.037800932534018464, + "grad_norm": 2.540224552154541, + "learning_rate": 4.982397764677081e-05, + "loss": 5.3464, + "step": 6356 + }, + { + "epoch": 0.03780687981729946, + "grad_norm": 2.56709885597229, + "learning_rate": 4.982392231104448e-05, + "loss": 5.2313, + "step": 6357 + }, + { + "epoch": 0.037812827100580454, + "grad_norm": 2.2051165103912354, + "learning_rate": 4.982386696665238e-05, + "loss": 5.7783, + "step": 6358 + }, + { + "epoch": 0.03781877438386145, + "grad_norm": 2.5773870944976807, + "learning_rate": 4.9823811613594515e-05, + "loss": 5.6691, + "step": 6359 + }, + { + "epoch": 0.03782472166714245, + "grad_norm": 2.5163073539733887, + "learning_rate": 4.982375625187092e-05, + "loss": 5.7936, + "step": 6360 + }, + { + "epoch": 0.037830668950423446, + "grad_norm": 2.4268851280212402, + "learning_rate": 4.98237008814816e-05, + "loss": 5.8116, + "step": 6361 + }, + { + "epoch": 0.03783661623370444, + "grad_norm": 2.397402286529541, + "learning_rate": 4.9823645502426597e-05, + "loss": 5.9895, + "step": 6362 + }, + { + "epoch": 0.03784256351698544, + "grad_norm": 2.590672731399536, + "learning_rate": 4.98235901147059e-05, + "loss": 5.9022, + "step": 6363 + }, + { + "epoch": 0.03784851080026644, + "grad_norm": 2.268540859222412, + "learning_rate": 4.9823534718319557e-05, + "loss": 5.8958, + "step": 6364 + }, + { + "epoch": 0.03785445808354743, + "grad_norm": 2.1419460773468018, + "learning_rate": 4.982347931326757e-05, + "loss": 5.8446, + "step": 6365 + }, + { + "epoch": 0.037860405366828435, + "grad_norm": 2.3988053798675537, + "learning_rate": 4.9823423899549957e-05, + "loss": 6.2267, + "step": 6366 + }, + { + "epoch": 0.03786635265010943, + "grad_norm": 2.120121955871582, + "learning_rate": 4.9823368477166755e-05, + "loss": 6.1352, + "step": 6367 + }, + { + "epoch": 0.037872299933390426, + "grad_norm": 2.274610996246338, + "learning_rate": 4.982331304611796e-05, + "loss": 6.1342, + "step": 6368 + }, + { + "epoch": 0.03787824721667143, + "grad_norm": 1.6934765577316284, + "learning_rate": 4.98232576064036e-05, + "loss": 5.7969, + "step": 6369 + }, + { + "epoch": 0.03788419449995242, + "grad_norm": 2.62416672706604, + "learning_rate": 4.982320215802371e-05, + "loss": 5.9669, + "step": 6370 + }, + { + "epoch": 0.03789014178323342, + "grad_norm": 2.416639804840088, + "learning_rate": 4.98231467009783e-05, + "loss": 5.9628, + "step": 6371 + }, + { + "epoch": 0.03789608906651441, + "grad_norm": 2.049412965774536, + "learning_rate": 4.9823091235267375e-05, + "loss": 5.658, + "step": 6372 + }, + { + "epoch": 0.037902036349795415, + "grad_norm": 2.0502147674560547, + "learning_rate": 4.982303576089097e-05, + "loss": 5.9114, + "step": 6373 + }, + { + "epoch": 0.03790798363307641, + "grad_norm": 2.1566948890686035, + "learning_rate": 4.982298027784909e-05, + "loss": 5.6932, + "step": 6374 + }, + { + "epoch": 0.037913930916357405, + "grad_norm": 2.394083261489868, + "learning_rate": 4.9822924786141774e-05, + "loss": 6.3041, + "step": 6375 + }, + { + "epoch": 0.03791987819963841, + "grad_norm": 2.545910120010376, + "learning_rate": 4.9822869285769024e-05, + "loss": 6.2125, + "step": 6376 + }, + { + "epoch": 0.0379258254829194, + "grad_norm": 2.271461248397827, + "learning_rate": 4.9822813776730875e-05, + "loss": 6.2322, + "step": 6377 + }, + { + "epoch": 0.0379317727662004, + "grad_norm": 2.3840630054473877, + "learning_rate": 4.9822758259027336e-05, + "loss": 6.0167, + "step": 6378 + }, + { + "epoch": 0.0379377200494814, + "grad_norm": 2.600618600845337, + "learning_rate": 4.9822702732658426e-05, + "loss": 5.6722, + "step": 6379 + }, + { + "epoch": 0.037943667332762394, + "grad_norm": 2.0911965370178223, + "learning_rate": 4.982264719762417e-05, + "loss": 5.579, + "step": 6380 + }, + { + "epoch": 0.03794961461604339, + "grad_norm": 2.015505075454712, + "learning_rate": 4.9822591653924575e-05, + "loss": 5.9747, + "step": 6381 + }, + { + "epoch": 0.03795556189932439, + "grad_norm": 2.237262010574341, + "learning_rate": 4.982253610155968e-05, + "loss": 6.3792, + "step": 6382 + }, + { + "epoch": 0.03796150918260539, + "grad_norm": 2.1448137760162354, + "learning_rate": 4.982248054052949e-05, + "loss": 6.1049, + "step": 6383 + }, + { + "epoch": 0.03796745646588638, + "grad_norm": 2.2597758769989014, + "learning_rate": 4.9822424970834034e-05, + "loss": 5.8428, + "step": 6384 + }, + { + "epoch": 0.037973403749167384, + "grad_norm": 1.9935969114303589, + "learning_rate": 4.982236939247332e-05, + "loss": 6.0032, + "step": 6385 + }, + { + "epoch": 0.03797935103244838, + "grad_norm": 2.506916046142578, + "learning_rate": 4.982231380544737e-05, + "loss": 5.9221, + "step": 6386 + }, + { + "epoch": 0.037985298315729374, + "grad_norm": 2.083393096923828, + "learning_rate": 4.9822258209756214e-05, + "loss": 5.8862, + "step": 6387 + }, + { + "epoch": 0.03799124559901037, + "grad_norm": 2.631091594696045, + "learning_rate": 4.982220260539987e-05, + "loss": 5.6593, + "step": 6388 + }, + { + "epoch": 0.03799719288229137, + "grad_norm": 2.5732531547546387, + "learning_rate": 4.982214699237834e-05, + "loss": 5.5084, + "step": 6389 + }, + { + "epoch": 0.038003140165572366, + "grad_norm": 2.7797791957855225, + "learning_rate": 4.982209137069166e-05, + "loss": 5.6792, + "step": 6390 + }, + { + "epoch": 0.03800908744885336, + "grad_norm": 2.2800772190093994, + "learning_rate": 4.982203574033984e-05, + "loss": 5.6299, + "step": 6391 + }, + { + "epoch": 0.03801503473213436, + "grad_norm": 2.4182863235473633, + "learning_rate": 4.9821980101322905e-05, + "loss": 5.71, + "step": 6392 + }, + { + "epoch": 0.03802098201541536, + "grad_norm": 2.2968835830688477, + "learning_rate": 4.982192445364088e-05, + "loss": 5.6112, + "step": 6393 + }, + { + "epoch": 0.03802692929869635, + "grad_norm": 2.3713324069976807, + "learning_rate": 4.982186879729377e-05, + "loss": 5.423, + "step": 6394 + }, + { + "epoch": 0.038032876581977355, + "grad_norm": 2.745352268218994, + "learning_rate": 4.98218131322816e-05, + "loss": 5.5145, + "step": 6395 + }, + { + "epoch": 0.03803882386525835, + "grad_norm": 2.755211353302002, + "learning_rate": 4.98217574586044e-05, + "loss": 5.4399, + "step": 6396 + }, + { + "epoch": 0.038044771148539346, + "grad_norm": 2.5452096462249756, + "learning_rate": 4.982170177626217e-05, + "loss": 5.5691, + "step": 6397 + }, + { + "epoch": 0.03805071843182035, + "grad_norm": 2.6195876598358154, + "learning_rate": 4.9821646085254954e-05, + "loss": 5.4512, + "step": 6398 + }, + { + "epoch": 0.03805666571510134, + "grad_norm": 2.4931671619415283, + "learning_rate": 4.982159038558275e-05, + "loss": 6.0505, + "step": 6399 + }, + { + "epoch": 0.03806261299838234, + "grad_norm": 2.45062255859375, + "learning_rate": 4.982153467724558e-05, + "loss": 6.2367, + "step": 6400 + }, + { + "epoch": 0.03806856028166333, + "grad_norm": 2.688624620437622, + "learning_rate": 4.982147896024348e-05, + "loss": 6.0522, + "step": 6401 + }, + { + "epoch": 0.038074507564944335, + "grad_norm": 2.421660900115967, + "learning_rate": 4.982142323457645e-05, + "loss": 5.8166, + "step": 6402 + }, + { + "epoch": 0.03808045484822533, + "grad_norm": 2.594134569168091, + "learning_rate": 4.982136750024452e-05, + "loss": 5.5476, + "step": 6403 + }, + { + "epoch": 0.038086402131506325, + "grad_norm": 2.4492971897125244, + "learning_rate": 4.982131175724771e-05, + "loss": 5.2302, + "step": 6404 + }, + { + "epoch": 0.03809234941478733, + "grad_norm": 2.4200360774993896, + "learning_rate": 4.9821256005586036e-05, + "loss": 6.1404, + "step": 6405 + }, + { + "epoch": 0.03809829669806832, + "grad_norm": 2.1949775218963623, + "learning_rate": 4.982120024525951e-05, + "loss": 5.9589, + "step": 6406 + }, + { + "epoch": 0.03810424398134932, + "grad_norm": 2.3570375442504883, + "learning_rate": 4.9821144476268164e-05, + "loss": 5.9022, + "step": 6407 + }, + { + "epoch": 0.03811019126463032, + "grad_norm": 2.16460919380188, + "learning_rate": 4.9821088698612016e-05, + "loss": 5.8535, + "step": 6408 + }, + { + "epoch": 0.038116138547911314, + "grad_norm": 1.8189443349838257, + "learning_rate": 4.982103291229108e-05, + "loss": 5.9345, + "step": 6409 + }, + { + "epoch": 0.03812208583119231, + "grad_norm": 2.553919792175293, + "learning_rate": 4.9820977117305376e-05, + "loss": 5.31, + "step": 6410 + }, + { + "epoch": 0.03812803311447331, + "grad_norm": 2.8085403442382812, + "learning_rate": 4.982092131365493e-05, + "loss": 4.9902, + "step": 6411 + }, + { + "epoch": 0.03813398039775431, + "grad_norm": 2.3698999881744385, + "learning_rate": 4.982086550133976e-05, + "loss": 5.4982, + "step": 6412 + }, + { + "epoch": 0.0381399276810353, + "grad_norm": 1.996026873588562, + "learning_rate": 4.9820809680359876e-05, + "loss": 5.6556, + "step": 6413 + }, + { + "epoch": 0.038145874964316304, + "grad_norm": 2.0816900730133057, + "learning_rate": 4.9820753850715305e-05, + "loss": 5.8823, + "step": 6414 + }, + { + "epoch": 0.0381518222475973, + "grad_norm": 2.282745122909546, + "learning_rate": 4.982069801240606e-05, + "loss": 5.1641, + "step": 6415 + }, + { + "epoch": 0.038157769530878294, + "grad_norm": 2.043991804122925, + "learning_rate": 4.982064216543217e-05, + "loss": 5.7569, + "step": 6416 + }, + { + "epoch": 0.03816371681415929, + "grad_norm": 2.086071014404297, + "learning_rate": 4.982058630979365e-05, + "loss": 5.9586, + "step": 6417 + }, + { + "epoch": 0.03816966409744029, + "grad_norm": 2.295060873031616, + "learning_rate": 4.9820530445490525e-05, + "loss": 5.3733, + "step": 6418 + }, + { + "epoch": 0.038175611380721286, + "grad_norm": 2.512267827987671, + "learning_rate": 4.98204745725228e-05, + "loss": 5.0399, + "step": 6419 + }, + { + "epoch": 0.03818155866400228, + "grad_norm": 2.5434467792510986, + "learning_rate": 4.982041869089051e-05, + "loss": 4.7907, + "step": 6420 + }, + { + "epoch": 0.03818750594728328, + "grad_norm": 2.4192142486572266, + "learning_rate": 4.9820362800593666e-05, + "loss": 4.9116, + "step": 6421 + }, + { + "epoch": 0.03819345323056428, + "grad_norm": 2.867542028427124, + "learning_rate": 4.9820306901632296e-05, + "loss": 5.9905, + "step": 6422 + }, + { + "epoch": 0.03819940051384527, + "grad_norm": 2.3099327087402344, + "learning_rate": 4.982025099400641e-05, + "loss": 5.9319, + "step": 6423 + }, + { + "epoch": 0.038205347797126275, + "grad_norm": 2.28169584274292, + "learning_rate": 4.9820195077716026e-05, + "loss": 6.2533, + "step": 6424 + }, + { + "epoch": 0.03821129508040727, + "grad_norm": 2.1065595149993896, + "learning_rate": 4.9820139152761167e-05, + "loss": 5.7123, + "step": 6425 + }, + { + "epoch": 0.038217242363688265, + "grad_norm": 2.0210213661193848, + "learning_rate": 4.9820083219141865e-05, + "loss": 5.7758, + "step": 6426 + }, + { + "epoch": 0.03822318964696927, + "grad_norm": 1.6545369625091553, + "learning_rate": 4.9820027276858114e-05, + "loss": 5.6792, + "step": 6427 + }, + { + "epoch": 0.03822913693025026, + "grad_norm": 2.177621841430664, + "learning_rate": 4.981997132590996e-05, + "loss": 6.0167, + "step": 6428 + }, + { + "epoch": 0.03823508421353126, + "grad_norm": 2.3910553455352783, + "learning_rate": 4.981991536629741e-05, + "loss": 6.1161, + "step": 6429 + }, + { + "epoch": 0.03824103149681225, + "grad_norm": 2.4915859699249268, + "learning_rate": 4.981985939802047e-05, + "loss": 5.6449, + "step": 6430 + }, + { + "epoch": 0.038246978780093255, + "grad_norm": 2.0343215465545654, + "learning_rate": 4.981980342107919e-05, + "loss": 5.967, + "step": 6431 + }, + { + "epoch": 0.03825292606337425, + "grad_norm": 1.8326199054718018, + "learning_rate": 4.9819747435473565e-05, + "loss": 5.9183, + "step": 6432 + }, + { + "epoch": 0.038258873346655245, + "grad_norm": 2.1482350826263428, + "learning_rate": 4.981969144120362e-05, + "loss": 5.794, + "step": 6433 + }, + { + "epoch": 0.03826482062993625, + "grad_norm": 2.346355438232422, + "learning_rate": 4.9819635438269384e-05, + "loss": 5.6775, + "step": 6434 + }, + { + "epoch": 0.03827076791321724, + "grad_norm": 2.252150774002075, + "learning_rate": 4.981957942667087e-05, + "loss": 5.9383, + "step": 6435 + }, + { + "epoch": 0.03827671519649824, + "grad_norm": 2.1851654052734375, + "learning_rate": 4.981952340640809e-05, + "loss": 6.0555, + "step": 6436 + }, + { + "epoch": 0.03828266247977924, + "grad_norm": 2.0609381198883057, + "learning_rate": 4.9819467377481076e-05, + "loss": 6.3209, + "step": 6437 + }, + { + "epoch": 0.038288609763060234, + "grad_norm": 2.4882800579071045, + "learning_rate": 4.981941133988984e-05, + "loss": 6.2411, + "step": 6438 + }, + { + "epoch": 0.03829455704634123, + "grad_norm": 1.8794118165969849, + "learning_rate": 4.981935529363441e-05, + "loss": 5.5696, + "step": 6439 + }, + { + "epoch": 0.03830050432962223, + "grad_norm": 2.542656660079956, + "learning_rate": 4.981929923871479e-05, + "loss": 5.8106, + "step": 6440 + }, + { + "epoch": 0.038306451612903226, + "grad_norm": 2.3871288299560547, + "learning_rate": 4.981924317513101e-05, + "loss": 5.6354, + "step": 6441 + }, + { + "epoch": 0.03831239889618422, + "grad_norm": 2.4628939628601074, + "learning_rate": 4.981918710288309e-05, + "loss": 5.9695, + "step": 6442 + }, + { + "epoch": 0.038318346179465224, + "grad_norm": 2.908543586730957, + "learning_rate": 4.9819131021971056e-05, + "loss": 5.2742, + "step": 6443 + }, + { + "epoch": 0.03832429346274622, + "grad_norm": 3.353813886642456, + "learning_rate": 4.9819074932394916e-05, + "loss": 5.3823, + "step": 6444 + }, + { + "epoch": 0.038330240746027214, + "grad_norm": 2.5253870487213135, + "learning_rate": 4.981901883415469e-05, + "loss": 5.7, + "step": 6445 + }, + { + "epoch": 0.03833618802930821, + "grad_norm": 2.3375632762908936, + "learning_rate": 4.98189627272504e-05, + "loss": 5.2862, + "step": 6446 + }, + { + "epoch": 0.03834213531258921, + "grad_norm": 2.534599542617798, + "learning_rate": 4.981890661168207e-05, + "loss": 5.3961, + "step": 6447 + }, + { + "epoch": 0.038348082595870206, + "grad_norm": 2.383511781692505, + "learning_rate": 4.9818850487449716e-05, + "loss": 6.4658, + "step": 6448 + }, + { + "epoch": 0.0383540298791512, + "grad_norm": 2.2824161052703857, + "learning_rate": 4.981879435455336e-05, + "loss": 5.5221, + "step": 6449 + }, + { + "epoch": 0.0383599771624322, + "grad_norm": 2.355271100997925, + "learning_rate": 4.981873821299301e-05, + "loss": 5.5054, + "step": 6450 + }, + { + "epoch": 0.0383659244457132, + "grad_norm": 2.0071253776550293, + "learning_rate": 4.981868206276871e-05, + "loss": 5.5911, + "step": 6451 + }, + { + "epoch": 0.03837187172899419, + "grad_norm": 2.2770705223083496, + "learning_rate": 4.9818625903880445e-05, + "loss": 5.8978, + "step": 6452 + }, + { + "epoch": 0.038377819012275195, + "grad_norm": 2.2425332069396973, + "learning_rate": 4.981856973632827e-05, + "loss": 6.3189, + "step": 6453 + }, + { + "epoch": 0.03838376629555619, + "grad_norm": 2.300560235977173, + "learning_rate": 4.981851356011218e-05, + "loss": 5.745, + "step": 6454 + }, + { + "epoch": 0.038389713578837185, + "grad_norm": 2.4516983032226562, + "learning_rate": 4.981845737523221e-05, + "loss": 5.8978, + "step": 6455 + }, + { + "epoch": 0.03839566086211819, + "grad_norm": 2.3463354110717773, + "learning_rate": 4.981840118168837e-05, + "loss": 5.668, + "step": 6456 + }, + { + "epoch": 0.03840160814539918, + "grad_norm": 2.623608112335205, + "learning_rate": 4.981834497948068e-05, + "loss": 5.471, + "step": 6457 + }, + { + "epoch": 0.03840755542868018, + "grad_norm": 2.441089391708374, + "learning_rate": 4.9818288768609166e-05, + "loss": 5.0986, + "step": 6458 + }, + { + "epoch": 0.03841350271196117, + "grad_norm": 2.597635507583618, + "learning_rate": 4.981823254907384e-05, + "loss": 5.1046, + "step": 6459 + }, + { + "epoch": 0.038419449995242175, + "grad_norm": 2.344855785369873, + "learning_rate": 4.9818176320874727e-05, + "loss": 5.8878, + "step": 6460 + }, + { + "epoch": 0.03842539727852317, + "grad_norm": 2.2569222450256348, + "learning_rate": 4.981812008401184e-05, + "loss": 5.342, + "step": 6461 + }, + { + "epoch": 0.038431344561804165, + "grad_norm": 2.276780843734741, + "learning_rate": 4.981806383848522e-05, + "loss": 5.566, + "step": 6462 + }, + { + "epoch": 0.03843729184508517, + "grad_norm": 2.1354174613952637, + "learning_rate": 4.9818007584294856e-05, + "loss": 5.8678, + "step": 6463 + }, + { + "epoch": 0.03844323912836616, + "grad_norm": 2.164092779159546, + "learning_rate": 4.981795132144078e-05, + "loss": 5.7937, + "step": 6464 + }, + { + "epoch": 0.03844918641164716, + "grad_norm": 2.3034324645996094, + "learning_rate": 4.981789504992303e-05, + "loss": 5.843, + "step": 6465 + }, + { + "epoch": 0.03845513369492816, + "grad_norm": 1.9616999626159668, + "learning_rate": 4.9817838769741584e-05, + "loss": 6.0563, + "step": 6466 + }, + { + "epoch": 0.038461080978209154, + "grad_norm": 2.2784626483917236, + "learning_rate": 4.9817782480896505e-05, + "loss": 6.4152, + "step": 6467 + }, + { + "epoch": 0.03846702826149015, + "grad_norm": 1.8581526279449463, + "learning_rate": 4.981772618338779e-05, + "loss": 5.9833, + "step": 6468 + }, + { + "epoch": 0.03847297554477115, + "grad_norm": 2.2493395805358887, + "learning_rate": 4.9817669877215466e-05, + "loss": 6.2985, + "step": 6469 + }, + { + "epoch": 0.038478922828052146, + "grad_norm": 2.289125919342041, + "learning_rate": 4.981761356237955e-05, + "loss": 5.8555, + "step": 6470 + }, + { + "epoch": 0.03848487011133314, + "grad_norm": 2.11012601852417, + "learning_rate": 4.981755723888006e-05, + "loss": 6.6137, + "step": 6471 + }, + { + "epoch": 0.038490817394614144, + "grad_norm": 2.1793103218078613, + "learning_rate": 4.981750090671702e-05, + "loss": 6.0117, + "step": 6472 + }, + { + "epoch": 0.03849676467789514, + "grad_norm": 2.1857750415802, + "learning_rate": 4.9817444565890436e-05, + "loss": 5.9877, + "step": 6473 + }, + { + "epoch": 0.038502711961176134, + "grad_norm": 1.7430874109268188, + "learning_rate": 4.981738821640035e-05, + "loss": 5.829, + "step": 6474 + }, + { + "epoch": 0.03850865924445713, + "grad_norm": 1.8017771244049072, + "learning_rate": 4.981733185824676e-05, + "loss": 6.3853, + "step": 6475 + }, + { + "epoch": 0.03851460652773813, + "grad_norm": 2.1420724391937256, + "learning_rate": 4.9817275491429705e-05, + "loss": 5.982, + "step": 6476 + }, + { + "epoch": 0.038520553811019126, + "grad_norm": 2.441521167755127, + "learning_rate": 4.9817219115949195e-05, + "loss": 6.1159, + "step": 6477 + }, + { + "epoch": 0.03852650109430012, + "grad_norm": 2.158682346343994, + "learning_rate": 4.9817162731805246e-05, + "loss": 6.1306, + "step": 6478 + }, + { + "epoch": 0.03853244837758112, + "grad_norm": 2.154538869857788, + "learning_rate": 4.9817106338997884e-05, + "loss": 6.0745, + "step": 6479 + }, + { + "epoch": 0.03853839566086212, + "grad_norm": 2.077674388885498, + "learning_rate": 4.981704993752713e-05, + "loss": 6.2171, + "step": 6480 + }, + { + "epoch": 0.03854434294414311, + "grad_norm": 2.181500196456909, + "learning_rate": 4.981699352739299e-05, + "loss": 6.228, + "step": 6481 + }, + { + "epoch": 0.038550290227424115, + "grad_norm": 2.678189992904663, + "learning_rate": 4.98169371085955e-05, + "loss": 5.965, + "step": 6482 + }, + { + "epoch": 0.03855623751070511, + "grad_norm": 2.713480234146118, + "learning_rate": 4.981688068113467e-05, + "loss": 5.9078, + "step": 6483 + }, + { + "epoch": 0.038562184793986105, + "grad_norm": 2.4872853755950928, + "learning_rate": 4.981682424501053e-05, + "loss": 5.7525, + "step": 6484 + }, + { + "epoch": 0.03856813207726711, + "grad_norm": 2.274711847305298, + "learning_rate": 4.98167678002231e-05, + "loss": 5.9193, + "step": 6485 + }, + { + "epoch": 0.0385740793605481, + "grad_norm": 2.4730162620544434, + "learning_rate": 4.981671134677238e-05, + "loss": 6.2961, + "step": 6486 + }, + { + "epoch": 0.0385800266438291, + "grad_norm": 1.7856062650680542, + "learning_rate": 4.9816654884658396e-05, + "loss": 5.9005, + "step": 6487 + }, + { + "epoch": 0.03858597392711009, + "grad_norm": 1.8812140226364136, + "learning_rate": 4.981659841388119e-05, + "loss": 5.9428, + "step": 6488 + }, + { + "epoch": 0.038591921210391095, + "grad_norm": 1.9963254928588867, + "learning_rate": 4.9816541934440756e-05, + "loss": 6.0136, + "step": 6489 + }, + { + "epoch": 0.03859786849367209, + "grad_norm": 2.741892099380493, + "learning_rate": 4.981648544633713e-05, + "loss": 6.5065, + "step": 6490 + }, + { + "epoch": 0.038603815776953085, + "grad_norm": 2.226672410964966, + "learning_rate": 4.981642894957032e-05, + "loss": 5.9705, + "step": 6491 + }, + { + "epoch": 0.03860976306023409, + "grad_norm": 2.015429973602295, + "learning_rate": 4.981637244414036e-05, + "loss": 6.1418, + "step": 6492 + }, + { + "epoch": 0.03861571034351508, + "grad_norm": 2.032304286956787, + "learning_rate": 4.981631593004725e-05, + "loss": 6.2104, + "step": 6493 + }, + { + "epoch": 0.03862165762679608, + "grad_norm": 2.0174217224121094, + "learning_rate": 4.981625940729102e-05, + "loss": 5.9861, + "step": 6494 + }, + { + "epoch": 0.03862760491007708, + "grad_norm": 1.9466323852539062, + "learning_rate": 4.98162028758717e-05, + "loss": 6.0958, + "step": 6495 + }, + { + "epoch": 0.038633552193358074, + "grad_norm": 1.6796106100082397, + "learning_rate": 4.9816146335789296e-05, + "loss": 6.0708, + "step": 6496 + }, + { + "epoch": 0.03863949947663907, + "grad_norm": 2.0496580600738525, + "learning_rate": 4.9816089787043826e-05, + "loss": 6.0137, + "step": 6497 + }, + { + "epoch": 0.03864544675992007, + "grad_norm": 2.5402488708496094, + "learning_rate": 4.9816033229635324e-05, + "loss": 6.1389, + "step": 6498 + }, + { + "epoch": 0.038651394043201066, + "grad_norm": 2.2701938152313232, + "learning_rate": 4.9815976663563795e-05, + "loss": 6.1277, + "step": 6499 + }, + { + "epoch": 0.03865734132648206, + "grad_norm": 2.328554630279541, + "learning_rate": 4.9815920088829273e-05, + "loss": 6.0402, + "step": 6500 + }, + { + "epoch": 0.038663288609763063, + "grad_norm": 2.1817965507507324, + "learning_rate": 4.981586350543176e-05, + "loss": 6.2732, + "step": 6501 + }, + { + "epoch": 0.03866923589304406, + "grad_norm": 2.4273757934570312, + "learning_rate": 4.981580691337129e-05, + "loss": 6.1842, + "step": 6502 + }, + { + "epoch": 0.038675183176325054, + "grad_norm": 2.1365530490875244, + "learning_rate": 4.981575031264787e-05, + "loss": 6.1527, + "step": 6503 + }, + { + "epoch": 0.03868113045960605, + "grad_norm": 2.2198991775512695, + "learning_rate": 4.981569370326154e-05, + "loss": 6.0841, + "step": 6504 + }, + { + "epoch": 0.03868707774288705, + "grad_norm": 2.0078141689300537, + "learning_rate": 4.98156370852123e-05, + "loss": 6.0401, + "step": 6505 + }, + { + "epoch": 0.038693025026168046, + "grad_norm": 2.0243566036224365, + "learning_rate": 4.9815580458500184e-05, + "loss": 5.9111, + "step": 6506 + }, + { + "epoch": 0.03869897230944904, + "grad_norm": 2.3084707260131836, + "learning_rate": 4.98155238231252e-05, + "loss": 5.9865, + "step": 6507 + }, + { + "epoch": 0.03870491959273004, + "grad_norm": 1.8110517263412476, + "learning_rate": 4.981546717908738e-05, + "loss": 5.9132, + "step": 6508 + }, + { + "epoch": 0.03871086687601104, + "grad_norm": 2.2639706134796143, + "learning_rate": 4.981541052638673e-05, + "loss": 5.8195, + "step": 6509 + }, + { + "epoch": 0.03871681415929203, + "grad_norm": 2.2684152126312256, + "learning_rate": 4.981535386502327e-05, + "loss": 6.4894, + "step": 6510 + }, + { + "epoch": 0.038722761442573035, + "grad_norm": 2.363118886947632, + "learning_rate": 4.981529719499704e-05, + "loss": 6.1888, + "step": 6511 + }, + { + "epoch": 0.03872870872585403, + "grad_norm": 2.2158865928649902, + "learning_rate": 4.9815240516308045e-05, + "loss": 6.3361, + "step": 6512 + }, + { + "epoch": 0.038734656009135025, + "grad_norm": 2.096928834915161, + "learning_rate": 4.98151838289563e-05, + "loss": 5.8554, + "step": 6513 + }, + { + "epoch": 0.03874060329241603, + "grad_norm": 2.2228331565856934, + "learning_rate": 4.981512713294183e-05, + "loss": 5.9961, + "step": 6514 + }, + { + "epoch": 0.03874655057569702, + "grad_norm": 1.8646903038024902, + "learning_rate": 4.981507042826466e-05, + "loss": 6.1471, + "step": 6515 + }, + { + "epoch": 0.03875249785897802, + "grad_norm": 2.227267265319824, + "learning_rate": 4.98150137149248e-05, + "loss": 5.9655, + "step": 6516 + }, + { + "epoch": 0.03875844514225902, + "grad_norm": 2.6884701251983643, + "learning_rate": 4.981495699292228e-05, + "loss": 5.7958, + "step": 6517 + }, + { + "epoch": 0.038764392425540015, + "grad_norm": 2.953523635864258, + "learning_rate": 4.981490026225711e-05, + "loss": 5.8305, + "step": 6518 + }, + { + "epoch": 0.03877033970882101, + "grad_norm": 2.5009984970092773, + "learning_rate": 4.981484352292932e-05, + "loss": 5.7838, + "step": 6519 + }, + { + "epoch": 0.038776286992102005, + "grad_norm": 2.2291715145111084, + "learning_rate": 4.981478677493892e-05, + "loss": 5.7622, + "step": 6520 + }, + { + "epoch": 0.03878223427538301, + "grad_norm": 2.1492466926574707, + "learning_rate": 4.9814730018285935e-05, + "loss": 5.5379, + "step": 6521 + }, + { + "epoch": 0.038788181558664, + "grad_norm": 1.8914062976837158, + "learning_rate": 4.981467325297039e-05, + "loss": 5.8368, + "step": 6522 + }, + { + "epoch": 0.038794128841945, + "grad_norm": 2.301670789718628, + "learning_rate": 4.981461647899229e-05, + "loss": 5.9019, + "step": 6523 + }, + { + "epoch": 0.038800076125226, + "grad_norm": 2.2850520610809326, + "learning_rate": 4.981455969635167e-05, + "loss": 5.6616, + "step": 6524 + }, + { + "epoch": 0.038806023408506994, + "grad_norm": 2.4155313968658447, + "learning_rate": 4.9814502905048546e-05, + "loss": 5.7842, + "step": 6525 + }, + { + "epoch": 0.03881197069178799, + "grad_norm": 2.0731799602508545, + "learning_rate": 4.981444610508293e-05, + "loss": 6.084, + "step": 6526 + }, + { + "epoch": 0.03881791797506899, + "grad_norm": 2.990232229232788, + "learning_rate": 4.981438929645484e-05, + "loss": 5.2556, + "step": 6527 + }, + { + "epoch": 0.038823865258349986, + "grad_norm": 3.0814263820648193, + "learning_rate": 4.981433247916432e-05, + "loss": 5.1895, + "step": 6528 + }, + { + "epoch": 0.03882981254163098, + "grad_norm": 3.197000503540039, + "learning_rate": 4.9814275653211365e-05, + "loss": 4.9539, + "step": 6529 + }, + { + "epoch": 0.03883575982491198, + "grad_norm": 3.062098979949951, + "learning_rate": 4.9814218818596e-05, + "loss": 4.8417, + "step": 6530 + }, + { + "epoch": 0.03884170710819298, + "grad_norm": 3.092667579650879, + "learning_rate": 4.981416197531825e-05, + "loss": 5.0479, + "step": 6531 + }, + { + "epoch": 0.038847654391473974, + "grad_norm": 3.00508713722229, + "learning_rate": 4.981410512337813e-05, + "loss": 5.864, + "step": 6532 + }, + { + "epoch": 0.03885360167475497, + "grad_norm": 3.3760926723480225, + "learning_rate": 4.981404826277567e-05, + "loss": 6.5745, + "step": 6533 + }, + { + "epoch": 0.03885954895803597, + "grad_norm": 2.6170921325683594, + "learning_rate": 4.981399139351087e-05, + "loss": 5.7959, + "step": 6534 + }, + { + "epoch": 0.038865496241316966, + "grad_norm": 2.9855849742889404, + "learning_rate": 4.981393451558377e-05, + "loss": 4.9118, + "step": 6535 + }, + { + "epoch": 0.03887144352459796, + "grad_norm": 2.885373830795288, + "learning_rate": 4.981387762899438e-05, + "loss": 4.8342, + "step": 6536 + }, + { + "epoch": 0.03887739080787896, + "grad_norm": 2.6936960220336914, + "learning_rate": 4.981382073374272e-05, + "loss": 4.7323, + "step": 6537 + }, + { + "epoch": 0.03888333809115996, + "grad_norm": 2.7214853763580322, + "learning_rate": 4.981376382982882e-05, + "loss": 5.5414, + "step": 6538 + }, + { + "epoch": 0.03888928537444095, + "grad_norm": 2.449828863143921, + "learning_rate": 4.981370691725269e-05, + "loss": 5.6385, + "step": 6539 + }, + { + "epoch": 0.038895232657721955, + "grad_norm": 2.551046133041382, + "learning_rate": 4.981364999601434e-05, + "loss": 5.4699, + "step": 6540 + }, + { + "epoch": 0.03890117994100295, + "grad_norm": 2.1208136081695557, + "learning_rate": 4.981359306611381e-05, + "loss": 5.6674, + "step": 6541 + }, + { + "epoch": 0.038907127224283945, + "grad_norm": 2.4039392471313477, + "learning_rate": 4.9813536127551105e-05, + "loss": 6.1872, + "step": 6542 + }, + { + "epoch": 0.03891307450756495, + "grad_norm": 2.0119946002960205, + "learning_rate": 4.9813479180326256e-05, + "loss": 6.0917, + "step": 6543 + }, + { + "epoch": 0.03891902179084594, + "grad_norm": 3.2959303855895996, + "learning_rate": 4.9813422224439275e-05, + "loss": 5.5646, + "step": 6544 + }, + { + "epoch": 0.03892496907412694, + "grad_norm": 2.9011316299438477, + "learning_rate": 4.981336525989019e-05, + "loss": 5.5324, + "step": 6545 + }, + { + "epoch": 0.03893091635740794, + "grad_norm": 2.2984118461608887, + "learning_rate": 4.981330828667901e-05, + "loss": 5.4961, + "step": 6546 + }, + { + "epoch": 0.038936863640688935, + "grad_norm": 2.1745059490203857, + "learning_rate": 4.981325130480576e-05, + "loss": 5.6631, + "step": 6547 + }, + { + "epoch": 0.03894281092396993, + "grad_norm": 2.3001794815063477, + "learning_rate": 4.981319431427046e-05, + "loss": 5.5897, + "step": 6548 + }, + { + "epoch": 0.038948758207250925, + "grad_norm": 2.329446315765381, + "learning_rate": 4.9813137315073136e-05, + "loss": 5.4599, + "step": 6549 + }, + { + "epoch": 0.03895470549053193, + "grad_norm": 2.4700307846069336, + "learning_rate": 4.98130803072138e-05, + "loss": 5.2788, + "step": 6550 + }, + { + "epoch": 0.03896065277381292, + "grad_norm": 2.309767484664917, + "learning_rate": 4.9813023290692467e-05, + "loss": 5.3828, + "step": 6551 + }, + { + "epoch": 0.03896660005709392, + "grad_norm": 2.1923089027404785, + "learning_rate": 4.981296626550917e-05, + "loss": 5.225, + "step": 6552 + }, + { + "epoch": 0.03897254734037492, + "grad_norm": 2.424954652786255, + "learning_rate": 4.981290923166392e-05, + "loss": 5.2007, + "step": 6553 + }, + { + "epoch": 0.038978494623655914, + "grad_norm": 2.53446102142334, + "learning_rate": 4.981285218915674e-05, + "loss": 5.142, + "step": 6554 + }, + { + "epoch": 0.03898444190693691, + "grad_norm": 2.492788791656494, + "learning_rate": 4.9812795137987655e-05, + "loss": 5.5755, + "step": 6555 + }, + { + "epoch": 0.03899038919021791, + "grad_norm": 2.8081278800964355, + "learning_rate": 4.9812738078156674e-05, + "loss": 4.9815, + "step": 6556 + }, + { + "epoch": 0.038996336473498906, + "grad_norm": 2.535109758377075, + "learning_rate": 4.981268100966383e-05, + "loss": 5.3678, + "step": 6557 + }, + { + "epoch": 0.0390022837567799, + "grad_norm": 2.36004900932312, + "learning_rate": 4.981262393250913e-05, + "loss": 5.0422, + "step": 6558 + }, + { + "epoch": 0.0390082310400609, + "grad_norm": 2.2315657138824463, + "learning_rate": 4.98125668466926e-05, + "loss": 5.0345, + "step": 6559 + }, + { + "epoch": 0.0390141783233419, + "grad_norm": 2.293947696685791, + "learning_rate": 4.981250975221425e-05, + "loss": 4.9308, + "step": 6560 + }, + { + "epoch": 0.039020125606622894, + "grad_norm": 2.239915132522583, + "learning_rate": 4.9812452649074124e-05, + "loss": 5.3504, + "step": 6561 + }, + { + "epoch": 0.03902607288990389, + "grad_norm": 1.8740140199661255, + "learning_rate": 4.981239553727222e-05, + "loss": 5.9432, + "step": 6562 + }, + { + "epoch": 0.03903202017318489, + "grad_norm": 1.7221744060516357, + "learning_rate": 4.981233841680857e-05, + "loss": 5.8387, + "step": 6563 + }, + { + "epoch": 0.039037967456465886, + "grad_norm": 1.9648221731185913, + "learning_rate": 4.981228128768318e-05, + "loss": 5.7836, + "step": 6564 + }, + { + "epoch": 0.03904391473974688, + "grad_norm": 1.7790826559066772, + "learning_rate": 4.981222414989608e-05, + "loss": 5.842, + "step": 6565 + }, + { + "epoch": 0.03904986202302788, + "grad_norm": 2.039483070373535, + "learning_rate": 4.9812167003447296e-05, + "loss": 5.6509, + "step": 6566 + }, + { + "epoch": 0.03905580930630888, + "grad_norm": 2.1241865158081055, + "learning_rate": 4.981210984833684e-05, + "loss": 5.5626, + "step": 6567 + }, + { + "epoch": 0.03906175658958987, + "grad_norm": 2.1290524005889893, + "learning_rate": 4.981205268456473e-05, + "loss": 5.5114, + "step": 6568 + }, + { + "epoch": 0.039067703872870875, + "grad_norm": 2.181558132171631, + "learning_rate": 4.981199551213099e-05, + "loss": 5.5356, + "step": 6569 + }, + { + "epoch": 0.03907365115615187, + "grad_norm": 2.1696360111236572, + "learning_rate": 4.9811938331035635e-05, + "loss": 5.5684, + "step": 6570 + }, + { + "epoch": 0.039079598439432865, + "grad_norm": 1.8040674924850464, + "learning_rate": 4.98118811412787e-05, + "loss": 5.605, + "step": 6571 + }, + { + "epoch": 0.03908554572271387, + "grad_norm": 2.4475252628326416, + "learning_rate": 4.981182394286018e-05, + "loss": 6.4733, + "step": 6572 + }, + { + "epoch": 0.03909149300599486, + "grad_norm": 2.0800678730010986, + "learning_rate": 4.981176673578011e-05, + "loss": 5.5613, + "step": 6573 + }, + { + "epoch": 0.03909744028927586, + "grad_norm": 1.7632306814193726, + "learning_rate": 4.981170952003852e-05, + "loss": 5.5971, + "step": 6574 + }, + { + "epoch": 0.03910338757255686, + "grad_norm": 1.6671072244644165, + "learning_rate": 4.981165229563541e-05, + "loss": 5.4462, + "step": 6575 + }, + { + "epoch": 0.039109334855837855, + "grad_norm": 1.8972923755645752, + "learning_rate": 4.981159506257081e-05, + "loss": 5.7747, + "step": 6576 + }, + { + "epoch": 0.03911528213911885, + "grad_norm": 1.8343021869659424, + "learning_rate": 4.981153782084473e-05, + "loss": 5.7542, + "step": 6577 + }, + { + "epoch": 0.039121229422399845, + "grad_norm": 1.669877529144287, + "learning_rate": 4.9811480570457216e-05, + "loss": 5.6736, + "step": 6578 + }, + { + "epoch": 0.03912717670568085, + "grad_norm": 1.9555165767669678, + "learning_rate": 4.981142331140825e-05, + "loss": 5.2997, + "step": 6579 + }, + { + "epoch": 0.03913312398896184, + "grad_norm": 2.5131587982177734, + "learning_rate": 4.981136604369789e-05, + "loss": 5.2093, + "step": 6580 + }, + { + "epoch": 0.03913907127224284, + "grad_norm": 2.0637567043304443, + "learning_rate": 4.9811308767326134e-05, + "loss": 5.1671, + "step": 6581 + }, + { + "epoch": 0.03914501855552384, + "grad_norm": 2.140839099884033, + "learning_rate": 4.9811251482293e-05, + "loss": 5.3237, + "step": 6582 + }, + { + "epoch": 0.039150965838804834, + "grad_norm": 1.968489408493042, + "learning_rate": 4.981119418859852e-05, + "loss": 5.6015, + "step": 6583 + }, + { + "epoch": 0.03915691312208583, + "grad_norm": 1.873827338218689, + "learning_rate": 4.9811136886242705e-05, + "loss": 5.3316, + "step": 6584 + }, + { + "epoch": 0.03916286040536683, + "grad_norm": 1.9897359609603882, + "learning_rate": 4.981107957522558e-05, + "loss": 5.1548, + "step": 6585 + }, + { + "epoch": 0.039168807688647826, + "grad_norm": 2.004457950592041, + "learning_rate": 4.9811022255547165e-05, + "loss": 5.1977, + "step": 6586 + }, + { + "epoch": 0.03917475497192882, + "grad_norm": 2.1058437824249268, + "learning_rate": 4.9810964927207485e-05, + "loss": 5.0217, + "step": 6587 + }, + { + "epoch": 0.03918070225520982, + "grad_norm": 1.9846851825714111, + "learning_rate": 4.981090759020654e-05, + "loss": 5.1123, + "step": 6588 + }, + { + "epoch": 0.03918664953849082, + "grad_norm": 2.018026828765869, + "learning_rate": 4.981085024454437e-05, + "loss": 5.0516, + "step": 6589 + }, + { + "epoch": 0.039192596821771813, + "grad_norm": 1.7792260646820068, + "learning_rate": 4.9810792890220995e-05, + "loss": 5.5266, + "step": 6590 + }, + { + "epoch": 0.03919854410505281, + "grad_norm": 2.0855109691619873, + "learning_rate": 4.981073552723642e-05, + "loss": 5.5504, + "step": 6591 + }, + { + "epoch": 0.03920449138833381, + "grad_norm": 1.9998018741607666, + "learning_rate": 4.9810678155590676e-05, + "loss": 5.3447, + "step": 6592 + }, + { + "epoch": 0.039210438671614806, + "grad_norm": 2.332714557647705, + "learning_rate": 4.981062077528377e-05, + "loss": 5.6166, + "step": 6593 + }, + { + "epoch": 0.0392163859548958, + "grad_norm": 1.9647892713546753, + "learning_rate": 4.981056338631575e-05, + "loss": 5.0113, + "step": 6594 + }, + { + "epoch": 0.0392223332381768, + "grad_norm": 1.9961154460906982, + "learning_rate": 4.9810505988686604e-05, + "loss": 5.0143, + "step": 6595 + }, + { + "epoch": 0.0392282805214578, + "grad_norm": 1.9039133787155151, + "learning_rate": 4.981044858239637e-05, + "loss": 5.3602, + "step": 6596 + }, + { + "epoch": 0.03923422780473879, + "grad_norm": 1.9076604843139648, + "learning_rate": 4.981039116744507e-05, + "loss": 5.4165, + "step": 6597 + }, + { + "epoch": 0.039240175088019795, + "grad_norm": 1.6676216125488281, + "learning_rate": 4.981033374383272e-05, + "loss": 5.4018, + "step": 6598 + }, + { + "epoch": 0.03924612237130079, + "grad_norm": 1.7158783674240112, + "learning_rate": 4.981027631155933e-05, + "loss": 5.3233, + "step": 6599 + }, + { + "epoch": 0.039252069654581785, + "grad_norm": 1.6659481525421143, + "learning_rate": 4.9810218870624945e-05, + "loss": 5.4671, + "step": 6600 + }, + { + "epoch": 0.03925801693786279, + "grad_norm": 2.008171319961548, + "learning_rate": 4.981016142102956e-05, + "loss": 5.6424, + "step": 6601 + }, + { + "epoch": 0.03926396422114378, + "grad_norm": 2.213045835494995, + "learning_rate": 4.9810103962773204e-05, + "loss": 5.419, + "step": 6602 + }, + { + "epoch": 0.03926991150442478, + "grad_norm": 2.0159718990325928, + "learning_rate": 4.981004649585589e-05, + "loss": 5.4301, + "step": 6603 + }, + { + "epoch": 0.03927585878770578, + "grad_norm": 1.982701063156128, + "learning_rate": 4.9809989020277646e-05, + "loss": 5.6001, + "step": 6604 + }, + { + "epoch": 0.039281806070986774, + "grad_norm": 2.1933834552764893, + "learning_rate": 4.98099315360385e-05, + "loss": 5.6756, + "step": 6605 + }, + { + "epoch": 0.03928775335426777, + "grad_norm": 1.858798623085022, + "learning_rate": 4.980987404313846e-05, + "loss": 5.43, + "step": 6606 + }, + { + "epoch": 0.039293700637548765, + "grad_norm": 1.8233433961868286, + "learning_rate": 4.980981654157755e-05, + "loss": 5.4638, + "step": 6607 + }, + { + "epoch": 0.03929964792082977, + "grad_norm": 2.0368216037750244, + "learning_rate": 4.9809759031355784e-05, + "loss": 5.71, + "step": 6608 + }, + { + "epoch": 0.03930559520411076, + "grad_norm": 1.9923310279846191, + "learning_rate": 4.9809701512473196e-05, + "loss": 5.6443, + "step": 6609 + }, + { + "epoch": 0.03931154248739176, + "grad_norm": 2.391463279724121, + "learning_rate": 4.9809643984929785e-05, + "loss": 5.4701, + "step": 6610 + }, + { + "epoch": 0.03931748977067276, + "grad_norm": 1.8456658124923706, + "learning_rate": 4.98095864487256e-05, + "loss": 5.4346, + "step": 6611 + }, + { + "epoch": 0.039323437053953754, + "grad_norm": 1.7941107749938965, + "learning_rate": 4.980952890386063e-05, + "loss": 5.4198, + "step": 6612 + }, + { + "epoch": 0.03932938433723475, + "grad_norm": 1.8455369472503662, + "learning_rate": 4.980947135033492e-05, + "loss": 5.3915, + "step": 6613 + }, + { + "epoch": 0.03933533162051575, + "grad_norm": 1.8710846900939941, + "learning_rate": 4.980941378814847e-05, + "loss": 5.2744, + "step": 6614 + }, + { + "epoch": 0.039341278903796746, + "grad_norm": 2.203129768371582, + "learning_rate": 4.980935621730132e-05, + "loss": 5.4409, + "step": 6615 + }, + { + "epoch": 0.03934722618707774, + "grad_norm": 1.8944141864776611, + "learning_rate": 4.980929863779348e-05, + "loss": 5.4661, + "step": 6616 + }, + { + "epoch": 0.03935317347035874, + "grad_norm": 1.8268091678619385, + "learning_rate": 4.9809241049624966e-05, + "loss": 5.4088, + "step": 6617 + }, + { + "epoch": 0.03935912075363974, + "grad_norm": 1.838927984237671, + "learning_rate": 4.98091834527958e-05, + "loss": 5.5335, + "step": 6618 + }, + { + "epoch": 0.03936506803692073, + "grad_norm": 1.8441804647445679, + "learning_rate": 4.9809125847306e-05, + "loss": 5.4639, + "step": 6619 + }, + { + "epoch": 0.03937101532020173, + "grad_norm": 2.012754440307617, + "learning_rate": 4.980906823315561e-05, + "loss": 5.5606, + "step": 6620 + }, + { + "epoch": 0.03937696260348273, + "grad_norm": 1.8358973264694214, + "learning_rate": 4.980901061034461e-05, + "loss": 5.4217, + "step": 6621 + }, + { + "epoch": 0.039382909886763726, + "grad_norm": 2.0668959617614746, + "learning_rate": 4.980895297887305e-05, + "loss": 5.5164, + "step": 6622 + }, + { + "epoch": 0.03938885717004472, + "grad_norm": 2.032320976257324, + "learning_rate": 4.9808895338740934e-05, + "loss": 5.4914, + "step": 6623 + }, + { + "epoch": 0.03939480445332572, + "grad_norm": 1.8650145530700684, + "learning_rate": 4.980883768994829e-05, + "loss": 5.3718, + "step": 6624 + }, + { + "epoch": 0.03940075173660672, + "grad_norm": 4.494358539581299, + "learning_rate": 4.980878003249515e-05, + "loss": 5.5253, + "step": 6625 + }, + { + "epoch": 0.03940669901988771, + "grad_norm": 1.9295374155044556, + "learning_rate": 4.980872236638151e-05, + "loss": 5.3187, + "step": 6626 + }, + { + "epoch": 0.039412646303168715, + "grad_norm": 2.089717388153076, + "learning_rate": 4.980866469160741e-05, + "loss": 5.5311, + "step": 6627 + }, + { + "epoch": 0.03941859358644971, + "grad_norm": 1.701429843902588, + "learning_rate": 4.980860700817285e-05, + "loss": 5.4529, + "step": 6628 + }, + { + "epoch": 0.039424540869730705, + "grad_norm": 1.8336073160171509, + "learning_rate": 4.980854931607787e-05, + "loss": 5.2987, + "step": 6629 + }, + { + "epoch": 0.03943048815301171, + "grad_norm": 2.7922565937042236, + "learning_rate": 4.9808491615322475e-05, + "loss": 5.3492, + "step": 6630 + }, + { + "epoch": 0.0394364354362927, + "grad_norm": 1.8253742456436157, + "learning_rate": 4.980843390590669e-05, + "loss": 5.3928, + "step": 6631 + }, + { + "epoch": 0.0394423827195737, + "grad_norm": 2.646916151046753, + "learning_rate": 4.980837618783055e-05, + "loss": 5.4329, + "step": 6632 + }, + { + "epoch": 0.0394483300028547, + "grad_norm": 2.1956236362457275, + "learning_rate": 4.980831846109405e-05, + "loss": 5.4794, + "step": 6633 + }, + { + "epoch": 0.039454277286135694, + "grad_norm": 2.7274577617645264, + "learning_rate": 4.980826072569723e-05, + "loss": 5.9666, + "step": 6634 + }, + { + "epoch": 0.03946022456941669, + "grad_norm": 1.9890350103378296, + "learning_rate": 4.98082029816401e-05, + "loss": 5.5518, + "step": 6635 + }, + { + "epoch": 0.039466171852697685, + "grad_norm": 2.7760517597198486, + "learning_rate": 4.980814522892268e-05, + "loss": 5.2777, + "step": 6636 + }, + { + "epoch": 0.03947211913597869, + "grad_norm": 2.035254716873169, + "learning_rate": 4.9808087467544995e-05, + "loss": 5.5872, + "step": 6637 + }, + { + "epoch": 0.03947806641925968, + "grad_norm": 1.9728864431381226, + "learning_rate": 4.980802969750706e-05, + "loss": 5.3357, + "step": 6638 + }, + { + "epoch": 0.03948401370254068, + "grad_norm": 1.795480489730835, + "learning_rate": 4.98079719188089e-05, + "loss": 5.6414, + "step": 6639 + }, + { + "epoch": 0.03948996098582168, + "grad_norm": 1.7882109880447388, + "learning_rate": 4.980791413145054e-05, + "loss": 5.3499, + "step": 6640 + }, + { + "epoch": 0.039495908269102674, + "grad_norm": 1.8416422605514526, + "learning_rate": 4.9807856335431994e-05, + "loss": 5.3292, + "step": 6641 + }, + { + "epoch": 0.03950185555238367, + "grad_norm": 1.9525254964828491, + "learning_rate": 4.9807798530753266e-05, + "loss": 5.2782, + "step": 6642 + }, + { + "epoch": 0.03950780283566467, + "grad_norm": 1.5100830793380737, + "learning_rate": 4.9807740717414406e-05, + "loss": 5.2807, + "step": 6643 + }, + { + "epoch": 0.039513750118945666, + "grad_norm": 2.029430866241455, + "learning_rate": 4.9807682895415406e-05, + "loss": 5.4496, + "step": 6644 + }, + { + "epoch": 0.03951969740222666, + "grad_norm": 1.7976901531219482, + "learning_rate": 4.9807625064756315e-05, + "loss": 5.1021, + "step": 6645 + }, + { + "epoch": 0.03952564468550766, + "grad_norm": 1.5770336389541626, + "learning_rate": 4.980756722543714e-05, + "loss": 5.3946, + "step": 6646 + }, + { + "epoch": 0.03953159196878866, + "grad_norm": 1.8289496898651123, + "learning_rate": 4.980750937745788e-05, + "loss": 5.4821, + "step": 6647 + }, + { + "epoch": 0.03953753925206965, + "grad_norm": 1.7413506507873535, + "learning_rate": 4.980745152081859e-05, + "loss": 5.4827, + "step": 6648 + }, + { + "epoch": 0.03954348653535065, + "grad_norm": 2.048400402069092, + "learning_rate": 4.980739365551927e-05, + "loss": 5.2359, + "step": 6649 + }, + { + "epoch": 0.03954943381863165, + "grad_norm": 2.331897735595703, + "learning_rate": 4.980733578155995e-05, + "loss": 5.2988, + "step": 6650 + }, + { + "epoch": 0.039555381101912646, + "grad_norm": 2.1224608421325684, + "learning_rate": 4.980727789894065e-05, + "loss": 5.1228, + "step": 6651 + }, + { + "epoch": 0.03956132838519364, + "grad_norm": 1.5331578254699707, + "learning_rate": 4.9807220007661374e-05, + "loss": 5.184, + "step": 6652 + }, + { + "epoch": 0.03956727566847464, + "grad_norm": 1.773489236831665, + "learning_rate": 4.980716210772216e-05, + "loss": 5.1883, + "step": 6653 + }, + { + "epoch": 0.03957322295175564, + "grad_norm": 2.119302749633789, + "learning_rate": 4.9807104199123016e-05, + "loss": 5.5437, + "step": 6654 + }, + { + "epoch": 0.03957917023503663, + "grad_norm": 2.0695033073425293, + "learning_rate": 4.9807046281863974e-05, + "loss": 5.5951, + "step": 6655 + }, + { + "epoch": 0.039585117518317635, + "grad_norm": 2.0522243976593018, + "learning_rate": 4.980698835594505e-05, + "loss": 5.2736, + "step": 6656 + }, + { + "epoch": 0.03959106480159863, + "grad_norm": 2.3200113773345947, + "learning_rate": 4.980693042136626e-05, + "loss": 5.5701, + "step": 6657 + }, + { + "epoch": 0.039597012084879625, + "grad_norm": 1.8731193542480469, + "learning_rate": 4.980687247812762e-05, + "loss": 5.3929, + "step": 6658 + }, + { + "epoch": 0.03960295936816063, + "grad_norm": 1.8390223979949951, + "learning_rate": 4.980681452622916e-05, + "loss": 5.1684, + "step": 6659 + }, + { + "epoch": 0.03960890665144162, + "grad_norm": 2.24766206741333, + "learning_rate": 4.980675656567091e-05, + "loss": 5.0232, + "step": 6660 + }, + { + "epoch": 0.03961485393472262, + "grad_norm": 2.2592451572418213, + "learning_rate": 4.980669859645286e-05, + "loss": 4.9878, + "step": 6661 + }, + { + "epoch": 0.03962080121800362, + "grad_norm": 2.14709734916687, + "learning_rate": 4.9806640618575064e-05, + "loss": 5.1036, + "step": 6662 + }, + { + "epoch": 0.039626748501284614, + "grad_norm": 2.133910655975342, + "learning_rate": 4.9806582632037516e-05, + "loss": 5.0356, + "step": 6663 + }, + { + "epoch": 0.03963269578456561, + "grad_norm": 2.2513222694396973, + "learning_rate": 4.980652463684025e-05, + "loss": 5.2357, + "step": 6664 + }, + { + "epoch": 0.039638643067846605, + "grad_norm": 2.078355312347412, + "learning_rate": 4.980646663298328e-05, + "loss": 5.3857, + "step": 6665 + }, + { + "epoch": 0.03964459035112761, + "grad_norm": 2.3798105716705322, + "learning_rate": 4.980640862046663e-05, + "loss": 5.0888, + "step": 6666 + }, + { + "epoch": 0.0396505376344086, + "grad_norm": 2.241868019104004, + "learning_rate": 4.980635059929032e-05, + "loss": 5.1397, + "step": 6667 + }, + { + "epoch": 0.0396564849176896, + "grad_norm": 2.2053534984588623, + "learning_rate": 4.9806292569454365e-05, + "loss": 4.799, + "step": 6668 + }, + { + "epoch": 0.0396624322009706, + "grad_norm": 2.2996716499328613, + "learning_rate": 4.980623453095879e-05, + "loss": 4.9597, + "step": 6669 + }, + { + "epoch": 0.039668379484251594, + "grad_norm": 1.9892657995224, + "learning_rate": 4.9806176483803615e-05, + "loss": 5.0784, + "step": 6670 + }, + { + "epoch": 0.03967432676753259, + "grad_norm": 2.2087242603302, + "learning_rate": 4.980611842798887e-05, + "loss": 5.4099, + "step": 6671 + }, + { + "epoch": 0.03968027405081359, + "grad_norm": 2.215728521347046, + "learning_rate": 4.980606036351455e-05, + "loss": 5.2889, + "step": 6672 + }, + { + "epoch": 0.039686221334094586, + "grad_norm": 2.228073835372925, + "learning_rate": 4.9806002290380705e-05, + "loss": 5.3816, + "step": 6673 + }, + { + "epoch": 0.03969216861737558, + "grad_norm": 2.209808826446533, + "learning_rate": 4.980594420858733e-05, + "loss": 5.6233, + "step": 6674 + }, + { + "epoch": 0.03969811590065658, + "grad_norm": 1.8294177055358887, + "learning_rate": 4.980588611813446e-05, + "loss": 5.5756, + "step": 6675 + }, + { + "epoch": 0.03970406318393758, + "grad_norm": 2.236435890197754, + "learning_rate": 4.980582801902212e-05, + "loss": 5.4807, + "step": 6676 + }, + { + "epoch": 0.03971001046721857, + "grad_norm": 2.528804063796997, + "learning_rate": 4.980576991125031e-05, + "loss": 5.6503, + "step": 6677 + }, + { + "epoch": 0.03971595775049957, + "grad_norm": 2.312063217163086, + "learning_rate": 4.9805711794819065e-05, + "loss": 5.5517, + "step": 6678 + }, + { + "epoch": 0.03972190503378057, + "grad_norm": 2.336134672164917, + "learning_rate": 4.98056536697284e-05, + "loss": 5.5708, + "step": 6679 + }, + { + "epoch": 0.039727852317061566, + "grad_norm": 2.2809929847717285, + "learning_rate": 4.980559553597834e-05, + "loss": 5.453, + "step": 6680 + }, + { + "epoch": 0.03973379960034256, + "grad_norm": 2.0603368282318115, + "learning_rate": 4.98055373935689e-05, + "loss": 5.3482, + "step": 6681 + }, + { + "epoch": 0.03973974688362356, + "grad_norm": 1.9654933214187622, + "learning_rate": 4.980547924250011e-05, + "loss": 5.29, + "step": 6682 + }, + { + "epoch": 0.03974569416690456, + "grad_norm": 2.4211983680725098, + "learning_rate": 4.9805421082771985e-05, + "loss": 5.4261, + "step": 6683 + }, + { + "epoch": 0.03975164145018555, + "grad_norm": 2.129987955093384, + "learning_rate": 4.9805362914384533e-05, + "loss": 5.3551, + "step": 6684 + }, + { + "epoch": 0.039757588733466555, + "grad_norm": 2.127936601638794, + "learning_rate": 4.9805304737337796e-05, + "loss": 5.4647, + "step": 6685 + }, + { + "epoch": 0.03976353601674755, + "grad_norm": 2.303382158279419, + "learning_rate": 4.980524655163178e-05, + "loss": 5.1699, + "step": 6686 + }, + { + "epoch": 0.039769483300028545, + "grad_norm": 2.6889941692352295, + "learning_rate": 4.98051883572665e-05, + "loss": 5.2031, + "step": 6687 + }, + { + "epoch": 0.03977543058330955, + "grad_norm": 3.321950674057007, + "learning_rate": 4.9805130154242e-05, + "loss": 4.9815, + "step": 6688 + }, + { + "epoch": 0.03978137786659054, + "grad_norm": 3.1951568126678467, + "learning_rate": 4.980507194255827e-05, + "loss": 4.8946, + "step": 6689 + }, + { + "epoch": 0.03978732514987154, + "grad_norm": 2.355271816253662, + "learning_rate": 4.9805013722215355e-05, + "loss": 5.9223, + "step": 6690 + }, + { + "epoch": 0.03979327243315254, + "grad_norm": 2.3401644229888916, + "learning_rate": 4.9804955493213264e-05, + "loss": 6.1826, + "step": 6691 + }, + { + "epoch": 0.039799219716433534, + "grad_norm": 2.191997766494751, + "learning_rate": 4.980489725555202e-05, + "loss": 5.5617, + "step": 6692 + }, + { + "epoch": 0.03980516699971453, + "grad_norm": 2.377803087234497, + "learning_rate": 4.9804839009231644e-05, + "loss": 5.684, + "step": 6693 + }, + { + "epoch": 0.039811114282995524, + "grad_norm": 1.9084972143173218, + "learning_rate": 4.980478075425215e-05, + "loss": 6.0291, + "step": 6694 + }, + { + "epoch": 0.039817061566276526, + "grad_norm": 2.185628890991211, + "learning_rate": 4.9804722490613566e-05, + "loss": 5.5808, + "step": 6695 + }, + { + "epoch": 0.03982300884955752, + "grad_norm": 2.3253934383392334, + "learning_rate": 4.980466421831591e-05, + "loss": 5.7076, + "step": 6696 + }, + { + "epoch": 0.03982895613283852, + "grad_norm": 2.1599392890930176, + "learning_rate": 4.98046059373592e-05, + "loss": 5.9607, + "step": 6697 + }, + { + "epoch": 0.03983490341611952, + "grad_norm": 2.093137741088867, + "learning_rate": 4.980454764774346e-05, + "loss": 6.0014, + "step": 6698 + }, + { + "epoch": 0.039840850699400514, + "grad_norm": 2.4242093563079834, + "learning_rate": 4.980448934946871e-05, + "loss": 5.6255, + "step": 6699 + }, + { + "epoch": 0.03984679798268151, + "grad_norm": 2.523277521133423, + "learning_rate": 4.980443104253497e-05, + "loss": 5.5302, + "step": 6700 + }, + { + "epoch": 0.03985274526596251, + "grad_norm": 1.7926498651504517, + "learning_rate": 4.980437272694225e-05, + "loss": 5.6467, + "step": 6701 + }, + { + "epoch": 0.039858692549243506, + "grad_norm": 1.7630435228347778, + "learning_rate": 4.980431440269059e-05, + "loss": 5.9615, + "step": 6702 + }, + { + "epoch": 0.0398646398325245, + "grad_norm": 1.8051058053970337, + "learning_rate": 4.980425606978e-05, + "loss": 6.13, + "step": 6703 + }, + { + "epoch": 0.0398705871158055, + "grad_norm": 2.104901075363159, + "learning_rate": 4.98041977282105e-05, + "loss": 6.142, + "step": 6704 + }, + { + "epoch": 0.0398765343990865, + "grad_norm": 1.7022942304611206, + "learning_rate": 4.98041393779821e-05, + "loss": 5.6764, + "step": 6705 + }, + { + "epoch": 0.03988248168236749, + "grad_norm": 2.140230178833008, + "learning_rate": 4.980408101909485e-05, + "loss": 5.9796, + "step": 6706 + }, + { + "epoch": 0.03988842896564849, + "grad_norm": 1.9564754962921143, + "learning_rate": 4.9804022651548734e-05, + "loss": 6.005, + "step": 6707 + }, + { + "epoch": 0.03989437624892949, + "grad_norm": 1.9460588693618774, + "learning_rate": 4.9803964275343795e-05, + "loss": 5.9784, + "step": 6708 + }, + { + "epoch": 0.039900323532210485, + "grad_norm": 1.7314271926879883, + "learning_rate": 4.980390589048005e-05, + "loss": 5.7766, + "step": 6709 + }, + { + "epoch": 0.03990627081549148, + "grad_norm": 2.0168917179107666, + "learning_rate": 4.9803847496957524e-05, + "loss": 5.7386, + "step": 6710 + }, + { + "epoch": 0.03991221809877248, + "grad_norm": 2.3194711208343506, + "learning_rate": 4.980378909477622e-05, + "loss": 6.1324, + "step": 6711 + }, + { + "epoch": 0.03991816538205348, + "grad_norm": 2.3532958030700684, + "learning_rate": 4.980373068393618e-05, + "loss": 6.027, + "step": 6712 + }, + { + "epoch": 0.03992411266533447, + "grad_norm": 2.5944385528564453, + "learning_rate": 4.980367226443741e-05, + "loss": 6.2892, + "step": 6713 + }, + { + "epoch": 0.039930059948615475, + "grad_norm": 1.5707015991210938, + "learning_rate": 4.9803613836279926e-05, + "loss": 5.6525, + "step": 6714 + }, + { + "epoch": 0.03993600723189647, + "grad_norm": 2.022613286972046, + "learning_rate": 4.980355539946376e-05, + "loss": 5.8943, + "step": 6715 + }, + { + "epoch": 0.039941954515177465, + "grad_norm": 1.7783907651901245, + "learning_rate": 4.980349695398894e-05, + "loss": 5.6451, + "step": 6716 + }, + { + "epoch": 0.03994790179845847, + "grad_norm": 2.098841428756714, + "learning_rate": 4.980343849985547e-05, + "loss": 6.1143, + "step": 6717 + }, + { + "epoch": 0.03995384908173946, + "grad_norm": 2.045955181121826, + "learning_rate": 4.9803380037063374e-05, + "loss": 6.1802, + "step": 6718 + }, + { + "epoch": 0.03995979636502046, + "grad_norm": 1.7324507236480713, + "learning_rate": 4.980332156561267e-05, + "loss": 6.081, + "step": 6719 + }, + { + "epoch": 0.03996574364830146, + "grad_norm": 1.795184850692749, + "learning_rate": 4.9803263085503385e-05, + "loss": 5.6075, + "step": 6720 + }, + { + "epoch": 0.039971690931582454, + "grad_norm": 2.1466586589813232, + "learning_rate": 4.980320459673554e-05, + "loss": 6.045, + "step": 6721 + }, + { + "epoch": 0.03997763821486345, + "grad_norm": 2.1261258125305176, + "learning_rate": 4.980314609930915e-05, + "loss": 6.0589, + "step": 6722 + }, + { + "epoch": 0.039983585498144444, + "grad_norm": 2.559584617614746, + "learning_rate": 4.980308759322424e-05, + "loss": 6.3894, + "step": 6723 + }, + { + "epoch": 0.039989532781425446, + "grad_norm": 2.4580929279327393, + "learning_rate": 4.980302907848083e-05, + "loss": 6.3979, + "step": 6724 + }, + { + "epoch": 0.03999548006470644, + "grad_norm": 1.8877859115600586, + "learning_rate": 4.9802970555078934e-05, + "loss": 5.5076, + "step": 6725 + }, + { + "epoch": 0.04000142734798744, + "grad_norm": 2.145123243331909, + "learning_rate": 4.9802912023018585e-05, + "loss": 6.1913, + "step": 6726 + }, + { + "epoch": 0.04000737463126844, + "grad_norm": 1.9321368932724, + "learning_rate": 4.980285348229979e-05, + "loss": 5.9614, + "step": 6727 + }, + { + "epoch": 0.040013321914549434, + "grad_norm": 1.883589506149292, + "learning_rate": 4.9802794932922577e-05, + "loss": 5.4293, + "step": 6728 + }, + { + "epoch": 0.04001926919783043, + "grad_norm": 1.9066367149353027, + "learning_rate": 4.980273637488696e-05, + "loss": 5.4299, + "step": 6729 + }, + { + "epoch": 0.04002521648111143, + "grad_norm": 1.845290184020996, + "learning_rate": 4.9802677808192963e-05, + "loss": 5.596, + "step": 6730 + }, + { + "epoch": 0.040031163764392426, + "grad_norm": 2.3295016288757324, + "learning_rate": 4.980261923284062e-05, + "loss": 6.1266, + "step": 6731 + }, + { + "epoch": 0.04003711104767342, + "grad_norm": 2.451676368713379, + "learning_rate": 4.980256064882993e-05, + "loss": 6.0578, + "step": 6732 + }, + { + "epoch": 0.04004305833095442, + "grad_norm": 2.1317830085754395, + "learning_rate": 4.9802502056160915e-05, + "loss": 6.2627, + "step": 6733 + }, + { + "epoch": 0.04004900561423542, + "grad_norm": 2.223085641860962, + "learning_rate": 4.980244345483361e-05, + "loss": 5.5751, + "step": 6734 + }, + { + "epoch": 0.04005495289751641, + "grad_norm": 2.508385181427002, + "learning_rate": 4.9802384844848035e-05, + "loss": 5.572, + "step": 6735 + }, + { + "epoch": 0.04006090018079741, + "grad_norm": 2.5150837898254395, + "learning_rate": 4.98023262262042e-05, + "loss": 5.3443, + "step": 6736 + }, + { + "epoch": 0.04006684746407841, + "grad_norm": 2.293503761291504, + "learning_rate": 4.980226759890212e-05, + "loss": 5.37, + "step": 6737 + }, + { + "epoch": 0.040072794747359405, + "grad_norm": 1.8764920234680176, + "learning_rate": 4.9802208962941834e-05, + "loss": 5.3804, + "step": 6738 + }, + { + "epoch": 0.0400787420306404, + "grad_norm": 1.8443305492401123, + "learning_rate": 4.980215031832335e-05, + "loss": 5.7787, + "step": 6739 + }, + { + "epoch": 0.0400846893139214, + "grad_norm": 2.6707816123962402, + "learning_rate": 4.980209166504669e-05, + "loss": 6.2858, + "step": 6740 + }, + { + "epoch": 0.0400906365972024, + "grad_norm": 2.3520665168762207, + "learning_rate": 4.980203300311188e-05, + "loss": 5.8069, + "step": 6741 + }, + { + "epoch": 0.04009658388048339, + "grad_norm": 2.0564348697662354, + "learning_rate": 4.980197433251893e-05, + "loss": 6.1698, + "step": 6742 + }, + { + "epoch": 0.040102531163764395, + "grad_norm": 2.205469846725464, + "learning_rate": 4.9801915653267875e-05, + "loss": 5.8401, + "step": 6743 + }, + { + "epoch": 0.04010847844704539, + "grad_norm": 2.042363405227661, + "learning_rate": 4.980185696535873e-05, + "loss": 5.9673, + "step": 6744 + }, + { + "epoch": 0.040114425730326385, + "grad_norm": 1.7575644254684448, + "learning_rate": 4.98017982687915e-05, + "loss": 5.7852, + "step": 6745 + }, + { + "epoch": 0.04012037301360739, + "grad_norm": 1.968548059463501, + "learning_rate": 4.980173956356623e-05, + "loss": 6.2085, + "step": 6746 + }, + { + "epoch": 0.04012632029688838, + "grad_norm": 2.0365097522735596, + "learning_rate": 4.980168084968292e-05, + "loss": 6.4235, + "step": 6747 + }, + { + "epoch": 0.04013226758016938, + "grad_norm": 2.7265079021453857, + "learning_rate": 4.9801622127141605e-05, + "loss": 6.0804, + "step": 6748 + }, + { + "epoch": 0.04013821486345038, + "grad_norm": 2.1604299545288086, + "learning_rate": 4.98015633959423e-05, + "loss": 5.942, + "step": 6749 + }, + { + "epoch": 0.040144162146731374, + "grad_norm": 2.4122307300567627, + "learning_rate": 4.980150465608502e-05, + "loss": 6.2877, + "step": 6750 + }, + { + "epoch": 0.04015010943001237, + "grad_norm": 2.040780782699585, + "learning_rate": 4.98014459075698e-05, + "loss": 5.645, + "step": 6751 + }, + { + "epoch": 0.040156056713293364, + "grad_norm": 2.3660147190093994, + "learning_rate": 4.980138715039665e-05, + "loss": 5.975, + "step": 6752 + }, + { + "epoch": 0.040162003996574366, + "grad_norm": 2.2332143783569336, + "learning_rate": 4.980132838456558e-05, + "loss": 6.1383, + "step": 6753 + }, + { + "epoch": 0.04016795127985536, + "grad_norm": 2.7028262615203857, + "learning_rate": 4.9801269610076635e-05, + "loss": 6.3817, + "step": 6754 + }, + { + "epoch": 0.04017389856313636, + "grad_norm": 2.4653360843658447, + "learning_rate": 4.980121082692982e-05, + "loss": 6.3079, + "step": 6755 + }, + { + "epoch": 0.04017984584641736, + "grad_norm": 2.1470963954925537, + "learning_rate": 4.980115203512515e-05, + "loss": 6.063, + "step": 6756 + }, + { + "epoch": 0.040185793129698354, + "grad_norm": 2.3440990447998047, + "learning_rate": 4.9801093234662666e-05, + "loss": 5.818, + "step": 6757 + }, + { + "epoch": 0.04019174041297935, + "grad_norm": 2.120245933532715, + "learning_rate": 4.980103442554237e-05, + "loss": 5.5867, + "step": 6758 + }, + { + "epoch": 0.04019768769626035, + "grad_norm": 3.196829080581665, + "learning_rate": 4.980097560776429e-05, + "loss": 6.0369, + "step": 6759 + }, + { + "epoch": 0.040203634979541346, + "grad_norm": 2.247997522354126, + "learning_rate": 4.9800916781328456e-05, + "loss": 5.8383, + "step": 6760 + }, + { + "epoch": 0.04020958226282234, + "grad_norm": 2.26254940032959, + "learning_rate": 4.9800857946234866e-05, + "loss": 5.8477, + "step": 6761 + }, + { + "epoch": 0.04021552954610334, + "grad_norm": 2.200495958328247, + "learning_rate": 4.9800799102483556e-05, + "loss": 5.681, + "step": 6762 + }, + { + "epoch": 0.04022147682938434, + "grad_norm": 2.136009454727173, + "learning_rate": 4.980074025007454e-05, + "loss": 5.6453, + "step": 6763 + }, + { + "epoch": 0.04022742411266533, + "grad_norm": 2.3510351181030273, + "learning_rate": 4.980068138900785e-05, + "loss": 5.5735, + "step": 6764 + }, + { + "epoch": 0.040233371395946335, + "grad_norm": 2.249199628829956, + "learning_rate": 4.980062251928349e-05, + "loss": 5.9883, + "step": 6765 + }, + { + "epoch": 0.04023931867922733, + "grad_norm": 2.426816463470459, + "learning_rate": 4.9800563640901494e-05, + "loss": 6.1658, + "step": 6766 + }, + { + "epoch": 0.040245265962508325, + "grad_norm": 2.1044836044311523, + "learning_rate": 4.9800504753861874e-05, + "loss": 5.8627, + "step": 6767 + }, + { + "epoch": 0.04025121324578932, + "grad_norm": 1.9563783407211304, + "learning_rate": 4.9800445858164656e-05, + "loss": 5.9642, + "step": 6768 + }, + { + "epoch": 0.04025716052907032, + "grad_norm": 2.3810997009277344, + "learning_rate": 4.980038695380986e-05, + "loss": 5.2938, + "step": 6769 + }, + { + "epoch": 0.04026310781235132, + "grad_norm": 2.3180932998657227, + "learning_rate": 4.98003280407975e-05, + "loss": 5.7682, + "step": 6770 + }, + { + "epoch": 0.04026905509563231, + "grad_norm": 2.420954704284668, + "learning_rate": 4.980026911912761e-05, + "loss": 5.5724, + "step": 6771 + }, + { + "epoch": 0.040275002378913315, + "grad_norm": 2.447460651397705, + "learning_rate": 4.9800210188800193e-05, + "loss": 5.4844, + "step": 6772 + }, + { + "epoch": 0.04028094966219431, + "grad_norm": 2.4059863090515137, + "learning_rate": 4.980015124981529e-05, + "loss": 5.604, + "step": 6773 + }, + { + "epoch": 0.040286896945475305, + "grad_norm": 2.251492977142334, + "learning_rate": 4.9800092302172894e-05, + "loss": 5.4565, + "step": 6774 + }, + { + "epoch": 0.04029284422875631, + "grad_norm": 2.478682279586792, + "learning_rate": 4.980003334587305e-05, + "loss": 5.9416, + "step": 6775 + }, + { + "epoch": 0.0402987915120373, + "grad_norm": 2.2685835361480713, + "learning_rate": 4.9799974380915785e-05, + "loss": 5.9659, + "step": 6776 + }, + { + "epoch": 0.0403047387953183, + "grad_norm": 2.833101987838745, + "learning_rate": 4.979991540730108e-05, + "loss": 5.3406, + "step": 6777 + }, + { + "epoch": 0.0403106860785993, + "grad_norm": 3.0967416763305664, + "learning_rate": 4.9799856425029e-05, + "loss": 5.5848, + "step": 6778 + }, + { + "epoch": 0.040316633361880294, + "grad_norm": 2.3081796169281006, + "learning_rate": 4.9799797434099536e-05, + "loss": 5.5964, + "step": 6779 + }, + { + "epoch": 0.04032258064516129, + "grad_norm": 2.359531879425049, + "learning_rate": 4.9799738434512724e-05, + "loss": 5.6614, + "step": 6780 + }, + { + "epoch": 0.040328527928442284, + "grad_norm": 2.1566221714019775, + "learning_rate": 4.979967942626858e-05, + "loss": 6.0517, + "step": 6781 + }, + { + "epoch": 0.040334475211723286, + "grad_norm": 2.3964991569519043, + "learning_rate": 4.979962040936712e-05, + "loss": 5.9516, + "step": 6782 + }, + { + "epoch": 0.04034042249500428, + "grad_norm": 1.9913266897201538, + "learning_rate": 4.9799561383808365e-05, + "loss": 5.9144, + "step": 6783 + }, + { + "epoch": 0.040346369778285276, + "grad_norm": 1.7329169511795044, + "learning_rate": 4.979950234959235e-05, + "loss": 6.0393, + "step": 6784 + }, + { + "epoch": 0.04035231706156628, + "grad_norm": 1.8278034925460815, + "learning_rate": 4.979944330671908e-05, + "loss": 5.9318, + "step": 6785 + }, + { + "epoch": 0.040358264344847274, + "grad_norm": 2.089806318283081, + "learning_rate": 4.979938425518858e-05, + "loss": 5.5726, + "step": 6786 + }, + { + "epoch": 0.04036421162812827, + "grad_norm": 2.03664231300354, + "learning_rate": 4.9799325195000874e-05, + "loss": 5.8265, + "step": 6787 + }, + { + "epoch": 0.04037015891140927, + "grad_norm": 1.8801567554473877, + "learning_rate": 4.979926612615597e-05, + "loss": 5.7575, + "step": 6788 + }, + { + "epoch": 0.040376106194690266, + "grad_norm": 1.814959168434143, + "learning_rate": 4.979920704865391e-05, + "loss": 5.8737, + "step": 6789 + }, + { + "epoch": 0.04038205347797126, + "grad_norm": 1.7018035650253296, + "learning_rate": 4.97991479624947e-05, + "loss": 5.6768, + "step": 6790 + }, + { + "epoch": 0.04038800076125226, + "grad_norm": 2.21545147895813, + "learning_rate": 4.979908886767837e-05, + "loss": 5.4206, + "step": 6791 + }, + { + "epoch": 0.04039394804453326, + "grad_norm": 2.6184499263763428, + "learning_rate": 4.979902976420492e-05, + "loss": 5.0255, + "step": 6792 + }, + { + "epoch": 0.04039989532781425, + "grad_norm": 2.3914453983306885, + "learning_rate": 4.9798970652074396e-05, + "loss": 4.884, + "step": 6793 + }, + { + "epoch": 0.040405842611095255, + "grad_norm": 2.4367334842681885, + "learning_rate": 4.97989115312868e-05, + "loss": 4.7445, + "step": 6794 + }, + { + "epoch": 0.04041178989437625, + "grad_norm": 2.794490337371826, + "learning_rate": 4.9798852401842165e-05, + "loss": 4.9686, + "step": 6795 + }, + { + "epoch": 0.040417737177657245, + "grad_norm": 2.665395736694336, + "learning_rate": 4.979879326374051e-05, + "loss": 4.854, + "step": 6796 + }, + { + "epoch": 0.04042368446093824, + "grad_norm": 2.0832581520080566, + "learning_rate": 4.979873411698184e-05, + "loss": 5.0371, + "step": 6797 + }, + { + "epoch": 0.04042963174421924, + "grad_norm": 2.4604554176330566, + "learning_rate": 4.979867496156619e-05, + "loss": 4.7524, + "step": 6798 + }, + { + "epoch": 0.04043557902750024, + "grad_norm": 2.3760480880737305, + "learning_rate": 4.979861579749359e-05, + "loss": 4.7645, + "step": 6799 + }, + { + "epoch": 0.04044152631078123, + "grad_norm": 2.468043088912964, + "learning_rate": 4.979855662476405e-05, + "loss": 4.7791, + "step": 6800 + }, + { + "epoch": 0.040447473594062235, + "grad_norm": 2.516026258468628, + "learning_rate": 4.979849744337758e-05, + "loss": 4.7978, + "step": 6801 + }, + { + "epoch": 0.04045342087734323, + "grad_norm": 2.1882307529449463, + "learning_rate": 4.979843825333421e-05, + "loss": 5.002, + "step": 6802 + }, + { + "epoch": 0.040459368160624225, + "grad_norm": 2.423140525817871, + "learning_rate": 4.979837905463397e-05, + "loss": 5.0161, + "step": 6803 + }, + { + "epoch": 0.04046531544390523, + "grad_norm": 2.485739231109619, + "learning_rate": 4.979831984727687e-05, + "loss": 4.7613, + "step": 6804 + }, + { + "epoch": 0.04047126272718622, + "grad_norm": 2.267744302749634, + "learning_rate": 4.979826063126293e-05, + "loss": 4.7496, + "step": 6805 + }, + { + "epoch": 0.04047721001046722, + "grad_norm": 2.3172249794006348, + "learning_rate": 4.9798201406592176e-05, + "loss": 4.8153, + "step": 6806 + }, + { + "epoch": 0.04048315729374822, + "grad_norm": 2.309471607208252, + "learning_rate": 4.979814217326463e-05, + "loss": 4.9874, + "step": 6807 + }, + { + "epoch": 0.040489104577029214, + "grad_norm": 1.989372968673706, + "learning_rate": 4.97980829312803e-05, + "loss": 5.1254, + "step": 6808 + }, + { + "epoch": 0.04049505186031021, + "grad_norm": 2.4409830570220947, + "learning_rate": 4.9798023680639216e-05, + "loss": 4.6476, + "step": 6809 + }, + { + "epoch": 0.040500999143591204, + "grad_norm": 2.5192453861236572, + "learning_rate": 4.97979644213414e-05, + "loss": 4.6933, + "step": 6810 + }, + { + "epoch": 0.040506946426872206, + "grad_norm": 2.294718027114868, + "learning_rate": 4.979790515338688e-05, + "loss": 4.8266, + "step": 6811 + }, + { + "epoch": 0.0405128937101532, + "grad_norm": 2.294550657272339, + "learning_rate": 4.979784587677565e-05, + "loss": 4.6691, + "step": 6812 + }, + { + "epoch": 0.040518840993434196, + "grad_norm": 2.332326889038086, + "learning_rate": 4.979778659150776e-05, + "loss": 4.8366, + "step": 6813 + }, + { + "epoch": 0.0405247882767152, + "grad_norm": 2.325439929962158, + "learning_rate": 4.979772729758322e-05, + "loss": 4.8149, + "step": 6814 + }, + { + "epoch": 0.040530735559996194, + "grad_norm": 2.165926456451416, + "learning_rate": 4.979766799500204e-05, + "loss": 4.7309, + "step": 6815 + }, + { + "epoch": 0.04053668284327719, + "grad_norm": 2.3184943199157715, + "learning_rate": 4.9797608683764264e-05, + "loss": 4.7163, + "step": 6816 + }, + { + "epoch": 0.04054263012655819, + "grad_norm": 2.2161147594451904, + "learning_rate": 4.979754936386989e-05, + "loss": 4.5549, + "step": 6817 + }, + { + "epoch": 0.040548577409839186, + "grad_norm": 2.415496587753296, + "learning_rate": 4.979749003531895e-05, + "loss": 4.7676, + "step": 6818 + }, + { + "epoch": 0.04055452469312018, + "grad_norm": 2.1700618267059326, + "learning_rate": 4.979743069811146e-05, + "loss": 4.8448, + "step": 6819 + }, + { + "epoch": 0.04056047197640118, + "grad_norm": 2.4978747367858887, + "learning_rate": 4.9797371352247446e-05, + "loss": 6.363, + "step": 6820 + }, + { + "epoch": 0.04056641925968218, + "grad_norm": 1.9293922185897827, + "learning_rate": 4.979731199772693e-05, + "loss": 5.6502, + "step": 6821 + }, + { + "epoch": 0.04057236654296317, + "grad_norm": 2.5583136081695557, + "learning_rate": 4.9797252634549915e-05, + "loss": 4.874, + "step": 6822 + }, + { + "epoch": 0.040578313826244175, + "grad_norm": 2.263460159301758, + "learning_rate": 4.979719326271645e-05, + "loss": 5.8457, + "step": 6823 + }, + { + "epoch": 0.04058426110952517, + "grad_norm": 2.5630266666412354, + "learning_rate": 4.979713388222653e-05, + "loss": 4.8668, + "step": 6824 + }, + { + "epoch": 0.040590208392806165, + "grad_norm": 2.2965216636657715, + "learning_rate": 4.9797074493080186e-05, + "loss": 5.0049, + "step": 6825 + }, + { + "epoch": 0.04059615567608716, + "grad_norm": 2.222405433654785, + "learning_rate": 4.979701509527745e-05, + "loss": 5.0204, + "step": 6826 + }, + { + "epoch": 0.04060210295936816, + "grad_norm": 2.4425504207611084, + "learning_rate": 4.979695568881833e-05, + "loss": 5.687, + "step": 6827 + }, + { + "epoch": 0.04060805024264916, + "grad_norm": 2.329901933670044, + "learning_rate": 4.979689627370284e-05, + "loss": 5.9447, + "step": 6828 + }, + { + "epoch": 0.04061399752593015, + "grad_norm": 2.3041510581970215, + "learning_rate": 4.9796836849931015e-05, + "loss": 5.9277, + "step": 6829 + }, + { + "epoch": 0.040619944809211155, + "grad_norm": 2.3020026683807373, + "learning_rate": 4.979677741750287e-05, + "loss": 5.9675, + "step": 6830 + }, + { + "epoch": 0.04062589209249215, + "grad_norm": 2.1861371994018555, + "learning_rate": 4.9796717976418426e-05, + "loss": 6.1312, + "step": 6831 + }, + { + "epoch": 0.040631839375773145, + "grad_norm": 1.9544565677642822, + "learning_rate": 4.979665852667771e-05, + "loss": 5.9218, + "step": 6832 + }, + { + "epoch": 0.04063778665905415, + "grad_norm": 2.346431016921997, + "learning_rate": 4.979659906828073e-05, + "loss": 6.1668, + "step": 6833 + }, + { + "epoch": 0.04064373394233514, + "grad_norm": 2.0405263900756836, + "learning_rate": 4.979653960122751e-05, + "loss": 6.0501, + "step": 6834 + }, + { + "epoch": 0.04064968122561614, + "grad_norm": 1.7645004987716675, + "learning_rate": 4.979648012551809e-05, + "loss": 6.0299, + "step": 6835 + }, + { + "epoch": 0.04065562850889714, + "grad_norm": 2.284703016281128, + "learning_rate": 4.979642064115246e-05, + "loss": 5.5501, + "step": 6836 + }, + { + "epoch": 0.040661575792178134, + "grad_norm": 1.7246543169021606, + "learning_rate": 4.979636114813066e-05, + "loss": 5.5733, + "step": 6837 + }, + { + "epoch": 0.04066752307545913, + "grad_norm": 2.0958921909332275, + "learning_rate": 4.9796301646452705e-05, + "loss": 5.8998, + "step": 6838 + }, + { + "epoch": 0.040673470358740124, + "grad_norm": 2.2123169898986816, + "learning_rate": 4.979624213611862e-05, + "loss": 6.0322, + "step": 6839 + }, + { + "epoch": 0.040679417642021126, + "grad_norm": 1.9541656970977783, + "learning_rate": 4.9796182617128426e-05, + "loss": 5.9255, + "step": 6840 + }, + { + "epoch": 0.04068536492530212, + "grad_norm": 2.077601909637451, + "learning_rate": 4.979612308948213e-05, + "loss": 5.6975, + "step": 6841 + }, + { + "epoch": 0.040691312208583116, + "grad_norm": 2.0595803260803223, + "learning_rate": 4.979606355317977e-05, + "loss": 6.0696, + "step": 6842 + }, + { + "epoch": 0.04069725949186412, + "grad_norm": 1.9800641536712646, + "learning_rate": 4.979600400822136e-05, + "loss": 5.7357, + "step": 6843 + }, + { + "epoch": 0.040703206775145113, + "grad_norm": 2.26238751411438, + "learning_rate": 4.979594445460692e-05, + "loss": 5.9119, + "step": 6844 + }, + { + "epoch": 0.04070915405842611, + "grad_norm": 2.0941457748413086, + "learning_rate": 4.979588489233648e-05, + "loss": 5.945, + "step": 6845 + }, + { + "epoch": 0.04071510134170711, + "grad_norm": 2.1995291709899902, + "learning_rate": 4.979582532141005e-05, + "loss": 5.8406, + "step": 6846 + }, + { + "epoch": 0.040721048624988106, + "grad_norm": 2.0138349533081055, + "learning_rate": 4.9795765741827646e-05, + "loss": 5.7984, + "step": 6847 + }, + { + "epoch": 0.0407269959082691, + "grad_norm": 1.9314415454864502, + "learning_rate": 4.9795706153589304e-05, + "loss": 5.8686, + "step": 6848 + }, + { + "epoch": 0.0407329431915501, + "grad_norm": 2.1324212551116943, + "learning_rate": 4.979564655669503e-05, + "loss": 5.8477, + "step": 6849 + }, + { + "epoch": 0.0407388904748311, + "grad_norm": 1.9601761102676392, + "learning_rate": 4.979558695114486e-05, + "loss": 5.9078, + "step": 6850 + }, + { + "epoch": 0.04074483775811209, + "grad_norm": 2.004333734512329, + "learning_rate": 4.97955273369388e-05, + "loss": 5.9852, + "step": 6851 + }, + { + "epoch": 0.040750785041393095, + "grad_norm": 1.9015164375305176, + "learning_rate": 4.979546771407688e-05, + "loss": 5.6286, + "step": 6852 + }, + { + "epoch": 0.04075673232467409, + "grad_norm": 1.9674208164215088, + "learning_rate": 4.979540808255911e-05, + "loss": 5.8715, + "step": 6853 + }, + { + "epoch": 0.040762679607955085, + "grad_norm": 2.0473713874816895, + "learning_rate": 4.9795348442385534e-05, + "loss": 5.7488, + "step": 6854 + }, + { + "epoch": 0.04076862689123608, + "grad_norm": 1.9536950588226318, + "learning_rate": 4.979528879355615e-05, + "loss": 5.6755, + "step": 6855 + }, + { + "epoch": 0.04077457417451708, + "grad_norm": 2.189659595489502, + "learning_rate": 4.979522913607099e-05, + "loss": 5.7934, + "step": 6856 + }, + { + "epoch": 0.04078052145779808, + "grad_norm": 1.999742031097412, + "learning_rate": 4.9795169469930067e-05, + "loss": 5.7341, + "step": 6857 + }, + { + "epoch": 0.04078646874107907, + "grad_norm": 2.1212494373321533, + "learning_rate": 4.9795109795133414e-05, + "loss": 5.8465, + "step": 6858 + }, + { + "epoch": 0.040792416024360074, + "grad_norm": 1.966467261314392, + "learning_rate": 4.979505011168104e-05, + "loss": 5.8699, + "step": 6859 + }, + { + "epoch": 0.04079836330764107, + "grad_norm": 2.290205955505371, + "learning_rate": 4.979499041957297e-05, + "loss": 6.387, + "step": 6860 + }, + { + "epoch": 0.040804310590922065, + "grad_norm": 2.41827130317688, + "learning_rate": 4.979493071880923e-05, + "loss": 6.893, + "step": 6861 + }, + { + "epoch": 0.04081025787420307, + "grad_norm": 2.0652520656585693, + "learning_rate": 4.979487100938983e-05, + "loss": 6.6435, + "step": 6862 + }, + { + "epoch": 0.04081620515748406, + "grad_norm": 1.8594858646392822, + "learning_rate": 4.979481129131479e-05, + "loss": 5.7441, + "step": 6863 + }, + { + "epoch": 0.04082215244076506, + "grad_norm": 2.269240617752075, + "learning_rate": 4.979475156458415e-05, + "loss": 5.8468, + "step": 6864 + }, + { + "epoch": 0.04082809972404606, + "grad_norm": 2.2355518341064453, + "learning_rate": 4.979469182919792e-05, + "loss": 5.8717, + "step": 6865 + }, + { + "epoch": 0.040834047007327054, + "grad_norm": 1.9578050374984741, + "learning_rate": 4.9794632085156105e-05, + "loss": 5.6777, + "step": 6866 + }, + { + "epoch": 0.04083999429060805, + "grad_norm": 2.354609727859497, + "learning_rate": 4.979457233245875e-05, + "loss": 5.7993, + "step": 6867 + }, + { + "epoch": 0.040845941573889044, + "grad_norm": 1.978289008140564, + "learning_rate": 4.9794512571105865e-05, + "loss": 5.7429, + "step": 6868 + }, + { + "epoch": 0.040851888857170046, + "grad_norm": 1.9695252180099487, + "learning_rate": 4.979445280109747e-05, + "loss": 6.1322, + "step": 6869 + }, + { + "epoch": 0.04085783614045104, + "grad_norm": 2.172510862350464, + "learning_rate": 4.9794393022433586e-05, + "loss": 5.9443, + "step": 6870 + }, + { + "epoch": 0.040863783423732036, + "grad_norm": 2.1992416381835938, + "learning_rate": 4.9794333235114244e-05, + "loss": 6.4094, + "step": 6871 + }, + { + "epoch": 0.04086973070701304, + "grad_norm": 2.1804773807525635, + "learning_rate": 4.979427343913945e-05, + "loss": 6.3871, + "step": 6872 + }, + { + "epoch": 0.04087567799029403, + "grad_norm": 2.2877554893493652, + "learning_rate": 4.979421363450923e-05, + "loss": 6.2509, + "step": 6873 + }, + { + "epoch": 0.04088162527357503, + "grad_norm": 2.0697927474975586, + "learning_rate": 4.979415382122361e-05, + "loss": 5.9008, + "step": 6874 + }, + { + "epoch": 0.04088757255685603, + "grad_norm": 2.2907917499542236, + "learning_rate": 4.97940939992826e-05, + "loss": 5.6137, + "step": 6875 + }, + { + "epoch": 0.040893519840137026, + "grad_norm": 1.9960983991622925, + "learning_rate": 4.979403416868623e-05, + "loss": 5.7283, + "step": 6876 + }, + { + "epoch": 0.04089946712341802, + "grad_norm": 2.2767558097839355, + "learning_rate": 4.9793974329434525e-05, + "loss": 5.3632, + "step": 6877 + }, + { + "epoch": 0.04090541440669902, + "grad_norm": 2.295635461807251, + "learning_rate": 4.97939144815275e-05, + "loss": 5.4524, + "step": 6878 + }, + { + "epoch": 0.04091136168998002, + "grad_norm": 2.247194766998291, + "learning_rate": 4.9793854624965166e-05, + "loss": 5.7846, + "step": 6879 + }, + { + "epoch": 0.04091730897326101, + "grad_norm": 2.2641420364379883, + "learning_rate": 4.9793794759747565e-05, + "loss": 5.7479, + "step": 6880 + }, + { + "epoch": 0.040923256256542015, + "grad_norm": 2.002126455307007, + "learning_rate": 4.97937348858747e-05, + "loss": 5.2694, + "step": 6881 + }, + { + "epoch": 0.04092920353982301, + "grad_norm": 2.079157590866089, + "learning_rate": 4.9793675003346596e-05, + "loss": 6.2711, + "step": 6882 + }, + { + "epoch": 0.040935150823104005, + "grad_norm": 1.9030524492263794, + "learning_rate": 4.979361511216328e-05, + "loss": 5.7259, + "step": 6883 + }, + { + "epoch": 0.040941098106385, + "grad_norm": 1.9157373905181885, + "learning_rate": 4.9793555212324774e-05, + "loss": 6.086, + "step": 6884 + }, + { + "epoch": 0.040947045389666, + "grad_norm": 1.8622015714645386, + "learning_rate": 4.979349530383108e-05, + "loss": 6.1318, + "step": 6885 + }, + { + "epoch": 0.040952992672947, + "grad_norm": 2.3341257572174072, + "learning_rate": 4.9793435386682256e-05, + "loss": 5.9421, + "step": 6886 + }, + { + "epoch": 0.04095893995622799, + "grad_norm": 2.6894209384918213, + "learning_rate": 4.979337546087828e-05, + "loss": 5.5351, + "step": 6887 + }, + { + "epoch": 0.040964887239508994, + "grad_norm": 2.5316739082336426, + "learning_rate": 4.979331552641919e-05, + "loss": 5.5056, + "step": 6888 + }, + { + "epoch": 0.04097083452278999, + "grad_norm": 2.5129077434539795, + "learning_rate": 4.979325558330502e-05, + "loss": 5.3091, + "step": 6889 + }, + { + "epoch": 0.040976781806070985, + "grad_norm": 2.275536298751831, + "learning_rate": 4.979319563153578e-05, + "loss": 5.494, + "step": 6890 + }, + { + "epoch": 0.04098272908935199, + "grad_norm": 2.749375104904175, + "learning_rate": 4.9793135671111494e-05, + "loss": 6.0139, + "step": 6891 + }, + { + "epoch": 0.04098867637263298, + "grad_norm": 2.419163227081299, + "learning_rate": 4.9793075702032177e-05, + "loss": 6.1102, + "step": 6892 + }, + { + "epoch": 0.04099462365591398, + "grad_norm": 2.311450958251953, + "learning_rate": 4.9793015724297856e-05, + "loss": 5.9798, + "step": 6893 + }, + { + "epoch": 0.04100057093919498, + "grad_norm": 2.0522212982177734, + "learning_rate": 4.979295573790854e-05, + "loss": 5.9247, + "step": 6894 + }, + { + "epoch": 0.041006518222475974, + "grad_norm": 2.1928513050079346, + "learning_rate": 4.979289574286427e-05, + "loss": 5.8001, + "step": 6895 + }, + { + "epoch": 0.04101246550575697, + "grad_norm": 2.1945207118988037, + "learning_rate": 4.979283573916505e-05, + "loss": 5.9975, + "step": 6896 + }, + { + "epoch": 0.041018412789037964, + "grad_norm": 2.274843454360962, + "learning_rate": 4.979277572681091e-05, + "loss": 5.693, + "step": 6897 + }, + { + "epoch": 0.041024360072318966, + "grad_norm": 2.2715282440185547, + "learning_rate": 4.979271570580186e-05, + "loss": 5.9952, + "step": 6898 + }, + { + "epoch": 0.04103030735559996, + "grad_norm": 2.4459903240203857, + "learning_rate": 4.9792655676137943e-05, + "loss": 6.0305, + "step": 6899 + }, + { + "epoch": 0.041036254638880956, + "grad_norm": 2.8737339973449707, + "learning_rate": 4.9792595637819165e-05, + "loss": 6.0982, + "step": 6900 + }, + { + "epoch": 0.04104220192216196, + "grad_norm": 2.382143974304199, + "learning_rate": 4.979253559084553e-05, + "loss": 5.6122, + "step": 6901 + }, + { + "epoch": 0.04104814920544295, + "grad_norm": 2.4127237796783447, + "learning_rate": 4.97924755352171e-05, + "loss": 5.7723, + "step": 6902 + }, + { + "epoch": 0.04105409648872395, + "grad_norm": 2.3108956813812256, + "learning_rate": 4.979241547093386e-05, + "loss": 6.1655, + "step": 6903 + }, + { + "epoch": 0.04106004377200495, + "grad_norm": 2.250555992126465, + "learning_rate": 4.979235539799584e-05, + "loss": 6.0627, + "step": 6904 + }, + { + "epoch": 0.041065991055285946, + "grad_norm": 2.187957525253296, + "learning_rate": 4.979229531640307e-05, + "loss": 6.1438, + "step": 6905 + }, + { + "epoch": 0.04107193833856694, + "grad_norm": 1.9089539051055908, + "learning_rate": 4.979223522615557e-05, + "loss": 6.1431, + "step": 6906 + }, + { + "epoch": 0.04107788562184794, + "grad_norm": 2.343569040298462, + "learning_rate": 4.979217512725336e-05, + "loss": 5.9774, + "step": 6907 + }, + { + "epoch": 0.04108383290512894, + "grad_norm": 2.759631633758545, + "learning_rate": 4.979211501969645e-05, + "loss": 5.7982, + "step": 6908 + }, + { + "epoch": 0.04108978018840993, + "grad_norm": 2.295811414718628, + "learning_rate": 4.979205490348487e-05, + "loss": 6.0843, + "step": 6909 + }, + { + "epoch": 0.041095727471690935, + "grad_norm": 2.6259605884552, + "learning_rate": 4.979199477861864e-05, + "loss": 5.6498, + "step": 6910 + }, + { + "epoch": 0.04110167475497193, + "grad_norm": 2.396895408630371, + "learning_rate": 4.9791934645097785e-05, + "loss": 5.9936, + "step": 6911 + }, + { + "epoch": 0.041107622038252925, + "grad_norm": 2.020845651626587, + "learning_rate": 4.979187450292231e-05, + "loss": 5.4867, + "step": 6912 + }, + { + "epoch": 0.04111356932153392, + "grad_norm": 2.6473753452301025, + "learning_rate": 4.979181435209226e-05, + "loss": 5.3556, + "step": 6913 + }, + { + "epoch": 0.04111951660481492, + "grad_norm": 2.353158712387085, + "learning_rate": 4.9791754192607636e-05, + "loss": 6.3122, + "step": 6914 + }, + { + "epoch": 0.04112546388809592, + "grad_norm": 2.499817132949829, + "learning_rate": 4.9791694024468474e-05, + "loss": 5.816, + "step": 6915 + }, + { + "epoch": 0.04113141117137691, + "grad_norm": 2.009239673614502, + "learning_rate": 4.979163384767478e-05, + "loss": 5.5982, + "step": 6916 + }, + { + "epoch": 0.041137358454657914, + "grad_norm": 2.3885819911956787, + "learning_rate": 4.9791573662226586e-05, + "loss": 5.7403, + "step": 6917 + }, + { + "epoch": 0.04114330573793891, + "grad_norm": 2.3135135173797607, + "learning_rate": 4.979151346812391e-05, + "loss": 5.3151, + "step": 6918 + }, + { + "epoch": 0.041149253021219905, + "grad_norm": 1.9801241159439087, + "learning_rate": 4.979145326536677e-05, + "loss": 5.5148, + "step": 6919 + }, + { + "epoch": 0.04115520030450091, + "grad_norm": 2.0724904537200928, + "learning_rate": 4.979139305395519e-05, + "loss": 5.5355, + "step": 6920 + }, + { + "epoch": 0.0411611475877819, + "grad_norm": 1.8104170560836792, + "learning_rate": 4.97913328338892e-05, + "loss": 5.4861, + "step": 6921 + }, + { + "epoch": 0.0411670948710629, + "grad_norm": 1.81072998046875, + "learning_rate": 4.9791272605168804e-05, + "loss": 5.5075, + "step": 6922 + }, + { + "epoch": 0.0411730421543439, + "grad_norm": 1.709191083908081, + "learning_rate": 4.979121236779403e-05, + "loss": 6.1353, + "step": 6923 + }, + { + "epoch": 0.041178989437624894, + "grad_norm": 2.004974126815796, + "learning_rate": 4.9791152121764903e-05, + "loss": 5.478, + "step": 6924 + }, + { + "epoch": 0.04118493672090589, + "grad_norm": 1.937933325767517, + "learning_rate": 4.979109186708144e-05, + "loss": 5.4022, + "step": 6925 + }, + { + "epoch": 0.041190884004186884, + "grad_norm": 1.9453305006027222, + "learning_rate": 4.979103160374367e-05, + "loss": 5.243, + "step": 6926 + }, + { + "epoch": 0.041196831287467886, + "grad_norm": 1.8552072048187256, + "learning_rate": 4.979097133175159e-05, + "loss": 5.3104, + "step": 6927 + }, + { + "epoch": 0.04120277857074888, + "grad_norm": 1.9148203134536743, + "learning_rate": 4.9790911051105246e-05, + "loss": 5.5538, + "step": 6928 + }, + { + "epoch": 0.041208725854029876, + "grad_norm": 1.9658032655715942, + "learning_rate": 4.979085076180466e-05, + "loss": 5.5285, + "step": 6929 + }, + { + "epoch": 0.04121467313731088, + "grad_norm": 1.7332781553268433, + "learning_rate": 4.9790790463849835e-05, + "loss": 5.1959, + "step": 6930 + }, + { + "epoch": 0.04122062042059187, + "grad_norm": 1.5762557983398438, + "learning_rate": 4.9790730157240804e-05, + "loss": 5.3672, + "step": 6931 + }, + { + "epoch": 0.04122656770387287, + "grad_norm": 1.7899656295776367, + "learning_rate": 4.979066984197759e-05, + "loss": 5.3588, + "step": 6932 + }, + { + "epoch": 0.04123251498715387, + "grad_norm": 1.5992622375488281, + "learning_rate": 4.97906095180602e-05, + "loss": 5.275, + "step": 6933 + }, + { + "epoch": 0.041238462270434866, + "grad_norm": 1.875116229057312, + "learning_rate": 4.9790549185488666e-05, + "loss": 5.3428, + "step": 6934 + }, + { + "epoch": 0.04124440955371586, + "grad_norm": 1.8110510110855103, + "learning_rate": 4.979048884426301e-05, + "loss": 5.2416, + "step": 6935 + }, + { + "epoch": 0.04125035683699686, + "grad_norm": 1.5512267351150513, + "learning_rate": 4.979042849438325e-05, + "loss": 5.3643, + "step": 6936 + }, + { + "epoch": 0.04125630412027786, + "grad_norm": 1.8929630517959595, + "learning_rate": 4.979036813584941e-05, + "loss": 5.4232, + "step": 6937 + }, + { + "epoch": 0.04126225140355885, + "grad_norm": 1.8569291830062866, + "learning_rate": 4.9790307768661504e-05, + "loss": 5.2949, + "step": 6938 + }, + { + "epoch": 0.041268198686839855, + "grad_norm": 1.6058611869812012, + "learning_rate": 4.9790247392819564e-05, + "loss": 5.3736, + "step": 6939 + }, + { + "epoch": 0.04127414597012085, + "grad_norm": 1.8455227613449097, + "learning_rate": 4.97901870083236e-05, + "loss": 5.2768, + "step": 6940 + }, + { + "epoch": 0.041280093253401845, + "grad_norm": 1.9346935749053955, + "learning_rate": 4.979012661517364e-05, + "loss": 5.4316, + "step": 6941 + }, + { + "epoch": 0.04128604053668284, + "grad_norm": 1.8085594177246094, + "learning_rate": 4.97900662133697e-05, + "loss": 5.365, + "step": 6942 + }, + { + "epoch": 0.04129198781996384, + "grad_norm": 1.73456871509552, + "learning_rate": 4.9790005802911804e-05, + "loss": 5.2726, + "step": 6943 + }, + { + "epoch": 0.04129793510324484, + "grad_norm": 2.1071617603302, + "learning_rate": 4.978994538379997e-05, + "loss": 6.2313, + "step": 6944 + }, + { + "epoch": 0.04130388238652583, + "grad_norm": 1.7098963260650635, + "learning_rate": 4.978988495603423e-05, + "loss": 5.3162, + "step": 6945 + }, + { + "epoch": 0.041309829669806834, + "grad_norm": 1.8131905794143677, + "learning_rate": 4.978982451961459e-05, + "loss": 5.2486, + "step": 6946 + }, + { + "epoch": 0.04131577695308783, + "grad_norm": 1.8162381649017334, + "learning_rate": 4.978976407454109e-05, + "loss": 5.2806, + "step": 6947 + }, + { + "epoch": 0.041321724236368824, + "grad_norm": 1.9250297546386719, + "learning_rate": 4.9789703620813734e-05, + "loss": 5.1742, + "step": 6948 + }, + { + "epoch": 0.041327671519649826, + "grad_norm": 1.8263678550720215, + "learning_rate": 4.978964315843254e-05, + "loss": 5.1786, + "step": 6949 + }, + { + "epoch": 0.04133361880293082, + "grad_norm": 1.6751807928085327, + "learning_rate": 4.9789582687397546e-05, + "loss": 5.4798, + "step": 6950 + }, + { + "epoch": 0.04133956608621182, + "grad_norm": 1.7842947244644165, + "learning_rate": 4.9789522207708764e-05, + "loss": 5.201, + "step": 6951 + }, + { + "epoch": 0.04134551336949282, + "grad_norm": 1.6785067319869995, + "learning_rate": 4.978946171936621e-05, + "loss": 5.3852, + "step": 6952 + }, + { + "epoch": 0.041351460652773814, + "grad_norm": 1.5475291013717651, + "learning_rate": 4.978940122236992e-05, + "loss": 5.4083, + "step": 6953 + }, + { + "epoch": 0.04135740793605481, + "grad_norm": 1.7445106506347656, + "learning_rate": 4.97893407167199e-05, + "loss": 5.3125, + "step": 6954 + }, + { + "epoch": 0.041363355219335804, + "grad_norm": 1.7334082126617432, + "learning_rate": 4.9789280202416175e-05, + "loss": 5.5388, + "step": 6955 + }, + { + "epoch": 0.041369302502616806, + "grad_norm": 1.7267119884490967, + "learning_rate": 4.9789219679458774e-05, + "loss": 5.5175, + "step": 6956 + }, + { + "epoch": 0.0413752497858978, + "grad_norm": 1.8033246994018555, + "learning_rate": 4.978915914784771e-05, + "loss": 5.3523, + "step": 6957 + }, + { + "epoch": 0.041381197069178796, + "grad_norm": 1.9836528301239014, + "learning_rate": 4.978909860758301e-05, + "loss": 5.3808, + "step": 6958 + }, + { + "epoch": 0.0413871443524598, + "grad_norm": 1.6260416507720947, + "learning_rate": 4.978903805866469e-05, + "loss": 5.4642, + "step": 6959 + }, + { + "epoch": 0.04139309163574079, + "grad_norm": 1.7260626554489136, + "learning_rate": 4.978897750109277e-05, + "loss": 5.4975, + "step": 6960 + }, + { + "epoch": 0.04139903891902179, + "grad_norm": 1.6948668956756592, + "learning_rate": 4.978891693486728e-05, + "loss": 5.5768, + "step": 6961 + }, + { + "epoch": 0.04140498620230279, + "grad_norm": 1.7885476350784302, + "learning_rate": 4.978885635998824e-05, + "loss": 5.4156, + "step": 6962 + }, + { + "epoch": 0.041410933485583785, + "grad_norm": 1.8626813888549805, + "learning_rate": 4.978879577645565e-05, + "loss": 5.354, + "step": 6963 + }, + { + "epoch": 0.04141688076886478, + "grad_norm": 1.867090106010437, + "learning_rate": 4.9788735184269553e-05, + "loss": 5.2934, + "step": 6964 + }, + { + "epoch": 0.04142282805214578, + "grad_norm": 1.7208340167999268, + "learning_rate": 4.9788674583429974e-05, + "loss": 5.2116, + "step": 6965 + }, + { + "epoch": 0.04142877533542678, + "grad_norm": 1.934480905532837, + "learning_rate": 4.9788613973936916e-05, + "loss": 5.5801, + "step": 6966 + }, + { + "epoch": 0.04143472261870777, + "grad_norm": 1.6263724565505981, + "learning_rate": 4.978855335579041e-05, + "loss": 5.3835, + "step": 6967 + }, + { + "epoch": 0.041440669901988775, + "grad_norm": 1.743996262550354, + "learning_rate": 4.9788492728990474e-05, + "loss": 5.3281, + "step": 6968 + }, + { + "epoch": 0.04144661718526977, + "grad_norm": 1.5556843280792236, + "learning_rate": 4.978843209353714e-05, + "loss": 5.442, + "step": 6969 + }, + { + "epoch": 0.041452564468550765, + "grad_norm": 1.5540435314178467, + "learning_rate": 4.978837144943041e-05, + "loss": 5.3621, + "step": 6970 + }, + { + "epoch": 0.04145851175183176, + "grad_norm": 1.7884414196014404, + "learning_rate": 4.9788310796670326e-05, + "loss": 5.571, + "step": 6971 + }, + { + "epoch": 0.04146445903511276, + "grad_norm": 1.7550957202911377, + "learning_rate": 4.9788250135256886e-05, + "loss": 5.61, + "step": 6972 + }, + { + "epoch": 0.04147040631839376, + "grad_norm": 1.9336804151535034, + "learning_rate": 4.978818946519013e-05, + "loss": 5.6142, + "step": 6973 + }, + { + "epoch": 0.04147635360167475, + "grad_norm": 1.8888505697250366, + "learning_rate": 4.978812878647008e-05, + "loss": 5.4908, + "step": 6974 + }, + { + "epoch": 0.041482300884955754, + "grad_norm": 1.940371036529541, + "learning_rate": 4.978806809909674e-05, + "loss": 5.5407, + "step": 6975 + }, + { + "epoch": 0.04148824816823675, + "grad_norm": 2.0182151794433594, + "learning_rate": 4.9788007403070146e-05, + "loss": 5.3643, + "step": 6976 + }, + { + "epoch": 0.041494195451517744, + "grad_norm": 1.7960541248321533, + "learning_rate": 4.978794669839032e-05, + "loss": 5.4994, + "step": 6977 + }, + { + "epoch": 0.041500142734798746, + "grad_norm": 1.8403207063674927, + "learning_rate": 4.978788598505727e-05, + "loss": 5.4501, + "step": 6978 + }, + { + "epoch": 0.04150609001807974, + "grad_norm": 1.7232698202133179, + "learning_rate": 4.978782526307103e-05, + "loss": 5.5406, + "step": 6979 + }, + { + "epoch": 0.04151203730136074, + "grad_norm": 1.7003169059753418, + "learning_rate": 4.9787764532431615e-05, + "loss": 5.3427, + "step": 6980 + }, + { + "epoch": 0.04151798458464174, + "grad_norm": 2.041384696960449, + "learning_rate": 4.978770379313904e-05, + "loss": 5.5121, + "step": 6981 + }, + { + "epoch": 0.041523931867922734, + "grad_norm": 1.5773900747299194, + "learning_rate": 4.978764304519334e-05, + "loss": 5.4604, + "step": 6982 + }, + { + "epoch": 0.04152987915120373, + "grad_norm": 1.8834172487258911, + "learning_rate": 4.9787582288594535e-05, + "loss": 5.5141, + "step": 6983 + }, + { + "epoch": 0.04153582643448473, + "grad_norm": 1.7956576347351074, + "learning_rate": 4.978752152334264e-05, + "loss": 5.5664, + "step": 6984 + }, + { + "epoch": 0.041541773717765726, + "grad_norm": 1.8676495552062988, + "learning_rate": 4.978746074943767e-05, + "loss": 5.2846, + "step": 6985 + }, + { + "epoch": 0.04154772100104672, + "grad_norm": 1.7709665298461914, + "learning_rate": 4.9787399966879654e-05, + "loss": 5.3375, + "step": 6986 + }, + { + "epoch": 0.041553668284327716, + "grad_norm": 2.012941837310791, + "learning_rate": 4.978733917566862e-05, + "loss": 5.6973, + "step": 6987 + }, + { + "epoch": 0.04155961556760872, + "grad_norm": 1.8220570087432861, + "learning_rate": 4.978727837580458e-05, + "loss": 5.191, + "step": 6988 + }, + { + "epoch": 0.04156556285088971, + "grad_norm": 1.6511586904525757, + "learning_rate": 4.978721756728755e-05, + "loss": 5.2787, + "step": 6989 + }, + { + "epoch": 0.04157151013417071, + "grad_norm": 1.9026141166687012, + "learning_rate": 4.978715675011757e-05, + "loss": 5.4456, + "step": 6990 + }, + { + "epoch": 0.04157745741745171, + "grad_norm": 1.8649898767471313, + "learning_rate": 4.9787095924294633e-05, + "loss": 5.5013, + "step": 6991 + }, + { + "epoch": 0.041583404700732705, + "grad_norm": 1.8720741271972656, + "learning_rate": 4.978703508981879e-05, + "loss": 5.3952, + "step": 6992 + }, + { + "epoch": 0.0415893519840137, + "grad_norm": 1.817356824874878, + "learning_rate": 4.978697424669005e-05, + "loss": 5.4719, + "step": 6993 + }, + { + "epoch": 0.0415952992672947, + "grad_norm": 1.740702509880066, + "learning_rate": 4.978691339490843e-05, + "loss": 5.6484, + "step": 6994 + }, + { + "epoch": 0.0416012465505757, + "grad_norm": 1.8752427101135254, + "learning_rate": 4.978685253447395e-05, + "loss": 5.6394, + "step": 6995 + }, + { + "epoch": 0.04160719383385669, + "grad_norm": 1.8180509805679321, + "learning_rate": 4.978679166538665e-05, + "loss": 5.3401, + "step": 6996 + }, + { + "epoch": 0.041613141117137695, + "grad_norm": 1.9002251625061035, + "learning_rate": 4.9786730787646516e-05, + "loss": 5.3237, + "step": 6997 + }, + { + "epoch": 0.04161908840041869, + "grad_norm": 1.741176724433899, + "learning_rate": 4.978666990125361e-05, + "loss": 5.2311, + "step": 6998 + }, + { + "epoch": 0.041625035683699685, + "grad_norm": 2.0994246006011963, + "learning_rate": 4.9786609006207925e-05, + "loss": 5.3549, + "step": 6999 + }, + { + "epoch": 0.04163098296698068, + "grad_norm": 1.8438987731933594, + "learning_rate": 4.978654810250949e-05, + "loss": 5.4322, + "step": 7000 + }, + { + "epoch": 0.04163693025026168, + "grad_norm": 1.7411181926727295, + "learning_rate": 4.978648719015833e-05, + "loss": 5.455, + "step": 7001 + }, + { + "epoch": 0.04164287753354268, + "grad_norm": 1.6879174709320068, + "learning_rate": 4.978642626915446e-05, + "loss": 5.3676, + "step": 7002 + }, + { + "epoch": 0.04164882481682367, + "grad_norm": 1.8912461996078491, + "learning_rate": 4.9786365339497906e-05, + "loss": 5.6181, + "step": 7003 + }, + { + "epoch": 0.041654772100104674, + "grad_norm": 1.9234617948532104, + "learning_rate": 4.978630440118869e-05, + "loss": 5.5388, + "step": 7004 + }, + { + "epoch": 0.04166071938338567, + "grad_norm": 2.1059048175811768, + "learning_rate": 4.9786243454226824e-05, + "loss": 5.6856, + "step": 7005 + }, + { + "epoch": 0.041666666666666664, + "grad_norm": 2.1900687217712402, + "learning_rate": 4.9786182498612347e-05, + "loss": 6.2426, + "step": 7006 + }, + { + "epoch": 0.041672613949947666, + "grad_norm": 1.7580265998840332, + "learning_rate": 4.9786121534345265e-05, + "loss": 5.2342, + "step": 7007 + }, + { + "epoch": 0.04167856123322866, + "grad_norm": 1.4747200012207031, + "learning_rate": 4.97860605614256e-05, + "loss": 5.1977, + "step": 7008 + }, + { + "epoch": 0.04168450851650966, + "grad_norm": 1.8164165019989014, + "learning_rate": 4.978599957985338e-05, + "loss": 5.1362, + "step": 7009 + }, + { + "epoch": 0.04169045579979066, + "grad_norm": 1.468550443649292, + "learning_rate": 4.978593858962863e-05, + "loss": 5.1265, + "step": 7010 + }, + { + "epoch": 0.041696403083071654, + "grad_norm": 1.584343433380127, + "learning_rate": 4.9785877590751356e-05, + "loss": 5.2611, + "step": 7011 + }, + { + "epoch": 0.04170235036635265, + "grad_norm": 1.7864785194396973, + "learning_rate": 4.978581658322159e-05, + "loss": 5.5214, + "step": 7012 + }, + { + "epoch": 0.04170829764963365, + "grad_norm": 1.8359016180038452, + "learning_rate": 4.978575556703936e-05, + "loss": 5.3808, + "step": 7013 + }, + { + "epoch": 0.041714244932914646, + "grad_norm": 1.8298325538635254, + "learning_rate": 4.978569454220467e-05, + "loss": 5.5606, + "step": 7014 + }, + { + "epoch": 0.04172019221619564, + "grad_norm": 2.1555540561676025, + "learning_rate": 4.978563350871755e-05, + "loss": 5.6592, + "step": 7015 + }, + { + "epoch": 0.041726139499476636, + "grad_norm": 2.5251846313476562, + "learning_rate": 4.9785572466578026e-05, + "loss": 5.5771, + "step": 7016 + }, + { + "epoch": 0.04173208678275764, + "grad_norm": 1.7765661478042603, + "learning_rate": 4.9785511415786115e-05, + "loss": 5.5558, + "step": 7017 + }, + { + "epoch": 0.04173803406603863, + "grad_norm": 1.9711554050445557, + "learning_rate": 4.978545035634183e-05, + "loss": 5.5565, + "step": 7018 + }, + { + "epoch": 0.04174398134931963, + "grad_norm": 1.8080202341079712, + "learning_rate": 4.978538928824521e-05, + "loss": 5.5037, + "step": 7019 + }, + { + "epoch": 0.04174992863260063, + "grad_norm": 1.7506872415542603, + "learning_rate": 4.978532821149626e-05, + "loss": 5.3362, + "step": 7020 + }, + { + "epoch": 0.041755875915881625, + "grad_norm": 1.5606149435043335, + "learning_rate": 4.978526712609501e-05, + "loss": 5.3541, + "step": 7021 + }, + { + "epoch": 0.04176182319916262, + "grad_norm": 1.8840737342834473, + "learning_rate": 4.9785206032041476e-05, + "loss": 5.2315, + "step": 7022 + }, + { + "epoch": 0.04176777048244362, + "grad_norm": 2.118178606033325, + "learning_rate": 4.978514492933569e-05, + "loss": 5.6174, + "step": 7023 + }, + { + "epoch": 0.04177371776572462, + "grad_norm": 2.043907403945923, + "learning_rate": 4.978508381797766e-05, + "loss": 5.6272, + "step": 7024 + }, + { + "epoch": 0.04177966504900561, + "grad_norm": 1.764411211013794, + "learning_rate": 4.978502269796742e-05, + "loss": 5.6153, + "step": 7025 + }, + { + "epoch": 0.041785612332286615, + "grad_norm": 1.5760626792907715, + "learning_rate": 4.978496156930498e-05, + "loss": 5.5734, + "step": 7026 + }, + { + "epoch": 0.04179155961556761, + "grad_norm": 1.8857802152633667, + "learning_rate": 4.9784900431990366e-05, + "loss": 5.5295, + "step": 7027 + }, + { + "epoch": 0.041797506898848605, + "grad_norm": 1.7287275791168213, + "learning_rate": 4.97848392860236e-05, + "loss": 5.3175, + "step": 7028 + }, + { + "epoch": 0.0418034541821296, + "grad_norm": 1.915263295173645, + "learning_rate": 4.97847781314047e-05, + "loss": 5.4838, + "step": 7029 + }, + { + "epoch": 0.0418094014654106, + "grad_norm": 2.049435615539551, + "learning_rate": 4.97847169681337e-05, + "loss": 5.5508, + "step": 7030 + }, + { + "epoch": 0.0418153487486916, + "grad_norm": 1.8955415487289429, + "learning_rate": 4.97846557962106e-05, + "loss": 5.4618, + "step": 7031 + }, + { + "epoch": 0.04182129603197259, + "grad_norm": 1.8957183361053467, + "learning_rate": 4.978459461563543e-05, + "loss": 5.5293, + "step": 7032 + }, + { + "epoch": 0.041827243315253594, + "grad_norm": 2.050734043121338, + "learning_rate": 4.978453342640822e-05, + "loss": 5.8002, + "step": 7033 + }, + { + "epoch": 0.04183319059853459, + "grad_norm": 1.9867476224899292, + "learning_rate": 4.978447222852899e-05, + "loss": 5.466, + "step": 7034 + }, + { + "epoch": 0.041839137881815584, + "grad_norm": 1.7928507328033447, + "learning_rate": 4.978441102199775e-05, + "loss": 5.3312, + "step": 7035 + }, + { + "epoch": 0.041845085165096586, + "grad_norm": 1.7984018325805664, + "learning_rate": 4.978434980681453e-05, + "loss": 5.2936, + "step": 7036 + }, + { + "epoch": 0.04185103244837758, + "grad_norm": 1.8011672496795654, + "learning_rate": 4.9784288582979355e-05, + "loss": 5.484, + "step": 7037 + }, + { + "epoch": 0.041856979731658576, + "grad_norm": 1.9439928531646729, + "learning_rate": 4.9784227350492236e-05, + "loss": 5.4563, + "step": 7038 + }, + { + "epoch": 0.04186292701493958, + "grad_norm": 1.71321439743042, + "learning_rate": 4.97841661093532e-05, + "loss": 5.3909, + "step": 7039 + }, + { + "epoch": 0.041868874298220574, + "grad_norm": 1.629333734512329, + "learning_rate": 4.9784104859562266e-05, + "loss": 5.3112, + "step": 7040 + }, + { + "epoch": 0.04187482158150157, + "grad_norm": 1.5248417854309082, + "learning_rate": 4.9784043601119456e-05, + "loss": 5.3724, + "step": 7041 + }, + { + "epoch": 0.04188076886478257, + "grad_norm": 1.8886220455169678, + "learning_rate": 4.97839823340248e-05, + "loss": 5.443, + "step": 7042 + }, + { + "epoch": 0.041886716148063566, + "grad_norm": 1.5902595520019531, + "learning_rate": 4.9783921058278307e-05, + "loss": 5.4249, + "step": 7043 + }, + { + "epoch": 0.04189266343134456, + "grad_norm": 1.837579369544983, + "learning_rate": 4.978385977388e-05, + "loss": 5.3767, + "step": 7044 + }, + { + "epoch": 0.041898610714625556, + "grad_norm": 1.8306061029434204, + "learning_rate": 4.9783798480829905e-05, + "loss": 5.4206, + "step": 7045 + }, + { + "epoch": 0.04190455799790656, + "grad_norm": 1.6887965202331543, + "learning_rate": 4.9783737179128044e-05, + "loss": 5.5327, + "step": 7046 + }, + { + "epoch": 0.04191050528118755, + "grad_norm": 1.8081728219985962, + "learning_rate": 4.978367586877444e-05, + "loss": 5.4547, + "step": 7047 + }, + { + "epoch": 0.04191645256446855, + "grad_norm": 1.8341114521026611, + "learning_rate": 4.97836145497691e-05, + "loss": 5.4175, + "step": 7048 + }, + { + "epoch": 0.04192239984774955, + "grad_norm": 1.965240240097046, + "learning_rate": 4.978355322211207e-05, + "loss": 5.4253, + "step": 7049 + }, + { + "epoch": 0.041928347131030545, + "grad_norm": 1.7060484886169434, + "learning_rate": 4.9783491885803343e-05, + "loss": 5.3493, + "step": 7050 + }, + { + "epoch": 0.04193429441431154, + "grad_norm": 1.8203076124191284, + "learning_rate": 4.978343054084297e-05, + "loss": 5.4601, + "step": 7051 + }, + { + "epoch": 0.04194024169759254, + "grad_norm": 1.919954538345337, + "learning_rate": 4.9783369187230945e-05, + "loss": 5.4921, + "step": 7052 + }, + { + "epoch": 0.04194618898087354, + "grad_norm": 1.4519730806350708, + "learning_rate": 4.9783307824967306e-05, + "loss": 5.4922, + "step": 7053 + }, + { + "epoch": 0.04195213626415453, + "grad_norm": 1.8431898355484009, + "learning_rate": 4.9783246454052066e-05, + "loss": 5.384, + "step": 7054 + }, + { + "epoch": 0.041958083547435535, + "grad_norm": 1.5493370294570923, + "learning_rate": 4.978318507448526e-05, + "loss": 5.5294, + "step": 7055 + }, + { + "epoch": 0.04196403083071653, + "grad_norm": 1.6405844688415527, + "learning_rate": 4.97831236862669e-05, + "loss": 5.492, + "step": 7056 + }, + { + "epoch": 0.041969978113997525, + "grad_norm": 1.7830392122268677, + "learning_rate": 4.9783062289396996e-05, + "loss": 5.2977, + "step": 7057 + }, + { + "epoch": 0.04197592539727852, + "grad_norm": 1.8268102407455444, + "learning_rate": 4.9783000883875595e-05, + "loss": 5.3396, + "step": 7058 + }, + { + "epoch": 0.04198187268055952, + "grad_norm": 1.942901849746704, + "learning_rate": 4.9782939469702694e-05, + "loss": 5.3338, + "step": 7059 + }, + { + "epoch": 0.04198781996384052, + "grad_norm": 1.5793414115905762, + "learning_rate": 4.9782878046878334e-05, + "loss": 5.3286, + "step": 7060 + }, + { + "epoch": 0.04199376724712151, + "grad_norm": 1.5777463912963867, + "learning_rate": 4.9782816615402515e-05, + "loss": 5.2942, + "step": 7061 + }, + { + "epoch": 0.041999714530402514, + "grad_norm": 1.6393412351608276, + "learning_rate": 4.978275517527528e-05, + "loss": 5.2557, + "step": 7062 + }, + { + "epoch": 0.04200566181368351, + "grad_norm": 1.9657515287399292, + "learning_rate": 4.978269372649664e-05, + "loss": 5.3875, + "step": 7063 + }, + { + "epoch": 0.042011609096964504, + "grad_norm": 2.1419737339019775, + "learning_rate": 4.9782632269066623e-05, + "loss": 5.2014, + "step": 7064 + }, + { + "epoch": 0.042017556380245506, + "grad_norm": 2.0425620079040527, + "learning_rate": 4.978257080298523e-05, + "loss": 5.194, + "step": 7065 + }, + { + "epoch": 0.0420235036635265, + "grad_norm": 1.7248409986495972, + "learning_rate": 4.978250932825251e-05, + "loss": 5.1922, + "step": 7066 + }, + { + "epoch": 0.042029450946807496, + "grad_norm": 1.8265177011489868, + "learning_rate": 4.978244784486847e-05, + "loss": 5.4474, + "step": 7067 + }, + { + "epoch": 0.0420353982300885, + "grad_norm": 1.803701400756836, + "learning_rate": 4.9782386352833134e-05, + "loss": 6.2155, + "step": 7068 + }, + { + "epoch": 0.042041345513369494, + "grad_norm": 1.9970064163208008, + "learning_rate": 4.978232485214652e-05, + "loss": 5.3622, + "step": 7069 + }, + { + "epoch": 0.04204729279665049, + "grad_norm": 1.7449073791503906, + "learning_rate": 4.978226334280865e-05, + "loss": 5.3146, + "step": 7070 + }, + { + "epoch": 0.04205324007993149, + "grad_norm": 2.0284547805786133, + "learning_rate": 4.978220182481955e-05, + "loss": 5.0169, + "step": 7071 + }, + { + "epoch": 0.042059187363212486, + "grad_norm": 1.6801714897155762, + "learning_rate": 4.978214029817924e-05, + "loss": 5.1294, + "step": 7072 + }, + { + "epoch": 0.04206513464649348, + "grad_norm": 2.160585641860962, + "learning_rate": 4.978207876288774e-05, + "loss": 5.072, + "step": 7073 + }, + { + "epoch": 0.042071081929774476, + "grad_norm": 2.07739520072937, + "learning_rate": 4.978201721894508e-05, + "loss": 5.2065, + "step": 7074 + }, + { + "epoch": 0.04207702921305548, + "grad_norm": 2.1396286487579346, + "learning_rate": 4.978195566635127e-05, + "loss": 5.1066, + "step": 7075 + }, + { + "epoch": 0.04208297649633647, + "grad_norm": 1.883280634880066, + "learning_rate": 4.978189410510633e-05, + "loss": 5.2842, + "step": 7076 + }, + { + "epoch": 0.04208892377961747, + "grad_norm": 1.9917101860046387, + "learning_rate": 4.978183253521029e-05, + "loss": 5.0799, + "step": 7077 + }, + { + "epoch": 0.04209487106289847, + "grad_norm": 1.9387022256851196, + "learning_rate": 4.9781770956663164e-05, + "loss": 5.1898, + "step": 7078 + }, + { + "epoch": 0.042100818346179465, + "grad_norm": 1.9767060279846191, + "learning_rate": 4.978170936946498e-05, + "loss": 5.0692, + "step": 7079 + }, + { + "epoch": 0.04210676562946046, + "grad_norm": 2.0076138973236084, + "learning_rate": 4.978164777361576e-05, + "loss": 5.0255, + "step": 7080 + }, + { + "epoch": 0.04211271291274146, + "grad_norm": 1.8253445625305176, + "learning_rate": 4.978158616911552e-05, + "loss": 5.0111, + "step": 7081 + }, + { + "epoch": 0.04211866019602246, + "grad_norm": 1.6551930904388428, + "learning_rate": 4.978152455596429e-05, + "loss": 4.9849, + "step": 7082 + }, + { + "epoch": 0.04212460747930345, + "grad_norm": 1.8462406396865845, + "learning_rate": 4.9781462934162084e-05, + "loss": 5.0862, + "step": 7083 + }, + { + "epoch": 0.042130554762584455, + "grad_norm": 2.0828206539154053, + "learning_rate": 4.978140130370892e-05, + "loss": 5.031, + "step": 7084 + }, + { + "epoch": 0.04213650204586545, + "grad_norm": 1.7917357683181763, + "learning_rate": 4.978133966460483e-05, + "loss": 5.0028, + "step": 7085 + }, + { + "epoch": 0.042142449329146445, + "grad_norm": 1.7324126958847046, + "learning_rate": 4.9781278016849834e-05, + "loss": 4.9759, + "step": 7086 + }, + { + "epoch": 0.04214839661242744, + "grad_norm": 1.8673282861709595, + "learning_rate": 4.978121636044394e-05, + "loss": 5.3631, + "step": 7087 + }, + { + "epoch": 0.04215434389570844, + "grad_norm": 1.7723935842514038, + "learning_rate": 4.9781154695387186e-05, + "loss": 5.3427, + "step": 7088 + }, + { + "epoch": 0.04216029117898944, + "grad_norm": 1.4671146869659424, + "learning_rate": 4.978109302167958e-05, + "loss": 5.3003, + "step": 7089 + }, + { + "epoch": 0.04216623846227043, + "grad_norm": 1.9667481184005737, + "learning_rate": 4.9781031339321156e-05, + "loss": 5.0957, + "step": 7090 + }, + { + "epoch": 0.042172185745551434, + "grad_norm": 1.8162986040115356, + "learning_rate": 4.978096964831193e-05, + "loss": 5.1472, + "step": 7091 + }, + { + "epoch": 0.04217813302883243, + "grad_norm": 1.7793545722961426, + "learning_rate": 4.9780907948651926e-05, + "loss": 5.1771, + "step": 7092 + }, + { + "epoch": 0.042184080312113424, + "grad_norm": 1.8093308210372925, + "learning_rate": 4.9780846240341156e-05, + "loss": 5.1611, + "step": 7093 + }, + { + "epoch": 0.042190027595394426, + "grad_norm": 1.7010010480880737, + "learning_rate": 4.978078452337965e-05, + "loss": 5.4478, + "step": 7094 + }, + { + "epoch": 0.04219597487867542, + "grad_norm": 1.7978744506835938, + "learning_rate": 4.9780722797767434e-05, + "loss": 5.4443, + "step": 7095 + }, + { + "epoch": 0.042201922161956416, + "grad_norm": 1.4861794710159302, + "learning_rate": 4.9780661063504516e-05, + "loss": 5.3773, + "step": 7096 + }, + { + "epoch": 0.04220786944523742, + "grad_norm": 1.7805769443511963, + "learning_rate": 4.978059932059093e-05, + "loss": 5.0896, + "step": 7097 + }, + { + "epoch": 0.042213816728518413, + "grad_norm": 1.7392783164978027, + "learning_rate": 4.9780537569026695e-05, + "loss": 5.0602, + "step": 7098 + }, + { + "epoch": 0.04221976401179941, + "grad_norm": 1.8742554187774658, + "learning_rate": 4.978047580881182e-05, + "loss": 5.2595, + "step": 7099 + }, + { + "epoch": 0.04222571129508041, + "grad_norm": 1.6077641248703003, + "learning_rate": 4.978041403994635e-05, + "loss": 5.0925, + "step": 7100 + }, + { + "epoch": 0.042231658578361406, + "grad_norm": 1.7536481618881226, + "learning_rate": 4.9780352262430286e-05, + "loss": 5.2546, + "step": 7101 + }, + { + "epoch": 0.0422376058616424, + "grad_norm": 1.6404869556427002, + "learning_rate": 4.9780290476263656e-05, + "loss": 5.1349, + "step": 7102 + }, + { + "epoch": 0.042243553144923396, + "grad_norm": 1.7223635911941528, + "learning_rate": 4.978022868144649e-05, + "loss": 5.2894, + "step": 7103 + }, + { + "epoch": 0.0422495004282044, + "grad_norm": 1.7856663465499878, + "learning_rate": 4.9780166877978796e-05, + "loss": 5.384, + "step": 7104 + }, + { + "epoch": 0.04225544771148539, + "grad_norm": 1.6434816122055054, + "learning_rate": 4.978010506586061e-05, + "loss": 5.257, + "step": 7105 + }, + { + "epoch": 0.04226139499476639, + "grad_norm": 1.668371558189392, + "learning_rate": 4.9780043245091936e-05, + "loss": 5.2698, + "step": 7106 + }, + { + "epoch": 0.04226734227804739, + "grad_norm": 1.7553619146347046, + "learning_rate": 4.97799814156728e-05, + "loss": 5.1591, + "step": 7107 + }, + { + "epoch": 0.042273289561328385, + "grad_norm": 1.6918652057647705, + "learning_rate": 4.977991957760324e-05, + "loss": 5.2727, + "step": 7108 + }, + { + "epoch": 0.04227923684460938, + "grad_norm": 1.6634269952774048, + "learning_rate": 4.977985773088326e-05, + "loss": 5.3099, + "step": 7109 + }, + { + "epoch": 0.04228518412789038, + "grad_norm": 2.131647825241089, + "learning_rate": 4.977979587551289e-05, + "loss": 5.0885, + "step": 7110 + }, + { + "epoch": 0.04229113141117138, + "grad_norm": 1.6632722616195679, + "learning_rate": 4.977973401149215e-05, + "loss": 5.1546, + "step": 7111 + }, + { + "epoch": 0.04229707869445237, + "grad_norm": 1.762418270111084, + "learning_rate": 4.977967213882107e-05, + "loss": 5.0884, + "step": 7112 + }, + { + "epoch": 0.042303025977733374, + "grad_norm": 1.9325755834579468, + "learning_rate": 4.977961025749964e-05, + "loss": 5.1857, + "step": 7113 + }, + { + "epoch": 0.04230897326101437, + "grad_norm": 1.8359284400939941, + "learning_rate": 4.9779548367527926e-05, + "loss": 5.165, + "step": 7114 + }, + { + "epoch": 0.042314920544295365, + "grad_norm": 1.8305978775024414, + "learning_rate": 4.977948646890591e-05, + "loss": 5.1347, + "step": 7115 + }, + { + "epoch": 0.04232086782757636, + "grad_norm": 1.7374697923660278, + "learning_rate": 4.9779424561633644e-05, + "loss": 5.5219, + "step": 7116 + }, + { + "epoch": 0.04232681511085736, + "grad_norm": 1.9947689771652222, + "learning_rate": 4.9779362645711135e-05, + "loss": 5.4445, + "step": 7117 + }, + { + "epoch": 0.04233276239413836, + "grad_norm": 1.6639795303344727, + "learning_rate": 4.97793007211384e-05, + "loss": 5.3798, + "step": 7118 + }, + { + "epoch": 0.04233870967741935, + "grad_norm": 1.6983096599578857, + "learning_rate": 4.977923878791547e-05, + "loss": 5.2847, + "step": 7119 + }, + { + "epoch": 0.042344656960700354, + "grad_norm": 1.7397092580795288, + "learning_rate": 4.9779176846042366e-05, + "loss": 5.3175, + "step": 7120 + }, + { + "epoch": 0.04235060424398135, + "grad_norm": 1.5255639553070068, + "learning_rate": 4.977911489551911e-05, + "loss": 5.2735, + "step": 7121 + }, + { + "epoch": 0.042356551527262344, + "grad_norm": 1.5646785497665405, + "learning_rate": 4.9779052936345715e-05, + "loss": 5.3892, + "step": 7122 + }, + { + "epoch": 0.042362498810543346, + "grad_norm": 1.7479640245437622, + "learning_rate": 4.977899096852221e-05, + "loss": 5.4341, + "step": 7123 + }, + { + "epoch": 0.04236844609382434, + "grad_norm": 1.6275604963302612, + "learning_rate": 4.9778928992048615e-05, + "loss": 5.5209, + "step": 7124 + }, + { + "epoch": 0.042374393377105336, + "grad_norm": 1.6917749643325806, + "learning_rate": 4.977886700692496e-05, + "loss": 5.5779, + "step": 7125 + }, + { + "epoch": 0.04238034066038634, + "grad_norm": 1.683716058731079, + "learning_rate": 4.977880501315125e-05, + "loss": 5.475, + "step": 7126 + }, + { + "epoch": 0.04238628794366733, + "grad_norm": 1.7665706872940063, + "learning_rate": 4.977874301072751e-05, + "loss": 5.3666, + "step": 7127 + }, + { + "epoch": 0.04239223522694833, + "grad_norm": 1.715329885482788, + "learning_rate": 4.977868099965377e-05, + "loss": 5.407, + "step": 7128 + }, + { + "epoch": 0.04239818251022933, + "grad_norm": 1.8468618392944336, + "learning_rate": 4.977861897993006e-05, + "loss": 5.328, + "step": 7129 + }, + { + "epoch": 0.042404129793510326, + "grad_norm": 1.59178626537323, + "learning_rate": 4.977855695155638e-05, + "loss": 5.7797, + "step": 7130 + }, + { + "epoch": 0.04241007707679132, + "grad_norm": 1.4733757972717285, + "learning_rate": 4.977849491453277e-05, + "loss": 5.3019, + "step": 7131 + }, + { + "epoch": 0.042416024360072316, + "grad_norm": 1.4632091522216797, + "learning_rate": 4.977843286885923e-05, + "loss": 5.1754, + "step": 7132 + }, + { + "epoch": 0.04242197164335332, + "grad_norm": 1.530564308166504, + "learning_rate": 4.97783708145358e-05, + "loss": 5.3613, + "step": 7133 + }, + { + "epoch": 0.04242791892663431, + "grad_norm": 1.954219102859497, + "learning_rate": 4.97783087515625e-05, + "loss": 5.4013, + "step": 7134 + }, + { + "epoch": 0.04243386620991531, + "grad_norm": 1.8276890516281128, + "learning_rate": 4.977824667993935e-05, + "loss": 5.3611, + "step": 7135 + }, + { + "epoch": 0.04243981349319631, + "grad_norm": 2.1430561542510986, + "learning_rate": 4.977818459966637e-05, + "loss": 5.1501, + "step": 7136 + }, + { + "epoch": 0.042445760776477305, + "grad_norm": 1.9150115251541138, + "learning_rate": 4.977812251074357e-05, + "loss": 5.1778, + "step": 7137 + }, + { + "epoch": 0.0424517080597583, + "grad_norm": 1.6958523988723755, + "learning_rate": 4.9778060413171004e-05, + "loss": 5.5029, + "step": 7138 + }, + { + "epoch": 0.0424576553430393, + "grad_norm": 1.7183772325515747, + "learning_rate": 4.977799830694866e-05, + "loss": 5.4323, + "step": 7139 + }, + { + "epoch": 0.0424636026263203, + "grad_norm": 1.717731237411499, + "learning_rate": 4.977793619207657e-05, + "loss": 5.3418, + "step": 7140 + }, + { + "epoch": 0.04246954990960129, + "grad_norm": 1.8155564069747925, + "learning_rate": 4.9777874068554766e-05, + "loss": 5.2865, + "step": 7141 + }, + { + "epoch": 0.042475497192882294, + "grad_norm": 1.9890762567520142, + "learning_rate": 4.9777811936383254e-05, + "loss": 5.4101, + "step": 7142 + }, + { + "epoch": 0.04248144447616329, + "grad_norm": 1.8181748390197754, + "learning_rate": 4.977774979556207e-05, + "loss": 5.2719, + "step": 7143 + }, + { + "epoch": 0.042487391759444285, + "grad_norm": 1.7353019714355469, + "learning_rate": 4.9777687646091234e-05, + "loss": 5.4202, + "step": 7144 + }, + { + "epoch": 0.04249333904272528, + "grad_norm": 1.6121984720230103, + "learning_rate": 4.977762548797076e-05, + "loss": 5.3174, + "step": 7145 + }, + { + "epoch": 0.04249928632600628, + "grad_norm": 1.9579551219940186, + "learning_rate": 4.977756332120067e-05, + "loss": 5.135, + "step": 7146 + }, + { + "epoch": 0.04250523360928728, + "grad_norm": 1.9396319389343262, + "learning_rate": 4.977750114578099e-05, + "loss": 5.7521, + "step": 7147 + }, + { + "epoch": 0.04251118089256827, + "grad_norm": 1.8567198514938354, + "learning_rate": 4.977743896171173e-05, + "loss": 5.7521, + "step": 7148 + }, + { + "epoch": 0.042517128175849274, + "grad_norm": 2.139861583709717, + "learning_rate": 4.977737676899293e-05, + "loss": 5.472, + "step": 7149 + }, + { + "epoch": 0.04252307545913027, + "grad_norm": 1.6526445150375366, + "learning_rate": 4.977731456762461e-05, + "loss": 5.5557, + "step": 7150 + }, + { + "epoch": 0.042529022742411264, + "grad_norm": 1.7761725187301636, + "learning_rate": 4.9777252357606784e-05, + "loss": 5.1922, + "step": 7151 + }, + { + "epoch": 0.042534970025692266, + "grad_norm": 2.0894482135772705, + "learning_rate": 4.977719013893947e-05, + "loss": 5.5067, + "step": 7152 + }, + { + "epoch": 0.04254091730897326, + "grad_norm": 1.746470332145691, + "learning_rate": 4.97771279116227e-05, + "loss": 5.28, + "step": 7153 + }, + { + "epoch": 0.042546864592254256, + "grad_norm": 1.9258379936218262, + "learning_rate": 4.9777065675656484e-05, + "loss": 5.7223, + "step": 7154 + }, + { + "epoch": 0.04255281187553526, + "grad_norm": 1.9928748607635498, + "learning_rate": 4.977700343104086e-05, + "loss": 5.727, + "step": 7155 + }, + { + "epoch": 0.04255875915881625, + "grad_norm": 1.7435163259506226, + "learning_rate": 4.9776941177775824e-05, + "loss": 5.6636, + "step": 7156 + }, + { + "epoch": 0.04256470644209725, + "grad_norm": 1.6818004846572876, + "learning_rate": 4.977687891586143e-05, + "loss": 5.6589, + "step": 7157 + }, + { + "epoch": 0.04257065372537825, + "grad_norm": 1.812779426574707, + "learning_rate": 4.9776816645297676e-05, + "loss": 5.2705, + "step": 7158 + }, + { + "epoch": 0.042576601008659246, + "grad_norm": 1.7637232542037964, + "learning_rate": 4.977675436608459e-05, + "loss": 5.2872, + "step": 7159 + }, + { + "epoch": 0.04258254829194024, + "grad_norm": 1.9504014253616333, + "learning_rate": 4.97766920782222e-05, + "loss": 5.1324, + "step": 7160 + }, + { + "epoch": 0.042588495575221236, + "grad_norm": 1.7741994857788086, + "learning_rate": 4.9776629781710525e-05, + "loss": 5.4164, + "step": 7161 + }, + { + "epoch": 0.04259444285850224, + "grad_norm": 2.0005195140838623, + "learning_rate": 4.9776567476549576e-05, + "loss": 5.4667, + "step": 7162 + }, + { + "epoch": 0.04260039014178323, + "grad_norm": 2.256420612335205, + "learning_rate": 4.977650516273939e-05, + "loss": 5.1116, + "step": 7163 + }, + { + "epoch": 0.04260633742506423, + "grad_norm": 2.0806920528411865, + "learning_rate": 4.977644284027998e-05, + "loss": 5.2333, + "step": 7164 + }, + { + "epoch": 0.04261228470834523, + "grad_norm": 1.898760199546814, + "learning_rate": 4.9776380509171364e-05, + "loss": 5.4761, + "step": 7165 + }, + { + "epoch": 0.042618231991626225, + "grad_norm": 1.7251659631729126, + "learning_rate": 4.977631816941358e-05, + "loss": 5.5584, + "step": 7166 + }, + { + "epoch": 0.04262417927490722, + "grad_norm": 1.741645336151123, + "learning_rate": 4.977625582100664e-05, + "loss": 5.4133, + "step": 7167 + }, + { + "epoch": 0.04263012655818822, + "grad_norm": 1.921617031097412, + "learning_rate": 4.977619346395055e-05, + "loss": 5.1829, + "step": 7168 + }, + { + "epoch": 0.04263607384146922, + "grad_norm": 1.7597262859344482, + "learning_rate": 4.977613109824536e-05, + "loss": 5.1743, + "step": 7169 + }, + { + "epoch": 0.04264202112475021, + "grad_norm": 1.8069764375686646, + "learning_rate": 4.977606872389107e-05, + "loss": 5.4004, + "step": 7170 + }, + { + "epoch": 0.042647968408031214, + "grad_norm": 1.7694367170333862, + "learning_rate": 4.9776006340887714e-05, + "loss": 5.2018, + "step": 7171 + }, + { + "epoch": 0.04265391569131221, + "grad_norm": 1.8260759115219116, + "learning_rate": 4.9775943949235316e-05, + "loss": 5.4115, + "step": 7172 + }, + { + "epoch": 0.042659862974593205, + "grad_norm": 1.71034574508667, + "learning_rate": 4.9775881548933884e-05, + "loss": 5.2781, + "step": 7173 + }, + { + "epoch": 0.0426658102578742, + "grad_norm": 1.7208900451660156, + "learning_rate": 4.977581913998345e-05, + "loss": 5.4686, + "step": 7174 + }, + { + "epoch": 0.0426717575411552, + "grad_norm": 1.8545277118682861, + "learning_rate": 4.977575672238404e-05, + "loss": 5.4545, + "step": 7175 + }, + { + "epoch": 0.0426777048244362, + "grad_norm": 1.7892229557037354, + "learning_rate": 4.9775694296135656e-05, + "loss": 5.6612, + "step": 7176 + }, + { + "epoch": 0.04268365210771719, + "grad_norm": 1.8321889638900757, + "learning_rate": 4.9775631861238343e-05, + "loss": 5.5889, + "step": 7177 + }, + { + "epoch": 0.042689599390998194, + "grad_norm": 1.7925626039505005, + "learning_rate": 4.977556941769211e-05, + "loss": 5.6218, + "step": 7178 + }, + { + "epoch": 0.04269554667427919, + "grad_norm": 1.9650121927261353, + "learning_rate": 4.9775506965496984e-05, + "loss": 5.5228, + "step": 7179 + }, + { + "epoch": 0.042701493957560184, + "grad_norm": 1.9050647020339966, + "learning_rate": 4.977544450465298e-05, + "loss": 5.5547, + "step": 7180 + }, + { + "epoch": 0.042707441240841186, + "grad_norm": 1.8334670066833496, + "learning_rate": 4.977538203516013e-05, + "loss": 5.3895, + "step": 7181 + }, + { + "epoch": 0.04271338852412218, + "grad_norm": 1.803544521331787, + "learning_rate": 4.9775319557018444e-05, + "loss": 5.6288, + "step": 7182 + }, + { + "epoch": 0.042719335807403176, + "grad_norm": 1.823440432548523, + "learning_rate": 4.9775257070227956e-05, + "loss": 5.4996, + "step": 7183 + }, + { + "epoch": 0.04272528309068418, + "grad_norm": 1.9730159044265747, + "learning_rate": 4.977519457478868e-05, + "loss": 5.5004, + "step": 7184 + }, + { + "epoch": 0.04273123037396517, + "grad_norm": 1.9566004276275635, + "learning_rate": 4.977513207070064e-05, + "loss": 5.5496, + "step": 7185 + }, + { + "epoch": 0.04273717765724617, + "grad_norm": 2.0958995819091797, + "learning_rate": 4.977506955796385e-05, + "loss": 5.5256, + "step": 7186 + }, + { + "epoch": 0.04274312494052717, + "grad_norm": 1.8957890272140503, + "learning_rate": 4.977500703657835e-05, + "loss": 5.3337, + "step": 7187 + }, + { + "epoch": 0.042749072223808166, + "grad_norm": 1.8224141597747803, + "learning_rate": 4.977494450654414e-05, + "loss": 5.1362, + "step": 7188 + }, + { + "epoch": 0.04275501950708916, + "grad_norm": 1.648296594619751, + "learning_rate": 4.977488196786126e-05, + "loss": 5.3398, + "step": 7189 + }, + { + "epoch": 0.042760966790370156, + "grad_norm": 1.6238311529159546, + "learning_rate": 4.977481942052972e-05, + "loss": 5.2083, + "step": 7190 + }, + { + "epoch": 0.04276691407365116, + "grad_norm": 1.7399996519088745, + "learning_rate": 4.977475686454956e-05, + "loss": 5.2403, + "step": 7191 + }, + { + "epoch": 0.04277286135693215, + "grad_norm": 1.7260342836380005, + "learning_rate": 4.977469429992077e-05, + "loss": 5.2282, + "step": 7192 + }, + { + "epoch": 0.04277880864021315, + "grad_norm": 4.4954447746276855, + "learning_rate": 4.9774631726643396e-05, + "loss": 5.1044, + "step": 7193 + }, + { + "epoch": 0.04278475592349415, + "grad_norm": 1.879869818687439, + "learning_rate": 4.977456914471746e-05, + "loss": 5.3431, + "step": 7194 + }, + { + "epoch": 0.042790703206775145, + "grad_norm": 1.8826582431793213, + "learning_rate": 4.977450655414297e-05, + "loss": 5.2951, + "step": 7195 + }, + { + "epoch": 0.04279665049005614, + "grad_norm": 1.8973712921142578, + "learning_rate": 4.977444395491996e-05, + "loss": 5.343, + "step": 7196 + }, + { + "epoch": 0.04280259777333714, + "grad_norm": 1.6125551462173462, + "learning_rate": 4.977438134704845e-05, + "loss": 5.2849, + "step": 7197 + }, + { + "epoch": 0.04280854505661814, + "grad_norm": 1.441159963607788, + "learning_rate": 4.9774318730528456e-05, + "loss": 5.2955, + "step": 7198 + }, + { + "epoch": 0.04281449233989913, + "grad_norm": 1.9655884504318237, + "learning_rate": 4.9774256105360004e-05, + "loss": 5.2093, + "step": 7199 + }, + { + "epoch": 0.042820439623180134, + "grad_norm": 1.7824043035507202, + "learning_rate": 4.9774193471543116e-05, + "loss": 5.2105, + "step": 7200 + }, + { + "epoch": 0.04282638690646113, + "grad_norm": 1.8331031799316406, + "learning_rate": 4.977413082907781e-05, + "loss": 5.3359, + "step": 7201 + }, + { + "epoch": 0.042832334189742124, + "grad_norm": 1.8695242404937744, + "learning_rate": 4.977406817796412e-05, + "loss": 5.3686, + "step": 7202 + }, + { + "epoch": 0.042838281473023126, + "grad_norm": 1.70205557346344, + "learning_rate": 4.977400551820205e-05, + "loss": 5.2689, + "step": 7203 + }, + { + "epoch": 0.04284422875630412, + "grad_norm": 1.700307846069336, + "learning_rate": 4.9773942849791635e-05, + "loss": 5.3946, + "step": 7204 + }, + { + "epoch": 0.04285017603958512, + "grad_norm": 1.625637173652649, + "learning_rate": 4.977388017273288e-05, + "loss": 5.095, + "step": 7205 + }, + { + "epoch": 0.04285612332286611, + "grad_norm": 1.7689390182495117, + "learning_rate": 4.977381748702583e-05, + "loss": 5.0097, + "step": 7206 + }, + { + "epoch": 0.042862070606147114, + "grad_norm": 1.856493353843689, + "learning_rate": 4.97737547926705e-05, + "loss": 5.0551, + "step": 7207 + }, + { + "epoch": 0.04286801788942811, + "grad_norm": 1.6497242450714111, + "learning_rate": 4.97736920896669e-05, + "loss": 5.031, + "step": 7208 + }, + { + "epoch": 0.042873965172709104, + "grad_norm": 1.5884608030319214, + "learning_rate": 4.977362937801506e-05, + "loss": 5.0758, + "step": 7209 + }, + { + "epoch": 0.042879912455990106, + "grad_norm": 1.5206499099731445, + "learning_rate": 4.9773566657715006e-05, + "loss": 5.049, + "step": 7210 + }, + { + "epoch": 0.0428858597392711, + "grad_norm": 1.7026933431625366, + "learning_rate": 4.977350392876676e-05, + "loss": 5.001, + "step": 7211 + }, + { + "epoch": 0.042891807022552096, + "grad_norm": 1.4197289943695068, + "learning_rate": 4.977344119117034e-05, + "loss": 5.0446, + "step": 7212 + }, + { + "epoch": 0.0428977543058331, + "grad_norm": 1.498713731765747, + "learning_rate": 4.977337844492576e-05, + "loss": 5.0574, + "step": 7213 + }, + { + "epoch": 0.04290370158911409, + "grad_norm": 1.7583528757095337, + "learning_rate": 4.9773315690033054e-05, + "loss": 4.994, + "step": 7214 + }, + { + "epoch": 0.04290964887239509, + "grad_norm": 1.8511004447937012, + "learning_rate": 4.9773252926492236e-05, + "loss": 4.9888, + "step": 7215 + }, + { + "epoch": 0.04291559615567609, + "grad_norm": 1.5799078941345215, + "learning_rate": 4.9773190154303334e-05, + "loss": 5.0028, + "step": 7216 + }, + { + "epoch": 0.042921543438957085, + "grad_norm": 1.6737205982208252, + "learning_rate": 4.977312737346637e-05, + "loss": 5.0701, + "step": 7217 + }, + { + "epoch": 0.04292749072223808, + "grad_norm": 1.537049412727356, + "learning_rate": 4.977306458398136e-05, + "loss": 5.0747, + "step": 7218 + }, + { + "epoch": 0.042933438005519076, + "grad_norm": 1.7501899003982544, + "learning_rate": 4.977300178584833e-05, + "loss": 5.0172, + "step": 7219 + }, + { + "epoch": 0.04293938528880008, + "grad_norm": 1.5130890607833862, + "learning_rate": 4.9772938979067294e-05, + "loss": 5.0196, + "step": 7220 + }, + { + "epoch": 0.04294533257208107, + "grad_norm": 1.628053903579712, + "learning_rate": 4.977287616363829e-05, + "loss": 5.0526, + "step": 7221 + }, + { + "epoch": 0.04295127985536207, + "grad_norm": 1.6736811399459839, + "learning_rate": 4.977281333956133e-05, + "loss": 5.0093, + "step": 7222 + }, + { + "epoch": 0.04295722713864307, + "grad_norm": 1.6157552003860474, + "learning_rate": 4.977275050683643e-05, + "loss": 4.9562, + "step": 7223 + }, + { + "epoch": 0.042963174421924065, + "grad_norm": 1.6699459552764893, + "learning_rate": 4.9772687665463625e-05, + "loss": 4.9603, + "step": 7224 + }, + { + "epoch": 0.04296912170520506, + "grad_norm": 1.4698256254196167, + "learning_rate": 4.9772624815442925e-05, + "loss": 4.9908, + "step": 7225 + }, + { + "epoch": 0.04297506898848606, + "grad_norm": 1.5310906171798706, + "learning_rate": 4.9772561956774365e-05, + "loss": 5.0081, + "step": 7226 + }, + { + "epoch": 0.04298101627176706, + "grad_norm": 1.6135941743850708, + "learning_rate": 4.977249908945795e-05, + "loss": 5.1394, + "step": 7227 + }, + { + "epoch": 0.04298696355504805, + "grad_norm": 1.7632607221603394, + "learning_rate": 4.977243621349372e-05, + "loss": 4.9992, + "step": 7228 + }, + { + "epoch": 0.042992910838329054, + "grad_norm": 1.574826955795288, + "learning_rate": 4.977237332888168e-05, + "loss": 4.9361, + "step": 7229 + }, + { + "epoch": 0.04299885812161005, + "grad_norm": 1.6633859872817993, + "learning_rate": 4.9772310435621874e-05, + "loss": 4.9085, + "step": 7230 + }, + { + "epoch": 0.043004805404891044, + "grad_norm": 1.6180634498596191, + "learning_rate": 4.97722475337143e-05, + "loss": 4.939, + "step": 7231 + }, + { + "epoch": 0.043010752688172046, + "grad_norm": 1.959694266319275, + "learning_rate": 4.9772184623158996e-05, + "loss": 5.231, + "step": 7232 + }, + { + "epoch": 0.04301669997145304, + "grad_norm": 1.6264785528182983, + "learning_rate": 4.977212170395598e-05, + "loss": 5.3228, + "step": 7233 + }, + { + "epoch": 0.04302264725473404, + "grad_norm": 2.109292507171631, + "learning_rate": 4.9772058776105264e-05, + "loss": 5.4579, + "step": 7234 + }, + { + "epoch": 0.04302859453801503, + "grad_norm": 1.991877555847168, + "learning_rate": 4.977199583960688e-05, + "loss": 5.355, + "step": 7235 + }, + { + "epoch": 0.043034541821296034, + "grad_norm": 2.23330020904541, + "learning_rate": 4.977193289446085e-05, + "loss": 5.3233, + "step": 7236 + }, + { + "epoch": 0.04304048910457703, + "grad_norm": 2.077359914779663, + "learning_rate": 4.9771869940667194e-05, + "loss": 5.2003, + "step": 7237 + }, + { + "epoch": 0.043046436387858024, + "grad_norm": 1.652498722076416, + "learning_rate": 4.977180697822593e-05, + "loss": 5.0232, + "step": 7238 + }, + { + "epoch": 0.043052383671139026, + "grad_norm": 1.9277194738388062, + "learning_rate": 4.977174400713709e-05, + "loss": 5.3826, + "step": 7239 + }, + { + "epoch": 0.04305833095442002, + "grad_norm": 1.9263273477554321, + "learning_rate": 4.9771681027400694e-05, + "loss": 5.5258, + "step": 7240 + }, + { + "epoch": 0.043064278237701016, + "grad_norm": 2.066934108734131, + "learning_rate": 4.9771618039016756e-05, + "loss": 5.6398, + "step": 7241 + }, + { + "epoch": 0.04307022552098202, + "grad_norm": 1.7810741662979126, + "learning_rate": 4.9771555041985295e-05, + "loss": 5.3716, + "step": 7242 + }, + { + "epoch": 0.04307617280426301, + "grad_norm": 1.7068313360214233, + "learning_rate": 4.977149203630635e-05, + "loss": 5.4042, + "step": 7243 + }, + { + "epoch": 0.04308212008754401, + "grad_norm": 1.8587994575500488, + "learning_rate": 4.977142902197992e-05, + "loss": 5.3635, + "step": 7244 + }, + { + "epoch": 0.04308806737082501, + "grad_norm": 2.101649284362793, + "learning_rate": 4.9771365999006054e-05, + "loss": 5.5292, + "step": 7245 + }, + { + "epoch": 0.043094014654106005, + "grad_norm": 1.8571972846984863, + "learning_rate": 4.9771302967384756e-05, + "loss": 5.4577, + "step": 7246 + }, + { + "epoch": 0.043099961937387, + "grad_norm": 1.9837383031845093, + "learning_rate": 4.9771239927116045e-05, + "loss": 5.4976, + "step": 7247 + }, + { + "epoch": 0.043105909220667996, + "grad_norm": 1.7688343524932861, + "learning_rate": 4.977117687819996e-05, + "loss": 5.448, + "step": 7248 + }, + { + "epoch": 0.043111856503949, + "grad_norm": 1.923824429512024, + "learning_rate": 4.9771113820636505e-05, + "loss": 5.3436, + "step": 7249 + }, + { + "epoch": 0.04311780378722999, + "grad_norm": 1.4405949115753174, + "learning_rate": 4.9771050754425715e-05, + "loss": 5.2751, + "step": 7250 + }, + { + "epoch": 0.04312375107051099, + "grad_norm": 1.7337450981140137, + "learning_rate": 4.977098767956761e-05, + "loss": 5.4693, + "step": 7251 + }, + { + "epoch": 0.04312969835379199, + "grad_norm": 2.063887119293213, + "learning_rate": 4.977092459606221e-05, + "loss": 5.4576, + "step": 7252 + }, + { + "epoch": 0.043135645637072985, + "grad_norm": 1.576517105102539, + "learning_rate": 4.9770861503909524e-05, + "loss": 5.4052, + "step": 7253 + }, + { + "epoch": 0.04314159292035398, + "grad_norm": 1.8137834072113037, + "learning_rate": 4.9770798403109596e-05, + "loss": 5.5732, + "step": 7254 + }, + { + "epoch": 0.04314754020363498, + "grad_norm": 1.7954564094543457, + "learning_rate": 4.977073529366244e-05, + "loss": 5.4213, + "step": 7255 + }, + { + "epoch": 0.04315348748691598, + "grad_norm": 1.993961215019226, + "learning_rate": 4.977067217556807e-05, + "loss": 5.2909, + "step": 7256 + }, + { + "epoch": 0.04315943477019697, + "grad_norm": 1.6993632316589355, + "learning_rate": 4.977060904882651e-05, + "loss": 5.4523, + "step": 7257 + }, + { + "epoch": 0.043165382053477974, + "grad_norm": 1.8541932106018066, + "learning_rate": 4.977054591343779e-05, + "loss": 5.3182, + "step": 7258 + }, + { + "epoch": 0.04317132933675897, + "grad_norm": 1.7425625324249268, + "learning_rate": 4.9770482769401935e-05, + "loss": 5.2527, + "step": 7259 + }, + { + "epoch": 0.043177276620039964, + "grad_norm": 1.7028024196624756, + "learning_rate": 4.9770419616718955e-05, + "loss": 5.1305, + "step": 7260 + }, + { + "epoch": 0.043183223903320966, + "grad_norm": 1.745316982269287, + "learning_rate": 4.977035645538888e-05, + "loss": 5.0368, + "step": 7261 + }, + { + "epoch": 0.04318917118660196, + "grad_norm": 1.8373509645462036, + "learning_rate": 4.977029328541173e-05, + "loss": 5.353, + "step": 7262 + }, + { + "epoch": 0.04319511846988296, + "grad_norm": 1.9976449012756348, + "learning_rate": 4.9770230106787526e-05, + "loss": 5.363, + "step": 7263 + }, + { + "epoch": 0.04320106575316395, + "grad_norm": 1.7109822034835815, + "learning_rate": 4.977016691951629e-05, + "loss": 5.3462, + "step": 7264 + }, + { + "epoch": 0.043207013036444954, + "grad_norm": 1.8688478469848633, + "learning_rate": 4.9770103723598036e-05, + "loss": 5.3564, + "step": 7265 + }, + { + "epoch": 0.04321296031972595, + "grad_norm": 1.8680217266082764, + "learning_rate": 4.9770040519032804e-05, + "loss": 5.2713, + "step": 7266 + }, + { + "epoch": 0.043218907603006944, + "grad_norm": 1.8022522926330566, + "learning_rate": 4.976997730582061e-05, + "loss": 5.153, + "step": 7267 + }, + { + "epoch": 0.043224854886287946, + "grad_norm": 1.7128162384033203, + "learning_rate": 4.976991408396147e-05, + "loss": 5.3107, + "step": 7268 + }, + { + "epoch": 0.04323080216956894, + "grad_norm": 1.8222606182098389, + "learning_rate": 4.9769850853455404e-05, + "loss": 5.3599, + "step": 7269 + }, + { + "epoch": 0.043236749452849936, + "grad_norm": 1.829373836517334, + "learning_rate": 4.976978761430244e-05, + "loss": 5.3991, + "step": 7270 + }, + { + "epoch": 0.04324269673613094, + "grad_norm": 1.8270717859268188, + "learning_rate": 4.97697243665026e-05, + "loss": 5.2434, + "step": 7271 + }, + { + "epoch": 0.04324864401941193, + "grad_norm": 1.9759695529937744, + "learning_rate": 4.976966111005591e-05, + "loss": 5.4585, + "step": 7272 + }, + { + "epoch": 0.04325459130269293, + "grad_norm": 2.0235564708709717, + "learning_rate": 4.9769597844962376e-05, + "loss": 5.3996, + "step": 7273 + }, + { + "epoch": 0.04326053858597393, + "grad_norm": 1.9220880270004272, + "learning_rate": 4.976953457122204e-05, + "loss": 5.344, + "step": 7274 + }, + { + "epoch": 0.043266485869254925, + "grad_norm": 1.6257338523864746, + "learning_rate": 4.976947128883492e-05, + "loss": 5.4012, + "step": 7275 + }, + { + "epoch": 0.04327243315253592, + "grad_norm": 1.6390771865844727, + "learning_rate": 4.976940799780103e-05, + "loss": 5.3693, + "step": 7276 + }, + { + "epoch": 0.043278380435816916, + "grad_norm": 1.5769712924957275, + "learning_rate": 4.976934469812039e-05, + "loss": 5.3214, + "step": 7277 + }, + { + "epoch": 0.04328432771909792, + "grad_norm": 1.539920687675476, + "learning_rate": 4.9769281389793035e-05, + "loss": 5.2784, + "step": 7278 + }, + { + "epoch": 0.04329027500237891, + "grad_norm": 1.662835717201233, + "learning_rate": 4.976921807281897e-05, + "loss": 5.2717, + "step": 7279 + }, + { + "epoch": 0.04329622228565991, + "grad_norm": 1.3613345623016357, + "learning_rate": 4.9769154747198234e-05, + "loss": 5.4241, + "step": 7280 + }, + { + "epoch": 0.04330216956894091, + "grad_norm": 1.5267658233642578, + "learning_rate": 4.976909141293084e-05, + "loss": 5.454, + "step": 7281 + }, + { + "epoch": 0.043308116852221905, + "grad_norm": 1.5050435066223145, + "learning_rate": 4.976902807001681e-05, + "loss": 5.4975, + "step": 7282 + }, + { + "epoch": 0.0433140641355029, + "grad_norm": 1.292698621749878, + "learning_rate": 4.976896471845617e-05, + "loss": 5.4071, + "step": 7283 + }, + { + "epoch": 0.0433200114187839, + "grad_norm": 1.6818265914916992, + "learning_rate": 4.9768901358248946e-05, + "loss": 5.3561, + "step": 7284 + }, + { + "epoch": 0.0433259587020649, + "grad_norm": 1.5995383262634277, + "learning_rate": 4.976883798939515e-05, + "loss": 5.2623, + "step": 7285 + }, + { + "epoch": 0.04333190598534589, + "grad_norm": 1.6959342956542969, + "learning_rate": 4.976877461189481e-05, + "loss": 5.3193, + "step": 7286 + }, + { + "epoch": 0.043337853268626894, + "grad_norm": 1.6978071928024292, + "learning_rate": 4.976871122574794e-05, + "loss": 5.5653, + "step": 7287 + }, + { + "epoch": 0.04334380055190789, + "grad_norm": 1.7587183713912964, + "learning_rate": 4.976864783095457e-05, + "loss": 5.545, + "step": 7288 + }, + { + "epoch": 0.043349747835188884, + "grad_norm": 1.6225430965423584, + "learning_rate": 4.976858442751473e-05, + "loss": 5.5804, + "step": 7289 + }, + { + "epoch": 0.043355695118469886, + "grad_norm": 1.5895410776138306, + "learning_rate": 4.976852101542843e-05, + "loss": 5.4798, + "step": 7290 + }, + { + "epoch": 0.04336164240175088, + "grad_norm": 1.759022831916809, + "learning_rate": 4.976845759469569e-05, + "loss": 5.4794, + "step": 7291 + }, + { + "epoch": 0.043367589685031877, + "grad_norm": 1.483383059501648, + "learning_rate": 4.976839416531654e-05, + "loss": 5.2547, + "step": 7292 + }, + { + "epoch": 0.04337353696831287, + "grad_norm": 2.136172294616699, + "learning_rate": 4.9768330727291e-05, + "loss": 5.1655, + "step": 7293 + }, + { + "epoch": 0.043379484251593874, + "grad_norm": 1.9202553033828735, + "learning_rate": 4.9768267280619094e-05, + "loss": 5.1945, + "step": 7294 + }, + { + "epoch": 0.04338543153487487, + "grad_norm": 1.7927708625793457, + "learning_rate": 4.976820382530084e-05, + "loss": 5.4936, + "step": 7295 + }, + { + "epoch": 0.043391378818155864, + "grad_norm": 1.597887396812439, + "learning_rate": 4.976814036133626e-05, + "loss": 5.5516, + "step": 7296 + }, + { + "epoch": 0.043397326101436866, + "grad_norm": 1.493356466293335, + "learning_rate": 4.9768076888725376e-05, + "loss": 5.552, + "step": 7297 + }, + { + "epoch": 0.04340327338471786, + "grad_norm": 1.6748720407485962, + "learning_rate": 4.976801340746822e-05, + "loss": 5.3957, + "step": 7298 + }, + { + "epoch": 0.043409220667998856, + "grad_norm": 1.541945457458496, + "learning_rate": 4.9767949917564794e-05, + "loss": 5.5558, + "step": 7299 + }, + { + "epoch": 0.04341516795127986, + "grad_norm": 1.6436586380004883, + "learning_rate": 4.976788641901514e-05, + "loss": 5.4918, + "step": 7300 + }, + { + "epoch": 0.04342111523456085, + "grad_norm": 1.69910728931427, + "learning_rate": 4.9767822911819274e-05, + "loss": 5.4688, + "step": 7301 + }, + { + "epoch": 0.04342706251784185, + "grad_norm": 1.8294274806976318, + "learning_rate": 4.976775939597721e-05, + "loss": 5.505, + "step": 7302 + }, + { + "epoch": 0.04343300980112285, + "grad_norm": 1.720880389213562, + "learning_rate": 4.976769587148899e-05, + "loss": 5.3509, + "step": 7303 + }, + { + "epoch": 0.043438957084403845, + "grad_norm": 1.5898194313049316, + "learning_rate": 4.976763233835461e-05, + "loss": 5.2955, + "step": 7304 + }, + { + "epoch": 0.04344490436768484, + "grad_norm": 1.569218397140503, + "learning_rate": 4.976756879657412e-05, + "loss": 5.5695, + "step": 7305 + }, + { + "epoch": 0.043450851650965835, + "grad_norm": 1.5551841259002686, + "learning_rate": 4.976750524614752e-05, + "loss": 5.5313, + "step": 7306 + }, + { + "epoch": 0.04345679893424684, + "grad_norm": 1.5870057344436646, + "learning_rate": 4.9767441687074834e-05, + "loss": 5.7525, + "step": 7307 + }, + { + "epoch": 0.04346274621752783, + "grad_norm": 1.5421022176742554, + "learning_rate": 4.97673781193561e-05, + "loss": 5.6176, + "step": 7308 + }, + { + "epoch": 0.04346869350080883, + "grad_norm": 1.9368326663970947, + "learning_rate": 4.976731454299132e-05, + "loss": 5.4239, + "step": 7309 + }, + { + "epoch": 0.04347464078408983, + "grad_norm": 1.719084620475769, + "learning_rate": 4.976725095798053e-05, + "loss": 5.3526, + "step": 7310 + }, + { + "epoch": 0.043480588067370825, + "grad_norm": 1.8004268407821655, + "learning_rate": 4.9767187364323756e-05, + "loss": 5.7112, + "step": 7311 + }, + { + "epoch": 0.04348653535065182, + "grad_norm": 1.9922735691070557, + "learning_rate": 4.9767123762021003e-05, + "loss": 5.4993, + "step": 7312 + }, + { + "epoch": 0.04349248263393282, + "grad_norm": 1.6768959760665894, + "learning_rate": 4.976706015107231e-05, + "loss": 5.4713, + "step": 7313 + }, + { + "epoch": 0.04349842991721382, + "grad_norm": 1.6070122718811035, + "learning_rate": 4.976699653147768e-05, + "loss": 5.4695, + "step": 7314 + }, + { + "epoch": 0.04350437720049481, + "grad_norm": 1.5641200542449951, + "learning_rate": 4.976693290323716e-05, + "loss": 5.3596, + "step": 7315 + }, + { + "epoch": 0.043510324483775814, + "grad_norm": 3.0344419479370117, + "learning_rate": 4.976686926635076e-05, + "loss": 5.7371, + "step": 7316 + }, + { + "epoch": 0.04351627176705681, + "grad_norm": 1.8784242868423462, + "learning_rate": 4.9766805620818494e-05, + "loss": 5.5142, + "step": 7317 + }, + { + "epoch": 0.043522219050337804, + "grad_norm": 2.0644166469573975, + "learning_rate": 4.9766741966640394e-05, + "loss": 5.276, + "step": 7318 + }, + { + "epoch": 0.043528166333618806, + "grad_norm": 1.8128771781921387, + "learning_rate": 4.976667830381649e-05, + "loss": 5.3515, + "step": 7319 + }, + { + "epoch": 0.0435341136168998, + "grad_norm": 1.8899081945419312, + "learning_rate": 4.9766614632346786e-05, + "loss": 5.3981, + "step": 7320 + }, + { + "epoch": 0.043540060900180796, + "grad_norm": 1.89181649684906, + "learning_rate": 4.976655095223131e-05, + "loss": 5.4378, + "step": 7321 + }, + { + "epoch": 0.04354600818346179, + "grad_norm": 1.6332184076309204, + "learning_rate": 4.976648726347009e-05, + "loss": 5.4023, + "step": 7322 + }, + { + "epoch": 0.043551955466742794, + "grad_norm": 2.3459293842315674, + "learning_rate": 4.976642356606315e-05, + "loss": 5.8375, + "step": 7323 + }, + { + "epoch": 0.04355790275002379, + "grad_norm": 2.029244899749756, + "learning_rate": 4.97663598600105e-05, + "loss": 5.5617, + "step": 7324 + }, + { + "epoch": 0.043563850033304784, + "grad_norm": 2.138946056365967, + "learning_rate": 4.9766296145312175e-05, + "loss": 5.5076, + "step": 7325 + }, + { + "epoch": 0.043569797316585786, + "grad_norm": 1.8702884912490845, + "learning_rate": 4.9766232421968184e-05, + "loss": 5.123, + "step": 7326 + }, + { + "epoch": 0.04357574459986678, + "grad_norm": 1.8917137384414673, + "learning_rate": 4.976616868997856e-05, + "loss": 5.4809, + "step": 7327 + }, + { + "epoch": 0.043581691883147776, + "grad_norm": 2.2203474044799805, + "learning_rate": 4.976610494934333e-05, + "loss": 5.6359, + "step": 7328 + }, + { + "epoch": 0.04358763916642878, + "grad_norm": 2.4505302906036377, + "learning_rate": 4.976604120006251e-05, + "loss": 6.1423, + "step": 7329 + }, + { + "epoch": 0.04359358644970977, + "grad_norm": 2.4601128101348877, + "learning_rate": 4.976597744213611e-05, + "loss": 6.0908, + "step": 7330 + }, + { + "epoch": 0.04359953373299077, + "grad_norm": 1.9502687454223633, + "learning_rate": 4.976591367556417e-05, + "loss": 5.918, + "step": 7331 + }, + { + "epoch": 0.04360548101627177, + "grad_norm": 2.180250644683838, + "learning_rate": 4.9765849900346696e-05, + "loss": 5.7203, + "step": 7332 + }, + { + "epoch": 0.043611428299552765, + "grad_norm": 2.125669002532959, + "learning_rate": 4.9765786116483726e-05, + "loss": 5.7875, + "step": 7333 + }, + { + "epoch": 0.04361737558283376, + "grad_norm": 2.0372321605682373, + "learning_rate": 4.9765722323975286e-05, + "loss": 5.6777, + "step": 7334 + }, + { + "epoch": 0.043623322866114755, + "grad_norm": 2.5857362747192383, + "learning_rate": 4.976565852282137e-05, + "loss": 5.2989, + "step": 7335 + }, + { + "epoch": 0.04362927014939576, + "grad_norm": 2.5774800777435303, + "learning_rate": 4.976559471302203e-05, + "loss": 6.0479, + "step": 7336 + }, + { + "epoch": 0.04363521743267675, + "grad_norm": 2.0820937156677246, + "learning_rate": 4.976553089457727e-05, + "loss": 5.7636, + "step": 7337 + }, + { + "epoch": 0.04364116471595775, + "grad_norm": 2.287719964981079, + "learning_rate": 4.9765467067487126e-05, + "loss": 5.7706, + "step": 7338 + }, + { + "epoch": 0.04364711199923875, + "grad_norm": 2.6578378677368164, + "learning_rate": 4.9765403231751614e-05, + "loss": 6.1506, + "step": 7339 + }, + { + "epoch": 0.043653059282519745, + "grad_norm": 2.503955841064453, + "learning_rate": 4.976533938737075e-05, + "loss": 6.0658, + "step": 7340 + }, + { + "epoch": 0.04365900656580074, + "grad_norm": 2.28857684135437, + "learning_rate": 4.976527553434456e-05, + "loss": 5.833, + "step": 7341 + }, + { + "epoch": 0.04366495384908174, + "grad_norm": 2.327331781387329, + "learning_rate": 4.976521167267307e-05, + "loss": 5.934, + "step": 7342 + }, + { + "epoch": 0.04367090113236274, + "grad_norm": 1.7726761102676392, + "learning_rate": 4.976514780235631e-05, + "loss": 6.034, + "step": 7343 + }, + { + "epoch": 0.04367684841564373, + "grad_norm": 2.180790662765503, + "learning_rate": 4.9765083923394285e-05, + "loss": 6.1377, + "step": 7344 + }, + { + "epoch": 0.043682795698924734, + "grad_norm": 2.031378984451294, + "learning_rate": 4.9765020035787024e-05, + "loss": 5.7203, + "step": 7345 + }, + { + "epoch": 0.04368874298220573, + "grad_norm": 2.453611135482788, + "learning_rate": 4.9764956139534545e-05, + "loss": 5.9798, + "step": 7346 + }, + { + "epoch": 0.043694690265486724, + "grad_norm": 2.3802528381347656, + "learning_rate": 4.976489223463688e-05, + "loss": 5.9343, + "step": 7347 + }, + { + "epoch": 0.043700637548767726, + "grad_norm": 2.771704912185669, + "learning_rate": 4.976482832109406e-05, + "loss": 6.5202, + "step": 7348 + }, + { + "epoch": 0.04370658483204872, + "grad_norm": 1.9455180168151855, + "learning_rate": 4.9764764398906084e-05, + "loss": 6.1159, + "step": 7349 + }, + { + "epoch": 0.043712532115329716, + "grad_norm": 1.9527102708816528, + "learning_rate": 4.9764700468072976e-05, + "loss": 5.7773, + "step": 7350 + }, + { + "epoch": 0.04371847939861071, + "grad_norm": 1.9531358480453491, + "learning_rate": 4.976463652859478e-05, + "loss": 5.9918, + "step": 7351 + }, + { + "epoch": 0.043724426681891713, + "grad_norm": 2.375239849090576, + "learning_rate": 4.97645725804715e-05, + "loss": 5.5054, + "step": 7352 + }, + { + "epoch": 0.04373037396517271, + "grad_norm": 2.156553030014038, + "learning_rate": 4.9764508623703166e-05, + "loss": 5.664, + "step": 7353 + }, + { + "epoch": 0.043736321248453704, + "grad_norm": 2.317331075668335, + "learning_rate": 4.9764444658289796e-05, + "loss": 5.4473, + "step": 7354 + }, + { + "epoch": 0.043742268531734706, + "grad_norm": 2.1958348751068115, + "learning_rate": 4.976438068423141e-05, + "loss": 5.3584, + "step": 7355 + }, + { + "epoch": 0.0437482158150157, + "grad_norm": 2.152045249938965, + "learning_rate": 4.976431670152803e-05, + "loss": 5.4388, + "step": 7356 + }, + { + "epoch": 0.043754163098296696, + "grad_norm": 2.0661544799804688, + "learning_rate": 4.976425271017971e-05, + "loss": 5.3866, + "step": 7357 + }, + { + "epoch": 0.0437601103815777, + "grad_norm": 2.106480598449707, + "learning_rate": 4.976418871018642e-05, + "loss": 5.5928, + "step": 7358 + }, + { + "epoch": 0.04376605766485869, + "grad_norm": 2.5921759605407715, + "learning_rate": 4.976412470154821e-05, + "loss": 6.0133, + "step": 7359 + }, + { + "epoch": 0.04377200494813969, + "grad_norm": 2.4117794036865234, + "learning_rate": 4.97640606842651e-05, + "loss": 6.0988, + "step": 7360 + }, + { + "epoch": 0.04377795223142069, + "grad_norm": 1.9839050769805908, + "learning_rate": 4.976399665833712e-05, + "loss": 5.9568, + "step": 7361 + }, + { + "epoch": 0.043783899514701685, + "grad_norm": 2.166215419769287, + "learning_rate": 4.9763932623764285e-05, + "loss": 5.9205, + "step": 7362 + }, + { + "epoch": 0.04378984679798268, + "grad_norm": 2.8216545581817627, + "learning_rate": 4.9763868580546616e-05, + "loss": 5.792, + "step": 7363 + }, + { + "epoch": 0.043795794081263675, + "grad_norm": 2.907707929611206, + "learning_rate": 4.976380452868413e-05, + "loss": 5.5824, + "step": 7364 + }, + { + "epoch": 0.04380174136454468, + "grad_norm": 2.173025369644165, + "learning_rate": 4.976374046817686e-05, + "loss": 6.2752, + "step": 7365 + }, + { + "epoch": 0.04380768864782567, + "grad_norm": 2.1098685264587402, + "learning_rate": 4.9763676399024814e-05, + "loss": 5.8052, + "step": 7366 + }, + { + "epoch": 0.04381363593110667, + "grad_norm": 2.1980762481689453, + "learning_rate": 4.9763612321228035e-05, + "loss": 5.3456, + "step": 7367 + }, + { + "epoch": 0.04381958321438767, + "grad_norm": 2.091327667236328, + "learning_rate": 4.976354823478654e-05, + "loss": 5.211, + "step": 7368 + }, + { + "epoch": 0.043825530497668665, + "grad_norm": 2.37920880317688, + "learning_rate": 4.976348413970033e-05, + "loss": 5.8652, + "step": 7369 + }, + { + "epoch": 0.04383147778094966, + "grad_norm": 2.454202175140381, + "learning_rate": 4.976342003596946e-05, + "loss": 5.9654, + "step": 7370 + }, + { + "epoch": 0.04383742506423066, + "grad_norm": 2.04577898979187, + "learning_rate": 4.9763355923593927e-05, + "loss": 6.3042, + "step": 7371 + }, + { + "epoch": 0.04384337234751166, + "grad_norm": 2.358250141143799, + "learning_rate": 4.976329180257376e-05, + "loss": 6.1403, + "step": 7372 + }, + { + "epoch": 0.04384931963079265, + "grad_norm": 2.177819013595581, + "learning_rate": 4.9763227672909e-05, + "loss": 5.8993, + "step": 7373 + }, + { + "epoch": 0.043855266914073654, + "grad_norm": 2.24910569190979, + "learning_rate": 4.976316353459963e-05, + "loss": 5.9763, + "step": 7374 + }, + { + "epoch": 0.04386121419735465, + "grad_norm": 2.3985965251922607, + "learning_rate": 4.976309938764571e-05, + "loss": 6.2288, + "step": 7375 + }, + { + "epoch": 0.043867161480635644, + "grad_norm": 2.1250808238983154, + "learning_rate": 4.9763035232047244e-05, + "loss": 6.1588, + "step": 7376 + }, + { + "epoch": 0.043873108763916646, + "grad_norm": 1.9815669059753418, + "learning_rate": 4.976297106780426e-05, + "loss": 6.3202, + "step": 7377 + }, + { + "epoch": 0.04387905604719764, + "grad_norm": 2.181999683380127, + "learning_rate": 4.976290689491677e-05, + "loss": 5.9125, + "step": 7378 + }, + { + "epoch": 0.043885003330478636, + "grad_norm": 2.365546703338623, + "learning_rate": 4.9762842713384815e-05, + "loss": 6.0991, + "step": 7379 + }, + { + "epoch": 0.04389095061375963, + "grad_norm": 2.0843441486358643, + "learning_rate": 4.9762778523208406e-05, + "loss": 5.9675, + "step": 7380 + }, + { + "epoch": 0.04389689789704063, + "grad_norm": 2.271576404571533, + "learning_rate": 4.9762714324387566e-05, + "loss": 5.5703, + "step": 7381 + }, + { + "epoch": 0.04390284518032163, + "grad_norm": 2.244211435317993, + "learning_rate": 4.9762650116922314e-05, + "loss": 5.4674, + "step": 7382 + }, + { + "epoch": 0.043908792463602624, + "grad_norm": 1.728034257888794, + "learning_rate": 4.9762585900812684e-05, + "loss": 5.6264, + "step": 7383 + }, + { + "epoch": 0.043914739746883626, + "grad_norm": 2.400587320327759, + "learning_rate": 4.976252167605869e-05, + "loss": 6.052, + "step": 7384 + }, + { + "epoch": 0.04392068703016462, + "grad_norm": 1.9865821599960327, + "learning_rate": 4.9762457442660346e-05, + "loss": 5.8544, + "step": 7385 + }, + { + "epoch": 0.043926634313445616, + "grad_norm": 2.236527681350708, + "learning_rate": 4.97623932006177e-05, + "loss": 5.5033, + "step": 7386 + }, + { + "epoch": 0.04393258159672662, + "grad_norm": 2.0424020290374756, + "learning_rate": 4.9762328949930746e-05, + "loss": 5.4088, + "step": 7387 + }, + { + "epoch": 0.04393852888000761, + "grad_norm": 2.0601999759674072, + "learning_rate": 4.976226469059952e-05, + "loss": 5.8599, + "step": 7388 + }, + { + "epoch": 0.04394447616328861, + "grad_norm": 2.5052783489227295, + "learning_rate": 4.976220042262404e-05, + "loss": 5.8202, + "step": 7389 + }, + { + "epoch": 0.04395042344656961, + "grad_norm": 2.178549289703369, + "learning_rate": 4.9762136146004344e-05, + "loss": 5.4554, + "step": 7390 + }, + { + "epoch": 0.043956370729850605, + "grad_norm": 1.9407802820205688, + "learning_rate": 4.976207186074043e-05, + "loss": 5.4062, + "step": 7391 + }, + { + "epoch": 0.0439623180131316, + "grad_norm": 1.4814093112945557, + "learning_rate": 4.9762007566832336e-05, + "loss": 5.4662, + "step": 7392 + }, + { + "epoch": 0.043968265296412595, + "grad_norm": 1.8808835744857788, + "learning_rate": 4.9761943264280086e-05, + "loss": 6.1617, + "step": 7393 + }, + { + "epoch": 0.0439742125796936, + "grad_norm": 1.9318643808364868, + "learning_rate": 4.97618789530837e-05, + "loss": 6.1357, + "step": 7394 + }, + { + "epoch": 0.04398015986297459, + "grad_norm": 2.2515900135040283, + "learning_rate": 4.976181463324319e-05, + "loss": 6.11, + "step": 7395 + }, + { + "epoch": 0.04398610714625559, + "grad_norm": 2.375298023223877, + "learning_rate": 4.9761750304758584e-05, + "loss": 6.1121, + "step": 7396 + }, + { + "epoch": 0.04399205442953659, + "grad_norm": 2.2254321575164795, + "learning_rate": 4.9761685967629914e-05, + "loss": 6.0136, + "step": 7397 + }, + { + "epoch": 0.043998001712817585, + "grad_norm": 2.146164894104004, + "learning_rate": 4.976162162185719e-05, + "loss": 5.8391, + "step": 7398 + }, + { + "epoch": 0.04400394899609858, + "grad_norm": 2.3237650394439697, + "learning_rate": 4.976155726744044e-05, + "loss": 5.461, + "step": 7399 + }, + { + "epoch": 0.04400989627937958, + "grad_norm": 2.2263002395629883, + "learning_rate": 4.976149290437969e-05, + "loss": 5.5885, + "step": 7400 + }, + { + "epoch": 0.04401584356266058, + "grad_norm": 1.9597729444503784, + "learning_rate": 4.9761428532674956e-05, + "loss": 5.348, + "step": 7401 + }, + { + "epoch": 0.04402179084594157, + "grad_norm": 2.2215018272399902, + "learning_rate": 4.976136415232626e-05, + "loss": 5.933, + "step": 7402 + }, + { + "epoch": 0.044027738129222574, + "grad_norm": 2.258618116378784, + "learning_rate": 4.9761299763333635e-05, + "loss": 6.0685, + "step": 7403 + }, + { + "epoch": 0.04403368541250357, + "grad_norm": 2.3045873641967773, + "learning_rate": 4.976123536569709e-05, + "loss": 5.7277, + "step": 7404 + }, + { + "epoch": 0.044039632695784564, + "grad_norm": 2.546252489089966, + "learning_rate": 4.976117095941666e-05, + "loss": 5.8839, + "step": 7405 + }, + { + "epoch": 0.044045579979065566, + "grad_norm": 1.8963768482208252, + "learning_rate": 4.976110654449235e-05, + "loss": 6.1247, + "step": 7406 + }, + { + "epoch": 0.04405152726234656, + "grad_norm": 2.6287784576416016, + "learning_rate": 4.976104212092421e-05, + "loss": 5.9712, + "step": 7407 + }, + { + "epoch": 0.044057474545627556, + "grad_norm": 2.562612295150757, + "learning_rate": 4.976097768871223e-05, + "loss": 6.1226, + "step": 7408 + }, + { + "epoch": 0.04406342182890855, + "grad_norm": 2.2308688163757324, + "learning_rate": 4.976091324785645e-05, + "loss": 6.3235, + "step": 7409 + }, + { + "epoch": 0.04406936911218955, + "grad_norm": 2.4595553874969482, + "learning_rate": 4.976084879835691e-05, + "loss": 5.8164, + "step": 7410 + }, + { + "epoch": 0.04407531639547055, + "grad_norm": 2.3693978786468506, + "learning_rate": 4.97607843402136e-05, + "loss": 5.7727, + "step": 7411 + }, + { + "epoch": 0.044081263678751544, + "grad_norm": 4.144592761993408, + "learning_rate": 4.9760719873426546e-05, + "loss": 5.6382, + "step": 7412 + }, + { + "epoch": 0.044087210962032546, + "grad_norm": 2.5423779487609863, + "learning_rate": 4.9760655397995794e-05, + "loss": 5.7526, + "step": 7413 + }, + { + "epoch": 0.04409315824531354, + "grad_norm": 2.119281053543091, + "learning_rate": 4.976059091392135e-05, + "loss": 5.7246, + "step": 7414 + }, + { + "epoch": 0.044099105528594536, + "grad_norm": 2.177074432373047, + "learning_rate": 4.976052642120324e-05, + "loss": 5.7296, + "step": 7415 + }, + { + "epoch": 0.04410505281187554, + "grad_norm": 1.8897806406021118, + "learning_rate": 4.9760461919841486e-05, + "loss": 5.6349, + "step": 7416 + }, + { + "epoch": 0.04411100009515653, + "grad_norm": 2.445082187652588, + "learning_rate": 4.97603974098361e-05, + "loss": 5.7414, + "step": 7417 + }, + { + "epoch": 0.04411694737843753, + "grad_norm": 2.2564280033111572, + "learning_rate": 4.976033289118713e-05, + "loss": 5.6709, + "step": 7418 + }, + { + "epoch": 0.04412289466171853, + "grad_norm": 2.1907529830932617, + "learning_rate": 4.976026836389458e-05, + "loss": 5.6067, + "step": 7419 + }, + { + "epoch": 0.044128841944999525, + "grad_norm": 2.1872594356536865, + "learning_rate": 4.976020382795848e-05, + "loss": 5.5166, + "step": 7420 + }, + { + "epoch": 0.04413478922828052, + "grad_norm": 1.7740691900253296, + "learning_rate": 4.9760139283378835e-05, + "loss": 5.5833, + "step": 7421 + }, + { + "epoch": 0.044140736511561515, + "grad_norm": 2.128389358520508, + "learning_rate": 4.976007473015569e-05, + "loss": 5.6403, + "step": 7422 + }, + { + "epoch": 0.04414668379484252, + "grad_norm": 2.6193220615386963, + "learning_rate": 4.9760010168289053e-05, + "loss": 5.8139, + "step": 7423 + }, + { + "epoch": 0.04415263107812351, + "grad_norm": 2.727902412414551, + "learning_rate": 4.9759945597778955e-05, + "loss": 5.3286, + "step": 7424 + }, + { + "epoch": 0.04415857836140451, + "grad_norm": 2.4500436782836914, + "learning_rate": 4.975988101862542e-05, + "loss": 5.2647, + "step": 7425 + }, + { + "epoch": 0.04416452564468551, + "grad_norm": 2.1040356159210205, + "learning_rate": 4.975981643082846e-05, + "loss": 6.0935, + "step": 7426 + }, + { + "epoch": 0.044170472927966505, + "grad_norm": 1.9168792963027954, + "learning_rate": 4.975975183438811e-05, + "loss": 5.5147, + "step": 7427 + }, + { + "epoch": 0.0441764202112475, + "grad_norm": 2.0156469345092773, + "learning_rate": 4.9759687229304384e-05, + "loss": 6.2896, + "step": 7428 + }, + { + "epoch": 0.0441823674945285, + "grad_norm": 2.362933874130249, + "learning_rate": 4.975962261557731e-05, + "loss": 5.9514, + "step": 7429 + }, + { + "epoch": 0.0441883147778095, + "grad_norm": 2.2892727851867676, + "learning_rate": 4.9759557993206906e-05, + "loss": 5.5646, + "step": 7430 + }, + { + "epoch": 0.04419426206109049, + "grad_norm": 2.287722587585449, + "learning_rate": 4.97594933621932e-05, + "loss": 5.364, + "step": 7431 + }, + { + "epoch": 0.044200209344371494, + "grad_norm": 2.0421855449676514, + "learning_rate": 4.9759428722536194e-05, + "loss": 5.6838, + "step": 7432 + }, + { + "epoch": 0.04420615662765249, + "grad_norm": 2.2392499446868896, + "learning_rate": 4.9759364074235944e-05, + "loss": 6.0727, + "step": 7433 + }, + { + "epoch": 0.044212103910933484, + "grad_norm": 2.084768295288086, + "learning_rate": 4.975929941729245e-05, + "loss": 6.1208, + "step": 7434 + }, + { + "epoch": 0.044218051194214486, + "grad_norm": 1.817015528678894, + "learning_rate": 4.975923475170574e-05, + "loss": 6.3405, + "step": 7435 + }, + { + "epoch": 0.04422399847749548, + "grad_norm": 1.974926233291626, + "learning_rate": 4.9759170077475834e-05, + "loss": 5.9607, + "step": 7436 + }, + { + "epoch": 0.044229945760776476, + "grad_norm": 2.1244025230407715, + "learning_rate": 4.975910539460277e-05, + "loss": 6.2579, + "step": 7437 + }, + { + "epoch": 0.04423589304405747, + "grad_norm": 1.9459706544876099, + "learning_rate": 4.975904070308655e-05, + "loss": 5.5877, + "step": 7438 + }, + { + "epoch": 0.04424184032733847, + "grad_norm": 2.1891977787017822, + "learning_rate": 4.97589760029272e-05, + "loss": 5.9913, + "step": 7439 + }, + { + "epoch": 0.04424778761061947, + "grad_norm": 2.0368902683258057, + "learning_rate": 4.9758911294124756e-05, + "loss": 5.9478, + "step": 7440 + }, + { + "epoch": 0.044253734893900463, + "grad_norm": 2.2937796115875244, + "learning_rate": 4.975884657667922e-05, + "loss": 6.1529, + "step": 7441 + }, + { + "epoch": 0.044259682177181466, + "grad_norm": 2.601637125015259, + "learning_rate": 4.975878185059064e-05, + "loss": 5.4446, + "step": 7442 + }, + { + "epoch": 0.04426562946046246, + "grad_norm": 2.2025954723358154, + "learning_rate": 4.975871711585902e-05, + "loss": 5.8911, + "step": 7443 + }, + { + "epoch": 0.044271576743743456, + "grad_norm": 2.0498836040496826, + "learning_rate": 4.975865237248438e-05, + "loss": 6.0604, + "step": 7444 + }, + { + "epoch": 0.04427752402702446, + "grad_norm": 2.308239459991455, + "learning_rate": 4.975858762046676e-05, + "loss": 5.9599, + "step": 7445 + }, + { + "epoch": 0.04428347131030545, + "grad_norm": 2.286747455596924, + "learning_rate": 4.9758522859806165e-05, + "loss": 6.3528, + "step": 7446 + }, + { + "epoch": 0.04428941859358645, + "grad_norm": 2.2376902103424072, + "learning_rate": 4.975845809050264e-05, + "loss": 6.205, + "step": 7447 + }, + { + "epoch": 0.04429536587686745, + "grad_norm": 1.8052057027816772, + "learning_rate": 4.9758393312556176e-05, + "loss": 6.2188, + "step": 7448 + }, + { + "epoch": 0.044301313160148445, + "grad_norm": 1.9839476346969604, + "learning_rate": 4.975832852596682e-05, + "loss": 6.1479, + "step": 7449 + }, + { + "epoch": 0.04430726044342944, + "grad_norm": 1.8890517950057983, + "learning_rate": 4.975826373073459e-05, + "loss": 6.2524, + "step": 7450 + }, + { + "epoch": 0.04431320772671044, + "grad_norm": 2.049192428588867, + "learning_rate": 4.97581989268595e-05, + "loss": 5.5486, + "step": 7451 + }, + { + "epoch": 0.04431915500999144, + "grad_norm": 2.8271291255950928, + "learning_rate": 4.975813411434158e-05, + "loss": 5.1916, + "step": 7452 + }, + { + "epoch": 0.04432510229327243, + "grad_norm": 1.94833505153656, + "learning_rate": 4.975806929318085e-05, + "loss": 5.6747, + "step": 7453 + }, + { + "epoch": 0.04433104957655343, + "grad_norm": 2.14536190032959, + "learning_rate": 4.975800446337734e-05, + "loss": 5.4066, + "step": 7454 + }, + { + "epoch": 0.04433699685983443, + "grad_norm": 2.5557188987731934, + "learning_rate": 4.975793962493106e-05, + "loss": 5.2257, + "step": 7455 + }, + { + "epoch": 0.044342944143115424, + "grad_norm": 2.4718832969665527, + "learning_rate": 4.975787477784205e-05, + "loss": 6.0248, + "step": 7456 + }, + { + "epoch": 0.04434889142639642, + "grad_norm": 2.8627419471740723, + "learning_rate": 4.975780992211031e-05, + "loss": 5.3245, + "step": 7457 + }, + { + "epoch": 0.04435483870967742, + "grad_norm": 2.932990789413452, + "learning_rate": 4.9757745057735876e-05, + "loss": 4.8914, + "step": 7458 + }, + { + "epoch": 0.04436078599295842, + "grad_norm": 2.6231770515441895, + "learning_rate": 4.975768018471877e-05, + "loss": 5.3323, + "step": 7459 + }, + { + "epoch": 0.04436673327623941, + "grad_norm": 2.5591986179351807, + "learning_rate": 4.975761530305901e-05, + "loss": 5.4972, + "step": 7460 + }, + { + "epoch": 0.044372680559520414, + "grad_norm": 2.4060492515563965, + "learning_rate": 4.975755041275664e-05, + "loss": 5.5988, + "step": 7461 + }, + { + "epoch": 0.04437862784280141, + "grad_norm": 2.377260446548462, + "learning_rate": 4.975748551381164e-05, + "loss": 5.2137, + "step": 7462 + }, + { + "epoch": 0.044384575126082404, + "grad_norm": 2.171934127807617, + "learning_rate": 4.9757420606224076e-05, + "loss": 5.6313, + "step": 7463 + }, + { + "epoch": 0.044390522409363406, + "grad_norm": 2.1225788593292236, + "learning_rate": 4.975735568999394e-05, + "loss": 5.839, + "step": 7464 + }, + { + "epoch": 0.0443964696926444, + "grad_norm": 2.271127939224243, + "learning_rate": 4.975729076512128e-05, + "loss": 5.7111, + "step": 7465 + }, + { + "epoch": 0.044402416975925396, + "grad_norm": 2.7138264179229736, + "learning_rate": 4.975722583160609e-05, + "loss": 5.3169, + "step": 7466 + }, + { + "epoch": 0.04440836425920639, + "grad_norm": 2.8181982040405273, + "learning_rate": 4.9757160889448416e-05, + "loss": 5.3323, + "step": 7467 + }, + { + "epoch": 0.04441431154248739, + "grad_norm": 2.680816411972046, + "learning_rate": 4.975709593864828e-05, + "loss": 5.6924, + "step": 7468 + }, + { + "epoch": 0.04442025882576839, + "grad_norm": 2.3682074546813965, + "learning_rate": 4.975703097920569e-05, + "loss": 6.0049, + "step": 7469 + }, + { + "epoch": 0.04442620610904938, + "grad_norm": 2.3080508708953857, + "learning_rate": 4.9756966011120674e-05, + "loss": 6.4438, + "step": 7470 + }, + { + "epoch": 0.044432153392330385, + "grad_norm": 2.2631113529205322, + "learning_rate": 4.9756901034393265e-05, + "loss": 5.9296, + "step": 7471 + }, + { + "epoch": 0.04443810067561138, + "grad_norm": 2.283712148666382, + "learning_rate": 4.975683604902347e-05, + "loss": 5.831, + "step": 7472 + }, + { + "epoch": 0.044444047958892376, + "grad_norm": 2.2130608558654785, + "learning_rate": 4.975677105501132e-05, + "loss": 5.8757, + "step": 7473 + }, + { + "epoch": 0.04444999524217338, + "grad_norm": 1.9392763376235962, + "learning_rate": 4.975670605235684e-05, + "loss": 5.5836, + "step": 7474 + }, + { + "epoch": 0.04445594252545437, + "grad_norm": 2.097076416015625, + "learning_rate": 4.975664104106005e-05, + "loss": 6.0782, + "step": 7475 + }, + { + "epoch": 0.04446188980873537, + "grad_norm": 2.063021183013916, + "learning_rate": 4.975657602112097e-05, + "loss": 6.2171, + "step": 7476 + }, + { + "epoch": 0.04446783709201637, + "grad_norm": 2.4466049671173096, + "learning_rate": 4.9756510992539626e-05, + "loss": 5.8649, + "step": 7477 + }, + { + "epoch": 0.044473784375297365, + "grad_norm": 2.2160751819610596, + "learning_rate": 4.975644595531605e-05, + "loss": 5.9297, + "step": 7478 + }, + { + "epoch": 0.04447973165857836, + "grad_norm": 2.69352650642395, + "learning_rate": 4.975638090945024e-05, + "loss": 6.1062, + "step": 7479 + }, + { + "epoch": 0.04448567894185936, + "grad_norm": 2.2830610275268555, + "learning_rate": 4.975631585494224e-05, + "loss": 6.1663, + "step": 7480 + }, + { + "epoch": 0.04449162622514036, + "grad_norm": 2.936842203140259, + "learning_rate": 4.975625079179206e-05, + "loss": 5.9952, + "step": 7481 + }, + { + "epoch": 0.04449757350842135, + "grad_norm": 2.1398322582244873, + "learning_rate": 4.9756185719999725e-05, + "loss": 6.0005, + "step": 7482 + }, + { + "epoch": 0.04450352079170235, + "grad_norm": 2.2835536003112793, + "learning_rate": 4.9756120639565275e-05, + "loss": 5.7155, + "step": 7483 + }, + { + "epoch": 0.04450946807498335, + "grad_norm": 2.22917103767395, + "learning_rate": 4.975605555048871e-05, + "loss": 5.7134, + "step": 7484 + }, + { + "epoch": 0.044515415358264344, + "grad_norm": 2.0195605754852295, + "learning_rate": 4.975599045277006e-05, + "loss": 5.6369, + "step": 7485 + }, + { + "epoch": 0.04452136264154534, + "grad_norm": 1.8495477437973022, + "learning_rate": 4.975592534640936e-05, + "loss": 5.9035, + "step": 7486 + }, + { + "epoch": 0.04452730992482634, + "grad_norm": 2.4814226627349854, + "learning_rate": 4.9755860231406616e-05, + "loss": 6.1024, + "step": 7487 + }, + { + "epoch": 0.04453325720810734, + "grad_norm": 2.221820831298828, + "learning_rate": 4.975579510776186e-05, + "loss": 6.1193, + "step": 7488 + }, + { + "epoch": 0.04453920449138833, + "grad_norm": 1.935722827911377, + "learning_rate": 4.975572997547511e-05, + "loss": 6.1088, + "step": 7489 + }, + { + "epoch": 0.044545151774669334, + "grad_norm": 2.1287481784820557, + "learning_rate": 4.975566483454638e-05, + "loss": 6.1064, + "step": 7490 + }, + { + "epoch": 0.04455109905795033, + "grad_norm": 2.1914093494415283, + "learning_rate": 4.9755599684975716e-05, + "loss": 6.072, + "step": 7491 + }, + { + "epoch": 0.044557046341231324, + "grad_norm": 2.1979966163635254, + "learning_rate": 4.975553452676312e-05, + "loss": 6.1447, + "step": 7492 + }, + { + "epoch": 0.044562993624512326, + "grad_norm": 2.108259916305542, + "learning_rate": 4.975546935990863e-05, + "loss": 6.0109, + "step": 7493 + }, + { + "epoch": 0.04456894090779332, + "grad_norm": 2.2454450130462646, + "learning_rate": 4.975540418441226e-05, + "loss": 5.8627, + "step": 7494 + }, + { + "epoch": 0.044574888191074316, + "grad_norm": 2.151130437850952, + "learning_rate": 4.9755339000274027e-05, + "loss": 6.0241, + "step": 7495 + }, + { + "epoch": 0.04458083547435531, + "grad_norm": 1.9150489568710327, + "learning_rate": 4.975527380749397e-05, + "loss": 6.0179, + "step": 7496 + }, + { + "epoch": 0.04458678275763631, + "grad_norm": 1.9065133333206177, + "learning_rate": 4.97552086060721e-05, + "loss": 5.9991, + "step": 7497 + }, + { + "epoch": 0.04459273004091731, + "grad_norm": 1.9627622365951538, + "learning_rate": 4.975514339600844e-05, + "loss": 5.9633, + "step": 7498 + }, + { + "epoch": 0.0445986773241983, + "grad_norm": 1.7777502536773682, + "learning_rate": 4.975507817730302e-05, + "loss": 5.9426, + "step": 7499 + }, + { + "epoch": 0.044604624607479305, + "grad_norm": 1.6735023260116577, + "learning_rate": 4.9755012949955846e-05, + "loss": 5.9432, + "step": 7500 + }, + { + "epoch": 0.0446105718907603, + "grad_norm": 2.1570491790771484, + "learning_rate": 4.975494771396697e-05, + "loss": 6.2032, + "step": 7501 + }, + { + "epoch": 0.044616519174041296, + "grad_norm": 2.286522150039673, + "learning_rate": 4.9754882469336387e-05, + "loss": 5.7226, + "step": 7502 + }, + { + "epoch": 0.0446224664573223, + "grad_norm": 2.1940622329711914, + "learning_rate": 4.975481721606413e-05, + "loss": 6.2215, + "step": 7503 + }, + { + "epoch": 0.04462841374060329, + "grad_norm": 2.329263210296631, + "learning_rate": 4.9754751954150224e-05, + "loss": 5.5403, + "step": 7504 + }, + { + "epoch": 0.04463436102388429, + "grad_norm": 2.112712860107422, + "learning_rate": 4.975468668359469e-05, + "loss": 5.7581, + "step": 7505 + }, + { + "epoch": 0.04464030830716529, + "grad_norm": 2.2875239849090576, + "learning_rate": 4.975462140439755e-05, + "loss": 5.9593, + "step": 7506 + }, + { + "epoch": 0.044646255590446285, + "grad_norm": 2.282121419906616, + "learning_rate": 4.975455611655883e-05, + "loss": 5.8684, + "step": 7507 + }, + { + "epoch": 0.04465220287372728, + "grad_norm": 1.8482197523117065, + "learning_rate": 4.975449082007855e-05, + "loss": 5.753, + "step": 7508 + }, + { + "epoch": 0.04465815015700828, + "grad_norm": 2.6635684967041016, + "learning_rate": 4.9754425514956724e-05, + "loss": 5.0732, + "step": 7509 + }, + { + "epoch": 0.04466409744028928, + "grad_norm": 2.6632800102233887, + "learning_rate": 4.9754360201193395e-05, + "loss": 5.1644, + "step": 7510 + }, + { + "epoch": 0.04467004472357027, + "grad_norm": 2.630445718765259, + "learning_rate": 4.9754294878788574e-05, + "loss": 5.0322, + "step": 7511 + }, + { + "epoch": 0.04467599200685127, + "grad_norm": 2.4036223888397217, + "learning_rate": 4.975422954774228e-05, + "loss": 4.8949, + "step": 7512 + }, + { + "epoch": 0.04468193929013227, + "grad_norm": 2.381810426712036, + "learning_rate": 4.9754164208054535e-05, + "loss": 5.7921, + "step": 7513 + }, + { + "epoch": 0.044687886573413264, + "grad_norm": 2.570949077606201, + "learning_rate": 4.9754098859725377e-05, + "loss": 5.9612, + "step": 7514 + }, + { + "epoch": 0.04469383385669426, + "grad_norm": 2.510998010635376, + "learning_rate": 4.9754033502754815e-05, + "loss": 5.7273, + "step": 7515 + }, + { + "epoch": 0.04469978113997526, + "grad_norm": 2.6216115951538086, + "learning_rate": 4.975396813714288e-05, + "loss": 5.7601, + "step": 7516 + }, + { + "epoch": 0.04470572842325626, + "grad_norm": 2.5298542976379395, + "learning_rate": 4.975390276288958e-05, + "loss": 5.8007, + "step": 7517 + }, + { + "epoch": 0.04471167570653725, + "grad_norm": 2.6195290088653564, + "learning_rate": 4.975383737999496e-05, + "loss": 5.6071, + "step": 7518 + }, + { + "epoch": 0.044717622989818254, + "grad_norm": 2.5432629585266113, + "learning_rate": 4.975377198845902e-05, + "loss": 6.0224, + "step": 7519 + }, + { + "epoch": 0.04472357027309925, + "grad_norm": 2.2290337085723877, + "learning_rate": 4.97537065882818e-05, + "loss": 5.7141, + "step": 7520 + }, + { + "epoch": 0.044729517556380244, + "grad_norm": 2.627206802368164, + "learning_rate": 4.975364117946332e-05, + "loss": 6.2518, + "step": 7521 + }, + { + "epoch": 0.044735464839661246, + "grad_norm": 2.386993169784546, + "learning_rate": 4.975357576200359e-05, + "loss": 6.0494, + "step": 7522 + }, + { + "epoch": 0.04474141212294224, + "grad_norm": 2.20511794090271, + "learning_rate": 4.9753510335902656e-05, + "loss": 6.2563, + "step": 7523 + }, + { + "epoch": 0.044747359406223236, + "grad_norm": 2.5564749240875244, + "learning_rate": 4.975344490116052e-05, + "loss": 6.2498, + "step": 7524 + }, + { + "epoch": 0.04475330668950423, + "grad_norm": 2.6001932621002197, + "learning_rate": 4.975337945777721e-05, + "loss": 5.6721, + "step": 7525 + }, + { + "epoch": 0.04475925397278523, + "grad_norm": 2.6677772998809814, + "learning_rate": 4.975331400575275e-05, + "loss": 5.88, + "step": 7526 + }, + { + "epoch": 0.04476520125606623, + "grad_norm": 3.616734027862549, + "learning_rate": 4.975324854508716e-05, + "loss": 5.4835, + "step": 7527 + }, + { + "epoch": 0.04477114853934722, + "grad_norm": 3.0301461219787598, + "learning_rate": 4.975318307578048e-05, + "loss": 5.326, + "step": 7528 + }, + { + "epoch": 0.044777095822628225, + "grad_norm": 2.029836893081665, + "learning_rate": 4.975311759783271e-05, + "loss": 5.3516, + "step": 7529 + }, + { + "epoch": 0.04478304310590922, + "grad_norm": 1.9886969327926636, + "learning_rate": 4.9753052111243885e-05, + "loss": 5.3442, + "step": 7530 + }, + { + "epoch": 0.044788990389190216, + "grad_norm": 2.4227612018585205, + "learning_rate": 4.975298661601403e-05, + "loss": 5.4273, + "step": 7531 + }, + { + "epoch": 0.04479493767247122, + "grad_norm": 2.8426849842071533, + "learning_rate": 4.975292111214316e-05, + "loss": 5.6604, + "step": 7532 + }, + { + "epoch": 0.04480088495575221, + "grad_norm": 2.4818854331970215, + "learning_rate": 4.97528555996313e-05, + "loss": 6.4941, + "step": 7533 + }, + { + "epoch": 0.04480683223903321, + "grad_norm": 2.291642904281616, + "learning_rate": 4.9752790078478465e-05, + "loss": 6.404, + "step": 7534 + }, + { + "epoch": 0.04481277952231421, + "grad_norm": 2.4973669052124023, + "learning_rate": 4.9752724548684695e-05, + "loss": 5.6068, + "step": 7535 + }, + { + "epoch": 0.044818726805595205, + "grad_norm": 2.273130416870117, + "learning_rate": 4.975265901025001e-05, + "loss": 6.1689, + "step": 7536 + }, + { + "epoch": 0.0448246740888762, + "grad_norm": 3.362520456314087, + "learning_rate": 4.9752593463174424e-05, + "loss": 5.5346, + "step": 7537 + }, + { + "epoch": 0.0448306213721572, + "grad_norm": 5.170871257781982, + "learning_rate": 4.9752527907457956e-05, + "loss": 5.3831, + "step": 7538 + }, + { + "epoch": 0.0448365686554382, + "grad_norm": 4.224242687225342, + "learning_rate": 4.975246234310064e-05, + "loss": 5.2511, + "step": 7539 + }, + { + "epoch": 0.04484251593871919, + "grad_norm": 3.1753036975860596, + "learning_rate": 4.97523967701025e-05, + "loss": 5.06, + "step": 7540 + }, + { + "epoch": 0.04484846322200019, + "grad_norm": 2.4226467609405518, + "learning_rate": 4.975233118846355e-05, + "loss": 5.5225, + "step": 7541 + }, + { + "epoch": 0.04485441050528119, + "grad_norm": 2.5356781482696533, + "learning_rate": 4.9752265598183814e-05, + "loss": 5.5865, + "step": 7542 + }, + { + "epoch": 0.044860357788562184, + "grad_norm": 2.1505908966064453, + "learning_rate": 4.9752199999263326e-05, + "loss": 5.7436, + "step": 7543 + }, + { + "epoch": 0.04486630507184318, + "grad_norm": 2.675703763961792, + "learning_rate": 4.97521343917021e-05, + "loss": 5.3693, + "step": 7544 + }, + { + "epoch": 0.04487225235512418, + "grad_norm": 3.5228023529052734, + "learning_rate": 4.975206877550015e-05, + "loss": 4.8527, + "step": 7545 + }, + { + "epoch": 0.044878199638405177, + "grad_norm": 3.1165566444396973, + "learning_rate": 4.975200315065752e-05, + "loss": 4.7971, + "step": 7546 + }, + { + "epoch": 0.04488414692168617, + "grad_norm": 2.6216177940368652, + "learning_rate": 4.975193751717421e-05, + "loss": 4.9328, + "step": 7547 + }, + { + "epoch": 0.044890094204967174, + "grad_norm": 2.352031707763672, + "learning_rate": 4.975187187505026e-05, + "loss": 5.0021, + "step": 7548 + }, + { + "epoch": 0.04489604148824817, + "grad_norm": 1.8147127628326416, + "learning_rate": 4.975180622428569e-05, + "loss": 5.7009, + "step": 7549 + }, + { + "epoch": 0.044901988771529164, + "grad_norm": 2.1674726009368896, + "learning_rate": 4.9751740564880516e-05, + "loss": 5.2545, + "step": 7550 + }, + { + "epoch": 0.044907936054810166, + "grad_norm": 2.2935330867767334, + "learning_rate": 4.975167489683477e-05, + "loss": 5.2351, + "step": 7551 + }, + { + "epoch": 0.04491388333809116, + "grad_norm": 2.2964932918548584, + "learning_rate": 4.975160922014846e-05, + "loss": 5.483, + "step": 7552 + }, + { + "epoch": 0.044919830621372156, + "grad_norm": 1.8180936574935913, + "learning_rate": 4.9751543534821635e-05, + "loss": 5.668, + "step": 7553 + }, + { + "epoch": 0.04492577790465315, + "grad_norm": 1.906435251235962, + "learning_rate": 4.9751477840854286e-05, + "loss": 5.6664, + "step": 7554 + }, + { + "epoch": 0.04493172518793415, + "grad_norm": 2.459702253341675, + "learning_rate": 4.9751412138246455e-05, + "loss": 5.5272, + "step": 7555 + }, + { + "epoch": 0.04493767247121515, + "grad_norm": 2.1219170093536377, + "learning_rate": 4.975134642699817e-05, + "loss": 5.638, + "step": 7556 + }, + { + "epoch": 0.04494361975449614, + "grad_norm": 2.1492953300476074, + "learning_rate": 4.975128070710944e-05, + "loss": 5.9422, + "step": 7557 + }, + { + "epoch": 0.044949567037777145, + "grad_norm": 1.813988208770752, + "learning_rate": 4.97512149785803e-05, + "loss": 5.9875, + "step": 7558 + }, + { + "epoch": 0.04495551432105814, + "grad_norm": 1.6336817741394043, + "learning_rate": 4.975114924141075e-05, + "loss": 5.9245, + "step": 7559 + }, + { + "epoch": 0.044961461604339135, + "grad_norm": 1.9339455366134644, + "learning_rate": 4.9751083495600847e-05, + "loss": 5.3263, + "step": 7560 + }, + { + "epoch": 0.04496740888762014, + "grad_norm": 2.3459293842315674, + "learning_rate": 4.975101774115059e-05, + "loss": 5.4625, + "step": 7561 + }, + { + "epoch": 0.04497335617090113, + "grad_norm": 2.2994346618652344, + "learning_rate": 4.9750951978060004e-05, + "loss": 5.6327, + "step": 7562 + }, + { + "epoch": 0.04497930345418213, + "grad_norm": 2.1627299785614014, + "learning_rate": 4.975088620632912e-05, + "loss": 5.4882, + "step": 7563 + }, + { + "epoch": 0.04498525073746313, + "grad_norm": 2.763397693634033, + "learning_rate": 4.9750820425957954e-05, + "loss": 5.727, + "step": 7564 + }, + { + "epoch": 0.044991198020744125, + "grad_norm": 2.0107216835021973, + "learning_rate": 4.975075463694654e-05, + "loss": 5.3852, + "step": 7565 + }, + { + "epoch": 0.04499714530402512, + "grad_norm": 1.8424763679504395, + "learning_rate": 4.975068883929489e-05, + "loss": 5.3072, + "step": 7566 + }, + { + "epoch": 0.04500309258730612, + "grad_norm": 1.946702003479004, + "learning_rate": 4.975062303300303e-05, + "loss": 5.3184, + "step": 7567 + }, + { + "epoch": 0.04500903987058712, + "grad_norm": 2.1091182231903076, + "learning_rate": 4.9750557218070984e-05, + "loss": 5.0689, + "step": 7568 + }, + { + "epoch": 0.04501498715386811, + "grad_norm": 2.0064187049865723, + "learning_rate": 4.975049139449877e-05, + "loss": 4.8495, + "step": 7569 + }, + { + "epoch": 0.04502093443714911, + "grad_norm": 1.7544279098510742, + "learning_rate": 4.9750425562286416e-05, + "loss": 4.9524, + "step": 7570 + }, + { + "epoch": 0.04502688172043011, + "grad_norm": 2.0814568996429443, + "learning_rate": 4.9750359721433945e-05, + "loss": 4.798, + "step": 7571 + }, + { + "epoch": 0.045032829003711104, + "grad_norm": 2.1185543537139893, + "learning_rate": 4.975029387194139e-05, + "loss": 4.9313, + "step": 7572 + }, + { + "epoch": 0.0450387762869921, + "grad_norm": 2.3774518966674805, + "learning_rate": 4.975022801380875e-05, + "loss": 5.5954, + "step": 7573 + }, + { + "epoch": 0.0450447235702731, + "grad_norm": 2.261306047439575, + "learning_rate": 4.975016214703606e-05, + "loss": 5.5598, + "step": 7574 + }, + { + "epoch": 0.045050670853554096, + "grad_norm": 2.128244161605835, + "learning_rate": 4.975009627162335e-05, + "loss": 5.359, + "step": 7575 + }, + { + "epoch": 0.04505661813683509, + "grad_norm": 2.0767438411712646, + "learning_rate": 4.975003038757064e-05, + "loss": 5.6855, + "step": 7576 + }, + { + "epoch": 0.045062565420116094, + "grad_norm": 1.9789010286331177, + "learning_rate": 4.974996449487794e-05, + "loss": 5.1807, + "step": 7577 + }, + { + "epoch": 0.04506851270339709, + "grad_norm": 1.9136112928390503, + "learning_rate": 4.97498985935453e-05, + "loss": 5.3811, + "step": 7578 + }, + { + "epoch": 0.045074459986678084, + "grad_norm": 2.150641441345215, + "learning_rate": 4.974983268357271e-05, + "loss": 5.3281, + "step": 7579 + }, + { + "epoch": 0.045080407269959086, + "grad_norm": 1.9636656045913696, + "learning_rate": 4.9749766764960215e-05, + "loss": 5.5003, + "step": 7580 + }, + { + "epoch": 0.04508635455324008, + "grad_norm": 1.826335072517395, + "learning_rate": 4.974970083770783e-05, + "loss": 5.4687, + "step": 7581 + }, + { + "epoch": 0.045092301836521076, + "grad_norm": 1.9246041774749756, + "learning_rate": 4.974963490181558e-05, + "loss": 5.5373, + "step": 7582 + }, + { + "epoch": 0.04509824911980207, + "grad_norm": 1.8421686887741089, + "learning_rate": 4.974956895728349e-05, + "loss": 5.386, + "step": 7583 + }, + { + "epoch": 0.04510419640308307, + "grad_norm": 1.8685556650161743, + "learning_rate": 4.974950300411158e-05, + "loss": 5.5857, + "step": 7584 + }, + { + "epoch": 0.04511014368636407, + "grad_norm": 1.7022168636322021, + "learning_rate": 4.974943704229987e-05, + "loss": 5.2562, + "step": 7585 + }, + { + "epoch": 0.04511609096964506, + "grad_norm": 1.876855731010437, + "learning_rate": 4.97493710718484e-05, + "loss": 5.1359, + "step": 7586 + }, + { + "epoch": 0.045122038252926065, + "grad_norm": 1.8728361129760742, + "learning_rate": 4.974930509275717e-05, + "loss": 5.3124, + "step": 7587 + }, + { + "epoch": 0.04512798553620706, + "grad_norm": 1.930086612701416, + "learning_rate": 4.974923910502622e-05, + "loss": 5.3261, + "step": 7588 + }, + { + "epoch": 0.045133932819488055, + "grad_norm": 2.0309081077575684, + "learning_rate": 4.9749173108655564e-05, + "loss": 5.1138, + "step": 7589 + }, + { + "epoch": 0.04513988010276906, + "grad_norm": 2.042174816131592, + "learning_rate": 4.974910710364522e-05, + "loss": 5.3521, + "step": 7590 + }, + { + "epoch": 0.04514582738605005, + "grad_norm": 1.5278770923614502, + "learning_rate": 4.9749041089995224e-05, + "loss": 5.4075, + "step": 7591 + }, + { + "epoch": 0.04515177466933105, + "grad_norm": 1.7624976634979248, + "learning_rate": 4.974897506770559e-05, + "loss": 5.1698, + "step": 7592 + }, + { + "epoch": 0.04515772195261205, + "grad_norm": 1.9077380895614624, + "learning_rate": 4.974890903677635e-05, + "loss": 5.3973, + "step": 7593 + }, + { + "epoch": 0.045163669235893045, + "grad_norm": 1.5724380016326904, + "learning_rate": 4.974884299720752e-05, + "loss": 5.6325, + "step": 7594 + }, + { + "epoch": 0.04516961651917404, + "grad_norm": 1.9702832698822021, + "learning_rate": 4.974877694899913e-05, + "loss": 5.247, + "step": 7595 + }, + { + "epoch": 0.04517556380245504, + "grad_norm": 1.9913853406906128, + "learning_rate": 4.974871089215118e-05, + "loss": 5.6393, + "step": 7596 + }, + { + "epoch": 0.04518151108573604, + "grad_norm": 1.806470274925232, + "learning_rate": 4.974864482666372e-05, + "loss": 5.302, + "step": 7597 + }, + { + "epoch": 0.04518745836901703, + "grad_norm": 1.7056912183761597, + "learning_rate": 4.974857875253678e-05, + "loss": 5.4066, + "step": 7598 + }, + { + "epoch": 0.04519340565229803, + "grad_norm": 1.5990647077560425, + "learning_rate": 4.974851266977035e-05, + "loss": 5.4087, + "step": 7599 + }, + { + "epoch": 0.04519935293557903, + "grad_norm": 1.9233685731887817, + "learning_rate": 4.974844657836447e-05, + "loss": 5.4891, + "step": 7600 + }, + { + "epoch": 0.045205300218860024, + "grad_norm": 1.8654414415359497, + "learning_rate": 4.9748380478319165e-05, + "loss": 5.4955, + "step": 7601 + }, + { + "epoch": 0.04521124750214102, + "grad_norm": 1.7592424154281616, + "learning_rate": 4.974831436963446e-05, + "loss": 5.2298, + "step": 7602 + }, + { + "epoch": 0.04521719478542202, + "grad_norm": 1.8132792711257935, + "learning_rate": 4.974824825231037e-05, + "loss": 5.3487, + "step": 7603 + }, + { + "epoch": 0.045223142068703016, + "grad_norm": 1.8109947443008423, + "learning_rate": 4.974818212634692e-05, + "loss": 5.4511, + "step": 7604 + }, + { + "epoch": 0.04522908935198401, + "grad_norm": 1.96711266040802, + "learning_rate": 4.974811599174414e-05, + "loss": 5.3249, + "step": 7605 + }, + { + "epoch": 0.045235036635265014, + "grad_norm": 1.9123655557632446, + "learning_rate": 4.9748049848502054e-05, + "loss": 5.3681, + "step": 7606 + }, + { + "epoch": 0.04524098391854601, + "grad_norm": 1.7210376262664795, + "learning_rate": 4.974798369662067e-05, + "loss": 5.3441, + "step": 7607 + }, + { + "epoch": 0.045246931201827004, + "grad_norm": 1.590617060661316, + "learning_rate": 4.974791753610002e-05, + "loss": 5.5619, + "step": 7608 + }, + { + "epoch": 0.045252878485108006, + "grad_norm": 1.77785062789917, + "learning_rate": 4.974785136694013e-05, + "loss": 5.4717, + "step": 7609 + }, + { + "epoch": 0.045258825768389, + "grad_norm": 1.66475510597229, + "learning_rate": 4.9747785189141025e-05, + "loss": 5.3501, + "step": 7610 + }, + { + "epoch": 0.045264773051669996, + "grad_norm": 1.9176442623138428, + "learning_rate": 4.974771900270272e-05, + "loss": 5.1197, + "step": 7611 + }, + { + "epoch": 0.04527072033495099, + "grad_norm": 1.8143234252929688, + "learning_rate": 4.974765280762525e-05, + "loss": 5.3103, + "step": 7612 + }, + { + "epoch": 0.04527666761823199, + "grad_norm": 1.8954168558120728, + "learning_rate": 4.974758660390861e-05, + "loss": 5.2009, + "step": 7613 + }, + { + "epoch": 0.04528261490151299, + "grad_norm": 1.7779622077941895, + "learning_rate": 4.974752039155286e-05, + "loss": 5.519, + "step": 7614 + }, + { + "epoch": 0.04528856218479398, + "grad_norm": 1.8181761503219604, + "learning_rate": 4.9747454170558e-05, + "loss": 5.4967, + "step": 7615 + }, + { + "epoch": 0.045294509468074985, + "grad_norm": 1.657665491104126, + "learning_rate": 4.9747387940924064e-05, + "loss": 5.6437, + "step": 7616 + }, + { + "epoch": 0.04530045675135598, + "grad_norm": 1.7993237972259521, + "learning_rate": 4.974732170265107e-05, + "loss": 5.3094, + "step": 7617 + }, + { + "epoch": 0.045306404034636975, + "grad_norm": 1.8798805475234985, + "learning_rate": 4.974725545573904e-05, + "loss": 5.3268, + "step": 7618 + }, + { + "epoch": 0.04531235131791798, + "grad_norm": 1.9271420240402222, + "learning_rate": 4.974718920018799e-05, + "loss": 5.3405, + "step": 7619 + }, + { + "epoch": 0.04531829860119897, + "grad_norm": 1.9256294965744019, + "learning_rate": 4.9747122935997967e-05, + "loss": 5.3118, + "step": 7620 + }, + { + "epoch": 0.04532424588447997, + "grad_norm": 2.3345041275024414, + "learning_rate": 4.9747056663168965e-05, + "loss": 4.9813, + "step": 7621 + }, + { + "epoch": 0.04533019316776097, + "grad_norm": 1.7056258916854858, + "learning_rate": 4.974699038170103e-05, + "loss": 5.4725, + "step": 7622 + }, + { + "epoch": 0.045336140451041965, + "grad_norm": 2.075711250305176, + "learning_rate": 4.9746924091594174e-05, + "loss": 5.2215, + "step": 7623 + }, + { + "epoch": 0.04534208773432296, + "grad_norm": 1.818048357963562, + "learning_rate": 4.974685779284843e-05, + "loss": 5.0463, + "step": 7624 + }, + { + "epoch": 0.04534803501760396, + "grad_norm": 1.6590908765792847, + "learning_rate": 4.9746791485463806e-05, + "loss": 5.2476, + "step": 7625 + }, + { + "epoch": 0.04535398230088496, + "grad_norm": 2.2024991512298584, + "learning_rate": 4.974672516944033e-05, + "loss": 5.6437, + "step": 7626 + }, + { + "epoch": 0.04535992958416595, + "grad_norm": 1.71639883518219, + "learning_rate": 4.974665884477803e-05, + "loss": 5.2418, + "step": 7627 + }, + { + "epoch": 0.04536587686744695, + "grad_norm": 1.75436270236969, + "learning_rate": 4.974659251147693e-05, + "loss": 5.2209, + "step": 7628 + }, + { + "epoch": 0.04537182415072795, + "grad_norm": 2.577916383743286, + "learning_rate": 4.974652616953705e-05, + "loss": 5.2385, + "step": 7629 + }, + { + "epoch": 0.045377771434008944, + "grad_norm": 1.9784717559814453, + "learning_rate": 4.9746459818958416e-05, + "loss": 5.265, + "step": 7630 + }, + { + "epoch": 0.04538371871728994, + "grad_norm": 1.971383810043335, + "learning_rate": 4.974639345974104e-05, + "loss": 5.0548, + "step": 7631 + }, + { + "epoch": 0.04538966600057094, + "grad_norm": 2.096876621246338, + "learning_rate": 4.974632709188496e-05, + "loss": 5.1491, + "step": 7632 + }, + { + "epoch": 0.045395613283851936, + "grad_norm": 1.6079102754592896, + "learning_rate": 4.974626071539019e-05, + "loss": 5.1959, + "step": 7633 + }, + { + "epoch": 0.04540156056713293, + "grad_norm": 1.6881030797958374, + "learning_rate": 4.9746194330256755e-05, + "loss": 5.1772, + "step": 7634 + }, + { + "epoch": 0.04540750785041393, + "grad_norm": 1.7459675073623657, + "learning_rate": 4.974612793648469e-05, + "loss": 5.1885, + "step": 7635 + }, + { + "epoch": 0.04541345513369493, + "grad_norm": 1.739272117614746, + "learning_rate": 4.9746061534073993e-05, + "loss": 5.318, + "step": 7636 + }, + { + "epoch": 0.045419402416975924, + "grad_norm": 1.7761027812957764, + "learning_rate": 4.974599512302471e-05, + "loss": 5.1525, + "step": 7637 + }, + { + "epoch": 0.045425349700256926, + "grad_norm": 1.8695855140686035, + "learning_rate": 4.9745928703336854e-05, + "loss": 5.5754, + "step": 7638 + }, + { + "epoch": 0.04543129698353792, + "grad_norm": 1.8737404346466064, + "learning_rate": 4.9745862275010446e-05, + "loss": 5.2908, + "step": 7639 + }, + { + "epoch": 0.045437244266818916, + "grad_norm": 1.731676459312439, + "learning_rate": 4.9745795838045515e-05, + "loss": 5.2671, + "step": 7640 + }, + { + "epoch": 0.04544319155009991, + "grad_norm": 1.6687474250793457, + "learning_rate": 4.974572939244209e-05, + "loss": 5.1629, + "step": 7641 + }, + { + "epoch": 0.04544913883338091, + "grad_norm": 2.1376633644104004, + "learning_rate": 4.974566293820018e-05, + "loss": 5.2853, + "step": 7642 + }, + { + "epoch": 0.04545508611666191, + "grad_norm": 2.0989861488342285, + "learning_rate": 4.974559647531981e-05, + "loss": 5.1311, + "step": 7643 + }, + { + "epoch": 0.0454610333999429, + "grad_norm": 2.3433620929718018, + "learning_rate": 4.974553000380102e-05, + "loss": 4.9854, + "step": 7644 + }, + { + "epoch": 0.045466980683223905, + "grad_norm": 2.306170701980591, + "learning_rate": 4.974546352364381e-05, + "loss": 5.3152, + "step": 7645 + }, + { + "epoch": 0.0454729279665049, + "grad_norm": 1.9588537216186523, + "learning_rate": 4.974539703484822e-05, + "loss": 5.3903, + "step": 7646 + }, + { + "epoch": 0.045478875249785895, + "grad_norm": 1.7994736433029175, + "learning_rate": 4.9745330537414265e-05, + "loss": 5.2505, + "step": 7647 + }, + { + "epoch": 0.0454848225330669, + "grad_norm": 1.983175277709961, + "learning_rate": 4.974526403134197e-05, + "loss": 5.2607, + "step": 7648 + }, + { + "epoch": 0.04549076981634789, + "grad_norm": 1.8853832483291626, + "learning_rate": 4.974519751663136e-05, + "loss": 5.1475, + "step": 7649 + }, + { + "epoch": 0.04549671709962889, + "grad_norm": 1.9374700784683228, + "learning_rate": 4.9745130993282464e-05, + "loss": 5.2039, + "step": 7650 + }, + { + "epoch": 0.04550266438290989, + "grad_norm": 1.8200404644012451, + "learning_rate": 4.974506446129529e-05, + "loss": 5.2794, + "step": 7651 + }, + { + "epoch": 0.045508611666190885, + "grad_norm": 1.8375320434570312, + "learning_rate": 4.974499792066987e-05, + "loss": 5.1149, + "step": 7652 + }, + { + "epoch": 0.04551455894947188, + "grad_norm": 1.7842520475387573, + "learning_rate": 4.974493137140623e-05, + "loss": 5.0332, + "step": 7653 + }, + { + "epoch": 0.04552050623275288, + "grad_norm": 2.0220818519592285, + "learning_rate": 4.974486481350439e-05, + "loss": 5.0277, + "step": 7654 + }, + { + "epoch": 0.04552645351603388, + "grad_norm": 2.0787746906280518, + "learning_rate": 4.9744798246964375e-05, + "loss": 5.0587, + "step": 7655 + }, + { + "epoch": 0.04553240079931487, + "grad_norm": 1.7024985551834106, + "learning_rate": 4.97447316717862e-05, + "loss": 5.0184, + "step": 7656 + }, + { + "epoch": 0.04553834808259587, + "grad_norm": 1.9057540893554688, + "learning_rate": 4.97446650879699e-05, + "loss": 5.3945, + "step": 7657 + }, + { + "epoch": 0.04554429536587687, + "grad_norm": 1.7963287830352783, + "learning_rate": 4.974459849551549e-05, + "loss": 4.9869, + "step": 7658 + }, + { + "epoch": 0.045550242649157864, + "grad_norm": 2.027353286743164, + "learning_rate": 4.974453189442299e-05, + "loss": 5.1389, + "step": 7659 + }, + { + "epoch": 0.04555618993243886, + "grad_norm": 1.7137126922607422, + "learning_rate": 4.9744465284692445e-05, + "loss": 5.058, + "step": 7660 + }, + { + "epoch": 0.04556213721571986, + "grad_norm": 2.0363876819610596, + "learning_rate": 4.9744398666323854e-05, + "loss": 4.9174, + "step": 7661 + }, + { + "epoch": 0.045568084499000856, + "grad_norm": 2.1440837383270264, + "learning_rate": 4.9744332039317255e-05, + "loss": 4.8894, + "step": 7662 + }, + { + "epoch": 0.04557403178228185, + "grad_norm": 1.9582308530807495, + "learning_rate": 4.9744265403672655e-05, + "loss": 5.0666, + "step": 7663 + }, + { + "epoch": 0.04557997906556285, + "grad_norm": 1.9997116327285767, + "learning_rate": 4.97441987593901e-05, + "loss": 5.0804, + "step": 7664 + }, + { + "epoch": 0.04558592634884385, + "grad_norm": 2.067361831665039, + "learning_rate": 4.9744132106469586e-05, + "loss": 4.8655, + "step": 7665 + }, + { + "epoch": 0.045591873632124844, + "grad_norm": 1.7066930532455444, + "learning_rate": 4.9744065444911165e-05, + "loss": 4.792, + "step": 7666 + }, + { + "epoch": 0.045597820915405846, + "grad_norm": 1.8526182174682617, + "learning_rate": 4.974399877471484e-05, + "loss": 4.755, + "step": 7667 + }, + { + "epoch": 0.04560376819868684, + "grad_norm": 1.8744564056396484, + "learning_rate": 4.9743932095880644e-05, + "loss": 4.7732, + "step": 7668 + }, + { + "epoch": 0.045609715481967836, + "grad_norm": 1.849574327468872, + "learning_rate": 4.97438654084086e-05, + "loss": 4.7743, + "step": 7669 + }, + { + "epoch": 0.04561566276524884, + "grad_norm": 1.87284255027771, + "learning_rate": 4.9743798712298714e-05, + "loss": 5.0582, + "step": 7670 + }, + { + "epoch": 0.04562161004852983, + "grad_norm": 2.206273078918457, + "learning_rate": 4.974373200755104e-05, + "loss": 5.4683, + "step": 7671 + }, + { + "epoch": 0.04562755733181083, + "grad_norm": 1.9849058389663696, + "learning_rate": 4.974366529416557e-05, + "loss": 5.4087, + "step": 7672 + }, + { + "epoch": 0.04563350461509182, + "grad_norm": 1.9440083503723145, + "learning_rate": 4.974359857214235e-05, + "loss": 4.9607, + "step": 7673 + }, + { + "epoch": 0.045639451898372825, + "grad_norm": 1.7112319469451904, + "learning_rate": 4.974353184148139e-05, + "loss": 5.6589, + "step": 7674 + }, + { + "epoch": 0.04564539918165382, + "grad_norm": 1.921215295791626, + "learning_rate": 4.974346510218273e-05, + "loss": 5.4495, + "step": 7675 + }, + { + "epoch": 0.045651346464934815, + "grad_norm": 1.9582061767578125, + "learning_rate": 4.974339835424637e-05, + "loss": 5.2459, + "step": 7676 + }, + { + "epoch": 0.04565729374821582, + "grad_norm": 1.9781824350357056, + "learning_rate": 4.974333159767235e-05, + "loss": 5.3424, + "step": 7677 + }, + { + "epoch": 0.04566324103149681, + "grad_norm": 1.7183479070663452, + "learning_rate": 4.974326483246069e-05, + "loss": 5.3741, + "step": 7678 + }, + { + "epoch": 0.04566918831477781, + "grad_norm": 1.7942447662353516, + "learning_rate": 4.974319805861141e-05, + "loss": 5.4008, + "step": 7679 + }, + { + "epoch": 0.04567513559805881, + "grad_norm": 1.8255115747451782, + "learning_rate": 4.974313127612454e-05, + "loss": 5.1849, + "step": 7680 + }, + { + "epoch": 0.045681082881339805, + "grad_norm": 1.7907564640045166, + "learning_rate": 4.974306448500009e-05, + "loss": 5.1757, + "step": 7681 + }, + { + "epoch": 0.0456870301646208, + "grad_norm": 2.911489486694336, + "learning_rate": 4.97429976852381e-05, + "loss": 4.8909, + "step": 7682 + }, + { + "epoch": 0.0456929774479018, + "grad_norm": 2.849125623703003, + "learning_rate": 4.9742930876838576e-05, + "loss": 4.7733, + "step": 7683 + }, + { + "epoch": 0.0456989247311828, + "grad_norm": 2.4196949005126953, + "learning_rate": 4.9742864059801565e-05, + "loss": 4.8571, + "step": 7684 + }, + { + "epoch": 0.04570487201446379, + "grad_norm": 1.9430558681488037, + "learning_rate": 4.974279723412706e-05, + "loss": 5.1338, + "step": 7685 + }, + { + "epoch": 0.04571081929774479, + "grad_norm": 1.7538554668426514, + "learning_rate": 4.9742730399815105e-05, + "loss": 5.5524, + "step": 7686 + }, + { + "epoch": 0.04571676658102579, + "grad_norm": 2.006115198135376, + "learning_rate": 4.9742663556865724e-05, + "loss": 5.3343, + "step": 7687 + }, + { + "epoch": 0.045722713864306784, + "grad_norm": 2.554234027862549, + "learning_rate": 4.974259670527893e-05, + "loss": 5.8426, + "step": 7688 + }, + { + "epoch": 0.04572866114758778, + "grad_norm": 2.656747579574585, + "learning_rate": 4.974252984505475e-05, + "loss": 5.1578, + "step": 7689 + }, + { + "epoch": 0.04573460843086878, + "grad_norm": 2.800208568572998, + "learning_rate": 4.9742462976193216e-05, + "loss": 4.8019, + "step": 7690 + }, + { + "epoch": 0.045740555714149776, + "grad_norm": 2.674938201904297, + "learning_rate": 4.974239609869433e-05, + "loss": 4.7177, + "step": 7691 + }, + { + "epoch": 0.04574650299743077, + "grad_norm": 2.751533269882202, + "learning_rate": 4.974232921255815e-05, + "loss": 4.7568, + "step": 7692 + }, + { + "epoch": 0.04575245028071177, + "grad_norm": 2.623917818069458, + "learning_rate": 4.974226231778466e-05, + "loss": 4.5908, + "step": 7693 + }, + { + "epoch": 0.04575839756399277, + "grad_norm": 2.2248899936676025, + "learning_rate": 4.9742195414373904e-05, + "loss": 5.4066, + "step": 7694 + }, + { + "epoch": 0.045764344847273764, + "grad_norm": 1.7959388494491577, + "learning_rate": 4.974212850232591e-05, + "loss": 6.1414, + "step": 7695 + }, + { + "epoch": 0.045770292130554766, + "grad_norm": 2.0049352645874023, + "learning_rate": 4.974206158164069e-05, + "loss": 6.0106, + "step": 7696 + }, + { + "epoch": 0.04577623941383576, + "grad_norm": 2.4794270992279053, + "learning_rate": 4.9741994652318276e-05, + "loss": 5.8647, + "step": 7697 + }, + { + "epoch": 0.045782186697116756, + "grad_norm": 3.9380109310150146, + "learning_rate": 4.974192771435868e-05, + "loss": 5.719, + "step": 7698 + }, + { + "epoch": 0.04578813398039776, + "grad_norm": 2.564023017883301, + "learning_rate": 4.974186076776194e-05, + "loss": 4.7294, + "step": 7699 + }, + { + "epoch": 0.04579408126367875, + "grad_norm": 3.7082693576812744, + "learning_rate": 4.974179381252807e-05, + "loss": 5.1975, + "step": 7700 + }, + { + "epoch": 0.04580002854695975, + "grad_norm": 4.0067524909973145, + "learning_rate": 4.97417268486571e-05, + "loss": 5.4047, + "step": 7701 + }, + { + "epoch": 0.04580597583024074, + "grad_norm": 3.978787660598755, + "learning_rate": 4.974165987614904e-05, + "loss": 5.7023, + "step": 7702 + }, + { + "epoch": 0.045811923113521745, + "grad_norm": 4.597605228424072, + "learning_rate": 4.974159289500392e-05, + "loss": 6.5186, + "step": 7703 + }, + { + "epoch": 0.04581787039680274, + "grad_norm": 2.8793985843658447, + "learning_rate": 4.974152590522177e-05, + "loss": 6.1476, + "step": 7704 + }, + { + "epoch": 0.045823817680083735, + "grad_norm": 2.466089963912964, + "learning_rate": 4.974145890680262e-05, + "loss": 5.5154, + "step": 7705 + }, + { + "epoch": 0.04582976496336474, + "grad_norm": 2.937228202819824, + "learning_rate": 4.974139189974647e-05, + "loss": 5.5146, + "step": 7706 + }, + { + "epoch": 0.04583571224664573, + "grad_norm": 2.4580399990081787, + "learning_rate": 4.974132488405336e-05, + "loss": 6.214, + "step": 7707 + }, + { + "epoch": 0.04584165952992673, + "grad_norm": 4.910717010498047, + "learning_rate": 4.97412578597233e-05, + "loss": 5.819, + "step": 7708 + }, + { + "epoch": 0.04584760681320773, + "grad_norm": 5.372139930725098, + "learning_rate": 4.974119082675634e-05, + "loss": 5.3242, + "step": 7709 + }, + { + "epoch": 0.045853554096488724, + "grad_norm": 2.050492525100708, + "learning_rate": 4.9741123785152474e-05, + "loss": 6.0468, + "step": 7710 + }, + { + "epoch": 0.04585950137976972, + "grad_norm": 1.7090541124343872, + "learning_rate": 4.974105673491174e-05, + "loss": 5.7652, + "step": 7711 + }, + { + "epoch": 0.04586544866305072, + "grad_norm": 2.512538194656372, + "learning_rate": 4.974098967603415e-05, + "loss": 5.3184, + "step": 7712 + }, + { + "epoch": 0.04587139594633172, + "grad_norm": 3.311289072036743, + "learning_rate": 4.974092260851975e-05, + "loss": 5.5379, + "step": 7713 + }, + { + "epoch": 0.04587734322961271, + "grad_norm": 3.3318710327148438, + "learning_rate": 4.974085553236854e-05, + "loss": 5.5543, + "step": 7714 + }, + { + "epoch": 0.04588329051289371, + "grad_norm": 2.6384379863739014, + "learning_rate": 4.9740788447580555e-05, + "loss": 6.3475, + "step": 7715 + }, + { + "epoch": 0.04588923779617471, + "grad_norm": 2.0066304206848145, + "learning_rate": 4.974072135415582e-05, + "loss": 6.3685, + "step": 7716 + }, + { + "epoch": 0.045895185079455704, + "grad_norm": 2.4189116954803467, + "learning_rate": 4.9740654252094356e-05, + "loss": 5.4128, + "step": 7717 + }, + { + "epoch": 0.0459011323627367, + "grad_norm": 2.431011438369751, + "learning_rate": 4.974058714139618e-05, + "loss": 5.34, + "step": 7718 + }, + { + "epoch": 0.0459070796460177, + "grad_norm": 2.1997156143188477, + "learning_rate": 4.974052002206132e-05, + "loss": 5.4223, + "step": 7719 + }, + { + "epoch": 0.045913026929298696, + "grad_norm": 2.0700082778930664, + "learning_rate": 4.9740452894089806e-05, + "loss": 5.4255, + "step": 7720 + }, + { + "epoch": 0.04591897421257969, + "grad_norm": 2.3476040363311768, + "learning_rate": 4.974038575748165e-05, + "loss": 5.5055, + "step": 7721 + }, + { + "epoch": 0.04592492149586069, + "grad_norm": 4.2995524406433105, + "learning_rate": 4.974031861223688e-05, + "loss": 5.8869, + "step": 7722 + }, + { + "epoch": 0.04593086877914169, + "grad_norm": 4.690639495849609, + "learning_rate": 4.974025145835552e-05, + "loss": 6.0808, + "step": 7723 + }, + { + "epoch": 0.04593681606242268, + "grad_norm": 3.9823479652404785, + "learning_rate": 4.97401842958376e-05, + "loss": 6.0844, + "step": 7724 + }, + { + "epoch": 0.045942763345703685, + "grad_norm": 3.69808030128479, + "learning_rate": 4.9740117124683136e-05, + "loss": 5.9611, + "step": 7725 + }, + { + "epoch": 0.04594871062898468, + "grad_norm": 2.5912535190582275, + "learning_rate": 4.974004994489215e-05, + "loss": 5.9669, + "step": 7726 + }, + { + "epoch": 0.045954657912265676, + "grad_norm": 2.0894482135772705, + "learning_rate": 4.973998275646467e-05, + "loss": 5.6717, + "step": 7727 + }, + { + "epoch": 0.04596060519554668, + "grad_norm": 2.179302930831909, + "learning_rate": 4.973991555940072e-05, + "loss": 5.4077, + "step": 7728 + }, + { + "epoch": 0.04596655247882767, + "grad_norm": 2.4919214248657227, + "learning_rate": 4.973984835370031e-05, + "loss": 6.118, + "step": 7729 + }, + { + "epoch": 0.04597249976210867, + "grad_norm": 3.5036723613739014, + "learning_rate": 4.9739781139363485e-05, + "loss": 5.436, + "step": 7730 + }, + { + "epoch": 0.04597844704538966, + "grad_norm": 4.129561424255371, + "learning_rate": 4.973971391639026e-05, + "loss": 4.8414, + "step": 7731 + }, + { + "epoch": 0.045984394328670665, + "grad_norm": 2.867039203643799, + "learning_rate": 4.973964668478065e-05, + "loss": 4.7385, + "step": 7732 + }, + { + "epoch": 0.04599034161195166, + "grad_norm": 2.754023313522339, + "learning_rate": 4.973957944453469e-05, + "loss": 4.6063, + "step": 7733 + }, + { + "epoch": 0.045996288895232655, + "grad_norm": 2.1025235652923584, + "learning_rate": 4.973951219565239e-05, + "loss": 5.3233, + "step": 7734 + }, + { + "epoch": 0.04600223617851366, + "grad_norm": 2.352883815765381, + "learning_rate": 4.973944493813379e-05, + "loss": 5.5648, + "step": 7735 + }, + { + "epoch": 0.04600818346179465, + "grad_norm": 2.049377679824829, + "learning_rate": 4.97393776719789e-05, + "loss": 6.1241, + "step": 7736 + }, + { + "epoch": 0.04601413074507565, + "grad_norm": 1.7124110460281372, + "learning_rate": 4.9739310397187756e-05, + "loss": 6.1258, + "step": 7737 + }, + { + "epoch": 0.04602007802835665, + "grad_norm": 2.2592861652374268, + "learning_rate": 4.9739243113760364e-05, + "loss": 6.1972, + "step": 7738 + }, + { + "epoch": 0.046026025311637644, + "grad_norm": 2.3926188945770264, + "learning_rate": 4.973917582169677e-05, + "loss": 6.1681, + "step": 7739 + }, + { + "epoch": 0.04603197259491864, + "grad_norm": 1.9956084489822388, + "learning_rate": 4.973910852099698e-05, + "loss": 6.2068, + "step": 7740 + }, + { + "epoch": 0.04603791987819964, + "grad_norm": 1.924467921257019, + "learning_rate": 4.973904121166102e-05, + "loss": 6.4391, + "step": 7741 + }, + { + "epoch": 0.04604386716148064, + "grad_norm": 1.9410041570663452, + "learning_rate": 4.973897389368891e-05, + "loss": 5.9378, + "step": 7742 + }, + { + "epoch": 0.04604981444476163, + "grad_norm": 2.0418617725372314, + "learning_rate": 4.9738906567080686e-05, + "loss": 5.8823, + "step": 7743 + }, + { + "epoch": 0.04605576172804263, + "grad_norm": 2.696143627166748, + "learning_rate": 4.973883923183637e-05, + "loss": 5.8551, + "step": 7744 + }, + { + "epoch": 0.04606170901132363, + "grad_norm": 2.482703447341919, + "learning_rate": 4.973877188795598e-05, + "loss": 5.5752, + "step": 7745 + }, + { + "epoch": 0.046067656294604624, + "grad_norm": 2.520437240600586, + "learning_rate": 4.973870453543954e-05, + "loss": 5.571, + "step": 7746 + }, + { + "epoch": 0.04607360357788562, + "grad_norm": 2.568150758743286, + "learning_rate": 4.973863717428707e-05, + "loss": 5.9145, + "step": 7747 + }, + { + "epoch": 0.04607955086116662, + "grad_norm": 2.6373183727264404, + "learning_rate": 4.9738569804498605e-05, + "loss": 5.9414, + "step": 7748 + }, + { + "epoch": 0.046085498144447616, + "grad_norm": 2.1663565635681152, + "learning_rate": 4.973850242607415e-05, + "loss": 6.2316, + "step": 7749 + }, + { + "epoch": 0.04609144542772861, + "grad_norm": 2.044316053390503, + "learning_rate": 4.973843503901374e-05, + "loss": 5.7232, + "step": 7750 + }, + { + "epoch": 0.04609739271100961, + "grad_norm": 2.1740782260894775, + "learning_rate": 4.9738367643317405e-05, + "loss": 6.0388, + "step": 7751 + }, + { + "epoch": 0.04610333999429061, + "grad_norm": 2.0643458366394043, + "learning_rate": 4.973830023898516e-05, + "loss": 5.8201, + "step": 7752 + }, + { + "epoch": 0.0461092872775716, + "grad_norm": 1.7433217763900757, + "learning_rate": 4.973823282601703e-05, + "loss": 6.0464, + "step": 7753 + }, + { + "epoch": 0.046115234560852605, + "grad_norm": 2.657677412033081, + "learning_rate": 4.9738165404413037e-05, + "loss": 5.2849, + "step": 7754 + }, + { + "epoch": 0.0461211818441336, + "grad_norm": 1.7317034006118774, + "learning_rate": 4.9738097974173205e-05, + "loss": 6.0619, + "step": 7755 + }, + { + "epoch": 0.046127129127414596, + "grad_norm": 1.6109949350357056, + "learning_rate": 4.973803053529756e-05, + "loss": 5.7832, + "step": 7756 + }, + { + "epoch": 0.0461330764106956, + "grad_norm": 2.2980475425720215, + "learning_rate": 4.9737963087786125e-05, + "loss": 5.4346, + "step": 7757 + }, + { + "epoch": 0.04613902369397659, + "grad_norm": 2.5162737369537354, + "learning_rate": 4.973789563163892e-05, + "loss": 5.3723, + "step": 7758 + }, + { + "epoch": 0.04614497097725759, + "grad_norm": 2.3493261337280273, + "learning_rate": 4.973782816685597e-05, + "loss": 5.7474, + "step": 7759 + }, + { + "epoch": 0.04615091826053858, + "grad_norm": 2.1428544521331787, + "learning_rate": 4.9737760693437306e-05, + "loss": 5.6318, + "step": 7760 + }, + { + "epoch": 0.046156865543819585, + "grad_norm": 2.11627197265625, + "learning_rate": 4.973769321138294e-05, + "loss": 5.38, + "step": 7761 + }, + { + "epoch": 0.04616281282710058, + "grad_norm": 2.411957263946533, + "learning_rate": 4.9737625720692906e-05, + "loss": 5.1822, + "step": 7762 + }, + { + "epoch": 0.046168760110381575, + "grad_norm": 2.3566222190856934, + "learning_rate": 4.973755822136722e-05, + "loss": 5.0405, + "step": 7763 + }, + { + "epoch": 0.04617470739366258, + "grad_norm": 2.2235679626464844, + "learning_rate": 4.973749071340591e-05, + "loss": 5.4746, + "step": 7764 + }, + { + "epoch": 0.04618065467694357, + "grad_norm": 2.4175586700439453, + "learning_rate": 4.973742319680899e-05, + "loss": 5.7519, + "step": 7765 + }, + { + "epoch": 0.04618660196022457, + "grad_norm": 2.3386452198028564, + "learning_rate": 4.9737355671576496e-05, + "loss": 6.1765, + "step": 7766 + }, + { + "epoch": 0.04619254924350557, + "grad_norm": 2.084333658218384, + "learning_rate": 4.973728813770845e-05, + "loss": 6.1439, + "step": 7767 + }, + { + "epoch": 0.046198496526786564, + "grad_norm": 2.0523531436920166, + "learning_rate": 4.973722059520487e-05, + "loss": 6.294, + "step": 7768 + }, + { + "epoch": 0.04620444381006756, + "grad_norm": 2.1187572479248047, + "learning_rate": 4.973715304406578e-05, + "loss": 5.3679, + "step": 7769 + }, + { + "epoch": 0.04621039109334856, + "grad_norm": 2.5249836444854736, + "learning_rate": 4.9737085484291204e-05, + "loss": 5.9086, + "step": 7770 + }, + { + "epoch": 0.04621633837662956, + "grad_norm": 2.35662841796875, + "learning_rate": 4.973701791588117e-05, + "loss": 6.3135, + "step": 7771 + }, + { + "epoch": 0.04622228565991055, + "grad_norm": 2.070955276489258, + "learning_rate": 4.9736950338835695e-05, + "loss": 5.8748, + "step": 7772 + }, + { + "epoch": 0.04622823294319155, + "grad_norm": 2.151587963104248, + "learning_rate": 4.9736882753154814e-05, + "loss": 6.2053, + "step": 7773 + }, + { + "epoch": 0.04623418022647255, + "grad_norm": 2.2187843322753906, + "learning_rate": 4.9736815158838534e-05, + "loss": 5.762, + "step": 7774 + }, + { + "epoch": 0.046240127509753544, + "grad_norm": 1.8676223754882812, + "learning_rate": 4.973674755588689e-05, + "loss": 6.06, + "step": 7775 + }, + { + "epoch": 0.04624607479303454, + "grad_norm": 2.2110252380371094, + "learning_rate": 4.9736679944299906e-05, + "loss": 5.6474, + "step": 7776 + }, + { + "epoch": 0.04625202207631554, + "grad_norm": 2.0635151863098145, + "learning_rate": 4.9736612324077605e-05, + "loss": 5.5579, + "step": 7777 + }, + { + "epoch": 0.046257969359596536, + "grad_norm": 2.1654598712921143, + "learning_rate": 4.973654469522e-05, + "loss": 5.5388, + "step": 7778 + }, + { + "epoch": 0.04626391664287753, + "grad_norm": 2.3735673427581787, + "learning_rate": 4.973647705772713e-05, + "loss": 5.4383, + "step": 7779 + }, + { + "epoch": 0.04626986392615853, + "grad_norm": 2.344160318374634, + "learning_rate": 4.9736409411599e-05, + "loss": 5.6501, + "step": 7780 + }, + { + "epoch": 0.04627581120943953, + "grad_norm": 3.023350477218628, + "learning_rate": 4.973634175683566e-05, + "loss": 5.2688, + "step": 7781 + }, + { + "epoch": 0.04628175849272052, + "grad_norm": 2.8814494609832764, + "learning_rate": 4.973627409343711e-05, + "loss": 5.08, + "step": 7782 + }, + { + "epoch": 0.046287705776001525, + "grad_norm": 2.475191831588745, + "learning_rate": 4.973620642140339e-05, + "loss": 5.0761, + "step": 7783 + }, + { + "epoch": 0.04629365305928252, + "grad_norm": 2.5567755699157715, + "learning_rate": 4.9736138740734504e-05, + "loss": 5.46, + "step": 7784 + }, + { + "epoch": 0.046299600342563516, + "grad_norm": 2.9225175380706787, + "learning_rate": 4.973607105143049e-05, + "loss": 5.5219, + "step": 7785 + }, + { + "epoch": 0.04630554762584452, + "grad_norm": 2.3112781047821045, + "learning_rate": 4.973600335349138e-05, + "loss": 6.4204, + "step": 7786 + }, + { + "epoch": 0.04631149490912551, + "grad_norm": 2.228182554244995, + "learning_rate": 4.973593564691717e-05, + "loss": 6.3299, + "step": 7787 + }, + { + "epoch": 0.04631744219240651, + "grad_norm": 1.8612277507781982, + "learning_rate": 4.973586793170792e-05, + "loss": 5.994, + "step": 7788 + }, + { + "epoch": 0.0463233894756875, + "grad_norm": 1.9788155555725098, + "learning_rate": 4.9735800207863626e-05, + "loss": 6.1676, + "step": 7789 + }, + { + "epoch": 0.046329336758968505, + "grad_norm": 2.2335264682769775, + "learning_rate": 4.973573247538431e-05, + "loss": 6.3112, + "step": 7790 + }, + { + "epoch": 0.0463352840422495, + "grad_norm": 2.168656349182129, + "learning_rate": 4.973566473427001e-05, + "loss": 5.8326, + "step": 7791 + }, + { + "epoch": 0.046341231325530495, + "grad_norm": 1.9187591075897217, + "learning_rate": 4.9735596984520755e-05, + "loss": 5.8734, + "step": 7792 + }, + { + "epoch": 0.0463471786088115, + "grad_norm": 2.195242166519165, + "learning_rate": 4.973552922613655e-05, + "loss": 6.1325, + "step": 7793 + }, + { + "epoch": 0.04635312589209249, + "grad_norm": 1.9698888063430786, + "learning_rate": 4.973546145911743e-05, + "loss": 5.8586, + "step": 7794 + }, + { + "epoch": 0.04635907317537349, + "grad_norm": 2.2149972915649414, + "learning_rate": 4.973539368346342e-05, + "loss": 5.4087, + "step": 7795 + }, + { + "epoch": 0.04636502045865449, + "grad_norm": 1.8587820529937744, + "learning_rate": 4.973532589917453e-05, + "loss": 5.9956, + "step": 7796 + }, + { + "epoch": 0.046370967741935484, + "grad_norm": 2.022866725921631, + "learning_rate": 4.97352581062508e-05, + "loss": 6.0905, + "step": 7797 + }, + { + "epoch": 0.04637691502521648, + "grad_norm": 2.0257678031921387, + "learning_rate": 4.973519030469225e-05, + "loss": 6.02, + "step": 7798 + }, + { + "epoch": 0.04638286230849748, + "grad_norm": 1.6909089088439941, + "learning_rate": 4.973512249449889e-05, + "loss": 5.727, + "step": 7799 + }, + { + "epoch": 0.046388809591778477, + "grad_norm": 1.8882997035980225, + "learning_rate": 4.9735054675670754e-05, + "loss": 5.655, + "step": 7800 + }, + { + "epoch": 0.04639475687505947, + "grad_norm": 2.1775193214416504, + "learning_rate": 4.9734986848207876e-05, + "loss": 5.8067, + "step": 7801 + }, + { + "epoch": 0.04640070415834047, + "grad_norm": 2.136690139770508, + "learning_rate": 4.973491901211027e-05, + "loss": 5.5515, + "step": 7802 + }, + { + "epoch": 0.04640665144162147, + "grad_norm": 1.8036144971847534, + "learning_rate": 4.973485116737795e-05, + "loss": 5.8404, + "step": 7803 + }, + { + "epoch": 0.046412598724902464, + "grad_norm": 2.1350481510162354, + "learning_rate": 4.973478331401096e-05, + "loss": 6.1635, + "step": 7804 + }, + { + "epoch": 0.04641854600818346, + "grad_norm": 2.4152462482452393, + "learning_rate": 4.97347154520093e-05, + "loss": 5.9882, + "step": 7805 + }, + { + "epoch": 0.04642449329146446, + "grad_norm": 2.166402578353882, + "learning_rate": 4.9734647581373015e-05, + "loss": 5.8982, + "step": 7806 + }, + { + "epoch": 0.046430440574745456, + "grad_norm": 1.8684437274932861, + "learning_rate": 4.973457970210211e-05, + "loss": 5.9501, + "step": 7807 + }, + { + "epoch": 0.04643638785802645, + "grad_norm": 1.775829792022705, + "learning_rate": 4.973451181419663e-05, + "loss": 5.83, + "step": 7808 + }, + { + "epoch": 0.04644233514130745, + "grad_norm": 1.7500759363174438, + "learning_rate": 4.973444391765659e-05, + "loss": 6.0084, + "step": 7809 + }, + { + "epoch": 0.04644828242458845, + "grad_norm": 2.3920938968658447, + "learning_rate": 4.9734376012482e-05, + "loss": 5.559, + "step": 7810 + }, + { + "epoch": 0.04645422970786944, + "grad_norm": 2.7680983543395996, + "learning_rate": 4.97343080986729e-05, + "loss": 5.3521, + "step": 7811 + }, + { + "epoch": 0.046460176991150445, + "grad_norm": 2.6618781089782715, + "learning_rate": 4.9734240176229316e-05, + "loss": 5.6917, + "step": 7812 + }, + { + "epoch": 0.04646612427443144, + "grad_norm": 2.086775541305542, + "learning_rate": 4.9734172245151256e-05, + "loss": 5.582, + "step": 7813 + }, + { + "epoch": 0.046472071557712435, + "grad_norm": 2.190012216567993, + "learning_rate": 4.973410430543875e-05, + "loss": 5.9132, + "step": 7814 + }, + { + "epoch": 0.04647801884099344, + "grad_norm": 2.317610740661621, + "learning_rate": 4.973403635709183e-05, + "loss": 5.7055, + "step": 7815 + }, + { + "epoch": 0.04648396612427443, + "grad_norm": 2.1291167736053467, + "learning_rate": 4.973396840011051e-05, + "loss": 5.6711, + "step": 7816 + }, + { + "epoch": 0.04648991340755543, + "grad_norm": 1.5421113967895508, + "learning_rate": 4.9733900434494815e-05, + "loss": 5.6433, + "step": 7817 + }, + { + "epoch": 0.04649586069083642, + "grad_norm": 2.222355604171753, + "learning_rate": 4.973383246024477e-05, + "loss": 5.3685, + "step": 7818 + }, + { + "epoch": 0.046501807974117425, + "grad_norm": 2.097116708755493, + "learning_rate": 4.97337644773604e-05, + "loss": 5.6528, + "step": 7819 + }, + { + "epoch": 0.04650775525739842, + "grad_norm": 2.0224382877349854, + "learning_rate": 4.973369648584174e-05, + "loss": 5.8849, + "step": 7820 + }, + { + "epoch": 0.046513702540679415, + "grad_norm": 2.1581428050994873, + "learning_rate": 4.973362848568879e-05, + "loss": 5.985, + "step": 7821 + }, + { + "epoch": 0.04651964982396042, + "grad_norm": 2.43945574760437, + "learning_rate": 4.9733560476901584e-05, + "loss": 5.5682, + "step": 7822 + }, + { + "epoch": 0.04652559710724141, + "grad_norm": 3.174143075942993, + "learning_rate": 4.9733492459480157e-05, + "loss": 4.832, + "step": 7823 + }, + { + "epoch": 0.04653154439052241, + "grad_norm": 2.269339084625244, + "learning_rate": 4.973342443342452e-05, + "loss": 5.5804, + "step": 7824 + }, + { + "epoch": 0.04653749167380341, + "grad_norm": 2.3775289058685303, + "learning_rate": 4.9733356398734695e-05, + "loss": 5.8299, + "step": 7825 + }, + { + "epoch": 0.046543438957084404, + "grad_norm": 2.065579414367676, + "learning_rate": 4.9733288355410716e-05, + "loss": 5.6985, + "step": 7826 + }, + { + "epoch": 0.0465493862403654, + "grad_norm": 1.9699875116348267, + "learning_rate": 4.9733220303452604e-05, + "loss": 6.0161, + "step": 7827 + }, + { + "epoch": 0.0465553335236464, + "grad_norm": 2.1414806842803955, + "learning_rate": 4.9733152242860374e-05, + "loss": 6.2534, + "step": 7828 + }, + { + "epoch": 0.046561280806927396, + "grad_norm": 2.414738416671753, + "learning_rate": 4.973308417363406e-05, + "loss": 5.8402, + "step": 7829 + }, + { + "epoch": 0.04656722809020839, + "grad_norm": 2.4105031490325928, + "learning_rate": 4.973301609577368e-05, + "loss": 5.8728, + "step": 7830 + }, + { + "epoch": 0.04657317537348939, + "grad_norm": 2.7718660831451416, + "learning_rate": 4.9732948009279264e-05, + "loss": 5.637, + "step": 7831 + }, + { + "epoch": 0.04657912265677039, + "grad_norm": 2.205103874206543, + "learning_rate": 4.9732879914150824e-05, + "loss": 5.4119, + "step": 7832 + }, + { + "epoch": 0.046585069940051384, + "grad_norm": 1.9080390930175781, + "learning_rate": 4.9732811810388394e-05, + "loss": 5.3387, + "step": 7833 + }, + { + "epoch": 0.04659101722333238, + "grad_norm": 1.6600725650787354, + "learning_rate": 4.9732743697992e-05, + "loss": 5.3192, + "step": 7834 + }, + { + "epoch": 0.04659696450661338, + "grad_norm": 1.9428787231445312, + "learning_rate": 4.973267557696165e-05, + "loss": 5.3127, + "step": 7835 + }, + { + "epoch": 0.046602911789894376, + "grad_norm": 2.174811840057373, + "learning_rate": 4.973260744729738e-05, + "loss": 5.7181, + "step": 7836 + }, + { + "epoch": 0.04660885907317537, + "grad_norm": 2.5420422554016113, + "learning_rate": 4.9732539308999224e-05, + "loss": 5.934, + "step": 7837 + }, + { + "epoch": 0.04661480635645637, + "grad_norm": 2.079343795776367, + "learning_rate": 4.973247116206719e-05, + "loss": 5.236, + "step": 7838 + }, + { + "epoch": 0.04662075363973737, + "grad_norm": 1.7748003005981445, + "learning_rate": 4.97324030065013e-05, + "loss": 5.2929, + "step": 7839 + }, + { + "epoch": 0.04662670092301836, + "grad_norm": 2.2746875286102295, + "learning_rate": 4.973233484230159e-05, + "loss": 5.182, + "step": 7840 + }, + { + "epoch": 0.046632648206299365, + "grad_norm": 1.7846394777297974, + "learning_rate": 4.9732266669468074e-05, + "loss": 5.2682, + "step": 7841 + }, + { + "epoch": 0.04663859548958036, + "grad_norm": 2.078132152557373, + "learning_rate": 4.973219848800078e-05, + "loss": 5.3245, + "step": 7842 + }, + { + "epoch": 0.046644542772861355, + "grad_norm": 1.7784876823425293, + "learning_rate": 4.9732130297899726e-05, + "loss": 5.4582, + "step": 7843 + }, + { + "epoch": 0.04665049005614236, + "grad_norm": 1.8421920537948608, + "learning_rate": 4.973206209916495e-05, + "loss": 5.3504, + "step": 7844 + }, + { + "epoch": 0.04665643733942335, + "grad_norm": 1.9958820343017578, + "learning_rate": 4.9731993891796455e-05, + "loss": 5.2914, + "step": 7845 + }, + { + "epoch": 0.04666238462270435, + "grad_norm": 2.0615813732147217, + "learning_rate": 4.9731925675794286e-05, + "loss": 5.3318, + "step": 7846 + }, + { + "epoch": 0.04666833190598534, + "grad_norm": 1.7690422534942627, + "learning_rate": 4.973185745115846e-05, + "loss": 5.3169, + "step": 7847 + }, + { + "epoch": 0.046674279189266345, + "grad_norm": 1.7990578413009644, + "learning_rate": 4.9731789217888994e-05, + "loss": 5.3136, + "step": 7848 + }, + { + "epoch": 0.04668022647254734, + "grad_norm": 2.0028672218322754, + "learning_rate": 4.9731720975985905e-05, + "loss": 5.2115, + "step": 7849 + }, + { + "epoch": 0.046686173755828335, + "grad_norm": 2.0703940391540527, + "learning_rate": 4.973165272544924e-05, + "loss": 5.2439, + "step": 7850 + }, + { + "epoch": 0.04669212103910934, + "grad_norm": 2.1105704307556152, + "learning_rate": 4.973158446627901e-05, + "loss": 5.5812, + "step": 7851 + }, + { + "epoch": 0.04669806832239033, + "grad_norm": 1.7391036748886108, + "learning_rate": 4.9731516198475236e-05, + "loss": 5.229, + "step": 7852 + }, + { + "epoch": 0.04670401560567133, + "grad_norm": 1.6907505989074707, + "learning_rate": 4.973144792203795e-05, + "loss": 5.2674, + "step": 7853 + }, + { + "epoch": 0.04670996288895233, + "grad_norm": 1.608168125152588, + "learning_rate": 4.973137963696717e-05, + "loss": 5.389, + "step": 7854 + }, + { + "epoch": 0.046715910172233324, + "grad_norm": 1.7521610260009766, + "learning_rate": 4.9731311343262913e-05, + "loss": 5.2436, + "step": 7855 + }, + { + "epoch": 0.04672185745551432, + "grad_norm": 2.0182595252990723, + "learning_rate": 4.973124304092522e-05, + "loss": 5.2746, + "step": 7856 + }, + { + "epoch": 0.04672780473879532, + "grad_norm": 1.7990871667861938, + "learning_rate": 4.97311747299541e-05, + "loss": 5.4241, + "step": 7857 + }, + { + "epoch": 0.046733752022076316, + "grad_norm": 2.124717950820923, + "learning_rate": 4.973110641034958e-05, + "loss": 5.5133, + "step": 7858 + }, + { + "epoch": 0.04673969930535731, + "grad_norm": 2.066869020462036, + "learning_rate": 4.973103808211169e-05, + "loss": 5.252, + "step": 7859 + }, + { + "epoch": 0.04674564658863831, + "grad_norm": 1.8004878759384155, + "learning_rate": 4.9730969745240455e-05, + "loss": 5.483, + "step": 7860 + }, + { + "epoch": 0.04675159387191931, + "grad_norm": 1.6822713613510132, + "learning_rate": 4.9730901399735886e-05, + "loss": 5.3916, + "step": 7861 + }, + { + "epoch": 0.046757541155200304, + "grad_norm": 1.7024493217468262, + "learning_rate": 4.973083304559802e-05, + "loss": 5.3504, + "step": 7862 + }, + { + "epoch": 0.0467634884384813, + "grad_norm": 1.5939997434616089, + "learning_rate": 4.973076468282687e-05, + "loss": 5.4151, + "step": 7863 + }, + { + "epoch": 0.0467694357217623, + "grad_norm": 1.7603535652160645, + "learning_rate": 4.9730696311422475e-05, + "loss": 5.351, + "step": 7864 + }, + { + "epoch": 0.046775383005043296, + "grad_norm": 1.737897276878357, + "learning_rate": 4.973062793138484e-05, + "loss": 5.0834, + "step": 7865 + }, + { + "epoch": 0.04678133028832429, + "grad_norm": 2.4130520820617676, + "learning_rate": 4.973055954271401e-05, + "loss": 4.833, + "step": 7866 + }, + { + "epoch": 0.04678727757160529, + "grad_norm": 1.9712201356887817, + "learning_rate": 4.9730491145409987e-05, + "loss": 5.0048, + "step": 7867 + }, + { + "epoch": 0.04679322485488629, + "grad_norm": 1.808608055114746, + "learning_rate": 4.97304227394728e-05, + "loss": 5.3134, + "step": 7868 + }, + { + "epoch": 0.04679917213816728, + "grad_norm": 1.8121775388717651, + "learning_rate": 4.973035432490249e-05, + "loss": 5.2594, + "step": 7869 + }, + { + "epoch": 0.046805119421448285, + "grad_norm": 1.7191296815872192, + "learning_rate": 4.9730285901699064e-05, + "loss": 5.206, + "step": 7870 + }, + { + "epoch": 0.04681106670472928, + "grad_norm": 1.931894063949585, + "learning_rate": 4.973021746986255e-05, + "loss": 5.3349, + "step": 7871 + }, + { + "epoch": 0.046817013988010275, + "grad_norm": 2.5420172214508057, + "learning_rate": 4.973014902939297e-05, + "loss": 5.2894, + "step": 7872 + }, + { + "epoch": 0.04682296127129128, + "grad_norm": 2.5522336959838867, + "learning_rate": 4.973008058029036e-05, + "loss": 5.2144, + "step": 7873 + }, + { + "epoch": 0.04682890855457227, + "grad_norm": 3.1389801502227783, + "learning_rate": 4.973001212255472e-05, + "loss": 5.7229, + "step": 7874 + }, + { + "epoch": 0.04683485583785327, + "grad_norm": 1.8687554597854614, + "learning_rate": 4.97299436561861e-05, + "loss": 5.483, + "step": 7875 + }, + { + "epoch": 0.04684080312113426, + "grad_norm": 2.2526602745056152, + "learning_rate": 4.972987518118451e-05, + "loss": 5.4562, + "step": 7876 + }, + { + "epoch": 0.046846750404415265, + "grad_norm": 2.108677625656128, + "learning_rate": 4.972980669754997e-05, + "loss": 5.2005, + "step": 7877 + }, + { + "epoch": 0.04685269768769626, + "grad_norm": 2.023118019104004, + "learning_rate": 4.972973820528252e-05, + "loss": 5.3674, + "step": 7878 + }, + { + "epoch": 0.046858644970977255, + "grad_norm": 1.6553964614868164, + "learning_rate": 4.9729669704382165e-05, + "loss": 5.3256, + "step": 7879 + }, + { + "epoch": 0.04686459225425826, + "grad_norm": 1.8197314739227295, + "learning_rate": 4.972960119484894e-05, + "loss": 5.1738, + "step": 7880 + }, + { + "epoch": 0.04687053953753925, + "grad_norm": 1.6142289638519287, + "learning_rate": 4.972953267668287e-05, + "loss": 5.245, + "step": 7881 + }, + { + "epoch": 0.04687648682082025, + "grad_norm": 1.4962797164916992, + "learning_rate": 4.972946414988398e-05, + "loss": 5.3121, + "step": 7882 + }, + { + "epoch": 0.04688243410410125, + "grad_norm": 1.487801432609558, + "learning_rate": 4.972939561445228e-05, + "loss": 5.1828, + "step": 7883 + }, + { + "epoch": 0.046888381387382244, + "grad_norm": 1.9139772653579712, + "learning_rate": 4.972932707038781e-05, + "loss": 5.2432, + "step": 7884 + }, + { + "epoch": 0.04689432867066324, + "grad_norm": 1.7533615827560425, + "learning_rate": 4.972925851769058e-05, + "loss": 5.6451, + "step": 7885 + }, + { + "epoch": 0.04690027595394424, + "grad_norm": 1.8561608791351318, + "learning_rate": 4.972918995636062e-05, + "loss": 5.4293, + "step": 7886 + }, + { + "epoch": 0.046906223237225236, + "grad_norm": 1.6891844272613525, + "learning_rate": 4.972912138639797e-05, + "loss": 5.2736, + "step": 7887 + }, + { + "epoch": 0.04691217052050623, + "grad_norm": 1.9279890060424805, + "learning_rate": 4.972905280780262e-05, + "loss": 5.5733, + "step": 7888 + }, + { + "epoch": 0.04691811780378723, + "grad_norm": 1.7810181379318237, + "learning_rate": 4.9728984220574624e-05, + "loss": 5.2036, + "step": 7889 + }, + { + "epoch": 0.04692406508706823, + "grad_norm": 1.6455233097076416, + "learning_rate": 4.9728915624714004e-05, + "loss": 5.3493, + "step": 7890 + }, + { + "epoch": 0.046930012370349224, + "grad_norm": 1.5345048904418945, + "learning_rate": 4.9728847020220756e-05, + "loss": 5.2528, + "step": 7891 + }, + { + "epoch": 0.04693595965363022, + "grad_norm": 1.455165982246399, + "learning_rate": 4.9728778407094935e-05, + "loss": 5.2769, + "step": 7892 + }, + { + "epoch": 0.04694190693691122, + "grad_norm": 1.577910304069519, + "learning_rate": 4.972870978533655e-05, + "loss": 5.2182, + "step": 7893 + }, + { + "epoch": 0.046947854220192216, + "grad_norm": 1.728143334388733, + "learning_rate": 4.972864115494563e-05, + "loss": 5.3446, + "step": 7894 + }, + { + "epoch": 0.04695380150347321, + "grad_norm": 1.6157398223876953, + "learning_rate": 4.972857251592219e-05, + "loss": 5.4866, + "step": 7895 + }, + { + "epoch": 0.04695974878675421, + "grad_norm": 1.5386699438095093, + "learning_rate": 4.9728503868266266e-05, + "loss": 5.4626, + "step": 7896 + }, + { + "epoch": 0.04696569607003521, + "grad_norm": 1.874915599822998, + "learning_rate": 4.972843521197788e-05, + "loss": 5.4152, + "step": 7897 + }, + { + "epoch": 0.0469716433533162, + "grad_norm": 1.7093253135681152, + "learning_rate": 4.9728366547057046e-05, + "loss": 5.2852, + "step": 7898 + }, + { + "epoch": 0.046977590636597205, + "grad_norm": 1.6435173749923706, + "learning_rate": 4.9728297873503806e-05, + "loss": 5.3985, + "step": 7899 + }, + { + "epoch": 0.0469835379198782, + "grad_norm": 1.5776588916778564, + "learning_rate": 4.972822919131816e-05, + "loss": 5.2914, + "step": 7900 + }, + { + "epoch": 0.046989485203159195, + "grad_norm": 2.051072835922241, + "learning_rate": 4.972816050050015e-05, + "loss": 5.343, + "step": 7901 + }, + { + "epoch": 0.0469954324864402, + "grad_norm": 2.003816604614258, + "learning_rate": 4.972809180104979e-05, + "loss": 5.3577, + "step": 7902 + }, + { + "epoch": 0.04700137976972119, + "grad_norm": 1.9092657566070557, + "learning_rate": 4.9728023092967116e-05, + "loss": 5.551, + "step": 7903 + }, + { + "epoch": 0.04700732705300219, + "grad_norm": 1.763007640838623, + "learning_rate": 4.972795437625214e-05, + "loss": 5.5611, + "step": 7904 + }, + { + "epoch": 0.04701327433628318, + "grad_norm": 2.637850046157837, + "learning_rate": 4.9727885650904895e-05, + "loss": 5.937, + "step": 7905 + }, + { + "epoch": 0.047019221619564185, + "grad_norm": 1.6650307178497314, + "learning_rate": 4.9727816916925395e-05, + "loss": 5.6418, + "step": 7906 + }, + { + "epoch": 0.04702516890284518, + "grad_norm": 1.6943029165267944, + "learning_rate": 4.972774817431367e-05, + "loss": 5.4826, + "step": 7907 + }, + { + "epoch": 0.047031116186126175, + "grad_norm": 1.4689685106277466, + "learning_rate": 4.972767942306975e-05, + "loss": 5.4849, + "step": 7908 + }, + { + "epoch": 0.04703706346940718, + "grad_norm": 1.759244441986084, + "learning_rate": 4.9727610663193644e-05, + "loss": 5.3496, + "step": 7909 + }, + { + "epoch": 0.04704301075268817, + "grad_norm": 1.8706889152526855, + "learning_rate": 4.9727541894685395e-05, + "loss": 5.2836, + "step": 7910 + }, + { + "epoch": 0.04704895803596917, + "grad_norm": 1.486164927482605, + "learning_rate": 4.972747311754501e-05, + "loss": 5.4125, + "step": 7911 + }, + { + "epoch": 0.04705490531925017, + "grad_norm": 1.6479889154434204, + "learning_rate": 4.972740433177252e-05, + "loss": 5.1986, + "step": 7912 + }, + { + "epoch": 0.047060852602531164, + "grad_norm": 1.5741796493530273, + "learning_rate": 4.9727335537367944e-05, + "loss": 5.4761, + "step": 7913 + }, + { + "epoch": 0.04706679988581216, + "grad_norm": 1.5001682043075562, + "learning_rate": 4.972726673433131e-05, + "loss": 5.6267, + "step": 7914 + }, + { + "epoch": 0.04707274716909316, + "grad_norm": 1.774282455444336, + "learning_rate": 4.972719792266265e-05, + "loss": 5.5944, + "step": 7915 + }, + { + "epoch": 0.047078694452374156, + "grad_norm": 1.6656653881072998, + "learning_rate": 4.972712910236198e-05, + "loss": 5.4159, + "step": 7916 + }, + { + "epoch": 0.04708464173565515, + "grad_norm": 1.7174065113067627, + "learning_rate": 4.972706027342933e-05, + "loss": 5.4239, + "step": 7917 + }, + { + "epoch": 0.04709058901893615, + "grad_norm": 1.607878565788269, + "learning_rate": 4.9726991435864705e-05, + "loss": 5.4517, + "step": 7918 + }, + { + "epoch": 0.04709653630221715, + "grad_norm": 1.9639167785644531, + "learning_rate": 4.972692258966815e-05, + "loss": 5.5371, + "step": 7919 + }, + { + "epoch": 0.047102483585498144, + "grad_norm": 1.5418875217437744, + "learning_rate": 4.9726853734839684e-05, + "loss": 5.4798, + "step": 7920 + }, + { + "epoch": 0.04710843086877914, + "grad_norm": 1.54796302318573, + "learning_rate": 4.9726784871379326e-05, + "loss": 5.5329, + "step": 7921 + }, + { + "epoch": 0.04711437815206014, + "grad_norm": 1.8075921535491943, + "learning_rate": 4.97267159992871e-05, + "loss": 5.6049, + "step": 7922 + }, + { + "epoch": 0.047120325435341136, + "grad_norm": 1.4973857402801514, + "learning_rate": 4.972664711856304e-05, + "loss": 5.27, + "step": 7923 + }, + { + "epoch": 0.04712627271862213, + "grad_norm": 2.1028542518615723, + "learning_rate": 4.9726578229207155e-05, + "loss": 5.3626, + "step": 7924 + }, + { + "epoch": 0.04713222000190313, + "grad_norm": 2.2057480812072754, + "learning_rate": 4.9726509331219485e-05, + "loss": 5.1767, + "step": 7925 + }, + { + "epoch": 0.04713816728518413, + "grad_norm": 2.0549347400665283, + "learning_rate": 4.972644042460004e-05, + "loss": 5.3362, + "step": 7926 + }, + { + "epoch": 0.04714411456846512, + "grad_norm": 2.0960693359375, + "learning_rate": 4.972637150934885e-05, + "loss": 5.5162, + "step": 7927 + }, + { + "epoch": 0.047150061851746125, + "grad_norm": 2.2022509574890137, + "learning_rate": 4.9726302585465945e-05, + "loss": 5.3263, + "step": 7928 + }, + { + "epoch": 0.04715600913502712, + "grad_norm": 1.7065988779067993, + "learning_rate": 4.9726233652951335e-05, + "loss": 5.4349, + "step": 7929 + }, + { + "epoch": 0.047161956418308115, + "grad_norm": 1.742591142654419, + "learning_rate": 4.972616471180506e-05, + "loss": 5.2396, + "step": 7930 + }, + { + "epoch": 0.04716790370158912, + "grad_norm": 1.888846755027771, + "learning_rate": 4.972609576202713e-05, + "loss": 5.3453, + "step": 7931 + }, + { + "epoch": 0.04717385098487011, + "grad_norm": 1.6499360799789429, + "learning_rate": 4.972602680361758e-05, + "loss": 5.2819, + "step": 7932 + }, + { + "epoch": 0.04717979826815111, + "grad_norm": 1.8801236152648926, + "learning_rate": 4.9725957836576434e-05, + "loss": 5.2456, + "step": 7933 + }, + { + "epoch": 0.0471857455514321, + "grad_norm": 2.050522565841675, + "learning_rate": 4.97258888609037e-05, + "loss": 5.2069, + "step": 7934 + }, + { + "epoch": 0.047191692834713105, + "grad_norm": 2.0722391605377197, + "learning_rate": 4.972581987659942e-05, + "loss": 5.5057, + "step": 7935 + }, + { + "epoch": 0.0471976401179941, + "grad_norm": 2.728468179702759, + "learning_rate": 4.972575088366361e-05, + "loss": 5.5485, + "step": 7936 + }, + { + "epoch": 0.047203587401275095, + "grad_norm": 2.0293211936950684, + "learning_rate": 4.9725681882096295e-05, + "loss": 5.7126, + "step": 7937 + }, + { + "epoch": 0.0472095346845561, + "grad_norm": 2.1351194381713867, + "learning_rate": 4.97256128718975e-05, + "loss": 5.7313, + "step": 7938 + }, + { + "epoch": 0.04721548196783709, + "grad_norm": 1.9040015935897827, + "learning_rate": 4.972554385306726e-05, + "loss": 5.696, + "step": 7939 + }, + { + "epoch": 0.04722142925111809, + "grad_norm": 1.640110731124878, + "learning_rate": 4.9725474825605574e-05, + "loss": 5.2626, + "step": 7940 + }, + { + "epoch": 0.04722737653439909, + "grad_norm": 1.887408971786499, + "learning_rate": 4.972540578951249e-05, + "loss": 5.2734, + "step": 7941 + }, + { + "epoch": 0.047233323817680084, + "grad_norm": 1.8867583274841309, + "learning_rate": 4.972533674478801e-05, + "loss": 5.6811, + "step": 7942 + }, + { + "epoch": 0.04723927110096108, + "grad_norm": 1.811104655265808, + "learning_rate": 4.9725267691432174e-05, + "loss": 5.575, + "step": 7943 + }, + { + "epoch": 0.04724521838424208, + "grad_norm": 1.8644812107086182, + "learning_rate": 4.9725198629445014e-05, + "loss": 5.5718, + "step": 7944 + }, + { + "epoch": 0.047251165667523076, + "grad_norm": 1.693788766860962, + "learning_rate": 4.972512955882653e-05, + "loss": 5.5924, + "step": 7945 + }, + { + "epoch": 0.04725711295080407, + "grad_norm": 1.8305641412734985, + "learning_rate": 4.9725060479576766e-05, + "loss": 5.6529, + "step": 7946 + }, + { + "epoch": 0.04726306023408507, + "grad_norm": 1.7662039995193481, + "learning_rate": 4.9724991391695734e-05, + "loss": 5.6709, + "step": 7947 + }, + { + "epoch": 0.04726900751736607, + "grad_norm": 2.1799724102020264, + "learning_rate": 4.972492229518347e-05, + "loss": 5.6266, + "step": 7948 + }, + { + "epoch": 0.047274954800647064, + "grad_norm": 1.9300130605697632, + "learning_rate": 4.972485319003998e-05, + "loss": 5.6494, + "step": 7949 + }, + { + "epoch": 0.04728090208392806, + "grad_norm": 1.9196375608444214, + "learning_rate": 4.9724784076265307e-05, + "loss": 5.571, + "step": 7950 + }, + { + "epoch": 0.04728684936720906, + "grad_norm": 1.906616449356079, + "learning_rate": 4.972471495385947e-05, + "loss": 5.6537, + "step": 7951 + }, + { + "epoch": 0.047292796650490056, + "grad_norm": 1.826536774635315, + "learning_rate": 4.972464582282249e-05, + "loss": 5.6251, + "step": 7952 + }, + { + "epoch": 0.04729874393377105, + "grad_norm": 1.7790716886520386, + "learning_rate": 4.972457668315438e-05, + "loss": 5.3488, + "step": 7953 + }, + { + "epoch": 0.04730469121705205, + "grad_norm": 1.8892159461975098, + "learning_rate": 4.972450753485519e-05, + "loss": 5.4794, + "step": 7954 + }, + { + "epoch": 0.04731063850033305, + "grad_norm": 1.9409239292144775, + "learning_rate": 4.972443837792492e-05, + "loss": 5.6058, + "step": 7955 + }, + { + "epoch": 0.04731658578361404, + "grad_norm": 1.9935575723648071, + "learning_rate": 4.972436921236361e-05, + "loss": 5.6481, + "step": 7956 + }, + { + "epoch": 0.047322533066895045, + "grad_norm": 1.8507076501846313, + "learning_rate": 4.9724300038171276e-05, + "loss": 5.4723, + "step": 7957 + }, + { + "epoch": 0.04732848035017604, + "grad_norm": 1.9355841875076294, + "learning_rate": 4.972423085534794e-05, + "loss": 5.3843, + "step": 7958 + }, + { + "epoch": 0.047334427633457035, + "grad_norm": 1.9815531969070435, + "learning_rate": 4.972416166389363e-05, + "loss": 5.5635, + "step": 7959 + }, + { + "epoch": 0.04734037491673804, + "grad_norm": 1.7955007553100586, + "learning_rate": 4.972409246380838e-05, + "loss": 5.6002, + "step": 7960 + }, + { + "epoch": 0.04734632220001903, + "grad_norm": 2.0184547901153564, + "learning_rate": 4.97240232550922e-05, + "loss": 5.5458, + "step": 7961 + }, + { + "epoch": 0.04735226948330003, + "grad_norm": 1.7418156862258911, + "learning_rate": 4.972395403774512e-05, + "loss": 5.6443, + "step": 7962 + }, + { + "epoch": 0.04735821676658102, + "grad_norm": 1.9832762479782104, + "learning_rate": 4.972388481176716e-05, + "loss": 5.3799, + "step": 7963 + }, + { + "epoch": 0.047364164049862024, + "grad_norm": 1.8777718544006348, + "learning_rate": 4.972381557715835e-05, + "loss": 5.4349, + "step": 7964 + }, + { + "epoch": 0.04737011133314302, + "grad_norm": 1.519038438796997, + "learning_rate": 4.972374633391871e-05, + "loss": 5.2418, + "step": 7965 + }, + { + "epoch": 0.047376058616424015, + "grad_norm": 1.6425752639770508, + "learning_rate": 4.972367708204826e-05, + "loss": 5.1648, + "step": 7966 + }, + { + "epoch": 0.04738200589970502, + "grad_norm": 1.7461836338043213, + "learning_rate": 4.972360782154704e-05, + "loss": 5.1745, + "step": 7967 + }, + { + "epoch": 0.04738795318298601, + "grad_norm": 1.7991663217544556, + "learning_rate": 4.9723538552415064e-05, + "loss": 5.2268, + "step": 7968 + }, + { + "epoch": 0.04739390046626701, + "grad_norm": 1.9127873182296753, + "learning_rate": 4.9723469274652345e-05, + "loss": 5.5205, + "step": 7969 + }, + { + "epoch": 0.04739984774954801, + "grad_norm": 1.8836725950241089, + "learning_rate": 4.972339998825893e-05, + "loss": 5.3803, + "step": 7970 + }, + { + "epoch": 0.047405795032829004, + "grad_norm": 1.8391705751419067, + "learning_rate": 4.9723330693234825e-05, + "loss": 5.3084, + "step": 7971 + }, + { + "epoch": 0.04741174231611, + "grad_norm": 1.6707972288131714, + "learning_rate": 4.9723261389580063e-05, + "loss": 5.3275, + "step": 7972 + }, + { + "epoch": 0.047417689599391, + "grad_norm": 1.8807258605957031, + "learning_rate": 4.972319207729467e-05, + "loss": 5.0766, + "step": 7973 + }, + { + "epoch": 0.047423636882671996, + "grad_norm": 1.8980032205581665, + "learning_rate": 4.9723122756378655e-05, + "loss": 5.185, + "step": 7974 + }, + { + "epoch": 0.04742958416595299, + "grad_norm": 1.9011166095733643, + "learning_rate": 4.9723053426832055e-05, + "loss": 5.2494, + "step": 7975 + }, + { + "epoch": 0.04743553144923399, + "grad_norm": 1.6457782983779907, + "learning_rate": 4.97229840886549e-05, + "loss": 5.4205, + "step": 7976 + }, + { + "epoch": 0.04744147873251499, + "grad_norm": 1.558515191078186, + "learning_rate": 4.9722914741847206e-05, + "loss": 5.2111, + "step": 7977 + }, + { + "epoch": 0.04744742601579598, + "grad_norm": 1.4780910015106201, + "learning_rate": 4.9722845386409e-05, + "loss": 5.3365, + "step": 7978 + }, + { + "epoch": 0.04745337329907698, + "grad_norm": 1.529249668121338, + "learning_rate": 4.9722776022340296e-05, + "loss": 5.1323, + "step": 7979 + }, + { + "epoch": 0.04745932058235798, + "grad_norm": 1.66848886013031, + "learning_rate": 4.972270664964113e-05, + "loss": 5.2057, + "step": 7980 + }, + { + "epoch": 0.047465267865638976, + "grad_norm": 1.5645034313201904, + "learning_rate": 4.972263726831152e-05, + "loss": 5.1537, + "step": 7981 + }, + { + "epoch": 0.04747121514891997, + "grad_norm": 1.8793894052505493, + "learning_rate": 4.9722567878351496e-05, + "loss": 5.4403, + "step": 7982 + }, + { + "epoch": 0.04747716243220097, + "grad_norm": 1.7316640615463257, + "learning_rate": 4.972249847976108e-05, + "loss": 5.3642, + "step": 7983 + }, + { + "epoch": 0.04748310971548197, + "grad_norm": 1.7195171117782593, + "learning_rate": 4.972242907254029e-05, + "loss": 5.2603, + "step": 7984 + }, + { + "epoch": 0.04748905699876296, + "grad_norm": 1.6860026121139526, + "learning_rate": 4.972235965668916e-05, + "loss": 5.356, + "step": 7985 + }, + { + "epoch": 0.047495004282043965, + "grad_norm": 1.5396910905838013, + "learning_rate": 4.972229023220771e-05, + "loss": 5.2566, + "step": 7986 + }, + { + "epoch": 0.04750095156532496, + "grad_norm": 1.694547176361084, + "learning_rate": 4.9722220799095956e-05, + "loss": 5.0897, + "step": 7987 + }, + { + "epoch": 0.047506898848605955, + "grad_norm": 1.7608548402786255, + "learning_rate": 4.972215135735394e-05, + "loss": 5.4084, + "step": 7988 + }, + { + "epoch": 0.04751284613188696, + "grad_norm": 1.697198748588562, + "learning_rate": 4.9722081906981675e-05, + "loss": 5.4133, + "step": 7989 + }, + { + "epoch": 0.04751879341516795, + "grad_norm": 1.6107436418533325, + "learning_rate": 4.972201244797918e-05, + "loss": 5.2839, + "step": 7990 + }, + { + "epoch": 0.04752474069844895, + "grad_norm": 1.8178008794784546, + "learning_rate": 4.972194298034649e-05, + "loss": 5.3722, + "step": 7991 + }, + { + "epoch": 0.04753068798172994, + "grad_norm": 1.6542725563049316, + "learning_rate": 4.972187350408363e-05, + "loss": 5.3434, + "step": 7992 + }, + { + "epoch": 0.047536635265010944, + "grad_norm": 1.8194152116775513, + "learning_rate": 4.972180401919061e-05, + "loss": 5.3763, + "step": 7993 + }, + { + "epoch": 0.04754258254829194, + "grad_norm": 1.890317678451538, + "learning_rate": 4.9721734525667476e-05, + "loss": 5.529, + "step": 7994 + }, + { + "epoch": 0.047548529831572935, + "grad_norm": 1.813226342201233, + "learning_rate": 4.972166502351423e-05, + "loss": 5.0826, + "step": 7995 + }, + { + "epoch": 0.04755447711485394, + "grad_norm": 1.7679328918457031, + "learning_rate": 4.9721595512730905e-05, + "loss": 5.3589, + "step": 7996 + }, + { + "epoch": 0.04756042439813493, + "grad_norm": 1.8390278816223145, + "learning_rate": 4.972152599331753e-05, + "loss": 5.1568, + "step": 7997 + }, + { + "epoch": 0.04756637168141593, + "grad_norm": 2.9323909282684326, + "learning_rate": 4.972145646527413e-05, + "loss": 5.6457, + "step": 7998 + }, + { + "epoch": 0.04757231896469693, + "grad_norm": 1.8839350938796997, + "learning_rate": 4.972138692860072e-05, + "loss": 5.1204, + "step": 7999 + }, + { + "epoch": 0.047578266247977924, + "grad_norm": 1.9047685861587524, + "learning_rate": 4.972131738329733e-05, + "loss": 5.2741, + "step": 8000 + }, + { + "epoch": 0.04758421353125892, + "grad_norm": 2.39807391166687, + "learning_rate": 4.972124782936398e-05, + "loss": 5.0134, + "step": 8001 + }, + { + "epoch": 0.04759016081453992, + "grad_norm": 2.197404146194458, + "learning_rate": 4.972117826680071e-05, + "loss": 5.3012, + "step": 8002 + }, + { + "epoch": 0.047596108097820916, + "grad_norm": 2.2648651599884033, + "learning_rate": 4.9721108695607515e-05, + "loss": 5.7196, + "step": 8003 + }, + { + "epoch": 0.04760205538110191, + "grad_norm": 1.7686847448349, + "learning_rate": 4.972103911578444e-05, + "loss": 5.4261, + "step": 8004 + }, + { + "epoch": 0.04760800266438291, + "grad_norm": 1.726653814315796, + "learning_rate": 4.972096952733152e-05, + "loss": 5.33, + "step": 8005 + }, + { + "epoch": 0.04761394994766391, + "grad_norm": 1.6855807304382324, + "learning_rate": 4.972089993024875e-05, + "loss": 5.2382, + "step": 8006 + }, + { + "epoch": 0.0476198972309449, + "grad_norm": 1.644954800605774, + "learning_rate": 4.972083032453617e-05, + "loss": 5.3309, + "step": 8007 + }, + { + "epoch": 0.0476258445142259, + "grad_norm": 1.8630400896072388, + "learning_rate": 4.9720760710193816e-05, + "loss": 5.282, + "step": 8008 + }, + { + "epoch": 0.0476317917975069, + "grad_norm": 1.862716555595398, + "learning_rate": 4.972069108722168e-05, + "loss": 5.3307, + "step": 8009 + }, + { + "epoch": 0.047637739080787896, + "grad_norm": 1.8025259971618652, + "learning_rate": 4.972062145561982e-05, + "loss": 5.2236, + "step": 8010 + }, + { + "epoch": 0.04764368636406889, + "grad_norm": 1.7213356494903564, + "learning_rate": 4.972055181538825e-05, + "loss": 5.0635, + "step": 8011 + }, + { + "epoch": 0.04764963364734989, + "grad_norm": 1.5237104892730713, + "learning_rate": 4.9720482166526986e-05, + "loss": 5.3089, + "step": 8012 + }, + { + "epoch": 0.04765558093063089, + "grad_norm": 1.628957748413086, + "learning_rate": 4.972041250903605e-05, + "loss": 5.2299, + "step": 8013 + }, + { + "epoch": 0.04766152821391188, + "grad_norm": 1.9217725992202759, + "learning_rate": 4.972034284291548e-05, + "loss": 5.2504, + "step": 8014 + }, + { + "epoch": 0.047667475497192885, + "grad_norm": 2.114549160003662, + "learning_rate": 4.97202731681653e-05, + "loss": 5.219, + "step": 8015 + }, + { + "epoch": 0.04767342278047388, + "grad_norm": 1.9268896579742432, + "learning_rate": 4.9720203484785525e-05, + "loss": 5.145, + "step": 8016 + }, + { + "epoch": 0.047679370063754875, + "grad_norm": 2.04050874710083, + "learning_rate": 4.9720133792776166e-05, + "loss": 5.354, + "step": 8017 + }, + { + "epoch": 0.04768531734703588, + "grad_norm": 1.8002599477767944, + "learning_rate": 4.972006409213728e-05, + "loss": 5.0547, + "step": 8018 + }, + { + "epoch": 0.04769126463031687, + "grad_norm": 1.9655365943908691, + "learning_rate": 4.9719994382868876e-05, + "loss": 5.2188, + "step": 8019 + }, + { + "epoch": 0.04769721191359787, + "grad_norm": 1.7188535928726196, + "learning_rate": 4.971992466497097e-05, + "loss": 5.1792, + "step": 8020 + }, + { + "epoch": 0.04770315919687886, + "grad_norm": 1.582184910774231, + "learning_rate": 4.97198549384436e-05, + "loss": 5.2295, + "step": 8021 + }, + { + "epoch": 0.047709106480159864, + "grad_norm": 1.4490164518356323, + "learning_rate": 4.971978520328677e-05, + "loss": 5.1677, + "step": 8022 + }, + { + "epoch": 0.04771505376344086, + "grad_norm": 1.472896695137024, + "learning_rate": 4.971971545950054e-05, + "loss": 4.9954, + "step": 8023 + }, + { + "epoch": 0.047721001046721855, + "grad_norm": 1.5845187902450562, + "learning_rate": 4.97196457070849e-05, + "loss": 5.1273, + "step": 8024 + }, + { + "epoch": 0.04772694833000286, + "grad_norm": 1.6418551206588745, + "learning_rate": 4.9719575946039887e-05, + "loss": 5.0835, + "step": 8025 + }, + { + "epoch": 0.04773289561328385, + "grad_norm": 1.379805088043213, + "learning_rate": 4.971950617636553e-05, + "loss": 5.1058, + "step": 8026 + }, + { + "epoch": 0.04773884289656485, + "grad_norm": 1.7939400672912598, + "learning_rate": 4.9719436398061835e-05, + "loss": 5.0105, + "step": 8027 + }, + { + "epoch": 0.04774479017984585, + "grad_norm": 1.5610185861587524, + "learning_rate": 4.971936661112886e-05, + "loss": 5.032, + "step": 8028 + }, + { + "epoch": 0.047750737463126844, + "grad_norm": 1.524402379989624, + "learning_rate": 4.9719296815566594e-05, + "loss": 5.1376, + "step": 8029 + }, + { + "epoch": 0.04775668474640784, + "grad_norm": 1.7448087930679321, + "learning_rate": 4.971922701137509e-05, + "loss": 4.9496, + "step": 8030 + }, + { + "epoch": 0.04776263202968884, + "grad_norm": 1.7382763624191284, + "learning_rate": 4.971915719855435e-05, + "loss": 4.9755, + "step": 8031 + }, + { + "epoch": 0.047768579312969836, + "grad_norm": 1.6728250980377197, + "learning_rate": 4.971908737710441e-05, + "loss": 5.1436, + "step": 8032 + }, + { + "epoch": 0.04777452659625083, + "grad_norm": 1.4256306886672974, + "learning_rate": 4.971901754702529e-05, + "loss": 4.9739, + "step": 8033 + }, + { + "epoch": 0.04778047387953183, + "grad_norm": 1.660714864730835, + "learning_rate": 4.971894770831702e-05, + "loss": 5.1337, + "step": 8034 + }, + { + "epoch": 0.04778642116281283, + "grad_norm": 1.5240182876586914, + "learning_rate": 4.9718877860979615e-05, + "loss": 5.1143, + "step": 8035 + }, + { + "epoch": 0.04779236844609382, + "grad_norm": 1.478852391242981, + "learning_rate": 4.971880800501311e-05, + "loss": 4.968, + "step": 8036 + }, + { + "epoch": 0.04779831572937482, + "grad_norm": 1.5343812704086304, + "learning_rate": 4.971873814041752e-05, + "loss": 4.9393, + "step": 8037 + }, + { + "epoch": 0.04780426301265582, + "grad_norm": 1.6728276014328003, + "learning_rate": 4.971866826719288e-05, + "loss": 5.0535, + "step": 8038 + }, + { + "epoch": 0.047810210295936816, + "grad_norm": 1.4831758737564087, + "learning_rate": 4.971859838533921e-05, + "loss": 5.0705, + "step": 8039 + }, + { + "epoch": 0.04781615757921781, + "grad_norm": 1.7412161827087402, + "learning_rate": 4.971852849485653e-05, + "loss": 4.9338, + "step": 8040 + }, + { + "epoch": 0.04782210486249881, + "grad_norm": 1.4696041345596313, + "learning_rate": 4.971845859574487e-05, + "loss": 5.0643, + "step": 8041 + }, + { + "epoch": 0.04782805214577981, + "grad_norm": 1.4190481901168823, + "learning_rate": 4.9718388688004235e-05, + "loss": 5.0743, + "step": 8042 + }, + { + "epoch": 0.0478339994290608, + "grad_norm": 1.513454556465149, + "learning_rate": 4.9718318771634686e-05, + "loss": 4.8832, + "step": 8043 + }, + { + "epoch": 0.047839946712341805, + "grad_norm": 1.7310774326324463, + "learning_rate": 4.9718248846636216e-05, + "loss": 4.957, + "step": 8044 + }, + { + "epoch": 0.0478458939956228, + "grad_norm": 1.4895838499069214, + "learning_rate": 4.971817891300886e-05, + "loss": 4.9121, + "step": 8045 + }, + { + "epoch": 0.047851841278903795, + "grad_norm": 1.6848632097244263, + "learning_rate": 4.9718108970752656e-05, + "loss": 5.1337, + "step": 8046 + }, + { + "epoch": 0.0478577885621848, + "grad_norm": 1.7145766019821167, + "learning_rate": 4.97180390198676e-05, + "loss": 5.1827, + "step": 8047 + }, + { + "epoch": 0.04786373584546579, + "grad_norm": 1.668140172958374, + "learning_rate": 4.971796906035374e-05, + "loss": 5.4071, + "step": 8048 + }, + { + "epoch": 0.04786968312874679, + "grad_norm": 1.6927748918533325, + "learning_rate": 4.9717899092211094e-05, + "loss": 5.4319, + "step": 8049 + }, + { + "epoch": 0.04787563041202778, + "grad_norm": 1.6696170568466187, + "learning_rate": 4.971782911543968e-05, + "loss": 5.4137, + "step": 8050 + }, + { + "epoch": 0.047881577695308784, + "grad_norm": 1.9299427270889282, + "learning_rate": 4.971775913003953e-05, + "loss": 5.6676, + "step": 8051 + }, + { + "epoch": 0.04788752497858978, + "grad_norm": 1.7163755893707275, + "learning_rate": 4.971768913601066e-05, + "loss": 5.2916, + "step": 8052 + }, + { + "epoch": 0.047893472261870774, + "grad_norm": 1.7822209596633911, + "learning_rate": 4.971761913335311e-05, + "loss": 5.6364, + "step": 8053 + }, + { + "epoch": 0.047899419545151777, + "grad_norm": 1.725375771522522, + "learning_rate": 4.971754912206689e-05, + "loss": 5.045, + "step": 8054 + }, + { + "epoch": 0.04790536682843277, + "grad_norm": 1.5243995189666748, + "learning_rate": 4.9717479102152027e-05, + "loss": 5.4691, + "step": 8055 + }, + { + "epoch": 0.04791131411171377, + "grad_norm": 1.6673872470855713, + "learning_rate": 4.971740907360854e-05, + "loss": 5.4851, + "step": 8056 + }, + { + "epoch": 0.04791726139499477, + "grad_norm": 1.6378693580627441, + "learning_rate": 4.971733903643647e-05, + "loss": 5.2574, + "step": 8057 + }, + { + "epoch": 0.047923208678275764, + "grad_norm": 1.484250545501709, + "learning_rate": 4.9717268990635835e-05, + "loss": 5.2988, + "step": 8058 + }, + { + "epoch": 0.04792915596155676, + "grad_norm": 1.626955270767212, + "learning_rate": 4.971719893620665e-05, + "loss": 5.3502, + "step": 8059 + }, + { + "epoch": 0.04793510324483776, + "grad_norm": 2.1421375274658203, + "learning_rate": 4.9717128873148954e-05, + "loss": 5.3006, + "step": 8060 + }, + { + "epoch": 0.047941050528118756, + "grad_norm": 1.5175740718841553, + "learning_rate": 4.971705880146276e-05, + "loss": 5.4144, + "step": 8061 + }, + { + "epoch": 0.04794699781139975, + "grad_norm": 1.6170361042022705, + "learning_rate": 4.9716988721148095e-05, + "loss": 5.3635, + "step": 8062 + }, + { + "epoch": 0.04795294509468075, + "grad_norm": 1.7269384860992432, + "learning_rate": 4.971691863220499e-05, + "loss": 5.2813, + "step": 8063 + }, + { + "epoch": 0.04795889237796175, + "grad_norm": 1.5144844055175781, + "learning_rate": 4.971684853463345e-05, + "loss": 5.3242, + "step": 8064 + }, + { + "epoch": 0.04796483966124274, + "grad_norm": 1.7125827074050903, + "learning_rate": 4.971677842843353e-05, + "loss": 5.2968, + "step": 8065 + }, + { + "epoch": 0.04797078694452374, + "grad_norm": 1.6067146062850952, + "learning_rate": 4.9716708313605234e-05, + "loss": 5.4446, + "step": 8066 + }, + { + "epoch": 0.04797673422780474, + "grad_norm": 1.8911150693893433, + "learning_rate": 4.9716638190148585e-05, + "loss": 5.1875, + "step": 8067 + }, + { + "epoch": 0.047982681511085735, + "grad_norm": 1.6865830421447754, + "learning_rate": 4.971656805806362e-05, + "loss": 5.1909, + "step": 8068 + }, + { + "epoch": 0.04798862879436673, + "grad_norm": 2.009566068649292, + "learning_rate": 4.9716497917350345e-05, + "loss": 4.9392, + "step": 8069 + }, + { + "epoch": 0.04799457607764773, + "grad_norm": 1.8578897714614868, + "learning_rate": 4.97164277680088e-05, + "loss": 5.3101, + "step": 8070 + }, + { + "epoch": 0.04800052336092873, + "grad_norm": 1.8935741186141968, + "learning_rate": 4.971635761003901e-05, + "loss": 5.3952, + "step": 8071 + }, + { + "epoch": 0.04800647064420972, + "grad_norm": 2.0030407905578613, + "learning_rate": 4.9716287443440994e-05, + "loss": 5.1685, + "step": 8072 + }, + { + "epoch": 0.048012417927490725, + "grad_norm": 2.0079195499420166, + "learning_rate": 4.9716217268214775e-05, + "loss": 5.4942, + "step": 8073 + }, + { + "epoch": 0.04801836521077172, + "grad_norm": 1.7105878591537476, + "learning_rate": 4.971614708436038e-05, + "loss": 5.4124, + "step": 8074 + }, + { + "epoch": 0.048024312494052715, + "grad_norm": 1.7642161846160889, + "learning_rate": 4.971607689187784e-05, + "loss": 5.3187, + "step": 8075 + }, + { + "epoch": 0.04803025977733372, + "grad_norm": 1.7304610013961792, + "learning_rate": 4.9716006690767165e-05, + "loss": 5.308, + "step": 8076 + }, + { + "epoch": 0.04803620706061471, + "grad_norm": 1.6714746952056885, + "learning_rate": 4.971593648102839e-05, + "loss": 5.4581, + "step": 8077 + }, + { + "epoch": 0.04804215434389571, + "grad_norm": 1.8008997440338135, + "learning_rate": 4.971586626266154e-05, + "loss": 5.3266, + "step": 8078 + }, + { + "epoch": 0.0480481016271767, + "grad_norm": 1.8691446781158447, + "learning_rate": 4.971579603566663e-05, + "loss": 5.2847, + "step": 8079 + }, + { + "epoch": 0.048054048910457704, + "grad_norm": 1.7805777788162231, + "learning_rate": 4.97157258000437e-05, + "loss": 5.446, + "step": 8080 + }, + { + "epoch": 0.0480599961937387, + "grad_norm": 1.4973244667053223, + "learning_rate": 4.971565555579275e-05, + "loss": 5.412, + "step": 8081 + }, + { + "epoch": 0.048065943477019694, + "grad_norm": 1.5994775295257568, + "learning_rate": 4.971558530291384e-05, + "loss": 5.3285, + "step": 8082 + }, + { + "epoch": 0.048071890760300696, + "grad_norm": 1.7743935585021973, + "learning_rate": 4.971551504140696e-05, + "loss": 5.326, + "step": 8083 + }, + { + "epoch": 0.04807783804358169, + "grad_norm": 1.5922112464904785, + "learning_rate": 4.9715444771272154e-05, + "loss": 5.3338, + "step": 8084 + }, + { + "epoch": 0.04808378532686269, + "grad_norm": 1.5587191581726074, + "learning_rate": 4.971537449250944e-05, + "loss": 5.2437, + "step": 8085 + }, + { + "epoch": 0.04808973261014369, + "grad_norm": 1.4972636699676514, + "learning_rate": 4.971530420511884e-05, + "loss": 5.2271, + "step": 8086 + }, + { + "epoch": 0.048095679893424684, + "grad_norm": 1.6221843957901, + "learning_rate": 4.971523390910039e-05, + "loss": 5.3225, + "step": 8087 + }, + { + "epoch": 0.04810162717670568, + "grad_norm": 1.5826990604400635, + "learning_rate": 4.971516360445411e-05, + "loss": 5.2955, + "step": 8088 + }, + { + "epoch": 0.04810757445998668, + "grad_norm": 1.729963779449463, + "learning_rate": 4.971509329118001e-05, + "loss": 5.3263, + "step": 8089 + }, + { + "epoch": 0.048113521743267676, + "grad_norm": 1.680851697921753, + "learning_rate": 4.971502296927813e-05, + "loss": 5.3579, + "step": 8090 + }, + { + "epoch": 0.04811946902654867, + "grad_norm": 2.028024673461914, + "learning_rate": 4.9714952638748504e-05, + "loss": 5.3632, + "step": 8091 + }, + { + "epoch": 0.04812541630982967, + "grad_norm": 1.6236159801483154, + "learning_rate": 4.9714882299591127e-05, + "loss": 5.222, + "step": 8092 + }, + { + "epoch": 0.04813136359311067, + "grad_norm": 1.7522811889648438, + "learning_rate": 4.971481195180605e-05, + "loss": 5.3752, + "step": 8093 + }, + { + "epoch": 0.04813731087639166, + "grad_norm": 1.7108362913131714, + "learning_rate": 4.9714741595393274e-05, + "loss": 5.2994, + "step": 8094 + }, + { + "epoch": 0.04814325815967266, + "grad_norm": 1.7863954305648804, + "learning_rate": 4.971467123035285e-05, + "loss": 5.2386, + "step": 8095 + }, + { + "epoch": 0.04814920544295366, + "grad_norm": 2.0054473876953125, + "learning_rate": 4.971460085668479e-05, + "loss": 5.3565, + "step": 8096 + }, + { + "epoch": 0.048155152726234655, + "grad_norm": 1.6878743171691895, + "learning_rate": 4.971453047438911e-05, + "loss": 5.3448, + "step": 8097 + }, + { + "epoch": 0.04816110000951565, + "grad_norm": 1.8534557819366455, + "learning_rate": 4.971446008346585e-05, + "loss": 5.1446, + "step": 8098 + }, + { + "epoch": 0.04816704729279665, + "grad_norm": 1.8549425601959229, + "learning_rate": 4.9714389683915025e-05, + "loss": 5.2433, + "step": 8099 + }, + { + "epoch": 0.04817299457607765, + "grad_norm": 1.5624927282333374, + "learning_rate": 4.9714319275736666e-05, + "loss": 5.0645, + "step": 8100 + }, + { + "epoch": 0.04817894185935864, + "grad_norm": 1.670462965965271, + "learning_rate": 4.971424885893078e-05, + "loss": 5.1213, + "step": 8101 + }, + { + "epoch": 0.048184889142639645, + "grad_norm": 2.039595603942871, + "learning_rate": 4.9714178433497414e-05, + "loss": 5.1797, + "step": 8102 + }, + { + "epoch": 0.04819083642592064, + "grad_norm": 1.9546380043029785, + "learning_rate": 4.971410799943659e-05, + "loss": 5.2432, + "step": 8103 + }, + { + "epoch": 0.048196783709201635, + "grad_norm": 1.892397403717041, + "learning_rate": 4.971403755674832e-05, + "loss": 5.1775, + "step": 8104 + }, + { + "epoch": 0.04820273099248264, + "grad_norm": 1.7021955251693726, + "learning_rate": 4.971396710543263e-05, + "loss": 5.2242, + "step": 8105 + }, + { + "epoch": 0.04820867827576363, + "grad_norm": 1.7652686834335327, + "learning_rate": 4.9713896645489556e-05, + "loss": 5.1419, + "step": 8106 + }, + { + "epoch": 0.04821462555904463, + "grad_norm": 1.8669620752334595, + "learning_rate": 4.971382617691911e-05, + "loss": 5.1392, + "step": 8107 + }, + { + "epoch": 0.04822057284232562, + "grad_norm": 1.8774491548538208, + "learning_rate": 4.971375569972133e-05, + "loss": 5.1853, + "step": 8108 + }, + { + "epoch": 0.048226520125606624, + "grad_norm": 1.6108628511428833, + "learning_rate": 4.971368521389623e-05, + "loss": 5.4858, + "step": 8109 + }, + { + "epoch": 0.04823246740888762, + "grad_norm": 1.6839191913604736, + "learning_rate": 4.9713614719443835e-05, + "loss": 5.4217, + "step": 8110 + }, + { + "epoch": 0.048238414692168614, + "grad_norm": 1.9300925731658936, + "learning_rate": 4.9713544216364176e-05, + "loss": 5.2259, + "step": 8111 + }, + { + "epoch": 0.048244361975449616, + "grad_norm": 1.9142355918884277, + "learning_rate": 4.971347370465728e-05, + "loss": 5.2, + "step": 8112 + }, + { + "epoch": 0.04825030925873061, + "grad_norm": 1.8046603202819824, + "learning_rate": 4.971340318432315e-05, + "loss": 5.0951, + "step": 8113 + }, + { + "epoch": 0.04825625654201161, + "grad_norm": 1.9129396677017212, + "learning_rate": 4.971333265536184e-05, + "loss": 5.0376, + "step": 8114 + }, + { + "epoch": 0.04826220382529261, + "grad_norm": 1.6774524450302124, + "learning_rate": 4.971326211777335e-05, + "loss": 5.4313, + "step": 8115 + }, + { + "epoch": 0.048268151108573604, + "grad_norm": 1.8156472444534302, + "learning_rate": 4.971319157155773e-05, + "loss": 5.4336, + "step": 8116 + }, + { + "epoch": 0.0482740983918546, + "grad_norm": 1.5704171657562256, + "learning_rate": 4.9713121016714976e-05, + "loss": 5.6878, + "step": 8117 + }, + { + "epoch": 0.0482800456751356, + "grad_norm": 1.585528016090393, + "learning_rate": 4.9713050453245135e-05, + "loss": 5.6208, + "step": 8118 + }, + { + "epoch": 0.048285992958416596, + "grad_norm": 1.3975930213928223, + "learning_rate": 4.9712979881148215e-05, + "loss": 5.8001, + "step": 8119 + }, + { + "epoch": 0.04829194024169759, + "grad_norm": 1.8124761581420898, + "learning_rate": 4.971290930042426e-05, + "loss": 5.6006, + "step": 8120 + }, + { + "epoch": 0.04829788752497859, + "grad_norm": 1.8448232412338257, + "learning_rate": 4.971283871107327e-05, + "loss": 5.4324, + "step": 8121 + }, + { + "epoch": 0.04830383480825959, + "grad_norm": 1.772218108177185, + "learning_rate": 4.97127681130953e-05, + "loss": 6.0943, + "step": 8122 + }, + { + "epoch": 0.04830978209154058, + "grad_norm": 2.038703441619873, + "learning_rate": 4.9712697506490345e-05, + "loss": 5.4224, + "step": 8123 + }, + { + "epoch": 0.04831572937482158, + "grad_norm": 1.576430320739746, + "learning_rate": 4.971262689125845e-05, + "loss": 5.351, + "step": 8124 + }, + { + "epoch": 0.04832167665810258, + "grad_norm": 1.857021450996399, + "learning_rate": 4.971255626739963e-05, + "loss": 5.258, + "step": 8125 + }, + { + "epoch": 0.048327623941383575, + "grad_norm": 1.7989404201507568, + "learning_rate": 4.971248563491391e-05, + "loss": 5.3925, + "step": 8126 + }, + { + "epoch": 0.04833357122466457, + "grad_norm": 1.8104023933410645, + "learning_rate": 4.9712414993801314e-05, + "loss": 5.4326, + "step": 8127 + }, + { + "epoch": 0.04833951850794557, + "grad_norm": 1.898054838180542, + "learning_rate": 4.971234434406188e-05, + "loss": 5.2094, + "step": 8128 + }, + { + "epoch": 0.04834546579122657, + "grad_norm": 1.436633586883545, + "learning_rate": 4.971227368569561e-05, + "loss": 5.2994, + "step": 8129 + }, + { + "epoch": 0.04835141307450756, + "grad_norm": 1.4576120376586914, + "learning_rate": 4.971220301870255e-05, + "loss": 5.3504, + "step": 8130 + }, + { + "epoch": 0.048357360357788565, + "grad_norm": 1.7260229587554932, + "learning_rate": 4.971213234308271e-05, + "loss": 5.1083, + "step": 8131 + }, + { + "epoch": 0.04836330764106956, + "grad_norm": 1.8110415935516357, + "learning_rate": 4.971206165883612e-05, + "loss": 5.1298, + "step": 8132 + }, + { + "epoch": 0.048369254924350555, + "grad_norm": 2.1696786880493164, + "learning_rate": 4.9711990965962804e-05, + "loss": 5.8155, + "step": 8133 + }, + { + "epoch": 0.04837520220763156, + "grad_norm": 1.9905856847763062, + "learning_rate": 4.971192026446279e-05, + "loss": 5.5814, + "step": 8134 + }, + { + "epoch": 0.04838114949091255, + "grad_norm": 1.7459521293640137, + "learning_rate": 4.97118495543361e-05, + "loss": 5.4358, + "step": 8135 + }, + { + "epoch": 0.04838709677419355, + "grad_norm": 1.8495198488235474, + "learning_rate": 4.9711778835582756e-05, + "loss": 5.3652, + "step": 8136 + }, + { + "epoch": 0.04839304405747455, + "grad_norm": 1.782850742340088, + "learning_rate": 4.971170810820279e-05, + "loss": 5.2361, + "step": 8137 + }, + { + "epoch": 0.048398991340755544, + "grad_norm": 1.7327016592025757, + "learning_rate": 4.971163737219622e-05, + "loss": 5.0802, + "step": 8138 + }, + { + "epoch": 0.04840493862403654, + "grad_norm": 1.663620114326477, + "learning_rate": 4.9711566627563066e-05, + "loss": 5.1566, + "step": 8139 + }, + { + "epoch": 0.048410885907317534, + "grad_norm": 1.5109026432037354, + "learning_rate": 4.971149587430336e-05, + "loss": 5.1499, + "step": 8140 + }, + { + "epoch": 0.048416833190598536, + "grad_norm": 1.3494226932525635, + "learning_rate": 4.971142511241714e-05, + "loss": 5.1684, + "step": 8141 + }, + { + "epoch": 0.04842278047387953, + "grad_norm": 1.721880555152893, + "learning_rate": 4.97113543419044e-05, + "loss": 5.0199, + "step": 8142 + }, + { + "epoch": 0.048428727757160527, + "grad_norm": 1.7465516328811646, + "learning_rate": 4.971128356276519e-05, + "loss": 5.1181, + "step": 8143 + }, + { + "epoch": 0.04843467504044153, + "grad_norm": 1.8127025365829468, + "learning_rate": 4.971121277499953e-05, + "loss": 5.6514, + "step": 8144 + }, + { + "epoch": 0.048440622323722524, + "grad_norm": 1.6027450561523438, + "learning_rate": 4.971114197860743e-05, + "loss": 5.3408, + "step": 8145 + }, + { + "epoch": 0.04844656960700352, + "grad_norm": 1.6985208988189697, + "learning_rate": 4.971107117358894e-05, + "loss": 5.2002, + "step": 8146 + }, + { + "epoch": 0.04845251689028452, + "grad_norm": 1.681305170059204, + "learning_rate": 4.971100035994406e-05, + "loss": 5.1389, + "step": 8147 + }, + { + "epoch": 0.048458464173565516, + "grad_norm": 1.6053674221038818, + "learning_rate": 4.971092953767282e-05, + "loss": 5.0665, + "step": 8148 + }, + { + "epoch": 0.04846441145684651, + "grad_norm": 1.743134617805481, + "learning_rate": 4.9710858706775266e-05, + "loss": 5.1427, + "step": 8149 + }, + { + "epoch": 0.04847035874012751, + "grad_norm": 1.4901342391967773, + "learning_rate": 4.9710787867251396e-05, + "loss": 5.1957, + "step": 8150 + }, + { + "epoch": 0.04847630602340851, + "grad_norm": 1.6003857851028442, + "learning_rate": 4.971071701910125e-05, + "loss": 5.0658, + "step": 8151 + }, + { + "epoch": 0.0484822533066895, + "grad_norm": 1.7036428451538086, + "learning_rate": 4.971064616232484e-05, + "loss": 5.0823, + "step": 8152 + }, + { + "epoch": 0.0484882005899705, + "grad_norm": 1.5894789695739746, + "learning_rate": 4.97105752969222e-05, + "loss": 5.093, + "step": 8153 + }, + { + "epoch": 0.0484941478732515, + "grad_norm": 1.487648367881775, + "learning_rate": 4.9710504422893364e-05, + "loss": 5.0089, + "step": 8154 + }, + { + "epoch": 0.048500095156532495, + "grad_norm": 2.0251479148864746, + "learning_rate": 4.971043354023834e-05, + "loss": 5.0552, + "step": 8155 + }, + { + "epoch": 0.04850604243981349, + "grad_norm": 1.7097325325012207, + "learning_rate": 4.971036264895715e-05, + "loss": 5.2737, + "step": 8156 + }, + { + "epoch": 0.04851198972309449, + "grad_norm": 1.784836769104004, + "learning_rate": 4.971029174904984e-05, + "loss": 5.2863, + "step": 8157 + }, + { + "epoch": 0.04851793700637549, + "grad_norm": 1.4765781164169312, + "learning_rate": 4.9710220840516416e-05, + "loss": 5.4057, + "step": 8158 + }, + { + "epoch": 0.04852388428965648, + "grad_norm": 1.4173041582107544, + "learning_rate": 4.9710149923356915e-05, + "loss": 5.187, + "step": 8159 + }, + { + "epoch": 0.048529831572937485, + "grad_norm": 1.488173007965088, + "learning_rate": 4.971007899757135e-05, + "loss": 4.975, + "step": 8160 + }, + { + "epoch": 0.04853577885621848, + "grad_norm": 1.391435980796814, + "learning_rate": 4.9710008063159756e-05, + "loss": 5.0782, + "step": 8161 + }, + { + "epoch": 0.048541726139499475, + "grad_norm": 1.7100436687469482, + "learning_rate": 4.970993712012215e-05, + "loss": 5.4953, + "step": 8162 + }, + { + "epoch": 0.04854767342278048, + "grad_norm": 1.8748459815979004, + "learning_rate": 4.970986616845856e-05, + "loss": 5.4535, + "step": 8163 + }, + { + "epoch": 0.04855362070606147, + "grad_norm": 1.901802897453308, + "learning_rate": 4.970979520816902e-05, + "loss": 5.3619, + "step": 8164 + }, + { + "epoch": 0.04855956798934247, + "grad_norm": 1.9850586652755737, + "learning_rate": 4.970972423925354e-05, + "loss": 5.039, + "step": 8165 + }, + { + "epoch": 0.04856551527262347, + "grad_norm": 1.5195177793502808, + "learning_rate": 4.970965326171214e-05, + "loss": 5.1721, + "step": 8166 + }, + { + "epoch": 0.048571462555904464, + "grad_norm": 1.4180214405059814, + "learning_rate": 4.9709582275544866e-05, + "loss": 5.2319, + "step": 8167 + }, + { + "epoch": 0.04857740983918546, + "grad_norm": 1.3797354698181152, + "learning_rate": 4.970951128075173e-05, + "loss": 5.1813, + "step": 8168 + }, + { + "epoch": 0.048583357122466454, + "grad_norm": 1.6448336839675903, + "learning_rate": 4.970944027733276e-05, + "loss": 5.1968, + "step": 8169 + }, + { + "epoch": 0.048589304405747456, + "grad_norm": 1.6626337766647339, + "learning_rate": 4.9709369265287986e-05, + "loss": 5.1303, + "step": 8170 + }, + { + "epoch": 0.04859525168902845, + "grad_norm": 1.5715514421463013, + "learning_rate": 4.970929824461742e-05, + "loss": 5.1609, + "step": 8171 + }, + { + "epoch": 0.048601198972309446, + "grad_norm": 1.5971697568893433, + "learning_rate": 4.970922721532108e-05, + "loss": 5.1489, + "step": 8172 + }, + { + "epoch": 0.04860714625559045, + "grad_norm": 1.6784114837646484, + "learning_rate": 4.970915617739903e-05, + "loss": 5.2778, + "step": 8173 + }, + { + "epoch": 0.048613093538871444, + "grad_norm": 1.7507476806640625, + "learning_rate": 4.970908513085125e-05, + "loss": 5.5719, + "step": 8174 + }, + { + "epoch": 0.04861904082215244, + "grad_norm": 1.7017735242843628, + "learning_rate": 4.970901407567779e-05, + "loss": 5.5197, + "step": 8175 + }, + { + "epoch": 0.04862498810543344, + "grad_norm": 1.8569817543029785, + "learning_rate": 4.9708943011878674e-05, + "loss": 5.3823, + "step": 8176 + }, + { + "epoch": 0.048630935388714436, + "grad_norm": 1.5183817148208618, + "learning_rate": 4.970887193945391e-05, + "loss": 5.5518, + "step": 8177 + }, + { + "epoch": 0.04863688267199543, + "grad_norm": 1.4175498485565186, + "learning_rate": 4.970880085840354e-05, + "loss": 5.4526, + "step": 8178 + }, + { + "epoch": 0.04864282995527643, + "grad_norm": 1.7228561639785767, + "learning_rate": 4.970872976872758e-05, + "loss": 5.5162, + "step": 8179 + }, + { + "epoch": 0.04864877723855743, + "grad_norm": 2.043182849884033, + "learning_rate": 4.970865867042606e-05, + "loss": 5.4212, + "step": 8180 + }, + { + "epoch": 0.04865472452183842, + "grad_norm": 1.377565622329712, + "learning_rate": 4.970858756349901e-05, + "loss": 5.2817, + "step": 8181 + }, + { + "epoch": 0.04866067180511942, + "grad_norm": 1.6977208852767944, + "learning_rate": 4.970851644794643e-05, + "loss": 5.4081, + "step": 8182 + }, + { + "epoch": 0.04866661908840042, + "grad_norm": 1.3136184215545654, + "learning_rate": 4.970844532376838e-05, + "loss": 5.4272, + "step": 8183 + }, + { + "epoch": 0.048672566371681415, + "grad_norm": 1.8863121271133423, + "learning_rate": 4.9708374190964854e-05, + "loss": 5.441, + "step": 8184 + }, + { + "epoch": 0.04867851365496241, + "grad_norm": 1.6755374670028687, + "learning_rate": 4.97083030495359e-05, + "loss": 5.5045, + "step": 8185 + }, + { + "epoch": 0.04868446093824341, + "grad_norm": 1.8439961671829224, + "learning_rate": 4.970823189948153e-05, + "loss": 5.5252, + "step": 8186 + }, + { + "epoch": 0.04869040822152441, + "grad_norm": 1.9662889242172241, + "learning_rate": 4.9708160740801765e-05, + "loss": 5.4379, + "step": 8187 + }, + { + "epoch": 0.0486963555048054, + "grad_norm": 1.691857099533081, + "learning_rate": 4.970808957349664e-05, + "loss": 5.3652, + "step": 8188 + }, + { + "epoch": 0.048702302788086405, + "grad_norm": 1.7482357025146484, + "learning_rate": 4.970801839756618e-05, + "loss": 5.1436, + "step": 8189 + }, + { + "epoch": 0.0487082500713674, + "grad_norm": 1.9221199750900269, + "learning_rate": 4.9707947213010396e-05, + "loss": 5.1936, + "step": 8190 + }, + { + "epoch": 0.048714197354648395, + "grad_norm": 1.9124062061309814, + "learning_rate": 4.970787601982933e-05, + "loss": 5.28, + "step": 8191 + }, + { + "epoch": 0.0487201446379294, + "grad_norm": 1.8999123573303223, + "learning_rate": 4.9707804818023e-05, + "loss": 5.3262, + "step": 8192 + }, + { + "epoch": 0.04872609192121039, + "grad_norm": 1.7711995840072632, + "learning_rate": 4.970773360759143e-05, + "loss": 5.1764, + "step": 8193 + }, + { + "epoch": 0.04873203920449139, + "grad_norm": 2.122689962387085, + "learning_rate": 4.970766238853465e-05, + "loss": 5.4345, + "step": 8194 + }, + { + "epoch": 0.04873798648777239, + "grad_norm": 2.1027848720550537, + "learning_rate": 4.9707591160852675e-05, + "loss": 5.4547, + "step": 8195 + }, + { + "epoch": 0.048743933771053384, + "grad_norm": 1.6944631338119507, + "learning_rate": 4.970751992454553e-05, + "loss": 5.3638, + "step": 8196 + }, + { + "epoch": 0.04874988105433438, + "grad_norm": 1.7444918155670166, + "learning_rate": 4.9707448679613256e-05, + "loss": 5.2378, + "step": 8197 + }, + { + "epoch": 0.048755828337615374, + "grad_norm": 1.8864104747772217, + "learning_rate": 4.970737742605586e-05, + "loss": 5.3142, + "step": 8198 + }, + { + "epoch": 0.048761775620896376, + "grad_norm": 1.968748927116394, + "learning_rate": 4.970730616387338e-05, + "loss": 5.0824, + "step": 8199 + }, + { + "epoch": 0.04876772290417737, + "grad_norm": 2.166405439376831, + "learning_rate": 4.9707234893065824e-05, + "loss": 5.0999, + "step": 8200 + }, + { + "epoch": 0.048773670187458366, + "grad_norm": 1.9185746908187866, + "learning_rate": 4.970716361363323e-05, + "loss": 5.1465, + "step": 8201 + }, + { + "epoch": 0.04877961747073937, + "grad_norm": 1.9191651344299316, + "learning_rate": 4.9707092325575635e-05, + "loss": 5.0713, + "step": 8202 + }, + { + "epoch": 0.048785564754020364, + "grad_norm": 1.6470153331756592, + "learning_rate": 4.9707021028893034e-05, + "loss": 5.0816, + "step": 8203 + }, + { + "epoch": 0.04879151203730136, + "grad_norm": 1.6995042562484741, + "learning_rate": 4.9706949723585475e-05, + "loss": 5.0207, + "step": 8204 + }, + { + "epoch": 0.04879745932058236, + "grad_norm": 1.8208703994750977, + "learning_rate": 4.970687840965297e-05, + "loss": 4.9789, + "step": 8205 + }, + { + "epoch": 0.048803406603863356, + "grad_norm": 1.8558207750320435, + "learning_rate": 4.9706807087095555e-05, + "loss": 5.0655, + "step": 8206 + }, + { + "epoch": 0.04880935388714435, + "grad_norm": 1.6349478960037231, + "learning_rate": 4.9706735755913234e-05, + "loss": 5.2657, + "step": 8207 + }, + { + "epoch": 0.04881530117042535, + "grad_norm": 1.587143063545227, + "learning_rate": 4.9706664416106065e-05, + "loss": 5.0765, + "step": 8208 + }, + { + "epoch": 0.04882124845370635, + "grad_norm": 1.8467018604278564, + "learning_rate": 4.9706593067674047e-05, + "loss": 5.1458, + "step": 8209 + }, + { + "epoch": 0.04882719573698734, + "grad_norm": 1.8066186904907227, + "learning_rate": 4.9706521710617214e-05, + "loss": 5.0656, + "step": 8210 + }, + { + "epoch": 0.04883314302026834, + "grad_norm": 1.7981528043746948, + "learning_rate": 4.9706450344935586e-05, + "loss": 5.1448, + "step": 8211 + }, + { + "epoch": 0.04883909030354934, + "grad_norm": 1.8924201726913452, + "learning_rate": 4.97063789706292e-05, + "loss": 4.748, + "step": 8212 + }, + { + "epoch": 0.048845037586830335, + "grad_norm": 2.091324806213379, + "learning_rate": 4.9706307587698064e-05, + "loss": 5.6537, + "step": 8213 + }, + { + "epoch": 0.04885098487011133, + "grad_norm": 3.1737043857574463, + "learning_rate": 4.970623619614221e-05, + "loss": 5.6898, + "step": 8214 + }, + { + "epoch": 0.04885693215339233, + "grad_norm": 2.194577932357788, + "learning_rate": 4.970616479596167e-05, + "loss": 5.4958, + "step": 8215 + }, + { + "epoch": 0.04886287943667333, + "grad_norm": 2.2362759113311768, + "learning_rate": 4.970609338715646e-05, + "loss": 4.9919, + "step": 8216 + }, + { + "epoch": 0.04886882671995432, + "grad_norm": 1.703684687614441, + "learning_rate": 4.970602196972661e-05, + "loss": 4.8733, + "step": 8217 + }, + { + "epoch": 0.048874774003235325, + "grad_norm": 2.0205307006835938, + "learning_rate": 4.970595054367214e-05, + "loss": 5.1177, + "step": 8218 + }, + { + "epoch": 0.04888072128651632, + "grad_norm": 2.1270928382873535, + "learning_rate": 4.970587910899308e-05, + "loss": 5.6208, + "step": 8219 + }, + { + "epoch": 0.048886668569797315, + "grad_norm": 1.8992488384246826, + "learning_rate": 4.9705807665689455e-05, + "loss": 5.7754, + "step": 8220 + }, + { + "epoch": 0.04889261585307832, + "grad_norm": 2.279099225997925, + "learning_rate": 4.9705736213761286e-05, + "loss": 5.5924, + "step": 8221 + }, + { + "epoch": 0.04889856313635931, + "grad_norm": 1.9186346530914307, + "learning_rate": 4.9705664753208594e-05, + "loss": 5.9424, + "step": 8222 + }, + { + "epoch": 0.04890451041964031, + "grad_norm": 2.0286009311676025, + "learning_rate": 4.970559328403141e-05, + "loss": 5.8461, + "step": 8223 + }, + { + "epoch": 0.04891045770292131, + "grad_norm": 1.797555685043335, + "learning_rate": 4.970552180622977e-05, + "loss": 5.4929, + "step": 8224 + }, + { + "epoch": 0.048916404986202304, + "grad_norm": 2.4879684448242188, + "learning_rate": 4.970545031980368e-05, + "loss": 5.5253, + "step": 8225 + }, + { + "epoch": 0.0489223522694833, + "grad_norm": 2.749763011932373, + "learning_rate": 4.970537882475318e-05, + "loss": 5.6001, + "step": 8226 + }, + { + "epoch": 0.048928299552764294, + "grad_norm": 2.2076292037963867, + "learning_rate": 4.970530732107827e-05, + "loss": 5.5876, + "step": 8227 + }, + { + "epoch": 0.048934246836045296, + "grad_norm": 2.6566662788391113, + "learning_rate": 4.970523580877901e-05, + "loss": 5.7151, + "step": 8228 + }, + { + "epoch": 0.04894019411932629, + "grad_norm": 2.4873850345611572, + "learning_rate": 4.97051642878554e-05, + "loss": 5.7124, + "step": 8229 + }, + { + "epoch": 0.048946141402607286, + "grad_norm": 1.8365200757980347, + "learning_rate": 4.970509275830748e-05, + "loss": 5.292, + "step": 8230 + }, + { + "epoch": 0.04895208868588829, + "grad_norm": 2.064730644226074, + "learning_rate": 4.9705021220135254e-05, + "loss": 5.2854, + "step": 8231 + }, + { + "epoch": 0.04895803596916928, + "grad_norm": 1.969298005104065, + "learning_rate": 4.970494967333877e-05, + "loss": 5.2113, + "step": 8232 + }, + { + "epoch": 0.04896398325245028, + "grad_norm": 1.8438071012496948, + "learning_rate": 4.9704878117918044e-05, + "loss": 5.2281, + "step": 8233 + }, + { + "epoch": 0.04896993053573128, + "grad_norm": 1.9163525104522705, + "learning_rate": 4.97048065538731e-05, + "loss": 5.043, + "step": 8234 + }, + { + "epoch": 0.048975877819012276, + "grad_norm": 1.802356243133545, + "learning_rate": 4.970473498120395e-05, + "loss": 5.2079, + "step": 8235 + }, + { + "epoch": 0.04898182510229327, + "grad_norm": 1.7572704553604126, + "learning_rate": 4.9704663399910645e-05, + "loss": 5.1119, + "step": 8236 + }, + { + "epoch": 0.04898777238557427, + "grad_norm": 1.848747730255127, + "learning_rate": 4.970459180999319e-05, + "loss": 5.0233, + "step": 8237 + }, + { + "epoch": 0.04899371966885527, + "grad_norm": 2.023036003112793, + "learning_rate": 4.9704520211451624e-05, + "loss": 5.2793, + "step": 8238 + }, + { + "epoch": 0.04899966695213626, + "grad_norm": 1.6738852262496948, + "learning_rate": 4.9704448604285965e-05, + "loss": 5.5255, + "step": 8239 + }, + { + "epoch": 0.04900561423541726, + "grad_norm": 1.6676057577133179, + "learning_rate": 4.970437698849624e-05, + "loss": 5.4287, + "step": 8240 + }, + { + "epoch": 0.04901156151869826, + "grad_norm": 1.9960590600967407, + "learning_rate": 4.970430536408247e-05, + "loss": 5.2939, + "step": 8241 + }, + { + "epoch": 0.049017508801979255, + "grad_norm": 2.7218708992004395, + "learning_rate": 4.9704233731044675e-05, + "loss": 5.9019, + "step": 8242 + }, + { + "epoch": 0.04902345608526025, + "grad_norm": 2.385664224624634, + "learning_rate": 4.970416208938289e-05, + "loss": 5.9146, + "step": 8243 + }, + { + "epoch": 0.04902940336854125, + "grad_norm": 2.2598092555999756, + "learning_rate": 4.970409043909714e-05, + "loss": 5.7451, + "step": 8244 + }, + { + "epoch": 0.04903535065182225, + "grad_norm": 2.3063299655914307, + "learning_rate": 4.970401878018745e-05, + "loss": 5.8675, + "step": 8245 + }, + { + "epoch": 0.04904129793510324, + "grad_norm": 2.1543853282928467, + "learning_rate": 4.9703947112653836e-05, + "loss": 5.9136, + "step": 8246 + }, + { + "epoch": 0.049047245218384244, + "grad_norm": 2.267531633377075, + "learning_rate": 4.970387543649634e-05, + "loss": 5.6834, + "step": 8247 + }, + { + "epoch": 0.04905319250166524, + "grad_norm": 2.047351121902466, + "learning_rate": 4.970380375171496e-05, + "loss": 5.5754, + "step": 8248 + }, + { + "epoch": 0.049059139784946235, + "grad_norm": 2.2565114498138428, + "learning_rate": 4.9703732058309745e-05, + "loss": 5.7067, + "step": 8249 + }, + { + "epoch": 0.04906508706822724, + "grad_norm": 1.7584022283554077, + "learning_rate": 4.970366035628073e-05, + "loss": 5.3926, + "step": 8250 + }, + { + "epoch": 0.04907103435150823, + "grad_norm": 1.9898183345794678, + "learning_rate": 4.9703588645627896e-05, + "loss": 5.7163, + "step": 8251 + }, + { + "epoch": 0.04907698163478923, + "grad_norm": 2.4134786128997803, + "learning_rate": 4.970351692635131e-05, + "loss": 5.672, + "step": 8252 + }, + { + "epoch": 0.04908292891807023, + "grad_norm": 2.1059436798095703, + "learning_rate": 4.970344519845097e-05, + "loss": 5.7719, + "step": 8253 + }, + { + "epoch": 0.049088876201351224, + "grad_norm": 2.0731539726257324, + "learning_rate": 4.970337346192692e-05, + "loss": 5.7104, + "step": 8254 + }, + { + "epoch": 0.04909482348463222, + "grad_norm": 2.3058536052703857, + "learning_rate": 4.970330171677918e-05, + "loss": 5.7435, + "step": 8255 + }, + { + "epoch": 0.049100770767913214, + "grad_norm": 2.051424980163574, + "learning_rate": 4.970322996300777e-05, + "loss": 5.7371, + "step": 8256 + }, + { + "epoch": 0.049106718051194216, + "grad_norm": 2.1715517044067383, + "learning_rate": 4.970315820061271e-05, + "loss": 5.5805, + "step": 8257 + }, + { + "epoch": 0.04911266533447521, + "grad_norm": 2.136617422103882, + "learning_rate": 4.9703086429594034e-05, + "loss": 5.8689, + "step": 8258 + }, + { + "epoch": 0.049118612617756206, + "grad_norm": 1.7089059352874756, + "learning_rate": 4.970301464995178e-05, + "loss": 6.0614, + "step": 8259 + }, + { + "epoch": 0.04912455990103721, + "grad_norm": 2.410067319869995, + "learning_rate": 4.970294286168595e-05, + "loss": 5.8762, + "step": 8260 + }, + { + "epoch": 0.0491305071843182, + "grad_norm": 2.2186291217803955, + "learning_rate": 4.970287106479657e-05, + "loss": 5.4903, + "step": 8261 + }, + { + "epoch": 0.0491364544675992, + "grad_norm": 2.312793016433716, + "learning_rate": 4.970279925928368e-05, + "loss": 6.2488, + "step": 8262 + }, + { + "epoch": 0.0491424017508802, + "grad_norm": 2.127859354019165, + "learning_rate": 4.9702727445147305e-05, + "loss": 5.9976, + "step": 8263 + }, + { + "epoch": 0.049148349034161196, + "grad_norm": 2.604367733001709, + "learning_rate": 4.9702655622387454e-05, + "loss": 5.4153, + "step": 8264 + }, + { + "epoch": 0.04915429631744219, + "grad_norm": 1.7832142114639282, + "learning_rate": 4.9702583791004165e-05, + "loss": 5.4024, + "step": 8265 + }, + { + "epoch": 0.04916024360072319, + "grad_norm": 2.04298734664917, + "learning_rate": 4.970251195099746e-05, + "loss": 5.7034, + "step": 8266 + }, + { + "epoch": 0.04916619088400419, + "grad_norm": 2.1806769371032715, + "learning_rate": 4.970244010236736e-05, + "loss": 6.1212, + "step": 8267 + }, + { + "epoch": 0.04917213816728518, + "grad_norm": 1.8740427494049072, + "learning_rate": 4.970236824511389e-05, + "loss": 5.7562, + "step": 8268 + }, + { + "epoch": 0.04917808545056618, + "grad_norm": 1.7718658447265625, + "learning_rate": 4.970229637923709e-05, + "loss": 5.5126, + "step": 8269 + }, + { + "epoch": 0.04918403273384718, + "grad_norm": 1.4966565370559692, + "learning_rate": 4.970222450473696e-05, + "loss": 5.5422, + "step": 8270 + }, + { + "epoch": 0.049189980017128175, + "grad_norm": 1.8283390998840332, + "learning_rate": 4.970215262161355e-05, + "loss": 5.9333, + "step": 8271 + }, + { + "epoch": 0.04919592730040917, + "grad_norm": 2.087460517883301, + "learning_rate": 4.970208072986687e-05, + "loss": 5.5413, + "step": 8272 + }, + { + "epoch": 0.04920187458369017, + "grad_norm": 2.2952873706817627, + "learning_rate": 4.970200882949694e-05, + "loss": 5.7848, + "step": 8273 + }, + { + "epoch": 0.04920782186697117, + "grad_norm": 1.9511842727661133, + "learning_rate": 4.9701936920503804e-05, + "loss": 5.6172, + "step": 8274 + }, + { + "epoch": 0.04921376915025216, + "grad_norm": 1.992211937904358, + "learning_rate": 4.970186500288748e-05, + "loss": 5.48, + "step": 8275 + }, + { + "epoch": 0.049219716433533164, + "grad_norm": 1.739013910293579, + "learning_rate": 4.9701793076647984e-05, + "loss": 5.6351, + "step": 8276 + }, + { + "epoch": 0.04922566371681416, + "grad_norm": 2.150797128677368, + "learning_rate": 4.970172114178534e-05, + "loss": 5.5957, + "step": 8277 + }, + { + "epoch": 0.049231611000095155, + "grad_norm": 2.074070930480957, + "learning_rate": 4.9701649198299594e-05, + "loss": 5.4751, + "step": 8278 + }, + { + "epoch": 0.04923755828337616, + "grad_norm": 2.2276322841644287, + "learning_rate": 4.970157724619075e-05, + "loss": 5.4434, + "step": 8279 + }, + { + "epoch": 0.04924350556665715, + "grad_norm": 1.9707896709442139, + "learning_rate": 4.970150528545884e-05, + "loss": 5.6935, + "step": 8280 + }, + { + "epoch": 0.04924945284993815, + "grad_norm": 2.07774019241333, + "learning_rate": 4.9701433316103895e-05, + "loss": 6.0455, + "step": 8281 + }, + { + "epoch": 0.04925540013321915, + "grad_norm": 2.3262722492218018, + "learning_rate": 4.970136133812593e-05, + "loss": 5.6039, + "step": 8282 + }, + { + "epoch": 0.049261347416500144, + "grad_norm": 2.4353108406066895, + "learning_rate": 4.970128935152498e-05, + "loss": 5.3823, + "step": 8283 + }, + { + "epoch": 0.04926729469978114, + "grad_norm": 2.7383084297180176, + "learning_rate": 4.970121735630106e-05, + "loss": 5.4039, + "step": 8284 + }, + { + "epoch": 0.049273241983062134, + "grad_norm": 2.9022698402404785, + "learning_rate": 4.9701145352454205e-05, + "loss": 5.3571, + "step": 8285 + }, + { + "epoch": 0.049279189266343136, + "grad_norm": 2.314373731613159, + "learning_rate": 4.970107333998443e-05, + "loss": 5.4877, + "step": 8286 + }, + { + "epoch": 0.04928513654962413, + "grad_norm": 1.9494023323059082, + "learning_rate": 4.970100131889177e-05, + "loss": 5.5171, + "step": 8287 + }, + { + "epoch": 0.049291083832905126, + "grad_norm": 2.7892074584960938, + "learning_rate": 4.9700929289176245e-05, + "loss": 5.5347, + "step": 8288 + }, + { + "epoch": 0.04929703111618613, + "grad_norm": 2.305204391479492, + "learning_rate": 4.970085725083788e-05, + "loss": 5.8689, + "step": 8289 + }, + { + "epoch": 0.04930297839946712, + "grad_norm": 2.4212634563446045, + "learning_rate": 4.97007852038767e-05, + "loss": 5.8982, + "step": 8290 + }, + { + "epoch": 0.04930892568274812, + "grad_norm": 3.584625482559204, + "learning_rate": 4.9700713148292734e-05, + "loss": 5.2341, + "step": 8291 + }, + { + "epoch": 0.04931487296602912, + "grad_norm": 2.874703884124756, + "learning_rate": 4.9700641084086e-05, + "loss": 5.2312, + "step": 8292 + }, + { + "epoch": 0.049320820249310116, + "grad_norm": 2.113234519958496, + "learning_rate": 4.9700569011256524e-05, + "loss": 5.5779, + "step": 8293 + }, + { + "epoch": 0.04932676753259111, + "grad_norm": 3.027318000793457, + "learning_rate": 4.970049692980434e-05, + "loss": 5.3899, + "step": 8294 + }, + { + "epoch": 0.04933271481587211, + "grad_norm": 2.779520273208618, + "learning_rate": 4.970042483972947e-05, + "loss": 5.4023, + "step": 8295 + }, + { + "epoch": 0.04933866209915311, + "grad_norm": 2.4358251094818115, + "learning_rate": 4.970035274103193e-05, + "loss": 5.4932, + "step": 8296 + }, + { + "epoch": 0.0493446093824341, + "grad_norm": 1.926193118095398, + "learning_rate": 4.970028063371176e-05, + "loss": 5.4058, + "step": 8297 + }, + { + "epoch": 0.0493505566657151, + "grad_norm": 1.7216569185256958, + "learning_rate": 4.970020851776898e-05, + "loss": 5.3265, + "step": 8298 + }, + { + "epoch": 0.0493565039489961, + "grad_norm": 1.9850976467132568, + "learning_rate": 4.97001363932036e-05, + "loss": 5.1626, + "step": 8299 + }, + { + "epoch": 0.049362451232277095, + "grad_norm": 2.1380982398986816, + "learning_rate": 4.9700064260015666e-05, + "loss": 5.3285, + "step": 8300 + }, + { + "epoch": 0.04936839851555809, + "grad_norm": 2.118781566619873, + "learning_rate": 4.969999211820518e-05, + "loss": 5.3544, + "step": 8301 + }, + { + "epoch": 0.04937434579883909, + "grad_norm": 2.0255584716796875, + "learning_rate": 4.96999199677722e-05, + "loss": 5.4256, + "step": 8302 + }, + { + "epoch": 0.04938029308212009, + "grad_norm": 2.0269806385040283, + "learning_rate": 4.9699847808716724e-05, + "loss": 5.9744, + "step": 8303 + }, + { + "epoch": 0.04938624036540108, + "grad_norm": 2.60446834564209, + "learning_rate": 4.969977564103879e-05, + "loss": 5.3926, + "step": 8304 + }, + { + "epoch": 0.049392187648682084, + "grad_norm": 2.1011881828308105, + "learning_rate": 4.9699703464738426e-05, + "loss": 5.4278, + "step": 8305 + }, + { + "epoch": 0.04939813493196308, + "grad_norm": 1.9267319440841675, + "learning_rate": 4.969963127981564e-05, + "loss": 5.6232, + "step": 8306 + }, + { + "epoch": 0.049404082215244075, + "grad_norm": 2.1958322525024414, + "learning_rate": 4.969955908627048e-05, + "loss": 5.8577, + "step": 8307 + }, + { + "epoch": 0.049410029498525077, + "grad_norm": 2.392241954803467, + "learning_rate": 4.969948688410294e-05, + "loss": 5.8013, + "step": 8308 + }, + { + "epoch": 0.04941597678180607, + "grad_norm": 2.8284695148468018, + "learning_rate": 4.969941467331308e-05, + "loss": 6.1246, + "step": 8309 + }, + { + "epoch": 0.04942192406508707, + "grad_norm": 2.8590078353881836, + "learning_rate": 4.96993424539009e-05, + "loss": 6.1068, + "step": 8310 + }, + { + "epoch": 0.04942787134836807, + "grad_norm": 1.876207709312439, + "learning_rate": 4.969927022586644e-05, + "loss": 5.5493, + "step": 8311 + }, + { + "epoch": 0.049433818631649064, + "grad_norm": 1.988061547279358, + "learning_rate": 4.969919798920972e-05, + "loss": 5.7059, + "step": 8312 + }, + { + "epoch": 0.04943976591493006, + "grad_norm": 2.8230605125427246, + "learning_rate": 4.969912574393077e-05, + "loss": 5.9381, + "step": 8313 + }, + { + "epoch": 0.049445713198211054, + "grad_norm": 2.4622697830200195, + "learning_rate": 4.96990534900296e-05, + "loss": 6.0935, + "step": 8314 + }, + { + "epoch": 0.049451660481492056, + "grad_norm": 2.0811798572540283, + "learning_rate": 4.9698981227506254e-05, + "loss": 6.3475, + "step": 8315 + }, + { + "epoch": 0.04945760776477305, + "grad_norm": 2.099489212036133, + "learning_rate": 4.9698908956360745e-05, + "loss": 5.7266, + "step": 8316 + }, + { + "epoch": 0.049463555048054046, + "grad_norm": 2.1711854934692383, + "learning_rate": 4.9698836676593104e-05, + "loss": 5.6067, + "step": 8317 + }, + { + "epoch": 0.04946950233133505, + "grad_norm": 2.195296287536621, + "learning_rate": 4.969876438820335e-05, + "loss": 5.3896, + "step": 8318 + }, + { + "epoch": 0.04947544961461604, + "grad_norm": 2.114830255508423, + "learning_rate": 4.969869209119151e-05, + "loss": 5.6922, + "step": 8319 + }, + { + "epoch": 0.04948139689789704, + "grad_norm": 2.1534018516540527, + "learning_rate": 4.969861978555762e-05, + "loss": 6.1372, + "step": 8320 + }, + { + "epoch": 0.04948734418117804, + "grad_norm": 2.151495933532715, + "learning_rate": 4.9698547471301696e-05, + "loss": 6.0915, + "step": 8321 + }, + { + "epoch": 0.049493291464459035, + "grad_norm": 1.8232096433639526, + "learning_rate": 4.9698475148423764e-05, + "loss": 6.1492, + "step": 8322 + }, + { + "epoch": 0.04949923874774003, + "grad_norm": 2.1538467407226562, + "learning_rate": 4.9698402816923844e-05, + "loss": 5.6253, + "step": 8323 + }, + { + "epoch": 0.04950518603102103, + "grad_norm": 2.278797149658203, + "learning_rate": 4.969833047680197e-05, + "loss": 6.0055, + "step": 8324 + }, + { + "epoch": 0.04951113331430203, + "grad_norm": 2.479342460632324, + "learning_rate": 4.9698258128058164e-05, + "loss": 5.7909, + "step": 8325 + }, + { + "epoch": 0.04951708059758302, + "grad_norm": 2.2959346771240234, + "learning_rate": 4.969818577069245e-05, + "loss": 5.6888, + "step": 8326 + }, + { + "epoch": 0.04952302788086402, + "grad_norm": 1.841544270515442, + "learning_rate": 4.969811340470486e-05, + "loss": 5.5091, + "step": 8327 + }, + { + "epoch": 0.04952897516414502, + "grad_norm": 2.4512903690338135, + "learning_rate": 4.969804103009541e-05, + "loss": 5.7271, + "step": 8328 + }, + { + "epoch": 0.049534922447426015, + "grad_norm": 2.035473585128784, + "learning_rate": 4.969796864686413e-05, + "loss": 5.3056, + "step": 8329 + }, + { + "epoch": 0.04954086973070701, + "grad_norm": 2.030576705932617, + "learning_rate": 4.9697896255011046e-05, + "loss": 5.2765, + "step": 8330 + }, + { + "epoch": 0.04954681701398801, + "grad_norm": 1.680253505706787, + "learning_rate": 4.9697823854536175e-05, + "loss": 5.1968, + "step": 8331 + }, + { + "epoch": 0.04955276429726901, + "grad_norm": 1.962259292602539, + "learning_rate": 4.969775144543955e-05, + "loss": 5.0743, + "step": 8332 + }, + { + "epoch": 0.04955871158055, + "grad_norm": 2.499044895172119, + "learning_rate": 4.96976790277212e-05, + "loss": 5.5204, + "step": 8333 + }, + { + "epoch": 0.049564658863831004, + "grad_norm": 2.004849672317505, + "learning_rate": 4.969760660138114e-05, + "loss": 5.5714, + "step": 8334 + }, + { + "epoch": 0.049570606147112, + "grad_norm": 2.255171775817871, + "learning_rate": 4.9697534166419405e-05, + "loss": 5.0766, + "step": 8335 + }, + { + "epoch": 0.049576553430392994, + "grad_norm": 2.1219112873077393, + "learning_rate": 4.969746172283601e-05, + "loss": 5.0613, + "step": 8336 + }, + { + "epoch": 0.049582500713673996, + "grad_norm": 1.9718400239944458, + "learning_rate": 4.9697389270631004e-05, + "loss": 5.0007, + "step": 8337 + }, + { + "epoch": 0.04958844799695499, + "grad_norm": 1.87917160987854, + "learning_rate": 4.969731680980437e-05, + "loss": 4.9533, + "step": 8338 + }, + { + "epoch": 0.04959439528023599, + "grad_norm": 1.9610000848770142, + "learning_rate": 4.969724434035618e-05, + "loss": 4.9761, + "step": 8339 + }, + { + "epoch": 0.04960034256351699, + "grad_norm": 1.859434723854065, + "learning_rate": 4.969717186228642e-05, + "loss": 5.2373, + "step": 8340 + }, + { + "epoch": 0.049606289846797984, + "grad_norm": 1.9905357360839844, + "learning_rate": 4.9697099375595144e-05, + "loss": 4.8858, + "step": 8341 + }, + { + "epoch": 0.04961223713007898, + "grad_norm": 1.995355486869812, + "learning_rate": 4.969702688028236e-05, + "loss": 4.9468, + "step": 8342 + }, + { + "epoch": 0.049618184413359974, + "grad_norm": 1.9970706701278687, + "learning_rate": 4.96969543763481e-05, + "loss": 4.8891, + "step": 8343 + }, + { + "epoch": 0.049624131696640976, + "grad_norm": 1.9036997556686401, + "learning_rate": 4.9696881863792385e-05, + "loss": 4.7622, + "step": 8344 + }, + { + "epoch": 0.04963007897992197, + "grad_norm": 1.9532603025436401, + "learning_rate": 4.9696809342615245e-05, + "loss": 4.7832, + "step": 8345 + }, + { + "epoch": 0.049636026263202966, + "grad_norm": 1.9032143354415894, + "learning_rate": 4.969673681281671e-05, + "loss": 4.7569, + "step": 8346 + }, + { + "epoch": 0.04964197354648397, + "grad_norm": 3.4294323921203613, + "learning_rate": 4.96966642743968e-05, + "loss": 5.9381, + "step": 8347 + }, + { + "epoch": 0.04964792082976496, + "grad_norm": 4.137698173522949, + "learning_rate": 4.969659172735554e-05, + "loss": 6.4081, + "step": 8348 + }, + { + "epoch": 0.04965386811304596, + "grad_norm": 2.774838447570801, + "learning_rate": 4.969651917169295e-05, + "loss": 5.9888, + "step": 8349 + }, + { + "epoch": 0.04965981539632696, + "grad_norm": 2.4056432247161865, + "learning_rate": 4.9696446607409054e-05, + "loss": 6.1239, + "step": 8350 + }, + { + "epoch": 0.049665762679607955, + "grad_norm": 2.098475456237793, + "learning_rate": 4.969637403450389e-05, + "loss": 6.4226, + "step": 8351 + }, + { + "epoch": 0.04967170996288895, + "grad_norm": 2.1402597427368164, + "learning_rate": 4.9696301452977475e-05, + "loss": 5.8836, + "step": 8352 + }, + { + "epoch": 0.04967765724616995, + "grad_norm": 2.8023130893707275, + "learning_rate": 4.9696228862829844e-05, + "loss": 6.2452, + "step": 8353 + }, + { + "epoch": 0.04968360452945095, + "grad_norm": 2.7669503688812256, + "learning_rate": 4.9696156264061e-05, + "loss": 6.0093, + "step": 8354 + }, + { + "epoch": 0.04968955181273194, + "grad_norm": 2.2357375621795654, + "learning_rate": 4.9696083656671e-05, + "loss": 6.0614, + "step": 8355 + }, + { + "epoch": 0.049695499096012945, + "grad_norm": 2.1435539722442627, + "learning_rate": 4.969601104065984e-05, + "loss": 6.0718, + "step": 8356 + }, + { + "epoch": 0.04970144637929394, + "grad_norm": 2.6372897624969482, + "learning_rate": 4.969593841602757e-05, + "loss": 5.4878, + "step": 8357 + }, + { + "epoch": 0.049707393662574935, + "grad_norm": 1.9730110168457031, + "learning_rate": 4.9695865782774186e-05, + "loss": 5.8913, + "step": 8358 + }, + { + "epoch": 0.04971334094585593, + "grad_norm": 2.262437105178833, + "learning_rate": 4.9695793140899737e-05, + "loss": 5.0382, + "step": 8359 + }, + { + "epoch": 0.04971928822913693, + "grad_norm": 1.794268250465393, + "learning_rate": 4.9695720490404254e-05, + "loss": 5.784, + "step": 8360 + }, + { + "epoch": 0.04972523551241793, + "grad_norm": 1.9568414688110352, + "learning_rate": 4.969564783128773e-05, + "loss": 5.8939, + "step": 8361 + }, + { + "epoch": 0.04973118279569892, + "grad_norm": 2.0560479164123535, + "learning_rate": 4.969557516355022e-05, + "loss": 5.8806, + "step": 8362 + }, + { + "epoch": 0.049737130078979924, + "grad_norm": 1.9009175300598145, + "learning_rate": 4.9695502487191746e-05, + "loss": 5.5568, + "step": 8363 + }, + { + "epoch": 0.04974307736226092, + "grad_norm": 2.1240882873535156, + "learning_rate": 4.9695429802212325e-05, + "loss": 5.4514, + "step": 8364 + }, + { + "epoch": 0.049749024645541914, + "grad_norm": 2.0803675651550293, + "learning_rate": 4.969535710861198e-05, + "loss": 5.7679, + "step": 8365 + }, + { + "epoch": 0.049754971928822916, + "grad_norm": 1.9357428550720215, + "learning_rate": 4.969528440639074e-05, + "loss": 6.1658, + "step": 8366 + }, + { + "epoch": 0.04976091921210391, + "grad_norm": 1.89462411403656, + "learning_rate": 4.9695211695548635e-05, + "loss": 6.0559, + "step": 8367 + }, + { + "epoch": 0.04976686649538491, + "grad_norm": 1.5986123085021973, + "learning_rate": 4.969513897608569e-05, + "loss": 5.7787, + "step": 8368 + }, + { + "epoch": 0.04977281377866591, + "grad_norm": 2.0391738414764404, + "learning_rate": 4.969506624800192e-05, + "loss": 5.5559, + "step": 8369 + }, + { + "epoch": 0.049778761061946904, + "grad_norm": 2.1463794708251953, + "learning_rate": 4.969499351129736e-05, + "loss": 5.5734, + "step": 8370 + }, + { + "epoch": 0.0497847083452279, + "grad_norm": 2.1488826274871826, + "learning_rate": 4.969492076597203e-05, + "loss": 5.7502, + "step": 8371 + }, + { + "epoch": 0.049790655628508894, + "grad_norm": 2.214439868927002, + "learning_rate": 4.9694848012025966e-05, + "loss": 5.8829, + "step": 8372 + }, + { + "epoch": 0.049796602911789896, + "grad_norm": 2.366196632385254, + "learning_rate": 4.969477524945918e-05, + "loss": 5.3428, + "step": 8373 + }, + { + "epoch": 0.04980255019507089, + "grad_norm": 2.239044189453125, + "learning_rate": 4.96947024782717e-05, + "loss": 5.7258, + "step": 8374 + }, + { + "epoch": 0.049808497478351886, + "grad_norm": 2.315492868423462, + "learning_rate": 4.9694629698463554e-05, + "loss": 5.6542, + "step": 8375 + }, + { + "epoch": 0.04981444476163289, + "grad_norm": 2.340740919113159, + "learning_rate": 4.969455691003478e-05, + "loss": 5.0699, + "step": 8376 + }, + { + "epoch": 0.04982039204491388, + "grad_norm": 2.644800901412964, + "learning_rate": 4.9694484112985386e-05, + "loss": 5.3808, + "step": 8377 + }, + { + "epoch": 0.04982633932819488, + "grad_norm": 2.7073781490325928, + "learning_rate": 4.96944113073154e-05, + "loss": 5.5233, + "step": 8378 + }, + { + "epoch": 0.04983228661147588, + "grad_norm": 2.5480713844299316, + "learning_rate": 4.969433849302485e-05, + "loss": 5.3908, + "step": 8379 + }, + { + "epoch": 0.049838233894756875, + "grad_norm": 2.494356155395508, + "learning_rate": 4.969426567011376e-05, + "loss": 5.3528, + "step": 8380 + }, + { + "epoch": 0.04984418117803787, + "grad_norm": 2.4249942302703857, + "learning_rate": 4.9694192838582155e-05, + "loss": 5.2995, + "step": 8381 + }, + { + "epoch": 0.04985012846131887, + "grad_norm": 2.5930840969085693, + "learning_rate": 4.9694119998430066e-05, + "loss": 6.0202, + "step": 8382 + }, + { + "epoch": 0.04985607574459987, + "grad_norm": 2.391972541809082, + "learning_rate": 4.969404714965752e-05, + "loss": 6.0247, + "step": 8383 + }, + { + "epoch": 0.04986202302788086, + "grad_norm": 2.2849159240722656, + "learning_rate": 4.9693974292264535e-05, + "loss": 5.892, + "step": 8384 + }, + { + "epoch": 0.049867970311161865, + "grad_norm": 2.1887097358703613, + "learning_rate": 4.9693901426251134e-05, + "loss": 6.0196, + "step": 8385 + }, + { + "epoch": 0.04987391759444286, + "grad_norm": 2.3988685607910156, + "learning_rate": 4.969382855161735e-05, + "loss": 5.5596, + "step": 8386 + }, + { + "epoch": 0.049879864877723855, + "grad_norm": 2.675144910812378, + "learning_rate": 4.9693755668363204e-05, + "loss": 5.3495, + "step": 8387 + }, + { + "epoch": 0.04988581216100485, + "grad_norm": 2.3753585815429688, + "learning_rate": 4.969368277648873e-05, + "loss": 5.8823, + "step": 8388 + }, + { + "epoch": 0.04989175944428585, + "grad_norm": 2.3168766498565674, + "learning_rate": 4.969360987599394e-05, + "loss": 5.9768, + "step": 8389 + }, + { + "epoch": 0.04989770672756685, + "grad_norm": 2.427138566970825, + "learning_rate": 4.969353696687886e-05, + "loss": 6.1823, + "step": 8390 + }, + { + "epoch": 0.04990365401084784, + "grad_norm": 2.304731845855713, + "learning_rate": 4.9693464049143526e-05, + "loss": 5.8697, + "step": 8391 + }, + { + "epoch": 0.049909601294128844, + "grad_norm": 2.2139687538146973, + "learning_rate": 4.9693391122787966e-05, + "loss": 6.0274, + "step": 8392 + }, + { + "epoch": 0.04991554857740984, + "grad_norm": 2.1165316104888916, + "learning_rate": 4.9693318187812185e-05, + "loss": 5.2499, + "step": 8393 + }, + { + "epoch": 0.049921495860690834, + "grad_norm": 2.5213639736175537, + "learning_rate": 4.969324524421624e-05, + "loss": 4.9105, + "step": 8394 + }, + { + "epoch": 0.049927443143971836, + "grad_norm": 2.2188315391540527, + "learning_rate": 4.9693172292000125e-05, + "loss": 4.8652, + "step": 8395 + }, + { + "epoch": 0.04993339042725283, + "grad_norm": 2.393179416656494, + "learning_rate": 4.9693099331163886e-05, + "loss": 4.924, + "step": 8396 + }, + { + "epoch": 0.04993933771053383, + "grad_norm": 2.150264024734497, + "learning_rate": 4.969302636170753e-05, + "loss": 4.9168, + "step": 8397 + }, + { + "epoch": 0.04994528499381483, + "grad_norm": 2.252499580383301, + "learning_rate": 4.96929533836311e-05, + "loss": 4.7822, + "step": 8398 + }, + { + "epoch": 0.049951232277095824, + "grad_norm": 2.342132806777954, + "learning_rate": 4.969288039693461e-05, + "loss": 5.3691, + "step": 8399 + }, + { + "epoch": 0.04995717956037682, + "grad_norm": 2.3533523082733154, + "learning_rate": 4.96928074016181e-05, + "loss": 5.9989, + "step": 8400 + }, + { + "epoch": 0.049963126843657814, + "grad_norm": 2.185727834701538, + "learning_rate": 4.969273439768158e-05, + "loss": 5.6101, + "step": 8401 + }, + { + "epoch": 0.049969074126938816, + "grad_norm": 2.3396189212799072, + "learning_rate": 4.969266138512509e-05, + "loss": 5.845, + "step": 8402 + }, + { + "epoch": 0.04997502141021981, + "grad_norm": 2.2145371437072754, + "learning_rate": 4.969258836394864e-05, + "loss": 5.6657, + "step": 8403 + }, + { + "epoch": 0.049980968693500806, + "grad_norm": 2.2084364891052246, + "learning_rate": 4.969251533415226e-05, + "loss": 5.8823, + "step": 8404 + }, + { + "epoch": 0.04998691597678181, + "grad_norm": 1.7423903942108154, + "learning_rate": 4.9692442295735984e-05, + "loss": 5.8209, + "step": 8405 + }, + { + "epoch": 0.0499928632600628, + "grad_norm": 2.3057217597961426, + "learning_rate": 4.9692369248699824e-05, + "loss": 5.8352, + "step": 8406 + }, + { + "epoch": 0.0499988105433438, + "grad_norm": 2.1800148487091064, + "learning_rate": 4.969229619304382e-05, + "loss": 5.783, + "step": 8407 + }, + { + "epoch": 0.0500047578266248, + "grad_norm": 1.8594306707382202, + "learning_rate": 4.969222312876799e-05, + "loss": 6.01, + "step": 8408 + }, + { + "epoch": 0.050010705109905795, + "grad_norm": 2.119917392730713, + "learning_rate": 4.9692150055872355e-05, + "loss": 5.7282, + "step": 8409 + }, + { + "epoch": 0.05001665239318679, + "grad_norm": 2.5282747745513916, + "learning_rate": 4.969207697435695e-05, + "loss": 5.0853, + "step": 8410 + }, + { + "epoch": 0.05002259967646779, + "grad_norm": 2.5683388710021973, + "learning_rate": 4.969200388422179e-05, + "loss": 4.9841, + "step": 8411 + }, + { + "epoch": 0.05002854695974879, + "grad_norm": 2.649918794631958, + "learning_rate": 4.969193078546692e-05, + "loss": 5.6365, + "step": 8412 + }, + { + "epoch": 0.05003449424302978, + "grad_norm": 2.3040120601654053, + "learning_rate": 4.969185767809234e-05, + "loss": 5.8272, + "step": 8413 + }, + { + "epoch": 0.050040441526310785, + "grad_norm": 2.033600330352783, + "learning_rate": 4.9691784562098084e-05, + "loss": 5.9779, + "step": 8414 + }, + { + "epoch": 0.05004638880959178, + "grad_norm": 2.1903419494628906, + "learning_rate": 4.96917114374842e-05, + "loss": 5.8651, + "step": 8415 + }, + { + "epoch": 0.050052336092872775, + "grad_norm": 2.4431047439575195, + "learning_rate": 4.969163830425068e-05, + "loss": 4.7787, + "step": 8416 + }, + { + "epoch": 0.05005828337615377, + "grad_norm": 2.6652824878692627, + "learning_rate": 4.969156516239756e-05, + "loss": 4.7133, + "step": 8417 + }, + { + "epoch": 0.05006423065943477, + "grad_norm": 2.4090182781219482, + "learning_rate": 4.969149201192488e-05, + "loss": 4.4506, + "step": 8418 + }, + { + "epoch": 0.05007017794271577, + "grad_norm": 2.5310218334198, + "learning_rate": 4.969141885283265e-05, + "loss": 4.5286, + "step": 8419 + }, + { + "epoch": 0.05007612522599676, + "grad_norm": 2.5333101749420166, + "learning_rate": 4.9691345685120905e-05, + "loss": 4.6012, + "step": 8420 + }, + { + "epoch": 0.050082072509277764, + "grad_norm": 2.172724485397339, + "learning_rate": 4.9691272508789665e-05, + "loss": 4.9161, + "step": 8421 + }, + { + "epoch": 0.05008801979255876, + "grad_norm": 2.034684181213379, + "learning_rate": 4.969119932383896e-05, + "loss": 5.3105, + "step": 8422 + }, + { + "epoch": 0.050093967075839754, + "grad_norm": 1.9046155214309692, + "learning_rate": 4.969112613026881e-05, + "loss": 5.4308, + "step": 8423 + }, + { + "epoch": 0.050099914359120756, + "grad_norm": 1.7256773710250854, + "learning_rate": 4.9691052928079226e-05, + "loss": 5.2232, + "step": 8424 + }, + { + "epoch": 0.05010586164240175, + "grad_norm": 2.0075321197509766, + "learning_rate": 4.969097971727027e-05, + "loss": 6.1764, + "step": 8425 + }, + { + "epoch": 0.050111808925682746, + "grad_norm": 2.1523852348327637, + "learning_rate": 4.9690906497841946e-05, + "loss": 5.8419, + "step": 8426 + }, + { + "epoch": 0.05011775620896375, + "grad_norm": 1.9675406217575073, + "learning_rate": 4.969083326979428e-05, + "loss": 5.7919, + "step": 8427 + }, + { + "epoch": 0.050123703492244744, + "grad_norm": 2.0327789783477783, + "learning_rate": 4.9690760033127295e-05, + "loss": 5.0232, + "step": 8428 + }, + { + "epoch": 0.05012965077552574, + "grad_norm": 1.677471399307251, + "learning_rate": 4.969068678784102e-05, + "loss": 5.1106, + "step": 8429 + }, + { + "epoch": 0.050135598058806734, + "grad_norm": 1.727847933769226, + "learning_rate": 4.9690613533935496e-05, + "loss": 5.1589, + "step": 8430 + }, + { + "epoch": 0.050141545342087736, + "grad_norm": 1.8167927265167236, + "learning_rate": 4.9690540271410726e-05, + "loss": 5.1207, + "step": 8431 + }, + { + "epoch": 0.05014749262536873, + "grad_norm": 2.277425527572632, + "learning_rate": 4.969046700026674e-05, + "loss": 5.6614, + "step": 8432 + }, + { + "epoch": 0.050153439908649726, + "grad_norm": 1.6471065282821655, + "learning_rate": 4.969039372050356e-05, + "loss": 5.2065, + "step": 8433 + }, + { + "epoch": 0.05015938719193073, + "grad_norm": 1.9049899578094482, + "learning_rate": 4.9690320432121226e-05, + "loss": 5.7453, + "step": 8434 + }, + { + "epoch": 0.05016533447521172, + "grad_norm": 1.9145495891571045, + "learning_rate": 4.969024713511976e-05, + "loss": 6.2207, + "step": 8435 + }, + { + "epoch": 0.05017128175849272, + "grad_norm": 1.6634061336517334, + "learning_rate": 4.969017382949918e-05, + "loss": 6.1694, + "step": 8436 + }, + { + "epoch": 0.05017722904177372, + "grad_norm": 1.9804925918579102, + "learning_rate": 4.969010051525952e-05, + "loss": 6.2917, + "step": 8437 + }, + { + "epoch": 0.050183176325054715, + "grad_norm": 1.9674698114395142, + "learning_rate": 4.969002719240079e-05, + "loss": 6.3105, + "step": 8438 + }, + { + "epoch": 0.05018912360833571, + "grad_norm": 2.1540520191192627, + "learning_rate": 4.968995386092303e-05, + "loss": 5.964, + "step": 8439 + }, + { + "epoch": 0.05019507089161671, + "grad_norm": 1.8545453548431396, + "learning_rate": 4.9689880520826274e-05, + "loss": 5.8744, + "step": 8440 + }, + { + "epoch": 0.05020101817489771, + "grad_norm": 1.8022514581680298, + "learning_rate": 4.968980717211053e-05, + "loss": 6.1547, + "step": 8441 + }, + { + "epoch": 0.0502069654581787, + "grad_norm": 1.6297475099563599, + "learning_rate": 4.968973381477582e-05, + "loss": 6.1397, + "step": 8442 + }, + { + "epoch": 0.050212912741459705, + "grad_norm": 1.6256400346755981, + "learning_rate": 4.968966044882219e-05, + "loss": 6.0529, + "step": 8443 + }, + { + "epoch": 0.0502188600247407, + "grad_norm": 1.5988365411758423, + "learning_rate": 4.968958707424965e-05, + "loss": 6.0653, + "step": 8444 + }, + { + "epoch": 0.050224807308021695, + "grad_norm": 1.7062568664550781, + "learning_rate": 4.968951369105823e-05, + "loss": 5.6761, + "step": 8445 + }, + { + "epoch": 0.05023075459130269, + "grad_norm": 2.6108970642089844, + "learning_rate": 4.968944029924796e-05, + "loss": 5.7222, + "step": 8446 + }, + { + "epoch": 0.05023670187458369, + "grad_norm": 2.2341887950897217, + "learning_rate": 4.9689366898818854e-05, + "loss": 6.057, + "step": 8447 + }, + { + "epoch": 0.05024264915786469, + "grad_norm": 2.1819159984588623, + "learning_rate": 4.968929348977095e-05, + "loss": 6.0386, + "step": 8448 + }, + { + "epoch": 0.05024859644114568, + "grad_norm": 1.9941349029541016, + "learning_rate": 4.968922007210427e-05, + "loss": 6.132, + "step": 8449 + }, + { + "epoch": 0.050254543724426684, + "grad_norm": 1.7330418825149536, + "learning_rate": 4.968914664581883e-05, + "loss": 6.0834, + "step": 8450 + }, + { + "epoch": 0.05026049100770768, + "grad_norm": 1.8946608304977417, + "learning_rate": 4.968907321091467e-05, + "loss": 5.9147, + "step": 8451 + }, + { + "epoch": 0.050266438290988674, + "grad_norm": 2.314767599105835, + "learning_rate": 4.9688999767391815e-05, + "loss": 5.7087, + "step": 8452 + }, + { + "epoch": 0.050272385574269676, + "grad_norm": 2.604673147201538, + "learning_rate": 4.968892631525028e-05, + "loss": 5.7348, + "step": 8453 + }, + { + "epoch": 0.05027833285755067, + "grad_norm": 2.3386125564575195, + "learning_rate": 4.9688852854490097e-05, + "loss": 5.7509, + "step": 8454 + }, + { + "epoch": 0.050284280140831666, + "grad_norm": 2.3919529914855957, + "learning_rate": 4.968877938511129e-05, + "loss": 5.5851, + "step": 8455 + }, + { + "epoch": 0.05029022742411267, + "grad_norm": 2.0978026390075684, + "learning_rate": 4.9688705907113886e-05, + "loss": 5.3663, + "step": 8456 + }, + { + "epoch": 0.050296174707393664, + "grad_norm": 2.1700327396392822, + "learning_rate": 4.9688632420497904e-05, + "loss": 6.0197, + "step": 8457 + }, + { + "epoch": 0.05030212199067466, + "grad_norm": 2.1657676696777344, + "learning_rate": 4.968855892526338e-05, + "loss": 6.1721, + "step": 8458 + }, + { + "epoch": 0.050308069273955654, + "grad_norm": 2.434732437133789, + "learning_rate": 4.968848542141033e-05, + "loss": 6.0217, + "step": 8459 + }, + { + "epoch": 0.050314016557236656, + "grad_norm": 1.8453216552734375, + "learning_rate": 4.96884119089388e-05, + "loss": 6.4071, + "step": 8460 + }, + { + "epoch": 0.05031996384051765, + "grad_norm": 1.930168628692627, + "learning_rate": 4.9688338387848784e-05, + "loss": 6.5024, + "step": 8461 + }, + { + "epoch": 0.050325911123798646, + "grad_norm": 2.1785950660705566, + "learning_rate": 4.968826485814033e-05, + "loss": 5.803, + "step": 8462 + }, + { + "epoch": 0.05033185840707965, + "grad_norm": 2.003187894821167, + "learning_rate": 4.968819131981346e-05, + "loss": 6.2269, + "step": 8463 + }, + { + "epoch": 0.05033780569036064, + "grad_norm": 2.9522452354431152, + "learning_rate": 4.9688117772868195e-05, + "loss": 5.5603, + "step": 8464 + }, + { + "epoch": 0.05034375297364164, + "grad_norm": 1.9813052415847778, + "learning_rate": 4.968804421730457e-05, + "loss": 6.0101, + "step": 8465 + }, + { + "epoch": 0.05034970025692264, + "grad_norm": 2.370225667953491, + "learning_rate": 4.9687970653122596e-05, + "loss": 6.3236, + "step": 8466 + }, + { + "epoch": 0.050355647540203635, + "grad_norm": 1.9233943223953247, + "learning_rate": 4.968789708032231e-05, + "loss": 6.2962, + "step": 8467 + }, + { + "epoch": 0.05036159482348463, + "grad_norm": 1.8740222454071045, + "learning_rate": 4.968782349890373e-05, + "loss": 5.5454, + "step": 8468 + }, + { + "epoch": 0.05036754210676563, + "grad_norm": 1.8627724647521973, + "learning_rate": 4.968774990886689e-05, + "loss": 5.9242, + "step": 8469 + }, + { + "epoch": 0.05037348939004663, + "grad_norm": 1.7016552686691284, + "learning_rate": 4.968767631021181e-05, + "loss": 6.3302, + "step": 8470 + }, + { + "epoch": 0.05037943667332762, + "grad_norm": 1.8826018571853638, + "learning_rate": 4.9687602702938515e-05, + "loss": 6.3308, + "step": 8471 + }, + { + "epoch": 0.050385383956608625, + "grad_norm": 1.777480959892273, + "learning_rate": 4.9687529087047036e-05, + "loss": 6.3948, + "step": 8472 + }, + { + "epoch": 0.05039133123988962, + "grad_norm": 2.10075306892395, + "learning_rate": 4.9687455462537396e-05, + "loss": 6.1615, + "step": 8473 + }, + { + "epoch": 0.050397278523170615, + "grad_norm": 2.3484537601470947, + "learning_rate": 4.9687381829409616e-05, + "loss": 5.8286, + "step": 8474 + }, + { + "epoch": 0.05040322580645161, + "grad_norm": 1.8243837356567383, + "learning_rate": 4.968730818766373e-05, + "loss": 6.014, + "step": 8475 + }, + { + "epoch": 0.05040917308973261, + "grad_norm": 1.8149470090866089, + "learning_rate": 4.9687234537299765e-05, + "loss": 5.9723, + "step": 8476 + }, + { + "epoch": 0.05041512037301361, + "grad_norm": 2.400754451751709, + "learning_rate": 4.968716087831773e-05, + "loss": 5.237, + "step": 8477 + }, + { + "epoch": 0.0504210676562946, + "grad_norm": 2.4394338130950928, + "learning_rate": 4.968708721071767e-05, + "loss": 5.1106, + "step": 8478 + }, + { + "epoch": 0.050427014939575604, + "grad_norm": 2.210686445236206, + "learning_rate": 4.96870135344996e-05, + "loss": 5.0002, + "step": 8479 + }, + { + "epoch": 0.0504329622228566, + "grad_norm": 2.302997589111328, + "learning_rate": 4.968693984966355e-05, + "loss": 5.689, + "step": 8480 + }, + { + "epoch": 0.050438909506137594, + "grad_norm": 2.0761525630950928, + "learning_rate": 4.9686866156209546e-05, + "loss": 5.4452, + "step": 8481 + }, + { + "epoch": 0.050444856789418596, + "grad_norm": 2.3239383697509766, + "learning_rate": 4.968679245413761e-05, + "loss": 5.4427, + "step": 8482 + }, + { + "epoch": 0.05045080407269959, + "grad_norm": 3.2064802646636963, + "learning_rate": 4.9686718743447766e-05, + "loss": 5.2947, + "step": 8483 + }, + { + "epoch": 0.050456751355980586, + "grad_norm": 2.680786371231079, + "learning_rate": 4.968664502414004e-05, + "loss": 5.4776, + "step": 8484 + }, + { + "epoch": 0.05046269863926159, + "grad_norm": 2.107583522796631, + "learning_rate": 4.9686571296214476e-05, + "loss": 5.5172, + "step": 8485 + }, + { + "epoch": 0.050468645922542583, + "grad_norm": 1.939788579940796, + "learning_rate": 4.9686497559671075e-05, + "loss": 5.6056, + "step": 8486 + }, + { + "epoch": 0.05047459320582358, + "grad_norm": 1.883991003036499, + "learning_rate": 4.968642381450987e-05, + "loss": 5.6511, + "step": 8487 + }, + { + "epoch": 0.050480540489104574, + "grad_norm": 1.8518444299697876, + "learning_rate": 4.96863500607309e-05, + "loss": 5.5897, + "step": 8488 + }, + { + "epoch": 0.050486487772385576, + "grad_norm": 1.6704350709915161, + "learning_rate": 4.968627629833418e-05, + "loss": 5.5002, + "step": 8489 + }, + { + "epoch": 0.05049243505566657, + "grad_norm": 1.755231261253357, + "learning_rate": 4.968620252731972e-05, + "loss": 5.6012, + "step": 8490 + }, + { + "epoch": 0.050498382338947566, + "grad_norm": 1.8532077074050903, + "learning_rate": 4.968612874768758e-05, + "loss": 5.4443, + "step": 8491 + }, + { + "epoch": 0.05050432962222857, + "grad_norm": 1.787781000137329, + "learning_rate": 4.9686054959437756e-05, + "loss": 5.5623, + "step": 8492 + }, + { + "epoch": 0.05051027690550956, + "grad_norm": 1.6963365077972412, + "learning_rate": 4.9685981162570295e-05, + "loss": 5.5349, + "step": 8493 + }, + { + "epoch": 0.05051622418879056, + "grad_norm": 4.328898906707764, + "learning_rate": 4.96859073570852e-05, + "loss": 5.8026, + "step": 8494 + }, + { + "epoch": 0.05052217147207156, + "grad_norm": 1.6906582117080688, + "learning_rate": 4.968583354298252e-05, + "loss": 5.4804, + "step": 8495 + }, + { + "epoch": 0.050528118755352555, + "grad_norm": 1.5316333770751953, + "learning_rate": 4.968575972026227e-05, + "loss": 5.6005, + "step": 8496 + }, + { + "epoch": 0.05053406603863355, + "grad_norm": 1.6029349565505981, + "learning_rate": 4.968568588892447e-05, + "loss": 5.5991, + "step": 8497 + }, + { + "epoch": 0.05054001332191455, + "grad_norm": 2.246537685394287, + "learning_rate": 4.968561204896916e-05, + "loss": 5.8537, + "step": 8498 + }, + { + "epoch": 0.05054596060519555, + "grad_norm": 2.0347564220428467, + "learning_rate": 4.9685538200396355e-05, + "loss": 5.7968, + "step": 8499 + }, + { + "epoch": 0.05055190788847654, + "grad_norm": 1.7635436058044434, + "learning_rate": 4.968546434320608e-05, + "loss": 5.6324, + "step": 8500 + }, + { + "epoch": 0.050557855171757544, + "grad_norm": 2.415397882461548, + "learning_rate": 4.9685390477398363e-05, + "loss": 5.3795, + "step": 8501 + }, + { + "epoch": 0.05056380245503854, + "grad_norm": 2.1499149799346924, + "learning_rate": 4.9685316602973245e-05, + "loss": 5.5638, + "step": 8502 + }, + { + "epoch": 0.050569749738319535, + "grad_norm": 2.0479557514190674, + "learning_rate": 4.9685242719930725e-05, + "loss": 5.3902, + "step": 8503 + }, + { + "epoch": 0.05057569702160053, + "grad_norm": 1.874993085861206, + "learning_rate": 4.9685168828270845e-05, + "loss": 5.4607, + "step": 8504 + }, + { + "epoch": 0.05058164430488153, + "grad_norm": 1.6361217498779297, + "learning_rate": 4.9685094927993623e-05, + "loss": 5.4378, + "step": 8505 + }, + { + "epoch": 0.05058759158816253, + "grad_norm": 1.598026990890503, + "learning_rate": 4.9685021019099096e-05, + "loss": 5.4336, + "step": 8506 + }, + { + "epoch": 0.05059353887144352, + "grad_norm": 1.7636823654174805, + "learning_rate": 4.968494710158728e-05, + "loss": 5.4757, + "step": 8507 + }, + { + "epoch": 0.050599486154724524, + "grad_norm": 1.7823325395584106, + "learning_rate": 4.968487317545821e-05, + "loss": 5.4872, + "step": 8508 + }, + { + "epoch": 0.05060543343800552, + "grad_norm": 2.39149808883667, + "learning_rate": 4.9684799240711896e-05, + "loss": 5.039, + "step": 8509 + }, + { + "epoch": 0.050611380721286514, + "grad_norm": 2.0295841693878174, + "learning_rate": 4.968472529734838e-05, + "loss": 5.1086, + "step": 8510 + }, + { + "epoch": 0.050617328004567516, + "grad_norm": 2.6830973625183105, + "learning_rate": 4.9684651345367684e-05, + "loss": 4.8889, + "step": 8511 + }, + { + "epoch": 0.05062327528784851, + "grad_norm": 2.3600027561187744, + "learning_rate": 4.9684577384769825e-05, + "loss": 5.5305, + "step": 8512 + }, + { + "epoch": 0.050629222571129506, + "grad_norm": 2.1680233478546143, + "learning_rate": 4.968450341555484e-05, + "loss": 5.8196, + "step": 8513 + }, + { + "epoch": 0.05063516985441051, + "grad_norm": 1.800645351409912, + "learning_rate": 4.968442943772275e-05, + "loss": 5.2689, + "step": 8514 + }, + { + "epoch": 0.0506411171376915, + "grad_norm": 1.983245849609375, + "learning_rate": 4.9684355451273566e-05, + "loss": 4.7782, + "step": 8515 + }, + { + "epoch": 0.0506470644209725, + "grad_norm": 2.12082576751709, + "learning_rate": 4.968428145620735e-05, + "loss": 4.7946, + "step": 8516 + }, + { + "epoch": 0.050653011704253494, + "grad_norm": 1.7249135971069336, + "learning_rate": 4.968420745252409e-05, + "loss": 4.7055, + "step": 8517 + }, + { + "epoch": 0.050658958987534496, + "grad_norm": 1.971240758895874, + "learning_rate": 4.968413344022384e-05, + "loss": 4.7343, + "step": 8518 + }, + { + "epoch": 0.05066490627081549, + "grad_norm": 1.780387282371521, + "learning_rate": 4.968405941930661e-05, + "loss": 4.7502, + "step": 8519 + }, + { + "epoch": 0.050670853554096486, + "grad_norm": 1.772007942199707, + "learning_rate": 4.968398538977242e-05, + "loss": 4.7439, + "step": 8520 + }, + { + "epoch": 0.05067680083737749, + "grad_norm": 1.9167592525482178, + "learning_rate": 4.9683911351621324e-05, + "loss": 4.6393, + "step": 8521 + }, + { + "epoch": 0.05068274812065848, + "grad_norm": 2.0527031421661377, + "learning_rate": 4.968383730485331e-05, + "loss": 4.6379, + "step": 8522 + }, + { + "epoch": 0.05068869540393948, + "grad_norm": 2.0608508586883545, + "learning_rate": 4.968376324946844e-05, + "loss": 4.6128, + "step": 8523 + }, + { + "epoch": 0.05069464268722048, + "grad_norm": 1.984731674194336, + "learning_rate": 4.968368918546672e-05, + "loss": 4.5969, + "step": 8524 + }, + { + "epoch": 0.050700589970501475, + "grad_norm": 1.7904438972473145, + "learning_rate": 4.968361511284817e-05, + "loss": 4.6853, + "step": 8525 + }, + { + "epoch": 0.05070653725378247, + "grad_norm": 1.8095389604568481, + "learning_rate": 4.968354103161283e-05, + "loss": 4.5748, + "step": 8526 + }, + { + "epoch": 0.05071248453706347, + "grad_norm": 1.8565012216567993, + "learning_rate": 4.968346694176073e-05, + "loss": 4.5249, + "step": 8527 + }, + { + "epoch": 0.05071843182034447, + "grad_norm": 1.7721836566925049, + "learning_rate": 4.968339284329188e-05, + "loss": 4.6593, + "step": 8528 + }, + { + "epoch": 0.05072437910362546, + "grad_norm": 1.9470161199569702, + "learning_rate": 4.968331873620631e-05, + "loss": 4.5432, + "step": 8529 + }, + { + "epoch": 0.050730326386906464, + "grad_norm": 1.8639118671417236, + "learning_rate": 4.968324462050404e-05, + "loss": 4.4464, + "step": 8530 + }, + { + "epoch": 0.05073627367018746, + "grad_norm": 1.9226467609405518, + "learning_rate": 4.9683170496185114e-05, + "loss": 4.4364, + "step": 8531 + }, + { + "epoch": 0.050742220953468455, + "grad_norm": 1.988198161125183, + "learning_rate": 4.9683096363249545e-05, + "loss": 4.6614, + "step": 8532 + }, + { + "epoch": 0.05074816823674945, + "grad_norm": 1.903645396232605, + "learning_rate": 4.9683022221697374e-05, + "loss": 4.5168, + "step": 8533 + }, + { + "epoch": 0.05075411552003045, + "grad_norm": 1.903448224067688, + "learning_rate": 4.96829480715286e-05, + "loss": 4.5899, + "step": 8534 + }, + { + "epoch": 0.05076006280331145, + "grad_norm": 1.864522099494934, + "learning_rate": 4.9682873912743274e-05, + "loss": 4.5896, + "step": 8535 + }, + { + "epoch": 0.05076601008659244, + "grad_norm": 1.8760302066802979, + "learning_rate": 4.9682799745341406e-05, + "loss": 4.593, + "step": 8536 + }, + { + "epoch": 0.050771957369873444, + "grad_norm": 1.9024009704589844, + "learning_rate": 4.968272556932303e-05, + "loss": 4.9861, + "step": 8537 + }, + { + "epoch": 0.05077790465315444, + "grad_norm": 2.190634250640869, + "learning_rate": 4.9682651384688176e-05, + "loss": 5.6755, + "step": 8538 + }, + { + "epoch": 0.050783851936435434, + "grad_norm": 1.758934736251831, + "learning_rate": 4.9682577191436854e-05, + "loss": 5.4334, + "step": 8539 + }, + { + "epoch": 0.050789799219716436, + "grad_norm": 2.3531200885772705, + "learning_rate": 4.968250298956909e-05, + "loss": 4.9819, + "step": 8540 + }, + { + "epoch": 0.05079574650299743, + "grad_norm": 1.901681661605835, + "learning_rate": 4.968242877908494e-05, + "loss": 5.1642, + "step": 8541 + }, + { + "epoch": 0.050801693786278426, + "grad_norm": 1.7250633239746094, + "learning_rate": 4.96823545599844e-05, + "loss": 5.4847, + "step": 8542 + }, + { + "epoch": 0.05080764106955943, + "grad_norm": 1.7400966882705688, + "learning_rate": 4.968228033226751e-05, + "loss": 5.5902, + "step": 8543 + }, + { + "epoch": 0.05081358835284042, + "grad_norm": 1.5469578504562378, + "learning_rate": 4.968220609593428e-05, + "loss": 5.6432, + "step": 8544 + }, + { + "epoch": 0.05081953563612142, + "grad_norm": 1.8277182579040527, + "learning_rate": 4.968213185098475e-05, + "loss": 5.3296, + "step": 8545 + }, + { + "epoch": 0.050825482919402414, + "grad_norm": 2.0535261631011963, + "learning_rate": 4.9682057597418943e-05, + "loss": 5.5278, + "step": 8546 + }, + { + "epoch": 0.050831430202683416, + "grad_norm": 1.8631746768951416, + "learning_rate": 4.9681983335236894e-05, + "loss": 5.556, + "step": 8547 + }, + { + "epoch": 0.05083737748596441, + "grad_norm": 1.6663711071014404, + "learning_rate": 4.968190906443861e-05, + "loss": 5.4321, + "step": 8548 + }, + { + "epoch": 0.050843324769245406, + "grad_norm": 1.8302260637283325, + "learning_rate": 4.968183478502413e-05, + "loss": 5.4746, + "step": 8549 + }, + { + "epoch": 0.05084927205252641, + "grad_norm": 1.9203182458877563, + "learning_rate": 4.968176049699347e-05, + "loss": 5.4334, + "step": 8550 + }, + { + "epoch": 0.0508552193358074, + "grad_norm": 2.0406670570373535, + "learning_rate": 4.9681686200346674e-05, + "loss": 5.6509, + "step": 8551 + }, + { + "epoch": 0.0508611666190884, + "grad_norm": 2.3438572883605957, + "learning_rate": 4.968161189508374e-05, + "loss": 5.8662, + "step": 8552 + }, + { + "epoch": 0.0508671139023694, + "grad_norm": 1.9612985849380493, + "learning_rate": 4.968153758120473e-05, + "loss": 5.6813, + "step": 8553 + }, + { + "epoch": 0.050873061185650395, + "grad_norm": 1.4175993204116821, + "learning_rate": 4.968146325870964e-05, + "loss": 5.4593, + "step": 8554 + }, + { + "epoch": 0.05087900846893139, + "grad_norm": 1.3445212841033936, + "learning_rate": 4.96813889275985e-05, + "loss": 5.4195, + "step": 8555 + }, + { + "epoch": 0.05088495575221239, + "grad_norm": 1.9938427209854126, + "learning_rate": 4.968131458787135e-05, + "loss": 5.8791, + "step": 8556 + }, + { + "epoch": 0.05089090303549339, + "grad_norm": 1.7449276447296143, + "learning_rate": 4.9681240239528216e-05, + "loss": 5.3574, + "step": 8557 + }, + { + "epoch": 0.05089685031877438, + "grad_norm": 2.0117087364196777, + "learning_rate": 4.96811658825691e-05, + "loss": 5.3548, + "step": 8558 + }, + { + "epoch": 0.050902797602055384, + "grad_norm": 1.97372567653656, + "learning_rate": 4.968109151699406e-05, + "loss": 5.5281, + "step": 8559 + }, + { + "epoch": 0.05090874488533638, + "grad_norm": 1.8815237283706665, + "learning_rate": 4.9681017142803095e-05, + "loss": 5.4849, + "step": 8560 + }, + { + "epoch": 0.050914692168617375, + "grad_norm": 1.627252221107483, + "learning_rate": 4.968094275999624e-05, + "loss": 5.2125, + "step": 8561 + }, + { + "epoch": 0.05092063945189837, + "grad_norm": 1.4768601655960083, + "learning_rate": 4.968086836857353e-05, + "loss": 5.0817, + "step": 8562 + }, + { + "epoch": 0.05092658673517937, + "grad_norm": 2.0249485969543457, + "learning_rate": 4.968079396853498e-05, + "loss": 5.4025, + "step": 8563 + }, + { + "epoch": 0.05093253401846037, + "grad_norm": 2.0904550552368164, + "learning_rate": 4.968071955988062e-05, + "loss": 5.4404, + "step": 8564 + }, + { + "epoch": 0.05093848130174136, + "grad_norm": 1.935063123703003, + "learning_rate": 4.9680645142610475e-05, + "loss": 5.4961, + "step": 8565 + }, + { + "epoch": 0.050944428585022364, + "grad_norm": 1.9836292266845703, + "learning_rate": 4.968057071672457e-05, + "loss": 5.2469, + "step": 8566 + }, + { + "epoch": 0.05095037586830336, + "grad_norm": 1.8337205648422241, + "learning_rate": 4.9680496282222944e-05, + "loss": 5.4432, + "step": 8567 + }, + { + "epoch": 0.050956323151584354, + "grad_norm": 1.9169154167175293, + "learning_rate": 4.9680421839105604e-05, + "loss": 5.2606, + "step": 8568 + }, + { + "epoch": 0.050962270434865356, + "grad_norm": 1.5869332551956177, + "learning_rate": 4.968034738737258e-05, + "loss": 5.006, + "step": 8569 + }, + { + "epoch": 0.05096821771814635, + "grad_norm": 1.5824979543685913, + "learning_rate": 4.968027292702391e-05, + "loss": 5.2078, + "step": 8570 + }, + { + "epoch": 0.050974165001427346, + "grad_norm": 1.7121458053588867, + "learning_rate": 4.96801984580596e-05, + "loss": 5.3913, + "step": 8571 + }, + { + "epoch": 0.05098011228470835, + "grad_norm": 1.7111082077026367, + "learning_rate": 4.96801239804797e-05, + "loss": 5.3957, + "step": 8572 + }, + { + "epoch": 0.05098605956798934, + "grad_norm": 1.834083080291748, + "learning_rate": 4.968004949428421e-05, + "loss": 5.501, + "step": 8573 + }, + { + "epoch": 0.05099200685127034, + "grad_norm": 1.773421287536621, + "learning_rate": 4.967997499947318e-05, + "loss": 5.429, + "step": 8574 + }, + { + "epoch": 0.05099795413455134, + "grad_norm": 1.7471132278442383, + "learning_rate": 4.967990049604663e-05, + "loss": 5.4853, + "step": 8575 + }, + { + "epoch": 0.051003901417832335, + "grad_norm": 1.7264289855957031, + "learning_rate": 4.967982598400457e-05, + "loss": 5.4415, + "step": 8576 + }, + { + "epoch": 0.05100984870111333, + "grad_norm": 1.750982403755188, + "learning_rate": 4.9679751463347044e-05, + "loss": 5.1731, + "step": 8577 + }, + { + "epoch": 0.051015795984394326, + "grad_norm": 1.6106518507003784, + "learning_rate": 4.967967693407407e-05, + "loss": 5.2692, + "step": 8578 + }, + { + "epoch": 0.05102174326767533, + "grad_norm": 1.8728212118148804, + "learning_rate": 4.967960239618568e-05, + "loss": 5.2416, + "step": 8579 + }, + { + "epoch": 0.05102769055095632, + "grad_norm": 1.6410562992095947, + "learning_rate": 4.967952784968189e-05, + "loss": 5.1824, + "step": 8580 + }, + { + "epoch": 0.05103363783423732, + "grad_norm": 1.7119427919387817, + "learning_rate": 4.967945329456274e-05, + "loss": 5.2316, + "step": 8581 + }, + { + "epoch": 0.05103958511751832, + "grad_norm": 1.667602300643921, + "learning_rate": 4.967937873082824e-05, + "loss": 4.9599, + "step": 8582 + }, + { + "epoch": 0.051045532400799315, + "grad_norm": 1.9595974683761597, + "learning_rate": 4.967930415847842e-05, + "loss": 4.9613, + "step": 8583 + }, + { + "epoch": 0.05105147968408031, + "grad_norm": 1.70210862159729, + "learning_rate": 4.967922957751332e-05, + "loss": 5.3587, + "step": 8584 + }, + { + "epoch": 0.05105742696736131, + "grad_norm": 2.101145029067993, + "learning_rate": 4.967915498793295e-05, + "loss": 5.2782, + "step": 8585 + }, + { + "epoch": 0.05106337425064231, + "grad_norm": 1.8836926221847534, + "learning_rate": 4.9679080389737344e-05, + "loss": 5.3128, + "step": 8586 + }, + { + "epoch": 0.0510693215339233, + "grad_norm": 1.7542184591293335, + "learning_rate": 4.967900578292652e-05, + "loss": 5.2236, + "step": 8587 + }, + { + "epoch": 0.051075268817204304, + "grad_norm": 1.8415964841842651, + "learning_rate": 4.967893116750052e-05, + "loss": 5.1267, + "step": 8588 + }, + { + "epoch": 0.0510812161004853, + "grad_norm": 1.7702316045761108, + "learning_rate": 4.967885654345936e-05, + "loss": 5.6495, + "step": 8589 + }, + { + "epoch": 0.051087163383766294, + "grad_norm": 1.7790406942367554, + "learning_rate": 4.967878191080306e-05, + "loss": 5.2561, + "step": 8590 + }, + { + "epoch": 0.05109311066704729, + "grad_norm": 1.7282217741012573, + "learning_rate": 4.967870726953165e-05, + "loss": 5.2589, + "step": 8591 + }, + { + "epoch": 0.05109905795032829, + "grad_norm": 1.6590560674667358, + "learning_rate": 4.967863261964517e-05, + "loss": 5.1952, + "step": 8592 + }, + { + "epoch": 0.05110500523360929, + "grad_norm": 1.5948386192321777, + "learning_rate": 4.9678557961143625e-05, + "loss": 5.297, + "step": 8593 + }, + { + "epoch": 0.05111095251689028, + "grad_norm": 1.8219022750854492, + "learning_rate": 4.9678483294027046e-05, + "loss": 5.3391, + "step": 8594 + }, + { + "epoch": 0.051116899800171284, + "grad_norm": 1.547616720199585, + "learning_rate": 4.967840861829547e-05, + "loss": 5.4224, + "step": 8595 + }, + { + "epoch": 0.05112284708345228, + "grad_norm": 1.7924590110778809, + "learning_rate": 4.9678333933948914e-05, + "loss": 5.2371, + "step": 8596 + }, + { + "epoch": 0.051128794366733274, + "grad_norm": 1.7630747556686401, + "learning_rate": 4.9678259240987416e-05, + "loss": 5.4849, + "step": 8597 + }, + { + "epoch": 0.051134741650014276, + "grad_norm": 1.7853891849517822, + "learning_rate": 4.967818453941098e-05, + "loss": 5.1753, + "step": 8598 + }, + { + "epoch": 0.05114068893329527, + "grad_norm": 1.6572301387786865, + "learning_rate": 4.9678109829219654e-05, + "loss": 5.3747, + "step": 8599 + }, + { + "epoch": 0.051146636216576266, + "grad_norm": 1.6574329137802124, + "learning_rate": 4.9678035110413445e-05, + "loss": 5.417, + "step": 8600 + }, + { + "epoch": 0.05115258349985727, + "grad_norm": 1.7093894481658936, + "learning_rate": 4.9677960382992396e-05, + "loss": 5.4605, + "step": 8601 + }, + { + "epoch": 0.05115853078313826, + "grad_norm": 1.6304559707641602, + "learning_rate": 4.967788564695652e-05, + "loss": 5.6186, + "step": 8602 + }, + { + "epoch": 0.05116447806641926, + "grad_norm": 1.6134929656982422, + "learning_rate": 4.967781090230586e-05, + "loss": 5.5084, + "step": 8603 + }, + { + "epoch": 0.05117042534970026, + "grad_norm": 1.7007251977920532, + "learning_rate": 4.9677736149040426e-05, + "loss": 5.2542, + "step": 8604 + }, + { + "epoch": 0.051176372632981255, + "grad_norm": 1.6648818254470825, + "learning_rate": 4.967766138716025e-05, + "loss": 5.4136, + "step": 8605 + }, + { + "epoch": 0.05118231991626225, + "grad_norm": 1.5595816373825073, + "learning_rate": 4.967758661666535e-05, + "loss": 5.181, + "step": 8606 + }, + { + "epoch": 0.051188267199543246, + "grad_norm": 1.7358763217926025, + "learning_rate": 4.967751183755577e-05, + "loss": 5.3509, + "step": 8607 + }, + { + "epoch": 0.05119421448282425, + "grad_norm": 1.6836191415786743, + "learning_rate": 4.967743704983152e-05, + "loss": 5.4656, + "step": 8608 + }, + { + "epoch": 0.05120016176610524, + "grad_norm": 1.4641087055206299, + "learning_rate": 4.967736225349263e-05, + "loss": 5.5304, + "step": 8609 + }, + { + "epoch": 0.05120610904938624, + "grad_norm": 1.6273541450500488, + "learning_rate": 4.967728744853913e-05, + "loss": 5.4029, + "step": 8610 + }, + { + "epoch": 0.05121205633266724, + "grad_norm": 1.6471314430236816, + "learning_rate": 4.967721263497105e-05, + "loss": 5.4333, + "step": 8611 + }, + { + "epoch": 0.051218003615948235, + "grad_norm": 1.798155665397644, + "learning_rate": 4.96771378127884e-05, + "loss": 5.5214, + "step": 8612 + }, + { + "epoch": 0.05122395089922923, + "grad_norm": 1.8606700897216797, + "learning_rate": 4.967706298199122e-05, + "loss": 4.8808, + "step": 8613 + }, + { + "epoch": 0.05122989818251023, + "grad_norm": 1.7144849300384521, + "learning_rate": 4.967698814257953e-05, + "loss": 4.9451, + "step": 8614 + }, + { + "epoch": 0.05123584546579123, + "grad_norm": 1.7411640882492065, + "learning_rate": 4.9676913294553364e-05, + "loss": 4.9771, + "step": 8615 + }, + { + "epoch": 0.05124179274907222, + "grad_norm": 1.7012072801589966, + "learning_rate": 4.9676838437912736e-05, + "loss": 4.9028, + "step": 8616 + }, + { + "epoch": 0.051247740032353224, + "grad_norm": 1.8154243230819702, + "learning_rate": 4.967676357265768e-05, + "loss": 5.4115, + "step": 8617 + }, + { + "epoch": 0.05125368731563422, + "grad_norm": 2.7746822834014893, + "learning_rate": 4.967668869878823e-05, + "loss": 5.5487, + "step": 8618 + }, + { + "epoch": 0.051259634598915214, + "grad_norm": 1.8362152576446533, + "learning_rate": 4.9676613816304395e-05, + "loss": 5.486, + "step": 8619 + }, + { + "epoch": 0.05126558188219621, + "grad_norm": 1.975853681564331, + "learning_rate": 4.967653892520621e-05, + "loss": 5.4348, + "step": 8620 + }, + { + "epoch": 0.05127152916547721, + "grad_norm": 1.8126581907272339, + "learning_rate": 4.96764640254937e-05, + "loss": 5.4558, + "step": 8621 + }, + { + "epoch": 0.05127747644875821, + "grad_norm": 1.6068531274795532, + "learning_rate": 4.967638911716689e-05, + "loss": 5.4672, + "step": 8622 + }, + { + "epoch": 0.0512834237320392, + "grad_norm": 1.6384878158569336, + "learning_rate": 4.9676314200225804e-05, + "loss": 5.1591, + "step": 8623 + }, + { + "epoch": 0.051289371015320204, + "grad_norm": 2.0413742065429688, + "learning_rate": 4.9676239274670474e-05, + "loss": 4.8992, + "step": 8624 + }, + { + "epoch": 0.0512953182986012, + "grad_norm": 1.7591389417648315, + "learning_rate": 4.967616434050093e-05, + "loss": 5.3629, + "step": 8625 + }, + { + "epoch": 0.051301265581882194, + "grad_norm": 1.9222301244735718, + "learning_rate": 4.967608939771719e-05, + "loss": 5.5082, + "step": 8626 + }, + { + "epoch": 0.051307212865163196, + "grad_norm": 1.8040579557418823, + "learning_rate": 4.967601444631928e-05, + "loss": 5.4019, + "step": 8627 + }, + { + "epoch": 0.05131316014844419, + "grad_norm": 2.0685603618621826, + "learning_rate": 4.967593948630723e-05, + "loss": 5.1959, + "step": 8628 + }, + { + "epoch": 0.051319107431725186, + "grad_norm": 1.446341872215271, + "learning_rate": 4.967586451768106e-05, + "loss": 5.4233, + "step": 8629 + }, + { + "epoch": 0.05132505471500619, + "grad_norm": 1.4487289190292358, + "learning_rate": 4.9675789540440806e-05, + "loss": 5.4065, + "step": 8630 + }, + { + "epoch": 0.05133100199828718, + "grad_norm": 2.367469310760498, + "learning_rate": 4.967571455458648e-05, + "loss": 5.3512, + "step": 8631 + }, + { + "epoch": 0.05133694928156818, + "grad_norm": 2.7115249633789062, + "learning_rate": 4.967563956011812e-05, + "loss": 5.4494, + "step": 8632 + }, + { + "epoch": 0.05134289656484918, + "grad_norm": 2.6692097187042236, + "learning_rate": 4.967556455703576e-05, + "loss": 5.2747, + "step": 8633 + }, + { + "epoch": 0.051348843848130175, + "grad_norm": 2.516005754470825, + "learning_rate": 4.967548954533941e-05, + "loss": 5.2305, + "step": 8634 + }, + { + "epoch": 0.05135479113141117, + "grad_norm": 1.6234782934188843, + "learning_rate": 4.96754145250291e-05, + "loss": 5.5192, + "step": 8635 + }, + { + "epoch": 0.051360738414692166, + "grad_norm": 1.9273806810379028, + "learning_rate": 4.9675339496104855e-05, + "loss": 5.4479, + "step": 8636 + }, + { + "epoch": 0.05136668569797317, + "grad_norm": 2.510847568511963, + "learning_rate": 4.967526445856671e-05, + "loss": 4.9858, + "step": 8637 + }, + { + "epoch": 0.05137263298125416, + "grad_norm": 2.3722991943359375, + "learning_rate": 4.967518941241468e-05, + "loss": 5.2287, + "step": 8638 + }, + { + "epoch": 0.05137858026453516, + "grad_norm": 2.286569118499756, + "learning_rate": 4.96751143576488e-05, + "loss": 5.2643, + "step": 8639 + }, + { + "epoch": 0.05138452754781616, + "grad_norm": 2.493534803390503, + "learning_rate": 4.9675039294269086e-05, + "loss": 5.1207, + "step": 8640 + }, + { + "epoch": 0.051390474831097155, + "grad_norm": 2.622694969177246, + "learning_rate": 4.967496422227558e-05, + "loss": 4.9735, + "step": 8641 + }, + { + "epoch": 0.05139642211437815, + "grad_norm": 1.7518365383148193, + "learning_rate": 4.967488914166829e-05, + "loss": 5.8818, + "step": 8642 + }, + { + "epoch": 0.05140236939765915, + "grad_norm": 2.0281870365142822, + "learning_rate": 4.9674814052447256e-05, + "loss": 6.3773, + "step": 8643 + }, + { + "epoch": 0.05140831668094015, + "grad_norm": 1.880083441734314, + "learning_rate": 4.96747389546125e-05, + "loss": 5.831, + "step": 8644 + }, + { + "epoch": 0.05141426396422114, + "grad_norm": 2.0792593955993652, + "learning_rate": 4.967466384816404e-05, + "loss": 5.8799, + "step": 8645 + }, + { + "epoch": 0.051420211247502144, + "grad_norm": 2.4550280570983887, + "learning_rate": 4.967458873310192e-05, + "loss": 5.2983, + "step": 8646 + }, + { + "epoch": 0.05142615853078314, + "grad_norm": 2.5590765476226807, + "learning_rate": 4.967451360942615e-05, + "loss": 5.1157, + "step": 8647 + }, + { + "epoch": 0.051432105814064134, + "grad_norm": 2.2328450679779053, + "learning_rate": 4.967443847713677e-05, + "loss": 5.047, + "step": 8648 + }, + { + "epoch": 0.05143805309734513, + "grad_norm": 2.0624022483825684, + "learning_rate": 4.9674363336233786e-05, + "loss": 5.6819, + "step": 8649 + }, + { + "epoch": 0.05144400038062613, + "grad_norm": 2.075239658355713, + "learning_rate": 4.9674288186717246e-05, + "loss": 5.895, + "step": 8650 + }, + { + "epoch": 0.05144994766390713, + "grad_norm": 1.7228562831878662, + "learning_rate": 4.967421302858716e-05, + "loss": 5.9199, + "step": 8651 + }, + { + "epoch": 0.05145589494718812, + "grad_norm": 2.235020637512207, + "learning_rate": 4.967413786184356e-05, + "loss": 5.0644, + "step": 8652 + }, + { + "epoch": 0.051461842230469124, + "grad_norm": 1.8620972633361816, + "learning_rate": 4.967406268648648e-05, + "loss": 5.7956, + "step": 8653 + }, + { + "epoch": 0.05146778951375012, + "grad_norm": 1.7914378643035889, + "learning_rate": 4.967398750251594e-05, + "loss": 5.742, + "step": 8654 + }, + { + "epoch": 0.051473736797031114, + "grad_norm": 2.0010504722595215, + "learning_rate": 4.967391230993196e-05, + "loss": 5.7808, + "step": 8655 + }, + { + "epoch": 0.051479684080312116, + "grad_norm": 2.1851212978363037, + "learning_rate": 4.9673837108734575e-05, + "loss": 5.4217, + "step": 8656 + }, + { + "epoch": 0.05148563136359311, + "grad_norm": 1.6896641254425049, + "learning_rate": 4.967376189892382e-05, + "loss": 6.321, + "step": 8657 + }, + { + "epoch": 0.051491578646874106, + "grad_norm": 1.7083675861358643, + "learning_rate": 4.967368668049969e-05, + "loss": 5.495, + "step": 8658 + }, + { + "epoch": 0.05149752593015511, + "grad_norm": 2.537256956100464, + "learning_rate": 4.967361145346224e-05, + "loss": 5.4096, + "step": 8659 + }, + { + "epoch": 0.0515034732134361, + "grad_norm": 2.3463892936706543, + "learning_rate": 4.967353621781149e-05, + "loss": 6.2461, + "step": 8660 + }, + { + "epoch": 0.0515094204967171, + "grad_norm": 1.6834701299667358, + "learning_rate": 4.967346097354746e-05, + "loss": 6.1007, + "step": 8661 + }, + { + "epoch": 0.0515153677799981, + "grad_norm": 2.140557289123535, + "learning_rate": 4.9673385720670184e-05, + "loss": 5.9908, + "step": 8662 + }, + { + "epoch": 0.051521315063279095, + "grad_norm": 2.211639165878296, + "learning_rate": 4.9673310459179676e-05, + "loss": 6.4192, + "step": 8663 + }, + { + "epoch": 0.05152726234656009, + "grad_norm": 1.8421399593353271, + "learning_rate": 4.9673235189075975e-05, + "loss": 6.099, + "step": 8664 + }, + { + "epoch": 0.051533209629841085, + "grad_norm": 1.7775965929031372, + "learning_rate": 4.96731599103591e-05, + "loss": 5.9572, + "step": 8665 + }, + { + "epoch": 0.05153915691312209, + "grad_norm": 1.7500132322311401, + "learning_rate": 4.967308462302909e-05, + "loss": 6.0987, + "step": 8666 + }, + { + "epoch": 0.05154510419640308, + "grad_norm": 1.7952892780303955, + "learning_rate": 4.967300932708595e-05, + "loss": 6.0235, + "step": 8667 + }, + { + "epoch": 0.05155105147968408, + "grad_norm": 1.7696008682250977, + "learning_rate": 4.967293402252972e-05, + "loss": 5.8253, + "step": 8668 + }, + { + "epoch": 0.05155699876296508, + "grad_norm": 1.848975419998169, + "learning_rate": 4.967285870936042e-05, + "loss": 6.0942, + "step": 8669 + }, + { + "epoch": 0.051562946046246075, + "grad_norm": 2.412909507751465, + "learning_rate": 4.967278338757808e-05, + "loss": 5.5752, + "step": 8670 + }, + { + "epoch": 0.05156889332952707, + "grad_norm": 2.0214738845825195, + "learning_rate": 4.967270805718273e-05, + "loss": 5.5721, + "step": 8671 + }, + { + "epoch": 0.05157484061280807, + "grad_norm": 2.3830201625823975, + "learning_rate": 4.967263271817439e-05, + "loss": 6.034, + "step": 8672 + }, + { + "epoch": 0.05158078789608907, + "grad_norm": 2.213979959487915, + "learning_rate": 4.9672557370553094e-05, + "loss": 6.0169, + "step": 8673 + }, + { + "epoch": 0.05158673517937006, + "grad_norm": 1.9657354354858398, + "learning_rate": 4.967248201431887e-05, + "loss": 6.0159, + "step": 8674 + }, + { + "epoch": 0.051592682462651064, + "grad_norm": 2.0882673263549805, + "learning_rate": 4.967240664947172e-05, + "loss": 6.1088, + "step": 8675 + }, + { + "epoch": 0.05159862974593206, + "grad_norm": 2.291152000427246, + "learning_rate": 4.96723312760117e-05, + "loss": 5.4534, + "step": 8676 + }, + { + "epoch": 0.051604577029213054, + "grad_norm": 2.3495421409606934, + "learning_rate": 4.967225589393881e-05, + "loss": 5.5524, + "step": 8677 + }, + { + "epoch": 0.05161052431249405, + "grad_norm": 2.2665255069732666, + "learning_rate": 4.9672180503253106e-05, + "loss": 5.5208, + "step": 8678 + }, + { + "epoch": 0.05161647159577505, + "grad_norm": 2.1587207317352295, + "learning_rate": 4.9672105103954594e-05, + "loss": 5.7016, + "step": 8679 + }, + { + "epoch": 0.051622418879056046, + "grad_norm": 2.2260420322418213, + "learning_rate": 4.96720296960433e-05, + "loss": 5.6179, + "step": 8680 + }, + { + "epoch": 0.05162836616233704, + "grad_norm": 3.1678147315979004, + "learning_rate": 4.967195427951926e-05, + "loss": 5.4655, + "step": 8681 + }, + { + "epoch": 0.051634313445618044, + "grad_norm": 3.0126166343688965, + "learning_rate": 4.967187885438249e-05, + "loss": 5.5663, + "step": 8682 + }, + { + "epoch": 0.05164026072889904, + "grad_norm": 2.290069341659546, + "learning_rate": 4.9671803420633034e-05, + "loss": 5.7462, + "step": 8683 + }, + { + "epoch": 0.051646208012180034, + "grad_norm": 2.1958532333374023, + "learning_rate": 4.96717279782709e-05, + "loss": 5.8359, + "step": 8684 + }, + { + "epoch": 0.051652155295461036, + "grad_norm": 2.063312530517578, + "learning_rate": 4.967165252729611e-05, + "loss": 5.847, + "step": 8685 + }, + { + "epoch": 0.05165810257874203, + "grad_norm": 1.8041539192199707, + "learning_rate": 4.967157706770872e-05, + "loss": 5.9408, + "step": 8686 + }, + { + "epoch": 0.051664049862023026, + "grad_norm": 1.684831976890564, + "learning_rate": 4.967150159950873e-05, + "loss": 6.019, + "step": 8687 + }, + { + "epoch": 0.05166999714530403, + "grad_norm": 2.4915740489959717, + "learning_rate": 4.967142612269616e-05, + "loss": 5.357, + "step": 8688 + }, + { + "epoch": 0.05167594442858502, + "grad_norm": 2.2621138095855713, + "learning_rate": 4.967135063727106e-05, + "loss": 5.7726, + "step": 8689 + }, + { + "epoch": 0.05168189171186602, + "grad_norm": 1.9304747581481934, + "learning_rate": 4.967127514323345e-05, + "loss": 6.0958, + "step": 8690 + }, + { + "epoch": 0.05168783899514702, + "grad_norm": 1.7657890319824219, + "learning_rate": 4.9671199640583354e-05, + "loss": 6.1036, + "step": 8691 + }, + { + "epoch": 0.051693786278428015, + "grad_norm": 1.7449486255645752, + "learning_rate": 4.9671124129320794e-05, + "loss": 6.0843, + "step": 8692 + }, + { + "epoch": 0.05169973356170901, + "grad_norm": 2.0155117511749268, + "learning_rate": 4.96710486094458e-05, + "loss": 5.9626, + "step": 8693 + }, + { + "epoch": 0.051705680844990005, + "grad_norm": 2.1015188694000244, + "learning_rate": 4.967097308095839e-05, + "loss": 5.6053, + "step": 8694 + }, + { + "epoch": 0.05171162812827101, + "grad_norm": 1.9602909088134766, + "learning_rate": 4.967089754385861e-05, + "loss": 5.1988, + "step": 8695 + }, + { + "epoch": 0.051717575411552, + "grad_norm": 2.141657590866089, + "learning_rate": 4.9670821998146474e-05, + "loss": 5.2994, + "step": 8696 + }, + { + "epoch": 0.051723522694833, + "grad_norm": 2.1301774978637695, + "learning_rate": 4.9670746443822006e-05, + "loss": 5.7935, + "step": 8697 + }, + { + "epoch": 0.051729469978114, + "grad_norm": 1.9465678930282593, + "learning_rate": 4.9670670880885225e-05, + "loss": 5.1861, + "step": 8698 + }, + { + "epoch": 0.051735417261394995, + "grad_norm": 2.177234411239624, + "learning_rate": 4.967059530933618e-05, + "loss": 5.1114, + "step": 8699 + }, + { + "epoch": 0.05174136454467599, + "grad_norm": 2.0886077880859375, + "learning_rate": 4.967051972917488e-05, + "loss": 5.2905, + "step": 8700 + }, + { + "epoch": 0.05174731182795699, + "grad_norm": 1.8517125844955444, + "learning_rate": 4.967044414040136e-05, + "loss": 5.1672, + "step": 8701 + }, + { + "epoch": 0.05175325911123799, + "grad_norm": 1.7342808246612549, + "learning_rate": 4.967036854301564e-05, + "loss": 5.2767, + "step": 8702 + }, + { + "epoch": 0.05175920639451898, + "grad_norm": 1.7315362691879272, + "learning_rate": 4.9670292937017746e-05, + "loss": 5.2897, + "step": 8703 + }, + { + "epoch": 0.051765153677799984, + "grad_norm": 1.8794540166854858, + "learning_rate": 4.967021732240772e-05, + "loss": 5.3808, + "step": 8704 + }, + { + "epoch": 0.05177110096108098, + "grad_norm": 1.8047478199005127, + "learning_rate": 4.9670141699185565e-05, + "loss": 5.1074, + "step": 8705 + }, + { + "epoch": 0.051777048244361974, + "grad_norm": 1.699475884437561, + "learning_rate": 4.967006606735132e-05, + "loss": 5.8162, + "step": 8706 + }, + { + "epoch": 0.05178299552764297, + "grad_norm": 2.008352518081665, + "learning_rate": 4.966999042690501e-05, + "loss": 6.3593, + "step": 8707 + }, + { + "epoch": 0.05178894281092397, + "grad_norm": 1.8776370286941528, + "learning_rate": 4.966991477784667e-05, + "loss": 6.3419, + "step": 8708 + }, + { + "epoch": 0.051794890094204966, + "grad_norm": 2.018157720565796, + "learning_rate": 4.9669839120176306e-05, + "loss": 6.1927, + "step": 8709 + }, + { + "epoch": 0.05180083737748596, + "grad_norm": 1.833764910697937, + "learning_rate": 4.966976345389396e-05, + "loss": 5.0803, + "step": 8710 + }, + { + "epoch": 0.051806784660766964, + "grad_norm": 1.7809339761734009, + "learning_rate": 4.9669687778999655e-05, + "loss": 5.3891, + "step": 8711 + }, + { + "epoch": 0.05181273194404796, + "grad_norm": 1.9905017614364624, + "learning_rate": 4.966961209549341e-05, + "loss": 6.247, + "step": 8712 + }, + { + "epoch": 0.051818679227328954, + "grad_norm": 2.1396658420562744, + "learning_rate": 4.966953640337527e-05, + "loss": 6.2506, + "step": 8713 + }, + { + "epoch": 0.051824626510609956, + "grad_norm": 1.778996467590332, + "learning_rate": 4.9669460702645244e-05, + "loss": 6.1333, + "step": 8714 + }, + { + "epoch": 0.05183057379389095, + "grad_norm": 1.9936842918395996, + "learning_rate": 4.9669384993303366e-05, + "loss": 5.6486, + "step": 8715 + }, + { + "epoch": 0.051836521077171946, + "grad_norm": 1.8064475059509277, + "learning_rate": 4.9669309275349656e-05, + "loss": 6.1217, + "step": 8716 + }, + { + "epoch": 0.05184246836045295, + "grad_norm": 1.9532819986343384, + "learning_rate": 4.966923354878414e-05, + "loss": 5.5402, + "step": 8717 + }, + { + "epoch": 0.05184841564373394, + "grad_norm": 2.4843015670776367, + "learning_rate": 4.966915781360686e-05, + "loss": 4.7674, + "step": 8718 + }, + { + "epoch": 0.05185436292701494, + "grad_norm": 2.7453129291534424, + "learning_rate": 4.9669082069817835e-05, + "loss": 4.4489, + "step": 8719 + }, + { + "epoch": 0.05186031021029594, + "grad_norm": 3.0180628299713135, + "learning_rate": 4.9669006317417084e-05, + "loss": 4.1401, + "step": 8720 + }, + { + "epoch": 0.051866257493576935, + "grad_norm": 2.44638991355896, + "learning_rate": 4.966893055640464e-05, + "loss": 4.7241, + "step": 8721 + }, + { + "epoch": 0.05187220477685793, + "grad_norm": 2.0131804943084717, + "learning_rate": 4.9668854786780514e-05, + "loss": 5.6495, + "step": 8722 + }, + { + "epoch": 0.051878152060138925, + "grad_norm": 2.0331337451934814, + "learning_rate": 4.966877900854476e-05, + "loss": 5.6812, + "step": 8723 + }, + { + "epoch": 0.05188409934341993, + "grad_norm": 2.5784926414489746, + "learning_rate": 4.9668703221697385e-05, + "loss": 5.3617, + "step": 8724 + }, + { + "epoch": 0.05189004662670092, + "grad_norm": 2.599321126937866, + "learning_rate": 4.9668627426238425e-05, + "loss": 5.6273, + "step": 8725 + }, + { + "epoch": 0.05189599390998192, + "grad_norm": 2.53541898727417, + "learning_rate": 4.966855162216789e-05, + "loss": 5.2916, + "step": 8726 + }, + { + "epoch": 0.05190194119326292, + "grad_norm": 2.165160655975342, + "learning_rate": 4.9668475809485825e-05, + "loss": 5.6152, + "step": 8727 + }, + { + "epoch": 0.051907888476543915, + "grad_norm": 2.4488654136657715, + "learning_rate": 4.966839998819225e-05, + "loss": 5.4163, + "step": 8728 + }, + { + "epoch": 0.05191383575982491, + "grad_norm": 2.2756056785583496, + "learning_rate": 4.96683241582872e-05, + "loss": 5.9449, + "step": 8729 + }, + { + "epoch": 0.05191978304310591, + "grad_norm": 2.7889063358306885, + "learning_rate": 4.9668248319770683e-05, + "loss": 5.9502, + "step": 8730 + }, + { + "epoch": 0.05192573032638691, + "grad_norm": 2.620378255844116, + "learning_rate": 4.9668172472642735e-05, + "loss": 4.8344, + "step": 8731 + }, + { + "epoch": 0.0519316776096679, + "grad_norm": 2.2405688762664795, + "learning_rate": 4.9668096616903395e-05, + "loss": 5.598, + "step": 8732 + }, + { + "epoch": 0.051937624892948904, + "grad_norm": 2.3559701442718506, + "learning_rate": 4.9668020752552664e-05, + "loss": 5.7951, + "step": 8733 + }, + { + "epoch": 0.0519435721762299, + "grad_norm": 1.9856364727020264, + "learning_rate": 4.966794487959058e-05, + "loss": 5.3907, + "step": 8734 + }, + { + "epoch": 0.051949519459510894, + "grad_norm": 2.345541000366211, + "learning_rate": 4.966786899801718e-05, + "loss": 5.9875, + "step": 8735 + }, + { + "epoch": 0.05195546674279189, + "grad_norm": 2.4069056510925293, + "learning_rate": 4.9667793107832485e-05, + "loss": 6.0062, + "step": 8736 + }, + { + "epoch": 0.05196141402607289, + "grad_norm": 1.9191378355026245, + "learning_rate": 4.966771720903651e-05, + "loss": 6.1341, + "step": 8737 + }, + { + "epoch": 0.051967361309353886, + "grad_norm": 2.135986089706421, + "learning_rate": 4.9667641301629284e-05, + "loss": 5.6993, + "step": 8738 + }, + { + "epoch": 0.05197330859263488, + "grad_norm": 2.0774824619293213, + "learning_rate": 4.966756538561085e-05, + "loss": 5.9791, + "step": 8739 + }, + { + "epoch": 0.051979255875915883, + "grad_norm": 2.1451659202575684, + "learning_rate": 4.9667489460981224e-05, + "loss": 5.8181, + "step": 8740 + }, + { + "epoch": 0.05198520315919688, + "grad_norm": 2.2769901752471924, + "learning_rate": 4.966741352774043e-05, + "loss": 5.6799, + "step": 8741 + }, + { + "epoch": 0.051991150442477874, + "grad_norm": 2.22038197517395, + "learning_rate": 4.9667337585888494e-05, + "loss": 5.8781, + "step": 8742 + }, + { + "epoch": 0.051997097725758876, + "grad_norm": 2.417508125305176, + "learning_rate": 4.9667261635425446e-05, + "loss": 5.3458, + "step": 8743 + }, + { + "epoch": 0.05200304500903987, + "grad_norm": 2.0334360599517822, + "learning_rate": 4.966718567635131e-05, + "loss": 5.5241, + "step": 8744 + }, + { + "epoch": 0.052008992292320866, + "grad_norm": 2.3476316928863525, + "learning_rate": 4.9667109708666126e-05, + "loss": 5.8786, + "step": 8745 + }, + { + "epoch": 0.05201493957560187, + "grad_norm": 2.160106897354126, + "learning_rate": 4.96670337323699e-05, + "loss": 5.616, + "step": 8746 + }, + { + "epoch": 0.05202088685888286, + "grad_norm": 2.0048086643218994, + "learning_rate": 4.9666957747462665e-05, + "loss": 5.5787, + "step": 8747 + }, + { + "epoch": 0.05202683414216386, + "grad_norm": 2.9226925373077393, + "learning_rate": 4.966688175394446e-05, + "loss": 5.3708, + "step": 8748 + }, + { + "epoch": 0.05203278142544486, + "grad_norm": 1.9020568132400513, + "learning_rate": 4.9666805751815294e-05, + "loss": 5.6037, + "step": 8749 + }, + { + "epoch": 0.052038728708725855, + "grad_norm": 2.218637466430664, + "learning_rate": 4.966672974107519e-05, + "loss": 5.2983, + "step": 8750 + }, + { + "epoch": 0.05204467599200685, + "grad_norm": 2.906625270843506, + "learning_rate": 4.96666537217242e-05, + "loss": 5.1234, + "step": 8751 + }, + { + "epoch": 0.052050623275287845, + "grad_norm": 2.0095551013946533, + "learning_rate": 4.966657769376234e-05, + "loss": 5.2695, + "step": 8752 + }, + { + "epoch": 0.05205657055856885, + "grad_norm": 2.1369643211364746, + "learning_rate": 4.966650165718963e-05, + "loss": 5.5426, + "step": 8753 + }, + { + "epoch": 0.05206251784184984, + "grad_norm": 2.4762122631073, + "learning_rate": 4.966642561200608e-05, + "loss": 5.5595, + "step": 8754 + }, + { + "epoch": 0.05206846512513084, + "grad_norm": 2.199430227279663, + "learning_rate": 4.966634955821176e-05, + "loss": 5.5155, + "step": 8755 + }, + { + "epoch": 0.05207441240841184, + "grad_norm": 2.132460355758667, + "learning_rate": 4.966627349580666e-05, + "loss": 5.5344, + "step": 8756 + }, + { + "epoch": 0.052080359691692835, + "grad_norm": 2.4437100887298584, + "learning_rate": 4.966619742479082e-05, + "loss": 5.0135, + "step": 8757 + }, + { + "epoch": 0.05208630697497383, + "grad_norm": 1.5223499536514282, + "learning_rate": 4.9666121345164265e-05, + "loss": 5.5467, + "step": 8758 + }, + { + "epoch": 0.05209225425825483, + "grad_norm": 2.101797580718994, + "learning_rate": 4.966604525692702e-05, + "loss": 5.9493, + "step": 8759 + }, + { + "epoch": 0.05209820154153583, + "grad_norm": 1.9338927268981934, + "learning_rate": 4.966596916007912e-05, + "loss": 5.6625, + "step": 8760 + }, + { + "epoch": 0.05210414882481682, + "grad_norm": 2.1328654289245605, + "learning_rate": 4.966589305462058e-05, + "loss": 6.3202, + "step": 8761 + }, + { + "epoch": 0.052110096108097824, + "grad_norm": 1.963287115097046, + "learning_rate": 4.9665816940551434e-05, + "loss": 5.8885, + "step": 8762 + }, + { + "epoch": 0.05211604339137882, + "grad_norm": 2.124155282974243, + "learning_rate": 4.96657408178717e-05, + "loss": 5.6015, + "step": 8763 + }, + { + "epoch": 0.052121990674659814, + "grad_norm": 2.1011505126953125, + "learning_rate": 4.966566468658142e-05, + "loss": 5.7786, + "step": 8764 + }, + { + "epoch": 0.05212793795794081, + "grad_norm": 1.769573450088501, + "learning_rate": 4.966558854668061e-05, + "loss": 5.8229, + "step": 8765 + }, + { + "epoch": 0.05213388524122181, + "grad_norm": 1.7712751626968384, + "learning_rate": 4.966551239816929e-05, + "loss": 5.733, + "step": 8766 + }, + { + "epoch": 0.052139832524502806, + "grad_norm": 1.68185555934906, + "learning_rate": 4.9665436241047503e-05, + "loss": 6.015, + "step": 8767 + }, + { + "epoch": 0.0521457798077838, + "grad_norm": 1.8619519472122192, + "learning_rate": 4.966536007531526e-05, + "loss": 5.9545, + "step": 8768 + }, + { + "epoch": 0.0521517270910648, + "grad_norm": 1.6538097858428955, + "learning_rate": 4.96652839009726e-05, + "loss": 5.6138, + "step": 8769 + }, + { + "epoch": 0.0521576743743458, + "grad_norm": 1.721737027168274, + "learning_rate": 4.966520771801955e-05, + "loss": 6.0001, + "step": 8770 + }, + { + "epoch": 0.052163621657626794, + "grad_norm": 1.8449060916900635, + "learning_rate": 4.966513152645612e-05, + "loss": 5.6811, + "step": 8771 + }, + { + "epoch": 0.052169568940907796, + "grad_norm": 2.3810017108917236, + "learning_rate": 4.966505532628235e-05, + "loss": 5.4662, + "step": 8772 + }, + { + "epoch": 0.05217551622418879, + "grad_norm": 2.9262144565582275, + "learning_rate": 4.9664979117498265e-05, + "loss": 5.3555, + "step": 8773 + }, + { + "epoch": 0.052181463507469786, + "grad_norm": 2.1560001373291016, + "learning_rate": 4.966490290010389e-05, + "loss": 5.988, + "step": 8774 + }, + { + "epoch": 0.05218741079075079, + "grad_norm": 1.8220587968826294, + "learning_rate": 4.966482667409925e-05, + "loss": 5.8334, + "step": 8775 + }, + { + "epoch": 0.05219335807403178, + "grad_norm": 2.393651008605957, + "learning_rate": 4.9664750439484375e-05, + "loss": 5.5866, + "step": 8776 + }, + { + "epoch": 0.05219930535731278, + "grad_norm": 2.193864583969116, + "learning_rate": 4.966467419625929e-05, + "loss": 5.6642, + "step": 8777 + }, + { + "epoch": 0.05220525264059378, + "grad_norm": 2.24094820022583, + "learning_rate": 4.966459794442403e-05, + "loss": 5.7149, + "step": 8778 + }, + { + "epoch": 0.052211199923874775, + "grad_norm": 2.447439670562744, + "learning_rate": 4.9664521683978606e-05, + "loss": 5.4759, + "step": 8779 + }, + { + "epoch": 0.05221714720715577, + "grad_norm": 1.9538700580596924, + "learning_rate": 4.9664445414923055e-05, + "loss": 5.7, + "step": 8780 + }, + { + "epoch": 0.052223094490436765, + "grad_norm": 1.8960500955581665, + "learning_rate": 4.966436913725739e-05, + "loss": 5.7852, + "step": 8781 + }, + { + "epoch": 0.05222904177371777, + "grad_norm": 1.9234421253204346, + "learning_rate": 4.966429285098166e-05, + "loss": 5.9842, + "step": 8782 + }, + { + "epoch": 0.05223498905699876, + "grad_norm": 2.2879858016967773, + "learning_rate": 4.966421655609588e-05, + "loss": 5.6572, + "step": 8783 + }, + { + "epoch": 0.05224093634027976, + "grad_norm": 2.287932872772217, + "learning_rate": 4.966414025260008e-05, + "loss": 6.0675, + "step": 8784 + }, + { + "epoch": 0.05224688362356076, + "grad_norm": 1.6395118236541748, + "learning_rate": 4.9664063940494275e-05, + "loss": 5.6846, + "step": 8785 + }, + { + "epoch": 0.052252830906841755, + "grad_norm": 1.7121644020080566, + "learning_rate": 4.966398761977851e-05, + "loss": 5.7014, + "step": 8786 + }, + { + "epoch": 0.05225877819012275, + "grad_norm": 1.6225544214248657, + "learning_rate": 4.966391129045279e-05, + "loss": 5.6152, + "step": 8787 + }, + { + "epoch": 0.05226472547340375, + "grad_norm": 1.8484382629394531, + "learning_rate": 4.966383495251716e-05, + "loss": 5.8109, + "step": 8788 + }, + { + "epoch": 0.05227067275668475, + "grad_norm": 1.8225692510604858, + "learning_rate": 4.966375860597164e-05, + "loss": 6.0587, + "step": 8789 + }, + { + "epoch": 0.05227662003996574, + "grad_norm": 2.0333876609802246, + "learning_rate": 4.9663682250816255e-05, + "loss": 6.1406, + "step": 8790 + }, + { + "epoch": 0.052282567323246744, + "grad_norm": 2.0004124641418457, + "learning_rate": 4.9663605887051036e-05, + "loss": 5.6227, + "step": 8791 + }, + { + "epoch": 0.05228851460652774, + "grad_norm": 1.723655343055725, + "learning_rate": 4.9663529514676005e-05, + "loss": 5.5013, + "step": 8792 + }, + { + "epoch": 0.052294461889808734, + "grad_norm": 1.8351995944976807, + "learning_rate": 4.966345313369119e-05, + "loss": 5.3327, + "step": 8793 + }, + { + "epoch": 0.05230040917308973, + "grad_norm": 1.7514569759368896, + "learning_rate": 4.9663376744096615e-05, + "loss": 5.235, + "step": 8794 + }, + { + "epoch": 0.05230635645637073, + "grad_norm": 1.6678166389465332, + "learning_rate": 4.966330034589232e-05, + "loss": 5.2269, + "step": 8795 + }, + { + "epoch": 0.052312303739651726, + "grad_norm": 1.82132887840271, + "learning_rate": 4.9663223939078315e-05, + "loss": 5.0288, + "step": 8796 + }, + { + "epoch": 0.05231825102293272, + "grad_norm": 1.7815704345703125, + "learning_rate": 4.966314752365463e-05, + "loss": 5.4489, + "step": 8797 + }, + { + "epoch": 0.05232419830621372, + "grad_norm": 2.5268197059631348, + "learning_rate": 4.96630710996213e-05, + "loss": 5.0321, + "step": 8798 + }, + { + "epoch": 0.05233014558949472, + "grad_norm": 2.921208620071411, + "learning_rate": 4.9662994666978346e-05, + "loss": 5.0826, + "step": 8799 + }, + { + "epoch": 0.052336092872775714, + "grad_norm": 2.83243727684021, + "learning_rate": 4.9662918225725794e-05, + "loss": 4.9754, + "step": 8800 + }, + { + "epoch": 0.052342040156056716, + "grad_norm": 2.960346221923828, + "learning_rate": 4.966284177586368e-05, + "loss": 5.5808, + "step": 8801 + }, + { + "epoch": 0.05234798743933771, + "grad_norm": 2.479055643081665, + "learning_rate": 4.966276531739201e-05, + "loss": 5.3779, + "step": 8802 + }, + { + "epoch": 0.052353934722618706, + "grad_norm": 2.8753128051757812, + "learning_rate": 4.966268885031083e-05, + "loss": 5.4023, + "step": 8803 + }, + { + "epoch": 0.05235988200589971, + "grad_norm": 2.1152822971343994, + "learning_rate": 4.966261237462016e-05, + "loss": 6.1181, + "step": 8804 + }, + { + "epoch": 0.0523658292891807, + "grad_norm": 2.7178313732147217, + "learning_rate": 4.966253589032003e-05, + "loss": 5.1597, + "step": 8805 + }, + { + "epoch": 0.0523717765724617, + "grad_norm": 2.6567695140838623, + "learning_rate": 4.966245939741045e-05, + "loss": 5.0582, + "step": 8806 + }, + { + "epoch": 0.0523777238557427, + "grad_norm": 3.0211431980133057, + "learning_rate": 4.966238289589147e-05, + "loss": 4.8331, + "step": 8807 + }, + { + "epoch": 0.052383671139023695, + "grad_norm": 2.9341561794281006, + "learning_rate": 4.9662306385763114e-05, + "loss": 4.8482, + "step": 8808 + }, + { + "epoch": 0.05238961842230469, + "grad_norm": 2.781118631362915, + "learning_rate": 4.966222986702539e-05, + "loss": 4.9199, + "step": 8809 + }, + { + "epoch": 0.052395565705585685, + "grad_norm": 2.459233283996582, + "learning_rate": 4.9662153339678344e-05, + "loss": 5.4156, + "step": 8810 + }, + { + "epoch": 0.05240151298886669, + "grad_norm": 1.9862231016159058, + "learning_rate": 4.966207680372199e-05, + "loss": 5.3937, + "step": 8811 + }, + { + "epoch": 0.05240746027214768, + "grad_norm": 3.3698437213897705, + "learning_rate": 4.966200025915636e-05, + "loss": 4.6231, + "step": 8812 + }, + { + "epoch": 0.05241340755542868, + "grad_norm": 2.9254424571990967, + "learning_rate": 4.9661923705981486e-05, + "loss": 4.5612, + "step": 8813 + }, + { + "epoch": 0.05241935483870968, + "grad_norm": 2.684386968612671, + "learning_rate": 4.966184714419738e-05, + "loss": 4.8646, + "step": 8814 + }, + { + "epoch": 0.052425302121990675, + "grad_norm": 2.812406539916992, + "learning_rate": 4.966177057380409e-05, + "loss": 4.5116, + "step": 8815 + }, + { + "epoch": 0.05243124940527167, + "grad_norm": 2.1739046573638916, + "learning_rate": 4.966169399480162e-05, + "loss": 5.3369, + "step": 8816 + }, + { + "epoch": 0.05243719668855267, + "grad_norm": 2.408341407775879, + "learning_rate": 4.966161740719001e-05, + "loss": 5.0368, + "step": 8817 + }, + { + "epoch": 0.05244314397183367, + "grad_norm": 2.2844927310943604, + "learning_rate": 4.966154081096929e-05, + "loss": 5.0657, + "step": 8818 + }, + { + "epoch": 0.05244909125511466, + "grad_norm": 2.5329723358154297, + "learning_rate": 4.9661464206139475e-05, + "loss": 5.2006, + "step": 8819 + }, + { + "epoch": 0.052455038538395664, + "grad_norm": 2.154224395751953, + "learning_rate": 4.9661387592700595e-05, + "loss": 5.238, + "step": 8820 + }, + { + "epoch": 0.05246098582167666, + "grad_norm": 2.1069657802581787, + "learning_rate": 4.966131097065269e-05, + "loss": 5.0894, + "step": 8821 + }, + { + "epoch": 0.052466933104957654, + "grad_norm": 2.165954351425171, + "learning_rate": 4.9661234339995763e-05, + "loss": 5.1148, + "step": 8822 + }, + { + "epoch": 0.052472880388238656, + "grad_norm": 1.8859459161758423, + "learning_rate": 4.9661157700729866e-05, + "loss": 5.1703, + "step": 8823 + }, + { + "epoch": 0.05247882767151965, + "grad_norm": 1.9739452600479126, + "learning_rate": 4.9661081052855004e-05, + "loss": 5.3978, + "step": 8824 + }, + { + "epoch": 0.052484774954800646, + "grad_norm": 1.95566987991333, + "learning_rate": 4.966100439637122e-05, + "loss": 5.3592, + "step": 8825 + }, + { + "epoch": 0.05249072223808164, + "grad_norm": 1.8613550662994385, + "learning_rate": 4.966092773127853e-05, + "loss": 5.3746, + "step": 8826 + }, + { + "epoch": 0.05249666952136264, + "grad_norm": 2.001701831817627, + "learning_rate": 4.9660851057576966e-05, + "loss": 5.3269, + "step": 8827 + }, + { + "epoch": 0.05250261680464364, + "grad_norm": 1.8846383094787598, + "learning_rate": 4.9660774375266556e-05, + "loss": 5.7906, + "step": 8828 + }, + { + "epoch": 0.052508564087924633, + "grad_norm": 1.982998251914978, + "learning_rate": 4.966069768434732e-05, + "loss": 5.6609, + "step": 8829 + }, + { + "epoch": 0.052514511371205636, + "grad_norm": 2.3036038875579834, + "learning_rate": 4.9660620984819294e-05, + "loss": 5.6172, + "step": 8830 + }, + { + "epoch": 0.05252045865448663, + "grad_norm": 1.9227113723754883, + "learning_rate": 4.9660544276682496e-05, + "loss": 5.4734, + "step": 8831 + }, + { + "epoch": 0.052526405937767626, + "grad_norm": 2.038203716278076, + "learning_rate": 4.9660467559936964e-05, + "loss": 5.6484, + "step": 8832 + }, + { + "epoch": 0.05253235322104863, + "grad_norm": 2.217108964920044, + "learning_rate": 4.9660390834582704e-05, + "loss": 5.4064, + "step": 8833 + }, + { + "epoch": 0.05253830050432962, + "grad_norm": 2.4458765983581543, + "learning_rate": 4.966031410061976e-05, + "loss": 5.605, + "step": 8834 + }, + { + "epoch": 0.05254424778761062, + "grad_norm": 2.2767014503479004, + "learning_rate": 4.966023735804817e-05, + "loss": 5.4258, + "step": 8835 + }, + { + "epoch": 0.05255019507089162, + "grad_norm": 2.3594579696655273, + "learning_rate": 4.9660160606867936e-05, + "loss": 5.5138, + "step": 8836 + }, + { + "epoch": 0.052556142354172615, + "grad_norm": 1.8961461782455444, + "learning_rate": 4.966008384707909e-05, + "loss": 5.9879, + "step": 8837 + }, + { + "epoch": 0.05256208963745361, + "grad_norm": 1.824751615524292, + "learning_rate": 4.966000707868167e-05, + "loss": 5.4558, + "step": 8838 + }, + { + "epoch": 0.052568036920734605, + "grad_norm": 2.005291223526001, + "learning_rate": 4.9659930301675694e-05, + "loss": 5.821, + "step": 8839 + }, + { + "epoch": 0.05257398420401561, + "grad_norm": 2.0951414108276367, + "learning_rate": 4.965985351606119e-05, + "loss": 5.2816, + "step": 8840 + }, + { + "epoch": 0.0525799314872966, + "grad_norm": 2.236849069595337, + "learning_rate": 4.9659776721838194e-05, + "loss": 5.4734, + "step": 8841 + }, + { + "epoch": 0.0525858787705776, + "grad_norm": 1.8877390623092651, + "learning_rate": 4.965969991900671e-05, + "loss": 5.2445, + "step": 8842 + }, + { + "epoch": 0.0525918260538586, + "grad_norm": 2.726071834564209, + "learning_rate": 4.9659623107566785e-05, + "loss": 5.6059, + "step": 8843 + }, + { + "epoch": 0.052597773337139594, + "grad_norm": 2.279759168624878, + "learning_rate": 4.965954628751844e-05, + "loss": 5.6755, + "step": 8844 + }, + { + "epoch": 0.05260372062042059, + "grad_norm": 1.9941623210906982, + "learning_rate": 4.965946945886171e-05, + "loss": 5.5222, + "step": 8845 + }, + { + "epoch": 0.05260966790370159, + "grad_norm": 2.0556750297546387, + "learning_rate": 4.965939262159661e-05, + "loss": 5.6064, + "step": 8846 + }, + { + "epoch": 0.05261561518698259, + "grad_norm": 1.9260958433151245, + "learning_rate": 4.965931577572317e-05, + "loss": 5.6264, + "step": 8847 + }, + { + "epoch": 0.05262156247026358, + "grad_norm": 2.1252758502960205, + "learning_rate": 4.9659238921241413e-05, + "loss": 5.9832, + "step": 8848 + }, + { + "epoch": 0.052627509753544584, + "grad_norm": 1.8081480264663696, + "learning_rate": 4.9659162058151377e-05, + "loss": 5.4391, + "step": 8849 + }, + { + "epoch": 0.05263345703682558, + "grad_norm": 1.8439849615097046, + "learning_rate": 4.965908518645308e-05, + "loss": 5.5351, + "step": 8850 + }, + { + "epoch": 0.052639404320106574, + "grad_norm": 2.1782681941986084, + "learning_rate": 4.9659008306146556e-05, + "loss": 5.9692, + "step": 8851 + }, + { + "epoch": 0.052645351603387576, + "grad_norm": 2.0206944942474365, + "learning_rate": 4.965893141723182e-05, + "loss": 5.4736, + "step": 8852 + }, + { + "epoch": 0.05265129888666857, + "grad_norm": 2.283517360687256, + "learning_rate": 4.965885451970891e-05, + "loss": 5.4504, + "step": 8853 + }, + { + "epoch": 0.052657246169949566, + "grad_norm": 2.701608180999756, + "learning_rate": 4.965877761357784e-05, + "loss": 5.318, + "step": 8854 + }, + { + "epoch": 0.05266319345323056, + "grad_norm": 2.8494722843170166, + "learning_rate": 4.965870069883866e-05, + "loss": 4.9835, + "step": 8855 + }, + { + "epoch": 0.05266914073651156, + "grad_norm": 2.0555408000946045, + "learning_rate": 4.965862377549137e-05, + "loss": 5.7587, + "step": 8856 + }, + { + "epoch": 0.05267508801979256, + "grad_norm": 2.3476004600524902, + "learning_rate": 4.9658546843536014e-05, + "loss": 5.8775, + "step": 8857 + }, + { + "epoch": 0.05268103530307355, + "grad_norm": 1.8152700662612915, + "learning_rate": 4.965846990297262e-05, + "loss": 5.6274, + "step": 8858 + }, + { + "epoch": 0.052686982586354555, + "grad_norm": 2.1541671752929688, + "learning_rate": 4.965839295380119e-05, + "loss": 5.6786, + "step": 8859 + }, + { + "epoch": 0.05269292986963555, + "grad_norm": 2.1708984375, + "learning_rate": 4.965831599602179e-05, + "loss": 5.8817, + "step": 8860 + }, + { + "epoch": 0.052698877152916546, + "grad_norm": 1.6558966636657715, + "learning_rate": 4.9658239029634415e-05, + "loss": 5.5375, + "step": 8861 + }, + { + "epoch": 0.05270482443619755, + "grad_norm": 2.1165130138397217, + "learning_rate": 4.9658162054639115e-05, + "loss": 5.5936, + "step": 8862 + }, + { + "epoch": 0.05271077171947854, + "grad_norm": 2.4143176078796387, + "learning_rate": 4.9658085071035893e-05, + "loss": 5.71, + "step": 8863 + }, + { + "epoch": 0.05271671900275954, + "grad_norm": 1.9471622705459595, + "learning_rate": 4.965800807882479e-05, + "loss": 5.7588, + "step": 8864 + }, + { + "epoch": 0.05272266628604054, + "grad_norm": 2.2014408111572266, + "learning_rate": 4.9657931078005835e-05, + "loss": 5.7699, + "step": 8865 + }, + { + "epoch": 0.052728613569321535, + "grad_norm": 1.7588191032409668, + "learning_rate": 4.965785406857905e-05, + "loss": 5.3921, + "step": 8866 + }, + { + "epoch": 0.05273456085260253, + "grad_norm": 1.835635781288147, + "learning_rate": 4.965777705054446e-05, + "loss": 5.1531, + "step": 8867 + }, + { + "epoch": 0.052740508135883525, + "grad_norm": 2.3071937561035156, + "learning_rate": 4.96577000239021e-05, + "loss": 5.5926, + "step": 8868 + }, + { + "epoch": 0.05274645541916453, + "grad_norm": 2.195712089538574, + "learning_rate": 4.9657622988651995e-05, + "loss": 5.4579, + "step": 8869 + }, + { + "epoch": 0.05275240270244552, + "grad_norm": 2.273738145828247, + "learning_rate": 4.9657545944794156e-05, + "loss": 5.6138, + "step": 8870 + }, + { + "epoch": 0.05275834998572652, + "grad_norm": 2.208343982696533, + "learning_rate": 4.9657468892328626e-05, + "loss": 5.5508, + "step": 8871 + }, + { + "epoch": 0.05276429726900752, + "grad_norm": 2.2111566066741943, + "learning_rate": 4.965739183125544e-05, + "loss": 5.7044, + "step": 8872 + }, + { + "epoch": 0.052770244552288514, + "grad_norm": 1.7516666650772095, + "learning_rate": 4.96573147615746e-05, + "loss": 5.4357, + "step": 8873 + }, + { + "epoch": 0.05277619183556951, + "grad_norm": 2.0703322887420654, + "learning_rate": 4.9657237683286155e-05, + "loss": 5.5383, + "step": 8874 + }, + { + "epoch": 0.05278213911885051, + "grad_norm": 1.796243667602539, + "learning_rate": 4.965716059639012e-05, + "loss": 5.5024, + "step": 8875 + }, + { + "epoch": 0.05278808640213151, + "grad_norm": 2.322397232055664, + "learning_rate": 4.9657083500886526e-05, + "loss": 5.8814, + "step": 8876 + }, + { + "epoch": 0.0527940336854125, + "grad_norm": 2.6743311882019043, + "learning_rate": 4.96570063967754e-05, + "loss": 5.4989, + "step": 8877 + }, + { + "epoch": 0.052799980968693504, + "grad_norm": 2.4381649494171143, + "learning_rate": 4.965692928405676e-05, + "loss": 5.5807, + "step": 8878 + }, + { + "epoch": 0.0528059282519745, + "grad_norm": 2.3703296184539795, + "learning_rate": 4.9656852162730646e-05, + "loss": 5.5586, + "step": 8879 + }, + { + "epoch": 0.052811875535255494, + "grad_norm": 1.7828437089920044, + "learning_rate": 4.9656775032797075e-05, + "loss": 5.2553, + "step": 8880 + }, + { + "epoch": 0.052817822818536496, + "grad_norm": 1.730290412902832, + "learning_rate": 4.9656697894256085e-05, + "loss": 5.3558, + "step": 8881 + }, + { + "epoch": 0.05282377010181749, + "grad_norm": 1.6909739971160889, + "learning_rate": 4.9656620747107694e-05, + "loss": 5.4397, + "step": 8882 + }, + { + "epoch": 0.052829717385098486, + "grad_norm": 1.9772145748138428, + "learning_rate": 4.965654359135193e-05, + "loss": 5.5786, + "step": 8883 + }, + { + "epoch": 0.05283566466837948, + "grad_norm": 1.8624964952468872, + "learning_rate": 4.965646642698883e-05, + "loss": 5.5466, + "step": 8884 + }, + { + "epoch": 0.05284161195166048, + "grad_norm": 1.7061936855316162, + "learning_rate": 4.96563892540184e-05, + "loss": 5.3439, + "step": 8885 + }, + { + "epoch": 0.05284755923494148, + "grad_norm": 1.715483546257019, + "learning_rate": 4.965631207244069e-05, + "loss": 5.2732, + "step": 8886 + }, + { + "epoch": 0.05285350651822247, + "grad_norm": 1.7801883220672607, + "learning_rate": 4.965623488225571e-05, + "loss": 5.2427, + "step": 8887 + }, + { + "epoch": 0.052859453801503475, + "grad_norm": 1.5122452974319458, + "learning_rate": 4.9656157683463495e-05, + "loss": 5.2812, + "step": 8888 + }, + { + "epoch": 0.05286540108478447, + "grad_norm": 1.878077507019043, + "learning_rate": 4.965608047606407e-05, + "loss": 5.6385, + "step": 8889 + }, + { + "epoch": 0.052871348368065466, + "grad_norm": 2.0781304836273193, + "learning_rate": 4.965600326005746e-05, + "loss": 5.3345, + "step": 8890 + }, + { + "epoch": 0.05287729565134647, + "grad_norm": 1.953302264213562, + "learning_rate": 4.965592603544369e-05, + "loss": 5.2694, + "step": 8891 + }, + { + "epoch": 0.05288324293462746, + "grad_norm": 1.9993265867233276, + "learning_rate": 4.96558488022228e-05, + "loss": 5.3323, + "step": 8892 + }, + { + "epoch": 0.05288919021790846, + "grad_norm": 1.7653480768203735, + "learning_rate": 4.96557715603948e-05, + "loss": 5.389, + "step": 8893 + }, + { + "epoch": 0.05289513750118946, + "grad_norm": 1.8843438625335693, + "learning_rate": 4.965569430995973e-05, + "loss": 5.3334, + "step": 8894 + }, + { + "epoch": 0.052901084784470455, + "grad_norm": 1.6673407554626465, + "learning_rate": 4.9655617050917616e-05, + "loss": 5.4469, + "step": 8895 + }, + { + "epoch": 0.05290703206775145, + "grad_norm": 1.8208844661712646, + "learning_rate": 4.9655539783268476e-05, + "loss": 5.6288, + "step": 8896 + }, + { + "epoch": 0.052912979351032445, + "grad_norm": 1.755162000656128, + "learning_rate": 4.965546250701234e-05, + "loss": 5.4388, + "step": 8897 + }, + { + "epoch": 0.05291892663431345, + "grad_norm": 1.9435405731201172, + "learning_rate": 4.965538522214924e-05, + "loss": 5.5877, + "step": 8898 + }, + { + "epoch": 0.05292487391759444, + "grad_norm": 1.8579509258270264, + "learning_rate": 4.9655307928679196e-05, + "loss": 5.4405, + "step": 8899 + }, + { + "epoch": 0.05293082120087544, + "grad_norm": 1.8897236585617065, + "learning_rate": 4.9655230626602246e-05, + "loss": 5.2931, + "step": 8900 + }, + { + "epoch": 0.05293676848415644, + "grad_norm": 1.928133487701416, + "learning_rate": 4.9655153315918403e-05, + "loss": 5.2345, + "step": 8901 + }, + { + "epoch": 0.052942715767437434, + "grad_norm": 1.8830339908599854, + "learning_rate": 4.96550759966277e-05, + "loss": 5.3288, + "step": 8902 + }, + { + "epoch": 0.05294866305071843, + "grad_norm": 1.6774102449417114, + "learning_rate": 4.9654998668730167e-05, + "loss": 5.2939, + "step": 8903 + }, + { + "epoch": 0.05295461033399943, + "grad_norm": 1.7440418004989624, + "learning_rate": 4.9654921332225826e-05, + "loss": 5.4663, + "step": 8904 + }, + { + "epoch": 0.05296055761728043, + "grad_norm": 1.92295241355896, + "learning_rate": 4.965484398711471e-05, + "loss": 5.556, + "step": 8905 + }, + { + "epoch": 0.05296650490056142, + "grad_norm": 1.5319017171859741, + "learning_rate": 4.965476663339684e-05, + "loss": 5.5267, + "step": 8906 + }, + { + "epoch": 0.052972452183842424, + "grad_norm": 1.7626374959945679, + "learning_rate": 4.9654689271072255e-05, + "loss": 5.3774, + "step": 8907 + }, + { + "epoch": 0.05297839946712342, + "grad_norm": 1.745743989944458, + "learning_rate": 4.965461190014096e-05, + "loss": 5.4877, + "step": 8908 + }, + { + "epoch": 0.052984346750404414, + "grad_norm": 1.6091177463531494, + "learning_rate": 4.9654534520603e-05, + "loss": 5.2969, + "step": 8909 + }, + { + "epoch": 0.052990294033685416, + "grad_norm": 1.7392489910125732, + "learning_rate": 4.96544571324584e-05, + "loss": 5.4247, + "step": 8910 + }, + { + "epoch": 0.05299624131696641, + "grad_norm": 1.9275293350219727, + "learning_rate": 4.965437973570718e-05, + "loss": 5.2184, + "step": 8911 + }, + { + "epoch": 0.053002188600247406, + "grad_norm": 1.6901222467422485, + "learning_rate": 4.965430233034937e-05, + "loss": 5.1459, + "step": 8912 + }, + { + "epoch": 0.0530081358835284, + "grad_norm": 1.9212596416473389, + "learning_rate": 4.965422491638499e-05, + "loss": 5.2439, + "step": 8913 + }, + { + "epoch": 0.0530140831668094, + "grad_norm": 1.814706802368164, + "learning_rate": 4.965414749381409e-05, + "loss": 5.5608, + "step": 8914 + }, + { + "epoch": 0.0530200304500904, + "grad_norm": 1.7997081279754639, + "learning_rate": 4.965407006263668e-05, + "loss": 5.6099, + "step": 8915 + }, + { + "epoch": 0.05302597773337139, + "grad_norm": 1.8545546531677246, + "learning_rate": 4.9653992622852777e-05, + "loss": 5.5844, + "step": 8916 + }, + { + "epoch": 0.053031925016652395, + "grad_norm": 1.665958285331726, + "learning_rate": 4.965391517446243e-05, + "loss": 5.4967, + "step": 8917 + }, + { + "epoch": 0.05303787229993339, + "grad_norm": 1.6157240867614746, + "learning_rate": 4.9653837717465655e-05, + "loss": 5.2523, + "step": 8918 + }, + { + "epoch": 0.053043819583214386, + "grad_norm": 1.9782540798187256, + "learning_rate": 4.965376025186248e-05, + "loss": 5.2384, + "step": 8919 + }, + { + "epoch": 0.05304976686649539, + "grad_norm": 2.0229971408843994, + "learning_rate": 4.9653682777652925e-05, + "loss": 5.1703, + "step": 8920 + }, + { + "epoch": 0.05305571414977638, + "grad_norm": 1.8299061059951782, + "learning_rate": 4.965360529483703e-05, + "loss": 5.0257, + "step": 8921 + }, + { + "epoch": 0.05306166143305738, + "grad_norm": 1.9080857038497925, + "learning_rate": 4.965352780341482e-05, + "loss": 5.2516, + "step": 8922 + }, + { + "epoch": 0.05306760871633838, + "grad_norm": 1.9998538494110107, + "learning_rate": 4.965345030338631e-05, + "loss": 5.1991, + "step": 8923 + }, + { + "epoch": 0.053073555999619375, + "grad_norm": 1.7606618404388428, + "learning_rate": 4.965337279475154e-05, + "loss": 5.2194, + "step": 8924 + }, + { + "epoch": 0.05307950328290037, + "grad_norm": 1.9633625745773315, + "learning_rate": 4.9653295277510525e-05, + "loss": 5.2463, + "step": 8925 + }, + { + "epoch": 0.053085450566181365, + "grad_norm": 1.9879587888717651, + "learning_rate": 4.9653217751663306e-05, + "loss": 5.2737, + "step": 8926 + }, + { + "epoch": 0.05309139784946237, + "grad_norm": 1.836289405822754, + "learning_rate": 4.965314021720991e-05, + "loss": 5.1157, + "step": 8927 + }, + { + "epoch": 0.05309734513274336, + "grad_norm": 1.8526496887207031, + "learning_rate": 4.965306267415035e-05, + "loss": 5.6541, + "step": 8928 + }, + { + "epoch": 0.05310329241602436, + "grad_norm": 1.9928539991378784, + "learning_rate": 4.965298512248466e-05, + "loss": 5.194, + "step": 8929 + }, + { + "epoch": 0.05310923969930536, + "grad_norm": 1.601536512374878, + "learning_rate": 4.9652907562212867e-05, + "loss": 5.285, + "step": 8930 + }, + { + "epoch": 0.053115186982586354, + "grad_norm": 1.8940081596374512, + "learning_rate": 4.9652829993335e-05, + "loss": 5.1791, + "step": 8931 + }, + { + "epoch": 0.05312113426586735, + "grad_norm": 1.7984519004821777, + "learning_rate": 4.9652752415851085e-05, + "loss": 5.2225, + "step": 8932 + }, + { + "epoch": 0.05312708154914835, + "grad_norm": 1.7474113702774048, + "learning_rate": 4.965267482976115e-05, + "loss": 5.0099, + "step": 8933 + }, + { + "epoch": 0.053133028832429346, + "grad_norm": 1.7044427394866943, + "learning_rate": 4.9652597235065214e-05, + "loss": 5.1456, + "step": 8934 + }, + { + "epoch": 0.05313897611571034, + "grad_norm": 1.5422965288162231, + "learning_rate": 4.9652519631763316e-05, + "loss": 5.0714, + "step": 8935 + }, + { + "epoch": 0.053144923398991344, + "grad_norm": 1.6831375360488892, + "learning_rate": 4.965244201985548e-05, + "loss": 5.0742, + "step": 8936 + }, + { + "epoch": 0.05315087068227234, + "grad_norm": 1.7648097276687622, + "learning_rate": 4.9652364399341734e-05, + "loss": 5.1108, + "step": 8937 + }, + { + "epoch": 0.053156817965553334, + "grad_norm": 1.669393539428711, + "learning_rate": 4.965228677022209e-05, + "loss": 5.1801, + "step": 8938 + }, + { + "epoch": 0.053162765248834336, + "grad_norm": 2.0252909660339355, + "learning_rate": 4.96522091324966e-05, + "loss": 5.3955, + "step": 8939 + }, + { + "epoch": 0.05316871253211533, + "grad_norm": 1.686355710029602, + "learning_rate": 4.965213148616527e-05, + "loss": 5.2626, + "step": 8940 + }, + { + "epoch": 0.053174659815396326, + "grad_norm": 1.7601011991500854, + "learning_rate": 4.965205383122814e-05, + "loss": 5.1603, + "step": 8941 + }, + { + "epoch": 0.05318060709867732, + "grad_norm": 1.7249791622161865, + "learning_rate": 4.9651976167685235e-05, + "loss": 5.4245, + "step": 8942 + }, + { + "epoch": 0.05318655438195832, + "grad_norm": 1.869367003440857, + "learning_rate": 4.9651898495536574e-05, + "loss": 5.2269, + "step": 8943 + }, + { + "epoch": 0.05319250166523932, + "grad_norm": 1.8296380043029785, + "learning_rate": 4.965182081478219e-05, + "loss": 5.3236, + "step": 8944 + }, + { + "epoch": 0.05319844894852031, + "grad_norm": 1.8211008310317993, + "learning_rate": 4.9651743125422115e-05, + "loss": 5.269, + "step": 8945 + }, + { + "epoch": 0.053204396231801315, + "grad_norm": 1.868295431137085, + "learning_rate": 4.965166542745637e-05, + "loss": 5.2733, + "step": 8946 + }, + { + "epoch": 0.05321034351508231, + "grad_norm": 1.6603426933288574, + "learning_rate": 4.965158772088498e-05, + "loss": 5.2685, + "step": 8947 + }, + { + "epoch": 0.053216290798363305, + "grad_norm": 1.680565357208252, + "learning_rate": 4.965151000570798e-05, + "loss": 5.4452, + "step": 8948 + }, + { + "epoch": 0.05322223808164431, + "grad_norm": 1.6473147869110107, + "learning_rate": 4.9651432281925394e-05, + "loss": 5.4476, + "step": 8949 + }, + { + "epoch": 0.0532281853649253, + "grad_norm": 1.5291423797607422, + "learning_rate": 4.965135454953724e-05, + "loss": 5.4617, + "step": 8950 + }, + { + "epoch": 0.0532341326482063, + "grad_norm": 1.4708455801010132, + "learning_rate": 4.965127680854356e-05, + "loss": 5.5431, + "step": 8951 + }, + { + "epoch": 0.0532400799314873, + "grad_norm": 1.4297362565994263, + "learning_rate": 4.9651199058944366e-05, + "loss": 5.431, + "step": 8952 + }, + { + "epoch": 0.053246027214768295, + "grad_norm": 1.726123571395874, + "learning_rate": 4.96511213007397e-05, + "loss": 5.2801, + "step": 8953 + }, + { + "epoch": 0.05325197449804929, + "grad_norm": 1.7977174520492554, + "learning_rate": 4.9651043533929584e-05, + "loss": 5.3273, + "step": 8954 + }, + { + "epoch": 0.053257921781330285, + "grad_norm": 1.8125461339950562, + "learning_rate": 4.9650965758514034e-05, + "loss": 5.3135, + "step": 8955 + }, + { + "epoch": 0.05326386906461129, + "grad_norm": 1.4925352334976196, + "learning_rate": 4.965088797449309e-05, + "loss": 5.1454, + "step": 8956 + }, + { + "epoch": 0.05326981634789228, + "grad_norm": 1.6977181434631348, + "learning_rate": 4.965081018186678e-05, + "loss": 5.3207, + "step": 8957 + }, + { + "epoch": 0.05327576363117328, + "grad_norm": 1.7767595052719116, + "learning_rate": 4.965073238063512e-05, + "loss": 5.203, + "step": 8958 + }, + { + "epoch": 0.05328171091445428, + "grad_norm": 1.53665292263031, + "learning_rate": 4.965065457079815e-05, + "loss": 5.3088, + "step": 8959 + }, + { + "epoch": 0.053287658197735274, + "grad_norm": 1.724476933479309, + "learning_rate": 4.965057675235589e-05, + "loss": 5.2628, + "step": 8960 + }, + { + "epoch": 0.05329360548101627, + "grad_norm": 1.7339463233947754, + "learning_rate": 4.965049892530837e-05, + "loss": 5.3174, + "step": 8961 + }, + { + "epoch": 0.05329955276429727, + "grad_norm": 1.8414005041122437, + "learning_rate": 4.965042108965561e-05, + "loss": 5.2121, + "step": 8962 + }, + { + "epoch": 0.053305500047578266, + "grad_norm": 1.7969903945922852, + "learning_rate": 4.9650343245397655e-05, + "loss": 5.0947, + "step": 8963 + }, + { + "epoch": 0.05331144733085926, + "grad_norm": 1.573320746421814, + "learning_rate": 4.965026539253451e-05, + "loss": 5.0624, + "step": 8964 + }, + { + "epoch": 0.053317394614140264, + "grad_norm": 1.7296351194381714, + "learning_rate": 4.9650187531066204e-05, + "loss": 5.5497, + "step": 8965 + }, + { + "epoch": 0.05332334189742126, + "grad_norm": 1.931847095489502, + "learning_rate": 4.9650109660992784e-05, + "loss": 5.537, + "step": 8966 + }, + { + "epoch": 0.053329289180702254, + "grad_norm": 1.8911564350128174, + "learning_rate": 4.965003178231427e-05, + "loss": 5.4891, + "step": 8967 + }, + { + "epoch": 0.053335236463983256, + "grad_norm": 1.933401107788086, + "learning_rate": 4.964995389503067e-05, + "loss": 5.3157, + "step": 8968 + }, + { + "epoch": 0.05334118374726425, + "grad_norm": 1.8299031257629395, + "learning_rate": 4.964987599914204e-05, + "loss": 5.2955, + "step": 8969 + }, + { + "epoch": 0.053347131030545246, + "grad_norm": 1.5823233127593994, + "learning_rate": 4.964979809464838e-05, + "loss": 5.2708, + "step": 8970 + }, + { + "epoch": 0.05335307831382624, + "grad_norm": 1.602689504623413, + "learning_rate": 4.9649720181549737e-05, + "loss": 5.3646, + "step": 8971 + }, + { + "epoch": 0.05335902559710724, + "grad_norm": 2.2379884719848633, + "learning_rate": 4.964964225984613e-05, + "loss": 5.5453, + "step": 8972 + }, + { + "epoch": 0.05336497288038824, + "grad_norm": 2.2210440635681152, + "learning_rate": 4.964956432953759e-05, + "loss": 5.2123, + "step": 8973 + }, + { + "epoch": 0.05337092016366923, + "grad_norm": 2.4450249671936035, + "learning_rate": 4.964948639062413e-05, + "loss": 5.172, + "step": 8974 + }, + { + "epoch": 0.053376867446950235, + "grad_norm": 1.7727516889572144, + "learning_rate": 4.9649408443105806e-05, + "loss": 5.3447, + "step": 8975 + }, + { + "epoch": 0.05338281473023123, + "grad_norm": 1.8239831924438477, + "learning_rate": 4.964933048698262e-05, + "loss": 5.3628, + "step": 8976 + }, + { + "epoch": 0.053388762013512225, + "grad_norm": 1.9517360925674438, + "learning_rate": 4.964925252225461e-05, + "loss": 5.6118, + "step": 8977 + }, + { + "epoch": 0.05339470929679323, + "grad_norm": 2.1735262870788574, + "learning_rate": 4.9649174548921796e-05, + "loss": 5.7332, + "step": 8978 + }, + { + "epoch": 0.05340065658007422, + "grad_norm": 1.4132062196731567, + "learning_rate": 4.964909656698421e-05, + "loss": 5.8078, + "step": 8979 + }, + { + "epoch": 0.05340660386335522, + "grad_norm": 1.5568846464157104, + "learning_rate": 4.964901857644188e-05, + "loss": 5.6328, + "step": 8980 + }, + { + "epoch": 0.05341255114663622, + "grad_norm": 1.6015586853027344, + "learning_rate": 4.964894057729484e-05, + "loss": 5.3738, + "step": 8981 + }, + { + "epoch": 0.053418498429917215, + "grad_norm": 1.492748737335205, + "learning_rate": 4.9648862569543105e-05, + "loss": 5.4336, + "step": 8982 + }, + { + "epoch": 0.05342444571319821, + "grad_norm": 1.9008845090866089, + "learning_rate": 4.96487845531867e-05, + "loss": 5.455, + "step": 8983 + }, + { + "epoch": 0.053430392996479205, + "grad_norm": 1.9590948820114136, + "learning_rate": 4.9648706528225664e-05, + "loss": 5.3308, + "step": 8984 + }, + { + "epoch": 0.05343634027976021, + "grad_norm": 1.9980428218841553, + "learning_rate": 4.964862849466002e-05, + "loss": 5.3777, + "step": 8985 + }, + { + "epoch": 0.0534422875630412, + "grad_norm": 1.769711971282959, + "learning_rate": 4.964855045248979e-05, + "loss": 5.4451, + "step": 8986 + }, + { + "epoch": 0.0534482348463222, + "grad_norm": 1.769977331161499, + "learning_rate": 4.964847240171502e-05, + "loss": 5.277, + "step": 8987 + }, + { + "epoch": 0.0534541821296032, + "grad_norm": 1.6647396087646484, + "learning_rate": 4.9648394342335705e-05, + "loss": 5.4655, + "step": 8988 + }, + { + "epoch": 0.053460129412884194, + "grad_norm": 1.861554503440857, + "learning_rate": 4.9648316274351906e-05, + "loss": 5.308, + "step": 8989 + }, + { + "epoch": 0.05346607669616519, + "grad_norm": 1.9457745552062988, + "learning_rate": 4.964823819776362e-05, + "loss": 6.2361, + "step": 8990 + }, + { + "epoch": 0.05347202397944619, + "grad_norm": 1.7702157497406006, + "learning_rate": 4.9648160112570896e-05, + "loss": 5.366, + "step": 8991 + }, + { + "epoch": 0.053477971262727186, + "grad_norm": 2.0074565410614014, + "learning_rate": 4.964808201877375e-05, + "loss": 5.3598, + "step": 8992 + }, + { + "epoch": 0.05348391854600818, + "grad_norm": 1.8686721324920654, + "learning_rate": 4.964800391637222e-05, + "loss": 5.4607, + "step": 8993 + }, + { + "epoch": 0.053489865829289183, + "grad_norm": 1.9749736785888672, + "learning_rate": 4.964792580536632e-05, + "loss": 5.3734, + "step": 8994 + }, + { + "epoch": 0.05349581311257018, + "grad_norm": 1.8435015678405762, + "learning_rate": 4.964784768575609e-05, + "loss": 5.3815, + "step": 8995 + }, + { + "epoch": 0.053501760395851174, + "grad_norm": 2.01983380317688, + "learning_rate": 4.9647769557541546e-05, + "loss": 5.4089, + "step": 8996 + }, + { + "epoch": 0.053507707679132176, + "grad_norm": 2.014798402786255, + "learning_rate": 4.964769142072272e-05, + "loss": 5.3906, + "step": 8997 + }, + { + "epoch": 0.05351365496241317, + "grad_norm": 1.8822753429412842, + "learning_rate": 4.9647613275299644e-05, + "loss": 5.3598, + "step": 8998 + }, + { + "epoch": 0.053519602245694166, + "grad_norm": 1.6534459590911865, + "learning_rate": 4.9647535121272334e-05, + "loss": 5.4577, + "step": 8999 + }, + { + "epoch": 0.05352554952897516, + "grad_norm": 1.6497015953063965, + "learning_rate": 4.964745695864083e-05, + "loss": 5.3915, + "step": 9000 + }, + { + "epoch": 0.05353149681225616, + "grad_norm": 1.5535780191421509, + "learning_rate": 4.964737878740515e-05, + "loss": 5.2444, + "step": 9001 + }, + { + "epoch": 0.05353744409553716, + "grad_norm": 1.6840674877166748, + "learning_rate": 4.964730060756533e-05, + "loss": 5.3439, + "step": 9002 + }, + { + "epoch": 0.05354339137881815, + "grad_norm": 1.7857226133346558, + "learning_rate": 4.9647222419121384e-05, + "loss": 5.3231, + "step": 9003 + }, + { + "epoch": 0.053549338662099155, + "grad_norm": 1.6067994832992554, + "learning_rate": 4.964714422207335e-05, + "loss": 5.4019, + "step": 9004 + }, + { + "epoch": 0.05355528594538015, + "grad_norm": 1.7026724815368652, + "learning_rate": 4.964706601642125e-05, + "loss": 5.2716, + "step": 9005 + }, + { + "epoch": 0.053561233228661145, + "grad_norm": 1.632804036140442, + "learning_rate": 4.964698780216512e-05, + "loss": 5.4132, + "step": 9006 + }, + { + "epoch": 0.05356718051194215, + "grad_norm": 1.6569499969482422, + "learning_rate": 4.964690957930498e-05, + "loss": 5.294, + "step": 9007 + }, + { + "epoch": 0.05357312779522314, + "grad_norm": 1.8141810894012451, + "learning_rate": 4.964683134784086e-05, + "loss": 5.3365, + "step": 9008 + }, + { + "epoch": 0.05357907507850414, + "grad_norm": 1.6555678844451904, + "learning_rate": 4.964675310777278e-05, + "loss": 5.3488, + "step": 9009 + }, + { + "epoch": 0.05358502236178514, + "grad_norm": 1.8363603353500366, + "learning_rate": 4.964667485910078e-05, + "loss": 5.3679, + "step": 9010 + }, + { + "epoch": 0.053590969645066135, + "grad_norm": 1.7839024066925049, + "learning_rate": 4.9646596601824874e-05, + "loss": 5.2514, + "step": 9011 + }, + { + "epoch": 0.05359691692834713, + "grad_norm": 1.8712091445922852, + "learning_rate": 4.96465183359451e-05, + "loss": 5.4313, + "step": 9012 + }, + { + "epoch": 0.053602864211628125, + "grad_norm": 1.9677501916885376, + "learning_rate": 4.964644006146148e-05, + "loss": 5.2442, + "step": 9013 + }, + { + "epoch": 0.05360881149490913, + "grad_norm": 1.8567090034484863, + "learning_rate": 4.964636177837404e-05, + "loss": 5.105, + "step": 9014 + }, + { + "epoch": 0.05361475877819012, + "grad_norm": 1.7319908142089844, + "learning_rate": 4.964628348668281e-05, + "loss": 5.2962, + "step": 9015 + }, + { + "epoch": 0.05362070606147112, + "grad_norm": 1.6412272453308105, + "learning_rate": 4.9646205186387824e-05, + "loss": 5.2302, + "step": 9016 + }, + { + "epoch": 0.05362665334475212, + "grad_norm": 1.9401088953018188, + "learning_rate": 4.96461268774891e-05, + "loss": 5.4425, + "step": 9017 + }, + { + "epoch": 0.053632600628033114, + "grad_norm": 1.7045506238937378, + "learning_rate": 4.964604855998666e-05, + "loss": 5.2325, + "step": 9018 + }, + { + "epoch": 0.05363854791131411, + "grad_norm": 1.8232519626617432, + "learning_rate": 4.9645970233880545e-05, + "loss": 5.5047, + "step": 9019 + }, + { + "epoch": 0.05364449519459511, + "grad_norm": 1.718833327293396, + "learning_rate": 4.964589189917077e-05, + "loss": 5.3323, + "step": 9020 + }, + { + "epoch": 0.053650442477876106, + "grad_norm": 1.608774185180664, + "learning_rate": 4.9645813555857376e-05, + "loss": 5.2374, + "step": 9021 + }, + { + "epoch": 0.0536563897611571, + "grad_norm": 1.6789363622665405, + "learning_rate": 4.964573520394039e-05, + "loss": 5.3291, + "step": 9022 + }, + { + "epoch": 0.0536623370444381, + "grad_norm": 1.6596689224243164, + "learning_rate": 4.964565684341982e-05, + "loss": 5.308, + "step": 9023 + }, + { + "epoch": 0.0536682843277191, + "grad_norm": 1.8141522407531738, + "learning_rate": 4.9645578474295703e-05, + "loss": 5.2033, + "step": 9024 + }, + { + "epoch": 0.053674231611000094, + "grad_norm": 1.428606390953064, + "learning_rate": 4.964550009656808e-05, + "loss": 5.2441, + "step": 9025 + }, + { + "epoch": 0.053680178894281096, + "grad_norm": 1.5033652782440186, + "learning_rate": 4.9645421710236965e-05, + "loss": 5.2132, + "step": 9026 + }, + { + "epoch": 0.05368612617756209, + "grad_norm": 1.7123147249221802, + "learning_rate": 4.9645343315302385e-05, + "loss": 5.3145, + "step": 9027 + }, + { + "epoch": 0.053692073460843086, + "grad_norm": 1.5851943492889404, + "learning_rate": 4.9645264911764376e-05, + "loss": 5.353, + "step": 9028 + }, + { + "epoch": 0.05369802074412408, + "grad_norm": 1.6627084016799927, + "learning_rate": 4.964518649962295e-05, + "loss": 5.1049, + "step": 9029 + }, + { + "epoch": 0.05370396802740508, + "grad_norm": 1.51585853099823, + "learning_rate": 4.964510807887815e-05, + "loss": 4.9433, + "step": 9030 + }, + { + "epoch": 0.05370991531068608, + "grad_norm": 1.7350785732269287, + "learning_rate": 4.964502964952999e-05, + "loss": 5.1761, + "step": 9031 + }, + { + "epoch": 0.05371586259396707, + "grad_norm": 1.925410509109497, + "learning_rate": 4.964495121157852e-05, + "loss": 5.0528, + "step": 9032 + }, + { + "epoch": 0.053721809877248075, + "grad_norm": 1.794162631034851, + "learning_rate": 4.964487276502374e-05, + "loss": 5.2009, + "step": 9033 + }, + { + "epoch": 0.05372775716052907, + "grad_norm": 1.6729109287261963, + "learning_rate": 4.964479430986569e-05, + "loss": 5.16, + "step": 9034 + }, + { + "epoch": 0.053733704443810065, + "grad_norm": 1.8543394804000854, + "learning_rate": 4.9644715846104406e-05, + "loss": 5.3545, + "step": 9035 + }, + { + "epoch": 0.05373965172709107, + "grad_norm": 1.6876883506774902, + "learning_rate": 4.96446373737399e-05, + "loss": 5.2074, + "step": 9036 + }, + { + "epoch": 0.05374559901037206, + "grad_norm": 1.816701054573059, + "learning_rate": 4.9644558892772205e-05, + "loss": 5.154, + "step": 9037 + }, + { + "epoch": 0.05375154629365306, + "grad_norm": 1.471283197402954, + "learning_rate": 4.964448040320135e-05, + "loss": 5.2577, + "step": 9038 + }, + { + "epoch": 0.05375749357693406, + "grad_norm": 1.5764297246932983, + "learning_rate": 4.964440190502736e-05, + "loss": 5.0115, + "step": 9039 + }, + { + "epoch": 0.053763440860215055, + "grad_norm": 1.6854795217514038, + "learning_rate": 4.964432339825027e-05, + "loss": 5.1957, + "step": 9040 + }, + { + "epoch": 0.05376938814349605, + "grad_norm": 1.889570951461792, + "learning_rate": 4.964424488287009e-05, + "loss": 5.1229, + "step": 9041 + }, + { + "epoch": 0.05377533542677705, + "grad_norm": 1.7528218030929565, + "learning_rate": 4.964416635888687e-05, + "loss": 5.0002, + "step": 9042 + }, + { + "epoch": 0.05378128271005805, + "grad_norm": 1.68081796169281, + "learning_rate": 4.964408782630062e-05, + "loss": 5.0567, + "step": 9043 + }, + { + "epoch": 0.05378722999333904, + "grad_norm": 1.6083979606628418, + "learning_rate": 4.9644009285111384e-05, + "loss": 5.0775, + "step": 9044 + }, + { + "epoch": 0.05379317727662004, + "grad_norm": 1.676720380783081, + "learning_rate": 4.9643930735319164e-05, + "loss": 5.0446, + "step": 9045 + }, + { + "epoch": 0.05379912455990104, + "grad_norm": 1.6502453088760376, + "learning_rate": 4.964385217692401e-05, + "loss": 5.3751, + "step": 9046 + }, + { + "epoch": 0.053805071843182034, + "grad_norm": 1.9226343631744385, + "learning_rate": 4.9643773609925935e-05, + "loss": 5.2442, + "step": 9047 + }, + { + "epoch": 0.05381101912646303, + "grad_norm": 1.8054014444351196, + "learning_rate": 4.964369503432498e-05, + "loss": 5.4844, + "step": 9048 + }, + { + "epoch": 0.05381696640974403, + "grad_norm": 1.5151008367538452, + "learning_rate": 4.9643616450121166e-05, + "loss": 5.2834, + "step": 9049 + }, + { + "epoch": 0.053822913693025026, + "grad_norm": 2.0237820148468018, + "learning_rate": 4.964353785731452e-05, + "loss": 5.3166, + "step": 9050 + }, + { + "epoch": 0.05382886097630602, + "grad_norm": 2.145364999771118, + "learning_rate": 4.964345925590507e-05, + "loss": 5.3803, + "step": 9051 + }, + { + "epoch": 0.05383480825958702, + "grad_norm": 1.747369408607483, + "learning_rate": 4.964338064589284e-05, + "loss": 6.1041, + "step": 9052 + }, + { + "epoch": 0.05384075554286802, + "grad_norm": 1.9964301586151123, + "learning_rate": 4.964330202727786e-05, + "loss": 5.1707, + "step": 9053 + }, + { + "epoch": 0.053846702826149014, + "grad_norm": 1.630233645439148, + "learning_rate": 4.9643223400060155e-05, + "loss": 4.9385, + "step": 9054 + }, + { + "epoch": 0.053852650109430016, + "grad_norm": 1.5782960653305054, + "learning_rate": 4.9643144764239765e-05, + "loss": 4.9953, + "step": 9055 + }, + { + "epoch": 0.05385859739271101, + "grad_norm": 2.1511783599853516, + "learning_rate": 4.9643066119816706e-05, + "loss": 5.4329, + "step": 9056 + }, + { + "epoch": 0.053864544675992006, + "grad_norm": 2.2133493423461914, + "learning_rate": 4.9642987466791004e-05, + "loss": 5.7347, + "step": 9057 + }, + { + "epoch": 0.053870491959273, + "grad_norm": 1.7669782638549805, + "learning_rate": 4.9642908805162686e-05, + "loss": 5.4129, + "step": 9058 + }, + { + "epoch": 0.053876439242554, + "grad_norm": 1.8005794286727905, + "learning_rate": 4.9642830134931787e-05, + "loss": 5.2397, + "step": 9059 + }, + { + "epoch": 0.053882386525835, + "grad_norm": 1.697607398033142, + "learning_rate": 4.9642751456098325e-05, + "loss": 5.3388, + "step": 9060 + }, + { + "epoch": 0.05388833380911599, + "grad_norm": 1.4916869401931763, + "learning_rate": 4.9642672768662344e-05, + "loss": 5.2574, + "step": 9061 + }, + { + "epoch": 0.053894281092396995, + "grad_norm": 1.7112784385681152, + "learning_rate": 4.964259407262385e-05, + "loss": 4.9881, + "step": 9062 + }, + { + "epoch": 0.05390022837567799, + "grad_norm": 1.4831846952438354, + "learning_rate": 4.964251536798289e-05, + "loss": 5.3976, + "step": 9063 + }, + { + "epoch": 0.053906175658958985, + "grad_norm": 1.626370906829834, + "learning_rate": 4.9642436654739476e-05, + "loss": 5.2409, + "step": 9064 + }, + { + "epoch": 0.05391212294223999, + "grad_norm": 1.7369413375854492, + "learning_rate": 4.964235793289365e-05, + "loss": 5.2732, + "step": 9065 + }, + { + "epoch": 0.05391807022552098, + "grad_norm": 1.7028629779815674, + "learning_rate": 4.964227920244542e-05, + "loss": 5.3161, + "step": 9066 + }, + { + "epoch": 0.05392401750880198, + "grad_norm": 1.9031678438186646, + "learning_rate": 4.964220046339483e-05, + "loss": 5.2517, + "step": 9067 + }, + { + "epoch": 0.05392996479208298, + "grad_norm": 1.8210735321044922, + "learning_rate": 4.96421217157419e-05, + "loss": 5.2819, + "step": 9068 + }, + { + "epoch": 0.053935912075363975, + "grad_norm": 1.7334645986557007, + "learning_rate": 4.9642042959486666e-05, + "loss": 5.4296, + "step": 9069 + }, + { + "epoch": 0.05394185935864497, + "grad_norm": 1.732790231704712, + "learning_rate": 4.964196419462914e-05, + "loss": 5.3589, + "step": 9070 + }, + { + "epoch": 0.05394780664192597, + "grad_norm": 1.417751669883728, + "learning_rate": 4.964188542116937e-05, + "loss": 5.0958, + "step": 9071 + }, + { + "epoch": 0.05395375392520697, + "grad_norm": 1.8562361001968384, + "learning_rate": 4.964180663910737e-05, + "loss": 5.2622, + "step": 9072 + }, + { + "epoch": 0.05395970120848796, + "grad_norm": 1.7366154193878174, + "learning_rate": 4.9641727848443166e-05, + "loss": 5.2329, + "step": 9073 + }, + { + "epoch": 0.05396564849176896, + "grad_norm": 1.8587182760238647, + "learning_rate": 4.9641649049176785e-05, + "loss": 4.9392, + "step": 9074 + }, + { + "epoch": 0.05397159577504996, + "grad_norm": 1.6152398586273193, + "learning_rate": 4.964157024130827e-05, + "loss": 5.473, + "step": 9075 + }, + { + "epoch": 0.053977543058330954, + "grad_norm": 1.5967273712158203, + "learning_rate": 4.9641491424837626e-05, + "loss": 5.2877, + "step": 9076 + }, + { + "epoch": 0.05398349034161195, + "grad_norm": 1.4986391067504883, + "learning_rate": 4.96414125997649e-05, + "loss": 5.2163, + "step": 9077 + }, + { + "epoch": 0.05398943762489295, + "grad_norm": 1.563905119895935, + "learning_rate": 4.964133376609011e-05, + "loss": 5.2043, + "step": 9078 + }, + { + "epoch": 0.053995384908173946, + "grad_norm": 1.5690317153930664, + "learning_rate": 4.964125492381329e-05, + "loss": 5.2226, + "step": 9079 + }, + { + "epoch": 0.05400133219145494, + "grad_norm": 1.7732517719268799, + "learning_rate": 4.9641176072934446e-05, + "loss": 5.3123, + "step": 9080 + }, + { + "epoch": 0.05400727947473594, + "grad_norm": 1.7045226097106934, + "learning_rate": 4.964109721345364e-05, + "loss": 5.0872, + "step": 9081 + }, + { + "epoch": 0.05401322675801694, + "grad_norm": 1.6405664682388306, + "learning_rate": 4.964101834537087e-05, + "loss": 5.3863, + "step": 9082 + }, + { + "epoch": 0.054019174041297933, + "grad_norm": 1.7410979270935059, + "learning_rate": 4.964093946868618e-05, + "loss": 5.0952, + "step": 9083 + }, + { + "epoch": 0.054025121324578936, + "grad_norm": 2.0102951526641846, + "learning_rate": 4.964086058339959e-05, + "loss": 4.9484, + "step": 9084 + }, + { + "epoch": 0.05403106860785993, + "grad_norm": 1.8228510618209839, + "learning_rate": 4.9640781689511133e-05, + "loss": 5.1141, + "step": 9085 + }, + { + "epoch": 0.054037015891140926, + "grad_norm": 1.7363582849502563, + "learning_rate": 4.964070278702083e-05, + "loss": 5.1164, + "step": 9086 + }, + { + "epoch": 0.05404296317442192, + "grad_norm": 1.6060153245925903, + "learning_rate": 4.9640623875928714e-05, + "loss": 5.1746, + "step": 9087 + }, + { + "epoch": 0.05404891045770292, + "grad_norm": 1.6690374612808228, + "learning_rate": 4.9640544956234814e-05, + "loss": 5.0931, + "step": 9088 + }, + { + "epoch": 0.05405485774098392, + "grad_norm": 1.613527774810791, + "learning_rate": 4.964046602793916e-05, + "loss": 5.2224, + "step": 9089 + }, + { + "epoch": 0.05406080502426491, + "grad_norm": 1.6461642980575562, + "learning_rate": 4.964038709104176e-05, + "loss": 5.3175, + "step": 9090 + }, + { + "epoch": 0.054066752307545915, + "grad_norm": 1.839709758758545, + "learning_rate": 4.9640308145542664e-05, + "loss": 5.3247, + "step": 9091 + }, + { + "epoch": 0.05407269959082691, + "grad_norm": 1.8977348804473877, + "learning_rate": 4.9640229191441886e-05, + "loss": 5.4256, + "step": 9092 + }, + { + "epoch": 0.054078646874107905, + "grad_norm": 1.9805532693862915, + "learning_rate": 4.9640150228739454e-05, + "loss": 4.9413, + "step": 9093 + }, + { + "epoch": 0.05408459415738891, + "grad_norm": 2.0237114429473877, + "learning_rate": 4.964007125743542e-05, + "loss": 4.8808, + "step": 9094 + }, + { + "epoch": 0.0540905414406699, + "grad_norm": 1.9848511219024658, + "learning_rate": 4.963999227752977e-05, + "loss": 5.0295, + "step": 9095 + }, + { + "epoch": 0.0540964887239509, + "grad_norm": 1.925876498222351, + "learning_rate": 4.9639913289022564e-05, + "loss": 5.0129, + "step": 9096 + }, + { + "epoch": 0.0541024360072319, + "grad_norm": 1.4887725114822388, + "learning_rate": 4.963983429191382e-05, + "loss": 4.9706, + "step": 9097 + }, + { + "epoch": 0.054108383290512894, + "grad_norm": 1.615160584449768, + "learning_rate": 4.963975528620356e-05, + "loss": 5.0066, + "step": 9098 + }, + { + "epoch": 0.05411433057379389, + "grad_norm": 1.969086766242981, + "learning_rate": 4.9639676271891816e-05, + "loss": 4.9539, + "step": 9099 + }, + { + "epoch": 0.05412027785707489, + "grad_norm": 1.8290555477142334, + "learning_rate": 4.963959724897862e-05, + "loss": 5.2467, + "step": 9100 + }, + { + "epoch": 0.05412622514035589, + "grad_norm": 2.004157066345215, + "learning_rate": 4.963951821746399e-05, + "loss": 4.8, + "step": 9101 + }, + { + "epoch": 0.05413217242363688, + "grad_norm": 1.9732778072357178, + "learning_rate": 4.9639439177347955e-05, + "loss": 4.8828, + "step": 9102 + }, + { + "epoch": 0.05413811970691788, + "grad_norm": 1.8653557300567627, + "learning_rate": 4.963936012863056e-05, + "loss": 5.0591, + "step": 9103 + }, + { + "epoch": 0.05414406699019888, + "grad_norm": 1.7854375839233398, + "learning_rate": 4.9639281071311804e-05, + "loss": 5.0914, + "step": 9104 + }, + { + "epoch": 0.054150014273479874, + "grad_norm": 1.7956377267837524, + "learning_rate": 4.963920200539174e-05, + "loss": 5.3484, + "step": 9105 + }, + { + "epoch": 0.05415596155676087, + "grad_norm": 1.7851346731185913, + "learning_rate": 4.963912293087039e-05, + "loss": 5.3146, + "step": 9106 + }, + { + "epoch": 0.05416190884004187, + "grad_norm": 1.72859787940979, + "learning_rate": 4.9639043847747756e-05, + "loss": 5.1611, + "step": 9107 + }, + { + "epoch": 0.054167856123322866, + "grad_norm": 1.5961265563964844, + "learning_rate": 4.9638964756023904e-05, + "loss": 5.247, + "step": 9108 + }, + { + "epoch": 0.05417380340660386, + "grad_norm": 1.7507922649383545, + "learning_rate": 4.963888565569884e-05, + "loss": 5.2011, + "step": 9109 + }, + { + "epoch": 0.05417975068988486, + "grad_norm": 1.8338440656661987, + "learning_rate": 4.9638806546772594e-05, + "loss": 5.2413, + "step": 9110 + }, + { + "epoch": 0.05418569797316586, + "grad_norm": 1.8935306072235107, + "learning_rate": 4.963872742924519e-05, + "loss": 5.1042, + "step": 9111 + }, + { + "epoch": 0.05419164525644685, + "grad_norm": 1.6512808799743652, + "learning_rate": 4.963864830311667e-05, + "loss": 5.2437, + "step": 9112 + }, + { + "epoch": 0.054197592539727855, + "grad_norm": 1.6099332571029663, + "learning_rate": 4.963856916838705e-05, + "loss": 5.2828, + "step": 9113 + }, + { + "epoch": 0.05420353982300885, + "grad_norm": 2.114581823348999, + "learning_rate": 4.9638490025056355e-05, + "loss": 6.1534, + "step": 9114 + }, + { + "epoch": 0.054209487106289846, + "grad_norm": 1.762335181236267, + "learning_rate": 4.963841087312462e-05, + "loss": 5.1504, + "step": 9115 + }, + { + "epoch": 0.05421543438957084, + "grad_norm": 1.7669222354888916, + "learning_rate": 4.963833171259187e-05, + "loss": 5.0365, + "step": 9116 + }, + { + "epoch": 0.05422138167285184, + "grad_norm": 1.7319819927215576, + "learning_rate": 4.963825254345814e-05, + "loss": 5.0724, + "step": 9117 + }, + { + "epoch": 0.05422732895613284, + "grad_norm": 1.618116021156311, + "learning_rate": 4.9638173365723444e-05, + "loss": 5.0964, + "step": 9118 + }, + { + "epoch": 0.05423327623941383, + "grad_norm": 1.6506006717681885, + "learning_rate": 4.9638094179387814e-05, + "loss": 5.1189, + "step": 9119 + }, + { + "epoch": 0.054239223522694835, + "grad_norm": 1.7512328624725342, + "learning_rate": 4.963801498445129e-05, + "loss": 5.2732, + "step": 9120 + }, + { + "epoch": 0.05424517080597583, + "grad_norm": 1.5639985799789429, + "learning_rate": 4.963793578091388e-05, + "loss": 5.0718, + "step": 9121 + }, + { + "epoch": 0.054251118089256825, + "grad_norm": 1.7059093713760376, + "learning_rate": 4.963785656877562e-05, + "loss": 5.0744, + "step": 9122 + }, + { + "epoch": 0.05425706537253783, + "grad_norm": 1.574802279472351, + "learning_rate": 4.9637777348036546e-05, + "loss": 5.2663, + "step": 9123 + }, + { + "epoch": 0.05426301265581882, + "grad_norm": 1.7343204021453857, + "learning_rate": 4.9637698118696674e-05, + "loss": 5.0805, + "step": 9124 + }, + { + "epoch": 0.05426895993909982, + "grad_norm": 1.6154165267944336, + "learning_rate": 4.963761888075604e-05, + "loss": 5.1402, + "step": 9125 + }, + { + "epoch": 0.05427490722238082, + "grad_norm": 1.6474148035049438, + "learning_rate": 4.9637539634214666e-05, + "loss": 5.0601, + "step": 9126 + }, + { + "epoch": 0.054280854505661814, + "grad_norm": 1.7573519945144653, + "learning_rate": 4.963746037907258e-05, + "loss": 5.1846, + "step": 9127 + }, + { + "epoch": 0.05428680178894281, + "grad_norm": 1.4558652639389038, + "learning_rate": 4.963738111532981e-05, + "loss": 5.3132, + "step": 9128 + }, + { + "epoch": 0.05429274907222381, + "grad_norm": 1.6261000633239746, + "learning_rate": 4.963730184298639e-05, + "loss": 5.2843, + "step": 9129 + }, + { + "epoch": 0.05429869635550481, + "grad_norm": 1.4502191543579102, + "learning_rate": 4.963722256204234e-05, + "loss": 5.14, + "step": 9130 + }, + { + "epoch": 0.0543046436387858, + "grad_norm": 1.6366747617721558, + "learning_rate": 4.9637143272497686e-05, + "loss": 5.1496, + "step": 9131 + }, + { + "epoch": 0.0543105909220668, + "grad_norm": 1.603745698928833, + "learning_rate": 4.963706397435246e-05, + "loss": 5.0644, + "step": 9132 + }, + { + "epoch": 0.0543165382053478, + "grad_norm": 1.419536828994751, + "learning_rate": 4.963698466760669e-05, + "loss": 5.3182, + "step": 9133 + }, + { + "epoch": 0.054322485488628794, + "grad_norm": 1.511765480041504, + "learning_rate": 4.963690535226041e-05, + "loss": 5.2808, + "step": 9134 + }, + { + "epoch": 0.05432843277190979, + "grad_norm": 1.4999688863754272, + "learning_rate": 4.963682602831364e-05, + "loss": 4.9235, + "step": 9135 + }, + { + "epoch": 0.05433438005519079, + "grad_norm": 1.5918420553207397, + "learning_rate": 4.96367466957664e-05, + "loss": 4.9293, + "step": 9136 + }, + { + "epoch": 0.054340327338471786, + "grad_norm": 1.502748727798462, + "learning_rate": 4.963666735461874e-05, + "loss": 5.2692, + "step": 9137 + }, + { + "epoch": 0.05434627462175278, + "grad_norm": 1.6474169492721558, + "learning_rate": 4.963658800487066e-05, + "loss": 5.1638, + "step": 9138 + }, + { + "epoch": 0.05435222190503378, + "grad_norm": 2.0195884704589844, + "learning_rate": 4.9636508646522204e-05, + "loss": 5.1085, + "step": 9139 + }, + { + "epoch": 0.05435816918831478, + "grad_norm": 1.7266180515289307, + "learning_rate": 4.9636429279573406e-05, + "loss": 5.0747, + "step": 9140 + }, + { + "epoch": 0.05436411647159577, + "grad_norm": 1.6965065002441406, + "learning_rate": 4.963634990402428e-05, + "loss": 5.1246, + "step": 9141 + }, + { + "epoch": 0.054370063754876775, + "grad_norm": 1.7629759311676025, + "learning_rate": 4.9636270519874856e-05, + "loss": 5.274, + "step": 9142 + }, + { + "epoch": 0.05437601103815777, + "grad_norm": 1.6365042924880981, + "learning_rate": 4.9636191127125164e-05, + "loss": 5.2469, + "step": 9143 + }, + { + "epoch": 0.054381958321438766, + "grad_norm": 1.6777831315994263, + "learning_rate": 4.9636111725775235e-05, + "loss": 5.3041, + "step": 9144 + }, + { + "epoch": 0.05438790560471976, + "grad_norm": 1.5354039669036865, + "learning_rate": 4.9636032315825096e-05, + "loss": 5.1799, + "step": 9145 + }, + { + "epoch": 0.05439385288800076, + "grad_norm": 1.508083701133728, + "learning_rate": 4.9635952897274773e-05, + "loss": 5.0822, + "step": 9146 + }, + { + "epoch": 0.05439980017128176, + "grad_norm": 1.5960441827774048, + "learning_rate": 4.963587347012429e-05, + "loss": 5.1618, + "step": 9147 + }, + { + "epoch": 0.05440574745456275, + "grad_norm": 1.4927520751953125, + "learning_rate": 4.9635794034373675e-05, + "loss": 5.1464, + "step": 9148 + }, + { + "epoch": 0.054411694737843755, + "grad_norm": 1.7420401573181152, + "learning_rate": 4.9635714590022966e-05, + "loss": 5.2866, + "step": 9149 + }, + { + "epoch": 0.05441764202112475, + "grad_norm": 1.7907800674438477, + "learning_rate": 4.9635635137072176e-05, + "loss": 5.1042, + "step": 9150 + }, + { + "epoch": 0.054423589304405745, + "grad_norm": 1.7073547840118408, + "learning_rate": 4.963555567552135e-05, + "loss": 5.1986, + "step": 9151 + }, + { + "epoch": 0.05442953658768675, + "grad_norm": 1.894405484199524, + "learning_rate": 4.96354762053705e-05, + "loss": 5.225, + "step": 9152 + }, + { + "epoch": 0.05443548387096774, + "grad_norm": 1.5830878019332886, + "learning_rate": 4.9635396726619656e-05, + "loss": 5.2902, + "step": 9153 + }, + { + "epoch": 0.05444143115424874, + "grad_norm": 1.5435214042663574, + "learning_rate": 4.963531723926885e-05, + "loss": 5.0773, + "step": 9154 + }, + { + "epoch": 0.05444737843752974, + "grad_norm": 1.4262596368789673, + "learning_rate": 4.9635237743318117e-05, + "loss": 5.129, + "step": 9155 + }, + { + "epoch": 0.054453325720810734, + "grad_norm": 1.5793390274047852, + "learning_rate": 4.9635158238767475e-05, + "loss": 5.1693, + "step": 9156 + }, + { + "epoch": 0.05445927300409173, + "grad_norm": 1.767318606376648, + "learning_rate": 4.963507872561695e-05, + "loss": 5.2541, + "step": 9157 + }, + { + "epoch": 0.05446522028737273, + "grad_norm": 1.5084065198898315, + "learning_rate": 4.963499920386658e-05, + "loss": 5.2531, + "step": 9158 + }, + { + "epoch": 0.05447116757065373, + "grad_norm": 1.797877311706543, + "learning_rate": 4.963491967351638e-05, + "loss": 5.2278, + "step": 9159 + }, + { + "epoch": 0.05447711485393472, + "grad_norm": 1.7463361024856567, + "learning_rate": 4.963484013456639e-05, + "loss": 5.1005, + "step": 9160 + }, + { + "epoch": 0.05448306213721572, + "grad_norm": 1.8208277225494385, + "learning_rate": 4.9634760587016626e-05, + "loss": 5.1437, + "step": 9161 + }, + { + "epoch": 0.05448900942049672, + "grad_norm": 1.9020015001296997, + "learning_rate": 4.9634681030867116e-05, + "loss": 5.1554, + "step": 9162 + }, + { + "epoch": 0.054494956703777714, + "grad_norm": 1.8370200395584106, + "learning_rate": 4.9634601466117904e-05, + "loss": 5.2418, + "step": 9163 + }, + { + "epoch": 0.05450090398705871, + "grad_norm": 1.785875678062439, + "learning_rate": 4.9634521892769004e-05, + "loss": 5.1916, + "step": 9164 + }, + { + "epoch": 0.05450685127033971, + "grad_norm": 1.7501643896102905, + "learning_rate": 4.963444231082045e-05, + "loss": 5.0887, + "step": 9165 + }, + { + "epoch": 0.054512798553620706, + "grad_norm": 1.6924220323562622, + "learning_rate": 4.963436272027227e-05, + "loss": 5.2458, + "step": 9166 + }, + { + "epoch": 0.0545187458369017, + "grad_norm": 1.895605206489563, + "learning_rate": 4.963428312112447e-05, + "loss": 5.1286, + "step": 9167 + }, + { + "epoch": 0.0545246931201827, + "grad_norm": 1.842207908630371, + "learning_rate": 4.963420351337711e-05, + "loss": 5.1177, + "step": 9168 + }, + { + "epoch": 0.0545306404034637, + "grad_norm": 1.7467048168182373, + "learning_rate": 4.963412389703021e-05, + "loss": 5.1616, + "step": 9169 + }, + { + "epoch": 0.05453658768674469, + "grad_norm": 1.8047499656677246, + "learning_rate": 4.963404427208378e-05, + "loss": 5.0543, + "step": 9170 + }, + { + "epoch": 0.054542534970025695, + "grad_norm": 1.5830637216567993, + "learning_rate": 4.963396463853786e-05, + "loss": 5.0989, + "step": 9171 + }, + { + "epoch": 0.05454848225330669, + "grad_norm": 1.7481937408447266, + "learning_rate": 4.9633884996392485e-05, + "loss": 5.1686, + "step": 9172 + }, + { + "epoch": 0.054554429536587686, + "grad_norm": 1.7132925987243652, + "learning_rate": 4.9633805345647664e-05, + "loss": 4.9683, + "step": 9173 + }, + { + "epoch": 0.05456037681986868, + "grad_norm": 1.8369117975234985, + "learning_rate": 4.9633725686303445e-05, + "loss": 5.154, + "step": 9174 + }, + { + "epoch": 0.05456632410314968, + "grad_norm": 1.615011215209961, + "learning_rate": 4.963364601835985e-05, + "loss": 5.0982, + "step": 9175 + }, + { + "epoch": 0.05457227138643068, + "grad_norm": 1.853742003440857, + "learning_rate": 4.963356634181689e-05, + "loss": 6.0599, + "step": 9176 + }, + { + "epoch": 0.05457821866971167, + "grad_norm": 1.5529752969741821, + "learning_rate": 4.963348665667462e-05, + "loss": 5.1355, + "step": 9177 + }, + { + "epoch": 0.054584165952992675, + "grad_norm": 1.5113881826400757, + "learning_rate": 4.963340696293305e-05, + "loss": 5.1947, + "step": 9178 + }, + { + "epoch": 0.05459011323627367, + "grad_norm": 1.6840931177139282, + "learning_rate": 4.963332726059221e-05, + "loss": 5.2163, + "step": 9179 + }, + { + "epoch": 0.054596060519554665, + "grad_norm": 1.7720422744750977, + "learning_rate": 4.963324754965214e-05, + "loss": 5.4737, + "step": 9180 + }, + { + "epoch": 0.05460200780283567, + "grad_norm": 1.632574200630188, + "learning_rate": 4.963316783011285e-05, + "loss": 5.2274, + "step": 9181 + }, + { + "epoch": 0.05460795508611666, + "grad_norm": 1.5859557390213013, + "learning_rate": 4.963308810197437e-05, + "loss": 5.3503, + "step": 9182 + }, + { + "epoch": 0.05461390236939766, + "grad_norm": 1.8342604637145996, + "learning_rate": 4.963300836523674e-05, + "loss": 5.1967, + "step": 9183 + }, + { + "epoch": 0.05461984965267866, + "grad_norm": 1.7443957328796387, + "learning_rate": 4.963292861989998e-05, + "loss": 5.0935, + "step": 9184 + }, + { + "epoch": 0.054625796935959654, + "grad_norm": 1.9289584159851074, + "learning_rate": 4.963284886596412e-05, + "loss": 5.1817, + "step": 9185 + }, + { + "epoch": 0.05463174421924065, + "grad_norm": 1.8695822954177856, + "learning_rate": 4.9632769103429186e-05, + "loss": 5.4304, + "step": 9186 + }, + { + "epoch": 0.05463769150252165, + "grad_norm": 1.6274856328964233, + "learning_rate": 4.9632689332295206e-05, + "loss": 5.3924, + "step": 9187 + }, + { + "epoch": 0.054643638785802646, + "grad_norm": 1.6061500310897827, + "learning_rate": 4.963260955256221e-05, + "loss": 5.2309, + "step": 9188 + }, + { + "epoch": 0.05464958606908364, + "grad_norm": 1.5478893518447876, + "learning_rate": 4.963252976423022e-05, + "loss": 5.2615, + "step": 9189 + }, + { + "epoch": 0.05465553335236464, + "grad_norm": 1.4304052591323853, + "learning_rate": 4.9632449967299276e-05, + "loss": 5.2116, + "step": 9190 + }, + { + "epoch": 0.05466148063564564, + "grad_norm": 1.5438693761825562, + "learning_rate": 4.9632370161769395e-05, + "loss": 5.1176, + "step": 9191 + }, + { + "epoch": 0.054667427918926634, + "grad_norm": 1.6602065563201904, + "learning_rate": 4.9632290347640606e-05, + "loss": 5.1521, + "step": 9192 + }, + { + "epoch": 0.05467337520220763, + "grad_norm": 1.530038595199585, + "learning_rate": 4.9632210524912934e-05, + "loss": 5.1437, + "step": 9193 + }, + { + "epoch": 0.05467932248548863, + "grad_norm": 1.617691159248352, + "learning_rate": 4.963213069358643e-05, + "loss": 5.0601, + "step": 9194 + }, + { + "epoch": 0.054685269768769626, + "grad_norm": 1.722401738166809, + "learning_rate": 4.963205085366108e-05, + "loss": 5.2664, + "step": 9195 + }, + { + "epoch": 0.05469121705205062, + "grad_norm": 1.803673267364502, + "learning_rate": 4.963197100513696e-05, + "loss": 5.4164, + "step": 9196 + }, + { + "epoch": 0.05469716433533162, + "grad_norm": 1.8565739393234253, + "learning_rate": 4.963189114801405e-05, + "loss": 5.225, + "step": 9197 + }, + { + "epoch": 0.05470311161861262, + "grad_norm": 1.780698299407959, + "learning_rate": 4.963181128229242e-05, + "loss": 5.1694, + "step": 9198 + }, + { + "epoch": 0.05470905890189361, + "grad_norm": 1.820416808128357, + "learning_rate": 4.963173140797207e-05, + "loss": 5.3305, + "step": 9199 + }, + { + "epoch": 0.054715006185174615, + "grad_norm": 1.471983551979065, + "learning_rate": 4.963165152505304e-05, + "loss": 5.3217, + "step": 9200 + }, + { + "epoch": 0.05472095346845561, + "grad_norm": 1.504616141319275, + "learning_rate": 4.9631571633535354e-05, + "loss": 5.3349, + "step": 9201 + }, + { + "epoch": 0.054726900751736605, + "grad_norm": 1.5888862609863281, + "learning_rate": 4.963149173341903e-05, + "loss": 5.3431, + "step": 9202 + }, + { + "epoch": 0.0547328480350176, + "grad_norm": 1.6633155345916748, + "learning_rate": 4.963141182470412e-05, + "loss": 5.2678, + "step": 9203 + }, + { + "epoch": 0.0547387953182986, + "grad_norm": 1.7259690761566162, + "learning_rate": 4.9631331907390636e-05, + "loss": 5.348, + "step": 9204 + }, + { + "epoch": 0.0547447426015796, + "grad_norm": 1.703925371170044, + "learning_rate": 4.963125198147861e-05, + "loss": 5.4123, + "step": 9205 + }, + { + "epoch": 0.05475068988486059, + "grad_norm": 1.6619760990142822, + "learning_rate": 4.963117204696807e-05, + "loss": 5.1732, + "step": 9206 + }, + { + "epoch": 0.054756637168141595, + "grad_norm": 1.7368190288543701, + "learning_rate": 4.963109210385903e-05, + "loss": 5.0843, + "step": 9207 + }, + { + "epoch": 0.05476258445142259, + "grad_norm": 1.781179666519165, + "learning_rate": 4.9631012152151545e-05, + "loss": 5.1343, + "step": 9208 + }, + { + "epoch": 0.054768531734703585, + "grad_norm": 1.674793004989624, + "learning_rate": 4.9630932191845624e-05, + "loss": 5.4079, + "step": 9209 + }, + { + "epoch": 0.05477447901798459, + "grad_norm": 1.7708344459533691, + "learning_rate": 4.9630852222941296e-05, + "loss": 4.9702, + "step": 9210 + }, + { + "epoch": 0.05478042630126558, + "grad_norm": 1.684725046157837, + "learning_rate": 4.9630772245438594e-05, + "loss": 5.263, + "step": 9211 + }, + { + "epoch": 0.05478637358454658, + "grad_norm": 1.6064784526824951, + "learning_rate": 4.963069225933754e-05, + "loss": 5.3402, + "step": 9212 + }, + { + "epoch": 0.05479232086782758, + "grad_norm": 1.5189318656921387, + "learning_rate": 4.963061226463816e-05, + "loss": 5.1928, + "step": 9213 + }, + { + "epoch": 0.054798268151108574, + "grad_norm": 1.8095827102661133, + "learning_rate": 4.96305322613405e-05, + "loss": 5.262, + "step": 9214 + }, + { + "epoch": 0.05480421543438957, + "grad_norm": 1.8325434923171997, + "learning_rate": 4.963045224944458e-05, + "loss": 5.4975, + "step": 9215 + }, + { + "epoch": 0.05481016271767057, + "grad_norm": 1.6597868204116821, + "learning_rate": 4.963037222895042e-05, + "loss": 5.6232, + "step": 9216 + }, + { + "epoch": 0.054816110000951566, + "grad_norm": 1.6402417421340942, + "learning_rate": 4.9630292199858044e-05, + "loss": 5.5358, + "step": 9217 + }, + { + "epoch": 0.05482205728423256, + "grad_norm": 1.3956371545791626, + "learning_rate": 4.963021216216749e-05, + "loss": 5.2563, + "step": 9218 + }, + { + "epoch": 0.05482800456751356, + "grad_norm": 1.5958374738693237, + "learning_rate": 4.963013211587878e-05, + "loss": 5.1539, + "step": 9219 + }, + { + "epoch": 0.05483395185079456, + "grad_norm": 1.6152080297470093, + "learning_rate": 4.963005206099195e-05, + "loss": 5.4025, + "step": 9220 + }, + { + "epoch": 0.054839899134075554, + "grad_norm": 1.392427921295166, + "learning_rate": 4.962997199750702e-05, + "loss": 5.4149, + "step": 9221 + }, + { + "epoch": 0.05484584641735655, + "grad_norm": 1.5625338554382324, + "learning_rate": 4.962989192542403e-05, + "loss": 5.5837, + "step": 9222 + }, + { + "epoch": 0.05485179370063755, + "grad_norm": 1.6465163230895996, + "learning_rate": 4.962981184474299e-05, + "loss": 5.2934, + "step": 9223 + }, + { + "epoch": 0.054857740983918546, + "grad_norm": 1.5344611406326294, + "learning_rate": 4.962973175546394e-05, + "loss": 5.4734, + "step": 9224 + }, + { + "epoch": 0.05486368826719954, + "grad_norm": 1.2378648519515991, + "learning_rate": 4.962965165758691e-05, + "loss": 5.3368, + "step": 9225 + }, + { + "epoch": 0.05486963555048054, + "grad_norm": 1.396785020828247, + "learning_rate": 4.9629571551111915e-05, + "loss": 5.3163, + "step": 9226 + }, + { + "epoch": 0.05487558283376154, + "grad_norm": 1.639452338218689, + "learning_rate": 4.9629491436038994e-05, + "loss": 5.3933, + "step": 9227 + }, + { + "epoch": 0.05488153011704253, + "grad_norm": 1.5648834705352783, + "learning_rate": 4.9629411312368166e-05, + "loss": 5.3717, + "step": 9228 + }, + { + "epoch": 0.054887477400323535, + "grad_norm": 1.4774922132492065, + "learning_rate": 4.962933118009947e-05, + "loss": 5.1318, + "step": 9229 + }, + { + "epoch": 0.05489342468360453, + "grad_norm": 1.4987083673477173, + "learning_rate": 4.9629251039232935e-05, + "loss": 5.1436, + "step": 9230 + }, + { + "epoch": 0.054899371966885525, + "grad_norm": 1.660605788230896, + "learning_rate": 4.9629170889768586e-05, + "loss": 5.1841, + "step": 9231 + }, + { + "epoch": 0.05490531925016652, + "grad_norm": 1.4441273212432861, + "learning_rate": 4.962909073170643e-05, + "loss": 5.3108, + "step": 9232 + }, + { + "epoch": 0.05491126653344752, + "grad_norm": 1.3297922611236572, + "learning_rate": 4.962901056504653e-05, + "loss": 5.1441, + "step": 9233 + }, + { + "epoch": 0.05491721381672852, + "grad_norm": 1.2989814281463623, + "learning_rate": 4.9628930389788886e-05, + "loss": 5.5146, + "step": 9234 + }, + { + "epoch": 0.05492316110000951, + "grad_norm": 1.350948452949524, + "learning_rate": 4.962885020593354e-05, + "loss": 5.2832, + "step": 9235 + }, + { + "epoch": 0.054929108383290515, + "grad_norm": 1.5801438093185425, + "learning_rate": 4.962877001348052e-05, + "loss": 5.4251, + "step": 9236 + }, + { + "epoch": 0.05493505566657151, + "grad_norm": 1.4355653524398804, + "learning_rate": 4.9628689812429854e-05, + "loss": 5.4092, + "step": 9237 + }, + { + "epoch": 0.054941002949852505, + "grad_norm": 1.692746639251709, + "learning_rate": 4.962860960278156e-05, + "loss": 5.3858, + "step": 9238 + }, + { + "epoch": 0.05494695023313351, + "grad_norm": 1.5125641822814941, + "learning_rate": 4.962852938453567e-05, + "loss": 5.6584, + "step": 9239 + }, + { + "epoch": 0.0549528975164145, + "grad_norm": 1.4158848524093628, + "learning_rate": 4.962844915769221e-05, + "loss": 5.652, + "step": 9240 + }, + { + "epoch": 0.0549588447996955, + "grad_norm": 1.314286231994629, + "learning_rate": 4.9628368922251235e-05, + "loss": 5.501, + "step": 9241 + }, + { + "epoch": 0.0549647920829765, + "grad_norm": 1.4003247022628784, + "learning_rate": 4.962828867821273e-05, + "loss": 5.448, + "step": 9242 + }, + { + "epoch": 0.054970739366257494, + "grad_norm": 1.7670220136642456, + "learning_rate": 4.962820842557675e-05, + "loss": 5.4854, + "step": 9243 + }, + { + "epoch": 0.05497668664953849, + "grad_norm": 1.9435075521469116, + "learning_rate": 4.962812816434332e-05, + "loss": 5.3824, + "step": 9244 + }, + { + "epoch": 0.05498263393281949, + "grad_norm": 2.1733458042144775, + "learning_rate": 4.9628047894512466e-05, + "loss": 5.6771, + "step": 9245 + }, + { + "epoch": 0.054988581216100486, + "grad_norm": 1.5455420017242432, + "learning_rate": 4.962796761608421e-05, + "loss": 5.4634, + "step": 9246 + }, + { + "epoch": 0.05499452849938148, + "grad_norm": 1.623382806777954, + "learning_rate": 4.962788732905859e-05, + "loss": 5.8441, + "step": 9247 + }, + { + "epoch": 0.05500047578266248, + "grad_norm": 1.928788423538208, + "learning_rate": 4.962780703343563e-05, + "loss": 5.6553, + "step": 9248 + }, + { + "epoch": 0.05500642306594348, + "grad_norm": 1.660984992980957, + "learning_rate": 4.962772672921535e-05, + "loss": 5.5953, + "step": 9249 + }, + { + "epoch": 0.055012370349224474, + "grad_norm": 2.081026792526245, + "learning_rate": 4.962764641639779e-05, + "loss": 5.7065, + "step": 9250 + }, + { + "epoch": 0.05501831763250547, + "grad_norm": 1.8750234842300415, + "learning_rate": 4.962756609498297e-05, + "loss": 5.8814, + "step": 9251 + }, + { + "epoch": 0.05502426491578647, + "grad_norm": 1.9573127031326294, + "learning_rate": 4.9627485764970916e-05, + "loss": 5.7415, + "step": 9252 + }, + { + "epoch": 0.055030212199067466, + "grad_norm": 1.7536600828170776, + "learning_rate": 4.962740542636167e-05, + "loss": 5.5638, + "step": 9253 + }, + { + "epoch": 0.05503615948234846, + "grad_norm": 1.692557692527771, + "learning_rate": 4.962732507915525e-05, + "loss": 5.5362, + "step": 9254 + }, + { + "epoch": 0.05504210676562946, + "grad_norm": 1.9066821336746216, + "learning_rate": 4.962724472335168e-05, + "loss": 5.3094, + "step": 9255 + }, + { + "epoch": 0.05504805404891046, + "grad_norm": 2.069007158279419, + "learning_rate": 4.9627164358951e-05, + "loss": 5.766, + "step": 9256 + }, + { + "epoch": 0.05505400133219145, + "grad_norm": 2.0293545722961426, + "learning_rate": 4.9627083985953227e-05, + "loss": 5.7769, + "step": 9257 + }, + { + "epoch": 0.055059948615472455, + "grad_norm": 1.7953507900238037, + "learning_rate": 4.962700360435839e-05, + "loss": 5.8435, + "step": 9258 + }, + { + "epoch": 0.05506589589875345, + "grad_norm": 1.9281821250915527, + "learning_rate": 4.9626923214166535e-05, + "loss": 5.8342, + "step": 9259 + }, + { + "epoch": 0.055071843182034445, + "grad_norm": 1.4612617492675781, + "learning_rate": 4.962684281537766e-05, + "loss": 5.8273, + "step": 9260 + }, + { + "epoch": 0.05507779046531545, + "grad_norm": 1.8589900732040405, + "learning_rate": 4.9626762407991817e-05, + "loss": 5.7607, + "step": 9261 + }, + { + "epoch": 0.05508373774859644, + "grad_norm": 1.9395030736923218, + "learning_rate": 4.9626681992009025e-05, + "loss": 5.7573, + "step": 9262 + }, + { + "epoch": 0.05508968503187744, + "grad_norm": 1.7344708442687988, + "learning_rate": 4.962660156742931e-05, + "loss": 5.7999, + "step": 9263 + }, + { + "epoch": 0.05509563231515843, + "grad_norm": 1.7719827890396118, + "learning_rate": 4.9626521134252704e-05, + "loss": 5.7882, + "step": 9264 + }, + { + "epoch": 0.055101579598439435, + "grad_norm": 1.4955536127090454, + "learning_rate": 4.9626440692479236e-05, + "loss": 5.639, + "step": 9265 + }, + { + "epoch": 0.05510752688172043, + "grad_norm": 2.0087990760803223, + "learning_rate": 4.9626360242108925e-05, + "loss": 5.841, + "step": 9266 + }, + { + "epoch": 0.055113474165001425, + "grad_norm": 1.7334564924240112, + "learning_rate": 4.962627978314181e-05, + "loss": 5.4267, + "step": 9267 + }, + { + "epoch": 0.05511942144828243, + "grad_norm": 2.1204535961151123, + "learning_rate": 4.962619931557792e-05, + "loss": 5.4451, + "step": 9268 + }, + { + "epoch": 0.05512536873156342, + "grad_norm": 2.2374279499053955, + "learning_rate": 4.962611883941727e-05, + "loss": 5.5095, + "step": 9269 + }, + { + "epoch": 0.05513131601484442, + "grad_norm": 1.735070824623108, + "learning_rate": 4.9626038354659904e-05, + "loss": 5.3609, + "step": 9270 + }, + { + "epoch": 0.05513726329812542, + "grad_norm": 1.9748501777648926, + "learning_rate": 4.9625957861305837e-05, + "loss": 5.3366, + "step": 9271 + }, + { + "epoch": 0.055143210581406414, + "grad_norm": 1.8736618757247925, + "learning_rate": 4.96258773593551e-05, + "loss": 5.4706, + "step": 9272 + }, + { + "epoch": 0.05514915786468741, + "grad_norm": 2.571755886077881, + "learning_rate": 4.9625796848807736e-05, + "loss": 5.0393, + "step": 9273 + }, + { + "epoch": 0.05515510514796841, + "grad_norm": 2.1467013359069824, + "learning_rate": 4.962571632966375e-05, + "loss": 5.5798, + "step": 9274 + }, + { + "epoch": 0.055161052431249406, + "grad_norm": 2.4553916454315186, + "learning_rate": 4.962563580192319e-05, + "loss": 5.4323, + "step": 9275 + }, + { + "epoch": 0.0551669997145304, + "grad_norm": 2.4478797912597656, + "learning_rate": 4.962555526558607e-05, + "loss": 5.2591, + "step": 9276 + }, + { + "epoch": 0.055172946997811396, + "grad_norm": 2.2164270877838135, + "learning_rate": 4.9625474720652416e-05, + "loss": 5.3404, + "step": 9277 + }, + { + "epoch": 0.0551788942810924, + "grad_norm": 1.9161698818206787, + "learning_rate": 4.962539416712227e-05, + "loss": 5.2591, + "step": 9278 + }, + { + "epoch": 0.055184841564373394, + "grad_norm": 2.348734140396118, + "learning_rate": 4.962531360499565e-05, + "loss": 5.8132, + "step": 9279 + }, + { + "epoch": 0.05519078884765439, + "grad_norm": 2.400090456008911, + "learning_rate": 4.962523303427259e-05, + "loss": 5.7786, + "step": 9280 + }, + { + "epoch": 0.05519673613093539, + "grad_norm": 2.1626594066619873, + "learning_rate": 4.9625152454953115e-05, + "loss": 5.8488, + "step": 9281 + }, + { + "epoch": 0.055202683414216386, + "grad_norm": 1.7470853328704834, + "learning_rate": 4.962507186703725e-05, + "loss": 5.72, + "step": 9282 + }, + { + "epoch": 0.05520863069749738, + "grad_norm": 1.9191921949386597, + "learning_rate": 4.962499127052503e-05, + "loss": 5.6321, + "step": 9283 + }, + { + "epoch": 0.05521457798077838, + "grad_norm": 2.1550769805908203, + "learning_rate": 4.962491066541649e-05, + "loss": 5.4521, + "step": 9284 + }, + { + "epoch": 0.05522052526405938, + "grad_norm": 2.0529074668884277, + "learning_rate": 4.9624830051711634e-05, + "loss": 5.4108, + "step": 9285 + }, + { + "epoch": 0.05522647254734037, + "grad_norm": 1.7673834562301636, + "learning_rate": 4.962474942941051e-05, + "loss": 5.5955, + "step": 9286 + }, + { + "epoch": 0.055232419830621375, + "grad_norm": 1.9575849771499634, + "learning_rate": 4.9624668798513143e-05, + "loss": 5.6295, + "step": 9287 + }, + { + "epoch": 0.05523836711390237, + "grad_norm": 1.8054029941558838, + "learning_rate": 4.9624588159019546e-05, + "loss": 5.3372, + "step": 9288 + }, + { + "epoch": 0.055244314397183365, + "grad_norm": 1.8002424240112305, + "learning_rate": 4.962450751092978e-05, + "loss": 5.4404, + "step": 9289 + }, + { + "epoch": 0.05525026168046437, + "grad_norm": 2.052530527114868, + "learning_rate": 4.962442685424383e-05, + "loss": 5.4921, + "step": 9290 + }, + { + "epoch": 0.05525620896374536, + "grad_norm": 1.8559443950653076, + "learning_rate": 4.962434618896176e-05, + "loss": 5.5776, + "step": 9291 + }, + { + "epoch": 0.05526215624702636, + "grad_norm": 1.8794355392456055, + "learning_rate": 4.962426551508359e-05, + "loss": 5.5818, + "step": 9292 + }, + { + "epoch": 0.05526810353030735, + "grad_norm": 1.8995412588119507, + "learning_rate": 4.962418483260933e-05, + "loss": 5.6274, + "step": 9293 + }, + { + "epoch": 0.055274050813588355, + "grad_norm": 1.8608371019363403, + "learning_rate": 4.962410414153903e-05, + "loss": 5.4655, + "step": 9294 + }, + { + "epoch": 0.05527999809686935, + "grad_norm": 2.0378072261810303, + "learning_rate": 4.9624023441872715e-05, + "loss": 5.5579, + "step": 9295 + }, + { + "epoch": 0.055285945380150345, + "grad_norm": 2.2037017345428467, + "learning_rate": 4.9623942733610397e-05, + "loss": 5.6663, + "step": 9296 + }, + { + "epoch": 0.05529189266343135, + "grad_norm": 2.4487335681915283, + "learning_rate": 4.962386201675212e-05, + "loss": 5.6792, + "step": 9297 + }, + { + "epoch": 0.05529783994671234, + "grad_norm": 2.0460383892059326, + "learning_rate": 4.96237812912979e-05, + "loss": 5.917, + "step": 9298 + }, + { + "epoch": 0.05530378722999334, + "grad_norm": 2.4838030338287354, + "learning_rate": 4.962370055724778e-05, + "loss": 5.1067, + "step": 9299 + }, + { + "epoch": 0.05530973451327434, + "grad_norm": 1.9340513944625854, + "learning_rate": 4.962361981460178e-05, + "loss": 5.2529, + "step": 9300 + }, + { + "epoch": 0.055315681796555334, + "grad_norm": 2.201068878173828, + "learning_rate": 4.9623539063359925e-05, + "loss": 5.6055, + "step": 9301 + }, + { + "epoch": 0.05532162907983633, + "grad_norm": 2.0552330017089844, + "learning_rate": 4.962345830352225e-05, + "loss": 5.3531, + "step": 9302 + }, + { + "epoch": 0.05532757636311733, + "grad_norm": 2.611407995223999, + "learning_rate": 4.9623377535088785e-05, + "loss": 5.5829, + "step": 9303 + }, + { + "epoch": 0.055333523646398326, + "grad_norm": 2.2239346504211426, + "learning_rate": 4.962329675805955e-05, + "loss": 5.3558, + "step": 9304 + }, + { + "epoch": 0.05533947092967932, + "grad_norm": 2.3899872303009033, + "learning_rate": 4.9623215972434566e-05, + "loss": 5.7277, + "step": 9305 + }, + { + "epoch": 0.055345418212960316, + "grad_norm": 2.8471267223358154, + "learning_rate": 4.962313517821389e-05, + "loss": 6.1046, + "step": 9306 + }, + { + "epoch": 0.05535136549624132, + "grad_norm": 2.426400661468506, + "learning_rate": 4.962305437539752e-05, + "loss": 5.8942, + "step": 9307 + }, + { + "epoch": 0.055357312779522314, + "grad_norm": 2.3548812866210938, + "learning_rate": 4.962297356398549e-05, + "loss": 6.0552, + "step": 9308 + }, + { + "epoch": 0.05536326006280331, + "grad_norm": 1.8423515558242798, + "learning_rate": 4.9622892743977844e-05, + "loss": 5.9377, + "step": 9309 + }, + { + "epoch": 0.05536920734608431, + "grad_norm": 2.1509203910827637, + "learning_rate": 4.96228119153746e-05, + "loss": 5.7195, + "step": 9310 + }, + { + "epoch": 0.055375154629365306, + "grad_norm": 2.3096275329589844, + "learning_rate": 4.962273107817579e-05, + "loss": 5.3461, + "step": 9311 + }, + { + "epoch": 0.0553811019126463, + "grad_norm": 1.980205774307251, + "learning_rate": 4.962265023238143e-05, + "loss": 5.8851, + "step": 9312 + }, + { + "epoch": 0.0553870491959273, + "grad_norm": 1.8162591457366943, + "learning_rate": 4.962256937799156e-05, + "loss": 5.7092, + "step": 9313 + }, + { + "epoch": 0.0553929964792083, + "grad_norm": 1.873853087425232, + "learning_rate": 4.962248851500621e-05, + "loss": 5.8939, + "step": 9314 + }, + { + "epoch": 0.05539894376248929, + "grad_norm": 1.8039345741271973, + "learning_rate": 4.96224076434254e-05, + "loss": 5.9289, + "step": 9315 + }, + { + "epoch": 0.055404891045770295, + "grad_norm": 2.3106470108032227, + "learning_rate": 4.962232676324916e-05, + "loss": 5.9103, + "step": 9316 + }, + { + "epoch": 0.05541083832905129, + "grad_norm": 2.2209455966949463, + "learning_rate": 4.962224587447752e-05, + "loss": 6.0053, + "step": 9317 + }, + { + "epoch": 0.055416785612332285, + "grad_norm": 2.0624780654907227, + "learning_rate": 4.962216497711052e-05, + "loss": 5.9258, + "step": 9318 + }, + { + "epoch": 0.05542273289561329, + "grad_norm": 2.371662139892578, + "learning_rate": 4.962208407114817e-05, + "loss": 6.4127, + "step": 9319 + }, + { + "epoch": 0.05542868017889428, + "grad_norm": 2.7035610675811768, + "learning_rate": 4.96220031565905e-05, + "loss": 5.9742, + "step": 9320 + }, + { + "epoch": 0.05543462746217528, + "grad_norm": 2.060577392578125, + "learning_rate": 4.9621922233437544e-05, + "loss": 5.9729, + "step": 9321 + }, + { + "epoch": 0.05544057474545627, + "grad_norm": 1.7935984134674072, + "learning_rate": 4.962184130168933e-05, + "loss": 5.4077, + "step": 9322 + }, + { + "epoch": 0.055446522028737275, + "grad_norm": 1.8716622591018677, + "learning_rate": 4.9621760361345885e-05, + "loss": 5.4554, + "step": 9323 + }, + { + "epoch": 0.05545246931201827, + "grad_norm": 1.9150923490524292, + "learning_rate": 4.962167941240724e-05, + "loss": 5.8121, + "step": 9324 + }, + { + "epoch": 0.055458416595299265, + "grad_norm": 1.9207059144973755, + "learning_rate": 4.962159845487342e-05, + "loss": 5.8593, + "step": 9325 + }, + { + "epoch": 0.05546436387858027, + "grad_norm": 1.962039589881897, + "learning_rate": 4.9621517488744454e-05, + "loss": 6.0174, + "step": 9326 + }, + { + "epoch": 0.05547031116186126, + "grad_norm": 2.0445704460144043, + "learning_rate": 4.9621436514020376e-05, + "loss": 5.5782, + "step": 9327 + }, + { + "epoch": 0.05547625844514226, + "grad_norm": 2.0861823558807373, + "learning_rate": 4.9621355530701204e-05, + "loss": 5.6102, + "step": 9328 + }, + { + "epoch": 0.05548220572842326, + "grad_norm": 2.0184309482574463, + "learning_rate": 4.962127453878697e-05, + "loss": 5.8072, + "step": 9329 + }, + { + "epoch": 0.055488153011704254, + "grad_norm": 1.899994134902954, + "learning_rate": 4.962119353827771e-05, + "loss": 5.7361, + "step": 9330 + }, + { + "epoch": 0.05549410029498525, + "grad_norm": 1.8874105215072632, + "learning_rate": 4.962111252917344e-05, + "loss": 5.7988, + "step": 9331 + }, + { + "epoch": 0.05550004757826625, + "grad_norm": 2.046682119369507, + "learning_rate": 4.9621031511474194e-05, + "loss": 5.7037, + "step": 9332 + }, + { + "epoch": 0.055505994861547246, + "grad_norm": 2.2552926540374756, + "learning_rate": 4.962095048517999e-05, + "loss": 5.7556, + "step": 9333 + }, + { + "epoch": 0.05551194214482824, + "grad_norm": 2.1904358863830566, + "learning_rate": 4.962086945029089e-05, + "loss": 5.6529, + "step": 9334 + }, + { + "epoch": 0.055517889428109236, + "grad_norm": 2.03745698928833, + "learning_rate": 4.9620788406806883e-05, + "loss": 5.8504, + "step": 9335 + }, + { + "epoch": 0.05552383671139024, + "grad_norm": 1.81668221950531, + "learning_rate": 4.9620707354728017e-05, + "loss": 5.3275, + "step": 9336 + }, + { + "epoch": 0.055529783994671233, + "grad_norm": 2.570976734161377, + "learning_rate": 4.962062629405432e-05, + "loss": 5.666, + "step": 9337 + }, + { + "epoch": 0.05553573127795223, + "grad_norm": 2.6855766773223877, + "learning_rate": 4.962054522478581e-05, + "loss": 5.7798, + "step": 9338 + }, + { + "epoch": 0.05554167856123323, + "grad_norm": 2.329690933227539, + "learning_rate": 4.962046414692252e-05, + "loss": 5.9334, + "step": 9339 + }, + { + "epoch": 0.055547625844514226, + "grad_norm": 1.6809495687484741, + "learning_rate": 4.962038306046449e-05, + "loss": 5.8506, + "step": 9340 + }, + { + "epoch": 0.05555357312779522, + "grad_norm": 1.7170113325119019, + "learning_rate": 4.962030196541173e-05, + "loss": 6.0863, + "step": 9341 + }, + { + "epoch": 0.05555952041107622, + "grad_norm": 2.247680902481079, + "learning_rate": 4.962022086176428e-05, + "loss": 5.2188, + "step": 9342 + }, + { + "epoch": 0.05556546769435722, + "grad_norm": 2.680091381072998, + "learning_rate": 4.9620139749522165e-05, + "loss": 4.8506, + "step": 9343 + }, + { + "epoch": 0.05557141497763821, + "grad_norm": 2.1886465549468994, + "learning_rate": 4.962005862868542e-05, + "loss": 5.5164, + "step": 9344 + }, + { + "epoch": 0.055577362260919215, + "grad_norm": 2.061368227005005, + "learning_rate": 4.961997749925405e-05, + "loss": 5.4491, + "step": 9345 + }, + { + "epoch": 0.05558330954420021, + "grad_norm": 2.368156909942627, + "learning_rate": 4.961989636122812e-05, + "loss": 5.9053, + "step": 9346 + }, + { + "epoch": 0.055589256827481205, + "grad_norm": 2.562565803527832, + "learning_rate": 4.961981521460763e-05, + "loss": 5.7683, + "step": 9347 + }, + { + "epoch": 0.05559520411076221, + "grad_norm": 2.388779640197754, + "learning_rate": 4.961973405939262e-05, + "loss": 5.1235, + "step": 9348 + }, + { + "epoch": 0.0556011513940432, + "grad_norm": 2.546994686126709, + "learning_rate": 4.9619652895583104e-05, + "loss": 4.7793, + "step": 9349 + }, + { + "epoch": 0.0556070986773242, + "grad_norm": 2.379549026489258, + "learning_rate": 4.9619571723179135e-05, + "loss": 4.8949, + "step": 9350 + }, + { + "epoch": 0.05561304596060519, + "grad_norm": 2.1621344089508057, + "learning_rate": 4.961949054218072e-05, + "loss": 4.6824, + "step": 9351 + }, + { + "epoch": 0.055618993243886194, + "grad_norm": 2.136289119720459, + "learning_rate": 4.96194093525879e-05, + "loss": 4.834, + "step": 9352 + }, + { + "epoch": 0.05562494052716719, + "grad_norm": 2.3572680950164795, + "learning_rate": 4.9619328154400694e-05, + "loss": 4.9755, + "step": 9353 + }, + { + "epoch": 0.055630887810448185, + "grad_norm": 2.2439966201782227, + "learning_rate": 4.961924694761913e-05, + "loss": 5.7662, + "step": 9354 + }, + { + "epoch": 0.05563683509372919, + "grad_norm": 2.287597894668579, + "learning_rate": 4.961916573224326e-05, + "loss": 4.6108, + "step": 9355 + }, + { + "epoch": 0.05564278237701018, + "grad_norm": 2.1382369995117188, + "learning_rate": 4.961908450827308e-05, + "loss": 4.5993, + "step": 9356 + }, + { + "epoch": 0.05564872966029118, + "grad_norm": 2.112348794937134, + "learning_rate": 4.961900327570863e-05, + "loss": 4.6798, + "step": 9357 + }, + { + "epoch": 0.05565467694357218, + "grad_norm": 2.0453972816467285, + "learning_rate": 4.9618922034549946e-05, + "loss": 4.5424, + "step": 9358 + }, + { + "epoch": 0.055660624226853174, + "grad_norm": 2.0547754764556885, + "learning_rate": 4.961884078479705e-05, + "loss": 5.0661, + "step": 9359 + }, + { + "epoch": 0.05566657151013417, + "grad_norm": 2.5003650188446045, + "learning_rate": 4.9618759526449965e-05, + "loss": 5.3388, + "step": 9360 + }, + { + "epoch": 0.05567251879341517, + "grad_norm": 2.0582423210144043, + "learning_rate": 4.9618678259508736e-05, + "loss": 5.8437, + "step": 9361 + }, + { + "epoch": 0.055678466076696166, + "grad_norm": 1.7867279052734375, + "learning_rate": 4.9618596983973376e-05, + "loss": 5.369, + "step": 9362 + }, + { + "epoch": 0.05568441335997716, + "grad_norm": 2.03729248046875, + "learning_rate": 4.961851569984392e-05, + "loss": 5.9932, + "step": 9363 + }, + { + "epoch": 0.055690360643258156, + "grad_norm": 2.2527456283569336, + "learning_rate": 4.961843440712038e-05, + "loss": 5.893, + "step": 9364 + }, + { + "epoch": 0.05569630792653916, + "grad_norm": 2.0027201175689697, + "learning_rate": 4.9618353105802815e-05, + "loss": 5.8216, + "step": 9365 + }, + { + "epoch": 0.05570225520982015, + "grad_norm": 2.236548662185669, + "learning_rate": 4.961827179589124e-05, + "loss": 5.5371, + "step": 9366 + }, + { + "epoch": 0.05570820249310115, + "grad_norm": 2.4477334022521973, + "learning_rate": 4.9618190477385666e-05, + "loss": 5.6552, + "step": 9367 + }, + { + "epoch": 0.05571414977638215, + "grad_norm": 2.504549026489258, + "learning_rate": 4.9618109150286145e-05, + "loss": 5.5732, + "step": 9368 + }, + { + "epoch": 0.055720097059663146, + "grad_norm": 2.1413187980651855, + "learning_rate": 4.9618027814592695e-05, + "loss": 5.1792, + "step": 9369 + }, + { + "epoch": 0.05572604434294414, + "grad_norm": 2.1714866161346436, + "learning_rate": 4.9617946470305344e-05, + "loss": 5.3444, + "step": 9370 + }, + { + "epoch": 0.05573199162622514, + "grad_norm": 1.7478383779525757, + "learning_rate": 4.9617865117424126e-05, + "loss": 5.7151, + "step": 9371 + }, + { + "epoch": 0.05573793890950614, + "grad_norm": 2.0415220260620117, + "learning_rate": 4.9617783755949067e-05, + "loss": 5.8765, + "step": 9372 + }, + { + "epoch": 0.05574388619278713, + "grad_norm": 1.917108416557312, + "learning_rate": 4.961770238588019e-05, + "loss": 6.0797, + "step": 9373 + }, + { + "epoch": 0.055749833476068135, + "grad_norm": 1.9404850006103516, + "learning_rate": 4.961762100721753e-05, + "loss": 6.1376, + "step": 9374 + }, + { + "epoch": 0.05575578075934913, + "grad_norm": 1.7101916074752808, + "learning_rate": 4.9617539619961104e-05, + "loss": 5.9375, + "step": 9375 + }, + { + "epoch": 0.055761728042630125, + "grad_norm": 2.591960906982422, + "learning_rate": 4.9617458224110954e-05, + "loss": 5.3716, + "step": 9376 + }, + { + "epoch": 0.05576767532591113, + "grad_norm": 2.070600986480713, + "learning_rate": 4.961737681966711e-05, + "loss": 5.3822, + "step": 9377 + }, + { + "epoch": 0.05577362260919212, + "grad_norm": 2.100820302963257, + "learning_rate": 4.9617295406629594e-05, + "loss": 5.7703, + "step": 9378 + }, + { + "epoch": 0.05577956989247312, + "grad_norm": 2.2413878440856934, + "learning_rate": 4.961721398499843e-05, + "loss": 4.9197, + "step": 9379 + }, + { + "epoch": 0.05578551717575411, + "grad_norm": 1.9762401580810547, + "learning_rate": 4.961713255477365e-05, + "loss": 5.6705, + "step": 9380 + }, + { + "epoch": 0.055791464459035114, + "grad_norm": 2.22676420211792, + "learning_rate": 4.961705111595528e-05, + "loss": 5.0196, + "step": 9381 + }, + { + "epoch": 0.05579741174231611, + "grad_norm": 2.0652241706848145, + "learning_rate": 4.9616969668543364e-05, + "loss": 5.3894, + "step": 9382 + }, + { + "epoch": 0.055803359025597105, + "grad_norm": 2.156890630722046, + "learning_rate": 4.96168882125379e-05, + "loss": 5.3063, + "step": 9383 + }, + { + "epoch": 0.05580930630887811, + "grad_norm": 2.131964683532715, + "learning_rate": 4.961680674793895e-05, + "loss": 5.9304, + "step": 9384 + }, + { + "epoch": 0.0558152535921591, + "grad_norm": 2.2117621898651123, + "learning_rate": 4.9616725274746525e-05, + "loss": 5.9553, + "step": 9385 + }, + { + "epoch": 0.0558212008754401, + "grad_norm": 2.3511440753936768, + "learning_rate": 4.9616643792960654e-05, + "loss": 5.9911, + "step": 9386 + }, + { + "epoch": 0.0558271481587211, + "grad_norm": 1.7709077596664429, + "learning_rate": 4.961656230258136e-05, + "loss": 5.6291, + "step": 9387 + }, + { + "epoch": 0.055833095442002094, + "grad_norm": 1.838767170906067, + "learning_rate": 4.961648080360869e-05, + "loss": 6.0152, + "step": 9388 + }, + { + "epoch": 0.05583904272528309, + "grad_norm": 2.117058515548706, + "learning_rate": 4.9616399296042656e-05, + "loss": 4.8079, + "step": 9389 + }, + { + "epoch": 0.05584499000856409, + "grad_norm": 2.147491693496704, + "learning_rate": 4.9616317779883293e-05, + "loss": 4.6489, + "step": 9390 + }, + { + "epoch": 0.055850937291845086, + "grad_norm": 2.1025705337524414, + "learning_rate": 4.961623625513062e-05, + "loss": 4.4984, + "step": 9391 + }, + { + "epoch": 0.05585688457512608, + "grad_norm": 1.799986720085144, + "learning_rate": 4.961615472178468e-05, + "loss": 5.1008, + "step": 9392 + }, + { + "epoch": 0.055862831858407076, + "grad_norm": 2.2975053787231445, + "learning_rate": 4.961607317984549e-05, + "loss": 5.9754, + "step": 9393 + }, + { + "epoch": 0.05586877914168808, + "grad_norm": 1.9996155500411987, + "learning_rate": 4.961599162931309e-05, + "loss": 5.9255, + "step": 9394 + }, + { + "epoch": 0.05587472642496907, + "grad_norm": 1.7344794273376465, + "learning_rate": 4.9615910070187496e-05, + "loss": 6.0873, + "step": 9395 + }, + { + "epoch": 0.05588067370825007, + "grad_norm": 2.260706901550293, + "learning_rate": 4.961582850246875e-05, + "loss": 5.9454, + "step": 9396 + }, + { + "epoch": 0.05588662099153107, + "grad_norm": 2.1810765266418457, + "learning_rate": 4.961574692615686e-05, + "loss": 5.7548, + "step": 9397 + }, + { + "epoch": 0.055892568274812066, + "grad_norm": 2.0940003395080566, + "learning_rate": 4.961566534125188e-05, + "loss": 5.8184, + "step": 9398 + }, + { + "epoch": 0.05589851555809306, + "grad_norm": 2.066464900970459, + "learning_rate": 4.961558374775382e-05, + "loss": 5.7867, + "step": 9399 + }, + { + "epoch": 0.05590446284137406, + "grad_norm": 1.7197705507278442, + "learning_rate": 4.961550214566271e-05, + "loss": 5.9211, + "step": 9400 + }, + { + "epoch": 0.05591041012465506, + "grad_norm": 2.3055293560028076, + "learning_rate": 4.9615420534978583e-05, + "loss": 5.9531, + "step": 9401 + }, + { + "epoch": 0.05591635740793605, + "grad_norm": 2.0974669456481934, + "learning_rate": 4.961533891570147e-05, + "loss": 5.9347, + "step": 9402 + }, + { + "epoch": 0.055922304691217055, + "grad_norm": 2.5196354389190674, + "learning_rate": 4.96152572878314e-05, + "loss": 5.0729, + "step": 9403 + }, + { + "epoch": 0.05592825197449805, + "grad_norm": 2.157181978225708, + "learning_rate": 4.9615175651368395e-05, + "loss": 5.9513, + "step": 9404 + }, + { + "epoch": 0.055934199257779045, + "grad_norm": 1.94083833694458, + "learning_rate": 4.9615094006312485e-05, + "loss": 5.9239, + "step": 9405 + }, + { + "epoch": 0.05594014654106005, + "grad_norm": 2.2118191719055176, + "learning_rate": 4.9615012352663704e-05, + "loss": 5.6936, + "step": 9406 + }, + { + "epoch": 0.05594609382434104, + "grad_norm": 2.2255051136016846, + "learning_rate": 4.9614930690422065e-05, + "loss": 5.7475, + "step": 9407 + }, + { + "epoch": 0.05595204110762204, + "grad_norm": 2.1640844345092773, + "learning_rate": 4.961484901958762e-05, + "loss": 5.8138, + "step": 9408 + }, + { + "epoch": 0.05595798839090303, + "grad_norm": 2.2722928524017334, + "learning_rate": 4.961476734016038e-05, + "loss": 5.5784, + "step": 9409 + }, + { + "epoch": 0.055963935674184034, + "grad_norm": 2.0541749000549316, + "learning_rate": 4.961468565214039e-05, + "loss": 5.6871, + "step": 9410 + }, + { + "epoch": 0.05596988295746503, + "grad_norm": 2.3496010303497314, + "learning_rate": 4.9614603955527655e-05, + "loss": 5.4195, + "step": 9411 + }, + { + "epoch": 0.055975830240746025, + "grad_norm": 2.333435297012329, + "learning_rate": 4.9614522250322215e-05, + "loss": 5.4257, + "step": 9412 + }, + { + "epoch": 0.05598177752402703, + "grad_norm": 2.339057445526123, + "learning_rate": 4.9614440536524106e-05, + "loss": 5.4158, + "step": 9413 + }, + { + "epoch": 0.05598772480730802, + "grad_norm": 2.4383058547973633, + "learning_rate": 4.961435881413335e-05, + "loss": 5.4569, + "step": 9414 + }, + { + "epoch": 0.05599367209058902, + "grad_norm": 2.1405389308929443, + "learning_rate": 4.961427708314997e-05, + "loss": 5.6178, + "step": 9415 + }, + { + "epoch": 0.05599961937387002, + "grad_norm": 2.2082836627960205, + "learning_rate": 4.961419534357401e-05, + "loss": 5.386, + "step": 9416 + }, + { + "epoch": 0.056005566657151014, + "grad_norm": 2.0305027961730957, + "learning_rate": 4.961411359540548e-05, + "loss": 5.2822, + "step": 9417 + }, + { + "epoch": 0.05601151394043201, + "grad_norm": 2.606452226638794, + "learning_rate": 4.961403183864442e-05, + "loss": 5.2691, + "step": 9418 + }, + { + "epoch": 0.05601746122371301, + "grad_norm": 2.3506669998168945, + "learning_rate": 4.961395007329086e-05, + "loss": 5.3307, + "step": 9419 + }, + { + "epoch": 0.056023408506994006, + "grad_norm": 2.3472225666046143, + "learning_rate": 4.961386829934482e-05, + "loss": 5.2247, + "step": 9420 + }, + { + "epoch": 0.056029355790275, + "grad_norm": 2.1121721267700195, + "learning_rate": 4.961378651680633e-05, + "loss": 5.2857, + "step": 9421 + }, + { + "epoch": 0.056035303073555996, + "grad_norm": 2.4357142448425293, + "learning_rate": 4.9613704725675427e-05, + "loss": 5.3398, + "step": 9422 + }, + { + "epoch": 0.056041250356837, + "grad_norm": 2.639418125152588, + "learning_rate": 4.961362292595213e-05, + "loss": 5.3008, + "step": 9423 + }, + { + "epoch": 0.05604719764011799, + "grad_norm": 3.297189712524414, + "learning_rate": 4.961354111763647e-05, + "loss": 5.5908, + "step": 9424 + }, + { + "epoch": 0.05605314492339899, + "grad_norm": 2.095613718032837, + "learning_rate": 4.961345930072848e-05, + "loss": 5.2389, + "step": 9425 + }, + { + "epoch": 0.05605909220667999, + "grad_norm": 2.2495081424713135, + "learning_rate": 4.9613377475228186e-05, + "loss": 5.474, + "step": 9426 + }, + { + "epoch": 0.056065039489960986, + "grad_norm": 2.282697916030884, + "learning_rate": 4.961329564113562e-05, + "loss": 5.3253, + "step": 9427 + }, + { + "epoch": 0.05607098677324198, + "grad_norm": 2.515075206756592, + "learning_rate": 4.96132137984508e-05, + "loss": 5.238, + "step": 9428 + }, + { + "epoch": 0.05607693405652298, + "grad_norm": 2.072274684906006, + "learning_rate": 4.961313194717376e-05, + "loss": 5.3627, + "step": 9429 + }, + { + "epoch": 0.05608288133980398, + "grad_norm": 2.4552547931671143, + "learning_rate": 4.961305008730454e-05, + "loss": 6.1799, + "step": 9430 + }, + { + "epoch": 0.05608882862308497, + "grad_norm": 2.2289538383483887, + "learning_rate": 4.9612968218843146e-05, + "loss": 5.5477, + "step": 9431 + }, + { + "epoch": 0.056094775906365975, + "grad_norm": 2.6174185276031494, + "learning_rate": 4.9612886341789635e-05, + "loss": 5.1779, + "step": 9432 + }, + { + "epoch": 0.05610072318964697, + "grad_norm": 2.4489150047302246, + "learning_rate": 4.9612804456144005e-05, + "loss": 5.2067, + "step": 9433 + }, + { + "epoch": 0.056106670472927965, + "grad_norm": 2.2651829719543457, + "learning_rate": 4.96127225619063e-05, + "loss": 5.3582, + "step": 9434 + }, + { + "epoch": 0.05611261775620897, + "grad_norm": 2.1985251903533936, + "learning_rate": 4.9612640659076556e-05, + "loss": 5.2034, + "step": 9435 + }, + { + "epoch": 0.05611856503948996, + "grad_norm": 1.9510128498077393, + "learning_rate": 4.961255874765479e-05, + "loss": 5.2263, + "step": 9436 + }, + { + "epoch": 0.05612451232277096, + "grad_norm": 2.338815212249756, + "learning_rate": 4.961247682764104e-05, + "loss": 5.9091, + "step": 9437 + }, + { + "epoch": 0.05613045960605195, + "grad_norm": 2.097111225128174, + "learning_rate": 4.961239489903532e-05, + "loss": 6.3285, + "step": 9438 + }, + { + "epoch": 0.056136406889332954, + "grad_norm": 1.9965720176696777, + "learning_rate": 4.961231296183767e-05, + "loss": 6.3141, + "step": 9439 + }, + { + "epoch": 0.05614235417261395, + "grad_norm": 2.2406206130981445, + "learning_rate": 4.9612231016048114e-05, + "loss": 5.7335, + "step": 9440 + }, + { + "epoch": 0.056148301455894944, + "grad_norm": 2.2798993587493896, + "learning_rate": 4.961214906166668e-05, + "loss": 4.9959, + "step": 9441 + }, + { + "epoch": 0.056154248739175947, + "grad_norm": 2.482706069946289, + "learning_rate": 4.96120670986934e-05, + "loss": 5.295, + "step": 9442 + }, + { + "epoch": 0.05616019602245694, + "grad_norm": 2.398867607116699, + "learning_rate": 4.961198512712831e-05, + "loss": 4.9592, + "step": 9443 + }, + { + "epoch": 0.05616614330573794, + "grad_norm": 2.1979055404663086, + "learning_rate": 4.961190314697143e-05, + "loss": 5.1003, + "step": 9444 + }, + { + "epoch": 0.05617209058901894, + "grad_norm": 2.3249244689941406, + "learning_rate": 4.961182115822278e-05, + "loss": 5.1408, + "step": 9445 + }, + { + "epoch": 0.056178037872299934, + "grad_norm": 2.3679821491241455, + "learning_rate": 4.96117391608824e-05, + "loss": 5.4006, + "step": 9446 + }, + { + "epoch": 0.05618398515558093, + "grad_norm": 1.8706363439559937, + "learning_rate": 4.961165715495032e-05, + "loss": 6.1741, + "step": 9447 + }, + { + "epoch": 0.05618993243886193, + "grad_norm": 2.1825344562530518, + "learning_rate": 4.961157514042656e-05, + "loss": 6.0869, + "step": 9448 + }, + { + "epoch": 0.056195879722142926, + "grad_norm": 1.85076904296875, + "learning_rate": 4.961149311731116e-05, + "loss": 5.9252, + "step": 9449 + }, + { + "epoch": 0.05620182700542392, + "grad_norm": 1.9433631896972656, + "learning_rate": 4.961141108560413e-05, + "loss": 5.968, + "step": 9450 + }, + { + "epoch": 0.056207774288704916, + "grad_norm": 2.5718259811401367, + "learning_rate": 4.961132904530552e-05, + "loss": 5.4274, + "step": 9451 + }, + { + "epoch": 0.05621372157198592, + "grad_norm": 1.919552206993103, + "learning_rate": 4.961124699641535e-05, + "loss": 5.1943, + "step": 9452 + }, + { + "epoch": 0.05621966885526691, + "grad_norm": 2.1371817588806152, + "learning_rate": 4.961116493893364e-05, + "loss": 5.9949, + "step": 9453 + }, + { + "epoch": 0.05622561613854791, + "grad_norm": 2.5715489387512207, + "learning_rate": 4.961108287286044e-05, + "loss": 6.2061, + "step": 9454 + }, + { + "epoch": 0.05623156342182891, + "grad_norm": 2.1871471405029297, + "learning_rate": 4.961100079819575e-05, + "loss": 5.7872, + "step": 9455 + }, + { + "epoch": 0.056237510705109905, + "grad_norm": 2.011925220489502, + "learning_rate": 4.961091871493962e-05, + "loss": 5.7992, + "step": 9456 + }, + { + "epoch": 0.0562434579883909, + "grad_norm": 2.516580820083618, + "learning_rate": 4.9610836623092074e-05, + "loss": 5.9154, + "step": 9457 + }, + { + "epoch": 0.0562494052716719, + "grad_norm": 1.9336326122283936, + "learning_rate": 4.961075452265314e-05, + "loss": 5.7933, + "step": 9458 + }, + { + "epoch": 0.0562553525549529, + "grad_norm": 1.8404059410095215, + "learning_rate": 4.961067241362285e-05, + "loss": 6.1897, + "step": 9459 + }, + { + "epoch": 0.05626129983823389, + "grad_norm": 1.9757578372955322, + "learning_rate": 4.961059029600122e-05, + "loss": 6.0909, + "step": 9460 + }, + { + "epoch": 0.056267247121514895, + "grad_norm": 1.9767241477966309, + "learning_rate": 4.9610508169788294e-05, + "loss": 6.2212, + "step": 9461 + }, + { + "epoch": 0.05627319440479589, + "grad_norm": 1.9890403747558594, + "learning_rate": 4.961042603498409e-05, + "loss": 6.5071, + "step": 9462 + }, + { + "epoch": 0.056279141688076885, + "grad_norm": 1.9011937379837036, + "learning_rate": 4.961034389158864e-05, + "loss": 5.8098, + "step": 9463 + }, + { + "epoch": 0.05628508897135789, + "grad_norm": 2.236356735229492, + "learning_rate": 4.961026173960197e-05, + "loss": 4.8901, + "step": 9464 + }, + { + "epoch": 0.05629103625463888, + "grad_norm": 1.9147372245788574, + "learning_rate": 4.961017957902412e-05, + "loss": 5.1372, + "step": 9465 + }, + { + "epoch": 0.05629698353791988, + "grad_norm": 1.9628163576126099, + "learning_rate": 4.9610097409855106e-05, + "loss": 5.1161, + "step": 9466 + }, + { + "epoch": 0.05630293082120087, + "grad_norm": 2.0323991775512695, + "learning_rate": 4.961001523209496e-05, + "loss": 5.1493, + "step": 9467 + }, + { + "epoch": 0.056308878104481874, + "grad_norm": 1.7026360034942627, + "learning_rate": 4.9609933045743714e-05, + "loss": 5.2349, + "step": 9468 + }, + { + "epoch": 0.05631482538776287, + "grad_norm": 1.7758761644363403, + "learning_rate": 4.9609850850801394e-05, + "loss": 5.231, + "step": 9469 + }, + { + "epoch": 0.056320772671043864, + "grad_norm": 2.3305037021636963, + "learning_rate": 4.9609768647268026e-05, + "loss": 5.9209, + "step": 9470 + }, + { + "epoch": 0.056326719954324866, + "grad_norm": 2.2628681659698486, + "learning_rate": 4.960968643514365e-05, + "loss": 5.4753, + "step": 9471 + }, + { + "epoch": 0.05633266723760586, + "grad_norm": 2.4022347927093506, + "learning_rate": 4.9609604214428286e-05, + "loss": 4.8414, + "step": 9472 + }, + { + "epoch": 0.05633861452088686, + "grad_norm": 2.2767343521118164, + "learning_rate": 4.9609521985121955e-05, + "loss": 4.7178, + "step": 9473 + }, + { + "epoch": 0.05634456180416786, + "grad_norm": 2.547600507736206, + "learning_rate": 4.96094397472247e-05, + "loss": 4.7365, + "step": 9474 + }, + { + "epoch": 0.056350509087448854, + "grad_norm": 2.3546998500823975, + "learning_rate": 4.960935750073654e-05, + "loss": 5.4846, + "step": 9475 + }, + { + "epoch": 0.05635645637072985, + "grad_norm": 2.9641268253326416, + "learning_rate": 4.960927524565751e-05, + "loss": 5.7409, + "step": 9476 + }, + { + "epoch": 0.05636240365401085, + "grad_norm": 3.1727824211120605, + "learning_rate": 4.960919298198764e-05, + "loss": 5.8456, + "step": 9477 + }, + { + "epoch": 0.056368350937291846, + "grad_norm": 2.620507001876831, + "learning_rate": 4.960911070972695e-05, + "loss": 5.6295, + "step": 9478 + }, + { + "epoch": 0.05637429822057284, + "grad_norm": 2.6132571697235107, + "learning_rate": 4.960902842887548e-05, + "loss": 5.697, + "step": 9479 + }, + { + "epoch": 0.056380245503853836, + "grad_norm": 2.2931299209594727, + "learning_rate": 4.960894613943324e-05, + "loss": 5.4723, + "step": 9480 + }, + { + "epoch": 0.05638619278713484, + "grad_norm": 2.176729202270508, + "learning_rate": 4.9608863841400284e-05, + "loss": 5.7403, + "step": 9481 + }, + { + "epoch": 0.05639214007041583, + "grad_norm": 1.932180404663086, + "learning_rate": 4.9608781534776616e-05, + "loss": 5.9256, + "step": 9482 + }, + { + "epoch": 0.05639808735369683, + "grad_norm": 1.7315243482589722, + "learning_rate": 4.9608699219562286e-05, + "loss": 5.9176, + "step": 9483 + }, + { + "epoch": 0.05640403463697783, + "grad_norm": 1.6548408269882202, + "learning_rate": 4.9608616895757306e-05, + "loss": 5.7495, + "step": 9484 + }, + { + "epoch": 0.056409981920258825, + "grad_norm": 1.8549202680587769, + "learning_rate": 4.960853456336172e-05, + "loss": 5.5261, + "step": 9485 + }, + { + "epoch": 0.05641592920353982, + "grad_norm": 2.5990993976593018, + "learning_rate": 4.9608452222375544e-05, + "loss": 5.5934, + "step": 9486 + }, + { + "epoch": 0.05642187648682082, + "grad_norm": 1.705051302909851, + "learning_rate": 4.9608369872798815e-05, + "loss": 5.3613, + "step": 9487 + }, + { + "epoch": 0.05642782377010182, + "grad_norm": 1.6170406341552734, + "learning_rate": 4.960828751463156e-05, + "loss": 5.2743, + "step": 9488 + }, + { + "epoch": 0.05643377105338281, + "grad_norm": 1.6247482299804688, + "learning_rate": 4.9608205147873796e-05, + "loss": 5.2772, + "step": 9489 + }, + { + "epoch": 0.056439718336663815, + "grad_norm": 1.7574137449264526, + "learning_rate": 4.9608122772525575e-05, + "loss": 5.3464, + "step": 9490 + }, + { + "epoch": 0.05644566561994481, + "grad_norm": 1.8814537525177002, + "learning_rate": 4.960804038858691e-05, + "loss": 5.3092, + "step": 9491 + }, + { + "epoch": 0.056451612903225805, + "grad_norm": 2.0222842693328857, + "learning_rate": 4.9607957996057816e-05, + "loss": 4.8234, + "step": 9492 + }, + { + "epoch": 0.05645756018650681, + "grad_norm": 1.6224759817123413, + "learning_rate": 4.960787559493836e-05, + "loss": 5.3962, + "step": 9493 + }, + { + "epoch": 0.0564635074697878, + "grad_norm": 1.4097533226013184, + "learning_rate": 4.960779318522853e-05, + "loss": 5.8302, + "step": 9494 + }, + { + "epoch": 0.0564694547530688, + "grad_norm": 1.7296205759048462, + "learning_rate": 4.960771076692839e-05, + "loss": 5.5679, + "step": 9495 + }, + { + "epoch": 0.05647540203634979, + "grad_norm": 1.6300212144851685, + "learning_rate": 4.960762834003794e-05, + "loss": 5.4315, + "step": 9496 + }, + { + "epoch": 0.056481349319630794, + "grad_norm": 1.8587864637374878, + "learning_rate": 4.960754590455723e-05, + "loss": 5.5492, + "step": 9497 + }, + { + "epoch": 0.05648729660291179, + "grad_norm": 1.8136985301971436, + "learning_rate": 4.960746346048628e-05, + "loss": 5.6363, + "step": 9498 + }, + { + "epoch": 0.056493243886192784, + "grad_norm": 2.1277284622192383, + "learning_rate": 4.960738100782511e-05, + "loss": 5.593, + "step": 9499 + }, + { + "epoch": 0.056499191169473786, + "grad_norm": 2.0262863636016846, + "learning_rate": 4.960729854657377e-05, + "loss": 5.6396, + "step": 9500 + }, + { + "epoch": 0.05650513845275478, + "grad_norm": 1.7870309352874756, + "learning_rate": 4.9607216076732266e-05, + "loss": 5.6523, + "step": 9501 + }, + { + "epoch": 0.05651108573603578, + "grad_norm": 1.734782099723816, + "learning_rate": 4.9607133598300636e-05, + "loss": 5.5313, + "step": 9502 + }, + { + "epoch": 0.05651703301931678, + "grad_norm": 2.2485032081604004, + "learning_rate": 4.9607051111278914e-05, + "loss": 5.3814, + "step": 9503 + }, + { + "epoch": 0.056522980302597774, + "grad_norm": 1.5091774463653564, + "learning_rate": 4.9606968615667125e-05, + "loss": 5.5277, + "step": 9504 + }, + { + "epoch": 0.05652892758587877, + "grad_norm": 1.7117774486541748, + "learning_rate": 4.9606886111465303e-05, + "loss": 5.2649, + "step": 9505 + }, + { + "epoch": 0.05653487486915977, + "grad_norm": 1.7309353351593018, + "learning_rate": 4.960680359867346e-05, + "loss": 5.2276, + "step": 9506 + }, + { + "epoch": 0.056540822152440766, + "grad_norm": 1.7058963775634766, + "learning_rate": 4.960672107729164e-05, + "loss": 5.1848, + "step": 9507 + }, + { + "epoch": 0.05654676943572176, + "grad_norm": 1.7862296104431152, + "learning_rate": 4.960663854731987e-05, + "loss": 5.2424, + "step": 9508 + }, + { + "epoch": 0.05655271671900276, + "grad_norm": 1.8900794982910156, + "learning_rate": 4.960655600875818e-05, + "loss": 5.283, + "step": 9509 + }, + { + "epoch": 0.05655866400228376, + "grad_norm": 1.9991587400436401, + "learning_rate": 4.960647346160658e-05, + "loss": 5.3525, + "step": 9510 + }, + { + "epoch": 0.05656461128556475, + "grad_norm": 1.6889851093292236, + "learning_rate": 4.960639090586513e-05, + "loss": 5.0592, + "step": 9511 + }, + { + "epoch": 0.05657055856884575, + "grad_norm": 1.6314234733581543, + "learning_rate": 4.9606308341533844e-05, + "loss": 5.1733, + "step": 9512 + }, + { + "epoch": 0.05657650585212675, + "grad_norm": 1.7801847457885742, + "learning_rate": 4.960622576861275e-05, + "loss": 5.2358, + "step": 9513 + }, + { + "epoch": 0.056582453135407745, + "grad_norm": 1.6572017669677734, + "learning_rate": 4.9606143187101864e-05, + "loss": 5.2429, + "step": 9514 + }, + { + "epoch": 0.05658840041868874, + "grad_norm": 1.7574421167373657, + "learning_rate": 4.960606059700124e-05, + "loss": 5.0717, + "step": 9515 + }, + { + "epoch": 0.05659434770196974, + "grad_norm": 1.8162970542907715, + "learning_rate": 4.960597799831088e-05, + "loss": 5.1513, + "step": 9516 + }, + { + "epoch": 0.05660029498525074, + "grad_norm": 1.9231795072555542, + "learning_rate": 4.960589539103084e-05, + "loss": 5.1539, + "step": 9517 + }, + { + "epoch": 0.05660624226853173, + "grad_norm": 1.624566674232483, + "learning_rate": 4.9605812775161136e-05, + "loss": 5.0999, + "step": 9518 + }, + { + "epoch": 0.056612189551812735, + "grad_norm": 1.4293668270111084, + "learning_rate": 4.960573015070179e-05, + "loss": 5.2365, + "step": 9519 + }, + { + "epoch": 0.05661813683509373, + "grad_norm": 1.789515495300293, + "learning_rate": 4.960564751765284e-05, + "loss": 5.2233, + "step": 9520 + }, + { + "epoch": 0.056624084118374725, + "grad_norm": 1.7212306261062622, + "learning_rate": 4.960556487601432e-05, + "loss": 5.1902, + "step": 9521 + }, + { + "epoch": 0.05663003140165573, + "grad_norm": 1.7691519260406494, + "learning_rate": 4.960548222578625e-05, + "loss": 5.2136, + "step": 9522 + }, + { + "epoch": 0.05663597868493672, + "grad_norm": 1.5925794839859009, + "learning_rate": 4.960539956696866e-05, + "loss": 5.4808, + "step": 9523 + }, + { + "epoch": 0.05664192596821772, + "grad_norm": 1.7014095783233643, + "learning_rate": 4.960531689956157e-05, + "loss": 5.1934, + "step": 9524 + }, + { + "epoch": 0.05664787325149871, + "grad_norm": 1.3620802164077759, + "learning_rate": 4.960523422356502e-05, + "loss": 5.0169, + "step": 9525 + }, + { + "epoch": 0.056653820534779714, + "grad_norm": 1.4778205156326294, + "learning_rate": 4.960515153897904e-05, + "loss": 5.1535, + "step": 9526 + }, + { + "epoch": 0.05665976781806071, + "grad_norm": 1.6393300294876099, + "learning_rate": 4.960506884580366e-05, + "loss": 5.2494, + "step": 9527 + }, + { + "epoch": 0.056665715101341704, + "grad_norm": 1.6070711612701416, + "learning_rate": 4.96049861440389e-05, + "loss": 5.3117, + "step": 9528 + }, + { + "epoch": 0.056671662384622706, + "grad_norm": 1.6023461818695068, + "learning_rate": 4.96049034336848e-05, + "loss": 5.1554, + "step": 9529 + }, + { + "epoch": 0.0566776096679037, + "grad_norm": 1.6061514616012573, + "learning_rate": 4.9604820714741374e-05, + "loss": 5.4123, + "step": 9530 + }, + { + "epoch": 0.056683556951184697, + "grad_norm": 1.8043792247772217, + "learning_rate": 4.960473798720866e-05, + "loss": 5.2582, + "step": 9531 + }, + { + "epoch": 0.0566895042344657, + "grad_norm": 1.6002432107925415, + "learning_rate": 4.960465525108669e-05, + "loss": 5.211, + "step": 9532 + }, + { + "epoch": 0.056695451517746694, + "grad_norm": 1.851266622543335, + "learning_rate": 4.960457250637549e-05, + "loss": 5.0949, + "step": 9533 + }, + { + "epoch": 0.05670139880102769, + "grad_norm": 1.7806520462036133, + "learning_rate": 4.9604489753075085e-05, + "loss": 5.1178, + "step": 9534 + }, + { + "epoch": 0.05670734608430869, + "grad_norm": 1.9938620328903198, + "learning_rate": 4.9604406991185506e-05, + "loss": 5.098, + "step": 9535 + }, + { + "epoch": 0.056713293367589686, + "grad_norm": 1.7983622550964355, + "learning_rate": 4.960432422070679e-05, + "loss": 4.98, + "step": 9536 + }, + { + "epoch": 0.05671924065087068, + "grad_norm": 1.845821499824524, + "learning_rate": 4.960424144163895e-05, + "loss": 4.951, + "step": 9537 + }, + { + "epoch": 0.05672518793415168, + "grad_norm": 1.8922109603881836, + "learning_rate": 4.960415865398202e-05, + "loss": 5.0327, + "step": 9538 + }, + { + "epoch": 0.05673113521743268, + "grad_norm": 2.159832239151001, + "learning_rate": 4.960407585773604e-05, + "loss": 5.5287, + "step": 9539 + }, + { + "epoch": 0.05673708250071367, + "grad_norm": 1.9966739416122437, + "learning_rate": 4.960399305290103e-05, + "loss": 5.7114, + "step": 9540 + }, + { + "epoch": 0.05674302978399467, + "grad_norm": 1.8796072006225586, + "learning_rate": 4.9603910239477026e-05, + "loss": 5.4673, + "step": 9541 + }, + { + "epoch": 0.05674897706727567, + "grad_norm": 1.6589174270629883, + "learning_rate": 4.9603827417464045e-05, + "loss": 5.3755, + "step": 9542 + }, + { + "epoch": 0.056754924350556665, + "grad_norm": 1.975807547569275, + "learning_rate": 4.960374458686212e-05, + "loss": 5.0648, + "step": 9543 + }, + { + "epoch": 0.05676087163383766, + "grad_norm": 1.7437241077423096, + "learning_rate": 4.960366174767128e-05, + "loss": 5.2338, + "step": 9544 + }, + { + "epoch": 0.05676681891711866, + "grad_norm": 1.8508884906768799, + "learning_rate": 4.9603578899891564e-05, + "loss": 5.3432, + "step": 9545 + }, + { + "epoch": 0.05677276620039966, + "grad_norm": 2.2117562294006348, + "learning_rate": 4.960349604352299e-05, + "loss": 5.0623, + "step": 9546 + }, + { + "epoch": 0.05677871348368065, + "grad_norm": 1.7681034803390503, + "learning_rate": 4.9603413178565586e-05, + "loss": 5.1998, + "step": 9547 + }, + { + "epoch": 0.056784660766961655, + "grad_norm": 2.4477179050445557, + "learning_rate": 4.960333030501939e-05, + "loss": 5.3317, + "step": 9548 + }, + { + "epoch": 0.05679060805024265, + "grad_norm": 1.8297652006149292, + "learning_rate": 4.9603247422884426e-05, + "loss": 5.3608, + "step": 9549 + }, + { + "epoch": 0.056796555333523645, + "grad_norm": 1.8361153602600098, + "learning_rate": 4.9603164532160715e-05, + "loss": 5.3914, + "step": 9550 + }, + { + "epoch": 0.05680250261680465, + "grad_norm": 1.748226523399353, + "learning_rate": 4.96030816328483e-05, + "loss": 5.3436, + "step": 9551 + }, + { + "epoch": 0.05680844990008564, + "grad_norm": 1.744964599609375, + "learning_rate": 4.96029987249472e-05, + "loss": 5.4287, + "step": 9552 + }, + { + "epoch": 0.05681439718336664, + "grad_norm": 1.9512866735458374, + "learning_rate": 4.9602915808457454e-05, + "loss": 5.3601, + "step": 9553 + }, + { + "epoch": 0.05682034446664763, + "grad_norm": 1.5863629579544067, + "learning_rate": 4.9602832883379077e-05, + "loss": 5.5491, + "step": 9554 + }, + { + "epoch": 0.056826291749928634, + "grad_norm": 1.967677354812622, + "learning_rate": 4.96027499497121e-05, + "loss": 5.2402, + "step": 9555 + }, + { + "epoch": 0.05683223903320963, + "grad_norm": 2.277714252471924, + "learning_rate": 4.960266700745657e-05, + "loss": 5.5155, + "step": 9556 + }, + { + "epoch": 0.056838186316490624, + "grad_norm": 1.8371034860610962, + "learning_rate": 4.96025840566125e-05, + "loss": 5.2694, + "step": 9557 + }, + { + "epoch": 0.056844133599771626, + "grad_norm": 1.723008155822754, + "learning_rate": 4.9602501097179915e-05, + "loss": 5.4983, + "step": 9558 + }, + { + "epoch": 0.05685008088305262, + "grad_norm": 1.6955413818359375, + "learning_rate": 4.960241812915886e-05, + "loss": 5.6888, + "step": 9559 + }, + { + "epoch": 0.056856028166333616, + "grad_norm": 1.5899012088775635, + "learning_rate": 4.960233515254935e-05, + "loss": 5.4241, + "step": 9560 + }, + { + "epoch": 0.05686197544961462, + "grad_norm": 1.493268370628357, + "learning_rate": 4.9602252167351416e-05, + "loss": 5.1889, + "step": 9561 + }, + { + "epoch": 0.056867922732895614, + "grad_norm": 1.8037081956863403, + "learning_rate": 4.9602169173565094e-05, + "loss": 5.1785, + "step": 9562 + }, + { + "epoch": 0.05687387001617661, + "grad_norm": 1.6377664804458618, + "learning_rate": 4.960208617119041e-05, + "loss": 5.2593, + "step": 9563 + }, + { + "epoch": 0.05687981729945761, + "grad_norm": 2.077209234237671, + "learning_rate": 4.960200316022739e-05, + "loss": 5.1012, + "step": 9564 + }, + { + "epoch": 0.056885764582738606, + "grad_norm": 2.3584885597229004, + "learning_rate": 4.9601920140676064e-05, + "loss": 5.1141, + "step": 9565 + }, + { + "epoch": 0.0568917118660196, + "grad_norm": 1.990319013595581, + "learning_rate": 4.960183711253646e-05, + "loss": 4.9336, + "step": 9566 + }, + { + "epoch": 0.0568976591493006, + "grad_norm": 2.037742853164673, + "learning_rate": 4.960175407580861e-05, + "loss": 4.8494, + "step": 9567 + }, + { + "epoch": 0.0569036064325816, + "grad_norm": 1.8493839502334595, + "learning_rate": 4.9601671030492546e-05, + "loss": 5.337, + "step": 9568 + }, + { + "epoch": 0.05690955371586259, + "grad_norm": 1.9864604473114014, + "learning_rate": 4.960158797658829e-05, + "loss": 5.5684, + "step": 9569 + }, + { + "epoch": 0.05691550099914359, + "grad_norm": 1.9740629196166992, + "learning_rate": 4.960150491409587e-05, + "loss": 5.444, + "step": 9570 + }, + { + "epoch": 0.05692144828242459, + "grad_norm": 1.9429807662963867, + "learning_rate": 4.960142184301533e-05, + "loss": 5.277, + "step": 9571 + }, + { + "epoch": 0.056927395565705585, + "grad_norm": 1.8953512907028198, + "learning_rate": 4.960133876334668e-05, + "loss": 5.1694, + "step": 9572 + }, + { + "epoch": 0.05693334284898658, + "grad_norm": 1.7716888189315796, + "learning_rate": 4.960125567508996e-05, + "loss": 5.1383, + "step": 9573 + }, + { + "epoch": 0.05693929013226758, + "grad_norm": 1.8266246318817139, + "learning_rate": 4.9601172578245194e-05, + "loss": 5.4019, + "step": 9574 + }, + { + "epoch": 0.05694523741554858, + "grad_norm": 1.8929648399353027, + "learning_rate": 4.9601089472812414e-05, + "loss": 5.3948, + "step": 9575 + }, + { + "epoch": 0.05695118469882957, + "grad_norm": 1.9918208122253418, + "learning_rate": 4.960100635879165e-05, + "loss": 5.3195, + "step": 9576 + }, + { + "epoch": 0.056957131982110575, + "grad_norm": 1.4987989664077759, + "learning_rate": 4.960092323618292e-05, + "loss": 5.5292, + "step": 9577 + }, + { + "epoch": 0.05696307926539157, + "grad_norm": 1.683800220489502, + "learning_rate": 4.960084010498627e-05, + "loss": 5.5069, + "step": 9578 + }, + { + "epoch": 0.056969026548672565, + "grad_norm": 1.767561435699463, + "learning_rate": 4.960075696520171e-05, + "loss": 5.4134, + "step": 9579 + }, + { + "epoch": 0.05697497383195357, + "grad_norm": 2.077564239501953, + "learning_rate": 4.960067381682929e-05, + "loss": 5.3362, + "step": 9580 + }, + { + "epoch": 0.05698092111523456, + "grad_norm": 2.0167109966278076, + "learning_rate": 4.960059065986903e-05, + "loss": 5.4235, + "step": 9581 + }, + { + "epoch": 0.05698686839851556, + "grad_norm": 1.647669792175293, + "learning_rate": 4.9600507494320953e-05, + "loss": 5.3273, + "step": 9582 + }, + { + "epoch": 0.05699281568179655, + "grad_norm": 1.6051719188690186, + "learning_rate": 4.960042432018509e-05, + "loss": 5.2486, + "step": 9583 + }, + { + "epoch": 0.056998762965077554, + "grad_norm": 1.9283394813537598, + "learning_rate": 4.960034113746148e-05, + "loss": 5.233, + "step": 9584 + }, + { + "epoch": 0.05700471024835855, + "grad_norm": 1.6215802431106567, + "learning_rate": 4.960025794615014e-05, + "loss": 5.2322, + "step": 9585 + }, + { + "epoch": 0.057010657531639544, + "grad_norm": 1.8902918100357056, + "learning_rate": 4.960017474625111e-05, + "loss": 5.063, + "step": 9586 + }, + { + "epoch": 0.057016604814920546, + "grad_norm": 2.4694666862487793, + "learning_rate": 4.9600091537764415e-05, + "loss": 4.498, + "step": 9587 + }, + { + "epoch": 0.05702255209820154, + "grad_norm": 1.98915433883667, + "learning_rate": 4.960000832069007e-05, + "loss": 4.8781, + "step": 9588 + }, + { + "epoch": 0.057028499381482536, + "grad_norm": 2.0424818992614746, + "learning_rate": 4.9599925095028126e-05, + "loss": 5.5803, + "step": 9589 + }, + { + "epoch": 0.05703444666476354, + "grad_norm": 1.471275806427002, + "learning_rate": 4.95998418607786e-05, + "loss": 5.5604, + "step": 9590 + }, + { + "epoch": 0.057040393948044534, + "grad_norm": 1.6512761116027832, + "learning_rate": 4.959975861794152e-05, + "loss": 5.2147, + "step": 9591 + }, + { + "epoch": 0.05704634123132553, + "grad_norm": 1.6902865171432495, + "learning_rate": 4.959967536651693e-05, + "loss": 5.2654, + "step": 9592 + }, + { + "epoch": 0.05705228851460653, + "grad_norm": 1.5656665563583374, + "learning_rate": 4.9599592106504835e-05, + "loss": 5.1106, + "step": 9593 + }, + { + "epoch": 0.057058235797887526, + "grad_norm": 1.760901927947998, + "learning_rate": 4.959950883790528e-05, + "loss": 5.1833, + "step": 9594 + }, + { + "epoch": 0.05706418308116852, + "grad_norm": 1.5585325956344604, + "learning_rate": 4.9599425560718294e-05, + "loss": 5.202, + "step": 9595 + }, + { + "epoch": 0.05707013036444952, + "grad_norm": 1.5477479696273804, + "learning_rate": 4.959934227494389e-05, + "loss": 5.121, + "step": 9596 + }, + { + "epoch": 0.05707607764773052, + "grad_norm": 1.9299825429916382, + "learning_rate": 4.959925898058213e-05, + "loss": 5.0026, + "step": 9597 + }, + { + "epoch": 0.05708202493101151, + "grad_norm": 1.866237759590149, + "learning_rate": 4.959917567763301e-05, + "loss": 4.999, + "step": 9598 + }, + { + "epoch": 0.05708797221429251, + "grad_norm": 1.6670162677764893, + "learning_rate": 4.959909236609657e-05, + "loss": 5.4047, + "step": 9599 + }, + { + "epoch": 0.05709391949757351, + "grad_norm": 1.4666836261749268, + "learning_rate": 4.9599009045972844e-05, + "loss": 5.3598, + "step": 9600 + }, + { + "epoch": 0.057099866780854505, + "grad_norm": 1.928645372390747, + "learning_rate": 4.959892571726186e-05, + "loss": 5.7015, + "step": 9601 + }, + { + "epoch": 0.0571058140641355, + "grad_norm": 1.9761322736740112, + "learning_rate": 4.959884237996365e-05, + "loss": 4.8682, + "step": 9602 + }, + { + "epoch": 0.0571117613474165, + "grad_norm": 1.9823036193847656, + "learning_rate": 4.959875903407823e-05, + "loss": 4.8752, + "step": 9603 + }, + { + "epoch": 0.0571177086306975, + "grad_norm": 1.9242253303527832, + "learning_rate": 4.959867567960564e-05, + "loss": 4.9314, + "step": 9604 + }, + { + "epoch": 0.05712365591397849, + "grad_norm": 1.740980625152588, + "learning_rate": 4.9598592316545904e-05, + "loss": 4.9843, + "step": 9605 + }, + { + "epoch": 0.057129603197259494, + "grad_norm": 2.0768508911132812, + "learning_rate": 4.959850894489906e-05, + "loss": 4.8528, + "step": 9606 + }, + { + "epoch": 0.05713555048054049, + "grad_norm": 1.7417833805084229, + "learning_rate": 4.959842556466513e-05, + "loss": 5.1374, + "step": 9607 + }, + { + "epoch": 0.057141497763821485, + "grad_norm": 1.933691382408142, + "learning_rate": 4.959834217584414e-05, + "loss": 5.349, + "step": 9608 + }, + { + "epoch": 0.05714744504710249, + "grad_norm": 1.8035194873809814, + "learning_rate": 4.959825877843612e-05, + "loss": 5.0212, + "step": 9609 + }, + { + "epoch": 0.05715339233038348, + "grad_norm": 2.323709487915039, + "learning_rate": 4.9598175372441106e-05, + "loss": 5.5346, + "step": 9610 + }, + { + "epoch": 0.05715933961366448, + "grad_norm": 1.755983591079712, + "learning_rate": 4.959809195785912e-05, + "loss": 4.8425, + "step": 9611 + }, + { + "epoch": 0.05716528689694547, + "grad_norm": 1.6614432334899902, + "learning_rate": 4.95980085346902e-05, + "loss": 4.912, + "step": 9612 + }, + { + "epoch": 0.057171234180226474, + "grad_norm": 1.8319662809371948, + "learning_rate": 4.959792510293436e-05, + "loss": 5.0125, + "step": 9613 + }, + { + "epoch": 0.05717718146350747, + "grad_norm": 1.8528090715408325, + "learning_rate": 4.959784166259165e-05, + "loss": 4.898, + "step": 9614 + }, + { + "epoch": 0.057183128746788464, + "grad_norm": 2.163757562637329, + "learning_rate": 4.959775821366208e-05, + "loss": 5.2041, + "step": 9615 + }, + { + "epoch": 0.057189076030069466, + "grad_norm": 1.939430832862854, + "learning_rate": 4.959767475614569e-05, + "loss": 5.3337, + "step": 9616 + }, + { + "epoch": 0.05719502331335046, + "grad_norm": 1.7198511362075806, + "learning_rate": 4.959759129004251e-05, + "loss": 5.2682, + "step": 9617 + }, + { + "epoch": 0.057200970596631456, + "grad_norm": 1.7674570083618164, + "learning_rate": 4.959750781535255e-05, + "loss": 5.4188, + "step": 9618 + }, + { + "epoch": 0.05720691787991246, + "grad_norm": 1.7197433710098267, + "learning_rate": 4.959742433207587e-05, + "loss": 5.1725, + "step": 9619 + }, + { + "epoch": 0.05721286516319345, + "grad_norm": 1.6682969331741333, + "learning_rate": 4.959734084021248e-05, + "loss": 5.1349, + "step": 9620 + }, + { + "epoch": 0.05721881244647445, + "grad_norm": 1.3784568309783936, + "learning_rate": 4.959725733976241e-05, + "loss": 5.2408, + "step": 9621 + }, + { + "epoch": 0.05722475972975545, + "grad_norm": 1.690483808517456, + "learning_rate": 4.9597173830725686e-05, + "loss": 5.2616, + "step": 9622 + }, + { + "epoch": 0.057230707013036446, + "grad_norm": 1.5313903093338013, + "learning_rate": 4.959709031310235e-05, + "loss": 5.1481, + "step": 9623 + }, + { + "epoch": 0.05723665429631744, + "grad_norm": 1.6266121864318848, + "learning_rate": 4.959700678689242e-05, + "loss": 5.0192, + "step": 9624 + }, + { + "epoch": 0.05724260157959844, + "grad_norm": 2.3125410079956055, + "learning_rate": 4.959692325209593e-05, + "loss": 4.5513, + "step": 9625 + }, + { + "epoch": 0.05724854886287944, + "grad_norm": 1.6884924173355103, + "learning_rate": 4.9596839708712913e-05, + "loss": 5.1917, + "step": 9626 + }, + { + "epoch": 0.05725449614616043, + "grad_norm": 1.5797723531723022, + "learning_rate": 4.9596756156743385e-05, + "loss": 5.5674, + "step": 9627 + }, + { + "epoch": 0.05726044342944143, + "grad_norm": 1.6152269840240479, + "learning_rate": 4.959667259618739e-05, + "loss": 5.4566, + "step": 9628 + }, + { + "epoch": 0.05726639071272243, + "grad_norm": 1.611608624458313, + "learning_rate": 4.959658902704495e-05, + "loss": 5.3678, + "step": 9629 + }, + { + "epoch": 0.057272337996003425, + "grad_norm": 1.774327278137207, + "learning_rate": 4.9596505449316086e-05, + "loss": 5.2438, + "step": 9630 + }, + { + "epoch": 0.05727828527928442, + "grad_norm": 1.7961443662643433, + "learning_rate": 4.9596421863000856e-05, + "loss": 5.3061, + "step": 9631 + }, + { + "epoch": 0.05728423256256542, + "grad_norm": 1.709675669670105, + "learning_rate": 4.959633826809925e-05, + "loss": 5.0095, + "step": 9632 + }, + { + "epoch": 0.05729017984584642, + "grad_norm": 1.7140734195709229, + "learning_rate": 4.959625466461132e-05, + "loss": 5.313, + "step": 9633 + }, + { + "epoch": 0.05729612712912741, + "grad_norm": 1.8302016258239746, + "learning_rate": 4.95961710525371e-05, + "loss": 5.4008, + "step": 9634 + }, + { + "epoch": 0.057302074412408414, + "grad_norm": 1.8570395708084106, + "learning_rate": 4.95960874318766e-05, + "loss": 5.513, + "step": 9635 + }, + { + "epoch": 0.05730802169568941, + "grad_norm": 1.6907027959823608, + "learning_rate": 4.959600380262987e-05, + "loss": 5.1933, + "step": 9636 + }, + { + "epoch": 0.057313968978970405, + "grad_norm": 1.6505299806594849, + "learning_rate": 4.9595920164796926e-05, + "loss": 5.1537, + "step": 9637 + }, + { + "epoch": 0.05731991626225141, + "grad_norm": 1.5248258113861084, + "learning_rate": 4.95958365183778e-05, + "loss": 5.4232, + "step": 9638 + }, + { + "epoch": 0.0573258635455324, + "grad_norm": 1.4630048274993896, + "learning_rate": 4.9595752863372524e-05, + "loss": 5.565, + "step": 9639 + }, + { + "epoch": 0.0573318108288134, + "grad_norm": 1.5858573913574219, + "learning_rate": 4.959566919978112e-05, + "loss": 5.4364, + "step": 9640 + }, + { + "epoch": 0.05733775811209439, + "grad_norm": 1.7803694009780884, + "learning_rate": 4.9595585527603625e-05, + "loss": 5.1727, + "step": 9641 + }, + { + "epoch": 0.057343705395375394, + "grad_norm": 1.639163851737976, + "learning_rate": 4.959550184684007e-05, + "loss": 5.5538, + "step": 9642 + }, + { + "epoch": 0.05734965267865639, + "grad_norm": 1.5917890071868896, + "learning_rate": 4.959541815749046e-05, + "loss": 5.6788, + "step": 9643 + }, + { + "epoch": 0.057355599961937384, + "grad_norm": 1.5524990558624268, + "learning_rate": 4.959533445955487e-05, + "loss": 5.7832, + "step": 9644 + }, + { + "epoch": 0.057361547245218386, + "grad_norm": 1.7229019403457642, + "learning_rate": 4.959525075303328e-05, + "loss": 5.4417, + "step": 9645 + }, + { + "epoch": 0.05736749452849938, + "grad_norm": 1.5434623956680298, + "learning_rate": 4.959516703792575e-05, + "loss": 5.3629, + "step": 9646 + }, + { + "epoch": 0.057373441811780376, + "grad_norm": 1.4929866790771484, + "learning_rate": 4.9595083314232306e-05, + "loss": 5.8586, + "step": 9647 + }, + { + "epoch": 0.05737938909506138, + "grad_norm": 1.209796667098999, + "learning_rate": 4.959499958195297e-05, + "loss": 5.5001, + "step": 9648 + }, + { + "epoch": 0.05738533637834237, + "grad_norm": 2.703871488571167, + "learning_rate": 4.9594915841087775e-05, + "loss": 5.6564, + "step": 9649 + }, + { + "epoch": 0.05739128366162337, + "grad_norm": 1.9408828020095825, + "learning_rate": 4.959483209163674e-05, + "loss": 5.6683, + "step": 9650 + }, + { + "epoch": 0.05739723094490437, + "grad_norm": 1.8055803775787354, + "learning_rate": 4.9594748333599914e-05, + "loss": 5.3046, + "step": 9651 + }, + { + "epoch": 0.057403178228185366, + "grad_norm": 2.3453104496002197, + "learning_rate": 4.959466456697731e-05, + "loss": 6.1944, + "step": 9652 + }, + { + "epoch": 0.05740912551146636, + "grad_norm": 2.3799800872802734, + "learning_rate": 4.959458079176897e-05, + "loss": 5.6706, + "step": 9653 + }, + { + "epoch": 0.05741507279474736, + "grad_norm": 2.111069440841675, + "learning_rate": 4.959449700797491e-05, + "loss": 5.1808, + "step": 9654 + }, + { + "epoch": 0.05742102007802836, + "grad_norm": 2.237873077392578, + "learning_rate": 4.9594413215595164e-05, + "loss": 5.0609, + "step": 9655 + }, + { + "epoch": 0.05742696736130935, + "grad_norm": 1.956520438194275, + "learning_rate": 4.959432941462977e-05, + "loss": 5.1431, + "step": 9656 + }, + { + "epoch": 0.05743291464459035, + "grad_norm": 2.3761603832244873, + "learning_rate": 4.9594245605078735e-05, + "loss": 4.8722, + "step": 9657 + }, + { + "epoch": 0.05743886192787135, + "grad_norm": 1.820745825767517, + "learning_rate": 4.959416178694212e-05, + "loss": 5.0149, + "step": 9658 + }, + { + "epoch": 0.057444809211152345, + "grad_norm": 2.0804755687713623, + "learning_rate": 4.9594077960219924e-05, + "loss": 5.7698, + "step": 9659 + }, + { + "epoch": 0.05745075649443334, + "grad_norm": 1.9319117069244385, + "learning_rate": 4.9593994124912196e-05, + "loss": 5.3054, + "step": 9660 + }, + { + "epoch": 0.05745670377771434, + "grad_norm": 2.386338472366333, + "learning_rate": 4.959391028101896e-05, + "loss": 5.2093, + "step": 9661 + }, + { + "epoch": 0.05746265106099534, + "grad_norm": 1.852386474609375, + "learning_rate": 4.9593826428540244e-05, + "loss": 5.1943, + "step": 9662 + }, + { + "epoch": 0.05746859834427633, + "grad_norm": 1.9619694948196411, + "learning_rate": 4.959374256747607e-05, + "loss": 4.8275, + "step": 9663 + }, + { + "epoch": 0.057474545627557334, + "grad_norm": 2.4797024726867676, + "learning_rate": 4.9593658697826485e-05, + "loss": 5.5257, + "step": 9664 + }, + { + "epoch": 0.05748049291083833, + "grad_norm": 2.1713874340057373, + "learning_rate": 4.959357481959149e-05, + "loss": 5.4486, + "step": 9665 + }, + { + "epoch": 0.057486440194119325, + "grad_norm": 1.9605398178100586, + "learning_rate": 4.9593490932771145e-05, + "loss": 5.1512, + "step": 9666 + }, + { + "epoch": 0.05749238747740033, + "grad_norm": 1.9853549003601074, + "learning_rate": 4.959340703736547e-05, + "loss": 5.665, + "step": 9667 + }, + { + "epoch": 0.05749833476068132, + "grad_norm": 1.984279990196228, + "learning_rate": 4.9593323133374494e-05, + "loss": 5.7797, + "step": 9668 + }, + { + "epoch": 0.05750428204396232, + "grad_norm": 1.8343236446380615, + "learning_rate": 4.9593239220798225e-05, + "loss": 5.0261, + "step": 9669 + }, + { + "epoch": 0.05751022932724331, + "grad_norm": 1.8675687313079834, + "learning_rate": 4.959315529963673e-05, + "loss": 4.8754, + "step": 9670 + }, + { + "epoch": 0.057516176610524314, + "grad_norm": 1.9129834175109863, + "learning_rate": 4.959307136989e-05, + "loss": 5.1056, + "step": 9671 + }, + { + "epoch": 0.05752212389380531, + "grad_norm": 3.142893075942993, + "learning_rate": 4.95929874315581e-05, + "loss": 5.6029, + "step": 9672 + }, + { + "epoch": 0.057528071177086304, + "grad_norm": 1.80843985080719, + "learning_rate": 4.9592903484641026e-05, + "loss": 5.57, + "step": 9673 + }, + { + "epoch": 0.057534018460367306, + "grad_norm": 1.9195841550827026, + "learning_rate": 4.9592819529138835e-05, + "loss": 5.6964, + "step": 9674 + }, + { + "epoch": 0.0575399657436483, + "grad_norm": 2.026477813720703, + "learning_rate": 4.959273556505154e-05, + "loss": 5.8544, + "step": 9675 + }, + { + "epoch": 0.057545913026929296, + "grad_norm": 2.111274003982544, + "learning_rate": 4.959265159237918e-05, + "loss": 5.8014, + "step": 9676 + }, + { + "epoch": 0.0575518603102103, + "grad_norm": 1.9789505004882812, + "learning_rate": 4.9592567611121776e-05, + "loss": 5.7646, + "step": 9677 + }, + { + "epoch": 0.05755780759349129, + "grad_norm": 1.8776015043258667, + "learning_rate": 4.9592483621279365e-05, + "loss": 6.1603, + "step": 9678 + }, + { + "epoch": 0.05756375487677229, + "grad_norm": 2.135849714279175, + "learning_rate": 4.9592399622851956e-05, + "loss": 5.6372, + "step": 9679 + }, + { + "epoch": 0.05756970216005329, + "grad_norm": 2.3335585594177246, + "learning_rate": 4.959231561583961e-05, + "loss": 5.5515, + "step": 9680 + }, + { + "epoch": 0.057575649443334286, + "grad_norm": 1.9315869808197021, + "learning_rate": 4.9592231600242337e-05, + "loss": 5.9287, + "step": 9681 + }, + { + "epoch": 0.05758159672661528, + "grad_norm": 2.4559311866760254, + "learning_rate": 4.959214757606017e-05, + "loss": 5.6079, + "step": 9682 + }, + { + "epoch": 0.05758754400989628, + "grad_norm": 2.6558609008789062, + "learning_rate": 4.959206354329314e-05, + "loss": 5.5728, + "step": 9683 + }, + { + "epoch": 0.05759349129317728, + "grad_norm": 2.2376396656036377, + "learning_rate": 4.9591979501941274e-05, + "loss": 5.5318, + "step": 9684 + }, + { + "epoch": 0.05759943857645827, + "grad_norm": 1.8506240844726562, + "learning_rate": 4.95918954520046e-05, + "loss": 5.7957, + "step": 9685 + }, + { + "epoch": 0.05760538585973927, + "grad_norm": 2.2428138256073, + "learning_rate": 4.9591811393483144e-05, + "loss": 5.7223, + "step": 9686 + }, + { + "epoch": 0.05761133314302027, + "grad_norm": 2.5734875202178955, + "learning_rate": 4.9591727326376955e-05, + "loss": 5.3401, + "step": 9687 + }, + { + "epoch": 0.057617280426301265, + "grad_norm": 2.567263126373291, + "learning_rate": 4.959164325068604e-05, + "loss": 5.4853, + "step": 9688 + }, + { + "epoch": 0.05762322770958226, + "grad_norm": 2.4430556297302246, + "learning_rate": 4.959155916641043e-05, + "loss": 5.9845, + "step": 9689 + }, + { + "epoch": 0.05762917499286326, + "grad_norm": 2.039846181869507, + "learning_rate": 4.959147507355017e-05, + "loss": 6.0689, + "step": 9690 + }, + { + "epoch": 0.05763512227614426, + "grad_norm": 2.207920551300049, + "learning_rate": 4.959139097210528e-05, + "loss": 5.6658, + "step": 9691 + }, + { + "epoch": 0.05764106955942525, + "grad_norm": 1.7421616315841675, + "learning_rate": 4.959130686207578e-05, + "loss": 6.0915, + "step": 9692 + }, + { + "epoch": 0.057647016842706254, + "grad_norm": 1.7738968133926392, + "learning_rate": 4.9591222743461716e-05, + "loss": 6.2092, + "step": 9693 + }, + { + "epoch": 0.05765296412598725, + "grad_norm": 1.8665943145751953, + "learning_rate": 4.959113861626311e-05, + "loss": 6.0922, + "step": 9694 + }, + { + "epoch": 0.057658911409268244, + "grad_norm": 2.0272347927093506, + "learning_rate": 4.959105448047999e-05, + "loss": 5.8291, + "step": 9695 + }, + { + "epoch": 0.057664858692549247, + "grad_norm": 2.8527796268463135, + "learning_rate": 4.9590970336112395e-05, + "loss": 5.428, + "step": 9696 + }, + { + "epoch": 0.05767080597583024, + "grad_norm": 1.8518950939178467, + "learning_rate": 4.959088618316033e-05, + "loss": 5.4199, + "step": 9697 + }, + { + "epoch": 0.05767675325911124, + "grad_norm": 2.38712739944458, + "learning_rate": 4.959080202162386e-05, + "loss": 5.1627, + "step": 9698 + }, + { + "epoch": 0.05768270054239223, + "grad_norm": 1.8407059907913208, + "learning_rate": 4.959071785150298e-05, + "loss": 5.1827, + "step": 9699 + }, + { + "epoch": 0.057688647825673234, + "grad_norm": 2.431151866912842, + "learning_rate": 4.9590633672797744e-05, + "loss": 6.1722, + "step": 9700 + }, + { + "epoch": 0.05769459510895423, + "grad_norm": 2.498046398162842, + "learning_rate": 4.9590549485508165e-05, + "loss": 6.2321, + "step": 9701 + }, + { + "epoch": 0.057700542392235224, + "grad_norm": 1.8793575763702393, + "learning_rate": 4.959046528963428e-05, + "loss": 5.4019, + "step": 9702 + }, + { + "epoch": 0.057706489675516226, + "grad_norm": 2.137622117996216, + "learning_rate": 4.9590381085176115e-05, + "loss": 5.9118, + "step": 9703 + }, + { + "epoch": 0.05771243695879722, + "grad_norm": 1.9514268636703491, + "learning_rate": 4.959029687213371e-05, + "loss": 5.6651, + "step": 9704 + }, + { + "epoch": 0.057718384242078216, + "grad_norm": 2.3678367137908936, + "learning_rate": 4.9590212650507085e-05, + "loss": 5.2054, + "step": 9705 + }, + { + "epoch": 0.05772433152535922, + "grad_norm": 2.8808276653289795, + "learning_rate": 4.9590128420296266e-05, + "loss": 5.3066, + "step": 9706 + }, + { + "epoch": 0.05773027880864021, + "grad_norm": 2.2405474185943604, + "learning_rate": 4.9590044181501297e-05, + "loss": 5.2904, + "step": 9707 + }, + { + "epoch": 0.05773622609192121, + "grad_norm": 2.3762283325195312, + "learning_rate": 4.958995993412219e-05, + "loss": 5.5847, + "step": 9708 + }, + { + "epoch": 0.05774217337520221, + "grad_norm": 2.5258681774139404, + "learning_rate": 4.958987567815898e-05, + "loss": 5.4852, + "step": 9709 + }, + { + "epoch": 0.057748120658483205, + "grad_norm": 2.31478214263916, + "learning_rate": 4.9589791413611704e-05, + "loss": 5.5658, + "step": 9710 + }, + { + "epoch": 0.0577540679417642, + "grad_norm": 1.735771894454956, + "learning_rate": 4.958970714048038e-05, + "loss": 6.0311, + "step": 9711 + }, + { + "epoch": 0.0577600152250452, + "grad_norm": 2.2843849658966064, + "learning_rate": 4.958962285876505e-05, + "loss": 5.9535, + "step": 9712 + }, + { + "epoch": 0.0577659625083262, + "grad_norm": 2.3449392318725586, + "learning_rate": 4.958953856846573e-05, + "loss": 5.9835, + "step": 9713 + }, + { + "epoch": 0.05777190979160719, + "grad_norm": 2.319952964782715, + "learning_rate": 4.9589454269582456e-05, + "loss": 5.5318, + "step": 9714 + }, + { + "epoch": 0.05777785707488819, + "grad_norm": 2.6801493167877197, + "learning_rate": 4.958936996211526e-05, + "loss": 4.8672, + "step": 9715 + }, + { + "epoch": 0.05778380435816919, + "grad_norm": 2.622528553009033, + "learning_rate": 4.958928564606418e-05, + "loss": 6.0755, + "step": 9716 + }, + { + "epoch": 0.057789751641450185, + "grad_norm": 1.973480224609375, + "learning_rate": 4.9589201321429216e-05, + "loss": 5.8197, + "step": 9717 + }, + { + "epoch": 0.05779569892473118, + "grad_norm": 2.060497760772705, + "learning_rate": 4.958911698821043e-05, + "loss": 5.2838, + "step": 9718 + }, + { + "epoch": 0.05780164620801218, + "grad_norm": 2.068103551864624, + "learning_rate": 4.958903264640783e-05, + "loss": 5.4917, + "step": 9719 + }, + { + "epoch": 0.05780759349129318, + "grad_norm": 2.5899293422698975, + "learning_rate": 4.958894829602145e-05, + "loss": 5.1312, + "step": 9720 + }, + { + "epoch": 0.05781354077457417, + "grad_norm": 3.2153897285461426, + "learning_rate": 4.958886393705132e-05, + "loss": 4.7502, + "step": 9721 + }, + { + "epoch": 0.057819488057855174, + "grad_norm": 2.805802345275879, + "learning_rate": 4.9588779569497484e-05, + "loss": 4.6876, + "step": 9722 + }, + { + "epoch": 0.05782543534113617, + "grad_norm": 2.3670101165771484, + "learning_rate": 4.958869519335995e-05, + "loss": 4.6025, + "step": 9723 + }, + { + "epoch": 0.057831382624417164, + "grad_norm": 1.992903709411621, + "learning_rate": 4.9588610808638755e-05, + "loss": 5.3602, + "step": 9724 + }, + { + "epoch": 0.057837329907698166, + "grad_norm": 2.249572277069092, + "learning_rate": 4.958852641533394e-05, + "loss": 4.9574, + "step": 9725 + }, + { + "epoch": 0.05784327719097916, + "grad_norm": 2.500433921813965, + "learning_rate": 4.958844201344552e-05, + "loss": 5.3656, + "step": 9726 + }, + { + "epoch": 0.05784922447426016, + "grad_norm": 2.0277605056762695, + "learning_rate": 4.9588357602973526e-05, + "loss": 5.6467, + "step": 9727 + }, + { + "epoch": 0.05785517175754116, + "grad_norm": 2.1196112632751465, + "learning_rate": 4.958827318391799e-05, + "loss": 5.6257, + "step": 9728 + }, + { + "epoch": 0.057861119040822154, + "grad_norm": 3.160593271255493, + "learning_rate": 4.9588188756278945e-05, + "loss": 4.9618, + "step": 9729 + }, + { + "epoch": 0.05786706632410315, + "grad_norm": 1.90407395362854, + "learning_rate": 4.958810432005642e-05, + "loss": 5.4551, + "step": 9730 + }, + { + "epoch": 0.057873013607384144, + "grad_norm": 2.0096004009246826, + "learning_rate": 4.958801987525043e-05, + "loss": 5.6562, + "step": 9731 + }, + { + "epoch": 0.057878960890665146, + "grad_norm": 2.617847442626953, + "learning_rate": 4.958793542186103e-05, + "loss": 5.747, + "step": 9732 + }, + { + "epoch": 0.05788490817394614, + "grad_norm": 2.3982057571411133, + "learning_rate": 4.9587850959888226e-05, + "loss": 5.6146, + "step": 9733 + }, + { + "epoch": 0.057890855457227136, + "grad_norm": 2.0222113132476807, + "learning_rate": 4.9587766489332065e-05, + "loss": 6.0204, + "step": 9734 + }, + { + "epoch": 0.05789680274050814, + "grad_norm": 2.1110177040100098, + "learning_rate": 4.958768201019257e-05, + "loss": 5.2957, + "step": 9735 + }, + { + "epoch": 0.05790275002378913, + "grad_norm": 1.8278865814208984, + "learning_rate": 4.958759752246977e-05, + "loss": 5.9902, + "step": 9736 + }, + { + "epoch": 0.05790869730707013, + "grad_norm": 2.2461514472961426, + "learning_rate": 4.958751302616368e-05, + "loss": 5.8572, + "step": 9737 + }, + { + "epoch": 0.05791464459035113, + "grad_norm": 1.7453250885009766, + "learning_rate": 4.958742852127435e-05, + "loss": 5.6658, + "step": 9738 + }, + { + "epoch": 0.057920591873632125, + "grad_norm": 2.480726718902588, + "learning_rate": 4.95873440078018e-05, + "loss": 5.4231, + "step": 9739 + }, + { + "epoch": 0.05792653915691312, + "grad_norm": 2.2310776710510254, + "learning_rate": 4.958725948574607e-05, + "loss": 5.4768, + "step": 9740 + }, + { + "epoch": 0.05793248644019412, + "grad_norm": 1.9454891681671143, + "learning_rate": 4.958717495510718e-05, + "loss": 5.4503, + "step": 9741 + }, + { + "epoch": 0.05793843372347512, + "grad_norm": 2.196054458618164, + "learning_rate": 4.958709041588516e-05, + "loss": 5.1987, + "step": 9742 + }, + { + "epoch": 0.05794438100675611, + "grad_norm": 2.385000228881836, + "learning_rate": 4.958700586808004e-05, + "loss": 5.8413, + "step": 9743 + }, + { + "epoch": 0.05795032829003711, + "grad_norm": 2.0967705249786377, + "learning_rate": 4.958692131169185e-05, + "loss": 5.8531, + "step": 9744 + }, + { + "epoch": 0.05795627557331811, + "grad_norm": 2.186253309249878, + "learning_rate": 4.958683674672062e-05, + "loss": 5.8241, + "step": 9745 + }, + { + "epoch": 0.057962222856599105, + "grad_norm": 1.8932995796203613, + "learning_rate": 4.958675217316638e-05, + "loss": 5.8724, + "step": 9746 + }, + { + "epoch": 0.0579681701398801, + "grad_norm": 1.9706943035125732, + "learning_rate": 4.958666759102916e-05, + "loss": 5.6565, + "step": 9747 + }, + { + "epoch": 0.0579741174231611, + "grad_norm": 1.7686703205108643, + "learning_rate": 4.958658300030898e-05, + "loss": 5.6299, + "step": 9748 + }, + { + "epoch": 0.0579800647064421, + "grad_norm": 2.309403419494629, + "learning_rate": 4.958649840100589e-05, + "loss": 4.6907, + "step": 9749 + }, + { + "epoch": 0.05798601198972309, + "grad_norm": 2.139760971069336, + "learning_rate": 4.95864137931199e-05, + "loss": 4.7311, + "step": 9750 + }, + { + "epoch": 0.057991959273004094, + "grad_norm": 1.960402011871338, + "learning_rate": 4.958632917665105e-05, + "loss": 5.598, + "step": 9751 + }, + { + "epoch": 0.05799790655628509, + "grad_norm": 1.721853256225586, + "learning_rate": 4.958624455159936e-05, + "loss": 6.0519, + "step": 9752 + }, + { + "epoch": 0.058003853839566084, + "grad_norm": 1.8527748584747314, + "learning_rate": 4.958615991796487e-05, + "loss": 5.3347, + "step": 9753 + }, + { + "epoch": 0.058009801122847086, + "grad_norm": 2.070084810256958, + "learning_rate": 4.958607527574761e-05, + "loss": 4.6653, + "step": 9754 + }, + { + "epoch": 0.05801574840612808, + "grad_norm": 2.143115997314453, + "learning_rate": 4.9585990624947605e-05, + "loss": 4.6522, + "step": 9755 + }, + { + "epoch": 0.05802169568940908, + "grad_norm": 2.2870991230010986, + "learning_rate": 4.9585905965564884e-05, + "loss": 4.7037, + "step": 9756 + }, + { + "epoch": 0.05802764297269008, + "grad_norm": 2.0633544921875, + "learning_rate": 4.958582129759947e-05, + "loss": 4.689, + "step": 9757 + }, + { + "epoch": 0.058033590255971074, + "grad_norm": 1.8845857381820679, + "learning_rate": 4.95857366210514e-05, + "loss": 4.8077, + "step": 9758 + }, + { + "epoch": 0.05803953753925207, + "grad_norm": 1.7319310903549194, + "learning_rate": 4.9585651935920715e-05, + "loss": 5.3528, + "step": 9759 + }, + { + "epoch": 0.058045484822533064, + "grad_norm": 2.2369909286499023, + "learning_rate": 4.958556724220742e-05, + "loss": 4.6549, + "step": 9760 + }, + { + "epoch": 0.058051432105814066, + "grad_norm": 2.076901912689209, + "learning_rate": 4.9585482539911566e-05, + "loss": 4.4642, + "step": 9761 + }, + { + "epoch": 0.05805737938909506, + "grad_norm": 2.0487091541290283, + "learning_rate": 4.958539782903318e-05, + "loss": 4.6575, + "step": 9762 + }, + { + "epoch": 0.058063326672376056, + "grad_norm": 2.2116169929504395, + "learning_rate": 4.9585313109572274e-05, + "loss": 4.4866, + "step": 9763 + }, + { + "epoch": 0.05806927395565706, + "grad_norm": 1.9818168878555298, + "learning_rate": 4.958522838152889e-05, + "loss": 4.7502, + "step": 9764 + }, + { + "epoch": 0.05807522123893805, + "grad_norm": 2.1484010219573975, + "learning_rate": 4.958514364490306e-05, + "loss": 5.7809, + "step": 9765 + }, + { + "epoch": 0.05808116852221905, + "grad_norm": 2.4087398052215576, + "learning_rate": 4.958505889969481e-05, + "loss": 5.5236, + "step": 9766 + }, + { + "epoch": 0.05808711580550005, + "grad_norm": 2.000459909439087, + "learning_rate": 4.9584974145904165e-05, + "loss": 4.7356, + "step": 9767 + }, + { + "epoch": 0.058093063088781045, + "grad_norm": 2.3958399295806885, + "learning_rate": 4.958488938353116e-05, + "loss": 4.3695, + "step": 9768 + }, + { + "epoch": 0.05809901037206204, + "grad_norm": 2.039053440093994, + "learning_rate": 4.958480461257584e-05, + "loss": 4.6128, + "step": 9769 + }, + { + "epoch": 0.05810495765534304, + "grad_norm": 1.7663822174072266, + "learning_rate": 4.95847198330382e-05, + "loss": 4.8533, + "step": 9770 + }, + { + "epoch": 0.05811090493862404, + "grad_norm": 2.594289779663086, + "learning_rate": 4.9584635044918295e-05, + "loss": 5.3048, + "step": 9771 + }, + { + "epoch": 0.05811685222190503, + "grad_norm": 2.712372303009033, + "learning_rate": 4.958455024821615e-05, + "loss": 5.4435, + "step": 9772 + }, + { + "epoch": 0.05812279950518603, + "grad_norm": 2.4295241832733154, + "learning_rate": 4.9584465442931794e-05, + "loss": 5.2665, + "step": 9773 + }, + { + "epoch": 0.05812874678846703, + "grad_norm": 2.5820906162261963, + "learning_rate": 4.9584380629065245e-05, + "loss": 5.6227, + "step": 9774 + }, + { + "epoch": 0.058134694071748025, + "grad_norm": 2.140291213989258, + "learning_rate": 4.958429580661655e-05, + "loss": 5.1792, + "step": 9775 + }, + { + "epoch": 0.05814064135502902, + "grad_norm": 2.111551523208618, + "learning_rate": 4.9584210975585734e-05, + "loss": 5.7262, + "step": 9776 + }, + { + "epoch": 0.05814658863831002, + "grad_norm": 2.5887086391448975, + "learning_rate": 4.958412613597282e-05, + "loss": 5.1613, + "step": 9777 + }, + { + "epoch": 0.05815253592159102, + "grad_norm": 1.9678863286972046, + "learning_rate": 4.9584041287777835e-05, + "loss": 5.7693, + "step": 9778 + }, + { + "epoch": 0.05815848320487201, + "grad_norm": 2.000265121459961, + "learning_rate": 4.958395643100083e-05, + "loss": 5.654, + "step": 9779 + }, + { + "epoch": 0.058164430488153014, + "grad_norm": 1.8926239013671875, + "learning_rate": 4.958387156564181e-05, + "loss": 5.3004, + "step": 9780 + }, + { + "epoch": 0.05817037777143401, + "grad_norm": 2.3557002544403076, + "learning_rate": 4.958378669170082e-05, + "loss": 5.5437, + "step": 9781 + }, + { + "epoch": 0.058176325054715004, + "grad_norm": 1.9434150457382202, + "learning_rate": 4.958370180917787e-05, + "loss": 5.8442, + "step": 9782 + }, + { + "epoch": 0.058182272337996006, + "grad_norm": 1.875900387763977, + "learning_rate": 4.9583616918073026e-05, + "loss": 5.9312, + "step": 9783 + }, + { + "epoch": 0.058188219621277, + "grad_norm": 1.8945306539535522, + "learning_rate": 4.958353201838628e-05, + "loss": 5.7166, + "step": 9784 + }, + { + "epoch": 0.058194166904557997, + "grad_norm": 1.7081416845321655, + "learning_rate": 4.9583447110117684e-05, + "loss": 6.0803, + "step": 9785 + }, + { + "epoch": 0.058200114187839, + "grad_norm": 1.6520098447799683, + "learning_rate": 4.958336219326725e-05, + "loss": 6.0181, + "step": 9786 + }, + { + "epoch": 0.058206061471119994, + "grad_norm": 1.90665602684021, + "learning_rate": 4.9583277267835024e-05, + "loss": 5.586, + "step": 9787 + }, + { + "epoch": 0.05821200875440099, + "grad_norm": 1.8179740905761719, + "learning_rate": 4.958319233382104e-05, + "loss": 5.8637, + "step": 9788 + }, + { + "epoch": 0.058217956037681984, + "grad_norm": 1.8228380680084229, + "learning_rate": 4.95831073912253e-05, + "loss": 5.7406, + "step": 9789 + }, + { + "epoch": 0.058223903320962986, + "grad_norm": 1.691999912261963, + "learning_rate": 4.958302244004786e-05, + "loss": 5.8021, + "step": 9790 + }, + { + "epoch": 0.05822985060424398, + "grad_norm": 1.8590795993804932, + "learning_rate": 4.958293748028875e-05, + "loss": 5.5897, + "step": 9791 + }, + { + "epoch": 0.058235797887524976, + "grad_norm": 1.5923960208892822, + "learning_rate": 4.958285251194797e-05, + "loss": 5.7424, + "step": 9792 + }, + { + "epoch": 0.05824174517080598, + "grad_norm": 1.6928486824035645, + "learning_rate": 4.958276753502559e-05, + "loss": 5.905, + "step": 9793 + }, + { + "epoch": 0.05824769245408697, + "grad_norm": 2.120725393295288, + "learning_rate": 4.958268254952161e-05, + "loss": 5.9974, + "step": 9794 + }, + { + "epoch": 0.05825363973736797, + "grad_norm": 1.850441813468933, + "learning_rate": 4.9582597555436075e-05, + "loss": 5.7171, + "step": 9795 + }, + { + "epoch": 0.05825958702064897, + "grad_norm": 2.196037530899048, + "learning_rate": 4.9582512552769e-05, + "loss": 6.1243, + "step": 9796 + }, + { + "epoch": 0.058265534303929965, + "grad_norm": 1.9170193672180176, + "learning_rate": 4.9582427541520423e-05, + "loss": 5.8087, + "step": 9797 + }, + { + "epoch": 0.05827148158721096, + "grad_norm": 1.974478006362915, + "learning_rate": 4.958234252169039e-05, + "loss": 5.794, + "step": 9798 + }, + { + "epoch": 0.05827742887049196, + "grad_norm": 1.824965476989746, + "learning_rate": 4.9582257493278904e-05, + "loss": 5.6904, + "step": 9799 + }, + { + "epoch": 0.05828337615377296, + "grad_norm": 1.828037142753601, + "learning_rate": 4.9582172456286e-05, + "loss": 5.6793, + "step": 9800 + }, + { + "epoch": 0.05828932343705395, + "grad_norm": 1.8949617147445679, + "learning_rate": 4.9582087410711726e-05, + "loss": 5.6685, + "step": 9801 + }, + { + "epoch": 0.05829527072033495, + "grad_norm": 1.8183050155639648, + "learning_rate": 4.958200235655609e-05, + "loss": 5.7754, + "step": 9802 + }, + { + "epoch": 0.05830121800361595, + "grad_norm": 1.6816062927246094, + "learning_rate": 4.9581917293819135e-05, + "loss": 5.6931, + "step": 9803 + }, + { + "epoch": 0.058307165286896945, + "grad_norm": 1.875659465789795, + "learning_rate": 4.958183222250089e-05, + "loss": 5.7568, + "step": 9804 + }, + { + "epoch": 0.05831311257017794, + "grad_norm": 2.162404775619507, + "learning_rate": 4.958174714260137e-05, + "loss": 5.7969, + "step": 9805 + }, + { + "epoch": 0.05831905985345894, + "grad_norm": 2.2122790813446045, + "learning_rate": 4.958166205412064e-05, + "loss": 5.7301, + "step": 9806 + }, + { + "epoch": 0.05832500713673994, + "grad_norm": 1.8822424411773682, + "learning_rate": 4.9581576957058686e-05, + "loss": 5.7034, + "step": 9807 + }, + { + "epoch": 0.05833095442002093, + "grad_norm": 1.8780319690704346, + "learning_rate": 4.958149185141556e-05, + "loss": 5.6573, + "step": 9808 + }, + { + "epoch": 0.058336901703301934, + "grad_norm": 1.9177708625793457, + "learning_rate": 4.958140673719129e-05, + "loss": 5.6619, + "step": 9809 + }, + { + "epoch": 0.05834284898658293, + "grad_norm": 1.8662844896316528, + "learning_rate": 4.95813216143859e-05, + "loss": 5.5857, + "step": 9810 + }, + { + "epoch": 0.058348796269863924, + "grad_norm": 2.1798834800720215, + "learning_rate": 4.958123648299944e-05, + "loss": 5.5811, + "step": 9811 + }, + { + "epoch": 0.058354743553144926, + "grad_norm": 2.1575138568878174, + "learning_rate": 4.958115134303191e-05, + "loss": 5.6761, + "step": 9812 + }, + { + "epoch": 0.05836069083642592, + "grad_norm": 2.055314302444458, + "learning_rate": 4.958106619448336e-05, + "loss": 5.721, + "step": 9813 + }, + { + "epoch": 0.058366638119706916, + "grad_norm": 1.8962149620056152, + "learning_rate": 4.958098103735381e-05, + "loss": 5.6132, + "step": 9814 + }, + { + "epoch": 0.05837258540298792, + "grad_norm": 1.7715760469436646, + "learning_rate": 4.95808958716433e-05, + "loss": 5.6461, + "step": 9815 + }, + { + "epoch": 0.058378532686268914, + "grad_norm": 1.9166070222854614, + "learning_rate": 4.958081069735184e-05, + "loss": 5.5628, + "step": 9816 + }, + { + "epoch": 0.05838447996954991, + "grad_norm": 1.8872902393341064, + "learning_rate": 4.9580725514479484e-05, + "loss": 5.6476, + "step": 9817 + }, + { + "epoch": 0.058390427252830904, + "grad_norm": 1.8257521390914917, + "learning_rate": 4.9580640323026254e-05, + "loss": 5.6175, + "step": 9818 + }, + { + "epoch": 0.058396374536111906, + "grad_norm": 1.919291377067566, + "learning_rate": 4.958055512299217e-05, + "loss": 5.5954, + "step": 9819 + }, + { + "epoch": 0.0584023218193929, + "grad_norm": 1.8318076133728027, + "learning_rate": 4.958046991437726e-05, + "loss": 5.6255, + "step": 9820 + }, + { + "epoch": 0.058408269102673896, + "grad_norm": 1.9153858423233032, + "learning_rate": 4.958038469718158e-05, + "loss": 5.6787, + "step": 9821 + }, + { + "epoch": 0.0584142163859549, + "grad_norm": 1.967021107673645, + "learning_rate": 4.958029947140513e-05, + "loss": 5.6714, + "step": 9822 + }, + { + "epoch": 0.05842016366923589, + "grad_norm": 1.654997706413269, + "learning_rate": 4.958021423704795e-05, + "loss": 5.4809, + "step": 9823 + }, + { + "epoch": 0.05842611095251689, + "grad_norm": 1.8183335065841675, + "learning_rate": 4.9580128994110074e-05, + "loss": 5.5223, + "step": 9824 + }, + { + "epoch": 0.05843205823579789, + "grad_norm": 1.7665660381317139, + "learning_rate": 4.958004374259153e-05, + "loss": 5.5639, + "step": 9825 + }, + { + "epoch": 0.058438005519078885, + "grad_norm": 1.8233551979064941, + "learning_rate": 4.957995848249235e-05, + "loss": 5.6358, + "step": 9826 + }, + { + "epoch": 0.05844395280235988, + "grad_norm": 1.721301555633545, + "learning_rate": 4.957987321381256e-05, + "loss": 5.4989, + "step": 9827 + }, + { + "epoch": 0.05844990008564088, + "grad_norm": 1.6921659708023071, + "learning_rate": 4.957978793655218e-05, + "loss": 5.448, + "step": 9828 + }, + { + "epoch": 0.05845584736892188, + "grad_norm": 1.810354232788086, + "learning_rate": 4.957970265071126e-05, + "loss": 5.4501, + "step": 9829 + }, + { + "epoch": 0.05846179465220287, + "grad_norm": 1.7205116748809814, + "learning_rate": 4.957961735628982e-05, + "loss": 5.5222, + "step": 9830 + }, + { + "epoch": 0.05846774193548387, + "grad_norm": 1.9636965990066528, + "learning_rate": 4.957953205328788e-05, + "loss": 5.5894, + "step": 9831 + }, + { + "epoch": 0.05847368921876487, + "grad_norm": 1.9312820434570312, + "learning_rate": 4.9579446741705485e-05, + "loss": 5.6543, + "step": 9832 + }, + { + "epoch": 0.058479636502045865, + "grad_norm": 1.870448112487793, + "learning_rate": 4.9579361421542665e-05, + "loss": 5.6707, + "step": 9833 + }, + { + "epoch": 0.05848558378532686, + "grad_norm": 1.5943735837936401, + "learning_rate": 4.9579276092799435e-05, + "loss": 5.5184, + "step": 9834 + }, + { + "epoch": 0.05849153106860786, + "grad_norm": 1.6929852962493896, + "learning_rate": 4.957919075547584e-05, + "loss": 5.5188, + "step": 9835 + }, + { + "epoch": 0.05849747835188886, + "grad_norm": 2.0268075466156006, + "learning_rate": 4.95791054095719e-05, + "loss": 5.4909, + "step": 9836 + }, + { + "epoch": 0.05850342563516985, + "grad_norm": 2.047982931137085, + "learning_rate": 4.957902005508765e-05, + "loss": 5.6459, + "step": 9837 + }, + { + "epoch": 0.058509372918450854, + "grad_norm": 1.7938467264175415, + "learning_rate": 4.957893469202311e-05, + "loss": 5.4805, + "step": 9838 + }, + { + "epoch": 0.05851532020173185, + "grad_norm": 1.803093433380127, + "learning_rate": 4.957884932037833e-05, + "loss": 5.4092, + "step": 9839 + }, + { + "epoch": 0.058521267485012844, + "grad_norm": 1.8001232147216797, + "learning_rate": 4.957876394015333e-05, + "loss": 5.9168, + "step": 9840 + }, + { + "epoch": 0.058527214768293846, + "grad_norm": 1.9442622661590576, + "learning_rate": 4.9578678551348125e-05, + "loss": 6.0317, + "step": 9841 + }, + { + "epoch": 0.05853316205157484, + "grad_norm": 2.013845205307007, + "learning_rate": 4.957859315396276e-05, + "loss": 5.6855, + "step": 9842 + }, + { + "epoch": 0.058539109334855836, + "grad_norm": 2.7557523250579834, + "learning_rate": 4.9578507747997264e-05, + "loss": 5.3782, + "step": 9843 + }, + { + "epoch": 0.05854505661813684, + "grad_norm": 1.9822032451629639, + "learning_rate": 4.957842233345167e-05, + "loss": 6.22, + "step": 9844 + }, + { + "epoch": 0.058551003901417834, + "grad_norm": 1.7408699989318848, + "learning_rate": 4.9578336910326e-05, + "loss": 5.2347, + "step": 9845 + }, + { + "epoch": 0.05855695118469883, + "grad_norm": 3.2186660766601562, + "learning_rate": 4.957825147862028e-05, + "loss": 5.3282, + "step": 9846 + }, + { + "epoch": 0.058562898467979824, + "grad_norm": 3.3589892387390137, + "learning_rate": 4.957816603833455e-05, + "loss": 5.5689, + "step": 9847 + }, + { + "epoch": 0.058568845751260826, + "grad_norm": 3.4228861331939697, + "learning_rate": 4.957808058946883e-05, + "loss": 5.5797, + "step": 9848 + }, + { + "epoch": 0.05857479303454182, + "grad_norm": 2.420506238937378, + "learning_rate": 4.957799513202317e-05, + "loss": 5.735, + "step": 9849 + }, + { + "epoch": 0.058580740317822816, + "grad_norm": 1.8269212245941162, + "learning_rate": 4.957790966599758e-05, + "loss": 5.7571, + "step": 9850 + }, + { + "epoch": 0.05858668760110382, + "grad_norm": 2.011110305786133, + "learning_rate": 4.957782419139209e-05, + "loss": 5.9786, + "step": 9851 + }, + { + "epoch": 0.05859263488438481, + "grad_norm": 2.3139355182647705, + "learning_rate": 4.957773870820674e-05, + "loss": 5.8356, + "step": 9852 + }, + { + "epoch": 0.05859858216766581, + "grad_norm": 2.3406572341918945, + "learning_rate": 4.957765321644155e-05, + "loss": 5.8426, + "step": 9853 + }, + { + "epoch": 0.05860452945094681, + "grad_norm": 2.1194591522216797, + "learning_rate": 4.957756771609657e-05, + "loss": 5.6152, + "step": 9854 + }, + { + "epoch": 0.058610476734227805, + "grad_norm": 1.9966599941253662, + "learning_rate": 4.95774822071718e-05, + "loss": 5.8189, + "step": 9855 + }, + { + "epoch": 0.0586164240175088, + "grad_norm": 1.8953092098236084, + "learning_rate": 4.95773966896673e-05, + "loss": 5.8185, + "step": 9856 + }, + { + "epoch": 0.0586223713007898, + "grad_norm": 1.9035093784332275, + "learning_rate": 4.957731116358307e-05, + "loss": 5.6554, + "step": 9857 + }, + { + "epoch": 0.0586283185840708, + "grad_norm": 3.507546901702881, + "learning_rate": 4.9577225628919157e-05, + "loss": 5.8906, + "step": 9858 + }, + { + "epoch": 0.05863426586735179, + "grad_norm": 2.1840403079986572, + "learning_rate": 4.9577140085675586e-05, + "loss": 5.6084, + "step": 9859 + }, + { + "epoch": 0.05864021315063279, + "grad_norm": 2.008424758911133, + "learning_rate": 4.95770545338524e-05, + "loss": 5.8435, + "step": 9860 + }, + { + "epoch": 0.05864616043391379, + "grad_norm": 1.9004656076431274, + "learning_rate": 4.957696897344961e-05, + "loss": 5.5906, + "step": 9861 + }, + { + "epoch": 0.058652107717194785, + "grad_norm": 1.8043147325515747, + "learning_rate": 4.9576883404467255e-05, + "loss": 5.6057, + "step": 9862 + }, + { + "epoch": 0.05865805500047578, + "grad_norm": 1.6765285730361938, + "learning_rate": 4.957679782690537e-05, + "loss": 5.7246, + "step": 9863 + }, + { + "epoch": 0.05866400228375678, + "grad_norm": 2.0207018852233887, + "learning_rate": 4.9576712240763974e-05, + "loss": 5.8459, + "step": 9864 + }, + { + "epoch": 0.05866994956703778, + "grad_norm": 1.975874423980713, + "learning_rate": 4.95766266460431e-05, + "loss": 5.7313, + "step": 9865 + }, + { + "epoch": 0.05867589685031877, + "grad_norm": 2.085277557373047, + "learning_rate": 4.957654104274279e-05, + "loss": 5.1359, + "step": 9866 + }, + { + "epoch": 0.058681844133599774, + "grad_norm": 2.039437770843506, + "learning_rate": 4.957645543086305e-05, + "loss": 5.5673, + "step": 9867 + }, + { + "epoch": 0.05868779141688077, + "grad_norm": 2.0692098140716553, + "learning_rate": 4.9576369810403926e-05, + "loss": 5.6326, + "step": 9868 + }, + { + "epoch": 0.058693738700161764, + "grad_norm": 2.3873767852783203, + "learning_rate": 4.957628418136545e-05, + "loss": 5.5133, + "step": 9869 + }, + { + "epoch": 0.058699685983442766, + "grad_norm": 2.9347658157348633, + "learning_rate": 4.957619854374764e-05, + "loss": 5.5444, + "step": 9870 + }, + { + "epoch": 0.05870563326672376, + "grad_norm": 2.955348014831543, + "learning_rate": 4.957611289755054e-05, + "loss": 5.4883, + "step": 9871 + }, + { + "epoch": 0.058711580550004756, + "grad_norm": 2.147033214569092, + "learning_rate": 4.957602724277417e-05, + "loss": 5.4554, + "step": 9872 + }, + { + "epoch": 0.05871752783328576, + "grad_norm": 2.1422510147094727, + "learning_rate": 4.957594157941856e-05, + "loss": 5.56, + "step": 9873 + }, + { + "epoch": 0.05872347511656675, + "grad_norm": 2.018935203552246, + "learning_rate": 4.957585590748375e-05, + "loss": 5.5176, + "step": 9874 + }, + { + "epoch": 0.05872942239984775, + "grad_norm": 3.0146446228027344, + "learning_rate": 4.957577022696976e-05, + "loss": 5.2623, + "step": 9875 + }, + { + "epoch": 0.058735369683128744, + "grad_norm": 2.923011064529419, + "learning_rate": 4.957568453787662e-05, + "loss": 5.1828, + "step": 9876 + }, + { + "epoch": 0.058741316966409746, + "grad_norm": 2.7203526496887207, + "learning_rate": 4.9575598840204366e-05, + "loss": 5.1565, + "step": 9877 + }, + { + "epoch": 0.05874726424969074, + "grad_norm": 2.056260108947754, + "learning_rate": 4.9575513133953025e-05, + "loss": 5.1345, + "step": 9878 + }, + { + "epoch": 0.058753211532971736, + "grad_norm": 2.3120932579040527, + "learning_rate": 4.9575427419122616e-05, + "loss": 5.1792, + "step": 9879 + }, + { + "epoch": 0.05875915881625274, + "grad_norm": 2.1298701763153076, + "learning_rate": 4.9575341695713186e-05, + "loss": 5.1447, + "step": 9880 + }, + { + "epoch": 0.05876510609953373, + "grad_norm": 2.393869638442993, + "learning_rate": 4.9575255963724756e-05, + "loss": 5.2938, + "step": 9881 + }, + { + "epoch": 0.05877105338281473, + "grad_norm": 2.324061155319214, + "learning_rate": 4.9575170223157366e-05, + "loss": 5.1488, + "step": 9882 + }, + { + "epoch": 0.05877700066609573, + "grad_norm": 2.1416141986846924, + "learning_rate": 4.957508447401103e-05, + "loss": 5.0551, + "step": 9883 + }, + { + "epoch": 0.058782947949376725, + "grad_norm": 2.127350091934204, + "learning_rate": 4.9574998716285795e-05, + "loss": 5.03, + "step": 9884 + }, + { + "epoch": 0.05878889523265772, + "grad_norm": 2.317267417907715, + "learning_rate": 4.957491294998167e-05, + "loss": 5.049, + "step": 9885 + }, + { + "epoch": 0.05879484251593872, + "grad_norm": 2.3667004108428955, + "learning_rate": 4.9574827175098704e-05, + "loss": 5.009, + "step": 9886 + }, + { + "epoch": 0.05880078979921972, + "grad_norm": 2.4034934043884277, + "learning_rate": 4.9574741391636915e-05, + "loss": 4.9419, + "step": 9887 + }, + { + "epoch": 0.05880673708250071, + "grad_norm": 2.3792901039123535, + "learning_rate": 4.957465559959634e-05, + "loss": 4.8517, + "step": 9888 + }, + { + "epoch": 0.05881268436578171, + "grad_norm": 2.139249086380005, + "learning_rate": 4.957456979897701e-05, + "loss": 5.0767, + "step": 9889 + }, + { + "epoch": 0.05881863164906271, + "grad_norm": 2.5370614528656006, + "learning_rate": 4.957448398977894e-05, + "loss": 5.0243, + "step": 9890 + }, + { + "epoch": 0.058824578932343705, + "grad_norm": 2.0474746227264404, + "learning_rate": 4.957439817200218e-05, + "loss": 4.988, + "step": 9891 + }, + { + "epoch": 0.0588305262156247, + "grad_norm": 2.1323394775390625, + "learning_rate": 4.957431234564675e-05, + "loss": 5.7499, + "step": 9892 + }, + { + "epoch": 0.0588364734989057, + "grad_norm": 2.135988473892212, + "learning_rate": 4.957422651071269e-05, + "loss": 6.0197, + "step": 9893 + }, + { + "epoch": 0.0588424207821867, + "grad_norm": 2.4457356929779053, + "learning_rate": 4.957414066720001e-05, + "loss": 5.4461, + "step": 9894 + }, + { + "epoch": 0.05884836806546769, + "grad_norm": 2.3973019123077393, + "learning_rate": 4.957405481510876e-05, + "loss": 5.0372, + "step": 9895 + }, + { + "epoch": 0.058854315348748694, + "grad_norm": 2.5532052516937256, + "learning_rate": 4.957396895443896e-05, + "loss": 5.1462, + "step": 9896 + }, + { + "epoch": 0.05886026263202969, + "grad_norm": 2.3662166595458984, + "learning_rate": 4.9573883085190633e-05, + "loss": 5.1894, + "step": 9897 + }, + { + "epoch": 0.058866209915310684, + "grad_norm": 2.153883695602417, + "learning_rate": 4.9573797207363825e-05, + "loss": 5.6859, + "step": 9898 + }, + { + "epoch": 0.058872157198591686, + "grad_norm": 1.9541380405426025, + "learning_rate": 4.957371132095856e-05, + "loss": 5.5487, + "step": 9899 + }, + { + "epoch": 0.05887810448187268, + "grad_norm": 1.7920335531234741, + "learning_rate": 4.957362542597486e-05, + "loss": 5.4021, + "step": 9900 + }, + { + "epoch": 0.058884051765153676, + "grad_norm": 2.351090431213379, + "learning_rate": 4.9573539522412756e-05, + "loss": 4.9377, + "step": 9901 + }, + { + "epoch": 0.05888999904843468, + "grad_norm": 2.4780900478363037, + "learning_rate": 4.95734536102723e-05, + "loss": 5.04, + "step": 9902 + }, + { + "epoch": 0.05889594633171567, + "grad_norm": 1.7211192846298218, + "learning_rate": 4.957336768955349e-05, + "loss": 5.2959, + "step": 9903 + }, + { + "epoch": 0.05890189361499667, + "grad_norm": 1.9051212072372437, + "learning_rate": 4.957328176025638e-05, + "loss": 5.5587, + "step": 9904 + }, + { + "epoch": 0.058907840898277664, + "grad_norm": 2.009725332260132, + "learning_rate": 4.957319582238099e-05, + "loss": 5.5366, + "step": 9905 + }, + { + "epoch": 0.058913788181558666, + "grad_norm": 1.835423231124878, + "learning_rate": 4.957310987592735e-05, + "loss": 5.2522, + "step": 9906 + }, + { + "epoch": 0.05891973546483966, + "grad_norm": 1.6150819063186646, + "learning_rate": 4.957302392089549e-05, + "loss": 5.3935, + "step": 9907 + }, + { + "epoch": 0.058925682748120656, + "grad_norm": 1.825942873954773, + "learning_rate": 4.9572937957285435e-05, + "loss": 5.5435, + "step": 9908 + }, + { + "epoch": 0.05893163003140166, + "grad_norm": 1.5434985160827637, + "learning_rate": 4.957285198509724e-05, + "loss": 5.2508, + "step": 9909 + }, + { + "epoch": 0.05893757731468265, + "grad_norm": 1.7675530910491943, + "learning_rate": 4.9572766004330894e-05, + "loss": 5.2811, + "step": 9910 + }, + { + "epoch": 0.05894352459796365, + "grad_norm": 1.5196996927261353, + "learning_rate": 4.957268001498646e-05, + "loss": 5.1829, + "step": 9911 + }, + { + "epoch": 0.05894947188124465, + "grad_norm": 1.5598126649856567, + "learning_rate": 4.9572594017063964e-05, + "loss": 5.2067, + "step": 9912 + }, + { + "epoch": 0.058955419164525645, + "grad_norm": 1.6600217819213867, + "learning_rate": 4.957250801056342e-05, + "loss": 5.1591, + "step": 9913 + }, + { + "epoch": 0.05896136644780664, + "grad_norm": 2.040682315826416, + "learning_rate": 4.957242199548487e-05, + "loss": 4.8792, + "step": 9914 + }, + { + "epoch": 0.05896731373108764, + "grad_norm": 2.0122241973876953, + "learning_rate": 4.9572335971828346e-05, + "loss": 5.9489, + "step": 9915 + }, + { + "epoch": 0.05897326101436864, + "grad_norm": 2.4522452354431152, + "learning_rate": 4.957224993959386e-05, + "loss": 5.943, + "step": 9916 + }, + { + "epoch": 0.05897920829764963, + "grad_norm": 1.9101065397262573, + "learning_rate": 4.957216389878147e-05, + "loss": 5.858, + "step": 9917 + }, + { + "epoch": 0.05898515558093063, + "grad_norm": 1.6488839387893677, + "learning_rate": 4.957207784939118e-05, + "loss": 5.4935, + "step": 9918 + }, + { + "epoch": 0.05899110286421163, + "grad_norm": 1.7620775699615479, + "learning_rate": 4.957199179142303e-05, + "loss": 5.6067, + "step": 9919 + }, + { + "epoch": 0.058997050147492625, + "grad_norm": 2.6018314361572266, + "learning_rate": 4.957190572487707e-05, + "loss": 5.5249, + "step": 9920 + }, + { + "epoch": 0.05900299743077362, + "grad_norm": 1.810274600982666, + "learning_rate": 4.957181964975329e-05, + "loss": 5.4063, + "step": 9921 + }, + { + "epoch": 0.05900894471405462, + "grad_norm": 1.7467454671859741, + "learning_rate": 4.957173356605176e-05, + "loss": 5.4476, + "step": 9922 + }, + { + "epoch": 0.05901489199733562, + "grad_norm": 1.9074509143829346, + "learning_rate": 4.9571647473772483e-05, + "loss": 5.8014, + "step": 9923 + }, + { + "epoch": 0.05902083928061661, + "grad_norm": 1.6376137733459473, + "learning_rate": 4.9571561372915496e-05, + "loss": 5.6813, + "step": 9924 + }, + { + "epoch": 0.059026786563897614, + "grad_norm": 1.9984129667282104, + "learning_rate": 4.957147526348083e-05, + "loss": 5.9534, + "step": 9925 + }, + { + "epoch": 0.05903273384717861, + "grad_norm": 2.38493013381958, + "learning_rate": 4.957138914546852e-05, + "loss": 5.6903, + "step": 9926 + }, + { + "epoch": 0.059038681130459604, + "grad_norm": 1.86250901222229, + "learning_rate": 4.957130301887859e-05, + "loss": 5.1777, + "step": 9927 + }, + { + "epoch": 0.059044628413740606, + "grad_norm": 1.6241644620895386, + "learning_rate": 4.957121688371107e-05, + "loss": 5.1693, + "step": 9928 + }, + { + "epoch": 0.0590505756970216, + "grad_norm": 1.5627753734588623, + "learning_rate": 4.9571130739965996e-05, + "loss": 5.0313, + "step": 9929 + }, + { + "epoch": 0.059056522980302596, + "grad_norm": 1.6763062477111816, + "learning_rate": 4.957104458764339e-05, + "loss": 4.9973, + "step": 9930 + }, + { + "epoch": 0.0590624702635836, + "grad_norm": 1.6215085983276367, + "learning_rate": 4.957095842674329e-05, + "loss": 5.2216, + "step": 9931 + }, + { + "epoch": 0.05906841754686459, + "grad_norm": 1.5599844455718994, + "learning_rate": 4.957087225726572e-05, + "loss": 5.4525, + "step": 9932 + }, + { + "epoch": 0.05907436483014559, + "grad_norm": 1.3916441202163696, + "learning_rate": 4.957078607921072e-05, + "loss": 5.4434, + "step": 9933 + }, + { + "epoch": 0.059080312113426584, + "grad_norm": 1.524478554725647, + "learning_rate": 4.9570699892578295e-05, + "loss": 5.3979, + "step": 9934 + }, + { + "epoch": 0.059086259396707586, + "grad_norm": 1.264108657836914, + "learning_rate": 4.9570613697368505e-05, + "loss": 5.2892, + "step": 9935 + }, + { + "epoch": 0.05909220667998858, + "grad_norm": 1.7481588125228882, + "learning_rate": 4.957052749358137e-05, + "loss": 4.8539, + "step": 9936 + }, + { + "epoch": 0.059098153963269576, + "grad_norm": 1.675515055656433, + "learning_rate": 4.957044128121692e-05, + "loss": 5.4645, + "step": 9937 + }, + { + "epoch": 0.05910410124655058, + "grad_norm": 1.6560577154159546, + "learning_rate": 4.957035506027517e-05, + "loss": 4.9354, + "step": 9938 + }, + { + "epoch": 0.05911004852983157, + "grad_norm": 1.5030722618103027, + "learning_rate": 4.9570268830756174e-05, + "loss": 5.206, + "step": 9939 + }, + { + "epoch": 0.05911599581311257, + "grad_norm": 1.65435791015625, + "learning_rate": 4.957018259265994e-05, + "loss": 5.2132, + "step": 9940 + }, + { + "epoch": 0.05912194309639357, + "grad_norm": 1.6701000928878784, + "learning_rate": 4.9570096345986515e-05, + "loss": 5.2313, + "step": 9941 + }, + { + "epoch": 0.059127890379674565, + "grad_norm": 1.412954330444336, + "learning_rate": 4.957001009073593e-05, + "loss": 5.2511, + "step": 9942 + }, + { + "epoch": 0.05913383766295556, + "grad_norm": 1.4719784259796143, + "learning_rate": 4.95699238269082e-05, + "loss": 5.3646, + "step": 9943 + }, + { + "epoch": 0.05913978494623656, + "grad_norm": 1.6969150304794312, + "learning_rate": 4.9569837554503365e-05, + "loss": 5.3001, + "step": 9944 + }, + { + "epoch": 0.05914573222951756, + "grad_norm": 1.8579715490341187, + "learning_rate": 4.9569751273521454e-05, + "loss": 5.0944, + "step": 9945 + }, + { + "epoch": 0.05915167951279855, + "grad_norm": 1.6907633543014526, + "learning_rate": 4.956966498396249e-05, + "loss": 5.1447, + "step": 9946 + }, + { + "epoch": 0.059157626796079554, + "grad_norm": 1.7581912279129028, + "learning_rate": 4.9569578685826525e-05, + "loss": 5.2065, + "step": 9947 + }, + { + "epoch": 0.05916357407936055, + "grad_norm": 1.4447051286697388, + "learning_rate": 4.9569492379113555e-05, + "loss": 5.081, + "step": 9948 + }, + { + "epoch": 0.059169521362641544, + "grad_norm": 1.731697916984558, + "learning_rate": 4.9569406063823644e-05, + "loss": 5.241, + "step": 9949 + }, + { + "epoch": 0.05917546864592254, + "grad_norm": 1.6483672857284546, + "learning_rate": 4.956931973995681e-05, + "loss": 5.306, + "step": 9950 + }, + { + "epoch": 0.05918141592920354, + "grad_norm": 2.2123141288757324, + "learning_rate": 4.956923340751306e-05, + "loss": 5.6134, + "step": 9951 + }, + { + "epoch": 0.05918736321248454, + "grad_norm": 1.8569937944412231, + "learning_rate": 4.956914706649246e-05, + "loss": 5.4819, + "step": 9952 + }, + { + "epoch": 0.05919331049576553, + "grad_norm": 1.8417435884475708, + "learning_rate": 4.956906071689502e-05, + "loss": 5.4116, + "step": 9953 + }, + { + "epoch": 0.059199257779046534, + "grad_norm": 1.7050427198410034, + "learning_rate": 4.956897435872078e-05, + "loss": 5.238, + "step": 9954 + }, + { + "epoch": 0.05920520506232753, + "grad_norm": 1.6636401414871216, + "learning_rate": 4.956888799196976e-05, + "loss": 5.0962, + "step": 9955 + }, + { + "epoch": 0.059211152345608524, + "grad_norm": 1.9194599390029907, + "learning_rate": 4.9568801616642e-05, + "loss": 5.2078, + "step": 9956 + }, + { + "epoch": 0.059217099628889526, + "grad_norm": 1.6154237985610962, + "learning_rate": 4.956871523273752e-05, + "loss": 5.3562, + "step": 9957 + }, + { + "epoch": 0.05922304691217052, + "grad_norm": 1.4500404596328735, + "learning_rate": 4.956862884025636e-05, + "loss": 5.2061, + "step": 9958 + }, + { + "epoch": 0.059228994195451516, + "grad_norm": 1.6681636571884155, + "learning_rate": 4.956854243919854e-05, + "loss": 5.3455, + "step": 9959 + }, + { + "epoch": 0.05923494147873252, + "grad_norm": 1.7175511121749878, + "learning_rate": 4.9568456029564104e-05, + "loss": 5.2967, + "step": 9960 + }, + { + "epoch": 0.05924088876201351, + "grad_norm": 1.5013905763626099, + "learning_rate": 4.956836961135306e-05, + "loss": 4.9836, + "step": 9961 + }, + { + "epoch": 0.05924683604529451, + "grad_norm": 1.6521363258361816, + "learning_rate": 4.956828318456546e-05, + "loss": 5.0295, + "step": 9962 + }, + { + "epoch": 0.0592527833285755, + "grad_norm": 1.5945814847946167, + "learning_rate": 4.9568196749201326e-05, + "loss": 4.9511, + "step": 9963 + }, + { + "epoch": 0.059258730611856505, + "grad_norm": 1.508301854133606, + "learning_rate": 4.95681103052607e-05, + "loss": 4.9469, + "step": 9964 + }, + { + "epoch": 0.0592646778951375, + "grad_norm": 1.5902310609817505, + "learning_rate": 4.956802385274358e-05, + "loss": 4.9761, + "step": 9965 + }, + { + "epoch": 0.059270625178418496, + "grad_norm": 1.739424467086792, + "learning_rate": 4.956793739165003e-05, + "loss": 5.2443, + "step": 9966 + }, + { + "epoch": 0.0592765724616995, + "grad_norm": 1.8317997455596924, + "learning_rate": 4.9567850921980056e-05, + "loss": 5.0046, + "step": 9967 + }, + { + "epoch": 0.05928251974498049, + "grad_norm": 1.8073506355285645, + "learning_rate": 4.956776444373371e-05, + "loss": 5.1779, + "step": 9968 + }, + { + "epoch": 0.05928846702826149, + "grad_norm": 1.8806017637252808, + "learning_rate": 4.956767795691101e-05, + "loss": 5.2956, + "step": 9969 + }, + { + "epoch": 0.05929441431154249, + "grad_norm": 1.8397493362426758, + "learning_rate": 4.956759146151198e-05, + "loss": 5.1775, + "step": 9970 + }, + { + "epoch": 0.059300361594823485, + "grad_norm": 2.001387119293213, + "learning_rate": 4.9567504957536656e-05, + "loss": 5.2149, + "step": 9971 + }, + { + "epoch": 0.05930630887810448, + "grad_norm": 2.011504650115967, + "learning_rate": 4.956741844498508e-05, + "loss": 5.2384, + "step": 9972 + }, + { + "epoch": 0.05931225616138548, + "grad_norm": 1.7936465740203857, + "learning_rate": 4.956733192385727e-05, + "loss": 5.2297, + "step": 9973 + }, + { + "epoch": 0.05931820344466648, + "grad_norm": 1.7336666584014893, + "learning_rate": 4.9567245394153255e-05, + "loss": 5.1637, + "step": 9974 + }, + { + "epoch": 0.05932415072794747, + "grad_norm": 1.7429137229919434, + "learning_rate": 4.956715885587307e-05, + "loss": 5.1315, + "step": 9975 + }, + { + "epoch": 0.059330098011228474, + "grad_norm": 1.6609208583831787, + "learning_rate": 4.956707230901674e-05, + "loss": 5.1554, + "step": 9976 + }, + { + "epoch": 0.05933604529450947, + "grad_norm": 1.630026936531067, + "learning_rate": 4.95669857535843e-05, + "loss": 5.1569, + "step": 9977 + }, + { + "epoch": 0.059341992577790464, + "grad_norm": 1.6968966722488403, + "learning_rate": 4.956689918957579e-05, + "loss": 5.06, + "step": 9978 + }, + { + "epoch": 0.05934793986107146, + "grad_norm": 1.6973050832748413, + "learning_rate": 4.9566812616991214e-05, + "loss": 5.2044, + "step": 9979 + }, + { + "epoch": 0.05935388714435246, + "grad_norm": 1.436073899269104, + "learning_rate": 4.9566726035830624e-05, + "loss": 5.2638, + "step": 9980 + }, + { + "epoch": 0.05935983442763346, + "grad_norm": 1.7667059898376465, + "learning_rate": 4.956663944609404e-05, + "loss": 5.0912, + "step": 9981 + }, + { + "epoch": 0.05936578171091445, + "grad_norm": 2.277327060699463, + "learning_rate": 4.9566552847781504e-05, + "loss": 5.6089, + "step": 9982 + }, + { + "epoch": 0.059371728994195454, + "grad_norm": 1.521134376525879, + "learning_rate": 4.956646624089304e-05, + "loss": 5.0213, + "step": 9983 + }, + { + "epoch": 0.05937767627747645, + "grad_norm": 1.556511402130127, + "learning_rate": 4.956637962542867e-05, + "loss": 5.1126, + "step": 9984 + }, + { + "epoch": 0.059383623560757444, + "grad_norm": 1.6691070795059204, + "learning_rate": 4.9566293001388423e-05, + "loss": 5.1351, + "step": 9985 + }, + { + "epoch": 0.059389570844038446, + "grad_norm": 1.5213310718536377, + "learning_rate": 4.956620636877235e-05, + "loss": 5.2402, + "step": 9986 + }, + { + "epoch": 0.05939551812731944, + "grad_norm": 1.5169057846069336, + "learning_rate": 4.956611972758046e-05, + "loss": 5.214, + "step": 9987 + }, + { + "epoch": 0.059401465410600436, + "grad_norm": 1.6076115369796753, + "learning_rate": 4.956603307781279e-05, + "loss": 5.1081, + "step": 9988 + }, + { + "epoch": 0.05940741269388144, + "grad_norm": 1.7340706586837769, + "learning_rate": 4.9565946419469376e-05, + "loss": 5.1582, + "step": 9989 + }, + { + "epoch": 0.05941335997716243, + "grad_norm": 1.5118008852005005, + "learning_rate": 4.956585975255025e-05, + "loss": 5.0515, + "step": 9990 + }, + { + "epoch": 0.05941930726044343, + "grad_norm": 1.8852020502090454, + "learning_rate": 4.956577307705543e-05, + "loss": 5.3811, + "step": 9991 + }, + { + "epoch": 0.05942525454372442, + "grad_norm": 1.7066764831542969, + "learning_rate": 4.9565686392984955e-05, + "loss": 5.4599, + "step": 9992 + }, + { + "epoch": 0.059431201827005425, + "grad_norm": 1.5517010688781738, + "learning_rate": 4.956559970033885e-05, + "loss": 5.0728, + "step": 9993 + }, + { + "epoch": 0.05943714911028642, + "grad_norm": 1.508901596069336, + "learning_rate": 4.956551299911715e-05, + "loss": 5.1857, + "step": 9994 + }, + { + "epoch": 0.059443096393567416, + "grad_norm": 1.8867852687835693, + "learning_rate": 4.9565426289319874e-05, + "loss": 5.2223, + "step": 9995 + }, + { + "epoch": 0.05944904367684842, + "grad_norm": 1.4767159223556519, + "learning_rate": 4.9565339570947076e-05, + "loss": 5.1404, + "step": 9996 + }, + { + "epoch": 0.05945499096012941, + "grad_norm": 1.6351869106292725, + "learning_rate": 4.956525284399876e-05, + "loss": 5.3235, + "step": 9997 + }, + { + "epoch": 0.05946093824341041, + "grad_norm": 1.543565273284912, + "learning_rate": 4.956516610847497e-05, + "loss": 5.3365, + "step": 9998 + }, + { + "epoch": 0.05946688552669141, + "grad_norm": 1.4907768964767456, + "learning_rate": 4.9565079364375746e-05, + "loss": 5.4215, + "step": 9999 + }, + { + "epoch": 0.059472832809972405, + "grad_norm": 1.5810034275054932, + "learning_rate": 4.956499261170109e-05, + "loss": 5.3899, + "step": 10000 + }, + { + "epoch": 0.0594787800932534, + "grad_norm": 1.6342787742614746, + "learning_rate": 4.956490585045106e-05, + "loss": 5.4278, + "step": 10001 + }, + { + "epoch": 0.0594847273765344, + "grad_norm": 1.5474039316177368, + "learning_rate": 4.956481908062567e-05, + "loss": 5.1232, + "step": 10002 + }, + { + "epoch": 0.0594906746598154, + "grad_norm": 1.5679951906204224, + "learning_rate": 4.956473230222496e-05, + "loss": 5.3245, + "step": 10003 + }, + { + "epoch": 0.05949662194309639, + "grad_norm": 1.4851021766662598, + "learning_rate": 4.9564645515248955e-05, + "loss": 5.1806, + "step": 10004 + }, + { + "epoch": 0.059502569226377394, + "grad_norm": 1.8518844842910767, + "learning_rate": 4.956455871969768e-05, + "loss": 5.2543, + "step": 10005 + }, + { + "epoch": 0.05950851650965839, + "grad_norm": 1.7865514755249023, + "learning_rate": 4.956447191557118e-05, + "loss": 5.405, + "step": 10006 + }, + { + "epoch": 0.059514463792939384, + "grad_norm": 1.9051682949066162, + "learning_rate": 4.956438510286946e-05, + "loss": 5.0509, + "step": 10007 + }, + { + "epoch": 0.05952041107622038, + "grad_norm": 1.5150926113128662, + "learning_rate": 4.956429828159258e-05, + "loss": 5.0065, + "step": 10008 + }, + { + "epoch": 0.05952635835950138, + "grad_norm": 1.6085938215255737, + "learning_rate": 4.956421145174056e-05, + "loss": 5.2295, + "step": 10009 + }, + { + "epoch": 0.05953230564278238, + "grad_norm": 1.6337605714797974, + "learning_rate": 4.9564124613313424e-05, + "loss": 5.1666, + "step": 10010 + }, + { + "epoch": 0.05953825292606337, + "grad_norm": 1.5093178749084473, + "learning_rate": 4.9564037766311205e-05, + "loss": 5.2268, + "step": 10011 + }, + { + "epoch": 0.059544200209344374, + "grad_norm": 1.5047305822372437, + "learning_rate": 4.9563950910733936e-05, + "loss": 5.1065, + "step": 10012 + }, + { + "epoch": 0.05955014749262537, + "grad_norm": 1.6275629997253418, + "learning_rate": 4.9563864046581645e-05, + "loss": 5.2366, + "step": 10013 + }, + { + "epoch": 0.059556094775906364, + "grad_norm": 1.535582184791565, + "learning_rate": 4.956377717385436e-05, + "loss": 5.1799, + "step": 10014 + }, + { + "epoch": 0.059562042059187366, + "grad_norm": 1.448477864265442, + "learning_rate": 4.956369029255211e-05, + "loss": 5.2207, + "step": 10015 + }, + { + "epoch": 0.05956798934246836, + "grad_norm": 1.5288492441177368, + "learning_rate": 4.956360340267494e-05, + "loss": 5.3646, + "step": 10016 + }, + { + "epoch": 0.059573936625749356, + "grad_norm": 1.5746785402297974, + "learning_rate": 4.956351650422287e-05, + "loss": 5.1941, + "step": 10017 + }, + { + "epoch": 0.05957988390903036, + "grad_norm": 1.7088212966918945, + "learning_rate": 4.956342959719592e-05, + "loss": 5.1667, + "step": 10018 + }, + { + "epoch": 0.05958583119231135, + "grad_norm": 1.7666717767715454, + "learning_rate": 4.956334268159414e-05, + "loss": 5.1808, + "step": 10019 + }, + { + "epoch": 0.05959177847559235, + "grad_norm": 1.6472598314285278, + "learning_rate": 4.956325575741755e-05, + "loss": 5.3369, + "step": 10020 + }, + { + "epoch": 0.05959772575887334, + "grad_norm": 1.7340562343597412, + "learning_rate": 4.9563168824666174e-05, + "loss": 5.5623, + "step": 10021 + }, + { + "epoch": 0.059603673042154345, + "grad_norm": 1.9677515029907227, + "learning_rate": 4.9563081883340054e-05, + "loss": 4.7612, + "step": 10022 + }, + { + "epoch": 0.05960962032543534, + "grad_norm": 1.4823256731033325, + "learning_rate": 4.9562994933439215e-05, + "loss": 5.4504, + "step": 10023 + }, + { + "epoch": 0.059615567608716336, + "grad_norm": 1.5346739292144775, + "learning_rate": 4.956290797496369e-05, + "loss": 5.5455, + "step": 10024 + }, + { + "epoch": 0.05962151489199734, + "grad_norm": 1.5420036315917969, + "learning_rate": 4.956282100791351e-05, + "loss": 5.1363, + "step": 10025 + }, + { + "epoch": 0.05962746217527833, + "grad_norm": 1.7927091121673584, + "learning_rate": 4.956273403228869e-05, + "loss": 5.0768, + "step": 10026 + }, + { + "epoch": 0.05963340945855933, + "grad_norm": 1.7139612436294556, + "learning_rate": 4.9562647048089287e-05, + "loss": 5.2046, + "step": 10027 + }, + { + "epoch": 0.05963935674184033, + "grad_norm": 1.627684473991394, + "learning_rate": 4.956256005531531e-05, + "loss": 5.3844, + "step": 10028 + }, + { + "epoch": 0.059645304025121325, + "grad_norm": 1.5006085634231567, + "learning_rate": 4.9562473053966805e-05, + "loss": 5.4948, + "step": 10029 + }, + { + "epoch": 0.05965125130840232, + "grad_norm": 1.5670723915100098, + "learning_rate": 4.956238604404378e-05, + "loss": 5.5465, + "step": 10030 + }, + { + "epoch": 0.05965719859168332, + "grad_norm": 1.5671201944351196, + "learning_rate": 4.95622990255463e-05, + "loss": 5.1969, + "step": 10031 + }, + { + "epoch": 0.05966314587496432, + "grad_norm": 2.1628634929656982, + "learning_rate": 4.956221199847436e-05, + "loss": 5.0244, + "step": 10032 + }, + { + "epoch": 0.05966909315824531, + "grad_norm": 1.5766685009002686, + "learning_rate": 4.956212496282801e-05, + "loss": 5.4698, + "step": 10033 + }, + { + "epoch": 0.059675040441526314, + "grad_norm": 1.625812292098999, + "learning_rate": 4.956203791860728e-05, + "loss": 5.3825, + "step": 10034 + }, + { + "epoch": 0.05968098772480731, + "grad_norm": 1.4307054281234741, + "learning_rate": 4.956195086581219e-05, + "loss": 5.3576, + "step": 10035 + }, + { + "epoch": 0.059686935008088304, + "grad_norm": 1.4459644556045532, + "learning_rate": 4.9561863804442785e-05, + "loss": 5.3478, + "step": 10036 + }, + { + "epoch": 0.0596928822913693, + "grad_norm": 1.8038474321365356, + "learning_rate": 4.9561776734499075e-05, + "loss": 5.4967, + "step": 10037 + }, + { + "epoch": 0.0596988295746503, + "grad_norm": 1.41011381149292, + "learning_rate": 4.9561689655981115e-05, + "loss": 5.4224, + "step": 10038 + }, + { + "epoch": 0.059704776857931297, + "grad_norm": 1.6678937673568726, + "learning_rate": 4.956160256888891e-05, + "loss": 5.27, + "step": 10039 + }, + { + "epoch": 0.05971072414121229, + "grad_norm": 1.794647455215454, + "learning_rate": 4.956151547322251e-05, + "loss": 5.2822, + "step": 10040 + }, + { + "epoch": 0.059716671424493294, + "grad_norm": 1.5010912418365479, + "learning_rate": 4.9561428368981944e-05, + "loss": 5.3778, + "step": 10041 + }, + { + "epoch": 0.05972261870777429, + "grad_norm": 1.785395860671997, + "learning_rate": 4.9561341256167234e-05, + "loss": 5.4213, + "step": 10042 + }, + { + "epoch": 0.059728565991055284, + "grad_norm": 1.889667272567749, + "learning_rate": 4.956125413477841e-05, + "loss": 5.2795, + "step": 10043 + }, + { + "epoch": 0.059734513274336286, + "grad_norm": 2.209780216217041, + "learning_rate": 4.95611670048155e-05, + "loss": 5.6823, + "step": 10044 + }, + { + "epoch": 0.05974046055761728, + "grad_norm": 1.979069471359253, + "learning_rate": 4.956107986627855e-05, + "loss": 5.3437, + "step": 10045 + }, + { + "epoch": 0.059746407840898276, + "grad_norm": 1.8391239643096924, + "learning_rate": 4.9560992719167584e-05, + "loss": 5.2246, + "step": 10046 + }, + { + "epoch": 0.05975235512417928, + "grad_norm": 2.0196359157562256, + "learning_rate": 4.956090556348262e-05, + "loss": 5.3549, + "step": 10047 + }, + { + "epoch": 0.05975830240746027, + "grad_norm": 1.7103056907653809, + "learning_rate": 4.95608183992237e-05, + "loss": 5.4016, + "step": 10048 + }, + { + "epoch": 0.05976424969074127, + "grad_norm": 1.543308138847351, + "learning_rate": 4.956073122639085e-05, + "loss": 5.2628, + "step": 10049 + }, + { + "epoch": 0.05977019697402226, + "grad_norm": 2.0719797611236572, + "learning_rate": 4.956064404498411e-05, + "loss": 5.3149, + "step": 10050 + }, + { + "epoch": 0.059776144257303265, + "grad_norm": 1.9024063348770142, + "learning_rate": 4.95605568550035e-05, + "loss": 5.2804, + "step": 10051 + }, + { + "epoch": 0.05978209154058426, + "grad_norm": 1.6171611547470093, + "learning_rate": 4.9560469656449046e-05, + "loss": 5.2558, + "step": 10052 + }, + { + "epoch": 0.059788038823865255, + "grad_norm": 1.5416970252990723, + "learning_rate": 4.9560382449320795e-05, + "loss": 5.3164, + "step": 10053 + }, + { + "epoch": 0.05979398610714626, + "grad_norm": 1.6956002712249756, + "learning_rate": 4.956029523361877e-05, + "loss": 5.2123, + "step": 10054 + }, + { + "epoch": 0.05979993339042725, + "grad_norm": 1.6414602994918823, + "learning_rate": 4.956020800934299e-05, + "loss": 5.3302, + "step": 10055 + }, + { + "epoch": 0.05980588067370825, + "grad_norm": 1.6868051290512085, + "learning_rate": 4.95601207764935e-05, + "loss": 5.2076, + "step": 10056 + }, + { + "epoch": 0.05981182795698925, + "grad_norm": 1.7299697399139404, + "learning_rate": 4.956003353507033e-05, + "loss": 5.3502, + "step": 10057 + }, + { + "epoch": 0.059817775240270245, + "grad_norm": 1.4923878908157349, + "learning_rate": 4.95599462850735e-05, + "loss": 5.3081, + "step": 10058 + }, + { + "epoch": 0.05982372252355124, + "grad_norm": 1.571413516998291, + "learning_rate": 4.9559859026503045e-05, + "loss": 5.1434, + "step": 10059 + }, + { + "epoch": 0.05982966980683224, + "grad_norm": 1.6265422105789185, + "learning_rate": 4.9559771759359e-05, + "loss": 5.2455, + "step": 10060 + }, + { + "epoch": 0.05983561709011324, + "grad_norm": 1.7889208793640137, + "learning_rate": 4.9559684483641395e-05, + "loss": 5.2429, + "step": 10061 + }, + { + "epoch": 0.05984156437339423, + "grad_norm": 1.5957598686218262, + "learning_rate": 4.955959719935025e-05, + "loss": 5.2299, + "step": 10062 + }, + { + "epoch": 0.059847511656675234, + "grad_norm": 1.6366177797317505, + "learning_rate": 4.955950990648561e-05, + "loss": 5.366, + "step": 10063 + }, + { + "epoch": 0.05985345893995623, + "grad_norm": 1.6712719202041626, + "learning_rate": 4.95594226050475e-05, + "loss": 5.3602, + "step": 10064 + }, + { + "epoch": 0.059859406223237224, + "grad_norm": 1.8273069858551025, + "learning_rate": 4.955933529503595e-05, + "loss": 5.3586, + "step": 10065 + }, + { + "epoch": 0.05986535350651822, + "grad_norm": 1.6638576984405518, + "learning_rate": 4.955924797645098e-05, + "loss": 5.2359, + "step": 10066 + }, + { + "epoch": 0.05987130078979922, + "grad_norm": 1.8127614259719849, + "learning_rate": 4.955916064929264e-05, + "loss": 5.3815, + "step": 10067 + }, + { + "epoch": 0.059877248073080216, + "grad_norm": 1.7204198837280273, + "learning_rate": 4.955907331356095e-05, + "loss": 5.5576, + "step": 10068 + }, + { + "epoch": 0.05988319535636121, + "grad_norm": 1.9153103828430176, + "learning_rate": 4.9558985969255936e-05, + "loss": 5.4363, + "step": 10069 + }, + { + "epoch": 0.059889142639642214, + "grad_norm": 1.6427290439605713, + "learning_rate": 4.9558898616377634e-05, + "loss": 5.4497, + "step": 10070 + }, + { + "epoch": 0.05989508992292321, + "grad_norm": 1.660217046737671, + "learning_rate": 4.955881125492608e-05, + "loss": 5.4988, + "step": 10071 + }, + { + "epoch": 0.059901037206204204, + "grad_norm": 1.7776225805282593, + "learning_rate": 4.955872388490129e-05, + "loss": 5.2714, + "step": 10072 + }, + { + "epoch": 0.059906984489485206, + "grad_norm": 1.5099388360977173, + "learning_rate": 4.9558636506303314e-05, + "loss": 5.4714, + "step": 10073 + }, + { + "epoch": 0.0599129317727662, + "grad_norm": 1.523537039756775, + "learning_rate": 4.955854911913217e-05, + "loss": 5.3528, + "step": 10074 + }, + { + "epoch": 0.059918879056047196, + "grad_norm": 1.3424321413040161, + "learning_rate": 4.9558461723387885e-05, + "loss": 5.3385, + "step": 10075 + }, + { + "epoch": 0.0599248263393282, + "grad_norm": 1.3843169212341309, + "learning_rate": 4.955837431907049e-05, + "loss": 5.383, + "step": 10076 + }, + { + "epoch": 0.05993077362260919, + "grad_norm": 1.4927351474761963, + "learning_rate": 4.955828690618003e-05, + "loss": 5.3536, + "step": 10077 + }, + { + "epoch": 0.05993672090589019, + "grad_norm": 1.5207486152648926, + "learning_rate": 4.955819948471653e-05, + "loss": 5.3557, + "step": 10078 + }, + { + "epoch": 0.05994266818917118, + "grad_norm": 1.5589584112167358, + "learning_rate": 4.9558112054680004e-05, + "loss": 5.3747, + "step": 10079 + }, + { + "epoch": 0.059948615472452185, + "grad_norm": 1.436951756477356, + "learning_rate": 4.9558024616070496e-05, + "loss": 5.2807, + "step": 10080 + }, + { + "epoch": 0.05995456275573318, + "grad_norm": 1.4345866441726685, + "learning_rate": 4.955793716888804e-05, + "loss": 5.4, + "step": 10081 + }, + { + "epoch": 0.059960510039014175, + "grad_norm": 1.2811249494552612, + "learning_rate": 4.955784971313267e-05, + "loss": 5.2531, + "step": 10082 + }, + { + "epoch": 0.05996645732229518, + "grad_norm": 1.5558568239212036, + "learning_rate": 4.955776224880439e-05, + "loss": 5.1136, + "step": 10083 + }, + { + "epoch": 0.05997240460557617, + "grad_norm": 1.3918567895889282, + "learning_rate": 4.955767477590326e-05, + "loss": 5.2748, + "step": 10084 + }, + { + "epoch": 0.05997835188885717, + "grad_norm": 1.3277204036712646, + "learning_rate": 4.9557587294429295e-05, + "loss": 5.2346, + "step": 10085 + }, + { + "epoch": 0.05998429917213817, + "grad_norm": 1.2874623537063599, + "learning_rate": 4.955749980438253e-05, + "loss": 5.2616, + "step": 10086 + }, + { + "epoch": 0.059990246455419165, + "grad_norm": 1.7534229755401611, + "learning_rate": 4.9557412305763004e-05, + "loss": 5.2509, + "step": 10087 + }, + { + "epoch": 0.05999619373870016, + "grad_norm": 1.4560372829437256, + "learning_rate": 4.955732479857072e-05, + "loss": 5.2385, + "step": 10088 + }, + { + "epoch": 0.06000214102198116, + "grad_norm": 1.232779860496521, + "learning_rate": 4.955723728280575e-05, + "loss": 5.2726, + "step": 10089 + }, + { + "epoch": 0.06000808830526216, + "grad_norm": 1.6178683042526245, + "learning_rate": 4.955714975846809e-05, + "loss": 5.3816, + "step": 10090 + }, + { + "epoch": 0.06001403558854315, + "grad_norm": 1.5438450574874878, + "learning_rate": 4.955706222555779e-05, + "loss": 5.2706, + "step": 10091 + }, + { + "epoch": 0.060019982871824154, + "grad_norm": 1.5367876291275024, + "learning_rate": 4.955697468407486e-05, + "loss": 5.1955, + "step": 10092 + }, + { + "epoch": 0.06002593015510515, + "grad_norm": 1.2902512550354004, + "learning_rate": 4.955688713401936e-05, + "loss": 5.166, + "step": 10093 + }, + { + "epoch": 0.060031877438386144, + "grad_norm": 1.5516488552093506, + "learning_rate": 4.95567995753913e-05, + "loss": 5.1256, + "step": 10094 + }, + { + "epoch": 0.06003782472166714, + "grad_norm": 1.3104857206344604, + "learning_rate": 4.9556712008190706e-05, + "loss": 5.1604, + "step": 10095 + }, + { + "epoch": 0.06004377200494814, + "grad_norm": 1.6237741708755493, + "learning_rate": 4.955662443241762e-05, + "loss": 5.2686, + "step": 10096 + }, + { + "epoch": 0.060049719288229136, + "grad_norm": 1.6566027402877808, + "learning_rate": 4.955653684807208e-05, + "loss": 5.3376, + "step": 10097 + }, + { + "epoch": 0.06005566657151013, + "grad_norm": 1.4010981321334839, + "learning_rate": 4.9556449255154106e-05, + "loss": 5.4008, + "step": 10098 + }, + { + "epoch": 0.060061613854791134, + "grad_norm": 1.6399116516113281, + "learning_rate": 4.955636165366372e-05, + "loss": 5.2718, + "step": 10099 + }, + { + "epoch": 0.06006756113807213, + "grad_norm": 1.5371499061584473, + "learning_rate": 4.955627404360096e-05, + "loss": 5.2107, + "step": 10100 + }, + { + "epoch": 0.060073508421353124, + "grad_norm": 1.598186731338501, + "learning_rate": 4.955618642496587e-05, + "loss": 5.3482, + "step": 10101 + }, + { + "epoch": 0.060079455704634126, + "grad_norm": 1.526595115661621, + "learning_rate": 4.955609879775846e-05, + "loss": 5.2335, + "step": 10102 + }, + { + "epoch": 0.06008540298791512, + "grad_norm": 1.509990930557251, + "learning_rate": 4.955601116197877e-05, + "loss": 5.168, + "step": 10103 + }, + { + "epoch": 0.060091350271196116, + "grad_norm": 1.368203043937683, + "learning_rate": 4.9555923517626836e-05, + "loss": 5.2183, + "step": 10104 + }, + { + "epoch": 0.06009729755447712, + "grad_norm": 1.5153454542160034, + "learning_rate": 4.955583586470268e-05, + "loss": 5.2558, + "step": 10105 + }, + { + "epoch": 0.06010324483775811, + "grad_norm": 2.9330217838287354, + "learning_rate": 4.955574820320633e-05, + "loss": 5.6863, + "step": 10106 + }, + { + "epoch": 0.06010919212103911, + "grad_norm": 1.6096080541610718, + "learning_rate": 4.9555660533137825e-05, + "loss": 5.2243, + "step": 10107 + }, + { + "epoch": 0.0601151394043201, + "grad_norm": 1.5425163507461548, + "learning_rate": 4.95555728544972e-05, + "loss": 5.4308, + "step": 10108 + }, + { + "epoch": 0.060121086687601105, + "grad_norm": 1.4898573160171509, + "learning_rate": 4.955548516728447e-05, + "loss": 5.389, + "step": 10109 + }, + { + "epoch": 0.0601270339708821, + "grad_norm": 1.5746946334838867, + "learning_rate": 4.955539747149968e-05, + "loss": 5.1414, + "step": 10110 + }, + { + "epoch": 0.060132981254163095, + "grad_norm": 1.7621461153030396, + "learning_rate": 4.955530976714285e-05, + "loss": 5.4572, + "step": 10111 + }, + { + "epoch": 0.0601389285374441, + "grad_norm": 1.4524224996566772, + "learning_rate": 4.9555222054214015e-05, + "loss": 5.4577, + "step": 10112 + }, + { + "epoch": 0.06014487582072509, + "grad_norm": 1.5630146265029907, + "learning_rate": 4.95551343327132e-05, + "loss": 5.277, + "step": 10113 + }, + { + "epoch": 0.06015082310400609, + "grad_norm": 1.9279972314834595, + "learning_rate": 4.955504660264045e-05, + "loss": 5.1485, + "step": 10114 + }, + { + "epoch": 0.06015677038728709, + "grad_norm": 1.618775725364685, + "learning_rate": 4.9554958863995786e-05, + "loss": 5.1262, + "step": 10115 + }, + { + "epoch": 0.060162717670568085, + "grad_norm": 1.8578898906707764, + "learning_rate": 4.955487111677924e-05, + "loss": 5.3451, + "step": 10116 + }, + { + "epoch": 0.06016866495384908, + "grad_norm": 1.5652815103530884, + "learning_rate": 4.955478336099084e-05, + "loss": 5.2326, + "step": 10117 + }, + { + "epoch": 0.06017461223713008, + "grad_norm": 1.4957774877548218, + "learning_rate": 4.9554695596630616e-05, + "loss": 5.3332, + "step": 10118 + }, + { + "epoch": 0.06018055952041108, + "grad_norm": 1.428112506866455, + "learning_rate": 4.9554607823698606e-05, + "loss": 5.2647, + "step": 10119 + }, + { + "epoch": 0.06018650680369207, + "grad_norm": 1.9383279085159302, + "learning_rate": 4.955452004219484e-05, + "loss": 5.5897, + "step": 10120 + }, + { + "epoch": 0.060192454086973074, + "grad_norm": 1.8523132801055908, + "learning_rate": 4.955443225211934e-05, + "loss": 5.6204, + "step": 10121 + }, + { + "epoch": 0.06019840137025407, + "grad_norm": 1.7980049848556519, + "learning_rate": 4.955434445347214e-05, + "loss": 5.4383, + "step": 10122 + }, + { + "epoch": 0.060204348653535064, + "grad_norm": 1.7927988767623901, + "learning_rate": 4.9554256646253274e-05, + "loss": 5.6066, + "step": 10123 + }, + { + "epoch": 0.06021029593681606, + "grad_norm": 1.8549528121948242, + "learning_rate": 4.955416883046277e-05, + "loss": 5.2963, + "step": 10124 + }, + { + "epoch": 0.06021624322009706, + "grad_norm": 1.7140870094299316, + "learning_rate": 4.955408100610066e-05, + "loss": 5.4636, + "step": 10125 + }, + { + "epoch": 0.060222190503378056, + "grad_norm": 1.3744412660598755, + "learning_rate": 4.955399317316697e-05, + "loss": 5.2985, + "step": 10126 + }, + { + "epoch": 0.06022813778665905, + "grad_norm": 1.572782278060913, + "learning_rate": 4.9553905331661734e-05, + "loss": 5.2598, + "step": 10127 + }, + { + "epoch": 0.06023408506994005, + "grad_norm": 1.6485692262649536, + "learning_rate": 4.955381748158499e-05, + "loss": 5.3764, + "step": 10128 + }, + { + "epoch": 0.06024003235322105, + "grad_norm": 1.5442413091659546, + "learning_rate": 4.955372962293676e-05, + "loss": 5.2504, + "step": 10129 + }, + { + "epoch": 0.060245979636502044, + "grad_norm": 1.807518482208252, + "learning_rate": 4.9553641755717075e-05, + "loss": 5.2853, + "step": 10130 + }, + { + "epoch": 0.060251926919783046, + "grad_norm": 1.5858244895935059, + "learning_rate": 4.9553553879925965e-05, + "loss": 5.2645, + "step": 10131 + }, + { + "epoch": 0.06025787420306404, + "grad_norm": 1.596307396888733, + "learning_rate": 4.955346599556347e-05, + "loss": 5.4094, + "step": 10132 + }, + { + "epoch": 0.060263821486345036, + "grad_norm": 1.4624857902526855, + "learning_rate": 4.955337810262961e-05, + "loss": 5.4366, + "step": 10133 + }, + { + "epoch": 0.06026976876962604, + "grad_norm": 1.426866888999939, + "learning_rate": 4.955329020112442e-05, + "loss": 5.324, + "step": 10134 + }, + { + "epoch": 0.06027571605290703, + "grad_norm": 1.6577516794204712, + "learning_rate": 4.955320229104793e-05, + "loss": 5.2937, + "step": 10135 + }, + { + "epoch": 0.06028166333618803, + "grad_norm": 1.3958433866500854, + "learning_rate": 4.9553114372400166e-05, + "loss": 5.421, + "step": 10136 + }, + { + "epoch": 0.06028761061946902, + "grad_norm": 1.3242517709732056, + "learning_rate": 4.9553026445181173e-05, + "loss": 5.2697, + "step": 10137 + }, + { + "epoch": 0.060293557902750025, + "grad_norm": 1.519018530845642, + "learning_rate": 4.955293850939096e-05, + "loss": 5.1432, + "step": 10138 + }, + { + "epoch": 0.06029950518603102, + "grad_norm": 1.528515338897705, + "learning_rate": 4.955285056502958e-05, + "loss": 5.1388, + "step": 10139 + }, + { + "epoch": 0.060305452469312015, + "grad_norm": 1.4830992221832275, + "learning_rate": 4.955276261209705e-05, + "loss": 5.3222, + "step": 10140 + }, + { + "epoch": 0.06031139975259302, + "grad_norm": 1.4149411916732788, + "learning_rate": 4.95526746505934e-05, + "loss": 5.2706, + "step": 10141 + }, + { + "epoch": 0.06031734703587401, + "grad_norm": 1.4466478824615479, + "learning_rate": 4.9552586680518676e-05, + "loss": 5.2309, + "step": 10142 + }, + { + "epoch": 0.06032329431915501, + "grad_norm": 1.4246203899383545, + "learning_rate": 4.9552498701872884e-05, + "loss": 5.1539, + "step": 10143 + }, + { + "epoch": 0.06032924160243601, + "grad_norm": 1.632572889328003, + "learning_rate": 4.955241071465608e-05, + "loss": 5.3788, + "step": 10144 + }, + { + "epoch": 0.060335188885717005, + "grad_norm": 1.5974568128585815, + "learning_rate": 4.955232271886828e-05, + "loss": 5.3558, + "step": 10145 + }, + { + "epoch": 0.060341136168998, + "grad_norm": 1.6396468877792358, + "learning_rate": 4.9552234714509516e-05, + "loss": 5.2162, + "step": 10146 + }, + { + "epoch": 0.060347083452279, + "grad_norm": 1.5349491834640503, + "learning_rate": 4.9552146701579815e-05, + "loss": 5.212, + "step": 10147 + }, + { + "epoch": 0.06035303073556, + "grad_norm": 1.5236495733261108, + "learning_rate": 4.955205868007922e-05, + "loss": 5.2984, + "step": 10148 + }, + { + "epoch": 0.06035897801884099, + "grad_norm": 1.4593411684036255, + "learning_rate": 4.955197065000775e-05, + "loss": 5.268, + "step": 10149 + }, + { + "epoch": 0.060364925302121994, + "grad_norm": 1.4498536586761475, + "learning_rate": 4.955188261136545e-05, + "loss": 5.1437, + "step": 10150 + }, + { + "epoch": 0.06037087258540299, + "grad_norm": 1.5059176683425903, + "learning_rate": 4.9551794564152334e-05, + "loss": 5.3011, + "step": 10151 + }, + { + "epoch": 0.060376819868683984, + "grad_norm": 1.5773544311523438, + "learning_rate": 4.9551706508368445e-05, + "loss": 5.2066, + "step": 10152 + }, + { + "epoch": 0.06038276715196498, + "grad_norm": 1.4858072996139526, + "learning_rate": 4.95516184440138e-05, + "loss": 5.2757, + "step": 10153 + }, + { + "epoch": 0.06038871443524598, + "grad_norm": 1.486055612564087, + "learning_rate": 4.955153037108845e-05, + "loss": 5.1416, + "step": 10154 + }, + { + "epoch": 0.060394661718526976, + "grad_norm": 1.3411048650741577, + "learning_rate": 4.955144228959241e-05, + "loss": 5.1708, + "step": 10155 + }, + { + "epoch": 0.06040060900180797, + "grad_norm": 1.2979127168655396, + "learning_rate": 4.9551354199525714e-05, + "loss": 5.1421, + "step": 10156 + }, + { + "epoch": 0.06040655628508897, + "grad_norm": 1.4928209781646729, + "learning_rate": 4.9551266100888395e-05, + "loss": 5.2185, + "step": 10157 + }, + { + "epoch": 0.06041250356836997, + "grad_norm": 1.58747398853302, + "learning_rate": 4.955117799368048e-05, + "loss": 5.2587, + "step": 10158 + }, + { + "epoch": 0.060418450851650964, + "grad_norm": 1.1862558126449585, + "learning_rate": 4.9551089877902e-05, + "loss": 5.2405, + "step": 10159 + }, + { + "epoch": 0.060424398134931966, + "grad_norm": 1.5547248125076294, + "learning_rate": 4.955100175355299e-05, + "loss": 5.2326, + "step": 10160 + }, + { + "epoch": 0.06043034541821296, + "grad_norm": 1.6986664533615112, + "learning_rate": 4.955091362063349e-05, + "loss": 5.2261, + "step": 10161 + }, + { + "epoch": 0.060436292701493956, + "grad_norm": 1.531891107559204, + "learning_rate": 4.95508254791435e-05, + "loss": 5.4475, + "step": 10162 + }, + { + "epoch": 0.06044223998477496, + "grad_norm": 1.57411789894104, + "learning_rate": 4.955073732908309e-05, + "loss": 5.1346, + "step": 10163 + }, + { + "epoch": 0.06044818726805595, + "grad_norm": 1.548439383506775, + "learning_rate": 4.9550649170452255e-05, + "loss": 5.1953, + "step": 10164 + }, + { + "epoch": 0.06045413455133695, + "grad_norm": 1.645850419998169, + "learning_rate": 4.955056100325105e-05, + "loss": 5.2728, + "step": 10165 + }, + { + "epoch": 0.06046008183461794, + "grad_norm": 1.6308786869049072, + "learning_rate": 4.95504728274795e-05, + "loss": 5.3134, + "step": 10166 + }, + { + "epoch": 0.060466029117898945, + "grad_norm": 1.4754101037979126, + "learning_rate": 4.955038464313763e-05, + "loss": 5.3938, + "step": 10167 + }, + { + "epoch": 0.06047197640117994, + "grad_norm": 2.408869981765747, + "learning_rate": 4.955029645022548e-05, + "loss": 5.4687, + "step": 10168 + }, + { + "epoch": 0.060477923684460935, + "grad_norm": 1.6601638793945312, + "learning_rate": 4.955020824874307e-05, + "loss": 5.165, + "step": 10169 + }, + { + "epoch": 0.06048387096774194, + "grad_norm": 1.5239113569259644, + "learning_rate": 4.955012003869043e-05, + "loss": 5.133, + "step": 10170 + }, + { + "epoch": 0.06048981825102293, + "grad_norm": 1.6661083698272705, + "learning_rate": 4.955003182006761e-05, + "loss": 5.2033, + "step": 10171 + }, + { + "epoch": 0.06049576553430393, + "grad_norm": 1.4320698976516724, + "learning_rate": 4.9549943592874615e-05, + "loss": 5.1842, + "step": 10172 + }, + { + "epoch": 0.06050171281758493, + "grad_norm": 1.789302110671997, + "learning_rate": 4.95498553571115e-05, + "loss": 5.1052, + "step": 10173 + }, + { + "epoch": 0.060507660100865925, + "grad_norm": 1.598085880279541, + "learning_rate": 4.954976711277828e-05, + "loss": 5.3194, + "step": 10174 + }, + { + "epoch": 0.06051360738414692, + "grad_norm": 1.4569145441055298, + "learning_rate": 4.954967885987498e-05, + "loss": 5.2009, + "step": 10175 + }, + { + "epoch": 0.06051955466742792, + "grad_norm": 1.5980345010757446, + "learning_rate": 4.954959059840165e-05, + "loss": 5.1686, + "step": 10176 + }, + { + "epoch": 0.06052550195070892, + "grad_norm": 1.5382320880889893, + "learning_rate": 4.954950232835831e-05, + "loss": 5.303, + "step": 10177 + }, + { + "epoch": 0.06053144923398991, + "grad_norm": 1.5568296909332275, + "learning_rate": 4.954941404974499e-05, + "loss": 5.2044, + "step": 10178 + }, + { + "epoch": 0.060537396517270914, + "grad_norm": 1.6732075214385986, + "learning_rate": 4.954932576256173e-05, + "loss": 5.3133, + "step": 10179 + }, + { + "epoch": 0.06054334380055191, + "grad_norm": 1.6905434131622314, + "learning_rate": 4.954923746680855e-05, + "loss": 5.3868, + "step": 10180 + }, + { + "epoch": 0.060549291083832904, + "grad_norm": 1.4349027872085571, + "learning_rate": 4.954914916248549e-05, + "loss": 5.2215, + "step": 10181 + }, + { + "epoch": 0.0605552383671139, + "grad_norm": 1.5257092714309692, + "learning_rate": 4.9549060849592566e-05, + "loss": 5.2148, + "step": 10182 + }, + { + "epoch": 0.0605611856503949, + "grad_norm": 1.5402655601501465, + "learning_rate": 4.954897252812982e-05, + "loss": 5.3069, + "step": 10183 + }, + { + "epoch": 0.060567132933675896, + "grad_norm": 1.801798701286316, + "learning_rate": 4.954888419809729e-05, + "loss": 5.0786, + "step": 10184 + }, + { + "epoch": 0.06057308021695689, + "grad_norm": 1.4860090017318726, + "learning_rate": 4.954879585949499e-05, + "loss": 4.8878, + "step": 10185 + }, + { + "epoch": 0.06057902750023789, + "grad_norm": 1.7319056987762451, + "learning_rate": 4.954870751232296e-05, + "loss": 4.9013, + "step": 10186 + }, + { + "epoch": 0.06058497478351889, + "grad_norm": 1.4376243352890015, + "learning_rate": 4.954861915658123e-05, + "loss": 5.37, + "step": 10187 + }, + { + "epoch": 0.060590922066799884, + "grad_norm": 1.2903879880905151, + "learning_rate": 4.954853079226983e-05, + "loss": 5.5355, + "step": 10188 + }, + { + "epoch": 0.060596869350080886, + "grad_norm": 1.5223259925842285, + "learning_rate": 4.95484424193888e-05, + "loss": 5.3451, + "step": 10189 + }, + { + "epoch": 0.06060281663336188, + "grad_norm": 1.283892035484314, + "learning_rate": 4.954835403793815e-05, + "loss": 5.2245, + "step": 10190 + }, + { + "epoch": 0.060608763916642876, + "grad_norm": 1.5581207275390625, + "learning_rate": 4.9548265647917936e-05, + "loss": 5.303, + "step": 10191 + }, + { + "epoch": 0.06061471119992388, + "grad_norm": 1.4258673191070557, + "learning_rate": 4.9548177249328164e-05, + "loss": 5.4569, + "step": 10192 + }, + { + "epoch": 0.06062065848320487, + "grad_norm": 1.4326061010360718, + "learning_rate": 4.9548088842168886e-05, + "loss": 5.2761, + "step": 10193 + }, + { + "epoch": 0.06062660576648587, + "grad_norm": 1.9100563526153564, + "learning_rate": 4.9548000426440114e-05, + "loss": 4.9366, + "step": 10194 + }, + { + "epoch": 0.06063255304976687, + "grad_norm": 1.7059932947158813, + "learning_rate": 4.9547912002141895e-05, + "loss": 4.9135, + "step": 10195 + }, + { + "epoch": 0.060638500333047865, + "grad_norm": 1.6715087890625, + "learning_rate": 4.954782356927425e-05, + "loss": 5.0662, + "step": 10196 + }, + { + "epoch": 0.06064444761632886, + "grad_norm": 1.966430902481079, + "learning_rate": 4.9547735127837223e-05, + "loss": 4.7995, + "step": 10197 + }, + { + "epoch": 0.060650394899609855, + "grad_norm": 1.7138090133666992, + "learning_rate": 4.954764667783083e-05, + "loss": 4.9745, + "step": 10198 + }, + { + "epoch": 0.06065634218289086, + "grad_norm": 1.832889199256897, + "learning_rate": 4.95475582192551e-05, + "loss": 4.9795, + "step": 10199 + }, + { + "epoch": 0.06066228946617185, + "grad_norm": 1.883525013923645, + "learning_rate": 4.954746975211008e-05, + "loss": 4.8523, + "step": 10200 + }, + { + "epoch": 0.06066823674945285, + "grad_norm": 1.747101068496704, + "learning_rate": 4.954738127639579e-05, + "loss": 4.9402, + "step": 10201 + }, + { + "epoch": 0.06067418403273385, + "grad_norm": 1.583900809288025, + "learning_rate": 4.9547292792112256e-05, + "loss": 5.176, + "step": 10202 + }, + { + "epoch": 0.060680131316014845, + "grad_norm": 1.6390752792358398, + "learning_rate": 4.954720429925953e-05, + "loss": 5.1014, + "step": 10203 + }, + { + "epoch": 0.06068607859929584, + "grad_norm": 1.4499305486679077, + "learning_rate": 4.954711579783762e-05, + "loss": 5.1473, + "step": 10204 + }, + { + "epoch": 0.06069202588257684, + "grad_norm": 1.2734607458114624, + "learning_rate": 4.954702728784656e-05, + "loss": 5.0919, + "step": 10205 + }, + { + "epoch": 0.06069797316585784, + "grad_norm": 1.4447498321533203, + "learning_rate": 4.954693876928639e-05, + "loss": 5.0145, + "step": 10206 + }, + { + "epoch": 0.06070392044913883, + "grad_norm": 1.7052301168441772, + "learning_rate": 4.954685024215714e-05, + "loss": 5.109, + "step": 10207 + }, + { + "epoch": 0.060709867732419834, + "grad_norm": 1.6922130584716797, + "learning_rate": 4.9546761706458836e-05, + "loss": 5.2519, + "step": 10208 + }, + { + "epoch": 0.06071581501570083, + "grad_norm": 1.7998334169387817, + "learning_rate": 4.954667316219151e-05, + "loss": 5.2272, + "step": 10209 + }, + { + "epoch": 0.060721762298981824, + "grad_norm": 1.6331555843353271, + "learning_rate": 4.95465846093552e-05, + "loss": 5.1382, + "step": 10210 + }, + { + "epoch": 0.06072770958226282, + "grad_norm": 1.4777888059616089, + "learning_rate": 4.954649604794993e-05, + "loss": 5.0601, + "step": 10211 + }, + { + "epoch": 0.06073365686554382, + "grad_norm": 1.6776998043060303, + "learning_rate": 4.954640747797573e-05, + "loss": 5.0229, + "step": 10212 + }, + { + "epoch": 0.060739604148824816, + "grad_norm": 1.9567780494689941, + "learning_rate": 4.9546318899432634e-05, + "loss": 5.483, + "step": 10213 + }, + { + "epoch": 0.06074555143210581, + "grad_norm": 1.7381116151809692, + "learning_rate": 4.9546230312320664e-05, + "loss": 5.4088, + "step": 10214 + }, + { + "epoch": 0.06075149871538681, + "grad_norm": 2.290041446685791, + "learning_rate": 4.954614171663986e-05, + "loss": 5.0879, + "step": 10215 + }, + { + "epoch": 0.06075744599866781, + "grad_norm": 1.680309534072876, + "learning_rate": 4.9546053112390255e-05, + "loss": 5.1931, + "step": 10216 + }, + { + "epoch": 0.0607633932819488, + "grad_norm": 1.997379183769226, + "learning_rate": 4.9545964499571885e-05, + "loss": 5.0834, + "step": 10217 + }, + { + "epoch": 0.060769340565229805, + "grad_norm": 1.9145865440368652, + "learning_rate": 4.954587587818476e-05, + "loss": 5.3478, + "step": 10218 + }, + { + "epoch": 0.0607752878485108, + "grad_norm": 1.565874457359314, + "learning_rate": 4.954578724822893e-05, + "loss": 5.2579, + "step": 10219 + }, + { + "epoch": 0.060781235131791796, + "grad_norm": 1.5997511148452759, + "learning_rate": 4.9545698609704416e-05, + "loss": 5.233, + "step": 10220 + }, + { + "epoch": 0.0607871824150728, + "grad_norm": 2.205021619796753, + "learning_rate": 4.954560996261125e-05, + "loss": 5.227, + "step": 10221 + }, + { + "epoch": 0.06079312969835379, + "grad_norm": 1.5360487699508667, + "learning_rate": 4.954552130694947e-05, + "loss": 5.182, + "step": 10222 + }, + { + "epoch": 0.06079907698163479, + "grad_norm": 1.5571166276931763, + "learning_rate": 4.95454326427191e-05, + "loss": 5.3671, + "step": 10223 + }, + { + "epoch": 0.06080502426491579, + "grad_norm": 1.7289685010910034, + "learning_rate": 4.9545343969920175e-05, + "loss": 5.1256, + "step": 10224 + }, + { + "epoch": 0.060810971548196785, + "grad_norm": 1.7945314645767212, + "learning_rate": 4.954525528855272e-05, + "loss": 5.0339, + "step": 10225 + }, + { + "epoch": 0.06081691883147778, + "grad_norm": 1.7037841081619263, + "learning_rate": 4.954516659861678e-05, + "loss": 4.9308, + "step": 10226 + }, + { + "epoch": 0.060822866114758775, + "grad_norm": 1.8096303939819336, + "learning_rate": 4.954507790011237e-05, + "loss": 5.1173, + "step": 10227 + }, + { + "epoch": 0.06082881339803978, + "grad_norm": 1.7563896179199219, + "learning_rate": 4.954498919303952e-05, + "loss": 5.1713, + "step": 10228 + }, + { + "epoch": 0.06083476068132077, + "grad_norm": 1.8820421695709229, + "learning_rate": 4.954490047739827e-05, + "loss": 5.2372, + "step": 10229 + }, + { + "epoch": 0.06084070796460177, + "grad_norm": 2.7050085067749023, + "learning_rate": 4.954481175318865e-05, + "loss": 5.6108, + "step": 10230 + }, + { + "epoch": 0.06084665524788277, + "grad_norm": 1.6424611806869507, + "learning_rate": 4.954472302041069e-05, + "loss": 5.1423, + "step": 10231 + }, + { + "epoch": 0.060852602531163764, + "grad_norm": 1.7690013647079468, + "learning_rate": 4.954463427906443e-05, + "loss": 5.0232, + "step": 10232 + }, + { + "epoch": 0.06085854981444476, + "grad_norm": 1.8925920724868774, + "learning_rate": 4.9544545529149874e-05, + "loss": 4.8949, + "step": 10233 + }, + { + "epoch": 0.06086449709772576, + "grad_norm": 1.7629793882369995, + "learning_rate": 4.954445677066709e-05, + "loss": 4.8832, + "step": 10234 + }, + { + "epoch": 0.06087044438100676, + "grad_norm": 1.5553311109542847, + "learning_rate": 4.9544368003616084e-05, + "loss": 4.8787, + "step": 10235 + }, + { + "epoch": 0.06087639166428775, + "grad_norm": 1.6236152648925781, + "learning_rate": 4.9544279227996884e-05, + "loss": 4.8583, + "step": 10236 + }, + { + "epoch": 0.060882338947568754, + "grad_norm": 1.7591924667358398, + "learning_rate": 4.954419044380954e-05, + "loss": 5.1468, + "step": 10237 + }, + { + "epoch": 0.06088828623084975, + "grad_norm": 1.8084702491760254, + "learning_rate": 4.954410165105406e-05, + "loss": 5.3178, + "step": 10238 + }, + { + "epoch": 0.060894233514130744, + "grad_norm": 1.6629832983016968, + "learning_rate": 4.9544012849730495e-05, + "loss": 5.2955, + "step": 10239 + }, + { + "epoch": 0.06090018079741174, + "grad_norm": 1.6681956052780151, + "learning_rate": 4.954392403983887e-05, + "loss": 4.9919, + "step": 10240 + }, + { + "epoch": 0.06090612808069274, + "grad_norm": 1.7849150896072388, + "learning_rate": 4.954383522137922e-05, + "loss": 4.9667, + "step": 10241 + }, + { + "epoch": 0.060912075363973736, + "grad_norm": 1.6313222646713257, + "learning_rate": 4.954374639435157e-05, + "loss": 4.9842, + "step": 10242 + }, + { + "epoch": 0.06091802264725473, + "grad_norm": 1.3376604318618774, + "learning_rate": 4.954365755875594e-05, + "loss": 5.2643, + "step": 10243 + }, + { + "epoch": 0.06092396993053573, + "grad_norm": 1.5971726179122925, + "learning_rate": 4.954356871459238e-05, + "loss": 5.2225, + "step": 10244 + }, + { + "epoch": 0.06092991721381673, + "grad_norm": 1.638786792755127, + "learning_rate": 4.954347986186091e-05, + "loss": 5.2855, + "step": 10245 + }, + { + "epoch": 0.06093586449709772, + "grad_norm": 1.6273027658462524, + "learning_rate": 4.954339100056157e-05, + "loss": 5.3825, + "step": 10246 + }, + { + "epoch": 0.060941811780378725, + "grad_norm": 1.4666591882705688, + "learning_rate": 4.954330213069438e-05, + "loss": 5.3148, + "step": 10247 + }, + { + "epoch": 0.06094775906365972, + "grad_norm": 1.447332501411438, + "learning_rate": 4.954321325225938e-05, + "loss": 5.1907, + "step": 10248 + }, + { + "epoch": 0.060953706346940716, + "grad_norm": 1.7162379026412964, + "learning_rate": 4.95431243652566e-05, + "loss": 5.289, + "step": 10249 + }, + { + "epoch": 0.06095965363022172, + "grad_norm": 1.7236372232437134, + "learning_rate": 4.954303546968606e-05, + "loss": 5.1839, + "step": 10250 + }, + { + "epoch": 0.06096560091350271, + "grad_norm": 1.76384437084198, + "learning_rate": 4.954294656554781e-05, + "loss": 5.1665, + "step": 10251 + }, + { + "epoch": 0.06097154819678371, + "grad_norm": 1.595041275024414, + "learning_rate": 4.954285765284187e-05, + "loss": 5.2667, + "step": 10252 + }, + { + "epoch": 0.06097749548006471, + "grad_norm": 1.6735886335372925, + "learning_rate": 4.954276873156827e-05, + "loss": 5.3367, + "step": 10253 + }, + { + "epoch": 0.060983442763345705, + "grad_norm": 1.656801462173462, + "learning_rate": 4.9542679801727044e-05, + "loss": 5.3188, + "step": 10254 + }, + { + "epoch": 0.0609893900466267, + "grad_norm": 1.7149133682250977, + "learning_rate": 4.9542590863318214e-05, + "loss": 5.0618, + "step": 10255 + }, + { + "epoch": 0.060995337329907695, + "grad_norm": 1.715561032295227, + "learning_rate": 4.954250191634183e-05, + "loss": 5.2589, + "step": 10256 + }, + { + "epoch": 0.0610012846131887, + "grad_norm": 1.4005486965179443, + "learning_rate": 4.95424129607979e-05, + "loss": 5.1061, + "step": 10257 + }, + { + "epoch": 0.06100723189646969, + "grad_norm": 1.6608542203903198, + "learning_rate": 4.954232399668648e-05, + "loss": 5.3779, + "step": 10258 + }, + { + "epoch": 0.06101317917975069, + "grad_norm": 1.5471054315567017, + "learning_rate": 4.954223502400758e-05, + "loss": 5.448, + "step": 10259 + }, + { + "epoch": 0.06101912646303169, + "grad_norm": 1.6794294118881226, + "learning_rate": 4.9542146042761246e-05, + "loss": 5.1452, + "step": 10260 + }, + { + "epoch": 0.061025073746312684, + "grad_norm": 1.5416966676712036, + "learning_rate": 4.95420570529475e-05, + "loss": 5.2192, + "step": 10261 + }, + { + "epoch": 0.06103102102959368, + "grad_norm": 1.6667221784591675, + "learning_rate": 4.954196805456637e-05, + "loss": 5.3682, + "step": 10262 + }, + { + "epoch": 0.06103696831287468, + "grad_norm": 1.3199689388275146, + "learning_rate": 4.95418790476179e-05, + "loss": 5.1038, + "step": 10263 + }, + { + "epoch": 0.06104291559615568, + "grad_norm": 1.5326366424560547, + "learning_rate": 4.954179003210211e-05, + "loss": 5.3002, + "step": 10264 + }, + { + "epoch": 0.06104886287943667, + "grad_norm": 1.529453992843628, + "learning_rate": 4.954170100801904e-05, + "loss": 5.4515, + "step": 10265 + }, + { + "epoch": 0.061054810162717674, + "grad_norm": 1.719894528388977, + "learning_rate": 4.954161197536871e-05, + "loss": 5.4161, + "step": 10266 + }, + { + "epoch": 0.06106075744599867, + "grad_norm": 1.4632771015167236, + "learning_rate": 4.954152293415115e-05, + "loss": 5.4669, + "step": 10267 + }, + { + "epoch": 0.061066704729279664, + "grad_norm": 1.7698414325714111, + "learning_rate": 4.954143388436641e-05, + "loss": 5.4045, + "step": 10268 + }, + { + "epoch": 0.06107265201256066, + "grad_norm": 1.6944139003753662, + "learning_rate": 4.95413448260145e-05, + "loss": 5.3637, + "step": 10269 + }, + { + "epoch": 0.06107859929584166, + "grad_norm": 1.6832401752471924, + "learning_rate": 4.954125575909547e-05, + "loss": 5.2123, + "step": 10270 + }, + { + "epoch": 0.061084546579122656, + "grad_norm": 1.6782628297805786, + "learning_rate": 4.954116668360933e-05, + "loss": 5.3007, + "step": 10271 + }, + { + "epoch": 0.06109049386240365, + "grad_norm": 1.598941683769226, + "learning_rate": 4.954107759955613e-05, + "loss": 5.1452, + "step": 10272 + }, + { + "epoch": 0.06109644114568465, + "grad_norm": 1.4137005805969238, + "learning_rate": 4.954098850693589e-05, + "loss": 5.1348, + "step": 10273 + }, + { + "epoch": 0.06110238842896565, + "grad_norm": 1.388108730316162, + "learning_rate": 4.9540899405748646e-05, + "loss": 5.4108, + "step": 10274 + }, + { + "epoch": 0.06110833571224664, + "grad_norm": 1.5997217893600464, + "learning_rate": 4.954081029599443e-05, + "loss": 5.3727, + "step": 10275 + }, + { + "epoch": 0.061114282995527645, + "grad_norm": 1.5805003643035889, + "learning_rate": 4.954072117767327e-05, + "loss": 5.4151, + "step": 10276 + }, + { + "epoch": 0.06112023027880864, + "grad_norm": 1.402063250541687, + "learning_rate": 4.9540632050785194e-05, + "loss": 5.287, + "step": 10277 + }, + { + "epoch": 0.061126177562089636, + "grad_norm": 1.6100205183029175, + "learning_rate": 4.9540542915330236e-05, + "loss": 5.2047, + "step": 10278 + }, + { + "epoch": 0.06113212484537064, + "grad_norm": 1.6199030876159668, + "learning_rate": 4.9540453771308435e-05, + "loss": 5.2141, + "step": 10279 + }, + { + "epoch": 0.06113807212865163, + "grad_norm": 1.485408067703247, + "learning_rate": 4.95403646187198e-05, + "loss": 5.1893, + "step": 10280 + }, + { + "epoch": 0.06114401941193263, + "grad_norm": 1.5842605829238892, + "learning_rate": 4.9540275457564395e-05, + "loss": 5.1383, + "step": 10281 + }, + { + "epoch": 0.06114996669521363, + "grad_norm": 1.5824682712554932, + "learning_rate": 4.9540186287842225e-05, + "loss": 5.1754, + "step": 10282 + }, + { + "epoch": 0.061155913978494625, + "grad_norm": 1.7714753150939941, + "learning_rate": 4.954009710955333e-05, + "loss": 5.2951, + "step": 10283 + }, + { + "epoch": 0.06116186126177562, + "grad_norm": 1.6528159379959106, + "learning_rate": 4.954000792269774e-05, + "loss": 5.2391, + "step": 10284 + }, + { + "epoch": 0.061167808545056615, + "grad_norm": 1.54135262966156, + "learning_rate": 4.953991872727549e-05, + "loss": 5.3849, + "step": 10285 + }, + { + "epoch": 0.06117375582833762, + "grad_norm": 1.4225090742111206, + "learning_rate": 4.953982952328661e-05, + "loss": 5.2211, + "step": 10286 + }, + { + "epoch": 0.06117970311161861, + "grad_norm": 1.7174444198608398, + "learning_rate": 4.953974031073112e-05, + "loss": 5.2873, + "step": 10287 + }, + { + "epoch": 0.06118565039489961, + "grad_norm": 1.4754962921142578, + "learning_rate": 4.953965108960907e-05, + "loss": 5.3137, + "step": 10288 + }, + { + "epoch": 0.06119159767818061, + "grad_norm": 1.6911029815673828, + "learning_rate": 4.9539561859920475e-05, + "loss": 5.1914, + "step": 10289 + }, + { + "epoch": 0.061197544961461604, + "grad_norm": 1.5569958686828613, + "learning_rate": 4.953947262166537e-05, + "loss": 5.2141, + "step": 10290 + }, + { + "epoch": 0.0612034922447426, + "grad_norm": 1.5939570665359497, + "learning_rate": 4.9539383374843794e-05, + "loss": 5.2059, + "step": 10291 + }, + { + "epoch": 0.0612094395280236, + "grad_norm": 1.7220442295074463, + "learning_rate": 4.953929411945577e-05, + "loss": 5.3399, + "step": 10292 + }, + { + "epoch": 0.061215386811304597, + "grad_norm": 1.7158905267715454, + "learning_rate": 4.953920485550134e-05, + "loss": 5.3392, + "step": 10293 + }, + { + "epoch": 0.06122133409458559, + "grad_norm": 1.5761021375656128, + "learning_rate": 4.9539115582980525e-05, + "loss": 5.1523, + "step": 10294 + }, + { + "epoch": 0.061227281377866594, + "grad_norm": 1.7746198177337646, + "learning_rate": 4.953902630189335e-05, + "loss": 5.1577, + "step": 10295 + }, + { + "epoch": 0.06123322866114759, + "grad_norm": 1.9633466005325317, + "learning_rate": 4.953893701223986e-05, + "loss": 5.448, + "step": 10296 + }, + { + "epoch": 0.061239175944428584, + "grad_norm": 1.7086774110794067, + "learning_rate": 4.953884771402007e-05, + "loss": 5.2624, + "step": 10297 + }, + { + "epoch": 0.06124512322770958, + "grad_norm": 1.5247907638549805, + "learning_rate": 4.953875840723403e-05, + "loss": 5.1644, + "step": 10298 + }, + { + "epoch": 0.06125107051099058, + "grad_norm": 1.7014293670654297, + "learning_rate": 4.953866909188177e-05, + "loss": 5.2118, + "step": 10299 + }, + { + "epoch": 0.061257017794271576, + "grad_norm": 1.390368103981018, + "learning_rate": 4.9538579767963305e-05, + "loss": 5.3159, + "step": 10300 + }, + { + "epoch": 0.06126296507755257, + "grad_norm": 1.4748090505599976, + "learning_rate": 4.953849043547868e-05, + "loss": 5.5283, + "step": 10301 + }, + { + "epoch": 0.06126891236083357, + "grad_norm": 1.6433857679367065, + "learning_rate": 4.953840109442792e-05, + "loss": 5.3388, + "step": 10302 + }, + { + "epoch": 0.06127485964411457, + "grad_norm": 1.6636543273925781, + "learning_rate": 4.9538311744811056e-05, + "loss": 5.4523, + "step": 10303 + }, + { + "epoch": 0.06128080692739556, + "grad_norm": 1.6074668169021606, + "learning_rate": 4.953822238662812e-05, + "loss": 5.2963, + "step": 10304 + }, + { + "epoch": 0.061286754210676565, + "grad_norm": 1.8746674060821533, + "learning_rate": 4.9538133019879155e-05, + "loss": 5.359, + "step": 10305 + }, + { + "epoch": 0.06129270149395756, + "grad_norm": 1.5438963174819946, + "learning_rate": 4.953804364456417e-05, + "loss": 5.2039, + "step": 10306 + }, + { + "epoch": 0.061298648777238555, + "grad_norm": 1.5594170093536377, + "learning_rate": 4.9537954260683205e-05, + "loss": 5.3003, + "step": 10307 + }, + { + "epoch": 0.06130459606051956, + "grad_norm": 1.3331657648086548, + "learning_rate": 4.95378648682363e-05, + "loss": 5.3051, + "step": 10308 + }, + { + "epoch": 0.06131054334380055, + "grad_norm": 1.5514707565307617, + "learning_rate": 4.953777546722348e-05, + "loss": 5.3344, + "step": 10309 + }, + { + "epoch": 0.06131649062708155, + "grad_norm": 1.6396936178207397, + "learning_rate": 4.953768605764477e-05, + "loss": 5.1244, + "step": 10310 + }, + { + "epoch": 0.06132243791036255, + "grad_norm": 1.576407551765442, + "learning_rate": 4.953759663950022e-05, + "loss": 5.1908, + "step": 10311 + }, + { + "epoch": 0.061328385193643545, + "grad_norm": 1.5868182182312012, + "learning_rate": 4.953750721278984e-05, + "loss": 5.2538, + "step": 10312 + }, + { + "epoch": 0.06133433247692454, + "grad_norm": 1.7734450101852417, + "learning_rate": 4.9537417777513664e-05, + "loss": 5.3727, + "step": 10313 + }, + { + "epoch": 0.061340279760205535, + "grad_norm": 1.5105754137039185, + "learning_rate": 4.953732833367174e-05, + "loss": 5.3547, + "step": 10314 + }, + { + "epoch": 0.06134622704348654, + "grad_norm": 1.5607833862304688, + "learning_rate": 4.953723888126408e-05, + "loss": 5.2265, + "step": 10315 + }, + { + "epoch": 0.06135217432676753, + "grad_norm": 1.2882065773010254, + "learning_rate": 4.9537149420290726e-05, + "loss": 4.9719, + "step": 10316 + }, + { + "epoch": 0.06135812161004853, + "grad_norm": 1.4349958896636963, + "learning_rate": 4.953705995075171e-05, + "loss": 5.2773, + "step": 10317 + }, + { + "epoch": 0.06136406889332953, + "grad_norm": 2.3595380783081055, + "learning_rate": 4.953697047264706e-05, + "loss": 5.7403, + "step": 10318 + }, + { + "epoch": 0.061370016176610524, + "grad_norm": 1.6126785278320312, + "learning_rate": 4.9536880985976805e-05, + "loss": 5.5316, + "step": 10319 + }, + { + "epoch": 0.06137596345989152, + "grad_norm": 1.7738999128341675, + "learning_rate": 4.953679149074098e-05, + "loss": 5.602, + "step": 10320 + }, + { + "epoch": 0.06138191074317252, + "grad_norm": 1.9263441562652588, + "learning_rate": 4.953670198693961e-05, + "loss": 5.0669, + "step": 10321 + }, + { + "epoch": 0.061387858026453516, + "grad_norm": 1.6290051937103271, + "learning_rate": 4.953661247457273e-05, + "loss": 5.2163, + "step": 10322 + }, + { + "epoch": 0.06139380530973451, + "grad_norm": 1.6354936361312866, + "learning_rate": 4.9536522953640374e-05, + "loss": 5.1678, + "step": 10323 + }, + { + "epoch": 0.061399752593015514, + "grad_norm": 1.7600759267807007, + "learning_rate": 4.953643342414257e-05, + "loss": 5.946, + "step": 10324 + }, + { + "epoch": 0.06140569987629651, + "grad_norm": 2.0515828132629395, + "learning_rate": 4.9536343886079357e-05, + "loss": 5.463, + "step": 10325 + }, + { + "epoch": 0.061411647159577504, + "grad_norm": 1.9990586042404175, + "learning_rate": 4.9536254339450754e-05, + "loss": 5.3084, + "step": 10326 + }, + { + "epoch": 0.0614175944428585, + "grad_norm": 1.7596598863601685, + "learning_rate": 4.95361647842568e-05, + "loss": 5.9268, + "step": 10327 + }, + { + "epoch": 0.0614235417261395, + "grad_norm": 1.8702850341796875, + "learning_rate": 4.953607522049752e-05, + "loss": 5.4303, + "step": 10328 + }, + { + "epoch": 0.061429489009420496, + "grad_norm": 1.9598991870880127, + "learning_rate": 4.953598564817296e-05, + "loss": 5.1813, + "step": 10329 + }, + { + "epoch": 0.06143543629270149, + "grad_norm": 1.5180566310882568, + "learning_rate": 4.953589606728314e-05, + "loss": 5.6051, + "step": 10330 + }, + { + "epoch": 0.06144138357598249, + "grad_norm": 1.4654324054718018, + "learning_rate": 4.953580647782808e-05, + "loss": 5.7188, + "step": 10331 + }, + { + "epoch": 0.06144733085926349, + "grad_norm": 1.351413607597351, + "learning_rate": 4.9535716879807835e-05, + "loss": 5.6928, + "step": 10332 + }, + { + "epoch": 0.06145327814254448, + "grad_norm": 1.4495320320129395, + "learning_rate": 4.953562727322242e-05, + "loss": 5.5576, + "step": 10333 + }, + { + "epoch": 0.061459225425825485, + "grad_norm": 1.4851731061935425, + "learning_rate": 4.953553765807187e-05, + "loss": 5.31, + "step": 10334 + }, + { + "epoch": 0.06146517270910648, + "grad_norm": 1.9790018796920776, + "learning_rate": 4.953544803435622e-05, + "loss": 5.5375, + "step": 10335 + }, + { + "epoch": 0.061471119992387475, + "grad_norm": 1.6931076049804688, + "learning_rate": 4.953535840207549e-05, + "loss": 5.6863, + "step": 10336 + }, + { + "epoch": 0.06147706727566848, + "grad_norm": 1.7479010820388794, + "learning_rate": 4.9535268761229735e-05, + "loss": 5.571, + "step": 10337 + }, + { + "epoch": 0.06148301455894947, + "grad_norm": 2.0722434520721436, + "learning_rate": 4.953517911181896e-05, + "loss": 5.2462, + "step": 10338 + }, + { + "epoch": 0.06148896184223047, + "grad_norm": 2.125288486480713, + "learning_rate": 4.953508945384322e-05, + "loss": 5.6343, + "step": 10339 + }, + { + "epoch": 0.06149490912551147, + "grad_norm": 2.0187058448791504, + "learning_rate": 4.953499978730252e-05, + "loss": 5.8642, + "step": 10340 + }, + { + "epoch": 0.061500856408792465, + "grad_norm": 1.6849068403244019, + "learning_rate": 4.9534910112196906e-05, + "loss": 5.5534, + "step": 10341 + }, + { + "epoch": 0.06150680369207346, + "grad_norm": 2.008009433746338, + "learning_rate": 4.953482042852641e-05, + "loss": 5.464, + "step": 10342 + }, + { + "epoch": 0.061512750975354455, + "grad_norm": 1.7537699937820435, + "learning_rate": 4.953473073629107e-05, + "loss": 5.9052, + "step": 10343 + }, + { + "epoch": 0.06151869825863546, + "grad_norm": 1.5746090412139893, + "learning_rate": 4.95346410354909e-05, + "loss": 5.6898, + "step": 10344 + }, + { + "epoch": 0.06152464554191645, + "grad_norm": 2.027543783187866, + "learning_rate": 4.9534551326125944e-05, + "loss": 6.0481, + "step": 10345 + }, + { + "epoch": 0.06153059282519745, + "grad_norm": 1.6113003492355347, + "learning_rate": 4.9534461608196224e-05, + "loss": 5.4792, + "step": 10346 + }, + { + "epoch": 0.06153654010847845, + "grad_norm": 1.5709928274154663, + "learning_rate": 4.953437188170178e-05, + "loss": 5.7601, + "step": 10347 + }, + { + "epoch": 0.061542487391759444, + "grad_norm": 1.7116700410842896, + "learning_rate": 4.953428214664265e-05, + "loss": 5.7284, + "step": 10348 + }, + { + "epoch": 0.06154843467504044, + "grad_norm": 2.262103796005249, + "learning_rate": 4.953419240301884e-05, + "loss": 5.7247, + "step": 10349 + }, + { + "epoch": 0.06155438195832144, + "grad_norm": 1.8536508083343506, + "learning_rate": 4.9534102650830406e-05, + "loss": 5.7509, + "step": 10350 + }, + { + "epoch": 0.061560329241602436, + "grad_norm": 2.1372785568237305, + "learning_rate": 4.953401289007737e-05, + "loss": 5.8436, + "step": 10351 + }, + { + "epoch": 0.06156627652488343, + "grad_norm": 2.5555527210235596, + "learning_rate": 4.953392312075976e-05, + "loss": 5.6481, + "step": 10352 + }, + { + "epoch": 0.061572223808164434, + "grad_norm": 2.607111692428589, + "learning_rate": 4.953383334287761e-05, + "loss": 5.4822, + "step": 10353 + }, + { + "epoch": 0.06157817109144543, + "grad_norm": 2.728994369506836, + "learning_rate": 4.953374355643095e-05, + "loss": 5.4327, + "step": 10354 + }, + { + "epoch": 0.061584118374726424, + "grad_norm": 2.3375606536865234, + "learning_rate": 4.953365376141983e-05, + "loss": 5.537, + "step": 10355 + }, + { + "epoch": 0.06159006565800742, + "grad_norm": 2.4509146213531494, + "learning_rate": 4.953356395784425e-05, + "loss": 5.5717, + "step": 10356 + }, + { + "epoch": 0.06159601294128842, + "grad_norm": 2.412198781967163, + "learning_rate": 4.953347414570426e-05, + "loss": 5.5216, + "step": 10357 + }, + { + "epoch": 0.061601960224569416, + "grad_norm": 1.7105822563171387, + "learning_rate": 4.9533384324999886e-05, + "loss": 5.6661, + "step": 10358 + }, + { + "epoch": 0.06160790750785041, + "grad_norm": 2.2394793033599854, + "learning_rate": 4.953329449573116e-05, + "loss": 5.2062, + "step": 10359 + }, + { + "epoch": 0.06161385479113141, + "grad_norm": 2.1791203022003174, + "learning_rate": 4.9533204657898127e-05, + "loss": 5.1961, + "step": 10360 + }, + { + "epoch": 0.06161980207441241, + "grad_norm": 2.0430495738983154, + "learning_rate": 4.953311481150079e-05, + "loss": 5.1492, + "step": 10361 + }, + { + "epoch": 0.0616257493576934, + "grad_norm": 2.157975435256958, + "learning_rate": 4.9533024956539204e-05, + "loss": 4.9354, + "step": 10362 + }, + { + "epoch": 0.061631696640974405, + "grad_norm": 2.101484537124634, + "learning_rate": 4.953293509301339e-05, + "loss": 4.9212, + "step": 10363 + }, + { + "epoch": 0.0616376439242554, + "grad_norm": 1.740793228149414, + "learning_rate": 4.953284522092338e-05, + "loss": 5.1234, + "step": 10364 + }, + { + "epoch": 0.061643591207536395, + "grad_norm": 1.9694514274597168, + "learning_rate": 4.953275534026921e-05, + "loss": 5.3688, + "step": 10365 + }, + { + "epoch": 0.0616495384908174, + "grad_norm": 2.0250589847564697, + "learning_rate": 4.953266545105091e-05, + "loss": 4.7194, + "step": 10366 + }, + { + "epoch": 0.06165548577409839, + "grad_norm": 2.016284942626953, + "learning_rate": 4.95325755532685e-05, + "loss": 4.7397, + "step": 10367 + }, + { + "epoch": 0.06166143305737939, + "grad_norm": 2.3073251247406006, + "learning_rate": 4.9532485646922036e-05, + "loss": 4.59, + "step": 10368 + }, + { + "epoch": 0.06166738034066039, + "grad_norm": 2.265873670578003, + "learning_rate": 4.9532395732011524e-05, + "loss": 4.7713, + "step": 10369 + }, + { + "epoch": 0.061673327623941385, + "grad_norm": 1.8176212310791016, + "learning_rate": 4.953230580853701e-05, + "loss": 5.2288, + "step": 10370 + }, + { + "epoch": 0.06167927490722238, + "grad_norm": 2.3636794090270996, + "learning_rate": 4.953221587649852e-05, + "loss": 5.1683, + "step": 10371 + }, + { + "epoch": 0.061685222190503375, + "grad_norm": 1.8074215650558472, + "learning_rate": 4.953212593589609e-05, + "loss": 6.037, + "step": 10372 + }, + { + "epoch": 0.06169116947378438, + "grad_norm": 2.1368768215179443, + "learning_rate": 4.953203598672975e-05, + "loss": 5.8481, + "step": 10373 + }, + { + "epoch": 0.06169711675706537, + "grad_norm": 2.924474000930786, + "learning_rate": 4.953194602899952e-05, + "loss": 4.327, + "step": 10374 + }, + { + "epoch": 0.06170306404034637, + "grad_norm": 2.412336826324463, + "learning_rate": 4.953185606270545e-05, + "loss": 4.3885, + "step": 10375 + }, + { + "epoch": 0.06170901132362737, + "grad_norm": 1.9676904678344727, + "learning_rate": 4.953176608784756e-05, + "loss": 5.4581, + "step": 10376 + }, + { + "epoch": 0.061714958606908364, + "grad_norm": 2.1357827186584473, + "learning_rate": 4.953167610442588e-05, + "loss": 6.1762, + "step": 10377 + }, + { + "epoch": 0.06172090589018936, + "grad_norm": 1.912763237953186, + "learning_rate": 4.953158611244045e-05, + "loss": 6.3403, + "step": 10378 + }, + { + "epoch": 0.06172685317347036, + "grad_norm": 2.0528855323791504, + "learning_rate": 4.95314961118913e-05, + "loss": 6.1921, + "step": 10379 + }, + { + "epoch": 0.061732800456751356, + "grad_norm": 2.1858723163604736, + "learning_rate": 4.953140610277846e-05, + "loss": 5.1944, + "step": 10380 + }, + { + "epoch": 0.06173874774003235, + "grad_norm": 2.04040265083313, + "learning_rate": 4.9531316085101944e-05, + "loss": 5.1866, + "step": 10381 + }, + { + "epoch": 0.06174469502331335, + "grad_norm": 2.216113567352295, + "learning_rate": 4.953122605886181e-05, + "loss": 5.5625, + "step": 10382 + }, + { + "epoch": 0.06175064230659435, + "grad_norm": 1.7107234001159668, + "learning_rate": 4.9531136024058076e-05, + "loss": 5.917, + "step": 10383 + }, + { + "epoch": 0.061756589589875344, + "grad_norm": 1.983104944229126, + "learning_rate": 4.9531045980690776e-05, + "loss": 6.0113, + "step": 10384 + }, + { + "epoch": 0.06176253687315634, + "grad_norm": 2.0186147689819336, + "learning_rate": 4.9530955928759945e-05, + "loss": 6.5227, + "step": 10385 + }, + { + "epoch": 0.06176848415643734, + "grad_norm": 1.8337477445602417, + "learning_rate": 4.9530865868265605e-05, + "loss": 5.9586, + "step": 10386 + }, + { + "epoch": 0.061774431439718336, + "grad_norm": 1.6523345708847046, + "learning_rate": 4.9530775799207795e-05, + "loss": 5.7073, + "step": 10387 + }, + { + "epoch": 0.06178037872299933, + "grad_norm": 1.617838740348816, + "learning_rate": 4.953068572158654e-05, + "loss": 5.3771, + "step": 10388 + }, + { + "epoch": 0.06178632600628033, + "grad_norm": 1.7327697277069092, + "learning_rate": 4.953059563540189e-05, + "loss": 5.3021, + "step": 10389 + }, + { + "epoch": 0.06179227328956133, + "grad_norm": 2.726762294769287, + "learning_rate": 4.9530505540653856e-05, + "loss": 5.2568, + "step": 10390 + }, + { + "epoch": 0.06179822057284232, + "grad_norm": 2.540090560913086, + "learning_rate": 4.953041543734247e-05, + "loss": 5.114, + "step": 10391 + }, + { + "epoch": 0.061804167856123325, + "grad_norm": 2.26487135887146, + "learning_rate": 4.953032532546777e-05, + "loss": 5.2552, + "step": 10392 + }, + { + "epoch": 0.06181011513940432, + "grad_norm": 1.9986075162887573, + "learning_rate": 4.95302352050298e-05, + "loss": 5.3555, + "step": 10393 + }, + { + "epoch": 0.061816062422685315, + "grad_norm": 2.2121987342834473, + "learning_rate": 4.9530145076028564e-05, + "loss": 5.665, + "step": 10394 + }, + { + "epoch": 0.06182200970596632, + "grad_norm": 1.892927646636963, + "learning_rate": 4.953005493846411e-05, + "loss": 5.2536, + "step": 10395 + }, + { + "epoch": 0.06182795698924731, + "grad_norm": 2.1083126068115234, + "learning_rate": 4.952996479233647e-05, + "loss": 6.1748, + "step": 10396 + }, + { + "epoch": 0.06183390427252831, + "grad_norm": 2.2235448360443115, + "learning_rate": 4.9529874637645675e-05, + "loss": 6.0676, + "step": 10397 + }, + { + "epoch": 0.06183985155580931, + "grad_norm": 2.0888702869415283, + "learning_rate": 4.952978447439175e-05, + "loss": 5.2515, + "step": 10398 + }, + { + "epoch": 0.061845798839090305, + "grad_norm": 1.826622724533081, + "learning_rate": 4.9529694302574736e-05, + "loss": 5.6849, + "step": 10399 + }, + { + "epoch": 0.0618517461223713, + "grad_norm": 1.9772933721542358, + "learning_rate": 4.952960412219465e-05, + "loss": 5.7702, + "step": 10400 + }, + { + "epoch": 0.061857693405652295, + "grad_norm": 2.2230029106140137, + "learning_rate": 4.952951393325154e-05, + "loss": 5.5747, + "step": 10401 + }, + { + "epoch": 0.0618636406889333, + "grad_norm": 1.9372552633285522, + "learning_rate": 4.9529423735745425e-05, + "loss": 5.4728, + "step": 10402 + }, + { + "epoch": 0.06186958797221429, + "grad_norm": 2.2238845825195312, + "learning_rate": 4.952933352967635e-05, + "loss": 5.2462, + "step": 10403 + }, + { + "epoch": 0.06187553525549529, + "grad_norm": 1.7716748714447021, + "learning_rate": 4.952924331504433e-05, + "loss": 5.5651, + "step": 10404 + }, + { + "epoch": 0.06188148253877629, + "grad_norm": 2.2933645248413086, + "learning_rate": 4.9529153091849405e-05, + "loss": 5.8684, + "step": 10405 + }, + { + "epoch": 0.061887429822057284, + "grad_norm": 2.222883939743042, + "learning_rate": 4.9529062860091616e-05, + "loss": 5.8427, + "step": 10406 + }, + { + "epoch": 0.06189337710533828, + "grad_norm": 1.645338773727417, + "learning_rate": 4.9528972619770975e-05, + "loss": 5.7001, + "step": 10407 + }, + { + "epoch": 0.06189932438861928, + "grad_norm": 2.1029653549194336, + "learning_rate": 4.952888237088752e-05, + "loss": 5.728, + "step": 10408 + }, + { + "epoch": 0.061905271671900276, + "grad_norm": 2.2689831256866455, + "learning_rate": 4.952879211344129e-05, + "loss": 5.4678, + "step": 10409 + }, + { + "epoch": 0.06191121895518127, + "grad_norm": 1.908469557762146, + "learning_rate": 4.9528701847432315e-05, + "loss": 6.007, + "step": 10410 + }, + { + "epoch": 0.06191716623846227, + "grad_norm": 1.819381833076477, + "learning_rate": 4.952861157286062e-05, + "loss": 6.2041, + "step": 10411 + }, + { + "epoch": 0.06192311352174327, + "grad_norm": 2.16945743560791, + "learning_rate": 4.952852128972624e-05, + "loss": 5.7757, + "step": 10412 + }, + { + "epoch": 0.061929060805024264, + "grad_norm": 2.1671459674835205, + "learning_rate": 4.952843099802921e-05, + "loss": 5.5212, + "step": 10413 + }, + { + "epoch": 0.061935008088305266, + "grad_norm": 1.730073094367981, + "learning_rate": 4.952834069776956e-05, + "loss": 5.809, + "step": 10414 + }, + { + "epoch": 0.06194095537158626, + "grad_norm": 2.1048457622528076, + "learning_rate": 4.952825038894732e-05, + "loss": 5.7219, + "step": 10415 + }, + { + "epoch": 0.061946902654867256, + "grad_norm": 2.7438642978668213, + "learning_rate": 4.9528160071562516e-05, + "loss": 5.6367, + "step": 10416 + }, + { + "epoch": 0.06195284993814825, + "grad_norm": 2.0103960037231445, + "learning_rate": 4.952806974561518e-05, + "loss": 5.1429, + "step": 10417 + }, + { + "epoch": 0.06195879722142925, + "grad_norm": 2.1754884719848633, + "learning_rate": 4.9527979411105354e-05, + "loss": 5.9337, + "step": 10418 + }, + { + "epoch": 0.06196474450471025, + "grad_norm": 2.553421974182129, + "learning_rate": 4.9527889068033063e-05, + "loss": 5.7076, + "step": 10419 + }, + { + "epoch": 0.06197069178799124, + "grad_norm": 2.0601327419281006, + "learning_rate": 4.952779871639834e-05, + "loss": 5.7855, + "step": 10420 + }, + { + "epoch": 0.061976639071272245, + "grad_norm": 2.0958025455474854, + "learning_rate": 4.952770835620122e-05, + "loss": 5.8621, + "step": 10421 + }, + { + "epoch": 0.06198258635455324, + "grad_norm": 2.2658755779266357, + "learning_rate": 4.952761798744172e-05, + "loss": 5.9306, + "step": 10422 + }, + { + "epoch": 0.061988533637834235, + "grad_norm": 1.933090090751648, + "learning_rate": 4.9527527610119896e-05, + "loss": 5.1557, + "step": 10423 + }, + { + "epoch": 0.06199448092111524, + "grad_norm": 2.5761375427246094, + "learning_rate": 4.952743722423575e-05, + "loss": 5.4438, + "step": 10424 + }, + { + "epoch": 0.06200042820439623, + "grad_norm": 2.0499768257141113, + "learning_rate": 4.9527346829789344e-05, + "loss": 5.4153, + "step": 10425 + }, + { + "epoch": 0.06200637548767723, + "grad_norm": 1.970674991607666, + "learning_rate": 4.952725642678069e-05, + "loss": 5.8678, + "step": 10426 + }, + { + "epoch": 0.06201232277095823, + "grad_norm": 2.4563233852386475, + "learning_rate": 4.9527166015209814e-05, + "loss": 4.926, + "step": 10427 + }, + { + "epoch": 0.062018270054239225, + "grad_norm": 1.8380508422851562, + "learning_rate": 4.9527075595076763e-05, + "loss": 4.9619, + "step": 10428 + }, + { + "epoch": 0.06202421733752022, + "grad_norm": 1.8930846452713013, + "learning_rate": 4.9526985166381565e-05, + "loss": 4.8252, + "step": 10429 + }, + { + "epoch": 0.062030164620801215, + "grad_norm": 2.401026725769043, + "learning_rate": 4.952689472912426e-05, + "loss": 4.5023, + "step": 10430 + }, + { + "epoch": 0.06203611190408222, + "grad_norm": 2.2801949977874756, + "learning_rate": 4.952680428330486e-05, + "loss": 4.6461, + "step": 10431 + }, + { + "epoch": 0.06204205918736321, + "grad_norm": 2.2466189861297607, + "learning_rate": 4.95267138289234e-05, + "loss": 4.5946, + "step": 10432 + }, + { + "epoch": 0.06204800647064421, + "grad_norm": 2.1723902225494385, + "learning_rate": 4.952662336597993e-05, + "loss": 5.6417, + "step": 10433 + }, + { + "epoch": 0.06205395375392521, + "grad_norm": 1.9614545106887817, + "learning_rate": 4.952653289447446e-05, + "loss": 5.0758, + "step": 10434 + }, + { + "epoch": 0.062059901037206204, + "grad_norm": 2.465252637863159, + "learning_rate": 4.9526442414407036e-05, + "loss": 4.6159, + "step": 10435 + }, + { + "epoch": 0.0620658483204872, + "grad_norm": 2.2298080921173096, + "learning_rate": 4.9526351925777684e-05, + "loss": 5.24, + "step": 10436 + }, + { + "epoch": 0.0620717956037682, + "grad_norm": 2.1284472942352295, + "learning_rate": 4.952626142858643e-05, + "loss": 4.5255, + "step": 10437 + }, + { + "epoch": 0.062077742887049196, + "grad_norm": 2.1340067386627197, + "learning_rate": 4.9526170922833314e-05, + "loss": 4.5931, + "step": 10438 + }, + { + "epoch": 0.06208369017033019, + "grad_norm": 2.20354962348938, + "learning_rate": 4.952608040851837e-05, + "loss": 4.7688, + "step": 10439 + }, + { + "epoch": 0.06208963745361119, + "grad_norm": 1.5250015258789062, + "learning_rate": 4.952598988564162e-05, + "loss": 5.3292, + "step": 10440 + }, + { + "epoch": 0.06209558473689219, + "grad_norm": 2.1667168140411377, + "learning_rate": 4.95258993542031e-05, + "loss": 5.6216, + "step": 10441 + }, + { + "epoch": 0.062101532020173184, + "grad_norm": 1.8172663450241089, + "learning_rate": 4.9525808814202846e-05, + "loss": 5.5813, + "step": 10442 + }, + { + "epoch": 0.062107479303454186, + "grad_norm": 1.9832731485366821, + "learning_rate": 4.9525718265640884e-05, + "loss": 5.4444, + "step": 10443 + }, + { + "epoch": 0.06211342658673518, + "grad_norm": 2.051358699798584, + "learning_rate": 4.952562770851724e-05, + "loss": 5.3488, + "step": 10444 + }, + { + "epoch": 0.062119373870016176, + "grad_norm": 2.1487104892730713, + "learning_rate": 4.952553714283196e-05, + "loss": 5.3803, + "step": 10445 + }, + { + "epoch": 0.06212532115329717, + "grad_norm": 2.086853504180908, + "learning_rate": 4.952544656858507e-05, + "loss": 5.4585, + "step": 10446 + }, + { + "epoch": 0.06213126843657817, + "grad_norm": 2.1599764823913574, + "learning_rate": 4.95253559857766e-05, + "loss": 5.3728, + "step": 10447 + }, + { + "epoch": 0.06213721571985917, + "grad_norm": 1.877626657485962, + "learning_rate": 4.9525265394406576e-05, + "loss": 5.433, + "step": 10448 + }, + { + "epoch": 0.06214316300314016, + "grad_norm": 2.022185802459717, + "learning_rate": 4.952517479447504e-05, + "loss": 5.6472, + "step": 10449 + }, + { + "epoch": 0.062149110286421165, + "grad_norm": 2.1667773723602295, + "learning_rate": 4.9525084185982015e-05, + "loss": 5.3174, + "step": 10450 + }, + { + "epoch": 0.06215505756970216, + "grad_norm": 1.6227883100509644, + "learning_rate": 4.952499356892753e-05, + "loss": 5.3747, + "step": 10451 + }, + { + "epoch": 0.062161004852983155, + "grad_norm": 1.935307502746582, + "learning_rate": 4.952490294331164e-05, + "loss": 5.7716, + "step": 10452 + }, + { + "epoch": 0.06216695213626416, + "grad_norm": 2.6584694385528564, + "learning_rate": 4.952481230913435e-05, + "loss": 5.3525, + "step": 10453 + }, + { + "epoch": 0.06217289941954515, + "grad_norm": 2.626344919204712, + "learning_rate": 4.9524721666395705e-05, + "loss": 5.2118, + "step": 10454 + }, + { + "epoch": 0.06217884670282615, + "grad_norm": 2.525580644607544, + "learning_rate": 4.9524631015095735e-05, + "loss": 5.1231, + "step": 10455 + }, + { + "epoch": 0.06218479398610715, + "grad_norm": 2.274801015853882, + "learning_rate": 4.9524540355234464e-05, + "loss": 5.0637, + "step": 10456 + }, + { + "epoch": 0.062190741269388145, + "grad_norm": 1.9937769174575806, + "learning_rate": 4.952444968681193e-05, + "loss": 5.8196, + "step": 10457 + }, + { + "epoch": 0.06219668855266914, + "grad_norm": 2.124290943145752, + "learning_rate": 4.952435900982816e-05, + "loss": 5.5221, + "step": 10458 + }, + { + "epoch": 0.062202635835950135, + "grad_norm": 2.2544684410095215, + "learning_rate": 4.95242683242832e-05, + "loss": 5.6656, + "step": 10459 + }, + { + "epoch": 0.06220858311923114, + "grad_norm": 2.2626397609710693, + "learning_rate": 4.952417763017706e-05, + "loss": 5.5836, + "step": 10460 + }, + { + "epoch": 0.06221453040251213, + "grad_norm": 1.9299595355987549, + "learning_rate": 4.9524086927509796e-05, + "loss": 5.6637, + "step": 10461 + }, + { + "epoch": 0.06222047768579313, + "grad_norm": 1.769463062286377, + "learning_rate": 4.952399621628142e-05, + "loss": 5.4836, + "step": 10462 + }, + { + "epoch": 0.06222642496907413, + "grad_norm": 1.6773936748504639, + "learning_rate": 4.952390549649196e-05, + "loss": 5.2894, + "step": 10463 + }, + { + "epoch": 0.062232372252355124, + "grad_norm": 1.7612723112106323, + "learning_rate": 4.952381476814148e-05, + "loss": 5.5438, + "step": 10464 + }, + { + "epoch": 0.06223831953563612, + "grad_norm": 2.5255069732666016, + "learning_rate": 4.952372403122997e-05, + "loss": 5.7864, + "step": 10465 + }, + { + "epoch": 0.06224426681891712, + "grad_norm": 2.1128363609313965, + "learning_rate": 4.9523633285757486e-05, + "loss": 5.6207, + "step": 10466 + }, + { + "epoch": 0.062250214102198116, + "grad_norm": 1.8612544536590576, + "learning_rate": 4.952354253172407e-05, + "loss": 5.9177, + "step": 10467 + }, + { + "epoch": 0.06225616138547911, + "grad_norm": 2.092707633972168, + "learning_rate": 4.9523451769129715e-05, + "loss": 5.6047, + "step": 10468 + }, + { + "epoch": 0.06226210866876011, + "grad_norm": 2.6695668697357178, + "learning_rate": 4.952336099797449e-05, + "loss": 5.4931, + "step": 10469 + }, + { + "epoch": 0.06226805595204111, + "grad_norm": 2.2714614868164062, + "learning_rate": 4.9523270218258414e-05, + "loss": 5.4481, + "step": 10470 + }, + { + "epoch": 0.0622740032353221, + "grad_norm": 2.035304307937622, + "learning_rate": 4.952317942998151e-05, + "loss": 5.3609, + "step": 10471 + }, + { + "epoch": 0.062279950518603105, + "grad_norm": 2.295647144317627, + "learning_rate": 4.952308863314382e-05, + "loss": 5.5687, + "step": 10472 + }, + { + "epoch": 0.0622858978018841, + "grad_norm": 1.8365178108215332, + "learning_rate": 4.9522997827745375e-05, + "loss": 5.4207, + "step": 10473 + }, + { + "epoch": 0.062291845085165096, + "grad_norm": 1.6130415201187134, + "learning_rate": 4.9522907013786206e-05, + "loss": 5.1894, + "step": 10474 + }, + { + "epoch": 0.06229779236844609, + "grad_norm": 2.01560115814209, + "learning_rate": 4.952281619126634e-05, + "loss": 5.4956, + "step": 10475 + }, + { + "epoch": 0.06230373965172709, + "grad_norm": 2.7854549884796143, + "learning_rate": 4.952272536018582e-05, + "loss": 5.2341, + "step": 10476 + }, + { + "epoch": 0.06230968693500809, + "grad_norm": 2.7532944679260254, + "learning_rate": 4.9522634520544666e-05, + "loss": 5.1863, + "step": 10477 + }, + { + "epoch": 0.06231563421828908, + "grad_norm": 2.193084239959717, + "learning_rate": 4.952254367234291e-05, + "loss": 5.5187, + "step": 10478 + }, + { + "epoch": 0.062321581501570085, + "grad_norm": 2.245664119720459, + "learning_rate": 4.952245281558059e-05, + "loss": 5.1275, + "step": 10479 + }, + { + "epoch": 0.06232752878485108, + "grad_norm": 2.0522654056549072, + "learning_rate": 4.9522361950257734e-05, + "loss": 5.2887, + "step": 10480 + }, + { + "epoch": 0.062333476068132075, + "grad_norm": 2.132280111312866, + "learning_rate": 4.952227107637437e-05, + "loss": 5.8767, + "step": 10481 + }, + { + "epoch": 0.06233942335141308, + "grad_norm": 2.155574083328247, + "learning_rate": 4.952218019393055e-05, + "loss": 5.9499, + "step": 10482 + }, + { + "epoch": 0.06234537063469407, + "grad_norm": 2.3979780673980713, + "learning_rate": 4.952208930292627e-05, + "loss": 5.7622, + "step": 10483 + }, + { + "epoch": 0.06235131791797507, + "grad_norm": 2.444812297821045, + "learning_rate": 4.9521998403361595e-05, + "loss": 5.3332, + "step": 10484 + }, + { + "epoch": 0.06235726520125607, + "grad_norm": 2.369248867034912, + "learning_rate": 4.952190749523654e-05, + "loss": 5.109, + "step": 10485 + }, + { + "epoch": 0.062363212484537064, + "grad_norm": 1.9160844087600708, + "learning_rate": 4.952181657855114e-05, + "loss": 5.1783, + "step": 10486 + }, + { + "epoch": 0.06236915976781806, + "grad_norm": 2.1532788276672363, + "learning_rate": 4.952172565330543e-05, + "loss": 5.913, + "step": 10487 + }, + { + "epoch": 0.062375107051099055, + "grad_norm": 2.132382392883301, + "learning_rate": 4.9521634719499435e-05, + "loss": 5.7748, + "step": 10488 + }, + { + "epoch": 0.06238105433438006, + "grad_norm": 2.22267484664917, + "learning_rate": 4.9521543777133194e-05, + "loss": 5.6464, + "step": 10489 + }, + { + "epoch": 0.06238700161766105, + "grad_norm": 2.0619423389434814, + "learning_rate": 4.952145282620674e-05, + "loss": 5.4881, + "step": 10490 + }, + { + "epoch": 0.06239294890094205, + "grad_norm": 2.9574310779571533, + "learning_rate": 4.952136186672009e-05, + "loss": 5.4401, + "step": 10491 + }, + { + "epoch": 0.06239889618422305, + "grad_norm": 1.7362775802612305, + "learning_rate": 4.952127089867329e-05, + "loss": 6.0755, + "step": 10492 + }, + { + "epoch": 0.062404843467504044, + "grad_norm": 1.8244996070861816, + "learning_rate": 4.952117992206637e-05, + "loss": 6.2588, + "step": 10493 + }, + { + "epoch": 0.06241079075078504, + "grad_norm": 1.8556538820266724, + "learning_rate": 4.952108893689936e-05, + "loss": 6.0827, + "step": 10494 + }, + { + "epoch": 0.06241673803406604, + "grad_norm": 2.2471442222595215, + "learning_rate": 4.9520997943172285e-05, + "loss": 5.98, + "step": 10495 + }, + { + "epoch": 0.062422685317347036, + "grad_norm": 3.0217249393463135, + "learning_rate": 4.9520906940885186e-05, + "loss": 5.5116, + "step": 10496 + }, + { + "epoch": 0.06242863260062803, + "grad_norm": 2.02962064743042, + "learning_rate": 4.9520815930038086e-05, + "loss": 5.9341, + "step": 10497 + }, + { + "epoch": 0.06243457988390903, + "grad_norm": 1.6286019086837769, + "learning_rate": 4.9520724910631034e-05, + "loss": 5.1944, + "step": 10498 + }, + { + "epoch": 0.06244052716719003, + "grad_norm": 1.9963330030441284, + "learning_rate": 4.9520633882664044e-05, + "loss": 6.0584, + "step": 10499 + }, + { + "epoch": 0.06244647445047102, + "grad_norm": 1.884988784790039, + "learning_rate": 4.9520542846137155e-05, + "loss": 6.2744, + "step": 10500 + }, + { + "epoch": 0.062452421733752025, + "grad_norm": 1.9402821063995361, + "learning_rate": 4.95204518010504e-05, + "loss": 5.9201, + "step": 10501 + }, + { + "epoch": 0.06245836901703302, + "grad_norm": 1.9304310083389282, + "learning_rate": 4.9520360747403805e-05, + "loss": 5.7227, + "step": 10502 + }, + { + "epoch": 0.062464316300314016, + "grad_norm": 2.8199663162231445, + "learning_rate": 4.9520269685197405e-05, + "loss": 6.4819, + "step": 10503 + }, + { + "epoch": 0.06247026358359501, + "grad_norm": 1.456852912902832, + "learning_rate": 4.9520178614431236e-05, + "loss": 5.3169, + "step": 10504 + }, + { + "epoch": 0.06247621086687601, + "grad_norm": 2.3753762245178223, + "learning_rate": 4.9520087535105324e-05, + "loss": 5.9817, + "step": 10505 + }, + { + "epoch": 0.06248215815015701, + "grad_norm": 2.329932928085327, + "learning_rate": 4.951999644721971e-05, + "loss": 6.0266, + "step": 10506 + }, + { + "epoch": 0.062488105433438, + "grad_norm": 1.772615671157837, + "learning_rate": 4.951990535077441e-05, + "loss": 5.2548, + "step": 10507 + }, + { + "epoch": 0.062494052716719005, + "grad_norm": 2.1240997314453125, + "learning_rate": 4.951981424576946e-05, + "loss": 5.3991, + "step": 10508 + }, + { + "epoch": 0.0625, + "grad_norm": 1.7283856868743896, + "learning_rate": 4.9519723132204905e-05, + "loss": 5.2065, + "step": 10509 + }, + { + "epoch": 0.062505947283281, + "grad_norm": 2.197404384613037, + "learning_rate": 4.951963201008076e-05, + "loss": 5.7282, + "step": 10510 + }, + { + "epoch": 0.06251189456656199, + "grad_norm": 1.8550727367401123, + "learning_rate": 4.9519540879397075e-05, + "loss": 6.0125, + "step": 10511 + }, + { + "epoch": 0.06251784184984299, + "grad_norm": 1.5998154878616333, + "learning_rate": 4.951944974015387e-05, + "loss": 5.9371, + "step": 10512 + }, + { + "epoch": 0.062523789133124, + "grad_norm": 1.644454836845398, + "learning_rate": 4.951935859235117e-05, + "loss": 5.9315, + "step": 10513 + }, + { + "epoch": 0.06252973641640498, + "grad_norm": 1.9119540452957153, + "learning_rate": 4.951926743598902e-05, + "loss": 5.7104, + "step": 10514 + }, + { + "epoch": 0.06253568369968598, + "grad_norm": 1.8863649368286133, + "learning_rate": 4.951917627106745e-05, + "loss": 5.8639, + "step": 10515 + }, + { + "epoch": 0.06254163098296699, + "grad_norm": 2.1626899242401123, + "learning_rate": 4.951908509758648e-05, + "loss": 5.9727, + "step": 10516 + }, + { + "epoch": 0.06254757826624797, + "grad_norm": 1.9397778511047363, + "learning_rate": 4.9518993915546155e-05, + "loss": 5.9771, + "step": 10517 + }, + { + "epoch": 0.06255352554952898, + "grad_norm": 1.7723463773727417, + "learning_rate": 4.951890272494651e-05, + "loss": 5.8684, + "step": 10518 + }, + { + "epoch": 0.06255947283280998, + "grad_norm": 1.9191977977752686, + "learning_rate": 4.9518811525787565e-05, + "loss": 5.7242, + "step": 10519 + }, + { + "epoch": 0.06256542011609097, + "grad_norm": 1.7599314451217651, + "learning_rate": 4.951872031806935e-05, + "loss": 5.5234, + "step": 10520 + }, + { + "epoch": 0.06257136739937197, + "grad_norm": 1.6560989618301392, + "learning_rate": 4.951862910179191e-05, + "loss": 5.5907, + "step": 10521 + }, + { + "epoch": 0.06257731468265297, + "grad_norm": 1.9756556749343872, + "learning_rate": 4.9518537876955265e-05, + "loss": 6.0013, + "step": 10522 + }, + { + "epoch": 0.06258326196593396, + "grad_norm": 1.9012173414230347, + "learning_rate": 4.9518446643559454e-05, + "loss": 5.8073, + "step": 10523 + }, + { + "epoch": 0.06258920924921496, + "grad_norm": 1.8992196321487427, + "learning_rate": 4.951835540160451e-05, + "loss": 5.8571, + "step": 10524 + }, + { + "epoch": 0.06259515653249595, + "grad_norm": 1.8002395629882812, + "learning_rate": 4.9518264151090455e-05, + "loss": 5.7798, + "step": 10525 + }, + { + "epoch": 0.06260110381577695, + "grad_norm": 1.732063889503479, + "learning_rate": 4.9518172892017335e-05, + "loss": 5.8167, + "step": 10526 + }, + { + "epoch": 0.06260705109905795, + "grad_norm": 1.6961164474487305, + "learning_rate": 4.951808162438517e-05, + "loss": 5.8797, + "step": 10527 + }, + { + "epoch": 0.06261299838233894, + "grad_norm": 1.904102087020874, + "learning_rate": 4.9517990348193996e-05, + "loss": 5.7109, + "step": 10528 + }, + { + "epoch": 0.06261894566561994, + "grad_norm": 1.6908652782440186, + "learning_rate": 4.951789906344384e-05, + "loss": 5.8435, + "step": 10529 + }, + { + "epoch": 0.06262489294890095, + "grad_norm": 1.8550028800964355, + "learning_rate": 4.951780777013475e-05, + "loss": 5.6218, + "step": 10530 + }, + { + "epoch": 0.06263084023218193, + "grad_norm": 1.7106919288635254, + "learning_rate": 4.951771646826674e-05, + "loss": 5.6668, + "step": 10531 + }, + { + "epoch": 0.06263678751546294, + "grad_norm": 1.5522899627685547, + "learning_rate": 4.951762515783984e-05, + "loss": 5.418, + "step": 10532 + }, + { + "epoch": 0.06264273479874394, + "grad_norm": 1.7510137557983398, + "learning_rate": 4.9517533838854104e-05, + "loss": 5.6595, + "step": 10533 + }, + { + "epoch": 0.06264868208202493, + "grad_norm": 2.1222739219665527, + "learning_rate": 4.9517442511309544e-05, + "loss": 6.0008, + "step": 10534 + }, + { + "epoch": 0.06265462936530593, + "grad_norm": 1.977807641029358, + "learning_rate": 4.95173511752062e-05, + "loss": 5.8263, + "step": 10535 + }, + { + "epoch": 0.06266057664858693, + "grad_norm": 1.6423957347869873, + "learning_rate": 4.9517259830544105e-05, + "loss": 6.2078, + "step": 10536 + }, + { + "epoch": 0.06266652393186792, + "grad_norm": 1.9365674257278442, + "learning_rate": 4.9517168477323286e-05, + "loss": 6.0972, + "step": 10537 + }, + { + "epoch": 0.06267247121514892, + "grad_norm": 1.6738137006759644, + "learning_rate": 4.951707711554377e-05, + "loss": 5.7439, + "step": 10538 + }, + { + "epoch": 0.06267841849842992, + "grad_norm": 2.4281718730926514, + "learning_rate": 4.95169857452056e-05, + "loss": 5.4822, + "step": 10539 + }, + { + "epoch": 0.06268436578171091, + "grad_norm": 2.53411602973938, + "learning_rate": 4.951689436630881e-05, + "loss": 5.4883, + "step": 10540 + }, + { + "epoch": 0.06269031306499191, + "grad_norm": 2.116520643234253, + "learning_rate": 4.951680297885342e-05, + "loss": 5.6123, + "step": 10541 + }, + { + "epoch": 0.06269626034827291, + "grad_norm": 1.8546512126922607, + "learning_rate": 4.951671158283946e-05, + "loss": 5.443, + "step": 10542 + }, + { + "epoch": 0.0627022076315539, + "grad_norm": 2.0048365592956543, + "learning_rate": 4.9516620178266975e-05, + "loss": 5.7759, + "step": 10543 + }, + { + "epoch": 0.0627081549148349, + "grad_norm": 1.6800916194915771, + "learning_rate": 4.9516528765136e-05, + "loss": 5.6767, + "step": 10544 + }, + { + "epoch": 0.0627141021981159, + "grad_norm": 1.7444523572921753, + "learning_rate": 4.9516437343446544e-05, + "loss": 5.297, + "step": 10545 + }, + { + "epoch": 0.0627200494813969, + "grad_norm": 1.8653407096862793, + "learning_rate": 4.951634591319866e-05, + "loss": 5.6999, + "step": 10546 + }, + { + "epoch": 0.0627259967646779, + "grad_norm": 1.7988131046295166, + "learning_rate": 4.9516254474392376e-05, + "loss": 5.5244, + "step": 10547 + }, + { + "epoch": 0.0627319440479589, + "grad_norm": 1.7915012836456299, + "learning_rate": 4.951616302702772e-05, + "loss": 5.6766, + "step": 10548 + }, + { + "epoch": 0.06273789133123989, + "grad_norm": 1.8351629972457886, + "learning_rate": 4.951607157110471e-05, + "loss": 5.6332, + "step": 10549 + }, + { + "epoch": 0.06274383861452089, + "grad_norm": 1.6819947957992554, + "learning_rate": 4.951598010662341e-05, + "loss": 5.5773, + "step": 10550 + }, + { + "epoch": 0.06274978589780189, + "grad_norm": 2.2969119548797607, + "learning_rate": 4.951588863358383e-05, + "loss": 5.6847, + "step": 10551 + }, + { + "epoch": 0.06275573318108288, + "grad_norm": 2.346092939376831, + "learning_rate": 4.951579715198601e-05, + "loss": 5.404, + "step": 10552 + }, + { + "epoch": 0.06276168046436388, + "grad_norm": 1.8255709409713745, + "learning_rate": 4.951570566182997e-05, + "loss": 5.9009, + "step": 10553 + }, + { + "epoch": 0.06276762774764487, + "grad_norm": 2.4000492095947266, + "learning_rate": 4.951561416311575e-05, + "loss": 5.4395, + "step": 10554 + }, + { + "epoch": 0.06277357503092587, + "grad_norm": 2.1519010066986084, + "learning_rate": 4.951552265584339e-05, + "loss": 5.6447, + "step": 10555 + }, + { + "epoch": 0.06277952231420687, + "grad_norm": 1.7821810245513916, + "learning_rate": 4.9515431140012915e-05, + "loss": 5.3495, + "step": 10556 + }, + { + "epoch": 0.06278546959748786, + "grad_norm": 1.8359061479568481, + "learning_rate": 4.9515339615624356e-05, + "loss": 5.7258, + "step": 10557 + }, + { + "epoch": 0.06279141688076886, + "grad_norm": 1.899970293045044, + "learning_rate": 4.951524808267774e-05, + "loss": 5.9683, + "step": 10558 + }, + { + "epoch": 0.06279736416404987, + "grad_norm": 1.6407743692398071, + "learning_rate": 4.951515654117311e-05, + "loss": 6.001, + "step": 10559 + }, + { + "epoch": 0.06280331144733085, + "grad_norm": 1.5474567413330078, + "learning_rate": 4.9515064991110485e-05, + "loss": 5.673, + "step": 10560 + }, + { + "epoch": 0.06280925873061186, + "grad_norm": 1.7129321098327637, + "learning_rate": 4.951497343248991e-05, + "loss": 5.7232, + "step": 10561 + }, + { + "epoch": 0.06281520601389286, + "grad_norm": 1.948367953300476, + "learning_rate": 4.95148818653114e-05, + "loss": 5.9378, + "step": 10562 + }, + { + "epoch": 0.06282115329717385, + "grad_norm": 1.788724422454834, + "learning_rate": 4.951479028957501e-05, + "loss": 5.9077, + "step": 10563 + }, + { + "epoch": 0.06282710058045485, + "grad_norm": 1.7036423683166504, + "learning_rate": 4.951469870528076e-05, + "loss": 5.7688, + "step": 10564 + }, + { + "epoch": 0.06283304786373585, + "grad_norm": 1.6055458784103394, + "learning_rate": 4.9514607112428676e-05, + "loss": 5.7234, + "step": 10565 + }, + { + "epoch": 0.06283899514701684, + "grad_norm": 1.9353829622268677, + "learning_rate": 4.95145155110188e-05, + "loss": 6.1046, + "step": 10566 + }, + { + "epoch": 0.06284494243029784, + "grad_norm": 1.6070129871368408, + "learning_rate": 4.9514423901051157e-05, + "loss": 5.7379, + "step": 10567 + }, + { + "epoch": 0.06285088971357884, + "grad_norm": 1.447828769683838, + "learning_rate": 4.951433228252579e-05, + "loss": 5.2944, + "step": 10568 + }, + { + "epoch": 0.06285683699685983, + "grad_norm": 2.5256540775299072, + "learning_rate": 4.951424065544271e-05, + "loss": 5.1358, + "step": 10569 + }, + { + "epoch": 0.06286278428014083, + "grad_norm": 2.29848051071167, + "learning_rate": 4.951414901980197e-05, + "loss": 5.1967, + "step": 10570 + }, + { + "epoch": 0.06286873156342183, + "grad_norm": 1.9477180242538452, + "learning_rate": 4.951405737560359e-05, + "loss": 5.7509, + "step": 10571 + }, + { + "epoch": 0.06287467884670282, + "grad_norm": 1.9303146600723267, + "learning_rate": 4.951396572284761e-05, + "loss": 5.7052, + "step": 10572 + }, + { + "epoch": 0.06288062612998382, + "grad_norm": 1.5632199048995972, + "learning_rate": 4.951387406153405e-05, + "loss": 5.5001, + "step": 10573 + }, + { + "epoch": 0.06288657341326483, + "grad_norm": 1.6798962354660034, + "learning_rate": 4.951378239166296e-05, + "loss": 5.5537, + "step": 10574 + }, + { + "epoch": 0.06289252069654581, + "grad_norm": 1.7395051717758179, + "learning_rate": 4.9513690713234355e-05, + "loss": 5.736, + "step": 10575 + }, + { + "epoch": 0.06289846797982682, + "grad_norm": 1.726020097732544, + "learning_rate": 4.951359902624828e-05, + "loss": 5.6802, + "step": 10576 + }, + { + "epoch": 0.06290441526310782, + "grad_norm": 1.8063993453979492, + "learning_rate": 4.9513507330704755e-05, + "loss": 5.6077, + "step": 10577 + }, + { + "epoch": 0.0629103625463888, + "grad_norm": 1.6284246444702148, + "learning_rate": 4.951341562660382e-05, + "loss": 5.8327, + "step": 10578 + }, + { + "epoch": 0.06291630982966981, + "grad_norm": 2.635869026184082, + "learning_rate": 4.95133239139455e-05, + "loss": 5.8252, + "step": 10579 + }, + { + "epoch": 0.06292225711295081, + "grad_norm": 2.5127367973327637, + "learning_rate": 4.9513232192729845e-05, + "loss": 5.7431, + "step": 10580 + }, + { + "epoch": 0.0629282043962318, + "grad_norm": 2.0740721225738525, + "learning_rate": 4.951314046295686e-05, + "loss": 5.4582, + "step": 10581 + }, + { + "epoch": 0.0629341516795128, + "grad_norm": 2.32232666015625, + "learning_rate": 4.95130487246266e-05, + "loss": 5.2523, + "step": 10582 + }, + { + "epoch": 0.06294009896279379, + "grad_norm": 2.164407730102539, + "learning_rate": 4.951295697773908e-05, + "loss": 5.6436, + "step": 10583 + }, + { + "epoch": 0.06294604624607479, + "grad_norm": 1.7207856178283691, + "learning_rate": 4.951286522229435e-05, + "loss": 5.5333, + "step": 10584 + }, + { + "epoch": 0.0629519935293558, + "grad_norm": 2.025470733642578, + "learning_rate": 4.951277345829242e-05, + "loss": 5.5041, + "step": 10585 + }, + { + "epoch": 0.06295794081263678, + "grad_norm": 1.9415414333343506, + "learning_rate": 4.951268168573334e-05, + "loss": 5.2148, + "step": 10586 + }, + { + "epoch": 0.06296388809591778, + "grad_norm": 1.9229072332382202, + "learning_rate": 4.9512589904617135e-05, + "loss": 5.1461, + "step": 10587 + }, + { + "epoch": 0.06296983537919879, + "grad_norm": 2.414041757583618, + "learning_rate": 4.951249811494384e-05, + "loss": 5.5023, + "step": 10588 + }, + { + "epoch": 0.06297578266247977, + "grad_norm": 2.49826979637146, + "learning_rate": 4.9512406316713486e-05, + "loss": 5.3566, + "step": 10589 + }, + { + "epoch": 0.06298172994576078, + "grad_norm": 1.7222081422805786, + "learning_rate": 4.951231450992611e-05, + "loss": 5.3128, + "step": 10590 + }, + { + "epoch": 0.06298767722904178, + "grad_norm": 1.7181445360183716, + "learning_rate": 4.9512222694581725e-05, + "loss": 5.4598, + "step": 10591 + }, + { + "epoch": 0.06299362451232277, + "grad_norm": 1.547813892364502, + "learning_rate": 4.9512130870680385e-05, + "loss": 5.3997, + "step": 10592 + }, + { + "epoch": 0.06299957179560377, + "grad_norm": 1.6273536682128906, + "learning_rate": 4.95120390382221e-05, + "loss": 5.1668, + "step": 10593 + }, + { + "epoch": 0.06300551907888477, + "grad_norm": 1.6771745681762695, + "learning_rate": 4.9511947197206934e-05, + "loss": 5.2368, + "step": 10594 + }, + { + "epoch": 0.06301146636216576, + "grad_norm": 2.439664125442505, + "learning_rate": 4.951185534763489e-05, + "loss": 5.2178, + "step": 10595 + }, + { + "epoch": 0.06301741364544676, + "grad_norm": 2.194408655166626, + "learning_rate": 4.951176348950601e-05, + "loss": 5.3593, + "step": 10596 + }, + { + "epoch": 0.06302336092872776, + "grad_norm": 1.8977370262145996, + "learning_rate": 4.9511671622820334e-05, + "loss": 6.3141, + "step": 10597 + }, + { + "epoch": 0.06302930821200875, + "grad_norm": 1.9550800323486328, + "learning_rate": 4.951157974757789e-05, + "loss": 5.8944, + "step": 10598 + }, + { + "epoch": 0.06303525549528975, + "grad_norm": 1.764724612236023, + "learning_rate": 4.9511487863778693e-05, + "loss": 5.5796, + "step": 10599 + }, + { + "epoch": 0.06304120277857075, + "grad_norm": 1.7987425327301025, + "learning_rate": 4.951139597142279e-05, + "loss": 5.5231, + "step": 10600 + }, + { + "epoch": 0.06304715006185174, + "grad_norm": 1.495875358581543, + "learning_rate": 4.951130407051022e-05, + "loss": 5.5019, + "step": 10601 + }, + { + "epoch": 0.06305309734513274, + "grad_norm": 2.7586476802825928, + "learning_rate": 4.9511212161041e-05, + "loss": 5.7043, + "step": 10602 + }, + { + "epoch": 0.06305904462841375, + "grad_norm": 2.1746270656585693, + "learning_rate": 4.951112024301517e-05, + "loss": 5.351, + "step": 10603 + }, + { + "epoch": 0.06306499191169473, + "grad_norm": 1.8681105375289917, + "learning_rate": 4.951102831643277e-05, + "loss": 5.4847, + "step": 10604 + }, + { + "epoch": 0.06307093919497574, + "grad_norm": 1.772286057472229, + "learning_rate": 4.951093638129382e-05, + "loss": 5.767, + "step": 10605 + }, + { + "epoch": 0.06307688647825674, + "grad_norm": 1.847748875617981, + "learning_rate": 4.951084443759835e-05, + "loss": 5.7737, + "step": 10606 + }, + { + "epoch": 0.06308283376153773, + "grad_norm": 1.9219080209732056, + "learning_rate": 4.95107524853464e-05, + "loss": 5.9414, + "step": 10607 + }, + { + "epoch": 0.06308878104481873, + "grad_norm": 1.6497199535369873, + "learning_rate": 4.9510660524538e-05, + "loss": 5.7124, + "step": 10608 + }, + { + "epoch": 0.06309472832809973, + "grad_norm": 1.8772788047790527, + "learning_rate": 4.951056855517318e-05, + "loss": 5.6784, + "step": 10609 + }, + { + "epoch": 0.06310067561138072, + "grad_norm": 2.035104990005493, + "learning_rate": 4.951047657725197e-05, + "loss": 5.5975, + "step": 10610 + }, + { + "epoch": 0.06310662289466172, + "grad_norm": 2.000922918319702, + "learning_rate": 4.9510384590774414e-05, + "loss": 5.2133, + "step": 10611 + }, + { + "epoch": 0.06311257017794271, + "grad_norm": 2.2581655979156494, + "learning_rate": 4.9510292595740536e-05, + "loss": 5.468, + "step": 10612 + }, + { + "epoch": 0.06311851746122371, + "grad_norm": 2.0332419872283936, + "learning_rate": 4.9510200592150365e-05, + "loss": 5.4923, + "step": 10613 + }, + { + "epoch": 0.06312446474450471, + "grad_norm": 1.9499238729476929, + "learning_rate": 4.9510108580003934e-05, + "loss": 5.5535, + "step": 10614 + }, + { + "epoch": 0.0631304120277857, + "grad_norm": 2.017491579055786, + "learning_rate": 4.951001655930128e-05, + "loss": 5.3771, + "step": 10615 + }, + { + "epoch": 0.0631363593110667, + "grad_norm": 2.355508804321289, + "learning_rate": 4.950992453004243e-05, + "loss": 5.0035, + "step": 10616 + }, + { + "epoch": 0.0631423065943477, + "grad_norm": 2.0470683574676514, + "learning_rate": 4.9509832492227426e-05, + "loss": 5.6073, + "step": 10617 + }, + { + "epoch": 0.0631482538776287, + "grad_norm": 1.7955858707427979, + "learning_rate": 4.9509740445856284e-05, + "loss": 5.8097, + "step": 10618 + }, + { + "epoch": 0.0631542011609097, + "grad_norm": 2.0126395225524902, + "learning_rate": 4.9509648390929045e-05, + "loss": 5.5989, + "step": 10619 + }, + { + "epoch": 0.0631601484441907, + "grad_norm": 1.8632375001907349, + "learning_rate": 4.950955632744575e-05, + "loss": 5.5585, + "step": 10620 + }, + { + "epoch": 0.06316609572747169, + "grad_norm": 2.2190446853637695, + "learning_rate": 4.950946425540641e-05, + "loss": 5.5182, + "step": 10621 + }, + { + "epoch": 0.06317204301075269, + "grad_norm": 2.082871675491333, + "learning_rate": 4.9509372174811074e-05, + "loss": 5.7849, + "step": 10622 + }, + { + "epoch": 0.06317799029403369, + "grad_norm": 2.17744517326355, + "learning_rate": 4.9509280085659774e-05, + "loss": 5.2332, + "step": 10623 + }, + { + "epoch": 0.06318393757731468, + "grad_norm": 1.7662746906280518, + "learning_rate": 4.950918798795253e-05, + "loss": 5.4136, + "step": 10624 + }, + { + "epoch": 0.06318988486059568, + "grad_norm": 1.6879531145095825, + "learning_rate": 4.950909588168939e-05, + "loss": 5.3747, + "step": 10625 + }, + { + "epoch": 0.06319583214387668, + "grad_norm": 2.0174877643585205, + "learning_rate": 4.950900376687038e-05, + "loss": 5.2927, + "step": 10626 + }, + { + "epoch": 0.06320177942715767, + "grad_norm": 1.9052749872207642, + "learning_rate": 4.950891164349552e-05, + "loss": 5.1492, + "step": 10627 + }, + { + "epoch": 0.06320772671043867, + "grad_norm": 1.7647850513458252, + "learning_rate": 4.950881951156485e-05, + "loss": 5.4182, + "step": 10628 + }, + { + "epoch": 0.06321367399371967, + "grad_norm": 1.9794502258300781, + "learning_rate": 4.950872737107841e-05, + "loss": 5.3838, + "step": 10629 + }, + { + "epoch": 0.06321962127700066, + "grad_norm": 2.3403780460357666, + "learning_rate": 4.950863522203623e-05, + "loss": 5.4542, + "step": 10630 + }, + { + "epoch": 0.06322556856028166, + "grad_norm": 1.8747358322143555, + "learning_rate": 4.9508543064438336e-05, + "loss": 5.4949, + "step": 10631 + }, + { + "epoch": 0.06323151584356267, + "grad_norm": 1.9435046911239624, + "learning_rate": 4.950845089828476e-05, + "loss": 5.6136, + "step": 10632 + }, + { + "epoch": 0.06323746312684365, + "grad_norm": 2.095583438873291, + "learning_rate": 4.9508358723575544e-05, + "loss": 5.2864, + "step": 10633 + }, + { + "epoch": 0.06324341041012466, + "grad_norm": 1.8254145383834839, + "learning_rate": 4.9508266540310705e-05, + "loss": 5.4732, + "step": 10634 + }, + { + "epoch": 0.06324935769340566, + "grad_norm": 2.303638458251953, + "learning_rate": 4.950817434849029e-05, + "loss": 5.1501, + "step": 10635 + }, + { + "epoch": 0.06325530497668665, + "grad_norm": 2.5389420986175537, + "learning_rate": 4.950808214811432e-05, + "loss": 5.0723, + "step": 10636 + }, + { + "epoch": 0.06326125225996765, + "grad_norm": 2.1702539920806885, + "learning_rate": 4.950798993918283e-05, + "loss": 4.8838, + "step": 10637 + }, + { + "epoch": 0.06326719954324865, + "grad_norm": 1.921650767326355, + "learning_rate": 4.9507897721695855e-05, + "loss": 5.9958, + "step": 10638 + }, + { + "epoch": 0.06327314682652964, + "grad_norm": 2.2247352600097656, + "learning_rate": 4.950780549565343e-05, + "loss": 4.9319, + "step": 10639 + }, + { + "epoch": 0.06327909410981064, + "grad_norm": 2.3517649173736572, + "learning_rate": 4.950771326105558e-05, + "loss": 4.6033, + "step": 10640 + }, + { + "epoch": 0.06328504139309163, + "grad_norm": 2.053856134414673, + "learning_rate": 4.950762101790234e-05, + "loss": 4.3799, + "step": 10641 + }, + { + "epoch": 0.06329098867637263, + "grad_norm": 1.8055500984191895, + "learning_rate": 4.9507528766193746e-05, + "loss": 5.244, + "step": 10642 + }, + { + "epoch": 0.06329693595965363, + "grad_norm": 2.0694682598114014, + "learning_rate": 4.950743650592983e-05, + "loss": 5.1965, + "step": 10643 + }, + { + "epoch": 0.06330288324293462, + "grad_norm": 2.027399778366089, + "learning_rate": 4.950734423711061e-05, + "loss": 4.5576, + "step": 10644 + }, + { + "epoch": 0.06330883052621562, + "grad_norm": 2.22308087348938, + "learning_rate": 4.950725195973614e-05, + "loss": 4.4679, + "step": 10645 + }, + { + "epoch": 0.06331477780949663, + "grad_norm": 2.1807515621185303, + "learning_rate": 4.9507159673806436e-05, + "loss": 4.6147, + "step": 10646 + }, + { + "epoch": 0.06332072509277761, + "grad_norm": 2.0173258781433105, + "learning_rate": 4.9507067379321536e-05, + "loss": 4.5657, + "step": 10647 + }, + { + "epoch": 0.06332667237605862, + "grad_norm": 1.832610845565796, + "learning_rate": 4.9506975076281474e-05, + "loss": 4.7433, + "step": 10648 + }, + { + "epoch": 0.06333261965933962, + "grad_norm": 2.027352809906006, + "learning_rate": 4.950688276468628e-05, + "loss": 5.0426, + "step": 10649 + }, + { + "epoch": 0.0633385669426206, + "grad_norm": 1.856307864189148, + "learning_rate": 4.950679044453599e-05, + "loss": 5.2838, + "step": 10650 + }, + { + "epoch": 0.06334451422590161, + "grad_norm": 2.0875375270843506, + "learning_rate": 4.950669811583062e-05, + "loss": 4.5728, + "step": 10651 + }, + { + "epoch": 0.06335046150918261, + "grad_norm": 2.1067941188812256, + "learning_rate": 4.950660577857023e-05, + "loss": 4.5313, + "step": 10652 + }, + { + "epoch": 0.0633564087924636, + "grad_norm": 2.1747500896453857, + "learning_rate": 4.9506513432754825e-05, + "loss": 4.432, + "step": 10653 + }, + { + "epoch": 0.0633623560757446, + "grad_norm": 1.769059181213379, + "learning_rate": 4.950642107838446e-05, + "loss": 5.4667, + "step": 10654 + }, + { + "epoch": 0.0633683033590256, + "grad_norm": 2.2065072059631348, + "learning_rate": 4.9506328715459146e-05, + "loss": 5.9873, + "step": 10655 + }, + { + "epoch": 0.06337425064230659, + "grad_norm": 1.679431438446045, + "learning_rate": 4.950623634397893e-05, + "loss": 5.851, + "step": 10656 + }, + { + "epoch": 0.06338019792558759, + "grad_norm": 1.919668197631836, + "learning_rate": 4.950614396394384e-05, + "loss": 5.8613, + "step": 10657 + }, + { + "epoch": 0.0633861452088686, + "grad_norm": 1.5296612977981567, + "learning_rate": 4.9506051575353915e-05, + "loss": 5.7067, + "step": 10658 + }, + { + "epoch": 0.06339209249214958, + "grad_norm": 2.1283507347106934, + "learning_rate": 4.950595917820917e-05, + "loss": 5.1141, + "step": 10659 + }, + { + "epoch": 0.06339803977543058, + "grad_norm": 1.7011604309082031, + "learning_rate": 4.950586677250966e-05, + "loss": 6.0463, + "step": 10660 + }, + { + "epoch": 0.06340398705871159, + "grad_norm": 1.7479497194290161, + "learning_rate": 4.9505774358255396e-05, + "loss": 5.8942, + "step": 10661 + }, + { + "epoch": 0.06340993434199257, + "grad_norm": 1.939471960067749, + "learning_rate": 4.950568193544642e-05, + "loss": 5.562, + "step": 10662 + }, + { + "epoch": 0.06341588162527358, + "grad_norm": 1.871993899345398, + "learning_rate": 4.9505589504082764e-05, + "loss": 5.746, + "step": 10663 + }, + { + "epoch": 0.06342182890855458, + "grad_norm": 2.173109292984009, + "learning_rate": 4.950549706416446e-05, + "loss": 5.5927, + "step": 10664 + }, + { + "epoch": 0.06342777619183557, + "grad_norm": 1.809971809387207, + "learning_rate": 4.950540461569154e-05, + "loss": 5.8983, + "step": 10665 + }, + { + "epoch": 0.06343372347511657, + "grad_norm": 1.6344120502471924, + "learning_rate": 4.950531215866404e-05, + "loss": 5.5301, + "step": 10666 + }, + { + "epoch": 0.06343967075839757, + "grad_norm": 2.080425500869751, + "learning_rate": 4.9505219693081985e-05, + "loss": 6.0214, + "step": 10667 + }, + { + "epoch": 0.06344561804167856, + "grad_norm": 1.9382790327072144, + "learning_rate": 4.9505127218945415e-05, + "loss": 5.676, + "step": 10668 + }, + { + "epoch": 0.06345156532495956, + "grad_norm": 1.6945782899856567, + "learning_rate": 4.9505034736254354e-05, + "loss": 5.9337, + "step": 10669 + }, + { + "epoch": 0.06345751260824055, + "grad_norm": 1.6129313707351685, + "learning_rate": 4.9504942245008836e-05, + "loss": 5.6561, + "step": 10670 + }, + { + "epoch": 0.06346345989152155, + "grad_norm": 2.002903461456299, + "learning_rate": 4.95048497452089e-05, + "loss": 5.6302, + "step": 10671 + }, + { + "epoch": 0.06346940717480255, + "grad_norm": 1.6016403436660767, + "learning_rate": 4.950475723685457e-05, + "loss": 5.8275, + "step": 10672 + }, + { + "epoch": 0.06347535445808354, + "grad_norm": 1.7645297050476074, + "learning_rate": 4.9504664719945895e-05, + "loss": 5.5541, + "step": 10673 + }, + { + "epoch": 0.06348130174136454, + "grad_norm": 1.9627439975738525, + "learning_rate": 4.950457219448288e-05, + "loss": 5.6425, + "step": 10674 + }, + { + "epoch": 0.06348724902464555, + "grad_norm": 1.6297314167022705, + "learning_rate": 4.950447966046558e-05, + "loss": 5.5735, + "step": 10675 + }, + { + "epoch": 0.06349319630792653, + "grad_norm": 1.7911304235458374, + "learning_rate": 4.9504387117894014e-05, + "loss": 5.7736, + "step": 10676 + }, + { + "epoch": 0.06349914359120754, + "grad_norm": 1.627543330192566, + "learning_rate": 4.950429456676823e-05, + "loss": 5.736, + "step": 10677 + }, + { + "epoch": 0.06350509087448854, + "grad_norm": 1.9574320316314697, + "learning_rate": 4.950420200708824e-05, + "loss": 5.365, + "step": 10678 + }, + { + "epoch": 0.06351103815776953, + "grad_norm": 1.7698450088500977, + "learning_rate": 4.950410943885408e-05, + "loss": 5.5742, + "step": 10679 + }, + { + "epoch": 0.06351698544105053, + "grad_norm": 1.7660366296768188, + "learning_rate": 4.9504016862065806e-05, + "loss": 5.9064, + "step": 10680 + }, + { + "epoch": 0.06352293272433153, + "grad_norm": 2.0279083251953125, + "learning_rate": 4.9503924276723425e-05, + "loss": 5.7938, + "step": 10681 + }, + { + "epoch": 0.06352888000761252, + "grad_norm": 2.101827621459961, + "learning_rate": 4.9503831682826974e-05, + "loss": 5.4898, + "step": 10682 + }, + { + "epoch": 0.06353482729089352, + "grad_norm": 2.04978084564209, + "learning_rate": 4.9503739080376486e-05, + "loss": 5.3753, + "step": 10683 + }, + { + "epoch": 0.06354077457417452, + "grad_norm": 1.8539999723434448, + "learning_rate": 4.950364646937201e-05, + "loss": 5.5575, + "step": 10684 + }, + { + "epoch": 0.06354672185745551, + "grad_norm": 2.077073097229004, + "learning_rate": 4.9503553849813556e-05, + "loss": 5.4628, + "step": 10685 + }, + { + "epoch": 0.06355266914073651, + "grad_norm": 1.8130167722702026, + "learning_rate": 4.950346122170116e-05, + "loss": 5.1648, + "step": 10686 + }, + { + "epoch": 0.06355861642401751, + "grad_norm": 1.810944676399231, + "learning_rate": 4.950336858503486e-05, + "loss": 5.8371, + "step": 10687 + }, + { + "epoch": 0.0635645637072985, + "grad_norm": 2.0081756114959717, + "learning_rate": 4.950327593981469e-05, + "loss": 5.6933, + "step": 10688 + }, + { + "epoch": 0.0635705109905795, + "grad_norm": 1.5824620723724365, + "learning_rate": 4.950318328604068e-05, + "loss": 5.4494, + "step": 10689 + }, + { + "epoch": 0.0635764582738605, + "grad_norm": 1.6470626592636108, + "learning_rate": 4.950309062371286e-05, + "loss": 6.2401, + "step": 10690 + }, + { + "epoch": 0.0635824055571415, + "grad_norm": 1.799074649810791, + "learning_rate": 4.950299795283127e-05, + "loss": 6.1075, + "step": 10691 + }, + { + "epoch": 0.0635883528404225, + "grad_norm": 2.0551035404205322, + "learning_rate": 4.950290527339593e-05, + "loss": 5.6646, + "step": 10692 + }, + { + "epoch": 0.0635943001237035, + "grad_norm": 2.3543875217437744, + "learning_rate": 4.9502812585406875e-05, + "loss": 4.9341, + "step": 10693 + }, + { + "epoch": 0.06360024740698449, + "grad_norm": 2.0479071140289307, + "learning_rate": 4.950271988886415e-05, + "loss": 5.3351, + "step": 10694 + }, + { + "epoch": 0.06360619469026549, + "grad_norm": 1.9331302642822266, + "learning_rate": 4.950262718376778e-05, + "loss": 5.6269, + "step": 10695 + }, + { + "epoch": 0.06361214197354649, + "grad_norm": 1.9922640323638916, + "learning_rate": 4.950253447011779e-05, + "loss": 5.5113, + "step": 10696 + }, + { + "epoch": 0.06361808925682748, + "grad_norm": 1.769916296005249, + "learning_rate": 4.950244174791422e-05, + "loss": 5.5902, + "step": 10697 + }, + { + "epoch": 0.06362403654010848, + "grad_norm": 2.8808071613311768, + "learning_rate": 4.95023490171571e-05, + "loss": 4.9506, + "step": 10698 + }, + { + "epoch": 0.06362998382338947, + "grad_norm": 2.0609331130981445, + "learning_rate": 4.9502256277846466e-05, + "loss": 5.4256, + "step": 10699 + }, + { + "epoch": 0.06363593110667047, + "grad_norm": 2.0112223625183105, + "learning_rate": 4.950216352998234e-05, + "loss": 6.1121, + "step": 10700 + }, + { + "epoch": 0.06364187838995147, + "grad_norm": 1.5665667057037354, + "learning_rate": 4.9502070773564765e-05, + "loss": 5.1959, + "step": 10701 + }, + { + "epoch": 0.06364782567323246, + "grad_norm": 1.9731864929199219, + "learning_rate": 4.9501978008593774e-05, + "loss": 5.2887, + "step": 10702 + }, + { + "epoch": 0.06365377295651346, + "grad_norm": 1.7925242185592651, + "learning_rate": 4.9501885235069404e-05, + "loss": 5.7386, + "step": 10703 + }, + { + "epoch": 0.06365972023979447, + "grad_norm": 1.6686629056930542, + "learning_rate": 4.950179245299166e-05, + "loss": 5.7279, + "step": 10704 + }, + { + "epoch": 0.06366566752307545, + "grad_norm": 2.034392833709717, + "learning_rate": 4.95016996623606e-05, + "loss": 5.6148, + "step": 10705 + }, + { + "epoch": 0.06367161480635646, + "grad_norm": 2.1711995601654053, + "learning_rate": 4.9501606863176254e-05, + "loss": 5.7088, + "step": 10706 + }, + { + "epoch": 0.06367756208963746, + "grad_norm": 2.3276829719543457, + "learning_rate": 4.950151405543865e-05, + "loss": 5.3658, + "step": 10707 + }, + { + "epoch": 0.06368350937291845, + "grad_norm": 2.174130916595459, + "learning_rate": 4.9501421239147824e-05, + "loss": 5.3459, + "step": 10708 + }, + { + "epoch": 0.06368945665619945, + "grad_norm": 1.8721747398376465, + "learning_rate": 4.9501328414303794e-05, + "loss": 5.3375, + "step": 10709 + }, + { + "epoch": 0.06369540393948045, + "grad_norm": 1.8677324056625366, + "learning_rate": 4.9501235580906615e-05, + "loss": 5.8192, + "step": 10710 + }, + { + "epoch": 0.06370135122276144, + "grad_norm": 2.0901246070861816, + "learning_rate": 4.9501142738956294e-05, + "loss": 6.1188, + "step": 10711 + }, + { + "epoch": 0.06370729850604244, + "grad_norm": 1.7860997915267944, + "learning_rate": 4.9501049888452885e-05, + "loss": 5.4011, + "step": 10712 + }, + { + "epoch": 0.06371324578932344, + "grad_norm": 2.000946283340454, + "learning_rate": 4.950095702939642e-05, + "loss": 5.16, + "step": 10713 + }, + { + "epoch": 0.06371919307260443, + "grad_norm": 2.47086501121521, + "learning_rate": 4.950086416178691e-05, + "loss": 5.1543, + "step": 10714 + }, + { + "epoch": 0.06372514035588543, + "grad_norm": 1.8694473505020142, + "learning_rate": 4.9500771285624415e-05, + "loss": 5.3576, + "step": 10715 + }, + { + "epoch": 0.06373108763916643, + "grad_norm": 1.8921676874160767, + "learning_rate": 4.9500678400908946e-05, + "loss": 5.0827, + "step": 10716 + }, + { + "epoch": 0.06373703492244742, + "grad_norm": 1.8423974514007568, + "learning_rate": 4.950058550764054e-05, + "loss": 4.9912, + "step": 10717 + }, + { + "epoch": 0.06374298220572842, + "grad_norm": 1.6893757581710815, + "learning_rate": 4.950049260581924e-05, + "loss": 5.2792, + "step": 10718 + }, + { + "epoch": 0.06374892948900943, + "grad_norm": 1.720799446105957, + "learning_rate": 4.950039969544507e-05, + "loss": 5.4355, + "step": 10719 + }, + { + "epoch": 0.06375487677229041, + "grad_norm": 1.717527151107788, + "learning_rate": 4.9500306776518065e-05, + "loss": 5.2802, + "step": 10720 + }, + { + "epoch": 0.06376082405557142, + "grad_norm": 1.876207947731018, + "learning_rate": 4.950021384903825e-05, + "loss": 5.4667, + "step": 10721 + }, + { + "epoch": 0.06376677133885242, + "grad_norm": 1.7892308235168457, + "learning_rate": 4.9500120913005666e-05, + "loss": 5.6635, + "step": 10722 + }, + { + "epoch": 0.0637727186221334, + "grad_norm": 1.828092336654663, + "learning_rate": 4.950002796842034e-05, + "loss": 5.5301, + "step": 10723 + }, + { + "epoch": 0.06377866590541441, + "grad_norm": 1.5860785245895386, + "learning_rate": 4.949993501528232e-05, + "loss": 5.2337, + "step": 10724 + }, + { + "epoch": 0.06378461318869541, + "grad_norm": 1.731295108795166, + "learning_rate": 4.949984205359161e-05, + "loss": 5.4115, + "step": 10725 + }, + { + "epoch": 0.0637905604719764, + "grad_norm": 2.194288969039917, + "learning_rate": 4.949974908334827e-05, + "loss": 5.4736, + "step": 10726 + }, + { + "epoch": 0.0637965077552574, + "grad_norm": 1.6036415100097656, + "learning_rate": 4.949965610455231e-05, + "loss": 5.4563, + "step": 10727 + }, + { + "epoch": 0.06380245503853839, + "grad_norm": 1.6228232383728027, + "learning_rate": 4.949956311720378e-05, + "loss": 5.4695, + "step": 10728 + }, + { + "epoch": 0.06380840232181939, + "grad_norm": 1.3040069341659546, + "learning_rate": 4.94994701213027e-05, + "loss": 5.0126, + "step": 10729 + }, + { + "epoch": 0.06381434960510039, + "grad_norm": 1.5976930856704712, + "learning_rate": 4.9499377116849116e-05, + "loss": 5.0165, + "step": 10730 + }, + { + "epoch": 0.06382029688838138, + "grad_norm": 1.5877797603607178, + "learning_rate": 4.9499284103843046e-05, + "loss": 5.1634, + "step": 10731 + }, + { + "epoch": 0.06382624417166238, + "grad_norm": 1.6466439962387085, + "learning_rate": 4.949919108228453e-05, + "loss": 5.3954, + "step": 10732 + }, + { + "epoch": 0.06383219145494338, + "grad_norm": 1.5188345909118652, + "learning_rate": 4.949909805217361e-05, + "loss": 5.2876, + "step": 10733 + }, + { + "epoch": 0.06383813873822437, + "grad_norm": 1.836227297782898, + "learning_rate": 4.94990050135103e-05, + "loss": 5.4966, + "step": 10734 + }, + { + "epoch": 0.06384408602150538, + "grad_norm": 1.5542840957641602, + "learning_rate": 4.9498911966294635e-05, + "loss": 5.2188, + "step": 10735 + }, + { + "epoch": 0.06385003330478638, + "grad_norm": 1.3053034543991089, + "learning_rate": 4.9498818910526656e-05, + "loss": 5.3834, + "step": 10736 + }, + { + "epoch": 0.06385598058806737, + "grad_norm": 1.4250247478485107, + "learning_rate": 4.9498725846206395e-05, + "loss": 5.1852, + "step": 10737 + }, + { + "epoch": 0.06386192787134837, + "grad_norm": 1.5885393619537354, + "learning_rate": 4.9498632773333886e-05, + "loss": 5.2518, + "step": 10738 + }, + { + "epoch": 0.06386787515462937, + "grad_norm": 1.5664896965026855, + "learning_rate": 4.949853969190915e-05, + "loss": 5.1186, + "step": 10739 + }, + { + "epoch": 0.06387382243791036, + "grad_norm": 1.5156123638153076, + "learning_rate": 4.949844660193223e-05, + "loss": 5.1111, + "step": 10740 + }, + { + "epoch": 0.06387976972119136, + "grad_norm": 1.5308325290679932, + "learning_rate": 4.949835350340316e-05, + "loss": 5.1577, + "step": 10741 + }, + { + "epoch": 0.06388571700447236, + "grad_norm": 1.3338321447372437, + "learning_rate": 4.949826039632196e-05, + "loss": 5.2386, + "step": 10742 + }, + { + "epoch": 0.06389166428775335, + "grad_norm": 1.5307821035385132, + "learning_rate": 4.9498167280688676e-05, + "loss": 5.1173, + "step": 10743 + }, + { + "epoch": 0.06389761157103435, + "grad_norm": 1.607913613319397, + "learning_rate": 4.9498074156503325e-05, + "loss": 5.3077, + "step": 10744 + }, + { + "epoch": 0.06390355885431535, + "grad_norm": 1.6242469549179077, + "learning_rate": 4.949798102376596e-05, + "loss": 5.3319, + "step": 10745 + }, + { + "epoch": 0.06390950613759634, + "grad_norm": 1.62213134765625, + "learning_rate": 4.9497887882476604e-05, + "loss": 5.3494, + "step": 10746 + }, + { + "epoch": 0.06391545342087734, + "grad_norm": 1.4064897298812866, + "learning_rate": 4.949779473263528e-05, + "loss": 5.207, + "step": 10747 + }, + { + "epoch": 0.06392140070415835, + "grad_norm": 1.7431879043579102, + "learning_rate": 4.949770157424203e-05, + "loss": 5.4068, + "step": 10748 + }, + { + "epoch": 0.06392734798743933, + "grad_norm": 1.5815304517745972, + "learning_rate": 4.949760840729689e-05, + "loss": 5.3917, + "step": 10749 + }, + { + "epoch": 0.06393329527072034, + "grad_norm": 1.576541543006897, + "learning_rate": 4.949751523179988e-05, + "loss": 5.4123, + "step": 10750 + }, + { + "epoch": 0.06393924255400134, + "grad_norm": 1.6717814207077026, + "learning_rate": 4.9497422047751054e-05, + "loss": 5.3028, + "step": 10751 + }, + { + "epoch": 0.06394518983728233, + "grad_norm": 1.4091792106628418, + "learning_rate": 4.9497328855150424e-05, + "loss": 5.2231, + "step": 10752 + }, + { + "epoch": 0.06395113712056333, + "grad_norm": 1.4366726875305176, + "learning_rate": 4.949723565399803e-05, + "loss": 5.2908, + "step": 10753 + }, + { + "epoch": 0.06395708440384433, + "grad_norm": 1.6679248809814453, + "learning_rate": 4.9497142444293906e-05, + "loss": 5.1079, + "step": 10754 + }, + { + "epoch": 0.06396303168712532, + "grad_norm": 1.6619216203689575, + "learning_rate": 4.949704922603808e-05, + "loss": 5.1504, + "step": 10755 + }, + { + "epoch": 0.06396897897040632, + "grad_norm": 1.7149940729141235, + "learning_rate": 4.9496955999230586e-05, + "loss": 5.3031, + "step": 10756 + }, + { + "epoch": 0.06397492625368732, + "grad_norm": 1.711256504058838, + "learning_rate": 4.9496862763871456e-05, + "loss": 5.2146, + "step": 10757 + }, + { + "epoch": 0.06398087353696831, + "grad_norm": 1.654680609703064, + "learning_rate": 4.949676951996073e-05, + "loss": 5.2774, + "step": 10758 + }, + { + "epoch": 0.06398682082024931, + "grad_norm": 1.5115636587142944, + "learning_rate": 4.949667626749843e-05, + "loss": 5.2155, + "step": 10759 + }, + { + "epoch": 0.0639927681035303, + "grad_norm": 1.7153947353363037, + "learning_rate": 4.9496583006484596e-05, + "loss": 5.2711, + "step": 10760 + }, + { + "epoch": 0.0639987153868113, + "grad_norm": 1.8497945070266724, + "learning_rate": 4.949648973691926e-05, + "loss": 5.2864, + "step": 10761 + }, + { + "epoch": 0.0640046626700923, + "grad_norm": 1.5251562595367432, + "learning_rate": 4.9496396458802455e-05, + "loss": 5.2532, + "step": 10762 + }, + { + "epoch": 0.0640106099533733, + "grad_norm": 1.5916621685028076, + "learning_rate": 4.94963031721342e-05, + "loss": 5.2136, + "step": 10763 + }, + { + "epoch": 0.0640165572366543, + "grad_norm": 1.5781627893447876, + "learning_rate": 4.949620987691455e-05, + "loss": 5.3188, + "step": 10764 + }, + { + "epoch": 0.0640225045199353, + "grad_norm": 1.7783690690994263, + "learning_rate": 4.9496116573143515e-05, + "loss": 5.4196, + "step": 10765 + }, + { + "epoch": 0.06402845180321629, + "grad_norm": 1.5746928453445435, + "learning_rate": 4.949602326082115e-05, + "loss": 5.3724, + "step": 10766 + }, + { + "epoch": 0.06403439908649729, + "grad_norm": 1.677771806716919, + "learning_rate": 4.9495929939947475e-05, + "loss": 5.2894, + "step": 10767 + }, + { + "epoch": 0.06404034636977829, + "grad_norm": 1.7747725248336792, + "learning_rate": 4.949583661052252e-05, + "loss": 5.0527, + "step": 10768 + }, + { + "epoch": 0.06404629365305928, + "grad_norm": 1.6927893161773682, + "learning_rate": 4.9495743272546314e-05, + "loss": 5.0999, + "step": 10769 + }, + { + "epoch": 0.06405224093634028, + "grad_norm": 1.6289039850234985, + "learning_rate": 4.949564992601891e-05, + "loss": 5.4197, + "step": 10770 + }, + { + "epoch": 0.06405818821962128, + "grad_norm": 1.742658019065857, + "learning_rate": 4.9495556570940316e-05, + "loss": 5.2927, + "step": 10771 + }, + { + "epoch": 0.06406413550290227, + "grad_norm": 1.6643215417861938, + "learning_rate": 4.949546320731059e-05, + "loss": 5.3262, + "step": 10772 + }, + { + "epoch": 0.06407008278618327, + "grad_norm": 1.6400927305221558, + "learning_rate": 4.949536983512974e-05, + "loss": 5.1072, + "step": 10773 + }, + { + "epoch": 0.06407603006946427, + "grad_norm": 1.7093544006347656, + "learning_rate": 4.949527645439781e-05, + "loss": 5.1849, + "step": 10774 + }, + { + "epoch": 0.06408197735274526, + "grad_norm": 1.6980849504470825, + "learning_rate": 4.949518306511484e-05, + "loss": 5.3661, + "step": 10775 + }, + { + "epoch": 0.06408792463602626, + "grad_norm": 1.7241551876068115, + "learning_rate": 4.949508966728085e-05, + "loss": 5.3315, + "step": 10776 + }, + { + "epoch": 0.06409387191930727, + "grad_norm": 1.8421318531036377, + "learning_rate": 4.9494996260895874e-05, + "loss": 5.3506, + "step": 10777 + }, + { + "epoch": 0.06409981920258825, + "grad_norm": 1.835738182067871, + "learning_rate": 4.949490284595995e-05, + "loss": 5.2087, + "step": 10778 + }, + { + "epoch": 0.06410576648586926, + "grad_norm": 1.6622625589370728, + "learning_rate": 4.949480942247311e-05, + "loss": 5.0072, + "step": 10779 + }, + { + "epoch": 0.06411171376915026, + "grad_norm": 1.5437613725662231, + "learning_rate": 4.949471599043539e-05, + "loss": 5.182, + "step": 10780 + }, + { + "epoch": 0.06411766105243125, + "grad_norm": 1.620758295059204, + "learning_rate": 4.949462254984681e-05, + "loss": 5.2771, + "step": 10781 + }, + { + "epoch": 0.06412360833571225, + "grad_norm": 1.6143954992294312, + "learning_rate": 4.949452910070741e-05, + "loss": 5.1175, + "step": 10782 + }, + { + "epoch": 0.06412955561899325, + "grad_norm": 1.8173086643218994, + "learning_rate": 4.949443564301722e-05, + "loss": 5.175, + "step": 10783 + }, + { + "epoch": 0.06413550290227424, + "grad_norm": 1.75434148311615, + "learning_rate": 4.9494342176776284e-05, + "loss": 5.1133, + "step": 10784 + }, + { + "epoch": 0.06414145018555524, + "grad_norm": 1.7278660535812378, + "learning_rate": 4.949424870198462e-05, + "loss": 5.0704, + "step": 10785 + }, + { + "epoch": 0.06414739746883624, + "grad_norm": 1.793285608291626, + "learning_rate": 4.949415521864228e-05, + "loss": 5.1567, + "step": 10786 + }, + { + "epoch": 0.06415334475211723, + "grad_norm": 1.7892498970031738, + "learning_rate": 4.949406172674927e-05, + "loss": 5.201, + "step": 10787 + }, + { + "epoch": 0.06415929203539823, + "grad_norm": 2.276643991470337, + "learning_rate": 4.9493968226305645e-05, + "loss": 5.5555, + "step": 10788 + }, + { + "epoch": 0.06416523931867922, + "grad_norm": 1.5785993337631226, + "learning_rate": 4.9493874717311416e-05, + "loss": 5.2692, + "step": 10789 + }, + { + "epoch": 0.06417118660196022, + "grad_norm": 1.3982635736465454, + "learning_rate": 4.949378119976664e-05, + "loss": 5.24, + "step": 10790 + }, + { + "epoch": 0.06417713388524122, + "grad_norm": 1.4310967922210693, + "learning_rate": 4.949368767367133e-05, + "loss": 5.2032, + "step": 10791 + }, + { + "epoch": 0.06418308116852221, + "grad_norm": 1.5635451078414917, + "learning_rate": 4.949359413902554e-05, + "loss": 5.2589, + "step": 10792 + }, + { + "epoch": 0.06418902845180322, + "grad_norm": 1.5000566244125366, + "learning_rate": 4.949350059582927e-05, + "loss": 5.147, + "step": 10793 + }, + { + "epoch": 0.06419497573508422, + "grad_norm": 1.7782738208770752, + "learning_rate": 4.9493407044082585e-05, + "loss": 5.1987, + "step": 10794 + }, + { + "epoch": 0.0642009230183652, + "grad_norm": 1.5931564569473267, + "learning_rate": 4.94933134837855e-05, + "loss": 5.2591, + "step": 10795 + }, + { + "epoch": 0.06420687030164621, + "grad_norm": 1.619287371635437, + "learning_rate": 4.9493219914938055e-05, + "loss": 5.1041, + "step": 10796 + }, + { + "epoch": 0.06421281758492721, + "grad_norm": 1.5174281597137451, + "learning_rate": 4.949312633754028e-05, + "loss": 5.1798, + "step": 10797 + }, + { + "epoch": 0.0642187648682082, + "grad_norm": 1.6485828161239624, + "learning_rate": 4.9493032751592205e-05, + "loss": 5.1086, + "step": 10798 + }, + { + "epoch": 0.0642247121514892, + "grad_norm": 1.830984354019165, + "learning_rate": 4.949293915709386e-05, + "loss": 5.2241, + "step": 10799 + }, + { + "epoch": 0.0642306594347702, + "grad_norm": 1.9102944135665894, + "learning_rate": 4.94928455540453e-05, + "loss": 4.9652, + "step": 10800 + }, + { + "epoch": 0.06423660671805119, + "grad_norm": 1.6826778650283813, + "learning_rate": 4.949275194244653e-05, + "loss": 5.0479, + "step": 10801 + }, + { + "epoch": 0.06424255400133219, + "grad_norm": 1.7545628547668457, + "learning_rate": 4.9492658322297595e-05, + "loss": 4.9263, + "step": 10802 + }, + { + "epoch": 0.0642485012846132, + "grad_norm": 1.621121883392334, + "learning_rate": 4.949256469359852e-05, + "loss": 4.9095, + "step": 10803 + }, + { + "epoch": 0.06425444856789418, + "grad_norm": 1.727095603942871, + "learning_rate": 4.9492471056349356e-05, + "loss": 5.1913, + "step": 10804 + }, + { + "epoch": 0.06426039585117518, + "grad_norm": 1.749241590499878, + "learning_rate": 4.949237741055011e-05, + "loss": 5.4284, + "step": 10805 + }, + { + "epoch": 0.06426634313445619, + "grad_norm": 1.627784252166748, + "learning_rate": 4.9492283756200834e-05, + "loss": 5.547, + "step": 10806 + }, + { + "epoch": 0.06427229041773717, + "grad_norm": 1.8133957386016846, + "learning_rate": 4.949219009330155e-05, + "loss": 5.5841, + "step": 10807 + }, + { + "epoch": 0.06427823770101818, + "grad_norm": 1.6667630672454834, + "learning_rate": 4.949209642185231e-05, + "loss": 5.4091, + "step": 10808 + }, + { + "epoch": 0.06428418498429918, + "grad_norm": 1.601288914680481, + "learning_rate": 4.949200274185312e-05, + "loss": 4.9647, + "step": 10809 + }, + { + "epoch": 0.06429013226758017, + "grad_norm": 1.4544743299484253, + "learning_rate": 4.9491909053304025e-05, + "loss": 5.477, + "step": 10810 + }, + { + "epoch": 0.06429607955086117, + "grad_norm": 1.65786874294281, + "learning_rate": 4.949181535620506e-05, + "loss": 5.2401, + "step": 10811 + }, + { + "epoch": 0.06430202683414217, + "grad_norm": 1.561251163482666, + "learning_rate": 4.949172165055625e-05, + "loss": 5.7689, + "step": 10812 + }, + { + "epoch": 0.06430797411742316, + "grad_norm": 1.465378999710083, + "learning_rate": 4.949162793635764e-05, + "loss": 5.4109, + "step": 10813 + }, + { + "epoch": 0.06431392140070416, + "grad_norm": 1.3914259672164917, + "learning_rate": 4.949153421360926e-05, + "loss": 5.5144, + "step": 10814 + }, + { + "epoch": 0.06431986868398516, + "grad_norm": 1.6016005277633667, + "learning_rate": 4.949144048231113e-05, + "loss": 5.2708, + "step": 10815 + }, + { + "epoch": 0.06432581596726615, + "grad_norm": 1.4063479900360107, + "learning_rate": 4.94913467424633e-05, + "loss": 5.0303, + "step": 10816 + }, + { + "epoch": 0.06433176325054715, + "grad_norm": 1.5708017349243164, + "learning_rate": 4.9491252994065785e-05, + "loss": 5.3104, + "step": 10817 + }, + { + "epoch": 0.06433771053382814, + "grad_norm": 1.5542651414871216, + "learning_rate": 4.9491159237118626e-05, + "loss": 5.1308, + "step": 10818 + }, + { + "epoch": 0.06434365781710914, + "grad_norm": 1.3946558237075806, + "learning_rate": 4.9491065471621855e-05, + "loss": 5.243, + "step": 10819 + }, + { + "epoch": 0.06434960510039014, + "grad_norm": 1.3560529947280884, + "learning_rate": 4.9490971697575513e-05, + "loss": 4.9319, + "step": 10820 + }, + { + "epoch": 0.06435555238367113, + "grad_norm": 1.6921281814575195, + "learning_rate": 4.949087791497963e-05, + "loss": 5.2203, + "step": 10821 + }, + { + "epoch": 0.06436149966695213, + "grad_norm": 1.5226655006408691, + "learning_rate": 4.9490784123834225e-05, + "loss": 5.1879, + "step": 10822 + }, + { + "epoch": 0.06436744695023314, + "grad_norm": 1.5012669563293457, + "learning_rate": 4.9490690324139346e-05, + "loss": 5.2373, + "step": 10823 + }, + { + "epoch": 0.06437339423351413, + "grad_norm": 1.8050286769866943, + "learning_rate": 4.949059651589502e-05, + "loss": 5.0441, + "step": 10824 + }, + { + "epoch": 0.06437934151679513, + "grad_norm": 1.6800918579101562, + "learning_rate": 4.9490502699101274e-05, + "loss": 5.0871, + "step": 10825 + }, + { + "epoch": 0.06438528880007613, + "grad_norm": 1.4211550951004028, + "learning_rate": 4.949040887375814e-05, + "loss": 5.118, + "step": 10826 + }, + { + "epoch": 0.06439123608335712, + "grad_norm": 1.7064868211746216, + "learning_rate": 4.949031503986568e-05, + "loss": 5.2285, + "step": 10827 + }, + { + "epoch": 0.06439718336663812, + "grad_norm": 1.862491250038147, + "learning_rate": 4.949022119742388e-05, + "loss": 5.0958, + "step": 10828 + }, + { + "epoch": 0.06440313064991912, + "grad_norm": 1.933610200881958, + "learning_rate": 4.949012734643281e-05, + "loss": 5.1282, + "step": 10829 + }, + { + "epoch": 0.06440907793320011, + "grad_norm": 1.6140058040618896, + "learning_rate": 4.949003348689249e-05, + "loss": 4.9913, + "step": 10830 + }, + { + "epoch": 0.06441502521648111, + "grad_norm": 1.6881496906280518, + "learning_rate": 4.948993961880295e-05, + "loss": 5.1017, + "step": 10831 + }, + { + "epoch": 0.06442097249976211, + "grad_norm": 1.7887358665466309, + "learning_rate": 4.948984574216422e-05, + "loss": 5.1503, + "step": 10832 + }, + { + "epoch": 0.0644269197830431, + "grad_norm": 1.635720133781433, + "learning_rate": 4.948975185697634e-05, + "loss": 5.3381, + "step": 10833 + }, + { + "epoch": 0.0644328670663241, + "grad_norm": 1.6106109619140625, + "learning_rate": 4.9489657963239346e-05, + "loss": 5.0498, + "step": 10834 + }, + { + "epoch": 0.0644388143496051, + "grad_norm": 1.740438461303711, + "learning_rate": 4.9489564060953266e-05, + "loss": 5.0302, + "step": 10835 + }, + { + "epoch": 0.0644447616328861, + "grad_norm": 1.663994312286377, + "learning_rate": 4.9489470150118124e-05, + "loss": 5.1976, + "step": 10836 + }, + { + "epoch": 0.0644507089161671, + "grad_norm": 1.6748932600021362, + "learning_rate": 4.9489376230733965e-05, + "loss": 5.0055, + "step": 10837 + }, + { + "epoch": 0.0644566561994481, + "grad_norm": 1.7139437198638916, + "learning_rate": 4.948928230280082e-05, + "loss": 4.9617, + "step": 10838 + }, + { + "epoch": 0.06446260348272909, + "grad_norm": 1.698791742324829, + "learning_rate": 4.948918836631872e-05, + "loss": 4.9725, + "step": 10839 + }, + { + "epoch": 0.06446855076601009, + "grad_norm": 1.6961768865585327, + "learning_rate": 4.94890944212877e-05, + "loss": 4.9126, + "step": 10840 + }, + { + "epoch": 0.06447449804929109, + "grad_norm": 1.6551483869552612, + "learning_rate": 4.948900046770778e-05, + "loss": 5.0775, + "step": 10841 + }, + { + "epoch": 0.06448044533257208, + "grad_norm": 1.5863447189331055, + "learning_rate": 4.948890650557901e-05, + "loss": 5.0467, + "step": 10842 + }, + { + "epoch": 0.06448639261585308, + "grad_norm": 1.5629637241363525, + "learning_rate": 4.9488812534901414e-05, + "loss": 5.0012, + "step": 10843 + }, + { + "epoch": 0.06449233989913408, + "grad_norm": 1.5247453451156616, + "learning_rate": 4.948871855567503e-05, + "loss": 4.9928, + "step": 10844 + }, + { + "epoch": 0.06449828718241507, + "grad_norm": 1.7595921754837036, + "learning_rate": 4.948862456789988e-05, + "loss": 4.9256, + "step": 10845 + }, + { + "epoch": 0.06450423446569607, + "grad_norm": 1.6370458602905273, + "learning_rate": 4.948853057157601e-05, + "loss": 4.9499, + "step": 10846 + }, + { + "epoch": 0.06451018174897706, + "grad_norm": 1.7747406959533691, + "learning_rate": 4.948843656670345e-05, + "loss": 4.9246, + "step": 10847 + }, + { + "epoch": 0.06451612903225806, + "grad_norm": 1.6769739389419556, + "learning_rate": 4.948834255328222e-05, + "loss": 4.9561, + "step": 10848 + }, + { + "epoch": 0.06452207631553906, + "grad_norm": 1.60416841506958, + "learning_rate": 4.948824853131236e-05, + "loss": 5.0318, + "step": 10849 + }, + { + "epoch": 0.06452802359882005, + "grad_norm": 2.1050093173980713, + "learning_rate": 4.948815450079392e-05, + "loss": 5.5308, + "step": 10850 + }, + { + "epoch": 0.06453397088210105, + "grad_norm": 1.7474935054779053, + "learning_rate": 4.948806046172691e-05, + "loss": 5.0752, + "step": 10851 + }, + { + "epoch": 0.06453991816538206, + "grad_norm": 1.8992688655853271, + "learning_rate": 4.948796641411138e-05, + "loss": 5.3704, + "step": 10852 + }, + { + "epoch": 0.06454586544866305, + "grad_norm": 1.9632636308670044, + "learning_rate": 4.948787235794734e-05, + "loss": 5.4173, + "step": 10853 + }, + { + "epoch": 0.06455181273194405, + "grad_norm": 1.9034284353256226, + "learning_rate": 4.948777829323484e-05, + "loss": 5.2655, + "step": 10854 + }, + { + "epoch": 0.06455776001522505, + "grad_norm": 1.716711163520813, + "learning_rate": 4.9487684219973914e-05, + "loss": 5.4192, + "step": 10855 + }, + { + "epoch": 0.06456370729850604, + "grad_norm": 1.7886557579040527, + "learning_rate": 4.948759013816459e-05, + "loss": 5.2828, + "step": 10856 + }, + { + "epoch": 0.06456965458178704, + "grad_norm": 2.004117250442505, + "learning_rate": 4.9487496047806905e-05, + "loss": 4.9521, + "step": 10857 + }, + { + "epoch": 0.06457560186506804, + "grad_norm": 1.627955436706543, + "learning_rate": 4.948740194890088e-05, + "loss": 5.4288, + "step": 10858 + }, + { + "epoch": 0.06458154914834903, + "grad_norm": 2.2537145614624023, + "learning_rate": 4.948730784144656e-05, + "loss": 5.8176, + "step": 10859 + }, + { + "epoch": 0.06458749643163003, + "grad_norm": 2.216066837310791, + "learning_rate": 4.948721372544397e-05, + "loss": 5.4569, + "step": 10860 + }, + { + "epoch": 0.06459344371491103, + "grad_norm": 1.7641898393630981, + "learning_rate": 4.948711960089315e-05, + "loss": 5.659, + "step": 10861 + }, + { + "epoch": 0.06459939099819202, + "grad_norm": 1.9137814044952393, + "learning_rate": 4.948702546779413e-05, + "loss": 5.6275, + "step": 10862 + }, + { + "epoch": 0.06460533828147302, + "grad_norm": 2.2355434894561768, + "learning_rate": 4.948693132614694e-05, + "loss": 5.1712, + "step": 10863 + }, + { + "epoch": 0.06461128556475403, + "grad_norm": 1.780849814414978, + "learning_rate": 4.9486837175951616e-05, + "loss": 5.4521, + "step": 10864 + }, + { + "epoch": 0.06461723284803501, + "grad_norm": 1.8078423738479614, + "learning_rate": 4.948674301720819e-05, + "loss": 5.3609, + "step": 10865 + }, + { + "epoch": 0.06462318013131602, + "grad_norm": 1.590707540512085, + "learning_rate": 4.94866488499167e-05, + "loss": 5.4121, + "step": 10866 + }, + { + "epoch": 0.06462912741459702, + "grad_norm": 1.4369510412216187, + "learning_rate": 4.948655467407717e-05, + "loss": 5.418, + "step": 10867 + }, + { + "epoch": 0.064635074697878, + "grad_norm": 1.5800751447677612, + "learning_rate": 4.9486460489689634e-05, + "loss": 5.3492, + "step": 10868 + }, + { + "epoch": 0.06464102198115901, + "grad_norm": 1.5271484851837158, + "learning_rate": 4.948636629675413e-05, + "loss": 5.2758, + "step": 10869 + }, + { + "epoch": 0.06464696926444001, + "grad_norm": 1.7175722122192383, + "learning_rate": 4.948627209527069e-05, + "loss": 5.2939, + "step": 10870 + }, + { + "epoch": 0.064652916547721, + "grad_norm": 1.568851113319397, + "learning_rate": 4.948617788523935e-05, + "loss": 5.2559, + "step": 10871 + }, + { + "epoch": 0.064658863831002, + "grad_norm": 1.4012210369110107, + "learning_rate": 4.9486083666660135e-05, + "loss": 5.3195, + "step": 10872 + }, + { + "epoch": 0.064664811114283, + "grad_norm": 1.5386475324630737, + "learning_rate": 4.948598943953308e-05, + "loss": 5.293, + "step": 10873 + }, + { + "epoch": 0.06467075839756399, + "grad_norm": 1.4143292903900146, + "learning_rate": 4.948589520385821e-05, + "loss": 5.2181, + "step": 10874 + }, + { + "epoch": 0.06467670568084499, + "grad_norm": 1.392470121383667, + "learning_rate": 4.9485800959635576e-05, + "loss": 5.3074, + "step": 10875 + }, + { + "epoch": 0.06468265296412598, + "grad_norm": 1.7176567316055298, + "learning_rate": 4.94857067068652e-05, + "loss": 5.3024, + "step": 10876 + }, + { + "epoch": 0.06468860024740698, + "grad_norm": 1.5002285242080688, + "learning_rate": 4.9485612445547115e-05, + "loss": 5.1543, + "step": 10877 + }, + { + "epoch": 0.06469454753068798, + "grad_norm": 1.5615242719650269, + "learning_rate": 4.9485518175681364e-05, + "loss": 5.371, + "step": 10878 + }, + { + "epoch": 0.06470049481396897, + "grad_norm": 1.4294706583023071, + "learning_rate": 4.9485423897267966e-05, + "loss": 5.4151, + "step": 10879 + }, + { + "epoch": 0.06470644209724997, + "grad_norm": 2.0147571563720703, + "learning_rate": 4.948532961030695e-05, + "loss": 5.3082, + "step": 10880 + }, + { + "epoch": 0.06471238938053098, + "grad_norm": 1.5661358833312988, + "learning_rate": 4.948523531479837e-05, + "loss": 5.8232, + "step": 10881 + }, + { + "epoch": 0.06471833666381197, + "grad_norm": 1.5608779191970825, + "learning_rate": 4.9485141010742245e-05, + "loss": 5.5648, + "step": 10882 + }, + { + "epoch": 0.06472428394709297, + "grad_norm": 2.3148789405822754, + "learning_rate": 4.948504669813861e-05, + "loss": 4.8802, + "step": 10883 + }, + { + "epoch": 0.06473023123037397, + "grad_norm": 1.9495759010314941, + "learning_rate": 4.9484952376987504e-05, + "loss": 5.1985, + "step": 10884 + }, + { + "epoch": 0.06473617851365496, + "grad_norm": 2.031764268875122, + "learning_rate": 4.9484858047288944e-05, + "loss": 5.0772, + "step": 10885 + }, + { + "epoch": 0.06474212579693596, + "grad_norm": 1.6575301885604858, + "learning_rate": 4.948476370904298e-05, + "loss": 5.2157, + "step": 10886 + }, + { + "epoch": 0.06474807308021696, + "grad_norm": 1.6381278038024902, + "learning_rate": 4.948466936224964e-05, + "loss": 5.1168, + "step": 10887 + }, + { + "epoch": 0.06475402036349795, + "grad_norm": 1.672555923461914, + "learning_rate": 4.9484575006908945e-05, + "loss": 5.2839, + "step": 10888 + }, + { + "epoch": 0.06475996764677895, + "grad_norm": 1.8838026523590088, + "learning_rate": 4.9484480643020944e-05, + "loss": 5.301, + "step": 10889 + }, + { + "epoch": 0.06476591493005995, + "grad_norm": 1.935205101966858, + "learning_rate": 4.9484386270585656e-05, + "loss": 5.2898, + "step": 10890 + }, + { + "epoch": 0.06477186221334094, + "grad_norm": 1.630003809928894, + "learning_rate": 4.9484291889603134e-05, + "loss": 5.181, + "step": 10891 + }, + { + "epoch": 0.06477780949662194, + "grad_norm": 1.5095784664154053, + "learning_rate": 4.948419750007339e-05, + "loss": 5.3159, + "step": 10892 + }, + { + "epoch": 0.06478375677990295, + "grad_norm": 1.7217234373092651, + "learning_rate": 4.948410310199647e-05, + "loss": 5.3395, + "step": 10893 + }, + { + "epoch": 0.06478970406318393, + "grad_norm": 1.727953314781189, + "learning_rate": 4.94840086953724e-05, + "loss": 5.1374, + "step": 10894 + }, + { + "epoch": 0.06479565134646494, + "grad_norm": 1.7891777753829956, + "learning_rate": 4.9483914280201224e-05, + "loss": 5.2145, + "step": 10895 + }, + { + "epoch": 0.06480159862974594, + "grad_norm": 1.7402048110961914, + "learning_rate": 4.9483819856482956e-05, + "loss": 5.1723, + "step": 10896 + }, + { + "epoch": 0.06480754591302693, + "grad_norm": 1.6635658740997314, + "learning_rate": 4.9483725424217644e-05, + "loss": 5.0995, + "step": 10897 + }, + { + "epoch": 0.06481349319630793, + "grad_norm": 1.6190650463104248, + "learning_rate": 4.9483630983405317e-05, + "loss": 5.2062, + "step": 10898 + }, + { + "epoch": 0.06481944047958893, + "grad_norm": 1.6335800886154175, + "learning_rate": 4.9483536534046006e-05, + "loss": 5.4298, + "step": 10899 + }, + { + "epoch": 0.06482538776286992, + "grad_norm": 1.7549209594726562, + "learning_rate": 4.948344207613974e-05, + "loss": 5.1833, + "step": 10900 + }, + { + "epoch": 0.06483133504615092, + "grad_norm": 1.6011431217193604, + "learning_rate": 4.948334760968656e-05, + "loss": 5.2329, + "step": 10901 + }, + { + "epoch": 0.06483728232943192, + "grad_norm": 1.627424955368042, + "learning_rate": 4.9483253134686505e-05, + "loss": 5.3059, + "step": 10902 + }, + { + "epoch": 0.06484322961271291, + "grad_norm": 1.593361258506775, + "learning_rate": 4.948315865113959e-05, + "loss": 5.2711, + "step": 10903 + }, + { + "epoch": 0.06484917689599391, + "grad_norm": 1.5899426937103271, + "learning_rate": 4.9483064159045854e-05, + "loss": 5.2449, + "step": 10904 + }, + { + "epoch": 0.0648551241792749, + "grad_norm": 1.6572548151016235, + "learning_rate": 4.948296965840534e-05, + "loss": 5.18, + "step": 10905 + }, + { + "epoch": 0.0648610714625559, + "grad_norm": 1.649928092956543, + "learning_rate": 4.948287514921808e-05, + "loss": 5.2434, + "step": 10906 + }, + { + "epoch": 0.0648670187458369, + "grad_norm": 1.4546284675598145, + "learning_rate": 4.9482780631484094e-05, + "loss": 5.405, + "step": 10907 + }, + { + "epoch": 0.06487296602911789, + "grad_norm": 1.624617338180542, + "learning_rate": 4.9482686105203425e-05, + "loss": 5.3537, + "step": 10908 + }, + { + "epoch": 0.0648789133123989, + "grad_norm": 1.5108991861343384, + "learning_rate": 4.94825915703761e-05, + "loss": 5.1709, + "step": 10909 + }, + { + "epoch": 0.0648848605956799, + "grad_norm": 1.571028470993042, + "learning_rate": 4.948249702700215e-05, + "loss": 5.1374, + "step": 10910 + }, + { + "epoch": 0.06489080787896088, + "grad_norm": 1.3280094861984253, + "learning_rate": 4.948240247508162e-05, + "loss": 5.3469, + "step": 10911 + }, + { + "epoch": 0.06489675516224189, + "grad_norm": 1.8487119674682617, + "learning_rate": 4.948230791461454e-05, + "loss": 5.4673, + "step": 10912 + }, + { + "epoch": 0.06490270244552289, + "grad_norm": 1.6253544092178345, + "learning_rate": 4.9482213345600936e-05, + "loss": 5.2096, + "step": 10913 + }, + { + "epoch": 0.06490864972880388, + "grad_norm": 1.8487451076507568, + "learning_rate": 4.9482118768040844e-05, + "loss": 5.1452, + "step": 10914 + }, + { + "epoch": 0.06491459701208488, + "grad_norm": 1.6638668775558472, + "learning_rate": 4.948202418193429e-05, + "loss": 5.2382, + "step": 10915 + }, + { + "epoch": 0.06492054429536588, + "grad_norm": 1.662256121635437, + "learning_rate": 4.9481929587281326e-05, + "loss": 5.3125, + "step": 10916 + }, + { + "epoch": 0.06492649157864687, + "grad_norm": 1.5133339166641235, + "learning_rate": 4.948183498408197e-05, + "loss": 5.2494, + "step": 10917 + }, + { + "epoch": 0.06493243886192787, + "grad_norm": 1.5063300132751465, + "learning_rate": 4.9481740372336256e-05, + "loss": 5.1778, + "step": 10918 + }, + { + "epoch": 0.06493838614520887, + "grad_norm": 1.5223631858825684, + "learning_rate": 4.948164575204421e-05, + "loss": 5.1773, + "step": 10919 + }, + { + "epoch": 0.06494433342848986, + "grad_norm": 1.6163926124572754, + "learning_rate": 4.948155112320589e-05, + "loss": 5.2669, + "step": 10920 + }, + { + "epoch": 0.06495028071177086, + "grad_norm": 1.4077887535095215, + "learning_rate": 4.948145648582131e-05, + "loss": 5.1711, + "step": 10921 + }, + { + "epoch": 0.06495622799505187, + "grad_norm": 1.5710374116897583, + "learning_rate": 4.9481361839890505e-05, + "loss": 5.1687, + "step": 10922 + }, + { + "epoch": 0.06496217527833285, + "grad_norm": 1.5444159507751465, + "learning_rate": 4.9481267185413506e-05, + "loss": 5.2681, + "step": 10923 + }, + { + "epoch": 0.06496812256161386, + "grad_norm": 1.4816917181015015, + "learning_rate": 4.948117252239035e-05, + "loss": 5.2897, + "step": 10924 + }, + { + "epoch": 0.06497406984489486, + "grad_norm": 1.3373851776123047, + "learning_rate": 4.9481077850821075e-05, + "loss": 5.1607, + "step": 10925 + }, + { + "epoch": 0.06498001712817585, + "grad_norm": 1.7353702783584595, + "learning_rate": 4.948098317070571e-05, + "loss": 5.2546, + "step": 10926 + }, + { + "epoch": 0.06498596441145685, + "grad_norm": 1.4494054317474365, + "learning_rate": 4.948088848204428e-05, + "loss": 5.2244, + "step": 10927 + }, + { + "epoch": 0.06499191169473785, + "grad_norm": 1.6031813621520996, + "learning_rate": 4.9480793784836825e-05, + "loss": 5.2487, + "step": 10928 + }, + { + "epoch": 0.06499785897801884, + "grad_norm": 1.4134970903396606, + "learning_rate": 4.948069907908338e-05, + "loss": 5.2224, + "step": 10929 + }, + { + "epoch": 0.06500380626129984, + "grad_norm": 1.5790150165557861, + "learning_rate": 4.948060436478398e-05, + "loss": 5.3096, + "step": 10930 + }, + { + "epoch": 0.06500975354458084, + "grad_norm": 1.3925936222076416, + "learning_rate": 4.9480509641938644e-05, + "loss": 5.1823, + "step": 10931 + }, + { + "epoch": 0.06501570082786183, + "grad_norm": 1.40078866481781, + "learning_rate": 4.948041491054742e-05, + "loss": 5.1352, + "step": 10932 + }, + { + "epoch": 0.06502164811114283, + "grad_norm": 1.509726881980896, + "learning_rate": 4.948032017061034e-05, + "loss": 5.199, + "step": 10933 + }, + { + "epoch": 0.06502759539442382, + "grad_norm": 1.5671876668930054, + "learning_rate": 4.948022542212743e-05, + "loss": 5.2323, + "step": 10934 + }, + { + "epoch": 0.06503354267770482, + "grad_norm": 1.5019149780273438, + "learning_rate": 4.948013066509872e-05, + "loss": 5.244, + "step": 10935 + }, + { + "epoch": 0.06503948996098582, + "grad_norm": 1.576842188835144, + "learning_rate": 4.948003589952426e-05, + "loss": 5.153, + "step": 10936 + }, + { + "epoch": 0.06504543724426681, + "grad_norm": 1.4069315195083618, + "learning_rate": 4.9479941125404074e-05, + "loss": 5.3396, + "step": 10937 + }, + { + "epoch": 0.06505138452754781, + "grad_norm": 1.6663076877593994, + "learning_rate": 4.947984634273818e-05, + "loss": 5.223, + "step": 10938 + }, + { + "epoch": 0.06505733181082882, + "grad_norm": 1.5132073163986206, + "learning_rate": 4.947975155152663e-05, + "loss": 5.1335, + "step": 10939 + }, + { + "epoch": 0.0650632790941098, + "grad_norm": 1.59386146068573, + "learning_rate": 4.9479656751769455e-05, + "loss": 5.4893, + "step": 10940 + }, + { + "epoch": 0.06506922637739081, + "grad_norm": 1.3486778736114502, + "learning_rate": 4.9479561943466686e-05, + "loss": 5.2164, + "step": 10941 + }, + { + "epoch": 0.06507517366067181, + "grad_norm": 1.4107574224472046, + "learning_rate": 4.947946712661835e-05, + "loss": 5.2337, + "step": 10942 + }, + { + "epoch": 0.0650811209439528, + "grad_norm": 1.6905080080032349, + "learning_rate": 4.947937230122449e-05, + "loss": 5.1749, + "step": 10943 + }, + { + "epoch": 0.0650870682272338, + "grad_norm": 1.5062333345413208, + "learning_rate": 4.947927746728513e-05, + "loss": 5.2227, + "step": 10944 + }, + { + "epoch": 0.0650930155105148, + "grad_norm": 1.4318712949752808, + "learning_rate": 4.947918262480031e-05, + "loss": 5.1565, + "step": 10945 + }, + { + "epoch": 0.06509896279379579, + "grad_norm": 1.5121338367462158, + "learning_rate": 4.9479087773770055e-05, + "loss": 5.3718, + "step": 10946 + }, + { + "epoch": 0.06510491007707679, + "grad_norm": 1.2901450395584106, + "learning_rate": 4.947899291419441e-05, + "loss": 5.291, + "step": 10947 + }, + { + "epoch": 0.0651108573603578, + "grad_norm": 1.5350853204727173, + "learning_rate": 4.9478898046073394e-05, + "loss": 5.411, + "step": 10948 + }, + { + "epoch": 0.06511680464363878, + "grad_norm": 1.5083260536193848, + "learning_rate": 4.947880316940705e-05, + "loss": 4.9143, + "step": 10949 + }, + { + "epoch": 0.06512275192691978, + "grad_norm": 1.462415099143982, + "learning_rate": 4.947870828419541e-05, + "loss": 5.0059, + "step": 10950 + }, + { + "epoch": 0.06512869921020079, + "grad_norm": 1.9356911182403564, + "learning_rate": 4.947861339043851e-05, + "loss": 5.3886, + "step": 10951 + }, + { + "epoch": 0.06513464649348177, + "grad_norm": 1.4918417930603027, + "learning_rate": 4.947851848813637e-05, + "loss": 5.3456, + "step": 10952 + }, + { + "epoch": 0.06514059377676278, + "grad_norm": 1.8015687465667725, + "learning_rate": 4.9478423577289044e-05, + "loss": 5.4599, + "step": 10953 + }, + { + "epoch": 0.06514654106004378, + "grad_norm": 1.663827657699585, + "learning_rate": 4.947832865789654e-05, + "loss": 5.4448, + "step": 10954 + }, + { + "epoch": 0.06515248834332477, + "grad_norm": 1.7196985483169556, + "learning_rate": 4.947823372995891e-05, + "loss": 5.4799, + "step": 10955 + }, + { + "epoch": 0.06515843562660577, + "grad_norm": 1.341449499130249, + "learning_rate": 4.947813879347619e-05, + "loss": 5.0305, + "step": 10956 + }, + { + "epoch": 0.06516438290988677, + "grad_norm": 1.9917103052139282, + "learning_rate": 4.9478043848448394e-05, + "loss": 4.9911, + "step": 10957 + }, + { + "epoch": 0.06517033019316776, + "grad_norm": 1.8540695905685425, + "learning_rate": 4.947794889487557e-05, + "loss": 4.9725, + "step": 10958 + }, + { + "epoch": 0.06517627747644876, + "grad_norm": 1.6755226850509644, + "learning_rate": 4.9477853932757744e-05, + "loss": 5.1452, + "step": 10959 + }, + { + "epoch": 0.06518222475972976, + "grad_norm": 1.613694667816162, + "learning_rate": 4.9477758962094954e-05, + "loss": 5.1241, + "step": 10960 + }, + { + "epoch": 0.06518817204301075, + "grad_norm": 1.4891341924667358, + "learning_rate": 4.9477663982887235e-05, + "loss": 5.2139, + "step": 10961 + }, + { + "epoch": 0.06519411932629175, + "grad_norm": 1.451180100440979, + "learning_rate": 4.947756899513461e-05, + "loss": 5.216, + "step": 10962 + }, + { + "epoch": 0.06520006660957274, + "grad_norm": 1.7225643396377563, + "learning_rate": 4.947747399883712e-05, + "loss": 4.9342, + "step": 10963 + }, + { + "epoch": 0.06520601389285374, + "grad_norm": 1.5917341709136963, + "learning_rate": 4.94773789939948e-05, + "loss": 4.9196, + "step": 10964 + }, + { + "epoch": 0.06521196117613474, + "grad_norm": 1.3010936975479126, + "learning_rate": 4.947728398060768e-05, + "loss": 4.8165, + "step": 10965 + }, + { + "epoch": 0.06521790845941573, + "grad_norm": 1.6672911643981934, + "learning_rate": 4.947718895867579e-05, + "loss": 5.082, + "step": 10966 + }, + { + "epoch": 0.06522385574269673, + "grad_norm": 1.5662728548049927, + "learning_rate": 4.947709392819916e-05, + "loss": 5.1654, + "step": 10967 + }, + { + "epoch": 0.06522980302597774, + "grad_norm": 1.3455015420913696, + "learning_rate": 4.947699888917784e-05, + "loss": 4.6897, + "step": 10968 + }, + { + "epoch": 0.06523575030925872, + "grad_norm": 1.6042569875717163, + "learning_rate": 4.947690384161185e-05, + "loss": 4.6814, + "step": 10969 + }, + { + "epoch": 0.06524169759253973, + "grad_norm": 1.436345100402832, + "learning_rate": 4.947680878550123e-05, + "loss": 4.6052, + "step": 10970 + }, + { + "epoch": 0.06524764487582073, + "grad_norm": 1.3438220024108887, + "learning_rate": 4.9476713720846e-05, + "loss": 4.6385, + "step": 10971 + }, + { + "epoch": 0.06525359215910172, + "grad_norm": 1.378206729888916, + "learning_rate": 4.94766186476462e-05, + "loss": 4.5546, + "step": 10972 + }, + { + "epoch": 0.06525953944238272, + "grad_norm": 1.5776808261871338, + "learning_rate": 4.9476523565901874e-05, + "loss": 4.7728, + "step": 10973 + }, + { + "epoch": 0.06526548672566372, + "grad_norm": 1.8892265558242798, + "learning_rate": 4.947642847561305e-05, + "loss": 5.3423, + "step": 10974 + }, + { + "epoch": 0.06527143400894471, + "grad_norm": 1.279730200767517, + "learning_rate": 4.9476333376779746e-05, + "loss": 4.649, + "step": 10975 + }, + { + "epoch": 0.06527738129222571, + "grad_norm": 1.6268417835235596, + "learning_rate": 4.947623826940201e-05, + "loss": 4.6534, + "step": 10976 + }, + { + "epoch": 0.06528332857550671, + "grad_norm": 1.4456939697265625, + "learning_rate": 4.947614315347987e-05, + "loss": 4.6636, + "step": 10977 + }, + { + "epoch": 0.0652892758587877, + "grad_norm": 1.4848358631134033, + "learning_rate": 4.947604802901337e-05, + "loss": 4.6823, + "step": 10978 + }, + { + "epoch": 0.0652952231420687, + "grad_norm": 1.4143959283828735, + "learning_rate": 4.947595289600253e-05, + "loss": 4.546, + "step": 10979 + }, + { + "epoch": 0.0653011704253497, + "grad_norm": 1.7399781942367554, + "learning_rate": 4.947585775444739e-05, + "loss": 5.1456, + "step": 10980 + }, + { + "epoch": 0.0653071177086307, + "grad_norm": 1.9160579442977905, + "learning_rate": 4.947576260434797e-05, + "loss": 5.4101, + "step": 10981 + }, + { + "epoch": 0.0653130649919117, + "grad_norm": 1.9356415271759033, + "learning_rate": 4.947566744570433e-05, + "loss": 5.6235, + "step": 10982 + }, + { + "epoch": 0.0653190122751927, + "grad_norm": 1.756996512413025, + "learning_rate": 4.947557227851648e-05, + "loss": 5.6458, + "step": 10983 + }, + { + "epoch": 0.06532495955847369, + "grad_norm": 1.790447473526001, + "learning_rate": 4.947547710278446e-05, + "loss": 5.1529, + "step": 10984 + }, + { + "epoch": 0.06533090684175469, + "grad_norm": 1.8125256299972534, + "learning_rate": 4.94753819185083e-05, + "loss": 4.8824, + "step": 10985 + }, + { + "epoch": 0.06533685412503569, + "grad_norm": 1.72708261013031, + "learning_rate": 4.947528672568804e-05, + "loss": 5.1252, + "step": 10986 + }, + { + "epoch": 0.06534280140831668, + "grad_norm": 1.5867630243301392, + "learning_rate": 4.9475191524323714e-05, + "loss": 5.2007, + "step": 10987 + }, + { + "epoch": 0.06534874869159768, + "grad_norm": 1.8278383016586304, + "learning_rate": 4.9475096314415356e-05, + "loss": 5.1268, + "step": 10988 + }, + { + "epoch": 0.06535469597487868, + "grad_norm": 1.6850647926330566, + "learning_rate": 4.947500109596298e-05, + "loss": 5.0058, + "step": 10989 + }, + { + "epoch": 0.06536064325815967, + "grad_norm": 1.4993211030960083, + "learning_rate": 4.9474905868966645e-05, + "loss": 5.1911, + "step": 10990 + }, + { + "epoch": 0.06536659054144067, + "grad_norm": 1.4816709756851196, + "learning_rate": 4.947481063342637e-05, + "loss": 5.073, + "step": 10991 + }, + { + "epoch": 0.06537253782472166, + "grad_norm": 1.5394763946533203, + "learning_rate": 4.9474715389342194e-05, + "loss": 5.3133, + "step": 10992 + }, + { + "epoch": 0.06537848510800266, + "grad_norm": 1.6095061302185059, + "learning_rate": 4.9474620136714144e-05, + "loss": 5.1657, + "step": 10993 + }, + { + "epoch": 0.06538443239128366, + "grad_norm": 1.707533597946167, + "learning_rate": 4.947452487554226e-05, + "loss": 5.2022, + "step": 10994 + }, + { + "epoch": 0.06539037967456465, + "grad_norm": 1.6304863691329956, + "learning_rate": 4.947442960582657e-05, + "loss": 5.1454, + "step": 10995 + }, + { + "epoch": 0.06539632695784565, + "grad_norm": 1.5767943859100342, + "learning_rate": 4.9474334327567103e-05, + "loss": 5.0317, + "step": 10996 + }, + { + "epoch": 0.06540227424112666, + "grad_norm": 1.6779369115829468, + "learning_rate": 4.9474239040763916e-05, + "loss": 5.1932, + "step": 10997 + }, + { + "epoch": 0.06540822152440764, + "grad_norm": 1.6607457399368286, + "learning_rate": 4.947414374541701e-05, + "loss": 5.2488, + "step": 10998 + }, + { + "epoch": 0.06541416880768865, + "grad_norm": 1.5271342992782593, + "learning_rate": 4.947404844152644e-05, + "loss": 5.2225, + "step": 10999 + }, + { + "epoch": 0.06542011609096965, + "grad_norm": 1.3633404970169067, + "learning_rate": 4.947395312909223e-05, + "loss": 5.2228, + "step": 11000 + }, + { + "epoch": 0.06542606337425064, + "grad_norm": 1.4911702871322632, + "learning_rate": 4.9473857808114416e-05, + "loss": 5.3533, + "step": 11001 + }, + { + "epoch": 0.06543201065753164, + "grad_norm": 1.350714087486267, + "learning_rate": 4.947376247859303e-05, + "loss": 5.2553, + "step": 11002 + }, + { + "epoch": 0.06543795794081264, + "grad_norm": 1.531064510345459, + "learning_rate": 4.9473667140528116e-05, + "loss": 5.0982, + "step": 11003 + }, + { + "epoch": 0.06544390522409363, + "grad_norm": 1.4037193059921265, + "learning_rate": 4.947357179391968e-05, + "loss": 5.2129, + "step": 11004 + }, + { + "epoch": 0.06544985250737463, + "grad_norm": 1.5746560096740723, + "learning_rate": 4.9473476438767784e-05, + "loss": 5.2561, + "step": 11005 + }, + { + "epoch": 0.06545579979065563, + "grad_norm": 1.4906586408615112, + "learning_rate": 4.947338107507245e-05, + "loss": 5.2584, + "step": 11006 + }, + { + "epoch": 0.06546174707393662, + "grad_norm": 1.687965989112854, + "learning_rate": 4.947328570283371e-05, + "loss": 5.0578, + "step": 11007 + }, + { + "epoch": 0.06546769435721762, + "grad_norm": 1.6732810735702515, + "learning_rate": 4.94731903220516e-05, + "loss": 5.1301, + "step": 11008 + }, + { + "epoch": 0.06547364164049863, + "grad_norm": 1.465431809425354, + "learning_rate": 4.947309493272615e-05, + "loss": 5.2479, + "step": 11009 + }, + { + "epoch": 0.06547958892377961, + "grad_norm": 1.4699040651321411, + "learning_rate": 4.94729995348574e-05, + "loss": 5.263, + "step": 11010 + }, + { + "epoch": 0.06548553620706062, + "grad_norm": 1.5757801532745361, + "learning_rate": 4.947290412844537e-05, + "loss": 5.2938, + "step": 11011 + }, + { + "epoch": 0.06549148349034162, + "grad_norm": 1.5458070039749146, + "learning_rate": 4.947280871349011e-05, + "loss": 5.2755, + "step": 11012 + }, + { + "epoch": 0.0654974307736226, + "grad_norm": 1.4919404983520508, + "learning_rate": 4.9472713289991644e-05, + "loss": 5.1432, + "step": 11013 + }, + { + "epoch": 0.06550337805690361, + "grad_norm": 1.513539433479309, + "learning_rate": 4.947261785795001e-05, + "loss": 5.3262, + "step": 11014 + }, + { + "epoch": 0.06550932534018461, + "grad_norm": 1.610257863998413, + "learning_rate": 4.947252241736523e-05, + "loss": 5.1444, + "step": 11015 + }, + { + "epoch": 0.0655152726234656, + "grad_norm": 1.5597975254058838, + "learning_rate": 4.947242696823735e-05, + "loss": 5.1581, + "step": 11016 + }, + { + "epoch": 0.0655212199067466, + "grad_norm": 1.686418056488037, + "learning_rate": 4.94723315105664e-05, + "loss": 5.1608, + "step": 11017 + }, + { + "epoch": 0.0655271671900276, + "grad_norm": 1.5329445600509644, + "learning_rate": 4.94722360443524e-05, + "loss": 5.1716, + "step": 11018 + }, + { + "epoch": 0.06553311447330859, + "grad_norm": 1.4718917608261108, + "learning_rate": 4.94721405695954e-05, + "loss": 5.0924, + "step": 11019 + }, + { + "epoch": 0.06553906175658959, + "grad_norm": 1.4442907571792603, + "learning_rate": 4.947204508629544e-05, + "loss": 5.3967, + "step": 11020 + }, + { + "epoch": 0.06554500903987058, + "grad_norm": 1.523834466934204, + "learning_rate": 4.947194959445253e-05, + "loss": 5.2068, + "step": 11021 + }, + { + "epoch": 0.06555095632315158, + "grad_norm": 1.4898262023925781, + "learning_rate": 4.947185409406672e-05, + "loss": 5.1664, + "step": 11022 + }, + { + "epoch": 0.06555690360643258, + "grad_norm": 1.504695177078247, + "learning_rate": 4.947175858513804e-05, + "loss": 5.2349, + "step": 11023 + }, + { + "epoch": 0.06556285088971357, + "grad_norm": 1.3538787364959717, + "learning_rate": 4.9471663067666516e-05, + "loss": 5.1034, + "step": 11024 + }, + { + "epoch": 0.06556879817299457, + "grad_norm": 1.3748440742492676, + "learning_rate": 4.94715675416522e-05, + "loss": 4.9759, + "step": 11025 + }, + { + "epoch": 0.06557474545627558, + "grad_norm": 1.5980280637741089, + "learning_rate": 4.94714720070951e-05, + "loss": 5.3042, + "step": 11026 + }, + { + "epoch": 0.06558069273955656, + "grad_norm": 1.641076683998108, + "learning_rate": 4.9471376463995266e-05, + "loss": 5.3373, + "step": 11027 + }, + { + "epoch": 0.06558664002283757, + "grad_norm": 1.5320390462875366, + "learning_rate": 4.947128091235273e-05, + "loss": 5.2308, + "step": 11028 + }, + { + "epoch": 0.06559258730611857, + "grad_norm": 1.5777555704116821, + "learning_rate": 4.9471185352167514e-05, + "loss": 5.2242, + "step": 11029 + }, + { + "epoch": 0.06559853458939956, + "grad_norm": 1.5055029392242432, + "learning_rate": 4.947108978343967e-05, + "loss": 5.1974, + "step": 11030 + }, + { + "epoch": 0.06560448187268056, + "grad_norm": 1.3923927545547485, + "learning_rate": 4.947099420616922e-05, + "loss": 5.3244, + "step": 11031 + }, + { + "epoch": 0.06561042915596156, + "grad_norm": 1.40999174118042, + "learning_rate": 4.9470898620356186e-05, + "loss": 5.3315, + "step": 11032 + }, + { + "epoch": 0.06561637643924255, + "grad_norm": 1.418296456336975, + "learning_rate": 4.947080302600063e-05, + "loss": 5.3942, + "step": 11033 + }, + { + "epoch": 0.06562232372252355, + "grad_norm": 1.7927478551864624, + "learning_rate": 4.9470707423102566e-05, + "loss": 5.3084, + "step": 11034 + }, + { + "epoch": 0.06562827100580455, + "grad_norm": 1.385011911392212, + "learning_rate": 4.947061181166203e-05, + "loss": 5.2043, + "step": 11035 + }, + { + "epoch": 0.06563421828908554, + "grad_norm": 1.5702954530715942, + "learning_rate": 4.9470516191679054e-05, + "loss": 5.9851, + "step": 11036 + }, + { + "epoch": 0.06564016557236654, + "grad_norm": 1.4196525812149048, + "learning_rate": 4.947042056315367e-05, + "loss": 5.2592, + "step": 11037 + }, + { + "epoch": 0.06564611285564755, + "grad_norm": 1.8318798542022705, + "learning_rate": 4.947032492608592e-05, + "loss": 5.3181, + "step": 11038 + }, + { + "epoch": 0.06565206013892853, + "grad_norm": 1.615460991859436, + "learning_rate": 4.947022928047583e-05, + "loss": 5.4053, + "step": 11039 + }, + { + "epoch": 0.06565800742220954, + "grad_norm": 1.384602427482605, + "learning_rate": 4.947013362632344e-05, + "loss": 5.3955, + "step": 11040 + }, + { + "epoch": 0.06566395470549054, + "grad_norm": 1.5959913730621338, + "learning_rate": 4.947003796362878e-05, + "loss": 5.4737, + "step": 11041 + }, + { + "epoch": 0.06566990198877153, + "grad_norm": 1.483659029006958, + "learning_rate": 4.946994229239188e-05, + "loss": 5.3804, + "step": 11042 + }, + { + "epoch": 0.06567584927205253, + "grad_norm": 1.2752004861831665, + "learning_rate": 4.946984661261277e-05, + "loss": 5.3806, + "step": 11043 + }, + { + "epoch": 0.06568179655533353, + "grad_norm": 2.0671582221984863, + "learning_rate": 4.946975092429149e-05, + "loss": 5.3047, + "step": 11044 + }, + { + "epoch": 0.06568774383861452, + "grad_norm": 1.6126081943511963, + "learning_rate": 4.946965522742808e-05, + "loss": 5.1905, + "step": 11045 + }, + { + "epoch": 0.06569369112189552, + "grad_norm": 1.6867598295211792, + "learning_rate": 4.946955952202257e-05, + "loss": 5.1543, + "step": 11046 + }, + { + "epoch": 0.06569963840517652, + "grad_norm": 1.3493974208831787, + "learning_rate": 4.946946380807498e-05, + "loss": 5.1527, + "step": 11047 + }, + { + "epoch": 0.06570558568845751, + "grad_norm": 1.4694898128509521, + "learning_rate": 4.946936808558536e-05, + "loss": 5.238, + "step": 11048 + }, + { + "epoch": 0.06571153297173851, + "grad_norm": 1.7940189838409424, + "learning_rate": 4.946927235455373e-05, + "loss": 5.0666, + "step": 11049 + }, + { + "epoch": 0.0657174802550195, + "grad_norm": 1.7015198469161987, + "learning_rate": 4.946917661498013e-05, + "loss": 5.5182, + "step": 11050 + }, + { + "epoch": 0.0657234275383005, + "grad_norm": 2.214686632156372, + "learning_rate": 4.946908086686459e-05, + "loss": 5.9424, + "step": 11051 + }, + { + "epoch": 0.0657293748215815, + "grad_norm": 1.7855008840560913, + "learning_rate": 4.9468985110207154e-05, + "loss": 5.8496, + "step": 11052 + }, + { + "epoch": 0.06573532210486249, + "grad_norm": 1.8354082107543945, + "learning_rate": 4.946888934500785e-05, + "loss": 5.8044, + "step": 11053 + }, + { + "epoch": 0.0657412693881435, + "grad_norm": 2.0321154594421387, + "learning_rate": 4.9468793571266705e-05, + "loss": 5.9488, + "step": 11054 + }, + { + "epoch": 0.0657472166714245, + "grad_norm": 2.2285213470458984, + "learning_rate": 4.946869778898376e-05, + "loss": 5.1819, + "step": 11055 + }, + { + "epoch": 0.06575316395470548, + "grad_norm": 1.9831287860870361, + "learning_rate": 4.946860199815904e-05, + "loss": 5.2068, + "step": 11056 + }, + { + "epoch": 0.06575911123798649, + "grad_norm": 2.1150667667388916, + "learning_rate": 4.946850619879259e-05, + "loss": 5.1523, + "step": 11057 + }, + { + "epoch": 0.06576505852126749, + "grad_norm": 1.9136968851089478, + "learning_rate": 4.946841039088444e-05, + "loss": 5.0084, + "step": 11058 + }, + { + "epoch": 0.06577100580454848, + "grad_norm": 1.9802511930465698, + "learning_rate": 4.9468314574434604e-05, + "loss": 4.9223, + "step": 11059 + }, + { + "epoch": 0.06577695308782948, + "grad_norm": 1.940656065940857, + "learning_rate": 4.946821874944315e-05, + "loss": 4.9662, + "step": 11060 + }, + { + "epoch": 0.06578290037111048, + "grad_norm": 1.8476706743240356, + "learning_rate": 4.9468122915910084e-05, + "loss": 4.8863, + "step": 11061 + }, + { + "epoch": 0.06578884765439147, + "grad_norm": 2.0490243434906006, + "learning_rate": 4.946802707383546e-05, + "loss": 4.8459, + "step": 11062 + }, + { + "epoch": 0.06579479493767247, + "grad_norm": 1.8996137380599976, + "learning_rate": 4.946793122321928e-05, + "loss": 4.7574, + "step": 11063 + }, + { + "epoch": 0.06580074222095347, + "grad_norm": 1.8910033702850342, + "learning_rate": 4.946783536406161e-05, + "loss": 4.8808, + "step": 11064 + }, + { + "epoch": 0.06580668950423446, + "grad_norm": 2.123816967010498, + "learning_rate": 4.946773949636247e-05, + "loss": 4.8486, + "step": 11065 + }, + { + "epoch": 0.06581263678751546, + "grad_norm": 1.7508260011672974, + "learning_rate": 4.9467643620121906e-05, + "loss": 4.9856, + "step": 11066 + }, + { + "epoch": 0.06581858407079647, + "grad_norm": 1.728398084640503, + "learning_rate": 4.9467547735339926e-05, + "loss": 4.9634, + "step": 11067 + }, + { + "epoch": 0.06582453135407745, + "grad_norm": 2.1020689010620117, + "learning_rate": 4.946745184201659e-05, + "loss": 4.6133, + "step": 11068 + }, + { + "epoch": 0.06583047863735846, + "grad_norm": 2.106549024581909, + "learning_rate": 4.9467355940151904e-05, + "loss": 4.7124, + "step": 11069 + }, + { + "epoch": 0.06583642592063946, + "grad_norm": 2.078505039215088, + "learning_rate": 4.9467260029745924e-05, + "loss": 4.5828, + "step": 11070 + }, + { + "epoch": 0.06584237320392045, + "grad_norm": 1.987950325012207, + "learning_rate": 4.946716411079868e-05, + "loss": 4.5823, + "step": 11071 + }, + { + "epoch": 0.06584832048720145, + "grad_norm": 1.9027208089828491, + "learning_rate": 4.94670681833102e-05, + "loss": 4.8063, + "step": 11072 + }, + { + "epoch": 0.06585426777048245, + "grad_norm": 2.001823902130127, + "learning_rate": 4.946697224728052e-05, + "loss": 4.5405, + "step": 11073 + }, + { + "epoch": 0.06586021505376344, + "grad_norm": 2.1472394466400146, + "learning_rate": 4.946687630270967e-05, + "loss": 4.6565, + "step": 11074 + }, + { + "epoch": 0.06586616233704444, + "grad_norm": 2.0731146335601807, + "learning_rate": 4.946678034959769e-05, + "loss": 4.5022, + "step": 11075 + }, + { + "epoch": 0.06587210962032544, + "grad_norm": 2.0769810676574707, + "learning_rate": 4.946668438794461e-05, + "loss": 4.5248, + "step": 11076 + }, + { + "epoch": 0.06587805690360643, + "grad_norm": 2.183871269226074, + "learning_rate": 4.946658841775046e-05, + "loss": 4.5723, + "step": 11077 + }, + { + "epoch": 0.06588400418688743, + "grad_norm": 2.0304160118103027, + "learning_rate": 4.9466492439015275e-05, + "loss": 4.5928, + "step": 11078 + }, + { + "epoch": 0.06588995147016842, + "grad_norm": 1.9167170524597168, + "learning_rate": 4.94663964517391e-05, + "loss": 4.4162, + "step": 11079 + }, + { + "epoch": 0.06589589875344942, + "grad_norm": 2.1295299530029297, + "learning_rate": 4.9466300455921946e-05, + "loss": 4.6662, + "step": 11080 + }, + { + "epoch": 0.06590184603673042, + "grad_norm": 2.180253744125366, + "learning_rate": 4.946620445156386e-05, + "loss": 4.5101, + "step": 11081 + }, + { + "epoch": 0.06590779332001141, + "grad_norm": 1.887289047241211, + "learning_rate": 4.9466108438664885e-05, + "loss": 4.3611, + "step": 11082 + }, + { + "epoch": 0.06591374060329241, + "grad_norm": 1.8323948383331299, + "learning_rate": 4.946601241722504e-05, + "loss": 4.8711, + "step": 11083 + }, + { + "epoch": 0.06591968788657342, + "grad_norm": 1.944860577583313, + "learning_rate": 4.946591638724436e-05, + "loss": 4.5288, + "step": 11084 + }, + { + "epoch": 0.0659256351698544, + "grad_norm": 1.9748528003692627, + "learning_rate": 4.946582034872288e-05, + "loss": 4.3819, + "step": 11085 + }, + { + "epoch": 0.0659315824531354, + "grad_norm": 2.017582416534424, + "learning_rate": 4.9465724301660635e-05, + "loss": 4.4508, + "step": 11086 + }, + { + "epoch": 0.06593752973641641, + "grad_norm": 1.8043986558914185, + "learning_rate": 4.946562824605766e-05, + "loss": 4.5948, + "step": 11087 + }, + { + "epoch": 0.0659434770196974, + "grad_norm": 1.8695666790008545, + "learning_rate": 4.946553218191399e-05, + "loss": 4.2691, + "step": 11088 + }, + { + "epoch": 0.0659494243029784, + "grad_norm": 2.027717351913452, + "learning_rate": 4.9465436109229656e-05, + "loss": 4.4152, + "step": 11089 + }, + { + "epoch": 0.0659553715862594, + "grad_norm": 1.989127278327942, + "learning_rate": 4.946534002800469e-05, + "loss": 4.5155, + "step": 11090 + }, + { + "epoch": 0.06596131886954039, + "grad_norm": 1.9889907836914062, + "learning_rate": 4.9465243938239124e-05, + "loss": 4.4047, + "step": 11091 + }, + { + "epoch": 0.06596726615282139, + "grad_norm": 2.077021837234497, + "learning_rate": 4.946514783993299e-05, + "loss": 4.5199, + "step": 11092 + }, + { + "epoch": 0.0659732134361024, + "grad_norm": 1.9180271625518799, + "learning_rate": 4.946505173308633e-05, + "loss": 4.4511, + "step": 11093 + }, + { + "epoch": 0.06597916071938338, + "grad_norm": 2.120338201522827, + "learning_rate": 4.946495561769918e-05, + "loss": 4.3034, + "step": 11094 + }, + { + "epoch": 0.06598510800266438, + "grad_norm": 1.9632322788238525, + "learning_rate": 4.946485949377156e-05, + "loss": 5.2411, + "step": 11095 + }, + { + "epoch": 0.06599105528594539, + "grad_norm": 2.0921249389648438, + "learning_rate": 4.946476336130351e-05, + "loss": 4.5768, + "step": 11096 + }, + { + "epoch": 0.06599700256922637, + "grad_norm": 2.1472532749176025, + "learning_rate": 4.9464667220295066e-05, + "loss": 4.6279, + "step": 11097 + }, + { + "epoch": 0.06600294985250738, + "grad_norm": 2.472062349319458, + "learning_rate": 4.946457107074626e-05, + "loss": 5.703, + "step": 11098 + }, + { + "epoch": 0.06600889713578838, + "grad_norm": 1.8995217084884644, + "learning_rate": 4.946447491265712e-05, + "loss": 4.5265, + "step": 11099 + }, + { + "epoch": 0.06601484441906937, + "grad_norm": 2.173339605331421, + "learning_rate": 4.946437874602769e-05, + "loss": 4.5356, + "step": 11100 + }, + { + "epoch": 0.06602079170235037, + "grad_norm": 1.8179867267608643, + "learning_rate": 4.9464282570858e-05, + "loss": 4.3765, + "step": 11101 + }, + { + "epoch": 0.06602673898563137, + "grad_norm": 2.367713212966919, + "learning_rate": 4.946418638714808e-05, + "loss": 5.6831, + "step": 11102 + }, + { + "epoch": 0.06603268626891236, + "grad_norm": 2.3576571941375732, + "learning_rate": 4.9464090194897964e-05, + "loss": 5.563, + "step": 11103 + }, + { + "epoch": 0.06603863355219336, + "grad_norm": 2.0476090908050537, + "learning_rate": 4.946399399410768e-05, + "loss": 5.7503, + "step": 11104 + }, + { + "epoch": 0.06604458083547436, + "grad_norm": 2.104295253753662, + "learning_rate": 4.946389778477728e-05, + "loss": 5.669, + "step": 11105 + }, + { + "epoch": 0.06605052811875535, + "grad_norm": 2.1458580493927, + "learning_rate": 4.946380156690677e-05, + "loss": 5.5317, + "step": 11106 + }, + { + "epoch": 0.06605647540203635, + "grad_norm": 2.0373425483703613, + "learning_rate": 4.946370534049621e-05, + "loss": 5.5952, + "step": 11107 + }, + { + "epoch": 0.06606242268531734, + "grad_norm": 2.232574701309204, + "learning_rate": 4.946360910554563e-05, + "loss": 5.6076, + "step": 11108 + }, + { + "epoch": 0.06606836996859834, + "grad_norm": 2.1477861404418945, + "learning_rate": 4.946351286205505e-05, + "loss": 5.5862, + "step": 11109 + }, + { + "epoch": 0.06607431725187934, + "grad_norm": 2.105203866958618, + "learning_rate": 4.946341661002451e-05, + "loss": 5.5089, + "step": 11110 + }, + { + "epoch": 0.06608026453516033, + "grad_norm": 2.1524410247802734, + "learning_rate": 4.9463320349454047e-05, + "loss": 5.419, + "step": 11111 + }, + { + "epoch": 0.06608621181844133, + "grad_norm": 2.132504463195801, + "learning_rate": 4.946322408034369e-05, + "loss": 5.3421, + "step": 11112 + }, + { + "epoch": 0.06609215910172234, + "grad_norm": 1.7870386838912964, + "learning_rate": 4.9463127802693474e-05, + "loss": 5.1829, + "step": 11113 + }, + { + "epoch": 0.06609810638500332, + "grad_norm": 1.9586358070373535, + "learning_rate": 4.946303151650343e-05, + "loss": 5.228, + "step": 11114 + }, + { + "epoch": 0.06610405366828433, + "grad_norm": 2.092473030090332, + "learning_rate": 4.9462935221773594e-05, + "loss": 5.4616, + "step": 11115 + }, + { + "epoch": 0.06611000095156533, + "grad_norm": 2.204131603240967, + "learning_rate": 4.946283891850401e-05, + "loss": 5.4552, + "step": 11116 + }, + { + "epoch": 0.06611594823484632, + "grad_norm": 1.998795747756958, + "learning_rate": 4.946274260669469e-05, + "loss": 5.5193, + "step": 11117 + }, + { + "epoch": 0.06612189551812732, + "grad_norm": 1.9446638822555542, + "learning_rate": 4.9462646286345684e-05, + "loss": 5.3923, + "step": 11118 + }, + { + "epoch": 0.06612784280140832, + "grad_norm": 1.828114628791809, + "learning_rate": 4.946254995745702e-05, + "loss": 5.4306, + "step": 11119 + }, + { + "epoch": 0.06613379008468931, + "grad_norm": 2.1322944164276123, + "learning_rate": 4.946245362002873e-05, + "loss": 5.3831, + "step": 11120 + }, + { + "epoch": 0.06613973736797031, + "grad_norm": 2.1194324493408203, + "learning_rate": 4.9462357274060856e-05, + "loss": 5.2805, + "step": 11121 + }, + { + "epoch": 0.06614568465125131, + "grad_norm": 2.011417865753174, + "learning_rate": 4.946226091955342e-05, + "loss": 5.3052, + "step": 11122 + }, + { + "epoch": 0.0661516319345323, + "grad_norm": 2.202887773513794, + "learning_rate": 4.9462164556506464e-05, + "loss": 5.5263, + "step": 11123 + }, + { + "epoch": 0.0661575792178133, + "grad_norm": 2.075645685195923, + "learning_rate": 4.946206818492002e-05, + "loss": 5.1033, + "step": 11124 + }, + { + "epoch": 0.0661635265010943, + "grad_norm": 2.0723443031311035, + "learning_rate": 4.946197180479412e-05, + "loss": 4.8365, + "step": 11125 + }, + { + "epoch": 0.0661694737843753, + "grad_norm": 2.245961904525757, + "learning_rate": 4.94618754161288e-05, + "loss": 5.0123, + "step": 11126 + }, + { + "epoch": 0.0661754210676563, + "grad_norm": 2.0513699054718018, + "learning_rate": 4.9461779018924096e-05, + "loss": 4.9909, + "step": 11127 + }, + { + "epoch": 0.0661813683509373, + "grad_norm": 2.1552181243896484, + "learning_rate": 4.9461682613180024e-05, + "loss": 5.165, + "step": 11128 + }, + { + "epoch": 0.06618731563421829, + "grad_norm": 2.1207263469696045, + "learning_rate": 4.946158619889664e-05, + "loss": 5.3254, + "step": 11129 + }, + { + "epoch": 0.06619326291749929, + "grad_norm": 1.8278319835662842, + "learning_rate": 4.946148977607397e-05, + "loss": 5.2462, + "step": 11130 + }, + { + "epoch": 0.06619921020078029, + "grad_norm": 2.434661865234375, + "learning_rate": 4.9461393344712046e-05, + "loss": 5.28, + "step": 11131 + }, + { + "epoch": 0.06620515748406128, + "grad_norm": 2.3434953689575195, + "learning_rate": 4.9461296904810904e-05, + "loss": 5.112, + "step": 11132 + }, + { + "epoch": 0.06621110476734228, + "grad_norm": 2.010430335998535, + "learning_rate": 4.946120045637057e-05, + "loss": 5.1236, + "step": 11133 + }, + { + "epoch": 0.06621705205062328, + "grad_norm": 2.19608736038208, + "learning_rate": 4.946110399939109e-05, + "loss": 5.122, + "step": 11134 + }, + { + "epoch": 0.06622299933390427, + "grad_norm": 1.9471449851989746, + "learning_rate": 4.946100753387249e-05, + "loss": 5.2849, + "step": 11135 + }, + { + "epoch": 0.06622894661718527, + "grad_norm": 2.0541727542877197, + "learning_rate": 4.94609110598148e-05, + "loss": 5.4196, + "step": 11136 + }, + { + "epoch": 0.06623489390046626, + "grad_norm": 2.268826723098755, + "learning_rate": 4.946081457721806e-05, + "loss": 5.449, + "step": 11137 + }, + { + "epoch": 0.06624084118374726, + "grad_norm": 2.075227975845337, + "learning_rate": 4.9460718086082307e-05, + "loss": 5.5463, + "step": 11138 + }, + { + "epoch": 0.06624678846702826, + "grad_norm": 2.0949649810791016, + "learning_rate": 4.9460621586407567e-05, + "loss": 5.3737, + "step": 11139 + }, + { + "epoch": 0.06625273575030925, + "grad_norm": 2.1247878074645996, + "learning_rate": 4.9460525078193874e-05, + "loss": 5.2766, + "step": 11140 + }, + { + "epoch": 0.06625868303359025, + "grad_norm": 1.8304489850997925, + "learning_rate": 4.9460428561441276e-05, + "loss": 5.181, + "step": 11141 + }, + { + "epoch": 0.06626463031687126, + "grad_norm": 2.160853862762451, + "learning_rate": 4.946033203614978e-05, + "loss": 5.5222, + "step": 11142 + }, + { + "epoch": 0.06627057760015224, + "grad_norm": 1.9857962131500244, + "learning_rate": 4.9460235502319446e-05, + "loss": 5.574, + "step": 11143 + }, + { + "epoch": 0.06627652488343325, + "grad_norm": 2.016709804534912, + "learning_rate": 4.9460138959950294e-05, + "loss": 5.5255, + "step": 11144 + }, + { + "epoch": 0.06628247216671425, + "grad_norm": 1.8675861358642578, + "learning_rate": 4.946004240904235e-05, + "loss": 5.3604, + "step": 11145 + }, + { + "epoch": 0.06628841944999524, + "grad_norm": 1.9159897565841675, + "learning_rate": 4.945994584959567e-05, + "loss": 5.5348, + "step": 11146 + }, + { + "epoch": 0.06629436673327624, + "grad_norm": 2.0460150241851807, + "learning_rate": 4.945984928161027e-05, + "loss": 5.3267, + "step": 11147 + }, + { + "epoch": 0.06630031401655724, + "grad_norm": 1.8361427783966064, + "learning_rate": 4.9459752705086196e-05, + "loss": 5.3309, + "step": 11148 + }, + { + "epoch": 0.06630626129983823, + "grad_norm": 1.5448495149612427, + "learning_rate": 4.945965612002347e-05, + "loss": 5.0789, + "step": 11149 + }, + { + "epoch": 0.06631220858311923, + "grad_norm": 1.4580925703048706, + "learning_rate": 4.9459559526422125e-05, + "loss": 5.2011, + "step": 11150 + }, + { + "epoch": 0.06631815586640023, + "grad_norm": 1.606593370437622, + "learning_rate": 4.945946292428221e-05, + "loss": 5.2061, + "step": 11151 + }, + { + "epoch": 0.06632410314968122, + "grad_norm": 1.4270994663238525, + "learning_rate": 4.945936631360375e-05, + "loss": 5.089, + "step": 11152 + }, + { + "epoch": 0.06633005043296222, + "grad_norm": 1.6082873344421387, + "learning_rate": 4.9459269694386766e-05, + "loss": 5.2502, + "step": 11153 + }, + { + "epoch": 0.06633599771624323, + "grad_norm": 1.5378412008285522, + "learning_rate": 4.945917306663131e-05, + "loss": 5.4431, + "step": 11154 + }, + { + "epoch": 0.06634194499952421, + "grad_norm": 1.2726879119873047, + "learning_rate": 4.9459076430337416e-05, + "loss": 5.4568, + "step": 11155 + }, + { + "epoch": 0.06634789228280522, + "grad_norm": 1.6131432056427002, + "learning_rate": 4.94589797855051e-05, + "loss": 5.2507, + "step": 11156 + }, + { + "epoch": 0.06635383956608622, + "grad_norm": 1.5835362672805786, + "learning_rate": 4.945888313213442e-05, + "loss": 5.1122, + "step": 11157 + }, + { + "epoch": 0.0663597868493672, + "grad_norm": 1.5903444290161133, + "learning_rate": 4.945878647022539e-05, + "loss": 5.3236, + "step": 11158 + }, + { + "epoch": 0.06636573413264821, + "grad_norm": 1.7948551177978516, + "learning_rate": 4.945868979977805e-05, + "loss": 5.5939, + "step": 11159 + }, + { + "epoch": 0.06637168141592921, + "grad_norm": 2.1183457374572754, + "learning_rate": 4.945859312079243e-05, + "loss": 5.3639, + "step": 11160 + }, + { + "epoch": 0.0663776286992102, + "grad_norm": 1.5584137439727783, + "learning_rate": 4.945849643326857e-05, + "loss": 5.4302, + "step": 11161 + }, + { + "epoch": 0.0663835759824912, + "grad_norm": 1.5150829553604126, + "learning_rate": 4.9458399737206504e-05, + "loss": 5.2485, + "step": 11162 + }, + { + "epoch": 0.0663895232657722, + "grad_norm": 1.421235203742981, + "learning_rate": 4.9458303032606264e-05, + "loss": 5.2149, + "step": 11163 + }, + { + "epoch": 0.06639547054905319, + "grad_norm": 1.640207052230835, + "learning_rate": 4.945820631946788e-05, + "loss": 5.2807, + "step": 11164 + }, + { + "epoch": 0.06640141783233419, + "grad_norm": 1.5021215677261353, + "learning_rate": 4.945810959779139e-05, + "loss": 5.3684, + "step": 11165 + }, + { + "epoch": 0.06640736511561518, + "grad_norm": 1.802828073501587, + "learning_rate": 4.945801286757682e-05, + "loss": 5.2153, + "step": 11166 + }, + { + "epoch": 0.06641331239889618, + "grad_norm": 1.556386947631836, + "learning_rate": 4.945791612882422e-05, + "loss": 5.1908, + "step": 11167 + }, + { + "epoch": 0.06641925968217718, + "grad_norm": 1.5906118154525757, + "learning_rate": 4.9457819381533616e-05, + "loss": 5.2183, + "step": 11168 + }, + { + "epoch": 0.06642520696545817, + "grad_norm": 1.5778700113296509, + "learning_rate": 4.945772262570503e-05, + "loss": 5.2465, + "step": 11169 + }, + { + "epoch": 0.06643115424873917, + "grad_norm": 1.4705984592437744, + "learning_rate": 4.945762586133852e-05, + "loss": 5.1496, + "step": 11170 + }, + { + "epoch": 0.06643710153202018, + "grad_norm": 1.5118781328201294, + "learning_rate": 4.9457529088434093e-05, + "loss": 5.1764, + "step": 11171 + }, + { + "epoch": 0.06644304881530116, + "grad_norm": 1.5784192085266113, + "learning_rate": 4.94574323069918e-05, + "loss": 5.165, + "step": 11172 + }, + { + "epoch": 0.06644899609858217, + "grad_norm": 1.517220139503479, + "learning_rate": 4.9457335517011666e-05, + "loss": 5.1718, + "step": 11173 + }, + { + "epoch": 0.06645494338186317, + "grad_norm": 1.3823192119598389, + "learning_rate": 4.9457238718493734e-05, + "loss": 5.1945, + "step": 11174 + }, + { + "epoch": 0.06646089066514416, + "grad_norm": 1.4499212503433228, + "learning_rate": 4.945714191143803e-05, + "loss": 5.1044, + "step": 11175 + }, + { + "epoch": 0.06646683794842516, + "grad_norm": 1.4904807806015015, + "learning_rate": 4.945704509584459e-05, + "loss": 5.1781, + "step": 11176 + }, + { + "epoch": 0.06647278523170616, + "grad_norm": 1.6798325777053833, + "learning_rate": 4.945694827171345e-05, + "loss": 4.8879, + "step": 11177 + }, + { + "epoch": 0.06647873251498715, + "grad_norm": 1.3890799283981323, + "learning_rate": 4.945685143904464e-05, + "loss": 4.9941, + "step": 11178 + }, + { + "epoch": 0.06648467979826815, + "grad_norm": 1.4167201519012451, + "learning_rate": 4.94567545978382e-05, + "loss": 5.016, + "step": 11179 + }, + { + "epoch": 0.06649062708154915, + "grad_norm": 1.5122467279434204, + "learning_rate": 4.9456657748094145e-05, + "loss": 4.9937, + "step": 11180 + }, + { + "epoch": 0.06649657436483014, + "grad_norm": 1.4347165822982788, + "learning_rate": 4.9456560889812543e-05, + "loss": 5.0486, + "step": 11181 + }, + { + "epoch": 0.06650252164811114, + "grad_norm": 1.6328964233398438, + "learning_rate": 4.94564640229934e-05, + "loss": 5.1891, + "step": 11182 + }, + { + "epoch": 0.06650846893139215, + "grad_norm": 1.5832617282867432, + "learning_rate": 4.9456367147636765e-05, + "loss": 5.2947, + "step": 11183 + }, + { + "epoch": 0.06651441621467313, + "grad_norm": 1.6932839155197144, + "learning_rate": 4.9456270263742655e-05, + "loss": 5.0755, + "step": 11184 + }, + { + "epoch": 0.06652036349795414, + "grad_norm": 1.6238216161727905, + "learning_rate": 4.945617337131111e-05, + "loss": 5.1903, + "step": 11185 + }, + { + "epoch": 0.06652631078123514, + "grad_norm": 2.362353801727295, + "learning_rate": 4.945607647034218e-05, + "loss": 5.3641, + "step": 11186 + }, + { + "epoch": 0.06653225806451613, + "grad_norm": 1.6447978019714355, + "learning_rate": 4.9455979560835874e-05, + "loss": 5.0174, + "step": 11187 + }, + { + "epoch": 0.06653820534779713, + "grad_norm": 1.6059958934783936, + "learning_rate": 4.945588264279225e-05, + "loss": 4.884, + "step": 11188 + }, + { + "epoch": 0.06654415263107813, + "grad_norm": 1.6291608810424805, + "learning_rate": 4.9455785716211325e-05, + "loss": 4.9735, + "step": 11189 + }, + { + "epoch": 0.06655009991435912, + "grad_norm": 1.6926389932632446, + "learning_rate": 4.9455688781093135e-05, + "loss": 4.9294, + "step": 11190 + }, + { + "epoch": 0.06655604719764012, + "grad_norm": 1.5816938877105713, + "learning_rate": 4.945559183743772e-05, + "loss": 4.9161, + "step": 11191 + }, + { + "epoch": 0.06656199448092112, + "grad_norm": 1.5514836311340332, + "learning_rate": 4.9455494885245115e-05, + "loss": 4.9102, + "step": 11192 + }, + { + "epoch": 0.06656794176420211, + "grad_norm": 1.6787114143371582, + "learning_rate": 4.9455397924515346e-05, + "loss": 4.9628, + "step": 11193 + }, + { + "epoch": 0.06657388904748311, + "grad_norm": 1.5264941453933716, + "learning_rate": 4.945530095524844e-05, + "loss": 5.1685, + "step": 11194 + }, + { + "epoch": 0.06657983633076411, + "grad_norm": 1.80072820186615, + "learning_rate": 4.945520397744445e-05, + "loss": 4.8308, + "step": 11195 + }, + { + "epoch": 0.0665857836140451, + "grad_norm": 1.7497553825378418, + "learning_rate": 4.945510699110341e-05, + "loss": 4.8846, + "step": 11196 + }, + { + "epoch": 0.0665917308973261, + "grad_norm": 1.8938134908676147, + "learning_rate": 4.945500999622533e-05, + "loss": 4.8303, + "step": 11197 + }, + { + "epoch": 0.06659767818060709, + "grad_norm": 1.7286055088043213, + "learning_rate": 4.9454912992810264e-05, + "loss": 4.7686, + "step": 11198 + }, + { + "epoch": 0.0666036254638881, + "grad_norm": 1.7573840618133545, + "learning_rate": 4.945481598085824e-05, + "loss": 4.7527, + "step": 11199 + }, + { + "epoch": 0.0666095727471691, + "grad_norm": 1.9013001918792725, + "learning_rate": 4.94547189603693e-05, + "loss": 5.0987, + "step": 11200 + }, + { + "epoch": 0.06661552003045008, + "grad_norm": 1.5453308820724487, + "learning_rate": 4.945462193134346e-05, + "loss": 5.3799, + "step": 11201 + }, + { + "epoch": 0.06662146731373109, + "grad_norm": 1.763839602470398, + "learning_rate": 4.945452489378076e-05, + "loss": 5.2904, + "step": 11202 + }, + { + "epoch": 0.06662741459701209, + "grad_norm": 1.650407075881958, + "learning_rate": 4.945442784768125e-05, + "loss": 5.3007, + "step": 11203 + }, + { + "epoch": 0.06663336188029308, + "grad_norm": 1.6620690822601318, + "learning_rate": 4.945433079304495e-05, + "loss": 5.394, + "step": 11204 + }, + { + "epoch": 0.06663930916357408, + "grad_norm": 1.5000416040420532, + "learning_rate": 4.945423372987189e-05, + "loss": 5.0648, + "step": 11205 + }, + { + "epoch": 0.06664525644685508, + "grad_norm": 2.1791460514068604, + "learning_rate": 4.945413665816211e-05, + "loss": 5.5261, + "step": 11206 + }, + { + "epoch": 0.06665120373013607, + "grad_norm": 2.084258556365967, + "learning_rate": 4.945403957791565e-05, + "loss": 5.5796, + "step": 11207 + }, + { + "epoch": 0.06665715101341707, + "grad_norm": 1.9391356706619263, + "learning_rate": 4.945394248913253e-05, + "loss": 5.4855, + "step": 11208 + }, + { + "epoch": 0.06666309829669807, + "grad_norm": 1.8323030471801758, + "learning_rate": 4.9453845391812803e-05, + "loss": 5.5711, + "step": 11209 + }, + { + "epoch": 0.06666904557997906, + "grad_norm": 1.9193792343139648, + "learning_rate": 4.945374828595648e-05, + "loss": 5.2585, + "step": 11210 + }, + { + "epoch": 0.06667499286326006, + "grad_norm": 1.7111014127731323, + "learning_rate": 4.9453651171563606e-05, + "loss": 5.1965, + "step": 11211 + }, + { + "epoch": 0.06668094014654107, + "grad_norm": 1.8574761152267456, + "learning_rate": 4.9453554048634224e-05, + "loss": 5.2538, + "step": 11212 + }, + { + "epoch": 0.06668688742982205, + "grad_norm": 2.18009352684021, + "learning_rate": 4.945345691716835e-05, + "loss": 5.2486, + "step": 11213 + }, + { + "epoch": 0.06669283471310306, + "grad_norm": 2.167819023132324, + "learning_rate": 4.945335977716603e-05, + "loss": 5.1877, + "step": 11214 + }, + { + "epoch": 0.06669878199638406, + "grad_norm": 2.086603879928589, + "learning_rate": 4.9453262628627297e-05, + "loss": 5.32, + "step": 11215 + }, + { + "epoch": 0.06670472927966505, + "grad_norm": 2.239917039871216, + "learning_rate": 4.945316547155218e-05, + "loss": 5.5289, + "step": 11216 + }, + { + "epoch": 0.06671067656294605, + "grad_norm": 1.9402177333831787, + "learning_rate": 4.945306830594072e-05, + "loss": 5.5159, + "step": 11217 + }, + { + "epoch": 0.06671662384622705, + "grad_norm": 2.2730953693389893, + "learning_rate": 4.945297113179294e-05, + "loss": 5.5132, + "step": 11218 + }, + { + "epoch": 0.06672257112950804, + "grad_norm": 2.4021079540252686, + "learning_rate": 4.945287394910888e-05, + "loss": 5.7505, + "step": 11219 + }, + { + "epoch": 0.06672851841278904, + "grad_norm": 1.8272559642791748, + "learning_rate": 4.945277675788859e-05, + "loss": 5.7324, + "step": 11220 + }, + { + "epoch": 0.06673446569607004, + "grad_norm": 1.641192078590393, + "learning_rate": 4.945267955813206e-05, + "loss": 5.7665, + "step": 11221 + }, + { + "epoch": 0.06674041297935103, + "grad_norm": 2.1081202030181885, + "learning_rate": 4.945258234983938e-05, + "loss": 5.3633, + "step": 11222 + }, + { + "epoch": 0.06674636026263203, + "grad_norm": 1.7172397375106812, + "learning_rate": 4.945248513301054e-05, + "loss": 5.775, + "step": 11223 + }, + { + "epoch": 0.06675230754591303, + "grad_norm": 1.9968703985214233, + "learning_rate": 4.9452387907645594e-05, + "loss": 5.4817, + "step": 11224 + }, + { + "epoch": 0.06675825482919402, + "grad_norm": 1.9165494441986084, + "learning_rate": 4.9452290673744575e-05, + "loss": 5.6977, + "step": 11225 + }, + { + "epoch": 0.06676420211247502, + "grad_norm": 1.832783579826355, + "learning_rate": 4.945219343130751e-05, + "loss": 5.2065, + "step": 11226 + }, + { + "epoch": 0.06677014939575601, + "grad_norm": 2.073590040206909, + "learning_rate": 4.945209618033444e-05, + "loss": 5.0158, + "step": 11227 + }, + { + "epoch": 0.06677609667903701, + "grad_norm": 2.0305895805358887, + "learning_rate": 4.9451998920825395e-05, + "loss": 4.8452, + "step": 11228 + }, + { + "epoch": 0.06678204396231802, + "grad_norm": 1.8843696117401123, + "learning_rate": 4.945190165278041e-05, + "loss": 5.5082, + "step": 11229 + }, + { + "epoch": 0.066787991245599, + "grad_norm": 1.66866934299469, + "learning_rate": 4.945180437619951e-05, + "loss": 5.4151, + "step": 11230 + }, + { + "epoch": 0.06679393852888, + "grad_norm": 1.8018205165863037, + "learning_rate": 4.9451707091082746e-05, + "loss": 5.124, + "step": 11231 + }, + { + "epoch": 0.06679988581216101, + "grad_norm": 1.760339379310608, + "learning_rate": 4.9451609797430146e-05, + "loss": 4.9834, + "step": 11232 + }, + { + "epoch": 0.066805833095442, + "grad_norm": 1.609376072883606, + "learning_rate": 4.945151249524174e-05, + "loss": 5.0217, + "step": 11233 + }, + { + "epoch": 0.066811780378723, + "grad_norm": 1.5468369722366333, + "learning_rate": 4.9451415184517556e-05, + "loss": 5.1881, + "step": 11234 + }, + { + "epoch": 0.066817727662004, + "grad_norm": 1.2027482986450195, + "learning_rate": 4.945131786525764e-05, + "loss": 5.1014, + "step": 11235 + }, + { + "epoch": 0.06682367494528499, + "grad_norm": 1.6050941944122314, + "learning_rate": 4.945122053746203e-05, + "loss": 5.0314, + "step": 11236 + }, + { + "epoch": 0.06682962222856599, + "grad_norm": 1.4980865716934204, + "learning_rate": 4.9451123201130746e-05, + "loss": 4.9371, + "step": 11237 + }, + { + "epoch": 0.06683556951184699, + "grad_norm": 1.6754953861236572, + "learning_rate": 4.9451025856263824e-05, + "loss": 4.9733, + "step": 11238 + }, + { + "epoch": 0.06684151679512798, + "grad_norm": 1.5051567554473877, + "learning_rate": 4.9450928502861303e-05, + "loss": 4.8994, + "step": 11239 + }, + { + "epoch": 0.06684746407840898, + "grad_norm": 1.5211920738220215, + "learning_rate": 4.945083114092321e-05, + "loss": 4.8459, + "step": 11240 + }, + { + "epoch": 0.06685341136168998, + "grad_norm": 1.6717231273651123, + "learning_rate": 4.9450733770449596e-05, + "loss": 5.1029, + "step": 11241 + }, + { + "epoch": 0.06685935864497097, + "grad_norm": 1.4853429794311523, + "learning_rate": 4.945063639144048e-05, + "loss": 5.2199, + "step": 11242 + }, + { + "epoch": 0.06686530592825198, + "grad_norm": 1.6102755069732666, + "learning_rate": 4.9450539003895894e-05, + "loss": 5.1191, + "step": 11243 + }, + { + "epoch": 0.06687125321153298, + "grad_norm": 1.6091139316558838, + "learning_rate": 4.9450441607815876e-05, + "loss": 5.2492, + "step": 11244 + }, + { + "epoch": 0.06687720049481397, + "grad_norm": 1.5190162658691406, + "learning_rate": 4.945034420320047e-05, + "loss": 5.1763, + "step": 11245 + }, + { + "epoch": 0.06688314777809497, + "grad_norm": 1.636243462562561, + "learning_rate": 4.94502467900497e-05, + "loss": 5.4906, + "step": 11246 + }, + { + "epoch": 0.06688909506137597, + "grad_norm": 1.5214428901672363, + "learning_rate": 4.9450149368363594e-05, + "loss": 5.3554, + "step": 11247 + }, + { + "epoch": 0.06689504234465696, + "grad_norm": 1.696183681488037, + "learning_rate": 4.9450051938142205e-05, + "loss": 5.3185, + "step": 11248 + }, + { + "epoch": 0.06690098962793796, + "grad_norm": 1.5344911813735962, + "learning_rate": 4.944995449938555e-05, + "loss": 5.345, + "step": 11249 + }, + { + "epoch": 0.06690693691121896, + "grad_norm": 1.598035454750061, + "learning_rate": 4.944985705209366e-05, + "loss": 5.2271, + "step": 11250 + }, + { + "epoch": 0.06691288419449995, + "grad_norm": 1.501841425895691, + "learning_rate": 4.944975959626659e-05, + "loss": 5.1807, + "step": 11251 + }, + { + "epoch": 0.06691883147778095, + "grad_norm": 1.3818657398223877, + "learning_rate": 4.944966213190436e-05, + "loss": 5.2953, + "step": 11252 + }, + { + "epoch": 0.06692477876106195, + "grad_norm": 1.5480642318725586, + "learning_rate": 4.9449564659007e-05, + "loss": 5.3048, + "step": 11253 + }, + { + "epoch": 0.06693072604434294, + "grad_norm": 1.5553090572357178, + "learning_rate": 4.9449467177574546e-05, + "loss": 5.1365, + "step": 11254 + }, + { + "epoch": 0.06693667332762394, + "grad_norm": 1.581534743309021, + "learning_rate": 4.944936968760705e-05, + "loss": 5.1498, + "step": 11255 + }, + { + "epoch": 0.06694262061090493, + "grad_norm": 1.8294548988342285, + "learning_rate": 4.944927218910452e-05, + "loss": 5.1331, + "step": 11256 + }, + { + "epoch": 0.06694856789418593, + "grad_norm": 1.3404508829116821, + "learning_rate": 4.944917468206701e-05, + "loss": 5.5092, + "step": 11257 + }, + { + "epoch": 0.06695451517746694, + "grad_norm": 1.5146483182907104, + "learning_rate": 4.944907716649454e-05, + "loss": 5.2797, + "step": 11258 + }, + { + "epoch": 0.06696046246074792, + "grad_norm": 1.571393609046936, + "learning_rate": 4.944897964238715e-05, + "loss": 5.4528, + "step": 11259 + }, + { + "epoch": 0.06696640974402893, + "grad_norm": 1.640459656715393, + "learning_rate": 4.944888210974487e-05, + "loss": 5.1032, + "step": 11260 + }, + { + "epoch": 0.06697235702730993, + "grad_norm": 1.5397419929504395, + "learning_rate": 4.944878456856774e-05, + "loss": 5.2333, + "step": 11261 + }, + { + "epoch": 0.06697830431059092, + "grad_norm": 1.4423824548721313, + "learning_rate": 4.94486870188558e-05, + "loss": 5.1765, + "step": 11262 + }, + { + "epoch": 0.06698425159387192, + "grad_norm": 1.366347074508667, + "learning_rate": 4.9448589460609066e-05, + "loss": 5.2257, + "step": 11263 + }, + { + "epoch": 0.06699019887715292, + "grad_norm": 1.370089054107666, + "learning_rate": 4.944849189382759e-05, + "loss": 5.4681, + "step": 11264 + }, + { + "epoch": 0.06699614616043391, + "grad_norm": 1.3014042377471924, + "learning_rate": 4.9448394318511394e-05, + "loss": 5.3434, + "step": 11265 + }, + { + "epoch": 0.06700209344371491, + "grad_norm": 1.4719784259796143, + "learning_rate": 4.9448296734660516e-05, + "loss": 5.3064, + "step": 11266 + }, + { + "epoch": 0.06700804072699591, + "grad_norm": 1.6640921831130981, + "learning_rate": 4.944819914227499e-05, + "loss": 5.2896, + "step": 11267 + }, + { + "epoch": 0.0670139880102769, + "grad_norm": 1.4969593286514282, + "learning_rate": 4.9448101541354845e-05, + "loss": 5.1413, + "step": 11268 + }, + { + "epoch": 0.0670199352935579, + "grad_norm": 1.4021313190460205, + "learning_rate": 4.9448003931900126e-05, + "loss": 5.2609, + "step": 11269 + }, + { + "epoch": 0.0670258825768389, + "grad_norm": 1.6506398916244507, + "learning_rate": 4.9447906313910865e-05, + "loss": 5.3365, + "step": 11270 + }, + { + "epoch": 0.0670318298601199, + "grad_norm": 1.6469614505767822, + "learning_rate": 4.9447808687387084e-05, + "loss": 5.0384, + "step": 11271 + }, + { + "epoch": 0.0670377771434009, + "grad_norm": 1.5047974586486816, + "learning_rate": 4.944771105232883e-05, + "loss": 5.3565, + "step": 11272 + }, + { + "epoch": 0.0670437244266819, + "grad_norm": 1.4467194080352783, + "learning_rate": 4.9447613408736135e-05, + "loss": 5.5576, + "step": 11273 + }, + { + "epoch": 0.06704967170996289, + "grad_norm": 1.4636478424072266, + "learning_rate": 4.9447515756609034e-05, + "loss": 5.6407, + "step": 11274 + }, + { + "epoch": 0.06705561899324389, + "grad_norm": 1.373046875, + "learning_rate": 4.944741809594755e-05, + "loss": 5.4286, + "step": 11275 + }, + { + "epoch": 0.06706156627652489, + "grad_norm": 1.5114089250564575, + "learning_rate": 4.944732042675172e-05, + "loss": 5.6425, + "step": 11276 + }, + { + "epoch": 0.06706751355980588, + "grad_norm": 1.8263514041900635, + "learning_rate": 4.9447222749021596e-05, + "loss": 5.2469, + "step": 11277 + }, + { + "epoch": 0.06707346084308688, + "grad_norm": 1.780553936958313, + "learning_rate": 4.944712506275719e-05, + "loss": 5.3306, + "step": 11278 + }, + { + "epoch": 0.06707940812636788, + "grad_norm": 1.6208360195159912, + "learning_rate": 4.9447027367958556e-05, + "loss": 5.5365, + "step": 11279 + }, + { + "epoch": 0.06708535540964887, + "grad_norm": 1.336965560913086, + "learning_rate": 4.9446929664625705e-05, + "loss": 5.2694, + "step": 11280 + }, + { + "epoch": 0.06709130269292987, + "grad_norm": 1.6100155115127563, + "learning_rate": 4.9446831952758685e-05, + "loss": 5.5489, + "step": 11281 + }, + { + "epoch": 0.06709724997621087, + "grad_norm": 1.8020440340042114, + "learning_rate": 4.944673423235753e-05, + "loss": 5.3396, + "step": 11282 + }, + { + "epoch": 0.06710319725949186, + "grad_norm": 1.5315353870391846, + "learning_rate": 4.9446636503422276e-05, + "loss": 5.3687, + "step": 11283 + }, + { + "epoch": 0.06710914454277286, + "grad_norm": 2.2560019493103027, + "learning_rate": 4.9446538765952953e-05, + "loss": 5.4584, + "step": 11284 + }, + { + "epoch": 0.06711509182605385, + "grad_norm": 1.4653301239013672, + "learning_rate": 4.94464410199496e-05, + "loss": 5.3438, + "step": 11285 + }, + { + "epoch": 0.06712103910933485, + "grad_norm": 1.5931557416915894, + "learning_rate": 4.9446343265412243e-05, + "loss": 5.5802, + "step": 11286 + }, + { + "epoch": 0.06712698639261586, + "grad_norm": 1.5282461643218994, + "learning_rate": 4.944624550234092e-05, + "loss": 5.5634, + "step": 11287 + }, + { + "epoch": 0.06713293367589684, + "grad_norm": 1.7275618314743042, + "learning_rate": 4.944614773073566e-05, + "loss": 5.3797, + "step": 11288 + }, + { + "epoch": 0.06713888095917785, + "grad_norm": 1.6453620195388794, + "learning_rate": 4.944604995059651e-05, + "loss": 5.4693, + "step": 11289 + }, + { + "epoch": 0.06714482824245885, + "grad_norm": 1.870483636856079, + "learning_rate": 4.944595216192349e-05, + "loss": 5.4693, + "step": 11290 + }, + { + "epoch": 0.06715077552573984, + "grad_norm": 1.5478577613830566, + "learning_rate": 4.944585436471665e-05, + "loss": 5.694, + "step": 11291 + }, + { + "epoch": 0.06715672280902084, + "grad_norm": 1.9456945657730103, + "learning_rate": 4.944575655897601e-05, + "loss": 5.6687, + "step": 11292 + }, + { + "epoch": 0.06716267009230184, + "grad_norm": 1.808176875114441, + "learning_rate": 4.944565874470161e-05, + "loss": 5.7444, + "step": 11293 + }, + { + "epoch": 0.06716861737558283, + "grad_norm": 1.8066149950027466, + "learning_rate": 4.944556092189347e-05, + "loss": 5.5264, + "step": 11294 + }, + { + "epoch": 0.06717456465886383, + "grad_norm": 2.2896971702575684, + "learning_rate": 4.9445463090551656e-05, + "loss": 4.7624, + "step": 11295 + }, + { + "epoch": 0.06718051194214483, + "grad_norm": 1.7178759574890137, + "learning_rate": 4.9445365250676165e-05, + "loss": 5.79, + "step": 11296 + }, + { + "epoch": 0.06718645922542582, + "grad_norm": 1.8841933012008667, + "learning_rate": 4.944526740226707e-05, + "loss": 5.9792, + "step": 11297 + }, + { + "epoch": 0.06719240650870682, + "grad_norm": 1.8618090152740479, + "learning_rate": 4.944516954532437e-05, + "loss": 5.957, + "step": 11298 + }, + { + "epoch": 0.06719835379198782, + "grad_norm": 1.7545913457870483, + "learning_rate": 4.944507167984812e-05, + "loss": 5.4484, + "step": 11299 + }, + { + "epoch": 0.06720430107526881, + "grad_norm": 2.023158073425293, + "learning_rate": 4.9444973805838345e-05, + "loss": 5.0873, + "step": 11300 + }, + { + "epoch": 0.06721024835854982, + "grad_norm": 1.893340826034546, + "learning_rate": 4.944487592329509e-05, + "loss": 5.042, + "step": 11301 + }, + { + "epoch": 0.06721619564183082, + "grad_norm": 1.981518268585205, + "learning_rate": 4.944477803221837e-05, + "loss": 5.1463, + "step": 11302 + }, + { + "epoch": 0.0672221429251118, + "grad_norm": 2.47416090965271, + "learning_rate": 4.9444680132608236e-05, + "loss": 5.2885, + "step": 11303 + }, + { + "epoch": 0.06722809020839281, + "grad_norm": 2.3973519802093506, + "learning_rate": 4.944458222446472e-05, + "loss": 5.3321, + "step": 11304 + }, + { + "epoch": 0.06723403749167381, + "grad_norm": 1.9117941856384277, + "learning_rate": 4.9444484307787846e-05, + "loss": 5.2159, + "step": 11305 + }, + { + "epoch": 0.0672399847749548, + "grad_norm": 1.8732513189315796, + "learning_rate": 4.9444386382577656e-05, + "loss": 5.222, + "step": 11306 + }, + { + "epoch": 0.0672459320582358, + "grad_norm": 1.9202747344970703, + "learning_rate": 4.9444288448834184e-05, + "loss": 5.5766, + "step": 11307 + }, + { + "epoch": 0.0672518793415168, + "grad_norm": 1.8956191539764404, + "learning_rate": 4.944419050655747e-05, + "loss": 5.7129, + "step": 11308 + }, + { + "epoch": 0.06725782662479779, + "grad_norm": 2.7075235843658447, + "learning_rate": 4.9444092555747534e-05, + "loss": 5.2199, + "step": 11309 + }, + { + "epoch": 0.06726377390807879, + "grad_norm": 2.396125078201294, + "learning_rate": 4.944399459640442e-05, + "loss": 5.3548, + "step": 11310 + }, + { + "epoch": 0.0672697211913598, + "grad_norm": 2.6050171852111816, + "learning_rate": 4.9443896628528166e-05, + "loss": 5.616, + "step": 11311 + }, + { + "epoch": 0.06727566847464078, + "grad_norm": 2.512720823287964, + "learning_rate": 4.94437986521188e-05, + "loss": 5.3699, + "step": 11312 + }, + { + "epoch": 0.06728161575792178, + "grad_norm": 2.509716510772705, + "learning_rate": 4.9443700667176345e-05, + "loss": 5.431, + "step": 11313 + }, + { + "epoch": 0.06728756304120277, + "grad_norm": 2.2237601280212402, + "learning_rate": 4.944360267370085e-05, + "loss": 5.3985, + "step": 11314 + }, + { + "epoch": 0.06729351032448377, + "grad_norm": 1.982344627380371, + "learning_rate": 4.9443504671692356e-05, + "loss": 5.4849, + "step": 11315 + }, + { + "epoch": 0.06729945760776478, + "grad_norm": 2.1006124019622803, + "learning_rate": 4.9443406661150874e-05, + "loss": 5.227, + "step": 11316 + }, + { + "epoch": 0.06730540489104576, + "grad_norm": 2.0929529666900635, + "learning_rate": 4.9443308642076456e-05, + "loss": 5.524, + "step": 11317 + }, + { + "epoch": 0.06731135217432677, + "grad_norm": 1.9268262386322021, + "learning_rate": 4.944321061446914e-05, + "loss": 6.0622, + "step": 11318 + }, + { + "epoch": 0.06731729945760777, + "grad_norm": 2.257065773010254, + "learning_rate": 4.944311257832894e-05, + "loss": 4.9455, + "step": 11319 + }, + { + "epoch": 0.06732324674088876, + "grad_norm": 2.056244373321533, + "learning_rate": 4.944301453365591e-05, + "loss": 5.4157, + "step": 11320 + }, + { + "epoch": 0.06732919402416976, + "grad_norm": 2.1667540073394775, + "learning_rate": 4.944291648045007e-05, + "loss": 5.5767, + "step": 11321 + }, + { + "epoch": 0.06733514130745076, + "grad_norm": 1.9596853256225586, + "learning_rate": 4.944281841871146e-05, + "loss": 5.6532, + "step": 11322 + }, + { + "epoch": 0.06734108859073175, + "grad_norm": 1.7050867080688477, + "learning_rate": 4.9442720348440116e-05, + "loss": 5.8881, + "step": 11323 + }, + { + "epoch": 0.06734703587401275, + "grad_norm": 1.8681753873825073, + "learning_rate": 4.944262226963607e-05, + "loss": 5.9369, + "step": 11324 + }, + { + "epoch": 0.06735298315729375, + "grad_norm": 1.9432111978530884, + "learning_rate": 4.9442524182299365e-05, + "loss": 5.9163, + "step": 11325 + }, + { + "epoch": 0.06735893044057474, + "grad_norm": 1.8099175691604614, + "learning_rate": 4.9442426086430026e-05, + "loss": 5.809, + "step": 11326 + }, + { + "epoch": 0.06736487772385574, + "grad_norm": 1.6179800033569336, + "learning_rate": 4.944232798202808e-05, + "loss": 5.5609, + "step": 11327 + }, + { + "epoch": 0.06737082500713674, + "grad_norm": 2.303189992904663, + "learning_rate": 4.944222986909357e-05, + "loss": 5.9291, + "step": 11328 + }, + { + "epoch": 0.06737677229041773, + "grad_norm": 1.913813829421997, + "learning_rate": 4.944213174762654e-05, + "loss": 5.8672, + "step": 11329 + }, + { + "epoch": 0.06738271957369873, + "grad_norm": 2.1856813430786133, + "learning_rate": 4.944203361762701e-05, + "loss": 5.2632, + "step": 11330 + }, + { + "epoch": 0.06738866685697974, + "grad_norm": 2.019679069519043, + "learning_rate": 4.9441935479095016e-05, + "loss": 5.3707, + "step": 11331 + }, + { + "epoch": 0.06739461414026073, + "grad_norm": 1.8531097173690796, + "learning_rate": 4.944183733203059e-05, + "loss": 5.6689, + "step": 11332 + }, + { + "epoch": 0.06740056142354173, + "grad_norm": 2.068208694458008, + "learning_rate": 4.944173917643378e-05, + "loss": 5.6111, + "step": 11333 + }, + { + "epoch": 0.06740650870682273, + "grad_norm": 1.8021270036697388, + "learning_rate": 4.944164101230461e-05, + "loss": 6.0865, + "step": 11334 + }, + { + "epoch": 0.06741245599010372, + "grad_norm": 1.9051427841186523, + "learning_rate": 4.944154283964312e-05, + "loss": 5.5862, + "step": 11335 + }, + { + "epoch": 0.06741840327338472, + "grad_norm": 1.718483805656433, + "learning_rate": 4.944144465844933e-05, + "loss": 5.2505, + "step": 11336 + }, + { + "epoch": 0.06742435055666572, + "grad_norm": 2.205167531967163, + "learning_rate": 4.944134646872329e-05, + "loss": 5.3181, + "step": 11337 + }, + { + "epoch": 0.06743029783994671, + "grad_norm": 1.550945520401001, + "learning_rate": 4.944124827046502e-05, + "loss": 5.4129, + "step": 11338 + }, + { + "epoch": 0.06743624512322771, + "grad_norm": 2.08793044090271, + "learning_rate": 4.944115006367458e-05, + "loss": 5.9705, + "step": 11339 + }, + { + "epoch": 0.06744219240650871, + "grad_norm": 1.8955761194229126, + "learning_rate": 4.944105184835197e-05, + "loss": 4.9629, + "step": 11340 + }, + { + "epoch": 0.0674481396897897, + "grad_norm": 1.7287909984588623, + "learning_rate": 4.944095362449724e-05, + "loss": 5.1097, + "step": 11341 + }, + { + "epoch": 0.0674540869730707, + "grad_norm": 1.8718771934509277, + "learning_rate": 4.944085539211044e-05, + "loss": 5.6443, + "step": 11342 + }, + { + "epoch": 0.06746003425635169, + "grad_norm": 2.220863103866577, + "learning_rate": 4.9440757151191585e-05, + "loss": 5.5042, + "step": 11343 + }, + { + "epoch": 0.0674659815396327, + "grad_norm": 1.9501415491104126, + "learning_rate": 4.944065890174071e-05, + "loss": 5.6788, + "step": 11344 + }, + { + "epoch": 0.0674719288229137, + "grad_norm": 1.8566590547561646, + "learning_rate": 4.944056064375786e-05, + "loss": 5.6531, + "step": 11345 + }, + { + "epoch": 0.06747787610619468, + "grad_norm": 1.895409345626831, + "learning_rate": 4.9440462377243055e-05, + "loss": 5.6441, + "step": 11346 + }, + { + "epoch": 0.06748382338947569, + "grad_norm": 2.1746973991394043, + "learning_rate": 4.9440364102196345e-05, + "loss": 5.8624, + "step": 11347 + }, + { + "epoch": 0.06748977067275669, + "grad_norm": 1.9661751985549927, + "learning_rate": 4.944026581861775e-05, + "loss": 5.6075, + "step": 11348 + }, + { + "epoch": 0.06749571795603768, + "grad_norm": 1.8591458797454834, + "learning_rate": 4.944016752650731e-05, + "loss": 5.9115, + "step": 11349 + }, + { + "epoch": 0.06750166523931868, + "grad_norm": 1.6491025686264038, + "learning_rate": 4.9440069225865065e-05, + "loss": 6.0548, + "step": 11350 + }, + { + "epoch": 0.06750761252259968, + "grad_norm": 1.857928991317749, + "learning_rate": 4.9439970916691045e-05, + "loss": 5.4326, + "step": 11351 + }, + { + "epoch": 0.06751355980588067, + "grad_norm": 1.8189151287078857, + "learning_rate": 4.943987259898528e-05, + "loss": 5.7744, + "step": 11352 + }, + { + "epoch": 0.06751950708916167, + "grad_norm": 1.7486300468444824, + "learning_rate": 4.943977427274781e-05, + "loss": 5.7128, + "step": 11353 + }, + { + "epoch": 0.06752545437244267, + "grad_norm": 1.7272138595581055, + "learning_rate": 4.943967593797866e-05, + "loss": 5.9922, + "step": 11354 + }, + { + "epoch": 0.06753140165572366, + "grad_norm": 1.740860939025879, + "learning_rate": 4.9439577594677875e-05, + "loss": 5.8486, + "step": 11355 + }, + { + "epoch": 0.06753734893900466, + "grad_norm": 1.9054155349731445, + "learning_rate": 4.9439479242845494e-05, + "loss": 5.4694, + "step": 11356 + }, + { + "epoch": 0.06754329622228566, + "grad_norm": 1.9783501625061035, + "learning_rate": 4.943938088248154e-05, + "loss": 5.5185, + "step": 11357 + }, + { + "epoch": 0.06754924350556665, + "grad_norm": 1.8267238140106201, + "learning_rate": 4.943928251358605e-05, + "loss": 5.7589, + "step": 11358 + }, + { + "epoch": 0.06755519078884765, + "grad_norm": 1.6957738399505615, + "learning_rate": 4.943918413615906e-05, + "loss": 5.5716, + "step": 11359 + }, + { + "epoch": 0.06756113807212866, + "grad_norm": 2.0818982124328613, + "learning_rate": 4.94390857502006e-05, + "loss": 5.8969, + "step": 11360 + }, + { + "epoch": 0.06756708535540965, + "grad_norm": 1.8012073040008545, + "learning_rate": 4.9438987355710703e-05, + "loss": 6.1053, + "step": 11361 + }, + { + "epoch": 0.06757303263869065, + "grad_norm": 2.2209696769714355, + "learning_rate": 4.943888895268942e-05, + "loss": 5.9714, + "step": 11362 + }, + { + "epoch": 0.06757897992197165, + "grad_norm": 1.8006336688995361, + "learning_rate": 4.943879054113676e-05, + "loss": 5.6427, + "step": 11363 + }, + { + "epoch": 0.06758492720525264, + "grad_norm": 1.7628017663955688, + "learning_rate": 4.9438692121052775e-05, + "loss": 5.8639, + "step": 11364 + }, + { + "epoch": 0.06759087448853364, + "grad_norm": 1.8574492931365967, + "learning_rate": 4.94385936924375e-05, + "loss": 5.892, + "step": 11365 + }, + { + "epoch": 0.06759682177181464, + "grad_norm": 1.7926831245422363, + "learning_rate": 4.9438495255290964e-05, + "loss": 5.9024, + "step": 11366 + }, + { + "epoch": 0.06760276905509563, + "grad_norm": 2.503370761871338, + "learning_rate": 4.94383968096132e-05, + "loss": 5.994, + "step": 11367 + }, + { + "epoch": 0.06760871633837663, + "grad_norm": 1.7123390436172485, + "learning_rate": 4.943829835540424e-05, + "loss": 5.8052, + "step": 11368 + }, + { + "epoch": 0.06761466362165763, + "grad_norm": 2.0890092849731445, + "learning_rate": 4.943819989266413e-05, + "loss": 5.067, + "step": 11369 + }, + { + "epoch": 0.06762061090493862, + "grad_norm": 1.8000640869140625, + "learning_rate": 4.9438101421392894e-05, + "loss": 5.3562, + "step": 11370 + }, + { + "epoch": 0.06762655818821962, + "grad_norm": 2.254873514175415, + "learning_rate": 4.9438002941590564e-05, + "loss": 5.0557, + "step": 11371 + }, + { + "epoch": 0.06763250547150061, + "grad_norm": 1.8080449104309082, + "learning_rate": 4.943790445325719e-05, + "loss": 5.6702, + "step": 11372 + }, + { + "epoch": 0.06763845275478161, + "grad_norm": 2.0175933837890625, + "learning_rate": 4.943780595639279e-05, + "loss": 5.6227, + "step": 11373 + }, + { + "epoch": 0.06764440003806262, + "grad_norm": 1.9859650135040283, + "learning_rate": 4.943770745099741e-05, + "loss": 5.4437, + "step": 11374 + }, + { + "epoch": 0.0676503473213436, + "grad_norm": 1.975573182106018, + "learning_rate": 4.943760893707107e-05, + "loss": 5.3101, + "step": 11375 + }, + { + "epoch": 0.0676562946046246, + "grad_norm": 2.2590208053588867, + "learning_rate": 4.943751041461382e-05, + "loss": 5.2544, + "step": 11376 + }, + { + "epoch": 0.06766224188790561, + "grad_norm": 1.8615392446517944, + "learning_rate": 4.943741188362568e-05, + "loss": 5.5266, + "step": 11377 + }, + { + "epoch": 0.0676681891711866, + "grad_norm": 2.056810140609741, + "learning_rate": 4.943731334410669e-05, + "loss": 5.1994, + "step": 11378 + }, + { + "epoch": 0.0676741364544676, + "grad_norm": 2.0275685787200928, + "learning_rate": 4.94372147960569e-05, + "loss": 5.7385, + "step": 11379 + }, + { + "epoch": 0.0676800837377486, + "grad_norm": 2.082963466644287, + "learning_rate": 4.9437116239476325e-05, + "loss": 5.1531, + "step": 11380 + }, + { + "epoch": 0.06768603102102959, + "grad_norm": 2.176421642303467, + "learning_rate": 4.9437017674365004e-05, + "loss": 5.521, + "step": 11381 + }, + { + "epoch": 0.06769197830431059, + "grad_norm": 2.1424365043640137, + "learning_rate": 4.9436919100722964e-05, + "loss": 5.4543, + "step": 11382 + }, + { + "epoch": 0.06769792558759159, + "grad_norm": 2.07836651802063, + "learning_rate": 4.9436820518550266e-05, + "loss": 5.5166, + "step": 11383 + }, + { + "epoch": 0.06770387287087258, + "grad_norm": 1.9776746034622192, + "learning_rate": 4.9436721927846915e-05, + "loss": 5.4621, + "step": 11384 + }, + { + "epoch": 0.06770982015415358, + "grad_norm": 1.9985042810440063, + "learning_rate": 4.943662332861296e-05, + "loss": 5.3835, + "step": 11385 + }, + { + "epoch": 0.06771576743743458, + "grad_norm": 1.6877795457839966, + "learning_rate": 4.943652472084843e-05, + "loss": 5.185, + "step": 11386 + }, + { + "epoch": 0.06772171472071557, + "grad_norm": 1.8307565450668335, + "learning_rate": 4.943642610455336e-05, + "loss": 5.117, + "step": 11387 + }, + { + "epoch": 0.06772766200399657, + "grad_norm": 2.0381922721862793, + "learning_rate": 4.943632747972779e-05, + "loss": 5.6004, + "step": 11388 + }, + { + "epoch": 0.06773360928727758, + "grad_norm": 1.9554756879806519, + "learning_rate": 4.943622884637175e-05, + "loss": 5.9638, + "step": 11389 + }, + { + "epoch": 0.06773955657055857, + "grad_norm": 1.878861665725708, + "learning_rate": 4.9436130204485274e-05, + "loss": 5.7961, + "step": 11390 + }, + { + "epoch": 0.06774550385383957, + "grad_norm": 2.040012836456299, + "learning_rate": 4.94360315540684e-05, + "loss": 5.7175, + "step": 11391 + }, + { + "epoch": 0.06775145113712057, + "grad_norm": 2.262408494949341, + "learning_rate": 4.943593289512115e-05, + "loss": 4.8581, + "step": 11392 + }, + { + "epoch": 0.06775739842040156, + "grad_norm": 2.201751232147217, + "learning_rate": 4.943583422764358e-05, + "loss": 5.0647, + "step": 11393 + }, + { + "epoch": 0.06776334570368256, + "grad_norm": 1.9768764972686768, + "learning_rate": 4.943573555163571e-05, + "loss": 5.8836, + "step": 11394 + }, + { + "epoch": 0.06776929298696356, + "grad_norm": 2.1048574447631836, + "learning_rate": 4.9435636867097575e-05, + "loss": 5.9746, + "step": 11395 + }, + { + "epoch": 0.06777524027024455, + "grad_norm": 1.5297552347183228, + "learning_rate": 4.943553817402921e-05, + "loss": 4.912, + "step": 11396 + }, + { + "epoch": 0.06778118755352555, + "grad_norm": 1.5313429832458496, + "learning_rate": 4.943543947243066e-05, + "loss": 4.975, + "step": 11397 + }, + { + "epoch": 0.06778713483680655, + "grad_norm": 1.8882219791412354, + "learning_rate": 4.943534076230194e-05, + "loss": 5.2183, + "step": 11398 + }, + { + "epoch": 0.06779308212008754, + "grad_norm": 1.698997139930725, + "learning_rate": 4.9435242043643094e-05, + "loss": 5.8019, + "step": 11399 + }, + { + "epoch": 0.06779902940336854, + "grad_norm": 1.775140404701233, + "learning_rate": 4.943514331645417e-05, + "loss": 5.7451, + "step": 11400 + }, + { + "epoch": 0.06780497668664953, + "grad_norm": 2.273650884628296, + "learning_rate": 4.943504458073518e-05, + "loss": 4.7727, + "step": 11401 + }, + { + "epoch": 0.06781092396993053, + "grad_norm": 2.166961908340454, + "learning_rate": 4.943494583648617e-05, + "loss": 5.4537, + "step": 11402 + }, + { + "epoch": 0.06781687125321154, + "grad_norm": 2.147876024246216, + "learning_rate": 4.943484708370717e-05, + "loss": 5.2635, + "step": 11403 + }, + { + "epoch": 0.06782281853649252, + "grad_norm": 1.968397855758667, + "learning_rate": 4.943474832239822e-05, + "loss": 5.6591, + "step": 11404 + }, + { + "epoch": 0.06782876581977353, + "grad_norm": 1.8838316202163696, + "learning_rate": 4.943464955255935e-05, + "loss": 5.5462, + "step": 11405 + }, + { + "epoch": 0.06783471310305453, + "grad_norm": 2.4205315113067627, + "learning_rate": 4.94345507741906e-05, + "loss": 4.859, + "step": 11406 + }, + { + "epoch": 0.06784066038633552, + "grad_norm": 2.1272950172424316, + "learning_rate": 4.9434451987292e-05, + "loss": 5.1791, + "step": 11407 + }, + { + "epoch": 0.06784660766961652, + "grad_norm": 2.345055341720581, + "learning_rate": 4.9434353191863595e-05, + "loss": 5.1616, + "step": 11408 + }, + { + "epoch": 0.06785255495289752, + "grad_norm": 2.3967537879943848, + "learning_rate": 4.9434254387905395e-05, + "loss": 5.1805, + "step": 11409 + }, + { + "epoch": 0.06785850223617851, + "grad_norm": 2.2108283042907715, + "learning_rate": 4.943415557541745e-05, + "loss": 5.381, + "step": 11410 + }, + { + "epoch": 0.06786444951945951, + "grad_norm": 2.178776979446411, + "learning_rate": 4.94340567543998e-05, + "loss": 5.4016, + "step": 11411 + }, + { + "epoch": 0.06787039680274051, + "grad_norm": 2.003169059753418, + "learning_rate": 4.943395792485247e-05, + "loss": 5.5632, + "step": 11412 + }, + { + "epoch": 0.0678763440860215, + "grad_norm": 2.0337789058685303, + "learning_rate": 4.9433859086775506e-05, + "loss": 5.4476, + "step": 11413 + }, + { + "epoch": 0.0678822913693025, + "grad_norm": 1.784868836402893, + "learning_rate": 4.943376024016892e-05, + "loss": 5.3578, + "step": 11414 + }, + { + "epoch": 0.0678882386525835, + "grad_norm": 1.7282286882400513, + "learning_rate": 4.943366138503277e-05, + "loss": 5.6202, + "step": 11415 + }, + { + "epoch": 0.06789418593586449, + "grad_norm": 1.9716618061065674, + "learning_rate": 4.943356252136707e-05, + "loss": 4.9861, + "step": 11416 + }, + { + "epoch": 0.0679001332191455, + "grad_norm": 2.399317502975464, + "learning_rate": 4.943346364917188e-05, + "loss": 4.4494, + "step": 11417 + }, + { + "epoch": 0.0679060805024265, + "grad_norm": 2.142995834350586, + "learning_rate": 4.943336476844722e-05, + "loss": 4.5989, + "step": 11418 + }, + { + "epoch": 0.06791202778570748, + "grad_norm": 1.9394404888153076, + "learning_rate": 4.943326587919311e-05, + "loss": 4.4944, + "step": 11419 + }, + { + "epoch": 0.06791797506898849, + "grad_norm": 2.41937518119812, + "learning_rate": 4.9433166981409615e-05, + "loss": 5.1687, + "step": 11420 + }, + { + "epoch": 0.06792392235226949, + "grad_norm": 2.1686136722564697, + "learning_rate": 4.943306807509675e-05, + "loss": 6.2976, + "step": 11421 + }, + { + "epoch": 0.06792986963555048, + "grad_norm": 1.9649391174316406, + "learning_rate": 4.943296916025455e-05, + "loss": 6.0242, + "step": 11422 + }, + { + "epoch": 0.06793581691883148, + "grad_norm": 1.9251484870910645, + "learning_rate": 4.943287023688305e-05, + "loss": 5.9777, + "step": 11423 + }, + { + "epoch": 0.06794176420211248, + "grad_norm": 1.838348388671875, + "learning_rate": 4.9432771304982296e-05, + "loss": 5.8669, + "step": 11424 + }, + { + "epoch": 0.06794771148539347, + "grad_norm": 2.5417487621307373, + "learning_rate": 4.94326723645523e-05, + "loss": 5.5131, + "step": 11425 + }, + { + "epoch": 0.06795365876867447, + "grad_norm": 2.2175936698913574, + "learning_rate": 4.943257341559312e-05, + "loss": 5.4657, + "step": 11426 + }, + { + "epoch": 0.06795960605195547, + "grad_norm": 2.4474873542785645, + "learning_rate": 4.943247445810478e-05, + "loss": 5.2401, + "step": 11427 + }, + { + "epoch": 0.06796555333523646, + "grad_norm": 2.176483392715454, + "learning_rate": 4.9432375492087324e-05, + "loss": 5.7295, + "step": 11428 + }, + { + "epoch": 0.06797150061851746, + "grad_norm": 1.9311527013778687, + "learning_rate": 4.943227651754077e-05, + "loss": 5.8135, + "step": 11429 + }, + { + "epoch": 0.06797744790179845, + "grad_norm": 2.2462544441223145, + "learning_rate": 4.943217753446516e-05, + "loss": 6.0761, + "step": 11430 + }, + { + "epoch": 0.06798339518507945, + "grad_norm": 2.3158276081085205, + "learning_rate": 4.943207854286053e-05, + "loss": 6.0223, + "step": 11431 + }, + { + "epoch": 0.06798934246836046, + "grad_norm": 1.6222623586654663, + "learning_rate": 4.9431979542726914e-05, + "loss": 5.9417, + "step": 11432 + }, + { + "epoch": 0.06799528975164144, + "grad_norm": 1.9809083938598633, + "learning_rate": 4.9431880534064345e-05, + "loss": 5.7476, + "step": 11433 + }, + { + "epoch": 0.06800123703492245, + "grad_norm": 1.9575468301773071, + "learning_rate": 4.9431781516872865e-05, + "loss": 5.6169, + "step": 11434 + }, + { + "epoch": 0.06800718431820345, + "grad_norm": 2.1103882789611816, + "learning_rate": 4.9431682491152495e-05, + "loss": 5.5119, + "step": 11435 + }, + { + "epoch": 0.06801313160148444, + "grad_norm": 2.280287265777588, + "learning_rate": 4.943158345690328e-05, + "loss": 5.2622, + "step": 11436 + }, + { + "epoch": 0.06801907888476544, + "grad_norm": 2.582737684249878, + "learning_rate": 4.943148441412525e-05, + "loss": 5.2644, + "step": 11437 + }, + { + "epoch": 0.06802502616804644, + "grad_norm": 2.1919124126434326, + "learning_rate": 4.9431385362818446e-05, + "loss": 5.0717, + "step": 11438 + }, + { + "epoch": 0.06803097345132743, + "grad_norm": 2.3036141395568848, + "learning_rate": 4.9431286302982896e-05, + "loss": 5.0049, + "step": 11439 + }, + { + "epoch": 0.06803692073460843, + "grad_norm": 2.3675789833068848, + "learning_rate": 4.943118723461864e-05, + "loss": 5.4686, + "step": 11440 + }, + { + "epoch": 0.06804286801788943, + "grad_norm": 2.8305327892303467, + "learning_rate": 4.94310881577257e-05, + "loss": 5.3409, + "step": 11441 + }, + { + "epoch": 0.06804881530117042, + "grad_norm": 1.562173843383789, + "learning_rate": 4.9430989072304126e-05, + "loss": 5.6801, + "step": 11442 + }, + { + "epoch": 0.06805476258445142, + "grad_norm": 1.9728971719741821, + "learning_rate": 4.9430889978353945e-05, + "loss": 5.4252, + "step": 11443 + }, + { + "epoch": 0.06806070986773242, + "grad_norm": 2.054025173187256, + "learning_rate": 4.9430790875875185e-05, + "loss": 5.1155, + "step": 11444 + }, + { + "epoch": 0.06806665715101341, + "grad_norm": 1.8511056900024414, + "learning_rate": 4.9430691764867895e-05, + "loss": 5.102, + "step": 11445 + }, + { + "epoch": 0.06807260443429441, + "grad_norm": 1.9024226665496826, + "learning_rate": 4.943059264533211e-05, + "loss": 5.0761, + "step": 11446 + }, + { + "epoch": 0.06807855171757542, + "grad_norm": 2.4767966270446777, + "learning_rate": 4.9430493517267843e-05, + "loss": 4.9809, + "step": 11447 + }, + { + "epoch": 0.0680844990008564, + "grad_norm": 2.393517255783081, + "learning_rate": 4.943039438067515e-05, + "loss": 5.1191, + "step": 11448 + }, + { + "epoch": 0.06809044628413741, + "grad_norm": 1.9510548114776611, + "learning_rate": 4.9430295235554055e-05, + "loss": 5.7117, + "step": 11449 + }, + { + "epoch": 0.06809639356741841, + "grad_norm": 2.1002418994903564, + "learning_rate": 4.9430196081904605e-05, + "loss": 5.7003, + "step": 11450 + }, + { + "epoch": 0.0681023408506994, + "grad_norm": 2.5328590869903564, + "learning_rate": 4.943009691972682e-05, + "loss": 6.1835, + "step": 11451 + }, + { + "epoch": 0.0681082881339804, + "grad_norm": 1.9173791408538818, + "learning_rate": 4.9429997749020743e-05, + "loss": 5.9596, + "step": 11452 + }, + { + "epoch": 0.0681142354172614, + "grad_norm": 2.0781052112579346, + "learning_rate": 4.9429898569786406e-05, + "loss": 5.7335, + "step": 11453 + }, + { + "epoch": 0.06812018270054239, + "grad_norm": 2.4210550785064697, + "learning_rate": 4.942979938202384e-05, + "loss": 4.9888, + "step": 11454 + }, + { + "epoch": 0.06812612998382339, + "grad_norm": 1.8438634872436523, + "learning_rate": 4.942970018573309e-05, + "loss": 5.8027, + "step": 11455 + }, + { + "epoch": 0.0681320772671044, + "grad_norm": 2.122882843017578, + "learning_rate": 4.942960098091418e-05, + "loss": 5.8569, + "step": 11456 + }, + { + "epoch": 0.06813802455038538, + "grad_norm": 1.6002168655395508, + "learning_rate": 4.942950176756715e-05, + "loss": 5.7362, + "step": 11457 + }, + { + "epoch": 0.06814397183366638, + "grad_norm": 1.8086539506912231, + "learning_rate": 4.942940254569203e-05, + "loss": 5.7537, + "step": 11458 + }, + { + "epoch": 0.06814991911694737, + "grad_norm": 2.0441513061523438, + "learning_rate": 4.942930331528886e-05, + "loss": 5.8255, + "step": 11459 + }, + { + "epoch": 0.06815586640022837, + "grad_norm": 1.8272675275802612, + "learning_rate": 4.942920407635767e-05, + "loss": 5.6915, + "step": 11460 + }, + { + "epoch": 0.06816181368350938, + "grad_norm": 3.3902077674865723, + "learning_rate": 4.94291048288985e-05, + "loss": 4.719, + "step": 11461 + }, + { + "epoch": 0.06816776096679036, + "grad_norm": 3.1770875453948975, + "learning_rate": 4.9429005572911385e-05, + "loss": 4.401, + "step": 11462 + }, + { + "epoch": 0.06817370825007137, + "grad_norm": 1.9011846780776978, + "learning_rate": 4.9428906308396355e-05, + "loss": 5.4768, + "step": 11463 + }, + { + "epoch": 0.06817965553335237, + "grad_norm": 1.7608321905136108, + "learning_rate": 4.9428807035353443e-05, + "loss": 5.5755, + "step": 11464 + }, + { + "epoch": 0.06818560281663336, + "grad_norm": 1.8250397443771362, + "learning_rate": 4.9428707753782686e-05, + "loss": 5.7804, + "step": 11465 + }, + { + "epoch": 0.06819155009991436, + "grad_norm": 2.566436290740967, + "learning_rate": 4.942860846368412e-05, + "loss": 5.0442, + "step": 11466 + }, + { + "epoch": 0.06819749738319536, + "grad_norm": 3.336547613143921, + "learning_rate": 4.942850916505779e-05, + "loss": 4.5331, + "step": 11467 + }, + { + "epoch": 0.06820344466647635, + "grad_norm": 2.6383185386657715, + "learning_rate": 4.9428409857903714e-05, + "loss": 4.5301, + "step": 11468 + }, + { + "epoch": 0.06820939194975735, + "grad_norm": 2.3853955268859863, + "learning_rate": 4.9428310542221924e-05, + "loss": 4.3398, + "step": 11469 + }, + { + "epoch": 0.06821533923303835, + "grad_norm": 2.3954038619995117, + "learning_rate": 4.942821121801246e-05, + "loss": 5.0841, + "step": 11470 + }, + { + "epoch": 0.06822128651631934, + "grad_norm": 2.922161340713501, + "learning_rate": 4.942811188527537e-05, + "loss": 4.5573, + "step": 11471 + }, + { + "epoch": 0.06822723379960034, + "grad_norm": 2.7202560901641846, + "learning_rate": 4.942801254401068e-05, + "loss": 4.5047, + "step": 11472 + }, + { + "epoch": 0.06823318108288134, + "grad_norm": 2.2289440631866455, + "learning_rate": 4.9427913194218424e-05, + "loss": 5.4686, + "step": 11473 + }, + { + "epoch": 0.06823912836616233, + "grad_norm": 2.2033851146698, + "learning_rate": 4.9427813835898635e-05, + "loss": 5.3554, + "step": 11474 + }, + { + "epoch": 0.06824507564944333, + "grad_norm": 2.171147346496582, + "learning_rate": 4.9427714469051345e-05, + "loss": 5.504, + "step": 11475 + }, + { + "epoch": 0.06825102293272434, + "grad_norm": 2.0110602378845215, + "learning_rate": 4.9427615093676594e-05, + "loss": 5.6126, + "step": 11476 + }, + { + "epoch": 0.06825697021600532, + "grad_norm": 2.08642840385437, + "learning_rate": 4.942751570977441e-05, + "loss": 6.0948, + "step": 11477 + }, + { + "epoch": 0.06826291749928633, + "grad_norm": 2.12245774269104, + "learning_rate": 4.9427416317344835e-05, + "loss": 5.2845, + "step": 11478 + }, + { + "epoch": 0.06826886478256733, + "grad_norm": 1.9155166149139404, + "learning_rate": 4.942731691638791e-05, + "loss": 5.4674, + "step": 11479 + }, + { + "epoch": 0.06827481206584832, + "grad_norm": 2.3452367782592773, + "learning_rate": 4.942721750690365e-05, + "loss": 5.2368, + "step": 11480 + }, + { + "epoch": 0.06828075934912932, + "grad_norm": 2.1282498836517334, + "learning_rate": 4.9427118088892105e-05, + "loss": 5.348, + "step": 11481 + }, + { + "epoch": 0.06828670663241032, + "grad_norm": 1.9251933097839355, + "learning_rate": 4.9427018662353306e-05, + "loss": 5.2588, + "step": 11482 + }, + { + "epoch": 0.06829265391569131, + "grad_norm": 1.9481078386306763, + "learning_rate": 4.942691922728728e-05, + "loss": 5.2775, + "step": 11483 + }, + { + "epoch": 0.06829860119897231, + "grad_norm": 1.9506112337112427, + "learning_rate": 4.942681978369408e-05, + "loss": 5.6865, + "step": 11484 + }, + { + "epoch": 0.06830454848225331, + "grad_norm": 2.0636112689971924, + "learning_rate": 4.942672033157373e-05, + "loss": 6.218, + "step": 11485 + }, + { + "epoch": 0.0683104957655343, + "grad_norm": 1.8479397296905518, + "learning_rate": 4.9426620870926256e-05, + "loss": 6.1283, + "step": 11486 + }, + { + "epoch": 0.0683164430488153, + "grad_norm": 1.9079830646514893, + "learning_rate": 4.94265214017517e-05, + "loss": 6.127, + "step": 11487 + }, + { + "epoch": 0.06832239033209629, + "grad_norm": 2.1076481342315674, + "learning_rate": 4.9426421924050105e-05, + "loss": 5.9978, + "step": 11488 + }, + { + "epoch": 0.0683283376153773, + "grad_norm": 1.885231375694275, + "learning_rate": 4.942632243782149e-05, + "loss": 5.8269, + "step": 11489 + }, + { + "epoch": 0.0683342848986583, + "grad_norm": 1.968980073928833, + "learning_rate": 4.942622294306591e-05, + "loss": 5.899, + "step": 11490 + }, + { + "epoch": 0.06834023218193928, + "grad_norm": 1.9857345819473267, + "learning_rate": 4.9426123439783376e-05, + "loss": 5.9416, + "step": 11491 + }, + { + "epoch": 0.06834617946522029, + "grad_norm": 1.8433799743652344, + "learning_rate": 4.942602392797394e-05, + "loss": 6.0714, + "step": 11492 + }, + { + "epoch": 0.06835212674850129, + "grad_norm": 1.9299565553665161, + "learning_rate": 4.942592440763764e-05, + "loss": 6.14, + "step": 11493 + }, + { + "epoch": 0.06835807403178228, + "grad_norm": 1.5700571537017822, + "learning_rate": 4.9425824878774486e-05, + "loss": 6.0496, + "step": 11494 + }, + { + "epoch": 0.06836402131506328, + "grad_norm": 1.6914032697677612, + "learning_rate": 4.942572534138454e-05, + "loss": 5.8301, + "step": 11495 + }, + { + "epoch": 0.06836996859834428, + "grad_norm": 1.6765984296798706, + "learning_rate": 4.942562579546782e-05, + "loss": 6.0701, + "step": 11496 + }, + { + "epoch": 0.06837591588162527, + "grad_norm": 1.715425729751587, + "learning_rate": 4.9425526241024364e-05, + "loss": 5.9499, + "step": 11497 + }, + { + "epoch": 0.06838186316490627, + "grad_norm": 1.8849130868911743, + "learning_rate": 4.942542667805422e-05, + "loss": 5.7088, + "step": 11498 + }, + { + "epoch": 0.06838781044818727, + "grad_norm": 2.1290276050567627, + "learning_rate": 4.9425327106557405e-05, + "loss": 5.9329, + "step": 11499 + }, + { + "epoch": 0.06839375773146826, + "grad_norm": 1.9105192422866821, + "learning_rate": 4.942522752653396e-05, + "loss": 5.9068, + "step": 11500 + }, + { + "epoch": 0.06839970501474926, + "grad_norm": 1.9120036363601685, + "learning_rate": 4.9425127937983926e-05, + "loss": 5.8411, + "step": 11501 + }, + { + "epoch": 0.06840565229803026, + "grad_norm": 2.1045427322387695, + "learning_rate": 4.942502834090732e-05, + "loss": 6.1575, + "step": 11502 + }, + { + "epoch": 0.06841159958131125, + "grad_norm": 1.8271901607513428, + "learning_rate": 4.94249287353042e-05, + "loss": 6.0732, + "step": 11503 + }, + { + "epoch": 0.06841754686459225, + "grad_norm": 1.4770866632461548, + "learning_rate": 4.942482912117459e-05, + "loss": 6.0823, + "step": 11504 + }, + { + "epoch": 0.06842349414787326, + "grad_norm": 1.7055792808532715, + "learning_rate": 4.942472949851852e-05, + "loss": 6.0738, + "step": 11505 + }, + { + "epoch": 0.06842944143115424, + "grad_norm": 1.588705062866211, + "learning_rate": 4.942462986733602e-05, + "loss": 5.9731, + "step": 11506 + }, + { + "epoch": 0.06843538871443525, + "grad_norm": 2.662527561187744, + "learning_rate": 4.942453022762715e-05, + "loss": 5.7745, + "step": 11507 + }, + { + "epoch": 0.06844133599771625, + "grad_norm": 2.0649495124816895, + "learning_rate": 4.9424430579391925e-05, + "loss": 5.7173, + "step": 11508 + }, + { + "epoch": 0.06844728328099724, + "grad_norm": 1.647801160812378, + "learning_rate": 4.942433092263038e-05, + "loss": 6.1516, + "step": 11509 + }, + { + "epoch": 0.06845323056427824, + "grad_norm": 1.743788480758667, + "learning_rate": 4.942423125734256e-05, + "loss": 6.0211, + "step": 11510 + }, + { + "epoch": 0.06845917784755924, + "grad_norm": 1.898647665977478, + "learning_rate": 4.942413158352849e-05, + "loss": 6.0106, + "step": 11511 + }, + { + "epoch": 0.06846512513084023, + "grad_norm": 1.5159860849380493, + "learning_rate": 4.94240319011882e-05, + "loss": 5.8759, + "step": 11512 + }, + { + "epoch": 0.06847107241412123, + "grad_norm": 3.265730142593384, + "learning_rate": 4.9423932210321744e-05, + "loss": 4.7228, + "step": 11513 + }, + { + "epoch": 0.06847701969740223, + "grad_norm": 2.9290871620178223, + "learning_rate": 4.9423832510929136e-05, + "loss": 4.5315, + "step": 11514 + }, + { + "epoch": 0.06848296698068322, + "grad_norm": 2.4189975261688232, + "learning_rate": 4.942373280301042e-05, + "loss": 4.5803, + "step": 11515 + }, + { + "epoch": 0.06848891426396422, + "grad_norm": 2.4018993377685547, + "learning_rate": 4.9423633086565645e-05, + "loss": 5.1411, + "step": 11516 + }, + { + "epoch": 0.06849486154724521, + "grad_norm": 2.4697556495666504, + "learning_rate": 4.9423533361594824e-05, + "loss": 5.1523, + "step": 11517 + }, + { + "epoch": 0.06850080883052621, + "grad_norm": 2.1573715209960938, + "learning_rate": 4.942343362809799e-05, + "loss": 5.3488, + "step": 11518 + }, + { + "epoch": 0.06850675611380722, + "grad_norm": 1.9723131656646729, + "learning_rate": 4.9423333886075205e-05, + "loss": 5.2315, + "step": 11519 + }, + { + "epoch": 0.0685127033970882, + "grad_norm": 1.6925430297851562, + "learning_rate": 4.9423234135526475e-05, + "loss": 5.3055, + "step": 11520 + }, + { + "epoch": 0.0685186506803692, + "grad_norm": 2.8665122985839844, + "learning_rate": 4.942313437645185e-05, + "loss": 4.4905, + "step": 11521 + }, + { + "epoch": 0.06852459796365021, + "grad_norm": 2.7538015842437744, + "learning_rate": 4.942303460885136e-05, + "loss": 4.3863, + "step": 11522 + }, + { + "epoch": 0.0685305452469312, + "grad_norm": 2.335664987564087, + "learning_rate": 4.942293483272504e-05, + "loss": 4.4571, + "step": 11523 + }, + { + "epoch": 0.0685364925302122, + "grad_norm": 1.7987995147705078, + "learning_rate": 4.942283504807293e-05, + "loss": 5.1802, + "step": 11524 + }, + { + "epoch": 0.0685424398134932, + "grad_norm": 2.3286690711975098, + "learning_rate": 4.9422735254895056e-05, + "loss": 5.2883, + "step": 11525 + }, + { + "epoch": 0.06854838709677419, + "grad_norm": 2.093317747116089, + "learning_rate": 4.9422635453191466e-05, + "loss": 5.2589, + "step": 11526 + }, + { + "epoch": 0.06855433438005519, + "grad_norm": 1.914236307144165, + "learning_rate": 4.942253564296218e-05, + "loss": 5.4347, + "step": 11527 + }, + { + "epoch": 0.06856028166333619, + "grad_norm": 1.602265477180481, + "learning_rate": 4.942243582420724e-05, + "loss": 5.8021, + "step": 11528 + }, + { + "epoch": 0.06856622894661718, + "grad_norm": 1.4433797597885132, + "learning_rate": 4.9422335996926674e-05, + "loss": 5.7432, + "step": 11529 + }, + { + "epoch": 0.06857217622989818, + "grad_norm": 1.3481166362762451, + "learning_rate": 4.942223616112053e-05, + "loss": 5.2946, + "step": 11530 + }, + { + "epoch": 0.06857812351317918, + "grad_norm": 1.879550576210022, + "learning_rate": 4.942213631678883e-05, + "loss": 5.2669, + "step": 11531 + }, + { + "epoch": 0.06858407079646017, + "grad_norm": 2.7241995334625244, + "learning_rate": 4.942203646393162e-05, + "loss": 5.2248, + "step": 11532 + }, + { + "epoch": 0.06859001807974117, + "grad_norm": 1.9870814085006714, + "learning_rate": 4.942193660254892e-05, + "loss": 5.4025, + "step": 11533 + }, + { + "epoch": 0.06859596536302218, + "grad_norm": 1.89231276512146, + "learning_rate": 4.942183673264079e-05, + "loss": 5.6046, + "step": 11534 + }, + { + "epoch": 0.06860191264630316, + "grad_norm": 2.024684429168701, + "learning_rate": 4.9421736854207235e-05, + "loss": 5.4031, + "step": 11535 + }, + { + "epoch": 0.06860785992958417, + "grad_norm": 1.6764521598815918, + "learning_rate": 4.942163696724831e-05, + "loss": 5.702, + "step": 11536 + }, + { + "epoch": 0.06861380721286517, + "grad_norm": 1.7738621234893799, + "learning_rate": 4.942153707176405e-05, + "loss": 5.1491, + "step": 11537 + }, + { + "epoch": 0.06861975449614616, + "grad_norm": 1.416986346244812, + "learning_rate": 4.942143716775447e-05, + "loss": 5.3883, + "step": 11538 + }, + { + "epoch": 0.06862570177942716, + "grad_norm": 1.837067723274231, + "learning_rate": 4.942133725521963e-05, + "loss": 5.2945, + "step": 11539 + }, + { + "epoch": 0.06863164906270816, + "grad_norm": 1.995610237121582, + "learning_rate": 4.942123733415955e-05, + "loss": 5.2589, + "step": 11540 + }, + { + "epoch": 0.06863759634598915, + "grad_norm": 1.9689414501190186, + "learning_rate": 4.9421137404574264e-05, + "loss": 5.3715, + "step": 11541 + }, + { + "epoch": 0.06864354362927015, + "grad_norm": 1.6984235048294067, + "learning_rate": 4.942103746646382e-05, + "loss": 5.3987, + "step": 11542 + }, + { + "epoch": 0.06864949091255115, + "grad_norm": 1.2645832300186157, + "learning_rate": 4.9420937519828234e-05, + "loss": 5.2142, + "step": 11543 + }, + { + "epoch": 0.06865543819583214, + "grad_norm": 1.6830233335494995, + "learning_rate": 4.9420837564667556e-05, + "loss": 5.1172, + "step": 11544 + }, + { + "epoch": 0.06866138547911314, + "grad_norm": 1.5734926462173462, + "learning_rate": 4.9420737600981816e-05, + "loss": 5.3789, + "step": 11545 + }, + { + "epoch": 0.06866733276239413, + "grad_norm": 1.7375764846801758, + "learning_rate": 4.942063762877105e-05, + "loss": 5.5311, + "step": 11546 + }, + { + "epoch": 0.06867328004567513, + "grad_norm": 1.5421762466430664, + "learning_rate": 4.942053764803529e-05, + "loss": 5.1722, + "step": 11547 + }, + { + "epoch": 0.06867922732895614, + "grad_norm": 1.6282575130462646, + "learning_rate": 4.942043765877457e-05, + "loss": 5.4754, + "step": 11548 + }, + { + "epoch": 0.06868517461223712, + "grad_norm": 1.5595266819000244, + "learning_rate": 4.9420337660988936e-05, + "loss": 5.3516, + "step": 11549 + }, + { + "epoch": 0.06869112189551813, + "grad_norm": 1.5642317533493042, + "learning_rate": 4.9420237654678405e-05, + "loss": 5.2364, + "step": 11550 + }, + { + "epoch": 0.06869706917879913, + "grad_norm": 1.5491602420806885, + "learning_rate": 4.942013763984302e-05, + "loss": 5.1566, + "step": 11551 + }, + { + "epoch": 0.06870301646208012, + "grad_norm": 1.4256258010864258, + "learning_rate": 4.942003761648283e-05, + "loss": 5.1592, + "step": 11552 + }, + { + "epoch": 0.06870896374536112, + "grad_norm": 1.756016492843628, + "learning_rate": 4.9419937584597846e-05, + "loss": 5.012, + "step": 11553 + }, + { + "epoch": 0.06871491102864212, + "grad_norm": 2.5290040969848633, + "learning_rate": 4.941983754418812e-05, + "loss": 4.571, + "step": 11554 + }, + { + "epoch": 0.06872085831192311, + "grad_norm": 2.6146528720855713, + "learning_rate": 4.9419737495253685e-05, + "loss": 4.3515, + "step": 11555 + }, + { + "epoch": 0.06872680559520411, + "grad_norm": 2.3333144187927246, + "learning_rate": 4.941963743779456e-05, + "loss": 4.3032, + "step": 11556 + }, + { + "epoch": 0.06873275287848511, + "grad_norm": 2.342433452606201, + "learning_rate": 4.9419537371810795e-05, + "loss": 4.2942, + "step": 11557 + }, + { + "epoch": 0.0687387001617661, + "grad_norm": 2.423696517944336, + "learning_rate": 4.941943729730243e-05, + "loss": 4.4, + "step": 11558 + }, + { + "epoch": 0.0687446474450471, + "grad_norm": 2.3420050144195557, + "learning_rate": 4.941933721426948e-05, + "loss": 5.0466, + "step": 11559 + }, + { + "epoch": 0.0687505947283281, + "grad_norm": 2.7115821838378906, + "learning_rate": 4.9419237122712e-05, + "loss": 5.1197, + "step": 11560 + }, + { + "epoch": 0.06875654201160909, + "grad_norm": 2.7316489219665527, + "learning_rate": 4.9419137022630014e-05, + "loss": 5.2435, + "step": 11561 + }, + { + "epoch": 0.0687624892948901, + "grad_norm": 2.291551113128662, + "learning_rate": 4.941903691402356e-05, + "loss": 5.0345, + "step": 11562 + }, + { + "epoch": 0.0687684365781711, + "grad_norm": 2.4499049186706543, + "learning_rate": 4.941893679689267e-05, + "loss": 4.503, + "step": 11563 + }, + { + "epoch": 0.06877438386145208, + "grad_norm": 2.7120168209075928, + "learning_rate": 4.9418836671237385e-05, + "loss": 4.2954, + "step": 11564 + }, + { + "epoch": 0.06878033114473309, + "grad_norm": 2.8483526706695557, + "learning_rate": 4.941873653705774e-05, + "loss": 6.269, + "step": 11565 + }, + { + "epoch": 0.06878627842801409, + "grad_norm": 2.3191473484039307, + "learning_rate": 4.941863639435376e-05, + "loss": 6.1628, + "step": 11566 + }, + { + "epoch": 0.06879222571129508, + "grad_norm": 3.4622583389282227, + "learning_rate": 4.9418536243125486e-05, + "loss": 5.6115, + "step": 11567 + }, + { + "epoch": 0.06879817299457608, + "grad_norm": 1.7118897438049316, + "learning_rate": 4.941843608337295e-05, + "loss": 5.4801, + "step": 11568 + }, + { + "epoch": 0.06880412027785708, + "grad_norm": 2.876338243484497, + "learning_rate": 4.9418335915096195e-05, + "loss": 5.0806, + "step": 11569 + }, + { + "epoch": 0.06881006756113807, + "grad_norm": 2.2875587940216064, + "learning_rate": 4.941823573829525e-05, + "loss": 5.2833, + "step": 11570 + }, + { + "epoch": 0.06881601484441907, + "grad_norm": 1.797743320465088, + "learning_rate": 4.9418135552970155e-05, + "loss": 6.1407, + "step": 11571 + }, + { + "epoch": 0.06882196212770007, + "grad_norm": 1.957331895828247, + "learning_rate": 4.941803535912094e-05, + "loss": 5.8743, + "step": 11572 + }, + { + "epoch": 0.06882790941098106, + "grad_norm": 1.9552925825119019, + "learning_rate": 4.9417935156747644e-05, + "loss": 5.584, + "step": 11573 + }, + { + "epoch": 0.06883385669426206, + "grad_norm": 2.057610034942627, + "learning_rate": 4.94178349458503e-05, + "loss": 5.8445, + "step": 11574 + }, + { + "epoch": 0.06883980397754305, + "grad_norm": 1.7856727838516235, + "learning_rate": 4.941773472642893e-05, + "loss": 6.0133, + "step": 11575 + }, + { + "epoch": 0.06884575126082405, + "grad_norm": 1.4494417905807495, + "learning_rate": 4.941763449848359e-05, + "loss": 5.888, + "step": 11576 + }, + { + "epoch": 0.06885169854410506, + "grad_norm": 2.1377499103546143, + "learning_rate": 4.9417534262014306e-05, + "loss": 6.0604, + "step": 11577 + }, + { + "epoch": 0.06885764582738604, + "grad_norm": 1.769888162612915, + "learning_rate": 4.9417434017021105e-05, + "loss": 5.8815, + "step": 11578 + }, + { + "epoch": 0.06886359311066705, + "grad_norm": 1.933935523033142, + "learning_rate": 4.9417333763504036e-05, + "loss": 5.6601, + "step": 11579 + }, + { + "epoch": 0.06886954039394805, + "grad_norm": 1.8672062158584595, + "learning_rate": 4.941723350146313e-05, + "loss": 5.8143, + "step": 11580 + }, + { + "epoch": 0.06887548767722904, + "grad_norm": 1.9899057149887085, + "learning_rate": 4.941713323089842e-05, + "loss": 5.8465, + "step": 11581 + }, + { + "epoch": 0.06888143496051004, + "grad_norm": 2.1053643226623535, + "learning_rate": 4.941703295180994e-05, + "loss": 5.4582, + "step": 11582 + }, + { + "epoch": 0.06888738224379104, + "grad_norm": 1.9435245990753174, + "learning_rate": 4.9416932664197726e-05, + "loss": 5.8503, + "step": 11583 + }, + { + "epoch": 0.06889332952707203, + "grad_norm": 1.9407175779342651, + "learning_rate": 4.941683236806181e-05, + "loss": 5.706, + "step": 11584 + }, + { + "epoch": 0.06889927681035303, + "grad_norm": 2.0505893230438232, + "learning_rate": 4.941673206340224e-05, + "loss": 6.01, + "step": 11585 + }, + { + "epoch": 0.06890522409363403, + "grad_norm": 1.6713486909866333, + "learning_rate": 4.941663175021903e-05, + "loss": 5.8347, + "step": 11586 + }, + { + "epoch": 0.06891117137691502, + "grad_norm": 1.5333812236785889, + "learning_rate": 4.941653142851223e-05, + "loss": 5.8493, + "step": 11587 + }, + { + "epoch": 0.06891711866019602, + "grad_norm": 2.10982346534729, + "learning_rate": 4.9416431098281865e-05, + "loss": 5.4037, + "step": 11588 + }, + { + "epoch": 0.06892306594347702, + "grad_norm": 1.766663908958435, + "learning_rate": 4.9416330759527985e-05, + "loss": 5.0335, + "step": 11589 + }, + { + "epoch": 0.06892901322675801, + "grad_norm": 2.0600688457489014, + "learning_rate": 4.9416230412250615e-05, + "loss": 5.4017, + "step": 11590 + }, + { + "epoch": 0.06893496051003901, + "grad_norm": 1.6271671056747437, + "learning_rate": 4.941613005644979e-05, + "loss": 5.903, + "step": 11591 + }, + { + "epoch": 0.06894090779332002, + "grad_norm": 1.9222697019577026, + "learning_rate": 4.9416029692125544e-05, + "loss": 5.1666, + "step": 11592 + }, + { + "epoch": 0.068946855076601, + "grad_norm": 1.7405030727386475, + "learning_rate": 4.941592931927792e-05, + "loss": 5.0799, + "step": 11593 + }, + { + "epoch": 0.068952802359882, + "grad_norm": 1.7639994621276855, + "learning_rate": 4.941582893790694e-05, + "loss": 5.7596, + "step": 11594 + }, + { + "epoch": 0.06895874964316301, + "grad_norm": 1.9628292322158813, + "learning_rate": 4.941572854801265e-05, + "loss": 4.4573, + "step": 11595 + }, + { + "epoch": 0.068964696926444, + "grad_norm": 1.7616615295410156, + "learning_rate": 4.941562814959508e-05, + "loss": 4.6399, + "step": 11596 + }, + { + "epoch": 0.068970644209725, + "grad_norm": 1.8174281120300293, + "learning_rate": 4.9415527742654265e-05, + "loss": 5.6279, + "step": 11597 + }, + { + "epoch": 0.068976591493006, + "grad_norm": 1.563138723373413, + "learning_rate": 4.941542732719025e-05, + "loss": 5.8696, + "step": 11598 + }, + { + "epoch": 0.06898253877628699, + "grad_norm": 1.4704676866531372, + "learning_rate": 4.9415326903203055e-05, + "loss": 5.7129, + "step": 11599 + }, + { + "epoch": 0.06898848605956799, + "grad_norm": 2.484572410583496, + "learning_rate": 4.9415226470692724e-05, + "loss": 5.336, + "step": 11600 + }, + { + "epoch": 0.068994433342849, + "grad_norm": 1.882876992225647, + "learning_rate": 4.9415126029659284e-05, + "loss": 5.4273, + "step": 11601 + }, + { + "epoch": 0.06900038062612998, + "grad_norm": 1.7827874422073364, + "learning_rate": 4.941502558010278e-05, + "loss": 5.6699, + "step": 11602 + }, + { + "epoch": 0.06900632790941098, + "grad_norm": 1.5609276294708252, + "learning_rate": 4.941492512202325e-05, + "loss": 5.648, + "step": 11603 + }, + { + "epoch": 0.06901227519269197, + "grad_norm": 1.6941063404083252, + "learning_rate": 4.941482465542071e-05, + "loss": 5.633, + "step": 11604 + }, + { + "epoch": 0.06901822247597297, + "grad_norm": 1.768922209739685, + "learning_rate": 4.941472418029521e-05, + "loss": 5.6072, + "step": 11605 + }, + { + "epoch": 0.06902416975925398, + "grad_norm": 2.225846767425537, + "learning_rate": 4.941462369664679e-05, + "loss": 4.9314, + "step": 11606 + }, + { + "epoch": 0.06903011704253496, + "grad_norm": 2.4479281902313232, + "learning_rate": 4.941452320447546e-05, + "loss": 5.0563, + "step": 11607 + }, + { + "epoch": 0.06903606432581597, + "grad_norm": 2.358238935470581, + "learning_rate": 4.941442270378129e-05, + "loss": 4.9379, + "step": 11608 + }, + { + "epoch": 0.06904201160909697, + "grad_norm": 2.2679247856140137, + "learning_rate": 4.941432219456429e-05, + "loss": 5.0655, + "step": 11609 + }, + { + "epoch": 0.06904795889237796, + "grad_norm": 2.524176597595215, + "learning_rate": 4.94142216768245e-05, + "loss": 4.8694, + "step": 11610 + }, + { + "epoch": 0.06905390617565896, + "grad_norm": 2.1919515132904053, + "learning_rate": 4.9414121150561966e-05, + "loss": 5.0889, + "step": 11611 + }, + { + "epoch": 0.06905985345893996, + "grad_norm": 2.2838563919067383, + "learning_rate": 4.94140206157767e-05, + "loss": 4.9942, + "step": 11612 + }, + { + "epoch": 0.06906580074222095, + "grad_norm": 2.2270026206970215, + "learning_rate": 4.9413920072468764e-05, + "loss": 4.9885, + "step": 11613 + }, + { + "epoch": 0.06907174802550195, + "grad_norm": 2.175245761871338, + "learning_rate": 4.9413819520638176e-05, + "loss": 4.9829, + "step": 11614 + }, + { + "epoch": 0.06907769530878295, + "grad_norm": 2.128441572189331, + "learning_rate": 4.941371896028498e-05, + "loss": 4.9802, + "step": 11615 + }, + { + "epoch": 0.06908364259206394, + "grad_norm": 2.7656328678131104, + "learning_rate": 4.94136183914092e-05, + "loss": 5.1302, + "step": 11616 + }, + { + "epoch": 0.06908958987534494, + "grad_norm": 2.23917818069458, + "learning_rate": 4.941351781401088e-05, + "loss": 4.8766, + "step": 11617 + }, + { + "epoch": 0.06909553715862594, + "grad_norm": 1.861399531364441, + "learning_rate": 4.941341722809005e-05, + "loss": 5.8151, + "step": 11618 + }, + { + "epoch": 0.06910148444190693, + "grad_norm": 2.13590145111084, + "learning_rate": 4.9413316633646754e-05, + "loss": 5.6892, + "step": 11619 + }, + { + "epoch": 0.06910743172518793, + "grad_norm": 1.8261966705322266, + "learning_rate": 4.9413216030681024e-05, + "loss": 6.1387, + "step": 11620 + }, + { + "epoch": 0.06911337900846894, + "grad_norm": 2.5121877193450928, + "learning_rate": 4.941311541919289e-05, + "loss": 5.3217, + "step": 11621 + }, + { + "epoch": 0.06911932629174992, + "grad_norm": 2.1011979579925537, + "learning_rate": 4.941301479918239e-05, + "loss": 5.048, + "step": 11622 + }, + { + "epoch": 0.06912527357503093, + "grad_norm": 2.214597225189209, + "learning_rate": 4.941291417064956e-05, + "loss": 5.4312, + "step": 11623 + }, + { + "epoch": 0.06913122085831193, + "grad_norm": 2.6525864601135254, + "learning_rate": 4.941281353359443e-05, + "loss": 4.4151, + "step": 11624 + }, + { + "epoch": 0.06913716814159292, + "grad_norm": 1.9638911485671997, + "learning_rate": 4.941271288801704e-05, + "loss": 5.0091, + "step": 11625 + }, + { + "epoch": 0.06914311542487392, + "grad_norm": 2.062688112258911, + "learning_rate": 4.941261223391742e-05, + "loss": 5.503, + "step": 11626 + }, + { + "epoch": 0.06914906270815492, + "grad_norm": 2.219430685043335, + "learning_rate": 4.941251157129561e-05, + "loss": 4.984, + "step": 11627 + }, + { + "epoch": 0.06915500999143591, + "grad_norm": 2.0745718479156494, + "learning_rate": 4.941241090015165e-05, + "loss": 5.3094, + "step": 11628 + }, + { + "epoch": 0.06916095727471691, + "grad_norm": 1.8852496147155762, + "learning_rate": 4.941231022048557e-05, + "loss": 5.2424, + "step": 11629 + }, + { + "epoch": 0.06916690455799791, + "grad_norm": 2.335723400115967, + "learning_rate": 4.9412209532297404e-05, + "loss": 5.6031, + "step": 11630 + }, + { + "epoch": 0.0691728518412789, + "grad_norm": 2.167698621749878, + "learning_rate": 4.941210883558719e-05, + "loss": 5.3132, + "step": 11631 + }, + { + "epoch": 0.0691787991245599, + "grad_norm": 2.213068962097168, + "learning_rate": 4.941200813035495e-05, + "loss": 5.2049, + "step": 11632 + }, + { + "epoch": 0.06918474640784089, + "grad_norm": 1.9697870016098022, + "learning_rate": 4.941190741660075e-05, + "loss": 5.3118, + "step": 11633 + }, + { + "epoch": 0.0691906936911219, + "grad_norm": 1.7360777854919434, + "learning_rate": 4.941180669432458e-05, + "loss": 5.444, + "step": 11634 + }, + { + "epoch": 0.0691966409744029, + "grad_norm": 1.8400771617889404, + "learning_rate": 4.9411705963526514e-05, + "loss": 5.6975, + "step": 11635 + }, + { + "epoch": 0.06920258825768388, + "grad_norm": 1.492242693901062, + "learning_rate": 4.941160522420657e-05, + "loss": 5.5617, + "step": 11636 + }, + { + "epoch": 0.06920853554096489, + "grad_norm": 1.6014543771743774, + "learning_rate": 4.9411504476364794e-05, + "loss": 5.7317, + "step": 11637 + }, + { + "epoch": 0.06921448282424589, + "grad_norm": 1.7973628044128418, + "learning_rate": 4.9411403720001215e-05, + "loss": 5.3105, + "step": 11638 + }, + { + "epoch": 0.06922043010752688, + "grad_norm": 1.8314461708068848, + "learning_rate": 4.9411302955115853e-05, + "loss": 5.624, + "step": 11639 + }, + { + "epoch": 0.06922637739080788, + "grad_norm": 1.621315836906433, + "learning_rate": 4.941120218170877e-05, + "loss": 5.8243, + "step": 11640 + }, + { + "epoch": 0.06923232467408888, + "grad_norm": 2.0378596782684326, + "learning_rate": 4.941110139977998e-05, + "loss": 4.9275, + "step": 11641 + }, + { + "epoch": 0.06923827195736987, + "grad_norm": 1.8713582754135132, + "learning_rate": 4.941100060932954e-05, + "loss": 5.1218, + "step": 11642 + }, + { + "epoch": 0.06924421924065087, + "grad_norm": 1.878404140472412, + "learning_rate": 4.941089981035746e-05, + "loss": 5.4997, + "step": 11643 + }, + { + "epoch": 0.06925016652393187, + "grad_norm": 1.7230712175369263, + "learning_rate": 4.941079900286379e-05, + "loss": 5.5514, + "step": 11644 + }, + { + "epoch": 0.06925611380721286, + "grad_norm": 1.6272276639938354, + "learning_rate": 4.941069818684856e-05, + "loss": 5.7186, + "step": 11645 + }, + { + "epoch": 0.06926206109049386, + "grad_norm": 1.5610454082489014, + "learning_rate": 4.9410597362311814e-05, + "loss": 5.8929, + "step": 11646 + }, + { + "epoch": 0.06926800837377486, + "grad_norm": 1.7373837232589722, + "learning_rate": 4.941049652925358e-05, + "loss": 5.6428, + "step": 11647 + }, + { + "epoch": 0.06927395565705585, + "grad_norm": 1.9722628593444824, + "learning_rate": 4.9410395687673886e-05, + "loss": 5.9562, + "step": 11648 + }, + { + "epoch": 0.06927990294033685, + "grad_norm": 1.5603039264678955, + "learning_rate": 4.941029483757278e-05, + "loss": 6.031, + "step": 11649 + }, + { + "epoch": 0.06928585022361786, + "grad_norm": 1.6971800327301025, + "learning_rate": 4.941019397895029e-05, + "loss": 5.7527, + "step": 11650 + }, + { + "epoch": 0.06929179750689884, + "grad_norm": 1.9559118747711182, + "learning_rate": 4.9410093111806456e-05, + "loss": 5.0904, + "step": 11651 + }, + { + "epoch": 0.06929774479017985, + "grad_norm": 1.561122179031372, + "learning_rate": 4.9409992236141315e-05, + "loss": 5.7438, + "step": 11652 + }, + { + "epoch": 0.06930369207346085, + "grad_norm": 1.6071819067001343, + "learning_rate": 4.940989135195489e-05, + "loss": 5.8852, + "step": 11653 + }, + { + "epoch": 0.06930963935674184, + "grad_norm": 1.6804322004318237, + "learning_rate": 4.940979045924723e-05, + "loss": 5.7174, + "step": 11654 + }, + { + "epoch": 0.06931558664002284, + "grad_norm": 1.5802178382873535, + "learning_rate": 4.940968955801836e-05, + "loss": 5.8755, + "step": 11655 + }, + { + "epoch": 0.06932153392330384, + "grad_norm": 2.1002743244171143, + "learning_rate": 4.940958864826832e-05, + "loss": 5.6323, + "step": 11656 + }, + { + "epoch": 0.06932748120658483, + "grad_norm": 1.8874709606170654, + "learning_rate": 4.9409487729997144e-05, + "loss": 5.6798, + "step": 11657 + }, + { + "epoch": 0.06933342848986583, + "grad_norm": 1.6967203617095947, + "learning_rate": 4.940938680320487e-05, + "loss": 5.8461, + "step": 11658 + }, + { + "epoch": 0.06933937577314683, + "grad_norm": 1.9648679494857788, + "learning_rate": 4.9409285867891534e-05, + "loss": 5.842, + "step": 11659 + }, + { + "epoch": 0.06934532305642782, + "grad_norm": 1.8681408166885376, + "learning_rate": 4.940918492405716e-05, + "loss": 5.8859, + "step": 11660 + }, + { + "epoch": 0.06935127033970882, + "grad_norm": 2.0480551719665527, + "learning_rate": 4.9409083971701805e-05, + "loss": 5.6415, + "step": 11661 + }, + { + "epoch": 0.06935721762298983, + "grad_norm": 2.102832555770874, + "learning_rate": 4.940898301082548e-05, + "loss": 5.6163, + "step": 11662 + }, + { + "epoch": 0.06936316490627081, + "grad_norm": 1.7471407651901245, + "learning_rate": 4.940888204142824e-05, + "loss": 5.7973, + "step": 11663 + }, + { + "epoch": 0.06936911218955182, + "grad_norm": 1.9675641059875488, + "learning_rate": 4.94087810635101e-05, + "loss": 5.1125, + "step": 11664 + }, + { + "epoch": 0.0693750594728328, + "grad_norm": 1.6316107511520386, + "learning_rate": 4.940868007707111e-05, + "loss": 5.5067, + "step": 11665 + }, + { + "epoch": 0.0693810067561138, + "grad_norm": 1.8663619756698608, + "learning_rate": 4.940857908211131e-05, + "loss": 5.5552, + "step": 11666 + }, + { + "epoch": 0.06938695403939481, + "grad_norm": 2.155702590942383, + "learning_rate": 4.940847807863072e-05, + "loss": 6.0919, + "step": 11667 + }, + { + "epoch": 0.0693929013226758, + "grad_norm": 1.968467354774475, + "learning_rate": 4.9408377066629384e-05, + "loss": 5.8105, + "step": 11668 + }, + { + "epoch": 0.0693988486059568, + "grad_norm": 1.5245625972747803, + "learning_rate": 4.940827604610734e-05, + "loss": 5.8901, + "step": 11669 + }, + { + "epoch": 0.0694047958892378, + "grad_norm": 1.7377501726150513, + "learning_rate": 4.940817501706461e-05, + "loss": 5.5917, + "step": 11670 + }, + { + "epoch": 0.06941074317251879, + "grad_norm": 1.9668710231781006, + "learning_rate": 4.940807397950125e-05, + "loss": 5.6857, + "step": 11671 + }, + { + "epoch": 0.06941669045579979, + "grad_norm": 1.8168022632598877, + "learning_rate": 4.9407972933417266e-05, + "loss": 5.7032, + "step": 11672 + }, + { + "epoch": 0.06942263773908079, + "grad_norm": 2.4009077548980713, + "learning_rate": 4.940787187881273e-05, + "loss": 5.6767, + "step": 11673 + }, + { + "epoch": 0.06942858502236178, + "grad_norm": 1.8541746139526367, + "learning_rate": 4.940777081568765e-05, + "loss": 5.6327, + "step": 11674 + }, + { + "epoch": 0.06943453230564278, + "grad_norm": 2.028602361679077, + "learning_rate": 4.940766974404206e-05, + "loss": 5.0819, + "step": 11675 + }, + { + "epoch": 0.06944047958892378, + "grad_norm": 2.0870065689086914, + "learning_rate": 4.940756866387602e-05, + "loss": 5.1645, + "step": 11676 + }, + { + "epoch": 0.06944642687220477, + "grad_norm": 1.8009755611419678, + "learning_rate": 4.940746757518954e-05, + "loss": 4.9832, + "step": 11677 + }, + { + "epoch": 0.06945237415548577, + "grad_norm": 2.20975399017334, + "learning_rate": 4.9407366477982675e-05, + "loss": 4.9683, + "step": 11678 + }, + { + "epoch": 0.06945832143876678, + "grad_norm": 1.89133882522583, + "learning_rate": 4.940726537225544e-05, + "loss": 4.7736, + "step": 11679 + }, + { + "epoch": 0.06946426872204776, + "grad_norm": 1.7583657503128052, + "learning_rate": 4.940716425800789e-05, + "loss": 5.4275, + "step": 11680 + }, + { + "epoch": 0.06947021600532877, + "grad_norm": 2.1929352283477783, + "learning_rate": 4.940706313524004e-05, + "loss": 4.8441, + "step": 11681 + }, + { + "epoch": 0.06947616328860977, + "grad_norm": 2.1098999977111816, + "learning_rate": 4.940696200395194e-05, + "loss": 5.065, + "step": 11682 + }, + { + "epoch": 0.06948211057189076, + "grad_norm": 1.7651045322418213, + "learning_rate": 4.940686086414363e-05, + "loss": 5.7086, + "step": 11683 + }, + { + "epoch": 0.06948805785517176, + "grad_norm": 1.6675828695297241, + "learning_rate": 4.9406759715815134e-05, + "loss": 5.89, + "step": 11684 + }, + { + "epoch": 0.06949400513845276, + "grad_norm": 1.9754993915557861, + "learning_rate": 4.940665855896648e-05, + "loss": 5.7752, + "step": 11685 + }, + { + "epoch": 0.06949995242173375, + "grad_norm": 1.7652478218078613, + "learning_rate": 4.940655739359773e-05, + "loss": 5.6518, + "step": 11686 + }, + { + "epoch": 0.06950589970501475, + "grad_norm": 1.898997187614441, + "learning_rate": 4.940645621970889e-05, + "loss": 5.4579, + "step": 11687 + }, + { + "epoch": 0.06951184698829575, + "grad_norm": 2.1233060359954834, + "learning_rate": 4.940635503730001e-05, + "loss": 4.3979, + "step": 11688 + }, + { + "epoch": 0.06951779427157674, + "grad_norm": 2.0859549045562744, + "learning_rate": 4.940625384637113e-05, + "loss": 4.4309, + "step": 11689 + }, + { + "epoch": 0.06952374155485774, + "grad_norm": 2.051492929458618, + "learning_rate": 4.940615264692228e-05, + "loss": 4.4332, + "step": 11690 + }, + { + "epoch": 0.06952968883813875, + "grad_norm": 2.0359628200531006, + "learning_rate": 4.940605143895348e-05, + "loss": 4.29, + "step": 11691 + }, + { + "epoch": 0.06953563612141973, + "grad_norm": 2.0122604370117188, + "learning_rate": 4.940595022246479e-05, + "loss": 4.4391, + "step": 11692 + }, + { + "epoch": 0.06954158340470074, + "grad_norm": 2.059694290161133, + "learning_rate": 4.940584899745624e-05, + "loss": 4.3993, + "step": 11693 + }, + { + "epoch": 0.06954753068798172, + "grad_norm": 2.0355825424194336, + "learning_rate": 4.940574776392786e-05, + "loss": 4.2829, + "step": 11694 + }, + { + "epoch": 0.06955347797126273, + "grad_norm": 1.933385968208313, + "learning_rate": 4.940564652187967e-05, + "loss": 4.372, + "step": 11695 + }, + { + "epoch": 0.06955942525454373, + "grad_norm": 2.0848586559295654, + "learning_rate": 4.940554527131174e-05, + "loss": 4.3064, + "step": 11696 + }, + { + "epoch": 0.06956537253782472, + "grad_norm": 1.889845848083496, + "learning_rate": 4.940544401222407e-05, + "loss": 4.3811, + "step": 11697 + }, + { + "epoch": 0.06957131982110572, + "grad_norm": 2.0076160430908203, + "learning_rate": 4.9405342744616724e-05, + "loss": 4.3382, + "step": 11698 + }, + { + "epoch": 0.06957726710438672, + "grad_norm": 1.9708037376403809, + "learning_rate": 4.940524146848971e-05, + "loss": 4.4659, + "step": 11699 + }, + { + "epoch": 0.06958321438766771, + "grad_norm": 2.086454153060913, + "learning_rate": 4.940514018384309e-05, + "loss": 4.196, + "step": 11700 + }, + { + "epoch": 0.06958916167094871, + "grad_norm": 2.095062255859375, + "learning_rate": 4.940503889067689e-05, + "loss": 4.2062, + "step": 11701 + }, + { + "epoch": 0.06959510895422971, + "grad_norm": 2.0661754608154297, + "learning_rate": 4.940493758899114e-05, + "loss": 4.3468, + "step": 11702 + }, + { + "epoch": 0.0696010562375107, + "grad_norm": 2.073573350906372, + "learning_rate": 4.9404836278785875e-05, + "loss": 4.248, + "step": 11703 + }, + { + "epoch": 0.0696070035207917, + "grad_norm": 2.104018449783325, + "learning_rate": 4.940473496006114e-05, + "loss": 4.1523, + "step": 11704 + }, + { + "epoch": 0.0696129508040727, + "grad_norm": 2.067532777786255, + "learning_rate": 4.9404633632816954e-05, + "loss": 4.2721, + "step": 11705 + }, + { + "epoch": 0.06961889808735369, + "grad_norm": 2.036736249923706, + "learning_rate": 4.9404532297053376e-05, + "loss": 4.4057, + "step": 11706 + }, + { + "epoch": 0.0696248453706347, + "grad_norm": 1.9911088943481445, + "learning_rate": 4.940443095277042e-05, + "loss": 4.1875, + "step": 11707 + }, + { + "epoch": 0.0696307926539157, + "grad_norm": 2.017457962036133, + "learning_rate": 4.9404329599968124e-05, + "loss": 4.1506, + "step": 11708 + }, + { + "epoch": 0.06963673993719668, + "grad_norm": 1.8043596744537354, + "learning_rate": 4.940422823864654e-05, + "loss": 4.3937, + "step": 11709 + }, + { + "epoch": 0.06964268722047769, + "grad_norm": 2.0362250804901123, + "learning_rate": 4.9404126868805687e-05, + "loss": 3.8076, + "step": 11710 + }, + { + "epoch": 0.06964863450375869, + "grad_norm": 2.10723876953125, + "learning_rate": 4.940402549044561e-05, + "loss": 4.2487, + "step": 11711 + }, + { + "epoch": 0.06965458178703968, + "grad_norm": 2.1901967525482178, + "learning_rate": 4.940392410356632e-05, + "loss": 4.1183, + "step": 11712 + }, + { + "epoch": 0.06966052907032068, + "grad_norm": 2.196518659591675, + "learning_rate": 4.9403822708167896e-05, + "loss": 4.2959, + "step": 11713 + }, + { + "epoch": 0.06966647635360168, + "grad_norm": 2.1917595863342285, + "learning_rate": 4.940372130425034e-05, + "loss": 4.1011, + "step": 11714 + }, + { + "epoch": 0.06967242363688267, + "grad_norm": 2.14424991607666, + "learning_rate": 4.9403619891813696e-05, + "loss": 3.9033, + "step": 11715 + }, + { + "epoch": 0.06967837092016367, + "grad_norm": 1.9970608949661255, + "learning_rate": 4.9403518470858004e-05, + "loss": 3.9243, + "step": 11716 + }, + { + "epoch": 0.06968431820344467, + "grad_norm": 2.215721607208252, + "learning_rate": 4.9403417041383294e-05, + "loss": 4.0036, + "step": 11717 + }, + { + "epoch": 0.06969026548672566, + "grad_norm": 1.9153071641921997, + "learning_rate": 4.94033156033896e-05, + "loss": 5.6849, + "step": 11718 + }, + { + "epoch": 0.06969621277000666, + "grad_norm": 2.287951707839966, + "learning_rate": 4.9403214156876966e-05, + "loss": 4.3569, + "step": 11719 + }, + { + "epoch": 0.06970216005328767, + "grad_norm": 2.1257216930389404, + "learning_rate": 4.940311270184542e-05, + "loss": 4.1051, + "step": 11720 + }, + { + "epoch": 0.06970810733656865, + "grad_norm": 2.164879560470581, + "learning_rate": 4.9403011238295e-05, + "loss": 4.0754, + "step": 11721 + }, + { + "epoch": 0.06971405461984966, + "grad_norm": 2.2430567741394043, + "learning_rate": 4.940290976622574e-05, + "loss": 4.1251, + "step": 11722 + }, + { + "epoch": 0.06972000190313064, + "grad_norm": 2.2621891498565674, + "learning_rate": 4.940280828563768e-05, + "loss": 4.2302, + "step": 11723 + }, + { + "epoch": 0.06972594918641165, + "grad_norm": 2.0096445083618164, + "learning_rate": 4.940270679653085e-05, + "loss": 4.2853, + "step": 11724 + }, + { + "epoch": 0.06973189646969265, + "grad_norm": 2.211843729019165, + "learning_rate": 4.940260529890528e-05, + "loss": 3.6609, + "step": 11725 + }, + { + "epoch": 0.06973784375297364, + "grad_norm": 1.8500425815582275, + "learning_rate": 4.940250379276102e-05, + "loss": 3.8701, + "step": 11726 + }, + { + "epoch": 0.06974379103625464, + "grad_norm": 2.09136962890625, + "learning_rate": 4.94024022780981e-05, + "loss": 4.5569, + "step": 11727 + }, + { + "epoch": 0.06974973831953564, + "grad_norm": 1.9922528266906738, + "learning_rate": 4.940230075491655e-05, + "loss": 4.4055, + "step": 11728 + }, + { + "epoch": 0.06975568560281663, + "grad_norm": 2.253831624984741, + "learning_rate": 4.940219922321641e-05, + "loss": 4.114, + "step": 11729 + }, + { + "epoch": 0.06976163288609763, + "grad_norm": 2.0647006034851074, + "learning_rate": 4.94020976829977e-05, + "loss": 4.9004, + "step": 11730 + }, + { + "epoch": 0.06976758016937863, + "grad_norm": 2.5659384727478027, + "learning_rate": 4.940199613426049e-05, + "loss": 5.0852, + "step": 11731 + }, + { + "epoch": 0.06977352745265962, + "grad_norm": 2.227599859237671, + "learning_rate": 4.9401894577004796e-05, + "loss": 5.1603, + "step": 11732 + }, + { + "epoch": 0.06977947473594062, + "grad_norm": 1.8170785903930664, + "learning_rate": 4.940179301123063e-05, + "loss": 5.8334, + "step": 11733 + }, + { + "epoch": 0.06978542201922162, + "grad_norm": 2.1795544624328613, + "learning_rate": 4.940169143693807e-05, + "loss": 5.668, + "step": 11734 + }, + { + "epoch": 0.06979136930250261, + "grad_norm": 2.1248555183410645, + "learning_rate": 4.940158985412713e-05, + "loss": 5.7604, + "step": 11735 + }, + { + "epoch": 0.06979731658578361, + "grad_norm": 1.9677635431289673, + "learning_rate": 4.9401488262797845e-05, + "loss": 5.6568, + "step": 11736 + }, + { + "epoch": 0.06980326386906462, + "grad_norm": 1.9796242713928223, + "learning_rate": 4.940138666295025e-05, + "loss": 5.4303, + "step": 11737 + }, + { + "epoch": 0.0698092111523456, + "grad_norm": 1.7489395141601562, + "learning_rate": 4.9401285054584385e-05, + "loss": 6.1782, + "step": 11738 + }, + { + "epoch": 0.0698151584356266, + "grad_norm": 1.8067989349365234, + "learning_rate": 4.940118343770028e-05, + "loss": 6.0974, + "step": 11739 + }, + { + "epoch": 0.06982110571890761, + "grad_norm": 1.7377318143844604, + "learning_rate": 4.940108181229798e-05, + "loss": 5.8477, + "step": 11740 + }, + { + "epoch": 0.0698270530021886, + "grad_norm": 2.297499656677246, + "learning_rate": 4.940098017837751e-05, + "loss": 4.8027, + "step": 11741 + }, + { + "epoch": 0.0698330002854696, + "grad_norm": 1.7340888977050781, + "learning_rate": 4.940087853593891e-05, + "loss": 5.5897, + "step": 11742 + }, + { + "epoch": 0.0698389475687506, + "grad_norm": 2.019639730453491, + "learning_rate": 4.9400776884982216e-05, + "loss": 5.4493, + "step": 11743 + }, + { + "epoch": 0.06984489485203159, + "grad_norm": 1.7959356307983398, + "learning_rate": 4.9400675225507466e-05, + "loss": 5.5995, + "step": 11744 + }, + { + "epoch": 0.06985084213531259, + "grad_norm": 2.234757661819458, + "learning_rate": 4.940057355751468e-05, + "loss": 5.9542, + "step": 11745 + }, + { + "epoch": 0.06985678941859359, + "grad_norm": 2.047755241394043, + "learning_rate": 4.9400471881003925e-05, + "loss": 5.9125, + "step": 11746 + }, + { + "epoch": 0.06986273670187458, + "grad_norm": 1.9563192129135132, + "learning_rate": 4.940037019597521e-05, + "loss": 5.7298, + "step": 11747 + }, + { + "epoch": 0.06986868398515558, + "grad_norm": 2.7170934677124023, + "learning_rate": 4.940026850242857e-05, + "loss": 5.5172, + "step": 11748 + }, + { + "epoch": 0.06987463126843659, + "grad_norm": 2.326277494430542, + "learning_rate": 4.9400166800364056e-05, + "loss": 5.685, + "step": 11749 + }, + { + "epoch": 0.06988057855171757, + "grad_norm": 1.708383321762085, + "learning_rate": 4.94000650897817e-05, + "loss": 5.3879, + "step": 11750 + }, + { + "epoch": 0.06988652583499858, + "grad_norm": 1.897631049156189, + "learning_rate": 4.9399963370681527e-05, + "loss": 5.6856, + "step": 11751 + }, + { + "epoch": 0.06989247311827956, + "grad_norm": 2.227720260620117, + "learning_rate": 4.939986164306357e-05, + "loss": 5.4487, + "step": 11752 + }, + { + "epoch": 0.06989842040156057, + "grad_norm": 2.7821953296661377, + "learning_rate": 4.939975990692789e-05, + "loss": 5.7276, + "step": 11753 + }, + { + "epoch": 0.06990436768484157, + "grad_norm": 1.8389033079147339, + "learning_rate": 4.939965816227449e-05, + "loss": 5.6933, + "step": 11754 + }, + { + "epoch": 0.06991031496812256, + "grad_norm": 1.7653162479400635, + "learning_rate": 4.939955640910343e-05, + "loss": 5.6079, + "step": 11755 + }, + { + "epoch": 0.06991626225140356, + "grad_norm": 1.7504348754882812, + "learning_rate": 4.939945464741475e-05, + "loss": 6.0413, + "step": 11756 + }, + { + "epoch": 0.06992220953468456, + "grad_norm": 2.118326187133789, + "learning_rate": 4.939935287720845e-05, + "loss": 5.8937, + "step": 11757 + }, + { + "epoch": 0.06992815681796555, + "grad_norm": 1.9626812934875488, + "learning_rate": 4.93992510984846e-05, + "loss": 5.9564, + "step": 11758 + }, + { + "epoch": 0.06993410410124655, + "grad_norm": 1.9915722608566284, + "learning_rate": 4.939914931124322e-05, + "loss": 5.6851, + "step": 11759 + }, + { + "epoch": 0.06994005138452755, + "grad_norm": 1.7959195375442505, + "learning_rate": 4.939904751548435e-05, + "loss": 4.785, + "step": 11760 + }, + { + "epoch": 0.06994599866780854, + "grad_norm": 1.8472923040390015, + "learning_rate": 4.9398945711208025e-05, + "loss": 5.2683, + "step": 11761 + }, + { + "epoch": 0.06995194595108954, + "grad_norm": 1.4207996129989624, + "learning_rate": 4.9398843898414274e-05, + "loss": 5.5402, + "step": 11762 + }, + { + "epoch": 0.06995789323437054, + "grad_norm": 2.122070550918579, + "learning_rate": 4.9398742077103146e-05, + "loss": 5.5397, + "step": 11763 + }, + { + "epoch": 0.06996384051765153, + "grad_norm": 2.285970687866211, + "learning_rate": 4.939864024727467e-05, + "loss": 5.1401, + "step": 11764 + }, + { + "epoch": 0.06996978780093253, + "grad_norm": 2.1245667934417725, + "learning_rate": 4.9398538408928874e-05, + "loss": 5.2009, + "step": 11765 + }, + { + "epoch": 0.06997573508421354, + "grad_norm": 1.8151131868362427, + "learning_rate": 4.939843656206581e-05, + "loss": 4.8635, + "step": 11766 + }, + { + "epoch": 0.06998168236749452, + "grad_norm": 1.9139370918273926, + "learning_rate": 4.9398334706685494e-05, + "loss": 5.5998, + "step": 11767 + }, + { + "epoch": 0.06998762965077553, + "grad_norm": 1.6889853477478027, + "learning_rate": 4.9398232842787976e-05, + "loss": 5.6183, + "step": 11768 + }, + { + "epoch": 0.06999357693405653, + "grad_norm": 1.773409366607666, + "learning_rate": 4.939813097037329e-05, + "loss": 5.5083, + "step": 11769 + }, + { + "epoch": 0.06999952421733752, + "grad_norm": 2.195955991744995, + "learning_rate": 4.9398029089441465e-05, + "loss": 6.4436, + "step": 11770 + }, + { + "epoch": 0.07000547150061852, + "grad_norm": 2.058687448501587, + "learning_rate": 4.939792719999254e-05, + "loss": 6.2875, + "step": 11771 + }, + { + "epoch": 0.07001141878389952, + "grad_norm": 1.9074562788009644, + "learning_rate": 4.939782530202655e-05, + "loss": 5.8764, + "step": 11772 + }, + { + "epoch": 0.07001736606718051, + "grad_norm": 2.163663864135742, + "learning_rate": 4.9397723395543535e-05, + "loss": 5.4666, + "step": 11773 + }, + { + "epoch": 0.07002331335046151, + "grad_norm": 2.2188286781311035, + "learning_rate": 4.939762148054352e-05, + "loss": 6.0679, + "step": 11774 + }, + { + "epoch": 0.07002926063374251, + "grad_norm": 1.8202224969863892, + "learning_rate": 4.9397519557026553e-05, + "loss": 6.0465, + "step": 11775 + }, + { + "epoch": 0.0700352079170235, + "grad_norm": 1.9515994787216187, + "learning_rate": 4.939741762499266e-05, + "loss": 5.9634, + "step": 11776 + }, + { + "epoch": 0.0700411552003045, + "grad_norm": 1.772741675376892, + "learning_rate": 4.9397315684441886e-05, + "loss": 5.3117, + "step": 11777 + }, + { + "epoch": 0.0700471024835855, + "grad_norm": 1.7377926111221313, + "learning_rate": 4.9397213735374256e-05, + "loss": 5.7082, + "step": 11778 + }, + { + "epoch": 0.0700530497668665, + "grad_norm": 1.881205439567566, + "learning_rate": 4.939711177778982e-05, + "loss": 5.8463, + "step": 11779 + }, + { + "epoch": 0.0700589970501475, + "grad_norm": 1.893402099609375, + "learning_rate": 4.939700981168859e-05, + "loss": 5.8321, + "step": 11780 + }, + { + "epoch": 0.07006494433342848, + "grad_norm": 1.6830201148986816, + "learning_rate": 4.939690783707063e-05, + "loss": 5.8655, + "step": 11781 + }, + { + "epoch": 0.07007089161670949, + "grad_norm": 1.9164643287658691, + "learning_rate": 4.939680585393595e-05, + "loss": 5.7089, + "step": 11782 + }, + { + "epoch": 0.07007683889999049, + "grad_norm": 1.5564945936203003, + "learning_rate": 4.93967038622846e-05, + "loss": 5.8671, + "step": 11783 + }, + { + "epoch": 0.07008278618327148, + "grad_norm": 1.6557695865631104, + "learning_rate": 4.939660186211662e-05, + "loss": 5.7461, + "step": 11784 + }, + { + "epoch": 0.07008873346655248, + "grad_norm": 1.7161173820495605, + "learning_rate": 4.9396499853432035e-05, + "loss": 5.0569, + "step": 11785 + }, + { + "epoch": 0.07009468074983348, + "grad_norm": 1.6760550737380981, + "learning_rate": 4.939639783623088e-05, + "loss": 5.4683, + "step": 11786 + }, + { + "epoch": 0.07010062803311447, + "grad_norm": 1.818652629852295, + "learning_rate": 4.9396295810513196e-05, + "loss": 4.9676, + "step": 11787 + }, + { + "epoch": 0.07010657531639547, + "grad_norm": 2.016510009765625, + "learning_rate": 4.939619377627901e-05, + "loss": 5.255, + "step": 11788 + }, + { + "epoch": 0.07011252259967647, + "grad_norm": 2.1893560886383057, + "learning_rate": 4.939609173352838e-05, + "loss": 5.0798, + "step": 11789 + }, + { + "epoch": 0.07011846988295746, + "grad_norm": 1.8063241243362427, + "learning_rate": 4.939598968226132e-05, + "loss": 5.049, + "step": 11790 + }, + { + "epoch": 0.07012441716623846, + "grad_norm": 1.7766486406326294, + "learning_rate": 4.939588762247786e-05, + "loss": 4.8375, + "step": 11791 + }, + { + "epoch": 0.07013036444951946, + "grad_norm": 1.6848721504211426, + "learning_rate": 4.9395785554178066e-05, + "loss": 4.7944, + "step": 11792 + }, + { + "epoch": 0.07013631173280045, + "grad_norm": 1.5173190832138062, + "learning_rate": 4.939568347736195e-05, + "loss": 4.8558, + "step": 11793 + }, + { + "epoch": 0.07014225901608145, + "grad_norm": 1.9625753164291382, + "learning_rate": 4.939558139202955e-05, + "loss": 5.0129, + "step": 11794 + }, + { + "epoch": 0.07014820629936246, + "grad_norm": 2.1610453128814697, + "learning_rate": 4.93954792981809e-05, + "loss": 5.7208, + "step": 11795 + }, + { + "epoch": 0.07015415358264344, + "grad_norm": 2.272775411605835, + "learning_rate": 4.939537719581605e-05, + "loss": 5.3673, + "step": 11796 + }, + { + "epoch": 0.07016010086592445, + "grad_norm": 1.8652429580688477, + "learning_rate": 4.9395275084935025e-05, + "loss": 5.7692, + "step": 11797 + }, + { + "epoch": 0.07016604814920545, + "grad_norm": 1.6594206094741821, + "learning_rate": 4.939517296553786e-05, + "loss": 5.7201, + "step": 11798 + }, + { + "epoch": 0.07017199543248644, + "grad_norm": 1.7499476671218872, + "learning_rate": 4.939507083762459e-05, + "loss": 5.6471, + "step": 11799 + }, + { + "epoch": 0.07017794271576744, + "grad_norm": 2.050825834274292, + "learning_rate": 4.939496870119525e-05, + "loss": 5.4805, + "step": 11800 + }, + { + "epoch": 0.07018388999904844, + "grad_norm": 2.033815383911133, + "learning_rate": 4.939486655624988e-05, + "loss": 5.7465, + "step": 11801 + }, + { + "epoch": 0.07018983728232943, + "grad_norm": 1.7499231100082397, + "learning_rate": 4.939476440278852e-05, + "loss": 5.0271, + "step": 11802 + }, + { + "epoch": 0.07019578456561043, + "grad_norm": 2.331024646759033, + "learning_rate": 4.939466224081119e-05, + "loss": 5.0491, + "step": 11803 + }, + { + "epoch": 0.07020173184889143, + "grad_norm": 2.089859962463379, + "learning_rate": 4.939456007031794e-05, + "loss": 5.6678, + "step": 11804 + }, + { + "epoch": 0.07020767913217242, + "grad_norm": 2.0704381465911865, + "learning_rate": 4.93944578913088e-05, + "loss": 5.5128, + "step": 11805 + }, + { + "epoch": 0.07021362641545342, + "grad_norm": 2.3215534687042236, + "learning_rate": 4.939435570378381e-05, + "loss": 4.8886, + "step": 11806 + }, + { + "epoch": 0.07021957369873442, + "grad_norm": 2.2506353855133057, + "learning_rate": 4.9394253507743004e-05, + "loss": 4.8606, + "step": 11807 + }, + { + "epoch": 0.07022552098201541, + "grad_norm": 1.9065401554107666, + "learning_rate": 4.939415130318641e-05, + "loss": 5.4306, + "step": 11808 + }, + { + "epoch": 0.07023146826529642, + "grad_norm": 1.9229549169540405, + "learning_rate": 4.9394049090114076e-05, + "loss": 5.5586, + "step": 11809 + }, + { + "epoch": 0.0702374155485774, + "grad_norm": 1.857392430305481, + "learning_rate": 4.939394686852603e-05, + "loss": 5.382, + "step": 11810 + }, + { + "epoch": 0.0702433628318584, + "grad_norm": 2.0430874824523926, + "learning_rate": 4.939384463842231e-05, + "loss": 5.4362, + "step": 11811 + }, + { + "epoch": 0.07024931011513941, + "grad_norm": 1.839227318763733, + "learning_rate": 4.939374239980294e-05, + "loss": 5.0285, + "step": 11812 + }, + { + "epoch": 0.0702552573984204, + "grad_norm": 1.9690957069396973, + "learning_rate": 4.939364015266798e-05, + "loss": 5.5512, + "step": 11813 + }, + { + "epoch": 0.0702612046817014, + "grad_norm": 1.819841980934143, + "learning_rate": 4.939353789701745e-05, + "loss": 5.4886, + "step": 11814 + }, + { + "epoch": 0.0702671519649824, + "grad_norm": 1.7670280933380127, + "learning_rate": 4.939343563285138e-05, + "loss": 5.0925, + "step": 11815 + }, + { + "epoch": 0.07027309924826339, + "grad_norm": 1.478452444076538, + "learning_rate": 4.9393333360169824e-05, + "loss": 5.6562, + "step": 11816 + }, + { + "epoch": 0.07027904653154439, + "grad_norm": 1.7796739339828491, + "learning_rate": 4.93932310789728e-05, + "loss": 5.7462, + "step": 11817 + }, + { + "epoch": 0.07028499381482539, + "grad_norm": 1.425431728363037, + "learning_rate": 4.939312878926036e-05, + "loss": 5.6002, + "step": 11818 + }, + { + "epoch": 0.07029094109810638, + "grad_norm": 1.7066885232925415, + "learning_rate": 4.939302649103252e-05, + "loss": 5.3827, + "step": 11819 + }, + { + "epoch": 0.07029688838138738, + "grad_norm": 1.5144743919372559, + "learning_rate": 4.939292418428933e-05, + "loss": 5.094, + "step": 11820 + }, + { + "epoch": 0.07030283566466838, + "grad_norm": 1.5426355600357056, + "learning_rate": 4.939282186903082e-05, + "loss": 5.4808, + "step": 11821 + }, + { + "epoch": 0.07030878294794937, + "grad_norm": 1.5655393600463867, + "learning_rate": 4.9392719545257034e-05, + "loss": 5.5422, + "step": 11822 + }, + { + "epoch": 0.07031473023123037, + "grad_norm": 1.2810043096542358, + "learning_rate": 4.9392617212967995e-05, + "loss": 5.5069, + "step": 11823 + }, + { + "epoch": 0.07032067751451138, + "grad_norm": 1.534588098526001, + "learning_rate": 4.9392514872163754e-05, + "loss": 5.4887, + "step": 11824 + }, + { + "epoch": 0.07032662479779236, + "grad_norm": 1.6692357063293457, + "learning_rate": 4.9392412522844325e-05, + "loss": 5.4235, + "step": 11825 + }, + { + "epoch": 0.07033257208107337, + "grad_norm": 2.1246654987335205, + "learning_rate": 4.939231016500977e-05, + "loss": 5.4533, + "step": 11826 + }, + { + "epoch": 0.07033851936435437, + "grad_norm": 2.0235774517059326, + "learning_rate": 4.9392207798660106e-05, + "loss": 5.0393, + "step": 11827 + }, + { + "epoch": 0.07034446664763536, + "grad_norm": 1.7843154668807983, + "learning_rate": 4.939210542379537e-05, + "loss": 5.2501, + "step": 11828 + }, + { + "epoch": 0.07035041393091636, + "grad_norm": 2.1056478023529053, + "learning_rate": 4.939200304041561e-05, + "loss": 5.7809, + "step": 11829 + }, + { + "epoch": 0.07035636121419736, + "grad_norm": 2.0902159214019775, + "learning_rate": 4.939190064852085e-05, + "loss": 5.591, + "step": 11830 + }, + { + "epoch": 0.07036230849747835, + "grad_norm": 2.3349802494049072, + "learning_rate": 4.9391798248111134e-05, + "loss": 4.7641, + "step": 11831 + }, + { + "epoch": 0.07036825578075935, + "grad_norm": 1.6848636865615845, + "learning_rate": 4.939169583918648e-05, + "loss": 5.5082, + "step": 11832 + }, + { + "epoch": 0.07037420306404035, + "grad_norm": 1.958947777748108, + "learning_rate": 4.939159342174695e-05, + "loss": 5.433, + "step": 11833 + }, + { + "epoch": 0.07038015034732134, + "grad_norm": 1.7382566928863525, + "learning_rate": 4.939149099579256e-05, + "loss": 5.5014, + "step": 11834 + }, + { + "epoch": 0.07038609763060234, + "grad_norm": 2.469529867172241, + "learning_rate": 4.939138856132336e-05, + "loss": 4.6383, + "step": 11835 + }, + { + "epoch": 0.07039204491388334, + "grad_norm": 2.127711057662964, + "learning_rate": 4.939128611833937e-05, + "loss": 5.6088, + "step": 11836 + }, + { + "epoch": 0.07039799219716433, + "grad_norm": 2.252210855484009, + "learning_rate": 4.9391183666840636e-05, + "loss": 5.027, + "step": 11837 + }, + { + "epoch": 0.07040393948044534, + "grad_norm": 1.990277647972107, + "learning_rate": 4.9391081206827194e-05, + "loss": 5.6389, + "step": 11838 + }, + { + "epoch": 0.07040988676372632, + "grad_norm": 2.170099973678589, + "learning_rate": 4.939097873829908e-05, + "loss": 5.5588, + "step": 11839 + }, + { + "epoch": 0.07041583404700733, + "grad_norm": 2.4616951942443848, + "learning_rate": 4.939087626125632e-05, + "loss": 5.6505, + "step": 11840 + }, + { + "epoch": 0.07042178133028833, + "grad_norm": 1.9600075483322144, + "learning_rate": 4.9390773775698964e-05, + "loss": 5.1086, + "step": 11841 + }, + { + "epoch": 0.07042772861356932, + "grad_norm": 2.173632860183716, + "learning_rate": 4.939067128162703e-05, + "loss": 5.8069, + "step": 11842 + }, + { + "epoch": 0.07043367589685032, + "grad_norm": 1.9921432733535767, + "learning_rate": 4.939056877904058e-05, + "loss": 5.3222, + "step": 11843 + }, + { + "epoch": 0.07043962318013132, + "grad_norm": 2.1605379581451416, + "learning_rate": 4.939046626793962e-05, + "loss": 5.1565, + "step": 11844 + }, + { + "epoch": 0.07044557046341231, + "grad_norm": 2.2240231037139893, + "learning_rate": 4.9390363748324206e-05, + "loss": 5.3633, + "step": 11845 + }, + { + "epoch": 0.07045151774669331, + "grad_norm": 2.1935648918151855, + "learning_rate": 4.9390261220194374e-05, + "loss": 5.3715, + "step": 11846 + }, + { + "epoch": 0.07045746502997431, + "grad_norm": 2.3079628944396973, + "learning_rate": 4.9390158683550146e-05, + "loss": 5.4728, + "step": 11847 + }, + { + "epoch": 0.0704634123132553, + "grad_norm": 2.1652259826660156, + "learning_rate": 4.939005613839157e-05, + "loss": 5.276, + "step": 11848 + }, + { + "epoch": 0.0704693595965363, + "grad_norm": 1.75044846534729, + "learning_rate": 4.938995358471867e-05, + "loss": 5.3, + "step": 11849 + }, + { + "epoch": 0.0704753068798173, + "grad_norm": 2.11893892288208, + "learning_rate": 4.93898510225315e-05, + "loss": 5.3949, + "step": 11850 + }, + { + "epoch": 0.07048125416309829, + "grad_norm": 1.8546398878097534, + "learning_rate": 4.938974845183008e-05, + "loss": 5.3606, + "step": 11851 + }, + { + "epoch": 0.0704872014463793, + "grad_norm": 2.2334201335906982, + "learning_rate": 4.9389645872614456e-05, + "loss": 5.1987, + "step": 11852 + }, + { + "epoch": 0.0704931487296603, + "grad_norm": 2.0545856952667236, + "learning_rate": 4.938954328488465e-05, + "loss": 5.2742, + "step": 11853 + }, + { + "epoch": 0.07049909601294128, + "grad_norm": 2.011322498321533, + "learning_rate": 4.938944068864071e-05, + "loss": 5.3738, + "step": 11854 + }, + { + "epoch": 0.07050504329622229, + "grad_norm": 1.6539164781570435, + "learning_rate": 4.9389338083882664e-05, + "loss": 5.1915, + "step": 11855 + }, + { + "epoch": 0.07051099057950329, + "grad_norm": 1.9423818588256836, + "learning_rate": 4.9389235470610564e-05, + "loss": 5.4432, + "step": 11856 + }, + { + "epoch": 0.07051693786278428, + "grad_norm": 1.9459011554718018, + "learning_rate": 4.938913284882442e-05, + "loss": 5.2929, + "step": 11857 + }, + { + "epoch": 0.07052288514606528, + "grad_norm": 2.0341713428497314, + "learning_rate": 4.938903021852429e-05, + "loss": 5.1413, + "step": 11858 + }, + { + "epoch": 0.07052883242934628, + "grad_norm": 2.1413371562957764, + "learning_rate": 4.93889275797102e-05, + "loss": 5.0283, + "step": 11859 + }, + { + "epoch": 0.07053477971262727, + "grad_norm": 1.9965273141860962, + "learning_rate": 4.9388824932382185e-05, + "loss": 5.0919, + "step": 11860 + }, + { + "epoch": 0.07054072699590827, + "grad_norm": 1.9912536144256592, + "learning_rate": 4.938872227654028e-05, + "loss": 4.72, + "step": 11861 + }, + { + "epoch": 0.07054667427918927, + "grad_norm": 2.267775058746338, + "learning_rate": 4.9388619612184533e-05, + "loss": 5.3942, + "step": 11862 + }, + { + "epoch": 0.07055262156247026, + "grad_norm": 2.0529544353485107, + "learning_rate": 4.9388516939314965e-05, + "loss": 5.504, + "step": 11863 + }, + { + "epoch": 0.07055856884575126, + "grad_norm": 2.124903678894043, + "learning_rate": 4.938841425793162e-05, + "loss": 5.3684, + "step": 11864 + }, + { + "epoch": 0.07056451612903226, + "grad_norm": 2.2070152759552, + "learning_rate": 4.938831156803453e-05, + "loss": 5.1349, + "step": 11865 + }, + { + "epoch": 0.07057046341231325, + "grad_norm": 1.717877745628357, + "learning_rate": 4.9388208869623734e-05, + "loss": 5.2605, + "step": 11866 + }, + { + "epoch": 0.07057641069559425, + "grad_norm": 2.258847951889038, + "learning_rate": 4.9388106162699266e-05, + "loss": 4.9048, + "step": 11867 + }, + { + "epoch": 0.07058235797887524, + "grad_norm": 2.065905809402466, + "learning_rate": 4.938800344726117e-05, + "loss": 5.0523, + "step": 11868 + }, + { + "epoch": 0.07058830526215625, + "grad_norm": 2.13053035736084, + "learning_rate": 4.9387900723309455e-05, + "loss": 5.1551, + "step": 11869 + }, + { + "epoch": 0.07059425254543725, + "grad_norm": 2.0323257446289062, + "learning_rate": 4.938779799084419e-05, + "loss": 5.0807, + "step": 11870 + }, + { + "epoch": 0.07060019982871824, + "grad_norm": 2.0503158569335938, + "learning_rate": 4.9387695249865396e-05, + "loss": 5.1946, + "step": 11871 + }, + { + "epoch": 0.07060614711199924, + "grad_norm": 2.069227933883667, + "learning_rate": 4.9387592500373105e-05, + "loss": 5.0027, + "step": 11872 + }, + { + "epoch": 0.07061209439528024, + "grad_norm": 2.0208382606506348, + "learning_rate": 4.9387489742367354e-05, + "loss": 5.0877, + "step": 11873 + }, + { + "epoch": 0.07061804167856123, + "grad_norm": 2.0159859657287598, + "learning_rate": 4.9387386975848196e-05, + "loss": 4.864, + "step": 11874 + }, + { + "epoch": 0.07062398896184223, + "grad_norm": 1.9365311861038208, + "learning_rate": 4.9387284200815645e-05, + "loss": 4.7373, + "step": 11875 + }, + { + "epoch": 0.07062993624512323, + "grad_norm": 2.1024274826049805, + "learning_rate": 4.9387181417269736e-05, + "loss": 5.0155, + "step": 11876 + }, + { + "epoch": 0.07063588352840422, + "grad_norm": 2.5438032150268555, + "learning_rate": 4.938707862521052e-05, + "loss": 5.3267, + "step": 11877 + }, + { + "epoch": 0.07064183081168522, + "grad_norm": 2.129715919494629, + "learning_rate": 4.938697582463804e-05, + "loss": 5.104, + "step": 11878 + }, + { + "epoch": 0.07064777809496622, + "grad_norm": 2.237442970275879, + "learning_rate": 4.9386873015552303e-05, + "loss": 5.134, + "step": 11879 + }, + { + "epoch": 0.07065372537824721, + "grad_norm": 2.2773404121398926, + "learning_rate": 4.9386770197953366e-05, + "loss": 5.269, + "step": 11880 + }, + { + "epoch": 0.07065967266152821, + "grad_norm": 2.0882620811462402, + "learning_rate": 4.938666737184125e-05, + "loss": 4.8091, + "step": 11881 + }, + { + "epoch": 0.07066561994480922, + "grad_norm": 2.0649476051330566, + "learning_rate": 4.938656453721602e-05, + "loss": 4.9143, + "step": 11882 + }, + { + "epoch": 0.0706715672280902, + "grad_norm": 2.19030499458313, + "learning_rate": 4.938646169407768e-05, + "loss": 4.7439, + "step": 11883 + }, + { + "epoch": 0.0706775145113712, + "grad_norm": 2.8669347763061523, + "learning_rate": 4.938635884242628e-05, + "loss": 4.3684, + "step": 11884 + }, + { + "epoch": 0.07068346179465221, + "grad_norm": 2.3018336296081543, + "learning_rate": 4.9386255982261854e-05, + "loss": 4.8602, + "step": 11885 + }, + { + "epoch": 0.0706894090779332, + "grad_norm": 2.7775471210479736, + "learning_rate": 4.938615311358443e-05, + "loss": 5.2401, + "step": 11886 + }, + { + "epoch": 0.0706953563612142, + "grad_norm": 2.1075756549835205, + "learning_rate": 4.938605023639406e-05, + "loss": 5.1085, + "step": 11887 + }, + { + "epoch": 0.0707013036444952, + "grad_norm": 2.456530809402466, + "learning_rate": 4.9385947350690776e-05, + "loss": 5.0506, + "step": 11888 + }, + { + "epoch": 0.07070725092777619, + "grad_norm": 1.76799738407135, + "learning_rate": 4.9385844456474605e-05, + "loss": 4.8233, + "step": 11889 + }, + { + "epoch": 0.07071319821105719, + "grad_norm": 2.0819127559661865, + "learning_rate": 4.938574155374559e-05, + "loss": 4.4198, + "step": 11890 + }, + { + "epoch": 0.07071914549433819, + "grad_norm": 2.221586227416992, + "learning_rate": 4.9385638642503765e-05, + "loss": 4.2423, + "step": 11891 + }, + { + "epoch": 0.07072509277761918, + "grad_norm": 2.108182668685913, + "learning_rate": 4.938553572274916e-05, + "loss": 4.2564, + "step": 11892 + }, + { + "epoch": 0.07073104006090018, + "grad_norm": 1.9631624221801758, + "learning_rate": 4.938543279448182e-05, + "loss": 4.1641, + "step": 11893 + }, + { + "epoch": 0.07073698734418118, + "grad_norm": 1.9730273485183716, + "learning_rate": 4.938532985770178e-05, + "loss": 4.0728, + "step": 11894 + }, + { + "epoch": 0.07074293462746217, + "grad_norm": 1.9632551670074463, + "learning_rate": 4.9385226912409065e-05, + "loss": 4.2014, + "step": 11895 + }, + { + "epoch": 0.07074888191074317, + "grad_norm": 1.9986671209335327, + "learning_rate": 4.9385123958603726e-05, + "loss": 4.0299, + "step": 11896 + }, + { + "epoch": 0.07075482919402416, + "grad_norm": 2.2256031036376953, + "learning_rate": 4.9385020996285794e-05, + "loss": 4.1397, + "step": 11897 + }, + { + "epoch": 0.07076077647730517, + "grad_norm": 2.231462001800537, + "learning_rate": 4.9384918025455296e-05, + "loss": 4.0977, + "step": 11898 + }, + { + "epoch": 0.07076672376058617, + "grad_norm": 2.0946438312530518, + "learning_rate": 4.938481504611227e-05, + "loss": 3.9446, + "step": 11899 + }, + { + "epoch": 0.07077267104386716, + "grad_norm": 1.6953986883163452, + "learning_rate": 4.938471205825677e-05, + "loss": 4.6809, + "step": 11900 + }, + { + "epoch": 0.07077861832714816, + "grad_norm": 2.1963350772857666, + "learning_rate": 4.938460906188882e-05, + "loss": 4.3626, + "step": 11901 + }, + { + "epoch": 0.07078456561042916, + "grad_norm": 2.2069251537323, + "learning_rate": 4.938450605700845e-05, + "loss": 4.1057, + "step": 11902 + }, + { + "epoch": 0.07079051289371015, + "grad_norm": 2.1809592247009277, + "learning_rate": 4.9384403043615694e-05, + "loss": 3.5619, + "step": 11903 + }, + { + "epoch": 0.07079646017699115, + "grad_norm": 2.305171012878418, + "learning_rate": 4.938430002171061e-05, + "loss": 5.8033, + "step": 11904 + }, + { + "epoch": 0.07080240746027215, + "grad_norm": 2.1984407901763916, + "learning_rate": 4.9384196991293205e-05, + "loss": 3.5869, + "step": 11905 + }, + { + "epoch": 0.07080835474355314, + "grad_norm": 1.8870881795883179, + "learning_rate": 4.938409395236353e-05, + "loss": 4.8027, + "step": 11906 + }, + { + "epoch": 0.07081430202683414, + "grad_norm": 2.11314058303833, + "learning_rate": 4.938399090492163e-05, + "loss": 4.1942, + "step": 11907 + }, + { + "epoch": 0.07082024931011514, + "grad_norm": 2.143794298171997, + "learning_rate": 4.938388784896752e-05, + "loss": 3.8526, + "step": 11908 + }, + { + "epoch": 0.07082619659339613, + "grad_norm": 2.4311232566833496, + "learning_rate": 4.938378478450125e-05, + "loss": 3.8572, + "step": 11909 + }, + { + "epoch": 0.07083214387667713, + "grad_norm": 2.0959818363189697, + "learning_rate": 4.9383681711522855e-05, + "loss": 4.3465, + "step": 11910 + }, + { + "epoch": 0.07083809115995814, + "grad_norm": 1.9161559343338013, + "learning_rate": 4.938357863003237e-05, + "loss": 5.5608, + "step": 11911 + }, + { + "epoch": 0.07084403844323912, + "grad_norm": 1.8549482822418213, + "learning_rate": 4.9383475540029824e-05, + "loss": 5.9874, + "step": 11912 + }, + { + "epoch": 0.07084998572652013, + "grad_norm": 1.8600444793701172, + "learning_rate": 4.9383372441515255e-05, + "loss": 6.0579, + "step": 11913 + }, + { + "epoch": 0.07085593300980113, + "grad_norm": 1.6985594034194946, + "learning_rate": 4.938326933448871e-05, + "loss": 5.7963, + "step": 11914 + }, + { + "epoch": 0.07086188029308212, + "grad_norm": 2.06860613822937, + "learning_rate": 4.9383166218950216e-05, + "loss": 5.4789, + "step": 11915 + }, + { + "epoch": 0.07086782757636312, + "grad_norm": 2.8111190795898438, + "learning_rate": 4.938306309489982e-05, + "loss": 5.2546, + "step": 11916 + }, + { + "epoch": 0.07087377485964412, + "grad_norm": 2.700589895248413, + "learning_rate": 4.9382959962337536e-05, + "loss": 5.2021, + "step": 11917 + }, + { + "epoch": 0.07087972214292511, + "grad_norm": 2.364793539047241, + "learning_rate": 4.938285682126341e-05, + "loss": 4.9508, + "step": 11918 + }, + { + "epoch": 0.07088566942620611, + "grad_norm": 2.4212446212768555, + "learning_rate": 4.938275367167749e-05, + "loss": 5.1269, + "step": 11919 + }, + { + "epoch": 0.07089161670948711, + "grad_norm": 1.785733699798584, + "learning_rate": 4.93826505135798e-05, + "loss": 5.7357, + "step": 11920 + }, + { + "epoch": 0.0708975639927681, + "grad_norm": 1.6912823915481567, + "learning_rate": 4.9382547346970376e-05, + "loss": 5.4003, + "step": 11921 + }, + { + "epoch": 0.0709035112760491, + "grad_norm": 1.8408714532852173, + "learning_rate": 4.938244417184926e-05, + "loss": 5.3169, + "step": 11922 + }, + { + "epoch": 0.0709094585593301, + "grad_norm": 2.3245468139648438, + "learning_rate": 4.938234098821648e-05, + "loss": 4.9588, + "step": 11923 + }, + { + "epoch": 0.07091540584261109, + "grad_norm": 1.922179102897644, + "learning_rate": 4.938223779607208e-05, + "loss": 5.431, + "step": 11924 + }, + { + "epoch": 0.0709213531258921, + "grad_norm": 1.8331208229064941, + "learning_rate": 4.9382134595416094e-05, + "loss": 5.9121, + "step": 11925 + }, + { + "epoch": 0.07092730040917308, + "grad_norm": 2.15932297706604, + "learning_rate": 4.9382031386248556e-05, + "loss": 5.058, + "step": 11926 + }, + { + "epoch": 0.07093324769245409, + "grad_norm": 2.2255606651306152, + "learning_rate": 4.93819281685695e-05, + "loss": 4.9215, + "step": 11927 + }, + { + "epoch": 0.07093919497573509, + "grad_norm": 2.3665359020233154, + "learning_rate": 4.938182494237897e-05, + "loss": 4.8405, + "step": 11928 + }, + { + "epoch": 0.07094514225901608, + "grad_norm": 2.1564438343048096, + "learning_rate": 4.938172170767699e-05, + "loss": 4.9598, + "step": 11929 + }, + { + "epoch": 0.07095108954229708, + "grad_norm": 2.2083945274353027, + "learning_rate": 4.938161846446361e-05, + "loss": 4.8603, + "step": 11930 + }, + { + "epoch": 0.07095703682557808, + "grad_norm": 2.3422255516052246, + "learning_rate": 4.938151521273885e-05, + "loss": 4.8926, + "step": 11931 + }, + { + "epoch": 0.07096298410885907, + "grad_norm": 2.5269415378570557, + "learning_rate": 4.9381411952502764e-05, + "loss": 4.876, + "step": 11932 + }, + { + "epoch": 0.07096893139214007, + "grad_norm": 2.1761882305145264, + "learning_rate": 4.9381308683755376e-05, + "loss": 4.7533, + "step": 11933 + }, + { + "epoch": 0.07097487867542107, + "grad_norm": 2.078146457672119, + "learning_rate": 4.938120540649672e-05, + "loss": 4.9606, + "step": 11934 + }, + { + "epoch": 0.07098082595870206, + "grad_norm": 2.3086254596710205, + "learning_rate": 4.9381102120726846e-05, + "loss": 4.7763, + "step": 11935 + }, + { + "epoch": 0.07098677324198306, + "grad_norm": 1.8531124591827393, + "learning_rate": 4.938099882644578e-05, + "loss": 5.0218, + "step": 11936 + }, + { + "epoch": 0.07099272052526406, + "grad_norm": 2.2169790267944336, + "learning_rate": 4.938089552365355e-05, + "loss": 6.0072, + "step": 11937 + }, + { + "epoch": 0.07099866780854505, + "grad_norm": 1.8759880065917969, + "learning_rate": 4.938079221235021e-05, + "loss": 5.8259, + "step": 11938 + }, + { + "epoch": 0.07100461509182605, + "grad_norm": 2.026217222213745, + "learning_rate": 4.938068889253579e-05, + "loss": 5.4426, + "step": 11939 + }, + { + "epoch": 0.07101056237510706, + "grad_norm": 2.5047786235809326, + "learning_rate": 4.938058556421031e-05, + "loss": 4.7276, + "step": 11940 + }, + { + "epoch": 0.07101650965838804, + "grad_norm": 2.243281602859497, + "learning_rate": 4.938048222737383e-05, + "loss": 4.9284, + "step": 11941 + }, + { + "epoch": 0.07102245694166905, + "grad_norm": 1.989563226699829, + "learning_rate": 4.938037888202637e-05, + "loss": 5.7744, + "step": 11942 + }, + { + "epoch": 0.07102840422495005, + "grad_norm": 1.829290509223938, + "learning_rate": 4.9380275528167974e-05, + "loss": 5.6942, + "step": 11943 + }, + { + "epoch": 0.07103435150823104, + "grad_norm": 1.8001593351364136, + "learning_rate": 4.938017216579868e-05, + "loss": 5.6928, + "step": 11944 + }, + { + "epoch": 0.07104029879151204, + "grad_norm": 1.7705434560775757, + "learning_rate": 4.938006879491851e-05, + "loss": 5.6954, + "step": 11945 + }, + { + "epoch": 0.07104624607479304, + "grad_norm": 1.8746812343597412, + "learning_rate": 4.937996541552752e-05, + "loss": 5.7184, + "step": 11946 + }, + { + "epoch": 0.07105219335807403, + "grad_norm": 1.6931661367416382, + "learning_rate": 4.937986202762573e-05, + "loss": 5.398, + "step": 11947 + }, + { + "epoch": 0.07105814064135503, + "grad_norm": 2.0784003734588623, + "learning_rate": 4.937975863121318e-05, + "loss": 5.7164, + "step": 11948 + }, + { + "epoch": 0.07106408792463603, + "grad_norm": 1.8495618104934692, + "learning_rate": 4.937965522628991e-05, + "loss": 5.7093, + "step": 11949 + }, + { + "epoch": 0.07107003520791702, + "grad_norm": 1.7720533609390259, + "learning_rate": 4.9379551812855964e-05, + "loss": 5.7548, + "step": 11950 + }, + { + "epoch": 0.07107598249119802, + "grad_norm": 1.721205472946167, + "learning_rate": 4.937944839091135e-05, + "loss": 5.7496, + "step": 11951 + }, + { + "epoch": 0.07108192977447902, + "grad_norm": 1.896657109260559, + "learning_rate": 4.9379344960456145e-05, + "loss": 5.5989, + "step": 11952 + }, + { + "epoch": 0.07108787705776001, + "grad_norm": 1.4022153615951538, + "learning_rate": 4.9379241521490344e-05, + "loss": 5.5029, + "step": 11953 + }, + { + "epoch": 0.07109382434104101, + "grad_norm": 1.9068467617034912, + "learning_rate": 4.937913807401401e-05, + "loss": 5.6915, + "step": 11954 + }, + { + "epoch": 0.071099771624322, + "grad_norm": 1.6542187929153442, + "learning_rate": 4.9379034618027164e-05, + "loss": 5.6409, + "step": 11955 + }, + { + "epoch": 0.071105718907603, + "grad_norm": 1.5280201435089111, + "learning_rate": 4.937893115352986e-05, + "loss": 5.6264, + "step": 11956 + }, + { + "epoch": 0.07111166619088401, + "grad_norm": 1.767232060432434, + "learning_rate": 4.937882768052211e-05, + "loss": 5.4562, + "step": 11957 + }, + { + "epoch": 0.071117613474165, + "grad_norm": 1.571892261505127, + "learning_rate": 4.9378724199003975e-05, + "loss": 5.7949, + "step": 11958 + }, + { + "epoch": 0.071123560757446, + "grad_norm": 1.9400190114974976, + "learning_rate": 4.937862070897548e-05, + "loss": 5.5872, + "step": 11959 + }, + { + "epoch": 0.071129508040727, + "grad_norm": 1.7246766090393066, + "learning_rate": 4.937851721043665e-05, + "loss": 5.8455, + "step": 11960 + }, + { + "epoch": 0.07113545532400799, + "grad_norm": 1.937168002128601, + "learning_rate": 4.9378413703387534e-05, + "loss": 5.0864, + "step": 11961 + }, + { + "epoch": 0.07114140260728899, + "grad_norm": 2.3808209896087646, + "learning_rate": 4.937831018782817e-05, + "loss": 4.5918, + "step": 11962 + }, + { + "epoch": 0.07114734989056999, + "grad_norm": 2.567026138305664, + "learning_rate": 4.937820666375859e-05, + "loss": 4.7375, + "step": 11963 + }, + { + "epoch": 0.07115329717385098, + "grad_norm": 1.8941316604614258, + "learning_rate": 4.937810313117882e-05, + "loss": 5.811, + "step": 11964 + }, + { + "epoch": 0.07115924445713198, + "grad_norm": 1.9301189184188843, + "learning_rate": 4.9377999590088916e-05, + "loss": 5.7947, + "step": 11965 + }, + { + "epoch": 0.07116519174041298, + "grad_norm": 2.281784772872925, + "learning_rate": 4.93778960404889e-05, + "loss": 5.5993, + "step": 11966 + }, + { + "epoch": 0.07117113902369397, + "grad_norm": 1.7826297283172607, + "learning_rate": 4.937779248237882e-05, + "loss": 6.1836, + "step": 11967 + }, + { + "epoch": 0.07117708630697497, + "grad_norm": 2.8714182376861572, + "learning_rate": 4.9377688915758694e-05, + "loss": 5.3955, + "step": 11968 + }, + { + "epoch": 0.07118303359025598, + "grad_norm": 2.3284013271331787, + "learning_rate": 4.937758534062857e-05, + "loss": 5.3027, + "step": 11969 + }, + { + "epoch": 0.07118898087353696, + "grad_norm": 1.8880923986434937, + "learning_rate": 4.937748175698849e-05, + "loss": 5.8408, + "step": 11970 + }, + { + "epoch": 0.07119492815681797, + "grad_norm": 2.8952460289001465, + "learning_rate": 4.937737816483847e-05, + "loss": 4.7325, + "step": 11971 + }, + { + "epoch": 0.07120087544009897, + "grad_norm": 2.5028738975524902, + "learning_rate": 4.9377274564178574e-05, + "loss": 4.5854, + "step": 11972 + }, + { + "epoch": 0.07120682272337996, + "grad_norm": 1.8834285736083984, + "learning_rate": 4.9377170955008815e-05, + "loss": 5.5415, + "step": 11973 + }, + { + "epoch": 0.07121277000666096, + "grad_norm": 2.162062644958496, + "learning_rate": 4.937706733732924e-05, + "loss": 5.2187, + "step": 11974 + }, + { + "epoch": 0.07121871728994196, + "grad_norm": 2.1506881713867188, + "learning_rate": 4.937696371113988e-05, + "loss": 5.1746, + "step": 11975 + }, + { + "epoch": 0.07122466457322295, + "grad_norm": 2.0309176445007324, + "learning_rate": 4.937686007644078e-05, + "loss": 5.1708, + "step": 11976 + }, + { + "epoch": 0.07123061185650395, + "grad_norm": 2.251579523086548, + "learning_rate": 4.9376756433231966e-05, + "loss": 6.0623, + "step": 11977 + }, + { + "epoch": 0.07123655913978495, + "grad_norm": 2.161918878555298, + "learning_rate": 4.937665278151348e-05, + "loss": 6.2297, + "step": 11978 + }, + { + "epoch": 0.07124250642306594, + "grad_norm": 1.703783631324768, + "learning_rate": 4.937654912128535e-05, + "loss": 5.9388, + "step": 11979 + }, + { + "epoch": 0.07124845370634694, + "grad_norm": 1.7420361042022705, + "learning_rate": 4.937644545254763e-05, + "loss": 5.5426, + "step": 11980 + }, + { + "epoch": 0.07125440098962794, + "grad_norm": 1.8634297847747803, + "learning_rate": 4.937634177530033e-05, + "loss": 5.8412, + "step": 11981 + }, + { + "epoch": 0.07126034827290893, + "grad_norm": 1.8084121942520142, + "learning_rate": 4.937623808954351e-05, + "loss": 6.266, + "step": 11982 + }, + { + "epoch": 0.07126629555618993, + "grad_norm": 1.5925266742706299, + "learning_rate": 4.93761343952772e-05, + "loss": 5.7173, + "step": 11983 + }, + { + "epoch": 0.07127224283947092, + "grad_norm": 1.7778257131576538, + "learning_rate": 4.937603069250143e-05, + "loss": 5.8119, + "step": 11984 + }, + { + "epoch": 0.07127819012275192, + "grad_norm": 1.6839842796325684, + "learning_rate": 4.9375926981216235e-05, + "loss": 5.9446, + "step": 11985 + }, + { + "epoch": 0.07128413740603293, + "grad_norm": 1.7892810106277466, + "learning_rate": 4.937582326142166e-05, + "loss": 5.9564, + "step": 11986 + }, + { + "epoch": 0.07129008468931392, + "grad_norm": 1.7179774045944214, + "learning_rate": 4.9375719533117734e-05, + "loss": 6.1969, + "step": 11987 + }, + { + "epoch": 0.07129603197259492, + "grad_norm": 1.3788355588912964, + "learning_rate": 4.93756157963045e-05, + "loss": 6.0409, + "step": 11988 + }, + { + "epoch": 0.07130197925587592, + "grad_norm": 1.6451042890548706, + "learning_rate": 4.9375512050981986e-05, + "loss": 5.8116, + "step": 11989 + }, + { + "epoch": 0.07130792653915691, + "grad_norm": 1.8904451131820679, + "learning_rate": 4.937540829715024e-05, + "loss": 5.7952, + "step": 11990 + }, + { + "epoch": 0.07131387382243791, + "grad_norm": 1.4976747035980225, + "learning_rate": 4.9375304534809284e-05, + "loss": 5.7092, + "step": 11991 + }, + { + "epoch": 0.07131982110571891, + "grad_norm": 1.5585631132125854, + "learning_rate": 4.937520076395916e-05, + "loss": 6.0693, + "step": 11992 + }, + { + "epoch": 0.0713257683889999, + "grad_norm": 1.8329144716262817, + "learning_rate": 4.937509698459991e-05, + "loss": 5.5883, + "step": 11993 + }, + { + "epoch": 0.0713317156722809, + "grad_norm": 2.6030189990997314, + "learning_rate": 4.937499319673157e-05, + "loss": 5.1776, + "step": 11994 + }, + { + "epoch": 0.0713376629555619, + "grad_norm": 1.744042992591858, + "learning_rate": 4.9374889400354165e-05, + "loss": 5.4105, + "step": 11995 + }, + { + "epoch": 0.07134361023884289, + "grad_norm": 1.819018006324768, + "learning_rate": 4.937478559546774e-05, + "loss": 5.5695, + "step": 11996 + }, + { + "epoch": 0.0713495575221239, + "grad_norm": 1.754894733428955, + "learning_rate": 4.9374681782072325e-05, + "loss": 5.7519, + "step": 11997 + }, + { + "epoch": 0.0713555048054049, + "grad_norm": 2.132507085800171, + "learning_rate": 4.9374577960167964e-05, + "loss": 4.9783, + "step": 11998 + }, + { + "epoch": 0.07136145208868588, + "grad_norm": 2.0926709175109863, + "learning_rate": 4.937447412975469e-05, + "loss": 4.905, + "step": 11999 + }, + { + "epoch": 0.07136739937196689, + "grad_norm": 2.1235594749450684, + "learning_rate": 4.937437029083254e-05, + "loss": 4.7978, + "step": 12000 + }, + { + "epoch": 0.07137334665524789, + "grad_norm": 2.217911720275879, + "learning_rate": 4.937426644340154e-05, + "loss": 4.9506, + "step": 12001 + }, + { + "epoch": 0.07137929393852888, + "grad_norm": 2.0362601280212402, + "learning_rate": 4.937416258746175e-05, + "loss": 5.0299, + "step": 12002 + }, + { + "epoch": 0.07138524122180988, + "grad_norm": 2.2846896648406982, + "learning_rate": 4.937405872301318e-05, + "loss": 5.0606, + "step": 12003 + }, + { + "epoch": 0.07139118850509088, + "grad_norm": 2.2545530796051025, + "learning_rate": 4.937395485005588e-05, + "loss": 4.8651, + "step": 12004 + }, + { + "epoch": 0.07139713578837187, + "grad_norm": 2.32738995552063, + "learning_rate": 4.937385096858989e-05, + "loss": 4.7908, + "step": 12005 + }, + { + "epoch": 0.07140308307165287, + "grad_norm": 2.239215850830078, + "learning_rate": 4.9373747078615235e-05, + "loss": 4.7545, + "step": 12006 + }, + { + "epoch": 0.07140903035493387, + "grad_norm": 2.4766969680786133, + "learning_rate": 4.937364318013196e-05, + "loss": 5.0795, + "step": 12007 + }, + { + "epoch": 0.07141497763821486, + "grad_norm": 2.602111577987671, + "learning_rate": 4.937353927314009e-05, + "loss": 4.6898, + "step": 12008 + }, + { + "epoch": 0.07142092492149586, + "grad_norm": 2.8508496284484863, + "learning_rate": 4.937343535763968e-05, + "loss": 4.3136, + "step": 12009 + }, + { + "epoch": 0.07142687220477686, + "grad_norm": 2.4613311290740967, + "learning_rate": 4.9373331433630754e-05, + "loss": 4.4826, + "step": 12010 + }, + { + "epoch": 0.07143281948805785, + "grad_norm": 2.561643362045288, + "learning_rate": 4.937322750111334e-05, + "loss": 4.251, + "step": 12011 + }, + { + "epoch": 0.07143876677133885, + "grad_norm": 2.397507667541504, + "learning_rate": 4.93731235600875e-05, + "loss": 4.3018, + "step": 12012 + }, + { + "epoch": 0.07144471405461984, + "grad_norm": 2.250120162963867, + "learning_rate": 4.937301961055324e-05, + "loss": 4.1796, + "step": 12013 + }, + { + "epoch": 0.07145066133790084, + "grad_norm": 2.337451934814453, + "learning_rate": 4.9372915652510615e-05, + "loss": 4.2362, + "step": 12014 + }, + { + "epoch": 0.07145660862118185, + "grad_norm": 2.357034921646118, + "learning_rate": 4.937281168595966e-05, + "loss": 4.0961, + "step": 12015 + }, + { + "epoch": 0.07146255590446284, + "grad_norm": 2.0843617916107178, + "learning_rate": 4.93727077109004e-05, + "loss": 4.4584, + "step": 12016 + }, + { + "epoch": 0.07146850318774384, + "grad_norm": 2.149707317352295, + "learning_rate": 4.937260372733289e-05, + "loss": 4.2248, + "step": 12017 + }, + { + "epoch": 0.07147445047102484, + "grad_norm": 2.149765729904175, + "learning_rate": 4.937249973525715e-05, + "loss": 4.154, + "step": 12018 + }, + { + "epoch": 0.07148039775430583, + "grad_norm": 2.1572682857513428, + "learning_rate": 4.937239573467323e-05, + "loss": 4.2345, + "step": 12019 + }, + { + "epoch": 0.07148634503758683, + "grad_norm": 2.246751070022583, + "learning_rate": 4.9372291725581145e-05, + "loss": 3.9739, + "step": 12020 + }, + { + "epoch": 0.07149229232086783, + "grad_norm": 2.2735042572021484, + "learning_rate": 4.9372187707980955e-05, + "loss": 4.0442, + "step": 12021 + }, + { + "epoch": 0.07149823960414882, + "grad_norm": 2.2270023822784424, + "learning_rate": 4.9372083681872684e-05, + "loss": 4.0374, + "step": 12022 + }, + { + "epoch": 0.07150418688742982, + "grad_norm": 2.2228193283081055, + "learning_rate": 4.937197964725637e-05, + "loss": 4.0503, + "step": 12023 + }, + { + "epoch": 0.07151013417071082, + "grad_norm": 2.2630691528320312, + "learning_rate": 4.9371875604132046e-05, + "loss": 4.0431, + "step": 12024 + }, + { + "epoch": 0.07151608145399181, + "grad_norm": 2.2461886405944824, + "learning_rate": 4.937177155249976e-05, + "loss": 4.1164, + "step": 12025 + }, + { + "epoch": 0.07152202873727281, + "grad_norm": 1.9476062059402466, + "learning_rate": 4.937166749235953e-05, + "loss": 4.317, + "step": 12026 + }, + { + "epoch": 0.07152797602055382, + "grad_norm": 2.33138370513916, + "learning_rate": 4.937156342371141e-05, + "loss": 4.1309, + "step": 12027 + }, + { + "epoch": 0.0715339233038348, + "grad_norm": 3.3887436389923096, + "learning_rate": 4.937145934655543e-05, + "loss": 5.1713, + "step": 12028 + }, + { + "epoch": 0.0715398705871158, + "grad_norm": 2.499302625656128, + "learning_rate": 4.937135526089162e-05, + "loss": 4.0553, + "step": 12029 + }, + { + "epoch": 0.07154581787039681, + "grad_norm": 2.4269003868103027, + "learning_rate": 4.937125116672002e-05, + "loss": 4.0425, + "step": 12030 + }, + { + "epoch": 0.0715517651536778, + "grad_norm": 2.1819067001342773, + "learning_rate": 4.937114706404067e-05, + "loss": 4.0591, + "step": 12031 + }, + { + "epoch": 0.0715577124369588, + "grad_norm": 1.8021305799484253, + "learning_rate": 4.937104295285361e-05, + "loss": 4.9171, + "step": 12032 + }, + { + "epoch": 0.0715636597202398, + "grad_norm": 2.1833691596984863, + "learning_rate": 4.937093883315887e-05, + "loss": 4.053, + "step": 12033 + }, + { + "epoch": 0.07156960700352079, + "grad_norm": 2.1684465408325195, + "learning_rate": 4.9370834704956484e-05, + "loss": 4.0692, + "step": 12034 + }, + { + "epoch": 0.07157555428680179, + "grad_norm": 2.1576929092407227, + "learning_rate": 4.937073056824649e-05, + "loss": 3.9958, + "step": 12035 + }, + { + "epoch": 0.07158150157008279, + "grad_norm": 1.5627915859222412, + "learning_rate": 4.9370626423028924e-05, + "loss": 5.3373, + "step": 12036 + }, + { + "epoch": 0.07158744885336378, + "grad_norm": 1.6166819334030151, + "learning_rate": 4.937052226930383e-05, + "loss": 5.801, + "step": 12037 + }, + { + "epoch": 0.07159339613664478, + "grad_norm": 1.4187299013137817, + "learning_rate": 4.937041810707124e-05, + "loss": 5.5937, + "step": 12038 + }, + { + "epoch": 0.07159934341992578, + "grad_norm": 1.5857088565826416, + "learning_rate": 4.937031393633118e-05, + "loss": 5.6268, + "step": 12039 + }, + { + "epoch": 0.07160529070320677, + "grad_norm": 1.5691097974777222, + "learning_rate": 4.93702097570837e-05, + "loss": 5.7414, + "step": 12040 + }, + { + "epoch": 0.07161123798648777, + "grad_norm": 1.4723674058914185, + "learning_rate": 4.9370105569328835e-05, + "loss": 5.4711, + "step": 12041 + }, + { + "epoch": 0.07161718526976876, + "grad_norm": 1.686745047569275, + "learning_rate": 4.937000137306661e-05, + "loss": 5.4302, + "step": 12042 + }, + { + "epoch": 0.07162313255304976, + "grad_norm": 1.7394465208053589, + "learning_rate": 4.936989716829707e-05, + "loss": 5.1609, + "step": 12043 + }, + { + "epoch": 0.07162907983633077, + "grad_norm": 1.4348796606063843, + "learning_rate": 4.9369792955020264e-05, + "loss": 5.2468, + "step": 12044 + }, + { + "epoch": 0.07163502711961175, + "grad_norm": 1.674187421798706, + "learning_rate": 4.93696887332362e-05, + "loss": 5.2451, + "step": 12045 + }, + { + "epoch": 0.07164097440289276, + "grad_norm": 1.6606419086456299, + "learning_rate": 4.9369584502944934e-05, + "loss": 5.2744, + "step": 12046 + }, + { + "epoch": 0.07164692168617376, + "grad_norm": 1.4020198583602905, + "learning_rate": 4.93694802641465e-05, + "loss": 5.2914, + "step": 12047 + }, + { + "epoch": 0.07165286896945475, + "grad_norm": 1.4234102964401245, + "learning_rate": 4.936937601684093e-05, + "loss": 5.2405, + "step": 12048 + }, + { + "epoch": 0.07165881625273575, + "grad_norm": 1.261983036994934, + "learning_rate": 4.936927176102827e-05, + "loss": 5.1532, + "step": 12049 + }, + { + "epoch": 0.07166476353601675, + "grad_norm": 1.3787094354629517, + "learning_rate": 4.9369167496708534e-05, + "loss": 5.2033, + "step": 12050 + }, + { + "epoch": 0.07167071081929774, + "grad_norm": 1.405142068862915, + "learning_rate": 4.9369063223881786e-05, + "loss": 5.0391, + "step": 12051 + }, + { + "epoch": 0.07167665810257874, + "grad_norm": 1.513554573059082, + "learning_rate": 4.936895894254804e-05, + "loss": 5.0236, + "step": 12052 + }, + { + "epoch": 0.07168260538585974, + "grad_norm": 1.4279611110687256, + "learning_rate": 4.9368854652707355e-05, + "loss": 5.1429, + "step": 12053 + }, + { + "epoch": 0.07168855266914073, + "grad_norm": 1.4320182800292969, + "learning_rate": 4.936875035435974e-05, + "loss": 5.0519, + "step": 12054 + }, + { + "epoch": 0.07169449995242173, + "grad_norm": 1.415925145149231, + "learning_rate": 4.936864604750526e-05, + "loss": 4.9904, + "step": 12055 + }, + { + "epoch": 0.07170044723570274, + "grad_norm": 1.403998851776123, + "learning_rate": 4.936854173214393e-05, + "loss": 4.8988, + "step": 12056 + }, + { + "epoch": 0.07170639451898372, + "grad_norm": 1.744532585144043, + "learning_rate": 4.936843740827579e-05, + "loss": 4.9661, + "step": 12057 + }, + { + "epoch": 0.07171234180226473, + "grad_norm": 1.4900517463684082, + "learning_rate": 4.9368333075900884e-05, + "loss": 5.1887, + "step": 12058 + }, + { + "epoch": 0.07171828908554573, + "grad_norm": 1.454063057899475, + "learning_rate": 4.936822873501925e-05, + "loss": 5.2801, + "step": 12059 + }, + { + "epoch": 0.07172423636882672, + "grad_norm": 1.5426071882247925, + "learning_rate": 4.936812438563092e-05, + "loss": 5.1987, + "step": 12060 + }, + { + "epoch": 0.07173018365210772, + "grad_norm": 1.7365894317626953, + "learning_rate": 4.936802002773592e-05, + "loss": 5.1933, + "step": 12061 + }, + { + "epoch": 0.07173613093538872, + "grad_norm": 1.5046216249465942, + "learning_rate": 4.9367915661334295e-05, + "loss": 5.1688, + "step": 12062 + }, + { + "epoch": 0.07174207821866971, + "grad_norm": 1.6715713739395142, + "learning_rate": 4.936781128642609e-05, + "loss": 5.3649, + "step": 12063 + }, + { + "epoch": 0.07174802550195071, + "grad_norm": 1.6386772394180298, + "learning_rate": 4.936770690301134e-05, + "loss": 5.4107, + "step": 12064 + }, + { + "epoch": 0.07175397278523171, + "grad_norm": 1.604153037071228, + "learning_rate": 4.936760251109006e-05, + "loss": 5.2952, + "step": 12065 + }, + { + "epoch": 0.0717599200685127, + "grad_norm": 1.7100228071212769, + "learning_rate": 4.9367498110662306e-05, + "loss": 5.202, + "step": 12066 + }, + { + "epoch": 0.0717658673517937, + "grad_norm": 1.4062007665634155, + "learning_rate": 4.9367393701728116e-05, + "loss": 5.2246, + "step": 12067 + }, + { + "epoch": 0.0717718146350747, + "grad_norm": 1.4552310705184937, + "learning_rate": 4.9367289284287514e-05, + "loss": 5.5919, + "step": 12068 + }, + { + "epoch": 0.07177776191835569, + "grad_norm": 1.5134438276290894, + "learning_rate": 4.9367184858340546e-05, + "loss": 5.3921, + "step": 12069 + }, + { + "epoch": 0.0717837092016367, + "grad_norm": 1.724139928817749, + "learning_rate": 4.9367080423887246e-05, + "loss": 5.6409, + "step": 12070 + }, + { + "epoch": 0.07178965648491768, + "grad_norm": 1.7401317358016968, + "learning_rate": 4.9366975980927655e-05, + "loss": 4.8093, + "step": 12071 + }, + { + "epoch": 0.07179560376819868, + "grad_norm": 2.3226993083953857, + "learning_rate": 4.93668715294618e-05, + "loss": 4.2685, + "step": 12072 + }, + { + "epoch": 0.07180155105147969, + "grad_norm": 2.200608730316162, + "learning_rate": 4.9366767069489715e-05, + "loss": 4.1155, + "step": 12073 + }, + { + "epoch": 0.07180749833476067, + "grad_norm": 2.381131649017334, + "learning_rate": 4.936666260101145e-05, + "loss": 3.9837, + "step": 12074 + }, + { + "epoch": 0.07181344561804168, + "grad_norm": 2.2567548751831055, + "learning_rate": 4.936655812402704e-05, + "loss": 4.0642, + "step": 12075 + }, + { + "epoch": 0.07181939290132268, + "grad_norm": 2.253011703491211, + "learning_rate": 4.9366453638536506e-05, + "loss": 4.0683, + "step": 12076 + }, + { + "epoch": 0.07182534018460367, + "grad_norm": 2.3459978103637695, + "learning_rate": 4.93663491445399e-05, + "loss": 4.0525, + "step": 12077 + }, + { + "epoch": 0.07183128746788467, + "grad_norm": 2.3964619636535645, + "learning_rate": 4.9366244642037254e-05, + "loss": 4.0198, + "step": 12078 + }, + { + "epoch": 0.07183723475116567, + "grad_norm": 2.392293930053711, + "learning_rate": 4.93661401310286e-05, + "loss": 3.7765, + "step": 12079 + }, + { + "epoch": 0.07184318203444666, + "grad_norm": 2.3027987480163574, + "learning_rate": 4.936603561151398e-05, + "loss": 4.0315, + "step": 12080 + }, + { + "epoch": 0.07184912931772766, + "grad_norm": 2.3942925930023193, + "learning_rate": 4.936593108349343e-05, + "loss": 4.1308, + "step": 12081 + }, + { + "epoch": 0.07185507660100866, + "grad_norm": 2.183898687362671, + "learning_rate": 4.9365826546966984e-05, + "loss": 4.0779, + "step": 12082 + }, + { + "epoch": 0.07186102388428965, + "grad_norm": 2.3463728427886963, + "learning_rate": 4.936572200193468e-05, + "loss": 4.0035, + "step": 12083 + }, + { + "epoch": 0.07186697116757065, + "grad_norm": 2.3459651470184326, + "learning_rate": 4.9365617448396556e-05, + "loss": 4.0577, + "step": 12084 + }, + { + "epoch": 0.07187291845085166, + "grad_norm": 2.169189691543579, + "learning_rate": 4.936551288635264e-05, + "loss": 4.2678, + "step": 12085 + }, + { + "epoch": 0.07187886573413264, + "grad_norm": 2.3313188552856445, + "learning_rate": 4.936540831580299e-05, + "loss": 4.9956, + "step": 12086 + }, + { + "epoch": 0.07188481301741365, + "grad_norm": 2.431053400039673, + "learning_rate": 4.936530373674761e-05, + "loss": 5.2317, + "step": 12087 + }, + { + "epoch": 0.07189076030069465, + "grad_norm": 1.8984981775283813, + "learning_rate": 4.936519914918656e-05, + "loss": 5.4541, + "step": 12088 + }, + { + "epoch": 0.07189670758397564, + "grad_norm": 1.8862982988357544, + "learning_rate": 4.9365094553119877e-05, + "loss": 5.6448, + "step": 12089 + }, + { + "epoch": 0.07190265486725664, + "grad_norm": 1.7802925109863281, + "learning_rate": 4.936498994854759e-05, + "loss": 5.3182, + "step": 12090 + }, + { + "epoch": 0.07190860215053764, + "grad_norm": 1.7578701972961426, + "learning_rate": 4.9364885335469734e-05, + "loss": 6.0188, + "step": 12091 + }, + { + "epoch": 0.07191454943381863, + "grad_norm": 1.6750003099441528, + "learning_rate": 4.9364780713886345e-05, + "loss": 6.0822, + "step": 12092 + }, + { + "epoch": 0.07192049671709963, + "grad_norm": 1.4945881366729736, + "learning_rate": 4.936467608379747e-05, + "loss": 6.0554, + "step": 12093 + }, + { + "epoch": 0.07192644400038063, + "grad_norm": 1.5508134365081787, + "learning_rate": 4.936457144520313e-05, + "loss": 5.9712, + "step": 12094 + }, + { + "epoch": 0.07193239128366162, + "grad_norm": 1.4133291244506836, + "learning_rate": 4.936446679810337e-05, + "loss": 5.9137, + "step": 12095 + }, + { + "epoch": 0.07193833856694262, + "grad_norm": 1.415930986404419, + "learning_rate": 4.936436214249823e-05, + "loss": 5.9957, + "step": 12096 + }, + { + "epoch": 0.07194428585022362, + "grad_norm": 1.682356595993042, + "learning_rate": 4.936425747838774e-05, + "loss": 6.2381, + "step": 12097 + }, + { + "epoch": 0.07195023313350461, + "grad_norm": 1.693535566329956, + "learning_rate": 4.9364152805771946e-05, + "loss": 6.0523, + "step": 12098 + }, + { + "epoch": 0.07195618041678561, + "grad_norm": 1.7577873468399048, + "learning_rate": 4.9364048124650875e-05, + "loss": 5.8243, + "step": 12099 + }, + { + "epoch": 0.0719621277000666, + "grad_norm": 1.6486074924468994, + "learning_rate": 4.936394343502457e-05, + "loss": 5.8072, + "step": 12100 + }, + { + "epoch": 0.0719680749833476, + "grad_norm": 1.5245120525360107, + "learning_rate": 4.936383873689306e-05, + "loss": 5.9013, + "step": 12101 + }, + { + "epoch": 0.0719740222666286, + "grad_norm": 1.4771286249160767, + "learning_rate": 4.936373403025638e-05, + "loss": 6.1314, + "step": 12102 + }, + { + "epoch": 0.0719799695499096, + "grad_norm": 1.7547197341918945, + "learning_rate": 4.936362931511458e-05, + "loss": 5.9725, + "step": 12103 + }, + { + "epoch": 0.0719859168331906, + "grad_norm": 1.9942286014556885, + "learning_rate": 4.936352459146769e-05, + "loss": 5.82, + "step": 12104 + }, + { + "epoch": 0.0719918641164716, + "grad_norm": 1.8367860317230225, + "learning_rate": 4.936341985931574e-05, + "loss": 5.8653, + "step": 12105 + }, + { + "epoch": 0.07199781139975259, + "grad_norm": 1.8277100324630737, + "learning_rate": 4.936331511865877e-05, + "loss": 5.6998, + "step": 12106 + }, + { + "epoch": 0.07200375868303359, + "grad_norm": 1.5308998823165894, + "learning_rate": 4.936321036949683e-05, + "loss": 5.822, + "step": 12107 + }, + { + "epoch": 0.07200970596631459, + "grad_norm": 1.7100377082824707, + "learning_rate": 4.936310561182993e-05, + "loss": 5.991, + "step": 12108 + }, + { + "epoch": 0.07201565324959558, + "grad_norm": 1.8563333749771118, + "learning_rate": 4.936300084565813e-05, + "loss": 5.8438, + "step": 12109 + }, + { + "epoch": 0.07202160053287658, + "grad_norm": 1.9967303276062012, + "learning_rate": 4.936289607098146e-05, + "loss": 5.6786, + "step": 12110 + }, + { + "epoch": 0.07202754781615758, + "grad_norm": 2.1997451782226562, + "learning_rate": 4.9362791287799945e-05, + "loss": 5.2983, + "step": 12111 + }, + { + "epoch": 0.07203349509943857, + "grad_norm": 2.144521713256836, + "learning_rate": 4.9362686496113644e-05, + "loss": 5.2942, + "step": 12112 + }, + { + "epoch": 0.07203944238271957, + "grad_norm": 2.0747883319854736, + "learning_rate": 4.936258169592257e-05, + "loss": 5.473, + "step": 12113 + }, + { + "epoch": 0.07204538966600058, + "grad_norm": 2.0386881828308105, + "learning_rate": 4.9362476887226776e-05, + "loss": 5.2557, + "step": 12114 + }, + { + "epoch": 0.07205133694928156, + "grad_norm": 2.190687894821167, + "learning_rate": 4.93623720700263e-05, + "loss": 5.3251, + "step": 12115 + }, + { + "epoch": 0.07205728423256257, + "grad_norm": 1.9349397420883179, + "learning_rate": 4.936226724432116e-05, + "loss": 5.242, + "step": 12116 + }, + { + "epoch": 0.07206323151584357, + "grad_norm": 2.175943613052368, + "learning_rate": 4.93621624101114e-05, + "loss": 5.185, + "step": 12117 + }, + { + "epoch": 0.07206917879912456, + "grad_norm": 2.053994655609131, + "learning_rate": 4.936205756739708e-05, + "loss": 5.0755, + "step": 12118 + }, + { + "epoch": 0.07207512608240556, + "grad_norm": 2.0012362003326416, + "learning_rate": 4.93619527161782e-05, + "loss": 5.1797, + "step": 12119 + }, + { + "epoch": 0.07208107336568656, + "grad_norm": 1.9441219568252563, + "learning_rate": 4.936184785645482e-05, + "loss": 5.5583, + "step": 12120 + }, + { + "epoch": 0.07208702064896755, + "grad_norm": 2.990767002105713, + "learning_rate": 4.936174298822696e-05, + "loss": 4.8348, + "step": 12121 + }, + { + "epoch": 0.07209296793224855, + "grad_norm": 2.8385918140411377, + "learning_rate": 4.936163811149469e-05, + "loss": 4.7299, + "step": 12122 + }, + { + "epoch": 0.07209891521552955, + "grad_norm": 2.5228044986724854, + "learning_rate": 4.9361533226258006e-05, + "loss": 4.622, + "step": 12123 + }, + { + "epoch": 0.07210486249881054, + "grad_norm": 2.317598581314087, + "learning_rate": 4.936142833251697e-05, + "loss": 4.588, + "step": 12124 + }, + { + "epoch": 0.07211080978209154, + "grad_norm": 2.369335889816284, + "learning_rate": 4.936132343027161e-05, + "loss": 4.3843, + "step": 12125 + }, + { + "epoch": 0.07211675706537254, + "grad_norm": 2.4761011600494385, + "learning_rate": 4.936121851952196e-05, + "loss": 4.4101, + "step": 12126 + }, + { + "epoch": 0.07212270434865353, + "grad_norm": 2.3830130100250244, + "learning_rate": 4.9361113600268065e-05, + "loss": 4.5065, + "step": 12127 + }, + { + "epoch": 0.07212865163193453, + "grad_norm": 2.4977028369903564, + "learning_rate": 4.936100867250996e-05, + "loss": 4.4469, + "step": 12128 + }, + { + "epoch": 0.07213459891521554, + "grad_norm": 2.3377795219421387, + "learning_rate": 4.9360903736247663e-05, + "loss": 4.4045, + "step": 12129 + }, + { + "epoch": 0.07214054619849652, + "grad_norm": 2.268906831741333, + "learning_rate": 4.9360798791481245e-05, + "loss": 4.4224, + "step": 12130 + }, + { + "epoch": 0.07214649348177753, + "grad_norm": 2.316899538040161, + "learning_rate": 4.936069383821072e-05, + "loss": 4.3704, + "step": 12131 + }, + { + "epoch": 0.07215244076505851, + "grad_norm": 2.419618606567383, + "learning_rate": 4.936058887643612e-05, + "loss": 5.493, + "step": 12132 + }, + { + "epoch": 0.07215838804833952, + "grad_norm": 2.081756353378296, + "learning_rate": 4.93604839061575e-05, + "loss": 6.2328, + "step": 12133 + }, + { + "epoch": 0.07216433533162052, + "grad_norm": 2.1638660430908203, + "learning_rate": 4.936037892737487e-05, + "loss": 6.3089, + "step": 12134 + }, + { + "epoch": 0.07217028261490151, + "grad_norm": 1.7972848415374756, + "learning_rate": 4.93602739400883e-05, + "loss": 6.4013, + "step": 12135 + }, + { + "epoch": 0.07217622989818251, + "grad_norm": 1.7160871028900146, + "learning_rate": 4.93601689442978e-05, + "loss": 6.1717, + "step": 12136 + }, + { + "epoch": 0.07218217718146351, + "grad_norm": 2.0931475162506104, + "learning_rate": 4.936006394000342e-05, + "loss": 5.3515, + "step": 12137 + }, + { + "epoch": 0.0721881244647445, + "grad_norm": 2.2872977256774902, + "learning_rate": 4.93599589272052e-05, + "loss": 5.8342, + "step": 12138 + }, + { + "epoch": 0.0721940717480255, + "grad_norm": 2.4082720279693604, + "learning_rate": 4.9359853905903166e-05, + "loss": 6.1651, + "step": 12139 + }, + { + "epoch": 0.0722000190313065, + "grad_norm": 2.120962381362915, + "learning_rate": 4.935974887609735e-05, + "loss": 6.1182, + "step": 12140 + }, + { + "epoch": 0.07220596631458749, + "grad_norm": 2.0507090091705322, + "learning_rate": 4.9359643837787805e-05, + "loss": 5.7158, + "step": 12141 + }, + { + "epoch": 0.0722119135978685, + "grad_norm": 2.099963426589966, + "learning_rate": 4.9359538790974556e-05, + "loss": 5.6952, + "step": 12142 + }, + { + "epoch": 0.0722178608811495, + "grad_norm": 1.7631537914276123, + "learning_rate": 4.935943373565765e-05, + "loss": 5.6649, + "step": 12143 + }, + { + "epoch": 0.07222380816443048, + "grad_norm": 1.739601492881775, + "learning_rate": 4.9359328671837115e-05, + "loss": 5.7258, + "step": 12144 + }, + { + "epoch": 0.07222975544771149, + "grad_norm": 1.630116581916809, + "learning_rate": 4.9359223599512996e-05, + "loss": 5.7305, + "step": 12145 + }, + { + "epoch": 0.07223570273099249, + "grad_norm": 1.6106374263763428, + "learning_rate": 4.935911851868531e-05, + "loss": 5.6779, + "step": 12146 + }, + { + "epoch": 0.07224165001427348, + "grad_norm": 1.945662021636963, + "learning_rate": 4.935901342935412e-05, + "loss": 5.716, + "step": 12147 + }, + { + "epoch": 0.07224759729755448, + "grad_norm": 1.8601467609405518, + "learning_rate": 4.935890833151944e-05, + "loss": 5.7539, + "step": 12148 + }, + { + "epoch": 0.07225354458083548, + "grad_norm": 1.8324257135391235, + "learning_rate": 4.9358803225181324e-05, + "loss": 5.7309, + "step": 12149 + }, + { + "epoch": 0.07225949186411647, + "grad_norm": 2.0564095973968506, + "learning_rate": 4.93586981103398e-05, + "loss": 5.7201, + "step": 12150 + }, + { + "epoch": 0.07226543914739747, + "grad_norm": 1.925706386566162, + "learning_rate": 4.93585929869949e-05, + "loss": 5.5736, + "step": 12151 + }, + { + "epoch": 0.07227138643067847, + "grad_norm": 1.5965845584869385, + "learning_rate": 4.935848785514667e-05, + "loss": 5.4351, + "step": 12152 + }, + { + "epoch": 0.07227733371395946, + "grad_norm": 2.2522077560424805, + "learning_rate": 4.935838271479515e-05, + "loss": 5.8261, + "step": 12153 + }, + { + "epoch": 0.07228328099724046, + "grad_norm": 2.242398738861084, + "learning_rate": 4.935827756594036e-05, + "loss": 5.9923, + "step": 12154 + }, + { + "epoch": 0.07228922828052146, + "grad_norm": 2.043266534805298, + "learning_rate": 4.935817240858236e-05, + "loss": 5.6127, + "step": 12155 + }, + { + "epoch": 0.07229517556380245, + "grad_norm": 2.4922964572906494, + "learning_rate": 4.935806724272116e-05, + "loss": 5.3549, + "step": 12156 + }, + { + "epoch": 0.07230112284708345, + "grad_norm": 2.5241329669952393, + "learning_rate": 4.935796206835682e-05, + "loss": 5.2194, + "step": 12157 + }, + { + "epoch": 0.07230707013036446, + "grad_norm": 2.4680237770080566, + "learning_rate": 4.9357856885489365e-05, + "loss": 5.1154, + "step": 12158 + }, + { + "epoch": 0.07231301741364544, + "grad_norm": 2.1012492179870605, + "learning_rate": 4.9357751694118824e-05, + "loss": 4.8526, + "step": 12159 + }, + { + "epoch": 0.07231896469692645, + "grad_norm": 1.9997994899749756, + "learning_rate": 4.935764649424526e-05, + "loss": 4.9778, + "step": 12160 + }, + { + "epoch": 0.07232491198020743, + "grad_norm": 1.770112156867981, + "learning_rate": 4.935754128586868e-05, + "loss": 5.0855, + "step": 12161 + }, + { + "epoch": 0.07233085926348844, + "grad_norm": 2.0865485668182373, + "learning_rate": 4.935743606898914e-05, + "loss": 5.1566, + "step": 12162 + }, + { + "epoch": 0.07233680654676944, + "grad_norm": 2.0801351070404053, + "learning_rate": 4.9357330843606677e-05, + "loss": 5.0611, + "step": 12163 + }, + { + "epoch": 0.07234275383005043, + "grad_norm": 1.8675305843353271, + "learning_rate": 4.935722560972131e-05, + "loss": 4.9216, + "step": 12164 + }, + { + "epoch": 0.07234870111333143, + "grad_norm": 1.9125452041625977, + "learning_rate": 4.935712036733309e-05, + "loss": 4.8363, + "step": 12165 + }, + { + "epoch": 0.07235464839661243, + "grad_norm": 2.4954965114593506, + "learning_rate": 4.935701511644205e-05, + "loss": 4.9816, + "step": 12166 + }, + { + "epoch": 0.07236059567989342, + "grad_norm": 2.412381410598755, + "learning_rate": 4.935690985704823e-05, + "loss": 4.9616, + "step": 12167 + }, + { + "epoch": 0.07236654296317442, + "grad_norm": 2.356994152069092, + "learning_rate": 4.9356804589151665e-05, + "loss": 4.8326, + "step": 12168 + }, + { + "epoch": 0.07237249024645542, + "grad_norm": 2.2399415969848633, + "learning_rate": 4.93566993127524e-05, + "loss": 4.8955, + "step": 12169 + }, + { + "epoch": 0.07237843752973641, + "grad_norm": 2.691772222518921, + "learning_rate": 4.935659402785044e-05, + "loss": 5.6475, + "step": 12170 + }, + { + "epoch": 0.07238438481301741, + "grad_norm": 2.954955816268921, + "learning_rate": 4.9356488734445865e-05, + "loss": 6.2151, + "step": 12171 + }, + { + "epoch": 0.07239033209629842, + "grad_norm": 2.010998010635376, + "learning_rate": 4.935638343253869e-05, + "loss": 5.9124, + "step": 12172 + }, + { + "epoch": 0.0723962793795794, + "grad_norm": 2.2737836837768555, + "learning_rate": 4.935627812212894e-05, + "loss": 5.4068, + "step": 12173 + }, + { + "epoch": 0.0724022266628604, + "grad_norm": 2.2700793743133545, + "learning_rate": 4.9356172803216675e-05, + "loss": 4.8156, + "step": 12174 + }, + { + "epoch": 0.07240817394614141, + "grad_norm": 2.2795162200927734, + "learning_rate": 4.935606747580192e-05, + "loss": 4.7882, + "step": 12175 + }, + { + "epoch": 0.0724141212294224, + "grad_norm": 2.1849277019500732, + "learning_rate": 4.9355962139884715e-05, + "loss": 4.9914, + "step": 12176 + }, + { + "epoch": 0.0724200685127034, + "grad_norm": 2.5336532592773438, + "learning_rate": 4.935585679546509e-05, + "loss": 4.8487, + "step": 12177 + }, + { + "epoch": 0.0724260157959844, + "grad_norm": 2.624995708465576, + "learning_rate": 4.935575144254309e-05, + "loss": 4.9523, + "step": 12178 + }, + { + "epoch": 0.07243196307926539, + "grad_norm": 2.5450191497802734, + "learning_rate": 4.935564608111875e-05, + "loss": 4.9958, + "step": 12179 + }, + { + "epoch": 0.07243791036254639, + "grad_norm": 2.2714452743530273, + "learning_rate": 4.9355540711192107e-05, + "loss": 5.301, + "step": 12180 + }, + { + "epoch": 0.07244385764582739, + "grad_norm": 2.0173168182373047, + "learning_rate": 4.935543533276319e-05, + "loss": 5.7992, + "step": 12181 + }, + { + "epoch": 0.07244980492910838, + "grad_norm": 2.9326014518737793, + "learning_rate": 4.9355329945832054e-05, + "loss": 5.6065, + "step": 12182 + }, + { + "epoch": 0.07245575221238938, + "grad_norm": 2.142066478729248, + "learning_rate": 4.935522455039871e-05, + "loss": 5.5339, + "step": 12183 + }, + { + "epoch": 0.07246169949567038, + "grad_norm": 1.8901113271713257, + "learning_rate": 4.9355119146463214e-05, + "loss": 5.8829, + "step": 12184 + }, + { + "epoch": 0.07246764677895137, + "grad_norm": 1.996052622795105, + "learning_rate": 4.93550137340256e-05, + "loss": 6.2189, + "step": 12185 + }, + { + "epoch": 0.07247359406223237, + "grad_norm": 1.7420963048934937, + "learning_rate": 4.93549083130859e-05, + "loss": 5.9254, + "step": 12186 + }, + { + "epoch": 0.07247954134551338, + "grad_norm": 2.8487229347229004, + "learning_rate": 4.935480288364416e-05, + "loss": 5.8643, + "step": 12187 + }, + { + "epoch": 0.07248548862879436, + "grad_norm": 3.0168306827545166, + "learning_rate": 4.93546974457004e-05, + "loss": 5.811, + "step": 12188 + }, + { + "epoch": 0.07249143591207537, + "grad_norm": 2.841353416442871, + "learning_rate": 4.935459199925467e-05, + "loss": 5.6832, + "step": 12189 + }, + { + "epoch": 0.07249738319535635, + "grad_norm": 2.3517918586730957, + "learning_rate": 4.9354486544307e-05, + "loss": 4.3651, + "step": 12190 + }, + { + "epoch": 0.07250333047863736, + "grad_norm": 2.3511440753936768, + "learning_rate": 4.935438108085744e-05, + "loss": 4.2884, + "step": 12191 + }, + { + "epoch": 0.07250927776191836, + "grad_norm": 2.0812551975250244, + "learning_rate": 4.935427560890601e-05, + "loss": 4.168, + "step": 12192 + }, + { + "epoch": 0.07251522504519935, + "grad_norm": 2.0546631813049316, + "learning_rate": 4.935417012845275e-05, + "loss": 3.862, + "step": 12193 + }, + { + "epoch": 0.07252117232848035, + "grad_norm": 2.130612850189209, + "learning_rate": 4.935406463949771e-05, + "loss": 3.6729, + "step": 12194 + }, + { + "epoch": 0.07252711961176135, + "grad_norm": 2.35225510597229, + "learning_rate": 4.9353959142040917e-05, + "loss": 3.7075, + "step": 12195 + }, + { + "epoch": 0.07253306689504234, + "grad_norm": 2.418698310852051, + "learning_rate": 4.93538536360824e-05, + "loss": 3.679, + "step": 12196 + }, + { + "epoch": 0.07253901417832334, + "grad_norm": 2.4452991485595703, + "learning_rate": 4.9353748121622214e-05, + "loss": 3.7827, + "step": 12197 + }, + { + "epoch": 0.07254496146160434, + "grad_norm": 2.3787992000579834, + "learning_rate": 4.935364259866038e-05, + "loss": 3.7484, + "step": 12198 + }, + { + "epoch": 0.07255090874488533, + "grad_norm": 2.299149751663208, + "learning_rate": 4.935353706719694e-05, + "loss": 3.6186, + "step": 12199 + }, + { + "epoch": 0.07255685602816633, + "grad_norm": 2.666121244430542, + "learning_rate": 4.9353431527231944e-05, + "loss": 3.5323, + "step": 12200 + }, + { + "epoch": 0.07256280331144734, + "grad_norm": 2.4448325634002686, + "learning_rate": 4.9353325978765404e-05, + "loss": 3.8176, + "step": 12201 + }, + { + "epoch": 0.07256875059472832, + "grad_norm": 2.5082852840423584, + "learning_rate": 4.935322042179737e-05, + "loss": 3.7838, + "step": 12202 + }, + { + "epoch": 0.07257469787800933, + "grad_norm": 2.3247005939483643, + "learning_rate": 4.935311485632788e-05, + "loss": 3.8036, + "step": 12203 + }, + { + "epoch": 0.07258064516129033, + "grad_norm": 2.4917871952056885, + "learning_rate": 4.9353009282356974e-05, + "loss": 3.6734, + "step": 12204 + }, + { + "epoch": 0.07258659244457132, + "grad_norm": 2.2535903453826904, + "learning_rate": 4.935290369988468e-05, + "loss": 3.7451, + "step": 12205 + }, + { + "epoch": 0.07259253972785232, + "grad_norm": 2.355896472930908, + "learning_rate": 4.9352798108911036e-05, + "loss": 3.5963, + "step": 12206 + }, + { + "epoch": 0.07259848701113332, + "grad_norm": 2.21923828125, + "learning_rate": 4.935269250943609e-05, + "loss": 3.5492, + "step": 12207 + }, + { + "epoch": 0.07260443429441431, + "grad_norm": 2.3795714378356934, + "learning_rate": 4.935258690145986e-05, + "loss": 3.7146, + "step": 12208 + }, + { + "epoch": 0.07261038157769531, + "grad_norm": 2.3866682052612305, + "learning_rate": 4.93524812849824e-05, + "loss": 3.7359, + "step": 12209 + }, + { + "epoch": 0.07261632886097631, + "grad_norm": 2.411289691925049, + "learning_rate": 4.935237566000374e-05, + "loss": 3.6958, + "step": 12210 + }, + { + "epoch": 0.0726222761442573, + "grad_norm": 2.3831989765167236, + "learning_rate": 4.935227002652392e-05, + "loss": 3.6696, + "step": 12211 + }, + { + "epoch": 0.0726282234275383, + "grad_norm": 2.1831908226013184, + "learning_rate": 4.935216438454297e-05, + "loss": 3.905, + "step": 12212 + }, + { + "epoch": 0.0726341707108193, + "grad_norm": 2.1136345863342285, + "learning_rate": 4.9352058734060934e-05, + "loss": 5.0188, + "step": 12213 + }, + { + "epoch": 0.07264011799410029, + "grad_norm": 2.2617692947387695, + "learning_rate": 4.935195307507784e-05, + "loss": 5.1883, + "step": 12214 + }, + { + "epoch": 0.0726460652773813, + "grad_norm": 2.4442226886749268, + "learning_rate": 4.935184740759374e-05, + "loss": 5.1883, + "step": 12215 + }, + { + "epoch": 0.0726520125606623, + "grad_norm": 2.300234794616699, + "learning_rate": 4.935174173160865e-05, + "loss": 4.9925, + "step": 12216 + }, + { + "epoch": 0.07265795984394328, + "grad_norm": 2.1512858867645264, + "learning_rate": 4.935163604712263e-05, + "loss": 4.883, + "step": 12217 + }, + { + "epoch": 0.07266390712722429, + "grad_norm": 2.210825204849243, + "learning_rate": 4.93515303541357e-05, + "loss": 5.165, + "step": 12218 + }, + { + "epoch": 0.07266985441050527, + "grad_norm": 2.1589086055755615, + "learning_rate": 4.935142465264791e-05, + "loss": 4.931, + "step": 12219 + }, + { + "epoch": 0.07267580169378628, + "grad_norm": 2.0527892112731934, + "learning_rate": 4.935131894265927e-05, + "loss": 5.0566, + "step": 12220 + }, + { + "epoch": 0.07268174897706728, + "grad_norm": 2.202828884124756, + "learning_rate": 4.935121322416985e-05, + "loss": 4.9519, + "step": 12221 + }, + { + "epoch": 0.07268769626034827, + "grad_norm": 2.262834310531616, + "learning_rate": 4.935110749717967e-05, + "loss": 4.9596, + "step": 12222 + }, + { + "epoch": 0.07269364354362927, + "grad_norm": 2.169311761856079, + "learning_rate": 4.935100176168877e-05, + "loss": 4.8968, + "step": 12223 + }, + { + "epoch": 0.07269959082691027, + "grad_norm": 2.137746572494507, + "learning_rate": 4.935089601769719e-05, + "loss": 4.8535, + "step": 12224 + }, + { + "epoch": 0.07270553811019126, + "grad_norm": 2.060861587524414, + "learning_rate": 4.935079026520496e-05, + "loss": 5.0784, + "step": 12225 + }, + { + "epoch": 0.07271148539347226, + "grad_norm": 2.235352039337158, + "learning_rate": 4.935068450421213e-05, + "loss": 4.7351, + "step": 12226 + }, + { + "epoch": 0.07271743267675326, + "grad_norm": 2.3832550048828125, + "learning_rate": 4.935057873471872e-05, + "loss": 4.618, + "step": 12227 + }, + { + "epoch": 0.07272337996003425, + "grad_norm": 2.3591537475585938, + "learning_rate": 4.935047295672477e-05, + "loss": 4.7029, + "step": 12228 + }, + { + "epoch": 0.07272932724331525, + "grad_norm": 2.2797207832336426, + "learning_rate": 4.935036717023033e-05, + "loss": 4.9199, + "step": 12229 + }, + { + "epoch": 0.07273527452659626, + "grad_norm": 2.4931957721710205, + "learning_rate": 4.935026137523542e-05, + "loss": 4.5923, + "step": 12230 + }, + { + "epoch": 0.07274122180987724, + "grad_norm": 2.152064323425293, + "learning_rate": 4.9350155571740095e-05, + "loss": 5.1495, + "step": 12231 + }, + { + "epoch": 0.07274716909315825, + "grad_norm": 2.470526695251465, + "learning_rate": 4.935004975974438e-05, + "loss": 4.8257, + "step": 12232 + }, + { + "epoch": 0.07275311637643925, + "grad_norm": 2.262578248977661, + "learning_rate": 4.9349943939248304e-05, + "loss": 5.7004, + "step": 12233 + }, + { + "epoch": 0.07275906365972024, + "grad_norm": 2.0813188552856445, + "learning_rate": 4.934983811025192e-05, + "loss": 5.6048, + "step": 12234 + }, + { + "epoch": 0.07276501094300124, + "grad_norm": 2.4882686138153076, + "learning_rate": 4.934973227275527e-05, + "loss": 5.8121, + "step": 12235 + }, + { + "epoch": 0.07277095822628224, + "grad_norm": 2.5181429386138916, + "learning_rate": 4.9349626426758364e-05, + "loss": 4.5581, + "step": 12236 + }, + { + "epoch": 0.07277690550956323, + "grad_norm": 2.6369354724884033, + "learning_rate": 4.934952057226127e-05, + "loss": 4.7938, + "step": 12237 + }, + { + "epoch": 0.07278285279284423, + "grad_norm": 1.8615930080413818, + "learning_rate": 4.9349414709264e-05, + "loss": 5.2097, + "step": 12238 + }, + { + "epoch": 0.07278880007612523, + "grad_norm": 1.4905575513839722, + "learning_rate": 4.93493088377666e-05, + "loss": 5.5717, + "step": 12239 + }, + { + "epoch": 0.07279474735940622, + "grad_norm": 1.8339897394180298, + "learning_rate": 4.9349202957769106e-05, + "loss": 5.6908, + "step": 12240 + }, + { + "epoch": 0.07280069464268722, + "grad_norm": 1.5875110626220703, + "learning_rate": 4.934909706927156e-05, + "loss": 5.6246, + "step": 12241 + }, + { + "epoch": 0.07280664192596822, + "grad_norm": 1.8365919589996338, + "learning_rate": 4.934899117227399e-05, + "loss": 5.394, + "step": 12242 + }, + { + "epoch": 0.07281258920924921, + "grad_norm": 1.9548145532608032, + "learning_rate": 4.934888526677645e-05, + "loss": 5.2427, + "step": 12243 + }, + { + "epoch": 0.07281853649253021, + "grad_norm": 1.8174974918365479, + "learning_rate": 4.934877935277896e-05, + "loss": 5.5844, + "step": 12244 + }, + { + "epoch": 0.07282448377581122, + "grad_norm": 1.800117015838623, + "learning_rate": 4.934867343028157e-05, + "loss": 4.9386, + "step": 12245 + }, + { + "epoch": 0.0728304310590922, + "grad_norm": 2.0356900691986084, + "learning_rate": 4.93485674992843e-05, + "loss": 4.6911, + "step": 12246 + }, + { + "epoch": 0.0728363783423732, + "grad_norm": 2.009455442428589, + "learning_rate": 4.93484615597872e-05, + "loss": 4.6121, + "step": 12247 + }, + { + "epoch": 0.0728423256256542, + "grad_norm": 1.9252879619598389, + "learning_rate": 4.934835561179031e-05, + "loss": 4.737, + "step": 12248 + }, + { + "epoch": 0.0728482729089352, + "grad_norm": 2.3497977256774902, + "learning_rate": 4.934824965529365e-05, + "loss": 5.6921, + "step": 12249 + }, + { + "epoch": 0.0728542201922162, + "grad_norm": 2.0821962356567383, + "learning_rate": 4.934814369029727e-05, + "loss": 5.3845, + "step": 12250 + }, + { + "epoch": 0.07286016747549719, + "grad_norm": 1.9725046157836914, + "learning_rate": 4.934803771680121e-05, + "loss": 5.5557, + "step": 12251 + }, + { + "epoch": 0.07286611475877819, + "grad_norm": 2.290238618850708, + "learning_rate": 4.93479317348055e-05, + "loss": 5.4258, + "step": 12252 + }, + { + "epoch": 0.07287206204205919, + "grad_norm": 1.9502376317977905, + "learning_rate": 4.934782574431017e-05, + "loss": 5.0531, + "step": 12253 + }, + { + "epoch": 0.07287800932534018, + "grad_norm": 2.128431797027588, + "learning_rate": 4.9347719745315275e-05, + "loss": 5.0241, + "step": 12254 + }, + { + "epoch": 0.07288395660862118, + "grad_norm": 1.9173803329467773, + "learning_rate": 4.934761373782084e-05, + "loss": 5.7107, + "step": 12255 + }, + { + "epoch": 0.07288990389190218, + "grad_norm": 1.5167652368545532, + "learning_rate": 4.93475077218269e-05, + "loss": 5.2304, + "step": 12256 + }, + { + "epoch": 0.07289585117518317, + "grad_norm": 1.4125497341156006, + "learning_rate": 4.9347401697333505e-05, + "loss": 5.1099, + "step": 12257 + }, + { + "epoch": 0.07290179845846417, + "grad_norm": 2.384801149368286, + "learning_rate": 4.934729566434068e-05, + "loss": 5.0051, + "step": 12258 + }, + { + "epoch": 0.07290774574174518, + "grad_norm": 1.9343961477279663, + "learning_rate": 4.934718962284846e-05, + "loss": 5.3367, + "step": 12259 + }, + { + "epoch": 0.07291369302502616, + "grad_norm": 2.048220157623291, + "learning_rate": 4.93470835728569e-05, + "loss": 5.8502, + "step": 12260 + }, + { + "epoch": 0.07291964030830717, + "grad_norm": 2.037167549133301, + "learning_rate": 4.934697751436601e-05, + "loss": 5.1993, + "step": 12261 + }, + { + "epoch": 0.07292558759158817, + "grad_norm": 1.8141452074050903, + "learning_rate": 4.9346871447375854e-05, + "loss": 5.8308, + "step": 12262 + }, + { + "epoch": 0.07293153487486916, + "grad_norm": 1.7525955438613892, + "learning_rate": 4.934676537188645e-05, + "loss": 5.5946, + "step": 12263 + }, + { + "epoch": 0.07293748215815016, + "grad_norm": 1.9784163236618042, + "learning_rate": 4.9346659287897846e-05, + "loss": 5.7214, + "step": 12264 + }, + { + "epoch": 0.07294342944143116, + "grad_norm": 1.8948242664337158, + "learning_rate": 4.934655319541007e-05, + "loss": 5.7434, + "step": 12265 + }, + { + "epoch": 0.07294937672471215, + "grad_norm": 1.698625087738037, + "learning_rate": 4.934644709442317e-05, + "loss": 5.7828, + "step": 12266 + }, + { + "epoch": 0.07295532400799315, + "grad_norm": 1.6057854890823364, + "learning_rate": 4.934634098493717e-05, + "loss": 5.8815, + "step": 12267 + }, + { + "epoch": 0.07296127129127415, + "grad_norm": 1.4753777980804443, + "learning_rate": 4.9346234866952125e-05, + "loss": 5.8368, + "step": 12268 + }, + { + "epoch": 0.07296721857455514, + "grad_norm": 1.8265280723571777, + "learning_rate": 4.9346128740468046e-05, + "loss": 5.7511, + "step": 12269 + }, + { + "epoch": 0.07297316585783614, + "grad_norm": 1.7212530374526978, + "learning_rate": 4.9346022605485e-05, + "loss": 5.6741, + "step": 12270 + }, + { + "epoch": 0.07297911314111714, + "grad_norm": 1.8423148393630981, + "learning_rate": 4.9345916462002996e-05, + "loss": 5.5199, + "step": 12271 + }, + { + "epoch": 0.07298506042439813, + "grad_norm": 1.7754487991333008, + "learning_rate": 4.934581031002209e-05, + "loss": 5.9655, + "step": 12272 + }, + { + "epoch": 0.07299100770767913, + "grad_norm": 1.794704794883728, + "learning_rate": 4.9345704149542313e-05, + "loss": 5.886, + "step": 12273 + }, + { + "epoch": 0.07299695499096014, + "grad_norm": 1.807165503501892, + "learning_rate": 4.93455979805637e-05, + "loss": 5.5222, + "step": 12274 + }, + { + "epoch": 0.07300290227424112, + "grad_norm": 1.6476585865020752, + "learning_rate": 4.934549180308629e-05, + "loss": 5.6588, + "step": 12275 + }, + { + "epoch": 0.07300884955752213, + "grad_norm": 1.8332840204238892, + "learning_rate": 4.9345385617110125e-05, + "loss": 5.0781, + "step": 12276 + }, + { + "epoch": 0.07301479684080311, + "grad_norm": 1.837471842765808, + "learning_rate": 4.934527942263523e-05, + "loss": 5.8881, + "step": 12277 + }, + { + "epoch": 0.07302074412408412, + "grad_norm": 1.538299798965454, + "learning_rate": 4.934517321966165e-05, + "loss": 6.0547, + "step": 12278 + }, + { + "epoch": 0.07302669140736512, + "grad_norm": 1.9346814155578613, + "learning_rate": 4.934506700818943e-05, + "loss": 5.7853, + "step": 12279 + }, + { + "epoch": 0.0730326386906461, + "grad_norm": 1.9108514785766602, + "learning_rate": 4.93449607882186e-05, + "loss": 5.8034, + "step": 12280 + }, + { + "epoch": 0.07303858597392711, + "grad_norm": 2.0216846466064453, + "learning_rate": 4.934485455974919e-05, + "loss": 5.5127, + "step": 12281 + }, + { + "epoch": 0.07304453325720811, + "grad_norm": 2.2365148067474365, + "learning_rate": 4.9344748322781244e-05, + "loss": 5.5519, + "step": 12282 + }, + { + "epoch": 0.0730504805404891, + "grad_norm": 1.872934103012085, + "learning_rate": 4.934464207731479e-05, + "loss": 5.783, + "step": 12283 + }, + { + "epoch": 0.0730564278237701, + "grad_norm": 1.944606900215149, + "learning_rate": 4.934453582334988e-05, + "loss": 5.9803, + "step": 12284 + }, + { + "epoch": 0.0730623751070511, + "grad_norm": 1.765257477760315, + "learning_rate": 4.934442956088654e-05, + "loss": 5.8434, + "step": 12285 + }, + { + "epoch": 0.07306832239033209, + "grad_norm": 1.9726130962371826, + "learning_rate": 4.934432328992482e-05, + "loss": 5.6173, + "step": 12286 + }, + { + "epoch": 0.0730742696736131, + "grad_norm": 2.0510616302490234, + "learning_rate": 4.934421701046474e-05, + "loss": 5.4661, + "step": 12287 + }, + { + "epoch": 0.0730802169568941, + "grad_norm": 1.6038832664489746, + "learning_rate": 4.934411072250635e-05, + "loss": 5.2786, + "step": 12288 + }, + { + "epoch": 0.07308616424017508, + "grad_norm": 2.0088446140289307, + "learning_rate": 4.934400442604968e-05, + "loss": 4.9999, + "step": 12289 + }, + { + "epoch": 0.07309211152345609, + "grad_norm": 1.4760913848876953, + "learning_rate": 4.934389812109477e-05, + "loss": 4.785, + "step": 12290 + }, + { + "epoch": 0.07309805880673709, + "grad_norm": 2.2036757469177246, + "learning_rate": 4.934379180764166e-05, + "loss": 5.8303, + "step": 12291 + }, + { + "epoch": 0.07310400609001808, + "grad_norm": 2.0261359214782715, + "learning_rate": 4.9343685485690385e-05, + "loss": 5.6823, + "step": 12292 + }, + { + "epoch": 0.07310995337329908, + "grad_norm": 1.7493160963058472, + "learning_rate": 4.934357915524097e-05, + "loss": 5.6144, + "step": 12293 + }, + { + "epoch": 0.07311590065658008, + "grad_norm": 1.887373685836792, + "learning_rate": 4.934347281629347e-05, + "loss": 5.9405, + "step": 12294 + }, + { + "epoch": 0.07312184793986107, + "grad_norm": 1.6655008792877197, + "learning_rate": 4.9343366468847915e-05, + "loss": 5.8376, + "step": 12295 + }, + { + "epoch": 0.07312779522314207, + "grad_norm": 1.9241079092025757, + "learning_rate": 4.9343260112904345e-05, + "loss": 5.6072, + "step": 12296 + }, + { + "epoch": 0.07313374250642307, + "grad_norm": 1.7873997688293457, + "learning_rate": 4.934315374846279e-05, + "loss": 5.539, + "step": 12297 + }, + { + "epoch": 0.07313968978970406, + "grad_norm": 1.9266597032546997, + "learning_rate": 4.9343047375523296e-05, + "loss": 5.3921, + "step": 12298 + }, + { + "epoch": 0.07314563707298506, + "grad_norm": 1.9283325672149658, + "learning_rate": 4.934294099408589e-05, + "loss": 5.2326, + "step": 12299 + }, + { + "epoch": 0.07315158435626606, + "grad_norm": 1.739047884941101, + "learning_rate": 4.934283460415062e-05, + "loss": 5.4831, + "step": 12300 + }, + { + "epoch": 0.07315753163954705, + "grad_norm": 1.6729072332382202, + "learning_rate": 4.934272820571752e-05, + "loss": 5.633, + "step": 12301 + }, + { + "epoch": 0.07316347892282805, + "grad_norm": 1.6901992559432983, + "learning_rate": 4.9342621798786616e-05, + "loss": 5.6121, + "step": 12302 + }, + { + "epoch": 0.07316942620610906, + "grad_norm": 1.8640037775039673, + "learning_rate": 4.9342515383357956e-05, + "loss": 5.6498, + "step": 12303 + }, + { + "epoch": 0.07317537348939004, + "grad_norm": 1.9629018306732178, + "learning_rate": 4.9342408959431576e-05, + "loss": 5.9364, + "step": 12304 + }, + { + "epoch": 0.07318132077267105, + "grad_norm": 1.9370427131652832, + "learning_rate": 4.934230252700752e-05, + "loss": 5.8945, + "step": 12305 + }, + { + "epoch": 0.07318726805595203, + "grad_norm": 1.6541575193405151, + "learning_rate": 4.9342196086085814e-05, + "loss": 5.5826, + "step": 12306 + }, + { + "epoch": 0.07319321533923304, + "grad_norm": 1.6640154123306274, + "learning_rate": 4.934208963666649e-05, + "loss": 5.7065, + "step": 12307 + }, + { + "epoch": 0.07319916262251404, + "grad_norm": 1.596665620803833, + "learning_rate": 4.934198317874961e-05, + "loss": 5.6764, + "step": 12308 + }, + { + "epoch": 0.07320510990579503, + "grad_norm": 1.841260552406311, + "learning_rate": 4.9341876712335176e-05, + "loss": 5.624, + "step": 12309 + }, + { + "epoch": 0.07321105718907603, + "grad_norm": 1.921162724494934, + "learning_rate": 4.9341770237423254e-05, + "loss": 5.3177, + "step": 12310 + }, + { + "epoch": 0.07321700447235703, + "grad_norm": 1.844192624092102, + "learning_rate": 4.934166375401388e-05, + "loss": 5.6236, + "step": 12311 + }, + { + "epoch": 0.07322295175563802, + "grad_norm": 1.9088208675384521, + "learning_rate": 4.934155726210707e-05, + "loss": 5.7487, + "step": 12312 + }, + { + "epoch": 0.07322889903891902, + "grad_norm": 2.1057817935943604, + "learning_rate": 4.934145076170288e-05, + "loss": 5.3372, + "step": 12313 + }, + { + "epoch": 0.07323484632220002, + "grad_norm": 1.9507678747177124, + "learning_rate": 4.9341344252801335e-05, + "loss": 5.9318, + "step": 12314 + }, + { + "epoch": 0.07324079360548101, + "grad_norm": 1.9885265827178955, + "learning_rate": 4.934123773540249e-05, + "loss": 5.7724, + "step": 12315 + }, + { + "epoch": 0.07324674088876201, + "grad_norm": 1.81960129737854, + "learning_rate": 4.934113120950636e-05, + "loss": 5.7624, + "step": 12316 + }, + { + "epoch": 0.07325268817204302, + "grad_norm": 1.7848392724990845, + "learning_rate": 4.9341024675112994e-05, + "loss": 5.8135, + "step": 12317 + }, + { + "epoch": 0.073258635455324, + "grad_norm": 1.8326808214187622, + "learning_rate": 4.9340918132222436e-05, + "loss": 5.9725, + "step": 12318 + }, + { + "epoch": 0.073264582738605, + "grad_norm": 1.731719970703125, + "learning_rate": 4.93408115808347e-05, + "loss": 5.8932, + "step": 12319 + }, + { + "epoch": 0.07327053002188601, + "grad_norm": 1.7635269165039062, + "learning_rate": 4.934070502094985e-05, + "loss": 5.4953, + "step": 12320 + }, + { + "epoch": 0.073276477305167, + "grad_norm": 1.61715829372406, + "learning_rate": 4.934059845256791e-05, + "loss": 5.4043, + "step": 12321 + }, + { + "epoch": 0.073282424588448, + "grad_norm": 1.9188543558120728, + "learning_rate": 4.9340491875688914e-05, + "loss": 5.2762, + "step": 12322 + }, + { + "epoch": 0.073288371871729, + "grad_norm": 2.098680019378662, + "learning_rate": 4.9340385290312904e-05, + "loss": 5.4673, + "step": 12323 + }, + { + "epoch": 0.07329431915500999, + "grad_norm": 2.15560245513916, + "learning_rate": 4.934027869643992e-05, + "loss": 5.9124, + "step": 12324 + }, + { + "epoch": 0.07330026643829099, + "grad_norm": 1.9819902181625366, + "learning_rate": 4.934017209407e-05, + "loss": 5.5686, + "step": 12325 + }, + { + "epoch": 0.07330621372157199, + "grad_norm": 2.517003059387207, + "learning_rate": 4.934006548320317e-05, + "loss": 3.9751, + "step": 12326 + }, + { + "epoch": 0.07331216100485298, + "grad_norm": 2.458714723587036, + "learning_rate": 4.9339958863839474e-05, + "loss": 3.7976, + "step": 12327 + }, + { + "epoch": 0.07331810828813398, + "grad_norm": 2.2642102241516113, + "learning_rate": 4.9339852235978955e-05, + "loss": 3.8853, + "step": 12328 + }, + { + "epoch": 0.07332405557141498, + "grad_norm": 2.3097565174102783, + "learning_rate": 4.9339745599621645e-05, + "loss": 3.5699, + "step": 12329 + }, + { + "epoch": 0.07333000285469597, + "grad_norm": 2.312995195388794, + "learning_rate": 4.933963895476758e-05, + "loss": 3.8338, + "step": 12330 + }, + { + "epoch": 0.07333595013797697, + "grad_norm": 2.69657826423645, + "learning_rate": 4.93395323014168e-05, + "loss": 5.3459, + "step": 12331 + }, + { + "epoch": 0.07334189742125798, + "grad_norm": 2.263038396835327, + "learning_rate": 4.9339425639569336e-05, + "loss": 5.712, + "step": 12332 + }, + { + "epoch": 0.07334784470453896, + "grad_norm": 1.9429599046707153, + "learning_rate": 4.9339318969225235e-05, + "loss": 5.7465, + "step": 12333 + }, + { + "epoch": 0.07335379198781997, + "grad_norm": 2.07045841217041, + "learning_rate": 4.933921229038453e-05, + "loss": 5.6726, + "step": 12334 + }, + { + "epoch": 0.07335973927110095, + "grad_norm": 2.0304102897644043, + "learning_rate": 4.933910560304725e-05, + "loss": 5.8084, + "step": 12335 + }, + { + "epoch": 0.07336568655438196, + "grad_norm": 1.8316701650619507, + "learning_rate": 4.933899890721344e-05, + "loss": 5.3852, + "step": 12336 + }, + { + "epoch": 0.07337163383766296, + "grad_norm": 2.1406614780426025, + "learning_rate": 4.933889220288315e-05, + "loss": 5.1097, + "step": 12337 + }, + { + "epoch": 0.07337758112094395, + "grad_norm": 1.7518030405044556, + "learning_rate": 4.9338785490056395e-05, + "loss": 5.2038, + "step": 12338 + }, + { + "epoch": 0.07338352840422495, + "grad_norm": 1.8387973308563232, + "learning_rate": 4.933867876873322e-05, + "loss": 5.0847, + "step": 12339 + }, + { + "epoch": 0.07338947568750595, + "grad_norm": 1.692947506904602, + "learning_rate": 4.933857203891367e-05, + "loss": 5.6124, + "step": 12340 + }, + { + "epoch": 0.07339542297078694, + "grad_norm": 1.6367069482803345, + "learning_rate": 4.933846530059776e-05, + "loss": 5.7119, + "step": 12341 + }, + { + "epoch": 0.07340137025406794, + "grad_norm": 2.0395610332489014, + "learning_rate": 4.933835855378556e-05, + "loss": 5.4164, + "step": 12342 + }, + { + "epoch": 0.07340731753734894, + "grad_norm": 2.074073314666748, + "learning_rate": 4.933825179847709e-05, + "loss": 5.3952, + "step": 12343 + }, + { + "epoch": 0.07341326482062993, + "grad_norm": 2.2825684547424316, + "learning_rate": 4.9338145034672376e-05, + "loss": 5.4019, + "step": 12344 + }, + { + "epoch": 0.07341921210391093, + "grad_norm": 2.006591796875, + "learning_rate": 4.9338038262371476e-05, + "loss": 5.4422, + "step": 12345 + }, + { + "epoch": 0.07342515938719194, + "grad_norm": 2.10418701171875, + "learning_rate": 4.9337931481574415e-05, + "loss": 5.3801, + "step": 12346 + }, + { + "epoch": 0.07343110667047292, + "grad_norm": 1.9998257160186768, + "learning_rate": 4.9337824692281233e-05, + "loss": 5.1673, + "step": 12347 + }, + { + "epoch": 0.07343705395375393, + "grad_norm": 2.175896644592285, + "learning_rate": 4.933771789449197e-05, + "loss": 5.118, + "step": 12348 + }, + { + "epoch": 0.07344300123703493, + "grad_norm": 2.075164318084717, + "learning_rate": 4.933761108820666e-05, + "loss": 5.1662, + "step": 12349 + }, + { + "epoch": 0.07344894852031592, + "grad_norm": 2.0672569274902344, + "learning_rate": 4.933750427342534e-05, + "loss": 5.0957, + "step": 12350 + }, + { + "epoch": 0.07345489580359692, + "grad_norm": 2.0570287704467773, + "learning_rate": 4.9337397450148055e-05, + "loss": 5.2772, + "step": 12351 + }, + { + "epoch": 0.07346084308687792, + "grad_norm": 2.0653116703033447, + "learning_rate": 4.933729061837483e-05, + "loss": 5.4755, + "step": 12352 + }, + { + "epoch": 0.07346679037015891, + "grad_norm": 2.832578420639038, + "learning_rate": 4.933718377810571e-05, + "loss": 4.8128, + "step": 12353 + }, + { + "epoch": 0.07347273765343991, + "grad_norm": 2.378556251525879, + "learning_rate": 4.933707692934073e-05, + "loss": 5.109, + "step": 12354 + }, + { + "epoch": 0.07347868493672091, + "grad_norm": 2.1819205284118652, + "learning_rate": 4.933697007207993e-05, + "loss": 4.8603, + "step": 12355 + }, + { + "epoch": 0.0734846322200019, + "grad_norm": 2.104738473892212, + "learning_rate": 4.9336863206323345e-05, + "loss": 4.7806, + "step": 12356 + }, + { + "epoch": 0.0734905795032829, + "grad_norm": 1.8287266492843628, + "learning_rate": 4.933675633207101e-05, + "loss": 4.7082, + "step": 12357 + }, + { + "epoch": 0.0734965267865639, + "grad_norm": 2.0478014945983887, + "learning_rate": 4.933664944932297e-05, + "loss": 4.6145, + "step": 12358 + }, + { + "epoch": 0.07350247406984489, + "grad_norm": 2.208263397216797, + "learning_rate": 4.9336542558079244e-05, + "loss": 4.7523, + "step": 12359 + }, + { + "epoch": 0.0735084213531259, + "grad_norm": 2.1506083011627197, + "learning_rate": 4.93364356583399e-05, + "loss": 4.7444, + "step": 12360 + }, + { + "epoch": 0.0735143686364069, + "grad_norm": 2.04584002494812, + "learning_rate": 4.933632875010494e-05, + "loss": 4.6706, + "step": 12361 + }, + { + "epoch": 0.07352031591968788, + "grad_norm": 1.8598030805587769, + "learning_rate": 4.933622183337443e-05, + "loss": 4.6404, + "step": 12362 + }, + { + "epoch": 0.07352626320296889, + "grad_norm": 2.5650441646575928, + "learning_rate": 4.93361149081484e-05, + "loss": 5.382, + "step": 12363 + }, + { + "epoch": 0.07353221048624987, + "grad_norm": 2.1182446479797363, + "learning_rate": 4.933600797442688e-05, + "loss": 5.9041, + "step": 12364 + }, + { + "epoch": 0.07353815776953088, + "grad_norm": 1.8753353357315063, + "learning_rate": 4.933590103220991e-05, + "loss": 5.6615, + "step": 12365 + }, + { + "epoch": 0.07354410505281188, + "grad_norm": 1.9428893327713013, + "learning_rate": 4.933579408149752e-05, + "loss": 5.3549, + "step": 12366 + }, + { + "epoch": 0.07355005233609287, + "grad_norm": 1.809191346168518, + "learning_rate": 4.9335687122289766e-05, + "loss": 5.5603, + "step": 12367 + }, + { + "epoch": 0.07355599961937387, + "grad_norm": 1.7782649993896484, + "learning_rate": 4.933558015458667e-05, + "loss": 5.2848, + "step": 12368 + }, + { + "epoch": 0.07356194690265487, + "grad_norm": 1.71909499168396, + "learning_rate": 4.933547317838828e-05, + "loss": 5.3774, + "step": 12369 + }, + { + "epoch": 0.07356789418593586, + "grad_norm": 1.6399723291397095, + "learning_rate": 4.9335366193694625e-05, + "loss": 5.629, + "step": 12370 + }, + { + "epoch": 0.07357384146921686, + "grad_norm": 1.8646855354309082, + "learning_rate": 4.9335259200505746e-05, + "loss": 5.6297, + "step": 12371 + }, + { + "epoch": 0.07357978875249786, + "grad_norm": 1.5271104574203491, + "learning_rate": 4.9335152198821676e-05, + "loss": 5.6112, + "step": 12372 + }, + { + "epoch": 0.07358573603577885, + "grad_norm": 1.6217905282974243, + "learning_rate": 4.933504518864246e-05, + "loss": 5.2959, + "step": 12373 + }, + { + "epoch": 0.07359168331905985, + "grad_norm": 1.5774266719818115, + "learning_rate": 4.933493816996812e-05, + "loss": 5.4181, + "step": 12374 + }, + { + "epoch": 0.07359763060234085, + "grad_norm": 1.3641432523727417, + "learning_rate": 4.933483114279872e-05, + "loss": 5.3903, + "step": 12375 + }, + { + "epoch": 0.07360357788562184, + "grad_norm": 1.67635178565979, + "learning_rate": 4.933472410713428e-05, + "loss": 5.6771, + "step": 12376 + }, + { + "epoch": 0.07360952516890285, + "grad_norm": 1.6944624185562134, + "learning_rate": 4.933461706297483e-05, + "loss": 5.6008, + "step": 12377 + }, + { + "epoch": 0.07361547245218385, + "grad_norm": 1.3603699207305908, + "learning_rate": 4.933451001032042e-05, + "loss": 5.5396, + "step": 12378 + }, + { + "epoch": 0.07362141973546484, + "grad_norm": 1.6585369110107422, + "learning_rate": 4.9334402949171086e-05, + "loss": 5.5697, + "step": 12379 + }, + { + "epoch": 0.07362736701874584, + "grad_norm": 1.503786563873291, + "learning_rate": 4.9334295879526865e-05, + "loss": 5.4539, + "step": 12380 + }, + { + "epoch": 0.07363331430202684, + "grad_norm": 1.4761176109313965, + "learning_rate": 4.933418880138779e-05, + "loss": 5.4573, + "step": 12381 + }, + { + "epoch": 0.07363926158530783, + "grad_norm": 1.671972393989563, + "learning_rate": 4.93340817147539e-05, + "loss": 5.4143, + "step": 12382 + }, + { + "epoch": 0.07364520886858883, + "grad_norm": 1.5486379861831665, + "learning_rate": 4.9333974619625236e-05, + "loss": 5.4134, + "step": 12383 + }, + { + "epoch": 0.07365115615186983, + "grad_norm": 1.340108036994934, + "learning_rate": 4.933386751600183e-05, + "loss": 5.4587, + "step": 12384 + }, + { + "epoch": 0.07365710343515082, + "grad_norm": 1.3910952806472778, + "learning_rate": 4.933376040388372e-05, + "loss": 5.4129, + "step": 12385 + }, + { + "epoch": 0.07366305071843182, + "grad_norm": 1.5878056287765503, + "learning_rate": 4.9333653283270955e-05, + "loss": 5.3633, + "step": 12386 + }, + { + "epoch": 0.07366899800171282, + "grad_norm": 1.6040968894958496, + "learning_rate": 4.933354615416356e-05, + "loss": 5.2486, + "step": 12387 + }, + { + "epoch": 0.07367494528499381, + "grad_norm": 1.4824137687683105, + "learning_rate": 4.933343901656157e-05, + "loss": 5.2947, + "step": 12388 + }, + { + "epoch": 0.07368089256827481, + "grad_norm": 1.6114120483398438, + "learning_rate": 4.933333187046503e-05, + "loss": 5.2948, + "step": 12389 + }, + { + "epoch": 0.07368683985155582, + "grad_norm": 1.4269661903381348, + "learning_rate": 4.933322471587398e-05, + "loss": 5.1633, + "step": 12390 + }, + { + "epoch": 0.0736927871348368, + "grad_norm": 1.430588960647583, + "learning_rate": 4.933311755278844e-05, + "loss": 5.2846, + "step": 12391 + }, + { + "epoch": 0.0736987344181178, + "grad_norm": 1.3490641117095947, + "learning_rate": 4.9333010381208476e-05, + "loss": 5.2067, + "step": 12392 + }, + { + "epoch": 0.0737046817013988, + "grad_norm": 1.9292722940444946, + "learning_rate": 4.9332903201134104e-05, + "loss": 5.6196, + "step": 12393 + }, + { + "epoch": 0.0737106289846798, + "grad_norm": 1.8885586261749268, + "learning_rate": 4.933279601256536e-05, + "loss": 5.5225, + "step": 12394 + }, + { + "epoch": 0.0737165762679608, + "grad_norm": 1.5985313653945923, + "learning_rate": 4.93326888155023e-05, + "loss": 5.7447, + "step": 12395 + }, + { + "epoch": 0.07372252355124179, + "grad_norm": 2.819392681121826, + "learning_rate": 4.933258160994494e-05, + "loss": 6.002, + "step": 12396 + }, + { + "epoch": 0.07372847083452279, + "grad_norm": 2.006615161895752, + "learning_rate": 4.933247439589333e-05, + "loss": 5.7733, + "step": 12397 + }, + { + "epoch": 0.07373441811780379, + "grad_norm": 1.628408432006836, + "learning_rate": 4.933236717334751e-05, + "loss": 5.3899, + "step": 12398 + }, + { + "epoch": 0.07374036540108478, + "grad_norm": 1.5265247821807861, + "learning_rate": 4.93322599423075e-05, + "loss": 5.3891, + "step": 12399 + }, + { + "epoch": 0.07374631268436578, + "grad_norm": 1.6663800477981567, + "learning_rate": 4.933215270277336e-05, + "loss": 5.6172, + "step": 12400 + }, + { + "epoch": 0.07375225996764678, + "grad_norm": 1.7699551582336426, + "learning_rate": 4.933204545474511e-05, + "loss": 5.7088, + "step": 12401 + }, + { + "epoch": 0.07375820725092777, + "grad_norm": 1.5542314052581787, + "learning_rate": 4.93319381982228e-05, + "loss": 5.5925, + "step": 12402 + }, + { + "epoch": 0.07376415453420877, + "grad_norm": 1.5389710664749146, + "learning_rate": 4.933183093320646e-05, + "loss": 5.572, + "step": 12403 + }, + { + "epoch": 0.07377010181748977, + "grad_norm": 1.381242275238037, + "learning_rate": 4.9331723659696124e-05, + "loss": 5.4964, + "step": 12404 + }, + { + "epoch": 0.07377604910077076, + "grad_norm": 1.5536670684814453, + "learning_rate": 4.933161637769184e-05, + "loss": 5.3748, + "step": 12405 + }, + { + "epoch": 0.07378199638405177, + "grad_norm": 1.6656473875045776, + "learning_rate": 4.933150908719364e-05, + "loss": 5.3267, + "step": 12406 + }, + { + "epoch": 0.07378794366733277, + "grad_norm": 1.9200701713562012, + "learning_rate": 4.933140178820156e-05, + "loss": 5.2928, + "step": 12407 + }, + { + "epoch": 0.07379389095061376, + "grad_norm": 1.6290313005447388, + "learning_rate": 4.933129448071564e-05, + "loss": 5.4969, + "step": 12408 + }, + { + "epoch": 0.07379983823389476, + "grad_norm": 1.7247267961502075, + "learning_rate": 4.933118716473592e-05, + "loss": 5.564, + "step": 12409 + }, + { + "epoch": 0.07380578551717576, + "grad_norm": 1.4726417064666748, + "learning_rate": 4.933107984026243e-05, + "loss": 5.1759, + "step": 12410 + }, + { + "epoch": 0.07381173280045675, + "grad_norm": 1.4726674556732178, + "learning_rate": 4.933097250729522e-05, + "loss": 5.1731, + "step": 12411 + }, + { + "epoch": 0.07381768008373775, + "grad_norm": 1.4694938659667969, + "learning_rate": 4.93308651658343e-05, + "loss": 5.4539, + "step": 12412 + }, + { + "epoch": 0.07382362736701875, + "grad_norm": 1.5212653875350952, + "learning_rate": 4.9330757815879734e-05, + "loss": 5.5035, + "step": 12413 + }, + { + "epoch": 0.07382957465029974, + "grad_norm": 1.3731454610824585, + "learning_rate": 4.933065045743156e-05, + "loss": 5.415, + "step": 12414 + }, + { + "epoch": 0.07383552193358074, + "grad_norm": 1.5576610565185547, + "learning_rate": 4.93305430904898e-05, + "loss": 5.2776, + "step": 12415 + }, + { + "epoch": 0.07384146921686174, + "grad_norm": 1.72965407371521, + "learning_rate": 4.93304357150545e-05, + "loss": 5.3598, + "step": 12416 + }, + { + "epoch": 0.07384741650014273, + "grad_norm": 1.5218521356582642, + "learning_rate": 4.93303283311257e-05, + "loss": 5.295, + "step": 12417 + }, + { + "epoch": 0.07385336378342373, + "grad_norm": 1.5174230337142944, + "learning_rate": 4.933022093870343e-05, + "loss": 5.3506, + "step": 12418 + }, + { + "epoch": 0.07385931106670474, + "grad_norm": 1.3844187259674072, + "learning_rate": 4.933011353778773e-05, + "loss": 5.4345, + "step": 12419 + }, + { + "epoch": 0.07386525834998572, + "grad_norm": 1.5130188465118408, + "learning_rate": 4.9330006128378645e-05, + "loss": 5.4359, + "step": 12420 + }, + { + "epoch": 0.07387120563326673, + "grad_norm": 1.599004864692688, + "learning_rate": 4.93298987104762e-05, + "loss": 5.1631, + "step": 12421 + }, + { + "epoch": 0.07387715291654771, + "grad_norm": 1.6220343112945557, + "learning_rate": 4.932979128408044e-05, + "loss": 5.1244, + "step": 12422 + }, + { + "epoch": 0.07388310019982872, + "grad_norm": 1.5366616249084473, + "learning_rate": 4.93296838491914e-05, + "loss": 5.0368, + "step": 12423 + }, + { + "epoch": 0.07388904748310972, + "grad_norm": 1.5800726413726807, + "learning_rate": 4.932957640580912e-05, + "loss": 4.9906, + "step": 12424 + }, + { + "epoch": 0.0738949947663907, + "grad_norm": 1.6035537719726562, + "learning_rate": 4.9329468953933637e-05, + "loss": 5.0616, + "step": 12425 + }, + { + "epoch": 0.07390094204967171, + "grad_norm": 1.580127239227295, + "learning_rate": 4.932936149356499e-05, + "loss": 5.145, + "step": 12426 + }, + { + "epoch": 0.07390688933295271, + "grad_norm": 1.724788784980774, + "learning_rate": 4.932925402470321e-05, + "loss": 4.9589, + "step": 12427 + }, + { + "epoch": 0.0739128366162337, + "grad_norm": 1.5442367792129517, + "learning_rate": 4.932914654734834e-05, + "loss": 5.077, + "step": 12428 + }, + { + "epoch": 0.0739187838995147, + "grad_norm": 1.3692456483840942, + "learning_rate": 4.932903906150042e-05, + "loss": 5.1778, + "step": 12429 + }, + { + "epoch": 0.0739247311827957, + "grad_norm": 1.8229175806045532, + "learning_rate": 4.932893156715948e-05, + "loss": 5.4053, + "step": 12430 + }, + { + "epoch": 0.07393067846607669, + "grad_norm": 1.7769286632537842, + "learning_rate": 4.9328824064325566e-05, + "loss": 5.2541, + "step": 12431 + }, + { + "epoch": 0.07393662574935769, + "grad_norm": 1.7022631168365479, + "learning_rate": 4.93287165529987e-05, + "loss": 4.8555, + "step": 12432 + }, + { + "epoch": 0.0739425730326387, + "grad_norm": 1.5031015872955322, + "learning_rate": 4.932860903317894e-05, + "loss": 5.019, + "step": 12433 + }, + { + "epoch": 0.07394852031591968, + "grad_norm": 1.352550983428955, + "learning_rate": 4.932850150486631e-05, + "loss": 5.239, + "step": 12434 + }, + { + "epoch": 0.07395446759920069, + "grad_norm": 1.5571177005767822, + "learning_rate": 4.932839396806085e-05, + "loss": 5.2511, + "step": 12435 + }, + { + "epoch": 0.07396041488248169, + "grad_norm": 1.7673511505126953, + "learning_rate": 4.93282864227626e-05, + "loss": 5.1811, + "step": 12436 + }, + { + "epoch": 0.07396636216576268, + "grad_norm": 1.6385267972946167, + "learning_rate": 4.932817886897161e-05, + "loss": 5.1644, + "step": 12437 + }, + { + "epoch": 0.07397230944904368, + "grad_norm": 1.6142395734786987, + "learning_rate": 4.932807130668788e-05, + "loss": 5.173, + "step": 12438 + }, + { + "epoch": 0.07397825673232468, + "grad_norm": 1.6966745853424072, + "learning_rate": 4.932796373591149e-05, + "loss": 5.1495, + "step": 12439 + }, + { + "epoch": 0.07398420401560567, + "grad_norm": 1.6631567478179932, + "learning_rate": 4.932785615664245e-05, + "loss": 5.1787, + "step": 12440 + }, + { + "epoch": 0.07399015129888667, + "grad_norm": 1.7747845649719238, + "learning_rate": 4.9327748568880816e-05, + "loss": 5.1303, + "step": 12441 + }, + { + "epoch": 0.07399609858216767, + "grad_norm": 1.457535982131958, + "learning_rate": 4.932764097262661e-05, + "loss": 5.1573, + "step": 12442 + }, + { + "epoch": 0.07400204586544866, + "grad_norm": 1.602452039718628, + "learning_rate": 4.9327533367879875e-05, + "loss": 5.1039, + "step": 12443 + }, + { + "epoch": 0.07400799314872966, + "grad_norm": 1.644687294960022, + "learning_rate": 4.932742575464065e-05, + "loss": 5.3112, + "step": 12444 + }, + { + "epoch": 0.07401394043201066, + "grad_norm": 1.5873420238494873, + "learning_rate": 4.932731813290897e-05, + "loss": 5.1128, + "step": 12445 + }, + { + "epoch": 0.07401988771529165, + "grad_norm": 1.8046668767929077, + "learning_rate": 4.932721050268489e-05, + "loss": 4.9776, + "step": 12446 + }, + { + "epoch": 0.07402583499857265, + "grad_norm": 1.6964846849441528, + "learning_rate": 4.932710286396841e-05, + "loss": 5.0039, + "step": 12447 + }, + { + "epoch": 0.07403178228185366, + "grad_norm": 1.5332229137420654, + "learning_rate": 4.93269952167596e-05, + "loss": 4.9873, + "step": 12448 + }, + { + "epoch": 0.07403772956513464, + "grad_norm": 1.6128625869750977, + "learning_rate": 4.9326887561058485e-05, + "loss": 5.1139, + "step": 12449 + }, + { + "epoch": 0.07404367684841565, + "grad_norm": 1.5800291299819946, + "learning_rate": 4.932677989686511e-05, + "loss": 4.9687, + "step": 12450 + }, + { + "epoch": 0.07404962413169663, + "grad_norm": 1.6543092727661133, + "learning_rate": 4.932667222417951e-05, + "loss": 4.8345, + "step": 12451 + }, + { + "epoch": 0.07405557141497764, + "grad_norm": 1.4438380002975464, + "learning_rate": 4.932656454300171e-05, + "loss": 4.9677, + "step": 12452 + }, + { + "epoch": 0.07406151869825864, + "grad_norm": 1.6437597274780273, + "learning_rate": 4.932645685333176e-05, + "loss": 4.9016, + "step": 12453 + }, + { + "epoch": 0.07406746598153963, + "grad_norm": 1.5359379053115845, + "learning_rate": 4.932634915516969e-05, + "loss": 4.8357, + "step": 12454 + }, + { + "epoch": 0.07407341326482063, + "grad_norm": 1.6683440208435059, + "learning_rate": 4.9326241448515554e-05, + "loss": 4.8715, + "step": 12455 + }, + { + "epoch": 0.07407936054810163, + "grad_norm": 1.5654494762420654, + "learning_rate": 4.932613373336937e-05, + "loss": 4.8993, + "step": 12456 + }, + { + "epoch": 0.07408530783138262, + "grad_norm": 1.5333384275436401, + "learning_rate": 4.932602600973119e-05, + "loss": 4.9181, + "step": 12457 + }, + { + "epoch": 0.07409125511466362, + "grad_norm": 1.5674177408218384, + "learning_rate": 4.9325918277601046e-05, + "loss": 4.905, + "step": 12458 + }, + { + "epoch": 0.07409720239794462, + "grad_norm": 1.410294771194458, + "learning_rate": 4.9325810536978965e-05, + "loss": 4.8645, + "step": 12459 + }, + { + "epoch": 0.07410314968122561, + "grad_norm": 1.4950916767120361, + "learning_rate": 4.9325702787865006e-05, + "loss": 4.8289, + "step": 12460 + }, + { + "epoch": 0.07410909696450661, + "grad_norm": 1.7529935836791992, + "learning_rate": 4.9325595030259195e-05, + "loss": 4.8917, + "step": 12461 + }, + { + "epoch": 0.07411504424778761, + "grad_norm": 3.5575430393218994, + "learning_rate": 4.932548726416157e-05, + "loss": 5.5795, + "step": 12462 + }, + { + "epoch": 0.0741209915310686, + "grad_norm": 1.5091896057128906, + "learning_rate": 4.9325379489572165e-05, + "loss": 4.9864, + "step": 12463 + }, + { + "epoch": 0.0741269388143496, + "grad_norm": 1.6818382740020752, + "learning_rate": 4.932527170649102e-05, + "loss": 5.3386, + "step": 12464 + }, + { + "epoch": 0.07413288609763061, + "grad_norm": 1.7938569784164429, + "learning_rate": 4.932516391491818e-05, + "loss": 5.2668, + "step": 12465 + }, + { + "epoch": 0.0741388333809116, + "grad_norm": 1.89009428024292, + "learning_rate": 4.932505611485367e-05, + "loss": 5.1755, + "step": 12466 + }, + { + "epoch": 0.0741447806641926, + "grad_norm": 1.5277502536773682, + "learning_rate": 4.932494830629753e-05, + "loss": 5.3271, + "step": 12467 + }, + { + "epoch": 0.0741507279474736, + "grad_norm": 1.7720823287963867, + "learning_rate": 4.932484048924981e-05, + "loss": 5.7089, + "step": 12468 + }, + { + "epoch": 0.07415667523075459, + "grad_norm": 1.6797159910202026, + "learning_rate": 4.932473266371054e-05, + "loss": 5.5563, + "step": 12469 + }, + { + "epoch": 0.07416262251403559, + "grad_norm": 1.6536195278167725, + "learning_rate": 4.932462482967976e-05, + "loss": 5.4271, + "step": 12470 + }, + { + "epoch": 0.07416856979731659, + "grad_norm": 1.5667130947113037, + "learning_rate": 4.93245169871575e-05, + "loss": 5.3703, + "step": 12471 + }, + { + "epoch": 0.07417451708059758, + "grad_norm": 1.3659738302230835, + "learning_rate": 4.93244091361438e-05, + "loss": 5.4114, + "step": 12472 + }, + { + "epoch": 0.07418046436387858, + "grad_norm": 1.5106414556503296, + "learning_rate": 4.9324301276638705e-05, + "loss": 5.386, + "step": 12473 + }, + { + "epoch": 0.07418641164715958, + "grad_norm": 1.5054755210876465, + "learning_rate": 4.932419340864225e-05, + "loss": 5.3067, + "step": 12474 + }, + { + "epoch": 0.07419235893044057, + "grad_norm": 1.4413330554962158, + "learning_rate": 4.932408553215446e-05, + "loss": 5.358, + "step": 12475 + }, + { + "epoch": 0.07419830621372157, + "grad_norm": 1.3034652471542358, + "learning_rate": 4.932397764717539e-05, + "loss": 5.2942, + "step": 12476 + }, + { + "epoch": 0.07420425349700258, + "grad_norm": 1.494664192199707, + "learning_rate": 4.9323869753705074e-05, + "loss": 5.4243, + "step": 12477 + }, + { + "epoch": 0.07421020078028356, + "grad_norm": 1.2644178867340088, + "learning_rate": 4.932376185174354e-05, + "loss": 5.2212, + "step": 12478 + }, + { + "epoch": 0.07421614806356457, + "grad_norm": 1.5576590299606323, + "learning_rate": 4.9323653941290836e-05, + "loss": 5.2077, + "step": 12479 + }, + { + "epoch": 0.07422209534684555, + "grad_norm": 1.5699479579925537, + "learning_rate": 4.932354602234699e-05, + "loss": 5.3849, + "step": 12480 + }, + { + "epoch": 0.07422804263012656, + "grad_norm": 1.6582329273223877, + "learning_rate": 4.932343809491205e-05, + "loss": 5.3961, + "step": 12481 + }, + { + "epoch": 0.07423398991340756, + "grad_norm": 1.6159483194351196, + "learning_rate": 4.932333015898605e-05, + "loss": 5.3711, + "step": 12482 + }, + { + "epoch": 0.07423993719668855, + "grad_norm": 1.453933596611023, + "learning_rate": 4.932322221456902e-05, + "loss": 5.2899, + "step": 12483 + }, + { + "epoch": 0.07424588447996955, + "grad_norm": 1.3830047845840454, + "learning_rate": 4.9323114261661014e-05, + "loss": 5.3839, + "step": 12484 + }, + { + "epoch": 0.07425183176325055, + "grad_norm": 1.5541338920593262, + "learning_rate": 4.932300630026205e-05, + "loss": 5.257, + "step": 12485 + }, + { + "epoch": 0.07425777904653154, + "grad_norm": 1.5887267589569092, + "learning_rate": 4.932289833037219e-05, + "loss": 5.2079, + "step": 12486 + }, + { + "epoch": 0.07426372632981254, + "grad_norm": 1.6341818571090698, + "learning_rate": 4.932279035199144e-05, + "loss": 5.2529, + "step": 12487 + }, + { + "epoch": 0.07426967361309354, + "grad_norm": 1.5520392656326294, + "learning_rate": 4.9322682365119866e-05, + "loss": 5.2416, + "step": 12488 + }, + { + "epoch": 0.07427562089637453, + "grad_norm": 1.610711693763733, + "learning_rate": 4.93225743697575e-05, + "loss": 5.3172, + "step": 12489 + }, + { + "epoch": 0.07428156817965553, + "grad_norm": 1.5997258424758911, + "learning_rate": 4.932246636590436e-05, + "loss": 5.2343, + "step": 12490 + }, + { + "epoch": 0.07428751546293653, + "grad_norm": 1.5319284200668335, + "learning_rate": 4.932235835356051e-05, + "loss": 5.2021, + "step": 12491 + }, + { + "epoch": 0.07429346274621752, + "grad_norm": 1.6516488790512085, + "learning_rate": 4.932225033272597e-05, + "loss": 5.2678, + "step": 12492 + }, + { + "epoch": 0.07429941002949852, + "grad_norm": 1.9008166790008545, + "learning_rate": 4.9322142303400786e-05, + "loss": 5.1424, + "step": 12493 + }, + { + "epoch": 0.07430535731277953, + "grad_norm": 1.8372108936309814, + "learning_rate": 4.932203426558499e-05, + "loss": 5.321, + "step": 12494 + }, + { + "epoch": 0.07431130459606052, + "grad_norm": 1.4764071702957153, + "learning_rate": 4.932192621927863e-05, + "loss": 5.3627, + "step": 12495 + }, + { + "epoch": 0.07431725187934152, + "grad_norm": 1.6356589794158936, + "learning_rate": 4.932181816448173e-05, + "loss": 5.2061, + "step": 12496 + }, + { + "epoch": 0.07432319916262252, + "grad_norm": 1.6335545778274536, + "learning_rate": 4.932171010119434e-05, + "loss": 5.2283, + "step": 12497 + }, + { + "epoch": 0.07432914644590351, + "grad_norm": 1.499968409538269, + "learning_rate": 4.932160202941649e-05, + "loss": 5.4862, + "step": 12498 + }, + { + "epoch": 0.07433509372918451, + "grad_norm": 1.7292691469192505, + "learning_rate": 4.932149394914822e-05, + "loss": 5.4055, + "step": 12499 + }, + { + "epoch": 0.07434104101246551, + "grad_norm": 1.6818633079528809, + "learning_rate": 4.932138586038957e-05, + "loss": 5.5262, + "step": 12500 + }, + { + "epoch": 0.0743469882957465, + "grad_norm": 1.4048001766204834, + "learning_rate": 4.932127776314057e-05, + "loss": 5.1876, + "step": 12501 + }, + { + "epoch": 0.0743529355790275, + "grad_norm": 1.6041479110717773, + "learning_rate": 4.9321169657401264e-05, + "loss": 5.0791, + "step": 12502 + }, + { + "epoch": 0.0743588828623085, + "grad_norm": 1.3542897701263428, + "learning_rate": 4.932106154317169e-05, + "loss": 5.189, + "step": 12503 + }, + { + "epoch": 0.07436483014558949, + "grad_norm": 1.7782005071640015, + "learning_rate": 4.932095342045189e-05, + "loss": 5.2823, + "step": 12504 + }, + { + "epoch": 0.0743707774288705, + "grad_norm": 1.5981978178024292, + "learning_rate": 4.932084528924189e-05, + "loss": 5.3978, + "step": 12505 + }, + { + "epoch": 0.0743767247121515, + "grad_norm": 1.5224134922027588, + "learning_rate": 4.9320737149541734e-05, + "loss": 5.336, + "step": 12506 + }, + { + "epoch": 0.07438267199543248, + "grad_norm": 1.4827311038970947, + "learning_rate": 4.932062900135147e-05, + "loss": 5.2284, + "step": 12507 + }, + { + "epoch": 0.07438861927871349, + "grad_norm": 1.4394789934158325, + "learning_rate": 4.932052084467111e-05, + "loss": 5.1672, + "step": 12508 + }, + { + "epoch": 0.07439456656199447, + "grad_norm": 1.5112950801849365, + "learning_rate": 4.9320412679500715e-05, + "loss": 5.4069, + "step": 12509 + }, + { + "epoch": 0.07440051384527548, + "grad_norm": 1.4547615051269531, + "learning_rate": 4.932030450584032e-05, + "loss": 5.3317, + "step": 12510 + }, + { + "epoch": 0.07440646112855648, + "grad_norm": 1.5839279890060425, + "learning_rate": 4.9320196323689946e-05, + "loss": 5.2042, + "step": 12511 + }, + { + "epoch": 0.07441240841183747, + "grad_norm": 1.6392362117767334, + "learning_rate": 4.9320088133049655e-05, + "loss": 5.2595, + "step": 12512 + }, + { + "epoch": 0.07441835569511847, + "grad_norm": 1.530236840248108, + "learning_rate": 4.931997993391947e-05, + "loss": 5.4417, + "step": 12513 + }, + { + "epoch": 0.07442430297839947, + "grad_norm": 1.7665959596633911, + "learning_rate": 4.931987172629943e-05, + "loss": 5.5164, + "step": 12514 + }, + { + "epoch": 0.07443025026168046, + "grad_norm": 1.5256375074386597, + "learning_rate": 4.931976351018957e-05, + "loss": 5.3645, + "step": 12515 + }, + { + "epoch": 0.07443619754496146, + "grad_norm": 1.5948551893234253, + "learning_rate": 4.9319655285589937e-05, + "loss": 5.1964, + "step": 12516 + }, + { + "epoch": 0.07444214482824246, + "grad_norm": 1.451249361038208, + "learning_rate": 4.931954705250056e-05, + "loss": 5.3043, + "step": 12517 + }, + { + "epoch": 0.07444809211152345, + "grad_norm": 1.5874381065368652, + "learning_rate": 4.931943881092148e-05, + "loss": 5.3769, + "step": 12518 + }, + { + "epoch": 0.07445403939480445, + "grad_norm": 1.597102165222168, + "learning_rate": 4.931933056085274e-05, + "loss": 5.2909, + "step": 12519 + }, + { + "epoch": 0.07445998667808545, + "grad_norm": 1.3787156343460083, + "learning_rate": 4.9319222302294364e-05, + "loss": 5.5499, + "step": 12520 + }, + { + "epoch": 0.07446593396136644, + "grad_norm": 1.5816805362701416, + "learning_rate": 4.931911403524641e-05, + "loss": 5.255, + "step": 12521 + }, + { + "epoch": 0.07447188124464744, + "grad_norm": 1.636619210243225, + "learning_rate": 4.93190057597089e-05, + "loss": 5.3816, + "step": 12522 + }, + { + "epoch": 0.07447782852792845, + "grad_norm": 1.518872857093811, + "learning_rate": 4.931889747568187e-05, + "loss": 5.3376, + "step": 12523 + }, + { + "epoch": 0.07448377581120944, + "grad_norm": 1.9586291313171387, + "learning_rate": 4.931878918316537e-05, + "loss": 5.6678, + "step": 12524 + }, + { + "epoch": 0.07448972309449044, + "grad_norm": 1.5893887281417847, + "learning_rate": 4.9318680882159435e-05, + "loss": 5.266, + "step": 12525 + }, + { + "epoch": 0.07449567037777144, + "grad_norm": 1.5339915752410889, + "learning_rate": 4.93185725726641e-05, + "loss": 5.1891, + "step": 12526 + }, + { + "epoch": 0.07450161766105243, + "grad_norm": 1.730128288269043, + "learning_rate": 4.9318464254679396e-05, + "loss": 5.1534, + "step": 12527 + }, + { + "epoch": 0.07450756494433343, + "grad_norm": 1.691015362739563, + "learning_rate": 4.931835592820537e-05, + "loss": 5.2599, + "step": 12528 + }, + { + "epoch": 0.07451351222761443, + "grad_norm": 1.2936137914657593, + "learning_rate": 4.9318247593242056e-05, + "loss": 5.2432, + "step": 12529 + }, + { + "epoch": 0.07451945951089542, + "grad_norm": 1.4507200717926025, + "learning_rate": 4.93181392497895e-05, + "loss": 5.1539, + "step": 12530 + }, + { + "epoch": 0.07452540679417642, + "grad_norm": 1.6212667226791382, + "learning_rate": 4.931803089784772e-05, + "loss": 5.1212, + "step": 12531 + }, + { + "epoch": 0.07453135407745742, + "grad_norm": 1.48690927028656, + "learning_rate": 4.9317922537416775e-05, + "loss": 5.168, + "step": 12532 + }, + { + "epoch": 0.07453730136073841, + "grad_norm": 1.5102870464324951, + "learning_rate": 4.931781416849669e-05, + "loss": 5.2024, + "step": 12533 + }, + { + "epoch": 0.07454324864401941, + "grad_norm": 1.4186264276504517, + "learning_rate": 4.9317705791087516e-05, + "loss": 5.1154, + "step": 12534 + }, + { + "epoch": 0.07454919592730042, + "grad_norm": 1.623822569847107, + "learning_rate": 4.931759740518928e-05, + "loss": 5.0244, + "step": 12535 + }, + { + "epoch": 0.0745551432105814, + "grad_norm": 1.4694246053695679, + "learning_rate": 4.9317489010802015e-05, + "loss": 5.1737, + "step": 12536 + }, + { + "epoch": 0.0745610904938624, + "grad_norm": 1.553551435470581, + "learning_rate": 4.931738060792577e-05, + "loss": 5.1339, + "step": 12537 + }, + { + "epoch": 0.0745670377771434, + "grad_norm": 1.744367003440857, + "learning_rate": 4.9317272196560575e-05, + "loss": 5.1564, + "step": 12538 + }, + { + "epoch": 0.0745729850604244, + "grad_norm": 1.6584309339523315, + "learning_rate": 4.931716377670648e-05, + "loss": 5.1871, + "step": 12539 + }, + { + "epoch": 0.0745789323437054, + "grad_norm": 1.6894947290420532, + "learning_rate": 4.931705534836351e-05, + "loss": 5.1432, + "step": 12540 + }, + { + "epoch": 0.07458487962698639, + "grad_norm": 1.467315912246704, + "learning_rate": 4.93169469115317e-05, + "loss": 5.2072, + "step": 12541 + }, + { + "epoch": 0.07459082691026739, + "grad_norm": 1.478841781616211, + "learning_rate": 4.93168384662111e-05, + "loss": 5.3644, + "step": 12542 + }, + { + "epoch": 0.07459677419354839, + "grad_norm": 1.6001938581466675, + "learning_rate": 4.9316730012401745e-05, + "loss": 5.2031, + "step": 12543 + }, + { + "epoch": 0.07460272147682938, + "grad_norm": 1.480236530303955, + "learning_rate": 4.931662155010367e-05, + "loss": 5.0113, + "step": 12544 + }, + { + "epoch": 0.07460866876011038, + "grad_norm": 1.490511178970337, + "learning_rate": 4.9316513079316914e-05, + "loss": 5.0416, + "step": 12545 + }, + { + "epoch": 0.07461461604339138, + "grad_norm": 1.7327873706817627, + "learning_rate": 4.931640460004152e-05, + "loss": 5.0578, + "step": 12546 + }, + { + "epoch": 0.07462056332667237, + "grad_norm": 1.6410421133041382, + "learning_rate": 4.9316296112277514e-05, + "loss": 5.0239, + "step": 12547 + }, + { + "epoch": 0.07462651060995337, + "grad_norm": 1.5255141258239746, + "learning_rate": 4.9316187616024936e-05, + "loss": 5.1592, + "step": 12548 + }, + { + "epoch": 0.07463245789323437, + "grad_norm": 1.5555649995803833, + "learning_rate": 4.9316079111283835e-05, + "loss": 5.3981, + "step": 12549 + }, + { + "epoch": 0.07463840517651536, + "grad_norm": 1.4196929931640625, + "learning_rate": 4.931597059805424e-05, + "loss": 5.0682, + "step": 12550 + }, + { + "epoch": 0.07464435245979636, + "grad_norm": 1.562338948249817, + "learning_rate": 4.93158620763362e-05, + "loss": 5.3551, + "step": 12551 + }, + { + "epoch": 0.07465029974307737, + "grad_norm": 1.5955942869186401, + "learning_rate": 4.931575354612973e-05, + "loss": 5.3108, + "step": 12552 + }, + { + "epoch": 0.07465624702635835, + "grad_norm": 1.4173908233642578, + "learning_rate": 4.9315645007434885e-05, + "loss": 5.3793, + "step": 12553 + }, + { + "epoch": 0.07466219430963936, + "grad_norm": 1.4075239896774292, + "learning_rate": 4.93155364602517e-05, + "loss": 5.4409, + "step": 12554 + }, + { + "epoch": 0.07466814159292036, + "grad_norm": 1.3041841983795166, + "learning_rate": 4.9315427904580216e-05, + "loss": 5.5285, + "step": 12555 + }, + { + "epoch": 0.07467408887620135, + "grad_norm": 1.4277441501617432, + "learning_rate": 4.9315319340420465e-05, + "loss": 5.5017, + "step": 12556 + }, + { + "epoch": 0.07468003615948235, + "grad_norm": 1.407895803451538, + "learning_rate": 4.931521076777248e-05, + "loss": 5.3675, + "step": 12557 + }, + { + "epoch": 0.07468598344276335, + "grad_norm": 1.429131031036377, + "learning_rate": 4.931510218663632e-05, + "loss": 5.3712, + "step": 12558 + }, + { + "epoch": 0.07469193072604434, + "grad_norm": 1.7229793071746826, + "learning_rate": 4.9314993597011995e-05, + "loss": 5.4513, + "step": 12559 + }, + { + "epoch": 0.07469787800932534, + "grad_norm": 1.5961774587631226, + "learning_rate": 4.9314884998899565e-05, + "loss": 5.5478, + "step": 12560 + }, + { + "epoch": 0.07470382529260634, + "grad_norm": 1.4570807218551636, + "learning_rate": 4.931477639229906e-05, + "loss": 5.3973, + "step": 12561 + }, + { + "epoch": 0.07470977257588733, + "grad_norm": 1.6308903694152832, + "learning_rate": 4.931466777721052e-05, + "loss": 5.1951, + "step": 12562 + }, + { + "epoch": 0.07471571985916833, + "grad_norm": 1.438491940498352, + "learning_rate": 4.9314559153633974e-05, + "loss": 5.4237, + "step": 12563 + }, + { + "epoch": 0.07472166714244934, + "grad_norm": 1.7219120264053345, + "learning_rate": 4.931445052156947e-05, + "loss": 5.2303, + "step": 12564 + }, + { + "epoch": 0.07472761442573032, + "grad_norm": 1.557895302772522, + "learning_rate": 4.931434188101704e-05, + "loss": 5.2383, + "step": 12565 + }, + { + "epoch": 0.07473356170901133, + "grad_norm": 1.3585479259490967, + "learning_rate": 4.931423323197672e-05, + "loss": 5.2698, + "step": 12566 + }, + { + "epoch": 0.07473950899229233, + "grad_norm": 1.643608808517456, + "learning_rate": 4.931412457444857e-05, + "loss": 5.285, + "step": 12567 + }, + { + "epoch": 0.07474545627557332, + "grad_norm": 1.7847453355789185, + "learning_rate": 4.93140159084326e-05, + "loss": 5.413, + "step": 12568 + }, + { + "epoch": 0.07475140355885432, + "grad_norm": 1.5010985136032104, + "learning_rate": 4.931390723392886e-05, + "loss": 5.3665, + "step": 12569 + }, + { + "epoch": 0.0747573508421353, + "grad_norm": 1.3640403747558594, + "learning_rate": 4.931379855093738e-05, + "loss": 5.2253, + "step": 12570 + }, + { + "epoch": 0.07476329812541631, + "grad_norm": 1.4886012077331543, + "learning_rate": 4.9313689859458214e-05, + "loss": 5.5954, + "step": 12571 + }, + { + "epoch": 0.07476924540869731, + "grad_norm": 1.6626142263412476, + "learning_rate": 4.931358115949138e-05, + "loss": 5.3558, + "step": 12572 + }, + { + "epoch": 0.0747751926919783, + "grad_norm": 1.6350460052490234, + "learning_rate": 4.931347245103693e-05, + "loss": 5.3222, + "step": 12573 + }, + { + "epoch": 0.0747811399752593, + "grad_norm": 1.586182951927185, + "learning_rate": 4.93133637340949e-05, + "loss": 5.2056, + "step": 12574 + }, + { + "epoch": 0.0747870872585403, + "grad_norm": 1.6866692304611206, + "learning_rate": 4.931325500866532e-05, + "loss": 5.2698, + "step": 12575 + }, + { + "epoch": 0.07479303454182129, + "grad_norm": 1.4165509939193726, + "learning_rate": 4.9313146274748235e-05, + "loss": 5.2572, + "step": 12576 + }, + { + "epoch": 0.07479898182510229, + "grad_norm": 1.6259573698043823, + "learning_rate": 4.931303753234369e-05, + "loss": 5.2585, + "step": 12577 + }, + { + "epoch": 0.0748049291083833, + "grad_norm": 1.4159972667694092, + "learning_rate": 4.931292878145171e-05, + "loss": 5.1748, + "step": 12578 + }, + { + "epoch": 0.07481087639166428, + "grad_norm": 1.3880494832992554, + "learning_rate": 4.931282002207234e-05, + "loss": 5.2181, + "step": 12579 + }, + { + "epoch": 0.07481682367494528, + "grad_norm": 1.4466285705566406, + "learning_rate": 4.931271125420561e-05, + "loss": 5.2041, + "step": 12580 + }, + { + "epoch": 0.07482277095822629, + "grad_norm": 1.5111972093582153, + "learning_rate": 4.931260247785157e-05, + "loss": 5.2388, + "step": 12581 + }, + { + "epoch": 0.07482871824150727, + "grad_norm": 1.368296504020691, + "learning_rate": 4.9312493693010245e-05, + "loss": 5.0964, + "step": 12582 + }, + { + "epoch": 0.07483466552478828, + "grad_norm": 1.5604379177093506, + "learning_rate": 4.931238489968168e-05, + "loss": 5.2031, + "step": 12583 + }, + { + "epoch": 0.07484061280806928, + "grad_norm": 1.6104371547698975, + "learning_rate": 4.9312276097865916e-05, + "loss": 5.1122, + "step": 12584 + }, + { + "epoch": 0.07484656009135027, + "grad_norm": 1.5082486867904663, + "learning_rate": 4.931216728756299e-05, + "loss": 5.2092, + "step": 12585 + }, + { + "epoch": 0.07485250737463127, + "grad_norm": 2.1802000999450684, + "learning_rate": 4.931205846877293e-05, + "loss": 5.859, + "step": 12586 + }, + { + "epoch": 0.07485845465791227, + "grad_norm": 1.7069321870803833, + "learning_rate": 4.931194964149579e-05, + "loss": 4.9751, + "step": 12587 + }, + { + "epoch": 0.07486440194119326, + "grad_norm": 1.3614740371704102, + "learning_rate": 4.931184080573159e-05, + "loss": 5.2341, + "step": 12588 + }, + { + "epoch": 0.07487034922447426, + "grad_norm": 1.3952617645263672, + "learning_rate": 4.931173196148039e-05, + "loss": 5.0472, + "step": 12589 + }, + { + "epoch": 0.07487629650775526, + "grad_norm": 1.435829758644104, + "learning_rate": 4.9311623108742205e-05, + "loss": 5.0165, + "step": 12590 + }, + { + "epoch": 0.07488224379103625, + "grad_norm": 1.3875840902328491, + "learning_rate": 4.931151424751709e-05, + "loss": 5.5455, + "step": 12591 + }, + { + "epoch": 0.07488819107431725, + "grad_norm": 1.4364032745361328, + "learning_rate": 4.931140537780508e-05, + "loss": 5.5106, + "step": 12592 + }, + { + "epoch": 0.07489413835759826, + "grad_norm": 1.5878878831863403, + "learning_rate": 4.9311296499606194e-05, + "loss": 5.2372, + "step": 12593 + }, + { + "epoch": 0.07490008564087924, + "grad_norm": 1.5724025964736938, + "learning_rate": 4.9311187612920495e-05, + "loss": 5.3771, + "step": 12594 + }, + { + "epoch": 0.07490603292416025, + "grad_norm": 1.4630738496780396, + "learning_rate": 4.9311078717748014e-05, + "loss": 5.3378, + "step": 12595 + }, + { + "epoch": 0.07491198020744125, + "grad_norm": 1.4438437223434448, + "learning_rate": 4.931096981408878e-05, + "loss": 5.3019, + "step": 12596 + }, + { + "epoch": 0.07491792749072224, + "grad_norm": 1.674564242362976, + "learning_rate": 4.931086090194285e-05, + "loss": 5.2957, + "step": 12597 + }, + { + "epoch": 0.07492387477400324, + "grad_norm": 1.237748384475708, + "learning_rate": 4.9310751981310236e-05, + "loss": 5.1994, + "step": 12598 + }, + { + "epoch": 0.07492982205728423, + "grad_norm": 1.5828932523727417, + "learning_rate": 4.9310643052191e-05, + "loss": 5.2326, + "step": 12599 + }, + { + "epoch": 0.07493576934056523, + "grad_norm": 1.2774053812026978, + "learning_rate": 4.931053411458516e-05, + "loss": 5.2496, + "step": 12600 + }, + { + "epoch": 0.07494171662384623, + "grad_norm": 1.2986499071121216, + "learning_rate": 4.9310425168492766e-05, + "loss": 5.3061, + "step": 12601 + }, + { + "epoch": 0.07494766390712722, + "grad_norm": 1.3973673582077026, + "learning_rate": 4.931031621391386e-05, + "loss": 5.1437, + "step": 12602 + }, + { + "epoch": 0.07495361119040822, + "grad_norm": 1.4217787981033325, + "learning_rate": 4.9310207250848475e-05, + "loss": 5.1636, + "step": 12603 + }, + { + "epoch": 0.07495955847368922, + "grad_norm": 1.5062726736068726, + "learning_rate": 4.9310098279296634e-05, + "loss": 5.2944, + "step": 12604 + }, + { + "epoch": 0.07496550575697021, + "grad_norm": 1.4844671487808228, + "learning_rate": 4.9309989299258404e-05, + "loss": 5.1899, + "step": 12605 + }, + { + "epoch": 0.07497145304025121, + "grad_norm": 1.3542430400848389, + "learning_rate": 4.9309880310733805e-05, + "loss": 5.1636, + "step": 12606 + }, + { + "epoch": 0.07497740032353221, + "grad_norm": 1.58526611328125, + "learning_rate": 4.930977131372287e-05, + "loss": 5.5748, + "step": 12607 + }, + { + "epoch": 0.0749833476068132, + "grad_norm": 1.6003972291946411, + "learning_rate": 4.930966230822564e-05, + "loss": 5.3992, + "step": 12608 + }, + { + "epoch": 0.0749892948900942, + "grad_norm": 1.6475237607955933, + "learning_rate": 4.930955329424218e-05, + "loss": 5.4515, + "step": 12609 + }, + { + "epoch": 0.0749952421733752, + "grad_norm": 1.5395694971084595, + "learning_rate": 4.9309444271772486e-05, + "loss": 5.5117, + "step": 12610 + }, + { + "epoch": 0.0750011894566562, + "grad_norm": 1.3863389492034912, + "learning_rate": 4.930933524081663e-05, + "loss": 5.5771, + "step": 12611 + }, + { + "epoch": 0.0750071367399372, + "grad_norm": 1.431830644607544, + "learning_rate": 4.9309226201374626e-05, + "loss": 5.412, + "step": 12612 + }, + { + "epoch": 0.0750130840232182, + "grad_norm": 1.4647631645202637, + "learning_rate": 4.930911715344653e-05, + "loss": 5.1849, + "step": 12613 + }, + { + "epoch": 0.07501903130649919, + "grad_norm": 2.126068592071533, + "learning_rate": 4.930900809703237e-05, + "loss": 5.1712, + "step": 12614 + }, + { + "epoch": 0.07502497858978019, + "grad_norm": 1.3078912496566772, + "learning_rate": 4.9308899032132183e-05, + "loss": 5.3937, + "step": 12615 + }, + { + "epoch": 0.07503092587306119, + "grad_norm": 1.2535938024520874, + "learning_rate": 4.9308789958746016e-05, + "loss": 5.5708, + "step": 12616 + }, + { + "epoch": 0.07503687315634218, + "grad_norm": 1.3942710161209106, + "learning_rate": 4.9308680876873894e-05, + "loss": 5.5907, + "step": 12617 + }, + { + "epoch": 0.07504282043962318, + "grad_norm": 1.3061814308166504, + "learning_rate": 4.930857178651587e-05, + "loss": 5.2515, + "step": 12618 + }, + { + "epoch": 0.07504876772290418, + "grad_norm": 1.8493753671646118, + "learning_rate": 4.930846268767197e-05, + "loss": 4.9958, + "step": 12619 + }, + { + "epoch": 0.07505471500618517, + "grad_norm": 1.5966380834579468, + "learning_rate": 4.9308353580342234e-05, + "loss": 4.8784, + "step": 12620 + }, + { + "epoch": 0.07506066228946617, + "grad_norm": 1.6849051713943481, + "learning_rate": 4.930824446452671e-05, + "loss": 5.1549, + "step": 12621 + }, + { + "epoch": 0.07506660957274718, + "grad_norm": 1.5844405889511108, + "learning_rate": 4.9308135340225426e-05, + "loss": 4.9807, + "step": 12622 + }, + { + "epoch": 0.07507255685602816, + "grad_norm": 1.520621418952942, + "learning_rate": 4.9308026207438424e-05, + "loss": 5.2237, + "step": 12623 + }, + { + "epoch": 0.07507850413930917, + "grad_norm": 1.5273483991622925, + "learning_rate": 4.9307917066165744e-05, + "loss": 5.4053, + "step": 12624 + }, + { + "epoch": 0.07508445142259017, + "grad_norm": 1.7137775421142578, + "learning_rate": 4.9307807916407414e-05, + "loss": 5.0427, + "step": 12625 + }, + { + "epoch": 0.07509039870587116, + "grad_norm": 1.7140679359436035, + "learning_rate": 4.930769875816348e-05, + "loss": 5.0354, + "step": 12626 + }, + { + "epoch": 0.07509634598915216, + "grad_norm": 1.5592498779296875, + "learning_rate": 4.930758959143399e-05, + "loss": 4.9663, + "step": 12627 + }, + { + "epoch": 0.07510229327243315, + "grad_norm": 1.4611366987228394, + "learning_rate": 4.930748041621896e-05, + "loss": 4.9469, + "step": 12628 + }, + { + "epoch": 0.07510824055571415, + "grad_norm": 1.4682248830795288, + "learning_rate": 4.930737123251844e-05, + "loss": 5.0217, + "step": 12629 + }, + { + "epoch": 0.07511418783899515, + "grad_norm": 1.5643991231918335, + "learning_rate": 4.9307262040332474e-05, + "loss": 5.0488, + "step": 12630 + }, + { + "epoch": 0.07512013512227614, + "grad_norm": 1.680577278137207, + "learning_rate": 4.9307152839661094e-05, + "loss": 5.0813, + "step": 12631 + }, + { + "epoch": 0.07512608240555714, + "grad_norm": 1.9138245582580566, + "learning_rate": 4.9307043630504334e-05, + "loss": 5.0965, + "step": 12632 + }, + { + "epoch": 0.07513202968883814, + "grad_norm": 1.7382584810256958, + "learning_rate": 4.9306934412862236e-05, + "loss": 5.3726, + "step": 12633 + }, + { + "epoch": 0.07513797697211913, + "grad_norm": 1.684213638305664, + "learning_rate": 4.930682518673484e-05, + "loss": 5.2511, + "step": 12634 + }, + { + "epoch": 0.07514392425540013, + "grad_norm": 1.6976017951965332, + "learning_rate": 4.9306715952122185e-05, + "loss": 4.9669, + "step": 12635 + }, + { + "epoch": 0.07514987153868113, + "grad_norm": 1.526212453842163, + "learning_rate": 4.930660670902431e-05, + "loss": 4.9405, + "step": 12636 + }, + { + "epoch": 0.07515581882196212, + "grad_norm": 1.6616593599319458, + "learning_rate": 4.930649745744124e-05, + "loss": 5.0266, + "step": 12637 + }, + { + "epoch": 0.07516176610524312, + "grad_norm": 1.7911401987075806, + "learning_rate": 4.930638819737303e-05, + "loss": 4.8774, + "step": 12638 + }, + { + "epoch": 0.07516771338852413, + "grad_norm": 1.3613603115081787, + "learning_rate": 4.93062789288197e-05, + "loss": 5.4048, + "step": 12639 + }, + { + "epoch": 0.07517366067180511, + "grad_norm": 1.5945172309875488, + "learning_rate": 4.930616965178131e-05, + "loss": 5.1918, + "step": 12640 + }, + { + "epoch": 0.07517960795508612, + "grad_norm": 1.816091775894165, + "learning_rate": 4.930606036625789e-05, + "loss": 5.3138, + "step": 12641 + }, + { + "epoch": 0.07518555523836712, + "grad_norm": 1.642877459526062, + "learning_rate": 4.930595107224947e-05, + "loss": 5.2438, + "step": 12642 + }, + { + "epoch": 0.07519150252164811, + "grad_norm": 1.8904980421066284, + "learning_rate": 4.930584176975609e-05, + "loss": 5.1565, + "step": 12643 + }, + { + "epoch": 0.07519744980492911, + "grad_norm": 1.6247447729110718, + "learning_rate": 4.93057324587778e-05, + "loss": 5.1795, + "step": 12644 + }, + { + "epoch": 0.07520339708821011, + "grad_norm": 1.4699510335922241, + "learning_rate": 4.930562313931461e-05, + "loss": 5.3628, + "step": 12645 + }, + { + "epoch": 0.0752093443714911, + "grad_norm": 1.537920355796814, + "learning_rate": 4.93055138113666e-05, + "loss": 5.492, + "step": 12646 + }, + { + "epoch": 0.0752152916547721, + "grad_norm": 1.3268204927444458, + "learning_rate": 4.930540447493378e-05, + "loss": 5.2169, + "step": 12647 + }, + { + "epoch": 0.0752212389380531, + "grad_norm": 1.627005934715271, + "learning_rate": 4.930529513001619e-05, + "loss": 5.9358, + "step": 12648 + }, + { + "epoch": 0.07522718622133409, + "grad_norm": 1.445926547050476, + "learning_rate": 4.930518577661388e-05, + "loss": 5.0762, + "step": 12649 + }, + { + "epoch": 0.0752331335046151, + "grad_norm": 1.5958713293075562, + "learning_rate": 4.930507641472688e-05, + "loss": 5.2345, + "step": 12650 + }, + { + "epoch": 0.0752390807878961, + "grad_norm": 1.470540165901184, + "learning_rate": 4.9304967044355225e-05, + "loss": 5.1259, + "step": 12651 + }, + { + "epoch": 0.07524502807117708, + "grad_norm": 1.4679489135742188, + "learning_rate": 4.930485766549896e-05, + "loss": 5.1456, + "step": 12652 + }, + { + "epoch": 0.07525097535445809, + "grad_norm": 1.3032207489013672, + "learning_rate": 4.930474827815812e-05, + "loss": 5.1479, + "step": 12653 + }, + { + "epoch": 0.07525692263773909, + "grad_norm": 1.4676958322525024, + "learning_rate": 4.930463888233274e-05, + "loss": 5.173, + "step": 12654 + }, + { + "epoch": 0.07526286992102008, + "grad_norm": 1.5788590908050537, + "learning_rate": 4.930452947802286e-05, + "loss": 5.0608, + "step": 12655 + }, + { + "epoch": 0.07526881720430108, + "grad_norm": 1.4392722845077515, + "learning_rate": 4.9304420065228526e-05, + "loss": 5.1209, + "step": 12656 + }, + { + "epoch": 0.07527476448758207, + "grad_norm": 1.4725446701049805, + "learning_rate": 4.930431064394977e-05, + "loss": 5.1249, + "step": 12657 + }, + { + "epoch": 0.07528071177086307, + "grad_norm": 1.4239790439605713, + "learning_rate": 4.930420121418663e-05, + "loss": 5.0262, + "step": 12658 + }, + { + "epoch": 0.07528665905414407, + "grad_norm": 1.3037468194961548, + "learning_rate": 4.930409177593914e-05, + "loss": 5.1158, + "step": 12659 + }, + { + "epoch": 0.07529260633742506, + "grad_norm": 1.430015206336975, + "learning_rate": 4.930398232920734e-05, + "loss": 5.1362, + "step": 12660 + }, + { + "epoch": 0.07529855362070606, + "grad_norm": 1.2381033897399902, + "learning_rate": 4.930387287399127e-05, + "loss": 5.2351, + "step": 12661 + }, + { + "epoch": 0.07530450090398706, + "grad_norm": 1.4459912776947021, + "learning_rate": 4.930376341029098e-05, + "loss": 5.1413, + "step": 12662 + }, + { + "epoch": 0.07531044818726805, + "grad_norm": 1.4875576496124268, + "learning_rate": 4.93036539381065e-05, + "loss": 5.0556, + "step": 12663 + }, + { + "epoch": 0.07531639547054905, + "grad_norm": 1.1632124185562134, + "learning_rate": 4.930354445743785e-05, + "loss": 5.2317, + "step": 12664 + }, + { + "epoch": 0.07532234275383005, + "grad_norm": 1.324722170829773, + "learning_rate": 4.9303434968285096e-05, + "loss": 5.0562, + "step": 12665 + }, + { + "epoch": 0.07532829003711104, + "grad_norm": 1.4292213916778564, + "learning_rate": 4.9303325470648254e-05, + "loss": 5.0991, + "step": 12666 + }, + { + "epoch": 0.07533423732039204, + "grad_norm": 1.4528483152389526, + "learning_rate": 4.930321596452738e-05, + "loss": 5.0675, + "step": 12667 + }, + { + "epoch": 0.07534018460367305, + "grad_norm": 1.5489269495010376, + "learning_rate": 4.9303106449922504e-05, + "loss": 4.9073, + "step": 12668 + }, + { + "epoch": 0.07534613188695403, + "grad_norm": 1.440854787826538, + "learning_rate": 4.9302996926833664e-05, + "loss": 5.0401, + "step": 12669 + }, + { + "epoch": 0.07535207917023504, + "grad_norm": 1.4586740732192993, + "learning_rate": 4.9302887395260894e-05, + "loss": 5.0483, + "step": 12670 + }, + { + "epoch": 0.07535802645351604, + "grad_norm": 1.390376091003418, + "learning_rate": 4.930277785520424e-05, + "loss": 5.1417, + "step": 12671 + }, + { + "epoch": 0.07536397373679703, + "grad_norm": 1.296410083770752, + "learning_rate": 4.9302668306663736e-05, + "loss": 5.461, + "step": 12672 + }, + { + "epoch": 0.07536992102007803, + "grad_norm": 1.5190175771713257, + "learning_rate": 4.930255874963943e-05, + "loss": 5.4972, + "step": 12673 + }, + { + "epoch": 0.07537586830335903, + "grad_norm": 1.4567232131958008, + "learning_rate": 4.930244918413134e-05, + "loss": 5.1921, + "step": 12674 + }, + { + "epoch": 0.07538181558664002, + "grad_norm": 1.7850147485733032, + "learning_rate": 4.930233961013953e-05, + "loss": 5.0658, + "step": 12675 + }, + { + "epoch": 0.07538776286992102, + "grad_norm": 1.5736637115478516, + "learning_rate": 4.930223002766401e-05, + "loss": 5.6874, + "step": 12676 + }, + { + "epoch": 0.07539371015320202, + "grad_norm": 1.5202080011367798, + "learning_rate": 4.9302120436704836e-05, + "loss": 5.7279, + "step": 12677 + }, + { + "epoch": 0.07539965743648301, + "grad_norm": 1.4259493350982666, + "learning_rate": 4.930201083726205e-05, + "loss": 5.5445, + "step": 12678 + }, + { + "epoch": 0.07540560471976401, + "grad_norm": 1.5141973495483398, + "learning_rate": 4.9301901229335674e-05, + "loss": 5.5086, + "step": 12679 + }, + { + "epoch": 0.07541155200304502, + "grad_norm": 1.5044218301773071, + "learning_rate": 4.930179161292576e-05, + "loss": 5.4279, + "step": 12680 + }, + { + "epoch": 0.075417499286326, + "grad_norm": 1.5342620611190796, + "learning_rate": 4.930168198803234e-05, + "loss": 5.0885, + "step": 12681 + }, + { + "epoch": 0.075423446569607, + "grad_norm": 1.8139567375183105, + "learning_rate": 4.930157235465546e-05, + "loss": 5.5586, + "step": 12682 + }, + { + "epoch": 0.07542939385288801, + "grad_norm": 1.606778621673584, + "learning_rate": 4.9301462712795144e-05, + "loss": 5.4007, + "step": 12683 + }, + { + "epoch": 0.075435341136169, + "grad_norm": 1.6451623439788818, + "learning_rate": 4.930135306245144e-05, + "loss": 5.2882, + "step": 12684 + }, + { + "epoch": 0.07544128841945, + "grad_norm": 1.915991187095642, + "learning_rate": 4.9301243403624385e-05, + "loss": 5.0727, + "step": 12685 + }, + { + "epoch": 0.07544723570273099, + "grad_norm": 1.536456823348999, + "learning_rate": 4.930113373631402e-05, + "loss": 5.2154, + "step": 12686 + }, + { + "epoch": 0.07545318298601199, + "grad_norm": 1.5820670127868652, + "learning_rate": 4.9301024060520375e-05, + "loss": 5.0613, + "step": 12687 + }, + { + "epoch": 0.07545913026929299, + "grad_norm": 1.5905929803848267, + "learning_rate": 4.93009143762435e-05, + "loss": 5.08, + "step": 12688 + }, + { + "epoch": 0.07546507755257398, + "grad_norm": 1.5759062767028809, + "learning_rate": 4.9300804683483426e-05, + "loss": 5.0874, + "step": 12689 + }, + { + "epoch": 0.07547102483585498, + "grad_norm": 1.4619840383529663, + "learning_rate": 4.9300694982240186e-05, + "loss": 5.1803, + "step": 12690 + }, + { + "epoch": 0.07547697211913598, + "grad_norm": 1.2742846012115479, + "learning_rate": 4.930058527251383e-05, + "loss": 5.2721, + "step": 12691 + }, + { + "epoch": 0.07548291940241697, + "grad_norm": 1.4095741510391235, + "learning_rate": 4.930047555430439e-05, + "loss": 5.055, + "step": 12692 + }, + { + "epoch": 0.07548886668569797, + "grad_norm": 1.3399991989135742, + "learning_rate": 4.93003658276119e-05, + "loss": 5.0315, + "step": 12693 + }, + { + "epoch": 0.07549481396897897, + "grad_norm": 1.4075208902359009, + "learning_rate": 4.9300256092436407e-05, + "loss": 5.2634, + "step": 12694 + }, + { + "epoch": 0.07550076125225996, + "grad_norm": 1.681321144104004, + "learning_rate": 4.930014634877795e-05, + "loss": 4.9749, + "step": 12695 + }, + { + "epoch": 0.07550670853554096, + "grad_norm": 1.842136263847351, + "learning_rate": 4.9300036596636555e-05, + "loss": 4.797, + "step": 12696 + }, + { + "epoch": 0.07551265581882197, + "grad_norm": 1.8733257055282593, + "learning_rate": 4.929992683601228e-05, + "loss": 5.4726, + "step": 12697 + }, + { + "epoch": 0.07551860310210295, + "grad_norm": 1.747514009475708, + "learning_rate": 4.929981706690514e-05, + "loss": 5.1081, + "step": 12698 + }, + { + "epoch": 0.07552455038538396, + "grad_norm": 1.8107210397720337, + "learning_rate": 4.9299707289315187e-05, + "loss": 4.983, + "step": 12699 + }, + { + "epoch": 0.07553049766866496, + "grad_norm": 1.6319682598114014, + "learning_rate": 4.929959750324246e-05, + "loss": 4.9968, + "step": 12700 + }, + { + "epoch": 0.07553644495194595, + "grad_norm": 1.4653065204620361, + "learning_rate": 4.9299487708687e-05, + "loss": 5.3013, + "step": 12701 + }, + { + "epoch": 0.07554239223522695, + "grad_norm": 1.4665262699127197, + "learning_rate": 4.929937790564883e-05, + "loss": 5.4431, + "step": 12702 + }, + { + "epoch": 0.07554833951850795, + "grad_norm": 1.4962518215179443, + "learning_rate": 4.9299268094127996e-05, + "loss": 5.3692, + "step": 12703 + }, + { + "epoch": 0.07555428680178894, + "grad_norm": 1.7913219928741455, + "learning_rate": 4.929915827412454e-05, + "loss": 5.0082, + "step": 12704 + }, + { + "epoch": 0.07556023408506994, + "grad_norm": 1.5508856773376465, + "learning_rate": 4.929904844563851e-05, + "loss": 5.1501, + "step": 12705 + }, + { + "epoch": 0.07556618136835094, + "grad_norm": 1.5882935523986816, + "learning_rate": 4.929893860866993e-05, + "loss": 4.9579, + "step": 12706 + }, + { + "epoch": 0.07557212865163193, + "grad_norm": 1.4550399780273438, + "learning_rate": 4.9298828763218833e-05, + "loss": 5.0165, + "step": 12707 + }, + { + "epoch": 0.07557807593491293, + "grad_norm": 1.5075403451919556, + "learning_rate": 4.929871890928527e-05, + "loss": 4.933, + "step": 12708 + }, + { + "epoch": 0.07558402321819394, + "grad_norm": 1.7094134092330933, + "learning_rate": 4.929860904686928e-05, + "loss": 4.8842, + "step": 12709 + }, + { + "epoch": 0.07558997050147492, + "grad_norm": 1.5615170001983643, + "learning_rate": 4.929849917597089e-05, + "loss": 5.5301, + "step": 12710 + }, + { + "epoch": 0.07559591778475593, + "grad_norm": 1.6687208414077759, + "learning_rate": 4.929838929659015e-05, + "loss": 4.9325, + "step": 12711 + }, + { + "epoch": 0.07560186506803693, + "grad_norm": 1.3476423025131226, + "learning_rate": 4.9298279408727086e-05, + "loss": 5.1274, + "step": 12712 + }, + { + "epoch": 0.07560781235131792, + "grad_norm": 1.359786868095398, + "learning_rate": 4.929816951238175e-05, + "loss": 4.7549, + "step": 12713 + }, + { + "epoch": 0.07561375963459892, + "grad_norm": 1.305482029914856, + "learning_rate": 4.9298059607554184e-05, + "loss": 4.7371, + "step": 12714 + }, + { + "epoch": 0.0756197069178799, + "grad_norm": 1.408693790435791, + "learning_rate": 4.92979496942444e-05, + "loss": 5.0733, + "step": 12715 + }, + { + "epoch": 0.07562565420116091, + "grad_norm": 1.3604625463485718, + "learning_rate": 4.9297839772452456e-05, + "loss": 4.7947, + "step": 12716 + }, + { + "epoch": 0.07563160148444191, + "grad_norm": 1.4101814031600952, + "learning_rate": 4.929772984217839e-05, + "loss": 5.2003, + "step": 12717 + }, + { + "epoch": 0.0756375487677229, + "grad_norm": 1.4409375190734863, + "learning_rate": 4.929761990342224e-05, + "loss": 5.167, + "step": 12718 + }, + { + "epoch": 0.0756434960510039, + "grad_norm": 1.4309754371643066, + "learning_rate": 4.9297509956184044e-05, + "loss": 5.1499, + "step": 12719 + }, + { + "epoch": 0.0756494433342849, + "grad_norm": 1.6380341053009033, + "learning_rate": 4.929740000046382e-05, + "loss": 4.8282, + "step": 12720 + }, + { + "epoch": 0.07565539061756589, + "grad_norm": 1.6795456409454346, + "learning_rate": 4.929729003626164e-05, + "loss": 4.708, + "step": 12721 + }, + { + "epoch": 0.07566133790084689, + "grad_norm": 1.7367075681686401, + "learning_rate": 4.929718006357753e-05, + "loss": 5.3364, + "step": 12722 + }, + { + "epoch": 0.0756672851841279, + "grad_norm": 1.5842353105545044, + "learning_rate": 4.929707008241152e-05, + "loss": 5.2025, + "step": 12723 + }, + { + "epoch": 0.07567323246740888, + "grad_norm": 1.5129985809326172, + "learning_rate": 4.9296960092763657e-05, + "loss": 5.1788, + "step": 12724 + }, + { + "epoch": 0.07567917975068988, + "grad_norm": 1.4276295900344849, + "learning_rate": 4.929685009463397e-05, + "loss": 5.2597, + "step": 12725 + }, + { + "epoch": 0.07568512703397089, + "grad_norm": 1.499213457107544, + "learning_rate": 4.9296740088022506e-05, + "loss": 5.1778, + "step": 12726 + }, + { + "epoch": 0.07569107431725187, + "grad_norm": 1.4656083583831787, + "learning_rate": 4.92966300729293e-05, + "loss": 5.2689, + "step": 12727 + }, + { + "epoch": 0.07569702160053288, + "grad_norm": 1.6160268783569336, + "learning_rate": 4.9296520049354393e-05, + "loss": 5.1829, + "step": 12728 + }, + { + "epoch": 0.07570296888381388, + "grad_norm": 1.514891266822815, + "learning_rate": 4.929641001729782e-05, + "loss": 5.2586, + "step": 12729 + }, + { + "epoch": 0.07570891616709487, + "grad_norm": 1.4635345935821533, + "learning_rate": 4.929629997675963e-05, + "loss": 5.2159, + "step": 12730 + }, + { + "epoch": 0.07571486345037587, + "grad_norm": 1.704380750656128, + "learning_rate": 4.9296189927739846e-05, + "loss": 5.1068, + "step": 12731 + }, + { + "epoch": 0.07572081073365687, + "grad_norm": 1.5786374807357788, + "learning_rate": 4.929607987023851e-05, + "loss": 5.2306, + "step": 12732 + }, + { + "epoch": 0.07572675801693786, + "grad_norm": 1.5011721849441528, + "learning_rate": 4.929596980425567e-05, + "loss": 5.1594, + "step": 12733 + }, + { + "epoch": 0.07573270530021886, + "grad_norm": 1.4532456398010254, + "learning_rate": 4.9295859729791354e-05, + "loss": 5.0955, + "step": 12734 + }, + { + "epoch": 0.07573865258349986, + "grad_norm": 1.5734699964523315, + "learning_rate": 4.9295749646845604e-05, + "loss": 5.1523, + "step": 12735 + }, + { + "epoch": 0.07574459986678085, + "grad_norm": 1.578141450881958, + "learning_rate": 4.929563955541846e-05, + "loss": 5.0784, + "step": 12736 + }, + { + "epoch": 0.07575054715006185, + "grad_norm": 1.408524513244629, + "learning_rate": 4.929552945550996e-05, + "loss": 5.1411, + "step": 12737 + }, + { + "epoch": 0.07575649443334286, + "grad_norm": 1.4755773544311523, + "learning_rate": 4.929541934712014e-05, + "loss": 5.0666, + "step": 12738 + }, + { + "epoch": 0.07576244171662384, + "grad_norm": 1.5521161556243896, + "learning_rate": 4.929530923024904e-05, + "loss": 5.0938, + "step": 12739 + }, + { + "epoch": 0.07576838899990485, + "grad_norm": 1.4772706031799316, + "learning_rate": 4.929519910489671e-05, + "loss": 5.1178, + "step": 12740 + }, + { + "epoch": 0.07577433628318585, + "grad_norm": 1.2669662237167358, + "learning_rate": 4.9295088971063164e-05, + "loss": 5.2565, + "step": 12741 + }, + { + "epoch": 0.07578028356646684, + "grad_norm": 1.5846413373947144, + "learning_rate": 4.929497882874845e-05, + "loss": 5.2109, + "step": 12742 + }, + { + "epoch": 0.07578623084974784, + "grad_norm": 1.779228687286377, + "learning_rate": 4.929486867795262e-05, + "loss": 5.0196, + "step": 12743 + }, + { + "epoch": 0.07579217813302883, + "grad_norm": 1.6306418180465698, + "learning_rate": 4.92947585186757e-05, + "loss": 5.1982, + "step": 12744 + }, + { + "epoch": 0.07579812541630983, + "grad_norm": 1.5107831954956055, + "learning_rate": 4.9294648350917726e-05, + "loss": 5.0652, + "step": 12745 + }, + { + "epoch": 0.07580407269959083, + "grad_norm": 1.3846759796142578, + "learning_rate": 4.9294538174678744e-05, + "loss": 5.0322, + "step": 12746 + }, + { + "epoch": 0.07581001998287182, + "grad_norm": 1.4558676481246948, + "learning_rate": 4.9294427989958794e-05, + "loss": 4.9626, + "step": 12747 + }, + { + "epoch": 0.07581596726615282, + "grad_norm": 1.3155016899108887, + "learning_rate": 4.92943177967579e-05, + "loss": 4.9965, + "step": 12748 + }, + { + "epoch": 0.07582191454943382, + "grad_norm": 1.3237980604171753, + "learning_rate": 4.9294207595076125e-05, + "loss": 4.9697, + "step": 12749 + }, + { + "epoch": 0.07582786183271481, + "grad_norm": 1.4439423084259033, + "learning_rate": 4.929409738491349e-05, + "loss": 5.0636, + "step": 12750 + }, + { + "epoch": 0.07583380911599581, + "grad_norm": 1.4793460369110107, + "learning_rate": 4.9293987166270024e-05, + "loss": 5.1122, + "step": 12751 + }, + { + "epoch": 0.07583975639927681, + "grad_norm": 1.5353471040725708, + "learning_rate": 4.929387693914578e-05, + "loss": 5.174, + "step": 12752 + }, + { + "epoch": 0.0758457036825578, + "grad_norm": 1.690537452697754, + "learning_rate": 4.929376670354081e-05, + "loss": 5.1515, + "step": 12753 + }, + { + "epoch": 0.0758516509658388, + "grad_norm": 1.4602952003479004, + "learning_rate": 4.9293656459455124e-05, + "loss": 5.1244, + "step": 12754 + }, + { + "epoch": 0.0758575982491198, + "grad_norm": 1.5871785879135132, + "learning_rate": 4.929354620688878e-05, + "loss": 5.2856, + "step": 12755 + }, + { + "epoch": 0.0758635455324008, + "grad_norm": 1.588065505027771, + "learning_rate": 4.92934359458418e-05, + "loss": 5.3694, + "step": 12756 + }, + { + "epoch": 0.0758694928156818, + "grad_norm": 1.5489270687103271, + "learning_rate": 4.929332567631424e-05, + "loss": 5.3546, + "step": 12757 + }, + { + "epoch": 0.0758754400989628, + "grad_norm": 1.493815541267395, + "learning_rate": 4.9293215398306136e-05, + "loss": 5.0878, + "step": 12758 + }, + { + "epoch": 0.07588138738224379, + "grad_norm": 1.3329546451568604, + "learning_rate": 4.929310511181751e-05, + "loss": 5.2171, + "step": 12759 + }, + { + "epoch": 0.07588733466552479, + "grad_norm": 1.5299288034439087, + "learning_rate": 4.929299481684842e-05, + "loss": 5.1695, + "step": 12760 + }, + { + "epoch": 0.07589328194880579, + "grad_norm": 1.5130664110183716, + "learning_rate": 4.9292884513398894e-05, + "loss": 5.3169, + "step": 12761 + }, + { + "epoch": 0.07589922923208678, + "grad_norm": 1.420339584350586, + "learning_rate": 4.9292774201468974e-05, + "loss": 5.1995, + "step": 12762 + }, + { + "epoch": 0.07590517651536778, + "grad_norm": 1.4740930795669556, + "learning_rate": 4.9292663881058696e-05, + "loss": 5.3321, + "step": 12763 + }, + { + "epoch": 0.07591112379864878, + "grad_norm": 1.448968768119812, + "learning_rate": 4.92925535521681e-05, + "loss": 5.1292, + "step": 12764 + }, + { + "epoch": 0.07591707108192977, + "grad_norm": 1.3219209909439087, + "learning_rate": 4.929244321479722e-05, + "loss": 5.1873, + "step": 12765 + }, + { + "epoch": 0.07592301836521077, + "grad_norm": 1.3336325883865356, + "learning_rate": 4.929233286894611e-05, + "loss": 5.248, + "step": 12766 + }, + { + "epoch": 0.07592896564849178, + "grad_norm": 1.4230278730392456, + "learning_rate": 4.9292222514614795e-05, + "loss": 5.2072, + "step": 12767 + }, + { + "epoch": 0.07593491293177276, + "grad_norm": 1.4522627592086792, + "learning_rate": 4.929211215180331e-05, + "loss": 5.4323, + "step": 12768 + }, + { + "epoch": 0.07594086021505377, + "grad_norm": 1.4863537549972534, + "learning_rate": 4.929200178051171e-05, + "loss": 5.241, + "step": 12769 + }, + { + "epoch": 0.07594680749833477, + "grad_norm": 1.7619402408599854, + "learning_rate": 4.929189140074001e-05, + "loss": 5.4853, + "step": 12770 + }, + { + "epoch": 0.07595275478161576, + "grad_norm": 1.6116011142730713, + "learning_rate": 4.929178101248827e-05, + "loss": 5.4793, + "step": 12771 + }, + { + "epoch": 0.07595870206489676, + "grad_norm": 1.8669662475585938, + "learning_rate": 4.9291670615756516e-05, + "loss": 5.4062, + "step": 12772 + }, + { + "epoch": 0.07596464934817775, + "grad_norm": 1.6439383029937744, + "learning_rate": 4.9291560210544796e-05, + "loss": 5.148, + "step": 12773 + }, + { + "epoch": 0.07597059663145875, + "grad_norm": 1.4800657033920288, + "learning_rate": 4.929144979685314e-05, + "loss": 5.3895, + "step": 12774 + }, + { + "epoch": 0.07597654391473975, + "grad_norm": 1.4091606140136719, + "learning_rate": 4.929133937468159e-05, + "loss": 5.3307, + "step": 12775 + }, + { + "epoch": 0.07598249119802074, + "grad_norm": 1.3786438703536987, + "learning_rate": 4.9291228944030176e-05, + "loss": 5.0786, + "step": 12776 + }, + { + "epoch": 0.07598843848130174, + "grad_norm": 1.6039817333221436, + "learning_rate": 4.929111850489896e-05, + "loss": 5.0606, + "step": 12777 + }, + { + "epoch": 0.07599438576458274, + "grad_norm": 1.5277283191680908, + "learning_rate": 4.929100805728796e-05, + "loss": 5.1949, + "step": 12778 + }, + { + "epoch": 0.07600033304786373, + "grad_norm": 1.6756436824798584, + "learning_rate": 4.929089760119722e-05, + "loss": 5.125, + "step": 12779 + }, + { + "epoch": 0.07600628033114473, + "grad_norm": 1.7082979679107666, + "learning_rate": 4.929078713662677e-05, + "loss": 5.1984, + "step": 12780 + }, + { + "epoch": 0.07601222761442573, + "grad_norm": 1.607293963432312, + "learning_rate": 4.929067666357666e-05, + "loss": 5.1809, + "step": 12781 + }, + { + "epoch": 0.07601817489770672, + "grad_norm": 1.5133613348007202, + "learning_rate": 4.9290566182046936e-05, + "loss": 5.2602, + "step": 12782 + }, + { + "epoch": 0.07602412218098772, + "grad_norm": 1.6572481393814087, + "learning_rate": 4.9290455692037616e-05, + "loss": 5.0959, + "step": 12783 + }, + { + "epoch": 0.07603006946426873, + "grad_norm": 1.6593372821807861, + "learning_rate": 4.929034519354876e-05, + "loss": 5.1672, + "step": 12784 + }, + { + "epoch": 0.07603601674754971, + "grad_norm": 1.4214340448379517, + "learning_rate": 4.929023468658038e-05, + "loss": 5.1064, + "step": 12785 + }, + { + "epoch": 0.07604196403083072, + "grad_norm": 1.4875116348266602, + "learning_rate": 4.929012417113255e-05, + "loss": 5.0657, + "step": 12786 + }, + { + "epoch": 0.07604791131411172, + "grad_norm": 1.7354154586791992, + "learning_rate": 4.929001364720527e-05, + "loss": 5.0415, + "step": 12787 + }, + { + "epoch": 0.0760538585973927, + "grad_norm": 1.5597622394561768, + "learning_rate": 4.928990311479861e-05, + "loss": 5.1404, + "step": 12788 + }, + { + "epoch": 0.07605980588067371, + "grad_norm": 1.6819382905960083, + "learning_rate": 4.928979257391258e-05, + "loss": 4.9487, + "step": 12789 + }, + { + "epoch": 0.07606575316395471, + "grad_norm": 1.4722174406051636, + "learning_rate": 4.928968202454725e-05, + "loss": 5.1677, + "step": 12790 + }, + { + "epoch": 0.0760717004472357, + "grad_norm": 1.5145434141159058, + "learning_rate": 4.9289571466702635e-05, + "loss": 5.2197, + "step": 12791 + }, + { + "epoch": 0.0760776477305167, + "grad_norm": 1.6052699089050293, + "learning_rate": 4.9289460900378784e-05, + "loss": 5.2508, + "step": 12792 + }, + { + "epoch": 0.0760835950137977, + "grad_norm": 1.3738253116607666, + "learning_rate": 4.9289350325575734e-05, + "loss": 5.1253, + "step": 12793 + }, + { + "epoch": 0.07608954229707869, + "grad_norm": 1.2580832242965698, + "learning_rate": 4.9289239742293524e-05, + "loss": 5.2497, + "step": 12794 + }, + { + "epoch": 0.0760954895803597, + "grad_norm": 1.6756019592285156, + "learning_rate": 4.928912915053219e-05, + "loss": 5.2471, + "step": 12795 + }, + { + "epoch": 0.0761014368636407, + "grad_norm": 1.6785964965820312, + "learning_rate": 4.928901855029177e-05, + "loss": 4.9893, + "step": 12796 + }, + { + "epoch": 0.07610738414692168, + "grad_norm": 1.6926941871643066, + "learning_rate": 4.92889079415723e-05, + "loss": 5.1558, + "step": 12797 + }, + { + "epoch": 0.07611333143020269, + "grad_norm": 1.4381680488586426, + "learning_rate": 4.9288797324373835e-05, + "loss": 4.9754, + "step": 12798 + }, + { + "epoch": 0.07611927871348369, + "grad_norm": 1.4430698156356812, + "learning_rate": 4.9288686698696393e-05, + "loss": 5.0197, + "step": 12799 + }, + { + "epoch": 0.07612522599676468, + "grad_norm": 1.4745796918869019, + "learning_rate": 4.928857606454002e-05, + "loss": 4.8857, + "step": 12800 + }, + { + "epoch": 0.07613117328004568, + "grad_norm": 1.5430330038070679, + "learning_rate": 4.928846542190477e-05, + "loss": 5.0407, + "step": 12801 + }, + { + "epoch": 0.07613712056332667, + "grad_norm": 1.6061021089553833, + "learning_rate": 4.928835477079066e-05, + "loss": 5.068, + "step": 12802 + }, + { + "epoch": 0.07614306784660767, + "grad_norm": 1.699568510055542, + "learning_rate": 4.9288244111197734e-05, + "loss": 4.9067, + "step": 12803 + }, + { + "epoch": 0.07614901512988867, + "grad_norm": 1.4770212173461914, + "learning_rate": 4.928813344312603e-05, + "loss": 5.0807, + "step": 12804 + }, + { + "epoch": 0.07615496241316966, + "grad_norm": 1.4657871723175049, + "learning_rate": 4.928802276657559e-05, + "loss": 5.1982, + "step": 12805 + }, + { + "epoch": 0.07616090969645066, + "grad_norm": 1.7897653579711914, + "learning_rate": 4.928791208154646e-05, + "loss": 5.1154, + "step": 12806 + }, + { + "epoch": 0.07616685697973166, + "grad_norm": 1.6905261278152466, + "learning_rate": 4.928780138803866e-05, + "loss": 5.3129, + "step": 12807 + }, + { + "epoch": 0.07617280426301265, + "grad_norm": 1.4763284921646118, + "learning_rate": 4.928769068605225e-05, + "loss": 5.2104, + "step": 12808 + }, + { + "epoch": 0.07617875154629365, + "grad_norm": 1.38632333278656, + "learning_rate": 4.928757997558725e-05, + "loss": 5.0857, + "step": 12809 + }, + { + "epoch": 0.07618469882957465, + "grad_norm": 1.5099103450775146, + "learning_rate": 4.928746925664371e-05, + "loss": 5.1264, + "step": 12810 + }, + { + "epoch": 0.07619064611285564, + "grad_norm": 1.285243272781372, + "learning_rate": 4.928735852922167e-05, + "loss": 5.1177, + "step": 12811 + }, + { + "epoch": 0.07619659339613664, + "grad_norm": 1.2749274969100952, + "learning_rate": 4.928724779332116e-05, + "loss": 5.0831, + "step": 12812 + }, + { + "epoch": 0.07620254067941765, + "grad_norm": 2.413712978363037, + "learning_rate": 4.928713704894222e-05, + "loss": 5.2416, + "step": 12813 + }, + { + "epoch": 0.07620848796269863, + "grad_norm": 1.602721929550171, + "learning_rate": 4.9287026296084895e-05, + "loss": 4.9799, + "step": 12814 + }, + { + "epoch": 0.07621443524597964, + "grad_norm": 1.515821099281311, + "learning_rate": 4.928691553474921e-05, + "loss": 5.034, + "step": 12815 + }, + { + "epoch": 0.07622038252926064, + "grad_norm": 1.3245290517807007, + "learning_rate": 4.928680476493523e-05, + "loss": 4.9559, + "step": 12816 + }, + { + "epoch": 0.07622632981254163, + "grad_norm": 1.5383784770965576, + "learning_rate": 4.928669398664297e-05, + "loss": 4.9085, + "step": 12817 + }, + { + "epoch": 0.07623227709582263, + "grad_norm": 1.4406317472457886, + "learning_rate": 4.928658319987247e-05, + "loss": 5.0073, + "step": 12818 + }, + { + "epoch": 0.07623822437910363, + "grad_norm": 1.6843304634094238, + "learning_rate": 4.928647240462378e-05, + "loss": 5.0262, + "step": 12819 + }, + { + "epoch": 0.07624417166238462, + "grad_norm": 1.655497431755066, + "learning_rate": 4.928636160089693e-05, + "loss": 5.0633, + "step": 12820 + }, + { + "epoch": 0.07625011894566562, + "grad_norm": 1.4143035411834717, + "learning_rate": 4.9286250788691973e-05, + "loss": 5.1131, + "step": 12821 + }, + { + "epoch": 0.07625606622894662, + "grad_norm": 1.5316637754440308, + "learning_rate": 4.9286139968008926e-05, + "loss": 5.2727, + "step": 12822 + }, + { + "epoch": 0.07626201351222761, + "grad_norm": 1.6708348989486694, + "learning_rate": 4.9286029138847844e-05, + "loss": 5.1469, + "step": 12823 + }, + { + "epoch": 0.07626796079550861, + "grad_norm": 1.48544180393219, + "learning_rate": 4.928591830120876e-05, + "loss": 5.0916, + "step": 12824 + }, + { + "epoch": 0.07627390807878962, + "grad_norm": 1.3884835243225098, + "learning_rate": 4.9285807455091715e-05, + "loss": 5.1451, + "step": 12825 + }, + { + "epoch": 0.0762798553620706, + "grad_norm": 1.7265839576721191, + "learning_rate": 4.928569660049674e-05, + "loss": 5.0478, + "step": 12826 + }, + { + "epoch": 0.0762858026453516, + "grad_norm": 1.678852915763855, + "learning_rate": 4.9285585737423875e-05, + "loss": 5.2127, + "step": 12827 + }, + { + "epoch": 0.07629174992863261, + "grad_norm": 1.4907126426696777, + "learning_rate": 4.928547486587317e-05, + "loss": 4.9706, + "step": 12828 + }, + { + "epoch": 0.0762976972119136, + "grad_norm": 1.610822319984436, + "learning_rate": 4.928536398584466e-05, + "loss": 5.2416, + "step": 12829 + }, + { + "epoch": 0.0763036444951946, + "grad_norm": 1.5226528644561768, + "learning_rate": 4.9285253097338375e-05, + "loss": 5.2665, + "step": 12830 + }, + { + "epoch": 0.07630959177847559, + "grad_norm": 1.6021392345428467, + "learning_rate": 4.928514220035436e-05, + "loss": 5.2129, + "step": 12831 + }, + { + "epoch": 0.07631553906175659, + "grad_norm": 1.4113723039627075, + "learning_rate": 4.928503129489265e-05, + "loss": 5.3568, + "step": 12832 + }, + { + "epoch": 0.07632148634503759, + "grad_norm": 1.7851402759552002, + "learning_rate": 4.928492038095329e-05, + "loss": 5.2028, + "step": 12833 + }, + { + "epoch": 0.07632743362831858, + "grad_norm": 2.0881283283233643, + "learning_rate": 4.928480945853631e-05, + "loss": 5.2721, + "step": 12834 + }, + { + "epoch": 0.07633338091159958, + "grad_norm": 1.376695156097412, + "learning_rate": 4.928469852764176e-05, + "loss": 5.0203, + "step": 12835 + }, + { + "epoch": 0.07633932819488058, + "grad_norm": 1.585046648979187, + "learning_rate": 4.928458758826967e-05, + "loss": 5.4281, + "step": 12836 + }, + { + "epoch": 0.07634527547816157, + "grad_norm": 1.7124192714691162, + "learning_rate": 4.928447664042008e-05, + "loss": 5.4921, + "step": 12837 + }, + { + "epoch": 0.07635122276144257, + "grad_norm": 1.5693449974060059, + "learning_rate": 4.928436568409304e-05, + "loss": 5.5729, + "step": 12838 + }, + { + "epoch": 0.07635717004472357, + "grad_norm": 2.072880506515503, + "learning_rate": 4.928425471928857e-05, + "loss": 5.1023, + "step": 12839 + }, + { + "epoch": 0.07636311732800456, + "grad_norm": 1.674325704574585, + "learning_rate": 4.928414374600672e-05, + "loss": 5.5319, + "step": 12840 + }, + { + "epoch": 0.07636906461128556, + "grad_norm": 1.3941127061843872, + "learning_rate": 4.9284032764247523e-05, + "loss": 5.4425, + "step": 12841 + }, + { + "epoch": 0.07637501189456657, + "grad_norm": 1.670743703842163, + "learning_rate": 4.9283921774011025e-05, + "loss": 5.2595, + "step": 12842 + }, + { + "epoch": 0.07638095917784755, + "grad_norm": 2.852534294128418, + "learning_rate": 4.928381077529726e-05, + "loss": 5.321, + "step": 12843 + }, + { + "epoch": 0.07638690646112856, + "grad_norm": 1.930977463722229, + "learning_rate": 4.928369976810626e-05, + "loss": 5.2649, + "step": 12844 + }, + { + "epoch": 0.07639285374440956, + "grad_norm": 1.8886314630508423, + "learning_rate": 4.928358875243808e-05, + "loss": 5.1882, + "step": 12845 + }, + { + "epoch": 0.07639880102769055, + "grad_norm": 1.793514609336853, + "learning_rate": 4.9283477728292745e-05, + "loss": 5.0946, + "step": 12846 + }, + { + "epoch": 0.07640474831097155, + "grad_norm": 1.8616431951522827, + "learning_rate": 4.9283366695670304e-05, + "loss": 5.1097, + "step": 12847 + }, + { + "epoch": 0.07641069559425255, + "grad_norm": 1.9281915426254272, + "learning_rate": 4.9283255654570785e-05, + "loss": 5.0054, + "step": 12848 + }, + { + "epoch": 0.07641664287753354, + "grad_norm": 2.036522150039673, + "learning_rate": 4.9283144604994234e-05, + "loss": 4.9115, + "step": 12849 + }, + { + "epoch": 0.07642259016081454, + "grad_norm": 1.7962864637374878, + "learning_rate": 4.928303354694069e-05, + "loss": 4.8951, + "step": 12850 + }, + { + "epoch": 0.07642853744409554, + "grad_norm": 2.1671249866485596, + "learning_rate": 4.9282922480410195e-05, + "loss": 5.1393, + "step": 12851 + }, + { + "epoch": 0.07643448472737653, + "grad_norm": 1.9870150089263916, + "learning_rate": 4.9282811405402774e-05, + "loss": 5.5572, + "step": 12852 + }, + { + "epoch": 0.07644043201065753, + "grad_norm": 2.1498360633850098, + "learning_rate": 4.928270032191847e-05, + "loss": 5.7031, + "step": 12853 + }, + { + "epoch": 0.07644637929393854, + "grad_norm": 2.06821870803833, + "learning_rate": 4.928258922995734e-05, + "loss": 5.723, + "step": 12854 + }, + { + "epoch": 0.07645232657721952, + "grad_norm": 2.283720016479492, + "learning_rate": 4.92824781295194e-05, + "loss": 5.2129, + "step": 12855 + }, + { + "epoch": 0.07645827386050053, + "grad_norm": 2.1862099170684814, + "learning_rate": 4.9282367020604704e-05, + "loss": 4.7535, + "step": 12856 + }, + { + "epoch": 0.07646422114378153, + "grad_norm": 1.7297099828720093, + "learning_rate": 4.928225590321328e-05, + "loss": 5.1965, + "step": 12857 + }, + { + "epoch": 0.07647016842706252, + "grad_norm": 2.0406720638275146, + "learning_rate": 4.9282144777345176e-05, + "loss": 5.289, + "step": 12858 + }, + { + "epoch": 0.07647611571034352, + "grad_norm": 1.8368127346038818, + "learning_rate": 4.928203364300042e-05, + "loss": 5.5448, + "step": 12859 + }, + { + "epoch": 0.0764820629936245, + "grad_norm": 1.837804913520813, + "learning_rate": 4.9281922500179054e-05, + "loss": 5.5284, + "step": 12860 + }, + { + "epoch": 0.07648801027690551, + "grad_norm": 1.7191063165664673, + "learning_rate": 4.928181134888113e-05, + "loss": 5.8212, + "step": 12861 + }, + { + "epoch": 0.07649395756018651, + "grad_norm": 1.757323980331421, + "learning_rate": 4.928170018910667e-05, + "loss": 5.8421, + "step": 12862 + }, + { + "epoch": 0.0764999048434675, + "grad_norm": 1.9213273525238037, + "learning_rate": 4.928158902085572e-05, + "loss": 5.1923, + "step": 12863 + }, + { + "epoch": 0.0765058521267485, + "grad_norm": 1.888006567955017, + "learning_rate": 4.928147784412832e-05, + "loss": 5.4282, + "step": 12864 + }, + { + "epoch": 0.0765117994100295, + "grad_norm": 1.555870771408081, + "learning_rate": 4.9281366658924506e-05, + "loss": 5.8256, + "step": 12865 + }, + { + "epoch": 0.07651774669331049, + "grad_norm": 1.8194485902786255, + "learning_rate": 4.9281255465244314e-05, + "loss": 5.5886, + "step": 12866 + }, + { + "epoch": 0.07652369397659149, + "grad_norm": 1.7867372035980225, + "learning_rate": 4.9281144263087795e-05, + "loss": 5.4818, + "step": 12867 + }, + { + "epoch": 0.0765296412598725, + "grad_norm": 1.8511155843734741, + "learning_rate": 4.928103305245497e-05, + "loss": 5.519, + "step": 12868 + }, + { + "epoch": 0.07653558854315348, + "grad_norm": 2.728428602218628, + "learning_rate": 4.928092183334589e-05, + "loss": 5.0085, + "step": 12869 + }, + { + "epoch": 0.07654153582643448, + "grad_norm": 2.5393402576446533, + "learning_rate": 4.92808106057606e-05, + "loss": 5.0862, + "step": 12870 + }, + { + "epoch": 0.07654748310971549, + "grad_norm": 2.494248151779175, + "learning_rate": 4.928069936969912e-05, + "loss": 5.5557, + "step": 12871 + }, + { + "epoch": 0.07655343039299647, + "grad_norm": 2.4287991523742676, + "learning_rate": 4.9280588125161496e-05, + "loss": 5.6646, + "step": 12872 + }, + { + "epoch": 0.07655937767627748, + "grad_norm": 2.188556432723999, + "learning_rate": 4.928047687214778e-05, + "loss": 5.6618, + "step": 12873 + }, + { + "epoch": 0.07656532495955848, + "grad_norm": 2.7367382049560547, + "learning_rate": 4.9280365610657996e-05, + "loss": 4.6788, + "step": 12874 + }, + { + "epoch": 0.07657127224283947, + "grad_norm": 2.492922067642212, + "learning_rate": 4.9280254340692187e-05, + "loss": 4.4132, + "step": 12875 + }, + { + "epoch": 0.07657721952612047, + "grad_norm": 2.361133575439453, + "learning_rate": 4.928014306225039e-05, + "loss": 4.3957, + "step": 12876 + }, + { + "epoch": 0.07658316680940147, + "grad_norm": 2.652127742767334, + "learning_rate": 4.9280031775332646e-05, + "loss": 4.4568, + "step": 12877 + }, + { + "epoch": 0.07658911409268246, + "grad_norm": 2.40895938873291, + "learning_rate": 4.9279920479938995e-05, + "loss": 4.6276, + "step": 12878 + }, + { + "epoch": 0.07659506137596346, + "grad_norm": 1.9418548345565796, + "learning_rate": 4.927980917606948e-05, + "loss": 5.6008, + "step": 12879 + }, + { + "epoch": 0.07660100865924446, + "grad_norm": 1.7706143856048584, + "learning_rate": 4.9279697863724125e-05, + "loss": 5.4946, + "step": 12880 + }, + { + "epoch": 0.07660695594252545, + "grad_norm": 2.856342077255249, + "learning_rate": 4.9279586542902986e-05, + "loss": 4.9182, + "step": 12881 + }, + { + "epoch": 0.07661290322580645, + "grad_norm": 2.713515043258667, + "learning_rate": 4.927947521360608e-05, + "loss": 5.2341, + "step": 12882 + }, + { + "epoch": 0.07661885050908745, + "grad_norm": 2.186169147491455, + "learning_rate": 4.927936387583348e-05, + "loss": 5.1348, + "step": 12883 + }, + { + "epoch": 0.07662479779236844, + "grad_norm": 2.3114492893218994, + "learning_rate": 4.9279252529585195e-05, + "loss": 5.0016, + "step": 12884 + }, + { + "epoch": 0.07663074507564945, + "grad_norm": 2.256502866744995, + "learning_rate": 4.927914117486128e-05, + "loss": 5.1759, + "step": 12885 + }, + { + "epoch": 0.07663669235893045, + "grad_norm": 2.281243324279785, + "learning_rate": 4.927902981166176e-05, + "loss": 5.1437, + "step": 12886 + }, + { + "epoch": 0.07664263964221144, + "grad_norm": 2.3553836345672607, + "learning_rate": 4.927891843998668e-05, + "loss": 5.1622, + "step": 12887 + }, + { + "epoch": 0.07664858692549244, + "grad_norm": 2.420192003250122, + "learning_rate": 4.927880705983609e-05, + "loss": 4.994, + "step": 12888 + }, + { + "epoch": 0.07665453420877343, + "grad_norm": 2.3391306400299072, + "learning_rate": 4.927869567121001e-05, + "loss": 4.9445, + "step": 12889 + }, + { + "epoch": 0.07666048149205443, + "grad_norm": 2.2093355655670166, + "learning_rate": 4.9278584274108484e-05, + "loss": 5.05, + "step": 12890 + }, + { + "epoch": 0.07666642877533543, + "grad_norm": 2.3378305435180664, + "learning_rate": 4.927847286853157e-05, + "loss": 4.8694, + "step": 12891 + }, + { + "epoch": 0.07667237605861642, + "grad_norm": 2.2110583782196045, + "learning_rate": 4.927836145447928e-05, + "loss": 4.8622, + "step": 12892 + }, + { + "epoch": 0.07667832334189742, + "grad_norm": 2.2865991592407227, + "learning_rate": 4.927825003195167e-05, + "loss": 4.9485, + "step": 12893 + }, + { + "epoch": 0.07668427062517842, + "grad_norm": 2.343135356903076, + "learning_rate": 4.927813860094878e-05, + "loss": 4.8874, + "step": 12894 + }, + { + "epoch": 0.07669021790845941, + "grad_norm": 2.1939613819122314, + "learning_rate": 4.927802716147063e-05, + "loss": 4.8349, + "step": 12895 + }, + { + "epoch": 0.07669616519174041, + "grad_norm": 2.866560697555542, + "learning_rate": 4.927791571351728e-05, + "loss": 5.1409, + "step": 12896 + }, + { + "epoch": 0.07670211247502141, + "grad_norm": 2.1052801609039307, + "learning_rate": 4.927780425708876e-05, + "loss": 5.3716, + "step": 12897 + }, + { + "epoch": 0.0767080597583024, + "grad_norm": 2.141184091567993, + "learning_rate": 4.9277692792185106e-05, + "loss": 5.2985, + "step": 12898 + }, + { + "epoch": 0.0767140070415834, + "grad_norm": 1.93148934841156, + "learning_rate": 4.927758131880636e-05, + "loss": 5.6222, + "step": 12899 + }, + { + "epoch": 0.0767199543248644, + "grad_norm": 1.8454651832580566, + "learning_rate": 4.927746983695256e-05, + "loss": 5.6966, + "step": 12900 + }, + { + "epoch": 0.0767259016081454, + "grad_norm": 1.764281153678894, + "learning_rate": 4.9277358346623746e-05, + "loss": 5.4979, + "step": 12901 + }, + { + "epoch": 0.0767318488914264, + "grad_norm": 1.6969131231307983, + "learning_rate": 4.9277246847819965e-05, + "loss": 5.5221, + "step": 12902 + }, + { + "epoch": 0.0767377961747074, + "grad_norm": 1.7118967771530151, + "learning_rate": 4.927713534054124e-05, + "loss": 5.6067, + "step": 12903 + }, + { + "epoch": 0.07674374345798839, + "grad_norm": 2.1508536338806152, + "learning_rate": 4.9277023824787625e-05, + "loss": 5.8241, + "step": 12904 + }, + { + "epoch": 0.07674969074126939, + "grad_norm": 1.8613126277923584, + "learning_rate": 4.927691230055914e-05, + "loss": 5.7141, + "step": 12905 + }, + { + "epoch": 0.07675563802455039, + "grad_norm": 1.8942763805389404, + "learning_rate": 4.927680076785585e-05, + "loss": 5.6909, + "step": 12906 + }, + { + "epoch": 0.07676158530783138, + "grad_norm": 1.8824634552001953, + "learning_rate": 4.927668922667777e-05, + "loss": 5.5055, + "step": 12907 + }, + { + "epoch": 0.07676753259111238, + "grad_norm": 1.8920915126800537, + "learning_rate": 4.927657767702495e-05, + "loss": 5.1783, + "step": 12908 + }, + { + "epoch": 0.07677347987439338, + "grad_norm": 1.8226712942123413, + "learning_rate": 4.927646611889743e-05, + "loss": 5.7529, + "step": 12909 + }, + { + "epoch": 0.07677942715767437, + "grad_norm": 1.88478684425354, + "learning_rate": 4.9276354552295245e-05, + "loss": 5.7034, + "step": 12910 + }, + { + "epoch": 0.07678537444095537, + "grad_norm": 1.6312634944915771, + "learning_rate": 4.927624297721844e-05, + "loss": 5.6476, + "step": 12911 + }, + { + "epoch": 0.07679132172423637, + "grad_norm": 1.5183994770050049, + "learning_rate": 4.927613139366704e-05, + "loss": 5.8517, + "step": 12912 + }, + { + "epoch": 0.07679726900751736, + "grad_norm": 1.6718844175338745, + "learning_rate": 4.92760198016411e-05, + "loss": 5.9619, + "step": 12913 + }, + { + "epoch": 0.07680321629079837, + "grad_norm": 2.575932741165161, + "learning_rate": 4.9275908201140654e-05, + "loss": 5.6903, + "step": 12914 + }, + { + "epoch": 0.07680916357407937, + "grad_norm": 2.2863197326660156, + "learning_rate": 4.927579659216574e-05, + "loss": 5.7517, + "step": 12915 + }, + { + "epoch": 0.07681511085736036, + "grad_norm": 2.231417417526245, + "learning_rate": 4.9275684974716384e-05, + "loss": 5.2323, + "step": 12916 + }, + { + "epoch": 0.07682105814064136, + "grad_norm": 1.9159691333770752, + "learning_rate": 4.927557334879265e-05, + "loss": 5.2548, + "step": 12917 + }, + { + "epoch": 0.07682700542392235, + "grad_norm": 1.6682984828948975, + "learning_rate": 4.927546171439455e-05, + "loss": 5.4639, + "step": 12918 + }, + { + "epoch": 0.07683295270720335, + "grad_norm": 2.1923654079437256, + "learning_rate": 4.927535007152215e-05, + "loss": 5.6016, + "step": 12919 + }, + { + "epoch": 0.07683889999048435, + "grad_norm": 2.2393245697021484, + "learning_rate": 4.9275238420175474e-05, + "loss": 5.9433, + "step": 12920 + }, + { + "epoch": 0.07684484727376534, + "grad_norm": 1.8611164093017578, + "learning_rate": 4.9275126760354565e-05, + "loss": 5.3477, + "step": 12921 + }, + { + "epoch": 0.07685079455704634, + "grad_norm": 1.902567982673645, + "learning_rate": 4.927501509205945e-05, + "loss": 5.4417, + "step": 12922 + }, + { + "epoch": 0.07685674184032734, + "grad_norm": 1.7735011577606201, + "learning_rate": 4.9274903415290184e-05, + "loss": 5.652, + "step": 12923 + }, + { + "epoch": 0.07686268912360833, + "grad_norm": 1.886060357093811, + "learning_rate": 4.927479173004681e-05, + "loss": 5.5927, + "step": 12924 + }, + { + "epoch": 0.07686863640688933, + "grad_norm": 1.8315941095352173, + "learning_rate": 4.927468003632935e-05, + "loss": 5.6559, + "step": 12925 + }, + { + "epoch": 0.07687458369017033, + "grad_norm": 1.7790045738220215, + "learning_rate": 4.927456833413784e-05, + "loss": 5.463, + "step": 12926 + }, + { + "epoch": 0.07688053097345132, + "grad_norm": 1.9559917449951172, + "learning_rate": 4.927445662347234e-05, + "loss": 5.6154, + "step": 12927 + }, + { + "epoch": 0.07688647825673232, + "grad_norm": 1.7274752855300903, + "learning_rate": 4.927434490433287e-05, + "loss": 5.5621, + "step": 12928 + }, + { + "epoch": 0.07689242554001333, + "grad_norm": 1.594190001487732, + "learning_rate": 4.9274233176719486e-05, + "loss": 5.4674, + "step": 12929 + }, + { + "epoch": 0.07689837282329431, + "grad_norm": 1.79281485080719, + "learning_rate": 4.927412144063222e-05, + "loss": 5.5166, + "step": 12930 + }, + { + "epoch": 0.07690432010657532, + "grad_norm": 1.6584967374801636, + "learning_rate": 4.92740096960711e-05, + "loss": 5.4249, + "step": 12931 + }, + { + "epoch": 0.07691026738985632, + "grad_norm": 1.8458021879196167, + "learning_rate": 4.927389794303617e-05, + "loss": 5.6073, + "step": 12932 + }, + { + "epoch": 0.0769162146731373, + "grad_norm": 1.5526570081710815, + "learning_rate": 4.927378618152748e-05, + "loss": 5.3992, + "step": 12933 + }, + { + "epoch": 0.07692216195641831, + "grad_norm": 1.6043710708618164, + "learning_rate": 4.927367441154507e-05, + "loss": 5.3786, + "step": 12934 + }, + { + "epoch": 0.07692810923969931, + "grad_norm": 1.6580268144607544, + "learning_rate": 4.927356263308896e-05, + "loss": 5.5177, + "step": 12935 + }, + { + "epoch": 0.0769340565229803, + "grad_norm": 1.7199897766113281, + "learning_rate": 4.9273450846159194e-05, + "loss": 5.4281, + "step": 12936 + }, + { + "epoch": 0.0769400038062613, + "grad_norm": 1.6920559406280518, + "learning_rate": 4.9273339050755835e-05, + "loss": 5.562, + "step": 12937 + }, + { + "epoch": 0.0769459510895423, + "grad_norm": 1.8027700185775757, + "learning_rate": 4.9273227246878894e-05, + "loss": 5.5473, + "step": 12938 + }, + { + "epoch": 0.07695189837282329, + "grad_norm": 1.6055867671966553, + "learning_rate": 4.927311543452842e-05, + "loss": 5.4903, + "step": 12939 + }, + { + "epoch": 0.07695784565610429, + "grad_norm": 1.5789201259613037, + "learning_rate": 4.9273003613704456e-05, + "loss": 5.4514, + "step": 12940 + }, + { + "epoch": 0.0769637929393853, + "grad_norm": 1.6153863668441772, + "learning_rate": 4.9272891784407034e-05, + "loss": 5.4343, + "step": 12941 + }, + { + "epoch": 0.07696974022266628, + "grad_norm": 1.8802043199539185, + "learning_rate": 4.927277994663619e-05, + "loss": 5.4691, + "step": 12942 + }, + { + "epoch": 0.07697568750594729, + "grad_norm": 1.869836688041687, + "learning_rate": 4.9272668100391984e-05, + "loss": 5.5037, + "step": 12943 + }, + { + "epoch": 0.07698163478922829, + "grad_norm": 1.9082410335540771, + "learning_rate": 4.927255624567443e-05, + "loss": 5.4814, + "step": 12944 + }, + { + "epoch": 0.07698758207250928, + "grad_norm": 1.5890675783157349, + "learning_rate": 4.927244438248358e-05, + "loss": 5.4627, + "step": 12945 + }, + { + "epoch": 0.07699352935579028, + "grad_norm": 1.7432551383972168, + "learning_rate": 4.9272332510819475e-05, + "loss": 5.4301, + "step": 12946 + }, + { + "epoch": 0.07699947663907127, + "grad_norm": 1.7112667560577393, + "learning_rate": 4.927222063068214e-05, + "loss": 5.4028, + "step": 12947 + }, + { + "epoch": 0.07700542392235227, + "grad_norm": 1.7046465873718262, + "learning_rate": 4.9272108742071634e-05, + "loss": 5.4688, + "step": 12948 + }, + { + "epoch": 0.07701137120563327, + "grad_norm": 1.6928964853286743, + "learning_rate": 4.927199684498798e-05, + "loss": 5.4553, + "step": 12949 + }, + { + "epoch": 0.07701731848891426, + "grad_norm": 1.8731732368469238, + "learning_rate": 4.927188493943122e-05, + "loss": 5.3542, + "step": 12950 + }, + { + "epoch": 0.07702326577219526, + "grad_norm": 1.6586295366287231, + "learning_rate": 4.92717730254014e-05, + "loss": 5.2852, + "step": 12951 + }, + { + "epoch": 0.07702921305547626, + "grad_norm": 1.724252462387085, + "learning_rate": 4.927166110289855e-05, + "loss": 5.3982, + "step": 12952 + }, + { + "epoch": 0.07703516033875725, + "grad_norm": 1.7133373022079468, + "learning_rate": 4.9271549171922716e-05, + "loss": 5.3642, + "step": 12953 + }, + { + "epoch": 0.07704110762203825, + "grad_norm": 1.779291033744812, + "learning_rate": 4.927143723247394e-05, + "loss": 5.3949, + "step": 12954 + }, + { + "epoch": 0.07704705490531925, + "grad_norm": 1.8439239263534546, + "learning_rate": 4.927132528455225e-05, + "loss": 5.3829, + "step": 12955 + }, + { + "epoch": 0.07705300218860024, + "grad_norm": 1.7440255880355835, + "learning_rate": 4.927121332815769e-05, + "loss": 5.3881, + "step": 12956 + }, + { + "epoch": 0.07705894947188124, + "grad_norm": 1.8459028005599976, + "learning_rate": 4.927110136329031e-05, + "loss": 5.3575, + "step": 12957 + }, + { + "epoch": 0.07706489675516225, + "grad_norm": 2.8051815032958984, + "learning_rate": 4.927098938995013e-05, + "loss": 5.2814, + "step": 12958 + }, + { + "epoch": 0.07707084403844323, + "grad_norm": 1.8814127445220947, + "learning_rate": 4.9270877408137194e-05, + "loss": 5.3614, + "step": 12959 + }, + { + "epoch": 0.07707679132172424, + "grad_norm": 1.570408821105957, + "learning_rate": 4.927076541785156e-05, + "loss": 5.3453, + "step": 12960 + }, + { + "epoch": 0.07708273860500524, + "grad_norm": 1.607393741607666, + "learning_rate": 4.927065341909324e-05, + "loss": 5.4766, + "step": 12961 + }, + { + "epoch": 0.07708868588828623, + "grad_norm": 1.475420594215393, + "learning_rate": 4.927054141186229e-05, + "loss": 5.4511, + "step": 12962 + }, + { + "epoch": 0.07709463317156723, + "grad_norm": 1.7785848379135132, + "learning_rate": 4.927042939615875e-05, + "loss": 5.3839, + "step": 12963 + }, + { + "epoch": 0.07710058045484823, + "grad_norm": 1.7313402891159058, + "learning_rate": 4.9270317371982645e-05, + "loss": 5.3398, + "step": 12964 + }, + { + "epoch": 0.07710652773812922, + "grad_norm": 1.666938066482544, + "learning_rate": 4.927020533933403e-05, + "loss": 5.4462, + "step": 12965 + }, + { + "epoch": 0.07711247502141022, + "grad_norm": 1.5219112634658813, + "learning_rate": 4.9270093298212933e-05, + "loss": 5.7593, + "step": 12966 + }, + { + "epoch": 0.07711842230469122, + "grad_norm": 2.0760631561279297, + "learning_rate": 4.92699812486194e-05, + "loss": 5.5765, + "step": 12967 + }, + { + "epoch": 0.07712436958797221, + "grad_norm": 1.7648851871490479, + "learning_rate": 4.926986919055346e-05, + "loss": 5.8786, + "step": 12968 + }, + { + "epoch": 0.07713031687125321, + "grad_norm": 1.832141399383545, + "learning_rate": 4.926975712401517e-05, + "loss": 5.6695, + "step": 12969 + }, + { + "epoch": 0.07713626415453421, + "grad_norm": 1.9032765626907349, + "learning_rate": 4.926964504900455e-05, + "loss": 5.701, + "step": 12970 + }, + { + "epoch": 0.0771422114378152, + "grad_norm": 1.7294973134994507, + "learning_rate": 4.9269532965521656e-05, + "loss": 5.6569, + "step": 12971 + }, + { + "epoch": 0.0771481587210962, + "grad_norm": 1.927510142326355, + "learning_rate": 4.926942087356651e-05, + "loss": 5.1289, + "step": 12972 + }, + { + "epoch": 0.07715410600437721, + "grad_norm": 1.6945842504501343, + "learning_rate": 4.926930877313917e-05, + "loss": 5.5703, + "step": 12973 + }, + { + "epoch": 0.0771600532876582, + "grad_norm": 1.7665363550186157, + "learning_rate": 4.926919666423966e-05, + "loss": 5.822, + "step": 12974 + }, + { + "epoch": 0.0771660005709392, + "grad_norm": 1.5802277326583862, + "learning_rate": 4.926908454686801e-05, + "loss": 5.5438, + "step": 12975 + }, + { + "epoch": 0.07717194785422019, + "grad_norm": 1.9065684080123901, + "learning_rate": 4.9268972421024295e-05, + "loss": 5.5556, + "step": 12976 + }, + { + "epoch": 0.07717789513750119, + "grad_norm": 1.7630208730697632, + "learning_rate": 4.9268860286708526e-05, + "loss": 5.6079, + "step": 12977 + }, + { + "epoch": 0.07718384242078219, + "grad_norm": 1.6295850276947021, + "learning_rate": 4.9268748143920746e-05, + "loss": 5.6163, + "step": 12978 + }, + { + "epoch": 0.07718978970406318, + "grad_norm": 1.753202199935913, + "learning_rate": 4.926863599266099e-05, + "loss": 5.549, + "step": 12979 + }, + { + "epoch": 0.07719573698734418, + "grad_norm": 1.7823643684387207, + "learning_rate": 4.9268523832929314e-05, + "loss": 5.6917, + "step": 12980 + }, + { + "epoch": 0.07720168427062518, + "grad_norm": 1.7990792989730835, + "learning_rate": 4.926841166472574e-05, + "loss": 5.5897, + "step": 12981 + }, + { + "epoch": 0.07720763155390617, + "grad_norm": 1.7813109159469604, + "learning_rate": 4.926829948805033e-05, + "loss": 5.5953, + "step": 12982 + }, + { + "epoch": 0.07721357883718717, + "grad_norm": 1.7127541303634644, + "learning_rate": 4.926818730290309e-05, + "loss": 5.5476, + "step": 12983 + }, + { + "epoch": 0.07721952612046817, + "grad_norm": 2.0513558387756348, + "learning_rate": 4.9268075109284084e-05, + "loss": 5.5721, + "step": 12984 + }, + { + "epoch": 0.07722547340374916, + "grad_norm": 1.8053756952285767, + "learning_rate": 4.9267962907193346e-05, + "loss": 5.5344, + "step": 12985 + }, + { + "epoch": 0.07723142068703016, + "grad_norm": 1.7184503078460693, + "learning_rate": 4.9267850696630904e-05, + "loss": 5.602, + "step": 12986 + }, + { + "epoch": 0.07723736797031117, + "grad_norm": 1.8753174543380737, + "learning_rate": 4.926773847759682e-05, + "loss": 5.701, + "step": 12987 + }, + { + "epoch": 0.07724331525359215, + "grad_norm": 1.7761272192001343, + "learning_rate": 4.9267626250091106e-05, + "loss": 5.5026, + "step": 12988 + }, + { + "epoch": 0.07724926253687316, + "grad_norm": 1.6833654642105103, + "learning_rate": 4.926751401411381e-05, + "loss": 5.5615, + "step": 12989 + }, + { + "epoch": 0.07725520982015416, + "grad_norm": 1.8640247583389282, + "learning_rate": 4.926740176966499e-05, + "loss": 5.8367, + "step": 12990 + }, + { + "epoch": 0.07726115710343515, + "grad_norm": 2.036540985107422, + "learning_rate": 4.9267289516744665e-05, + "loss": 5.6258, + "step": 12991 + }, + { + "epoch": 0.07726710438671615, + "grad_norm": 2.0168917179107666, + "learning_rate": 4.926717725535288e-05, + "loss": 5.1961, + "step": 12992 + }, + { + "epoch": 0.07727305166999715, + "grad_norm": 2.149548292160034, + "learning_rate": 4.9267064985489674e-05, + "loss": 5.1735, + "step": 12993 + }, + { + "epoch": 0.07727899895327814, + "grad_norm": 1.7929832935333252, + "learning_rate": 4.926695270715508e-05, + "loss": 5.6889, + "step": 12994 + }, + { + "epoch": 0.07728494623655914, + "grad_norm": 1.7964575290679932, + "learning_rate": 4.926684042034916e-05, + "loss": 5.0576, + "step": 12995 + }, + { + "epoch": 0.07729089351984014, + "grad_norm": 1.8207305669784546, + "learning_rate": 4.926672812507192e-05, + "loss": 5.2703, + "step": 12996 + }, + { + "epoch": 0.07729684080312113, + "grad_norm": 1.6263490915298462, + "learning_rate": 4.9266615821323425e-05, + "loss": 5.5999, + "step": 12997 + }, + { + "epoch": 0.07730278808640213, + "grad_norm": 2.0018131732940674, + "learning_rate": 4.92665035091037e-05, + "loss": 4.9439, + "step": 12998 + }, + { + "epoch": 0.07730873536968313, + "grad_norm": 2.32818341255188, + "learning_rate": 4.926639118841279e-05, + "loss": 4.6071, + "step": 12999 + }, + { + "epoch": 0.07731468265296412, + "grad_norm": 2.3354949951171875, + "learning_rate": 4.926627885925074e-05, + "loss": 4.6642, + "step": 13000 + }, + { + "epoch": 0.07732062993624512, + "grad_norm": 1.71230149269104, + "learning_rate": 4.926616652161757e-05, + "loss": 5.161, + "step": 13001 + }, + { + "epoch": 0.07732657721952613, + "grad_norm": 1.4890326261520386, + "learning_rate": 4.9266054175513345e-05, + "loss": 5.1714, + "step": 13002 + }, + { + "epoch": 0.07733252450280712, + "grad_norm": 1.5844224691390991, + "learning_rate": 4.926594182093809e-05, + "loss": 4.869, + "step": 13003 + }, + { + "epoch": 0.07733847178608812, + "grad_norm": 2.328636884689331, + "learning_rate": 4.926582945789185e-05, + "loss": 5.1571, + "step": 13004 + }, + { + "epoch": 0.0773444190693691, + "grad_norm": 2.067760467529297, + "learning_rate": 4.926571708637464e-05, + "loss": 5.4416, + "step": 13005 + }, + { + "epoch": 0.07735036635265011, + "grad_norm": 1.7148468494415283, + "learning_rate": 4.926560470638653e-05, + "loss": 5.464, + "step": 13006 + }, + { + "epoch": 0.07735631363593111, + "grad_norm": 1.6869080066680908, + "learning_rate": 4.926549231792755e-05, + "loss": 5.5537, + "step": 13007 + }, + { + "epoch": 0.0773622609192121, + "grad_norm": 2.239408254623413, + "learning_rate": 4.9265379920997735e-05, + "loss": 5.1551, + "step": 13008 + }, + { + "epoch": 0.0773682082024931, + "grad_norm": 2.4059038162231445, + "learning_rate": 4.926526751559713e-05, + "loss": 5.2639, + "step": 13009 + }, + { + "epoch": 0.0773741554857741, + "grad_norm": 2.0787813663482666, + "learning_rate": 4.926515510172577e-05, + "loss": 5.3485, + "step": 13010 + }, + { + "epoch": 0.07738010276905509, + "grad_norm": 1.912137508392334, + "learning_rate": 4.9265042679383685e-05, + "loss": 5.551, + "step": 13011 + }, + { + "epoch": 0.07738605005233609, + "grad_norm": 2.0865983963012695, + "learning_rate": 4.926493024857094e-05, + "loss": 5.0343, + "step": 13012 + }, + { + "epoch": 0.0773919973356171, + "grad_norm": 1.9341247081756592, + "learning_rate": 4.926481780928754e-05, + "loss": 5.5904, + "step": 13013 + }, + { + "epoch": 0.07739794461889808, + "grad_norm": 1.7777684926986694, + "learning_rate": 4.926470536153356e-05, + "loss": 5.5396, + "step": 13014 + }, + { + "epoch": 0.07740389190217908, + "grad_norm": 1.7952098846435547, + "learning_rate": 4.926459290530902e-05, + "loss": 5.3212, + "step": 13015 + }, + { + "epoch": 0.07740983918546009, + "grad_norm": 1.7674907445907593, + "learning_rate": 4.926448044061396e-05, + "loss": 5.3316, + "step": 13016 + }, + { + "epoch": 0.07741578646874107, + "grad_norm": 1.8327823877334595, + "learning_rate": 4.926436796744841e-05, + "loss": 5.3129, + "step": 13017 + }, + { + "epoch": 0.07742173375202208, + "grad_norm": 1.613867998123169, + "learning_rate": 4.9264255485812425e-05, + "loss": 5.4935, + "step": 13018 + }, + { + "epoch": 0.07742768103530308, + "grad_norm": 1.7167906761169434, + "learning_rate": 4.9264142995706044e-05, + "loss": 5.3054, + "step": 13019 + }, + { + "epoch": 0.07743362831858407, + "grad_norm": 2.272038698196411, + "learning_rate": 4.92640304971293e-05, + "loss": 5.1327, + "step": 13020 + }, + { + "epoch": 0.07743957560186507, + "grad_norm": 1.6358660459518433, + "learning_rate": 4.926391799008223e-05, + "loss": 5.3285, + "step": 13021 + }, + { + "epoch": 0.07744552288514607, + "grad_norm": 2.166813373565674, + "learning_rate": 4.926380547456488e-05, + "loss": 5.2846, + "step": 13022 + }, + { + "epoch": 0.07745147016842706, + "grad_norm": 2.3251235485076904, + "learning_rate": 4.926369295057729e-05, + "loss": 5.2482, + "step": 13023 + }, + { + "epoch": 0.07745741745170806, + "grad_norm": 1.9402974843978882, + "learning_rate": 4.926358041811949e-05, + "loss": 5.3514, + "step": 13024 + }, + { + "epoch": 0.07746336473498906, + "grad_norm": 2.1346986293792725, + "learning_rate": 4.9263467877191525e-05, + "loss": 5.1912, + "step": 13025 + }, + { + "epoch": 0.07746931201827005, + "grad_norm": 2.0809762477874756, + "learning_rate": 4.926335532779344e-05, + "loss": 5.0547, + "step": 13026 + }, + { + "epoch": 0.07747525930155105, + "grad_norm": 2.110558032989502, + "learning_rate": 4.9263242769925256e-05, + "loss": 5.2177, + "step": 13027 + }, + { + "epoch": 0.07748120658483205, + "grad_norm": 2.3498575687408447, + "learning_rate": 4.926313020358704e-05, + "loss": 4.9997, + "step": 13028 + }, + { + "epoch": 0.07748715386811304, + "grad_norm": 2.4052765369415283, + "learning_rate": 4.92630176287788e-05, + "loss": 4.9736, + "step": 13029 + }, + { + "epoch": 0.07749310115139404, + "grad_norm": 2.3132238388061523, + "learning_rate": 4.9262905045500603e-05, + "loss": 4.9149, + "step": 13030 + }, + { + "epoch": 0.07749904843467505, + "grad_norm": 2.315483331680298, + "learning_rate": 4.926279245375247e-05, + "loss": 4.9096, + "step": 13031 + }, + { + "epoch": 0.07750499571795604, + "grad_norm": 2.0887367725372314, + "learning_rate": 4.926267985353445e-05, + "loss": 5.3274, + "step": 13032 + }, + { + "epoch": 0.07751094300123704, + "grad_norm": 2.3138368129730225, + "learning_rate": 4.926256724484658e-05, + "loss": 4.8627, + "step": 13033 + }, + { + "epoch": 0.07751689028451804, + "grad_norm": 2.348411798477173, + "learning_rate": 4.926245462768889e-05, + "loss": 4.9815, + "step": 13034 + }, + { + "epoch": 0.07752283756779903, + "grad_norm": 1.7357233762741089, + "learning_rate": 4.926234200206144e-05, + "loss": 5.2836, + "step": 13035 + }, + { + "epoch": 0.07752878485108003, + "grad_norm": 1.8633183240890503, + "learning_rate": 4.9262229367964255e-05, + "loss": 5.1838, + "step": 13036 + }, + { + "epoch": 0.07753473213436102, + "grad_norm": 1.736359715461731, + "learning_rate": 4.926211672539737e-05, + "loss": 5.6746, + "step": 13037 + }, + { + "epoch": 0.07754067941764202, + "grad_norm": 2.368511915206909, + "learning_rate": 4.9262004074360834e-05, + "loss": 4.5786, + "step": 13038 + }, + { + "epoch": 0.07754662670092302, + "grad_norm": 1.859297752380371, + "learning_rate": 4.926189141485468e-05, + "loss": 5.8459, + "step": 13039 + }, + { + "epoch": 0.07755257398420401, + "grad_norm": 2.2050845623016357, + "learning_rate": 4.9261778746878955e-05, + "loss": 5.8982, + "step": 13040 + }, + { + "epoch": 0.07755852126748501, + "grad_norm": 1.7485835552215576, + "learning_rate": 4.926166607043369e-05, + "loss": 5.789, + "step": 13041 + }, + { + "epoch": 0.07756446855076601, + "grad_norm": 1.7780888080596924, + "learning_rate": 4.9261553385518936e-05, + "loss": 5.48, + "step": 13042 + }, + { + "epoch": 0.077570415834047, + "grad_norm": 1.8764269351959229, + "learning_rate": 4.9261440692134716e-05, + "loss": 5.093, + "step": 13043 + }, + { + "epoch": 0.077576363117328, + "grad_norm": 1.784196376800537, + "learning_rate": 4.926132799028108e-05, + "loss": 5.4335, + "step": 13044 + }, + { + "epoch": 0.077582310400609, + "grad_norm": 2.173844337463379, + "learning_rate": 4.926121527995806e-05, + "loss": 4.5078, + "step": 13045 + }, + { + "epoch": 0.07758825768389, + "grad_norm": 2.410778045654297, + "learning_rate": 4.9261102561165705e-05, + "loss": 5.2113, + "step": 13046 + }, + { + "epoch": 0.077594204967171, + "grad_norm": 2.0470073223114014, + "learning_rate": 4.9260989833904057e-05, + "loss": 5.4695, + "step": 13047 + }, + { + "epoch": 0.077600152250452, + "grad_norm": 1.619314193725586, + "learning_rate": 4.926087709817314e-05, + "loss": 5.8778, + "step": 13048 + }, + { + "epoch": 0.07760609953373299, + "grad_norm": 2.2353031635284424, + "learning_rate": 4.9260764353973e-05, + "loss": 5.2482, + "step": 13049 + }, + { + "epoch": 0.07761204681701399, + "grad_norm": 2.0858941078186035, + "learning_rate": 4.926065160130369e-05, + "loss": 5.2752, + "step": 13050 + }, + { + "epoch": 0.07761799410029499, + "grad_norm": 2.275660514831543, + "learning_rate": 4.926053884016522e-05, + "loss": 5.004, + "step": 13051 + }, + { + "epoch": 0.07762394138357598, + "grad_norm": 1.9338358640670776, + "learning_rate": 4.926042607055765e-05, + "loss": 5.4688, + "step": 13052 + }, + { + "epoch": 0.07762988866685698, + "grad_norm": 1.7377573251724243, + "learning_rate": 4.926031329248103e-05, + "loss": 5.6429, + "step": 13053 + }, + { + "epoch": 0.07763583595013798, + "grad_norm": 1.8915661573410034, + "learning_rate": 4.9260200505935374e-05, + "loss": 5.543, + "step": 13054 + }, + { + "epoch": 0.07764178323341897, + "grad_norm": 1.7961910963058472, + "learning_rate": 4.926008771092073e-05, + "loss": 5.4245, + "step": 13055 + }, + { + "epoch": 0.07764773051669997, + "grad_norm": 1.9412139654159546, + "learning_rate": 4.9259974907437145e-05, + "loss": 5.5858, + "step": 13056 + }, + { + "epoch": 0.07765367779998097, + "grad_norm": 2.458508253097534, + "learning_rate": 4.925986209548466e-05, + "loss": 5.3307, + "step": 13057 + }, + { + "epoch": 0.07765962508326196, + "grad_norm": 2.23331880569458, + "learning_rate": 4.92597492750633e-05, + "loss": 5.6979, + "step": 13058 + }, + { + "epoch": 0.07766557236654296, + "grad_norm": 2.38264536857605, + "learning_rate": 4.9259636446173104e-05, + "loss": 5.5771, + "step": 13059 + }, + { + "epoch": 0.07767151964982397, + "grad_norm": 2.0892632007598877, + "learning_rate": 4.925952360881413e-05, + "loss": 5.8596, + "step": 13060 + }, + { + "epoch": 0.07767746693310495, + "grad_norm": 1.82732355594635, + "learning_rate": 4.92594107629864e-05, + "loss": 5.3724, + "step": 13061 + }, + { + "epoch": 0.07768341421638596, + "grad_norm": 1.821089506149292, + "learning_rate": 4.925929790868997e-05, + "loss": 5.6499, + "step": 13062 + }, + { + "epoch": 0.07768936149966696, + "grad_norm": 1.9662789106369019, + "learning_rate": 4.925918504592487e-05, + "loss": 5.5132, + "step": 13063 + }, + { + "epoch": 0.07769530878294795, + "grad_norm": 1.830101490020752, + "learning_rate": 4.925907217469113e-05, + "loss": 5.4492, + "step": 13064 + }, + { + "epoch": 0.07770125606622895, + "grad_norm": 1.8362375497817993, + "learning_rate": 4.9258959294988804e-05, + "loss": 5.8314, + "step": 13065 + }, + { + "epoch": 0.07770720334950994, + "grad_norm": 2.23861026763916, + "learning_rate": 4.9258846406817926e-05, + "loss": 6.2564, + "step": 13066 + }, + { + "epoch": 0.07771315063279094, + "grad_norm": 2.2672650814056396, + "learning_rate": 4.9258733510178536e-05, + "loss": 6.3396, + "step": 13067 + }, + { + "epoch": 0.07771909791607194, + "grad_norm": 1.8667620420455933, + "learning_rate": 4.9258620605070665e-05, + "loss": 5.8509, + "step": 13068 + }, + { + "epoch": 0.07772504519935293, + "grad_norm": 1.7386364936828613, + "learning_rate": 4.925850769149436e-05, + "loss": 5.567, + "step": 13069 + }, + { + "epoch": 0.07773099248263393, + "grad_norm": 1.3638315200805664, + "learning_rate": 4.9258394769449675e-05, + "loss": 5.6892, + "step": 13070 + }, + { + "epoch": 0.07773693976591493, + "grad_norm": 1.7117588520050049, + "learning_rate": 4.9258281838936624e-05, + "loss": 5.461, + "step": 13071 + }, + { + "epoch": 0.07774288704919592, + "grad_norm": 1.7597805261611938, + "learning_rate": 4.925816889995526e-05, + "loss": 5.6783, + "step": 13072 + }, + { + "epoch": 0.07774883433247692, + "grad_norm": 1.8734283447265625, + "learning_rate": 4.9258055952505624e-05, + "loss": 5.633, + "step": 13073 + }, + { + "epoch": 0.07775478161575793, + "grad_norm": 1.5552877187728882, + "learning_rate": 4.9257942996587744e-05, + "loss": 5.8804, + "step": 13074 + }, + { + "epoch": 0.07776072889903891, + "grad_norm": 1.2786669731140137, + "learning_rate": 4.925783003220167e-05, + "loss": 5.3208, + "step": 13075 + }, + { + "epoch": 0.07776667618231992, + "grad_norm": 1.558182954788208, + "learning_rate": 4.925771705934744e-05, + "loss": 5.4023, + "step": 13076 + }, + { + "epoch": 0.07777262346560092, + "grad_norm": 1.3482223749160767, + "learning_rate": 4.925760407802509e-05, + "loss": 5.3879, + "step": 13077 + }, + { + "epoch": 0.0777785707488819, + "grad_norm": 1.5111918449401855, + "learning_rate": 4.925749108823466e-05, + "loss": 5.329, + "step": 13078 + }, + { + "epoch": 0.07778451803216291, + "grad_norm": 1.7119463682174683, + "learning_rate": 4.925737808997619e-05, + "loss": 5.7282, + "step": 13079 + }, + { + "epoch": 0.07779046531544391, + "grad_norm": 1.7753342390060425, + "learning_rate": 4.925726508324972e-05, + "loss": 5.2677, + "step": 13080 + }, + { + "epoch": 0.0777964125987249, + "grad_norm": 1.8957557678222656, + "learning_rate": 4.925715206805529e-05, + "loss": 4.7193, + "step": 13081 + }, + { + "epoch": 0.0778023598820059, + "grad_norm": 2.503037214279175, + "learning_rate": 4.9257039044392935e-05, + "loss": 5.034, + "step": 13082 + }, + { + "epoch": 0.0778083071652869, + "grad_norm": 2.031312942504883, + "learning_rate": 4.92569260122627e-05, + "loss": 5.1982, + "step": 13083 + }, + { + "epoch": 0.07781425444856789, + "grad_norm": 1.8345115184783936, + "learning_rate": 4.9256812971664635e-05, + "loss": 5.6059, + "step": 13084 + }, + { + "epoch": 0.07782020173184889, + "grad_norm": 2.134131669998169, + "learning_rate": 4.925669992259875e-05, + "loss": 5.8174, + "step": 13085 + }, + { + "epoch": 0.0778261490151299, + "grad_norm": 1.9598990678787231, + "learning_rate": 4.9256586865065114e-05, + "loss": 5.76, + "step": 13086 + }, + { + "epoch": 0.07783209629841088, + "grad_norm": 1.8105463981628418, + "learning_rate": 4.925647379906375e-05, + "loss": 5.5112, + "step": 13087 + }, + { + "epoch": 0.07783804358169188, + "grad_norm": 1.5290614366531372, + "learning_rate": 4.9256360724594696e-05, + "loss": 5.7122, + "step": 13088 + }, + { + "epoch": 0.07784399086497289, + "grad_norm": 1.6188294887542725, + "learning_rate": 4.9256247641658005e-05, + "loss": 5.58, + "step": 13089 + }, + { + "epoch": 0.07784993814825387, + "grad_norm": 1.8662221431732178, + "learning_rate": 4.925613455025371e-05, + "loss": 5.4975, + "step": 13090 + }, + { + "epoch": 0.07785588543153488, + "grad_norm": 1.808813452720642, + "learning_rate": 4.925602145038184e-05, + "loss": 5.6704, + "step": 13091 + }, + { + "epoch": 0.07786183271481588, + "grad_norm": 1.776418924331665, + "learning_rate": 4.925590834204245e-05, + "loss": 5.7558, + "step": 13092 + }, + { + "epoch": 0.07786777999809687, + "grad_norm": 1.704537034034729, + "learning_rate": 4.925579522523557e-05, + "loss": 5.6667, + "step": 13093 + }, + { + "epoch": 0.07787372728137787, + "grad_norm": 2.115651845932007, + "learning_rate": 4.9255682099961246e-05, + "loss": 5.5823, + "step": 13094 + }, + { + "epoch": 0.07787967456465886, + "grad_norm": 1.851914882659912, + "learning_rate": 4.9255568966219504e-05, + "loss": 5.6749, + "step": 13095 + }, + { + "epoch": 0.07788562184793986, + "grad_norm": 1.8792526721954346, + "learning_rate": 4.92554558240104e-05, + "loss": 5.8539, + "step": 13096 + }, + { + "epoch": 0.07789156913122086, + "grad_norm": 1.805280327796936, + "learning_rate": 4.925534267333397e-05, + "loss": 5.8522, + "step": 13097 + }, + { + "epoch": 0.07789751641450185, + "grad_norm": 1.7457916736602783, + "learning_rate": 4.925522951419025e-05, + "loss": 5.9419, + "step": 13098 + }, + { + "epoch": 0.07790346369778285, + "grad_norm": 1.6427416801452637, + "learning_rate": 4.925511634657928e-05, + "loss": 5.8924, + "step": 13099 + }, + { + "epoch": 0.07790941098106385, + "grad_norm": 1.7034873962402344, + "learning_rate": 4.9255003170501095e-05, + "loss": 5.8701, + "step": 13100 + }, + { + "epoch": 0.07791535826434484, + "grad_norm": 1.6852953433990479, + "learning_rate": 4.925488998595574e-05, + "loss": 5.771, + "step": 13101 + }, + { + "epoch": 0.07792130554762584, + "grad_norm": 1.6478735208511353, + "learning_rate": 4.9254776792943255e-05, + "loss": 5.4274, + "step": 13102 + }, + { + "epoch": 0.07792725283090685, + "grad_norm": 1.5896925926208496, + "learning_rate": 4.925466359146368e-05, + "loss": 5.8217, + "step": 13103 + }, + { + "epoch": 0.07793320011418783, + "grad_norm": 1.649539828300476, + "learning_rate": 4.9254550381517054e-05, + "loss": 5.7899, + "step": 13104 + }, + { + "epoch": 0.07793914739746884, + "grad_norm": 1.5224459171295166, + "learning_rate": 4.925443716310341e-05, + "loss": 5.7931, + "step": 13105 + }, + { + "epoch": 0.07794509468074984, + "grad_norm": 2.009038209915161, + "learning_rate": 4.9254323936222796e-05, + "loss": 5.854, + "step": 13106 + }, + { + "epoch": 0.07795104196403083, + "grad_norm": 1.5545878410339355, + "learning_rate": 4.9254210700875245e-05, + "loss": 5.7212, + "step": 13107 + }, + { + "epoch": 0.07795698924731183, + "grad_norm": 2.0804193019866943, + "learning_rate": 4.92540974570608e-05, + "loss": 5.7195, + "step": 13108 + }, + { + "epoch": 0.07796293653059283, + "grad_norm": 1.940432071685791, + "learning_rate": 4.92539842047795e-05, + "loss": 5.4998, + "step": 13109 + }, + { + "epoch": 0.07796888381387382, + "grad_norm": 2.3788061141967773, + "learning_rate": 4.925387094403139e-05, + "loss": 5.5975, + "step": 13110 + }, + { + "epoch": 0.07797483109715482, + "grad_norm": 1.6193798780441284, + "learning_rate": 4.92537576748165e-05, + "loss": 5.4489, + "step": 13111 + }, + { + "epoch": 0.07798077838043582, + "grad_norm": 1.7056760787963867, + "learning_rate": 4.9253644397134866e-05, + "loss": 5.5584, + "step": 13112 + }, + { + "epoch": 0.07798672566371681, + "grad_norm": 1.2604116201400757, + "learning_rate": 4.925353111098655e-05, + "loss": 5.5681, + "step": 13113 + }, + { + "epoch": 0.07799267294699781, + "grad_norm": 1.305413842201233, + "learning_rate": 4.925341781637157e-05, + "loss": 5.6966, + "step": 13114 + }, + { + "epoch": 0.07799862023027881, + "grad_norm": 2.6248581409454346, + "learning_rate": 4.9253304513289975e-05, + "loss": 5.3666, + "step": 13115 + }, + { + "epoch": 0.0780045675135598, + "grad_norm": 1.687741994857788, + "learning_rate": 4.92531912017418e-05, + "loss": 5.5511, + "step": 13116 + }, + { + "epoch": 0.0780105147968408, + "grad_norm": 1.5827749967575073, + "learning_rate": 4.9253077881727086e-05, + "loss": 5.3363, + "step": 13117 + }, + { + "epoch": 0.0780164620801218, + "grad_norm": 1.5989108085632324, + "learning_rate": 4.925296455324587e-05, + "loss": 5.472, + "step": 13118 + }, + { + "epoch": 0.0780224093634028, + "grad_norm": 1.5687717199325562, + "learning_rate": 4.9252851216298194e-05, + "loss": 5.6894, + "step": 13119 + }, + { + "epoch": 0.0780283566466838, + "grad_norm": 1.312949538230896, + "learning_rate": 4.9252737870884106e-05, + "loss": 5.6735, + "step": 13120 + }, + { + "epoch": 0.0780343039299648, + "grad_norm": 1.5779353380203247, + "learning_rate": 4.925262451700363e-05, + "loss": 5.3281, + "step": 13121 + }, + { + "epoch": 0.07804025121324579, + "grad_norm": 1.6127909421920776, + "learning_rate": 4.9252511154656825e-05, + "loss": 5.27, + "step": 13122 + }, + { + "epoch": 0.07804619849652679, + "grad_norm": 1.6496199369430542, + "learning_rate": 4.925239778384371e-05, + "loss": 5.4913, + "step": 13123 + }, + { + "epoch": 0.07805214577980778, + "grad_norm": 2.394230842590332, + "learning_rate": 4.925228440456433e-05, + "loss": 5.1788, + "step": 13124 + }, + { + "epoch": 0.07805809306308878, + "grad_norm": 2.169250249862671, + "learning_rate": 4.925217101681873e-05, + "loss": 5.4087, + "step": 13125 + }, + { + "epoch": 0.07806404034636978, + "grad_norm": 2.150338649749756, + "learning_rate": 4.925205762060695e-05, + "loss": 5.5004, + "step": 13126 + }, + { + "epoch": 0.07806998762965077, + "grad_norm": 2.0131516456604004, + "learning_rate": 4.925194421592903e-05, + "loss": 5.5791, + "step": 13127 + }, + { + "epoch": 0.07807593491293177, + "grad_norm": 1.8154455423355103, + "learning_rate": 4.925183080278501e-05, + "loss": 5.5479, + "step": 13128 + }, + { + "epoch": 0.07808188219621277, + "grad_norm": 1.7489157915115356, + "learning_rate": 4.925171738117492e-05, + "loss": 5.7169, + "step": 13129 + }, + { + "epoch": 0.07808782947949376, + "grad_norm": 1.6712158918380737, + "learning_rate": 4.92516039510988e-05, + "loss": 6.0751, + "step": 13130 + }, + { + "epoch": 0.07809377676277476, + "grad_norm": 1.7542296648025513, + "learning_rate": 4.9251490512556706e-05, + "loss": 5.8998, + "step": 13131 + }, + { + "epoch": 0.07809972404605577, + "grad_norm": 1.5962193012237549, + "learning_rate": 4.9251377065548666e-05, + "loss": 5.7781, + "step": 13132 + }, + { + "epoch": 0.07810567132933675, + "grad_norm": 1.783756136894226, + "learning_rate": 4.9251263610074714e-05, + "loss": 5.8384, + "step": 13133 + }, + { + "epoch": 0.07811161861261776, + "grad_norm": 1.6608144044876099, + "learning_rate": 4.92511501461349e-05, + "loss": 5.7603, + "step": 13134 + }, + { + "epoch": 0.07811756589589876, + "grad_norm": 1.8659160137176514, + "learning_rate": 4.925103667372926e-05, + "loss": 5.5039, + "step": 13135 + }, + { + "epoch": 0.07812351317917975, + "grad_norm": 1.591565489768982, + "learning_rate": 4.925092319285783e-05, + "loss": 5.7034, + "step": 13136 + }, + { + "epoch": 0.07812946046246075, + "grad_norm": 1.5772358179092407, + "learning_rate": 4.925080970352066e-05, + "loss": 5.6347, + "step": 13137 + }, + { + "epoch": 0.07813540774574175, + "grad_norm": 1.7196561098098755, + "learning_rate": 4.925069620571778e-05, + "loss": 5.7086, + "step": 13138 + }, + { + "epoch": 0.07814135502902274, + "grad_norm": 1.9582041501998901, + "learning_rate": 4.9250582699449237e-05, + "loss": 5.9774, + "step": 13139 + }, + { + "epoch": 0.07814730231230374, + "grad_norm": 2.0566928386688232, + "learning_rate": 4.9250469184715064e-05, + "loss": 5.8527, + "step": 13140 + }, + { + "epoch": 0.07815324959558474, + "grad_norm": 1.9961296319961548, + "learning_rate": 4.92503556615153e-05, + "loss": 5.65, + "step": 13141 + }, + { + "epoch": 0.07815919687886573, + "grad_norm": 1.672601342201233, + "learning_rate": 4.925024212984999e-05, + "loss": 5.7242, + "step": 13142 + }, + { + "epoch": 0.07816514416214673, + "grad_norm": 1.6791996955871582, + "learning_rate": 4.9250128589719166e-05, + "loss": 5.7365, + "step": 13143 + }, + { + "epoch": 0.07817109144542773, + "grad_norm": 2.4464364051818848, + "learning_rate": 4.925001504112288e-05, + "loss": 4.9673, + "step": 13144 + }, + { + "epoch": 0.07817703872870872, + "grad_norm": 2.0053181648254395, + "learning_rate": 4.9249901484061156e-05, + "loss": 5.7916, + "step": 13145 + }, + { + "epoch": 0.07818298601198972, + "grad_norm": 2.512120246887207, + "learning_rate": 4.924978791853405e-05, + "loss": 5.914, + "step": 13146 + }, + { + "epoch": 0.07818893329527073, + "grad_norm": 2.2429497241973877, + "learning_rate": 4.924967434454159e-05, + "loss": 5.8806, + "step": 13147 + }, + { + "epoch": 0.07819488057855171, + "grad_norm": 1.9966307878494263, + "learning_rate": 4.924956076208381e-05, + "loss": 5.8883, + "step": 13148 + }, + { + "epoch": 0.07820082786183272, + "grad_norm": 2.492926836013794, + "learning_rate": 4.924944717116077e-05, + "loss": 5.361, + "step": 13149 + }, + { + "epoch": 0.07820677514511372, + "grad_norm": 2.050769090652466, + "learning_rate": 4.92493335717725e-05, + "loss": 5.5682, + "step": 13150 + }, + { + "epoch": 0.07821272242839471, + "grad_norm": 2.2797789573669434, + "learning_rate": 4.9249219963919037e-05, + "loss": 5.8695, + "step": 13151 + }, + { + "epoch": 0.07821866971167571, + "grad_norm": 2.1034891605377197, + "learning_rate": 4.924910634760041e-05, + "loss": 4.987, + "step": 13152 + }, + { + "epoch": 0.0782246169949567, + "grad_norm": 1.7718714475631714, + "learning_rate": 4.924899272281669e-05, + "loss": 5.112, + "step": 13153 + }, + { + "epoch": 0.0782305642782377, + "grad_norm": 1.730656385421753, + "learning_rate": 4.9248879089567884e-05, + "loss": 5.6589, + "step": 13154 + }, + { + "epoch": 0.0782365115615187, + "grad_norm": 1.7784979343414307, + "learning_rate": 4.9248765447854054e-05, + "loss": 5.6812, + "step": 13155 + }, + { + "epoch": 0.07824245884479969, + "grad_norm": 1.5646599531173706, + "learning_rate": 4.9248651797675213e-05, + "loss": 5.7598, + "step": 13156 + }, + { + "epoch": 0.07824840612808069, + "grad_norm": 2.6416964530944824, + "learning_rate": 4.924853813903144e-05, + "loss": 5.9888, + "step": 13157 + }, + { + "epoch": 0.0782543534113617, + "grad_norm": 1.978983998298645, + "learning_rate": 4.924842447192274e-05, + "loss": 5.8919, + "step": 13158 + }, + { + "epoch": 0.07826030069464268, + "grad_norm": 2.3622004985809326, + "learning_rate": 4.924831079634916e-05, + "loss": 5.706, + "step": 13159 + }, + { + "epoch": 0.07826624797792368, + "grad_norm": 2.4118547439575195, + "learning_rate": 4.9248197112310754e-05, + "loss": 5.529, + "step": 13160 + }, + { + "epoch": 0.07827219526120469, + "grad_norm": 1.9290462732315063, + "learning_rate": 4.9248083419807554e-05, + "loss": 5.6403, + "step": 13161 + }, + { + "epoch": 0.07827814254448567, + "grad_norm": 1.9591599702835083, + "learning_rate": 4.92479697188396e-05, + "loss": 5.3365, + "step": 13162 + }, + { + "epoch": 0.07828408982776668, + "grad_norm": 1.7800555229187012, + "learning_rate": 4.9247856009406924e-05, + "loss": 6.4051, + "step": 13163 + }, + { + "epoch": 0.07829003711104768, + "grad_norm": 1.8390953540802002, + "learning_rate": 4.924774229150958e-05, + "loss": 5.775, + "step": 13164 + }, + { + "epoch": 0.07829598439432867, + "grad_norm": 1.8265724182128906, + "learning_rate": 4.924762856514759e-05, + "loss": 6.1238, + "step": 13165 + }, + { + "epoch": 0.07830193167760967, + "grad_norm": 1.5573666095733643, + "learning_rate": 4.9247514830321005e-05, + "loss": 5.9823, + "step": 13166 + }, + { + "epoch": 0.07830787896089067, + "grad_norm": 2.2647573947906494, + "learning_rate": 4.924740108702987e-05, + "loss": 5.0975, + "step": 13167 + }, + { + "epoch": 0.07831382624417166, + "grad_norm": 2.509573459625244, + "learning_rate": 4.924728733527422e-05, + "loss": 5.1327, + "step": 13168 + }, + { + "epoch": 0.07831977352745266, + "grad_norm": 2.2974681854248047, + "learning_rate": 4.924717357505408e-05, + "loss": 5.1493, + "step": 13169 + }, + { + "epoch": 0.07832572081073366, + "grad_norm": 1.958938717842102, + "learning_rate": 4.924705980636951e-05, + "loss": 6.0291, + "step": 13170 + }, + { + "epoch": 0.07833166809401465, + "grad_norm": 1.7714133262634277, + "learning_rate": 4.924694602922054e-05, + "loss": 5.9623, + "step": 13171 + }, + { + "epoch": 0.07833761537729565, + "grad_norm": 1.7545043230056763, + "learning_rate": 4.924683224360721e-05, + "loss": 5.9123, + "step": 13172 + }, + { + "epoch": 0.07834356266057665, + "grad_norm": 1.4791491031646729, + "learning_rate": 4.924671844952957e-05, + "loss": 5.8959, + "step": 13173 + }, + { + "epoch": 0.07834950994385764, + "grad_norm": 1.783353567123413, + "learning_rate": 4.924660464698764e-05, + "loss": 5.732, + "step": 13174 + }, + { + "epoch": 0.07835545722713864, + "grad_norm": 1.9444235563278198, + "learning_rate": 4.9246490835981474e-05, + "loss": 5.5167, + "step": 13175 + }, + { + "epoch": 0.07836140451041965, + "grad_norm": 1.9656537771224976, + "learning_rate": 4.924637701651111e-05, + "loss": 5.4557, + "step": 13176 + }, + { + "epoch": 0.07836735179370063, + "grad_norm": 1.8164803981781006, + "learning_rate": 4.9246263188576594e-05, + "loss": 5.44, + "step": 13177 + }, + { + "epoch": 0.07837329907698164, + "grad_norm": 1.8245429992675781, + "learning_rate": 4.9246149352177946e-05, + "loss": 5.2164, + "step": 13178 + }, + { + "epoch": 0.07837924636026264, + "grad_norm": 1.76225745677948, + "learning_rate": 4.924603550731522e-05, + "loss": 5.2325, + "step": 13179 + }, + { + "epoch": 0.07838519364354363, + "grad_norm": 2.052314519882202, + "learning_rate": 4.924592165398846e-05, + "loss": 5.7905, + "step": 13180 + }, + { + "epoch": 0.07839114092682463, + "grad_norm": 1.63084077835083, + "learning_rate": 4.924580779219769e-05, + "loss": 5.2703, + "step": 13181 + }, + { + "epoch": 0.07839708821010562, + "grad_norm": 1.9269503355026245, + "learning_rate": 4.9245693921942965e-05, + "loss": 5.5974, + "step": 13182 + }, + { + "epoch": 0.07840303549338662, + "grad_norm": 2.201376438140869, + "learning_rate": 4.9245580043224315e-05, + "loss": 5.1298, + "step": 13183 + }, + { + "epoch": 0.07840898277666762, + "grad_norm": 2.3778293132781982, + "learning_rate": 4.924546615604179e-05, + "loss": 5.2289, + "step": 13184 + }, + { + "epoch": 0.07841493005994861, + "grad_norm": 2.5284171104431152, + "learning_rate": 4.9245352260395414e-05, + "loss": 5.0038, + "step": 13185 + }, + { + "epoch": 0.07842087734322961, + "grad_norm": 2.230825424194336, + "learning_rate": 4.9245238356285244e-05, + "loss": 5.0699, + "step": 13186 + }, + { + "epoch": 0.07842682462651061, + "grad_norm": 2.1288161277770996, + "learning_rate": 4.924512444371131e-05, + "loss": 5.1093, + "step": 13187 + }, + { + "epoch": 0.0784327719097916, + "grad_norm": 1.912685751914978, + "learning_rate": 4.924501052267365e-05, + "loss": 5.5926, + "step": 13188 + }, + { + "epoch": 0.0784387191930726, + "grad_norm": 2.394078254699707, + "learning_rate": 4.924489659317231e-05, + "loss": 5.129, + "step": 13189 + }, + { + "epoch": 0.0784446664763536, + "grad_norm": 2.7360801696777344, + "learning_rate": 4.924478265520733e-05, + "loss": 4.9682, + "step": 13190 + }, + { + "epoch": 0.0784506137596346, + "grad_norm": 2.4817416667938232, + "learning_rate": 4.924466870877874e-05, + "loss": 5.0193, + "step": 13191 + }, + { + "epoch": 0.0784565610429156, + "grad_norm": 2.5156679153442383, + "learning_rate": 4.92445547538866e-05, + "loss": 5.0044, + "step": 13192 + }, + { + "epoch": 0.0784625083261966, + "grad_norm": 2.519080638885498, + "learning_rate": 4.924444079053092e-05, + "loss": 5.0109, + "step": 13193 + }, + { + "epoch": 0.07846845560947759, + "grad_norm": 2.3944201469421387, + "learning_rate": 4.924432681871176e-05, + "loss": 5.0032, + "step": 13194 + }, + { + "epoch": 0.07847440289275859, + "grad_norm": 2.4199647903442383, + "learning_rate": 4.924421283842916e-05, + "loss": 4.8158, + "step": 13195 + }, + { + "epoch": 0.07848035017603959, + "grad_norm": 2.4517173767089844, + "learning_rate": 4.924409884968316e-05, + "loss": 4.8194, + "step": 13196 + }, + { + "epoch": 0.07848629745932058, + "grad_norm": 2.231703042984009, + "learning_rate": 4.924398485247379e-05, + "loss": 4.882, + "step": 13197 + }, + { + "epoch": 0.07849224474260158, + "grad_norm": 2.218252182006836, + "learning_rate": 4.924387084680109e-05, + "loss": 4.872, + "step": 13198 + }, + { + "epoch": 0.07849819202588258, + "grad_norm": 2.2126224040985107, + "learning_rate": 4.924375683266511e-05, + "loss": 5.019, + "step": 13199 + }, + { + "epoch": 0.07850413930916357, + "grad_norm": 2.197240114212036, + "learning_rate": 4.924364281006589e-05, + "loss": 4.9801, + "step": 13200 + }, + { + "epoch": 0.07851008659244457, + "grad_norm": 2.11427640914917, + "learning_rate": 4.9243528779003456e-05, + "loss": 4.992, + "step": 13201 + }, + { + "epoch": 0.07851603387572557, + "grad_norm": 1.9424201250076294, + "learning_rate": 4.9243414739477864e-05, + "loss": 4.9275, + "step": 13202 + }, + { + "epoch": 0.07852198115900656, + "grad_norm": 1.897208571434021, + "learning_rate": 4.9243300691489146e-05, + "loss": 5.0482, + "step": 13203 + }, + { + "epoch": 0.07852792844228756, + "grad_norm": 1.7149171829223633, + "learning_rate": 4.924318663503734e-05, + "loss": 5.4713, + "step": 13204 + }, + { + "epoch": 0.07853387572556857, + "grad_norm": 1.770279049873352, + "learning_rate": 4.924307257012248e-05, + "loss": 5.5565, + "step": 13205 + }, + { + "epoch": 0.07853982300884955, + "grad_norm": 2.043506145477295, + "learning_rate": 4.924295849674463e-05, + "loss": 4.9129, + "step": 13206 + }, + { + "epoch": 0.07854577029213056, + "grad_norm": 1.91255521774292, + "learning_rate": 4.92428444149038e-05, + "loss": 5.5405, + "step": 13207 + }, + { + "epoch": 0.07855171757541156, + "grad_norm": 2.371006965637207, + "learning_rate": 4.924273032460005e-05, + "loss": 5.8047, + "step": 13208 + }, + { + "epoch": 0.07855766485869255, + "grad_norm": 2.1126253604888916, + "learning_rate": 4.9242616225833416e-05, + "loss": 5.6397, + "step": 13209 + }, + { + "epoch": 0.07856361214197355, + "grad_norm": 1.9398634433746338, + "learning_rate": 4.9242502118603925e-05, + "loss": 5.7703, + "step": 13210 + }, + { + "epoch": 0.07856955942525454, + "grad_norm": 1.7660777568817139, + "learning_rate": 4.924238800291164e-05, + "loss": 5.6485, + "step": 13211 + }, + { + "epoch": 0.07857550670853554, + "grad_norm": 1.835633397102356, + "learning_rate": 4.924227387875658e-05, + "loss": 5.701, + "step": 13212 + }, + { + "epoch": 0.07858145399181654, + "grad_norm": 1.8192920684814453, + "learning_rate": 4.9242159746138796e-05, + "loss": 5.5682, + "step": 13213 + }, + { + "epoch": 0.07858740127509753, + "grad_norm": 1.8342156410217285, + "learning_rate": 4.924204560505832e-05, + "loss": 5.2546, + "step": 13214 + }, + { + "epoch": 0.07859334855837853, + "grad_norm": 1.855446696281433, + "learning_rate": 4.92419314555152e-05, + "loss": 5.7471, + "step": 13215 + }, + { + "epoch": 0.07859929584165953, + "grad_norm": 1.7786341905593872, + "learning_rate": 4.924181729750946e-05, + "loss": 5.8774, + "step": 13216 + }, + { + "epoch": 0.07860524312494052, + "grad_norm": 1.7919361591339111, + "learning_rate": 4.9241703131041175e-05, + "loss": 5.7796, + "step": 13217 + }, + { + "epoch": 0.07861119040822152, + "grad_norm": 2.1065824031829834, + "learning_rate": 4.924158895611034e-05, + "loss": 5.2471, + "step": 13218 + }, + { + "epoch": 0.07861713769150253, + "grad_norm": 2.18803334236145, + "learning_rate": 4.9241474772717036e-05, + "loss": 4.8654, + "step": 13219 + }, + { + "epoch": 0.07862308497478351, + "grad_norm": 2.156651020050049, + "learning_rate": 4.924136058086127e-05, + "loss": 4.7614, + "step": 13220 + }, + { + "epoch": 0.07862903225806452, + "grad_norm": 2.098242998123169, + "learning_rate": 4.9241246380543095e-05, + "loss": 4.8152, + "step": 13221 + }, + { + "epoch": 0.07863497954134552, + "grad_norm": 1.9857498407363892, + "learning_rate": 4.924113217176256e-05, + "loss": 4.7955, + "step": 13222 + }, + { + "epoch": 0.0786409268246265, + "grad_norm": 2.046926259994507, + "learning_rate": 4.9241017954519685e-05, + "loss": 4.9851, + "step": 13223 + }, + { + "epoch": 0.07864687410790751, + "grad_norm": 1.804005742073059, + "learning_rate": 4.924090372881454e-05, + "loss": 5.5084, + "step": 13224 + }, + { + "epoch": 0.07865282139118851, + "grad_norm": 1.8413509130477905, + "learning_rate": 4.924078949464713e-05, + "loss": 5.462, + "step": 13225 + }, + { + "epoch": 0.0786587686744695, + "grad_norm": 1.7599927186965942, + "learning_rate": 4.924067525201751e-05, + "loss": 5.4255, + "step": 13226 + }, + { + "epoch": 0.0786647159577505, + "grad_norm": 1.7645682096481323, + "learning_rate": 4.924056100092573e-05, + "loss": 5.4837, + "step": 13227 + }, + { + "epoch": 0.0786706632410315, + "grad_norm": 1.7478766441345215, + "learning_rate": 4.924044674137182e-05, + "loss": 5.2957, + "step": 13228 + }, + { + "epoch": 0.07867661052431249, + "grad_norm": 1.7865453958511353, + "learning_rate": 4.924033247335581e-05, + "loss": 5.1909, + "step": 13229 + }, + { + "epoch": 0.07868255780759349, + "grad_norm": 1.8167400360107422, + "learning_rate": 4.924021819687776e-05, + "loss": 5.2732, + "step": 13230 + }, + { + "epoch": 0.0786885050908745, + "grad_norm": 1.8745819330215454, + "learning_rate": 4.92401039119377e-05, + "loss": 5.3222, + "step": 13231 + }, + { + "epoch": 0.07869445237415548, + "grad_norm": 1.7355458736419678, + "learning_rate": 4.9239989618535665e-05, + "loss": 5.4142, + "step": 13232 + }, + { + "epoch": 0.07870039965743648, + "grad_norm": 1.7634247541427612, + "learning_rate": 4.9239875316671705e-05, + "loss": 5.3114, + "step": 13233 + }, + { + "epoch": 0.07870634694071749, + "grad_norm": 1.8516123294830322, + "learning_rate": 4.9239761006345845e-05, + "loss": 5.3014, + "step": 13234 + }, + { + "epoch": 0.07871229422399847, + "grad_norm": 1.8192317485809326, + "learning_rate": 4.9239646687558146e-05, + "loss": 5.407, + "step": 13235 + }, + { + "epoch": 0.07871824150727948, + "grad_norm": 1.6944139003753662, + "learning_rate": 4.923953236030863e-05, + "loss": 5.4235, + "step": 13236 + }, + { + "epoch": 0.07872418879056048, + "grad_norm": 1.681746006011963, + "learning_rate": 4.923941802459735e-05, + "loss": 5.3367, + "step": 13237 + }, + { + "epoch": 0.07873013607384147, + "grad_norm": 1.6417745351791382, + "learning_rate": 4.9239303680424334e-05, + "loss": 5.253, + "step": 13238 + }, + { + "epoch": 0.07873608335712247, + "grad_norm": 1.6522557735443115, + "learning_rate": 4.9239189327789626e-05, + "loss": 5.0855, + "step": 13239 + }, + { + "epoch": 0.07874203064040346, + "grad_norm": 1.7547293901443481, + "learning_rate": 4.9239074966693275e-05, + "loss": 5.9017, + "step": 13240 + }, + { + "epoch": 0.07874797792368446, + "grad_norm": 1.998478889465332, + "learning_rate": 4.923896059713531e-05, + "loss": 5.4774, + "step": 13241 + }, + { + "epoch": 0.07875392520696546, + "grad_norm": 1.869710922241211, + "learning_rate": 4.9238846219115774e-05, + "loss": 5.4591, + "step": 13242 + }, + { + "epoch": 0.07875987249024645, + "grad_norm": 1.8957170248031616, + "learning_rate": 4.923873183263471e-05, + "loss": 5.2823, + "step": 13243 + }, + { + "epoch": 0.07876581977352745, + "grad_norm": 1.9052289724349976, + "learning_rate": 4.9238617437692146e-05, + "loss": 5.4753, + "step": 13244 + }, + { + "epoch": 0.07877176705680845, + "grad_norm": 1.8786853551864624, + "learning_rate": 4.923850303428814e-05, + "loss": 5.2234, + "step": 13245 + }, + { + "epoch": 0.07877771434008944, + "grad_norm": 2.298356533050537, + "learning_rate": 4.923838862242271e-05, + "loss": 4.7138, + "step": 13246 + }, + { + "epoch": 0.07878366162337044, + "grad_norm": 2.1191911697387695, + "learning_rate": 4.923827420209592e-05, + "loss": 4.6354, + "step": 13247 + }, + { + "epoch": 0.07878960890665145, + "grad_norm": 2.1735050678253174, + "learning_rate": 4.923815977330781e-05, + "loss": 4.454, + "step": 13248 + }, + { + "epoch": 0.07879555618993243, + "grad_norm": 2.0126335620880127, + "learning_rate": 4.923804533605839e-05, + "loss": 4.3387, + "step": 13249 + }, + { + "epoch": 0.07880150347321344, + "grad_norm": 2.00081729888916, + "learning_rate": 4.9237930890347726e-05, + "loss": 4.4009, + "step": 13250 + }, + { + "epoch": 0.07880745075649444, + "grad_norm": 2.198625326156616, + "learning_rate": 4.923781643617586e-05, + "loss": 4.4334, + "step": 13251 + }, + { + "epoch": 0.07881339803977543, + "grad_norm": 2.0630993843078613, + "learning_rate": 4.923770197354281e-05, + "loss": 4.6349, + "step": 13252 + }, + { + "epoch": 0.07881934532305643, + "grad_norm": 1.7470935583114624, + "learning_rate": 4.923758750244863e-05, + "loss": 5.1363, + "step": 13253 + }, + { + "epoch": 0.07882529260633743, + "grad_norm": 1.5461190938949585, + "learning_rate": 4.923747302289335e-05, + "loss": 5.7365, + "step": 13254 + }, + { + "epoch": 0.07883123988961842, + "grad_norm": 1.800528645515442, + "learning_rate": 4.9237358534877036e-05, + "loss": 5.949, + "step": 13255 + }, + { + "epoch": 0.07883718717289942, + "grad_norm": 2.096055746078491, + "learning_rate": 4.923724403839971e-05, + "loss": 5.4203, + "step": 13256 + }, + { + "epoch": 0.07884313445618042, + "grad_norm": 2.0838513374328613, + "learning_rate": 4.92371295334614e-05, + "loss": 5.0542, + "step": 13257 + }, + { + "epoch": 0.07884908173946141, + "grad_norm": 1.711534023284912, + "learning_rate": 4.923701502006217e-05, + "loss": 5.7168, + "step": 13258 + }, + { + "epoch": 0.07885502902274241, + "grad_norm": 1.6610822677612305, + "learning_rate": 4.9236900498202035e-05, + "loss": 5.5605, + "step": 13259 + }, + { + "epoch": 0.07886097630602341, + "grad_norm": 1.549854040145874, + "learning_rate": 4.9236785967881064e-05, + "loss": 5.7792, + "step": 13260 + }, + { + "epoch": 0.0788669235893044, + "grad_norm": 1.9194339513778687, + "learning_rate": 4.923667142909927e-05, + "loss": 5.5481, + "step": 13261 + }, + { + "epoch": 0.0788728708725854, + "grad_norm": 1.6644178628921509, + "learning_rate": 4.923655688185671e-05, + "loss": 5.7271, + "step": 13262 + }, + { + "epoch": 0.0788788181558664, + "grad_norm": 1.820898175239563, + "learning_rate": 4.9236442326153414e-05, + "loss": 6.2458, + "step": 13263 + }, + { + "epoch": 0.0788847654391474, + "grad_norm": 1.732539176940918, + "learning_rate": 4.923632776198943e-05, + "loss": 5.5854, + "step": 13264 + }, + { + "epoch": 0.0788907127224284, + "grad_norm": 1.769140601158142, + "learning_rate": 4.923621318936479e-05, + "loss": 5.5511, + "step": 13265 + }, + { + "epoch": 0.0788966600057094, + "grad_norm": 1.728833556175232, + "learning_rate": 4.923609860827955e-05, + "loss": 5.6215, + "step": 13266 + }, + { + "epoch": 0.07890260728899039, + "grad_norm": 1.5940407514572144, + "learning_rate": 4.923598401873373e-05, + "loss": 5.6572, + "step": 13267 + }, + { + "epoch": 0.07890855457227139, + "grad_norm": 2.153200149536133, + "learning_rate": 4.923586942072737e-05, + "loss": 5.0235, + "step": 13268 + }, + { + "epoch": 0.07891450185555238, + "grad_norm": 1.6448415517807007, + "learning_rate": 4.9235754814260526e-05, + "loss": 5.5353, + "step": 13269 + }, + { + "epoch": 0.07892044913883338, + "grad_norm": 1.706984281539917, + "learning_rate": 4.9235640199333235e-05, + "loss": 5.5278, + "step": 13270 + }, + { + "epoch": 0.07892639642211438, + "grad_norm": 1.6129798889160156, + "learning_rate": 4.923552557594553e-05, + "loss": 5.4643, + "step": 13271 + }, + { + "epoch": 0.07893234370539537, + "grad_norm": 1.612748384475708, + "learning_rate": 4.923541094409745e-05, + "loss": 5.4994, + "step": 13272 + }, + { + "epoch": 0.07893829098867637, + "grad_norm": 1.6947647333145142, + "learning_rate": 4.923529630378904e-05, + "loss": 5.5117, + "step": 13273 + }, + { + "epoch": 0.07894423827195737, + "grad_norm": 1.629684567451477, + "learning_rate": 4.9235181655020336e-05, + "loss": 5.4266, + "step": 13274 + }, + { + "epoch": 0.07895018555523836, + "grad_norm": 1.6417474746704102, + "learning_rate": 4.923506699779139e-05, + "loss": 5.4803, + "step": 13275 + }, + { + "epoch": 0.07895613283851936, + "grad_norm": 1.5188243389129639, + "learning_rate": 4.9234952332102226e-05, + "loss": 5.4066, + "step": 13276 + }, + { + "epoch": 0.07896208012180037, + "grad_norm": 1.4906466007232666, + "learning_rate": 4.9234837657952885e-05, + "loss": 5.4622, + "step": 13277 + }, + { + "epoch": 0.07896802740508135, + "grad_norm": 1.745351791381836, + "learning_rate": 4.9234722975343414e-05, + "loss": 5.458, + "step": 13278 + }, + { + "epoch": 0.07897397468836236, + "grad_norm": 1.734399676322937, + "learning_rate": 4.9234608284273866e-05, + "loss": 5.3542, + "step": 13279 + }, + { + "epoch": 0.07897992197164336, + "grad_norm": 2.396031379699707, + "learning_rate": 4.9234493584744254e-05, + "loss": 5.0978, + "step": 13280 + }, + { + "epoch": 0.07898586925492435, + "grad_norm": 2.0151939392089844, + "learning_rate": 4.9234378876754626e-05, + "loss": 5.5051, + "step": 13281 + }, + { + "epoch": 0.07899181653820535, + "grad_norm": 2.1796762943267822, + "learning_rate": 4.9234264160305036e-05, + "loss": 5.2788, + "step": 13282 + }, + { + "epoch": 0.07899776382148635, + "grad_norm": 2.069291830062866, + "learning_rate": 4.923414943539552e-05, + "loss": 5.4454, + "step": 13283 + }, + { + "epoch": 0.07900371110476734, + "grad_norm": 2.034498929977417, + "learning_rate": 4.92340347020261e-05, + "loss": 5.3849, + "step": 13284 + }, + { + "epoch": 0.07900965838804834, + "grad_norm": 1.8353052139282227, + "learning_rate": 4.9233919960196835e-05, + "loss": 5.3975, + "step": 13285 + }, + { + "epoch": 0.07901560567132934, + "grad_norm": 1.9896777868270874, + "learning_rate": 4.923380520990776e-05, + "loss": 5.1199, + "step": 13286 + }, + { + "epoch": 0.07902155295461033, + "grad_norm": 1.9539830684661865, + "learning_rate": 4.923369045115891e-05, + "loss": 5.3908, + "step": 13287 + }, + { + "epoch": 0.07902750023789133, + "grad_norm": 1.682651162147522, + "learning_rate": 4.923357568395033e-05, + "loss": 5.4719, + "step": 13288 + }, + { + "epoch": 0.07903344752117233, + "grad_norm": 2.0095672607421875, + "learning_rate": 4.923346090828206e-05, + "loss": 5.9258, + "step": 13289 + }, + { + "epoch": 0.07903939480445332, + "grad_norm": 1.7949076890945435, + "learning_rate": 4.923334612415413e-05, + "loss": 5.646, + "step": 13290 + }, + { + "epoch": 0.07904534208773432, + "grad_norm": 2.1651079654693604, + "learning_rate": 4.92332313315666e-05, + "loss": 5.2527, + "step": 13291 + }, + { + "epoch": 0.07905128937101533, + "grad_norm": 2.0362184047698975, + "learning_rate": 4.92331165305195e-05, + "loss": 5.2671, + "step": 13292 + }, + { + "epoch": 0.07905723665429631, + "grad_norm": 1.5425541400909424, + "learning_rate": 4.923300172101287e-05, + "loss": 5.5149, + "step": 13293 + }, + { + "epoch": 0.07906318393757732, + "grad_norm": 2.13031005859375, + "learning_rate": 4.923288690304675e-05, + "loss": 5.9304, + "step": 13294 + }, + { + "epoch": 0.07906913122085832, + "grad_norm": 2.165199041366577, + "learning_rate": 4.923277207662117e-05, + "loss": 5.9153, + "step": 13295 + }, + { + "epoch": 0.0790750785041393, + "grad_norm": 2.1479499340057373, + "learning_rate": 4.923265724173619e-05, + "loss": 5.7215, + "step": 13296 + }, + { + "epoch": 0.07908102578742031, + "grad_norm": 1.8908145427703857, + "learning_rate": 4.923254239839183e-05, + "loss": 5.5801, + "step": 13297 + }, + { + "epoch": 0.0790869730707013, + "grad_norm": 1.7739901542663574, + "learning_rate": 4.9232427546588145e-05, + "loss": 5.283, + "step": 13298 + }, + { + "epoch": 0.0790929203539823, + "grad_norm": 1.8153715133666992, + "learning_rate": 4.9232312686325175e-05, + "loss": 5.4626, + "step": 13299 + }, + { + "epoch": 0.0790988676372633, + "grad_norm": 1.7070518732070923, + "learning_rate": 4.923219781760295e-05, + "loss": 5.5246, + "step": 13300 + }, + { + "epoch": 0.07910481492054429, + "grad_norm": 2.161536455154419, + "learning_rate": 4.923208294042152e-05, + "loss": 5.6865, + "step": 13301 + }, + { + "epoch": 0.07911076220382529, + "grad_norm": 2.5373623371124268, + "learning_rate": 4.9231968054780905e-05, + "loss": 5.8634, + "step": 13302 + }, + { + "epoch": 0.0791167094871063, + "grad_norm": 2.4957666397094727, + "learning_rate": 4.923185316068117e-05, + "loss": 4.9065, + "step": 13303 + }, + { + "epoch": 0.07912265677038728, + "grad_norm": 2.260540246963501, + "learning_rate": 4.923173825812235e-05, + "loss": 5.0815, + "step": 13304 + }, + { + "epoch": 0.07912860405366828, + "grad_norm": 2.406765937805176, + "learning_rate": 4.923162334710448e-05, + "loss": 4.8599, + "step": 13305 + }, + { + "epoch": 0.07913455133694929, + "grad_norm": 2.282153606414795, + "learning_rate": 4.923150842762759e-05, + "loss": 5.1024, + "step": 13306 + }, + { + "epoch": 0.07914049862023027, + "grad_norm": 1.8351432085037231, + "learning_rate": 4.9231393499691744e-05, + "loss": 5.3715, + "step": 13307 + }, + { + "epoch": 0.07914644590351128, + "grad_norm": 1.8290963172912598, + "learning_rate": 4.9231278563296965e-05, + "loss": 5.4456, + "step": 13308 + }, + { + "epoch": 0.07915239318679228, + "grad_norm": 1.7157766819000244, + "learning_rate": 4.923116361844329e-05, + "loss": 5.4952, + "step": 13309 + }, + { + "epoch": 0.07915834047007327, + "grad_norm": 2.051391124725342, + "learning_rate": 4.923104866513077e-05, + "loss": 5.7754, + "step": 13310 + }, + { + "epoch": 0.07916428775335427, + "grad_norm": 1.8714796304702759, + "learning_rate": 4.923093370335944e-05, + "loss": 5.4118, + "step": 13311 + }, + { + "epoch": 0.07917023503663527, + "grad_norm": 2.4251246452331543, + "learning_rate": 4.923081873312935e-05, + "loss": 4.9677, + "step": 13312 + }, + { + "epoch": 0.07917618231991626, + "grad_norm": 3.490328550338745, + "learning_rate": 4.923070375444052e-05, + "loss": 4.5336, + "step": 13313 + }, + { + "epoch": 0.07918212960319726, + "grad_norm": 2.820434331893921, + "learning_rate": 4.9230588767293004e-05, + "loss": 4.2865, + "step": 13314 + }, + { + "epoch": 0.07918807688647826, + "grad_norm": 2.3713653087615967, + "learning_rate": 4.923047377168685e-05, + "loss": 4.2558, + "step": 13315 + }, + { + "epoch": 0.07919402416975925, + "grad_norm": 2.484199285507202, + "learning_rate": 4.923035876762208e-05, + "loss": 3.9565, + "step": 13316 + }, + { + "epoch": 0.07919997145304025, + "grad_norm": 2.771982431411743, + "learning_rate": 4.9230243755098735e-05, + "loss": 3.9478, + "step": 13317 + }, + { + "epoch": 0.07920591873632125, + "grad_norm": 2.613006591796875, + "learning_rate": 4.9230128734116874e-05, + "loss": 4.0285, + "step": 13318 + }, + { + "epoch": 0.07921186601960224, + "grad_norm": 2.378276824951172, + "learning_rate": 4.923001370467653e-05, + "loss": 4.129, + "step": 13319 + }, + { + "epoch": 0.07921781330288324, + "grad_norm": 2.6948869228363037, + "learning_rate": 4.922989866677772e-05, + "loss": 5.7581, + "step": 13320 + }, + { + "epoch": 0.07922376058616425, + "grad_norm": 2.058387517929077, + "learning_rate": 4.922978362042051e-05, + "loss": 5.7589, + "step": 13321 + }, + { + "epoch": 0.07922970786944523, + "grad_norm": 2.2277138233184814, + "learning_rate": 4.9229668565604936e-05, + "loss": 5.691, + "step": 13322 + }, + { + "epoch": 0.07923565515272624, + "grad_norm": 1.827525019645691, + "learning_rate": 4.922955350233104e-05, + "loss": 5.6555, + "step": 13323 + }, + { + "epoch": 0.07924160243600724, + "grad_norm": 1.5456974506378174, + "learning_rate": 4.922943843059885e-05, + "loss": 5.445, + "step": 13324 + }, + { + "epoch": 0.07924754971928823, + "grad_norm": 1.859805703163147, + "learning_rate": 4.922932335040842e-05, + "loss": 5.5864, + "step": 13325 + }, + { + "epoch": 0.07925349700256923, + "grad_norm": 2.0083398818969727, + "learning_rate": 4.922920826175977e-05, + "loss": 5.7598, + "step": 13326 + }, + { + "epoch": 0.07925944428585022, + "grad_norm": 1.9759368896484375, + "learning_rate": 4.922909316465296e-05, + "loss": 5.7778, + "step": 13327 + }, + { + "epoch": 0.07926539156913122, + "grad_norm": 1.9937580823898315, + "learning_rate": 4.9228978059088035e-05, + "loss": 5.7291, + "step": 13328 + }, + { + "epoch": 0.07927133885241222, + "grad_norm": 2.6860668659210205, + "learning_rate": 4.922886294506501e-05, + "loss": 5.0277, + "step": 13329 + }, + { + "epoch": 0.07927728613569321, + "grad_norm": 2.03318190574646, + "learning_rate": 4.9228747822583945e-05, + "loss": 5.2387, + "step": 13330 + }, + { + "epoch": 0.07928323341897421, + "grad_norm": 2.250929117202759, + "learning_rate": 4.9228632691644874e-05, + "loss": 5.2348, + "step": 13331 + }, + { + "epoch": 0.07928918070225521, + "grad_norm": 2.0255093574523926, + "learning_rate": 4.922851755224784e-05, + "loss": 5.6585, + "step": 13332 + }, + { + "epoch": 0.0792951279855362, + "grad_norm": 1.9353551864624023, + "learning_rate": 4.922840240439288e-05, + "loss": 5.3989, + "step": 13333 + }, + { + "epoch": 0.0793010752688172, + "grad_norm": 1.9392589330673218, + "learning_rate": 4.922828724808003e-05, + "loss": 5.9127, + "step": 13334 + }, + { + "epoch": 0.0793070225520982, + "grad_norm": 2.312340021133423, + "learning_rate": 4.922817208330934e-05, + "loss": 5.656, + "step": 13335 + }, + { + "epoch": 0.0793129698353792, + "grad_norm": 2.1480720043182373, + "learning_rate": 4.9228056910080845e-05, + "loss": 5.4582, + "step": 13336 + }, + { + "epoch": 0.0793189171186602, + "grad_norm": 2.0460312366485596, + "learning_rate": 4.922794172839458e-05, + "loss": 5.5177, + "step": 13337 + }, + { + "epoch": 0.0793248644019412, + "grad_norm": 1.8319480419158936, + "learning_rate": 4.92278265382506e-05, + "loss": 5.5872, + "step": 13338 + }, + { + "epoch": 0.07933081168522219, + "grad_norm": 1.610379934310913, + "learning_rate": 4.922771133964893e-05, + "loss": 5.5398, + "step": 13339 + }, + { + "epoch": 0.07933675896850319, + "grad_norm": 1.767022728919983, + "learning_rate": 4.9227596132589616e-05, + "loss": 6.0004, + "step": 13340 + }, + { + "epoch": 0.07934270625178419, + "grad_norm": 2.108621835708618, + "learning_rate": 4.92274809170727e-05, + "loss": 5.1513, + "step": 13341 + }, + { + "epoch": 0.07934865353506518, + "grad_norm": 2.2562835216522217, + "learning_rate": 4.922736569309822e-05, + "loss": 4.7642, + "step": 13342 + }, + { + "epoch": 0.07935460081834618, + "grad_norm": 1.7953063249588013, + "learning_rate": 4.922725046066622e-05, + "loss": 5.2453, + "step": 13343 + }, + { + "epoch": 0.07936054810162718, + "grad_norm": 1.8957513570785522, + "learning_rate": 4.922713521977673e-05, + "loss": 5.0673, + "step": 13344 + }, + { + "epoch": 0.07936649538490817, + "grad_norm": 1.8375275135040283, + "learning_rate": 4.922701997042981e-05, + "loss": 5.0301, + "step": 13345 + }, + { + "epoch": 0.07937244266818917, + "grad_norm": 2.306138515472412, + "learning_rate": 4.9226904712625473e-05, + "loss": 4.7415, + "step": 13346 + }, + { + "epoch": 0.07937838995147017, + "grad_norm": 2.058403730392456, + "learning_rate": 4.922678944636379e-05, + "loss": 5.4454, + "step": 13347 + }, + { + "epoch": 0.07938433723475116, + "grad_norm": 1.9230997562408447, + "learning_rate": 4.922667417164477e-05, + "loss": 5.3755, + "step": 13348 + }, + { + "epoch": 0.07939028451803216, + "grad_norm": 1.9053308963775635, + "learning_rate": 4.922655888846848e-05, + "loss": 5.7708, + "step": 13349 + }, + { + "epoch": 0.07939623180131317, + "grad_norm": 1.8009783029556274, + "learning_rate": 4.922644359683494e-05, + "loss": 4.9939, + "step": 13350 + }, + { + "epoch": 0.07940217908459415, + "grad_norm": 1.6748642921447754, + "learning_rate": 4.92263282967442e-05, + "loss": 5.4869, + "step": 13351 + }, + { + "epoch": 0.07940812636787516, + "grad_norm": 1.532475471496582, + "learning_rate": 4.92262129881963e-05, + "loss": 5.755, + "step": 13352 + }, + { + "epoch": 0.07941407365115616, + "grad_norm": 1.513795018196106, + "learning_rate": 4.9226097671191284e-05, + "loss": 5.4083, + "step": 13353 + }, + { + "epoch": 0.07942002093443715, + "grad_norm": 1.66012442111969, + "learning_rate": 4.922598234572918e-05, + "loss": 5.5185, + "step": 13354 + }, + { + "epoch": 0.07942596821771815, + "grad_norm": 1.6519379615783691, + "learning_rate": 4.922586701181005e-05, + "loss": 5.3482, + "step": 13355 + }, + { + "epoch": 0.07943191550099914, + "grad_norm": 1.4444184303283691, + "learning_rate": 4.922575166943391e-05, + "loss": 5.4466, + "step": 13356 + }, + { + "epoch": 0.07943786278428014, + "grad_norm": 1.4603393077850342, + "learning_rate": 4.92256363186008e-05, + "loss": 5.4343, + "step": 13357 + }, + { + "epoch": 0.07944381006756114, + "grad_norm": 1.4669525623321533, + "learning_rate": 4.922552095931078e-05, + "loss": 5.4224, + "step": 13358 + }, + { + "epoch": 0.07944975735084213, + "grad_norm": 1.3054184913635254, + "learning_rate": 4.922540559156389e-05, + "loss": 5.4801, + "step": 13359 + }, + { + "epoch": 0.07945570463412313, + "grad_norm": 1.6295130252838135, + "learning_rate": 4.922529021536015e-05, + "loss": 5.4593, + "step": 13360 + }, + { + "epoch": 0.07946165191740413, + "grad_norm": 1.6684668064117432, + "learning_rate": 4.922517483069962e-05, + "loss": 5.2817, + "step": 13361 + }, + { + "epoch": 0.07946759920068512, + "grad_norm": 1.580409049987793, + "learning_rate": 4.922505943758232e-05, + "loss": 5.4399, + "step": 13362 + }, + { + "epoch": 0.07947354648396612, + "grad_norm": 1.613756775856018, + "learning_rate": 4.922494403600831e-05, + "loss": 5.2646, + "step": 13363 + }, + { + "epoch": 0.07947949376724713, + "grad_norm": 1.4371063709259033, + "learning_rate": 4.9224828625977616e-05, + "loss": 5.2866, + "step": 13364 + }, + { + "epoch": 0.07948544105052811, + "grad_norm": 1.5926525592803955, + "learning_rate": 4.9224713207490294e-05, + "loss": 5.5958, + "step": 13365 + }, + { + "epoch": 0.07949138833380912, + "grad_norm": 1.5216618776321411, + "learning_rate": 4.9224597780546365e-05, + "loss": 5.6094, + "step": 13366 + }, + { + "epoch": 0.07949733561709012, + "grad_norm": 1.7261598110198975, + "learning_rate": 4.922448234514588e-05, + "loss": 5.2781, + "step": 13367 + }, + { + "epoch": 0.0795032829003711, + "grad_norm": 1.6909232139587402, + "learning_rate": 4.922436690128889e-05, + "loss": 5.3299, + "step": 13368 + }, + { + "epoch": 0.07950923018365211, + "grad_norm": 1.6486754417419434, + "learning_rate": 4.922425144897541e-05, + "loss": 5.2478, + "step": 13369 + }, + { + "epoch": 0.07951517746693311, + "grad_norm": 1.4019837379455566, + "learning_rate": 4.922413598820551e-05, + "loss": 5.2383, + "step": 13370 + }, + { + "epoch": 0.0795211247502141, + "grad_norm": 1.7588412761688232, + "learning_rate": 4.92240205189792e-05, + "loss": 5.3224, + "step": 13371 + }, + { + "epoch": 0.0795270720334951, + "grad_norm": 1.5354480743408203, + "learning_rate": 4.922390504129654e-05, + "loss": 5.1617, + "step": 13372 + }, + { + "epoch": 0.0795330193167761, + "grad_norm": 1.5183011293411255, + "learning_rate": 4.922378955515756e-05, + "loss": 5.3082, + "step": 13373 + }, + { + "epoch": 0.07953896660005709, + "grad_norm": 1.436281681060791, + "learning_rate": 4.922367406056232e-05, + "loss": 5.4446, + "step": 13374 + }, + { + "epoch": 0.07954491388333809, + "grad_norm": 1.526934266090393, + "learning_rate": 4.922355855751083e-05, + "loss": 5.3067, + "step": 13375 + }, + { + "epoch": 0.0795508611666191, + "grad_norm": 1.516784906387329, + "learning_rate": 4.922344304600315e-05, + "loss": 5.4982, + "step": 13376 + }, + { + "epoch": 0.07955680844990008, + "grad_norm": 1.5154777765274048, + "learning_rate": 4.922332752603932e-05, + "loss": 5.3459, + "step": 13377 + }, + { + "epoch": 0.07956275573318108, + "grad_norm": 1.542508840560913, + "learning_rate": 4.9223211997619376e-05, + "loss": 5.3677, + "step": 13378 + }, + { + "epoch": 0.07956870301646209, + "grad_norm": 1.3413010835647583, + "learning_rate": 4.922309646074336e-05, + "loss": 5.2684, + "step": 13379 + }, + { + "epoch": 0.07957465029974307, + "grad_norm": 1.6295002698898315, + "learning_rate": 4.9222980915411306e-05, + "loss": 5.2737, + "step": 13380 + }, + { + "epoch": 0.07958059758302408, + "grad_norm": 1.5810730457305908, + "learning_rate": 4.922286536162326e-05, + "loss": 5.2471, + "step": 13381 + }, + { + "epoch": 0.07958654486630508, + "grad_norm": 1.3186451196670532, + "learning_rate": 4.9222749799379266e-05, + "loss": 5.3081, + "step": 13382 + }, + { + "epoch": 0.07959249214958607, + "grad_norm": 1.3897243738174438, + "learning_rate": 4.922263422867936e-05, + "loss": 5.2658, + "step": 13383 + }, + { + "epoch": 0.07959843943286707, + "grad_norm": 1.3873858451843262, + "learning_rate": 4.922251864952358e-05, + "loss": 5.334, + "step": 13384 + }, + { + "epoch": 0.07960438671614806, + "grad_norm": 1.4205409288406372, + "learning_rate": 4.922240306191197e-05, + "loss": 5.3007, + "step": 13385 + }, + { + "epoch": 0.07961033399942906, + "grad_norm": 1.3726485967636108, + "learning_rate": 4.922228746584457e-05, + "loss": 5.1949, + "step": 13386 + }, + { + "epoch": 0.07961628128271006, + "grad_norm": 1.708837628364563, + "learning_rate": 4.922217186132142e-05, + "loss": 5.2061, + "step": 13387 + }, + { + "epoch": 0.07962222856599105, + "grad_norm": 1.7818368673324585, + "learning_rate": 4.9222056248342556e-05, + "loss": 5.1182, + "step": 13388 + }, + { + "epoch": 0.07962817584927205, + "grad_norm": 1.4941715002059937, + "learning_rate": 4.9221940626908024e-05, + "loss": 5.0899, + "step": 13389 + }, + { + "epoch": 0.07963412313255305, + "grad_norm": 1.3581326007843018, + "learning_rate": 4.922182499701787e-05, + "loss": 5.0551, + "step": 13390 + }, + { + "epoch": 0.07964007041583404, + "grad_norm": 1.5772393941879272, + "learning_rate": 4.922170935867212e-05, + "loss": 5.245, + "step": 13391 + }, + { + "epoch": 0.07964601769911504, + "grad_norm": 1.9635555744171143, + "learning_rate": 4.922159371187082e-05, + "loss": 5.2898, + "step": 13392 + }, + { + "epoch": 0.07965196498239605, + "grad_norm": 1.535050392150879, + "learning_rate": 4.922147805661402e-05, + "loss": 5.2043, + "step": 13393 + }, + { + "epoch": 0.07965791226567703, + "grad_norm": 1.4985787868499756, + "learning_rate": 4.922136239290175e-05, + "loss": 5.1682, + "step": 13394 + }, + { + "epoch": 0.07966385954895804, + "grad_norm": 1.5314218997955322, + "learning_rate": 4.922124672073405e-05, + "loss": 5.321, + "step": 13395 + }, + { + "epoch": 0.07966980683223904, + "grad_norm": 1.440621018409729, + "learning_rate": 4.9221131040110954e-05, + "loss": 5.3013, + "step": 13396 + }, + { + "epoch": 0.07967575411552003, + "grad_norm": 1.5103110074996948, + "learning_rate": 4.9221015351032527e-05, + "loss": 5.2825, + "step": 13397 + }, + { + "epoch": 0.07968170139880103, + "grad_norm": 1.3581254482269287, + "learning_rate": 4.9220899653498786e-05, + "loss": 5.2433, + "step": 13398 + }, + { + "epoch": 0.07968764868208203, + "grad_norm": 1.5673763751983643, + "learning_rate": 4.922078394750978e-05, + "loss": 5.2279, + "step": 13399 + }, + { + "epoch": 0.07969359596536302, + "grad_norm": 1.5550049543380737, + "learning_rate": 4.922066823306555e-05, + "loss": 5.0406, + "step": 13400 + }, + { + "epoch": 0.07969954324864402, + "grad_norm": 1.6366932392120361, + "learning_rate": 4.922055251016613e-05, + "loss": 5.1299, + "step": 13401 + }, + { + "epoch": 0.07970549053192502, + "grad_norm": 1.45979642868042, + "learning_rate": 4.922043677881157e-05, + "loss": 4.9527, + "step": 13402 + }, + { + "epoch": 0.07971143781520601, + "grad_norm": 1.594494104385376, + "learning_rate": 4.922032103900191e-05, + "loss": 5.6511, + "step": 13403 + }, + { + "epoch": 0.07971738509848701, + "grad_norm": 1.419045329093933, + "learning_rate": 4.9220205290737175e-05, + "loss": 5.0936, + "step": 13404 + }, + { + "epoch": 0.07972333238176801, + "grad_norm": 1.5998183488845825, + "learning_rate": 4.922008953401742e-05, + "loss": 5.2774, + "step": 13405 + }, + { + "epoch": 0.079729279665049, + "grad_norm": 1.3942409753799438, + "learning_rate": 4.9219973768842685e-05, + "loss": 5.5466, + "step": 13406 + }, + { + "epoch": 0.07973522694833, + "grad_norm": 1.4478344917297363, + "learning_rate": 4.9219857995213015e-05, + "loss": 5.5757, + "step": 13407 + }, + { + "epoch": 0.079741174231611, + "grad_norm": 1.4197556972503662, + "learning_rate": 4.921974221312843e-05, + "loss": 5.3194, + "step": 13408 + }, + { + "epoch": 0.079747121514892, + "grad_norm": 1.7690924406051636, + "learning_rate": 4.9219626422588996e-05, + "loss": 5.3551, + "step": 13409 + }, + { + "epoch": 0.079753068798173, + "grad_norm": 1.8233799934387207, + "learning_rate": 4.921951062359473e-05, + "loss": 5.3143, + "step": 13410 + }, + { + "epoch": 0.079759016081454, + "grad_norm": 1.738848090171814, + "learning_rate": 4.921939481614568e-05, + "loss": 5.0194, + "step": 13411 + }, + { + "epoch": 0.07976496336473499, + "grad_norm": 1.6401729583740234, + "learning_rate": 4.92192790002419e-05, + "loss": 5.3347, + "step": 13412 + }, + { + "epoch": 0.07977091064801599, + "grad_norm": 1.425485372543335, + "learning_rate": 4.921916317588341e-05, + "loss": 5.0384, + "step": 13413 + }, + { + "epoch": 0.07977685793129698, + "grad_norm": 1.6337133646011353, + "learning_rate": 4.921904734307027e-05, + "loss": 5.3213, + "step": 13414 + }, + { + "epoch": 0.07978280521457798, + "grad_norm": 1.561292052268982, + "learning_rate": 4.92189315018025e-05, + "loss": 5.1502, + "step": 13415 + }, + { + "epoch": 0.07978875249785898, + "grad_norm": 1.6225664615631104, + "learning_rate": 4.921881565208016e-05, + "loss": 5.2638, + "step": 13416 + }, + { + "epoch": 0.07979469978113997, + "grad_norm": 1.5074353218078613, + "learning_rate": 4.921869979390328e-05, + "loss": 5.0872, + "step": 13417 + }, + { + "epoch": 0.07980064706442097, + "grad_norm": 1.4769634008407593, + "learning_rate": 4.92185839272719e-05, + "loss": 5.1341, + "step": 13418 + }, + { + "epoch": 0.07980659434770197, + "grad_norm": 1.5929937362670898, + "learning_rate": 4.921846805218607e-05, + "loss": 5.2799, + "step": 13419 + }, + { + "epoch": 0.07981254163098296, + "grad_norm": 1.4583854675292969, + "learning_rate": 4.921835216864581e-05, + "loss": 5.0822, + "step": 13420 + }, + { + "epoch": 0.07981848891426396, + "grad_norm": 1.4904375076293945, + "learning_rate": 4.921823627665119e-05, + "loss": 5.055, + "step": 13421 + }, + { + "epoch": 0.07982443619754497, + "grad_norm": 1.6971831321716309, + "learning_rate": 4.921812037620221e-05, + "loss": 5.1968, + "step": 13422 + }, + { + "epoch": 0.07983038348082595, + "grad_norm": 1.5604689121246338, + "learning_rate": 4.9218004467298956e-05, + "loss": 4.9681, + "step": 13423 + }, + { + "epoch": 0.07983633076410696, + "grad_norm": 1.678427815437317, + "learning_rate": 4.9217888549941436e-05, + "loss": 5.2044, + "step": 13424 + }, + { + "epoch": 0.07984227804738796, + "grad_norm": 1.521996259689331, + "learning_rate": 4.921777262412971e-05, + "loss": 4.9741, + "step": 13425 + }, + { + "epoch": 0.07984822533066895, + "grad_norm": 1.5315868854522705, + "learning_rate": 4.92176566898638e-05, + "loss": 5.0064, + "step": 13426 + }, + { + "epoch": 0.07985417261394995, + "grad_norm": 1.465867280960083, + "learning_rate": 4.9217540747143765e-05, + "loss": 4.942, + "step": 13427 + }, + { + "epoch": 0.07986011989723095, + "grad_norm": 1.4323827028274536, + "learning_rate": 4.9217424795969634e-05, + "loss": 4.8934, + "step": 13428 + }, + { + "epoch": 0.07986606718051194, + "grad_norm": 1.4645717144012451, + "learning_rate": 4.921730883634145e-05, + "loss": 5.0473, + "step": 13429 + }, + { + "epoch": 0.07987201446379294, + "grad_norm": 1.5992658138275146, + "learning_rate": 4.9217192868259246e-05, + "loss": 4.8968, + "step": 13430 + }, + { + "epoch": 0.07987796174707394, + "grad_norm": 1.4294894933700562, + "learning_rate": 4.921707689172308e-05, + "loss": 5.0719, + "step": 13431 + }, + { + "epoch": 0.07988390903035493, + "grad_norm": 1.5885019302368164, + "learning_rate": 4.921696090673298e-05, + "loss": 5.1505, + "step": 13432 + }, + { + "epoch": 0.07988985631363593, + "grad_norm": 1.4929580688476562, + "learning_rate": 4.921684491328898e-05, + "loss": 5.016, + "step": 13433 + }, + { + "epoch": 0.07989580359691693, + "grad_norm": 1.4980381727218628, + "learning_rate": 4.921672891139114e-05, + "loss": 5.0601, + "step": 13434 + }, + { + "epoch": 0.07990175088019792, + "grad_norm": 1.5698089599609375, + "learning_rate": 4.9216612901039495e-05, + "loss": 5.0251, + "step": 13435 + }, + { + "epoch": 0.07990769816347892, + "grad_norm": 1.459037184715271, + "learning_rate": 4.921649688223407e-05, + "loss": 4.8417, + "step": 13436 + }, + { + "epoch": 0.07991364544675993, + "grad_norm": 1.5418161153793335, + "learning_rate": 4.921638085497492e-05, + "loss": 5.1989, + "step": 13437 + }, + { + "epoch": 0.07991959273004091, + "grad_norm": 1.546325922012329, + "learning_rate": 4.9216264819262084e-05, + "loss": 5.3004, + "step": 13438 + }, + { + "epoch": 0.07992554001332192, + "grad_norm": 1.5820508003234863, + "learning_rate": 4.9216148775095594e-05, + "loss": 5.3327, + "step": 13439 + }, + { + "epoch": 0.07993148729660292, + "grad_norm": 1.5077866315841675, + "learning_rate": 4.9216032722475504e-05, + "loss": 5.2423, + "step": 13440 + }, + { + "epoch": 0.0799374345798839, + "grad_norm": 1.3654597997665405, + "learning_rate": 4.921591666140184e-05, + "loss": 5.1563, + "step": 13441 + }, + { + "epoch": 0.07994338186316491, + "grad_norm": 1.6721473932266235, + "learning_rate": 4.921580059187466e-05, + "loss": 5.1848, + "step": 13442 + }, + { + "epoch": 0.0799493291464459, + "grad_norm": 1.5349076986312866, + "learning_rate": 4.921568451389398e-05, + "loss": 5.1836, + "step": 13443 + }, + { + "epoch": 0.0799552764297269, + "grad_norm": 1.6246919631958008, + "learning_rate": 4.921556842745987e-05, + "loss": 4.8715, + "step": 13444 + }, + { + "epoch": 0.0799612237130079, + "grad_norm": 1.5361920595169067, + "learning_rate": 4.921545233257234e-05, + "loss": 4.8203, + "step": 13445 + }, + { + "epoch": 0.07996717099628889, + "grad_norm": 1.6185765266418457, + "learning_rate": 4.921533622923146e-05, + "loss": 4.8039, + "step": 13446 + }, + { + "epoch": 0.07997311827956989, + "grad_norm": 1.402462363243103, + "learning_rate": 4.9215220117437246e-05, + "loss": 4.8524, + "step": 13447 + }, + { + "epoch": 0.07997906556285089, + "grad_norm": 1.5282337665557861, + "learning_rate": 4.921510399718975e-05, + "loss": 4.8081, + "step": 13448 + }, + { + "epoch": 0.07998501284613188, + "grad_norm": 1.336254596710205, + "learning_rate": 4.921498786848902e-05, + "loss": 4.8468, + "step": 13449 + }, + { + "epoch": 0.07999096012941288, + "grad_norm": 1.4701998233795166, + "learning_rate": 4.921487173133508e-05, + "loss": 4.6873, + "step": 13450 + }, + { + "epoch": 0.07999690741269389, + "grad_norm": 1.6340824365615845, + "learning_rate": 4.921475558572798e-05, + "loss": 4.6779, + "step": 13451 + }, + { + "epoch": 0.08000285469597487, + "grad_norm": 1.557027816772461, + "learning_rate": 4.921463943166775e-05, + "loss": 4.6467, + "step": 13452 + }, + { + "epoch": 0.08000880197925588, + "grad_norm": 1.6390316486358643, + "learning_rate": 4.9214523269154454e-05, + "loss": 4.7376, + "step": 13453 + }, + { + "epoch": 0.08001474926253688, + "grad_norm": 2.3929800987243652, + "learning_rate": 4.921440709818811e-05, + "loss": 5.2623, + "step": 13454 + }, + { + "epoch": 0.08002069654581787, + "grad_norm": 1.5896660089492798, + "learning_rate": 4.921429091876877e-05, + "loss": 4.6952, + "step": 13455 + }, + { + "epoch": 0.08002664382909887, + "grad_norm": 1.6705348491668701, + "learning_rate": 4.921417473089647e-05, + "loss": 4.7963, + "step": 13456 + }, + { + "epoch": 0.08003259111237987, + "grad_norm": 1.5925310850143433, + "learning_rate": 4.9214058534571253e-05, + "loss": 4.7398, + "step": 13457 + }, + { + "epoch": 0.08003853839566086, + "grad_norm": 1.5314396619796753, + "learning_rate": 4.921394232979316e-05, + "loss": 4.7578, + "step": 13458 + }, + { + "epoch": 0.08004448567894186, + "grad_norm": 1.6665661334991455, + "learning_rate": 4.921382611656222e-05, + "loss": 4.7767, + "step": 13459 + }, + { + "epoch": 0.08005043296222286, + "grad_norm": 1.5145021677017212, + "learning_rate": 4.9213709894878495e-05, + "loss": 4.7892, + "step": 13460 + }, + { + "epoch": 0.08005638024550385, + "grad_norm": 1.8332866430282593, + "learning_rate": 4.921359366474201e-05, + "loss": 4.6434, + "step": 13461 + }, + { + "epoch": 0.08006232752878485, + "grad_norm": 1.467970371246338, + "learning_rate": 4.921347742615281e-05, + "loss": 4.6611, + "step": 13462 + }, + { + "epoch": 0.08006827481206585, + "grad_norm": 1.5667515993118286, + "learning_rate": 4.9213361179110936e-05, + "loss": 4.5792, + "step": 13463 + }, + { + "epoch": 0.08007422209534684, + "grad_norm": 1.5370365381240845, + "learning_rate": 4.9213244923616434e-05, + "loss": 4.6724, + "step": 13464 + }, + { + "epoch": 0.08008016937862784, + "grad_norm": 1.7298029661178589, + "learning_rate": 4.921312865966933e-05, + "loss": 4.7808, + "step": 13465 + }, + { + "epoch": 0.08008611666190885, + "grad_norm": 1.5497710704803467, + "learning_rate": 4.921301238726966e-05, + "loss": 4.8228, + "step": 13466 + }, + { + "epoch": 0.08009206394518983, + "grad_norm": 1.4589923620224, + "learning_rate": 4.92128961064175e-05, + "loss": 4.757, + "step": 13467 + }, + { + "epoch": 0.08009801122847084, + "grad_norm": 1.6503071784973145, + "learning_rate": 4.921277981711286e-05, + "loss": 4.6074, + "step": 13468 + }, + { + "epoch": 0.08010395851175184, + "grad_norm": 1.621209979057312, + "learning_rate": 4.921266351935578e-05, + "loss": 4.6338, + "step": 13469 + }, + { + "epoch": 0.08010990579503283, + "grad_norm": 1.6513469219207764, + "learning_rate": 4.921254721314632e-05, + "loss": 4.7399, + "step": 13470 + }, + { + "epoch": 0.08011585307831383, + "grad_norm": 1.5691003799438477, + "learning_rate": 4.9212430898484505e-05, + "loss": 4.8002, + "step": 13471 + }, + { + "epoch": 0.08012180036159482, + "grad_norm": 1.6764090061187744, + "learning_rate": 4.921231457537039e-05, + "loss": 4.7913, + "step": 13472 + }, + { + "epoch": 0.08012774764487582, + "grad_norm": 1.5193006992340088, + "learning_rate": 4.9212198243804e-05, + "loss": 4.8346, + "step": 13473 + }, + { + "epoch": 0.08013369492815682, + "grad_norm": 1.722706913948059, + "learning_rate": 4.921208190378538e-05, + "loss": 4.6969, + "step": 13474 + }, + { + "epoch": 0.08013964221143781, + "grad_norm": 1.6551017761230469, + "learning_rate": 4.921196555531457e-05, + "loss": 4.6504, + "step": 13475 + }, + { + "epoch": 0.08014558949471881, + "grad_norm": 1.462902307510376, + "learning_rate": 4.921184919839162e-05, + "loss": 4.7678, + "step": 13476 + }, + { + "epoch": 0.08015153677799981, + "grad_norm": 1.4332460165023804, + "learning_rate": 4.9211732833016554e-05, + "loss": 4.7563, + "step": 13477 + }, + { + "epoch": 0.0801574840612808, + "grad_norm": 1.466042160987854, + "learning_rate": 4.9211616459189434e-05, + "loss": 4.7071, + "step": 13478 + }, + { + "epoch": 0.0801634313445618, + "grad_norm": 1.5814018249511719, + "learning_rate": 4.9211500076910275e-05, + "loss": 4.7497, + "step": 13479 + }, + { + "epoch": 0.0801693786278428, + "grad_norm": 1.5666007995605469, + "learning_rate": 4.921138368617915e-05, + "loss": 4.7757, + "step": 13480 + }, + { + "epoch": 0.0801753259111238, + "grad_norm": 1.6804678440093994, + "learning_rate": 4.9211267286996064e-05, + "loss": 4.6921, + "step": 13481 + }, + { + "epoch": 0.0801812731944048, + "grad_norm": 1.6126580238342285, + "learning_rate": 4.921115087936108e-05, + "loss": 4.746, + "step": 13482 + }, + { + "epoch": 0.0801872204776858, + "grad_norm": 1.5597195625305176, + "learning_rate": 4.9211034463274235e-05, + "loss": 4.8135, + "step": 13483 + }, + { + "epoch": 0.08019316776096679, + "grad_norm": 1.4779510498046875, + "learning_rate": 4.9210918038735565e-05, + "loss": 4.9011, + "step": 13484 + }, + { + "epoch": 0.08019911504424779, + "grad_norm": 1.449723243713379, + "learning_rate": 4.921080160574512e-05, + "loss": 4.648, + "step": 13485 + }, + { + "epoch": 0.08020506232752879, + "grad_norm": 1.609134554862976, + "learning_rate": 4.921068516430293e-05, + "loss": 4.6809, + "step": 13486 + }, + { + "epoch": 0.08021100961080978, + "grad_norm": 1.5483453273773193, + "learning_rate": 4.921056871440905e-05, + "loss": 4.7247, + "step": 13487 + }, + { + "epoch": 0.08021695689409078, + "grad_norm": 1.5850282907485962, + "learning_rate": 4.921045225606349e-05, + "loss": 4.6378, + "step": 13488 + }, + { + "epoch": 0.08022290417737178, + "grad_norm": 1.746030569076538, + "learning_rate": 4.9210335789266325e-05, + "loss": 4.6986, + "step": 13489 + }, + { + "epoch": 0.08022885146065277, + "grad_norm": 1.5930465459823608, + "learning_rate": 4.921021931401758e-05, + "loss": 4.6339, + "step": 13490 + }, + { + "epoch": 0.08023479874393377, + "grad_norm": 1.5435012578964233, + "learning_rate": 4.92101028303173e-05, + "loss": 4.5761, + "step": 13491 + }, + { + "epoch": 0.08024074602721477, + "grad_norm": 1.8166500329971313, + "learning_rate": 4.920998633816552e-05, + "loss": 4.5668, + "step": 13492 + }, + { + "epoch": 0.08024669331049576, + "grad_norm": 1.659976601600647, + "learning_rate": 4.920986983756228e-05, + "loss": 4.7431, + "step": 13493 + }, + { + "epoch": 0.08025264059377676, + "grad_norm": 1.6075677871704102, + "learning_rate": 4.920975332850762e-05, + "loss": 4.7744, + "step": 13494 + }, + { + "epoch": 0.08025858787705777, + "grad_norm": 1.6895835399627686, + "learning_rate": 4.9209636811001605e-05, + "loss": 4.638, + "step": 13495 + }, + { + "epoch": 0.08026453516033875, + "grad_norm": 1.4848902225494385, + "learning_rate": 4.9209520285044244e-05, + "loss": 4.7314, + "step": 13496 + }, + { + "epoch": 0.08027048244361976, + "grad_norm": 1.6041605472564697, + "learning_rate": 4.920940375063559e-05, + "loss": 4.7329, + "step": 13497 + }, + { + "epoch": 0.08027642972690076, + "grad_norm": 1.5055692195892334, + "learning_rate": 4.920928720777568e-05, + "loss": 4.721, + "step": 13498 + }, + { + "epoch": 0.08028237701018175, + "grad_norm": 1.3238314390182495, + "learning_rate": 4.920917065646456e-05, + "loss": 5.3071, + "step": 13499 + }, + { + "epoch": 0.08028832429346275, + "grad_norm": 1.463626742362976, + "learning_rate": 4.9209054096702266e-05, + "loss": 5.1885, + "step": 13500 + }, + { + "epoch": 0.08029427157674375, + "grad_norm": 1.4844539165496826, + "learning_rate": 4.9208937528488844e-05, + "loss": 5.2873, + "step": 13501 + }, + { + "epoch": 0.08030021886002474, + "grad_norm": 1.5207467079162598, + "learning_rate": 4.920882095182434e-05, + "loss": 5.1049, + "step": 13502 + }, + { + "epoch": 0.08030616614330574, + "grad_norm": 1.3113683462142944, + "learning_rate": 4.920870436670878e-05, + "loss": 5.1821, + "step": 13503 + }, + { + "epoch": 0.08031211342658673, + "grad_norm": 1.3822054862976074, + "learning_rate": 4.920858777314221e-05, + "loss": 5.1467, + "step": 13504 + }, + { + "epoch": 0.08031806070986773, + "grad_norm": 1.7611572742462158, + "learning_rate": 4.920847117112467e-05, + "loss": 5.0616, + "step": 13505 + }, + { + "epoch": 0.08032400799314873, + "grad_norm": 1.632802963256836, + "learning_rate": 4.920835456065621e-05, + "loss": 5.1535, + "step": 13506 + }, + { + "epoch": 0.08032995527642972, + "grad_norm": 1.6254185438156128, + "learning_rate": 4.920823794173686e-05, + "loss": 5.211, + "step": 13507 + }, + { + "epoch": 0.08033590255971072, + "grad_norm": 1.4769513607025146, + "learning_rate": 4.920812131436666e-05, + "loss": 5.0879, + "step": 13508 + }, + { + "epoch": 0.08034184984299172, + "grad_norm": 1.531504511833191, + "learning_rate": 4.920800467854566e-05, + "loss": 4.9068, + "step": 13509 + }, + { + "epoch": 0.08034779712627271, + "grad_norm": 1.6325825452804565, + "learning_rate": 4.9207888034273895e-05, + "loss": 5.0463, + "step": 13510 + }, + { + "epoch": 0.08035374440955372, + "grad_norm": 1.3797351121902466, + "learning_rate": 4.9207771381551406e-05, + "loss": 5.0644, + "step": 13511 + }, + { + "epoch": 0.08035969169283472, + "grad_norm": 1.7325141429901123, + "learning_rate": 4.920765472037823e-05, + "loss": 4.9095, + "step": 13512 + }, + { + "epoch": 0.0803656389761157, + "grad_norm": 1.3197063207626343, + "learning_rate": 4.920753805075442e-05, + "loss": 5.1837, + "step": 13513 + }, + { + "epoch": 0.08037158625939671, + "grad_norm": 1.532212734222412, + "learning_rate": 4.9207421372680006e-05, + "loss": 5.1011, + "step": 13514 + }, + { + "epoch": 0.08037753354267771, + "grad_norm": 1.2958672046661377, + "learning_rate": 4.9207304686155034e-05, + "loss": 5.1349, + "step": 13515 + }, + { + "epoch": 0.0803834808259587, + "grad_norm": 2.914010524749756, + "learning_rate": 4.9207187991179533e-05, + "loss": 5.4637, + "step": 13516 + }, + { + "epoch": 0.0803894281092397, + "grad_norm": 1.490577220916748, + "learning_rate": 4.920707128775356e-05, + "loss": 5.2322, + "step": 13517 + }, + { + "epoch": 0.0803953753925207, + "grad_norm": 1.5756994485855103, + "learning_rate": 4.920695457587714e-05, + "loss": 5.1501, + "step": 13518 + }, + { + "epoch": 0.08040132267580169, + "grad_norm": 1.7483723163604736, + "learning_rate": 4.920683785555033e-05, + "loss": 5.131, + "step": 13519 + }, + { + "epoch": 0.08040726995908269, + "grad_norm": 1.426866054534912, + "learning_rate": 4.920672112677316e-05, + "loss": 5.5304, + "step": 13520 + }, + { + "epoch": 0.0804132172423637, + "grad_norm": 1.3744142055511475, + "learning_rate": 4.920660438954568e-05, + "loss": 5.1042, + "step": 13521 + }, + { + "epoch": 0.08041916452564468, + "grad_norm": 1.5924170017242432, + "learning_rate": 4.9206487643867916e-05, + "loss": 5.261, + "step": 13522 + }, + { + "epoch": 0.08042511180892568, + "grad_norm": 1.566296935081482, + "learning_rate": 4.920637088973992e-05, + "loss": 5.0451, + "step": 13523 + }, + { + "epoch": 0.08043105909220669, + "grad_norm": 1.4542006254196167, + "learning_rate": 4.9206254127161734e-05, + "loss": 5.0351, + "step": 13524 + }, + { + "epoch": 0.08043700637548767, + "grad_norm": 1.4084336757659912, + "learning_rate": 4.920613735613339e-05, + "loss": 5.1177, + "step": 13525 + }, + { + "epoch": 0.08044295365876868, + "grad_norm": 1.5498062372207642, + "learning_rate": 4.920602057665493e-05, + "loss": 4.9068, + "step": 13526 + }, + { + "epoch": 0.08044890094204968, + "grad_norm": 1.4482768774032593, + "learning_rate": 4.920590378872641e-05, + "loss": 4.9393, + "step": 13527 + }, + { + "epoch": 0.08045484822533067, + "grad_norm": 1.4438153505325317, + "learning_rate": 4.920578699234785e-05, + "loss": 5.0109, + "step": 13528 + }, + { + "epoch": 0.08046079550861167, + "grad_norm": 1.5769532918930054, + "learning_rate": 4.9205670187519305e-05, + "loss": 4.916, + "step": 13529 + }, + { + "epoch": 0.08046674279189267, + "grad_norm": 1.6127451658248901, + "learning_rate": 4.9205553374240806e-05, + "loss": 5.0038, + "step": 13530 + }, + { + "epoch": 0.08047269007517366, + "grad_norm": 1.5733160972595215, + "learning_rate": 4.92054365525124e-05, + "loss": 5.2705, + "step": 13531 + }, + { + "epoch": 0.08047863735845466, + "grad_norm": 1.956769585609436, + "learning_rate": 4.920531972233413e-05, + "loss": 5.0572, + "step": 13532 + }, + { + "epoch": 0.08048458464173565, + "grad_norm": 1.614670753479004, + "learning_rate": 4.9205202883706025e-05, + "loss": 5.0323, + "step": 13533 + }, + { + "epoch": 0.08049053192501665, + "grad_norm": 1.3706777095794678, + "learning_rate": 4.920508603662814e-05, + "loss": 5.1335, + "step": 13534 + }, + { + "epoch": 0.08049647920829765, + "grad_norm": 1.5787118673324585, + "learning_rate": 4.9204969181100505e-05, + "loss": 4.9626, + "step": 13535 + }, + { + "epoch": 0.08050242649157864, + "grad_norm": 1.6258914470672607, + "learning_rate": 4.9204852317123175e-05, + "loss": 5.1592, + "step": 13536 + }, + { + "epoch": 0.08050837377485964, + "grad_norm": 1.662347435951233, + "learning_rate": 4.920473544469617e-05, + "loss": 5.053, + "step": 13537 + }, + { + "epoch": 0.08051432105814064, + "grad_norm": 1.8060719966888428, + "learning_rate": 4.920461856381955e-05, + "loss": 5.0823, + "step": 13538 + }, + { + "epoch": 0.08052026834142163, + "grad_norm": 1.7381904125213623, + "learning_rate": 4.920450167449334e-05, + "loss": 4.7485, + "step": 13539 + }, + { + "epoch": 0.08052621562470264, + "grad_norm": 1.838526964187622, + "learning_rate": 4.9204384776717594e-05, + "loss": 5.1404, + "step": 13540 + }, + { + "epoch": 0.08053216290798364, + "grad_norm": 1.8131240606307983, + "learning_rate": 4.920426787049234e-05, + "loss": 5.2337, + "step": 13541 + }, + { + "epoch": 0.08053811019126463, + "grad_norm": 1.7523903846740723, + "learning_rate": 4.9204150955817635e-05, + "loss": 5.2375, + "step": 13542 + }, + { + "epoch": 0.08054405747454563, + "grad_norm": 1.5962380170822144, + "learning_rate": 4.9204034032693505e-05, + "loss": 5.1667, + "step": 13543 + }, + { + "epoch": 0.08055000475782663, + "grad_norm": 1.566009283065796, + "learning_rate": 4.920391710112e-05, + "loss": 5.1105, + "step": 13544 + }, + { + "epoch": 0.08055595204110762, + "grad_norm": 1.6253767013549805, + "learning_rate": 4.920380016109716e-05, + "loss": 5.2942, + "step": 13545 + }, + { + "epoch": 0.08056189932438862, + "grad_norm": 1.538004994392395, + "learning_rate": 4.920368321262502e-05, + "loss": 5.1847, + "step": 13546 + }, + { + "epoch": 0.08056784660766962, + "grad_norm": 1.6407667398452759, + "learning_rate": 4.9203566255703625e-05, + "loss": 5.1368, + "step": 13547 + }, + { + "epoch": 0.08057379389095061, + "grad_norm": 1.5777368545532227, + "learning_rate": 4.9203449290333016e-05, + "loss": 5.1507, + "step": 13548 + }, + { + "epoch": 0.08057974117423161, + "grad_norm": 1.5601979494094849, + "learning_rate": 4.920333231651323e-05, + "loss": 5.0926, + "step": 13549 + }, + { + "epoch": 0.08058568845751261, + "grad_norm": 1.4342397451400757, + "learning_rate": 4.9203215334244315e-05, + "loss": 4.9536, + "step": 13550 + }, + { + "epoch": 0.0805916357407936, + "grad_norm": 1.6202988624572754, + "learning_rate": 4.9203098343526305e-05, + "loss": 4.9009, + "step": 13551 + }, + { + "epoch": 0.0805975830240746, + "grad_norm": 1.4504165649414062, + "learning_rate": 4.9202981344359243e-05, + "loss": 5.3843, + "step": 13552 + }, + { + "epoch": 0.0806035303073556, + "grad_norm": 1.6187599897384644, + "learning_rate": 4.920286433674317e-05, + "loss": 5.3396, + "step": 13553 + }, + { + "epoch": 0.0806094775906366, + "grad_norm": 1.6162225008010864, + "learning_rate": 4.920274732067813e-05, + "loss": 5.3163, + "step": 13554 + }, + { + "epoch": 0.0806154248739176, + "grad_norm": 1.6445814371109009, + "learning_rate": 4.920263029616416e-05, + "loss": 5.207, + "step": 13555 + }, + { + "epoch": 0.0806213721571986, + "grad_norm": 1.5133748054504395, + "learning_rate": 4.9202513263201296e-05, + "loss": 5.4284, + "step": 13556 + }, + { + "epoch": 0.08062731944047959, + "grad_norm": 1.5004390478134155, + "learning_rate": 4.920239622178959e-05, + "loss": 5.0013, + "step": 13557 + }, + { + "epoch": 0.08063326672376059, + "grad_norm": 1.6617141962051392, + "learning_rate": 4.920227917192908e-05, + "loss": 5.346, + "step": 13558 + }, + { + "epoch": 0.08063921400704159, + "grad_norm": 1.5505567789077759, + "learning_rate": 4.92021621136198e-05, + "loss": 5.2799, + "step": 13559 + }, + { + "epoch": 0.08064516129032258, + "grad_norm": 1.5264419317245483, + "learning_rate": 4.92020450468618e-05, + "loss": 5.1277, + "step": 13560 + }, + { + "epoch": 0.08065110857360358, + "grad_norm": 1.6758075952529907, + "learning_rate": 4.920192797165511e-05, + "loss": 5.2519, + "step": 13561 + }, + { + "epoch": 0.08065705585688457, + "grad_norm": 1.5858482122421265, + "learning_rate": 4.920181088799978e-05, + "loss": 5.3231, + "step": 13562 + }, + { + "epoch": 0.08066300314016557, + "grad_norm": 1.5122928619384766, + "learning_rate": 4.920169379589585e-05, + "loss": 5.1791, + "step": 13563 + }, + { + "epoch": 0.08066895042344657, + "grad_norm": 1.4593915939331055, + "learning_rate": 4.9201576695343354e-05, + "loss": 5.0555, + "step": 13564 + }, + { + "epoch": 0.08067489770672756, + "grad_norm": 1.6524077653884888, + "learning_rate": 4.9201459586342336e-05, + "loss": 5.1981, + "step": 13565 + }, + { + "epoch": 0.08068084499000856, + "grad_norm": 1.5063152313232422, + "learning_rate": 4.920134246889285e-05, + "loss": 5.0406, + "step": 13566 + }, + { + "epoch": 0.08068679227328956, + "grad_norm": 1.3544602394104004, + "learning_rate": 4.9201225342994914e-05, + "loss": 5.0385, + "step": 13567 + }, + { + "epoch": 0.08069273955657055, + "grad_norm": 1.5672118663787842, + "learning_rate": 4.920110820864858e-05, + "loss": 5.2393, + "step": 13568 + }, + { + "epoch": 0.08069868683985155, + "grad_norm": 1.5031840801239014, + "learning_rate": 4.92009910658539e-05, + "loss": 5.1584, + "step": 13569 + }, + { + "epoch": 0.08070463412313256, + "grad_norm": 1.682307243347168, + "learning_rate": 4.920087391461089e-05, + "loss": 4.8473, + "step": 13570 + }, + { + "epoch": 0.08071058140641355, + "grad_norm": 1.5047411918640137, + "learning_rate": 4.9200756754919616e-05, + "loss": 4.8286, + "step": 13571 + }, + { + "epoch": 0.08071652868969455, + "grad_norm": 1.4234607219696045, + "learning_rate": 4.920063958678011e-05, + "loss": 4.8309, + "step": 13572 + }, + { + "epoch": 0.08072247597297555, + "grad_norm": 1.5061196088790894, + "learning_rate": 4.920052241019239e-05, + "loss": 5.0132, + "step": 13573 + }, + { + "epoch": 0.08072842325625654, + "grad_norm": 1.5565897226333618, + "learning_rate": 4.920040522515654e-05, + "loss": 4.9357, + "step": 13574 + }, + { + "epoch": 0.08073437053953754, + "grad_norm": 1.442288875579834, + "learning_rate": 4.920028803167257e-05, + "loss": 4.7943, + "step": 13575 + }, + { + "epoch": 0.08074031782281854, + "grad_norm": 1.6255996227264404, + "learning_rate": 4.9200170829740534e-05, + "loss": 4.824, + "step": 13576 + }, + { + "epoch": 0.08074626510609953, + "grad_norm": 1.7027612924575806, + "learning_rate": 4.920005361936047e-05, + "loss": 5.1223, + "step": 13577 + }, + { + "epoch": 0.08075221238938053, + "grad_norm": 2.5931310653686523, + "learning_rate": 4.919993640053241e-05, + "loss": 5.3487, + "step": 13578 + }, + { + "epoch": 0.08075815967266153, + "grad_norm": 1.5481868982315063, + "learning_rate": 4.91998191732564e-05, + "loss": 5.0844, + "step": 13579 + }, + { + "epoch": 0.08076410695594252, + "grad_norm": 1.3663432598114014, + "learning_rate": 4.919970193753248e-05, + "loss": 5.2151, + "step": 13580 + }, + { + "epoch": 0.08077005423922352, + "grad_norm": 1.4602998495101929, + "learning_rate": 4.919958469336071e-05, + "loss": 5.3133, + "step": 13581 + }, + { + "epoch": 0.08077600152250453, + "grad_norm": 1.6350071430206299, + "learning_rate": 4.919946744074111e-05, + "loss": 5.5026, + "step": 13582 + }, + { + "epoch": 0.08078194880578551, + "grad_norm": 1.4492799043655396, + "learning_rate": 4.919935017967372e-05, + "loss": 5.4211, + "step": 13583 + }, + { + "epoch": 0.08078789608906652, + "grad_norm": 1.398373007774353, + "learning_rate": 4.919923291015859e-05, + "loss": 5.2947, + "step": 13584 + }, + { + "epoch": 0.08079384337234752, + "grad_norm": 1.543583869934082, + "learning_rate": 4.9199115632195755e-05, + "loss": 5.0361, + "step": 13585 + }, + { + "epoch": 0.0807997906556285, + "grad_norm": 1.7753655910491943, + "learning_rate": 4.9198998345785265e-05, + "loss": 5.1897, + "step": 13586 + }, + { + "epoch": 0.08080573793890951, + "grad_norm": 1.668168544769287, + "learning_rate": 4.919888105092715e-05, + "loss": 5.3786, + "step": 13587 + }, + { + "epoch": 0.08081168522219051, + "grad_norm": 1.3956975936889648, + "learning_rate": 4.919876374762145e-05, + "loss": 5.4662, + "step": 13588 + }, + { + "epoch": 0.0808176325054715, + "grad_norm": 1.3362425565719604, + "learning_rate": 4.9198646435868226e-05, + "loss": 5.4723, + "step": 13589 + }, + { + "epoch": 0.0808235797887525, + "grad_norm": 1.3419675827026367, + "learning_rate": 4.919852911566749e-05, + "loss": 5.3888, + "step": 13590 + }, + { + "epoch": 0.08082952707203349, + "grad_norm": 1.5144484043121338, + "learning_rate": 4.9198411787019304e-05, + "loss": 5.292, + "step": 13591 + }, + { + "epoch": 0.08083547435531449, + "grad_norm": 1.4561097621917725, + "learning_rate": 4.91982944499237e-05, + "loss": 5.3688, + "step": 13592 + }, + { + "epoch": 0.08084142163859549, + "grad_norm": 1.4536436796188354, + "learning_rate": 4.919817710438073e-05, + "loss": 5.3606, + "step": 13593 + }, + { + "epoch": 0.08084736892187648, + "grad_norm": 1.3266935348510742, + "learning_rate": 4.919805975039041e-05, + "loss": 5.3999, + "step": 13594 + }, + { + "epoch": 0.08085331620515748, + "grad_norm": 1.4032717943191528, + "learning_rate": 4.919794238795281e-05, + "loss": 5.3494, + "step": 13595 + }, + { + "epoch": 0.08085926348843848, + "grad_norm": 1.6235400438308716, + "learning_rate": 4.919782501706796e-05, + "loss": 5.1499, + "step": 13596 + }, + { + "epoch": 0.08086521077171947, + "grad_norm": 1.349752426147461, + "learning_rate": 4.919770763773589e-05, + "loss": 5.3599, + "step": 13597 + }, + { + "epoch": 0.08087115805500047, + "grad_norm": 1.9415758848190308, + "learning_rate": 4.919759024995666e-05, + "loss": 5.3427, + "step": 13598 + }, + { + "epoch": 0.08087710533828148, + "grad_norm": 1.688825249671936, + "learning_rate": 4.9197472853730296e-05, + "loss": 5.2918, + "step": 13599 + }, + { + "epoch": 0.08088305262156247, + "grad_norm": 1.55258309841156, + "learning_rate": 4.919735544905685e-05, + "loss": 5.3016, + "step": 13600 + }, + { + "epoch": 0.08088899990484347, + "grad_norm": 1.3860005140304565, + "learning_rate": 4.919723803593634e-05, + "loss": 5.3049, + "step": 13601 + }, + { + "epoch": 0.08089494718812447, + "grad_norm": 1.289819359779358, + "learning_rate": 4.919712061436884e-05, + "loss": 5.1657, + "step": 13602 + }, + { + "epoch": 0.08090089447140546, + "grad_norm": 1.5799275636672974, + "learning_rate": 4.9197003184354375e-05, + "loss": 5.2638, + "step": 13603 + }, + { + "epoch": 0.08090684175468646, + "grad_norm": 1.5292985439300537, + "learning_rate": 4.919688574589299e-05, + "loss": 5.2643, + "step": 13604 + }, + { + "epoch": 0.08091278903796746, + "grad_norm": 1.6338304281234741, + "learning_rate": 4.919676829898471e-05, + "loss": 5.2377, + "step": 13605 + }, + { + "epoch": 0.08091873632124845, + "grad_norm": 1.7117339372634888, + "learning_rate": 4.919665084362959e-05, + "loss": 5.262, + "step": 13606 + }, + { + "epoch": 0.08092468360452945, + "grad_norm": 1.606644868850708, + "learning_rate": 4.919653337982767e-05, + "loss": 5.2308, + "step": 13607 + }, + { + "epoch": 0.08093063088781045, + "grad_norm": 1.5751184225082397, + "learning_rate": 4.9196415907578994e-05, + "loss": 5.1455, + "step": 13608 + }, + { + "epoch": 0.08093657817109144, + "grad_norm": 1.7105200290679932, + "learning_rate": 4.9196298426883595e-05, + "loss": 5.2608, + "step": 13609 + }, + { + "epoch": 0.08094252545437244, + "grad_norm": 1.4504178762435913, + "learning_rate": 4.919618093774152e-05, + "loss": 5.3592, + "step": 13610 + }, + { + "epoch": 0.08094847273765345, + "grad_norm": 1.2036757469177246, + "learning_rate": 4.9196063440152804e-05, + "loss": 5.3256, + "step": 13611 + }, + { + "epoch": 0.08095442002093443, + "grad_norm": 1.4795072078704834, + "learning_rate": 4.9195945934117507e-05, + "loss": 5.2968, + "step": 13612 + }, + { + "epoch": 0.08096036730421544, + "grad_norm": 1.2796508073806763, + "learning_rate": 4.9195828419635644e-05, + "loss": 5.1288, + "step": 13613 + }, + { + "epoch": 0.08096631458749644, + "grad_norm": 1.4119127988815308, + "learning_rate": 4.9195710896707264e-05, + "loss": 5.3238, + "step": 13614 + }, + { + "epoch": 0.08097226187077743, + "grad_norm": 1.618862509727478, + "learning_rate": 4.919559336533241e-05, + "loss": 5.301, + "step": 13615 + }, + { + "epoch": 0.08097820915405843, + "grad_norm": 1.5049046277999878, + "learning_rate": 4.919547582551114e-05, + "loss": 5.3395, + "step": 13616 + }, + { + "epoch": 0.08098415643733943, + "grad_norm": 1.3821018934249878, + "learning_rate": 4.9195358277243464e-05, + "loss": 5.4033, + "step": 13617 + }, + { + "epoch": 0.08099010372062042, + "grad_norm": 1.4585113525390625, + "learning_rate": 4.9195240720529446e-05, + "loss": 5.3098, + "step": 13618 + }, + { + "epoch": 0.08099605100390142, + "grad_norm": 1.5766072273254395, + "learning_rate": 4.9195123155369114e-05, + "loss": 5.2672, + "step": 13619 + }, + { + "epoch": 0.08100199828718241, + "grad_norm": 1.5132715702056885, + "learning_rate": 4.919500558176252e-05, + "loss": 5.1707, + "step": 13620 + }, + { + "epoch": 0.08100794557046341, + "grad_norm": 1.594093918800354, + "learning_rate": 4.91948879997097e-05, + "loss": 5.2988, + "step": 13621 + }, + { + "epoch": 0.08101389285374441, + "grad_norm": 1.529877781867981, + "learning_rate": 4.919477040921069e-05, + "loss": 5.4418, + "step": 13622 + }, + { + "epoch": 0.0810198401370254, + "grad_norm": 1.4329211711883545, + "learning_rate": 4.919465281026554e-05, + "loss": 5.308, + "step": 13623 + }, + { + "epoch": 0.0810257874203064, + "grad_norm": 1.4308300018310547, + "learning_rate": 4.919453520287428e-05, + "loss": 5.259, + "step": 13624 + }, + { + "epoch": 0.0810317347035874, + "grad_norm": 1.248282790184021, + "learning_rate": 4.919441758703697e-05, + "loss": 5.2129, + "step": 13625 + }, + { + "epoch": 0.08103768198686839, + "grad_norm": 1.4535733461380005, + "learning_rate": 4.919429996275363e-05, + "loss": 5.1989, + "step": 13626 + }, + { + "epoch": 0.0810436292701494, + "grad_norm": 1.6055153608322144, + "learning_rate": 4.9194182330024306e-05, + "loss": 5.1669, + "step": 13627 + }, + { + "epoch": 0.0810495765534304, + "grad_norm": 1.6016899347305298, + "learning_rate": 4.919406468884905e-05, + "loss": 5.1958, + "step": 13628 + }, + { + "epoch": 0.08105552383671139, + "grad_norm": 1.4217112064361572, + "learning_rate": 4.91939470392279e-05, + "loss": 4.9775, + "step": 13629 + }, + { + "epoch": 0.08106147111999239, + "grad_norm": 1.4405405521392822, + "learning_rate": 4.919382938116088e-05, + "loss": 5.1865, + "step": 13630 + }, + { + "epoch": 0.08106741840327339, + "grad_norm": 1.3826597929000854, + "learning_rate": 4.919371171464805e-05, + "loss": 5.1909, + "step": 13631 + }, + { + "epoch": 0.08107336568655438, + "grad_norm": 1.942305088043213, + "learning_rate": 4.919359403968944e-05, + "loss": 5.227, + "step": 13632 + }, + { + "epoch": 0.08107931296983538, + "grad_norm": 1.8932685852050781, + "learning_rate": 4.919347635628511e-05, + "loss": 5.3257, + "step": 13633 + }, + { + "epoch": 0.08108526025311638, + "grad_norm": 1.8511128425598145, + "learning_rate": 4.9193358664435074e-05, + "loss": 5.4229, + "step": 13634 + }, + { + "epoch": 0.08109120753639737, + "grad_norm": 1.6317822933197021, + "learning_rate": 4.919324096413939e-05, + "loss": 5.3067, + "step": 13635 + }, + { + "epoch": 0.08109715481967837, + "grad_norm": 1.835503101348877, + "learning_rate": 4.91931232553981e-05, + "loss": 5.3246, + "step": 13636 + }, + { + "epoch": 0.08110310210295937, + "grad_norm": 1.8521870374679565, + "learning_rate": 4.919300553821124e-05, + "loss": 5.3367, + "step": 13637 + }, + { + "epoch": 0.08110904938624036, + "grad_norm": 1.7814146280288696, + "learning_rate": 4.9192887812578844e-05, + "loss": 5.2949, + "step": 13638 + }, + { + "epoch": 0.08111499666952136, + "grad_norm": 1.6024845838546753, + "learning_rate": 4.919277007850097e-05, + "loss": 5.3159, + "step": 13639 + }, + { + "epoch": 0.08112094395280237, + "grad_norm": 2.955554246902466, + "learning_rate": 4.919265233597765e-05, + "loss": 4.8802, + "step": 13640 + }, + { + "epoch": 0.08112689123608335, + "grad_norm": 1.7217108011245728, + "learning_rate": 4.919253458500892e-05, + "loss": 5.08, + "step": 13641 + }, + { + "epoch": 0.08113283851936436, + "grad_norm": 1.686672329902649, + "learning_rate": 4.9192416825594825e-05, + "loss": 5.1349, + "step": 13642 + }, + { + "epoch": 0.08113878580264536, + "grad_norm": 1.5377975702285767, + "learning_rate": 4.9192299057735416e-05, + "loss": 5.1327, + "step": 13643 + }, + { + "epoch": 0.08114473308592635, + "grad_norm": 1.7383031845092773, + "learning_rate": 4.9192181281430716e-05, + "loss": 5.0938, + "step": 13644 + }, + { + "epoch": 0.08115068036920735, + "grad_norm": 1.6174112558364868, + "learning_rate": 4.919206349668077e-05, + "loss": 5.0123, + "step": 13645 + }, + { + "epoch": 0.08115662765248835, + "grad_norm": 1.5967239141464233, + "learning_rate": 4.9191945703485646e-05, + "loss": 5.0334, + "step": 13646 + }, + { + "epoch": 0.08116257493576934, + "grad_norm": 1.5330301523208618, + "learning_rate": 4.919182790184534e-05, + "loss": 5.1615, + "step": 13647 + }, + { + "epoch": 0.08116852221905034, + "grad_norm": 1.5532622337341309, + "learning_rate": 4.919171009175993e-05, + "loss": 5.1565, + "step": 13648 + }, + { + "epoch": 0.08117446950233133, + "grad_norm": 1.4814139604568481, + "learning_rate": 4.919159227322945e-05, + "loss": 5.0991, + "step": 13649 + }, + { + "epoch": 0.08118041678561233, + "grad_norm": 1.2586545944213867, + "learning_rate": 4.919147444625392e-05, + "loss": 5.2482, + "step": 13650 + }, + { + "epoch": 0.08118636406889333, + "grad_norm": 1.5292212963104248, + "learning_rate": 4.91913566108334e-05, + "loss": 5.1787, + "step": 13651 + }, + { + "epoch": 0.08119231135217432, + "grad_norm": 1.5354405641555786, + "learning_rate": 4.919123876696793e-05, + "loss": 5.0046, + "step": 13652 + }, + { + "epoch": 0.08119825863545532, + "grad_norm": 1.3921040296554565, + "learning_rate": 4.919112091465755e-05, + "loss": 5.2199, + "step": 13653 + }, + { + "epoch": 0.08120420591873632, + "grad_norm": 1.471068263053894, + "learning_rate": 4.91910030539023e-05, + "loss": 5.0445, + "step": 13654 + }, + { + "epoch": 0.08121015320201731, + "grad_norm": 1.3318332433700562, + "learning_rate": 4.919088518470222e-05, + "loss": 5.1973, + "step": 13655 + }, + { + "epoch": 0.08121610048529831, + "grad_norm": 1.5445464849472046, + "learning_rate": 4.919076730705735e-05, + "loss": 5.4165, + "step": 13656 + }, + { + "epoch": 0.08122204776857932, + "grad_norm": 1.3854666948318481, + "learning_rate": 4.9190649420967735e-05, + "loss": 5.336, + "step": 13657 + }, + { + "epoch": 0.0812279950518603, + "grad_norm": 1.4703121185302734, + "learning_rate": 4.919053152643342e-05, + "loss": 5.4837, + "step": 13658 + }, + { + "epoch": 0.08123394233514131, + "grad_norm": 1.3189783096313477, + "learning_rate": 4.9190413623454425e-05, + "loss": 5.4163, + "step": 13659 + }, + { + "epoch": 0.08123988961842231, + "grad_norm": 1.469601035118103, + "learning_rate": 4.919029571203081e-05, + "loss": 5.2772, + "step": 13660 + }, + { + "epoch": 0.0812458369017033, + "grad_norm": 1.4215590953826904, + "learning_rate": 4.919017779216262e-05, + "loss": 5.5008, + "step": 13661 + }, + { + "epoch": 0.0812517841849843, + "grad_norm": 1.577255129814148, + "learning_rate": 4.919005986384989e-05, + "loss": 5.2565, + "step": 13662 + }, + { + "epoch": 0.0812577314682653, + "grad_norm": 1.5910719633102417, + "learning_rate": 4.918994192709265e-05, + "loss": 5.1143, + "step": 13663 + }, + { + "epoch": 0.08126367875154629, + "grad_norm": 1.5665141344070435, + "learning_rate": 4.9189823981890964e-05, + "loss": 5.1911, + "step": 13664 + }, + { + "epoch": 0.08126962603482729, + "grad_norm": 1.6348809003829956, + "learning_rate": 4.918970602824485e-05, + "loss": 5.2257, + "step": 13665 + }, + { + "epoch": 0.0812755733181083, + "grad_norm": 1.4213917255401611, + "learning_rate": 4.9189588066154365e-05, + "loss": 5.0528, + "step": 13666 + }, + { + "epoch": 0.08128152060138928, + "grad_norm": 1.497758388519287, + "learning_rate": 4.918947009561955e-05, + "loss": 5.2421, + "step": 13667 + }, + { + "epoch": 0.08128746788467028, + "grad_norm": 1.4052904844284058, + "learning_rate": 4.918935211664043e-05, + "loss": 5.5054, + "step": 13668 + }, + { + "epoch": 0.08129341516795129, + "grad_norm": 1.5615813732147217, + "learning_rate": 4.9189234129217064e-05, + "loss": 5.2711, + "step": 13669 + }, + { + "epoch": 0.08129936245123227, + "grad_norm": 1.2366914749145508, + "learning_rate": 4.9189116133349485e-05, + "loss": 5.4035, + "step": 13670 + }, + { + "epoch": 0.08130530973451328, + "grad_norm": 1.5328080654144287, + "learning_rate": 4.918899812903773e-05, + "loss": 5.3269, + "step": 13671 + }, + { + "epoch": 0.08131125701779428, + "grad_norm": 1.6515448093414307, + "learning_rate": 4.918888011628185e-05, + "loss": 5.1734, + "step": 13672 + }, + { + "epoch": 0.08131720430107527, + "grad_norm": 1.385549783706665, + "learning_rate": 4.918876209508188e-05, + "loss": 5.3769, + "step": 13673 + }, + { + "epoch": 0.08132315158435627, + "grad_norm": 1.4133338928222656, + "learning_rate": 4.9188644065437875e-05, + "loss": 5.2607, + "step": 13674 + }, + { + "epoch": 0.08132909886763727, + "grad_norm": 1.6652443408966064, + "learning_rate": 4.918852602734984e-05, + "loss": 5.3939, + "step": 13675 + }, + { + "epoch": 0.08133504615091826, + "grad_norm": 1.455493450164795, + "learning_rate": 4.918840798081786e-05, + "loss": 5.3051, + "step": 13676 + }, + { + "epoch": 0.08134099343419926, + "grad_norm": 1.5490756034851074, + "learning_rate": 4.918828992584196e-05, + "loss": 5.4309, + "step": 13677 + }, + { + "epoch": 0.08134694071748025, + "grad_norm": 1.5857222080230713, + "learning_rate": 4.918817186242216e-05, + "loss": 5.1158, + "step": 13678 + }, + { + "epoch": 0.08135288800076125, + "grad_norm": 1.6051661968231201, + "learning_rate": 4.918805379055853e-05, + "loss": 5.2668, + "step": 13679 + }, + { + "epoch": 0.08135883528404225, + "grad_norm": 1.6476162672042847, + "learning_rate": 4.91879357102511e-05, + "loss": 5.2367, + "step": 13680 + }, + { + "epoch": 0.08136478256732324, + "grad_norm": 1.4255136251449585, + "learning_rate": 4.918781762149991e-05, + "loss": 5.0348, + "step": 13681 + }, + { + "epoch": 0.08137072985060424, + "grad_norm": 1.4585214853286743, + "learning_rate": 4.9187699524305e-05, + "loss": 5.2323, + "step": 13682 + }, + { + "epoch": 0.08137667713388524, + "grad_norm": 1.3733863830566406, + "learning_rate": 4.9187581418666415e-05, + "loss": 5.0898, + "step": 13683 + }, + { + "epoch": 0.08138262441716623, + "grad_norm": 1.5789494514465332, + "learning_rate": 4.91874633045842e-05, + "loss": 5.0886, + "step": 13684 + }, + { + "epoch": 0.08138857170044723, + "grad_norm": 1.4390051364898682, + "learning_rate": 4.918734518205839e-05, + "loss": 5.4305, + "step": 13685 + }, + { + "epoch": 0.08139451898372824, + "grad_norm": 1.8984171152114868, + "learning_rate": 4.9187227051089025e-05, + "loss": 5.0593, + "step": 13686 + }, + { + "epoch": 0.08140046626700922, + "grad_norm": 1.940045714378357, + "learning_rate": 4.918710891167615e-05, + "loss": 5.3115, + "step": 13687 + }, + { + "epoch": 0.08140641355029023, + "grad_norm": 1.6479912996292114, + "learning_rate": 4.918699076381981e-05, + "loss": 5.1585, + "step": 13688 + }, + { + "epoch": 0.08141236083357123, + "grad_norm": 1.554114818572998, + "learning_rate": 4.918687260752003e-05, + "loss": 5.1581, + "step": 13689 + }, + { + "epoch": 0.08141830811685222, + "grad_norm": 1.6920353174209595, + "learning_rate": 4.9186754442776874e-05, + "loss": 5.2263, + "step": 13690 + }, + { + "epoch": 0.08142425540013322, + "grad_norm": 1.572787880897522, + "learning_rate": 4.9186636269590366e-05, + "loss": 5.1019, + "step": 13691 + }, + { + "epoch": 0.08143020268341422, + "grad_norm": 1.646004319190979, + "learning_rate": 4.918651808796055e-05, + "loss": 5.1426, + "step": 13692 + }, + { + "epoch": 0.08143614996669521, + "grad_norm": 1.578749179840088, + "learning_rate": 4.9186399897887475e-05, + "loss": 4.9682, + "step": 13693 + }, + { + "epoch": 0.08144209724997621, + "grad_norm": 1.7725828886032104, + "learning_rate": 4.918628169937118e-05, + "loss": 5.0772, + "step": 13694 + }, + { + "epoch": 0.08144804453325721, + "grad_norm": 1.808596134185791, + "learning_rate": 4.91861634924117e-05, + "loss": 5.077, + "step": 13695 + }, + { + "epoch": 0.0814539918165382, + "grad_norm": 1.8685991764068604, + "learning_rate": 4.9186045277009084e-05, + "loss": 5.1322, + "step": 13696 + }, + { + "epoch": 0.0814599390998192, + "grad_norm": 1.6144567728042603, + "learning_rate": 4.9185927053163366e-05, + "loss": 5.3354, + "step": 13697 + }, + { + "epoch": 0.0814658863831002, + "grad_norm": 1.767673373222351, + "learning_rate": 4.918580882087459e-05, + "loss": 5.0358, + "step": 13698 + }, + { + "epoch": 0.0814718336663812, + "grad_norm": 1.7151973247528076, + "learning_rate": 4.9185690580142805e-05, + "loss": 5.0371, + "step": 13699 + }, + { + "epoch": 0.0814777809496622, + "grad_norm": 1.710990071296692, + "learning_rate": 4.918557233096803e-05, + "loss": 4.9236, + "step": 13700 + }, + { + "epoch": 0.0814837282329432, + "grad_norm": 1.8118677139282227, + "learning_rate": 4.9185454073350335e-05, + "loss": 4.9112, + "step": 13701 + }, + { + "epoch": 0.08148967551622419, + "grad_norm": 2.0120832920074463, + "learning_rate": 4.918533580728974e-05, + "loss": 4.8201, + "step": 13702 + }, + { + "epoch": 0.08149562279950519, + "grad_norm": 1.742125153541565, + "learning_rate": 4.91852175327863e-05, + "loss": 5.0618, + "step": 13703 + }, + { + "epoch": 0.08150157008278619, + "grad_norm": 1.6496554613113403, + "learning_rate": 4.9185099249840054e-05, + "loss": 5.217, + "step": 13704 + }, + { + "epoch": 0.08150751736606718, + "grad_norm": 1.6782381534576416, + "learning_rate": 4.9184980958451034e-05, + "loss": 5.0362, + "step": 13705 + }, + { + "epoch": 0.08151346464934818, + "grad_norm": 1.8002519607543945, + "learning_rate": 4.918486265861929e-05, + "loss": 4.8812, + "step": 13706 + }, + { + "epoch": 0.08151941193262917, + "grad_norm": 1.5939546823501587, + "learning_rate": 4.918474435034486e-05, + "loss": 5.0571, + "step": 13707 + }, + { + "epoch": 0.08152535921591017, + "grad_norm": 1.6342964172363281, + "learning_rate": 4.918462603362778e-05, + "loss": 5.087, + "step": 13708 + }, + { + "epoch": 0.08153130649919117, + "grad_norm": 1.549822449684143, + "learning_rate": 4.91845077084681e-05, + "loss": 5.1654, + "step": 13709 + }, + { + "epoch": 0.08153725378247216, + "grad_norm": 1.5732479095458984, + "learning_rate": 4.9184389374865855e-05, + "loss": 4.9085, + "step": 13710 + }, + { + "epoch": 0.08154320106575316, + "grad_norm": 1.4182745218276978, + "learning_rate": 4.9184271032821094e-05, + "loss": 4.8846, + "step": 13711 + }, + { + "epoch": 0.08154914834903416, + "grad_norm": 1.3679918050765991, + "learning_rate": 4.918415268233385e-05, + "loss": 5.0263, + "step": 13712 + }, + { + "epoch": 0.08155509563231515, + "grad_norm": 1.4714219570159912, + "learning_rate": 4.918403432340418e-05, + "loss": 5.5169, + "step": 13713 + }, + { + "epoch": 0.08156104291559615, + "grad_norm": 1.8351292610168457, + "learning_rate": 4.91839159560321e-05, + "loss": 5.215, + "step": 13714 + }, + { + "epoch": 0.08156699019887716, + "grad_norm": 1.530781865119934, + "learning_rate": 4.918379758021767e-05, + "loss": 5.0882, + "step": 13715 + }, + { + "epoch": 0.08157293748215814, + "grad_norm": 1.799901008605957, + "learning_rate": 4.918367919596093e-05, + "loss": 5.2248, + "step": 13716 + }, + { + "epoch": 0.08157888476543915, + "grad_norm": 1.7563488483428955, + "learning_rate": 4.9183560803261915e-05, + "loss": 5.3192, + "step": 13717 + }, + { + "epoch": 0.08158483204872015, + "grad_norm": 1.7521497011184692, + "learning_rate": 4.918344240212066e-05, + "loss": 5.4841, + "step": 13718 + }, + { + "epoch": 0.08159077933200114, + "grad_norm": 1.7345610857009888, + "learning_rate": 4.918332399253722e-05, + "loss": 5.0716, + "step": 13719 + }, + { + "epoch": 0.08159672661528214, + "grad_norm": 1.4790915250778198, + "learning_rate": 4.918320557451164e-05, + "loss": 5.1833, + "step": 13720 + }, + { + "epoch": 0.08160267389856314, + "grad_norm": 1.4721198081970215, + "learning_rate": 4.918308714804395e-05, + "loss": 5.1355, + "step": 13721 + }, + { + "epoch": 0.08160862118184413, + "grad_norm": 1.4949108362197876, + "learning_rate": 4.918296871313419e-05, + "loss": 4.9666, + "step": 13722 + }, + { + "epoch": 0.08161456846512513, + "grad_norm": 1.3814501762390137, + "learning_rate": 4.91828502697824e-05, + "loss": 5.0575, + "step": 13723 + }, + { + "epoch": 0.08162051574840613, + "grad_norm": 1.4503964185714722, + "learning_rate": 4.918273181798864e-05, + "loss": 5.4112, + "step": 13724 + }, + { + "epoch": 0.08162646303168712, + "grad_norm": 1.5512415170669556, + "learning_rate": 4.9182613357752925e-05, + "loss": 5.1501, + "step": 13725 + }, + { + "epoch": 0.08163241031496812, + "grad_norm": 1.7429851293563843, + "learning_rate": 4.9182494889075315e-05, + "loss": 5.2736, + "step": 13726 + }, + { + "epoch": 0.08163835759824913, + "grad_norm": 1.325498104095459, + "learning_rate": 4.918237641195584e-05, + "loss": 5.3702, + "step": 13727 + }, + { + "epoch": 0.08164430488153011, + "grad_norm": 1.2677874565124512, + "learning_rate": 4.918225792639456e-05, + "loss": 5.2681, + "step": 13728 + }, + { + "epoch": 0.08165025216481112, + "grad_norm": 1.4957364797592163, + "learning_rate": 4.918213943239149e-05, + "loss": 5.4956, + "step": 13729 + }, + { + "epoch": 0.08165619944809212, + "grad_norm": 1.3380833864212036, + "learning_rate": 4.91820209299467e-05, + "loss": 5.3286, + "step": 13730 + }, + { + "epoch": 0.0816621467313731, + "grad_norm": 1.6803557872772217, + "learning_rate": 4.918190241906021e-05, + "loss": 5.3119, + "step": 13731 + }, + { + "epoch": 0.08166809401465411, + "grad_norm": 1.7933920621871948, + "learning_rate": 4.918178389973206e-05, + "loss": 5.139, + "step": 13732 + }, + { + "epoch": 0.08167404129793511, + "grad_norm": 1.5846813917160034, + "learning_rate": 4.91816653719623e-05, + "loss": 5.4431, + "step": 13733 + }, + { + "epoch": 0.0816799885812161, + "grad_norm": 1.9218448400497437, + "learning_rate": 4.918154683575098e-05, + "loss": 5.3245, + "step": 13734 + }, + { + "epoch": 0.0816859358644971, + "grad_norm": 1.4883100986480713, + "learning_rate": 4.918142829109813e-05, + "loss": 5.3007, + "step": 13735 + }, + { + "epoch": 0.08169188314777809, + "grad_norm": 1.4396723508834839, + "learning_rate": 4.918130973800379e-05, + "loss": 5.1956, + "step": 13736 + }, + { + "epoch": 0.08169783043105909, + "grad_norm": 1.4395633935928345, + "learning_rate": 4.918119117646801e-05, + "loss": 5.1637, + "step": 13737 + }, + { + "epoch": 0.08170377771434009, + "grad_norm": 1.540003776550293, + "learning_rate": 4.9181072606490816e-05, + "loss": 5.2278, + "step": 13738 + }, + { + "epoch": 0.08170972499762108, + "grad_norm": 1.446815848350525, + "learning_rate": 4.918095402807227e-05, + "loss": 5.1627, + "step": 13739 + }, + { + "epoch": 0.08171567228090208, + "grad_norm": 1.4501028060913086, + "learning_rate": 4.918083544121239e-05, + "loss": 5.0747, + "step": 13740 + }, + { + "epoch": 0.08172161956418308, + "grad_norm": 1.217608094215393, + "learning_rate": 4.9180716845911244e-05, + "loss": 5.0668, + "step": 13741 + }, + { + "epoch": 0.08172756684746407, + "grad_norm": 1.6321865320205688, + "learning_rate": 4.918059824216885e-05, + "loss": 5.2785, + "step": 13742 + }, + { + "epoch": 0.08173351413074507, + "grad_norm": 1.5838396549224854, + "learning_rate": 4.9180479629985265e-05, + "loss": 5.1675, + "step": 13743 + }, + { + "epoch": 0.08173946141402608, + "grad_norm": 1.7023003101348877, + "learning_rate": 4.918036100936052e-05, + "loss": 5.1664, + "step": 13744 + }, + { + "epoch": 0.08174540869730706, + "grad_norm": 1.767067790031433, + "learning_rate": 4.918024238029466e-05, + "loss": 5.0157, + "step": 13745 + }, + { + "epoch": 0.08175135598058807, + "grad_norm": 1.6058627367019653, + "learning_rate": 4.918012374278773e-05, + "loss": 5.1772, + "step": 13746 + }, + { + "epoch": 0.08175730326386907, + "grad_norm": 1.7853416204452515, + "learning_rate": 4.9180005096839766e-05, + "loss": 5.2678, + "step": 13747 + }, + { + "epoch": 0.08176325054715006, + "grad_norm": 1.4799201488494873, + "learning_rate": 4.917988644245082e-05, + "loss": 5.3153, + "step": 13748 + }, + { + "epoch": 0.08176919783043106, + "grad_norm": 1.4581291675567627, + "learning_rate": 4.917976777962092e-05, + "loss": 5.2755, + "step": 13749 + }, + { + "epoch": 0.08177514511371206, + "grad_norm": 1.7151737213134766, + "learning_rate": 4.917964910835011e-05, + "loss": 5.1761, + "step": 13750 + }, + { + "epoch": 0.08178109239699305, + "grad_norm": 1.5101522207260132, + "learning_rate": 4.917953042863843e-05, + "loss": 5.0003, + "step": 13751 + }, + { + "epoch": 0.08178703968027405, + "grad_norm": 1.4508110284805298, + "learning_rate": 4.9179411740485935e-05, + "loss": 5.1158, + "step": 13752 + }, + { + "epoch": 0.08179298696355505, + "grad_norm": 1.5012980699539185, + "learning_rate": 4.917929304389266e-05, + "loss": 5.2762, + "step": 13753 + }, + { + "epoch": 0.08179893424683604, + "grad_norm": 1.5914186239242554, + "learning_rate": 4.9179174338858635e-05, + "loss": 5.1422, + "step": 13754 + }, + { + "epoch": 0.08180488153011704, + "grad_norm": 1.5001139640808105, + "learning_rate": 4.9179055625383915e-05, + "loss": 5.2158, + "step": 13755 + }, + { + "epoch": 0.08181082881339805, + "grad_norm": 1.382815957069397, + "learning_rate": 4.917893690346853e-05, + "loss": 5.2562, + "step": 13756 + }, + { + "epoch": 0.08181677609667903, + "grad_norm": 1.3576865196228027, + "learning_rate": 4.9178818173112535e-05, + "loss": 5.221, + "step": 13757 + }, + { + "epoch": 0.08182272337996004, + "grad_norm": 1.5542206764221191, + "learning_rate": 4.917869943431596e-05, + "loss": 5.071, + "step": 13758 + }, + { + "epoch": 0.08182867066324104, + "grad_norm": 1.6010403633117676, + "learning_rate": 4.9178580687078855e-05, + "loss": 5.2052, + "step": 13759 + }, + { + "epoch": 0.08183461794652203, + "grad_norm": 1.3808842897415161, + "learning_rate": 4.9178461931401254e-05, + "loss": 5.3007, + "step": 13760 + }, + { + "epoch": 0.08184056522980303, + "grad_norm": 1.3584518432617188, + "learning_rate": 4.91783431672832e-05, + "loss": 5.3137, + "step": 13761 + }, + { + "epoch": 0.08184651251308403, + "grad_norm": 1.4467449188232422, + "learning_rate": 4.917822439472474e-05, + "loss": 5.2208, + "step": 13762 + }, + { + "epoch": 0.08185245979636502, + "grad_norm": 1.298618197441101, + "learning_rate": 4.917810561372591e-05, + "loss": 5.2161, + "step": 13763 + }, + { + "epoch": 0.08185840707964602, + "grad_norm": 2.5304789543151855, + "learning_rate": 4.9177986824286756e-05, + "loss": 4.6644, + "step": 13764 + }, + { + "epoch": 0.08186435436292701, + "grad_norm": 1.607969880104065, + "learning_rate": 4.917786802640732e-05, + "loss": 5.2116, + "step": 13765 + }, + { + "epoch": 0.08187030164620801, + "grad_norm": 1.401207685470581, + "learning_rate": 4.917774922008763e-05, + "loss": 5.2847, + "step": 13766 + }, + { + "epoch": 0.08187624892948901, + "grad_norm": 1.1652514934539795, + "learning_rate": 4.9177630405327746e-05, + "loss": 5.2939, + "step": 13767 + }, + { + "epoch": 0.08188219621277, + "grad_norm": 1.2998749017715454, + "learning_rate": 4.9177511582127694e-05, + "loss": 5.251, + "step": 13768 + }, + { + "epoch": 0.081888143496051, + "grad_norm": 1.33558988571167, + "learning_rate": 4.917739275048753e-05, + "loss": 5.2749, + "step": 13769 + }, + { + "epoch": 0.081894090779332, + "grad_norm": 1.1457966566085815, + "learning_rate": 4.917727391040728e-05, + "loss": 5.3153, + "step": 13770 + }, + { + "epoch": 0.08190003806261299, + "grad_norm": 1.493249773979187, + "learning_rate": 4.917715506188699e-05, + "loss": 5.3702, + "step": 13771 + }, + { + "epoch": 0.081905985345894, + "grad_norm": 1.2591760158538818, + "learning_rate": 4.917703620492672e-05, + "loss": 5.2019, + "step": 13772 + }, + { + "epoch": 0.081911932629175, + "grad_norm": 1.2480885982513428, + "learning_rate": 4.917691733952648e-05, + "loss": 5.1904, + "step": 13773 + }, + { + "epoch": 0.08191787991245598, + "grad_norm": 1.3278160095214844, + "learning_rate": 4.917679846568634e-05, + "loss": 5.0424, + "step": 13774 + }, + { + "epoch": 0.08192382719573699, + "grad_norm": 1.2930511236190796, + "learning_rate": 4.9176679583406325e-05, + "loss": 5.2437, + "step": 13775 + }, + { + "epoch": 0.08192977447901799, + "grad_norm": 1.39852774143219, + "learning_rate": 4.9176560692686485e-05, + "loss": 5.3683, + "step": 13776 + }, + { + "epoch": 0.08193572176229898, + "grad_norm": 1.3392889499664307, + "learning_rate": 4.917644179352685e-05, + "loss": 5.1894, + "step": 13777 + }, + { + "epoch": 0.08194166904557998, + "grad_norm": 1.318595051765442, + "learning_rate": 4.917632288592747e-05, + "loss": 5.382, + "step": 13778 + }, + { + "epoch": 0.08194761632886098, + "grad_norm": 1.0992580652236938, + "learning_rate": 4.9176203969888395e-05, + "loss": 5.1979, + "step": 13779 + }, + { + "epoch": 0.08195356361214197, + "grad_norm": 1.2092480659484863, + "learning_rate": 4.917608504540965e-05, + "loss": 5.2253, + "step": 13780 + }, + { + "epoch": 0.08195951089542297, + "grad_norm": 1.2495516538619995, + "learning_rate": 4.9175966112491286e-05, + "loss": 5.1951, + "step": 13781 + }, + { + "epoch": 0.08196545817870397, + "grad_norm": 1.642177700996399, + "learning_rate": 4.917584717113334e-05, + "loss": 4.9648, + "step": 13782 + }, + { + "epoch": 0.08197140546198496, + "grad_norm": 1.4849772453308105, + "learning_rate": 4.9175728221335856e-05, + "loss": 4.8231, + "step": 13783 + }, + { + "epoch": 0.08197735274526596, + "grad_norm": 1.1743687391281128, + "learning_rate": 4.917560926309888e-05, + "loss": 4.7685, + "step": 13784 + }, + { + "epoch": 0.08198330002854697, + "grad_norm": 1.2688218355178833, + "learning_rate": 4.9175490296422436e-05, + "loss": 5.3023, + "step": 13785 + }, + { + "epoch": 0.08198924731182795, + "grad_norm": 1.2325210571289062, + "learning_rate": 4.9175371321306584e-05, + "loss": 4.8373, + "step": 13786 + }, + { + "epoch": 0.08199519459510896, + "grad_norm": 1.5414066314697266, + "learning_rate": 4.9175252337751364e-05, + "loss": 5.005, + "step": 13787 + }, + { + "epoch": 0.08200114187838996, + "grad_norm": 2.1581833362579346, + "learning_rate": 4.917513334575681e-05, + "loss": 5.5065, + "step": 13788 + }, + { + "epoch": 0.08200708916167095, + "grad_norm": 2.0199508666992188, + "learning_rate": 4.917501434532297e-05, + "loss": 5.8826, + "step": 13789 + }, + { + "epoch": 0.08201303644495195, + "grad_norm": 1.727602481842041, + "learning_rate": 4.917489533644987e-05, + "loss": 5.6967, + "step": 13790 + }, + { + "epoch": 0.08201898372823295, + "grad_norm": 1.5649336576461792, + "learning_rate": 4.917477631913757e-05, + "loss": 5.783, + "step": 13791 + }, + { + "epoch": 0.08202493101151394, + "grad_norm": 1.7326582670211792, + "learning_rate": 4.9174657293386115e-05, + "loss": 5.6705, + "step": 13792 + }, + { + "epoch": 0.08203087829479494, + "grad_norm": 1.8611500263214111, + "learning_rate": 4.917453825919553e-05, + "loss": 5.4881, + "step": 13793 + }, + { + "epoch": 0.08203682557807593, + "grad_norm": 1.9762206077575684, + "learning_rate": 4.917441921656586e-05, + "loss": 5.4826, + "step": 13794 + }, + { + "epoch": 0.08204277286135693, + "grad_norm": 1.6816489696502686, + "learning_rate": 4.9174300165497154e-05, + "loss": 5.466, + "step": 13795 + }, + { + "epoch": 0.08204872014463793, + "grad_norm": 1.8922536373138428, + "learning_rate": 4.9174181105989445e-05, + "loss": 5.3603, + "step": 13796 + }, + { + "epoch": 0.08205466742791892, + "grad_norm": 2.094996213912964, + "learning_rate": 4.917406203804279e-05, + "loss": 5.8687, + "step": 13797 + }, + { + "epoch": 0.08206061471119992, + "grad_norm": 1.8656450510025024, + "learning_rate": 4.9173942961657215e-05, + "loss": 6.2551, + "step": 13798 + }, + { + "epoch": 0.08206656199448092, + "grad_norm": 1.871787428855896, + "learning_rate": 4.917382387683276e-05, + "loss": 5.6612, + "step": 13799 + }, + { + "epoch": 0.08207250927776191, + "grad_norm": 1.8721636533737183, + "learning_rate": 4.9173704783569475e-05, + "loss": 5.8918, + "step": 13800 + }, + { + "epoch": 0.08207845656104291, + "grad_norm": 2.0554919242858887, + "learning_rate": 4.917358568186741e-05, + "loss": 5.6398, + "step": 13801 + }, + { + "epoch": 0.08208440384432392, + "grad_norm": 1.9311691522598267, + "learning_rate": 4.917346657172658e-05, + "loss": 5.6507, + "step": 13802 + }, + { + "epoch": 0.0820903511276049, + "grad_norm": 1.7426981925964355, + "learning_rate": 4.917334745314705e-05, + "loss": 5.3193, + "step": 13803 + }, + { + "epoch": 0.0820962984108859, + "grad_norm": 1.783890724182129, + "learning_rate": 4.9173228326128856e-05, + "loss": 5.1274, + "step": 13804 + }, + { + "epoch": 0.08210224569416691, + "grad_norm": 1.8739385604858398, + "learning_rate": 4.917310919067203e-05, + "loss": 5.378, + "step": 13805 + }, + { + "epoch": 0.0821081929774479, + "grad_norm": 1.6748543977737427, + "learning_rate": 4.917299004677663e-05, + "loss": 5.4772, + "step": 13806 + }, + { + "epoch": 0.0821141402607289, + "grad_norm": 1.498864769935608, + "learning_rate": 4.917287089444269e-05, + "loss": 5.4485, + "step": 13807 + }, + { + "epoch": 0.0821200875440099, + "grad_norm": 1.6129908561706543, + "learning_rate": 4.917275173367024e-05, + "loss": 5.5245, + "step": 13808 + }, + { + "epoch": 0.08212603482729089, + "grad_norm": 1.4655383825302124, + "learning_rate": 4.917263256445934e-05, + "loss": 5.5513, + "step": 13809 + }, + { + "epoch": 0.08213198211057189, + "grad_norm": 1.765244483947754, + "learning_rate": 4.917251338681003e-05, + "loss": 5.5322, + "step": 13810 + }, + { + "epoch": 0.0821379293938529, + "grad_norm": 2.002889633178711, + "learning_rate": 4.917239420072233e-05, + "loss": 5.1273, + "step": 13811 + }, + { + "epoch": 0.08214387667713388, + "grad_norm": 2.4380993843078613, + "learning_rate": 4.917227500619631e-05, + "loss": 4.8983, + "step": 13812 + }, + { + "epoch": 0.08214982396041488, + "grad_norm": 2.0864169597625732, + "learning_rate": 4.917215580323199e-05, + "loss": 5.077, + "step": 13813 + }, + { + "epoch": 0.08215577124369589, + "grad_norm": 2.2942094802856445, + "learning_rate": 4.917203659182942e-05, + "loss": 5.4359, + "step": 13814 + }, + { + "epoch": 0.08216171852697687, + "grad_norm": 2.067659616470337, + "learning_rate": 4.917191737198865e-05, + "loss": 5.7409, + "step": 13815 + }, + { + "epoch": 0.08216766581025788, + "grad_norm": 2.010085344314575, + "learning_rate": 4.917179814370971e-05, + "loss": 5.2279, + "step": 13816 + }, + { + "epoch": 0.08217361309353888, + "grad_norm": 1.8540743589401245, + "learning_rate": 4.917167890699264e-05, + "loss": 5.6146, + "step": 13817 + }, + { + "epoch": 0.08217956037681987, + "grad_norm": 1.9126391410827637, + "learning_rate": 4.917155966183749e-05, + "loss": 5.7007, + "step": 13818 + }, + { + "epoch": 0.08218550766010087, + "grad_norm": 1.6382626295089722, + "learning_rate": 4.91714404082443e-05, + "loss": 5.3641, + "step": 13819 + }, + { + "epoch": 0.08219145494338187, + "grad_norm": 1.8019288778305054, + "learning_rate": 4.9171321146213105e-05, + "loss": 5.1853, + "step": 13820 + }, + { + "epoch": 0.08219740222666286, + "grad_norm": 1.681685447692871, + "learning_rate": 4.917120187574395e-05, + "loss": 5.4141, + "step": 13821 + }, + { + "epoch": 0.08220334950994386, + "grad_norm": 1.9356689453125, + "learning_rate": 4.9171082596836896e-05, + "loss": 5.5379, + "step": 13822 + }, + { + "epoch": 0.08220929679322485, + "grad_norm": 1.9538071155548096, + "learning_rate": 4.917096330949195e-05, + "loss": 5.5723, + "step": 13823 + }, + { + "epoch": 0.08221524407650585, + "grad_norm": 1.7350852489471436, + "learning_rate": 4.9170844013709175e-05, + "loss": 5.5622, + "step": 13824 + }, + { + "epoch": 0.08222119135978685, + "grad_norm": 1.790276050567627, + "learning_rate": 4.9170724709488606e-05, + "loss": 5.5194, + "step": 13825 + }, + { + "epoch": 0.08222713864306784, + "grad_norm": 2.2997219562530518, + "learning_rate": 4.917060539683028e-05, + "loss": 5.0646, + "step": 13826 + }, + { + "epoch": 0.08223308592634884, + "grad_norm": 1.729131817817688, + "learning_rate": 4.9170486075734254e-05, + "loss": 5.5588, + "step": 13827 + }, + { + "epoch": 0.08223903320962984, + "grad_norm": 1.8754487037658691, + "learning_rate": 4.9170366746200566e-05, + "loss": 5.5435, + "step": 13828 + }, + { + "epoch": 0.08224498049291083, + "grad_norm": 1.8330692052841187, + "learning_rate": 4.9170247408229244e-05, + "loss": 5.598, + "step": 13829 + }, + { + "epoch": 0.08225092777619183, + "grad_norm": 1.8318592309951782, + "learning_rate": 4.917012806182034e-05, + "loss": 5.5165, + "step": 13830 + }, + { + "epoch": 0.08225687505947284, + "grad_norm": 1.6818424463272095, + "learning_rate": 4.9170008706973895e-05, + "loss": 5.3377, + "step": 13831 + }, + { + "epoch": 0.08226282234275382, + "grad_norm": 1.7040458917617798, + "learning_rate": 4.916988934368995e-05, + "loss": 5.4644, + "step": 13832 + }, + { + "epoch": 0.08226876962603483, + "grad_norm": 1.8902777433395386, + "learning_rate": 4.916976997196855e-05, + "loss": 5.4526, + "step": 13833 + }, + { + "epoch": 0.08227471690931583, + "grad_norm": 1.7484904527664185, + "learning_rate": 4.9169650591809724e-05, + "loss": 5.3, + "step": 13834 + }, + { + "epoch": 0.08228066419259682, + "grad_norm": 1.726083517074585, + "learning_rate": 4.916953120321353e-05, + "loss": 5.4451, + "step": 13835 + }, + { + "epoch": 0.08228661147587782, + "grad_norm": 1.791942834854126, + "learning_rate": 4.916941180618e-05, + "loss": 5.444, + "step": 13836 + }, + { + "epoch": 0.08229255875915882, + "grad_norm": 1.9032018184661865, + "learning_rate": 4.916929240070918e-05, + "loss": 5.4411, + "step": 13837 + }, + { + "epoch": 0.08229850604243981, + "grad_norm": 1.6170588731765747, + "learning_rate": 4.91691729868011e-05, + "loss": 5.4293, + "step": 13838 + }, + { + "epoch": 0.08230445332572081, + "grad_norm": 1.3972853422164917, + "learning_rate": 4.9169053564455825e-05, + "loss": 5.2889, + "step": 13839 + }, + { + "epoch": 0.08231040060900181, + "grad_norm": 1.782913088798523, + "learning_rate": 4.916893413367338e-05, + "loss": 5.4092, + "step": 13840 + }, + { + "epoch": 0.0823163478922828, + "grad_norm": 1.83617103099823, + "learning_rate": 4.9168814694453807e-05, + "loss": 5.3997, + "step": 13841 + }, + { + "epoch": 0.0823222951755638, + "grad_norm": 1.92609703540802, + "learning_rate": 4.9168695246797146e-05, + "loss": 5.3469, + "step": 13842 + }, + { + "epoch": 0.0823282424588448, + "grad_norm": 2.20027756690979, + "learning_rate": 4.9168575790703454e-05, + "loss": 5.5999, + "step": 13843 + }, + { + "epoch": 0.0823341897421258, + "grad_norm": 3.096323251724243, + "learning_rate": 4.916845632617275e-05, + "loss": 5.3997, + "step": 13844 + }, + { + "epoch": 0.0823401370254068, + "grad_norm": 2.433900833129883, + "learning_rate": 4.91683368532051e-05, + "loss": 5.4937, + "step": 13845 + }, + { + "epoch": 0.0823460843086878, + "grad_norm": 2.371389389038086, + "learning_rate": 4.9168217371800526e-05, + "loss": 5.966, + "step": 13846 + }, + { + "epoch": 0.08235203159196879, + "grad_norm": 1.5628182888031006, + "learning_rate": 4.9168097881959076e-05, + "loss": 5.5971, + "step": 13847 + }, + { + "epoch": 0.08235797887524979, + "grad_norm": 2.733569622039795, + "learning_rate": 4.91679783836808e-05, + "loss": 5.2696, + "step": 13848 + }, + { + "epoch": 0.08236392615853079, + "grad_norm": 2.117197275161743, + "learning_rate": 4.916785887696572e-05, + "loss": 5.3729, + "step": 13849 + }, + { + "epoch": 0.08236987344181178, + "grad_norm": 2.040476083755493, + "learning_rate": 4.9167739361813905e-05, + "loss": 5.6568, + "step": 13850 + }, + { + "epoch": 0.08237582072509278, + "grad_norm": 2.127465009689331, + "learning_rate": 4.916761983822536e-05, + "loss": 5.9168, + "step": 13851 + }, + { + "epoch": 0.08238176800837377, + "grad_norm": 2.00907301902771, + "learning_rate": 4.916750030620017e-05, + "loss": 5.9104, + "step": 13852 + }, + { + "epoch": 0.08238771529165477, + "grad_norm": 1.721428394317627, + "learning_rate": 4.916738076573835e-05, + "loss": 5.8126, + "step": 13853 + }, + { + "epoch": 0.08239366257493577, + "grad_norm": 1.5760809183120728, + "learning_rate": 4.9167261216839946e-05, + "loss": 6.0134, + "step": 13854 + }, + { + "epoch": 0.08239960985821676, + "grad_norm": 1.648639440536499, + "learning_rate": 4.9167141659505e-05, + "loss": 5.3878, + "step": 13855 + }, + { + "epoch": 0.08240555714149776, + "grad_norm": 1.4113967418670654, + "learning_rate": 4.916702209373355e-05, + "loss": 5.8159, + "step": 13856 + }, + { + "epoch": 0.08241150442477876, + "grad_norm": 1.725477933883667, + "learning_rate": 4.916690251952565e-05, + "loss": 5.7185, + "step": 13857 + }, + { + "epoch": 0.08241745170805975, + "grad_norm": 1.8538665771484375, + "learning_rate": 4.9166782936881326e-05, + "loss": 5.1804, + "step": 13858 + }, + { + "epoch": 0.08242339899134075, + "grad_norm": 1.5203232765197754, + "learning_rate": 4.9166663345800635e-05, + "loss": 5.1486, + "step": 13859 + }, + { + "epoch": 0.08242934627462176, + "grad_norm": 1.8738161325454712, + "learning_rate": 4.916654374628361e-05, + "loss": 5.0062, + "step": 13860 + }, + { + "epoch": 0.08243529355790274, + "grad_norm": 1.689563512802124, + "learning_rate": 4.916642413833029e-05, + "loss": 4.9508, + "step": 13861 + }, + { + "epoch": 0.08244124084118375, + "grad_norm": 1.8749178647994995, + "learning_rate": 4.916630452194073e-05, + "loss": 5.4645, + "step": 13862 + }, + { + "epoch": 0.08244718812446475, + "grad_norm": 2.779536247253418, + "learning_rate": 4.9166184897114956e-05, + "loss": 5.9364, + "step": 13863 + }, + { + "epoch": 0.08245313540774574, + "grad_norm": 2.41239333152771, + "learning_rate": 4.9166065263853014e-05, + "loss": 5.9045, + "step": 13864 + }, + { + "epoch": 0.08245908269102674, + "grad_norm": 1.624475359916687, + "learning_rate": 4.916594562215495e-05, + "loss": 5.4222, + "step": 13865 + }, + { + "epoch": 0.08246502997430774, + "grad_norm": 1.6841174364089966, + "learning_rate": 4.916582597202081e-05, + "loss": 5.3455, + "step": 13866 + }, + { + "epoch": 0.08247097725758873, + "grad_norm": 1.6790028810501099, + "learning_rate": 4.916570631345062e-05, + "loss": 5.5397, + "step": 13867 + }, + { + "epoch": 0.08247692454086973, + "grad_norm": 1.87303626537323, + "learning_rate": 4.9165586646444436e-05, + "loss": 5.6022, + "step": 13868 + }, + { + "epoch": 0.08248287182415073, + "grad_norm": 1.7747167348861694, + "learning_rate": 4.91654669710023e-05, + "loss": 5.4631, + "step": 13869 + }, + { + "epoch": 0.08248881910743172, + "grad_norm": 1.694941759109497, + "learning_rate": 4.9165347287124244e-05, + "loss": 5.5634, + "step": 13870 + }, + { + "epoch": 0.08249476639071272, + "grad_norm": 1.8258243799209595, + "learning_rate": 4.9165227594810316e-05, + "loss": 5.526, + "step": 13871 + }, + { + "epoch": 0.08250071367399373, + "grad_norm": 1.708798885345459, + "learning_rate": 4.9165107894060556e-05, + "loss": 5.5127, + "step": 13872 + }, + { + "epoch": 0.08250666095727471, + "grad_norm": 1.7820818424224854, + "learning_rate": 4.916498818487501e-05, + "loss": 5.4169, + "step": 13873 + }, + { + "epoch": 0.08251260824055572, + "grad_norm": 2.38067626953125, + "learning_rate": 4.916486846725372e-05, + "loss": 5.8063, + "step": 13874 + }, + { + "epoch": 0.08251855552383672, + "grad_norm": 1.8507468700408936, + "learning_rate": 4.916474874119671e-05, + "loss": 5.4871, + "step": 13875 + }, + { + "epoch": 0.0825245028071177, + "grad_norm": 1.8866678476333618, + "learning_rate": 4.916462900670404e-05, + "loss": 5.5452, + "step": 13876 + }, + { + "epoch": 0.08253045009039871, + "grad_norm": 1.853668212890625, + "learning_rate": 4.916450926377576e-05, + "loss": 5.8262, + "step": 13877 + }, + { + "epoch": 0.08253639737367971, + "grad_norm": 1.7404545545578003, + "learning_rate": 4.916438951241189e-05, + "loss": 5.5978, + "step": 13878 + }, + { + "epoch": 0.0825423446569607, + "grad_norm": 1.844139814376831, + "learning_rate": 4.916426975261248e-05, + "loss": 5.765, + "step": 13879 + }, + { + "epoch": 0.0825482919402417, + "grad_norm": 1.9454487562179565, + "learning_rate": 4.916414998437758e-05, + "loss": 5.5458, + "step": 13880 + }, + { + "epoch": 0.08255423922352269, + "grad_norm": 1.317144751548767, + "learning_rate": 4.916403020770722e-05, + "loss": 5.7694, + "step": 13881 + }, + { + "epoch": 0.08256018650680369, + "grad_norm": 1.718024730682373, + "learning_rate": 4.916391042260145e-05, + "loss": 5.7369, + "step": 13882 + }, + { + "epoch": 0.08256613379008469, + "grad_norm": 1.4623572826385498, + "learning_rate": 4.9163790629060305e-05, + "loss": 5.72, + "step": 13883 + }, + { + "epoch": 0.08257208107336568, + "grad_norm": 1.908839225769043, + "learning_rate": 4.916367082708383e-05, + "loss": 5.7175, + "step": 13884 + }, + { + "epoch": 0.08257802835664668, + "grad_norm": 1.7910356521606445, + "learning_rate": 4.916355101667206e-05, + "loss": 5.4446, + "step": 13885 + }, + { + "epoch": 0.08258397563992768, + "grad_norm": 2.132512092590332, + "learning_rate": 4.9163431197825055e-05, + "loss": 5.2315, + "step": 13886 + }, + { + "epoch": 0.08258992292320867, + "grad_norm": 2.223329782485962, + "learning_rate": 4.9163311370542844e-05, + "loss": 5.2953, + "step": 13887 + }, + { + "epoch": 0.08259587020648967, + "grad_norm": 2.6441519260406494, + "learning_rate": 4.916319153482547e-05, + "loss": 5.2637, + "step": 13888 + }, + { + "epoch": 0.08260181748977068, + "grad_norm": 2.1528780460357666, + "learning_rate": 4.9163071690672973e-05, + "loss": 5.1602, + "step": 13889 + }, + { + "epoch": 0.08260776477305166, + "grad_norm": 2.6483633518218994, + "learning_rate": 4.91629518380854e-05, + "loss": 5.2487, + "step": 13890 + }, + { + "epoch": 0.08261371205633267, + "grad_norm": 2.276808738708496, + "learning_rate": 4.916283197706279e-05, + "loss": 5.064, + "step": 13891 + }, + { + "epoch": 0.08261965933961367, + "grad_norm": 1.8921101093292236, + "learning_rate": 4.9162712107605184e-05, + "loss": 5.3979, + "step": 13892 + }, + { + "epoch": 0.08262560662289466, + "grad_norm": 2.2009568214416504, + "learning_rate": 4.9162592229712625e-05, + "loss": 5.2434, + "step": 13893 + }, + { + "epoch": 0.08263155390617566, + "grad_norm": 2.199380874633789, + "learning_rate": 4.916247234338516e-05, + "loss": 4.7187, + "step": 13894 + }, + { + "epoch": 0.08263750118945666, + "grad_norm": 2.3620400428771973, + "learning_rate": 4.916235244862282e-05, + "loss": 4.7371, + "step": 13895 + }, + { + "epoch": 0.08264344847273765, + "grad_norm": 2.100086212158203, + "learning_rate": 4.9162232545425646e-05, + "loss": 4.5239, + "step": 13896 + }, + { + "epoch": 0.08264939575601865, + "grad_norm": 2.100106954574585, + "learning_rate": 4.91621126337937e-05, + "loss": 4.5555, + "step": 13897 + }, + { + "epoch": 0.08265534303929965, + "grad_norm": 2.005345344543457, + "learning_rate": 4.9161992713727e-05, + "loss": 4.397, + "step": 13898 + }, + { + "epoch": 0.08266129032258064, + "grad_norm": 1.9393454790115356, + "learning_rate": 4.91618727852256e-05, + "loss": 4.7327, + "step": 13899 + }, + { + "epoch": 0.08266723760586164, + "grad_norm": 2.0109846591949463, + "learning_rate": 4.916175284828955e-05, + "loss": 4.4987, + "step": 13900 + }, + { + "epoch": 0.08267318488914265, + "grad_norm": 2.0040533542633057, + "learning_rate": 4.916163290291886e-05, + "loss": 4.4703, + "step": 13901 + }, + { + "epoch": 0.08267913217242363, + "grad_norm": 2.014885902404785, + "learning_rate": 4.916151294911361e-05, + "loss": 4.374, + "step": 13902 + }, + { + "epoch": 0.08268507945570464, + "grad_norm": 1.9490050077438354, + "learning_rate": 4.916139298687382e-05, + "loss": 4.6281, + "step": 13903 + }, + { + "epoch": 0.08269102673898564, + "grad_norm": 2.0691943168640137, + "learning_rate": 4.916127301619954e-05, + "loss": 4.5008, + "step": 13904 + }, + { + "epoch": 0.08269697402226663, + "grad_norm": 2.1290805339813232, + "learning_rate": 4.916115303709081e-05, + "loss": 5.4876, + "step": 13905 + }, + { + "epoch": 0.08270292130554763, + "grad_norm": 1.981466293334961, + "learning_rate": 4.916103304954767e-05, + "loss": 5.7699, + "step": 13906 + }, + { + "epoch": 0.08270886858882863, + "grad_norm": 1.8898048400878906, + "learning_rate": 4.916091305357016e-05, + "loss": 5.7874, + "step": 13907 + }, + { + "epoch": 0.08271481587210962, + "grad_norm": 1.7809741497039795, + "learning_rate": 4.916079304915833e-05, + "loss": 5.6264, + "step": 13908 + }, + { + "epoch": 0.08272076315539062, + "grad_norm": 1.7516652345657349, + "learning_rate": 4.916067303631221e-05, + "loss": 5.5751, + "step": 13909 + }, + { + "epoch": 0.08272671043867161, + "grad_norm": 1.9051094055175781, + "learning_rate": 4.916055301503185e-05, + "loss": 5.7984, + "step": 13910 + }, + { + "epoch": 0.08273265772195261, + "grad_norm": 1.7115057706832886, + "learning_rate": 4.9160432985317295e-05, + "loss": 5.6187, + "step": 13911 + }, + { + "epoch": 0.08273860500523361, + "grad_norm": 1.790529727935791, + "learning_rate": 4.916031294716858e-05, + "loss": 5.6276, + "step": 13912 + }, + { + "epoch": 0.0827445522885146, + "grad_norm": 1.742039442062378, + "learning_rate": 4.9160192900585754e-05, + "loss": 5.3783, + "step": 13913 + }, + { + "epoch": 0.0827504995717956, + "grad_norm": 1.7544314861297607, + "learning_rate": 4.916007284556885e-05, + "loss": 5.5276, + "step": 13914 + }, + { + "epoch": 0.0827564468550766, + "grad_norm": 2.0135440826416016, + "learning_rate": 4.915995278211791e-05, + "loss": 5.5177, + "step": 13915 + }, + { + "epoch": 0.08276239413835759, + "grad_norm": 1.5759433507919312, + "learning_rate": 4.915983271023299e-05, + "loss": 5.4652, + "step": 13916 + }, + { + "epoch": 0.0827683414216386, + "grad_norm": 1.7974358797073364, + "learning_rate": 4.915971262991411e-05, + "loss": 5.4463, + "step": 13917 + }, + { + "epoch": 0.0827742887049196, + "grad_norm": 1.847692608833313, + "learning_rate": 4.9159592541161335e-05, + "loss": 5.4247, + "step": 13918 + }, + { + "epoch": 0.08278023598820058, + "grad_norm": 1.6701977252960205, + "learning_rate": 4.915947244397469e-05, + "loss": 5.3451, + "step": 13919 + }, + { + "epoch": 0.08278618327148159, + "grad_norm": 1.9226999282836914, + "learning_rate": 4.915935233835423e-05, + "loss": 5.1159, + "step": 13920 + }, + { + "epoch": 0.08279213055476259, + "grad_norm": 2.430760383605957, + "learning_rate": 4.915923222429998e-05, + "loss": 4.9746, + "step": 13921 + }, + { + "epoch": 0.08279807783804358, + "grad_norm": 1.7708054780960083, + "learning_rate": 4.915911210181199e-05, + "loss": 5.4986, + "step": 13922 + }, + { + "epoch": 0.08280402512132458, + "grad_norm": 1.7802354097366333, + "learning_rate": 4.915899197089031e-05, + "loss": 5.4283, + "step": 13923 + }, + { + "epoch": 0.08280997240460558, + "grad_norm": 2.347226142883301, + "learning_rate": 4.9158871831534984e-05, + "loss": 5.2917, + "step": 13924 + }, + { + "epoch": 0.08281591968788657, + "grad_norm": 2.5685782432556152, + "learning_rate": 4.915875168374603e-05, + "loss": 5.243, + "step": 13925 + }, + { + "epoch": 0.08282186697116757, + "grad_norm": 2.460383176803589, + "learning_rate": 4.915863152752351e-05, + "loss": 4.9241, + "step": 13926 + }, + { + "epoch": 0.08282781425444857, + "grad_norm": 2.2505056858062744, + "learning_rate": 4.915851136286747e-05, + "loss": 5.0951, + "step": 13927 + }, + { + "epoch": 0.08283376153772956, + "grad_norm": 2.517544984817505, + "learning_rate": 4.915839118977793e-05, + "loss": 5.151, + "step": 13928 + }, + { + "epoch": 0.08283970882101056, + "grad_norm": 2.445645809173584, + "learning_rate": 4.915827100825495e-05, + "loss": 5.1831, + "step": 13929 + }, + { + "epoch": 0.08284565610429157, + "grad_norm": 2.347383737564087, + "learning_rate": 4.9158150818298564e-05, + "loss": 5.0299, + "step": 13930 + }, + { + "epoch": 0.08285160338757255, + "grad_norm": 2.1791892051696777, + "learning_rate": 4.915803061990882e-05, + "loss": 5.4083, + "step": 13931 + }, + { + "epoch": 0.08285755067085356, + "grad_norm": 1.9959020614624023, + "learning_rate": 4.9157910413085764e-05, + "loss": 5.9036, + "step": 13932 + }, + { + "epoch": 0.08286349795413456, + "grad_norm": 2.3419620990753174, + "learning_rate": 4.915779019782942e-05, + "loss": 4.9082, + "step": 13933 + }, + { + "epoch": 0.08286944523741555, + "grad_norm": 2.452756643295288, + "learning_rate": 4.915766997413985e-05, + "loss": 4.8272, + "step": 13934 + }, + { + "epoch": 0.08287539252069655, + "grad_norm": 2.344353675842285, + "learning_rate": 4.915754974201708e-05, + "loss": 5.0269, + "step": 13935 + }, + { + "epoch": 0.08288133980397755, + "grad_norm": 2.366218090057373, + "learning_rate": 4.9157429501461175e-05, + "loss": 4.8898, + "step": 13936 + }, + { + "epoch": 0.08288728708725854, + "grad_norm": 1.7986581325531006, + "learning_rate": 4.915730925247214e-05, + "loss": 4.9316, + "step": 13937 + }, + { + "epoch": 0.08289323437053954, + "grad_norm": 2.059094190597534, + "learning_rate": 4.915718899505005e-05, + "loss": 5.1297, + "step": 13938 + }, + { + "epoch": 0.08289918165382054, + "grad_norm": 1.9630707502365112, + "learning_rate": 4.915706872919493e-05, + "loss": 5.4844, + "step": 13939 + }, + { + "epoch": 0.08290512893710153, + "grad_norm": 2.0281238555908203, + "learning_rate": 4.9156948454906825e-05, + "loss": 5.9276, + "step": 13940 + }, + { + "epoch": 0.08291107622038253, + "grad_norm": 1.8783270120620728, + "learning_rate": 4.9156828172185786e-05, + "loss": 5.7085, + "step": 13941 + }, + { + "epoch": 0.08291702350366352, + "grad_norm": 2.190317153930664, + "learning_rate": 4.915670788103184e-05, + "loss": 4.9619, + "step": 13942 + }, + { + "epoch": 0.08292297078694452, + "grad_norm": 2.2746498584747314, + "learning_rate": 4.915658758144505e-05, + "loss": 4.8965, + "step": 13943 + }, + { + "epoch": 0.08292891807022552, + "grad_norm": 1.940510630607605, + "learning_rate": 4.915646727342543e-05, + "loss": 5.0367, + "step": 13944 + }, + { + "epoch": 0.08293486535350651, + "grad_norm": 1.9016308784484863, + "learning_rate": 4.915634695697304e-05, + "loss": 5.5002, + "step": 13945 + }, + { + "epoch": 0.08294081263678751, + "grad_norm": 2.0041022300720215, + "learning_rate": 4.915622663208792e-05, + "loss": 5.4193, + "step": 13946 + }, + { + "epoch": 0.08294675992006852, + "grad_norm": 2.0117805004119873, + "learning_rate": 4.9156106298770115e-05, + "loss": 5.2697, + "step": 13947 + }, + { + "epoch": 0.0829527072033495, + "grad_norm": 1.864820957183838, + "learning_rate": 4.9155985957019654e-05, + "loss": 5.1594, + "step": 13948 + }, + { + "epoch": 0.0829586544866305, + "grad_norm": 1.7407771348953247, + "learning_rate": 4.91558656068366e-05, + "loss": 5.1189, + "step": 13949 + }, + { + "epoch": 0.08296460176991151, + "grad_norm": 2.027552366256714, + "learning_rate": 4.9155745248220976e-05, + "loss": 5.6257, + "step": 13950 + }, + { + "epoch": 0.0829705490531925, + "grad_norm": 1.6893701553344727, + "learning_rate": 4.9155624881172834e-05, + "loss": 5.1268, + "step": 13951 + }, + { + "epoch": 0.0829764963364735, + "grad_norm": 1.7216230630874634, + "learning_rate": 4.915550450569221e-05, + "loss": 5.2768, + "step": 13952 + }, + { + "epoch": 0.0829824436197545, + "grad_norm": 1.6723179817199707, + "learning_rate": 4.915538412177915e-05, + "loss": 5.7059, + "step": 13953 + }, + { + "epoch": 0.08298839090303549, + "grad_norm": 1.7645996809005737, + "learning_rate": 4.915526372943369e-05, + "loss": 5.6065, + "step": 13954 + }, + { + "epoch": 0.08299433818631649, + "grad_norm": 1.9206926822662354, + "learning_rate": 4.915514332865588e-05, + "loss": 4.9229, + "step": 13955 + }, + { + "epoch": 0.08300028546959749, + "grad_norm": 1.9269802570343018, + "learning_rate": 4.9155022919445766e-05, + "loss": 5.5678, + "step": 13956 + }, + { + "epoch": 0.08300623275287848, + "grad_norm": 2.378319501876831, + "learning_rate": 4.915490250180338e-05, + "loss": 4.7271, + "step": 13957 + }, + { + "epoch": 0.08301218003615948, + "grad_norm": 1.73631751537323, + "learning_rate": 4.915478207572876e-05, + "loss": 5.1302, + "step": 13958 + }, + { + "epoch": 0.08301812731944049, + "grad_norm": 1.6520816087722778, + "learning_rate": 4.915466164122196e-05, + "loss": 6.0497, + "step": 13959 + }, + { + "epoch": 0.08302407460272147, + "grad_norm": 1.7382736206054688, + "learning_rate": 4.915454119828302e-05, + "loss": 6.0155, + "step": 13960 + }, + { + "epoch": 0.08303002188600248, + "grad_norm": 1.6733272075653076, + "learning_rate": 4.915442074691197e-05, + "loss": 5.2624, + "step": 13961 + }, + { + "epoch": 0.08303596916928348, + "grad_norm": 2.0024397373199463, + "learning_rate": 4.915430028710887e-05, + "loss": 5.4794, + "step": 13962 + }, + { + "epoch": 0.08304191645256447, + "grad_norm": 1.9784339666366577, + "learning_rate": 4.915417981887375e-05, + "loss": 5.1546, + "step": 13963 + }, + { + "epoch": 0.08304786373584547, + "grad_norm": 1.7146525382995605, + "learning_rate": 4.915405934220666e-05, + "loss": 5.6269, + "step": 13964 + }, + { + "epoch": 0.08305381101912647, + "grad_norm": 1.7252057790756226, + "learning_rate": 4.9153938857107626e-05, + "loss": 5.7015, + "step": 13965 + }, + { + "epoch": 0.08305975830240746, + "grad_norm": 1.6623241901397705, + "learning_rate": 4.9153818363576715e-05, + "loss": 5.5249, + "step": 13966 + }, + { + "epoch": 0.08306570558568846, + "grad_norm": 2.0701472759246826, + "learning_rate": 4.9153697861613944e-05, + "loss": 5.3528, + "step": 13967 + }, + { + "epoch": 0.08307165286896946, + "grad_norm": 1.6600522994995117, + "learning_rate": 4.915357735121938e-05, + "loss": 5.3454, + "step": 13968 + }, + { + "epoch": 0.08307760015225045, + "grad_norm": 2.093092918395996, + "learning_rate": 4.915345683239304e-05, + "loss": 5.2417, + "step": 13969 + }, + { + "epoch": 0.08308354743553145, + "grad_norm": 1.9673899412155151, + "learning_rate": 4.915333630513498e-05, + "loss": 5.1908, + "step": 13970 + }, + { + "epoch": 0.08308949471881244, + "grad_norm": 1.8442246913909912, + "learning_rate": 4.915321576944524e-05, + "loss": 5.6287, + "step": 13971 + }, + { + "epoch": 0.08309544200209344, + "grad_norm": 1.5737566947937012, + "learning_rate": 4.9153095225323864e-05, + "loss": 5.7533, + "step": 13972 + }, + { + "epoch": 0.08310138928537444, + "grad_norm": 1.7948611974716187, + "learning_rate": 4.915297467277089e-05, + "loss": 5.5739, + "step": 13973 + }, + { + "epoch": 0.08310733656865543, + "grad_norm": 2.0080626010894775, + "learning_rate": 4.915285411178637e-05, + "loss": 5.5505, + "step": 13974 + }, + { + "epoch": 0.08311328385193643, + "grad_norm": 1.7838460206985474, + "learning_rate": 4.915273354237033e-05, + "loss": 6.0133, + "step": 13975 + }, + { + "epoch": 0.08311923113521744, + "grad_norm": 1.7599917650222778, + "learning_rate": 4.915261296452282e-05, + "loss": 5.6552, + "step": 13976 + }, + { + "epoch": 0.08312517841849842, + "grad_norm": 1.6211295127868652, + "learning_rate": 4.915249237824388e-05, + "loss": 5.6797, + "step": 13977 + }, + { + "epoch": 0.08313112570177943, + "grad_norm": 1.7404415607452393, + "learning_rate": 4.9152371783533565e-05, + "loss": 5.5134, + "step": 13978 + }, + { + "epoch": 0.08313707298506043, + "grad_norm": 1.8577871322631836, + "learning_rate": 4.9152251180391895e-05, + "loss": 5.5823, + "step": 13979 + }, + { + "epoch": 0.08314302026834142, + "grad_norm": 1.6060470342636108, + "learning_rate": 4.915213056881893e-05, + "loss": 5.5875, + "step": 13980 + }, + { + "epoch": 0.08314896755162242, + "grad_norm": 1.915451169013977, + "learning_rate": 4.91520099488147e-05, + "loss": 5.279, + "step": 13981 + }, + { + "epoch": 0.08315491483490342, + "grad_norm": 2.281404972076416, + "learning_rate": 4.9151889320379265e-05, + "loss": 5.0863, + "step": 13982 + }, + { + "epoch": 0.08316086211818441, + "grad_norm": 1.9069279432296753, + "learning_rate": 4.9151768683512646e-05, + "loss": 5.3055, + "step": 13983 + }, + { + "epoch": 0.08316680940146541, + "grad_norm": 1.810571312904358, + "learning_rate": 4.915164803821489e-05, + "loss": 5.4988, + "step": 13984 + }, + { + "epoch": 0.08317275668474641, + "grad_norm": 1.788197636604309, + "learning_rate": 4.915152738448605e-05, + "loss": 5.6627, + "step": 13985 + }, + { + "epoch": 0.0831787039680274, + "grad_norm": 2.294187545776367, + "learning_rate": 4.9151406722326165e-05, + "loss": 5.1977, + "step": 13986 + }, + { + "epoch": 0.0831846512513084, + "grad_norm": 2.584395170211792, + "learning_rate": 4.915128605173527e-05, + "loss": 5.1909, + "step": 13987 + }, + { + "epoch": 0.0831905985345894, + "grad_norm": 2.249406576156616, + "learning_rate": 4.9151165372713405e-05, + "loss": 5.1109, + "step": 13988 + }, + { + "epoch": 0.0831965458178704, + "grad_norm": 1.8678929805755615, + "learning_rate": 4.915104468526062e-05, + "loss": 5.1035, + "step": 13989 + }, + { + "epoch": 0.0832024931011514, + "grad_norm": 2.139711856842041, + "learning_rate": 4.915092398937696e-05, + "loss": 5.0151, + "step": 13990 + }, + { + "epoch": 0.0832084403844324, + "grad_norm": 2.1683461666107178, + "learning_rate": 4.915080328506246e-05, + "loss": 5.1097, + "step": 13991 + }, + { + "epoch": 0.08321438766771339, + "grad_norm": 2.1205332279205322, + "learning_rate": 4.9150682572317165e-05, + "loss": 4.9998, + "step": 13992 + }, + { + "epoch": 0.08322033495099439, + "grad_norm": 1.8642542362213135, + "learning_rate": 4.915056185114111e-05, + "loss": 5.8554, + "step": 13993 + }, + { + "epoch": 0.08322628223427539, + "grad_norm": 2.1150970458984375, + "learning_rate": 4.915044112153435e-05, + "loss": 5.5297, + "step": 13994 + }, + { + "epoch": 0.08323222951755638, + "grad_norm": 2.584157943725586, + "learning_rate": 4.9150320383496915e-05, + "loss": 5.0058, + "step": 13995 + }, + { + "epoch": 0.08323817680083738, + "grad_norm": 2.305853843688965, + "learning_rate": 4.9150199637028854e-05, + "loss": 5.0785, + "step": 13996 + }, + { + "epoch": 0.08324412408411838, + "grad_norm": 2.0386359691619873, + "learning_rate": 4.9150078882130214e-05, + "loss": 5.1104, + "step": 13997 + }, + { + "epoch": 0.08325007136739937, + "grad_norm": 1.6055399179458618, + "learning_rate": 4.914995811880102e-05, + "loss": 5.778, + "step": 13998 + }, + { + "epoch": 0.08325601865068037, + "grad_norm": 1.635704517364502, + "learning_rate": 4.9149837347041334e-05, + "loss": 6.1107, + "step": 13999 + }, + { + "epoch": 0.08326196593396136, + "grad_norm": 1.8098101615905762, + "learning_rate": 4.9149716566851184e-05, + "loss": 6.1197, + "step": 14000 + }, + { + "epoch": 0.08326791321724236, + "grad_norm": 1.5740363597869873, + "learning_rate": 4.914959577823062e-05, + "loss": 5.7821, + "step": 14001 + }, + { + "epoch": 0.08327386050052336, + "grad_norm": 1.4634822607040405, + "learning_rate": 4.914947498117968e-05, + "loss": 5.7062, + "step": 14002 + }, + { + "epoch": 0.08327980778380435, + "grad_norm": 1.7310374975204468, + "learning_rate": 4.914935417569841e-05, + "loss": 5.6689, + "step": 14003 + }, + { + "epoch": 0.08328575506708535, + "grad_norm": 1.5742056369781494, + "learning_rate": 4.914923336178685e-05, + "loss": 5.6529, + "step": 14004 + }, + { + "epoch": 0.08329170235036636, + "grad_norm": 1.6353307962417603, + "learning_rate": 4.914911253944504e-05, + "loss": 5.4564, + "step": 14005 + }, + { + "epoch": 0.08329764963364734, + "grad_norm": 1.8744231462478638, + "learning_rate": 4.9148991708673024e-05, + "loss": 5.305, + "step": 14006 + }, + { + "epoch": 0.08330359691692835, + "grad_norm": 1.9766863584518433, + "learning_rate": 4.914887086947085e-05, + "loss": 5.711, + "step": 14007 + }, + { + "epoch": 0.08330954420020935, + "grad_norm": 2.1832756996154785, + "learning_rate": 4.914875002183855e-05, + "loss": 4.9322, + "step": 14008 + }, + { + "epoch": 0.08331549148349034, + "grad_norm": 2.2370998859405518, + "learning_rate": 4.914862916577617e-05, + "loss": 4.512, + "step": 14009 + }, + { + "epoch": 0.08332143876677134, + "grad_norm": 2.2743804454803467, + "learning_rate": 4.914850830128376e-05, + "loss": 4.5716, + "step": 14010 + }, + { + "epoch": 0.08332738605005234, + "grad_norm": 2.3644347190856934, + "learning_rate": 4.914838742836134e-05, + "loss": 4.1288, + "step": 14011 + }, + { + "epoch": 0.08333333333333333, + "grad_norm": 3.1034274101257324, + "learning_rate": 4.9148266547008984e-05, + "loss": 5.2864, + "step": 14012 + }, + { + "epoch": 0.08333928061661433, + "grad_norm": 2.240302801132202, + "learning_rate": 4.914814565722671e-05, + "loss": 5.3452, + "step": 14013 + }, + { + "epoch": 0.08334522789989533, + "grad_norm": 2.0743885040283203, + "learning_rate": 4.9148024759014566e-05, + "loss": 5.4338, + "step": 14014 + }, + { + "epoch": 0.08335117518317632, + "grad_norm": 2.0169663429260254, + "learning_rate": 4.91479038523726e-05, + "loss": 5.5108, + "step": 14015 + }, + { + "epoch": 0.08335712246645732, + "grad_norm": 1.9730015993118286, + "learning_rate": 4.914778293730085e-05, + "loss": 5.6413, + "step": 14016 + }, + { + "epoch": 0.08336306974973832, + "grad_norm": 2.3047432899475098, + "learning_rate": 4.914766201379936e-05, + "loss": 5.4111, + "step": 14017 + }, + { + "epoch": 0.08336901703301931, + "grad_norm": 3.079416275024414, + "learning_rate": 4.914754108186816e-05, + "loss": 5.5591, + "step": 14018 + }, + { + "epoch": 0.08337496431630032, + "grad_norm": 1.9374867677688599, + "learning_rate": 4.9147420141507314e-05, + "loss": 5.9295, + "step": 14019 + }, + { + "epoch": 0.08338091159958132, + "grad_norm": 1.874292016029358, + "learning_rate": 4.9147299192716855e-05, + "loss": 5.6846, + "step": 14020 + }, + { + "epoch": 0.0833868588828623, + "grad_norm": 1.8852506875991821, + "learning_rate": 4.914717823549682e-05, + "loss": 5.621, + "step": 14021 + }, + { + "epoch": 0.08339280616614331, + "grad_norm": 1.9332367181777954, + "learning_rate": 4.914705726984725e-05, + "loss": 5.8584, + "step": 14022 + }, + { + "epoch": 0.08339875344942431, + "grad_norm": 1.6252962350845337, + "learning_rate": 4.91469362957682e-05, + "loss": 5.8173, + "step": 14023 + }, + { + "epoch": 0.0834047007327053, + "grad_norm": 1.6760259866714478, + "learning_rate": 4.9146815313259695e-05, + "loss": 5.5441, + "step": 14024 + }, + { + "epoch": 0.0834106480159863, + "grad_norm": 1.4979921579360962, + "learning_rate": 4.9146694322321785e-05, + "loss": 6.1467, + "step": 14025 + }, + { + "epoch": 0.0834165952992673, + "grad_norm": 1.4720534086227417, + "learning_rate": 4.914657332295453e-05, + "loss": 5.8626, + "step": 14026 + }, + { + "epoch": 0.08342254258254829, + "grad_norm": 1.6709620952606201, + "learning_rate": 4.914645231515794e-05, + "loss": 5.8468, + "step": 14027 + }, + { + "epoch": 0.08342848986582929, + "grad_norm": 1.6389116048812866, + "learning_rate": 4.9146331298932075e-05, + "loss": 5.9222, + "step": 14028 + }, + { + "epoch": 0.08343443714911028, + "grad_norm": 1.4344384670257568, + "learning_rate": 4.9146210274276974e-05, + "loss": 5.5457, + "step": 14029 + }, + { + "epoch": 0.08344038443239128, + "grad_norm": 1.472469449043274, + "learning_rate": 4.914608924119268e-05, + "loss": 5.608, + "step": 14030 + }, + { + "epoch": 0.08344633171567228, + "grad_norm": 1.6688710451126099, + "learning_rate": 4.914596819967925e-05, + "loss": 5.7982, + "step": 14031 + }, + { + "epoch": 0.08345227899895327, + "grad_norm": 1.6417087316513062, + "learning_rate": 4.9145847149736704e-05, + "loss": 5.6498, + "step": 14032 + }, + { + "epoch": 0.08345822628223427, + "grad_norm": 1.5726937055587769, + "learning_rate": 4.9145726091365084e-05, + "loss": 5.8723, + "step": 14033 + }, + { + "epoch": 0.08346417356551528, + "grad_norm": 1.7523616552352905, + "learning_rate": 4.914560502456444e-05, + "loss": 6.1967, + "step": 14034 + }, + { + "epoch": 0.08347012084879626, + "grad_norm": 1.8270281553268433, + "learning_rate": 4.914548394933483e-05, + "loss": 6.0493, + "step": 14035 + }, + { + "epoch": 0.08347606813207727, + "grad_norm": 1.8113981485366821, + "learning_rate": 4.914536286567627e-05, + "loss": 5.2815, + "step": 14036 + }, + { + "epoch": 0.08348201541535827, + "grad_norm": 1.7894388437271118, + "learning_rate": 4.914524177358881e-05, + "loss": 5.2606, + "step": 14037 + }, + { + "epoch": 0.08348796269863926, + "grad_norm": 1.7994349002838135, + "learning_rate": 4.9145120673072505e-05, + "loss": 5.025, + "step": 14038 + }, + { + "epoch": 0.08349390998192026, + "grad_norm": 1.6934137344360352, + "learning_rate": 4.914499956412738e-05, + "loss": 5.0455, + "step": 14039 + }, + { + "epoch": 0.08349985726520126, + "grad_norm": 1.549500823020935, + "learning_rate": 4.914487844675349e-05, + "loss": 5.3836, + "step": 14040 + }, + { + "epoch": 0.08350580454848225, + "grad_norm": 1.7452481985092163, + "learning_rate": 4.9144757320950873e-05, + "loss": 5.0175, + "step": 14041 + }, + { + "epoch": 0.08351175183176325, + "grad_norm": 1.9420257806777954, + "learning_rate": 4.914463618671957e-05, + "loss": 5.0146, + "step": 14042 + }, + { + "epoch": 0.08351769911504425, + "grad_norm": 1.798431158065796, + "learning_rate": 4.914451504405962e-05, + "loss": 4.7656, + "step": 14043 + }, + { + "epoch": 0.08352364639832524, + "grad_norm": 1.7167326211929321, + "learning_rate": 4.914439389297107e-05, + "loss": 4.7518, + "step": 14044 + }, + { + "epoch": 0.08352959368160624, + "grad_norm": 1.7150487899780273, + "learning_rate": 4.914427273345397e-05, + "loss": 4.8298, + "step": 14045 + }, + { + "epoch": 0.08353554096488724, + "grad_norm": 1.7048633098602295, + "learning_rate": 4.914415156550834e-05, + "loss": 5.0039, + "step": 14046 + }, + { + "epoch": 0.08354148824816823, + "grad_norm": 1.364012598991394, + "learning_rate": 4.914403038913425e-05, + "loss": 5.3718, + "step": 14047 + }, + { + "epoch": 0.08354743553144924, + "grad_norm": 2.29878830909729, + "learning_rate": 4.9143909204331716e-05, + "loss": 4.8874, + "step": 14048 + }, + { + "epoch": 0.08355338281473024, + "grad_norm": 2.1153953075408936, + "learning_rate": 4.91437880111008e-05, + "loss": 4.6646, + "step": 14049 + }, + { + "epoch": 0.08355933009801123, + "grad_norm": 2.289346218109131, + "learning_rate": 4.914366680944153e-05, + "loss": 4.7966, + "step": 14050 + }, + { + "epoch": 0.08356527738129223, + "grad_norm": 1.8394019603729248, + "learning_rate": 4.9143545599353965e-05, + "loss": 5.1788, + "step": 14051 + }, + { + "epoch": 0.08357122466457323, + "grad_norm": 2.192802667617798, + "learning_rate": 4.9143424380838136e-05, + "loss": 5.4549, + "step": 14052 + }, + { + "epoch": 0.08357717194785422, + "grad_norm": 2.128356695175171, + "learning_rate": 4.9143303153894085e-05, + "loss": 5.6652, + "step": 14053 + }, + { + "epoch": 0.08358311923113522, + "grad_norm": 2.0716452598571777, + "learning_rate": 4.914318191852186e-05, + "loss": 5.7013, + "step": 14054 + }, + { + "epoch": 0.08358906651441622, + "grad_norm": 2.298940658569336, + "learning_rate": 4.91430606747215e-05, + "loss": 5.565, + "step": 14055 + }, + { + "epoch": 0.08359501379769721, + "grad_norm": 2.250102996826172, + "learning_rate": 4.914293942249304e-05, + "loss": 5.6935, + "step": 14056 + }, + { + "epoch": 0.08360096108097821, + "grad_norm": 2.123037576675415, + "learning_rate": 4.914281816183653e-05, + "loss": 5.624, + "step": 14057 + }, + { + "epoch": 0.0836069083642592, + "grad_norm": 1.833024501800537, + "learning_rate": 4.9142696892752013e-05, + "loss": 5.4329, + "step": 14058 + }, + { + "epoch": 0.0836128556475402, + "grad_norm": 1.8438977003097534, + "learning_rate": 4.9142575615239526e-05, + "loss": 5.294, + "step": 14059 + }, + { + "epoch": 0.0836188029308212, + "grad_norm": 1.805525541305542, + "learning_rate": 4.914245432929913e-05, + "loss": 5.3778, + "step": 14060 + }, + { + "epoch": 0.08362475021410219, + "grad_norm": 1.5750529766082764, + "learning_rate": 4.9142333034930835e-05, + "loss": 5.357, + "step": 14061 + }, + { + "epoch": 0.0836306974973832, + "grad_norm": 1.3928825855255127, + "learning_rate": 4.914221173213471e-05, + "loss": 5.5141, + "step": 14062 + }, + { + "epoch": 0.0836366447806642, + "grad_norm": 1.6307804584503174, + "learning_rate": 4.914209042091079e-05, + "loss": 5.3687, + "step": 14063 + }, + { + "epoch": 0.08364259206394518, + "grad_norm": 1.533963680267334, + "learning_rate": 4.914196910125911e-05, + "loss": 5.7295, + "step": 14064 + }, + { + "epoch": 0.08364853934722619, + "grad_norm": 1.4950587749481201, + "learning_rate": 4.914184777317972e-05, + "loss": 5.816, + "step": 14065 + }, + { + "epoch": 0.08365448663050719, + "grad_norm": 1.3246190547943115, + "learning_rate": 4.914172643667266e-05, + "loss": 5.6925, + "step": 14066 + }, + { + "epoch": 0.08366043391378818, + "grad_norm": 1.4816724061965942, + "learning_rate": 4.9141605091737975e-05, + "loss": 5.6528, + "step": 14067 + }, + { + "epoch": 0.08366638119706918, + "grad_norm": 1.6656372547149658, + "learning_rate": 4.914148373837571e-05, + "loss": 5.4619, + "step": 14068 + }, + { + "epoch": 0.08367232848035018, + "grad_norm": 1.2973356246948242, + "learning_rate": 4.914136237658589e-05, + "loss": 5.5467, + "step": 14069 + }, + { + "epoch": 0.08367827576363117, + "grad_norm": 1.7669901847839355, + "learning_rate": 4.914124100636857e-05, + "loss": 5.2213, + "step": 14070 + }, + { + "epoch": 0.08368422304691217, + "grad_norm": 1.7352882623672485, + "learning_rate": 4.91411196277238e-05, + "loss": 5.2938, + "step": 14071 + }, + { + "epoch": 0.08369017033019317, + "grad_norm": 1.5912410020828247, + "learning_rate": 4.914099824065161e-05, + "loss": 5.4139, + "step": 14072 + }, + { + "epoch": 0.08369611761347416, + "grad_norm": 1.46699059009552, + "learning_rate": 4.914087684515205e-05, + "loss": 5.2317, + "step": 14073 + }, + { + "epoch": 0.08370206489675516, + "grad_norm": 3.0727121829986572, + "learning_rate": 4.914075544122516e-05, + "loss": 5.2324, + "step": 14074 + }, + { + "epoch": 0.08370801218003616, + "grad_norm": 1.4887278079986572, + "learning_rate": 4.914063402887098e-05, + "loss": 5.0331, + "step": 14075 + }, + { + "epoch": 0.08371395946331715, + "grad_norm": 1.4677956104278564, + "learning_rate": 4.9140512608089555e-05, + "loss": 5.0892, + "step": 14076 + }, + { + "epoch": 0.08371990674659816, + "grad_norm": 1.3760831356048584, + "learning_rate": 4.914039117888093e-05, + "loss": 5.3738, + "step": 14077 + }, + { + "epoch": 0.08372585402987916, + "grad_norm": 1.6125822067260742, + "learning_rate": 4.9140269741245135e-05, + "loss": 5.4629, + "step": 14078 + }, + { + "epoch": 0.08373180131316015, + "grad_norm": 1.6336333751678467, + "learning_rate": 4.9140148295182226e-05, + "loss": 5.2533, + "step": 14079 + }, + { + "epoch": 0.08373774859644115, + "grad_norm": 1.6296573877334595, + "learning_rate": 4.9140026840692247e-05, + "loss": 4.8288, + "step": 14080 + }, + { + "epoch": 0.08374369587972215, + "grad_norm": 1.6058591604232788, + "learning_rate": 4.913990537777522e-05, + "loss": 5.0549, + "step": 14081 + }, + { + "epoch": 0.08374964316300314, + "grad_norm": 1.6199642419815063, + "learning_rate": 4.9139783906431214e-05, + "loss": 5.2387, + "step": 14082 + }, + { + "epoch": 0.08375559044628414, + "grad_norm": 1.7537976503372192, + "learning_rate": 4.913966242666025e-05, + "loss": 5.2766, + "step": 14083 + }, + { + "epoch": 0.08376153772956514, + "grad_norm": 1.579128384590149, + "learning_rate": 4.9139540938462384e-05, + "loss": 5.2251, + "step": 14084 + }, + { + "epoch": 0.08376748501284613, + "grad_norm": 1.7070518732070923, + "learning_rate": 4.913941944183765e-05, + "loss": 5.0699, + "step": 14085 + }, + { + "epoch": 0.08377343229612713, + "grad_norm": 1.4739151000976562, + "learning_rate": 4.91392979367861e-05, + "loss": 5.229, + "step": 14086 + }, + { + "epoch": 0.08377937957940812, + "grad_norm": 1.6380045413970947, + "learning_rate": 4.9139176423307764e-05, + "loss": 5.0977, + "step": 14087 + }, + { + "epoch": 0.08378532686268912, + "grad_norm": 1.640865445137024, + "learning_rate": 4.91390549014027e-05, + "loss": 5.1106, + "step": 14088 + }, + { + "epoch": 0.08379127414597012, + "grad_norm": 1.7274518013000488, + "learning_rate": 4.913893337107093e-05, + "loss": 5.2093, + "step": 14089 + }, + { + "epoch": 0.08379722142925111, + "grad_norm": 1.7702603340148926, + "learning_rate": 4.913881183231251e-05, + "loss": 5.1314, + "step": 14090 + }, + { + "epoch": 0.08380316871253211, + "grad_norm": 1.766479253768921, + "learning_rate": 4.913869028512749e-05, + "loss": 5.1266, + "step": 14091 + }, + { + "epoch": 0.08380911599581312, + "grad_norm": 1.5863205194473267, + "learning_rate": 4.91385687295159e-05, + "loss": 5.1487, + "step": 14092 + }, + { + "epoch": 0.0838150632790941, + "grad_norm": 1.6770803928375244, + "learning_rate": 4.913844716547777e-05, + "loss": 5.2479, + "step": 14093 + }, + { + "epoch": 0.0838210105623751, + "grad_norm": 1.8650991916656494, + "learning_rate": 4.913832559301317e-05, + "loss": 5.2748, + "step": 14094 + }, + { + "epoch": 0.08382695784565611, + "grad_norm": 1.7304933071136475, + "learning_rate": 4.913820401212213e-05, + "loss": 5.2572, + "step": 14095 + }, + { + "epoch": 0.0838329051289371, + "grad_norm": 1.7103501558303833, + "learning_rate": 4.9138082422804695e-05, + "loss": 5.1145, + "step": 14096 + }, + { + "epoch": 0.0838388524122181, + "grad_norm": 1.8390073776245117, + "learning_rate": 4.91379608250609e-05, + "loss": 5.1171, + "step": 14097 + }, + { + "epoch": 0.0838447996954991, + "grad_norm": 1.815047264099121, + "learning_rate": 4.913783921889079e-05, + "loss": 5.2329, + "step": 14098 + }, + { + "epoch": 0.08385074697878009, + "grad_norm": 1.4381682872772217, + "learning_rate": 4.9137717604294415e-05, + "loss": 5.1098, + "step": 14099 + }, + { + "epoch": 0.08385669426206109, + "grad_norm": 1.6523853540420532, + "learning_rate": 4.9137595981271815e-05, + "loss": 5.1352, + "step": 14100 + }, + { + "epoch": 0.08386264154534209, + "grad_norm": 1.377199649810791, + "learning_rate": 4.913747434982302e-05, + "loss": 5.1191, + "step": 14101 + }, + { + "epoch": 0.08386858882862308, + "grad_norm": 1.5858699083328247, + "learning_rate": 4.913735270994809e-05, + "loss": 5.0569, + "step": 14102 + }, + { + "epoch": 0.08387453611190408, + "grad_norm": 1.608522891998291, + "learning_rate": 4.913723106164705e-05, + "loss": 4.8834, + "step": 14103 + }, + { + "epoch": 0.08388048339518508, + "grad_norm": 1.7063453197479248, + "learning_rate": 4.913710940491996e-05, + "loss": 4.9019, + "step": 14104 + }, + { + "epoch": 0.08388643067846607, + "grad_norm": 1.5008784532546997, + "learning_rate": 4.913698773976685e-05, + "loss": 4.8423, + "step": 14105 + }, + { + "epoch": 0.08389237796174707, + "grad_norm": 1.8743178844451904, + "learning_rate": 4.913686606618777e-05, + "loss": 4.9256, + "step": 14106 + }, + { + "epoch": 0.08389832524502808, + "grad_norm": 1.813094973564148, + "learning_rate": 4.9136744384182764e-05, + "loss": 4.9245, + "step": 14107 + }, + { + "epoch": 0.08390427252830907, + "grad_norm": 1.9561067819595337, + "learning_rate": 4.913662269375186e-05, + "loss": 4.8459, + "step": 14108 + }, + { + "epoch": 0.08391021981159007, + "grad_norm": 1.6159533262252808, + "learning_rate": 4.913650099489512e-05, + "loss": 4.8092, + "step": 14109 + }, + { + "epoch": 0.08391616709487107, + "grad_norm": 1.5819872617721558, + "learning_rate": 4.913637928761257e-05, + "loss": 4.9047, + "step": 14110 + }, + { + "epoch": 0.08392211437815206, + "grad_norm": 1.6294678449630737, + "learning_rate": 4.913625757190426e-05, + "loss": 4.6908, + "step": 14111 + }, + { + "epoch": 0.08392806166143306, + "grad_norm": 1.5048410892486572, + "learning_rate": 4.913613584777024e-05, + "loss": 5.2021, + "step": 14112 + }, + { + "epoch": 0.08393400894471406, + "grad_norm": 1.626280665397644, + "learning_rate": 4.9136014115210525e-05, + "loss": 5.4592, + "step": 14113 + }, + { + "epoch": 0.08393995622799505, + "grad_norm": 1.662269115447998, + "learning_rate": 4.91358923742252e-05, + "loss": 5.0027, + "step": 14114 + }, + { + "epoch": 0.08394590351127605, + "grad_norm": 1.5630388259887695, + "learning_rate": 4.913577062481427e-05, + "loss": 5.3327, + "step": 14115 + }, + { + "epoch": 0.08395185079455704, + "grad_norm": 1.4223047494888306, + "learning_rate": 4.913564886697779e-05, + "loss": 5.5081, + "step": 14116 + }, + { + "epoch": 0.08395779807783804, + "grad_norm": 1.3298295736312866, + "learning_rate": 4.9135527100715814e-05, + "loss": 5.3783, + "step": 14117 + }, + { + "epoch": 0.08396374536111904, + "grad_norm": 1.335779070854187, + "learning_rate": 4.913540532602837e-05, + "loss": 5.3901, + "step": 14118 + }, + { + "epoch": 0.08396969264440003, + "grad_norm": 1.5331017971038818, + "learning_rate": 4.913528354291551e-05, + "loss": 5.5643, + "step": 14119 + }, + { + "epoch": 0.08397563992768103, + "grad_norm": 1.703400731086731, + "learning_rate": 4.913516175137727e-05, + "loss": 5.4256, + "step": 14120 + }, + { + "epoch": 0.08398158721096204, + "grad_norm": 1.5330191850662231, + "learning_rate": 4.913503995141369e-05, + "loss": 5.2509, + "step": 14121 + }, + { + "epoch": 0.08398753449424302, + "grad_norm": 1.7405961751937866, + "learning_rate": 4.913491814302482e-05, + "loss": 5.4171, + "step": 14122 + }, + { + "epoch": 0.08399348177752403, + "grad_norm": 1.2550197839736938, + "learning_rate": 4.9134796326210696e-05, + "loss": 5.3908, + "step": 14123 + }, + { + "epoch": 0.08399942906080503, + "grad_norm": 1.2029253244400024, + "learning_rate": 4.9134674500971366e-05, + "loss": 5.5355, + "step": 14124 + }, + { + "epoch": 0.08400537634408602, + "grad_norm": 1.2968589067459106, + "learning_rate": 4.913455266730687e-05, + "loss": 5.4007, + "step": 14125 + }, + { + "epoch": 0.08401132362736702, + "grad_norm": 1.2636605501174927, + "learning_rate": 4.913443082521725e-05, + "loss": 5.2402, + "step": 14126 + }, + { + "epoch": 0.08401727091064802, + "grad_norm": 1.2112632989883423, + "learning_rate": 4.9134308974702554e-05, + "loss": 5.2595, + "step": 14127 + }, + { + "epoch": 0.08402321819392901, + "grad_norm": 1.447730302810669, + "learning_rate": 4.913418711576282e-05, + "loss": 5.2688, + "step": 14128 + }, + { + "epoch": 0.08402916547721001, + "grad_norm": 1.4328616857528687, + "learning_rate": 4.913406524839809e-05, + "loss": 5.2368, + "step": 14129 + }, + { + "epoch": 0.08403511276049101, + "grad_norm": 1.4782198667526245, + "learning_rate": 4.91339433726084e-05, + "loss": 5.2019, + "step": 14130 + }, + { + "epoch": 0.084041060043772, + "grad_norm": 1.499373197555542, + "learning_rate": 4.913382148839381e-05, + "loss": 5.3352, + "step": 14131 + }, + { + "epoch": 0.084047007327053, + "grad_norm": 1.37551748752594, + "learning_rate": 4.9133699595754346e-05, + "loss": 5.1566, + "step": 14132 + }, + { + "epoch": 0.084052954610334, + "grad_norm": 1.6400420665740967, + "learning_rate": 4.913357769469006e-05, + "loss": 5.5225, + "step": 14133 + }, + { + "epoch": 0.08405890189361499, + "grad_norm": 1.3855832815170288, + "learning_rate": 4.913345578520099e-05, + "loss": 5.4466, + "step": 14134 + }, + { + "epoch": 0.084064849176896, + "grad_norm": 1.783508062362671, + "learning_rate": 4.913333386728718e-05, + "loss": 5.1713, + "step": 14135 + }, + { + "epoch": 0.084070796460177, + "grad_norm": 2.435201406478882, + "learning_rate": 4.913321194094866e-05, + "loss": 4.9899, + "step": 14136 + }, + { + "epoch": 0.08407674374345799, + "grad_norm": 1.708850622177124, + "learning_rate": 4.91330900061855e-05, + "loss": 5.0808, + "step": 14137 + }, + { + "epoch": 0.08408269102673899, + "grad_norm": 1.583473801612854, + "learning_rate": 4.913296806299773e-05, + "loss": 5.0164, + "step": 14138 + }, + { + "epoch": 0.08408863831001999, + "grad_norm": 1.6990292072296143, + "learning_rate": 4.9132846111385386e-05, + "loss": 4.9476, + "step": 14139 + }, + { + "epoch": 0.08409458559330098, + "grad_norm": 1.6386258602142334, + "learning_rate": 4.913272415134851e-05, + "loss": 4.9357, + "step": 14140 + }, + { + "epoch": 0.08410053287658198, + "grad_norm": 1.258575439453125, + "learning_rate": 4.9132602182887156e-05, + "loss": 4.7666, + "step": 14141 + }, + { + "epoch": 0.08410648015986298, + "grad_norm": 1.3333406448364258, + "learning_rate": 4.913248020600135e-05, + "loss": 4.698, + "step": 14142 + }, + { + "epoch": 0.08411242744314397, + "grad_norm": 1.3663051128387451, + "learning_rate": 4.913235822069116e-05, + "loss": 4.9414, + "step": 14143 + }, + { + "epoch": 0.08411837472642497, + "grad_norm": 1.6906498670578003, + "learning_rate": 4.91322362269566e-05, + "loss": 5.281, + "step": 14144 + }, + { + "epoch": 0.08412432200970596, + "grad_norm": 1.2671558856964111, + "learning_rate": 4.9132114224797735e-05, + "loss": 5.2566, + "step": 14145 + }, + { + "epoch": 0.08413026929298696, + "grad_norm": 1.4022216796875, + "learning_rate": 4.9131992214214586e-05, + "loss": 5.128, + "step": 14146 + }, + { + "epoch": 0.08413621657626796, + "grad_norm": 1.4810549020767212, + "learning_rate": 4.913187019520722e-05, + "loss": 5.0172, + "step": 14147 + }, + { + "epoch": 0.08414216385954895, + "grad_norm": 1.2757905721664429, + "learning_rate": 4.913174816777566e-05, + "loss": 5.3796, + "step": 14148 + }, + { + "epoch": 0.08414811114282995, + "grad_norm": 1.4088176488876343, + "learning_rate": 4.913162613191996e-05, + "loss": 5.4586, + "step": 14149 + }, + { + "epoch": 0.08415405842611096, + "grad_norm": 1.5218896865844727, + "learning_rate": 4.9131504087640154e-05, + "loss": 5.1652, + "step": 14150 + }, + { + "epoch": 0.08416000570939194, + "grad_norm": 1.4234968423843384, + "learning_rate": 4.913138203493629e-05, + "loss": 5.1917, + "step": 14151 + }, + { + "epoch": 0.08416595299267295, + "grad_norm": 1.4841183423995972, + "learning_rate": 4.913125997380842e-05, + "loss": 5.2818, + "step": 14152 + }, + { + "epoch": 0.08417190027595395, + "grad_norm": 1.8631536960601807, + "learning_rate": 4.9131137904256564e-05, + "loss": 5.4848, + "step": 14153 + }, + { + "epoch": 0.08417784755923494, + "grad_norm": 1.5508880615234375, + "learning_rate": 4.913101582628078e-05, + "loss": 5.3698, + "step": 14154 + }, + { + "epoch": 0.08418379484251594, + "grad_norm": 1.2428319454193115, + "learning_rate": 4.913089373988111e-05, + "loss": 5.2071, + "step": 14155 + }, + { + "epoch": 0.08418974212579694, + "grad_norm": 1.405325174331665, + "learning_rate": 4.91307716450576e-05, + "loss": 5.1774, + "step": 14156 + }, + { + "epoch": 0.08419568940907793, + "grad_norm": 1.6800439357757568, + "learning_rate": 4.913064954181028e-05, + "loss": 5.3735, + "step": 14157 + }, + { + "epoch": 0.08420163669235893, + "grad_norm": 1.475174069404602, + "learning_rate": 4.9130527430139194e-05, + "loss": 5.3303, + "step": 14158 + }, + { + "epoch": 0.08420758397563993, + "grad_norm": 1.5441967248916626, + "learning_rate": 4.91304053100444e-05, + "loss": 5.3007, + "step": 14159 + }, + { + "epoch": 0.08421353125892092, + "grad_norm": 1.3798770904541016, + "learning_rate": 4.913028318152593e-05, + "loss": 5.287, + "step": 14160 + }, + { + "epoch": 0.08421947854220192, + "grad_norm": 1.4294620752334595, + "learning_rate": 4.913016104458382e-05, + "loss": 5.3159, + "step": 14161 + }, + { + "epoch": 0.08422542582548292, + "grad_norm": 1.4971884489059448, + "learning_rate": 4.913003889921812e-05, + "loss": 5.4701, + "step": 14162 + }, + { + "epoch": 0.08423137310876391, + "grad_norm": 1.447045922279358, + "learning_rate": 4.912991674542888e-05, + "loss": 5.306, + "step": 14163 + }, + { + "epoch": 0.08423732039204491, + "grad_norm": 1.7867134809494019, + "learning_rate": 4.9129794583216135e-05, + "loss": 4.8653, + "step": 14164 + }, + { + "epoch": 0.08424326767532592, + "grad_norm": 1.6931066513061523, + "learning_rate": 4.912967241257993e-05, + "loss": 4.7628, + "step": 14165 + }, + { + "epoch": 0.0842492149586069, + "grad_norm": 1.6567879915237427, + "learning_rate": 4.91295502335203e-05, + "loss": 4.7857, + "step": 14166 + }, + { + "epoch": 0.08425516224188791, + "grad_norm": 1.6891521215438843, + "learning_rate": 4.91294280460373e-05, + "loss": 4.7873, + "step": 14167 + }, + { + "epoch": 0.08426110952516891, + "grad_norm": 1.6237304210662842, + "learning_rate": 4.912930585013095e-05, + "loss": 4.8596, + "step": 14168 + }, + { + "epoch": 0.0842670568084499, + "grad_norm": 1.585802674293518, + "learning_rate": 4.912918364580132e-05, + "loss": 4.8226, + "step": 14169 + }, + { + "epoch": 0.0842730040917309, + "grad_norm": 1.6892811059951782, + "learning_rate": 4.912906143304844e-05, + "loss": 4.8307, + "step": 14170 + }, + { + "epoch": 0.0842789513750119, + "grad_norm": 1.8254313468933105, + "learning_rate": 4.912893921187236e-05, + "loss": 4.8508, + "step": 14171 + }, + { + "epoch": 0.08428489865829289, + "grad_norm": 1.5577294826507568, + "learning_rate": 4.912881698227311e-05, + "loss": 4.7303, + "step": 14172 + }, + { + "epoch": 0.08429084594157389, + "grad_norm": 1.5635697841644287, + "learning_rate": 4.912869474425074e-05, + "loss": 4.9597, + "step": 14173 + }, + { + "epoch": 0.08429679322485488, + "grad_norm": 1.6620457172393799, + "learning_rate": 4.9128572497805294e-05, + "loss": 5.1012, + "step": 14174 + }, + { + "epoch": 0.08430274050813588, + "grad_norm": 1.4082841873168945, + "learning_rate": 4.912845024293681e-05, + "loss": 5.1785, + "step": 14175 + }, + { + "epoch": 0.08430868779141688, + "grad_norm": 1.5914233922958374, + "learning_rate": 4.9128327979645336e-05, + "loss": 5.2035, + "step": 14176 + }, + { + "epoch": 0.08431463507469787, + "grad_norm": 1.3170946836471558, + "learning_rate": 4.912820570793091e-05, + "loss": 5.35, + "step": 14177 + }, + { + "epoch": 0.08432058235797887, + "grad_norm": 1.3059190511703491, + "learning_rate": 4.912808342779357e-05, + "loss": 5.1428, + "step": 14178 + }, + { + "epoch": 0.08432652964125988, + "grad_norm": 1.438844919204712, + "learning_rate": 4.912796113923337e-05, + "loss": 5.2154, + "step": 14179 + }, + { + "epoch": 0.08433247692454086, + "grad_norm": 1.401469349861145, + "learning_rate": 4.912783884225035e-05, + "loss": 5.0941, + "step": 14180 + }, + { + "epoch": 0.08433842420782187, + "grad_norm": 1.6718204021453857, + "learning_rate": 4.912771653684456e-05, + "loss": 5.3221, + "step": 14181 + }, + { + "epoch": 0.08434437149110287, + "grad_norm": 1.51036536693573, + "learning_rate": 4.912759422301602e-05, + "loss": 5.2619, + "step": 14182 + }, + { + "epoch": 0.08435031877438386, + "grad_norm": 1.6579569578170776, + "learning_rate": 4.9127471900764795e-05, + "loss": 5.1176, + "step": 14183 + }, + { + "epoch": 0.08435626605766486, + "grad_norm": 1.5300757884979248, + "learning_rate": 4.912734957009091e-05, + "loss": 5.1625, + "step": 14184 + }, + { + "epoch": 0.08436221334094586, + "grad_norm": 1.2839969396591187, + "learning_rate": 4.912722723099442e-05, + "loss": 5.0852, + "step": 14185 + }, + { + "epoch": 0.08436816062422685, + "grad_norm": 1.7074840068817139, + "learning_rate": 4.9127104883475364e-05, + "loss": 5.1611, + "step": 14186 + }, + { + "epoch": 0.08437410790750785, + "grad_norm": 1.790992021560669, + "learning_rate": 4.9126982527533797e-05, + "loss": 5.0386, + "step": 14187 + }, + { + "epoch": 0.08438005519078885, + "grad_norm": 1.5269246101379395, + "learning_rate": 4.912686016316973e-05, + "loss": 5.0272, + "step": 14188 + }, + { + "epoch": 0.08438600247406984, + "grad_norm": 1.510847806930542, + "learning_rate": 4.9126737790383234e-05, + "loss": 5.2073, + "step": 14189 + }, + { + "epoch": 0.08439194975735084, + "grad_norm": 1.6551074981689453, + "learning_rate": 4.912661540917435e-05, + "loss": 5.0436, + "step": 14190 + }, + { + "epoch": 0.08439789704063184, + "grad_norm": 1.3152271509170532, + "learning_rate": 4.91264930195431e-05, + "loss": 5.0981, + "step": 14191 + }, + { + "epoch": 0.08440384432391283, + "grad_norm": 1.478190302848816, + "learning_rate": 4.912637062148955e-05, + "loss": 5.1172, + "step": 14192 + }, + { + "epoch": 0.08440979160719383, + "grad_norm": 1.4574978351593018, + "learning_rate": 4.912624821501373e-05, + "loss": 4.9757, + "step": 14193 + }, + { + "epoch": 0.08441573889047484, + "grad_norm": 1.600182056427002, + "learning_rate": 4.912612580011568e-05, + "loss": 5.1763, + "step": 14194 + }, + { + "epoch": 0.08442168617375582, + "grad_norm": 1.5805768966674805, + "learning_rate": 4.912600337679546e-05, + "loss": 5.1949, + "step": 14195 + }, + { + "epoch": 0.08442763345703683, + "grad_norm": 1.465785264968872, + "learning_rate": 4.9125880945053106e-05, + "loss": 5.0695, + "step": 14196 + }, + { + "epoch": 0.08443358074031783, + "grad_norm": 1.6188615560531616, + "learning_rate": 4.912575850488864e-05, + "loss": 5.1263, + "step": 14197 + }, + { + "epoch": 0.08443952802359882, + "grad_norm": 2.4953408241271973, + "learning_rate": 4.9125636056302125e-05, + "loss": 5.6462, + "step": 14198 + }, + { + "epoch": 0.08444547530687982, + "grad_norm": 1.6779934167861938, + "learning_rate": 4.91255135992936e-05, + "loss": 5.1673, + "step": 14199 + }, + { + "epoch": 0.08445142259016082, + "grad_norm": 1.648706316947937, + "learning_rate": 4.912539113386312e-05, + "loss": 5.3792, + "step": 14200 + }, + { + "epoch": 0.08445736987344181, + "grad_norm": 1.4866549968719482, + "learning_rate": 4.91252686600107e-05, + "loss": 5.2828, + "step": 14201 + }, + { + "epoch": 0.08446331715672281, + "grad_norm": 1.6002475023269653, + "learning_rate": 4.912514617773641e-05, + "loss": 5.3255, + "step": 14202 + }, + { + "epoch": 0.0844692644400038, + "grad_norm": 1.4162862300872803, + "learning_rate": 4.912502368704027e-05, + "loss": 5.3363, + "step": 14203 + }, + { + "epoch": 0.0844752117232848, + "grad_norm": 1.4465757608413696, + "learning_rate": 4.912490118792234e-05, + "loss": 5.586, + "step": 14204 + }, + { + "epoch": 0.0844811590065658, + "grad_norm": 1.8178991079330444, + "learning_rate": 4.912477868038266e-05, + "loss": 5.3029, + "step": 14205 + }, + { + "epoch": 0.08448710628984679, + "grad_norm": 1.4270378351211548, + "learning_rate": 4.912465616442126e-05, + "loss": 5.3864, + "step": 14206 + }, + { + "epoch": 0.0844930535731278, + "grad_norm": 1.5574913024902344, + "learning_rate": 4.91245336400382e-05, + "loss": 5.7667, + "step": 14207 + }, + { + "epoch": 0.0844990008564088, + "grad_norm": 1.3866809606552124, + "learning_rate": 4.91244111072335e-05, + "loss": 5.683, + "step": 14208 + }, + { + "epoch": 0.08450494813968978, + "grad_norm": 1.3390960693359375, + "learning_rate": 4.912428856600722e-05, + "loss": 5.7286, + "step": 14209 + }, + { + "epoch": 0.08451089542297079, + "grad_norm": 1.4317498207092285, + "learning_rate": 4.912416601635942e-05, + "loss": 5.6913, + "step": 14210 + }, + { + "epoch": 0.08451684270625179, + "grad_norm": 1.3110778331756592, + "learning_rate": 4.91240434582901e-05, + "loss": 5.6325, + "step": 14211 + }, + { + "epoch": 0.08452278998953278, + "grad_norm": 1.3288872241973877, + "learning_rate": 4.9123920891799344e-05, + "loss": 5.6343, + "step": 14212 + }, + { + "epoch": 0.08452873727281378, + "grad_norm": 1.2967199087142944, + "learning_rate": 4.912379831688716e-05, + "loss": 5.6514, + "step": 14213 + }, + { + "epoch": 0.08453468455609478, + "grad_norm": 1.6022506952285767, + "learning_rate": 4.912367573355362e-05, + "loss": 5.4006, + "step": 14214 + }, + { + "epoch": 0.08454063183937577, + "grad_norm": 1.6698434352874756, + "learning_rate": 4.912355314179875e-05, + "loss": 5.1543, + "step": 14215 + }, + { + "epoch": 0.08454657912265677, + "grad_norm": 1.6759408712387085, + "learning_rate": 4.9123430541622594e-05, + "loss": 4.9744, + "step": 14216 + }, + { + "epoch": 0.08455252640593777, + "grad_norm": 2.470752239227295, + "learning_rate": 4.91233079330252e-05, + "loss": 5.7614, + "step": 14217 + }, + { + "epoch": 0.08455847368921876, + "grad_norm": 2.1985907554626465, + "learning_rate": 4.91231853160066e-05, + "loss": 6.037, + "step": 14218 + }, + { + "epoch": 0.08456442097249976, + "grad_norm": 2.079569101333618, + "learning_rate": 4.912306269056686e-05, + "loss": 5.4943, + "step": 14219 + }, + { + "epoch": 0.08457036825578076, + "grad_norm": 2.2941744327545166, + "learning_rate": 4.9122940056706e-05, + "loss": 5.3733, + "step": 14220 + }, + { + "epoch": 0.08457631553906175, + "grad_norm": 1.9538209438323975, + "learning_rate": 4.912281741442407e-05, + "loss": 5.6362, + "step": 14221 + }, + { + "epoch": 0.08458226282234275, + "grad_norm": 1.7498515844345093, + "learning_rate": 4.9122694763721124e-05, + "loss": 5.7129, + "step": 14222 + }, + { + "epoch": 0.08458821010562376, + "grad_norm": 2.1728787422180176, + "learning_rate": 4.912257210459718e-05, + "loss": 5.4633, + "step": 14223 + }, + { + "epoch": 0.08459415738890474, + "grad_norm": 2.2436587810516357, + "learning_rate": 4.91224494370523e-05, + "loss": 5.3996, + "step": 14224 + }, + { + "epoch": 0.08460010467218575, + "grad_norm": 2.400299549102783, + "learning_rate": 4.912232676108653e-05, + "loss": 5.3994, + "step": 14225 + }, + { + "epoch": 0.08460605195546675, + "grad_norm": 1.9408513307571411, + "learning_rate": 4.91222040766999e-05, + "loss": 5.4537, + "step": 14226 + }, + { + "epoch": 0.08461199923874774, + "grad_norm": 2.4801602363586426, + "learning_rate": 4.912208138389245e-05, + "loss": 4.6625, + "step": 14227 + }, + { + "epoch": 0.08461794652202874, + "grad_norm": 2.021916627883911, + "learning_rate": 4.912195868266424e-05, + "loss": 4.5642, + "step": 14228 + }, + { + "epoch": 0.08462389380530974, + "grad_norm": 1.9586929082870483, + "learning_rate": 4.91218359730153e-05, + "loss": 4.6361, + "step": 14229 + }, + { + "epoch": 0.08462984108859073, + "grad_norm": 1.8478419780731201, + "learning_rate": 4.912171325494568e-05, + "loss": 4.5632, + "step": 14230 + }, + { + "epoch": 0.08463578837187173, + "grad_norm": 1.7078584432601929, + "learning_rate": 4.9121590528455406e-05, + "loss": 4.7259, + "step": 14231 + }, + { + "epoch": 0.08464173565515272, + "grad_norm": 1.7676106691360474, + "learning_rate": 4.912146779354455e-05, + "loss": 5.2565, + "step": 14232 + }, + { + "epoch": 0.08464768293843372, + "grad_norm": 1.8230634927749634, + "learning_rate": 4.912134505021313e-05, + "loss": 5.7668, + "step": 14233 + }, + { + "epoch": 0.08465363022171472, + "grad_norm": 1.8570215702056885, + "learning_rate": 4.91212222984612e-05, + "loss": 6.1849, + "step": 14234 + }, + { + "epoch": 0.08465957750499571, + "grad_norm": 1.7698529958724976, + "learning_rate": 4.9121099538288805e-05, + "loss": 6.0298, + "step": 14235 + }, + { + "epoch": 0.08466552478827671, + "grad_norm": 1.9919711351394653, + "learning_rate": 4.912097676969597e-05, + "loss": 5.7423, + "step": 14236 + }, + { + "epoch": 0.08467147207155772, + "grad_norm": 1.9937268495559692, + "learning_rate": 4.912085399268277e-05, + "loss": 5.8415, + "step": 14237 + }, + { + "epoch": 0.0846774193548387, + "grad_norm": 1.9489192962646484, + "learning_rate": 4.912073120724921e-05, + "loss": 5.812, + "step": 14238 + }, + { + "epoch": 0.0846833666381197, + "grad_norm": 1.6114327907562256, + "learning_rate": 4.9120608413395366e-05, + "loss": 5.9458, + "step": 14239 + }, + { + "epoch": 0.08468931392140071, + "grad_norm": 1.5803523063659668, + "learning_rate": 4.9120485611121265e-05, + "loss": 5.8837, + "step": 14240 + }, + { + "epoch": 0.0846952612046817, + "grad_norm": 1.8166266679763794, + "learning_rate": 4.9120362800426946e-05, + "loss": 5.5997, + "step": 14241 + }, + { + "epoch": 0.0847012084879627, + "grad_norm": 2.2683627605438232, + "learning_rate": 4.912023998131246e-05, + "loss": 5.4089, + "step": 14242 + }, + { + "epoch": 0.0847071557712437, + "grad_norm": 1.959498405456543, + "learning_rate": 4.9120117153777846e-05, + "loss": 5.5651, + "step": 14243 + }, + { + "epoch": 0.08471310305452469, + "grad_norm": 2.2388527393341064, + "learning_rate": 4.9119994317823155e-05, + "loss": 6.1511, + "step": 14244 + }, + { + "epoch": 0.08471905033780569, + "grad_norm": 1.9563941955566406, + "learning_rate": 4.911987147344842e-05, + "loss": 6.0499, + "step": 14245 + }, + { + "epoch": 0.08472499762108669, + "grad_norm": 1.7460871934890747, + "learning_rate": 4.911974862065368e-05, + "loss": 5.8368, + "step": 14246 + }, + { + "epoch": 0.08473094490436768, + "grad_norm": 1.820356845855713, + "learning_rate": 4.911962575943899e-05, + "loss": 5.3679, + "step": 14247 + }, + { + "epoch": 0.08473689218764868, + "grad_norm": 2.2215917110443115, + "learning_rate": 4.911950288980439e-05, + "loss": 5.0686, + "step": 14248 + }, + { + "epoch": 0.08474283947092968, + "grad_norm": 1.7801320552825928, + "learning_rate": 4.9119380011749914e-05, + "loss": 5.7665, + "step": 14249 + }, + { + "epoch": 0.08474878675421067, + "grad_norm": 1.8713878393173218, + "learning_rate": 4.911925712527562e-05, + "loss": 5.7, + "step": 14250 + }, + { + "epoch": 0.08475473403749167, + "grad_norm": 1.9371087551116943, + "learning_rate": 4.911913423038154e-05, + "loss": 5.6707, + "step": 14251 + }, + { + "epoch": 0.08476068132077268, + "grad_norm": 2.2298929691314697, + "learning_rate": 4.9119011327067724e-05, + "loss": 5.7042, + "step": 14252 + }, + { + "epoch": 0.08476662860405366, + "grad_norm": 1.7787251472473145, + "learning_rate": 4.91188884153342e-05, + "loss": 5.9205, + "step": 14253 + }, + { + "epoch": 0.08477257588733467, + "grad_norm": 2.0264973640441895, + "learning_rate": 4.911876549518102e-05, + "loss": 5.2057, + "step": 14254 + }, + { + "epoch": 0.08477852317061567, + "grad_norm": 2.7479963302612305, + "learning_rate": 4.911864256660824e-05, + "loss": 4.3828, + "step": 14255 + }, + { + "epoch": 0.08478447045389666, + "grad_norm": 2.3911163806915283, + "learning_rate": 4.9118519629615886e-05, + "loss": 4.1959, + "step": 14256 + }, + { + "epoch": 0.08479041773717766, + "grad_norm": 2.5100319385528564, + "learning_rate": 4.9118396684204005e-05, + "loss": 4.3845, + "step": 14257 + }, + { + "epoch": 0.08479636502045866, + "grad_norm": 2.575680732727051, + "learning_rate": 4.911827373037264e-05, + "loss": 4.1927, + "step": 14258 + }, + { + "epoch": 0.08480231230373965, + "grad_norm": 2.64941143989563, + "learning_rate": 4.9118150768121837e-05, + "loss": 4.2398, + "step": 14259 + }, + { + "epoch": 0.08480825958702065, + "grad_norm": 3.4619154930114746, + "learning_rate": 4.911802779745163e-05, + "loss": 5.9141, + "step": 14260 + }, + { + "epoch": 0.08481420687030164, + "grad_norm": 2.5471723079681396, + "learning_rate": 4.911790481836208e-05, + "loss": 4.1887, + "step": 14261 + }, + { + "epoch": 0.08482015415358264, + "grad_norm": 2.9113502502441406, + "learning_rate": 4.911778183085321e-05, + "loss": 4.3556, + "step": 14262 + }, + { + "epoch": 0.08482610143686364, + "grad_norm": 2.5952084064483643, + "learning_rate": 4.9117658834925076e-05, + "loss": 5.0408, + "step": 14263 + }, + { + "epoch": 0.08483204872014463, + "grad_norm": 2.60726261138916, + "learning_rate": 4.911753583057771e-05, + "loss": 5.5094, + "step": 14264 + }, + { + "epoch": 0.08483799600342563, + "grad_norm": 1.9005889892578125, + "learning_rate": 4.911741281781117e-05, + "loss": 5.2637, + "step": 14265 + }, + { + "epoch": 0.08484394328670664, + "grad_norm": 1.6408629417419434, + "learning_rate": 4.911728979662549e-05, + "loss": 5.4722, + "step": 14266 + }, + { + "epoch": 0.08484989056998762, + "grad_norm": 1.840955376625061, + "learning_rate": 4.911716676702071e-05, + "loss": 5.5073, + "step": 14267 + }, + { + "epoch": 0.08485583785326863, + "grad_norm": 1.8430123329162598, + "learning_rate": 4.911704372899687e-05, + "loss": 6.0372, + "step": 14268 + }, + { + "epoch": 0.08486178513654963, + "grad_norm": 3.2100231647491455, + "learning_rate": 4.911692068255402e-05, + "loss": 5.0497, + "step": 14269 + }, + { + "epoch": 0.08486773241983062, + "grad_norm": 3.191558837890625, + "learning_rate": 4.911679762769221e-05, + "loss": 5.0467, + "step": 14270 + }, + { + "epoch": 0.08487367970311162, + "grad_norm": 3.04190731048584, + "learning_rate": 4.911667456441148e-05, + "loss": 4.8008, + "step": 14271 + }, + { + "epoch": 0.08487962698639262, + "grad_norm": 2.6688694953918457, + "learning_rate": 4.911655149271186e-05, + "loss": 4.722, + "step": 14272 + }, + { + "epoch": 0.08488557426967361, + "grad_norm": 2.1458704471588135, + "learning_rate": 4.9116428412593394e-05, + "loss": 4.788, + "step": 14273 + }, + { + "epoch": 0.08489152155295461, + "grad_norm": 2.345972776412964, + "learning_rate": 4.911630532405615e-05, + "loss": 4.7955, + "step": 14274 + }, + { + "epoch": 0.08489746883623561, + "grad_norm": 2.2022581100463867, + "learning_rate": 4.911618222710014e-05, + "loss": 4.815, + "step": 14275 + }, + { + "epoch": 0.0849034161195166, + "grad_norm": 2.311004877090454, + "learning_rate": 4.911605912172542e-05, + "loss": 4.8632, + "step": 14276 + }, + { + "epoch": 0.0849093634027976, + "grad_norm": 2.5007429122924805, + "learning_rate": 4.911593600793204e-05, + "loss": 4.7273, + "step": 14277 + }, + { + "epoch": 0.0849153106860786, + "grad_norm": 2.257115364074707, + "learning_rate": 4.9115812885720026e-05, + "loss": 4.9697, + "step": 14278 + }, + { + "epoch": 0.08492125796935959, + "grad_norm": 2.7667057514190674, + "learning_rate": 4.9115689755089436e-05, + "loss": 5.1607, + "step": 14279 + }, + { + "epoch": 0.0849272052526406, + "grad_norm": 2.4240612983703613, + "learning_rate": 4.911556661604031e-05, + "loss": 4.9873, + "step": 14280 + }, + { + "epoch": 0.0849331525359216, + "grad_norm": 1.9951629638671875, + "learning_rate": 4.911544346857269e-05, + "loss": 4.9961, + "step": 14281 + }, + { + "epoch": 0.08493909981920258, + "grad_norm": 1.8532124757766724, + "learning_rate": 4.9115320312686605e-05, + "loss": 4.9467, + "step": 14282 + }, + { + "epoch": 0.08494504710248359, + "grad_norm": 2.41200590133667, + "learning_rate": 4.9115197148382126e-05, + "loss": 4.9865, + "step": 14283 + }, + { + "epoch": 0.08495099438576459, + "grad_norm": 2.2735655307769775, + "learning_rate": 4.911507397565928e-05, + "loss": 4.9223, + "step": 14284 + }, + { + "epoch": 0.08495694166904558, + "grad_norm": 2.29052734375, + "learning_rate": 4.91149507945181e-05, + "loss": 4.9479, + "step": 14285 + }, + { + "epoch": 0.08496288895232658, + "grad_norm": 2.71832275390625, + "learning_rate": 4.911482760495865e-05, + "loss": 4.9537, + "step": 14286 + }, + { + "epoch": 0.08496883623560758, + "grad_norm": 2.1351630687713623, + "learning_rate": 4.911470440698096e-05, + "loss": 5.3776, + "step": 14287 + }, + { + "epoch": 0.08497478351888857, + "grad_norm": 2.514810085296631, + "learning_rate": 4.9114581200585066e-05, + "loss": 5.6067, + "step": 14288 + }, + { + "epoch": 0.08498073080216957, + "grad_norm": 1.787312626838684, + "learning_rate": 4.9114457985771036e-05, + "loss": 5.4929, + "step": 14289 + }, + { + "epoch": 0.08498667808545056, + "grad_norm": 1.7784658670425415, + "learning_rate": 4.911433476253889e-05, + "loss": 5.5471, + "step": 14290 + }, + { + "epoch": 0.08499262536873156, + "grad_norm": 1.6120775938034058, + "learning_rate": 4.9114211530888676e-05, + "loss": 5.5455, + "step": 14291 + }, + { + "epoch": 0.08499857265201256, + "grad_norm": 1.6809823513031006, + "learning_rate": 4.9114088290820446e-05, + "loss": 5.7674, + "step": 14292 + }, + { + "epoch": 0.08500451993529355, + "grad_norm": 1.784569501876831, + "learning_rate": 4.9113965042334234e-05, + "loss": 5.554, + "step": 14293 + }, + { + "epoch": 0.08501046721857455, + "grad_norm": 1.8622018098831177, + "learning_rate": 4.9113841785430094e-05, + "loss": 5.5718, + "step": 14294 + }, + { + "epoch": 0.08501641450185556, + "grad_norm": 1.8970091342926025, + "learning_rate": 4.911371852010805e-05, + "loss": 5.6398, + "step": 14295 + }, + { + "epoch": 0.08502236178513654, + "grad_norm": 1.9560039043426514, + "learning_rate": 4.911359524636816e-05, + "loss": 5.3627, + "step": 14296 + }, + { + "epoch": 0.08502830906841755, + "grad_norm": 1.7574408054351807, + "learning_rate": 4.911347196421046e-05, + "loss": 5.6245, + "step": 14297 + }, + { + "epoch": 0.08503425635169855, + "grad_norm": 2.0868546962738037, + "learning_rate": 4.9113348673635004e-05, + "loss": 5.6092, + "step": 14298 + }, + { + "epoch": 0.08504020363497954, + "grad_norm": 2.1157326698303223, + "learning_rate": 4.9113225374641816e-05, + "loss": 5.0796, + "step": 14299 + }, + { + "epoch": 0.08504615091826054, + "grad_norm": 1.7721058130264282, + "learning_rate": 4.911310206723096e-05, + "loss": 5.148, + "step": 14300 + }, + { + "epoch": 0.08505209820154154, + "grad_norm": 1.586799144744873, + "learning_rate": 4.911297875140246e-05, + "loss": 5.5425, + "step": 14301 + }, + { + "epoch": 0.08505804548482253, + "grad_norm": 1.9669803380966187, + "learning_rate": 4.9112855427156376e-05, + "loss": 5.1675, + "step": 14302 + }, + { + "epoch": 0.08506399276810353, + "grad_norm": 2.279446601867676, + "learning_rate": 4.911273209449274e-05, + "loss": 5.8068, + "step": 14303 + }, + { + "epoch": 0.08506994005138453, + "grad_norm": 2.036482572555542, + "learning_rate": 4.9112608753411605e-05, + "loss": 5.3995, + "step": 14304 + }, + { + "epoch": 0.08507588733466552, + "grad_norm": 1.833946704864502, + "learning_rate": 4.9112485403913e-05, + "loss": 6.069, + "step": 14305 + }, + { + "epoch": 0.08508183461794652, + "grad_norm": 1.6984084844589233, + "learning_rate": 4.9112362045996976e-05, + "loss": 5.7842, + "step": 14306 + }, + { + "epoch": 0.08508778190122752, + "grad_norm": 1.6729326248168945, + "learning_rate": 4.911223867966358e-05, + "loss": 5.5225, + "step": 14307 + }, + { + "epoch": 0.08509372918450851, + "grad_norm": 2.046747922897339, + "learning_rate": 4.911211530491284e-05, + "loss": 4.967, + "step": 14308 + }, + { + "epoch": 0.08509967646778951, + "grad_norm": 1.967058539390564, + "learning_rate": 4.911199192174482e-05, + "loss": 5.8046, + "step": 14309 + }, + { + "epoch": 0.08510562375107052, + "grad_norm": 1.8341583013534546, + "learning_rate": 4.911186853015955e-05, + "loss": 4.8317, + "step": 14310 + }, + { + "epoch": 0.0851115710343515, + "grad_norm": 1.9655890464782715, + "learning_rate": 4.911174513015707e-05, + "loss": 4.6122, + "step": 14311 + }, + { + "epoch": 0.0851175183176325, + "grad_norm": 1.7953969240188599, + "learning_rate": 4.9111621721737445e-05, + "loss": 5.3151, + "step": 14312 + }, + { + "epoch": 0.08512346560091351, + "grad_norm": 1.7074720859527588, + "learning_rate": 4.9111498304900684e-05, + "loss": 5.337, + "step": 14313 + }, + { + "epoch": 0.0851294128841945, + "grad_norm": 1.8258756399154663, + "learning_rate": 4.9111374879646854e-05, + "loss": 5.3245, + "step": 14314 + }, + { + "epoch": 0.0851353601674755, + "grad_norm": 1.731689691543579, + "learning_rate": 4.9111251445976e-05, + "loss": 5.149, + "step": 14315 + }, + { + "epoch": 0.0851413074507565, + "grad_norm": 1.9083631038665771, + "learning_rate": 4.9111128003888154e-05, + "loss": 5.2409, + "step": 14316 + }, + { + "epoch": 0.08514725473403749, + "grad_norm": 1.739311933517456, + "learning_rate": 4.911100455338336e-05, + "loss": 5.0946, + "step": 14317 + }, + { + "epoch": 0.08515320201731849, + "grad_norm": 1.6812219619750977, + "learning_rate": 4.9110881094461655e-05, + "loss": 5.3062, + "step": 14318 + }, + { + "epoch": 0.08515914930059948, + "grad_norm": 1.8215876817703247, + "learning_rate": 4.9110757627123096e-05, + "loss": 5.5774, + "step": 14319 + }, + { + "epoch": 0.08516509658388048, + "grad_norm": 1.9548031091690063, + "learning_rate": 4.9110634151367725e-05, + "loss": 5.7895, + "step": 14320 + }, + { + "epoch": 0.08517104386716148, + "grad_norm": 2.266925096511841, + "learning_rate": 4.911051066719558e-05, + "loss": 4.6526, + "step": 14321 + }, + { + "epoch": 0.08517699115044247, + "grad_norm": 2.304807424545288, + "learning_rate": 4.9110387174606695e-05, + "loss": 5.2573, + "step": 14322 + }, + { + "epoch": 0.08518293843372347, + "grad_norm": 2.019482135772705, + "learning_rate": 4.911026367360114e-05, + "loss": 5.2739, + "step": 14323 + }, + { + "epoch": 0.08518888571700448, + "grad_norm": 2.0559775829315186, + "learning_rate": 4.911014016417893e-05, + "loss": 5.7166, + "step": 14324 + }, + { + "epoch": 0.08519483300028546, + "grad_norm": 2.0565741062164307, + "learning_rate": 4.911001664634012e-05, + "loss": 5.6359, + "step": 14325 + }, + { + "epoch": 0.08520078028356647, + "grad_norm": 1.8766587972640991, + "learning_rate": 4.910989312008475e-05, + "loss": 5.2667, + "step": 14326 + }, + { + "epoch": 0.08520672756684747, + "grad_norm": 1.669317364692688, + "learning_rate": 4.910976958541287e-05, + "loss": 5.7565, + "step": 14327 + }, + { + "epoch": 0.08521267485012846, + "grad_norm": 1.9138641357421875, + "learning_rate": 4.910964604232452e-05, + "loss": 5.9362, + "step": 14328 + }, + { + "epoch": 0.08521862213340946, + "grad_norm": 1.740892767906189, + "learning_rate": 4.9109522490819734e-05, + "loss": 5.6964, + "step": 14329 + }, + { + "epoch": 0.08522456941669046, + "grad_norm": 1.788825511932373, + "learning_rate": 4.9109398930898576e-05, + "loss": 5.4266, + "step": 14330 + }, + { + "epoch": 0.08523051669997145, + "grad_norm": 2.035877227783203, + "learning_rate": 4.910927536256106e-05, + "loss": 5.5609, + "step": 14331 + }, + { + "epoch": 0.08523646398325245, + "grad_norm": 2.078150987625122, + "learning_rate": 4.9109151785807265e-05, + "loss": 5.0074, + "step": 14332 + }, + { + "epoch": 0.08524241126653345, + "grad_norm": 2.601290225982666, + "learning_rate": 4.91090282006372e-05, + "loss": 5.2021, + "step": 14333 + }, + { + "epoch": 0.08524835854981444, + "grad_norm": 1.7069159746170044, + "learning_rate": 4.910890460705092e-05, + "loss": 5.0313, + "step": 14334 + }, + { + "epoch": 0.08525430583309544, + "grad_norm": 1.8937885761260986, + "learning_rate": 4.9108781005048473e-05, + "loss": 4.6001, + "step": 14335 + }, + { + "epoch": 0.08526025311637644, + "grad_norm": 2.3120486736297607, + "learning_rate": 4.91086573946299e-05, + "loss": 4.4027, + "step": 14336 + }, + { + "epoch": 0.08526620039965743, + "grad_norm": 2.064420223236084, + "learning_rate": 4.910853377579524e-05, + "loss": 4.8853, + "step": 14337 + }, + { + "epoch": 0.08527214768293843, + "grad_norm": 1.80779230594635, + "learning_rate": 4.910841014854455e-05, + "loss": 5.5493, + "step": 14338 + }, + { + "epoch": 0.08527809496621944, + "grad_norm": 1.6364500522613525, + "learning_rate": 4.910828651287786e-05, + "loss": 5.6569, + "step": 14339 + }, + { + "epoch": 0.08528404224950042, + "grad_norm": 1.7472214698791504, + "learning_rate": 4.910816286879522e-05, + "loss": 5.4057, + "step": 14340 + }, + { + "epoch": 0.08528998953278143, + "grad_norm": 1.6311333179473877, + "learning_rate": 4.910803921629666e-05, + "loss": 5.8406, + "step": 14341 + }, + { + "epoch": 0.08529593681606243, + "grad_norm": 2.2367610931396484, + "learning_rate": 4.9107915555382236e-05, + "loss": 4.9339, + "step": 14342 + }, + { + "epoch": 0.08530188409934342, + "grad_norm": 2.033160924911499, + "learning_rate": 4.910779188605199e-05, + "loss": 4.8923, + "step": 14343 + }, + { + "epoch": 0.08530783138262442, + "grad_norm": 1.852645993232727, + "learning_rate": 4.910766820830596e-05, + "loss": 5.2208, + "step": 14344 + }, + { + "epoch": 0.08531377866590542, + "grad_norm": 1.9810596704483032, + "learning_rate": 4.910754452214419e-05, + "loss": 5.0119, + "step": 14345 + }, + { + "epoch": 0.08531972594918641, + "grad_norm": 1.92807137966156, + "learning_rate": 4.910742082756673e-05, + "loss": 5.6388, + "step": 14346 + }, + { + "epoch": 0.08532567323246741, + "grad_norm": 1.783923864364624, + "learning_rate": 4.910729712457361e-05, + "loss": 5.2831, + "step": 14347 + }, + { + "epoch": 0.0853316205157484, + "grad_norm": 2.008113145828247, + "learning_rate": 4.91071734131649e-05, + "loss": 5.085, + "step": 14348 + }, + { + "epoch": 0.0853375677990294, + "grad_norm": 2.2313408851623535, + "learning_rate": 4.910704969334061e-05, + "loss": 5.243, + "step": 14349 + }, + { + "epoch": 0.0853435150823104, + "grad_norm": 2.155491590499878, + "learning_rate": 4.9106925965100806e-05, + "loss": 6.0776, + "step": 14350 + }, + { + "epoch": 0.08534946236559139, + "grad_norm": 1.995848536491394, + "learning_rate": 4.910680222844551e-05, + "loss": 5.6763, + "step": 14351 + }, + { + "epoch": 0.0853554096488724, + "grad_norm": 2.033620595932007, + "learning_rate": 4.910667848337479e-05, + "loss": 4.4634, + "step": 14352 + }, + { + "epoch": 0.0853613569321534, + "grad_norm": 2.036668062210083, + "learning_rate": 4.910655472988868e-05, + "loss": 4.6367, + "step": 14353 + }, + { + "epoch": 0.08536730421543438, + "grad_norm": 1.9862895011901855, + "learning_rate": 4.910643096798721e-05, + "loss": 4.4623, + "step": 14354 + }, + { + "epoch": 0.08537325149871539, + "grad_norm": 1.9778163433074951, + "learning_rate": 4.910630719767044e-05, + "loss": 4.3706, + "step": 14355 + }, + { + "epoch": 0.08537919878199639, + "grad_norm": 1.984913945198059, + "learning_rate": 4.9106183418938404e-05, + "loss": 4.4573, + "step": 14356 + }, + { + "epoch": 0.08538514606527738, + "grad_norm": 2.0571017265319824, + "learning_rate": 4.910605963179116e-05, + "loss": 4.2782, + "step": 14357 + }, + { + "epoch": 0.08539109334855838, + "grad_norm": 2.028339147567749, + "learning_rate": 4.910593583622872e-05, + "loss": 4.3874, + "step": 14358 + }, + { + "epoch": 0.08539704063183938, + "grad_norm": 2.03485369682312, + "learning_rate": 4.9105812032251165e-05, + "loss": 4.5877, + "step": 14359 + }, + { + "epoch": 0.08540298791512037, + "grad_norm": 1.950490951538086, + "learning_rate": 4.910568821985851e-05, + "loss": 4.6547, + "step": 14360 + }, + { + "epoch": 0.08540893519840137, + "grad_norm": 2.1270785331726074, + "learning_rate": 4.910556439905081e-05, + "loss": 5.3685, + "step": 14361 + }, + { + "epoch": 0.08541488248168237, + "grad_norm": 2.094545364379883, + "learning_rate": 4.910544056982811e-05, + "loss": 6.1109, + "step": 14362 + }, + { + "epoch": 0.08542082976496336, + "grad_norm": 2.2988197803497314, + "learning_rate": 4.910531673219044e-05, + "loss": 5.4789, + "step": 14363 + }, + { + "epoch": 0.08542677704824436, + "grad_norm": 2.2927358150482178, + "learning_rate": 4.910519288613786e-05, + "loss": 5.3853, + "step": 14364 + }, + { + "epoch": 0.08543272433152536, + "grad_norm": 2.223668098449707, + "learning_rate": 4.910506903167041e-05, + "loss": 5.3572, + "step": 14365 + }, + { + "epoch": 0.08543867161480635, + "grad_norm": 2.0522570610046387, + "learning_rate": 4.910494516878813e-05, + "loss": 5.3581, + "step": 14366 + }, + { + "epoch": 0.08544461889808735, + "grad_norm": 2.4349021911621094, + "learning_rate": 4.910482129749106e-05, + "loss": 5.4082, + "step": 14367 + }, + { + "epoch": 0.08545056618136836, + "grad_norm": 1.976344347000122, + "learning_rate": 4.910469741777924e-05, + "loss": 5.6107, + "step": 14368 + }, + { + "epoch": 0.08545651346464934, + "grad_norm": 1.8476877212524414, + "learning_rate": 4.910457352965272e-05, + "loss": 5.5059, + "step": 14369 + }, + { + "epoch": 0.08546246074793035, + "grad_norm": 1.6204098463058472, + "learning_rate": 4.910444963311155e-05, + "loss": 5.6578, + "step": 14370 + }, + { + "epoch": 0.08546840803121135, + "grad_norm": 1.808021903038025, + "learning_rate": 4.910432572815576e-05, + "loss": 5.8263, + "step": 14371 + }, + { + "epoch": 0.08547435531449234, + "grad_norm": 1.4975682497024536, + "learning_rate": 4.91042018147854e-05, + "loss": 5.582, + "step": 14372 + }, + { + "epoch": 0.08548030259777334, + "grad_norm": 1.644845724105835, + "learning_rate": 4.910407789300051e-05, + "loss": 5.7127, + "step": 14373 + }, + { + "epoch": 0.08548624988105434, + "grad_norm": 1.5433874130249023, + "learning_rate": 4.910395396280114e-05, + "loss": 5.6941, + "step": 14374 + }, + { + "epoch": 0.08549219716433533, + "grad_norm": 1.7267838716506958, + "learning_rate": 4.910383002418732e-05, + "loss": 5.632, + "step": 14375 + }, + { + "epoch": 0.08549814444761633, + "grad_norm": 1.4142215251922607, + "learning_rate": 4.9103706077159116e-05, + "loss": 5.6108, + "step": 14376 + }, + { + "epoch": 0.08550409173089732, + "grad_norm": 1.8514180183410645, + "learning_rate": 4.9103582121716554e-05, + "loss": 5.828, + "step": 14377 + }, + { + "epoch": 0.08551003901417832, + "grad_norm": 1.633837103843689, + "learning_rate": 4.9103458157859674e-05, + "loss": 5.8585, + "step": 14378 + }, + { + "epoch": 0.08551598629745932, + "grad_norm": 1.9934178590774536, + "learning_rate": 4.910333418558853e-05, + "loss": 5.5907, + "step": 14379 + }, + { + "epoch": 0.08552193358074031, + "grad_norm": 1.8934741020202637, + "learning_rate": 4.910321020490316e-05, + "loss": 5.579, + "step": 14380 + }, + { + "epoch": 0.08552788086402131, + "grad_norm": 1.9341318607330322, + "learning_rate": 4.910308621580361e-05, + "loss": 5.8737, + "step": 14381 + }, + { + "epoch": 0.08553382814730232, + "grad_norm": 2.1566226482391357, + "learning_rate": 4.9102962218289915e-05, + "loss": 5.6105, + "step": 14382 + }, + { + "epoch": 0.0855397754305833, + "grad_norm": 1.707112431526184, + "learning_rate": 4.910283821236213e-05, + "loss": 5.6875, + "step": 14383 + }, + { + "epoch": 0.0855457227138643, + "grad_norm": 2.8415439128875732, + "learning_rate": 4.9102714198020296e-05, + "loss": 4.9292, + "step": 14384 + }, + { + "epoch": 0.08555166999714531, + "grad_norm": 2.2043650150299072, + "learning_rate": 4.9102590175264445e-05, + "loss": 5.7264, + "step": 14385 + }, + { + "epoch": 0.0855576172804263, + "grad_norm": 2.2063820362091064, + "learning_rate": 4.9102466144094636e-05, + "loss": 5.1616, + "step": 14386 + }, + { + "epoch": 0.0855635645637073, + "grad_norm": 1.9087328910827637, + "learning_rate": 4.9102342104510903e-05, + "loss": 5.1897, + "step": 14387 + }, + { + "epoch": 0.0855695118469883, + "grad_norm": 1.6418956518173218, + "learning_rate": 4.910221805651329e-05, + "loss": 5.0923, + "step": 14388 + }, + { + "epoch": 0.08557545913026929, + "grad_norm": 1.5215847492218018, + "learning_rate": 4.9102094000101836e-05, + "loss": 4.9602, + "step": 14389 + }, + { + "epoch": 0.08558140641355029, + "grad_norm": 2.249983072280884, + "learning_rate": 4.91019699352766e-05, + "loss": 5.1167, + "step": 14390 + }, + { + "epoch": 0.08558735369683129, + "grad_norm": 1.89960777759552, + "learning_rate": 4.9101845862037615e-05, + "loss": 6.1589, + "step": 14391 + }, + { + "epoch": 0.08559330098011228, + "grad_norm": 1.8243924379348755, + "learning_rate": 4.910172178038492e-05, + "loss": 5.8661, + "step": 14392 + }, + { + "epoch": 0.08559924826339328, + "grad_norm": 1.8313872814178467, + "learning_rate": 4.9101597690318567e-05, + "loss": 5.6129, + "step": 14393 + }, + { + "epoch": 0.08560519554667428, + "grad_norm": 1.8717663288116455, + "learning_rate": 4.9101473591838593e-05, + "loss": 5.6346, + "step": 14394 + }, + { + "epoch": 0.08561114282995527, + "grad_norm": 1.6444953680038452, + "learning_rate": 4.910134948494504e-05, + "loss": 5.7237, + "step": 14395 + }, + { + "epoch": 0.08561709011323627, + "grad_norm": 1.8138811588287354, + "learning_rate": 4.910122536963796e-05, + "loss": 5.7682, + "step": 14396 + }, + { + "epoch": 0.08562303739651728, + "grad_norm": 2.629892110824585, + "learning_rate": 4.9101101245917394e-05, + "loss": 5.89, + "step": 14397 + }, + { + "epoch": 0.08562898467979826, + "grad_norm": 1.8197498321533203, + "learning_rate": 4.910097711378337e-05, + "loss": 5.6768, + "step": 14398 + }, + { + "epoch": 0.08563493196307927, + "grad_norm": 2.1121623516082764, + "learning_rate": 4.9100852973235955e-05, + "loss": 5.672, + "step": 14399 + }, + { + "epoch": 0.08564087924636027, + "grad_norm": 1.8823927640914917, + "learning_rate": 4.910072882427518e-05, + "loss": 5.6717, + "step": 14400 + }, + { + "epoch": 0.08564682652964126, + "grad_norm": 2.602023124694824, + "learning_rate": 4.9100604666901084e-05, + "loss": 5.4193, + "step": 14401 + }, + { + "epoch": 0.08565277381292226, + "grad_norm": 2.420342445373535, + "learning_rate": 4.910048050111372e-05, + "loss": 5.2811, + "step": 14402 + }, + { + "epoch": 0.08565872109620326, + "grad_norm": 2.593797206878662, + "learning_rate": 4.910035632691313e-05, + "loss": 5.2942, + "step": 14403 + }, + { + "epoch": 0.08566466837948425, + "grad_norm": 1.9292038679122925, + "learning_rate": 4.910023214429935e-05, + "loss": 5.0231, + "step": 14404 + }, + { + "epoch": 0.08567061566276525, + "grad_norm": 2.159935712814331, + "learning_rate": 4.9100107953272434e-05, + "loss": 4.8778, + "step": 14405 + }, + { + "epoch": 0.08567656294604625, + "grad_norm": 2.2363314628601074, + "learning_rate": 4.9099983753832416e-05, + "loss": 4.8828, + "step": 14406 + }, + { + "epoch": 0.08568251022932724, + "grad_norm": 2.149986505508423, + "learning_rate": 4.909985954597934e-05, + "loss": 5.4351, + "step": 14407 + }, + { + "epoch": 0.08568845751260824, + "grad_norm": 2.05991268157959, + "learning_rate": 4.909973532971325e-05, + "loss": 5.3759, + "step": 14408 + }, + { + "epoch": 0.08569440479588923, + "grad_norm": 2.0030369758605957, + "learning_rate": 4.9099611105034196e-05, + "loss": 5.5126, + "step": 14409 + }, + { + "epoch": 0.08570035207917023, + "grad_norm": 1.7764592170715332, + "learning_rate": 4.9099486871942216e-05, + "loss": 5.1808, + "step": 14410 + }, + { + "epoch": 0.08570629936245124, + "grad_norm": 1.8827999830245972, + "learning_rate": 4.909936263043735e-05, + "loss": 5.5076, + "step": 14411 + }, + { + "epoch": 0.08571224664573222, + "grad_norm": 2.0153589248657227, + "learning_rate": 4.9099238380519655e-05, + "loss": 5.2955, + "step": 14412 + }, + { + "epoch": 0.08571819392901323, + "grad_norm": 2.0739622116088867, + "learning_rate": 4.909911412218916e-05, + "loss": 5.2463, + "step": 14413 + }, + { + "epoch": 0.08572414121229423, + "grad_norm": 2.4668188095092773, + "learning_rate": 4.909898985544591e-05, + "loss": 5.1859, + "step": 14414 + }, + { + "epoch": 0.08573008849557522, + "grad_norm": 2.245546340942383, + "learning_rate": 4.9098865580289956e-05, + "loss": 5.5472, + "step": 14415 + }, + { + "epoch": 0.08573603577885622, + "grad_norm": 2.244086980819702, + "learning_rate": 4.909874129672133e-05, + "loss": 5.5531, + "step": 14416 + }, + { + "epoch": 0.08574198306213722, + "grad_norm": 2.2983627319335938, + "learning_rate": 4.909861700474009e-05, + "loss": 5.6178, + "step": 14417 + }, + { + "epoch": 0.08574793034541821, + "grad_norm": 1.9792771339416504, + "learning_rate": 4.9098492704346265e-05, + "loss": 5.364, + "step": 14418 + }, + { + "epoch": 0.08575387762869921, + "grad_norm": 1.8312867879867554, + "learning_rate": 4.9098368395539914e-05, + "loss": 5.3105, + "step": 14419 + }, + { + "epoch": 0.08575982491198021, + "grad_norm": 1.8415101766586304, + "learning_rate": 4.909824407832107e-05, + "loss": 5.3182, + "step": 14420 + }, + { + "epoch": 0.0857657721952612, + "grad_norm": 1.965531349182129, + "learning_rate": 4.909811975268977e-05, + "loss": 5.496, + "step": 14421 + }, + { + "epoch": 0.0857717194785422, + "grad_norm": 1.9116218090057373, + "learning_rate": 4.909799541864607e-05, + "loss": 5.2531, + "step": 14422 + }, + { + "epoch": 0.0857776667618232, + "grad_norm": 1.863571286201477, + "learning_rate": 4.909787107619001e-05, + "loss": 5.535, + "step": 14423 + }, + { + "epoch": 0.08578361404510419, + "grad_norm": 1.966637372970581, + "learning_rate": 4.909774672532163e-05, + "loss": 5.5072, + "step": 14424 + }, + { + "epoch": 0.0857895613283852, + "grad_norm": 1.9251974821090698, + "learning_rate": 4.9097622366040974e-05, + "loss": 5.1989, + "step": 14425 + }, + { + "epoch": 0.0857955086116662, + "grad_norm": 1.6277741193771362, + "learning_rate": 4.90974979983481e-05, + "loss": 5.357, + "step": 14426 + }, + { + "epoch": 0.08580145589494718, + "grad_norm": 1.6832202672958374, + "learning_rate": 4.909737362224302e-05, + "loss": 5.3485, + "step": 14427 + }, + { + "epoch": 0.08580740317822819, + "grad_norm": 1.7656053304672241, + "learning_rate": 4.909724923772581e-05, + "loss": 5.3965, + "step": 14428 + }, + { + "epoch": 0.08581335046150919, + "grad_norm": 1.748529076576233, + "learning_rate": 4.909712484479649e-05, + "loss": 5.3895, + "step": 14429 + }, + { + "epoch": 0.08581929774479018, + "grad_norm": 2.1317241191864014, + "learning_rate": 4.909700044345511e-05, + "loss": 5.1703, + "step": 14430 + }, + { + "epoch": 0.08582524502807118, + "grad_norm": 2.6896255016326904, + "learning_rate": 4.909687603370172e-05, + "loss": 5.3942, + "step": 14431 + }, + { + "epoch": 0.08583119231135218, + "grad_norm": 2.1061718463897705, + "learning_rate": 4.909675161553637e-05, + "loss": 5.3545, + "step": 14432 + }, + { + "epoch": 0.08583713959463317, + "grad_norm": 2.7201108932495117, + "learning_rate": 4.9096627188959085e-05, + "loss": 4.9659, + "step": 14433 + }, + { + "epoch": 0.08584308687791417, + "grad_norm": 2.0352578163146973, + "learning_rate": 4.909650275396991e-05, + "loss": 5.2667, + "step": 14434 + }, + { + "epoch": 0.08584903416119517, + "grad_norm": 1.6980863809585571, + "learning_rate": 4.9096378310568905e-05, + "loss": 5.4036, + "step": 14435 + }, + { + "epoch": 0.08585498144447616, + "grad_norm": 1.677700161933899, + "learning_rate": 4.90962538587561e-05, + "loss": 5.3104, + "step": 14436 + }, + { + "epoch": 0.08586092872775716, + "grad_norm": 1.995198369026184, + "learning_rate": 4.9096129398531534e-05, + "loss": 5.4235, + "step": 14437 + }, + { + "epoch": 0.08586687601103815, + "grad_norm": 2.136059284210205, + "learning_rate": 4.909600492989527e-05, + "loss": 5.1867, + "step": 14438 + }, + { + "epoch": 0.08587282329431915, + "grad_norm": 1.9917269945144653, + "learning_rate": 4.909588045284733e-05, + "loss": 5.5507, + "step": 14439 + }, + { + "epoch": 0.08587877057760016, + "grad_norm": 1.7341989278793335, + "learning_rate": 4.909575596738777e-05, + "loss": 5.4782, + "step": 14440 + }, + { + "epoch": 0.08588471786088114, + "grad_norm": 2.058920383453369, + "learning_rate": 4.9095631473516635e-05, + "loss": 5.51, + "step": 14441 + }, + { + "epoch": 0.08589066514416215, + "grad_norm": 1.7856314182281494, + "learning_rate": 4.9095506971233965e-05, + "loss": 5.4189, + "step": 14442 + }, + { + "epoch": 0.08589661242744315, + "grad_norm": 1.5290231704711914, + "learning_rate": 4.90953824605398e-05, + "loss": 5.4398, + "step": 14443 + }, + { + "epoch": 0.08590255971072414, + "grad_norm": 1.6302571296691895, + "learning_rate": 4.909525794143418e-05, + "loss": 5.4468, + "step": 14444 + }, + { + "epoch": 0.08590850699400514, + "grad_norm": 1.9898178577423096, + "learning_rate": 4.909513341391716e-05, + "loss": 5.5514, + "step": 14445 + }, + { + "epoch": 0.08591445427728614, + "grad_norm": 2.539473533630371, + "learning_rate": 4.909500887798878e-05, + "loss": 5.0985, + "step": 14446 + }, + { + "epoch": 0.08592040156056713, + "grad_norm": 2.109477996826172, + "learning_rate": 4.909488433364907e-05, + "loss": 5.1304, + "step": 14447 + }, + { + "epoch": 0.08592634884384813, + "grad_norm": 1.627647042274475, + "learning_rate": 4.9094759780898096e-05, + "loss": 5.7772, + "step": 14448 + }, + { + "epoch": 0.08593229612712913, + "grad_norm": 1.7776944637298584, + "learning_rate": 4.909463521973588e-05, + "loss": 6.3219, + "step": 14449 + }, + { + "epoch": 0.08593824341041012, + "grad_norm": 1.8342489004135132, + "learning_rate": 4.909451065016249e-05, + "loss": 5.7136, + "step": 14450 + }, + { + "epoch": 0.08594419069369112, + "grad_norm": 2.109060764312744, + "learning_rate": 4.9094386072177945e-05, + "loss": 5.449, + "step": 14451 + }, + { + "epoch": 0.08595013797697212, + "grad_norm": 2.5615251064300537, + "learning_rate": 4.909426148578231e-05, + "loss": 4.7441, + "step": 14452 + }, + { + "epoch": 0.08595608526025311, + "grad_norm": 1.7670586109161377, + "learning_rate": 4.909413689097561e-05, + "loss": 5.4488, + "step": 14453 + }, + { + "epoch": 0.08596203254353411, + "grad_norm": 1.9190126657485962, + "learning_rate": 4.909401228775789e-05, + "loss": 5.3128, + "step": 14454 + }, + { + "epoch": 0.08596797982681512, + "grad_norm": 1.679866909980774, + "learning_rate": 4.90938876761292e-05, + "loss": 5.4575, + "step": 14455 + }, + { + "epoch": 0.0859739271100961, + "grad_norm": 1.6199991703033447, + "learning_rate": 4.909376305608959e-05, + "loss": 5.541, + "step": 14456 + }, + { + "epoch": 0.0859798743933771, + "grad_norm": 1.876761794090271, + "learning_rate": 4.9093638427639096e-05, + "loss": 5.7256, + "step": 14457 + }, + { + "epoch": 0.08598582167665811, + "grad_norm": 1.7833212614059448, + "learning_rate": 4.909351379077776e-05, + "loss": 5.6512, + "step": 14458 + }, + { + "epoch": 0.0859917689599391, + "grad_norm": 2.249696731567383, + "learning_rate": 4.909338914550562e-05, + "loss": 5.6517, + "step": 14459 + }, + { + "epoch": 0.0859977162432201, + "grad_norm": 1.8037621974945068, + "learning_rate": 4.909326449182273e-05, + "loss": 5.7564, + "step": 14460 + }, + { + "epoch": 0.0860036635265011, + "grad_norm": 1.4057918787002563, + "learning_rate": 4.909313982972914e-05, + "loss": 5.6259, + "step": 14461 + }, + { + "epoch": 0.08600961080978209, + "grad_norm": 1.5501145124435425, + "learning_rate": 4.9093015159224874e-05, + "loss": 5.6626, + "step": 14462 + }, + { + "epoch": 0.08601555809306309, + "grad_norm": 1.8189458847045898, + "learning_rate": 4.909289048030999e-05, + "loss": 5.4682, + "step": 14463 + }, + { + "epoch": 0.08602150537634409, + "grad_norm": 1.6819778680801392, + "learning_rate": 4.909276579298452e-05, + "loss": 5.3511, + "step": 14464 + }, + { + "epoch": 0.08602745265962508, + "grad_norm": 1.8401011228561401, + "learning_rate": 4.909264109724853e-05, + "loss": 5.531, + "step": 14465 + }, + { + "epoch": 0.08603339994290608, + "grad_norm": 1.6418116092681885, + "learning_rate": 4.909251639310203e-05, + "loss": 5.2885, + "step": 14466 + }, + { + "epoch": 0.08603934722618707, + "grad_norm": 1.4331059455871582, + "learning_rate": 4.909239168054509e-05, + "loss": 5.2792, + "step": 14467 + }, + { + "epoch": 0.08604529450946807, + "grad_norm": 1.4047703742980957, + "learning_rate": 4.9092266959577745e-05, + "loss": 5.2179, + "step": 14468 + }, + { + "epoch": 0.08605124179274908, + "grad_norm": 1.641930103302002, + "learning_rate": 4.909214223020003e-05, + "loss": 5.475, + "step": 14469 + }, + { + "epoch": 0.08605718907603006, + "grad_norm": 1.9879019260406494, + "learning_rate": 4.909201749241201e-05, + "loss": 5.3893, + "step": 14470 + }, + { + "epoch": 0.08606313635931107, + "grad_norm": 1.4790434837341309, + "learning_rate": 4.909189274621371e-05, + "loss": 5.3011, + "step": 14471 + }, + { + "epoch": 0.08606908364259207, + "grad_norm": 1.4283875226974487, + "learning_rate": 4.909176799160518e-05, + "loss": 5.4181, + "step": 14472 + }, + { + "epoch": 0.08607503092587306, + "grad_norm": 1.6676496267318726, + "learning_rate": 4.909164322858646e-05, + "loss": 5.4682, + "step": 14473 + }, + { + "epoch": 0.08608097820915406, + "grad_norm": 1.4858648777008057, + "learning_rate": 4.9091518457157605e-05, + "loss": 5.3073, + "step": 14474 + }, + { + "epoch": 0.08608692549243506, + "grad_norm": 1.5135246515274048, + "learning_rate": 4.909139367731864e-05, + "loss": 5.4039, + "step": 14475 + }, + { + "epoch": 0.08609287277571605, + "grad_norm": 1.353051781654358, + "learning_rate": 4.909126888906962e-05, + "loss": 5.5455, + "step": 14476 + }, + { + "epoch": 0.08609882005899705, + "grad_norm": 1.2824941873550415, + "learning_rate": 4.909114409241059e-05, + "loss": 5.6465, + "step": 14477 + }, + { + "epoch": 0.08610476734227805, + "grad_norm": 1.3398411273956299, + "learning_rate": 4.909101928734159e-05, + "loss": 5.5299, + "step": 14478 + }, + { + "epoch": 0.08611071462555904, + "grad_norm": 1.167169213294983, + "learning_rate": 4.909089447386266e-05, + "loss": 5.4376, + "step": 14479 + }, + { + "epoch": 0.08611666190884004, + "grad_norm": 1.2469842433929443, + "learning_rate": 4.9090769651973846e-05, + "loss": 5.4945, + "step": 14480 + }, + { + "epoch": 0.08612260919212104, + "grad_norm": 1.3025931119918823, + "learning_rate": 4.90906448216752e-05, + "loss": 5.3283, + "step": 14481 + }, + { + "epoch": 0.08612855647540203, + "grad_norm": 1.597223162651062, + "learning_rate": 4.909051998296675e-05, + "loss": 5.0729, + "step": 14482 + }, + { + "epoch": 0.08613450375868303, + "grad_norm": 1.53999662399292, + "learning_rate": 4.909039513584856e-05, + "loss": 5.2956, + "step": 14483 + }, + { + "epoch": 0.08614045104196404, + "grad_norm": 1.462623953819275, + "learning_rate": 4.909027028032066e-05, + "loss": 5.2748, + "step": 14484 + }, + { + "epoch": 0.08614639832524502, + "grad_norm": 1.380196452140808, + "learning_rate": 4.909014541638309e-05, + "loss": 5.4184, + "step": 14485 + }, + { + "epoch": 0.08615234560852603, + "grad_norm": 1.4531115293502808, + "learning_rate": 4.90900205440359e-05, + "loss": 5.2064, + "step": 14486 + }, + { + "epoch": 0.08615829289180703, + "grad_norm": 1.406848430633545, + "learning_rate": 4.9089895663279136e-05, + "loss": 5.2019, + "step": 14487 + }, + { + "epoch": 0.08616424017508802, + "grad_norm": 1.3956660032272339, + "learning_rate": 4.908977077411283e-05, + "loss": 5.128, + "step": 14488 + }, + { + "epoch": 0.08617018745836902, + "grad_norm": 1.4705348014831543, + "learning_rate": 4.9089645876537044e-05, + "loss": 5.3451, + "step": 14489 + }, + { + "epoch": 0.08617613474165002, + "grad_norm": 1.4385737180709839, + "learning_rate": 4.9089520970551804e-05, + "loss": 5.0668, + "step": 14490 + }, + { + "epoch": 0.08618208202493101, + "grad_norm": 1.584478735923767, + "learning_rate": 4.908939605615717e-05, + "loss": 4.9412, + "step": 14491 + }, + { + "epoch": 0.08618802930821201, + "grad_norm": 1.2740134000778198, + "learning_rate": 4.908927113335317e-05, + "loss": 4.8684, + "step": 14492 + }, + { + "epoch": 0.08619397659149301, + "grad_norm": 1.5669810771942139, + "learning_rate": 4.9089146202139856e-05, + "loss": 5.1903, + "step": 14493 + }, + { + "epoch": 0.086199923874774, + "grad_norm": 1.6113348007202148, + "learning_rate": 4.908902126251727e-05, + "loss": 5.1217, + "step": 14494 + }, + { + "epoch": 0.086205871158055, + "grad_norm": 1.6401634216308594, + "learning_rate": 4.908889631448546e-05, + "loss": 5.2241, + "step": 14495 + }, + { + "epoch": 0.08621181844133599, + "grad_norm": 1.522625207901001, + "learning_rate": 4.9088771358044456e-05, + "loss": 5.1858, + "step": 14496 + }, + { + "epoch": 0.086217765724617, + "grad_norm": 1.3802037239074707, + "learning_rate": 4.9088646393194316e-05, + "loss": 5.2349, + "step": 14497 + }, + { + "epoch": 0.086223713007898, + "grad_norm": 1.5226190090179443, + "learning_rate": 4.9088521419935076e-05, + "loss": 5.2612, + "step": 14498 + }, + { + "epoch": 0.08622966029117898, + "grad_norm": 1.3293451070785522, + "learning_rate": 4.9088396438266785e-05, + "loss": 5.169, + "step": 14499 + }, + { + "epoch": 0.08623560757445999, + "grad_norm": 1.334403157234192, + "learning_rate": 4.908827144818948e-05, + "loss": 5.1139, + "step": 14500 + }, + { + "epoch": 0.08624155485774099, + "grad_norm": 1.5195876359939575, + "learning_rate": 4.908814644970321e-05, + "loss": 5.1473, + "step": 14501 + }, + { + "epoch": 0.08624750214102198, + "grad_norm": 1.3367561101913452, + "learning_rate": 4.908802144280802e-05, + "loss": 5.1148, + "step": 14502 + }, + { + "epoch": 0.08625344942430298, + "grad_norm": 1.485002875328064, + "learning_rate": 4.908789642750395e-05, + "loss": 5.0796, + "step": 14503 + }, + { + "epoch": 0.08625939670758398, + "grad_norm": 1.3907506465911865, + "learning_rate": 4.9087771403791037e-05, + "loss": 5.1382, + "step": 14504 + }, + { + "epoch": 0.08626534399086497, + "grad_norm": 1.5129644870758057, + "learning_rate": 4.9087646371669336e-05, + "loss": 5.037, + "step": 14505 + }, + { + "epoch": 0.08627129127414597, + "grad_norm": 1.4666407108306885, + "learning_rate": 4.9087521331138896e-05, + "loss": 5.1877, + "step": 14506 + }, + { + "epoch": 0.08627723855742697, + "grad_norm": 1.5812102556228638, + "learning_rate": 4.9087396282199736e-05, + "loss": 5.2588, + "step": 14507 + }, + { + "epoch": 0.08628318584070796, + "grad_norm": 2.976067066192627, + "learning_rate": 4.908727122485193e-05, + "loss": 4.7477, + "step": 14508 + }, + { + "epoch": 0.08628913312398896, + "grad_norm": 1.5401511192321777, + "learning_rate": 4.90871461590955e-05, + "loss": 5.2242, + "step": 14509 + }, + { + "epoch": 0.08629508040726996, + "grad_norm": 1.3266774415969849, + "learning_rate": 4.9087021084930486e-05, + "loss": 5.2792, + "step": 14510 + }, + { + "epoch": 0.08630102769055095, + "grad_norm": 1.3292385339736938, + "learning_rate": 4.9086896002356956e-05, + "loss": 5.2434, + "step": 14511 + }, + { + "epoch": 0.08630697497383195, + "grad_norm": 1.237931489944458, + "learning_rate": 4.908677091137493e-05, + "loss": 5.2173, + "step": 14512 + }, + { + "epoch": 0.08631292225711296, + "grad_norm": 1.2488665580749512, + "learning_rate": 4.908664581198447e-05, + "loss": 5.1262, + "step": 14513 + }, + { + "epoch": 0.08631886954039394, + "grad_norm": 1.5126835107803345, + "learning_rate": 4.9086520704185604e-05, + "loss": 5.2258, + "step": 14514 + }, + { + "epoch": 0.08632481682367495, + "grad_norm": 1.3975410461425781, + "learning_rate": 4.908639558797839e-05, + "loss": 4.9266, + "step": 14515 + }, + { + "epoch": 0.08633076410695595, + "grad_norm": 1.2499217987060547, + "learning_rate": 4.908627046336285e-05, + "loss": 5.1564, + "step": 14516 + }, + { + "epoch": 0.08633671139023694, + "grad_norm": 1.6880254745483398, + "learning_rate": 4.908614533033905e-05, + "loss": 5.0906, + "step": 14517 + }, + { + "epoch": 0.08634265867351794, + "grad_norm": 1.498849630355835, + "learning_rate": 4.908602018890702e-05, + "loss": 5.0771, + "step": 14518 + }, + { + "epoch": 0.08634860595679894, + "grad_norm": 1.9192509651184082, + "learning_rate": 4.908589503906682e-05, + "loss": 5.2173, + "step": 14519 + }, + { + "epoch": 0.08635455324007993, + "grad_norm": 1.8038657903671265, + "learning_rate": 4.9085769880818475e-05, + "loss": 5.3003, + "step": 14520 + }, + { + "epoch": 0.08636050052336093, + "grad_norm": 1.3908354043960571, + "learning_rate": 4.9085644714162037e-05, + "loss": 5.1943, + "step": 14521 + }, + { + "epoch": 0.08636644780664193, + "grad_norm": 1.336630940437317, + "learning_rate": 4.9085519539097556e-05, + "loss": 5.2693, + "step": 14522 + }, + { + "epoch": 0.08637239508992292, + "grad_norm": 1.6008005142211914, + "learning_rate": 4.908539435562506e-05, + "loss": 5.2779, + "step": 14523 + }, + { + "epoch": 0.08637834237320392, + "grad_norm": 1.4620133638381958, + "learning_rate": 4.9085269163744605e-05, + "loss": 5.0467, + "step": 14524 + }, + { + "epoch": 0.08638428965648491, + "grad_norm": 1.5825145244598389, + "learning_rate": 4.9085143963456236e-05, + "loss": 4.9838, + "step": 14525 + }, + { + "epoch": 0.08639023693976591, + "grad_norm": 1.751550555229187, + "learning_rate": 4.9085018754759995e-05, + "loss": 5.0467, + "step": 14526 + }, + { + "epoch": 0.08639618422304692, + "grad_norm": 1.5967564582824707, + "learning_rate": 4.908489353765591e-05, + "loss": 5.0685, + "step": 14527 + }, + { + "epoch": 0.0864021315063279, + "grad_norm": 1.646323800086975, + "learning_rate": 4.908476831214405e-05, + "loss": 4.9341, + "step": 14528 + }, + { + "epoch": 0.0864080787896089, + "grad_norm": 1.482224464416504, + "learning_rate": 4.908464307822443e-05, + "loss": 4.9893, + "step": 14529 + }, + { + "epoch": 0.08641402607288991, + "grad_norm": 1.5190521478652954, + "learning_rate": 4.908451783589713e-05, + "loss": 5.0747, + "step": 14530 + }, + { + "epoch": 0.0864199733561709, + "grad_norm": 1.41251802444458, + "learning_rate": 4.908439258516215e-05, + "loss": 5.0098, + "step": 14531 + }, + { + "epoch": 0.0864259206394519, + "grad_norm": 1.678646445274353, + "learning_rate": 4.9084267326019576e-05, + "loss": 5.0224, + "step": 14532 + }, + { + "epoch": 0.0864318679227329, + "grad_norm": 1.5203865766525269, + "learning_rate": 4.908414205846943e-05, + "loss": 5.109, + "step": 14533 + }, + { + "epoch": 0.08643781520601389, + "grad_norm": 1.5437216758728027, + "learning_rate": 4.9084016782511754e-05, + "loss": 5.1168, + "step": 14534 + }, + { + "epoch": 0.08644376248929489, + "grad_norm": 1.3460302352905273, + "learning_rate": 4.90838914981466e-05, + "loss": 5.1038, + "step": 14535 + }, + { + "epoch": 0.08644970977257589, + "grad_norm": 1.4768339395523071, + "learning_rate": 4.908376620537401e-05, + "loss": 5.129, + "step": 14536 + }, + { + "epoch": 0.08645565705585688, + "grad_norm": 1.2669035196304321, + "learning_rate": 4.9083640904194025e-05, + "loss": 5.0856, + "step": 14537 + }, + { + "epoch": 0.08646160433913788, + "grad_norm": 1.5692600011825562, + "learning_rate": 4.9083515594606686e-05, + "loss": 5.0897, + "step": 14538 + }, + { + "epoch": 0.08646755162241888, + "grad_norm": 1.4857045412063599, + "learning_rate": 4.9083390276612044e-05, + "loss": 4.9654, + "step": 14539 + }, + { + "epoch": 0.08647349890569987, + "grad_norm": 1.5537325143814087, + "learning_rate": 4.908326495021014e-05, + "loss": 5.0431, + "step": 14540 + }, + { + "epoch": 0.08647944618898087, + "grad_norm": 1.483089566230774, + "learning_rate": 4.908313961540101e-05, + "loss": 5.0737, + "step": 14541 + }, + { + "epoch": 0.08648539347226188, + "grad_norm": 1.5829899311065674, + "learning_rate": 4.9083014272184716e-05, + "loss": 4.9844, + "step": 14542 + }, + { + "epoch": 0.08649134075554286, + "grad_norm": 1.3660348653793335, + "learning_rate": 4.908288892056128e-05, + "loss": 5.0384, + "step": 14543 + }, + { + "epoch": 0.08649728803882387, + "grad_norm": 1.3721328973770142, + "learning_rate": 4.9082763560530764e-05, + "loss": 5.0993, + "step": 14544 + }, + { + "epoch": 0.08650323532210487, + "grad_norm": 1.412381887435913, + "learning_rate": 4.90826381920932e-05, + "loss": 4.9359, + "step": 14545 + }, + { + "epoch": 0.08650918260538586, + "grad_norm": 1.5164285898208618, + "learning_rate": 4.9082512815248635e-05, + "loss": 5.0156, + "step": 14546 + }, + { + "epoch": 0.08651512988866686, + "grad_norm": 1.5244861841201782, + "learning_rate": 4.9082387429997117e-05, + "loss": 5.0719, + "step": 14547 + }, + { + "epoch": 0.08652107717194786, + "grad_norm": 1.304221510887146, + "learning_rate": 4.908226203633869e-05, + "loss": 4.9553, + "step": 14548 + }, + { + "epoch": 0.08652702445522885, + "grad_norm": 1.328220009803772, + "learning_rate": 4.908213663427338e-05, + "loss": 4.9761, + "step": 14549 + }, + { + "epoch": 0.08653297173850985, + "grad_norm": 1.4459906816482544, + "learning_rate": 4.908201122380126e-05, + "loss": 5.0422, + "step": 14550 + }, + { + "epoch": 0.08653891902179085, + "grad_norm": 1.5402530431747437, + "learning_rate": 4.908188580492235e-05, + "loss": 4.8856, + "step": 14551 + }, + { + "epoch": 0.08654486630507184, + "grad_norm": 1.6573606729507446, + "learning_rate": 4.90817603776367e-05, + "loss": 5.0958, + "step": 14552 + }, + { + "epoch": 0.08655081358835284, + "grad_norm": 1.5214189291000366, + "learning_rate": 4.9081634941944365e-05, + "loss": 4.9494, + "step": 14553 + }, + { + "epoch": 0.08655676087163383, + "grad_norm": 1.4977836608886719, + "learning_rate": 4.908150949784538e-05, + "loss": 4.9166, + "step": 14554 + }, + { + "epoch": 0.08656270815491483, + "grad_norm": 1.4952701330184937, + "learning_rate": 4.908138404533979e-05, + "loss": 4.9371, + "step": 14555 + }, + { + "epoch": 0.08656865543819584, + "grad_norm": 1.2652736902236938, + "learning_rate": 4.9081258584427626e-05, + "loss": 4.9424, + "step": 14556 + }, + { + "epoch": 0.08657460272147682, + "grad_norm": 1.4386261701583862, + "learning_rate": 4.908113311510895e-05, + "loss": 4.8909, + "step": 14557 + }, + { + "epoch": 0.08658055000475783, + "grad_norm": 1.4800533056259155, + "learning_rate": 4.90810076373838e-05, + "loss": 4.9226, + "step": 14558 + }, + { + "epoch": 0.08658649728803883, + "grad_norm": 1.4734489917755127, + "learning_rate": 4.908088215125222e-05, + "loss": 4.9774, + "step": 14559 + }, + { + "epoch": 0.08659244457131982, + "grad_norm": 1.47382390499115, + "learning_rate": 4.9080756656714245e-05, + "loss": 4.9001, + "step": 14560 + }, + { + "epoch": 0.08659839185460082, + "grad_norm": 1.4358749389648438, + "learning_rate": 4.908063115376994e-05, + "loss": 4.8537, + "step": 14561 + }, + { + "epoch": 0.08660433913788182, + "grad_norm": 1.3895947933197021, + "learning_rate": 4.908050564241933e-05, + "loss": 4.9445, + "step": 14562 + }, + { + "epoch": 0.08661028642116281, + "grad_norm": 1.6166354417800903, + "learning_rate": 4.908038012266246e-05, + "loss": 4.9447, + "step": 14563 + }, + { + "epoch": 0.08661623370444381, + "grad_norm": 1.4621998071670532, + "learning_rate": 4.908025459449938e-05, + "loss": 5.0405, + "step": 14564 + }, + { + "epoch": 0.08662218098772481, + "grad_norm": 1.4160699844360352, + "learning_rate": 4.908012905793013e-05, + "loss": 5.1246, + "step": 14565 + }, + { + "epoch": 0.0866281282710058, + "grad_norm": 1.3748950958251953, + "learning_rate": 4.9080003512954756e-05, + "loss": 5.0856, + "step": 14566 + }, + { + "epoch": 0.0866340755542868, + "grad_norm": 1.5496206283569336, + "learning_rate": 4.9079877959573303e-05, + "loss": 5.1539, + "step": 14567 + }, + { + "epoch": 0.0866400228375678, + "grad_norm": 1.2577475309371948, + "learning_rate": 4.9079752397785814e-05, + "loss": 5.033, + "step": 14568 + }, + { + "epoch": 0.08664597012084879, + "grad_norm": 1.3565775156021118, + "learning_rate": 4.9079626827592336e-05, + "loss": 4.977, + "step": 14569 + }, + { + "epoch": 0.0866519174041298, + "grad_norm": 1.869673252105713, + "learning_rate": 4.90795012489929e-05, + "loss": 5.0452, + "step": 14570 + }, + { + "epoch": 0.0866578646874108, + "grad_norm": 1.3931822776794434, + "learning_rate": 4.907937566198757e-05, + "loss": 5.0182, + "step": 14571 + }, + { + "epoch": 0.08666381197069178, + "grad_norm": 1.5796258449554443, + "learning_rate": 4.907925006657637e-05, + "loss": 5.0167, + "step": 14572 + }, + { + "epoch": 0.08666975925397279, + "grad_norm": 1.439174771308899, + "learning_rate": 4.9079124462759356e-05, + "loss": 5.0223, + "step": 14573 + }, + { + "epoch": 0.08667570653725379, + "grad_norm": 1.5269712209701538, + "learning_rate": 4.907899885053657e-05, + "loss": 5.0726, + "step": 14574 + }, + { + "epoch": 0.08668165382053478, + "grad_norm": 1.6334160566329956, + "learning_rate": 4.9078873229908054e-05, + "loss": 4.902, + "step": 14575 + }, + { + "epoch": 0.08668760110381578, + "grad_norm": 1.2883020639419556, + "learning_rate": 4.9078747600873846e-05, + "loss": 5.0168, + "step": 14576 + }, + { + "epoch": 0.08669354838709678, + "grad_norm": 1.3399035930633545, + "learning_rate": 4.9078621963434e-05, + "loss": 5.1285, + "step": 14577 + }, + { + "epoch": 0.08669949567037777, + "grad_norm": 1.6066272258758545, + "learning_rate": 4.9078496317588556e-05, + "loss": 5.1761, + "step": 14578 + }, + { + "epoch": 0.08670544295365877, + "grad_norm": 1.5316112041473389, + "learning_rate": 4.907837066333756e-05, + "loss": 4.9691, + "step": 14579 + }, + { + "epoch": 0.08671139023693977, + "grad_norm": 1.2680541276931763, + "learning_rate": 4.907824500068105e-05, + "loss": 4.984, + "step": 14580 + }, + { + "epoch": 0.08671733752022076, + "grad_norm": 1.3451861143112183, + "learning_rate": 4.9078119329619076e-05, + "loss": 5.1079, + "step": 14581 + }, + { + "epoch": 0.08672328480350176, + "grad_norm": 1.4813716411590576, + "learning_rate": 4.907799365015168e-05, + "loss": 5.0822, + "step": 14582 + }, + { + "epoch": 0.08672923208678275, + "grad_norm": 1.2526417970657349, + "learning_rate": 4.90778679622789e-05, + "loss": 5.0981, + "step": 14583 + }, + { + "epoch": 0.08673517937006375, + "grad_norm": 1.320970058441162, + "learning_rate": 4.907774226600079e-05, + "loss": 5.2046, + "step": 14584 + }, + { + "epoch": 0.08674112665334476, + "grad_norm": 1.4376531839370728, + "learning_rate": 4.907761656131739e-05, + "loss": 5.0422, + "step": 14585 + }, + { + "epoch": 0.08674707393662574, + "grad_norm": 1.3290382623672485, + "learning_rate": 4.907749084822873e-05, + "loss": 4.9587, + "step": 14586 + }, + { + "epoch": 0.08675302121990675, + "grad_norm": 1.4613630771636963, + "learning_rate": 4.907736512673489e-05, + "loss": 5.0141, + "step": 14587 + }, + { + "epoch": 0.08675896850318775, + "grad_norm": 1.2996604442596436, + "learning_rate": 4.907723939683587e-05, + "loss": 5.0881, + "step": 14588 + }, + { + "epoch": 0.08676491578646874, + "grad_norm": 1.5718237161636353, + "learning_rate": 4.907711365853174e-05, + "loss": 5.0104, + "step": 14589 + }, + { + "epoch": 0.08677086306974974, + "grad_norm": 1.5009227991104126, + "learning_rate": 4.907698791182255e-05, + "loss": 4.9257, + "step": 14590 + }, + { + "epoch": 0.08677681035303074, + "grad_norm": 1.4179331064224243, + "learning_rate": 4.907686215670831e-05, + "loss": 5.0209, + "step": 14591 + }, + { + "epoch": 0.08678275763631173, + "grad_norm": 1.3447542190551758, + "learning_rate": 4.9076736393189105e-05, + "loss": 5.0633, + "step": 14592 + }, + { + "epoch": 0.08678870491959273, + "grad_norm": 1.4221898317337036, + "learning_rate": 4.907661062126495e-05, + "loss": 4.907, + "step": 14593 + }, + { + "epoch": 0.08679465220287373, + "grad_norm": 1.5112396478652954, + "learning_rate": 4.907648484093591e-05, + "loss": 5.0703, + "step": 14594 + }, + { + "epoch": 0.08680059948615472, + "grad_norm": 1.3118572235107422, + "learning_rate": 4.907635905220201e-05, + "loss": 5.0089, + "step": 14595 + }, + { + "epoch": 0.08680654676943572, + "grad_norm": 1.6776518821716309, + "learning_rate": 4.90762332550633e-05, + "loss": 4.9705, + "step": 14596 + }, + { + "epoch": 0.08681249405271672, + "grad_norm": 1.467530608177185, + "learning_rate": 4.9076107449519824e-05, + "loss": 5.0596, + "step": 14597 + }, + { + "epoch": 0.08681844133599771, + "grad_norm": 1.5924569368362427, + "learning_rate": 4.907598163557163e-05, + "loss": 4.9904, + "step": 14598 + }, + { + "epoch": 0.08682438861927871, + "grad_norm": 1.1862461566925049, + "learning_rate": 4.907585581321877e-05, + "loss": 5.2065, + "step": 14599 + }, + { + "epoch": 0.08683033590255972, + "grad_norm": 1.5537490844726562, + "learning_rate": 4.9075729982461265e-05, + "loss": 4.9604, + "step": 14600 + }, + { + "epoch": 0.0868362831858407, + "grad_norm": 1.5608946084976196, + "learning_rate": 4.9075604143299176e-05, + "loss": 4.9951, + "step": 14601 + }, + { + "epoch": 0.0868422304691217, + "grad_norm": 1.3890982866287231, + "learning_rate": 4.907547829573254e-05, + "loss": 5.1994, + "step": 14602 + }, + { + "epoch": 0.08684817775240271, + "grad_norm": 1.5367194414138794, + "learning_rate": 4.907535243976141e-05, + "loss": 5.008, + "step": 14603 + }, + { + "epoch": 0.0868541250356837, + "grad_norm": 1.5362403392791748, + "learning_rate": 4.9075226575385814e-05, + "loss": 5.0239, + "step": 14604 + }, + { + "epoch": 0.0868600723189647, + "grad_norm": 1.3252228498458862, + "learning_rate": 4.9075100702605814e-05, + "loss": 4.9663, + "step": 14605 + }, + { + "epoch": 0.0868660196022457, + "grad_norm": 1.4381712675094604, + "learning_rate": 4.907497482142144e-05, + "loss": 5.1457, + "step": 14606 + }, + { + "epoch": 0.08687196688552669, + "grad_norm": 1.5137197971343994, + "learning_rate": 4.907484893183274e-05, + "loss": 4.9831, + "step": 14607 + }, + { + "epoch": 0.08687791416880769, + "grad_norm": 1.5544081926345825, + "learning_rate": 4.907472303383976e-05, + "loss": 5.0485, + "step": 14608 + }, + { + "epoch": 0.08688386145208869, + "grad_norm": 1.4613279104232788, + "learning_rate": 4.907459712744254e-05, + "loss": 5.3929, + "step": 14609 + }, + { + "epoch": 0.08688980873536968, + "grad_norm": 1.2830102443695068, + "learning_rate": 4.907447121264113e-05, + "loss": 5.4241, + "step": 14610 + }, + { + "epoch": 0.08689575601865068, + "grad_norm": 1.2168337106704712, + "learning_rate": 4.907434528943558e-05, + "loss": 5.4678, + "step": 14611 + }, + { + "epoch": 0.08690170330193167, + "grad_norm": 1.3995872735977173, + "learning_rate": 4.907421935782591e-05, + "loss": 5.2, + "step": 14612 + }, + { + "epoch": 0.08690765058521267, + "grad_norm": 1.4081990718841553, + "learning_rate": 4.907409341781219e-05, + "loss": 5.4356, + "step": 14613 + }, + { + "epoch": 0.08691359786849367, + "grad_norm": 1.4506621360778809, + "learning_rate": 4.9073967469394436e-05, + "loss": 5.3816, + "step": 14614 + }, + { + "epoch": 0.08691954515177466, + "grad_norm": 1.3564461469650269, + "learning_rate": 4.907384151257272e-05, + "loss": 5.2808, + "step": 14615 + }, + { + "epoch": 0.08692549243505567, + "grad_norm": 1.3663856983184814, + "learning_rate": 4.907371554734708e-05, + "loss": 5.4286, + "step": 14616 + }, + { + "epoch": 0.08693143971833667, + "grad_norm": 1.5905755758285522, + "learning_rate": 4.907358957371755e-05, + "loss": 5.3404, + "step": 14617 + }, + { + "epoch": 0.08693738700161766, + "grad_norm": 1.6172430515289307, + "learning_rate": 4.9073463591684175e-05, + "loss": 5.2511, + "step": 14618 + }, + { + "epoch": 0.08694333428489866, + "grad_norm": 1.362925410270691, + "learning_rate": 4.9073337601247e-05, + "loss": 5.3786, + "step": 14619 + }, + { + "epoch": 0.08694928156817966, + "grad_norm": 1.4276455640792847, + "learning_rate": 4.907321160240608e-05, + "loss": 5.1243, + "step": 14620 + }, + { + "epoch": 0.08695522885146065, + "grad_norm": 1.5211840867996216, + "learning_rate": 4.907308559516145e-05, + "loss": 5.1465, + "step": 14621 + }, + { + "epoch": 0.08696117613474165, + "grad_norm": 1.4728838205337524, + "learning_rate": 4.9072959579513146e-05, + "loss": 4.9585, + "step": 14622 + }, + { + "epoch": 0.08696712341802265, + "grad_norm": 1.5337111949920654, + "learning_rate": 4.907283355546123e-05, + "loss": 5.0553, + "step": 14623 + }, + { + "epoch": 0.08697307070130364, + "grad_norm": 1.3105639219284058, + "learning_rate": 4.907270752300573e-05, + "loss": 5.2724, + "step": 14624 + }, + { + "epoch": 0.08697901798458464, + "grad_norm": 1.4726678133010864, + "learning_rate": 4.90725814821467e-05, + "loss": 5.2771, + "step": 14625 + }, + { + "epoch": 0.08698496526786564, + "grad_norm": 1.5226463079452515, + "learning_rate": 4.907245543288418e-05, + "loss": 5.2294, + "step": 14626 + }, + { + "epoch": 0.08699091255114663, + "grad_norm": 1.4187650680541992, + "learning_rate": 4.9072329375218215e-05, + "loss": 5.0003, + "step": 14627 + }, + { + "epoch": 0.08699685983442763, + "grad_norm": 1.3565301895141602, + "learning_rate": 4.907220330914885e-05, + "loss": 5.0616, + "step": 14628 + }, + { + "epoch": 0.08700280711770864, + "grad_norm": 1.3763781785964966, + "learning_rate": 4.907207723467612e-05, + "loss": 5.1036, + "step": 14629 + }, + { + "epoch": 0.08700875440098962, + "grad_norm": 1.350926160812378, + "learning_rate": 4.907195115180009e-05, + "loss": 5.3433, + "step": 14630 + }, + { + "epoch": 0.08701470168427063, + "grad_norm": 1.4927095174789429, + "learning_rate": 4.907182506052078e-05, + "loss": 5.3726, + "step": 14631 + }, + { + "epoch": 0.08702064896755163, + "grad_norm": 1.9378905296325684, + "learning_rate": 4.907169896083824e-05, + "loss": 4.9942, + "step": 14632 + }, + { + "epoch": 0.08702659625083262, + "grad_norm": 1.2046253681182861, + "learning_rate": 4.907157285275253e-05, + "loss": 5.2877, + "step": 14633 + }, + { + "epoch": 0.08703254353411362, + "grad_norm": 1.352828025817871, + "learning_rate": 4.907144673626368e-05, + "loss": 5.264, + "step": 14634 + }, + { + "epoch": 0.08703849081739462, + "grad_norm": 1.4438698291778564, + "learning_rate": 4.907132061137173e-05, + "loss": 5.1767, + "step": 14635 + }, + { + "epoch": 0.08704443810067561, + "grad_norm": 1.4066534042358398, + "learning_rate": 4.9071194478076734e-05, + "loss": 5.0919, + "step": 14636 + }, + { + "epoch": 0.08705038538395661, + "grad_norm": 1.4313786029815674, + "learning_rate": 4.9071068336378736e-05, + "loss": 5.0307, + "step": 14637 + }, + { + "epoch": 0.08705633266723761, + "grad_norm": 1.3995366096496582, + "learning_rate": 4.907094218627778e-05, + "loss": 4.9508, + "step": 14638 + }, + { + "epoch": 0.0870622799505186, + "grad_norm": 1.395270824432373, + "learning_rate": 4.90708160277739e-05, + "loss": 5.1403, + "step": 14639 + }, + { + "epoch": 0.0870682272337996, + "grad_norm": 1.4280959367752075, + "learning_rate": 4.9070689860867144e-05, + "loss": 5.1675, + "step": 14640 + }, + { + "epoch": 0.08707417451708059, + "grad_norm": 1.5028926134109497, + "learning_rate": 4.907056368555757e-05, + "loss": 5.1178, + "step": 14641 + }, + { + "epoch": 0.08708012180036159, + "grad_norm": 1.480936884880066, + "learning_rate": 4.90704375018452e-05, + "loss": 5.1681, + "step": 14642 + }, + { + "epoch": 0.0870860690836426, + "grad_norm": 1.474708914756775, + "learning_rate": 4.907031130973009e-05, + "loss": 4.998, + "step": 14643 + }, + { + "epoch": 0.08709201636692358, + "grad_norm": 1.719551920890808, + "learning_rate": 4.907018510921229e-05, + "loss": 5.0486, + "step": 14644 + }, + { + "epoch": 0.08709796365020459, + "grad_norm": 1.6314032077789307, + "learning_rate": 4.907005890029184e-05, + "loss": 4.9233, + "step": 14645 + }, + { + "epoch": 0.08710391093348559, + "grad_norm": 1.635712742805481, + "learning_rate": 4.906993268296877e-05, + "loss": 4.7026, + "step": 14646 + }, + { + "epoch": 0.08710985821676658, + "grad_norm": 1.5682891607284546, + "learning_rate": 4.906980645724314e-05, + "loss": 4.7681, + "step": 14647 + }, + { + "epoch": 0.08711580550004758, + "grad_norm": 1.5149590969085693, + "learning_rate": 4.906968022311499e-05, + "loss": 4.6026, + "step": 14648 + }, + { + "epoch": 0.08712175278332858, + "grad_norm": 1.666756510734558, + "learning_rate": 4.906955398058436e-05, + "loss": 4.6652, + "step": 14649 + }, + { + "epoch": 0.08712770006660957, + "grad_norm": 1.563281536102295, + "learning_rate": 4.906942772965129e-05, + "loss": 4.8195, + "step": 14650 + }, + { + "epoch": 0.08713364734989057, + "grad_norm": 1.3730766773223877, + "learning_rate": 4.906930147031585e-05, + "loss": 5.3917, + "step": 14651 + }, + { + "epoch": 0.08713959463317157, + "grad_norm": 1.344741940498352, + "learning_rate": 4.906917520257805e-05, + "loss": 5.4866, + "step": 14652 + }, + { + "epoch": 0.08714554191645256, + "grad_norm": 1.4403667449951172, + "learning_rate": 4.906904892643796e-05, + "loss": 5.3869, + "step": 14653 + }, + { + "epoch": 0.08715148919973356, + "grad_norm": 1.4251221418380737, + "learning_rate": 4.906892264189561e-05, + "loss": 5.5564, + "step": 14654 + }, + { + "epoch": 0.08715743648301456, + "grad_norm": 1.0403032302856445, + "learning_rate": 4.9068796348951055e-05, + "loss": 5.3422, + "step": 14655 + }, + { + "epoch": 0.08716338376629555, + "grad_norm": 1.4933732748031616, + "learning_rate": 4.9068670047604313e-05, + "loss": 4.9035, + "step": 14656 + }, + { + "epoch": 0.08716933104957655, + "grad_norm": 1.820141315460205, + "learning_rate": 4.9068543737855466e-05, + "loss": 4.8447, + "step": 14657 + }, + { + "epoch": 0.08717527833285756, + "grad_norm": 1.5337603092193604, + "learning_rate": 4.9068417419704526e-05, + "loss": 4.7122, + "step": 14658 + }, + { + "epoch": 0.08718122561613854, + "grad_norm": 1.6933845281600952, + "learning_rate": 4.9068291093151555e-05, + "loss": 4.6246, + "step": 14659 + }, + { + "epoch": 0.08718717289941955, + "grad_norm": 1.607749342918396, + "learning_rate": 4.906816475819659e-05, + "loss": 4.5246, + "step": 14660 + }, + { + "epoch": 0.08719312018270055, + "grad_norm": 1.6468732357025146, + "learning_rate": 4.906803841483969e-05, + "loss": 4.5529, + "step": 14661 + }, + { + "epoch": 0.08719906746598154, + "grad_norm": 1.7252613306045532, + "learning_rate": 4.906791206308087e-05, + "loss": 4.5866, + "step": 14662 + }, + { + "epoch": 0.08720501474926254, + "grad_norm": 1.8178141117095947, + "learning_rate": 4.90677857029202e-05, + "loss": 4.6312, + "step": 14663 + }, + { + "epoch": 0.08721096203254354, + "grad_norm": 1.6173008680343628, + "learning_rate": 4.906765933435771e-05, + "loss": 4.5964, + "step": 14664 + }, + { + "epoch": 0.08721690931582453, + "grad_norm": 1.4914458990097046, + "learning_rate": 4.9067532957393444e-05, + "loss": 4.7123, + "step": 14665 + }, + { + "epoch": 0.08722285659910553, + "grad_norm": 1.5310544967651367, + "learning_rate": 4.9067406572027465e-05, + "loss": 4.6907, + "step": 14666 + }, + { + "epoch": 0.08722880388238653, + "grad_norm": 1.4311203956604004, + "learning_rate": 4.9067280178259794e-05, + "loss": 4.7749, + "step": 14667 + }, + { + "epoch": 0.08723475116566752, + "grad_norm": 1.6848034858703613, + "learning_rate": 4.9067153776090484e-05, + "loss": 5.1676, + "step": 14668 + }, + { + "epoch": 0.08724069844894852, + "grad_norm": 1.510909914970398, + "learning_rate": 4.906702736551958e-05, + "loss": 5.1237, + "step": 14669 + }, + { + "epoch": 0.08724664573222951, + "grad_norm": 1.4135887622833252, + "learning_rate": 4.906690094654713e-05, + "loss": 5.131, + "step": 14670 + }, + { + "epoch": 0.08725259301551051, + "grad_norm": 1.5739595890045166, + "learning_rate": 4.906677451917317e-05, + "loss": 5.2374, + "step": 14671 + }, + { + "epoch": 0.08725854029879151, + "grad_norm": 1.592644214630127, + "learning_rate": 4.9066648083397746e-05, + "loss": 5.0424, + "step": 14672 + }, + { + "epoch": 0.0872644875820725, + "grad_norm": 1.3842464685440063, + "learning_rate": 4.906652163922091e-05, + "loss": 5.106, + "step": 14673 + }, + { + "epoch": 0.0872704348653535, + "grad_norm": 1.4318630695343018, + "learning_rate": 4.906639518664269e-05, + "loss": 5.1223, + "step": 14674 + }, + { + "epoch": 0.08727638214863451, + "grad_norm": 1.5598502159118652, + "learning_rate": 4.906626872566314e-05, + "loss": 5.0363, + "step": 14675 + }, + { + "epoch": 0.0872823294319155, + "grad_norm": 1.9367897510528564, + "learning_rate": 4.9066142256282316e-05, + "loss": 4.8822, + "step": 14676 + }, + { + "epoch": 0.0872882767151965, + "grad_norm": 1.8134979009628296, + "learning_rate": 4.906601577850024e-05, + "loss": 4.7218, + "step": 14677 + }, + { + "epoch": 0.0872942239984775, + "grad_norm": 1.5139638185501099, + "learning_rate": 4.9065889292316976e-05, + "loss": 5.0311, + "step": 14678 + }, + { + "epoch": 0.08730017128175849, + "grad_norm": 1.5324028730392456, + "learning_rate": 4.906576279773255e-05, + "loss": 5.2366, + "step": 14679 + }, + { + "epoch": 0.08730611856503949, + "grad_norm": 1.4219286441802979, + "learning_rate": 4.906563629474702e-05, + "loss": 5.1362, + "step": 14680 + }, + { + "epoch": 0.08731206584832049, + "grad_norm": 1.4673584699630737, + "learning_rate": 4.906550978336042e-05, + "loss": 5.1336, + "step": 14681 + }, + { + "epoch": 0.08731801313160148, + "grad_norm": 1.2611639499664307, + "learning_rate": 4.906538326357281e-05, + "loss": 5.1791, + "step": 14682 + }, + { + "epoch": 0.08732396041488248, + "grad_norm": 1.283827543258667, + "learning_rate": 4.9065256735384205e-05, + "loss": 5.0889, + "step": 14683 + }, + { + "epoch": 0.08732990769816348, + "grad_norm": 1.4508111476898193, + "learning_rate": 4.906513019879468e-05, + "loss": 4.9832, + "step": 14684 + }, + { + "epoch": 0.08733585498144447, + "grad_norm": 1.3923978805541992, + "learning_rate": 4.906500365380427e-05, + "loss": 4.8147, + "step": 14685 + }, + { + "epoch": 0.08734180226472547, + "grad_norm": 1.3737010955810547, + "learning_rate": 4.906487710041301e-05, + "loss": 4.8448, + "step": 14686 + }, + { + "epoch": 0.08734774954800648, + "grad_norm": 1.4765465259552002, + "learning_rate": 4.906475053862095e-05, + "loss": 4.8601, + "step": 14687 + }, + { + "epoch": 0.08735369683128746, + "grad_norm": 1.527372121810913, + "learning_rate": 4.906462396842813e-05, + "loss": 4.8898, + "step": 14688 + }, + { + "epoch": 0.08735964411456847, + "grad_norm": 1.2455743551254272, + "learning_rate": 4.9064497389834604e-05, + "loss": 4.9954, + "step": 14689 + }, + { + "epoch": 0.08736559139784947, + "grad_norm": 1.3169753551483154, + "learning_rate": 4.906437080284041e-05, + "loss": 5.1384, + "step": 14690 + }, + { + "epoch": 0.08737153868113046, + "grad_norm": 1.3158196210861206, + "learning_rate": 4.906424420744559e-05, + "loss": 5.032, + "step": 14691 + }, + { + "epoch": 0.08737748596441146, + "grad_norm": 1.5421653985977173, + "learning_rate": 4.9064117603650197e-05, + "loss": 4.6448, + "step": 14692 + }, + { + "epoch": 0.08738343324769246, + "grad_norm": 1.4324442148208618, + "learning_rate": 4.906399099145427e-05, + "loss": 4.819, + "step": 14693 + }, + { + "epoch": 0.08738938053097345, + "grad_norm": 1.299877643585205, + "learning_rate": 4.9063864370857836e-05, + "loss": 5.4793, + "step": 14694 + }, + { + "epoch": 0.08739532781425445, + "grad_norm": 1.8289762735366821, + "learning_rate": 4.906373774186097e-05, + "loss": 5.0972, + "step": 14695 + }, + { + "epoch": 0.08740127509753545, + "grad_norm": 1.5460636615753174, + "learning_rate": 4.9063611104463705e-05, + "loss": 5.0992, + "step": 14696 + }, + { + "epoch": 0.08740722238081644, + "grad_norm": 1.4720163345336914, + "learning_rate": 4.9063484458666076e-05, + "loss": 5.0918, + "step": 14697 + }, + { + "epoch": 0.08741316966409744, + "grad_norm": 1.4653000831604004, + "learning_rate": 4.906335780446813e-05, + "loss": 5.1523, + "step": 14698 + }, + { + "epoch": 0.08741911694737843, + "grad_norm": 1.461012840270996, + "learning_rate": 4.9063231141869914e-05, + "loss": 5.1848, + "step": 14699 + }, + { + "epoch": 0.08742506423065943, + "grad_norm": 1.6757450103759766, + "learning_rate": 4.906310447087148e-05, + "loss": 4.9809, + "step": 14700 + }, + { + "epoch": 0.08743101151394043, + "grad_norm": 1.498402714729309, + "learning_rate": 4.906297779147286e-05, + "loss": 5.1451, + "step": 14701 + }, + { + "epoch": 0.08743695879722142, + "grad_norm": 1.341667652130127, + "learning_rate": 4.906285110367411e-05, + "loss": 5.1973, + "step": 14702 + }, + { + "epoch": 0.08744290608050242, + "grad_norm": 1.5008035898208618, + "learning_rate": 4.9062724407475255e-05, + "loss": 5.0961, + "step": 14703 + }, + { + "epoch": 0.08744885336378343, + "grad_norm": 1.6110866069793701, + "learning_rate": 4.9062597702876354e-05, + "loss": 4.7201, + "step": 14704 + }, + { + "epoch": 0.08745480064706442, + "grad_norm": 1.5154603719711304, + "learning_rate": 4.906247098987746e-05, + "loss": 4.6537, + "step": 14705 + }, + { + "epoch": 0.08746074793034542, + "grad_norm": 1.6169204711914062, + "learning_rate": 4.90623442684786e-05, + "loss": 4.512, + "step": 14706 + }, + { + "epoch": 0.08746669521362642, + "grad_norm": 1.4967073202133179, + "learning_rate": 4.9062217538679824e-05, + "loss": 4.7159, + "step": 14707 + }, + { + "epoch": 0.08747264249690741, + "grad_norm": 1.4621938467025757, + "learning_rate": 4.9062090800481174e-05, + "loss": 4.7553, + "step": 14708 + }, + { + "epoch": 0.08747858978018841, + "grad_norm": 1.694868564605713, + "learning_rate": 4.9061964053882694e-05, + "loss": 4.6801, + "step": 14709 + }, + { + "epoch": 0.08748453706346941, + "grad_norm": 1.6228396892547607, + "learning_rate": 4.906183729888444e-05, + "loss": 4.5402, + "step": 14710 + }, + { + "epoch": 0.0874904843467504, + "grad_norm": 1.388859748840332, + "learning_rate": 4.9061710535486435e-05, + "loss": 4.5645, + "step": 14711 + }, + { + "epoch": 0.0874964316300314, + "grad_norm": 1.546074390411377, + "learning_rate": 4.9061583763688746e-05, + "loss": 4.4146, + "step": 14712 + }, + { + "epoch": 0.0875023789133124, + "grad_norm": 1.5526363849639893, + "learning_rate": 4.90614569834914e-05, + "loss": 4.6027, + "step": 14713 + }, + { + "epoch": 0.08750832619659339, + "grad_norm": 1.6809604167938232, + "learning_rate": 4.9061330194894454e-05, + "loss": 4.4927, + "step": 14714 + }, + { + "epoch": 0.0875142734798744, + "grad_norm": 1.8013920783996582, + "learning_rate": 4.906120339789795e-05, + "loss": 4.6949, + "step": 14715 + }, + { + "epoch": 0.0875202207631554, + "grad_norm": 1.587863564491272, + "learning_rate": 4.906107659250192e-05, + "loss": 4.7255, + "step": 14716 + }, + { + "epoch": 0.08752616804643638, + "grad_norm": 1.4871174097061157, + "learning_rate": 4.9060949778706415e-05, + "loss": 4.6753, + "step": 14717 + }, + { + "epoch": 0.08753211532971739, + "grad_norm": 1.5521314144134521, + "learning_rate": 4.9060822956511485e-05, + "loss": 4.6963, + "step": 14718 + }, + { + "epoch": 0.08753806261299839, + "grad_norm": 1.5176832675933838, + "learning_rate": 4.906069612591717e-05, + "loss": 4.7475, + "step": 14719 + }, + { + "epoch": 0.08754400989627938, + "grad_norm": 1.7381534576416016, + "learning_rate": 4.906056928692352e-05, + "loss": 4.6952, + "step": 14720 + }, + { + "epoch": 0.08754995717956038, + "grad_norm": 1.604637622833252, + "learning_rate": 4.9060442439530564e-05, + "loss": 4.5792, + "step": 14721 + }, + { + "epoch": 0.08755590446284138, + "grad_norm": 1.6367937326431274, + "learning_rate": 4.9060315583738356e-05, + "loss": 4.6422, + "step": 14722 + }, + { + "epoch": 0.08756185174612237, + "grad_norm": 1.5177057981491089, + "learning_rate": 4.906018871954695e-05, + "loss": 4.5682, + "step": 14723 + }, + { + "epoch": 0.08756779902940337, + "grad_norm": 1.5539237260818481, + "learning_rate": 4.906006184695637e-05, + "loss": 4.5194, + "step": 14724 + }, + { + "epoch": 0.08757374631268437, + "grad_norm": 1.7041072845458984, + "learning_rate": 4.905993496596668e-05, + "loss": 4.6526, + "step": 14725 + }, + { + "epoch": 0.08757969359596536, + "grad_norm": 1.7187644243240356, + "learning_rate": 4.9059808076577914e-05, + "loss": 4.6251, + "step": 14726 + }, + { + "epoch": 0.08758564087924636, + "grad_norm": 1.6393675804138184, + "learning_rate": 4.905968117879012e-05, + "loss": 4.7242, + "step": 14727 + }, + { + "epoch": 0.08759158816252735, + "grad_norm": 1.6426397562026978, + "learning_rate": 4.905955427260333e-05, + "loss": 4.6272, + "step": 14728 + }, + { + "epoch": 0.08759753544580835, + "grad_norm": 1.3231829404830933, + "learning_rate": 4.9059427358017605e-05, + "loss": 4.621, + "step": 14729 + }, + { + "epoch": 0.08760348272908935, + "grad_norm": 1.3970234394073486, + "learning_rate": 4.905930043503298e-05, + "loss": 4.6356, + "step": 14730 + }, + { + "epoch": 0.08760943001237034, + "grad_norm": 1.511977195739746, + "learning_rate": 4.90591735036495e-05, + "loss": 4.7408, + "step": 14731 + }, + { + "epoch": 0.08761537729565134, + "grad_norm": 1.284788727760315, + "learning_rate": 4.9059046563867216e-05, + "loss": 5.2573, + "step": 14732 + }, + { + "epoch": 0.08762132457893235, + "grad_norm": 1.5148005485534668, + "learning_rate": 4.905891961568617e-05, + "loss": 5.0465, + "step": 14733 + }, + { + "epoch": 0.08762727186221334, + "grad_norm": 1.3727401494979858, + "learning_rate": 4.905879265910639e-05, + "loss": 5.0424, + "step": 14734 + }, + { + "epoch": 0.08763321914549434, + "grad_norm": 1.4994157552719116, + "learning_rate": 4.9058665694127945e-05, + "loss": 5.1662, + "step": 14735 + }, + { + "epoch": 0.08763916642877534, + "grad_norm": 1.5002670288085938, + "learning_rate": 4.905853872075087e-05, + "loss": 5.0872, + "step": 14736 + }, + { + "epoch": 0.08764511371205633, + "grad_norm": 1.580439567565918, + "learning_rate": 4.90584117389752e-05, + "loss": 5.1315, + "step": 14737 + }, + { + "epoch": 0.08765106099533733, + "grad_norm": 1.416154384613037, + "learning_rate": 4.9058284748801e-05, + "loss": 5.1066, + "step": 14738 + }, + { + "epoch": 0.08765700827861833, + "grad_norm": 1.5391058921813965, + "learning_rate": 4.905815775022828e-05, + "loss": 5.1724, + "step": 14739 + }, + { + "epoch": 0.08766295556189932, + "grad_norm": 1.20875883102417, + "learning_rate": 4.905803074325712e-05, + "loss": 5.152, + "step": 14740 + }, + { + "epoch": 0.08766890284518032, + "grad_norm": 1.27827787399292, + "learning_rate": 4.9057903727887556e-05, + "loss": 5.0271, + "step": 14741 + }, + { + "epoch": 0.08767485012846132, + "grad_norm": 1.1356613636016846, + "learning_rate": 4.9057776704119615e-05, + "loss": 5.0078, + "step": 14742 + }, + { + "epoch": 0.08768079741174231, + "grad_norm": 1.3931230306625366, + "learning_rate": 4.9057649671953355e-05, + "loss": 5.1253, + "step": 14743 + }, + { + "epoch": 0.08768674469502331, + "grad_norm": 1.553105115890503, + "learning_rate": 4.905752263138882e-05, + "loss": 5.1259, + "step": 14744 + }, + { + "epoch": 0.08769269197830432, + "grad_norm": 1.4004448652267456, + "learning_rate": 4.905739558242605e-05, + "loss": 5.1104, + "step": 14745 + }, + { + "epoch": 0.0876986392615853, + "grad_norm": 1.6295247077941895, + "learning_rate": 4.905726852506509e-05, + "loss": 5.0718, + "step": 14746 + }, + { + "epoch": 0.0877045865448663, + "grad_norm": 1.5966804027557373, + "learning_rate": 4.9057141459306e-05, + "loss": 5.1922, + "step": 14747 + }, + { + "epoch": 0.08771053382814731, + "grad_norm": 1.5448883771896362, + "learning_rate": 4.9057014385148795e-05, + "loss": 4.9715, + "step": 14748 + }, + { + "epoch": 0.0877164811114283, + "grad_norm": 1.5252676010131836, + "learning_rate": 4.905688730259354e-05, + "loss": 5.2128, + "step": 14749 + }, + { + "epoch": 0.0877224283947093, + "grad_norm": 1.387237310409546, + "learning_rate": 4.9056760211640274e-05, + "loss": 5.0933, + "step": 14750 + }, + { + "epoch": 0.0877283756779903, + "grad_norm": 1.3318862915039062, + "learning_rate": 4.905663311228904e-05, + "loss": 5.1849, + "step": 14751 + }, + { + "epoch": 0.08773432296127129, + "grad_norm": 1.4328356981277466, + "learning_rate": 4.905650600453989e-05, + "loss": 5.2287, + "step": 14752 + }, + { + "epoch": 0.08774027024455229, + "grad_norm": 1.4316518306732178, + "learning_rate": 4.905637888839285e-05, + "loss": 4.9774, + "step": 14753 + }, + { + "epoch": 0.08774621752783329, + "grad_norm": 1.1666837930679321, + "learning_rate": 4.9056251763847996e-05, + "loss": 5.2098, + "step": 14754 + }, + { + "epoch": 0.08775216481111428, + "grad_norm": 1.4383636713027954, + "learning_rate": 4.9056124630905333e-05, + "loss": 5.2438, + "step": 14755 + }, + { + "epoch": 0.08775811209439528, + "grad_norm": 2.6009883880615234, + "learning_rate": 4.9055997489564936e-05, + "loss": 5.7232, + "step": 14756 + }, + { + "epoch": 0.08776405937767627, + "grad_norm": 1.3072876930236816, + "learning_rate": 4.905587033982684e-05, + "loss": 5.1811, + "step": 14757 + }, + { + "epoch": 0.08777000666095727, + "grad_norm": 1.2538501024246216, + "learning_rate": 4.9055743181691084e-05, + "loss": 5.1557, + "step": 14758 + }, + { + "epoch": 0.08777595394423827, + "grad_norm": 1.2565419673919678, + "learning_rate": 4.905561601515771e-05, + "loss": 5.129, + "step": 14759 + }, + { + "epoch": 0.08778190122751926, + "grad_norm": 1.3041788339614868, + "learning_rate": 4.905548884022678e-05, + "loss": 5.2048, + "step": 14760 + }, + { + "epoch": 0.08778784851080026, + "grad_norm": 1.4548598527908325, + "learning_rate": 4.905536165689832e-05, + "loss": 5.2405, + "step": 14761 + }, + { + "epoch": 0.08779379579408127, + "grad_norm": 1.1748031377792358, + "learning_rate": 4.905523446517239e-05, + "loss": 5.1804, + "step": 14762 + }, + { + "epoch": 0.08779974307736226, + "grad_norm": 1.210534930229187, + "learning_rate": 4.905510726504902e-05, + "loss": 5.1383, + "step": 14763 + }, + { + "epoch": 0.08780569036064326, + "grad_norm": 1.2154903411865234, + "learning_rate": 4.9054980056528264e-05, + "loss": 5.2757, + "step": 14764 + }, + { + "epoch": 0.08781163764392426, + "grad_norm": 1.4123867750167847, + "learning_rate": 4.9054852839610166e-05, + "loss": 5.1268, + "step": 14765 + }, + { + "epoch": 0.08781758492720525, + "grad_norm": 1.3136295080184937, + "learning_rate": 4.905472561429476e-05, + "loss": 5.2186, + "step": 14766 + }, + { + "epoch": 0.08782353221048625, + "grad_norm": 1.2741068601608276, + "learning_rate": 4.905459838058209e-05, + "loss": 4.9737, + "step": 14767 + }, + { + "epoch": 0.08782947949376725, + "grad_norm": 1.2963054180145264, + "learning_rate": 4.9054471138472225e-05, + "loss": 5.1712, + "step": 14768 + }, + { + "epoch": 0.08783542677704824, + "grad_norm": 1.5352611541748047, + "learning_rate": 4.905434388796519e-05, + "loss": 4.9473, + "step": 14769 + }, + { + "epoch": 0.08784137406032924, + "grad_norm": 1.3399711847305298, + "learning_rate": 4.905421662906103e-05, + "loss": 5.2402, + "step": 14770 + }, + { + "epoch": 0.08784732134361024, + "grad_norm": 1.4278292655944824, + "learning_rate": 4.9054089361759794e-05, + "loss": 4.9331, + "step": 14771 + }, + { + "epoch": 0.08785326862689123, + "grad_norm": 1.5057200193405151, + "learning_rate": 4.905396208606151e-05, + "loss": 5.1553, + "step": 14772 + }, + { + "epoch": 0.08785921591017223, + "grad_norm": 1.4660797119140625, + "learning_rate": 4.905383480196625e-05, + "loss": 5.0792, + "step": 14773 + }, + { + "epoch": 0.08786516319345324, + "grad_norm": 1.4386217594146729, + "learning_rate": 4.905370750947405e-05, + "loss": 4.8363, + "step": 14774 + }, + { + "epoch": 0.08787111047673422, + "grad_norm": 1.4555455446243286, + "learning_rate": 4.905358020858493e-05, + "loss": 4.8934, + "step": 14775 + }, + { + "epoch": 0.08787705776001523, + "grad_norm": 1.5161443948745728, + "learning_rate": 4.905345289929897e-05, + "loss": 4.8227, + "step": 14776 + }, + { + "epoch": 0.08788300504329623, + "grad_norm": 1.2704185247421265, + "learning_rate": 4.9053325581616185e-05, + "loss": 4.9612, + "step": 14777 + }, + { + "epoch": 0.08788895232657722, + "grad_norm": 1.6396795511245728, + "learning_rate": 4.905319825553664e-05, + "loss": 4.8947, + "step": 14778 + }, + { + "epoch": 0.08789489960985822, + "grad_norm": 1.49285888671875, + "learning_rate": 4.905307092106037e-05, + "loss": 5.0814, + "step": 14779 + }, + { + "epoch": 0.08790084689313922, + "grad_norm": 1.3829785585403442, + "learning_rate": 4.9052943578187424e-05, + "loss": 5.3864, + "step": 14780 + }, + { + "epoch": 0.08790679417642021, + "grad_norm": 1.517054557800293, + "learning_rate": 4.905281622691784e-05, + "loss": 5.3053, + "step": 14781 + }, + { + "epoch": 0.08791274145970121, + "grad_norm": 1.491402506828308, + "learning_rate": 4.905268886725167e-05, + "loss": 5.3685, + "step": 14782 + }, + { + "epoch": 0.08791868874298221, + "grad_norm": 1.5034211874008179, + "learning_rate": 4.905256149918895e-05, + "loss": 5.2139, + "step": 14783 + }, + { + "epoch": 0.0879246360262632, + "grad_norm": 1.4021977186203003, + "learning_rate": 4.905243412272974e-05, + "loss": 5.301, + "step": 14784 + }, + { + "epoch": 0.0879305833095442, + "grad_norm": 1.44327974319458, + "learning_rate": 4.9052306737874064e-05, + "loss": 5.296, + "step": 14785 + }, + { + "epoch": 0.08793653059282519, + "grad_norm": 1.4733220338821411, + "learning_rate": 4.905217934462198e-05, + "loss": 5.3302, + "step": 14786 + }, + { + "epoch": 0.08794247787610619, + "grad_norm": 1.3308794498443604, + "learning_rate": 4.9052051942973533e-05, + "loss": 5.1835, + "step": 14787 + }, + { + "epoch": 0.0879484251593872, + "grad_norm": 1.2667236328125, + "learning_rate": 4.905192453292876e-05, + "loss": 5.1801, + "step": 14788 + }, + { + "epoch": 0.08795437244266818, + "grad_norm": 1.3284921646118164, + "learning_rate": 4.90517971144877e-05, + "loss": 5.106, + "step": 14789 + }, + { + "epoch": 0.08796031972594918, + "grad_norm": 1.4089261293411255, + "learning_rate": 4.9051669687650415e-05, + "loss": 5.133, + "step": 14790 + }, + { + "epoch": 0.08796626700923019, + "grad_norm": 1.1701233386993408, + "learning_rate": 4.905154225241694e-05, + "loss": 5.1602, + "step": 14791 + }, + { + "epoch": 0.08797221429251117, + "grad_norm": 1.169570803642273, + "learning_rate": 4.9051414808787324e-05, + "loss": 5.1231, + "step": 14792 + }, + { + "epoch": 0.08797816157579218, + "grad_norm": 1.5104409456253052, + "learning_rate": 4.90512873567616e-05, + "loss": 5.0774, + "step": 14793 + }, + { + "epoch": 0.08798410885907318, + "grad_norm": 1.3065992593765259, + "learning_rate": 4.9051159896339816e-05, + "loss": 4.9547, + "step": 14794 + }, + { + "epoch": 0.08799005614235417, + "grad_norm": 1.6417936086654663, + "learning_rate": 4.905103242752203e-05, + "loss": 5.2734, + "step": 14795 + }, + { + "epoch": 0.08799600342563517, + "grad_norm": 2.1529974937438965, + "learning_rate": 4.905090495030827e-05, + "loss": 5.1999, + "step": 14796 + }, + { + "epoch": 0.08800195070891617, + "grad_norm": 1.6746312379837036, + "learning_rate": 4.90507774646986e-05, + "loss": 4.959, + "step": 14797 + }, + { + "epoch": 0.08800789799219716, + "grad_norm": 1.4422825574874878, + "learning_rate": 4.905064997069304e-05, + "loss": 5.0581, + "step": 14798 + }, + { + "epoch": 0.08801384527547816, + "grad_norm": 1.658833622932434, + "learning_rate": 4.9050522468291646e-05, + "loss": 4.9591, + "step": 14799 + }, + { + "epoch": 0.08801979255875916, + "grad_norm": 1.4971596002578735, + "learning_rate": 4.9050394957494464e-05, + "loss": 5.2515, + "step": 14800 + }, + { + "epoch": 0.08802573984204015, + "grad_norm": 1.5866429805755615, + "learning_rate": 4.9050267438301546e-05, + "loss": 5.1084, + "step": 14801 + }, + { + "epoch": 0.08803168712532115, + "grad_norm": 1.5049015283584595, + "learning_rate": 4.9050139910712925e-05, + "loss": 5.1102, + "step": 14802 + }, + { + "epoch": 0.08803763440860216, + "grad_norm": 1.6711664199829102, + "learning_rate": 4.905001237472864e-05, + "loss": 5.0215, + "step": 14803 + }, + { + "epoch": 0.08804358169188314, + "grad_norm": 1.6390610933303833, + "learning_rate": 4.904988483034875e-05, + "loss": 4.978, + "step": 14804 + }, + { + "epoch": 0.08804952897516415, + "grad_norm": 1.5968292951583862, + "learning_rate": 4.9049757277573295e-05, + "loss": 5.0183, + "step": 14805 + }, + { + "epoch": 0.08805547625844515, + "grad_norm": 1.4864193201065063, + "learning_rate": 4.9049629716402325e-05, + "loss": 5.5199, + "step": 14806 + }, + { + "epoch": 0.08806142354172614, + "grad_norm": 1.5658420324325562, + "learning_rate": 4.904950214683587e-05, + "loss": 5.4906, + "step": 14807 + }, + { + "epoch": 0.08806737082500714, + "grad_norm": 1.5811707973480225, + "learning_rate": 4.9049374568873975e-05, + "loss": 5.5795, + "step": 14808 + }, + { + "epoch": 0.08807331810828814, + "grad_norm": 1.418641448020935, + "learning_rate": 4.90492469825167e-05, + "loss": 5.3616, + "step": 14809 + }, + { + "epoch": 0.08807926539156913, + "grad_norm": 1.323500633239746, + "learning_rate": 4.904911938776408e-05, + "loss": 5.2641, + "step": 14810 + }, + { + "epoch": 0.08808521267485013, + "grad_norm": 1.590867280960083, + "learning_rate": 4.904899178461616e-05, + "loss": 5.3782, + "step": 14811 + }, + { + "epoch": 0.08809115995813113, + "grad_norm": 1.243213176727295, + "learning_rate": 4.904886417307299e-05, + "loss": 5.4743, + "step": 14812 + }, + { + "epoch": 0.08809710724141212, + "grad_norm": 1.5051169395446777, + "learning_rate": 4.9048736553134614e-05, + "loss": 5.3046, + "step": 14813 + }, + { + "epoch": 0.08810305452469312, + "grad_norm": 1.334234356880188, + "learning_rate": 4.904860892480106e-05, + "loss": 5.2673, + "step": 14814 + }, + { + "epoch": 0.08810900180797411, + "grad_norm": 1.4352458715438843, + "learning_rate": 4.904848128807239e-05, + "loss": 5.3465, + "step": 14815 + }, + { + "epoch": 0.08811494909125511, + "grad_norm": 1.6878329515457153, + "learning_rate": 4.904835364294864e-05, + "loss": 5.3467, + "step": 14816 + }, + { + "epoch": 0.08812089637453611, + "grad_norm": 1.542100191116333, + "learning_rate": 4.904822598942986e-05, + "loss": 5.4147, + "step": 14817 + }, + { + "epoch": 0.0881268436578171, + "grad_norm": 1.5099046230316162, + "learning_rate": 4.90480983275161e-05, + "loss": 5.7198, + "step": 14818 + }, + { + "epoch": 0.0881327909410981, + "grad_norm": 1.6120097637176514, + "learning_rate": 4.9047970657207395e-05, + "loss": 5.4417, + "step": 14819 + }, + { + "epoch": 0.0881387382243791, + "grad_norm": 1.455407977104187, + "learning_rate": 4.904784297850379e-05, + "loss": 5.3028, + "step": 14820 + }, + { + "epoch": 0.0881446855076601, + "grad_norm": 1.589712381362915, + "learning_rate": 4.904771529140533e-05, + "loss": 5.2493, + "step": 14821 + }, + { + "epoch": 0.0881506327909411, + "grad_norm": 1.5051584243774414, + "learning_rate": 4.904758759591206e-05, + "loss": 5.2225, + "step": 14822 + }, + { + "epoch": 0.0881565800742221, + "grad_norm": 1.3623727560043335, + "learning_rate": 4.9047459892024026e-05, + "loss": 5.1738, + "step": 14823 + }, + { + "epoch": 0.08816252735750309, + "grad_norm": 1.4643206596374512, + "learning_rate": 4.9047332179741274e-05, + "loss": 5.123, + "step": 14824 + }, + { + "epoch": 0.08816847464078409, + "grad_norm": 1.4233453273773193, + "learning_rate": 4.904720445906384e-05, + "loss": 4.9263, + "step": 14825 + }, + { + "epoch": 0.08817442192406509, + "grad_norm": 1.6479318141937256, + "learning_rate": 4.9047076729991786e-05, + "loss": 4.9663, + "step": 14826 + }, + { + "epoch": 0.08818036920734608, + "grad_norm": 1.4759633541107178, + "learning_rate": 4.9046948992525145e-05, + "loss": 5.0326, + "step": 14827 + }, + { + "epoch": 0.08818631649062708, + "grad_norm": 1.435533046722412, + "learning_rate": 4.904682124666395e-05, + "loss": 5.0819, + "step": 14828 + }, + { + "epoch": 0.08819226377390808, + "grad_norm": 1.4540610313415527, + "learning_rate": 4.904669349240827e-05, + "loss": 5.391, + "step": 14829 + }, + { + "epoch": 0.08819821105718907, + "grad_norm": 1.6308038234710693, + "learning_rate": 4.904656572975814e-05, + "loss": 4.9723, + "step": 14830 + }, + { + "epoch": 0.08820415834047007, + "grad_norm": 1.453600287437439, + "learning_rate": 4.90464379587136e-05, + "loss": 5.1689, + "step": 14831 + }, + { + "epoch": 0.08821010562375108, + "grad_norm": 1.4876199960708618, + "learning_rate": 4.904631017927469e-05, + "loss": 5.1163, + "step": 14832 + }, + { + "epoch": 0.08821605290703206, + "grad_norm": 1.4240463972091675, + "learning_rate": 4.9046182391441466e-05, + "loss": 5.1154, + "step": 14833 + }, + { + "epoch": 0.08822200019031307, + "grad_norm": 1.4176205396652222, + "learning_rate": 4.904605459521397e-05, + "loss": 5.1587, + "step": 14834 + }, + { + "epoch": 0.08822794747359407, + "grad_norm": 1.302998423576355, + "learning_rate": 4.9045926790592244e-05, + "loss": 5.1302, + "step": 14835 + }, + { + "epoch": 0.08823389475687506, + "grad_norm": 1.4490020275115967, + "learning_rate": 4.904579897757633e-05, + "loss": 5.0817, + "step": 14836 + }, + { + "epoch": 0.08823984204015606, + "grad_norm": 1.4430203437805176, + "learning_rate": 4.9045671156166276e-05, + "loss": 5.1334, + "step": 14837 + }, + { + "epoch": 0.08824578932343706, + "grad_norm": 1.326277494430542, + "learning_rate": 4.9045543326362134e-05, + "loss": 5.3292, + "step": 14838 + }, + { + "epoch": 0.08825173660671805, + "grad_norm": 1.373415470123291, + "learning_rate": 4.9045415488163936e-05, + "loss": 5.454, + "step": 14839 + }, + { + "epoch": 0.08825768388999905, + "grad_norm": 1.4334250688552856, + "learning_rate": 4.904528764157173e-05, + "loss": 5.2735, + "step": 14840 + }, + { + "epoch": 0.08826363117328005, + "grad_norm": 1.4029041528701782, + "learning_rate": 4.904515978658556e-05, + "loss": 5.0549, + "step": 14841 + }, + { + "epoch": 0.08826957845656104, + "grad_norm": 1.355177879333496, + "learning_rate": 4.904503192320548e-05, + "loss": 5.2569, + "step": 14842 + }, + { + "epoch": 0.08827552573984204, + "grad_norm": 1.2063989639282227, + "learning_rate": 4.904490405143153e-05, + "loss": 5.2469, + "step": 14843 + }, + { + "epoch": 0.08828147302312303, + "grad_norm": 1.2290265560150146, + "learning_rate": 4.904477617126374e-05, + "loss": 5.255, + "step": 14844 + }, + { + "epoch": 0.08828742030640403, + "grad_norm": 1.0648494958877563, + "learning_rate": 4.904464828270218e-05, + "loss": 5.2423, + "step": 14845 + }, + { + "epoch": 0.08829336758968503, + "grad_norm": 1.362572431564331, + "learning_rate": 4.904452038574687e-05, + "loss": 5.3856, + "step": 14846 + }, + { + "epoch": 0.08829931487296602, + "grad_norm": 1.3004114627838135, + "learning_rate": 4.9044392480397886e-05, + "loss": 5.0672, + "step": 14847 + }, + { + "epoch": 0.08830526215624702, + "grad_norm": 1.4852789640426636, + "learning_rate": 4.904426456665523e-05, + "loss": 5.2145, + "step": 14848 + }, + { + "epoch": 0.08831120943952803, + "grad_norm": 1.4221493005752563, + "learning_rate": 4.9044136644518976e-05, + "loss": 5.4544, + "step": 14849 + }, + { + "epoch": 0.08831715672280901, + "grad_norm": 1.4444363117218018, + "learning_rate": 4.904400871398917e-05, + "loss": 5.3342, + "step": 14850 + }, + { + "epoch": 0.08832310400609002, + "grad_norm": 1.1723617315292358, + "learning_rate": 4.904388077506585e-05, + "loss": 5.3846, + "step": 14851 + }, + { + "epoch": 0.08832905128937102, + "grad_norm": 1.3458356857299805, + "learning_rate": 4.904375282774905e-05, + "loss": 5.3903, + "step": 14852 + }, + { + "epoch": 0.08833499857265201, + "grad_norm": 1.4839876890182495, + "learning_rate": 4.904362487203883e-05, + "loss": 5.0889, + "step": 14853 + }, + { + "epoch": 0.08834094585593301, + "grad_norm": 1.6487696170806885, + "learning_rate": 4.904349690793523e-05, + "loss": 5.0904, + "step": 14854 + }, + { + "epoch": 0.08834689313921401, + "grad_norm": 1.5201997756958008, + "learning_rate": 4.904336893543829e-05, + "loss": 4.9017, + "step": 14855 + }, + { + "epoch": 0.088352840422495, + "grad_norm": 1.5502886772155762, + "learning_rate": 4.904324095454806e-05, + "loss": 4.931, + "step": 14856 + }, + { + "epoch": 0.088358787705776, + "grad_norm": 1.4996228218078613, + "learning_rate": 4.904311296526458e-05, + "loss": 5.0773, + "step": 14857 + }, + { + "epoch": 0.088364734989057, + "grad_norm": 1.7004456520080566, + "learning_rate": 4.90429849675879e-05, + "loss": 4.9913, + "step": 14858 + }, + { + "epoch": 0.08837068227233799, + "grad_norm": 1.426007866859436, + "learning_rate": 4.904285696151806e-05, + "loss": 5.1312, + "step": 14859 + }, + { + "epoch": 0.088376629555619, + "grad_norm": 1.4049350023269653, + "learning_rate": 4.904272894705512e-05, + "loss": 5.0539, + "step": 14860 + }, + { + "epoch": 0.0883825768389, + "grad_norm": 1.558273434638977, + "learning_rate": 4.9042600924199096e-05, + "loss": 5.0822, + "step": 14861 + }, + { + "epoch": 0.08838852412218098, + "grad_norm": 1.6177934408187866, + "learning_rate": 4.9042472892950055e-05, + "loss": 5.1646, + "step": 14862 + }, + { + "epoch": 0.08839447140546199, + "grad_norm": 1.5152839422225952, + "learning_rate": 4.904234485330803e-05, + "loss": 5.0144, + "step": 14863 + }, + { + "epoch": 0.08840041868874299, + "grad_norm": 1.474231243133545, + "learning_rate": 4.904221680527308e-05, + "loss": 5.1063, + "step": 14864 + }, + { + "epoch": 0.08840636597202398, + "grad_norm": 1.5897177457809448, + "learning_rate": 4.904208874884523e-05, + "loss": 4.9724, + "step": 14865 + }, + { + "epoch": 0.08841231325530498, + "grad_norm": 1.604368805885315, + "learning_rate": 4.904196068402454e-05, + "loss": 4.8905, + "step": 14866 + }, + { + "epoch": 0.08841826053858598, + "grad_norm": 1.338458776473999, + "learning_rate": 4.904183261081105e-05, + "loss": 4.7829, + "step": 14867 + }, + { + "epoch": 0.08842420782186697, + "grad_norm": 1.62189781665802, + "learning_rate": 4.9041704529204806e-05, + "loss": 4.8025, + "step": 14868 + }, + { + "epoch": 0.08843015510514797, + "grad_norm": 1.555298089981079, + "learning_rate": 4.904157643920585e-05, + "loss": 4.9098, + "step": 14869 + }, + { + "epoch": 0.08843610238842897, + "grad_norm": 1.5110834836959839, + "learning_rate": 4.904144834081423e-05, + "loss": 4.8648, + "step": 14870 + }, + { + "epoch": 0.08844204967170996, + "grad_norm": 1.59073805809021, + "learning_rate": 4.904132023402999e-05, + "loss": 4.8997, + "step": 14871 + }, + { + "epoch": 0.08844799695499096, + "grad_norm": 1.5218732357025146, + "learning_rate": 4.904119211885316e-05, + "loss": 5.352, + "step": 14872 + }, + { + "epoch": 0.08845394423827196, + "grad_norm": 1.5263079404830933, + "learning_rate": 4.904106399528382e-05, + "loss": 4.8921, + "step": 14873 + }, + { + "epoch": 0.08845989152155295, + "grad_norm": 1.6151986122131348, + "learning_rate": 4.904093586332198e-05, + "loss": 5.0086, + "step": 14874 + }, + { + "epoch": 0.08846583880483395, + "grad_norm": 1.4971787929534912, + "learning_rate": 4.90408077229677e-05, + "loss": 5.0119, + "step": 14875 + }, + { + "epoch": 0.08847178608811494, + "grad_norm": 1.4897308349609375, + "learning_rate": 4.904067957422102e-05, + "loss": 5.0175, + "step": 14876 + }, + { + "epoch": 0.08847773337139594, + "grad_norm": 1.4023786783218384, + "learning_rate": 4.904055141708199e-05, + "loss": 5.0361, + "step": 14877 + }, + { + "epoch": 0.08848368065467695, + "grad_norm": 1.4664498567581177, + "learning_rate": 4.904042325155065e-05, + "loss": 4.9784, + "step": 14878 + }, + { + "epoch": 0.08848962793795793, + "grad_norm": 1.390824556350708, + "learning_rate": 4.904029507762704e-05, + "loss": 4.9922, + "step": 14879 + }, + { + "epoch": 0.08849557522123894, + "grad_norm": 1.9508315324783325, + "learning_rate": 4.904016689531122e-05, + "loss": 5.6352, + "step": 14880 + }, + { + "epoch": 0.08850152250451994, + "grad_norm": 1.4192322492599487, + "learning_rate": 4.904003870460323e-05, + "loss": 5.0654, + "step": 14881 + }, + { + "epoch": 0.08850746978780093, + "grad_norm": 1.5868372917175293, + "learning_rate": 4.903991050550311e-05, + "loss": 4.9631, + "step": 14882 + }, + { + "epoch": 0.08851341707108193, + "grad_norm": 1.405555009841919, + "learning_rate": 4.903978229801089e-05, + "loss": 5.1311, + "step": 14883 + }, + { + "epoch": 0.08851936435436293, + "grad_norm": 1.453817367553711, + "learning_rate": 4.9039654082126646e-05, + "loss": 5.0866, + "step": 14884 + }, + { + "epoch": 0.08852531163764392, + "grad_norm": 1.5051809549331665, + "learning_rate": 4.9039525857850404e-05, + "loss": 5.1606, + "step": 14885 + }, + { + "epoch": 0.08853125892092492, + "grad_norm": 1.5323255062103271, + "learning_rate": 4.9039397625182206e-05, + "loss": 5.1564, + "step": 14886 + }, + { + "epoch": 0.08853720620420592, + "grad_norm": 1.5018506050109863, + "learning_rate": 4.903926938412211e-05, + "loss": 4.9672, + "step": 14887 + }, + { + "epoch": 0.08854315348748691, + "grad_norm": 1.488289713859558, + "learning_rate": 4.903914113467015e-05, + "loss": 4.882, + "step": 14888 + }, + { + "epoch": 0.08854910077076791, + "grad_norm": 1.434045672416687, + "learning_rate": 4.903901287682637e-05, + "loss": 5.0748, + "step": 14889 + }, + { + "epoch": 0.08855504805404892, + "grad_norm": 1.5172244310379028, + "learning_rate": 4.903888461059083e-05, + "loss": 5.065, + "step": 14890 + }, + { + "epoch": 0.0885609953373299, + "grad_norm": 1.545283555984497, + "learning_rate": 4.903875633596356e-05, + "loss": 5.2187, + "step": 14891 + }, + { + "epoch": 0.0885669426206109, + "grad_norm": 1.3149688243865967, + "learning_rate": 4.90386280529446e-05, + "loss": 4.9977, + "step": 14892 + }, + { + "epoch": 0.08857288990389191, + "grad_norm": 1.4925106763839722, + "learning_rate": 4.903849976153401e-05, + "loss": 5.0622, + "step": 14893 + }, + { + "epoch": 0.0885788371871729, + "grad_norm": 1.6073296070098877, + "learning_rate": 4.903837146173183e-05, + "loss": 5.0823, + "step": 14894 + }, + { + "epoch": 0.0885847844704539, + "grad_norm": 1.2879148721694946, + "learning_rate": 4.9038243153538096e-05, + "loss": 5.1574, + "step": 14895 + }, + { + "epoch": 0.0885907317537349, + "grad_norm": 1.6396079063415527, + "learning_rate": 4.903811483695287e-05, + "loss": 5.1748, + "step": 14896 + }, + { + "epoch": 0.08859667903701589, + "grad_norm": 1.426180124282837, + "learning_rate": 4.903798651197618e-05, + "loss": 5.0374, + "step": 14897 + }, + { + "epoch": 0.08860262632029689, + "grad_norm": 1.3685684204101562, + "learning_rate": 4.9037858178608076e-05, + "loss": 4.9373, + "step": 14898 + }, + { + "epoch": 0.08860857360357789, + "grad_norm": 1.5495455265045166, + "learning_rate": 4.903772983684861e-05, + "loss": 5.0696, + "step": 14899 + }, + { + "epoch": 0.08861452088685888, + "grad_norm": 1.4423854351043701, + "learning_rate": 4.9037601486697815e-05, + "loss": 5.1359, + "step": 14900 + }, + { + "epoch": 0.08862046817013988, + "grad_norm": 1.4704400300979614, + "learning_rate": 4.9037473128155745e-05, + "loss": 5.0438, + "step": 14901 + }, + { + "epoch": 0.08862641545342088, + "grad_norm": 1.49704909324646, + "learning_rate": 4.903734476122244e-05, + "loss": 5.0305, + "step": 14902 + }, + { + "epoch": 0.08863236273670187, + "grad_norm": 1.3732075691223145, + "learning_rate": 4.903721638589795e-05, + "loss": 4.9659, + "step": 14903 + }, + { + "epoch": 0.08863831001998287, + "grad_norm": 1.5920335054397583, + "learning_rate": 4.903708800218231e-05, + "loss": 4.9936, + "step": 14904 + }, + { + "epoch": 0.08864425730326386, + "grad_norm": 1.6084437370300293, + "learning_rate": 4.9036959610075575e-05, + "loss": 5.0048, + "step": 14905 + }, + { + "epoch": 0.08865020458654486, + "grad_norm": 1.2329050302505493, + "learning_rate": 4.903683120957778e-05, + "loss": 4.9729, + "step": 14906 + }, + { + "epoch": 0.08865615186982587, + "grad_norm": 1.4001328945159912, + "learning_rate": 4.903670280068898e-05, + "loss": 4.9577, + "step": 14907 + }, + { + "epoch": 0.08866209915310685, + "grad_norm": 1.3499484062194824, + "learning_rate": 4.903657438340921e-05, + "loss": 4.8696, + "step": 14908 + }, + { + "epoch": 0.08866804643638786, + "grad_norm": 1.3606812953948975, + "learning_rate": 4.903644595773853e-05, + "loss": 4.9142, + "step": 14909 + }, + { + "epoch": 0.08867399371966886, + "grad_norm": 1.3275173902511597, + "learning_rate": 4.9036317523676964e-05, + "loss": 5.032, + "step": 14910 + }, + { + "epoch": 0.08867994100294985, + "grad_norm": 1.5485349893569946, + "learning_rate": 4.903618908122458e-05, + "loss": 4.9252, + "step": 14911 + }, + { + "epoch": 0.08868588828623085, + "grad_norm": 1.4325098991394043, + "learning_rate": 4.9036060630381395e-05, + "loss": 4.9971, + "step": 14912 + }, + { + "epoch": 0.08869183556951185, + "grad_norm": 1.4953216314315796, + "learning_rate": 4.903593217114748e-05, + "loss": 4.8228, + "step": 14913 + }, + { + "epoch": 0.08869778285279284, + "grad_norm": 1.4761654138565063, + "learning_rate": 4.9035803703522876e-05, + "loss": 4.8365, + "step": 14914 + }, + { + "epoch": 0.08870373013607384, + "grad_norm": 1.3572559356689453, + "learning_rate": 4.9035675227507615e-05, + "loss": 4.8409, + "step": 14915 + }, + { + "epoch": 0.08870967741935484, + "grad_norm": 1.3793766498565674, + "learning_rate": 4.903554674310175e-05, + "loss": 4.8748, + "step": 14916 + }, + { + "epoch": 0.08871562470263583, + "grad_norm": 1.2097266912460327, + "learning_rate": 4.9035418250305314e-05, + "loss": 4.9695, + "step": 14917 + }, + { + "epoch": 0.08872157198591683, + "grad_norm": 1.5097788572311401, + "learning_rate": 4.903528974911837e-05, + "loss": 4.9205, + "step": 14918 + }, + { + "epoch": 0.08872751926919784, + "grad_norm": 1.474219560623169, + "learning_rate": 4.903516123954095e-05, + "loss": 4.9382, + "step": 14919 + }, + { + "epoch": 0.08873346655247882, + "grad_norm": 1.4695779085159302, + "learning_rate": 4.903503272157311e-05, + "loss": 5.1486, + "step": 14920 + }, + { + "epoch": 0.08873941383575983, + "grad_norm": 1.6874669790267944, + "learning_rate": 4.903490419521488e-05, + "loss": 5.6441, + "step": 14921 + }, + { + "epoch": 0.08874536111904083, + "grad_norm": 1.5862348079681396, + "learning_rate": 4.903477566046632e-05, + "loss": 5.1457, + "step": 14922 + }, + { + "epoch": 0.08875130840232182, + "grad_norm": 1.5781593322753906, + "learning_rate": 4.903464711732747e-05, + "loss": 4.915, + "step": 14923 + }, + { + "epoch": 0.08875725568560282, + "grad_norm": 1.5252950191497803, + "learning_rate": 4.903451856579837e-05, + "loss": 5.0672, + "step": 14924 + }, + { + "epoch": 0.08876320296888382, + "grad_norm": 1.575958013534546, + "learning_rate": 4.9034390005879065e-05, + "loss": 4.9914, + "step": 14925 + }, + { + "epoch": 0.08876915025216481, + "grad_norm": 1.3837618827819824, + "learning_rate": 4.90342614375696e-05, + "loss": 5.1778, + "step": 14926 + }, + { + "epoch": 0.08877509753544581, + "grad_norm": 1.4716275930404663, + "learning_rate": 4.9034132860870036e-05, + "loss": 5.2625, + "step": 14927 + }, + { + "epoch": 0.08878104481872681, + "grad_norm": 1.2883623838424683, + "learning_rate": 4.90340042757804e-05, + "loss": 5.2357, + "step": 14928 + }, + { + "epoch": 0.0887869921020078, + "grad_norm": 1.521010398864746, + "learning_rate": 4.9033875682300736e-05, + "loss": 5.4941, + "step": 14929 + }, + { + "epoch": 0.0887929393852888, + "grad_norm": 1.5457875728607178, + "learning_rate": 4.903374708043109e-05, + "loss": 5.3108, + "step": 14930 + }, + { + "epoch": 0.0887988866685698, + "grad_norm": 1.4583250284194946, + "learning_rate": 4.903361847017152e-05, + "loss": 5.425, + "step": 14931 + }, + { + "epoch": 0.08880483395185079, + "grad_norm": 1.561854362487793, + "learning_rate": 4.903348985152206e-05, + "loss": 5.4267, + "step": 14932 + }, + { + "epoch": 0.0888107812351318, + "grad_norm": 1.6274350881576538, + "learning_rate": 4.9033361224482756e-05, + "loss": 5.3266, + "step": 14933 + }, + { + "epoch": 0.08881672851841278, + "grad_norm": 1.3476616144180298, + "learning_rate": 4.903323258905366e-05, + "loss": 5.248, + "step": 14934 + }, + { + "epoch": 0.08882267580169378, + "grad_norm": 1.3584541082382202, + "learning_rate": 4.90331039452348e-05, + "loss": 5.3101, + "step": 14935 + }, + { + "epoch": 0.08882862308497479, + "grad_norm": 1.5269302129745483, + "learning_rate": 4.903297529302624e-05, + "loss": 5.3451, + "step": 14936 + }, + { + "epoch": 0.08883457036825577, + "grad_norm": 1.5320923328399658, + "learning_rate": 4.903284663242801e-05, + "loss": 5.4289, + "step": 14937 + }, + { + "epoch": 0.08884051765153678, + "grad_norm": 1.5647650957107544, + "learning_rate": 4.9032717963440166e-05, + "loss": 5.2925, + "step": 14938 + }, + { + "epoch": 0.08884646493481778, + "grad_norm": 1.3379693031311035, + "learning_rate": 4.9032589286062744e-05, + "loss": 5.3314, + "step": 14939 + }, + { + "epoch": 0.08885241221809877, + "grad_norm": 1.5872068405151367, + "learning_rate": 4.90324606002958e-05, + "loss": 5.3521, + "step": 14940 + }, + { + "epoch": 0.08885835950137977, + "grad_norm": 1.473799228668213, + "learning_rate": 4.9032331906139373e-05, + "loss": 5.3697, + "step": 14941 + }, + { + "epoch": 0.08886430678466077, + "grad_norm": 2.2111928462982178, + "learning_rate": 4.90322032035935e-05, + "loss": 5.0139, + "step": 14942 + }, + { + "epoch": 0.08887025406794176, + "grad_norm": 1.386910319328308, + "learning_rate": 4.903207449265824e-05, + "loss": 5.3982, + "step": 14943 + }, + { + "epoch": 0.08887620135122276, + "grad_norm": 1.4972623586654663, + "learning_rate": 4.9031945773333624e-05, + "loss": 5.4207, + "step": 14944 + }, + { + "epoch": 0.08888214863450376, + "grad_norm": 1.6061536073684692, + "learning_rate": 4.903181704561971e-05, + "loss": 5.4265, + "step": 14945 + }, + { + "epoch": 0.08888809591778475, + "grad_norm": 1.5003243684768677, + "learning_rate": 4.903168830951653e-05, + "loss": 5.2323, + "step": 14946 + }, + { + "epoch": 0.08889404320106575, + "grad_norm": 1.4466320276260376, + "learning_rate": 4.9031559565024144e-05, + "loss": 5.3054, + "step": 14947 + }, + { + "epoch": 0.08889999048434676, + "grad_norm": 1.4495269060134888, + "learning_rate": 4.9031430812142584e-05, + "loss": 5.2725, + "step": 14948 + }, + { + "epoch": 0.08890593776762774, + "grad_norm": 1.2909798622131348, + "learning_rate": 4.9031302050871896e-05, + "loss": 5.13, + "step": 14949 + }, + { + "epoch": 0.08891188505090875, + "grad_norm": 1.368377685546875, + "learning_rate": 4.903117328121214e-05, + "loss": 5.0471, + "step": 14950 + }, + { + "epoch": 0.08891783233418975, + "grad_norm": 1.3496042490005493, + "learning_rate": 4.903104450316334e-05, + "loss": 5.1209, + "step": 14951 + }, + { + "epoch": 0.08892377961747074, + "grad_norm": 1.593047022819519, + "learning_rate": 4.9030915716725554e-05, + "loss": 5.2551, + "step": 14952 + }, + { + "epoch": 0.08892972690075174, + "grad_norm": 1.3550326824188232, + "learning_rate": 4.903078692189882e-05, + "loss": 5.2543, + "step": 14953 + }, + { + "epoch": 0.08893567418403274, + "grad_norm": 1.4302785396575928, + "learning_rate": 4.903065811868319e-05, + "loss": 5.2828, + "step": 14954 + }, + { + "epoch": 0.08894162146731373, + "grad_norm": 1.578244686126709, + "learning_rate": 4.903052930707871e-05, + "loss": 5.0593, + "step": 14955 + }, + { + "epoch": 0.08894756875059473, + "grad_norm": 1.248634696006775, + "learning_rate": 4.903040048708541e-05, + "loss": 5.0644, + "step": 14956 + }, + { + "epoch": 0.08895351603387573, + "grad_norm": 1.4040237665176392, + "learning_rate": 4.903027165870336e-05, + "loss": 5.0951, + "step": 14957 + }, + { + "epoch": 0.08895946331715672, + "grad_norm": 1.1941477060317993, + "learning_rate": 4.903014282193258e-05, + "loss": 5.0298, + "step": 14958 + }, + { + "epoch": 0.08896541060043772, + "grad_norm": 1.4292995929718018, + "learning_rate": 4.9030013976773125e-05, + "loss": 5.1567, + "step": 14959 + }, + { + "epoch": 0.08897135788371872, + "grad_norm": 1.4789859056472778, + "learning_rate": 4.902988512322505e-05, + "loss": 5.2172, + "step": 14960 + }, + { + "epoch": 0.08897730516699971, + "grad_norm": 2.160266876220703, + "learning_rate": 4.9029756261288376e-05, + "loss": 5.3458, + "step": 14961 + }, + { + "epoch": 0.08898325245028071, + "grad_norm": 1.8164606094360352, + "learning_rate": 4.902962739096317e-05, + "loss": 5.2795, + "step": 14962 + }, + { + "epoch": 0.0889891997335617, + "grad_norm": 2.0879664421081543, + "learning_rate": 4.902949851224947e-05, + "loss": 5.595, + "step": 14963 + }, + { + "epoch": 0.0889951470168427, + "grad_norm": 2.59543514251709, + "learning_rate": 4.9029369625147324e-05, + "loss": 5.3626, + "step": 14964 + }, + { + "epoch": 0.0890010943001237, + "grad_norm": 2.0679430961608887, + "learning_rate": 4.9029240729656764e-05, + "loss": 5.4222, + "step": 14965 + }, + { + "epoch": 0.0890070415834047, + "grad_norm": 1.90644109249115, + "learning_rate": 4.902911182577785e-05, + "loss": 6.1042, + "step": 14966 + }, + { + "epoch": 0.0890129888666857, + "grad_norm": 1.8565638065338135, + "learning_rate": 4.9028982913510626e-05, + "loss": 6.0312, + "step": 14967 + }, + { + "epoch": 0.0890189361499667, + "grad_norm": 1.717623233795166, + "learning_rate": 4.902885399285512e-05, + "loss": 5.794, + "step": 14968 + }, + { + "epoch": 0.08902488343324769, + "grad_norm": 2.2094457149505615, + "learning_rate": 4.90287250638114e-05, + "loss": 5.2517, + "step": 14969 + }, + { + "epoch": 0.08903083071652869, + "grad_norm": 2.2559561729431152, + "learning_rate": 4.9028596126379493e-05, + "loss": 5.2155, + "step": 14970 + }, + { + "epoch": 0.08903677799980969, + "grad_norm": 2.5394740104675293, + "learning_rate": 4.9028467180559455e-05, + "loss": 5.0829, + "step": 14971 + }, + { + "epoch": 0.08904272528309068, + "grad_norm": 1.9542546272277832, + "learning_rate": 4.902833822635133e-05, + "loss": 4.856, + "step": 14972 + }, + { + "epoch": 0.08904867256637168, + "grad_norm": 1.9541314840316772, + "learning_rate": 4.9028209263755154e-05, + "loss": 4.9858, + "step": 14973 + }, + { + "epoch": 0.08905461984965268, + "grad_norm": 1.8625229597091675, + "learning_rate": 4.9028080292770986e-05, + "loss": 4.976, + "step": 14974 + }, + { + "epoch": 0.08906056713293367, + "grad_norm": 2.254417657852173, + "learning_rate": 4.9027951313398855e-05, + "loss": 4.9765, + "step": 14975 + }, + { + "epoch": 0.08906651441621467, + "grad_norm": 2.3143160343170166, + "learning_rate": 4.902782232563882e-05, + "loss": 4.9562, + "step": 14976 + }, + { + "epoch": 0.08907246169949568, + "grad_norm": 2.320388078689575, + "learning_rate": 4.902769332949092e-05, + "loss": 4.9988, + "step": 14977 + }, + { + "epoch": 0.08907840898277666, + "grad_norm": 2.378101348876953, + "learning_rate": 4.90275643249552e-05, + "loss": 5.0869, + "step": 14978 + }, + { + "epoch": 0.08908435626605767, + "grad_norm": 2.5663437843322754, + "learning_rate": 4.90274353120317e-05, + "loss": 5.1124, + "step": 14979 + }, + { + "epoch": 0.08909030354933867, + "grad_norm": 2.2866733074188232, + "learning_rate": 4.902730629072048e-05, + "loss": 5.0564, + "step": 14980 + }, + { + "epoch": 0.08909625083261966, + "grad_norm": 2.060153007507324, + "learning_rate": 4.902717726102157e-05, + "loss": 4.9419, + "step": 14981 + }, + { + "epoch": 0.08910219811590066, + "grad_norm": 2.1555984020233154, + "learning_rate": 4.902704822293502e-05, + "loss": 4.6593, + "step": 14982 + }, + { + "epoch": 0.08910814539918166, + "grad_norm": 2.2045845985412598, + "learning_rate": 4.902691917646088e-05, + "loss": 4.6824, + "step": 14983 + }, + { + "epoch": 0.08911409268246265, + "grad_norm": 2.2891733646392822, + "learning_rate": 4.9026790121599185e-05, + "loss": 4.6378, + "step": 14984 + }, + { + "epoch": 0.08912003996574365, + "grad_norm": 2.0503318309783936, + "learning_rate": 4.902666105834999e-05, + "loss": 4.8051, + "step": 14985 + }, + { + "epoch": 0.08912598724902465, + "grad_norm": 2.2125399112701416, + "learning_rate": 4.9026531986713336e-05, + "loss": 5.0773, + "step": 14986 + }, + { + "epoch": 0.08913193453230564, + "grad_norm": 2.1177804470062256, + "learning_rate": 4.902640290668927e-05, + "loss": 5.0995, + "step": 14987 + }, + { + "epoch": 0.08913788181558664, + "grad_norm": 2.1028857231140137, + "learning_rate": 4.902627381827783e-05, + "loss": 4.3883, + "step": 14988 + }, + { + "epoch": 0.08914382909886764, + "grad_norm": 1.9426429271697998, + "learning_rate": 4.9026144721479065e-05, + "loss": 4.6539, + "step": 14989 + }, + { + "epoch": 0.08914977638214863, + "grad_norm": 2.2325892448425293, + "learning_rate": 4.902601561629302e-05, + "loss": 4.731, + "step": 14990 + }, + { + "epoch": 0.08915572366542963, + "grad_norm": 2.3903300762176514, + "learning_rate": 4.9025886502719756e-05, + "loss": 4.5786, + "step": 14991 + }, + { + "epoch": 0.08916167094871062, + "grad_norm": 2.368431806564331, + "learning_rate": 4.9025757380759284e-05, + "loss": 4.8904, + "step": 14992 + }, + { + "epoch": 0.08916761823199162, + "grad_norm": 2.1727442741394043, + "learning_rate": 4.902562825041168e-05, + "loss": 4.6276, + "step": 14993 + }, + { + "epoch": 0.08917356551527263, + "grad_norm": 2.2038626670837402, + "learning_rate": 4.9025499111676975e-05, + "loss": 4.7451, + "step": 14994 + }, + { + "epoch": 0.08917951279855361, + "grad_norm": 2.3933217525482178, + "learning_rate": 4.902536996455521e-05, + "loss": 4.8129, + "step": 14995 + }, + { + "epoch": 0.08918546008183462, + "grad_norm": 2.473212242126465, + "learning_rate": 4.902524080904645e-05, + "loss": 4.6171, + "step": 14996 + }, + { + "epoch": 0.08919140736511562, + "grad_norm": 2.2226645946502686, + "learning_rate": 4.902511164515071e-05, + "loss": 4.3847, + "step": 14997 + }, + { + "epoch": 0.0891973546483966, + "grad_norm": 2.0874104499816895, + "learning_rate": 4.9024982472868065e-05, + "loss": 4.801, + "step": 14998 + }, + { + "epoch": 0.08920330193167761, + "grad_norm": 1.9831374883651733, + "learning_rate": 4.902485329219854e-05, + "loss": 4.8995, + "step": 14999 + }, + { + "epoch": 0.08920924921495861, + "grad_norm": 2.1662073135375977, + "learning_rate": 4.9024724103142196e-05, + "loss": 4.7221, + "step": 15000 + }, + { + "epoch": 0.0892151964982396, + "grad_norm": 2.335336685180664, + "learning_rate": 4.902459490569906e-05, + "loss": 4.5051, + "step": 15001 + }, + { + "epoch": 0.0892211437815206, + "grad_norm": 2.2647337913513184, + "learning_rate": 4.902446569986919e-05, + "loss": 4.5274, + "step": 15002 + }, + { + "epoch": 0.0892270910648016, + "grad_norm": 2.1781129837036133, + "learning_rate": 4.9024336485652625e-05, + "loss": 4.5661, + "step": 15003 + }, + { + "epoch": 0.08923303834808259, + "grad_norm": 2.6452128887176514, + "learning_rate": 4.902420726304941e-05, + "loss": 5.0087, + "step": 15004 + }, + { + "epoch": 0.0892389856313636, + "grad_norm": 2.10276460647583, + "learning_rate": 4.90240780320596e-05, + "loss": 4.5003, + "step": 15005 + }, + { + "epoch": 0.0892449329146446, + "grad_norm": 2.1297876834869385, + "learning_rate": 4.902394879268323e-05, + "loss": 4.7603, + "step": 15006 + }, + { + "epoch": 0.08925088019792558, + "grad_norm": 2.288257122039795, + "learning_rate": 4.902381954492033e-05, + "loss": 4.7433, + "step": 15007 + }, + { + "epoch": 0.08925682748120659, + "grad_norm": 2.422492742538452, + "learning_rate": 4.902369028877098e-05, + "loss": 4.7823, + "step": 15008 + }, + { + "epoch": 0.08926277476448759, + "grad_norm": 2.4264109134674072, + "learning_rate": 4.9023561024235215e-05, + "loss": 4.9725, + "step": 15009 + }, + { + "epoch": 0.08926872204776858, + "grad_norm": 2.191776752471924, + "learning_rate": 4.902343175131307e-05, + "loss": 4.7893, + "step": 15010 + }, + { + "epoch": 0.08927466933104958, + "grad_norm": 2.0434861183166504, + "learning_rate": 4.9023302470004584e-05, + "loss": 5.3321, + "step": 15011 + }, + { + "epoch": 0.08928061661433058, + "grad_norm": 2.3108692169189453, + "learning_rate": 4.902317318030981e-05, + "loss": 4.848, + "step": 15012 + }, + { + "epoch": 0.08928656389761157, + "grad_norm": 1.8814477920532227, + "learning_rate": 4.9023043882228805e-05, + "loss": 4.9666, + "step": 15013 + }, + { + "epoch": 0.08929251118089257, + "grad_norm": 1.7109707593917847, + "learning_rate": 4.902291457576159e-05, + "loss": 5.0996, + "step": 15014 + }, + { + "epoch": 0.08929845846417357, + "grad_norm": 1.4246928691864014, + "learning_rate": 4.902278526090823e-05, + "loss": 5.1413, + "step": 15015 + }, + { + "epoch": 0.08930440574745456, + "grad_norm": 1.5714298486709595, + "learning_rate": 4.902265593766877e-05, + "loss": 5.4028, + "step": 15016 + }, + { + "epoch": 0.08931035303073556, + "grad_norm": 1.4553309679031372, + "learning_rate": 4.902252660604324e-05, + "loss": 5.1903, + "step": 15017 + }, + { + "epoch": 0.08931630031401656, + "grad_norm": 1.3266233205795288, + "learning_rate": 4.902239726603171e-05, + "loss": 5.1093, + "step": 15018 + }, + { + "epoch": 0.08932224759729755, + "grad_norm": 1.3145966529846191, + "learning_rate": 4.902226791763419e-05, + "loss": 5.0704, + "step": 15019 + }, + { + "epoch": 0.08932819488057855, + "grad_norm": 1.4367384910583496, + "learning_rate": 4.9022138560850754e-05, + "loss": 4.9669, + "step": 15020 + }, + { + "epoch": 0.08933414216385954, + "grad_norm": 1.4239497184753418, + "learning_rate": 4.902200919568144e-05, + "loss": 5.1035, + "step": 15021 + }, + { + "epoch": 0.08934008944714054, + "grad_norm": 1.323853611946106, + "learning_rate": 4.9021879822126284e-05, + "loss": 4.989, + "step": 15022 + }, + { + "epoch": 0.08934603673042155, + "grad_norm": 1.596498727798462, + "learning_rate": 4.9021750440185345e-05, + "loss": 5.0445, + "step": 15023 + }, + { + "epoch": 0.08935198401370253, + "grad_norm": 1.3866841793060303, + "learning_rate": 4.902162104985865e-05, + "loss": 4.9832, + "step": 15024 + }, + { + "epoch": 0.08935793129698354, + "grad_norm": 1.2495089769363403, + "learning_rate": 4.9021491651146265e-05, + "loss": 5.1337, + "step": 15025 + }, + { + "epoch": 0.08936387858026454, + "grad_norm": 1.2082443237304688, + "learning_rate": 4.902136224404822e-05, + "loss": 5.1038, + "step": 15026 + }, + { + "epoch": 0.08936982586354553, + "grad_norm": 1.5153082609176636, + "learning_rate": 4.9021232828564564e-05, + "loss": 5.122, + "step": 15027 + }, + { + "epoch": 0.08937577314682653, + "grad_norm": 1.5340677499771118, + "learning_rate": 4.902110340469536e-05, + "loss": 5.2675, + "step": 15028 + }, + { + "epoch": 0.08938172043010753, + "grad_norm": 1.9367091655731201, + "learning_rate": 4.9020973972440624e-05, + "loss": 5.4528, + "step": 15029 + }, + { + "epoch": 0.08938766771338852, + "grad_norm": 1.7637518644332886, + "learning_rate": 4.902084453180041e-05, + "loss": 5.4686, + "step": 15030 + }, + { + "epoch": 0.08939361499666952, + "grad_norm": 1.668220043182373, + "learning_rate": 4.902071508277477e-05, + "loss": 5.5889, + "step": 15031 + }, + { + "epoch": 0.08939956227995052, + "grad_norm": 2.0754151344299316, + "learning_rate": 4.902058562536375e-05, + "loss": 5.7398, + "step": 15032 + }, + { + "epoch": 0.08940550956323151, + "grad_norm": 1.9756910800933838, + "learning_rate": 4.902045615956739e-05, + "loss": 5.528, + "step": 15033 + }, + { + "epoch": 0.08941145684651251, + "grad_norm": 1.6614958047866821, + "learning_rate": 4.9020326685385735e-05, + "loss": 5.5761, + "step": 15034 + }, + { + "epoch": 0.08941740412979352, + "grad_norm": 2.0193135738372803, + "learning_rate": 4.902019720281884e-05, + "loss": 5.1836, + "step": 15035 + }, + { + "epoch": 0.0894233514130745, + "grad_norm": 2.164290428161621, + "learning_rate": 4.9020067711866735e-05, + "loss": 5.0216, + "step": 15036 + }, + { + "epoch": 0.0894292986963555, + "grad_norm": 2.3957648277282715, + "learning_rate": 4.901993821252947e-05, + "loss": 4.9631, + "step": 15037 + }, + { + "epoch": 0.08943524597963651, + "grad_norm": 2.204258680343628, + "learning_rate": 4.90198087048071e-05, + "loss": 4.774, + "step": 15038 + }, + { + "epoch": 0.0894411932629175, + "grad_norm": 1.7879102230072021, + "learning_rate": 4.9019679188699666e-05, + "loss": 5.716, + "step": 15039 + }, + { + "epoch": 0.0894471405461985, + "grad_norm": 1.6019984483718872, + "learning_rate": 4.9019549664207196e-05, + "loss": 5.3657, + "step": 15040 + }, + { + "epoch": 0.0894530878294795, + "grad_norm": 2.079514741897583, + "learning_rate": 4.901942013132976e-05, + "loss": 5.0526, + "step": 15041 + }, + { + "epoch": 0.08945903511276049, + "grad_norm": 1.9381201267242432, + "learning_rate": 4.901929059006739e-05, + "loss": 4.9585, + "step": 15042 + }, + { + "epoch": 0.08946498239604149, + "grad_norm": 1.6514472961425781, + "learning_rate": 4.9019161040420134e-05, + "loss": 5.4721, + "step": 15043 + }, + { + "epoch": 0.08947092967932249, + "grad_norm": 1.7294371128082275, + "learning_rate": 4.901903148238804e-05, + "loss": 5.4401, + "step": 15044 + }, + { + "epoch": 0.08947687696260348, + "grad_norm": 1.7769347429275513, + "learning_rate": 4.901890191597115e-05, + "loss": 5.4324, + "step": 15045 + }, + { + "epoch": 0.08948282424588448, + "grad_norm": 1.6517225503921509, + "learning_rate": 4.9018772341169505e-05, + "loss": 5.2967, + "step": 15046 + }, + { + "epoch": 0.08948877152916548, + "grad_norm": 1.5310052633285522, + "learning_rate": 4.901864275798316e-05, + "loss": 5.4017, + "step": 15047 + }, + { + "epoch": 0.08949471881244647, + "grad_norm": 1.9703199863433838, + "learning_rate": 4.9018513166412146e-05, + "loss": 4.9813, + "step": 15048 + }, + { + "epoch": 0.08950066609572747, + "grad_norm": 1.991087555885315, + "learning_rate": 4.901838356645652e-05, + "loss": 5.2911, + "step": 15049 + }, + { + "epoch": 0.08950661337900846, + "grad_norm": 1.7992926836013794, + "learning_rate": 4.9018253958116334e-05, + "loss": 5.2996, + "step": 15050 + }, + { + "epoch": 0.08951256066228946, + "grad_norm": 1.5164752006530762, + "learning_rate": 4.901812434139161e-05, + "loss": 5.8002, + "step": 15051 + }, + { + "epoch": 0.08951850794557047, + "grad_norm": 1.8143075704574585, + "learning_rate": 4.9017994716282415e-05, + "loss": 5.241, + "step": 15052 + }, + { + "epoch": 0.08952445522885145, + "grad_norm": 1.9806342124938965, + "learning_rate": 4.9017865082788785e-05, + "loss": 5.3656, + "step": 15053 + }, + { + "epoch": 0.08953040251213246, + "grad_norm": 2.403789520263672, + "learning_rate": 4.901773544091077e-05, + "loss": 5.1024, + "step": 15054 + }, + { + "epoch": 0.08953634979541346, + "grad_norm": 1.5903408527374268, + "learning_rate": 4.90176057906484e-05, + "loss": 5.3849, + "step": 15055 + }, + { + "epoch": 0.08954229707869445, + "grad_norm": 1.764125943183899, + "learning_rate": 4.901747613200175e-05, + "loss": 5.0757, + "step": 15056 + }, + { + "epoch": 0.08954824436197545, + "grad_norm": 2.1031241416931152, + "learning_rate": 4.901734646497084e-05, + "loss": 5.2114, + "step": 15057 + }, + { + "epoch": 0.08955419164525645, + "grad_norm": 1.9965282678604126, + "learning_rate": 4.901721678955571e-05, + "loss": 5.1136, + "step": 15058 + }, + { + "epoch": 0.08956013892853744, + "grad_norm": 1.9062676429748535, + "learning_rate": 4.9017087105756434e-05, + "loss": 4.9166, + "step": 15059 + }, + { + "epoch": 0.08956608621181844, + "grad_norm": 2.0963199138641357, + "learning_rate": 4.901695741357303e-05, + "loss": 4.7587, + "step": 15060 + }, + { + "epoch": 0.08957203349509944, + "grad_norm": 1.7062407732009888, + "learning_rate": 4.901682771300556e-05, + "loss": 5.3046, + "step": 15061 + }, + { + "epoch": 0.08957798077838043, + "grad_norm": 1.574013352394104, + "learning_rate": 4.9016698004054065e-05, + "loss": 5.3007, + "step": 15062 + }, + { + "epoch": 0.08958392806166143, + "grad_norm": 1.7540260553359985, + "learning_rate": 4.9016568286718586e-05, + "loss": 5.5824, + "step": 15063 + }, + { + "epoch": 0.08958987534494244, + "grad_norm": 1.4875624179840088, + "learning_rate": 4.901643856099917e-05, + "loss": 5.4569, + "step": 15064 + }, + { + "epoch": 0.08959582262822342, + "grad_norm": 1.6023603677749634, + "learning_rate": 4.901630882689586e-05, + "loss": 5.5397, + "step": 15065 + }, + { + "epoch": 0.08960176991150443, + "grad_norm": 2.1851913928985596, + "learning_rate": 4.9016179084408706e-05, + "loss": 4.9882, + "step": 15066 + }, + { + "epoch": 0.08960771719478543, + "grad_norm": 1.4636015892028809, + "learning_rate": 4.901604933353776e-05, + "loss": 5.4568, + "step": 15067 + }, + { + "epoch": 0.08961366447806642, + "grad_norm": 2.6841142177581787, + "learning_rate": 4.901591957428305e-05, + "loss": 5.8365, + "step": 15068 + }, + { + "epoch": 0.08961961176134742, + "grad_norm": 2.2015743255615234, + "learning_rate": 4.9015789806644643e-05, + "loss": 5.4798, + "step": 15069 + }, + { + "epoch": 0.08962555904462842, + "grad_norm": 2.3934903144836426, + "learning_rate": 4.901566003062256e-05, + "loss": 5.3355, + "step": 15070 + }, + { + "epoch": 0.08963150632790941, + "grad_norm": 2.418919801712036, + "learning_rate": 4.9015530246216866e-05, + "loss": 5.2546, + "step": 15071 + }, + { + "epoch": 0.08963745361119041, + "grad_norm": 2.2773303985595703, + "learning_rate": 4.90154004534276e-05, + "loss": 5.3306, + "step": 15072 + }, + { + "epoch": 0.08964340089447141, + "grad_norm": 2.09413743019104, + "learning_rate": 4.9015270652254796e-05, + "loss": 5.4715, + "step": 15073 + }, + { + "epoch": 0.0896493481777524, + "grad_norm": 1.8905339241027832, + "learning_rate": 4.901514084269852e-05, + "loss": 5.2248, + "step": 15074 + }, + { + "epoch": 0.0896552954610334, + "grad_norm": 1.7001872062683105, + "learning_rate": 4.9015011024758794e-05, + "loss": 5.2869, + "step": 15075 + }, + { + "epoch": 0.0896612427443144, + "grad_norm": 1.7953561544418335, + "learning_rate": 4.901488119843568e-05, + "loss": 5.2027, + "step": 15076 + }, + { + "epoch": 0.08966719002759539, + "grad_norm": 1.8996349573135376, + "learning_rate": 4.9014751363729225e-05, + "loss": 5.8168, + "step": 15077 + }, + { + "epoch": 0.0896731373108764, + "grad_norm": 1.6294323205947876, + "learning_rate": 4.901462152063946e-05, + "loss": 5.0331, + "step": 15078 + }, + { + "epoch": 0.08967908459415738, + "grad_norm": 1.4392082691192627, + "learning_rate": 4.901449166916645e-05, + "loss": 4.9094, + "step": 15079 + }, + { + "epoch": 0.08968503187743838, + "grad_norm": 1.6613532304763794, + "learning_rate": 4.9014361809310216e-05, + "loss": 5.1426, + "step": 15080 + }, + { + "epoch": 0.08969097916071939, + "grad_norm": 1.7502686977386475, + "learning_rate": 4.9014231941070823e-05, + "loss": 5.4298, + "step": 15081 + }, + { + "epoch": 0.08969692644400037, + "grad_norm": 1.9276418685913086, + "learning_rate": 4.9014102064448305e-05, + "loss": 5.8383, + "step": 15082 + }, + { + "epoch": 0.08970287372728138, + "grad_norm": 2.471407651901245, + "learning_rate": 4.901397217944272e-05, + "loss": 6.1879, + "step": 15083 + }, + { + "epoch": 0.08970882101056238, + "grad_norm": 2.0759341716766357, + "learning_rate": 4.90138422860541e-05, + "loss": 6.0929, + "step": 15084 + }, + { + "epoch": 0.08971476829384337, + "grad_norm": 1.6504180431365967, + "learning_rate": 4.9013712384282505e-05, + "loss": 6.0733, + "step": 15085 + }, + { + "epoch": 0.08972071557712437, + "grad_norm": 1.7268849611282349, + "learning_rate": 4.9013582474127965e-05, + "loss": 5.9707, + "step": 15086 + }, + { + "epoch": 0.08972666286040537, + "grad_norm": 1.8029861450195312, + "learning_rate": 4.901345255559053e-05, + "loss": 5.3645, + "step": 15087 + }, + { + "epoch": 0.08973261014368636, + "grad_norm": 1.8240137100219727, + "learning_rate": 4.9013322628670246e-05, + "loss": 5.4201, + "step": 15088 + }, + { + "epoch": 0.08973855742696736, + "grad_norm": 1.799771785736084, + "learning_rate": 4.901319269336716e-05, + "loss": 5.2043, + "step": 15089 + }, + { + "epoch": 0.08974450471024836, + "grad_norm": 1.6271024942398071, + "learning_rate": 4.901306274968131e-05, + "loss": 5.4118, + "step": 15090 + }, + { + "epoch": 0.08975045199352935, + "grad_norm": 1.4443042278289795, + "learning_rate": 4.9012932797612756e-05, + "loss": 5.5921, + "step": 15091 + }, + { + "epoch": 0.08975639927681035, + "grad_norm": 1.7174689769744873, + "learning_rate": 4.9012802837161535e-05, + "loss": 5.5233, + "step": 15092 + }, + { + "epoch": 0.08976234656009136, + "grad_norm": 1.7158472537994385, + "learning_rate": 4.901267286832769e-05, + "loss": 5.9171, + "step": 15093 + }, + { + "epoch": 0.08976829384337234, + "grad_norm": 1.691797137260437, + "learning_rate": 4.9012542891111275e-05, + "loss": 5.6207, + "step": 15094 + }, + { + "epoch": 0.08977424112665335, + "grad_norm": 1.7525362968444824, + "learning_rate": 4.901241290551233e-05, + "loss": 5.3468, + "step": 15095 + }, + { + "epoch": 0.08978018840993435, + "grad_norm": 1.6895235776901245, + "learning_rate": 4.901228291153089e-05, + "loss": 5.3567, + "step": 15096 + }, + { + "epoch": 0.08978613569321534, + "grad_norm": 1.6617051362991333, + "learning_rate": 4.9012152909167015e-05, + "loss": 5.6781, + "step": 15097 + }, + { + "epoch": 0.08979208297649634, + "grad_norm": 1.5234577655792236, + "learning_rate": 4.901202289842075e-05, + "loss": 5.6262, + "step": 15098 + }, + { + "epoch": 0.08979803025977734, + "grad_norm": 2.1545703411102295, + "learning_rate": 4.9011892879292125e-05, + "loss": 5.3112, + "step": 15099 + }, + { + "epoch": 0.08980397754305833, + "grad_norm": 2.246051073074341, + "learning_rate": 4.9011762851781204e-05, + "loss": 5.3783, + "step": 15100 + }, + { + "epoch": 0.08980992482633933, + "grad_norm": 2.000429630279541, + "learning_rate": 4.901163281588802e-05, + "loss": 5.2561, + "step": 15101 + }, + { + "epoch": 0.08981587210962033, + "grad_norm": 2.0881898403167725, + "learning_rate": 4.901150277161263e-05, + "loss": 5.3308, + "step": 15102 + }, + { + "epoch": 0.08982181939290132, + "grad_norm": 2.4498097896575928, + "learning_rate": 4.901137271895506e-05, + "loss": 5.8405, + "step": 15103 + }, + { + "epoch": 0.08982776667618232, + "grad_norm": 2.210160732269287, + "learning_rate": 4.901124265791538e-05, + "loss": 5.5462, + "step": 15104 + }, + { + "epoch": 0.08983371395946332, + "grad_norm": 2.366419553756714, + "learning_rate": 4.9011112588493625e-05, + "loss": 5.4069, + "step": 15105 + }, + { + "epoch": 0.08983966124274431, + "grad_norm": 1.812118649482727, + "learning_rate": 4.901098251068983e-05, + "loss": 5.9549, + "step": 15106 + }, + { + "epoch": 0.08984560852602531, + "grad_norm": 1.6506917476654053, + "learning_rate": 4.901085242450405e-05, + "loss": 5.762, + "step": 15107 + }, + { + "epoch": 0.0898515558093063, + "grad_norm": 1.8076404333114624, + "learning_rate": 4.901072232993633e-05, + "loss": 5.7841, + "step": 15108 + }, + { + "epoch": 0.0898575030925873, + "grad_norm": 2.51157546043396, + "learning_rate": 4.9010592226986716e-05, + "loss": 5.1544, + "step": 15109 + }, + { + "epoch": 0.0898634503758683, + "grad_norm": 1.9424755573272705, + "learning_rate": 4.901046211565526e-05, + "loss": 5.4587, + "step": 15110 + }, + { + "epoch": 0.0898693976591493, + "grad_norm": 1.998506784439087, + "learning_rate": 4.9010331995941995e-05, + "loss": 5.8242, + "step": 15111 + }, + { + "epoch": 0.0898753449424303, + "grad_norm": 1.8947205543518066, + "learning_rate": 4.901020186784697e-05, + "loss": 5.4488, + "step": 15112 + }, + { + "epoch": 0.0898812922257113, + "grad_norm": 1.905993938446045, + "learning_rate": 4.901007173137022e-05, + "loss": 5.3882, + "step": 15113 + }, + { + "epoch": 0.08988723950899229, + "grad_norm": 1.723973274230957, + "learning_rate": 4.900994158651182e-05, + "loss": 5.9411, + "step": 15114 + }, + { + "epoch": 0.08989318679227329, + "grad_norm": 1.747159719467163, + "learning_rate": 4.900981143327179e-05, + "loss": 5.8436, + "step": 15115 + }, + { + "epoch": 0.08989913407555429, + "grad_norm": 1.7400517463684082, + "learning_rate": 4.900968127165018e-05, + "loss": 5.7067, + "step": 15116 + }, + { + "epoch": 0.08990508135883528, + "grad_norm": 1.763750433921814, + "learning_rate": 4.900955110164704e-05, + "loss": 5.6198, + "step": 15117 + }, + { + "epoch": 0.08991102864211628, + "grad_norm": 1.9004894495010376, + "learning_rate": 4.9009420923262416e-05, + "loss": 5.0977, + "step": 15118 + }, + { + "epoch": 0.08991697592539728, + "grad_norm": 1.6853641271591187, + "learning_rate": 4.900929073649635e-05, + "loss": 5.5213, + "step": 15119 + }, + { + "epoch": 0.08992292320867827, + "grad_norm": 1.7032074928283691, + "learning_rate": 4.900916054134889e-05, + "loss": 5.3764, + "step": 15120 + }, + { + "epoch": 0.08992887049195927, + "grad_norm": 1.623089075088501, + "learning_rate": 4.9009030337820084e-05, + "loss": 5.525, + "step": 15121 + }, + { + "epoch": 0.08993481777524027, + "grad_norm": 1.6154295206069946, + "learning_rate": 4.900890012590996e-05, + "loss": 5.7378, + "step": 15122 + }, + { + "epoch": 0.08994076505852126, + "grad_norm": 1.8368462324142456, + "learning_rate": 4.900876990561859e-05, + "loss": 5.4768, + "step": 15123 + }, + { + "epoch": 0.08994671234180227, + "grad_norm": 1.7773829698562622, + "learning_rate": 4.9008639676946e-05, + "loss": 5.419, + "step": 15124 + }, + { + "epoch": 0.08995265962508327, + "grad_norm": 1.625287413597107, + "learning_rate": 4.9008509439892244e-05, + "loss": 5.4727, + "step": 15125 + }, + { + "epoch": 0.08995860690836426, + "grad_norm": 1.6234408617019653, + "learning_rate": 4.9008379194457364e-05, + "loss": 5.413, + "step": 15126 + }, + { + "epoch": 0.08996455419164526, + "grad_norm": 1.7441129684448242, + "learning_rate": 4.900824894064141e-05, + "loss": 5.2681, + "step": 15127 + }, + { + "epoch": 0.08997050147492626, + "grad_norm": 1.8756482601165771, + "learning_rate": 4.900811867844443e-05, + "loss": 5.5319, + "step": 15128 + }, + { + "epoch": 0.08997644875820725, + "grad_norm": 1.9200249910354614, + "learning_rate": 4.900798840786645e-05, + "loss": 4.7499, + "step": 15129 + }, + { + "epoch": 0.08998239604148825, + "grad_norm": 2.4838919639587402, + "learning_rate": 4.900785812890753e-05, + "loss": 5.0713, + "step": 15130 + }, + { + "epoch": 0.08998834332476925, + "grad_norm": 2.1441292762756348, + "learning_rate": 4.900772784156773e-05, + "loss": 4.9425, + "step": 15131 + }, + { + "epoch": 0.08999429060805024, + "grad_norm": 2.0838072299957275, + "learning_rate": 4.9007597545847066e-05, + "loss": 5.0632, + "step": 15132 + }, + { + "epoch": 0.09000023789133124, + "grad_norm": 1.630042314529419, + "learning_rate": 4.90074672417456e-05, + "loss": 5.2275, + "step": 15133 + }, + { + "epoch": 0.09000618517461224, + "grad_norm": 2.336031675338745, + "learning_rate": 4.900733692926338e-05, + "loss": 4.9596, + "step": 15134 + }, + { + "epoch": 0.09001213245789323, + "grad_norm": 2.414837598800659, + "learning_rate": 4.9007206608400446e-05, + "loss": 4.7405, + "step": 15135 + }, + { + "epoch": 0.09001807974117423, + "grad_norm": 2.2872564792633057, + "learning_rate": 4.900707627915684e-05, + "loss": 4.8294, + "step": 15136 + }, + { + "epoch": 0.09002402702445522, + "grad_norm": 2.474933624267578, + "learning_rate": 4.9006945941532615e-05, + "loss": 4.882, + "step": 15137 + }, + { + "epoch": 0.09002997430773622, + "grad_norm": 2.170109987258911, + "learning_rate": 4.900681559552781e-05, + "loss": 4.6778, + "step": 15138 + }, + { + "epoch": 0.09003592159101723, + "grad_norm": 2.1962943077087402, + "learning_rate": 4.900668524114248e-05, + "loss": 4.8201, + "step": 15139 + }, + { + "epoch": 0.09004186887429821, + "grad_norm": 2.46073317527771, + "learning_rate": 4.9006554878376656e-05, + "loss": 4.6929, + "step": 15140 + }, + { + "epoch": 0.09004781615757922, + "grad_norm": 2.4591431617736816, + "learning_rate": 4.90064245072304e-05, + "loss": 4.711, + "step": 15141 + }, + { + "epoch": 0.09005376344086022, + "grad_norm": 2.2225937843322754, + "learning_rate": 4.9006294127703745e-05, + "loss": 5.2556, + "step": 15142 + }, + { + "epoch": 0.0900597107241412, + "grad_norm": 2.3457517623901367, + "learning_rate": 4.900616373979674e-05, + "loss": 5.7773, + "step": 15143 + }, + { + "epoch": 0.09006565800742221, + "grad_norm": 2.226430892944336, + "learning_rate": 4.9006033343509436e-05, + "loss": 5.6364, + "step": 15144 + }, + { + "epoch": 0.09007160529070321, + "grad_norm": 2.1407759189605713, + "learning_rate": 4.900590293884186e-05, + "loss": 5.4202, + "step": 15145 + }, + { + "epoch": 0.0900775525739842, + "grad_norm": 1.7371548414230347, + "learning_rate": 4.9005772525794084e-05, + "loss": 5.5686, + "step": 15146 + }, + { + "epoch": 0.0900834998572652, + "grad_norm": 1.8759154081344604, + "learning_rate": 4.900564210436615e-05, + "loss": 5.4824, + "step": 15147 + }, + { + "epoch": 0.0900894471405462, + "grad_norm": 1.8595685958862305, + "learning_rate": 4.900551167455807e-05, + "loss": 5.6123, + "step": 15148 + }, + { + "epoch": 0.09009539442382719, + "grad_norm": 2.0119471549987793, + "learning_rate": 4.900538123636993e-05, + "loss": 5.5925, + "step": 15149 + }, + { + "epoch": 0.09010134170710819, + "grad_norm": 1.9375147819519043, + "learning_rate": 4.900525078980176e-05, + "loss": 5.5707, + "step": 15150 + }, + { + "epoch": 0.0901072889903892, + "grad_norm": 1.7323594093322754, + "learning_rate": 4.9005120334853595e-05, + "loss": 5.4133, + "step": 15151 + }, + { + "epoch": 0.09011323627367018, + "grad_norm": 1.7680727243423462, + "learning_rate": 4.90049898715255e-05, + "loss": 5.5954, + "step": 15152 + }, + { + "epoch": 0.09011918355695119, + "grad_norm": 1.8436721563339233, + "learning_rate": 4.9004859399817505e-05, + "loss": 5.5689, + "step": 15153 + }, + { + "epoch": 0.09012513084023219, + "grad_norm": 1.8080954551696777, + "learning_rate": 4.9004728919729664e-05, + "loss": 5.5266, + "step": 15154 + }, + { + "epoch": 0.09013107812351318, + "grad_norm": 2.2874748706817627, + "learning_rate": 4.900459843126202e-05, + "loss": 5.1985, + "step": 15155 + }, + { + "epoch": 0.09013702540679418, + "grad_norm": 1.8425899744033813, + "learning_rate": 4.900446793441462e-05, + "loss": 5.2856, + "step": 15156 + }, + { + "epoch": 0.09014297269007518, + "grad_norm": 1.6970654726028442, + "learning_rate": 4.900433742918751e-05, + "loss": 5.8597, + "step": 15157 + }, + { + "epoch": 0.09014891997335617, + "grad_norm": 2.3444008827209473, + "learning_rate": 4.9004206915580726e-05, + "loss": 4.4653, + "step": 15158 + }, + { + "epoch": 0.09015486725663717, + "grad_norm": 2.0390350818634033, + "learning_rate": 4.9004076393594325e-05, + "loss": 4.6565, + "step": 15159 + }, + { + "epoch": 0.09016081453991817, + "grad_norm": 2.0733320713043213, + "learning_rate": 4.900394586322835e-05, + "loss": 4.6052, + "step": 15160 + }, + { + "epoch": 0.09016676182319916, + "grad_norm": 1.9700855016708374, + "learning_rate": 4.9003815324482846e-05, + "loss": 4.7535, + "step": 15161 + }, + { + "epoch": 0.09017270910648016, + "grad_norm": 2.0294783115386963, + "learning_rate": 4.900368477735786e-05, + "loss": 5.4154, + "step": 15162 + }, + { + "epoch": 0.09017865638976116, + "grad_norm": 1.8937848806381226, + "learning_rate": 4.900355422185343e-05, + "loss": 5.3244, + "step": 15163 + }, + { + "epoch": 0.09018460367304215, + "grad_norm": 1.7404329776763916, + "learning_rate": 4.900342365796961e-05, + "loss": 5.887, + "step": 15164 + }, + { + "epoch": 0.09019055095632315, + "grad_norm": 1.5309412479400635, + "learning_rate": 4.9003293085706446e-05, + "loss": 5.4574, + "step": 15165 + }, + { + "epoch": 0.09019649823960414, + "grad_norm": 2.10003662109375, + "learning_rate": 4.9003162505063976e-05, + "loss": 5.2962, + "step": 15166 + }, + { + "epoch": 0.09020244552288514, + "grad_norm": 2.7704551219940186, + "learning_rate": 4.900303191604225e-05, + "loss": 4.6386, + "step": 15167 + }, + { + "epoch": 0.09020839280616615, + "grad_norm": 3.3551974296569824, + "learning_rate": 4.9002901318641314e-05, + "loss": 5.3348, + "step": 15168 + }, + { + "epoch": 0.09021434008944713, + "grad_norm": 2.8300132751464844, + "learning_rate": 4.9002770712861216e-05, + "loss": 5.2031, + "step": 15169 + }, + { + "epoch": 0.09022028737272814, + "grad_norm": 1.77587890625, + "learning_rate": 4.9002640098702005e-05, + "loss": 5.1371, + "step": 15170 + }, + { + "epoch": 0.09022623465600914, + "grad_norm": 1.694191575050354, + "learning_rate": 4.900250947616371e-05, + "loss": 5.7283, + "step": 15171 + }, + { + "epoch": 0.09023218193929013, + "grad_norm": 1.6392415761947632, + "learning_rate": 4.900237884524638e-05, + "loss": 5.3856, + "step": 15172 + }, + { + "epoch": 0.09023812922257113, + "grad_norm": 2.302626371383667, + "learning_rate": 4.900224820595008e-05, + "loss": 5.1007, + "step": 15173 + }, + { + "epoch": 0.09024407650585213, + "grad_norm": 2.296760082244873, + "learning_rate": 4.900211755827484e-05, + "loss": 5.0303, + "step": 15174 + }, + { + "epoch": 0.09025002378913312, + "grad_norm": 2.2914488315582275, + "learning_rate": 4.9001986902220706e-05, + "loss": 5.3176, + "step": 15175 + }, + { + "epoch": 0.09025597107241412, + "grad_norm": 2.084686756134033, + "learning_rate": 4.900185623778774e-05, + "loss": 5.2028, + "step": 15176 + }, + { + "epoch": 0.09026191835569512, + "grad_norm": 1.9465001821517944, + "learning_rate": 4.9001725564975953e-05, + "loss": 4.661, + "step": 15177 + }, + { + "epoch": 0.09026786563897611, + "grad_norm": 2.926347494125366, + "learning_rate": 4.900159488378542e-05, + "loss": 4.4579, + "step": 15178 + }, + { + "epoch": 0.09027381292225711, + "grad_norm": 2.6047539710998535, + "learning_rate": 4.900146419421619e-05, + "loss": 4.5486, + "step": 15179 + }, + { + "epoch": 0.09027976020553811, + "grad_norm": 2.4737868309020996, + "learning_rate": 4.9001333496268274e-05, + "loss": 4.3661, + "step": 15180 + }, + { + "epoch": 0.0902857074888191, + "grad_norm": 2.075547456741333, + "learning_rate": 4.900120278994176e-05, + "loss": 4.3157, + "step": 15181 + }, + { + "epoch": 0.0902916547721001, + "grad_norm": 2.509284019470215, + "learning_rate": 4.900107207523666e-05, + "loss": 4.2558, + "step": 15182 + }, + { + "epoch": 0.09029760205538111, + "grad_norm": 2.4345662593841553, + "learning_rate": 4.9000941352153046e-05, + "loss": 4.2932, + "step": 15183 + }, + { + "epoch": 0.0903035493386621, + "grad_norm": 2.214146137237549, + "learning_rate": 4.9000810620690945e-05, + "loss": 4.6953, + "step": 15184 + }, + { + "epoch": 0.0903094966219431, + "grad_norm": 2.197709083557129, + "learning_rate": 4.900067988085041e-05, + "loss": 4.7138, + "step": 15185 + }, + { + "epoch": 0.0903154439052241, + "grad_norm": 2.0381791591644287, + "learning_rate": 4.900054913263148e-05, + "loss": 6.1924, + "step": 15186 + }, + { + "epoch": 0.09032139118850509, + "grad_norm": 1.7017699480056763, + "learning_rate": 4.900041837603422e-05, + "loss": 6.1646, + "step": 15187 + }, + { + "epoch": 0.09032733847178609, + "grad_norm": 1.5804365873336792, + "learning_rate": 4.9000287611058645e-05, + "loss": 6.1757, + "step": 15188 + }, + { + "epoch": 0.09033328575506709, + "grad_norm": 1.6158896684646606, + "learning_rate": 4.9000156837704836e-05, + "loss": 6.1136, + "step": 15189 + }, + { + "epoch": 0.09033923303834808, + "grad_norm": 1.9524257183074951, + "learning_rate": 4.90000260559728e-05, + "loss": 5.43, + "step": 15190 + }, + { + "epoch": 0.09034518032162908, + "grad_norm": 1.835134744644165, + "learning_rate": 4.899989526586261e-05, + "loss": 6.0223, + "step": 15191 + }, + { + "epoch": 0.09035112760491008, + "grad_norm": 1.7213332653045654, + "learning_rate": 4.899976446737432e-05, + "loss": 5.7823, + "step": 15192 + }, + { + "epoch": 0.09035707488819107, + "grad_norm": 1.8744465112686157, + "learning_rate": 4.899963366050795e-05, + "loss": 5.0549, + "step": 15193 + }, + { + "epoch": 0.09036302217147207, + "grad_norm": 1.800979495048523, + "learning_rate": 4.899950284526355e-05, + "loss": 5.0726, + "step": 15194 + }, + { + "epoch": 0.09036896945475306, + "grad_norm": 1.7476063966751099, + "learning_rate": 4.899937202164118e-05, + "loss": 4.9177, + "step": 15195 + }, + { + "epoch": 0.09037491673803406, + "grad_norm": 1.5107455253601074, + "learning_rate": 4.899924118964087e-05, + "loss": 5.1873, + "step": 15196 + }, + { + "epoch": 0.09038086402131507, + "grad_norm": 1.4630497694015503, + "learning_rate": 4.899911034926267e-05, + "loss": 4.9166, + "step": 15197 + }, + { + "epoch": 0.09038681130459605, + "grad_norm": 1.519824743270874, + "learning_rate": 4.899897950050664e-05, + "loss": 4.9084, + "step": 15198 + }, + { + "epoch": 0.09039275858787706, + "grad_norm": 1.480298399925232, + "learning_rate": 4.899884864337281e-05, + "loss": 4.8724, + "step": 15199 + }, + { + "epoch": 0.09039870587115806, + "grad_norm": 1.549485445022583, + "learning_rate": 4.8998717777861224e-05, + "loss": 4.8378, + "step": 15200 + }, + { + "epoch": 0.09040465315443905, + "grad_norm": 1.6650373935699463, + "learning_rate": 4.8998586903971936e-05, + "loss": 4.9478, + "step": 15201 + }, + { + "epoch": 0.09041060043772005, + "grad_norm": 1.5880005359649658, + "learning_rate": 4.899845602170499e-05, + "loss": 4.7952, + "step": 15202 + }, + { + "epoch": 0.09041654772100105, + "grad_norm": 1.5553892850875854, + "learning_rate": 4.899832513106043e-05, + "loss": 4.9303, + "step": 15203 + }, + { + "epoch": 0.09042249500428204, + "grad_norm": 1.5907729864120483, + "learning_rate": 4.899819423203831e-05, + "loss": 4.7951, + "step": 15204 + }, + { + "epoch": 0.09042844228756304, + "grad_norm": 1.5885943174362183, + "learning_rate": 4.899806332463866e-05, + "loss": 4.8896, + "step": 15205 + }, + { + "epoch": 0.09043438957084404, + "grad_norm": 1.7483280897140503, + "learning_rate": 4.899793240886154e-05, + "loss": 5.6137, + "step": 15206 + }, + { + "epoch": 0.09044033685412503, + "grad_norm": 1.7883373498916626, + "learning_rate": 4.8997801484706984e-05, + "loss": 5.7183, + "step": 15207 + }, + { + "epoch": 0.09044628413740603, + "grad_norm": 1.7988712787628174, + "learning_rate": 4.8997670552175044e-05, + "loss": 5.7979, + "step": 15208 + }, + { + "epoch": 0.09045223142068703, + "grad_norm": 2.1793367862701416, + "learning_rate": 4.899753961126577e-05, + "loss": 5.3549, + "step": 15209 + }, + { + "epoch": 0.09045817870396802, + "grad_norm": 2.117983341217041, + "learning_rate": 4.8997408661979194e-05, + "loss": 5.1934, + "step": 15210 + }, + { + "epoch": 0.09046412598724902, + "grad_norm": 2.1799557209014893, + "learning_rate": 4.899727770431538e-05, + "loss": 5.2521, + "step": 15211 + }, + { + "epoch": 0.09047007327053003, + "grad_norm": 2.117403745651245, + "learning_rate": 4.8997146738274355e-05, + "loss": 5.3379, + "step": 15212 + }, + { + "epoch": 0.09047602055381102, + "grad_norm": 1.59669828414917, + "learning_rate": 4.899701576385619e-05, + "loss": 5.375, + "step": 15213 + }, + { + "epoch": 0.09048196783709202, + "grad_norm": 1.6929266452789307, + "learning_rate": 4.8996884781060907e-05, + "loss": 5.9243, + "step": 15214 + }, + { + "epoch": 0.09048791512037302, + "grad_norm": 1.8353838920593262, + "learning_rate": 4.899675378988855e-05, + "loss": 5.9216, + "step": 15215 + }, + { + "epoch": 0.09049386240365401, + "grad_norm": 1.6468323469161987, + "learning_rate": 4.899662279033918e-05, + "loss": 6.0171, + "step": 15216 + }, + { + "epoch": 0.09049980968693501, + "grad_norm": 1.4748890399932861, + "learning_rate": 4.899649178241284e-05, + "loss": 5.6775, + "step": 15217 + }, + { + "epoch": 0.09050575697021601, + "grad_norm": 1.8783589601516724, + "learning_rate": 4.8996360766109576e-05, + "loss": 5.7625, + "step": 15218 + }, + { + "epoch": 0.090511704253497, + "grad_norm": 1.7860721349716187, + "learning_rate": 4.8996229741429416e-05, + "loss": 5.7512, + "step": 15219 + }, + { + "epoch": 0.090517651536778, + "grad_norm": 1.7337830066680908, + "learning_rate": 4.899609870837243e-05, + "loss": 5.8233, + "step": 15220 + }, + { + "epoch": 0.090523598820059, + "grad_norm": 1.9256298542022705, + "learning_rate": 4.899596766693865e-05, + "loss": 5.8586, + "step": 15221 + }, + { + "epoch": 0.09052954610333999, + "grad_norm": 1.814205288887024, + "learning_rate": 4.8995836617128135e-05, + "loss": 5.4852, + "step": 15222 + }, + { + "epoch": 0.090535493386621, + "grad_norm": 1.8664608001708984, + "learning_rate": 4.899570555894091e-05, + "loss": 5.6847, + "step": 15223 + }, + { + "epoch": 0.09054144066990198, + "grad_norm": 1.8377459049224854, + "learning_rate": 4.899557449237704e-05, + "loss": 5.8869, + "step": 15224 + }, + { + "epoch": 0.09054738795318298, + "grad_norm": 1.788875937461853, + "learning_rate": 4.899544341743656e-05, + "loss": 5.4372, + "step": 15225 + }, + { + "epoch": 0.09055333523646399, + "grad_norm": 1.8490506410598755, + "learning_rate": 4.899531233411951e-05, + "loss": 6.1163, + "step": 15226 + }, + { + "epoch": 0.09055928251974497, + "grad_norm": 2.14841628074646, + "learning_rate": 4.8995181242425955e-05, + "loss": 6.1154, + "step": 15227 + }, + { + "epoch": 0.09056522980302598, + "grad_norm": 2.051154851913452, + "learning_rate": 4.899505014235593e-05, + "loss": 4.9326, + "step": 15228 + }, + { + "epoch": 0.09057117708630698, + "grad_norm": 2.071126937866211, + "learning_rate": 4.899491903390948e-05, + "loss": 4.8831, + "step": 15229 + }, + { + "epoch": 0.09057712436958797, + "grad_norm": 2.0155231952667236, + "learning_rate": 4.899478791708665e-05, + "loss": 4.87, + "step": 15230 + }, + { + "epoch": 0.09058307165286897, + "grad_norm": 1.946815013885498, + "learning_rate": 4.89946567918875e-05, + "loss": 4.8139, + "step": 15231 + }, + { + "epoch": 0.09058901893614997, + "grad_norm": 1.9526349306106567, + "learning_rate": 4.899452565831204e-05, + "loss": 4.7618, + "step": 15232 + }, + { + "epoch": 0.09059496621943096, + "grad_norm": 2.0434954166412354, + "learning_rate": 4.8994394516360355e-05, + "loss": 4.7617, + "step": 15233 + }, + { + "epoch": 0.09060091350271196, + "grad_norm": 2.0964083671569824, + "learning_rate": 4.8994263366032466e-05, + "loss": 4.6298, + "step": 15234 + }, + { + "epoch": 0.09060686078599296, + "grad_norm": 2.0333590507507324, + "learning_rate": 4.899413220732843e-05, + "loss": 4.6419, + "step": 15235 + }, + { + "epoch": 0.09061280806927395, + "grad_norm": 2.076993703842163, + "learning_rate": 4.89940010402483e-05, + "loss": 4.6163, + "step": 15236 + }, + { + "epoch": 0.09061875535255495, + "grad_norm": 1.767774224281311, + "learning_rate": 4.89938698647921e-05, + "loss": 5.2418, + "step": 15237 + }, + { + "epoch": 0.09062470263583595, + "grad_norm": 1.8380626440048218, + "learning_rate": 4.899373868095989e-05, + "loss": 5.3304, + "step": 15238 + }, + { + "epoch": 0.09063064991911694, + "grad_norm": 1.7332574129104614, + "learning_rate": 4.8993607488751716e-05, + "loss": 5.3528, + "step": 15239 + }, + { + "epoch": 0.09063659720239794, + "grad_norm": 1.8473124504089355, + "learning_rate": 4.8993476288167614e-05, + "loss": 5.5801, + "step": 15240 + }, + { + "epoch": 0.09064254448567895, + "grad_norm": 2.299206256866455, + "learning_rate": 4.899334507920765e-05, + "loss": 5.308, + "step": 15241 + }, + { + "epoch": 0.09064849176895994, + "grad_norm": 1.945417046546936, + "learning_rate": 4.899321386187185e-05, + "loss": 4.8894, + "step": 15242 + }, + { + "epoch": 0.09065443905224094, + "grad_norm": 2.328246831893921, + "learning_rate": 4.899308263616027e-05, + "loss": 5.0332, + "step": 15243 + }, + { + "epoch": 0.09066038633552194, + "grad_norm": 2.194546699523926, + "learning_rate": 4.899295140207295e-05, + "loss": 4.8891, + "step": 15244 + }, + { + "epoch": 0.09066633361880293, + "grad_norm": 2.078903913497925, + "learning_rate": 4.899282015960994e-05, + "loss": 5.0327, + "step": 15245 + }, + { + "epoch": 0.09067228090208393, + "grad_norm": 2.2129557132720947, + "learning_rate": 4.8992688908771285e-05, + "loss": 4.8806, + "step": 15246 + }, + { + "epoch": 0.09067822818536493, + "grad_norm": 2.3200979232788086, + "learning_rate": 4.8992557649557026e-05, + "loss": 4.9961, + "step": 15247 + }, + { + "epoch": 0.09068417546864592, + "grad_norm": 1.5829685926437378, + "learning_rate": 4.899242638196722e-05, + "loss": 5.4238, + "step": 15248 + }, + { + "epoch": 0.09069012275192692, + "grad_norm": 1.9085135459899902, + "learning_rate": 4.89922951060019e-05, + "loss": 5.0338, + "step": 15249 + }, + { + "epoch": 0.09069607003520792, + "grad_norm": 2.3000802993774414, + "learning_rate": 4.899216382166112e-05, + "loss": 4.9529, + "step": 15250 + }, + { + "epoch": 0.09070201731848891, + "grad_norm": 2.1610753536224365, + "learning_rate": 4.899203252894492e-05, + "loss": 4.9373, + "step": 15251 + }, + { + "epoch": 0.09070796460176991, + "grad_norm": 2.2821414470672607, + "learning_rate": 4.899190122785336e-05, + "loss": 5.2032, + "step": 15252 + }, + { + "epoch": 0.0907139118850509, + "grad_norm": 2.226741075515747, + "learning_rate": 4.899176991838646e-05, + "loss": 4.9354, + "step": 15253 + }, + { + "epoch": 0.0907198591683319, + "grad_norm": 2.0117716789245605, + "learning_rate": 4.899163860054429e-05, + "loss": 5.1179, + "step": 15254 + }, + { + "epoch": 0.0907258064516129, + "grad_norm": 1.6551730632781982, + "learning_rate": 4.8991507274326886e-05, + "loss": 5.6428, + "step": 15255 + }, + { + "epoch": 0.0907317537348939, + "grad_norm": 1.5236784219741821, + "learning_rate": 4.89913759397343e-05, + "loss": 5.4088, + "step": 15256 + }, + { + "epoch": 0.0907377010181749, + "grad_norm": 1.542356252670288, + "learning_rate": 4.899124459676656e-05, + "loss": 5.3383, + "step": 15257 + }, + { + "epoch": 0.0907436483014559, + "grad_norm": 1.5694434642791748, + "learning_rate": 4.899111324542374e-05, + "loss": 5.5202, + "step": 15258 + }, + { + "epoch": 0.09074959558473689, + "grad_norm": 1.459039568901062, + "learning_rate": 4.8990981885705856e-05, + "loss": 5.3481, + "step": 15259 + }, + { + "epoch": 0.09075554286801789, + "grad_norm": 1.4624565839767456, + "learning_rate": 4.899085051761297e-05, + "loss": 5.343, + "step": 15260 + }, + { + "epoch": 0.09076149015129889, + "grad_norm": 1.2748361825942993, + "learning_rate": 4.899071914114513e-05, + "loss": 5.1925, + "step": 15261 + }, + { + "epoch": 0.09076743743457988, + "grad_norm": 1.3813046216964722, + "learning_rate": 4.899058775630237e-05, + "loss": 4.9712, + "step": 15262 + }, + { + "epoch": 0.09077338471786088, + "grad_norm": 1.349108099937439, + "learning_rate": 4.8990456363084756e-05, + "loss": 4.9562, + "step": 15263 + }, + { + "epoch": 0.09077933200114188, + "grad_norm": 1.4744555950164795, + "learning_rate": 4.8990324961492316e-05, + "loss": 5.0014, + "step": 15264 + }, + { + "epoch": 0.09078527928442287, + "grad_norm": 1.4227643013000488, + "learning_rate": 4.8990193551525105e-05, + "loss": 5.076, + "step": 15265 + }, + { + "epoch": 0.09079122656770387, + "grad_norm": 1.4344059228897095, + "learning_rate": 4.8990062133183164e-05, + "loss": 5.2212, + "step": 15266 + }, + { + "epoch": 0.09079717385098487, + "grad_norm": 1.5858408212661743, + "learning_rate": 4.8989930706466534e-05, + "loss": 5.1893, + "step": 15267 + }, + { + "epoch": 0.09080312113426586, + "grad_norm": 1.6398282051086426, + "learning_rate": 4.898979927137527e-05, + "loss": 5.034, + "step": 15268 + }, + { + "epoch": 0.09080906841754686, + "grad_norm": 1.4295551776885986, + "learning_rate": 4.8989667827909416e-05, + "loss": 5.2761, + "step": 15269 + }, + { + "epoch": 0.09081501570082787, + "grad_norm": 1.4313840866088867, + "learning_rate": 4.898953637606902e-05, + "loss": 5.183, + "step": 15270 + }, + { + "epoch": 0.09082096298410886, + "grad_norm": 1.2977478504180908, + "learning_rate": 4.898940491585412e-05, + "loss": 5.1148, + "step": 15271 + }, + { + "epoch": 0.09082691026738986, + "grad_norm": 1.6052992343902588, + "learning_rate": 4.898927344726477e-05, + "loss": 5.3767, + "step": 15272 + }, + { + "epoch": 0.09083285755067086, + "grad_norm": 1.3184257745742798, + "learning_rate": 4.898914197030101e-05, + "loss": 5.3465, + "step": 15273 + }, + { + "epoch": 0.09083880483395185, + "grad_norm": 1.292985439300537, + "learning_rate": 4.898901048496289e-05, + "loss": 5.2478, + "step": 15274 + }, + { + "epoch": 0.09084475211723285, + "grad_norm": 1.1660702228546143, + "learning_rate": 4.898887899125045e-05, + "loss": 5.2655, + "step": 15275 + }, + { + "epoch": 0.09085069940051385, + "grad_norm": 1.2271296977996826, + "learning_rate": 4.8988747489163746e-05, + "loss": 5.2001, + "step": 15276 + }, + { + "epoch": 0.09085664668379484, + "grad_norm": 1.2237215042114258, + "learning_rate": 4.898861597870281e-05, + "loss": 5.213, + "step": 15277 + }, + { + "epoch": 0.09086259396707584, + "grad_norm": 1.3682539463043213, + "learning_rate": 4.898848445986771e-05, + "loss": 5.2174, + "step": 15278 + }, + { + "epoch": 0.09086854125035684, + "grad_norm": 1.2321406602859497, + "learning_rate": 4.8988352932658466e-05, + "loss": 5.1424, + "step": 15279 + }, + { + "epoch": 0.09087448853363783, + "grad_norm": 1.285792350769043, + "learning_rate": 4.898822139707514e-05, + "loss": 5.1438, + "step": 15280 + }, + { + "epoch": 0.09088043581691883, + "grad_norm": 1.137921690940857, + "learning_rate": 4.898808985311778e-05, + "loss": 5.159, + "step": 15281 + }, + { + "epoch": 0.09088638310019982, + "grad_norm": 1.2261563539505005, + "learning_rate": 4.898795830078641e-05, + "loss": 5.1176, + "step": 15282 + }, + { + "epoch": 0.09089233038348082, + "grad_norm": 1.1642104387283325, + "learning_rate": 4.89878267400811e-05, + "loss": 5.0887, + "step": 15283 + }, + { + "epoch": 0.09089827766676183, + "grad_norm": 1.3699917793273926, + "learning_rate": 4.898769517100189e-05, + "loss": 5.0048, + "step": 15284 + }, + { + "epoch": 0.09090422495004281, + "grad_norm": 1.6375452280044556, + "learning_rate": 4.898756359354882e-05, + "loss": 4.6914, + "step": 15285 + }, + { + "epoch": 0.09091017223332382, + "grad_norm": 1.5404956340789795, + "learning_rate": 4.8987432007721944e-05, + "loss": 4.8266, + "step": 15286 + }, + { + "epoch": 0.09091611951660482, + "grad_norm": 1.6747840642929077, + "learning_rate": 4.89873004135213e-05, + "loss": 4.697, + "step": 15287 + }, + { + "epoch": 0.0909220667998858, + "grad_norm": 1.3908432722091675, + "learning_rate": 4.8987168810946935e-05, + "loss": 4.9327, + "step": 15288 + }, + { + "epoch": 0.09092801408316681, + "grad_norm": 1.4933167695999146, + "learning_rate": 4.89870371999989e-05, + "loss": 4.6153, + "step": 15289 + }, + { + "epoch": 0.09093396136644781, + "grad_norm": 1.6259129047393799, + "learning_rate": 4.8986905580677234e-05, + "loss": 4.533, + "step": 15290 + }, + { + "epoch": 0.0909399086497288, + "grad_norm": 1.3692474365234375, + "learning_rate": 4.898677395298199e-05, + "loss": 4.6246, + "step": 15291 + }, + { + "epoch": 0.0909458559330098, + "grad_norm": 1.4951711893081665, + "learning_rate": 4.8986642316913214e-05, + "loss": 4.6677, + "step": 15292 + }, + { + "epoch": 0.0909518032162908, + "grad_norm": 1.5491467714309692, + "learning_rate": 4.8986510672470946e-05, + "loss": 4.9271, + "step": 15293 + }, + { + "epoch": 0.09095775049957179, + "grad_norm": 1.6902397871017456, + "learning_rate": 4.8986379019655235e-05, + "loss": 4.6467, + "step": 15294 + }, + { + "epoch": 0.09096369778285279, + "grad_norm": 1.5122796297073364, + "learning_rate": 4.898624735846613e-05, + "loss": 4.7103, + "step": 15295 + }, + { + "epoch": 0.0909696450661338, + "grad_norm": 1.5287622213363647, + "learning_rate": 4.898611568890367e-05, + "loss": 4.7461, + "step": 15296 + }, + { + "epoch": 0.09097559234941478, + "grad_norm": 1.4649391174316406, + "learning_rate": 4.898598401096791e-05, + "loss": 5.2472, + "step": 15297 + }, + { + "epoch": 0.09098153963269578, + "grad_norm": 1.7621572017669678, + "learning_rate": 4.898585232465889e-05, + "loss": 4.6864, + "step": 15298 + }, + { + "epoch": 0.09098748691597679, + "grad_norm": 1.6371783018112183, + "learning_rate": 4.898572062997665e-05, + "loss": 4.6091, + "step": 15299 + }, + { + "epoch": 0.09099343419925777, + "grad_norm": 1.28440523147583, + "learning_rate": 4.898558892692125e-05, + "loss": 5.0019, + "step": 15300 + }, + { + "epoch": 0.09099938148253878, + "grad_norm": 1.4753130674362183, + "learning_rate": 4.898545721549272e-05, + "loss": 5.3848, + "step": 15301 + }, + { + "epoch": 0.09100532876581978, + "grad_norm": 1.4267481565475464, + "learning_rate": 4.898532549569112e-05, + "loss": 5.1787, + "step": 15302 + }, + { + "epoch": 0.09101127604910077, + "grad_norm": 1.4724546670913696, + "learning_rate": 4.898519376751649e-05, + "loss": 5.2581, + "step": 15303 + }, + { + "epoch": 0.09101722333238177, + "grad_norm": 1.4417310953140259, + "learning_rate": 4.8985062030968875e-05, + "loss": 5.4829, + "step": 15304 + }, + { + "epoch": 0.09102317061566277, + "grad_norm": 1.1160683631896973, + "learning_rate": 4.898493028604833e-05, + "loss": 5.5287, + "step": 15305 + }, + { + "epoch": 0.09102911789894376, + "grad_norm": 1.2454899549484253, + "learning_rate": 4.8984798532754884e-05, + "loss": 5.2984, + "step": 15306 + }, + { + "epoch": 0.09103506518222476, + "grad_norm": 1.5732132196426392, + "learning_rate": 4.8984666771088596e-05, + "loss": 5.4998, + "step": 15307 + }, + { + "epoch": 0.09104101246550576, + "grad_norm": 1.6430423259735107, + "learning_rate": 4.8984535001049515e-05, + "loss": 5.4636, + "step": 15308 + }, + { + "epoch": 0.09104695974878675, + "grad_norm": 1.245288372039795, + "learning_rate": 4.898440322263768e-05, + "loss": 5.2874, + "step": 15309 + }, + { + "epoch": 0.09105290703206775, + "grad_norm": 1.4186644554138184, + "learning_rate": 4.898427143585312e-05, + "loss": 5.2275, + "step": 15310 + }, + { + "epoch": 0.09105885431534876, + "grad_norm": 1.3040757179260254, + "learning_rate": 4.8984139640695915e-05, + "loss": 5.2864, + "step": 15311 + }, + { + "epoch": 0.09106480159862974, + "grad_norm": 1.4106818437576294, + "learning_rate": 4.898400783716609e-05, + "loss": 5.5897, + "step": 15312 + }, + { + "epoch": 0.09107074888191075, + "grad_norm": 1.5596522092819214, + "learning_rate": 4.89838760252637e-05, + "loss": 5.4827, + "step": 15313 + }, + { + "epoch": 0.09107669616519173, + "grad_norm": 2.2576634883880615, + "learning_rate": 4.898374420498878e-05, + "loss": 5.1471, + "step": 15314 + }, + { + "epoch": 0.09108264344847274, + "grad_norm": 1.2749537229537964, + "learning_rate": 4.898361237634139e-05, + "loss": 5.2688, + "step": 15315 + }, + { + "epoch": 0.09108859073175374, + "grad_norm": 1.4171591997146606, + "learning_rate": 4.8983480539321566e-05, + "loss": 5.0796, + "step": 15316 + }, + { + "epoch": 0.09109453801503473, + "grad_norm": 1.2233314514160156, + "learning_rate": 4.898334869392936e-05, + "loss": 5.0992, + "step": 15317 + }, + { + "epoch": 0.09110048529831573, + "grad_norm": 1.4817143678665161, + "learning_rate": 4.8983216840164804e-05, + "loss": 5.2354, + "step": 15318 + }, + { + "epoch": 0.09110643258159673, + "grad_norm": 1.442088007926941, + "learning_rate": 4.898308497802796e-05, + "loss": 5.2177, + "step": 15319 + }, + { + "epoch": 0.09111237986487772, + "grad_norm": 1.3996042013168335, + "learning_rate": 4.898295310751887e-05, + "loss": 4.9938, + "step": 15320 + }, + { + "epoch": 0.09111832714815872, + "grad_norm": 1.3091521263122559, + "learning_rate": 4.8982821228637576e-05, + "loss": 4.9916, + "step": 15321 + }, + { + "epoch": 0.09112427443143972, + "grad_norm": 1.4807448387145996, + "learning_rate": 4.898268934138414e-05, + "loss": 4.9833, + "step": 15322 + }, + { + "epoch": 0.09113022171472071, + "grad_norm": 1.5992671251296997, + "learning_rate": 4.898255744575858e-05, + "loss": 5.1007, + "step": 15323 + }, + { + "epoch": 0.09113616899800171, + "grad_norm": 1.4472523927688599, + "learning_rate": 4.8982425541760954e-05, + "loss": 5.3123, + "step": 15324 + }, + { + "epoch": 0.09114211628128271, + "grad_norm": 1.2865816354751587, + "learning_rate": 4.898229362939132e-05, + "loss": 5.0817, + "step": 15325 + }, + { + "epoch": 0.0911480635645637, + "grad_norm": 1.477144479751587, + "learning_rate": 4.898216170864972e-05, + "loss": 5.1819, + "step": 15326 + }, + { + "epoch": 0.0911540108478447, + "grad_norm": 1.5831303596496582, + "learning_rate": 4.8982029779536184e-05, + "loss": 5.28, + "step": 15327 + }, + { + "epoch": 0.0911599581311257, + "grad_norm": 1.3366963863372803, + "learning_rate": 4.898189784205078e-05, + "loss": 5.3715, + "step": 15328 + }, + { + "epoch": 0.0911659054144067, + "grad_norm": 1.5603365898132324, + "learning_rate": 4.898176589619353e-05, + "loss": 5.2642, + "step": 15329 + }, + { + "epoch": 0.0911718526976877, + "grad_norm": 1.5105326175689697, + "learning_rate": 4.8981633941964506e-05, + "loss": 4.949, + "step": 15330 + }, + { + "epoch": 0.0911777999809687, + "grad_norm": 1.2074800729751587, + "learning_rate": 4.8981501979363734e-05, + "loss": 5.2847, + "step": 15331 + }, + { + "epoch": 0.09118374726424969, + "grad_norm": 1.4356200695037842, + "learning_rate": 4.898137000839127e-05, + "loss": 5.6169, + "step": 15332 + }, + { + "epoch": 0.09118969454753069, + "grad_norm": 1.5015919208526611, + "learning_rate": 4.8981238029047154e-05, + "loss": 5.1135, + "step": 15333 + }, + { + "epoch": 0.09119564183081169, + "grad_norm": 1.4902187585830688, + "learning_rate": 4.8981106041331434e-05, + "loss": 5.4406, + "step": 15334 + }, + { + "epoch": 0.09120158911409268, + "grad_norm": 1.2884581089019775, + "learning_rate": 4.898097404524416e-05, + "loss": 5.3493, + "step": 15335 + }, + { + "epoch": 0.09120753639737368, + "grad_norm": 1.4323054552078247, + "learning_rate": 4.898084204078539e-05, + "loss": 5.0939, + "step": 15336 + }, + { + "epoch": 0.09121348368065468, + "grad_norm": 1.6282861232757568, + "learning_rate": 4.898071002795514e-05, + "loss": 5.1857, + "step": 15337 + }, + { + "epoch": 0.09121943096393567, + "grad_norm": 1.3413678407669067, + "learning_rate": 4.898057800675347e-05, + "loss": 4.9581, + "step": 15338 + }, + { + "epoch": 0.09122537824721667, + "grad_norm": 1.5613822937011719, + "learning_rate": 4.898044597718044e-05, + "loss": 4.6401, + "step": 15339 + }, + { + "epoch": 0.09123132553049768, + "grad_norm": 1.4945799112319946, + "learning_rate": 4.898031393923608e-05, + "loss": 4.6649, + "step": 15340 + }, + { + "epoch": 0.09123727281377866, + "grad_norm": 1.6086750030517578, + "learning_rate": 4.898018189292043e-05, + "loss": 4.5996, + "step": 15341 + }, + { + "epoch": 0.09124322009705967, + "grad_norm": 1.3530272245407104, + "learning_rate": 4.898004983823355e-05, + "loss": 4.6511, + "step": 15342 + }, + { + "epoch": 0.09124916738034065, + "grad_norm": 1.5523587465286255, + "learning_rate": 4.897991777517549e-05, + "loss": 4.8099, + "step": 15343 + }, + { + "epoch": 0.09125511466362166, + "grad_norm": 1.6695882081985474, + "learning_rate": 4.8979785703746286e-05, + "loss": 5.2371, + "step": 15344 + }, + { + "epoch": 0.09126106194690266, + "grad_norm": 1.777717113494873, + "learning_rate": 4.897965362394599e-05, + "loss": 5.373, + "step": 15345 + }, + { + "epoch": 0.09126700923018365, + "grad_norm": 1.2890517711639404, + "learning_rate": 4.8979521535774636e-05, + "loss": 5.3851, + "step": 15346 + }, + { + "epoch": 0.09127295651346465, + "grad_norm": 1.3539687395095825, + "learning_rate": 4.897938943923228e-05, + "loss": 5.1218, + "step": 15347 + }, + { + "epoch": 0.09127890379674565, + "grad_norm": 1.4157010316848755, + "learning_rate": 4.8979257334318974e-05, + "loss": 4.9411, + "step": 15348 + }, + { + "epoch": 0.09128485108002664, + "grad_norm": 1.4856256246566772, + "learning_rate": 4.897912522103475e-05, + "loss": 5.1622, + "step": 15349 + }, + { + "epoch": 0.09129079836330764, + "grad_norm": 1.4729665517807007, + "learning_rate": 4.8978993099379666e-05, + "loss": 5.0901, + "step": 15350 + }, + { + "epoch": 0.09129674564658864, + "grad_norm": 1.376625895500183, + "learning_rate": 4.897886096935376e-05, + "loss": 4.8843, + "step": 15351 + }, + { + "epoch": 0.09130269292986963, + "grad_norm": 1.3019710779190063, + "learning_rate": 4.897872883095708e-05, + "loss": 4.9956, + "step": 15352 + }, + { + "epoch": 0.09130864021315063, + "grad_norm": 1.4751423597335815, + "learning_rate": 4.897859668418968e-05, + "loss": 5.4369, + "step": 15353 + }, + { + "epoch": 0.09131458749643163, + "grad_norm": 1.3563402891159058, + "learning_rate": 4.8978464529051595e-05, + "loss": 5.2071, + "step": 15354 + }, + { + "epoch": 0.09132053477971262, + "grad_norm": 1.7365561723709106, + "learning_rate": 4.8978332365542875e-05, + "loss": 4.8797, + "step": 15355 + }, + { + "epoch": 0.09132648206299362, + "grad_norm": 1.4001792669296265, + "learning_rate": 4.8978200193663565e-05, + "loss": 5.2549, + "step": 15356 + }, + { + "epoch": 0.09133242934627463, + "grad_norm": 1.5568649768829346, + "learning_rate": 4.897806801341371e-05, + "loss": 5.3805, + "step": 15357 + }, + { + "epoch": 0.09133837662955561, + "grad_norm": 1.4169847965240479, + "learning_rate": 4.897793582479337e-05, + "loss": 5.2655, + "step": 15358 + }, + { + "epoch": 0.09134432391283662, + "grad_norm": 1.3992067575454712, + "learning_rate": 4.897780362780258e-05, + "loss": 5.4284, + "step": 15359 + }, + { + "epoch": 0.09135027119611762, + "grad_norm": 1.2274264097213745, + "learning_rate": 4.8977671422441376e-05, + "loss": 5.2443, + "step": 15360 + }, + { + "epoch": 0.09135621847939861, + "grad_norm": 1.4754104614257812, + "learning_rate": 4.897753920870982e-05, + "loss": 5.3438, + "step": 15361 + }, + { + "epoch": 0.09136216576267961, + "grad_norm": 1.3993452787399292, + "learning_rate": 4.897740698660796e-05, + "loss": 5.2396, + "step": 15362 + }, + { + "epoch": 0.09136811304596061, + "grad_norm": 1.2840338945388794, + "learning_rate": 4.897727475613583e-05, + "loss": 5.2912, + "step": 15363 + }, + { + "epoch": 0.0913740603292416, + "grad_norm": 1.5234180688858032, + "learning_rate": 4.8977142517293474e-05, + "loss": 5.4197, + "step": 15364 + }, + { + "epoch": 0.0913800076125226, + "grad_norm": 1.6243525743484497, + "learning_rate": 4.897701027008095e-05, + "loss": 5.4358, + "step": 15365 + }, + { + "epoch": 0.0913859548958036, + "grad_norm": 1.277801513671875, + "learning_rate": 4.8976878014498306e-05, + "loss": 5.2801, + "step": 15366 + }, + { + "epoch": 0.09139190217908459, + "grad_norm": 1.5294082164764404, + "learning_rate": 4.897674575054557e-05, + "loss": 4.8257, + "step": 15367 + }, + { + "epoch": 0.0913978494623656, + "grad_norm": 1.7289122343063354, + "learning_rate": 4.897661347822281e-05, + "loss": 4.8155, + "step": 15368 + }, + { + "epoch": 0.0914037967456466, + "grad_norm": 1.5567346811294556, + "learning_rate": 4.897648119753006e-05, + "loss": 4.8245, + "step": 15369 + }, + { + "epoch": 0.09140974402892758, + "grad_norm": 1.4855397939682007, + "learning_rate": 4.8976348908467365e-05, + "loss": 4.7247, + "step": 15370 + }, + { + "epoch": 0.09141569131220859, + "grad_norm": 1.4355418682098389, + "learning_rate": 4.897621661103477e-05, + "loss": 5.0925, + "step": 15371 + }, + { + "epoch": 0.09142163859548957, + "grad_norm": 1.3165326118469238, + "learning_rate": 4.897608430523233e-05, + "loss": 5.3419, + "step": 15372 + }, + { + "epoch": 0.09142758587877058, + "grad_norm": 1.4930912256240845, + "learning_rate": 4.8975951991060084e-05, + "loss": 5.3267, + "step": 15373 + }, + { + "epoch": 0.09143353316205158, + "grad_norm": 1.2326771020889282, + "learning_rate": 4.897581966851809e-05, + "loss": 5.2902, + "step": 15374 + }, + { + "epoch": 0.09143948044533257, + "grad_norm": 1.1512086391448975, + "learning_rate": 4.897568733760638e-05, + "loss": 5.2362, + "step": 15375 + }, + { + "epoch": 0.09144542772861357, + "grad_norm": 2.2404119968414307, + "learning_rate": 4.8975554998325e-05, + "loss": 5.055, + "step": 15376 + }, + { + "epoch": 0.09145137501189457, + "grad_norm": 1.3026318550109863, + "learning_rate": 4.8975422650674005e-05, + "loss": 5.0192, + "step": 15377 + }, + { + "epoch": 0.09145732229517556, + "grad_norm": 1.5808472633361816, + "learning_rate": 4.897529029465344e-05, + "loss": 5.2429, + "step": 15378 + }, + { + "epoch": 0.09146326957845656, + "grad_norm": 1.5761525630950928, + "learning_rate": 4.897515793026335e-05, + "loss": 4.9123, + "step": 15379 + }, + { + "epoch": 0.09146921686173756, + "grad_norm": 1.488484501838684, + "learning_rate": 4.897502555750377e-05, + "loss": 4.8463, + "step": 15380 + }, + { + "epoch": 0.09147516414501855, + "grad_norm": 1.4662736654281616, + "learning_rate": 4.897489317637477e-05, + "loss": 5.3047, + "step": 15381 + }, + { + "epoch": 0.09148111142829955, + "grad_norm": 1.6454370021820068, + "learning_rate": 4.897476078687637e-05, + "loss": 5.2335, + "step": 15382 + }, + { + "epoch": 0.09148705871158055, + "grad_norm": 1.425868034362793, + "learning_rate": 4.8974628389008636e-05, + "loss": 5.2016, + "step": 15383 + }, + { + "epoch": 0.09149300599486154, + "grad_norm": 1.599349021911621, + "learning_rate": 4.8974495982771606e-05, + "loss": 5.4205, + "step": 15384 + }, + { + "epoch": 0.09149895327814254, + "grad_norm": 1.6200257539749146, + "learning_rate": 4.897436356816533e-05, + "loss": 5.5001, + "step": 15385 + }, + { + "epoch": 0.09150490056142355, + "grad_norm": 1.5314574241638184, + "learning_rate": 4.8974231145189844e-05, + "loss": 5.4711, + "step": 15386 + }, + { + "epoch": 0.09151084784470453, + "grad_norm": 1.507489562034607, + "learning_rate": 4.8974098713845206e-05, + "loss": 5.4001, + "step": 15387 + }, + { + "epoch": 0.09151679512798554, + "grad_norm": 1.4561303853988647, + "learning_rate": 4.897396627413146e-05, + "loss": 5.4566, + "step": 15388 + }, + { + "epoch": 0.09152274241126654, + "grad_norm": 1.3273184299468994, + "learning_rate": 4.897383382604865e-05, + "loss": 5.4665, + "step": 15389 + }, + { + "epoch": 0.09152868969454753, + "grad_norm": 1.370138168334961, + "learning_rate": 4.8973701369596814e-05, + "loss": 5.4319, + "step": 15390 + }, + { + "epoch": 0.09153463697782853, + "grad_norm": 1.4831699132919312, + "learning_rate": 4.897356890477601e-05, + "loss": 5.2734, + "step": 15391 + }, + { + "epoch": 0.09154058426110953, + "grad_norm": 1.3152328729629517, + "learning_rate": 4.897343643158629e-05, + "loss": 5.3573, + "step": 15392 + }, + { + "epoch": 0.09154653154439052, + "grad_norm": 1.635460376739502, + "learning_rate": 4.8973303950027684e-05, + "loss": 5.2433, + "step": 15393 + }, + { + "epoch": 0.09155247882767152, + "grad_norm": 1.5252761840820312, + "learning_rate": 4.897317146010024e-05, + "loss": 5.2164, + "step": 15394 + }, + { + "epoch": 0.09155842611095252, + "grad_norm": 1.600043773651123, + "learning_rate": 4.897303896180402e-05, + "loss": 5.4138, + "step": 15395 + }, + { + "epoch": 0.09156437339423351, + "grad_norm": 1.6243258714675903, + "learning_rate": 4.8972906455139056e-05, + "loss": 5.6129, + "step": 15396 + }, + { + "epoch": 0.09157032067751451, + "grad_norm": 1.2726150751113892, + "learning_rate": 4.89727739401054e-05, + "loss": 5.4639, + "step": 15397 + }, + { + "epoch": 0.09157626796079552, + "grad_norm": 2.1045331954956055, + "learning_rate": 4.897264141670309e-05, + "loss": 5.1875, + "step": 15398 + }, + { + "epoch": 0.0915822152440765, + "grad_norm": 2.1204488277435303, + "learning_rate": 4.897250888493218e-05, + "loss": 5.0401, + "step": 15399 + }, + { + "epoch": 0.0915881625273575, + "grad_norm": 1.794190526008606, + "learning_rate": 4.8972376344792716e-05, + "loss": 6.0581, + "step": 15400 + }, + { + "epoch": 0.0915941098106385, + "grad_norm": 2.050788402557373, + "learning_rate": 4.8972243796284746e-05, + "loss": 5.0138, + "step": 15401 + }, + { + "epoch": 0.0916000570939195, + "grad_norm": 2.1165850162506104, + "learning_rate": 4.897211123940831e-05, + "loss": 4.7077, + "step": 15402 + }, + { + "epoch": 0.0916060043772005, + "grad_norm": 1.9797117710113525, + "learning_rate": 4.8971978674163455e-05, + "loss": 4.8248, + "step": 15403 + }, + { + "epoch": 0.09161195166048149, + "grad_norm": 1.922232747077942, + "learning_rate": 4.8971846100550234e-05, + "loss": 4.7655, + "step": 15404 + }, + { + "epoch": 0.09161789894376249, + "grad_norm": 1.7310322523117065, + "learning_rate": 4.897171351856869e-05, + "loss": 5.425, + "step": 15405 + }, + { + "epoch": 0.09162384622704349, + "grad_norm": 1.9186078310012817, + "learning_rate": 4.897158092821887e-05, + "loss": 6.2449, + "step": 15406 + }, + { + "epoch": 0.09162979351032448, + "grad_norm": 1.7470628023147583, + "learning_rate": 4.897144832950081e-05, + "loss": 6.1586, + "step": 15407 + }, + { + "epoch": 0.09163574079360548, + "grad_norm": 1.7828420400619507, + "learning_rate": 4.897131572241457e-05, + "loss": 6.1068, + "step": 15408 + }, + { + "epoch": 0.09164168807688648, + "grad_norm": 1.8831984996795654, + "learning_rate": 4.897118310696019e-05, + "loss": 5.6989, + "step": 15409 + }, + { + "epoch": 0.09164763536016747, + "grad_norm": 1.6138192415237427, + "learning_rate": 4.8971050483137726e-05, + "loss": 5.8222, + "step": 15410 + }, + { + "epoch": 0.09165358264344847, + "grad_norm": 1.6921756267547607, + "learning_rate": 4.897091785094721e-05, + "loss": 5.8559, + "step": 15411 + }, + { + "epoch": 0.09165952992672947, + "grad_norm": 2.007937431335449, + "learning_rate": 4.8970785210388694e-05, + "loss": 5.4523, + "step": 15412 + }, + { + "epoch": 0.09166547721001046, + "grad_norm": 1.8820117712020874, + "learning_rate": 4.8970652561462224e-05, + "loss": 5.6293, + "step": 15413 + }, + { + "epoch": 0.09167142449329146, + "grad_norm": 2.0193300247192383, + "learning_rate": 4.897051990416785e-05, + "loss": 5.8481, + "step": 15414 + }, + { + "epoch": 0.09167737177657247, + "grad_norm": 2.3685405254364014, + "learning_rate": 4.897038723850561e-05, + "loss": 6.2884, + "step": 15415 + }, + { + "epoch": 0.09168331905985345, + "grad_norm": 2.001131534576416, + "learning_rate": 4.897025456447556e-05, + "loss": 5.6747, + "step": 15416 + }, + { + "epoch": 0.09168926634313446, + "grad_norm": 1.9729053974151611, + "learning_rate": 4.897012188207774e-05, + "loss": 5.9019, + "step": 15417 + }, + { + "epoch": 0.09169521362641546, + "grad_norm": 1.7620398998260498, + "learning_rate": 4.896998919131219e-05, + "loss": 5.9498, + "step": 15418 + }, + { + "epoch": 0.09170116090969645, + "grad_norm": 1.6993772983551025, + "learning_rate": 4.896985649217898e-05, + "loss": 5.973, + "step": 15419 + }, + { + "epoch": 0.09170710819297745, + "grad_norm": 1.6905665397644043, + "learning_rate": 4.896972378467813e-05, + "loss": 5.9729, + "step": 15420 + }, + { + "epoch": 0.09171305547625845, + "grad_norm": 1.710838794708252, + "learning_rate": 4.8969591068809706e-05, + "loss": 5.6661, + "step": 15421 + }, + { + "epoch": 0.09171900275953944, + "grad_norm": 1.9235612154006958, + "learning_rate": 4.896945834457374e-05, + "loss": 5.38, + "step": 15422 + }, + { + "epoch": 0.09172495004282044, + "grad_norm": 2.360656976699829, + "learning_rate": 4.896932561197028e-05, + "loss": 5.2199, + "step": 15423 + }, + { + "epoch": 0.09173089732610144, + "grad_norm": 2.403338670730591, + "learning_rate": 4.896919287099938e-05, + "loss": 5.1776, + "step": 15424 + }, + { + "epoch": 0.09173684460938243, + "grad_norm": 1.9474782943725586, + "learning_rate": 4.896906012166108e-05, + "loss": 5.0781, + "step": 15425 + }, + { + "epoch": 0.09174279189266343, + "grad_norm": 1.8974144458770752, + "learning_rate": 4.896892736395543e-05, + "loss": 5.1609, + "step": 15426 + }, + { + "epoch": 0.09174873917594444, + "grad_norm": 2.3854262828826904, + "learning_rate": 4.896879459788247e-05, + "loss": 5.2019, + "step": 15427 + }, + { + "epoch": 0.09175468645922542, + "grad_norm": 2.4181137084960938, + "learning_rate": 4.8968661823442264e-05, + "loss": 5.1216, + "step": 15428 + }, + { + "epoch": 0.09176063374250643, + "grad_norm": 2.266355514526367, + "learning_rate": 4.896852904063484e-05, + "loss": 5.0401, + "step": 15429 + }, + { + "epoch": 0.09176658102578741, + "grad_norm": 2.086296558380127, + "learning_rate": 4.896839624946025e-05, + "loss": 4.8601, + "step": 15430 + }, + { + "epoch": 0.09177252830906842, + "grad_norm": 1.943326473236084, + "learning_rate": 4.896826344991854e-05, + "loss": 4.9978, + "step": 15431 + }, + { + "epoch": 0.09177847559234942, + "grad_norm": 2.0165631771087646, + "learning_rate": 4.896813064200975e-05, + "loss": 5.0379, + "step": 15432 + }, + { + "epoch": 0.0917844228756304, + "grad_norm": 1.7142544984817505, + "learning_rate": 4.896799782573394e-05, + "loss": 5.7101, + "step": 15433 + }, + { + "epoch": 0.09179037015891141, + "grad_norm": 1.9000083208084106, + "learning_rate": 4.896786500109115e-05, + "loss": 5.9536, + "step": 15434 + }, + { + "epoch": 0.09179631744219241, + "grad_norm": 1.6976677179336548, + "learning_rate": 4.8967732168081426e-05, + "loss": 5.4408, + "step": 15435 + }, + { + "epoch": 0.0918022647254734, + "grad_norm": 1.7433068752288818, + "learning_rate": 4.8967599326704815e-05, + "loss": 5.831, + "step": 15436 + }, + { + "epoch": 0.0918082120087544, + "grad_norm": 1.484256625175476, + "learning_rate": 4.896746647696136e-05, + "loss": 5.943, + "step": 15437 + }, + { + "epoch": 0.0918141592920354, + "grad_norm": 2.2480883598327637, + "learning_rate": 4.8967333618851106e-05, + "loss": 5.6634, + "step": 15438 + }, + { + "epoch": 0.09182010657531639, + "grad_norm": 1.3530383110046387, + "learning_rate": 4.896720075237411e-05, + "loss": 5.8981, + "step": 15439 + }, + { + "epoch": 0.09182605385859739, + "grad_norm": 1.451636552810669, + "learning_rate": 4.896706787753041e-05, + "loss": 5.9803, + "step": 15440 + }, + { + "epoch": 0.0918320011418784, + "grad_norm": 1.5904042720794678, + "learning_rate": 4.896693499432006e-05, + "loss": 5.9692, + "step": 15441 + }, + { + "epoch": 0.09183794842515938, + "grad_norm": 1.3971885442733765, + "learning_rate": 4.896680210274309e-05, + "loss": 5.8612, + "step": 15442 + }, + { + "epoch": 0.09184389570844038, + "grad_norm": 1.325842022895813, + "learning_rate": 4.8966669202799564e-05, + "loss": 5.9081, + "step": 15443 + }, + { + "epoch": 0.09184984299172139, + "grad_norm": 1.4639033079147339, + "learning_rate": 4.8966536294489515e-05, + "loss": 5.8395, + "step": 15444 + }, + { + "epoch": 0.09185579027500237, + "grad_norm": 1.248425006866455, + "learning_rate": 4.896640337781301e-05, + "loss": 5.9016, + "step": 15445 + }, + { + "epoch": 0.09186173755828338, + "grad_norm": 1.4250134229660034, + "learning_rate": 4.896627045277007e-05, + "loss": 5.815, + "step": 15446 + }, + { + "epoch": 0.09186768484156438, + "grad_norm": 1.9178589582443237, + "learning_rate": 4.896613751936075e-05, + "loss": 5.9092, + "step": 15447 + }, + { + "epoch": 0.09187363212484537, + "grad_norm": 1.9218472242355347, + "learning_rate": 4.896600457758511e-05, + "loss": 5.7151, + "step": 15448 + }, + { + "epoch": 0.09187957940812637, + "grad_norm": 1.7698949575424194, + "learning_rate": 4.896587162744317e-05, + "loss": 5.709, + "step": 15449 + }, + { + "epoch": 0.09188552669140737, + "grad_norm": 2.5047290325164795, + "learning_rate": 4.8965738668935e-05, + "loss": 5.5417, + "step": 15450 + }, + { + "epoch": 0.09189147397468836, + "grad_norm": 1.9855560064315796, + "learning_rate": 4.896560570206065e-05, + "loss": 5.9572, + "step": 15451 + }, + { + "epoch": 0.09189742125796936, + "grad_norm": 1.8577516078948975, + "learning_rate": 4.896547272682014e-05, + "loss": 4.8775, + "step": 15452 + }, + { + "epoch": 0.09190336854125036, + "grad_norm": 1.8830385208129883, + "learning_rate": 4.896533974321353e-05, + "loss": 4.8617, + "step": 15453 + }, + { + "epoch": 0.09190931582453135, + "grad_norm": 1.5114052295684814, + "learning_rate": 4.896520675124087e-05, + "loss": 4.9485, + "step": 15454 + }, + { + "epoch": 0.09191526310781235, + "grad_norm": 1.6233285665512085, + "learning_rate": 4.8965073750902205e-05, + "loss": 5.1098, + "step": 15455 + }, + { + "epoch": 0.09192121039109336, + "grad_norm": 1.6900150775909424, + "learning_rate": 4.896494074219758e-05, + "loss": 6.025, + "step": 15456 + }, + { + "epoch": 0.09192715767437434, + "grad_norm": 1.3984570503234863, + "learning_rate": 4.8964807725127046e-05, + "loss": 5.888, + "step": 15457 + }, + { + "epoch": 0.09193310495765535, + "grad_norm": 1.7069528102874756, + "learning_rate": 4.896467469969064e-05, + "loss": 5.6435, + "step": 15458 + }, + { + "epoch": 0.09193905224093633, + "grad_norm": 1.641513705253601, + "learning_rate": 4.896454166588842e-05, + "loss": 5.5641, + "step": 15459 + }, + { + "epoch": 0.09194499952421734, + "grad_norm": 1.8448737859725952, + "learning_rate": 4.896440862372042e-05, + "loss": 5.5673, + "step": 15460 + }, + { + "epoch": 0.09195094680749834, + "grad_norm": 1.7696945667266846, + "learning_rate": 4.8964275573186694e-05, + "loss": 5.4383, + "step": 15461 + }, + { + "epoch": 0.09195689409077933, + "grad_norm": 2.7951743602752686, + "learning_rate": 4.8964142514287285e-05, + "loss": 4.2996, + "step": 15462 + }, + { + "epoch": 0.09196284137406033, + "grad_norm": 2.5503883361816406, + "learning_rate": 4.8964009447022246e-05, + "loss": 4.2864, + "step": 15463 + }, + { + "epoch": 0.09196878865734133, + "grad_norm": 2.2069225311279297, + "learning_rate": 4.896387637139161e-05, + "loss": 4.3818, + "step": 15464 + }, + { + "epoch": 0.09197473594062232, + "grad_norm": 2.34734845161438, + "learning_rate": 4.8963743287395444e-05, + "loss": 4.2951, + "step": 15465 + }, + { + "epoch": 0.09198068322390332, + "grad_norm": 2.2955567836761475, + "learning_rate": 4.896361019503378e-05, + "loss": 4.3349, + "step": 15466 + }, + { + "epoch": 0.09198663050718432, + "grad_norm": 2.3519480228424072, + "learning_rate": 4.8963477094306666e-05, + "loss": 4.2685, + "step": 15467 + }, + { + "epoch": 0.09199257779046531, + "grad_norm": 2.3862032890319824, + "learning_rate": 4.896334398521415e-05, + "loss": 4.1333, + "step": 15468 + }, + { + "epoch": 0.09199852507374631, + "grad_norm": 2.1290738582611084, + "learning_rate": 4.896321086775627e-05, + "loss": 4.7918, + "step": 15469 + }, + { + "epoch": 0.09200447235702731, + "grad_norm": 2.2130253314971924, + "learning_rate": 4.8963077741933095e-05, + "loss": 5.208, + "step": 15470 + }, + { + "epoch": 0.0920104196403083, + "grad_norm": 2.063810110092163, + "learning_rate": 4.896294460774464e-05, + "loss": 5.1891, + "step": 15471 + }, + { + "epoch": 0.0920163669235893, + "grad_norm": 2.068791627883911, + "learning_rate": 4.8962811465190984e-05, + "loss": 5.2855, + "step": 15472 + }, + { + "epoch": 0.0920223142068703, + "grad_norm": 1.8504056930541992, + "learning_rate": 4.896267831427215e-05, + "loss": 5.0159, + "step": 15473 + }, + { + "epoch": 0.0920282614901513, + "grad_norm": 2.150820255279541, + "learning_rate": 4.89625451549882e-05, + "loss": 5.7728, + "step": 15474 + }, + { + "epoch": 0.0920342087734323, + "grad_norm": 2.3655643463134766, + "learning_rate": 4.8962411987339165e-05, + "loss": 5.4863, + "step": 15475 + }, + { + "epoch": 0.0920401560567133, + "grad_norm": 1.509820818901062, + "learning_rate": 4.8962278811325105e-05, + "loss": 5.5682, + "step": 15476 + }, + { + "epoch": 0.09204610333999429, + "grad_norm": 1.8581949472427368, + "learning_rate": 4.896214562694605e-05, + "loss": 5.6875, + "step": 15477 + }, + { + "epoch": 0.09205205062327529, + "grad_norm": 2.028116464614868, + "learning_rate": 4.8962012434202075e-05, + "loss": 5.3495, + "step": 15478 + }, + { + "epoch": 0.09205799790655629, + "grad_norm": 1.9395058155059814, + "learning_rate": 4.89618792330932e-05, + "loss": 5.5616, + "step": 15479 + }, + { + "epoch": 0.09206394518983728, + "grad_norm": 1.9281854629516602, + "learning_rate": 4.896174602361948e-05, + "loss": 5.6449, + "step": 15480 + }, + { + "epoch": 0.09206989247311828, + "grad_norm": 1.7750074863433838, + "learning_rate": 4.896161280578097e-05, + "loss": 5.1178, + "step": 15481 + }, + { + "epoch": 0.09207583975639928, + "grad_norm": 2.0160205364227295, + "learning_rate": 4.89614795795777e-05, + "loss": 5.4698, + "step": 15482 + }, + { + "epoch": 0.09208178703968027, + "grad_norm": 2.0041770935058594, + "learning_rate": 4.896134634500972e-05, + "loss": 4.6989, + "step": 15483 + }, + { + "epoch": 0.09208773432296127, + "grad_norm": 1.9916999340057373, + "learning_rate": 4.896121310207708e-05, + "loss": 4.6296, + "step": 15484 + }, + { + "epoch": 0.09209368160624228, + "grad_norm": 1.62458336353302, + "learning_rate": 4.8961079850779845e-05, + "loss": 5.1147, + "step": 15485 + }, + { + "epoch": 0.09209962888952326, + "grad_norm": 1.8349764347076416, + "learning_rate": 4.8960946591118036e-05, + "loss": 5.3646, + "step": 15486 + }, + { + "epoch": 0.09210557617280427, + "grad_norm": 2.0250589847564697, + "learning_rate": 4.89608133230917e-05, + "loss": 5.7467, + "step": 15487 + }, + { + "epoch": 0.09211152345608525, + "grad_norm": 1.8945664167404175, + "learning_rate": 4.89606800467009e-05, + "loss": 5.5526, + "step": 15488 + }, + { + "epoch": 0.09211747073936626, + "grad_norm": 2.1056711673736572, + "learning_rate": 4.896054676194568e-05, + "loss": 4.8553, + "step": 15489 + }, + { + "epoch": 0.09212341802264726, + "grad_norm": 2.0394606590270996, + "learning_rate": 4.896041346882607e-05, + "loss": 5.4427, + "step": 15490 + }, + { + "epoch": 0.09212936530592825, + "grad_norm": 2.3078689575195312, + "learning_rate": 4.896028016734213e-05, + "loss": 5.3668, + "step": 15491 + }, + { + "epoch": 0.09213531258920925, + "grad_norm": 2.1227409839630127, + "learning_rate": 4.8960146857493904e-05, + "loss": 5.6314, + "step": 15492 + }, + { + "epoch": 0.09214125987249025, + "grad_norm": 2.156165838241577, + "learning_rate": 4.896001353928144e-05, + "loss": 5.5088, + "step": 15493 + }, + { + "epoch": 0.09214720715577124, + "grad_norm": 1.8915730714797974, + "learning_rate": 4.895988021270478e-05, + "loss": 5.5636, + "step": 15494 + }, + { + "epoch": 0.09215315443905224, + "grad_norm": 1.8041549921035767, + "learning_rate": 4.895974687776398e-05, + "loss": 5.5213, + "step": 15495 + }, + { + "epoch": 0.09215910172233324, + "grad_norm": 1.8982187509536743, + "learning_rate": 4.8959613534459074e-05, + "loss": 5.7038, + "step": 15496 + }, + { + "epoch": 0.09216504900561423, + "grad_norm": 1.9235600233078003, + "learning_rate": 4.895948018279012e-05, + "loss": 5.514, + "step": 15497 + }, + { + "epoch": 0.09217099628889523, + "grad_norm": 2.284212112426758, + "learning_rate": 4.895934682275715e-05, + "loss": 5.4624, + "step": 15498 + }, + { + "epoch": 0.09217694357217623, + "grad_norm": 2.770934820175171, + "learning_rate": 4.895921345436022e-05, + "loss": 4.7516, + "step": 15499 + }, + { + "epoch": 0.09218289085545722, + "grad_norm": 2.054158926010132, + "learning_rate": 4.895908007759939e-05, + "loss": 5.6444, + "step": 15500 + }, + { + "epoch": 0.09218883813873822, + "grad_norm": 2.352905511856079, + "learning_rate": 4.895894669247468e-05, + "loss": 4.7985, + "step": 15501 + }, + { + "epoch": 0.09219478542201923, + "grad_norm": 2.612039804458618, + "learning_rate": 4.895881329898615e-05, + "loss": 4.769, + "step": 15502 + }, + { + "epoch": 0.09220073270530021, + "grad_norm": 2.1274194717407227, + "learning_rate": 4.8958679897133854e-05, + "loss": 4.6185, + "step": 15503 + }, + { + "epoch": 0.09220667998858122, + "grad_norm": 2.2458853721618652, + "learning_rate": 4.895854648691782e-05, + "loss": 4.8576, + "step": 15504 + }, + { + "epoch": 0.09221262727186222, + "grad_norm": 2.415526866912842, + "learning_rate": 4.895841306833811e-05, + "loss": 4.999, + "step": 15505 + }, + { + "epoch": 0.0922185745551432, + "grad_norm": 1.8172876834869385, + "learning_rate": 4.8958279641394765e-05, + "loss": 5.1992, + "step": 15506 + }, + { + "epoch": 0.09222452183842421, + "grad_norm": 2.0568878650665283, + "learning_rate": 4.8958146206087826e-05, + "loss": 5.1348, + "step": 15507 + }, + { + "epoch": 0.09223046912170521, + "grad_norm": 2.152869701385498, + "learning_rate": 4.895801276241736e-05, + "loss": 4.9832, + "step": 15508 + }, + { + "epoch": 0.0922364164049862, + "grad_norm": 1.8191282749176025, + "learning_rate": 4.895787931038339e-05, + "loss": 5.3098, + "step": 15509 + }, + { + "epoch": 0.0922423636882672, + "grad_norm": 1.9511895179748535, + "learning_rate": 4.895774584998597e-05, + "loss": 5.5763, + "step": 15510 + }, + { + "epoch": 0.0922483109715482, + "grad_norm": 1.8735122680664062, + "learning_rate": 4.895761238122515e-05, + "loss": 5.3644, + "step": 15511 + }, + { + "epoch": 0.09225425825482919, + "grad_norm": 1.672721028327942, + "learning_rate": 4.895747890410098e-05, + "loss": 5.2794, + "step": 15512 + }, + { + "epoch": 0.0922602055381102, + "grad_norm": 1.5318527221679688, + "learning_rate": 4.89573454186135e-05, + "loss": 5.3575, + "step": 15513 + }, + { + "epoch": 0.0922661528213912, + "grad_norm": 1.8192704916000366, + "learning_rate": 4.895721192476275e-05, + "loss": 5.498, + "step": 15514 + }, + { + "epoch": 0.09227210010467218, + "grad_norm": 1.948249340057373, + "learning_rate": 4.895707842254879e-05, + "loss": 5.6955, + "step": 15515 + }, + { + "epoch": 0.09227804738795319, + "grad_norm": 2.1378414630889893, + "learning_rate": 4.895694491197166e-05, + "loss": 5.4999, + "step": 15516 + }, + { + "epoch": 0.09228399467123417, + "grad_norm": 2.057358980178833, + "learning_rate": 4.8956811393031414e-05, + "loss": 4.7234, + "step": 15517 + }, + { + "epoch": 0.09228994195451518, + "grad_norm": 1.9550749063491821, + "learning_rate": 4.895667786572809e-05, + "loss": 5.7611, + "step": 15518 + }, + { + "epoch": 0.09229588923779618, + "grad_norm": 2.120396852493286, + "learning_rate": 4.8956544330061734e-05, + "loss": 5.8707, + "step": 15519 + }, + { + "epoch": 0.09230183652107717, + "grad_norm": 1.8432284593582153, + "learning_rate": 4.8956410786032404e-05, + "loss": 5.7512, + "step": 15520 + }, + { + "epoch": 0.09230778380435817, + "grad_norm": 1.738993525505066, + "learning_rate": 4.895627723364013e-05, + "loss": 5.2099, + "step": 15521 + }, + { + "epoch": 0.09231373108763917, + "grad_norm": 1.4885916709899902, + "learning_rate": 4.895614367288497e-05, + "loss": 5.6817, + "step": 15522 + }, + { + "epoch": 0.09231967837092016, + "grad_norm": 1.9712351560592651, + "learning_rate": 4.895601010376697e-05, + "loss": 5.4247, + "step": 15523 + }, + { + "epoch": 0.09232562565420116, + "grad_norm": 1.6669690608978271, + "learning_rate": 4.895587652628617e-05, + "loss": 5.2189, + "step": 15524 + }, + { + "epoch": 0.09233157293748216, + "grad_norm": 2.1034297943115234, + "learning_rate": 4.895574294044262e-05, + "loss": 5.4772, + "step": 15525 + }, + { + "epoch": 0.09233752022076315, + "grad_norm": 2.3692588806152344, + "learning_rate": 4.895560934623637e-05, + "loss": 5.002, + "step": 15526 + }, + { + "epoch": 0.09234346750404415, + "grad_norm": 2.708406686782837, + "learning_rate": 4.8955475743667464e-05, + "loss": 4.9923, + "step": 15527 + }, + { + "epoch": 0.09234941478732515, + "grad_norm": 2.4986281394958496, + "learning_rate": 4.895534213273595e-05, + "loss": 4.7859, + "step": 15528 + }, + { + "epoch": 0.09235536207060614, + "grad_norm": 2.4715240001678467, + "learning_rate": 4.895520851344187e-05, + "loss": 5.2135, + "step": 15529 + }, + { + "epoch": 0.09236130935388714, + "grad_norm": 1.77085280418396, + "learning_rate": 4.895507488578528e-05, + "loss": 5.4675, + "step": 15530 + }, + { + "epoch": 0.09236725663716815, + "grad_norm": 1.4845975637435913, + "learning_rate": 4.8954941249766225e-05, + "loss": 5.8627, + "step": 15531 + }, + { + "epoch": 0.09237320392044913, + "grad_norm": 2.0753140449523926, + "learning_rate": 4.8954807605384734e-05, + "loss": 5.8246, + "step": 15532 + }, + { + "epoch": 0.09237915120373014, + "grad_norm": 1.5671929121017456, + "learning_rate": 4.895467395264088e-05, + "loss": 5.8189, + "step": 15533 + }, + { + "epoch": 0.09238509848701114, + "grad_norm": 1.749223232269287, + "learning_rate": 4.895454029153469e-05, + "loss": 5.9183, + "step": 15534 + }, + { + "epoch": 0.09239104577029213, + "grad_norm": 1.7186611890792847, + "learning_rate": 4.895440662206622e-05, + "loss": 5.84, + "step": 15535 + }, + { + "epoch": 0.09239699305357313, + "grad_norm": 1.654483437538147, + "learning_rate": 4.895427294423551e-05, + "loss": 5.4055, + "step": 15536 + }, + { + "epoch": 0.09240294033685413, + "grad_norm": 1.7109687328338623, + "learning_rate": 4.895413925804261e-05, + "loss": 5.3028, + "step": 15537 + }, + { + "epoch": 0.09240888762013512, + "grad_norm": 1.9221105575561523, + "learning_rate": 4.895400556348757e-05, + "loss": 5.2911, + "step": 15538 + }, + { + "epoch": 0.09241483490341612, + "grad_norm": 1.9464010000228882, + "learning_rate": 4.895387186057044e-05, + "loss": 5.5883, + "step": 15539 + }, + { + "epoch": 0.09242078218669712, + "grad_norm": 1.9429137706756592, + "learning_rate": 4.8953738149291254e-05, + "loss": 5.7164, + "step": 15540 + }, + { + "epoch": 0.09242672946997811, + "grad_norm": 1.7792669534683228, + "learning_rate": 4.8953604429650065e-05, + "loss": 5.7924, + "step": 15541 + }, + { + "epoch": 0.09243267675325911, + "grad_norm": 2.2124290466308594, + "learning_rate": 4.895347070164692e-05, + "loss": 5.4432, + "step": 15542 + }, + { + "epoch": 0.09243862403654012, + "grad_norm": 1.6349585056304932, + "learning_rate": 4.8953336965281873e-05, + "loss": 5.6975, + "step": 15543 + }, + { + "epoch": 0.0924445713198211, + "grad_norm": 2.01434063911438, + "learning_rate": 4.895320322055496e-05, + "loss": 5.3564, + "step": 15544 + }, + { + "epoch": 0.0924505186031021, + "grad_norm": 1.8110109567642212, + "learning_rate": 4.895306946746623e-05, + "loss": 5.3061, + "step": 15545 + }, + { + "epoch": 0.0924564658863831, + "grad_norm": 1.6687593460083008, + "learning_rate": 4.895293570601573e-05, + "loss": 5.4061, + "step": 15546 + }, + { + "epoch": 0.0924624131696641, + "grad_norm": 1.7488101720809937, + "learning_rate": 4.895280193620351e-05, + "loss": 5.4726, + "step": 15547 + }, + { + "epoch": 0.0924683604529451, + "grad_norm": 1.9059126377105713, + "learning_rate": 4.895266815802961e-05, + "loss": 5.9665, + "step": 15548 + }, + { + "epoch": 0.09247430773622609, + "grad_norm": 1.9732307195663452, + "learning_rate": 4.8952534371494084e-05, + "loss": 6.007, + "step": 15549 + }, + { + "epoch": 0.09248025501950709, + "grad_norm": 1.792325496673584, + "learning_rate": 4.895240057659697e-05, + "loss": 5.9466, + "step": 15550 + }, + { + "epoch": 0.09248620230278809, + "grad_norm": 1.7282743453979492, + "learning_rate": 4.895226677333833e-05, + "loss": 5.456, + "step": 15551 + }, + { + "epoch": 0.09249214958606908, + "grad_norm": 1.5014616250991821, + "learning_rate": 4.89521329617182e-05, + "loss": 5.0257, + "step": 15552 + }, + { + "epoch": 0.09249809686935008, + "grad_norm": 1.5420494079589844, + "learning_rate": 4.8951999141736624e-05, + "loss": 5.0657, + "step": 15553 + }, + { + "epoch": 0.09250404415263108, + "grad_norm": 1.4273606538772583, + "learning_rate": 4.895186531339365e-05, + "loss": 5.3431, + "step": 15554 + }, + { + "epoch": 0.09250999143591207, + "grad_norm": 1.9525657892227173, + "learning_rate": 4.895173147668933e-05, + "loss": 5.514, + "step": 15555 + }, + { + "epoch": 0.09251593871919307, + "grad_norm": 2.7004175186157227, + "learning_rate": 4.895159763162371e-05, + "loss": 5.3548, + "step": 15556 + }, + { + "epoch": 0.09252188600247407, + "grad_norm": 2.5703442096710205, + "learning_rate": 4.8951463778196835e-05, + "loss": 5.4275, + "step": 15557 + }, + { + "epoch": 0.09252783328575506, + "grad_norm": 2.4033594131469727, + "learning_rate": 4.895132991640875e-05, + "loss": 5.285, + "step": 15558 + }, + { + "epoch": 0.09253378056903606, + "grad_norm": 2.0295355319976807, + "learning_rate": 4.89511960462595e-05, + "loss": 5.1196, + "step": 15559 + }, + { + "epoch": 0.09253972785231707, + "grad_norm": 2.0739188194274902, + "learning_rate": 4.895106216774914e-05, + "loss": 4.7362, + "step": 15560 + }, + { + "epoch": 0.09254567513559805, + "grad_norm": 2.2429590225219727, + "learning_rate": 4.895092828087771e-05, + "loss": 5.0749, + "step": 15561 + }, + { + "epoch": 0.09255162241887906, + "grad_norm": 1.9738318920135498, + "learning_rate": 4.895079438564526e-05, + "loss": 5.6755, + "step": 15562 + }, + { + "epoch": 0.09255756970216006, + "grad_norm": 2.692275047302246, + "learning_rate": 4.895066048205183e-05, + "loss": 5.3146, + "step": 15563 + }, + { + "epoch": 0.09256351698544105, + "grad_norm": 2.774864912033081, + "learning_rate": 4.895052657009748e-05, + "loss": 5.1116, + "step": 15564 + }, + { + "epoch": 0.09256946426872205, + "grad_norm": 2.5513851642608643, + "learning_rate": 4.895039264978224e-05, + "loss": 5.0464, + "step": 15565 + }, + { + "epoch": 0.09257541155200305, + "grad_norm": 2.2035319805145264, + "learning_rate": 4.895025872110617e-05, + "loss": 5.1499, + "step": 15566 + }, + { + "epoch": 0.09258135883528404, + "grad_norm": 1.669402837753296, + "learning_rate": 4.8950124784069305e-05, + "loss": 5.5006, + "step": 15567 + }, + { + "epoch": 0.09258730611856504, + "grad_norm": 1.9433900117874146, + "learning_rate": 4.894999083867171e-05, + "loss": 5.1423, + "step": 15568 + }, + { + "epoch": 0.09259325340184604, + "grad_norm": 2.2401936054229736, + "learning_rate": 4.8949856884913416e-05, + "loss": 4.8937, + "step": 15569 + }, + { + "epoch": 0.09259920068512703, + "grad_norm": 2.094503164291382, + "learning_rate": 4.894972292279447e-05, + "loss": 4.8554, + "step": 15570 + }, + { + "epoch": 0.09260514796840803, + "grad_norm": 2.1677212715148926, + "learning_rate": 4.894958895231493e-05, + "loss": 4.7446, + "step": 15571 + }, + { + "epoch": 0.09261109525168904, + "grad_norm": 2.0262231826782227, + "learning_rate": 4.894945497347483e-05, + "loss": 4.8282, + "step": 15572 + }, + { + "epoch": 0.09261704253497002, + "grad_norm": 1.9491705894470215, + "learning_rate": 4.894932098627423e-05, + "loss": 4.9579, + "step": 15573 + }, + { + "epoch": 0.09262298981825103, + "grad_norm": 2.0898170471191406, + "learning_rate": 4.8949186990713165e-05, + "loss": 4.8197, + "step": 15574 + }, + { + "epoch": 0.09262893710153201, + "grad_norm": 1.8452088832855225, + "learning_rate": 4.894905298679169e-05, + "loss": 4.8359, + "step": 15575 + }, + { + "epoch": 0.09263488438481302, + "grad_norm": 2.1573541164398193, + "learning_rate": 4.894891897450984e-05, + "loss": 4.5882, + "step": 15576 + }, + { + "epoch": 0.09264083166809402, + "grad_norm": 2.1609156131744385, + "learning_rate": 4.894878495386768e-05, + "loss": 4.7556, + "step": 15577 + }, + { + "epoch": 0.092646778951375, + "grad_norm": 1.9062503576278687, + "learning_rate": 4.894865092486524e-05, + "loss": 4.6933, + "step": 15578 + }, + { + "epoch": 0.09265272623465601, + "grad_norm": 1.8876394033432007, + "learning_rate": 4.894851688750257e-05, + "loss": 4.7317, + "step": 15579 + }, + { + "epoch": 0.09265867351793701, + "grad_norm": 1.9106816053390503, + "learning_rate": 4.894838284177972e-05, + "loss": 4.7597, + "step": 15580 + }, + { + "epoch": 0.092664620801218, + "grad_norm": 1.8116264343261719, + "learning_rate": 4.894824878769674e-05, + "loss": 4.8865, + "step": 15581 + }, + { + "epoch": 0.092670568084499, + "grad_norm": 1.8492180109024048, + "learning_rate": 4.894811472525368e-05, + "loss": 4.7282, + "step": 15582 + }, + { + "epoch": 0.09267651536778, + "grad_norm": 1.9450536966323853, + "learning_rate": 4.894798065445058e-05, + "loss": 5.0777, + "step": 15583 + }, + { + "epoch": 0.09268246265106099, + "grad_norm": 2.2099180221557617, + "learning_rate": 4.894784657528748e-05, + "loss": 5.421, + "step": 15584 + }, + { + "epoch": 0.09268840993434199, + "grad_norm": 2.2239253520965576, + "learning_rate": 4.8947712487764436e-05, + "loss": 5.8346, + "step": 15585 + }, + { + "epoch": 0.092694357217623, + "grad_norm": 1.7867511510849, + "learning_rate": 4.894757839188149e-05, + "loss": 5.9306, + "step": 15586 + }, + { + "epoch": 0.09270030450090398, + "grad_norm": 1.6986007690429688, + "learning_rate": 4.89474442876387e-05, + "loss": 5.0704, + "step": 15587 + }, + { + "epoch": 0.09270625178418498, + "grad_norm": 1.7906185388565063, + "learning_rate": 4.89473101750361e-05, + "loss": 5.1951, + "step": 15588 + }, + { + "epoch": 0.09271219906746599, + "grad_norm": 1.7287026643753052, + "learning_rate": 4.894717605407374e-05, + "loss": 5.1736, + "step": 15589 + }, + { + "epoch": 0.09271814635074697, + "grad_norm": 1.6170624494552612, + "learning_rate": 4.8947041924751665e-05, + "loss": 5.5399, + "step": 15590 + }, + { + "epoch": 0.09272409363402798, + "grad_norm": 1.7556488513946533, + "learning_rate": 4.894690778706994e-05, + "loss": 5.574, + "step": 15591 + }, + { + "epoch": 0.09273004091730898, + "grad_norm": 2.346484899520874, + "learning_rate": 4.894677364102859e-05, + "loss": 5.0062, + "step": 15592 + }, + { + "epoch": 0.09273598820058997, + "grad_norm": 2.1376540660858154, + "learning_rate": 4.894663948662766e-05, + "loss": 5.1377, + "step": 15593 + }, + { + "epoch": 0.09274193548387097, + "grad_norm": 2.2489631175994873, + "learning_rate": 4.894650532386721e-05, + "loss": 5.1058, + "step": 15594 + }, + { + "epoch": 0.09274788276715197, + "grad_norm": 1.984281063079834, + "learning_rate": 4.8946371152747285e-05, + "loss": 5.1223, + "step": 15595 + }, + { + "epoch": 0.09275383005043296, + "grad_norm": 1.9387162923812866, + "learning_rate": 4.8946236973267935e-05, + "loss": 5.5121, + "step": 15596 + }, + { + "epoch": 0.09275977733371396, + "grad_norm": 1.8052873611450195, + "learning_rate": 4.894610278542919e-05, + "loss": 5.2101, + "step": 15597 + }, + { + "epoch": 0.09276572461699496, + "grad_norm": 2.558525562286377, + "learning_rate": 4.894596858923111e-05, + "loss": 4.6659, + "step": 15598 + }, + { + "epoch": 0.09277167190027595, + "grad_norm": 1.700897455215454, + "learning_rate": 4.8945834384673746e-05, + "loss": 5.4634, + "step": 15599 + }, + { + "epoch": 0.09277761918355695, + "grad_norm": 1.4691836833953857, + "learning_rate": 4.8945700171757134e-05, + "loss": 5.3873, + "step": 15600 + }, + { + "epoch": 0.09278356646683796, + "grad_norm": 1.4673740863800049, + "learning_rate": 4.894556595048132e-05, + "loss": 5.3917, + "step": 15601 + }, + { + "epoch": 0.09278951375011894, + "grad_norm": 1.6252011060714722, + "learning_rate": 4.894543172084637e-05, + "loss": 5.2003, + "step": 15602 + }, + { + "epoch": 0.09279546103339995, + "grad_norm": 1.6320288181304932, + "learning_rate": 4.89452974828523e-05, + "loss": 5.4821, + "step": 15603 + }, + { + "epoch": 0.09280140831668093, + "grad_norm": 2.1444239616394043, + "learning_rate": 4.8945163236499194e-05, + "loss": 5.9926, + "step": 15604 + }, + { + "epoch": 0.09280735559996194, + "grad_norm": 2.3000271320343018, + "learning_rate": 4.894502898178707e-05, + "loss": 4.7545, + "step": 15605 + }, + { + "epoch": 0.09281330288324294, + "grad_norm": 2.259962797164917, + "learning_rate": 4.894489471871597e-05, + "loss": 5.1292, + "step": 15606 + }, + { + "epoch": 0.09281925016652393, + "grad_norm": 2.5522921085357666, + "learning_rate": 4.8944760447285977e-05, + "loss": 5.1226, + "step": 15607 + }, + { + "epoch": 0.09282519744980493, + "grad_norm": 1.7621963024139404, + "learning_rate": 4.8944626167497096e-05, + "loss": 5.5405, + "step": 15608 + }, + { + "epoch": 0.09283114473308593, + "grad_norm": 1.6631364822387695, + "learning_rate": 4.894449187934941e-05, + "loss": 5.4332, + "step": 15609 + }, + { + "epoch": 0.09283709201636692, + "grad_norm": 1.695904016494751, + "learning_rate": 4.894435758284294e-05, + "loss": 5.4989, + "step": 15610 + }, + { + "epoch": 0.09284303929964792, + "grad_norm": 2.0772507190704346, + "learning_rate": 4.894422327797774e-05, + "loss": 5.0412, + "step": 15611 + }, + { + "epoch": 0.09284898658292892, + "grad_norm": 1.959685206413269, + "learning_rate": 4.894408896475386e-05, + "loss": 5.2749, + "step": 15612 + }, + { + "epoch": 0.09285493386620991, + "grad_norm": 2.0305607318878174, + "learning_rate": 4.894395464317135e-05, + "loss": 5.6227, + "step": 15613 + }, + { + "epoch": 0.09286088114949091, + "grad_norm": 1.7631112337112427, + "learning_rate": 4.894382031323026e-05, + "loss": 5.4396, + "step": 15614 + }, + { + "epoch": 0.09286682843277191, + "grad_norm": 1.8171305656433105, + "learning_rate": 4.894368597493062e-05, + "loss": 5.2498, + "step": 15615 + }, + { + "epoch": 0.0928727757160529, + "grad_norm": 2.123805522918701, + "learning_rate": 4.894355162827249e-05, + "loss": 5.8113, + "step": 15616 + }, + { + "epoch": 0.0928787229993339, + "grad_norm": 1.840071201324463, + "learning_rate": 4.894341727325591e-05, + "loss": 5.6394, + "step": 15617 + }, + { + "epoch": 0.0928846702826149, + "grad_norm": 1.7636733055114746, + "learning_rate": 4.8943282909880935e-05, + "loss": 5.5515, + "step": 15618 + }, + { + "epoch": 0.0928906175658959, + "grad_norm": 1.956026315689087, + "learning_rate": 4.89431485381476e-05, + "loss": 5.1716, + "step": 15619 + }, + { + "epoch": 0.0928965648491769, + "grad_norm": 2.2381720542907715, + "learning_rate": 4.894301415805597e-05, + "loss": 4.9692, + "step": 15620 + }, + { + "epoch": 0.0929025121324579, + "grad_norm": 2.178999423980713, + "learning_rate": 4.894287976960607e-05, + "loss": 4.9732, + "step": 15621 + }, + { + "epoch": 0.09290845941573889, + "grad_norm": 2.1932144165039062, + "learning_rate": 4.894274537279796e-05, + "loss": 4.9497, + "step": 15622 + }, + { + "epoch": 0.09291440669901989, + "grad_norm": 2.093252182006836, + "learning_rate": 4.894261096763169e-05, + "loss": 4.7642, + "step": 15623 + }, + { + "epoch": 0.09292035398230089, + "grad_norm": 1.785686731338501, + "learning_rate": 4.89424765541073e-05, + "loss": 5.1449, + "step": 15624 + }, + { + "epoch": 0.09292630126558188, + "grad_norm": 2.250986099243164, + "learning_rate": 4.894234213222484e-05, + "loss": 4.8503, + "step": 15625 + }, + { + "epoch": 0.09293224854886288, + "grad_norm": 1.8585362434387207, + "learning_rate": 4.8942207701984355e-05, + "loss": 4.582, + "step": 15626 + }, + { + "epoch": 0.09293819583214388, + "grad_norm": 2.080742597579956, + "learning_rate": 4.894207326338589e-05, + "loss": 4.4912, + "step": 15627 + }, + { + "epoch": 0.09294414311542487, + "grad_norm": 2.422774076461792, + "learning_rate": 4.8941938816429495e-05, + "loss": 4.4227, + "step": 15628 + }, + { + "epoch": 0.09295009039870587, + "grad_norm": 2.3304965496063232, + "learning_rate": 4.8941804361115215e-05, + "loss": 4.2265, + "step": 15629 + }, + { + "epoch": 0.09295603768198687, + "grad_norm": 2.619837522506714, + "learning_rate": 4.8941669897443105e-05, + "loss": 4.6812, + "step": 15630 + }, + { + "epoch": 0.09296198496526786, + "grad_norm": 2.4924118518829346, + "learning_rate": 4.89415354254132e-05, + "loss": 4.5081, + "step": 15631 + }, + { + "epoch": 0.09296793224854887, + "grad_norm": 2.5034751892089844, + "learning_rate": 4.894140094502556e-05, + "loss": 4.3356, + "step": 15632 + }, + { + "epoch": 0.09297387953182985, + "grad_norm": 2.599963665008545, + "learning_rate": 4.894126645628021e-05, + "loss": 4.6952, + "step": 15633 + }, + { + "epoch": 0.09297982681511086, + "grad_norm": 2.189516544342041, + "learning_rate": 4.894113195917722e-05, + "loss": 5.75, + "step": 15634 + }, + { + "epoch": 0.09298577409839186, + "grad_norm": 2.5768351554870605, + "learning_rate": 4.894099745371663e-05, + "loss": 5.9257, + "step": 15635 + }, + { + "epoch": 0.09299172138167285, + "grad_norm": 2.2909457683563232, + "learning_rate": 4.894086293989848e-05, + "loss": 5.484, + "step": 15636 + }, + { + "epoch": 0.09299766866495385, + "grad_norm": 2.0447487831115723, + "learning_rate": 4.894072841772282e-05, + "loss": 5.2952, + "step": 15637 + }, + { + "epoch": 0.09300361594823485, + "grad_norm": 1.8934963941574097, + "learning_rate": 4.894059388718971e-05, + "loss": 5.3498, + "step": 15638 + }, + { + "epoch": 0.09300956323151584, + "grad_norm": 1.9989632368087769, + "learning_rate": 4.894045934829919e-05, + "loss": 5.55, + "step": 15639 + }, + { + "epoch": 0.09301551051479684, + "grad_norm": 1.4955580234527588, + "learning_rate": 4.8940324801051285e-05, + "loss": 5.1978, + "step": 15640 + }, + { + "epoch": 0.09302145779807784, + "grad_norm": 1.8308879137039185, + "learning_rate": 4.8940190245446074e-05, + "loss": 5.5448, + "step": 15641 + }, + { + "epoch": 0.09302740508135883, + "grad_norm": 1.4997726678848267, + "learning_rate": 4.8940055681483576e-05, + "loss": 5.353, + "step": 15642 + }, + { + "epoch": 0.09303335236463983, + "grad_norm": 1.5643866062164307, + "learning_rate": 4.8939921109163864e-05, + "loss": 5.1456, + "step": 15643 + }, + { + "epoch": 0.09303929964792083, + "grad_norm": 1.8125799894332886, + "learning_rate": 4.8939786528486967e-05, + "loss": 5.3456, + "step": 15644 + }, + { + "epoch": 0.09304524693120182, + "grad_norm": 1.6802864074707031, + "learning_rate": 4.893965193945294e-05, + "loss": 5.279, + "step": 15645 + }, + { + "epoch": 0.09305119421448282, + "grad_norm": 1.4397536516189575, + "learning_rate": 4.893951734206182e-05, + "loss": 5.9849, + "step": 15646 + }, + { + "epoch": 0.09305714149776383, + "grad_norm": 1.618416428565979, + "learning_rate": 4.893938273631368e-05, + "loss": 5.231, + "step": 15647 + }, + { + "epoch": 0.09306308878104481, + "grad_norm": 1.4833893775939941, + "learning_rate": 4.8939248122208537e-05, + "loss": 5.2883, + "step": 15648 + }, + { + "epoch": 0.09306903606432582, + "grad_norm": 1.2709630727767944, + "learning_rate": 4.8939113499746446e-05, + "loss": 5.1042, + "step": 15649 + }, + { + "epoch": 0.09307498334760682, + "grad_norm": 1.2770884037017822, + "learning_rate": 4.893897886892747e-05, + "loss": 5.0682, + "step": 15650 + }, + { + "epoch": 0.0930809306308878, + "grad_norm": 1.4511629343032837, + "learning_rate": 4.893884422975163e-05, + "loss": 5.0904, + "step": 15651 + }, + { + "epoch": 0.09308687791416881, + "grad_norm": 1.7428641319274902, + "learning_rate": 4.8938709582219e-05, + "loss": 5.2569, + "step": 15652 + }, + { + "epoch": 0.09309282519744981, + "grad_norm": 1.5430729389190674, + "learning_rate": 4.89385749263296e-05, + "loss": 5.1698, + "step": 15653 + }, + { + "epoch": 0.0930987724807308, + "grad_norm": 1.6689143180847168, + "learning_rate": 4.8938440262083495e-05, + "loss": 5.1866, + "step": 15654 + }, + { + "epoch": 0.0931047197640118, + "grad_norm": 1.505698323249817, + "learning_rate": 4.8938305589480734e-05, + "loss": 5.1574, + "step": 15655 + }, + { + "epoch": 0.0931106670472928, + "grad_norm": 1.496547818183899, + "learning_rate": 4.8938170908521356e-05, + "loss": 5.1175, + "step": 15656 + }, + { + "epoch": 0.09311661433057379, + "grad_norm": 1.5257115364074707, + "learning_rate": 4.893803621920541e-05, + "loss": 5.1796, + "step": 15657 + }, + { + "epoch": 0.09312256161385479, + "grad_norm": 1.5880948305130005, + "learning_rate": 4.893790152153294e-05, + "loss": 5.1864, + "step": 15658 + }, + { + "epoch": 0.0931285088971358, + "grad_norm": 1.632869839668274, + "learning_rate": 4.8937766815503994e-05, + "loss": 5.1126, + "step": 15659 + }, + { + "epoch": 0.09313445618041678, + "grad_norm": 1.5902632474899292, + "learning_rate": 4.893763210111862e-05, + "loss": 5.0661, + "step": 15660 + }, + { + "epoch": 0.09314040346369779, + "grad_norm": 1.2780532836914062, + "learning_rate": 4.893749737837687e-05, + "loss": 5.2189, + "step": 15661 + }, + { + "epoch": 0.09314635074697877, + "grad_norm": 1.604551076889038, + "learning_rate": 4.8937362647278786e-05, + "loss": 5.4624, + "step": 15662 + }, + { + "epoch": 0.09315229803025978, + "grad_norm": 1.3654263019561768, + "learning_rate": 4.8937227907824424e-05, + "loss": 5.3875, + "step": 15663 + }, + { + "epoch": 0.09315824531354078, + "grad_norm": 1.3098255395889282, + "learning_rate": 4.893709316001381e-05, + "loss": 5.2158, + "step": 15664 + }, + { + "epoch": 0.09316419259682177, + "grad_norm": 1.4036632776260376, + "learning_rate": 4.893695840384701e-05, + "loss": 5.3808, + "step": 15665 + }, + { + "epoch": 0.09317013988010277, + "grad_norm": 1.772504210472107, + "learning_rate": 4.893682363932407e-05, + "loss": 5.4599, + "step": 15666 + }, + { + "epoch": 0.09317608716338377, + "grad_norm": 1.8509577512741089, + "learning_rate": 4.893668886644503e-05, + "loss": 5.223, + "step": 15667 + }, + { + "epoch": 0.09318203444666476, + "grad_norm": 1.7572264671325684, + "learning_rate": 4.893655408520993e-05, + "loss": 5.3276, + "step": 15668 + }, + { + "epoch": 0.09318798172994576, + "grad_norm": 1.7149637937545776, + "learning_rate": 4.8936419295618835e-05, + "loss": 5.3093, + "step": 15669 + }, + { + "epoch": 0.09319392901322676, + "grad_norm": 1.441741943359375, + "learning_rate": 4.893628449767178e-05, + "loss": 5.2237, + "step": 15670 + }, + { + "epoch": 0.09319987629650775, + "grad_norm": 1.4929050207138062, + "learning_rate": 4.893614969136882e-05, + "loss": 5.22, + "step": 15671 + }, + { + "epoch": 0.09320582357978875, + "grad_norm": 1.251057505607605, + "learning_rate": 4.893601487670999e-05, + "loss": 5.2417, + "step": 15672 + }, + { + "epoch": 0.09321177086306975, + "grad_norm": 1.313826560974121, + "learning_rate": 4.893588005369535e-05, + "loss": 5.1841, + "step": 15673 + }, + { + "epoch": 0.09321771814635074, + "grad_norm": 1.1993061304092407, + "learning_rate": 4.8935745222324935e-05, + "loss": 5.1649, + "step": 15674 + }, + { + "epoch": 0.09322366542963174, + "grad_norm": 1.4086672067642212, + "learning_rate": 4.8935610382598806e-05, + "loss": 5.1463, + "step": 15675 + }, + { + "epoch": 0.09322961271291275, + "grad_norm": 1.3089197874069214, + "learning_rate": 4.893547553451701e-05, + "loss": 5.1505, + "step": 15676 + }, + { + "epoch": 0.09323555999619373, + "grad_norm": 1.3332446813583374, + "learning_rate": 4.893534067807957e-05, + "loss": 5.1267, + "step": 15677 + }, + { + "epoch": 0.09324150727947474, + "grad_norm": 1.433020830154419, + "learning_rate": 4.893520581328656e-05, + "loss": 5.1689, + "step": 15678 + }, + { + "epoch": 0.09324745456275574, + "grad_norm": 1.4111361503601074, + "learning_rate": 4.893507094013801e-05, + "loss": 5.1288, + "step": 15679 + }, + { + "epoch": 0.09325340184603673, + "grad_norm": 1.551698923110962, + "learning_rate": 4.893493605863398e-05, + "loss": 5.0919, + "step": 15680 + }, + { + "epoch": 0.09325934912931773, + "grad_norm": 1.5479143857955933, + "learning_rate": 4.893480116877451e-05, + "loss": 4.9749, + "step": 15681 + }, + { + "epoch": 0.09326529641259873, + "grad_norm": 1.3716951608657837, + "learning_rate": 4.893466627055964e-05, + "loss": 5.2221, + "step": 15682 + }, + { + "epoch": 0.09327124369587972, + "grad_norm": 1.409462571144104, + "learning_rate": 4.893453136398943e-05, + "loss": 5.2131, + "step": 15683 + }, + { + "epoch": 0.09327719097916072, + "grad_norm": 1.3185720443725586, + "learning_rate": 4.8934396449063935e-05, + "loss": 5.094, + "step": 15684 + }, + { + "epoch": 0.09328313826244172, + "grad_norm": 1.5027118921279907, + "learning_rate": 4.8934261525783176e-05, + "loss": 5.0889, + "step": 15685 + }, + { + "epoch": 0.09328908554572271, + "grad_norm": 2.147268772125244, + "learning_rate": 4.8934126594147216e-05, + "loss": 4.9404, + "step": 15686 + }, + { + "epoch": 0.09329503282900371, + "grad_norm": 1.3361799716949463, + "learning_rate": 4.8933991654156096e-05, + "loss": 5.0744, + "step": 15687 + }, + { + "epoch": 0.09330098011228471, + "grad_norm": 1.6436421871185303, + "learning_rate": 4.893385670580988e-05, + "loss": 5.0633, + "step": 15688 + }, + { + "epoch": 0.0933069273955657, + "grad_norm": 1.5499234199523926, + "learning_rate": 4.8933721749108586e-05, + "loss": 4.8445, + "step": 15689 + }, + { + "epoch": 0.0933128746788467, + "grad_norm": 1.363355278968811, + "learning_rate": 4.893358678405229e-05, + "loss": 5.1135, + "step": 15690 + }, + { + "epoch": 0.0933188219621277, + "grad_norm": 1.4172797203063965, + "learning_rate": 4.893345181064102e-05, + "loss": 5.056, + "step": 15691 + }, + { + "epoch": 0.0933247692454087, + "grad_norm": 1.546329140663147, + "learning_rate": 4.893331682887483e-05, + "loss": 4.9756, + "step": 15692 + }, + { + "epoch": 0.0933307165286897, + "grad_norm": 1.5151170492172241, + "learning_rate": 4.893318183875376e-05, + "loss": 4.991, + "step": 15693 + }, + { + "epoch": 0.09333666381197069, + "grad_norm": 1.1936514377593994, + "learning_rate": 4.893304684027787e-05, + "loss": 5.0454, + "step": 15694 + }, + { + "epoch": 0.09334261109525169, + "grad_norm": 1.4055380821228027, + "learning_rate": 4.893291183344721e-05, + "loss": 5.0673, + "step": 15695 + }, + { + "epoch": 0.09334855837853269, + "grad_norm": 1.4087036848068237, + "learning_rate": 4.89327768182618e-05, + "loss": 4.9748, + "step": 15696 + }, + { + "epoch": 0.09335450566181368, + "grad_norm": 1.251237392425537, + "learning_rate": 4.893264179472171e-05, + "loss": 5.158, + "step": 15697 + }, + { + "epoch": 0.09336045294509468, + "grad_norm": 1.3806357383728027, + "learning_rate": 4.893250676282699e-05, + "loss": 5.2027, + "step": 15698 + }, + { + "epoch": 0.09336640022837568, + "grad_norm": 1.3959203958511353, + "learning_rate": 4.893237172257767e-05, + "loss": 5.1854, + "step": 15699 + }, + { + "epoch": 0.09337234751165667, + "grad_norm": 1.4886810779571533, + "learning_rate": 4.893223667397381e-05, + "loss": 5.2363, + "step": 15700 + }, + { + "epoch": 0.09337829479493767, + "grad_norm": 1.2987968921661377, + "learning_rate": 4.893210161701546e-05, + "loss": 5.2931, + "step": 15701 + }, + { + "epoch": 0.09338424207821867, + "grad_norm": 1.2594645023345947, + "learning_rate": 4.8931966551702644e-05, + "loss": 5.1346, + "step": 15702 + }, + { + "epoch": 0.09339018936149966, + "grad_norm": 1.5101357698440552, + "learning_rate": 4.893183147803544e-05, + "loss": 5.0369, + "step": 15703 + }, + { + "epoch": 0.09339613664478066, + "grad_norm": 1.4388933181762695, + "learning_rate": 4.8931696396013876e-05, + "loss": 5.0427, + "step": 15704 + }, + { + "epoch": 0.09340208392806167, + "grad_norm": 1.2890875339508057, + "learning_rate": 4.8931561305638006e-05, + "loss": 5.1602, + "step": 15705 + }, + { + "epoch": 0.09340803121134265, + "grad_norm": 1.3310670852661133, + "learning_rate": 4.893142620690787e-05, + "loss": 5.4886, + "step": 15706 + }, + { + "epoch": 0.09341397849462366, + "grad_norm": 1.0935169458389282, + "learning_rate": 4.893129109982353e-05, + "loss": 5.4634, + "step": 15707 + }, + { + "epoch": 0.09341992577790466, + "grad_norm": 1.4718440771102905, + "learning_rate": 4.893115598438501e-05, + "loss": 5.4917, + "step": 15708 + }, + { + "epoch": 0.09342587306118565, + "grad_norm": 1.4053934812545776, + "learning_rate": 4.8931020860592384e-05, + "loss": 5.1588, + "step": 15709 + }, + { + "epoch": 0.09343182034446665, + "grad_norm": 1.3130263090133667, + "learning_rate": 4.893088572844568e-05, + "loss": 5.0464, + "step": 15710 + }, + { + "epoch": 0.09343776762774765, + "grad_norm": 1.3342580795288086, + "learning_rate": 4.8930750587944955e-05, + "loss": 5.1464, + "step": 15711 + }, + { + "epoch": 0.09344371491102864, + "grad_norm": 1.3214285373687744, + "learning_rate": 4.893061543909024e-05, + "loss": 5.0867, + "step": 15712 + }, + { + "epoch": 0.09344966219430964, + "grad_norm": 1.2091466188430786, + "learning_rate": 4.893048028188161e-05, + "loss": 5.1403, + "step": 15713 + }, + { + "epoch": 0.09345560947759064, + "grad_norm": 1.421499490737915, + "learning_rate": 4.893034511631909e-05, + "loss": 5.1853, + "step": 15714 + }, + { + "epoch": 0.09346155676087163, + "grad_norm": 1.2093148231506348, + "learning_rate": 4.893020994240273e-05, + "loss": 5.0892, + "step": 15715 + }, + { + "epoch": 0.09346750404415263, + "grad_norm": 1.361080288887024, + "learning_rate": 4.893007476013258e-05, + "loss": 5.0855, + "step": 15716 + }, + { + "epoch": 0.09347345132743363, + "grad_norm": 1.31247079372406, + "learning_rate": 4.89299395695087e-05, + "loss": 5.1667, + "step": 15717 + }, + { + "epoch": 0.09347939861071462, + "grad_norm": 1.4052191972732544, + "learning_rate": 4.892980437053112e-05, + "loss": 4.9256, + "step": 15718 + }, + { + "epoch": 0.09348534589399562, + "grad_norm": 1.409225344657898, + "learning_rate": 4.8929669163199886e-05, + "loss": 4.7722, + "step": 15719 + }, + { + "epoch": 0.09349129317727661, + "grad_norm": 1.54015052318573, + "learning_rate": 4.892953394751505e-05, + "loss": 4.9331, + "step": 15720 + }, + { + "epoch": 0.09349724046055762, + "grad_norm": 1.313596487045288, + "learning_rate": 4.892939872347667e-05, + "loss": 5.0221, + "step": 15721 + }, + { + "epoch": 0.09350318774383862, + "grad_norm": 1.5266852378845215, + "learning_rate": 4.8929263491084785e-05, + "loss": 5.0261, + "step": 15722 + }, + { + "epoch": 0.0935091350271196, + "grad_norm": 1.409408450126648, + "learning_rate": 4.892912825033944e-05, + "loss": 5.1319, + "step": 15723 + }, + { + "epoch": 0.09351508231040061, + "grad_norm": 1.444326639175415, + "learning_rate": 4.892899300124067e-05, + "loss": 5.0043, + "step": 15724 + }, + { + "epoch": 0.09352102959368161, + "grad_norm": 1.6662111282348633, + "learning_rate": 4.8928857743788556e-05, + "loss": 5.22, + "step": 15725 + }, + { + "epoch": 0.0935269768769626, + "grad_norm": 1.5927739143371582, + "learning_rate": 4.8928722477983116e-05, + "loss": 5.1532, + "step": 15726 + }, + { + "epoch": 0.0935329241602436, + "grad_norm": 1.5560848712921143, + "learning_rate": 4.892858720382441e-05, + "loss": 4.8893, + "step": 15727 + }, + { + "epoch": 0.0935388714435246, + "grad_norm": 1.450135588645935, + "learning_rate": 4.892845192131247e-05, + "loss": 4.8116, + "step": 15728 + }, + { + "epoch": 0.09354481872680559, + "grad_norm": 1.3629002571105957, + "learning_rate": 4.892831663044736e-05, + "loss": 4.9439, + "step": 15729 + }, + { + "epoch": 0.09355076601008659, + "grad_norm": 1.5293892621994019, + "learning_rate": 4.892818133122913e-05, + "loss": 5.1726, + "step": 15730 + }, + { + "epoch": 0.0935567132933676, + "grad_norm": 1.193088412284851, + "learning_rate": 4.892804602365781e-05, + "loss": 5.3199, + "step": 15731 + }, + { + "epoch": 0.09356266057664858, + "grad_norm": 1.5575615167617798, + "learning_rate": 4.8927910707733456e-05, + "loss": 5.3426, + "step": 15732 + }, + { + "epoch": 0.09356860785992958, + "grad_norm": 1.4177138805389404, + "learning_rate": 4.892777538345612e-05, + "loss": 5.4028, + "step": 15733 + }, + { + "epoch": 0.09357455514321059, + "grad_norm": 1.4139392375946045, + "learning_rate": 4.892764005082584e-05, + "loss": 5.3854, + "step": 15734 + }, + { + "epoch": 0.09358050242649157, + "grad_norm": 1.5129605531692505, + "learning_rate": 4.892750470984267e-05, + "loss": 5.3614, + "step": 15735 + }, + { + "epoch": 0.09358644970977258, + "grad_norm": 1.23565673828125, + "learning_rate": 4.8927369360506665e-05, + "loss": 5.2379, + "step": 15736 + }, + { + "epoch": 0.09359239699305358, + "grad_norm": 1.4861465692520142, + "learning_rate": 4.892723400281785e-05, + "loss": 5.0968, + "step": 15737 + }, + { + "epoch": 0.09359834427633457, + "grad_norm": 1.4061464071273804, + "learning_rate": 4.892709863677629e-05, + "loss": 5.2947, + "step": 15738 + }, + { + "epoch": 0.09360429155961557, + "grad_norm": 1.2175462245941162, + "learning_rate": 4.892696326238203e-05, + "loss": 5.2828, + "step": 15739 + }, + { + "epoch": 0.09361023884289657, + "grad_norm": 1.398414969444275, + "learning_rate": 4.8926827879635104e-05, + "loss": 5.3281, + "step": 15740 + }, + { + "epoch": 0.09361618612617756, + "grad_norm": 1.438428282737732, + "learning_rate": 4.892669248853558e-05, + "loss": 5.2483, + "step": 15741 + }, + { + "epoch": 0.09362213340945856, + "grad_norm": 1.6579184532165527, + "learning_rate": 4.8926557089083494e-05, + "loss": 5.1275, + "step": 15742 + }, + { + "epoch": 0.09362808069273956, + "grad_norm": 1.2637989521026611, + "learning_rate": 4.892642168127889e-05, + "loss": 5.2276, + "step": 15743 + }, + { + "epoch": 0.09363402797602055, + "grad_norm": 1.383898377418518, + "learning_rate": 4.892628626512182e-05, + "loss": 5.3406, + "step": 15744 + }, + { + "epoch": 0.09363997525930155, + "grad_norm": 1.3794132471084595, + "learning_rate": 4.8926150840612325e-05, + "loss": 5.2309, + "step": 15745 + }, + { + "epoch": 0.09364592254258255, + "grad_norm": 1.3234885931015015, + "learning_rate": 4.8926015407750466e-05, + "loss": 5.3171, + "step": 15746 + }, + { + "epoch": 0.09365186982586354, + "grad_norm": 1.4807502031326294, + "learning_rate": 4.892587996653629e-05, + "loss": 5.3362, + "step": 15747 + }, + { + "epoch": 0.09365781710914454, + "grad_norm": 2.380307912826538, + "learning_rate": 4.892574451696982e-05, + "loss": 5.3103, + "step": 15748 + }, + { + "epoch": 0.09366376439242553, + "grad_norm": 1.5202600955963135, + "learning_rate": 4.892560905905113e-05, + "loss": 5.2225, + "step": 15749 + }, + { + "epoch": 0.09366971167570654, + "grad_norm": 1.34883451461792, + "learning_rate": 4.892547359278025e-05, + "loss": 5.1794, + "step": 15750 + }, + { + "epoch": 0.09367565895898754, + "grad_norm": 1.7073168754577637, + "learning_rate": 4.8925338118157235e-05, + "loss": 5.101, + "step": 15751 + }, + { + "epoch": 0.09368160624226853, + "grad_norm": 1.2718127965927124, + "learning_rate": 4.892520263518214e-05, + "loss": 5.3492, + "step": 15752 + }, + { + "epoch": 0.09368755352554953, + "grad_norm": 1.2247645854949951, + "learning_rate": 4.8925067143854993e-05, + "loss": 5.0841, + "step": 15753 + }, + { + "epoch": 0.09369350080883053, + "grad_norm": 1.4443535804748535, + "learning_rate": 4.892493164417586e-05, + "loss": 5.2866, + "step": 15754 + }, + { + "epoch": 0.09369944809211152, + "grad_norm": 1.2206883430480957, + "learning_rate": 4.8924796136144776e-05, + "loss": 5.116, + "step": 15755 + }, + { + "epoch": 0.09370539537539252, + "grad_norm": 1.4597479104995728, + "learning_rate": 4.89246606197618e-05, + "loss": 5.1501, + "step": 15756 + }, + { + "epoch": 0.09371134265867352, + "grad_norm": 1.4129786491394043, + "learning_rate": 4.892452509502697e-05, + "loss": 5.2618, + "step": 15757 + }, + { + "epoch": 0.09371728994195451, + "grad_norm": 1.382739543914795, + "learning_rate": 4.892438956194033e-05, + "loss": 5.2191, + "step": 15758 + }, + { + "epoch": 0.09372323722523551, + "grad_norm": 1.3665072917938232, + "learning_rate": 4.8924254020501934e-05, + "loss": 4.9739, + "step": 15759 + }, + { + "epoch": 0.09372918450851651, + "grad_norm": 1.3109017610549927, + "learning_rate": 4.892411847071183e-05, + "loss": 5.0648, + "step": 15760 + }, + { + "epoch": 0.0937351317917975, + "grad_norm": 1.5278202295303345, + "learning_rate": 4.892398291257007e-05, + "loss": 5.0215, + "step": 15761 + }, + { + "epoch": 0.0937410790750785, + "grad_norm": 1.4676958322525024, + "learning_rate": 4.8923847346076686e-05, + "loss": 5.442, + "step": 15762 + }, + { + "epoch": 0.0937470263583595, + "grad_norm": 1.4718897342681885, + "learning_rate": 4.892371177123174e-05, + "loss": 5.1484, + "step": 15763 + }, + { + "epoch": 0.0937529736416405, + "grad_norm": 1.2358952760696411, + "learning_rate": 4.8923576188035264e-05, + "loss": 5.3594, + "step": 15764 + }, + { + "epoch": 0.0937589209249215, + "grad_norm": 1.59844172000885, + "learning_rate": 4.8923440596487326e-05, + "loss": 5.221, + "step": 15765 + }, + { + "epoch": 0.0937648682082025, + "grad_norm": 1.4293478727340698, + "learning_rate": 4.892330499658795e-05, + "loss": 5.2211, + "step": 15766 + }, + { + "epoch": 0.09377081549148349, + "grad_norm": 1.167673110961914, + "learning_rate": 4.8923169388337204e-05, + "loss": 5.1274, + "step": 15767 + }, + { + "epoch": 0.09377676277476449, + "grad_norm": 1.4637590646743774, + "learning_rate": 4.892303377173512e-05, + "loss": 5.0781, + "step": 15768 + }, + { + "epoch": 0.09378271005804549, + "grad_norm": 1.383498191833496, + "learning_rate": 4.892289814678176e-05, + "loss": 5.003, + "step": 15769 + }, + { + "epoch": 0.09378865734132648, + "grad_norm": 1.5803290605545044, + "learning_rate": 4.892276251347716e-05, + "loss": 4.9609, + "step": 15770 + }, + { + "epoch": 0.09379460462460748, + "grad_norm": 1.5272483825683594, + "learning_rate": 4.892262687182137e-05, + "loss": 5.074, + "step": 15771 + }, + { + "epoch": 0.09380055190788848, + "grad_norm": 1.377105951309204, + "learning_rate": 4.8922491221814436e-05, + "loss": 5.011, + "step": 15772 + }, + { + "epoch": 0.09380649919116947, + "grad_norm": 1.2150218486785889, + "learning_rate": 4.8922355563456414e-05, + "loss": 5.172, + "step": 15773 + }, + { + "epoch": 0.09381244647445047, + "grad_norm": 1.379515290260315, + "learning_rate": 4.892221989674734e-05, + "loss": 5.229, + "step": 15774 + }, + { + "epoch": 0.09381839375773147, + "grad_norm": 1.5256911516189575, + "learning_rate": 4.892208422168727e-05, + "loss": 5.0163, + "step": 15775 + }, + { + "epoch": 0.09382434104101246, + "grad_norm": 1.645808458328247, + "learning_rate": 4.892194853827624e-05, + "loss": 5.1382, + "step": 15776 + }, + { + "epoch": 0.09383028832429346, + "grad_norm": 1.7437238693237305, + "learning_rate": 4.8921812846514315e-05, + "loss": 4.8078, + "step": 15777 + }, + { + "epoch": 0.09383623560757447, + "grad_norm": 1.384291410446167, + "learning_rate": 4.892167714640152e-05, + "loss": 5.1645, + "step": 15778 + }, + { + "epoch": 0.09384218289085546, + "grad_norm": 1.6412228345870972, + "learning_rate": 4.892154143793792e-05, + "loss": 5.0472, + "step": 15779 + }, + { + "epoch": 0.09384813017413646, + "grad_norm": 1.5364267826080322, + "learning_rate": 4.8921405721123555e-05, + "loss": 5.1357, + "step": 15780 + }, + { + "epoch": 0.09385407745741745, + "grad_norm": 1.4579834938049316, + "learning_rate": 4.892126999595849e-05, + "loss": 5.2047, + "step": 15781 + }, + { + "epoch": 0.09386002474069845, + "grad_norm": 1.4087393283843994, + "learning_rate": 4.8921134262442745e-05, + "loss": 5.3224, + "step": 15782 + }, + { + "epoch": 0.09386597202397945, + "grad_norm": 1.4741411209106445, + "learning_rate": 4.8920998520576376e-05, + "loss": 4.9882, + "step": 15783 + }, + { + "epoch": 0.09387191930726044, + "grad_norm": 1.488578200340271, + "learning_rate": 4.8920862770359434e-05, + "loss": 4.8698, + "step": 15784 + }, + { + "epoch": 0.09387786659054144, + "grad_norm": 1.4695780277252197, + "learning_rate": 4.892072701179197e-05, + "loss": 4.6841, + "step": 15785 + }, + { + "epoch": 0.09388381387382244, + "grad_norm": 1.2468496561050415, + "learning_rate": 4.892059124487402e-05, + "loss": 5.0962, + "step": 15786 + }, + { + "epoch": 0.09388976115710343, + "grad_norm": 1.1099787950515747, + "learning_rate": 4.8920455469605654e-05, + "loss": 5.0883, + "step": 15787 + }, + { + "epoch": 0.09389570844038443, + "grad_norm": 1.3954483270645142, + "learning_rate": 4.892031968598689e-05, + "loss": 4.9554, + "step": 15788 + }, + { + "epoch": 0.09390165572366543, + "grad_norm": 1.3176839351654053, + "learning_rate": 4.892018389401779e-05, + "loss": 5.1638, + "step": 15789 + }, + { + "epoch": 0.09390760300694642, + "grad_norm": 1.2406723499298096, + "learning_rate": 4.892004809369841e-05, + "loss": 5.0569, + "step": 15790 + }, + { + "epoch": 0.09391355029022742, + "grad_norm": 1.395556926727295, + "learning_rate": 4.891991228502878e-05, + "loss": 4.9179, + "step": 15791 + }, + { + "epoch": 0.09391949757350843, + "grad_norm": 1.3977546691894531, + "learning_rate": 4.891977646800896e-05, + "loss": 5.0045, + "step": 15792 + }, + { + "epoch": 0.09392544485678941, + "grad_norm": 1.5089846849441528, + "learning_rate": 4.891964064263899e-05, + "loss": 5.176, + "step": 15793 + }, + { + "epoch": 0.09393139214007042, + "grad_norm": 1.260077953338623, + "learning_rate": 4.891950480891893e-05, + "loss": 5.3789, + "step": 15794 + }, + { + "epoch": 0.09393733942335142, + "grad_norm": 1.3587939739227295, + "learning_rate": 4.891936896684881e-05, + "loss": 5.308, + "step": 15795 + }, + { + "epoch": 0.0939432867066324, + "grad_norm": 1.4004688262939453, + "learning_rate": 4.8919233116428684e-05, + "loss": 5.5232, + "step": 15796 + }, + { + "epoch": 0.09394923398991341, + "grad_norm": 1.3308182954788208, + "learning_rate": 4.89190972576586e-05, + "loss": 5.3944, + "step": 15797 + }, + { + "epoch": 0.09395518127319441, + "grad_norm": 1.3078187704086304, + "learning_rate": 4.891896139053861e-05, + "loss": 5.3146, + "step": 15798 + }, + { + "epoch": 0.0939611285564754, + "grad_norm": 1.3268121480941772, + "learning_rate": 4.891882551506875e-05, + "loss": 5.2966, + "step": 15799 + }, + { + "epoch": 0.0939670758397564, + "grad_norm": 1.424813985824585, + "learning_rate": 4.8918689631249095e-05, + "loss": 5.132, + "step": 15800 + }, + { + "epoch": 0.0939730231230374, + "grad_norm": 1.2917978763580322, + "learning_rate": 4.8918553739079656e-05, + "loss": 5.1889, + "step": 15801 + }, + { + "epoch": 0.09397897040631839, + "grad_norm": 1.377146601676941, + "learning_rate": 4.8918417838560506e-05, + "loss": 5.2749, + "step": 15802 + }, + { + "epoch": 0.09398491768959939, + "grad_norm": 1.2476272583007812, + "learning_rate": 4.891828192969167e-05, + "loss": 5.1367, + "step": 15803 + }, + { + "epoch": 0.0939908649728804, + "grad_norm": 1.423923373222351, + "learning_rate": 4.891814601247322e-05, + "loss": 5.1657, + "step": 15804 + }, + { + "epoch": 0.09399681225616138, + "grad_norm": 1.2762609720230103, + "learning_rate": 4.891801008690518e-05, + "loss": 5.2245, + "step": 15805 + }, + { + "epoch": 0.09400275953944238, + "grad_norm": 1.3098403215408325, + "learning_rate": 4.891787415298763e-05, + "loss": 5.1452, + "step": 15806 + }, + { + "epoch": 0.09400870682272339, + "grad_norm": 1.2892425060272217, + "learning_rate": 4.8917738210720586e-05, + "loss": 5.268, + "step": 15807 + }, + { + "epoch": 0.09401465410600438, + "grad_norm": 1.4667305946350098, + "learning_rate": 4.8917602260104105e-05, + "loss": 5.1666, + "step": 15808 + }, + { + "epoch": 0.09402060138928538, + "grad_norm": 1.289933204650879, + "learning_rate": 4.891746630113824e-05, + "loss": 5.1772, + "step": 15809 + }, + { + "epoch": 0.09402654867256637, + "grad_norm": 2.3923516273498535, + "learning_rate": 4.891733033382303e-05, + "loss": 5.0732, + "step": 15810 + }, + { + "epoch": 0.09403249595584737, + "grad_norm": 1.223607063293457, + "learning_rate": 4.8917194358158534e-05, + "loss": 5.1025, + "step": 15811 + }, + { + "epoch": 0.09403844323912837, + "grad_norm": 1.5959491729736328, + "learning_rate": 4.8917058374144785e-05, + "loss": 5.3244, + "step": 15812 + }, + { + "epoch": 0.09404439052240936, + "grad_norm": 1.2359555959701538, + "learning_rate": 4.8916922381781845e-05, + "loss": 4.8643, + "step": 15813 + }, + { + "epoch": 0.09405033780569036, + "grad_norm": 1.3971196413040161, + "learning_rate": 4.891678638106974e-05, + "loss": 5.0362, + "step": 15814 + }, + { + "epoch": 0.09405628508897136, + "grad_norm": 1.3501266241073608, + "learning_rate": 4.891665037200855e-05, + "loss": 4.8705, + "step": 15815 + }, + { + "epoch": 0.09406223237225235, + "grad_norm": 1.3506006002426147, + "learning_rate": 4.89165143545983e-05, + "loss": 4.9122, + "step": 15816 + }, + { + "epoch": 0.09406817965553335, + "grad_norm": 1.4444037675857544, + "learning_rate": 4.891637832883904e-05, + "loss": 4.8428, + "step": 15817 + }, + { + "epoch": 0.09407412693881435, + "grad_norm": 1.4757333993911743, + "learning_rate": 4.891624229473082e-05, + "loss": 5.1774, + "step": 15818 + }, + { + "epoch": 0.09408007422209534, + "grad_norm": 1.3660651445388794, + "learning_rate": 4.891610625227369e-05, + "loss": 5.2998, + "step": 15819 + }, + { + "epoch": 0.09408602150537634, + "grad_norm": 1.625279426574707, + "learning_rate": 4.891597020146769e-05, + "loss": 5.1365, + "step": 15820 + }, + { + "epoch": 0.09409196878865735, + "grad_norm": 1.5202007293701172, + "learning_rate": 4.891583414231287e-05, + "loss": 5.287, + "step": 15821 + }, + { + "epoch": 0.09409791607193833, + "grad_norm": 1.5217576026916504, + "learning_rate": 4.891569807480928e-05, + "loss": 5.3599, + "step": 15822 + }, + { + "epoch": 0.09410386335521934, + "grad_norm": 1.5446710586547852, + "learning_rate": 4.891556199895696e-05, + "loss": 5.1332, + "step": 15823 + }, + { + "epoch": 0.09410981063850034, + "grad_norm": 1.2877990007400513, + "learning_rate": 4.8915425914755973e-05, + "loss": 5.0756, + "step": 15824 + }, + { + "epoch": 0.09411575792178133, + "grad_norm": 1.3024258613586426, + "learning_rate": 4.891528982220636e-05, + "loss": 5.3293, + "step": 15825 + }, + { + "epoch": 0.09412170520506233, + "grad_norm": 1.3039882183074951, + "learning_rate": 4.8915153721308166e-05, + "loss": 5.1406, + "step": 15826 + }, + { + "epoch": 0.09412765248834333, + "grad_norm": 1.2524348497390747, + "learning_rate": 4.8915017612061435e-05, + "loss": 5.3044, + "step": 15827 + }, + { + "epoch": 0.09413359977162432, + "grad_norm": 1.2522565126419067, + "learning_rate": 4.8914881494466226e-05, + "loss": 5.1776, + "step": 15828 + }, + { + "epoch": 0.09413954705490532, + "grad_norm": 1.3882638216018677, + "learning_rate": 4.8914745368522566e-05, + "loss": 5.2296, + "step": 15829 + }, + { + "epoch": 0.09414549433818632, + "grad_norm": 1.5169535875320435, + "learning_rate": 4.891460923423052e-05, + "loss": 5.2058, + "step": 15830 + }, + { + "epoch": 0.09415144162146731, + "grad_norm": 1.2045719623565674, + "learning_rate": 4.891447309159014e-05, + "loss": 5.256, + "step": 15831 + }, + { + "epoch": 0.09415738890474831, + "grad_norm": 1.4639356136322021, + "learning_rate": 4.891433694060146e-05, + "loss": 5.1781, + "step": 15832 + }, + { + "epoch": 0.09416333618802931, + "grad_norm": 1.498923420906067, + "learning_rate": 4.891420078126453e-05, + "loss": 5.1777, + "step": 15833 + }, + { + "epoch": 0.0941692834713103, + "grad_norm": 1.163977861404419, + "learning_rate": 4.89140646135794e-05, + "loss": 4.9302, + "step": 15834 + }, + { + "epoch": 0.0941752307545913, + "grad_norm": 1.502808690071106, + "learning_rate": 4.8913928437546113e-05, + "loss": 5.1053, + "step": 15835 + }, + { + "epoch": 0.0941811780378723, + "grad_norm": 1.401517391204834, + "learning_rate": 4.891379225316473e-05, + "loss": 5.3156, + "step": 15836 + }, + { + "epoch": 0.0941871253211533, + "grad_norm": 1.328116774559021, + "learning_rate": 4.891365606043528e-05, + "loss": 5.2333, + "step": 15837 + }, + { + "epoch": 0.0941930726044343, + "grad_norm": 1.160243272781372, + "learning_rate": 4.891351985935782e-05, + "loss": 5.2575, + "step": 15838 + }, + { + "epoch": 0.09419901988771529, + "grad_norm": 1.1748963594436646, + "learning_rate": 4.8913383649932404e-05, + "loss": 5.0673, + "step": 15839 + }, + { + "epoch": 0.09420496717099629, + "grad_norm": 1.2916535139083862, + "learning_rate": 4.891324743215907e-05, + "loss": 5.135, + "step": 15840 + }, + { + "epoch": 0.09421091445427729, + "grad_norm": 1.302393913269043, + "learning_rate": 4.8913111206037865e-05, + "loss": 4.9814, + "step": 15841 + }, + { + "epoch": 0.09421686173755828, + "grad_norm": 1.273445963859558, + "learning_rate": 4.891297497156885e-05, + "loss": 4.9163, + "step": 15842 + }, + { + "epoch": 0.09422280902083928, + "grad_norm": 1.444884181022644, + "learning_rate": 4.8912838728752055e-05, + "loss": 4.9316, + "step": 15843 + }, + { + "epoch": 0.09422875630412028, + "grad_norm": 1.411985993385315, + "learning_rate": 4.891270247758753e-05, + "loss": 4.9222, + "step": 15844 + }, + { + "epoch": 0.09423470358740127, + "grad_norm": 1.3697528839111328, + "learning_rate": 4.891256621807533e-05, + "loss": 4.8398, + "step": 15845 + }, + { + "epoch": 0.09424065087068227, + "grad_norm": 1.385298728942871, + "learning_rate": 4.891242995021551e-05, + "loss": 4.8869, + "step": 15846 + }, + { + "epoch": 0.09424659815396327, + "grad_norm": 1.821768879890442, + "learning_rate": 4.8912293674008094e-05, + "loss": 5.178, + "step": 15847 + }, + { + "epoch": 0.09425254543724426, + "grad_norm": 1.8198026418685913, + "learning_rate": 4.891215738945315e-05, + "loss": 5.2892, + "step": 15848 + }, + { + "epoch": 0.09425849272052526, + "grad_norm": 1.4373536109924316, + "learning_rate": 4.891202109655072e-05, + "loss": 5.1203, + "step": 15849 + }, + { + "epoch": 0.09426444000380627, + "grad_norm": 1.2086896896362305, + "learning_rate": 4.8911884795300855e-05, + "loss": 4.8603, + "step": 15850 + }, + { + "epoch": 0.09427038728708725, + "grad_norm": 1.3166700601577759, + "learning_rate": 4.891174848570359e-05, + "loss": 4.917, + "step": 15851 + }, + { + "epoch": 0.09427633457036826, + "grad_norm": 1.5753637552261353, + "learning_rate": 4.891161216775898e-05, + "loss": 5.0197, + "step": 15852 + }, + { + "epoch": 0.09428228185364926, + "grad_norm": 1.5428698062896729, + "learning_rate": 4.891147584146708e-05, + "loss": 5.2048, + "step": 15853 + }, + { + "epoch": 0.09428822913693025, + "grad_norm": 1.3760755062103271, + "learning_rate": 4.8911339506827924e-05, + "loss": 5.2568, + "step": 15854 + }, + { + "epoch": 0.09429417642021125, + "grad_norm": 1.6683621406555176, + "learning_rate": 4.891120316384157e-05, + "loss": 4.8976, + "step": 15855 + }, + { + "epoch": 0.09430012370349225, + "grad_norm": 1.4224987030029297, + "learning_rate": 4.891106681250807e-05, + "loss": 4.9538, + "step": 15856 + }, + { + "epoch": 0.09430607098677324, + "grad_norm": 1.2851178646087646, + "learning_rate": 4.8910930452827454e-05, + "loss": 4.8972, + "step": 15857 + }, + { + "epoch": 0.09431201827005424, + "grad_norm": 1.6412112712860107, + "learning_rate": 4.891079408479978e-05, + "loss": 5.124, + "step": 15858 + }, + { + "epoch": 0.09431796555333524, + "grad_norm": 1.380089282989502, + "learning_rate": 4.891065770842509e-05, + "loss": 5.1155, + "step": 15859 + }, + { + "epoch": 0.09432391283661623, + "grad_norm": 1.3117294311523438, + "learning_rate": 4.891052132370344e-05, + "loss": 5.1968, + "step": 15860 + }, + { + "epoch": 0.09432986011989723, + "grad_norm": 1.5171841382980347, + "learning_rate": 4.891038493063488e-05, + "loss": 5.1029, + "step": 15861 + }, + { + "epoch": 0.09433580740317823, + "grad_norm": 1.4801427125930786, + "learning_rate": 4.8910248529219446e-05, + "loss": 5.1533, + "step": 15862 + }, + { + "epoch": 0.09434175468645922, + "grad_norm": 1.672522783279419, + "learning_rate": 4.8910112119457196e-05, + "loss": 5.3259, + "step": 15863 + }, + { + "epoch": 0.09434770196974022, + "grad_norm": 1.5151952505111694, + "learning_rate": 4.890997570134816e-05, + "loss": 5.2654, + "step": 15864 + }, + { + "epoch": 0.09435364925302123, + "grad_norm": 1.4178684949874878, + "learning_rate": 4.890983927489242e-05, + "loss": 5.2369, + "step": 15865 + }, + { + "epoch": 0.09435959653630221, + "grad_norm": 1.3673019409179688, + "learning_rate": 4.890970284008999e-05, + "loss": 5.2176, + "step": 15866 + }, + { + "epoch": 0.09436554381958322, + "grad_norm": 1.4063305854797363, + "learning_rate": 4.8909566396940934e-05, + "loss": 5.1189, + "step": 15867 + }, + { + "epoch": 0.0943714911028642, + "grad_norm": 1.277815818786621, + "learning_rate": 4.890942994544528e-05, + "loss": 5.2204, + "step": 15868 + }, + { + "epoch": 0.09437743838614521, + "grad_norm": 1.5394912958145142, + "learning_rate": 4.890929348560311e-05, + "loss": 5.1147, + "step": 15869 + }, + { + "epoch": 0.09438338566942621, + "grad_norm": 1.4091798067092896, + "learning_rate": 4.890915701741444e-05, + "loss": 5.1367, + "step": 15870 + }, + { + "epoch": 0.0943893329527072, + "grad_norm": 1.367828369140625, + "learning_rate": 4.8909020540879336e-05, + "loss": 5.1871, + "step": 15871 + }, + { + "epoch": 0.0943952802359882, + "grad_norm": 2.2413175106048584, + "learning_rate": 4.890888405599784e-05, + "loss": 5.0571, + "step": 15872 + }, + { + "epoch": 0.0944012275192692, + "grad_norm": 1.392906904220581, + "learning_rate": 4.8908747562769995e-05, + "loss": 4.9885, + "step": 15873 + }, + { + "epoch": 0.09440717480255019, + "grad_norm": 1.4517099857330322, + "learning_rate": 4.8908611061195865e-05, + "loss": 5.1596, + "step": 15874 + }, + { + "epoch": 0.09441312208583119, + "grad_norm": 1.663919448852539, + "learning_rate": 4.890847455127547e-05, + "loss": 5.0029, + "step": 15875 + }, + { + "epoch": 0.0944190693691122, + "grad_norm": 1.5252666473388672, + "learning_rate": 4.8908338033008885e-05, + "loss": 4.9596, + "step": 15876 + }, + { + "epoch": 0.09442501665239318, + "grad_norm": 1.613261103630066, + "learning_rate": 4.8908201506396143e-05, + "loss": 4.91, + "step": 15877 + }, + { + "epoch": 0.09443096393567418, + "grad_norm": 1.5182253122329712, + "learning_rate": 4.8908064971437295e-05, + "loss": 5.0564, + "step": 15878 + }, + { + "epoch": 0.09443691121895519, + "grad_norm": 1.4765241146087646, + "learning_rate": 4.8907928428132386e-05, + "loss": 5.0863, + "step": 15879 + }, + { + "epoch": 0.09444285850223617, + "grad_norm": 1.6401035785675049, + "learning_rate": 4.890779187648147e-05, + "loss": 4.9876, + "step": 15880 + }, + { + "epoch": 0.09444880578551718, + "grad_norm": 1.4818077087402344, + "learning_rate": 4.8907655316484594e-05, + "loss": 4.9361, + "step": 15881 + }, + { + "epoch": 0.09445475306879818, + "grad_norm": 1.4490398168563843, + "learning_rate": 4.89075187481418e-05, + "loss": 4.8991, + "step": 15882 + }, + { + "epoch": 0.09446070035207917, + "grad_norm": 1.2799785137176514, + "learning_rate": 4.890738217145313e-05, + "loss": 5.0147, + "step": 15883 + }, + { + "epoch": 0.09446664763536017, + "grad_norm": 1.416590929031372, + "learning_rate": 4.890724558641865e-05, + "loss": 5.0255, + "step": 15884 + }, + { + "epoch": 0.09447259491864117, + "grad_norm": 1.4365648031234741, + "learning_rate": 4.8907108993038395e-05, + "loss": 5.0262, + "step": 15885 + }, + { + "epoch": 0.09447854220192216, + "grad_norm": 1.367490530014038, + "learning_rate": 4.890697239131241e-05, + "loss": 4.9478, + "step": 15886 + }, + { + "epoch": 0.09448448948520316, + "grad_norm": 1.3645575046539307, + "learning_rate": 4.8906835781240754e-05, + "loss": 5.0751, + "step": 15887 + }, + { + "epoch": 0.09449043676848416, + "grad_norm": 1.4014960527420044, + "learning_rate": 4.8906699162823464e-05, + "loss": 4.9789, + "step": 15888 + }, + { + "epoch": 0.09449638405176515, + "grad_norm": 1.2261216640472412, + "learning_rate": 4.8906562536060596e-05, + "loss": 4.9619, + "step": 15889 + }, + { + "epoch": 0.09450233133504615, + "grad_norm": 1.3241546154022217, + "learning_rate": 4.890642590095219e-05, + "loss": 4.9947, + "step": 15890 + }, + { + "epoch": 0.09450827861832715, + "grad_norm": 1.337372899055481, + "learning_rate": 4.89062892574983e-05, + "loss": 4.9817, + "step": 15891 + }, + { + "epoch": 0.09451422590160814, + "grad_norm": 1.47610604763031, + "learning_rate": 4.8906152605698974e-05, + "loss": 4.9467, + "step": 15892 + }, + { + "epoch": 0.09452017318488914, + "grad_norm": 1.3533576726913452, + "learning_rate": 4.890601594555425e-05, + "loss": 4.9819, + "step": 15893 + }, + { + "epoch": 0.09452612046817015, + "grad_norm": 1.4445271492004395, + "learning_rate": 4.890587927706419e-05, + "loss": 4.9566, + "step": 15894 + }, + { + "epoch": 0.09453206775145113, + "grad_norm": 1.4600121974945068, + "learning_rate": 4.8905742600228834e-05, + "loss": 4.9341, + "step": 15895 + }, + { + "epoch": 0.09453801503473214, + "grad_norm": 1.2824327945709229, + "learning_rate": 4.8905605915048224e-05, + "loss": 5.0945, + "step": 15896 + }, + { + "epoch": 0.09454396231801313, + "grad_norm": 1.4806164503097534, + "learning_rate": 4.890546922152242e-05, + "loss": 5.1312, + "step": 15897 + }, + { + "epoch": 0.09454990960129413, + "grad_norm": 1.3514155149459839, + "learning_rate": 4.890533251965146e-05, + "loss": 4.9596, + "step": 15898 + }, + { + "epoch": 0.09455585688457513, + "grad_norm": 1.332749843597412, + "learning_rate": 4.89051958094354e-05, + "loss": 5.0649, + "step": 15899 + }, + { + "epoch": 0.09456180416785612, + "grad_norm": 1.310562014579773, + "learning_rate": 4.8905059090874284e-05, + "loss": 5.0977, + "step": 15900 + }, + { + "epoch": 0.09456775145113712, + "grad_norm": 1.342310905456543, + "learning_rate": 4.8904922363968153e-05, + "loss": 5.115, + "step": 15901 + }, + { + "epoch": 0.09457369873441812, + "grad_norm": 1.4810988903045654, + "learning_rate": 4.890478562871706e-05, + "loss": 5.1305, + "step": 15902 + }, + { + "epoch": 0.09457964601769911, + "grad_norm": 1.3064900636672974, + "learning_rate": 4.890464888512106e-05, + "loss": 5.1387, + "step": 15903 + }, + { + "epoch": 0.09458559330098011, + "grad_norm": 1.4571950435638428, + "learning_rate": 4.890451213318019e-05, + "loss": 5.1235, + "step": 15904 + }, + { + "epoch": 0.09459154058426111, + "grad_norm": 1.3964077234268188, + "learning_rate": 4.89043753728945e-05, + "loss": 5.0854, + "step": 15905 + }, + { + "epoch": 0.0945974878675421, + "grad_norm": 1.4404022693634033, + "learning_rate": 4.8904238604264044e-05, + "loss": 5.0991, + "step": 15906 + }, + { + "epoch": 0.0946034351508231, + "grad_norm": 1.3269283771514893, + "learning_rate": 4.890410182728886e-05, + "loss": 4.9299, + "step": 15907 + }, + { + "epoch": 0.0946093824341041, + "grad_norm": 1.4588782787322998, + "learning_rate": 4.8903965041969e-05, + "loss": 5.0992, + "step": 15908 + }, + { + "epoch": 0.0946153297173851, + "grad_norm": 1.2911858558654785, + "learning_rate": 4.8903828248304525e-05, + "loss": 5.0639, + "step": 15909 + }, + { + "epoch": 0.0946212770006661, + "grad_norm": 1.336695909500122, + "learning_rate": 4.8903691446295466e-05, + "loss": 5.1479, + "step": 15910 + }, + { + "epoch": 0.0946272242839471, + "grad_norm": 1.3052904605865479, + "learning_rate": 4.890355463594186e-05, + "loss": 5.049, + "step": 15911 + }, + { + "epoch": 0.09463317156722809, + "grad_norm": 1.3744491338729858, + "learning_rate": 4.890341781724379e-05, + "loss": 5.0709, + "step": 15912 + }, + { + "epoch": 0.09463911885050909, + "grad_norm": 1.5727102756500244, + "learning_rate": 4.890328099020127e-05, + "loss": 4.9857, + "step": 15913 + }, + { + "epoch": 0.09464506613379009, + "grad_norm": 1.5804322957992554, + "learning_rate": 4.890314415481437e-05, + "loss": 5.133, + "step": 15914 + }, + { + "epoch": 0.09465101341707108, + "grad_norm": 1.228421926498413, + "learning_rate": 4.8903007311083124e-05, + "loss": 4.9561, + "step": 15915 + }, + { + "epoch": 0.09465696070035208, + "grad_norm": 1.4680207967758179, + "learning_rate": 4.890287045900759e-05, + "loss": 5.0502, + "step": 15916 + }, + { + "epoch": 0.09466290798363308, + "grad_norm": 1.3447710275650024, + "learning_rate": 4.89027335985878e-05, + "loss": 5.1255, + "step": 15917 + }, + { + "epoch": 0.09466885526691407, + "grad_norm": 1.3510375022888184, + "learning_rate": 4.8902596729823825e-05, + "loss": 5.0936, + "step": 15918 + }, + { + "epoch": 0.09467480255019507, + "grad_norm": 1.3805617094039917, + "learning_rate": 4.89024598527157e-05, + "loss": 5.1146, + "step": 15919 + }, + { + "epoch": 0.09468074983347607, + "grad_norm": 1.568036437034607, + "learning_rate": 4.890232296726347e-05, + "loss": 5.0032, + "step": 15920 + }, + { + "epoch": 0.09468669711675706, + "grad_norm": 1.6060000658035278, + "learning_rate": 4.890218607346718e-05, + "loss": 5.017, + "step": 15921 + }, + { + "epoch": 0.09469264440003806, + "grad_norm": 1.498241901397705, + "learning_rate": 4.890204917132689e-05, + "loss": 5.1265, + "step": 15922 + }, + { + "epoch": 0.09469859168331907, + "grad_norm": 1.418135643005371, + "learning_rate": 4.8901912260842644e-05, + "loss": 5.1458, + "step": 15923 + }, + { + "epoch": 0.09470453896660005, + "grad_norm": 1.3306639194488525, + "learning_rate": 4.890177534201448e-05, + "loss": 5.1672, + "step": 15924 + }, + { + "epoch": 0.09471048624988106, + "grad_norm": 1.542938470840454, + "learning_rate": 4.890163841484246e-05, + "loss": 5.1511, + "step": 15925 + }, + { + "epoch": 0.09471643353316204, + "grad_norm": 1.3050166368484497, + "learning_rate": 4.890150147932662e-05, + "loss": 5.2615, + "step": 15926 + }, + { + "epoch": 0.09472238081644305, + "grad_norm": 1.3447345495224, + "learning_rate": 4.890136453546702e-05, + "loss": 5.2957, + "step": 15927 + }, + { + "epoch": 0.09472832809972405, + "grad_norm": 1.3270481824874878, + "learning_rate": 4.8901227583263695e-05, + "loss": 5.2751, + "step": 15928 + }, + { + "epoch": 0.09473427538300504, + "grad_norm": 1.3909003734588623, + "learning_rate": 4.890109062271669e-05, + "loss": 5.1162, + "step": 15929 + }, + { + "epoch": 0.09474022266628604, + "grad_norm": 1.4668915271759033, + "learning_rate": 4.890095365382608e-05, + "loss": 5.0313, + "step": 15930 + }, + { + "epoch": 0.09474616994956704, + "grad_norm": 1.2651780843734741, + "learning_rate": 4.890081667659188e-05, + "loss": 5.0576, + "step": 15931 + }, + { + "epoch": 0.09475211723284803, + "grad_norm": 1.5086911916732788, + "learning_rate": 4.8900679691014154e-05, + "loss": 4.9508, + "step": 15932 + }, + { + "epoch": 0.09475806451612903, + "grad_norm": 1.2698594331741333, + "learning_rate": 4.8900542697092956e-05, + "loss": 5.0183, + "step": 15933 + }, + { + "epoch": 0.09476401179941003, + "grad_norm": 2.691392183303833, + "learning_rate": 4.8900405694828313e-05, + "loss": 5.0997, + "step": 15934 + }, + { + "epoch": 0.09476995908269102, + "grad_norm": 1.3395452499389648, + "learning_rate": 4.8900268684220295e-05, + "loss": 5.2219, + "step": 15935 + }, + { + "epoch": 0.09477590636597202, + "grad_norm": 1.3485181331634521, + "learning_rate": 4.8900131665268934e-05, + "loss": 4.9594, + "step": 15936 + }, + { + "epoch": 0.09478185364925303, + "grad_norm": 1.2990431785583496, + "learning_rate": 4.889999463797429e-05, + "loss": 4.9492, + "step": 15937 + }, + { + "epoch": 0.09478780093253401, + "grad_norm": 1.2848893404006958, + "learning_rate": 4.8899857602336396e-05, + "loss": 4.9819, + "step": 15938 + }, + { + "epoch": 0.09479374821581502, + "grad_norm": 1.4666554927825928, + "learning_rate": 4.889972055835531e-05, + "loss": 4.9672, + "step": 15939 + }, + { + "epoch": 0.09479969549909602, + "grad_norm": 1.3356142044067383, + "learning_rate": 4.8899583506031085e-05, + "loss": 5.029, + "step": 15940 + }, + { + "epoch": 0.094805642782377, + "grad_norm": 1.561786413192749, + "learning_rate": 4.8899446445363765e-05, + "loss": 4.9071, + "step": 15941 + }, + { + "epoch": 0.09481159006565801, + "grad_norm": 1.4906450510025024, + "learning_rate": 4.889930937635339e-05, + "loss": 5.0832, + "step": 15942 + }, + { + "epoch": 0.09481753734893901, + "grad_norm": 1.5042341947555542, + "learning_rate": 4.889917229900001e-05, + "loss": 5.1069, + "step": 15943 + }, + { + "epoch": 0.09482348463222, + "grad_norm": 1.6562377214431763, + "learning_rate": 4.889903521330368e-05, + "loss": 5.0532, + "step": 15944 + }, + { + "epoch": 0.094829431915501, + "grad_norm": 1.1881135702133179, + "learning_rate": 4.889889811926445e-05, + "loss": 5.1159, + "step": 15945 + }, + { + "epoch": 0.094835379198782, + "grad_norm": 1.3550158739089966, + "learning_rate": 4.889876101688234e-05, + "loss": 5.0754, + "step": 15946 + }, + { + "epoch": 0.09484132648206299, + "grad_norm": 1.403874158859253, + "learning_rate": 4.8898623906157435e-05, + "loss": 5.405, + "step": 15947 + }, + { + "epoch": 0.09484727376534399, + "grad_norm": 1.4460557699203491, + "learning_rate": 4.889848678708977e-05, + "loss": 5.041, + "step": 15948 + }, + { + "epoch": 0.094853221048625, + "grad_norm": 1.4151064157485962, + "learning_rate": 4.889834965967939e-05, + "loss": 5.368, + "step": 15949 + }, + { + "epoch": 0.09485916833190598, + "grad_norm": 1.3388437032699585, + "learning_rate": 4.889821252392633e-05, + "loss": 5.2905, + "step": 15950 + }, + { + "epoch": 0.09486511561518698, + "grad_norm": 1.1941900253295898, + "learning_rate": 4.8898075379830665e-05, + "loss": 5.1499, + "step": 15951 + }, + { + "epoch": 0.09487106289846799, + "grad_norm": 1.4840821027755737, + "learning_rate": 4.889793822739243e-05, + "loss": 5.0461, + "step": 15952 + }, + { + "epoch": 0.09487701018174897, + "grad_norm": 1.4021552801132202, + "learning_rate": 4.889780106661166e-05, + "loss": 4.89, + "step": 15953 + }, + { + "epoch": 0.09488295746502998, + "grad_norm": 1.4893288612365723, + "learning_rate": 4.889766389748842e-05, + "loss": 4.9719, + "step": 15954 + }, + { + "epoch": 0.09488890474831096, + "grad_norm": 1.4530198574066162, + "learning_rate": 4.889752672002275e-05, + "loss": 5.3931, + "step": 15955 + }, + { + "epoch": 0.09489485203159197, + "grad_norm": 1.468037724494934, + "learning_rate": 4.88973895342147e-05, + "loss": 5.271, + "step": 15956 + }, + { + "epoch": 0.09490079931487297, + "grad_norm": 1.3074537515640259, + "learning_rate": 4.889725234006433e-05, + "loss": 5.202, + "step": 15957 + }, + { + "epoch": 0.09490674659815396, + "grad_norm": 1.3678735494613647, + "learning_rate": 4.889711513757166e-05, + "loss": 5.0821, + "step": 15958 + }, + { + "epoch": 0.09491269388143496, + "grad_norm": 1.3922240734100342, + "learning_rate": 4.889697792673676e-05, + "loss": 4.8938, + "step": 15959 + }, + { + "epoch": 0.09491864116471596, + "grad_norm": 1.3895872831344604, + "learning_rate": 4.8896840707559674e-05, + "loss": 4.8293, + "step": 15960 + }, + { + "epoch": 0.09492458844799695, + "grad_norm": 1.223599910736084, + "learning_rate": 4.889670348004045e-05, + "loss": 4.8528, + "step": 15961 + }, + { + "epoch": 0.09493053573127795, + "grad_norm": 1.4488904476165771, + "learning_rate": 4.889656624417913e-05, + "loss": 5.0107, + "step": 15962 + }, + { + "epoch": 0.09493648301455895, + "grad_norm": 1.5250918865203857, + "learning_rate": 4.889642899997576e-05, + "loss": 4.9114, + "step": 15963 + }, + { + "epoch": 0.09494243029783994, + "grad_norm": 1.4656517505645752, + "learning_rate": 4.88962917474304e-05, + "loss": 5.2163, + "step": 15964 + }, + { + "epoch": 0.09494837758112094, + "grad_norm": 1.316635251045227, + "learning_rate": 4.889615448654309e-05, + "loss": 5.1904, + "step": 15965 + }, + { + "epoch": 0.09495432486440195, + "grad_norm": 1.5920292139053345, + "learning_rate": 4.8896017217313886e-05, + "loss": 5.0858, + "step": 15966 + }, + { + "epoch": 0.09496027214768293, + "grad_norm": 1.5263009071350098, + "learning_rate": 4.889587993974282e-05, + "loss": 5.0594, + "step": 15967 + }, + { + "epoch": 0.09496621943096394, + "grad_norm": 1.4230486154556274, + "learning_rate": 4.889574265382996e-05, + "loss": 5.0712, + "step": 15968 + }, + { + "epoch": 0.09497216671424494, + "grad_norm": 1.9315528869628906, + "learning_rate": 4.889560535957533e-05, + "loss": 4.8489, + "step": 15969 + }, + { + "epoch": 0.09497811399752593, + "grad_norm": 1.3432739973068237, + "learning_rate": 4.8895468056979e-05, + "loss": 4.9722, + "step": 15970 + }, + { + "epoch": 0.09498406128080693, + "grad_norm": 1.191886067390442, + "learning_rate": 4.8895330746041e-05, + "loss": 4.9384, + "step": 15971 + }, + { + "epoch": 0.09499000856408793, + "grad_norm": 1.4204323291778564, + "learning_rate": 4.8895193426761396e-05, + "loss": 5.1063, + "step": 15972 + }, + { + "epoch": 0.09499595584736892, + "grad_norm": 1.319189429283142, + "learning_rate": 4.8895056099140224e-05, + "loss": 5.0643, + "step": 15973 + }, + { + "epoch": 0.09500190313064992, + "grad_norm": 1.2905625104904175, + "learning_rate": 4.8894918763177533e-05, + "loss": 5.0806, + "step": 15974 + }, + { + "epoch": 0.09500785041393092, + "grad_norm": 1.6914581060409546, + "learning_rate": 4.889478141887338e-05, + "loss": 4.9209, + "step": 15975 + }, + { + "epoch": 0.09501379769721191, + "grad_norm": 1.390061378479004, + "learning_rate": 4.8894644066227797e-05, + "loss": 5.1376, + "step": 15976 + }, + { + "epoch": 0.09501974498049291, + "grad_norm": 1.2711600065231323, + "learning_rate": 4.889450670524084e-05, + "loss": 5.2344, + "step": 15977 + }, + { + "epoch": 0.09502569226377391, + "grad_norm": 1.472398042678833, + "learning_rate": 4.889436933591256e-05, + "loss": 5.0605, + "step": 15978 + }, + { + "epoch": 0.0950316395470549, + "grad_norm": 1.483567714691162, + "learning_rate": 4.889423195824301e-05, + "loss": 4.9827, + "step": 15979 + }, + { + "epoch": 0.0950375868303359, + "grad_norm": 1.706921935081482, + "learning_rate": 4.889409457223222e-05, + "loss": 5.0692, + "step": 15980 + }, + { + "epoch": 0.0950435341136169, + "grad_norm": 1.7719398736953735, + "learning_rate": 4.889395717788026e-05, + "loss": 5.0985, + "step": 15981 + }, + { + "epoch": 0.0950494813968979, + "grad_norm": 1.6768114566802979, + "learning_rate": 4.889381977518715e-05, + "loss": 4.8838, + "step": 15982 + }, + { + "epoch": 0.0950554286801789, + "grad_norm": 1.5722233057022095, + "learning_rate": 4.889368236415296e-05, + "loss": 4.824, + "step": 15983 + }, + { + "epoch": 0.09506137596345988, + "grad_norm": 1.5722928047180176, + "learning_rate": 4.889354494477773e-05, + "loss": 5.3027, + "step": 15984 + }, + { + "epoch": 0.09506732324674089, + "grad_norm": 2.0003905296325684, + "learning_rate": 4.8893407517061526e-05, + "loss": 5.2216, + "step": 15985 + }, + { + "epoch": 0.09507327053002189, + "grad_norm": 1.390168309211731, + "learning_rate": 4.889327008100437e-05, + "loss": 5.358, + "step": 15986 + }, + { + "epoch": 0.09507921781330288, + "grad_norm": 1.545292854309082, + "learning_rate": 4.889313263660632e-05, + "loss": 5.5124, + "step": 15987 + }, + { + "epoch": 0.09508516509658388, + "grad_norm": 1.4416158199310303, + "learning_rate": 4.889299518386742e-05, + "loss": 5.0929, + "step": 15988 + }, + { + "epoch": 0.09509111237986488, + "grad_norm": 1.8936892747879028, + "learning_rate": 4.889285772278773e-05, + "loss": 4.9407, + "step": 15989 + }, + { + "epoch": 0.09509705966314587, + "grad_norm": 1.4762251377105713, + "learning_rate": 4.889272025336729e-05, + "loss": 5.05, + "step": 15990 + }, + { + "epoch": 0.09510300694642687, + "grad_norm": 1.4513001441955566, + "learning_rate": 4.8892582775606146e-05, + "loss": 5.2386, + "step": 15991 + }, + { + "epoch": 0.09510895422970787, + "grad_norm": 1.8999260663986206, + "learning_rate": 4.8892445289504345e-05, + "loss": 5.1524, + "step": 15992 + }, + { + "epoch": 0.09511490151298886, + "grad_norm": 1.5721614360809326, + "learning_rate": 4.8892307795061945e-05, + "loss": 5.2276, + "step": 15993 + }, + { + "epoch": 0.09512084879626986, + "grad_norm": 1.754425287246704, + "learning_rate": 4.889217029227898e-05, + "loss": 5.118, + "step": 15994 + }, + { + "epoch": 0.09512679607955087, + "grad_norm": 1.6336870193481445, + "learning_rate": 4.889203278115551e-05, + "loss": 5.2065, + "step": 15995 + }, + { + "epoch": 0.09513274336283185, + "grad_norm": 2.721186876296997, + "learning_rate": 4.889189526169157e-05, + "loss": 5.3698, + "step": 15996 + }, + { + "epoch": 0.09513869064611286, + "grad_norm": 1.3870679140090942, + "learning_rate": 4.889175773388722e-05, + "loss": 5.294, + "step": 15997 + }, + { + "epoch": 0.09514463792939386, + "grad_norm": 1.4010889530181885, + "learning_rate": 4.889162019774252e-05, + "loss": 5.2313, + "step": 15998 + }, + { + "epoch": 0.09515058521267485, + "grad_norm": 1.6322177648544312, + "learning_rate": 4.889148265325748e-05, + "loss": 5.2871, + "step": 15999 + }, + { + "epoch": 0.09515653249595585, + "grad_norm": 1.5373196601867676, + "learning_rate": 4.889134510043218e-05, + "loss": 5.4748, + "step": 16000 + }, + { + "epoch": 0.09516247977923685, + "grad_norm": 1.572461724281311, + "learning_rate": 4.889120753926666e-05, + "loss": 5.3634, + "step": 16001 + }, + { + "epoch": 0.09516842706251784, + "grad_norm": 1.3587132692337036, + "learning_rate": 4.889106996976096e-05, + "loss": 5.1399, + "step": 16002 + }, + { + "epoch": 0.09517437434579884, + "grad_norm": 1.1270248889923096, + "learning_rate": 4.889093239191514e-05, + "loss": 5.1845, + "step": 16003 + }, + { + "epoch": 0.09518032162907984, + "grad_norm": 1.5456722974777222, + "learning_rate": 4.889079480572924e-05, + "loss": 5.4895, + "step": 16004 + }, + { + "epoch": 0.09518626891236083, + "grad_norm": 1.2772669792175293, + "learning_rate": 4.8890657211203307e-05, + "loss": 5.5415, + "step": 16005 + }, + { + "epoch": 0.09519221619564183, + "grad_norm": 1.5249123573303223, + "learning_rate": 4.88905196083374e-05, + "loss": 5.2731, + "step": 16006 + }, + { + "epoch": 0.09519816347892283, + "grad_norm": 1.137450098991394, + "learning_rate": 4.889038199713155e-05, + "loss": 5.2232, + "step": 16007 + }, + { + "epoch": 0.09520411076220382, + "grad_norm": 1.4076485633850098, + "learning_rate": 4.889024437758582e-05, + "loss": 5.3428, + "step": 16008 + }, + { + "epoch": 0.09521005804548482, + "grad_norm": 1.3883590698242188, + "learning_rate": 4.889010674970026e-05, + "loss": 5.328, + "step": 16009 + }, + { + "epoch": 0.09521600532876583, + "grad_norm": 1.4320605993270874, + "learning_rate": 4.88899691134749e-05, + "loss": 5.1469, + "step": 16010 + }, + { + "epoch": 0.09522195261204681, + "grad_norm": 1.5601880550384521, + "learning_rate": 4.8889831468909795e-05, + "loss": 5.1063, + "step": 16011 + }, + { + "epoch": 0.09522789989532782, + "grad_norm": 1.4243980646133423, + "learning_rate": 4.8889693816005014e-05, + "loss": 5.067, + "step": 16012 + }, + { + "epoch": 0.0952338471786088, + "grad_norm": 1.3901020288467407, + "learning_rate": 4.8889556154760577e-05, + "loss": 4.9954, + "step": 16013 + }, + { + "epoch": 0.0952397944618898, + "grad_norm": 1.2067557573318481, + "learning_rate": 4.8889418485176544e-05, + "loss": 5.5485, + "step": 16014 + }, + { + "epoch": 0.09524574174517081, + "grad_norm": 1.6004818677902222, + "learning_rate": 4.888928080725296e-05, + "loss": 5.0334, + "step": 16015 + }, + { + "epoch": 0.0952516890284518, + "grad_norm": 1.42451810836792, + "learning_rate": 4.8889143120989864e-05, + "loss": 4.9913, + "step": 16016 + }, + { + "epoch": 0.0952576363117328, + "grad_norm": 1.528438925743103, + "learning_rate": 4.888900542638734e-05, + "loss": 4.9749, + "step": 16017 + }, + { + "epoch": 0.0952635835950138, + "grad_norm": 1.2179231643676758, + "learning_rate": 4.888886772344539e-05, + "loss": 5.0631, + "step": 16018 + }, + { + "epoch": 0.09526953087829479, + "grad_norm": 1.5069763660430908, + "learning_rate": 4.8888730012164085e-05, + "loss": 5.0739, + "step": 16019 + }, + { + "epoch": 0.09527547816157579, + "grad_norm": 1.3587465286254883, + "learning_rate": 4.888859229254348e-05, + "loss": 5.0924, + "step": 16020 + }, + { + "epoch": 0.0952814254448568, + "grad_norm": 1.412811517715454, + "learning_rate": 4.888845456458361e-05, + "loss": 5.0228, + "step": 16021 + }, + { + "epoch": 0.09528737272813778, + "grad_norm": 1.5316507816314697, + "learning_rate": 4.888831682828453e-05, + "loss": 4.9514, + "step": 16022 + }, + { + "epoch": 0.09529332001141878, + "grad_norm": 1.4402068853378296, + "learning_rate": 4.888817908364628e-05, + "loss": 4.9404, + "step": 16023 + }, + { + "epoch": 0.09529926729469979, + "grad_norm": 1.353027582168579, + "learning_rate": 4.888804133066892e-05, + "loss": 5.0359, + "step": 16024 + }, + { + "epoch": 0.09530521457798077, + "grad_norm": 1.4211509227752686, + "learning_rate": 4.8887903569352486e-05, + "loss": 5.2472, + "step": 16025 + }, + { + "epoch": 0.09531116186126178, + "grad_norm": 1.3640077114105225, + "learning_rate": 4.888776579969704e-05, + "loss": 5.4126, + "step": 16026 + }, + { + "epoch": 0.09531710914454278, + "grad_norm": 1.5627541542053223, + "learning_rate": 4.8887628021702616e-05, + "loss": 5.1019, + "step": 16027 + }, + { + "epoch": 0.09532305642782377, + "grad_norm": 1.788611650466919, + "learning_rate": 4.888749023536927e-05, + "loss": 4.9395, + "step": 16028 + }, + { + "epoch": 0.09532900371110477, + "grad_norm": 1.3194786310195923, + "learning_rate": 4.8887352440697044e-05, + "loss": 4.9888, + "step": 16029 + }, + { + "epoch": 0.09533495099438577, + "grad_norm": 1.3091423511505127, + "learning_rate": 4.888721463768598e-05, + "loss": 5.1328, + "step": 16030 + }, + { + "epoch": 0.09534089827766676, + "grad_norm": 1.2864805459976196, + "learning_rate": 4.8887076826336154e-05, + "loss": 5.2569, + "step": 16031 + }, + { + "epoch": 0.09534684556094776, + "grad_norm": 1.3800050020217896, + "learning_rate": 4.888693900664759e-05, + "loss": 5.0698, + "step": 16032 + }, + { + "epoch": 0.09535279284422876, + "grad_norm": 1.2338416576385498, + "learning_rate": 4.8886801178620347e-05, + "loss": 5.227, + "step": 16033 + }, + { + "epoch": 0.09535874012750975, + "grad_norm": 1.4023356437683105, + "learning_rate": 4.888666334225446e-05, + "loss": 5.2976, + "step": 16034 + }, + { + "epoch": 0.09536468741079075, + "grad_norm": 1.4695215225219727, + "learning_rate": 4.8886525497549994e-05, + "loss": 5.1062, + "step": 16035 + }, + { + "epoch": 0.09537063469407175, + "grad_norm": 1.3647410869598389, + "learning_rate": 4.888638764450698e-05, + "loss": 5.2613, + "step": 16036 + }, + { + "epoch": 0.09537658197735274, + "grad_norm": 1.3059413433074951, + "learning_rate": 4.8886249783125484e-05, + "loss": 5.1593, + "step": 16037 + }, + { + "epoch": 0.09538252926063374, + "grad_norm": 1.3861093521118164, + "learning_rate": 4.8886111913405544e-05, + "loss": 4.9149, + "step": 16038 + }, + { + "epoch": 0.09538847654391475, + "grad_norm": 1.4214578866958618, + "learning_rate": 4.88859740353472e-05, + "loss": 5.0443, + "step": 16039 + }, + { + "epoch": 0.09539442382719573, + "grad_norm": 1.3835242986679077, + "learning_rate": 4.888583614895052e-05, + "loss": 4.9516, + "step": 16040 + }, + { + "epoch": 0.09540037111047674, + "grad_norm": 1.47120201587677, + "learning_rate": 4.8885698254215526e-05, + "loss": 4.9673, + "step": 16041 + }, + { + "epoch": 0.09540631839375772, + "grad_norm": 1.4861125946044922, + "learning_rate": 4.8885560351142295e-05, + "loss": 4.8283, + "step": 16042 + }, + { + "epoch": 0.09541226567703873, + "grad_norm": 1.2469282150268555, + "learning_rate": 4.888542243973086e-05, + "loss": 5.164, + "step": 16043 + }, + { + "epoch": 0.09541821296031973, + "grad_norm": 1.2372372150421143, + "learning_rate": 4.888528451998127e-05, + "loss": 5.2986, + "step": 16044 + }, + { + "epoch": 0.09542416024360072, + "grad_norm": 1.370978593826294, + "learning_rate": 4.888514659189357e-05, + "loss": 5.2353, + "step": 16045 + }, + { + "epoch": 0.09543010752688172, + "grad_norm": 1.4328222274780273, + "learning_rate": 4.888500865546781e-05, + "loss": 5.3482, + "step": 16046 + }, + { + "epoch": 0.09543605481016272, + "grad_norm": 1.2651796340942383, + "learning_rate": 4.888487071070405e-05, + "loss": 5.3276, + "step": 16047 + }, + { + "epoch": 0.09544200209344371, + "grad_norm": 1.34639310836792, + "learning_rate": 4.8884732757602325e-05, + "loss": 5.108, + "step": 16048 + }, + { + "epoch": 0.09544794937672471, + "grad_norm": 1.2254658937454224, + "learning_rate": 4.888459479616269e-05, + "loss": 5.1569, + "step": 16049 + }, + { + "epoch": 0.09545389666000571, + "grad_norm": 1.2902439832687378, + "learning_rate": 4.888445682638518e-05, + "loss": 5.2215, + "step": 16050 + }, + { + "epoch": 0.0954598439432867, + "grad_norm": 1.572160243988037, + "learning_rate": 4.888431884826986e-05, + "loss": 5.1288, + "step": 16051 + }, + { + "epoch": 0.0954657912265677, + "grad_norm": 1.266427993774414, + "learning_rate": 4.888418086181676e-05, + "loss": 5.231, + "step": 16052 + }, + { + "epoch": 0.0954717385098487, + "grad_norm": 1.2186620235443115, + "learning_rate": 4.888404286702595e-05, + "loss": 5.113, + "step": 16053 + }, + { + "epoch": 0.0954776857931297, + "grad_norm": 1.386727213859558, + "learning_rate": 4.888390486389747e-05, + "loss": 5.0559, + "step": 16054 + }, + { + "epoch": 0.0954836330764107, + "grad_norm": 1.3253827095031738, + "learning_rate": 4.8883766852431354e-05, + "loss": 5.2569, + "step": 16055 + }, + { + "epoch": 0.0954895803596917, + "grad_norm": 1.219800591468811, + "learning_rate": 4.888362883262767e-05, + "loss": 5.0805, + "step": 16056 + }, + { + "epoch": 0.09549552764297269, + "grad_norm": 1.2425061464309692, + "learning_rate": 4.888349080448646e-05, + "loss": 5.1447, + "step": 16057 + }, + { + "epoch": 0.09550147492625369, + "grad_norm": 2.619645833969116, + "learning_rate": 4.888335276800777e-05, + "loss": 5.2419, + "step": 16058 + }, + { + "epoch": 0.09550742220953469, + "grad_norm": 1.3087180852890015, + "learning_rate": 4.888321472319164e-05, + "loss": 5.1895, + "step": 16059 + }, + { + "epoch": 0.09551336949281568, + "grad_norm": 1.1865695714950562, + "learning_rate": 4.888307667003813e-05, + "loss": 5.1791, + "step": 16060 + }, + { + "epoch": 0.09551931677609668, + "grad_norm": 1.2647303342819214, + "learning_rate": 4.8882938608547294e-05, + "loss": 5.1928, + "step": 16061 + }, + { + "epoch": 0.09552526405937768, + "grad_norm": 1.2161632776260376, + "learning_rate": 4.888280053871916e-05, + "loss": 5.1431, + "step": 16062 + }, + { + "epoch": 0.09553121134265867, + "grad_norm": 1.3904309272766113, + "learning_rate": 4.8882662460553784e-05, + "loss": 5.0658, + "step": 16063 + }, + { + "epoch": 0.09553715862593967, + "grad_norm": 1.4302258491516113, + "learning_rate": 4.888252437405123e-05, + "loss": 5.1838, + "step": 16064 + }, + { + "epoch": 0.09554310590922067, + "grad_norm": 1.4313236474990845, + "learning_rate": 4.888238627921152e-05, + "loss": 5.2108, + "step": 16065 + }, + { + "epoch": 0.09554905319250166, + "grad_norm": 1.485170602798462, + "learning_rate": 4.8882248176034726e-05, + "loss": 5.179, + "step": 16066 + }, + { + "epoch": 0.09555500047578266, + "grad_norm": 1.3742952346801758, + "learning_rate": 4.888211006452088e-05, + "loss": 5.0416, + "step": 16067 + }, + { + "epoch": 0.09556094775906367, + "grad_norm": 1.2600523233413696, + "learning_rate": 4.888197194467005e-05, + "loss": 5.0891, + "step": 16068 + }, + { + "epoch": 0.09556689504234465, + "grad_norm": 1.2905696630477905, + "learning_rate": 4.888183381648225e-05, + "loss": 5.1004, + "step": 16069 + }, + { + "epoch": 0.09557284232562566, + "grad_norm": 1.2373219728469849, + "learning_rate": 4.8881695679957565e-05, + "loss": 5.1549, + "step": 16070 + }, + { + "epoch": 0.09557878960890664, + "grad_norm": 1.43118155002594, + "learning_rate": 4.8881557535096014e-05, + "loss": 5.067, + "step": 16071 + }, + { + "epoch": 0.09558473689218765, + "grad_norm": 1.201025366783142, + "learning_rate": 4.888141938189767e-05, + "loss": 5.1304, + "step": 16072 + }, + { + "epoch": 0.09559068417546865, + "grad_norm": 1.3497222661972046, + "learning_rate": 4.888128122036256e-05, + "loss": 5.0802, + "step": 16073 + }, + { + "epoch": 0.09559663145874964, + "grad_norm": 1.3429580926895142, + "learning_rate": 4.888114305049074e-05, + "loss": 5.1033, + "step": 16074 + }, + { + "epoch": 0.09560257874203064, + "grad_norm": 1.212725281715393, + "learning_rate": 4.888100487228227e-05, + "loss": 5.0627, + "step": 16075 + }, + { + "epoch": 0.09560852602531164, + "grad_norm": 1.258507490158081, + "learning_rate": 4.8880866685737174e-05, + "loss": 5.1215, + "step": 16076 + }, + { + "epoch": 0.09561447330859263, + "grad_norm": 1.4401910305023193, + "learning_rate": 4.888072849085552e-05, + "loss": 4.9619, + "step": 16077 + }, + { + "epoch": 0.09562042059187363, + "grad_norm": 1.240682601928711, + "learning_rate": 4.888059028763735e-05, + "loss": 4.8384, + "step": 16078 + }, + { + "epoch": 0.09562636787515463, + "grad_norm": 1.5701509714126587, + "learning_rate": 4.888045207608272e-05, + "loss": 5.0756, + "step": 16079 + }, + { + "epoch": 0.09563231515843562, + "grad_norm": 2.0408403873443604, + "learning_rate": 4.888031385619166e-05, + "loss": 5.1615, + "step": 16080 + }, + { + "epoch": 0.09563826244171662, + "grad_norm": 1.8134169578552246, + "learning_rate": 4.8880175627964245e-05, + "loss": 5.2383, + "step": 16081 + }, + { + "epoch": 0.09564420972499763, + "grad_norm": 1.4934067726135254, + "learning_rate": 4.888003739140049e-05, + "loss": 5.1512, + "step": 16082 + }, + { + "epoch": 0.09565015700827861, + "grad_norm": 1.6359374523162842, + "learning_rate": 4.887989914650047e-05, + "loss": 5.1245, + "step": 16083 + }, + { + "epoch": 0.09565610429155962, + "grad_norm": 1.5446397066116333, + "learning_rate": 4.887976089326422e-05, + "loss": 4.9806, + "step": 16084 + }, + { + "epoch": 0.09566205157484062, + "grad_norm": 1.845180869102478, + "learning_rate": 4.8879622631691794e-05, + "loss": 5.0474, + "step": 16085 + }, + { + "epoch": 0.0956679988581216, + "grad_norm": 1.8755276203155518, + "learning_rate": 4.887948436178324e-05, + "loss": 5.0674, + "step": 16086 + }, + { + "epoch": 0.09567394614140261, + "grad_norm": 1.5596239566802979, + "learning_rate": 4.88793460835386e-05, + "loss": 5.0699, + "step": 16087 + }, + { + "epoch": 0.09567989342468361, + "grad_norm": 1.6092095375061035, + "learning_rate": 4.8879207796957935e-05, + "loss": 5.1184, + "step": 16088 + }, + { + "epoch": 0.0956858407079646, + "grad_norm": 1.6217916011810303, + "learning_rate": 4.887906950204127e-05, + "loss": 4.9607, + "step": 16089 + }, + { + "epoch": 0.0956917879912456, + "grad_norm": 1.5006567239761353, + "learning_rate": 4.8878931198788694e-05, + "loss": 4.7948, + "step": 16090 + }, + { + "epoch": 0.0956977352745266, + "grad_norm": 1.397647738456726, + "learning_rate": 4.887879288720021e-05, + "loss": 5.1067, + "step": 16091 + }, + { + "epoch": 0.09570368255780759, + "grad_norm": 1.5627835988998413, + "learning_rate": 4.8878654567275886e-05, + "loss": 4.9138, + "step": 16092 + }, + { + "epoch": 0.09570962984108859, + "grad_norm": 1.4590591192245483, + "learning_rate": 4.8878516239015784e-05, + "loss": 4.9132, + "step": 16093 + }, + { + "epoch": 0.0957155771243696, + "grad_norm": 1.347569465637207, + "learning_rate": 4.887837790241992e-05, + "loss": 4.9732, + "step": 16094 + }, + { + "epoch": 0.09572152440765058, + "grad_norm": 1.547169804573059, + "learning_rate": 4.887823955748838e-05, + "loss": 5.1336, + "step": 16095 + }, + { + "epoch": 0.09572747169093158, + "grad_norm": 1.3920515775680542, + "learning_rate": 4.887810120422118e-05, + "loss": 5.0738, + "step": 16096 + }, + { + "epoch": 0.09573341897421259, + "grad_norm": 1.4531773328781128, + "learning_rate": 4.8877962842618386e-05, + "loss": 5.0517, + "step": 16097 + }, + { + "epoch": 0.09573936625749357, + "grad_norm": 1.458679437637329, + "learning_rate": 4.887782447268004e-05, + "loss": 4.9291, + "step": 16098 + }, + { + "epoch": 0.09574531354077458, + "grad_norm": 1.6293518543243408, + "learning_rate": 4.8877686094406196e-05, + "loss": 4.7676, + "step": 16099 + }, + { + "epoch": 0.09575126082405556, + "grad_norm": 1.6756728887557983, + "learning_rate": 4.8877547707796895e-05, + "loss": 4.7426, + "step": 16100 + }, + { + "epoch": 0.09575720810733657, + "grad_norm": 1.7573354244232178, + "learning_rate": 4.8877409312852194e-05, + "loss": 4.6344, + "step": 16101 + }, + { + "epoch": 0.09576315539061757, + "grad_norm": 1.701581597328186, + "learning_rate": 4.8877270909572126e-05, + "loss": 4.8023, + "step": 16102 + }, + { + "epoch": 0.09576910267389856, + "grad_norm": 1.4811267852783203, + "learning_rate": 4.887713249795676e-05, + "loss": 4.9964, + "step": 16103 + }, + { + "epoch": 0.09577504995717956, + "grad_norm": 1.4324437379837036, + "learning_rate": 4.887699407800612e-05, + "loss": 4.9657, + "step": 16104 + }, + { + "epoch": 0.09578099724046056, + "grad_norm": 1.6630572080612183, + "learning_rate": 4.8876855649720285e-05, + "loss": 4.8689, + "step": 16105 + }, + { + "epoch": 0.09578694452374155, + "grad_norm": 1.8548660278320312, + "learning_rate": 4.887671721309928e-05, + "loss": 4.8775, + "step": 16106 + }, + { + "epoch": 0.09579289180702255, + "grad_norm": 1.5234023332595825, + "learning_rate": 4.887657876814316e-05, + "loss": 5.1495, + "step": 16107 + }, + { + "epoch": 0.09579883909030355, + "grad_norm": 1.5281673669815063, + "learning_rate": 4.8876440314851967e-05, + "loss": 4.8887, + "step": 16108 + }, + { + "epoch": 0.09580478637358454, + "grad_norm": 1.6189017295837402, + "learning_rate": 4.887630185322576e-05, + "loss": 4.7103, + "step": 16109 + }, + { + "epoch": 0.09581073365686554, + "grad_norm": 1.8149834871292114, + "learning_rate": 4.8876163383264584e-05, + "loss": 4.5674, + "step": 16110 + }, + { + "epoch": 0.09581668094014655, + "grad_norm": 1.6370511054992676, + "learning_rate": 4.887602490496848e-05, + "loss": 4.6307, + "step": 16111 + }, + { + "epoch": 0.09582262822342753, + "grad_norm": 1.603553056716919, + "learning_rate": 4.887588641833751e-05, + "loss": 4.597, + "step": 16112 + }, + { + "epoch": 0.09582857550670854, + "grad_norm": 1.6511812210083008, + "learning_rate": 4.887574792337171e-05, + "loss": 4.604, + "step": 16113 + }, + { + "epoch": 0.09583452278998954, + "grad_norm": 1.6924868822097778, + "learning_rate": 4.887560942007113e-05, + "loss": 4.6674, + "step": 16114 + }, + { + "epoch": 0.09584047007327053, + "grad_norm": 1.6445999145507812, + "learning_rate": 4.887547090843583e-05, + "loss": 4.492, + "step": 16115 + }, + { + "epoch": 0.09584641735655153, + "grad_norm": 2.282087564468384, + "learning_rate": 4.887533238846585e-05, + "loss": 5.7458, + "step": 16116 + }, + { + "epoch": 0.09585236463983253, + "grad_norm": 1.8790422677993774, + "learning_rate": 4.887519386016123e-05, + "loss": 5.6642, + "step": 16117 + }, + { + "epoch": 0.09585831192311352, + "grad_norm": 1.887954592704773, + "learning_rate": 4.887505532352203e-05, + "loss": 5.8485, + "step": 16118 + }, + { + "epoch": 0.09586425920639452, + "grad_norm": 1.8805441856384277, + "learning_rate": 4.88749167785483e-05, + "loss": 5.5941, + "step": 16119 + }, + { + "epoch": 0.09587020648967552, + "grad_norm": 2.141098976135254, + "learning_rate": 4.8874778225240076e-05, + "loss": 5.1748, + "step": 16120 + }, + { + "epoch": 0.09587615377295651, + "grad_norm": 1.560094952583313, + "learning_rate": 4.887463966359741e-05, + "loss": 5.625, + "step": 16121 + }, + { + "epoch": 0.09588210105623751, + "grad_norm": 1.6463109254837036, + "learning_rate": 4.887450109362036e-05, + "loss": 5.6568, + "step": 16122 + }, + { + "epoch": 0.09588804833951851, + "grad_norm": 1.5389329195022583, + "learning_rate": 4.887436251530898e-05, + "loss": 5.6461, + "step": 16123 + }, + { + "epoch": 0.0958939956227995, + "grad_norm": 1.4973753690719604, + "learning_rate": 4.8874223928663284e-05, + "loss": 5.3542, + "step": 16124 + }, + { + "epoch": 0.0958999429060805, + "grad_norm": 1.4039745330810547, + "learning_rate": 4.8874085333683364e-05, + "loss": 5.506, + "step": 16125 + }, + { + "epoch": 0.0959058901893615, + "grad_norm": 1.819114089012146, + "learning_rate": 4.8873946730369235e-05, + "loss": 5.2586, + "step": 16126 + }, + { + "epoch": 0.0959118374726425, + "grad_norm": 1.9034372568130493, + "learning_rate": 4.887380811872095e-05, + "loss": 5.1818, + "step": 16127 + }, + { + "epoch": 0.0959177847559235, + "grad_norm": 1.8390016555786133, + "learning_rate": 4.8873669498738584e-05, + "loss": 5.8263, + "step": 16128 + }, + { + "epoch": 0.09592373203920448, + "grad_norm": 1.780961275100708, + "learning_rate": 4.887353087042216e-05, + "loss": 5.801, + "step": 16129 + }, + { + "epoch": 0.09592967932248549, + "grad_norm": 1.8105396032333374, + "learning_rate": 4.887339223377173e-05, + "loss": 5.3426, + "step": 16130 + }, + { + "epoch": 0.09593562660576649, + "grad_norm": 1.9126670360565186, + "learning_rate": 4.887325358878735e-05, + "loss": 5.404, + "step": 16131 + }, + { + "epoch": 0.09594157388904748, + "grad_norm": 1.4767181873321533, + "learning_rate": 4.887311493546906e-05, + "loss": 5.5631, + "step": 16132 + }, + { + "epoch": 0.09594752117232848, + "grad_norm": 1.4779311418533325, + "learning_rate": 4.8872976273816904e-05, + "loss": 5.6407, + "step": 16133 + }, + { + "epoch": 0.09595346845560948, + "grad_norm": 1.9026421308517456, + "learning_rate": 4.8872837603830955e-05, + "loss": 5.4299, + "step": 16134 + }, + { + "epoch": 0.09595941573889047, + "grad_norm": 1.845184326171875, + "learning_rate": 4.887269892551123e-05, + "loss": 5.4873, + "step": 16135 + }, + { + "epoch": 0.09596536302217147, + "grad_norm": 2.49023175239563, + "learning_rate": 4.88725602388578e-05, + "loss": 4.1458, + "step": 16136 + }, + { + "epoch": 0.09597131030545247, + "grad_norm": 2.0831515789031982, + "learning_rate": 4.887242154387071e-05, + "loss": 5.0316, + "step": 16137 + }, + { + "epoch": 0.09597725758873346, + "grad_norm": 1.6316094398498535, + "learning_rate": 4.887228284055e-05, + "loss": 5.1289, + "step": 16138 + }, + { + "epoch": 0.09598320487201446, + "grad_norm": 2.025193214416504, + "learning_rate": 4.8872144128895724e-05, + "loss": 5.3065, + "step": 16139 + }, + { + "epoch": 0.09598915215529547, + "grad_norm": 2.077871322631836, + "learning_rate": 4.887200540890793e-05, + "loss": 5.1163, + "step": 16140 + }, + { + "epoch": 0.09599509943857645, + "grad_norm": 1.8450415134429932, + "learning_rate": 4.8871866680586666e-05, + "loss": 5.2638, + "step": 16141 + }, + { + "epoch": 0.09600104672185746, + "grad_norm": 1.676255464553833, + "learning_rate": 4.8871727943931974e-05, + "loss": 4.8191, + "step": 16142 + }, + { + "epoch": 0.09600699400513846, + "grad_norm": 1.6484187841415405, + "learning_rate": 4.8871589198943914e-05, + "loss": 5.3993, + "step": 16143 + }, + { + "epoch": 0.09601294128841945, + "grad_norm": 1.7061866521835327, + "learning_rate": 4.887145044562253e-05, + "loss": 5.2941, + "step": 16144 + }, + { + "epoch": 0.09601888857170045, + "grad_norm": 1.7628071308135986, + "learning_rate": 4.887131168396786e-05, + "loss": 5.2736, + "step": 16145 + }, + { + "epoch": 0.09602483585498145, + "grad_norm": 2.0107390880584717, + "learning_rate": 4.887117291397997e-05, + "loss": 5.1561, + "step": 16146 + }, + { + "epoch": 0.09603078313826244, + "grad_norm": 1.7889841794967651, + "learning_rate": 4.887103413565889e-05, + "loss": 6.0519, + "step": 16147 + }, + { + "epoch": 0.09603673042154344, + "grad_norm": 1.7982914447784424, + "learning_rate": 4.8870895349004686e-05, + "loss": 5.4913, + "step": 16148 + }, + { + "epoch": 0.09604267770482444, + "grad_norm": 1.8263020515441895, + "learning_rate": 4.88707565540174e-05, + "loss": 5.8516, + "step": 16149 + }, + { + "epoch": 0.09604862498810543, + "grad_norm": 1.642863392829895, + "learning_rate": 4.887061775069708e-05, + "loss": 5.5714, + "step": 16150 + }, + { + "epoch": 0.09605457227138643, + "grad_norm": 1.5696642398834229, + "learning_rate": 4.887047893904377e-05, + "loss": 5.4624, + "step": 16151 + }, + { + "epoch": 0.09606051955466743, + "grad_norm": 1.8895677328109741, + "learning_rate": 4.8870340119057536e-05, + "loss": 5.621, + "step": 16152 + }, + { + "epoch": 0.09606646683794842, + "grad_norm": 1.772875428199768, + "learning_rate": 4.8870201290738395e-05, + "loss": 5.5371, + "step": 16153 + }, + { + "epoch": 0.09607241412122942, + "grad_norm": 1.6763731241226196, + "learning_rate": 4.8870062454086415e-05, + "loss": 5.966, + "step": 16154 + }, + { + "epoch": 0.09607836140451043, + "grad_norm": 1.5911294221878052, + "learning_rate": 4.886992360910165e-05, + "loss": 5.3707, + "step": 16155 + }, + { + "epoch": 0.09608430868779141, + "grad_norm": 1.7060188055038452, + "learning_rate": 4.886978475578414e-05, + "loss": 5.5278, + "step": 16156 + }, + { + "epoch": 0.09609025597107242, + "grad_norm": 1.6456331014633179, + "learning_rate": 4.886964589413394e-05, + "loss": 5.5132, + "step": 16157 + }, + { + "epoch": 0.0960962032543534, + "grad_norm": 1.6736609935760498, + "learning_rate": 4.886950702415109e-05, + "loss": 5.245, + "step": 16158 + }, + { + "epoch": 0.0961021505376344, + "grad_norm": 1.5359262228012085, + "learning_rate": 4.886936814583564e-05, + "loss": 5.3893, + "step": 16159 + }, + { + "epoch": 0.09610809782091541, + "grad_norm": 1.5430463552474976, + "learning_rate": 4.886922925918763e-05, + "loss": 5.4257, + "step": 16160 + }, + { + "epoch": 0.0961140451041964, + "grad_norm": 1.940909743309021, + "learning_rate": 4.886909036420714e-05, + "loss": 5.0744, + "step": 16161 + }, + { + "epoch": 0.0961199923874774, + "grad_norm": 1.869372844696045, + "learning_rate": 4.886895146089418e-05, + "loss": 5.4901, + "step": 16162 + }, + { + "epoch": 0.0961259396707584, + "grad_norm": 1.794975996017456, + "learning_rate": 4.886881254924882e-05, + "loss": 5.5174, + "step": 16163 + }, + { + "epoch": 0.09613188695403939, + "grad_norm": 1.6314165592193604, + "learning_rate": 4.8868673629271105e-05, + "loss": 5.5883, + "step": 16164 + }, + { + "epoch": 0.09613783423732039, + "grad_norm": 1.7309901714324951, + "learning_rate": 4.886853470096108e-05, + "loss": 5.3881, + "step": 16165 + }, + { + "epoch": 0.09614378152060139, + "grad_norm": 1.7356623411178589, + "learning_rate": 4.88683957643188e-05, + "loss": 5.3578, + "step": 16166 + }, + { + "epoch": 0.09614972880388238, + "grad_norm": 2.302006244659424, + "learning_rate": 4.886825681934431e-05, + "loss": 5.7811, + "step": 16167 + }, + { + "epoch": 0.09615567608716338, + "grad_norm": 2.282381534576416, + "learning_rate": 4.8868117866037656e-05, + "loss": 5.8847, + "step": 16168 + }, + { + "epoch": 0.09616162337044439, + "grad_norm": 1.9158310890197754, + "learning_rate": 4.886797890439889e-05, + "loss": 5.7663, + "step": 16169 + }, + { + "epoch": 0.09616757065372537, + "grad_norm": 1.6491609811782837, + "learning_rate": 4.886783993442806e-05, + "loss": 5.9077, + "step": 16170 + }, + { + "epoch": 0.09617351793700638, + "grad_norm": 1.739547848701477, + "learning_rate": 4.886770095612521e-05, + "loss": 5.5126, + "step": 16171 + }, + { + "epoch": 0.09617946522028738, + "grad_norm": 1.534516453742981, + "learning_rate": 4.88675619694904e-05, + "loss": 5.372, + "step": 16172 + }, + { + "epoch": 0.09618541250356837, + "grad_norm": 1.8228504657745361, + "learning_rate": 4.8867422974523657e-05, + "loss": 5.4673, + "step": 16173 + }, + { + "epoch": 0.09619135978684937, + "grad_norm": 1.8887168169021606, + "learning_rate": 4.886728397122505e-05, + "loss": 5.5699, + "step": 16174 + }, + { + "epoch": 0.09619730707013037, + "grad_norm": 1.6889835596084595, + "learning_rate": 4.8867144959594626e-05, + "loss": 5.6244, + "step": 16175 + }, + { + "epoch": 0.09620325435341136, + "grad_norm": 1.7387192249298096, + "learning_rate": 4.8867005939632424e-05, + "loss": 5.7735, + "step": 16176 + }, + { + "epoch": 0.09620920163669236, + "grad_norm": 1.9036939144134521, + "learning_rate": 4.8866866911338494e-05, + "loss": 5.8873, + "step": 16177 + }, + { + "epoch": 0.09621514891997336, + "grad_norm": 1.6884106397628784, + "learning_rate": 4.886672787471289e-05, + "loss": 5.1366, + "step": 16178 + }, + { + "epoch": 0.09622109620325435, + "grad_norm": 1.5132830142974854, + "learning_rate": 4.886658882975566e-05, + "loss": 5.2964, + "step": 16179 + }, + { + "epoch": 0.09622704348653535, + "grad_norm": 1.7039000988006592, + "learning_rate": 4.886644977646685e-05, + "loss": 5.2287, + "step": 16180 + }, + { + "epoch": 0.09623299076981635, + "grad_norm": 1.6894882917404175, + "learning_rate": 4.886631071484651e-05, + "loss": 5.3205, + "step": 16181 + }, + { + "epoch": 0.09623893805309734, + "grad_norm": 2.303013324737549, + "learning_rate": 4.8866171644894684e-05, + "loss": 5.2701, + "step": 16182 + }, + { + "epoch": 0.09624488533637834, + "grad_norm": 1.6158491373062134, + "learning_rate": 4.886603256661142e-05, + "loss": 5.522, + "step": 16183 + }, + { + "epoch": 0.09625083261965935, + "grad_norm": 1.5886715650558472, + "learning_rate": 4.8865893479996776e-05, + "loss": 5.7498, + "step": 16184 + }, + { + "epoch": 0.09625677990294033, + "grad_norm": 2.007570505142212, + "learning_rate": 4.88657543850508e-05, + "loss": 5.3746, + "step": 16185 + }, + { + "epoch": 0.09626272718622134, + "grad_norm": 2.8191232681274414, + "learning_rate": 4.886561528177352e-05, + "loss": 4.9794, + "step": 16186 + }, + { + "epoch": 0.09626867446950232, + "grad_norm": 2.5193052291870117, + "learning_rate": 4.886547617016501e-05, + "loss": 4.982, + "step": 16187 + }, + { + "epoch": 0.09627462175278333, + "grad_norm": 1.8875666856765747, + "learning_rate": 4.8865337050225316e-05, + "loss": 5.1801, + "step": 16188 + }, + { + "epoch": 0.09628056903606433, + "grad_norm": 1.441834568977356, + "learning_rate": 4.8865197921954475e-05, + "loss": 5.2723, + "step": 16189 + }, + { + "epoch": 0.09628651631934532, + "grad_norm": 2.0356223583221436, + "learning_rate": 4.8865058785352536e-05, + "loss": 5.4185, + "step": 16190 + }, + { + "epoch": 0.09629246360262632, + "grad_norm": 2.03885817527771, + "learning_rate": 4.8864919640419554e-05, + "loss": 5.1636, + "step": 16191 + }, + { + "epoch": 0.09629841088590732, + "grad_norm": 2.118439197540283, + "learning_rate": 4.8864780487155576e-05, + "loss": 5.4012, + "step": 16192 + }, + { + "epoch": 0.09630435816918831, + "grad_norm": 1.8266710042953491, + "learning_rate": 4.886464132556064e-05, + "loss": 4.9442, + "step": 16193 + }, + { + "epoch": 0.09631030545246931, + "grad_norm": 1.646341323852539, + "learning_rate": 4.886450215563482e-05, + "loss": 5.1368, + "step": 16194 + }, + { + "epoch": 0.09631625273575031, + "grad_norm": 1.8833272457122803, + "learning_rate": 4.886436297737814e-05, + "loss": 5.279, + "step": 16195 + }, + { + "epoch": 0.0963222000190313, + "grad_norm": 1.9521067142486572, + "learning_rate": 4.8864223790790666e-05, + "loss": 5.6571, + "step": 16196 + }, + { + "epoch": 0.0963281473023123, + "grad_norm": 1.8902586698532104, + "learning_rate": 4.8864084595872427e-05, + "loss": 5.632, + "step": 16197 + }, + { + "epoch": 0.0963340945855933, + "grad_norm": 1.7994412183761597, + "learning_rate": 4.886394539262349e-05, + "loss": 5.574, + "step": 16198 + }, + { + "epoch": 0.0963400418688743, + "grad_norm": 1.751780390739441, + "learning_rate": 4.8863806181043895e-05, + "loss": 5.691, + "step": 16199 + }, + { + "epoch": 0.0963459891521553, + "grad_norm": 2.30880069732666, + "learning_rate": 4.8863666961133684e-05, + "loss": 5.7477, + "step": 16200 + }, + { + "epoch": 0.0963519364354363, + "grad_norm": 2.351921319961548, + "learning_rate": 4.8863527732892924e-05, + "loss": 5.8162, + "step": 16201 + }, + { + "epoch": 0.09635788371871729, + "grad_norm": 1.6124454736709595, + "learning_rate": 4.8863388496321636e-05, + "loss": 5.8105, + "step": 16202 + }, + { + "epoch": 0.09636383100199829, + "grad_norm": 1.4927148818969727, + "learning_rate": 4.886324925141991e-05, + "loss": 5.8246, + "step": 16203 + }, + { + "epoch": 0.09636977828527929, + "grad_norm": 1.71438729763031, + "learning_rate": 4.886310999818775e-05, + "loss": 5.798, + "step": 16204 + }, + { + "epoch": 0.09637572556856028, + "grad_norm": 1.9519150257110596, + "learning_rate": 4.886297073662523e-05, + "loss": 5.2815, + "step": 16205 + }, + { + "epoch": 0.09638167285184128, + "grad_norm": 1.7694860696792603, + "learning_rate": 4.88628314667324e-05, + "loss": 5.7564, + "step": 16206 + }, + { + "epoch": 0.09638762013512228, + "grad_norm": 1.658252477645874, + "learning_rate": 4.88626921885093e-05, + "loss": 5.6586, + "step": 16207 + }, + { + "epoch": 0.09639356741840327, + "grad_norm": 2.310295581817627, + "learning_rate": 4.886255290195598e-05, + "loss": 4.9317, + "step": 16208 + }, + { + "epoch": 0.09639951470168427, + "grad_norm": 2.239964246749878, + "learning_rate": 4.886241360707249e-05, + "loss": 5.3794, + "step": 16209 + }, + { + "epoch": 0.09640546198496527, + "grad_norm": 2.470205307006836, + "learning_rate": 4.886227430385887e-05, + "loss": 5.1755, + "step": 16210 + }, + { + "epoch": 0.09641140926824626, + "grad_norm": 2.208298683166504, + "learning_rate": 4.8862134992315185e-05, + "loss": 5.1296, + "step": 16211 + }, + { + "epoch": 0.09641735655152726, + "grad_norm": 2.112288475036621, + "learning_rate": 4.886199567244147e-05, + "loss": 5.0888, + "step": 16212 + }, + { + "epoch": 0.09642330383480827, + "grad_norm": 2.3725969791412354, + "learning_rate": 4.886185634423778e-05, + "loss": 5.0256, + "step": 16213 + }, + { + "epoch": 0.09642925111808925, + "grad_norm": 2.3314402103424072, + "learning_rate": 4.8861717007704164e-05, + "loss": 5.012, + "step": 16214 + }, + { + "epoch": 0.09643519840137026, + "grad_norm": 2.1015000343322754, + "learning_rate": 4.8861577662840676e-05, + "loss": 4.7244, + "step": 16215 + }, + { + "epoch": 0.09644114568465124, + "grad_norm": 2.335218906402588, + "learning_rate": 4.8861438309647344e-05, + "loss": 4.8442, + "step": 16216 + }, + { + "epoch": 0.09644709296793225, + "grad_norm": 2.249216079711914, + "learning_rate": 4.886129894812424e-05, + "loss": 5.2573, + "step": 16217 + }, + { + "epoch": 0.09645304025121325, + "grad_norm": 2.228283166885376, + "learning_rate": 4.8861159578271406e-05, + "loss": 4.7297, + "step": 16218 + }, + { + "epoch": 0.09645898753449424, + "grad_norm": 1.7820645570755005, + "learning_rate": 4.886102020008888e-05, + "loss": 4.8427, + "step": 16219 + }, + { + "epoch": 0.09646493481777524, + "grad_norm": 2.1911120414733887, + "learning_rate": 4.886088081357672e-05, + "loss": 4.9677, + "step": 16220 + }, + { + "epoch": 0.09647088210105624, + "grad_norm": 2.453758716583252, + "learning_rate": 4.8860741418734976e-05, + "loss": 4.9039, + "step": 16221 + }, + { + "epoch": 0.09647682938433723, + "grad_norm": 2.488105058670044, + "learning_rate": 4.886060201556369e-05, + "loss": 5.0211, + "step": 16222 + }, + { + "epoch": 0.09648277666761823, + "grad_norm": 2.2040843963623047, + "learning_rate": 4.8860462604062915e-05, + "loss": 5.1067, + "step": 16223 + }, + { + "epoch": 0.09648872395089923, + "grad_norm": 2.0934717655181885, + "learning_rate": 4.8860323184232695e-05, + "loss": 4.9648, + "step": 16224 + }, + { + "epoch": 0.09649467123418022, + "grad_norm": 2.3775415420532227, + "learning_rate": 4.886018375607309e-05, + "loss": 4.9459, + "step": 16225 + }, + { + "epoch": 0.09650061851746122, + "grad_norm": 2.4042131900787354, + "learning_rate": 4.886004431958414e-05, + "loss": 4.7845, + "step": 16226 + }, + { + "epoch": 0.09650656580074223, + "grad_norm": 2.34424090385437, + "learning_rate": 4.885990487476589e-05, + "loss": 5.012, + "step": 16227 + }, + { + "epoch": 0.09651251308402321, + "grad_norm": 2.2711172103881836, + "learning_rate": 4.8859765421618395e-05, + "loss": 4.906, + "step": 16228 + }, + { + "epoch": 0.09651846036730422, + "grad_norm": 2.4021360874176025, + "learning_rate": 4.8859625960141706e-05, + "loss": 4.916, + "step": 16229 + }, + { + "epoch": 0.09652440765058522, + "grad_norm": 1.9205279350280762, + "learning_rate": 4.885948649033587e-05, + "loss": 5.0469, + "step": 16230 + }, + { + "epoch": 0.0965303549338662, + "grad_norm": 2.226362466812134, + "learning_rate": 4.885934701220093e-05, + "loss": 4.9439, + "step": 16231 + }, + { + "epoch": 0.09653630221714721, + "grad_norm": 2.288909673690796, + "learning_rate": 4.885920752573694e-05, + "loss": 4.8271, + "step": 16232 + }, + { + "epoch": 0.09654224950042821, + "grad_norm": 2.132235050201416, + "learning_rate": 4.8859068030943943e-05, + "loss": 5.1891, + "step": 16233 + }, + { + "epoch": 0.0965481967837092, + "grad_norm": 2.080244541168213, + "learning_rate": 4.8858928527822e-05, + "loss": 4.9055, + "step": 16234 + }, + { + "epoch": 0.0965541440669902, + "grad_norm": 2.324211359024048, + "learning_rate": 4.8858789016371145e-05, + "loss": 5.2614, + "step": 16235 + }, + { + "epoch": 0.0965600913502712, + "grad_norm": 1.827802062034607, + "learning_rate": 4.8858649496591437e-05, + "loss": 4.8874, + "step": 16236 + }, + { + "epoch": 0.09656603863355219, + "grad_norm": 1.8670811653137207, + "learning_rate": 4.885850996848292e-05, + "loss": 5.2402, + "step": 16237 + }, + { + "epoch": 0.09657198591683319, + "grad_norm": 2.046444892883301, + "learning_rate": 4.885837043204564e-05, + "loss": 4.7029, + "step": 16238 + }, + { + "epoch": 0.0965779332001142, + "grad_norm": 2.007894992828369, + "learning_rate": 4.885823088727965e-05, + "loss": 5.6706, + "step": 16239 + }, + { + "epoch": 0.09658388048339518, + "grad_norm": 2.24422025680542, + "learning_rate": 4.8858091334185005e-05, + "loss": 5.9666, + "step": 16240 + }, + { + "epoch": 0.09658982776667618, + "grad_norm": 1.7045838832855225, + "learning_rate": 4.885795177276174e-05, + "loss": 5.3021, + "step": 16241 + }, + { + "epoch": 0.09659577504995719, + "grad_norm": 1.7880860567092896, + "learning_rate": 4.885781220300991e-05, + "loss": 4.9151, + "step": 16242 + }, + { + "epoch": 0.09660172233323817, + "grad_norm": 2.3720862865448, + "learning_rate": 4.885767262492957e-05, + "loss": 5.0868, + "step": 16243 + }, + { + "epoch": 0.09660766961651918, + "grad_norm": 1.8655211925506592, + "learning_rate": 4.8857533038520756e-05, + "loss": 5.5072, + "step": 16244 + }, + { + "epoch": 0.09661361689980018, + "grad_norm": 1.8259748220443726, + "learning_rate": 4.885739344378353e-05, + "loss": 5.5992, + "step": 16245 + }, + { + "epoch": 0.09661956418308117, + "grad_norm": 1.667145013809204, + "learning_rate": 4.885725384071793e-05, + "loss": 5.2069, + "step": 16246 + }, + { + "epoch": 0.09662551146636217, + "grad_norm": 1.8004356622695923, + "learning_rate": 4.8857114229324015e-05, + "loss": 5.232, + "step": 16247 + }, + { + "epoch": 0.09663145874964316, + "grad_norm": 1.8246740102767944, + "learning_rate": 4.8856974609601825e-05, + "loss": 5.185, + "step": 16248 + }, + { + "epoch": 0.09663740603292416, + "grad_norm": 1.7453134059906006, + "learning_rate": 4.885683498155141e-05, + "loss": 4.9118, + "step": 16249 + }, + { + "epoch": 0.09664335331620516, + "grad_norm": 1.76914381980896, + "learning_rate": 4.885669534517282e-05, + "loss": 4.6679, + "step": 16250 + }, + { + "epoch": 0.09664930059948615, + "grad_norm": 2.0119516849517822, + "learning_rate": 4.88565557004661e-05, + "loss": 4.6495, + "step": 16251 + }, + { + "epoch": 0.09665524788276715, + "grad_norm": 1.7628357410430908, + "learning_rate": 4.885641604743131e-05, + "loss": 4.7581, + "step": 16252 + }, + { + "epoch": 0.09666119516604815, + "grad_norm": 1.6456751823425293, + "learning_rate": 4.8856276386068486e-05, + "loss": 4.9539, + "step": 16253 + }, + { + "epoch": 0.09666714244932914, + "grad_norm": 1.8474618196487427, + "learning_rate": 4.885613671637769e-05, + "loss": 5.9248, + "step": 16254 + }, + { + "epoch": 0.09667308973261014, + "grad_norm": 2.1205222606658936, + "learning_rate": 4.885599703835896e-05, + "loss": 5.2783, + "step": 16255 + }, + { + "epoch": 0.09667903701589114, + "grad_norm": 1.7559815645217896, + "learning_rate": 4.885585735201235e-05, + "loss": 5.6276, + "step": 16256 + }, + { + "epoch": 0.09668498429917213, + "grad_norm": 1.5784190893173218, + "learning_rate": 4.885571765733789e-05, + "loss": 5.5933, + "step": 16257 + }, + { + "epoch": 0.09669093158245314, + "grad_norm": 1.7377841472625732, + "learning_rate": 4.885557795433567e-05, + "loss": 5.1234, + "step": 16258 + }, + { + "epoch": 0.09669687886573414, + "grad_norm": 1.6517775058746338, + "learning_rate": 4.88554382430057e-05, + "loss": 5.6291, + "step": 16259 + }, + { + "epoch": 0.09670282614901513, + "grad_norm": 1.8474104404449463, + "learning_rate": 4.885529852334805e-05, + "loss": 6.0357, + "step": 16260 + }, + { + "epoch": 0.09670877343229613, + "grad_norm": 1.6555463075637817, + "learning_rate": 4.8855158795362756e-05, + "loss": 5.9828, + "step": 16261 + }, + { + "epoch": 0.09671472071557713, + "grad_norm": 1.6003193855285645, + "learning_rate": 4.8855019059049876e-05, + "loss": 5.9705, + "step": 16262 + }, + { + "epoch": 0.09672066799885812, + "grad_norm": 1.4992772340774536, + "learning_rate": 4.885487931440945e-05, + "loss": 5.8604, + "step": 16263 + }, + { + "epoch": 0.09672661528213912, + "grad_norm": 1.8667478561401367, + "learning_rate": 4.885473956144154e-05, + "loss": 6.1141, + "step": 16264 + }, + { + "epoch": 0.09673256256542012, + "grad_norm": 1.7311911582946777, + "learning_rate": 4.8854599800146186e-05, + "loss": 5.4142, + "step": 16265 + }, + { + "epoch": 0.09673850984870111, + "grad_norm": 2.0519683361053467, + "learning_rate": 4.885446003052343e-05, + "loss": 5.4321, + "step": 16266 + }, + { + "epoch": 0.09674445713198211, + "grad_norm": 2.02132248878479, + "learning_rate": 4.8854320252573325e-05, + "loss": 5.4957, + "step": 16267 + }, + { + "epoch": 0.09675040441526311, + "grad_norm": 1.7282330989837646, + "learning_rate": 4.885418046629594e-05, + "loss": 5.4486, + "step": 16268 + }, + { + "epoch": 0.0967563516985441, + "grad_norm": 1.909114122390747, + "learning_rate": 4.885404067169129e-05, + "loss": 5.4782, + "step": 16269 + }, + { + "epoch": 0.0967622989818251, + "grad_norm": 1.897161602973938, + "learning_rate": 4.885390086875945e-05, + "loss": 5.8678, + "step": 16270 + }, + { + "epoch": 0.0967682462651061, + "grad_norm": 2.0866503715515137, + "learning_rate": 4.885376105750046e-05, + "loss": 5.0869, + "step": 16271 + }, + { + "epoch": 0.0967741935483871, + "grad_norm": 1.6914600133895874, + "learning_rate": 4.885362123791437e-05, + "loss": 5.3385, + "step": 16272 + }, + { + "epoch": 0.0967801408316681, + "grad_norm": 1.4390329122543335, + "learning_rate": 4.885348141000122e-05, + "loss": 5.8069, + "step": 16273 + }, + { + "epoch": 0.0967860881149491, + "grad_norm": 1.5077629089355469, + "learning_rate": 4.885334157376107e-05, + "loss": 5.6679, + "step": 16274 + }, + { + "epoch": 0.09679203539823009, + "grad_norm": 1.4550343751907349, + "learning_rate": 4.885320172919397e-05, + "loss": 5.7548, + "step": 16275 + }, + { + "epoch": 0.09679798268151109, + "grad_norm": 2.068070650100708, + "learning_rate": 4.8853061876299956e-05, + "loss": 4.9706, + "step": 16276 + }, + { + "epoch": 0.09680392996479208, + "grad_norm": 1.3487659692764282, + "learning_rate": 4.885292201507909e-05, + "loss": 5.6918, + "step": 16277 + }, + { + "epoch": 0.09680987724807308, + "grad_norm": 1.4306180477142334, + "learning_rate": 4.885278214553141e-05, + "loss": 5.6196, + "step": 16278 + }, + { + "epoch": 0.09681582453135408, + "grad_norm": 1.6410231590270996, + "learning_rate": 4.885264226765698e-05, + "loss": 5.0523, + "step": 16279 + }, + { + "epoch": 0.09682177181463507, + "grad_norm": 2.4701485633850098, + "learning_rate": 4.8852502381455825e-05, + "loss": 4.6255, + "step": 16280 + }, + { + "epoch": 0.09682771909791607, + "grad_norm": 2.5248069763183594, + "learning_rate": 4.885236248692802e-05, + "loss": 4.5055, + "step": 16281 + }, + { + "epoch": 0.09683366638119707, + "grad_norm": 2.1913154125213623, + "learning_rate": 4.8852222584073595e-05, + "loss": 4.748, + "step": 16282 + }, + { + "epoch": 0.09683961366447806, + "grad_norm": 1.951987385749817, + "learning_rate": 4.8852082672892606e-05, + "loss": 5.3871, + "step": 16283 + }, + { + "epoch": 0.09684556094775906, + "grad_norm": 2.007020950317383, + "learning_rate": 4.885194275338511e-05, + "loss": 6.1075, + "step": 16284 + }, + { + "epoch": 0.09685150823104006, + "grad_norm": 1.9821717739105225, + "learning_rate": 4.885180282555113e-05, + "loss": 5.1719, + "step": 16285 + }, + { + "epoch": 0.09685745551432105, + "grad_norm": 2.339564800262451, + "learning_rate": 4.885166288939074e-05, + "loss": 4.9518, + "step": 16286 + }, + { + "epoch": 0.09686340279760206, + "grad_norm": 2.1785504817962646, + "learning_rate": 4.8851522944903984e-05, + "loss": 4.9656, + "step": 16287 + }, + { + "epoch": 0.09686935008088306, + "grad_norm": 1.7723946571350098, + "learning_rate": 4.885138299209091e-05, + "loss": 6.1572, + "step": 16288 + }, + { + "epoch": 0.09687529736416405, + "grad_norm": 1.702458381652832, + "learning_rate": 4.885124303095156e-05, + "loss": 5.9616, + "step": 16289 + }, + { + "epoch": 0.09688124464744505, + "grad_norm": 2.279836893081665, + "learning_rate": 4.885110306148599e-05, + "loss": 5.4305, + "step": 16290 + }, + { + "epoch": 0.09688719193072605, + "grad_norm": 1.8569501638412476, + "learning_rate": 4.8850963083694244e-05, + "loss": 5.8019, + "step": 16291 + }, + { + "epoch": 0.09689313921400704, + "grad_norm": 1.8126327991485596, + "learning_rate": 4.885082309757637e-05, + "loss": 5.7076, + "step": 16292 + }, + { + "epoch": 0.09689908649728804, + "grad_norm": 1.7170337438583374, + "learning_rate": 4.8850683103132424e-05, + "loss": 5.9862, + "step": 16293 + }, + { + "epoch": 0.09690503378056904, + "grad_norm": 1.7631909847259521, + "learning_rate": 4.8850543100362454e-05, + "loss": 5.917, + "step": 16294 + }, + { + "epoch": 0.09691098106385003, + "grad_norm": 1.9938957691192627, + "learning_rate": 4.88504030892665e-05, + "loss": 5.5773, + "step": 16295 + }, + { + "epoch": 0.09691692834713103, + "grad_norm": 1.9459222555160522, + "learning_rate": 4.8850263069844623e-05, + "loss": 5.2847, + "step": 16296 + }, + { + "epoch": 0.09692287563041203, + "grad_norm": 1.8420277833938599, + "learning_rate": 4.8850123042096865e-05, + "loss": 5.5691, + "step": 16297 + }, + { + "epoch": 0.09692882291369302, + "grad_norm": 2.2592809200286865, + "learning_rate": 4.8849983006023267e-05, + "loss": 5.4666, + "step": 16298 + }, + { + "epoch": 0.09693477019697402, + "grad_norm": 2.080939292907715, + "learning_rate": 4.884984296162389e-05, + "loss": 5.243, + "step": 16299 + }, + { + "epoch": 0.09694071748025503, + "grad_norm": 1.648836374282837, + "learning_rate": 4.884970290889879e-05, + "loss": 5.8331, + "step": 16300 + }, + { + "epoch": 0.09694666476353601, + "grad_norm": 1.668505311012268, + "learning_rate": 4.884956284784799e-05, + "loss": 5.7523, + "step": 16301 + }, + { + "epoch": 0.09695261204681702, + "grad_norm": 1.5473688840866089, + "learning_rate": 4.8849422778471567e-05, + "loss": 5.5379, + "step": 16302 + }, + { + "epoch": 0.09695855933009802, + "grad_norm": 1.9258644580841064, + "learning_rate": 4.8849282700769545e-05, + "loss": 5.6405, + "step": 16303 + }, + { + "epoch": 0.096964506613379, + "grad_norm": 1.5651416778564453, + "learning_rate": 4.884914261474199e-05, + "loss": 6.1487, + "step": 16304 + }, + { + "epoch": 0.09697045389666001, + "grad_norm": 1.5289270877838135, + "learning_rate": 4.884900252038894e-05, + "loss": 5.6653, + "step": 16305 + }, + { + "epoch": 0.096976401179941, + "grad_norm": 1.8394510746002197, + "learning_rate": 4.8848862417710464e-05, + "loss": 4.9243, + "step": 16306 + }, + { + "epoch": 0.096982348463222, + "grad_norm": 1.7624824047088623, + "learning_rate": 4.8848722306706584e-05, + "loss": 5.7712, + "step": 16307 + }, + { + "epoch": 0.096988295746503, + "grad_norm": 1.7294182777404785, + "learning_rate": 4.8848582187377365e-05, + "loss": 5.5197, + "step": 16308 + }, + { + "epoch": 0.09699424302978399, + "grad_norm": 1.69902765750885, + "learning_rate": 4.8848442059722856e-05, + "loss": 5.6485, + "step": 16309 + }, + { + "epoch": 0.09700019031306499, + "grad_norm": 1.7867447137832642, + "learning_rate": 4.88483019237431e-05, + "loss": 5.4422, + "step": 16310 + }, + { + "epoch": 0.09700613759634599, + "grad_norm": 1.6588819026947021, + "learning_rate": 4.884816177943814e-05, + "loss": 5.4282, + "step": 16311 + }, + { + "epoch": 0.09701208487962698, + "grad_norm": 1.504918098449707, + "learning_rate": 4.884802162680804e-05, + "loss": 5.508, + "step": 16312 + }, + { + "epoch": 0.09701803216290798, + "grad_norm": 1.5852895975112915, + "learning_rate": 4.8847881465852846e-05, + "loss": 5.5567, + "step": 16313 + }, + { + "epoch": 0.09702397944618898, + "grad_norm": 1.5719797611236572, + "learning_rate": 4.88477412965726e-05, + "loss": 5.6284, + "step": 16314 + }, + { + "epoch": 0.09702992672946997, + "grad_norm": 1.4208050966262817, + "learning_rate": 4.884760111896735e-05, + "loss": 5.5653, + "step": 16315 + }, + { + "epoch": 0.09703587401275098, + "grad_norm": 1.567555546760559, + "learning_rate": 4.8847460933037156e-05, + "loss": 5.5144, + "step": 16316 + }, + { + "epoch": 0.09704182129603198, + "grad_norm": 1.9179699420928955, + "learning_rate": 4.884732073878205e-05, + "loss": 4.7947, + "step": 16317 + }, + { + "epoch": 0.09704776857931297, + "grad_norm": 2.5346062183380127, + "learning_rate": 4.88471805362021e-05, + "loss": 3.8315, + "step": 16318 + }, + { + "epoch": 0.09705371586259397, + "grad_norm": 2.585686683654785, + "learning_rate": 4.884704032529734e-05, + "loss": 3.7288, + "step": 16319 + }, + { + "epoch": 0.09705966314587497, + "grad_norm": 2.133723020553589, + "learning_rate": 4.8846900106067825e-05, + "loss": 3.6369, + "step": 16320 + }, + { + "epoch": 0.09706561042915596, + "grad_norm": 2.4039080142974854, + "learning_rate": 4.884675987851361e-05, + "loss": 3.9068, + "step": 16321 + }, + { + "epoch": 0.09707155771243696, + "grad_norm": 2.643489360809326, + "learning_rate": 4.884661964263473e-05, + "loss": 3.7793, + "step": 16322 + }, + { + "epoch": 0.09707750499571796, + "grad_norm": 2.485727071762085, + "learning_rate": 4.8846479398431244e-05, + "loss": 4.9789, + "step": 16323 + }, + { + "epoch": 0.09708345227899895, + "grad_norm": 2.8592441082000732, + "learning_rate": 4.8846339145903194e-05, + "loss": 4.0196, + "step": 16324 + }, + { + "epoch": 0.09708939956227995, + "grad_norm": 2.470813035964966, + "learning_rate": 4.884619888505064e-05, + "loss": 5.2308, + "step": 16325 + }, + { + "epoch": 0.09709534684556095, + "grad_norm": 2.3255081176757812, + "learning_rate": 4.884605861587362e-05, + "loss": 5.3535, + "step": 16326 + }, + { + "epoch": 0.09710129412884194, + "grad_norm": 2.1462676525115967, + "learning_rate": 4.8845918338372195e-05, + "loss": 5.2611, + "step": 16327 + }, + { + "epoch": 0.09710724141212294, + "grad_norm": 1.8838989734649658, + "learning_rate": 4.88457780525464e-05, + "loss": 5.8104, + "step": 16328 + }, + { + "epoch": 0.09711318869540395, + "grad_norm": 2.137746572494507, + "learning_rate": 4.884563775839629e-05, + "loss": 5.4702, + "step": 16329 + }, + { + "epoch": 0.09711913597868493, + "grad_norm": 1.8934431076049805, + "learning_rate": 4.884549745592192e-05, + "loss": 4.9703, + "step": 16330 + }, + { + "epoch": 0.09712508326196594, + "grad_norm": 2.409020185470581, + "learning_rate": 4.884535714512333e-05, + "loss": 5.6793, + "step": 16331 + }, + { + "epoch": 0.09713103054524694, + "grad_norm": 2.039520263671875, + "learning_rate": 4.884521682600056e-05, + "loss": 5.7809, + "step": 16332 + }, + { + "epoch": 0.09713697782852793, + "grad_norm": 3.1211516857147217, + "learning_rate": 4.884507649855369e-05, + "loss": 5.6195, + "step": 16333 + }, + { + "epoch": 0.09714292511180893, + "grad_norm": 1.9474505186080933, + "learning_rate": 4.884493616278274e-05, + "loss": 5.3064, + "step": 16334 + }, + { + "epoch": 0.09714887239508992, + "grad_norm": 1.7586307525634766, + "learning_rate": 4.884479581868777e-05, + "loss": 4.9531, + "step": 16335 + }, + { + "epoch": 0.09715481967837092, + "grad_norm": 1.6352753639221191, + "learning_rate": 4.884465546626883e-05, + "loss": 5.304, + "step": 16336 + }, + { + "epoch": 0.09716076696165192, + "grad_norm": 1.681362271308899, + "learning_rate": 4.884451510552597e-05, + "loss": 5.9167, + "step": 16337 + }, + { + "epoch": 0.09716671424493291, + "grad_norm": 1.7970985174179077, + "learning_rate": 4.8844374736459225e-05, + "loss": 6.122, + "step": 16338 + }, + { + "epoch": 0.09717266152821391, + "grad_norm": 1.5312799215316772, + "learning_rate": 4.8844234359068666e-05, + "loss": 4.903, + "step": 16339 + }, + { + "epoch": 0.09717860881149491, + "grad_norm": 1.7024787664413452, + "learning_rate": 4.884409397335432e-05, + "loss": 5.3306, + "step": 16340 + }, + { + "epoch": 0.0971845560947759, + "grad_norm": 3.000169515609741, + "learning_rate": 4.884395357931626e-05, + "loss": 4.9682, + "step": 16341 + }, + { + "epoch": 0.0971905033780569, + "grad_norm": 2.910048484802246, + "learning_rate": 4.884381317695452e-05, + "loss": 5.2385, + "step": 16342 + }, + { + "epoch": 0.0971964506613379, + "grad_norm": 2.1094155311584473, + "learning_rate": 4.8843672766269147e-05, + "loss": 5.1025, + "step": 16343 + }, + { + "epoch": 0.09720239794461889, + "grad_norm": 1.7918319702148438, + "learning_rate": 4.884353234726019e-05, + "loss": 5.2822, + "step": 16344 + }, + { + "epoch": 0.0972083452278999, + "grad_norm": 1.574461579322815, + "learning_rate": 4.884339191992771e-05, + "loss": 5.6254, + "step": 16345 + }, + { + "epoch": 0.0972142925111809, + "grad_norm": 2.0780746936798096, + "learning_rate": 4.884325148427175e-05, + "loss": 5.0641, + "step": 16346 + }, + { + "epoch": 0.09722023979446189, + "grad_norm": 2.30399227142334, + "learning_rate": 4.884311104029235e-05, + "loss": 4.9591, + "step": 16347 + }, + { + "epoch": 0.09722618707774289, + "grad_norm": 2.087993621826172, + "learning_rate": 4.884297058798957e-05, + "loss": 5.0514, + "step": 16348 + }, + { + "epoch": 0.09723213436102389, + "grad_norm": 2.0179786682128906, + "learning_rate": 4.884283012736345e-05, + "loss": 4.9632, + "step": 16349 + }, + { + "epoch": 0.09723808164430488, + "grad_norm": 2.4394171237945557, + "learning_rate": 4.8842689658414054e-05, + "loss": 4.6517, + "step": 16350 + }, + { + "epoch": 0.09724402892758588, + "grad_norm": 2.6895275115966797, + "learning_rate": 4.884254918114142e-05, + "loss": 4.726, + "step": 16351 + }, + { + "epoch": 0.09724997621086688, + "grad_norm": 1.5181125402450562, + "learning_rate": 4.884240869554559e-05, + "loss": 5.679, + "step": 16352 + }, + { + "epoch": 0.09725592349414787, + "grad_norm": 1.758475422859192, + "learning_rate": 4.884226820162662e-05, + "loss": 5.2323, + "step": 16353 + }, + { + "epoch": 0.09726187077742887, + "grad_norm": 2.0166938304901123, + "learning_rate": 4.884212769938457e-05, + "loss": 4.6912, + "step": 16354 + }, + { + "epoch": 0.09726781806070987, + "grad_norm": 2.1366612911224365, + "learning_rate": 4.8841987188819475e-05, + "loss": 4.4761, + "step": 16355 + }, + { + "epoch": 0.09727376534399086, + "grad_norm": 1.9595547914505005, + "learning_rate": 4.884184666993139e-05, + "loss": 4.5343, + "step": 16356 + }, + { + "epoch": 0.09727971262727186, + "grad_norm": 1.896043300628662, + "learning_rate": 4.884170614272037e-05, + "loss": 4.465, + "step": 16357 + }, + { + "epoch": 0.09728565991055287, + "grad_norm": 2.062506675720215, + "learning_rate": 4.884156560718645e-05, + "loss": 4.301, + "step": 16358 + }, + { + "epoch": 0.09729160719383385, + "grad_norm": 2.0816612243652344, + "learning_rate": 4.884142506332968e-05, + "loss": 4.5414, + "step": 16359 + }, + { + "epoch": 0.09729755447711486, + "grad_norm": 2.0095489025115967, + "learning_rate": 4.884128451115012e-05, + "loss": 4.3779, + "step": 16360 + }, + { + "epoch": 0.09730350176039586, + "grad_norm": 2.0766615867614746, + "learning_rate": 4.884114395064781e-05, + "loss": 4.3999, + "step": 16361 + }, + { + "epoch": 0.09730944904367685, + "grad_norm": 2.0266785621643066, + "learning_rate": 4.8841003381822805e-05, + "loss": 4.5122, + "step": 16362 + }, + { + "epoch": 0.09731539632695785, + "grad_norm": 1.9631284475326538, + "learning_rate": 4.884086280467516e-05, + "loss": 4.3061, + "step": 16363 + }, + { + "epoch": 0.09732134361023884, + "grad_norm": 2.2965009212493896, + "learning_rate": 4.8840722219204905e-05, + "loss": 4.3387, + "step": 16364 + }, + { + "epoch": 0.09732729089351984, + "grad_norm": 2.036365509033203, + "learning_rate": 4.8840581625412105e-05, + "loss": 4.3242, + "step": 16365 + }, + { + "epoch": 0.09733323817680084, + "grad_norm": 2.186131477355957, + "learning_rate": 4.88404410232968e-05, + "loss": 4.2517, + "step": 16366 + }, + { + "epoch": 0.09733918546008183, + "grad_norm": 2.2000489234924316, + "learning_rate": 4.884030041285905e-05, + "loss": 4.274, + "step": 16367 + }, + { + "epoch": 0.09734513274336283, + "grad_norm": 3.2708849906921387, + "learning_rate": 4.884015979409889e-05, + "loss": 4.9575, + "step": 16368 + }, + { + "epoch": 0.09735108002664383, + "grad_norm": 1.7634176015853882, + "learning_rate": 4.884001916701639e-05, + "loss": 4.63, + "step": 16369 + }, + { + "epoch": 0.09735702730992482, + "grad_norm": 2.297611713409424, + "learning_rate": 4.883987853161157e-05, + "loss": 4.3009, + "step": 16370 + }, + { + "epoch": 0.09736297459320582, + "grad_norm": 2.1840944290161133, + "learning_rate": 4.8839737887884507e-05, + "loss": 4.2232, + "step": 16371 + }, + { + "epoch": 0.09736892187648682, + "grad_norm": 2.1925270557403564, + "learning_rate": 4.8839597235835234e-05, + "loss": 4.1824, + "step": 16372 + }, + { + "epoch": 0.09737486915976781, + "grad_norm": 2.175720453262329, + "learning_rate": 4.88394565754638e-05, + "loss": 4.2619, + "step": 16373 + }, + { + "epoch": 0.09738081644304881, + "grad_norm": 2.282804489135742, + "learning_rate": 4.883931590677026e-05, + "loss": 4.2207, + "step": 16374 + }, + { + "epoch": 0.09738676372632982, + "grad_norm": 1.674668788909912, + "learning_rate": 4.883917522975466e-05, + "loss": 5.3627, + "step": 16375 + }, + { + "epoch": 0.0973927110096108, + "grad_norm": 1.6538902521133423, + "learning_rate": 4.883903454441705e-05, + "loss": 5.302, + "step": 16376 + }, + { + "epoch": 0.09739865829289181, + "grad_norm": 1.4267115592956543, + "learning_rate": 4.8838893850757485e-05, + "loss": 5.2545, + "step": 16377 + }, + { + "epoch": 0.09740460557617281, + "grad_norm": 1.3086082935333252, + "learning_rate": 4.8838753148776e-05, + "loss": 5.1538, + "step": 16378 + }, + { + "epoch": 0.0974105528594538, + "grad_norm": 1.4384034872055054, + "learning_rate": 4.883861243847266e-05, + "loss": 5.3925, + "step": 16379 + }, + { + "epoch": 0.0974165001427348, + "grad_norm": 1.4971977472305298, + "learning_rate": 4.88384717198475e-05, + "loss": 5.3966, + "step": 16380 + }, + { + "epoch": 0.0974224474260158, + "grad_norm": 1.517468810081482, + "learning_rate": 4.8838330992900584e-05, + "loss": 5.1097, + "step": 16381 + }, + { + "epoch": 0.09742839470929679, + "grad_norm": 1.388852596282959, + "learning_rate": 4.8838190257631944e-05, + "loss": 5.1066, + "step": 16382 + }, + { + "epoch": 0.09743434199257779, + "grad_norm": 1.2972341775894165, + "learning_rate": 4.8838049514041646e-05, + "loss": 5.0383, + "step": 16383 + }, + { + "epoch": 0.0974402892758588, + "grad_norm": 1.338291049003601, + "learning_rate": 4.883790876212972e-05, + "loss": 5.1339, + "step": 16384 + }, + { + "epoch": 0.09744623655913978, + "grad_norm": 1.4399670362472534, + "learning_rate": 4.883776800189624e-05, + "loss": 5.0542, + "step": 16385 + }, + { + "epoch": 0.09745218384242078, + "grad_norm": 1.5091251134872437, + "learning_rate": 4.8837627233341235e-05, + "loss": 4.9303, + "step": 16386 + }, + { + "epoch": 0.09745813112570179, + "grad_norm": 1.4728022813796997, + "learning_rate": 4.8837486456464764e-05, + "loss": 5.0902, + "step": 16387 + }, + { + "epoch": 0.09746407840898277, + "grad_norm": 1.454509973526001, + "learning_rate": 4.8837345671266865e-05, + "loss": 4.9227, + "step": 16388 + }, + { + "epoch": 0.09747002569226378, + "grad_norm": 1.431118130683899, + "learning_rate": 4.88372048777476e-05, + "loss": 5.0128, + "step": 16389 + }, + { + "epoch": 0.09747597297554478, + "grad_norm": 1.434967041015625, + "learning_rate": 4.8837064075907015e-05, + "loss": 5.1793, + "step": 16390 + }, + { + "epoch": 0.09748192025882577, + "grad_norm": 1.5077275037765503, + "learning_rate": 4.883692326574515e-05, + "loss": 5.1573, + "step": 16391 + }, + { + "epoch": 0.09748786754210677, + "grad_norm": 1.44413161277771, + "learning_rate": 4.883678244726208e-05, + "loss": 5.2297, + "step": 16392 + }, + { + "epoch": 0.09749381482538776, + "grad_norm": 1.606898546218872, + "learning_rate": 4.883664162045781e-05, + "loss": 4.9409, + "step": 16393 + }, + { + "epoch": 0.09749976210866876, + "grad_norm": 1.649034857749939, + "learning_rate": 4.883650078533243e-05, + "loss": 5.1519, + "step": 16394 + }, + { + "epoch": 0.09750570939194976, + "grad_norm": 1.5309730768203735, + "learning_rate": 4.883635994188597e-05, + "loss": 4.9568, + "step": 16395 + }, + { + "epoch": 0.09751165667523075, + "grad_norm": 1.8033829927444458, + "learning_rate": 4.883621909011848e-05, + "loss": 4.7442, + "step": 16396 + }, + { + "epoch": 0.09751760395851175, + "grad_norm": 1.653501272201538, + "learning_rate": 4.8836078230030016e-05, + "loss": 4.5672, + "step": 16397 + }, + { + "epoch": 0.09752355124179275, + "grad_norm": 1.686077356338501, + "learning_rate": 4.8835937361620624e-05, + "loss": 4.5819, + "step": 16398 + }, + { + "epoch": 0.09752949852507374, + "grad_norm": 1.5233088731765747, + "learning_rate": 4.883579648489035e-05, + "loss": 4.5191, + "step": 16399 + }, + { + "epoch": 0.09753544580835474, + "grad_norm": 1.6472907066345215, + "learning_rate": 4.883565559983925e-05, + "loss": 4.6418, + "step": 16400 + }, + { + "epoch": 0.09754139309163574, + "grad_norm": 1.817649483680725, + "learning_rate": 4.8835514706467364e-05, + "loss": 4.806, + "step": 16401 + }, + { + "epoch": 0.09754734037491673, + "grad_norm": 1.8404059410095215, + "learning_rate": 4.8835373804774754e-05, + "loss": 4.8169, + "step": 16402 + }, + { + "epoch": 0.09755328765819773, + "grad_norm": 1.5510175228118896, + "learning_rate": 4.883523289476145e-05, + "loss": 4.7987, + "step": 16403 + }, + { + "epoch": 0.09755923494147874, + "grad_norm": 1.4557734727859497, + "learning_rate": 4.8835091976427514e-05, + "loss": 4.7322, + "step": 16404 + }, + { + "epoch": 0.09756518222475973, + "grad_norm": 1.528123140335083, + "learning_rate": 4.8834951049773006e-05, + "loss": 4.7376, + "step": 16405 + }, + { + "epoch": 0.09757112950804073, + "grad_norm": 1.6215547323226929, + "learning_rate": 4.8834810114797944e-05, + "loss": 4.7679, + "step": 16406 + }, + { + "epoch": 0.09757707679132173, + "grad_norm": 1.4554566144943237, + "learning_rate": 4.883466917150241e-05, + "loss": 4.6452, + "step": 16407 + }, + { + "epoch": 0.09758302407460272, + "grad_norm": 1.5100599527359009, + "learning_rate": 4.883452821988644e-05, + "loss": 4.6957, + "step": 16408 + }, + { + "epoch": 0.09758897135788372, + "grad_norm": 1.7057833671569824, + "learning_rate": 4.8834387259950074e-05, + "loss": 4.7888, + "step": 16409 + }, + { + "epoch": 0.09759491864116472, + "grad_norm": 1.4016892910003662, + "learning_rate": 4.883424629169337e-05, + "loss": 4.769, + "step": 16410 + }, + { + "epoch": 0.09760086592444571, + "grad_norm": 1.5257891416549683, + "learning_rate": 4.883410531511638e-05, + "loss": 4.7443, + "step": 16411 + }, + { + "epoch": 0.09760681320772671, + "grad_norm": 1.3904502391815186, + "learning_rate": 4.883396433021916e-05, + "loss": 4.786, + "step": 16412 + }, + { + "epoch": 0.09761276049100771, + "grad_norm": 1.6081106662750244, + "learning_rate": 4.883382333700174e-05, + "loss": 4.5321, + "step": 16413 + }, + { + "epoch": 0.0976187077742887, + "grad_norm": 1.4291402101516724, + "learning_rate": 4.883368233546417e-05, + "loss": 4.5898, + "step": 16414 + }, + { + "epoch": 0.0976246550575697, + "grad_norm": 1.5700920820236206, + "learning_rate": 4.8833541325606524e-05, + "loss": 5.2177, + "step": 16415 + }, + { + "epoch": 0.0976306023408507, + "grad_norm": 1.5503007173538208, + "learning_rate": 4.8833400307428825e-05, + "loss": 5.3911, + "step": 16416 + }, + { + "epoch": 0.0976365496241317, + "grad_norm": 1.5890953540802002, + "learning_rate": 4.8833259280931135e-05, + "loss": 4.9426, + "step": 16417 + }, + { + "epoch": 0.0976424969074127, + "grad_norm": 1.5032304525375366, + "learning_rate": 4.8833118246113494e-05, + "loss": 4.6124, + "step": 16418 + }, + { + "epoch": 0.0976484441906937, + "grad_norm": 1.5300242900848389, + "learning_rate": 4.8832977202975964e-05, + "loss": 4.9323, + "step": 16419 + }, + { + "epoch": 0.09765439147397469, + "grad_norm": 1.7094424962997437, + "learning_rate": 4.883283615151859e-05, + "loss": 5.3205, + "step": 16420 + }, + { + "epoch": 0.09766033875725569, + "grad_norm": 1.8231004476547241, + "learning_rate": 4.883269509174142e-05, + "loss": 5.0414, + "step": 16421 + }, + { + "epoch": 0.09766628604053668, + "grad_norm": 1.7779520750045776, + "learning_rate": 4.8832554023644496e-05, + "loss": 4.9106, + "step": 16422 + }, + { + "epoch": 0.09767223332381768, + "grad_norm": 1.5394103527069092, + "learning_rate": 4.8832412947227875e-05, + "loss": 4.998, + "step": 16423 + }, + { + "epoch": 0.09767818060709868, + "grad_norm": 1.3814078569412231, + "learning_rate": 4.883227186249161e-05, + "loss": 4.9109, + "step": 16424 + }, + { + "epoch": 0.09768412789037967, + "grad_norm": 1.291040301322937, + "learning_rate": 4.8832130769435735e-05, + "loss": 5.3617, + "step": 16425 + }, + { + "epoch": 0.09769007517366067, + "grad_norm": 1.561249017715454, + "learning_rate": 4.883198966806032e-05, + "loss": 5.3041, + "step": 16426 + }, + { + "epoch": 0.09769602245694167, + "grad_norm": 1.7411010265350342, + "learning_rate": 4.883184855836539e-05, + "loss": 5.0816, + "step": 16427 + }, + { + "epoch": 0.09770196974022266, + "grad_norm": 1.6507155895233154, + "learning_rate": 4.8831707440351024e-05, + "loss": 5.1089, + "step": 16428 + }, + { + "epoch": 0.09770791702350366, + "grad_norm": 1.5242364406585693, + "learning_rate": 4.8831566314017254e-05, + "loss": 4.9718, + "step": 16429 + }, + { + "epoch": 0.09771386430678466, + "grad_norm": 2.3768868446350098, + "learning_rate": 4.883142517936412e-05, + "loss": 4.9333, + "step": 16430 + }, + { + "epoch": 0.09771981159006565, + "grad_norm": 1.2830429077148438, + "learning_rate": 4.8831284036391684e-05, + "loss": 4.9238, + "step": 16431 + }, + { + "epoch": 0.09772575887334665, + "grad_norm": 1.5065499544143677, + "learning_rate": 4.883114288509999e-05, + "loss": 5.0151, + "step": 16432 + }, + { + "epoch": 0.09773170615662766, + "grad_norm": 1.5989798307418823, + "learning_rate": 4.88310017254891e-05, + "loss": 5.0081, + "step": 16433 + }, + { + "epoch": 0.09773765343990864, + "grad_norm": 1.391644835472107, + "learning_rate": 4.883086055755905e-05, + "loss": 4.8942, + "step": 16434 + }, + { + "epoch": 0.09774360072318965, + "grad_norm": 1.4952952861785889, + "learning_rate": 4.883071938130989e-05, + "loss": 5.0018, + "step": 16435 + }, + { + "epoch": 0.09774954800647065, + "grad_norm": 1.522814393043518, + "learning_rate": 4.883057819674168e-05, + "loss": 5.2591, + "step": 16436 + }, + { + "epoch": 0.09775549528975164, + "grad_norm": 1.3879649639129639, + "learning_rate": 4.8830437003854454e-05, + "loss": 4.9136, + "step": 16437 + }, + { + "epoch": 0.09776144257303264, + "grad_norm": 1.3485056161880493, + "learning_rate": 4.883029580264827e-05, + "loss": 5.5159, + "step": 16438 + }, + { + "epoch": 0.09776738985631364, + "grad_norm": 1.475131869316101, + "learning_rate": 4.883015459312317e-05, + "loss": 5.4397, + "step": 16439 + }, + { + "epoch": 0.09777333713959463, + "grad_norm": 1.2736895084381104, + "learning_rate": 4.8830013375279215e-05, + "loss": 5.2867, + "step": 16440 + }, + { + "epoch": 0.09777928442287563, + "grad_norm": 1.456312656402588, + "learning_rate": 4.882987214911645e-05, + "loss": 5.3351, + "step": 16441 + }, + { + "epoch": 0.09778523170615663, + "grad_norm": 1.5312397480010986, + "learning_rate": 4.882973091463492e-05, + "loss": 5.3233, + "step": 16442 + }, + { + "epoch": 0.09779117898943762, + "grad_norm": 1.5735961198806763, + "learning_rate": 4.882958967183468e-05, + "loss": 4.9878, + "step": 16443 + }, + { + "epoch": 0.09779712627271862, + "grad_norm": 1.337172508239746, + "learning_rate": 4.882944842071577e-05, + "loss": 5.121, + "step": 16444 + }, + { + "epoch": 0.09780307355599963, + "grad_norm": 1.47593355178833, + "learning_rate": 4.882930716127826e-05, + "loss": 5.4733, + "step": 16445 + }, + { + "epoch": 0.09780902083928061, + "grad_norm": 1.4311164617538452, + "learning_rate": 4.882916589352217e-05, + "loss": 5.2215, + "step": 16446 + }, + { + "epoch": 0.09781496812256162, + "grad_norm": 1.3628556728363037, + "learning_rate": 4.882902461744757e-05, + "loss": 5.3611, + "step": 16447 + }, + { + "epoch": 0.09782091540584262, + "grad_norm": 1.5621687173843384, + "learning_rate": 4.882888333305451e-05, + "loss": 5.4407, + "step": 16448 + }, + { + "epoch": 0.0978268626891236, + "grad_norm": 1.570478081703186, + "learning_rate": 4.8828742040343024e-05, + "loss": 5.533, + "step": 16449 + }, + { + "epoch": 0.09783280997240461, + "grad_norm": 1.3725816011428833, + "learning_rate": 4.8828600739313174e-05, + "loss": 5.1467, + "step": 16450 + }, + { + "epoch": 0.0978387572556856, + "grad_norm": 1.4899497032165527, + "learning_rate": 4.8828459429965e-05, + "loss": 5.233, + "step": 16451 + }, + { + "epoch": 0.0978447045389666, + "grad_norm": 1.380609154701233, + "learning_rate": 4.882831811229857e-05, + "loss": 5.1484, + "step": 16452 + }, + { + "epoch": 0.0978506518222476, + "grad_norm": 1.2167932987213135, + "learning_rate": 4.882817678631391e-05, + "loss": 5.1687, + "step": 16453 + }, + { + "epoch": 0.09785659910552859, + "grad_norm": 1.5250643491744995, + "learning_rate": 4.882803545201108e-05, + "loss": 5.2395, + "step": 16454 + }, + { + "epoch": 0.09786254638880959, + "grad_norm": 1.4288511276245117, + "learning_rate": 4.882789410939013e-05, + "loss": 5.0532, + "step": 16455 + }, + { + "epoch": 0.09786849367209059, + "grad_norm": 1.6325379610061646, + "learning_rate": 4.8827752758451105e-05, + "loss": 5.2077, + "step": 16456 + }, + { + "epoch": 0.09787444095537158, + "grad_norm": 1.4227756261825562, + "learning_rate": 4.882761139919406e-05, + "loss": 5.0431, + "step": 16457 + }, + { + "epoch": 0.09788038823865258, + "grad_norm": 1.355039358139038, + "learning_rate": 4.8827470031619046e-05, + "loss": 4.9062, + "step": 16458 + }, + { + "epoch": 0.09788633552193358, + "grad_norm": 1.5071823596954346, + "learning_rate": 4.8827328655726113e-05, + "loss": 5.2632, + "step": 16459 + }, + { + "epoch": 0.09789228280521457, + "grad_norm": 1.411828637123108, + "learning_rate": 4.88271872715153e-05, + "loss": 5.343, + "step": 16460 + }, + { + "epoch": 0.09789823008849557, + "grad_norm": 1.419164776802063, + "learning_rate": 4.882704587898666e-05, + "loss": 5.1643, + "step": 16461 + }, + { + "epoch": 0.09790417737177658, + "grad_norm": 1.4997645616531372, + "learning_rate": 4.882690447814024e-05, + "loss": 5.1701, + "step": 16462 + }, + { + "epoch": 0.09791012465505756, + "grad_norm": 1.4251139163970947, + "learning_rate": 4.88267630689761e-05, + "loss": 5.0228, + "step": 16463 + }, + { + "epoch": 0.09791607193833857, + "grad_norm": 1.289102554321289, + "learning_rate": 4.882662165149429e-05, + "loss": 5.1934, + "step": 16464 + }, + { + "epoch": 0.09792201922161957, + "grad_norm": 1.1589713096618652, + "learning_rate": 4.882648022569484e-05, + "loss": 5.3388, + "step": 16465 + }, + { + "epoch": 0.09792796650490056, + "grad_norm": 1.1682082414627075, + "learning_rate": 4.8826338791577816e-05, + "loss": 5.2062, + "step": 16466 + }, + { + "epoch": 0.09793391378818156, + "grad_norm": 1.2263107299804688, + "learning_rate": 4.882619734914326e-05, + "loss": 5.414, + "step": 16467 + }, + { + "epoch": 0.09793986107146256, + "grad_norm": 1.2873631715774536, + "learning_rate": 4.882605589839123e-05, + "loss": 5.4286, + "step": 16468 + }, + { + "epoch": 0.09794580835474355, + "grad_norm": 1.2950979471206665, + "learning_rate": 4.882591443932177e-05, + "loss": 5.1603, + "step": 16469 + }, + { + "epoch": 0.09795175563802455, + "grad_norm": 1.5623066425323486, + "learning_rate": 4.882577297193493e-05, + "loss": 5.0778, + "step": 16470 + }, + { + "epoch": 0.09795770292130555, + "grad_norm": 1.5446339845657349, + "learning_rate": 4.882563149623076e-05, + "loss": 5.1451, + "step": 16471 + }, + { + "epoch": 0.09796365020458654, + "grad_norm": 1.599387526512146, + "learning_rate": 4.882549001220931e-05, + "loss": 5.4596, + "step": 16472 + }, + { + "epoch": 0.09796959748786754, + "grad_norm": 1.325596809387207, + "learning_rate": 4.882534851987062e-05, + "loss": 5.4639, + "step": 16473 + }, + { + "epoch": 0.09797554477114855, + "grad_norm": 1.3077852725982666, + "learning_rate": 4.8825207019214746e-05, + "loss": 5.3654, + "step": 16474 + }, + { + "epoch": 0.09798149205442953, + "grad_norm": 1.5500328540802002, + "learning_rate": 4.882506551024174e-05, + "loss": 4.946, + "step": 16475 + }, + { + "epoch": 0.09798743933771054, + "grad_norm": 1.6101415157318115, + "learning_rate": 4.8824923992951656e-05, + "loss": 4.9618, + "step": 16476 + }, + { + "epoch": 0.09799338662099154, + "grad_norm": 1.542837381362915, + "learning_rate": 4.882478246734453e-05, + "loss": 4.9959, + "step": 16477 + }, + { + "epoch": 0.09799933390427253, + "grad_norm": 1.5618165731430054, + "learning_rate": 4.8824640933420424e-05, + "loss": 5.1221, + "step": 16478 + }, + { + "epoch": 0.09800528118755353, + "grad_norm": 1.4425160884857178, + "learning_rate": 4.882449939117938e-05, + "loss": 5.1689, + "step": 16479 + }, + { + "epoch": 0.09801122847083452, + "grad_norm": 1.3621004819869995, + "learning_rate": 4.8824357840621445e-05, + "loss": 4.9975, + "step": 16480 + }, + { + "epoch": 0.09801717575411552, + "grad_norm": 1.5944523811340332, + "learning_rate": 4.882421628174668e-05, + "loss": 5.0296, + "step": 16481 + }, + { + "epoch": 0.09802312303739652, + "grad_norm": 1.391321063041687, + "learning_rate": 4.8824074714555125e-05, + "loss": 5.0139, + "step": 16482 + }, + { + "epoch": 0.09802907032067751, + "grad_norm": 1.2085964679718018, + "learning_rate": 4.882393313904683e-05, + "loss": 5.1125, + "step": 16483 + }, + { + "epoch": 0.09803501760395851, + "grad_norm": 1.391383409500122, + "learning_rate": 4.882379155522185e-05, + "loss": 5.2999, + "step": 16484 + }, + { + "epoch": 0.09804096488723951, + "grad_norm": 1.3748564720153809, + "learning_rate": 4.882364996308023e-05, + "loss": 5.3096, + "step": 16485 + }, + { + "epoch": 0.0980469121705205, + "grad_norm": 1.825728416442871, + "learning_rate": 4.8823508362622014e-05, + "loss": 5.3318, + "step": 16486 + }, + { + "epoch": 0.0980528594538015, + "grad_norm": 1.6402180194854736, + "learning_rate": 4.882336675384726e-05, + "loss": 5.155, + "step": 16487 + }, + { + "epoch": 0.0980588067370825, + "grad_norm": 1.343284249305725, + "learning_rate": 4.882322513675601e-05, + "loss": 4.9341, + "step": 16488 + }, + { + "epoch": 0.09806475402036349, + "grad_norm": 1.3958711624145508, + "learning_rate": 4.882308351134833e-05, + "loss": 4.9595, + "step": 16489 + }, + { + "epoch": 0.0980707013036445, + "grad_norm": 1.572996735572815, + "learning_rate": 4.882294187762425e-05, + "loss": 4.9666, + "step": 16490 + }, + { + "epoch": 0.0980766485869255, + "grad_norm": 1.6167391538619995, + "learning_rate": 4.882280023558383e-05, + "loss": 4.7387, + "step": 16491 + }, + { + "epoch": 0.09808259587020648, + "grad_norm": 2.474092483520508, + "learning_rate": 4.882265858522711e-05, + "loss": 5.1476, + "step": 16492 + }, + { + "epoch": 0.09808854315348749, + "grad_norm": 1.5375875234603882, + "learning_rate": 4.8822516926554155e-05, + "loss": 4.5832, + "step": 16493 + }, + { + "epoch": 0.09809449043676849, + "grad_norm": 1.6802133321762085, + "learning_rate": 4.8822375259565e-05, + "loss": 4.615, + "step": 16494 + }, + { + "epoch": 0.09810043772004948, + "grad_norm": 1.6709486246109009, + "learning_rate": 4.8822233584259703e-05, + "loss": 4.6586, + "step": 16495 + }, + { + "epoch": 0.09810638500333048, + "grad_norm": 1.5207875967025757, + "learning_rate": 4.882209190063831e-05, + "loss": 4.6748, + "step": 16496 + }, + { + "epoch": 0.09811233228661148, + "grad_norm": 1.4980802536010742, + "learning_rate": 4.882195020870087e-05, + "loss": 4.5326, + "step": 16497 + }, + { + "epoch": 0.09811827956989247, + "grad_norm": 1.473092794418335, + "learning_rate": 4.882180850844743e-05, + "loss": 4.6126, + "step": 16498 + }, + { + "epoch": 0.09812422685317347, + "grad_norm": 1.521147608757019, + "learning_rate": 4.8821666799878055e-05, + "loss": 4.6269, + "step": 16499 + }, + { + "epoch": 0.09813017413645447, + "grad_norm": 1.7371230125427246, + "learning_rate": 4.882152508299277e-05, + "loss": 4.6847, + "step": 16500 + }, + { + "epoch": 0.09813612141973546, + "grad_norm": 1.7222683429718018, + "learning_rate": 4.8821383357791636e-05, + "loss": 5.3943, + "step": 16501 + }, + { + "epoch": 0.09814206870301646, + "grad_norm": 1.523373007774353, + "learning_rate": 4.8821241624274705e-05, + "loss": 5.2822, + "step": 16502 + }, + { + "epoch": 0.09814801598629747, + "grad_norm": 1.365224838256836, + "learning_rate": 4.882109988244203e-05, + "loss": 5.1923, + "step": 16503 + }, + { + "epoch": 0.09815396326957845, + "grad_norm": 1.503907322883606, + "learning_rate": 4.882095813229365e-05, + "loss": 5.128, + "step": 16504 + }, + { + "epoch": 0.09815991055285946, + "grad_norm": 1.5996166467666626, + "learning_rate": 4.8820816373829625e-05, + "loss": 4.9296, + "step": 16505 + }, + { + "epoch": 0.09816585783614046, + "grad_norm": 1.373089075088501, + "learning_rate": 4.8820674607049994e-05, + "loss": 5.0614, + "step": 16506 + }, + { + "epoch": 0.09817180511942145, + "grad_norm": 1.3730735778808594, + "learning_rate": 4.882053283195481e-05, + "loss": 5.0374, + "step": 16507 + }, + { + "epoch": 0.09817775240270245, + "grad_norm": 1.2357912063598633, + "learning_rate": 4.882039104854413e-05, + "loss": 5.1513, + "step": 16508 + }, + { + "epoch": 0.09818369968598344, + "grad_norm": 1.402327299118042, + "learning_rate": 4.8820249256817995e-05, + "loss": 5.7344, + "step": 16509 + }, + { + "epoch": 0.09818964696926444, + "grad_norm": 1.3152369260787964, + "learning_rate": 4.882010745677645e-05, + "loss": 5.6755, + "step": 16510 + }, + { + "epoch": 0.09819559425254544, + "grad_norm": 1.409428358078003, + "learning_rate": 4.8819965648419565e-05, + "loss": 5.3562, + "step": 16511 + }, + { + "epoch": 0.09820154153582643, + "grad_norm": 1.3278082609176636, + "learning_rate": 4.881982383174737e-05, + "loss": 5.2401, + "step": 16512 + }, + { + "epoch": 0.09820748881910743, + "grad_norm": 1.287716269493103, + "learning_rate": 4.881968200675991e-05, + "loss": 4.9961, + "step": 16513 + }, + { + "epoch": 0.09821343610238843, + "grad_norm": 1.3444676399230957, + "learning_rate": 4.881954017345727e-05, + "loss": 5.5592, + "step": 16514 + }, + { + "epoch": 0.09821938338566942, + "grad_norm": 1.4815365076065063, + "learning_rate": 4.881939833183945e-05, + "loss": 5.5342, + "step": 16515 + }, + { + "epoch": 0.09822533066895042, + "grad_norm": 1.210050344467163, + "learning_rate": 4.8819256481906536e-05, + "loss": 5.5375, + "step": 16516 + }, + { + "epoch": 0.09823127795223142, + "grad_norm": 2.041801691055298, + "learning_rate": 4.881911462365857e-05, + "loss": 4.601, + "step": 16517 + }, + { + "epoch": 0.09823722523551241, + "grad_norm": 2.196315050125122, + "learning_rate": 4.881897275709558e-05, + "loss": 4.2376, + "step": 16518 + }, + { + "epoch": 0.09824317251879341, + "grad_norm": 2.1649539470672607, + "learning_rate": 4.881883088221765e-05, + "loss": 4.4159, + "step": 16519 + }, + { + "epoch": 0.09824911980207442, + "grad_norm": 2.02476167678833, + "learning_rate": 4.881868899902481e-05, + "loss": 4.4091, + "step": 16520 + }, + { + "epoch": 0.0982550670853554, + "grad_norm": 1.9262346029281616, + "learning_rate": 4.88185471075171e-05, + "loss": 4.4326, + "step": 16521 + }, + { + "epoch": 0.0982610143686364, + "grad_norm": 1.8461369276046753, + "learning_rate": 4.881840520769459e-05, + "loss": 4.1563, + "step": 16522 + }, + { + "epoch": 0.09826696165191741, + "grad_norm": 1.8261640071868896, + "learning_rate": 4.881826329955732e-05, + "loss": 4.3518, + "step": 16523 + }, + { + "epoch": 0.0982729089351984, + "grad_norm": 2.1533737182617188, + "learning_rate": 4.881812138310534e-05, + "loss": 4.292, + "step": 16524 + }, + { + "epoch": 0.0982788562184794, + "grad_norm": 2.11578369140625, + "learning_rate": 4.8817979458338705e-05, + "loss": 4.5411, + "step": 16525 + }, + { + "epoch": 0.0982848035017604, + "grad_norm": 1.8681827783584595, + "learning_rate": 4.881783752525745e-05, + "loss": 5.7264, + "step": 16526 + }, + { + "epoch": 0.09829075078504139, + "grad_norm": 1.98794424533844, + "learning_rate": 4.881769558386163e-05, + "loss": 5.4694, + "step": 16527 + }, + { + "epoch": 0.09829669806832239, + "grad_norm": 2.6389517784118652, + "learning_rate": 4.881755363415131e-05, + "loss": 5.0086, + "step": 16528 + }, + { + "epoch": 0.0983026453516034, + "grad_norm": 2.2565221786499023, + "learning_rate": 4.881741167612653e-05, + "loss": 4.9219, + "step": 16529 + }, + { + "epoch": 0.09830859263488438, + "grad_norm": 1.8296940326690674, + "learning_rate": 4.881726970978733e-05, + "loss": 4.9185, + "step": 16530 + }, + { + "epoch": 0.09831453991816538, + "grad_norm": 2.031334638595581, + "learning_rate": 4.8817127735133774e-05, + "loss": 4.8589, + "step": 16531 + }, + { + "epoch": 0.09832048720144639, + "grad_norm": 1.5883747339248657, + "learning_rate": 4.8816985752165904e-05, + "loss": 5.2695, + "step": 16532 + }, + { + "epoch": 0.09832643448472737, + "grad_norm": 1.4946906566619873, + "learning_rate": 4.8816843760883755e-05, + "loss": 5.6835, + "step": 16533 + }, + { + "epoch": 0.09833238176800838, + "grad_norm": 1.7901808023452759, + "learning_rate": 4.881670176128741e-05, + "loss": 6.1753, + "step": 16534 + }, + { + "epoch": 0.09833832905128938, + "grad_norm": 1.7249737977981567, + "learning_rate": 4.881655975337689e-05, + "loss": 5.86, + "step": 16535 + }, + { + "epoch": 0.09834427633457037, + "grad_norm": 1.8257695436477661, + "learning_rate": 4.8816417737152264e-05, + "loss": 5.1969, + "step": 16536 + }, + { + "epoch": 0.09835022361785137, + "grad_norm": 1.3712751865386963, + "learning_rate": 4.881627571261357e-05, + "loss": 5.7666, + "step": 16537 + }, + { + "epoch": 0.09835617090113236, + "grad_norm": 1.8865090608596802, + "learning_rate": 4.881613367976086e-05, + "loss": 4.8832, + "step": 16538 + }, + { + "epoch": 0.09836211818441336, + "grad_norm": 1.7155808210372925, + "learning_rate": 4.8815991638594175e-05, + "loss": 4.7248, + "step": 16539 + }, + { + "epoch": 0.09836806546769436, + "grad_norm": 1.6654868125915527, + "learning_rate": 4.8815849589113585e-05, + "loss": 4.7095, + "step": 16540 + }, + { + "epoch": 0.09837401275097535, + "grad_norm": 1.6152902841567993, + "learning_rate": 4.881570753131912e-05, + "loss": 5.2894, + "step": 16541 + }, + { + "epoch": 0.09837996003425635, + "grad_norm": 2.1657047271728516, + "learning_rate": 4.8815565465210835e-05, + "loss": 5.9782, + "step": 16542 + }, + { + "epoch": 0.09838590731753735, + "grad_norm": 1.801346778869629, + "learning_rate": 4.88154233907888e-05, + "loss": 5.6683, + "step": 16543 + }, + { + "epoch": 0.09839185460081834, + "grad_norm": 1.7916477918624878, + "learning_rate": 4.881528130805303e-05, + "loss": 5.7056, + "step": 16544 + }, + { + "epoch": 0.09839780188409934, + "grad_norm": 2.1006147861480713, + "learning_rate": 4.881513921700359e-05, + "loss": 5.6315, + "step": 16545 + }, + { + "epoch": 0.09840374916738034, + "grad_norm": 2.3291585445404053, + "learning_rate": 4.8814997117640535e-05, + "loss": 4.8996, + "step": 16546 + }, + { + "epoch": 0.09840969645066133, + "grad_norm": 1.9543695449829102, + "learning_rate": 4.8814855009963916e-05, + "loss": 5.1839, + "step": 16547 + }, + { + "epoch": 0.09841564373394233, + "grad_norm": 2.7100865840911865, + "learning_rate": 4.881471289397378e-05, + "loss": 5.1445, + "step": 16548 + }, + { + "epoch": 0.09842159101722334, + "grad_norm": 2.5749876499176025, + "learning_rate": 4.8814570769670165e-05, + "loss": 5.2023, + "step": 16549 + }, + { + "epoch": 0.09842753830050432, + "grad_norm": 2.079770088195801, + "learning_rate": 4.881442863705313e-05, + "loss": 5.1197, + "step": 16550 + }, + { + "epoch": 0.09843348558378533, + "grad_norm": 1.9495431184768677, + "learning_rate": 4.881428649612272e-05, + "loss": 4.8669, + "step": 16551 + }, + { + "epoch": 0.09843943286706633, + "grad_norm": 2.0918610095977783, + "learning_rate": 4.8814144346879e-05, + "loss": 5.0413, + "step": 16552 + }, + { + "epoch": 0.09844538015034732, + "grad_norm": 2.326662302017212, + "learning_rate": 4.8814002189322e-05, + "loss": 5.0085, + "step": 16553 + }, + { + "epoch": 0.09845132743362832, + "grad_norm": 2.3819150924682617, + "learning_rate": 4.881386002345178e-05, + "loss": 4.8364, + "step": 16554 + }, + { + "epoch": 0.09845727471690932, + "grad_norm": 2.6585230827331543, + "learning_rate": 4.881371784926839e-05, + "loss": 5.1722, + "step": 16555 + }, + { + "epoch": 0.09846322200019031, + "grad_norm": 2.209075689315796, + "learning_rate": 4.881357566677187e-05, + "loss": 5.0474, + "step": 16556 + }, + { + "epoch": 0.09846916928347131, + "grad_norm": 1.9725440740585327, + "learning_rate": 4.881343347596229e-05, + "loss": 5.0361, + "step": 16557 + }, + { + "epoch": 0.09847511656675231, + "grad_norm": 2.0074071884155273, + "learning_rate": 4.881329127683968e-05, + "loss": 5.5143, + "step": 16558 + }, + { + "epoch": 0.0984810638500333, + "grad_norm": 1.8329545259475708, + "learning_rate": 4.8813149069404093e-05, + "loss": 5.8843, + "step": 16559 + }, + { + "epoch": 0.0984870111333143, + "grad_norm": 2.2991678714752197, + "learning_rate": 4.881300685365558e-05, + "loss": 4.6178, + "step": 16560 + }, + { + "epoch": 0.0984929584165953, + "grad_norm": 2.7643637657165527, + "learning_rate": 4.881286462959419e-05, + "loss": 4.1381, + "step": 16561 + }, + { + "epoch": 0.0984989056998763, + "grad_norm": 2.5811941623687744, + "learning_rate": 4.8812722397219985e-05, + "loss": 3.8026, + "step": 16562 + }, + { + "epoch": 0.0985048529831573, + "grad_norm": 2.1111907958984375, + "learning_rate": 4.8812580156533e-05, + "loss": 4.0149, + "step": 16563 + }, + { + "epoch": 0.0985108002664383, + "grad_norm": 2.229973793029785, + "learning_rate": 4.8812437907533294e-05, + "loss": 4.24, + "step": 16564 + }, + { + "epoch": 0.09851674754971929, + "grad_norm": 1.6310914754867554, + "learning_rate": 4.8812295650220905e-05, + "loss": 5.9476, + "step": 16565 + }, + { + "epoch": 0.09852269483300029, + "grad_norm": 1.7397875785827637, + "learning_rate": 4.881215338459589e-05, + "loss": 5.8527, + "step": 16566 + }, + { + "epoch": 0.09852864211628128, + "grad_norm": 1.8279019594192505, + "learning_rate": 4.88120111106583e-05, + "loss": 5.5869, + "step": 16567 + }, + { + "epoch": 0.09853458939956228, + "grad_norm": 1.6956331729888916, + "learning_rate": 4.881186882840818e-05, + "loss": 5.6508, + "step": 16568 + }, + { + "epoch": 0.09854053668284328, + "grad_norm": 1.619205355644226, + "learning_rate": 4.881172653784559e-05, + "loss": 5.6502, + "step": 16569 + }, + { + "epoch": 0.09854648396612427, + "grad_norm": 1.4612733125686646, + "learning_rate": 4.881158423897057e-05, + "loss": 5.5937, + "step": 16570 + }, + { + "epoch": 0.09855243124940527, + "grad_norm": 1.4997358322143555, + "learning_rate": 4.8811441931783165e-05, + "loss": 5.5865, + "step": 16571 + }, + { + "epoch": 0.09855837853268627, + "grad_norm": 1.6516716480255127, + "learning_rate": 4.8811299616283434e-05, + "loss": 5.4031, + "step": 16572 + }, + { + "epoch": 0.09856432581596726, + "grad_norm": 1.5714633464813232, + "learning_rate": 4.881115729247143e-05, + "loss": 5.4543, + "step": 16573 + }, + { + "epoch": 0.09857027309924826, + "grad_norm": 1.4891443252563477, + "learning_rate": 4.881101496034719e-05, + "loss": 5.5687, + "step": 16574 + }, + { + "epoch": 0.09857622038252926, + "grad_norm": 1.3504915237426758, + "learning_rate": 4.8810872619910773e-05, + "loss": 5.5777, + "step": 16575 + }, + { + "epoch": 0.09858216766581025, + "grad_norm": 1.5825836658477783, + "learning_rate": 4.881073027116223e-05, + "loss": 5.547, + "step": 16576 + }, + { + "epoch": 0.09858811494909125, + "grad_norm": 1.4398233890533447, + "learning_rate": 4.8810587914101607e-05, + "loss": 5.4707, + "step": 16577 + }, + { + "epoch": 0.09859406223237226, + "grad_norm": 1.6776020526885986, + "learning_rate": 4.881044554872895e-05, + "loss": 5.4879, + "step": 16578 + }, + { + "epoch": 0.09860000951565324, + "grad_norm": 1.417771339416504, + "learning_rate": 4.8810303175044316e-05, + "loss": 5.5362, + "step": 16579 + }, + { + "epoch": 0.09860595679893425, + "grad_norm": 1.4919921159744263, + "learning_rate": 4.881016079304775e-05, + "loss": 5.5289, + "step": 16580 + }, + { + "epoch": 0.09861190408221525, + "grad_norm": 1.6195905208587646, + "learning_rate": 4.88100184027393e-05, + "loss": 5.467, + "step": 16581 + }, + { + "epoch": 0.09861785136549624, + "grad_norm": 1.5255846977233887, + "learning_rate": 4.880987600411902e-05, + "loss": 6.268, + "step": 16582 + }, + { + "epoch": 0.09862379864877724, + "grad_norm": 1.5051823854446411, + "learning_rate": 4.880973359718696e-05, + "loss": 6.024, + "step": 16583 + }, + { + "epoch": 0.09862974593205824, + "grad_norm": 2.455932378768921, + "learning_rate": 4.880959118194317e-05, + "loss": 5.0881, + "step": 16584 + }, + { + "epoch": 0.09863569321533923, + "grad_norm": 2.3916566371917725, + "learning_rate": 4.880944875838769e-05, + "loss": 5.0897, + "step": 16585 + }, + { + "epoch": 0.09864164049862023, + "grad_norm": 2.0487334728240967, + "learning_rate": 4.880930632652058e-05, + "loss": 5.603, + "step": 16586 + }, + { + "epoch": 0.09864758778190123, + "grad_norm": 1.9195282459259033, + "learning_rate": 4.880916388634189e-05, + "loss": 5.6492, + "step": 16587 + }, + { + "epoch": 0.09865353506518222, + "grad_norm": 1.743602991104126, + "learning_rate": 4.880902143785166e-05, + "loss": 5.7378, + "step": 16588 + }, + { + "epoch": 0.09865948234846322, + "grad_norm": 1.913156509399414, + "learning_rate": 4.880887898104996e-05, + "loss": 5.6267, + "step": 16589 + }, + { + "epoch": 0.09866542963174423, + "grad_norm": 1.8759669065475464, + "learning_rate": 4.880873651593681e-05, + "loss": 5.5593, + "step": 16590 + }, + { + "epoch": 0.09867137691502521, + "grad_norm": 1.8475536108016968, + "learning_rate": 4.880859404251229e-05, + "loss": 5.5021, + "step": 16591 + }, + { + "epoch": 0.09867732419830622, + "grad_norm": 1.5235642194747925, + "learning_rate": 4.880845156077643e-05, + "loss": 5.4692, + "step": 16592 + }, + { + "epoch": 0.09868327148158722, + "grad_norm": 1.8132069110870361, + "learning_rate": 4.8808309070729294e-05, + "loss": 5.6067, + "step": 16593 + }, + { + "epoch": 0.0986892187648682, + "grad_norm": 1.8001697063446045, + "learning_rate": 4.880816657237091e-05, + "loss": 5.749, + "step": 16594 + }, + { + "epoch": 0.09869516604814921, + "grad_norm": 1.8349007368087769, + "learning_rate": 4.8808024065701354e-05, + "loss": 5.6596, + "step": 16595 + }, + { + "epoch": 0.0987011133314302, + "grad_norm": 1.5677918195724487, + "learning_rate": 4.880788155072065e-05, + "loss": 5.725, + "step": 16596 + }, + { + "epoch": 0.0987070606147112, + "grad_norm": 1.8379719257354736, + "learning_rate": 4.880773902742887e-05, + "loss": 5.4325, + "step": 16597 + }, + { + "epoch": 0.0987130078979922, + "grad_norm": 1.8847566843032837, + "learning_rate": 4.880759649582605e-05, + "loss": 5.5737, + "step": 16598 + }, + { + "epoch": 0.09871895518127319, + "grad_norm": 2.398552417755127, + "learning_rate": 4.8807453955912244e-05, + "loss": 5.4192, + "step": 16599 + }, + { + "epoch": 0.09872490246455419, + "grad_norm": 1.990404486656189, + "learning_rate": 4.8807311407687494e-05, + "loss": 5.4624, + "step": 16600 + }, + { + "epoch": 0.09873084974783519, + "grad_norm": 1.533575177192688, + "learning_rate": 4.880716885115187e-05, + "loss": 5.8242, + "step": 16601 + }, + { + "epoch": 0.09873679703111618, + "grad_norm": 1.7357563972473145, + "learning_rate": 4.88070262863054e-05, + "loss": 5.9343, + "step": 16602 + }, + { + "epoch": 0.09874274431439718, + "grad_norm": 1.8504372835159302, + "learning_rate": 4.880688371314816e-05, + "loss": 5.6685, + "step": 16603 + }, + { + "epoch": 0.09874869159767818, + "grad_norm": 2.5040910243988037, + "learning_rate": 4.880674113168016e-05, + "loss": 5.1591, + "step": 16604 + }, + { + "epoch": 0.09875463888095917, + "grad_norm": 2.7820568084716797, + "learning_rate": 4.880659854190148e-05, + "loss": 5.0528, + "step": 16605 + }, + { + "epoch": 0.09876058616424017, + "grad_norm": 2.004427909851074, + "learning_rate": 4.8806455943812165e-05, + "loss": 5.6251, + "step": 16606 + }, + { + "epoch": 0.09876653344752118, + "grad_norm": 1.8053330183029175, + "learning_rate": 4.880631333741227e-05, + "loss": 5.5293, + "step": 16607 + }, + { + "epoch": 0.09877248073080216, + "grad_norm": 1.6708273887634277, + "learning_rate": 4.8806170722701824e-05, + "loss": 6.1215, + "step": 16608 + }, + { + "epoch": 0.09877842801408317, + "grad_norm": 1.6344959735870361, + "learning_rate": 4.88060280996809e-05, + "loss": 6.191, + "step": 16609 + }, + { + "epoch": 0.09878437529736417, + "grad_norm": 1.68915593624115, + "learning_rate": 4.880588546834953e-05, + "loss": 5.9302, + "step": 16610 + }, + { + "epoch": 0.09879032258064516, + "grad_norm": 2.108917236328125, + "learning_rate": 4.8805742828707777e-05, + "loss": 5.5227, + "step": 16611 + }, + { + "epoch": 0.09879626986392616, + "grad_norm": 1.7772480249404907, + "learning_rate": 4.8805600180755685e-05, + "loss": 5.5694, + "step": 16612 + }, + { + "epoch": 0.09880221714720716, + "grad_norm": 1.629629135131836, + "learning_rate": 4.8805457524493305e-05, + "loss": 5.7881, + "step": 16613 + }, + { + "epoch": 0.09880816443048815, + "grad_norm": 1.8985555171966553, + "learning_rate": 4.880531485992068e-05, + "loss": 5.5357, + "step": 16614 + }, + { + "epoch": 0.09881411171376915, + "grad_norm": 2.5329599380493164, + "learning_rate": 4.880517218703786e-05, + "loss": 4.8959, + "step": 16615 + }, + { + "epoch": 0.09882005899705015, + "grad_norm": 2.408377170562744, + "learning_rate": 4.8805029505844915e-05, + "loss": 4.9581, + "step": 16616 + }, + { + "epoch": 0.09882600628033114, + "grad_norm": 2.125190258026123, + "learning_rate": 4.880488681634187e-05, + "loss": 4.4116, + "step": 16617 + }, + { + "epoch": 0.09883195356361214, + "grad_norm": 2.153186082839966, + "learning_rate": 4.880474411852879e-05, + "loss": 4.2887, + "step": 16618 + }, + { + "epoch": 0.09883790084689315, + "grad_norm": 2.3961498737335205, + "learning_rate": 4.880460141240571e-05, + "loss": 4.6521, + "step": 16619 + }, + { + "epoch": 0.09884384813017413, + "grad_norm": 2.4282264709472656, + "learning_rate": 4.880445869797271e-05, + "loss": 4.6307, + "step": 16620 + }, + { + "epoch": 0.09884979541345514, + "grad_norm": 2.461005687713623, + "learning_rate": 4.88043159752298e-05, + "loss": 4.4234, + "step": 16621 + }, + { + "epoch": 0.09885574269673614, + "grad_norm": 2.5483081340789795, + "learning_rate": 4.8804173244177056e-05, + "loss": 4.2688, + "step": 16622 + }, + { + "epoch": 0.09886168998001713, + "grad_norm": 2.370413303375244, + "learning_rate": 4.8804030504814524e-05, + "loss": 4.4887, + "step": 16623 + }, + { + "epoch": 0.09886763726329813, + "grad_norm": 2.681118965148926, + "learning_rate": 4.880388775714225e-05, + "loss": 4.2941, + "step": 16624 + }, + { + "epoch": 0.09887358454657912, + "grad_norm": 2.1210896968841553, + "learning_rate": 4.8803745001160284e-05, + "loss": 5.1994, + "step": 16625 + }, + { + "epoch": 0.09887953182986012, + "grad_norm": 1.703626275062561, + "learning_rate": 4.880360223686867e-05, + "loss": 5.5578, + "step": 16626 + }, + { + "epoch": 0.09888547911314112, + "grad_norm": 1.5515342950820923, + "learning_rate": 4.8803459464267475e-05, + "loss": 5.6636, + "step": 16627 + }, + { + "epoch": 0.09889142639642211, + "grad_norm": 1.2145434617996216, + "learning_rate": 4.880331668335673e-05, + "loss": 5.3634, + "step": 16628 + }, + { + "epoch": 0.09889737367970311, + "grad_norm": 1.2893304824829102, + "learning_rate": 4.88031738941365e-05, + "loss": 5.5383, + "step": 16629 + }, + { + "epoch": 0.09890332096298411, + "grad_norm": 3.1206297874450684, + "learning_rate": 4.880303109660682e-05, + "loss": 4.9313, + "step": 16630 + }, + { + "epoch": 0.0989092682462651, + "grad_norm": 3.382498264312744, + "learning_rate": 4.8802888290767756e-05, + "loss": 4.4475, + "step": 16631 + }, + { + "epoch": 0.0989152155295461, + "grad_norm": 1.8280858993530273, + "learning_rate": 4.880274547661934e-05, + "loss": 5.6722, + "step": 16632 + }, + { + "epoch": 0.0989211628128271, + "grad_norm": 2.0412793159484863, + "learning_rate": 4.880260265416164e-05, + "loss": 5.3952, + "step": 16633 + }, + { + "epoch": 0.09892711009610809, + "grad_norm": 2.0702524185180664, + "learning_rate": 4.880245982339469e-05, + "loss": 5.2754, + "step": 16634 + }, + { + "epoch": 0.0989330573793891, + "grad_norm": 1.7081348896026611, + "learning_rate": 4.880231698431855e-05, + "loss": 5.8414, + "step": 16635 + }, + { + "epoch": 0.0989390046626701, + "grad_norm": 1.7762012481689453, + "learning_rate": 4.880217413693328e-05, + "loss": 6.0106, + "step": 16636 + }, + { + "epoch": 0.09894495194595108, + "grad_norm": 1.815253496170044, + "learning_rate": 4.8802031281238895e-05, + "loss": 5.9715, + "step": 16637 + }, + { + "epoch": 0.09895089922923209, + "grad_norm": 1.8652589321136475, + "learning_rate": 4.880188841723548e-05, + "loss": 5.9437, + "step": 16638 + }, + { + "epoch": 0.09895684651251309, + "grad_norm": 1.687664270401001, + "learning_rate": 4.8801745544923075e-05, + "loss": 6.0776, + "step": 16639 + }, + { + "epoch": 0.09896279379579408, + "grad_norm": 1.579231858253479, + "learning_rate": 4.880160266430171e-05, + "loss": 6.0486, + "step": 16640 + }, + { + "epoch": 0.09896874107907508, + "grad_norm": 1.711932897567749, + "learning_rate": 4.8801459775371464e-05, + "loss": 5.7954, + "step": 16641 + }, + { + "epoch": 0.09897468836235608, + "grad_norm": 2.022918939590454, + "learning_rate": 4.880131687813237e-05, + "loss": 5.4453, + "step": 16642 + }, + { + "epoch": 0.09898063564563707, + "grad_norm": 2.4682674407958984, + "learning_rate": 4.880117397258449e-05, + "loss": 5.2084, + "step": 16643 + }, + { + "epoch": 0.09898658292891807, + "grad_norm": 2.7558486461639404, + "learning_rate": 4.880103105872786e-05, + "loss": 4.8931, + "step": 16644 + }, + { + "epoch": 0.09899253021219907, + "grad_norm": 1.8757295608520508, + "learning_rate": 4.880088813656253e-05, + "loss": 5.4484, + "step": 16645 + }, + { + "epoch": 0.09899847749548006, + "grad_norm": 2.0811331272125244, + "learning_rate": 4.880074520608857e-05, + "loss": 5.8003, + "step": 16646 + }, + { + "epoch": 0.09900442477876106, + "grad_norm": 1.9147615432739258, + "learning_rate": 4.880060226730601e-05, + "loss": 5.869, + "step": 16647 + }, + { + "epoch": 0.09901037206204207, + "grad_norm": 1.974865436553955, + "learning_rate": 4.88004593202149e-05, + "loss": 5.5896, + "step": 16648 + }, + { + "epoch": 0.09901631934532305, + "grad_norm": 1.8365596532821655, + "learning_rate": 4.88003163648153e-05, + "loss": 5.5321, + "step": 16649 + }, + { + "epoch": 0.09902226662860406, + "grad_norm": 1.5927996635437012, + "learning_rate": 4.8800173401107255e-05, + "loss": 5.49, + "step": 16650 + }, + { + "epoch": 0.09902821391188506, + "grad_norm": 1.7566391229629517, + "learning_rate": 4.880003042909081e-05, + "loss": 5.49, + "step": 16651 + }, + { + "epoch": 0.09903416119516605, + "grad_norm": 1.718018651008606, + "learning_rate": 4.879988744876602e-05, + "loss": 5.4515, + "step": 16652 + }, + { + "epoch": 0.09904010847844705, + "grad_norm": 1.8946046829223633, + "learning_rate": 4.879974446013295e-05, + "loss": 4.9902, + "step": 16653 + }, + { + "epoch": 0.09904605576172804, + "grad_norm": 1.939060926437378, + "learning_rate": 4.879960146319162e-05, + "loss": 5.2067, + "step": 16654 + }, + { + "epoch": 0.09905200304500904, + "grad_norm": 1.6621825695037842, + "learning_rate": 4.8799458457942106e-05, + "loss": 5.0041, + "step": 16655 + }, + { + "epoch": 0.09905795032829004, + "grad_norm": 1.8790650367736816, + "learning_rate": 4.879931544438444e-05, + "loss": 4.6893, + "step": 16656 + }, + { + "epoch": 0.09906389761157103, + "grad_norm": 2.20035982131958, + "learning_rate": 4.879917242251868e-05, + "loss": 4.4463, + "step": 16657 + }, + { + "epoch": 0.09906984489485203, + "grad_norm": 1.4379361867904663, + "learning_rate": 4.879902939234487e-05, + "loss": 4.993, + "step": 16658 + }, + { + "epoch": 0.09907579217813303, + "grad_norm": 2.2738726139068604, + "learning_rate": 4.879888635386307e-05, + "loss": 5.108, + "step": 16659 + }, + { + "epoch": 0.09908173946141402, + "grad_norm": 2.0921952724456787, + "learning_rate": 4.8798743307073325e-05, + "loss": 5.3023, + "step": 16660 + }, + { + "epoch": 0.09908768674469502, + "grad_norm": 1.894437313079834, + "learning_rate": 4.8798600251975684e-05, + "loss": 5.2797, + "step": 16661 + }, + { + "epoch": 0.09909363402797602, + "grad_norm": 1.6831610202789307, + "learning_rate": 4.87984571885702e-05, + "loss": 5.3342, + "step": 16662 + }, + { + "epoch": 0.09909958131125701, + "grad_norm": 1.9177473783493042, + "learning_rate": 4.879831411685691e-05, + "loss": 5.2245, + "step": 16663 + }, + { + "epoch": 0.09910552859453801, + "grad_norm": 1.8289183378219604, + "learning_rate": 4.879817103683589e-05, + "loss": 5.2411, + "step": 16664 + }, + { + "epoch": 0.09911147587781902, + "grad_norm": 1.7047971487045288, + "learning_rate": 4.8798027948507166e-05, + "loss": 5.1896, + "step": 16665 + }, + { + "epoch": 0.0991174231611, + "grad_norm": 1.5395535230636597, + "learning_rate": 4.87978848518708e-05, + "loss": 5.0688, + "step": 16666 + }, + { + "epoch": 0.099123370444381, + "grad_norm": 1.652870535850525, + "learning_rate": 4.879774174692683e-05, + "loss": 5.1786, + "step": 16667 + }, + { + "epoch": 0.09912931772766201, + "grad_norm": 1.7581889629364014, + "learning_rate": 4.8797598633675326e-05, + "loss": 5.0549, + "step": 16668 + }, + { + "epoch": 0.099135265010943, + "grad_norm": 1.6056864261627197, + "learning_rate": 4.8797455512116315e-05, + "loss": 5.0516, + "step": 16669 + }, + { + "epoch": 0.099141212294224, + "grad_norm": 1.8067295551300049, + "learning_rate": 4.879731238224986e-05, + "loss": 5.0642, + "step": 16670 + }, + { + "epoch": 0.099147159577505, + "grad_norm": 1.7332173585891724, + "learning_rate": 4.8797169244076016e-05, + "loss": 5.0361, + "step": 16671 + }, + { + "epoch": 0.09915310686078599, + "grad_norm": 1.64972984790802, + "learning_rate": 4.879702609759482e-05, + "loss": 5.0521, + "step": 16672 + }, + { + "epoch": 0.09915905414406699, + "grad_norm": 1.8066579103469849, + "learning_rate": 4.879688294280633e-05, + "loss": 5.1431, + "step": 16673 + }, + { + "epoch": 0.09916500142734799, + "grad_norm": 2.093921661376953, + "learning_rate": 4.879673977971059e-05, + "loss": 5.4831, + "step": 16674 + }, + { + "epoch": 0.09917094871062898, + "grad_norm": 2.1563215255737305, + "learning_rate": 4.879659660830766e-05, + "loss": 5.4992, + "step": 16675 + }, + { + "epoch": 0.09917689599390998, + "grad_norm": 1.9041906595230103, + "learning_rate": 4.8796453428597585e-05, + "loss": 6.0952, + "step": 16676 + }, + { + "epoch": 0.09918284327719099, + "grad_norm": 1.7259836196899414, + "learning_rate": 4.879631024058041e-05, + "loss": 5.9602, + "step": 16677 + }, + { + "epoch": 0.09918879056047197, + "grad_norm": 2.075324058532715, + "learning_rate": 4.879616704425619e-05, + "loss": 5.1186, + "step": 16678 + }, + { + "epoch": 0.09919473784375298, + "grad_norm": 2.243378162384033, + "learning_rate": 4.8796023839624975e-05, + "loss": 4.8764, + "step": 16679 + }, + { + "epoch": 0.09920068512703398, + "grad_norm": 1.8717987537384033, + "learning_rate": 4.879588062668681e-05, + "loss": 5.6084, + "step": 16680 + }, + { + "epoch": 0.09920663241031497, + "grad_norm": 1.8316127061843872, + "learning_rate": 4.879573740544175e-05, + "loss": 5.5613, + "step": 16681 + }, + { + "epoch": 0.09921257969359597, + "grad_norm": 1.7016340494155884, + "learning_rate": 4.879559417588985e-05, + "loss": 5.5577, + "step": 16682 + }, + { + "epoch": 0.09921852697687697, + "grad_norm": 2.2173359394073486, + "learning_rate": 4.879545093803115e-05, + "loss": 4.9591, + "step": 16683 + }, + { + "epoch": 0.09922447426015796, + "grad_norm": 1.9507017135620117, + "learning_rate": 4.87953076918657e-05, + "loss": 5.6648, + "step": 16684 + }, + { + "epoch": 0.09923042154343896, + "grad_norm": 1.6124898195266724, + "learning_rate": 4.879516443739356e-05, + "loss": 6.0163, + "step": 16685 + }, + { + "epoch": 0.09923636882671995, + "grad_norm": 1.5823163986206055, + "learning_rate": 4.879502117461477e-05, + "loss": 5.868, + "step": 16686 + }, + { + "epoch": 0.09924231611000095, + "grad_norm": 1.608522653579712, + "learning_rate": 4.879487790352938e-05, + "loss": 5.7482, + "step": 16687 + }, + { + "epoch": 0.09924826339328195, + "grad_norm": 1.783008337020874, + "learning_rate": 4.879473462413745e-05, + "loss": 5.2352, + "step": 16688 + }, + { + "epoch": 0.09925421067656294, + "grad_norm": 1.8089349269866943, + "learning_rate": 4.8794591336439024e-05, + "loss": 5.1793, + "step": 16689 + }, + { + "epoch": 0.09926015795984394, + "grad_norm": 1.5393356084823608, + "learning_rate": 4.879444804043415e-05, + "loss": 5.4802, + "step": 16690 + }, + { + "epoch": 0.09926610524312494, + "grad_norm": 1.7046642303466797, + "learning_rate": 4.8794304736122886e-05, + "loss": 5.8368, + "step": 16691 + }, + { + "epoch": 0.09927205252640593, + "grad_norm": 1.7474054098129272, + "learning_rate": 4.879416142350527e-05, + "loss": 5.7578, + "step": 16692 + }, + { + "epoch": 0.09927799980968693, + "grad_norm": 1.9804757833480835, + "learning_rate": 4.879401810258136e-05, + "loss": 5.691, + "step": 16693 + }, + { + "epoch": 0.09928394709296794, + "grad_norm": 1.7752422094345093, + "learning_rate": 4.87938747733512e-05, + "loss": 5.2478, + "step": 16694 + }, + { + "epoch": 0.09928989437624892, + "grad_norm": 1.8842644691467285, + "learning_rate": 4.879373143581485e-05, + "loss": 5.2061, + "step": 16695 + }, + { + "epoch": 0.09929584165952993, + "grad_norm": 1.6537442207336426, + "learning_rate": 4.8793588089972355e-05, + "loss": 5.215, + "step": 16696 + }, + { + "epoch": 0.09930178894281093, + "grad_norm": 1.5108014345169067, + "learning_rate": 4.8793444735823755e-05, + "loss": 5.2327, + "step": 16697 + }, + { + "epoch": 0.09930773622609192, + "grad_norm": 1.4653078317642212, + "learning_rate": 4.8793301373369116e-05, + "loss": 5.219, + "step": 16698 + }, + { + "epoch": 0.09931368350937292, + "grad_norm": 1.3908593654632568, + "learning_rate": 4.879315800260848e-05, + "loss": 5.1597, + "step": 16699 + }, + { + "epoch": 0.09931963079265392, + "grad_norm": 1.3809629678726196, + "learning_rate": 4.87930146235419e-05, + "loss": 5.2364, + "step": 16700 + }, + { + "epoch": 0.09932557807593491, + "grad_norm": 1.741685152053833, + "learning_rate": 4.879287123616943e-05, + "loss": 5.7777, + "step": 16701 + }, + { + "epoch": 0.09933152535921591, + "grad_norm": 1.7733122110366821, + "learning_rate": 4.879272784049111e-05, + "loss": 5.4035, + "step": 16702 + }, + { + "epoch": 0.09933747264249691, + "grad_norm": 1.4871195554733276, + "learning_rate": 4.8792584436506985e-05, + "loss": 4.961, + "step": 16703 + }, + { + "epoch": 0.0993434199257779, + "grad_norm": 1.6865509748458862, + "learning_rate": 4.8792441024217115e-05, + "loss": 4.9876, + "step": 16704 + }, + { + "epoch": 0.0993493672090589, + "grad_norm": 1.6606428623199463, + "learning_rate": 4.879229760362156e-05, + "loss": 5.1431, + "step": 16705 + }, + { + "epoch": 0.0993553144923399, + "grad_norm": 1.6394522190093994, + "learning_rate": 4.879215417472036e-05, + "loss": 5.223, + "step": 16706 + }, + { + "epoch": 0.0993612617756209, + "grad_norm": 1.6220464706420898, + "learning_rate": 4.879201073751356e-05, + "loss": 5.322, + "step": 16707 + }, + { + "epoch": 0.0993672090589019, + "grad_norm": 1.4539369344711304, + "learning_rate": 4.879186729200121e-05, + "loss": 5.1935, + "step": 16708 + }, + { + "epoch": 0.0993731563421829, + "grad_norm": 1.7421495914459229, + "learning_rate": 4.8791723838183376e-05, + "loss": 5.0639, + "step": 16709 + }, + { + "epoch": 0.09937910362546389, + "grad_norm": 1.5782475471496582, + "learning_rate": 4.8791580376060085e-05, + "loss": 5.8221, + "step": 16710 + }, + { + "epoch": 0.09938505090874489, + "grad_norm": 1.6991766691207886, + "learning_rate": 4.879143690563141e-05, + "loss": 5.9037, + "step": 16711 + }, + { + "epoch": 0.09939099819202589, + "grad_norm": 1.7815147638320923, + "learning_rate": 4.879129342689739e-05, + "loss": 5.668, + "step": 16712 + }, + { + "epoch": 0.09939694547530688, + "grad_norm": 1.6047189235687256, + "learning_rate": 4.879114993985806e-05, + "loss": 5.3005, + "step": 16713 + }, + { + "epoch": 0.09940289275858788, + "grad_norm": 1.8050780296325684, + "learning_rate": 4.87910064445135e-05, + "loss": 5.4931, + "step": 16714 + }, + { + "epoch": 0.09940884004186887, + "grad_norm": 2.010920286178589, + "learning_rate": 4.8790862940863744e-05, + "loss": 5.6301, + "step": 16715 + }, + { + "epoch": 0.09941478732514987, + "grad_norm": 1.443099856376648, + "learning_rate": 4.879071942890884e-05, + "loss": 5.9498, + "step": 16716 + }, + { + "epoch": 0.09942073460843087, + "grad_norm": 1.777207612991333, + "learning_rate": 4.879057590864885e-05, + "loss": 5.2754, + "step": 16717 + }, + { + "epoch": 0.09942668189171186, + "grad_norm": 2.314602851867676, + "learning_rate": 4.87904323800838e-05, + "loss": 5.1447, + "step": 16718 + }, + { + "epoch": 0.09943262917499286, + "grad_norm": 1.4886807203292847, + "learning_rate": 4.879028884321377e-05, + "loss": 5.5389, + "step": 16719 + }, + { + "epoch": 0.09943857645827386, + "grad_norm": 1.4403626918792725, + "learning_rate": 4.879014529803879e-05, + "loss": 5.5377, + "step": 16720 + }, + { + "epoch": 0.09944452374155485, + "grad_norm": 1.570827841758728, + "learning_rate": 4.8790001744558916e-05, + "loss": 5.2541, + "step": 16721 + }, + { + "epoch": 0.09945047102483585, + "grad_norm": 1.6352084875106812, + "learning_rate": 4.87898581827742e-05, + "loss": 4.9031, + "step": 16722 + }, + { + "epoch": 0.09945641830811686, + "grad_norm": 1.864465594291687, + "learning_rate": 4.878971461268469e-05, + "loss": 4.8689, + "step": 16723 + }, + { + "epoch": 0.09946236559139784, + "grad_norm": 1.5618411302566528, + "learning_rate": 4.878957103429044e-05, + "loss": 5.4576, + "step": 16724 + }, + { + "epoch": 0.09946831287467885, + "grad_norm": 1.6910091638565063, + "learning_rate": 4.8789427447591486e-05, + "loss": 5.557, + "step": 16725 + }, + { + "epoch": 0.09947426015795985, + "grad_norm": 1.708056926727295, + "learning_rate": 4.8789283852587895e-05, + "loss": 5.5343, + "step": 16726 + }, + { + "epoch": 0.09948020744124084, + "grad_norm": 1.5828802585601807, + "learning_rate": 4.878914024927971e-05, + "loss": 5.3913, + "step": 16727 + }, + { + "epoch": 0.09948615472452184, + "grad_norm": 1.6802269220352173, + "learning_rate": 4.878899663766698e-05, + "loss": 5.4407, + "step": 16728 + }, + { + "epoch": 0.09949210200780284, + "grad_norm": 2.0542306900024414, + "learning_rate": 4.8788853017749766e-05, + "loss": 4.9265, + "step": 16729 + }, + { + "epoch": 0.09949804929108383, + "grad_norm": 2.035903215408325, + "learning_rate": 4.87887093895281e-05, + "loss": 5.1802, + "step": 16730 + }, + { + "epoch": 0.09950399657436483, + "grad_norm": 1.7885538339614868, + "learning_rate": 4.8788565753002044e-05, + "loss": 5.5238, + "step": 16731 + }, + { + "epoch": 0.09950994385764583, + "grad_norm": 1.606881022453308, + "learning_rate": 4.878842210817165e-05, + "loss": 5.805, + "step": 16732 + }, + { + "epoch": 0.09951589114092682, + "grad_norm": 1.6354256868362427, + "learning_rate": 4.8788278455036956e-05, + "loss": 5.7968, + "step": 16733 + }, + { + "epoch": 0.09952183842420782, + "grad_norm": 1.7537651062011719, + "learning_rate": 4.8788134793598024e-05, + "loss": 5.5945, + "step": 16734 + }, + { + "epoch": 0.09952778570748883, + "grad_norm": 2.149411678314209, + "learning_rate": 4.8787991123854895e-05, + "loss": 4.7458, + "step": 16735 + }, + { + "epoch": 0.09953373299076981, + "grad_norm": 1.9956060647964478, + "learning_rate": 4.878784744580763e-05, + "loss": 4.9471, + "step": 16736 + }, + { + "epoch": 0.09953968027405082, + "grad_norm": 2.0445396900177, + "learning_rate": 4.878770375945627e-05, + "loss": 4.9063, + "step": 16737 + }, + { + "epoch": 0.09954562755733182, + "grad_norm": 1.8563852310180664, + "learning_rate": 4.878756006480088e-05, + "loss": 5.8788, + "step": 16738 + }, + { + "epoch": 0.0995515748406128, + "grad_norm": 1.8931719064712524, + "learning_rate": 4.8787416361841474e-05, + "loss": 6.0917, + "step": 16739 + }, + { + "epoch": 0.09955752212389381, + "grad_norm": 2.062368869781494, + "learning_rate": 4.878727265057814e-05, + "loss": 5.0113, + "step": 16740 + }, + { + "epoch": 0.09956346940717481, + "grad_norm": 1.7274762392044067, + "learning_rate": 4.878712893101092e-05, + "loss": 5.7383, + "step": 16741 + }, + { + "epoch": 0.0995694166904558, + "grad_norm": 1.7377746105194092, + "learning_rate": 4.878698520313986e-05, + "loss": 5.5545, + "step": 16742 + }, + { + "epoch": 0.0995753639737368, + "grad_norm": 1.8383115530014038, + "learning_rate": 4.8786841466965e-05, + "loss": 5.2297, + "step": 16743 + }, + { + "epoch": 0.09958131125701779, + "grad_norm": 1.7715762853622437, + "learning_rate": 4.8786697722486405e-05, + "loss": 5.4735, + "step": 16744 + }, + { + "epoch": 0.09958725854029879, + "grad_norm": 1.8447803258895874, + "learning_rate": 4.878655396970412e-05, + "loss": 5.25, + "step": 16745 + }, + { + "epoch": 0.09959320582357979, + "grad_norm": 2.215622663497925, + "learning_rate": 4.878641020861819e-05, + "loss": 4.8387, + "step": 16746 + }, + { + "epoch": 0.09959915310686078, + "grad_norm": 1.71353018283844, + "learning_rate": 4.878626643922867e-05, + "loss": 5.6831, + "step": 16747 + }, + { + "epoch": 0.09960510039014178, + "grad_norm": 1.8424171209335327, + "learning_rate": 4.8786122661535616e-05, + "loss": 5.5785, + "step": 16748 + }, + { + "epoch": 0.09961104767342278, + "grad_norm": 1.8796172142028809, + "learning_rate": 4.8785978875539065e-05, + "loss": 5.5921, + "step": 16749 + }, + { + "epoch": 0.09961699495670377, + "grad_norm": 1.820435881614685, + "learning_rate": 4.878583508123908e-05, + "loss": 5.7645, + "step": 16750 + }, + { + "epoch": 0.09962294223998477, + "grad_norm": 1.9210152626037598, + "learning_rate": 4.87856912786357e-05, + "loss": 5.0471, + "step": 16751 + }, + { + "epoch": 0.09962888952326578, + "grad_norm": 1.4372605085372925, + "learning_rate": 4.878554746772899e-05, + "loss": 5.3131, + "step": 16752 + }, + { + "epoch": 0.09963483680654676, + "grad_norm": 1.8078817129135132, + "learning_rate": 4.878540364851898e-05, + "loss": 5.266, + "step": 16753 + }, + { + "epoch": 0.09964078408982777, + "grad_norm": 2.068875551223755, + "learning_rate": 4.878525982100575e-05, + "loss": 4.714, + "step": 16754 + }, + { + "epoch": 0.09964673137310877, + "grad_norm": 2.0813167095184326, + "learning_rate": 4.878511598518931e-05, + "loss": 4.5889, + "step": 16755 + }, + { + "epoch": 0.09965267865638976, + "grad_norm": 2.3035426139831543, + "learning_rate": 4.878497214106974e-05, + "loss": 4.8549, + "step": 16756 + }, + { + "epoch": 0.09965862593967076, + "grad_norm": 1.7791129350662231, + "learning_rate": 4.878482828864709e-05, + "loss": 5.2515, + "step": 16757 + }, + { + "epoch": 0.09966457322295176, + "grad_norm": 1.7512277364730835, + "learning_rate": 4.878468442792139e-05, + "loss": 5.8079, + "step": 16758 + }, + { + "epoch": 0.09967052050623275, + "grad_norm": 1.789523720741272, + "learning_rate": 4.878454055889271e-05, + "loss": 5.4302, + "step": 16759 + }, + { + "epoch": 0.09967646778951375, + "grad_norm": 1.72003173828125, + "learning_rate": 4.8784396681561086e-05, + "loss": 5.6425, + "step": 16760 + }, + { + "epoch": 0.09968241507279475, + "grad_norm": 2.0497727394104004, + "learning_rate": 4.878425279592658e-05, + "loss": 5.6608, + "step": 16761 + }, + { + "epoch": 0.09968836235607574, + "grad_norm": 1.7305432558059692, + "learning_rate": 4.878410890198923e-05, + "loss": 5.5431, + "step": 16762 + }, + { + "epoch": 0.09969430963935674, + "grad_norm": 1.708824634552002, + "learning_rate": 4.878396499974911e-05, + "loss": 5.1754, + "step": 16763 + }, + { + "epoch": 0.09970025692263774, + "grad_norm": 1.9238412380218506, + "learning_rate": 4.878382108920624e-05, + "loss": 5.0595, + "step": 16764 + }, + { + "epoch": 0.09970620420591873, + "grad_norm": 1.7634879350662231, + "learning_rate": 4.878367717036069e-05, + "loss": 5.5733, + "step": 16765 + }, + { + "epoch": 0.09971215148919974, + "grad_norm": 1.7330491542816162, + "learning_rate": 4.8783533243212495e-05, + "loss": 5.4314, + "step": 16766 + }, + { + "epoch": 0.09971809877248074, + "grad_norm": 1.4424408674240112, + "learning_rate": 4.878338930776172e-05, + "loss": 5.3059, + "step": 16767 + }, + { + "epoch": 0.09972404605576173, + "grad_norm": 1.4692374467849731, + "learning_rate": 4.878324536400841e-05, + "loss": 5.2838, + "step": 16768 + }, + { + "epoch": 0.09972999333904273, + "grad_norm": 1.3602346181869507, + "learning_rate": 4.878310141195262e-05, + "loss": 5.5587, + "step": 16769 + }, + { + "epoch": 0.09973594062232373, + "grad_norm": 1.3222168684005737, + "learning_rate": 4.878295745159438e-05, + "loss": 5.61, + "step": 16770 + }, + { + "epoch": 0.09974188790560472, + "grad_norm": 1.398383378982544, + "learning_rate": 4.878281348293377e-05, + "loss": 5.5348, + "step": 16771 + }, + { + "epoch": 0.09974783518888572, + "grad_norm": 1.4184808731079102, + "learning_rate": 4.878266950597081e-05, + "loss": 5.4425, + "step": 16772 + }, + { + "epoch": 0.09975378247216671, + "grad_norm": 1.2451627254486084, + "learning_rate": 4.878252552070558e-05, + "loss": 5.5105, + "step": 16773 + }, + { + "epoch": 0.09975972975544771, + "grad_norm": 1.4243760108947754, + "learning_rate": 4.878238152713811e-05, + "loss": 5.5839, + "step": 16774 + }, + { + "epoch": 0.09976567703872871, + "grad_norm": 1.1774061918258667, + "learning_rate": 4.878223752526846e-05, + "loss": 5.4785, + "step": 16775 + }, + { + "epoch": 0.0997716243220097, + "grad_norm": 1.2542285919189453, + "learning_rate": 4.8782093515096676e-05, + "loss": 5.4994, + "step": 16776 + }, + { + "epoch": 0.0997775716052907, + "grad_norm": 1.486611008644104, + "learning_rate": 4.878194949662281e-05, + "loss": 5.347, + "step": 16777 + }, + { + "epoch": 0.0997835188885717, + "grad_norm": 1.391717791557312, + "learning_rate": 4.878180546984691e-05, + "loss": 5.3397, + "step": 16778 + }, + { + "epoch": 0.09978946617185269, + "grad_norm": 1.819778323173523, + "learning_rate": 4.878166143476902e-05, + "loss": 5.4217, + "step": 16779 + }, + { + "epoch": 0.0997954134551337, + "grad_norm": 1.549660563468933, + "learning_rate": 4.8781517391389205e-05, + "loss": 5.5044, + "step": 16780 + }, + { + "epoch": 0.0998013607384147, + "grad_norm": 1.4923075437545776, + "learning_rate": 4.878137333970751e-05, + "loss": 5.4779, + "step": 16781 + }, + { + "epoch": 0.09980730802169568, + "grad_norm": 1.3846399784088135, + "learning_rate": 4.878122927972398e-05, + "loss": 5.8974, + "step": 16782 + }, + { + "epoch": 0.09981325530497669, + "grad_norm": 1.325563669204712, + "learning_rate": 4.878108521143867e-05, + "loss": 5.516, + "step": 16783 + }, + { + "epoch": 0.09981920258825769, + "grad_norm": 1.3482844829559326, + "learning_rate": 4.878094113485162e-05, + "loss": 5.4661, + "step": 16784 + }, + { + "epoch": 0.09982514987153868, + "grad_norm": 1.4238206148147583, + "learning_rate": 4.87807970499629e-05, + "loss": 5.5551, + "step": 16785 + }, + { + "epoch": 0.09983109715481968, + "grad_norm": 1.1277439594268799, + "learning_rate": 4.8780652956772544e-05, + "loss": 5.3611, + "step": 16786 + }, + { + "epoch": 0.09983704443810068, + "grad_norm": 1.2312495708465576, + "learning_rate": 4.878050885528061e-05, + "loss": 5.4233, + "step": 16787 + }, + { + "epoch": 0.09984299172138167, + "grad_norm": 1.3811876773834229, + "learning_rate": 4.878036474548715e-05, + "loss": 5.4336, + "step": 16788 + }, + { + "epoch": 0.09984893900466267, + "grad_norm": 1.211362361907959, + "learning_rate": 4.87802206273922e-05, + "loss": 4.9956, + "step": 16789 + }, + { + "epoch": 0.09985488628794367, + "grad_norm": 1.0385311841964722, + "learning_rate": 4.878007650099583e-05, + "loss": 5.4416, + "step": 16790 + }, + { + "epoch": 0.09986083357122466, + "grad_norm": 1.2311192750930786, + "learning_rate": 4.8779932366298074e-05, + "loss": 5.4814, + "step": 16791 + }, + { + "epoch": 0.09986678085450566, + "grad_norm": 1.6310219764709473, + "learning_rate": 4.8779788223299e-05, + "loss": 5.1746, + "step": 16792 + }, + { + "epoch": 0.09987272813778666, + "grad_norm": 1.4695444107055664, + "learning_rate": 4.877964407199864e-05, + "loss": 5.3724, + "step": 16793 + }, + { + "epoch": 0.09987867542106765, + "grad_norm": 1.8295196294784546, + "learning_rate": 4.877949991239705e-05, + "loss": 5.1085, + "step": 16794 + }, + { + "epoch": 0.09988462270434866, + "grad_norm": 1.5845080614089966, + "learning_rate": 4.877935574449428e-05, + "loss": 5.027, + "step": 16795 + }, + { + "epoch": 0.09989056998762966, + "grad_norm": 1.3743692636489868, + "learning_rate": 4.8779211568290395e-05, + "loss": 5.0717, + "step": 16796 + }, + { + "epoch": 0.09989651727091065, + "grad_norm": 1.3857053518295288, + "learning_rate": 4.877906738378542e-05, + "loss": 4.9698, + "step": 16797 + }, + { + "epoch": 0.09990246455419165, + "grad_norm": 1.3818373680114746, + "learning_rate": 4.8778923190979425e-05, + "loss": 4.8686, + "step": 16798 + }, + { + "epoch": 0.09990841183747265, + "grad_norm": 1.563095211982727, + "learning_rate": 4.877877898987245e-05, + "loss": 4.6804, + "step": 16799 + }, + { + "epoch": 0.09991435912075364, + "grad_norm": 1.3965919017791748, + "learning_rate": 4.877863478046455e-05, + "loss": 5.141, + "step": 16800 + }, + { + "epoch": 0.09992030640403464, + "grad_norm": 1.5473159551620483, + "learning_rate": 4.8778490562755775e-05, + "loss": 5.0796, + "step": 16801 + }, + { + "epoch": 0.09992625368731563, + "grad_norm": 2.548140525817871, + "learning_rate": 4.877834633674618e-05, + "loss": 4.9149, + "step": 16802 + }, + { + "epoch": 0.09993220097059663, + "grad_norm": 1.59461510181427, + "learning_rate": 4.87782021024358e-05, + "loss": 4.9048, + "step": 16803 + }, + { + "epoch": 0.09993814825387763, + "grad_norm": 1.49467134475708, + "learning_rate": 4.87780578598247e-05, + "loss": 5.2484, + "step": 16804 + }, + { + "epoch": 0.09994409553715862, + "grad_norm": 1.5844218730926514, + "learning_rate": 4.8777913608912926e-05, + "loss": 5.2107, + "step": 16805 + }, + { + "epoch": 0.09995004282043962, + "grad_norm": 1.465334415435791, + "learning_rate": 4.877776934970053e-05, + "loss": 5.4002, + "step": 16806 + }, + { + "epoch": 0.09995599010372062, + "grad_norm": 1.5409786701202393, + "learning_rate": 4.877762508218756e-05, + "loss": 5.6233, + "step": 16807 + }, + { + "epoch": 0.09996193738700161, + "grad_norm": 1.3813812732696533, + "learning_rate": 4.877748080637406e-05, + "loss": 5.3072, + "step": 16808 + }, + { + "epoch": 0.09996788467028261, + "grad_norm": 1.3815702199935913, + "learning_rate": 4.8777336522260095e-05, + "loss": 5.0923, + "step": 16809 + }, + { + "epoch": 0.09997383195356362, + "grad_norm": 1.6513910293579102, + "learning_rate": 4.87771922298457e-05, + "loss": 5.0482, + "step": 16810 + }, + { + "epoch": 0.0999797792368446, + "grad_norm": 1.6680731773376465, + "learning_rate": 4.8777047929130944e-05, + "loss": 4.984, + "step": 16811 + }, + { + "epoch": 0.0999857265201256, + "grad_norm": 1.4342384338378906, + "learning_rate": 4.8776903620115855e-05, + "loss": 5.2745, + "step": 16812 + }, + { + "epoch": 0.09999167380340661, + "grad_norm": 1.564255714416504, + "learning_rate": 4.87767593028005e-05, + "loss": 5.398, + "step": 16813 + }, + { + "epoch": 0.0999976210866876, + "grad_norm": 1.2767013311386108, + "learning_rate": 4.877661497718493e-05, + "loss": 5.0663, + "step": 16814 + }, + { + "epoch": 0.1000035683699686, + "grad_norm": 1.35418701171875, + "learning_rate": 4.877647064326918e-05, + "loss": 5.064, + "step": 16815 + }, + { + "epoch": 0.1000095156532496, + "grad_norm": 1.5754468441009521, + "learning_rate": 4.877632630105331e-05, + "loss": 5.1525, + "step": 16816 + }, + { + "epoch": 0.10001546293653059, + "grad_norm": 1.8457043170928955, + "learning_rate": 4.877618195053737e-05, + "loss": 5.3074, + "step": 16817 + }, + { + "epoch": 0.10002141021981159, + "grad_norm": 1.7238751649856567, + "learning_rate": 4.877603759172141e-05, + "loss": 5.3408, + "step": 16818 + }, + { + "epoch": 0.10002735750309259, + "grad_norm": 1.5342493057250977, + "learning_rate": 4.8775893224605486e-05, + "loss": 5.3495, + "step": 16819 + }, + { + "epoch": 0.10003330478637358, + "grad_norm": 1.4931390285491943, + "learning_rate": 4.877574884918964e-05, + "loss": 5.2617, + "step": 16820 + }, + { + "epoch": 0.10003925206965458, + "grad_norm": 1.5503534078598022, + "learning_rate": 4.877560446547393e-05, + "loss": 5.0805, + "step": 16821 + }, + { + "epoch": 0.10004519935293558, + "grad_norm": 1.480191707611084, + "learning_rate": 4.87754600734584e-05, + "loss": 5.1405, + "step": 16822 + }, + { + "epoch": 0.10005114663621657, + "grad_norm": 1.371559977531433, + "learning_rate": 4.87753156731431e-05, + "loss": 5.2313, + "step": 16823 + }, + { + "epoch": 0.10005709391949758, + "grad_norm": 1.2534080743789673, + "learning_rate": 4.8775171264528085e-05, + "loss": 5.3029, + "step": 16824 + }, + { + "epoch": 0.10006304120277858, + "grad_norm": 1.4513366222381592, + "learning_rate": 4.8775026847613406e-05, + "loss": 5.2663, + "step": 16825 + }, + { + "epoch": 0.10006898848605957, + "grad_norm": 1.4045735597610474, + "learning_rate": 4.8774882422399105e-05, + "loss": 5.2358, + "step": 16826 + }, + { + "epoch": 0.10007493576934057, + "grad_norm": 1.469664216041565, + "learning_rate": 4.877473798888524e-05, + "loss": 5.0215, + "step": 16827 + }, + { + "epoch": 0.10008088305262157, + "grad_norm": 1.4306927919387817, + "learning_rate": 4.8774593547071855e-05, + "loss": 4.8262, + "step": 16828 + }, + { + "epoch": 0.10008683033590256, + "grad_norm": 1.5118143558502197, + "learning_rate": 4.877444909695902e-05, + "loss": 4.8248, + "step": 16829 + }, + { + "epoch": 0.10009277761918356, + "grad_norm": 1.3022321462631226, + "learning_rate": 4.8774304638546754e-05, + "loss": 4.7268, + "step": 16830 + }, + { + "epoch": 0.10009872490246455, + "grad_norm": 1.468758463859558, + "learning_rate": 4.877416017183513e-05, + "loss": 4.8686, + "step": 16831 + }, + { + "epoch": 0.10010467218574555, + "grad_norm": 1.4958772659301758, + "learning_rate": 4.8774015696824196e-05, + "loss": 5.084, + "step": 16832 + }, + { + "epoch": 0.10011061946902655, + "grad_norm": 1.5816160440444946, + "learning_rate": 4.877387121351399e-05, + "loss": 5.1009, + "step": 16833 + }, + { + "epoch": 0.10011656675230754, + "grad_norm": 1.4751555919647217, + "learning_rate": 4.877372672190458e-05, + "loss": 5.1875, + "step": 16834 + }, + { + "epoch": 0.10012251403558854, + "grad_norm": 1.380433201789856, + "learning_rate": 4.8773582221996006e-05, + "loss": 5.3213, + "step": 16835 + }, + { + "epoch": 0.10012846131886954, + "grad_norm": 1.566112756729126, + "learning_rate": 4.877343771378832e-05, + "loss": 4.9251, + "step": 16836 + }, + { + "epoch": 0.10013440860215053, + "grad_norm": 1.4834301471710205, + "learning_rate": 4.8773293197281566e-05, + "loss": 4.7936, + "step": 16837 + }, + { + "epoch": 0.10014035588543153, + "grad_norm": 1.6053043603897095, + "learning_rate": 4.877314867247581e-05, + "loss": 4.8611, + "step": 16838 + }, + { + "epoch": 0.10014630316871254, + "grad_norm": 1.420598030090332, + "learning_rate": 4.877300413937109e-05, + "loss": 5.0481, + "step": 16839 + }, + { + "epoch": 0.10015225045199352, + "grad_norm": 1.474554181098938, + "learning_rate": 4.877285959796746e-05, + "loss": 5.0342, + "step": 16840 + }, + { + "epoch": 0.10015819773527453, + "grad_norm": 1.6535485982894897, + "learning_rate": 4.877271504826496e-05, + "loss": 5.4624, + "step": 16841 + }, + { + "epoch": 0.10016414501855553, + "grad_norm": 1.3873733282089233, + "learning_rate": 4.877257049026367e-05, + "loss": 5.1673, + "step": 16842 + }, + { + "epoch": 0.10017009230183652, + "grad_norm": 1.3890115022659302, + "learning_rate": 4.8772425923963606e-05, + "loss": 4.938, + "step": 16843 + }, + { + "epoch": 0.10017603958511752, + "grad_norm": 1.443969964981079, + "learning_rate": 4.8772281349364846e-05, + "loss": 4.8525, + "step": 16844 + }, + { + "epoch": 0.10018198686839852, + "grad_norm": 1.545344591140747, + "learning_rate": 4.877213676646742e-05, + "loss": 4.8682, + "step": 16845 + }, + { + "epoch": 0.10018793415167951, + "grad_norm": 1.6065396070480347, + "learning_rate": 4.877199217527138e-05, + "loss": 4.7394, + "step": 16846 + }, + { + "epoch": 0.10019388143496051, + "grad_norm": 1.444199800491333, + "learning_rate": 4.877184757577679e-05, + "loss": 4.7775, + "step": 16847 + }, + { + "epoch": 0.10019982871824151, + "grad_norm": 1.5434626340866089, + "learning_rate": 4.87717029679837e-05, + "loss": 4.6714, + "step": 16848 + }, + { + "epoch": 0.1002057760015225, + "grad_norm": 1.502533197402954, + "learning_rate": 4.877155835189215e-05, + "loss": 4.7591, + "step": 16849 + }, + { + "epoch": 0.1002117232848035, + "grad_norm": 1.6330854892730713, + "learning_rate": 4.877141372750219e-05, + "loss": 4.7426, + "step": 16850 + }, + { + "epoch": 0.1002176705680845, + "grad_norm": 1.658887267112732, + "learning_rate": 4.877126909481388e-05, + "loss": 4.7558, + "step": 16851 + }, + { + "epoch": 0.10022361785136549, + "grad_norm": 1.4569580554962158, + "learning_rate": 4.877112445382727e-05, + "loss": 4.7797, + "step": 16852 + }, + { + "epoch": 0.1002295651346465, + "grad_norm": 1.4903759956359863, + "learning_rate": 4.8770979804542394e-05, + "loss": 4.7895, + "step": 16853 + }, + { + "epoch": 0.1002355124179275, + "grad_norm": 1.638406753540039, + "learning_rate": 4.877083514695933e-05, + "loss": 4.7197, + "step": 16854 + }, + { + "epoch": 0.10024145970120849, + "grad_norm": 1.4558868408203125, + "learning_rate": 4.87706904810781e-05, + "loss": 4.7159, + "step": 16855 + }, + { + "epoch": 0.10024740698448949, + "grad_norm": 1.5545023679733276, + "learning_rate": 4.877054580689877e-05, + "loss": 4.7387, + "step": 16856 + }, + { + "epoch": 0.10025335426777049, + "grad_norm": 1.3767842054367065, + "learning_rate": 4.877040112442139e-05, + "loss": 4.7149, + "step": 16857 + }, + { + "epoch": 0.10025930155105148, + "grad_norm": 1.4483342170715332, + "learning_rate": 4.877025643364601e-05, + "loss": 4.7756, + "step": 16858 + }, + { + "epoch": 0.10026524883433248, + "grad_norm": 1.1949654817581177, + "learning_rate": 4.8770111734572673e-05, + "loss": 4.7883, + "step": 16859 + }, + { + "epoch": 0.10027119611761347, + "grad_norm": 1.430977463722229, + "learning_rate": 4.876996702720144e-05, + "loss": 5.0236, + "step": 16860 + }, + { + "epoch": 0.10027714340089447, + "grad_norm": 1.4976351261138916, + "learning_rate": 4.876982231153236e-05, + "loss": 5.1242, + "step": 16861 + }, + { + "epoch": 0.10028309068417547, + "grad_norm": 1.6913431882858276, + "learning_rate": 4.876967758756547e-05, + "loss": 5.3454, + "step": 16862 + }, + { + "epoch": 0.10028903796745646, + "grad_norm": 1.5901557207107544, + "learning_rate": 4.876953285530084e-05, + "loss": 5.2313, + "step": 16863 + }, + { + "epoch": 0.10029498525073746, + "grad_norm": 2.483757257461548, + "learning_rate": 4.8769388114738515e-05, + "loss": 4.9951, + "step": 16864 + }, + { + "epoch": 0.10030093253401846, + "grad_norm": 1.5647902488708496, + "learning_rate": 4.8769243365878536e-05, + "loss": 5.1029, + "step": 16865 + }, + { + "epoch": 0.10030687981729945, + "grad_norm": 1.5830740928649902, + "learning_rate": 4.8769098608720954e-05, + "loss": 5.1918, + "step": 16866 + }, + { + "epoch": 0.10031282710058045, + "grad_norm": 1.5231165885925293, + "learning_rate": 4.876895384326584e-05, + "loss": 5.0817, + "step": 16867 + }, + { + "epoch": 0.10031877438386146, + "grad_norm": 1.5266731977462769, + "learning_rate": 4.876880906951321e-05, + "loss": 4.9117, + "step": 16868 + }, + { + "epoch": 0.10032472166714244, + "grad_norm": 1.9662569761276245, + "learning_rate": 4.876866428746315e-05, + "loss": 4.8381, + "step": 16869 + }, + { + "epoch": 0.10033066895042345, + "grad_norm": 1.34932279586792, + "learning_rate": 4.876851949711569e-05, + "loss": 5.0781, + "step": 16870 + }, + { + "epoch": 0.10033661623370445, + "grad_norm": 1.3333275318145752, + "learning_rate": 4.876837469847089e-05, + "loss": 5.0527, + "step": 16871 + }, + { + "epoch": 0.10034256351698544, + "grad_norm": 1.3569806814193726, + "learning_rate": 4.876822989152879e-05, + "loss": 5.0854, + "step": 16872 + }, + { + "epoch": 0.10034851080026644, + "grad_norm": 1.4417848587036133, + "learning_rate": 4.876808507628945e-05, + "loss": 4.885, + "step": 16873 + }, + { + "epoch": 0.10035445808354744, + "grad_norm": 1.453704833984375, + "learning_rate": 4.876794025275292e-05, + "loss": 4.8919, + "step": 16874 + }, + { + "epoch": 0.10036040536682843, + "grad_norm": 1.392701268196106, + "learning_rate": 4.876779542091924e-05, + "loss": 5.0682, + "step": 16875 + }, + { + "epoch": 0.10036635265010943, + "grad_norm": 1.5623222589492798, + "learning_rate": 4.876765058078847e-05, + "loss": 5.0369, + "step": 16876 + }, + { + "epoch": 0.10037229993339043, + "grad_norm": 1.4053794145584106, + "learning_rate": 4.876750573236066e-05, + "loss": 4.9932, + "step": 16877 + }, + { + "epoch": 0.10037824721667142, + "grad_norm": 1.3282443284988403, + "learning_rate": 4.876736087563586e-05, + "loss": 5.0678, + "step": 16878 + }, + { + "epoch": 0.10038419449995242, + "grad_norm": 1.3737441301345825, + "learning_rate": 4.876721601061412e-05, + "loss": 5.1292, + "step": 16879 + }, + { + "epoch": 0.10039014178323342, + "grad_norm": 1.3209916353225708, + "learning_rate": 4.876707113729549e-05, + "loss": 5.0717, + "step": 16880 + }, + { + "epoch": 0.10039608906651441, + "grad_norm": 1.2051011323928833, + "learning_rate": 4.8766926255680026e-05, + "loss": 5.0075, + "step": 16881 + }, + { + "epoch": 0.10040203634979541, + "grad_norm": 1.260746955871582, + "learning_rate": 4.876678136576777e-05, + "loss": 4.8419, + "step": 16882 + }, + { + "epoch": 0.10040798363307642, + "grad_norm": 1.3981266021728516, + "learning_rate": 4.876663646755877e-05, + "loss": 4.8558, + "step": 16883 + }, + { + "epoch": 0.1004139309163574, + "grad_norm": 1.3491755723953247, + "learning_rate": 4.876649156105309e-05, + "loss": 4.7809, + "step": 16884 + }, + { + "epoch": 0.10041987819963841, + "grad_norm": 1.3315166234970093, + "learning_rate": 4.8766346646250774e-05, + "loss": 4.9221, + "step": 16885 + }, + { + "epoch": 0.10042582548291941, + "grad_norm": 1.250731348991394, + "learning_rate": 4.876620172315186e-05, + "loss": 4.8344, + "step": 16886 + }, + { + "epoch": 0.1004317727662004, + "grad_norm": 1.249316692352295, + "learning_rate": 4.876605679175642e-05, + "loss": 4.8441, + "step": 16887 + }, + { + "epoch": 0.1004377200494814, + "grad_norm": 1.3112961053848267, + "learning_rate": 4.87659118520645e-05, + "loss": 4.834, + "step": 16888 + }, + { + "epoch": 0.10044366733276239, + "grad_norm": 1.4331620931625366, + "learning_rate": 4.876576690407614e-05, + "loss": 4.9801, + "step": 16889 + }, + { + "epoch": 0.10044961461604339, + "grad_norm": 1.5304386615753174, + "learning_rate": 4.8765621947791396e-05, + "loss": 5.1799, + "step": 16890 + }, + { + "epoch": 0.10045556189932439, + "grad_norm": 1.3581719398498535, + "learning_rate": 4.8765476983210326e-05, + "loss": 5.1517, + "step": 16891 + }, + { + "epoch": 0.10046150918260538, + "grad_norm": 1.2568892240524292, + "learning_rate": 4.876533201033296e-05, + "loss": 5.0663, + "step": 16892 + }, + { + "epoch": 0.10046745646588638, + "grad_norm": 1.3863126039505005, + "learning_rate": 4.876518702915936e-05, + "loss": 4.9666, + "step": 16893 + }, + { + "epoch": 0.10047340374916738, + "grad_norm": 1.328078031539917, + "learning_rate": 4.87650420396896e-05, + "loss": 5.0049, + "step": 16894 + }, + { + "epoch": 0.10047935103244837, + "grad_norm": 1.252009630203247, + "learning_rate": 4.8764897041923696e-05, + "loss": 5.0709, + "step": 16895 + }, + { + "epoch": 0.10048529831572937, + "grad_norm": 1.4895809888839722, + "learning_rate": 4.876475203586171e-05, + "loss": 5.0922, + "step": 16896 + }, + { + "epoch": 0.10049124559901038, + "grad_norm": 1.363641619682312, + "learning_rate": 4.8764607021503696e-05, + "loss": 5.0233, + "step": 16897 + }, + { + "epoch": 0.10049719288229136, + "grad_norm": 1.5323866605758667, + "learning_rate": 4.876446199884971e-05, + "loss": 4.8705, + "step": 16898 + }, + { + "epoch": 0.10050314016557237, + "grad_norm": 1.4069478511810303, + "learning_rate": 4.8764316967899786e-05, + "loss": 5.0136, + "step": 16899 + }, + { + "epoch": 0.10050908744885337, + "grad_norm": 1.4166046380996704, + "learning_rate": 4.876417192865399e-05, + "loss": 5.0047, + "step": 16900 + }, + { + "epoch": 0.10051503473213436, + "grad_norm": 1.5298703908920288, + "learning_rate": 4.876402688111237e-05, + "loss": 5.0046, + "step": 16901 + }, + { + "epoch": 0.10052098201541536, + "grad_norm": 1.340071678161621, + "learning_rate": 4.876388182527497e-05, + "loss": 5.107, + "step": 16902 + }, + { + "epoch": 0.10052692929869636, + "grad_norm": 1.367415189743042, + "learning_rate": 4.876373676114184e-05, + "loss": 4.9292, + "step": 16903 + }, + { + "epoch": 0.10053287658197735, + "grad_norm": 1.3535525798797607, + "learning_rate": 4.876359168871304e-05, + "loss": 4.9801, + "step": 16904 + }, + { + "epoch": 0.10053882386525835, + "grad_norm": 1.2370539903640747, + "learning_rate": 4.8763446607988615e-05, + "loss": 4.9598, + "step": 16905 + }, + { + "epoch": 0.10054477114853935, + "grad_norm": 1.251837968826294, + "learning_rate": 4.876330151896862e-05, + "loss": 5.0506, + "step": 16906 + }, + { + "epoch": 0.10055071843182034, + "grad_norm": 1.3221372365951538, + "learning_rate": 4.8763156421653097e-05, + "loss": 5.4094, + "step": 16907 + }, + { + "epoch": 0.10055666571510134, + "grad_norm": 1.34721040725708, + "learning_rate": 4.87630113160421e-05, + "loss": 5.4361, + "step": 16908 + }, + { + "epoch": 0.10056261299838234, + "grad_norm": 1.2884198427200317, + "learning_rate": 4.876286620213568e-05, + "loss": 5.3518, + "step": 16909 + }, + { + "epoch": 0.10056856028166333, + "grad_norm": 1.259414553642273, + "learning_rate": 4.87627210799339e-05, + "loss": 5.2298, + "step": 16910 + }, + { + "epoch": 0.10057450756494433, + "grad_norm": 1.482032299041748, + "learning_rate": 4.8762575949436796e-05, + "loss": 5.3625, + "step": 16911 + }, + { + "epoch": 0.10058045484822534, + "grad_norm": 1.2673801183700562, + "learning_rate": 4.876243081064441e-05, + "loss": 5.2678, + "step": 16912 + }, + { + "epoch": 0.10058640213150633, + "grad_norm": 1.3014607429504395, + "learning_rate": 4.876228566355682e-05, + "loss": 5.2762, + "step": 16913 + }, + { + "epoch": 0.10059234941478733, + "grad_norm": 1.2084840536117554, + "learning_rate": 4.876214050817405e-05, + "loss": 5.1128, + "step": 16914 + }, + { + "epoch": 0.10059829669806833, + "grad_norm": 1.3497353792190552, + "learning_rate": 4.876199534449617e-05, + "loss": 5.1666, + "step": 16915 + }, + { + "epoch": 0.10060424398134932, + "grad_norm": 1.4095430374145508, + "learning_rate": 4.876185017252322e-05, + "loss": 5.0055, + "step": 16916 + }, + { + "epoch": 0.10061019126463032, + "grad_norm": 1.319938063621521, + "learning_rate": 4.876170499225525e-05, + "loss": 5.0628, + "step": 16917 + }, + { + "epoch": 0.10061613854791131, + "grad_norm": 1.2126001119613647, + "learning_rate": 4.876155980369232e-05, + "loss": 5.4244, + "step": 16918 + }, + { + "epoch": 0.10062208583119231, + "grad_norm": 1.0456511974334717, + "learning_rate": 4.876141460683448e-05, + "loss": 5.2556, + "step": 16919 + }, + { + "epoch": 0.10062803311447331, + "grad_norm": 1.2545825242996216, + "learning_rate": 4.8761269401681765e-05, + "loss": 5.1549, + "step": 16920 + }, + { + "epoch": 0.1006339803977543, + "grad_norm": 1.3613678216934204, + "learning_rate": 4.876112418823424e-05, + "loss": 5.0592, + "step": 16921 + }, + { + "epoch": 0.1006399276810353, + "grad_norm": 1.4963204860687256, + "learning_rate": 4.876097896649196e-05, + "loss": 5.1025, + "step": 16922 + }, + { + "epoch": 0.1006458749643163, + "grad_norm": 1.3221436738967896, + "learning_rate": 4.876083373645495e-05, + "loss": 5.2534, + "step": 16923 + }, + { + "epoch": 0.10065182224759729, + "grad_norm": 1.6041839122772217, + "learning_rate": 4.8760688498123294e-05, + "loss": 5.3351, + "step": 16924 + }, + { + "epoch": 0.1006577695308783, + "grad_norm": 1.4891480207443237, + "learning_rate": 4.876054325149702e-05, + "loss": 5.4782, + "step": 16925 + }, + { + "epoch": 0.1006637168141593, + "grad_norm": 2.101271867752075, + "learning_rate": 4.876039799657619e-05, + "loss": 5.3844, + "step": 16926 + }, + { + "epoch": 0.10066966409744028, + "grad_norm": 1.5637247562408447, + "learning_rate": 4.8760252733360845e-05, + "loss": 5.4488, + "step": 16927 + }, + { + "epoch": 0.10067561138072129, + "grad_norm": 1.5939668416976929, + "learning_rate": 4.8760107461851044e-05, + "loss": 5.3429, + "step": 16928 + }, + { + "epoch": 0.10068155866400229, + "grad_norm": 1.509945273399353, + "learning_rate": 4.875996218204684e-05, + "loss": 5.4501, + "step": 16929 + }, + { + "epoch": 0.10068750594728328, + "grad_norm": 1.553009271621704, + "learning_rate": 4.875981689394827e-05, + "loss": 5.4183, + "step": 16930 + }, + { + "epoch": 0.10069345323056428, + "grad_norm": 1.5002714395523071, + "learning_rate": 4.875967159755539e-05, + "loss": 5.2343, + "step": 16931 + }, + { + "epoch": 0.10069940051384528, + "grad_norm": 1.5027118921279907, + "learning_rate": 4.8759526292868266e-05, + "loss": 5.4414, + "step": 16932 + }, + { + "epoch": 0.10070534779712627, + "grad_norm": 1.38532555103302, + "learning_rate": 4.875938097988694e-05, + "loss": 5.4026, + "step": 16933 + }, + { + "epoch": 0.10071129508040727, + "grad_norm": 1.4190242290496826, + "learning_rate": 4.8759235658611445e-05, + "loss": 5.346, + "step": 16934 + }, + { + "epoch": 0.10071724236368827, + "grad_norm": 1.291375756263733, + "learning_rate": 4.875909032904186e-05, + "loss": 5.3715, + "step": 16935 + }, + { + "epoch": 0.10072318964696926, + "grad_norm": 1.5563501119613647, + "learning_rate": 4.8758944991178214e-05, + "loss": 5.2474, + "step": 16936 + }, + { + "epoch": 0.10072913693025026, + "grad_norm": 1.2936631441116333, + "learning_rate": 4.875879964502056e-05, + "loss": 5.2627, + "step": 16937 + }, + { + "epoch": 0.10073508421353126, + "grad_norm": 1.5020617246627808, + "learning_rate": 4.875865429056896e-05, + "loss": 5.2166, + "step": 16938 + }, + { + "epoch": 0.10074103149681225, + "grad_norm": 1.4830302000045776, + "learning_rate": 4.8758508927823464e-05, + "loss": 5.2558, + "step": 16939 + }, + { + "epoch": 0.10074697878009325, + "grad_norm": 1.4259967803955078, + "learning_rate": 4.8758363556784114e-05, + "loss": 5.3117, + "step": 16940 + }, + { + "epoch": 0.10075292606337426, + "grad_norm": 1.5735303163528442, + "learning_rate": 4.875821817745096e-05, + "loss": 5.2993, + "step": 16941 + }, + { + "epoch": 0.10075887334665524, + "grad_norm": 1.6409742832183838, + "learning_rate": 4.875807278982407e-05, + "loss": 5.4337, + "step": 16942 + }, + { + "epoch": 0.10076482062993625, + "grad_norm": 1.5159885883331299, + "learning_rate": 4.875792739390347e-05, + "loss": 5.4222, + "step": 16943 + }, + { + "epoch": 0.10077076791321725, + "grad_norm": 1.704200029373169, + "learning_rate": 4.875778198968923e-05, + "loss": 5.5248, + "step": 16944 + }, + { + "epoch": 0.10077671519649824, + "grad_norm": 1.8533267974853516, + "learning_rate": 4.875763657718139e-05, + "loss": 5.2155, + "step": 16945 + }, + { + "epoch": 0.10078266247977924, + "grad_norm": 1.3260399103164673, + "learning_rate": 4.8757491156380006e-05, + "loss": 5.3239, + "step": 16946 + }, + { + "epoch": 0.10078860976306023, + "grad_norm": 1.317050814628601, + "learning_rate": 4.875734572728513e-05, + "loss": 5.2346, + "step": 16947 + }, + { + "epoch": 0.10079455704634123, + "grad_norm": 1.5583351850509644, + "learning_rate": 4.875720028989681e-05, + "loss": 5.194, + "step": 16948 + }, + { + "epoch": 0.10080050432962223, + "grad_norm": 1.3424546718597412, + "learning_rate": 4.8757054844215094e-05, + "loss": 5.3616, + "step": 16949 + }, + { + "epoch": 0.10080645161290322, + "grad_norm": 1.3151681423187256, + "learning_rate": 4.875690939024004e-05, + "loss": 5.2183, + "step": 16950 + }, + { + "epoch": 0.10081239889618422, + "grad_norm": 1.441724419593811, + "learning_rate": 4.875676392797168e-05, + "loss": 5.3292, + "step": 16951 + }, + { + "epoch": 0.10081834617946522, + "grad_norm": 1.3751790523529053, + "learning_rate": 4.87566184574101e-05, + "loss": 5.1747, + "step": 16952 + }, + { + "epoch": 0.10082429346274621, + "grad_norm": 1.5188177824020386, + "learning_rate": 4.8756472978555314e-05, + "loss": 5.2291, + "step": 16953 + }, + { + "epoch": 0.10083024074602721, + "grad_norm": 1.2834105491638184, + "learning_rate": 4.87563274914074e-05, + "loss": 5.1655, + "step": 16954 + }, + { + "epoch": 0.10083618802930822, + "grad_norm": 1.3950659036636353, + "learning_rate": 4.8756181995966385e-05, + "loss": 5.2318, + "step": 16955 + }, + { + "epoch": 0.1008421353125892, + "grad_norm": 1.3544670343399048, + "learning_rate": 4.875603649223234e-05, + "loss": 5.026, + "step": 16956 + }, + { + "epoch": 0.1008480825958702, + "grad_norm": 1.4849059581756592, + "learning_rate": 4.875589098020531e-05, + "loss": 5.2139, + "step": 16957 + }, + { + "epoch": 0.10085402987915121, + "grad_norm": 1.2032678127288818, + "learning_rate": 4.875574545988534e-05, + "loss": 5.3103, + "step": 16958 + }, + { + "epoch": 0.1008599771624322, + "grad_norm": 1.4803698062896729, + "learning_rate": 4.875559993127249e-05, + "loss": 5.2546, + "step": 16959 + }, + { + "epoch": 0.1008659244457132, + "grad_norm": 1.374115228652954, + "learning_rate": 4.8755454394366795e-05, + "loss": 5.1654, + "step": 16960 + }, + { + "epoch": 0.1008718717289942, + "grad_norm": 1.420754075050354, + "learning_rate": 4.875530884916832e-05, + "loss": 5.3368, + "step": 16961 + }, + { + "epoch": 0.10087781901227519, + "grad_norm": 1.3919636011123657, + "learning_rate": 4.875516329567712e-05, + "loss": 5.3053, + "step": 16962 + }, + { + "epoch": 0.10088376629555619, + "grad_norm": 1.2697970867156982, + "learning_rate": 4.8755017733893235e-05, + "loss": 5.1771, + "step": 16963 + }, + { + "epoch": 0.10088971357883719, + "grad_norm": 1.3521144390106201, + "learning_rate": 4.8754872163816714e-05, + "loss": 5.3226, + "step": 16964 + }, + { + "epoch": 0.10089566086211818, + "grad_norm": 1.4171572923660278, + "learning_rate": 4.875472658544761e-05, + "loss": 5.17, + "step": 16965 + }, + { + "epoch": 0.10090160814539918, + "grad_norm": 1.1771302223205566, + "learning_rate": 4.875458099878598e-05, + "loss": 5.2938, + "step": 16966 + }, + { + "epoch": 0.10090755542868018, + "grad_norm": 1.3881202936172485, + "learning_rate": 4.875443540383188e-05, + "loss": 5.2567, + "step": 16967 + }, + { + "epoch": 0.10091350271196117, + "grad_norm": 1.3272387981414795, + "learning_rate": 4.875428980058534e-05, + "loss": 5.2459, + "step": 16968 + }, + { + "epoch": 0.10091944999524217, + "grad_norm": 1.227569341659546, + "learning_rate": 4.875414418904643e-05, + "loss": 5.4037, + "step": 16969 + }, + { + "epoch": 0.10092539727852318, + "grad_norm": 1.6725070476531982, + "learning_rate": 4.875399856921519e-05, + "loss": 4.957, + "step": 16970 + }, + { + "epoch": 0.10093134456180416, + "grad_norm": 1.2896990776062012, + "learning_rate": 4.8753852941091676e-05, + "loss": 5.0245, + "step": 16971 + }, + { + "epoch": 0.10093729184508517, + "grad_norm": 1.4771101474761963, + "learning_rate": 4.8753707304675935e-05, + "loss": 5.007, + "step": 16972 + }, + { + "epoch": 0.10094323912836617, + "grad_norm": 1.5898420810699463, + "learning_rate": 4.8753561659968025e-05, + "loss": 5.2144, + "step": 16973 + }, + { + "epoch": 0.10094918641164716, + "grad_norm": 1.3972615003585815, + "learning_rate": 4.875341600696799e-05, + "loss": 5.0019, + "step": 16974 + }, + { + "epoch": 0.10095513369492816, + "grad_norm": 1.3663748502731323, + "learning_rate": 4.875327034567588e-05, + "loss": 5.3281, + "step": 16975 + }, + { + "epoch": 0.10096108097820915, + "grad_norm": 1.4441343545913696, + "learning_rate": 4.875312467609175e-05, + "loss": 5.3224, + "step": 16976 + }, + { + "epoch": 0.10096702826149015, + "grad_norm": 1.409233570098877, + "learning_rate": 4.875297899821565e-05, + "loss": 5.1244, + "step": 16977 + }, + { + "epoch": 0.10097297554477115, + "grad_norm": 1.286838412284851, + "learning_rate": 4.875283331204763e-05, + "loss": 5.187, + "step": 16978 + }, + { + "epoch": 0.10097892282805214, + "grad_norm": 1.3722141981124878, + "learning_rate": 4.8752687617587744e-05, + "loss": 5.1052, + "step": 16979 + }, + { + "epoch": 0.10098487011133314, + "grad_norm": 1.464938998222351, + "learning_rate": 4.8752541914836034e-05, + "loss": 5.2428, + "step": 16980 + }, + { + "epoch": 0.10099081739461414, + "grad_norm": 1.5051358938217163, + "learning_rate": 4.875239620379256e-05, + "loss": 5.204, + "step": 16981 + }, + { + "epoch": 0.10099676467789513, + "grad_norm": 1.374108076095581, + "learning_rate": 4.875225048445737e-05, + "loss": 5.4567, + "step": 16982 + }, + { + "epoch": 0.10100271196117613, + "grad_norm": 1.482023000717163, + "learning_rate": 4.875210475683052e-05, + "loss": 5.3605, + "step": 16983 + }, + { + "epoch": 0.10100865924445714, + "grad_norm": 1.429819107055664, + "learning_rate": 4.8751959020912056e-05, + "loss": 5.3351, + "step": 16984 + }, + { + "epoch": 0.10101460652773812, + "grad_norm": 1.3165935277938843, + "learning_rate": 4.875181327670202e-05, + "loss": 5.2705, + "step": 16985 + }, + { + "epoch": 0.10102055381101913, + "grad_norm": 1.4560794830322266, + "learning_rate": 4.8751667524200474e-05, + "loss": 5.313, + "step": 16986 + }, + { + "epoch": 0.10102650109430013, + "grad_norm": 1.5268526077270508, + "learning_rate": 4.875152176340747e-05, + "loss": 5.2432, + "step": 16987 + }, + { + "epoch": 0.10103244837758112, + "grad_norm": 1.8486063480377197, + "learning_rate": 4.875137599432305e-05, + "loss": 5.4951, + "step": 16988 + }, + { + "epoch": 0.10103839566086212, + "grad_norm": 1.5344970226287842, + "learning_rate": 4.875123021694727e-05, + "loss": 4.7321, + "step": 16989 + }, + { + "epoch": 0.10104434294414312, + "grad_norm": 1.5000940561294556, + "learning_rate": 4.8751084431280186e-05, + "loss": 5.1539, + "step": 16990 + }, + { + "epoch": 0.10105029022742411, + "grad_norm": 1.3047879934310913, + "learning_rate": 4.875093863732184e-05, + "loss": 5.1549, + "step": 16991 + }, + { + "epoch": 0.10105623751070511, + "grad_norm": 1.3496383428573608, + "learning_rate": 4.875079283507229e-05, + "loss": 5.0896, + "step": 16992 + }, + { + "epoch": 0.10106218479398611, + "grad_norm": 1.3492714166641235, + "learning_rate": 4.875064702453158e-05, + "loss": 5.0242, + "step": 16993 + }, + { + "epoch": 0.1010681320772671, + "grad_norm": 1.3479794263839722, + "learning_rate": 4.8750501205699766e-05, + "loss": 4.9653, + "step": 16994 + }, + { + "epoch": 0.1010740793605481, + "grad_norm": 1.4737683534622192, + "learning_rate": 4.87503553785769e-05, + "loss": 5.0082, + "step": 16995 + }, + { + "epoch": 0.1010800266438291, + "grad_norm": 1.335184931755066, + "learning_rate": 4.8750209543163026e-05, + "loss": 5.0068, + "step": 16996 + }, + { + "epoch": 0.10108597392711009, + "grad_norm": 1.3982423543930054, + "learning_rate": 4.87500636994582e-05, + "loss": 4.9958, + "step": 16997 + }, + { + "epoch": 0.1010919212103911, + "grad_norm": 1.4706374406814575, + "learning_rate": 4.874991784746248e-05, + "loss": 4.9776, + "step": 16998 + }, + { + "epoch": 0.1010978684936721, + "grad_norm": 1.4456995725631714, + "learning_rate": 4.8749771987175896e-05, + "loss": 5.1226, + "step": 16999 + }, + { + "epoch": 0.10110381577695308, + "grad_norm": 1.3827359676361084, + "learning_rate": 4.874962611859853e-05, + "loss": 5.0648, + "step": 17000 + }, + { + "epoch": 0.10110976306023409, + "grad_norm": 1.4089758396148682, + "learning_rate": 4.874948024173039e-05, + "loss": 5.0511, + "step": 17001 + }, + { + "epoch": 0.10111571034351509, + "grad_norm": 1.5135823488235474, + "learning_rate": 4.874933435657157e-05, + "loss": 5.1586, + "step": 17002 + }, + { + "epoch": 0.10112165762679608, + "grad_norm": 1.3575700521469116, + "learning_rate": 4.87491884631221e-05, + "loss": 5.4172, + "step": 17003 + }, + { + "epoch": 0.10112760491007708, + "grad_norm": 1.6240919828414917, + "learning_rate": 4.874904256138203e-05, + "loss": 4.8663, + "step": 17004 + }, + { + "epoch": 0.10113355219335807, + "grad_norm": 1.517287254333496, + "learning_rate": 4.8748896651351415e-05, + "loss": 5.2746, + "step": 17005 + }, + { + "epoch": 0.10113949947663907, + "grad_norm": 1.359541893005371, + "learning_rate": 4.87487507330303e-05, + "loss": 5.2497, + "step": 17006 + }, + { + "epoch": 0.10114544675992007, + "grad_norm": 1.608406901359558, + "learning_rate": 4.8748604806418755e-05, + "loss": 5.2789, + "step": 17007 + }, + { + "epoch": 0.10115139404320106, + "grad_norm": 1.5752578973770142, + "learning_rate": 4.874845887151681e-05, + "loss": 5.1583, + "step": 17008 + }, + { + "epoch": 0.10115734132648206, + "grad_norm": 1.5864077806472778, + "learning_rate": 4.8748312928324524e-05, + "loss": 5.2091, + "step": 17009 + }, + { + "epoch": 0.10116328860976306, + "grad_norm": 1.4714727401733398, + "learning_rate": 4.874816697684195e-05, + "loss": 5.2404, + "step": 17010 + }, + { + "epoch": 0.10116923589304405, + "grad_norm": 1.4676539897918701, + "learning_rate": 4.874802101706913e-05, + "loss": 5.3318, + "step": 17011 + }, + { + "epoch": 0.10117518317632505, + "grad_norm": 1.3290908336639404, + "learning_rate": 4.874787504900612e-05, + "loss": 5.0484, + "step": 17012 + }, + { + "epoch": 0.10118113045960606, + "grad_norm": 1.2661367654800415, + "learning_rate": 4.8747729072652984e-05, + "loss": 5.1857, + "step": 17013 + }, + { + "epoch": 0.10118707774288704, + "grad_norm": 1.2540318965911865, + "learning_rate": 4.874758308800975e-05, + "loss": 5.3025, + "step": 17014 + }, + { + "epoch": 0.10119302502616805, + "grad_norm": 1.2353893518447876, + "learning_rate": 4.874743709507649e-05, + "loss": 5.3613, + "step": 17015 + }, + { + "epoch": 0.10119897230944905, + "grad_norm": 1.2193371057510376, + "learning_rate": 4.874729109385323e-05, + "loss": 5.3029, + "step": 17016 + }, + { + "epoch": 0.10120491959273004, + "grad_norm": 1.2443112134933472, + "learning_rate": 4.874714508434005e-05, + "loss": 5.3667, + "step": 17017 + }, + { + "epoch": 0.10121086687601104, + "grad_norm": 1.4194598197937012, + "learning_rate": 4.874699906653698e-05, + "loss": 5.5583, + "step": 17018 + }, + { + "epoch": 0.10121681415929204, + "grad_norm": 1.4791369438171387, + "learning_rate": 4.874685304044408e-05, + "loss": 5.2797, + "step": 17019 + }, + { + "epoch": 0.10122276144257303, + "grad_norm": 1.4528671503067017, + "learning_rate": 4.87467070060614e-05, + "loss": 5.1261, + "step": 17020 + }, + { + "epoch": 0.10122870872585403, + "grad_norm": 1.2694898843765259, + "learning_rate": 4.8746560963388985e-05, + "loss": 5.3817, + "step": 17021 + }, + { + "epoch": 0.10123465600913503, + "grad_norm": 1.6012862920761108, + "learning_rate": 4.8746414912426896e-05, + "loss": 4.962, + "step": 17022 + }, + { + "epoch": 0.10124060329241602, + "grad_norm": 1.6179730892181396, + "learning_rate": 4.874626885317518e-05, + "loss": 4.6365, + "step": 17023 + }, + { + "epoch": 0.10124655057569702, + "grad_norm": 1.4522144794464111, + "learning_rate": 4.8746122785633885e-05, + "loss": 4.8943, + "step": 17024 + }, + { + "epoch": 0.10125249785897802, + "grad_norm": 1.6087841987609863, + "learning_rate": 4.8745976709803064e-05, + "loss": 4.81, + "step": 17025 + }, + { + "epoch": 0.10125844514225901, + "grad_norm": 1.424810767173767, + "learning_rate": 4.8745830625682766e-05, + "loss": 4.8699, + "step": 17026 + }, + { + "epoch": 0.10126439242554001, + "grad_norm": 1.3316916227340698, + "learning_rate": 4.874568453327304e-05, + "loss": 5.0084, + "step": 17027 + }, + { + "epoch": 0.10127033970882102, + "grad_norm": 1.549833059310913, + "learning_rate": 4.8745538432573946e-05, + "loss": 4.748, + "step": 17028 + }, + { + "epoch": 0.101276286992102, + "grad_norm": 1.294263482093811, + "learning_rate": 4.874539232358553e-05, + "loss": 4.8004, + "step": 17029 + }, + { + "epoch": 0.101282234275383, + "grad_norm": 1.5209519863128662, + "learning_rate": 4.8745246206307845e-05, + "loss": 4.8187, + "step": 17030 + }, + { + "epoch": 0.10128818155866401, + "grad_norm": 1.5805583000183105, + "learning_rate": 4.874510008074094e-05, + "loss": 4.7126, + "step": 17031 + }, + { + "epoch": 0.101294128841945, + "grad_norm": 1.473693609237671, + "learning_rate": 4.8744953946884864e-05, + "loss": 4.86, + "step": 17032 + }, + { + "epoch": 0.101300076125226, + "grad_norm": 1.6662403345108032, + "learning_rate": 4.8744807804739664e-05, + "loss": 4.8903, + "step": 17033 + }, + { + "epoch": 0.10130602340850699, + "grad_norm": 1.5269529819488525, + "learning_rate": 4.87446616543054e-05, + "loss": 5.1061, + "step": 17034 + }, + { + "epoch": 0.10131197069178799, + "grad_norm": 1.3940715789794922, + "learning_rate": 4.8744515495582127e-05, + "loss": 5.3221, + "step": 17035 + }, + { + "epoch": 0.10131791797506899, + "grad_norm": 1.4603626728057861, + "learning_rate": 4.874436932856988e-05, + "loss": 5.2562, + "step": 17036 + }, + { + "epoch": 0.10132386525834998, + "grad_norm": 1.4601393938064575, + "learning_rate": 4.874422315326873e-05, + "loss": 5.1297, + "step": 17037 + }, + { + "epoch": 0.10132981254163098, + "grad_norm": 1.3284024000167847, + "learning_rate": 4.874407696967871e-05, + "loss": 5.2209, + "step": 17038 + }, + { + "epoch": 0.10133575982491198, + "grad_norm": 1.1924611330032349, + "learning_rate": 4.874393077779987e-05, + "loss": 5.265, + "step": 17039 + }, + { + "epoch": 0.10134170710819297, + "grad_norm": 1.1306421756744385, + "learning_rate": 4.874378457763228e-05, + "loss": 5.1637, + "step": 17040 + }, + { + "epoch": 0.10134765439147397, + "grad_norm": 1.414591908454895, + "learning_rate": 4.874363836917598e-05, + "loss": 5.1238, + "step": 17041 + }, + { + "epoch": 0.10135360167475498, + "grad_norm": 1.245263934135437, + "learning_rate": 4.8743492152431016e-05, + "loss": 5.1779, + "step": 17042 + }, + { + "epoch": 0.10135954895803596, + "grad_norm": 1.363484501838684, + "learning_rate": 4.874334592739745e-05, + "loss": 5.1328, + "step": 17043 + }, + { + "epoch": 0.10136549624131697, + "grad_norm": 1.3666833639144897, + "learning_rate": 4.8743199694075326e-05, + "loss": 5.2547, + "step": 17044 + }, + { + "epoch": 0.10137144352459797, + "grad_norm": 1.3848010301589966, + "learning_rate": 4.8743053452464694e-05, + "loss": 5.2745, + "step": 17045 + }, + { + "epoch": 0.10137739080787896, + "grad_norm": 1.4478403329849243, + "learning_rate": 4.87429072025656e-05, + "loss": 5.2069, + "step": 17046 + }, + { + "epoch": 0.10138333809115996, + "grad_norm": 1.5361924171447754, + "learning_rate": 4.8742760944378115e-05, + "loss": 5.1721, + "step": 17047 + }, + { + "epoch": 0.10138928537444096, + "grad_norm": 1.549049973487854, + "learning_rate": 4.874261467790227e-05, + "loss": 5.2525, + "step": 17048 + }, + { + "epoch": 0.10139523265772195, + "grad_norm": 1.484999656677246, + "learning_rate": 4.874246840313813e-05, + "loss": 5.2433, + "step": 17049 + }, + { + "epoch": 0.10140117994100295, + "grad_norm": 1.58607017993927, + "learning_rate": 4.8742322120085734e-05, + "loss": 4.9631, + "step": 17050 + }, + { + "epoch": 0.10140712722428395, + "grad_norm": 1.1922807693481445, + "learning_rate": 4.874217582874514e-05, + "loss": 5.1917, + "step": 17051 + }, + { + "epoch": 0.10141307450756494, + "grad_norm": 1.1538786888122559, + "learning_rate": 4.87420295291164e-05, + "loss": 5.0231, + "step": 17052 + }, + { + "epoch": 0.10141902179084594, + "grad_norm": 1.302758812904358, + "learning_rate": 4.874188322119956e-05, + "loss": 5.0292, + "step": 17053 + }, + { + "epoch": 0.10142496907412694, + "grad_norm": 1.2432395219802856, + "learning_rate": 4.874173690499467e-05, + "loss": 5.1671, + "step": 17054 + }, + { + "epoch": 0.10143091635740793, + "grad_norm": 1.3793164491653442, + "learning_rate": 4.8741590580501786e-05, + "loss": 5.2231, + "step": 17055 + }, + { + "epoch": 0.10143686364068893, + "grad_norm": 1.3487818241119385, + "learning_rate": 4.8741444247720966e-05, + "loss": 5.0464, + "step": 17056 + }, + { + "epoch": 0.10144281092396994, + "grad_norm": 1.512860894203186, + "learning_rate": 4.874129790665225e-05, + "loss": 4.8973, + "step": 17057 + }, + { + "epoch": 0.10144875820725092, + "grad_norm": 1.6202374696731567, + "learning_rate": 4.874115155729569e-05, + "loss": 5.0055, + "step": 17058 + }, + { + "epoch": 0.10145470549053193, + "grad_norm": 1.3453385829925537, + "learning_rate": 4.874100519965134e-05, + "loss": 4.7808, + "step": 17059 + }, + { + "epoch": 0.10146065277381293, + "grad_norm": 1.4613635540008545, + "learning_rate": 4.874085883371925e-05, + "loss": 4.8073, + "step": 17060 + }, + { + "epoch": 0.10146660005709392, + "grad_norm": 1.3086074590682983, + "learning_rate": 4.874071245949947e-05, + "loss": 4.9751, + "step": 17061 + }, + { + "epoch": 0.10147254734037492, + "grad_norm": 1.454784631729126, + "learning_rate": 4.8740566076992055e-05, + "loss": 5.2422, + "step": 17062 + }, + { + "epoch": 0.10147849462365591, + "grad_norm": 1.3406941890716553, + "learning_rate": 4.8740419686197054e-05, + "loss": 5.2342, + "step": 17063 + }, + { + "epoch": 0.10148444190693691, + "grad_norm": 1.3241393566131592, + "learning_rate": 4.8740273287114514e-05, + "loss": 5.2168, + "step": 17064 + }, + { + "epoch": 0.10149038919021791, + "grad_norm": 1.2292134761810303, + "learning_rate": 4.8740126879744495e-05, + "loss": 5.171, + "step": 17065 + }, + { + "epoch": 0.1014963364734989, + "grad_norm": 1.395484209060669, + "learning_rate": 4.8739980464087044e-05, + "loss": 5.1782, + "step": 17066 + }, + { + "epoch": 0.1015022837567799, + "grad_norm": 1.8667857646942139, + "learning_rate": 4.87398340401422e-05, + "loss": 5.7113, + "step": 17067 + }, + { + "epoch": 0.1015082310400609, + "grad_norm": 1.4775335788726807, + "learning_rate": 4.873968760791003e-05, + "loss": 5.2518, + "step": 17068 + }, + { + "epoch": 0.10151417832334189, + "grad_norm": 1.5058828592300415, + "learning_rate": 4.873954116739059e-05, + "loss": 5.3249, + "step": 17069 + }, + { + "epoch": 0.1015201256066229, + "grad_norm": 1.4806468486785889, + "learning_rate": 4.873939471858391e-05, + "loss": 5.1119, + "step": 17070 + }, + { + "epoch": 0.1015260728899039, + "grad_norm": 1.3866868019104004, + "learning_rate": 4.873924826149006e-05, + "loss": 5.1709, + "step": 17071 + }, + { + "epoch": 0.10153202017318488, + "grad_norm": 1.2337566614151, + "learning_rate": 4.8739101796109074e-05, + "loss": 5.2346, + "step": 17072 + }, + { + "epoch": 0.10153796745646589, + "grad_norm": 1.5977396965026855, + "learning_rate": 4.873895532244103e-05, + "loss": 5.4213, + "step": 17073 + }, + { + "epoch": 0.10154391473974689, + "grad_norm": 1.343363642692566, + "learning_rate": 4.873880884048595e-05, + "loss": 5.2865, + "step": 17074 + }, + { + "epoch": 0.10154986202302788, + "grad_norm": 1.4759324789047241, + "learning_rate": 4.87386623502439e-05, + "loss": 5.1743, + "step": 17075 + }, + { + "epoch": 0.10155580930630888, + "grad_norm": 1.2113150358200073, + "learning_rate": 4.873851585171493e-05, + "loss": 5.2218, + "step": 17076 + }, + { + "epoch": 0.10156175658958988, + "grad_norm": 1.3962153196334839, + "learning_rate": 4.873836934489908e-05, + "loss": 5.1031, + "step": 17077 + }, + { + "epoch": 0.10156770387287087, + "grad_norm": 1.410144329071045, + "learning_rate": 4.8738222829796424e-05, + "loss": 5.0662, + "step": 17078 + }, + { + "epoch": 0.10157365115615187, + "grad_norm": 1.224947452545166, + "learning_rate": 4.873807630640699e-05, + "loss": 5.1583, + "step": 17079 + }, + { + "epoch": 0.10157959843943287, + "grad_norm": 1.401877522468567, + "learning_rate": 4.873792977473084e-05, + "loss": 5.2688, + "step": 17080 + }, + { + "epoch": 0.10158554572271386, + "grad_norm": 1.3576874732971191, + "learning_rate": 4.873778323476802e-05, + "loss": 5.037, + "step": 17081 + }, + { + "epoch": 0.10159149300599486, + "grad_norm": 1.226619839668274, + "learning_rate": 4.8737636686518595e-05, + "loss": 5.0502, + "step": 17082 + }, + { + "epoch": 0.10159744028927586, + "grad_norm": 1.2307099103927612, + "learning_rate": 4.87374901299826e-05, + "loss": 5.0855, + "step": 17083 + }, + { + "epoch": 0.10160338757255685, + "grad_norm": 1.1481422185897827, + "learning_rate": 4.873734356516009e-05, + "loss": 5.2114, + "step": 17084 + }, + { + "epoch": 0.10160933485583785, + "grad_norm": 1.4645094871520996, + "learning_rate": 4.873719699205113e-05, + "loss": 5.1432, + "step": 17085 + }, + { + "epoch": 0.10161528213911886, + "grad_norm": 1.3309158086776733, + "learning_rate": 4.873705041065575e-05, + "loss": 5.1557, + "step": 17086 + }, + { + "epoch": 0.10162122942239984, + "grad_norm": 1.2546007633209229, + "learning_rate": 4.873690382097401e-05, + "loss": 5.324, + "step": 17087 + }, + { + "epoch": 0.10162717670568085, + "grad_norm": 1.33823561668396, + "learning_rate": 4.873675722300597e-05, + "loss": 5.1773, + "step": 17088 + }, + { + "epoch": 0.10163312398896185, + "grad_norm": 1.3027381896972656, + "learning_rate": 4.873661061675166e-05, + "loss": 5.4172, + "step": 17089 + }, + { + "epoch": 0.10163907127224284, + "grad_norm": 1.3852121829986572, + "learning_rate": 4.873646400221116e-05, + "loss": 5.1655, + "step": 17090 + }, + { + "epoch": 0.10164501855552384, + "grad_norm": 1.4345825910568237, + "learning_rate": 4.87363173793845e-05, + "loss": 4.9941, + "step": 17091 + }, + { + "epoch": 0.10165096583880483, + "grad_norm": 1.4016261100769043, + "learning_rate": 4.873617074827173e-05, + "loss": 4.9657, + "step": 17092 + }, + { + "epoch": 0.10165691312208583, + "grad_norm": 1.339082956314087, + "learning_rate": 4.8736024108872914e-05, + "loss": 5.0075, + "step": 17093 + }, + { + "epoch": 0.10166286040536683, + "grad_norm": 1.3223985433578491, + "learning_rate": 4.8735877461188094e-05, + "loss": 4.9656, + "step": 17094 + }, + { + "epoch": 0.10166880768864782, + "grad_norm": 1.4618138074874878, + "learning_rate": 4.8735730805217326e-05, + "loss": 5.0158, + "step": 17095 + }, + { + "epoch": 0.10167475497192882, + "grad_norm": 1.4075788259506226, + "learning_rate": 4.8735584140960666e-05, + "loss": 5.3668, + "step": 17096 + }, + { + "epoch": 0.10168070225520982, + "grad_norm": 1.2219016551971436, + "learning_rate": 4.873543746841815e-05, + "loss": 5.3549, + "step": 17097 + }, + { + "epoch": 0.10168664953849081, + "grad_norm": 1.4344584941864014, + "learning_rate": 4.873529078758985e-05, + "loss": 5.2044, + "step": 17098 + }, + { + "epoch": 0.10169259682177181, + "grad_norm": 1.3579001426696777, + "learning_rate": 4.8735144098475794e-05, + "loss": 5.1071, + "step": 17099 + }, + { + "epoch": 0.10169854410505282, + "grad_norm": 1.4645969867706299, + "learning_rate": 4.873499740107604e-05, + "loss": 5.0359, + "step": 17100 + }, + { + "epoch": 0.1017044913883338, + "grad_norm": 1.6800013780593872, + "learning_rate": 4.8734850695390654e-05, + "loss": 5.2085, + "step": 17101 + }, + { + "epoch": 0.1017104386716148, + "grad_norm": 1.678339958190918, + "learning_rate": 4.873470398141968e-05, + "loss": 5.1671, + "step": 17102 + }, + { + "epoch": 0.10171638595489581, + "grad_norm": 1.6498647928237915, + "learning_rate": 4.873455725916316e-05, + "loss": 5.2105, + "step": 17103 + }, + { + "epoch": 0.1017223332381768, + "grad_norm": 1.522147297859192, + "learning_rate": 4.873441052862115e-05, + "loss": 5.1215, + "step": 17104 + }, + { + "epoch": 0.1017282805214578, + "grad_norm": 1.3335652351379395, + "learning_rate": 4.87342637897937e-05, + "loss": 5.2504, + "step": 17105 + }, + { + "epoch": 0.1017342278047388, + "grad_norm": 1.1647717952728271, + "learning_rate": 4.873411704268087e-05, + "loss": 5.3183, + "step": 17106 + }, + { + "epoch": 0.10174017508801979, + "grad_norm": 1.3210188150405884, + "learning_rate": 4.8733970287282706e-05, + "loss": 5.399, + "step": 17107 + }, + { + "epoch": 0.10174612237130079, + "grad_norm": 1.2331137657165527, + "learning_rate": 4.873382352359925e-05, + "loss": 5.2521, + "step": 17108 + }, + { + "epoch": 0.10175206965458179, + "grad_norm": 1.245252251625061, + "learning_rate": 4.873367675163056e-05, + "loss": 5.2092, + "step": 17109 + }, + { + "epoch": 0.10175801693786278, + "grad_norm": 1.3423751592636108, + "learning_rate": 4.87335299713767e-05, + "loss": 4.918, + "step": 17110 + }, + { + "epoch": 0.10176396422114378, + "grad_norm": 1.8670060634613037, + "learning_rate": 4.87333831828377e-05, + "loss": 4.6559, + "step": 17111 + }, + { + "epoch": 0.10176991150442478, + "grad_norm": 1.54763925075531, + "learning_rate": 4.873323638601363e-05, + "loss": 5.2565, + "step": 17112 + }, + { + "epoch": 0.10177585878770577, + "grad_norm": 1.134102702140808, + "learning_rate": 4.8733089580904525e-05, + "loss": 5.2119, + "step": 17113 + }, + { + "epoch": 0.10178180607098677, + "grad_norm": 1.395027756690979, + "learning_rate": 4.873294276751045e-05, + "loss": 5.0732, + "step": 17114 + }, + { + "epoch": 0.10178775335426778, + "grad_norm": 1.104973554611206, + "learning_rate": 4.873279594583144e-05, + "loss": 5.0807, + "step": 17115 + }, + { + "epoch": 0.10179370063754876, + "grad_norm": 1.0554969310760498, + "learning_rate": 4.873264911586757e-05, + "loss": 5.0831, + "step": 17116 + }, + { + "epoch": 0.10179964792082977, + "grad_norm": 1.0598722696304321, + "learning_rate": 4.873250227761887e-05, + "loss": 5.1264, + "step": 17117 + }, + { + "epoch": 0.10180559520411077, + "grad_norm": 1.1047697067260742, + "learning_rate": 4.8732355431085395e-05, + "loss": 5.0687, + "step": 17118 + }, + { + "epoch": 0.10181154248739176, + "grad_norm": 1.5564457178115845, + "learning_rate": 4.87322085762672e-05, + "loss": 5.0063, + "step": 17119 + }, + { + "epoch": 0.10181748977067276, + "grad_norm": 1.5218400955200195, + "learning_rate": 4.8732061713164344e-05, + "loss": 5.3785, + "step": 17120 + }, + { + "epoch": 0.10182343705395375, + "grad_norm": 1.3067396879196167, + "learning_rate": 4.873191484177686e-05, + "loss": 5.4108, + "step": 17121 + }, + { + "epoch": 0.10182938433723475, + "grad_norm": 1.4401333332061768, + "learning_rate": 4.873176796210482e-05, + "loss": 5.5251, + "step": 17122 + }, + { + "epoch": 0.10183533162051575, + "grad_norm": 1.0483810901641846, + "learning_rate": 4.873162107414826e-05, + "loss": 5.4983, + "step": 17123 + }, + { + "epoch": 0.10184127890379674, + "grad_norm": 1.2637344598770142, + "learning_rate": 4.8731474177907244e-05, + "loss": 5.4487, + "step": 17124 + }, + { + "epoch": 0.10184722618707774, + "grad_norm": 1.314834475517273, + "learning_rate": 4.873132727338181e-05, + "loss": 5.228, + "step": 17125 + }, + { + "epoch": 0.10185317347035874, + "grad_norm": 1.354665756225586, + "learning_rate": 4.8731180360572e-05, + "loss": 5.3908, + "step": 17126 + }, + { + "epoch": 0.10185912075363973, + "grad_norm": 1.3690662384033203, + "learning_rate": 4.87310334394779e-05, + "loss": 5.0955, + "step": 17127 + }, + { + "epoch": 0.10186506803692073, + "grad_norm": 1.5240978002548218, + "learning_rate": 4.873088651009954e-05, + "loss": 5.2838, + "step": 17128 + }, + { + "epoch": 0.10187101532020174, + "grad_norm": 1.147658109664917, + "learning_rate": 4.8730739572436966e-05, + "loss": 5.3074, + "step": 17129 + }, + { + "epoch": 0.10187696260348272, + "grad_norm": 1.3384162187576294, + "learning_rate": 4.8730592626490235e-05, + "loss": 5.3677, + "step": 17130 + }, + { + "epoch": 0.10188290988676373, + "grad_norm": 1.3388500213623047, + "learning_rate": 4.87304456722594e-05, + "loss": 5.3151, + "step": 17131 + }, + { + "epoch": 0.10188885717004473, + "grad_norm": 1.215617060661316, + "learning_rate": 4.873029870974452e-05, + "loss": 4.9182, + "step": 17132 + }, + { + "epoch": 0.10189480445332572, + "grad_norm": 1.2983050346374512, + "learning_rate": 4.873015173894563e-05, + "loss": 5.142, + "step": 17133 + }, + { + "epoch": 0.10190075173660672, + "grad_norm": 1.3918750286102295, + "learning_rate": 4.873000475986279e-05, + "loss": 5.0548, + "step": 17134 + }, + { + "epoch": 0.10190669901988772, + "grad_norm": 1.3934828042984009, + "learning_rate": 4.8729857772496045e-05, + "loss": 5.1319, + "step": 17135 + }, + { + "epoch": 0.10191264630316871, + "grad_norm": 1.32583487033844, + "learning_rate": 4.872971077684546e-05, + "loss": 5.2762, + "step": 17136 + }, + { + "epoch": 0.10191859358644971, + "grad_norm": 1.295102834701538, + "learning_rate": 4.872956377291108e-05, + "loss": 5.2338, + "step": 17137 + }, + { + "epoch": 0.10192454086973071, + "grad_norm": 1.2840588092803955, + "learning_rate": 4.8729416760692946e-05, + "loss": 5.3957, + "step": 17138 + }, + { + "epoch": 0.1019304881530117, + "grad_norm": 1.371270775794983, + "learning_rate": 4.872926974019112e-05, + "loss": 5.5933, + "step": 17139 + }, + { + "epoch": 0.1019364354362927, + "grad_norm": 1.380387783050537, + "learning_rate": 4.872912271140565e-05, + "loss": 5.6628, + "step": 17140 + }, + { + "epoch": 0.1019423827195737, + "grad_norm": 1.3120551109313965, + "learning_rate": 4.8728975674336596e-05, + "loss": 5.6424, + "step": 17141 + }, + { + "epoch": 0.10194833000285469, + "grad_norm": 1.3965035676956177, + "learning_rate": 4.8728828628984003e-05, + "loss": 5.5413, + "step": 17142 + }, + { + "epoch": 0.1019542772861357, + "grad_norm": 1.5870885848999023, + "learning_rate": 4.872868157534791e-05, + "loss": 5.1952, + "step": 17143 + }, + { + "epoch": 0.1019602245694167, + "grad_norm": 1.584633231163025, + "learning_rate": 4.872853451342839e-05, + "loss": 5.1045, + "step": 17144 + }, + { + "epoch": 0.10196617185269768, + "grad_norm": 1.5781641006469727, + "learning_rate": 4.872838744322548e-05, + "loss": 4.9581, + "step": 17145 + }, + { + "epoch": 0.10197211913597869, + "grad_norm": 1.3683301210403442, + "learning_rate": 4.872824036473923e-05, + "loss": 4.9931, + "step": 17146 + }, + { + "epoch": 0.10197806641925969, + "grad_norm": 1.4182472229003906, + "learning_rate": 4.87280932779697e-05, + "loss": 4.7815, + "step": 17147 + }, + { + "epoch": 0.10198401370254068, + "grad_norm": 1.464609146118164, + "learning_rate": 4.872794618291694e-05, + "loss": 4.9158, + "step": 17148 + }, + { + "epoch": 0.10198996098582168, + "grad_norm": 1.4733667373657227, + "learning_rate": 4.872779907958099e-05, + "loss": 5.069, + "step": 17149 + }, + { + "epoch": 0.10199590826910268, + "grad_norm": 1.4454584121704102, + "learning_rate": 4.872765196796192e-05, + "loss": 5.1131, + "step": 17150 + }, + { + "epoch": 0.10200185555238367, + "grad_norm": 1.6175665855407715, + "learning_rate": 4.872750484805977e-05, + "loss": 4.9432, + "step": 17151 + }, + { + "epoch": 0.10200780283566467, + "grad_norm": 1.378569483757019, + "learning_rate": 4.872735771987459e-05, + "loss": 4.9243, + "step": 17152 + }, + { + "epoch": 0.10201375011894566, + "grad_norm": 1.452481985092163, + "learning_rate": 4.872721058340644e-05, + "loss": 4.8421, + "step": 17153 + }, + { + "epoch": 0.10201969740222666, + "grad_norm": 1.8265782594680786, + "learning_rate": 4.872706343865536e-05, + "loss": 5.2555, + "step": 17154 + }, + { + "epoch": 0.10202564468550766, + "grad_norm": 1.6913262605667114, + "learning_rate": 4.8726916285621414e-05, + "loss": 5.3829, + "step": 17155 + }, + { + "epoch": 0.10203159196878865, + "grad_norm": 1.6480923891067505, + "learning_rate": 4.8726769124304644e-05, + "loss": 5.4168, + "step": 17156 + }, + { + "epoch": 0.10203753925206965, + "grad_norm": 1.702602744102478, + "learning_rate": 4.8726621954705105e-05, + "loss": 5.4045, + "step": 17157 + }, + { + "epoch": 0.10204348653535066, + "grad_norm": 1.749205470085144, + "learning_rate": 4.8726474776822844e-05, + "loss": 5.5886, + "step": 17158 + }, + { + "epoch": 0.10204943381863164, + "grad_norm": 1.927309274673462, + "learning_rate": 4.8726327590657916e-05, + "loss": 5.5547, + "step": 17159 + }, + { + "epoch": 0.10205538110191265, + "grad_norm": 1.6493511199951172, + "learning_rate": 4.8726180396210374e-05, + "loss": 5.6764, + "step": 17160 + }, + { + "epoch": 0.10206132838519365, + "grad_norm": 1.7083081007003784, + "learning_rate": 4.8726033193480266e-05, + "loss": 5.5823, + "step": 17161 + }, + { + "epoch": 0.10206727566847464, + "grad_norm": 1.7882472276687622, + "learning_rate": 4.872588598246765e-05, + "loss": 5.4388, + "step": 17162 + }, + { + "epoch": 0.10207322295175564, + "grad_norm": 1.6043784618377686, + "learning_rate": 4.872573876317257e-05, + "loss": 5.6816, + "step": 17163 + }, + { + "epoch": 0.10207917023503664, + "grad_norm": 1.3449418544769287, + "learning_rate": 4.872559153559507e-05, + "loss": 5.5661, + "step": 17164 + }, + { + "epoch": 0.10208511751831763, + "grad_norm": 1.7593882083892822, + "learning_rate": 4.8725444299735226e-05, + "loss": 4.95, + "step": 17165 + }, + { + "epoch": 0.10209106480159863, + "grad_norm": 1.8593993186950684, + "learning_rate": 4.872529705559307e-05, + "loss": 5.3296, + "step": 17166 + }, + { + "epoch": 0.10209701208487963, + "grad_norm": 1.7530159950256348, + "learning_rate": 4.872514980316865e-05, + "loss": 5.4378, + "step": 17167 + }, + { + "epoch": 0.10210295936816062, + "grad_norm": 1.7487550973892212, + "learning_rate": 4.872500254246203e-05, + "loss": 5.3435, + "step": 17168 + }, + { + "epoch": 0.10210890665144162, + "grad_norm": 1.7868090867996216, + "learning_rate": 4.8724855273473256e-05, + "loss": 5.2266, + "step": 17169 + }, + { + "epoch": 0.10211485393472262, + "grad_norm": 1.6116459369659424, + "learning_rate": 4.872470799620238e-05, + "loss": 5.2394, + "step": 17170 + }, + { + "epoch": 0.10212080121800361, + "grad_norm": 1.6221721172332764, + "learning_rate": 4.872456071064946e-05, + "loss": 5.823, + "step": 17171 + }, + { + "epoch": 0.10212674850128461, + "grad_norm": 1.462540626525879, + "learning_rate": 4.872441341681454e-05, + "loss": 5.8816, + "step": 17172 + }, + { + "epoch": 0.10213269578456562, + "grad_norm": 1.3804352283477783, + "learning_rate": 4.872426611469766e-05, + "loss": 5.7982, + "step": 17173 + }, + { + "epoch": 0.1021386430678466, + "grad_norm": 1.7873106002807617, + "learning_rate": 4.872411880429889e-05, + "loss": 5.0282, + "step": 17174 + }, + { + "epoch": 0.1021445903511276, + "grad_norm": 1.9154506921768188, + "learning_rate": 4.8723971485618284e-05, + "loss": 4.8535, + "step": 17175 + }, + { + "epoch": 0.10215053763440861, + "grad_norm": 1.865502953529358, + "learning_rate": 4.872382415865587e-05, + "loss": 5.5282, + "step": 17176 + }, + { + "epoch": 0.1021564849176896, + "grad_norm": 1.8683371543884277, + "learning_rate": 4.872367682341173e-05, + "loss": 5.2973, + "step": 17177 + }, + { + "epoch": 0.1021624322009706, + "grad_norm": 1.8488374948501587, + "learning_rate": 4.872352947988589e-05, + "loss": 5.4094, + "step": 17178 + }, + { + "epoch": 0.1021683794842516, + "grad_norm": 1.6702567338943481, + "learning_rate": 4.872338212807841e-05, + "loss": 5.5705, + "step": 17179 + }, + { + "epoch": 0.10217432676753259, + "grad_norm": 1.6559606790542603, + "learning_rate": 4.8723234767989345e-05, + "loss": 5.6637, + "step": 17180 + }, + { + "epoch": 0.10218027405081359, + "grad_norm": 1.523253321647644, + "learning_rate": 4.872308739961875e-05, + "loss": 5.4033, + "step": 17181 + }, + { + "epoch": 0.10218622133409458, + "grad_norm": 1.4300789833068848, + "learning_rate": 4.8722940022966665e-05, + "loss": 5.7568, + "step": 17182 + }, + { + "epoch": 0.10219216861737558, + "grad_norm": 1.5076279640197754, + "learning_rate": 4.872279263803314e-05, + "loss": 4.9469, + "step": 17183 + }, + { + "epoch": 0.10219811590065658, + "grad_norm": 1.721596598625183, + "learning_rate": 4.872264524481824e-05, + "loss": 5.1595, + "step": 17184 + }, + { + "epoch": 0.10220406318393757, + "grad_norm": 1.5876305103302002, + "learning_rate": 4.872249784332201e-05, + "loss": 4.9964, + "step": 17185 + }, + { + "epoch": 0.10221001046721857, + "grad_norm": 1.6709486246109009, + "learning_rate": 4.87223504335445e-05, + "loss": 5.0299, + "step": 17186 + }, + { + "epoch": 0.10221595775049958, + "grad_norm": 1.586411952972412, + "learning_rate": 4.872220301548576e-05, + "loss": 4.9945, + "step": 17187 + }, + { + "epoch": 0.10222190503378056, + "grad_norm": 1.541045069694519, + "learning_rate": 4.872205558914585e-05, + "loss": 4.8789, + "step": 17188 + }, + { + "epoch": 0.10222785231706157, + "grad_norm": 1.8977370262145996, + "learning_rate": 4.872190815452481e-05, + "loss": 4.849, + "step": 17189 + }, + { + "epoch": 0.10223379960034257, + "grad_norm": 1.7448357343673706, + "learning_rate": 4.87217607116227e-05, + "loss": 4.7961, + "step": 17190 + }, + { + "epoch": 0.10223974688362356, + "grad_norm": 1.7249553203582764, + "learning_rate": 4.872161326043957e-05, + "loss": 4.7988, + "step": 17191 + }, + { + "epoch": 0.10224569416690456, + "grad_norm": 1.6894437074661255, + "learning_rate": 4.8721465800975465e-05, + "loss": 4.6713, + "step": 17192 + }, + { + "epoch": 0.10225164145018556, + "grad_norm": 1.5226197242736816, + "learning_rate": 4.8721318333230446e-05, + "loss": 4.8233, + "step": 17193 + }, + { + "epoch": 0.10225758873346655, + "grad_norm": 1.6511256694793701, + "learning_rate": 4.8721170857204554e-05, + "loss": 5.177, + "step": 17194 + }, + { + "epoch": 0.10226353601674755, + "grad_norm": 1.8213993310928345, + "learning_rate": 4.872102337289785e-05, + "loss": 5.2472, + "step": 17195 + }, + { + "epoch": 0.10226948330002855, + "grad_norm": 1.6683803796768188, + "learning_rate": 4.872087588031038e-05, + "loss": 4.7902, + "step": 17196 + }, + { + "epoch": 0.10227543058330954, + "grad_norm": 1.5809015035629272, + "learning_rate": 4.8720728379442204e-05, + "loss": 4.6288, + "step": 17197 + }, + { + "epoch": 0.10228137786659054, + "grad_norm": 1.7978498935699463, + "learning_rate": 4.872058087029336e-05, + "loss": 4.6638, + "step": 17198 + }, + { + "epoch": 0.10228732514987154, + "grad_norm": 1.74656081199646, + "learning_rate": 4.87204333528639e-05, + "loss": 5.652, + "step": 17199 + }, + { + "epoch": 0.10229327243315253, + "grad_norm": 1.6222811937332153, + "learning_rate": 4.87202858271539e-05, + "loss": 5.3951, + "step": 17200 + }, + { + "epoch": 0.10229921971643353, + "grad_norm": 1.8816531896591187, + "learning_rate": 4.8720138293163374e-05, + "loss": 5.728, + "step": 17201 + }, + { + "epoch": 0.10230516699971454, + "grad_norm": 1.5618531703948975, + "learning_rate": 4.871999075089241e-05, + "loss": 5.7162, + "step": 17202 + }, + { + "epoch": 0.10231111428299552, + "grad_norm": 1.4562182426452637, + "learning_rate": 4.871984320034103e-05, + "loss": 5.7563, + "step": 17203 + }, + { + "epoch": 0.10231706156627653, + "grad_norm": 1.8649898767471313, + "learning_rate": 4.87196956415093e-05, + "loss": 5.6333, + "step": 17204 + }, + { + "epoch": 0.10232300884955753, + "grad_norm": 1.7934935092926025, + "learning_rate": 4.871954807439727e-05, + "loss": 5.5804, + "step": 17205 + }, + { + "epoch": 0.10232895613283852, + "grad_norm": 1.5005213022232056, + "learning_rate": 4.8719400499005e-05, + "loss": 5.2471, + "step": 17206 + }, + { + "epoch": 0.10233490341611952, + "grad_norm": 1.5418996810913086, + "learning_rate": 4.871925291533252e-05, + "loss": 6.0574, + "step": 17207 + }, + { + "epoch": 0.10234085069940052, + "grad_norm": 1.3919132947921753, + "learning_rate": 4.87191053233799e-05, + "loss": 6.0048, + "step": 17208 + }, + { + "epoch": 0.10234679798268151, + "grad_norm": 1.9565762281417847, + "learning_rate": 4.8718957723147184e-05, + "loss": 4.9914, + "step": 17209 + }, + { + "epoch": 0.10235274526596251, + "grad_norm": 2.3950796127319336, + "learning_rate": 4.871881011463442e-05, + "loss": 5.7963, + "step": 17210 + }, + { + "epoch": 0.1023586925492435, + "grad_norm": 2.0693960189819336, + "learning_rate": 4.871866249784167e-05, + "loss": 5.4641, + "step": 17211 + }, + { + "epoch": 0.1023646398325245, + "grad_norm": 2.105893850326538, + "learning_rate": 4.871851487276898e-05, + "loss": 5.3983, + "step": 17212 + }, + { + "epoch": 0.1023705871158055, + "grad_norm": 2.171363115310669, + "learning_rate": 4.8718367239416404e-05, + "loss": 5.6619, + "step": 17213 + }, + { + "epoch": 0.10237653439908649, + "grad_norm": 2.141611099243164, + "learning_rate": 4.8718219597783984e-05, + "loss": 5.5488, + "step": 17214 + }, + { + "epoch": 0.1023824816823675, + "grad_norm": 1.8755214214324951, + "learning_rate": 4.871807194787178e-05, + "loss": 5.4888, + "step": 17215 + }, + { + "epoch": 0.1023884289656485, + "grad_norm": 2.0865023136138916, + "learning_rate": 4.871792428967984e-05, + "loss": 5.4645, + "step": 17216 + }, + { + "epoch": 0.10239437624892948, + "grad_norm": 1.9486721754074097, + "learning_rate": 4.871777662320823e-05, + "loss": 5.4057, + "step": 17217 + }, + { + "epoch": 0.10240032353221049, + "grad_norm": 2.109412670135498, + "learning_rate": 4.8717628948456976e-05, + "loss": 5.3768, + "step": 17218 + }, + { + "epoch": 0.10240627081549149, + "grad_norm": 2.202826499938965, + "learning_rate": 4.871748126542615e-05, + "loss": 5.4996, + "step": 17219 + }, + { + "epoch": 0.10241221809877248, + "grad_norm": 1.8646687269210815, + "learning_rate": 4.87173335741158e-05, + "loss": 5.5151, + "step": 17220 + }, + { + "epoch": 0.10241816538205348, + "grad_norm": 1.7966501712799072, + "learning_rate": 4.8717185874525964e-05, + "loss": 5.5548, + "step": 17221 + }, + { + "epoch": 0.10242411266533448, + "grad_norm": 1.9538966417312622, + "learning_rate": 4.8717038166656706e-05, + "loss": 5.6221, + "step": 17222 + }, + { + "epoch": 0.10243005994861547, + "grad_norm": 1.6085959672927856, + "learning_rate": 4.871689045050808e-05, + "loss": 5.2468, + "step": 17223 + }, + { + "epoch": 0.10243600723189647, + "grad_norm": 1.7573461532592773, + "learning_rate": 4.871674272608012e-05, + "loss": 5.5835, + "step": 17224 + }, + { + "epoch": 0.10244195451517747, + "grad_norm": 1.8237701654434204, + "learning_rate": 4.87165949933729e-05, + "loss": 5.3537, + "step": 17225 + }, + { + "epoch": 0.10244790179845846, + "grad_norm": 1.963970422744751, + "learning_rate": 4.8716447252386465e-05, + "loss": 5.5714, + "step": 17226 + }, + { + "epoch": 0.10245384908173946, + "grad_norm": 2.0216476917266846, + "learning_rate": 4.871629950312086e-05, + "loss": 5.4889, + "step": 17227 + }, + { + "epoch": 0.10245979636502046, + "grad_norm": 2.0271217823028564, + "learning_rate": 4.871615174557614e-05, + "loss": 5.5903, + "step": 17228 + }, + { + "epoch": 0.10246574364830145, + "grad_norm": 1.7717560529708862, + "learning_rate": 4.871600397975236e-05, + "loss": 5.3989, + "step": 17229 + }, + { + "epoch": 0.10247169093158245, + "grad_norm": 1.722076416015625, + "learning_rate": 4.8715856205649556e-05, + "loss": 5.526, + "step": 17230 + }, + { + "epoch": 0.10247763821486346, + "grad_norm": 2.124905586242676, + "learning_rate": 4.8715708423267805e-05, + "loss": 5.3835, + "step": 17231 + }, + { + "epoch": 0.10248358549814444, + "grad_norm": 2.2088522911071777, + "learning_rate": 4.8715560632607135e-05, + "loss": 5.5228, + "step": 17232 + }, + { + "epoch": 0.10248953278142545, + "grad_norm": 2.0236847400665283, + "learning_rate": 4.871541283366761e-05, + "loss": 5.3851, + "step": 17233 + }, + { + "epoch": 0.10249548006470645, + "grad_norm": 1.7546913623809814, + "learning_rate": 4.871526502644928e-05, + "loss": 5.2, + "step": 17234 + }, + { + "epoch": 0.10250142734798744, + "grad_norm": 1.9796072244644165, + "learning_rate": 4.87151172109522e-05, + "loss": 5.3873, + "step": 17235 + }, + { + "epoch": 0.10250737463126844, + "grad_norm": 1.5305960178375244, + "learning_rate": 4.8714969387176414e-05, + "loss": 5.1888, + "step": 17236 + }, + { + "epoch": 0.10251332191454944, + "grad_norm": 2.007124185562134, + "learning_rate": 4.871482155512198e-05, + "loss": 5.4024, + "step": 17237 + }, + { + "epoch": 0.10251926919783043, + "grad_norm": 1.8268414735794067, + "learning_rate": 4.871467371478894e-05, + "loss": 5.4289, + "step": 17238 + }, + { + "epoch": 0.10252521648111143, + "grad_norm": 1.9826276302337646, + "learning_rate": 4.871452586617736e-05, + "loss": 5.3222, + "step": 17239 + }, + { + "epoch": 0.10253116376439242, + "grad_norm": 1.7642468214035034, + "learning_rate": 4.8714378009287285e-05, + "loss": 5.3858, + "step": 17240 + }, + { + "epoch": 0.10253711104767342, + "grad_norm": 1.9604185819625854, + "learning_rate": 4.8714230144118764e-05, + "loss": 5.4142, + "step": 17241 + }, + { + "epoch": 0.10254305833095442, + "grad_norm": 2.333829402923584, + "learning_rate": 4.8714082270671844e-05, + "loss": 5.2124, + "step": 17242 + }, + { + "epoch": 0.10254900561423541, + "grad_norm": 1.996928095817566, + "learning_rate": 4.8713934388946593e-05, + "loss": 5.5055, + "step": 17243 + }, + { + "epoch": 0.10255495289751641, + "grad_norm": 2.2702581882476807, + "learning_rate": 4.871378649894304e-05, + "loss": 5.3477, + "step": 17244 + }, + { + "epoch": 0.10256090018079742, + "grad_norm": 1.9696896076202393, + "learning_rate": 4.871363860066126e-05, + "loss": 5.39, + "step": 17245 + }, + { + "epoch": 0.1025668474640784, + "grad_norm": 1.7752536535263062, + "learning_rate": 4.871349069410129e-05, + "loss": 5.326, + "step": 17246 + }, + { + "epoch": 0.1025727947473594, + "grad_norm": 1.798829197883606, + "learning_rate": 4.8713342779263184e-05, + "loss": 5.4066, + "step": 17247 + }, + { + "epoch": 0.10257874203064041, + "grad_norm": 1.975467562675476, + "learning_rate": 4.871319485614699e-05, + "loss": 5.4183, + "step": 17248 + }, + { + "epoch": 0.1025846893139214, + "grad_norm": 2.4021782875061035, + "learning_rate": 4.871304692475277e-05, + "loss": 5.3949, + "step": 17249 + }, + { + "epoch": 0.1025906365972024, + "grad_norm": 1.8973580598831177, + "learning_rate": 4.871289898508058e-05, + "loss": 5.437, + "step": 17250 + }, + { + "epoch": 0.1025965838804834, + "grad_norm": 2.3427937030792236, + "learning_rate": 4.8712751037130446e-05, + "loss": 5.4347, + "step": 17251 + }, + { + "epoch": 0.10260253116376439, + "grad_norm": 1.8699359893798828, + "learning_rate": 4.871260308090245e-05, + "loss": 5.3404, + "step": 17252 + }, + { + "epoch": 0.10260847844704539, + "grad_norm": 2.146106719970703, + "learning_rate": 4.871245511639661e-05, + "loss": 5.3664, + "step": 17253 + }, + { + "epoch": 0.10261442573032639, + "grad_norm": 2.0223419666290283, + "learning_rate": 4.871230714361302e-05, + "loss": 5.4117, + "step": 17254 + }, + { + "epoch": 0.10262037301360738, + "grad_norm": 2.036025047302246, + "learning_rate": 4.871215916255169e-05, + "loss": 5.4349, + "step": 17255 + }, + { + "epoch": 0.10262632029688838, + "grad_norm": 2.0085432529449463, + "learning_rate": 4.87120111732127e-05, + "loss": 5.4896, + "step": 17256 + }, + { + "epoch": 0.10263226758016938, + "grad_norm": 2.088165521621704, + "learning_rate": 4.871186317559609e-05, + "loss": 5.2516, + "step": 17257 + }, + { + "epoch": 0.10263821486345037, + "grad_norm": 1.7493584156036377, + "learning_rate": 4.871171516970191e-05, + "loss": 5.0744, + "step": 17258 + }, + { + "epoch": 0.10264416214673137, + "grad_norm": 1.9395314455032349, + "learning_rate": 4.8711567155530224e-05, + "loss": 5.2783, + "step": 17259 + }, + { + "epoch": 0.10265010943001238, + "grad_norm": 2.057565689086914, + "learning_rate": 4.871141913308107e-05, + "loss": 5.2501, + "step": 17260 + }, + { + "epoch": 0.10265605671329336, + "grad_norm": 2.159641742706299, + "learning_rate": 4.87112711023545e-05, + "loss": 5.2844, + "step": 17261 + }, + { + "epoch": 0.10266200399657437, + "grad_norm": 1.8931914567947388, + "learning_rate": 4.8711123063350575e-05, + "loss": 5.4454, + "step": 17262 + }, + { + "epoch": 0.10266795127985537, + "grad_norm": 1.9728927612304688, + "learning_rate": 4.871097501606934e-05, + "loss": 5.3719, + "step": 17263 + }, + { + "epoch": 0.10267389856313636, + "grad_norm": 1.8770530223846436, + "learning_rate": 4.8710826960510845e-05, + "loss": 5.4244, + "step": 17264 + }, + { + "epoch": 0.10267984584641736, + "grad_norm": 2.072201728820801, + "learning_rate": 4.871067889667516e-05, + "loss": 5.3282, + "step": 17265 + }, + { + "epoch": 0.10268579312969836, + "grad_norm": 2.16689133644104, + "learning_rate": 4.8710530824562304e-05, + "loss": 5.4205, + "step": 17266 + }, + { + "epoch": 0.10269174041297935, + "grad_norm": 2.017695903778076, + "learning_rate": 4.8710382744172354e-05, + "loss": 5.1803, + "step": 17267 + }, + { + "epoch": 0.10269768769626035, + "grad_norm": 1.8181023597717285, + "learning_rate": 4.871023465550535e-05, + "loss": 5.3418, + "step": 17268 + }, + { + "epoch": 0.10270363497954134, + "grad_norm": 1.9661909341812134, + "learning_rate": 4.871008655856136e-05, + "loss": 5.115, + "step": 17269 + }, + { + "epoch": 0.10270958226282234, + "grad_norm": 1.9482250213623047, + "learning_rate": 4.870993845334041e-05, + "loss": 5.0172, + "step": 17270 + }, + { + "epoch": 0.10271552954610334, + "grad_norm": 2.0916497707366943, + "learning_rate": 4.870979033984257e-05, + "loss": 5.4317, + "step": 17271 + }, + { + "epoch": 0.10272147682938433, + "grad_norm": 1.919918417930603, + "learning_rate": 4.8709642218067894e-05, + "loss": 5.3986, + "step": 17272 + }, + { + "epoch": 0.10272742411266533, + "grad_norm": 1.8286259174346924, + "learning_rate": 4.870949408801642e-05, + "loss": 5.1301, + "step": 17273 + }, + { + "epoch": 0.10273337139594634, + "grad_norm": 2.2312278747558594, + "learning_rate": 4.870934594968821e-05, + "loss": 5.0839, + "step": 17274 + }, + { + "epoch": 0.10273931867922732, + "grad_norm": 2.2795724868774414, + "learning_rate": 4.870919780308331e-05, + "loss": 5.3578, + "step": 17275 + }, + { + "epoch": 0.10274526596250833, + "grad_norm": 2.253885269165039, + "learning_rate": 4.870904964820178e-05, + "loss": 5.2482, + "step": 17276 + }, + { + "epoch": 0.10275121324578933, + "grad_norm": 1.9351953268051147, + "learning_rate": 4.870890148504366e-05, + "loss": 5.3657, + "step": 17277 + }, + { + "epoch": 0.10275716052907032, + "grad_norm": 2.072274923324585, + "learning_rate": 4.8708753313609004e-05, + "loss": 5.2433, + "step": 17278 + }, + { + "epoch": 0.10276310781235132, + "grad_norm": 2.0419273376464844, + "learning_rate": 4.8708605133897874e-05, + "loss": 5.27, + "step": 17279 + }, + { + "epoch": 0.10276905509563232, + "grad_norm": 2.156855821609497, + "learning_rate": 4.870845694591031e-05, + "loss": 5.1727, + "step": 17280 + }, + { + "epoch": 0.10277500237891331, + "grad_norm": 1.6552194356918335, + "learning_rate": 4.870830874964637e-05, + "loss": 5.0872, + "step": 17281 + }, + { + "epoch": 0.10278094966219431, + "grad_norm": 1.8167924880981445, + "learning_rate": 4.870816054510611e-05, + "loss": 5.2827, + "step": 17282 + }, + { + "epoch": 0.10278689694547531, + "grad_norm": 2.1617610454559326, + "learning_rate": 4.870801233228956e-05, + "loss": 5.1375, + "step": 17283 + }, + { + "epoch": 0.1027928442287563, + "grad_norm": 1.918817162513733, + "learning_rate": 4.87078641111968e-05, + "loss": 5.2945, + "step": 17284 + }, + { + "epoch": 0.1027987915120373, + "grad_norm": 1.5282881259918213, + "learning_rate": 4.870771588182788e-05, + "loss": 5.6653, + "step": 17285 + }, + { + "epoch": 0.1028047387953183, + "grad_norm": 1.7902590036392212, + "learning_rate": 4.8707567644182825e-05, + "loss": 5.6262, + "step": 17286 + }, + { + "epoch": 0.10281068607859929, + "grad_norm": 1.9451625347137451, + "learning_rate": 4.87074193982617e-05, + "loss": 5.1153, + "step": 17287 + }, + { + "epoch": 0.1028166333618803, + "grad_norm": 1.832401156425476, + "learning_rate": 4.870727114406457e-05, + "loss": 5.2928, + "step": 17288 + }, + { + "epoch": 0.1028225806451613, + "grad_norm": 1.645761251449585, + "learning_rate": 4.870712288159147e-05, + "loss": 5.649, + "step": 17289 + }, + { + "epoch": 0.10282852792844228, + "grad_norm": 1.6721855401992798, + "learning_rate": 4.8706974610842474e-05, + "loss": 5.7568, + "step": 17290 + }, + { + "epoch": 0.10283447521172329, + "grad_norm": 1.7489598989486694, + "learning_rate": 4.87068263318176e-05, + "loss": 5.6752, + "step": 17291 + }, + { + "epoch": 0.10284042249500429, + "grad_norm": 1.505332112312317, + "learning_rate": 4.870667804451693e-05, + "loss": 5.2993, + "step": 17292 + }, + { + "epoch": 0.10284636977828528, + "grad_norm": 1.3620814085006714, + "learning_rate": 4.870652974894049e-05, + "loss": 4.7225, + "step": 17293 + }, + { + "epoch": 0.10285231706156628, + "grad_norm": 2.1685922145843506, + "learning_rate": 4.8706381445088356e-05, + "loss": 4.8737, + "step": 17294 + }, + { + "epoch": 0.10285826434484728, + "grad_norm": 2.219942331314087, + "learning_rate": 4.8706233132960566e-05, + "loss": 5.7529, + "step": 17295 + }, + { + "epoch": 0.10286421162812827, + "grad_norm": 1.928809404373169, + "learning_rate": 4.8706084812557176e-05, + "loss": 5.803, + "step": 17296 + }, + { + "epoch": 0.10287015891140927, + "grad_norm": 1.8534711599349976, + "learning_rate": 4.870593648387823e-05, + "loss": 5.9403, + "step": 17297 + }, + { + "epoch": 0.10287610619469026, + "grad_norm": 2.2624459266662598, + "learning_rate": 4.87057881469238e-05, + "loss": 5.1227, + "step": 17298 + }, + { + "epoch": 0.10288205347797126, + "grad_norm": 2.4320240020751953, + "learning_rate": 4.870563980169391e-05, + "loss": 4.9701, + "step": 17299 + }, + { + "epoch": 0.10288800076125226, + "grad_norm": 2.664921760559082, + "learning_rate": 4.870549144818864e-05, + "loss": 4.8771, + "step": 17300 + }, + { + "epoch": 0.10289394804453325, + "grad_norm": 2.2558987140655518, + "learning_rate": 4.870534308640802e-05, + "loss": 5.0682, + "step": 17301 + }, + { + "epoch": 0.10289989532781425, + "grad_norm": 2.291553258895874, + "learning_rate": 4.870519471635211e-05, + "loss": 4.8481, + "step": 17302 + }, + { + "epoch": 0.10290584261109526, + "grad_norm": 1.9109137058258057, + "learning_rate": 4.870504633802096e-05, + "loss": 5.377, + "step": 17303 + }, + { + "epoch": 0.10291178989437624, + "grad_norm": 1.6809476613998413, + "learning_rate": 4.870489795141463e-05, + "loss": 5.5337, + "step": 17304 + }, + { + "epoch": 0.10291773717765725, + "grad_norm": 1.6410505771636963, + "learning_rate": 4.870474955653316e-05, + "loss": 5.5353, + "step": 17305 + }, + { + "epoch": 0.10292368446093825, + "grad_norm": 1.6310313940048218, + "learning_rate": 4.87046011533766e-05, + "loss": 5.4727, + "step": 17306 + }, + { + "epoch": 0.10292963174421924, + "grad_norm": 1.6450475454330444, + "learning_rate": 4.8704452741945015e-05, + "loss": 5.3677, + "step": 17307 + }, + { + "epoch": 0.10293557902750024, + "grad_norm": 1.7327302694320679, + "learning_rate": 4.870430432223846e-05, + "loss": 5.2964, + "step": 17308 + }, + { + "epoch": 0.10294152631078124, + "grad_norm": 2.837498426437378, + "learning_rate": 4.870415589425696e-05, + "loss": 4.7407, + "step": 17309 + }, + { + "epoch": 0.10294747359406223, + "grad_norm": 2.326399803161621, + "learning_rate": 4.8704007458000593e-05, + "loss": 4.8998, + "step": 17310 + }, + { + "epoch": 0.10295342087734323, + "grad_norm": 1.9505521059036255, + "learning_rate": 4.87038590134694e-05, + "loss": 5.438, + "step": 17311 + }, + { + "epoch": 0.10295936816062423, + "grad_norm": 1.690581202507019, + "learning_rate": 4.870371056066344e-05, + "loss": 5.4291, + "step": 17312 + }, + { + "epoch": 0.10296531544390522, + "grad_norm": 1.9977236986160278, + "learning_rate": 4.870356209958276e-05, + "loss": 5.81, + "step": 17313 + }, + { + "epoch": 0.10297126272718622, + "grad_norm": 1.7996702194213867, + "learning_rate": 4.8703413630227405e-05, + "loss": 5.7569, + "step": 17314 + }, + { + "epoch": 0.10297721001046722, + "grad_norm": 1.7594531774520874, + "learning_rate": 4.870326515259743e-05, + "loss": 5.9367, + "step": 17315 + }, + { + "epoch": 0.10298315729374821, + "grad_norm": 1.8434146642684937, + "learning_rate": 4.870311666669289e-05, + "loss": 5.1578, + "step": 17316 + }, + { + "epoch": 0.10298910457702921, + "grad_norm": 2.531515598297119, + "learning_rate": 4.870296817251385e-05, + "loss": 5.0574, + "step": 17317 + }, + { + "epoch": 0.10299505186031022, + "grad_norm": 2.2126452922821045, + "learning_rate": 4.870281967006034e-05, + "loss": 4.9034, + "step": 17318 + }, + { + "epoch": 0.1030009991435912, + "grad_norm": 2.391558885574341, + "learning_rate": 4.870267115933242e-05, + "loss": 4.9584, + "step": 17319 + }, + { + "epoch": 0.1030069464268722, + "grad_norm": 1.9653453826904297, + "learning_rate": 4.8702522640330145e-05, + "loss": 4.9569, + "step": 17320 + }, + { + "epoch": 0.10301289371015321, + "grad_norm": 2.0124504566192627, + "learning_rate": 4.870237411305356e-05, + "loss": 4.9237, + "step": 17321 + }, + { + "epoch": 0.1030188409934342, + "grad_norm": 1.9120689630508423, + "learning_rate": 4.8702225577502724e-05, + "loss": 4.9637, + "step": 17322 + }, + { + "epoch": 0.1030247882767152, + "grad_norm": 2.108009099960327, + "learning_rate": 4.8702077033677684e-05, + "loss": 4.9479, + "step": 17323 + }, + { + "epoch": 0.1030307355599962, + "grad_norm": 2.211385488510132, + "learning_rate": 4.8701928481578494e-05, + "loss": 4.9553, + "step": 17324 + }, + { + "epoch": 0.10303668284327719, + "grad_norm": 2.1452252864837646, + "learning_rate": 4.8701779921205215e-05, + "loss": 4.7809, + "step": 17325 + }, + { + "epoch": 0.10304263012655819, + "grad_norm": 2.126650810241699, + "learning_rate": 4.8701631352557874e-05, + "loss": 4.7027, + "step": 17326 + }, + { + "epoch": 0.10304857740983918, + "grad_norm": 1.9753129482269287, + "learning_rate": 4.870148277563655e-05, + "loss": 4.8073, + "step": 17327 + }, + { + "epoch": 0.10305452469312018, + "grad_norm": 2.013455867767334, + "learning_rate": 4.8701334190441284e-05, + "loss": 4.7989, + "step": 17328 + }, + { + "epoch": 0.10306047197640118, + "grad_norm": 2.2819676399230957, + "learning_rate": 4.8701185596972124e-05, + "loss": 4.7784, + "step": 17329 + }, + { + "epoch": 0.10306641925968217, + "grad_norm": 2.050511360168457, + "learning_rate": 4.870103699522912e-05, + "loss": 4.9621, + "step": 17330 + }, + { + "epoch": 0.10307236654296317, + "grad_norm": 2.422591209411621, + "learning_rate": 4.870088838521233e-05, + "loss": 4.7558, + "step": 17331 + }, + { + "epoch": 0.10307831382624418, + "grad_norm": 2.2109572887420654, + "learning_rate": 4.870073976692181e-05, + "loss": 4.7162, + "step": 17332 + }, + { + "epoch": 0.10308426110952516, + "grad_norm": 2.070526123046875, + "learning_rate": 4.8700591140357596e-05, + "loss": 4.9765, + "step": 17333 + }, + { + "epoch": 0.10309020839280617, + "grad_norm": 1.610152244567871, + "learning_rate": 4.870044250551976e-05, + "loss": 5.9361, + "step": 17334 + }, + { + "epoch": 0.10309615567608717, + "grad_norm": 1.8921641111373901, + "learning_rate": 4.870029386240834e-05, + "loss": 4.9423, + "step": 17335 + }, + { + "epoch": 0.10310210295936816, + "grad_norm": 2.07476806640625, + "learning_rate": 4.870014521102339e-05, + "loss": 4.7742, + "step": 17336 + }, + { + "epoch": 0.10310805024264916, + "grad_norm": 2.021850824356079, + "learning_rate": 4.869999655136498e-05, + "loss": 4.8182, + "step": 17337 + }, + { + "epoch": 0.10311399752593016, + "grad_norm": 1.5896223783493042, + "learning_rate": 4.869984788343314e-05, + "loss": 5.5694, + "step": 17338 + }, + { + "epoch": 0.10311994480921115, + "grad_norm": 1.1907202005386353, + "learning_rate": 4.869969920722792e-05, + "loss": 5.4427, + "step": 17339 + }, + { + "epoch": 0.10312589209249215, + "grad_norm": 1.56050443649292, + "learning_rate": 4.869955052274938e-05, + "loss": 5.2405, + "step": 17340 + }, + { + "epoch": 0.10313183937577315, + "grad_norm": 1.6611580848693848, + "learning_rate": 4.869940182999757e-05, + "loss": 5.1457, + "step": 17341 + }, + { + "epoch": 0.10313778665905414, + "grad_norm": 1.4664785861968994, + "learning_rate": 4.869925312897256e-05, + "loss": 5.2846, + "step": 17342 + }, + { + "epoch": 0.10314373394233514, + "grad_norm": 1.9751476049423218, + "learning_rate": 4.8699104419674366e-05, + "loss": 5.0283, + "step": 17343 + }, + { + "epoch": 0.10314968122561614, + "grad_norm": 1.715144157409668, + "learning_rate": 4.869895570210307e-05, + "loss": 4.8856, + "step": 17344 + }, + { + "epoch": 0.10315562850889713, + "grad_norm": 1.7803713083267212, + "learning_rate": 4.8698806976258704e-05, + "loss": 5.5573, + "step": 17345 + }, + { + "epoch": 0.10316157579217813, + "grad_norm": 1.4687060117721558, + "learning_rate": 4.8698658242141336e-05, + "loss": 5.2287, + "step": 17346 + }, + { + "epoch": 0.10316752307545914, + "grad_norm": 1.6236404180526733, + "learning_rate": 4.869850949975101e-05, + "loss": 5.1, + "step": 17347 + }, + { + "epoch": 0.10317347035874012, + "grad_norm": 1.6414464712142944, + "learning_rate": 4.869836074908778e-05, + "loss": 5.0884, + "step": 17348 + }, + { + "epoch": 0.10317941764202113, + "grad_norm": 1.5938411951065063, + "learning_rate": 4.86982119901517e-05, + "loss": 5.9405, + "step": 17349 + }, + { + "epoch": 0.10318536492530213, + "grad_norm": 1.7434169054031372, + "learning_rate": 4.869806322294282e-05, + "loss": 6.3698, + "step": 17350 + }, + { + "epoch": 0.10319131220858312, + "grad_norm": 1.4999836683273315, + "learning_rate": 4.8697914447461185e-05, + "loss": 5.4169, + "step": 17351 + }, + { + "epoch": 0.10319725949186412, + "grad_norm": 1.768048644065857, + "learning_rate": 4.869776566370686e-05, + "loss": 5.6703, + "step": 17352 + }, + { + "epoch": 0.10320320677514512, + "grad_norm": 1.734729528427124, + "learning_rate": 4.869761687167988e-05, + "loss": 5.6454, + "step": 17353 + }, + { + "epoch": 0.10320915405842611, + "grad_norm": 1.848308801651001, + "learning_rate": 4.869746807138031e-05, + "loss": 5.742, + "step": 17354 + }, + { + "epoch": 0.10321510134170711, + "grad_norm": 1.628144383430481, + "learning_rate": 4.8697319262808205e-05, + "loss": 5.6099, + "step": 17355 + }, + { + "epoch": 0.1032210486249881, + "grad_norm": 1.5005884170532227, + "learning_rate": 4.86971704459636e-05, + "loss": 5.5419, + "step": 17356 + }, + { + "epoch": 0.1032269959082691, + "grad_norm": 1.5255531072616577, + "learning_rate": 4.869702162084657e-05, + "loss": 5.4757, + "step": 17357 + }, + { + "epoch": 0.1032329431915501, + "grad_norm": 1.549132227897644, + "learning_rate": 4.869687278745715e-05, + "loss": 5.4757, + "step": 17358 + }, + { + "epoch": 0.10323889047483109, + "grad_norm": 1.6518296003341675, + "learning_rate": 4.869672394579539e-05, + "loss": 5.5803, + "step": 17359 + }, + { + "epoch": 0.10324483775811209, + "grad_norm": 2.3987839221954346, + "learning_rate": 4.869657509586136e-05, + "loss": 5.0978, + "step": 17360 + }, + { + "epoch": 0.1032507850413931, + "grad_norm": 1.7290594577789307, + "learning_rate": 4.869642623765509e-05, + "loss": 5.4998, + "step": 17361 + }, + { + "epoch": 0.10325673232467408, + "grad_norm": 1.6334084272384644, + "learning_rate": 4.869627737117665e-05, + "loss": 5.4695, + "step": 17362 + }, + { + "epoch": 0.10326267960795509, + "grad_norm": 1.609734296798706, + "learning_rate": 4.8696128496426074e-05, + "loss": 5.4406, + "step": 17363 + }, + { + "epoch": 0.10326862689123609, + "grad_norm": 1.7579066753387451, + "learning_rate": 4.869597961340343e-05, + "loss": 5.6412, + "step": 17364 + }, + { + "epoch": 0.10327457417451708, + "grad_norm": 1.8831701278686523, + "learning_rate": 4.869583072210877e-05, + "loss": 5.444, + "step": 17365 + }, + { + "epoch": 0.10328052145779808, + "grad_norm": 1.9597128629684448, + "learning_rate": 4.869568182254214e-05, + "loss": 5.2228, + "step": 17366 + }, + { + "epoch": 0.10328646874107908, + "grad_norm": 1.8867931365966797, + "learning_rate": 4.8695532914703584e-05, + "loss": 4.9979, + "step": 17367 + }, + { + "epoch": 0.10329241602436007, + "grad_norm": 1.5480263233184814, + "learning_rate": 4.869538399859317e-05, + "loss": 5.6457, + "step": 17368 + }, + { + "epoch": 0.10329836330764107, + "grad_norm": 1.6710255146026611, + "learning_rate": 4.869523507421093e-05, + "loss": 5.774, + "step": 17369 + }, + { + "epoch": 0.10330431059092207, + "grad_norm": 1.6559721231460571, + "learning_rate": 4.869508614155695e-05, + "loss": 5.5643, + "step": 17370 + }, + { + "epoch": 0.10331025787420306, + "grad_norm": 1.4451355934143066, + "learning_rate": 4.869493720063124e-05, + "loss": 5.4598, + "step": 17371 + }, + { + "epoch": 0.10331620515748406, + "grad_norm": 1.8376599550247192, + "learning_rate": 4.869478825143388e-05, + "loss": 4.7552, + "step": 17372 + }, + { + "epoch": 0.10332215244076506, + "grad_norm": 2.0193891525268555, + "learning_rate": 4.869463929396491e-05, + "loss": 4.5671, + "step": 17373 + }, + { + "epoch": 0.10332809972404605, + "grad_norm": 2.07692551612854, + "learning_rate": 4.869449032822439e-05, + "loss": 4.4776, + "step": 17374 + }, + { + "epoch": 0.10333404700732705, + "grad_norm": 1.820893406867981, + "learning_rate": 4.869434135421237e-05, + "loss": 5.4705, + "step": 17375 + }, + { + "epoch": 0.10333999429060806, + "grad_norm": 1.7207856178283691, + "learning_rate": 4.86941923719289e-05, + "loss": 4.8619, + "step": 17376 + }, + { + "epoch": 0.10334594157388904, + "grad_norm": 1.9348174333572388, + "learning_rate": 4.8694043381374026e-05, + "loss": 4.3723, + "step": 17377 + }, + { + "epoch": 0.10335188885717005, + "grad_norm": 1.8993666172027588, + "learning_rate": 4.869389438254781e-05, + "loss": 4.5442, + "step": 17378 + }, + { + "epoch": 0.10335783614045105, + "grad_norm": 1.9089124202728271, + "learning_rate": 4.869374537545031e-05, + "loss": 4.3347, + "step": 17379 + }, + { + "epoch": 0.10336378342373204, + "grad_norm": 1.8560502529144287, + "learning_rate": 4.869359636008155e-05, + "loss": 4.312, + "step": 17380 + }, + { + "epoch": 0.10336973070701304, + "grad_norm": 1.909680962562561, + "learning_rate": 4.8693447336441614e-05, + "loss": 4.3109, + "step": 17381 + }, + { + "epoch": 0.10337567799029404, + "grad_norm": 1.7769371271133423, + "learning_rate": 4.8693298304530535e-05, + "loss": 4.4442, + "step": 17382 + }, + { + "epoch": 0.10338162527357503, + "grad_norm": 2.080097198486328, + "learning_rate": 4.869314926434837e-05, + "loss": 4.339, + "step": 17383 + }, + { + "epoch": 0.10338757255685603, + "grad_norm": 1.8703278303146362, + "learning_rate": 4.8693000215895176e-05, + "loss": 4.4124, + "step": 17384 + }, + { + "epoch": 0.10339351984013702, + "grad_norm": 1.9553934335708618, + "learning_rate": 4.869285115917099e-05, + "loss": 4.3571, + "step": 17385 + }, + { + "epoch": 0.10339946712341802, + "grad_norm": 1.8989006280899048, + "learning_rate": 4.869270209417588e-05, + "loss": 4.4108, + "step": 17386 + }, + { + "epoch": 0.10340541440669902, + "grad_norm": 1.8347021341323853, + "learning_rate": 4.8692553020909896e-05, + "loss": 4.1529, + "step": 17387 + }, + { + "epoch": 0.10341136168998001, + "grad_norm": 1.9458621740341187, + "learning_rate": 4.869240393937309e-05, + "loss": 4.2392, + "step": 17388 + }, + { + "epoch": 0.10341730897326101, + "grad_norm": 1.8578664064407349, + "learning_rate": 4.86922548495655e-05, + "loss": 4.3238, + "step": 17389 + }, + { + "epoch": 0.10342325625654201, + "grad_norm": 1.9359874725341797, + "learning_rate": 4.869210575148719e-05, + "loss": 4.56, + "step": 17390 + }, + { + "epoch": 0.103429203539823, + "grad_norm": 2.0030486583709717, + "learning_rate": 4.869195664513822e-05, + "loss": 4.1571, + "step": 17391 + }, + { + "epoch": 0.103435150823104, + "grad_norm": 1.9431639909744263, + "learning_rate": 4.869180753051863e-05, + "loss": 4.2181, + "step": 17392 + }, + { + "epoch": 0.10344109810638501, + "grad_norm": 1.9171335697174072, + "learning_rate": 4.869165840762847e-05, + "loss": 4.3139, + "step": 17393 + }, + { + "epoch": 0.103447045389666, + "grad_norm": 1.9467666149139404, + "learning_rate": 4.86915092764678e-05, + "loss": 4.3906, + "step": 17394 + }, + { + "epoch": 0.103452992672947, + "grad_norm": 2.1354262828826904, + "learning_rate": 4.8691360137036666e-05, + "loss": 4.3407, + "step": 17395 + }, + { + "epoch": 0.103458939956228, + "grad_norm": 1.7994540929794312, + "learning_rate": 4.8691210989335126e-05, + "loss": 4.5767, + "step": 17396 + }, + { + "epoch": 0.10346488723950899, + "grad_norm": 1.8322330713272095, + "learning_rate": 4.869106183336323e-05, + "loss": 4.62, + "step": 17397 + }, + { + "epoch": 0.10347083452278999, + "grad_norm": 1.9874459505081177, + "learning_rate": 4.869091266912102e-05, + "loss": 4.2579, + "step": 17398 + }, + { + "epoch": 0.10347678180607099, + "grad_norm": 1.8300455808639526, + "learning_rate": 4.869076349660856e-05, + "loss": 4.3049, + "step": 17399 + }, + { + "epoch": 0.10348272908935198, + "grad_norm": 1.8731672763824463, + "learning_rate": 4.8690614315825914e-05, + "loss": 4.3241, + "step": 17400 + }, + { + "epoch": 0.10348867637263298, + "grad_norm": 1.8587061166763306, + "learning_rate": 4.86904651267731e-05, + "loss": 4.2513, + "step": 17401 + }, + { + "epoch": 0.10349462365591398, + "grad_norm": 1.8614505529403687, + "learning_rate": 4.86903159294502e-05, + "loss": 4.2877, + "step": 17402 + }, + { + "epoch": 0.10350057093919497, + "grad_norm": 1.7118782997131348, + "learning_rate": 4.869016672385725e-05, + "loss": 5.951, + "step": 17403 + }, + { + "epoch": 0.10350651822247597, + "grad_norm": 1.6701730489730835, + "learning_rate": 4.869001750999431e-05, + "loss": 5.8099, + "step": 17404 + }, + { + "epoch": 0.10351246550575698, + "grad_norm": 1.4960297346115112, + "learning_rate": 4.868986828786143e-05, + "loss": 5.7589, + "step": 17405 + }, + { + "epoch": 0.10351841278903796, + "grad_norm": 1.3732372522354126, + "learning_rate": 4.868971905745866e-05, + "loss": 5.8552, + "step": 17406 + }, + { + "epoch": 0.10352436007231897, + "grad_norm": 1.5108624696731567, + "learning_rate": 4.868956981878606e-05, + "loss": 5.82, + "step": 17407 + }, + { + "epoch": 0.10353030735559997, + "grad_norm": 1.8640809059143066, + "learning_rate": 4.868942057184367e-05, + "loss": 5.4388, + "step": 17408 + }, + { + "epoch": 0.10353625463888096, + "grad_norm": 2.082534074783325, + "learning_rate": 4.868927131663154e-05, + "loss": 4.3796, + "step": 17409 + }, + { + "epoch": 0.10354220192216196, + "grad_norm": 1.8963665962219238, + "learning_rate": 4.868912205314975e-05, + "loss": 5.6469, + "step": 17410 + }, + { + "epoch": 0.10354814920544296, + "grad_norm": 1.7797149419784546, + "learning_rate": 4.868897278139832e-05, + "loss": 5.6187, + "step": 17411 + }, + { + "epoch": 0.10355409648872395, + "grad_norm": 1.8464981317520142, + "learning_rate": 4.868882350137732e-05, + "loss": 4.8464, + "step": 17412 + }, + { + "epoch": 0.10356004377200495, + "grad_norm": 1.5401747226715088, + "learning_rate": 4.8688674213086794e-05, + "loss": 5.3547, + "step": 17413 + }, + { + "epoch": 0.10356599105528594, + "grad_norm": 1.4159618616104126, + "learning_rate": 4.868852491652679e-05, + "loss": 5.4428, + "step": 17414 + }, + { + "epoch": 0.10357193833856694, + "grad_norm": 1.6561527252197266, + "learning_rate": 4.868837561169738e-05, + "loss": 5.6467, + "step": 17415 + }, + { + "epoch": 0.10357788562184794, + "grad_norm": 1.659527063369751, + "learning_rate": 4.8688226298598586e-05, + "loss": 5.8631, + "step": 17416 + }, + { + "epoch": 0.10358383290512893, + "grad_norm": 1.8206923007965088, + "learning_rate": 4.868807697723049e-05, + "loss": 5.6475, + "step": 17417 + }, + { + "epoch": 0.10358978018840993, + "grad_norm": 1.9741102457046509, + "learning_rate": 4.868792764759312e-05, + "loss": 4.633, + "step": 17418 + }, + { + "epoch": 0.10359572747169093, + "grad_norm": 1.9505152702331543, + "learning_rate": 4.8687778309686546e-05, + "loss": 4.4024, + "step": 17419 + }, + { + "epoch": 0.10360167475497192, + "grad_norm": 1.7461168766021729, + "learning_rate": 4.868762896351082e-05, + "loss": 5.6505, + "step": 17420 + }, + { + "epoch": 0.10360762203825293, + "grad_norm": 1.6750074625015259, + "learning_rate": 4.868747960906598e-05, + "loss": 5.7747, + "step": 17421 + }, + { + "epoch": 0.10361356932153393, + "grad_norm": 1.5986868143081665, + "learning_rate": 4.8687330246352085e-05, + "loss": 5.2086, + "step": 17422 + }, + { + "epoch": 0.10361951660481492, + "grad_norm": 1.5743950605392456, + "learning_rate": 4.868718087536919e-05, + "loss": 5.6462, + "step": 17423 + }, + { + "epoch": 0.10362546388809592, + "grad_norm": 1.5192588567733765, + "learning_rate": 4.868703149611734e-05, + "loss": 5.5579, + "step": 17424 + }, + { + "epoch": 0.10363141117137692, + "grad_norm": 1.7356244325637817, + "learning_rate": 4.86868821085966e-05, + "loss": 5.5978, + "step": 17425 + }, + { + "epoch": 0.10363735845465791, + "grad_norm": 1.7366925477981567, + "learning_rate": 4.868673271280701e-05, + "loss": 5.3812, + "step": 17426 + }, + { + "epoch": 0.10364330573793891, + "grad_norm": 2.016662836074829, + "learning_rate": 4.868658330874862e-05, + "loss": 5.4003, + "step": 17427 + }, + { + "epoch": 0.10364925302121991, + "grad_norm": 2.022550582885742, + "learning_rate": 4.86864338964215e-05, + "loss": 5.191, + "step": 17428 + }, + { + "epoch": 0.1036552003045009, + "grad_norm": 1.8406000137329102, + "learning_rate": 4.868628447582568e-05, + "loss": 5.9494, + "step": 17429 + }, + { + "epoch": 0.1036611475877819, + "grad_norm": 1.7836806774139404, + "learning_rate": 4.868613504696123e-05, + "loss": 5.4606, + "step": 17430 + }, + { + "epoch": 0.1036670948710629, + "grad_norm": 1.6688835620880127, + "learning_rate": 4.86859856098282e-05, + "loss": 5.2287, + "step": 17431 + }, + { + "epoch": 0.10367304215434389, + "grad_norm": 1.7083512544631958, + "learning_rate": 4.868583616442663e-05, + "loss": 4.7133, + "step": 17432 + }, + { + "epoch": 0.1036789894376249, + "grad_norm": 1.8784829378128052, + "learning_rate": 4.8685686710756576e-05, + "loss": 4.8341, + "step": 17433 + }, + { + "epoch": 0.1036849367209059, + "grad_norm": 2.380962610244751, + "learning_rate": 4.8685537248818105e-05, + "loss": 4.6553, + "step": 17434 + }, + { + "epoch": 0.10369088400418688, + "grad_norm": 1.936126470565796, + "learning_rate": 4.868538777861125e-05, + "loss": 5.0645, + "step": 17435 + }, + { + "epoch": 0.10369683128746789, + "grad_norm": 1.9400380849838257, + "learning_rate": 4.8685238300136065e-05, + "loss": 4.9022, + "step": 17436 + }, + { + "epoch": 0.10370277857074889, + "grad_norm": 2.0275371074676514, + "learning_rate": 4.868508881339261e-05, + "loss": 4.8918, + "step": 17437 + }, + { + "epoch": 0.10370872585402988, + "grad_norm": 1.8734835386276245, + "learning_rate": 4.868493931838094e-05, + "loss": 4.9889, + "step": 17438 + }, + { + "epoch": 0.10371467313731088, + "grad_norm": 2.346519947052002, + "learning_rate": 4.868478981510111e-05, + "loss": 4.4857, + "step": 17439 + }, + { + "epoch": 0.10372062042059188, + "grad_norm": 2.4242961406707764, + "learning_rate": 4.868464030355315e-05, + "loss": 4.034, + "step": 17440 + }, + { + "epoch": 0.10372656770387287, + "grad_norm": 2.3877294063568115, + "learning_rate": 4.8684490783737133e-05, + "loss": 4.2761, + "step": 17441 + }, + { + "epoch": 0.10373251498715387, + "grad_norm": 1.832585096359253, + "learning_rate": 4.8684341255653107e-05, + "loss": 5.1485, + "step": 17442 + }, + { + "epoch": 0.10373846227043486, + "grad_norm": 2.0385608673095703, + "learning_rate": 4.868419171930112e-05, + "loss": 5.7793, + "step": 17443 + }, + { + "epoch": 0.10374440955371586, + "grad_norm": 1.8885849714279175, + "learning_rate": 4.8684042174681225e-05, + "loss": 5.9304, + "step": 17444 + }, + { + "epoch": 0.10375035683699686, + "grad_norm": 1.8748784065246582, + "learning_rate": 4.868389262179348e-05, + "loss": 5.3722, + "step": 17445 + }, + { + "epoch": 0.10375630412027785, + "grad_norm": 1.9851447343826294, + "learning_rate": 4.8683743060637924e-05, + "loss": 5.4734, + "step": 17446 + }, + { + "epoch": 0.10376225140355885, + "grad_norm": 2.387681245803833, + "learning_rate": 4.868359349121463e-05, + "loss": 4.7244, + "step": 17447 + }, + { + "epoch": 0.10376819868683985, + "grad_norm": 1.8236793279647827, + "learning_rate": 4.868344391352363e-05, + "loss": 5.0094, + "step": 17448 + }, + { + "epoch": 0.10377414597012084, + "grad_norm": 1.3649673461914062, + "learning_rate": 4.868329432756498e-05, + "loss": 5.3295, + "step": 17449 + }, + { + "epoch": 0.10378009325340184, + "grad_norm": 1.8916471004486084, + "learning_rate": 4.8683144733338746e-05, + "loss": 5.9443, + "step": 17450 + }, + { + "epoch": 0.10378604053668285, + "grad_norm": 1.8541333675384521, + "learning_rate": 4.868299513084497e-05, + "loss": 5.425, + "step": 17451 + }, + { + "epoch": 0.10379198781996384, + "grad_norm": 1.9708364009857178, + "learning_rate": 4.8682845520083695e-05, + "loss": 5.3254, + "step": 17452 + }, + { + "epoch": 0.10379793510324484, + "grad_norm": 1.7171103954315186, + "learning_rate": 4.8682695901054995e-05, + "loss": 5.3498, + "step": 17453 + }, + { + "epoch": 0.10380388238652584, + "grad_norm": 1.6002514362335205, + "learning_rate": 4.868254627375891e-05, + "loss": 5.1611, + "step": 17454 + }, + { + "epoch": 0.10380982966980683, + "grad_norm": 1.9245331287384033, + "learning_rate": 4.8682396638195486e-05, + "loss": 5.3348, + "step": 17455 + }, + { + "epoch": 0.10381577695308783, + "grad_norm": 1.4742863178253174, + "learning_rate": 4.8682246994364786e-05, + "loss": 5.7573, + "step": 17456 + }, + { + "epoch": 0.10382172423636883, + "grad_norm": 1.929343581199646, + "learning_rate": 4.8682097342266855e-05, + "loss": 5.8469, + "step": 17457 + }, + { + "epoch": 0.10382767151964982, + "grad_norm": 1.6212769746780396, + "learning_rate": 4.8681947681901754e-05, + "loss": 5.9121, + "step": 17458 + }, + { + "epoch": 0.10383361880293082, + "grad_norm": 1.6550590991973877, + "learning_rate": 4.868179801326952e-05, + "loss": 5.7114, + "step": 17459 + }, + { + "epoch": 0.10383956608621182, + "grad_norm": 1.671628475189209, + "learning_rate": 4.868164833637023e-05, + "loss": 5.3988, + "step": 17460 + }, + { + "epoch": 0.10384551336949281, + "grad_norm": 1.5833921432495117, + "learning_rate": 4.868149865120391e-05, + "loss": 5.1952, + "step": 17461 + }, + { + "epoch": 0.10385146065277381, + "grad_norm": 1.8280199766159058, + "learning_rate": 4.868134895777063e-05, + "loss": 5.4812, + "step": 17462 + }, + { + "epoch": 0.10385740793605482, + "grad_norm": 1.7413616180419922, + "learning_rate": 4.868119925607043e-05, + "loss": 5.4119, + "step": 17463 + }, + { + "epoch": 0.1038633552193358, + "grad_norm": 1.6645252704620361, + "learning_rate": 4.868104954610337e-05, + "loss": 5.3546, + "step": 17464 + }, + { + "epoch": 0.1038693025026168, + "grad_norm": 1.634175181388855, + "learning_rate": 4.86808998278695e-05, + "loss": 5.3119, + "step": 17465 + }, + { + "epoch": 0.10387524978589781, + "grad_norm": 1.5220096111297607, + "learning_rate": 4.868075010136887e-05, + "loss": 5.1345, + "step": 17466 + }, + { + "epoch": 0.1038811970691788, + "grad_norm": 1.3279895782470703, + "learning_rate": 4.8680600366601534e-05, + "loss": 5.0071, + "step": 17467 + }, + { + "epoch": 0.1038871443524598, + "grad_norm": 1.4460431337356567, + "learning_rate": 4.8680450623567555e-05, + "loss": 4.8219, + "step": 17468 + }, + { + "epoch": 0.1038930916357408, + "grad_norm": 1.7028027772903442, + "learning_rate": 4.868030087226697e-05, + "loss": 5.2679, + "step": 17469 + }, + { + "epoch": 0.10389903891902179, + "grad_norm": 1.7697324752807617, + "learning_rate": 4.8680151112699835e-05, + "loss": 5.504, + "step": 17470 + }, + { + "epoch": 0.10390498620230279, + "grad_norm": 1.4549357891082764, + "learning_rate": 4.86800013448662e-05, + "loss": 5.4475, + "step": 17471 + }, + { + "epoch": 0.10391093348558378, + "grad_norm": 1.7069107294082642, + "learning_rate": 4.867985156876613e-05, + "loss": 5.5878, + "step": 17472 + }, + { + "epoch": 0.10391688076886478, + "grad_norm": 1.8917819261550903, + "learning_rate": 4.867970178439967e-05, + "loss": 5.4449, + "step": 17473 + }, + { + "epoch": 0.10392282805214578, + "grad_norm": 1.7132060527801514, + "learning_rate": 4.8679551991766856e-05, + "loss": 5.7547, + "step": 17474 + }, + { + "epoch": 0.10392877533542677, + "grad_norm": 1.6535362005233765, + "learning_rate": 4.867940219086777e-05, + "loss": 5.9603, + "step": 17475 + }, + { + "epoch": 0.10393472261870777, + "grad_norm": 1.6559079885482788, + "learning_rate": 4.8679252381702443e-05, + "loss": 5.9673, + "step": 17476 + }, + { + "epoch": 0.10394066990198877, + "grad_norm": 1.5295041799545288, + "learning_rate": 4.867910256427093e-05, + "loss": 5.4502, + "step": 17477 + }, + { + "epoch": 0.10394661718526976, + "grad_norm": 1.8571394681930542, + "learning_rate": 4.8678952738573294e-05, + "loss": 6.1838, + "step": 17478 + }, + { + "epoch": 0.10395256446855076, + "grad_norm": 1.7148513793945312, + "learning_rate": 4.8678802904609576e-05, + "loss": 5.9624, + "step": 17479 + }, + { + "epoch": 0.10395851175183177, + "grad_norm": 1.7191139459609985, + "learning_rate": 4.867865306237983e-05, + "loss": 5.8591, + "step": 17480 + }, + { + "epoch": 0.10396445903511276, + "grad_norm": 1.526285171508789, + "learning_rate": 4.867850321188412e-05, + "loss": 5.988, + "step": 17481 + }, + { + "epoch": 0.10397040631839376, + "grad_norm": 1.5284392833709717, + "learning_rate": 4.867835335312249e-05, + "loss": 5.7212, + "step": 17482 + }, + { + "epoch": 0.10397635360167476, + "grad_norm": 1.5675333738327026, + "learning_rate": 4.8678203486094975e-05, + "loss": 5.5921, + "step": 17483 + }, + { + "epoch": 0.10398230088495575, + "grad_norm": 1.7697393894195557, + "learning_rate": 4.8678053610801654e-05, + "loss": 5.1748, + "step": 17484 + }, + { + "epoch": 0.10398824816823675, + "grad_norm": 1.5940029621124268, + "learning_rate": 4.867790372724257e-05, + "loss": 5.7108, + "step": 17485 + }, + { + "epoch": 0.10399419545151775, + "grad_norm": 2.0347743034362793, + "learning_rate": 4.867775383541777e-05, + "loss": 5.4253, + "step": 17486 + }, + { + "epoch": 0.10400014273479874, + "grad_norm": 2.1038641929626465, + "learning_rate": 4.867760393532732e-05, + "loss": 5.2362, + "step": 17487 + }, + { + "epoch": 0.10400609001807974, + "grad_norm": 2.2253377437591553, + "learning_rate": 4.867745402697126e-05, + "loss": 5.0801, + "step": 17488 + }, + { + "epoch": 0.10401203730136074, + "grad_norm": 1.8215906620025635, + "learning_rate": 4.867730411034964e-05, + "loss": 5.1438, + "step": 17489 + }, + { + "epoch": 0.10401798458464173, + "grad_norm": 1.5428386926651, + "learning_rate": 4.867715418546252e-05, + "loss": 5.0664, + "step": 17490 + }, + { + "epoch": 0.10402393186792273, + "grad_norm": 1.3886137008666992, + "learning_rate": 4.867700425230995e-05, + "loss": 4.992, + "step": 17491 + }, + { + "epoch": 0.10402987915120374, + "grad_norm": 1.4177032709121704, + "learning_rate": 4.867685431089199e-05, + "loss": 4.9245, + "step": 17492 + }, + { + "epoch": 0.10403582643448472, + "grad_norm": 1.2621585130691528, + "learning_rate": 4.867670436120867e-05, + "loss": 4.8902, + "step": 17493 + }, + { + "epoch": 0.10404177371776573, + "grad_norm": 1.4095661640167236, + "learning_rate": 4.867655440326007e-05, + "loss": 4.871, + "step": 17494 + }, + { + "epoch": 0.10404772100104673, + "grad_norm": 1.3117374181747437, + "learning_rate": 4.867640443704622e-05, + "loss": 4.9351, + "step": 17495 + }, + { + "epoch": 0.10405366828432772, + "grad_norm": 1.6237322092056274, + "learning_rate": 4.867625446256719e-05, + "loss": 5.4253, + "step": 17496 + }, + { + "epoch": 0.10405961556760872, + "grad_norm": 2.095696210861206, + "learning_rate": 4.867610447982302e-05, + "loss": 5.1793, + "step": 17497 + }, + { + "epoch": 0.10406556285088972, + "grad_norm": 3.627516508102417, + "learning_rate": 4.867595448881377e-05, + "loss": 5.1206, + "step": 17498 + }, + { + "epoch": 0.10407151013417071, + "grad_norm": 2.0525522232055664, + "learning_rate": 4.8675804489539477e-05, + "loss": 5.5922, + "step": 17499 + }, + { + "epoch": 0.10407745741745171, + "grad_norm": 1.6003656387329102, + "learning_rate": 4.867565448200022e-05, + "loss": 6.0267, + "step": 17500 + }, + { + "epoch": 0.1040834047007327, + "grad_norm": 1.4709582328796387, + "learning_rate": 4.8675504466196034e-05, + "loss": 5.55, + "step": 17501 + }, + { + "epoch": 0.1040893519840137, + "grad_norm": 1.5550457239151, + "learning_rate": 4.8675354442126966e-05, + "loss": 5.6857, + "step": 17502 + }, + { + "epoch": 0.1040952992672947, + "grad_norm": 1.6180169582366943, + "learning_rate": 4.8675204409793085e-05, + "loss": 5.3079, + "step": 17503 + }, + { + "epoch": 0.10410124655057569, + "grad_norm": 1.5625691413879395, + "learning_rate": 4.8675054369194426e-05, + "loss": 5.5965, + "step": 17504 + }, + { + "epoch": 0.10410719383385669, + "grad_norm": 1.4117538928985596, + "learning_rate": 4.8674904320331064e-05, + "loss": 5.7337, + "step": 17505 + }, + { + "epoch": 0.1041131411171377, + "grad_norm": 1.5518572330474854, + "learning_rate": 4.867475426320302e-05, + "loss": 5.5802, + "step": 17506 + }, + { + "epoch": 0.10411908840041868, + "grad_norm": 1.3276773691177368, + "learning_rate": 4.867460419781037e-05, + "loss": 6.0462, + "step": 17507 + }, + { + "epoch": 0.10412503568369968, + "grad_norm": 1.3660519123077393, + "learning_rate": 4.867445412415317e-05, + "loss": 6.0382, + "step": 17508 + }, + { + "epoch": 0.10413098296698069, + "grad_norm": 1.2959636449813843, + "learning_rate": 4.867430404223146e-05, + "loss": 5.8823, + "step": 17509 + }, + { + "epoch": 0.10413693025026168, + "grad_norm": 2.009265899658203, + "learning_rate": 4.867415395204528e-05, + "loss": 4.9889, + "step": 17510 + }, + { + "epoch": 0.10414287753354268, + "grad_norm": 1.3692728281021118, + "learning_rate": 4.8674003853594705e-05, + "loss": 5.2382, + "step": 17511 + }, + { + "epoch": 0.10414882481682368, + "grad_norm": 1.4074095487594604, + "learning_rate": 4.8673853746879785e-05, + "loss": 5.8241, + "step": 17512 + }, + { + "epoch": 0.10415477210010467, + "grad_norm": 1.2155077457427979, + "learning_rate": 4.867370363190057e-05, + "loss": 5.762, + "step": 17513 + }, + { + "epoch": 0.10416071938338567, + "grad_norm": 1.1142069101333618, + "learning_rate": 4.86735535086571e-05, + "loss": 5.7591, + "step": 17514 + }, + { + "epoch": 0.10416666666666667, + "grad_norm": 1.1758382320404053, + "learning_rate": 4.867340337714944e-05, + "loss": 5.6534, + "step": 17515 + }, + { + "epoch": 0.10417261394994766, + "grad_norm": 1.2154567241668701, + "learning_rate": 4.867325323737765e-05, + "loss": 5.7465, + "step": 17516 + }, + { + "epoch": 0.10417856123322866, + "grad_norm": 1.3033171892166138, + "learning_rate": 4.867310308934176e-05, + "loss": 5.7701, + "step": 17517 + }, + { + "epoch": 0.10418450851650966, + "grad_norm": 1.3491926193237305, + "learning_rate": 4.867295293304184e-05, + "loss": 5.7883, + "step": 17518 + }, + { + "epoch": 0.10419045579979065, + "grad_norm": 1.223988652229309, + "learning_rate": 4.867280276847793e-05, + "loss": 5.7065, + "step": 17519 + }, + { + "epoch": 0.10419640308307165, + "grad_norm": 1.3885700702667236, + "learning_rate": 4.867265259565009e-05, + "loss": 5.6934, + "step": 17520 + }, + { + "epoch": 0.10420235036635266, + "grad_norm": 1.1616452932357788, + "learning_rate": 4.867250241455837e-05, + "loss": 5.6958, + "step": 17521 + }, + { + "epoch": 0.10420829764963364, + "grad_norm": 1.2696330547332764, + "learning_rate": 4.867235222520283e-05, + "loss": 5.5534, + "step": 17522 + }, + { + "epoch": 0.10421424493291465, + "grad_norm": 1.3539372682571411, + "learning_rate": 4.8672202027583516e-05, + "loss": 5.8028, + "step": 17523 + }, + { + "epoch": 0.10422019221619565, + "grad_norm": 2.547095775604248, + "learning_rate": 4.867205182170048e-05, + "loss": 5.0223, + "step": 17524 + }, + { + "epoch": 0.10422613949947664, + "grad_norm": 1.7378231287002563, + "learning_rate": 4.8671901607553775e-05, + "loss": 5.8356, + "step": 17525 + }, + { + "epoch": 0.10423208678275764, + "grad_norm": 1.9287587404251099, + "learning_rate": 4.867175138514346e-05, + "loss": 5.9694, + "step": 17526 + }, + { + "epoch": 0.10423803406603864, + "grad_norm": 1.685260534286499, + "learning_rate": 4.867160115446957e-05, + "loss": 5.6962, + "step": 17527 + }, + { + "epoch": 0.10424398134931963, + "grad_norm": 1.594699501991272, + "learning_rate": 4.8671450915532176e-05, + "loss": 5.6139, + "step": 17528 + }, + { + "epoch": 0.10424992863260063, + "grad_norm": 1.5966441631317139, + "learning_rate": 4.867130066833132e-05, + "loss": 5.6369, + "step": 17529 + }, + { + "epoch": 0.10425587591588162, + "grad_norm": 1.701524019241333, + "learning_rate": 4.867115041286706e-05, + "loss": 5.6487, + "step": 17530 + }, + { + "epoch": 0.10426182319916262, + "grad_norm": 1.575536847114563, + "learning_rate": 4.8671000149139444e-05, + "loss": 5.5935, + "step": 17531 + }, + { + "epoch": 0.10426777048244362, + "grad_norm": 1.6812626123428345, + "learning_rate": 4.867084987714853e-05, + "loss": 5.4343, + "step": 17532 + }, + { + "epoch": 0.10427371776572461, + "grad_norm": 1.6122568845748901, + "learning_rate": 4.867069959689435e-05, + "loss": 5.5194, + "step": 17533 + }, + { + "epoch": 0.10427966504900561, + "grad_norm": 1.5337659120559692, + "learning_rate": 4.8670549308376996e-05, + "loss": 5.5248, + "step": 17534 + }, + { + "epoch": 0.10428561233228661, + "grad_norm": 1.45541250705719, + "learning_rate": 4.867039901159649e-05, + "loss": 5.6301, + "step": 17535 + }, + { + "epoch": 0.1042915596155676, + "grad_norm": 1.6674455404281616, + "learning_rate": 4.867024870655289e-05, + "loss": 6.1182, + "step": 17536 + }, + { + "epoch": 0.1042975068988486, + "grad_norm": 1.4686870574951172, + "learning_rate": 4.867009839324624e-05, + "loss": 5.9761, + "step": 17537 + }, + { + "epoch": 0.1043034541821296, + "grad_norm": 1.6447898149490356, + "learning_rate": 4.866994807167662e-05, + "loss": 5.4559, + "step": 17538 + }, + { + "epoch": 0.1043094014654106, + "grad_norm": 1.4841620922088623, + "learning_rate": 4.866979774184406e-05, + "loss": 5.4441, + "step": 17539 + }, + { + "epoch": 0.1043153487486916, + "grad_norm": 1.8813121318817139, + "learning_rate": 4.8669647403748616e-05, + "loss": 5.348, + "step": 17540 + }, + { + "epoch": 0.1043212960319726, + "grad_norm": 4.018791198730469, + "learning_rate": 4.866949705739035e-05, + "loss": 5.457, + "step": 17541 + }, + { + "epoch": 0.10432724331525359, + "grad_norm": 2.9932172298431396, + "learning_rate": 4.86693467027693e-05, + "loss": 5.2345, + "step": 17542 + }, + { + "epoch": 0.10433319059853459, + "grad_norm": 1.4329689741134644, + "learning_rate": 4.866919633988553e-05, + "loss": 5.8491, + "step": 17543 + }, + { + "epoch": 0.10433913788181559, + "grad_norm": 1.7308731079101562, + "learning_rate": 4.866904596873909e-05, + "loss": 5.5858, + "step": 17544 + }, + { + "epoch": 0.10434508516509658, + "grad_norm": 2.2066311836242676, + "learning_rate": 4.866889558933002e-05, + "loss": 4.7702, + "step": 17545 + }, + { + "epoch": 0.10435103244837758, + "grad_norm": 1.528171181678772, + "learning_rate": 4.866874520165839e-05, + "loss": 5.1622, + "step": 17546 + }, + { + "epoch": 0.10435697973165858, + "grad_norm": 1.8969347476959229, + "learning_rate": 4.866859480572424e-05, + "loss": 5.0091, + "step": 17547 + }, + { + "epoch": 0.10436292701493957, + "grad_norm": 1.6737502813339233, + "learning_rate": 4.8668444401527644e-05, + "loss": 5.7552, + "step": 17548 + }, + { + "epoch": 0.10436887429822057, + "grad_norm": 1.793411374092102, + "learning_rate": 4.8668293989068626e-05, + "loss": 5.7963, + "step": 17549 + }, + { + "epoch": 0.10437482158150158, + "grad_norm": 1.8675566911697388, + "learning_rate": 4.866814356834725e-05, + "loss": 4.7389, + "step": 17550 + }, + { + "epoch": 0.10438076886478256, + "grad_norm": 1.9145622253417969, + "learning_rate": 4.8667993139363574e-05, + "loss": 5.0921, + "step": 17551 + }, + { + "epoch": 0.10438671614806357, + "grad_norm": 1.6751158237457275, + "learning_rate": 4.866784270211764e-05, + "loss": 5.5547, + "step": 17552 + }, + { + "epoch": 0.10439266343134457, + "grad_norm": 1.754550576210022, + "learning_rate": 4.866769225660951e-05, + "loss": 5.6077, + "step": 17553 + }, + { + "epoch": 0.10439861071462556, + "grad_norm": 2.0323402881622314, + "learning_rate": 4.866754180283924e-05, + "loss": 5.1191, + "step": 17554 + }, + { + "epoch": 0.10440455799790656, + "grad_norm": 1.8000339269638062, + "learning_rate": 4.866739134080687e-05, + "loss": 5.1533, + "step": 17555 + }, + { + "epoch": 0.10441050528118756, + "grad_norm": 2.053093671798706, + "learning_rate": 4.866724087051245e-05, + "loss": 4.9985, + "step": 17556 + }, + { + "epoch": 0.10441645256446855, + "grad_norm": 1.6764185428619385, + "learning_rate": 4.866709039195605e-05, + "loss": 4.9674, + "step": 17557 + }, + { + "epoch": 0.10442239984774955, + "grad_norm": 1.6942695379257202, + "learning_rate": 4.866693990513772e-05, + "loss": 4.9319, + "step": 17558 + }, + { + "epoch": 0.10442834713103054, + "grad_norm": 1.5124322175979614, + "learning_rate": 4.8666789410057496e-05, + "loss": 5.1371, + "step": 17559 + }, + { + "epoch": 0.10443429441431154, + "grad_norm": 1.925757646560669, + "learning_rate": 4.866663890671545e-05, + "loss": 4.6366, + "step": 17560 + }, + { + "epoch": 0.10444024169759254, + "grad_norm": 2.0077321529388428, + "learning_rate": 4.866648839511161e-05, + "loss": 4.9993, + "step": 17561 + }, + { + "epoch": 0.10444618898087353, + "grad_norm": 2.1986982822418213, + "learning_rate": 4.866633787524605e-05, + "loss": 4.814, + "step": 17562 + }, + { + "epoch": 0.10445213626415453, + "grad_norm": 1.9967917203903198, + "learning_rate": 4.866618734711882e-05, + "loss": 4.5182, + "step": 17563 + }, + { + "epoch": 0.10445808354743553, + "grad_norm": 1.7663863897323608, + "learning_rate": 4.8666036810729965e-05, + "loss": 4.5589, + "step": 17564 + }, + { + "epoch": 0.10446403083071652, + "grad_norm": 1.7784098386764526, + "learning_rate": 4.8665886266079537e-05, + "loss": 4.6739, + "step": 17565 + }, + { + "epoch": 0.10446997811399752, + "grad_norm": 1.7143903970718384, + "learning_rate": 4.8665735713167596e-05, + "loss": 4.8434, + "step": 17566 + }, + { + "epoch": 0.10447592539727853, + "grad_norm": 2.018825054168701, + "learning_rate": 4.866558515199419e-05, + "loss": 4.5235, + "step": 17567 + }, + { + "epoch": 0.10448187268055951, + "grad_norm": 2.1135973930358887, + "learning_rate": 4.8665434582559374e-05, + "loss": 4.5048, + "step": 17568 + }, + { + "epoch": 0.10448781996384052, + "grad_norm": 2.097177028656006, + "learning_rate": 4.86652840048632e-05, + "loss": 4.7811, + "step": 17569 + }, + { + "epoch": 0.10449376724712152, + "grad_norm": 2.054049015045166, + "learning_rate": 4.866513341890572e-05, + "loss": 4.5964, + "step": 17570 + }, + { + "epoch": 0.10449971453040251, + "grad_norm": 1.9631117582321167, + "learning_rate": 4.866498282468699e-05, + "loss": 4.4055, + "step": 17571 + }, + { + "epoch": 0.10450566181368351, + "grad_norm": 2.079071521759033, + "learning_rate": 4.8664832222207055e-05, + "loss": 4.3743, + "step": 17572 + }, + { + "epoch": 0.10451160909696451, + "grad_norm": 1.8425450325012207, + "learning_rate": 4.8664681611465966e-05, + "loss": 4.411, + "step": 17573 + }, + { + "epoch": 0.1045175563802455, + "grad_norm": 1.812538743019104, + "learning_rate": 4.866453099246379e-05, + "loss": 4.3496, + "step": 17574 + }, + { + "epoch": 0.1045235036635265, + "grad_norm": 1.8823848962783813, + "learning_rate": 4.8664380365200566e-05, + "loss": 4.3613, + "step": 17575 + }, + { + "epoch": 0.1045294509468075, + "grad_norm": 1.6085865497589111, + "learning_rate": 4.8664229729676356e-05, + "loss": 4.5187, + "step": 17576 + }, + { + "epoch": 0.10453539823008849, + "grad_norm": 1.8719606399536133, + "learning_rate": 4.8664079085891204e-05, + "loss": 4.7276, + "step": 17577 + }, + { + "epoch": 0.1045413455133695, + "grad_norm": 1.7630116939544678, + "learning_rate": 4.866392843384517e-05, + "loss": 4.3749, + "step": 17578 + }, + { + "epoch": 0.1045472927966505, + "grad_norm": 1.8641449213027954, + "learning_rate": 4.86637777735383e-05, + "loss": 4.5781, + "step": 17579 + }, + { + "epoch": 0.10455324007993148, + "grad_norm": 1.8178362846374512, + "learning_rate": 4.8663627104970645e-05, + "loss": 4.3217, + "step": 17580 + }, + { + "epoch": 0.10455918736321249, + "grad_norm": 1.7655141353607178, + "learning_rate": 4.866347642814228e-05, + "loss": 4.4972, + "step": 17581 + }, + { + "epoch": 0.10456513464649349, + "grad_norm": 1.843266248703003, + "learning_rate": 4.8663325743053216e-05, + "loss": 4.5214, + "step": 17582 + }, + { + "epoch": 0.10457108192977448, + "grad_norm": 1.8023161888122559, + "learning_rate": 4.866317504970354e-05, + "loss": 4.3205, + "step": 17583 + }, + { + "epoch": 0.10457702921305548, + "grad_norm": 1.7845708131790161, + "learning_rate": 4.8663024348093296e-05, + "loss": 4.1439, + "step": 17584 + }, + { + "epoch": 0.10458297649633648, + "grad_norm": 2.0029754638671875, + "learning_rate": 4.866287363822253e-05, + "loss": 4.4627, + "step": 17585 + }, + { + "epoch": 0.10458892377961747, + "grad_norm": 1.6008789539337158, + "learning_rate": 4.8662722920091305e-05, + "loss": 4.5539, + "step": 17586 + }, + { + "epoch": 0.10459487106289847, + "grad_norm": 1.884207844734192, + "learning_rate": 4.8662572193699664e-05, + "loss": 4.1132, + "step": 17587 + }, + { + "epoch": 0.10460081834617946, + "grad_norm": 1.7014282941818237, + "learning_rate": 4.866242145904767e-05, + "loss": 4.9612, + "step": 17588 + }, + { + "epoch": 0.10460676562946046, + "grad_norm": 1.7388410568237305, + "learning_rate": 4.8662270716135364e-05, + "loss": 5.3079, + "step": 17589 + }, + { + "epoch": 0.10461271291274146, + "grad_norm": 1.6414510011672974, + "learning_rate": 4.8662119964962805e-05, + "loss": 5.5816, + "step": 17590 + }, + { + "epoch": 0.10461866019602245, + "grad_norm": 1.4039387702941895, + "learning_rate": 4.866196920553004e-05, + "loss": 5.0036, + "step": 17591 + }, + { + "epoch": 0.10462460747930345, + "grad_norm": 1.7621723413467407, + "learning_rate": 4.866181843783712e-05, + "loss": 5.3461, + "step": 17592 + }, + { + "epoch": 0.10463055476258445, + "grad_norm": 1.4525210857391357, + "learning_rate": 4.866166766188412e-05, + "loss": 5.2897, + "step": 17593 + }, + { + "epoch": 0.10463650204586544, + "grad_norm": 1.4203788042068481, + "learning_rate": 4.866151687767107e-05, + "loss": 5.2506, + "step": 17594 + }, + { + "epoch": 0.10464244932914644, + "grad_norm": 1.419097900390625, + "learning_rate": 4.866136608519803e-05, + "loss": 5.246, + "step": 17595 + }, + { + "epoch": 0.10464839661242745, + "grad_norm": 1.8866242170333862, + "learning_rate": 4.8661215284465047e-05, + "loss": 5.5259, + "step": 17596 + }, + { + "epoch": 0.10465434389570843, + "grad_norm": 1.5161887407302856, + "learning_rate": 4.866106447547218e-05, + "loss": 5.2219, + "step": 17597 + }, + { + "epoch": 0.10466029117898944, + "grad_norm": 1.3552051782608032, + "learning_rate": 4.866091365821948e-05, + "loss": 4.9473, + "step": 17598 + }, + { + "epoch": 0.10466623846227044, + "grad_norm": 1.3443762063980103, + "learning_rate": 4.8660762832707e-05, + "loss": 5.0027, + "step": 17599 + }, + { + "epoch": 0.10467218574555143, + "grad_norm": 1.5657448768615723, + "learning_rate": 4.866061199893479e-05, + "loss": 5.3873, + "step": 17600 + }, + { + "epoch": 0.10467813302883243, + "grad_norm": 1.177984595298767, + "learning_rate": 4.866046115690291e-05, + "loss": 4.8628, + "step": 17601 + }, + { + "epoch": 0.10468408031211343, + "grad_norm": 1.1911925077438354, + "learning_rate": 4.8660310306611405e-05, + "loss": 4.7862, + "step": 17602 + }, + { + "epoch": 0.10469002759539442, + "grad_norm": 1.238619327545166, + "learning_rate": 4.866015944806033e-05, + "loss": 4.6844, + "step": 17603 + }, + { + "epoch": 0.10469597487867542, + "grad_norm": 1.4151804447174072, + "learning_rate": 4.8660008581249736e-05, + "loss": 4.7824, + "step": 17604 + }, + { + "epoch": 0.10470192216195642, + "grad_norm": 1.1852803230285645, + "learning_rate": 4.8659857706179676e-05, + "loss": 4.8358, + "step": 17605 + }, + { + "epoch": 0.10470786944523741, + "grad_norm": 1.2641617059707642, + "learning_rate": 4.865970682285022e-05, + "loss": 4.688, + "step": 17606 + }, + { + "epoch": 0.10471381672851841, + "grad_norm": 1.3711220026016235, + "learning_rate": 4.865955593126138e-05, + "loss": 4.6552, + "step": 17607 + }, + { + "epoch": 0.10471976401179942, + "grad_norm": 1.5641502141952515, + "learning_rate": 4.865940503141325e-05, + "loss": 5.0781, + "step": 17608 + }, + { + "epoch": 0.1047257112950804, + "grad_norm": 1.5290453433990479, + "learning_rate": 4.865925412330586e-05, + "loss": 5.1347, + "step": 17609 + }, + { + "epoch": 0.1047316585783614, + "grad_norm": 1.6220836639404297, + "learning_rate": 4.8659103206939275e-05, + "loss": 5.2943, + "step": 17610 + }, + { + "epoch": 0.10473760586164241, + "grad_norm": 1.4212614297866821, + "learning_rate": 4.865895228231353e-05, + "loss": 5.2939, + "step": 17611 + }, + { + "epoch": 0.1047435531449234, + "grad_norm": 1.4920703172683716, + "learning_rate": 4.8658801349428696e-05, + "loss": 5.3314, + "step": 17612 + }, + { + "epoch": 0.1047495004282044, + "grad_norm": 1.4596521854400635, + "learning_rate": 4.865865040828482e-05, + "loss": 5.3082, + "step": 17613 + }, + { + "epoch": 0.1047554477114854, + "grad_norm": 1.2887258529663086, + "learning_rate": 4.865849945888195e-05, + "loss": 5.1002, + "step": 17614 + }, + { + "epoch": 0.10476139499476639, + "grad_norm": 1.3587419986724854, + "learning_rate": 4.8658348501220145e-05, + "loss": 4.9773, + "step": 17615 + }, + { + "epoch": 0.10476734227804739, + "grad_norm": 1.5476746559143066, + "learning_rate": 4.865819753529945e-05, + "loss": 5.0726, + "step": 17616 + }, + { + "epoch": 0.10477328956132839, + "grad_norm": 1.2820343971252441, + "learning_rate": 4.865804656111993e-05, + "loss": 5.0708, + "step": 17617 + }, + { + "epoch": 0.10477923684460938, + "grad_norm": 1.5396101474761963, + "learning_rate": 4.8657895578681634e-05, + "loss": 5.087, + "step": 17618 + }, + { + "epoch": 0.10478518412789038, + "grad_norm": 1.9199161529541016, + "learning_rate": 4.86577445879846e-05, + "loss": 4.9402, + "step": 17619 + }, + { + "epoch": 0.10479113141117137, + "grad_norm": 1.6283903121948242, + "learning_rate": 4.8657593589028894e-05, + "loss": 5.2045, + "step": 17620 + }, + { + "epoch": 0.10479707869445237, + "grad_norm": 1.350632905960083, + "learning_rate": 4.865744258181457e-05, + "loss": 5.2314, + "step": 17621 + }, + { + "epoch": 0.10480302597773337, + "grad_norm": 1.5528992414474487, + "learning_rate": 4.865729156634168e-05, + "loss": 4.9361, + "step": 17622 + }, + { + "epoch": 0.10480897326101436, + "grad_norm": 1.4698718786239624, + "learning_rate": 4.865714054261027e-05, + "loss": 5.6547, + "step": 17623 + }, + { + "epoch": 0.10481492054429536, + "grad_norm": 1.2905457019805908, + "learning_rate": 4.86569895106204e-05, + "loss": 5.5628, + "step": 17624 + }, + { + "epoch": 0.10482086782757637, + "grad_norm": 1.2559312582015991, + "learning_rate": 4.8656838470372116e-05, + "loss": 5.3106, + "step": 17625 + }, + { + "epoch": 0.10482681511085735, + "grad_norm": 1.2229273319244385, + "learning_rate": 4.8656687421865466e-05, + "loss": 5.1566, + "step": 17626 + }, + { + "epoch": 0.10483276239413836, + "grad_norm": 1.4148969650268555, + "learning_rate": 4.8656536365100524e-05, + "loss": 5.1785, + "step": 17627 + }, + { + "epoch": 0.10483870967741936, + "grad_norm": 1.4109671115875244, + "learning_rate": 4.865638530007732e-05, + "loss": 4.922, + "step": 17628 + }, + { + "epoch": 0.10484465696070035, + "grad_norm": 1.526160478591919, + "learning_rate": 4.865623422679593e-05, + "loss": 5.0734, + "step": 17629 + }, + { + "epoch": 0.10485060424398135, + "grad_norm": 1.5093508958816528, + "learning_rate": 4.865608314525638e-05, + "loss": 5.1926, + "step": 17630 + }, + { + "epoch": 0.10485655152726235, + "grad_norm": 1.4625009298324585, + "learning_rate": 4.8655932055458734e-05, + "loss": 5.1372, + "step": 17631 + }, + { + "epoch": 0.10486249881054334, + "grad_norm": 1.348502516746521, + "learning_rate": 4.865578095740305e-05, + "loss": 5.0275, + "step": 17632 + }, + { + "epoch": 0.10486844609382434, + "grad_norm": 1.4530283212661743, + "learning_rate": 4.865562985108938e-05, + "loss": 5.093, + "step": 17633 + }, + { + "epoch": 0.10487439337710534, + "grad_norm": 1.4871639013290405, + "learning_rate": 4.865547873651778e-05, + "loss": 5.0789, + "step": 17634 + }, + { + "epoch": 0.10488034066038633, + "grad_norm": 1.2314977645874023, + "learning_rate": 4.865532761368828e-05, + "loss": 5.0966, + "step": 17635 + }, + { + "epoch": 0.10488628794366733, + "grad_norm": 1.3988053798675537, + "learning_rate": 4.865517648260097e-05, + "loss": 5.4284, + "step": 17636 + }, + { + "epoch": 0.10489223522694834, + "grad_norm": 1.3434901237487793, + "learning_rate": 4.865502534325587e-05, + "loss": 5.3563, + "step": 17637 + }, + { + "epoch": 0.10489818251022932, + "grad_norm": 1.3380807638168335, + "learning_rate": 4.865487419565305e-05, + "loss": 5.3628, + "step": 17638 + }, + { + "epoch": 0.10490412979351033, + "grad_norm": 1.5222781896591187, + "learning_rate": 4.865472303979255e-05, + "loss": 5.2164, + "step": 17639 + }, + { + "epoch": 0.10491007707679133, + "grad_norm": 1.2916938066482544, + "learning_rate": 4.865457187567444e-05, + "loss": 5.1248, + "step": 17640 + }, + { + "epoch": 0.10491602436007232, + "grad_norm": 1.4988411664962769, + "learning_rate": 4.8654420703298755e-05, + "loss": 5.0932, + "step": 17641 + }, + { + "epoch": 0.10492197164335332, + "grad_norm": 1.2529023885726929, + "learning_rate": 4.8654269522665564e-05, + "loss": 5.1465, + "step": 17642 + }, + { + "epoch": 0.10492791892663432, + "grad_norm": 1.3913809061050415, + "learning_rate": 4.86541183337749e-05, + "loss": 5.0039, + "step": 17643 + }, + { + "epoch": 0.10493386620991531, + "grad_norm": 1.5128841400146484, + "learning_rate": 4.8653967136626836e-05, + "loss": 4.9937, + "step": 17644 + }, + { + "epoch": 0.10493981349319631, + "grad_norm": 1.3300340175628662, + "learning_rate": 4.865381593122142e-05, + "loss": 5.0521, + "step": 17645 + }, + { + "epoch": 0.10494576077647731, + "grad_norm": 1.6548517942428589, + "learning_rate": 4.86536647175587e-05, + "loss": 5.1361, + "step": 17646 + }, + { + "epoch": 0.1049517080597583, + "grad_norm": 1.2479137182235718, + "learning_rate": 4.865351349563873e-05, + "loss": 5.3129, + "step": 17647 + }, + { + "epoch": 0.1049576553430393, + "grad_norm": 1.3804575204849243, + "learning_rate": 4.8653362265461556e-05, + "loss": 4.9891, + "step": 17648 + }, + { + "epoch": 0.10496360262632029, + "grad_norm": 1.2821561098098755, + "learning_rate": 4.865321102702724e-05, + "loss": 5.0255, + "step": 17649 + }, + { + "epoch": 0.10496954990960129, + "grad_norm": 1.5715882778167725, + "learning_rate": 4.865305978033583e-05, + "loss": 4.9897, + "step": 17650 + }, + { + "epoch": 0.1049754971928823, + "grad_norm": 1.5910687446594238, + "learning_rate": 4.865290852538738e-05, + "loss": 5.1387, + "step": 17651 + }, + { + "epoch": 0.10498144447616328, + "grad_norm": 1.4188683032989502, + "learning_rate": 4.865275726218196e-05, + "loss": 5.3502, + "step": 17652 + }, + { + "epoch": 0.10498739175944428, + "grad_norm": 1.6032958030700684, + "learning_rate": 4.8652605990719594e-05, + "loss": 5.2716, + "step": 17653 + }, + { + "epoch": 0.10499333904272529, + "grad_norm": 1.4894942045211792, + "learning_rate": 4.8652454711000353e-05, + "loss": 5.237, + "step": 17654 + }, + { + "epoch": 0.10499928632600627, + "grad_norm": 1.5370794534683228, + "learning_rate": 4.8652303423024276e-05, + "loss": 5.0227, + "step": 17655 + }, + { + "epoch": 0.10500523360928728, + "grad_norm": 1.4100168943405151, + "learning_rate": 4.865215212679143e-05, + "loss": 5.0713, + "step": 17656 + }, + { + "epoch": 0.10501118089256828, + "grad_norm": 1.6180533170700073, + "learning_rate": 4.8652000822301856e-05, + "loss": 5.2041, + "step": 17657 + }, + { + "epoch": 0.10501712817584927, + "grad_norm": 1.2447609901428223, + "learning_rate": 4.865184950955562e-05, + "loss": 5.1073, + "step": 17658 + }, + { + "epoch": 0.10502307545913027, + "grad_norm": 1.4866548776626587, + "learning_rate": 4.865169818855277e-05, + "loss": 5.1287, + "step": 17659 + }, + { + "epoch": 0.10502902274241127, + "grad_norm": 1.33426034450531, + "learning_rate": 4.865154685929335e-05, + "loss": 5.1343, + "step": 17660 + }, + { + "epoch": 0.10503497002569226, + "grad_norm": 1.122551679611206, + "learning_rate": 4.865139552177742e-05, + "loss": 5.1267, + "step": 17661 + }, + { + "epoch": 0.10504091730897326, + "grad_norm": 1.787278175354004, + "learning_rate": 4.865124417600504e-05, + "loss": 5.4828, + "step": 17662 + }, + { + "epoch": 0.10504686459225426, + "grad_norm": 1.4937405586242676, + "learning_rate": 4.8651092821976246e-05, + "loss": 5.3467, + "step": 17663 + }, + { + "epoch": 0.10505281187553525, + "grad_norm": 1.395286202430725, + "learning_rate": 4.86509414596911e-05, + "loss": 5.1552, + "step": 17664 + }, + { + "epoch": 0.10505875915881625, + "grad_norm": 1.5284260511398315, + "learning_rate": 4.865079008914965e-05, + "loss": 5.2718, + "step": 17665 + }, + { + "epoch": 0.10506470644209726, + "grad_norm": 2.0051753520965576, + "learning_rate": 4.865063871035197e-05, + "loss": 5.1121, + "step": 17666 + }, + { + "epoch": 0.10507065372537824, + "grad_norm": 1.690699577331543, + "learning_rate": 4.8650487323298085e-05, + "loss": 5.1091, + "step": 17667 + }, + { + "epoch": 0.10507660100865925, + "grad_norm": 1.5275843143463135, + "learning_rate": 4.865033592798807e-05, + "loss": 5.3064, + "step": 17668 + }, + { + "epoch": 0.10508254829194025, + "grad_norm": 1.584038496017456, + "learning_rate": 4.865018452442195e-05, + "loss": 5.2598, + "step": 17669 + }, + { + "epoch": 0.10508849557522124, + "grad_norm": 1.8086310625076294, + "learning_rate": 4.865003311259981e-05, + "loss": 5.2229, + "step": 17670 + }, + { + "epoch": 0.10509444285850224, + "grad_norm": 1.805972695350647, + "learning_rate": 4.864988169252168e-05, + "loss": 5.1051, + "step": 17671 + }, + { + "epoch": 0.10510039014178324, + "grad_norm": 1.6209838390350342, + "learning_rate": 4.864973026418762e-05, + "loss": 5.1808, + "step": 17672 + }, + { + "epoch": 0.10510633742506423, + "grad_norm": 1.3997793197631836, + "learning_rate": 4.8649578827597684e-05, + "loss": 4.9167, + "step": 17673 + }, + { + "epoch": 0.10511228470834523, + "grad_norm": 1.368037462234497, + "learning_rate": 4.8649427382751925e-05, + "loss": 4.98, + "step": 17674 + }, + { + "epoch": 0.10511823199162623, + "grad_norm": 1.3904718160629272, + "learning_rate": 4.864927592965039e-05, + "loss": 4.8101, + "step": 17675 + }, + { + "epoch": 0.10512417927490722, + "grad_norm": 1.3237133026123047, + "learning_rate": 4.864912446829315e-05, + "loss": 5.1427, + "step": 17676 + }, + { + "epoch": 0.10513012655818822, + "grad_norm": 1.2642048597335815, + "learning_rate": 4.864897299868024e-05, + "loss": 5.2961, + "step": 17677 + }, + { + "epoch": 0.10513607384146921, + "grad_norm": 1.4357531070709229, + "learning_rate": 4.864882152081172e-05, + "loss": 5.4811, + "step": 17678 + }, + { + "epoch": 0.10514202112475021, + "grad_norm": 1.652321696281433, + "learning_rate": 4.864867003468763e-05, + "loss": 5.2172, + "step": 17679 + }, + { + "epoch": 0.10514796840803121, + "grad_norm": 1.6143925189971924, + "learning_rate": 4.864851854030804e-05, + "loss": 4.9856, + "step": 17680 + }, + { + "epoch": 0.1051539156913122, + "grad_norm": 1.637320637702942, + "learning_rate": 4.8648367037673e-05, + "loss": 4.9458, + "step": 17681 + }, + { + "epoch": 0.1051598629745932, + "grad_norm": 1.650970458984375, + "learning_rate": 4.864821552678256e-05, + "loss": 4.714, + "step": 17682 + }, + { + "epoch": 0.1051658102578742, + "grad_norm": 1.616098403930664, + "learning_rate": 4.864806400763676e-05, + "loss": 4.7064, + "step": 17683 + }, + { + "epoch": 0.1051717575411552, + "grad_norm": 1.6400461196899414, + "learning_rate": 4.864791248023568e-05, + "loss": 4.5955, + "step": 17684 + }, + { + "epoch": 0.1051777048244362, + "grad_norm": 1.3815523386001587, + "learning_rate": 4.8647760944579344e-05, + "loss": 4.7491, + "step": 17685 + }, + { + "epoch": 0.1051836521077172, + "grad_norm": 1.5695693492889404, + "learning_rate": 4.864760940066783e-05, + "loss": 4.6242, + "step": 17686 + }, + { + "epoch": 0.10518959939099819, + "grad_norm": 1.5861409902572632, + "learning_rate": 4.8647457848501174e-05, + "loss": 4.5859, + "step": 17687 + }, + { + "epoch": 0.10519554667427919, + "grad_norm": 1.637741208076477, + "learning_rate": 4.864730628807944e-05, + "loss": 4.6572, + "step": 17688 + }, + { + "epoch": 0.10520149395756019, + "grad_norm": 1.5806957483291626, + "learning_rate": 4.864715471940268e-05, + "loss": 4.8879, + "step": 17689 + }, + { + "epoch": 0.10520744124084118, + "grad_norm": 2.0158286094665527, + "learning_rate": 4.864700314247093e-05, + "loss": 5.5019, + "step": 17690 + }, + { + "epoch": 0.10521338852412218, + "grad_norm": 1.5022921562194824, + "learning_rate": 4.8646851557284256e-05, + "loss": 5.2029, + "step": 17691 + }, + { + "epoch": 0.10521933580740318, + "grad_norm": 1.8164446353912354, + "learning_rate": 4.864669996384272e-05, + "loss": 4.9258, + "step": 17692 + }, + { + "epoch": 0.10522528309068417, + "grad_norm": 1.6789724826812744, + "learning_rate": 4.864654836214636e-05, + "loss": 5.0876, + "step": 17693 + }, + { + "epoch": 0.10523123037396517, + "grad_norm": 1.778971552848816, + "learning_rate": 4.864639675219523e-05, + "loss": 5.1052, + "step": 17694 + }, + { + "epoch": 0.10523717765724618, + "grad_norm": 1.2401436567306519, + "learning_rate": 4.8646245133989396e-05, + "loss": 5.2536, + "step": 17695 + }, + { + "epoch": 0.10524312494052716, + "grad_norm": 1.6509275436401367, + "learning_rate": 4.8646093507528904e-05, + "loss": 4.9215, + "step": 17696 + }, + { + "epoch": 0.10524907222380817, + "grad_norm": 1.3725727796554565, + "learning_rate": 4.864594187281379e-05, + "loss": 5.5578, + "step": 17697 + }, + { + "epoch": 0.10525501950708917, + "grad_norm": 1.481040358543396, + "learning_rate": 4.864579022984413e-05, + "loss": 5.4683, + "step": 17698 + }, + { + "epoch": 0.10526096679037016, + "grad_norm": 1.4682444334030151, + "learning_rate": 4.864563857861998e-05, + "loss": 5.5076, + "step": 17699 + }, + { + "epoch": 0.10526691407365116, + "grad_norm": 1.2660551071166992, + "learning_rate": 4.864548691914137e-05, + "loss": 5.6092, + "step": 17700 + }, + { + "epoch": 0.10527286135693216, + "grad_norm": 1.266858458518982, + "learning_rate": 4.8645335251408366e-05, + "loss": 5.4373, + "step": 17701 + }, + { + "epoch": 0.10527880864021315, + "grad_norm": 1.5075262784957886, + "learning_rate": 4.8645183575421024e-05, + "loss": 5.3651, + "step": 17702 + }, + { + "epoch": 0.10528475592349415, + "grad_norm": 1.6108607053756714, + "learning_rate": 4.864503189117939e-05, + "loss": 5.3372, + "step": 17703 + }, + { + "epoch": 0.10529070320677515, + "grad_norm": 1.677874207496643, + "learning_rate": 4.8644880198683515e-05, + "loss": 4.9378, + "step": 17704 + }, + { + "epoch": 0.10529665049005614, + "grad_norm": 1.5847524404525757, + "learning_rate": 4.864472849793346e-05, + "loss": 5.2918, + "step": 17705 + }, + { + "epoch": 0.10530259777333714, + "grad_norm": 1.598244309425354, + "learning_rate": 4.864457678892927e-05, + "loss": 5.2408, + "step": 17706 + }, + { + "epoch": 0.10530854505661813, + "grad_norm": 1.4147340059280396, + "learning_rate": 4.8644425071671015e-05, + "loss": 5.2856, + "step": 17707 + }, + { + "epoch": 0.10531449233989913, + "grad_norm": 1.6057299375534058, + "learning_rate": 4.8644273346158734e-05, + "loss": 5.343, + "step": 17708 + }, + { + "epoch": 0.10532043962318013, + "grad_norm": 1.3503344058990479, + "learning_rate": 4.864412161239247e-05, + "loss": 5.4081, + "step": 17709 + }, + { + "epoch": 0.10532638690646112, + "grad_norm": 1.8316742181777954, + "learning_rate": 4.8643969870372295e-05, + "loss": 4.7925, + "step": 17710 + }, + { + "epoch": 0.10533233418974212, + "grad_norm": 2.1429593563079834, + "learning_rate": 4.864381812009825e-05, + "loss": 4.3519, + "step": 17711 + }, + { + "epoch": 0.10533828147302313, + "grad_norm": 1.9665764570236206, + "learning_rate": 4.8643666361570396e-05, + "loss": 4.388, + "step": 17712 + }, + { + "epoch": 0.10534422875630411, + "grad_norm": 1.7851755619049072, + "learning_rate": 4.864351459478878e-05, + "loss": 4.5242, + "step": 17713 + }, + { + "epoch": 0.10535017603958512, + "grad_norm": 1.8347305059432983, + "learning_rate": 4.864336281975346e-05, + "loss": 4.166, + "step": 17714 + }, + { + "epoch": 0.10535612332286612, + "grad_norm": 1.9413511753082275, + "learning_rate": 4.864321103646449e-05, + "loss": 4.0937, + "step": 17715 + }, + { + "epoch": 0.1053620706061471, + "grad_norm": 1.8122237920761108, + "learning_rate": 4.8643059244921904e-05, + "loss": 4.3812, + "step": 17716 + }, + { + "epoch": 0.10536801788942811, + "grad_norm": 2.0114996433258057, + "learning_rate": 4.864290744512578e-05, + "loss": 4.0728, + "step": 17717 + }, + { + "epoch": 0.10537396517270911, + "grad_norm": 1.8565599918365479, + "learning_rate": 4.8642755637076165e-05, + "loss": 4.2625, + "step": 17718 + }, + { + "epoch": 0.1053799124559901, + "grad_norm": 1.9136046171188354, + "learning_rate": 4.8642603820773105e-05, + "loss": 4.4933, + "step": 17719 + }, + { + "epoch": 0.1053858597392711, + "grad_norm": 1.8930033445358276, + "learning_rate": 4.864245199621666e-05, + "loss": 4.3249, + "step": 17720 + }, + { + "epoch": 0.1053918070225521, + "grad_norm": 1.7729578018188477, + "learning_rate": 4.864230016340687e-05, + "loss": 4.4736, + "step": 17721 + }, + { + "epoch": 0.10539775430583309, + "grad_norm": 2.1663360595703125, + "learning_rate": 4.864214832234381e-05, + "loss": 4.7505, + "step": 17722 + }, + { + "epoch": 0.1054037015891141, + "grad_norm": 1.9864879846572876, + "learning_rate": 4.864199647302751e-05, + "loss": 4.7233, + "step": 17723 + }, + { + "epoch": 0.1054096488723951, + "grad_norm": 2.031329870223999, + "learning_rate": 4.8641844615458035e-05, + "loss": 4.8218, + "step": 17724 + }, + { + "epoch": 0.10541559615567608, + "grad_norm": 2.0325984954833984, + "learning_rate": 4.864169274963544e-05, + "loss": 4.9383, + "step": 17725 + }, + { + "epoch": 0.10542154343895709, + "grad_norm": 1.9482324123382568, + "learning_rate": 4.864154087555977e-05, + "loss": 5.0849, + "step": 17726 + }, + { + "epoch": 0.10542749072223809, + "grad_norm": 1.6887640953063965, + "learning_rate": 4.864138899323108e-05, + "loss": 5.0216, + "step": 17727 + }, + { + "epoch": 0.10543343800551908, + "grad_norm": 2.0226924419403076, + "learning_rate": 4.864123710264944e-05, + "loss": 4.9241, + "step": 17728 + }, + { + "epoch": 0.10543938528880008, + "grad_norm": 1.647629976272583, + "learning_rate": 4.8641085203814873e-05, + "loss": 5.0318, + "step": 17729 + }, + { + "epoch": 0.10544533257208108, + "grad_norm": 1.766290545463562, + "learning_rate": 4.864093329672745e-05, + "loss": 4.9034, + "step": 17730 + }, + { + "epoch": 0.10545127985536207, + "grad_norm": 1.7573658227920532, + "learning_rate": 4.864078138138723e-05, + "loss": 4.7783, + "step": 17731 + }, + { + "epoch": 0.10545722713864307, + "grad_norm": 1.5503767728805542, + "learning_rate": 4.864062945779425e-05, + "loss": 5.1085, + "step": 17732 + }, + { + "epoch": 0.10546317442192407, + "grad_norm": 1.7276320457458496, + "learning_rate": 4.864047752594857e-05, + "loss": 4.8028, + "step": 17733 + }, + { + "epoch": 0.10546912170520506, + "grad_norm": 1.9654134511947632, + "learning_rate": 4.864032558585024e-05, + "loss": 5.1221, + "step": 17734 + }, + { + "epoch": 0.10547506898848606, + "grad_norm": 1.9654512405395508, + "learning_rate": 4.864017363749933e-05, + "loss": 5.0463, + "step": 17735 + }, + { + "epoch": 0.10548101627176705, + "grad_norm": 1.9071869850158691, + "learning_rate": 4.864002168089587e-05, + "loss": 5.0822, + "step": 17736 + }, + { + "epoch": 0.10548696355504805, + "grad_norm": 2.4190056324005127, + "learning_rate": 4.863986971603993e-05, + "loss": 5.7404, + "step": 17737 + }, + { + "epoch": 0.10549291083832905, + "grad_norm": 2.2098371982574463, + "learning_rate": 4.863971774293155e-05, + "loss": 5.9282, + "step": 17738 + }, + { + "epoch": 0.10549885812161004, + "grad_norm": 2.569831132888794, + "learning_rate": 4.8639565761570784e-05, + "loss": 4.3309, + "step": 17739 + }, + { + "epoch": 0.10550480540489104, + "grad_norm": 2.252847909927368, + "learning_rate": 4.8639413771957696e-05, + "loss": 4.185, + "step": 17740 + }, + { + "epoch": 0.10551075268817205, + "grad_norm": 2.3022215366363525, + "learning_rate": 4.8639261774092325e-05, + "loss": 4.3537, + "step": 17741 + }, + { + "epoch": 0.10551669997145303, + "grad_norm": 2.2695138454437256, + "learning_rate": 4.8639109767974745e-05, + "loss": 3.9806, + "step": 17742 + }, + { + "epoch": 0.10552264725473404, + "grad_norm": 2.1722588539123535, + "learning_rate": 4.8638957753604985e-05, + "loss": 3.9803, + "step": 17743 + }, + { + "epoch": 0.10552859453801504, + "grad_norm": 2.4385933876037598, + "learning_rate": 4.863880573098312e-05, + "loss": 4.0148, + "step": 17744 + }, + { + "epoch": 0.10553454182129603, + "grad_norm": 2.3186235427856445, + "learning_rate": 4.8638653700109184e-05, + "loss": 3.979, + "step": 17745 + }, + { + "epoch": 0.10554048910457703, + "grad_norm": 2.4591264724731445, + "learning_rate": 4.863850166098324e-05, + "loss": 3.9258, + "step": 17746 + }, + { + "epoch": 0.10554643638785803, + "grad_norm": 2.2619590759277344, + "learning_rate": 4.8638349613605336e-05, + "loss": 4.0571, + "step": 17747 + }, + { + "epoch": 0.10555238367113902, + "grad_norm": 2.393226146697998, + "learning_rate": 4.863819755797553e-05, + "loss": 4.0036, + "step": 17748 + }, + { + "epoch": 0.10555833095442002, + "grad_norm": 2.281846046447754, + "learning_rate": 4.8638045494093875e-05, + "loss": 3.9382, + "step": 17749 + }, + { + "epoch": 0.10556427823770102, + "grad_norm": 2.165407657623291, + "learning_rate": 4.8637893421960425e-05, + "loss": 4.0204, + "step": 17750 + }, + { + "epoch": 0.10557022552098201, + "grad_norm": 2.131829261779785, + "learning_rate": 4.863774134157523e-05, + "loss": 4.8661, + "step": 17751 + }, + { + "epoch": 0.10557617280426301, + "grad_norm": 2.0619029998779297, + "learning_rate": 4.863758925293834e-05, + "loss": 5.5522, + "step": 17752 + }, + { + "epoch": 0.10558212008754402, + "grad_norm": 1.6535427570343018, + "learning_rate": 4.863743715604981e-05, + "loss": 5.3463, + "step": 17753 + }, + { + "epoch": 0.105588067370825, + "grad_norm": 1.903904676437378, + "learning_rate": 4.86372850509097e-05, + "loss": 5.7202, + "step": 17754 + }, + { + "epoch": 0.105594014654106, + "grad_norm": 1.649357557296753, + "learning_rate": 4.863713293751806e-05, + "loss": 5.577, + "step": 17755 + }, + { + "epoch": 0.10559996193738701, + "grad_norm": 2.0812721252441406, + "learning_rate": 4.8636980815874936e-05, + "loss": 5.3164, + "step": 17756 + }, + { + "epoch": 0.105605909220668, + "grad_norm": 2.312357187271118, + "learning_rate": 4.8636828685980384e-05, + "loss": 5.3018, + "step": 17757 + }, + { + "epoch": 0.105611856503949, + "grad_norm": 2.1815388202667236, + "learning_rate": 4.863667654783447e-05, + "loss": 5.1509, + "step": 17758 + }, + { + "epoch": 0.10561780378723, + "grad_norm": 1.7500512599945068, + "learning_rate": 4.8636524401437225e-05, + "loss": 5.492, + "step": 17759 + }, + { + "epoch": 0.10562375107051099, + "grad_norm": 1.6850415468215942, + "learning_rate": 4.863637224678872e-05, + "loss": 5.5086, + "step": 17760 + }, + { + "epoch": 0.10562969835379199, + "grad_norm": 1.7222185134887695, + "learning_rate": 4.8636220083889e-05, + "loss": 5.4139, + "step": 17761 + }, + { + "epoch": 0.10563564563707299, + "grad_norm": 1.627914309501648, + "learning_rate": 4.8636067912738116e-05, + "loss": 5.5763, + "step": 17762 + }, + { + "epoch": 0.10564159292035398, + "grad_norm": 1.5884100198745728, + "learning_rate": 4.863591573333613e-05, + "loss": 5.544, + "step": 17763 + }, + { + "epoch": 0.10564754020363498, + "grad_norm": 1.4660178422927856, + "learning_rate": 4.8635763545683085e-05, + "loss": 5.4913, + "step": 17764 + }, + { + "epoch": 0.10565348748691597, + "grad_norm": 1.5240764617919922, + "learning_rate": 4.863561134977904e-05, + "loss": 5.4757, + "step": 17765 + }, + { + "epoch": 0.10565943477019697, + "grad_norm": 1.3686332702636719, + "learning_rate": 4.863545914562406e-05, + "loss": 5.4934, + "step": 17766 + }, + { + "epoch": 0.10566538205347797, + "grad_norm": 1.5429164171218872, + "learning_rate": 4.863530693321817e-05, + "loss": 5.3654, + "step": 17767 + }, + { + "epoch": 0.10567132933675896, + "grad_norm": 1.4237322807312012, + "learning_rate": 4.863515471256145e-05, + "loss": 5.4128, + "step": 17768 + }, + { + "epoch": 0.10567727662003996, + "grad_norm": 1.6438677310943604, + "learning_rate": 4.863500248365393e-05, + "loss": 5.3129, + "step": 17769 + }, + { + "epoch": 0.10568322390332097, + "grad_norm": 1.9208921194076538, + "learning_rate": 4.8634850246495675e-05, + "loss": 5.4889, + "step": 17770 + }, + { + "epoch": 0.10568917118660195, + "grad_norm": 1.6967288255691528, + "learning_rate": 4.863469800108675e-05, + "loss": 5.5301, + "step": 17771 + }, + { + "epoch": 0.10569511846988296, + "grad_norm": 1.5820802450180054, + "learning_rate": 4.8634545747427185e-05, + "loss": 5.4126, + "step": 17772 + }, + { + "epoch": 0.10570106575316396, + "grad_norm": 1.8280025720596313, + "learning_rate": 4.8634393485517046e-05, + "loss": 6.1201, + "step": 17773 + }, + { + "epoch": 0.10570701303644495, + "grad_norm": 1.809193730354309, + "learning_rate": 4.8634241215356394e-05, + "loss": 5.4123, + "step": 17774 + }, + { + "epoch": 0.10571296031972595, + "grad_norm": 1.596528172492981, + "learning_rate": 4.863408893694527e-05, + "loss": 5.6865, + "step": 17775 + }, + { + "epoch": 0.10571890760300695, + "grad_norm": 1.7726397514343262, + "learning_rate": 4.8633936650283715e-05, + "loss": 5.7298, + "step": 17776 + }, + { + "epoch": 0.10572485488628794, + "grad_norm": 1.5804529190063477, + "learning_rate": 4.863378435537182e-05, + "loss": 5.6051, + "step": 17777 + }, + { + "epoch": 0.10573080216956894, + "grad_norm": 1.5244919061660767, + "learning_rate": 4.8633632052209595e-05, + "loss": 5.7402, + "step": 17778 + }, + { + "epoch": 0.10573674945284994, + "grad_norm": 1.5003318786621094, + "learning_rate": 4.8633479740797117e-05, + "loss": 5.6978, + "step": 17779 + }, + { + "epoch": 0.10574269673613093, + "grad_norm": 1.7325289249420166, + "learning_rate": 4.863332742113444e-05, + "loss": 5.8616, + "step": 17780 + }, + { + "epoch": 0.10574864401941193, + "grad_norm": 1.8214267492294312, + "learning_rate": 4.863317509322161e-05, + "loss": 5.9213, + "step": 17781 + }, + { + "epoch": 0.10575459130269294, + "grad_norm": 1.7067787647247314, + "learning_rate": 4.863302275705869e-05, + "loss": 5.5518, + "step": 17782 + }, + { + "epoch": 0.10576053858597392, + "grad_norm": 1.8018234968185425, + "learning_rate": 4.863287041264571e-05, + "loss": 5.5241, + "step": 17783 + }, + { + "epoch": 0.10576648586925493, + "grad_norm": 1.7645032405853271, + "learning_rate": 4.863271805998275e-05, + "loss": 5.6471, + "step": 17784 + }, + { + "epoch": 0.10577243315253593, + "grad_norm": 1.6891655921936035, + "learning_rate": 4.8632565699069854e-05, + "loss": 5.9138, + "step": 17785 + }, + { + "epoch": 0.10577838043581692, + "grad_norm": 1.6546204090118408, + "learning_rate": 4.8632413329907076e-05, + "loss": 5.8511, + "step": 17786 + }, + { + "epoch": 0.10578432771909792, + "grad_norm": 1.864680528640747, + "learning_rate": 4.863226095249446e-05, + "loss": 5.7665, + "step": 17787 + }, + { + "epoch": 0.10579027500237892, + "grad_norm": 1.9052486419677734, + "learning_rate": 4.863210856683207e-05, + "loss": 5.6528, + "step": 17788 + }, + { + "epoch": 0.10579622228565991, + "grad_norm": 2.212982416152954, + "learning_rate": 4.8631956172919944e-05, + "loss": 5.2294, + "step": 17789 + }, + { + "epoch": 0.10580216956894091, + "grad_norm": 2.0703213214874268, + "learning_rate": 4.863180377075816e-05, + "loss": 4.9963, + "step": 17790 + }, + { + "epoch": 0.10580811685222191, + "grad_norm": 2.1718661785125732, + "learning_rate": 4.863165136034675e-05, + "loss": 5.1047, + "step": 17791 + }, + { + "epoch": 0.1058140641355029, + "grad_norm": 2.2078070640563965, + "learning_rate": 4.8631498941685774e-05, + "loss": 5.2682, + "step": 17792 + }, + { + "epoch": 0.1058200114187839, + "grad_norm": 2.187614917755127, + "learning_rate": 4.863134651477529e-05, + "loss": 4.9008, + "step": 17793 + }, + { + "epoch": 0.10582595870206489, + "grad_norm": 1.7202839851379395, + "learning_rate": 4.863119407961535e-05, + "loss": 5.1006, + "step": 17794 + }, + { + "epoch": 0.10583190598534589, + "grad_norm": 2.3109450340270996, + "learning_rate": 4.8631041636206e-05, + "loss": 4.8489, + "step": 17795 + }, + { + "epoch": 0.1058378532686269, + "grad_norm": 2.2688632011413574, + "learning_rate": 4.8630889184547295e-05, + "loss": 4.953, + "step": 17796 + }, + { + "epoch": 0.10584380055190788, + "grad_norm": 2.0636980533599854, + "learning_rate": 4.863073672463929e-05, + "loss": 4.9537, + "step": 17797 + }, + { + "epoch": 0.10584974783518888, + "grad_norm": 1.9752720594406128, + "learning_rate": 4.863058425648205e-05, + "loss": 4.8646, + "step": 17798 + }, + { + "epoch": 0.10585569511846989, + "grad_norm": 1.9784966707229614, + "learning_rate": 4.86304317800756e-05, + "loss": 5.1245, + "step": 17799 + }, + { + "epoch": 0.10586164240175087, + "grad_norm": 1.812218427658081, + "learning_rate": 4.863027929542002e-05, + "loss": 5.4367, + "step": 17800 + }, + { + "epoch": 0.10586758968503188, + "grad_norm": 1.8048956394195557, + "learning_rate": 4.863012680251536e-05, + "loss": 5.6052, + "step": 17801 + }, + { + "epoch": 0.10587353696831288, + "grad_norm": 1.9246432781219482, + "learning_rate": 4.862997430136166e-05, + "loss": 5.9335, + "step": 17802 + }, + { + "epoch": 0.10587948425159387, + "grad_norm": 1.5138533115386963, + "learning_rate": 4.862982179195897e-05, + "loss": 5.8785, + "step": 17803 + }, + { + "epoch": 0.10588543153487487, + "grad_norm": 1.4948742389678955, + "learning_rate": 4.862966927430737e-05, + "loss": 5.7478, + "step": 17804 + }, + { + "epoch": 0.10589137881815587, + "grad_norm": 1.4670746326446533, + "learning_rate": 4.862951674840689e-05, + "loss": 5.7397, + "step": 17805 + }, + { + "epoch": 0.10589732610143686, + "grad_norm": 1.4234925508499146, + "learning_rate": 4.862936421425759e-05, + "loss": 5.9919, + "step": 17806 + }, + { + "epoch": 0.10590327338471786, + "grad_norm": 1.8313277959823608, + "learning_rate": 4.862921167185953e-05, + "loss": 5.7289, + "step": 17807 + }, + { + "epoch": 0.10590922066799886, + "grad_norm": 1.7373311519622803, + "learning_rate": 4.8629059121212745e-05, + "loss": 5.7652, + "step": 17808 + }, + { + "epoch": 0.10591516795127985, + "grad_norm": 1.7706129550933838, + "learning_rate": 4.86289065623173e-05, + "loss": 5.4623, + "step": 17809 + }, + { + "epoch": 0.10592111523456085, + "grad_norm": 1.7332470417022705, + "learning_rate": 4.862875399517325e-05, + "loss": 5.5546, + "step": 17810 + }, + { + "epoch": 0.10592706251784186, + "grad_norm": 1.7493473291397095, + "learning_rate": 4.862860141978065e-05, + "loss": 5.2762, + "step": 17811 + }, + { + "epoch": 0.10593300980112284, + "grad_norm": 1.8064602613449097, + "learning_rate": 4.862844883613955e-05, + "loss": 5.2969, + "step": 17812 + }, + { + "epoch": 0.10593895708440385, + "grad_norm": 1.6318674087524414, + "learning_rate": 4.862829624425e-05, + "loss": 5.3229, + "step": 17813 + }, + { + "epoch": 0.10594490436768485, + "grad_norm": 1.7438777685165405, + "learning_rate": 4.8628143644112056e-05, + "loss": 5.3167, + "step": 17814 + }, + { + "epoch": 0.10595085165096584, + "grad_norm": 1.8095386028289795, + "learning_rate": 4.8627991035725774e-05, + "loss": 5.2744, + "step": 17815 + }, + { + "epoch": 0.10595679893424684, + "grad_norm": 1.8095691204071045, + "learning_rate": 4.86278384190912e-05, + "loss": 5.5105, + "step": 17816 + }, + { + "epoch": 0.10596274621752784, + "grad_norm": 1.858776569366455, + "learning_rate": 4.862768579420839e-05, + "loss": 5.4338, + "step": 17817 + }, + { + "epoch": 0.10596869350080883, + "grad_norm": 1.8224806785583496, + "learning_rate": 4.86275331610774e-05, + "loss": 5.6273, + "step": 17818 + }, + { + "epoch": 0.10597464078408983, + "grad_norm": 1.6850696802139282, + "learning_rate": 4.8627380519698284e-05, + "loss": 5.9963, + "step": 17819 + }, + { + "epoch": 0.10598058806737083, + "grad_norm": 1.4804600477218628, + "learning_rate": 4.86272278700711e-05, + "loss": 5.726, + "step": 17820 + }, + { + "epoch": 0.10598653535065182, + "grad_norm": 1.721027135848999, + "learning_rate": 4.862707521219589e-05, + "loss": 5.191, + "step": 17821 + }, + { + "epoch": 0.10599248263393282, + "grad_norm": 1.8109691143035889, + "learning_rate": 4.862692254607271e-05, + "loss": 4.926, + "step": 17822 + }, + { + "epoch": 0.10599842991721381, + "grad_norm": 1.7531434297561646, + "learning_rate": 4.862676987170162e-05, + "loss": 5.0376, + "step": 17823 + }, + { + "epoch": 0.10600437720049481, + "grad_norm": 1.6847648620605469, + "learning_rate": 4.8626617189082656e-05, + "loss": 5.0376, + "step": 17824 + }, + { + "epoch": 0.10601032448377581, + "grad_norm": 1.6512411832809448, + "learning_rate": 4.86264644982159e-05, + "loss": 5.087, + "step": 17825 + }, + { + "epoch": 0.1060162717670568, + "grad_norm": 1.6410924196243286, + "learning_rate": 4.8626311799101375e-05, + "loss": 5.6917, + "step": 17826 + }, + { + "epoch": 0.1060222190503378, + "grad_norm": 2.1565957069396973, + "learning_rate": 4.862615909173916e-05, + "loss": 4.619, + "step": 17827 + }, + { + "epoch": 0.1060281663336188, + "grad_norm": 1.8235310316085815, + "learning_rate": 4.86260063761293e-05, + "loss": 5.1155, + "step": 17828 + }, + { + "epoch": 0.1060341136168998, + "grad_norm": 1.7710633277893066, + "learning_rate": 4.862585365227184e-05, + "loss": 4.7845, + "step": 17829 + }, + { + "epoch": 0.1060400609001808, + "grad_norm": 2.174832820892334, + "learning_rate": 4.862570092016683e-05, + "loss": 4.6384, + "step": 17830 + }, + { + "epoch": 0.1060460081834618, + "grad_norm": 2.359682321548462, + "learning_rate": 4.862554817981434e-05, + "loss": 4.2191, + "step": 17831 + }, + { + "epoch": 0.10605195546674279, + "grad_norm": 2.4251585006713867, + "learning_rate": 4.8625395431214414e-05, + "loss": 4.0982, + "step": 17832 + }, + { + "epoch": 0.10605790275002379, + "grad_norm": 2.543009042739868, + "learning_rate": 4.86252426743671e-05, + "loss": 4.0773, + "step": 17833 + }, + { + "epoch": 0.10606385003330479, + "grad_norm": 2.6991419792175293, + "learning_rate": 4.862508990927247e-05, + "loss": 4.0209, + "step": 17834 + }, + { + "epoch": 0.10606979731658578, + "grad_norm": 2.354445695877075, + "learning_rate": 4.862493713593056e-05, + "loss": 3.9223, + "step": 17835 + }, + { + "epoch": 0.10607574459986678, + "grad_norm": 2.5119223594665527, + "learning_rate": 4.8624784354341426e-05, + "loss": 3.9006, + "step": 17836 + }, + { + "epoch": 0.10608169188314778, + "grad_norm": 2.717792272567749, + "learning_rate": 4.862463156450513e-05, + "loss": 4.3295, + "step": 17837 + }, + { + "epoch": 0.10608763916642877, + "grad_norm": 3.1779162883758545, + "learning_rate": 4.862447876642171e-05, + "loss": 4.3483, + "step": 17838 + }, + { + "epoch": 0.10609358644970977, + "grad_norm": 2.272994041442871, + "learning_rate": 4.8624325960091235e-05, + "loss": 4.2826, + "step": 17839 + }, + { + "epoch": 0.10609953373299078, + "grad_norm": 2.4689860343933105, + "learning_rate": 4.862417314551375e-05, + "loss": 4.9144, + "step": 17840 + }, + { + "epoch": 0.10610548101627176, + "grad_norm": 1.8101458549499512, + "learning_rate": 4.862402032268931e-05, + "loss": 5.9325, + "step": 17841 + }, + { + "epoch": 0.10611142829955277, + "grad_norm": 1.9994734525680542, + "learning_rate": 4.862386749161797e-05, + "loss": 5.5438, + "step": 17842 + }, + { + "epoch": 0.10611737558283377, + "grad_norm": 2.5475401878356934, + "learning_rate": 4.8623714652299786e-05, + "loss": 5.2262, + "step": 17843 + }, + { + "epoch": 0.10612332286611476, + "grad_norm": 2.286040782928467, + "learning_rate": 4.86235618047348e-05, + "loss": 5.065, + "step": 17844 + }, + { + "epoch": 0.10612927014939576, + "grad_norm": 1.788761854171753, + "learning_rate": 4.862340894892308e-05, + "loss": 5.5053, + "step": 17845 + }, + { + "epoch": 0.10613521743267676, + "grad_norm": 2.2951841354370117, + "learning_rate": 4.8623256084864663e-05, + "loss": 5.1262, + "step": 17846 + }, + { + "epoch": 0.10614116471595775, + "grad_norm": 1.962814211845398, + "learning_rate": 4.862310321255962e-05, + "loss": 5.8084, + "step": 17847 + }, + { + "epoch": 0.10614711199923875, + "grad_norm": 1.7888414859771729, + "learning_rate": 4.862295033200799e-05, + "loss": 5.2409, + "step": 17848 + }, + { + "epoch": 0.10615305928251975, + "grad_norm": 1.7108670473098755, + "learning_rate": 4.862279744320983e-05, + "loss": 5.6138, + "step": 17849 + }, + { + "epoch": 0.10615900656580074, + "grad_norm": 1.7636443376541138, + "learning_rate": 4.8622644546165196e-05, + "loss": 5.5664, + "step": 17850 + }, + { + "epoch": 0.10616495384908174, + "grad_norm": 1.7193186283111572, + "learning_rate": 4.8622491640874147e-05, + "loss": 5.7852, + "step": 17851 + }, + { + "epoch": 0.10617090113236273, + "grad_norm": 1.817215919494629, + "learning_rate": 4.8622338727336723e-05, + "loss": 5.5478, + "step": 17852 + }, + { + "epoch": 0.10617684841564373, + "grad_norm": 1.547817349433899, + "learning_rate": 4.8622185805552994e-05, + "loss": 5.5249, + "step": 17853 + }, + { + "epoch": 0.10618279569892473, + "grad_norm": 1.577528953552246, + "learning_rate": 4.862203287552299e-05, + "loss": 5.7268, + "step": 17854 + }, + { + "epoch": 0.10618874298220572, + "grad_norm": 1.4524853229522705, + "learning_rate": 4.862187993724679e-05, + "loss": 5.8539, + "step": 17855 + }, + { + "epoch": 0.10619469026548672, + "grad_norm": 1.6361198425292969, + "learning_rate": 4.8621726990724437e-05, + "loss": 5.0815, + "step": 17856 + }, + { + "epoch": 0.10620063754876773, + "grad_norm": 1.65043044090271, + "learning_rate": 4.862157403595598e-05, + "loss": 5.1938, + "step": 17857 + }, + { + "epoch": 0.10620658483204871, + "grad_norm": 1.6236746311187744, + "learning_rate": 4.8621421072941476e-05, + "loss": 5.5602, + "step": 17858 + }, + { + "epoch": 0.10621253211532972, + "grad_norm": 1.4648228883743286, + "learning_rate": 4.862126810168097e-05, + "loss": 5.3728, + "step": 17859 + }, + { + "epoch": 0.10621847939861072, + "grad_norm": 1.4803123474121094, + "learning_rate": 4.862111512217453e-05, + "loss": 5.58, + "step": 17860 + }, + { + "epoch": 0.1062244266818917, + "grad_norm": 1.320387840270996, + "learning_rate": 4.862096213442221e-05, + "loss": 5.0337, + "step": 17861 + }, + { + "epoch": 0.10623037396517271, + "grad_norm": 1.8309158086776733, + "learning_rate": 4.862080913842405e-05, + "loss": 4.3603, + "step": 17862 + }, + { + "epoch": 0.10623632124845371, + "grad_norm": 1.79231595993042, + "learning_rate": 4.86206561341801e-05, + "loss": 4.401, + "step": 17863 + }, + { + "epoch": 0.1062422685317347, + "grad_norm": 1.7894480228424072, + "learning_rate": 4.862050312169043e-05, + "loss": 4.4592, + "step": 17864 + }, + { + "epoch": 0.1062482158150157, + "grad_norm": 1.8271396160125732, + "learning_rate": 4.8620350100955095e-05, + "loss": 4.2442, + "step": 17865 + }, + { + "epoch": 0.1062541630982967, + "grad_norm": 2.03336238861084, + "learning_rate": 4.862019707197413e-05, + "loss": 4.6245, + "step": 17866 + }, + { + "epoch": 0.10626011038157769, + "grad_norm": 1.8034088611602783, + "learning_rate": 4.86200440347476e-05, + "loss": 4.5798, + "step": 17867 + }, + { + "epoch": 0.10626605766485869, + "grad_norm": 1.366013765335083, + "learning_rate": 4.861989098927556e-05, + "loss": 5.2409, + "step": 17868 + }, + { + "epoch": 0.1062720049481397, + "grad_norm": 1.603281855583191, + "learning_rate": 4.8619737935558054e-05, + "loss": 5.6699, + "step": 17869 + }, + { + "epoch": 0.10627795223142068, + "grad_norm": 1.6720329523086548, + "learning_rate": 4.861958487359515e-05, + "loss": 5.2162, + "step": 17870 + }, + { + "epoch": 0.10628389951470169, + "grad_norm": 2.5577762126922607, + "learning_rate": 4.861943180338689e-05, + "loss": 3.9116, + "step": 17871 + }, + { + "epoch": 0.10628984679798269, + "grad_norm": 2.6489310264587402, + "learning_rate": 4.861927872493332e-05, + "loss": 4.232, + "step": 17872 + }, + { + "epoch": 0.10629579408126368, + "grad_norm": 2.481381893157959, + "learning_rate": 4.861912563823451e-05, + "loss": 4.374, + "step": 17873 + }, + { + "epoch": 0.10630174136454468, + "grad_norm": 2.444721221923828, + "learning_rate": 4.861897254329052e-05, + "loss": 4.504, + "step": 17874 + }, + { + "epoch": 0.10630768864782568, + "grad_norm": 2.529085636138916, + "learning_rate": 4.8618819440101373e-05, + "loss": 4.1305, + "step": 17875 + }, + { + "epoch": 0.10631363593110667, + "grad_norm": 3.966379404067993, + "learning_rate": 4.861866632866715e-05, + "loss": 3.9104, + "step": 17876 + }, + { + "epoch": 0.10631958321438767, + "grad_norm": 2.408405065536499, + "learning_rate": 4.8618513208987895e-05, + "loss": 3.8762, + "step": 17877 + }, + { + "epoch": 0.10632553049766867, + "grad_norm": 2.41780686378479, + "learning_rate": 4.8618360081063654e-05, + "loss": 3.7665, + "step": 17878 + }, + { + "epoch": 0.10633147778094966, + "grad_norm": 2.60262393951416, + "learning_rate": 4.861820694489448e-05, + "loss": 4.067, + "step": 17879 + }, + { + "epoch": 0.10633742506423066, + "grad_norm": 2.624938726425171, + "learning_rate": 4.8618053800480456e-05, + "loss": 4.5653, + "step": 17880 + }, + { + "epoch": 0.10634337234751165, + "grad_norm": 2.783202886581421, + "learning_rate": 4.86179006478216e-05, + "loss": 4.4091, + "step": 17881 + }, + { + "epoch": 0.10634931963079265, + "grad_norm": 2.8269615173339844, + "learning_rate": 4.861774748691798e-05, + "loss": 3.949, + "step": 17882 + }, + { + "epoch": 0.10635526691407365, + "grad_norm": 2.82108998298645, + "learning_rate": 4.861759431776965e-05, + "loss": 3.8479, + "step": 17883 + }, + { + "epoch": 0.10636121419735464, + "grad_norm": 2.8543620109558105, + "learning_rate": 4.861744114037666e-05, + "loss": 3.4358, + "step": 17884 + }, + { + "epoch": 0.10636716148063564, + "grad_norm": 2.6492035388946533, + "learning_rate": 4.861728795473907e-05, + "loss": 3.6298, + "step": 17885 + }, + { + "epoch": 0.10637310876391665, + "grad_norm": 2.834181785583496, + "learning_rate": 4.861713476085693e-05, + "loss": 3.4125, + "step": 17886 + }, + { + "epoch": 0.10637905604719763, + "grad_norm": 3.447075605392456, + "learning_rate": 4.861698155873028e-05, + "loss": 3.5416, + "step": 17887 + }, + { + "epoch": 0.10638500333047864, + "grad_norm": 3.6009531021118164, + "learning_rate": 4.86168283483592e-05, + "loss": 4.1912, + "step": 17888 + }, + { + "epoch": 0.10639095061375964, + "grad_norm": 4.086645126342773, + "learning_rate": 4.861667512974372e-05, + "loss": 4.3999, + "step": 17889 + }, + { + "epoch": 0.10639689789704063, + "grad_norm": 3.673405408859253, + "learning_rate": 4.86165219028839e-05, + "loss": 4.3731, + "step": 17890 + }, + { + "epoch": 0.10640284518032163, + "grad_norm": 2.2896664142608643, + "learning_rate": 4.861636866777981e-05, + "loss": 5.5963, + "step": 17891 + }, + { + "epoch": 0.10640879246360263, + "grad_norm": 2.0481069087982178, + "learning_rate": 4.861621542443148e-05, + "loss": 5.7909, + "step": 17892 + }, + { + "epoch": 0.10641473974688362, + "grad_norm": 1.9108741283416748, + "learning_rate": 4.861606217283897e-05, + "loss": 5.3044, + "step": 17893 + }, + { + "epoch": 0.10642068703016462, + "grad_norm": 1.7842040061950684, + "learning_rate": 4.861590891300235e-05, + "loss": 5.3071, + "step": 17894 + }, + { + "epoch": 0.10642663431344562, + "grad_norm": 1.854777455329895, + "learning_rate": 4.861575564492164e-05, + "loss": 5.386, + "step": 17895 + }, + { + "epoch": 0.10643258159672661, + "grad_norm": 1.7286109924316406, + "learning_rate": 4.861560236859693e-05, + "loss": 5.5609, + "step": 17896 + }, + { + "epoch": 0.10643852888000761, + "grad_norm": 1.709408164024353, + "learning_rate": 4.861544908402825e-05, + "loss": 5.6772, + "step": 17897 + }, + { + "epoch": 0.10644447616328861, + "grad_norm": 1.9251428842544556, + "learning_rate": 4.861529579121567e-05, + "loss": 5.6114, + "step": 17898 + }, + { + "epoch": 0.1064504234465696, + "grad_norm": 1.6568808555603027, + "learning_rate": 4.8615142490159226e-05, + "loss": 5.4648, + "step": 17899 + }, + { + "epoch": 0.1064563707298506, + "grad_norm": 1.7793960571289062, + "learning_rate": 4.861498918085898e-05, + "loss": 5.4987, + "step": 17900 + }, + { + "epoch": 0.10646231801313161, + "grad_norm": 1.9044899940490723, + "learning_rate": 4.861483586331499e-05, + "loss": 5.7757, + "step": 17901 + }, + { + "epoch": 0.1064682652964126, + "grad_norm": 2.215278387069702, + "learning_rate": 4.86146825375273e-05, + "loss": 6.2767, + "step": 17902 + }, + { + "epoch": 0.1064742125796936, + "grad_norm": 1.8699604272842407, + "learning_rate": 4.861452920349597e-05, + "loss": 6.2987, + "step": 17903 + }, + { + "epoch": 0.1064801598629746, + "grad_norm": 1.634887456893921, + "learning_rate": 4.861437586122105e-05, + "loss": 6.2596, + "step": 17904 + }, + { + "epoch": 0.10648610714625559, + "grad_norm": 1.54149329662323, + "learning_rate": 4.86142225107026e-05, + "loss": 6.1988, + "step": 17905 + }, + { + "epoch": 0.10649205442953659, + "grad_norm": 1.5954409837722778, + "learning_rate": 4.861406915194067e-05, + "loss": 6.1052, + "step": 17906 + }, + { + "epoch": 0.10649800171281759, + "grad_norm": 1.8810808658599854, + "learning_rate": 4.86139157849353e-05, + "loss": 6.0318, + "step": 17907 + }, + { + "epoch": 0.10650394899609858, + "grad_norm": 1.4983458518981934, + "learning_rate": 4.861376240968656e-05, + "loss": 5.8614, + "step": 17908 + }, + { + "epoch": 0.10650989627937958, + "grad_norm": 1.5446088314056396, + "learning_rate": 4.8613609026194504e-05, + "loss": 5.623, + "step": 17909 + }, + { + "epoch": 0.10651584356266057, + "grad_norm": 1.7121042013168335, + "learning_rate": 4.861345563445918e-05, + "loss": 4.9258, + "step": 17910 + }, + { + "epoch": 0.10652179084594157, + "grad_norm": 2.002478837966919, + "learning_rate": 4.861330223448065e-05, + "loss": 5.285, + "step": 17911 + }, + { + "epoch": 0.10652773812922257, + "grad_norm": 1.7703490257263184, + "learning_rate": 4.8613148826258944e-05, + "loss": 5.2279, + "step": 17912 + }, + { + "epoch": 0.10653368541250356, + "grad_norm": 1.7763222455978394, + "learning_rate": 4.861299540979415e-05, + "loss": 4.8737, + "step": 17913 + }, + { + "epoch": 0.10653963269578456, + "grad_norm": 1.5921473503112793, + "learning_rate": 4.8612841985086296e-05, + "loss": 5.3756, + "step": 17914 + }, + { + "epoch": 0.10654557997906557, + "grad_norm": 1.810085654258728, + "learning_rate": 4.8612688552135435e-05, + "loss": 5.3784, + "step": 17915 + }, + { + "epoch": 0.10655152726234655, + "grad_norm": 2.2289364337921143, + "learning_rate": 4.8612535110941636e-05, + "loss": 5.0258, + "step": 17916 + }, + { + "epoch": 0.10655747454562756, + "grad_norm": 1.9337642192840576, + "learning_rate": 4.8612381661504946e-05, + "loss": 4.9943, + "step": 17917 + }, + { + "epoch": 0.10656342182890856, + "grad_norm": 1.5772477388381958, + "learning_rate": 4.861222820382542e-05, + "loss": 5.1188, + "step": 17918 + }, + { + "epoch": 0.10656936911218955, + "grad_norm": 1.6176950931549072, + "learning_rate": 4.8612074737903097e-05, + "loss": 5.0973, + "step": 17919 + }, + { + "epoch": 0.10657531639547055, + "grad_norm": 1.7878233194351196, + "learning_rate": 4.8611921263738045e-05, + "loss": 5.0342, + "step": 17920 + }, + { + "epoch": 0.10658126367875155, + "grad_norm": 1.7473089694976807, + "learning_rate": 4.861176778133033e-05, + "loss": 5.2844, + "step": 17921 + }, + { + "epoch": 0.10658721096203254, + "grad_norm": 2.472464084625244, + "learning_rate": 4.8611614290679975e-05, + "loss": 4.9654, + "step": 17922 + }, + { + "epoch": 0.10659315824531354, + "grad_norm": 2.5256218910217285, + "learning_rate": 4.861146079178706e-05, + "loss": 4.7885, + "step": 17923 + }, + { + "epoch": 0.10659910552859454, + "grad_norm": 2.2665674686431885, + "learning_rate": 4.861130728465162e-05, + "loss": 5.0838, + "step": 17924 + }, + { + "epoch": 0.10660505281187553, + "grad_norm": 1.6795161962509155, + "learning_rate": 4.861115376927372e-05, + "loss": 5.3174, + "step": 17925 + }, + { + "epoch": 0.10661100009515653, + "grad_norm": 1.5786751508712769, + "learning_rate": 4.8611000245653405e-05, + "loss": 5.1831, + "step": 17926 + }, + { + "epoch": 0.10661694737843753, + "grad_norm": 2.0238442420959473, + "learning_rate": 4.861084671379074e-05, + "loss": 5.7967, + "step": 17927 + }, + { + "epoch": 0.10662289466171852, + "grad_norm": 1.5760328769683838, + "learning_rate": 4.861069317368577e-05, + "loss": 5.5692, + "step": 17928 + }, + { + "epoch": 0.10662884194499953, + "grad_norm": 1.7190479040145874, + "learning_rate": 4.861053962533855e-05, + "loss": 5.4248, + "step": 17929 + }, + { + "epoch": 0.10663478922828053, + "grad_norm": 1.987444519996643, + "learning_rate": 4.861038606874914e-05, + "loss": 5.3845, + "step": 17930 + }, + { + "epoch": 0.10664073651156152, + "grad_norm": 2.3603975772857666, + "learning_rate": 4.8610232503917585e-05, + "loss": 4.9948, + "step": 17931 + }, + { + "epoch": 0.10664668379484252, + "grad_norm": 2.560696601867676, + "learning_rate": 4.861007893084394e-05, + "loss": 4.797, + "step": 17932 + }, + { + "epoch": 0.10665263107812352, + "grad_norm": 2.3494272232055664, + "learning_rate": 4.860992534952826e-05, + "loss": 4.81, + "step": 17933 + }, + { + "epoch": 0.10665857836140451, + "grad_norm": 2.1878998279571533, + "learning_rate": 4.86097717599706e-05, + "loss": 4.7863, + "step": 17934 + }, + { + "epoch": 0.10666452564468551, + "grad_norm": 2.123789072036743, + "learning_rate": 4.8609618162171016e-05, + "loss": 4.7846, + "step": 17935 + }, + { + "epoch": 0.10667047292796651, + "grad_norm": 2.307370662689209, + "learning_rate": 4.8609464556129555e-05, + "loss": 4.3901, + "step": 17936 + }, + { + "epoch": 0.1066764202112475, + "grad_norm": 1.8189514875411987, + "learning_rate": 4.8609310941846274e-05, + "loss": 5.2722, + "step": 17937 + }, + { + "epoch": 0.1066823674945285, + "grad_norm": 1.4699981212615967, + "learning_rate": 4.860915731932123e-05, + "loss": 5.7501, + "step": 17938 + }, + { + "epoch": 0.10668831477780949, + "grad_norm": 1.5624393224716187, + "learning_rate": 4.860900368855447e-05, + "loss": 5.6963, + "step": 17939 + }, + { + "epoch": 0.10669426206109049, + "grad_norm": 1.8463138341903687, + "learning_rate": 4.860885004954605e-05, + "loss": 5.3627, + "step": 17940 + }, + { + "epoch": 0.1067002093443715, + "grad_norm": 1.7627042531967163, + "learning_rate": 4.8608696402296025e-05, + "loss": 5.6548, + "step": 17941 + }, + { + "epoch": 0.10670615662765248, + "grad_norm": 1.631505012512207, + "learning_rate": 4.860854274680444e-05, + "loss": 5.7926, + "step": 17942 + }, + { + "epoch": 0.10671210391093348, + "grad_norm": 1.4491498470306396, + "learning_rate": 4.860838908307137e-05, + "loss": 5.5395, + "step": 17943 + }, + { + "epoch": 0.10671805119421449, + "grad_norm": 1.6210049390792847, + "learning_rate": 4.8608235411096845e-05, + "loss": 5.2768, + "step": 17944 + }, + { + "epoch": 0.10672399847749547, + "grad_norm": 1.4522534608840942, + "learning_rate": 4.860808173088094e-05, + "loss": 5.7723, + "step": 17945 + }, + { + "epoch": 0.10672994576077648, + "grad_norm": 2.0779013633728027, + "learning_rate": 4.860792804242369e-05, + "loss": 5.4679, + "step": 17946 + }, + { + "epoch": 0.10673589304405748, + "grad_norm": 2.248556137084961, + "learning_rate": 4.860777434572515e-05, + "loss": 5.5089, + "step": 17947 + }, + { + "epoch": 0.10674184032733847, + "grad_norm": 2.2192306518554688, + "learning_rate": 4.86076206407854e-05, + "loss": 5.4098, + "step": 17948 + }, + { + "epoch": 0.10674778761061947, + "grad_norm": 1.7523053884506226, + "learning_rate": 4.8607466927604455e-05, + "loss": 5.3223, + "step": 17949 + }, + { + "epoch": 0.10675373489390047, + "grad_norm": 1.8636107444763184, + "learning_rate": 4.8607313206182395e-05, + "loss": 5.339, + "step": 17950 + }, + { + "epoch": 0.10675968217718146, + "grad_norm": 1.9067093133926392, + "learning_rate": 4.860715947651926e-05, + "loss": 5.3779, + "step": 17951 + }, + { + "epoch": 0.10676562946046246, + "grad_norm": 1.850948452949524, + "learning_rate": 4.860700573861512e-05, + "loss": 5.3474, + "step": 17952 + }, + { + "epoch": 0.10677157674374346, + "grad_norm": 2.144895076751709, + "learning_rate": 4.8606851992470005e-05, + "loss": 5.3089, + "step": 17953 + }, + { + "epoch": 0.10677752402702445, + "grad_norm": 2.054420232772827, + "learning_rate": 4.860669823808399e-05, + "loss": 5.3653, + "step": 17954 + }, + { + "epoch": 0.10678347131030545, + "grad_norm": 1.94870126247406, + "learning_rate": 4.860654447545711e-05, + "loss": 5.2514, + "step": 17955 + }, + { + "epoch": 0.10678941859358645, + "grad_norm": 1.8006596565246582, + "learning_rate": 4.860639070458945e-05, + "loss": 5.2357, + "step": 17956 + }, + { + "epoch": 0.10679536587686744, + "grad_norm": 2.309035301208496, + "learning_rate": 4.860623692548103e-05, + "loss": 5.2681, + "step": 17957 + }, + { + "epoch": 0.10680131316014845, + "grad_norm": 2.402949571609497, + "learning_rate": 4.860608313813192e-05, + "loss": 5.549, + "step": 17958 + }, + { + "epoch": 0.10680726044342945, + "grad_norm": 1.724307894706726, + "learning_rate": 4.8605929342542164e-05, + "loss": 5.5283, + "step": 17959 + }, + { + "epoch": 0.10681320772671044, + "grad_norm": 1.8566054105758667, + "learning_rate": 4.860577553871183e-05, + "loss": 5.834, + "step": 17960 + }, + { + "epoch": 0.10681915500999144, + "grad_norm": 1.8882628679275513, + "learning_rate": 4.860562172664096e-05, + "loss": 5.7954, + "step": 17961 + }, + { + "epoch": 0.10682510229327244, + "grad_norm": 1.694075345993042, + "learning_rate": 4.860546790632961e-05, + "loss": 5.7573, + "step": 17962 + }, + { + "epoch": 0.10683104957655343, + "grad_norm": 1.8312102556228638, + "learning_rate": 4.860531407777783e-05, + "loss": 5.4479, + "step": 17963 + }, + { + "epoch": 0.10683699685983443, + "grad_norm": 1.6124730110168457, + "learning_rate": 4.860516024098569e-05, + "loss": 5.5356, + "step": 17964 + }, + { + "epoch": 0.10684294414311543, + "grad_norm": 2.3505187034606934, + "learning_rate": 4.8605006395953225e-05, + "loss": 5.6543, + "step": 17965 + }, + { + "epoch": 0.10684889142639642, + "grad_norm": 2.69331431388855, + "learning_rate": 4.86048525426805e-05, + "loss": 5.5359, + "step": 17966 + }, + { + "epoch": 0.10685483870967742, + "grad_norm": 2.095374822616577, + "learning_rate": 4.860469868116756e-05, + "loss": 5.5514, + "step": 17967 + }, + { + "epoch": 0.10686078599295841, + "grad_norm": 1.8596038818359375, + "learning_rate": 4.8604544811414465e-05, + "loss": 5.5171, + "step": 17968 + }, + { + "epoch": 0.10686673327623941, + "grad_norm": 2.215549945831299, + "learning_rate": 4.860439093342127e-05, + "loss": 5.3824, + "step": 17969 + }, + { + "epoch": 0.10687268055952041, + "grad_norm": 1.9737238883972168, + "learning_rate": 4.860423704718803e-05, + "loss": 5.4159, + "step": 17970 + }, + { + "epoch": 0.1068786278428014, + "grad_norm": 1.8673701286315918, + "learning_rate": 4.860408315271479e-05, + "loss": 5.421, + "step": 17971 + }, + { + "epoch": 0.1068845751260824, + "grad_norm": 1.905371069908142, + "learning_rate": 4.86039292500016e-05, + "loss": 5.4003, + "step": 17972 + }, + { + "epoch": 0.1068905224093634, + "grad_norm": 1.7888939380645752, + "learning_rate": 4.8603775339048534e-05, + "loss": 5.1581, + "step": 17973 + }, + { + "epoch": 0.1068964696926444, + "grad_norm": 1.7499796152114868, + "learning_rate": 4.8603621419855625e-05, + "loss": 5.1334, + "step": 17974 + }, + { + "epoch": 0.1069024169759254, + "grad_norm": 1.6159700155258179, + "learning_rate": 4.860346749242295e-05, + "loss": 5.1999, + "step": 17975 + }, + { + "epoch": 0.1069083642592064, + "grad_norm": 1.7355921268463135, + "learning_rate": 4.860331355675053e-05, + "loss": 5.3899, + "step": 17976 + }, + { + "epoch": 0.10691431154248739, + "grad_norm": 1.760110855102539, + "learning_rate": 4.860315961283846e-05, + "loss": 5.5386, + "step": 17977 + }, + { + "epoch": 0.10692025882576839, + "grad_norm": 1.605482816696167, + "learning_rate": 4.860300566068675e-05, + "loss": 5.5486, + "step": 17978 + }, + { + "epoch": 0.10692620610904939, + "grad_norm": 2.1792690753936768, + "learning_rate": 4.860285170029548e-05, + "loss": 4.8871, + "step": 17979 + }, + { + "epoch": 0.10693215339233038, + "grad_norm": 1.4513617753982544, + "learning_rate": 4.86026977316647e-05, + "loss": 5.1944, + "step": 17980 + }, + { + "epoch": 0.10693810067561138, + "grad_norm": 2.560112476348877, + "learning_rate": 4.860254375479446e-05, + "loss": 4.2504, + "step": 17981 + }, + { + "epoch": 0.10694404795889238, + "grad_norm": 2.035403251647949, + "learning_rate": 4.8602389769684816e-05, + "loss": 5.4479, + "step": 17982 + }, + { + "epoch": 0.10694999524217337, + "grad_norm": 1.8496562242507935, + "learning_rate": 4.8602235776335826e-05, + "loss": 5.4981, + "step": 17983 + }, + { + "epoch": 0.10695594252545437, + "grad_norm": 1.9541285037994385, + "learning_rate": 4.8602081774747536e-05, + "loss": 5.5772, + "step": 17984 + }, + { + "epoch": 0.10696188980873537, + "grad_norm": 1.674981951713562, + "learning_rate": 4.860192776492001e-05, + "loss": 5.3656, + "step": 17985 + }, + { + "epoch": 0.10696783709201636, + "grad_norm": 1.675601601600647, + "learning_rate": 4.860177374685328e-05, + "loss": 5.3382, + "step": 17986 + }, + { + "epoch": 0.10697378437529736, + "grad_norm": 1.8874675035476685, + "learning_rate": 4.860161972054743e-05, + "loss": 5.1908, + "step": 17987 + }, + { + "epoch": 0.10697973165857837, + "grad_norm": 2.267000675201416, + "learning_rate": 4.860146568600249e-05, + "loss": 5.4437, + "step": 17988 + }, + { + "epoch": 0.10698567894185936, + "grad_norm": 1.8062045574188232, + "learning_rate": 4.8601311643218526e-05, + "loss": 5.2315, + "step": 17989 + }, + { + "epoch": 0.10699162622514036, + "grad_norm": 1.9503196477890015, + "learning_rate": 4.8601157592195584e-05, + "loss": 5.3999, + "step": 17990 + }, + { + "epoch": 0.10699757350842136, + "grad_norm": 1.8589918613433838, + "learning_rate": 4.860100353293372e-05, + "loss": 5.694, + "step": 17991 + }, + { + "epoch": 0.10700352079170235, + "grad_norm": 1.69667649269104, + "learning_rate": 4.8600849465432995e-05, + "loss": 5.6146, + "step": 17992 + }, + { + "epoch": 0.10700946807498335, + "grad_norm": 1.6006754636764526, + "learning_rate": 4.8600695389693455e-05, + "loss": 5.2849, + "step": 17993 + }, + { + "epoch": 0.10701541535826435, + "grad_norm": 1.7502506971359253, + "learning_rate": 4.860054130571516e-05, + "loss": 4.9652, + "step": 17994 + }, + { + "epoch": 0.10702136264154534, + "grad_norm": 1.6936286687850952, + "learning_rate": 4.860038721349816e-05, + "loss": 5.2192, + "step": 17995 + }, + { + "epoch": 0.10702730992482634, + "grad_norm": 1.4757579565048218, + "learning_rate": 4.8600233113042496e-05, + "loss": 5.3917, + "step": 17996 + }, + { + "epoch": 0.10703325720810733, + "grad_norm": 1.4602460861206055, + "learning_rate": 4.8600079004348245e-05, + "loss": 5.5418, + "step": 17997 + }, + { + "epoch": 0.10703920449138833, + "grad_norm": 1.4150431156158447, + "learning_rate": 4.859992488741545e-05, + "loss": 5.6592, + "step": 17998 + }, + { + "epoch": 0.10704515177466933, + "grad_norm": 1.385908842086792, + "learning_rate": 4.859977076224416e-05, + "loss": 5.2818, + "step": 17999 + }, + { + "epoch": 0.10705109905795032, + "grad_norm": 1.3683747053146362, + "learning_rate": 4.8599616628834446e-05, + "loss": 5.2743, + "step": 18000 + }, + { + "epoch": 0.10705704634123132, + "grad_norm": 1.2521027326583862, + "learning_rate": 4.859946248718634e-05, + "loss": 5.1564, + "step": 18001 + }, + { + "epoch": 0.10706299362451233, + "grad_norm": 1.445575475692749, + "learning_rate": 4.8599308337299906e-05, + "loss": 5.0108, + "step": 18002 + }, + { + "epoch": 0.10706894090779331, + "grad_norm": 1.3680258989334106, + "learning_rate": 4.859915417917519e-05, + "loss": 5.2649, + "step": 18003 + }, + { + "epoch": 0.10707488819107432, + "grad_norm": 1.2142491340637207, + "learning_rate": 4.859900001281227e-05, + "loss": 5.1143, + "step": 18004 + }, + { + "epoch": 0.10708083547435532, + "grad_norm": 1.244157314300537, + "learning_rate": 4.859884583821117e-05, + "loss": 5.2321, + "step": 18005 + }, + { + "epoch": 0.1070867827576363, + "grad_norm": 1.4057670831680298, + "learning_rate": 4.859869165537196e-05, + "loss": 5.3419, + "step": 18006 + }, + { + "epoch": 0.10709273004091731, + "grad_norm": 1.3243392705917358, + "learning_rate": 4.859853746429469e-05, + "loss": 5.0217, + "step": 18007 + }, + { + "epoch": 0.10709867732419831, + "grad_norm": 1.3227713108062744, + "learning_rate": 4.8598383264979416e-05, + "loss": 5.055, + "step": 18008 + }, + { + "epoch": 0.1071046246074793, + "grad_norm": 1.3313336372375488, + "learning_rate": 4.8598229057426195e-05, + "loss": 5.1319, + "step": 18009 + }, + { + "epoch": 0.1071105718907603, + "grad_norm": 1.385715126991272, + "learning_rate": 4.8598074841635064e-05, + "loss": 4.9349, + "step": 18010 + }, + { + "epoch": 0.1071165191740413, + "grad_norm": 1.3244850635528564, + "learning_rate": 4.85979206176061e-05, + "loss": 4.9055, + "step": 18011 + }, + { + "epoch": 0.10712246645732229, + "grad_norm": 1.2922260761260986, + "learning_rate": 4.859776638533934e-05, + "loss": 5.0518, + "step": 18012 + }, + { + "epoch": 0.10712841374060329, + "grad_norm": 1.3371012210845947, + "learning_rate": 4.8597612144834845e-05, + "loss": 5.234, + "step": 18013 + }, + { + "epoch": 0.1071343610238843, + "grad_norm": 1.3367552757263184, + "learning_rate": 4.859745789609267e-05, + "loss": 4.9765, + "step": 18014 + }, + { + "epoch": 0.10714030830716528, + "grad_norm": 1.5067929029464722, + "learning_rate": 4.859730363911286e-05, + "loss": 5.235, + "step": 18015 + }, + { + "epoch": 0.10714625559044628, + "grad_norm": 1.3660157918930054, + "learning_rate": 4.859714937389548e-05, + "loss": 5.4104, + "step": 18016 + }, + { + "epoch": 0.10715220287372729, + "grad_norm": 1.3999029397964478, + "learning_rate": 4.859699510044057e-05, + "loss": 5.1603, + "step": 18017 + }, + { + "epoch": 0.10715815015700828, + "grad_norm": 1.6147737503051758, + "learning_rate": 4.8596840818748204e-05, + "loss": 5.0506, + "step": 18018 + }, + { + "epoch": 0.10716409744028928, + "grad_norm": 1.5618371963500977, + "learning_rate": 4.859668652881843e-05, + "loss": 5.1564, + "step": 18019 + }, + { + "epoch": 0.10717004472357028, + "grad_norm": 1.3786426782608032, + "learning_rate": 4.859653223065128e-05, + "loss": 5.1884, + "step": 18020 + }, + { + "epoch": 0.10717599200685127, + "grad_norm": 1.429489016532898, + "learning_rate": 4.859637792424683e-05, + "loss": 5.1556, + "step": 18021 + }, + { + "epoch": 0.10718193929013227, + "grad_norm": 1.3347980976104736, + "learning_rate": 4.859622360960513e-05, + "loss": 5.008, + "step": 18022 + }, + { + "epoch": 0.10718788657341327, + "grad_norm": 1.3850064277648926, + "learning_rate": 4.859606928672623e-05, + "loss": 5.0719, + "step": 18023 + }, + { + "epoch": 0.10719383385669426, + "grad_norm": 1.3279672861099243, + "learning_rate": 4.859591495561019e-05, + "loss": 5.0793, + "step": 18024 + }, + { + "epoch": 0.10719978113997526, + "grad_norm": 1.5108927488327026, + "learning_rate": 4.8595760616257056e-05, + "loss": 5.1067, + "step": 18025 + }, + { + "epoch": 0.10720572842325625, + "grad_norm": 1.2342565059661865, + "learning_rate": 4.859560626866689e-05, + "loss": 5.0298, + "step": 18026 + }, + { + "epoch": 0.10721167570653725, + "grad_norm": 1.2821179628372192, + "learning_rate": 4.859545191283974e-05, + "loss": 5.2185, + "step": 18027 + }, + { + "epoch": 0.10721762298981825, + "grad_norm": 1.11893630027771, + "learning_rate": 4.859529754877566e-05, + "loss": 5.1911, + "step": 18028 + }, + { + "epoch": 0.10722357027309924, + "grad_norm": 1.2202814817428589, + "learning_rate": 4.859514317647471e-05, + "loss": 5.028, + "step": 18029 + }, + { + "epoch": 0.10722951755638024, + "grad_norm": 1.3898543119430542, + "learning_rate": 4.859498879593694e-05, + "loss": 5.4019, + "step": 18030 + }, + { + "epoch": 0.10723546483966125, + "grad_norm": 1.2810478210449219, + "learning_rate": 4.859483440716239e-05, + "loss": 5.0634, + "step": 18031 + }, + { + "epoch": 0.10724141212294223, + "grad_norm": 1.4424680471420288, + "learning_rate": 4.859468001015114e-05, + "loss": 5.0058, + "step": 18032 + }, + { + "epoch": 0.10724735940622324, + "grad_norm": 1.4053739309310913, + "learning_rate": 4.859452560490323e-05, + "loss": 5.0174, + "step": 18033 + }, + { + "epoch": 0.10725330668950424, + "grad_norm": 1.2552763223648071, + "learning_rate": 4.859437119141871e-05, + "loss": 5.0222, + "step": 18034 + }, + { + "epoch": 0.10725925397278523, + "grad_norm": 1.3694052696228027, + "learning_rate": 4.859421676969764e-05, + "loss": 4.9663, + "step": 18035 + }, + { + "epoch": 0.10726520125606623, + "grad_norm": 1.3814043998718262, + "learning_rate": 4.859406233974007e-05, + "loss": 5.01, + "step": 18036 + }, + { + "epoch": 0.10727114853934723, + "grad_norm": 1.5185308456420898, + "learning_rate": 4.859390790154606e-05, + "loss": 4.9698, + "step": 18037 + }, + { + "epoch": 0.10727709582262822, + "grad_norm": 1.2509820461273193, + "learning_rate": 4.859375345511566e-05, + "loss": 5.1034, + "step": 18038 + }, + { + "epoch": 0.10728304310590922, + "grad_norm": 1.3478872776031494, + "learning_rate": 4.8593599000448926e-05, + "loss": 5.2459, + "step": 18039 + }, + { + "epoch": 0.10728899038919022, + "grad_norm": 1.3720686435699463, + "learning_rate": 4.859344453754591e-05, + "loss": 5.1671, + "step": 18040 + }, + { + "epoch": 0.10729493767247121, + "grad_norm": 1.3953602313995361, + "learning_rate": 4.859329006640666e-05, + "loss": 5.3221, + "step": 18041 + }, + { + "epoch": 0.10730088495575221, + "grad_norm": 1.4901010990142822, + "learning_rate": 4.859313558703125e-05, + "loss": 5.1694, + "step": 18042 + }, + { + "epoch": 0.10730683223903321, + "grad_norm": 1.4153228998184204, + "learning_rate": 4.859298109941971e-05, + "loss": 5.2721, + "step": 18043 + }, + { + "epoch": 0.1073127795223142, + "grad_norm": 1.34188711643219, + "learning_rate": 4.859282660357211e-05, + "loss": 5.3048, + "step": 18044 + }, + { + "epoch": 0.1073187268055952, + "grad_norm": 1.355832576751709, + "learning_rate": 4.859267209948849e-05, + "loss": 5.2908, + "step": 18045 + }, + { + "epoch": 0.1073246740888762, + "grad_norm": 1.1551882028579712, + "learning_rate": 4.859251758716891e-05, + "loss": 5.1681, + "step": 18046 + }, + { + "epoch": 0.1073306213721572, + "grad_norm": 1.1728358268737793, + "learning_rate": 4.8592363066613434e-05, + "loss": 5.1535, + "step": 18047 + }, + { + "epoch": 0.1073365686554382, + "grad_norm": 1.4180268049240112, + "learning_rate": 4.859220853782211e-05, + "loss": 4.6467, + "step": 18048 + }, + { + "epoch": 0.1073425159387192, + "grad_norm": 1.4042308330535889, + "learning_rate": 4.8592054000794984e-05, + "loss": 4.7348, + "step": 18049 + }, + { + "epoch": 0.10734846322200019, + "grad_norm": 1.2508533000946045, + "learning_rate": 4.859189945553211e-05, + "loss": 4.7797, + "step": 18050 + }, + { + "epoch": 0.10735441050528119, + "grad_norm": 1.2266274690628052, + "learning_rate": 4.859174490203355e-05, + "loss": 4.7223, + "step": 18051 + }, + { + "epoch": 0.10736035778856219, + "grad_norm": 1.3217378854751587, + "learning_rate": 4.8591590340299366e-05, + "loss": 4.82, + "step": 18052 + }, + { + "epoch": 0.10736630507184318, + "grad_norm": 1.3789056539535522, + "learning_rate": 4.8591435770329594e-05, + "loss": 5.3133, + "step": 18053 + }, + { + "epoch": 0.10737225235512418, + "grad_norm": 1.6090314388275146, + "learning_rate": 4.85912811921243e-05, + "loss": 5.2263, + "step": 18054 + }, + { + "epoch": 0.10737819963840518, + "grad_norm": 1.3780972957611084, + "learning_rate": 4.859112660568353e-05, + "loss": 5.3081, + "step": 18055 + }, + { + "epoch": 0.10738414692168617, + "grad_norm": 1.3518953323364258, + "learning_rate": 4.859097201100734e-05, + "loss": 5.3423, + "step": 18056 + }, + { + "epoch": 0.10739009420496717, + "grad_norm": 1.4160034656524658, + "learning_rate": 4.859081740809579e-05, + "loss": 5.3082, + "step": 18057 + }, + { + "epoch": 0.10739604148824816, + "grad_norm": 1.1970654726028442, + "learning_rate": 4.8590662796948924e-05, + "loss": 5.254, + "step": 18058 + }, + { + "epoch": 0.10740198877152916, + "grad_norm": 1.3175582885742188, + "learning_rate": 4.859050817756681e-05, + "loss": 5.2823, + "step": 18059 + }, + { + "epoch": 0.10740793605481017, + "grad_norm": 1.5136942863464355, + "learning_rate": 4.859035354994948e-05, + "loss": 5.2238, + "step": 18060 + }, + { + "epoch": 0.10741388333809115, + "grad_norm": 1.2552412748336792, + "learning_rate": 4.859019891409701e-05, + "loss": 5.0492, + "step": 18061 + }, + { + "epoch": 0.10741983062137216, + "grad_norm": 1.2873655557632446, + "learning_rate": 4.859004427000945e-05, + "loss": 4.9162, + "step": 18062 + }, + { + "epoch": 0.10742577790465316, + "grad_norm": 1.2441788911819458, + "learning_rate": 4.8589889617686834e-05, + "loss": 4.9769, + "step": 18063 + }, + { + "epoch": 0.10743172518793415, + "grad_norm": 1.4254180192947388, + "learning_rate": 4.8589734957129246e-05, + "loss": 4.9917, + "step": 18064 + }, + { + "epoch": 0.10743767247121515, + "grad_norm": 1.3922675848007202, + "learning_rate": 4.858958028833672e-05, + "loss": 4.9705, + "step": 18065 + }, + { + "epoch": 0.10744361975449615, + "grad_norm": 1.430801510810852, + "learning_rate": 4.858942561130932e-05, + "loss": 5.0772, + "step": 18066 + }, + { + "epoch": 0.10744956703777714, + "grad_norm": 1.3651894330978394, + "learning_rate": 4.8589270926047085e-05, + "loss": 4.8844, + "step": 18067 + }, + { + "epoch": 0.10745551432105814, + "grad_norm": 1.4133042097091675, + "learning_rate": 4.858911623255008e-05, + "loss": 4.9397, + "step": 18068 + }, + { + "epoch": 0.10746146160433914, + "grad_norm": 1.4437615871429443, + "learning_rate": 4.858896153081837e-05, + "loss": 4.9977, + "step": 18069 + }, + { + "epoch": 0.10746740888762013, + "grad_norm": 1.3420813083648682, + "learning_rate": 4.858880682085199e-05, + "loss": 4.9295, + "step": 18070 + }, + { + "epoch": 0.10747335617090113, + "grad_norm": 1.2613091468811035, + "learning_rate": 4.8588652102651e-05, + "loss": 5.3186, + "step": 18071 + }, + { + "epoch": 0.10747930345418213, + "grad_norm": 1.2117836475372314, + "learning_rate": 4.858849737621545e-05, + "loss": 5.207, + "step": 18072 + }, + { + "epoch": 0.10748525073746312, + "grad_norm": 1.3153164386749268, + "learning_rate": 4.85883426415454e-05, + "loss": 4.9786, + "step": 18073 + }, + { + "epoch": 0.10749119802074412, + "grad_norm": 1.2437881231307983, + "learning_rate": 4.858818789864091e-05, + "loss": 4.8748, + "step": 18074 + }, + { + "epoch": 0.10749714530402513, + "grad_norm": 1.2477847337722778, + "learning_rate": 4.858803314750203e-05, + "loss": 4.8874, + "step": 18075 + }, + { + "epoch": 0.10750309258730611, + "grad_norm": 1.342822790145874, + "learning_rate": 4.858787838812881e-05, + "loss": 4.8244, + "step": 18076 + }, + { + "epoch": 0.10750903987058712, + "grad_norm": 1.4947394132614136, + "learning_rate": 4.8587723620521306e-05, + "loss": 4.9091, + "step": 18077 + }, + { + "epoch": 0.10751498715386812, + "grad_norm": 1.388978362083435, + "learning_rate": 4.8587568844679566e-05, + "loss": 4.9075, + "step": 18078 + }, + { + "epoch": 0.10752093443714911, + "grad_norm": 1.5932878255844116, + "learning_rate": 4.8587414060603656e-05, + "loss": 4.8712, + "step": 18079 + }, + { + "epoch": 0.10752688172043011, + "grad_norm": 1.3746308088302612, + "learning_rate": 4.8587259268293616e-05, + "loss": 4.9187, + "step": 18080 + }, + { + "epoch": 0.10753282900371111, + "grad_norm": 1.2811295986175537, + "learning_rate": 4.858710446774951e-05, + "loss": 4.8643, + "step": 18081 + }, + { + "epoch": 0.1075387762869921, + "grad_norm": 1.4154548645019531, + "learning_rate": 4.858694965897139e-05, + "loss": 4.8802, + "step": 18082 + }, + { + "epoch": 0.1075447235702731, + "grad_norm": 1.3216148614883423, + "learning_rate": 4.8586794841959305e-05, + "loss": 5.0356, + "step": 18083 + }, + { + "epoch": 0.1075506708535541, + "grad_norm": 1.0971577167510986, + "learning_rate": 4.858664001671332e-05, + "loss": 5.2085, + "step": 18084 + }, + { + "epoch": 0.10755661813683509, + "grad_norm": 1.3257287740707397, + "learning_rate": 4.858648518323348e-05, + "loss": 5.1728, + "step": 18085 + }, + { + "epoch": 0.1075625654201161, + "grad_norm": 1.2429475784301758, + "learning_rate": 4.858633034151985e-05, + "loss": 5.1053, + "step": 18086 + }, + { + "epoch": 0.10756851270339708, + "grad_norm": 1.1196707487106323, + "learning_rate": 4.858617549157246e-05, + "loss": 5.074, + "step": 18087 + }, + { + "epoch": 0.10757445998667808, + "grad_norm": 1.1981266736984253, + "learning_rate": 4.858602063339139e-05, + "loss": 5.0093, + "step": 18088 + }, + { + "epoch": 0.10758040726995909, + "grad_norm": 1.3818682432174683, + "learning_rate": 4.858586576697668e-05, + "loss": 5.0184, + "step": 18089 + }, + { + "epoch": 0.10758635455324007, + "grad_norm": 1.303539752960205, + "learning_rate": 4.85857108923284e-05, + "loss": 5.1778, + "step": 18090 + }, + { + "epoch": 0.10759230183652108, + "grad_norm": 1.3990812301635742, + "learning_rate": 4.8585556009446576e-05, + "loss": 4.9785, + "step": 18091 + }, + { + "epoch": 0.10759824911980208, + "grad_norm": 1.2507104873657227, + "learning_rate": 4.858540111833129e-05, + "loss": 4.9024, + "step": 18092 + }, + { + "epoch": 0.10760419640308307, + "grad_norm": 1.2867792844772339, + "learning_rate": 4.858524621898257e-05, + "loss": 4.8847, + "step": 18093 + }, + { + "epoch": 0.10761014368636407, + "grad_norm": 1.1816591024398804, + "learning_rate": 4.8585091311400495e-05, + "loss": 4.9431, + "step": 18094 + }, + { + "epoch": 0.10761609096964507, + "grad_norm": 1.292284607887268, + "learning_rate": 4.85849363955851e-05, + "loss": 5.2273, + "step": 18095 + }, + { + "epoch": 0.10762203825292606, + "grad_norm": 1.3242478370666504, + "learning_rate": 4.8584781471536456e-05, + "loss": 5.093, + "step": 18096 + }, + { + "epoch": 0.10762798553620706, + "grad_norm": 1.211534857749939, + "learning_rate": 4.858462653925461e-05, + "loss": 5.0928, + "step": 18097 + }, + { + "epoch": 0.10763393281948806, + "grad_norm": 1.0469262599945068, + "learning_rate": 4.858447159873961e-05, + "loss": 5.0435, + "step": 18098 + }, + { + "epoch": 0.10763988010276905, + "grad_norm": 1.2352322340011597, + "learning_rate": 4.8584316649991514e-05, + "loss": 5.1899, + "step": 18099 + }, + { + "epoch": 0.10764582738605005, + "grad_norm": 1.2135246992111206, + "learning_rate": 4.8584161693010375e-05, + "loss": 5.1028, + "step": 18100 + }, + { + "epoch": 0.10765177466933105, + "grad_norm": 1.3525876998901367, + "learning_rate": 4.858400672779625e-05, + "loss": 5.0422, + "step": 18101 + }, + { + "epoch": 0.10765772195261204, + "grad_norm": 1.3221076726913452, + "learning_rate": 4.85838517543492e-05, + "loss": 5.1329, + "step": 18102 + }, + { + "epoch": 0.10766366923589304, + "grad_norm": 1.4856393337249756, + "learning_rate": 4.858369677266926e-05, + "loss": 4.6795, + "step": 18103 + }, + { + "epoch": 0.10766961651917405, + "grad_norm": 1.4690982103347778, + "learning_rate": 4.8583541782756495e-05, + "loss": 5.1234, + "step": 18104 + }, + { + "epoch": 0.10767556380245503, + "grad_norm": 1.2535064220428467, + "learning_rate": 4.8583386784610964e-05, + "loss": 5.1344, + "step": 18105 + }, + { + "epoch": 0.10768151108573604, + "grad_norm": 1.3537837266921997, + "learning_rate": 4.858323177823272e-05, + "loss": 5.228, + "step": 18106 + }, + { + "epoch": 0.10768745836901704, + "grad_norm": 1.2927895784378052, + "learning_rate": 4.8583076763621805e-05, + "loss": 5.2371, + "step": 18107 + }, + { + "epoch": 0.10769340565229803, + "grad_norm": 1.2356709241867065, + "learning_rate": 4.8582921740778284e-05, + "loss": 4.9056, + "step": 18108 + }, + { + "epoch": 0.10769935293557903, + "grad_norm": 1.266918420791626, + "learning_rate": 4.858276670970221e-05, + "loss": 5.2142, + "step": 18109 + }, + { + "epoch": 0.10770530021886003, + "grad_norm": 1.1703591346740723, + "learning_rate": 4.858261167039364e-05, + "loss": 5.1237, + "step": 18110 + }, + { + "epoch": 0.10771124750214102, + "grad_norm": 1.2324700355529785, + "learning_rate": 4.858245662285262e-05, + "loss": 5.1391, + "step": 18111 + }, + { + "epoch": 0.10771719478542202, + "grad_norm": 1.2764140367507935, + "learning_rate": 4.85823015670792e-05, + "loss": 5.1368, + "step": 18112 + }, + { + "epoch": 0.10772314206870302, + "grad_norm": 1.254909634590149, + "learning_rate": 4.8582146503073456e-05, + "loss": 5.002, + "step": 18113 + }, + { + "epoch": 0.10772908935198401, + "grad_norm": 1.3368279933929443, + "learning_rate": 4.858199143083542e-05, + "loss": 5.1365, + "step": 18114 + }, + { + "epoch": 0.10773503663526501, + "grad_norm": 1.3550091981887817, + "learning_rate": 4.8581836350365165e-05, + "loss": 5.1722, + "step": 18115 + }, + { + "epoch": 0.107740983918546, + "grad_norm": 1.6306661367416382, + "learning_rate": 4.858168126166272e-05, + "loss": 5.0883, + "step": 18116 + }, + { + "epoch": 0.107746931201827, + "grad_norm": 1.5143946409225464, + "learning_rate": 4.858152616472816e-05, + "loss": 5.1258, + "step": 18117 + }, + { + "epoch": 0.107752878485108, + "grad_norm": 1.6553763151168823, + "learning_rate": 4.858137105956153e-05, + "loss": 4.9596, + "step": 18118 + }, + { + "epoch": 0.107758825768389, + "grad_norm": 1.920473337173462, + "learning_rate": 4.8581215946162896e-05, + "loss": 5.2206, + "step": 18119 + }, + { + "epoch": 0.10776477305167, + "grad_norm": 1.8482425212860107, + "learning_rate": 4.85810608245323e-05, + "loss": 5.1515, + "step": 18120 + }, + { + "epoch": 0.107770720334951, + "grad_norm": 1.6005665063858032, + "learning_rate": 4.8580905694669794e-05, + "loss": 5.1383, + "step": 18121 + }, + { + "epoch": 0.10777666761823199, + "grad_norm": 1.2169783115386963, + "learning_rate": 4.858075055657544e-05, + "loss": 5.3538, + "step": 18122 + }, + { + "epoch": 0.10778261490151299, + "grad_norm": 1.3251442909240723, + "learning_rate": 4.858059541024929e-05, + "loss": 5.3116, + "step": 18123 + }, + { + "epoch": 0.10778856218479399, + "grad_norm": 1.2065789699554443, + "learning_rate": 4.858044025569139e-05, + "loss": 5.2334, + "step": 18124 + }, + { + "epoch": 0.10779450946807498, + "grad_norm": 1.5847411155700684, + "learning_rate": 4.858028509290181e-05, + "loss": 4.9114, + "step": 18125 + }, + { + "epoch": 0.10780045675135598, + "grad_norm": 1.373826503753662, + "learning_rate": 4.85801299218806e-05, + "loss": 5.0748, + "step": 18126 + }, + { + "epoch": 0.10780640403463698, + "grad_norm": 1.7349494695663452, + "learning_rate": 4.85799747426278e-05, + "loss": 5.0888, + "step": 18127 + }, + { + "epoch": 0.10781235131791797, + "grad_norm": 1.3385915756225586, + "learning_rate": 4.857981955514349e-05, + "loss": 5.1472, + "step": 18128 + }, + { + "epoch": 0.10781829860119897, + "grad_norm": 1.3666753768920898, + "learning_rate": 4.857966435942769e-05, + "loss": 5.0881, + "step": 18129 + }, + { + "epoch": 0.10782424588447997, + "grad_norm": 1.39078688621521, + "learning_rate": 4.857950915548048e-05, + "loss": 5.3867, + "step": 18130 + }, + { + "epoch": 0.10783019316776096, + "grad_norm": 1.4484905004501343, + "learning_rate": 4.857935394330192e-05, + "loss": 5.0516, + "step": 18131 + }, + { + "epoch": 0.10783614045104196, + "grad_norm": 1.526084542274475, + "learning_rate": 4.8579198722892034e-05, + "loss": 5.0424, + "step": 18132 + }, + { + "epoch": 0.10784208773432297, + "grad_norm": 1.4617003202438354, + "learning_rate": 4.8579043494250895e-05, + "loss": 5.0245, + "step": 18133 + }, + { + "epoch": 0.10784803501760395, + "grad_norm": 1.3335559368133545, + "learning_rate": 4.857888825737856e-05, + "loss": 4.9398, + "step": 18134 + }, + { + "epoch": 0.10785398230088496, + "grad_norm": 1.1473711729049683, + "learning_rate": 4.857873301227508e-05, + "loss": 5.1818, + "step": 18135 + }, + { + "epoch": 0.10785992958416596, + "grad_norm": 1.5986409187316895, + "learning_rate": 4.8578577758940504e-05, + "loss": 5.3518, + "step": 18136 + }, + { + "epoch": 0.10786587686744695, + "grad_norm": 1.6430408954620361, + "learning_rate": 4.857842249737489e-05, + "loss": 5.3052, + "step": 18137 + }, + { + "epoch": 0.10787182415072795, + "grad_norm": 1.5069605112075806, + "learning_rate": 4.8578267227578303e-05, + "loss": 5.3491, + "step": 18138 + }, + { + "epoch": 0.10787777143400895, + "grad_norm": 1.3385566473007202, + "learning_rate": 4.857811194955077e-05, + "loss": 5.3864, + "step": 18139 + }, + { + "epoch": 0.10788371871728994, + "grad_norm": 1.1956936120986938, + "learning_rate": 4.857795666329237e-05, + "loss": 5.1304, + "step": 18140 + }, + { + "epoch": 0.10788966600057094, + "grad_norm": 1.3437196016311646, + "learning_rate": 4.857780136880315e-05, + "loss": 5.1872, + "step": 18141 + }, + { + "epoch": 0.10789561328385194, + "grad_norm": 1.4649217128753662, + "learning_rate": 4.857764606608316e-05, + "loss": 5.4178, + "step": 18142 + }, + { + "epoch": 0.10790156056713293, + "grad_norm": 1.2196028232574463, + "learning_rate": 4.857749075513246e-05, + "loss": 5.1782, + "step": 18143 + }, + { + "epoch": 0.10790750785041393, + "grad_norm": 1.2016780376434326, + "learning_rate": 4.8577335435951096e-05, + "loss": 5.2293, + "step": 18144 + }, + { + "epoch": 0.10791345513369492, + "grad_norm": 1.3034183979034424, + "learning_rate": 4.857718010853914e-05, + "loss": 5.2886, + "step": 18145 + }, + { + "epoch": 0.10791940241697592, + "grad_norm": 1.1815390586853027, + "learning_rate": 4.857702477289663e-05, + "loss": 5.2637, + "step": 18146 + }, + { + "epoch": 0.10792534970025693, + "grad_norm": 1.328203558921814, + "learning_rate": 4.857686942902362e-05, + "loss": 5.3154, + "step": 18147 + }, + { + "epoch": 0.10793129698353791, + "grad_norm": 1.2995961904525757, + "learning_rate": 4.857671407692016e-05, + "loss": 5.3313, + "step": 18148 + }, + { + "epoch": 0.10793724426681892, + "grad_norm": 1.181191325187683, + "learning_rate": 4.8576558716586326e-05, + "loss": 5.2589, + "step": 18149 + }, + { + "epoch": 0.10794319155009992, + "grad_norm": 1.266570806503296, + "learning_rate": 4.8576403348022154e-05, + "loss": 5.1694, + "step": 18150 + }, + { + "epoch": 0.1079491388333809, + "grad_norm": 1.4107643365859985, + "learning_rate": 4.857624797122771e-05, + "loss": 5.1784, + "step": 18151 + }, + { + "epoch": 0.10795508611666191, + "grad_norm": 1.1809200048446655, + "learning_rate": 4.8576092586203024e-05, + "loss": 5.3081, + "step": 18152 + }, + { + "epoch": 0.10796103339994291, + "grad_norm": 1.179453730583191, + "learning_rate": 4.857593719294818e-05, + "loss": 5.2534, + "step": 18153 + }, + { + "epoch": 0.1079669806832239, + "grad_norm": 1.3677690029144287, + "learning_rate": 4.857578179146323e-05, + "loss": 5.4021, + "step": 18154 + }, + { + "epoch": 0.1079729279665049, + "grad_norm": 1.3077856302261353, + "learning_rate": 4.8575626381748196e-05, + "loss": 5.1766, + "step": 18155 + }, + { + "epoch": 0.1079788752497859, + "grad_norm": 1.075791835784912, + "learning_rate": 4.857547096380317e-05, + "loss": 5.163, + "step": 18156 + }, + { + "epoch": 0.10798482253306689, + "grad_norm": 1.2855931520462036, + "learning_rate": 4.8575315537628186e-05, + "loss": 5.157, + "step": 18157 + }, + { + "epoch": 0.10799076981634789, + "grad_norm": 1.1961009502410889, + "learning_rate": 4.8575160103223303e-05, + "loss": 5.1632, + "step": 18158 + }, + { + "epoch": 0.1079967170996289, + "grad_norm": 1.6419997215270996, + "learning_rate": 4.8575004660588574e-05, + "loss": 5.1575, + "step": 18159 + }, + { + "epoch": 0.10800266438290988, + "grad_norm": 1.5928575992584229, + "learning_rate": 4.857484920972405e-05, + "loss": 5.0818, + "step": 18160 + }, + { + "epoch": 0.10800861166619088, + "grad_norm": 1.3492580652236938, + "learning_rate": 4.85746937506298e-05, + "loss": 5.1529, + "step": 18161 + }, + { + "epoch": 0.10801455894947189, + "grad_norm": 1.543717861175537, + "learning_rate": 4.857453828330587e-05, + "loss": 5.6192, + "step": 18162 + }, + { + "epoch": 0.10802050623275287, + "grad_norm": 1.5657880306243896, + "learning_rate": 4.85743828077523e-05, + "loss": 5.6619, + "step": 18163 + }, + { + "epoch": 0.10802645351603388, + "grad_norm": 1.3861533403396606, + "learning_rate": 4.8574227323969164e-05, + "loss": 5.2147, + "step": 18164 + }, + { + "epoch": 0.10803240079931488, + "grad_norm": 1.3780323266983032, + "learning_rate": 4.85740718319565e-05, + "loss": 5.1112, + "step": 18165 + }, + { + "epoch": 0.10803834808259587, + "grad_norm": 1.5768086910247803, + "learning_rate": 4.857391633171438e-05, + "loss": 5.011, + "step": 18166 + }, + { + "epoch": 0.10804429536587687, + "grad_norm": 1.4504894018173218, + "learning_rate": 4.857376082324285e-05, + "loss": 4.9349, + "step": 18167 + }, + { + "epoch": 0.10805024264915787, + "grad_norm": 1.5084949731826782, + "learning_rate": 4.857360530654196e-05, + "loss": 4.9861, + "step": 18168 + }, + { + "epoch": 0.10805618993243886, + "grad_norm": 1.4052237272262573, + "learning_rate": 4.857344978161177e-05, + "loss": 5.0447, + "step": 18169 + }, + { + "epoch": 0.10806213721571986, + "grad_norm": 1.5666663646697998, + "learning_rate": 4.857329424845233e-05, + "loss": 5.3537, + "step": 18170 + }, + { + "epoch": 0.10806808449900086, + "grad_norm": 1.251293420791626, + "learning_rate": 4.8573138707063695e-05, + "loss": 5.0139, + "step": 18171 + }, + { + "epoch": 0.10807403178228185, + "grad_norm": 1.2570216655731201, + "learning_rate": 4.8572983157445926e-05, + "loss": 4.9959, + "step": 18172 + }, + { + "epoch": 0.10807997906556285, + "grad_norm": 1.5116729736328125, + "learning_rate": 4.857282759959907e-05, + "loss": 5.1592, + "step": 18173 + }, + { + "epoch": 0.10808592634884384, + "grad_norm": 1.518898367881775, + "learning_rate": 4.857267203352318e-05, + "loss": 5.3541, + "step": 18174 + }, + { + "epoch": 0.10809187363212484, + "grad_norm": 1.314247965812683, + "learning_rate": 4.857251645921832e-05, + "loss": 5.2249, + "step": 18175 + }, + { + "epoch": 0.10809782091540585, + "grad_norm": 1.378150224685669, + "learning_rate": 4.857236087668453e-05, + "loss": 5.0004, + "step": 18176 + }, + { + "epoch": 0.10810376819868683, + "grad_norm": 1.4453868865966797, + "learning_rate": 4.8572205285921876e-05, + "loss": 5.2717, + "step": 18177 + }, + { + "epoch": 0.10810971548196784, + "grad_norm": 1.3493587970733643, + "learning_rate": 4.857204968693041e-05, + "loss": 5.4044, + "step": 18178 + }, + { + "epoch": 0.10811566276524884, + "grad_norm": 1.3819094896316528, + "learning_rate": 4.857189407971019e-05, + "loss": 5.0641, + "step": 18179 + }, + { + "epoch": 0.10812161004852983, + "grad_norm": 1.337969422340393, + "learning_rate": 4.857173846426126e-05, + "loss": 4.9078, + "step": 18180 + }, + { + "epoch": 0.10812755733181083, + "grad_norm": 1.655778408050537, + "learning_rate": 4.857158284058367e-05, + "loss": 4.9192, + "step": 18181 + }, + { + "epoch": 0.10813350461509183, + "grad_norm": 1.3867977857589722, + "learning_rate": 4.85714272086775e-05, + "loss": 4.86, + "step": 18182 + }, + { + "epoch": 0.10813945189837282, + "grad_norm": 1.5444231033325195, + "learning_rate": 4.8571271568542786e-05, + "loss": 4.9745, + "step": 18183 + }, + { + "epoch": 0.10814539918165382, + "grad_norm": 1.470123052597046, + "learning_rate": 4.8571115920179576e-05, + "loss": 5.1311, + "step": 18184 + }, + { + "epoch": 0.10815134646493482, + "grad_norm": 1.3052124977111816, + "learning_rate": 4.8570960263587936e-05, + "loss": 5.0657, + "step": 18185 + }, + { + "epoch": 0.10815729374821581, + "grad_norm": 1.4197286367416382, + "learning_rate": 4.857080459876792e-05, + "loss": 5.0798, + "step": 18186 + }, + { + "epoch": 0.10816324103149681, + "grad_norm": 1.5119234323501587, + "learning_rate": 4.857064892571958e-05, + "loss": 5.2842, + "step": 18187 + }, + { + "epoch": 0.10816918831477781, + "grad_norm": 1.6037629842758179, + "learning_rate": 4.8570493244442974e-05, + "loss": 4.8785, + "step": 18188 + }, + { + "epoch": 0.1081751355980588, + "grad_norm": 1.6456643342971802, + "learning_rate": 4.857033755493814e-05, + "loss": 5.2566, + "step": 18189 + }, + { + "epoch": 0.1081810828813398, + "grad_norm": 1.5777020454406738, + "learning_rate": 4.8570181857205155e-05, + "loss": 4.9856, + "step": 18190 + }, + { + "epoch": 0.1081870301646208, + "grad_norm": 1.6042171716690063, + "learning_rate": 4.857002615124405e-05, + "loss": 4.9179, + "step": 18191 + }, + { + "epoch": 0.1081929774479018, + "grad_norm": 1.2339718341827393, + "learning_rate": 4.856987043705491e-05, + "loss": 4.9144, + "step": 18192 + }, + { + "epoch": 0.1081989247311828, + "grad_norm": 1.4531115293502808, + "learning_rate": 4.856971471463776e-05, + "loss": 5.0296, + "step": 18193 + }, + { + "epoch": 0.1082048720144638, + "grad_norm": 1.4179781675338745, + "learning_rate": 4.856955898399267e-05, + "loss": 5.268, + "step": 18194 + }, + { + "epoch": 0.10821081929774479, + "grad_norm": 1.5291078090667725, + "learning_rate": 4.856940324511969e-05, + "loss": 5.2433, + "step": 18195 + }, + { + "epoch": 0.10821676658102579, + "grad_norm": 1.5799169540405273, + "learning_rate": 4.856924749801888e-05, + "loss": 5.1906, + "step": 18196 + }, + { + "epoch": 0.10822271386430679, + "grad_norm": 1.4068591594696045, + "learning_rate": 4.8569091742690276e-05, + "loss": 5.2152, + "step": 18197 + }, + { + "epoch": 0.10822866114758778, + "grad_norm": 1.3728901147842407, + "learning_rate": 4.8568935979133953e-05, + "loss": 5.1717, + "step": 18198 + }, + { + "epoch": 0.10823460843086878, + "grad_norm": 1.524344563484192, + "learning_rate": 4.856878020734996e-05, + "loss": 5.0635, + "step": 18199 + }, + { + "epoch": 0.10824055571414978, + "grad_norm": 1.4725397825241089, + "learning_rate": 4.856862442733835e-05, + "loss": 5.2382, + "step": 18200 + }, + { + "epoch": 0.10824650299743077, + "grad_norm": 1.3467813730239868, + "learning_rate": 4.856846863909917e-05, + "loss": 5.0823, + "step": 18201 + }, + { + "epoch": 0.10825245028071177, + "grad_norm": 1.264833927154541, + "learning_rate": 4.856831284263249e-05, + "loss": 5.1763, + "step": 18202 + }, + { + "epoch": 0.10825839756399276, + "grad_norm": 1.2883045673370361, + "learning_rate": 4.856815703793836e-05, + "loss": 5.1207, + "step": 18203 + }, + { + "epoch": 0.10826434484727376, + "grad_norm": 1.309486746788025, + "learning_rate": 4.856800122501681e-05, + "loss": 5.0648, + "step": 18204 + }, + { + "epoch": 0.10827029213055477, + "grad_norm": 1.4473057985305786, + "learning_rate": 4.856784540386793e-05, + "loss": 4.9615, + "step": 18205 + }, + { + "epoch": 0.10827623941383575, + "grad_norm": 1.5151125192642212, + "learning_rate": 4.856768957449175e-05, + "loss": 5.2847, + "step": 18206 + }, + { + "epoch": 0.10828218669711676, + "grad_norm": 1.4859318733215332, + "learning_rate": 4.8567533736888336e-05, + "loss": 4.931, + "step": 18207 + }, + { + "epoch": 0.10828813398039776, + "grad_norm": 1.6516517400741577, + "learning_rate": 4.8567377891057745e-05, + "loss": 5.05, + "step": 18208 + }, + { + "epoch": 0.10829408126367875, + "grad_norm": 1.679347276687622, + "learning_rate": 4.8567222037000024e-05, + "loss": 5.2281, + "step": 18209 + }, + { + "epoch": 0.10830002854695975, + "grad_norm": 1.5119515657424927, + "learning_rate": 4.856706617471523e-05, + "loss": 4.9572, + "step": 18210 + }, + { + "epoch": 0.10830597583024075, + "grad_norm": 1.6819381713867188, + "learning_rate": 4.8566910304203404e-05, + "loss": 4.6228, + "step": 18211 + }, + { + "epoch": 0.10831192311352174, + "grad_norm": 1.7754294872283936, + "learning_rate": 4.856675442546462e-05, + "loss": 4.6851, + "step": 18212 + }, + { + "epoch": 0.10831787039680274, + "grad_norm": 1.455660343170166, + "learning_rate": 4.856659853849893e-05, + "loss": 5.059, + "step": 18213 + }, + { + "epoch": 0.10832381768008374, + "grad_norm": 1.358823299407959, + "learning_rate": 4.856644264330639e-05, + "loss": 5.0354, + "step": 18214 + }, + { + "epoch": 0.10832976496336473, + "grad_norm": 1.465482473373413, + "learning_rate": 4.856628673988703e-05, + "loss": 5.0441, + "step": 18215 + }, + { + "epoch": 0.10833571224664573, + "grad_norm": 1.3863260746002197, + "learning_rate": 4.8566130828240936e-05, + "loss": 5.0445, + "step": 18216 + }, + { + "epoch": 0.10834165952992673, + "grad_norm": 1.556997299194336, + "learning_rate": 4.856597490836815e-05, + "loss": 5.0629, + "step": 18217 + }, + { + "epoch": 0.10834760681320772, + "grad_norm": 1.3784066438674927, + "learning_rate": 4.856581898026872e-05, + "loss": 5.1894, + "step": 18218 + }, + { + "epoch": 0.10835355409648872, + "grad_norm": 1.4675719738006592, + "learning_rate": 4.856566304394271e-05, + "loss": 5.008, + "step": 18219 + }, + { + "epoch": 0.10835950137976973, + "grad_norm": 1.634920597076416, + "learning_rate": 4.856550709939016e-05, + "loss": 4.7707, + "step": 18220 + }, + { + "epoch": 0.10836544866305071, + "grad_norm": 1.83092200756073, + "learning_rate": 4.856535114661115e-05, + "loss": 4.8947, + "step": 18221 + }, + { + "epoch": 0.10837139594633172, + "grad_norm": 1.497359037399292, + "learning_rate": 4.856519518560571e-05, + "loss": 4.9656, + "step": 18222 + }, + { + "epoch": 0.10837734322961272, + "grad_norm": 1.3194255828857422, + "learning_rate": 4.856503921637391e-05, + "loss": 5.2374, + "step": 18223 + }, + { + "epoch": 0.1083832905128937, + "grad_norm": 1.3584619760513306, + "learning_rate": 4.8564883238915794e-05, + "loss": 5.1154, + "step": 18224 + }, + { + "epoch": 0.10838923779617471, + "grad_norm": 1.4173928499221802, + "learning_rate": 4.8564727253231416e-05, + "loss": 5.173, + "step": 18225 + }, + { + "epoch": 0.10839518507945571, + "grad_norm": 1.4110074043273926, + "learning_rate": 4.8564571259320844e-05, + "loss": 5.2409, + "step": 18226 + }, + { + "epoch": 0.1084011323627367, + "grad_norm": 1.4481827020645142, + "learning_rate": 4.856441525718412e-05, + "loss": 4.8533, + "step": 18227 + }, + { + "epoch": 0.1084070796460177, + "grad_norm": 1.4017881155014038, + "learning_rate": 4.85642592468213e-05, + "loss": 5.0483, + "step": 18228 + }, + { + "epoch": 0.1084130269292987, + "grad_norm": 1.3940458297729492, + "learning_rate": 4.8564103228232445e-05, + "loss": 5.0983, + "step": 18229 + }, + { + "epoch": 0.10841897421257969, + "grad_norm": 1.4414485692977905, + "learning_rate": 4.8563947201417604e-05, + "loss": 5.1561, + "step": 18230 + }, + { + "epoch": 0.1084249214958607, + "grad_norm": 1.3622056245803833, + "learning_rate": 4.856379116637683e-05, + "loss": 5.1773, + "step": 18231 + }, + { + "epoch": 0.10843086877914168, + "grad_norm": 1.3298035860061646, + "learning_rate": 4.856363512311019e-05, + "loss": 5.0742, + "step": 18232 + }, + { + "epoch": 0.10843681606242268, + "grad_norm": 1.3110575675964355, + "learning_rate": 4.856347907161771e-05, + "loss": 5.044, + "step": 18233 + }, + { + "epoch": 0.10844276334570369, + "grad_norm": 1.309591293334961, + "learning_rate": 4.856332301189948e-05, + "loss": 5.1313, + "step": 18234 + }, + { + "epoch": 0.10844871062898467, + "grad_norm": 1.2283830642700195, + "learning_rate": 4.856316694395552e-05, + "loss": 5.0777, + "step": 18235 + }, + { + "epoch": 0.10845465791226568, + "grad_norm": 1.1523172855377197, + "learning_rate": 4.856301086778592e-05, + "loss": 5.1245, + "step": 18236 + }, + { + "epoch": 0.10846060519554668, + "grad_norm": 1.3058217763900757, + "learning_rate": 4.85628547833907e-05, + "loss": 4.9649, + "step": 18237 + }, + { + "epoch": 0.10846655247882767, + "grad_norm": 1.239734172821045, + "learning_rate": 4.856269869076994e-05, + "loss": 5.0736, + "step": 18238 + }, + { + "epoch": 0.10847249976210867, + "grad_norm": 1.2624062299728394, + "learning_rate": 4.856254258992369e-05, + "loss": 5.0538, + "step": 18239 + }, + { + "epoch": 0.10847844704538967, + "grad_norm": 1.2172342538833618, + "learning_rate": 4.856238648085199e-05, + "loss": 5.0781, + "step": 18240 + }, + { + "epoch": 0.10848439432867066, + "grad_norm": 1.2534043788909912, + "learning_rate": 4.8562230363554906e-05, + "loss": 5.2148, + "step": 18241 + }, + { + "epoch": 0.10849034161195166, + "grad_norm": 1.3765602111816406, + "learning_rate": 4.85620742380325e-05, + "loss": 5.1274, + "step": 18242 + }, + { + "epoch": 0.10849628889523266, + "grad_norm": 1.4610897302627563, + "learning_rate": 4.856191810428481e-05, + "loss": 5.0356, + "step": 18243 + }, + { + "epoch": 0.10850223617851365, + "grad_norm": 1.4103399515151978, + "learning_rate": 4.8561761962311895e-05, + "loss": 5.0198, + "step": 18244 + }, + { + "epoch": 0.10850818346179465, + "grad_norm": 1.5159040689468384, + "learning_rate": 4.856160581211382e-05, + "loss": 5.0139, + "step": 18245 + }, + { + "epoch": 0.10851413074507565, + "grad_norm": 1.5071041584014893, + "learning_rate": 4.856144965369063e-05, + "loss": 4.9644, + "step": 18246 + }, + { + "epoch": 0.10852007802835664, + "grad_norm": 1.4504464864730835, + "learning_rate": 4.856129348704237e-05, + "loss": 5.041, + "step": 18247 + }, + { + "epoch": 0.10852602531163764, + "grad_norm": 1.2327022552490234, + "learning_rate": 4.856113731216911e-05, + "loss": 4.9775, + "step": 18248 + }, + { + "epoch": 0.10853197259491865, + "grad_norm": 2.013401508331299, + "learning_rate": 4.8560981129070914e-05, + "loss": 4.5814, + "step": 18249 + }, + { + "epoch": 0.10853791987819963, + "grad_norm": 1.7224215269088745, + "learning_rate": 4.8560824937747814e-05, + "loss": 5.3439, + "step": 18250 + }, + { + "epoch": 0.10854386716148064, + "grad_norm": 1.6198631525039673, + "learning_rate": 4.856066873819987e-05, + "loss": 5.0878, + "step": 18251 + }, + { + "epoch": 0.10854981444476164, + "grad_norm": 1.3257763385772705, + "learning_rate": 4.8560512530427146e-05, + "loss": 5.4697, + "step": 18252 + }, + { + "epoch": 0.10855576172804263, + "grad_norm": 1.6341005563735962, + "learning_rate": 4.856035631442969e-05, + "loss": 5.1383, + "step": 18253 + }, + { + "epoch": 0.10856170901132363, + "grad_norm": 1.4148058891296387, + "learning_rate": 4.8560200090207555e-05, + "loss": 5.3053, + "step": 18254 + }, + { + "epoch": 0.10856765629460463, + "grad_norm": 1.4810155630111694, + "learning_rate": 4.8560043857760796e-05, + "loss": 5.1222, + "step": 18255 + }, + { + "epoch": 0.10857360357788562, + "grad_norm": 1.4345650672912598, + "learning_rate": 4.8559887617089476e-05, + "loss": 5.2331, + "step": 18256 + }, + { + "epoch": 0.10857955086116662, + "grad_norm": 1.7319680452346802, + "learning_rate": 4.855973136819363e-05, + "loss": 4.6762, + "step": 18257 + }, + { + "epoch": 0.10858549814444762, + "grad_norm": 1.3632503747940063, + "learning_rate": 4.855957511107333e-05, + "loss": 4.8047, + "step": 18258 + }, + { + "epoch": 0.10859144542772861, + "grad_norm": 1.2798017263412476, + "learning_rate": 4.8559418845728636e-05, + "loss": 4.9368, + "step": 18259 + }, + { + "epoch": 0.10859739271100961, + "grad_norm": 1.539689540863037, + "learning_rate": 4.855926257215958e-05, + "loss": 4.8178, + "step": 18260 + }, + { + "epoch": 0.1086033399942906, + "grad_norm": 1.2351077795028687, + "learning_rate": 4.855910629036623e-05, + "loss": 5.0983, + "step": 18261 + }, + { + "epoch": 0.1086092872775716, + "grad_norm": 1.582154393196106, + "learning_rate": 4.855895000034865e-05, + "loss": 5.0563, + "step": 18262 + }, + { + "epoch": 0.1086152345608526, + "grad_norm": 1.3505899906158447, + "learning_rate": 4.855879370210688e-05, + "loss": 5.4024, + "step": 18263 + }, + { + "epoch": 0.1086211818441336, + "grad_norm": 1.236626148223877, + "learning_rate": 4.855863739564097e-05, + "loss": 5.4412, + "step": 18264 + }, + { + "epoch": 0.1086271291274146, + "grad_norm": 1.1207302808761597, + "learning_rate": 4.855848108095099e-05, + "loss": 5.3498, + "step": 18265 + }, + { + "epoch": 0.1086330764106956, + "grad_norm": 1.3238142728805542, + "learning_rate": 4.855832475803698e-05, + "loss": 4.9028, + "step": 18266 + }, + { + "epoch": 0.10863902369397659, + "grad_norm": 1.4837650060653687, + "learning_rate": 4.8558168426899006e-05, + "loss": 5.354, + "step": 18267 + }, + { + "epoch": 0.10864497097725759, + "grad_norm": 1.55657160282135, + "learning_rate": 4.8558012087537126e-05, + "loss": 5.4629, + "step": 18268 + }, + { + "epoch": 0.10865091826053859, + "grad_norm": 1.4918092489242554, + "learning_rate": 4.855785573995138e-05, + "loss": 5.046, + "step": 18269 + }, + { + "epoch": 0.10865686554381958, + "grad_norm": 1.5374544858932495, + "learning_rate": 4.855769938414183e-05, + "loss": 4.9571, + "step": 18270 + }, + { + "epoch": 0.10866281282710058, + "grad_norm": 1.360386610031128, + "learning_rate": 4.8557543020108537e-05, + "loss": 4.9482, + "step": 18271 + }, + { + "epoch": 0.10866876011038158, + "grad_norm": 1.2835793495178223, + "learning_rate": 4.855738664785154e-05, + "loss": 4.8301, + "step": 18272 + }, + { + "epoch": 0.10867470739366257, + "grad_norm": 1.453478217124939, + "learning_rate": 4.8557230267370915e-05, + "loss": 4.7873, + "step": 18273 + }, + { + "epoch": 0.10868065467694357, + "grad_norm": 1.4986752271652222, + "learning_rate": 4.855707387866669e-05, + "loss": 5.4533, + "step": 18274 + }, + { + "epoch": 0.10868660196022457, + "grad_norm": 1.574263572692871, + "learning_rate": 4.855691748173894e-05, + "loss": 5.0576, + "step": 18275 + }, + { + "epoch": 0.10869254924350556, + "grad_norm": 1.6014435291290283, + "learning_rate": 4.855676107658772e-05, + "loss": 4.8039, + "step": 18276 + }, + { + "epoch": 0.10869849652678656, + "grad_norm": 1.3822481632232666, + "learning_rate": 4.855660466321307e-05, + "loss": 4.9241, + "step": 18277 + }, + { + "epoch": 0.10870444381006757, + "grad_norm": 1.3199692964553833, + "learning_rate": 4.855644824161506e-05, + "loss": 4.842, + "step": 18278 + }, + { + "epoch": 0.10871039109334855, + "grad_norm": 1.340505599975586, + "learning_rate": 4.855629181179373e-05, + "loss": 4.8217, + "step": 18279 + }, + { + "epoch": 0.10871633837662956, + "grad_norm": 1.32645845413208, + "learning_rate": 4.8556135373749144e-05, + "loss": 4.9701, + "step": 18280 + }, + { + "epoch": 0.10872228565991056, + "grad_norm": 1.3629400730133057, + "learning_rate": 4.855597892748135e-05, + "loss": 5.2129, + "step": 18281 + }, + { + "epoch": 0.10872823294319155, + "grad_norm": 1.504604458808899, + "learning_rate": 4.8555822472990415e-05, + "loss": 4.988, + "step": 18282 + }, + { + "epoch": 0.10873418022647255, + "grad_norm": 1.514352560043335, + "learning_rate": 4.855566601027638e-05, + "loss": 4.8909, + "step": 18283 + }, + { + "epoch": 0.10874012750975355, + "grad_norm": 1.35514235496521, + "learning_rate": 4.85555095393393e-05, + "loss": 4.9441, + "step": 18284 + }, + { + "epoch": 0.10874607479303454, + "grad_norm": 1.1690728664398193, + "learning_rate": 4.8555353060179256e-05, + "loss": 5.3733, + "step": 18285 + }, + { + "epoch": 0.10875202207631554, + "grad_norm": 1.3280658721923828, + "learning_rate": 4.855519657279626e-05, + "loss": 5.4406, + "step": 18286 + }, + { + "epoch": 0.10875796935959654, + "grad_norm": 1.5852582454681396, + "learning_rate": 4.85550400771904e-05, + "loss": 5.176, + "step": 18287 + }, + { + "epoch": 0.10876391664287753, + "grad_norm": 1.233869194984436, + "learning_rate": 4.855488357336172e-05, + "loss": 5.2879, + "step": 18288 + }, + { + "epoch": 0.10876986392615853, + "grad_norm": 1.365251064300537, + "learning_rate": 4.855472706131027e-05, + "loss": 5.1592, + "step": 18289 + }, + { + "epoch": 0.10877581120943952, + "grad_norm": 1.6119641065597534, + "learning_rate": 4.8554570541036104e-05, + "loss": 5.0079, + "step": 18290 + }, + { + "epoch": 0.10878175849272052, + "grad_norm": 1.3233095407485962, + "learning_rate": 4.855441401253928e-05, + "loss": 5.3579, + "step": 18291 + }, + { + "epoch": 0.10878770577600153, + "grad_norm": 1.3345812559127808, + "learning_rate": 4.855425747581986e-05, + "loss": 5.1435, + "step": 18292 + }, + { + "epoch": 0.10879365305928251, + "grad_norm": 1.6694916486740112, + "learning_rate": 4.855410093087789e-05, + "loss": 5.0007, + "step": 18293 + }, + { + "epoch": 0.10879960034256352, + "grad_norm": 1.5835634469985962, + "learning_rate": 4.855394437771342e-05, + "loss": 4.9706, + "step": 18294 + }, + { + "epoch": 0.10880554762584452, + "grad_norm": 1.5465360879898071, + "learning_rate": 4.8553787816326526e-05, + "loss": 4.8983, + "step": 18295 + }, + { + "epoch": 0.1088114949091255, + "grad_norm": 1.4393326044082642, + "learning_rate": 4.855363124671723e-05, + "loss": 4.9365, + "step": 18296 + }, + { + "epoch": 0.10881744219240651, + "grad_norm": 1.5096935033798218, + "learning_rate": 4.8553474668885626e-05, + "loss": 4.8343, + "step": 18297 + }, + { + "epoch": 0.10882338947568751, + "grad_norm": 1.422397255897522, + "learning_rate": 4.8553318082831735e-05, + "loss": 4.9229, + "step": 18298 + }, + { + "epoch": 0.1088293367589685, + "grad_norm": 1.6444910764694214, + "learning_rate": 4.855316148855562e-05, + "loss": 5.0403, + "step": 18299 + }, + { + "epoch": 0.1088352840422495, + "grad_norm": 1.3621931076049805, + "learning_rate": 4.855300488605734e-05, + "loss": 4.9027, + "step": 18300 + }, + { + "epoch": 0.1088412313255305, + "grad_norm": 1.5086915493011475, + "learning_rate": 4.855284827533696e-05, + "loss": 4.95, + "step": 18301 + }, + { + "epoch": 0.10884717860881149, + "grad_norm": 1.7021756172180176, + "learning_rate": 4.855269165639451e-05, + "loss": 4.8245, + "step": 18302 + }, + { + "epoch": 0.10885312589209249, + "grad_norm": 1.6745699644088745, + "learning_rate": 4.855253502923007e-05, + "loss": 4.7832, + "step": 18303 + }, + { + "epoch": 0.1088590731753735, + "grad_norm": 1.2379045486450195, + "learning_rate": 4.8552378393843676e-05, + "loss": 5.0438, + "step": 18304 + }, + { + "epoch": 0.10886502045865448, + "grad_norm": 1.3999474048614502, + "learning_rate": 4.85522217502354e-05, + "loss": 5.0123, + "step": 18305 + }, + { + "epoch": 0.10887096774193548, + "grad_norm": 1.3539077043533325, + "learning_rate": 4.8552065098405276e-05, + "loss": 5.0722, + "step": 18306 + }, + { + "epoch": 0.10887691502521649, + "grad_norm": 1.3992128372192383, + "learning_rate": 4.8551908438353374e-05, + "loss": 4.9449, + "step": 18307 + }, + { + "epoch": 0.10888286230849747, + "grad_norm": 1.617443323135376, + "learning_rate": 4.8551751770079744e-05, + "loss": 5.1081, + "step": 18308 + }, + { + "epoch": 0.10888880959177848, + "grad_norm": 1.6027116775512695, + "learning_rate": 4.8551595093584446e-05, + "loss": 5.06, + "step": 18309 + }, + { + "epoch": 0.10889475687505948, + "grad_norm": 1.1488780975341797, + "learning_rate": 4.855143840886752e-05, + "loss": 5.1771, + "step": 18310 + }, + { + "epoch": 0.10890070415834047, + "grad_norm": 1.5683537721633911, + "learning_rate": 4.855128171592903e-05, + "loss": 5.1402, + "step": 18311 + }, + { + "epoch": 0.10890665144162147, + "grad_norm": 1.2840538024902344, + "learning_rate": 4.855112501476904e-05, + "loss": 5.2887, + "step": 18312 + }, + { + "epoch": 0.10891259872490247, + "grad_norm": 1.2311303615570068, + "learning_rate": 4.855096830538759e-05, + "loss": 5.2057, + "step": 18313 + }, + { + "epoch": 0.10891854600818346, + "grad_norm": 1.3655261993408203, + "learning_rate": 4.855081158778474e-05, + "loss": 5.3298, + "step": 18314 + }, + { + "epoch": 0.10892449329146446, + "grad_norm": 1.3405102491378784, + "learning_rate": 4.855065486196055e-05, + "loss": 5.3249, + "step": 18315 + }, + { + "epoch": 0.10893044057474546, + "grad_norm": 1.3816508054733276, + "learning_rate": 4.855049812791506e-05, + "loss": 5.2829, + "step": 18316 + }, + { + "epoch": 0.10893638785802645, + "grad_norm": 1.1929587125778198, + "learning_rate": 4.855034138564835e-05, + "loss": 5.5317, + "step": 18317 + }, + { + "epoch": 0.10894233514130745, + "grad_norm": 1.2426830530166626, + "learning_rate": 4.855018463516045e-05, + "loss": 5.263, + "step": 18318 + }, + { + "epoch": 0.10894828242458844, + "grad_norm": 1.3385604619979858, + "learning_rate": 4.855002787645141e-05, + "loss": 5.2531, + "step": 18319 + }, + { + "epoch": 0.10895422970786944, + "grad_norm": 1.2306677103042603, + "learning_rate": 4.8549871109521314e-05, + "loss": 5.245, + "step": 18320 + }, + { + "epoch": 0.10896017699115045, + "grad_norm": 1.3108047246932983, + "learning_rate": 4.85497143343702e-05, + "loss": 5.3063, + "step": 18321 + }, + { + "epoch": 0.10896612427443143, + "grad_norm": 1.3951044082641602, + "learning_rate": 4.8549557550998126e-05, + "loss": 5.4842, + "step": 18322 + }, + { + "epoch": 0.10897207155771244, + "grad_norm": 1.4618322849273682, + "learning_rate": 4.854940075940514e-05, + "loss": 5.5703, + "step": 18323 + }, + { + "epoch": 0.10897801884099344, + "grad_norm": 1.3512097597122192, + "learning_rate": 4.8549243959591304e-05, + "loss": 5.2615, + "step": 18324 + }, + { + "epoch": 0.10898396612427443, + "grad_norm": 1.261428713798523, + "learning_rate": 4.8549087151556675e-05, + "loss": 5.2617, + "step": 18325 + }, + { + "epoch": 0.10898991340755543, + "grad_norm": 1.5647974014282227, + "learning_rate": 4.854893033530129e-05, + "loss": 5.0529, + "step": 18326 + }, + { + "epoch": 0.10899586069083643, + "grad_norm": 1.3635188341140747, + "learning_rate": 4.8548773510825226e-05, + "loss": 5.1029, + "step": 18327 + }, + { + "epoch": 0.10900180797411742, + "grad_norm": 1.2746639251708984, + "learning_rate": 4.854861667812852e-05, + "loss": 5.1788, + "step": 18328 + }, + { + "epoch": 0.10900775525739842, + "grad_norm": 1.3292982578277588, + "learning_rate": 4.854845983721125e-05, + "loss": 5.2442, + "step": 18329 + }, + { + "epoch": 0.10901370254067942, + "grad_norm": 1.3015047311782837, + "learning_rate": 4.854830298807345e-05, + "loss": 5.2234, + "step": 18330 + }, + { + "epoch": 0.10901964982396041, + "grad_norm": 1.2642244100570679, + "learning_rate": 4.854814613071518e-05, + "loss": 5.1501, + "step": 18331 + }, + { + "epoch": 0.10902559710724141, + "grad_norm": 1.191630482673645, + "learning_rate": 4.8547989265136484e-05, + "loss": 5.1618, + "step": 18332 + }, + { + "epoch": 0.10903154439052241, + "grad_norm": 1.4171391725540161, + "learning_rate": 4.8547832391337445e-05, + "loss": 5.1431, + "step": 18333 + }, + { + "epoch": 0.1090374916738034, + "grad_norm": 1.3901907205581665, + "learning_rate": 4.854767550931809e-05, + "loss": 5.1464, + "step": 18334 + }, + { + "epoch": 0.1090434389570844, + "grad_norm": 1.5166548490524292, + "learning_rate": 4.854751861907849e-05, + "loss": 5.0841, + "step": 18335 + }, + { + "epoch": 0.1090493862403654, + "grad_norm": 1.3555935621261597, + "learning_rate": 4.854736172061869e-05, + "loss": 5.2947, + "step": 18336 + }, + { + "epoch": 0.1090553335236464, + "grad_norm": 1.1348215341567993, + "learning_rate": 4.854720481393875e-05, + "loss": 5.2813, + "step": 18337 + }, + { + "epoch": 0.1090612808069274, + "grad_norm": 1.3353219032287598, + "learning_rate": 4.8547047899038734e-05, + "loss": 5.2473, + "step": 18338 + }, + { + "epoch": 0.1090672280902084, + "grad_norm": 1.550512671470642, + "learning_rate": 4.854689097591868e-05, + "loss": 5.1364, + "step": 18339 + }, + { + "epoch": 0.10907317537348939, + "grad_norm": 1.5353589057922363, + "learning_rate": 4.8546734044578646e-05, + "loss": 5.0105, + "step": 18340 + }, + { + "epoch": 0.10907912265677039, + "grad_norm": 1.4025498628616333, + "learning_rate": 4.85465771050187e-05, + "loss": 5.0779, + "step": 18341 + }, + { + "epoch": 0.10908506994005139, + "grad_norm": 1.220438838005066, + "learning_rate": 4.8546420157238874e-05, + "loss": 5.0732, + "step": 18342 + }, + { + "epoch": 0.10909101722333238, + "grad_norm": 1.4058369398117065, + "learning_rate": 4.8546263201239245e-05, + "loss": 5.0838, + "step": 18343 + }, + { + "epoch": 0.10909696450661338, + "grad_norm": 1.4438905715942383, + "learning_rate": 4.854610623701986e-05, + "loss": 5.0449, + "step": 18344 + }, + { + "epoch": 0.10910291178989438, + "grad_norm": 1.536890983581543, + "learning_rate": 4.854594926458076e-05, + "loss": 4.9601, + "step": 18345 + }, + { + "epoch": 0.10910885907317537, + "grad_norm": 1.3566638231277466, + "learning_rate": 4.8545792283922025e-05, + "loss": 4.9283, + "step": 18346 + }, + { + "epoch": 0.10911480635645637, + "grad_norm": 1.3086943626403809, + "learning_rate": 4.8545635295043694e-05, + "loss": 5.0638, + "step": 18347 + }, + { + "epoch": 0.10912075363973736, + "grad_norm": 1.330124020576477, + "learning_rate": 4.854547829794582e-05, + "loss": 5.0944, + "step": 18348 + }, + { + "epoch": 0.10912670092301836, + "grad_norm": 1.4076783657073975, + "learning_rate": 4.854532129262848e-05, + "loss": 4.9725, + "step": 18349 + }, + { + "epoch": 0.10913264820629937, + "grad_norm": 1.380814552307129, + "learning_rate": 4.854516427909169e-05, + "loss": 5.0551, + "step": 18350 + }, + { + "epoch": 0.10913859548958035, + "grad_norm": 1.4243587255477905, + "learning_rate": 4.854500725733554e-05, + "loss": 5.103, + "step": 18351 + }, + { + "epoch": 0.10914454277286136, + "grad_norm": 1.438328742980957, + "learning_rate": 4.854485022736006e-05, + "loss": 5.1153, + "step": 18352 + }, + { + "epoch": 0.10915049005614236, + "grad_norm": 1.4602978229522705, + "learning_rate": 4.8544693189165324e-05, + "loss": 4.8916, + "step": 18353 + }, + { + "epoch": 0.10915643733942335, + "grad_norm": 1.548378586769104, + "learning_rate": 4.8544536142751385e-05, + "loss": 5.0205, + "step": 18354 + }, + { + "epoch": 0.10916238462270435, + "grad_norm": 1.33285653591156, + "learning_rate": 4.854437908811828e-05, + "loss": 4.9558, + "step": 18355 + }, + { + "epoch": 0.10916833190598535, + "grad_norm": 1.442918300628662, + "learning_rate": 4.854422202526609e-05, + "loss": 4.9119, + "step": 18356 + }, + { + "epoch": 0.10917427918926634, + "grad_norm": 1.498830795288086, + "learning_rate": 4.8544064954194836e-05, + "loss": 4.9787, + "step": 18357 + }, + { + "epoch": 0.10918022647254734, + "grad_norm": 1.422012209892273, + "learning_rate": 4.85439078749046e-05, + "loss": 5.0013, + "step": 18358 + }, + { + "epoch": 0.10918617375582834, + "grad_norm": 1.4635952711105347, + "learning_rate": 4.854375078739543e-05, + "loss": 4.8389, + "step": 18359 + }, + { + "epoch": 0.10919212103910933, + "grad_norm": 1.3973792791366577, + "learning_rate": 4.854359369166738e-05, + "loss": 4.9503, + "step": 18360 + }, + { + "epoch": 0.10919806832239033, + "grad_norm": 1.4016454219818115, + "learning_rate": 4.8543436587720504e-05, + "loss": 4.8533, + "step": 18361 + }, + { + "epoch": 0.10920401560567133, + "grad_norm": 1.215690016746521, + "learning_rate": 4.854327947555486e-05, + "loss": 5.0961, + "step": 18362 + }, + { + "epoch": 0.10920996288895232, + "grad_norm": 1.1589696407318115, + "learning_rate": 4.85431223551705e-05, + "loss": 4.8991, + "step": 18363 + }, + { + "epoch": 0.10921591017223332, + "grad_norm": 1.2894245386123657, + "learning_rate": 4.854296522656748e-05, + "loss": 5.0622, + "step": 18364 + }, + { + "epoch": 0.10922185745551433, + "grad_norm": 1.3525546789169312, + "learning_rate": 4.854280808974585e-05, + "loss": 5.1679, + "step": 18365 + }, + { + "epoch": 0.10922780473879531, + "grad_norm": 1.2055712938308716, + "learning_rate": 4.854265094470567e-05, + "loss": 5.2706, + "step": 18366 + }, + { + "epoch": 0.10923375202207632, + "grad_norm": 1.3646256923675537, + "learning_rate": 4.8542493791447e-05, + "loss": 5.2381, + "step": 18367 + }, + { + "epoch": 0.10923969930535732, + "grad_norm": 1.535840630531311, + "learning_rate": 4.8542336629969875e-05, + "loss": 5.0133, + "step": 18368 + }, + { + "epoch": 0.1092456465886383, + "grad_norm": 1.3226375579833984, + "learning_rate": 4.854217946027437e-05, + "loss": 4.9518, + "step": 18369 + }, + { + "epoch": 0.10925159387191931, + "grad_norm": 1.4403883218765259, + "learning_rate": 4.854202228236054e-05, + "loss": 5.1958, + "step": 18370 + }, + { + "epoch": 0.10925754115520031, + "grad_norm": 1.3661396503448486, + "learning_rate": 4.8541865096228426e-05, + "loss": 5.297, + "step": 18371 + }, + { + "epoch": 0.1092634884384813, + "grad_norm": 1.1291767358779907, + "learning_rate": 4.8541707901878096e-05, + "loss": 5.0954, + "step": 18372 + }, + { + "epoch": 0.1092694357217623, + "grad_norm": 1.414288878440857, + "learning_rate": 4.854155069930959e-05, + "loss": 5.0499, + "step": 18373 + }, + { + "epoch": 0.1092753830050433, + "grad_norm": 1.405760407447815, + "learning_rate": 4.8541393488522976e-05, + "loss": 5.004, + "step": 18374 + }, + { + "epoch": 0.10928133028832429, + "grad_norm": 1.2152272462844849, + "learning_rate": 4.854123626951831e-05, + "loss": 4.9798, + "step": 18375 + }, + { + "epoch": 0.10928727757160529, + "grad_norm": 1.3401811122894287, + "learning_rate": 4.854107904229564e-05, + "loss": 5.1179, + "step": 18376 + }, + { + "epoch": 0.10929322485488628, + "grad_norm": 1.036811113357544, + "learning_rate": 4.854092180685502e-05, + "loss": 5.129, + "step": 18377 + }, + { + "epoch": 0.10929917213816728, + "grad_norm": 1.380259394645691, + "learning_rate": 4.8540764563196506e-05, + "loss": 5.163, + "step": 18378 + }, + { + "epoch": 0.10930511942144829, + "grad_norm": 1.3078418970108032, + "learning_rate": 4.8540607311320156e-05, + "loss": 4.9882, + "step": 18379 + }, + { + "epoch": 0.10931106670472927, + "grad_norm": 1.2273530960083008, + "learning_rate": 4.854045005122603e-05, + "loss": 5.0736, + "step": 18380 + }, + { + "epoch": 0.10931701398801028, + "grad_norm": 1.1997276544570923, + "learning_rate": 4.8540292782914164e-05, + "loss": 4.9193, + "step": 18381 + }, + { + "epoch": 0.10932296127129128, + "grad_norm": 1.2119728326797485, + "learning_rate": 4.854013550638463e-05, + "loss": 4.9752, + "step": 18382 + }, + { + "epoch": 0.10932890855457227, + "grad_norm": 1.1508461236953735, + "learning_rate": 4.853997822163748e-05, + "loss": 4.8432, + "step": 18383 + }, + { + "epoch": 0.10933485583785327, + "grad_norm": 1.2142893075942993, + "learning_rate": 4.853982092867276e-05, + "loss": 5.0771, + "step": 18384 + }, + { + "epoch": 0.10934080312113427, + "grad_norm": 1.1016231775283813, + "learning_rate": 4.8539663627490536e-05, + "loss": 5.0918, + "step": 18385 + }, + { + "epoch": 0.10934675040441526, + "grad_norm": 1.2202482223510742, + "learning_rate": 4.8539506318090865e-05, + "loss": 5.1181, + "step": 18386 + }, + { + "epoch": 0.10935269768769626, + "grad_norm": 1.3560340404510498, + "learning_rate": 4.853934900047379e-05, + "loss": 5.1007, + "step": 18387 + }, + { + "epoch": 0.10935864497097726, + "grad_norm": 1.350473165512085, + "learning_rate": 4.8539191674639374e-05, + "loss": 5.1084, + "step": 18388 + }, + { + "epoch": 0.10936459225425825, + "grad_norm": 1.5102394819259644, + "learning_rate": 4.853903434058766e-05, + "loss": 5.0825, + "step": 18389 + }, + { + "epoch": 0.10937053953753925, + "grad_norm": 1.3704886436462402, + "learning_rate": 4.853887699831872e-05, + "loss": 5.1083, + "step": 18390 + }, + { + "epoch": 0.10937648682082025, + "grad_norm": 1.315167784690857, + "learning_rate": 4.8538719647832606e-05, + "loss": 4.9786, + "step": 18391 + }, + { + "epoch": 0.10938243410410124, + "grad_norm": 1.5208832025527954, + "learning_rate": 4.8538562289129356e-05, + "loss": 4.9011, + "step": 18392 + }, + { + "epoch": 0.10938838138738224, + "grad_norm": 1.3259782791137695, + "learning_rate": 4.8538404922209046e-05, + "loss": 4.9368, + "step": 18393 + }, + { + "epoch": 0.10939432867066325, + "grad_norm": 1.3342556953430176, + "learning_rate": 4.853824754707172e-05, + "loss": 4.9858, + "step": 18394 + }, + { + "epoch": 0.10940027595394423, + "grad_norm": 1.2291737794876099, + "learning_rate": 4.853809016371743e-05, + "loss": 5.0289, + "step": 18395 + }, + { + "epoch": 0.10940622323722524, + "grad_norm": 1.1539384126663208, + "learning_rate": 4.8537932772146245e-05, + "loss": 4.9444, + "step": 18396 + }, + { + "epoch": 0.10941217052050624, + "grad_norm": 1.2171412706375122, + "learning_rate": 4.8537775372358204e-05, + "loss": 4.9818, + "step": 18397 + }, + { + "epoch": 0.10941811780378723, + "grad_norm": 1.2133311033248901, + "learning_rate": 4.8537617964353374e-05, + "loss": 5.2647, + "step": 18398 + }, + { + "epoch": 0.10942406508706823, + "grad_norm": 1.2499877214431763, + "learning_rate": 4.8537460548131796e-05, + "loss": 5.4893, + "step": 18399 + }, + { + "epoch": 0.10943001237034923, + "grad_norm": 1.2127736806869507, + "learning_rate": 4.8537303123693545e-05, + "loss": 5.3607, + "step": 18400 + }, + { + "epoch": 0.10943595965363022, + "grad_norm": 1.3051133155822754, + "learning_rate": 4.853714569103865e-05, + "loss": 5.4531, + "step": 18401 + }, + { + "epoch": 0.10944190693691122, + "grad_norm": 1.3183389902114868, + "learning_rate": 4.85369882501672e-05, + "loss": 5.1784, + "step": 18402 + }, + { + "epoch": 0.10944785422019222, + "grad_norm": 1.5276503562927246, + "learning_rate": 4.853683080107922e-05, + "loss": 4.9092, + "step": 18403 + }, + { + "epoch": 0.10945380150347321, + "grad_norm": 1.519415259361267, + "learning_rate": 4.853667334377478e-05, + "loss": 4.7973, + "step": 18404 + }, + { + "epoch": 0.10945974878675421, + "grad_norm": 1.4063026905059814, + "learning_rate": 4.853651587825392e-05, + "loss": 4.7771, + "step": 18405 + }, + { + "epoch": 0.1094656960700352, + "grad_norm": 1.2753932476043701, + "learning_rate": 4.8536358404516715e-05, + "loss": 4.7902, + "step": 18406 + }, + { + "epoch": 0.1094716433533162, + "grad_norm": 1.5203404426574707, + "learning_rate": 4.8536200922563205e-05, + "loss": 4.961, + "step": 18407 + }, + { + "epoch": 0.1094775906365972, + "grad_norm": 1.4700336456298828, + "learning_rate": 4.8536043432393455e-05, + "loss": 5.0276, + "step": 18408 + }, + { + "epoch": 0.1094835379198782, + "grad_norm": 1.3945552110671997, + "learning_rate": 4.8535885934007506e-05, + "loss": 4.9641, + "step": 18409 + }, + { + "epoch": 0.1094894852031592, + "grad_norm": 1.1885923147201538, + "learning_rate": 4.853572842740544e-05, + "loss": 4.9162, + "step": 18410 + }, + { + "epoch": 0.1094954324864402, + "grad_norm": 1.414090871810913, + "learning_rate": 4.853557091258728e-05, + "loss": 4.9317, + "step": 18411 + }, + { + "epoch": 0.10950137976972119, + "grad_norm": 1.4395371675491333, + "learning_rate": 4.85354133895531e-05, + "loss": 4.7658, + "step": 18412 + }, + { + "epoch": 0.10950732705300219, + "grad_norm": 1.351665735244751, + "learning_rate": 4.8535255858302944e-05, + "loss": 4.9385, + "step": 18413 + }, + { + "epoch": 0.10951327433628319, + "grad_norm": 1.5085922479629517, + "learning_rate": 4.853509831883688e-05, + "loss": 5.0192, + "step": 18414 + }, + { + "epoch": 0.10951922161956418, + "grad_norm": 1.3413939476013184, + "learning_rate": 4.8534940771154954e-05, + "loss": 4.9193, + "step": 18415 + }, + { + "epoch": 0.10952516890284518, + "grad_norm": 1.532934546470642, + "learning_rate": 4.853478321525723e-05, + "loss": 4.9137, + "step": 18416 + }, + { + "epoch": 0.10953111618612618, + "grad_norm": 1.388016700744629, + "learning_rate": 4.8534625651143754e-05, + "loss": 4.9381, + "step": 18417 + }, + { + "epoch": 0.10953706346940717, + "grad_norm": 1.551255702972412, + "learning_rate": 4.853446807881458e-05, + "loss": 5.0973, + "step": 18418 + }, + { + "epoch": 0.10954301075268817, + "grad_norm": 1.4487138986587524, + "learning_rate": 4.853431049826976e-05, + "loss": 5.1313, + "step": 18419 + }, + { + "epoch": 0.10954895803596917, + "grad_norm": 1.467703104019165, + "learning_rate": 4.853415290950936e-05, + "loss": 5.0381, + "step": 18420 + }, + { + "epoch": 0.10955490531925016, + "grad_norm": 1.4529845714569092, + "learning_rate": 4.853399531253343e-05, + "loss": 4.9945, + "step": 18421 + }, + { + "epoch": 0.10956085260253116, + "grad_norm": 1.230872631072998, + "learning_rate": 4.8533837707342036e-05, + "loss": 5.0579, + "step": 18422 + }, + { + "epoch": 0.10956679988581217, + "grad_norm": 1.3668066263198853, + "learning_rate": 4.8533680093935206e-05, + "loss": 5.2567, + "step": 18423 + }, + { + "epoch": 0.10957274716909315, + "grad_norm": 1.3560447692871094, + "learning_rate": 4.853352247231302e-05, + "loss": 5.0152, + "step": 18424 + }, + { + "epoch": 0.10957869445237416, + "grad_norm": 1.4296886920928955, + "learning_rate": 4.8533364842475524e-05, + "loss": 5.1132, + "step": 18425 + }, + { + "epoch": 0.10958464173565516, + "grad_norm": 1.4232845306396484, + "learning_rate": 4.853320720442277e-05, + "loss": 5.0427, + "step": 18426 + }, + { + "epoch": 0.10959058901893615, + "grad_norm": 1.4019423723220825, + "learning_rate": 4.8533049558154826e-05, + "loss": 5.2369, + "step": 18427 + }, + { + "epoch": 0.10959653630221715, + "grad_norm": 1.5423427820205688, + "learning_rate": 4.853289190367173e-05, + "loss": 5.1053, + "step": 18428 + }, + { + "epoch": 0.10960248358549815, + "grad_norm": 1.5049951076507568, + "learning_rate": 4.8532734240973545e-05, + "loss": 5.3784, + "step": 18429 + }, + { + "epoch": 0.10960843086877914, + "grad_norm": 1.678328037261963, + "learning_rate": 4.853257657006033e-05, + "loss": 5.3021, + "step": 18430 + }, + { + "epoch": 0.10961437815206014, + "grad_norm": 1.5986173152923584, + "learning_rate": 4.853241889093213e-05, + "loss": 5.1686, + "step": 18431 + }, + { + "epoch": 0.10962032543534114, + "grad_norm": 1.5304551124572754, + "learning_rate": 4.853226120358901e-05, + "loss": 5.2319, + "step": 18432 + }, + { + "epoch": 0.10962627271862213, + "grad_norm": 1.609595775604248, + "learning_rate": 4.853210350803102e-05, + "loss": 5.0256, + "step": 18433 + }, + { + "epoch": 0.10963222000190313, + "grad_norm": 1.3506170511245728, + "learning_rate": 4.853194580425821e-05, + "loss": 5.0792, + "step": 18434 + }, + { + "epoch": 0.10963816728518412, + "grad_norm": 1.2946768999099731, + "learning_rate": 4.853178809227065e-05, + "loss": 5.0155, + "step": 18435 + }, + { + "epoch": 0.10964411456846512, + "grad_norm": 1.5691487789154053, + "learning_rate": 4.853163037206838e-05, + "loss": 5.1302, + "step": 18436 + }, + { + "epoch": 0.10965006185174613, + "grad_norm": 1.6740599870681763, + "learning_rate": 4.853147264365146e-05, + "loss": 5.2371, + "step": 18437 + }, + { + "epoch": 0.10965600913502711, + "grad_norm": 1.4822674989700317, + "learning_rate": 4.853131490701995e-05, + "loss": 5.0194, + "step": 18438 + }, + { + "epoch": 0.10966195641830812, + "grad_norm": 1.385177493095398, + "learning_rate": 4.853115716217389e-05, + "loss": 4.9444, + "step": 18439 + }, + { + "epoch": 0.10966790370158912, + "grad_norm": 1.3696002960205078, + "learning_rate": 4.853099940911337e-05, + "loss": 5.0557, + "step": 18440 + }, + { + "epoch": 0.1096738509848701, + "grad_norm": 1.6609543561935425, + "learning_rate": 4.8530841647838396e-05, + "loss": 4.9032, + "step": 18441 + }, + { + "epoch": 0.10967979826815111, + "grad_norm": 1.5938438177108765, + "learning_rate": 4.8530683878349056e-05, + "loss": 4.8639, + "step": 18442 + }, + { + "epoch": 0.10968574555143211, + "grad_norm": 1.4565002918243408, + "learning_rate": 4.85305261006454e-05, + "loss": 5.0483, + "step": 18443 + }, + { + "epoch": 0.1096916928347131, + "grad_norm": 1.5930250883102417, + "learning_rate": 4.853036831472749e-05, + "loss": 5.0751, + "step": 18444 + }, + { + "epoch": 0.1096976401179941, + "grad_norm": 1.5648735761642456, + "learning_rate": 4.853021052059536e-05, + "loss": 5.0991, + "step": 18445 + }, + { + "epoch": 0.1097035874012751, + "grad_norm": 1.4230155944824219, + "learning_rate": 4.8530052718249076e-05, + "loss": 5.098, + "step": 18446 + }, + { + "epoch": 0.10970953468455609, + "grad_norm": 1.4366841316223145, + "learning_rate": 4.85298949076887e-05, + "loss": 5.0975, + "step": 18447 + }, + { + "epoch": 0.10971548196783709, + "grad_norm": 1.437514066696167, + "learning_rate": 4.852973708891427e-05, + "loss": 5.0325, + "step": 18448 + }, + { + "epoch": 0.1097214292511181, + "grad_norm": 2.0367636680603027, + "learning_rate": 4.852957926192586e-05, + "loss": 5.2064, + "step": 18449 + }, + { + "epoch": 0.10972737653439908, + "grad_norm": 2.16357684135437, + "learning_rate": 4.852942142672352e-05, + "loss": 5.1532, + "step": 18450 + }, + { + "epoch": 0.10973332381768008, + "grad_norm": 1.6931402683258057, + "learning_rate": 4.8529263583307296e-05, + "loss": 5.2128, + "step": 18451 + }, + { + "epoch": 0.10973927110096109, + "grad_norm": 2.4651196002960205, + "learning_rate": 4.852910573167725e-05, + "loss": 4.798, + "step": 18452 + }, + { + "epoch": 0.10974521838424207, + "grad_norm": 1.7160784006118774, + "learning_rate": 4.852894787183344e-05, + "loss": 5.5087, + "step": 18453 + }, + { + "epoch": 0.10975116566752308, + "grad_norm": 1.478097915649414, + "learning_rate": 4.852879000377591e-05, + "loss": 5.6876, + "step": 18454 + }, + { + "epoch": 0.10975711295080408, + "grad_norm": 1.8612531423568726, + "learning_rate": 4.852863212750474e-05, + "loss": 5.2259, + "step": 18455 + }, + { + "epoch": 0.10976306023408507, + "grad_norm": 1.6869621276855469, + "learning_rate": 4.852847424301995e-05, + "loss": 5.5294, + "step": 18456 + }, + { + "epoch": 0.10976900751736607, + "grad_norm": 1.7378077507019043, + "learning_rate": 4.852831635032161e-05, + "loss": 5.4568, + "step": 18457 + }, + { + "epoch": 0.10977495480064707, + "grad_norm": 1.7788033485412598, + "learning_rate": 4.852815844940979e-05, + "loss": 5.2331, + "step": 18458 + }, + { + "epoch": 0.10978090208392806, + "grad_norm": 1.8730370998382568, + "learning_rate": 4.852800054028453e-05, + "loss": 4.9792, + "step": 18459 + }, + { + "epoch": 0.10978684936720906, + "grad_norm": 1.5126397609710693, + "learning_rate": 4.852784262294588e-05, + "loss": 5.3134, + "step": 18460 + }, + { + "epoch": 0.10979279665049006, + "grad_norm": 1.6687992811203003, + "learning_rate": 4.8527684697393914e-05, + "loss": 5.3296, + "step": 18461 + }, + { + "epoch": 0.10979874393377105, + "grad_norm": 1.6268471479415894, + "learning_rate": 4.852752676362867e-05, + "loss": 4.9804, + "step": 18462 + }, + { + "epoch": 0.10980469121705205, + "grad_norm": 1.7055017948150635, + "learning_rate": 4.8527368821650214e-05, + "loss": 5.0289, + "step": 18463 + }, + { + "epoch": 0.10981063850033304, + "grad_norm": 1.489247441291809, + "learning_rate": 4.852721087145859e-05, + "loss": 5.0428, + "step": 18464 + }, + { + "epoch": 0.10981658578361404, + "grad_norm": 1.7411161661148071, + "learning_rate": 4.8527052913053874e-05, + "loss": 5.1142, + "step": 18465 + }, + { + "epoch": 0.10982253306689505, + "grad_norm": 1.5776443481445312, + "learning_rate": 4.8526894946436094e-05, + "loss": 5.2881, + "step": 18466 + }, + { + "epoch": 0.10982848035017603, + "grad_norm": 1.342997431755066, + "learning_rate": 4.852673697160532e-05, + "loss": 5.0295, + "step": 18467 + }, + { + "epoch": 0.10983442763345704, + "grad_norm": 1.1686962842941284, + "learning_rate": 4.8526578988561606e-05, + "loss": 5.0607, + "step": 18468 + }, + { + "epoch": 0.10984037491673804, + "grad_norm": 1.578697681427002, + "learning_rate": 4.8526420997305006e-05, + "loss": 5.3291, + "step": 18469 + }, + { + "epoch": 0.10984632220001903, + "grad_norm": 1.5248758792877197, + "learning_rate": 4.8526262997835575e-05, + "loss": 5.1206, + "step": 18470 + }, + { + "epoch": 0.10985226948330003, + "grad_norm": 1.1425076723098755, + "learning_rate": 4.852610499015337e-05, + "loss": 5.1892, + "step": 18471 + }, + { + "epoch": 0.10985821676658103, + "grad_norm": 1.356423020362854, + "learning_rate": 4.852594697425844e-05, + "loss": 4.9477, + "step": 18472 + }, + { + "epoch": 0.10986416404986202, + "grad_norm": 1.3905398845672607, + "learning_rate": 4.852578895015085e-05, + "loss": 4.9084, + "step": 18473 + }, + { + "epoch": 0.10987011133314302, + "grad_norm": 1.3447619676589966, + "learning_rate": 4.8525630917830655e-05, + "loss": 4.9042, + "step": 18474 + }, + { + "epoch": 0.10987605861642402, + "grad_norm": 1.2110105752944946, + "learning_rate": 4.8525472877297893e-05, + "loss": 4.9669, + "step": 18475 + }, + { + "epoch": 0.10988200589970501, + "grad_norm": 1.480750560760498, + "learning_rate": 4.8525314828552646e-05, + "loss": 5.1071, + "step": 18476 + }, + { + "epoch": 0.10988795318298601, + "grad_norm": 1.2497118711471558, + "learning_rate": 4.852515677159495e-05, + "loss": 4.8868, + "step": 18477 + }, + { + "epoch": 0.10989390046626701, + "grad_norm": 1.4057846069335938, + "learning_rate": 4.8524998706424856e-05, + "loss": 5.1173, + "step": 18478 + }, + { + "epoch": 0.109899847749548, + "grad_norm": 1.3325163125991821, + "learning_rate": 4.8524840633042436e-05, + "loss": 5.1066, + "step": 18479 + }, + { + "epoch": 0.109905795032829, + "grad_norm": 1.333720326423645, + "learning_rate": 4.852468255144773e-05, + "loss": 5.1404, + "step": 18480 + }, + { + "epoch": 0.10991174231611, + "grad_norm": 1.3484537601470947, + "learning_rate": 4.852452446164081e-05, + "loss": 5.1284, + "step": 18481 + }, + { + "epoch": 0.109917689599391, + "grad_norm": 1.3348337411880493, + "learning_rate": 4.8524366363621716e-05, + "loss": 5.2056, + "step": 18482 + }, + { + "epoch": 0.109923636882672, + "grad_norm": 1.1838293075561523, + "learning_rate": 4.8524208257390504e-05, + "loss": 5.0488, + "step": 18483 + }, + { + "epoch": 0.109929584165953, + "grad_norm": 1.2820385694503784, + "learning_rate": 4.852405014294724e-05, + "loss": 5.1329, + "step": 18484 + }, + { + "epoch": 0.10993553144923399, + "grad_norm": 1.3892844915390015, + "learning_rate": 4.852389202029198e-05, + "loss": 5.0263, + "step": 18485 + }, + { + "epoch": 0.10994147873251499, + "grad_norm": 1.4780217409133911, + "learning_rate": 4.852373388942476e-05, + "loss": 5.0866, + "step": 18486 + }, + { + "epoch": 0.10994742601579599, + "grad_norm": 1.4181870222091675, + "learning_rate": 4.852357575034565e-05, + "loss": 5.1436, + "step": 18487 + }, + { + "epoch": 0.10995337329907698, + "grad_norm": 1.4174554347991943, + "learning_rate": 4.852341760305471e-05, + "loss": 5.132, + "step": 18488 + }, + { + "epoch": 0.10995932058235798, + "grad_norm": 1.2727283239364624, + "learning_rate": 4.852325944755198e-05, + "loss": 5.0171, + "step": 18489 + }, + { + "epoch": 0.10996526786563898, + "grad_norm": 1.2102142572402954, + "learning_rate": 4.852310128383753e-05, + "loss": 5.0183, + "step": 18490 + }, + { + "epoch": 0.10997121514891997, + "grad_norm": 1.254946231842041, + "learning_rate": 4.85229431119114e-05, + "loss": 5.105, + "step": 18491 + }, + { + "epoch": 0.10997716243220097, + "grad_norm": 1.4097338914871216, + "learning_rate": 4.8522784931773666e-05, + "loss": 4.953, + "step": 18492 + }, + { + "epoch": 0.10998310971548196, + "grad_norm": 1.368314504623413, + "learning_rate": 4.852262674342436e-05, + "loss": 4.9527, + "step": 18493 + }, + { + "epoch": 0.10998905699876296, + "grad_norm": 1.3907700777053833, + "learning_rate": 4.8522468546863554e-05, + "loss": 4.9416, + "step": 18494 + }, + { + "epoch": 0.10999500428204396, + "grad_norm": 1.2113755941390991, + "learning_rate": 4.852231034209129e-05, + "loss": 4.8552, + "step": 18495 + }, + { + "epoch": 0.11000095156532495, + "grad_norm": 1.3752022981643677, + "learning_rate": 4.852215212910763e-05, + "loss": 4.9314, + "step": 18496 + }, + { + "epoch": 0.11000689884860596, + "grad_norm": 1.243531584739685, + "learning_rate": 4.852199390791264e-05, + "loss": 4.925, + "step": 18497 + }, + { + "epoch": 0.11001284613188696, + "grad_norm": 1.3528475761413574, + "learning_rate": 4.852183567850636e-05, + "loss": 4.8643, + "step": 18498 + }, + { + "epoch": 0.11001879341516795, + "grad_norm": 1.4653394222259521, + "learning_rate": 4.8521677440888845e-05, + "loss": 4.8894, + "step": 18499 + }, + { + "epoch": 0.11002474069844895, + "grad_norm": 1.3524682521820068, + "learning_rate": 4.852151919506016e-05, + "loss": 4.7458, + "step": 18500 + }, + { + "epoch": 0.11003068798172995, + "grad_norm": 1.3654247522354126, + "learning_rate": 4.852136094102036e-05, + "loss": 4.7971, + "step": 18501 + }, + { + "epoch": 0.11003663526501094, + "grad_norm": 1.395735740661621, + "learning_rate": 4.85212026787695e-05, + "loss": 4.7677, + "step": 18502 + }, + { + "epoch": 0.11004258254829194, + "grad_norm": 1.4467344284057617, + "learning_rate": 4.8521044408307616e-05, + "loss": 4.726, + "step": 18503 + }, + { + "epoch": 0.11004852983157294, + "grad_norm": 1.276580572128296, + "learning_rate": 4.852088612963478e-05, + "loss": 4.8145, + "step": 18504 + }, + { + "epoch": 0.11005447711485393, + "grad_norm": 1.4406812191009521, + "learning_rate": 4.852072784275106e-05, + "loss": 4.7942, + "step": 18505 + }, + { + "epoch": 0.11006042439813493, + "grad_norm": 1.4281691312789917, + "learning_rate": 4.8520569547656483e-05, + "loss": 4.9745, + "step": 18506 + }, + { + "epoch": 0.11006637168141593, + "grad_norm": 1.3521541357040405, + "learning_rate": 4.852041124435112e-05, + "loss": 4.8335, + "step": 18507 + }, + { + "epoch": 0.11007231896469692, + "grad_norm": 1.2510555982589722, + "learning_rate": 4.852025293283503e-05, + "loss": 4.8868, + "step": 18508 + }, + { + "epoch": 0.11007826624797792, + "grad_norm": 1.3792724609375, + "learning_rate": 4.852009461310826e-05, + "loss": 4.9388, + "step": 18509 + }, + { + "epoch": 0.11008421353125893, + "grad_norm": 1.3494830131530762, + "learning_rate": 4.851993628517086e-05, + "loss": 4.8536, + "step": 18510 + }, + { + "epoch": 0.11009016081453991, + "grad_norm": 1.2981318235397339, + "learning_rate": 4.851977794902291e-05, + "loss": 4.8479, + "step": 18511 + }, + { + "epoch": 0.11009610809782092, + "grad_norm": 1.3305935859680176, + "learning_rate": 4.851961960466444e-05, + "loss": 4.9893, + "step": 18512 + }, + { + "epoch": 0.11010205538110192, + "grad_norm": 1.3141270875930786, + "learning_rate": 4.851946125209551e-05, + "loss": 4.8349, + "step": 18513 + }, + { + "epoch": 0.1101080026643829, + "grad_norm": 1.2411303520202637, + "learning_rate": 4.851930289131619e-05, + "loss": 4.8698, + "step": 18514 + }, + { + "epoch": 0.11011394994766391, + "grad_norm": 1.520176887512207, + "learning_rate": 4.851914452232651e-05, + "loss": 4.7576, + "step": 18515 + }, + { + "epoch": 0.11011989723094491, + "grad_norm": 1.3073054552078247, + "learning_rate": 4.851898614512655e-05, + "loss": 4.8974, + "step": 18516 + }, + { + "epoch": 0.1101258445142259, + "grad_norm": 1.4703196287155151, + "learning_rate": 4.8518827759716354e-05, + "loss": 5.0947, + "step": 18517 + }, + { + "epoch": 0.1101317917975069, + "grad_norm": 1.3140865564346313, + "learning_rate": 4.851866936609597e-05, + "loss": 5.4125, + "step": 18518 + }, + { + "epoch": 0.1101377390807879, + "grad_norm": 1.2075819969177246, + "learning_rate": 4.8518510964265465e-05, + "loss": 5.2993, + "step": 18519 + }, + { + "epoch": 0.11014368636406889, + "grad_norm": 1.6519954204559326, + "learning_rate": 4.85183525542249e-05, + "loss": 5.6638, + "step": 18520 + }, + { + "epoch": 0.11014963364734989, + "grad_norm": 2.118663787841797, + "learning_rate": 4.851819413597432e-05, + "loss": 5.5422, + "step": 18521 + }, + { + "epoch": 0.1101555809306309, + "grad_norm": 1.902429461479187, + "learning_rate": 4.851803570951377e-05, + "loss": 5.3244, + "step": 18522 + }, + { + "epoch": 0.11016152821391188, + "grad_norm": 2.593628406524658, + "learning_rate": 4.8517877274843315e-05, + "loss": 5.0554, + "step": 18523 + }, + { + "epoch": 0.11016747549719288, + "grad_norm": 2.6404380798339844, + "learning_rate": 4.851771883196302e-05, + "loss": 4.9789, + "step": 18524 + }, + { + "epoch": 0.11017342278047387, + "grad_norm": 2.08564829826355, + "learning_rate": 4.8517560380872934e-05, + "loss": 4.9616, + "step": 18525 + }, + { + "epoch": 0.11017937006375488, + "grad_norm": 2.306739091873169, + "learning_rate": 4.8517401921573114e-05, + "loss": 4.9368, + "step": 18526 + }, + { + "epoch": 0.11018531734703588, + "grad_norm": 3.0212862491607666, + "learning_rate": 4.85172434540636e-05, + "loss": 4.6379, + "step": 18527 + }, + { + "epoch": 0.11019126463031687, + "grad_norm": 2.554163694381714, + "learning_rate": 4.851708497834446e-05, + "loss": 4.6958, + "step": 18528 + }, + { + "epoch": 0.11019721191359787, + "grad_norm": 2.354631185531616, + "learning_rate": 4.851692649441576e-05, + "loss": 4.7904, + "step": 18529 + }, + { + "epoch": 0.11020315919687887, + "grad_norm": 1.5072609186172485, + "learning_rate": 4.851676800227754e-05, + "loss": 5.5862, + "step": 18530 + }, + { + "epoch": 0.11020910648015986, + "grad_norm": 1.5677906274795532, + "learning_rate": 4.851660950192986e-05, + "loss": 5.8712, + "step": 18531 + }, + { + "epoch": 0.11021505376344086, + "grad_norm": 1.7329411506652832, + "learning_rate": 4.851645099337276e-05, + "loss": 5.4559, + "step": 18532 + }, + { + "epoch": 0.11022100104672186, + "grad_norm": 2.187192916870117, + "learning_rate": 4.851629247660633e-05, + "loss": 5.2172, + "step": 18533 + }, + { + "epoch": 0.11022694833000285, + "grad_norm": 2.5248184204101562, + "learning_rate": 4.851613395163059e-05, + "loss": 4.7283, + "step": 18534 + }, + { + "epoch": 0.11023289561328385, + "grad_norm": 1.897926926612854, + "learning_rate": 4.8515975418445625e-05, + "loss": 5.0609, + "step": 18535 + }, + { + "epoch": 0.11023884289656485, + "grad_norm": 1.6827658414840698, + "learning_rate": 4.851581687705147e-05, + "loss": 5.2637, + "step": 18536 + }, + { + "epoch": 0.11024479017984584, + "grad_norm": 1.6638895273208618, + "learning_rate": 4.8515658327448184e-05, + "loss": 5.3758, + "step": 18537 + }, + { + "epoch": 0.11025073746312684, + "grad_norm": 1.3794528245925903, + "learning_rate": 4.8515499769635824e-05, + "loss": 5.1398, + "step": 18538 + }, + { + "epoch": 0.11025668474640785, + "grad_norm": 1.7829253673553467, + "learning_rate": 4.8515341203614454e-05, + "loss": 5.8449, + "step": 18539 + }, + { + "epoch": 0.11026263202968883, + "grad_norm": 1.9193391799926758, + "learning_rate": 4.85151826293841e-05, + "loss": 5.6113, + "step": 18540 + }, + { + "epoch": 0.11026857931296984, + "grad_norm": 1.9315286874771118, + "learning_rate": 4.851502404694486e-05, + "loss": 5.4341, + "step": 18541 + }, + { + "epoch": 0.11027452659625084, + "grad_norm": 1.8884371519088745, + "learning_rate": 4.851486545629677e-05, + "loss": 5.0711, + "step": 18542 + }, + { + "epoch": 0.11028047387953183, + "grad_norm": 2.104315996170044, + "learning_rate": 4.8514706857439866e-05, + "loss": 4.7431, + "step": 18543 + }, + { + "epoch": 0.11028642116281283, + "grad_norm": 1.9781455993652344, + "learning_rate": 4.8514548250374234e-05, + "loss": 4.9088, + "step": 18544 + }, + { + "epoch": 0.11029236844609383, + "grad_norm": 2.0802392959594727, + "learning_rate": 4.851438963509991e-05, + "loss": 4.8418, + "step": 18545 + }, + { + "epoch": 0.11029831572937482, + "grad_norm": 2.1856627464294434, + "learning_rate": 4.851423101161696e-05, + "loss": 5.5758, + "step": 18546 + }, + { + "epoch": 0.11030426301265582, + "grad_norm": 1.578050971031189, + "learning_rate": 4.851407237992543e-05, + "loss": 5.2795, + "step": 18547 + }, + { + "epoch": 0.11031021029593682, + "grad_norm": 2.241647720336914, + "learning_rate": 4.8513913740025376e-05, + "loss": 4.7807, + "step": 18548 + }, + { + "epoch": 0.11031615757921781, + "grad_norm": 2.102911949157715, + "learning_rate": 4.851375509191687e-05, + "loss": 5.1933, + "step": 18549 + }, + { + "epoch": 0.11032210486249881, + "grad_norm": 1.7198251485824585, + "learning_rate": 4.851359643559995e-05, + "loss": 5.273, + "step": 18550 + }, + { + "epoch": 0.11032805214577981, + "grad_norm": 1.6389858722686768, + "learning_rate": 4.8513437771074675e-05, + "loss": 5.7741, + "step": 18551 + }, + { + "epoch": 0.1103339994290608, + "grad_norm": 1.3120185136795044, + "learning_rate": 4.8513279098341106e-05, + "loss": 5.6433, + "step": 18552 + }, + { + "epoch": 0.1103399467123418, + "grad_norm": 2.6182525157928467, + "learning_rate": 4.8513120417399286e-05, + "loss": 5.2905, + "step": 18553 + }, + { + "epoch": 0.11034589399562279, + "grad_norm": 2.8740553855895996, + "learning_rate": 4.851296172824928e-05, + "loss": 5.0364, + "step": 18554 + }, + { + "epoch": 0.1103518412789038, + "grad_norm": 2.126779794692993, + "learning_rate": 4.851280303089115e-05, + "loss": 4.8801, + "step": 18555 + }, + { + "epoch": 0.1103577885621848, + "grad_norm": 2.2658486366271973, + "learning_rate": 4.851264432532493e-05, + "loss": 5.0411, + "step": 18556 + }, + { + "epoch": 0.11036373584546579, + "grad_norm": 2.2387850284576416, + "learning_rate": 4.8512485611550706e-05, + "loss": 5.048, + "step": 18557 + }, + { + "epoch": 0.11036968312874679, + "grad_norm": 2.5402557849884033, + "learning_rate": 4.851232688956851e-05, + "loss": 5.2581, + "step": 18558 + }, + { + "epoch": 0.11037563041202779, + "grad_norm": 1.9275699853897095, + "learning_rate": 4.8512168159378396e-05, + "loss": 5.765, + "step": 18559 + }, + { + "epoch": 0.11038157769530878, + "grad_norm": 1.6632050275802612, + "learning_rate": 4.8512009420980434e-05, + "loss": 5.9928, + "step": 18560 + }, + { + "epoch": 0.11038752497858978, + "grad_norm": 1.9383779764175415, + "learning_rate": 4.851185067437467e-05, + "loss": 5.306, + "step": 18561 + }, + { + "epoch": 0.11039347226187078, + "grad_norm": 1.6358258724212646, + "learning_rate": 4.851169191956117e-05, + "loss": 5.4039, + "step": 18562 + }, + { + "epoch": 0.11039941954515177, + "grad_norm": 1.625636339187622, + "learning_rate": 4.851153315653997e-05, + "loss": 5.5028, + "step": 18563 + }, + { + "epoch": 0.11040536682843277, + "grad_norm": 1.8142133951187134, + "learning_rate": 4.8511374385311134e-05, + "loss": 5.3636, + "step": 18564 + }, + { + "epoch": 0.11041131411171377, + "grad_norm": 1.778742790222168, + "learning_rate": 4.8511215605874724e-05, + "loss": 5.9869, + "step": 18565 + }, + { + "epoch": 0.11041726139499476, + "grad_norm": 1.7027266025543213, + "learning_rate": 4.8511056818230795e-05, + "loss": 5.9855, + "step": 18566 + }, + { + "epoch": 0.11042320867827576, + "grad_norm": 1.8098080158233643, + "learning_rate": 4.85108980223794e-05, + "loss": 5.3241, + "step": 18567 + }, + { + "epoch": 0.11042915596155677, + "grad_norm": 2.058525562286377, + "learning_rate": 4.851073921832059e-05, + "loss": 5.3369, + "step": 18568 + }, + { + "epoch": 0.11043510324483775, + "grad_norm": 1.6393969058990479, + "learning_rate": 4.851058040605443e-05, + "loss": 5.234, + "step": 18569 + }, + { + "epoch": 0.11044105052811876, + "grad_norm": 1.7245092391967773, + "learning_rate": 4.8510421585580954e-05, + "loss": 5.3252, + "step": 18570 + }, + { + "epoch": 0.11044699781139976, + "grad_norm": 1.7108781337738037, + "learning_rate": 4.851026275690025e-05, + "loss": 5.342, + "step": 18571 + }, + { + "epoch": 0.11045294509468075, + "grad_norm": 1.6860250234603882, + "learning_rate": 4.8510103920012354e-05, + "loss": 5.1265, + "step": 18572 + }, + { + "epoch": 0.11045889237796175, + "grad_norm": 1.4939595460891724, + "learning_rate": 4.850994507491731e-05, + "loss": 4.995, + "step": 18573 + }, + { + "epoch": 0.11046483966124275, + "grad_norm": 1.6137492656707764, + "learning_rate": 4.85097862216152e-05, + "loss": 5.0099, + "step": 18574 + }, + { + "epoch": 0.11047078694452374, + "grad_norm": 1.8155491352081299, + "learning_rate": 4.850962736010606e-05, + "loss": 4.965, + "step": 18575 + }, + { + "epoch": 0.11047673422780474, + "grad_norm": 1.6313834190368652, + "learning_rate": 4.8509468490389955e-05, + "loss": 5.1881, + "step": 18576 + }, + { + "epoch": 0.11048268151108574, + "grad_norm": 1.9885855913162231, + "learning_rate": 4.850930961246694e-05, + "loss": 4.9172, + "step": 18577 + }, + { + "epoch": 0.11048862879436673, + "grad_norm": 1.7815529108047485, + "learning_rate": 4.850915072633706e-05, + "loss": 5.2431, + "step": 18578 + }, + { + "epoch": 0.11049457607764773, + "grad_norm": 1.496060848236084, + "learning_rate": 4.8508991832000384e-05, + "loss": 5.0222, + "step": 18579 + }, + { + "epoch": 0.11050052336092873, + "grad_norm": 1.76019287109375, + "learning_rate": 4.850883292945696e-05, + "loss": 5.1522, + "step": 18580 + }, + { + "epoch": 0.11050647064420972, + "grad_norm": 1.6975457668304443, + "learning_rate": 4.8508674018706845e-05, + "loss": 5.0687, + "step": 18581 + }, + { + "epoch": 0.11051241792749072, + "grad_norm": 2.056002378463745, + "learning_rate": 4.85085150997501e-05, + "loss": 5.0267, + "step": 18582 + }, + { + "epoch": 0.11051836521077171, + "grad_norm": 1.8109005689620972, + "learning_rate": 4.850835617258677e-05, + "loss": 5.7661, + "step": 18583 + }, + { + "epoch": 0.11052431249405271, + "grad_norm": 1.762326717376709, + "learning_rate": 4.850819723721692e-05, + "loss": 5.8038, + "step": 18584 + }, + { + "epoch": 0.11053025977733372, + "grad_norm": 1.5169013738632202, + "learning_rate": 4.85080382936406e-05, + "loss": 5.7988, + "step": 18585 + }, + { + "epoch": 0.1105362070606147, + "grad_norm": 1.7740446329116821, + "learning_rate": 4.850787934185786e-05, + "loss": 5.5388, + "step": 18586 + }, + { + "epoch": 0.11054215434389571, + "grad_norm": 1.560950756072998, + "learning_rate": 4.850772038186877e-05, + "loss": 5.406, + "step": 18587 + }, + { + "epoch": 0.11054810162717671, + "grad_norm": 1.6391148567199707, + "learning_rate": 4.850756141367338e-05, + "loss": 5.4669, + "step": 18588 + }, + { + "epoch": 0.1105540489104577, + "grad_norm": 1.5571023225784302, + "learning_rate": 4.8507402437271734e-05, + "loss": 5.6556, + "step": 18589 + }, + { + "epoch": 0.1105599961937387, + "grad_norm": 1.5374432802200317, + "learning_rate": 4.85072434526639e-05, + "loss": 5.7617, + "step": 18590 + }, + { + "epoch": 0.1105659434770197, + "grad_norm": 1.4683212041854858, + "learning_rate": 4.850708445984993e-05, + "loss": 5.5074, + "step": 18591 + }, + { + "epoch": 0.11057189076030069, + "grad_norm": 1.6689101457595825, + "learning_rate": 4.850692545882988e-05, + "loss": 5.3259, + "step": 18592 + }, + { + "epoch": 0.11057783804358169, + "grad_norm": 1.394108533859253, + "learning_rate": 4.85067664496038e-05, + "loss": 5.1686, + "step": 18593 + }, + { + "epoch": 0.1105837853268627, + "grad_norm": 1.7093585729599, + "learning_rate": 4.850660743217176e-05, + "loss": 5.6622, + "step": 18594 + }, + { + "epoch": 0.11058973261014368, + "grad_norm": 1.6189805269241333, + "learning_rate": 4.85064484065338e-05, + "loss": 5.6855, + "step": 18595 + }, + { + "epoch": 0.11059567989342468, + "grad_norm": 1.5303481817245483, + "learning_rate": 4.850628937268999e-05, + "loss": 5.8242, + "step": 18596 + }, + { + "epoch": 0.11060162717670569, + "grad_norm": 1.6557955741882324, + "learning_rate": 4.850613033064037e-05, + "loss": 5.4924, + "step": 18597 + }, + { + "epoch": 0.11060757445998667, + "grad_norm": 1.5280576944351196, + "learning_rate": 4.8505971280385e-05, + "loss": 5.6122, + "step": 18598 + }, + { + "epoch": 0.11061352174326768, + "grad_norm": 1.3656830787658691, + "learning_rate": 4.8505812221923945e-05, + "loss": 5.5282, + "step": 18599 + }, + { + "epoch": 0.11061946902654868, + "grad_norm": 1.3605096340179443, + "learning_rate": 4.850565315525725e-05, + "loss": 5.0747, + "step": 18600 + }, + { + "epoch": 0.11062541630982967, + "grad_norm": 2.120056390762329, + "learning_rate": 4.850549408038498e-05, + "loss": 5.1559, + "step": 18601 + }, + { + "epoch": 0.11063136359311067, + "grad_norm": 2.14626145362854, + "learning_rate": 4.850533499730718e-05, + "loss": 4.9778, + "step": 18602 + }, + { + "epoch": 0.11063731087639167, + "grad_norm": 2.1857240200042725, + "learning_rate": 4.8505175906023916e-05, + "loss": 4.8555, + "step": 18603 + }, + { + "epoch": 0.11064325815967266, + "grad_norm": 1.6636399030685425, + "learning_rate": 4.850501680653523e-05, + "loss": 5.3488, + "step": 18604 + }, + { + "epoch": 0.11064920544295366, + "grad_norm": 1.669511079788208, + "learning_rate": 4.8504857698841185e-05, + "loss": 5.2697, + "step": 18605 + }, + { + "epoch": 0.11065515272623466, + "grad_norm": 2.1935081481933594, + "learning_rate": 4.850469858294184e-05, + "loss": 4.4319, + "step": 18606 + }, + { + "epoch": 0.11066110000951565, + "grad_norm": 2.2359724044799805, + "learning_rate": 4.850453945883725e-05, + "loss": 4.2343, + "step": 18607 + }, + { + "epoch": 0.11066704729279665, + "grad_norm": 2.278247594833374, + "learning_rate": 4.850438032652747e-05, + "loss": 4.4955, + "step": 18608 + }, + { + "epoch": 0.11067299457607765, + "grad_norm": 2.3036160469055176, + "learning_rate": 4.850422118601254e-05, + "loss": 4.9122, + "step": 18609 + }, + { + "epoch": 0.11067894185935864, + "grad_norm": 2.3913469314575195, + "learning_rate": 4.850406203729254e-05, + "loss": 4.4703, + "step": 18610 + }, + { + "epoch": 0.11068488914263964, + "grad_norm": 1.9795238971710205, + "learning_rate": 4.8503902880367516e-05, + "loss": 4.7099, + "step": 18611 + }, + { + "epoch": 0.11069083642592063, + "grad_norm": 2.3990728855133057, + "learning_rate": 4.850374371523752e-05, + "loss": 4.3833, + "step": 18612 + }, + { + "epoch": 0.11069678370920163, + "grad_norm": 2.429461717605591, + "learning_rate": 4.850358454190261e-05, + "loss": 4.4279, + "step": 18613 + }, + { + "epoch": 0.11070273099248264, + "grad_norm": 2.598304271697998, + "learning_rate": 4.8503425360362845e-05, + "loss": 4.4376, + "step": 18614 + }, + { + "epoch": 0.11070867827576363, + "grad_norm": 2.3201403617858887, + "learning_rate": 4.850326617061827e-05, + "loss": 4.6822, + "step": 18615 + }, + { + "epoch": 0.11071462555904463, + "grad_norm": 1.8401033878326416, + "learning_rate": 4.8503106972668956e-05, + "loss": 5.1109, + "step": 18616 + }, + { + "epoch": 0.11072057284232563, + "grad_norm": 1.772309422492981, + "learning_rate": 4.850294776651494e-05, + "loss": 5.7237, + "step": 18617 + }, + { + "epoch": 0.11072652012560662, + "grad_norm": 1.7160669565200806, + "learning_rate": 4.8502788552156295e-05, + "loss": 5.7218, + "step": 18618 + }, + { + "epoch": 0.11073246740888762, + "grad_norm": 1.5467272996902466, + "learning_rate": 4.850262932959306e-05, + "loss": 5.4169, + "step": 18619 + }, + { + "epoch": 0.11073841469216862, + "grad_norm": 1.3382668495178223, + "learning_rate": 4.8502470098825316e-05, + "loss": 5.1243, + "step": 18620 + }, + { + "epoch": 0.11074436197544961, + "grad_norm": 1.3461776971817017, + "learning_rate": 4.850231085985309e-05, + "loss": 4.9412, + "step": 18621 + }, + { + "epoch": 0.11075030925873061, + "grad_norm": 1.4207700490951538, + "learning_rate": 4.850215161267646e-05, + "loss": 5.4449, + "step": 18622 + }, + { + "epoch": 0.11075625654201161, + "grad_norm": 1.7271502017974854, + "learning_rate": 4.8501992357295454e-05, + "loss": 5.4579, + "step": 18623 + }, + { + "epoch": 0.1107622038252926, + "grad_norm": 1.753090500831604, + "learning_rate": 4.8501833093710156e-05, + "loss": 5.7577, + "step": 18624 + }, + { + "epoch": 0.1107681511085736, + "grad_norm": 1.3730309009552002, + "learning_rate": 4.850167382192062e-05, + "loss": 5.3646, + "step": 18625 + }, + { + "epoch": 0.1107740983918546, + "grad_norm": 1.4723306894302368, + "learning_rate": 4.8501514541926883e-05, + "loss": 4.8234, + "step": 18626 + }, + { + "epoch": 0.1107800456751356, + "grad_norm": 1.3944339752197266, + "learning_rate": 4.850135525372901e-05, + "loss": 4.805, + "step": 18627 + }, + { + "epoch": 0.1107859929584166, + "grad_norm": 1.1402732133865356, + "learning_rate": 4.850119595732706e-05, + "loss": 4.9865, + "step": 18628 + }, + { + "epoch": 0.1107919402416976, + "grad_norm": 1.0595287084579468, + "learning_rate": 4.850103665272108e-05, + "loss": 4.9961, + "step": 18629 + }, + { + "epoch": 0.11079788752497859, + "grad_norm": 1.445143699645996, + "learning_rate": 4.8500877339911136e-05, + "loss": 5.2089, + "step": 18630 + }, + { + "epoch": 0.11080383480825959, + "grad_norm": 2.2014050483703613, + "learning_rate": 4.8500718018897275e-05, + "loss": 4.7445, + "step": 18631 + }, + { + "epoch": 0.11080978209154059, + "grad_norm": 2.117194890975952, + "learning_rate": 4.850055868967956e-05, + "loss": 4.8755, + "step": 18632 + }, + { + "epoch": 0.11081572937482158, + "grad_norm": 1.82968008518219, + "learning_rate": 4.850039935225804e-05, + "loss": 4.8852, + "step": 18633 + }, + { + "epoch": 0.11082167665810258, + "grad_norm": 1.613770842552185, + "learning_rate": 4.8500240006632766e-05, + "loss": 5.1053, + "step": 18634 + }, + { + "epoch": 0.11082762394138358, + "grad_norm": 1.8672553300857544, + "learning_rate": 4.850008065280381e-05, + "loss": 4.7134, + "step": 18635 + }, + { + "epoch": 0.11083357122466457, + "grad_norm": 1.9933403730392456, + "learning_rate": 4.849992129077122e-05, + "loss": 4.7544, + "step": 18636 + }, + { + "epoch": 0.11083951850794557, + "grad_norm": 1.8642876148223877, + "learning_rate": 4.849976192053505e-05, + "loss": 4.6598, + "step": 18637 + }, + { + "epoch": 0.11084546579122657, + "grad_norm": 1.8983674049377441, + "learning_rate": 4.849960254209536e-05, + "loss": 4.7403, + "step": 18638 + }, + { + "epoch": 0.11085141307450756, + "grad_norm": 1.9882328510284424, + "learning_rate": 4.849944315545219e-05, + "loss": 5.0105, + "step": 18639 + }, + { + "epoch": 0.11085736035778856, + "grad_norm": 1.7971723079681396, + "learning_rate": 4.8499283760605614e-05, + "loss": 5.6138, + "step": 18640 + }, + { + "epoch": 0.11086330764106955, + "grad_norm": 1.5002641677856445, + "learning_rate": 4.849912435755568e-05, + "loss": 5.7336, + "step": 18641 + }, + { + "epoch": 0.11086925492435055, + "grad_norm": 1.412880778312683, + "learning_rate": 4.8498964946302436e-05, + "loss": 5.532, + "step": 18642 + }, + { + "epoch": 0.11087520220763156, + "grad_norm": 1.6482197046279907, + "learning_rate": 4.849880552684596e-05, + "loss": 5.5432, + "step": 18643 + }, + { + "epoch": 0.11088114949091255, + "grad_norm": 1.5852200984954834, + "learning_rate": 4.849864609918629e-05, + "loss": 5.3577, + "step": 18644 + }, + { + "epoch": 0.11088709677419355, + "grad_norm": 1.540536642074585, + "learning_rate": 4.849848666332348e-05, + "loss": 5.4983, + "step": 18645 + }, + { + "epoch": 0.11089304405747455, + "grad_norm": 1.7822679281234741, + "learning_rate": 4.849832721925759e-05, + "loss": 5.1427, + "step": 18646 + }, + { + "epoch": 0.11089899134075554, + "grad_norm": 1.722977638244629, + "learning_rate": 4.8498167766988685e-05, + "loss": 5.2759, + "step": 18647 + }, + { + "epoch": 0.11090493862403654, + "grad_norm": 1.7543476819992065, + "learning_rate": 4.8498008306516806e-05, + "loss": 5.2616, + "step": 18648 + }, + { + "epoch": 0.11091088590731754, + "grad_norm": 1.4882584810256958, + "learning_rate": 4.8497848837842016e-05, + "loss": 5.3781, + "step": 18649 + }, + { + "epoch": 0.11091683319059853, + "grad_norm": 1.7358192205429077, + "learning_rate": 4.849768936096437e-05, + "loss": 5.5262, + "step": 18650 + }, + { + "epoch": 0.11092278047387953, + "grad_norm": 1.6070705652236938, + "learning_rate": 4.849752987588393e-05, + "loss": 5.0576, + "step": 18651 + }, + { + "epoch": 0.11092872775716053, + "grad_norm": 1.7641521692276, + "learning_rate": 4.8497370382600736e-05, + "loss": 5.21, + "step": 18652 + }, + { + "epoch": 0.11093467504044152, + "grad_norm": 1.8225789070129395, + "learning_rate": 4.849721088111485e-05, + "loss": 6.2734, + "step": 18653 + }, + { + "epoch": 0.11094062232372252, + "grad_norm": 1.8502428531646729, + "learning_rate": 4.849705137142634e-05, + "loss": 5.8298, + "step": 18654 + }, + { + "epoch": 0.11094656960700353, + "grad_norm": 1.4959850311279297, + "learning_rate": 4.8496891853535255e-05, + "loss": 5.4667, + "step": 18655 + }, + { + "epoch": 0.11095251689028451, + "grad_norm": 1.7957161664962769, + "learning_rate": 4.849673232744164e-05, + "loss": 5.3483, + "step": 18656 + }, + { + "epoch": 0.11095846417356552, + "grad_norm": 1.448737382888794, + "learning_rate": 4.8496572793145554e-05, + "loss": 5.4568, + "step": 18657 + }, + { + "epoch": 0.11096441145684652, + "grad_norm": 1.5068676471710205, + "learning_rate": 4.8496413250647065e-05, + "loss": 5.7089, + "step": 18658 + }, + { + "epoch": 0.1109703587401275, + "grad_norm": 1.5162447690963745, + "learning_rate": 4.849625369994622e-05, + "loss": 5.6042, + "step": 18659 + }, + { + "epoch": 0.11097630602340851, + "grad_norm": 1.81594979763031, + "learning_rate": 4.8496094141043076e-05, + "loss": 5.5301, + "step": 18660 + }, + { + "epoch": 0.11098225330668951, + "grad_norm": 1.9147114753723145, + "learning_rate": 4.8495934573937684e-05, + "loss": 4.6335, + "step": 18661 + }, + { + "epoch": 0.1109882005899705, + "grad_norm": 1.4161462783813477, + "learning_rate": 4.8495774998630106e-05, + "loss": 4.9868, + "step": 18662 + }, + { + "epoch": 0.1109941478732515, + "grad_norm": 1.5652790069580078, + "learning_rate": 4.8495615415120396e-05, + "loss": 5.6954, + "step": 18663 + }, + { + "epoch": 0.1110000951565325, + "grad_norm": 1.5217374563217163, + "learning_rate": 4.8495455823408616e-05, + "loss": 5.4338, + "step": 18664 + }, + { + "epoch": 0.11100604243981349, + "grad_norm": 1.3335540294647217, + "learning_rate": 4.8495296223494805e-05, + "loss": 5.4751, + "step": 18665 + }, + { + "epoch": 0.11101198972309449, + "grad_norm": 1.8903460502624512, + "learning_rate": 4.849513661537903e-05, + "loss": 4.9481, + "step": 18666 + }, + { + "epoch": 0.1110179370063755, + "grad_norm": 1.814666748046875, + "learning_rate": 4.849497699906135e-05, + "loss": 5.1422, + "step": 18667 + }, + { + "epoch": 0.11102388428965648, + "grad_norm": 1.7838057279586792, + "learning_rate": 4.8494817374541816e-05, + "loss": 5.3991, + "step": 18668 + }, + { + "epoch": 0.11102983157293748, + "grad_norm": 1.665671944618225, + "learning_rate": 4.849465774182048e-05, + "loss": 5.5362, + "step": 18669 + }, + { + "epoch": 0.11103577885621847, + "grad_norm": 2.255326509475708, + "learning_rate": 4.8494498100897415e-05, + "loss": 5.3161, + "step": 18670 + }, + { + "epoch": 0.11104172613949947, + "grad_norm": 1.7641721963882446, + "learning_rate": 4.849433845177265e-05, + "loss": 5.0422, + "step": 18671 + }, + { + "epoch": 0.11104767342278048, + "grad_norm": 1.4214074611663818, + "learning_rate": 4.8494178794446256e-05, + "loss": 5.2417, + "step": 18672 + }, + { + "epoch": 0.11105362070606146, + "grad_norm": 1.6417256593704224, + "learning_rate": 4.849401912891829e-05, + "loss": 5.262, + "step": 18673 + }, + { + "epoch": 0.11105956798934247, + "grad_norm": 1.4238179922103882, + "learning_rate": 4.84938594551888e-05, + "loss": 5.9754, + "step": 18674 + }, + { + "epoch": 0.11106551527262347, + "grad_norm": 1.9513673782348633, + "learning_rate": 4.849369977325785e-05, + "loss": 5.8917, + "step": 18675 + }, + { + "epoch": 0.11107146255590446, + "grad_norm": 1.625225305557251, + "learning_rate": 4.849354008312549e-05, + "loss": 5.7142, + "step": 18676 + }, + { + "epoch": 0.11107740983918546, + "grad_norm": 1.5306450128555298, + "learning_rate": 4.849338038479178e-05, + "loss": 5.3206, + "step": 18677 + }, + { + "epoch": 0.11108335712246646, + "grad_norm": 2.7895541191101074, + "learning_rate": 4.849322067825677e-05, + "loss": 4.3585, + "step": 18678 + }, + { + "epoch": 0.11108930440574745, + "grad_norm": 2.2688374519348145, + "learning_rate": 4.849306096352052e-05, + "loss": 4.4967, + "step": 18679 + }, + { + "epoch": 0.11109525168902845, + "grad_norm": 2.1710267066955566, + "learning_rate": 4.849290124058309e-05, + "loss": 4.0673, + "step": 18680 + }, + { + "epoch": 0.11110119897230945, + "grad_norm": 2.235142707824707, + "learning_rate": 4.849274150944453e-05, + "loss": 3.8198, + "step": 18681 + }, + { + "epoch": 0.11110714625559044, + "grad_norm": 2.328324317932129, + "learning_rate": 4.849258177010489e-05, + "loss": 4.008, + "step": 18682 + }, + { + "epoch": 0.11111309353887144, + "grad_norm": 2.2681312561035156, + "learning_rate": 4.849242202256424e-05, + "loss": 4.1541, + "step": 18683 + }, + { + "epoch": 0.11111904082215245, + "grad_norm": 2.5430855751037598, + "learning_rate": 4.849226226682262e-05, + "loss": 4.3177, + "step": 18684 + }, + { + "epoch": 0.11112498810543343, + "grad_norm": 2.1995978355407715, + "learning_rate": 4.84921025028801e-05, + "loss": 4.5792, + "step": 18685 + }, + { + "epoch": 0.11113093538871444, + "grad_norm": 1.9515454769134521, + "learning_rate": 4.849194273073673e-05, + "loss": 4.8759, + "step": 18686 + }, + { + "epoch": 0.11113688267199544, + "grad_norm": 2.484431028366089, + "learning_rate": 4.849178295039257e-05, + "loss": 4.1916, + "step": 18687 + }, + { + "epoch": 0.11114282995527643, + "grad_norm": 2.356790065765381, + "learning_rate": 4.8491623161847665e-05, + "loss": 4.38, + "step": 18688 + }, + { + "epoch": 0.11114877723855743, + "grad_norm": 2.414517879486084, + "learning_rate": 4.849146336510207e-05, + "loss": 4.3739, + "step": 18689 + }, + { + "epoch": 0.11115472452183843, + "grad_norm": 2.4129765033721924, + "learning_rate": 4.849130356015587e-05, + "loss": 4.0384, + "step": 18690 + }, + { + "epoch": 0.11116067180511942, + "grad_norm": 2.146932363510132, + "learning_rate": 4.8491143747009074e-05, + "loss": 4.4045, + "step": 18691 + }, + { + "epoch": 0.11116661908840042, + "grad_norm": 2.1945905685424805, + "learning_rate": 4.8490983925661776e-05, + "loss": 5.1674, + "step": 18692 + }, + { + "epoch": 0.11117256637168142, + "grad_norm": 2.2188448905944824, + "learning_rate": 4.849082409611402e-05, + "loss": 4.628, + "step": 18693 + }, + { + "epoch": 0.11117851365496241, + "grad_norm": 1.7684906721115112, + "learning_rate": 4.8490664258365847e-05, + "loss": 5.236, + "step": 18694 + }, + { + "epoch": 0.11118446093824341, + "grad_norm": 2.0367350578308105, + "learning_rate": 4.849050441241734e-05, + "loss": 5.6408, + "step": 18695 + }, + { + "epoch": 0.11119040822152441, + "grad_norm": 2.0829811096191406, + "learning_rate": 4.849034455826853e-05, + "loss": 5.5519, + "step": 18696 + }, + { + "epoch": 0.1111963555048054, + "grad_norm": 1.7884539365768433, + "learning_rate": 4.8490184695919486e-05, + "loss": 5.2345, + "step": 18697 + }, + { + "epoch": 0.1112023027880864, + "grad_norm": 1.8792423009872437, + "learning_rate": 4.849002482537026e-05, + "loss": 4.7622, + "step": 18698 + }, + { + "epoch": 0.11120825007136739, + "grad_norm": 1.7493008375167847, + "learning_rate": 4.8489864946620914e-05, + "loss": 5.295, + "step": 18699 + }, + { + "epoch": 0.1112141973546484, + "grad_norm": 1.60455322265625, + "learning_rate": 4.84897050596715e-05, + "loss": 5.5708, + "step": 18700 + }, + { + "epoch": 0.1112201446379294, + "grad_norm": 1.4326173067092896, + "learning_rate": 4.848954516452206e-05, + "loss": 5.9185, + "step": 18701 + }, + { + "epoch": 0.11122609192121038, + "grad_norm": 1.6318118572235107, + "learning_rate": 4.8489385261172685e-05, + "loss": 5.6545, + "step": 18702 + }, + { + "epoch": 0.11123203920449139, + "grad_norm": 1.4083906412124634, + "learning_rate": 4.848922534962339e-05, + "loss": 5.4776, + "step": 18703 + }, + { + "epoch": 0.11123798648777239, + "grad_norm": 1.222609519958496, + "learning_rate": 4.8489065429874256e-05, + "loss": 5.5094, + "step": 18704 + }, + { + "epoch": 0.11124393377105338, + "grad_norm": 1.6955020427703857, + "learning_rate": 4.848890550192533e-05, + "loss": 5.0516, + "step": 18705 + }, + { + "epoch": 0.11124988105433438, + "grad_norm": 1.3875632286071777, + "learning_rate": 4.848874556577667e-05, + "loss": 5.5321, + "step": 18706 + }, + { + "epoch": 0.11125582833761538, + "grad_norm": 1.2538158893585205, + "learning_rate": 4.848858562142833e-05, + "loss": 5.464, + "step": 18707 + }, + { + "epoch": 0.11126177562089637, + "grad_norm": 1.7350475788116455, + "learning_rate": 4.8488425668880366e-05, + "loss": 5.2815, + "step": 18708 + }, + { + "epoch": 0.11126772290417737, + "grad_norm": 1.543989658355713, + "learning_rate": 4.848826570813284e-05, + "loss": 5.4817, + "step": 18709 + }, + { + "epoch": 0.11127367018745837, + "grad_norm": 1.3931440114974976, + "learning_rate": 4.8488105739185807e-05, + "loss": 5.7652, + "step": 18710 + }, + { + "epoch": 0.11127961747073936, + "grad_norm": 1.4630471467971802, + "learning_rate": 4.8487945762039314e-05, + "loss": 5.4886, + "step": 18711 + }, + { + "epoch": 0.11128556475402036, + "grad_norm": 1.338161826133728, + "learning_rate": 4.848778577669342e-05, + "loss": 5.2021, + "step": 18712 + }, + { + "epoch": 0.11129151203730137, + "grad_norm": 1.4282599687576294, + "learning_rate": 4.8487625783148186e-05, + "loss": 5.2767, + "step": 18713 + }, + { + "epoch": 0.11129745932058235, + "grad_norm": 1.4386523962020874, + "learning_rate": 4.848746578140366e-05, + "loss": 5.7286, + "step": 18714 + }, + { + "epoch": 0.11130340660386336, + "grad_norm": 1.2272754907608032, + "learning_rate": 4.84873057714599e-05, + "loss": 5.3609, + "step": 18715 + }, + { + "epoch": 0.11130935388714436, + "grad_norm": 1.8362592458724976, + "learning_rate": 4.848714575331697e-05, + "loss": 5.0494, + "step": 18716 + }, + { + "epoch": 0.11131530117042535, + "grad_norm": 2.098970651626587, + "learning_rate": 4.848698572697492e-05, + "loss": 4.8282, + "step": 18717 + }, + { + "epoch": 0.11132124845370635, + "grad_norm": 2.2145583629608154, + "learning_rate": 4.84868256924338e-05, + "loss": 4.4621, + "step": 18718 + }, + { + "epoch": 0.11132719573698735, + "grad_norm": 1.8036415576934814, + "learning_rate": 4.848666564969368e-05, + "loss": 5.374, + "step": 18719 + }, + { + "epoch": 0.11133314302026834, + "grad_norm": 1.5794750452041626, + "learning_rate": 4.8486505598754605e-05, + "loss": 5.6246, + "step": 18720 + }, + { + "epoch": 0.11133909030354934, + "grad_norm": 1.637068510055542, + "learning_rate": 4.848634553961664e-05, + "loss": 5.4506, + "step": 18721 + }, + { + "epoch": 0.11134503758683034, + "grad_norm": 1.6928807497024536, + "learning_rate": 4.8486185472279824e-05, + "loss": 5.2405, + "step": 18722 + }, + { + "epoch": 0.11135098487011133, + "grad_norm": 2.0931332111358643, + "learning_rate": 4.848602539674422e-05, + "loss": 4.9366, + "step": 18723 + }, + { + "epoch": 0.11135693215339233, + "grad_norm": 1.4645583629608154, + "learning_rate": 4.848586531300989e-05, + "loss": 5.0677, + "step": 18724 + }, + { + "epoch": 0.11136287943667333, + "grad_norm": 1.7817938327789307, + "learning_rate": 4.8485705221076896e-05, + "loss": 5.5975, + "step": 18725 + }, + { + "epoch": 0.11136882671995432, + "grad_norm": 1.7167946100234985, + "learning_rate": 4.848554512094528e-05, + "loss": 5.829, + "step": 18726 + }, + { + "epoch": 0.11137477400323532, + "grad_norm": 1.723574161529541, + "learning_rate": 4.8485385012615106e-05, + "loss": 5.2702, + "step": 18727 + }, + { + "epoch": 0.11138072128651631, + "grad_norm": 1.4848002195358276, + "learning_rate": 4.848522489608642e-05, + "loss": 5.6739, + "step": 18728 + }, + { + "epoch": 0.11138666856979731, + "grad_norm": 1.798085331916809, + "learning_rate": 4.848506477135929e-05, + "loss": 5.7314, + "step": 18729 + }, + { + "epoch": 0.11139261585307832, + "grad_norm": 1.7033846378326416, + "learning_rate": 4.848490463843376e-05, + "loss": 5.531, + "step": 18730 + }, + { + "epoch": 0.1113985631363593, + "grad_norm": 1.64686119556427, + "learning_rate": 4.8484744497309896e-05, + "loss": 5.8325, + "step": 18731 + }, + { + "epoch": 0.1114045104196403, + "grad_norm": 1.9923123121261597, + "learning_rate": 4.8484584347987755e-05, + "loss": 5.9614, + "step": 18732 + }, + { + "epoch": 0.11141045770292131, + "grad_norm": 1.768896460533142, + "learning_rate": 4.8484424190467385e-05, + "loss": 5.9892, + "step": 18733 + }, + { + "epoch": 0.1114164049862023, + "grad_norm": 1.5981477499008179, + "learning_rate": 4.848426402474885e-05, + "loss": 5.6239, + "step": 18734 + }, + { + "epoch": 0.1114223522694833, + "grad_norm": 1.8919446468353271, + "learning_rate": 4.848410385083219e-05, + "loss": 5.7437, + "step": 18735 + }, + { + "epoch": 0.1114282995527643, + "grad_norm": 2.2705752849578857, + "learning_rate": 4.848394366871748e-05, + "loss": 4.5999, + "step": 18736 + }, + { + "epoch": 0.11143424683604529, + "grad_norm": 1.8626762628555298, + "learning_rate": 4.848378347840476e-05, + "loss": 5.5706, + "step": 18737 + }, + { + "epoch": 0.11144019411932629, + "grad_norm": 1.5893161296844482, + "learning_rate": 4.84836232798941e-05, + "loss": 5.4011, + "step": 18738 + }, + { + "epoch": 0.1114461414026073, + "grad_norm": 1.3441518545150757, + "learning_rate": 4.8483463073185554e-05, + "loss": 5.2412, + "step": 18739 + }, + { + "epoch": 0.11145208868588828, + "grad_norm": 1.6281975507736206, + "learning_rate": 4.848330285827917e-05, + "loss": 5.4281, + "step": 18740 + }, + { + "epoch": 0.11145803596916928, + "grad_norm": 2.1942298412323, + "learning_rate": 4.8483142635175e-05, + "loss": 5.6202, + "step": 18741 + }, + { + "epoch": 0.11146398325245029, + "grad_norm": 2.086764097213745, + "learning_rate": 4.848298240387311e-05, + "loss": 5.665, + "step": 18742 + }, + { + "epoch": 0.11146993053573127, + "grad_norm": 2.0656285285949707, + "learning_rate": 4.848282216437356e-05, + "loss": 5.5196, + "step": 18743 + }, + { + "epoch": 0.11147587781901228, + "grad_norm": 1.5579513311386108, + "learning_rate": 4.84826619166764e-05, + "loss": 5.7366, + "step": 18744 + }, + { + "epoch": 0.11148182510229328, + "grad_norm": 1.7952065467834473, + "learning_rate": 4.848250166078168e-05, + "loss": 5.8041, + "step": 18745 + }, + { + "epoch": 0.11148777238557427, + "grad_norm": 1.3523657321929932, + "learning_rate": 4.848234139668947e-05, + "loss": 5.6628, + "step": 18746 + }, + { + "epoch": 0.11149371966885527, + "grad_norm": 1.6833933591842651, + "learning_rate": 4.848218112439981e-05, + "loss": 5.5285, + "step": 18747 + }, + { + "epoch": 0.11149966695213627, + "grad_norm": 1.308733344078064, + "learning_rate": 4.848202084391276e-05, + "loss": 5.9953, + "step": 18748 + }, + { + "epoch": 0.11150561423541726, + "grad_norm": 1.3434252738952637, + "learning_rate": 4.848186055522838e-05, + "loss": 5.8267, + "step": 18749 + }, + { + "epoch": 0.11151156151869826, + "grad_norm": 1.6250263452529907, + "learning_rate": 4.848170025834673e-05, + "loss": 4.964, + "step": 18750 + }, + { + "epoch": 0.11151750880197926, + "grad_norm": 1.4924334287643433, + "learning_rate": 4.848153995326786e-05, + "loss": 4.9072, + "step": 18751 + }, + { + "epoch": 0.11152345608526025, + "grad_norm": 1.5650702714920044, + "learning_rate": 4.8481379639991826e-05, + "loss": 5.8793, + "step": 18752 + }, + { + "epoch": 0.11152940336854125, + "grad_norm": 1.488553762435913, + "learning_rate": 4.848121931851868e-05, + "loss": 5.823, + "step": 18753 + }, + { + "epoch": 0.11153535065182225, + "grad_norm": 1.5356508493423462, + "learning_rate": 4.848105898884849e-05, + "loss": 5.7632, + "step": 18754 + }, + { + "epoch": 0.11154129793510324, + "grad_norm": 1.5389797687530518, + "learning_rate": 4.8480898650981296e-05, + "loss": 5.8662, + "step": 18755 + }, + { + "epoch": 0.11154724521838424, + "grad_norm": 1.3963713645935059, + "learning_rate": 4.848073830491717e-05, + "loss": 5.5647, + "step": 18756 + }, + { + "epoch": 0.11155319250166523, + "grad_norm": 1.3739324808120728, + "learning_rate": 4.848057795065617e-05, + "loss": 5.6686, + "step": 18757 + }, + { + "epoch": 0.11155913978494623, + "grad_norm": 1.2932708263397217, + "learning_rate": 4.848041758819833e-05, + "loss": 5.6567, + "step": 18758 + }, + { + "epoch": 0.11156508706822724, + "grad_norm": 1.3388581275939941, + "learning_rate": 4.848025721754372e-05, + "loss": 5.6111, + "step": 18759 + }, + { + "epoch": 0.11157103435150822, + "grad_norm": 1.28604257106781, + "learning_rate": 4.84800968386924e-05, + "loss": 5.633, + "step": 18760 + }, + { + "epoch": 0.11157698163478923, + "grad_norm": 2.0710771083831787, + "learning_rate": 4.847993645164441e-05, + "loss": 5.1686, + "step": 18761 + }, + { + "epoch": 0.11158292891807023, + "grad_norm": 1.8022092580795288, + "learning_rate": 4.847977605639983e-05, + "loss": 5.6373, + "step": 18762 + }, + { + "epoch": 0.11158887620135122, + "grad_norm": 1.7080397605895996, + "learning_rate": 4.84796156529587e-05, + "loss": 5.5389, + "step": 18763 + }, + { + "epoch": 0.11159482348463222, + "grad_norm": 1.3582305908203125, + "learning_rate": 4.847945524132107e-05, + "loss": 5.5574, + "step": 18764 + }, + { + "epoch": 0.11160077076791322, + "grad_norm": 1.9037936925888062, + "learning_rate": 4.8479294821487015e-05, + "loss": 5.2108, + "step": 18765 + }, + { + "epoch": 0.11160671805119421, + "grad_norm": 1.6884709596633911, + "learning_rate": 4.8479134393456576e-05, + "loss": 5.2462, + "step": 18766 + }, + { + "epoch": 0.11161266533447521, + "grad_norm": 1.720261812210083, + "learning_rate": 4.8478973957229813e-05, + "loss": 5.5132, + "step": 18767 + }, + { + "epoch": 0.11161861261775621, + "grad_norm": 2.1769275665283203, + "learning_rate": 4.847881351280679e-05, + "loss": 5.1169, + "step": 18768 + }, + { + "epoch": 0.1116245599010372, + "grad_norm": 1.8593683242797852, + "learning_rate": 4.847865306018754e-05, + "loss": 4.8812, + "step": 18769 + }, + { + "epoch": 0.1116305071843182, + "grad_norm": 1.9496150016784668, + "learning_rate": 4.8478492599372147e-05, + "loss": 4.8244, + "step": 18770 + }, + { + "epoch": 0.1116364544675992, + "grad_norm": 1.584330677986145, + "learning_rate": 4.8478332130360655e-05, + "loss": 4.769, + "step": 18771 + }, + { + "epoch": 0.1116424017508802, + "grad_norm": 1.5987087488174438, + "learning_rate": 4.8478171653153116e-05, + "loss": 4.8385, + "step": 18772 + }, + { + "epoch": 0.1116483490341612, + "grad_norm": 1.919463038444519, + "learning_rate": 4.847801116774959e-05, + "loss": 4.7365, + "step": 18773 + }, + { + "epoch": 0.1116542963174422, + "grad_norm": 1.8708561658859253, + "learning_rate": 4.847785067415014e-05, + "loss": 4.9067, + "step": 18774 + }, + { + "epoch": 0.11166024360072319, + "grad_norm": 1.778316617012024, + "learning_rate": 4.8477690172354804e-05, + "loss": 4.8213, + "step": 18775 + }, + { + "epoch": 0.11166619088400419, + "grad_norm": 1.7170525789260864, + "learning_rate": 4.8477529662363655e-05, + "loss": 4.7115, + "step": 18776 + }, + { + "epoch": 0.11167213816728519, + "grad_norm": 1.6704293489456177, + "learning_rate": 4.847736914417674e-05, + "loss": 4.5814, + "step": 18777 + }, + { + "epoch": 0.11167808545056618, + "grad_norm": 1.7422312498092651, + "learning_rate": 4.847720861779412e-05, + "loss": 4.6206, + "step": 18778 + }, + { + "epoch": 0.11168403273384718, + "grad_norm": 1.7162894010543823, + "learning_rate": 4.8477048083215845e-05, + "loss": 4.6421, + "step": 18779 + }, + { + "epoch": 0.11168998001712818, + "grad_norm": 1.7825870513916016, + "learning_rate": 4.847688754044199e-05, + "loss": 4.6899, + "step": 18780 + }, + { + "epoch": 0.11169592730040917, + "grad_norm": 1.8103221654891968, + "learning_rate": 4.8476726989472577e-05, + "loss": 4.5619, + "step": 18781 + }, + { + "epoch": 0.11170187458369017, + "grad_norm": 1.8276532888412476, + "learning_rate": 4.847656643030769e-05, + "loss": 4.3429, + "step": 18782 + }, + { + "epoch": 0.11170782186697117, + "grad_norm": 1.7625696659088135, + "learning_rate": 4.847640586294737e-05, + "loss": 4.4154, + "step": 18783 + }, + { + "epoch": 0.11171376915025216, + "grad_norm": 1.842450499534607, + "learning_rate": 4.8476245287391684e-05, + "loss": 4.6279, + "step": 18784 + }, + { + "epoch": 0.11171971643353316, + "grad_norm": 1.879961371421814, + "learning_rate": 4.847608470364069e-05, + "loss": 4.4906, + "step": 18785 + }, + { + "epoch": 0.11172566371681415, + "grad_norm": 1.5556871891021729, + "learning_rate": 4.847592411169443e-05, + "loss": 5.0258, + "step": 18786 + }, + { + "epoch": 0.11173161100009515, + "grad_norm": 1.8000839948654175, + "learning_rate": 4.8475763511552965e-05, + "loss": 4.4746, + "step": 18787 + }, + { + "epoch": 0.11173755828337616, + "grad_norm": 1.4234516620635986, + "learning_rate": 4.847560290321636e-05, + "loss": 5.4744, + "step": 18788 + }, + { + "epoch": 0.11174350556665714, + "grad_norm": 1.5717182159423828, + "learning_rate": 4.847544228668466e-05, + "loss": 5.4368, + "step": 18789 + }, + { + "epoch": 0.11174945284993815, + "grad_norm": 1.3514728546142578, + "learning_rate": 4.847528166195793e-05, + "loss": 5.3036, + "step": 18790 + }, + { + "epoch": 0.11175540013321915, + "grad_norm": 1.4620373249053955, + "learning_rate": 4.847512102903621e-05, + "loss": 5.2206, + "step": 18791 + }, + { + "epoch": 0.11176134741650014, + "grad_norm": 1.3034706115722656, + "learning_rate": 4.847496038791958e-05, + "loss": 5.3359, + "step": 18792 + }, + { + "epoch": 0.11176729469978114, + "grad_norm": 1.599876046180725, + "learning_rate": 4.847479973860808e-05, + "loss": 5.1282, + "step": 18793 + }, + { + "epoch": 0.11177324198306214, + "grad_norm": 1.4783935546875, + "learning_rate": 4.847463908110177e-05, + "loss": 5.1958, + "step": 18794 + }, + { + "epoch": 0.11177918926634313, + "grad_norm": 1.5132538080215454, + "learning_rate": 4.84744784154007e-05, + "loss": 5.0166, + "step": 18795 + }, + { + "epoch": 0.11178513654962413, + "grad_norm": 1.9335131645202637, + "learning_rate": 4.847431774150495e-05, + "loss": 4.8899, + "step": 18796 + }, + { + "epoch": 0.11179108383290513, + "grad_norm": 1.5765737295150757, + "learning_rate": 4.847415705941454e-05, + "loss": 5.2848, + "step": 18797 + }, + { + "epoch": 0.11179703111618612, + "grad_norm": 1.7239350080490112, + "learning_rate": 4.847399636912955e-05, + "loss": 5.0606, + "step": 18798 + }, + { + "epoch": 0.11180297839946712, + "grad_norm": 1.5246455669403076, + "learning_rate": 4.847383567065004e-05, + "loss": 5.0829, + "step": 18799 + }, + { + "epoch": 0.11180892568274813, + "grad_norm": 1.3902997970581055, + "learning_rate": 4.847367496397604e-05, + "loss": 5.2729, + "step": 18800 + }, + { + "epoch": 0.11181487296602911, + "grad_norm": 1.426282286643982, + "learning_rate": 4.8473514249107634e-05, + "loss": 5.2259, + "step": 18801 + }, + { + "epoch": 0.11182082024931012, + "grad_norm": 1.4425853490829468, + "learning_rate": 4.847335352604486e-05, + "loss": 4.923, + "step": 18802 + }, + { + "epoch": 0.11182676753259112, + "grad_norm": 1.26097571849823, + "learning_rate": 4.8473192794787786e-05, + "loss": 4.9122, + "step": 18803 + }, + { + "epoch": 0.1118327148158721, + "grad_norm": 1.4102699756622314, + "learning_rate": 4.847303205533646e-05, + "loss": 4.9641, + "step": 18804 + }, + { + "epoch": 0.11183866209915311, + "grad_norm": 1.3965771198272705, + "learning_rate": 4.847287130769094e-05, + "loss": 4.9832, + "step": 18805 + }, + { + "epoch": 0.11184460938243411, + "grad_norm": 1.3588200807571411, + "learning_rate": 4.8472710551851284e-05, + "loss": 5.0502, + "step": 18806 + }, + { + "epoch": 0.1118505566657151, + "grad_norm": 1.394020676612854, + "learning_rate": 4.847254978781755e-05, + "loss": 4.9699, + "step": 18807 + }, + { + "epoch": 0.1118565039489961, + "grad_norm": 1.4548087120056152, + "learning_rate": 4.8472389015589794e-05, + "loss": 4.9112, + "step": 18808 + }, + { + "epoch": 0.1118624512322771, + "grad_norm": 1.4359081983566284, + "learning_rate": 4.847222823516806e-05, + "loss": 4.9284, + "step": 18809 + }, + { + "epoch": 0.11186839851555809, + "grad_norm": 1.3159685134887695, + "learning_rate": 4.847206744655242e-05, + "loss": 4.9661, + "step": 18810 + }, + { + "epoch": 0.11187434579883909, + "grad_norm": 1.5037652254104614, + "learning_rate": 4.847190664974292e-05, + "loss": 5.0318, + "step": 18811 + }, + { + "epoch": 0.1118802930821201, + "grad_norm": 1.7603816986083984, + "learning_rate": 4.8471745844739624e-05, + "loss": 5.0486, + "step": 18812 + }, + { + "epoch": 0.11188624036540108, + "grad_norm": 1.6205053329467773, + "learning_rate": 4.847158503154259e-05, + "loss": 5.0587, + "step": 18813 + }, + { + "epoch": 0.11189218764868208, + "grad_norm": 1.559334635734558, + "learning_rate": 4.847142421015185e-05, + "loss": 5.1514, + "step": 18814 + }, + { + "epoch": 0.11189813493196307, + "grad_norm": 1.4896910190582275, + "learning_rate": 4.8471263380567495e-05, + "loss": 5.2103, + "step": 18815 + }, + { + "epoch": 0.11190408221524407, + "grad_norm": 1.43007493019104, + "learning_rate": 4.847110254278956e-05, + "loss": 5.0152, + "step": 18816 + }, + { + "epoch": 0.11191002949852508, + "grad_norm": 1.3567081689834595, + "learning_rate": 4.84709416968181e-05, + "loss": 4.7193, + "step": 18817 + }, + { + "epoch": 0.11191597678180606, + "grad_norm": 1.3283864259719849, + "learning_rate": 4.8470780842653186e-05, + "loss": 4.8559, + "step": 18818 + }, + { + "epoch": 0.11192192406508707, + "grad_norm": 1.5427826642990112, + "learning_rate": 4.8470619980294854e-05, + "loss": 5.1406, + "step": 18819 + }, + { + "epoch": 0.11192787134836807, + "grad_norm": 1.4549115896224976, + "learning_rate": 4.847045910974318e-05, + "loss": 5.0377, + "step": 18820 + }, + { + "epoch": 0.11193381863164906, + "grad_norm": 1.3822715282440186, + "learning_rate": 4.84702982309982e-05, + "loss": 4.9279, + "step": 18821 + }, + { + "epoch": 0.11193976591493006, + "grad_norm": 1.290756106376648, + "learning_rate": 4.8470137344059996e-05, + "loss": 4.9631, + "step": 18822 + }, + { + "epoch": 0.11194571319821106, + "grad_norm": 1.8070625066757202, + "learning_rate": 4.84699764489286e-05, + "loss": 5.0103, + "step": 18823 + }, + { + "epoch": 0.11195166048149205, + "grad_norm": 1.6692131757736206, + "learning_rate": 4.846981554560408e-05, + "loss": 5.1265, + "step": 18824 + }, + { + "epoch": 0.11195760776477305, + "grad_norm": 1.7644426822662354, + "learning_rate": 4.8469654634086495e-05, + "loss": 5.0712, + "step": 18825 + }, + { + "epoch": 0.11196355504805405, + "grad_norm": 1.5689074993133545, + "learning_rate": 4.8469493714375893e-05, + "loss": 5.0551, + "step": 18826 + }, + { + "epoch": 0.11196950233133504, + "grad_norm": 1.610300064086914, + "learning_rate": 4.846933278647233e-05, + "loss": 5.0746, + "step": 18827 + }, + { + "epoch": 0.11197544961461604, + "grad_norm": 1.2828009128570557, + "learning_rate": 4.846917185037586e-05, + "loss": 5.0645, + "step": 18828 + }, + { + "epoch": 0.11198139689789705, + "grad_norm": 1.386265516281128, + "learning_rate": 4.846901090608655e-05, + "loss": 5.1885, + "step": 18829 + }, + { + "epoch": 0.11198734418117803, + "grad_norm": 1.446359634399414, + "learning_rate": 4.846884995360446e-05, + "loss": 5.3245, + "step": 18830 + }, + { + "epoch": 0.11199329146445904, + "grad_norm": 1.4347827434539795, + "learning_rate": 4.846868899292962e-05, + "loss": 5.379, + "step": 18831 + }, + { + "epoch": 0.11199923874774004, + "grad_norm": 1.7589528560638428, + "learning_rate": 4.846852802406212e-05, + "loss": 5.2726, + "step": 18832 + }, + { + "epoch": 0.11200518603102103, + "grad_norm": 1.4316980838775635, + "learning_rate": 4.846836704700199e-05, + "loss": 5.5424, + "step": 18833 + }, + { + "epoch": 0.11201113331430203, + "grad_norm": 1.202364444732666, + "learning_rate": 4.84682060617493e-05, + "loss": 5.4271, + "step": 18834 + }, + { + "epoch": 0.11201708059758303, + "grad_norm": 1.282231330871582, + "learning_rate": 4.8468045068304094e-05, + "loss": 5.4895, + "step": 18835 + }, + { + "epoch": 0.11202302788086402, + "grad_norm": 1.8428497314453125, + "learning_rate": 4.846788406666644e-05, + "loss": 4.9924, + "step": 18836 + }, + { + "epoch": 0.11202897516414502, + "grad_norm": 1.8442119359970093, + "learning_rate": 4.846772305683639e-05, + "loss": 4.6735, + "step": 18837 + }, + { + "epoch": 0.11203492244742602, + "grad_norm": 1.7083659172058105, + "learning_rate": 4.846756203881401e-05, + "loss": 4.8064, + "step": 18838 + }, + { + "epoch": 0.11204086973070701, + "grad_norm": 1.5663195848464966, + "learning_rate": 4.8467401012599336e-05, + "loss": 5.095, + "step": 18839 + }, + { + "epoch": 0.11204681701398801, + "grad_norm": 1.7466095685958862, + "learning_rate": 4.846723997819244e-05, + "loss": 4.7633, + "step": 18840 + }, + { + "epoch": 0.11205276429726901, + "grad_norm": 1.73336660861969, + "learning_rate": 4.846707893559336e-05, + "loss": 4.8776, + "step": 18841 + }, + { + "epoch": 0.11205871158055, + "grad_norm": 1.726456880569458, + "learning_rate": 4.8466917884802175e-05, + "loss": 4.845, + "step": 18842 + }, + { + "epoch": 0.112064658863831, + "grad_norm": 1.733583927154541, + "learning_rate": 4.8466756825818934e-05, + "loss": 4.8272, + "step": 18843 + }, + { + "epoch": 0.11207060614711199, + "grad_norm": 1.8252346515655518, + "learning_rate": 4.8466595758643684e-05, + "loss": 4.7088, + "step": 18844 + }, + { + "epoch": 0.112076553430393, + "grad_norm": 1.6071163415908813, + "learning_rate": 4.8466434683276495e-05, + "loss": 4.7085, + "step": 18845 + }, + { + "epoch": 0.112082500713674, + "grad_norm": 1.8407503366470337, + "learning_rate": 4.846627359971741e-05, + "loss": 4.6885, + "step": 18846 + }, + { + "epoch": 0.11208844799695498, + "grad_norm": 1.5426356792449951, + "learning_rate": 4.84661125079665e-05, + "loss": 4.7252, + "step": 18847 + }, + { + "epoch": 0.11209439528023599, + "grad_norm": 1.8290139436721802, + "learning_rate": 4.84659514080238e-05, + "loss": 4.9314, + "step": 18848 + }, + { + "epoch": 0.11210034256351699, + "grad_norm": 1.73724365234375, + "learning_rate": 4.846579029988939e-05, + "loss": 4.7618, + "step": 18849 + }, + { + "epoch": 0.11210628984679798, + "grad_norm": 2.0577304363250732, + "learning_rate": 4.8465629183563314e-05, + "loss": 4.8118, + "step": 18850 + }, + { + "epoch": 0.11211223713007898, + "grad_norm": 1.8696433305740356, + "learning_rate": 4.846546805904562e-05, + "loss": 4.6813, + "step": 18851 + }, + { + "epoch": 0.11211818441335998, + "grad_norm": 1.6597977876663208, + "learning_rate": 4.846530692633638e-05, + "loss": 4.5187, + "step": 18852 + }, + { + "epoch": 0.11212413169664097, + "grad_norm": 1.6595630645751953, + "learning_rate": 4.846514578543564e-05, + "loss": 5.012, + "step": 18853 + }, + { + "epoch": 0.11213007897992197, + "grad_norm": 2.2116329669952393, + "learning_rate": 4.846498463634347e-05, + "loss": 5.1757, + "step": 18854 + }, + { + "epoch": 0.11213602626320297, + "grad_norm": 1.8592875003814697, + "learning_rate": 4.846482347905991e-05, + "loss": 6.0403, + "step": 18855 + }, + { + "epoch": 0.11214197354648396, + "grad_norm": 1.7812080383300781, + "learning_rate": 4.846466231358502e-05, + "loss": 5.974, + "step": 18856 + }, + { + "epoch": 0.11214792082976496, + "grad_norm": 1.8986600637435913, + "learning_rate": 4.846450113991886e-05, + "loss": 5.3866, + "step": 18857 + }, + { + "epoch": 0.11215386811304597, + "grad_norm": 2.4542179107666016, + "learning_rate": 4.846433995806148e-05, + "loss": 4.863, + "step": 18858 + }, + { + "epoch": 0.11215981539632695, + "grad_norm": 2.1604816913604736, + "learning_rate": 4.846417876801295e-05, + "loss": 5.219, + "step": 18859 + }, + { + "epoch": 0.11216576267960796, + "grad_norm": 2.325782060623169, + "learning_rate": 4.846401756977331e-05, + "loss": 5.1454, + "step": 18860 + }, + { + "epoch": 0.11217170996288896, + "grad_norm": 2.3508334159851074, + "learning_rate": 4.846385636334263e-05, + "loss": 5.1318, + "step": 18861 + }, + { + "epoch": 0.11217765724616995, + "grad_norm": 2.2381060123443604, + "learning_rate": 4.846369514872096e-05, + "loss": 5.0676, + "step": 18862 + }, + { + "epoch": 0.11218360452945095, + "grad_norm": 2.3624770641326904, + "learning_rate": 4.8463533925908355e-05, + "loss": 5.0251, + "step": 18863 + }, + { + "epoch": 0.11218955181273195, + "grad_norm": 1.9950919151306152, + "learning_rate": 4.846337269490487e-05, + "loss": 5.0396, + "step": 18864 + }, + { + "epoch": 0.11219549909601294, + "grad_norm": 1.829410433769226, + "learning_rate": 4.8463211455710574e-05, + "loss": 4.9327, + "step": 18865 + }, + { + "epoch": 0.11220144637929394, + "grad_norm": 1.8879605531692505, + "learning_rate": 4.846305020832551e-05, + "loss": 4.8902, + "step": 18866 + }, + { + "epoch": 0.11220739366257494, + "grad_norm": 1.89055335521698, + "learning_rate": 4.846288895274973e-05, + "loss": 4.9219, + "step": 18867 + }, + { + "epoch": 0.11221334094585593, + "grad_norm": 2.224971055984497, + "learning_rate": 4.84627276889833e-05, + "loss": 5.0164, + "step": 18868 + }, + { + "epoch": 0.11221928822913693, + "grad_norm": 2.1675336360931396, + "learning_rate": 4.8462566417026276e-05, + "loss": 5.0082, + "step": 18869 + }, + { + "epoch": 0.11222523551241793, + "grad_norm": 1.885236144065857, + "learning_rate": 4.8462405136878714e-05, + "loss": 5.1484, + "step": 18870 + }, + { + "epoch": 0.11223118279569892, + "grad_norm": 1.3037774562835693, + "learning_rate": 4.846224384854067e-05, + "loss": 5.64, + "step": 18871 + }, + { + "epoch": 0.11223713007897992, + "grad_norm": 1.6506762504577637, + "learning_rate": 4.846208255201219e-05, + "loss": 5.6067, + "step": 18872 + }, + { + "epoch": 0.11224307736226091, + "grad_norm": 1.4294368028640747, + "learning_rate": 4.8461921247293344e-05, + "loss": 5.67, + "step": 18873 + }, + { + "epoch": 0.11224902464554191, + "grad_norm": 1.6201854944229126, + "learning_rate": 4.846175993438419e-05, + "loss": 5.6093, + "step": 18874 + }, + { + "epoch": 0.11225497192882292, + "grad_norm": 1.5683603286743164, + "learning_rate": 4.846159861328478e-05, + "loss": 5.6129, + "step": 18875 + }, + { + "epoch": 0.1122609192121039, + "grad_norm": 1.5446193218231201, + "learning_rate": 4.8461437283995156e-05, + "loss": 5.6063, + "step": 18876 + }, + { + "epoch": 0.1122668664953849, + "grad_norm": 1.477872371673584, + "learning_rate": 4.846127594651539e-05, + "loss": 5.6291, + "step": 18877 + }, + { + "epoch": 0.11227281377866591, + "grad_norm": 1.477872371673584, + "learning_rate": 4.846111460084554e-05, + "loss": 5.6282, + "step": 18878 + }, + { + "epoch": 0.1122787610619469, + "grad_norm": 1.4379156827926636, + "learning_rate": 4.846095324698565e-05, + "loss": 5.5451, + "step": 18879 + }, + { + "epoch": 0.1122847083452279, + "grad_norm": 1.4940646886825562, + "learning_rate": 4.8460791884935785e-05, + "loss": 5.4705, + "step": 18880 + }, + { + "epoch": 0.1122906556285089, + "grad_norm": 1.4625567197799683, + "learning_rate": 4.8460630514696e-05, + "loss": 5.5428, + "step": 18881 + }, + { + "epoch": 0.11229660291178989, + "grad_norm": 1.7899153232574463, + "learning_rate": 4.846046913626636e-05, + "loss": 5.7665, + "step": 18882 + }, + { + "epoch": 0.11230255019507089, + "grad_norm": 2.1002516746520996, + "learning_rate": 4.8460307749646906e-05, + "loss": 6.1132, + "step": 18883 + }, + { + "epoch": 0.11230849747835189, + "grad_norm": 1.8406580686569214, + "learning_rate": 4.84601463548377e-05, + "loss": 5.5207, + "step": 18884 + }, + { + "epoch": 0.11231444476163288, + "grad_norm": 1.6287425756454468, + "learning_rate": 4.84599849518388e-05, + "loss": 5.931, + "step": 18885 + }, + { + "epoch": 0.11232039204491388, + "grad_norm": 1.4447002410888672, + "learning_rate": 4.845982354065027e-05, + "loss": 5.6181, + "step": 18886 + }, + { + "epoch": 0.11232633932819489, + "grad_norm": 1.6555171012878418, + "learning_rate": 4.845966212127215e-05, + "loss": 5.1448, + "step": 18887 + }, + { + "epoch": 0.11233228661147587, + "grad_norm": 2.0948448181152344, + "learning_rate": 4.84595006937045e-05, + "loss": 5.3695, + "step": 18888 + }, + { + "epoch": 0.11233823389475688, + "grad_norm": 1.6369346380233765, + "learning_rate": 4.845933925794739e-05, + "loss": 5.5859, + "step": 18889 + }, + { + "epoch": 0.11234418117803788, + "grad_norm": 1.4660474061965942, + "learning_rate": 4.845917781400086e-05, + "loss": 5.6121, + "step": 18890 + }, + { + "epoch": 0.11235012846131887, + "grad_norm": 1.6739449501037598, + "learning_rate": 4.845901636186497e-05, + "loss": 5.6874, + "step": 18891 + }, + { + "epoch": 0.11235607574459987, + "grad_norm": 1.4542694091796875, + "learning_rate": 4.8458854901539794e-05, + "loss": 5.5956, + "step": 18892 + }, + { + "epoch": 0.11236202302788087, + "grad_norm": 1.3305023908615112, + "learning_rate": 4.8458693433025365e-05, + "loss": 5.658, + "step": 18893 + }, + { + "epoch": 0.11236797031116186, + "grad_norm": 1.8081300258636475, + "learning_rate": 4.845853195632175e-05, + "loss": 4.8563, + "step": 18894 + }, + { + "epoch": 0.11237391759444286, + "grad_norm": 1.8959764242172241, + "learning_rate": 4.8458370471429e-05, + "loss": 5.3051, + "step": 18895 + }, + { + "epoch": 0.11237986487772386, + "grad_norm": 1.9471427202224731, + "learning_rate": 4.845820897834718e-05, + "loss": 5.8181, + "step": 18896 + }, + { + "epoch": 0.11238581216100485, + "grad_norm": 1.6311548948287964, + "learning_rate": 4.845804747707634e-05, + "loss": 5.7714, + "step": 18897 + }, + { + "epoch": 0.11239175944428585, + "grad_norm": 1.830788493156433, + "learning_rate": 4.845788596761653e-05, + "loss": 5.9535, + "step": 18898 + }, + { + "epoch": 0.11239770672756685, + "grad_norm": 1.7896127700805664, + "learning_rate": 4.8457724449967836e-05, + "loss": 5.5385, + "step": 18899 + }, + { + "epoch": 0.11240365401084784, + "grad_norm": 1.5098718404769897, + "learning_rate": 4.845756292413027e-05, + "loss": 5.4067, + "step": 18900 + }, + { + "epoch": 0.11240960129412884, + "grad_norm": 1.9224756956100464, + "learning_rate": 4.845740139010392e-05, + "loss": 5.4863, + "step": 18901 + }, + { + "epoch": 0.11241554857740983, + "grad_norm": 2.1158740520477295, + "learning_rate": 4.845723984788884e-05, + "loss": 5.0745, + "step": 18902 + }, + { + "epoch": 0.11242149586069083, + "grad_norm": 2.292292594909668, + "learning_rate": 4.845707829748507e-05, + "loss": 4.9248, + "step": 18903 + }, + { + "epoch": 0.11242744314397184, + "grad_norm": 2.312593698501587, + "learning_rate": 4.8456916738892675e-05, + "loss": 4.9712, + "step": 18904 + }, + { + "epoch": 0.11243339042725282, + "grad_norm": 1.7302945852279663, + "learning_rate": 4.8456755172111725e-05, + "loss": 5.0814, + "step": 18905 + }, + { + "epoch": 0.11243933771053383, + "grad_norm": 1.3441206216812134, + "learning_rate": 4.845659359714225e-05, + "loss": 5.6563, + "step": 18906 + }, + { + "epoch": 0.11244528499381483, + "grad_norm": 1.5126272439956665, + "learning_rate": 4.845643201398433e-05, + "loss": 5.607, + "step": 18907 + }, + { + "epoch": 0.11245123227709582, + "grad_norm": 1.438795804977417, + "learning_rate": 4.845627042263801e-05, + "loss": 5.5287, + "step": 18908 + }, + { + "epoch": 0.11245717956037682, + "grad_norm": 1.6724447011947632, + "learning_rate": 4.845610882310335e-05, + "loss": 5.361, + "step": 18909 + }, + { + "epoch": 0.11246312684365782, + "grad_norm": 1.7267217636108398, + "learning_rate": 4.845594721538041e-05, + "loss": 5.6361, + "step": 18910 + }, + { + "epoch": 0.11246907412693881, + "grad_norm": 1.7616380453109741, + "learning_rate": 4.845578559946923e-05, + "loss": 5.2538, + "step": 18911 + }, + { + "epoch": 0.11247502141021981, + "grad_norm": 1.8318467140197754, + "learning_rate": 4.845562397536988e-05, + "loss": 4.8236, + "step": 18912 + }, + { + "epoch": 0.11248096869350081, + "grad_norm": 2.4882378578186035, + "learning_rate": 4.8455462343082415e-05, + "loss": 4.5624, + "step": 18913 + }, + { + "epoch": 0.1124869159767818, + "grad_norm": 2.5109870433807373, + "learning_rate": 4.845530070260689e-05, + "loss": 4.7906, + "step": 18914 + }, + { + "epoch": 0.1124928632600628, + "grad_norm": 2.2084672451019287, + "learning_rate": 4.845513905394336e-05, + "loss": 4.5304, + "step": 18915 + }, + { + "epoch": 0.1124988105433438, + "grad_norm": 2.4276058673858643, + "learning_rate": 4.8454977397091885e-05, + "loss": 4.3753, + "step": 18916 + }, + { + "epoch": 0.1125047578266248, + "grad_norm": 2.5022165775299072, + "learning_rate": 4.845481573205252e-05, + "loss": 4.1849, + "step": 18917 + }, + { + "epoch": 0.1125107051099058, + "grad_norm": 2.511643171310425, + "learning_rate": 4.845465405882532e-05, + "loss": 4.4007, + "step": 18918 + }, + { + "epoch": 0.1125166523931868, + "grad_norm": 2.598860263824463, + "learning_rate": 4.845449237741034e-05, + "loss": 4.6015, + "step": 18919 + }, + { + "epoch": 0.11252259967646779, + "grad_norm": 2.339555263519287, + "learning_rate": 4.845433068780765e-05, + "loss": 4.4123, + "step": 18920 + }, + { + "epoch": 0.11252854695974879, + "grad_norm": 2.286858320236206, + "learning_rate": 4.845416899001729e-05, + "loss": 4.3709, + "step": 18921 + }, + { + "epoch": 0.11253449424302979, + "grad_norm": 2.431622266769409, + "learning_rate": 4.845400728403932e-05, + "loss": 4.2162, + "step": 18922 + }, + { + "epoch": 0.11254044152631078, + "grad_norm": 2.7147364616394043, + "learning_rate": 4.8453845569873796e-05, + "loss": 4.3949, + "step": 18923 + }, + { + "epoch": 0.11254638880959178, + "grad_norm": 2.4738264083862305, + "learning_rate": 4.8453683847520784e-05, + "loss": 4.2671, + "step": 18924 + }, + { + "epoch": 0.11255233609287278, + "grad_norm": 2.007298707962036, + "learning_rate": 4.8453522116980325e-05, + "loss": 4.9317, + "step": 18925 + }, + { + "epoch": 0.11255828337615377, + "grad_norm": 1.8057860136032104, + "learning_rate": 4.8453360378252486e-05, + "loss": 5.4763, + "step": 18926 + }, + { + "epoch": 0.11256423065943477, + "grad_norm": 1.913892149925232, + "learning_rate": 4.845319863133733e-05, + "loss": 5.3112, + "step": 18927 + }, + { + "epoch": 0.11257017794271577, + "grad_norm": 1.6226540803909302, + "learning_rate": 4.845303687623489e-05, + "loss": 5.7164, + "step": 18928 + }, + { + "epoch": 0.11257612522599676, + "grad_norm": 1.7885600328445435, + "learning_rate": 4.8452875112945253e-05, + "loss": 5.7746, + "step": 18929 + }, + { + "epoch": 0.11258207250927776, + "grad_norm": 1.5598177909851074, + "learning_rate": 4.8452713341468444e-05, + "loss": 5.7843, + "step": 18930 + }, + { + "epoch": 0.11258801979255875, + "grad_norm": 1.517059564590454, + "learning_rate": 4.845255156180455e-05, + "loss": 5.7777, + "step": 18931 + }, + { + "epoch": 0.11259396707583975, + "grad_norm": 1.2515442371368408, + "learning_rate": 4.84523897739536e-05, + "loss": 5.7443, + "step": 18932 + }, + { + "epoch": 0.11259991435912076, + "grad_norm": 1.4970554113388062, + "learning_rate": 4.845222797791566e-05, + "loss": 5.6157, + "step": 18933 + }, + { + "epoch": 0.11260586164240174, + "grad_norm": 1.632620930671692, + "learning_rate": 4.8452066173690804e-05, + "loss": 5.0715, + "step": 18934 + }, + { + "epoch": 0.11261180892568275, + "grad_norm": 1.9634324312210083, + "learning_rate": 4.845190436127907e-05, + "loss": 5.3624, + "step": 18935 + }, + { + "epoch": 0.11261775620896375, + "grad_norm": 1.663560152053833, + "learning_rate": 4.8451742540680514e-05, + "loss": 5.4324, + "step": 18936 + }, + { + "epoch": 0.11262370349224474, + "grad_norm": 1.560684323310852, + "learning_rate": 4.84515807118952e-05, + "loss": 4.8426, + "step": 18937 + }, + { + "epoch": 0.11262965077552574, + "grad_norm": 1.5759334564208984, + "learning_rate": 4.8451418874923185e-05, + "loss": 5.6239, + "step": 18938 + }, + { + "epoch": 0.11263559805880674, + "grad_norm": 1.8501811027526855, + "learning_rate": 4.8451257029764504e-05, + "loss": 5.1734, + "step": 18939 + }, + { + "epoch": 0.11264154534208773, + "grad_norm": 1.811924934387207, + "learning_rate": 4.845109517641925e-05, + "loss": 5.2778, + "step": 18940 + }, + { + "epoch": 0.11264749262536873, + "grad_norm": 1.9684933423995972, + "learning_rate": 4.845093331488746e-05, + "loss": 5.3673, + "step": 18941 + }, + { + "epoch": 0.11265343990864973, + "grad_norm": 2.1155457496643066, + "learning_rate": 4.8450771445169185e-05, + "loss": 4.6955, + "step": 18942 + }, + { + "epoch": 0.11265938719193072, + "grad_norm": 2.117941379547119, + "learning_rate": 4.8450609567264495e-05, + "loss": 4.4051, + "step": 18943 + }, + { + "epoch": 0.11266533447521172, + "grad_norm": 1.9649946689605713, + "learning_rate": 4.845044768117343e-05, + "loss": 5.0204, + "step": 18944 + }, + { + "epoch": 0.11267128175849273, + "grad_norm": 1.898119568824768, + "learning_rate": 4.845028578689606e-05, + "loss": 4.9994, + "step": 18945 + }, + { + "epoch": 0.11267722904177371, + "grad_norm": 2.4376771450042725, + "learning_rate": 4.845012388443244e-05, + "loss": 4.6852, + "step": 18946 + }, + { + "epoch": 0.11268317632505472, + "grad_norm": 2.593094825744629, + "learning_rate": 4.844996197378262e-05, + "loss": 4.3845, + "step": 18947 + }, + { + "epoch": 0.11268912360833572, + "grad_norm": 2.6004302501678467, + "learning_rate": 4.844980005494666e-05, + "loss": 4.2989, + "step": 18948 + }, + { + "epoch": 0.1126950708916167, + "grad_norm": 2.4045653343200684, + "learning_rate": 4.844963812792462e-05, + "loss": 4.411, + "step": 18949 + }, + { + "epoch": 0.11270101817489771, + "grad_norm": 2.2256572246551514, + "learning_rate": 4.8449476192716555e-05, + "loss": 4.423, + "step": 18950 + }, + { + "epoch": 0.11270696545817871, + "grad_norm": 2.110077142715454, + "learning_rate": 4.844931424932252e-05, + "loss": 4.2971, + "step": 18951 + }, + { + "epoch": 0.1127129127414597, + "grad_norm": 1.8960111141204834, + "learning_rate": 4.844915229774257e-05, + "loss": 5.0758, + "step": 18952 + }, + { + "epoch": 0.1127188600247407, + "grad_norm": 1.998542308807373, + "learning_rate": 4.844899033797676e-05, + "loss": 4.8565, + "step": 18953 + }, + { + "epoch": 0.1127248073080217, + "grad_norm": 1.7070491313934326, + "learning_rate": 4.8448828370025156e-05, + "loss": 5.4684, + "step": 18954 + }, + { + "epoch": 0.11273075459130269, + "grad_norm": 2.062570095062256, + "learning_rate": 4.8448666393887806e-05, + "loss": 5.5384, + "step": 18955 + }, + { + "epoch": 0.11273670187458369, + "grad_norm": 1.8782148361206055, + "learning_rate": 4.844850440956476e-05, + "loss": 5.0373, + "step": 18956 + }, + { + "epoch": 0.1127426491578647, + "grad_norm": 2.3674817085266113, + "learning_rate": 4.8448342417056096e-05, + "loss": 5.1999, + "step": 18957 + }, + { + "epoch": 0.11274859644114568, + "grad_norm": 2.2243809700012207, + "learning_rate": 4.844818041636186e-05, + "loss": 5.3275, + "step": 18958 + }, + { + "epoch": 0.11275454372442668, + "grad_norm": 2.2929039001464844, + "learning_rate": 4.8448018407482096e-05, + "loss": 5.3958, + "step": 18959 + }, + { + "epoch": 0.11276049100770767, + "grad_norm": 2.0325045585632324, + "learning_rate": 4.844785639041688e-05, + "loss": 4.6686, + "step": 18960 + }, + { + "epoch": 0.11276643829098867, + "grad_norm": 1.8510624170303345, + "learning_rate": 4.8447694365166255e-05, + "loss": 4.9134, + "step": 18961 + }, + { + "epoch": 0.11277238557426968, + "grad_norm": 1.7537583112716675, + "learning_rate": 4.844753233173027e-05, + "loss": 5.0618, + "step": 18962 + }, + { + "epoch": 0.11277833285755066, + "grad_norm": 1.9293370246887207, + "learning_rate": 4.844737029010901e-05, + "loss": 4.8716, + "step": 18963 + }, + { + "epoch": 0.11278428014083167, + "grad_norm": 1.6931575536727905, + "learning_rate": 4.844720824030251e-05, + "loss": 5.4606, + "step": 18964 + }, + { + "epoch": 0.11279022742411267, + "grad_norm": 1.970825433731079, + "learning_rate": 4.8447046182310836e-05, + "loss": 5.2482, + "step": 18965 + }, + { + "epoch": 0.11279617470739366, + "grad_norm": 1.4842323064804077, + "learning_rate": 4.844688411613404e-05, + "loss": 5.972, + "step": 18966 + }, + { + "epoch": 0.11280212199067466, + "grad_norm": 1.84175705909729, + "learning_rate": 4.8446722041772174e-05, + "loss": 4.7696, + "step": 18967 + }, + { + "epoch": 0.11280806927395566, + "grad_norm": 1.8980286121368408, + "learning_rate": 4.84465599592253e-05, + "loss": 4.5125, + "step": 18968 + }, + { + "epoch": 0.11281401655723665, + "grad_norm": 1.7349838018417358, + "learning_rate": 4.844639786849348e-05, + "loss": 4.581, + "step": 18969 + }, + { + "epoch": 0.11281996384051765, + "grad_norm": 1.5894320011138916, + "learning_rate": 4.844623576957675e-05, + "loss": 4.9205, + "step": 18970 + }, + { + "epoch": 0.11282591112379865, + "grad_norm": 1.8740227222442627, + "learning_rate": 4.84460736624752e-05, + "loss": 4.938, + "step": 18971 + }, + { + "epoch": 0.11283185840707964, + "grad_norm": 1.744537591934204, + "learning_rate": 4.8445911547188854e-05, + "loss": 5.5215, + "step": 18972 + }, + { + "epoch": 0.11283780569036064, + "grad_norm": 1.5465041399002075, + "learning_rate": 4.844574942371779e-05, + "loss": 5.3607, + "step": 18973 + }, + { + "epoch": 0.11284375297364165, + "grad_norm": 1.8417413234710693, + "learning_rate": 4.8445587292062056e-05, + "loss": 5.632, + "step": 18974 + }, + { + "epoch": 0.11284970025692263, + "grad_norm": 1.7401045560836792, + "learning_rate": 4.8445425152221704e-05, + "loss": 5.5514, + "step": 18975 + }, + { + "epoch": 0.11285564754020364, + "grad_norm": 1.6192666292190552, + "learning_rate": 4.8445263004196805e-05, + "loss": 5.2694, + "step": 18976 + }, + { + "epoch": 0.11286159482348464, + "grad_norm": 1.842510461807251, + "learning_rate": 4.84451008479874e-05, + "loss": 5.3429, + "step": 18977 + }, + { + "epoch": 0.11286754210676563, + "grad_norm": 1.4824966192245483, + "learning_rate": 4.8444938683593554e-05, + "loss": 5.5212, + "step": 18978 + }, + { + "epoch": 0.11287348939004663, + "grad_norm": 1.7926548719406128, + "learning_rate": 4.8444776511015324e-05, + "loss": 4.8687, + "step": 18979 + }, + { + "epoch": 0.11287943667332763, + "grad_norm": 1.7114008665084839, + "learning_rate": 4.844461433025277e-05, + "loss": 4.7459, + "step": 18980 + }, + { + "epoch": 0.11288538395660862, + "grad_norm": 1.8884011507034302, + "learning_rate": 4.844445214130594e-05, + "loss": 5.1957, + "step": 18981 + }, + { + "epoch": 0.11289133123988962, + "grad_norm": 1.6901582479476929, + "learning_rate": 4.844428994417489e-05, + "loss": 5.3349, + "step": 18982 + }, + { + "epoch": 0.11289727852317062, + "grad_norm": 1.7148336172103882, + "learning_rate": 4.844412773885968e-05, + "loss": 5.4903, + "step": 18983 + }, + { + "epoch": 0.11290322580645161, + "grad_norm": 1.478767991065979, + "learning_rate": 4.844396552536037e-05, + "loss": 5.6339, + "step": 18984 + }, + { + "epoch": 0.11290917308973261, + "grad_norm": 1.5679733753204346, + "learning_rate": 4.844380330367701e-05, + "loss": 5.4722, + "step": 18985 + }, + { + "epoch": 0.11291512037301361, + "grad_norm": 1.718564510345459, + "learning_rate": 4.844364107380966e-05, + "loss": 5.2826, + "step": 18986 + }, + { + "epoch": 0.1129210676562946, + "grad_norm": 1.6757621765136719, + "learning_rate": 4.844347883575839e-05, + "loss": 5.7454, + "step": 18987 + }, + { + "epoch": 0.1129270149395756, + "grad_norm": 1.9370322227478027, + "learning_rate": 4.844331658952324e-05, + "loss": 4.6631, + "step": 18988 + }, + { + "epoch": 0.1129329622228566, + "grad_norm": 1.9932162761688232, + "learning_rate": 4.844315433510426e-05, + "loss": 4.7486, + "step": 18989 + }, + { + "epoch": 0.1129389095061376, + "grad_norm": 2.0191309452056885, + "learning_rate": 4.844299207250152e-05, + "loss": 4.6999, + "step": 18990 + }, + { + "epoch": 0.1129448567894186, + "grad_norm": 1.971913456916809, + "learning_rate": 4.8442829801715074e-05, + "loss": 4.7345, + "step": 18991 + }, + { + "epoch": 0.11295080407269958, + "grad_norm": 1.8503371477127075, + "learning_rate": 4.844266752274498e-05, + "loss": 4.5352, + "step": 18992 + }, + { + "epoch": 0.11295675135598059, + "grad_norm": 2.0024712085723877, + "learning_rate": 4.8442505235591294e-05, + "loss": 4.6513, + "step": 18993 + }, + { + "epoch": 0.11296269863926159, + "grad_norm": 1.645996332168579, + "learning_rate": 4.844234294025407e-05, + "loss": 4.816, + "step": 18994 + }, + { + "epoch": 0.11296864592254258, + "grad_norm": 1.6649290323257446, + "learning_rate": 4.844218063673337e-05, + "loss": 5.1471, + "step": 18995 + }, + { + "epoch": 0.11297459320582358, + "grad_norm": 1.4211794137954712, + "learning_rate": 4.844201832502924e-05, + "loss": 5.0807, + "step": 18996 + }, + { + "epoch": 0.11298054048910458, + "grad_norm": 1.6982463598251343, + "learning_rate": 4.844185600514175e-05, + "loss": 4.9912, + "step": 18997 + }, + { + "epoch": 0.11298648777238557, + "grad_norm": 1.5852501392364502, + "learning_rate": 4.844169367707095e-05, + "loss": 5.4541, + "step": 18998 + }, + { + "epoch": 0.11299243505566657, + "grad_norm": 1.787331223487854, + "learning_rate": 4.844153134081689e-05, + "loss": 5.4295, + "step": 18999 + }, + { + "epoch": 0.11299838233894757, + "grad_norm": 1.5758492946624756, + "learning_rate": 4.844136899637964e-05, + "loss": 5.2601, + "step": 19000 + }, + { + "epoch": 0.11300432962222856, + "grad_norm": 1.5441172122955322, + "learning_rate": 4.844120664375925e-05, + "loss": 4.882, + "step": 19001 + }, + { + "epoch": 0.11301027690550956, + "grad_norm": 1.6587432622909546, + "learning_rate": 4.8441044282955774e-05, + "loss": 4.8311, + "step": 19002 + }, + { + "epoch": 0.11301622418879056, + "grad_norm": 1.6563838720321655, + "learning_rate": 4.844088191396927e-05, + "loss": 4.87, + "step": 19003 + }, + { + "epoch": 0.11302217147207155, + "grad_norm": 1.7367866039276123, + "learning_rate": 4.84407195367998e-05, + "loss": 5.2984, + "step": 19004 + }, + { + "epoch": 0.11302811875535256, + "grad_norm": 2.3307883739471436, + "learning_rate": 4.844055715144742e-05, + "loss": 4.8798, + "step": 19005 + }, + { + "epoch": 0.11303406603863356, + "grad_norm": 2.601762294769287, + "learning_rate": 4.844039475791218e-05, + "loss": 4.8156, + "step": 19006 + }, + { + "epoch": 0.11304001332191455, + "grad_norm": 2.372610330581665, + "learning_rate": 4.844023235619414e-05, + "loss": 4.0715, + "step": 19007 + }, + { + "epoch": 0.11304596060519555, + "grad_norm": 2.16119384765625, + "learning_rate": 4.8440069946293356e-05, + "loss": 4.2701, + "step": 19008 + }, + { + "epoch": 0.11305190788847655, + "grad_norm": 2.1576502323150635, + "learning_rate": 4.843990752820989e-05, + "loss": 4.1302, + "step": 19009 + }, + { + "epoch": 0.11305785517175754, + "grad_norm": 2.122025489807129, + "learning_rate": 4.843974510194379e-05, + "loss": 4.0969, + "step": 19010 + }, + { + "epoch": 0.11306380245503854, + "grad_norm": 2.1929194927215576, + "learning_rate": 4.843958266749512e-05, + "loss": 4.2054, + "step": 19011 + }, + { + "epoch": 0.11306974973831954, + "grad_norm": 2.6305301189422607, + "learning_rate": 4.843942022486393e-05, + "loss": 4.3942, + "step": 19012 + }, + { + "epoch": 0.11307569702160053, + "grad_norm": 2.5355119705200195, + "learning_rate": 4.843925777405028e-05, + "loss": 4.4392, + "step": 19013 + }, + { + "epoch": 0.11308164430488153, + "grad_norm": 2.5040411949157715, + "learning_rate": 4.843909531505424e-05, + "loss": 4.221, + "step": 19014 + }, + { + "epoch": 0.11308759158816253, + "grad_norm": 2.15824556350708, + "learning_rate": 4.843893284787584e-05, + "loss": 4.8255, + "step": 19015 + }, + { + "epoch": 0.11309353887144352, + "grad_norm": 1.6300889253616333, + "learning_rate": 4.8438770372515155e-05, + "loss": 5.3668, + "step": 19016 + }, + { + "epoch": 0.11309948615472452, + "grad_norm": 1.745676875114441, + "learning_rate": 4.8438607888972245e-05, + "loss": 5.2858, + "step": 19017 + }, + { + "epoch": 0.11310543343800553, + "grad_norm": 1.6511434316635132, + "learning_rate": 4.8438445397247146e-05, + "loss": 5.2856, + "step": 19018 + }, + { + "epoch": 0.11311138072128651, + "grad_norm": 1.6282720565795898, + "learning_rate": 4.843828289733994e-05, + "loss": 5.7748, + "step": 19019 + }, + { + "epoch": 0.11311732800456752, + "grad_norm": 1.6303821802139282, + "learning_rate": 4.843812038925066e-05, + "loss": 5.3627, + "step": 19020 + }, + { + "epoch": 0.1131232752878485, + "grad_norm": 1.5684829950332642, + "learning_rate": 4.843795787297938e-05, + "loss": 5.6563, + "step": 19021 + }, + { + "epoch": 0.1131292225711295, + "grad_norm": 1.9084935188293457, + "learning_rate": 4.843779534852615e-05, + "loss": 5.7084, + "step": 19022 + }, + { + "epoch": 0.11313516985441051, + "grad_norm": 1.5176855325698853, + "learning_rate": 4.843763281589103e-05, + "loss": 5.7602, + "step": 19023 + }, + { + "epoch": 0.1131411171376915, + "grad_norm": 1.3877767324447632, + "learning_rate": 4.843747027507407e-05, + "loss": 5.4914, + "step": 19024 + }, + { + "epoch": 0.1131470644209725, + "grad_norm": 2.0801119804382324, + "learning_rate": 4.843730772607533e-05, + "loss": 4.8814, + "step": 19025 + }, + { + "epoch": 0.1131530117042535, + "grad_norm": 1.9673620462417603, + "learning_rate": 4.8437145168894874e-05, + "loss": 4.9423, + "step": 19026 + }, + { + "epoch": 0.11315895898753449, + "grad_norm": 1.5284085273742676, + "learning_rate": 4.8436982603532755e-05, + "loss": 5.0471, + "step": 19027 + }, + { + "epoch": 0.11316490627081549, + "grad_norm": 1.870762825012207, + "learning_rate": 4.8436820029989023e-05, + "loss": 4.9376, + "step": 19028 + }, + { + "epoch": 0.11317085355409649, + "grad_norm": 1.9094692468643188, + "learning_rate": 4.843665744826374e-05, + "loss": 4.8677, + "step": 19029 + }, + { + "epoch": 0.11317680083737748, + "grad_norm": 1.6463623046875, + "learning_rate": 4.8436494858356964e-05, + "loss": 5.3397, + "step": 19030 + }, + { + "epoch": 0.11318274812065848, + "grad_norm": 1.8127562999725342, + "learning_rate": 4.8436332260268745e-05, + "loss": 5.1626, + "step": 19031 + }, + { + "epoch": 0.11318869540393948, + "grad_norm": 1.5196025371551514, + "learning_rate": 4.8436169653999144e-05, + "loss": 5.1213, + "step": 19032 + }, + { + "epoch": 0.11319464268722047, + "grad_norm": 1.8930630683898926, + "learning_rate": 4.843600703954823e-05, + "loss": 4.8268, + "step": 19033 + }, + { + "epoch": 0.11320058997050148, + "grad_norm": 2.1579136848449707, + "learning_rate": 4.843584441691603e-05, + "loss": 5.6111, + "step": 19034 + }, + { + "epoch": 0.11320653725378248, + "grad_norm": 1.7644915580749512, + "learning_rate": 4.8435681786102624e-05, + "loss": 5.5762, + "step": 19035 + }, + { + "epoch": 0.11321248453706347, + "grad_norm": 1.5442852973937988, + "learning_rate": 4.843551914710808e-05, + "loss": 5.6486, + "step": 19036 + }, + { + "epoch": 0.11321843182034447, + "grad_norm": 1.823852777481079, + "learning_rate": 4.843535649993242e-05, + "loss": 5.6581, + "step": 19037 + }, + { + "epoch": 0.11322437910362547, + "grad_norm": 1.5850268602371216, + "learning_rate": 4.8435193844575726e-05, + "loss": 5.6351, + "step": 19038 + }, + { + "epoch": 0.11323032638690646, + "grad_norm": 1.6234556436538696, + "learning_rate": 4.843503118103805e-05, + "loss": 5.5462, + "step": 19039 + }, + { + "epoch": 0.11323627367018746, + "grad_norm": 1.602618932723999, + "learning_rate": 4.843486850931944e-05, + "loss": 5.2935, + "step": 19040 + }, + { + "epoch": 0.11324222095346846, + "grad_norm": 1.6808282136917114, + "learning_rate": 4.843470582941997e-05, + "loss": 5.2254, + "step": 19041 + }, + { + "epoch": 0.11324816823674945, + "grad_norm": 1.6311568021774292, + "learning_rate": 4.8434543141339674e-05, + "loss": 5.1894, + "step": 19042 + }, + { + "epoch": 0.11325411552003045, + "grad_norm": 1.5836867094039917, + "learning_rate": 4.843438044507863e-05, + "loss": 5.6344, + "step": 19043 + }, + { + "epoch": 0.11326006280331145, + "grad_norm": 1.5654397010803223, + "learning_rate": 4.843421774063688e-05, + "loss": 5.2902, + "step": 19044 + }, + { + "epoch": 0.11326601008659244, + "grad_norm": 2.3957626819610596, + "learning_rate": 4.843405502801449e-05, + "loss": 4.812, + "step": 19045 + }, + { + "epoch": 0.11327195736987344, + "grad_norm": 2.123473644256592, + "learning_rate": 4.843389230721151e-05, + "loss": 4.6399, + "step": 19046 + }, + { + "epoch": 0.11327790465315445, + "grad_norm": 1.6691471338272095, + "learning_rate": 4.8433729578228007e-05, + "loss": 4.9337, + "step": 19047 + }, + { + "epoch": 0.11328385193643543, + "grad_norm": 1.6179373264312744, + "learning_rate": 4.8433566841064025e-05, + "loss": 5.1002, + "step": 19048 + }, + { + "epoch": 0.11328979921971644, + "grad_norm": 1.658995270729065, + "learning_rate": 4.843340409571963e-05, + "loss": 5.0397, + "step": 19049 + }, + { + "epoch": 0.11329574650299742, + "grad_norm": 2.0216362476348877, + "learning_rate": 4.843324134219488e-05, + "loss": 5.3112, + "step": 19050 + }, + { + "epoch": 0.11330169378627843, + "grad_norm": 2.0376546382904053, + "learning_rate": 4.843307858048982e-05, + "loss": 5.087, + "step": 19051 + }, + { + "epoch": 0.11330764106955943, + "grad_norm": 2.2038021087646484, + "learning_rate": 4.8432915810604516e-05, + "loss": 4.951, + "step": 19052 + }, + { + "epoch": 0.11331358835284042, + "grad_norm": 1.8985834121704102, + "learning_rate": 4.843275303253903e-05, + "loss": 5.522, + "step": 19053 + }, + { + "epoch": 0.11331953563612142, + "grad_norm": 1.9047077894210815, + "learning_rate": 4.8432590246293404e-05, + "loss": 5.8387, + "step": 19054 + }, + { + "epoch": 0.11332548291940242, + "grad_norm": 1.508352279663086, + "learning_rate": 4.8432427451867704e-05, + "loss": 5.7969, + "step": 19055 + }, + { + "epoch": 0.11333143020268341, + "grad_norm": 1.631695032119751, + "learning_rate": 4.8432264649261984e-05, + "loss": 5.3562, + "step": 19056 + }, + { + "epoch": 0.11333737748596441, + "grad_norm": 1.673411250114441, + "learning_rate": 4.8432101838476305e-05, + "loss": 5.3286, + "step": 19057 + }, + { + "epoch": 0.11334332476924541, + "grad_norm": 2.697946071624756, + "learning_rate": 4.843193901951072e-05, + "loss": 5.0634, + "step": 19058 + }, + { + "epoch": 0.1133492720525264, + "grad_norm": 2.5914673805236816, + "learning_rate": 4.843177619236529e-05, + "loss": 4.8294, + "step": 19059 + }, + { + "epoch": 0.1133552193358074, + "grad_norm": 1.8503727912902832, + "learning_rate": 4.843161335704007e-05, + "loss": 5.1436, + "step": 19060 + }, + { + "epoch": 0.1133611666190884, + "grad_norm": 1.7629435062408447, + "learning_rate": 4.843145051353511e-05, + "loss": 5.1822, + "step": 19061 + }, + { + "epoch": 0.11336711390236939, + "grad_norm": 1.826360821723938, + "learning_rate": 4.843128766185048e-05, + "loss": 5.5151, + "step": 19062 + }, + { + "epoch": 0.1133730611856504, + "grad_norm": 2.0347046852111816, + "learning_rate": 4.843112480198623e-05, + "loss": 4.7732, + "step": 19063 + }, + { + "epoch": 0.1133790084689314, + "grad_norm": 2.037482738494873, + "learning_rate": 4.843096193394241e-05, + "loss": 4.6475, + "step": 19064 + }, + { + "epoch": 0.11338495575221239, + "grad_norm": 2.1152050495147705, + "learning_rate": 4.8430799057719076e-05, + "loss": 4.531, + "step": 19065 + }, + { + "epoch": 0.11339090303549339, + "grad_norm": 2.303982734680176, + "learning_rate": 4.8430636173316306e-05, + "loss": 4.6317, + "step": 19066 + }, + { + "epoch": 0.11339685031877439, + "grad_norm": 2.3326570987701416, + "learning_rate": 4.843047328073414e-05, + "loss": 4.736, + "step": 19067 + }, + { + "epoch": 0.11340279760205538, + "grad_norm": 2.371316909790039, + "learning_rate": 4.8430310379972634e-05, + "loss": 4.806, + "step": 19068 + }, + { + "epoch": 0.11340874488533638, + "grad_norm": 2.5370912551879883, + "learning_rate": 4.8430147471031855e-05, + "loss": 4.7867, + "step": 19069 + }, + { + "epoch": 0.11341469216861738, + "grad_norm": 2.456982135772705, + "learning_rate": 4.842998455391185e-05, + "loss": 4.6942, + "step": 19070 + }, + { + "epoch": 0.11342063945189837, + "grad_norm": 2.526287078857422, + "learning_rate": 4.842982162861268e-05, + "loss": 4.7333, + "step": 19071 + }, + { + "epoch": 0.11342658673517937, + "grad_norm": 2.2763514518737793, + "learning_rate": 4.84296586951344e-05, + "loss": 4.712, + "step": 19072 + }, + { + "epoch": 0.11343253401846037, + "grad_norm": 2.330958366394043, + "learning_rate": 4.842949575347707e-05, + "loss": 4.5875, + "step": 19073 + }, + { + "epoch": 0.11343848130174136, + "grad_norm": 2.390018939971924, + "learning_rate": 4.8429332803640745e-05, + "loss": 4.6941, + "step": 19074 + }, + { + "epoch": 0.11344442858502236, + "grad_norm": 2.279719829559326, + "learning_rate": 4.842916984562548e-05, + "loss": 4.6216, + "step": 19075 + }, + { + "epoch": 0.11345037586830337, + "grad_norm": 2.2815043926239014, + "learning_rate": 4.842900687943133e-05, + "loss": 4.5667, + "step": 19076 + }, + { + "epoch": 0.11345632315158435, + "grad_norm": 2.301231861114502, + "learning_rate": 4.842884390505836e-05, + "loss": 4.5451, + "step": 19077 + }, + { + "epoch": 0.11346227043486536, + "grad_norm": 2.1763200759887695, + "learning_rate": 4.842868092250662e-05, + "loss": 4.5937, + "step": 19078 + }, + { + "epoch": 0.11346821771814634, + "grad_norm": 2.2151448726654053, + "learning_rate": 4.842851793177618e-05, + "loss": 4.8341, + "step": 19079 + }, + { + "epoch": 0.11347416500142735, + "grad_norm": 2.3094639778137207, + "learning_rate": 4.8428354932867085e-05, + "loss": 4.7308, + "step": 19080 + }, + { + "epoch": 0.11348011228470835, + "grad_norm": 1.5218987464904785, + "learning_rate": 4.8428191925779385e-05, + "loss": 5.2701, + "step": 19081 + }, + { + "epoch": 0.11348605956798934, + "grad_norm": 1.3781639337539673, + "learning_rate": 4.842802891051315e-05, + "loss": 5.6873, + "step": 19082 + }, + { + "epoch": 0.11349200685127034, + "grad_norm": 1.814702033996582, + "learning_rate": 4.842786588706842e-05, + "loss": 5.7713, + "step": 19083 + }, + { + "epoch": 0.11349795413455134, + "grad_norm": 1.5691754817962646, + "learning_rate": 4.842770285544528e-05, + "loss": 5.7115, + "step": 19084 + }, + { + "epoch": 0.11350390141783233, + "grad_norm": 1.962762713432312, + "learning_rate": 4.8427539815643766e-05, + "loss": 5.4159, + "step": 19085 + }, + { + "epoch": 0.11350984870111333, + "grad_norm": 1.6766527891159058, + "learning_rate": 4.842737676766393e-05, + "loss": 5.6007, + "step": 19086 + }, + { + "epoch": 0.11351579598439433, + "grad_norm": 1.782934308052063, + "learning_rate": 4.8427213711505844e-05, + "loss": 5.982, + "step": 19087 + }, + { + "epoch": 0.11352174326767532, + "grad_norm": 1.5706422328948975, + "learning_rate": 4.842705064716957e-05, + "loss": 5.5125, + "step": 19088 + }, + { + "epoch": 0.11352769055095632, + "grad_norm": 2.4957141876220703, + "learning_rate": 4.842688757465515e-05, + "loss": 4.5386, + "step": 19089 + }, + { + "epoch": 0.11353363783423732, + "grad_norm": 2.1444833278656006, + "learning_rate": 4.842672449396264e-05, + "loss": 4.6108, + "step": 19090 + }, + { + "epoch": 0.11353958511751831, + "grad_norm": 2.4586305618286133, + "learning_rate": 4.8426561405092106e-05, + "loss": 4.7453, + "step": 19091 + }, + { + "epoch": 0.11354553240079931, + "grad_norm": 2.228759765625, + "learning_rate": 4.8426398308043605e-05, + "loss": 4.662, + "step": 19092 + }, + { + "epoch": 0.11355147968408032, + "grad_norm": 2.029172420501709, + "learning_rate": 4.8426235202817184e-05, + "loss": 4.6389, + "step": 19093 + }, + { + "epoch": 0.1135574269673613, + "grad_norm": 2.1887340545654297, + "learning_rate": 4.842607208941291e-05, + "loss": 4.6852, + "step": 19094 + }, + { + "epoch": 0.11356337425064231, + "grad_norm": 1.7664849758148193, + "learning_rate": 4.842590896783084e-05, + "loss": 5.2435, + "step": 19095 + }, + { + "epoch": 0.11356932153392331, + "grad_norm": 1.5581247806549072, + "learning_rate": 4.8425745838071016e-05, + "loss": 5.6828, + "step": 19096 + }, + { + "epoch": 0.1135752688172043, + "grad_norm": 1.570602297782898, + "learning_rate": 4.842558270013352e-05, + "loss": 5.7011, + "step": 19097 + }, + { + "epoch": 0.1135812161004853, + "grad_norm": 1.4669830799102783, + "learning_rate": 4.842541955401838e-05, + "loss": 5.4361, + "step": 19098 + }, + { + "epoch": 0.1135871633837663, + "grad_norm": 1.199173927307129, + "learning_rate": 4.842525639972568e-05, + "loss": 5.5198, + "step": 19099 + }, + { + "epoch": 0.11359311066704729, + "grad_norm": 1.1747777462005615, + "learning_rate": 4.842509323725546e-05, + "loss": 5.6252, + "step": 19100 + }, + { + "epoch": 0.11359905795032829, + "grad_norm": 1.4497981071472168, + "learning_rate": 4.8424930066607784e-05, + "loss": 5.4295, + "step": 19101 + }, + { + "epoch": 0.1136050052336093, + "grad_norm": 1.485688328742981, + "learning_rate": 4.8424766887782704e-05, + "loss": 5.1248, + "step": 19102 + }, + { + "epoch": 0.11361095251689028, + "grad_norm": 1.419149398803711, + "learning_rate": 4.842460370078028e-05, + "loss": 5.0604, + "step": 19103 + }, + { + "epoch": 0.11361689980017128, + "grad_norm": 1.622096300125122, + "learning_rate": 4.842444050560058e-05, + "loss": 5.4429, + "step": 19104 + }, + { + "epoch": 0.11362284708345229, + "grad_norm": 1.2471072673797607, + "learning_rate": 4.8424277302243636e-05, + "loss": 5.3636, + "step": 19105 + }, + { + "epoch": 0.11362879436673327, + "grad_norm": 1.3416316509246826, + "learning_rate": 4.842411409070952e-05, + "loss": 5.1415, + "step": 19106 + }, + { + "epoch": 0.11363474165001428, + "grad_norm": 1.3691420555114746, + "learning_rate": 4.8423950870998293e-05, + "loss": 5.3286, + "step": 19107 + }, + { + "epoch": 0.11364068893329526, + "grad_norm": 1.2382487058639526, + "learning_rate": 4.842378764311e-05, + "loss": 5.4391, + "step": 19108 + }, + { + "epoch": 0.11364663621657627, + "grad_norm": 1.1729276180267334, + "learning_rate": 4.842362440704471e-05, + "loss": 5.4158, + "step": 19109 + }, + { + "epoch": 0.11365258349985727, + "grad_norm": 1.2451897859573364, + "learning_rate": 4.842346116280247e-05, + "loss": 5.2487, + "step": 19110 + }, + { + "epoch": 0.11365853078313826, + "grad_norm": 1.255652666091919, + "learning_rate": 4.8423297910383354e-05, + "loss": 5.2759, + "step": 19111 + }, + { + "epoch": 0.11366447806641926, + "grad_norm": 1.170296549797058, + "learning_rate": 4.8423134649787394e-05, + "loss": 5.1508, + "step": 19112 + }, + { + "epoch": 0.11367042534970026, + "grad_norm": 1.3954061269760132, + "learning_rate": 4.842297138101467e-05, + "loss": 5.3102, + "step": 19113 + }, + { + "epoch": 0.11367637263298125, + "grad_norm": 1.2746593952178955, + "learning_rate": 4.842280810406522e-05, + "loss": 5.2587, + "step": 19114 + }, + { + "epoch": 0.11368231991626225, + "grad_norm": 1.3224173784255981, + "learning_rate": 4.8422644818939114e-05, + "loss": 5.1927, + "step": 19115 + }, + { + "epoch": 0.11368826719954325, + "grad_norm": 1.0930812358856201, + "learning_rate": 4.84224815256364e-05, + "loss": 5.1676, + "step": 19116 + }, + { + "epoch": 0.11369421448282424, + "grad_norm": 1.3805547952651978, + "learning_rate": 4.842231822415715e-05, + "loss": 5.066, + "step": 19117 + }, + { + "epoch": 0.11370016176610524, + "grad_norm": 1.3455450534820557, + "learning_rate": 4.84221549145014e-05, + "loss": 4.9656, + "step": 19118 + }, + { + "epoch": 0.11370610904938624, + "grad_norm": 1.442218542098999, + "learning_rate": 4.842199159666922e-05, + "loss": 4.9094, + "step": 19119 + }, + { + "epoch": 0.11371205633266723, + "grad_norm": 1.435941457748413, + "learning_rate": 4.8421828270660665e-05, + "loss": 5.1035, + "step": 19120 + }, + { + "epoch": 0.11371800361594823, + "grad_norm": 1.2507586479187012, + "learning_rate": 4.84216649364758e-05, + "loss": 5.2395, + "step": 19121 + }, + { + "epoch": 0.11372395089922924, + "grad_norm": 1.3616739511489868, + "learning_rate": 4.842150159411466e-05, + "loss": 5.2082, + "step": 19122 + }, + { + "epoch": 0.11372989818251023, + "grad_norm": 1.2988322973251343, + "learning_rate": 4.842133824357732e-05, + "loss": 5.1271, + "step": 19123 + }, + { + "epoch": 0.11373584546579123, + "grad_norm": 1.2761636972427368, + "learning_rate": 4.842117488486384e-05, + "loss": 5.1724, + "step": 19124 + }, + { + "epoch": 0.11374179274907223, + "grad_norm": 1.2834585905075073, + "learning_rate": 4.842101151797426e-05, + "loss": 5.2256, + "step": 19125 + }, + { + "epoch": 0.11374774003235322, + "grad_norm": 1.2074506282806396, + "learning_rate": 4.8420848142908655e-05, + "loss": 5.2704, + "step": 19126 + }, + { + "epoch": 0.11375368731563422, + "grad_norm": 1.355292797088623, + "learning_rate": 4.842068475966707e-05, + "loss": 5.1109, + "step": 19127 + }, + { + "epoch": 0.11375963459891522, + "grad_norm": 1.1144691705703735, + "learning_rate": 4.8420521368249565e-05, + "loss": 5.0903, + "step": 19128 + }, + { + "epoch": 0.11376558188219621, + "grad_norm": 1.3889878988265991, + "learning_rate": 4.84203579686562e-05, + "loss": 5.1289, + "step": 19129 + }, + { + "epoch": 0.11377152916547721, + "grad_norm": 1.1302597522735596, + "learning_rate": 4.8420194560887035e-05, + "loss": 4.9211, + "step": 19130 + }, + { + "epoch": 0.11377747644875821, + "grad_norm": 1.1715654134750366, + "learning_rate": 4.8420031144942115e-05, + "loss": 5.2239, + "step": 19131 + }, + { + "epoch": 0.1137834237320392, + "grad_norm": 1.327021837234497, + "learning_rate": 4.84198677208215e-05, + "loss": 5.2941, + "step": 19132 + }, + { + "epoch": 0.1137893710153202, + "grad_norm": 1.3442116975784302, + "learning_rate": 4.841970428852526e-05, + "loss": 5.1752, + "step": 19133 + }, + { + "epoch": 0.1137953182986012, + "grad_norm": 1.207207202911377, + "learning_rate": 4.841954084805344e-05, + "loss": 4.9607, + "step": 19134 + }, + { + "epoch": 0.1138012655818822, + "grad_norm": 1.1609065532684326, + "learning_rate": 4.8419377399406104e-05, + "loss": 5.0458, + "step": 19135 + }, + { + "epoch": 0.1138072128651632, + "grad_norm": 1.365605115890503, + "learning_rate": 4.84192139425833e-05, + "loss": 5.0884, + "step": 19136 + }, + { + "epoch": 0.11381316014844418, + "grad_norm": 1.5192269086837769, + "learning_rate": 4.8419050477585096e-05, + "loss": 5.4803, + "step": 19137 + }, + { + "epoch": 0.11381910743172519, + "grad_norm": 1.187456488609314, + "learning_rate": 4.841888700441153e-05, + "loss": 5.4595, + "step": 19138 + }, + { + "epoch": 0.11382505471500619, + "grad_norm": 1.1836395263671875, + "learning_rate": 4.841872352306268e-05, + "loss": 5.27, + "step": 19139 + }, + { + "epoch": 0.11383100199828718, + "grad_norm": 1.353762149810791, + "learning_rate": 4.841856003353861e-05, + "loss": 5.4646, + "step": 19140 + }, + { + "epoch": 0.11383694928156818, + "grad_norm": 1.4854416847229004, + "learning_rate": 4.8418396535839344e-05, + "loss": 5.2894, + "step": 19141 + }, + { + "epoch": 0.11384289656484918, + "grad_norm": 1.3731143474578857, + "learning_rate": 4.841823302996496e-05, + "loss": 4.7512, + "step": 19142 + }, + { + "epoch": 0.11384884384813017, + "grad_norm": 1.3945658206939697, + "learning_rate": 4.841806951591552e-05, + "loss": 4.9625, + "step": 19143 + }, + { + "epoch": 0.11385479113141117, + "grad_norm": 1.2692869901657104, + "learning_rate": 4.841790599369107e-05, + "loss": 5.2245, + "step": 19144 + }, + { + "epoch": 0.11386073841469217, + "grad_norm": 1.3667423725128174, + "learning_rate": 4.8417742463291674e-05, + "loss": 5.202, + "step": 19145 + }, + { + "epoch": 0.11386668569797316, + "grad_norm": 1.2639939785003662, + "learning_rate": 4.8417578924717377e-05, + "loss": 5.4378, + "step": 19146 + }, + { + "epoch": 0.11387263298125416, + "grad_norm": 1.327867865562439, + "learning_rate": 4.8417415377968255e-05, + "loss": 5.1632, + "step": 19147 + }, + { + "epoch": 0.11387858026453516, + "grad_norm": 1.2095093727111816, + "learning_rate": 4.841725182304435e-05, + "loss": 4.9969, + "step": 19148 + }, + { + "epoch": 0.11388452754781615, + "grad_norm": 1.3395425081253052, + "learning_rate": 4.841708825994573e-05, + "loss": 5.1797, + "step": 19149 + }, + { + "epoch": 0.11389047483109715, + "grad_norm": 1.4817496538162231, + "learning_rate": 4.841692468867244e-05, + "loss": 5.1126, + "step": 19150 + }, + { + "epoch": 0.11389642211437816, + "grad_norm": 1.3066308498382568, + "learning_rate": 4.8416761109224547e-05, + "loss": 5.2692, + "step": 19151 + }, + { + "epoch": 0.11390236939765915, + "grad_norm": 1.444701075553894, + "learning_rate": 4.84165975216021e-05, + "loss": 5.0525, + "step": 19152 + }, + { + "epoch": 0.11390831668094015, + "grad_norm": 1.2720032930374146, + "learning_rate": 4.8416433925805165e-05, + "loss": 5.138, + "step": 19153 + }, + { + "epoch": 0.11391426396422115, + "grad_norm": 1.2228437662124634, + "learning_rate": 4.84162703218338e-05, + "loss": 5.028, + "step": 19154 + }, + { + "epoch": 0.11392021124750214, + "grad_norm": 1.1950013637542725, + "learning_rate": 4.841610670968805e-05, + "loss": 5.0873, + "step": 19155 + }, + { + "epoch": 0.11392615853078314, + "grad_norm": 1.3538236618041992, + "learning_rate": 4.8415943089367976e-05, + "loss": 5.0039, + "step": 19156 + }, + { + "epoch": 0.11393210581406414, + "grad_norm": 1.3344488143920898, + "learning_rate": 4.841577946087364e-05, + "loss": 5.0215, + "step": 19157 + }, + { + "epoch": 0.11393805309734513, + "grad_norm": 1.7098866701126099, + "learning_rate": 4.841561582420511e-05, + "loss": 5.5719, + "step": 19158 + }, + { + "epoch": 0.11394400038062613, + "grad_norm": 1.3574185371398926, + "learning_rate": 4.841545217936241e-05, + "loss": 4.8491, + "step": 19159 + }, + { + "epoch": 0.11394994766390713, + "grad_norm": 1.447292447090149, + "learning_rate": 4.8415288526345634e-05, + "loss": 4.8632, + "step": 19160 + }, + { + "epoch": 0.11395589494718812, + "grad_norm": 1.6439673900604248, + "learning_rate": 4.841512486515481e-05, + "loss": 5.282, + "step": 19161 + }, + { + "epoch": 0.11396184223046912, + "grad_norm": 1.3063132762908936, + "learning_rate": 4.841496119579002e-05, + "loss": 5.0399, + "step": 19162 + }, + { + "epoch": 0.11396778951375013, + "grad_norm": 1.4244173765182495, + "learning_rate": 4.8414797518251296e-05, + "loss": 4.7731, + "step": 19163 + }, + { + "epoch": 0.11397373679703111, + "grad_norm": 1.225203514099121, + "learning_rate": 4.841463383253872e-05, + "loss": 4.8294, + "step": 19164 + }, + { + "epoch": 0.11397968408031212, + "grad_norm": 1.2978007793426514, + "learning_rate": 4.8414470138652334e-05, + "loss": 4.6336, + "step": 19165 + }, + { + "epoch": 0.1139856313635931, + "grad_norm": 1.306591272354126, + "learning_rate": 4.8414306436592194e-05, + "loss": 4.8267, + "step": 19166 + }, + { + "epoch": 0.1139915786468741, + "grad_norm": 1.1227960586547852, + "learning_rate": 4.841414272635837e-05, + "loss": 4.7438, + "step": 19167 + }, + { + "epoch": 0.11399752593015511, + "grad_norm": 1.3674911260604858, + "learning_rate": 4.8413979007950905e-05, + "loss": 4.8127, + "step": 19168 + }, + { + "epoch": 0.1140034732134361, + "grad_norm": 1.3923397064208984, + "learning_rate": 4.841381528136986e-05, + "loss": 5.1568, + "step": 19169 + }, + { + "epoch": 0.1140094204967171, + "grad_norm": 1.2014738321304321, + "learning_rate": 4.84136515466153e-05, + "loss": 5.0116, + "step": 19170 + }, + { + "epoch": 0.1140153677799981, + "grad_norm": 1.3564008474349976, + "learning_rate": 4.841348780368726e-05, + "loss": 5.1181, + "step": 19171 + }, + { + "epoch": 0.11402131506327909, + "grad_norm": 1.1918834447860718, + "learning_rate": 4.841332405258583e-05, + "loss": 5.0854, + "step": 19172 + }, + { + "epoch": 0.11402726234656009, + "grad_norm": 1.2056841850280762, + "learning_rate": 4.8413160293311047e-05, + "loss": 4.825, + "step": 19173 + }, + { + "epoch": 0.11403320962984109, + "grad_norm": 1.3841508626937866, + "learning_rate": 4.841299652586298e-05, + "loss": 4.7543, + "step": 19174 + }, + { + "epoch": 0.11403915691312208, + "grad_norm": 1.511307716369629, + "learning_rate": 4.841283275024166e-05, + "loss": 4.9821, + "step": 19175 + }, + { + "epoch": 0.11404510419640308, + "grad_norm": 1.2577831745147705, + "learning_rate": 4.8412668966447175e-05, + "loss": 5.0138, + "step": 19176 + }, + { + "epoch": 0.11405105147968408, + "grad_norm": 1.442159652709961, + "learning_rate": 4.841250517447956e-05, + "loss": 5.0066, + "step": 19177 + }, + { + "epoch": 0.11405699876296507, + "grad_norm": 1.3029484748840332, + "learning_rate": 4.841234137433889e-05, + "loss": 4.9229, + "step": 19178 + }, + { + "epoch": 0.11406294604624607, + "grad_norm": 1.3138917684555054, + "learning_rate": 4.841217756602521e-05, + "loss": 4.6262, + "step": 19179 + }, + { + "epoch": 0.11406889332952708, + "grad_norm": 1.2164885997772217, + "learning_rate": 4.841201374953857e-05, + "loss": 4.7952, + "step": 19180 + }, + { + "epoch": 0.11407484061280806, + "grad_norm": 1.4247347116470337, + "learning_rate": 4.8411849924879046e-05, + "loss": 5.0066, + "step": 19181 + }, + { + "epoch": 0.11408078789608907, + "grad_norm": 1.236006736755371, + "learning_rate": 4.8411686092046695e-05, + "loss": 4.6585, + "step": 19182 + }, + { + "epoch": 0.11408673517937007, + "grad_norm": 1.2381118535995483, + "learning_rate": 4.841152225104156e-05, + "loss": 5.0935, + "step": 19183 + }, + { + "epoch": 0.11409268246265106, + "grad_norm": 1.3557883501052856, + "learning_rate": 4.84113584018637e-05, + "loss": 5.1536, + "step": 19184 + }, + { + "epoch": 0.11409862974593206, + "grad_norm": 1.3191505670547485, + "learning_rate": 4.8411194544513184e-05, + "loss": 5.2857, + "step": 19185 + }, + { + "epoch": 0.11410457702921306, + "grad_norm": 1.2058855295181274, + "learning_rate": 4.841103067899006e-05, + "loss": 5.142, + "step": 19186 + }, + { + "epoch": 0.11411052431249405, + "grad_norm": 1.163136601448059, + "learning_rate": 4.8410866805294384e-05, + "loss": 5.1891, + "step": 19187 + }, + { + "epoch": 0.11411647159577505, + "grad_norm": 1.3245770931243896, + "learning_rate": 4.841070292342622e-05, + "loss": 5.0629, + "step": 19188 + }, + { + "epoch": 0.11412241887905605, + "grad_norm": 1.13837730884552, + "learning_rate": 4.841053903338562e-05, + "loss": 5.1045, + "step": 19189 + }, + { + "epoch": 0.11412836616233704, + "grad_norm": 1.4724907875061035, + "learning_rate": 4.8410375135172646e-05, + "loss": 5.01, + "step": 19190 + }, + { + "epoch": 0.11413431344561804, + "grad_norm": 1.3786016702651978, + "learning_rate": 4.841021122878735e-05, + "loss": 5.0188, + "step": 19191 + }, + { + "epoch": 0.11414026072889905, + "grad_norm": 1.2996101379394531, + "learning_rate": 4.841004731422979e-05, + "loss": 4.954, + "step": 19192 + }, + { + "epoch": 0.11414620801218003, + "grad_norm": 1.297892451286316, + "learning_rate": 4.840988339150002e-05, + "loss": 4.9841, + "step": 19193 + }, + { + "epoch": 0.11415215529546104, + "grad_norm": 1.3011624813079834, + "learning_rate": 4.84097194605981e-05, + "loss": 4.8547, + "step": 19194 + }, + { + "epoch": 0.11415810257874202, + "grad_norm": 1.2169194221496582, + "learning_rate": 4.8409555521524096e-05, + "loss": 4.8801, + "step": 19195 + }, + { + "epoch": 0.11416404986202303, + "grad_norm": 1.4189658164978027, + "learning_rate": 4.8409391574278065e-05, + "loss": 4.9521, + "step": 19196 + }, + { + "epoch": 0.11416999714530403, + "grad_norm": 1.4178590774536133, + "learning_rate": 4.840922761886004e-05, + "loss": 4.7847, + "step": 19197 + }, + { + "epoch": 0.11417594442858502, + "grad_norm": 1.395585536956787, + "learning_rate": 4.8409063655270105e-05, + "loss": 5.0404, + "step": 19198 + }, + { + "epoch": 0.11418189171186602, + "grad_norm": 1.4803121089935303, + "learning_rate": 4.840889968350831e-05, + "loss": 4.8851, + "step": 19199 + }, + { + "epoch": 0.11418783899514702, + "grad_norm": 1.4736177921295166, + "learning_rate": 4.84087357035747e-05, + "loss": 4.9127, + "step": 19200 + }, + { + "epoch": 0.11419378627842801, + "grad_norm": 1.2947148084640503, + "learning_rate": 4.8408571715469354e-05, + "loss": 4.9169, + "step": 19201 + }, + { + "epoch": 0.11419973356170901, + "grad_norm": 1.2428392171859741, + "learning_rate": 4.840840771919232e-05, + "loss": 5.2759, + "step": 19202 + }, + { + "epoch": 0.11420568084499001, + "grad_norm": 1.2743968963623047, + "learning_rate": 4.840824371474364e-05, + "loss": 5.2273, + "step": 19203 + }, + { + "epoch": 0.114211628128271, + "grad_norm": 1.3068950176239014, + "learning_rate": 4.840807970212339e-05, + "loss": 5.3455, + "step": 19204 + }, + { + "epoch": 0.114217575411552, + "grad_norm": 1.2238211631774902, + "learning_rate": 4.8407915681331614e-05, + "loss": 5.024, + "step": 19205 + }, + { + "epoch": 0.114223522694833, + "grad_norm": 1.1461126804351807, + "learning_rate": 4.8407751652368384e-05, + "loss": 5.2113, + "step": 19206 + }, + { + "epoch": 0.11422946997811399, + "grad_norm": 1.2286972999572754, + "learning_rate": 4.840758761523375e-05, + "loss": 5.006, + "step": 19207 + }, + { + "epoch": 0.114235417261395, + "grad_norm": 1.3054790496826172, + "learning_rate": 4.840742356992777e-05, + "loss": 5.0592, + "step": 19208 + }, + { + "epoch": 0.114241364544676, + "grad_norm": 1.2426046133041382, + "learning_rate": 4.84072595164505e-05, + "loss": 5.1058, + "step": 19209 + }, + { + "epoch": 0.11424731182795698, + "grad_norm": 1.325263261795044, + "learning_rate": 4.840709545480199e-05, + "loss": 5.0528, + "step": 19210 + }, + { + "epoch": 0.11425325911123799, + "grad_norm": 1.1753286123275757, + "learning_rate": 4.840693138498231e-05, + "loss": 5.2193, + "step": 19211 + }, + { + "epoch": 0.11425920639451899, + "grad_norm": 1.486204743385315, + "learning_rate": 4.8406767306991515e-05, + "loss": 5.0389, + "step": 19212 + }, + { + "epoch": 0.11426515367779998, + "grad_norm": 1.344887614250183, + "learning_rate": 4.8406603220829655e-05, + "loss": 5.0072, + "step": 19213 + }, + { + "epoch": 0.11427110096108098, + "grad_norm": 1.270340919494629, + "learning_rate": 4.840643912649679e-05, + "loss": 5.0154, + "step": 19214 + }, + { + "epoch": 0.11427704824436198, + "grad_norm": 1.390960454940796, + "learning_rate": 4.8406275023992983e-05, + "loss": 5.0803, + "step": 19215 + }, + { + "epoch": 0.11428299552764297, + "grad_norm": 1.2927583456039429, + "learning_rate": 4.8406110913318294e-05, + "loss": 5.04, + "step": 19216 + }, + { + "epoch": 0.11428894281092397, + "grad_norm": 1.3101180791854858, + "learning_rate": 4.840594679447275e-05, + "loss": 4.9988, + "step": 19217 + }, + { + "epoch": 0.11429489009420497, + "grad_norm": 1.2187588214874268, + "learning_rate": 4.8405782667456454e-05, + "loss": 5.1006, + "step": 19218 + }, + { + "epoch": 0.11430083737748596, + "grad_norm": 1.3578346967697144, + "learning_rate": 4.840561853226944e-05, + "loss": 5.0528, + "step": 19219 + }, + { + "epoch": 0.11430678466076696, + "grad_norm": 1.8960474729537964, + "learning_rate": 4.840545438891176e-05, + "loss": 5.323, + "step": 19220 + }, + { + "epoch": 0.11431273194404797, + "grad_norm": 1.3410239219665527, + "learning_rate": 4.840529023738348e-05, + "loss": 5.1488, + "step": 19221 + }, + { + "epoch": 0.11431867922732895, + "grad_norm": 1.381373405456543, + "learning_rate": 4.840512607768465e-05, + "loss": 5.1477, + "step": 19222 + }, + { + "epoch": 0.11432462651060996, + "grad_norm": 1.4095546007156372, + "learning_rate": 4.8404961909815336e-05, + "loss": 5.1515, + "step": 19223 + }, + { + "epoch": 0.11433057379389094, + "grad_norm": 1.254451870918274, + "learning_rate": 4.840479773377559e-05, + "loss": 5.1276, + "step": 19224 + }, + { + "epoch": 0.11433652107717195, + "grad_norm": 1.3001519441604614, + "learning_rate": 4.840463354956548e-05, + "loss": 5.1561, + "step": 19225 + }, + { + "epoch": 0.11434246836045295, + "grad_norm": 1.231469750404358, + "learning_rate": 4.840446935718505e-05, + "loss": 4.963, + "step": 19226 + }, + { + "epoch": 0.11434841564373394, + "grad_norm": 1.323225736618042, + "learning_rate": 4.840430515663435e-05, + "loss": 5.0998, + "step": 19227 + }, + { + "epoch": 0.11435436292701494, + "grad_norm": 1.2244281768798828, + "learning_rate": 4.8404140947913456e-05, + "loss": 5.0727, + "step": 19228 + }, + { + "epoch": 0.11436031021029594, + "grad_norm": 1.2634974718093872, + "learning_rate": 4.840397673102242e-05, + "loss": 5.2049, + "step": 19229 + }, + { + "epoch": 0.11436625749357693, + "grad_norm": 1.5431766510009766, + "learning_rate": 4.84038125059613e-05, + "loss": 5.1387, + "step": 19230 + }, + { + "epoch": 0.11437220477685793, + "grad_norm": 1.485696792602539, + "learning_rate": 4.8403648272730145e-05, + "loss": 4.7971, + "step": 19231 + }, + { + "epoch": 0.11437815206013893, + "grad_norm": 1.4774583578109741, + "learning_rate": 4.840348403132902e-05, + "loss": 4.8967, + "step": 19232 + }, + { + "epoch": 0.11438409934341992, + "grad_norm": 1.1903584003448486, + "learning_rate": 4.840331978175798e-05, + "loss": 4.8827, + "step": 19233 + }, + { + "epoch": 0.11439004662670092, + "grad_norm": 1.3851109743118286, + "learning_rate": 4.840315552401708e-05, + "loss": 4.8348, + "step": 19234 + }, + { + "epoch": 0.11439599390998192, + "grad_norm": 1.3834025859832764, + "learning_rate": 4.840299125810639e-05, + "loss": 4.9392, + "step": 19235 + }, + { + "epoch": 0.11440194119326291, + "grad_norm": 1.2576985359191895, + "learning_rate": 4.840282698402595e-05, + "loss": 4.9092, + "step": 19236 + }, + { + "epoch": 0.11440788847654391, + "grad_norm": 1.2408863306045532, + "learning_rate": 4.840266270177583e-05, + "loss": 4.9041, + "step": 19237 + }, + { + "epoch": 0.11441383575982492, + "grad_norm": 1.4397286176681519, + "learning_rate": 4.840249841135608e-05, + "loss": 4.9588, + "step": 19238 + }, + { + "epoch": 0.1144197830431059, + "grad_norm": 1.3446424007415771, + "learning_rate": 4.840233411276676e-05, + "loss": 4.9757, + "step": 19239 + }, + { + "epoch": 0.1144257303263869, + "grad_norm": 1.2520800828933716, + "learning_rate": 4.840216980600793e-05, + "loss": 4.9746, + "step": 19240 + }, + { + "epoch": 0.11443167760966791, + "grad_norm": 1.2509692907333374, + "learning_rate": 4.840200549107963e-05, + "loss": 5.063, + "step": 19241 + }, + { + "epoch": 0.1144376248929489, + "grad_norm": 1.3295235633850098, + "learning_rate": 4.840184116798194e-05, + "loss": 5.02, + "step": 19242 + }, + { + "epoch": 0.1144435721762299, + "grad_norm": 1.3346072435379028, + "learning_rate": 4.8401676836714916e-05, + "loss": 5.0393, + "step": 19243 + }, + { + "epoch": 0.1144495194595109, + "grad_norm": 1.6711392402648926, + "learning_rate": 4.84015124972786e-05, + "loss": 5.0856, + "step": 19244 + }, + { + "epoch": 0.11445546674279189, + "grad_norm": 1.2785863876342773, + "learning_rate": 4.8401348149673065e-05, + "loss": 5.1181, + "step": 19245 + }, + { + "epoch": 0.11446141402607289, + "grad_norm": 1.4998282194137573, + "learning_rate": 4.8401183793898354e-05, + "loss": 5.0101, + "step": 19246 + }, + { + "epoch": 0.1144673613093539, + "grad_norm": 1.4768141508102417, + "learning_rate": 4.840101942995454e-05, + "loss": 4.8256, + "step": 19247 + }, + { + "epoch": 0.11447330859263488, + "grad_norm": 1.3829854726791382, + "learning_rate": 4.840085505784167e-05, + "loss": 4.8298, + "step": 19248 + }, + { + "epoch": 0.11447925587591588, + "grad_norm": 1.2079180479049683, + "learning_rate": 4.840069067755979e-05, + "loss": 4.9054, + "step": 19249 + }, + { + "epoch": 0.11448520315919689, + "grad_norm": 1.464245080947876, + "learning_rate": 4.8400526289108984e-05, + "loss": 4.8943, + "step": 19250 + }, + { + "epoch": 0.11449115044247787, + "grad_norm": 1.400992512702942, + "learning_rate": 4.840036189248929e-05, + "loss": 4.754, + "step": 19251 + }, + { + "epoch": 0.11449709772575888, + "grad_norm": 1.41909921169281, + "learning_rate": 4.840019748770077e-05, + "loss": 4.9179, + "step": 19252 + }, + { + "epoch": 0.11450304500903986, + "grad_norm": 1.3990073204040527, + "learning_rate": 4.840003307474349e-05, + "loss": 4.7989, + "step": 19253 + }, + { + "epoch": 0.11450899229232087, + "grad_norm": 1.2858465909957886, + "learning_rate": 4.8399868653617497e-05, + "loss": 4.7556, + "step": 19254 + }, + { + "epoch": 0.11451493957560187, + "grad_norm": 1.2721470594406128, + "learning_rate": 4.8399704224322854e-05, + "loss": 4.8441, + "step": 19255 + }, + { + "epoch": 0.11452088685888286, + "grad_norm": 1.2352218627929688, + "learning_rate": 4.839953978685961e-05, + "loss": 4.753, + "step": 19256 + }, + { + "epoch": 0.11452683414216386, + "grad_norm": 1.3000402450561523, + "learning_rate": 4.8399375341227834e-05, + "loss": 4.7634, + "step": 19257 + }, + { + "epoch": 0.11453278142544486, + "grad_norm": 1.2934285402297974, + "learning_rate": 4.839921088742757e-05, + "loss": 4.8047, + "step": 19258 + }, + { + "epoch": 0.11453872870872585, + "grad_norm": 1.5773643255233765, + "learning_rate": 4.839904642545889e-05, + "loss": 4.8588, + "step": 19259 + }, + { + "epoch": 0.11454467599200685, + "grad_norm": 1.3872511386871338, + "learning_rate": 4.8398881955321844e-05, + "loss": 5.0781, + "step": 19260 + }, + { + "epoch": 0.11455062327528785, + "grad_norm": 1.403011679649353, + "learning_rate": 4.839871747701649e-05, + "loss": 5.1375, + "step": 19261 + }, + { + "epoch": 0.11455657055856884, + "grad_norm": 1.2086342573165894, + "learning_rate": 4.839855299054289e-05, + "loss": 5.1052, + "step": 19262 + }, + { + "epoch": 0.11456251784184984, + "grad_norm": 1.3916890621185303, + "learning_rate": 4.8398388495901085e-05, + "loss": 5.0687, + "step": 19263 + }, + { + "epoch": 0.11456846512513084, + "grad_norm": 1.4591625928878784, + "learning_rate": 4.839822399309115e-05, + "loss": 5.0098, + "step": 19264 + }, + { + "epoch": 0.11457441240841183, + "grad_norm": 1.3421653509140015, + "learning_rate": 4.839805948211314e-05, + "loss": 4.9511, + "step": 19265 + }, + { + "epoch": 0.11458035969169283, + "grad_norm": 1.3959892988204956, + "learning_rate": 4.83978949629671e-05, + "loss": 5.0206, + "step": 19266 + }, + { + "epoch": 0.11458630697497384, + "grad_norm": 1.3058884143829346, + "learning_rate": 4.839773043565311e-05, + "loss": 5.0885, + "step": 19267 + }, + { + "epoch": 0.11459225425825482, + "grad_norm": 1.452760100364685, + "learning_rate": 4.839756590017121e-05, + "loss": 4.9945, + "step": 19268 + }, + { + "epoch": 0.11459820154153583, + "grad_norm": 1.4445050954818726, + "learning_rate": 4.8397401356521454e-05, + "loss": 4.8128, + "step": 19269 + }, + { + "epoch": 0.11460414882481683, + "grad_norm": 1.2491203546524048, + "learning_rate": 4.8397236804703916e-05, + "loss": 4.7355, + "step": 19270 + }, + { + "epoch": 0.11461009610809782, + "grad_norm": 1.3198809623718262, + "learning_rate": 4.839707224471864e-05, + "loss": 4.7621, + "step": 19271 + }, + { + "epoch": 0.11461604339137882, + "grad_norm": 1.4831585884094238, + "learning_rate": 4.8396907676565686e-05, + "loss": 4.7393, + "step": 19272 + }, + { + "epoch": 0.11462199067465982, + "grad_norm": 1.2767844200134277, + "learning_rate": 4.839674310024512e-05, + "loss": 4.8063, + "step": 19273 + }, + { + "epoch": 0.11462793795794081, + "grad_norm": 1.4342589378356934, + "learning_rate": 4.839657851575698e-05, + "loss": 4.7615, + "step": 19274 + }, + { + "epoch": 0.11463388524122181, + "grad_norm": 1.30052649974823, + "learning_rate": 4.839641392310135e-05, + "loss": 4.7389, + "step": 19275 + }, + { + "epoch": 0.11463983252450281, + "grad_norm": 1.3592944145202637, + "learning_rate": 4.8396249322278266e-05, + "loss": 4.704, + "step": 19276 + }, + { + "epoch": 0.1146457798077838, + "grad_norm": 1.1905149221420288, + "learning_rate": 4.83960847132878e-05, + "loss": 4.7189, + "step": 19277 + }, + { + "epoch": 0.1146517270910648, + "grad_norm": 1.4920209646224976, + "learning_rate": 4.8395920096129996e-05, + "loss": 4.8844, + "step": 19278 + }, + { + "epoch": 0.1146576743743458, + "grad_norm": 1.486556887626648, + "learning_rate": 4.839575547080491e-05, + "loss": 4.9462, + "step": 19279 + }, + { + "epoch": 0.1146636216576268, + "grad_norm": 1.500434160232544, + "learning_rate": 4.839559083731262e-05, + "loss": 4.9118, + "step": 19280 + }, + { + "epoch": 0.1146695689409078, + "grad_norm": 1.5061683654785156, + "learning_rate": 4.839542619565317e-05, + "loss": 4.7921, + "step": 19281 + }, + { + "epoch": 0.11467551622418878, + "grad_norm": 1.587161660194397, + "learning_rate": 4.839526154582662e-05, + "loss": 5.1129, + "step": 19282 + }, + { + "epoch": 0.11468146350746979, + "grad_norm": 1.3225055932998657, + "learning_rate": 4.839509688783302e-05, + "loss": 4.8538, + "step": 19283 + }, + { + "epoch": 0.11468741079075079, + "grad_norm": 1.3121862411499023, + "learning_rate": 4.839493222167244e-05, + "loss": 4.8695, + "step": 19284 + }, + { + "epoch": 0.11469335807403178, + "grad_norm": 1.4202474355697632, + "learning_rate": 4.839476754734492e-05, + "loss": 4.8628, + "step": 19285 + }, + { + "epoch": 0.11469930535731278, + "grad_norm": 1.283316969871521, + "learning_rate": 4.8394602864850534e-05, + "loss": 4.8431, + "step": 19286 + }, + { + "epoch": 0.11470525264059378, + "grad_norm": 1.3255420923233032, + "learning_rate": 4.839443817418934e-05, + "loss": 4.9993, + "step": 19287 + }, + { + "epoch": 0.11471119992387477, + "grad_norm": 1.3569047451019287, + "learning_rate": 4.8394273475361386e-05, + "loss": 4.9478, + "step": 19288 + }, + { + "epoch": 0.11471714720715577, + "grad_norm": 1.2374382019042969, + "learning_rate": 4.839410876836673e-05, + "loss": 5.1119, + "step": 19289 + }, + { + "epoch": 0.11472309449043677, + "grad_norm": 1.3518184423446655, + "learning_rate": 4.839394405320543e-05, + "loss": 5.2506, + "step": 19290 + }, + { + "epoch": 0.11472904177371776, + "grad_norm": 1.2599278688430786, + "learning_rate": 4.839377932987755e-05, + "loss": 5.208, + "step": 19291 + }, + { + "epoch": 0.11473498905699876, + "grad_norm": 1.3122080564498901, + "learning_rate": 4.839361459838314e-05, + "loss": 5.2356, + "step": 19292 + }, + { + "epoch": 0.11474093634027976, + "grad_norm": 1.1587629318237305, + "learning_rate": 4.839344985872226e-05, + "loss": 5.2469, + "step": 19293 + }, + { + "epoch": 0.11474688362356075, + "grad_norm": 1.2733700275421143, + "learning_rate": 4.839328511089498e-05, + "loss": 5.2365, + "step": 19294 + }, + { + "epoch": 0.11475283090684175, + "grad_norm": 1.3206977844238281, + "learning_rate": 4.8393120354901334e-05, + "loss": 5.2242, + "step": 19295 + }, + { + "epoch": 0.11475877819012276, + "grad_norm": 1.1924374103546143, + "learning_rate": 4.83929555907414e-05, + "loss": 5.2916, + "step": 19296 + }, + { + "epoch": 0.11476472547340374, + "grad_norm": 1.2989557981491089, + "learning_rate": 4.8392790818415215e-05, + "loss": 5.173, + "step": 19297 + }, + { + "epoch": 0.11477067275668475, + "grad_norm": 1.3470929861068726, + "learning_rate": 4.839262603792286e-05, + "loss": 5.2309, + "step": 19298 + }, + { + "epoch": 0.11477662003996575, + "grad_norm": 1.1529438495635986, + "learning_rate": 4.8392461249264376e-05, + "loss": 5.2373, + "step": 19299 + }, + { + "epoch": 0.11478256732324674, + "grad_norm": 1.1988370418548584, + "learning_rate": 4.839229645243982e-05, + "loss": 5.2067, + "step": 19300 + }, + { + "epoch": 0.11478851460652774, + "grad_norm": 1.3069959878921509, + "learning_rate": 4.839213164744926e-05, + "loss": 5.1413, + "step": 19301 + }, + { + "epoch": 0.11479446188980874, + "grad_norm": 1.230211615562439, + "learning_rate": 4.839196683429275e-05, + "loss": 5.2076, + "step": 19302 + }, + { + "epoch": 0.11480040917308973, + "grad_norm": 1.3232944011688232, + "learning_rate": 4.839180201297034e-05, + "loss": 5.2077, + "step": 19303 + }, + { + "epoch": 0.11480635645637073, + "grad_norm": 1.2436466217041016, + "learning_rate": 4.839163718348211e-05, + "loss": 5.1646, + "step": 19304 + }, + { + "epoch": 0.11481230373965173, + "grad_norm": 1.160416841506958, + "learning_rate": 4.8391472345828085e-05, + "loss": 5.0582, + "step": 19305 + }, + { + "epoch": 0.11481825102293272, + "grad_norm": 1.3895483016967773, + "learning_rate": 4.8391307500008344e-05, + "loss": 5.2516, + "step": 19306 + }, + { + "epoch": 0.11482419830621372, + "grad_norm": 1.5018577575683594, + "learning_rate": 4.8391142646022935e-05, + "loss": 5.4308, + "step": 19307 + }, + { + "epoch": 0.11483014558949473, + "grad_norm": 1.5278204679489136, + "learning_rate": 4.8390977783871925e-05, + "loss": 5.2238, + "step": 19308 + }, + { + "epoch": 0.11483609287277571, + "grad_norm": 1.5735019445419312, + "learning_rate": 4.839081291355536e-05, + "loss": 5.4874, + "step": 19309 + }, + { + "epoch": 0.11484204015605672, + "grad_norm": 1.4098745584487915, + "learning_rate": 4.839064803507332e-05, + "loss": 5.082, + "step": 19310 + }, + { + "epoch": 0.1148479874393377, + "grad_norm": 1.47605299949646, + "learning_rate": 4.8390483148425824e-05, + "loss": 5.0869, + "step": 19311 + }, + { + "epoch": 0.1148539347226187, + "grad_norm": 1.442550778388977, + "learning_rate": 4.8390318253612966e-05, + "loss": 5.1232, + "step": 19312 + }, + { + "epoch": 0.11485988200589971, + "grad_norm": 1.1225110292434692, + "learning_rate": 4.8390153350634785e-05, + "loss": 5.0782, + "step": 19313 + }, + { + "epoch": 0.1148658292891807, + "grad_norm": 1.329656720161438, + "learning_rate": 4.838998843949135e-05, + "loss": 4.9912, + "step": 19314 + }, + { + "epoch": 0.1148717765724617, + "grad_norm": 1.6484954357147217, + "learning_rate": 4.8389823520182704e-05, + "loss": 4.785, + "step": 19315 + }, + { + "epoch": 0.1148777238557427, + "grad_norm": 1.46773099899292, + "learning_rate": 4.838965859270891e-05, + "loss": 4.7835, + "step": 19316 + }, + { + "epoch": 0.11488367113902369, + "grad_norm": 1.717592477798462, + "learning_rate": 4.838949365707004e-05, + "loss": 5.1603, + "step": 19317 + }, + { + "epoch": 0.11488961842230469, + "grad_norm": 1.7265046834945679, + "learning_rate": 4.838932871326613e-05, + "loss": 4.9057, + "step": 19318 + }, + { + "epoch": 0.11489556570558569, + "grad_norm": 1.6203346252441406, + "learning_rate": 4.838916376129725e-05, + "loss": 4.8206, + "step": 19319 + }, + { + "epoch": 0.11490151298886668, + "grad_norm": 1.2972123622894287, + "learning_rate": 4.838899880116345e-05, + "loss": 4.7026, + "step": 19320 + }, + { + "epoch": 0.11490746027214768, + "grad_norm": 1.4215303659439087, + "learning_rate": 4.838883383286479e-05, + "loss": 4.7032, + "step": 19321 + }, + { + "epoch": 0.11491340755542868, + "grad_norm": 1.442439317703247, + "learning_rate": 4.838866885640134e-05, + "loss": 4.6853, + "step": 19322 + }, + { + "epoch": 0.11491935483870967, + "grad_norm": 1.3752079010009766, + "learning_rate": 4.838850387177315e-05, + "loss": 4.6842, + "step": 19323 + }, + { + "epoch": 0.11492530212199067, + "grad_norm": 1.4834825992584229, + "learning_rate": 4.838833887898026e-05, + "loss": 4.6455, + "step": 19324 + }, + { + "epoch": 0.11493124940527168, + "grad_norm": 1.3493545055389404, + "learning_rate": 4.8388173878022743e-05, + "loss": 4.5489, + "step": 19325 + }, + { + "epoch": 0.11493719668855266, + "grad_norm": 1.5903066396713257, + "learning_rate": 4.838800886890067e-05, + "loss": 4.5574, + "step": 19326 + }, + { + "epoch": 0.11494314397183367, + "grad_norm": 1.3842332363128662, + "learning_rate": 4.8387843851614076e-05, + "loss": 4.7516, + "step": 19327 + }, + { + "epoch": 0.11494909125511467, + "grad_norm": 1.5355647802352905, + "learning_rate": 4.838767882616303e-05, + "loss": 4.5984, + "step": 19328 + }, + { + "epoch": 0.11495503853839566, + "grad_norm": 1.6534103155136108, + "learning_rate": 4.838751379254759e-05, + "loss": 4.7761, + "step": 19329 + }, + { + "epoch": 0.11496098582167666, + "grad_norm": 1.7028656005859375, + "learning_rate": 4.83873487507678e-05, + "loss": 5.0164, + "step": 19330 + }, + { + "epoch": 0.11496693310495766, + "grad_norm": 1.7165244817733765, + "learning_rate": 4.838718370082374e-05, + "loss": 5.1044, + "step": 19331 + }, + { + "epoch": 0.11497288038823865, + "grad_norm": 1.3272297382354736, + "learning_rate": 4.838701864271545e-05, + "loss": 5.0072, + "step": 19332 + }, + { + "epoch": 0.11497882767151965, + "grad_norm": 1.553613543510437, + "learning_rate": 4.8386853576442994e-05, + "loss": 4.945, + "step": 19333 + }, + { + "epoch": 0.11498477495480065, + "grad_norm": 1.4403818845748901, + "learning_rate": 4.8386688502006425e-05, + "loss": 5.0661, + "step": 19334 + }, + { + "epoch": 0.11499072223808164, + "grad_norm": 1.5347598791122437, + "learning_rate": 4.8386523419405814e-05, + "loss": 5.0603, + "step": 19335 + }, + { + "epoch": 0.11499666952136264, + "grad_norm": 1.3777856826782227, + "learning_rate": 4.83863583286412e-05, + "loss": 5.112, + "step": 19336 + }, + { + "epoch": 0.11500261680464365, + "grad_norm": 1.794287919998169, + "learning_rate": 4.8386193229712654e-05, + "loss": 5.1972, + "step": 19337 + }, + { + "epoch": 0.11500856408792463, + "grad_norm": 1.3142359256744385, + "learning_rate": 4.8386028122620234e-05, + "loss": 5.3577, + "step": 19338 + }, + { + "epoch": 0.11501451137120564, + "grad_norm": 1.0925400257110596, + "learning_rate": 4.838586300736399e-05, + "loss": 5.2094, + "step": 19339 + }, + { + "epoch": 0.11502045865448662, + "grad_norm": 1.6456180810928345, + "learning_rate": 4.838569788394398e-05, + "loss": 4.8287, + "step": 19340 + }, + { + "epoch": 0.11502640593776763, + "grad_norm": 1.2811404466629028, + "learning_rate": 4.8385532752360265e-05, + "loss": 5.0659, + "step": 19341 + }, + { + "epoch": 0.11503235322104863, + "grad_norm": 1.392863154411316, + "learning_rate": 4.83853676126129e-05, + "loss": 5.2655, + "step": 19342 + }, + { + "epoch": 0.11503830050432962, + "grad_norm": 1.2255772352218628, + "learning_rate": 4.838520246470195e-05, + "loss": 5.0422, + "step": 19343 + }, + { + "epoch": 0.11504424778761062, + "grad_norm": 1.735661506652832, + "learning_rate": 4.8385037308627465e-05, + "loss": 6.0562, + "step": 19344 + }, + { + "epoch": 0.11505019507089162, + "grad_norm": 1.2034478187561035, + "learning_rate": 4.838487214438951e-05, + "loss": 4.9773, + "step": 19345 + }, + { + "epoch": 0.11505614235417261, + "grad_norm": 1.2786695957183838, + "learning_rate": 4.838470697198813e-05, + "loss": 4.8771, + "step": 19346 + }, + { + "epoch": 0.11506208963745361, + "grad_norm": 1.2345244884490967, + "learning_rate": 4.8384541791423394e-05, + "loss": 5.0098, + "step": 19347 + }, + { + "epoch": 0.11506803692073461, + "grad_norm": 1.3156319856643677, + "learning_rate": 4.838437660269536e-05, + "loss": 5.1089, + "step": 19348 + }, + { + "epoch": 0.1150739842040156, + "grad_norm": 1.3406500816345215, + "learning_rate": 4.838421140580407e-05, + "loss": 4.8374, + "step": 19349 + }, + { + "epoch": 0.1150799314872966, + "grad_norm": 1.412318468093872, + "learning_rate": 4.83840462007496e-05, + "loss": 4.9074, + "step": 19350 + }, + { + "epoch": 0.1150858787705776, + "grad_norm": 1.3075577020645142, + "learning_rate": 4.8383880987532004e-05, + "loss": 4.9694, + "step": 19351 + }, + { + "epoch": 0.11509182605385859, + "grad_norm": 1.178300380706787, + "learning_rate": 4.838371576615134e-05, + "loss": 4.9863, + "step": 19352 + }, + { + "epoch": 0.1150977733371396, + "grad_norm": 1.5120453834533691, + "learning_rate": 4.838355053660765e-05, + "loss": 4.8766, + "step": 19353 + }, + { + "epoch": 0.1151037206204206, + "grad_norm": 1.4834094047546387, + "learning_rate": 4.8383385298901014e-05, + "loss": 4.9724, + "step": 19354 + }, + { + "epoch": 0.11510966790370158, + "grad_norm": 1.561998724937439, + "learning_rate": 4.8383220053031475e-05, + "loss": 4.9239, + "step": 19355 + }, + { + "epoch": 0.11511561518698259, + "grad_norm": 1.4366774559020996, + "learning_rate": 4.83830547989991e-05, + "loss": 4.8052, + "step": 19356 + }, + { + "epoch": 0.11512156247026359, + "grad_norm": 1.2530354261398315, + "learning_rate": 4.8382889536803936e-05, + "loss": 5.0115, + "step": 19357 + }, + { + "epoch": 0.11512750975354458, + "grad_norm": 1.4827991724014282, + "learning_rate": 4.838272426644606e-05, + "loss": 5.1592, + "step": 19358 + }, + { + "epoch": 0.11513345703682558, + "grad_norm": 1.5874660015106201, + "learning_rate": 4.83825589879255e-05, + "loss": 5.0255, + "step": 19359 + }, + { + "epoch": 0.11513940432010658, + "grad_norm": 1.4771748781204224, + "learning_rate": 4.8382393701242335e-05, + "loss": 5.1537, + "step": 19360 + }, + { + "epoch": 0.11514535160338757, + "grad_norm": 1.4980419874191284, + "learning_rate": 4.8382228406396625e-05, + "loss": 5.0109, + "step": 19361 + }, + { + "epoch": 0.11515129888666857, + "grad_norm": 1.5008245706558228, + "learning_rate": 4.8382063103388405e-05, + "loss": 5.1644, + "step": 19362 + }, + { + "epoch": 0.11515724616994957, + "grad_norm": 1.425648808479309, + "learning_rate": 4.838189779221777e-05, + "loss": 4.8298, + "step": 19363 + }, + { + "epoch": 0.11516319345323056, + "grad_norm": 1.4478559494018555, + "learning_rate": 4.8381732472884744e-05, + "loss": 5.2984, + "step": 19364 + }, + { + "epoch": 0.11516914073651156, + "grad_norm": 1.5071446895599365, + "learning_rate": 4.83815671453894e-05, + "loss": 4.9557, + "step": 19365 + }, + { + "epoch": 0.11517508801979257, + "grad_norm": 1.6358442306518555, + "learning_rate": 4.8381401809731785e-05, + "loss": 4.7956, + "step": 19366 + }, + { + "epoch": 0.11518103530307355, + "grad_norm": 1.5035837888717651, + "learning_rate": 4.838123646591197e-05, + "loss": 4.816, + "step": 19367 + }, + { + "epoch": 0.11518698258635456, + "grad_norm": 1.4265867471694946, + "learning_rate": 4.838107111393e-05, + "loss": 4.7911, + "step": 19368 + }, + { + "epoch": 0.11519292986963554, + "grad_norm": 1.489668369293213, + "learning_rate": 4.838090575378595e-05, + "loss": 4.8403, + "step": 19369 + }, + { + "epoch": 0.11519887715291655, + "grad_norm": 1.4454714059829712, + "learning_rate": 4.838074038547986e-05, + "loss": 4.8848, + "step": 19370 + }, + { + "epoch": 0.11520482443619755, + "grad_norm": 1.42531418800354, + "learning_rate": 4.83805750090118e-05, + "loss": 5.0249, + "step": 19371 + }, + { + "epoch": 0.11521077171947854, + "grad_norm": 1.4370076656341553, + "learning_rate": 4.8380409624381826e-05, + "loss": 4.9219, + "step": 19372 + }, + { + "epoch": 0.11521671900275954, + "grad_norm": 1.543291449546814, + "learning_rate": 4.838024423158999e-05, + "loss": 4.9835, + "step": 19373 + }, + { + "epoch": 0.11522266628604054, + "grad_norm": 1.2460718154907227, + "learning_rate": 4.838007883063634e-05, + "loss": 5.0426, + "step": 19374 + }, + { + "epoch": 0.11522861356932153, + "grad_norm": 1.5159900188446045, + "learning_rate": 4.837991342152096e-05, + "loss": 5.0214, + "step": 19375 + }, + { + "epoch": 0.11523456085260253, + "grad_norm": 1.3800876140594482, + "learning_rate": 4.837974800424389e-05, + "loss": 4.7606, + "step": 19376 + }, + { + "epoch": 0.11524050813588353, + "grad_norm": 1.509788155555725, + "learning_rate": 4.8379582578805197e-05, + "loss": 4.9886, + "step": 19377 + }, + { + "epoch": 0.11524645541916452, + "grad_norm": 1.292523741722107, + "learning_rate": 4.837941714520492e-05, + "loss": 5.1574, + "step": 19378 + }, + { + "epoch": 0.11525240270244552, + "grad_norm": 1.351827621459961, + "learning_rate": 4.837925170344314e-05, + "loss": 5.3133, + "step": 19379 + }, + { + "epoch": 0.11525834998572652, + "grad_norm": 1.4871753454208374, + "learning_rate": 4.83790862535199e-05, + "loss": 4.843, + "step": 19380 + }, + { + "epoch": 0.11526429726900751, + "grad_norm": 1.6031657457351685, + "learning_rate": 4.8378920795435264e-05, + "loss": 4.8244, + "step": 19381 + }, + { + "epoch": 0.11527024455228851, + "grad_norm": 1.3754857778549194, + "learning_rate": 4.8378755329189294e-05, + "loss": 4.8421, + "step": 19382 + }, + { + "epoch": 0.11527619183556952, + "grad_norm": 1.5428962707519531, + "learning_rate": 4.837858985478203e-05, + "loss": 4.9472, + "step": 19383 + }, + { + "epoch": 0.1152821391188505, + "grad_norm": 1.45586097240448, + "learning_rate": 4.837842437221356e-05, + "loss": 4.874, + "step": 19384 + }, + { + "epoch": 0.1152880864021315, + "grad_norm": 1.5139529705047607, + "learning_rate": 4.837825888148391e-05, + "loss": 4.8867, + "step": 19385 + }, + { + "epoch": 0.11529403368541251, + "grad_norm": 1.6341979503631592, + "learning_rate": 4.837809338259315e-05, + "loss": 4.8476, + "step": 19386 + }, + { + "epoch": 0.1152999809686935, + "grad_norm": 1.45046865940094, + "learning_rate": 4.837792787554134e-05, + "loss": 5.0273, + "step": 19387 + }, + { + "epoch": 0.1153059282519745, + "grad_norm": 1.2840397357940674, + "learning_rate": 4.8377762360328547e-05, + "loss": 5.1717, + "step": 19388 + }, + { + "epoch": 0.1153118755352555, + "grad_norm": 1.4211467504501343, + "learning_rate": 4.8377596836954805e-05, + "loss": 5.021, + "step": 19389 + }, + { + "epoch": 0.11531782281853649, + "grad_norm": 1.3885877132415771, + "learning_rate": 4.837743130542019e-05, + "loss": 5.2158, + "step": 19390 + }, + { + "epoch": 0.11532377010181749, + "grad_norm": 1.2344088554382324, + "learning_rate": 4.837726576572476e-05, + "loss": 5.212, + "step": 19391 + }, + { + "epoch": 0.11532971738509849, + "grad_norm": 1.1903822422027588, + "learning_rate": 4.837710021786857e-05, + "loss": 5.3071, + "step": 19392 + }, + { + "epoch": 0.11533566466837948, + "grad_norm": 1.4263699054718018, + "learning_rate": 4.837693466185167e-05, + "loss": 5.1472, + "step": 19393 + }, + { + "epoch": 0.11534161195166048, + "grad_norm": 1.201027512550354, + "learning_rate": 4.837676909767412e-05, + "loss": 5.1779, + "step": 19394 + }, + { + "epoch": 0.11534755923494149, + "grad_norm": 1.2903262376785278, + "learning_rate": 4.8376603525335995e-05, + "loss": 5.038, + "step": 19395 + }, + { + "epoch": 0.11535350651822247, + "grad_norm": 1.3125475645065308, + "learning_rate": 4.837643794483733e-05, + "loss": 4.8948, + "step": 19396 + }, + { + "epoch": 0.11535945380150348, + "grad_norm": 1.1773933172225952, + "learning_rate": 4.837627235617819e-05, + "loss": 5.0854, + "step": 19397 + }, + { + "epoch": 0.11536540108478446, + "grad_norm": 1.2542996406555176, + "learning_rate": 4.837610675935864e-05, + "loss": 5.1329, + "step": 19398 + }, + { + "epoch": 0.11537134836806547, + "grad_norm": 1.1876561641693115, + "learning_rate": 4.837594115437873e-05, + "loss": 4.9757, + "step": 19399 + }, + { + "epoch": 0.11537729565134647, + "grad_norm": 1.2957814931869507, + "learning_rate": 4.837577554123852e-05, + "loss": 5.1203, + "step": 19400 + }, + { + "epoch": 0.11538324293462746, + "grad_norm": 1.2537682056427002, + "learning_rate": 4.837560991993807e-05, + "loss": 4.975, + "step": 19401 + }, + { + "epoch": 0.11538919021790846, + "grad_norm": 1.1898986101150513, + "learning_rate": 4.837544429047743e-05, + "loss": 4.9028, + "step": 19402 + }, + { + "epoch": 0.11539513750118946, + "grad_norm": 1.4129477739334106, + "learning_rate": 4.837527865285667e-05, + "loss": 4.7576, + "step": 19403 + }, + { + "epoch": 0.11540108478447045, + "grad_norm": 1.5386319160461426, + "learning_rate": 4.837511300707585e-05, + "loss": 4.9332, + "step": 19404 + }, + { + "epoch": 0.11540703206775145, + "grad_norm": 1.3597557544708252, + "learning_rate": 4.8374947353135e-05, + "loss": 4.8007, + "step": 19405 + }, + { + "epoch": 0.11541297935103245, + "grad_norm": 1.8251479864120483, + "learning_rate": 4.837478169103421e-05, + "loss": 5.048, + "step": 19406 + }, + { + "epoch": 0.11541892663431344, + "grad_norm": 1.488844871520996, + "learning_rate": 4.8374616020773523e-05, + "loss": 4.855, + "step": 19407 + }, + { + "epoch": 0.11542487391759444, + "grad_norm": 1.1640641689300537, + "learning_rate": 4.8374450342352996e-05, + "loss": 4.7714, + "step": 19408 + }, + { + "epoch": 0.11543082120087544, + "grad_norm": 1.1133109331130981, + "learning_rate": 4.8374284655772696e-05, + "loss": 4.849, + "step": 19409 + }, + { + "epoch": 0.11543676848415643, + "grad_norm": 1.2767143249511719, + "learning_rate": 4.837411896103266e-05, + "loss": 4.8078, + "step": 19410 + }, + { + "epoch": 0.11544271576743743, + "grad_norm": 1.2564034461975098, + "learning_rate": 4.837395325813298e-05, + "loss": 4.8602, + "step": 19411 + }, + { + "epoch": 0.11544866305071844, + "grad_norm": 1.2702561616897583, + "learning_rate": 4.837378754707369e-05, + "loss": 4.9148, + "step": 19412 + }, + { + "epoch": 0.11545461033399942, + "grad_norm": 1.1960140466690063, + "learning_rate": 4.8373621827854845e-05, + "loss": 4.9242, + "step": 19413 + }, + { + "epoch": 0.11546055761728043, + "grad_norm": 1.3663053512573242, + "learning_rate": 4.837345610047651e-05, + "loss": 4.9837, + "step": 19414 + }, + { + "epoch": 0.11546650490056143, + "grad_norm": 1.340897560119629, + "learning_rate": 4.837329036493875e-05, + "loss": 4.8059, + "step": 19415 + }, + { + "epoch": 0.11547245218384242, + "grad_norm": 1.326195478439331, + "learning_rate": 4.8373124621241616e-05, + "loss": 4.7115, + "step": 19416 + }, + { + "epoch": 0.11547839946712342, + "grad_norm": 1.2291951179504395, + "learning_rate": 4.837295886938516e-05, + "loss": 5.0075, + "step": 19417 + }, + { + "epoch": 0.11548434675040442, + "grad_norm": 1.3071776628494263, + "learning_rate": 4.837279310936945e-05, + "loss": 4.7839, + "step": 19418 + }, + { + "epoch": 0.11549029403368541, + "grad_norm": 1.4331681728363037, + "learning_rate": 4.837262734119453e-05, + "loss": 4.7494, + "step": 19419 + }, + { + "epoch": 0.11549624131696641, + "grad_norm": 1.4209895133972168, + "learning_rate": 4.837246156486048e-05, + "loss": 4.8538, + "step": 19420 + }, + { + "epoch": 0.11550218860024741, + "grad_norm": 1.2397242784500122, + "learning_rate": 4.837229578036734e-05, + "loss": 4.7616, + "step": 19421 + }, + { + "epoch": 0.1155081358835284, + "grad_norm": 1.2271560430526733, + "learning_rate": 4.837212998771517e-05, + "loss": 4.7361, + "step": 19422 + }, + { + "epoch": 0.1155140831668094, + "grad_norm": 1.3334344625473022, + "learning_rate": 4.837196418690403e-05, + "loss": 4.8971, + "step": 19423 + }, + { + "epoch": 0.1155200304500904, + "grad_norm": 1.3195756673812866, + "learning_rate": 4.837179837793398e-05, + "loss": 4.8944, + "step": 19424 + }, + { + "epoch": 0.1155259777333714, + "grad_norm": 1.4583542346954346, + "learning_rate": 4.837163256080508e-05, + "loss": 4.7857, + "step": 19425 + }, + { + "epoch": 0.1155319250166524, + "grad_norm": 1.5155558586120605, + "learning_rate": 4.837146673551739e-05, + "loss": 4.7728, + "step": 19426 + }, + { + "epoch": 0.1155378722999334, + "grad_norm": 1.3582627773284912, + "learning_rate": 4.837130090207095e-05, + "loss": 4.7065, + "step": 19427 + }, + { + "epoch": 0.11554381958321439, + "grad_norm": 1.2635151147842407, + "learning_rate": 4.837113506046584e-05, + "loss": 4.882, + "step": 19428 + }, + { + "epoch": 0.11554976686649539, + "grad_norm": 1.417083501815796, + "learning_rate": 4.83709692107021e-05, + "loss": 4.8928, + "step": 19429 + }, + { + "epoch": 0.11555571414977638, + "grad_norm": 1.4780973196029663, + "learning_rate": 4.8370803352779806e-05, + "loss": 4.9458, + "step": 19430 + }, + { + "epoch": 0.11556166143305738, + "grad_norm": 1.2949103116989136, + "learning_rate": 4.8370637486699e-05, + "loss": 4.8753, + "step": 19431 + }, + { + "epoch": 0.11556760871633838, + "grad_norm": 1.4755308628082275, + "learning_rate": 4.8370471612459744e-05, + "loss": 4.7886, + "step": 19432 + }, + { + "epoch": 0.11557355599961937, + "grad_norm": 1.4527158737182617, + "learning_rate": 4.8370305730062095e-05, + "loss": 4.8442, + "step": 19433 + }, + { + "epoch": 0.11557950328290037, + "grad_norm": 1.3422110080718994, + "learning_rate": 4.8370139839506124e-05, + "loss": 4.9745, + "step": 19434 + }, + { + "epoch": 0.11558545056618137, + "grad_norm": 1.5843584537506104, + "learning_rate": 4.836997394079187e-05, + "loss": 4.8432, + "step": 19435 + }, + { + "epoch": 0.11559139784946236, + "grad_norm": 1.3267780542373657, + "learning_rate": 4.836980803391941e-05, + "loss": 4.7816, + "step": 19436 + }, + { + "epoch": 0.11559734513274336, + "grad_norm": 1.3092966079711914, + "learning_rate": 4.836964211888878e-05, + "loss": 5.0283, + "step": 19437 + }, + { + "epoch": 0.11560329241602436, + "grad_norm": 1.4653512239456177, + "learning_rate": 4.836947619570005e-05, + "loss": 4.9265, + "step": 19438 + }, + { + "epoch": 0.11560923969930535, + "grad_norm": 1.344672441482544, + "learning_rate": 4.836931026435328e-05, + "loss": 5.0426, + "step": 19439 + }, + { + "epoch": 0.11561518698258635, + "grad_norm": 1.3949403762817383, + "learning_rate": 4.836914432484853e-05, + "loss": 5.1539, + "step": 19440 + }, + { + "epoch": 0.11562113426586736, + "grad_norm": 1.3876662254333496, + "learning_rate": 4.836897837718585e-05, + "loss": 4.9346, + "step": 19441 + }, + { + "epoch": 0.11562708154914834, + "grad_norm": 1.3399412631988525, + "learning_rate": 4.83688124213653e-05, + "loss": 4.8688, + "step": 19442 + }, + { + "epoch": 0.11563302883242935, + "grad_norm": 1.3819881677627563, + "learning_rate": 4.836864645738694e-05, + "loss": 4.9527, + "step": 19443 + }, + { + "epoch": 0.11563897611571035, + "grad_norm": 1.509074091911316, + "learning_rate": 4.8368480485250825e-05, + "loss": 4.9273, + "step": 19444 + }, + { + "epoch": 0.11564492339899134, + "grad_norm": 1.2591453790664673, + "learning_rate": 4.836831450495701e-05, + "loss": 4.9065, + "step": 19445 + }, + { + "epoch": 0.11565087068227234, + "grad_norm": 1.4065910577774048, + "learning_rate": 4.836814851650557e-05, + "loss": 4.9699, + "step": 19446 + }, + { + "epoch": 0.11565681796555334, + "grad_norm": 1.3355581760406494, + "learning_rate": 4.836798251989655e-05, + "loss": 5.1639, + "step": 19447 + }, + { + "epoch": 0.11566276524883433, + "grad_norm": 1.3715496063232422, + "learning_rate": 4.836781651513e-05, + "loss": 4.855, + "step": 19448 + }, + { + "epoch": 0.11566871253211533, + "grad_norm": 1.569305658340454, + "learning_rate": 4.836765050220599e-05, + "loss": 4.6329, + "step": 19449 + }, + { + "epoch": 0.11567465981539633, + "grad_norm": 1.3613293170928955, + "learning_rate": 4.836748448112458e-05, + "loss": 4.9897, + "step": 19450 + }, + { + "epoch": 0.11568060709867732, + "grad_norm": 1.2653577327728271, + "learning_rate": 4.836731845188581e-05, + "loss": 4.9819, + "step": 19451 + }, + { + "epoch": 0.11568655438195832, + "grad_norm": 1.5030022859573364, + "learning_rate": 4.836715241448976e-05, + "loss": 4.8387, + "step": 19452 + }, + { + "epoch": 0.11569250166523933, + "grad_norm": 1.2560715675354004, + "learning_rate": 4.836698636893647e-05, + "loss": 5.0862, + "step": 19453 + }, + { + "epoch": 0.11569844894852031, + "grad_norm": 1.1981379985809326, + "learning_rate": 4.836682031522602e-05, + "loss": 4.7682, + "step": 19454 + }, + { + "epoch": 0.11570439623180132, + "grad_norm": 1.3572615385055542, + "learning_rate": 4.8366654253358444e-05, + "loss": 4.9008, + "step": 19455 + }, + { + "epoch": 0.11571034351508232, + "grad_norm": 1.2542002201080322, + "learning_rate": 4.8366488183333816e-05, + "loss": 4.911, + "step": 19456 + }, + { + "epoch": 0.1157162907983633, + "grad_norm": 1.4759174585342407, + "learning_rate": 4.8366322105152186e-05, + "loss": 4.789, + "step": 19457 + }, + { + "epoch": 0.11572223808164431, + "grad_norm": 1.2307411432266235, + "learning_rate": 4.8366156018813616e-05, + "loss": 4.9556, + "step": 19458 + }, + { + "epoch": 0.1157281853649253, + "grad_norm": 1.240334153175354, + "learning_rate": 4.836598992431816e-05, + "loss": 4.9996, + "step": 19459 + }, + { + "epoch": 0.1157341326482063, + "grad_norm": 1.3100368976593018, + "learning_rate": 4.8365823821665876e-05, + "loss": 5.0693, + "step": 19460 + }, + { + "epoch": 0.1157400799314873, + "grad_norm": 1.0904709100723267, + "learning_rate": 4.8365657710856835e-05, + "loss": 5.0327, + "step": 19461 + }, + { + "epoch": 0.11574602721476829, + "grad_norm": 1.3847914934158325, + "learning_rate": 4.836549159189108e-05, + "loss": 5.0512, + "step": 19462 + }, + { + "epoch": 0.11575197449804929, + "grad_norm": 1.2307064533233643, + "learning_rate": 4.836532546476866e-05, + "loss": 5.0687, + "step": 19463 + }, + { + "epoch": 0.11575792178133029, + "grad_norm": 1.3900285959243774, + "learning_rate": 4.836515932948966e-05, + "loss": 5.1044, + "step": 19464 + }, + { + "epoch": 0.11576386906461128, + "grad_norm": 1.2194246053695679, + "learning_rate": 4.836499318605412e-05, + "loss": 5.0412, + "step": 19465 + }, + { + "epoch": 0.11576981634789228, + "grad_norm": 1.3460240364074707, + "learning_rate": 4.83648270344621e-05, + "loss": 5.14, + "step": 19466 + }, + { + "epoch": 0.11577576363117328, + "grad_norm": 1.2739115953445435, + "learning_rate": 4.8364660874713664e-05, + "loss": 5.0782, + "step": 19467 + }, + { + "epoch": 0.11578171091445427, + "grad_norm": 1.987092137336731, + "learning_rate": 4.836449470680887e-05, + "loss": 4.8106, + "step": 19468 + }, + { + "epoch": 0.11578765819773527, + "grad_norm": 1.3820792436599731, + "learning_rate": 4.8364328530747765e-05, + "loss": 5.3549, + "step": 19469 + }, + { + "epoch": 0.11579360548101628, + "grad_norm": 1.5276916027069092, + "learning_rate": 4.836416234653042e-05, + "loss": 5.3479, + "step": 19470 + }, + { + "epoch": 0.11579955276429726, + "grad_norm": 1.5292818546295166, + "learning_rate": 4.836399615415688e-05, + "loss": 5.2627, + "step": 19471 + }, + { + "epoch": 0.11580550004757827, + "grad_norm": 1.5759434700012207, + "learning_rate": 4.836382995362722e-05, + "loss": 5.2925, + "step": 19472 + }, + { + "epoch": 0.11581144733085927, + "grad_norm": 1.3807876110076904, + "learning_rate": 4.836366374494148e-05, + "loss": 5.0794, + "step": 19473 + }, + { + "epoch": 0.11581739461414026, + "grad_norm": 1.3631199598312378, + "learning_rate": 4.836349752809973e-05, + "loss": 5.0606, + "step": 19474 + }, + { + "epoch": 0.11582334189742126, + "grad_norm": 1.5250667333602905, + "learning_rate": 4.836333130310202e-05, + "loss": 5.1799, + "step": 19475 + }, + { + "epoch": 0.11582928918070226, + "grad_norm": 1.4191410541534424, + "learning_rate": 4.836316506994842e-05, + "loss": 5.2812, + "step": 19476 + }, + { + "epoch": 0.11583523646398325, + "grad_norm": 1.5502076148986816, + "learning_rate": 4.8362998828638975e-05, + "loss": 5.3503, + "step": 19477 + }, + { + "epoch": 0.11584118374726425, + "grad_norm": 1.441786766052246, + "learning_rate": 4.836283257917375e-05, + "loss": 5.1526, + "step": 19478 + }, + { + "epoch": 0.11584713103054525, + "grad_norm": 1.3994730710983276, + "learning_rate": 4.83626663215528e-05, + "loss": 5.1969, + "step": 19479 + }, + { + "epoch": 0.11585307831382624, + "grad_norm": 1.5141762495040894, + "learning_rate": 4.836250005577619e-05, + "loss": 5.099, + "step": 19480 + }, + { + "epoch": 0.11585902559710724, + "grad_norm": 1.4504029750823975, + "learning_rate": 4.836233378184397e-05, + "loss": 5.5225, + "step": 19481 + }, + { + "epoch": 0.11586497288038825, + "grad_norm": 1.3617264032363892, + "learning_rate": 4.8362167499756194e-05, + "loss": 5.3426, + "step": 19482 + }, + { + "epoch": 0.11587092016366923, + "grad_norm": 1.3681023120880127, + "learning_rate": 4.8362001209512934e-05, + "loss": 5.3476, + "step": 19483 + }, + { + "epoch": 0.11587686744695024, + "grad_norm": 1.050550937652588, + "learning_rate": 4.836183491111424e-05, + "loss": 5.1338, + "step": 19484 + }, + { + "epoch": 0.11588281473023124, + "grad_norm": 1.386715054512024, + "learning_rate": 4.836166860456017e-05, + "loss": 5.2761, + "step": 19485 + }, + { + "epoch": 0.11588876201351223, + "grad_norm": 1.2128262519836426, + "learning_rate": 4.836150228985078e-05, + "loss": 5.165, + "step": 19486 + }, + { + "epoch": 0.11589470929679323, + "grad_norm": 1.224721074104309, + "learning_rate": 4.836133596698614e-05, + "loss": 5.1631, + "step": 19487 + }, + { + "epoch": 0.11590065658007422, + "grad_norm": 1.2348668575286865, + "learning_rate": 4.8361169635966285e-05, + "loss": 5.3206, + "step": 19488 + }, + { + "epoch": 0.11590660386335522, + "grad_norm": 1.1665185689926147, + "learning_rate": 4.836100329679129e-05, + "loss": 5.3162, + "step": 19489 + }, + { + "epoch": 0.11591255114663622, + "grad_norm": 1.2063257694244385, + "learning_rate": 4.836083694946122e-05, + "loss": 5.0348, + "step": 19490 + }, + { + "epoch": 0.11591849842991721, + "grad_norm": 1.5199745893478394, + "learning_rate": 4.836067059397612e-05, + "loss": 5.0793, + "step": 19491 + }, + { + "epoch": 0.11592444571319821, + "grad_norm": 1.2285770177841187, + "learning_rate": 4.8360504230336044e-05, + "loss": 5.1478, + "step": 19492 + }, + { + "epoch": 0.11593039299647921, + "grad_norm": 1.3429020643234253, + "learning_rate": 4.836033785854107e-05, + "loss": 5.3225, + "step": 19493 + }, + { + "epoch": 0.1159363402797602, + "grad_norm": 1.3870415687561035, + "learning_rate": 4.836017147859123e-05, + "loss": 5.2711, + "step": 19494 + }, + { + "epoch": 0.1159422875630412, + "grad_norm": 1.3311539888381958, + "learning_rate": 4.8360005090486603e-05, + "loss": 5.1778, + "step": 19495 + }, + { + "epoch": 0.1159482348463222, + "grad_norm": 1.1331884860992432, + "learning_rate": 4.8359838694227236e-05, + "loss": 5.1435, + "step": 19496 + }, + { + "epoch": 0.11595418212960319, + "grad_norm": 1.427506685256958, + "learning_rate": 4.83596722898132e-05, + "loss": 5.2153, + "step": 19497 + }, + { + "epoch": 0.1159601294128842, + "grad_norm": 1.4716016054153442, + "learning_rate": 4.835950587724453e-05, + "loss": 4.9599, + "step": 19498 + }, + { + "epoch": 0.1159660766961652, + "grad_norm": 1.073724389076233, + "learning_rate": 4.8359339456521305e-05, + "loss": 5.3481, + "step": 19499 + }, + { + "epoch": 0.11597202397944618, + "grad_norm": 1.1965457201004028, + "learning_rate": 4.835917302764358e-05, + "loss": 5.128, + "step": 19500 + }, + { + "epoch": 0.11597797126272719, + "grad_norm": 1.2589031457901, + "learning_rate": 4.83590065906114e-05, + "loss": 5.1952, + "step": 19501 + }, + { + "epoch": 0.11598391854600819, + "grad_norm": 1.5062520503997803, + "learning_rate": 4.8358840145424835e-05, + "loss": 5.3431, + "step": 19502 + }, + { + "epoch": 0.11598986582928918, + "grad_norm": 1.3464981317520142, + "learning_rate": 4.8358673692083944e-05, + "loss": 5.187, + "step": 19503 + }, + { + "epoch": 0.11599581311257018, + "grad_norm": 1.195157766342163, + "learning_rate": 4.8358507230588776e-05, + "loss": 5.4018, + "step": 19504 + }, + { + "epoch": 0.11600176039585118, + "grad_norm": 1.185371994972229, + "learning_rate": 4.83583407609394e-05, + "loss": 5.3204, + "step": 19505 + }, + { + "epoch": 0.11600770767913217, + "grad_norm": 1.1011184453964233, + "learning_rate": 4.835817428313586e-05, + "loss": 5.2426, + "step": 19506 + }, + { + "epoch": 0.11601365496241317, + "grad_norm": 1.2706186771392822, + "learning_rate": 4.835800779717823e-05, + "loss": 5.3277, + "step": 19507 + }, + { + "epoch": 0.11601960224569417, + "grad_norm": 1.23444664478302, + "learning_rate": 4.8357841303066564e-05, + "loss": 5.304, + "step": 19508 + }, + { + "epoch": 0.11602554952897516, + "grad_norm": 1.3166215419769287, + "learning_rate": 4.8357674800800915e-05, + "loss": 5.1755, + "step": 19509 + }, + { + "epoch": 0.11603149681225616, + "grad_norm": 1.0634559392929077, + "learning_rate": 4.835750829038134e-05, + "loss": 5.2188, + "step": 19510 + }, + { + "epoch": 0.11603744409553716, + "grad_norm": 1.0847052335739136, + "learning_rate": 4.8357341771807894e-05, + "loss": 5.1993, + "step": 19511 + }, + { + "epoch": 0.11604339137881815, + "grad_norm": 1.2893394231796265, + "learning_rate": 4.8357175245080645e-05, + "loss": 5.278, + "step": 19512 + }, + { + "epoch": 0.11604933866209916, + "grad_norm": 1.1346744298934937, + "learning_rate": 4.8357008710199653e-05, + "loss": 5.0915, + "step": 19513 + }, + { + "epoch": 0.11605528594538016, + "grad_norm": 1.2405723333358765, + "learning_rate": 4.835684216716497e-05, + "loss": 5.3274, + "step": 19514 + }, + { + "epoch": 0.11606123322866115, + "grad_norm": 1.2367215156555176, + "learning_rate": 4.8356675615976646e-05, + "loss": 5.3145, + "step": 19515 + }, + { + "epoch": 0.11606718051194215, + "grad_norm": 1.23695969581604, + "learning_rate": 4.835650905663476e-05, + "loss": 5.1454, + "step": 19516 + }, + { + "epoch": 0.11607312779522314, + "grad_norm": 1.649644136428833, + "learning_rate": 4.835634248913935e-05, + "loss": 4.9684, + "step": 19517 + }, + { + "epoch": 0.11607907507850414, + "grad_norm": 1.3828257322311401, + "learning_rate": 4.835617591349049e-05, + "loss": 4.8913, + "step": 19518 + }, + { + "epoch": 0.11608502236178514, + "grad_norm": 1.4446587562561035, + "learning_rate": 4.8356009329688215e-05, + "loss": 4.9248, + "step": 19519 + }, + { + "epoch": 0.11609096964506613, + "grad_norm": 1.4149401187896729, + "learning_rate": 4.835584273773261e-05, + "loss": 5.0446, + "step": 19520 + }, + { + "epoch": 0.11609691692834713, + "grad_norm": 1.4073368310928345, + "learning_rate": 4.835567613762372e-05, + "loss": 5.1451, + "step": 19521 + }, + { + "epoch": 0.11610286421162813, + "grad_norm": 1.438539743423462, + "learning_rate": 4.835550952936161e-05, + "loss": 5.3629, + "step": 19522 + }, + { + "epoch": 0.11610881149490912, + "grad_norm": 1.4686654806137085, + "learning_rate": 4.835534291294632e-05, + "loss": 5.4386, + "step": 19523 + }, + { + "epoch": 0.11611475877819012, + "grad_norm": 1.3416131734848022, + "learning_rate": 4.835517628837793e-05, + "loss": 5.4625, + "step": 19524 + }, + { + "epoch": 0.11612070606147112, + "grad_norm": 1.38942551612854, + "learning_rate": 4.835500965565649e-05, + "loss": 5.2164, + "step": 19525 + }, + { + "epoch": 0.11612665334475211, + "grad_norm": 1.157583475112915, + "learning_rate": 4.835484301478205e-05, + "loss": 4.931, + "step": 19526 + }, + { + "epoch": 0.11613260062803311, + "grad_norm": 1.1182529926300049, + "learning_rate": 4.835467636575468e-05, + "loss": 5.0804, + "step": 19527 + }, + { + "epoch": 0.11613854791131412, + "grad_norm": 1.1087690591812134, + "learning_rate": 4.835450970857444e-05, + "loss": 4.9112, + "step": 19528 + }, + { + "epoch": 0.1161444951945951, + "grad_norm": 1.1217858791351318, + "learning_rate": 4.8354343043241374e-05, + "loss": 4.8775, + "step": 19529 + }, + { + "epoch": 0.1161504424778761, + "grad_norm": 1.703722596168518, + "learning_rate": 4.8354176369755556e-05, + "loss": 5.0991, + "step": 19530 + }, + { + "epoch": 0.11615638976115711, + "grad_norm": 1.5027599334716797, + "learning_rate": 4.8354009688117026e-05, + "loss": 5.3486, + "step": 19531 + }, + { + "epoch": 0.1161623370444381, + "grad_norm": 1.3976017236709595, + "learning_rate": 4.835384299832586e-05, + "loss": 5.3045, + "step": 19532 + }, + { + "epoch": 0.1161682843277191, + "grad_norm": 1.4341175556182861, + "learning_rate": 4.83536763003821e-05, + "loss": 5.2463, + "step": 19533 + }, + { + "epoch": 0.1161742316110001, + "grad_norm": 1.248632550239563, + "learning_rate": 4.835350959428582e-05, + "loss": 5.1573, + "step": 19534 + }, + { + "epoch": 0.11618017889428109, + "grad_norm": 1.2873725891113281, + "learning_rate": 4.835334288003707e-05, + "loss": 5.3115, + "step": 19535 + }, + { + "epoch": 0.11618612617756209, + "grad_norm": 1.4359512329101562, + "learning_rate": 4.835317615763591e-05, + "loss": 5.1134, + "step": 19536 + }, + { + "epoch": 0.11619207346084309, + "grad_norm": 1.3092215061187744, + "learning_rate": 4.8353009427082395e-05, + "loss": 5.2955, + "step": 19537 + }, + { + "epoch": 0.11619802074412408, + "grad_norm": 1.292256474494934, + "learning_rate": 4.8352842688376585e-05, + "loss": 5.2163, + "step": 19538 + }, + { + "epoch": 0.11620396802740508, + "grad_norm": 1.2327983379364014, + "learning_rate": 4.8352675941518545e-05, + "loss": 5.2785, + "step": 19539 + }, + { + "epoch": 0.11620991531068608, + "grad_norm": 1.3402459621429443, + "learning_rate": 4.835250918650832e-05, + "loss": 5.2474, + "step": 19540 + }, + { + "epoch": 0.11621586259396707, + "grad_norm": 1.4312702417373657, + "learning_rate": 4.835234242334598e-05, + "loss": 5.1451, + "step": 19541 + }, + { + "epoch": 0.11622180987724808, + "grad_norm": 1.4165308475494385, + "learning_rate": 4.8352175652031576e-05, + "loss": 5.2241, + "step": 19542 + }, + { + "epoch": 0.11622775716052908, + "grad_norm": 1.1984010934829712, + "learning_rate": 4.835200887256517e-05, + "loss": 5.2084, + "step": 19543 + }, + { + "epoch": 0.11623370444381007, + "grad_norm": 1.277029275894165, + "learning_rate": 4.835184208494682e-05, + "loss": 5.1136, + "step": 19544 + }, + { + "epoch": 0.11623965172709107, + "grad_norm": 1.4002219438552856, + "learning_rate": 4.8351675289176586e-05, + "loss": 5.1313, + "step": 19545 + }, + { + "epoch": 0.11624559901037206, + "grad_norm": 1.397129774093628, + "learning_rate": 4.835150848525452e-05, + "loss": 5.2001, + "step": 19546 + }, + { + "epoch": 0.11625154629365306, + "grad_norm": 1.3968653678894043, + "learning_rate": 4.8351341673180686e-05, + "loss": 5.1292, + "step": 19547 + }, + { + "epoch": 0.11625749357693406, + "grad_norm": 1.298600435256958, + "learning_rate": 4.8351174852955125e-05, + "loss": 5.1185, + "step": 19548 + }, + { + "epoch": 0.11626344086021505, + "grad_norm": 1.119382619857788, + "learning_rate": 4.835100802457793e-05, + "loss": 5.2052, + "step": 19549 + }, + { + "epoch": 0.11626938814349605, + "grad_norm": 1.2555358409881592, + "learning_rate": 4.835084118804913e-05, + "loss": 5.2604, + "step": 19550 + }, + { + "epoch": 0.11627533542677705, + "grad_norm": 1.293525218963623, + "learning_rate": 4.835067434336879e-05, + "loss": 5.1402, + "step": 19551 + }, + { + "epoch": 0.11628128271005804, + "grad_norm": 1.3321988582611084, + "learning_rate": 4.8350507490536976e-05, + "loss": 5.0959, + "step": 19552 + }, + { + "epoch": 0.11628722999333904, + "grad_norm": 1.3231252431869507, + "learning_rate": 4.835034062955374e-05, + "loss": 5.0461, + "step": 19553 + }, + { + "epoch": 0.11629317727662004, + "grad_norm": 1.2743831872940063, + "learning_rate": 4.835017376041914e-05, + "loss": 5.1215, + "step": 19554 + }, + { + "epoch": 0.11629912455990103, + "grad_norm": 1.3750208616256714, + "learning_rate": 4.835000688313323e-05, + "loss": 5.0459, + "step": 19555 + }, + { + "epoch": 0.11630507184318203, + "grad_norm": 1.394209861755371, + "learning_rate": 4.834983999769609e-05, + "loss": 5.1577, + "step": 19556 + }, + { + "epoch": 0.11631101912646304, + "grad_norm": 1.2393178939819336, + "learning_rate": 4.834967310410775e-05, + "loss": 5.1217, + "step": 19557 + }, + { + "epoch": 0.11631696640974402, + "grad_norm": 1.2668427228927612, + "learning_rate": 4.834950620236829e-05, + "loss": 5.0266, + "step": 19558 + }, + { + "epoch": 0.11632291369302503, + "grad_norm": 1.4088828563690186, + "learning_rate": 4.834933929247775e-05, + "loss": 4.8089, + "step": 19559 + }, + { + "epoch": 0.11632886097630603, + "grad_norm": 1.2668780088424683, + "learning_rate": 4.83491723744362e-05, + "loss": 5.2791, + "step": 19560 + }, + { + "epoch": 0.11633480825958702, + "grad_norm": 1.3243741989135742, + "learning_rate": 4.834900544824369e-05, + "loss": 5.1743, + "step": 19561 + }, + { + "epoch": 0.11634075554286802, + "grad_norm": 1.497856616973877, + "learning_rate": 4.834883851390029e-05, + "loss": 4.8667, + "step": 19562 + }, + { + "epoch": 0.11634670282614902, + "grad_norm": 1.426867961883545, + "learning_rate": 4.834867157140605e-05, + "loss": 4.9758, + "step": 19563 + }, + { + "epoch": 0.11635265010943001, + "grad_norm": 1.4427236318588257, + "learning_rate": 4.834850462076103e-05, + "loss": 5.45, + "step": 19564 + }, + { + "epoch": 0.11635859739271101, + "grad_norm": 1.4465901851654053, + "learning_rate": 4.834833766196528e-05, + "loss": 5.0877, + "step": 19565 + }, + { + "epoch": 0.11636454467599201, + "grad_norm": 1.76282799243927, + "learning_rate": 4.834817069501888e-05, + "loss": 5.0607, + "step": 19566 + }, + { + "epoch": 0.116370491959273, + "grad_norm": 1.4688469171524048, + "learning_rate": 4.8348003719921864e-05, + "loss": 4.9929, + "step": 19567 + }, + { + "epoch": 0.116376439242554, + "grad_norm": 1.576390266418457, + "learning_rate": 4.834783673667431e-05, + "loss": 5.7283, + "step": 19568 + }, + { + "epoch": 0.116382386525835, + "grad_norm": 1.517745852470398, + "learning_rate": 4.834766974527626e-05, + "loss": 5.3711, + "step": 19569 + }, + { + "epoch": 0.11638833380911599, + "grad_norm": 1.5122108459472656, + "learning_rate": 4.834750274572778e-05, + "loss": 5.6297, + "step": 19570 + }, + { + "epoch": 0.116394281092397, + "grad_norm": 1.9188055992126465, + "learning_rate": 4.8347335738028934e-05, + "loss": 5.0911, + "step": 19571 + }, + { + "epoch": 0.116400228375678, + "grad_norm": 1.7408324480056763, + "learning_rate": 4.834716872217977e-05, + "loss": 5.1396, + "step": 19572 + }, + { + "epoch": 0.11640617565895899, + "grad_norm": 1.7669044733047485, + "learning_rate": 4.834700169818035e-05, + "loss": 5.1463, + "step": 19573 + }, + { + "epoch": 0.11641212294223999, + "grad_norm": 1.7838845252990723, + "learning_rate": 4.834683466603074e-05, + "loss": 5.3486, + "step": 19574 + }, + { + "epoch": 0.11641807022552098, + "grad_norm": 1.8427141904830933, + "learning_rate": 4.834666762573098e-05, + "loss": 5.1454, + "step": 19575 + }, + { + "epoch": 0.11642401750880198, + "grad_norm": 1.8620864152908325, + "learning_rate": 4.8346500577281145e-05, + "loss": 4.9462, + "step": 19576 + }, + { + "epoch": 0.11642996479208298, + "grad_norm": 1.7334544658660889, + "learning_rate": 4.834633352068129e-05, + "loss": 4.9012, + "step": 19577 + }, + { + "epoch": 0.11643591207536397, + "grad_norm": 1.7202188968658447, + "learning_rate": 4.834616645593147e-05, + "loss": 5.2577, + "step": 19578 + }, + { + "epoch": 0.11644185935864497, + "grad_norm": 1.5666993856430054, + "learning_rate": 4.834599938303174e-05, + "loss": 4.9502, + "step": 19579 + }, + { + "epoch": 0.11644780664192597, + "grad_norm": 1.5880829095840454, + "learning_rate": 4.834583230198217e-05, + "loss": 5.1193, + "step": 19580 + }, + { + "epoch": 0.11645375392520696, + "grad_norm": 1.7851444482803345, + "learning_rate": 4.834566521278281e-05, + "loss": 5.1411, + "step": 19581 + }, + { + "epoch": 0.11645970120848796, + "grad_norm": 1.8817992210388184, + "learning_rate": 4.834549811543371e-05, + "loss": 5.1773, + "step": 19582 + }, + { + "epoch": 0.11646564849176896, + "grad_norm": 1.8055325746536255, + "learning_rate": 4.834533100993495e-05, + "loss": 4.8526, + "step": 19583 + }, + { + "epoch": 0.11647159577504995, + "grad_norm": 1.501705527305603, + "learning_rate": 4.834516389628657e-05, + "loss": 4.9943, + "step": 19584 + }, + { + "epoch": 0.11647754305833095, + "grad_norm": 1.8224765062332153, + "learning_rate": 4.8344996774488635e-05, + "loss": 5.3321, + "step": 19585 + }, + { + "epoch": 0.11648349034161196, + "grad_norm": 1.7806826829910278, + "learning_rate": 4.83448296445412e-05, + "loss": 5.1565, + "step": 19586 + }, + { + "epoch": 0.11648943762489294, + "grad_norm": 1.64619779586792, + "learning_rate": 4.8344662506444334e-05, + "loss": 4.9259, + "step": 19587 + }, + { + "epoch": 0.11649538490817395, + "grad_norm": 1.7176555395126343, + "learning_rate": 4.834449536019808e-05, + "loss": 4.9173, + "step": 19588 + }, + { + "epoch": 0.11650133219145495, + "grad_norm": 1.7485530376434326, + "learning_rate": 4.834432820580251e-05, + "loss": 4.9548, + "step": 19589 + }, + { + "epoch": 0.11650727947473594, + "grad_norm": 1.8407695293426514, + "learning_rate": 4.834416104325767e-05, + "loss": 5.5323, + "step": 19590 + }, + { + "epoch": 0.11651322675801694, + "grad_norm": 1.37450110912323, + "learning_rate": 4.834399387256363e-05, + "loss": 5.0058, + "step": 19591 + }, + { + "epoch": 0.11651917404129794, + "grad_norm": 1.6784085035324097, + "learning_rate": 4.834382669372044e-05, + "loss": 5.0886, + "step": 19592 + }, + { + "epoch": 0.11652512132457893, + "grad_norm": 1.9228695631027222, + "learning_rate": 4.834365950672816e-05, + "loss": 5.5382, + "step": 19593 + }, + { + "epoch": 0.11653106860785993, + "grad_norm": 1.7998968362808228, + "learning_rate": 4.834349231158685e-05, + "loss": 5.3286, + "step": 19594 + }, + { + "epoch": 0.11653701589114093, + "grad_norm": 1.9077783823013306, + "learning_rate": 4.8343325108296574e-05, + "loss": 4.9033, + "step": 19595 + }, + { + "epoch": 0.11654296317442192, + "grad_norm": 1.3677197694778442, + "learning_rate": 4.834315789685738e-05, + "loss": 5.4146, + "step": 19596 + }, + { + "epoch": 0.11654891045770292, + "grad_norm": 1.5490330457687378, + "learning_rate": 4.834299067726933e-05, + "loss": 5.8435, + "step": 19597 + }, + { + "epoch": 0.11655485774098392, + "grad_norm": 1.7260395288467407, + "learning_rate": 4.8342823449532484e-05, + "loss": 4.9687, + "step": 19598 + }, + { + "epoch": 0.11656080502426491, + "grad_norm": 1.5140855312347412, + "learning_rate": 4.83426562136469e-05, + "loss": 4.8185, + "step": 19599 + }, + { + "epoch": 0.11656675230754591, + "grad_norm": 1.7183781862258911, + "learning_rate": 4.834248896961263e-05, + "loss": 4.954, + "step": 19600 + }, + { + "epoch": 0.11657269959082692, + "grad_norm": 1.3909941911697388, + "learning_rate": 4.834232171742975e-05, + "loss": 5.3393, + "step": 19601 + }, + { + "epoch": 0.1165786468741079, + "grad_norm": 1.437046766281128, + "learning_rate": 4.83421544570983e-05, + "loss": 5.5486, + "step": 19602 + }, + { + "epoch": 0.11658459415738891, + "grad_norm": 1.4513304233551025, + "learning_rate": 4.8341987188618344e-05, + "loss": 5.6754, + "step": 19603 + }, + { + "epoch": 0.1165905414406699, + "grad_norm": 1.7366830110549927, + "learning_rate": 4.8341819911989936e-05, + "loss": 5.5651, + "step": 19604 + }, + { + "epoch": 0.1165964887239509, + "grad_norm": 1.7084081172943115, + "learning_rate": 4.834165262721315e-05, + "loss": 5.5237, + "step": 19605 + }, + { + "epoch": 0.1166024360072319, + "grad_norm": 1.588749647140503, + "learning_rate": 4.834148533428803e-05, + "loss": 5.5371, + "step": 19606 + }, + { + "epoch": 0.11660838329051289, + "grad_norm": 1.6907262802124023, + "learning_rate": 4.834131803321464e-05, + "loss": 5.3998, + "step": 19607 + }, + { + "epoch": 0.11661433057379389, + "grad_norm": 1.676530122756958, + "learning_rate": 4.834115072399304e-05, + "loss": 5.1636, + "step": 19608 + }, + { + "epoch": 0.11662027785707489, + "grad_norm": 1.6379070281982422, + "learning_rate": 4.834098340662327e-05, + "loss": 5.4196, + "step": 19609 + }, + { + "epoch": 0.11662622514035588, + "grad_norm": 1.6794102191925049, + "learning_rate": 4.8340816081105424e-05, + "loss": 5.3671, + "step": 19610 + }, + { + "epoch": 0.11663217242363688, + "grad_norm": 1.7833147048950195, + "learning_rate": 4.834064874743953e-05, + "loss": 5.3417, + "step": 19611 + }, + { + "epoch": 0.11663811970691788, + "grad_norm": 1.649409532546997, + "learning_rate": 4.834048140562566e-05, + "loss": 5.2781, + "step": 19612 + }, + { + "epoch": 0.11664406699019887, + "grad_norm": 1.6082829236984253, + "learning_rate": 4.834031405566387e-05, + "loss": 5.1188, + "step": 19613 + }, + { + "epoch": 0.11665001427347987, + "grad_norm": 1.6651804447174072, + "learning_rate": 4.834014669755421e-05, + "loss": 5.1683, + "step": 19614 + }, + { + "epoch": 0.11665596155676088, + "grad_norm": 1.715795636177063, + "learning_rate": 4.8339979331296755e-05, + "loss": 5.2491, + "step": 19615 + }, + { + "epoch": 0.11666190884004186, + "grad_norm": 1.6809749603271484, + "learning_rate": 4.8339811956891546e-05, + "loss": 5.0614, + "step": 19616 + }, + { + "epoch": 0.11666785612332287, + "grad_norm": 1.563790202140808, + "learning_rate": 4.833964457433865e-05, + "loss": 5.231, + "step": 19617 + }, + { + "epoch": 0.11667380340660387, + "grad_norm": 1.464647650718689, + "learning_rate": 4.8339477183638136e-05, + "loss": 5.0405, + "step": 19618 + }, + { + "epoch": 0.11667975068988486, + "grad_norm": 1.989701509475708, + "learning_rate": 4.8339309784790043e-05, + "loss": 5.4454, + "step": 19619 + }, + { + "epoch": 0.11668569797316586, + "grad_norm": 2.438558340072632, + "learning_rate": 4.833914237779444e-05, + "loss": 5.7298, + "step": 19620 + }, + { + "epoch": 0.11669164525644686, + "grad_norm": 1.7590994834899902, + "learning_rate": 4.833897496265139e-05, + "loss": 5.4473, + "step": 19621 + }, + { + "epoch": 0.11669759253972785, + "grad_norm": 2.1040074825286865, + "learning_rate": 4.833880753936093e-05, + "loss": 5.2399, + "step": 19622 + }, + { + "epoch": 0.11670353982300885, + "grad_norm": 1.7136433124542236, + "learning_rate": 4.8338640107923146e-05, + "loss": 5.21, + "step": 19623 + }, + { + "epoch": 0.11670948710628985, + "grad_norm": 1.5797784328460693, + "learning_rate": 4.8338472668338074e-05, + "loss": 5.3555, + "step": 19624 + }, + { + "epoch": 0.11671543438957084, + "grad_norm": 1.512645959854126, + "learning_rate": 4.833830522060579e-05, + "loss": 5.4964, + "step": 19625 + }, + { + "epoch": 0.11672138167285184, + "grad_norm": 1.9328651428222656, + "learning_rate": 4.833813776472634e-05, + "loss": 5.9072, + "step": 19626 + }, + { + "epoch": 0.11672732895613284, + "grad_norm": 1.882068395614624, + "learning_rate": 4.8337970300699795e-05, + "loss": 5.4304, + "step": 19627 + }, + { + "epoch": 0.11673327623941383, + "grad_norm": 2.1347815990448, + "learning_rate": 4.83378028285262e-05, + "loss": 5.1286, + "step": 19628 + }, + { + "epoch": 0.11673922352269483, + "grad_norm": 2.0237247943878174, + "learning_rate": 4.833763534820562e-05, + "loss": 5.113, + "step": 19629 + }, + { + "epoch": 0.11674517080597584, + "grad_norm": 1.5656205415725708, + "learning_rate": 4.833746785973811e-05, + "loss": 4.8452, + "step": 19630 + }, + { + "epoch": 0.11675111808925683, + "grad_norm": 2.268324613571167, + "learning_rate": 4.833730036312374e-05, + "loss": 5.7184, + "step": 19631 + }, + { + "epoch": 0.11675706537253783, + "grad_norm": 2.1705756187438965, + "learning_rate": 4.833713285836255e-05, + "loss": 5.6489, + "step": 19632 + }, + { + "epoch": 0.11676301265581882, + "grad_norm": 1.7976182699203491, + "learning_rate": 4.833696534545461e-05, + "loss": 5.7016, + "step": 19633 + }, + { + "epoch": 0.11676895993909982, + "grad_norm": 1.2853381633758545, + "learning_rate": 4.8336797824399976e-05, + "loss": 5.654, + "step": 19634 + }, + { + "epoch": 0.11677490722238082, + "grad_norm": 1.8741413354873657, + "learning_rate": 4.833663029519871e-05, + "loss": 5.6735, + "step": 19635 + }, + { + "epoch": 0.11678085450566181, + "grad_norm": 1.4911704063415527, + "learning_rate": 4.8336462757850864e-05, + "loss": 5.3877, + "step": 19636 + }, + { + "epoch": 0.11678680178894281, + "grad_norm": 1.7979151010513306, + "learning_rate": 4.8336295212356506e-05, + "loss": 5.5677, + "step": 19637 + }, + { + "epoch": 0.11679274907222381, + "grad_norm": 2.036970376968384, + "learning_rate": 4.8336127658715677e-05, + "loss": 5.4768, + "step": 19638 + }, + { + "epoch": 0.1167986963555048, + "grad_norm": 1.9423377513885498, + "learning_rate": 4.833596009692846e-05, + "loss": 5.4021, + "step": 19639 + }, + { + "epoch": 0.1168046436387858, + "grad_norm": 1.5860786437988281, + "learning_rate": 4.8335792526994894e-05, + "loss": 5.3363, + "step": 19640 + }, + { + "epoch": 0.1168105909220668, + "grad_norm": 1.5712209939956665, + "learning_rate": 4.833562494891504e-05, + "loss": 5.432, + "step": 19641 + }, + { + "epoch": 0.11681653820534779, + "grad_norm": 1.3889914751052856, + "learning_rate": 4.833545736268897e-05, + "loss": 5.3272, + "step": 19642 + }, + { + "epoch": 0.1168224854886288, + "grad_norm": 1.607134461402893, + "learning_rate": 4.8335289768316726e-05, + "loss": 5.9617, + "step": 19643 + }, + { + "epoch": 0.1168284327719098, + "grad_norm": 1.6738252639770508, + "learning_rate": 4.8335122165798376e-05, + "loss": 5.6361, + "step": 19644 + }, + { + "epoch": 0.11683438005519078, + "grad_norm": 1.6006174087524414, + "learning_rate": 4.8334954555133974e-05, + "loss": 5.7384, + "step": 19645 + }, + { + "epoch": 0.11684032733847179, + "grad_norm": 1.7018747329711914, + "learning_rate": 4.833478693632358e-05, + "loss": 5.0784, + "step": 19646 + }, + { + "epoch": 0.11684627462175279, + "grad_norm": 1.7542921304702759, + "learning_rate": 4.833461930936726e-05, + "loss": 5.2674, + "step": 19647 + }, + { + "epoch": 0.11685222190503378, + "grad_norm": 1.6434245109558105, + "learning_rate": 4.8334451674265055e-05, + "loss": 4.7117, + "step": 19648 + }, + { + "epoch": 0.11685816918831478, + "grad_norm": 1.7878485918045044, + "learning_rate": 4.8334284031017044e-05, + "loss": 4.8068, + "step": 19649 + }, + { + "epoch": 0.11686411647159578, + "grad_norm": 1.7029922008514404, + "learning_rate": 4.833411637962327e-05, + "loss": 4.9168, + "step": 19650 + }, + { + "epoch": 0.11687006375487677, + "grad_norm": 1.8004266023635864, + "learning_rate": 4.83339487200838e-05, + "loss": 4.9931, + "step": 19651 + }, + { + "epoch": 0.11687601103815777, + "grad_norm": 1.7843881845474243, + "learning_rate": 4.833378105239869e-05, + "loss": 5.0786, + "step": 19652 + }, + { + "epoch": 0.11688195832143877, + "grad_norm": 1.697993278503418, + "learning_rate": 4.833361337656799e-05, + "loss": 5.188, + "step": 19653 + }, + { + "epoch": 0.11688790560471976, + "grad_norm": 1.8484392166137695, + "learning_rate": 4.833344569259177e-05, + "loss": 5.4858, + "step": 19654 + }, + { + "epoch": 0.11689385288800076, + "grad_norm": 1.6850509643554688, + "learning_rate": 4.833327800047009e-05, + "loss": 5.7946, + "step": 19655 + }, + { + "epoch": 0.11689980017128176, + "grad_norm": 1.709845781326294, + "learning_rate": 4.8333110300203e-05, + "loss": 6.0674, + "step": 19656 + }, + { + "epoch": 0.11690574745456275, + "grad_norm": 1.6634660959243774, + "learning_rate": 4.833294259179057e-05, + "loss": 5.8038, + "step": 19657 + }, + { + "epoch": 0.11691169473784375, + "grad_norm": 1.6274930238723755, + "learning_rate": 4.833277487523283e-05, + "loss": 5.6752, + "step": 19658 + }, + { + "epoch": 0.11691764202112476, + "grad_norm": 1.5415219068527222, + "learning_rate": 4.833260715052988e-05, + "loss": 5.4002, + "step": 19659 + }, + { + "epoch": 0.11692358930440575, + "grad_norm": 1.6023998260498047, + "learning_rate": 4.833243941768175e-05, + "loss": 5.2429, + "step": 19660 + }, + { + "epoch": 0.11692953658768675, + "grad_norm": 1.4608384370803833, + "learning_rate": 4.8332271676688515e-05, + "loss": 5.5144, + "step": 19661 + }, + { + "epoch": 0.11693548387096774, + "grad_norm": 1.700076937675476, + "learning_rate": 4.833210392755021e-05, + "loss": 5.6356, + "step": 19662 + }, + { + "epoch": 0.11694143115424874, + "grad_norm": 1.415705919265747, + "learning_rate": 4.833193617026692e-05, + "loss": 5.6977, + "step": 19663 + }, + { + "epoch": 0.11694737843752974, + "grad_norm": 1.620815634727478, + "learning_rate": 4.833176840483868e-05, + "loss": 5.8967, + "step": 19664 + }, + { + "epoch": 0.11695332572081073, + "grad_norm": 1.4221736192703247, + "learning_rate": 4.833160063126558e-05, + "loss": 5.5351, + "step": 19665 + }, + { + "epoch": 0.11695927300409173, + "grad_norm": 1.460254192352295, + "learning_rate": 4.833143284954764e-05, + "loss": 5.327, + "step": 19666 + }, + { + "epoch": 0.11696522028737273, + "grad_norm": 1.8340283632278442, + "learning_rate": 4.833126505968495e-05, + "loss": 5.199, + "step": 19667 + }, + { + "epoch": 0.11697116757065372, + "grad_norm": 1.4036595821380615, + "learning_rate": 4.8331097261677555e-05, + "loss": 5.185, + "step": 19668 + }, + { + "epoch": 0.11697711485393472, + "grad_norm": 1.5454041957855225, + "learning_rate": 4.833092945552551e-05, + "loss": 5.3545, + "step": 19669 + }, + { + "epoch": 0.11698306213721572, + "grad_norm": 1.4965288639068604, + "learning_rate": 4.8330761641228886e-05, + "loss": 5.2993, + "step": 19670 + }, + { + "epoch": 0.11698900942049671, + "grad_norm": 2.4290192127227783, + "learning_rate": 4.833059381878773e-05, + "loss": 5.2738, + "step": 19671 + }, + { + "epoch": 0.11699495670377771, + "grad_norm": 2.502086877822876, + "learning_rate": 4.8330425988202097e-05, + "loss": 5.3218, + "step": 19672 + }, + { + "epoch": 0.11700090398705872, + "grad_norm": 2.1629221439361572, + "learning_rate": 4.833025814947206e-05, + "loss": 5.304, + "step": 19673 + }, + { + "epoch": 0.1170068512703397, + "grad_norm": 2.096604824066162, + "learning_rate": 4.8330090302597675e-05, + "loss": 5.3423, + "step": 19674 + }, + { + "epoch": 0.1170127985536207, + "grad_norm": 2.2843055725097656, + "learning_rate": 4.832992244757899e-05, + "loss": 5.2463, + "step": 19675 + }, + { + "epoch": 0.11701874583690171, + "grad_norm": 2.1538522243499756, + "learning_rate": 4.8329754584416074e-05, + "loss": 5.0529, + "step": 19676 + }, + { + "epoch": 0.1170246931201827, + "grad_norm": 1.763832688331604, + "learning_rate": 4.832958671310898e-05, + "loss": 5.105, + "step": 19677 + }, + { + "epoch": 0.1170306404034637, + "grad_norm": 2.048945426940918, + "learning_rate": 4.832941883365777e-05, + "loss": 5.1724, + "step": 19678 + }, + { + "epoch": 0.1170365876867447, + "grad_norm": 2.324202537536621, + "learning_rate": 4.83292509460625e-05, + "loss": 5.1574, + "step": 19679 + }, + { + "epoch": 0.11704253497002569, + "grad_norm": 2.447587728500366, + "learning_rate": 4.8329083050323235e-05, + "loss": 5.2401, + "step": 19680 + }, + { + "epoch": 0.11704848225330669, + "grad_norm": 2.212921380996704, + "learning_rate": 4.832891514644002e-05, + "loss": 5.1122, + "step": 19681 + }, + { + "epoch": 0.11705442953658769, + "grad_norm": 2.1183717250823975, + "learning_rate": 4.832874723441292e-05, + "loss": 4.985, + "step": 19682 + }, + { + "epoch": 0.11706037681986868, + "grad_norm": 2.1509101390838623, + "learning_rate": 4.8328579314242006e-05, + "loss": 5.1369, + "step": 19683 + }, + { + "epoch": 0.11706632410314968, + "grad_norm": 1.9071851968765259, + "learning_rate": 4.832841138592732e-05, + "loss": 5.0454, + "step": 19684 + }, + { + "epoch": 0.11707227138643068, + "grad_norm": 2.262612819671631, + "learning_rate": 4.8328243449468926e-05, + "loss": 5.0763, + "step": 19685 + }, + { + "epoch": 0.11707821866971167, + "grad_norm": 2.073665142059326, + "learning_rate": 4.8328075504866874e-05, + "loss": 5.0779, + "step": 19686 + }, + { + "epoch": 0.11708416595299267, + "grad_norm": 1.9270633459091187, + "learning_rate": 4.832790755212124e-05, + "loss": 4.8148, + "step": 19687 + }, + { + "epoch": 0.11709011323627368, + "grad_norm": 1.9167968034744263, + "learning_rate": 4.832773959123208e-05, + "loss": 4.8027, + "step": 19688 + }, + { + "epoch": 0.11709606051955466, + "grad_norm": 2.0495805740356445, + "learning_rate": 4.8327571622199444e-05, + "loss": 4.9483, + "step": 19689 + }, + { + "epoch": 0.11710200780283567, + "grad_norm": 2.203997850418091, + "learning_rate": 4.83274036450234e-05, + "loss": 5.1086, + "step": 19690 + }, + { + "epoch": 0.11710795508611666, + "grad_norm": 2.0023131370544434, + "learning_rate": 4.8327235659703984e-05, + "loss": 5.0601, + "step": 19691 + }, + { + "epoch": 0.11711390236939766, + "grad_norm": 2.3212523460388184, + "learning_rate": 4.832706766624128e-05, + "loss": 4.9391, + "step": 19692 + }, + { + "epoch": 0.11711984965267866, + "grad_norm": 2.2633869647979736, + "learning_rate": 4.8326899664635336e-05, + "loss": 5.0262, + "step": 19693 + }, + { + "epoch": 0.11712579693595965, + "grad_norm": 2.2608723640441895, + "learning_rate": 4.832673165488622e-05, + "loss": 4.9814, + "step": 19694 + }, + { + "epoch": 0.11713174421924065, + "grad_norm": 2.0270745754241943, + "learning_rate": 4.8326563636993975e-05, + "loss": 4.9321, + "step": 19695 + }, + { + "epoch": 0.11713769150252165, + "grad_norm": 2.1299290657043457, + "learning_rate": 4.832639561095867e-05, + "loss": 4.8248, + "step": 19696 + }, + { + "epoch": 0.11714363878580264, + "grad_norm": 2.1891887187957764, + "learning_rate": 4.8326227576780355e-05, + "loss": 4.963, + "step": 19697 + }, + { + "epoch": 0.11714958606908364, + "grad_norm": 2.35532546043396, + "learning_rate": 4.8326059534459114e-05, + "loss": 4.8617, + "step": 19698 + }, + { + "epoch": 0.11715553335236464, + "grad_norm": 2.215864658355713, + "learning_rate": 4.8325891483994964e-05, + "loss": 5.1467, + "step": 19699 + }, + { + "epoch": 0.11716148063564563, + "grad_norm": 1.7004871368408203, + "learning_rate": 4.8325723425387996e-05, + "loss": 4.8682, + "step": 19700 + }, + { + "epoch": 0.11716742791892663, + "grad_norm": 2.537426471710205, + "learning_rate": 4.832555535863826e-05, + "loss": 5.0373, + "step": 19701 + }, + { + "epoch": 0.11717337520220764, + "grad_norm": 2.3324837684631348, + "learning_rate": 4.832538728374581e-05, + "loss": 4.9261, + "step": 19702 + }, + { + "epoch": 0.11717932248548862, + "grad_norm": 2.107374906539917, + "learning_rate": 4.832521920071071e-05, + "loss": 5.0036, + "step": 19703 + }, + { + "epoch": 0.11718526976876963, + "grad_norm": 2.0933899879455566, + "learning_rate": 4.8325051109533024e-05, + "loss": 5.086, + "step": 19704 + }, + { + "epoch": 0.11719121705205063, + "grad_norm": 1.9250128269195557, + "learning_rate": 4.8324883010212794e-05, + "loss": 4.9056, + "step": 19705 + }, + { + "epoch": 0.11719716433533162, + "grad_norm": 2.0679538249969482, + "learning_rate": 4.832471490275009e-05, + "loss": 5.0291, + "step": 19706 + }, + { + "epoch": 0.11720311161861262, + "grad_norm": 2.1115055084228516, + "learning_rate": 4.8324546787144974e-05, + "loss": 4.8649, + "step": 19707 + }, + { + "epoch": 0.11720905890189362, + "grad_norm": 2.123899459838867, + "learning_rate": 4.832437866339749e-05, + "loss": 4.9011, + "step": 19708 + }, + { + "epoch": 0.11721500618517461, + "grad_norm": 2.2809536457061768, + "learning_rate": 4.832421053150772e-05, + "loss": 5.1844, + "step": 19709 + }, + { + "epoch": 0.11722095346845561, + "grad_norm": 2.04567551612854, + "learning_rate": 4.83240423914757e-05, + "loss": 4.8685, + "step": 19710 + }, + { + "epoch": 0.11722690075173661, + "grad_norm": 1.5762519836425781, + "learning_rate": 4.8323874243301495e-05, + "loss": 5.4069, + "step": 19711 + }, + { + "epoch": 0.1172328480350176, + "grad_norm": 1.719250202178955, + "learning_rate": 4.832370608698518e-05, + "loss": 5.6127, + "step": 19712 + }, + { + "epoch": 0.1172387953182986, + "grad_norm": 1.6808120012283325, + "learning_rate": 4.8323537922526785e-05, + "loss": 5.5401, + "step": 19713 + }, + { + "epoch": 0.1172447426015796, + "grad_norm": 1.6794480085372925, + "learning_rate": 4.832336974992639e-05, + "loss": 5.6679, + "step": 19714 + }, + { + "epoch": 0.11725068988486059, + "grad_norm": 1.7805535793304443, + "learning_rate": 4.832320156918405e-05, + "loss": 5.5025, + "step": 19715 + }, + { + "epoch": 0.1172566371681416, + "grad_norm": 2.1433472633361816, + "learning_rate": 4.832303338029982e-05, + "loss": 5.2425, + "step": 19716 + }, + { + "epoch": 0.1172625844514226, + "grad_norm": 1.5449565649032593, + "learning_rate": 4.832286518327376e-05, + "loss": 5.3278, + "step": 19717 + }, + { + "epoch": 0.11726853173470358, + "grad_norm": 1.7341786623001099, + "learning_rate": 4.832269697810592e-05, + "loss": 5.3393, + "step": 19718 + }, + { + "epoch": 0.11727447901798459, + "grad_norm": 1.4936028718948364, + "learning_rate": 4.832252876479638e-05, + "loss": 5.0499, + "step": 19719 + }, + { + "epoch": 0.11728042630126558, + "grad_norm": 1.7648371458053589, + "learning_rate": 4.832236054334518e-05, + "loss": 5.3585, + "step": 19720 + }, + { + "epoch": 0.11728637358454658, + "grad_norm": 1.8131940364837646, + "learning_rate": 4.832219231375238e-05, + "loss": 5.2496, + "step": 19721 + }, + { + "epoch": 0.11729232086782758, + "grad_norm": 1.5939579010009766, + "learning_rate": 4.832202407601806e-05, + "loss": 5.2294, + "step": 19722 + }, + { + "epoch": 0.11729826815110857, + "grad_norm": 1.6752222776412964, + "learning_rate": 4.832185583014225e-05, + "loss": 5.2679, + "step": 19723 + }, + { + "epoch": 0.11730421543438957, + "grad_norm": 1.4784640073776245, + "learning_rate": 4.832168757612502e-05, + "loss": 5.1567, + "step": 19724 + }, + { + "epoch": 0.11731016271767057, + "grad_norm": 1.5112851858139038, + "learning_rate": 4.8321519313966436e-05, + "loss": 5.0304, + "step": 19725 + }, + { + "epoch": 0.11731611000095156, + "grad_norm": 1.5895473957061768, + "learning_rate": 4.832135104366654e-05, + "loss": 5.0681, + "step": 19726 + }, + { + "epoch": 0.11732205728423256, + "grad_norm": 1.510641098022461, + "learning_rate": 4.832118276522541e-05, + "loss": 5.0667, + "step": 19727 + }, + { + "epoch": 0.11732800456751356, + "grad_norm": 1.7403017282485962, + "learning_rate": 4.83210144786431e-05, + "loss": 4.9199, + "step": 19728 + }, + { + "epoch": 0.11733395185079455, + "grad_norm": 2.239452600479126, + "learning_rate": 4.832084618391966e-05, + "loss": 5.2846, + "step": 19729 + }, + { + "epoch": 0.11733989913407555, + "grad_norm": 1.977001428604126, + "learning_rate": 4.8320677881055154e-05, + "loss": 4.9573, + "step": 19730 + }, + { + "epoch": 0.11734584641735656, + "grad_norm": 2.2819485664367676, + "learning_rate": 4.8320509570049633e-05, + "loss": 4.6549, + "step": 19731 + }, + { + "epoch": 0.11735179370063754, + "grad_norm": 2.3943941593170166, + "learning_rate": 4.832034125090317e-05, + "loss": 4.8411, + "step": 19732 + }, + { + "epoch": 0.11735774098391855, + "grad_norm": 2.5439767837524414, + "learning_rate": 4.832017292361582e-05, + "loss": 4.7305, + "step": 19733 + }, + { + "epoch": 0.11736368826719955, + "grad_norm": 2.21797251701355, + "learning_rate": 4.8320004588187636e-05, + "loss": 4.8963, + "step": 19734 + }, + { + "epoch": 0.11736963555048054, + "grad_norm": 1.9822254180908203, + "learning_rate": 4.831983624461868e-05, + "loss": 4.8062, + "step": 19735 + }, + { + "epoch": 0.11737558283376154, + "grad_norm": 2.56172513961792, + "learning_rate": 4.8319667892909004e-05, + "loss": 4.6495, + "step": 19736 + }, + { + "epoch": 0.11738153011704254, + "grad_norm": 2.3328988552093506, + "learning_rate": 4.831949953305868e-05, + "loss": 4.3587, + "step": 19737 + }, + { + "epoch": 0.11738747740032353, + "grad_norm": 2.4720728397369385, + "learning_rate": 4.831933116506775e-05, + "loss": 4.5648, + "step": 19738 + }, + { + "epoch": 0.11739342468360453, + "grad_norm": 2.3738696575164795, + "learning_rate": 4.831916278893629e-05, + "loss": 4.391, + "step": 19739 + }, + { + "epoch": 0.11739937196688553, + "grad_norm": 2.400050640106201, + "learning_rate": 4.831899440466435e-05, + "loss": 4.5792, + "step": 19740 + }, + { + "epoch": 0.11740531925016652, + "grad_norm": 1.7596909999847412, + "learning_rate": 4.831882601225199e-05, + "loss": 4.8026, + "step": 19741 + }, + { + "epoch": 0.11741126653344752, + "grad_norm": 2.2190558910369873, + "learning_rate": 4.831865761169927e-05, + "loss": 4.578, + "step": 19742 + }, + { + "epoch": 0.11741721381672852, + "grad_norm": 2.468982458114624, + "learning_rate": 4.831848920300624e-05, + "loss": 4.3132, + "step": 19743 + }, + { + "epoch": 0.11742316110000951, + "grad_norm": 2.1495306491851807, + "learning_rate": 4.831832078617298e-05, + "loss": 4.5307, + "step": 19744 + }, + { + "epoch": 0.11742910838329051, + "grad_norm": 2.2298312187194824, + "learning_rate": 4.831815236119953e-05, + "loss": 4.3435, + "step": 19745 + }, + { + "epoch": 0.11743505566657152, + "grad_norm": 2.0968551635742188, + "learning_rate": 4.831798392808595e-05, + "loss": 4.4348, + "step": 19746 + }, + { + "epoch": 0.1174410029498525, + "grad_norm": 2.2520592212677, + "learning_rate": 4.831781548683231e-05, + "loss": 4.4347, + "step": 19747 + }, + { + "epoch": 0.1174469502331335, + "grad_norm": 2.5319058895111084, + "learning_rate": 4.8317647037438655e-05, + "loss": 4.3817, + "step": 19748 + }, + { + "epoch": 0.1174528975164145, + "grad_norm": 2.186539649963379, + "learning_rate": 4.8317478579905054e-05, + "loss": 4.6415, + "step": 19749 + }, + { + "epoch": 0.1174588447996955, + "grad_norm": 2.472963571548462, + "learning_rate": 4.8317310114231554e-05, + "loss": 4.4495, + "step": 19750 + }, + { + "epoch": 0.1174647920829765, + "grad_norm": 2.3692901134490967, + "learning_rate": 4.831714164041823e-05, + "loss": 4.3571, + "step": 19751 + }, + { + "epoch": 0.11747073936625749, + "grad_norm": 1.8001717329025269, + "learning_rate": 4.831697315846513e-05, + "loss": 5.3843, + "step": 19752 + }, + { + "epoch": 0.11747668664953849, + "grad_norm": 1.6087725162506104, + "learning_rate": 4.8316804668372315e-05, + "loss": 5.7155, + "step": 19753 + }, + { + "epoch": 0.11748263393281949, + "grad_norm": 1.5348961353302002, + "learning_rate": 4.8316636170139845e-05, + "loss": 4.8697, + "step": 19754 + }, + { + "epoch": 0.11748858121610048, + "grad_norm": 1.790076494216919, + "learning_rate": 4.831646766376778e-05, + "loss": 5.708, + "step": 19755 + }, + { + "epoch": 0.11749452849938148, + "grad_norm": 1.8615236282348633, + "learning_rate": 4.831629914925617e-05, + "loss": 5.3669, + "step": 19756 + }, + { + "epoch": 0.11750047578266248, + "grad_norm": 1.5969476699829102, + "learning_rate": 4.8316130626605096e-05, + "loss": 5.4041, + "step": 19757 + }, + { + "epoch": 0.11750642306594347, + "grad_norm": 1.5471712350845337, + "learning_rate": 4.8315962095814584e-05, + "loss": 5.5293, + "step": 19758 + }, + { + "epoch": 0.11751237034922447, + "grad_norm": 1.6281818151474, + "learning_rate": 4.831579355688472e-05, + "loss": 5.51, + "step": 19759 + }, + { + "epoch": 0.11751831763250548, + "grad_norm": 1.5264689922332764, + "learning_rate": 4.831562500981555e-05, + "loss": 4.9906, + "step": 19760 + }, + { + "epoch": 0.11752426491578646, + "grad_norm": 1.8446382284164429, + "learning_rate": 4.8315456454607145e-05, + "loss": 4.8351, + "step": 19761 + }, + { + "epoch": 0.11753021219906747, + "grad_norm": 2.0462918281555176, + "learning_rate": 4.8315287891259545e-05, + "loss": 4.7906, + "step": 19762 + }, + { + "epoch": 0.11753615948234847, + "grad_norm": 1.664975643157959, + "learning_rate": 4.831511931977282e-05, + "loss": 5.4149, + "step": 19763 + }, + { + "epoch": 0.11754210676562946, + "grad_norm": 1.8824998140335083, + "learning_rate": 4.831495074014703e-05, + "loss": 5.2587, + "step": 19764 + }, + { + "epoch": 0.11754805404891046, + "grad_norm": 1.6167455911636353, + "learning_rate": 4.8314782152382235e-05, + "loss": 5.3213, + "step": 19765 + }, + { + "epoch": 0.11755400133219146, + "grad_norm": 1.686562180519104, + "learning_rate": 4.831461355647848e-05, + "loss": 5.3497, + "step": 19766 + }, + { + "epoch": 0.11755994861547245, + "grad_norm": 1.7332249879837036, + "learning_rate": 4.831444495243584e-05, + "loss": 5.3139, + "step": 19767 + }, + { + "epoch": 0.11756589589875345, + "grad_norm": 1.6482213735580444, + "learning_rate": 4.8314276340254375e-05, + "loss": 5.5488, + "step": 19768 + }, + { + "epoch": 0.11757184318203445, + "grad_norm": 1.6714067459106445, + "learning_rate": 4.8314107719934134e-05, + "loss": 4.7354, + "step": 19769 + }, + { + "epoch": 0.11757779046531544, + "grad_norm": 1.5826655626296997, + "learning_rate": 4.8313939091475166e-05, + "loss": 5.5232, + "step": 19770 + }, + { + "epoch": 0.11758373774859644, + "grad_norm": 1.4177565574645996, + "learning_rate": 4.831377045487756e-05, + "loss": 5.4262, + "step": 19771 + }, + { + "epoch": 0.11758968503187744, + "grad_norm": 1.4056715965270996, + "learning_rate": 4.831360181014135e-05, + "loss": 5.6306, + "step": 19772 + }, + { + "epoch": 0.11759563231515843, + "grad_norm": 1.7903814315795898, + "learning_rate": 4.83134331572666e-05, + "loss": 4.5016, + "step": 19773 + }, + { + "epoch": 0.11760157959843943, + "grad_norm": 1.8719782829284668, + "learning_rate": 4.831326449625337e-05, + "loss": 4.3561, + "step": 19774 + }, + { + "epoch": 0.11760752688172044, + "grad_norm": 2.0182130336761475, + "learning_rate": 4.831309582710173e-05, + "loss": 4.3988, + "step": 19775 + }, + { + "epoch": 0.11761347416500142, + "grad_norm": 1.828475832939148, + "learning_rate": 4.8312927149811726e-05, + "loss": 4.4127, + "step": 19776 + }, + { + "epoch": 0.11761942144828243, + "grad_norm": 1.8332375288009644, + "learning_rate": 4.831275846438341e-05, + "loss": 4.3285, + "step": 19777 + }, + { + "epoch": 0.11762536873156341, + "grad_norm": 1.7542626857757568, + "learning_rate": 4.831258977081686e-05, + "loss": 5.4412, + "step": 19778 + }, + { + "epoch": 0.11763131601484442, + "grad_norm": 1.9277591705322266, + "learning_rate": 4.831242106911212e-05, + "loss": 4.1537, + "step": 19779 + }, + { + "epoch": 0.11763726329812542, + "grad_norm": 1.943296194076538, + "learning_rate": 4.8312252359269265e-05, + "loss": 4.448, + "step": 19780 + }, + { + "epoch": 0.11764321058140641, + "grad_norm": 1.8032363653182983, + "learning_rate": 4.831208364128834e-05, + "loss": 4.9847, + "step": 19781 + }, + { + "epoch": 0.11764915786468741, + "grad_norm": 1.9383130073547363, + "learning_rate": 4.83119149151694e-05, + "loss": 4.7231, + "step": 19782 + }, + { + "epoch": 0.11765510514796841, + "grad_norm": 1.8854987621307373, + "learning_rate": 4.831174618091252e-05, + "loss": 4.1493, + "step": 19783 + }, + { + "epoch": 0.1176610524312494, + "grad_norm": 1.932180404663086, + "learning_rate": 4.831157743851775e-05, + "loss": 4.0519, + "step": 19784 + }, + { + "epoch": 0.1176669997145304, + "grad_norm": 1.885292887687683, + "learning_rate": 4.831140868798514e-05, + "loss": 4.1593, + "step": 19785 + }, + { + "epoch": 0.1176729469978114, + "grad_norm": 1.8257746696472168, + "learning_rate": 4.8311239929314764e-05, + "loss": 4.3896, + "step": 19786 + }, + { + "epoch": 0.11767889428109239, + "grad_norm": 1.9383732080459595, + "learning_rate": 4.831107116250667e-05, + "loss": 4.1973, + "step": 19787 + }, + { + "epoch": 0.1176848415643734, + "grad_norm": 1.9942466020584106, + "learning_rate": 4.831090238756093e-05, + "loss": 4.3542, + "step": 19788 + }, + { + "epoch": 0.1176907888476544, + "grad_norm": 1.5551074743270874, + "learning_rate": 4.831073360447759e-05, + "loss": 4.9338, + "step": 19789 + }, + { + "epoch": 0.11769673613093538, + "grad_norm": 1.5898525714874268, + "learning_rate": 4.831056481325672e-05, + "loss": 4.8582, + "step": 19790 + }, + { + "epoch": 0.11770268341421639, + "grad_norm": 1.7175228595733643, + "learning_rate": 4.831039601389836e-05, + "loss": 4.6618, + "step": 19791 + }, + { + "epoch": 0.11770863069749739, + "grad_norm": 2.3165528774261475, + "learning_rate": 4.8310227206402594e-05, + "loss": 4.8579, + "step": 19792 + }, + { + "epoch": 0.11771457798077838, + "grad_norm": 1.4406440258026123, + "learning_rate": 4.8310058390769464e-05, + "loss": 5.6443, + "step": 19793 + }, + { + "epoch": 0.11772052526405938, + "grad_norm": 1.6670812368392944, + "learning_rate": 4.8309889566999037e-05, + "loss": 5.2096, + "step": 19794 + }, + { + "epoch": 0.11772647254734038, + "grad_norm": 1.6150201559066772, + "learning_rate": 4.8309720735091354e-05, + "loss": 5.2055, + "step": 19795 + }, + { + "epoch": 0.11773241983062137, + "grad_norm": 1.7714163064956665, + "learning_rate": 4.83095518950465e-05, + "loss": 5.9145, + "step": 19796 + }, + { + "epoch": 0.11773836711390237, + "grad_norm": 1.3608043193817139, + "learning_rate": 4.8309383046864526e-05, + "loss": 5.1546, + "step": 19797 + }, + { + "epoch": 0.11774431439718337, + "grad_norm": 1.2962807416915894, + "learning_rate": 4.830921419054548e-05, + "loss": 5.3574, + "step": 19798 + }, + { + "epoch": 0.11775026168046436, + "grad_norm": 2.0007364749908447, + "learning_rate": 4.8309045326089434e-05, + "loss": 5.0939, + "step": 19799 + }, + { + "epoch": 0.11775620896374536, + "grad_norm": 1.6526695489883423, + "learning_rate": 4.830887645349644e-05, + "loss": 5.7498, + "step": 19800 + }, + { + "epoch": 0.11776215624702636, + "grad_norm": 1.4990460872650146, + "learning_rate": 4.830870757276655e-05, + "loss": 5.2728, + "step": 19801 + }, + { + "epoch": 0.11776810353030735, + "grad_norm": 2.182511806488037, + "learning_rate": 4.830853868389984e-05, + "loss": 5.1598, + "step": 19802 + }, + { + "epoch": 0.11777405081358835, + "grad_norm": 2.515284538269043, + "learning_rate": 4.8308369786896354e-05, + "loss": 5.1378, + "step": 19803 + }, + { + "epoch": 0.11777999809686936, + "grad_norm": 1.9783490896224976, + "learning_rate": 4.830820088175616e-05, + "loss": 4.9242, + "step": 19804 + }, + { + "epoch": 0.11778594538015034, + "grad_norm": 1.790901780128479, + "learning_rate": 4.8308031968479315e-05, + "loss": 5.1156, + "step": 19805 + }, + { + "epoch": 0.11779189266343135, + "grad_norm": 1.751846432685852, + "learning_rate": 4.830786304706587e-05, + "loss": 5.2306, + "step": 19806 + }, + { + "epoch": 0.11779783994671233, + "grad_norm": 1.588497519493103, + "learning_rate": 4.83076941175159e-05, + "loss": 5.3987, + "step": 19807 + }, + { + "epoch": 0.11780378722999334, + "grad_norm": 1.9150582551956177, + "learning_rate": 4.830752517982945e-05, + "loss": 4.977, + "step": 19808 + }, + { + "epoch": 0.11780973451327434, + "grad_norm": 1.706708312034607, + "learning_rate": 4.8307356234006584e-05, + "loss": 5.0455, + "step": 19809 + }, + { + "epoch": 0.11781568179655533, + "grad_norm": 1.9373780488967896, + "learning_rate": 4.830718728004736e-05, + "loss": 5.0547, + "step": 19810 + }, + { + "epoch": 0.11782162907983633, + "grad_norm": 1.6948046684265137, + "learning_rate": 4.830701831795184e-05, + "loss": 5.0943, + "step": 19811 + }, + { + "epoch": 0.11782757636311733, + "grad_norm": 1.630083680152893, + "learning_rate": 4.8306849347720087e-05, + "loss": 5.6369, + "step": 19812 + }, + { + "epoch": 0.11783352364639832, + "grad_norm": 1.4906461238861084, + "learning_rate": 4.830668036935214e-05, + "loss": 5.2921, + "step": 19813 + }, + { + "epoch": 0.11783947092967932, + "grad_norm": 1.6434717178344727, + "learning_rate": 4.8306511382848076e-05, + "loss": 5.3473, + "step": 19814 + }, + { + "epoch": 0.11784541821296032, + "grad_norm": 1.5606834888458252, + "learning_rate": 4.8306342388207956e-05, + "loss": 5.3031, + "step": 19815 + }, + { + "epoch": 0.11785136549624131, + "grad_norm": 2.157352924346924, + "learning_rate": 4.830617338543183e-05, + "loss": 4.4939, + "step": 19816 + }, + { + "epoch": 0.11785731277952231, + "grad_norm": 2.49686598777771, + "learning_rate": 4.830600437451975e-05, + "loss": 4.506, + "step": 19817 + }, + { + "epoch": 0.11786326006280332, + "grad_norm": 1.943969964981079, + "learning_rate": 4.830583535547179e-05, + "loss": 4.411, + "step": 19818 + }, + { + "epoch": 0.1178692073460843, + "grad_norm": 1.9092329740524292, + "learning_rate": 4.830566632828801e-05, + "loss": 4.4121, + "step": 19819 + }, + { + "epoch": 0.1178751546293653, + "grad_norm": 1.7568551301956177, + "learning_rate": 4.830549729296846e-05, + "loss": 4.317, + "step": 19820 + }, + { + "epoch": 0.11788110191264631, + "grad_norm": 1.788150429725647, + "learning_rate": 4.83053282495132e-05, + "loss": 4.2928, + "step": 19821 + }, + { + "epoch": 0.1178870491959273, + "grad_norm": 1.9792863130569458, + "learning_rate": 4.830515919792229e-05, + "loss": 4.3219, + "step": 19822 + }, + { + "epoch": 0.1178929964792083, + "grad_norm": 2.2407681941986084, + "learning_rate": 4.8304990138195795e-05, + "loss": 4.296, + "step": 19823 + }, + { + "epoch": 0.1178989437624893, + "grad_norm": 1.993288516998291, + "learning_rate": 4.830482107033377e-05, + "loss": 4.2922, + "step": 19824 + }, + { + "epoch": 0.11790489104577029, + "grad_norm": 2.1966097354888916, + "learning_rate": 4.8304651994336264e-05, + "loss": 4.1215, + "step": 19825 + }, + { + "epoch": 0.11791083832905129, + "grad_norm": 1.569989562034607, + "learning_rate": 4.8304482910203345e-05, + "loss": 5.5432, + "step": 19826 + }, + { + "epoch": 0.11791678561233229, + "grad_norm": 1.522828459739685, + "learning_rate": 4.8304313817935075e-05, + "loss": 5.465, + "step": 19827 + }, + { + "epoch": 0.11792273289561328, + "grad_norm": 1.9455969333648682, + "learning_rate": 4.830414471753151e-05, + "loss": 5.1462, + "step": 19828 + }, + { + "epoch": 0.11792868017889428, + "grad_norm": 1.8587162494659424, + "learning_rate": 4.830397560899271e-05, + "loss": 5.1987, + "step": 19829 + }, + { + "epoch": 0.11793462746217528, + "grad_norm": 2.1671674251556396, + "learning_rate": 4.830380649231873e-05, + "loss": 5.3333, + "step": 19830 + }, + { + "epoch": 0.11794057474545627, + "grad_norm": 1.8267066478729248, + "learning_rate": 4.8303637367509636e-05, + "loss": 5.5306, + "step": 19831 + }, + { + "epoch": 0.11794652202873727, + "grad_norm": 1.80419921875, + "learning_rate": 4.830346823456548e-05, + "loss": 5.3077, + "step": 19832 + }, + { + "epoch": 0.11795246931201828, + "grad_norm": 1.9116721153259277, + "learning_rate": 4.830329909348632e-05, + "loss": 4.8531, + "step": 19833 + }, + { + "epoch": 0.11795841659529926, + "grad_norm": 1.9208347797393799, + "learning_rate": 4.830312994427223e-05, + "loss": 4.9645, + "step": 19834 + }, + { + "epoch": 0.11796436387858027, + "grad_norm": 1.8385374546051025, + "learning_rate": 4.8302960786923246e-05, + "loss": 4.7095, + "step": 19835 + }, + { + "epoch": 0.11797031116186125, + "grad_norm": 1.9271587133407593, + "learning_rate": 4.830279162143945e-05, + "loss": 4.5788, + "step": 19836 + }, + { + "epoch": 0.11797625844514226, + "grad_norm": 2.0168333053588867, + "learning_rate": 4.8302622447820885e-05, + "loss": 4.7595, + "step": 19837 + }, + { + "epoch": 0.11798220572842326, + "grad_norm": 1.9674837589263916, + "learning_rate": 4.8302453266067616e-05, + "loss": 4.674, + "step": 19838 + }, + { + "epoch": 0.11798815301170425, + "grad_norm": 1.944601058959961, + "learning_rate": 4.830228407617969e-05, + "loss": 4.6683, + "step": 19839 + }, + { + "epoch": 0.11799410029498525, + "grad_norm": 1.8970340490341187, + "learning_rate": 4.83021148781572e-05, + "loss": 5.2577, + "step": 19840 + }, + { + "epoch": 0.11800004757826625, + "grad_norm": 2.035505533218384, + "learning_rate": 4.8301945672000164e-05, + "loss": 4.7872, + "step": 19841 + }, + { + "epoch": 0.11800599486154724, + "grad_norm": 2.4211058616638184, + "learning_rate": 4.830177645770867e-05, + "loss": 4.9424, + "step": 19842 + }, + { + "epoch": 0.11801194214482824, + "grad_norm": 2.080132484436035, + "learning_rate": 4.830160723528276e-05, + "loss": 4.7908, + "step": 19843 + }, + { + "epoch": 0.11801788942810924, + "grad_norm": 3.5975728034973145, + "learning_rate": 4.83014380047225e-05, + "loss": 5.3434, + "step": 19844 + }, + { + "epoch": 0.11802383671139023, + "grad_norm": 1.6917449235916138, + "learning_rate": 4.830126876602795e-05, + "loss": 5.2593, + "step": 19845 + }, + { + "epoch": 0.11802978399467123, + "grad_norm": 1.8179433345794678, + "learning_rate": 4.8301099519199173e-05, + "loss": 5.9407, + "step": 19846 + }, + { + "epoch": 0.11803573127795224, + "grad_norm": 1.652653694152832, + "learning_rate": 4.8300930264236216e-05, + "loss": 5.505, + "step": 19847 + }, + { + "epoch": 0.11804167856123322, + "grad_norm": 1.6400798559188843, + "learning_rate": 4.830076100113915e-05, + "loss": 5.7281, + "step": 19848 + }, + { + "epoch": 0.11804762584451423, + "grad_norm": 1.865049123764038, + "learning_rate": 4.830059172990802e-05, + "loss": 5.4562, + "step": 19849 + }, + { + "epoch": 0.11805357312779523, + "grad_norm": 1.68345308303833, + "learning_rate": 4.8300422450542906e-05, + "loss": 5.3027, + "step": 19850 + }, + { + "epoch": 0.11805952041107622, + "grad_norm": 2.1790804862976074, + "learning_rate": 4.8300253163043855e-05, + "loss": 4.5531, + "step": 19851 + }, + { + "epoch": 0.11806546769435722, + "grad_norm": 2.63421368598938, + "learning_rate": 4.8300083867410915e-05, + "loss": 4.0978, + "step": 19852 + }, + { + "epoch": 0.11807141497763822, + "grad_norm": 1.8692448139190674, + "learning_rate": 4.829991456364417e-05, + "loss": 5.5482, + "step": 19853 + }, + { + "epoch": 0.11807736226091921, + "grad_norm": 1.684128761291504, + "learning_rate": 4.829974525174365e-05, + "loss": 5.5612, + "step": 19854 + }, + { + "epoch": 0.11808330954420021, + "grad_norm": 1.5720278024673462, + "learning_rate": 4.829957593170944e-05, + "loss": 5.6787, + "step": 19855 + }, + { + "epoch": 0.11808925682748121, + "grad_norm": 1.834423303604126, + "learning_rate": 4.829940660354159e-05, + "loss": 4.5591, + "step": 19856 + }, + { + "epoch": 0.1180952041107622, + "grad_norm": 1.7370680570602417, + "learning_rate": 4.829923726724015e-05, + "loss": 5.1643, + "step": 19857 + }, + { + "epoch": 0.1181011513940432, + "grad_norm": 2.1546318531036377, + "learning_rate": 4.829906792280519e-05, + "loss": 4.5788, + "step": 19858 + }, + { + "epoch": 0.1181070986773242, + "grad_norm": 2.5604169368743896, + "learning_rate": 4.829889857023677e-05, + "loss": 3.1948, + "step": 19859 + }, + { + "epoch": 0.11811304596060519, + "grad_norm": 2.072169780731201, + "learning_rate": 4.829872920953494e-05, + "loss": 3.9707, + "step": 19860 + }, + { + "epoch": 0.1181189932438862, + "grad_norm": 1.7981303930282593, + "learning_rate": 4.829855984069976e-05, + "loss": 5.8413, + "step": 19861 + }, + { + "epoch": 0.1181249405271672, + "grad_norm": 1.621327519416809, + "learning_rate": 4.8298390463731305e-05, + "loss": 5.4867, + "step": 19862 + }, + { + "epoch": 0.11813088781044818, + "grad_norm": 1.5245294570922852, + "learning_rate": 4.829822107862962e-05, + "loss": 5.7148, + "step": 19863 + }, + { + "epoch": 0.11813683509372919, + "grad_norm": 2.2656896114349365, + "learning_rate": 4.8298051685394765e-05, + "loss": 5.6678, + "step": 19864 + }, + { + "epoch": 0.11814278237701017, + "grad_norm": 1.8529094457626343, + "learning_rate": 4.8297882284026805e-05, + "loss": 5.4445, + "step": 19865 + }, + { + "epoch": 0.11814872966029118, + "grad_norm": 1.5151565074920654, + "learning_rate": 4.829771287452579e-05, + "loss": 5.2794, + "step": 19866 + }, + { + "epoch": 0.11815467694357218, + "grad_norm": 1.8492248058319092, + "learning_rate": 4.829754345689178e-05, + "loss": 5.0797, + "step": 19867 + }, + { + "epoch": 0.11816062422685317, + "grad_norm": 2.7612802982330322, + "learning_rate": 4.829737403112484e-05, + "loss": 5.1486, + "step": 19868 + }, + { + "epoch": 0.11816657151013417, + "grad_norm": 1.9457459449768066, + "learning_rate": 4.8297204597225035e-05, + "loss": 5.6507, + "step": 19869 + }, + { + "epoch": 0.11817251879341517, + "grad_norm": 1.6429107189178467, + "learning_rate": 4.829703515519242e-05, + "loss": 5.8414, + "step": 19870 + }, + { + "epoch": 0.11817846607669616, + "grad_norm": 1.556187391281128, + "learning_rate": 4.829686570502704e-05, + "loss": 5.9028, + "step": 19871 + }, + { + "epoch": 0.11818441335997716, + "grad_norm": 1.451532006263733, + "learning_rate": 4.8296696246728965e-05, + "loss": 5.8497, + "step": 19872 + }, + { + "epoch": 0.11819036064325816, + "grad_norm": 1.7325583696365356, + "learning_rate": 4.8296526780298256e-05, + "loss": 5.3531, + "step": 19873 + }, + { + "epoch": 0.11819630792653915, + "grad_norm": 1.784332275390625, + "learning_rate": 4.829635730573497e-05, + "loss": 5.6025, + "step": 19874 + }, + { + "epoch": 0.11820225520982015, + "grad_norm": 1.6109933853149414, + "learning_rate": 4.829618782303917e-05, + "loss": 5.5626, + "step": 19875 + }, + { + "epoch": 0.11820820249310116, + "grad_norm": 1.6639639139175415, + "learning_rate": 4.8296018332210905e-05, + "loss": 5.5679, + "step": 19876 + }, + { + "epoch": 0.11821414977638214, + "grad_norm": 1.8205533027648926, + "learning_rate": 4.829584883325025e-05, + "loss": 5.448, + "step": 19877 + }, + { + "epoch": 0.11822009705966315, + "grad_norm": 1.6450576782226562, + "learning_rate": 4.829567932615725e-05, + "loss": 5.5966, + "step": 19878 + }, + { + "epoch": 0.11822604434294415, + "grad_norm": 1.456151008605957, + "learning_rate": 4.829550981093196e-05, + "loss": 5.5194, + "step": 19879 + }, + { + "epoch": 0.11823199162622514, + "grad_norm": 1.6064491271972656, + "learning_rate": 4.829534028757446e-05, + "loss": 5.6929, + "step": 19880 + }, + { + "epoch": 0.11823793890950614, + "grad_norm": 1.438132405281067, + "learning_rate": 4.829517075608479e-05, + "loss": 5.6738, + "step": 19881 + }, + { + "epoch": 0.11824388619278714, + "grad_norm": 2.503048896789551, + "learning_rate": 4.8295001216463024e-05, + "loss": 4.9929, + "step": 19882 + }, + { + "epoch": 0.11824983347606813, + "grad_norm": 2.3379812240600586, + "learning_rate": 4.829483166870921e-05, + "loss": 4.7947, + "step": 19883 + }, + { + "epoch": 0.11825578075934913, + "grad_norm": 2.055328130722046, + "learning_rate": 4.829466211282341e-05, + "loss": 5.3265, + "step": 19884 + }, + { + "epoch": 0.11826172804263013, + "grad_norm": 1.7393126487731934, + "learning_rate": 4.829449254880569e-05, + "loss": 5.0483, + "step": 19885 + }, + { + "epoch": 0.11826767532591112, + "grad_norm": 2.3054347038269043, + "learning_rate": 4.829432297665609e-05, + "loss": 4.9002, + "step": 19886 + }, + { + "epoch": 0.11827362260919212, + "grad_norm": 2.434323310852051, + "learning_rate": 4.82941533963747e-05, + "loss": 4.8013, + "step": 19887 + }, + { + "epoch": 0.11827956989247312, + "grad_norm": 2.0834875106811523, + "learning_rate": 4.829398380796155e-05, + "loss": 4.786, + "step": 19888 + }, + { + "epoch": 0.11828551717575411, + "grad_norm": 1.6682358980178833, + "learning_rate": 4.829381421141671e-05, + "loss": 5.6843, + "step": 19889 + }, + { + "epoch": 0.11829146445903511, + "grad_norm": 1.8787375688552856, + "learning_rate": 4.829364460674025e-05, + "loss": 5.5191, + "step": 19890 + }, + { + "epoch": 0.11829741174231612, + "grad_norm": 1.7496438026428223, + "learning_rate": 4.829347499393221e-05, + "loss": 5.6968, + "step": 19891 + }, + { + "epoch": 0.1183033590255971, + "grad_norm": 1.5585973262786865, + "learning_rate": 4.829330537299266e-05, + "loss": 5.5588, + "step": 19892 + }, + { + "epoch": 0.1183093063088781, + "grad_norm": 1.8294848203659058, + "learning_rate": 4.8293135743921664e-05, + "loss": 5.2407, + "step": 19893 + }, + { + "epoch": 0.11831525359215911, + "grad_norm": 1.4877654314041138, + "learning_rate": 4.829296610671927e-05, + "loss": 5.5383, + "step": 19894 + }, + { + "epoch": 0.1183212008754401, + "grad_norm": 1.5250638723373413, + "learning_rate": 4.829279646138554e-05, + "loss": 5.6443, + "step": 19895 + }, + { + "epoch": 0.1183271481587211, + "grad_norm": 1.5662062168121338, + "learning_rate": 4.829262680792054e-05, + "loss": 5.5409, + "step": 19896 + }, + { + "epoch": 0.11833309544200209, + "grad_norm": 1.1783791780471802, + "learning_rate": 4.829245714632432e-05, + "loss": 5.6169, + "step": 19897 + }, + { + "epoch": 0.11833904272528309, + "grad_norm": 1.4960299730300903, + "learning_rate": 4.829228747659695e-05, + "loss": 5.7195, + "step": 19898 + }, + { + "epoch": 0.11834499000856409, + "grad_norm": 1.437047004699707, + "learning_rate": 4.829211779873848e-05, + "loss": 5.7229, + "step": 19899 + }, + { + "epoch": 0.11835093729184508, + "grad_norm": 1.4095619916915894, + "learning_rate": 4.829194811274897e-05, + "loss": 5.7227, + "step": 19900 + }, + { + "epoch": 0.11835688457512608, + "grad_norm": 1.5694538354873657, + "learning_rate": 4.829177841862849e-05, + "loss": 5.356, + "step": 19901 + }, + { + "epoch": 0.11836283185840708, + "grad_norm": 1.7124476432800293, + "learning_rate": 4.829160871637708e-05, + "loss": 4.9185, + "step": 19902 + }, + { + "epoch": 0.11836877914168807, + "grad_norm": 2.2423064708709717, + "learning_rate": 4.829143900599481e-05, + "loss": 5.4345, + "step": 19903 + }, + { + "epoch": 0.11837472642496907, + "grad_norm": 1.8333791494369507, + "learning_rate": 4.829126928748175e-05, + "loss": 5.3666, + "step": 19904 + }, + { + "epoch": 0.11838067370825008, + "grad_norm": 1.5184969902038574, + "learning_rate": 4.8291099560837936e-05, + "loss": 5.4372, + "step": 19905 + }, + { + "epoch": 0.11838662099153106, + "grad_norm": 1.628544807434082, + "learning_rate": 4.829092982606345e-05, + "loss": 5.2682, + "step": 19906 + }, + { + "epoch": 0.11839256827481207, + "grad_norm": 1.5791584253311157, + "learning_rate": 4.829076008315834e-05, + "loss": 5.2149, + "step": 19907 + }, + { + "epoch": 0.11839851555809307, + "grad_norm": 1.299560546875, + "learning_rate": 4.8290590332122656e-05, + "loss": 5.1735, + "step": 19908 + }, + { + "epoch": 0.11840446284137406, + "grad_norm": 1.343913197517395, + "learning_rate": 4.829042057295647e-05, + "loss": 5.2344, + "step": 19909 + }, + { + "epoch": 0.11841041012465506, + "grad_norm": 1.2621396780014038, + "learning_rate": 4.829025080565985e-05, + "loss": 5.2982, + "step": 19910 + }, + { + "epoch": 0.11841635740793606, + "grad_norm": 1.2189174890518188, + "learning_rate": 4.829008103023284e-05, + "loss": 5.3347, + "step": 19911 + }, + { + "epoch": 0.11842230469121705, + "grad_norm": 1.2917883396148682, + "learning_rate": 4.82899112466755e-05, + "loss": 5.0745, + "step": 19912 + }, + { + "epoch": 0.11842825197449805, + "grad_norm": 1.2382320165634155, + "learning_rate": 4.828974145498789e-05, + "loss": 5.1999, + "step": 19913 + }, + { + "epoch": 0.11843419925777905, + "grad_norm": 1.398218035697937, + "learning_rate": 4.828957165517007e-05, + "loss": 5.4944, + "step": 19914 + }, + { + "epoch": 0.11844014654106004, + "grad_norm": 1.448901653289795, + "learning_rate": 4.8289401847222115e-05, + "loss": 5.4645, + "step": 19915 + }, + { + "epoch": 0.11844609382434104, + "grad_norm": 1.4628182649612427, + "learning_rate": 4.828923203114406e-05, + "loss": 5.003, + "step": 19916 + }, + { + "epoch": 0.11845204110762204, + "grad_norm": 1.3390740156173706, + "learning_rate": 4.828906220693598e-05, + "loss": 5.3482, + "step": 19917 + }, + { + "epoch": 0.11845798839090303, + "grad_norm": 1.539097547531128, + "learning_rate": 4.8288892374597925e-05, + "loss": 5.304, + "step": 19918 + }, + { + "epoch": 0.11846393567418403, + "grad_norm": 1.4011404514312744, + "learning_rate": 4.828872253412996e-05, + "loss": 5.2073, + "step": 19919 + }, + { + "epoch": 0.11846988295746504, + "grad_norm": 1.4064414501190186, + "learning_rate": 4.828855268553214e-05, + "loss": 5.2316, + "step": 19920 + }, + { + "epoch": 0.11847583024074602, + "grad_norm": 1.5808193683624268, + "learning_rate": 4.828838282880452e-05, + "loss": 5.211, + "step": 19921 + }, + { + "epoch": 0.11848177752402703, + "grad_norm": 1.5043809413909912, + "learning_rate": 4.828821296394718e-05, + "loss": 5.0564, + "step": 19922 + }, + { + "epoch": 0.11848772480730803, + "grad_norm": 1.2494529485702515, + "learning_rate": 4.828804309096016e-05, + "loss": 5.1523, + "step": 19923 + }, + { + "epoch": 0.11849367209058902, + "grad_norm": 1.4186055660247803, + "learning_rate": 4.8287873209843524e-05, + "loss": 4.9103, + "step": 19924 + }, + { + "epoch": 0.11849961937387002, + "grad_norm": 1.6093229055404663, + "learning_rate": 4.828770332059733e-05, + "loss": 4.9215, + "step": 19925 + }, + { + "epoch": 0.118505566657151, + "grad_norm": 1.5125865936279297, + "learning_rate": 4.8287533423221643e-05, + "loss": 5.0515, + "step": 19926 + }, + { + "epoch": 0.11851151394043201, + "grad_norm": 1.5410135984420776, + "learning_rate": 4.828736351771652e-05, + "loss": 4.9576, + "step": 19927 + }, + { + "epoch": 0.11851746122371301, + "grad_norm": 1.5431303977966309, + "learning_rate": 4.828719360408201e-05, + "loss": 5.1606, + "step": 19928 + }, + { + "epoch": 0.118523408506994, + "grad_norm": 1.4709242582321167, + "learning_rate": 4.828702368231819e-05, + "loss": 4.7685, + "step": 19929 + }, + { + "epoch": 0.118529355790275, + "grad_norm": 1.173568606376648, + "learning_rate": 4.828685375242511e-05, + "loss": 4.7591, + "step": 19930 + }, + { + "epoch": 0.118535303073556, + "grad_norm": 1.3113515377044678, + "learning_rate": 4.828668381440283e-05, + "loss": 4.786, + "step": 19931 + }, + { + "epoch": 0.11854125035683699, + "grad_norm": 1.4658124446868896, + "learning_rate": 4.828651386825141e-05, + "loss": 4.7776, + "step": 19932 + }, + { + "epoch": 0.118547197640118, + "grad_norm": 1.3406554460525513, + "learning_rate": 4.828634391397091e-05, + "loss": 5.0733, + "step": 19933 + }, + { + "epoch": 0.118553144923399, + "grad_norm": 1.2102482318878174, + "learning_rate": 4.828617395156138e-05, + "loss": 5.0069, + "step": 19934 + }, + { + "epoch": 0.11855909220667998, + "grad_norm": 0.989989697933197, + "learning_rate": 4.828600398102289e-05, + "loss": 4.759, + "step": 19935 + }, + { + "epoch": 0.11856503948996099, + "grad_norm": 1.2296501398086548, + "learning_rate": 4.82858340023555e-05, + "loss": 4.6269, + "step": 19936 + }, + { + "epoch": 0.11857098677324199, + "grad_norm": 1.5649582147598267, + "learning_rate": 4.828566401555926e-05, + "loss": 5.0196, + "step": 19937 + }, + { + "epoch": 0.11857693405652298, + "grad_norm": 1.2393609285354614, + "learning_rate": 4.8285494020634245e-05, + "loss": 5.059, + "step": 19938 + }, + { + "epoch": 0.11858288133980398, + "grad_norm": 1.450697422027588, + "learning_rate": 4.82853240175805e-05, + "loss": 5.1143, + "step": 19939 + }, + { + "epoch": 0.11858882862308498, + "grad_norm": 1.4795258045196533, + "learning_rate": 4.8285154006398084e-05, + "loss": 5.075, + "step": 19940 + }, + { + "epoch": 0.11859477590636597, + "grad_norm": 1.5858484506607056, + "learning_rate": 4.828498398708707e-05, + "loss": 5.0665, + "step": 19941 + }, + { + "epoch": 0.11860072318964697, + "grad_norm": 1.3411937952041626, + "learning_rate": 4.82848139596475e-05, + "loss": 4.9864, + "step": 19942 + }, + { + "epoch": 0.11860667047292797, + "grad_norm": 1.4348468780517578, + "learning_rate": 4.828464392407945e-05, + "loss": 4.904, + "step": 19943 + }, + { + "epoch": 0.11861261775620896, + "grad_norm": 1.4753068685531616, + "learning_rate": 4.8284473880382967e-05, + "loss": 5.0784, + "step": 19944 + }, + { + "epoch": 0.11861856503948996, + "grad_norm": 1.379059076309204, + "learning_rate": 4.828430382855811e-05, + "loss": 4.9782, + "step": 19945 + }, + { + "epoch": 0.11862451232277096, + "grad_norm": 1.444729208946228, + "learning_rate": 4.828413376860495e-05, + "loss": 5.5804, + "step": 19946 + }, + { + "epoch": 0.11863045960605195, + "grad_norm": 1.3467416763305664, + "learning_rate": 4.8283963700523535e-05, + "loss": 5.3278, + "step": 19947 + }, + { + "epoch": 0.11863640688933295, + "grad_norm": 1.5206544399261475, + "learning_rate": 4.8283793624313936e-05, + "loss": 5.01, + "step": 19948 + }, + { + "epoch": 0.11864235417261396, + "grad_norm": 1.394729733467102, + "learning_rate": 4.8283623539976195e-05, + "loss": 5.2139, + "step": 19949 + }, + { + "epoch": 0.11864830145589494, + "grad_norm": 1.3675029277801514, + "learning_rate": 4.8283453447510394e-05, + "loss": 5.4559, + "step": 19950 + }, + { + "epoch": 0.11865424873917595, + "grad_norm": 1.1950232982635498, + "learning_rate": 4.828328334691657e-05, + "loss": 5.2233, + "step": 19951 + }, + { + "epoch": 0.11866019602245695, + "grad_norm": 1.3517179489135742, + "learning_rate": 4.82831132381948e-05, + "loss": 5.0519, + "step": 19952 + }, + { + "epoch": 0.11866614330573794, + "grad_norm": 1.4184643030166626, + "learning_rate": 4.828294312134512e-05, + "loss": 4.8722, + "step": 19953 + }, + { + "epoch": 0.11867209058901894, + "grad_norm": 1.4558582305908203, + "learning_rate": 4.828277299636762e-05, + "loss": 5.3876, + "step": 19954 + }, + { + "epoch": 0.11867803787229993, + "grad_norm": 1.4617977142333984, + "learning_rate": 4.8282602863262345e-05, + "loss": 5.4784, + "step": 19955 + }, + { + "epoch": 0.11868398515558093, + "grad_norm": 1.4997669458389282, + "learning_rate": 4.828243272202935e-05, + "loss": 5.2556, + "step": 19956 + }, + { + "epoch": 0.11868993243886193, + "grad_norm": 1.2730913162231445, + "learning_rate": 4.8282262572668696e-05, + "loss": 5.3194, + "step": 19957 + }, + { + "epoch": 0.11869587972214292, + "grad_norm": 1.4149047136306763, + "learning_rate": 4.8282092415180444e-05, + "loss": 5.5139, + "step": 19958 + }, + { + "epoch": 0.11870182700542392, + "grad_norm": 1.2510145902633667, + "learning_rate": 4.828192224956466e-05, + "loss": 5.2486, + "step": 19959 + }, + { + "epoch": 0.11870777428870492, + "grad_norm": 1.2229409217834473, + "learning_rate": 4.828175207582139e-05, + "loss": 5.2391, + "step": 19960 + }, + { + "epoch": 0.11871372157198591, + "grad_norm": 1.3316899538040161, + "learning_rate": 4.828158189395071e-05, + "loss": 5.2928, + "step": 19961 + }, + { + "epoch": 0.11871966885526691, + "grad_norm": 1.4331640005111694, + "learning_rate": 4.828141170395266e-05, + "loss": 5.3311, + "step": 19962 + }, + { + "epoch": 0.11872561613854792, + "grad_norm": 1.3313428163528442, + "learning_rate": 4.828124150582732e-05, + "loss": 5.2203, + "step": 19963 + }, + { + "epoch": 0.1187315634218289, + "grad_norm": 1.6505075693130493, + "learning_rate": 4.828107129957473e-05, + "loss": 4.8604, + "step": 19964 + }, + { + "epoch": 0.1187375107051099, + "grad_norm": 1.3544394969940186, + "learning_rate": 4.828090108519496e-05, + "loss": 5.17, + "step": 19965 + }, + { + "epoch": 0.11874345798839091, + "grad_norm": 1.3194384574890137, + "learning_rate": 4.828073086268808e-05, + "loss": 5.2197, + "step": 19966 + }, + { + "epoch": 0.1187494052716719, + "grad_norm": 1.4014582633972168, + "learning_rate": 4.8280560632054126e-05, + "loss": 5.2865, + "step": 19967 + }, + { + "epoch": 0.1187553525549529, + "grad_norm": 1.5148218870162964, + "learning_rate": 4.828039039329317e-05, + "loss": 5.3765, + "step": 19968 + }, + { + "epoch": 0.1187612998382339, + "grad_norm": 1.3657969236373901, + "learning_rate": 4.828022014640527e-05, + "loss": 4.9787, + "step": 19969 + }, + { + "epoch": 0.11876724712151489, + "grad_norm": 1.547717571258545, + "learning_rate": 4.828004989139049e-05, + "loss": 5.0538, + "step": 19970 + }, + { + "epoch": 0.11877319440479589, + "grad_norm": 1.5132863521575928, + "learning_rate": 4.827987962824888e-05, + "loss": 5.0301, + "step": 19971 + }, + { + "epoch": 0.11877914168807689, + "grad_norm": 1.4020887613296509, + "learning_rate": 4.827970935698051e-05, + "loss": 4.9646, + "step": 19972 + }, + { + "epoch": 0.11878508897135788, + "grad_norm": 1.4983519315719604, + "learning_rate": 4.8279539077585424e-05, + "loss": 5.2266, + "step": 19973 + }, + { + "epoch": 0.11879103625463888, + "grad_norm": 1.3545745611190796, + "learning_rate": 4.82793687900637e-05, + "loss": 5.108, + "step": 19974 + }, + { + "epoch": 0.11879698353791988, + "grad_norm": 1.4865717887878418, + "learning_rate": 4.827919849441539e-05, + "loss": 5.257, + "step": 19975 + }, + { + "epoch": 0.11880293082120087, + "grad_norm": 1.4389182329177856, + "learning_rate": 4.8279028190640546e-05, + "loss": 4.976, + "step": 19976 + }, + { + "epoch": 0.11880887810448187, + "grad_norm": 1.2823866605758667, + "learning_rate": 4.827885787873924e-05, + "loss": 4.7617, + "step": 19977 + }, + { + "epoch": 0.11881482538776288, + "grad_norm": 1.369992971420288, + "learning_rate": 4.8278687558711525e-05, + "loss": 4.7165, + "step": 19978 + }, + { + "epoch": 0.11882077267104386, + "grad_norm": 1.2873594760894775, + "learning_rate": 4.827851723055745e-05, + "loss": 4.6705, + "step": 19979 + }, + { + "epoch": 0.11882671995432487, + "grad_norm": 1.3779295682907104, + "learning_rate": 4.827834689427709e-05, + "loss": 4.9752, + "step": 19980 + }, + { + "epoch": 0.11883266723760587, + "grad_norm": 1.5264688730239868, + "learning_rate": 4.82781765498705e-05, + "loss": 5.0295, + "step": 19981 + }, + { + "epoch": 0.11883861452088686, + "grad_norm": 1.6745606660842896, + "learning_rate": 4.827800619733774e-05, + "loss": 5.4265, + "step": 19982 + }, + { + "epoch": 0.11884456180416786, + "grad_norm": 1.5993295907974243, + "learning_rate": 4.8277835836678874e-05, + "loss": 5.0611, + "step": 19983 + }, + { + "epoch": 0.11885050908744885, + "grad_norm": 1.6451520919799805, + "learning_rate": 4.827766546789395e-05, + "loss": 4.9504, + "step": 19984 + }, + { + "epoch": 0.11885645637072985, + "grad_norm": 1.4769519567489624, + "learning_rate": 4.827749509098304e-05, + "loss": 5.1324, + "step": 19985 + }, + { + "epoch": 0.11886240365401085, + "grad_norm": 1.6930506229400635, + "learning_rate": 4.827732470594619e-05, + "loss": 5.134, + "step": 19986 + }, + { + "epoch": 0.11886835093729184, + "grad_norm": 1.1951912641525269, + "learning_rate": 4.827715431278347e-05, + "loss": 5.2521, + "step": 19987 + }, + { + "epoch": 0.11887429822057284, + "grad_norm": 1.3520997762680054, + "learning_rate": 4.827698391149493e-05, + "loss": 5.1791, + "step": 19988 + }, + { + "epoch": 0.11888024550385384, + "grad_norm": 1.3710130453109741, + "learning_rate": 4.8276813502080644e-05, + "loss": 5.1179, + "step": 19989 + }, + { + "epoch": 0.11888619278713483, + "grad_norm": 1.4977210760116577, + "learning_rate": 4.827664308454066e-05, + "loss": 5.1492, + "step": 19990 + }, + { + "epoch": 0.11889214007041583, + "grad_norm": 1.2681607007980347, + "learning_rate": 4.8276472658875035e-05, + "loss": 5.1178, + "step": 19991 + }, + { + "epoch": 0.11889808735369684, + "grad_norm": 1.2606865167617798, + "learning_rate": 4.827630222508385e-05, + "loss": 5.2796, + "step": 19992 + }, + { + "epoch": 0.11890403463697782, + "grad_norm": 1.477273941040039, + "learning_rate": 4.827613178316713e-05, + "loss": 5.251, + "step": 19993 + }, + { + "epoch": 0.11890998192025883, + "grad_norm": 1.4194386005401611, + "learning_rate": 4.8275961333124956e-05, + "loss": 5.157, + "step": 19994 + }, + { + "epoch": 0.11891592920353983, + "grad_norm": 1.2693103551864624, + "learning_rate": 4.8275790874957396e-05, + "loss": 5.2037, + "step": 19995 + }, + { + "epoch": 0.11892187648682082, + "grad_norm": 1.2035702466964722, + "learning_rate": 4.8275620408664487e-05, + "loss": 5.1613, + "step": 19996 + }, + { + "epoch": 0.11892782377010182, + "grad_norm": 1.1674199104309082, + "learning_rate": 4.8275449934246295e-05, + "loss": 5.2415, + "step": 19997 + }, + { + "epoch": 0.11893377105338282, + "grad_norm": 1.5064369440078735, + "learning_rate": 4.8275279451702895e-05, + "loss": 5.2025, + "step": 19998 + }, + { + "epoch": 0.11893971833666381, + "grad_norm": 1.3770934343338013, + "learning_rate": 4.827510896103433e-05, + "loss": 5.0804, + "step": 19999 + }, + { + "epoch": 0.11894566561994481, + "grad_norm": 1.4852590560913086, + "learning_rate": 4.827493846224067e-05, + "loss": 5.0169, + "step": 20000 + }, + { + "epoch": 0.11895161290322581, + "grad_norm": 1.3760627508163452, + "learning_rate": 4.8274767955321966e-05, + "loss": 5.245, + "step": 20001 + }, + { + "epoch": 0.1189575601865068, + "grad_norm": 1.4135125875473022, + "learning_rate": 4.827459744027828e-05, + "loss": 5.1599, + "step": 20002 + }, + { + "epoch": 0.1189635074697878, + "grad_norm": 1.352949857711792, + "learning_rate": 4.8274426917109675e-05, + "loss": 5.187, + "step": 20003 + }, + { + "epoch": 0.1189694547530688, + "grad_norm": 1.279439091682434, + "learning_rate": 4.82742563858162e-05, + "loss": 5.1369, + "step": 20004 + }, + { + "epoch": 0.11897540203634979, + "grad_norm": 1.6078580617904663, + "learning_rate": 4.8274085846397935e-05, + "loss": 5.097, + "step": 20005 + }, + { + "epoch": 0.1189813493196308, + "grad_norm": 1.4414268732070923, + "learning_rate": 4.827391529885492e-05, + "loss": 5.1412, + "step": 20006 + }, + { + "epoch": 0.1189872966029118, + "grad_norm": 1.249731421470642, + "learning_rate": 4.827374474318722e-05, + "loss": 5.002, + "step": 20007 + }, + { + "epoch": 0.11899324388619278, + "grad_norm": 1.5977002382278442, + "learning_rate": 4.82735741793949e-05, + "loss": 5.0387, + "step": 20008 + }, + { + "epoch": 0.11899919116947379, + "grad_norm": 1.5115478038787842, + "learning_rate": 4.8273403607478016e-05, + "loss": 4.9497, + "step": 20009 + }, + { + "epoch": 0.11900513845275479, + "grad_norm": 1.433825135231018, + "learning_rate": 4.8273233027436625e-05, + "loss": 4.9818, + "step": 20010 + }, + { + "epoch": 0.11901108573603578, + "grad_norm": 1.51628839969635, + "learning_rate": 4.827306243927079e-05, + "loss": 4.8819, + "step": 20011 + }, + { + "epoch": 0.11901703301931678, + "grad_norm": 1.3780534267425537, + "learning_rate": 4.8272891842980564e-05, + "loss": 5.18, + "step": 20012 + }, + { + "epoch": 0.11902298030259777, + "grad_norm": 1.2616275548934937, + "learning_rate": 4.8272721238566023e-05, + "loss": 5.549, + "step": 20013 + }, + { + "epoch": 0.11902892758587877, + "grad_norm": 1.2978616952896118, + "learning_rate": 4.8272550626027204e-05, + "loss": 5.4608, + "step": 20014 + }, + { + "epoch": 0.11903487486915977, + "grad_norm": 1.2539299726486206, + "learning_rate": 4.827238000536418e-05, + "loss": 5.5612, + "step": 20015 + }, + { + "epoch": 0.11904082215244076, + "grad_norm": 1.4023045301437378, + "learning_rate": 4.827220937657702e-05, + "loss": 5.2669, + "step": 20016 + }, + { + "epoch": 0.11904676943572176, + "grad_norm": 1.4386683702468872, + "learning_rate": 4.827203873966576e-05, + "loss": 5.0703, + "step": 20017 + }, + { + "epoch": 0.11905271671900276, + "grad_norm": 1.5248057842254639, + "learning_rate": 4.827186809463048e-05, + "loss": 5.0376, + "step": 20018 + }, + { + "epoch": 0.11905866400228375, + "grad_norm": 1.4410630464553833, + "learning_rate": 4.827169744147122e-05, + "loss": 5.1396, + "step": 20019 + }, + { + "epoch": 0.11906461128556475, + "grad_norm": 1.7917122840881348, + "learning_rate": 4.827152678018806e-05, + "loss": 5.1673, + "step": 20020 + }, + { + "epoch": 0.11907055856884576, + "grad_norm": 1.739169716835022, + "learning_rate": 4.827135611078105e-05, + "loss": 5.6848, + "step": 20021 + }, + { + "epoch": 0.11907650585212674, + "grad_norm": 1.6629457473754883, + "learning_rate": 4.827118543325024e-05, + "loss": 5.7335, + "step": 20022 + }, + { + "epoch": 0.11908245313540775, + "grad_norm": 1.634628176689148, + "learning_rate": 4.827101474759571e-05, + "loss": 5.7718, + "step": 20023 + }, + { + "epoch": 0.11908840041868875, + "grad_norm": 1.299861192703247, + "learning_rate": 4.827084405381751e-05, + "loss": 5.6917, + "step": 20024 + }, + { + "epoch": 0.11909434770196974, + "grad_norm": 1.3863619565963745, + "learning_rate": 4.82706733519157e-05, + "loss": 5.7363, + "step": 20025 + }, + { + "epoch": 0.11910029498525074, + "grad_norm": 2.3500845432281494, + "learning_rate": 4.827050264189033e-05, + "loss": 5.192, + "step": 20026 + }, + { + "epoch": 0.11910624226853174, + "grad_norm": 1.426633358001709, + "learning_rate": 4.827033192374147e-05, + "loss": 5.5643, + "step": 20027 + }, + { + "epoch": 0.11911218955181273, + "grad_norm": 1.4728987216949463, + "learning_rate": 4.8270161197469175e-05, + "loss": 5.6323, + "step": 20028 + }, + { + "epoch": 0.11911813683509373, + "grad_norm": 1.66750168800354, + "learning_rate": 4.826999046307352e-05, + "loss": 5.4327, + "step": 20029 + }, + { + "epoch": 0.11912408411837473, + "grad_norm": 1.4894248247146606, + "learning_rate": 4.8269819720554545e-05, + "loss": 5.4332, + "step": 20030 + }, + { + "epoch": 0.11913003140165572, + "grad_norm": 1.5166181325912476, + "learning_rate": 4.826964896991231e-05, + "loss": 5.5467, + "step": 20031 + }, + { + "epoch": 0.11913597868493672, + "grad_norm": 1.2947237491607666, + "learning_rate": 4.826947821114689e-05, + "loss": 5.5116, + "step": 20032 + }, + { + "epoch": 0.11914192596821772, + "grad_norm": 1.3890970945358276, + "learning_rate": 4.8269307444258326e-05, + "loss": 5.5459, + "step": 20033 + }, + { + "epoch": 0.11914787325149871, + "grad_norm": 1.496099591255188, + "learning_rate": 4.8269136669246695e-05, + "loss": 5.5533, + "step": 20034 + }, + { + "epoch": 0.11915382053477971, + "grad_norm": 1.4115175008773804, + "learning_rate": 4.8268965886112045e-05, + "loss": 5.4898, + "step": 20035 + }, + { + "epoch": 0.11915976781806072, + "grad_norm": 1.3803601264953613, + "learning_rate": 4.826879509485444e-05, + "loss": 5.598, + "step": 20036 + }, + { + "epoch": 0.1191657151013417, + "grad_norm": 1.7235617637634277, + "learning_rate": 4.826862429547394e-05, + "loss": 5.5489, + "step": 20037 + }, + { + "epoch": 0.1191716623846227, + "grad_norm": 1.726289987564087, + "learning_rate": 4.82684534879706e-05, + "loss": 5.5461, + "step": 20038 + }, + { + "epoch": 0.11917760966790371, + "grad_norm": 1.593349814414978, + "learning_rate": 4.826828267234449e-05, + "loss": 5.3594, + "step": 20039 + }, + { + "epoch": 0.1191835569511847, + "grad_norm": 2.3147101402282715, + "learning_rate": 4.826811184859566e-05, + "loss": 4.6888, + "step": 20040 + }, + { + "epoch": 0.1191895042344657, + "grad_norm": 2.1485888957977295, + "learning_rate": 4.826794101672417e-05, + "loss": 4.6874, + "step": 20041 + }, + { + "epoch": 0.11919545151774669, + "grad_norm": 2.5710601806640625, + "learning_rate": 4.826777017673009e-05, + "loss": 4.6524, + "step": 20042 + }, + { + "epoch": 0.11920139880102769, + "grad_norm": 2.314556121826172, + "learning_rate": 4.826759932861346e-05, + "loss": 4.3273, + "step": 20043 + }, + { + "epoch": 0.11920734608430869, + "grad_norm": 2.060617208480835, + "learning_rate": 4.826742847237436e-05, + "loss": 4.6601, + "step": 20044 + }, + { + "epoch": 0.11921329336758968, + "grad_norm": 1.9709726572036743, + "learning_rate": 4.826725760801284e-05, + "loss": 6.1007, + "step": 20045 + }, + { + "epoch": 0.11921924065087068, + "grad_norm": 2.0907840728759766, + "learning_rate": 4.826708673552895e-05, + "loss": 6.0386, + "step": 20046 + }, + { + "epoch": 0.11922518793415168, + "grad_norm": 2.02783203125, + "learning_rate": 4.826691585492278e-05, + "loss": 5.4651, + "step": 20047 + }, + { + "epoch": 0.11923113521743267, + "grad_norm": 1.8326990604400635, + "learning_rate": 4.826674496619435e-05, + "loss": 5.7342, + "step": 20048 + }, + { + "epoch": 0.11923708250071367, + "grad_norm": 1.8395801782608032, + "learning_rate": 4.8266574069343753e-05, + "loss": 5.657, + "step": 20049 + }, + { + "epoch": 0.11924302978399468, + "grad_norm": 1.5144078731536865, + "learning_rate": 4.826640316437103e-05, + "loss": 5.6856, + "step": 20050 + }, + { + "epoch": 0.11924897706727566, + "grad_norm": 1.6133313179016113, + "learning_rate": 4.826623225127626e-05, + "loss": 5.114, + "step": 20051 + }, + { + "epoch": 0.11925492435055667, + "grad_norm": 2.0678884983062744, + "learning_rate": 4.826606133005947e-05, + "loss": 5.6642, + "step": 20052 + }, + { + "epoch": 0.11926087163383767, + "grad_norm": 1.7214683294296265, + "learning_rate": 4.8265890400720744e-05, + "loss": 5.8689, + "step": 20053 + }, + { + "epoch": 0.11926681891711866, + "grad_norm": 1.7670868635177612, + "learning_rate": 4.826571946326014e-05, + "loss": 5.6504, + "step": 20054 + }, + { + "epoch": 0.11927276620039966, + "grad_norm": 1.6336724758148193, + "learning_rate": 4.82655485176777e-05, + "loss": 5.7624, + "step": 20055 + }, + { + "epoch": 0.11927871348368066, + "grad_norm": 1.6147593259811401, + "learning_rate": 4.8265377563973514e-05, + "loss": 5.8398, + "step": 20056 + }, + { + "epoch": 0.11928466076696165, + "grad_norm": 1.6203758716583252, + "learning_rate": 4.8265206602147614e-05, + "loss": 5.3793, + "step": 20057 + }, + { + "epoch": 0.11929060805024265, + "grad_norm": 1.8295884132385254, + "learning_rate": 4.8265035632200084e-05, + "loss": 5.0185, + "step": 20058 + }, + { + "epoch": 0.11929655533352365, + "grad_norm": 1.6802337169647217, + "learning_rate": 4.826486465413096e-05, + "loss": 5.8104, + "step": 20059 + }, + { + "epoch": 0.11930250261680464, + "grad_norm": 1.9276031255722046, + "learning_rate": 4.826469366794031e-05, + "loss": 5.2106, + "step": 20060 + }, + { + "epoch": 0.11930844990008564, + "grad_norm": 1.9589072465896606, + "learning_rate": 4.8264522673628205e-05, + "loss": 5.2336, + "step": 20061 + }, + { + "epoch": 0.11931439718336664, + "grad_norm": 3.45713472366333, + "learning_rate": 4.826435167119469e-05, + "loss": 5.7015, + "step": 20062 + }, + { + "epoch": 0.11932034446664763, + "grad_norm": 3.057732343673706, + "learning_rate": 4.826418066063983e-05, + "loss": 4.2376, + "step": 20063 + }, + { + "epoch": 0.11932629174992863, + "grad_norm": 2.9540810585021973, + "learning_rate": 4.8264009641963684e-05, + "loss": 4.1357, + "step": 20064 + }, + { + "epoch": 0.11933223903320964, + "grad_norm": 2.707113027572632, + "learning_rate": 4.826383861516632e-05, + "loss": 3.7255, + "step": 20065 + }, + { + "epoch": 0.11933818631649062, + "grad_norm": 2.488718032836914, + "learning_rate": 4.8263667580247784e-05, + "loss": 3.7309, + "step": 20066 + }, + { + "epoch": 0.11934413359977163, + "grad_norm": 2.6351873874664307, + "learning_rate": 4.826349653720814e-05, + "loss": 3.5953, + "step": 20067 + }, + { + "epoch": 0.11935008088305263, + "grad_norm": 2.866333246231079, + "learning_rate": 4.826332548604745e-05, + "loss": 3.8627, + "step": 20068 + }, + { + "epoch": 0.11935602816633362, + "grad_norm": 1.5446399450302124, + "learning_rate": 4.8263154426765777e-05, + "loss": 5.3014, + "step": 20069 + }, + { + "epoch": 0.11936197544961462, + "grad_norm": 1.7273021936416626, + "learning_rate": 4.8262983359363176e-05, + "loss": 5.6102, + "step": 20070 + }, + { + "epoch": 0.1193679227328956, + "grad_norm": 1.4169118404388428, + "learning_rate": 4.826281228383971e-05, + "loss": 5.6831, + "step": 20071 + }, + { + "epoch": 0.11937387001617661, + "grad_norm": 1.7140129804611206, + "learning_rate": 4.826264120019544e-05, + "loss": 5.6609, + "step": 20072 + }, + { + "epoch": 0.11937981729945761, + "grad_norm": 1.4560796022415161, + "learning_rate": 4.8262470108430414e-05, + "loss": 5.6279, + "step": 20073 + }, + { + "epoch": 0.1193857645827386, + "grad_norm": 1.6894809007644653, + "learning_rate": 4.8262299008544697e-05, + "loss": 5.192, + "step": 20074 + }, + { + "epoch": 0.1193917118660196, + "grad_norm": 2.995307683944702, + "learning_rate": 4.826212790053836e-05, + "loss": 4.9009, + "step": 20075 + }, + { + "epoch": 0.1193976591493006, + "grad_norm": 2.9559946060180664, + "learning_rate": 4.826195678441145e-05, + "loss": 4.8801, + "step": 20076 + }, + { + "epoch": 0.11940360643258159, + "grad_norm": 2.550973653793335, + "learning_rate": 4.826178566016403e-05, + "loss": 4.7061, + "step": 20077 + }, + { + "epoch": 0.11940955371586259, + "grad_norm": 2.0249550342559814, + "learning_rate": 4.826161452779617e-05, + "loss": 5.0315, + "step": 20078 + }, + { + "epoch": 0.1194155009991436, + "grad_norm": 1.6208853721618652, + "learning_rate": 4.826144338730791e-05, + "loss": 5.3685, + "step": 20079 + }, + { + "epoch": 0.11942144828242458, + "grad_norm": 1.6138144731521606, + "learning_rate": 4.826127223869933e-05, + "loss": 5.3098, + "step": 20080 + }, + { + "epoch": 0.11942739556570559, + "grad_norm": 1.6347969770431519, + "learning_rate": 4.8261101081970476e-05, + "loss": 5.7519, + "step": 20081 + }, + { + "epoch": 0.11943334284898659, + "grad_norm": 1.6273889541625977, + "learning_rate": 4.8260929917121403e-05, + "loss": 5.5083, + "step": 20082 + }, + { + "epoch": 0.11943929013226758, + "grad_norm": 1.7236882448196411, + "learning_rate": 4.826075874415219e-05, + "loss": 5.3613, + "step": 20083 + }, + { + "epoch": 0.11944523741554858, + "grad_norm": 1.5177632570266724, + "learning_rate": 4.826058756306289e-05, + "loss": 5.4234, + "step": 20084 + }, + { + "epoch": 0.11945118469882958, + "grad_norm": 1.9017301797866821, + "learning_rate": 4.826041637385354e-05, + "loss": 4.6868, + "step": 20085 + }, + { + "epoch": 0.11945713198211057, + "grad_norm": 1.8880805969238281, + "learning_rate": 4.826024517652425e-05, + "loss": 4.4478, + "step": 20086 + }, + { + "epoch": 0.11946307926539157, + "grad_norm": 1.5617226362228394, + "learning_rate": 4.826007397107503e-05, + "loss": 5.3775, + "step": 20087 + }, + { + "epoch": 0.11946902654867257, + "grad_norm": 1.836101770401001, + "learning_rate": 4.825990275750595e-05, + "loss": 5.33, + "step": 20088 + }, + { + "epoch": 0.11947497383195356, + "grad_norm": 1.6876533031463623, + "learning_rate": 4.825973153581709e-05, + "loss": 5.3164, + "step": 20089 + }, + { + "epoch": 0.11948092111523456, + "grad_norm": 1.7182306051254272, + "learning_rate": 4.82595603060085e-05, + "loss": 5.3545, + "step": 20090 + }, + { + "epoch": 0.11948686839851556, + "grad_norm": 2.160414934158325, + "learning_rate": 4.825938906808023e-05, + "loss": 4.3744, + "step": 20091 + }, + { + "epoch": 0.11949281568179655, + "grad_norm": 1.4865752458572388, + "learning_rate": 4.825921782203236e-05, + "loss": 5.455, + "step": 20092 + }, + { + "epoch": 0.11949876296507755, + "grad_norm": 1.550986409187317, + "learning_rate": 4.825904656786492e-05, + "loss": 5.4879, + "step": 20093 + }, + { + "epoch": 0.11950471024835856, + "grad_norm": 1.473037838935852, + "learning_rate": 4.8258875305577996e-05, + "loss": 5.3964, + "step": 20094 + }, + { + "epoch": 0.11951065753163954, + "grad_norm": 1.6714228391647339, + "learning_rate": 4.825870403517164e-05, + "loss": 5.0215, + "step": 20095 + }, + { + "epoch": 0.11951660481492055, + "grad_norm": 1.7555420398712158, + "learning_rate": 4.8258532756645905e-05, + "loss": 4.9852, + "step": 20096 + }, + { + "epoch": 0.11952255209820155, + "grad_norm": 1.562729835510254, + "learning_rate": 4.825836147000086e-05, + "loss": 4.5928, + "step": 20097 + }, + { + "epoch": 0.11952849938148254, + "grad_norm": 1.7901209592819214, + "learning_rate": 4.825819017523656e-05, + "loss": 5.3176, + "step": 20098 + }, + { + "epoch": 0.11953444666476354, + "grad_norm": 1.605578064918518, + "learning_rate": 4.825801887235307e-05, + "loss": 5.3162, + "step": 20099 + }, + { + "epoch": 0.11954039394804453, + "grad_norm": 1.9077202081680298, + "learning_rate": 4.8257847561350445e-05, + "loss": 5.3378, + "step": 20100 + }, + { + "epoch": 0.11954634123132553, + "grad_norm": 1.9171262979507446, + "learning_rate": 4.825767624222875e-05, + "loss": 5.2585, + "step": 20101 + }, + { + "epoch": 0.11955228851460653, + "grad_norm": 1.5661342144012451, + "learning_rate": 4.825750491498803e-05, + "loss": 5.3421, + "step": 20102 + }, + { + "epoch": 0.11955823579788752, + "grad_norm": 2.188962697982788, + "learning_rate": 4.825733357962836e-05, + "loss": 4.8925, + "step": 20103 + }, + { + "epoch": 0.11956418308116852, + "grad_norm": 1.4218099117279053, + "learning_rate": 4.82571622361498e-05, + "loss": 5.3497, + "step": 20104 + }, + { + "epoch": 0.11957013036444952, + "grad_norm": 1.6142303943634033, + "learning_rate": 4.82569908845524e-05, + "loss": 5.1657, + "step": 20105 + }, + { + "epoch": 0.11957607764773051, + "grad_norm": 1.9385474920272827, + "learning_rate": 4.8256819524836224e-05, + "loss": 5.0509, + "step": 20106 + }, + { + "epoch": 0.11958202493101151, + "grad_norm": 2.077528953552246, + "learning_rate": 4.825664815700134e-05, + "loss": 5.1879, + "step": 20107 + }, + { + "epoch": 0.11958797221429252, + "grad_norm": 2.158764123916626, + "learning_rate": 4.825647678104779e-05, + "loss": 4.9595, + "step": 20108 + }, + { + "epoch": 0.1195939194975735, + "grad_norm": 2.0398664474487305, + "learning_rate": 4.825630539697565e-05, + "loss": 4.9156, + "step": 20109 + }, + { + "epoch": 0.1195998667808545, + "grad_norm": 2.0280275344848633, + "learning_rate": 4.825613400478497e-05, + "loss": 4.8655, + "step": 20110 + }, + { + "epoch": 0.11960581406413551, + "grad_norm": 2.0311338901519775, + "learning_rate": 4.8255962604475816e-05, + "loss": 4.8953, + "step": 20111 + }, + { + "epoch": 0.1196117613474165, + "grad_norm": 2.334346055984497, + "learning_rate": 4.825579119604825e-05, + "loss": 5.0044, + "step": 20112 + }, + { + "epoch": 0.1196177086306975, + "grad_norm": 2.272148847579956, + "learning_rate": 4.825561977950233e-05, + "loss": 4.8911, + "step": 20113 + }, + { + "epoch": 0.1196236559139785, + "grad_norm": 2.0724244117736816, + "learning_rate": 4.8255448354838104e-05, + "loss": 5.3492, + "step": 20114 + }, + { + "epoch": 0.11962960319725949, + "grad_norm": 1.7691513299942017, + "learning_rate": 4.8255276922055644e-05, + "loss": 5.5727, + "step": 20115 + }, + { + "epoch": 0.11963555048054049, + "grad_norm": 1.9434363842010498, + "learning_rate": 4.8255105481155004e-05, + "loss": 5.4564, + "step": 20116 + }, + { + "epoch": 0.11964149776382149, + "grad_norm": 1.623660683631897, + "learning_rate": 4.825493403213626e-05, + "loss": 5.2862, + "step": 20117 + }, + { + "epoch": 0.11964744504710248, + "grad_norm": 1.6246039867401123, + "learning_rate": 4.8254762574999446e-05, + "loss": 5.3627, + "step": 20118 + }, + { + "epoch": 0.11965339233038348, + "grad_norm": 1.689290165901184, + "learning_rate": 4.825459110974464e-05, + "loss": 4.6902, + "step": 20119 + }, + { + "epoch": 0.11965933961366448, + "grad_norm": 1.487697720527649, + "learning_rate": 4.825441963637189e-05, + "loss": 4.7598, + "step": 20120 + }, + { + "epoch": 0.11966528689694547, + "grad_norm": 1.7388331890106201, + "learning_rate": 4.825424815488126e-05, + "loss": 4.709, + "step": 20121 + }, + { + "epoch": 0.11967123418022647, + "grad_norm": 1.9586225748062134, + "learning_rate": 4.8254076665272826e-05, + "loss": 4.4625, + "step": 20122 + }, + { + "epoch": 0.11967718146350748, + "grad_norm": 1.9228769540786743, + "learning_rate": 4.825390516754662e-05, + "loss": 4.1447, + "step": 20123 + }, + { + "epoch": 0.11968312874678846, + "grad_norm": 1.8852907419204712, + "learning_rate": 4.825373366170273e-05, + "loss": 4.2618, + "step": 20124 + }, + { + "epoch": 0.11968907603006947, + "grad_norm": 1.8267028331756592, + "learning_rate": 4.825356214774119e-05, + "loss": 4.4095, + "step": 20125 + }, + { + "epoch": 0.11969502331335047, + "grad_norm": 1.8847311735153198, + "learning_rate": 4.825339062566208e-05, + "loss": 4.1904, + "step": 20126 + }, + { + "epoch": 0.11970097059663146, + "grad_norm": 2.0036990642547607, + "learning_rate": 4.825321909546545e-05, + "loss": 4.2348, + "step": 20127 + }, + { + "epoch": 0.11970691787991246, + "grad_norm": 1.8992520570755005, + "learning_rate": 4.825304755715136e-05, + "loss": 4.3038, + "step": 20128 + }, + { + "epoch": 0.11971286516319345, + "grad_norm": 1.8314359188079834, + "learning_rate": 4.8252876010719874e-05, + "loss": 4.102, + "step": 20129 + }, + { + "epoch": 0.11971881244647445, + "grad_norm": 1.9093595743179321, + "learning_rate": 4.825270445617104e-05, + "loss": 4.0307, + "step": 20130 + }, + { + "epoch": 0.11972475972975545, + "grad_norm": 2.1645400524139404, + "learning_rate": 4.8252532893504936e-05, + "loss": 4.2032, + "step": 20131 + }, + { + "epoch": 0.11973070701303644, + "grad_norm": 2.0268661975860596, + "learning_rate": 4.8252361322721605e-05, + "loss": 4.7705, + "step": 20132 + }, + { + "epoch": 0.11973665429631744, + "grad_norm": 1.8852148056030273, + "learning_rate": 4.825218974382113e-05, + "loss": 4.8969, + "step": 20133 + }, + { + "epoch": 0.11974260157959844, + "grad_norm": 1.9107592105865479, + "learning_rate": 4.825201815680354e-05, + "loss": 5.2587, + "step": 20134 + }, + { + "epoch": 0.11974854886287943, + "grad_norm": 1.6433600187301636, + "learning_rate": 4.825184656166892e-05, + "loss": 5.1954, + "step": 20135 + }, + { + "epoch": 0.11975449614616043, + "grad_norm": 1.4135210514068604, + "learning_rate": 4.825167495841731e-05, + "loss": 5.0398, + "step": 20136 + }, + { + "epoch": 0.11976044342944143, + "grad_norm": 1.9514580965042114, + "learning_rate": 4.825150334704879e-05, + "loss": 4.3527, + "step": 20137 + }, + { + "epoch": 0.11976639071272242, + "grad_norm": 1.8811348676681519, + "learning_rate": 4.825133172756341e-05, + "loss": 4.2798, + "step": 20138 + }, + { + "epoch": 0.11977233799600343, + "grad_norm": 1.8210500478744507, + "learning_rate": 4.825116009996123e-05, + "loss": 4.666, + "step": 20139 + }, + { + "epoch": 0.11977828527928443, + "grad_norm": 1.8773581981658936, + "learning_rate": 4.825098846424231e-05, + "loss": 4.9104, + "step": 20140 + }, + { + "epoch": 0.11978423256256542, + "grad_norm": 1.517233967781067, + "learning_rate": 4.825081682040671e-05, + "loss": 5.5915, + "step": 20141 + }, + { + "epoch": 0.11979017984584642, + "grad_norm": 1.6219067573547363, + "learning_rate": 4.825064516845449e-05, + "loss": 5.6538, + "step": 20142 + }, + { + "epoch": 0.11979612712912742, + "grad_norm": 1.4977927207946777, + "learning_rate": 4.8250473508385707e-05, + "loss": 5.3499, + "step": 20143 + }, + { + "epoch": 0.11980207441240841, + "grad_norm": 1.5381087064743042, + "learning_rate": 4.8250301840200424e-05, + "loss": 5.6666, + "step": 20144 + }, + { + "epoch": 0.11980802169568941, + "grad_norm": 1.5895806550979614, + "learning_rate": 4.82501301638987e-05, + "loss": 5.2099, + "step": 20145 + }, + { + "epoch": 0.11981396897897041, + "grad_norm": 1.7511320114135742, + "learning_rate": 4.8249958479480603e-05, + "loss": 4.622, + "step": 20146 + }, + { + "epoch": 0.1198199162622514, + "grad_norm": 1.8109928369522095, + "learning_rate": 4.824978678694618e-05, + "loss": 4.4156, + "step": 20147 + }, + { + "epoch": 0.1198258635455324, + "grad_norm": 1.474926471710205, + "learning_rate": 4.8249615086295494e-05, + "loss": 5.4845, + "step": 20148 + }, + { + "epoch": 0.1198318108288134, + "grad_norm": 1.8301719427108765, + "learning_rate": 4.824944337752861e-05, + "loss": 5.1814, + "step": 20149 + }, + { + "epoch": 0.11983775811209439, + "grad_norm": 1.8549950122833252, + "learning_rate": 4.824927166064559e-05, + "loss": 5.2944, + "step": 20150 + }, + { + "epoch": 0.1198437053953754, + "grad_norm": 1.7832791805267334, + "learning_rate": 4.8249099935646494e-05, + "loss": 5.7594, + "step": 20151 + }, + { + "epoch": 0.1198496526786564, + "grad_norm": 1.5706509351730347, + "learning_rate": 4.8248928202531366e-05, + "loss": 5.4607, + "step": 20152 + }, + { + "epoch": 0.11985559996193738, + "grad_norm": 1.6395286321640015, + "learning_rate": 4.824875646130028e-05, + "loss": 5.3338, + "step": 20153 + }, + { + "epoch": 0.11986154724521839, + "grad_norm": 1.9523805379867554, + "learning_rate": 4.824858471195329e-05, + "loss": 5.1205, + "step": 20154 + }, + { + "epoch": 0.11986749452849939, + "grad_norm": 2.45190691947937, + "learning_rate": 4.824841295449047e-05, + "loss": 4.5387, + "step": 20155 + }, + { + "epoch": 0.11987344181178038, + "grad_norm": 2.2806150913238525, + "learning_rate": 4.8248241188911856e-05, + "loss": 4.8134, + "step": 20156 + }, + { + "epoch": 0.11987938909506138, + "grad_norm": 2.230710029602051, + "learning_rate": 4.8248069415217534e-05, + "loss": 4.7386, + "step": 20157 + }, + { + "epoch": 0.11988533637834237, + "grad_norm": 2.13611102104187, + "learning_rate": 4.8247897633407546e-05, + "loss": 4.6519, + "step": 20158 + }, + { + "epoch": 0.11989128366162337, + "grad_norm": 1.7644202709197998, + "learning_rate": 4.824772584348196e-05, + "loss": 5.5343, + "step": 20159 + }, + { + "epoch": 0.11989723094490437, + "grad_norm": 1.8997445106506348, + "learning_rate": 4.824755404544083e-05, + "loss": 5.2135, + "step": 20160 + }, + { + "epoch": 0.11990317822818536, + "grad_norm": 1.8288135528564453, + "learning_rate": 4.824738223928421e-05, + "loss": 4.9554, + "step": 20161 + }, + { + "epoch": 0.11990912551146636, + "grad_norm": 1.795866847038269, + "learning_rate": 4.824721042501218e-05, + "loss": 5.6791, + "step": 20162 + }, + { + "epoch": 0.11991507279474736, + "grad_norm": 2.3721072673797607, + "learning_rate": 4.824703860262479e-05, + "loss": 5.4931, + "step": 20163 + }, + { + "epoch": 0.11992102007802835, + "grad_norm": 2.415207862854004, + "learning_rate": 4.824686677212209e-05, + "loss": 5.3801, + "step": 20164 + }, + { + "epoch": 0.11992696736130935, + "grad_norm": 2.411116600036621, + "learning_rate": 4.824669493350415e-05, + "loss": 5.1122, + "step": 20165 + }, + { + "epoch": 0.11993291464459035, + "grad_norm": 1.928256869316101, + "learning_rate": 4.824652308677104e-05, + "loss": 5.1627, + "step": 20166 + }, + { + "epoch": 0.11993886192787134, + "grad_norm": 1.9031376838684082, + "learning_rate": 4.8246351231922803e-05, + "loss": 5.014, + "step": 20167 + }, + { + "epoch": 0.11994480921115235, + "grad_norm": 1.8143563270568848, + "learning_rate": 4.82461793689595e-05, + "loss": 4.8921, + "step": 20168 + }, + { + "epoch": 0.11995075649443335, + "grad_norm": 1.7218538522720337, + "learning_rate": 4.824600749788121e-05, + "loss": 4.83, + "step": 20169 + }, + { + "epoch": 0.11995670377771434, + "grad_norm": 1.8235888481140137, + "learning_rate": 4.824583561868796e-05, + "loss": 5.0709, + "step": 20170 + }, + { + "epoch": 0.11996265106099534, + "grad_norm": 2.404656410217285, + "learning_rate": 4.8245663731379845e-05, + "loss": 4.7555, + "step": 20171 + }, + { + "epoch": 0.11996859834427634, + "grad_norm": 2.0463438034057617, + "learning_rate": 4.82454918359569e-05, + "loss": 5.2582, + "step": 20172 + }, + { + "epoch": 0.11997454562755733, + "grad_norm": 1.9073017835617065, + "learning_rate": 4.82453199324192e-05, + "loss": 5.794, + "step": 20173 + }, + { + "epoch": 0.11998049291083833, + "grad_norm": 1.856101632118225, + "learning_rate": 4.8245148020766796e-05, + "loss": 5.8569, + "step": 20174 + }, + { + "epoch": 0.11998644019411933, + "grad_norm": 1.6862335205078125, + "learning_rate": 4.8244976100999745e-05, + "loss": 5.7762, + "step": 20175 + }, + { + "epoch": 0.11999238747740032, + "grad_norm": 1.8727613687515259, + "learning_rate": 4.824480417311812e-05, + "loss": 5.5417, + "step": 20176 + }, + { + "epoch": 0.11999833476068132, + "grad_norm": 2.2967453002929688, + "learning_rate": 4.8244632237121964e-05, + "loss": 5.3268, + "step": 20177 + }, + { + "epoch": 0.12000428204396232, + "grad_norm": 2.1443405151367188, + "learning_rate": 4.824446029301136e-05, + "loss": 5.1333, + "step": 20178 + }, + { + "epoch": 0.12001022932724331, + "grad_norm": 1.7855141162872314, + "learning_rate": 4.824428834078635e-05, + "loss": 5.2781, + "step": 20179 + }, + { + "epoch": 0.12001617661052431, + "grad_norm": 1.880510926246643, + "learning_rate": 4.8244116380447e-05, + "loss": 5.1012, + "step": 20180 + }, + { + "epoch": 0.12002212389380532, + "grad_norm": 1.6733261346817017, + "learning_rate": 4.824394441199337e-05, + "loss": 5.3, + "step": 20181 + }, + { + "epoch": 0.1200280711770863, + "grad_norm": 1.781132459640503, + "learning_rate": 4.824377243542552e-05, + "loss": 5.7102, + "step": 20182 + }, + { + "epoch": 0.1200340184603673, + "grad_norm": 1.779144287109375, + "learning_rate": 4.82436004507435e-05, + "loss": 5.694, + "step": 20183 + }, + { + "epoch": 0.12003996574364831, + "grad_norm": 1.6547144651412964, + "learning_rate": 4.824342845794739e-05, + "loss": 5.4852, + "step": 20184 + }, + { + "epoch": 0.1200459130269293, + "grad_norm": 1.8403137922286987, + "learning_rate": 4.824325645703723e-05, + "loss": 5.9584, + "step": 20185 + }, + { + "epoch": 0.1200518603102103, + "grad_norm": 1.738139271736145, + "learning_rate": 4.8243084448013095e-05, + "loss": 5.903, + "step": 20186 + }, + { + "epoch": 0.12005780759349129, + "grad_norm": 1.7819492816925049, + "learning_rate": 4.824291243087504e-05, + "loss": 5.587, + "step": 20187 + }, + { + "epoch": 0.12006375487677229, + "grad_norm": 1.5876322984695435, + "learning_rate": 4.824274040562313e-05, + "loss": 5.1007, + "step": 20188 + }, + { + "epoch": 0.12006970216005329, + "grad_norm": 1.6465766429901123, + "learning_rate": 4.824256837225741e-05, + "loss": 4.9674, + "step": 20189 + }, + { + "epoch": 0.12007564944333428, + "grad_norm": 1.5593008995056152, + "learning_rate": 4.824239633077795e-05, + "loss": 4.8428, + "step": 20190 + }, + { + "epoch": 0.12008159672661528, + "grad_norm": 1.9153317213058472, + "learning_rate": 4.8242224281184814e-05, + "loss": 5.7613, + "step": 20191 + }, + { + "epoch": 0.12008754400989628, + "grad_norm": 1.727364182472229, + "learning_rate": 4.8242052223478055e-05, + "loss": 5.8612, + "step": 20192 + }, + { + "epoch": 0.12009349129317727, + "grad_norm": 1.567190408706665, + "learning_rate": 4.8241880157657736e-05, + "loss": 5.9975, + "step": 20193 + }, + { + "epoch": 0.12009943857645827, + "grad_norm": 1.549182415008545, + "learning_rate": 4.824170808372391e-05, + "loss": 5.9723, + "step": 20194 + }, + { + "epoch": 0.12010538585973927, + "grad_norm": 1.6152268648147583, + "learning_rate": 4.824153600167666e-05, + "loss": 5.9953, + "step": 20195 + }, + { + "epoch": 0.12011133314302026, + "grad_norm": 1.5206012725830078, + "learning_rate": 4.824136391151602e-05, + "loss": 5.7435, + "step": 20196 + }, + { + "epoch": 0.12011728042630127, + "grad_norm": 1.719746470451355, + "learning_rate": 4.824119181324206e-05, + "loss": 5.6181, + "step": 20197 + }, + { + "epoch": 0.12012322770958227, + "grad_norm": 1.53969407081604, + "learning_rate": 4.824101970685484e-05, + "loss": 5.2699, + "step": 20198 + }, + { + "epoch": 0.12012917499286326, + "grad_norm": 1.6543430089950562, + "learning_rate": 4.824084759235442e-05, + "loss": 5.3316, + "step": 20199 + }, + { + "epoch": 0.12013512227614426, + "grad_norm": 1.8182042837142944, + "learning_rate": 4.8240675469740856e-05, + "loss": 5.4494, + "step": 20200 + }, + { + "epoch": 0.12014106955942526, + "grad_norm": 1.5531221628189087, + "learning_rate": 4.824050333901422e-05, + "loss": 5.3292, + "step": 20201 + }, + { + "epoch": 0.12014701684270625, + "grad_norm": 1.4964851140975952, + "learning_rate": 4.8240331200174564e-05, + "loss": 5.391, + "step": 20202 + }, + { + "epoch": 0.12015296412598725, + "grad_norm": 1.5492072105407715, + "learning_rate": 4.824015905322195e-05, + "loss": 5.373, + "step": 20203 + }, + { + "epoch": 0.12015891140926825, + "grad_norm": 1.733115792274475, + "learning_rate": 4.823998689815643e-05, + "loss": 5.6997, + "step": 20204 + }, + { + "epoch": 0.12016485869254924, + "grad_norm": 1.8122310638427734, + "learning_rate": 4.8239814734978074e-05, + "loss": 5.4116, + "step": 20205 + }, + { + "epoch": 0.12017080597583024, + "grad_norm": 1.9058727025985718, + "learning_rate": 4.8239642563686934e-05, + "loss": 4.9749, + "step": 20206 + }, + { + "epoch": 0.12017675325911124, + "grad_norm": 1.5442882776260376, + "learning_rate": 4.823947038428308e-05, + "loss": 5.6342, + "step": 20207 + }, + { + "epoch": 0.12018270054239223, + "grad_norm": 1.5593653917312622, + "learning_rate": 4.823929819676657e-05, + "loss": 5.7084, + "step": 20208 + }, + { + "epoch": 0.12018864782567323, + "grad_norm": 1.5067681074142456, + "learning_rate": 4.823912600113746e-05, + "loss": 5.2455, + "step": 20209 + }, + { + "epoch": 0.12019459510895424, + "grad_norm": 1.7560538053512573, + "learning_rate": 4.82389537973958e-05, + "loss": 5.5733, + "step": 20210 + }, + { + "epoch": 0.12020054239223522, + "grad_norm": 1.6941232681274414, + "learning_rate": 4.823878158554167e-05, + "loss": 5.4642, + "step": 20211 + }, + { + "epoch": 0.12020648967551623, + "grad_norm": 1.531043529510498, + "learning_rate": 4.8238609365575124e-05, + "loss": 5.1859, + "step": 20212 + }, + { + "epoch": 0.12021243695879723, + "grad_norm": 1.8201080560684204, + "learning_rate": 4.823843713749622e-05, + "loss": 5.1331, + "step": 20213 + }, + { + "epoch": 0.12021838424207822, + "grad_norm": 1.6585347652435303, + "learning_rate": 4.823826490130501e-05, + "loss": 5.6017, + "step": 20214 + }, + { + "epoch": 0.12022433152535922, + "grad_norm": 1.7156457901000977, + "learning_rate": 4.8238092657001566e-05, + "loss": 5.4022, + "step": 20215 + }, + { + "epoch": 0.1202302788086402, + "grad_norm": 1.474266529083252, + "learning_rate": 4.823792040458595e-05, + "loss": 5.6352, + "step": 20216 + }, + { + "epoch": 0.12023622609192121, + "grad_norm": 1.4047836065292358, + "learning_rate": 4.8237748144058206e-05, + "loss": 5.7834, + "step": 20217 + }, + { + "epoch": 0.12024217337520221, + "grad_norm": 1.4172712564468384, + "learning_rate": 4.823757587541841e-05, + "loss": 5.7711, + "step": 20218 + }, + { + "epoch": 0.1202481206584832, + "grad_norm": 1.6180040836334229, + "learning_rate": 4.823740359866661e-05, + "loss": 4.9208, + "step": 20219 + }, + { + "epoch": 0.1202540679417642, + "grad_norm": 1.917434573173523, + "learning_rate": 4.8237231313802875e-05, + "loss": 5.0108, + "step": 20220 + }, + { + "epoch": 0.1202600152250452, + "grad_norm": 1.6807219982147217, + "learning_rate": 4.823705902082727e-05, + "loss": 4.8156, + "step": 20221 + }, + { + "epoch": 0.12026596250832619, + "grad_norm": 1.7759804725646973, + "learning_rate": 4.823688671973984e-05, + "loss": 4.9253, + "step": 20222 + }, + { + "epoch": 0.12027190979160719, + "grad_norm": 1.667723536491394, + "learning_rate": 4.8236714410540664e-05, + "loss": 5.3166, + "step": 20223 + }, + { + "epoch": 0.1202778570748882, + "grad_norm": 2.089888334274292, + "learning_rate": 4.823654209322977e-05, + "loss": 4.5147, + "step": 20224 + }, + { + "epoch": 0.12028380435816918, + "grad_norm": 1.878585934638977, + "learning_rate": 4.823636976780725e-05, + "loss": 5.2102, + "step": 20225 + }, + { + "epoch": 0.12028975164145018, + "grad_norm": 1.758644461631775, + "learning_rate": 4.8236197434273164e-05, + "loss": 5.7388, + "step": 20226 + }, + { + "epoch": 0.12029569892473119, + "grad_norm": 1.8373035192489624, + "learning_rate": 4.823602509262755e-05, + "loss": 5.0102, + "step": 20227 + }, + { + "epoch": 0.12030164620801218, + "grad_norm": 1.697994589805603, + "learning_rate": 4.8235852742870486e-05, + "loss": 4.8272, + "step": 20228 + }, + { + "epoch": 0.12030759349129318, + "grad_norm": 1.8276288509368896, + "learning_rate": 4.823568038500202e-05, + "loss": 5.2316, + "step": 20229 + }, + { + "epoch": 0.12031354077457418, + "grad_norm": 1.691236972808838, + "learning_rate": 4.823550801902222e-05, + "loss": 5.2957, + "step": 20230 + }, + { + "epoch": 0.12031948805785517, + "grad_norm": 1.5625227689743042, + "learning_rate": 4.823533564493115e-05, + "loss": 5.0525, + "step": 20231 + }, + { + "epoch": 0.12032543534113617, + "grad_norm": 1.927823543548584, + "learning_rate": 4.823516326272886e-05, + "loss": 5.1367, + "step": 20232 + }, + { + "epoch": 0.12033138262441717, + "grad_norm": 1.649434208869934, + "learning_rate": 4.823499087241541e-05, + "loss": 4.8151, + "step": 20233 + }, + { + "epoch": 0.12033732990769816, + "grad_norm": 1.660487413406372, + "learning_rate": 4.8234818473990866e-05, + "loss": 4.8875, + "step": 20234 + }, + { + "epoch": 0.12034327719097916, + "grad_norm": 1.584165096282959, + "learning_rate": 4.823464606745529e-05, + "loss": 5.4909, + "step": 20235 + }, + { + "epoch": 0.12034922447426016, + "grad_norm": 1.6812808513641357, + "learning_rate": 4.823447365280874e-05, + "loss": 5.1194, + "step": 20236 + }, + { + "epoch": 0.12035517175754115, + "grad_norm": 1.6096045970916748, + "learning_rate": 4.823430123005127e-05, + "loss": 4.974, + "step": 20237 + }, + { + "epoch": 0.12036111904082215, + "grad_norm": 1.9969391822814941, + "learning_rate": 4.8234128799182954e-05, + "loss": 4.4403, + "step": 20238 + }, + { + "epoch": 0.12036706632410316, + "grad_norm": 1.7902976274490356, + "learning_rate": 4.8233956360203836e-05, + "loss": 5.0718, + "step": 20239 + }, + { + "epoch": 0.12037301360738414, + "grad_norm": 1.7156457901000977, + "learning_rate": 4.8233783913113985e-05, + "loss": 5.0892, + "step": 20240 + }, + { + "epoch": 0.12037896089066515, + "grad_norm": 2.1590521335601807, + "learning_rate": 4.823361145791346e-05, + "loss": 5.3385, + "step": 20241 + }, + { + "epoch": 0.12038490817394615, + "grad_norm": 1.7091206312179565, + "learning_rate": 4.8233438994602325e-05, + "loss": 5.1961, + "step": 20242 + }, + { + "epoch": 0.12039085545722714, + "grad_norm": 1.3705766201019287, + "learning_rate": 4.823326652318063e-05, + "loss": 5.023, + "step": 20243 + }, + { + "epoch": 0.12039680274050814, + "grad_norm": 1.2733731269836426, + "learning_rate": 4.8233094043648456e-05, + "loss": 5.2236, + "step": 20244 + }, + { + "epoch": 0.12040275002378913, + "grad_norm": 1.3697882890701294, + "learning_rate": 4.823292155600583e-05, + "loss": 5.3146, + "step": 20245 + }, + { + "epoch": 0.12040869730707013, + "grad_norm": 1.4292283058166504, + "learning_rate": 4.8232749060252846e-05, + "loss": 5.2777, + "step": 20246 + }, + { + "epoch": 0.12041464459035113, + "grad_norm": 1.5285491943359375, + "learning_rate": 4.823257655638954e-05, + "loss": 5.3465, + "step": 20247 + }, + { + "epoch": 0.12042059187363212, + "grad_norm": 1.6307164430618286, + "learning_rate": 4.823240404441598e-05, + "loss": 5.2863, + "step": 20248 + }, + { + "epoch": 0.12042653915691312, + "grad_norm": 1.4112886190414429, + "learning_rate": 4.823223152433224e-05, + "loss": 5.3082, + "step": 20249 + }, + { + "epoch": 0.12043248644019412, + "grad_norm": 1.4699361324310303, + "learning_rate": 4.823205899613836e-05, + "loss": 5.2161, + "step": 20250 + }, + { + "epoch": 0.12043843372347511, + "grad_norm": 1.3991621732711792, + "learning_rate": 4.823188645983441e-05, + "loss": 5.2493, + "step": 20251 + }, + { + "epoch": 0.12044438100675611, + "grad_norm": 1.4673911333084106, + "learning_rate": 4.8231713915420446e-05, + "loss": 5.1592, + "step": 20252 + }, + { + "epoch": 0.12045032829003711, + "grad_norm": 1.3782176971435547, + "learning_rate": 4.8231541362896534e-05, + "loss": 5.3296, + "step": 20253 + }, + { + "epoch": 0.1204562755733181, + "grad_norm": 1.5209922790527344, + "learning_rate": 4.823136880226272e-05, + "loss": 5.4215, + "step": 20254 + }, + { + "epoch": 0.1204622228565991, + "grad_norm": 1.3906199932098389, + "learning_rate": 4.823119623351909e-05, + "loss": 5.2263, + "step": 20255 + }, + { + "epoch": 0.1204681701398801, + "grad_norm": 1.4061380624771118, + "learning_rate": 4.823102365666568e-05, + "loss": 5.2252, + "step": 20256 + }, + { + "epoch": 0.1204741174231611, + "grad_norm": 1.3005892038345337, + "learning_rate": 4.8230851071702564e-05, + "loss": 5.2015, + "step": 20257 + }, + { + "epoch": 0.1204800647064421, + "grad_norm": 1.4949315786361694, + "learning_rate": 4.8230678478629796e-05, + "loss": 4.9753, + "step": 20258 + }, + { + "epoch": 0.1204860119897231, + "grad_norm": 1.5322837829589844, + "learning_rate": 4.823050587744744e-05, + "loss": 5.1862, + "step": 20259 + }, + { + "epoch": 0.12049195927300409, + "grad_norm": 1.379016637802124, + "learning_rate": 4.8230333268155556e-05, + "loss": 5.0689, + "step": 20260 + }, + { + "epoch": 0.12049790655628509, + "grad_norm": 1.2959635257720947, + "learning_rate": 4.8230160650754205e-05, + "loss": 5.1079, + "step": 20261 + }, + { + "epoch": 0.12050385383956609, + "grad_norm": 1.3587706089019775, + "learning_rate": 4.8229988025243436e-05, + "loss": 5.2024, + "step": 20262 + }, + { + "epoch": 0.12050980112284708, + "grad_norm": 1.3031280040740967, + "learning_rate": 4.822981539162332e-05, + "loss": 5.1008, + "step": 20263 + }, + { + "epoch": 0.12051574840612808, + "grad_norm": 1.315364956855774, + "learning_rate": 4.822964274989392e-05, + "loss": 4.8122, + "step": 20264 + }, + { + "epoch": 0.12052169568940908, + "grad_norm": 1.3627794981002808, + "learning_rate": 4.8229470100055293e-05, + "loss": 5.0851, + "step": 20265 + }, + { + "epoch": 0.12052764297269007, + "grad_norm": 1.4490907192230225, + "learning_rate": 4.822929744210749e-05, + "loss": 4.7956, + "step": 20266 + }, + { + "epoch": 0.12053359025597107, + "grad_norm": 1.1658390760421753, + "learning_rate": 4.8229124776050584e-05, + "loss": 5.0365, + "step": 20267 + }, + { + "epoch": 0.12053953753925208, + "grad_norm": 1.2844047546386719, + "learning_rate": 4.822895210188463e-05, + "loss": 5.3005, + "step": 20268 + }, + { + "epoch": 0.12054548482253306, + "grad_norm": 1.5759227275848389, + "learning_rate": 4.822877941960969e-05, + "loss": 5.0768, + "step": 20269 + }, + { + "epoch": 0.12055143210581407, + "grad_norm": 1.457592248916626, + "learning_rate": 4.822860672922582e-05, + "loss": 5.1662, + "step": 20270 + }, + { + "epoch": 0.12055737938909507, + "grad_norm": 1.2711186408996582, + "learning_rate": 4.8228434030733086e-05, + "loss": 5.3703, + "step": 20271 + }, + { + "epoch": 0.12056332667237606, + "grad_norm": 1.300824522972107, + "learning_rate": 4.822826132413155e-05, + "loss": 5.2529, + "step": 20272 + }, + { + "epoch": 0.12056927395565706, + "grad_norm": 1.2395694255828857, + "learning_rate": 4.822808860942126e-05, + "loss": 5.3225, + "step": 20273 + }, + { + "epoch": 0.12057522123893805, + "grad_norm": 1.491053581237793, + "learning_rate": 4.822791588660229e-05, + "loss": 5.5039, + "step": 20274 + }, + { + "epoch": 0.12058116852221905, + "grad_norm": 1.4981472492218018, + "learning_rate": 4.8227743155674684e-05, + "loss": 4.8774, + "step": 20275 + }, + { + "epoch": 0.12058711580550005, + "grad_norm": 1.4627505540847778, + "learning_rate": 4.822757041663852e-05, + "loss": 4.9165, + "step": 20276 + }, + { + "epoch": 0.12059306308878104, + "grad_norm": 1.5328632593154907, + "learning_rate": 4.8227397669493856e-05, + "loss": 4.8773, + "step": 20277 + }, + { + "epoch": 0.12059901037206204, + "grad_norm": 1.314146876335144, + "learning_rate": 4.822722491424074e-05, + "loss": 5.0159, + "step": 20278 + }, + { + "epoch": 0.12060495765534304, + "grad_norm": 1.435636043548584, + "learning_rate": 4.822705215087925e-05, + "loss": 5.1621, + "step": 20279 + }, + { + "epoch": 0.12061090493862403, + "grad_norm": 1.3141332864761353, + "learning_rate": 4.822687937940943e-05, + "loss": 5.3143, + "step": 20280 + }, + { + "epoch": 0.12061685222190503, + "grad_norm": 1.3140829801559448, + "learning_rate": 4.822670659983134e-05, + "loss": 5.3171, + "step": 20281 + }, + { + "epoch": 0.12062279950518603, + "grad_norm": 1.5490076541900635, + "learning_rate": 4.8226533812145056e-05, + "loss": 5.1932, + "step": 20282 + }, + { + "epoch": 0.12062874678846702, + "grad_norm": 1.4878573417663574, + "learning_rate": 4.822636101635063e-05, + "loss": 5.1662, + "step": 20283 + }, + { + "epoch": 0.12063469407174802, + "grad_norm": 1.519872784614563, + "learning_rate": 4.822618821244811e-05, + "loss": 5.0641, + "step": 20284 + }, + { + "epoch": 0.12064064135502903, + "grad_norm": 1.430929183959961, + "learning_rate": 4.822601540043757e-05, + "loss": 4.9086, + "step": 20285 + }, + { + "epoch": 0.12064658863831002, + "grad_norm": 1.483995795249939, + "learning_rate": 4.822584258031908e-05, + "loss": 4.992, + "step": 20286 + }, + { + "epoch": 0.12065253592159102, + "grad_norm": 1.3074853420257568, + "learning_rate": 4.822566975209269e-05, + "loss": 4.9514, + "step": 20287 + }, + { + "epoch": 0.12065848320487202, + "grad_norm": 1.6032319068908691, + "learning_rate": 4.822549691575844e-05, + "loss": 4.8495, + "step": 20288 + }, + { + "epoch": 0.12066443048815301, + "grad_norm": 1.2918034791946411, + "learning_rate": 4.822532407131641e-05, + "loss": 5.0728, + "step": 20289 + }, + { + "epoch": 0.12067037777143401, + "grad_norm": 1.3000357151031494, + "learning_rate": 4.8225151218766675e-05, + "loss": 5.0898, + "step": 20290 + }, + { + "epoch": 0.12067632505471501, + "grad_norm": 1.3674614429473877, + "learning_rate": 4.8224978358109274e-05, + "loss": 4.8252, + "step": 20291 + }, + { + "epoch": 0.120682272337996, + "grad_norm": 1.1932893991470337, + "learning_rate": 4.822480548934427e-05, + "loss": 4.9946, + "step": 20292 + }, + { + "epoch": 0.120688219621277, + "grad_norm": 1.1052628755569458, + "learning_rate": 4.822463261247173e-05, + "loss": 5.0293, + "step": 20293 + }, + { + "epoch": 0.120694166904558, + "grad_norm": 1.1658306121826172, + "learning_rate": 4.82244597274917e-05, + "loss": 4.9417, + "step": 20294 + }, + { + "epoch": 0.12070011418783899, + "grad_norm": 1.1357192993164062, + "learning_rate": 4.822428683440426e-05, + "loss": 4.9448, + "step": 20295 + }, + { + "epoch": 0.12070606147112, + "grad_norm": 1.0769197940826416, + "learning_rate": 4.822411393320946e-05, + "loss": 4.8676, + "step": 20296 + }, + { + "epoch": 0.120712008754401, + "grad_norm": 1.4339419603347778, + "learning_rate": 4.8223941023907366e-05, + "loss": 5.0648, + "step": 20297 + }, + { + "epoch": 0.12071795603768198, + "grad_norm": 1.6009191274642944, + "learning_rate": 4.822376810649803e-05, + "loss": 5.2228, + "step": 20298 + }, + { + "epoch": 0.12072390332096299, + "grad_norm": 1.5266865491867065, + "learning_rate": 4.8223595180981515e-05, + "loss": 5.1399, + "step": 20299 + }, + { + "epoch": 0.12072985060424399, + "grad_norm": 1.6861037015914917, + "learning_rate": 4.822342224735788e-05, + "loss": 4.9326, + "step": 20300 + }, + { + "epoch": 0.12073579788752498, + "grad_norm": 1.4925029277801514, + "learning_rate": 4.8223249305627204e-05, + "loss": 4.9586, + "step": 20301 + }, + { + "epoch": 0.12074174517080598, + "grad_norm": 1.3088650703430176, + "learning_rate": 4.822307635578952e-05, + "loss": 5.1486, + "step": 20302 + }, + { + "epoch": 0.12074769245408697, + "grad_norm": 1.5702837705612183, + "learning_rate": 4.82229033978449e-05, + "loss": 4.788, + "step": 20303 + }, + { + "epoch": 0.12075363973736797, + "grad_norm": 1.5717079639434814, + "learning_rate": 4.8222730431793406e-05, + "loss": 4.6354, + "step": 20304 + }, + { + "epoch": 0.12075958702064897, + "grad_norm": 1.4520710706710815, + "learning_rate": 4.822255745763509e-05, + "loss": 4.6995, + "step": 20305 + }, + { + "epoch": 0.12076553430392996, + "grad_norm": 1.57894766330719, + "learning_rate": 4.822238447537003e-05, + "loss": 4.6355, + "step": 20306 + }, + { + "epoch": 0.12077148158721096, + "grad_norm": 1.5820640325546265, + "learning_rate": 4.822221148499827e-05, + "loss": 4.6993, + "step": 20307 + }, + { + "epoch": 0.12077742887049196, + "grad_norm": 1.5759177207946777, + "learning_rate": 4.822203848651987e-05, + "loss": 4.5678, + "step": 20308 + }, + { + "epoch": 0.12078337615377295, + "grad_norm": 1.5758824348449707, + "learning_rate": 4.822186547993491e-05, + "loss": 4.547, + "step": 20309 + }, + { + "epoch": 0.12078932343705395, + "grad_norm": 1.6604961156845093, + "learning_rate": 4.822169246524343e-05, + "loss": 4.6418, + "step": 20310 + }, + { + "epoch": 0.12079527072033495, + "grad_norm": 1.6913725137710571, + "learning_rate": 4.8221519442445496e-05, + "loss": 4.5329, + "step": 20311 + }, + { + "epoch": 0.12080121800361594, + "grad_norm": 1.6500364542007446, + "learning_rate": 4.822134641154117e-05, + "loss": 4.6701, + "step": 20312 + }, + { + "epoch": 0.12080716528689694, + "grad_norm": 1.6819617748260498, + "learning_rate": 4.822117337253051e-05, + "loss": 4.619, + "step": 20313 + }, + { + "epoch": 0.12081311257017795, + "grad_norm": 1.27179753780365, + "learning_rate": 4.8221000325413576e-05, + "loss": 5.091, + "step": 20314 + }, + { + "epoch": 0.12081905985345893, + "grad_norm": 1.357703447341919, + "learning_rate": 4.822082727019044e-05, + "loss": 4.9313, + "step": 20315 + }, + { + "epoch": 0.12082500713673994, + "grad_norm": 1.2419538497924805, + "learning_rate": 4.8220654206861144e-05, + "loss": 4.9511, + "step": 20316 + }, + { + "epoch": 0.12083095442002094, + "grad_norm": 1.4506672620773315, + "learning_rate": 4.822048113542576e-05, + "loss": 5.1608, + "step": 20317 + }, + { + "epoch": 0.12083690170330193, + "grad_norm": 1.597922921180725, + "learning_rate": 4.8220308055884345e-05, + "loss": 5.1663, + "step": 20318 + }, + { + "epoch": 0.12084284898658293, + "grad_norm": 1.2692219018936157, + "learning_rate": 4.822013496823696e-05, + "loss": 5.0838, + "step": 20319 + }, + { + "epoch": 0.12084879626986393, + "grad_norm": 1.427439570426941, + "learning_rate": 4.8219961872483674e-05, + "loss": 4.9863, + "step": 20320 + }, + { + "epoch": 0.12085474355314492, + "grad_norm": 1.3992658853530884, + "learning_rate": 4.821978876862453e-05, + "loss": 5.1907, + "step": 20321 + }, + { + "epoch": 0.12086069083642592, + "grad_norm": 1.3777414560317993, + "learning_rate": 4.8219615656659605e-05, + "loss": 5.137, + "step": 20322 + }, + { + "epoch": 0.12086663811970692, + "grad_norm": 1.3394333124160767, + "learning_rate": 4.821944253658895e-05, + "loss": 5.1222, + "step": 20323 + }, + { + "epoch": 0.12087258540298791, + "grad_norm": 1.3054091930389404, + "learning_rate": 4.8219269408412625e-05, + "loss": 4.9626, + "step": 20324 + }, + { + "epoch": 0.12087853268626891, + "grad_norm": 1.3209751844406128, + "learning_rate": 4.8219096272130696e-05, + "loss": 5.0408, + "step": 20325 + }, + { + "epoch": 0.12088447996954992, + "grad_norm": 1.3860117197036743, + "learning_rate": 4.821892312774322e-05, + "loss": 4.9667, + "step": 20326 + }, + { + "epoch": 0.1208904272528309, + "grad_norm": 1.2468161582946777, + "learning_rate": 4.821874997525025e-05, + "loss": 5.1203, + "step": 20327 + }, + { + "epoch": 0.1208963745361119, + "grad_norm": 1.221932291984558, + "learning_rate": 4.821857681465186e-05, + "loss": 4.9117, + "step": 20328 + }, + { + "epoch": 0.12090232181939291, + "grad_norm": 1.2188096046447754, + "learning_rate": 4.8218403645948105e-05, + "loss": 4.7647, + "step": 20329 + }, + { + "epoch": 0.1209082691026739, + "grad_norm": 1.4023007154464722, + "learning_rate": 4.8218230469139044e-05, + "loss": 4.9038, + "step": 20330 + }, + { + "epoch": 0.1209142163859549, + "grad_norm": 1.4733843803405762, + "learning_rate": 4.821805728422474e-05, + "loss": 4.9782, + "step": 20331 + }, + { + "epoch": 0.12092016366923589, + "grad_norm": 1.405462384223938, + "learning_rate": 4.821788409120525e-05, + "loss": 5.0028, + "step": 20332 + }, + { + "epoch": 0.12092611095251689, + "grad_norm": 1.4103752374649048, + "learning_rate": 4.821771089008064e-05, + "loss": 4.8219, + "step": 20333 + }, + { + "epoch": 0.12093205823579789, + "grad_norm": 1.403225064277649, + "learning_rate": 4.821753768085096e-05, + "loss": 4.9024, + "step": 20334 + }, + { + "epoch": 0.12093800551907888, + "grad_norm": 1.3480467796325684, + "learning_rate": 4.821736446351629e-05, + "loss": 4.9341, + "step": 20335 + }, + { + "epoch": 0.12094395280235988, + "grad_norm": 1.4869621992111206, + "learning_rate": 4.821719123807667e-05, + "loss": 5.6448, + "step": 20336 + }, + { + "epoch": 0.12094990008564088, + "grad_norm": 1.3473197221755981, + "learning_rate": 4.821701800453217e-05, + "loss": 4.9512, + "step": 20337 + }, + { + "epoch": 0.12095584736892187, + "grad_norm": 1.378721833229065, + "learning_rate": 4.821684476288285e-05, + "loss": 5.0146, + "step": 20338 + }, + { + "epoch": 0.12096179465220287, + "grad_norm": 1.2590171098709106, + "learning_rate": 4.821667151312876e-05, + "loss": 4.8453, + "step": 20339 + }, + { + "epoch": 0.12096774193548387, + "grad_norm": 1.3700976371765137, + "learning_rate": 4.821649825526998e-05, + "loss": 4.989, + "step": 20340 + }, + { + "epoch": 0.12097368921876486, + "grad_norm": 1.2956781387329102, + "learning_rate": 4.821632498930656e-05, + "loss": 5.1885, + "step": 20341 + }, + { + "epoch": 0.12097963650204586, + "grad_norm": 1.5004302263259888, + "learning_rate": 4.821615171523856e-05, + "loss": 4.8978, + "step": 20342 + }, + { + "epoch": 0.12098558378532687, + "grad_norm": 1.4427284002304077, + "learning_rate": 4.821597843306603e-05, + "loss": 5.0771, + "step": 20343 + }, + { + "epoch": 0.12099153106860785, + "grad_norm": 1.2329649925231934, + "learning_rate": 4.8215805142789054e-05, + "loss": 5.1695, + "step": 20344 + }, + { + "epoch": 0.12099747835188886, + "grad_norm": 1.521106243133545, + "learning_rate": 4.8215631844407685e-05, + "loss": 4.8117, + "step": 20345 + }, + { + "epoch": 0.12100342563516986, + "grad_norm": 1.4634170532226562, + "learning_rate": 4.8215458537921966e-05, + "loss": 4.8144, + "step": 20346 + }, + { + "epoch": 0.12100937291845085, + "grad_norm": 1.2921918630599976, + "learning_rate": 4.821528522333197e-05, + "loss": 5.0769, + "step": 20347 + }, + { + "epoch": 0.12101532020173185, + "grad_norm": 1.5667484998703003, + "learning_rate": 4.821511190063777e-05, + "loss": 4.7748, + "step": 20348 + }, + { + "epoch": 0.12102126748501285, + "grad_norm": 1.3546236753463745, + "learning_rate": 4.8214938569839405e-05, + "loss": 5.0496, + "step": 20349 + }, + { + "epoch": 0.12102721476829384, + "grad_norm": 1.354236125946045, + "learning_rate": 4.821476523093695e-05, + "loss": 4.9173, + "step": 20350 + }, + { + "epoch": 0.12103316205157484, + "grad_norm": 1.3883708715438843, + "learning_rate": 4.821459188393046e-05, + "loss": 5.0093, + "step": 20351 + }, + { + "epoch": 0.12103910933485584, + "grad_norm": 1.5914138555526733, + "learning_rate": 4.8214418528819995e-05, + "loss": 4.7995, + "step": 20352 + }, + { + "epoch": 0.12104505661813683, + "grad_norm": 1.3804936408996582, + "learning_rate": 4.821424516560561e-05, + "loss": 5.0071, + "step": 20353 + }, + { + "epoch": 0.12105100390141783, + "grad_norm": 1.4783899784088135, + "learning_rate": 4.8214071794287376e-05, + "loss": 4.9744, + "step": 20354 + }, + { + "epoch": 0.12105695118469884, + "grad_norm": 1.480790376663208, + "learning_rate": 4.821389841486535e-05, + "loss": 4.9975, + "step": 20355 + }, + { + "epoch": 0.12106289846797982, + "grad_norm": 1.852853536605835, + "learning_rate": 4.82137250273396e-05, + "loss": 5.069, + "step": 20356 + }, + { + "epoch": 0.12106884575126083, + "grad_norm": 1.623017430305481, + "learning_rate": 4.821355163171016e-05, + "loss": 4.9939, + "step": 20357 + }, + { + "epoch": 0.12107479303454183, + "grad_norm": 1.526219367980957, + "learning_rate": 4.8213378227977123e-05, + "loss": 5.1281, + "step": 20358 + }, + { + "epoch": 0.12108074031782282, + "grad_norm": 1.574321985244751, + "learning_rate": 4.8213204816140536e-05, + "loss": 5.0241, + "step": 20359 + }, + { + "epoch": 0.12108668760110382, + "grad_norm": 1.5978455543518066, + "learning_rate": 4.8213031396200446e-05, + "loss": 5.0107, + "step": 20360 + }, + { + "epoch": 0.12109263488438482, + "grad_norm": 1.509109616279602, + "learning_rate": 4.821285796815694e-05, + "loss": 5.0056, + "step": 20361 + }, + { + "epoch": 0.12109858216766581, + "grad_norm": 1.4923186302185059, + "learning_rate": 4.8212684532010054e-05, + "loss": 5.0412, + "step": 20362 + }, + { + "epoch": 0.12110452945094681, + "grad_norm": 1.7046619653701782, + "learning_rate": 4.8212511087759874e-05, + "loss": 4.8996, + "step": 20363 + }, + { + "epoch": 0.1211104767342278, + "grad_norm": 1.7599172592163086, + "learning_rate": 4.8212337635406435e-05, + "loss": 4.9979, + "step": 20364 + }, + { + "epoch": 0.1211164240175088, + "grad_norm": 1.6309099197387695, + "learning_rate": 4.821216417494982e-05, + "loss": 4.9639, + "step": 20365 + }, + { + "epoch": 0.1211223713007898, + "grad_norm": 1.7311389446258545, + "learning_rate": 4.821199070639006e-05, + "loss": 4.9296, + "step": 20366 + }, + { + "epoch": 0.12112831858407079, + "grad_norm": 1.480536699295044, + "learning_rate": 4.8211817229727246e-05, + "loss": 4.9338, + "step": 20367 + }, + { + "epoch": 0.12113426586735179, + "grad_norm": 1.4267778396606445, + "learning_rate": 4.821164374496143e-05, + "loss": 4.8954, + "step": 20368 + }, + { + "epoch": 0.1211402131506328, + "grad_norm": 1.3726919889450073, + "learning_rate": 4.821147025209266e-05, + "loss": 4.8362, + "step": 20369 + }, + { + "epoch": 0.12114616043391378, + "grad_norm": 1.5158253908157349, + "learning_rate": 4.821129675112101e-05, + "loss": 5.0629, + "step": 20370 + }, + { + "epoch": 0.12115210771719478, + "grad_norm": 1.2002782821655273, + "learning_rate": 4.8211123242046535e-05, + "loss": 4.7668, + "step": 20371 + }, + { + "epoch": 0.12115805500047579, + "grad_norm": 1.123113751411438, + "learning_rate": 4.821094972486929e-05, + "loss": 5.0103, + "step": 20372 + }, + { + "epoch": 0.12116400228375677, + "grad_norm": 1.360532283782959, + "learning_rate": 4.821077619958936e-05, + "loss": 5.0503, + "step": 20373 + }, + { + "epoch": 0.12116994956703778, + "grad_norm": 1.3912672996520996, + "learning_rate": 4.821060266620677e-05, + "loss": 4.9326, + "step": 20374 + }, + { + "epoch": 0.12117589685031878, + "grad_norm": 1.2644896507263184, + "learning_rate": 4.821042912472161e-05, + "loss": 4.9584, + "step": 20375 + }, + { + "epoch": 0.12118184413359977, + "grad_norm": 1.1967086791992188, + "learning_rate": 4.821025557513392e-05, + "loss": 4.8954, + "step": 20376 + }, + { + "epoch": 0.12118779141688077, + "grad_norm": 1.353725552558899, + "learning_rate": 4.821008201744378e-05, + "loss": 4.8438, + "step": 20377 + }, + { + "epoch": 0.12119373870016177, + "grad_norm": 1.239682912826538, + "learning_rate": 4.820990845165123e-05, + "loss": 4.9624, + "step": 20378 + }, + { + "epoch": 0.12119968598344276, + "grad_norm": 1.1952159404754639, + "learning_rate": 4.820973487775634e-05, + "loss": 4.9254, + "step": 20379 + }, + { + "epoch": 0.12120563326672376, + "grad_norm": 1.4531627893447876, + "learning_rate": 4.820956129575918e-05, + "loss": 4.9487, + "step": 20380 + }, + { + "epoch": 0.12121158055000476, + "grad_norm": 1.2653759717941284, + "learning_rate": 4.8209387705659805e-05, + "loss": 4.7916, + "step": 20381 + }, + { + "epoch": 0.12121752783328575, + "grad_norm": 1.3156383037567139, + "learning_rate": 4.820921410745826e-05, + "loss": 5.0585, + "step": 20382 + }, + { + "epoch": 0.12122347511656675, + "grad_norm": 1.536216139793396, + "learning_rate": 4.820904050115462e-05, + "loss": 4.849, + "step": 20383 + }, + { + "epoch": 0.12122942239984776, + "grad_norm": 1.6567318439483643, + "learning_rate": 4.820886688674895e-05, + "loss": 4.6508, + "step": 20384 + }, + { + "epoch": 0.12123536968312874, + "grad_norm": 1.4173903465270996, + "learning_rate": 4.82086932642413e-05, + "loss": 4.8919, + "step": 20385 + }, + { + "epoch": 0.12124131696640975, + "grad_norm": 1.4352593421936035, + "learning_rate": 4.820851963363174e-05, + "loss": 4.7546, + "step": 20386 + }, + { + "epoch": 0.12124726424969075, + "grad_norm": 1.538988471031189, + "learning_rate": 4.8208345994920326e-05, + "loss": 4.7707, + "step": 20387 + }, + { + "epoch": 0.12125321153297174, + "grad_norm": 1.3959681987762451, + "learning_rate": 4.820817234810711e-05, + "loss": 4.5633, + "step": 20388 + }, + { + "epoch": 0.12125915881625274, + "grad_norm": 1.3972582817077637, + "learning_rate": 4.820799869319217e-05, + "loss": 4.5165, + "step": 20389 + }, + { + "epoch": 0.12126510609953374, + "grad_norm": 1.770070195198059, + "learning_rate": 4.820782503017555e-05, + "loss": 4.9679, + "step": 20390 + }, + { + "epoch": 0.12127105338281473, + "grad_norm": 1.6822887659072876, + "learning_rate": 4.820765135905732e-05, + "loss": 4.9589, + "step": 20391 + }, + { + "epoch": 0.12127700066609573, + "grad_norm": 1.6352055072784424, + "learning_rate": 4.820747767983754e-05, + "loss": 5.0389, + "step": 20392 + }, + { + "epoch": 0.12128294794937672, + "grad_norm": 1.4803529977798462, + "learning_rate": 4.8207303992516274e-05, + "loss": 5.1071, + "step": 20393 + }, + { + "epoch": 0.12128889523265772, + "grad_norm": 1.5575767755508423, + "learning_rate": 4.820713029709357e-05, + "loss": 5.2072, + "step": 20394 + }, + { + "epoch": 0.12129484251593872, + "grad_norm": 1.3417006731033325, + "learning_rate": 4.82069565935695e-05, + "loss": 5.1991, + "step": 20395 + }, + { + "epoch": 0.12130078979921971, + "grad_norm": 1.3577461242675781, + "learning_rate": 4.820678288194412e-05, + "loss": 5.3225, + "step": 20396 + }, + { + "epoch": 0.12130673708250071, + "grad_norm": 1.3763153553009033, + "learning_rate": 4.8206609162217494e-05, + "loss": 5.0247, + "step": 20397 + }, + { + "epoch": 0.12131268436578171, + "grad_norm": 1.7175389528274536, + "learning_rate": 4.8206435434389675e-05, + "loss": 5.2964, + "step": 20398 + }, + { + "epoch": 0.1213186316490627, + "grad_norm": 1.4921340942382812, + "learning_rate": 4.820626169846073e-05, + "loss": 4.781, + "step": 20399 + }, + { + "epoch": 0.1213245789323437, + "grad_norm": 1.3149629831314087, + "learning_rate": 4.8206087954430725e-05, + "loss": 5.2148, + "step": 20400 + }, + { + "epoch": 0.1213305262156247, + "grad_norm": 1.5960938930511475, + "learning_rate": 4.8205914202299715e-05, + "loss": 5.4152, + "step": 20401 + }, + { + "epoch": 0.1213364734989057, + "grad_norm": 1.4101301431655884, + "learning_rate": 4.8205740442067757e-05, + "loss": 5.2033, + "step": 20402 + }, + { + "epoch": 0.1213424207821867, + "grad_norm": 1.2584593296051025, + "learning_rate": 4.820556667373492e-05, + "loss": 5.1183, + "step": 20403 + }, + { + "epoch": 0.1213483680654677, + "grad_norm": 1.365639090538025, + "learning_rate": 4.820539289730125e-05, + "loss": 4.9446, + "step": 20404 + }, + { + "epoch": 0.12135431534874869, + "grad_norm": 1.6515495777130127, + "learning_rate": 4.820521911276682e-05, + "loss": 4.9002, + "step": 20405 + }, + { + "epoch": 0.12136026263202969, + "grad_norm": 1.2481954097747803, + "learning_rate": 4.8205045320131684e-05, + "loss": 5.3332, + "step": 20406 + }, + { + "epoch": 0.12136620991531069, + "grad_norm": 1.4952552318572998, + "learning_rate": 4.820487151939591e-05, + "loss": 4.7492, + "step": 20407 + }, + { + "epoch": 0.12137215719859168, + "grad_norm": 1.4472140073776245, + "learning_rate": 4.8204697710559556e-05, + "loss": 5.158, + "step": 20408 + }, + { + "epoch": 0.12137810448187268, + "grad_norm": 1.2544384002685547, + "learning_rate": 4.8204523893622685e-05, + "loss": 5.2041, + "step": 20409 + }, + { + "epoch": 0.12138405176515368, + "grad_norm": 1.1283172369003296, + "learning_rate": 4.820435006858535e-05, + "loss": 5.246, + "step": 20410 + }, + { + "epoch": 0.12138999904843467, + "grad_norm": 1.1113736629486084, + "learning_rate": 4.8204176235447617e-05, + "loss": 5.2116, + "step": 20411 + }, + { + "epoch": 0.12139594633171567, + "grad_norm": 1.2103666067123413, + "learning_rate": 4.820400239420955e-05, + "loss": 5.4421, + "step": 20412 + }, + { + "epoch": 0.12140189361499668, + "grad_norm": 1.2054588794708252, + "learning_rate": 4.82038285448712e-05, + "loss": 5.2503, + "step": 20413 + }, + { + "epoch": 0.12140784089827766, + "grad_norm": 1.568247675895691, + "learning_rate": 4.820365468743263e-05, + "loss": 4.9009, + "step": 20414 + }, + { + "epoch": 0.12141378818155867, + "grad_norm": 1.7106029987335205, + "learning_rate": 4.820348082189391e-05, + "loss": 4.7826, + "step": 20415 + }, + { + "epoch": 0.12141973546483967, + "grad_norm": 1.4479119777679443, + "learning_rate": 4.8203306948255095e-05, + "loss": 5.0084, + "step": 20416 + }, + { + "epoch": 0.12142568274812066, + "grad_norm": 1.467880368232727, + "learning_rate": 4.820313306651624e-05, + "loss": 5.2477, + "step": 20417 + }, + { + "epoch": 0.12143163003140166, + "grad_norm": 1.4088891744613647, + "learning_rate": 4.820295917667742e-05, + "loss": 5.1151, + "step": 20418 + }, + { + "epoch": 0.12143757731468266, + "grad_norm": 1.2838404178619385, + "learning_rate": 4.820278527873868e-05, + "loss": 5.6774, + "step": 20419 + }, + { + "epoch": 0.12144352459796365, + "grad_norm": 1.2146633863449097, + "learning_rate": 4.820261137270009e-05, + "loss": 5.0487, + "step": 20420 + }, + { + "epoch": 0.12144947188124465, + "grad_norm": 1.5603777170181274, + "learning_rate": 4.820243745856171e-05, + "loss": 5.0569, + "step": 20421 + }, + { + "epoch": 0.12145541916452564, + "grad_norm": 1.4454957246780396, + "learning_rate": 4.8202263536323586e-05, + "loss": 4.9556, + "step": 20422 + }, + { + "epoch": 0.12146136644780664, + "grad_norm": 1.4732788801193237, + "learning_rate": 4.820208960598581e-05, + "loss": 5.0095, + "step": 20423 + }, + { + "epoch": 0.12146731373108764, + "grad_norm": 1.4078243970870972, + "learning_rate": 4.820191566754841e-05, + "loss": 5.2642, + "step": 20424 + }, + { + "epoch": 0.12147326101436863, + "grad_norm": 1.2497223615646362, + "learning_rate": 4.820174172101147e-05, + "loss": 5.0792, + "step": 20425 + }, + { + "epoch": 0.12147920829764963, + "grad_norm": 1.5479954481124878, + "learning_rate": 4.8201567766375034e-05, + "loss": 4.9157, + "step": 20426 + }, + { + "epoch": 0.12148515558093063, + "grad_norm": 1.4266546964645386, + "learning_rate": 4.8201393803639175e-05, + "loss": 4.8776, + "step": 20427 + }, + { + "epoch": 0.12149110286421162, + "grad_norm": 1.3757798671722412, + "learning_rate": 4.8201219832803946e-05, + "loss": 4.8253, + "step": 20428 + }, + { + "epoch": 0.12149705014749262, + "grad_norm": 1.3386640548706055, + "learning_rate": 4.8201045853869416e-05, + "loss": 4.7895, + "step": 20429 + }, + { + "epoch": 0.12150299743077363, + "grad_norm": 1.3968008756637573, + "learning_rate": 4.820087186683564e-05, + "loss": 4.7238, + "step": 20430 + }, + { + "epoch": 0.12150894471405461, + "grad_norm": 1.4070801734924316, + "learning_rate": 4.820069787170267e-05, + "loss": 4.9614, + "step": 20431 + }, + { + "epoch": 0.12151489199733562, + "grad_norm": 1.4280625581741333, + "learning_rate": 4.820052386847059e-05, + "loss": 4.6765, + "step": 20432 + }, + { + "epoch": 0.12152083928061662, + "grad_norm": 1.3841910362243652, + "learning_rate": 4.820034985713944e-05, + "loss": 4.8008, + "step": 20433 + }, + { + "epoch": 0.1215267865638976, + "grad_norm": 1.3555341958999634, + "learning_rate": 4.820017583770928e-05, + "loss": 4.7967, + "step": 20434 + }, + { + "epoch": 0.12153273384717861, + "grad_norm": 1.3651773929595947, + "learning_rate": 4.820000181018019e-05, + "loss": 4.9003, + "step": 20435 + }, + { + "epoch": 0.12153868113045961, + "grad_norm": 1.364749789237976, + "learning_rate": 4.8199827774552215e-05, + "loss": 4.9834, + "step": 20436 + }, + { + "epoch": 0.1215446284137406, + "grad_norm": 1.384041428565979, + "learning_rate": 4.8199653730825424e-05, + "loss": 4.9997, + "step": 20437 + }, + { + "epoch": 0.1215505756970216, + "grad_norm": 1.544909954071045, + "learning_rate": 4.8199479678999867e-05, + "loss": 4.7909, + "step": 20438 + }, + { + "epoch": 0.1215565229803026, + "grad_norm": 1.4012216329574585, + "learning_rate": 4.819930561907561e-05, + "loss": 4.7359, + "step": 20439 + }, + { + "epoch": 0.12156247026358359, + "grad_norm": 1.2876297235488892, + "learning_rate": 4.819913155105272e-05, + "loss": 4.5025, + "step": 20440 + }, + { + "epoch": 0.1215684175468646, + "grad_norm": 1.5008763074874878, + "learning_rate": 4.819895747493125e-05, + "loss": 4.4486, + "step": 20441 + }, + { + "epoch": 0.1215743648301456, + "grad_norm": 1.3206987380981445, + "learning_rate": 4.8198783390711264e-05, + "loss": 4.4767, + "step": 20442 + }, + { + "epoch": 0.12158031211342658, + "grad_norm": 1.3569231033325195, + "learning_rate": 4.819860929839283e-05, + "loss": 4.6665, + "step": 20443 + }, + { + "epoch": 0.12158625939670759, + "grad_norm": 1.3377808332443237, + "learning_rate": 4.8198435197975986e-05, + "loss": 4.6109, + "step": 20444 + }, + { + "epoch": 0.12159220667998859, + "grad_norm": 1.5400346517562866, + "learning_rate": 4.8198261089460824e-05, + "loss": 4.2635, + "step": 20445 + }, + { + "epoch": 0.12159815396326958, + "grad_norm": 1.6329059600830078, + "learning_rate": 4.8198086972847376e-05, + "loss": 4.3745, + "step": 20446 + }, + { + "epoch": 0.12160410124655058, + "grad_norm": 1.4274183511734009, + "learning_rate": 4.819791284813573e-05, + "loss": 4.4103, + "step": 20447 + }, + { + "epoch": 0.12161004852983158, + "grad_norm": 1.4671530723571777, + "learning_rate": 4.8197738715325916e-05, + "loss": 4.3995, + "step": 20448 + }, + { + "epoch": 0.12161599581311257, + "grad_norm": 1.3783891201019287, + "learning_rate": 4.819756457441802e-05, + "loss": 4.3874, + "step": 20449 + }, + { + "epoch": 0.12162194309639357, + "grad_norm": 1.4054951667785645, + "learning_rate": 4.819739042541209e-05, + "loss": 4.3307, + "step": 20450 + }, + { + "epoch": 0.12162789037967456, + "grad_norm": 1.5449576377868652, + "learning_rate": 4.81972162683082e-05, + "loss": 4.8499, + "step": 20451 + }, + { + "epoch": 0.12163383766295556, + "grad_norm": 1.3887544870376587, + "learning_rate": 4.8197042103106394e-05, + "loss": 4.622, + "step": 20452 + }, + { + "epoch": 0.12163978494623656, + "grad_norm": 1.319422960281372, + "learning_rate": 4.819686792980673e-05, + "loss": 4.5172, + "step": 20453 + }, + { + "epoch": 0.12164573222951755, + "grad_norm": 1.3681663274765015, + "learning_rate": 4.8196693748409296e-05, + "loss": 4.8121, + "step": 20454 + }, + { + "epoch": 0.12165167951279855, + "grad_norm": 1.250482439994812, + "learning_rate": 4.819651955891413e-05, + "loss": 4.8792, + "step": 20455 + }, + { + "epoch": 0.12165762679607955, + "grad_norm": 1.3297876119613647, + "learning_rate": 4.819634536132129e-05, + "loss": 5.1069, + "step": 20456 + }, + { + "epoch": 0.12166357407936054, + "grad_norm": 1.3733534812927246, + "learning_rate": 4.819617115563086e-05, + "loss": 4.6061, + "step": 20457 + }, + { + "epoch": 0.12166952136264154, + "grad_norm": 1.287663459777832, + "learning_rate": 4.819599694184288e-05, + "loss": 4.9407, + "step": 20458 + }, + { + "epoch": 0.12167546864592255, + "grad_norm": 1.4198147058486938, + "learning_rate": 4.8195822719957416e-05, + "loss": 4.5361, + "step": 20459 + }, + { + "epoch": 0.12168141592920353, + "grad_norm": 1.7429990768432617, + "learning_rate": 4.819564848997453e-05, + "loss": 4.6604, + "step": 20460 + }, + { + "epoch": 0.12168736321248454, + "grad_norm": 1.4298913478851318, + "learning_rate": 4.819547425189429e-05, + "loss": 4.7415, + "step": 20461 + }, + { + "epoch": 0.12169331049576554, + "grad_norm": 1.3519923686981201, + "learning_rate": 4.8195300005716736e-05, + "loss": 5.3706, + "step": 20462 + }, + { + "epoch": 0.12169925777904653, + "grad_norm": 1.1476925611495972, + "learning_rate": 4.819512575144195e-05, + "loss": 5.4474, + "step": 20463 + }, + { + "epoch": 0.12170520506232753, + "grad_norm": 1.2756370306015015, + "learning_rate": 4.819495148906999e-05, + "loss": 4.9747, + "step": 20464 + }, + { + "epoch": 0.12171115234560853, + "grad_norm": 1.3161675930023193, + "learning_rate": 4.8194777218600906e-05, + "loss": 4.7093, + "step": 20465 + }, + { + "epoch": 0.12171709962888952, + "grad_norm": 1.4928854703903198, + "learning_rate": 4.8194602940034766e-05, + "loss": 4.7517, + "step": 20466 + }, + { + "epoch": 0.12172304691217052, + "grad_norm": 1.426684856414795, + "learning_rate": 4.819442865337163e-05, + "loss": 4.8639, + "step": 20467 + }, + { + "epoch": 0.12172899419545152, + "grad_norm": 1.368988037109375, + "learning_rate": 4.819425435861156e-05, + "loss": 4.8532, + "step": 20468 + }, + { + "epoch": 0.12173494147873251, + "grad_norm": 1.492031455039978, + "learning_rate": 4.819408005575461e-05, + "loss": 4.5139, + "step": 20469 + }, + { + "epoch": 0.12174088876201351, + "grad_norm": 1.6340793371200562, + "learning_rate": 4.819390574480085e-05, + "loss": 4.4042, + "step": 20470 + }, + { + "epoch": 0.12174683604529452, + "grad_norm": 1.5353302955627441, + "learning_rate": 4.819373142575034e-05, + "loss": 5.1097, + "step": 20471 + }, + { + "epoch": 0.1217527833285755, + "grad_norm": 1.5314761400222778, + "learning_rate": 4.8193557098603134e-05, + "loss": 4.7689, + "step": 20472 + }, + { + "epoch": 0.1217587306118565, + "grad_norm": 1.4626027345657349, + "learning_rate": 4.8193382763359295e-05, + "loss": 4.434, + "step": 20473 + }, + { + "epoch": 0.12176467789513751, + "grad_norm": 1.621871829032898, + "learning_rate": 4.8193208420018885e-05, + "loss": 4.5098, + "step": 20474 + }, + { + "epoch": 0.1217706251784185, + "grad_norm": 1.5429425239562988, + "learning_rate": 4.819303406858198e-05, + "loss": 4.4547, + "step": 20475 + }, + { + "epoch": 0.1217765724616995, + "grad_norm": 1.5002613067626953, + "learning_rate": 4.819285970904861e-05, + "loss": 4.6906, + "step": 20476 + }, + { + "epoch": 0.1217825197449805, + "grad_norm": 1.2322206497192383, + "learning_rate": 4.819268534141886e-05, + "loss": 5.049, + "step": 20477 + }, + { + "epoch": 0.12178846702826149, + "grad_norm": 1.2598546743392944, + "learning_rate": 4.819251096569278e-05, + "loss": 5.2906, + "step": 20478 + }, + { + "epoch": 0.12179441431154249, + "grad_norm": 1.2702369689941406, + "learning_rate": 4.8192336581870436e-05, + "loss": 5.1828, + "step": 20479 + }, + { + "epoch": 0.12180036159482348, + "grad_norm": 1.3816938400268555, + "learning_rate": 4.819216218995189e-05, + "loss": 5.1083, + "step": 20480 + }, + { + "epoch": 0.12180630887810448, + "grad_norm": 1.2958251237869263, + "learning_rate": 4.819198778993719e-05, + "loss": 5.1715, + "step": 20481 + }, + { + "epoch": 0.12181225616138548, + "grad_norm": 1.2317209243774414, + "learning_rate": 4.819181338182641e-05, + "loss": 5.1969, + "step": 20482 + }, + { + "epoch": 0.12181820344466647, + "grad_norm": 1.362483263015747, + "learning_rate": 4.819163896561961e-05, + "loss": 5.0893, + "step": 20483 + }, + { + "epoch": 0.12182415072794747, + "grad_norm": 1.1019991636276245, + "learning_rate": 4.819146454131685e-05, + "loss": 5.411, + "step": 20484 + }, + { + "epoch": 0.12183009801122847, + "grad_norm": 1.3575057983398438, + "learning_rate": 4.8191290108918184e-05, + "loss": 5.1797, + "step": 20485 + }, + { + "epoch": 0.12183604529450946, + "grad_norm": 1.4110307693481445, + "learning_rate": 4.8191115668423685e-05, + "loss": 5.3108, + "step": 20486 + }, + { + "epoch": 0.12184199257779046, + "grad_norm": 1.3322244882583618, + "learning_rate": 4.819094121983341e-05, + "loss": 5.238, + "step": 20487 + }, + { + "epoch": 0.12184793986107147, + "grad_norm": 1.3466796875, + "learning_rate": 4.819076676314741e-05, + "loss": 5.2786, + "step": 20488 + }, + { + "epoch": 0.12185388714435245, + "grad_norm": 1.4118572473526, + "learning_rate": 4.819059229836575e-05, + "loss": 5.0254, + "step": 20489 + }, + { + "epoch": 0.12185983442763346, + "grad_norm": 1.6264641284942627, + "learning_rate": 4.81904178254885e-05, + "loss": 4.8822, + "step": 20490 + }, + { + "epoch": 0.12186578171091446, + "grad_norm": 1.325591802597046, + "learning_rate": 4.8190243344515705e-05, + "loss": 5.5997, + "step": 20491 + }, + { + "epoch": 0.12187172899419545, + "grad_norm": 1.5424168109893799, + "learning_rate": 4.8190068855447444e-05, + "loss": 5.2096, + "step": 20492 + }, + { + "epoch": 0.12187767627747645, + "grad_norm": 1.3096263408660889, + "learning_rate": 4.818989435828377e-05, + "loss": 5.1026, + "step": 20493 + }, + { + "epoch": 0.12188362356075745, + "grad_norm": 1.3479657173156738, + "learning_rate": 4.8189719853024746e-05, + "loss": 5.0403, + "step": 20494 + }, + { + "epoch": 0.12188957084403844, + "grad_norm": 1.1970547437667847, + "learning_rate": 4.818954533967043e-05, + "loss": 5.06, + "step": 20495 + }, + { + "epoch": 0.12189551812731944, + "grad_norm": 1.3364722728729248, + "learning_rate": 4.818937081822088e-05, + "loss": 5.0216, + "step": 20496 + }, + { + "epoch": 0.12190146541060044, + "grad_norm": 1.2553714513778687, + "learning_rate": 4.818919628867615e-05, + "loss": 4.9662, + "step": 20497 + }, + { + "epoch": 0.12190741269388143, + "grad_norm": 1.270330786705017, + "learning_rate": 4.818902175103633e-05, + "loss": 4.8526, + "step": 20498 + }, + { + "epoch": 0.12191335997716243, + "grad_norm": 1.4872468709945679, + "learning_rate": 4.818884720530145e-05, + "loss": 4.9435, + "step": 20499 + }, + { + "epoch": 0.12191930726044344, + "grad_norm": 1.3152670860290527, + "learning_rate": 4.818867265147159e-05, + "loss": 5.1301, + "step": 20500 + }, + { + "epoch": 0.12192525454372442, + "grad_norm": 1.210864543914795, + "learning_rate": 4.8188498089546794e-05, + "loss": 5.1465, + "step": 20501 + }, + { + "epoch": 0.12193120182700543, + "grad_norm": 1.276159644126892, + "learning_rate": 4.818832351952715e-05, + "loss": 5.0847, + "step": 20502 + }, + { + "epoch": 0.12193714911028643, + "grad_norm": 1.449988842010498, + "learning_rate": 4.8188148941412684e-05, + "loss": 5.1143, + "step": 20503 + }, + { + "epoch": 0.12194309639356742, + "grad_norm": 1.241921305656433, + "learning_rate": 4.818797435520348e-05, + "loss": 5.067, + "step": 20504 + }, + { + "epoch": 0.12194904367684842, + "grad_norm": 1.3087794780731201, + "learning_rate": 4.81877997608996e-05, + "loss": 5.121, + "step": 20505 + }, + { + "epoch": 0.12195499096012942, + "grad_norm": 1.2226066589355469, + "learning_rate": 4.8187625158501095e-05, + "loss": 5.1879, + "step": 20506 + }, + { + "epoch": 0.12196093824341041, + "grad_norm": 1.2744648456573486, + "learning_rate": 4.8187450548008025e-05, + "loss": 5.1308, + "step": 20507 + }, + { + "epoch": 0.12196688552669141, + "grad_norm": 1.3409245014190674, + "learning_rate": 4.8187275929420464e-05, + "loss": 5.0914, + "step": 20508 + }, + { + "epoch": 0.1219728328099724, + "grad_norm": 1.2840641736984253, + "learning_rate": 4.818710130273846e-05, + "loss": 5.0818, + "step": 20509 + }, + { + "epoch": 0.1219787800932534, + "grad_norm": 1.4204998016357422, + "learning_rate": 4.818692666796207e-05, + "loss": 5.4553, + "step": 20510 + }, + { + "epoch": 0.1219847273765344, + "grad_norm": 1.3061211109161377, + "learning_rate": 4.818675202509137e-05, + "loss": 5.1777, + "step": 20511 + }, + { + "epoch": 0.12199067465981539, + "grad_norm": 1.3137598037719727, + "learning_rate": 4.818657737412642e-05, + "loss": 5.1156, + "step": 20512 + }, + { + "epoch": 0.12199662194309639, + "grad_norm": 1.1616209745407104, + "learning_rate": 4.818640271506727e-05, + "loss": 5.3169, + "step": 20513 + }, + { + "epoch": 0.1220025692263774, + "grad_norm": 1.270844578742981, + "learning_rate": 4.8186228047914e-05, + "loss": 5.3005, + "step": 20514 + }, + { + "epoch": 0.12200851650965838, + "grad_norm": 1.4955285787582397, + "learning_rate": 4.818605337266664e-05, + "loss": 5.1762, + "step": 20515 + }, + { + "epoch": 0.12201446379293938, + "grad_norm": 1.3431698083877563, + "learning_rate": 4.818587868932527e-05, + "loss": 4.9477, + "step": 20516 + }, + { + "epoch": 0.12202041107622039, + "grad_norm": 1.3437286615371704, + "learning_rate": 4.818570399788995e-05, + "loss": 4.7787, + "step": 20517 + }, + { + "epoch": 0.12202635835950137, + "grad_norm": 1.3840901851654053, + "learning_rate": 4.818552929836074e-05, + "loss": 5.0749, + "step": 20518 + }, + { + "epoch": 0.12203230564278238, + "grad_norm": 1.3907465934753418, + "learning_rate": 4.8185354590737707e-05, + "loss": 4.9084, + "step": 20519 + }, + { + "epoch": 0.12203825292606338, + "grad_norm": 1.360065221786499, + "learning_rate": 4.818517987502091e-05, + "loss": 4.9323, + "step": 20520 + }, + { + "epoch": 0.12204420020934437, + "grad_norm": 1.1924186944961548, + "learning_rate": 4.818500515121039e-05, + "loss": 4.8237, + "step": 20521 + }, + { + "epoch": 0.12205014749262537, + "grad_norm": 1.6362069845199585, + "learning_rate": 4.818483041930624e-05, + "loss": 4.6073, + "step": 20522 + }, + { + "epoch": 0.12205609477590637, + "grad_norm": 1.4413504600524902, + "learning_rate": 4.81846556793085e-05, + "loss": 4.7733, + "step": 20523 + }, + { + "epoch": 0.12206204205918736, + "grad_norm": 1.5076016187667847, + "learning_rate": 4.818448093121723e-05, + "loss": 5.4376, + "step": 20524 + }, + { + "epoch": 0.12206798934246836, + "grad_norm": 1.5311039686203003, + "learning_rate": 4.818430617503251e-05, + "loss": 5.1398, + "step": 20525 + }, + { + "epoch": 0.12207393662574936, + "grad_norm": 1.4373403787612915, + "learning_rate": 4.818413141075438e-05, + "loss": 4.897, + "step": 20526 + }, + { + "epoch": 0.12207988390903035, + "grad_norm": 1.4221818447113037, + "learning_rate": 4.818395663838291e-05, + "loss": 5.223, + "step": 20527 + }, + { + "epoch": 0.12208583119231135, + "grad_norm": 1.2606967687606812, + "learning_rate": 4.818378185791817e-05, + "loss": 4.7242, + "step": 20528 + }, + { + "epoch": 0.12209177847559236, + "grad_norm": 1.2508289813995361, + "learning_rate": 4.818360706936019e-05, + "loss": 4.623, + "step": 20529 + }, + { + "epoch": 0.12209772575887334, + "grad_norm": 1.3701050281524658, + "learning_rate": 4.8183432272709065e-05, + "loss": 4.6716, + "step": 20530 + }, + { + "epoch": 0.12210367304215435, + "grad_norm": 1.5785399675369263, + "learning_rate": 4.818325746796485e-05, + "loss": 4.5495, + "step": 20531 + }, + { + "epoch": 0.12210962032543535, + "grad_norm": 1.4542807340621948, + "learning_rate": 4.8183082655127584e-05, + "loss": 4.6848, + "step": 20532 + }, + { + "epoch": 0.12211556760871634, + "grad_norm": 1.2740551233291626, + "learning_rate": 4.818290783419736e-05, + "loss": 4.7792, + "step": 20533 + }, + { + "epoch": 0.12212151489199734, + "grad_norm": 1.2965741157531738, + "learning_rate": 4.8182733005174205e-05, + "loss": 4.7552, + "step": 20534 + }, + { + "epoch": 0.12212746217527834, + "grad_norm": 1.3440501689910889, + "learning_rate": 4.8182558168058215e-05, + "loss": 5.0506, + "step": 20535 + }, + { + "epoch": 0.12213340945855933, + "grad_norm": 1.3767000436782837, + "learning_rate": 4.8182383322849415e-05, + "loss": 5.0523, + "step": 20536 + }, + { + "epoch": 0.12213935674184033, + "grad_norm": 1.4770883321762085, + "learning_rate": 4.81822084695479e-05, + "loss": 5.117, + "step": 20537 + }, + { + "epoch": 0.12214530402512132, + "grad_norm": 1.4463403224945068, + "learning_rate": 4.818203360815371e-05, + "loss": 5.0566, + "step": 20538 + }, + { + "epoch": 0.12215125130840232, + "grad_norm": 1.5590862035751343, + "learning_rate": 4.8181858738666905e-05, + "loss": 5.1184, + "step": 20539 + }, + { + "epoch": 0.12215719859168332, + "grad_norm": 1.2578922510147095, + "learning_rate": 4.818168386108756e-05, + "loss": 5.0364, + "step": 20540 + }, + { + "epoch": 0.12216314587496431, + "grad_norm": 1.363750696182251, + "learning_rate": 4.8181508975415727e-05, + "loss": 5.1133, + "step": 20541 + }, + { + "epoch": 0.12216909315824531, + "grad_norm": 1.5973013639450073, + "learning_rate": 4.8181334081651474e-05, + "loss": 4.9659, + "step": 20542 + }, + { + "epoch": 0.12217504044152631, + "grad_norm": 1.4429646730422974, + "learning_rate": 4.818115917979485e-05, + "loss": 5.1669, + "step": 20543 + }, + { + "epoch": 0.1221809877248073, + "grad_norm": 1.4704759120941162, + "learning_rate": 4.818098426984592e-05, + "loss": 5.1613, + "step": 20544 + }, + { + "epoch": 0.1221869350080883, + "grad_norm": 1.3613824844360352, + "learning_rate": 4.8180809351804756e-05, + "loss": 5.2524, + "step": 20545 + }, + { + "epoch": 0.1221928822913693, + "grad_norm": 1.199265480041504, + "learning_rate": 4.8180634425671404e-05, + "loss": 5.1596, + "step": 20546 + }, + { + "epoch": 0.1221988295746503, + "grad_norm": 1.3537240028381348, + "learning_rate": 4.818045949144594e-05, + "loss": 5.1456, + "step": 20547 + }, + { + "epoch": 0.1222047768579313, + "grad_norm": 1.4804584980010986, + "learning_rate": 4.818028454912841e-05, + "loss": 5.0443, + "step": 20548 + }, + { + "epoch": 0.1222107241412123, + "grad_norm": 1.3245832920074463, + "learning_rate": 4.8180109598718884e-05, + "loss": 4.9495, + "step": 20549 + }, + { + "epoch": 0.12221667142449329, + "grad_norm": 1.5168079137802124, + "learning_rate": 4.817993464021742e-05, + "loss": 4.8094, + "step": 20550 + }, + { + "epoch": 0.12222261870777429, + "grad_norm": 1.4146143198013306, + "learning_rate": 4.817975967362408e-05, + "loss": 5.0319, + "step": 20551 + }, + { + "epoch": 0.12222856599105529, + "grad_norm": 1.30800199508667, + "learning_rate": 4.817958469893893e-05, + "loss": 4.6641, + "step": 20552 + }, + { + "epoch": 0.12223451327433628, + "grad_norm": 1.1652897596359253, + "learning_rate": 4.8179409716162026e-05, + "loss": 4.8978, + "step": 20553 + }, + { + "epoch": 0.12224046055761728, + "grad_norm": 1.4594627618789673, + "learning_rate": 4.817923472529343e-05, + "loss": 5.0124, + "step": 20554 + }, + { + "epoch": 0.12224640784089828, + "grad_norm": 1.2955336570739746, + "learning_rate": 4.81790597263332e-05, + "loss": 5.0336, + "step": 20555 + }, + { + "epoch": 0.12225235512417927, + "grad_norm": 1.3508485555648804, + "learning_rate": 4.8178884719281395e-05, + "loss": 4.8695, + "step": 20556 + }, + { + "epoch": 0.12225830240746027, + "grad_norm": 1.363410472869873, + "learning_rate": 4.8178709704138094e-05, + "loss": 4.9162, + "step": 20557 + }, + { + "epoch": 0.12226424969074128, + "grad_norm": 1.4330451488494873, + "learning_rate": 4.817853468090333e-05, + "loss": 4.8993, + "step": 20558 + }, + { + "epoch": 0.12227019697402226, + "grad_norm": 1.3630226850509644, + "learning_rate": 4.817835964957719e-05, + "loss": 4.9196, + "step": 20559 + }, + { + "epoch": 0.12227614425730327, + "grad_norm": 1.4265079498291016, + "learning_rate": 4.817818461015972e-05, + "loss": 4.8966, + "step": 20560 + }, + { + "epoch": 0.12228209154058427, + "grad_norm": 1.4709514379501343, + "learning_rate": 4.817800956265098e-05, + "loss": 4.7685, + "step": 20561 + }, + { + "epoch": 0.12228803882386526, + "grad_norm": 1.1047412157058716, + "learning_rate": 4.8177834507051044e-05, + "loss": 4.8495, + "step": 20562 + }, + { + "epoch": 0.12229398610714626, + "grad_norm": 1.302027940750122, + "learning_rate": 4.817765944335996e-05, + "loss": 4.9414, + "step": 20563 + }, + { + "epoch": 0.12229993339042726, + "grad_norm": 1.2321425676345825, + "learning_rate": 4.8177484371577796e-05, + "loss": 4.8089, + "step": 20564 + }, + { + "epoch": 0.12230588067370825, + "grad_norm": 1.5107663869857788, + "learning_rate": 4.8177309291704616e-05, + "loss": 4.8964, + "step": 20565 + }, + { + "epoch": 0.12231182795698925, + "grad_norm": 1.4476573467254639, + "learning_rate": 4.817713420374047e-05, + "loss": 5.1385, + "step": 20566 + }, + { + "epoch": 0.12231777524027024, + "grad_norm": 1.7367160320281982, + "learning_rate": 4.817695910768544e-05, + "loss": 4.7051, + "step": 20567 + }, + { + "epoch": 0.12232372252355124, + "grad_norm": 1.7436206340789795, + "learning_rate": 4.817678400353955e-05, + "loss": 5.0161, + "step": 20568 + }, + { + "epoch": 0.12232966980683224, + "grad_norm": 1.667702317237854, + "learning_rate": 4.8176608891302905e-05, + "loss": 4.7507, + "step": 20569 + }, + { + "epoch": 0.12233561709011323, + "grad_norm": 1.3754125833511353, + "learning_rate": 4.817643377097554e-05, + "loss": 4.9623, + "step": 20570 + }, + { + "epoch": 0.12234156437339423, + "grad_norm": 1.539730191230774, + "learning_rate": 4.817625864255751e-05, + "loss": 4.9798, + "step": 20571 + }, + { + "epoch": 0.12234751165667523, + "grad_norm": 1.2995619773864746, + "learning_rate": 4.81760835060489e-05, + "loss": 4.9225, + "step": 20572 + }, + { + "epoch": 0.12235345893995622, + "grad_norm": 1.4950238466262817, + "learning_rate": 4.817590836144975e-05, + "loss": 5.0578, + "step": 20573 + }, + { + "epoch": 0.12235940622323722, + "grad_norm": 1.5506999492645264, + "learning_rate": 4.8175733208760144e-05, + "loss": 4.7418, + "step": 20574 + }, + { + "epoch": 0.12236535350651823, + "grad_norm": 2.153271198272705, + "learning_rate": 4.817555804798012e-05, + "loss": 4.8025, + "step": 20575 + }, + { + "epoch": 0.12237130078979921, + "grad_norm": 1.4991137981414795, + "learning_rate": 4.817538287910974e-05, + "loss": 4.9943, + "step": 20576 + }, + { + "epoch": 0.12237724807308022, + "grad_norm": 1.3596469163894653, + "learning_rate": 4.8175207702149085e-05, + "loss": 5.4109, + "step": 20577 + }, + { + "epoch": 0.12238319535636122, + "grad_norm": 1.182950735092163, + "learning_rate": 4.81750325170982e-05, + "loss": 5.4844, + "step": 20578 + }, + { + "epoch": 0.1223891426396422, + "grad_norm": 1.2713780403137207, + "learning_rate": 4.817485732395715e-05, + "loss": 5.3333, + "step": 20579 + }, + { + "epoch": 0.12239508992292321, + "grad_norm": 1.396163821220398, + "learning_rate": 4.8174682122726e-05, + "loss": 5.1666, + "step": 20580 + }, + { + "epoch": 0.12240103720620421, + "grad_norm": 1.3530118465423584, + "learning_rate": 4.81745069134048e-05, + "loss": 5.055, + "step": 20581 + }, + { + "epoch": 0.1224069844894852, + "grad_norm": 1.1625109910964966, + "learning_rate": 4.8174331695993626e-05, + "loss": 5.2553, + "step": 20582 + }, + { + "epoch": 0.1224129317727662, + "grad_norm": 1.4428709745407104, + "learning_rate": 4.817415647049253e-05, + "loss": 5.1255, + "step": 20583 + }, + { + "epoch": 0.1224188790560472, + "grad_norm": 1.674591064453125, + "learning_rate": 4.8173981236901574e-05, + "loss": 4.7623, + "step": 20584 + }, + { + "epoch": 0.12242482633932819, + "grad_norm": 1.4691076278686523, + "learning_rate": 4.817380599522083e-05, + "loss": 5.1077, + "step": 20585 + }, + { + "epoch": 0.12243077362260919, + "grad_norm": 1.0224462747573853, + "learning_rate": 4.817363074545034e-05, + "loss": 5.1022, + "step": 20586 + }, + { + "epoch": 0.1224367209058902, + "grad_norm": 1.3090193271636963, + "learning_rate": 4.817345548759018e-05, + "loss": 5.121, + "step": 20587 + }, + { + "epoch": 0.12244266818917118, + "grad_norm": 1.028120756149292, + "learning_rate": 4.81732802216404e-05, + "loss": 5.2709, + "step": 20588 + }, + { + "epoch": 0.12244861547245219, + "grad_norm": 1.3667192459106445, + "learning_rate": 4.817310494760107e-05, + "loss": 5.075, + "step": 20589 + }, + { + "epoch": 0.12245456275573319, + "grad_norm": 1.3145662546157837, + "learning_rate": 4.8172929665472255e-05, + "loss": 5.1258, + "step": 20590 + }, + { + "epoch": 0.12246051003901418, + "grad_norm": 1.2744371891021729, + "learning_rate": 4.8172754375254e-05, + "loss": 5.0155, + "step": 20591 + }, + { + "epoch": 0.12246645732229518, + "grad_norm": 1.4647456407546997, + "learning_rate": 4.817257907694638e-05, + "loss": 5.0325, + "step": 20592 + }, + { + "epoch": 0.12247240460557618, + "grad_norm": 1.1393789052963257, + "learning_rate": 4.817240377054945e-05, + "loss": 5.1304, + "step": 20593 + }, + { + "epoch": 0.12247835188885717, + "grad_norm": 1.3927806615829468, + "learning_rate": 4.817222845606328e-05, + "loss": 5.0588, + "step": 20594 + }, + { + "epoch": 0.12248429917213817, + "grad_norm": 1.3344571590423584, + "learning_rate": 4.817205313348792e-05, + "loss": 5.0428, + "step": 20595 + }, + { + "epoch": 0.12249024645541916, + "grad_norm": 0.9816542267799377, + "learning_rate": 4.817187780282343e-05, + "loss": 5.0046, + "step": 20596 + }, + { + "epoch": 0.12249619373870016, + "grad_norm": 1.1602904796600342, + "learning_rate": 4.817170246406989e-05, + "loss": 5.0372, + "step": 20597 + }, + { + "epoch": 0.12250214102198116, + "grad_norm": 1.2147279977798462, + "learning_rate": 4.817152711722733e-05, + "loss": 4.999, + "step": 20598 + }, + { + "epoch": 0.12250808830526215, + "grad_norm": 1.3654884099960327, + "learning_rate": 4.817135176229585e-05, + "loss": 5.0635, + "step": 20599 + }, + { + "epoch": 0.12251403558854315, + "grad_norm": 1.3051310777664185, + "learning_rate": 4.817117639927547e-05, + "loss": 5.0137, + "step": 20600 + }, + { + "epoch": 0.12251998287182415, + "grad_norm": 1.2217040061950684, + "learning_rate": 4.8171001028166284e-05, + "loss": 4.7167, + "step": 20601 + }, + { + "epoch": 0.12252593015510514, + "grad_norm": 1.3541781902313232, + "learning_rate": 4.8170825648968345e-05, + "loss": 4.9244, + "step": 20602 + }, + { + "epoch": 0.12253187743838614, + "grad_norm": 1.2899030447006226, + "learning_rate": 4.81706502616817e-05, + "loss": 5.0452, + "step": 20603 + }, + { + "epoch": 0.12253782472166715, + "grad_norm": 1.4059736728668213, + "learning_rate": 4.817047486630643e-05, + "loss": 4.9318, + "step": 20604 + }, + { + "epoch": 0.12254377200494813, + "grad_norm": 1.6990517377853394, + "learning_rate": 4.817029946284257e-05, + "loss": 4.5067, + "step": 20605 + }, + { + "epoch": 0.12254971928822914, + "grad_norm": 1.4028486013412476, + "learning_rate": 4.817012405129021e-05, + "loss": 5.0994, + "step": 20606 + }, + { + "epoch": 0.12255566657151014, + "grad_norm": 1.5692994594573975, + "learning_rate": 4.8169948631649395e-05, + "loss": 4.742, + "step": 20607 + }, + { + "epoch": 0.12256161385479113, + "grad_norm": 1.4501662254333496, + "learning_rate": 4.81697732039202e-05, + "loss": 4.9951, + "step": 20608 + }, + { + "epoch": 0.12256756113807213, + "grad_norm": 1.2898585796356201, + "learning_rate": 4.816959776810267e-05, + "loss": 5.2756, + "step": 20609 + }, + { + "epoch": 0.12257350842135313, + "grad_norm": 1.2808797359466553, + "learning_rate": 4.8169422324196867e-05, + "loss": 5.043, + "step": 20610 + }, + { + "epoch": 0.12257945570463412, + "grad_norm": 1.6888319253921509, + "learning_rate": 4.816924687220287e-05, + "loss": 4.6803, + "step": 20611 + }, + { + "epoch": 0.12258540298791512, + "grad_norm": 1.6619288921356201, + "learning_rate": 4.8169071412120716e-05, + "loss": 4.7334, + "step": 20612 + }, + { + "epoch": 0.12259135027119612, + "grad_norm": 1.4474331140518188, + "learning_rate": 4.816889594395049e-05, + "loss": 4.8519, + "step": 20613 + }, + { + "epoch": 0.12259729755447711, + "grad_norm": 1.519037127494812, + "learning_rate": 4.816872046769223e-05, + "loss": 4.7864, + "step": 20614 + }, + { + "epoch": 0.12260324483775811, + "grad_norm": 1.4860186576843262, + "learning_rate": 4.816854498334602e-05, + "loss": 4.7542, + "step": 20615 + }, + { + "epoch": 0.12260919212103912, + "grad_norm": 1.3120838403701782, + "learning_rate": 4.81683694909119e-05, + "loss": 4.6539, + "step": 20616 + }, + { + "epoch": 0.1226151394043201, + "grad_norm": 1.4509785175323486, + "learning_rate": 4.816819399038995e-05, + "loss": 5.105, + "step": 20617 + }, + { + "epoch": 0.1226210866876011, + "grad_norm": 1.428066372871399, + "learning_rate": 4.816801848178022e-05, + "loss": 5.1138, + "step": 20618 + }, + { + "epoch": 0.12262703397088211, + "grad_norm": 1.3920371532440186, + "learning_rate": 4.816784296508277e-05, + "loss": 5.0398, + "step": 20619 + }, + { + "epoch": 0.1226329812541631, + "grad_norm": 1.258225679397583, + "learning_rate": 4.816766744029767e-05, + "loss": 4.7204, + "step": 20620 + }, + { + "epoch": 0.1226389285374441, + "grad_norm": 1.4209269285202026, + "learning_rate": 4.816749190742498e-05, + "loss": 4.6532, + "step": 20621 + }, + { + "epoch": 0.1226448758207251, + "grad_norm": 1.6276925802230835, + "learning_rate": 4.816731636646475e-05, + "loss": 4.7025, + "step": 20622 + }, + { + "epoch": 0.12265082310400609, + "grad_norm": 1.3714722394943237, + "learning_rate": 4.8167140817417055e-05, + "loss": 5.1781, + "step": 20623 + }, + { + "epoch": 0.12265677038728709, + "grad_norm": 1.397017240524292, + "learning_rate": 4.816696526028195e-05, + "loss": 5.2097, + "step": 20624 + }, + { + "epoch": 0.12266271767056808, + "grad_norm": 1.2807291746139526, + "learning_rate": 4.8166789695059486e-05, + "loss": 5.1588, + "step": 20625 + }, + { + "epoch": 0.12266866495384908, + "grad_norm": 1.301222562789917, + "learning_rate": 4.816661412174976e-05, + "loss": 5.0906, + "step": 20626 + }, + { + "epoch": 0.12267461223713008, + "grad_norm": 1.6813510656356812, + "learning_rate": 4.816643854035279e-05, + "loss": 4.4956, + "step": 20627 + }, + { + "epoch": 0.12268055952041107, + "grad_norm": 1.7415688037872314, + "learning_rate": 4.816626295086865e-05, + "loss": 4.4246, + "step": 20628 + }, + { + "epoch": 0.12268650680369207, + "grad_norm": 1.9389246702194214, + "learning_rate": 4.816608735329742e-05, + "loss": 4.4231, + "step": 20629 + }, + { + "epoch": 0.12269245408697307, + "grad_norm": 1.7021642923355103, + "learning_rate": 4.816591174763914e-05, + "loss": 4.5314, + "step": 20630 + }, + { + "epoch": 0.12269840137025406, + "grad_norm": 1.889491081237793, + "learning_rate": 4.8165736133893876e-05, + "loss": 4.384, + "step": 20631 + }, + { + "epoch": 0.12270434865353506, + "grad_norm": 1.8447821140289307, + "learning_rate": 4.816556051206171e-05, + "loss": 4.5086, + "step": 20632 + }, + { + "epoch": 0.12271029593681607, + "grad_norm": 1.7669256925582886, + "learning_rate": 4.8165384882142674e-05, + "loss": 4.4537, + "step": 20633 + }, + { + "epoch": 0.12271624322009705, + "grad_norm": 1.8175028562545776, + "learning_rate": 4.8165209244136846e-05, + "loss": 4.4478, + "step": 20634 + }, + { + "epoch": 0.12272219050337806, + "grad_norm": 1.7047181129455566, + "learning_rate": 4.816503359804427e-05, + "loss": 4.7366, + "step": 20635 + }, + { + "epoch": 0.12272813778665906, + "grad_norm": 1.4321893453598022, + "learning_rate": 4.816485794386504e-05, + "loss": 4.9958, + "step": 20636 + }, + { + "epoch": 0.12273408506994005, + "grad_norm": 1.3354036808013916, + "learning_rate": 4.816468228159918e-05, + "loss": 4.906, + "step": 20637 + }, + { + "epoch": 0.12274003235322105, + "grad_norm": 1.281680703163147, + "learning_rate": 4.8164506611246784e-05, + "loss": 4.884, + "step": 20638 + }, + { + "epoch": 0.12274597963650205, + "grad_norm": 1.32127046585083, + "learning_rate": 4.8164330932807885e-05, + "loss": 4.8039, + "step": 20639 + }, + { + "epoch": 0.12275192691978304, + "grad_norm": 1.2233742475509644, + "learning_rate": 4.816415524628257e-05, + "loss": 4.8872, + "step": 20640 + }, + { + "epoch": 0.12275787420306404, + "grad_norm": 1.4896177053451538, + "learning_rate": 4.816397955167088e-05, + "loss": 5.0379, + "step": 20641 + }, + { + "epoch": 0.12276382148634504, + "grad_norm": 1.389992594718933, + "learning_rate": 4.8163803848972886e-05, + "loss": 5.1364, + "step": 20642 + }, + { + "epoch": 0.12276976876962603, + "grad_norm": 1.4248872995376587, + "learning_rate": 4.8163628138188645e-05, + "loss": 5.3152, + "step": 20643 + }, + { + "epoch": 0.12277571605290703, + "grad_norm": 1.3105376958847046, + "learning_rate": 4.816345241931822e-05, + "loss": 4.9878, + "step": 20644 + }, + { + "epoch": 0.12278166333618803, + "grad_norm": 1.3307970762252808, + "learning_rate": 4.816327669236167e-05, + "loss": 4.9105, + "step": 20645 + }, + { + "epoch": 0.12278761061946902, + "grad_norm": 1.9464685916900635, + "learning_rate": 4.816310095731907e-05, + "loss": 5.2259, + "step": 20646 + }, + { + "epoch": 0.12279355790275003, + "grad_norm": 1.4600616693496704, + "learning_rate": 4.816292521419046e-05, + "loss": 4.7044, + "step": 20647 + }, + { + "epoch": 0.12279950518603103, + "grad_norm": 1.202574610710144, + "learning_rate": 4.816274946297592e-05, + "loss": 5.1854, + "step": 20648 + }, + { + "epoch": 0.12280545246931202, + "grad_norm": 1.5569230318069458, + "learning_rate": 4.81625737036755e-05, + "loss": 4.8316, + "step": 20649 + }, + { + "epoch": 0.12281139975259302, + "grad_norm": 1.3303078413009644, + "learning_rate": 4.8162397936289264e-05, + "loss": 4.891, + "step": 20650 + }, + { + "epoch": 0.12281734703587402, + "grad_norm": 1.2397204637527466, + "learning_rate": 4.816222216081728e-05, + "loss": 4.8077, + "step": 20651 + }, + { + "epoch": 0.12282329431915501, + "grad_norm": 1.29647696018219, + "learning_rate": 4.8162046377259594e-05, + "loss": 4.7518, + "step": 20652 + }, + { + "epoch": 0.12282924160243601, + "grad_norm": 1.4492244720458984, + "learning_rate": 4.816187058561629e-05, + "loss": 4.6352, + "step": 20653 + }, + { + "epoch": 0.122835188885717, + "grad_norm": 1.2785146236419678, + "learning_rate": 4.81616947858874e-05, + "loss": 4.9128, + "step": 20654 + }, + { + "epoch": 0.122841136168998, + "grad_norm": 1.2652465105056763, + "learning_rate": 4.8161518978073016e-05, + "loss": 5.1555, + "step": 20655 + }, + { + "epoch": 0.122847083452279, + "grad_norm": 1.5048694610595703, + "learning_rate": 4.816134316217318e-05, + "loss": 5.0648, + "step": 20656 + }, + { + "epoch": 0.12285303073555999, + "grad_norm": 1.3626654148101807, + "learning_rate": 4.816116733818795e-05, + "loss": 5.0668, + "step": 20657 + }, + { + "epoch": 0.12285897801884099, + "grad_norm": 1.614112377166748, + "learning_rate": 4.816099150611741e-05, + "loss": 4.9234, + "step": 20658 + }, + { + "epoch": 0.122864925302122, + "grad_norm": 1.9453253746032715, + "learning_rate": 4.81608156659616e-05, + "loss": 4.7709, + "step": 20659 + }, + { + "epoch": 0.12287087258540298, + "grad_norm": 1.7604261636734009, + "learning_rate": 4.816063981772059e-05, + "loss": 4.8153, + "step": 20660 + }, + { + "epoch": 0.12287681986868398, + "grad_norm": 1.473319172859192, + "learning_rate": 4.8160463961394436e-05, + "loss": 4.9552, + "step": 20661 + }, + { + "epoch": 0.12288276715196499, + "grad_norm": 1.332900881767273, + "learning_rate": 4.8160288096983207e-05, + "loss": 5.1753, + "step": 20662 + }, + { + "epoch": 0.12288871443524597, + "grad_norm": 1.438464641571045, + "learning_rate": 4.816011222448696e-05, + "loss": 5.0386, + "step": 20663 + }, + { + "epoch": 0.12289466171852698, + "grad_norm": 1.4369616508483887, + "learning_rate": 4.8159936343905756e-05, + "loss": 5.1144, + "step": 20664 + }, + { + "epoch": 0.12290060900180798, + "grad_norm": 1.307914137840271, + "learning_rate": 4.8159760455239656e-05, + "loss": 5.0308, + "step": 20665 + }, + { + "epoch": 0.12290655628508897, + "grad_norm": 1.4199682474136353, + "learning_rate": 4.815958455848872e-05, + "loss": 4.9803, + "step": 20666 + }, + { + "epoch": 0.12291250356836997, + "grad_norm": 1.2451025247573853, + "learning_rate": 4.815940865365303e-05, + "loss": 5.0328, + "step": 20667 + }, + { + "epoch": 0.12291845085165097, + "grad_norm": 1.2542675733566284, + "learning_rate": 4.8159232740732615e-05, + "loss": 5.0961, + "step": 20668 + }, + { + "epoch": 0.12292439813493196, + "grad_norm": 1.4102520942687988, + "learning_rate": 4.815905681972756e-05, + "loss": 5.1512, + "step": 20669 + }, + { + "epoch": 0.12293034541821296, + "grad_norm": 1.7003612518310547, + "learning_rate": 4.81588808906379e-05, + "loss": 5.6308, + "step": 20670 + }, + { + "epoch": 0.12293629270149396, + "grad_norm": 1.7957112789154053, + "learning_rate": 4.815870495346373e-05, + "loss": 5.2033, + "step": 20671 + }, + { + "epoch": 0.12294223998477495, + "grad_norm": 1.8667526245117188, + "learning_rate": 4.815852900820509e-05, + "loss": 5.3148, + "step": 20672 + }, + { + "epoch": 0.12294818726805595, + "grad_norm": 1.5151188373565674, + "learning_rate": 4.815835305486205e-05, + "loss": 5.1791, + "step": 20673 + }, + { + "epoch": 0.12295413455133695, + "grad_norm": 1.842624545097351, + "learning_rate": 4.8158177093434666e-05, + "loss": 4.7996, + "step": 20674 + }, + { + "epoch": 0.12296008183461794, + "grad_norm": 1.6197025775909424, + "learning_rate": 4.815800112392299e-05, + "loss": 4.9929, + "step": 20675 + }, + { + "epoch": 0.12296602911789895, + "grad_norm": 1.4609524011611938, + "learning_rate": 4.8157825146327113e-05, + "loss": 4.961, + "step": 20676 + }, + { + "epoch": 0.12297197640117995, + "grad_norm": 1.479789137840271, + "learning_rate": 4.8157649160647065e-05, + "loss": 5.3686, + "step": 20677 + }, + { + "epoch": 0.12297792368446094, + "grad_norm": 2.120084524154663, + "learning_rate": 4.815747316688293e-05, + "loss": 4.8741, + "step": 20678 + }, + { + "epoch": 0.12298387096774194, + "grad_norm": 1.2068350315093994, + "learning_rate": 4.815729716503476e-05, + "loss": 5.5907, + "step": 20679 + }, + { + "epoch": 0.12298981825102294, + "grad_norm": 1.9006667137145996, + "learning_rate": 4.815712115510261e-05, + "loss": 5.0154, + "step": 20680 + }, + { + "epoch": 0.12299576553430393, + "grad_norm": 1.7368868589401245, + "learning_rate": 4.815694513708656e-05, + "loss": 5.1994, + "step": 20681 + }, + { + "epoch": 0.12300171281758493, + "grad_norm": 1.8622910976409912, + "learning_rate": 4.815676911098665e-05, + "loss": 4.7889, + "step": 20682 + }, + { + "epoch": 0.12300766010086592, + "grad_norm": 1.7475686073303223, + "learning_rate": 4.815659307680295e-05, + "loss": 5.1067, + "step": 20683 + }, + { + "epoch": 0.12301360738414692, + "grad_norm": 1.7088334560394287, + "learning_rate": 4.815641703453553e-05, + "loss": 4.8665, + "step": 20684 + }, + { + "epoch": 0.12301955466742792, + "grad_norm": 1.4785330295562744, + "learning_rate": 4.815624098418444e-05, + "loss": 5.417, + "step": 20685 + }, + { + "epoch": 0.12302550195070891, + "grad_norm": 1.5346219539642334, + "learning_rate": 4.8156064925749745e-05, + "loss": 5.4747, + "step": 20686 + }, + { + "epoch": 0.12303144923398991, + "grad_norm": 1.7572461366653442, + "learning_rate": 4.815588885923151e-05, + "loss": 5.021, + "step": 20687 + }, + { + "epoch": 0.12303739651727091, + "grad_norm": 1.57370126247406, + "learning_rate": 4.815571278462979e-05, + "loss": 5.5248, + "step": 20688 + }, + { + "epoch": 0.1230433438005519, + "grad_norm": 1.7549457550048828, + "learning_rate": 4.815553670194465e-05, + "loss": 5.346, + "step": 20689 + }, + { + "epoch": 0.1230492910838329, + "grad_norm": 1.7188549041748047, + "learning_rate": 4.8155360611176156e-05, + "loss": 5.4671, + "step": 20690 + }, + { + "epoch": 0.1230552383671139, + "grad_norm": 2.358586311340332, + "learning_rate": 4.815518451232436e-05, + "loss": 4.4753, + "step": 20691 + }, + { + "epoch": 0.1230611856503949, + "grad_norm": 2.2453999519348145, + "learning_rate": 4.815500840538933e-05, + "loss": 4.5065, + "step": 20692 + }, + { + "epoch": 0.1230671329336759, + "grad_norm": 1.505689263343811, + "learning_rate": 4.8154832290371123e-05, + "loss": 5.2223, + "step": 20693 + }, + { + "epoch": 0.1230730802169569, + "grad_norm": 1.5649336576461792, + "learning_rate": 4.8154656167269804e-05, + "loss": 5.3686, + "step": 20694 + }, + { + "epoch": 0.12307902750023789, + "grad_norm": 1.8131600618362427, + "learning_rate": 4.815448003608544e-05, + "loss": 5.5532, + "step": 20695 + }, + { + "epoch": 0.12308497478351889, + "grad_norm": 1.7565428018569946, + "learning_rate": 4.815430389681808e-05, + "loss": 5.4619, + "step": 20696 + }, + { + "epoch": 0.12309092206679989, + "grad_norm": 1.708799958229065, + "learning_rate": 4.815412774946779e-05, + "loss": 5.5746, + "step": 20697 + }, + { + "epoch": 0.12309686935008088, + "grad_norm": 1.6220203638076782, + "learning_rate": 4.815395159403464e-05, + "loss": 5.1071, + "step": 20698 + }, + { + "epoch": 0.12310281663336188, + "grad_norm": 1.5516228675842285, + "learning_rate": 4.8153775430518676e-05, + "loss": 5.3921, + "step": 20699 + }, + { + "epoch": 0.12310876391664288, + "grad_norm": 1.7192966938018799, + "learning_rate": 4.815359925891998e-05, + "loss": 5.2339, + "step": 20700 + }, + { + "epoch": 0.12311471119992387, + "grad_norm": 1.3066575527191162, + "learning_rate": 4.815342307923859e-05, + "loss": 4.998, + "step": 20701 + }, + { + "epoch": 0.12312065848320487, + "grad_norm": 1.49882173538208, + "learning_rate": 4.815324689147459e-05, + "loss": 5.0493, + "step": 20702 + }, + { + "epoch": 0.12312660576648587, + "grad_norm": 1.5100362300872803, + "learning_rate": 4.815307069562802e-05, + "loss": 5.7113, + "step": 20703 + }, + { + "epoch": 0.12313255304976686, + "grad_norm": 1.7987116575241089, + "learning_rate": 4.815289449169896e-05, + "loss": 4.3582, + "step": 20704 + }, + { + "epoch": 0.12313850033304787, + "grad_norm": 1.7036083936691284, + "learning_rate": 4.815271827968746e-05, + "loss": 5.0769, + "step": 20705 + }, + { + "epoch": 0.12314444761632887, + "grad_norm": 1.8392287492752075, + "learning_rate": 4.8152542059593584e-05, + "loss": 4.6458, + "step": 20706 + }, + { + "epoch": 0.12315039489960986, + "grad_norm": 1.7489079236984253, + "learning_rate": 4.81523658314174e-05, + "loss": 4.9117, + "step": 20707 + }, + { + "epoch": 0.12315634218289086, + "grad_norm": 2.2490482330322266, + "learning_rate": 4.8152189595158965e-05, + "loss": 5.2912, + "step": 20708 + }, + { + "epoch": 0.12316228946617186, + "grad_norm": 1.6101025342941284, + "learning_rate": 4.815201335081834e-05, + "loss": 4.9382, + "step": 20709 + }, + { + "epoch": 0.12316823674945285, + "grad_norm": 1.7892024517059326, + "learning_rate": 4.815183709839558e-05, + "loss": 5.0046, + "step": 20710 + }, + { + "epoch": 0.12317418403273385, + "grad_norm": 1.5614895820617676, + "learning_rate": 4.815166083789076e-05, + "loss": 5.5325, + "step": 20711 + }, + { + "epoch": 0.12318013131601484, + "grad_norm": 1.4775935411453247, + "learning_rate": 4.815148456930392e-05, + "loss": 5.0981, + "step": 20712 + }, + { + "epoch": 0.12318607859929584, + "grad_norm": 1.3652704954147339, + "learning_rate": 4.815130829263515e-05, + "loss": 4.9632, + "step": 20713 + }, + { + "epoch": 0.12319202588257684, + "grad_norm": 1.7767298221588135, + "learning_rate": 4.815113200788449e-05, + "loss": 4.5071, + "step": 20714 + }, + { + "epoch": 0.12319797316585783, + "grad_norm": 1.8673535585403442, + "learning_rate": 4.815095571505202e-05, + "loss": 4.3313, + "step": 20715 + }, + { + "epoch": 0.12320392044913883, + "grad_norm": 1.6682900190353394, + "learning_rate": 4.8150779414137775e-05, + "loss": 5.2341, + "step": 20716 + }, + { + "epoch": 0.12320986773241983, + "grad_norm": 1.6456630229949951, + "learning_rate": 4.815060310514184e-05, + "loss": 5.3823, + "step": 20717 + }, + { + "epoch": 0.12321581501570082, + "grad_norm": 1.9971877336502075, + "learning_rate": 4.8150426788064265e-05, + "loss": 5.1093, + "step": 20718 + }, + { + "epoch": 0.12322176229898182, + "grad_norm": 1.6881333589553833, + "learning_rate": 4.815025046290512e-05, + "loss": 5.1788, + "step": 20719 + }, + { + "epoch": 0.12322770958226283, + "grad_norm": 1.6873126029968262, + "learning_rate": 4.815007412966446e-05, + "loss": 5.4508, + "step": 20720 + }, + { + "epoch": 0.12323365686554381, + "grad_norm": 1.5401923656463623, + "learning_rate": 4.814989778834235e-05, + "loss": 5.3638, + "step": 20721 + }, + { + "epoch": 0.12323960414882482, + "grad_norm": 1.3972458839416504, + "learning_rate": 4.814972143893885e-05, + "loss": 5.3096, + "step": 20722 + }, + { + "epoch": 0.12324555143210582, + "grad_norm": 1.7662227153778076, + "learning_rate": 4.8149545081454015e-05, + "loss": 5.7959, + "step": 20723 + }, + { + "epoch": 0.1232514987153868, + "grad_norm": 1.5072314739227295, + "learning_rate": 4.814936871588792e-05, + "loss": 5.6857, + "step": 20724 + }, + { + "epoch": 0.12325744599866781, + "grad_norm": 1.6628614664077759, + "learning_rate": 4.814919234224062e-05, + "loss": 5.4054, + "step": 20725 + }, + { + "epoch": 0.12326339328194881, + "grad_norm": 1.7059345245361328, + "learning_rate": 4.814901596051217e-05, + "loss": 5.3205, + "step": 20726 + }, + { + "epoch": 0.1232693405652298, + "grad_norm": 1.5989772081375122, + "learning_rate": 4.814883957070264e-05, + "loss": 5.0841, + "step": 20727 + }, + { + "epoch": 0.1232752878485108, + "grad_norm": 1.3816654682159424, + "learning_rate": 4.814866317281209e-05, + "loss": 4.9146, + "step": 20728 + }, + { + "epoch": 0.1232812351317918, + "grad_norm": 1.3992705345153809, + "learning_rate": 4.814848676684058e-05, + "loss": 4.8416, + "step": 20729 + }, + { + "epoch": 0.12328718241507279, + "grad_norm": 1.7377054691314697, + "learning_rate": 4.814831035278818e-05, + "loss": 5.3636, + "step": 20730 + }, + { + "epoch": 0.12329312969835379, + "grad_norm": 2.1461470127105713, + "learning_rate": 4.814813393065494e-05, + "loss": 5.7162, + "step": 20731 + }, + { + "epoch": 0.1232990769816348, + "grad_norm": 1.7310097217559814, + "learning_rate": 4.814795750044092e-05, + "loss": 5.7005, + "step": 20732 + }, + { + "epoch": 0.12330502426491578, + "grad_norm": 1.678813099861145, + "learning_rate": 4.814778106214619e-05, + "loss": 5.8184, + "step": 20733 + }, + { + "epoch": 0.12331097154819678, + "grad_norm": 1.7520476579666138, + "learning_rate": 4.814760461577081e-05, + "loss": 5.5746, + "step": 20734 + }, + { + "epoch": 0.12331691883147779, + "grad_norm": 1.6140379905700684, + "learning_rate": 4.8147428161314846e-05, + "loss": 5.4311, + "step": 20735 + }, + { + "epoch": 0.12332286611475878, + "grad_norm": 1.5862205028533936, + "learning_rate": 4.814725169877834e-05, + "loss": 5.5008, + "step": 20736 + }, + { + "epoch": 0.12332881339803978, + "grad_norm": 1.5568691492080688, + "learning_rate": 4.814707522816138e-05, + "loss": 5.5164, + "step": 20737 + }, + { + "epoch": 0.12333476068132078, + "grad_norm": 1.245606780052185, + "learning_rate": 4.814689874946401e-05, + "loss": 5.4217, + "step": 20738 + }, + { + "epoch": 0.12334070796460177, + "grad_norm": 1.3054754734039307, + "learning_rate": 4.8146722262686294e-05, + "loss": 5.4749, + "step": 20739 + }, + { + "epoch": 0.12334665524788277, + "grad_norm": 1.5772032737731934, + "learning_rate": 4.81465457678283e-05, + "loss": 5.7249, + "step": 20740 + }, + { + "epoch": 0.12335260253116376, + "grad_norm": 1.469688057899475, + "learning_rate": 4.814636926489009e-05, + "loss": 5.8515, + "step": 20741 + }, + { + "epoch": 0.12335854981444476, + "grad_norm": 2.3438186645507812, + "learning_rate": 4.814619275387172e-05, + "loss": 4.7599, + "step": 20742 + }, + { + "epoch": 0.12336449709772576, + "grad_norm": 2.4038238525390625, + "learning_rate": 4.814601623477325e-05, + "loss": 4.5717, + "step": 20743 + }, + { + "epoch": 0.12337044438100675, + "grad_norm": 2.773898124694824, + "learning_rate": 4.8145839707594745e-05, + "loss": 4.4889, + "step": 20744 + }, + { + "epoch": 0.12337639166428775, + "grad_norm": 2.863701820373535, + "learning_rate": 4.814566317233626e-05, + "loss": 4.5076, + "step": 20745 + }, + { + "epoch": 0.12338233894756875, + "grad_norm": 2.066301107406616, + "learning_rate": 4.8145486628997875e-05, + "loss": 4.8112, + "step": 20746 + }, + { + "epoch": 0.12338828623084974, + "grad_norm": 2.307910680770874, + "learning_rate": 4.814531007757963e-05, + "loss": 4.3896, + "step": 20747 + }, + { + "epoch": 0.12339423351413074, + "grad_norm": 2.2435505390167236, + "learning_rate": 4.81451335180816e-05, + "loss": 4.6403, + "step": 20748 + }, + { + "epoch": 0.12340018079741175, + "grad_norm": 2.4653170108795166, + "learning_rate": 4.814495695050385e-05, + "loss": 4.4737, + "step": 20749 + }, + { + "epoch": 0.12340612808069273, + "grad_norm": 2.3770196437835693, + "learning_rate": 4.814478037484643e-05, + "loss": 4.4951, + "step": 20750 + }, + { + "epoch": 0.12341207536397374, + "grad_norm": 1.8455066680908203, + "learning_rate": 4.81446037911094e-05, + "loss": 5.2646, + "step": 20751 + }, + { + "epoch": 0.12341802264725474, + "grad_norm": 1.6683069467544556, + "learning_rate": 4.814442719929283e-05, + "loss": 5.4287, + "step": 20752 + }, + { + "epoch": 0.12342396993053573, + "grad_norm": 1.4904793500900269, + "learning_rate": 4.814425059939679e-05, + "loss": 4.9993, + "step": 20753 + }, + { + "epoch": 0.12342991721381673, + "grad_norm": 1.5601847171783447, + "learning_rate": 4.8144073991421326e-05, + "loss": 5.1637, + "step": 20754 + }, + { + "epoch": 0.12343586449709773, + "grad_norm": 1.8937057256698608, + "learning_rate": 4.8143897375366496e-05, + "loss": 4.6928, + "step": 20755 + }, + { + "epoch": 0.12344181178037872, + "grad_norm": 1.8150557279586792, + "learning_rate": 4.814372075123238e-05, + "loss": 5.8257, + "step": 20756 + }, + { + "epoch": 0.12344775906365972, + "grad_norm": 1.537091612815857, + "learning_rate": 4.814354411901902e-05, + "loss": 5.0506, + "step": 20757 + }, + { + "epoch": 0.12345370634694072, + "grad_norm": 1.9722800254821777, + "learning_rate": 4.8143367478726495e-05, + "loss": 4.2019, + "step": 20758 + }, + { + "epoch": 0.12345965363022171, + "grad_norm": 1.9497390985488892, + "learning_rate": 4.8143190830354865e-05, + "loss": 4.2974, + "step": 20759 + }, + { + "epoch": 0.12346560091350271, + "grad_norm": 1.877036690711975, + "learning_rate": 4.814301417390418e-05, + "loss": 4.1039, + "step": 20760 + }, + { + "epoch": 0.12347154819678371, + "grad_norm": 1.932218313217163, + "learning_rate": 4.814283750937451e-05, + "loss": 4.3427, + "step": 20761 + }, + { + "epoch": 0.1234774954800647, + "grad_norm": 2.175657272338867, + "learning_rate": 4.814266083676591e-05, + "loss": 4.6891, + "step": 20762 + }, + { + "epoch": 0.1234834427633457, + "grad_norm": 1.7364848852157593, + "learning_rate": 4.8142484156078456e-05, + "loss": 4.4825, + "step": 20763 + }, + { + "epoch": 0.1234893900466267, + "grad_norm": 1.7598278522491455, + "learning_rate": 4.8142307467312184e-05, + "loss": 4.0782, + "step": 20764 + }, + { + "epoch": 0.1234953373299077, + "grad_norm": 1.9056943655014038, + "learning_rate": 4.814213077046719e-05, + "loss": 4.245, + "step": 20765 + }, + { + "epoch": 0.1235012846131887, + "grad_norm": 1.8974699974060059, + "learning_rate": 4.8141954065543506e-05, + "loss": 4.0707, + "step": 20766 + }, + { + "epoch": 0.1235072318964697, + "grad_norm": 1.9884151220321655, + "learning_rate": 4.814177735254121e-05, + "loss": 4.1443, + "step": 20767 + }, + { + "epoch": 0.12351317917975069, + "grad_norm": 1.952216625213623, + "learning_rate": 4.814160063146035e-05, + "loss": 4.6248, + "step": 20768 + }, + { + "epoch": 0.12351912646303169, + "grad_norm": 2.537240743637085, + "learning_rate": 4.814142390230101e-05, + "loss": 4.8936, + "step": 20769 + }, + { + "epoch": 0.12352507374631268, + "grad_norm": 1.6106029748916626, + "learning_rate": 4.814124716506322e-05, + "loss": 5.9498, + "step": 20770 + }, + { + "epoch": 0.12353102102959368, + "grad_norm": 2.3211259841918945, + "learning_rate": 4.814107041974707e-05, + "loss": 4.634, + "step": 20771 + }, + { + "epoch": 0.12353696831287468, + "grad_norm": 2.1425933837890625, + "learning_rate": 4.814089366635261e-05, + "loss": 4.9106, + "step": 20772 + }, + { + "epoch": 0.12354291559615567, + "grad_norm": 1.9194071292877197, + "learning_rate": 4.814071690487991e-05, + "loss": 4.9044, + "step": 20773 + }, + { + "epoch": 0.12354886287943667, + "grad_norm": 2.2048282623291016, + "learning_rate": 4.814054013532902e-05, + "loss": 4.7123, + "step": 20774 + }, + { + "epoch": 0.12355481016271767, + "grad_norm": 2.1015446186065674, + "learning_rate": 4.8140363357700004e-05, + "loss": 4.6005, + "step": 20775 + }, + { + "epoch": 0.12356075744599866, + "grad_norm": 2.133510112762451, + "learning_rate": 4.814018657199293e-05, + "loss": 5.1534, + "step": 20776 + }, + { + "epoch": 0.12356670472927966, + "grad_norm": 2.050220012664795, + "learning_rate": 4.814000977820785e-05, + "loss": 4.8997, + "step": 20777 + }, + { + "epoch": 0.12357265201256067, + "grad_norm": 2.0189473628997803, + "learning_rate": 4.8139832976344836e-05, + "loss": 4.6096, + "step": 20778 + }, + { + "epoch": 0.12357859929584165, + "grad_norm": 2.515733242034912, + "learning_rate": 4.813965616640395e-05, + "loss": 4.7096, + "step": 20779 + }, + { + "epoch": 0.12358454657912266, + "grad_norm": 2.062140941619873, + "learning_rate": 4.813947934838524e-05, + "loss": 4.8037, + "step": 20780 + }, + { + "epoch": 0.12359049386240366, + "grad_norm": 2.0707905292510986, + "learning_rate": 4.8139302522288776e-05, + "loss": 5.3148, + "step": 20781 + }, + { + "epoch": 0.12359644114568465, + "grad_norm": 2.0126004219055176, + "learning_rate": 4.813912568811463e-05, + "loss": 5.522, + "step": 20782 + }, + { + "epoch": 0.12360238842896565, + "grad_norm": 1.9760699272155762, + "learning_rate": 4.8138948845862855e-05, + "loss": 5.2751, + "step": 20783 + }, + { + "epoch": 0.12360833571224665, + "grad_norm": 1.6164956092834473, + "learning_rate": 4.81387719955335e-05, + "loss": 5.4444, + "step": 20784 + }, + { + "epoch": 0.12361428299552764, + "grad_norm": 1.7360550165176392, + "learning_rate": 4.8138595137126645e-05, + "loss": 4.7908, + "step": 20785 + }, + { + "epoch": 0.12362023027880864, + "grad_norm": 1.691304087638855, + "learning_rate": 4.813841827064235e-05, + "loss": 5.4206, + "step": 20786 + }, + { + "epoch": 0.12362617756208964, + "grad_norm": 1.685165524482727, + "learning_rate": 4.813824139608066e-05, + "loss": 4.457, + "step": 20787 + }, + { + "epoch": 0.12363212484537063, + "grad_norm": 2.114884376525879, + "learning_rate": 4.813806451344166e-05, + "loss": 4.8126, + "step": 20788 + }, + { + "epoch": 0.12363807212865163, + "grad_norm": 2.084394693374634, + "learning_rate": 4.81378876227254e-05, + "loss": 4.6486, + "step": 20789 + }, + { + "epoch": 0.12364401941193263, + "grad_norm": 1.901607871055603, + "learning_rate": 4.813771072393194e-05, + "loss": 4.3079, + "step": 20790 + }, + { + "epoch": 0.12364996669521362, + "grad_norm": 1.8139945268630981, + "learning_rate": 4.8137533817061345e-05, + "loss": 4.2445, + "step": 20791 + }, + { + "epoch": 0.12365591397849462, + "grad_norm": 1.8131442070007324, + "learning_rate": 4.8137356902113674e-05, + "loss": 4.1701, + "step": 20792 + }, + { + "epoch": 0.12366186126177563, + "grad_norm": 1.7977681159973145, + "learning_rate": 4.8137179979088995e-05, + "loss": 4.1976, + "step": 20793 + }, + { + "epoch": 0.12366780854505662, + "grad_norm": 1.78773832321167, + "learning_rate": 4.813700304798736e-05, + "loss": 4.0982, + "step": 20794 + }, + { + "epoch": 0.12367375582833762, + "grad_norm": 1.9300304651260376, + "learning_rate": 4.8136826108808844e-05, + "loss": 4.0887, + "step": 20795 + }, + { + "epoch": 0.12367970311161862, + "grad_norm": 1.8883346319198608, + "learning_rate": 4.813664916155349e-05, + "loss": 5.0699, + "step": 20796 + }, + { + "epoch": 0.12368565039489961, + "grad_norm": 1.9141865968704224, + "learning_rate": 4.813647220622137e-05, + "loss": 4.6133, + "step": 20797 + }, + { + "epoch": 0.12369159767818061, + "grad_norm": 2.074240207672119, + "learning_rate": 4.813629524281256e-05, + "loss": 4.2272, + "step": 20798 + }, + { + "epoch": 0.12369754496146161, + "grad_norm": 1.9218412637710571, + "learning_rate": 4.81361182713271e-05, + "loss": 4.2612, + "step": 20799 + }, + { + "epoch": 0.1237034922447426, + "grad_norm": 2.3334543704986572, + "learning_rate": 4.8135941291765066e-05, + "loss": 5.4561, + "step": 20800 + }, + { + "epoch": 0.1237094395280236, + "grad_norm": 2.1329383850097656, + "learning_rate": 4.8135764304126504e-05, + "loss": 4.8373, + "step": 20801 + }, + { + "epoch": 0.12371538681130459, + "grad_norm": 2.2241666316986084, + "learning_rate": 4.81355873084115e-05, + "loss": 4.5995, + "step": 20802 + }, + { + "epoch": 0.12372133409458559, + "grad_norm": 1.448601245880127, + "learning_rate": 4.8135410304620086e-05, + "loss": 6.0327, + "step": 20803 + }, + { + "epoch": 0.1237272813778666, + "grad_norm": 2.05168080329895, + "learning_rate": 4.8135233292752344e-05, + "loss": 4.8944, + "step": 20804 + }, + { + "epoch": 0.12373322866114758, + "grad_norm": 1.9282878637313843, + "learning_rate": 4.813505627280834e-05, + "loss": 5.1704, + "step": 20805 + }, + { + "epoch": 0.12373917594442858, + "grad_norm": 1.892562747001648, + "learning_rate": 4.813487924478812e-05, + "loss": 5.3674, + "step": 20806 + }, + { + "epoch": 0.12374512322770959, + "grad_norm": 1.866495132446289, + "learning_rate": 4.813470220869175e-05, + "loss": 5.3585, + "step": 20807 + }, + { + "epoch": 0.12375107051099057, + "grad_norm": 1.8725072145462036, + "learning_rate": 4.81345251645193e-05, + "loss": 5.0933, + "step": 20808 + }, + { + "epoch": 0.12375701779427158, + "grad_norm": 1.486983299255371, + "learning_rate": 4.8134348112270825e-05, + "loss": 5.1869, + "step": 20809 + }, + { + "epoch": 0.12376296507755258, + "grad_norm": 1.5050567388534546, + "learning_rate": 4.813417105194639e-05, + "loss": 5.1382, + "step": 20810 + }, + { + "epoch": 0.12376891236083357, + "grad_norm": 1.629869818687439, + "learning_rate": 4.813399398354605e-05, + "loss": 5.3847, + "step": 20811 + }, + { + "epoch": 0.12377485964411457, + "grad_norm": 1.749213695526123, + "learning_rate": 4.813381690706987e-05, + "loss": 4.8655, + "step": 20812 + }, + { + "epoch": 0.12378080692739557, + "grad_norm": 1.734803318977356, + "learning_rate": 4.813363982251792e-05, + "loss": 5.2059, + "step": 20813 + }, + { + "epoch": 0.12378675421067656, + "grad_norm": 1.8050858974456787, + "learning_rate": 4.813346272989024e-05, + "loss": 5.1364, + "step": 20814 + }, + { + "epoch": 0.12379270149395756, + "grad_norm": 1.6926177740097046, + "learning_rate": 4.813328562918692e-05, + "loss": 4.969, + "step": 20815 + }, + { + "epoch": 0.12379864877723856, + "grad_norm": 1.9767627716064453, + "learning_rate": 4.813310852040801e-05, + "loss": 5.1043, + "step": 20816 + }, + { + "epoch": 0.12380459606051955, + "grad_norm": 1.5432230234146118, + "learning_rate": 4.813293140355357e-05, + "loss": 5.0858, + "step": 20817 + }, + { + "epoch": 0.12381054334380055, + "grad_norm": 1.5301191806793213, + "learning_rate": 4.813275427862366e-05, + "loss": 5.2312, + "step": 20818 + }, + { + "epoch": 0.12381649062708155, + "grad_norm": 1.6347124576568604, + "learning_rate": 4.813257714561835e-05, + "loss": 5.1701, + "step": 20819 + }, + { + "epoch": 0.12382243791036254, + "grad_norm": 2.1260578632354736, + "learning_rate": 4.813240000453769e-05, + "loss": 5.3055, + "step": 20820 + }, + { + "epoch": 0.12382838519364354, + "grad_norm": 2.0905344486236572, + "learning_rate": 4.813222285538175e-05, + "loss": 5.1265, + "step": 20821 + }, + { + "epoch": 0.12383433247692455, + "grad_norm": 1.8773592710494995, + "learning_rate": 4.81320456981506e-05, + "loss": 5.1409, + "step": 20822 + }, + { + "epoch": 0.12384027976020553, + "grad_norm": 1.9149075746536255, + "learning_rate": 4.8131868532844275e-05, + "loss": 5.1855, + "step": 20823 + }, + { + "epoch": 0.12384622704348654, + "grad_norm": 2.0494494438171387, + "learning_rate": 4.813169135946286e-05, + "loss": 5.2561, + "step": 20824 + }, + { + "epoch": 0.12385217432676754, + "grad_norm": 1.9590463638305664, + "learning_rate": 4.8131514178006417e-05, + "loss": 5.0764, + "step": 20825 + }, + { + "epoch": 0.12385812161004853, + "grad_norm": 2.5940022468566895, + "learning_rate": 4.8131336988475e-05, + "loss": 4.42, + "step": 20826 + }, + { + "epoch": 0.12386406889332953, + "grad_norm": 2.135793924331665, + "learning_rate": 4.8131159790868665e-05, + "loss": 4.653, + "step": 20827 + }, + { + "epoch": 0.12387001617661053, + "grad_norm": 2.1380679607391357, + "learning_rate": 4.813098258518748e-05, + "loss": 4.7332, + "step": 20828 + }, + { + "epoch": 0.12387596345989152, + "grad_norm": 2.264723300933838, + "learning_rate": 4.8130805371431513e-05, + "loss": 4.8735, + "step": 20829 + }, + { + "epoch": 0.12388191074317252, + "grad_norm": 2.4449269771575928, + "learning_rate": 4.813062814960082e-05, + "loss": 3.6335, + "step": 20830 + }, + { + "epoch": 0.12388785802645351, + "grad_norm": 2.5718894004821777, + "learning_rate": 4.813045091969547e-05, + "loss": 3.8212, + "step": 20831 + }, + { + "epoch": 0.12389380530973451, + "grad_norm": 1.9600555896759033, + "learning_rate": 4.813027368171551e-05, + "loss": 5.7456, + "step": 20832 + }, + { + "epoch": 0.12389975259301551, + "grad_norm": 2.032362699508667, + "learning_rate": 4.813009643566101e-05, + "loss": 5.3087, + "step": 20833 + }, + { + "epoch": 0.1239056998762965, + "grad_norm": 2.0349206924438477, + "learning_rate": 4.8129919181532036e-05, + "loss": 5.0988, + "step": 20834 + }, + { + "epoch": 0.1239116471595775, + "grad_norm": 2.811582565307617, + "learning_rate": 4.812974191932864e-05, + "loss": 4.4085, + "step": 20835 + }, + { + "epoch": 0.1239175944428585, + "grad_norm": 1.8748958110809326, + "learning_rate": 4.8129564649050904e-05, + "loss": 5.3469, + "step": 20836 + }, + { + "epoch": 0.1239235417261395, + "grad_norm": 2.162895917892456, + "learning_rate": 4.8129387370698865e-05, + "loss": 5.4258, + "step": 20837 + }, + { + "epoch": 0.1239294890094205, + "grad_norm": 1.60780668258667, + "learning_rate": 4.8129210084272596e-05, + "loss": 5.4865, + "step": 20838 + }, + { + "epoch": 0.1239354362927015, + "grad_norm": 1.8906630277633667, + "learning_rate": 4.812903278977216e-05, + "loss": 5.3286, + "step": 20839 + }, + { + "epoch": 0.12394138357598249, + "grad_norm": 1.5469995737075806, + "learning_rate": 4.812885548719762e-05, + "loss": 5.4524, + "step": 20840 + }, + { + "epoch": 0.12394733085926349, + "grad_norm": 1.632104754447937, + "learning_rate": 4.8128678176549034e-05, + "loss": 5.4239, + "step": 20841 + }, + { + "epoch": 0.12395327814254449, + "grad_norm": 1.9250766038894653, + "learning_rate": 4.812850085782646e-05, + "loss": 5.3333, + "step": 20842 + }, + { + "epoch": 0.12395922542582548, + "grad_norm": 1.5831308364868164, + "learning_rate": 4.8128323531029974e-05, + "loss": 5.38, + "step": 20843 + }, + { + "epoch": 0.12396517270910648, + "grad_norm": 1.8450974225997925, + "learning_rate": 4.812814619615963e-05, + "loss": 5.1909, + "step": 20844 + }, + { + "epoch": 0.12397111999238748, + "grad_norm": 1.990018367767334, + "learning_rate": 4.8127968853215485e-05, + "loss": 5.2392, + "step": 20845 + }, + { + "epoch": 0.12397706727566847, + "grad_norm": 1.7380045652389526, + "learning_rate": 4.812779150219761e-05, + "loss": 5.4486, + "step": 20846 + }, + { + "epoch": 0.12398301455894947, + "grad_norm": 1.6080845594406128, + "learning_rate": 4.812761414310605e-05, + "loss": 6.0048, + "step": 20847 + }, + { + "epoch": 0.12398896184223047, + "grad_norm": 1.2336721420288086, + "learning_rate": 4.8127436775940884e-05, + "loss": 5.8988, + "step": 20848 + }, + { + "epoch": 0.12399490912551146, + "grad_norm": 1.3851333856582642, + "learning_rate": 4.8127259400702173e-05, + "loss": 6.0162, + "step": 20849 + }, + { + "epoch": 0.12400085640879246, + "grad_norm": 1.3938422203063965, + "learning_rate": 4.8127082017389965e-05, + "loss": 5.9186, + "step": 20850 + }, + { + "epoch": 0.12400680369207347, + "grad_norm": 1.6463207006454468, + "learning_rate": 4.812690462600434e-05, + "loss": 5.9684, + "step": 20851 + }, + { + "epoch": 0.12401275097535445, + "grad_norm": 1.4180574417114258, + "learning_rate": 4.8126727226545353e-05, + "loss": 5.9383, + "step": 20852 + }, + { + "epoch": 0.12401869825863546, + "grad_norm": 1.3431847095489502, + "learning_rate": 4.8126549819013065e-05, + "loss": 5.862, + "step": 20853 + }, + { + "epoch": 0.12402464554191646, + "grad_norm": 1.3493611812591553, + "learning_rate": 4.812637240340753e-05, + "loss": 5.8796, + "step": 20854 + }, + { + "epoch": 0.12403059282519745, + "grad_norm": 1.2833929061889648, + "learning_rate": 4.812619497972882e-05, + "loss": 5.7322, + "step": 20855 + }, + { + "epoch": 0.12403654010847845, + "grad_norm": 1.4494770765304565, + "learning_rate": 4.8126017547977e-05, + "loss": 5.6871, + "step": 20856 + }, + { + "epoch": 0.12404248739175945, + "grad_norm": 1.9750009775161743, + "learning_rate": 4.812584010815212e-05, + "loss": 5.4744, + "step": 20857 + }, + { + "epoch": 0.12404843467504044, + "grad_norm": 2.2873501777648926, + "learning_rate": 4.812566266025425e-05, + "loss": 4.7326, + "step": 20858 + }, + { + "epoch": 0.12405438195832144, + "grad_norm": 2.3699395656585693, + "learning_rate": 4.8125485204283446e-05, + "loss": 5.1084, + "step": 20859 + }, + { + "epoch": 0.12406032924160243, + "grad_norm": 2.3874311447143555, + "learning_rate": 4.812530774023978e-05, + "loss": 4.7226, + "step": 20860 + }, + { + "epoch": 0.12406627652488343, + "grad_norm": 1.6285946369171143, + "learning_rate": 4.8125130268123305e-05, + "loss": 5.4695, + "step": 20861 + }, + { + "epoch": 0.12407222380816443, + "grad_norm": 1.5346466302871704, + "learning_rate": 4.8124952787934096e-05, + "loss": 5.5105, + "step": 20862 + }, + { + "epoch": 0.12407817109144542, + "grad_norm": 1.7935290336608887, + "learning_rate": 4.8124775299672195e-05, + "loss": 5.2028, + "step": 20863 + }, + { + "epoch": 0.12408411837472642, + "grad_norm": 1.7893015146255493, + "learning_rate": 4.812459780333767e-05, + "loss": 5.1571, + "step": 20864 + }, + { + "epoch": 0.12409006565800743, + "grad_norm": 1.6904758214950562, + "learning_rate": 4.8124420298930596e-05, + "loss": 6.0024, + "step": 20865 + }, + { + "epoch": 0.12409601294128841, + "grad_norm": 1.7721166610717773, + "learning_rate": 4.812424278645102e-05, + "loss": 5.8716, + "step": 20866 + }, + { + "epoch": 0.12410196022456942, + "grad_norm": 1.5822969675064087, + "learning_rate": 4.812406526589901e-05, + "loss": 5.7984, + "step": 20867 + }, + { + "epoch": 0.12410790750785042, + "grad_norm": 1.713592290878296, + "learning_rate": 4.8123887737274634e-05, + "loss": 5.7348, + "step": 20868 + }, + { + "epoch": 0.1241138547911314, + "grad_norm": 1.54501473903656, + "learning_rate": 4.812371020057794e-05, + "loss": 5.7012, + "step": 20869 + }, + { + "epoch": 0.12411980207441241, + "grad_norm": 1.2782925367355347, + "learning_rate": 4.8123532655809e-05, + "loss": 5.6171, + "step": 20870 + }, + { + "epoch": 0.12412574935769341, + "grad_norm": 1.357879638671875, + "learning_rate": 4.812335510296787e-05, + "loss": 5.7021, + "step": 20871 + }, + { + "epoch": 0.1241316966409744, + "grad_norm": 1.468440294265747, + "learning_rate": 4.812317754205462e-05, + "loss": 5.6863, + "step": 20872 + }, + { + "epoch": 0.1241376439242554, + "grad_norm": 2.965566396713257, + "learning_rate": 4.812299997306931e-05, + "loss": 5.3282, + "step": 20873 + }, + { + "epoch": 0.1241435912075364, + "grad_norm": 3.3760321140289307, + "learning_rate": 4.8122822396012e-05, + "loss": 5.0464, + "step": 20874 + }, + { + "epoch": 0.12414953849081739, + "grad_norm": 2.340055465698242, + "learning_rate": 4.8122644810882746e-05, + "loss": 4.6466, + "step": 20875 + }, + { + "epoch": 0.12415548577409839, + "grad_norm": 1.5659359693527222, + "learning_rate": 4.8122467217681615e-05, + "loss": 5.5262, + "step": 20876 + }, + { + "epoch": 0.1241614330573794, + "grad_norm": 1.9036263227462769, + "learning_rate": 4.812228961640868e-05, + "loss": 5.7474, + "step": 20877 + }, + { + "epoch": 0.12416738034066038, + "grad_norm": 1.8488661050796509, + "learning_rate": 4.812211200706398e-05, + "loss": 5.6901, + "step": 20878 + }, + { + "epoch": 0.12417332762394138, + "grad_norm": 1.7501896619796753, + "learning_rate": 4.8121934389647594e-05, + "loss": 5.9729, + "step": 20879 + }, + { + "epoch": 0.12417927490722239, + "grad_norm": 1.7495286464691162, + "learning_rate": 4.812175676415957e-05, + "loss": 5.4282, + "step": 20880 + }, + { + "epoch": 0.12418522219050337, + "grad_norm": 1.8494720458984375, + "learning_rate": 4.8121579130600005e-05, + "loss": 5.6148, + "step": 20881 + }, + { + "epoch": 0.12419116947378438, + "grad_norm": 1.860341191291809, + "learning_rate": 4.812140148896892e-05, + "loss": 5.6192, + "step": 20882 + }, + { + "epoch": 0.12419711675706538, + "grad_norm": 1.845438003540039, + "learning_rate": 4.8121223839266386e-05, + "loss": 5.4989, + "step": 20883 + }, + { + "epoch": 0.12420306404034637, + "grad_norm": 1.7625926733016968, + "learning_rate": 4.812104618149248e-05, + "loss": 5.4833, + "step": 20884 + }, + { + "epoch": 0.12420901132362737, + "grad_norm": 1.4869773387908936, + "learning_rate": 4.812086851564725e-05, + "loss": 5.6437, + "step": 20885 + }, + { + "epoch": 0.12421495860690837, + "grad_norm": 1.528306245803833, + "learning_rate": 4.812069084173077e-05, + "loss": 5.4938, + "step": 20886 + }, + { + "epoch": 0.12422090589018936, + "grad_norm": 1.28203284740448, + "learning_rate": 4.81205131597431e-05, + "loss": 5.5411, + "step": 20887 + }, + { + "epoch": 0.12422685317347036, + "grad_norm": 1.9413608312606812, + "learning_rate": 4.8120335469684285e-05, + "loss": 5.4842, + "step": 20888 + }, + { + "epoch": 0.12423280045675135, + "grad_norm": 1.8776315450668335, + "learning_rate": 4.812015777155441e-05, + "loss": 5.495, + "step": 20889 + }, + { + "epoch": 0.12423874774003235, + "grad_norm": 1.941171646118164, + "learning_rate": 4.8119980065353524e-05, + "loss": 5.7711, + "step": 20890 + }, + { + "epoch": 0.12424469502331335, + "grad_norm": 1.8312263488769531, + "learning_rate": 4.811980235108169e-05, + "loss": 5.5998, + "step": 20891 + }, + { + "epoch": 0.12425064230659434, + "grad_norm": 1.6940878629684448, + "learning_rate": 4.811962462873897e-05, + "loss": 5.9089, + "step": 20892 + }, + { + "epoch": 0.12425658958987534, + "grad_norm": 1.8769567012786865, + "learning_rate": 4.811944689832543e-05, + "loss": 5.5854, + "step": 20893 + }, + { + "epoch": 0.12426253687315635, + "grad_norm": 1.8289974927902222, + "learning_rate": 4.811926915984113e-05, + "loss": 5.4698, + "step": 20894 + }, + { + "epoch": 0.12426848415643733, + "grad_norm": 2.343961000442505, + "learning_rate": 4.811909141328613e-05, + "loss": 4.4474, + "step": 20895 + }, + { + "epoch": 0.12427443143971834, + "grad_norm": 1.9822384119033813, + "learning_rate": 4.8118913658660504e-05, + "loss": 4.9353, + "step": 20896 + }, + { + "epoch": 0.12428037872299934, + "grad_norm": 2.3056247234344482, + "learning_rate": 4.811873589596429e-05, + "loss": 4.7128, + "step": 20897 + }, + { + "epoch": 0.12428632600628033, + "grad_norm": 2.205653667449951, + "learning_rate": 4.811855812519758e-05, + "loss": 4.08, + "step": 20898 + }, + { + "epoch": 0.12429227328956133, + "grad_norm": 2.0141141414642334, + "learning_rate": 4.81183803463604e-05, + "loss": 4.2903, + "step": 20899 + }, + { + "epoch": 0.12429822057284233, + "grad_norm": 2.2912099361419678, + "learning_rate": 4.811820255945285e-05, + "loss": 4.7582, + "step": 20900 + }, + { + "epoch": 0.12430416785612332, + "grad_norm": 2.1577751636505127, + "learning_rate": 4.8118024764474965e-05, + "loss": 4.757, + "step": 20901 + }, + { + "epoch": 0.12431011513940432, + "grad_norm": 2.2851569652557373, + "learning_rate": 4.811784696142682e-05, + "loss": 3.9403, + "step": 20902 + }, + { + "epoch": 0.12431606242268532, + "grad_norm": 2.256500720977783, + "learning_rate": 4.8117669150308474e-05, + "loss": 4.3498, + "step": 20903 + }, + { + "epoch": 0.12432200970596631, + "grad_norm": 2.1631035804748535, + "learning_rate": 4.811749133111999e-05, + "loss": 4.6171, + "step": 20904 + }, + { + "epoch": 0.12432795698924731, + "grad_norm": 2.360530138015747, + "learning_rate": 4.811731350386142e-05, + "loss": 4.5958, + "step": 20905 + }, + { + "epoch": 0.12433390427252831, + "grad_norm": 2.031268835067749, + "learning_rate": 4.8117135668532845e-05, + "loss": 4.4466, + "step": 20906 + }, + { + "epoch": 0.1243398515558093, + "grad_norm": 1.7367441654205322, + "learning_rate": 4.811695782513431e-05, + "loss": 4.8605, + "step": 20907 + }, + { + "epoch": 0.1243457988390903, + "grad_norm": 2.5067267417907715, + "learning_rate": 4.8116779973665886e-05, + "loss": 4.0849, + "step": 20908 + }, + { + "epoch": 0.1243517461223713, + "grad_norm": 1.5404255390167236, + "learning_rate": 4.811660211412763e-05, + "loss": 4.4511, + "step": 20909 + }, + { + "epoch": 0.1243576934056523, + "grad_norm": 1.4191818237304688, + "learning_rate": 4.8116424246519606e-05, + "loss": 4.4274, + "step": 20910 + }, + { + "epoch": 0.1243636406889333, + "grad_norm": 1.4610079526901245, + "learning_rate": 4.811624637084189e-05, + "loss": 4.4112, + "step": 20911 + }, + { + "epoch": 0.1243695879722143, + "grad_norm": 1.3842167854309082, + "learning_rate": 4.811606848709452e-05, + "loss": 4.3019, + "step": 20912 + }, + { + "epoch": 0.12437553525549529, + "grad_norm": 1.4025331735610962, + "learning_rate": 4.811589059527757e-05, + "loss": 4.251, + "step": 20913 + }, + { + "epoch": 0.12438148253877629, + "grad_norm": 1.5034327507019043, + "learning_rate": 4.81157126953911e-05, + "loss": 4.1553, + "step": 20914 + }, + { + "epoch": 0.12438742982205729, + "grad_norm": 1.5153253078460693, + "learning_rate": 4.811553478743518e-05, + "loss": 4.1264, + "step": 20915 + }, + { + "epoch": 0.12439337710533828, + "grad_norm": 1.4300923347473145, + "learning_rate": 4.811535687140987e-05, + "loss": 4.2653, + "step": 20916 + }, + { + "epoch": 0.12439932438861928, + "grad_norm": 1.4667567014694214, + "learning_rate": 4.811517894731521e-05, + "loss": 4.2216, + "step": 20917 + }, + { + "epoch": 0.12440527167190027, + "grad_norm": 1.6324750185012817, + "learning_rate": 4.81150010151513e-05, + "loss": 4.3083, + "step": 20918 + }, + { + "epoch": 0.12441121895518127, + "grad_norm": 1.507516622543335, + "learning_rate": 4.8114823074918165e-05, + "loss": 4.1369, + "step": 20919 + }, + { + "epoch": 0.12441716623846227, + "grad_norm": 1.5365220308303833, + "learning_rate": 4.8114645126615886e-05, + "loss": 4.061, + "step": 20920 + }, + { + "epoch": 0.12442311352174326, + "grad_norm": 1.3880743980407715, + "learning_rate": 4.811446717024453e-05, + "loss": 4.2464, + "step": 20921 + }, + { + "epoch": 0.12442906080502426, + "grad_norm": 1.619391918182373, + "learning_rate": 4.8114289205804155e-05, + "loss": 4.0032, + "step": 20922 + }, + { + "epoch": 0.12443500808830527, + "grad_norm": 1.5912760496139526, + "learning_rate": 4.811411123329481e-05, + "loss": 3.9996, + "step": 20923 + }, + { + "epoch": 0.12444095537158625, + "grad_norm": 1.6042509078979492, + "learning_rate": 4.811393325271657e-05, + "loss": 3.9225, + "step": 20924 + }, + { + "epoch": 0.12444690265486726, + "grad_norm": 1.4620057344436646, + "learning_rate": 4.8113755264069505e-05, + "loss": 4.4391, + "step": 20925 + }, + { + "epoch": 0.12445284993814826, + "grad_norm": 1.6154197454452515, + "learning_rate": 4.811357726735366e-05, + "loss": 4.1254, + "step": 20926 + }, + { + "epoch": 0.12445879722142925, + "grad_norm": 1.520150065422058, + "learning_rate": 4.8113399262569104e-05, + "loss": 4.7638, + "step": 20927 + }, + { + "epoch": 0.12446474450471025, + "grad_norm": 1.5869375467300415, + "learning_rate": 4.81132212497159e-05, + "loss": 4.047, + "step": 20928 + }, + { + "epoch": 0.12447069178799125, + "grad_norm": 1.610819697380066, + "learning_rate": 4.8113043228794105e-05, + "loss": 4.0823, + "step": 20929 + }, + { + "epoch": 0.12447663907127224, + "grad_norm": 1.4962780475616455, + "learning_rate": 4.811286519980379e-05, + "loss": 5.4004, + "step": 20930 + }, + { + "epoch": 0.12448258635455324, + "grad_norm": 1.382641077041626, + "learning_rate": 4.811268716274501e-05, + "loss": 5.3129, + "step": 20931 + }, + { + "epoch": 0.12448853363783424, + "grad_norm": 1.3323496580123901, + "learning_rate": 4.811250911761783e-05, + "loss": 5.2123, + "step": 20932 + }, + { + "epoch": 0.12449448092111523, + "grad_norm": 1.4375461339950562, + "learning_rate": 4.811233106442231e-05, + "loss": 5.4249, + "step": 20933 + }, + { + "epoch": 0.12450042820439623, + "grad_norm": 1.6861125230789185, + "learning_rate": 4.811215300315852e-05, + "loss": 5.0697, + "step": 20934 + }, + { + "epoch": 0.12450637548767723, + "grad_norm": 1.52859365940094, + "learning_rate": 4.811197493382651e-05, + "loss": 5.5925, + "step": 20935 + }, + { + "epoch": 0.12451232277095822, + "grad_norm": 1.4931366443634033, + "learning_rate": 4.811179685642635e-05, + "loss": 5.4442, + "step": 20936 + }, + { + "epoch": 0.12451827005423922, + "grad_norm": 1.3825764656066895, + "learning_rate": 4.8111618770958104e-05, + "loss": 5.5773, + "step": 20937 + }, + { + "epoch": 0.12452421733752023, + "grad_norm": 1.3441286087036133, + "learning_rate": 4.811144067742183e-05, + "loss": 5.5421, + "step": 20938 + }, + { + "epoch": 0.12453016462080121, + "grad_norm": 1.2910594940185547, + "learning_rate": 4.811126257581758e-05, + "loss": 5.3507, + "step": 20939 + }, + { + "epoch": 0.12453611190408222, + "grad_norm": 1.3505282402038574, + "learning_rate": 4.811108446614544e-05, + "loss": 5.5285, + "step": 20940 + }, + { + "epoch": 0.12454205918736322, + "grad_norm": 1.4562500715255737, + "learning_rate": 4.811090634840546e-05, + "loss": 5.3592, + "step": 20941 + }, + { + "epoch": 0.1245480064706442, + "grad_norm": 1.4702924489974976, + "learning_rate": 4.8110728222597694e-05, + "loss": 5.2603, + "step": 20942 + }, + { + "epoch": 0.12455395375392521, + "grad_norm": 1.6397823095321655, + "learning_rate": 4.811055008872222e-05, + "loss": 5.222, + "step": 20943 + }, + { + "epoch": 0.12455990103720621, + "grad_norm": 1.5603538751602173, + "learning_rate": 4.811037194677908e-05, + "loss": 5.2075, + "step": 20944 + }, + { + "epoch": 0.1245658483204872, + "grad_norm": 1.3349683284759521, + "learning_rate": 4.811019379676835e-05, + "loss": 5.2903, + "step": 20945 + }, + { + "epoch": 0.1245717956037682, + "grad_norm": 1.348935842514038, + "learning_rate": 4.8110015638690096e-05, + "loss": 5.4688, + "step": 20946 + }, + { + "epoch": 0.12457774288704919, + "grad_norm": 1.4173049926757812, + "learning_rate": 4.810983747254437e-05, + "loss": 5.0299, + "step": 20947 + }, + { + "epoch": 0.12458369017033019, + "grad_norm": 1.3553805351257324, + "learning_rate": 4.8109659298331244e-05, + "loss": 5.0798, + "step": 20948 + }, + { + "epoch": 0.1245896374536112, + "grad_norm": 1.3770824670791626, + "learning_rate": 4.810948111605077e-05, + "loss": 4.807, + "step": 20949 + }, + { + "epoch": 0.12459558473689218, + "grad_norm": 1.3450689315795898, + "learning_rate": 4.810930292570302e-05, + "loss": 4.8061, + "step": 20950 + }, + { + "epoch": 0.12460153202017318, + "grad_norm": 1.4118422269821167, + "learning_rate": 4.8109124727288044e-05, + "loss": 5.203, + "step": 20951 + }, + { + "epoch": 0.12460747930345419, + "grad_norm": 1.4127706289291382, + "learning_rate": 4.810894652080592e-05, + "loss": 5.104, + "step": 20952 + }, + { + "epoch": 0.12461342658673517, + "grad_norm": 1.2636264562606812, + "learning_rate": 4.810876830625669e-05, + "loss": 4.9306, + "step": 20953 + }, + { + "epoch": 0.12461937387001618, + "grad_norm": 1.3846913576126099, + "learning_rate": 4.810859008364044e-05, + "loss": 4.8095, + "step": 20954 + }, + { + "epoch": 0.12462532115329718, + "grad_norm": 1.6017072200775146, + "learning_rate": 4.8108411852957216e-05, + "loss": 4.9926, + "step": 20955 + }, + { + "epoch": 0.12463126843657817, + "grad_norm": 1.5098768472671509, + "learning_rate": 4.8108233614207075e-05, + "loss": 5.3204, + "step": 20956 + }, + { + "epoch": 0.12463721571985917, + "grad_norm": 1.1792641878128052, + "learning_rate": 4.8108055367390097e-05, + "loss": 4.7596, + "step": 20957 + }, + { + "epoch": 0.12464316300314017, + "grad_norm": 1.3787871599197388, + "learning_rate": 4.8107877112506336e-05, + "loss": 5.0914, + "step": 20958 + }, + { + "epoch": 0.12464911028642116, + "grad_norm": 1.3097307682037354, + "learning_rate": 4.8107698849555846e-05, + "loss": 4.8154, + "step": 20959 + }, + { + "epoch": 0.12465505756970216, + "grad_norm": 1.4452660083770752, + "learning_rate": 4.810752057853871e-05, + "loss": 5.1395, + "step": 20960 + }, + { + "epoch": 0.12466100485298316, + "grad_norm": 1.4970120191574097, + "learning_rate": 4.8107342299454974e-05, + "loss": 4.8164, + "step": 20961 + }, + { + "epoch": 0.12466695213626415, + "grad_norm": 1.4092109203338623, + "learning_rate": 4.810716401230469e-05, + "loss": 4.9219, + "step": 20962 + }, + { + "epoch": 0.12467289941954515, + "grad_norm": 1.5558546781539917, + "learning_rate": 4.810698571708795e-05, + "loss": 4.8639, + "step": 20963 + }, + { + "epoch": 0.12467884670282615, + "grad_norm": 1.3631898164749146, + "learning_rate": 4.810680741380479e-05, + "loss": 5.2145, + "step": 20964 + }, + { + "epoch": 0.12468479398610714, + "grad_norm": 1.608810544013977, + "learning_rate": 4.8106629102455286e-05, + "loss": 5.2486, + "step": 20965 + }, + { + "epoch": 0.12469074126938814, + "grad_norm": 1.573190689086914, + "learning_rate": 4.81064507830395e-05, + "loss": 5.2476, + "step": 20966 + }, + { + "epoch": 0.12469668855266915, + "grad_norm": 1.5032795667648315, + "learning_rate": 4.810627245555748e-05, + "loss": 5.1557, + "step": 20967 + }, + { + "epoch": 0.12470263583595013, + "grad_norm": 1.3919012546539307, + "learning_rate": 4.810609412000931e-05, + "loss": 5.2812, + "step": 20968 + }, + { + "epoch": 0.12470858311923114, + "grad_norm": 1.417431354522705, + "learning_rate": 4.810591577639504e-05, + "loss": 5.3173, + "step": 20969 + }, + { + "epoch": 0.12471453040251214, + "grad_norm": 1.2135869264602661, + "learning_rate": 4.8105737424714724e-05, + "loss": 5.3511, + "step": 20970 + }, + { + "epoch": 0.12472047768579313, + "grad_norm": 1.3142472505569458, + "learning_rate": 4.810555906496844e-05, + "loss": 5.225, + "step": 20971 + }, + { + "epoch": 0.12472642496907413, + "grad_norm": 1.4344936609268188, + "learning_rate": 4.810538069715625e-05, + "loss": 5.5032, + "step": 20972 + }, + { + "epoch": 0.12473237225235513, + "grad_norm": 1.214281439781189, + "learning_rate": 4.81052023212782e-05, + "loss": 5.4466, + "step": 20973 + }, + { + "epoch": 0.12473831953563612, + "grad_norm": 1.5831886529922485, + "learning_rate": 4.810502393733437e-05, + "loss": 4.6211, + "step": 20974 + }, + { + "epoch": 0.12474426681891712, + "grad_norm": 1.6281508207321167, + "learning_rate": 4.8104845545324816e-05, + "loss": 4.6212, + "step": 20975 + }, + { + "epoch": 0.12475021410219811, + "grad_norm": 1.5753840208053589, + "learning_rate": 4.810466714524959e-05, + "loss": 4.7089, + "step": 20976 + }, + { + "epoch": 0.12475616138547911, + "grad_norm": 1.355692744255066, + "learning_rate": 4.810448873710877e-05, + "loss": 5.0399, + "step": 20977 + }, + { + "epoch": 0.12476210866876011, + "grad_norm": 1.27257239818573, + "learning_rate": 4.810431032090241e-05, + "loss": 4.7091, + "step": 20978 + }, + { + "epoch": 0.1247680559520411, + "grad_norm": 1.532210350036621, + "learning_rate": 4.810413189663058e-05, + "loss": 4.6682, + "step": 20979 + }, + { + "epoch": 0.1247740032353221, + "grad_norm": 1.4075580835342407, + "learning_rate": 4.810395346429333e-05, + "loss": 4.5135, + "step": 20980 + }, + { + "epoch": 0.1247799505186031, + "grad_norm": 1.3797897100448608, + "learning_rate": 4.810377502389073e-05, + "loss": 4.5548, + "step": 20981 + }, + { + "epoch": 0.1247858978018841, + "grad_norm": 1.4484235048294067, + "learning_rate": 4.810359657542284e-05, + "loss": 4.5336, + "step": 20982 + }, + { + "epoch": 0.1247918450851651, + "grad_norm": 1.4712706804275513, + "learning_rate": 4.810341811888972e-05, + "loss": 4.6805, + "step": 20983 + }, + { + "epoch": 0.1247977923684461, + "grad_norm": 1.548684000968933, + "learning_rate": 4.8103239654291444e-05, + "loss": 4.6239, + "step": 20984 + }, + { + "epoch": 0.12480373965172709, + "grad_norm": 1.481542944908142, + "learning_rate": 4.810306118162806e-05, + "loss": 4.981, + "step": 20985 + }, + { + "epoch": 0.12480968693500809, + "grad_norm": 1.423977017402649, + "learning_rate": 4.810288270089963e-05, + "loss": 5.1813, + "step": 20986 + }, + { + "epoch": 0.12481563421828909, + "grad_norm": 1.2712557315826416, + "learning_rate": 4.810270421210623e-05, + "loss": 5.1499, + "step": 20987 + }, + { + "epoch": 0.12482158150157008, + "grad_norm": 1.4444210529327393, + "learning_rate": 4.810252571524791e-05, + "loss": 5.1801, + "step": 20988 + }, + { + "epoch": 0.12482752878485108, + "grad_norm": 1.2743985652923584, + "learning_rate": 4.810234721032475e-05, + "loss": 5.1433, + "step": 20989 + }, + { + "epoch": 0.12483347606813208, + "grad_norm": 1.4066376686096191, + "learning_rate": 4.810216869733679e-05, + "loss": 5.1821, + "step": 20990 + }, + { + "epoch": 0.12483942335141307, + "grad_norm": 1.362889051437378, + "learning_rate": 4.81019901762841e-05, + "loss": 5.2135, + "step": 20991 + }, + { + "epoch": 0.12484537063469407, + "grad_norm": 1.2178412675857544, + "learning_rate": 4.810181164716674e-05, + "loss": 5.3131, + "step": 20992 + }, + { + "epoch": 0.12485131791797507, + "grad_norm": 1.7444922924041748, + "learning_rate": 4.8101633109984786e-05, + "loss": 4.8666, + "step": 20993 + }, + { + "epoch": 0.12485726520125606, + "grad_norm": 1.4151227474212646, + "learning_rate": 4.810145456473828e-05, + "loss": 5.0585, + "step": 20994 + }, + { + "epoch": 0.12486321248453706, + "grad_norm": 1.2906028032302856, + "learning_rate": 4.81012760114273e-05, + "loss": 5.1402, + "step": 20995 + }, + { + "epoch": 0.12486915976781807, + "grad_norm": 1.4265183210372925, + "learning_rate": 4.8101097450051906e-05, + "loss": 5.184, + "step": 20996 + }, + { + "epoch": 0.12487510705109905, + "grad_norm": 1.499804139137268, + "learning_rate": 4.8100918880612154e-05, + "loss": 4.9952, + "step": 20997 + }, + { + "epoch": 0.12488105433438006, + "grad_norm": 1.5296711921691895, + "learning_rate": 4.810074030310812e-05, + "loss": 4.9743, + "step": 20998 + }, + { + "epoch": 0.12488700161766106, + "grad_norm": 1.4345946311950684, + "learning_rate": 4.810056171753984e-05, + "loss": 4.9107, + "step": 20999 + }, + { + "epoch": 0.12489294890094205, + "grad_norm": 1.501966953277588, + "learning_rate": 4.81003831239074e-05, + "loss": 4.8123, + "step": 21000 + }, + { + "epoch": 0.12489889618422305, + "grad_norm": 1.1865864992141724, + "learning_rate": 4.810020452221086e-05, + "loss": 5.1614, + "step": 21001 + }, + { + "epoch": 0.12490484346750405, + "grad_norm": 1.345996379852295, + "learning_rate": 4.810002591245027e-05, + "loss": 4.9784, + "step": 21002 + }, + { + "epoch": 0.12491079075078504, + "grad_norm": 1.2252000570297241, + "learning_rate": 4.80998472946257e-05, + "loss": 4.9433, + "step": 21003 + }, + { + "epoch": 0.12491673803406604, + "grad_norm": 1.4540387392044067, + "learning_rate": 4.809966866873722e-05, + "loss": 4.8608, + "step": 21004 + }, + { + "epoch": 0.12492268531734703, + "grad_norm": 1.382969617843628, + "learning_rate": 4.809949003478488e-05, + "loss": 4.8168, + "step": 21005 + }, + { + "epoch": 0.12492863260062803, + "grad_norm": 1.3642408847808838, + "learning_rate": 4.809931139276874e-05, + "loss": 4.9262, + "step": 21006 + }, + { + "epoch": 0.12493457988390903, + "grad_norm": 1.1903620958328247, + "learning_rate": 4.809913274268887e-05, + "loss": 5.1817, + "step": 21007 + }, + { + "epoch": 0.12494052716719002, + "grad_norm": 1.3020774126052856, + "learning_rate": 4.809895408454534e-05, + "loss": 4.956, + "step": 21008 + }, + { + "epoch": 0.12494647445047102, + "grad_norm": 1.3209398984909058, + "learning_rate": 4.80987754183382e-05, + "loss": 4.9542, + "step": 21009 + }, + { + "epoch": 0.12495242173375203, + "grad_norm": 1.2684825658798218, + "learning_rate": 4.809859674406752e-05, + "loss": 5.2919, + "step": 21010 + }, + { + "epoch": 0.12495836901703301, + "grad_norm": 1.271053671836853, + "learning_rate": 4.809841806173335e-05, + "loss": 5.1397, + "step": 21011 + }, + { + "epoch": 0.12496431630031402, + "grad_norm": 1.2137185335159302, + "learning_rate": 4.809823937133576e-05, + "loss": 5.1874, + "step": 21012 + }, + { + "epoch": 0.12497026358359502, + "grad_norm": 1.2429122924804688, + "learning_rate": 4.8098060672874825e-05, + "loss": 5.0626, + "step": 21013 + }, + { + "epoch": 0.124976210866876, + "grad_norm": 1.3292062282562256, + "learning_rate": 4.809788196635058e-05, + "loss": 4.9019, + "step": 21014 + }, + { + "epoch": 0.12498215815015701, + "grad_norm": 1.3801854848861694, + "learning_rate": 4.8097703251763115e-05, + "loss": 4.8948, + "step": 21015 + }, + { + "epoch": 0.12498810543343801, + "grad_norm": 1.1259671449661255, + "learning_rate": 4.8097524529112484e-05, + "loss": 4.8041, + "step": 21016 + }, + { + "epoch": 0.124994052716719, + "grad_norm": 1.145451307296753, + "learning_rate": 4.809734579839873e-05, + "loss": 5.0012, + "step": 21017 + }, + { + "epoch": 0.125, + "grad_norm": 2.0128631591796875, + "learning_rate": 4.8097167059621945e-05, + "loss": 5.5174, + "step": 21018 + }, + { + "epoch": 0.125005947283281, + "grad_norm": 1.2371736764907837, + "learning_rate": 4.8096988312782174e-05, + "loss": 4.9491, + "step": 21019 + }, + { + "epoch": 0.125011894566562, + "grad_norm": 1.4009771347045898, + "learning_rate": 4.809680955787948e-05, + "loss": 4.8699, + "step": 21020 + }, + { + "epoch": 0.125017841849843, + "grad_norm": 1.2181386947631836, + "learning_rate": 4.809663079491393e-05, + "loss": 4.8258, + "step": 21021 + }, + { + "epoch": 0.12502378913312398, + "grad_norm": 1.3663759231567383, + "learning_rate": 4.809645202388559e-05, + "loss": 5.085, + "step": 21022 + }, + { + "epoch": 0.125029736416405, + "grad_norm": 1.4783004522323608, + "learning_rate": 4.809627324479451e-05, + "loss": 5.0309, + "step": 21023 + }, + { + "epoch": 0.12503568369968598, + "grad_norm": 1.5568218231201172, + "learning_rate": 4.809609445764076e-05, + "loss": 5.217, + "step": 21024 + }, + { + "epoch": 0.12504163098296697, + "grad_norm": 1.42091965675354, + "learning_rate": 4.80959156624244e-05, + "loss": 5.1213, + "step": 21025 + }, + { + "epoch": 0.125047578266248, + "grad_norm": 1.5361231565475464, + "learning_rate": 4.8095736859145504e-05, + "loss": 5.1539, + "step": 21026 + }, + { + "epoch": 0.12505352554952898, + "grad_norm": 1.4799479246139526, + "learning_rate": 4.809555804780411e-05, + "loss": 5.0524, + "step": 21027 + }, + { + "epoch": 0.12505947283280996, + "grad_norm": 1.379309892654419, + "learning_rate": 4.809537922840031e-05, + "loss": 4.8477, + "step": 21028 + }, + { + "epoch": 0.12506542011609098, + "grad_norm": 1.3503345251083374, + "learning_rate": 4.809520040093415e-05, + "loss": 5.3253, + "step": 21029 + }, + { + "epoch": 0.12507136739937197, + "grad_norm": 1.1925950050354004, + "learning_rate": 4.8095021565405684e-05, + "loss": 5.2129, + "step": 21030 + }, + { + "epoch": 0.12507731468265296, + "grad_norm": 1.433516025543213, + "learning_rate": 4.809484272181499e-05, + "loss": 5.1091, + "step": 21031 + }, + { + "epoch": 0.12508326196593397, + "grad_norm": 1.3334667682647705, + "learning_rate": 4.809466387016213e-05, + "loss": 5.3445, + "step": 21032 + }, + { + "epoch": 0.12508920924921496, + "grad_norm": 1.270871877670288, + "learning_rate": 4.809448501044715e-05, + "loss": 5.1455, + "step": 21033 + }, + { + "epoch": 0.12509515653249595, + "grad_norm": 1.2028634548187256, + "learning_rate": 4.8094306142670145e-05, + "loss": 5.1721, + "step": 21034 + }, + { + "epoch": 0.12510110381577697, + "grad_norm": 1.537757396697998, + "learning_rate": 4.809412726683114e-05, + "loss": 5.1853, + "step": 21035 + }, + { + "epoch": 0.12510705109905795, + "grad_norm": 1.3350294828414917, + "learning_rate": 4.809394838293021e-05, + "loss": 5.0725, + "step": 21036 + }, + { + "epoch": 0.12511299838233894, + "grad_norm": 1.3986246585845947, + "learning_rate": 4.8093769490967434e-05, + "loss": 5.1176, + "step": 21037 + }, + { + "epoch": 0.12511894566561996, + "grad_norm": 1.3993934392929077, + "learning_rate": 4.809359059094285e-05, + "loss": 5.1085, + "step": 21038 + }, + { + "epoch": 0.12512489294890095, + "grad_norm": 1.6875231266021729, + "learning_rate": 4.8093411682856535e-05, + "loss": 5.134, + "step": 21039 + }, + { + "epoch": 0.12513084023218193, + "grad_norm": 1.2966142892837524, + "learning_rate": 4.809323276670855e-05, + "loss": 5.1509, + "step": 21040 + }, + { + "epoch": 0.12513678751546295, + "grad_norm": 1.3994536399841309, + "learning_rate": 4.8093053842498956e-05, + "loss": 4.8962, + "step": 21041 + }, + { + "epoch": 0.12514273479874394, + "grad_norm": 1.3936022520065308, + "learning_rate": 4.809287491022782e-05, + "loss": 4.908, + "step": 21042 + }, + { + "epoch": 0.12514868208202493, + "grad_norm": 1.9262713193893433, + "learning_rate": 4.80926959698952e-05, + "loss": 5.0856, + "step": 21043 + }, + { + "epoch": 0.12515462936530594, + "grad_norm": 1.3765772581100464, + "learning_rate": 4.809251702150115e-05, + "loss": 5.0438, + "step": 21044 + }, + { + "epoch": 0.12516057664858693, + "grad_norm": 1.4509775638580322, + "learning_rate": 4.809233806504575e-05, + "loss": 5.2001, + "step": 21045 + }, + { + "epoch": 0.12516652393186792, + "grad_norm": 1.6581740379333496, + "learning_rate": 4.809215910052904e-05, + "loss": 4.7155, + "step": 21046 + }, + { + "epoch": 0.12517247121514893, + "grad_norm": 1.5386825799942017, + "learning_rate": 4.8091980127951115e-05, + "loss": 4.6354, + "step": 21047 + }, + { + "epoch": 0.12517841849842992, + "grad_norm": 1.3021749258041382, + "learning_rate": 4.8091801147312e-05, + "loss": 5.2241, + "step": 21048 + }, + { + "epoch": 0.1251843657817109, + "grad_norm": 1.3396178483963013, + "learning_rate": 4.809162215861179e-05, + "loss": 5.2361, + "step": 21049 + }, + { + "epoch": 0.1251903130649919, + "grad_norm": 1.381496548652649, + "learning_rate": 4.809144316185052e-05, + "loss": 5.3347, + "step": 21050 + }, + { + "epoch": 0.12519626034827291, + "grad_norm": 1.4430748224258423, + "learning_rate": 4.809126415702828e-05, + "loss": 4.895, + "step": 21051 + }, + { + "epoch": 0.1252022076315539, + "grad_norm": 1.2426742315292358, + "learning_rate": 4.809108514414511e-05, + "loss": 4.9085, + "step": 21052 + }, + { + "epoch": 0.1252081549148349, + "grad_norm": 1.224529504776001, + "learning_rate": 4.8090906123201085e-05, + "loss": 5.1997, + "step": 21053 + }, + { + "epoch": 0.1252141021981159, + "grad_norm": 1.295866847038269, + "learning_rate": 4.809072709419626e-05, + "loss": 5.5419, + "step": 21054 + }, + { + "epoch": 0.1252200494813969, + "grad_norm": 1.7327667474746704, + "learning_rate": 4.80905480571307e-05, + "loss": 5.1902, + "step": 21055 + }, + { + "epoch": 0.12522599676467788, + "grad_norm": 1.4727381467819214, + "learning_rate": 4.809036901200447e-05, + "loss": 4.9909, + "step": 21056 + }, + { + "epoch": 0.1252319440479589, + "grad_norm": 1.5449626445770264, + "learning_rate": 4.8090189958817626e-05, + "loss": 4.8721, + "step": 21057 + }, + { + "epoch": 0.1252378913312399, + "grad_norm": 1.563591718673706, + "learning_rate": 4.809001089757024e-05, + "loss": 5.0417, + "step": 21058 + }, + { + "epoch": 0.12524383861452087, + "grad_norm": 1.3692893981933594, + "learning_rate": 4.808983182826237e-05, + "loss": 4.9748, + "step": 21059 + }, + { + "epoch": 0.1252497858978019, + "grad_norm": 1.3994625806808472, + "learning_rate": 4.8089652750894074e-05, + "loss": 5.1823, + "step": 21060 + }, + { + "epoch": 0.12525573318108288, + "grad_norm": 1.3998682498931885, + "learning_rate": 4.8089473665465425e-05, + "loss": 5.2272, + "step": 21061 + }, + { + "epoch": 0.12526168046436387, + "grad_norm": 1.4436434507369995, + "learning_rate": 4.808929457197647e-05, + "loss": 5.4049, + "step": 21062 + }, + { + "epoch": 0.12526762774764488, + "grad_norm": 1.2826770544052124, + "learning_rate": 4.8089115470427294e-05, + "loss": 5.2065, + "step": 21063 + }, + { + "epoch": 0.12527357503092587, + "grad_norm": 1.4545691013336182, + "learning_rate": 4.808893636081794e-05, + "loss": 5.1212, + "step": 21064 + }, + { + "epoch": 0.12527952231420686, + "grad_norm": 1.70439875125885, + "learning_rate": 4.808875724314847e-05, + "loss": 4.9993, + "step": 21065 + }, + { + "epoch": 0.12528546959748788, + "grad_norm": 1.5612056255340576, + "learning_rate": 4.8088578117418965e-05, + "loss": 5.1109, + "step": 21066 + }, + { + "epoch": 0.12529141688076886, + "grad_norm": 1.3385684490203857, + "learning_rate": 4.808839898362947e-05, + "loss": 5.3485, + "step": 21067 + }, + { + "epoch": 0.12529736416404985, + "grad_norm": 1.4440029859542847, + "learning_rate": 4.808821984178006e-05, + "loss": 5.3289, + "step": 21068 + }, + { + "epoch": 0.12530331144733087, + "grad_norm": 1.4780069589614868, + "learning_rate": 4.808804069187078e-05, + "loss": 5.4379, + "step": 21069 + }, + { + "epoch": 0.12530925873061186, + "grad_norm": 1.4137150049209595, + "learning_rate": 4.808786153390171e-05, + "loss": 5.4666, + "step": 21070 + }, + { + "epoch": 0.12531520601389284, + "grad_norm": 1.3870670795440674, + "learning_rate": 4.80876823678729e-05, + "loss": 5.4342, + "step": 21071 + }, + { + "epoch": 0.12532115329717386, + "grad_norm": 1.3641326427459717, + "learning_rate": 4.808750319378442e-05, + "loss": 5.148, + "step": 21072 + }, + { + "epoch": 0.12532710058045485, + "grad_norm": 1.3099322319030762, + "learning_rate": 4.808732401163634e-05, + "loss": 5.1237, + "step": 21073 + }, + { + "epoch": 0.12533304786373584, + "grad_norm": 1.4198615550994873, + "learning_rate": 4.808714482142871e-05, + "loss": 5.5755, + "step": 21074 + }, + { + "epoch": 0.12533899514701685, + "grad_norm": 1.1760785579681396, + "learning_rate": 4.80869656231616e-05, + "loss": 5.5684, + "step": 21075 + }, + { + "epoch": 0.12534494243029784, + "grad_norm": 1.2611156702041626, + "learning_rate": 4.8086786416835054e-05, + "loss": 5.3834, + "step": 21076 + }, + { + "epoch": 0.12535088971357883, + "grad_norm": 1.085659384727478, + "learning_rate": 4.808660720244916e-05, + "loss": 5.2553, + "step": 21077 + }, + { + "epoch": 0.12535683699685984, + "grad_norm": 1.2537906169891357, + "learning_rate": 4.808642798000397e-05, + "loss": 5.3423, + "step": 21078 + }, + { + "epoch": 0.12536278428014083, + "grad_norm": 1.0891891717910767, + "learning_rate": 4.808624874949954e-05, + "loss": 5.4889, + "step": 21079 + }, + { + "epoch": 0.12536873156342182, + "grad_norm": 1.976110577583313, + "learning_rate": 4.808606951093595e-05, + "loss": 5.6103, + "step": 21080 + }, + { + "epoch": 0.12537467884670284, + "grad_norm": 1.3253698348999023, + "learning_rate": 4.808589026431324e-05, + "loss": 5.4673, + "step": 21081 + }, + { + "epoch": 0.12538062612998382, + "grad_norm": 1.4394372701644897, + "learning_rate": 4.808571100963149e-05, + "loss": 5.5256, + "step": 21082 + }, + { + "epoch": 0.1253865734132648, + "grad_norm": 1.45836341381073, + "learning_rate": 4.808553174689076e-05, + "loss": 4.5206, + "step": 21083 + }, + { + "epoch": 0.12539252069654583, + "grad_norm": 1.5719448328018188, + "learning_rate": 4.8085352476091105e-05, + "loss": 4.0577, + "step": 21084 + }, + { + "epoch": 0.12539846797982682, + "grad_norm": 1.3744319677352905, + "learning_rate": 4.808517319723259e-05, + "loss": 4.3965, + "step": 21085 + }, + { + "epoch": 0.1254044152631078, + "grad_norm": 1.4404634237289429, + "learning_rate": 4.8084993910315286e-05, + "loss": 4.3534, + "step": 21086 + }, + { + "epoch": 0.12541036254638882, + "grad_norm": 1.696215033531189, + "learning_rate": 4.8084814615339244e-05, + "loss": 5.4743, + "step": 21087 + }, + { + "epoch": 0.1254163098296698, + "grad_norm": 2.3401246070861816, + "learning_rate": 4.808463531230454e-05, + "loss": 4.3249, + "step": 21088 + }, + { + "epoch": 0.1254222571129508, + "grad_norm": 2.673963785171509, + "learning_rate": 4.808445600121122e-05, + "loss": 4.0038, + "step": 21089 + }, + { + "epoch": 0.1254282043962318, + "grad_norm": 2.551712989807129, + "learning_rate": 4.808427668205935e-05, + "loss": 4.0593, + "step": 21090 + }, + { + "epoch": 0.1254341516795128, + "grad_norm": 2.224776029586792, + "learning_rate": 4.8084097354849004e-05, + "loss": 4.4923, + "step": 21091 + }, + { + "epoch": 0.1254400989627938, + "grad_norm": 2.8964626789093018, + "learning_rate": 4.808391801958024e-05, + "loss": 4.8955, + "step": 21092 + }, + { + "epoch": 0.1254460462460748, + "grad_norm": 2.647202491760254, + "learning_rate": 4.808373867625312e-05, + "loss": 4.315, + "step": 21093 + }, + { + "epoch": 0.1254519935293558, + "grad_norm": 2.852851152420044, + "learning_rate": 4.80835593248677e-05, + "loss": 4.6153, + "step": 21094 + }, + { + "epoch": 0.12545794081263678, + "grad_norm": 1.5732487440109253, + "learning_rate": 4.808337996542405e-05, + "loss": 5.7685, + "step": 21095 + }, + { + "epoch": 0.1254638880959178, + "grad_norm": 1.764635682106018, + "learning_rate": 4.808320059792223e-05, + "loss": 5.8056, + "step": 21096 + }, + { + "epoch": 0.12546983537919879, + "grad_norm": 3.040402889251709, + "learning_rate": 4.80830212223623e-05, + "loss": 4.3029, + "step": 21097 + }, + { + "epoch": 0.12547578266247977, + "grad_norm": 2.3675732612609863, + "learning_rate": 4.8082841838744335e-05, + "loss": 4.2356, + "step": 21098 + }, + { + "epoch": 0.1254817299457608, + "grad_norm": 2.153254747390747, + "learning_rate": 4.808266244706838e-05, + "loss": 4.1071, + "step": 21099 + }, + { + "epoch": 0.12548767722904178, + "grad_norm": 2.181788921356201, + "learning_rate": 4.808248304733451e-05, + "loss": 4.1941, + "step": 21100 + }, + { + "epoch": 0.12549362451232277, + "grad_norm": 2.416555881500244, + "learning_rate": 4.808230363954278e-05, + "loss": 4.0926, + "step": 21101 + }, + { + "epoch": 0.12549957179560378, + "grad_norm": 1.7010666131973267, + "learning_rate": 4.808212422369327e-05, + "loss": 5.3639, + "step": 21102 + }, + { + "epoch": 0.12550551907888477, + "grad_norm": 1.4592742919921875, + "learning_rate": 4.808194479978601e-05, + "loss": 5.5641, + "step": 21103 + }, + { + "epoch": 0.12551146636216576, + "grad_norm": 1.5593754053115845, + "learning_rate": 4.808176536782109e-05, + "loss": 5.4008, + "step": 21104 + }, + { + "epoch": 0.12551741364544677, + "grad_norm": 1.7061179876327515, + "learning_rate": 4.8081585927798565e-05, + "loss": 5.6922, + "step": 21105 + }, + { + "epoch": 0.12552336092872776, + "grad_norm": 1.8220082521438599, + "learning_rate": 4.808140647971849e-05, + "loss": 5.4052, + "step": 21106 + }, + { + "epoch": 0.12552930821200875, + "grad_norm": 1.5218451023101807, + "learning_rate": 4.808122702358095e-05, + "loss": 5.4067, + "step": 21107 + }, + { + "epoch": 0.12553525549528974, + "grad_norm": 1.6590322256088257, + "learning_rate": 4.808104755938598e-05, + "loss": 5.5558, + "step": 21108 + }, + { + "epoch": 0.12554120277857075, + "grad_norm": 1.751290202140808, + "learning_rate": 4.808086808713366e-05, + "loss": 5.5584, + "step": 21109 + }, + { + "epoch": 0.12554715006185174, + "grad_norm": 1.6635403633117676, + "learning_rate": 4.8080688606824035e-05, + "loss": 5.4828, + "step": 21110 + }, + { + "epoch": 0.12555309734513273, + "grad_norm": 1.4710462093353271, + "learning_rate": 4.80805091184572e-05, + "loss": 5.4251, + "step": 21111 + }, + { + "epoch": 0.12555904462841375, + "grad_norm": 1.7598154544830322, + "learning_rate": 4.808032962203318e-05, + "loss": 5.5093, + "step": 21112 + }, + { + "epoch": 0.12556499191169473, + "grad_norm": 1.5128235816955566, + "learning_rate": 4.8080150117552057e-05, + "loss": 5.5069, + "step": 21113 + }, + { + "epoch": 0.12557093919497572, + "grad_norm": 1.5336002111434937, + "learning_rate": 4.80799706050139e-05, + "loss": 5.461, + "step": 21114 + }, + { + "epoch": 0.12557688647825674, + "grad_norm": 1.80903160572052, + "learning_rate": 4.807979108441876e-05, + "loss": 5.5894, + "step": 21115 + }, + { + "epoch": 0.12558283376153773, + "grad_norm": 1.8075919151306152, + "learning_rate": 4.8079611555766706e-05, + "loss": 5.4132, + "step": 21116 + }, + { + "epoch": 0.12558878104481871, + "grad_norm": 1.8319743871688843, + "learning_rate": 4.8079432019057794e-05, + "loss": 5.4409, + "step": 21117 + }, + { + "epoch": 0.12559472832809973, + "grad_norm": 1.7753643989562988, + "learning_rate": 4.8079252474292095e-05, + "loss": 5.425, + "step": 21118 + }, + { + "epoch": 0.12560067561138072, + "grad_norm": 1.614693522453308, + "learning_rate": 4.807907292146967e-05, + "loss": 5.2583, + "step": 21119 + }, + { + "epoch": 0.1256066228946617, + "grad_norm": 1.7520705461502075, + "learning_rate": 4.807889336059057e-05, + "loss": 5.5297, + "step": 21120 + }, + { + "epoch": 0.12561257017794272, + "grad_norm": 1.478826642036438, + "learning_rate": 4.8078713791654875e-05, + "loss": 5.8051, + "step": 21121 + }, + { + "epoch": 0.1256185174612237, + "grad_norm": 1.5645164251327515, + "learning_rate": 4.807853421466263e-05, + "loss": 5.6658, + "step": 21122 + }, + { + "epoch": 0.1256244647445047, + "grad_norm": 1.6254135370254517, + "learning_rate": 4.807835462961392e-05, + "loss": 5.2885, + "step": 21123 + }, + { + "epoch": 0.12563041202778572, + "grad_norm": 1.4290140867233276, + "learning_rate": 4.807817503650879e-05, + "loss": 5.6284, + "step": 21124 + }, + { + "epoch": 0.1256363593110667, + "grad_norm": 1.541447401046753, + "learning_rate": 4.8077995435347304e-05, + "loss": 5.8538, + "step": 21125 + }, + { + "epoch": 0.1256423065943477, + "grad_norm": 1.4778785705566406, + "learning_rate": 4.8077815826129526e-05, + "loss": 5.7019, + "step": 21126 + }, + { + "epoch": 0.1256482538776287, + "grad_norm": 1.5369840860366821, + "learning_rate": 4.807763620885552e-05, + "loss": 5.7164, + "step": 21127 + }, + { + "epoch": 0.1256542011609097, + "grad_norm": 1.5266817808151245, + "learning_rate": 4.807745658352536e-05, + "loss": 5.6203, + "step": 21128 + }, + { + "epoch": 0.12566014844419068, + "grad_norm": 1.4452829360961914, + "learning_rate": 4.8077276950139085e-05, + "loss": 5.7994, + "step": 21129 + }, + { + "epoch": 0.1256660957274717, + "grad_norm": 1.3619974851608276, + "learning_rate": 4.8077097308696786e-05, + "loss": 5.6703, + "step": 21130 + }, + { + "epoch": 0.1256720430107527, + "grad_norm": 1.1146374940872192, + "learning_rate": 4.80769176591985e-05, + "loss": 5.6631, + "step": 21131 + }, + { + "epoch": 0.12567799029403368, + "grad_norm": 1.2224622964859009, + "learning_rate": 4.8076738001644305e-05, + "loss": 5.5511, + "step": 21132 + }, + { + "epoch": 0.1256839375773147, + "grad_norm": 1.530564308166504, + "learning_rate": 4.807655833603426e-05, + "loss": 5.6201, + "step": 21133 + }, + { + "epoch": 0.12568988486059568, + "grad_norm": 1.5123308897018433, + "learning_rate": 4.807637866236842e-05, + "loss": 5.3411, + "step": 21134 + }, + { + "epoch": 0.12569583214387667, + "grad_norm": 1.4682310819625854, + "learning_rate": 4.807619898064686e-05, + "loss": 5.7009, + "step": 21135 + }, + { + "epoch": 0.12570177942715768, + "grad_norm": 1.7714731693267822, + "learning_rate": 4.8076019290869634e-05, + "loss": 5.8286, + "step": 21136 + }, + { + "epoch": 0.12570772671043867, + "grad_norm": 1.6663479804992676, + "learning_rate": 4.8075839593036814e-05, + "loss": 5.8158, + "step": 21137 + }, + { + "epoch": 0.12571367399371966, + "grad_norm": 1.458070158958435, + "learning_rate": 4.8075659887148454e-05, + "loss": 5.6954, + "step": 21138 + }, + { + "epoch": 0.12571962127700068, + "grad_norm": 2.572174072265625, + "learning_rate": 4.807548017320462e-05, + "loss": 4.715, + "step": 21139 + }, + { + "epoch": 0.12572556856028166, + "grad_norm": 2.4615628719329834, + "learning_rate": 4.8075300451205375e-05, + "loss": 4.8458, + "step": 21140 + }, + { + "epoch": 0.12573151584356265, + "grad_norm": 2.193739175796509, + "learning_rate": 4.807512072115078e-05, + "loss": 4.8746, + "step": 21141 + }, + { + "epoch": 0.12573746312684367, + "grad_norm": 1.9279803037643433, + "learning_rate": 4.80749409830409e-05, + "loss": 5.3174, + "step": 21142 + }, + { + "epoch": 0.12574341041012466, + "grad_norm": 2.0332345962524414, + "learning_rate": 4.807476123687579e-05, + "loss": 4.6696, + "step": 21143 + }, + { + "epoch": 0.12574935769340564, + "grad_norm": 2.1900224685668945, + "learning_rate": 4.8074581482655525e-05, + "loss": 4.7911, + "step": 21144 + }, + { + "epoch": 0.12575530497668666, + "grad_norm": 2.1232707500457764, + "learning_rate": 4.807440172038016e-05, + "loss": 4.4891, + "step": 21145 + }, + { + "epoch": 0.12576125225996765, + "grad_norm": 2.2046613693237305, + "learning_rate": 4.807422195004976e-05, + "loss": 5.1136, + "step": 21146 + }, + { + "epoch": 0.12576719954324864, + "grad_norm": 1.9693876504898071, + "learning_rate": 4.807404217166439e-05, + "loss": 5.7068, + "step": 21147 + }, + { + "epoch": 0.12577314682652965, + "grad_norm": 1.8561034202575684, + "learning_rate": 4.807386238522411e-05, + "loss": 5.6435, + "step": 21148 + }, + { + "epoch": 0.12577909410981064, + "grad_norm": 1.7676606178283691, + "learning_rate": 4.8073682590728974e-05, + "loss": 5.0934, + "step": 21149 + }, + { + "epoch": 0.12578504139309163, + "grad_norm": 1.729425311088562, + "learning_rate": 4.8073502788179064e-05, + "loss": 5.4891, + "step": 21150 + }, + { + "epoch": 0.12579098867637264, + "grad_norm": 1.5410076379776, + "learning_rate": 4.807332297757443e-05, + "loss": 5.919, + "step": 21151 + }, + { + "epoch": 0.12579693595965363, + "grad_norm": 1.5089081525802612, + "learning_rate": 4.8073143158915134e-05, + "loss": 5.9701, + "step": 21152 + }, + { + "epoch": 0.12580288324293462, + "grad_norm": 1.476559042930603, + "learning_rate": 4.807296333220125e-05, + "loss": 5.7351, + "step": 21153 + }, + { + "epoch": 0.12580883052621564, + "grad_norm": 2.055143117904663, + "learning_rate": 4.807278349743283e-05, + "loss": 5.4949, + "step": 21154 + }, + { + "epoch": 0.12581477780949663, + "grad_norm": 1.5232601165771484, + "learning_rate": 4.807260365460994e-05, + "loss": 5.3052, + "step": 21155 + }, + { + "epoch": 0.1258207250927776, + "grad_norm": 1.832310676574707, + "learning_rate": 4.807242380373264e-05, + "loss": 5.2832, + "step": 21156 + }, + { + "epoch": 0.12582667237605863, + "grad_norm": 1.8327937126159668, + "learning_rate": 4.807224394480099e-05, + "loss": 5.482, + "step": 21157 + }, + { + "epoch": 0.12583261965933962, + "grad_norm": 1.7728074789047241, + "learning_rate": 4.8072064077815065e-05, + "loss": 5.2636, + "step": 21158 + }, + { + "epoch": 0.1258385669426206, + "grad_norm": 1.6927982568740845, + "learning_rate": 4.8071884202774916e-05, + "loss": 5.369, + "step": 21159 + }, + { + "epoch": 0.12584451422590162, + "grad_norm": 1.8296928405761719, + "learning_rate": 4.8071704319680616e-05, + "loss": 5.4939, + "step": 21160 + }, + { + "epoch": 0.1258504615091826, + "grad_norm": 1.5497393608093262, + "learning_rate": 4.8071524428532224e-05, + "loss": 5.1909, + "step": 21161 + }, + { + "epoch": 0.1258564087924636, + "grad_norm": 1.8332972526550293, + "learning_rate": 4.807134452932979e-05, + "loss": 5.1555, + "step": 21162 + }, + { + "epoch": 0.1258623560757446, + "grad_norm": 1.856772780418396, + "learning_rate": 4.80711646220734e-05, + "loss": 5.1182, + "step": 21163 + }, + { + "epoch": 0.1258683033590256, + "grad_norm": 1.6313568353652954, + "learning_rate": 4.80709847067631e-05, + "loss": 5.0921, + "step": 21164 + }, + { + "epoch": 0.1258742506423066, + "grad_norm": 1.6753991842269897, + "learning_rate": 4.807080478339896e-05, + "loss": 5.1176, + "step": 21165 + }, + { + "epoch": 0.12588019792558758, + "grad_norm": 1.554154396057129, + "learning_rate": 4.807062485198104e-05, + "loss": 5.0849, + "step": 21166 + }, + { + "epoch": 0.1258861452088686, + "grad_norm": 1.9408693313598633, + "learning_rate": 4.8070444912509394e-05, + "loss": 4.9181, + "step": 21167 + }, + { + "epoch": 0.12589209249214958, + "grad_norm": 1.7222824096679688, + "learning_rate": 4.80702649649841e-05, + "loss": 5.6235, + "step": 21168 + }, + { + "epoch": 0.12589803977543057, + "grad_norm": 1.8301146030426025, + "learning_rate": 4.807008500940522e-05, + "loss": 5.3885, + "step": 21169 + }, + { + "epoch": 0.1259039870587116, + "grad_norm": 1.7527635097503662, + "learning_rate": 4.806990504577281e-05, + "loss": 5.3772, + "step": 21170 + }, + { + "epoch": 0.12590993434199257, + "grad_norm": 1.7983075380325317, + "learning_rate": 4.806972507408693e-05, + "loss": 5.7616, + "step": 21171 + }, + { + "epoch": 0.12591588162527356, + "grad_norm": 1.6842983961105347, + "learning_rate": 4.8069545094347653e-05, + "loss": 5.8808, + "step": 21172 + }, + { + "epoch": 0.12592182890855458, + "grad_norm": 1.8382412195205688, + "learning_rate": 4.806936510655503e-05, + "loss": 5.4304, + "step": 21173 + }, + { + "epoch": 0.12592777619183557, + "grad_norm": 1.833301305770874, + "learning_rate": 4.8069185110709133e-05, + "loss": 5.4221, + "step": 21174 + }, + { + "epoch": 0.12593372347511655, + "grad_norm": 1.52051842212677, + "learning_rate": 4.8069005106810025e-05, + "loss": 5.4133, + "step": 21175 + }, + { + "epoch": 0.12593967075839757, + "grad_norm": 1.5269474983215332, + "learning_rate": 4.806882509485776e-05, + "loss": 5.5549, + "step": 21176 + }, + { + "epoch": 0.12594561804167856, + "grad_norm": 1.8116832971572876, + "learning_rate": 4.806864507485241e-05, + "loss": 5.2989, + "step": 21177 + }, + { + "epoch": 0.12595156532495955, + "grad_norm": 1.7355883121490479, + "learning_rate": 4.806846504679403e-05, + "loss": 5.3839, + "step": 21178 + }, + { + "epoch": 0.12595751260824056, + "grad_norm": 1.7445424795150757, + "learning_rate": 4.806828501068269e-05, + "loss": 4.982, + "step": 21179 + }, + { + "epoch": 0.12596345989152155, + "grad_norm": 2.445030689239502, + "learning_rate": 4.806810496651845e-05, + "loss": 4.2665, + "step": 21180 + }, + { + "epoch": 0.12596940717480254, + "grad_norm": 2.6840837001800537, + "learning_rate": 4.8067924914301377e-05, + "loss": 3.9739, + "step": 21181 + }, + { + "epoch": 0.12597535445808355, + "grad_norm": 2.431506872177124, + "learning_rate": 4.806774485403153e-05, + "loss": 3.9235, + "step": 21182 + }, + { + "epoch": 0.12598130174136454, + "grad_norm": 3.124319076538086, + "learning_rate": 4.806756478570896e-05, + "loss": 3.7692, + "step": 21183 + }, + { + "epoch": 0.12598724902464553, + "grad_norm": 2.8702549934387207, + "learning_rate": 4.806738470933375e-05, + "loss": 3.6848, + "step": 21184 + }, + { + "epoch": 0.12599319630792655, + "grad_norm": 2.6687517166137695, + "learning_rate": 4.8067204624905954e-05, + "loss": 3.5655, + "step": 21185 + }, + { + "epoch": 0.12599914359120754, + "grad_norm": 2.3944084644317627, + "learning_rate": 4.806702453242563e-05, + "loss": 3.6176, + "step": 21186 + }, + { + "epoch": 0.12600509087448852, + "grad_norm": 2.565718173980713, + "learning_rate": 4.8066844431892856e-05, + "loss": 3.6557, + "step": 21187 + }, + { + "epoch": 0.12601103815776954, + "grad_norm": 2.9165117740631104, + "learning_rate": 4.806666432330768e-05, + "loss": 3.4013, + "step": 21188 + }, + { + "epoch": 0.12601698544105053, + "grad_norm": 3.232210397720337, + "learning_rate": 4.806648420667017e-05, + "loss": 4.8954, + "step": 21189 + }, + { + "epoch": 0.12602293272433152, + "grad_norm": 3.2784297466278076, + "learning_rate": 4.8066304081980384e-05, + "loss": 4.7801, + "step": 21190 + }, + { + "epoch": 0.12602888000761253, + "grad_norm": 2.8707523345947266, + "learning_rate": 4.8066123949238396e-05, + "loss": 4.7461, + "step": 21191 + }, + { + "epoch": 0.12603482729089352, + "grad_norm": 2.3808538913726807, + "learning_rate": 4.8065943808444255e-05, + "loss": 4.5148, + "step": 21192 + }, + { + "epoch": 0.1260407745741745, + "grad_norm": 2.2710814476013184, + "learning_rate": 4.806576365959804e-05, + "loss": 4.522, + "step": 21193 + }, + { + "epoch": 0.12604672185745552, + "grad_norm": 2.2108187675476074, + "learning_rate": 4.80655835026998e-05, + "loss": 4.7575, + "step": 21194 + }, + { + "epoch": 0.1260526691407365, + "grad_norm": 2.1496641635894775, + "learning_rate": 4.80654033377496e-05, + "loss": 4.6543, + "step": 21195 + }, + { + "epoch": 0.1260586164240175, + "grad_norm": 1.9770373106002808, + "learning_rate": 4.806522316474752e-05, + "loss": 4.59, + "step": 21196 + }, + { + "epoch": 0.12606456370729852, + "grad_norm": 1.8799597024917603, + "learning_rate": 4.80650429836936e-05, + "loss": 4.598, + "step": 21197 + }, + { + "epoch": 0.1260705109905795, + "grad_norm": 1.846724510192871, + "learning_rate": 4.8064862794587903e-05, + "loss": 4.4912, + "step": 21198 + }, + { + "epoch": 0.1260764582738605, + "grad_norm": 1.7821966409683228, + "learning_rate": 4.806468259743051e-05, + "loss": 4.4898, + "step": 21199 + }, + { + "epoch": 0.1260824055571415, + "grad_norm": 1.7804360389709473, + "learning_rate": 4.806450239222148e-05, + "loss": 4.5324, + "step": 21200 + }, + { + "epoch": 0.1260883528404225, + "grad_norm": 1.705761194229126, + "learning_rate": 4.8064322178960864e-05, + "loss": 4.7046, + "step": 21201 + }, + { + "epoch": 0.12609430012370348, + "grad_norm": 2.41103458404541, + "learning_rate": 4.8064141957648726e-05, + "loss": 5.1943, + "step": 21202 + }, + { + "epoch": 0.1261002474069845, + "grad_norm": 2.3028182983398438, + "learning_rate": 4.806396172828515e-05, + "loss": 5.0494, + "step": 21203 + }, + { + "epoch": 0.1261061946902655, + "grad_norm": 2.1674535274505615, + "learning_rate": 4.806378149087016e-05, + "loss": 5.3104, + "step": 21204 + }, + { + "epoch": 0.12611214197354648, + "grad_norm": 1.9217156171798706, + "learning_rate": 4.8063601245403864e-05, + "loss": 5.2403, + "step": 21205 + }, + { + "epoch": 0.1261180892568275, + "grad_norm": 2.097116231918335, + "learning_rate": 4.806342099188629e-05, + "loss": 5.3471, + "step": 21206 + }, + { + "epoch": 0.12612403654010848, + "grad_norm": 1.8356170654296875, + "learning_rate": 4.806324073031751e-05, + "loss": 5.2168, + "step": 21207 + }, + { + "epoch": 0.12612998382338947, + "grad_norm": 2.2306652069091797, + "learning_rate": 4.806306046069761e-05, + "loss": 5.1406, + "step": 21208 + }, + { + "epoch": 0.12613593110667048, + "grad_norm": 1.8946762084960938, + "learning_rate": 4.8062880183026624e-05, + "loss": 5.072, + "step": 21209 + }, + { + "epoch": 0.12614187838995147, + "grad_norm": 2.0963854789733887, + "learning_rate": 4.806269989730462e-05, + "loss": 5.2702, + "step": 21210 + }, + { + "epoch": 0.12614782567323246, + "grad_norm": 1.859677791595459, + "learning_rate": 4.806251960353167e-05, + "loss": 5.1133, + "step": 21211 + }, + { + "epoch": 0.12615377295651348, + "grad_norm": 1.9993607997894287, + "learning_rate": 4.806233930170783e-05, + "loss": 5.1201, + "step": 21212 + }, + { + "epoch": 0.12615972023979447, + "grad_norm": 1.7218701839447021, + "learning_rate": 4.8062158991833176e-05, + "loss": 5.0055, + "step": 21213 + }, + { + "epoch": 0.12616566752307545, + "grad_norm": 1.9172027111053467, + "learning_rate": 4.806197867390775e-05, + "loss": 4.955, + "step": 21214 + }, + { + "epoch": 0.12617161480635647, + "grad_norm": 2.0665276050567627, + "learning_rate": 4.8061798347931627e-05, + "loss": 4.842, + "step": 21215 + }, + { + "epoch": 0.12617756208963746, + "grad_norm": 1.932822346687317, + "learning_rate": 4.806161801390486e-05, + "loss": 4.5687, + "step": 21216 + }, + { + "epoch": 0.12618350937291845, + "grad_norm": 1.7978770732879639, + "learning_rate": 4.806143767182754e-05, + "loss": 4.6994, + "step": 21217 + }, + { + "epoch": 0.12618945665619946, + "grad_norm": 1.9298393726348877, + "learning_rate": 4.80612573216997e-05, + "loss": 4.8935, + "step": 21218 + }, + { + "epoch": 0.12619540393948045, + "grad_norm": 1.8706467151641846, + "learning_rate": 4.806107696352141e-05, + "loss": 4.699, + "step": 21219 + }, + { + "epoch": 0.12620135122276144, + "grad_norm": 1.946582317352295, + "learning_rate": 4.806089659729274e-05, + "loss": 4.9519, + "step": 21220 + }, + { + "epoch": 0.12620729850604245, + "grad_norm": 2.1021311283111572, + "learning_rate": 4.806071622301375e-05, + "loss": 4.8315, + "step": 21221 + }, + { + "epoch": 0.12621324578932344, + "grad_norm": 2.110234022140503, + "learning_rate": 4.8060535840684504e-05, + "loss": 4.6524, + "step": 21222 + }, + { + "epoch": 0.12621919307260443, + "grad_norm": 2.1723785400390625, + "learning_rate": 4.806035545030506e-05, + "loss": 4.7154, + "step": 21223 + }, + { + "epoch": 0.12622514035588542, + "grad_norm": 1.8978101015090942, + "learning_rate": 4.806017505187548e-05, + "loss": 4.6743, + "step": 21224 + }, + { + "epoch": 0.12623108763916643, + "grad_norm": 2.0092225074768066, + "learning_rate": 4.8059994645395833e-05, + "loss": 4.9198, + "step": 21225 + }, + { + "epoch": 0.12623703492244742, + "grad_norm": 1.935624122619629, + "learning_rate": 4.8059814230866184e-05, + "loss": 4.7253, + "step": 21226 + }, + { + "epoch": 0.1262429822057284, + "grad_norm": 1.9758509397506714, + "learning_rate": 4.80596338082866e-05, + "loss": 4.6388, + "step": 21227 + }, + { + "epoch": 0.12624892948900943, + "grad_norm": 2.0389976501464844, + "learning_rate": 4.805945337765712e-05, + "loss": 4.7527, + "step": 21228 + }, + { + "epoch": 0.12625487677229041, + "grad_norm": 2.0781445503234863, + "learning_rate": 4.805927293897783e-05, + "loss": 4.7985, + "step": 21229 + }, + { + "epoch": 0.1262608240555714, + "grad_norm": 2.0403099060058594, + "learning_rate": 4.8059092492248786e-05, + "loss": 5.1442, + "step": 21230 + }, + { + "epoch": 0.12626677133885242, + "grad_norm": 2.141681432723999, + "learning_rate": 4.805891203747005e-05, + "loss": 5.1191, + "step": 21231 + }, + { + "epoch": 0.1262727186221334, + "grad_norm": 2.159761905670166, + "learning_rate": 4.805873157464169e-05, + "loss": 5.2995, + "step": 21232 + }, + { + "epoch": 0.1262786659054144, + "grad_norm": 2.568081855773926, + "learning_rate": 4.805855110376376e-05, + "loss": 5.4263, + "step": 21233 + }, + { + "epoch": 0.1262846131886954, + "grad_norm": 1.8911200761795044, + "learning_rate": 4.8058370624836336e-05, + "loss": 5.3457, + "step": 21234 + }, + { + "epoch": 0.1262905604719764, + "grad_norm": 2.3370580673217773, + "learning_rate": 4.805819013785946e-05, + "loss": 4.8342, + "step": 21235 + }, + { + "epoch": 0.1262965077552574, + "grad_norm": 2.669029474258423, + "learning_rate": 4.805800964283322e-05, + "loss": 4.9175, + "step": 21236 + }, + { + "epoch": 0.1263024550385384, + "grad_norm": 1.9824459552764893, + "learning_rate": 4.8057829139757657e-05, + "loss": 4.6509, + "step": 21237 + }, + { + "epoch": 0.1263084023218194, + "grad_norm": 1.9576833248138428, + "learning_rate": 4.805764862863286e-05, + "loss": 5.4197, + "step": 21238 + }, + { + "epoch": 0.12631434960510038, + "grad_norm": 1.9594717025756836, + "learning_rate": 4.805746810945886e-05, + "loss": 5.7506, + "step": 21239 + }, + { + "epoch": 0.1263202968883814, + "grad_norm": 2.063676357269287, + "learning_rate": 4.8057287582235746e-05, + "loss": 5.6675, + "step": 21240 + }, + { + "epoch": 0.12632624417166238, + "grad_norm": 1.9354885816574097, + "learning_rate": 4.805710704696356e-05, + "loss": 5.1697, + "step": 21241 + }, + { + "epoch": 0.12633219145494337, + "grad_norm": 1.9859137535095215, + "learning_rate": 4.8056926503642384e-05, + "loss": 4.9055, + "step": 21242 + }, + { + "epoch": 0.1263381387382244, + "grad_norm": 2.1015024185180664, + "learning_rate": 4.805674595227228e-05, + "loss": 4.4961, + "step": 21243 + }, + { + "epoch": 0.12634408602150538, + "grad_norm": 2.225673198699951, + "learning_rate": 4.805656539285329e-05, + "loss": 4.2943, + "step": 21244 + }, + { + "epoch": 0.12635003330478636, + "grad_norm": 1.9753731489181519, + "learning_rate": 4.8056384825385495e-05, + "loss": 4.401, + "step": 21245 + }, + { + "epoch": 0.12635598058806738, + "grad_norm": 1.693865180015564, + "learning_rate": 4.805620424986896e-05, + "loss": 4.2992, + "step": 21246 + }, + { + "epoch": 0.12636192787134837, + "grad_norm": 2.0757269859313965, + "learning_rate": 4.805602366630374e-05, + "loss": 4.4564, + "step": 21247 + }, + { + "epoch": 0.12636787515462936, + "grad_norm": 1.559611201286316, + "learning_rate": 4.80558430746899e-05, + "loss": 5.95, + "step": 21248 + }, + { + "epoch": 0.12637382243791037, + "grad_norm": 1.7863824367523193, + "learning_rate": 4.80556624750275e-05, + "loss": 5.2208, + "step": 21249 + }, + { + "epoch": 0.12637976972119136, + "grad_norm": 1.7766302824020386, + "learning_rate": 4.805548186731661e-05, + "loss": 4.9666, + "step": 21250 + }, + { + "epoch": 0.12638571700447235, + "grad_norm": 1.5633225440979004, + "learning_rate": 4.805530125155728e-05, + "loss": 4.7051, + "step": 21251 + }, + { + "epoch": 0.12639166428775336, + "grad_norm": 1.795332431793213, + "learning_rate": 4.80551206277496e-05, + "loss": 4.624, + "step": 21252 + }, + { + "epoch": 0.12639761157103435, + "grad_norm": 2.2065796852111816, + "learning_rate": 4.805493999589361e-05, + "loss": 4.2034, + "step": 21253 + }, + { + "epoch": 0.12640355885431534, + "grad_norm": 2.0833165645599365, + "learning_rate": 4.805475935598937e-05, + "loss": 4.3267, + "step": 21254 + }, + { + "epoch": 0.12640950613759636, + "grad_norm": 2.591543436050415, + "learning_rate": 4.8054578708036954e-05, + "loss": 4.5015, + "step": 21255 + }, + { + "epoch": 0.12641545342087734, + "grad_norm": 1.7929967641830444, + "learning_rate": 4.805439805203643e-05, + "loss": 5.1193, + "step": 21256 + }, + { + "epoch": 0.12642140070415833, + "grad_norm": 1.632691740989685, + "learning_rate": 4.805421738798785e-05, + "loss": 4.728, + "step": 21257 + }, + { + "epoch": 0.12642734798743935, + "grad_norm": 1.844673752784729, + "learning_rate": 4.8054036715891284e-05, + "loss": 4.8617, + "step": 21258 + }, + { + "epoch": 0.12643329527072034, + "grad_norm": 1.7764726877212524, + "learning_rate": 4.805385603574678e-05, + "loss": 5.0102, + "step": 21259 + }, + { + "epoch": 0.12643924255400132, + "grad_norm": 1.7257095575332642, + "learning_rate": 4.8053675347554425e-05, + "loss": 5.4136, + "step": 21260 + }, + { + "epoch": 0.12644518983728234, + "grad_norm": 1.9378974437713623, + "learning_rate": 4.805349465131427e-05, + "loss": 4.8102, + "step": 21261 + }, + { + "epoch": 0.12645113712056333, + "grad_norm": 2.1207330226898193, + "learning_rate": 4.805331394702637e-05, + "loss": 5.137, + "step": 21262 + }, + { + "epoch": 0.12645708440384432, + "grad_norm": 2.630957841873169, + "learning_rate": 4.8053133234690806e-05, + "loss": 3.9948, + "step": 21263 + }, + { + "epoch": 0.12646303168712533, + "grad_norm": 2.5051863193511963, + "learning_rate": 4.805295251430762e-05, + "loss": 3.7358, + "step": 21264 + }, + { + "epoch": 0.12646897897040632, + "grad_norm": 2.4558019638061523, + "learning_rate": 4.805277178587689e-05, + "loss": 4.1314, + "step": 21265 + }, + { + "epoch": 0.1264749262536873, + "grad_norm": 2.1878461837768555, + "learning_rate": 4.805259104939869e-05, + "loss": 5.1189, + "step": 21266 + }, + { + "epoch": 0.12648087353696832, + "grad_norm": 2.303126811981201, + "learning_rate": 4.805241030487305e-05, + "loss": 4.4202, + "step": 21267 + }, + { + "epoch": 0.1264868208202493, + "grad_norm": 2.4533417224884033, + "learning_rate": 4.805222955230006e-05, + "loss": 4.4752, + "step": 21268 + }, + { + "epoch": 0.1264927681035303, + "grad_norm": 2.4850356578826904, + "learning_rate": 4.805204879167977e-05, + "loss": 4.0938, + "step": 21269 + }, + { + "epoch": 0.12649871538681132, + "grad_norm": 2.622119665145874, + "learning_rate": 4.805186802301226e-05, + "loss": 3.5693, + "step": 21270 + }, + { + "epoch": 0.1265046626700923, + "grad_norm": 2.5546908378601074, + "learning_rate": 4.8051687246297574e-05, + "loss": 4.1895, + "step": 21271 + }, + { + "epoch": 0.1265106099533733, + "grad_norm": 2.6318092346191406, + "learning_rate": 4.805150646153578e-05, + "loss": 4.5214, + "step": 21272 + }, + { + "epoch": 0.1265165572366543, + "grad_norm": 2.380413770675659, + "learning_rate": 4.805132566872694e-05, + "loss": 4.601, + "step": 21273 + }, + { + "epoch": 0.1265225045199353, + "grad_norm": 2.652449369430542, + "learning_rate": 4.805114486787112e-05, + "loss": 4.7164, + "step": 21274 + }, + { + "epoch": 0.12652845180321629, + "grad_norm": 2.6453335285186768, + "learning_rate": 4.8050964058968394e-05, + "loss": 4.8007, + "step": 21275 + }, + { + "epoch": 0.1265343990864973, + "grad_norm": 2.226515054702759, + "learning_rate": 4.8050783242018805e-05, + "loss": 4.7653, + "step": 21276 + }, + { + "epoch": 0.1265403463697783, + "grad_norm": 2.678157091140747, + "learning_rate": 4.805060241702243e-05, + "loss": 4.8511, + "step": 21277 + }, + { + "epoch": 0.12654629365305928, + "grad_norm": 2.2161943912506104, + "learning_rate": 4.8050421583979324e-05, + "loss": 4.6734, + "step": 21278 + }, + { + "epoch": 0.1265522409363403, + "grad_norm": 2.242539882659912, + "learning_rate": 4.805024074288956e-05, + "loss": 4.5445, + "step": 21279 + }, + { + "epoch": 0.12655818821962128, + "grad_norm": 1.9599577188491821, + "learning_rate": 4.805005989375319e-05, + "loss": 4.7331, + "step": 21280 + }, + { + "epoch": 0.12656413550290227, + "grad_norm": 2.1399378776550293, + "learning_rate": 4.8049879036570286e-05, + "loss": 4.1747, + "step": 21281 + }, + { + "epoch": 0.12657008278618326, + "grad_norm": 2.202322244644165, + "learning_rate": 4.8049698171340904e-05, + "loss": 4.3195, + "step": 21282 + }, + { + "epoch": 0.12657603006946427, + "grad_norm": 2.071727991104126, + "learning_rate": 4.8049517298065115e-05, + "loss": 4.3142, + "step": 21283 + }, + { + "epoch": 0.12658197735274526, + "grad_norm": 1.8801134824752808, + "learning_rate": 4.8049336416742974e-05, + "loss": 4.2353, + "step": 21284 + }, + { + "epoch": 0.12658792463602625, + "grad_norm": 1.8937469720840454, + "learning_rate": 4.804915552737455e-05, + "loss": 4.1141, + "step": 21285 + }, + { + "epoch": 0.12659387191930727, + "grad_norm": 1.8500044345855713, + "learning_rate": 4.8048974629959906e-05, + "loss": 4.0509, + "step": 21286 + }, + { + "epoch": 0.12659981920258825, + "grad_norm": 1.8931934833526611, + "learning_rate": 4.8048793724499095e-05, + "loss": 4.1905, + "step": 21287 + }, + { + "epoch": 0.12660576648586924, + "grad_norm": 1.6579469442367554, + "learning_rate": 4.8048612810992196e-05, + "loss": 4.8032, + "step": 21288 + }, + { + "epoch": 0.12661171376915026, + "grad_norm": 1.7402268648147583, + "learning_rate": 4.804843188943926e-05, + "loss": 5.363, + "step": 21289 + }, + { + "epoch": 0.12661766105243125, + "grad_norm": 1.6550151109695435, + "learning_rate": 4.804825095984036e-05, + "loss": 5.4504, + "step": 21290 + }, + { + "epoch": 0.12662360833571223, + "grad_norm": 1.5498002767562866, + "learning_rate": 4.8048070022195546e-05, + "loss": 5.2858, + "step": 21291 + }, + { + "epoch": 0.12662955561899325, + "grad_norm": 1.6577101945877075, + "learning_rate": 4.804788907650489e-05, + "loss": 5.0535, + "step": 21292 + }, + { + "epoch": 0.12663550290227424, + "grad_norm": 1.5144888162612915, + "learning_rate": 4.804770812276845e-05, + "loss": 5.0564, + "step": 21293 + }, + { + "epoch": 0.12664145018555523, + "grad_norm": 1.7675977945327759, + "learning_rate": 4.804752716098631e-05, + "loss": 4.9044, + "step": 21294 + }, + { + "epoch": 0.12664739746883624, + "grad_norm": 1.6419012546539307, + "learning_rate": 4.8047346191158506e-05, + "loss": 5.1735, + "step": 21295 + }, + { + "epoch": 0.12665334475211723, + "grad_norm": 1.9034998416900635, + "learning_rate": 4.8047165213285106e-05, + "loss": 5.762, + "step": 21296 + }, + { + "epoch": 0.12665929203539822, + "grad_norm": 2.2357866764068604, + "learning_rate": 4.8046984227366186e-05, + "loss": 5.0351, + "step": 21297 + }, + { + "epoch": 0.12666523931867923, + "grad_norm": 1.528701663017273, + "learning_rate": 4.8046803233401796e-05, + "loss": 5.3659, + "step": 21298 + }, + { + "epoch": 0.12667118660196022, + "grad_norm": 1.5450912714004517, + "learning_rate": 4.8046622231392015e-05, + "loss": 5.4961, + "step": 21299 + }, + { + "epoch": 0.1266771338852412, + "grad_norm": 2.459630012512207, + "learning_rate": 4.804644122133689e-05, + "loss": 4.6308, + "step": 21300 + }, + { + "epoch": 0.12668308116852223, + "grad_norm": 1.8703144788742065, + "learning_rate": 4.8046260203236494e-05, + "loss": 4.6424, + "step": 21301 + }, + { + "epoch": 0.12668902845180322, + "grad_norm": 1.4294613599777222, + "learning_rate": 4.804607917709088e-05, + "loss": 5.5703, + "step": 21302 + }, + { + "epoch": 0.1266949757350842, + "grad_norm": 1.6063963174819946, + "learning_rate": 4.804589814290012e-05, + "loss": 5.6344, + "step": 21303 + }, + { + "epoch": 0.12670092301836522, + "grad_norm": 2.1621460914611816, + "learning_rate": 4.8045717100664275e-05, + "loss": 5.1798, + "step": 21304 + }, + { + "epoch": 0.1267068703016462, + "grad_norm": 2.187513828277588, + "learning_rate": 4.804553605038341e-05, + "loss": 4.4837, + "step": 21305 + }, + { + "epoch": 0.1267128175849272, + "grad_norm": 2.5205118656158447, + "learning_rate": 4.804535499205759e-05, + "loss": 4.5554, + "step": 21306 + }, + { + "epoch": 0.1267187648682082, + "grad_norm": 2.196026563644409, + "learning_rate": 4.804517392568687e-05, + "loss": 4.5849, + "step": 21307 + }, + { + "epoch": 0.1267247121514892, + "grad_norm": 2.152150869369507, + "learning_rate": 4.804499285127132e-05, + "loss": 4.4153, + "step": 21308 + }, + { + "epoch": 0.1267306594347702, + "grad_norm": 2.398475170135498, + "learning_rate": 4.8044811768811e-05, + "loss": 4.1129, + "step": 21309 + }, + { + "epoch": 0.1267366067180512, + "grad_norm": 2.4291298389434814, + "learning_rate": 4.8044630678305976e-05, + "loss": 4.4199, + "step": 21310 + }, + { + "epoch": 0.1267425540013322, + "grad_norm": 2.6893248558044434, + "learning_rate": 4.80444495797563e-05, + "loss": 4.419, + "step": 21311 + }, + { + "epoch": 0.12674850128461318, + "grad_norm": 2.369361400604248, + "learning_rate": 4.804426847316206e-05, + "loss": 4.3434, + "step": 21312 + }, + { + "epoch": 0.1267544485678942, + "grad_norm": 2.206676721572876, + "learning_rate": 4.804408735852329e-05, + "loss": 4.2195, + "step": 21313 + }, + { + "epoch": 0.12676039585117518, + "grad_norm": 2.3347322940826416, + "learning_rate": 4.8043906235840074e-05, + "loss": 4.352, + "step": 21314 + }, + { + "epoch": 0.12676634313445617, + "grad_norm": 2.4026732444763184, + "learning_rate": 4.804372510511247e-05, + "loss": 4.0351, + "step": 21315 + }, + { + "epoch": 0.1267722904177372, + "grad_norm": 2.3547754287719727, + "learning_rate": 4.8043543966340546e-05, + "loss": 4.1292, + "step": 21316 + }, + { + "epoch": 0.12677823770101818, + "grad_norm": 2.3924174308776855, + "learning_rate": 4.804336281952434e-05, + "loss": 4.138, + "step": 21317 + }, + { + "epoch": 0.12678418498429916, + "grad_norm": 2.063361883163452, + "learning_rate": 4.804318166466395e-05, + "loss": 4.1288, + "step": 21318 + }, + { + "epoch": 0.12679013226758018, + "grad_norm": 2.1719813346862793, + "learning_rate": 4.8043000501759415e-05, + "loss": 4.3262, + "step": 21319 + }, + { + "epoch": 0.12679607955086117, + "grad_norm": 2.3787803649902344, + "learning_rate": 4.8042819330810803e-05, + "loss": 4.448, + "step": 21320 + }, + { + "epoch": 0.12680202683414216, + "grad_norm": 2.369344472885132, + "learning_rate": 4.80426381518182e-05, + "loss": 4.4237, + "step": 21321 + }, + { + "epoch": 0.12680797411742317, + "grad_norm": 1.9213550090789795, + "learning_rate": 4.804245696478163e-05, + "loss": 4.8805, + "step": 21322 + }, + { + "epoch": 0.12681392140070416, + "grad_norm": 2.1709017753601074, + "learning_rate": 4.804227576970118e-05, + "loss": 5.7745, + "step": 21323 + }, + { + "epoch": 0.12681986868398515, + "grad_norm": 2.1823856830596924, + "learning_rate": 4.8042094566576925e-05, + "loss": 5.561, + "step": 21324 + }, + { + "epoch": 0.12682581596726616, + "grad_norm": 2.403367519378662, + "learning_rate": 4.80419133554089e-05, + "loss": 5.6699, + "step": 21325 + }, + { + "epoch": 0.12683176325054715, + "grad_norm": 1.8335449695587158, + "learning_rate": 4.8041732136197184e-05, + "loss": 5.5058, + "step": 21326 + }, + { + "epoch": 0.12683771053382814, + "grad_norm": 1.7406642436981201, + "learning_rate": 4.804155090894183e-05, + "loss": 5.6536, + "step": 21327 + }, + { + "epoch": 0.12684365781710916, + "grad_norm": 2.160098075866699, + "learning_rate": 4.804136967364291e-05, + "loss": 5.4742, + "step": 21328 + }, + { + "epoch": 0.12684960510039014, + "grad_norm": 1.5187212228775024, + "learning_rate": 4.804118843030049e-05, + "loss": 5.2908, + "step": 21329 + }, + { + "epoch": 0.12685555238367113, + "grad_norm": 1.387417197227478, + "learning_rate": 4.804100717891463e-05, + "loss": 5.3319, + "step": 21330 + }, + { + "epoch": 0.12686149966695215, + "grad_norm": 1.3029687404632568, + "learning_rate": 4.80408259194854e-05, + "loss": 5.4069, + "step": 21331 + }, + { + "epoch": 0.12686744695023314, + "grad_norm": 1.7097088098526, + "learning_rate": 4.804064465201284e-05, + "loss": 4.8422, + "step": 21332 + }, + { + "epoch": 0.12687339423351413, + "grad_norm": 1.7519829273223877, + "learning_rate": 4.804046337649704e-05, + "loss": 5.4513, + "step": 21333 + }, + { + "epoch": 0.12687934151679514, + "grad_norm": 1.5313260555267334, + "learning_rate": 4.8040282092938046e-05, + "loss": 4.8656, + "step": 21334 + }, + { + "epoch": 0.12688528880007613, + "grad_norm": 1.629780888557434, + "learning_rate": 4.804010080133593e-05, + "loss": 4.8751, + "step": 21335 + }, + { + "epoch": 0.12689123608335712, + "grad_norm": 1.7247028350830078, + "learning_rate": 4.8039919501690756e-05, + "loss": 4.7207, + "step": 21336 + }, + { + "epoch": 0.12689718336663813, + "grad_norm": 1.517016887664795, + "learning_rate": 4.803973819400258e-05, + "loss": 5.0604, + "step": 21337 + }, + { + "epoch": 0.12690313064991912, + "grad_norm": 1.4583669900894165, + "learning_rate": 4.8039556878271475e-05, + "loss": 5.0638, + "step": 21338 + }, + { + "epoch": 0.1269090779332001, + "grad_norm": 1.725014567375183, + "learning_rate": 4.803937555449749e-05, + "loss": 5.5831, + "step": 21339 + }, + { + "epoch": 0.1269150252164811, + "grad_norm": 1.4144753217697144, + "learning_rate": 4.803919422268071e-05, + "loss": 5.3899, + "step": 21340 + }, + { + "epoch": 0.1269209724997621, + "grad_norm": 1.4197511672973633, + "learning_rate": 4.803901288282117e-05, + "loss": 5.4904, + "step": 21341 + }, + { + "epoch": 0.1269269197830431, + "grad_norm": 1.5491420030593872, + "learning_rate": 4.803883153491896e-05, + "loss": 5.5008, + "step": 21342 + }, + { + "epoch": 0.1269328670663241, + "grad_norm": 1.4152858257293701, + "learning_rate": 4.803865017897412e-05, + "loss": 5.5328, + "step": 21343 + }, + { + "epoch": 0.1269388143496051, + "grad_norm": 1.6931630373001099, + "learning_rate": 4.803846881498674e-05, + "loss": 5.4435, + "step": 21344 + }, + { + "epoch": 0.1269447616328861, + "grad_norm": 1.4955002069473267, + "learning_rate": 4.803828744295686e-05, + "loss": 5.3631, + "step": 21345 + }, + { + "epoch": 0.12695070891616708, + "grad_norm": 1.5340615510940552, + "learning_rate": 4.803810606288455e-05, + "loss": 5.4711, + "step": 21346 + }, + { + "epoch": 0.1269566561994481, + "grad_norm": 1.4584442377090454, + "learning_rate": 4.803792467476988e-05, + "loss": 5.512, + "step": 21347 + }, + { + "epoch": 0.1269626034827291, + "grad_norm": 1.663875699043274, + "learning_rate": 4.803774327861291e-05, + "loss": 5.5867, + "step": 21348 + }, + { + "epoch": 0.12696855076601007, + "grad_norm": 1.4865331649780273, + "learning_rate": 4.8037561874413696e-05, + "loss": 5.0047, + "step": 21349 + }, + { + "epoch": 0.1269744980492911, + "grad_norm": 1.5889533758163452, + "learning_rate": 4.803738046217231e-05, + "loss": 4.9325, + "step": 21350 + }, + { + "epoch": 0.12698044533257208, + "grad_norm": 1.7473856210708618, + "learning_rate": 4.8037199041888814e-05, + "loss": 4.9296, + "step": 21351 + }, + { + "epoch": 0.12698639261585307, + "grad_norm": 1.9395428895950317, + "learning_rate": 4.8037017613563265e-05, + "loss": 5.5787, + "step": 21352 + }, + { + "epoch": 0.12699233989913408, + "grad_norm": 1.8723230361938477, + "learning_rate": 4.8036836177195734e-05, + "loss": 5.2864, + "step": 21353 + }, + { + "epoch": 0.12699828718241507, + "grad_norm": 1.8751366138458252, + "learning_rate": 4.8036654732786276e-05, + "loss": 4.9116, + "step": 21354 + }, + { + "epoch": 0.12700423446569606, + "grad_norm": 1.6620196104049683, + "learning_rate": 4.803647328033497e-05, + "loss": 5.1592, + "step": 21355 + }, + { + "epoch": 0.12701018174897707, + "grad_norm": 2.01167631149292, + "learning_rate": 4.803629181984187e-05, + "loss": 5.2254, + "step": 21356 + }, + { + "epoch": 0.12701612903225806, + "grad_norm": 1.6565442085266113, + "learning_rate": 4.803611035130703e-05, + "loss": 5.2454, + "step": 21357 + }, + { + "epoch": 0.12702207631553905, + "grad_norm": 1.3379613161087036, + "learning_rate": 4.803592887473053e-05, + "loss": 5.3203, + "step": 21358 + }, + { + "epoch": 0.12702802359882007, + "grad_norm": 1.580633282661438, + "learning_rate": 4.8035747390112415e-05, + "loss": 5.2555, + "step": 21359 + }, + { + "epoch": 0.12703397088210105, + "grad_norm": 1.9735597372055054, + "learning_rate": 4.803556589745276e-05, + "loss": 5.6899, + "step": 21360 + }, + { + "epoch": 0.12703991816538204, + "grad_norm": 1.6550042629241943, + "learning_rate": 4.8035384396751636e-05, + "loss": 4.8188, + "step": 21361 + }, + { + "epoch": 0.12704586544866306, + "grad_norm": 1.598645567893982, + "learning_rate": 4.803520288800909e-05, + "loss": 5.0498, + "step": 21362 + }, + { + "epoch": 0.12705181273194405, + "grad_norm": 1.5990798473358154, + "learning_rate": 4.80350213712252e-05, + "loss": 5.0563, + "step": 21363 + }, + { + "epoch": 0.12705776001522504, + "grad_norm": 1.5130763053894043, + "learning_rate": 4.803483984640001e-05, + "loss": 5.2562, + "step": 21364 + }, + { + "epoch": 0.12706370729850605, + "grad_norm": 1.5498485565185547, + "learning_rate": 4.803465831353361e-05, + "loss": 5.551, + "step": 21365 + }, + { + "epoch": 0.12706965458178704, + "grad_norm": 1.819954752922058, + "learning_rate": 4.803447677262603e-05, + "loss": 4.5888, + "step": 21366 + }, + { + "epoch": 0.12707560186506803, + "grad_norm": 1.5863771438598633, + "learning_rate": 4.8034295223677374e-05, + "loss": 5.108, + "step": 21367 + }, + { + "epoch": 0.12708154914834904, + "grad_norm": 1.6637874841690063, + "learning_rate": 4.803411366668767e-05, + "loss": 5.3476, + "step": 21368 + }, + { + "epoch": 0.12708749643163003, + "grad_norm": 1.5182580947875977, + "learning_rate": 4.8033932101657e-05, + "loss": 5.6559, + "step": 21369 + }, + { + "epoch": 0.12709344371491102, + "grad_norm": 1.725801706314087, + "learning_rate": 4.803375052858542e-05, + "loss": 4.6643, + "step": 21370 + }, + { + "epoch": 0.12709939099819204, + "grad_norm": 1.6476885080337524, + "learning_rate": 4.803356894747299e-05, + "loss": 4.6574, + "step": 21371 + }, + { + "epoch": 0.12710533828147302, + "grad_norm": 1.520213007926941, + "learning_rate": 4.803338735831979e-05, + "loss": 5.3691, + "step": 21372 + }, + { + "epoch": 0.127111285564754, + "grad_norm": 1.4914368391036987, + "learning_rate": 4.803320576112586e-05, + "loss": 5.2913, + "step": 21373 + }, + { + "epoch": 0.12711723284803503, + "grad_norm": 1.254329800605774, + "learning_rate": 4.803302415589128e-05, + "loss": 5.3926, + "step": 21374 + }, + { + "epoch": 0.12712318013131602, + "grad_norm": 1.909441351890564, + "learning_rate": 4.8032842542616116e-05, + "loss": 4.6179, + "step": 21375 + }, + { + "epoch": 0.127129127414597, + "grad_norm": 1.7123392820358276, + "learning_rate": 4.803266092130042e-05, + "loss": 5.1276, + "step": 21376 + }, + { + "epoch": 0.12713507469787802, + "grad_norm": 1.717854380607605, + "learning_rate": 4.8032479291944265e-05, + "loss": 5.3377, + "step": 21377 + }, + { + "epoch": 0.127141021981159, + "grad_norm": 1.7636181116104126, + "learning_rate": 4.80322976545477e-05, + "loss": 5.3434, + "step": 21378 + }, + { + "epoch": 0.12714696926444, + "grad_norm": 1.6754179000854492, + "learning_rate": 4.80321160091108e-05, + "loss": 5.3604, + "step": 21379 + }, + { + "epoch": 0.127152916547721, + "grad_norm": 1.4759787321090698, + "learning_rate": 4.803193435563364e-05, + "loss": 5.267, + "step": 21380 + }, + { + "epoch": 0.127158863831002, + "grad_norm": 1.8769867420196533, + "learning_rate": 4.803175269411625e-05, + "loss": 5.2666, + "step": 21381 + }, + { + "epoch": 0.127164811114283, + "grad_norm": 1.7843588590621948, + "learning_rate": 4.803157102455873e-05, + "loss": 5.1529, + "step": 21382 + }, + { + "epoch": 0.127170758397564, + "grad_norm": 1.7799369096755981, + "learning_rate": 4.803138934696111e-05, + "loss": 4.9332, + "step": 21383 + }, + { + "epoch": 0.127176705680845, + "grad_norm": 1.8240329027175903, + "learning_rate": 4.803120766132348e-05, + "loss": 4.8369, + "step": 21384 + }, + { + "epoch": 0.12718265296412598, + "grad_norm": 1.7379107475280762, + "learning_rate": 4.8031025967645895e-05, + "loss": 4.6134, + "step": 21385 + }, + { + "epoch": 0.127188600247407, + "grad_norm": 1.9912395477294922, + "learning_rate": 4.8030844265928414e-05, + "loss": 4.5456, + "step": 21386 + }, + { + "epoch": 0.12719454753068798, + "grad_norm": 1.762600302696228, + "learning_rate": 4.80306625561711e-05, + "loss": 5.4269, + "step": 21387 + }, + { + "epoch": 0.12720049481396897, + "grad_norm": 1.9208531379699707, + "learning_rate": 4.8030480838374027e-05, + "loss": 5.542, + "step": 21388 + }, + { + "epoch": 0.12720644209725, + "grad_norm": 1.8121410608291626, + "learning_rate": 4.803029911253725e-05, + "loss": 5.7218, + "step": 21389 + }, + { + "epoch": 0.12721238938053098, + "grad_norm": 2.0130512714385986, + "learning_rate": 4.803011737866082e-05, + "loss": 5.4736, + "step": 21390 + }, + { + "epoch": 0.12721833666381197, + "grad_norm": 1.4087759256362915, + "learning_rate": 4.802993563674483e-05, + "loss": 5.5634, + "step": 21391 + }, + { + "epoch": 0.12722428394709298, + "grad_norm": 1.640550971031189, + "learning_rate": 4.8029753886789316e-05, + "loss": 5.6422, + "step": 21392 + }, + { + "epoch": 0.12723023123037397, + "grad_norm": 1.58751380443573, + "learning_rate": 4.802957212879436e-05, + "loss": 5.2661, + "step": 21393 + }, + { + "epoch": 0.12723617851365496, + "grad_norm": 1.536847472190857, + "learning_rate": 4.802939036276002e-05, + "loss": 5.475, + "step": 21394 + }, + { + "epoch": 0.12724212579693597, + "grad_norm": 1.8386236429214478, + "learning_rate": 4.802920858868635e-05, + "loss": 5.4889, + "step": 21395 + }, + { + "epoch": 0.12724807308021696, + "grad_norm": 1.7268786430358887, + "learning_rate": 4.802902680657343e-05, + "loss": 5.2129, + "step": 21396 + }, + { + "epoch": 0.12725402036349795, + "grad_norm": 1.5081709623336792, + "learning_rate": 4.8028845016421306e-05, + "loss": 5.0437, + "step": 21397 + }, + { + "epoch": 0.12725996764677894, + "grad_norm": 1.3470754623413086, + "learning_rate": 4.802866321823006e-05, + "loss": 5.2242, + "step": 21398 + }, + { + "epoch": 0.12726591493005995, + "grad_norm": 1.2352057695388794, + "learning_rate": 4.802848141199974e-05, + "loss": 4.6926, + "step": 21399 + }, + { + "epoch": 0.12727186221334094, + "grad_norm": 1.4411710500717163, + "learning_rate": 4.802829959773041e-05, + "loss": 5.098, + "step": 21400 + }, + { + "epoch": 0.12727780949662193, + "grad_norm": 1.3453952074050903, + "learning_rate": 4.802811777542214e-05, + "loss": 5.0484, + "step": 21401 + }, + { + "epoch": 0.12728375677990295, + "grad_norm": 1.4602265357971191, + "learning_rate": 4.8027935945074995e-05, + "loss": 5.167, + "step": 21402 + }, + { + "epoch": 0.12728970406318393, + "grad_norm": 1.4542255401611328, + "learning_rate": 4.802775410668904e-05, + "loss": 5.0701, + "step": 21403 + }, + { + "epoch": 0.12729565134646492, + "grad_norm": 1.4398037195205688, + "learning_rate": 4.802757226026433e-05, + "loss": 5.0809, + "step": 21404 + }, + { + "epoch": 0.12730159862974594, + "grad_norm": 1.3027135133743286, + "learning_rate": 4.8027390405800935e-05, + "loss": 5.1283, + "step": 21405 + }, + { + "epoch": 0.12730754591302693, + "grad_norm": 1.3704328536987305, + "learning_rate": 4.802720854329891e-05, + "loss": 5.0886, + "step": 21406 + }, + { + "epoch": 0.12731349319630791, + "grad_norm": 1.2771658897399902, + "learning_rate": 4.802702667275833e-05, + "loss": 4.968, + "step": 21407 + }, + { + "epoch": 0.12731944047958893, + "grad_norm": 1.3370757102966309, + "learning_rate": 4.802684479417925e-05, + "loss": 5.2742, + "step": 21408 + }, + { + "epoch": 0.12732538776286992, + "grad_norm": 1.2101991176605225, + "learning_rate": 4.802666290756174e-05, + "loss": 5.3125, + "step": 21409 + }, + { + "epoch": 0.1273313350461509, + "grad_norm": 1.327354907989502, + "learning_rate": 4.8026481012905854e-05, + "loss": 5.0784, + "step": 21410 + }, + { + "epoch": 0.12733728232943192, + "grad_norm": 1.2267961502075195, + "learning_rate": 4.802629911021166e-05, + "loss": 5.0666, + "step": 21411 + }, + { + "epoch": 0.1273432296127129, + "grad_norm": 1.2195243835449219, + "learning_rate": 4.8026117199479224e-05, + "loss": 5.1941, + "step": 21412 + }, + { + "epoch": 0.1273491768959939, + "grad_norm": 1.1964733600616455, + "learning_rate": 4.8025935280708616e-05, + "loss": 5.0561, + "step": 21413 + }, + { + "epoch": 0.12735512417927491, + "grad_norm": 1.148831844329834, + "learning_rate": 4.802575335389989e-05, + "loss": 4.9592, + "step": 21414 + }, + { + "epoch": 0.1273610714625559, + "grad_norm": 1.2319111824035645, + "learning_rate": 4.802557141905311e-05, + "loss": 5.0165, + "step": 21415 + }, + { + "epoch": 0.1273670187458369, + "grad_norm": 1.324744462966919, + "learning_rate": 4.802538947616834e-05, + "loss": 4.9402, + "step": 21416 + }, + { + "epoch": 0.1273729660291179, + "grad_norm": 1.1551966667175293, + "learning_rate": 4.802520752524564e-05, + "loss": 5.1849, + "step": 21417 + }, + { + "epoch": 0.1273789133123989, + "grad_norm": 1.2087135314941406, + "learning_rate": 4.802502556628508e-05, + "loss": 5.1082, + "step": 21418 + }, + { + "epoch": 0.12738486059567988, + "grad_norm": 1.1568787097930908, + "learning_rate": 4.8024843599286726e-05, + "loss": 5.1379, + "step": 21419 + }, + { + "epoch": 0.1273908078789609, + "grad_norm": 1.2819747924804688, + "learning_rate": 4.802466162425063e-05, + "loss": 5.2054, + "step": 21420 + }, + { + "epoch": 0.1273967551622419, + "grad_norm": 1.3548219203948975, + "learning_rate": 4.8024479641176866e-05, + "loss": 4.8277, + "step": 21421 + }, + { + "epoch": 0.12740270244552288, + "grad_norm": 1.3331178426742554, + "learning_rate": 4.80242976500655e-05, + "loss": 4.991, + "step": 21422 + }, + { + "epoch": 0.1274086497288039, + "grad_norm": 1.3595576286315918, + "learning_rate": 4.8024115650916584e-05, + "loss": 4.8734, + "step": 21423 + }, + { + "epoch": 0.12741459701208488, + "grad_norm": 1.310585856437683, + "learning_rate": 4.802393364373019e-05, + "loss": 4.9281, + "step": 21424 + }, + { + "epoch": 0.12742054429536587, + "grad_norm": 1.3193553686141968, + "learning_rate": 4.8023751628506374e-05, + "loss": 4.9819, + "step": 21425 + }, + { + "epoch": 0.12742649157864688, + "grad_norm": 1.2952460050582886, + "learning_rate": 4.8023569605245204e-05, + "loss": 4.9577, + "step": 21426 + }, + { + "epoch": 0.12743243886192787, + "grad_norm": 1.376548409461975, + "learning_rate": 4.802338757394674e-05, + "loss": 5.2219, + "step": 21427 + }, + { + "epoch": 0.12743838614520886, + "grad_norm": 1.1417921781539917, + "learning_rate": 4.802320553461106e-05, + "loss": 5.0234, + "step": 21428 + }, + { + "epoch": 0.12744433342848988, + "grad_norm": 1.2543314695358276, + "learning_rate": 4.8023023487238214e-05, + "loss": 4.9921, + "step": 21429 + }, + { + "epoch": 0.12745028071177086, + "grad_norm": 1.4437085390090942, + "learning_rate": 4.802284143182827e-05, + "loss": 4.8699, + "step": 21430 + }, + { + "epoch": 0.12745622799505185, + "grad_norm": 1.137539267539978, + "learning_rate": 4.802265936838128e-05, + "loss": 5.1073, + "step": 21431 + }, + { + "epoch": 0.12746217527833287, + "grad_norm": 1.4179331064224243, + "learning_rate": 4.802247729689733e-05, + "loss": 5.0073, + "step": 21432 + }, + { + "epoch": 0.12746812256161386, + "grad_norm": 1.5519764423370361, + "learning_rate": 4.802229521737646e-05, + "loss": 4.9426, + "step": 21433 + }, + { + "epoch": 0.12747406984489484, + "grad_norm": 1.440847396850586, + "learning_rate": 4.8022113129818754e-05, + "loss": 5.2137, + "step": 21434 + }, + { + "epoch": 0.12748001712817586, + "grad_norm": 1.2741557359695435, + "learning_rate": 4.802193103422426e-05, + "loss": 4.966, + "step": 21435 + }, + { + "epoch": 0.12748596441145685, + "grad_norm": 1.5297214984893799, + "learning_rate": 4.8021748930593045e-05, + "loss": 5.006, + "step": 21436 + }, + { + "epoch": 0.12749191169473784, + "grad_norm": 1.2509713172912598, + "learning_rate": 4.802156681892518e-05, + "loss": 5.0719, + "step": 21437 + }, + { + "epoch": 0.12749785897801885, + "grad_norm": 1.2376511096954346, + "learning_rate": 4.802138469922073e-05, + "loss": 4.8896, + "step": 21438 + }, + { + "epoch": 0.12750380626129984, + "grad_norm": 1.311804175376892, + "learning_rate": 4.802120257147974e-05, + "loss": 5.0292, + "step": 21439 + }, + { + "epoch": 0.12750975354458083, + "grad_norm": 1.2717031240463257, + "learning_rate": 4.802102043570229e-05, + "loss": 5.157, + "step": 21440 + }, + { + "epoch": 0.12751570082786184, + "grad_norm": 1.2967960834503174, + "learning_rate": 4.8020838291888445e-05, + "loss": 5.1289, + "step": 21441 + }, + { + "epoch": 0.12752164811114283, + "grad_norm": 1.2796543836593628, + "learning_rate": 4.802065614003826e-05, + "loss": 5.0702, + "step": 21442 + }, + { + "epoch": 0.12752759539442382, + "grad_norm": 1.4490569829940796, + "learning_rate": 4.80204739801518e-05, + "loss": 5.1, + "step": 21443 + }, + { + "epoch": 0.12753354267770484, + "grad_norm": 1.1721242666244507, + "learning_rate": 4.8020291812229136e-05, + "loss": 5.1237, + "step": 21444 + }, + { + "epoch": 0.12753948996098582, + "grad_norm": 1.3185924291610718, + "learning_rate": 4.8020109636270316e-05, + "loss": 5.0208, + "step": 21445 + }, + { + "epoch": 0.1275454372442668, + "grad_norm": 1.4432177543640137, + "learning_rate": 4.801992745227543e-05, + "loss": 5.0235, + "step": 21446 + }, + { + "epoch": 0.12755138452754783, + "grad_norm": 1.3810619115829468, + "learning_rate": 4.801974526024451e-05, + "loss": 4.8893, + "step": 21447 + }, + { + "epoch": 0.12755733181082882, + "grad_norm": 1.3421547412872314, + "learning_rate": 4.8019563060177634e-05, + "loss": 4.9605, + "step": 21448 + }, + { + "epoch": 0.1275632790941098, + "grad_norm": 1.304095983505249, + "learning_rate": 4.8019380852074875e-05, + "loss": 4.9489, + "step": 21449 + }, + { + "epoch": 0.12756922637739082, + "grad_norm": 1.3935438394546509, + "learning_rate": 4.801919863593629e-05, + "loss": 4.9097, + "step": 21450 + }, + { + "epoch": 0.1275751736606718, + "grad_norm": 1.1719253063201904, + "learning_rate": 4.801901641176193e-05, + "loss": 4.9922, + "step": 21451 + }, + { + "epoch": 0.1275811209439528, + "grad_norm": 1.8718456029891968, + "learning_rate": 4.801883417955188e-05, + "loss": 5.409, + "step": 21452 + }, + { + "epoch": 0.1275870682272338, + "grad_norm": 1.1837137937545776, + "learning_rate": 4.801865193930618e-05, + "loss": 4.967, + "step": 21453 + }, + { + "epoch": 0.1275930155105148, + "grad_norm": 1.2643749713897705, + "learning_rate": 4.801846969102491e-05, + "loss": 4.7932, + "step": 21454 + }, + { + "epoch": 0.1275989627937958, + "grad_norm": 1.2207399606704712, + "learning_rate": 4.801828743470814e-05, + "loss": 4.9634, + "step": 21455 + }, + { + "epoch": 0.12760491007707678, + "grad_norm": 1.2489538192749023, + "learning_rate": 4.801810517035592e-05, + "loss": 5.1077, + "step": 21456 + }, + { + "epoch": 0.1276108573603578, + "grad_norm": 1.3879250288009644, + "learning_rate": 4.801792289796832e-05, + "loss": 5.225, + "step": 21457 + }, + { + "epoch": 0.12761680464363878, + "grad_norm": 1.4891397953033447, + "learning_rate": 4.8017740617545385e-05, + "loss": 5.1288, + "step": 21458 + }, + { + "epoch": 0.12762275192691977, + "grad_norm": 1.555528998374939, + "learning_rate": 4.801755832908721e-05, + "loss": 5.1875, + "step": 21459 + }, + { + "epoch": 0.12762869921020079, + "grad_norm": 1.287625789642334, + "learning_rate": 4.8017376032593834e-05, + "loss": 5.1934, + "step": 21460 + }, + { + "epoch": 0.12763464649348177, + "grad_norm": 1.4907346963882446, + "learning_rate": 4.801719372806533e-05, + "loss": 5.169, + "step": 21461 + }, + { + "epoch": 0.12764059377676276, + "grad_norm": 1.2776025533676147, + "learning_rate": 4.801701141550177e-05, + "loss": 5.2178, + "step": 21462 + }, + { + "epoch": 0.12764654106004378, + "grad_norm": 1.4319080114364624, + "learning_rate": 4.80168290949032e-05, + "loss": 5.159, + "step": 21463 + }, + { + "epoch": 0.12765248834332477, + "grad_norm": 1.4323997497558594, + "learning_rate": 4.80166467662697e-05, + "loss": 5.227, + "step": 21464 + }, + { + "epoch": 0.12765843562660575, + "grad_norm": 1.409071445465088, + "learning_rate": 4.8016464429601326e-05, + "loss": 5.0025, + "step": 21465 + }, + { + "epoch": 0.12766438290988677, + "grad_norm": 1.42705500125885, + "learning_rate": 4.801628208489814e-05, + "loss": 5.0332, + "step": 21466 + }, + { + "epoch": 0.12767033019316776, + "grad_norm": 1.2235654592514038, + "learning_rate": 4.801609973216021e-05, + "loss": 5.0734, + "step": 21467 + }, + { + "epoch": 0.12767627747644875, + "grad_norm": 1.2238860130310059, + "learning_rate": 4.8015917371387595e-05, + "loss": 4.9804, + "step": 21468 + }, + { + "epoch": 0.12768222475972976, + "grad_norm": 1.4584438800811768, + "learning_rate": 4.801573500258036e-05, + "loss": 5.162, + "step": 21469 + }, + { + "epoch": 0.12768817204301075, + "grad_norm": 1.236396074295044, + "learning_rate": 4.8015552625738566e-05, + "loss": 5.1374, + "step": 21470 + }, + { + "epoch": 0.12769411932629174, + "grad_norm": 1.472617745399475, + "learning_rate": 4.801537024086229e-05, + "loss": 5.0376, + "step": 21471 + }, + { + "epoch": 0.12770006660957275, + "grad_norm": 1.2870211601257324, + "learning_rate": 4.801518784795158e-05, + "loss": 4.9798, + "step": 21472 + }, + { + "epoch": 0.12770601389285374, + "grad_norm": 1.3299795389175415, + "learning_rate": 4.801500544700651e-05, + "loss": 4.9588, + "step": 21473 + }, + { + "epoch": 0.12771196117613473, + "grad_norm": 1.474135398864746, + "learning_rate": 4.8014823038027134e-05, + "loss": 5.015, + "step": 21474 + }, + { + "epoch": 0.12771790845941575, + "grad_norm": 1.6452490091323853, + "learning_rate": 4.8014640621013524e-05, + "loss": 5.0075, + "step": 21475 + }, + { + "epoch": 0.12772385574269673, + "grad_norm": 1.3577489852905273, + "learning_rate": 4.801445819596574e-05, + "loss": 4.9675, + "step": 21476 + }, + { + "epoch": 0.12772980302597772, + "grad_norm": 1.2642143964767456, + "learning_rate": 4.801427576288384e-05, + "loss": 5.0593, + "step": 21477 + }, + { + "epoch": 0.12773575030925874, + "grad_norm": 1.5256940126419067, + "learning_rate": 4.801409332176791e-05, + "loss": 4.8987, + "step": 21478 + }, + { + "epoch": 0.12774169759253973, + "grad_norm": 1.667886734008789, + "learning_rate": 4.801391087261798e-05, + "loss": 4.7562, + "step": 21479 + }, + { + "epoch": 0.12774764487582072, + "grad_norm": 1.3564702272415161, + "learning_rate": 4.801372841543415e-05, + "loss": 5.2975, + "step": 21480 + }, + { + "epoch": 0.12775359215910173, + "grad_norm": 1.607532262802124, + "learning_rate": 4.801354595021645e-05, + "loss": 4.9578, + "step": 21481 + }, + { + "epoch": 0.12775953944238272, + "grad_norm": 1.2633382081985474, + "learning_rate": 4.801336347696496e-05, + "loss": 5.1104, + "step": 21482 + }, + { + "epoch": 0.1277654867256637, + "grad_norm": 1.4292182922363281, + "learning_rate": 4.801318099567975e-05, + "loss": 4.9637, + "step": 21483 + }, + { + "epoch": 0.12777143400894472, + "grad_norm": 1.1797621250152588, + "learning_rate": 4.8012998506360874e-05, + "loss": 5.403, + "step": 21484 + }, + { + "epoch": 0.1277773812922257, + "grad_norm": 1.3704683780670166, + "learning_rate": 4.801281600900839e-05, + "loss": 4.9852, + "step": 21485 + }, + { + "epoch": 0.1277833285755067, + "grad_norm": 1.4775960445404053, + "learning_rate": 4.8012633503622384e-05, + "loss": 5.2049, + "step": 21486 + }, + { + "epoch": 0.12778927585878772, + "grad_norm": 1.5056041479110718, + "learning_rate": 4.801245099020289e-05, + "loss": 4.9782, + "step": 21487 + }, + { + "epoch": 0.1277952231420687, + "grad_norm": 1.3562772274017334, + "learning_rate": 4.801226846875e-05, + "loss": 5.0427, + "step": 21488 + }, + { + "epoch": 0.1278011704253497, + "grad_norm": 1.346339464187622, + "learning_rate": 4.801208593926376e-05, + "loss": 5.2215, + "step": 21489 + }, + { + "epoch": 0.1278071177086307, + "grad_norm": 1.3189916610717773, + "learning_rate": 4.801190340174424e-05, + "loss": 5.2097, + "step": 21490 + }, + { + "epoch": 0.1278130649919117, + "grad_norm": 1.466374397277832, + "learning_rate": 4.80117208561915e-05, + "loss": 4.8106, + "step": 21491 + }, + { + "epoch": 0.12781901227519268, + "grad_norm": 1.4882310628890991, + "learning_rate": 4.801153830260561e-05, + "loss": 5.1702, + "step": 21492 + }, + { + "epoch": 0.1278249595584737, + "grad_norm": 1.4080910682678223, + "learning_rate": 4.801135574098662e-05, + "loss": 5.0508, + "step": 21493 + }, + { + "epoch": 0.1278309068417547, + "grad_norm": 1.366672396659851, + "learning_rate": 4.801117317133461e-05, + "loss": 4.8692, + "step": 21494 + }, + { + "epoch": 0.12783685412503568, + "grad_norm": 1.3347315788269043, + "learning_rate": 4.801099059364963e-05, + "loss": 5.2327, + "step": 21495 + }, + { + "epoch": 0.1278428014083167, + "grad_norm": 1.434276819229126, + "learning_rate": 4.8010808007931765e-05, + "loss": 4.7217, + "step": 21496 + }, + { + "epoch": 0.12784874869159768, + "grad_norm": 1.2148855924606323, + "learning_rate": 4.801062541418105e-05, + "loss": 5.2082, + "step": 21497 + }, + { + "epoch": 0.12785469597487867, + "grad_norm": 1.4282805919647217, + "learning_rate": 4.801044281239758e-05, + "loss": 4.8627, + "step": 21498 + }, + { + "epoch": 0.12786064325815968, + "grad_norm": 1.309984564781189, + "learning_rate": 4.8010260202581394e-05, + "loss": 5.0809, + "step": 21499 + }, + { + "epoch": 0.12786659054144067, + "grad_norm": 1.2769159078598022, + "learning_rate": 4.801007758473256e-05, + "loss": 5.0357, + "step": 21500 + }, + { + "epoch": 0.12787253782472166, + "grad_norm": 1.4789204597473145, + "learning_rate": 4.800989495885115e-05, + "loss": 5.0572, + "step": 21501 + }, + { + "epoch": 0.12787848510800268, + "grad_norm": 1.2763663530349731, + "learning_rate": 4.8009712324937216e-05, + "loss": 5.2331, + "step": 21502 + }, + { + "epoch": 0.12788443239128366, + "grad_norm": 1.237911581993103, + "learning_rate": 4.800952968299084e-05, + "loss": 5.1217, + "step": 21503 + }, + { + "epoch": 0.12789037967456465, + "grad_norm": 1.3204708099365234, + "learning_rate": 4.800934703301206e-05, + "loss": 5.0503, + "step": 21504 + }, + { + "epoch": 0.12789632695784567, + "grad_norm": 1.2918440103530884, + "learning_rate": 4.800916437500097e-05, + "loss": 5.1229, + "step": 21505 + }, + { + "epoch": 0.12790227424112666, + "grad_norm": 1.2793703079223633, + "learning_rate": 4.8008981708957614e-05, + "loss": 4.9075, + "step": 21506 + }, + { + "epoch": 0.12790822152440764, + "grad_norm": 1.177607536315918, + "learning_rate": 4.8008799034882054e-05, + "loss": 4.89, + "step": 21507 + }, + { + "epoch": 0.12791416880768866, + "grad_norm": 0.9703904986381531, + "learning_rate": 4.800861635277437e-05, + "loss": 5.0141, + "step": 21508 + }, + { + "epoch": 0.12792011609096965, + "grad_norm": 1.2512762546539307, + "learning_rate": 4.800843366263461e-05, + "loss": 4.953, + "step": 21509 + }, + { + "epoch": 0.12792606337425064, + "grad_norm": 1.3279083967208862, + "learning_rate": 4.8008250964462846e-05, + "loss": 5.0179, + "step": 21510 + }, + { + "epoch": 0.12793201065753165, + "grad_norm": 1.3790103197097778, + "learning_rate": 4.8008068258259144e-05, + "loss": 4.9531, + "step": 21511 + }, + { + "epoch": 0.12793795794081264, + "grad_norm": 1.2640241384506226, + "learning_rate": 4.800788554402355e-05, + "loss": 5.0281, + "step": 21512 + }, + { + "epoch": 0.12794390522409363, + "grad_norm": 1.2616617679595947, + "learning_rate": 4.800770282175615e-05, + "loss": 5.1131, + "step": 21513 + }, + { + "epoch": 0.12794985250737465, + "grad_norm": 1.7765449285507202, + "learning_rate": 4.800752009145699e-05, + "loss": 5.3388, + "step": 21514 + }, + { + "epoch": 0.12795579979065563, + "grad_norm": 1.4468929767608643, + "learning_rate": 4.800733735312615e-05, + "loss": 4.9308, + "step": 21515 + }, + { + "epoch": 0.12796174707393662, + "grad_norm": 1.286733865737915, + "learning_rate": 4.800715460676369e-05, + "loss": 5.0407, + "step": 21516 + }, + { + "epoch": 0.1279676943572176, + "grad_norm": 1.3074883222579956, + "learning_rate": 4.8006971852369665e-05, + "loss": 5.0364, + "step": 21517 + }, + { + "epoch": 0.12797364164049863, + "grad_norm": 1.2966744899749756, + "learning_rate": 4.8006789089944144e-05, + "loss": 5.0411, + "step": 21518 + }, + { + "epoch": 0.1279795889237796, + "grad_norm": 1.4764792919158936, + "learning_rate": 4.800660631948719e-05, + "loss": 5.0178, + "step": 21519 + }, + { + "epoch": 0.1279855362070606, + "grad_norm": 1.3073668479919434, + "learning_rate": 4.800642354099887e-05, + "loss": 4.8384, + "step": 21520 + }, + { + "epoch": 0.12799148349034162, + "grad_norm": 1.433164119720459, + "learning_rate": 4.800624075447924e-05, + "loss": 4.4844, + "step": 21521 + }, + { + "epoch": 0.1279974307736226, + "grad_norm": 1.435656189918518, + "learning_rate": 4.8006057959928375e-05, + "loss": 4.7067, + "step": 21522 + }, + { + "epoch": 0.1280033780569036, + "grad_norm": 1.2541238069534302, + "learning_rate": 4.800587515734632e-05, + "loss": 4.76, + "step": 21523 + }, + { + "epoch": 0.1280093253401846, + "grad_norm": 1.3341822624206543, + "learning_rate": 4.8005692346733166e-05, + "loss": 4.9485, + "step": 21524 + }, + { + "epoch": 0.1280152726234656, + "grad_norm": 1.1761771440505981, + "learning_rate": 4.8005509528088963e-05, + "loss": 4.9416, + "step": 21525 + }, + { + "epoch": 0.1280212199067466, + "grad_norm": 1.490059494972229, + "learning_rate": 4.8005326701413764e-05, + "loss": 4.5864, + "step": 21526 + }, + { + "epoch": 0.1280271671900276, + "grad_norm": 1.4474053382873535, + "learning_rate": 4.8005143866707656e-05, + "loss": 4.3612, + "step": 21527 + }, + { + "epoch": 0.1280331144733086, + "grad_norm": 1.4138057231903076, + "learning_rate": 4.800496102397068e-05, + "loss": 4.7795, + "step": 21528 + }, + { + "epoch": 0.12803906175658958, + "grad_norm": 1.3671265840530396, + "learning_rate": 4.8004778173202915e-05, + "loss": 4.8096, + "step": 21529 + }, + { + "epoch": 0.1280450090398706, + "grad_norm": 1.3463077545166016, + "learning_rate": 4.800459531440441e-05, + "loss": 4.4858, + "step": 21530 + }, + { + "epoch": 0.12805095632315158, + "grad_norm": 1.2250823974609375, + "learning_rate": 4.800441244757525e-05, + "loss": 4.7394, + "step": 21531 + }, + { + "epoch": 0.12805690360643257, + "grad_norm": 1.4103713035583496, + "learning_rate": 4.800422957271548e-05, + "loss": 4.8084, + "step": 21532 + }, + { + "epoch": 0.1280628508897136, + "grad_norm": 1.3920261859893799, + "learning_rate": 4.800404668982518e-05, + "loss": 4.9744, + "step": 21533 + }, + { + "epoch": 0.12806879817299457, + "grad_norm": 1.2541594505310059, + "learning_rate": 4.8003863798904395e-05, + "loss": 5.024, + "step": 21534 + }, + { + "epoch": 0.12807474545627556, + "grad_norm": 1.2717599868774414, + "learning_rate": 4.80036808999532e-05, + "loss": 4.9402, + "step": 21535 + }, + { + "epoch": 0.12808069273955658, + "grad_norm": 1.168628215789795, + "learning_rate": 4.8003497992971656e-05, + "loss": 4.8391, + "step": 21536 + }, + { + "epoch": 0.12808664002283757, + "grad_norm": 1.2135813236236572, + "learning_rate": 4.800331507795984e-05, + "loss": 4.9725, + "step": 21537 + }, + { + "epoch": 0.12809258730611855, + "grad_norm": 1.2984068393707275, + "learning_rate": 4.8003132154917795e-05, + "loss": 4.8946, + "step": 21538 + }, + { + "epoch": 0.12809853458939957, + "grad_norm": 1.1610583066940308, + "learning_rate": 4.8002949223845595e-05, + "loss": 4.9362, + "step": 21539 + }, + { + "epoch": 0.12810448187268056, + "grad_norm": 1.357981562614441, + "learning_rate": 4.8002766284743306e-05, + "loss": 4.6936, + "step": 21540 + }, + { + "epoch": 0.12811042915596155, + "grad_norm": 1.437784194946289, + "learning_rate": 4.800258333761098e-05, + "loss": 4.7942, + "step": 21541 + }, + { + "epoch": 0.12811637643924256, + "grad_norm": 1.364261507987976, + "learning_rate": 4.8002400382448704e-05, + "loss": 4.763, + "step": 21542 + }, + { + "epoch": 0.12812232372252355, + "grad_norm": 1.3244688510894775, + "learning_rate": 4.800221741925652e-05, + "loss": 4.8804, + "step": 21543 + }, + { + "epoch": 0.12812827100580454, + "grad_norm": 1.5480263233184814, + "learning_rate": 4.80020344480345e-05, + "loss": 4.6523, + "step": 21544 + }, + { + "epoch": 0.12813421828908556, + "grad_norm": 1.2875494956970215, + "learning_rate": 4.800185146878271e-05, + "loss": 4.6137, + "step": 21545 + }, + { + "epoch": 0.12814016557236654, + "grad_norm": 1.1969667673110962, + "learning_rate": 4.80016684815012e-05, + "loss": 5.1034, + "step": 21546 + }, + { + "epoch": 0.12814611285564753, + "grad_norm": 1.3188492059707642, + "learning_rate": 4.8001485486190064e-05, + "loss": 5.078, + "step": 21547 + }, + { + "epoch": 0.12815206013892855, + "grad_norm": 1.2246590852737427, + "learning_rate": 4.800130248284934e-05, + "loss": 5.0404, + "step": 21548 + }, + { + "epoch": 0.12815800742220954, + "grad_norm": 1.2853569984436035, + "learning_rate": 4.800111947147909e-05, + "loss": 4.9271, + "step": 21549 + }, + { + "epoch": 0.12816395470549052, + "grad_norm": 1.1865004301071167, + "learning_rate": 4.8000936452079395e-05, + "loss": 4.8657, + "step": 21550 + }, + { + "epoch": 0.12816990198877154, + "grad_norm": 1.4134557247161865, + "learning_rate": 4.8000753424650306e-05, + "loss": 4.5964, + "step": 21551 + }, + { + "epoch": 0.12817584927205253, + "grad_norm": 1.3943791389465332, + "learning_rate": 4.8000570389191894e-05, + "loss": 4.7792, + "step": 21552 + }, + { + "epoch": 0.12818179655533352, + "grad_norm": 1.5506455898284912, + "learning_rate": 4.8000387345704225e-05, + "loss": 4.767, + "step": 21553 + }, + { + "epoch": 0.12818774383861453, + "grad_norm": 1.516860008239746, + "learning_rate": 4.8000204294187356e-05, + "loss": 4.8412, + "step": 21554 + }, + { + "epoch": 0.12819369112189552, + "grad_norm": 1.3515304327011108, + "learning_rate": 4.8000021234641345e-05, + "loss": 4.7443, + "step": 21555 + }, + { + "epoch": 0.1281996384051765, + "grad_norm": 1.4094910621643066, + "learning_rate": 4.7999838167066276e-05, + "loss": 4.8343, + "step": 21556 + }, + { + "epoch": 0.12820558568845752, + "grad_norm": 1.3746453523635864, + "learning_rate": 4.7999655091462195e-05, + "loss": 4.6913, + "step": 21557 + }, + { + "epoch": 0.1282115329717385, + "grad_norm": 1.4625654220581055, + "learning_rate": 4.799947200782917e-05, + "loss": 4.8412, + "step": 21558 + }, + { + "epoch": 0.1282174802550195, + "grad_norm": 1.3790411949157715, + "learning_rate": 4.7999288916167275e-05, + "loss": 4.5777, + "step": 21559 + }, + { + "epoch": 0.12822342753830052, + "grad_norm": 1.4020804166793823, + "learning_rate": 4.799910581647656e-05, + "loss": 4.8728, + "step": 21560 + }, + { + "epoch": 0.1282293748215815, + "grad_norm": 1.2850565910339355, + "learning_rate": 4.799892270875709e-05, + "loss": 4.9687, + "step": 21561 + }, + { + "epoch": 0.1282353221048625, + "grad_norm": 1.4895892143249512, + "learning_rate": 4.799873959300894e-05, + "loss": 4.9786, + "step": 21562 + }, + { + "epoch": 0.1282412693881435, + "grad_norm": 1.149808406829834, + "learning_rate": 4.799855646923217e-05, + "loss": 4.9924, + "step": 21563 + }, + { + "epoch": 0.1282472166714245, + "grad_norm": 1.3952314853668213, + "learning_rate": 4.799837333742684e-05, + "loss": 4.9225, + "step": 21564 + }, + { + "epoch": 0.12825316395470548, + "grad_norm": 1.271844744682312, + "learning_rate": 4.799819019759301e-05, + "loss": 4.9967, + "step": 21565 + }, + { + "epoch": 0.1282591112379865, + "grad_norm": 1.3351553678512573, + "learning_rate": 4.799800704973075e-05, + "loss": 4.9089, + "step": 21566 + }, + { + "epoch": 0.1282650585212675, + "grad_norm": 1.2077351808547974, + "learning_rate": 4.799782389384013e-05, + "loss": 4.8948, + "step": 21567 + }, + { + "epoch": 0.12827100580454848, + "grad_norm": 1.6159747838974, + "learning_rate": 4.79976407299212e-05, + "loss": 4.6636, + "step": 21568 + }, + { + "epoch": 0.1282769530878295, + "grad_norm": 1.4904805421829224, + "learning_rate": 4.7997457557974035e-05, + "loss": 4.8164, + "step": 21569 + }, + { + "epoch": 0.12828290037111048, + "grad_norm": 1.2312726974487305, + "learning_rate": 4.79972743779987e-05, + "loss": 4.8022, + "step": 21570 + }, + { + "epoch": 0.12828884765439147, + "grad_norm": 1.3150570392608643, + "learning_rate": 4.799709118999525e-05, + "loss": 4.7237, + "step": 21571 + }, + { + "epoch": 0.12829479493767248, + "grad_norm": 1.441749930381775, + "learning_rate": 4.799690799396375e-05, + "loss": 4.8704, + "step": 21572 + }, + { + "epoch": 0.12830074222095347, + "grad_norm": 1.4237558841705322, + "learning_rate": 4.799672478990427e-05, + "loss": 4.9428, + "step": 21573 + }, + { + "epoch": 0.12830668950423446, + "grad_norm": 1.5440024137496948, + "learning_rate": 4.7996541577816867e-05, + "loss": 4.7546, + "step": 21574 + }, + { + "epoch": 0.12831263678751545, + "grad_norm": 1.2962610721588135, + "learning_rate": 4.799635835770161e-05, + "loss": 4.9324, + "step": 21575 + }, + { + "epoch": 0.12831858407079647, + "grad_norm": 2.1041312217712402, + "learning_rate": 4.799617512955857e-05, + "loss": 5.2894, + "step": 21576 + }, + { + "epoch": 0.12832453135407745, + "grad_norm": 1.3591945171356201, + "learning_rate": 4.7995991893387796e-05, + "loss": 4.6942, + "step": 21577 + }, + { + "epoch": 0.12833047863735844, + "grad_norm": 1.2474287748336792, + "learning_rate": 4.799580864918936e-05, + "loss": 5.0003, + "step": 21578 + }, + { + "epoch": 0.12833642592063946, + "grad_norm": 1.4604638814926147, + "learning_rate": 4.7995625396963326e-05, + "loss": 4.8608, + "step": 21579 + }, + { + "epoch": 0.12834237320392045, + "grad_norm": 1.5033100843429565, + "learning_rate": 4.7995442136709755e-05, + "loss": 4.9221, + "step": 21580 + }, + { + "epoch": 0.12834832048720143, + "grad_norm": 1.4712806940078735, + "learning_rate": 4.799525886842872e-05, + "loss": 4.9657, + "step": 21581 + }, + { + "epoch": 0.12835426777048245, + "grad_norm": 1.4505717754364014, + "learning_rate": 4.799507559212026e-05, + "loss": 4.7913, + "step": 21582 + }, + { + "epoch": 0.12836021505376344, + "grad_norm": 1.6151630878448486, + "learning_rate": 4.7994892307784466e-05, + "loss": 4.6494, + "step": 21583 + }, + { + "epoch": 0.12836616233704443, + "grad_norm": 1.5356489419937134, + "learning_rate": 4.79947090154214e-05, + "loss": 4.5596, + "step": 21584 + }, + { + "epoch": 0.12837210962032544, + "grad_norm": 1.5046836137771606, + "learning_rate": 4.7994525715031114e-05, + "loss": 4.6486, + "step": 21585 + }, + { + "epoch": 0.12837805690360643, + "grad_norm": 1.413750171661377, + "learning_rate": 4.799434240661367e-05, + "loss": 4.8878, + "step": 21586 + }, + { + "epoch": 0.12838400418688742, + "grad_norm": 1.3955304622650146, + "learning_rate": 4.799415909016915e-05, + "loss": 5.1577, + "step": 21587 + }, + { + "epoch": 0.12838995147016843, + "grad_norm": 1.5791069269180298, + "learning_rate": 4.79939757656976e-05, + "loss": 5.1712, + "step": 21588 + }, + { + "epoch": 0.12839589875344942, + "grad_norm": 1.3384202718734741, + "learning_rate": 4.799379243319909e-05, + "loss": 5.1534, + "step": 21589 + }, + { + "epoch": 0.1284018460367304, + "grad_norm": 1.4390661716461182, + "learning_rate": 4.7993609092673684e-05, + "loss": 5.3616, + "step": 21590 + }, + { + "epoch": 0.12840779332001143, + "grad_norm": 1.3923462629318237, + "learning_rate": 4.799342574412145e-05, + "loss": 5.2225, + "step": 21591 + }, + { + "epoch": 0.12841374060329241, + "grad_norm": 1.2241096496582031, + "learning_rate": 4.799324238754245e-05, + "loss": 5.2419, + "step": 21592 + }, + { + "epoch": 0.1284196878865734, + "grad_norm": 1.3041672706604004, + "learning_rate": 4.799305902293674e-05, + "loss": 5.0903, + "step": 21593 + }, + { + "epoch": 0.12842563516985442, + "grad_norm": 1.2822580337524414, + "learning_rate": 4.799287565030439e-05, + "loss": 5.1304, + "step": 21594 + }, + { + "epoch": 0.1284315824531354, + "grad_norm": 1.4155261516571045, + "learning_rate": 4.7992692269645475e-05, + "loss": 5.2332, + "step": 21595 + }, + { + "epoch": 0.1284375297364164, + "grad_norm": 1.4972230195999146, + "learning_rate": 4.799250888096004e-05, + "loss": 5.0588, + "step": 21596 + }, + { + "epoch": 0.1284434770196974, + "grad_norm": 1.3301728963851929, + "learning_rate": 4.799232548424816e-05, + "loss": 5.0401, + "step": 21597 + }, + { + "epoch": 0.1284494243029784, + "grad_norm": 1.2775028944015503, + "learning_rate": 4.799214207950989e-05, + "loss": 4.877, + "step": 21598 + }, + { + "epoch": 0.1284553715862594, + "grad_norm": 1.1996419429779053, + "learning_rate": 4.799195866674532e-05, + "loss": 4.9223, + "step": 21599 + }, + { + "epoch": 0.1284613188695404, + "grad_norm": 1.1330626010894775, + "learning_rate": 4.7991775245954477e-05, + "loss": 4.9224, + "step": 21600 + }, + { + "epoch": 0.1284672661528214, + "grad_norm": 1.3013830184936523, + "learning_rate": 4.7991591817137446e-05, + "loss": 5.1005, + "step": 21601 + }, + { + "epoch": 0.12847321343610238, + "grad_norm": 1.2901992797851562, + "learning_rate": 4.79914083802943e-05, + "loss": 4.9554, + "step": 21602 + }, + { + "epoch": 0.1284791607193834, + "grad_norm": 1.4342957735061646, + "learning_rate": 4.799122493542507e-05, + "loss": 4.9685, + "step": 21603 + }, + { + "epoch": 0.12848510800266438, + "grad_norm": 1.2227423191070557, + "learning_rate": 4.7991041482529856e-05, + "loss": 4.9219, + "step": 21604 + }, + { + "epoch": 0.12849105528594537, + "grad_norm": 1.2947163581848145, + "learning_rate": 4.7990858021608705e-05, + "loss": 4.9747, + "step": 21605 + }, + { + "epoch": 0.1284970025692264, + "grad_norm": 1.2928695678710938, + "learning_rate": 4.799067455266168e-05, + "loss": 5.0456, + "step": 21606 + }, + { + "epoch": 0.12850294985250738, + "grad_norm": 1.461930513381958, + "learning_rate": 4.799049107568885e-05, + "loss": 4.8518, + "step": 21607 + }, + { + "epoch": 0.12850889713578836, + "grad_norm": 1.4009983539581299, + "learning_rate": 4.799030759069028e-05, + "loss": 4.8761, + "step": 21608 + }, + { + "epoch": 0.12851484441906938, + "grad_norm": 1.2762218713760376, + "learning_rate": 4.799012409766602e-05, + "loss": 4.8551, + "step": 21609 + }, + { + "epoch": 0.12852079170235037, + "grad_norm": 1.3359547853469849, + "learning_rate": 4.7989940596616156e-05, + "loss": 4.7933, + "step": 21610 + }, + { + "epoch": 0.12852673898563136, + "grad_norm": 1.4515223503112793, + "learning_rate": 4.7989757087540735e-05, + "loss": 4.8432, + "step": 21611 + }, + { + "epoch": 0.12853268626891237, + "grad_norm": 1.445410966873169, + "learning_rate": 4.7989573570439825e-05, + "loss": 5.0115, + "step": 21612 + }, + { + "epoch": 0.12853863355219336, + "grad_norm": 1.4424355030059814, + "learning_rate": 4.79893900453135e-05, + "loss": 4.9, + "step": 21613 + }, + { + "epoch": 0.12854458083547435, + "grad_norm": 1.2938885688781738, + "learning_rate": 4.798920651216182e-05, + "loss": 4.7918, + "step": 21614 + }, + { + "epoch": 0.12855052811875536, + "grad_norm": 1.3097805976867676, + "learning_rate": 4.798902297098484e-05, + "loss": 4.7449, + "step": 21615 + }, + { + "epoch": 0.12855647540203635, + "grad_norm": 1.5416840314865112, + "learning_rate": 4.798883942178263e-05, + "loss": 5.3092, + "step": 21616 + }, + { + "epoch": 0.12856242268531734, + "grad_norm": 1.339882493019104, + "learning_rate": 4.798865586455525e-05, + "loss": 5.2832, + "step": 21617 + }, + { + "epoch": 0.12856836996859836, + "grad_norm": 1.2793277502059937, + "learning_rate": 4.7988472299302764e-05, + "loss": 4.9532, + "step": 21618 + }, + { + "epoch": 0.12857431725187934, + "grad_norm": 1.3368133306503296, + "learning_rate": 4.7988288726025254e-05, + "loss": 5.0795, + "step": 21619 + }, + { + "epoch": 0.12858026453516033, + "grad_norm": 1.4083633422851562, + "learning_rate": 4.7988105144722764e-05, + "loss": 5.3231, + "step": 21620 + }, + { + "epoch": 0.12858621181844135, + "grad_norm": 1.4018146991729736, + "learning_rate": 4.7987921555395356e-05, + "loss": 5.0031, + "step": 21621 + }, + { + "epoch": 0.12859215910172234, + "grad_norm": 1.2982511520385742, + "learning_rate": 4.798773795804311e-05, + "loss": 4.9553, + "step": 21622 + }, + { + "epoch": 0.12859810638500332, + "grad_norm": 1.2939512729644775, + "learning_rate": 4.798755435266607e-05, + "loss": 4.9096, + "step": 21623 + }, + { + "epoch": 0.12860405366828434, + "grad_norm": 1.2920591831207275, + "learning_rate": 4.7987370739264334e-05, + "loss": 4.8198, + "step": 21624 + }, + { + "epoch": 0.12861000095156533, + "grad_norm": 1.537635326385498, + "learning_rate": 4.798718711783793e-05, + "loss": 4.9656, + "step": 21625 + }, + { + "epoch": 0.12861594823484632, + "grad_norm": 1.4374878406524658, + "learning_rate": 4.798700348838694e-05, + "loss": 5.022, + "step": 21626 + }, + { + "epoch": 0.12862189551812733, + "grad_norm": 1.4768397808074951, + "learning_rate": 4.798681985091142e-05, + "loss": 5.1965, + "step": 21627 + }, + { + "epoch": 0.12862784280140832, + "grad_norm": 1.370009183883667, + "learning_rate": 4.798663620541145e-05, + "loss": 5.049, + "step": 21628 + }, + { + "epoch": 0.1286337900846893, + "grad_norm": 1.309531569480896, + "learning_rate": 4.7986452551887076e-05, + "loss": 4.9583, + "step": 21629 + }, + { + "epoch": 0.12863973736797032, + "grad_norm": 1.3303570747375488, + "learning_rate": 4.7986268890338365e-05, + "loss": 5.0708, + "step": 21630 + }, + { + "epoch": 0.1286456846512513, + "grad_norm": 1.389640212059021, + "learning_rate": 4.7986085220765385e-05, + "loss": 5.0744, + "step": 21631 + }, + { + "epoch": 0.1286516319345323, + "grad_norm": 1.198508620262146, + "learning_rate": 4.798590154316821e-05, + "loss": 5.0152, + "step": 21632 + }, + { + "epoch": 0.1286575792178133, + "grad_norm": 1.3534667491912842, + "learning_rate": 4.7985717857546886e-05, + "loss": 5.0292, + "step": 21633 + }, + { + "epoch": 0.1286635265010943, + "grad_norm": 1.4618093967437744, + "learning_rate": 4.798553416390149e-05, + "loss": 5.0733, + "step": 21634 + }, + { + "epoch": 0.1286694737843753, + "grad_norm": 1.4006026983261108, + "learning_rate": 4.798535046223207e-05, + "loss": 5.0071, + "step": 21635 + }, + { + "epoch": 0.12867542106765628, + "grad_norm": 1.4667402505874634, + "learning_rate": 4.7985166752538714e-05, + "loss": 4.8829, + "step": 21636 + }, + { + "epoch": 0.1286813683509373, + "grad_norm": 1.2916743755340576, + "learning_rate": 4.798498303482147e-05, + "loss": 4.9049, + "step": 21637 + }, + { + "epoch": 0.12868731563421829, + "grad_norm": 1.400270700454712, + "learning_rate": 4.798479930908041e-05, + "loss": 5.1051, + "step": 21638 + }, + { + "epoch": 0.12869326291749927, + "grad_norm": 1.3317632675170898, + "learning_rate": 4.798461557531558e-05, + "loss": 4.7864, + "step": 21639 + }, + { + "epoch": 0.1286992102007803, + "grad_norm": 1.1226558685302734, + "learning_rate": 4.7984431833527074e-05, + "loss": 4.8598, + "step": 21640 + }, + { + "epoch": 0.12870515748406128, + "grad_norm": 1.2921690940856934, + "learning_rate": 4.7984248083714934e-05, + "loss": 4.8687, + "step": 21641 + }, + { + "epoch": 0.12871110476734227, + "grad_norm": 1.2811640501022339, + "learning_rate": 4.798406432587923e-05, + "loss": 4.7438, + "step": 21642 + }, + { + "epoch": 0.12871705205062328, + "grad_norm": 1.1892732381820679, + "learning_rate": 4.7983880560020026e-05, + "loss": 4.681, + "step": 21643 + }, + { + "epoch": 0.12872299933390427, + "grad_norm": 1.3800525665283203, + "learning_rate": 4.7983696786137386e-05, + "loss": 4.9215, + "step": 21644 + }, + { + "epoch": 0.12872894661718526, + "grad_norm": 1.2753770351409912, + "learning_rate": 4.7983513004231385e-05, + "loss": 5.0006, + "step": 21645 + }, + { + "epoch": 0.12873489390046627, + "grad_norm": 1.494894027709961, + "learning_rate": 4.7983329214302064e-05, + "loss": 4.9356, + "step": 21646 + }, + { + "epoch": 0.12874084118374726, + "grad_norm": 1.3660098314285278, + "learning_rate": 4.7983145416349505e-05, + "loss": 5.3071, + "step": 21647 + }, + { + "epoch": 0.12874678846702825, + "grad_norm": 1.3494385480880737, + "learning_rate": 4.798296161037377e-05, + "loss": 5.3493, + "step": 21648 + }, + { + "epoch": 0.12875273575030927, + "grad_norm": 1.2632153034210205, + "learning_rate": 4.798277779637492e-05, + "loss": 4.9825, + "step": 21649 + }, + { + "epoch": 0.12875868303359025, + "grad_norm": 1.3519765138626099, + "learning_rate": 4.7982593974353015e-05, + "loss": 4.9032, + "step": 21650 + }, + { + "epoch": 0.12876463031687124, + "grad_norm": 1.3728691339492798, + "learning_rate": 4.798241014430813e-05, + "loss": 5.0458, + "step": 21651 + }, + { + "epoch": 0.12877057760015226, + "grad_norm": 1.326675295829773, + "learning_rate": 4.798222630624032e-05, + "loss": 4.9129, + "step": 21652 + }, + { + "epoch": 0.12877652488343325, + "grad_norm": 1.4878405332565308, + "learning_rate": 4.798204246014965e-05, + "loss": 5.1253, + "step": 21653 + }, + { + "epoch": 0.12878247216671423, + "grad_norm": 1.322288990020752, + "learning_rate": 4.798185860603619e-05, + "loss": 5.1333, + "step": 21654 + }, + { + "epoch": 0.12878841944999525, + "grad_norm": 1.496812343597412, + "learning_rate": 4.7981674743899995e-05, + "loss": 5.0263, + "step": 21655 + }, + { + "epoch": 0.12879436673327624, + "grad_norm": 1.4336779117584229, + "learning_rate": 4.7981490873741144e-05, + "loss": 5.1177, + "step": 21656 + }, + { + "epoch": 0.12880031401655723, + "grad_norm": 1.380751132965088, + "learning_rate": 4.7981306995559684e-05, + "loss": 5.0884, + "step": 21657 + }, + { + "epoch": 0.12880626129983824, + "grad_norm": 1.3929660320281982, + "learning_rate": 4.798112310935569e-05, + "loss": 5.3662, + "step": 21658 + }, + { + "epoch": 0.12881220858311923, + "grad_norm": 1.2857346534729004, + "learning_rate": 4.798093921512923e-05, + "loss": 5.2264, + "step": 21659 + }, + { + "epoch": 0.12881815586640022, + "grad_norm": 1.2468816041946411, + "learning_rate": 4.798075531288035e-05, + "loss": 4.8248, + "step": 21660 + }, + { + "epoch": 0.12882410314968123, + "grad_norm": 1.43264901638031, + "learning_rate": 4.798057140260913e-05, + "loss": 5.3999, + "step": 21661 + }, + { + "epoch": 0.12883005043296222, + "grad_norm": 1.3590344190597534, + "learning_rate": 4.798038748431563e-05, + "loss": 5.1312, + "step": 21662 + }, + { + "epoch": 0.1288359977162432, + "grad_norm": 1.4812084436416626, + "learning_rate": 4.7980203557999915e-05, + "loss": 4.7615, + "step": 21663 + }, + { + "epoch": 0.12884194499952423, + "grad_norm": 1.4256600141525269, + "learning_rate": 4.798001962366205e-05, + "loss": 4.8678, + "step": 21664 + }, + { + "epoch": 0.12884789228280522, + "grad_norm": 1.1849418878555298, + "learning_rate": 4.7979835681302095e-05, + "loss": 4.8823, + "step": 21665 + }, + { + "epoch": 0.1288538395660862, + "grad_norm": 1.395228385925293, + "learning_rate": 4.7979651730920116e-05, + "loss": 4.682, + "step": 21666 + }, + { + "epoch": 0.12885978684936722, + "grad_norm": 1.2800064086914062, + "learning_rate": 4.7979467772516186e-05, + "loss": 4.7797, + "step": 21667 + }, + { + "epoch": 0.1288657341326482, + "grad_norm": 1.3429536819458008, + "learning_rate": 4.7979283806090346e-05, + "loss": 4.7517, + "step": 21668 + }, + { + "epoch": 0.1288716814159292, + "grad_norm": 1.359732747077942, + "learning_rate": 4.797909983164269e-05, + "loss": 4.7123, + "step": 21669 + }, + { + "epoch": 0.1288776286992102, + "grad_norm": 1.2731539011001587, + "learning_rate": 4.7978915849173254e-05, + "loss": 4.7211, + "step": 21670 + }, + { + "epoch": 0.1288835759824912, + "grad_norm": 1.3688287734985352, + "learning_rate": 4.797873185868213e-05, + "loss": 4.7257, + "step": 21671 + }, + { + "epoch": 0.1288895232657722, + "grad_norm": 1.4043165445327759, + "learning_rate": 4.797854786016936e-05, + "loss": 4.8099, + "step": 21672 + }, + { + "epoch": 0.1288954705490532, + "grad_norm": 1.3721412420272827, + "learning_rate": 4.797836385363502e-05, + "loss": 4.7698, + "step": 21673 + }, + { + "epoch": 0.1289014178323342, + "grad_norm": 1.4348787069320679, + "learning_rate": 4.797817983907917e-05, + "loss": 4.7587, + "step": 21674 + }, + { + "epoch": 0.12890736511561518, + "grad_norm": 1.133793592453003, + "learning_rate": 4.797799581650187e-05, + "loss": 4.8101, + "step": 21675 + }, + { + "epoch": 0.1289133123988962, + "grad_norm": 1.3624104261398315, + "learning_rate": 4.797781178590319e-05, + "loss": 4.7416, + "step": 21676 + }, + { + "epoch": 0.12891925968217718, + "grad_norm": 1.5194214582443237, + "learning_rate": 4.7977627747283196e-05, + "loss": 4.6894, + "step": 21677 + }, + { + "epoch": 0.12892520696545817, + "grad_norm": 1.3625789880752563, + "learning_rate": 4.7977443700641954e-05, + "loss": 4.8029, + "step": 21678 + }, + { + "epoch": 0.1289311542487392, + "grad_norm": 1.2961907386779785, + "learning_rate": 4.797725964597952e-05, + "loss": 4.718, + "step": 21679 + }, + { + "epoch": 0.12893710153202018, + "grad_norm": 1.4091925621032715, + "learning_rate": 4.797707558329596e-05, + "loss": 4.7604, + "step": 21680 + }, + { + "epoch": 0.12894304881530116, + "grad_norm": 1.2274402379989624, + "learning_rate": 4.797689151259134e-05, + "loss": 4.8241, + "step": 21681 + }, + { + "epoch": 0.12894899609858218, + "grad_norm": 1.3694384098052979, + "learning_rate": 4.797670743386573e-05, + "loss": 4.7724, + "step": 21682 + }, + { + "epoch": 0.12895494338186317, + "grad_norm": 1.3621066808700562, + "learning_rate": 4.7976523347119184e-05, + "loss": 4.685, + "step": 21683 + }, + { + "epoch": 0.12896089066514416, + "grad_norm": 1.418641209602356, + "learning_rate": 4.7976339252351766e-05, + "loss": 4.7379, + "step": 21684 + }, + { + "epoch": 0.12896683794842517, + "grad_norm": 1.3113913536071777, + "learning_rate": 4.797615514956355e-05, + "loss": 4.7922, + "step": 21685 + }, + { + "epoch": 0.12897278523170616, + "grad_norm": 1.3266078233718872, + "learning_rate": 4.79759710387546e-05, + "loss": 4.7116, + "step": 21686 + }, + { + "epoch": 0.12897873251498715, + "grad_norm": 1.5212455987930298, + "learning_rate": 4.7975786919924975e-05, + "loss": 4.8422, + "step": 21687 + }, + { + "epoch": 0.12898467979826816, + "grad_norm": 1.225883960723877, + "learning_rate": 4.797560279307473e-05, + "loss": 4.8641, + "step": 21688 + }, + { + "epoch": 0.12899062708154915, + "grad_norm": 1.451951026916504, + "learning_rate": 4.797541865820395e-05, + "loss": 4.7685, + "step": 21689 + }, + { + "epoch": 0.12899657436483014, + "grad_norm": 1.3755689859390259, + "learning_rate": 4.7975234515312694e-05, + "loss": 4.7828, + "step": 21690 + }, + { + "epoch": 0.12900252164811113, + "grad_norm": 1.2667524814605713, + "learning_rate": 4.797505036440101e-05, + "loss": 4.6897, + "step": 21691 + }, + { + "epoch": 0.12900846893139215, + "grad_norm": 1.4491240978240967, + "learning_rate": 4.797486620546898e-05, + "loss": 4.8052, + "step": 21692 + }, + { + "epoch": 0.12901441621467313, + "grad_norm": 1.21664559841156, + "learning_rate": 4.797468203851665e-05, + "loss": 4.712, + "step": 21693 + }, + { + "epoch": 0.12902036349795412, + "grad_norm": 1.3836992979049683, + "learning_rate": 4.797449786354411e-05, + "loss": 4.6642, + "step": 21694 + }, + { + "epoch": 0.12902631078123514, + "grad_norm": 1.4487723112106323, + "learning_rate": 4.79743136805514e-05, + "loss": 4.7088, + "step": 21695 + }, + { + "epoch": 0.12903225806451613, + "grad_norm": 1.2507479190826416, + "learning_rate": 4.79741294895386e-05, + "loss": 4.8429, + "step": 21696 + }, + { + "epoch": 0.1290382053477971, + "grad_norm": 1.231549620628357, + "learning_rate": 4.7973945290505766e-05, + "loss": 4.9336, + "step": 21697 + }, + { + "epoch": 0.12904415263107813, + "grad_norm": 1.2317709922790527, + "learning_rate": 4.797376108345297e-05, + "loss": 4.6885, + "step": 21698 + }, + { + "epoch": 0.12905009991435912, + "grad_norm": 1.2158896923065186, + "learning_rate": 4.797357686838026e-05, + "loss": 4.7068, + "step": 21699 + }, + { + "epoch": 0.1290560471976401, + "grad_norm": 1.5054548978805542, + "learning_rate": 4.7973392645287726e-05, + "loss": 4.8568, + "step": 21700 + }, + { + "epoch": 0.12906199448092112, + "grad_norm": 1.1551764011383057, + "learning_rate": 4.7973208414175406e-05, + "loss": 4.6746, + "step": 21701 + }, + { + "epoch": 0.1290679417642021, + "grad_norm": 1.3304046392440796, + "learning_rate": 4.7973024175043386e-05, + "loss": 4.8012, + "step": 21702 + }, + { + "epoch": 0.1290738890474831, + "grad_norm": 1.4763063192367554, + "learning_rate": 4.797283992789172e-05, + "loss": 4.7412, + "step": 21703 + }, + { + "epoch": 0.12907983633076411, + "grad_norm": 1.247894287109375, + "learning_rate": 4.797265567272047e-05, + "loss": 4.7786, + "step": 21704 + }, + { + "epoch": 0.1290857836140451, + "grad_norm": 1.3668314218521118, + "learning_rate": 4.79724714095297e-05, + "loss": 4.7728, + "step": 21705 + }, + { + "epoch": 0.1290917308973261, + "grad_norm": 1.3727326393127441, + "learning_rate": 4.7972287138319477e-05, + "loss": 4.8493, + "step": 21706 + }, + { + "epoch": 0.1290976781806071, + "grad_norm": 1.3531663417816162, + "learning_rate": 4.797210285908987e-05, + "loss": 4.7598, + "step": 21707 + }, + { + "epoch": 0.1291036254638881, + "grad_norm": 1.4899832010269165, + "learning_rate": 4.797191857184094e-05, + "loss": 4.7274, + "step": 21708 + }, + { + "epoch": 0.12910957274716908, + "grad_norm": 1.3908995389938354, + "learning_rate": 4.7971734276572744e-05, + "loss": 4.9911, + "step": 21709 + }, + { + "epoch": 0.1291155200304501, + "grad_norm": 1.621774435043335, + "learning_rate": 4.7971549973285357e-05, + "loss": 5.0285, + "step": 21710 + }, + { + "epoch": 0.1291214673137311, + "grad_norm": 1.415650725364685, + "learning_rate": 4.797136566197884e-05, + "loss": 5.0959, + "step": 21711 + }, + { + "epoch": 0.12912741459701207, + "grad_norm": 1.4947463274002075, + "learning_rate": 4.797118134265326e-05, + "loss": 4.9473, + "step": 21712 + }, + { + "epoch": 0.1291333618802931, + "grad_norm": 1.45017409324646, + "learning_rate": 4.7970997015308674e-05, + "loss": 4.9928, + "step": 21713 + }, + { + "epoch": 0.12913930916357408, + "grad_norm": 1.2527333498001099, + "learning_rate": 4.7970812679945145e-05, + "loss": 4.9771, + "step": 21714 + }, + { + "epoch": 0.12914525644685507, + "grad_norm": 1.457526683807373, + "learning_rate": 4.797062833656275e-05, + "loss": 5.0143, + "step": 21715 + }, + { + "epoch": 0.12915120373013608, + "grad_norm": 1.1785821914672852, + "learning_rate": 4.7970443985161546e-05, + "loss": 5.1313, + "step": 21716 + }, + { + "epoch": 0.12915715101341707, + "grad_norm": 1.3593906164169312, + "learning_rate": 4.79702596257416e-05, + "loss": 5.0906, + "step": 21717 + }, + { + "epoch": 0.12916309829669806, + "grad_norm": 1.3789628744125366, + "learning_rate": 4.797007525830296e-05, + "loss": 5.0352, + "step": 21718 + }, + { + "epoch": 0.12916904557997907, + "grad_norm": 1.282631278038025, + "learning_rate": 4.796989088284571e-05, + "loss": 4.9973, + "step": 21719 + }, + { + "epoch": 0.12917499286326006, + "grad_norm": 1.2933098077774048, + "learning_rate": 4.796970649936991e-05, + "loss": 5.0783, + "step": 21720 + }, + { + "epoch": 0.12918094014654105, + "grad_norm": 1.3595205545425415, + "learning_rate": 4.796952210787563e-05, + "loss": 5.158, + "step": 21721 + }, + { + "epoch": 0.12918688742982207, + "grad_norm": 1.3962088823318481, + "learning_rate": 4.796933770836293e-05, + "loss": 4.9939, + "step": 21722 + }, + { + "epoch": 0.12919283471310306, + "grad_norm": 1.382554292678833, + "learning_rate": 4.796915330083186e-05, + "loss": 4.8864, + "step": 21723 + }, + { + "epoch": 0.12919878199638404, + "grad_norm": 1.3807674646377563, + "learning_rate": 4.7968968885282495e-05, + "loss": 5.0454, + "step": 21724 + }, + { + "epoch": 0.12920472927966506, + "grad_norm": 1.276809811592102, + "learning_rate": 4.7968784461714905e-05, + "loss": 5.1221, + "step": 21725 + }, + { + "epoch": 0.12921067656294605, + "grad_norm": 1.230714201927185, + "learning_rate": 4.796860003012915e-05, + "loss": 5.0382, + "step": 21726 + }, + { + "epoch": 0.12921662384622704, + "grad_norm": 1.2899225950241089, + "learning_rate": 4.796841559052529e-05, + "loss": 4.8591, + "step": 21727 + }, + { + "epoch": 0.12922257112950805, + "grad_norm": 1.3561869859695435, + "learning_rate": 4.79682311429034e-05, + "loss": 4.8811, + "step": 21728 + }, + { + "epoch": 0.12922851841278904, + "grad_norm": 1.600656270980835, + "learning_rate": 4.796804668726353e-05, + "loss": 4.9317, + "step": 21729 + }, + { + "epoch": 0.12923446569607003, + "grad_norm": 1.4110677242279053, + "learning_rate": 4.7967862223605756e-05, + "loss": 5.0964, + "step": 21730 + }, + { + "epoch": 0.12924041297935104, + "grad_norm": 1.2293707132339478, + "learning_rate": 4.796767775193014e-05, + "loss": 5.2952, + "step": 21731 + }, + { + "epoch": 0.12924636026263203, + "grad_norm": 1.4413278102874756, + "learning_rate": 4.796749327223674e-05, + "loss": 4.9628, + "step": 21732 + }, + { + "epoch": 0.12925230754591302, + "grad_norm": 1.4178003072738647, + "learning_rate": 4.7967308784525635e-05, + "loss": 4.7142, + "step": 21733 + }, + { + "epoch": 0.12925825482919404, + "grad_norm": 1.2427667379379272, + "learning_rate": 4.7967124288796875e-05, + "loss": 5.2655, + "step": 21734 + }, + { + "epoch": 0.12926420211247502, + "grad_norm": 1.3278542757034302, + "learning_rate": 4.796693978505052e-05, + "loss": 5.0156, + "step": 21735 + }, + { + "epoch": 0.129270149395756, + "grad_norm": 1.3728119134902954, + "learning_rate": 4.7966755273286656e-05, + "loss": 5.4176, + "step": 21736 + }, + { + "epoch": 0.12927609667903703, + "grad_norm": 1.344072937965393, + "learning_rate": 4.796657075350533e-05, + "loss": 4.8808, + "step": 21737 + }, + { + "epoch": 0.12928204396231802, + "grad_norm": 1.2877874374389648, + "learning_rate": 4.796638622570661e-05, + "loss": 5.0312, + "step": 21738 + }, + { + "epoch": 0.129287991245599, + "grad_norm": 1.3147602081298828, + "learning_rate": 4.7966201689890566e-05, + "loss": 5.0241, + "step": 21739 + }, + { + "epoch": 0.12929393852888002, + "grad_norm": 1.3858917951583862, + "learning_rate": 4.796601714605726e-05, + "loss": 4.468, + "step": 21740 + }, + { + "epoch": 0.129299885812161, + "grad_norm": 1.4089725017547607, + "learning_rate": 4.7965832594206747e-05, + "loss": 4.587, + "step": 21741 + }, + { + "epoch": 0.129305833095442, + "grad_norm": 1.4754424095153809, + "learning_rate": 4.796564803433911e-05, + "loss": 4.8697, + "step": 21742 + }, + { + "epoch": 0.129311780378723, + "grad_norm": 1.557544231414795, + "learning_rate": 4.796546346645439e-05, + "loss": 5.058, + "step": 21743 + }, + { + "epoch": 0.129317727662004, + "grad_norm": 1.3962191343307495, + "learning_rate": 4.7965278890552666e-05, + "loss": 5.172, + "step": 21744 + }, + { + "epoch": 0.129323674945285, + "grad_norm": 1.4976222515106201, + "learning_rate": 4.796509430663401e-05, + "loss": 5.2878, + "step": 21745 + }, + { + "epoch": 0.129329622228566, + "grad_norm": 1.3315789699554443, + "learning_rate": 4.796490971469847e-05, + "loss": 5.0468, + "step": 21746 + }, + { + "epoch": 0.129335569511847, + "grad_norm": 1.3718360662460327, + "learning_rate": 4.796472511474611e-05, + "loss": 4.9696, + "step": 21747 + }, + { + "epoch": 0.12934151679512798, + "grad_norm": 1.4873707294464111, + "learning_rate": 4.7964540506777014e-05, + "loss": 4.9281, + "step": 21748 + }, + { + "epoch": 0.12934746407840897, + "grad_norm": 1.3806785345077515, + "learning_rate": 4.7964355890791226e-05, + "loss": 5.1646, + "step": 21749 + }, + { + "epoch": 0.12935341136168998, + "grad_norm": 1.4873976707458496, + "learning_rate": 4.796417126678883e-05, + "loss": 5.1125, + "step": 21750 + }, + { + "epoch": 0.12935935864497097, + "grad_norm": 1.3314671516418457, + "learning_rate": 4.7963986634769864e-05, + "loss": 5.0819, + "step": 21751 + }, + { + "epoch": 0.12936530592825196, + "grad_norm": 1.2392772436141968, + "learning_rate": 4.796380199473442e-05, + "loss": 5.0049, + "step": 21752 + }, + { + "epoch": 0.12937125321153298, + "grad_norm": 1.4799960851669312, + "learning_rate": 4.7963617346682544e-05, + "loss": 4.8518, + "step": 21753 + }, + { + "epoch": 0.12937720049481397, + "grad_norm": 1.5646624565124512, + "learning_rate": 4.796343269061431e-05, + "loss": 4.5612, + "step": 21754 + }, + { + "epoch": 0.12938314777809495, + "grad_norm": 1.5001260042190552, + "learning_rate": 4.796324802652977e-05, + "loss": 4.8736, + "step": 21755 + }, + { + "epoch": 0.12938909506137597, + "grad_norm": 1.4235304594039917, + "learning_rate": 4.7963063354429004e-05, + "loss": 4.9256, + "step": 21756 + }, + { + "epoch": 0.12939504234465696, + "grad_norm": 1.3335869312286377, + "learning_rate": 4.7962878674312075e-05, + "loss": 4.7066, + "step": 21757 + }, + { + "epoch": 0.12940098962793795, + "grad_norm": 1.2664694786071777, + "learning_rate": 4.7962693986179036e-05, + "loss": 4.7202, + "step": 21758 + }, + { + "epoch": 0.12940693691121896, + "grad_norm": 1.2120671272277832, + "learning_rate": 4.7962509290029954e-05, + "loss": 4.8417, + "step": 21759 + }, + { + "epoch": 0.12941288419449995, + "grad_norm": 1.3657382726669312, + "learning_rate": 4.7962324585864906e-05, + "loss": 4.6566, + "step": 21760 + }, + { + "epoch": 0.12941883147778094, + "grad_norm": 1.3212461471557617, + "learning_rate": 4.7962139873683944e-05, + "loss": 4.8251, + "step": 21761 + }, + { + "epoch": 0.12942477876106195, + "grad_norm": 1.9045685529708862, + "learning_rate": 4.7961955153487137e-05, + "loss": 4.5268, + "step": 21762 + }, + { + "epoch": 0.12943072604434294, + "grad_norm": 1.536188006401062, + "learning_rate": 4.7961770425274545e-05, + "loss": 4.8356, + "step": 21763 + }, + { + "epoch": 0.12943667332762393, + "grad_norm": 1.4966436624526978, + "learning_rate": 4.796158568904624e-05, + "loss": 4.485, + "step": 21764 + }, + { + "epoch": 0.12944262061090495, + "grad_norm": 1.377543568611145, + "learning_rate": 4.796140094480228e-05, + "loss": 4.7828, + "step": 21765 + }, + { + "epoch": 0.12944856789418593, + "grad_norm": 1.6093590259552002, + "learning_rate": 4.796121619254273e-05, + "loss": 4.6621, + "step": 21766 + }, + { + "epoch": 0.12945451517746692, + "grad_norm": 1.4633464813232422, + "learning_rate": 4.796103143226767e-05, + "loss": 4.7979, + "step": 21767 + }, + { + "epoch": 0.12946046246074794, + "grad_norm": 1.332219123840332, + "learning_rate": 4.7960846663977136e-05, + "loss": 4.8313, + "step": 21768 + }, + { + "epoch": 0.12946640974402893, + "grad_norm": 1.2190324068069458, + "learning_rate": 4.796066188767121e-05, + "loss": 4.6559, + "step": 21769 + }, + { + "epoch": 0.12947235702730991, + "grad_norm": 1.4958453178405762, + "learning_rate": 4.796047710334996e-05, + "loss": 4.7633, + "step": 21770 + }, + { + "epoch": 0.12947830431059093, + "grad_norm": 1.2693027257919312, + "learning_rate": 4.796029231101344e-05, + "loss": 4.7291, + "step": 21771 + }, + { + "epoch": 0.12948425159387192, + "grad_norm": 1.2988125085830688, + "learning_rate": 4.7960107510661725e-05, + "loss": 4.7817, + "step": 21772 + }, + { + "epoch": 0.1294901988771529, + "grad_norm": 1.355332374572754, + "learning_rate": 4.7959922702294866e-05, + "loss": 4.6112, + "step": 21773 + }, + { + "epoch": 0.12949614616043392, + "grad_norm": 1.3531986474990845, + "learning_rate": 4.7959737885912934e-05, + "loss": 4.7711, + "step": 21774 + }, + { + "epoch": 0.1295020934437149, + "grad_norm": 1.275888204574585, + "learning_rate": 4.7959553061516004e-05, + "loss": 4.9089, + "step": 21775 + }, + { + "epoch": 0.1295080407269959, + "grad_norm": 1.4016762971878052, + "learning_rate": 4.795936822910413e-05, + "loss": 4.8768, + "step": 21776 + }, + { + "epoch": 0.12951398801027691, + "grad_norm": 1.5274311304092407, + "learning_rate": 4.795918338867737e-05, + "loss": 4.7434, + "step": 21777 + }, + { + "epoch": 0.1295199352935579, + "grad_norm": 1.4976401329040527, + "learning_rate": 4.79589985402358e-05, + "loss": 4.992, + "step": 21778 + }, + { + "epoch": 0.1295258825768389, + "grad_norm": 1.5180116891860962, + "learning_rate": 4.795881368377948e-05, + "loss": 5.1312, + "step": 21779 + }, + { + "epoch": 0.1295318298601199, + "grad_norm": 1.3271901607513428, + "learning_rate": 4.795862881930848e-05, + "loss": 5.1021, + "step": 21780 + }, + { + "epoch": 0.1295377771434009, + "grad_norm": 1.5069388151168823, + "learning_rate": 4.795844394682286e-05, + "loss": 4.8872, + "step": 21781 + }, + { + "epoch": 0.12954372442668188, + "grad_norm": 1.4247567653656006, + "learning_rate": 4.795825906632267e-05, + "loss": 5.0028, + "step": 21782 + }, + { + "epoch": 0.1295496717099629, + "grad_norm": 1.4976978302001953, + "learning_rate": 4.795807417780801e-05, + "loss": 5.0181, + "step": 21783 + }, + { + "epoch": 0.1295556189932439, + "grad_norm": 1.291518211364746, + "learning_rate": 4.7957889281278913e-05, + "loss": 4.8314, + "step": 21784 + }, + { + "epoch": 0.12956156627652488, + "grad_norm": 1.352803349494934, + "learning_rate": 4.7957704376735455e-05, + "loss": 4.916, + "step": 21785 + }, + { + "epoch": 0.1295675135598059, + "grad_norm": 1.3911688327789307, + "learning_rate": 4.7957519464177695e-05, + "loss": 5.1256, + "step": 21786 + }, + { + "epoch": 0.12957346084308688, + "grad_norm": 1.2493035793304443, + "learning_rate": 4.795733454360571e-05, + "loss": 4.8268, + "step": 21787 + }, + { + "epoch": 0.12957940812636787, + "grad_norm": 1.4249591827392578, + "learning_rate": 4.7957149615019547e-05, + "loss": 4.8414, + "step": 21788 + }, + { + "epoch": 0.12958535540964888, + "grad_norm": 1.5388774871826172, + "learning_rate": 4.795696467841929e-05, + "loss": 4.6288, + "step": 21789 + }, + { + "epoch": 0.12959130269292987, + "grad_norm": 1.1780091524124146, + "learning_rate": 4.795677973380499e-05, + "loss": 4.5712, + "step": 21790 + }, + { + "epoch": 0.12959724997621086, + "grad_norm": 1.2415392398834229, + "learning_rate": 4.7956594781176716e-05, + "loss": 4.8536, + "step": 21791 + }, + { + "epoch": 0.12960319725949188, + "grad_norm": 1.2828611135482788, + "learning_rate": 4.795640982053453e-05, + "loss": 5.1549, + "step": 21792 + }, + { + "epoch": 0.12960914454277286, + "grad_norm": 1.5143916606903076, + "learning_rate": 4.79562248518785e-05, + "loss": 5.2302, + "step": 21793 + }, + { + "epoch": 0.12961509182605385, + "grad_norm": 1.3260207176208496, + "learning_rate": 4.795603987520869e-05, + "loss": 4.9272, + "step": 21794 + }, + { + "epoch": 0.12962103910933487, + "grad_norm": 1.2133897542953491, + "learning_rate": 4.795585489052516e-05, + "loss": 4.8229, + "step": 21795 + }, + { + "epoch": 0.12962698639261586, + "grad_norm": 1.5181169509887695, + "learning_rate": 4.795566989782798e-05, + "loss": 4.8024, + "step": 21796 + }, + { + "epoch": 0.12963293367589684, + "grad_norm": 1.3889726400375366, + "learning_rate": 4.795548489711722e-05, + "loss": 4.5859, + "step": 21797 + }, + { + "epoch": 0.12963888095917786, + "grad_norm": 1.543861985206604, + "learning_rate": 4.7955299888392924e-05, + "loss": 4.7782, + "step": 21798 + }, + { + "epoch": 0.12964482824245885, + "grad_norm": 1.4648151397705078, + "learning_rate": 4.795511487165518e-05, + "loss": 4.9949, + "step": 21799 + }, + { + "epoch": 0.12965077552573984, + "grad_norm": 1.2487531900405884, + "learning_rate": 4.795492984690404e-05, + "loss": 5.0329, + "step": 21800 + }, + { + "epoch": 0.12965672280902085, + "grad_norm": 1.503164529800415, + "learning_rate": 4.795474481413957e-05, + "loss": 4.7723, + "step": 21801 + }, + { + "epoch": 0.12966267009230184, + "grad_norm": 1.3406294584274292, + "learning_rate": 4.795455977336184e-05, + "loss": 4.9541, + "step": 21802 + }, + { + "epoch": 0.12966861737558283, + "grad_norm": 1.4314171075820923, + "learning_rate": 4.795437472457091e-05, + "loss": 5.018, + "step": 21803 + }, + { + "epoch": 0.12967456465886384, + "grad_norm": 1.3255850076675415, + "learning_rate": 4.795418966776683e-05, + "loss": 4.7675, + "step": 21804 + }, + { + "epoch": 0.12968051194214483, + "grad_norm": 1.6132442951202393, + "learning_rate": 4.7954004602949697e-05, + "loss": 4.8068, + "step": 21805 + }, + { + "epoch": 0.12968645922542582, + "grad_norm": 1.25650954246521, + "learning_rate": 4.7953819530119555e-05, + "loss": 4.8709, + "step": 21806 + }, + { + "epoch": 0.1296924065087068, + "grad_norm": 1.3686168193817139, + "learning_rate": 4.795363444927646e-05, + "loss": 4.8815, + "step": 21807 + }, + { + "epoch": 0.12969835379198782, + "grad_norm": 1.250143051147461, + "learning_rate": 4.79534493604205e-05, + "loss": 4.9077, + "step": 21808 + }, + { + "epoch": 0.1297043010752688, + "grad_norm": 1.421834111213684, + "learning_rate": 4.795326426355173e-05, + "loss": 4.806, + "step": 21809 + }, + { + "epoch": 0.1297102483585498, + "grad_norm": 1.3038170337677002, + "learning_rate": 4.795307915867021e-05, + "loss": 5.0142, + "step": 21810 + }, + { + "epoch": 0.12971619564183082, + "grad_norm": 1.390637993812561, + "learning_rate": 4.7952894045776e-05, + "loss": 4.8802, + "step": 21811 + }, + { + "epoch": 0.1297221429251118, + "grad_norm": 1.3310891389846802, + "learning_rate": 4.7952708924869184e-05, + "loss": 4.7995, + "step": 21812 + }, + { + "epoch": 0.1297280902083928, + "grad_norm": 1.243156909942627, + "learning_rate": 4.79525237959498e-05, + "loss": 4.6147, + "step": 21813 + }, + { + "epoch": 0.1297340374916738, + "grad_norm": 1.522707223892212, + "learning_rate": 4.7952338659017934e-05, + "loss": 4.6666, + "step": 21814 + }, + { + "epoch": 0.1297399847749548, + "grad_norm": 1.3331211805343628, + "learning_rate": 4.795215351407365e-05, + "loss": 4.7236, + "step": 21815 + }, + { + "epoch": 0.12974593205823579, + "grad_norm": 1.3704382181167603, + "learning_rate": 4.7951968361116996e-05, + "loss": 5.299, + "step": 21816 + }, + { + "epoch": 0.1297518793415168, + "grad_norm": 1.4870846271514893, + "learning_rate": 4.7951783200148055e-05, + "loss": 5.2623, + "step": 21817 + }, + { + "epoch": 0.1297578266247978, + "grad_norm": 1.4282408952713013, + "learning_rate": 4.795159803116688e-05, + "loss": 5.075, + "step": 21818 + }, + { + "epoch": 0.12976377390807878, + "grad_norm": 1.408409595489502, + "learning_rate": 4.795141285417354e-05, + "loss": 4.7274, + "step": 21819 + }, + { + "epoch": 0.1297697211913598, + "grad_norm": 1.4432475566864014, + "learning_rate": 4.79512276691681e-05, + "loss": 4.8196, + "step": 21820 + }, + { + "epoch": 0.12977566847464078, + "grad_norm": 1.6136623620986938, + "learning_rate": 4.7951042476150624e-05, + "loss": 4.7634, + "step": 21821 + }, + { + "epoch": 0.12978161575792177, + "grad_norm": 1.13461434841156, + "learning_rate": 4.795085727512117e-05, + "loss": 4.9421, + "step": 21822 + }, + { + "epoch": 0.12978756304120279, + "grad_norm": 1.2107611894607544, + "learning_rate": 4.795067206607981e-05, + "loss": 5.1572, + "step": 21823 + }, + { + "epoch": 0.12979351032448377, + "grad_norm": 1.8843787908554077, + "learning_rate": 4.795048684902661e-05, + "loss": 5.4081, + "step": 21824 + }, + { + "epoch": 0.12979945760776476, + "grad_norm": 1.192597508430481, + "learning_rate": 4.7950301623961633e-05, + "loss": 4.9609, + "step": 21825 + }, + { + "epoch": 0.12980540489104578, + "grad_norm": 1.4349040985107422, + "learning_rate": 4.795011639088495e-05, + "loss": 4.72, + "step": 21826 + }, + { + "epoch": 0.12981135217432677, + "grad_norm": 1.8054217100143433, + "learning_rate": 4.79499311497966e-05, + "loss": 5.5003, + "step": 21827 + }, + { + "epoch": 0.12981729945760775, + "grad_norm": 1.521070122718811, + "learning_rate": 4.794974590069669e-05, + "loss": 5.5325, + "step": 21828 + }, + { + "epoch": 0.12982324674088877, + "grad_norm": 1.936892032623291, + "learning_rate": 4.794956064358524e-05, + "loss": 4.6644, + "step": 21829 + }, + { + "epoch": 0.12982919402416976, + "grad_norm": 1.9401378631591797, + "learning_rate": 4.794937537846234e-05, + "loss": 4.7442, + "step": 21830 + }, + { + "epoch": 0.12983514130745075, + "grad_norm": 1.3924851417541504, + "learning_rate": 4.794919010532806e-05, + "loss": 4.9434, + "step": 21831 + }, + { + "epoch": 0.12984108859073176, + "grad_norm": 1.3180463314056396, + "learning_rate": 4.794900482418244e-05, + "loss": 4.9098, + "step": 21832 + }, + { + "epoch": 0.12984703587401275, + "grad_norm": 1.3872355222702026, + "learning_rate": 4.7948819535025565e-05, + "loss": 4.8212, + "step": 21833 + }, + { + "epoch": 0.12985298315729374, + "grad_norm": 1.2868075370788574, + "learning_rate": 4.79486342378575e-05, + "loss": 4.7609, + "step": 21834 + }, + { + "epoch": 0.12985893044057475, + "grad_norm": 1.4286006689071655, + "learning_rate": 4.79484489326783e-05, + "loss": 4.828, + "step": 21835 + }, + { + "epoch": 0.12986487772385574, + "grad_norm": 1.3485580682754517, + "learning_rate": 4.794826361948804e-05, + "loss": 4.7596, + "step": 21836 + }, + { + "epoch": 0.12987082500713673, + "grad_norm": 1.469319224357605, + "learning_rate": 4.794807829828677e-05, + "loss": 4.8431, + "step": 21837 + }, + { + "epoch": 0.12987677229041775, + "grad_norm": 1.4626957178115845, + "learning_rate": 4.794789296907457e-05, + "loss": 4.7884, + "step": 21838 + }, + { + "epoch": 0.12988271957369873, + "grad_norm": 1.2266536951065063, + "learning_rate": 4.794770763185149e-05, + "loss": 4.8359, + "step": 21839 + }, + { + "epoch": 0.12988866685697972, + "grad_norm": 1.2295827865600586, + "learning_rate": 4.794752228661761e-05, + "loss": 4.6327, + "step": 21840 + }, + { + "epoch": 0.12989461414026074, + "grad_norm": 1.4784702062606812, + "learning_rate": 4.794733693337298e-05, + "loss": 4.8363, + "step": 21841 + }, + { + "epoch": 0.12990056142354173, + "grad_norm": 1.6527009010314941, + "learning_rate": 4.794715157211767e-05, + "loss": 5.0696, + "step": 21842 + }, + { + "epoch": 0.12990650870682272, + "grad_norm": 1.7082421779632568, + "learning_rate": 4.7946966202851754e-05, + "loss": 4.8249, + "step": 21843 + }, + { + "epoch": 0.12991245599010373, + "grad_norm": 1.5493143796920776, + "learning_rate": 4.794678082557529e-05, + "loss": 4.9604, + "step": 21844 + }, + { + "epoch": 0.12991840327338472, + "grad_norm": 1.631940245628357, + "learning_rate": 4.7946595440288335e-05, + "loss": 4.6672, + "step": 21845 + }, + { + "epoch": 0.1299243505566657, + "grad_norm": 1.3021342754364014, + "learning_rate": 4.794641004699096e-05, + "loss": 4.821, + "step": 21846 + }, + { + "epoch": 0.12993029783994672, + "grad_norm": 1.331272006034851, + "learning_rate": 4.794622464568324e-05, + "loss": 5.1398, + "step": 21847 + }, + { + "epoch": 0.1299362451232277, + "grad_norm": 1.5635039806365967, + "learning_rate": 4.794603923636522e-05, + "loss": 5.0405, + "step": 21848 + }, + { + "epoch": 0.1299421924065087, + "grad_norm": 1.412961721420288, + "learning_rate": 4.794585381903698e-05, + "loss": 5.1334, + "step": 21849 + }, + { + "epoch": 0.12994813968978972, + "grad_norm": 1.0943198204040527, + "learning_rate": 4.794566839369857e-05, + "loss": 5.1978, + "step": 21850 + }, + { + "epoch": 0.1299540869730707, + "grad_norm": 1.6458427906036377, + "learning_rate": 4.794548296035007e-05, + "loss": 4.6475, + "step": 21851 + }, + { + "epoch": 0.1299600342563517, + "grad_norm": 1.37641179561615, + "learning_rate": 4.794529751899155e-05, + "loss": 5.0094, + "step": 21852 + }, + { + "epoch": 0.1299659815396327, + "grad_norm": 1.6493875980377197, + "learning_rate": 4.7945112069623054e-05, + "loss": 4.9748, + "step": 21853 + }, + { + "epoch": 0.1299719288229137, + "grad_norm": 1.4612071514129639, + "learning_rate": 4.794492661224466e-05, + "loss": 5.1217, + "step": 21854 + }, + { + "epoch": 0.12997787610619468, + "grad_norm": 1.4929149150848389, + "learning_rate": 4.7944741146856425e-05, + "loss": 4.916, + "step": 21855 + }, + { + "epoch": 0.1299838233894757, + "grad_norm": 1.5030015707015991, + "learning_rate": 4.794455567345842e-05, + "loss": 5.1206, + "step": 21856 + }, + { + "epoch": 0.1299897706727567, + "grad_norm": 1.3132811784744263, + "learning_rate": 4.79443701920507e-05, + "loss": 5.1996, + "step": 21857 + }, + { + "epoch": 0.12999571795603768, + "grad_norm": 1.3515914678573608, + "learning_rate": 4.794418470263335e-05, + "loss": 4.8565, + "step": 21858 + }, + { + "epoch": 0.1300016652393187, + "grad_norm": 1.3780977725982666, + "learning_rate": 4.7943999205206414e-05, + "loss": 4.9207, + "step": 21859 + }, + { + "epoch": 0.13000761252259968, + "grad_norm": 1.3044095039367676, + "learning_rate": 4.794381369976997e-05, + "loss": 5.0898, + "step": 21860 + }, + { + "epoch": 0.13001355980588067, + "grad_norm": 1.3406704664230347, + "learning_rate": 4.7943628186324076e-05, + "loss": 4.942, + "step": 21861 + }, + { + "epoch": 0.13001950708916168, + "grad_norm": 1.2654430866241455, + "learning_rate": 4.7943442664868795e-05, + "loss": 5.2096, + "step": 21862 + }, + { + "epoch": 0.13002545437244267, + "grad_norm": 1.313717007637024, + "learning_rate": 4.79432571354042e-05, + "loss": 4.9946, + "step": 21863 + }, + { + "epoch": 0.13003140165572366, + "grad_norm": 1.0787066221237183, + "learning_rate": 4.794307159793035e-05, + "loss": 4.9556, + "step": 21864 + }, + { + "epoch": 0.13003734893900465, + "grad_norm": 1.3731575012207031, + "learning_rate": 4.794288605244731e-05, + "loss": 4.904, + "step": 21865 + }, + { + "epoch": 0.13004329622228566, + "grad_norm": 1.4843237400054932, + "learning_rate": 4.794270049895514e-05, + "loss": 5.1451, + "step": 21866 + }, + { + "epoch": 0.13004924350556665, + "grad_norm": 1.3293545246124268, + "learning_rate": 4.794251493745392e-05, + "loss": 5.1794, + "step": 21867 + }, + { + "epoch": 0.13005519078884764, + "grad_norm": 1.6757280826568604, + "learning_rate": 4.79423293679437e-05, + "loss": 4.9797, + "step": 21868 + }, + { + "epoch": 0.13006113807212866, + "grad_norm": 1.7158734798431396, + "learning_rate": 4.794214379042456e-05, + "loss": 4.7833, + "step": 21869 + }, + { + "epoch": 0.13006708535540965, + "grad_norm": 2.164602756500244, + "learning_rate": 4.794195820489654e-05, + "loss": 4.4662, + "step": 21870 + }, + { + "epoch": 0.13007303263869063, + "grad_norm": 1.5726985931396484, + "learning_rate": 4.794177261135972e-05, + "loss": 5.3064, + "step": 21871 + }, + { + "epoch": 0.13007897992197165, + "grad_norm": 1.3667716979980469, + "learning_rate": 4.794158700981417e-05, + "loss": 5.0881, + "step": 21872 + }, + { + "epoch": 0.13008492720525264, + "grad_norm": 1.5155465602874756, + "learning_rate": 4.794140140025994e-05, + "loss": 4.95, + "step": 21873 + }, + { + "epoch": 0.13009087448853363, + "grad_norm": 1.4024773836135864, + "learning_rate": 4.794121578269712e-05, + "loss": 5.1932, + "step": 21874 + }, + { + "epoch": 0.13009682177181464, + "grad_norm": 1.3104946613311768, + "learning_rate": 4.7941030157125746e-05, + "loss": 5.1143, + "step": 21875 + }, + { + "epoch": 0.13010276905509563, + "grad_norm": 1.3269513845443726, + "learning_rate": 4.79408445235459e-05, + "loss": 5.1411, + "step": 21876 + }, + { + "epoch": 0.13010871633837662, + "grad_norm": 1.3147937059402466, + "learning_rate": 4.7940658881957645e-05, + "loss": 5.0444, + "step": 21877 + }, + { + "epoch": 0.13011466362165763, + "grad_norm": 1.125897765159607, + "learning_rate": 4.794047323236104e-05, + "loss": 5.0522, + "step": 21878 + }, + { + "epoch": 0.13012061090493862, + "grad_norm": 1.331945776939392, + "learning_rate": 4.794028757475615e-05, + "loss": 5.1433, + "step": 21879 + }, + { + "epoch": 0.1301265581882196, + "grad_norm": 1.206411361694336, + "learning_rate": 4.794010190914304e-05, + "loss": 4.7293, + "step": 21880 + }, + { + "epoch": 0.13013250547150063, + "grad_norm": 1.6212915182113647, + "learning_rate": 4.793991623552179e-05, + "loss": 4.5976, + "step": 21881 + }, + { + "epoch": 0.13013845275478161, + "grad_norm": 1.4009672403335571, + "learning_rate": 4.793973055389244e-05, + "loss": 4.8846, + "step": 21882 + }, + { + "epoch": 0.1301444000380626, + "grad_norm": 1.5049399137496948, + "learning_rate": 4.793954486425507e-05, + "loss": 4.7785, + "step": 21883 + }, + { + "epoch": 0.13015034732134362, + "grad_norm": 1.496751070022583, + "learning_rate": 4.7939359166609746e-05, + "loss": 4.5957, + "step": 21884 + }, + { + "epoch": 0.1301562946046246, + "grad_norm": 1.7572035789489746, + "learning_rate": 4.7939173460956525e-05, + "loss": 4.8929, + "step": 21885 + }, + { + "epoch": 0.1301622418879056, + "grad_norm": 1.593353271484375, + "learning_rate": 4.793898774729548e-05, + "loss": 5.6704, + "step": 21886 + }, + { + "epoch": 0.1301681891711866, + "grad_norm": 1.4550076723098755, + "learning_rate": 4.7938802025626665e-05, + "loss": 5.6588, + "step": 21887 + }, + { + "epoch": 0.1301741364544676, + "grad_norm": 1.6618671417236328, + "learning_rate": 4.793861629595015e-05, + "loss": 5.6571, + "step": 21888 + }, + { + "epoch": 0.1301800837377486, + "grad_norm": 1.4493645429611206, + "learning_rate": 4.793843055826601e-05, + "loss": 5.4406, + "step": 21889 + }, + { + "epoch": 0.1301860310210296, + "grad_norm": 1.5164732933044434, + "learning_rate": 4.793824481257429e-05, + "loss": 5.4872, + "step": 21890 + }, + { + "epoch": 0.1301919783043106, + "grad_norm": 1.5956424474716187, + "learning_rate": 4.793805905887508e-05, + "loss": 4.7702, + "step": 21891 + }, + { + "epoch": 0.13019792558759158, + "grad_norm": 1.850864291191101, + "learning_rate": 4.7937873297168425e-05, + "loss": 4.6842, + "step": 21892 + }, + { + "epoch": 0.1302038728708726, + "grad_norm": 1.637451171875, + "learning_rate": 4.793768752745439e-05, + "loss": 5.2488, + "step": 21893 + }, + { + "epoch": 0.13020982015415358, + "grad_norm": 1.5980913639068604, + "learning_rate": 4.793750174973305e-05, + "loss": 5.4026, + "step": 21894 + }, + { + "epoch": 0.13021576743743457, + "grad_norm": 1.7420471906661987, + "learning_rate": 4.793731596400446e-05, + "loss": 5.2409, + "step": 21895 + }, + { + "epoch": 0.1302217147207156, + "grad_norm": 2.749483346939087, + "learning_rate": 4.7937130170268694e-05, + "loss": 5.3401, + "step": 21896 + }, + { + "epoch": 0.13022766200399657, + "grad_norm": 2.610828399658203, + "learning_rate": 4.793694436852581e-05, + "loss": 5.0967, + "step": 21897 + }, + { + "epoch": 0.13023360928727756, + "grad_norm": 2.5725367069244385, + "learning_rate": 4.793675855877588e-05, + "loss": 5.1184, + "step": 21898 + }, + { + "epoch": 0.13023955657055858, + "grad_norm": 2.438526153564453, + "learning_rate": 4.793657274101896e-05, + "loss": 5.1315, + "step": 21899 + }, + { + "epoch": 0.13024550385383957, + "grad_norm": 2.2574191093444824, + "learning_rate": 4.793638691525513e-05, + "loss": 4.9999, + "step": 21900 + }, + { + "epoch": 0.13025145113712056, + "grad_norm": 1.9024723768234253, + "learning_rate": 4.7936201081484434e-05, + "loss": 5.1766, + "step": 21901 + }, + { + "epoch": 0.13025739842040157, + "grad_norm": 2.2040951251983643, + "learning_rate": 4.793601523970695e-05, + "loss": 4.9261, + "step": 21902 + }, + { + "epoch": 0.13026334570368256, + "grad_norm": 2.333158016204834, + "learning_rate": 4.7935829389922736e-05, + "loss": 4.9423, + "step": 21903 + }, + { + "epoch": 0.13026929298696355, + "grad_norm": 2.2712838649749756, + "learning_rate": 4.793564353213187e-05, + "loss": 4.7511, + "step": 21904 + }, + { + "epoch": 0.13027524027024456, + "grad_norm": 2.119046211242676, + "learning_rate": 4.79354576663344e-05, + "loss": 4.7284, + "step": 21905 + }, + { + "epoch": 0.13028118755352555, + "grad_norm": 2.3056483268737793, + "learning_rate": 4.79352717925304e-05, + "loss": 4.8627, + "step": 21906 + }, + { + "epoch": 0.13028713483680654, + "grad_norm": 2.2767837047576904, + "learning_rate": 4.793508591071993e-05, + "loss": 4.7924, + "step": 21907 + }, + { + "epoch": 0.13029308212008756, + "grad_norm": 2.138441324234009, + "learning_rate": 4.793490002090306e-05, + "loss": 4.747, + "step": 21908 + }, + { + "epoch": 0.13029902940336854, + "grad_norm": 1.9595372676849365, + "learning_rate": 4.793471412307986e-05, + "loss": 4.6861, + "step": 21909 + }, + { + "epoch": 0.13030497668664953, + "grad_norm": 2.207357883453369, + "learning_rate": 4.793452821725039e-05, + "loss": 4.4727, + "step": 21910 + }, + { + "epoch": 0.13031092396993055, + "grad_norm": 1.9506596326828003, + "learning_rate": 4.7934342303414704e-05, + "loss": 4.4445, + "step": 21911 + }, + { + "epoch": 0.13031687125321154, + "grad_norm": 2.0946574211120605, + "learning_rate": 4.793415638157288e-05, + "loss": 4.4556, + "step": 21912 + }, + { + "epoch": 0.13032281853649252, + "grad_norm": 2.7089650630950928, + "learning_rate": 4.793397045172497e-05, + "loss": 4.3106, + "step": 21913 + }, + { + "epoch": 0.13032876581977354, + "grad_norm": 2.6837174892425537, + "learning_rate": 4.793378451387106e-05, + "loss": 4.4133, + "step": 21914 + }, + { + "epoch": 0.13033471310305453, + "grad_norm": 2.28702712059021, + "learning_rate": 4.7933598568011207e-05, + "loss": 4.4326, + "step": 21915 + }, + { + "epoch": 0.13034066038633552, + "grad_norm": 2.172691583633423, + "learning_rate": 4.793341261414546e-05, + "loss": 4.6047, + "step": 21916 + }, + { + "epoch": 0.13034660766961653, + "grad_norm": 2.202906608581543, + "learning_rate": 4.79332266522739e-05, + "loss": 4.6857, + "step": 21917 + }, + { + "epoch": 0.13035255495289752, + "grad_norm": 1.7617685794830322, + "learning_rate": 4.793304068239658e-05, + "loss": 4.4888, + "step": 21918 + }, + { + "epoch": 0.1303585022361785, + "grad_norm": 2.2866454124450684, + "learning_rate": 4.7932854704513586e-05, + "loss": 4.5558, + "step": 21919 + }, + { + "epoch": 0.13036444951945952, + "grad_norm": 2.0338642597198486, + "learning_rate": 4.793266871862496e-05, + "loss": 5.2769, + "step": 21920 + }, + { + "epoch": 0.1303703968027405, + "grad_norm": 2.0302703380584717, + "learning_rate": 4.793248272473078e-05, + "loss": 4.5903, + "step": 21921 + }, + { + "epoch": 0.1303763440860215, + "grad_norm": 2.1618101596832275, + "learning_rate": 4.793229672283111e-05, + "loss": 4.9971, + "step": 21922 + }, + { + "epoch": 0.1303822913693025, + "grad_norm": 2.0446085929870605, + "learning_rate": 4.7932110712926004e-05, + "loss": 5.286, + "step": 21923 + }, + { + "epoch": 0.1303882386525835, + "grad_norm": 1.544705867767334, + "learning_rate": 4.793192469501554e-05, + "loss": 5.5509, + "step": 21924 + }, + { + "epoch": 0.1303941859358645, + "grad_norm": 1.5994058847427368, + "learning_rate": 4.7931738669099776e-05, + "loss": 5.5891, + "step": 21925 + }, + { + "epoch": 0.13040013321914548, + "grad_norm": 1.5866730213165283, + "learning_rate": 4.793155263517878e-05, + "loss": 5.3539, + "step": 21926 + }, + { + "epoch": 0.1304060805024265, + "grad_norm": 1.5843631029129028, + "learning_rate": 4.793136659325262e-05, + "loss": 5.5528, + "step": 21927 + }, + { + "epoch": 0.13041202778570748, + "grad_norm": 1.8037461042404175, + "learning_rate": 4.7931180543321354e-05, + "loss": 4.9484, + "step": 21928 + }, + { + "epoch": 0.13041797506898847, + "grad_norm": 1.8021430969238281, + "learning_rate": 4.793099448538505e-05, + "loss": 5.2239, + "step": 21929 + }, + { + "epoch": 0.1304239223522695, + "grad_norm": 1.9063239097595215, + "learning_rate": 4.793080841944377e-05, + "loss": 5.0627, + "step": 21930 + }, + { + "epoch": 0.13042986963555048, + "grad_norm": 1.8546555042266846, + "learning_rate": 4.7930622345497575e-05, + "loss": 4.8691, + "step": 21931 + }, + { + "epoch": 0.13043581691883147, + "grad_norm": 1.7901126146316528, + "learning_rate": 4.793043626354655e-05, + "loss": 4.8975, + "step": 21932 + }, + { + "epoch": 0.13044176420211248, + "grad_norm": 1.7083008289337158, + "learning_rate": 4.793025017359074e-05, + "loss": 4.8176, + "step": 21933 + }, + { + "epoch": 0.13044771148539347, + "grad_norm": 1.7584604024887085, + "learning_rate": 4.793006407563022e-05, + "loss": 5.2551, + "step": 21934 + }, + { + "epoch": 0.13045365876867446, + "grad_norm": 1.6731703281402588, + "learning_rate": 4.792987796966505e-05, + "loss": 5.0456, + "step": 21935 + }, + { + "epoch": 0.13045960605195547, + "grad_norm": 1.6340082883834839, + "learning_rate": 4.7929691855695294e-05, + "loss": 5.5061, + "step": 21936 + }, + { + "epoch": 0.13046555333523646, + "grad_norm": 1.7354822158813477, + "learning_rate": 4.792950573372102e-05, + "loss": 5.7164, + "step": 21937 + }, + { + "epoch": 0.13047150061851745, + "grad_norm": 1.6100409030914307, + "learning_rate": 4.79293196037423e-05, + "loss": 5.2427, + "step": 21938 + }, + { + "epoch": 0.13047744790179847, + "grad_norm": 2.603156328201294, + "learning_rate": 4.7929133465759184e-05, + "loss": 4.1146, + "step": 21939 + }, + { + "epoch": 0.13048339518507945, + "grad_norm": 2.518183946609497, + "learning_rate": 4.7928947319771746e-05, + "loss": 4.2918, + "step": 21940 + }, + { + "epoch": 0.13048934246836044, + "grad_norm": 1.7518165111541748, + "learning_rate": 4.792876116578004e-05, + "loss": 5.9257, + "step": 21941 + }, + { + "epoch": 0.13049528975164146, + "grad_norm": 1.8118661642074585, + "learning_rate": 4.792857500378416e-05, + "loss": 5.8985, + "step": 21942 + }, + { + "epoch": 0.13050123703492245, + "grad_norm": 1.5877163410186768, + "learning_rate": 4.792838883378414e-05, + "loss": 6.0572, + "step": 21943 + }, + { + "epoch": 0.13050718431820343, + "grad_norm": 1.313362956047058, + "learning_rate": 4.7928202655780055e-05, + "loss": 5.7739, + "step": 21944 + }, + { + "epoch": 0.13051313160148445, + "grad_norm": 1.5902273654937744, + "learning_rate": 4.792801646977198e-05, + "loss": 6.021, + "step": 21945 + }, + { + "epoch": 0.13051907888476544, + "grad_norm": 1.8784877061843872, + "learning_rate": 4.792783027575996e-05, + "loss": 5.0933, + "step": 21946 + }, + { + "epoch": 0.13052502616804643, + "grad_norm": 1.7743972539901733, + "learning_rate": 4.7927644073744076e-05, + "loss": 5.1168, + "step": 21947 + }, + { + "epoch": 0.13053097345132744, + "grad_norm": 2.0093095302581787, + "learning_rate": 4.792745786372439e-05, + "loss": 5.7441, + "step": 21948 + }, + { + "epoch": 0.13053692073460843, + "grad_norm": 2.0483853816986084, + "learning_rate": 4.7927271645700966e-05, + "loss": 5.4851, + "step": 21949 + }, + { + "epoch": 0.13054286801788942, + "grad_norm": 1.7858600616455078, + "learning_rate": 4.792708541967386e-05, + "loss": 5.4308, + "step": 21950 + }, + { + "epoch": 0.13054881530117043, + "grad_norm": 1.578202247619629, + "learning_rate": 4.7926899185643155e-05, + "loss": 5.4409, + "step": 21951 + }, + { + "epoch": 0.13055476258445142, + "grad_norm": 1.5763752460479736, + "learning_rate": 4.7926712943608895e-05, + "loss": 5.438, + "step": 21952 + }, + { + "epoch": 0.1305607098677324, + "grad_norm": 1.4117366075515747, + "learning_rate": 4.792652669357117e-05, + "loss": 5.3256, + "step": 21953 + }, + { + "epoch": 0.13056665715101343, + "grad_norm": 1.8186451196670532, + "learning_rate": 4.792634043553003e-05, + "loss": 5.4336, + "step": 21954 + }, + { + "epoch": 0.13057260443429441, + "grad_norm": 1.8576366901397705, + "learning_rate": 4.7926154169485536e-05, + "loss": 5.5133, + "step": 21955 + }, + { + "epoch": 0.1305785517175754, + "grad_norm": 1.81550931930542, + "learning_rate": 4.7925967895437754e-05, + "loss": 5.3673, + "step": 21956 + }, + { + "epoch": 0.13058449900085642, + "grad_norm": 1.5518393516540527, + "learning_rate": 4.7925781613386765e-05, + "loss": 5.3788, + "step": 21957 + }, + { + "epoch": 0.1305904462841374, + "grad_norm": 1.726492166519165, + "learning_rate": 4.7925595323332615e-05, + "loss": 5.4759, + "step": 21958 + }, + { + "epoch": 0.1305963935674184, + "grad_norm": 1.6105836629867554, + "learning_rate": 4.792540902527538e-05, + "loss": 5.3339, + "step": 21959 + }, + { + "epoch": 0.1306023408506994, + "grad_norm": 1.6900887489318848, + "learning_rate": 4.792522271921512e-05, + "loss": 5.457, + "step": 21960 + }, + { + "epoch": 0.1306082881339804, + "grad_norm": 1.6158493757247925, + "learning_rate": 4.79250364051519e-05, + "loss": 5.4049, + "step": 21961 + }, + { + "epoch": 0.1306142354172614, + "grad_norm": 1.5123624801635742, + "learning_rate": 4.792485008308579e-05, + "loss": 5.3611, + "step": 21962 + }, + { + "epoch": 0.1306201827005424, + "grad_norm": 1.4421589374542236, + "learning_rate": 4.792466375301685e-05, + "loss": 5.3816, + "step": 21963 + }, + { + "epoch": 0.1306261299838234, + "grad_norm": 1.6167370080947876, + "learning_rate": 4.792447741494514e-05, + "loss": 5.3484, + "step": 21964 + }, + { + "epoch": 0.13063207726710438, + "grad_norm": 1.5235882997512817, + "learning_rate": 4.7924291068870745e-05, + "loss": 5.4756, + "step": 21965 + }, + { + "epoch": 0.1306380245503854, + "grad_norm": 1.5585761070251465, + "learning_rate": 4.7924104714793705e-05, + "loss": 4.9743, + "step": 21966 + }, + { + "epoch": 0.13064397183366638, + "grad_norm": 1.6565943956375122, + "learning_rate": 4.79239183527141e-05, + "loss": 4.9801, + "step": 21967 + }, + { + "epoch": 0.13064991911694737, + "grad_norm": 1.449012041091919, + "learning_rate": 4.7923731982631993e-05, + "loss": 5.2166, + "step": 21968 + }, + { + "epoch": 0.1306558664002284, + "grad_norm": 1.7511426210403442, + "learning_rate": 4.792354560454745e-05, + "loss": 4.7892, + "step": 21969 + }, + { + "epoch": 0.13066181368350938, + "grad_norm": 1.8433175086975098, + "learning_rate": 4.7923359218460535e-05, + "loss": 5.1481, + "step": 21970 + }, + { + "epoch": 0.13066776096679036, + "grad_norm": 1.4407368898391724, + "learning_rate": 4.792317282437131e-05, + "loss": 5.3282, + "step": 21971 + }, + { + "epoch": 0.13067370825007138, + "grad_norm": 1.7756870985031128, + "learning_rate": 4.7922986422279836e-05, + "loss": 4.9934, + "step": 21972 + }, + { + "epoch": 0.13067965553335237, + "grad_norm": 1.6745517253875732, + "learning_rate": 4.7922800012186197e-05, + "loss": 4.9524, + "step": 21973 + }, + { + "epoch": 0.13068560281663336, + "grad_norm": 1.6869374513626099, + "learning_rate": 4.792261359409044e-05, + "loss": 5.0163, + "step": 21974 + }, + { + "epoch": 0.13069155009991437, + "grad_norm": 1.810007929801941, + "learning_rate": 4.7922427167992635e-05, + "loss": 5.7507, + "step": 21975 + }, + { + "epoch": 0.13069749738319536, + "grad_norm": 1.438236951828003, + "learning_rate": 4.792224073389284e-05, + "loss": 5.6271, + "step": 21976 + }, + { + "epoch": 0.13070344466647635, + "grad_norm": 1.7424002885818481, + "learning_rate": 4.7922054291791135e-05, + "loss": 5.4101, + "step": 21977 + }, + { + "epoch": 0.13070939194975736, + "grad_norm": 1.6832276582717896, + "learning_rate": 4.7921867841687576e-05, + "loss": 5.5323, + "step": 21978 + }, + { + "epoch": 0.13071533923303835, + "grad_norm": 1.4542639255523682, + "learning_rate": 4.792168138358223e-05, + "loss": 5.6003, + "step": 21979 + }, + { + "epoch": 0.13072128651631934, + "grad_norm": 1.5791352987289429, + "learning_rate": 4.7921494917475164e-05, + "loss": 4.448, + "step": 21980 + }, + { + "epoch": 0.13072723379960036, + "grad_norm": 1.7216298580169678, + "learning_rate": 4.792130844336644e-05, + "loss": 5.2205, + "step": 21981 + }, + { + "epoch": 0.13073318108288134, + "grad_norm": 1.7315418720245361, + "learning_rate": 4.792112196125612e-05, + "loss": 5.617, + "step": 21982 + }, + { + "epoch": 0.13073912836616233, + "grad_norm": 1.6149991750717163, + "learning_rate": 4.792093547114428e-05, + "loss": 5.1341, + "step": 21983 + }, + { + "epoch": 0.13074507564944332, + "grad_norm": 1.8531928062438965, + "learning_rate": 4.792074897303097e-05, + "loss": 5.384, + "step": 21984 + }, + { + "epoch": 0.13075102293272434, + "grad_norm": 1.869070053100586, + "learning_rate": 4.792056246691627e-05, + "loss": 5.428, + "step": 21985 + }, + { + "epoch": 0.13075697021600532, + "grad_norm": 1.715179204940796, + "learning_rate": 4.792037595280024e-05, + "loss": 5.5358, + "step": 21986 + }, + { + "epoch": 0.1307629174992863, + "grad_norm": 2.155991315841675, + "learning_rate": 4.792018943068294e-05, + "loss": 4.9676, + "step": 21987 + }, + { + "epoch": 0.13076886478256733, + "grad_norm": 1.9201817512512207, + "learning_rate": 4.7920002900564434e-05, + "loss": 5.1021, + "step": 21988 + }, + { + "epoch": 0.13077481206584832, + "grad_norm": 1.8021970987319946, + "learning_rate": 4.79198163624448e-05, + "loss": 5.233, + "step": 21989 + }, + { + "epoch": 0.1307807593491293, + "grad_norm": 2.034694194793701, + "learning_rate": 4.7919629816324093e-05, + "loss": 5.7133, + "step": 21990 + }, + { + "epoch": 0.13078670663241032, + "grad_norm": 1.7929306030273438, + "learning_rate": 4.791944326220238e-05, + "loss": 5.1922, + "step": 21991 + }, + { + "epoch": 0.1307926539156913, + "grad_norm": 1.6092936992645264, + "learning_rate": 4.791925670007972e-05, + "loss": 4.8169, + "step": 21992 + }, + { + "epoch": 0.1307986011989723, + "grad_norm": 1.6994092464447021, + "learning_rate": 4.791907012995619e-05, + "loss": 4.7869, + "step": 21993 + }, + { + "epoch": 0.1308045484822533, + "grad_norm": 1.7823549509048462, + "learning_rate": 4.791888355183185e-05, + "loss": 5.1608, + "step": 21994 + }, + { + "epoch": 0.1308104957655343, + "grad_norm": 1.9024605751037598, + "learning_rate": 4.7918696965706764e-05, + "loss": 4.016, + "step": 21995 + }, + { + "epoch": 0.1308164430488153, + "grad_norm": 1.8696129322052002, + "learning_rate": 4.7918510371580993e-05, + "loss": 4.3457, + "step": 21996 + }, + { + "epoch": 0.1308223903320963, + "grad_norm": 1.8359664678573608, + "learning_rate": 4.791832376945461e-05, + "loss": 4.1822, + "step": 21997 + }, + { + "epoch": 0.1308283376153773, + "grad_norm": 1.867409586906433, + "learning_rate": 4.791813715932768e-05, + "loss": 4.0156, + "step": 21998 + }, + { + "epoch": 0.13083428489865828, + "grad_norm": 1.729768991470337, + "learning_rate": 4.7917950541200264e-05, + "loss": 5.4221, + "step": 21999 + }, + { + "epoch": 0.1308402321819393, + "grad_norm": 1.8171114921569824, + "learning_rate": 4.791776391507242e-05, + "loss": 4.1685, + "step": 22000 + }, + { + "epoch": 0.13084617946522029, + "grad_norm": 1.8626638650894165, + "learning_rate": 4.7917577280944234e-05, + "loss": 4.1981, + "step": 22001 + }, + { + "epoch": 0.13085212674850127, + "grad_norm": 1.9804152250289917, + "learning_rate": 4.791739063881575e-05, + "loss": 4.1258, + "step": 22002 + }, + { + "epoch": 0.1308580740317823, + "grad_norm": 2.6114773750305176, + "learning_rate": 4.791720398868704e-05, + "loss": 4.0207, + "step": 22003 + }, + { + "epoch": 0.13086402131506328, + "grad_norm": 2.1169519424438477, + "learning_rate": 4.791701733055818e-05, + "loss": 4.0134, + "step": 22004 + }, + { + "epoch": 0.13086996859834427, + "grad_norm": 2.318971872329712, + "learning_rate": 4.791683066442922e-05, + "loss": 4.1341, + "step": 22005 + }, + { + "epoch": 0.13087591588162528, + "grad_norm": 2.1771652698516846, + "learning_rate": 4.7916643990300234e-05, + "loss": 4.5816, + "step": 22006 + }, + { + "epoch": 0.13088186316490627, + "grad_norm": 2.327596426010132, + "learning_rate": 4.791645730817128e-05, + "loss": 5.3562, + "step": 22007 + }, + { + "epoch": 0.13088781044818726, + "grad_norm": 2.3558785915374756, + "learning_rate": 4.7916270618042434e-05, + "loss": 4.055, + "step": 22008 + }, + { + "epoch": 0.13089375773146827, + "grad_norm": 2.07840633392334, + "learning_rate": 4.791608391991374e-05, + "loss": 4.4366, + "step": 22009 + }, + { + "epoch": 0.13089970501474926, + "grad_norm": 2.4755849838256836, + "learning_rate": 4.79158972137853e-05, + "loss": 5.5616, + "step": 22010 + }, + { + "epoch": 0.13090565229803025, + "grad_norm": 1.8745293617248535, + "learning_rate": 4.791571049965714e-05, + "loss": 5.1908, + "step": 22011 + }, + { + "epoch": 0.13091159958131127, + "grad_norm": 1.8463020324707031, + "learning_rate": 4.791552377752935e-05, + "loss": 5.64, + "step": 22012 + }, + { + "epoch": 0.13091754686459225, + "grad_norm": 1.7283350229263306, + "learning_rate": 4.791533704740199e-05, + "loss": 5.191, + "step": 22013 + }, + { + "epoch": 0.13092349414787324, + "grad_norm": 2.290731191635132, + "learning_rate": 4.7915150309275115e-05, + "loss": 4.7131, + "step": 22014 + }, + { + "epoch": 0.13092944143115426, + "grad_norm": 2.1718969345092773, + "learning_rate": 4.7914963563148794e-05, + "loss": 4.6983, + "step": 22015 + }, + { + "epoch": 0.13093538871443525, + "grad_norm": 2.179349184036255, + "learning_rate": 4.791477680902311e-05, + "loss": 4.7265, + "step": 22016 + }, + { + "epoch": 0.13094133599771623, + "grad_norm": 1.7619205713272095, + "learning_rate": 4.79145900468981e-05, + "loss": 5.3916, + "step": 22017 + }, + { + "epoch": 0.13094728328099725, + "grad_norm": 1.827709674835205, + "learning_rate": 4.7914403276773855e-05, + "loss": 5.4988, + "step": 22018 + }, + { + "epoch": 0.13095323056427824, + "grad_norm": 1.768192172050476, + "learning_rate": 4.7914216498650424e-05, + "loss": 5.3605, + "step": 22019 + }, + { + "epoch": 0.13095917784755923, + "grad_norm": 1.6903995275497437, + "learning_rate": 4.791402971252788e-05, + "loss": 5.3919, + "step": 22020 + }, + { + "epoch": 0.13096512513084024, + "grad_norm": 1.5048458576202393, + "learning_rate": 4.791384291840628e-05, + "loss": 5.43, + "step": 22021 + }, + { + "epoch": 0.13097107241412123, + "grad_norm": 1.6317448616027832, + "learning_rate": 4.7913656116285685e-05, + "loss": 5.4964, + "step": 22022 + }, + { + "epoch": 0.13097701969740222, + "grad_norm": 1.775623083114624, + "learning_rate": 4.791346930616619e-05, + "loss": 5.4068, + "step": 22023 + }, + { + "epoch": 0.13098296698068324, + "grad_norm": 1.7148652076721191, + "learning_rate": 4.7913282488047826e-05, + "loss": 5.4362, + "step": 22024 + }, + { + "epoch": 0.13098891426396422, + "grad_norm": 1.6784619092941284, + "learning_rate": 4.7913095661930675e-05, + "loss": 5.3668, + "step": 22025 + }, + { + "epoch": 0.1309948615472452, + "grad_norm": 1.671555757522583, + "learning_rate": 4.79129088278148e-05, + "loss": 5.264, + "step": 22026 + }, + { + "epoch": 0.13100080883052623, + "grad_norm": 1.5523961782455444, + "learning_rate": 4.791272198570027e-05, + "loss": 5.1395, + "step": 22027 + }, + { + "epoch": 0.13100675611380722, + "grad_norm": 1.8762462139129639, + "learning_rate": 4.7912535135587134e-05, + "loss": 5.1099, + "step": 22028 + }, + { + "epoch": 0.1310127033970882, + "grad_norm": 1.7621192932128906, + "learning_rate": 4.7912348277475474e-05, + "loss": 5.0033, + "step": 22029 + }, + { + "epoch": 0.13101865068036922, + "grad_norm": 1.6044316291809082, + "learning_rate": 4.791216141136535e-05, + "loss": 5.2646, + "step": 22030 + }, + { + "epoch": 0.1310245979636502, + "grad_norm": 2.3852479457855225, + "learning_rate": 4.791197453725683e-05, + "loss": 4.7932, + "step": 22031 + }, + { + "epoch": 0.1310305452469312, + "grad_norm": 2.259331703186035, + "learning_rate": 4.7911787655149975e-05, + "loss": 4.8083, + "step": 22032 + }, + { + "epoch": 0.1310364925302122, + "grad_norm": 2.167745351791382, + "learning_rate": 4.791160076504485e-05, + "loss": 4.852, + "step": 22033 + }, + { + "epoch": 0.1310424398134932, + "grad_norm": 1.8246276378631592, + "learning_rate": 4.791141386694152e-05, + "loss": 5.1364, + "step": 22034 + }, + { + "epoch": 0.1310483870967742, + "grad_norm": 1.820461630821228, + "learning_rate": 4.791122696084006e-05, + "loss": 4.9647, + "step": 22035 + }, + { + "epoch": 0.1310543343800552, + "grad_norm": 1.6964235305786133, + "learning_rate": 4.791104004674052e-05, + "loss": 5.4281, + "step": 22036 + }, + { + "epoch": 0.1310602816633362, + "grad_norm": 1.8432056903839111, + "learning_rate": 4.791085312464297e-05, + "loss": 5.1905, + "step": 22037 + }, + { + "epoch": 0.13106622894661718, + "grad_norm": 1.9929230213165283, + "learning_rate": 4.7910666194547485e-05, + "loss": 5.0115, + "step": 22038 + }, + { + "epoch": 0.1310721762298982, + "grad_norm": 1.70926034450531, + "learning_rate": 4.791047925645412e-05, + "loss": 5.299, + "step": 22039 + }, + { + "epoch": 0.13107812351317918, + "grad_norm": 1.5090575218200684, + "learning_rate": 4.791029231036295e-05, + "loss": 5.4832, + "step": 22040 + }, + { + "epoch": 0.13108407079646017, + "grad_norm": 1.9068914651870728, + "learning_rate": 4.7910105356274025e-05, + "loss": 4.6246, + "step": 22041 + }, + { + "epoch": 0.13109001807974116, + "grad_norm": 1.9232919216156006, + "learning_rate": 4.7909918394187425e-05, + "loss": 4.7151, + "step": 22042 + }, + { + "epoch": 0.13109596536302218, + "grad_norm": 1.973927617073059, + "learning_rate": 4.790973142410321e-05, + "loss": 4.4912, + "step": 22043 + }, + { + "epoch": 0.13110191264630316, + "grad_norm": 1.554721474647522, + "learning_rate": 4.7909544446021434e-05, + "loss": 5.211, + "step": 22044 + }, + { + "epoch": 0.13110785992958415, + "grad_norm": 1.8059271574020386, + "learning_rate": 4.7909357459942185e-05, + "loss": 5.2998, + "step": 22045 + }, + { + "epoch": 0.13111380721286517, + "grad_norm": 1.7360923290252686, + "learning_rate": 4.79091704658655e-05, + "loss": 5.58, + "step": 22046 + }, + { + "epoch": 0.13111975449614616, + "grad_norm": 1.627770185470581, + "learning_rate": 4.790898346379148e-05, + "loss": 5.7186, + "step": 22047 + }, + { + "epoch": 0.13112570177942715, + "grad_norm": 1.6354387998580933, + "learning_rate": 4.790879645372016e-05, + "loss": 5.5099, + "step": 22048 + }, + { + "epoch": 0.13113164906270816, + "grad_norm": 1.6667500734329224, + "learning_rate": 4.790860943565161e-05, + "loss": 5.4328, + "step": 22049 + }, + { + "epoch": 0.13113759634598915, + "grad_norm": 1.7549245357513428, + "learning_rate": 4.790842240958591e-05, + "loss": 5.4191, + "step": 22050 + }, + { + "epoch": 0.13114354362927014, + "grad_norm": 1.5705612897872925, + "learning_rate": 4.790823537552311e-05, + "loss": 5.254, + "step": 22051 + }, + { + "epoch": 0.13114949091255115, + "grad_norm": 1.438839316368103, + "learning_rate": 4.790804833346329e-05, + "loss": 5.4708, + "step": 22052 + }, + { + "epoch": 0.13115543819583214, + "grad_norm": 1.8666369915008545, + "learning_rate": 4.790786128340651e-05, + "loss": 5.8635, + "step": 22053 + }, + { + "epoch": 0.13116138547911313, + "grad_norm": 2.1541588306427, + "learning_rate": 4.7907674225352815e-05, + "loss": 5.4732, + "step": 22054 + }, + { + "epoch": 0.13116733276239415, + "grad_norm": 1.6082664728164673, + "learning_rate": 4.79074871593023e-05, + "loss": 5.3902, + "step": 22055 + }, + { + "epoch": 0.13117328004567513, + "grad_norm": 1.7293864488601685, + "learning_rate": 4.790730008525502e-05, + "loss": 5.3317, + "step": 22056 + }, + { + "epoch": 0.13117922732895612, + "grad_norm": 1.830518126487732, + "learning_rate": 4.790711300321104e-05, + "loss": 5.3786, + "step": 22057 + }, + { + "epoch": 0.13118517461223714, + "grad_norm": 2.368182897567749, + "learning_rate": 4.790692591317041e-05, + "loss": 5.8, + "step": 22058 + }, + { + "epoch": 0.13119112189551813, + "grad_norm": 2.27848482131958, + "learning_rate": 4.7906738815133216e-05, + "loss": 5.4954, + "step": 22059 + }, + { + "epoch": 0.13119706917879911, + "grad_norm": 1.6672909259796143, + "learning_rate": 4.790655170909952e-05, + "loss": 5.2937, + "step": 22060 + }, + { + "epoch": 0.13120301646208013, + "grad_norm": 1.9788751602172852, + "learning_rate": 4.790636459506938e-05, + "loss": 5.1761, + "step": 22061 + }, + { + "epoch": 0.13120896374536112, + "grad_norm": 2.8215107917785645, + "learning_rate": 4.7906177473042865e-05, + "loss": 4.9236, + "step": 22062 + }, + { + "epoch": 0.1312149110286421, + "grad_norm": 2.0486905574798584, + "learning_rate": 4.790599034302004e-05, + "loss": 5.2273, + "step": 22063 + }, + { + "epoch": 0.13122085831192312, + "grad_norm": 1.9029892683029175, + "learning_rate": 4.790580320500097e-05, + "loss": 4.7737, + "step": 22064 + }, + { + "epoch": 0.1312268055952041, + "grad_norm": 2.052060842514038, + "learning_rate": 4.790561605898572e-05, + "loss": 4.7055, + "step": 22065 + }, + { + "epoch": 0.1312327528784851, + "grad_norm": 2.3215537071228027, + "learning_rate": 4.790542890497436e-05, + "loss": 4.6687, + "step": 22066 + }, + { + "epoch": 0.13123870016176611, + "grad_norm": 1.9903185367584229, + "learning_rate": 4.790524174296694e-05, + "loss": 4.5768, + "step": 22067 + }, + { + "epoch": 0.1312446474450471, + "grad_norm": 1.9112823009490967, + "learning_rate": 4.790505457296355e-05, + "loss": 4.664, + "step": 22068 + }, + { + "epoch": 0.1312505947283281, + "grad_norm": 2.09714412689209, + "learning_rate": 4.790486739496424e-05, + "loss": 4.4941, + "step": 22069 + }, + { + "epoch": 0.1312565420116091, + "grad_norm": 1.986820936203003, + "learning_rate": 4.7904680208969073e-05, + "loss": 4.8173, + "step": 22070 + }, + { + "epoch": 0.1312624892948901, + "grad_norm": 1.8170347213745117, + "learning_rate": 4.790449301497812e-05, + "loss": 4.78, + "step": 22071 + }, + { + "epoch": 0.13126843657817108, + "grad_norm": 1.7738579511642456, + "learning_rate": 4.790430581299145e-05, + "loss": 5.3492, + "step": 22072 + }, + { + "epoch": 0.1312743838614521, + "grad_norm": 1.9075175523757935, + "learning_rate": 4.7904118603009115e-05, + "loss": 4.4672, + "step": 22073 + }, + { + "epoch": 0.1312803311447331, + "grad_norm": 1.9848250150680542, + "learning_rate": 4.790393138503119e-05, + "loss": 4.2157, + "step": 22074 + }, + { + "epoch": 0.13128627842801407, + "grad_norm": 1.7980430126190186, + "learning_rate": 4.7903744159057745e-05, + "loss": 4.2482, + "step": 22075 + }, + { + "epoch": 0.1312922257112951, + "grad_norm": 1.8066810369491577, + "learning_rate": 4.7903556925088835e-05, + "loss": 4.0731, + "step": 22076 + }, + { + "epoch": 0.13129817299457608, + "grad_norm": 1.901912808418274, + "learning_rate": 4.790336968312453e-05, + "loss": 4.0677, + "step": 22077 + }, + { + "epoch": 0.13130412027785707, + "grad_norm": 1.8650418519973755, + "learning_rate": 4.79031824331649e-05, + "loss": 4.0593, + "step": 22078 + }, + { + "epoch": 0.13131006756113808, + "grad_norm": 1.8098959922790527, + "learning_rate": 4.7902995175210003e-05, + "loss": 4.1248, + "step": 22079 + }, + { + "epoch": 0.13131601484441907, + "grad_norm": 1.7840689420700073, + "learning_rate": 4.790280790925991e-05, + "loss": 4.1299, + "step": 22080 + }, + { + "epoch": 0.13132196212770006, + "grad_norm": 1.847676157951355, + "learning_rate": 4.7902620635314676e-05, + "loss": 3.9775, + "step": 22081 + }, + { + "epoch": 0.13132790941098108, + "grad_norm": 1.970070719718933, + "learning_rate": 4.7902433353374374e-05, + "loss": 3.9744, + "step": 22082 + }, + { + "epoch": 0.13133385669426206, + "grad_norm": 1.7709019184112549, + "learning_rate": 4.790224606343908e-05, + "loss": 3.9691, + "step": 22083 + }, + { + "epoch": 0.13133980397754305, + "grad_norm": 2.0055277347564697, + "learning_rate": 4.790205876550884e-05, + "loss": 4.0181, + "step": 22084 + }, + { + "epoch": 0.13134575126082407, + "grad_norm": 1.8686769008636475, + "learning_rate": 4.790187145958372e-05, + "loss": 3.9445, + "step": 22085 + }, + { + "epoch": 0.13135169854410506, + "grad_norm": 1.8052544593811035, + "learning_rate": 4.790168414566381e-05, + "loss": 4.3716, + "step": 22086 + }, + { + "epoch": 0.13135764582738604, + "grad_norm": 1.730320692062378, + "learning_rate": 4.790149682374915e-05, + "loss": 5.8462, + "step": 22087 + }, + { + "epoch": 0.13136359311066706, + "grad_norm": 1.8372067213058472, + "learning_rate": 4.790130949383982e-05, + "loss": 6.0599, + "step": 22088 + }, + { + "epoch": 0.13136954039394805, + "grad_norm": 1.505204200744629, + "learning_rate": 4.7901122155935874e-05, + "loss": 5.9626, + "step": 22089 + }, + { + "epoch": 0.13137548767722904, + "grad_norm": 2.126800537109375, + "learning_rate": 4.790093481003738e-05, + "loss": 5.3673, + "step": 22090 + }, + { + "epoch": 0.13138143496051005, + "grad_norm": 1.5778108835220337, + "learning_rate": 4.7900747456144415e-05, + "loss": 5.4421, + "step": 22091 + }, + { + "epoch": 0.13138738224379104, + "grad_norm": 1.4741785526275635, + "learning_rate": 4.7900560094257024e-05, + "loss": 5.5546, + "step": 22092 + }, + { + "epoch": 0.13139332952707203, + "grad_norm": 1.3331834077835083, + "learning_rate": 4.7900372724375295e-05, + "loss": 5.592, + "step": 22093 + }, + { + "epoch": 0.13139927681035304, + "grad_norm": 2.421566963195801, + "learning_rate": 4.790018534649927e-05, + "loss": 5.1022, + "step": 22094 + }, + { + "epoch": 0.13140522409363403, + "grad_norm": 1.761720895767212, + "learning_rate": 4.789999796062904e-05, + "loss": 5.2071, + "step": 22095 + }, + { + "epoch": 0.13141117137691502, + "grad_norm": 1.5059387683868408, + "learning_rate": 4.789981056676465e-05, + "loss": 5.3767, + "step": 22096 + }, + { + "epoch": 0.13141711866019604, + "grad_norm": 1.5319740772247314, + "learning_rate": 4.7899623164906176e-05, + "loss": 5.6233, + "step": 22097 + }, + { + "epoch": 0.13142306594347702, + "grad_norm": 1.7106443643569946, + "learning_rate": 4.789943575505368e-05, + "loss": 5.5583, + "step": 22098 + }, + { + "epoch": 0.131429013226758, + "grad_norm": 1.4288161993026733, + "learning_rate": 4.7899248337207227e-05, + "loss": 5.4574, + "step": 22099 + }, + { + "epoch": 0.131434960510039, + "grad_norm": 1.7327675819396973, + "learning_rate": 4.789906091136688e-05, + "loss": 5.3935, + "step": 22100 + }, + { + "epoch": 0.13144090779332002, + "grad_norm": 1.7318532466888428, + "learning_rate": 4.7898873477532716e-05, + "loss": 5.0156, + "step": 22101 + }, + { + "epoch": 0.131446855076601, + "grad_norm": 1.4947113990783691, + "learning_rate": 4.789868603570478e-05, + "loss": 5.2255, + "step": 22102 + }, + { + "epoch": 0.131452802359882, + "grad_norm": 2.454650402069092, + "learning_rate": 4.789849858588316e-05, + "loss": 5.0697, + "step": 22103 + }, + { + "epoch": 0.131458749643163, + "grad_norm": 2.0269839763641357, + "learning_rate": 4.789831112806791e-05, + "loss": 5.3687, + "step": 22104 + }, + { + "epoch": 0.131464696926444, + "grad_norm": 1.89911687374115, + "learning_rate": 4.7898123662259084e-05, + "loss": 5.1816, + "step": 22105 + }, + { + "epoch": 0.13147064420972498, + "grad_norm": 1.7952163219451904, + "learning_rate": 4.789793618845677e-05, + "loss": 5.1441, + "step": 22106 + }, + { + "epoch": 0.131476591493006, + "grad_norm": 1.458935022354126, + "learning_rate": 4.789774870666102e-05, + "loss": 4.8489, + "step": 22107 + }, + { + "epoch": 0.131482538776287, + "grad_norm": 1.5516583919525146, + "learning_rate": 4.78975612168719e-05, + "loss": 4.9763, + "step": 22108 + }, + { + "epoch": 0.13148848605956798, + "grad_norm": 1.525307297706604, + "learning_rate": 4.789737371908948e-05, + "loss": 5.5826, + "step": 22109 + }, + { + "epoch": 0.131494433342849, + "grad_norm": 1.516675353050232, + "learning_rate": 4.7897186213313824e-05, + "loss": 5.7384, + "step": 22110 + }, + { + "epoch": 0.13150038062612998, + "grad_norm": 1.3918993473052979, + "learning_rate": 4.7896998699545e-05, + "loss": 5.9798, + "step": 22111 + }, + { + "epoch": 0.13150632790941097, + "grad_norm": 1.7346227169036865, + "learning_rate": 4.789681117778307e-05, + "loss": 5.4939, + "step": 22112 + }, + { + "epoch": 0.13151227519269199, + "grad_norm": 1.784882664680481, + "learning_rate": 4.7896623648028094e-05, + "loss": 5.5369, + "step": 22113 + }, + { + "epoch": 0.13151822247597297, + "grad_norm": 1.5360532999038696, + "learning_rate": 4.789643611028015e-05, + "loss": 5.5539, + "step": 22114 + }, + { + "epoch": 0.13152416975925396, + "grad_norm": 1.3865541219711304, + "learning_rate": 4.789624856453929e-05, + "loss": 5.6192, + "step": 22115 + }, + { + "epoch": 0.13153011704253498, + "grad_norm": 1.8362021446228027, + "learning_rate": 4.7896061010805596e-05, + "loss": 5.6915, + "step": 22116 + }, + { + "epoch": 0.13153606432581597, + "grad_norm": 1.607771635055542, + "learning_rate": 4.789587344907911e-05, + "loss": 5.4442, + "step": 22117 + }, + { + "epoch": 0.13154201160909695, + "grad_norm": 1.5097888708114624, + "learning_rate": 4.789568587935992e-05, + "loss": 5.84, + "step": 22118 + }, + { + "epoch": 0.13154795889237797, + "grad_norm": 1.4404877424240112, + "learning_rate": 4.789549830164809e-05, + "loss": 5.7407, + "step": 22119 + }, + { + "epoch": 0.13155390617565896, + "grad_norm": 1.5682063102722168, + "learning_rate": 4.7895310715943665e-05, + "loss": 5.3026, + "step": 22120 + }, + { + "epoch": 0.13155985345893995, + "grad_norm": 1.6435290575027466, + "learning_rate": 4.789512312224672e-05, + "loss": 5.7749, + "step": 22121 + }, + { + "epoch": 0.13156580074222096, + "grad_norm": 1.7454910278320312, + "learning_rate": 4.7894935520557335e-05, + "loss": 5.5817, + "step": 22122 + }, + { + "epoch": 0.13157174802550195, + "grad_norm": 1.9168800115585327, + "learning_rate": 4.789474791087556e-05, + "loss": 4.3752, + "step": 22123 + }, + { + "epoch": 0.13157769530878294, + "grad_norm": 2.1051509380340576, + "learning_rate": 4.789456029320147e-05, + "loss": 3.6253, + "step": 22124 + }, + { + "epoch": 0.13158364259206395, + "grad_norm": 2.0902812480926514, + "learning_rate": 4.789437266753512e-05, + "loss": 4.039, + "step": 22125 + }, + { + "epoch": 0.13158958987534494, + "grad_norm": 1.804121971130371, + "learning_rate": 4.789418503387658e-05, + "loss": 3.6551, + "step": 22126 + }, + { + "epoch": 0.13159553715862593, + "grad_norm": 1.992370367050171, + "learning_rate": 4.789399739222592e-05, + "loss": 3.6387, + "step": 22127 + }, + { + "epoch": 0.13160148444190695, + "grad_norm": 2.0625061988830566, + "learning_rate": 4.7893809742583204e-05, + "loss": 3.943, + "step": 22128 + }, + { + "epoch": 0.13160743172518793, + "grad_norm": 2.021989107131958, + "learning_rate": 4.789362208494849e-05, + "loss": 4.0269, + "step": 22129 + }, + { + "epoch": 0.13161337900846892, + "grad_norm": 2.037161350250244, + "learning_rate": 4.7893434419321856e-05, + "loss": 5.3085, + "step": 22130 + }, + { + "epoch": 0.13161932629174994, + "grad_norm": 1.8836485147476196, + "learning_rate": 4.7893246745703355e-05, + "loss": 4.7337, + "step": 22131 + }, + { + "epoch": 0.13162527357503093, + "grad_norm": 1.5900107622146606, + "learning_rate": 4.789305906409306e-05, + "loss": 5.0772, + "step": 22132 + }, + { + "epoch": 0.13163122085831191, + "grad_norm": 1.627558946609497, + "learning_rate": 4.789287137449103e-05, + "loss": 5.1703, + "step": 22133 + }, + { + "epoch": 0.13163716814159293, + "grad_norm": 1.8517992496490479, + "learning_rate": 4.7892683676897344e-05, + "loss": 5.173, + "step": 22134 + }, + { + "epoch": 0.13164311542487392, + "grad_norm": 1.2436500787734985, + "learning_rate": 4.789249597131205e-05, + "loss": 4.956, + "step": 22135 + }, + { + "epoch": 0.1316490627081549, + "grad_norm": 1.5156265497207642, + "learning_rate": 4.789230825773523e-05, + "loss": 5.6121, + "step": 22136 + }, + { + "epoch": 0.13165500999143592, + "grad_norm": 1.3742187023162842, + "learning_rate": 4.789212053616694e-05, + "loss": 5.2186, + "step": 22137 + }, + { + "epoch": 0.1316609572747169, + "grad_norm": 1.3079794645309448, + "learning_rate": 4.7891932806607245e-05, + "loss": 5.4108, + "step": 22138 + }, + { + "epoch": 0.1316669045579979, + "grad_norm": 1.5291730165481567, + "learning_rate": 4.789174506905621e-05, + "loss": 5.1516, + "step": 22139 + }, + { + "epoch": 0.13167285184127892, + "grad_norm": 1.3465576171875, + "learning_rate": 4.7891557323513904e-05, + "loss": 4.9797, + "step": 22140 + }, + { + "epoch": 0.1316787991245599, + "grad_norm": 1.228513479232788, + "learning_rate": 4.789136956998039e-05, + "loss": 5.0119, + "step": 22141 + }, + { + "epoch": 0.1316847464078409, + "grad_norm": 1.4027810096740723, + "learning_rate": 4.789118180845574e-05, + "loss": 5.2781, + "step": 22142 + }, + { + "epoch": 0.1316906936911219, + "grad_norm": 1.371072769165039, + "learning_rate": 4.789099403894002e-05, + "loss": 5.1414, + "step": 22143 + }, + { + "epoch": 0.1316966409744029, + "grad_norm": 1.264255404472351, + "learning_rate": 4.7890806261433286e-05, + "loss": 4.9926, + "step": 22144 + }, + { + "epoch": 0.13170258825768388, + "grad_norm": 1.351501226425171, + "learning_rate": 4.78906184759356e-05, + "loss": 5.1473, + "step": 22145 + }, + { + "epoch": 0.1317085355409649, + "grad_norm": 1.4877911806106567, + "learning_rate": 4.7890430682447046e-05, + "loss": 5.2634, + "step": 22146 + }, + { + "epoch": 0.1317144828242459, + "grad_norm": 1.3446416854858398, + "learning_rate": 4.7890242880967675e-05, + "loss": 5.197, + "step": 22147 + }, + { + "epoch": 0.13172043010752688, + "grad_norm": 1.2246133089065552, + "learning_rate": 4.789005507149756e-05, + "loss": 5.1262, + "step": 22148 + }, + { + "epoch": 0.1317263773908079, + "grad_norm": 1.3092166185379028, + "learning_rate": 4.7889867254036755e-05, + "loss": 5.0157, + "step": 22149 + }, + { + "epoch": 0.13173232467408888, + "grad_norm": 1.3076307773590088, + "learning_rate": 4.788967942858534e-05, + "loss": 5.159, + "step": 22150 + }, + { + "epoch": 0.13173827195736987, + "grad_norm": 1.3207625150680542, + "learning_rate": 4.788949159514338e-05, + "loss": 5.1559, + "step": 22151 + }, + { + "epoch": 0.13174421924065088, + "grad_norm": 1.4235469102859497, + "learning_rate": 4.788930375371092e-05, + "loss": 4.9426, + "step": 22152 + }, + { + "epoch": 0.13175016652393187, + "grad_norm": 1.4294525384902954, + "learning_rate": 4.7889115904288054e-05, + "loss": 5.0116, + "step": 22153 + }, + { + "epoch": 0.13175611380721286, + "grad_norm": 1.3456943035125732, + "learning_rate": 4.788892804687483e-05, + "loss": 4.9962, + "step": 22154 + }, + { + "epoch": 0.13176206109049388, + "grad_norm": 1.368545651435852, + "learning_rate": 4.788874018147132e-05, + "loss": 5.1523, + "step": 22155 + }, + { + "epoch": 0.13176800837377486, + "grad_norm": 1.2844034433364868, + "learning_rate": 4.788855230807758e-05, + "loss": 4.879, + "step": 22156 + }, + { + "epoch": 0.13177395565705585, + "grad_norm": 1.3061450719833374, + "learning_rate": 4.788836442669369e-05, + "loss": 4.9011, + "step": 22157 + }, + { + "epoch": 0.13177990294033684, + "grad_norm": 1.4233042001724243, + "learning_rate": 4.788817653731971e-05, + "loss": 4.8821, + "step": 22158 + }, + { + "epoch": 0.13178585022361786, + "grad_norm": 1.4013172388076782, + "learning_rate": 4.788798863995569e-05, + "loss": 4.8431, + "step": 22159 + }, + { + "epoch": 0.13179179750689884, + "grad_norm": 1.2786699533462524, + "learning_rate": 4.7887800734601716e-05, + "loss": 4.6884, + "step": 22160 + }, + { + "epoch": 0.13179774479017983, + "grad_norm": 1.408245325088501, + "learning_rate": 4.7887612821257855e-05, + "loss": 5.2191, + "step": 22161 + }, + { + "epoch": 0.13180369207346085, + "grad_norm": 1.5876145362854004, + "learning_rate": 4.788742489992416e-05, + "loss": 5.459, + "step": 22162 + }, + { + "epoch": 0.13180963935674184, + "grad_norm": 1.4462308883666992, + "learning_rate": 4.7887236970600705e-05, + "loss": 5.2757, + "step": 22163 + }, + { + "epoch": 0.13181558664002282, + "grad_norm": 1.288514494895935, + "learning_rate": 4.7887049033287546e-05, + "loss": 5.1, + "step": 22164 + }, + { + "epoch": 0.13182153392330384, + "grad_norm": 1.387949824333191, + "learning_rate": 4.788686108798476e-05, + "loss": 4.9212, + "step": 22165 + }, + { + "epoch": 0.13182748120658483, + "grad_norm": 1.534636378288269, + "learning_rate": 4.7886673134692404e-05, + "loss": 4.7585, + "step": 22166 + }, + { + "epoch": 0.13183342848986582, + "grad_norm": 1.464815378189087, + "learning_rate": 4.788648517341054e-05, + "loss": 5.121, + "step": 22167 + }, + { + "epoch": 0.13183937577314683, + "grad_norm": 1.2842152118682861, + "learning_rate": 4.788629720413925e-05, + "loss": 5.1032, + "step": 22168 + }, + { + "epoch": 0.13184532305642782, + "grad_norm": 1.5626686811447144, + "learning_rate": 4.7886109226878595e-05, + "loss": 4.9001, + "step": 22169 + }, + { + "epoch": 0.1318512703397088, + "grad_norm": 1.4019660949707031, + "learning_rate": 4.788592124162863e-05, + "loss": 5.2157, + "step": 22170 + }, + { + "epoch": 0.13185721762298983, + "grad_norm": 1.1018543243408203, + "learning_rate": 4.788573324838942e-05, + "loss": 5.5623, + "step": 22171 + }, + { + "epoch": 0.1318631649062708, + "grad_norm": 1.4074633121490479, + "learning_rate": 4.788554524716105e-05, + "loss": 5.0306, + "step": 22172 + }, + { + "epoch": 0.1318691121895518, + "grad_norm": 1.4724953174591064, + "learning_rate": 4.788535723794356e-05, + "loss": 5.033, + "step": 22173 + }, + { + "epoch": 0.13187505947283282, + "grad_norm": 1.359288215637207, + "learning_rate": 4.788516922073703e-05, + "loss": 4.918, + "step": 22174 + }, + { + "epoch": 0.1318810067561138, + "grad_norm": 1.3733046054840088, + "learning_rate": 4.788498119554152e-05, + "loss": 4.9631, + "step": 22175 + }, + { + "epoch": 0.1318869540393948, + "grad_norm": 1.1926368474960327, + "learning_rate": 4.7884793162357114e-05, + "loss": 4.8628, + "step": 22176 + }, + { + "epoch": 0.1318929013226758, + "grad_norm": 1.1444061994552612, + "learning_rate": 4.788460512118386e-05, + "loss": 4.8978, + "step": 22177 + }, + { + "epoch": 0.1318988486059568, + "grad_norm": 1.3945989608764648, + "learning_rate": 4.7884417072021814e-05, + "loss": 4.9901, + "step": 22178 + }, + { + "epoch": 0.13190479588923779, + "grad_norm": 1.4278130531311035, + "learning_rate": 4.7884229014871063e-05, + "loss": 4.8705, + "step": 22179 + }, + { + "epoch": 0.1319107431725188, + "grad_norm": 1.4391251802444458, + "learning_rate": 4.788404094973167e-05, + "loss": 4.8575, + "step": 22180 + }, + { + "epoch": 0.1319166904557998, + "grad_norm": 1.435241460800171, + "learning_rate": 4.788385287660369e-05, + "loss": 4.8571, + "step": 22181 + }, + { + "epoch": 0.13192263773908078, + "grad_norm": 1.2841169834136963, + "learning_rate": 4.788366479548718e-05, + "loss": 4.8738, + "step": 22182 + }, + { + "epoch": 0.1319285850223618, + "grad_norm": 1.318769931793213, + "learning_rate": 4.7883476706382236e-05, + "loss": 5.1381, + "step": 22183 + }, + { + "epoch": 0.13193453230564278, + "grad_norm": 1.398940920829773, + "learning_rate": 4.78832886092889e-05, + "loss": 4.8094, + "step": 22184 + }, + { + "epoch": 0.13194047958892377, + "grad_norm": 1.373937726020813, + "learning_rate": 4.788310050420725e-05, + "loss": 5.0183, + "step": 22185 + }, + { + "epoch": 0.1319464268722048, + "grad_norm": 1.2899675369262695, + "learning_rate": 4.788291239113734e-05, + "loss": 5.3211, + "step": 22186 + }, + { + "epoch": 0.13195237415548577, + "grad_norm": 1.2992362976074219, + "learning_rate": 4.788272427007924e-05, + "loss": 5.2411, + "step": 22187 + }, + { + "epoch": 0.13195832143876676, + "grad_norm": 1.3528488874435425, + "learning_rate": 4.7882536141033025e-05, + "loss": 5.272, + "step": 22188 + }, + { + "epoch": 0.13196426872204778, + "grad_norm": 1.0530016422271729, + "learning_rate": 4.7882348003998746e-05, + "loss": 5.1516, + "step": 22189 + }, + { + "epoch": 0.13197021600532877, + "grad_norm": 1.3447175025939941, + "learning_rate": 4.7882159858976486e-05, + "loss": 5.0007, + "step": 22190 + }, + { + "epoch": 0.13197616328860975, + "grad_norm": 1.531227946281433, + "learning_rate": 4.788197170596629e-05, + "loss": 5.0506, + "step": 22191 + }, + { + "epoch": 0.13198211057189077, + "grad_norm": 1.3458744287490845, + "learning_rate": 4.788178354496823e-05, + "loss": 4.931, + "step": 22192 + }, + { + "epoch": 0.13198805785517176, + "grad_norm": 1.380890965461731, + "learning_rate": 4.788159537598239e-05, + "loss": 5.2813, + "step": 22193 + }, + { + "epoch": 0.13199400513845275, + "grad_norm": 1.387640118598938, + "learning_rate": 4.788140719900881e-05, + "loss": 5.1234, + "step": 22194 + }, + { + "epoch": 0.13199995242173376, + "grad_norm": 1.304620623588562, + "learning_rate": 4.788121901404757e-05, + "loss": 4.988, + "step": 22195 + }, + { + "epoch": 0.13200589970501475, + "grad_norm": 1.3828579187393188, + "learning_rate": 4.7881030821098736e-05, + "loss": 5.2552, + "step": 22196 + }, + { + "epoch": 0.13201184698829574, + "grad_norm": 1.4819931983947754, + "learning_rate": 4.788084262016237e-05, + "loss": 4.9094, + "step": 22197 + }, + { + "epoch": 0.13201779427157675, + "grad_norm": 1.4570109844207764, + "learning_rate": 4.788065441123853e-05, + "loss": 5.0518, + "step": 22198 + }, + { + "epoch": 0.13202374155485774, + "grad_norm": 1.4303123950958252, + "learning_rate": 4.7880466194327305e-05, + "loss": 4.773, + "step": 22199 + }, + { + "epoch": 0.13202968883813873, + "grad_norm": 1.5727583169937134, + "learning_rate": 4.788027796942874e-05, + "loss": 4.458, + "step": 22200 + }, + { + "epoch": 0.13203563612141975, + "grad_norm": 1.5693985223770142, + "learning_rate": 4.78800897365429e-05, + "loss": 4.4378, + "step": 22201 + }, + { + "epoch": 0.13204158340470074, + "grad_norm": 1.4328757524490356, + "learning_rate": 4.787990149566987e-05, + "loss": 4.3503, + "step": 22202 + }, + { + "epoch": 0.13204753068798172, + "grad_norm": 1.4490034580230713, + "learning_rate": 4.787971324680969e-05, + "loss": 4.3476, + "step": 22203 + }, + { + "epoch": 0.13205347797126274, + "grad_norm": 1.4600367546081543, + "learning_rate": 4.7879524989962446e-05, + "loss": 4.3052, + "step": 22204 + }, + { + "epoch": 0.13205942525454373, + "grad_norm": 1.5479463338851929, + "learning_rate": 4.787933672512819e-05, + "loss": 4.3291, + "step": 22205 + }, + { + "epoch": 0.13206537253782472, + "grad_norm": 1.6317998170852661, + "learning_rate": 4.7879148452306986e-05, + "loss": 4.2697, + "step": 22206 + }, + { + "epoch": 0.13207131982110573, + "grad_norm": 1.5387004613876343, + "learning_rate": 4.787896017149892e-05, + "loss": 4.3413, + "step": 22207 + }, + { + "epoch": 0.13207726710438672, + "grad_norm": 1.5556374788284302, + "learning_rate": 4.7878771882704046e-05, + "loss": 4.2002, + "step": 22208 + }, + { + "epoch": 0.1320832143876677, + "grad_norm": 1.626752495765686, + "learning_rate": 4.787858358592243e-05, + "loss": 4.2729, + "step": 22209 + }, + { + "epoch": 0.13208916167094872, + "grad_norm": 1.3982586860656738, + "learning_rate": 4.7878395281154134e-05, + "loss": 4.2138, + "step": 22210 + }, + { + "epoch": 0.1320951089542297, + "grad_norm": 1.5739530324935913, + "learning_rate": 4.787820696839922e-05, + "loss": 4.1526, + "step": 22211 + }, + { + "epoch": 0.1321010562375107, + "grad_norm": 1.458217978477478, + "learning_rate": 4.787801864765777e-05, + "loss": 4.2584, + "step": 22212 + }, + { + "epoch": 0.13210700352079172, + "grad_norm": 1.4696205854415894, + "learning_rate": 4.787783031892984e-05, + "loss": 4.2042, + "step": 22213 + }, + { + "epoch": 0.1321129508040727, + "grad_norm": 1.729152798652649, + "learning_rate": 4.7877641982215485e-05, + "loss": 4.4817, + "step": 22214 + }, + { + "epoch": 0.1321188980873537, + "grad_norm": 1.7412737607955933, + "learning_rate": 4.787745363751479e-05, + "loss": 4.4568, + "step": 22215 + }, + { + "epoch": 0.13212484537063468, + "grad_norm": 1.6463770866394043, + "learning_rate": 4.787726528482781e-05, + "loss": 4.4503, + "step": 22216 + }, + { + "epoch": 0.1321307926539157, + "grad_norm": 1.5496896505355835, + "learning_rate": 4.7877076924154617e-05, + "loss": 4.3863, + "step": 22217 + }, + { + "epoch": 0.13213673993719668, + "grad_norm": 1.6521345376968384, + "learning_rate": 4.787688855549527e-05, + "loss": 4.3847, + "step": 22218 + }, + { + "epoch": 0.13214268722047767, + "grad_norm": 1.6477288007736206, + "learning_rate": 4.7876700178849836e-05, + "loss": 4.3939, + "step": 22219 + }, + { + "epoch": 0.1321486345037587, + "grad_norm": 1.6795778274536133, + "learning_rate": 4.787651179421838e-05, + "loss": 4.1722, + "step": 22220 + }, + { + "epoch": 0.13215458178703968, + "grad_norm": 1.5795823335647583, + "learning_rate": 4.787632340160098e-05, + "loss": 4.2125, + "step": 22221 + }, + { + "epoch": 0.13216052907032066, + "grad_norm": 1.6583930253982544, + "learning_rate": 4.7876135000997686e-05, + "loss": 4.2013, + "step": 22222 + }, + { + "epoch": 0.13216647635360168, + "grad_norm": 1.4495878219604492, + "learning_rate": 4.7875946592408575e-05, + "loss": 4.1335, + "step": 22223 + }, + { + "epoch": 0.13217242363688267, + "grad_norm": 1.5657227039337158, + "learning_rate": 4.78757581758337e-05, + "loss": 4.1514, + "step": 22224 + }, + { + "epoch": 0.13217837092016366, + "grad_norm": 1.7183332443237305, + "learning_rate": 4.787556975127313e-05, + "loss": 4.7715, + "step": 22225 + }, + { + "epoch": 0.13218431820344467, + "grad_norm": 2.1822710037231445, + "learning_rate": 4.7875381318726945e-05, + "loss": 4.9383, + "step": 22226 + }, + { + "epoch": 0.13219026548672566, + "grad_norm": 1.9633662700653076, + "learning_rate": 4.787519287819519e-05, + "loss": 4.9601, + "step": 22227 + }, + { + "epoch": 0.13219621277000665, + "grad_norm": 1.6858619451522827, + "learning_rate": 4.787500442967795e-05, + "loss": 5.0091, + "step": 22228 + }, + { + "epoch": 0.13220216005328767, + "grad_norm": 1.5447601079940796, + "learning_rate": 4.787481597317528e-05, + "loss": 4.8372, + "step": 22229 + }, + { + "epoch": 0.13220810733656865, + "grad_norm": 1.4934616088867188, + "learning_rate": 4.787462750868725e-05, + "loss": 4.9812, + "step": 22230 + }, + { + "epoch": 0.13221405461984964, + "grad_norm": 1.4039883613586426, + "learning_rate": 4.787443903621393e-05, + "loss": 4.829, + "step": 22231 + }, + { + "epoch": 0.13222000190313066, + "grad_norm": 1.5184186697006226, + "learning_rate": 4.787425055575536e-05, + "loss": 4.8379, + "step": 22232 + }, + { + "epoch": 0.13222594918641165, + "grad_norm": 1.3783762454986572, + "learning_rate": 4.787406206731164e-05, + "loss": 4.9209, + "step": 22233 + }, + { + "epoch": 0.13223189646969263, + "grad_norm": 1.360772967338562, + "learning_rate": 4.787387357088282e-05, + "loss": 5.0036, + "step": 22234 + }, + { + "epoch": 0.13223784375297365, + "grad_norm": 1.4753018617630005, + "learning_rate": 4.787368506646897e-05, + "loss": 5.3268, + "step": 22235 + }, + { + "epoch": 0.13224379103625464, + "grad_norm": 1.3295317888259888, + "learning_rate": 4.787349655407014e-05, + "loss": 5.3096, + "step": 22236 + }, + { + "epoch": 0.13224973831953563, + "grad_norm": 1.4120566844940186, + "learning_rate": 4.787330803368642e-05, + "loss": 4.9041, + "step": 22237 + }, + { + "epoch": 0.13225568560281664, + "grad_norm": 1.3822401762008667, + "learning_rate": 4.787311950531787e-05, + "loss": 5.0089, + "step": 22238 + }, + { + "epoch": 0.13226163288609763, + "grad_norm": 1.0574642419815063, + "learning_rate": 4.7872930968964535e-05, + "loss": 5.528, + "step": 22239 + }, + { + "epoch": 0.13226758016937862, + "grad_norm": 1.4523993730545044, + "learning_rate": 4.78727424246265e-05, + "loss": 5.3844, + "step": 22240 + }, + { + "epoch": 0.13227352745265963, + "grad_norm": 1.283956527709961, + "learning_rate": 4.787255387230383e-05, + "loss": 5.226, + "step": 22241 + }, + { + "epoch": 0.13227947473594062, + "grad_norm": 1.621275782585144, + "learning_rate": 4.7872365311996594e-05, + "loss": 4.7797, + "step": 22242 + }, + { + "epoch": 0.1322854220192216, + "grad_norm": 1.327376365661621, + "learning_rate": 4.787217674370484e-05, + "loss": 4.9057, + "step": 22243 + }, + { + "epoch": 0.13229136930250263, + "grad_norm": 1.5311939716339111, + "learning_rate": 4.787198816742865e-05, + "loss": 5.0076, + "step": 22244 + }, + { + "epoch": 0.13229731658578361, + "grad_norm": 1.3926832675933838, + "learning_rate": 4.7871799583168085e-05, + "loss": 4.9328, + "step": 22245 + }, + { + "epoch": 0.1323032638690646, + "grad_norm": 1.2381867170333862, + "learning_rate": 4.787161099092321e-05, + "loss": 5.1678, + "step": 22246 + }, + { + "epoch": 0.13230921115234562, + "grad_norm": 1.1969068050384521, + "learning_rate": 4.78714223906941e-05, + "loss": 5.5106, + "step": 22247 + }, + { + "epoch": 0.1323151584356266, + "grad_norm": 1.2368844747543335, + "learning_rate": 4.7871233782480804e-05, + "loss": 5.4105, + "step": 22248 + }, + { + "epoch": 0.1323211057189076, + "grad_norm": 1.45974862575531, + "learning_rate": 4.78710451662834e-05, + "loss": 4.9328, + "step": 22249 + }, + { + "epoch": 0.1323270530021886, + "grad_norm": 1.2457060813903809, + "learning_rate": 4.787085654210195e-05, + "loss": 5.225, + "step": 22250 + }, + { + "epoch": 0.1323330002854696, + "grad_norm": 1.4274303913116455, + "learning_rate": 4.787066790993652e-05, + "loss": 4.8785, + "step": 22251 + }, + { + "epoch": 0.1323389475687506, + "grad_norm": 1.3072400093078613, + "learning_rate": 4.7870479269787174e-05, + "loss": 4.871, + "step": 22252 + }, + { + "epoch": 0.1323448948520316, + "grad_norm": 1.2442991733551025, + "learning_rate": 4.787029062165398e-05, + "loss": 4.8374, + "step": 22253 + }, + { + "epoch": 0.1323508421353126, + "grad_norm": 1.3584920167922974, + "learning_rate": 4.787010196553701e-05, + "loss": 5.2427, + "step": 22254 + }, + { + "epoch": 0.13235678941859358, + "grad_norm": 1.560067892074585, + "learning_rate": 4.786991330143632e-05, + "loss": 4.8689, + "step": 22255 + }, + { + "epoch": 0.1323627367018746, + "grad_norm": 1.3197054862976074, + "learning_rate": 4.786972462935198e-05, + "loss": 4.8326, + "step": 22256 + }, + { + "epoch": 0.13236868398515558, + "grad_norm": 1.2790191173553467, + "learning_rate": 4.786953594928405e-05, + "loss": 4.7454, + "step": 22257 + }, + { + "epoch": 0.13237463126843657, + "grad_norm": 1.6187344789505005, + "learning_rate": 4.7869347261232606e-05, + "loss": 5.5456, + "step": 22258 + }, + { + "epoch": 0.1323805785517176, + "grad_norm": 1.3327410221099854, + "learning_rate": 4.786915856519771e-05, + "loss": 4.834, + "step": 22259 + }, + { + "epoch": 0.13238652583499858, + "grad_norm": 1.2602509260177612, + "learning_rate": 4.786896986117943e-05, + "loss": 5.1677, + "step": 22260 + }, + { + "epoch": 0.13239247311827956, + "grad_norm": 1.4382299184799194, + "learning_rate": 4.786878114917782e-05, + "loss": 5.0591, + "step": 22261 + }, + { + "epoch": 0.13239842040156058, + "grad_norm": 1.4061304330825806, + "learning_rate": 4.786859242919296e-05, + "loss": 5.0161, + "step": 22262 + }, + { + "epoch": 0.13240436768484157, + "grad_norm": 1.4143967628479004, + "learning_rate": 4.7868403701224905e-05, + "loss": 4.7625, + "step": 22263 + }, + { + "epoch": 0.13241031496812256, + "grad_norm": 1.4221394062042236, + "learning_rate": 4.786821496527374e-05, + "loss": 4.8579, + "step": 22264 + }, + { + "epoch": 0.13241626225140357, + "grad_norm": 1.3852332830429077, + "learning_rate": 4.78680262213395e-05, + "loss": 4.6081, + "step": 22265 + }, + { + "epoch": 0.13242220953468456, + "grad_norm": 1.2698066234588623, + "learning_rate": 4.786783746942228e-05, + "loss": 4.7903, + "step": 22266 + }, + { + "epoch": 0.13242815681796555, + "grad_norm": 1.2313082218170166, + "learning_rate": 4.7867648709522136e-05, + "loss": 4.8353, + "step": 22267 + }, + { + "epoch": 0.13243410410124656, + "grad_norm": 1.3578218221664429, + "learning_rate": 4.7867459941639124e-05, + "loss": 5.2778, + "step": 22268 + }, + { + "epoch": 0.13244005138452755, + "grad_norm": 1.5034034252166748, + "learning_rate": 4.786727116577332e-05, + "loss": 5.2208, + "step": 22269 + }, + { + "epoch": 0.13244599866780854, + "grad_norm": 1.621207356452942, + "learning_rate": 4.786708238192479e-05, + "loss": 4.8394, + "step": 22270 + }, + { + "epoch": 0.13245194595108956, + "grad_norm": 1.471311092376709, + "learning_rate": 4.7866893590093595e-05, + "loss": 4.8942, + "step": 22271 + }, + { + "epoch": 0.13245789323437054, + "grad_norm": 1.3276898860931396, + "learning_rate": 4.7866704790279806e-05, + "loss": 4.833, + "step": 22272 + }, + { + "epoch": 0.13246384051765153, + "grad_norm": 1.484650731086731, + "learning_rate": 4.786651598248349e-05, + "loss": 5.0415, + "step": 22273 + }, + { + "epoch": 0.13246978780093252, + "grad_norm": 1.3327105045318604, + "learning_rate": 4.7866327166704703e-05, + "loss": 5.2227, + "step": 22274 + }, + { + "epoch": 0.13247573508421354, + "grad_norm": 1.4387754201889038, + "learning_rate": 4.7866138342943525e-05, + "loss": 5.1764, + "step": 22275 + }, + { + "epoch": 0.13248168236749452, + "grad_norm": 1.3406511545181274, + "learning_rate": 4.786594951120001e-05, + "loss": 5.2711, + "step": 22276 + }, + { + "epoch": 0.1324876296507755, + "grad_norm": 1.3859505653381348, + "learning_rate": 4.7865760671474224e-05, + "loss": 5.1102, + "step": 22277 + }, + { + "epoch": 0.13249357693405653, + "grad_norm": 1.517545461654663, + "learning_rate": 4.7865571823766245e-05, + "loss": 5.1275, + "step": 22278 + }, + { + "epoch": 0.13249952421733752, + "grad_norm": 1.720278263092041, + "learning_rate": 4.7865382968076125e-05, + "loss": 5.0902, + "step": 22279 + }, + { + "epoch": 0.1325054715006185, + "grad_norm": 1.543717622756958, + "learning_rate": 4.786519410440394e-05, + "loss": 5.1094, + "step": 22280 + }, + { + "epoch": 0.13251141878389952, + "grad_norm": 1.2068023681640625, + "learning_rate": 4.786500523274975e-05, + "loss": 5.1791, + "step": 22281 + }, + { + "epoch": 0.1325173660671805, + "grad_norm": 1.426169991493225, + "learning_rate": 4.786481635311362e-05, + "loss": 5.2155, + "step": 22282 + }, + { + "epoch": 0.1325233133504615, + "grad_norm": 1.4624898433685303, + "learning_rate": 4.7864627465495626e-05, + "loss": 4.8741, + "step": 22283 + }, + { + "epoch": 0.1325292606337425, + "grad_norm": 1.2942382097244263, + "learning_rate": 4.786443856989582e-05, + "loss": 5.4888, + "step": 22284 + }, + { + "epoch": 0.1325352079170235, + "grad_norm": 1.2372108697891235, + "learning_rate": 4.786424966631428e-05, + "loss": 5.1907, + "step": 22285 + }, + { + "epoch": 0.1325411552003045, + "grad_norm": 1.368546962738037, + "learning_rate": 4.7864060754751064e-05, + "loss": 5.1653, + "step": 22286 + }, + { + "epoch": 0.1325471024835855, + "grad_norm": 1.6052632331848145, + "learning_rate": 4.786387183520624e-05, + "loss": 5.2139, + "step": 22287 + }, + { + "epoch": 0.1325530497668665, + "grad_norm": 1.4893959760665894, + "learning_rate": 4.7863682907679874e-05, + "loss": 4.9972, + "step": 22288 + }, + { + "epoch": 0.13255899705014748, + "grad_norm": 1.370919942855835, + "learning_rate": 4.786349397217204e-05, + "loss": 5.315, + "step": 22289 + }, + { + "epoch": 0.1325649443334285, + "grad_norm": 1.7138948440551758, + "learning_rate": 4.786330502868279e-05, + "loss": 5.4063, + "step": 22290 + }, + { + "epoch": 0.13257089161670949, + "grad_norm": 1.4117851257324219, + "learning_rate": 4.786311607721219e-05, + "loss": 5.3601, + "step": 22291 + }, + { + "epoch": 0.13257683889999047, + "grad_norm": 2.5631167888641357, + "learning_rate": 4.786292711776033e-05, + "loss": 3.8547, + "step": 22292 + }, + { + "epoch": 0.1325827861832715, + "grad_norm": 2.4507203102111816, + "learning_rate": 4.786273815032724e-05, + "loss": 3.9096, + "step": 22293 + }, + { + "epoch": 0.13258873346655248, + "grad_norm": 2.384136915206909, + "learning_rate": 4.7862549174913014e-05, + "loss": 4.0437, + "step": 22294 + }, + { + "epoch": 0.13259468074983347, + "grad_norm": 2.215449094772339, + "learning_rate": 4.786236019151771e-05, + "loss": 3.9703, + "step": 22295 + }, + { + "epoch": 0.13260062803311448, + "grad_norm": 2.1639139652252197, + "learning_rate": 4.786217120014138e-05, + "loss": 3.5108, + "step": 22296 + }, + { + "epoch": 0.13260657531639547, + "grad_norm": 2.2001569271087646, + "learning_rate": 4.786198220078412e-05, + "loss": 3.3189, + "step": 22297 + }, + { + "epoch": 0.13261252259967646, + "grad_norm": 2.1637179851531982, + "learning_rate": 4.7861793193445964e-05, + "loss": 3.3301, + "step": 22298 + }, + { + "epoch": 0.13261846988295747, + "grad_norm": 2.12546443939209, + "learning_rate": 4.7861604178127e-05, + "loss": 3.4002, + "step": 22299 + }, + { + "epoch": 0.13262441716623846, + "grad_norm": 1.632663369178772, + "learning_rate": 4.7861415154827285e-05, + "loss": 5.6516, + "step": 22300 + }, + { + "epoch": 0.13263036444951945, + "grad_norm": 1.6801213026046753, + "learning_rate": 4.786122612354688e-05, + "loss": 5.5013, + "step": 22301 + }, + { + "epoch": 0.13263631173280047, + "grad_norm": 1.5306708812713623, + "learning_rate": 4.7861037084285866e-05, + "loss": 5.6885, + "step": 22302 + }, + { + "epoch": 0.13264225901608145, + "grad_norm": 1.553322196006775, + "learning_rate": 4.7860848037044294e-05, + "loss": 5.499, + "step": 22303 + }, + { + "epoch": 0.13264820629936244, + "grad_norm": 1.5508325099945068, + "learning_rate": 4.7860658981822234e-05, + "loss": 5.522, + "step": 22304 + }, + { + "epoch": 0.13265415358264346, + "grad_norm": 1.4522117376327515, + "learning_rate": 4.786046991861976e-05, + "loss": 5.616, + "step": 22305 + }, + { + "epoch": 0.13266010086592445, + "grad_norm": 1.5596072673797607, + "learning_rate": 4.7860280847436926e-05, + "loss": 5.5323, + "step": 22306 + }, + { + "epoch": 0.13266604814920543, + "grad_norm": 1.8776074647903442, + "learning_rate": 4.7860091768273806e-05, + "loss": 5.4604, + "step": 22307 + }, + { + "epoch": 0.13267199543248645, + "grad_norm": 1.97171151638031, + "learning_rate": 4.785990268113048e-05, + "loss": 5.2305, + "step": 22308 + }, + { + "epoch": 0.13267794271576744, + "grad_norm": 1.35499107837677, + "learning_rate": 4.785971358600698e-05, + "loss": 4.8288, + "step": 22309 + }, + { + "epoch": 0.13268388999904843, + "grad_norm": 1.5026946067810059, + "learning_rate": 4.785952448290339e-05, + "loss": 4.6641, + "step": 22310 + }, + { + "epoch": 0.13268983728232944, + "grad_norm": 1.6728490591049194, + "learning_rate": 4.785933537181978e-05, + "loss": 4.8855, + "step": 22311 + }, + { + "epoch": 0.13269578456561043, + "grad_norm": 1.834144115447998, + "learning_rate": 4.7859146252756213e-05, + "loss": 4.5688, + "step": 22312 + }, + { + "epoch": 0.13270173184889142, + "grad_norm": 2.314073085784912, + "learning_rate": 4.7858957125712753e-05, + "loss": 5.3503, + "step": 22313 + }, + { + "epoch": 0.13270767913217243, + "grad_norm": 1.7270644903182983, + "learning_rate": 4.785876799068947e-05, + "loss": 5.6763, + "step": 22314 + }, + { + "epoch": 0.13271362641545342, + "grad_norm": 1.929304599761963, + "learning_rate": 4.785857884768643e-05, + "loss": 5.1659, + "step": 22315 + }, + { + "epoch": 0.1327195736987344, + "grad_norm": 1.8507132530212402, + "learning_rate": 4.785838969670369e-05, + "loss": 5.0806, + "step": 22316 + }, + { + "epoch": 0.13272552098201543, + "grad_norm": 1.6761378049850464, + "learning_rate": 4.785820053774133e-05, + "loss": 5.2008, + "step": 22317 + }, + { + "epoch": 0.13273146826529642, + "grad_norm": 1.521119475364685, + "learning_rate": 4.785801137079939e-05, + "loss": 5.0448, + "step": 22318 + }, + { + "epoch": 0.1327374155485774, + "grad_norm": 1.6237796545028687, + "learning_rate": 4.785782219587797e-05, + "loss": 5.0451, + "step": 22319 + }, + { + "epoch": 0.13274336283185842, + "grad_norm": 1.4166826009750366, + "learning_rate": 4.785763301297712e-05, + "loss": 5.0055, + "step": 22320 + }, + { + "epoch": 0.1327493101151394, + "grad_norm": 1.7093290090560913, + "learning_rate": 4.7857443822096905e-05, + "loss": 4.9528, + "step": 22321 + }, + { + "epoch": 0.1327552573984204, + "grad_norm": 1.7715668678283691, + "learning_rate": 4.785725462323739e-05, + "loss": 5.1638, + "step": 22322 + }, + { + "epoch": 0.1327612046817014, + "grad_norm": 1.8321062326431274, + "learning_rate": 4.785706541639865e-05, + "loss": 5.1916, + "step": 22323 + }, + { + "epoch": 0.1327671519649824, + "grad_norm": 1.6878079175949097, + "learning_rate": 4.7856876201580736e-05, + "loss": 5.1106, + "step": 22324 + }, + { + "epoch": 0.1327730992482634, + "grad_norm": 1.5275590419769287, + "learning_rate": 4.7856686978783725e-05, + "loss": 5.1073, + "step": 22325 + }, + { + "epoch": 0.1327790465315444, + "grad_norm": 1.6648119688034058, + "learning_rate": 4.7856497748007684e-05, + "loss": 5.3244, + "step": 22326 + }, + { + "epoch": 0.1327849938148254, + "grad_norm": 1.693325400352478, + "learning_rate": 4.7856308509252674e-05, + "loss": 5.596, + "step": 22327 + }, + { + "epoch": 0.13279094109810638, + "grad_norm": 2.6629621982574463, + "learning_rate": 4.785611926251876e-05, + "loss": 4.1305, + "step": 22328 + }, + { + "epoch": 0.1327968883813874, + "grad_norm": 2.4292843341827393, + "learning_rate": 4.785593000780602e-05, + "loss": 4.5656, + "step": 22329 + }, + { + "epoch": 0.13280283566466838, + "grad_norm": 1.5317484140396118, + "learning_rate": 4.78557407451145e-05, + "loss": 5.6828, + "step": 22330 + }, + { + "epoch": 0.13280878294794937, + "grad_norm": 1.59109365940094, + "learning_rate": 4.7855551474444285e-05, + "loss": 5.7914, + "step": 22331 + }, + { + "epoch": 0.13281473023123036, + "grad_norm": 1.359665036201477, + "learning_rate": 4.7855362195795425e-05, + "loss": 5.6294, + "step": 22332 + }, + { + "epoch": 0.13282067751451138, + "grad_norm": 1.327269196510315, + "learning_rate": 4.7855172909168003e-05, + "loss": 5.7178, + "step": 22333 + }, + { + "epoch": 0.13282662479779236, + "grad_norm": 1.4080103635787964, + "learning_rate": 4.785498361456207e-05, + "loss": 5.8786, + "step": 22334 + }, + { + "epoch": 0.13283257208107335, + "grad_norm": 1.393926978111267, + "learning_rate": 4.78547943119777e-05, + "loss": 5.4177, + "step": 22335 + }, + { + "epoch": 0.13283851936435437, + "grad_norm": 1.6050227880477905, + "learning_rate": 4.785460500141495e-05, + "loss": 5.5235, + "step": 22336 + }, + { + "epoch": 0.13284446664763536, + "grad_norm": 1.5462367534637451, + "learning_rate": 4.785441568287391e-05, + "loss": 6.1101, + "step": 22337 + }, + { + "epoch": 0.13285041393091634, + "grad_norm": 1.5062382221221924, + "learning_rate": 4.785422635635462e-05, + "loss": 5.8075, + "step": 22338 + }, + { + "epoch": 0.13285636121419736, + "grad_norm": 1.7419465780258179, + "learning_rate": 4.785403702185716e-05, + "loss": 5.8189, + "step": 22339 + }, + { + "epoch": 0.13286230849747835, + "grad_norm": 1.754164218902588, + "learning_rate": 4.785384767938158e-05, + "loss": 5.6446, + "step": 22340 + }, + { + "epoch": 0.13286825578075934, + "grad_norm": 1.3769707679748535, + "learning_rate": 4.785365832892797e-05, + "loss": 5.7689, + "step": 22341 + }, + { + "epoch": 0.13287420306404035, + "grad_norm": 1.6358861923217773, + "learning_rate": 4.7853468970496386e-05, + "loss": 5.4568, + "step": 22342 + }, + { + "epoch": 0.13288015034732134, + "grad_norm": 1.567083477973938, + "learning_rate": 4.7853279604086883e-05, + "loss": 5.4124, + "step": 22343 + }, + { + "epoch": 0.13288609763060233, + "grad_norm": 1.3793751001358032, + "learning_rate": 4.785309022969954e-05, + "loss": 5.5976, + "step": 22344 + }, + { + "epoch": 0.13289204491388334, + "grad_norm": 1.5371218919754028, + "learning_rate": 4.7852900847334414e-05, + "loss": 5.2898, + "step": 22345 + }, + { + "epoch": 0.13289799219716433, + "grad_norm": 2.1502809524536133, + "learning_rate": 4.785271145699158e-05, + "loss": 4.1536, + "step": 22346 + }, + { + "epoch": 0.13290393948044532, + "grad_norm": 1.9648473262786865, + "learning_rate": 4.785252205867111e-05, + "loss": 4.1755, + "step": 22347 + }, + { + "epoch": 0.13290988676372634, + "grad_norm": 1.874877691268921, + "learning_rate": 4.785233265237305e-05, + "loss": 4.1043, + "step": 22348 + }, + { + "epoch": 0.13291583404700733, + "grad_norm": 1.924109935760498, + "learning_rate": 4.785214323809748e-05, + "loss": 4.0551, + "step": 22349 + }, + { + "epoch": 0.1329217813302883, + "grad_norm": 1.8653898239135742, + "learning_rate": 4.785195381584446e-05, + "loss": 4.0712, + "step": 22350 + }, + { + "epoch": 0.13292772861356933, + "grad_norm": 1.8480240106582642, + "learning_rate": 4.785176438561406e-05, + "loss": 4.0729, + "step": 22351 + }, + { + "epoch": 0.13293367589685032, + "grad_norm": 1.7229113578796387, + "learning_rate": 4.785157494740635e-05, + "loss": 3.9822, + "step": 22352 + }, + { + "epoch": 0.1329396231801313, + "grad_norm": 1.9756056070327759, + "learning_rate": 4.7851385501221385e-05, + "loss": 3.8667, + "step": 22353 + }, + { + "epoch": 0.13294557046341232, + "grad_norm": 1.9121302366256714, + "learning_rate": 4.785119604705924e-05, + "loss": 4.0157, + "step": 22354 + }, + { + "epoch": 0.1329515177466933, + "grad_norm": 1.999444842338562, + "learning_rate": 4.785100658491998e-05, + "loss": 4.0511, + "step": 22355 + }, + { + "epoch": 0.1329574650299743, + "grad_norm": 1.8992079496383667, + "learning_rate": 4.785081711480367e-05, + "loss": 3.9595, + "step": 22356 + }, + { + "epoch": 0.1329634123132553, + "grad_norm": 1.8835148811340332, + "learning_rate": 4.785062763671037e-05, + "loss": 3.9891, + "step": 22357 + }, + { + "epoch": 0.1329693595965363, + "grad_norm": 1.8938409090042114, + "learning_rate": 4.785043815064015e-05, + "loss": 3.927, + "step": 22358 + }, + { + "epoch": 0.1329753068798173, + "grad_norm": 1.8824357986450195, + "learning_rate": 4.785024865659309e-05, + "loss": 4.0438, + "step": 22359 + }, + { + "epoch": 0.1329812541630983, + "grad_norm": 1.9158250093460083, + "learning_rate": 4.785005915456924e-05, + "loss": 4.0448, + "step": 22360 + }, + { + "epoch": 0.1329872014463793, + "grad_norm": 1.7421679496765137, + "learning_rate": 4.784986964456867e-05, + "loss": 3.9869, + "step": 22361 + }, + { + "epoch": 0.13299314872966028, + "grad_norm": 1.7917057275772095, + "learning_rate": 4.784968012659145e-05, + "loss": 3.9976, + "step": 22362 + }, + { + "epoch": 0.1329990960129413, + "grad_norm": 1.9387284517288208, + "learning_rate": 4.784949060063764e-05, + "loss": 4.3383, + "step": 22363 + }, + { + "epoch": 0.1330050432962223, + "grad_norm": 2.60548996925354, + "learning_rate": 4.78493010667073e-05, + "loss": 4.5527, + "step": 22364 + }, + { + "epoch": 0.13301099057950327, + "grad_norm": 2.440361976623535, + "learning_rate": 4.784911152480051e-05, + "loss": 4.7931, + "step": 22365 + }, + { + "epoch": 0.1330169378627843, + "grad_norm": 2.4233226776123047, + "learning_rate": 4.784892197491734e-05, + "loss": 4.5482, + "step": 22366 + }, + { + "epoch": 0.13302288514606528, + "grad_norm": 2.3421928882598877, + "learning_rate": 4.7848732417057836e-05, + "loss": 4.6708, + "step": 22367 + }, + { + "epoch": 0.13302883242934627, + "grad_norm": 1.9476850032806396, + "learning_rate": 4.784854285122208e-05, + "loss": 4.5518, + "step": 22368 + }, + { + "epoch": 0.13303477971262728, + "grad_norm": 2.015965223312378, + "learning_rate": 4.784835327741013e-05, + "loss": 4.5258, + "step": 22369 + }, + { + "epoch": 0.13304072699590827, + "grad_norm": 2.28434157371521, + "learning_rate": 4.784816369562206e-05, + "loss": 4.6413, + "step": 22370 + }, + { + "epoch": 0.13304667427918926, + "grad_norm": 1.9141323566436768, + "learning_rate": 4.784797410585794e-05, + "loss": 4.7134, + "step": 22371 + }, + { + "epoch": 0.13305262156247027, + "grad_norm": 2.2627341747283936, + "learning_rate": 4.7847784508117815e-05, + "loss": 4.512, + "step": 22372 + }, + { + "epoch": 0.13305856884575126, + "grad_norm": 2.2111268043518066, + "learning_rate": 4.784759490240177e-05, + "loss": 4.6105, + "step": 22373 + }, + { + "epoch": 0.13306451612903225, + "grad_norm": 2.4321610927581787, + "learning_rate": 4.7847405288709864e-05, + "loss": 5.1333, + "step": 22374 + }, + { + "epoch": 0.13307046341231327, + "grad_norm": 2.49605131149292, + "learning_rate": 4.7847215667042165e-05, + "loss": 5.2355, + "step": 22375 + }, + { + "epoch": 0.13307641069559425, + "grad_norm": 2.2517080307006836, + "learning_rate": 4.784702603739874e-05, + "loss": 5.3007, + "step": 22376 + }, + { + "epoch": 0.13308235797887524, + "grad_norm": 1.807502269744873, + "learning_rate": 4.784683639977966e-05, + "loss": 5.2645, + "step": 22377 + }, + { + "epoch": 0.13308830526215626, + "grad_norm": 1.9133596420288086, + "learning_rate": 4.784664675418497e-05, + "loss": 5.3313, + "step": 22378 + }, + { + "epoch": 0.13309425254543725, + "grad_norm": 1.823691725730896, + "learning_rate": 4.7846457100614774e-05, + "loss": 5.5637, + "step": 22379 + }, + { + "epoch": 0.13310019982871824, + "grad_norm": 1.769579291343689, + "learning_rate": 4.78462674390691e-05, + "loss": 5.3217, + "step": 22380 + }, + { + "epoch": 0.13310614711199925, + "grad_norm": 1.576685905456543, + "learning_rate": 4.784607776954804e-05, + "loss": 5.5387, + "step": 22381 + }, + { + "epoch": 0.13311209439528024, + "grad_norm": 1.5737719535827637, + "learning_rate": 4.784588809205164e-05, + "loss": 5.269, + "step": 22382 + }, + { + "epoch": 0.13311804167856123, + "grad_norm": 1.6323963403701782, + "learning_rate": 4.784569840657998e-05, + "loss": 5.156, + "step": 22383 + }, + { + "epoch": 0.13312398896184224, + "grad_norm": 2.5943386554718018, + "learning_rate": 4.784550871313312e-05, + "loss": 5.0882, + "step": 22384 + }, + { + "epoch": 0.13312993624512323, + "grad_norm": 1.5392063856124878, + "learning_rate": 4.784531901171113e-05, + "loss": 5.0303, + "step": 22385 + }, + { + "epoch": 0.13313588352840422, + "grad_norm": 1.7257198095321655, + "learning_rate": 4.784512930231408e-05, + "loss": 5.3784, + "step": 22386 + }, + { + "epoch": 0.13314183081168524, + "grad_norm": 1.7736787796020508, + "learning_rate": 4.784493958494203e-05, + "loss": 5.256, + "step": 22387 + }, + { + "epoch": 0.13314777809496622, + "grad_norm": 1.575386643409729, + "learning_rate": 4.784474985959505e-05, + "loss": 5.1247, + "step": 22388 + }, + { + "epoch": 0.1331537253782472, + "grad_norm": 1.6164257526397705, + "learning_rate": 4.7844560126273195e-05, + "loss": 5.553, + "step": 22389 + }, + { + "epoch": 0.13315967266152823, + "grad_norm": 1.515674114227295, + "learning_rate": 4.7844370384976546e-05, + "loss": 5.556, + "step": 22390 + }, + { + "epoch": 0.13316561994480922, + "grad_norm": 1.5831459760665894, + "learning_rate": 4.784418063570516e-05, + "loss": 5.2649, + "step": 22391 + }, + { + "epoch": 0.1331715672280902, + "grad_norm": 1.5372157096862793, + "learning_rate": 4.7843990878459114e-05, + "loss": 5.1961, + "step": 22392 + }, + { + "epoch": 0.1331775145113712, + "grad_norm": 1.5881307125091553, + "learning_rate": 4.784380111323846e-05, + "loss": 5.5521, + "step": 22393 + }, + { + "epoch": 0.1331834617946522, + "grad_norm": 1.7717739343643188, + "learning_rate": 4.784361134004327e-05, + "loss": 5.4407, + "step": 22394 + }, + { + "epoch": 0.1331894090779332, + "grad_norm": 1.7472600936889648, + "learning_rate": 4.784342155887362e-05, + "loss": 5.1055, + "step": 22395 + }, + { + "epoch": 0.13319535636121418, + "grad_norm": 1.8296018838882446, + "learning_rate": 4.784323176972956e-05, + "loss": 4.596, + "step": 22396 + }, + { + "epoch": 0.1332013036444952, + "grad_norm": 1.6303856372833252, + "learning_rate": 4.784304197261117e-05, + "loss": 5.4028, + "step": 22397 + }, + { + "epoch": 0.1332072509277762, + "grad_norm": 1.4000413417816162, + "learning_rate": 4.78428521675185e-05, + "loss": 5.8166, + "step": 22398 + }, + { + "epoch": 0.13321319821105718, + "grad_norm": 1.4396088123321533, + "learning_rate": 4.7842662354451634e-05, + "loss": 5.4439, + "step": 22399 + }, + { + "epoch": 0.1332191454943382, + "grad_norm": 1.580919623374939, + "learning_rate": 4.7842472533410635e-05, + "loss": 5.3089, + "step": 22400 + }, + { + "epoch": 0.13322509277761918, + "grad_norm": 1.7976210117340088, + "learning_rate": 4.7842282704395545e-05, + "loss": 5.1538, + "step": 22401 + }, + { + "epoch": 0.13323104006090017, + "grad_norm": 1.7573418617248535, + "learning_rate": 4.784209286740647e-05, + "loss": 5.3701, + "step": 22402 + }, + { + "epoch": 0.13323698734418118, + "grad_norm": 1.6944206953048706, + "learning_rate": 4.784190302244345e-05, + "loss": 4.8349, + "step": 22403 + }, + { + "epoch": 0.13324293462746217, + "grad_norm": 1.9255948066711426, + "learning_rate": 4.7841713169506555e-05, + "loss": 5.2077, + "step": 22404 + }, + { + "epoch": 0.13324888191074316, + "grad_norm": 1.7583602666854858, + "learning_rate": 4.784152330859586e-05, + "loss": 4.9968, + "step": 22405 + }, + { + "epoch": 0.13325482919402418, + "grad_norm": 1.6917812824249268, + "learning_rate": 4.784133343971142e-05, + "loss": 5.3295, + "step": 22406 + }, + { + "epoch": 0.13326077647730517, + "grad_norm": 1.5531493425369263, + "learning_rate": 4.784114356285331e-05, + "loss": 5.2978, + "step": 22407 + }, + { + "epoch": 0.13326672376058615, + "grad_norm": 1.5347543954849243, + "learning_rate": 4.7840953678021586e-05, + "loss": 5.2922, + "step": 22408 + }, + { + "epoch": 0.13327267104386717, + "grad_norm": 1.3059866428375244, + "learning_rate": 4.7840763785216323e-05, + "loss": 5.2255, + "step": 22409 + }, + { + "epoch": 0.13327861832714816, + "grad_norm": 1.2207573652267456, + "learning_rate": 4.784057388443759e-05, + "loss": 4.9595, + "step": 22410 + }, + { + "epoch": 0.13328456561042915, + "grad_norm": 1.9115726947784424, + "learning_rate": 4.784038397568545e-05, + "loss": 5.0465, + "step": 22411 + }, + { + "epoch": 0.13329051289371016, + "grad_norm": 1.907443642616272, + "learning_rate": 4.7840194058959965e-05, + "loss": 4.5429, + "step": 22412 + }, + { + "epoch": 0.13329646017699115, + "grad_norm": 1.7891590595245361, + "learning_rate": 4.78400041342612e-05, + "loss": 4.5718, + "step": 22413 + }, + { + "epoch": 0.13330240746027214, + "grad_norm": 1.7904539108276367, + "learning_rate": 4.7839814201589234e-05, + "loss": 4.7077, + "step": 22414 + }, + { + "epoch": 0.13330835474355315, + "grad_norm": 1.8562805652618408, + "learning_rate": 4.783962426094411e-05, + "loss": 4.8559, + "step": 22415 + }, + { + "epoch": 0.13331430202683414, + "grad_norm": 1.7840648889541626, + "learning_rate": 4.7839434312325924e-05, + "loss": 4.5559, + "step": 22416 + }, + { + "epoch": 0.13332024931011513, + "grad_norm": 1.8956695795059204, + "learning_rate": 4.783924435573472e-05, + "loss": 4.6933, + "step": 22417 + }, + { + "epoch": 0.13332619659339615, + "grad_norm": 1.798685073852539, + "learning_rate": 4.783905439117058e-05, + "loss": 4.5131, + "step": 22418 + }, + { + "epoch": 0.13333214387667713, + "grad_norm": 1.8377288579940796, + "learning_rate": 4.7838864418633554e-05, + "loss": 4.4986, + "step": 22419 + }, + { + "epoch": 0.13333809115995812, + "grad_norm": 1.8382439613342285, + "learning_rate": 4.783867443812372e-05, + "loss": 5.1565, + "step": 22420 + }, + { + "epoch": 0.13334403844323914, + "grad_norm": 2.030796766281128, + "learning_rate": 4.783848444964114e-05, + "loss": 5.4532, + "step": 22421 + }, + { + "epoch": 0.13334998572652013, + "grad_norm": 2.020561695098877, + "learning_rate": 4.7838294453185886e-05, + "loss": 5.4529, + "step": 22422 + }, + { + "epoch": 0.13335593300980111, + "grad_norm": 1.8092904090881348, + "learning_rate": 4.783810444875801e-05, + "loss": 5.4092, + "step": 22423 + }, + { + "epoch": 0.13336188029308213, + "grad_norm": 1.7571618556976318, + "learning_rate": 4.78379144363576e-05, + "loss": 5.5134, + "step": 22424 + }, + { + "epoch": 0.13336782757636312, + "grad_norm": 1.8572049140930176, + "learning_rate": 4.7837724415984694e-05, + "loss": 5.1786, + "step": 22425 + }, + { + "epoch": 0.1333737748596441, + "grad_norm": 2.3944039344787598, + "learning_rate": 4.783753438763938e-05, + "loss": 4.7667, + "step": 22426 + }, + { + "epoch": 0.13337972214292512, + "grad_norm": 1.9377988576889038, + "learning_rate": 4.7837344351321725e-05, + "loss": 5.6523, + "step": 22427 + }, + { + "epoch": 0.1333856694262061, + "grad_norm": 1.7981183528900146, + "learning_rate": 4.783715430703178e-05, + "loss": 5.5374, + "step": 22428 + }, + { + "epoch": 0.1333916167094871, + "grad_norm": 1.6658248901367188, + "learning_rate": 4.783696425476963e-05, + "loss": 5.5128, + "step": 22429 + }, + { + "epoch": 0.13339756399276811, + "grad_norm": 1.6594502925872803, + "learning_rate": 4.783677419453533e-05, + "loss": 5.5225, + "step": 22430 + }, + { + "epoch": 0.1334035112760491, + "grad_norm": 1.6250741481781006, + "learning_rate": 4.7836584126328945e-05, + "loss": 5.4027, + "step": 22431 + }, + { + "epoch": 0.1334094585593301, + "grad_norm": 1.633254885673523, + "learning_rate": 4.783639405015054e-05, + "loss": 5.3856, + "step": 22432 + }, + { + "epoch": 0.1334154058426111, + "grad_norm": 1.5948752164840698, + "learning_rate": 4.783620396600019e-05, + "loss": 5.5501, + "step": 22433 + }, + { + "epoch": 0.1334213531258921, + "grad_norm": 2.007847547531128, + "learning_rate": 4.783601387387796e-05, + "loss": 4.878, + "step": 22434 + }, + { + "epoch": 0.13342730040917308, + "grad_norm": 2.4036359786987305, + "learning_rate": 4.783582377378391e-05, + "loss": 3.8348, + "step": 22435 + }, + { + "epoch": 0.1334332476924541, + "grad_norm": 2.7686264514923096, + "learning_rate": 4.783563366571811e-05, + "loss": 3.13, + "step": 22436 + }, + { + "epoch": 0.1334391949757351, + "grad_norm": 2.4651095867156982, + "learning_rate": 4.7835443549680625e-05, + "loss": 2.9104, + "step": 22437 + }, + { + "epoch": 0.13344514225901608, + "grad_norm": 2.57837176322937, + "learning_rate": 4.7835253425671526e-05, + "loss": 3.1145, + "step": 22438 + }, + { + "epoch": 0.1334510895422971, + "grad_norm": 2.804194688796997, + "learning_rate": 4.783506329369087e-05, + "loss": 3.7685, + "step": 22439 + }, + { + "epoch": 0.13345703682557808, + "grad_norm": 2.5836985111236572, + "learning_rate": 4.783487315373874e-05, + "loss": 3.383, + "step": 22440 + }, + { + "epoch": 0.13346298410885907, + "grad_norm": 2.5800416469573975, + "learning_rate": 4.7834683005815184e-05, + "loss": 3.345, + "step": 22441 + }, + { + "epoch": 0.13346893139214008, + "grad_norm": 2.695234775543213, + "learning_rate": 4.7834492849920275e-05, + "loss": 3.7905, + "step": 22442 + }, + { + "epoch": 0.13347487867542107, + "grad_norm": 2.075918436050415, + "learning_rate": 4.783430268605409e-05, + "loss": 4.3114, + "step": 22443 + }, + { + "epoch": 0.13348082595870206, + "grad_norm": 2.221691131591797, + "learning_rate": 4.7834112514216676e-05, + "loss": 5.5658, + "step": 22444 + }, + { + "epoch": 0.13348677324198308, + "grad_norm": 1.9432377815246582, + "learning_rate": 4.783392233440811e-05, + "loss": 5.2566, + "step": 22445 + }, + { + "epoch": 0.13349272052526406, + "grad_norm": 1.9735411405563354, + "learning_rate": 4.783373214662846e-05, + "loss": 4.2656, + "step": 22446 + }, + { + "epoch": 0.13349866780854505, + "grad_norm": 1.8616423606872559, + "learning_rate": 4.783354195087779e-05, + "loss": 4.2018, + "step": 22447 + }, + { + "epoch": 0.13350461509182607, + "grad_norm": 1.9751770496368408, + "learning_rate": 4.783335174715617e-05, + "loss": 4.1716, + "step": 22448 + }, + { + "epoch": 0.13351056237510706, + "grad_norm": 2.053149461746216, + "learning_rate": 4.7833161535463656e-05, + "loss": 4.0603, + "step": 22449 + }, + { + "epoch": 0.13351650965838804, + "grad_norm": 1.8129456043243408, + "learning_rate": 4.7832971315800325e-05, + "loss": 4.098, + "step": 22450 + }, + { + "epoch": 0.13352245694166903, + "grad_norm": 1.8842658996582031, + "learning_rate": 4.783278108816624e-05, + "loss": 4.1225, + "step": 22451 + }, + { + "epoch": 0.13352840422495005, + "grad_norm": 1.9037132263183594, + "learning_rate": 4.783259085256146e-05, + "loss": 4.0953, + "step": 22452 + }, + { + "epoch": 0.13353435150823104, + "grad_norm": 1.8058161735534668, + "learning_rate": 4.7832400608986074e-05, + "loss": 3.9189, + "step": 22453 + }, + { + "epoch": 0.13354029879151202, + "grad_norm": 1.899573564529419, + "learning_rate": 4.7832210357440124e-05, + "loss": 4.063, + "step": 22454 + }, + { + "epoch": 0.13354624607479304, + "grad_norm": 1.8507969379425049, + "learning_rate": 4.783202009792368e-05, + "loss": 4.1139, + "step": 22455 + }, + { + "epoch": 0.13355219335807403, + "grad_norm": 1.861315369606018, + "learning_rate": 4.783182983043681e-05, + "loss": 4.1063, + "step": 22456 + }, + { + "epoch": 0.13355814064135502, + "grad_norm": 1.9481399059295654, + "learning_rate": 4.7831639554979603e-05, + "loss": 4.1103, + "step": 22457 + }, + { + "epoch": 0.13356408792463603, + "grad_norm": 1.9315237998962402, + "learning_rate": 4.7831449271552086e-05, + "loss": 4.0723, + "step": 22458 + }, + { + "epoch": 0.13357003520791702, + "grad_norm": 1.951989769935608, + "learning_rate": 4.783125898015436e-05, + "loss": 4.3063, + "step": 22459 + }, + { + "epoch": 0.133575982491198, + "grad_norm": 1.8107032775878906, + "learning_rate": 4.783106868078647e-05, + "loss": 4.1869, + "step": 22460 + }, + { + "epoch": 0.13358192977447902, + "grad_norm": 1.8079946041107178, + "learning_rate": 4.7830878373448495e-05, + "loss": 4.2569, + "step": 22461 + }, + { + "epoch": 0.13358787705776, + "grad_norm": 1.9094295501708984, + "learning_rate": 4.7830688058140494e-05, + "loss": 4.8144, + "step": 22462 + }, + { + "epoch": 0.133593824341041, + "grad_norm": 1.9410862922668457, + "learning_rate": 4.7830497734862536e-05, + "loss": 4.6606, + "step": 22463 + }, + { + "epoch": 0.13359977162432202, + "grad_norm": 1.832387089729309, + "learning_rate": 4.783030740361469e-05, + "loss": 4.774, + "step": 22464 + }, + { + "epoch": 0.133605718907603, + "grad_norm": 1.8661162853240967, + "learning_rate": 4.783011706439701e-05, + "loss": 5.0414, + "step": 22465 + }, + { + "epoch": 0.133611666190884, + "grad_norm": 1.6019399166107178, + "learning_rate": 4.782992671720958e-05, + "loss": 5.1333, + "step": 22466 + }, + { + "epoch": 0.133617613474165, + "grad_norm": 1.539556860923767, + "learning_rate": 4.7829736362052455e-05, + "loss": 5.5576, + "step": 22467 + }, + { + "epoch": 0.133623560757446, + "grad_norm": 1.6988813877105713, + "learning_rate": 4.7829545998925704e-05, + "loss": 5.5953, + "step": 22468 + }, + { + "epoch": 0.13362950804072699, + "grad_norm": 1.77605140209198, + "learning_rate": 4.78293556278294e-05, + "loss": 5.1917, + "step": 22469 + }, + { + "epoch": 0.133635455324008, + "grad_norm": 1.958486557006836, + "learning_rate": 4.78291652487636e-05, + "loss": 5.141, + "step": 22470 + }, + { + "epoch": 0.133641402607289, + "grad_norm": 1.4875729084014893, + "learning_rate": 4.7828974861728374e-05, + "loss": 5.551, + "step": 22471 + }, + { + "epoch": 0.13364734989056998, + "grad_norm": 1.5118046998977661, + "learning_rate": 4.7828784466723795e-05, + "loss": 5.8965, + "step": 22472 + }, + { + "epoch": 0.133653297173851, + "grad_norm": 1.7107024192810059, + "learning_rate": 4.7828594063749924e-05, + "loss": 5.444, + "step": 22473 + }, + { + "epoch": 0.13365924445713198, + "grad_norm": 2.211569309234619, + "learning_rate": 4.7828403652806814e-05, + "loss": 4.6709, + "step": 22474 + }, + { + "epoch": 0.13366519174041297, + "grad_norm": 1.5755807161331177, + "learning_rate": 4.782821323389455e-05, + "loss": 5.481, + "step": 22475 + }, + { + "epoch": 0.13367113902369399, + "grad_norm": 1.5715577602386475, + "learning_rate": 4.782802280701319e-05, + "loss": 5.4475, + "step": 22476 + }, + { + "epoch": 0.13367708630697497, + "grad_norm": 1.483229160308838, + "learning_rate": 4.782783237216281e-05, + "loss": 5.287, + "step": 22477 + }, + { + "epoch": 0.13368303359025596, + "grad_norm": 1.6031765937805176, + "learning_rate": 4.782764192934347e-05, + "loss": 4.9328, + "step": 22478 + }, + { + "epoch": 0.13368898087353698, + "grad_norm": 1.5472909212112427, + "learning_rate": 4.782745147855523e-05, + "loss": 5.4962, + "step": 22479 + }, + { + "epoch": 0.13369492815681797, + "grad_norm": 1.5153834819793701, + "learning_rate": 4.7827261019798164e-05, + "loss": 5.2488, + "step": 22480 + }, + { + "epoch": 0.13370087544009895, + "grad_norm": 1.8485814332962036, + "learning_rate": 4.782707055307233e-05, + "loss": 4.6998, + "step": 22481 + }, + { + "epoch": 0.13370682272337997, + "grad_norm": 1.6526838541030884, + "learning_rate": 4.782688007837781e-05, + "loss": 4.7843, + "step": 22482 + }, + { + "epoch": 0.13371277000666096, + "grad_norm": 1.6769697666168213, + "learning_rate": 4.782668959571467e-05, + "loss": 4.8344, + "step": 22483 + }, + { + "epoch": 0.13371871728994195, + "grad_norm": 1.6509302854537964, + "learning_rate": 4.782649910508296e-05, + "loss": 5.0646, + "step": 22484 + }, + { + "epoch": 0.13372466457322296, + "grad_norm": 1.58712637424469, + "learning_rate": 4.782630860648275e-05, + "loss": 4.841, + "step": 22485 + }, + { + "epoch": 0.13373061185650395, + "grad_norm": 1.7171813249588013, + "learning_rate": 4.782611809991412e-05, + "loss": 5.5934, + "step": 22486 + }, + { + "epoch": 0.13373655913978494, + "grad_norm": 1.598689079284668, + "learning_rate": 4.782592758537712e-05, + "loss": 5.5131, + "step": 22487 + }, + { + "epoch": 0.13374250642306595, + "grad_norm": 1.652279019355774, + "learning_rate": 4.782573706287183e-05, + "loss": 4.9244, + "step": 22488 + }, + { + "epoch": 0.13374845370634694, + "grad_norm": 1.733337163925171, + "learning_rate": 4.782554653239831e-05, + "loss": 5.1153, + "step": 22489 + }, + { + "epoch": 0.13375440098962793, + "grad_norm": 1.3961280584335327, + "learning_rate": 4.782535599395662e-05, + "loss": 5.1146, + "step": 22490 + }, + { + "epoch": 0.13376034827290895, + "grad_norm": 1.371650218963623, + "learning_rate": 4.782516544754685e-05, + "loss": 4.9608, + "step": 22491 + }, + { + "epoch": 0.13376629555618993, + "grad_norm": 1.738678216934204, + "learning_rate": 4.782497489316904e-05, + "loss": 5.384, + "step": 22492 + }, + { + "epoch": 0.13377224283947092, + "grad_norm": 1.899530291557312, + "learning_rate": 4.7824784330823266e-05, + "loss": 5.479, + "step": 22493 + }, + { + "epoch": 0.13377819012275194, + "grad_norm": 1.6108837127685547, + "learning_rate": 4.782459376050959e-05, + "loss": 5.4919, + "step": 22494 + }, + { + "epoch": 0.13378413740603293, + "grad_norm": 1.688045859336853, + "learning_rate": 4.78244031822281e-05, + "loss": 5.5093, + "step": 22495 + }, + { + "epoch": 0.13379008468931392, + "grad_norm": 1.526538610458374, + "learning_rate": 4.782421259597884e-05, + "loss": 5.4022, + "step": 22496 + }, + { + "epoch": 0.13379603197259493, + "grad_norm": 1.5651198625564575, + "learning_rate": 4.7824022001761884e-05, + "loss": 5.3737, + "step": 22497 + }, + { + "epoch": 0.13380197925587592, + "grad_norm": 1.6090896129608154, + "learning_rate": 4.7823831399577296e-05, + "loss": 5.3482, + "step": 22498 + }, + { + "epoch": 0.1338079265391569, + "grad_norm": 1.5139176845550537, + "learning_rate": 4.782364078942514e-05, + "loss": 5.2195, + "step": 22499 + }, + { + "epoch": 0.13381387382243792, + "grad_norm": 1.468328833580017, + "learning_rate": 4.782345017130549e-05, + "loss": 5.4421, + "step": 22500 + }, + { + "epoch": 0.1338198211057189, + "grad_norm": 1.4803540706634521, + "learning_rate": 4.782325954521841e-05, + "loss": 5.8645, + "step": 22501 + }, + { + "epoch": 0.1338257683889999, + "grad_norm": 1.5472211837768555, + "learning_rate": 4.782306891116397e-05, + "loss": 5.5739, + "step": 22502 + }, + { + "epoch": 0.13383171567228092, + "grad_norm": 1.5523242950439453, + "learning_rate": 4.782287826914223e-05, + "loss": 5.4971, + "step": 22503 + }, + { + "epoch": 0.1338376629555619, + "grad_norm": 1.6459407806396484, + "learning_rate": 4.7822687619153264e-05, + "loss": 5.5006, + "step": 22504 + }, + { + "epoch": 0.1338436102388429, + "grad_norm": 1.9664801359176636, + "learning_rate": 4.782249696119712e-05, + "loss": 4.908, + "step": 22505 + }, + { + "epoch": 0.1338495575221239, + "grad_norm": 1.757797360420227, + "learning_rate": 4.782230629527389e-05, + "loss": 5.3259, + "step": 22506 + }, + { + "epoch": 0.1338555048054049, + "grad_norm": 1.734212040901184, + "learning_rate": 4.7822115621383626e-05, + "loss": 4.9526, + "step": 22507 + }, + { + "epoch": 0.13386145208868588, + "grad_norm": 1.7347631454467773, + "learning_rate": 4.7821924939526386e-05, + "loss": 4.9416, + "step": 22508 + }, + { + "epoch": 0.13386739937196687, + "grad_norm": 1.6283304691314697, + "learning_rate": 4.782173424970226e-05, + "loss": 5.1706, + "step": 22509 + }, + { + "epoch": 0.1338733466552479, + "grad_norm": 1.6665587425231934, + "learning_rate": 4.7821543551911294e-05, + "loss": 5.6977, + "step": 22510 + }, + { + "epoch": 0.13387929393852888, + "grad_norm": 1.5051319599151611, + "learning_rate": 4.7821352846153576e-05, + "loss": 5.7575, + "step": 22511 + }, + { + "epoch": 0.13388524122180986, + "grad_norm": 1.966944932937622, + "learning_rate": 4.7821162132429154e-05, + "loss": 4.8996, + "step": 22512 + }, + { + "epoch": 0.13389118850509088, + "grad_norm": 2.669949769973755, + "learning_rate": 4.782097141073809e-05, + "loss": 3.7917, + "step": 22513 + }, + { + "epoch": 0.13389713578837187, + "grad_norm": 2.743389844894409, + "learning_rate": 4.782078068108048e-05, + "loss": 3.658, + "step": 22514 + }, + { + "epoch": 0.13390308307165286, + "grad_norm": 2.8011279106140137, + "learning_rate": 4.782058994345635e-05, + "loss": 3.4269, + "step": 22515 + }, + { + "epoch": 0.13390903035493387, + "grad_norm": 2.332318067550659, + "learning_rate": 4.78203991978658e-05, + "loss": 3.7318, + "step": 22516 + }, + { + "epoch": 0.13391497763821486, + "grad_norm": 2.1522371768951416, + "learning_rate": 4.782020844430888e-05, + "loss": 3.912, + "step": 22517 + }, + { + "epoch": 0.13392092492149585, + "grad_norm": 1.7325389385223389, + "learning_rate": 4.782001768278567e-05, + "loss": 5.2602, + "step": 22518 + }, + { + "epoch": 0.13392687220477686, + "grad_norm": 1.872207522392273, + "learning_rate": 4.7819826913296216e-05, + "loss": 5.3663, + "step": 22519 + }, + { + "epoch": 0.13393281948805785, + "grad_norm": 1.86244535446167, + "learning_rate": 4.78196361358406e-05, + "loss": 5.382, + "step": 22520 + }, + { + "epoch": 0.13393876677133884, + "grad_norm": 1.6984341144561768, + "learning_rate": 4.781944535041889e-05, + "loss": 5.2243, + "step": 22521 + }, + { + "epoch": 0.13394471405461986, + "grad_norm": 1.7697153091430664, + "learning_rate": 4.781925455703114e-05, + "loss": 5.2368, + "step": 22522 + }, + { + "epoch": 0.13395066133790084, + "grad_norm": 2.323636293411255, + "learning_rate": 4.781906375567743e-05, + "loss": 4.7709, + "step": 22523 + }, + { + "epoch": 0.13395660862118183, + "grad_norm": 2.2196481227874756, + "learning_rate": 4.781887294635782e-05, + "loss": 4.8089, + "step": 22524 + }, + { + "epoch": 0.13396255590446285, + "grad_norm": 1.8148611783981323, + "learning_rate": 4.7818682129072365e-05, + "loss": 4.673, + "step": 22525 + }, + { + "epoch": 0.13396850318774384, + "grad_norm": 1.9306626319885254, + "learning_rate": 4.7818491303821155e-05, + "loss": 5.3217, + "step": 22526 + }, + { + "epoch": 0.13397445047102483, + "grad_norm": 1.9646215438842773, + "learning_rate": 4.781830047060425e-05, + "loss": 4.9239, + "step": 22527 + }, + { + "epoch": 0.13398039775430584, + "grad_norm": 1.7711313962936401, + "learning_rate": 4.7818109629421706e-05, + "loss": 5.1977, + "step": 22528 + }, + { + "epoch": 0.13398634503758683, + "grad_norm": 1.5714713335037231, + "learning_rate": 4.781791878027359e-05, + "loss": 5.1759, + "step": 22529 + }, + { + "epoch": 0.13399229232086782, + "grad_norm": 1.573440670967102, + "learning_rate": 4.781772792315998e-05, + "loss": 5.2892, + "step": 22530 + }, + { + "epoch": 0.13399823960414883, + "grad_norm": 1.484643816947937, + "learning_rate": 4.781753705808094e-05, + "loss": 5.2751, + "step": 22531 + }, + { + "epoch": 0.13400418688742982, + "grad_norm": 1.484236240386963, + "learning_rate": 4.781734618503653e-05, + "loss": 5.1928, + "step": 22532 + }, + { + "epoch": 0.1340101341707108, + "grad_norm": 1.6469415426254272, + "learning_rate": 4.781715530402682e-05, + "loss": 4.9161, + "step": 22533 + }, + { + "epoch": 0.13401608145399183, + "grad_norm": 1.736928939819336, + "learning_rate": 4.781696441505188e-05, + "loss": 5.2132, + "step": 22534 + }, + { + "epoch": 0.1340220287372728, + "grad_norm": 1.6927560567855835, + "learning_rate": 4.781677351811177e-05, + "loss": 5.1001, + "step": 22535 + }, + { + "epoch": 0.1340279760205538, + "grad_norm": 1.4961135387420654, + "learning_rate": 4.7816582613206564e-05, + "loss": 4.8025, + "step": 22536 + }, + { + "epoch": 0.13403392330383482, + "grad_norm": 1.6069209575653076, + "learning_rate": 4.7816391700336315e-05, + "loss": 5.1449, + "step": 22537 + }, + { + "epoch": 0.1340398705871158, + "grad_norm": 1.9168766736984253, + "learning_rate": 4.781620077950111e-05, + "loss": 5.1479, + "step": 22538 + }, + { + "epoch": 0.1340458178703968, + "grad_norm": 1.545693278312683, + "learning_rate": 4.7816009850701e-05, + "loss": 5.1445, + "step": 22539 + }, + { + "epoch": 0.1340517651536778, + "grad_norm": 2.524106740951538, + "learning_rate": 4.781581891393606e-05, + "loss": 4.3988, + "step": 22540 + }, + { + "epoch": 0.1340577124369588, + "grad_norm": 3.073733329772949, + "learning_rate": 4.781562796920635e-05, + "loss": 4.8931, + "step": 22541 + }, + { + "epoch": 0.1340636597202398, + "grad_norm": 2.1566405296325684, + "learning_rate": 4.7815437016511936e-05, + "loss": 4.9778, + "step": 22542 + }, + { + "epoch": 0.1340696070035208, + "grad_norm": 1.6103532314300537, + "learning_rate": 4.78152460558529e-05, + "loss": 5.0521, + "step": 22543 + }, + { + "epoch": 0.1340755542868018, + "grad_norm": 2.068673849105835, + "learning_rate": 4.781505508722929e-05, + "loss": 4.4481, + "step": 22544 + }, + { + "epoch": 0.13408150157008278, + "grad_norm": 2.2658448219299316, + "learning_rate": 4.7814864110641175e-05, + "loss": 4.5904, + "step": 22545 + }, + { + "epoch": 0.1340874488533638, + "grad_norm": 1.6960278749465942, + "learning_rate": 4.781467312608864e-05, + "loss": 5.4661, + "step": 22546 + }, + { + "epoch": 0.13409339613664478, + "grad_norm": 1.7006616592407227, + "learning_rate": 4.781448213357173e-05, + "loss": 5.338, + "step": 22547 + }, + { + "epoch": 0.13409934341992577, + "grad_norm": 1.6810702085494995, + "learning_rate": 4.7814291133090515e-05, + "loss": 5.6328, + "step": 22548 + }, + { + "epoch": 0.1341052907032068, + "grad_norm": 1.788943886756897, + "learning_rate": 4.781410012464508e-05, + "loss": 4.7265, + "step": 22549 + }, + { + "epoch": 0.13411123798648777, + "grad_norm": 1.8539581298828125, + "learning_rate": 4.781390910823547e-05, + "loss": 5.0821, + "step": 22550 + }, + { + "epoch": 0.13411718526976876, + "grad_norm": 1.548677682876587, + "learning_rate": 4.781371808386176e-05, + "loss": 5.4704, + "step": 22551 + }, + { + "epoch": 0.13412313255304978, + "grad_norm": 1.4806692600250244, + "learning_rate": 4.781352705152402e-05, + "loss": 5.5085, + "step": 22552 + }, + { + "epoch": 0.13412907983633077, + "grad_norm": 1.5281784534454346, + "learning_rate": 4.781333601122231e-05, + "loss": 5.0698, + "step": 22553 + }, + { + "epoch": 0.13413502711961175, + "grad_norm": 1.681803822517395, + "learning_rate": 4.78131449629567e-05, + "loss": 4.6259, + "step": 22554 + }, + { + "epoch": 0.13414097440289277, + "grad_norm": 1.9039119482040405, + "learning_rate": 4.781295390672726e-05, + "loss": 4.967, + "step": 22555 + }, + { + "epoch": 0.13414692168617376, + "grad_norm": 1.3885890245437622, + "learning_rate": 4.781276284253405e-05, + "loss": 4.5992, + "step": 22556 + }, + { + "epoch": 0.13415286896945475, + "grad_norm": 1.5828464031219482, + "learning_rate": 4.781257177037714e-05, + "loss": 4.6859, + "step": 22557 + }, + { + "epoch": 0.13415881625273576, + "grad_norm": 1.6242060661315918, + "learning_rate": 4.78123806902566e-05, + "loss": 4.7105, + "step": 22558 + }, + { + "epoch": 0.13416476353601675, + "grad_norm": 1.6682454347610474, + "learning_rate": 4.781218960217249e-05, + "loss": 4.8545, + "step": 22559 + }, + { + "epoch": 0.13417071081929774, + "grad_norm": 1.8982216119766235, + "learning_rate": 4.781199850612489e-05, + "loss": 5.4946, + "step": 22560 + }, + { + "epoch": 0.13417665810257876, + "grad_norm": 1.916904330253601, + "learning_rate": 4.781180740211384e-05, + "loss": 5.7877, + "step": 22561 + }, + { + "epoch": 0.13418260538585974, + "grad_norm": 2.1762099266052246, + "learning_rate": 4.781161629013944e-05, + "loss": 5.7918, + "step": 22562 + }, + { + "epoch": 0.13418855266914073, + "grad_norm": 1.7190003395080566, + "learning_rate": 4.7811425170201726e-05, + "loss": 5.5881, + "step": 22563 + }, + { + "epoch": 0.13419449995242175, + "grad_norm": 1.5587143898010254, + "learning_rate": 4.781123404230079e-05, + "loss": 5.5391, + "step": 22564 + }, + { + "epoch": 0.13420044723570274, + "grad_norm": 1.8347082138061523, + "learning_rate": 4.7811042906436684e-05, + "loss": 5.7366, + "step": 22565 + }, + { + "epoch": 0.13420639451898372, + "grad_norm": 1.5644575357437134, + "learning_rate": 4.7810851762609484e-05, + "loss": 5.6529, + "step": 22566 + }, + { + "epoch": 0.1342123418022647, + "grad_norm": 1.6571894884109497, + "learning_rate": 4.7810660610819246e-05, + "loss": 5.1555, + "step": 22567 + }, + { + "epoch": 0.13421828908554573, + "grad_norm": 1.8291380405426025, + "learning_rate": 4.7810469451066045e-05, + "loss": 5.878, + "step": 22568 + }, + { + "epoch": 0.13422423636882672, + "grad_norm": 1.8254185914993286, + "learning_rate": 4.781027828334994e-05, + "loss": 5.0244, + "step": 22569 + }, + { + "epoch": 0.1342301836521077, + "grad_norm": 1.5728260278701782, + "learning_rate": 4.7810087107671e-05, + "loss": 5.8684, + "step": 22570 + }, + { + "epoch": 0.13423613093538872, + "grad_norm": 1.4518792629241943, + "learning_rate": 4.7809895924029303e-05, + "loss": 6.0868, + "step": 22571 + }, + { + "epoch": 0.1342420782186697, + "grad_norm": 1.5205591917037964, + "learning_rate": 4.7809704732424905e-05, + "loss": 5.3721, + "step": 22572 + }, + { + "epoch": 0.1342480255019507, + "grad_norm": 1.7081562280654907, + "learning_rate": 4.7809513532857876e-05, + "loss": 4.9758, + "step": 22573 + }, + { + "epoch": 0.1342539727852317, + "grad_norm": 1.4048930406570435, + "learning_rate": 4.7809322325328275e-05, + "loss": 5.4701, + "step": 22574 + }, + { + "epoch": 0.1342599200685127, + "grad_norm": 1.5663319826126099, + "learning_rate": 4.780913110983618e-05, + "loss": 5.1094, + "step": 22575 + }, + { + "epoch": 0.1342658673517937, + "grad_norm": 1.6008634567260742, + "learning_rate": 4.780893988638165e-05, + "loss": 5.2138, + "step": 22576 + }, + { + "epoch": 0.1342718146350747, + "grad_norm": 1.5711628198623657, + "learning_rate": 4.780874865496475e-05, + "loss": 5.7172, + "step": 22577 + }, + { + "epoch": 0.1342777619183557, + "grad_norm": 1.799984335899353, + "learning_rate": 4.7808557415585566e-05, + "loss": 4.8959, + "step": 22578 + }, + { + "epoch": 0.13428370920163668, + "grad_norm": 1.7693933248519897, + "learning_rate": 4.7808366168244137e-05, + "loss": 5.376, + "step": 22579 + }, + { + "epoch": 0.1342896564849177, + "grad_norm": 2.1041815280914307, + "learning_rate": 4.780817491294055e-05, + "loss": 5.4672, + "step": 22580 + }, + { + "epoch": 0.13429560376819868, + "grad_norm": 1.8219122886657715, + "learning_rate": 4.780798364967486e-05, + "loss": 5.6201, + "step": 22581 + }, + { + "epoch": 0.13430155105147967, + "grad_norm": 1.5907140970230103, + "learning_rate": 4.780779237844715e-05, + "loss": 5.2499, + "step": 22582 + }, + { + "epoch": 0.1343074983347607, + "grad_norm": 1.388074278831482, + "learning_rate": 4.780760109925746e-05, + "loss": 5.535, + "step": 22583 + }, + { + "epoch": 0.13431344561804168, + "grad_norm": 1.4996978044509888, + "learning_rate": 4.780740981210588e-05, + "loss": 5.2713, + "step": 22584 + }, + { + "epoch": 0.13431939290132267, + "grad_norm": 1.591178059577942, + "learning_rate": 4.780721851699247e-05, + "loss": 5.2211, + "step": 22585 + }, + { + "epoch": 0.13432534018460368, + "grad_norm": 1.5548349618911743, + "learning_rate": 4.780702721391729e-05, + "loss": 5.2867, + "step": 22586 + }, + { + "epoch": 0.13433128746788467, + "grad_norm": 1.5549981594085693, + "learning_rate": 4.780683590288042e-05, + "loss": 5.3627, + "step": 22587 + }, + { + "epoch": 0.13433723475116566, + "grad_norm": 1.4587602615356445, + "learning_rate": 4.780664458388191e-05, + "loss": 5.2031, + "step": 22588 + }, + { + "epoch": 0.13434318203444667, + "grad_norm": 1.836823582649231, + "learning_rate": 4.7806453256921846e-05, + "loss": 4.9802, + "step": 22589 + }, + { + "epoch": 0.13434912931772766, + "grad_norm": 1.5445985794067383, + "learning_rate": 4.780626192200027e-05, + "loss": 4.8789, + "step": 22590 + }, + { + "epoch": 0.13435507660100865, + "grad_norm": 1.5032085180282593, + "learning_rate": 4.780607057911728e-05, + "loss": 4.936, + "step": 22591 + }, + { + "epoch": 0.13436102388428967, + "grad_norm": 1.5628653764724731, + "learning_rate": 4.780587922827292e-05, + "loss": 4.9026, + "step": 22592 + }, + { + "epoch": 0.13436697116757065, + "grad_norm": 2.011505126953125, + "learning_rate": 4.7805687869467265e-05, + "loss": 4.5883, + "step": 22593 + }, + { + "epoch": 0.13437291845085164, + "grad_norm": 1.824877142906189, + "learning_rate": 4.780549650270038e-05, + "loss": 4.7637, + "step": 22594 + }, + { + "epoch": 0.13437886573413266, + "grad_norm": 1.3882604837417603, + "learning_rate": 4.780530512797232e-05, + "loss": 5.1455, + "step": 22595 + }, + { + "epoch": 0.13438481301741365, + "grad_norm": 1.6364738941192627, + "learning_rate": 4.780511374528318e-05, + "loss": 4.7607, + "step": 22596 + }, + { + "epoch": 0.13439076030069463, + "grad_norm": 1.6384764909744263, + "learning_rate": 4.7804922354633004e-05, + "loss": 4.8959, + "step": 22597 + }, + { + "epoch": 0.13439670758397565, + "grad_norm": 1.53514564037323, + "learning_rate": 4.780473095602186e-05, + "loss": 4.9072, + "step": 22598 + }, + { + "epoch": 0.13440265486725664, + "grad_norm": 1.5599232912063599, + "learning_rate": 4.780453954944983e-05, + "loss": 5.0727, + "step": 22599 + }, + { + "epoch": 0.13440860215053763, + "grad_norm": 1.6296029090881348, + "learning_rate": 4.780434813491696e-05, + "loss": 5.1448, + "step": 22600 + }, + { + "epoch": 0.13441454943381864, + "grad_norm": 1.8083057403564453, + "learning_rate": 4.780415671242334e-05, + "loss": 5.0841, + "step": 22601 + }, + { + "epoch": 0.13442049671709963, + "grad_norm": 1.668716311454773, + "learning_rate": 4.780396528196902e-05, + "loss": 5.0684, + "step": 22602 + }, + { + "epoch": 0.13442644400038062, + "grad_norm": 1.5879114866256714, + "learning_rate": 4.7803773843554065e-05, + "loss": 5.3685, + "step": 22603 + }, + { + "epoch": 0.13443239128366163, + "grad_norm": 1.6570247411727905, + "learning_rate": 4.780358239717855e-05, + "loss": 5.2864, + "step": 22604 + }, + { + "epoch": 0.13443833856694262, + "grad_norm": 1.5763763189315796, + "learning_rate": 4.780339094284254e-05, + "loss": 5.1896, + "step": 22605 + }, + { + "epoch": 0.1344442858502236, + "grad_norm": 1.6956191062927246, + "learning_rate": 4.7803199480546105e-05, + "loss": 5.1213, + "step": 22606 + }, + { + "epoch": 0.13445023313350463, + "grad_norm": 1.64959716796875, + "learning_rate": 4.780300801028931e-05, + "loss": 4.8764, + "step": 22607 + }, + { + "epoch": 0.13445618041678561, + "grad_norm": 1.7988736629486084, + "learning_rate": 4.7802816532072216e-05, + "loss": 4.7578, + "step": 22608 + }, + { + "epoch": 0.1344621277000666, + "grad_norm": 1.6349395513534546, + "learning_rate": 4.78026250458949e-05, + "loss": 5.5973, + "step": 22609 + }, + { + "epoch": 0.13446807498334762, + "grad_norm": 1.7561520338058472, + "learning_rate": 4.7802433551757416e-05, + "loss": 4.5933, + "step": 22610 + }, + { + "epoch": 0.1344740222666286, + "grad_norm": 1.7918694019317627, + "learning_rate": 4.780224204965984e-05, + "loss": 4.6726, + "step": 22611 + }, + { + "epoch": 0.1344799695499096, + "grad_norm": 1.6543810367584229, + "learning_rate": 4.780205053960224e-05, + "loss": 5.0966, + "step": 22612 + }, + { + "epoch": 0.1344859168331906, + "grad_norm": 1.4896337985992432, + "learning_rate": 4.7801859021584685e-05, + "loss": 4.9243, + "step": 22613 + }, + { + "epoch": 0.1344918641164716, + "grad_norm": 1.6509222984313965, + "learning_rate": 4.780166749560723e-05, + "loss": 5.0023, + "step": 22614 + }, + { + "epoch": 0.1344978113997526, + "grad_norm": 1.7909302711486816, + "learning_rate": 4.7801475961669944e-05, + "loss": 4.8274, + "step": 22615 + }, + { + "epoch": 0.1345037586830336, + "grad_norm": 1.7640331983566284, + "learning_rate": 4.780128441977291e-05, + "loss": 4.7262, + "step": 22616 + }, + { + "epoch": 0.1345097059663146, + "grad_norm": 1.6381694078445435, + "learning_rate": 4.780109286991617e-05, + "loss": 4.9027, + "step": 22617 + }, + { + "epoch": 0.13451565324959558, + "grad_norm": 1.830243468284607, + "learning_rate": 4.780090131209981e-05, + "loss": 4.837, + "step": 22618 + }, + { + "epoch": 0.1345216005328766, + "grad_norm": 1.6413569450378418, + "learning_rate": 4.780070974632389e-05, + "loss": 4.6675, + "step": 22619 + }, + { + "epoch": 0.13452754781615758, + "grad_norm": 1.7041996717453003, + "learning_rate": 4.780051817258848e-05, + "loss": 4.4556, + "step": 22620 + }, + { + "epoch": 0.13453349509943857, + "grad_norm": 1.6706191301345825, + "learning_rate": 4.780032659089364e-05, + "loss": 5.29, + "step": 22621 + }, + { + "epoch": 0.1345394423827196, + "grad_norm": 1.6883933544158936, + "learning_rate": 4.780013500123945e-05, + "loss": 5.2777, + "step": 22622 + }, + { + "epoch": 0.13454538966600058, + "grad_norm": 1.6006532907485962, + "learning_rate": 4.779994340362596e-05, + "loss": 5.1652, + "step": 22623 + }, + { + "epoch": 0.13455133694928156, + "grad_norm": 1.5645374059677124, + "learning_rate": 4.779975179805325e-05, + "loss": 5.0746, + "step": 22624 + }, + { + "epoch": 0.13455728423256255, + "grad_norm": 1.4294723272323608, + "learning_rate": 4.7799560184521384e-05, + "loss": 5.1747, + "step": 22625 + }, + { + "epoch": 0.13456323151584357, + "grad_norm": 1.5289671421051025, + "learning_rate": 4.7799368563030424e-05, + "loss": 5.0096, + "step": 22626 + }, + { + "epoch": 0.13456917879912456, + "grad_norm": 1.4476962089538574, + "learning_rate": 4.779917693358044e-05, + "loss": 5.1043, + "step": 22627 + }, + { + "epoch": 0.13457512608240554, + "grad_norm": 1.647494912147522, + "learning_rate": 4.7798985296171494e-05, + "loss": 5.2014, + "step": 22628 + }, + { + "epoch": 0.13458107336568656, + "grad_norm": 1.6972601413726807, + "learning_rate": 4.7798793650803665e-05, + "loss": 5.1526, + "step": 22629 + }, + { + "epoch": 0.13458702064896755, + "grad_norm": 1.7442299127578735, + "learning_rate": 4.779860199747701e-05, + "loss": 5.3699, + "step": 22630 + }, + { + "epoch": 0.13459296793224854, + "grad_norm": 1.5356593132019043, + "learning_rate": 4.77984103361916e-05, + "loss": 5.164, + "step": 22631 + }, + { + "epoch": 0.13459891521552955, + "grad_norm": 1.4700989723205566, + "learning_rate": 4.77982186669475e-05, + "loss": 4.7305, + "step": 22632 + }, + { + "epoch": 0.13460486249881054, + "grad_norm": 1.4296282529830933, + "learning_rate": 4.779802698974477e-05, + "loss": 4.7196, + "step": 22633 + }, + { + "epoch": 0.13461080978209153, + "grad_norm": 1.4722986221313477, + "learning_rate": 4.7797835304583494e-05, + "loss": 4.763, + "step": 22634 + }, + { + "epoch": 0.13461675706537254, + "grad_norm": 1.4767835140228271, + "learning_rate": 4.779764361146373e-05, + "loss": 4.6168, + "step": 22635 + }, + { + "epoch": 0.13462270434865353, + "grad_norm": 1.5353070497512817, + "learning_rate": 4.779745191038554e-05, + "loss": 4.8458, + "step": 22636 + }, + { + "epoch": 0.13462865163193452, + "grad_norm": 1.6942658424377441, + "learning_rate": 4.779726020134899e-05, + "loss": 4.8253, + "step": 22637 + }, + { + "epoch": 0.13463459891521554, + "grad_norm": 1.3153749704360962, + "learning_rate": 4.779706848435416e-05, + "loss": 4.6095, + "step": 22638 + }, + { + "epoch": 0.13464054619849652, + "grad_norm": 1.5381252765655518, + "learning_rate": 4.779687675940111e-05, + "loss": 4.202, + "step": 22639 + }, + { + "epoch": 0.1346464934817775, + "grad_norm": 1.5490522384643555, + "learning_rate": 4.779668502648989e-05, + "loss": 4.9204, + "step": 22640 + }, + { + "epoch": 0.13465244076505853, + "grad_norm": 1.518019676208496, + "learning_rate": 4.7796493285620604e-05, + "loss": 5.3894, + "step": 22641 + }, + { + "epoch": 0.13465838804833952, + "grad_norm": 1.635918378829956, + "learning_rate": 4.7796301536793284e-05, + "loss": 4.3345, + "step": 22642 + }, + { + "epoch": 0.1346643353316205, + "grad_norm": 1.7409108877182007, + "learning_rate": 4.779610978000802e-05, + "loss": 4.2783, + "step": 22643 + }, + { + "epoch": 0.13467028261490152, + "grad_norm": 1.7899144887924194, + "learning_rate": 4.7795918015264865e-05, + "loss": 4.8578, + "step": 22644 + }, + { + "epoch": 0.1346762298981825, + "grad_norm": 1.6725822687149048, + "learning_rate": 4.779572624256389e-05, + "loss": 4.7902, + "step": 22645 + }, + { + "epoch": 0.1346821771814635, + "grad_norm": 1.8630287647247314, + "learning_rate": 4.7795534461905165e-05, + "loss": 4.5775, + "step": 22646 + }, + { + "epoch": 0.1346881244647445, + "grad_norm": 1.6607400178909302, + "learning_rate": 4.779534267328875e-05, + "loss": 4.7948, + "step": 22647 + }, + { + "epoch": 0.1346940717480255, + "grad_norm": 1.5015220642089844, + "learning_rate": 4.7795150876714726e-05, + "loss": 4.3331, + "step": 22648 + }, + { + "epoch": 0.1347000190313065, + "grad_norm": 1.5176305770874023, + "learning_rate": 4.779495907218314e-05, + "loss": 4.7168, + "step": 22649 + }, + { + "epoch": 0.1347059663145875, + "grad_norm": 1.8669017553329468, + "learning_rate": 4.7794767259694076e-05, + "loss": 4.6268, + "step": 22650 + }, + { + "epoch": 0.1347119135978685, + "grad_norm": 1.795281171798706, + "learning_rate": 4.7794575439247586e-05, + "loss": 4.6233, + "step": 22651 + }, + { + "epoch": 0.13471786088114948, + "grad_norm": 1.9019118547439575, + "learning_rate": 4.779438361084375e-05, + "loss": 4.9087, + "step": 22652 + }, + { + "epoch": 0.1347238081644305, + "grad_norm": 1.8863301277160645, + "learning_rate": 4.779419177448263e-05, + "loss": 4.6571, + "step": 22653 + }, + { + "epoch": 0.13472975544771149, + "grad_norm": 1.7758681774139404, + "learning_rate": 4.779399993016429e-05, + "loss": 4.7445, + "step": 22654 + }, + { + "epoch": 0.13473570273099247, + "grad_norm": 1.8668162822723389, + "learning_rate": 4.7793808077888804e-05, + "loss": 4.8334, + "step": 22655 + }, + { + "epoch": 0.1347416500142735, + "grad_norm": 1.8495571613311768, + "learning_rate": 4.7793616217656235e-05, + "loss": 4.7865, + "step": 22656 + }, + { + "epoch": 0.13474759729755448, + "grad_norm": 2.0655038356781006, + "learning_rate": 4.779342434946665e-05, + "loss": 4.6479, + "step": 22657 + }, + { + "epoch": 0.13475354458083547, + "grad_norm": 1.8008273839950562, + "learning_rate": 4.7793232473320116e-05, + "loss": 4.8482, + "step": 22658 + }, + { + "epoch": 0.13475949186411648, + "grad_norm": 1.8431730270385742, + "learning_rate": 4.7793040589216695e-05, + "loss": 4.5315, + "step": 22659 + }, + { + "epoch": 0.13476543914739747, + "grad_norm": 1.7335654497146606, + "learning_rate": 4.779284869715647e-05, + "loss": 5.2788, + "step": 22660 + }, + { + "epoch": 0.13477138643067846, + "grad_norm": 1.6339887380599976, + "learning_rate": 4.779265679713949e-05, + "loss": 4.9113, + "step": 22661 + }, + { + "epoch": 0.13477733371395947, + "grad_norm": 1.746029019355774, + "learning_rate": 4.7792464889165825e-05, + "loss": 5.3739, + "step": 22662 + }, + { + "epoch": 0.13478328099724046, + "grad_norm": 1.6831165552139282, + "learning_rate": 4.7792272973235554e-05, + "loss": 5.2394, + "step": 22663 + }, + { + "epoch": 0.13478922828052145, + "grad_norm": 1.629170298576355, + "learning_rate": 4.7792081049348737e-05, + "loss": 5.0894, + "step": 22664 + }, + { + "epoch": 0.13479517556380247, + "grad_norm": 1.71427321434021, + "learning_rate": 4.779188911750543e-05, + "loss": 4.9391, + "step": 22665 + }, + { + "epoch": 0.13480112284708345, + "grad_norm": 1.6911921501159668, + "learning_rate": 4.779169717770572e-05, + "loss": 4.965, + "step": 22666 + }, + { + "epoch": 0.13480707013036444, + "grad_norm": 1.6597939729690552, + "learning_rate": 4.779150522994965e-05, + "loss": 5.1885, + "step": 22667 + }, + { + "epoch": 0.13481301741364546, + "grad_norm": 1.8732246160507202, + "learning_rate": 4.779131327423732e-05, + "loss": 4.7274, + "step": 22668 + }, + { + "epoch": 0.13481896469692645, + "grad_norm": 1.6462973356246948, + "learning_rate": 4.7791121310568765e-05, + "loss": 5.0614, + "step": 22669 + }, + { + "epoch": 0.13482491198020743, + "grad_norm": 1.5832293033599854, + "learning_rate": 4.7790929338944065e-05, + "loss": 5.4794, + "step": 22670 + }, + { + "epoch": 0.13483085926348845, + "grad_norm": 1.8505337238311768, + "learning_rate": 4.7790737359363293e-05, + "loss": 5.3381, + "step": 22671 + }, + { + "epoch": 0.13483680654676944, + "grad_norm": 1.4535889625549316, + "learning_rate": 4.7790545371826504e-05, + "loss": 5.1247, + "step": 22672 + }, + { + "epoch": 0.13484275383005043, + "grad_norm": 2.478214979171753, + "learning_rate": 4.779035337633377e-05, + "loss": 5.2909, + "step": 22673 + }, + { + "epoch": 0.13484870111333144, + "grad_norm": 1.3034166097640991, + "learning_rate": 4.7790161372885176e-05, + "loss": 5.36, + "step": 22674 + }, + { + "epoch": 0.13485464839661243, + "grad_norm": 1.6429485082626343, + "learning_rate": 4.778996936148076e-05, + "loss": 5.5559, + "step": 22675 + }, + { + "epoch": 0.13486059567989342, + "grad_norm": 1.7537177801132202, + "learning_rate": 4.77897773421206e-05, + "loss": 5.3665, + "step": 22676 + }, + { + "epoch": 0.13486654296317444, + "grad_norm": 1.7982977628707886, + "learning_rate": 4.778958531480476e-05, + "loss": 5.5078, + "step": 22677 + }, + { + "epoch": 0.13487249024645542, + "grad_norm": 1.5147206783294678, + "learning_rate": 4.7789393279533315e-05, + "loss": 5.6726, + "step": 22678 + }, + { + "epoch": 0.1348784375297364, + "grad_norm": 1.405532956123352, + "learning_rate": 4.778920123630634e-05, + "loss": 5.4188, + "step": 22679 + }, + { + "epoch": 0.13488438481301743, + "grad_norm": 1.4880021810531616, + "learning_rate": 4.778900918512387e-05, + "loss": 5.4478, + "step": 22680 + }, + { + "epoch": 0.13489033209629842, + "grad_norm": 1.4672034978866577, + "learning_rate": 4.7788817125986006e-05, + "loss": 5.2975, + "step": 22681 + }, + { + "epoch": 0.1348962793795794, + "grad_norm": 1.5284076929092407, + "learning_rate": 4.77886250588928e-05, + "loss": 5.008, + "step": 22682 + }, + { + "epoch": 0.1349022266628604, + "grad_norm": 1.6853814125061035, + "learning_rate": 4.778843298384431e-05, + "loss": 4.5719, + "step": 22683 + }, + { + "epoch": 0.1349081739461414, + "grad_norm": 1.8264626264572144, + "learning_rate": 4.778824090084063e-05, + "loss": 4.7764, + "step": 22684 + }, + { + "epoch": 0.1349141212294224, + "grad_norm": 1.3100756406784058, + "learning_rate": 4.77880488098818e-05, + "loss": 4.9967, + "step": 22685 + }, + { + "epoch": 0.13492006851270338, + "grad_norm": 1.5330268144607544, + "learning_rate": 4.7787856710967895e-05, + "loss": 4.6979, + "step": 22686 + }, + { + "epoch": 0.1349260157959844, + "grad_norm": 1.5872783660888672, + "learning_rate": 4.778766460409899e-05, + "loss": 4.9115, + "step": 22687 + }, + { + "epoch": 0.1349319630792654, + "grad_norm": 1.7895172834396362, + "learning_rate": 4.778747248927515e-05, + "loss": 4.9802, + "step": 22688 + }, + { + "epoch": 0.13493791036254638, + "grad_norm": 1.7277544736862183, + "learning_rate": 4.778728036649643e-05, + "loss": 5.2551, + "step": 22689 + }, + { + "epoch": 0.1349438576458274, + "grad_norm": 1.6623975038528442, + "learning_rate": 4.778708823576291e-05, + "loss": 5.4733, + "step": 22690 + }, + { + "epoch": 0.13494980492910838, + "grad_norm": 1.5472412109375, + "learning_rate": 4.7786896097074655e-05, + "loss": 5.3827, + "step": 22691 + }, + { + "epoch": 0.13495575221238937, + "grad_norm": 1.5824527740478516, + "learning_rate": 4.778670395043173e-05, + "loss": 5.1529, + "step": 22692 + }, + { + "epoch": 0.13496169949567038, + "grad_norm": 1.702009916305542, + "learning_rate": 4.77865117958342e-05, + "loss": 4.9916, + "step": 22693 + }, + { + "epoch": 0.13496764677895137, + "grad_norm": 1.653401255607605, + "learning_rate": 4.778631963328214e-05, + "loss": 5.3644, + "step": 22694 + }, + { + "epoch": 0.13497359406223236, + "grad_norm": 1.7365010976791382, + "learning_rate": 4.7786127462775604e-05, + "loss": 5.6488, + "step": 22695 + }, + { + "epoch": 0.13497954134551338, + "grad_norm": 1.749050498008728, + "learning_rate": 4.778593528431467e-05, + "loss": 5.6256, + "step": 22696 + }, + { + "epoch": 0.13498548862879436, + "grad_norm": 1.8504292964935303, + "learning_rate": 4.7785743097899394e-05, + "loss": 5.3972, + "step": 22697 + }, + { + "epoch": 0.13499143591207535, + "grad_norm": 1.6481549739837646, + "learning_rate": 4.7785550903529864e-05, + "loss": 5.2532, + "step": 22698 + }, + { + "epoch": 0.13499738319535637, + "grad_norm": 1.6081243753433228, + "learning_rate": 4.778535870120612e-05, + "loss": 5.2455, + "step": 22699 + }, + { + "epoch": 0.13500333047863736, + "grad_norm": 1.7087515592575073, + "learning_rate": 4.7785166490928246e-05, + "loss": 5.3115, + "step": 22700 + }, + { + "epoch": 0.13500927776191834, + "grad_norm": 1.626558780670166, + "learning_rate": 4.7784974272696314e-05, + "loss": 4.9586, + "step": 22701 + }, + { + "epoch": 0.13501522504519936, + "grad_norm": 1.5453464984893799, + "learning_rate": 4.778478204651038e-05, + "loss": 5.3882, + "step": 22702 + }, + { + "epoch": 0.13502117232848035, + "grad_norm": 1.602817416191101, + "learning_rate": 4.778458981237051e-05, + "loss": 5.1293, + "step": 22703 + }, + { + "epoch": 0.13502711961176134, + "grad_norm": 1.642824411392212, + "learning_rate": 4.778439757027677e-05, + "loss": 5.25, + "step": 22704 + }, + { + "epoch": 0.13503306689504235, + "grad_norm": 1.544092059135437, + "learning_rate": 4.7784205320229245e-05, + "loss": 5.4593, + "step": 22705 + }, + { + "epoch": 0.13503901417832334, + "grad_norm": 1.5194666385650635, + "learning_rate": 4.778401306222798e-05, + "loss": 5.1281, + "step": 22706 + }, + { + "epoch": 0.13504496146160433, + "grad_norm": 1.5252684354782104, + "learning_rate": 4.778382079627305e-05, + "loss": 5.2614, + "step": 22707 + }, + { + "epoch": 0.13505090874488535, + "grad_norm": 1.3341602087020874, + "learning_rate": 4.778362852236453e-05, + "loss": 5.6714, + "step": 22708 + }, + { + "epoch": 0.13505685602816633, + "grad_norm": 1.4264339208602905, + "learning_rate": 4.7783436240502475e-05, + "loss": 5.5506, + "step": 22709 + }, + { + "epoch": 0.13506280331144732, + "grad_norm": 1.7837181091308594, + "learning_rate": 4.778324395068696e-05, + "loss": 5.4757, + "step": 22710 + }, + { + "epoch": 0.13506875059472834, + "grad_norm": 1.6878288984298706, + "learning_rate": 4.7783051652918054e-05, + "loss": 5.4745, + "step": 22711 + }, + { + "epoch": 0.13507469787800933, + "grad_norm": 1.4143346548080444, + "learning_rate": 4.778285934719582e-05, + "loss": 5.5602, + "step": 22712 + }, + { + "epoch": 0.1350806451612903, + "grad_norm": 1.4829423427581787, + "learning_rate": 4.778266703352032e-05, + "loss": 5.4767, + "step": 22713 + }, + { + "epoch": 0.13508659244457133, + "grad_norm": 1.5431561470031738, + "learning_rate": 4.778247471189163e-05, + "loss": 5.532, + "step": 22714 + }, + { + "epoch": 0.13509253972785232, + "grad_norm": 1.6398223638534546, + "learning_rate": 4.7782282382309814e-05, + "loss": 5.4421, + "step": 22715 + }, + { + "epoch": 0.1350984870111333, + "grad_norm": 1.7385345697402954, + "learning_rate": 4.778209004477494e-05, + "loss": 4.9767, + "step": 22716 + }, + { + "epoch": 0.13510443429441432, + "grad_norm": 1.659159541130066, + "learning_rate": 4.7781897699287066e-05, + "loss": 5.2567, + "step": 22717 + }, + { + "epoch": 0.1351103815776953, + "grad_norm": 1.665582299232483, + "learning_rate": 4.7781705345846274e-05, + "loss": 4.9557, + "step": 22718 + }, + { + "epoch": 0.1351163288609763, + "grad_norm": 1.603225827217102, + "learning_rate": 4.7781512984452614e-05, + "loss": 5.3373, + "step": 22719 + }, + { + "epoch": 0.13512227614425731, + "grad_norm": 2.11853289604187, + "learning_rate": 4.7781320615106176e-05, + "loss": 4.9767, + "step": 22720 + }, + { + "epoch": 0.1351282234275383, + "grad_norm": 1.463710069656372, + "learning_rate": 4.7781128237807006e-05, + "loss": 5.0996, + "step": 22721 + }, + { + "epoch": 0.1351341707108193, + "grad_norm": 1.785783290863037, + "learning_rate": 4.7780935852555186e-05, + "loss": 5.0664, + "step": 22722 + }, + { + "epoch": 0.1351401179941003, + "grad_norm": 1.6467021703720093, + "learning_rate": 4.778074345935078e-05, + "loss": 5.0879, + "step": 22723 + }, + { + "epoch": 0.1351460652773813, + "grad_norm": 1.7273554801940918, + "learning_rate": 4.7780551058193834e-05, + "loss": 5.1165, + "step": 22724 + }, + { + "epoch": 0.13515201256066228, + "grad_norm": 1.7785577774047852, + "learning_rate": 4.7780358649084443e-05, + "loss": 4.9459, + "step": 22725 + }, + { + "epoch": 0.1351579598439433, + "grad_norm": 1.6499429941177368, + "learning_rate": 4.7780166232022674e-05, + "loss": 5.3581, + "step": 22726 + }, + { + "epoch": 0.1351639071272243, + "grad_norm": 1.651881217956543, + "learning_rate": 4.777997380700857e-05, + "loss": 5.215, + "step": 22727 + }, + { + "epoch": 0.13516985441050527, + "grad_norm": 1.726369857788086, + "learning_rate": 4.7779781374042215e-05, + "loss": 4.8891, + "step": 22728 + }, + { + "epoch": 0.1351758016937863, + "grad_norm": 1.5628979206085205, + "learning_rate": 4.7779588933123675e-05, + "loss": 5.0173, + "step": 22729 + }, + { + "epoch": 0.13518174897706728, + "grad_norm": 2.179954767227173, + "learning_rate": 4.777939648425302e-05, + "loss": 5.0088, + "step": 22730 + }, + { + "epoch": 0.13518769626034827, + "grad_norm": 1.5813510417938232, + "learning_rate": 4.777920402743031e-05, + "loss": 5.064, + "step": 22731 + }, + { + "epoch": 0.13519364354362928, + "grad_norm": 1.4100569486618042, + "learning_rate": 4.7779011562655616e-05, + "loss": 5.5696, + "step": 22732 + }, + { + "epoch": 0.13519959082691027, + "grad_norm": 1.4252601861953735, + "learning_rate": 4.7778819089929e-05, + "loss": 5.4797, + "step": 22733 + }, + { + "epoch": 0.13520553811019126, + "grad_norm": 1.5482890605926514, + "learning_rate": 4.7778626609250546e-05, + "loss": 5.7168, + "step": 22734 + }, + { + "epoch": 0.13521148539347227, + "grad_norm": 1.7441178560256958, + "learning_rate": 4.77784341206203e-05, + "loss": 5.5385, + "step": 22735 + }, + { + "epoch": 0.13521743267675326, + "grad_norm": 1.5903903245925903, + "learning_rate": 4.777824162403833e-05, + "loss": 5.4181, + "step": 22736 + }, + { + "epoch": 0.13522337996003425, + "grad_norm": 1.6240642070770264, + "learning_rate": 4.777804911950472e-05, + "loss": 5.5071, + "step": 22737 + }, + { + "epoch": 0.13522932724331527, + "grad_norm": 1.4418225288391113, + "learning_rate": 4.7777856607019536e-05, + "loss": 5.6326, + "step": 22738 + }, + { + "epoch": 0.13523527452659626, + "grad_norm": 1.618449330329895, + "learning_rate": 4.7777664086582823e-05, + "loss": 5.4445, + "step": 22739 + }, + { + "epoch": 0.13524122180987724, + "grad_norm": 1.7598767280578613, + "learning_rate": 4.777747155819467e-05, + "loss": 5.3207, + "step": 22740 + }, + { + "epoch": 0.13524716909315823, + "grad_norm": 1.707531213760376, + "learning_rate": 4.7777279021855134e-05, + "loss": 5.2888, + "step": 22741 + }, + { + "epoch": 0.13525311637643925, + "grad_norm": 1.8292144536972046, + "learning_rate": 4.777708647756429e-05, + "loss": 4.897, + "step": 22742 + }, + { + "epoch": 0.13525906365972024, + "grad_norm": 1.893703818321228, + "learning_rate": 4.77768939253222e-05, + "loss": 4.8088, + "step": 22743 + }, + { + "epoch": 0.13526501094300122, + "grad_norm": 1.6884989738464355, + "learning_rate": 4.777670136512893e-05, + "loss": 5.183, + "step": 22744 + }, + { + "epoch": 0.13527095822628224, + "grad_norm": 1.8513271808624268, + "learning_rate": 4.777650879698454e-05, + "loss": 4.6775, + "step": 22745 + }, + { + "epoch": 0.13527690550956323, + "grad_norm": 1.5597106218338013, + "learning_rate": 4.777631622088912e-05, + "loss": 5.268, + "step": 22746 + }, + { + "epoch": 0.13528285279284422, + "grad_norm": 1.6159777641296387, + "learning_rate": 4.777612363684272e-05, + "loss": 5.223, + "step": 22747 + }, + { + "epoch": 0.13528880007612523, + "grad_norm": 1.6712334156036377, + "learning_rate": 4.777593104484541e-05, + "loss": 5.1676, + "step": 22748 + }, + { + "epoch": 0.13529474735940622, + "grad_norm": 1.4349523782730103, + "learning_rate": 4.7775738444897253e-05, + "loss": 5.3066, + "step": 22749 + }, + { + "epoch": 0.1353006946426872, + "grad_norm": 1.6191719770431519, + "learning_rate": 4.7775545836998324e-05, + "loss": 5.2426, + "step": 22750 + }, + { + "epoch": 0.13530664192596822, + "grad_norm": 1.8324687480926514, + "learning_rate": 4.777535322114869e-05, + "loss": 5.2352, + "step": 22751 + }, + { + "epoch": 0.1353125892092492, + "grad_norm": 1.5355842113494873, + "learning_rate": 4.777516059734841e-05, + "loss": 5.5875, + "step": 22752 + }, + { + "epoch": 0.1353185364925302, + "grad_norm": 1.6957530975341797, + "learning_rate": 4.777496796559756e-05, + "loss": 5.4624, + "step": 22753 + }, + { + "epoch": 0.13532448377581122, + "grad_norm": 1.6195729970932007, + "learning_rate": 4.7774775325896205e-05, + "loss": 5.2686, + "step": 22754 + }, + { + "epoch": 0.1353304310590922, + "grad_norm": 1.429439663887024, + "learning_rate": 4.7774582678244406e-05, + "loss": 5.3407, + "step": 22755 + }, + { + "epoch": 0.1353363783423732, + "grad_norm": 1.4609668254852295, + "learning_rate": 4.777439002264225e-05, + "loss": 5.4332, + "step": 22756 + }, + { + "epoch": 0.1353423256256542, + "grad_norm": 1.3537366390228271, + "learning_rate": 4.7774197359089765e-05, + "loss": 5.4353, + "step": 22757 + }, + { + "epoch": 0.1353482729089352, + "grad_norm": 1.6953861713409424, + "learning_rate": 4.7774004687587057e-05, + "loss": 5.1824, + "step": 22758 + }, + { + "epoch": 0.13535422019221618, + "grad_norm": 1.3835570812225342, + "learning_rate": 4.7773812008134186e-05, + "loss": 5.1748, + "step": 22759 + }, + { + "epoch": 0.1353601674754972, + "grad_norm": 1.94771146774292, + "learning_rate": 4.7773619320731206e-05, + "loss": 4.7599, + "step": 22760 + }, + { + "epoch": 0.1353661147587782, + "grad_norm": 1.56703782081604, + "learning_rate": 4.777342662537819e-05, + "loss": 5.4686, + "step": 22761 + }, + { + "epoch": 0.13537206204205918, + "grad_norm": 1.627790093421936, + "learning_rate": 4.77732339220752e-05, + "loss": 5.4504, + "step": 22762 + }, + { + "epoch": 0.1353780093253402, + "grad_norm": 1.5668286085128784, + "learning_rate": 4.777304121082232e-05, + "loss": 5.5147, + "step": 22763 + }, + { + "epoch": 0.13538395660862118, + "grad_norm": 1.7350172996520996, + "learning_rate": 4.7772848491619606e-05, + "loss": 5.1803, + "step": 22764 + }, + { + "epoch": 0.13538990389190217, + "grad_norm": 1.700966715812683, + "learning_rate": 4.7772655764467124e-05, + "loss": 5.1222, + "step": 22765 + }, + { + "epoch": 0.13539585117518319, + "grad_norm": 1.7613048553466797, + "learning_rate": 4.777246302936494e-05, + "loss": 5.1391, + "step": 22766 + }, + { + "epoch": 0.13540179845846417, + "grad_norm": 1.7095452547073364, + "learning_rate": 4.777227028631312e-05, + "loss": 5.112, + "step": 22767 + }, + { + "epoch": 0.13540774574174516, + "grad_norm": 1.8310586214065552, + "learning_rate": 4.7772077535311744e-05, + "loss": 5.0404, + "step": 22768 + }, + { + "epoch": 0.13541369302502618, + "grad_norm": 1.7058879137039185, + "learning_rate": 4.777188477636087e-05, + "loss": 5.1165, + "step": 22769 + }, + { + "epoch": 0.13541964030830717, + "grad_norm": 1.7806624174118042, + "learning_rate": 4.7771692009460565e-05, + "loss": 5.0711, + "step": 22770 + }, + { + "epoch": 0.13542558759158815, + "grad_norm": 1.8086166381835938, + "learning_rate": 4.777149923461089e-05, + "loss": 4.7757, + "step": 22771 + }, + { + "epoch": 0.13543153487486917, + "grad_norm": 1.9984580278396606, + "learning_rate": 4.777130645181194e-05, + "loss": 4.918, + "step": 22772 + }, + { + "epoch": 0.13543748215815016, + "grad_norm": 1.6648451089859009, + "learning_rate": 4.777111366106375e-05, + "loss": 5.0051, + "step": 22773 + }, + { + "epoch": 0.13544342944143115, + "grad_norm": 1.6590383052825928, + "learning_rate": 4.77709208623664e-05, + "loss": 5.6166, + "step": 22774 + }, + { + "epoch": 0.13544937672471216, + "grad_norm": 1.4530583620071411, + "learning_rate": 4.777072805571995e-05, + "loss": 5.6772, + "step": 22775 + }, + { + "epoch": 0.13545532400799315, + "grad_norm": 1.5310078859329224, + "learning_rate": 4.777053524112448e-05, + "loss": 4.9965, + "step": 22776 + }, + { + "epoch": 0.13546127129127414, + "grad_norm": 1.5363576412200928, + "learning_rate": 4.777034241858005e-05, + "loss": 5.2144, + "step": 22777 + }, + { + "epoch": 0.13546721857455515, + "grad_norm": 1.7318395376205444, + "learning_rate": 4.7770149588086735e-05, + "loss": 5.2367, + "step": 22778 + }, + { + "epoch": 0.13547316585783614, + "grad_norm": 1.567736268043518, + "learning_rate": 4.776995674964459e-05, + "loss": 5.4778, + "step": 22779 + }, + { + "epoch": 0.13547911314111713, + "grad_norm": 1.879223108291626, + "learning_rate": 4.7769763903253685e-05, + "loss": 4.8963, + "step": 22780 + }, + { + "epoch": 0.13548506042439815, + "grad_norm": 1.6292016506195068, + "learning_rate": 4.77695710489141e-05, + "loss": 5.2529, + "step": 22781 + }, + { + "epoch": 0.13549100770767913, + "grad_norm": 1.4838228225708008, + "learning_rate": 4.7769378186625885e-05, + "loss": 5.5594, + "step": 22782 + }, + { + "epoch": 0.13549695499096012, + "grad_norm": 1.4567928314208984, + "learning_rate": 4.776918531638912e-05, + "loss": 5.5789, + "step": 22783 + }, + { + "epoch": 0.13550290227424114, + "grad_norm": 1.6464484930038452, + "learning_rate": 4.776899243820386e-05, + "loss": 5.4319, + "step": 22784 + }, + { + "epoch": 0.13550884955752213, + "grad_norm": 1.501028060913086, + "learning_rate": 4.776879955207019e-05, + "loss": 5.5543, + "step": 22785 + }, + { + "epoch": 0.13551479684080311, + "grad_norm": 1.6811163425445557, + "learning_rate": 4.776860665798816e-05, + "loss": 5.4512, + "step": 22786 + }, + { + "epoch": 0.13552074412408413, + "grad_norm": 1.762147068977356, + "learning_rate": 4.7768413755957854e-05, + "loss": 5.6262, + "step": 22787 + }, + { + "epoch": 0.13552669140736512, + "grad_norm": 1.846987009048462, + "learning_rate": 4.7768220845979315e-05, + "loss": 5.4735, + "step": 22788 + }, + { + "epoch": 0.1355326386906461, + "grad_norm": 1.9326568841934204, + "learning_rate": 4.776802792805264e-05, + "loss": 5.3295, + "step": 22789 + }, + { + "epoch": 0.13553858597392712, + "grad_norm": 1.5496313571929932, + "learning_rate": 4.7767835002177874e-05, + "loss": 5.4742, + "step": 22790 + }, + { + "epoch": 0.1355445332572081, + "grad_norm": 1.3328933715820312, + "learning_rate": 4.776764206835509e-05, + "loss": 5.5611, + "step": 22791 + }, + { + "epoch": 0.1355504805404891, + "grad_norm": 1.3349891901016235, + "learning_rate": 4.776744912658437e-05, + "loss": 5.5732, + "step": 22792 + }, + { + "epoch": 0.13555642782377011, + "grad_norm": 1.510608434677124, + "learning_rate": 4.776725617686576e-05, + "loss": 5.4108, + "step": 22793 + }, + { + "epoch": 0.1355623751070511, + "grad_norm": 1.4556225538253784, + "learning_rate": 4.776706321919934e-05, + "loss": 5.5154, + "step": 22794 + }, + { + "epoch": 0.1355683223903321, + "grad_norm": 1.7231537103652954, + "learning_rate": 4.776687025358516e-05, + "loss": 5.4437, + "step": 22795 + }, + { + "epoch": 0.1355742696736131, + "grad_norm": 1.6234036684036255, + "learning_rate": 4.7766677280023314e-05, + "loss": 5.2642, + "step": 22796 + }, + { + "epoch": 0.1355802169568941, + "grad_norm": 1.6550066471099854, + "learning_rate": 4.776648429851385e-05, + "loss": 5.3577, + "step": 22797 + }, + { + "epoch": 0.13558616424017508, + "grad_norm": 1.5199332237243652, + "learning_rate": 4.776629130905684e-05, + "loss": 4.9679, + "step": 22798 + }, + { + "epoch": 0.13559211152345607, + "grad_norm": 1.5900238752365112, + "learning_rate": 4.776609831165236e-05, + "loss": 5.5357, + "step": 22799 + }, + { + "epoch": 0.1355980588067371, + "grad_norm": 1.4585398435592651, + "learning_rate": 4.776590530630047e-05, + "loss": 5.4191, + "step": 22800 + }, + { + "epoch": 0.13560400609001808, + "grad_norm": 1.4049118757247925, + "learning_rate": 4.7765712293001234e-05, + "loss": 5.4423, + "step": 22801 + }, + { + "epoch": 0.13560995337329906, + "grad_norm": 1.5287877321243286, + "learning_rate": 4.7765519271754726e-05, + "loss": 5.4635, + "step": 22802 + }, + { + "epoch": 0.13561590065658008, + "grad_norm": 1.4761078357696533, + "learning_rate": 4.776532624256101e-05, + "loss": 5.394, + "step": 22803 + }, + { + "epoch": 0.13562184793986107, + "grad_norm": 1.523536205291748, + "learning_rate": 4.776513320542015e-05, + "loss": 5.4171, + "step": 22804 + }, + { + "epoch": 0.13562779522314206, + "grad_norm": 1.701953411102295, + "learning_rate": 4.7764940160332214e-05, + "loss": 5.336, + "step": 22805 + }, + { + "epoch": 0.13563374250642307, + "grad_norm": 1.5426260232925415, + "learning_rate": 4.7764747107297284e-05, + "loss": 5.5175, + "step": 22806 + }, + { + "epoch": 0.13563968978970406, + "grad_norm": 1.5670596361160278, + "learning_rate": 4.776455404631541e-05, + "loss": 5.4254, + "step": 22807 + }, + { + "epoch": 0.13564563707298505, + "grad_norm": 1.4388494491577148, + "learning_rate": 4.7764360977386666e-05, + "loss": 5.3282, + "step": 22808 + }, + { + "epoch": 0.13565158435626606, + "grad_norm": 1.4222092628479004, + "learning_rate": 4.776416790051111e-05, + "loss": 5.5187, + "step": 22809 + }, + { + "epoch": 0.13565753163954705, + "grad_norm": 1.604407787322998, + "learning_rate": 4.776397481568883e-05, + "loss": 5.3026, + "step": 22810 + }, + { + "epoch": 0.13566347892282804, + "grad_norm": 1.4160562753677368, + "learning_rate": 4.776378172291988e-05, + "loss": 5.2925, + "step": 22811 + }, + { + "epoch": 0.13566942620610906, + "grad_norm": 1.543260931968689, + "learning_rate": 4.776358862220433e-05, + "loss": 5.4234, + "step": 22812 + }, + { + "epoch": 0.13567537348939004, + "grad_norm": 1.6589266061782837, + "learning_rate": 4.776339551354224e-05, + "loss": 5.0677, + "step": 22813 + }, + { + "epoch": 0.13568132077267103, + "grad_norm": 1.5909267663955688, + "learning_rate": 4.7763202396933696e-05, + "loss": 5.145, + "step": 22814 + }, + { + "epoch": 0.13568726805595205, + "grad_norm": 1.4697500467300415, + "learning_rate": 4.776300927237873e-05, + "loss": 5.2856, + "step": 22815 + }, + { + "epoch": 0.13569321533923304, + "grad_norm": 1.895766019821167, + "learning_rate": 4.7762816139877456e-05, + "loss": 5.3554, + "step": 22816 + }, + { + "epoch": 0.13569916262251402, + "grad_norm": 1.8093748092651367, + "learning_rate": 4.7762622999429905e-05, + "loss": 4.9482, + "step": 22817 + }, + { + "epoch": 0.13570510990579504, + "grad_norm": 1.6899988651275635, + "learning_rate": 4.776242985103616e-05, + "loss": 5.1788, + "step": 22818 + }, + { + "epoch": 0.13571105718907603, + "grad_norm": 1.8199821710586548, + "learning_rate": 4.7762236694696294e-05, + "loss": 4.9181, + "step": 22819 + }, + { + "epoch": 0.13571700447235702, + "grad_norm": 1.7687036991119385, + "learning_rate": 4.776204353041036e-05, + "loss": 4.9925, + "step": 22820 + }, + { + "epoch": 0.13572295175563803, + "grad_norm": 1.705419659614563, + "learning_rate": 4.776185035817843e-05, + "loss": 5.0644, + "step": 22821 + }, + { + "epoch": 0.13572889903891902, + "grad_norm": 1.7805287837982178, + "learning_rate": 4.7761657178000575e-05, + "loss": 5.1567, + "step": 22822 + }, + { + "epoch": 0.1357348463222, + "grad_norm": 1.4791945219039917, + "learning_rate": 4.776146398987686e-05, + "loss": 5.2834, + "step": 22823 + }, + { + "epoch": 0.13574079360548102, + "grad_norm": 1.546128749847412, + "learning_rate": 4.776127079380735e-05, + "loss": 4.8066, + "step": 22824 + }, + { + "epoch": 0.135746740888762, + "grad_norm": 1.6163334846496582, + "learning_rate": 4.776107758979212e-05, + "loss": 5.1771, + "step": 22825 + }, + { + "epoch": 0.135752688172043, + "grad_norm": 1.6902676820755005, + "learning_rate": 4.776088437783123e-05, + "loss": 4.9249, + "step": 22826 + }, + { + "epoch": 0.13575863545532402, + "grad_norm": 1.4966270923614502, + "learning_rate": 4.776069115792475e-05, + "loss": 5.6609, + "step": 22827 + }, + { + "epoch": 0.135764582738605, + "grad_norm": 1.6107707023620605, + "learning_rate": 4.7760497930072754e-05, + "loss": 5.4167, + "step": 22828 + }, + { + "epoch": 0.135770530021886, + "grad_norm": 1.5773305892944336, + "learning_rate": 4.77603046942753e-05, + "loss": 5.4044, + "step": 22829 + }, + { + "epoch": 0.135776477305167, + "grad_norm": 1.6871259212493896, + "learning_rate": 4.7760111450532454e-05, + "loss": 5.5288, + "step": 22830 + }, + { + "epoch": 0.135782424588448, + "grad_norm": 1.4027100801467896, + "learning_rate": 4.77599181988443e-05, + "loss": 5.265, + "step": 22831 + }, + { + "epoch": 0.13578837187172899, + "grad_norm": 1.7435009479522705, + "learning_rate": 4.775972493921088e-05, + "loss": 5.3546, + "step": 22832 + }, + { + "epoch": 0.13579431915501, + "grad_norm": 1.4834927320480347, + "learning_rate": 4.7759531671632286e-05, + "loss": 5.168, + "step": 22833 + }, + { + "epoch": 0.135800266438291, + "grad_norm": 1.6468613147735596, + "learning_rate": 4.775933839610857e-05, + "loss": 5.0984, + "step": 22834 + }, + { + "epoch": 0.13580621372157198, + "grad_norm": 1.6906235218048096, + "learning_rate": 4.77591451126398e-05, + "loss": 5.0563, + "step": 22835 + }, + { + "epoch": 0.135812161004853, + "grad_norm": 1.2667183876037598, + "learning_rate": 4.775895182122605e-05, + "loss": 5.7256, + "step": 22836 + }, + { + "epoch": 0.13581810828813398, + "grad_norm": 1.381974697113037, + "learning_rate": 4.775875852186739e-05, + "loss": 5.6773, + "step": 22837 + }, + { + "epoch": 0.13582405557141497, + "grad_norm": 1.395326018333435, + "learning_rate": 4.775856521456388e-05, + "loss": 5.4884, + "step": 22838 + }, + { + "epoch": 0.13583000285469599, + "grad_norm": 1.4601794481277466, + "learning_rate": 4.775837189931559e-05, + "loss": 5.6866, + "step": 22839 + }, + { + "epoch": 0.13583595013797697, + "grad_norm": 1.3722656965255737, + "learning_rate": 4.7758178576122584e-05, + "loss": 5.7885, + "step": 22840 + }, + { + "epoch": 0.13584189742125796, + "grad_norm": 1.5126278400421143, + "learning_rate": 4.775798524498494e-05, + "loss": 5.5806, + "step": 22841 + }, + { + "epoch": 0.13584784470453898, + "grad_norm": 1.465306282043457, + "learning_rate": 4.7757791905902714e-05, + "loss": 5.5597, + "step": 22842 + }, + { + "epoch": 0.13585379198781997, + "grad_norm": 1.7111048698425293, + "learning_rate": 4.775759855887598e-05, + "loss": 5.3431, + "step": 22843 + }, + { + "epoch": 0.13585973927110095, + "grad_norm": 1.7369952201843262, + "learning_rate": 4.7757405203904796e-05, + "loss": 5.4373, + "step": 22844 + }, + { + "epoch": 0.13586568655438197, + "grad_norm": 1.571898341178894, + "learning_rate": 4.7757211840989246e-05, + "loss": 5.4751, + "step": 22845 + }, + { + "epoch": 0.13587163383766296, + "grad_norm": 1.6752384901046753, + "learning_rate": 4.775701847012938e-05, + "loss": 5.3411, + "step": 22846 + }, + { + "epoch": 0.13587758112094395, + "grad_norm": 1.3036680221557617, + "learning_rate": 4.775682509132529e-05, + "loss": 5.6136, + "step": 22847 + }, + { + "epoch": 0.13588352840422496, + "grad_norm": 1.60060453414917, + "learning_rate": 4.775663170457701e-05, + "loss": 5.3134, + "step": 22848 + }, + { + "epoch": 0.13588947568750595, + "grad_norm": 1.746317982673645, + "learning_rate": 4.775643830988463e-05, + "loss": 5.1176, + "step": 22849 + }, + { + "epoch": 0.13589542297078694, + "grad_norm": 1.5190258026123047, + "learning_rate": 4.775624490724822e-05, + "loss": 5.2673, + "step": 22850 + }, + { + "epoch": 0.13590137025406795, + "grad_norm": 1.5572645664215088, + "learning_rate": 4.775605149666783e-05, + "loss": 5.7732, + "step": 22851 + }, + { + "epoch": 0.13590731753734894, + "grad_norm": 1.6563985347747803, + "learning_rate": 4.775585807814354e-05, + "loss": 5.3757, + "step": 22852 + }, + { + "epoch": 0.13591326482062993, + "grad_norm": 1.583486795425415, + "learning_rate": 4.775566465167541e-05, + "loss": 5.5406, + "step": 22853 + }, + { + "epoch": 0.13591921210391095, + "grad_norm": 1.9212104082107544, + "learning_rate": 4.7755471217263525e-05, + "loss": 5.5629, + "step": 22854 + }, + { + "epoch": 0.13592515938719194, + "grad_norm": 1.5397447347640991, + "learning_rate": 4.775527777490793e-05, + "loss": 5.5745, + "step": 22855 + }, + { + "epoch": 0.13593110667047292, + "grad_norm": 1.4469612836837769, + "learning_rate": 4.775508432460871e-05, + "loss": 5.5762, + "step": 22856 + }, + { + "epoch": 0.13593705395375394, + "grad_norm": 1.6050552129745483, + "learning_rate": 4.775489086636592e-05, + "loss": 5.2207, + "step": 22857 + }, + { + "epoch": 0.13594300123703493, + "grad_norm": 1.5991270542144775, + "learning_rate": 4.7754697400179636e-05, + "loss": 5.3331, + "step": 22858 + }, + { + "epoch": 0.13594894852031592, + "grad_norm": 1.8474901914596558, + "learning_rate": 4.775450392604992e-05, + "loss": 5.3208, + "step": 22859 + }, + { + "epoch": 0.1359548958035969, + "grad_norm": 1.6865973472595215, + "learning_rate": 4.7754310443976844e-05, + "loss": 5.2557, + "step": 22860 + }, + { + "epoch": 0.13596084308687792, + "grad_norm": 1.9411492347717285, + "learning_rate": 4.775411695396047e-05, + "loss": 5.2765, + "step": 22861 + }, + { + "epoch": 0.1359667903701589, + "grad_norm": 1.6263481378555298, + "learning_rate": 4.775392345600087e-05, + "loss": 5.2767, + "step": 22862 + }, + { + "epoch": 0.1359727376534399, + "grad_norm": 1.7159794569015503, + "learning_rate": 4.7753729950098116e-05, + "loss": 5.5175, + "step": 22863 + }, + { + "epoch": 0.1359786849367209, + "grad_norm": 1.6026562452316284, + "learning_rate": 4.7753536436252266e-05, + "loss": 5.3517, + "step": 22864 + }, + { + "epoch": 0.1359846322200019, + "grad_norm": 1.4052190780639648, + "learning_rate": 4.775334291446339e-05, + "loss": 5.3153, + "step": 22865 + }, + { + "epoch": 0.1359905795032829, + "grad_norm": 1.4030534029006958, + "learning_rate": 4.7753149384731556e-05, + "loss": 5.3798, + "step": 22866 + }, + { + "epoch": 0.1359965267865639, + "grad_norm": 1.5234447717666626, + "learning_rate": 4.775295584705683e-05, + "loss": 5.2717, + "step": 22867 + }, + { + "epoch": 0.1360024740698449, + "grad_norm": 1.6578015089035034, + "learning_rate": 4.775276230143929e-05, + "loss": 5.2482, + "step": 22868 + }, + { + "epoch": 0.13600842135312588, + "grad_norm": 1.427674651145935, + "learning_rate": 4.775256874787899e-05, + "loss": 5.3303, + "step": 22869 + }, + { + "epoch": 0.1360143686364069, + "grad_norm": 1.610268473625183, + "learning_rate": 4.7752375186376006e-05, + "loss": 5.4775, + "step": 22870 + }, + { + "epoch": 0.13602031591968788, + "grad_norm": 1.7097511291503906, + "learning_rate": 4.7752181616930404e-05, + "loss": 5.2721, + "step": 22871 + }, + { + "epoch": 0.13602626320296887, + "grad_norm": 1.6628022193908691, + "learning_rate": 4.775198803954225e-05, + "loss": 5.2049, + "step": 22872 + }, + { + "epoch": 0.1360322104862499, + "grad_norm": 1.6983882188796997, + "learning_rate": 4.7751794454211615e-05, + "loss": 5.1596, + "step": 22873 + }, + { + "epoch": 0.13603815776953088, + "grad_norm": 1.6148128509521484, + "learning_rate": 4.775160086093856e-05, + "loss": 5.3958, + "step": 22874 + }, + { + "epoch": 0.13604410505281186, + "grad_norm": 1.6220009326934814, + "learning_rate": 4.7751407259723155e-05, + "loss": 5.2774, + "step": 22875 + }, + { + "epoch": 0.13605005233609288, + "grad_norm": 1.5017454624176025, + "learning_rate": 4.7751213650565464e-05, + "loss": 5.303, + "step": 22876 + }, + { + "epoch": 0.13605599961937387, + "grad_norm": 1.6734380722045898, + "learning_rate": 4.7751020033465566e-05, + "loss": 5.3784, + "step": 22877 + }, + { + "epoch": 0.13606194690265486, + "grad_norm": 1.8177162408828735, + "learning_rate": 4.775082640842352e-05, + "loss": 5.4498, + "step": 22878 + }, + { + "epoch": 0.13606789418593587, + "grad_norm": 1.6287364959716797, + "learning_rate": 4.7750632775439396e-05, + "loss": 5.3252, + "step": 22879 + }, + { + "epoch": 0.13607384146921686, + "grad_norm": 1.5242222547531128, + "learning_rate": 4.7750439134513267e-05, + "loss": 5.2287, + "step": 22880 + }, + { + "epoch": 0.13607978875249785, + "grad_norm": 1.4447482824325562, + "learning_rate": 4.775024548564519e-05, + "loss": 5.3725, + "step": 22881 + }, + { + "epoch": 0.13608573603577886, + "grad_norm": 1.4994373321533203, + "learning_rate": 4.775005182883523e-05, + "loss": 5.4844, + "step": 22882 + }, + { + "epoch": 0.13609168331905985, + "grad_norm": 1.541668176651001, + "learning_rate": 4.774985816408347e-05, + "loss": 5.4171, + "step": 22883 + }, + { + "epoch": 0.13609763060234084, + "grad_norm": 1.4670990705490112, + "learning_rate": 4.7749664491389965e-05, + "loss": 5.4372, + "step": 22884 + }, + { + "epoch": 0.13610357788562186, + "grad_norm": 1.686318039894104, + "learning_rate": 4.7749470810754796e-05, + "loss": 5.1164, + "step": 22885 + }, + { + "epoch": 0.13610952516890285, + "grad_norm": 1.4744656085968018, + "learning_rate": 4.7749277122178015e-05, + "loss": 5.3787, + "step": 22886 + }, + { + "epoch": 0.13611547245218383, + "grad_norm": 1.498948574066162, + "learning_rate": 4.77490834256597e-05, + "loss": 5.2837, + "step": 22887 + }, + { + "epoch": 0.13612141973546485, + "grad_norm": 1.4990612268447876, + "learning_rate": 4.774888972119991e-05, + "loss": 5.3503, + "step": 22888 + }, + { + "epoch": 0.13612736701874584, + "grad_norm": 1.6973026990890503, + "learning_rate": 4.774869600879872e-05, + "loss": 5.2776, + "step": 22889 + }, + { + "epoch": 0.13613331430202683, + "grad_norm": 1.5271309614181519, + "learning_rate": 4.7748502288456193e-05, + "loss": 5.3318, + "step": 22890 + }, + { + "epoch": 0.13613926158530784, + "grad_norm": 1.5284117460250854, + "learning_rate": 4.7748308560172406e-05, + "loss": 5.2975, + "step": 22891 + }, + { + "epoch": 0.13614520886858883, + "grad_norm": 1.45162034034729, + "learning_rate": 4.774811482394741e-05, + "loss": 5.1825, + "step": 22892 + }, + { + "epoch": 0.13615115615186982, + "grad_norm": 1.558273434638977, + "learning_rate": 4.774792107978129e-05, + "loss": 5.1004, + "step": 22893 + }, + { + "epoch": 0.13615710343515083, + "grad_norm": 1.576781988143921, + "learning_rate": 4.77477273276741e-05, + "loss": 5.4028, + "step": 22894 + }, + { + "epoch": 0.13616305071843182, + "grad_norm": 1.3964447975158691, + "learning_rate": 4.7747533567625916e-05, + "loss": 5.4402, + "step": 22895 + }, + { + "epoch": 0.1361689980017128, + "grad_norm": 1.7266137599945068, + "learning_rate": 4.77473397996368e-05, + "loss": 4.9304, + "step": 22896 + }, + { + "epoch": 0.13617494528499383, + "grad_norm": 1.573444128036499, + "learning_rate": 4.774714602370683e-05, + "loss": 4.9736, + "step": 22897 + }, + { + "epoch": 0.13618089256827481, + "grad_norm": 1.7123498916625977, + "learning_rate": 4.774695223983606e-05, + "loss": 5.3678, + "step": 22898 + }, + { + "epoch": 0.1361868398515558, + "grad_norm": 1.8102420568466187, + "learning_rate": 4.7746758448024566e-05, + "loss": 5.2433, + "step": 22899 + }, + { + "epoch": 0.13619278713483682, + "grad_norm": 1.5984879732131958, + "learning_rate": 4.774656464827242e-05, + "loss": 5.2601, + "step": 22900 + }, + { + "epoch": 0.1361987344181178, + "grad_norm": 1.8117280006408691, + "learning_rate": 4.7746370840579666e-05, + "loss": 5.1488, + "step": 22901 + }, + { + "epoch": 0.1362046817013988, + "grad_norm": 1.6972469091415405, + "learning_rate": 4.7746177024946405e-05, + "loss": 5.337, + "step": 22902 + }, + { + "epoch": 0.1362106289846798, + "grad_norm": 1.4006030559539795, + "learning_rate": 4.7745983201372685e-05, + "loss": 5.4563, + "step": 22903 + }, + { + "epoch": 0.1362165762679608, + "grad_norm": 1.7627719640731812, + "learning_rate": 4.774578936985857e-05, + "loss": 5.0125, + "step": 22904 + }, + { + "epoch": 0.1362225235512418, + "grad_norm": 1.3935896158218384, + "learning_rate": 4.774559553040415e-05, + "loss": 5.2413, + "step": 22905 + }, + { + "epoch": 0.1362284708345228, + "grad_norm": 1.3300725221633911, + "learning_rate": 4.7745401683009464e-05, + "loss": 5.391, + "step": 22906 + }, + { + "epoch": 0.1362344181178038, + "grad_norm": 1.5094577074050903, + "learning_rate": 4.7745207827674596e-05, + "loss": 6.0553, + "step": 22907 + }, + { + "epoch": 0.13624036540108478, + "grad_norm": 1.3816832304000854, + "learning_rate": 4.774501396439961e-05, + "loss": 5.9914, + "step": 22908 + }, + { + "epoch": 0.1362463126843658, + "grad_norm": 1.5488735437393188, + "learning_rate": 4.774482009318458e-05, + "loss": 5.5686, + "step": 22909 + }, + { + "epoch": 0.13625225996764678, + "grad_norm": 1.7096377611160278, + "learning_rate": 4.774462621402957e-05, + "loss": 5.0948, + "step": 22910 + }, + { + "epoch": 0.13625820725092777, + "grad_norm": 1.8099161386489868, + "learning_rate": 4.7744432326934644e-05, + "loss": 5.3055, + "step": 22911 + }, + { + "epoch": 0.1362641545342088, + "grad_norm": 1.5320358276367188, + "learning_rate": 4.7744238431899864e-05, + "loss": 5.467, + "step": 22912 + }, + { + "epoch": 0.13627010181748977, + "grad_norm": 1.928933024406433, + "learning_rate": 4.774404452892531e-05, + "loss": 4.9311, + "step": 22913 + }, + { + "epoch": 0.13627604910077076, + "grad_norm": 1.912596344947815, + "learning_rate": 4.7743850618011046e-05, + "loss": 5.1982, + "step": 22914 + }, + { + "epoch": 0.13628199638405178, + "grad_norm": 1.6227478981018066, + "learning_rate": 4.774365669915714e-05, + "loss": 5.3649, + "step": 22915 + }, + { + "epoch": 0.13628794366733277, + "grad_norm": 1.8333683013916016, + "learning_rate": 4.7743462772363656e-05, + "loss": 4.7404, + "step": 22916 + }, + { + "epoch": 0.13629389095061376, + "grad_norm": 1.6802351474761963, + "learning_rate": 4.7743268837630665e-05, + "loss": 5.2044, + "step": 22917 + }, + { + "epoch": 0.13629983823389474, + "grad_norm": 1.76273775100708, + "learning_rate": 4.774307489495823e-05, + "loss": 4.7032, + "step": 22918 + }, + { + "epoch": 0.13630578551717576, + "grad_norm": 1.8272813558578491, + "learning_rate": 4.7742880944346427e-05, + "loss": 4.6324, + "step": 22919 + }, + { + "epoch": 0.13631173280045675, + "grad_norm": 2.327012777328491, + "learning_rate": 4.7742686985795316e-05, + "loss": 4.3851, + "step": 22920 + }, + { + "epoch": 0.13631768008373774, + "grad_norm": 2.035224199295044, + "learning_rate": 4.7742493019304965e-05, + "loss": 4.2965, + "step": 22921 + }, + { + "epoch": 0.13632362736701875, + "grad_norm": 2.3920044898986816, + "learning_rate": 4.774229904487546e-05, + "loss": 4.237, + "step": 22922 + }, + { + "epoch": 0.13632957465029974, + "grad_norm": 2.3279507160186768, + "learning_rate": 4.7742105062506835e-05, + "loss": 4.3676, + "step": 22923 + }, + { + "epoch": 0.13633552193358073, + "grad_norm": 2.360509157180786, + "learning_rate": 4.7741911072199185e-05, + "loss": 4.1116, + "step": 22924 + }, + { + "epoch": 0.13634146921686174, + "grad_norm": 2.3977739810943604, + "learning_rate": 4.7741717073952573e-05, + "loss": 4.4254, + "step": 22925 + }, + { + "epoch": 0.13634741650014273, + "grad_norm": 2.2043890953063965, + "learning_rate": 4.774152306776706e-05, + "loss": 4.3602, + "step": 22926 + }, + { + "epoch": 0.13635336378342372, + "grad_norm": 2.264444589614868, + "learning_rate": 4.7741329053642714e-05, + "loss": 4.3561, + "step": 22927 + }, + { + "epoch": 0.13635931106670474, + "grad_norm": 1.9636424779891968, + "learning_rate": 4.7741135031579596e-05, + "loss": 4.9631, + "step": 22928 + }, + { + "epoch": 0.13636525834998572, + "grad_norm": 1.9803466796875, + "learning_rate": 4.77409410015778e-05, + "loss": 4.4919, + "step": 22929 + }, + { + "epoch": 0.1363712056332667, + "grad_norm": 2.3046467304229736, + "learning_rate": 4.774074696363736e-05, + "loss": 4.7812, + "step": 22930 + }, + { + "epoch": 0.13637715291654773, + "grad_norm": 1.8447179794311523, + "learning_rate": 4.774055291775837e-05, + "loss": 4.7631, + "step": 22931 + }, + { + "epoch": 0.13638310019982872, + "grad_norm": 1.7349412441253662, + "learning_rate": 4.774035886394089e-05, + "loss": 4.7341, + "step": 22932 + }, + { + "epoch": 0.1363890474831097, + "grad_norm": 1.751775860786438, + "learning_rate": 4.774016480218498e-05, + "loss": 4.9051, + "step": 22933 + }, + { + "epoch": 0.13639499476639072, + "grad_norm": 1.6568492650985718, + "learning_rate": 4.773997073249071e-05, + "loss": 4.9236, + "step": 22934 + }, + { + "epoch": 0.1364009420496717, + "grad_norm": 1.6315816640853882, + "learning_rate": 4.773977665485816e-05, + "loss": 5.0631, + "step": 22935 + }, + { + "epoch": 0.1364068893329527, + "grad_norm": 1.7680082321166992, + "learning_rate": 4.773958256928739e-05, + "loss": 4.7632, + "step": 22936 + }, + { + "epoch": 0.1364128366162337, + "grad_norm": 1.656140923500061, + "learning_rate": 4.773938847577846e-05, + "loss": 4.7978, + "step": 22937 + }, + { + "epoch": 0.1364187838995147, + "grad_norm": 1.9236876964569092, + "learning_rate": 4.773919437433144e-05, + "loss": 4.5575, + "step": 22938 + }, + { + "epoch": 0.1364247311827957, + "grad_norm": 1.98481023311615, + "learning_rate": 4.773900026494641e-05, + "loss": 4.4456, + "step": 22939 + }, + { + "epoch": 0.1364306784660767, + "grad_norm": 1.494399070739746, + "learning_rate": 4.773880614762343e-05, + "loss": 5.3057, + "step": 22940 + }, + { + "epoch": 0.1364366257493577, + "grad_norm": 1.972229242324829, + "learning_rate": 4.773861202236257e-05, + "loss": 4.3849, + "step": 22941 + }, + { + "epoch": 0.13644257303263868, + "grad_norm": 2.0766615867614746, + "learning_rate": 4.773841788916389e-05, + "loss": 4.4249, + "step": 22942 + }, + { + "epoch": 0.1364485203159197, + "grad_norm": 1.9418238401412964, + "learning_rate": 4.773822374802747e-05, + "loss": 4.577, + "step": 22943 + }, + { + "epoch": 0.13645446759920069, + "grad_norm": 2.066725254058838, + "learning_rate": 4.773802959895336e-05, + "loss": 4.3563, + "step": 22944 + }, + { + "epoch": 0.13646041488248167, + "grad_norm": 2.948639154434204, + "learning_rate": 4.773783544194165e-05, + "loss": 3.2644, + "step": 22945 + }, + { + "epoch": 0.1364663621657627, + "grad_norm": 2.065586805343628, + "learning_rate": 4.7737641276992385e-05, + "loss": 5.9715, + "step": 22946 + }, + { + "epoch": 0.13647230944904368, + "grad_norm": 2.169130325317383, + "learning_rate": 4.7737447104105645e-05, + "loss": 4.9516, + "step": 22947 + }, + { + "epoch": 0.13647825673232467, + "grad_norm": 2.4133553504943848, + "learning_rate": 4.773725292328151e-05, + "loss": 5.2266, + "step": 22948 + }, + { + "epoch": 0.13648420401560568, + "grad_norm": 2.4718146324157715, + "learning_rate": 4.773705873452002e-05, + "loss": 5.1842, + "step": 22949 + }, + { + "epoch": 0.13649015129888667, + "grad_norm": 1.8822194337844849, + "learning_rate": 4.773686453782127e-05, + "loss": 4.9297, + "step": 22950 + }, + { + "epoch": 0.13649609858216766, + "grad_norm": 1.8627861738204956, + "learning_rate": 4.773667033318531e-05, + "loss": 4.682, + "step": 22951 + }, + { + "epoch": 0.13650204586544867, + "grad_norm": 2.1915957927703857, + "learning_rate": 4.773647612061222e-05, + "loss": 4.5292, + "step": 22952 + }, + { + "epoch": 0.13650799314872966, + "grad_norm": 2.182401657104492, + "learning_rate": 4.773628190010205e-05, + "loss": 4.6416, + "step": 22953 + }, + { + "epoch": 0.13651394043201065, + "grad_norm": 2.020988941192627, + "learning_rate": 4.773608767165488e-05, + "loss": 4.5698, + "step": 22954 + }, + { + "epoch": 0.13651988771529167, + "grad_norm": 1.5788037776947021, + "learning_rate": 4.773589343527078e-05, + "loss": 5.0962, + "step": 22955 + }, + { + "epoch": 0.13652583499857265, + "grad_norm": 1.929002285003662, + "learning_rate": 4.773569919094982e-05, + "loss": 4.7789, + "step": 22956 + }, + { + "epoch": 0.13653178228185364, + "grad_norm": 1.4314018487930298, + "learning_rate": 4.773550493869206e-05, + "loss": 5.1814, + "step": 22957 + }, + { + "epoch": 0.13653772956513466, + "grad_norm": 1.3779473304748535, + "learning_rate": 4.7735310678497566e-05, + "loss": 5.3468, + "step": 22958 + }, + { + "epoch": 0.13654367684841565, + "grad_norm": 1.543843150138855, + "learning_rate": 4.773511641036641e-05, + "loss": 5.2539, + "step": 22959 + }, + { + "epoch": 0.13654962413169663, + "grad_norm": 1.3671090602874756, + "learning_rate": 4.773492213429866e-05, + "loss": 5.2174, + "step": 22960 + }, + { + "epoch": 0.13655557141497765, + "grad_norm": 1.6130348443984985, + "learning_rate": 4.7734727850294386e-05, + "loss": 5.3554, + "step": 22961 + }, + { + "epoch": 0.13656151869825864, + "grad_norm": 1.4536763429641724, + "learning_rate": 4.773453355835365e-05, + "loss": 5.1686, + "step": 22962 + }, + { + "epoch": 0.13656746598153963, + "grad_norm": 1.4020705223083496, + "learning_rate": 4.773433925847652e-05, + "loss": 5.1832, + "step": 22963 + }, + { + "epoch": 0.13657341326482064, + "grad_norm": 1.5963356494903564, + "learning_rate": 4.773414495066308e-05, + "loss": 5.2799, + "step": 22964 + }, + { + "epoch": 0.13657936054810163, + "grad_norm": 1.235477328300476, + "learning_rate": 4.773395063491338e-05, + "loss": 5.3078, + "step": 22965 + }, + { + "epoch": 0.13658530783138262, + "grad_norm": 1.658551812171936, + "learning_rate": 4.7733756311227484e-05, + "loss": 4.8935, + "step": 22966 + }, + { + "epoch": 0.13659125511466363, + "grad_norm": 1.3750555515289307, + "learning_rate": 4.773356197960548e-05, + "loss": 5.4716, + "step": 22967 + }, + { + "epoch": 0.13659720239794462, + "grad_norm": 1.368320107460022, + "learning_rate": 4.773336764004742e-05, + "loss": 5.3549, + "step": 22968 + }, + { + "epoch": 0.1366031496812256, + "grad_norm": 1.6175824403762817, + "learning_rate": 4.773317329255337e-05, + "loss": 5.6482, + "step": 22969 + }, + { + "epoch": 0.13660909696450663, + "grad_norm": 1.5855069160461426, + "learning_rate": 4.7732978937123404e-05, + "loss": 4.8048, + "step": 22970 + }, + { + "epoch": 0.13661504424778761, + "grad_norm": 1.2763618230819702, + "learning_rate": 4.77327845737576e-05, + "loss": 5.3114, + "step": 22971 + }, + { + "epoch": 0.1366209915310686, + "grad_norm": 1.296797275543213, + "learning_rate": 4.773259020245601e-05, + "loss": 5.2154, + "step": 22972 + }, + { + "epoch": 0.13662693881434962, + "grad_norm": 1.6255276203155518, + "learning_rate": 4.7732395823218714e-05, + "loss": 4.7173, + "step": 22973 + }, + { + "epoch": 0.1366328860976306, + "grad_norm": 1.6712839603424072, + "learning_rate": 4.7732201436045764e-05, + "loss": 4.7129, + "step": 22974 + }, + { + "epoch": 0.1366388333809116, + "grad_norm": 1.3639626502990723, + "learning_rate": 4.773200704093724e-05, + "loss": 5.3616, + "step": 22975 + }, + { + "epoch": 0.13664478066419258, + "grad_norm": 1.5322916507720947, + "learning_rate": 4.773181263789321e-05, + "loss": 4.9117, + "step": 22976 + }, + { + "epoch": 0.1366507279474736, + "grad_norm": 1.5231655836105347, + "learning_rate": 4.7731618226913735e-05, + "loss": 5.3278, + "step": 22977 + }, + { + "epoch": 0.1366566752307546, + "grad_norm": 1.610016942024231, + "learning_rate": 4.7731423807998896e-05, + "loss": 4.8782, + "step": 22978 + }, + { + "epoch": 0.13666262251403558, + "grad_norm": 1.578951358795166, + "learning_rate": 4.773122938114875e-05, + "loss": 5.4874, + "step": 22979 + }, + { + "epoch": 0.1366685697973166, + "grad_norm": 1.7087042331695557, + "learning_rate": 4.773103494636335e-05, + "loss": 5.4259, + "step": 22980 + }, + { + "epoch": 0.13667451708059758, + "grad_norm": 1.4179787635803223, + "learning_rate": 4.773084050364279e-05, + "loss": 5.3227, + "step": 22981 + }, + { + "epoch": 0.13668046436387857, + "grad_norm": 1.6982066631317139, + "learning_rate": 4.773064605298714e-05, + "loss": 4.9789, + "step": 22982 + }, + { + "epoch": 0.13668641164715958, + "grad_norm": 1.6331787109375, + "learning_rate": 4.773045159439644e-05, + "loss": 5.3524, + "step": 22983 + }, + { + "epoch": 0.13669235893044057, + "grad_norm": 1.5722705125808716, + "learning_rate": 4.773025712787078e-05, + "loss": 5.2852, + "step": 22984 + }, + { + "epoch": 0.13669830621372156, + "grad_norm": 1.553524136543274, + "learning_rate": 4.773006265341023e-05, + "loss": 5.3803, + "step": 22985 + }, + { + "epoch": 0.13670425349700258, + "grad_norm": 1.6696399450302124, + "learning_rate": 4.772986817101484e-05, + "loss": 5.1719, + "step": 22986 + }, + { + "epoch": 0.13671020078028356, + "grad_norm": 1.468403935432434, + "learning_rate": 4.772967368068469e-05, + "loss": 5.3468, + "step": 22987 + }, + { + "epoch": 0.13671614806356455, + "grad_norm": 1.5586446523666382, + "learning_rate": 4.772947918241985e-05, + "loss": 5.3733, + "step": 22988 + }, + { + "epoch": 0.13672209534684557, + "grad_norm": 1.549392819404602, + "learning_rate": 4.7729284676220385e-05, + "loss": 5.4622, + "step": 22989 + }, + { + "epoch": 0.13672804263012656, + "grad_norm": 1.4469774961471558, + "learning_rate": 4.772909016208636e-05, + "loss": 5.3998, + "step": 22990 + }, + { + "epoch": 0.13673398991340754, + "grad_norm": 1.3361252546310425, + "learning_rate": 4.7728895640017833e-05, + "loss": 5.1723, + "step": 22991 + }, + { + "epoch": 0.13673993719668856, + "grad_norm": 1.5584652423858643, + "learning_rate": 4.7728701110014894e-05, + "loss": 5.03, + "step": 22992 + }, + { + "epoch": 0.13674588447996955, + "grad_norm": 1.319245457649231, + "learning_rate": 4.7728506572077594e-05, + "loss": 5.0349, + "step": 22993 + }, + { + "epoch": 0.13675183176325054, + "grad_norm": 1.6574468612670898, + "learning_rate": 4.7728312026206015e-05, + "loss": 5.3401, + "step": 22994 + }, + { + "epoch": 0.13675777904653155, + "grad_norm": 1.564598560333252, + "learning_rate": 4.772811747240022e-05, + "loss": 5.3047, + "step": 22995 + }, + { + "epoch": 0.13676372632981254, + "grad_norm": 1.5692095756530762, + "learning_rate": 4.772792291066026e-05, + "loss": 5.1632, + "step": 22996 + }, + { + "epoch": 0.13676967361309353, + "grad_norm": 1.3904811143875122, + "learning_rate": 4.772772834098622e-05, + "loss": 5.2429, + "step": 22997 + }, + { + "epoch": 0.13677562089637454, + "grad_norm": 1.6455345153808594, + "learning_rate": 4.7727533763378175e-05, + "loss": 5.164, + "step": 22998 + }, + { + "epoch": 0.13678156817965553, + "grad_norm": 1.384092092514038, + "learning_rate": 4.772733917783618e-05, + "loss": 4.9753, + "step": 22999 + }, + { + "epoch": 0.13678751546293652, + "grad_norm": 1.5056332349777222, + "learning_rate": 4.77271445843603e-05, + "loss": 5.008, + "step": 23000 + }, + { + "epoch": 0.13679346274621754, + "grad_norm": 1.6766334772109985, + "learning_rate": 4.772694998295061e-05, + "loss": 5.2156, + "step": 23001 + }, + { + "epoch": 0.13679941002949852, + "grad_norm": 1.517899513244629, + "learning_rate": 4.772675537360718e-05, + "loss": 5.4637, + "step": 23002 + }, + { + "epoch": 0.1368053573127795, + "grad_norm": 1.539090633392334, + "learning_rate": 4.772656075633007e-05, + "loss": 4.9678, + "step": 23003 + }, + { + "epoch": 0.13681130459606053, + "grad_norm": 1.5403459072113037, + "learning_rate": 4.772636613111936e-05, + "loss": 5.1884, + "step": 23004 + }, + { + "epoch": 0.13681725187934152, + "grad_norm": 1.4680373668670654, + "learning_rate": 4.7726171497975106e-05, + "loss": 5.118, + "step": 23005 + }, + { + "epoch": 0.1368231991626225, + "grad_norm": 1.6800905466079712, + "learning_rate": 4.7725976856897376e-05, + "loss": 5.5796, + "step": 23006 + }, + { + "epoch": 0.13682914644590352, + "grad_norm": 1.6708084344863892, + "learning_rate": 4.7725782207886246e-05, + "loss": 4.8021, + "step": 23007 + }, + { + "epoch": 0.1368350937291845, + "grad_norm": 1.3744218349456787, + "learning_rate": 4.772558755094177e-05, + "loss": 5.2993, + "step": 23008 + }, + { + "epoch": 0.1368410410124655, + "grad_norm": 1.6822494268417358, + "learning_rate": 4.772539288606405e-05, + "loss": 4.8643, + "step": 23009 + }, + { + "epoch": 0.1368469882957465, + "grad_norm": 1.7003953456878662, + "learning_rate": 4.772519821325311e-05, + "loss": 5.0189, + "step": 23010 + }, + { + "epoch": 0.1368529355790275, + "grad_norm": 1.5518492460250854, + "learning_rate": 4.772500353250905e-05, + "loss": 5.2159, + "step": 23011 + }, + { + "epoch": 0.1368588828623085, + "grad_norm": 1.64122474193573, + "learning_rate": 4.772480884383191e-05, + "loss": 4.8965, + "step": 23012 + }, + { + "epoch": 0.1368648301455895, + "grad_norm": 1.6162265539169312, + "learning_rate": 4.772461414722179e-05, + "loss": 5.1521, + "step": 23013 + }, + { + "epoch": 0.1368707774288705, + "grad_norm": 1.7200851440429688, + "learning_rate": 4.7724419442678736e-05, + "loss": 5.1694, + "step": 23014 + }, + { + "epoch": 0.13687672471215148, + "grad_norm": 1.4717456102371216, + "learning_rate": 4.772422473020283e-05, + "loss": 5.1999, + "step": 23015 + }, + { + "epoch": 0.1368826719954325, + "grad_norm": 1.6320459842681885, + "learning_rate": 4.772403000979412e-05, + "loss": 4.9127, + "step": 23016 + }, + { + "epoch": 0.13688861927871349, + "grad_norm": 1.5466574430465698, + "learning_rate": 4.772383528145269e-05, + "loss": 5.0589, + "step": 23017 + }, + { + "epoch": 0.13689456656199447, + "grad_norm": 1.7745109796524048, + "learning_rate": 4.77236405451786e-05, + "loss": 4.9933, + "step": 23018 + }, + { + "epoch": 0.1369005138452755, + "grad_norm": 1.4493471384048462, + "learning_rate": 4.772344580097193e-05, + "loss": 5.0996, + "step": 23019 + }, + { + "epoch": 0.13690646112855648, + "grad_norm": 1.4859240055084229, + "learning_rate": 4.7723251048832734e-05, + "loss": 5.2686, + "step": 23020 + }, + { + "epoch": 0.13691240841183747, + "grad_norm": 1.6349207162857056, + "learning_rate": 4.7723056288761084e-05, + "loss": 5.1644, + "step": 23021 + }, + { + "epoch": 0.13691835569511848, + "grad_norm": 1.5266002416610718, + "learning_rate": 4.772286152075706e-05, + "loss": 4.988, + "step": 23022 + }, + { + "epoch": 0.13692430297839947, + "grad_norm": 1.592774748802185, + "learning_rate": 4.772266674482071e-05, + "loss": 5.2701, + "step": 23023 + }, + { + "epoch": 0.13693025026168046, + "grad_norm": 1.4789998531341553, + "learning_rate": 4.772247196095211e-05, + "loss": 5.1181, + "step": 23024 + }, + { + "epoch": 0.13693619754496147, + "grad_norm": 1.4374842643737793, + "learning_rate": 4.772227716915134e-05, + "loss": 4.5882, + "step": 23025 + }, + { + "epoch": 0.13694214482824246, + "grad_norm": 1.682689905166626, + "learning_rate": 4.772208236941845e-05, + "loss": 5.5035, + "step": 23026 + }, + { + "epoch": 0.13694809211152345, + "grad_norm": 1.5549851655960083, + "learning_rate": 4.772188756175352e-05, + "loss": 5.5484, + "step": 23027 + }, + { + "epoch": 0.13695403939480447, + "grad_norm": 1.5018965005874634, + "learning_rate": 4.772169274615661e-05, + "loss": 5.0517, + "step": 23028 + }, + { + "epoch": 0.13695998667808545, + "grad_norm": 1.648807168006897, + "learning_rate": 4.77214979226278e-05, + "loss": 5.1527, + "step": 23029 + }, + { + "epoch": 0.13696593396136644, + "grad_norm": 1.6059378385543823, + "learning_rate": 4.772130309116714e-05, + "loss": 5.0003, + "step": 23030 + }, + { + "epoch": 0.13697188124464746, + "grad_norm": 1.368412971496582, + "learning_rate": 4.772110825177472e-05, + "loss": 5.1025, + "step": 23031 + }, + { + "epoch": 0.13697782852792845, + "grad_norm": 1.627031922340393, + "learning_rate": 4.7720913404450576e-05, + "loss": 5.1206, + "step": 23032 + }, + { + "epoch": 0.13698377581120944, + "grad_norm": 1.654307246208191, + "learning_rate": 4.772071854919481e-05, + "loss": 4.8403, + "step": 23033 + }, + { + "epoch": 0.13698972309449042, + "grad_norm": 1.658847451210022, + "learning_rate": 4.772052368600748e-05, + "loss": 5.2089, + "step": 23034 + }, + { + "epoch": 0.13699567037777144, + "grad_norm": 1.6999456882476807, + "learning_rate": 4.772032881488864e-05, + "loss": 5.2022, + "step": 23035 + }, + { + "epoch": 0.13700161766105243, + "grad_norm": 1.2880116701126099, + "learning_rate": 4.772013393583837e-05, + "loss": 5.4331, + "step": 23036 + }, + { + "epoch": 0.13700756494433342, + "grad_norm": 1.4780634641647339, + "learning_rate": 4.7719939048856735e-05, + "loss": 5.034, + "step": 23037 + }, + { + "epoch": 0.13701351222761443, + "grad_norm": 1.5058658123016357, + "learning_rate": 4.771974415394381e-05, + "loss": 5.4403, + "step": 23038 + }, + { + "epoch": 0.13701945951089542, + "grad_norm": 1.4378021955490112, + "learning_rate": 4.771954925109965e-05, + "loss": 5.1769, + "step": 23039 + }, + { + "epoch": 0.1370254067941764, + "grad_norm": 1.6010862588882446, + "learning_rate": 4.7719354340324337e-05, + "loss": 5.4728, + "step": 23040 + }, + { + "epoch": 0.13703135407745742, + "grad_norm": 1.6916764974594116, + "learning_rate": 4.7719159421617924e-05, + "loss": 4.9489, + "step": 23041 + }, + { + "epoch": 0.1370373013607384, + "grad_norm": 1.4737353324890137, + "learning_rate": 4.771896449498049e-05, + "loss": 4.8795, + "step": 23042 + }, + { + "epoch": 0.1370432486440194, + "grad_norm": 1.5808194875717163, + "learning_rate": 4.7718769560412105e-05, + "loss": 4.8375, + "step": 23043 + }, + { + "epoch": 0.13704919592730042, + "grad_norm": 1.3700640201568604, + "learning_rate": 4.771857461791283e-05, + "loss": 4.8135, + "step": 23044 + }, + { + "epoch": 0.1370551432105814, + "grad_norm": 1.1938998699188232, + "learning_rate": 4.7718379667482735e-05, + "loss": 4.8199, + "step": 23045 + }, + { + "epoch": 0.1370610904938624, + "grad_norm": 1.3598859310150146, + "learning_rate": 4.7718184709121885e-05, + "loss": 4.6871, + "step": 23046 + }, + { + "epoch": 0.1370670377771434, + "grad_norm": 1.2303695678710938, + "learning_rate": 4.7717989742830354e-05, + "loss": 4.7421, + "step": 23047 + }, + { + "epoch": 0.1370729850604244, + "grad_norm": 1.2872051000595093, + "learning_rate": 4.77177947686082e-05, + "loss": 4.6669, + "step": 23048 + }, + { + "epoch": 0.13707893234370538, + "grad_norm": 1.2523759603500366, + "learning_rate": 4.771759978645551e-05, + "loss": 4.6359, + "step": 23049 + }, + { + "epoch": 0.1370848796269864, + "grad_norm": 1.2552485466003418, + "learning_rate": 4.771740479637234e-05, + "loss": 4.7362, + "step": 23050 + }, + { + "epoch": 0.1370908269102674, + "grad_norm": 1.434870958328247, + "learning_rate": 4.771720979835875e-05, + "loss": 4.812, + "step": 23051 + }, + { + "epoch": 0.13709677419354838, + "grad_norm": 1.6004719734191895, + "learning_rate": 4.771701479241483e-05, + "loss": 5.1579, + "step": 23052 + }, + { + "epoch": 0.1371027214768294, + "grad_norm": 1.5982462167739868, + "learning_rate": 4.7716819778540625e-05, + "loss": 4.8038, + "step": 23053 + }, + { + "epoch": 0.13710866876011038, + "grad_norm": 1.7509288787841797, + "learning_rate": 4.7716624756736215e-05, + "loss": 5.091, + "step": 23054 + }, + { + "epoch": 0.13711461604339137, + "grad_norm": 1.729748010635376, + "learning_rate": 4.7716429727001665e-05, + "loss": 5.0755, + "step": 23055 + }, + { + "epoch": 0.13712056332667238, + "grad_norm": 1.6167495250701904, + "learning_rate": 4.7716234689337044e-05, + "loss": 5.0602, + "step": 23056 + }, + { + "epoch": 0.13712651060995337, + "grad_norm": 1.7035753726959229, + "learning_rate": 4.771603964374242e-05, + "loss": 5.1877, + "step": 23057 + }, + { + "epoch": 0.13713245789323436, + "grad_norm": 1.5923055410385132, + "learning_rate": 4.7715844590217865e-05, + "loss": 4.6043, + "step": 23058 + }, + { + "epoch": 0.13713840517651538, + "grad_norm": 1.551894187927246, + "learning_rate": 4.771564952876344e-05, + "loss": 5.0746, + "step": 23059 + }, + { + "epoch": 0.13714435245979636, + "grad_norm": 1.8965848684310913, + "learning_rate": 4.771545445937921e-05, + "loss": 4.6152, + "step": 23060 + }, + { + "epoch": 0.13715029974307735, + "grad_norm": 1.630903720855713, + "learning_rate": 4.771525938206527e-05, + "loss": 5.3948, + "step": 23061 + }, + { + "epoch": 0.13715624702635837, + "grad_norm": 1.7285772562026978, + "learning_rate": 4.771506429682166e-05, + "loss": 5.2535, + "step": 23062 + }, + { + "epoch": 0.13716219430963936, + "grad_norm": 1.789049506187439, + "learning_rate": 4.771486920364844e-05, + "loss": 4.7232, + "step": 23063 + }, + { + "epoch": 0.13716814159292035, + "grad_norm": 1.6774955987930298, + "learning_rate": 4.7714674102545706e-05, + "loss": 5.424, + "step": 23064 + }, + { + "epoch": 0.13717408887620136, + "grad_norm": 1.9038479328155518, + "learning_rate": 4.771447899351351e-05, + "loss": 4.7868, + "step": 23065 + }, + { + "epoch": 0.13718003615948235, + "grad_norm": 1.906087875366211, + "learning_rate": 4.771428387655192e-05, + "loss": 4.7115, + "step": 23066 + }, + { + "epoch": 0.13718598344276334, + "grad_norm": 1.786908745765686, + "learning_rate": 4.771408875166103e-05, + "loss": 4.6741, + "step": 23067 + }, + { + "epoch": 0.13719193072604435, + "grad_norm": 1.8421779870986938, + "learning_rate": 4.771389361884086e-05, + "loss": 4.9338, + "step": 23068 + }, + { + "epoch": 0.13719787800932534, + "grad_norm": 1.8146562576293945, + "learning_rate": 4.7713698478091516e-05, + "loss": 4.5556, + "step": 23069 + }, + { + "epoch": 0.13720382529260633, + "grad_norm": 1.4694918394088745, + "learning_rate": 4.7713503329413056e-05, + "loss": 5.611, + "step": 23070 + }, + { + "epoch": 0.13720977257588735, + "grad_norm": 1.553694248199463, + "learning_rate": 4.771330817280554e-05, + "loss": 5.6062, + "step": 23071 + }, + { + "epoch": 0.13721571985916833, + "grad_norm": 1.293204426765442, + "learning_rate": 4.771311300826905e-05, + "loss": 5.7157, + "step": 23072 + }, + { + "epoch": 0.13722166714244932, + "grad_norm": 1.369480013847351, + "learning_rate": 4.771291783580364e-05, + "loss": 5.754, + "step": 23073 + }, + { + "epoch": 0.13722761442573034, + "grad_norm": 1.7480628490447998, + "learning_rate": 4.771272265540939e-05, + "loss": 5.4179, + "step": 23074 + }, + { + "epoch": 0.13723356170901133, + "grad_norm": 1.604788064956665, + "learning_rate": 4.771252746708636e-05, + "loss": 5.3766, + "step": 23075 + }, + { + "epoch": 0.13723950899229231, + "grad_norm": 1.721793532371521, + "learning_rate": 4.7712332270834626e-05, + "loss": 4.9839, + "step": 23076 + }, + { + "epoch": 0.13724545627557333, + "grad_norm": 1.528327226638794, + "learning_rate": 4.771213706665425e-05, + "loss": 5.427, + "step": 23077 + }, + { + "epoch": 0.13725140355885432, + "grad_norm": 1.425625205039978, + "learning_rate": 4.7711941854545295e-05, + "loss": 5.6246, + "step": 23078 + }, + { + "epoch": 0.1372573508421353, + "grad_norm": 1.9369326829910278, + "learning_rate": 4.771174663450784e-05, + "loss": 5.5038, + "step": 23079 + }, + { + "epoch": 0.13726329812541632, + "grad_norm": 1.906792163848877, + "learning_rate": 4.771155140654195e-05, + "loss": 5.5361, + "step": 23080 + }, + { + "epoch": 0.1372692454086973, + "grad_norm": 1.7495099306106567, + "learning_rate": 4.7711356170647694e-05, + "loss": 4.9809, + "step": 23081 + }, + { + "epoch": 0.1372751926919783, + "grad_norm": 1.5589921474456787, + "learning_rate": 4.771116092682514e-05, + "loss": 4.9627, + "step": 23082 + }, + { + "epoch": 0.13728113997525931, + "grad_norm": 1.7177824974060059, + "learning_rate": 4.771096567507435e-05, + "loss": 5.403, + "step": 23083 + }, + { + "epoch": 0.1372870872585403, + "grad_norm": 1.5470298528671265, + "learning_rate": 4.7710770415395395e-05, + "loss": 6.0237, + "step": 23084 + }, + { + "epoch": 0.1372930345418213, + "grad_norm": 1.5613659620285034, + "learning_rate": 4.771057514778835e-05, + "loss": 5.7272, + "step": 23085 + }, + { + "epoch": 0.1372989818251023, + "grad_norm": 1.7003729343414307, + "learning_rate": 4.771037987225328e-05, + "loss": 4.9305, + "step": 23086 + }, + { + "epoch": 0.1373049291083833, + "grad_norm": 2.587393283843994, + "learning_rate": 4.771018458879023e-05, + "loss": 4.9075, + "step": 23087 + }, + { + "epoch": 0.13731087639166428, + "grad_norm": 2.208308696746826, + "learning_rate": 4.770998929739931e-05, + "loss": 4.9141, + "step": 23088 + }, + { + "epoch": 0.1373168236749453, + "grad_norm": 2.0532326698303223, + "learning_rate": 4.770979399808057e-05, + "loss": 5.0574, + "step": 23089 + }, + { + "epoch": 0.1373227709582263, + "grad_norm": 1.86672043800354, + "learning_rate": 4.770959869083406e-05, + "loss": 4.9269, + "step": 23090 + }, + { + "epoch": 0.13732871824150727, + "grad_norm": 1.8310163021087646, + "learning_rate": 4.7709403375659874e-05, + "loss": 4.901, + "step": 23091 + }, + { + "epoch": 0.13733466552478826, + "grad_norm": 1.8886011838912964, + "learning_rate": 4.7709208052558065e-05, + "loss": 4.8325, + "step": 23092 + }, + { + "epoch": 0.13734061280806928, + "grad_norm": 1.9192320108413696, + "learning_rate": 4.770901272152871e-05, + "loss": 4.8783, + "step": 23093 + }, + { + "epoch": 0.13734656009135027, + "grad_norm": 2.0797886848449707, + "learning_rate": 4.770881738257187e-05, + "loss": 4.6473, + "step": 23094 + }, + { + "epoch": 0.13735250737463126, + "grad_norm": 2.2008140087127686, + "learning_rate": 4.770862203568762e-05, + "loss": 4.7291, + "step": 23095 + }, + { + "epoch": 0.13735845465791227, + "grad_norm": 2.002549886703491, + "learning_rate": 4.770842668087602e-05, + "loss": 4.5471, + "step": 23096 + }, + { + "epoch": 0.13736440194119326, + "grad_norm": 1.7748942375183105, + "learning_rate": 4.770823131813714e-05, + "loss": 4.5844, + "step": 23097 + }, + { + "epoch": 0.13737034922447425, + "grad_norm": 2.128469467163086, + "learning_rate": 4.7708035947471065e-05, + "loss": 4.7365, + "step": 23098 + }, + { + "epoch": 0.13737629650775526, + "grad_norm": 1.9279344081878662, + "learning_rate": 4.770784056887784e-05, + "loss": 4.5673, + "step": 23099 + }, + { + "epoch": 0.13738224379103625, + "grad_norm": 1.896638035774231, + "learning_rate": 4.770764518235754e-05, + "loss": 4.5956, + "step": 23100 + }, + { + "epoch": 0.13738819107431724, + "grad_norm": 2.4768176078796387, + "learning_rate": 4.770744978791024e-05, + "loss": 4.5071, + "step": 23101 + }, + { + "epoch": 0.13739413835759826, + "grad_norm": 2.0828697681427, + "learning_rate": 4.7707254385536e-05, + "loss": 4.5681, + "step": 23102 + }, + { + "epoch": 0.13740008564087924, + "grad_norm": 2.197688579559326, + "learning_rate": 4.7707058975234895e-05, + "loss": 4.5111, + "step": 23103 + }, + { + "epoch": 0.13740603292416023, + "grad_norm": 2.0053935050964355, + "learning_rate": 4.7706863557007e-05, + "loss": 4.5441, + "step": 23104 + }, + { + "epoch": 0.13741198020744125, + "grad_norm": 2.247901439666748, + "learning_rate": 4.770666813085236e-05, + "loss": 4.5538, + "step": 23105 + }, + { + "epoch": 0.13741792749072224, + "grad_norm": 2.1666789054870605, + "learning_rate": 4.770647269677106e-05, + "loss": 4.7712, + "step": 23106 + }, + { + "epoch": 0.13742387477400322, + "grad_norm": 2.0191304683685303, + "learning_rate": 4.770627725476317e-05, + "loss": 4.5244, + "step": 23107 + }, + { + "epoch": 0.13742982205728424, + "grad_norm": 1.9388200044631958, + "learning_rate": 4.770608180482874e-05, + "loss": 4.6272, + "step": 23108 + }, + { + "epoch": 0.13743576934056523, + "grad_norm": 2.0467464923858643, + "learning_rate": 4.7705886346967865e-05, + "loss": 4.5852, + "step": 23109 + }, + { + "epoch": 0.13744171662384622, + "grad_norm": 2.0310070514678955, + "learning_rate": 4.770569088118059e-05, + "loss": 4.3915, + "step": 23110 + }, + { + "epoch": 0.13744766390712723, + "grad_norm": 2.1216657161712646, + "learning_rate": 4.770549540746701e-05, + "loss": 4.4549, + "step": 23111 + }, + { + "epoch": 0.13745361119040822, + "grad_norm": 1.9715701341629028, + "learning_rate": 4.770529992582715e-05, + "loss": 4.8822, + "step": 23112 + }, + { + "epoch": 0.1374595584736892, + "grad_norm": 2.0956320762634277, + "learning_rate": 4.7705104436261124e-05, + "loss": 5.3927, + "step": 23113 + }, + { + "epoch": 0.13746550575697022, + "grad_norm": 1.6396405696868896, + "learning_rate": 4.770490893876898e-05, + "loss": 5.5089, + "step": 23114 + }, + { + "epoch": 0.1374714530402512, + "grad_norm": 1.8379572629928589, + "learning_rate": 4.7704713433350777e-05, + "loss": 5.9133, + "step": 23115 + }, + { + "epoch": 0.1374774003235322, + "grad_norm": 1.6787012815475464, + "learning_rate": 4.7704517920006594e-05, + "loss": 5.4497, + "step": 23116 + }, + { + "epoch": 0.13748334760681322, + "grad_norm": 1.6657997369766235, + "learning_rate": 4.77043223987365e-05, + "loss": 5.2093, + "step": 23117 + }, + { + "epoch": 0.1374892948900942, + "grad_norm": 1.7581418752670288, + "learning_rate": 4.7704126869540565e-05, + "loss": 6.4119, + "step": 23118 + }, + { + "epoch": 0.1374952421733752, + "grad_norm": 1.4436302185058594, + "learning_rate": 4.770393133241885e-05, + "loss": 6.3299, + "step": 23119 + }, + { + "epoch": 0.1375011894566562, + "grad_norm": 1.6737406253814697, + "learning_rate": 4.7703735787371434e-05, + "loss": 5.8634, + "step": 23120 + }, + { + "epoch": 0.1375071367399372, + "grad_norm": 1.5715806484222412, + "learning_rate": 4.7703540234398375e-05, + "loss": 5.7896, + "step": 23121 + }, + { + "epoch": 0.13751308402321819, + "grad_norm": 1.8452152013778687, + "learning_rate": 4.7703344673499744e-05, + "loss": 5.8868, + "step": 23122 + }, + { + "epoch": 0.1375190313064992, + "grad_norm": 1.6291402578353882, + "learning_rate": 4.770314910467561e-05, + "loss": 5.8256, + "step": 23123 + }, + { + "epoch": 0.1375249785897802, + "grad_norm": 1.4301279783248901, + "learning_rate": 4.770295352792604e-05, + "loss": 5.7982, + "step": 23124 + }, + { + "epoch": 0.13753092587306118, + "grad_norm": 1.5949046611785889, + "learning_rate": 4.770275794325111e-05, + "loss": 5.5606, + "step": 23125 + }, + { + "epoch": 0.1375368731563422, + "grad_norm": 1.572860598564148, + "learning_rate": 4.770256235065087e-05, + "loss": 5.1636, + "step": 23126 + }, + { + "epoch": 0.13754282043962318, + "grad_norm": 1.4339121580123901, + "learning_rate": 4.7702366750125405e-05, + "loss": 5.1374, + "step": 23127 + }, + { + "epoch": 0.13754876772290417, + "grad_norm": 1.4290729761123657, + "learning_rate": 4.770217114167478e-05, + "loss": 5.7268, + "step": 23128 + }, + { + "epoch": 0.13755471500618519, + "grad_norm": 1.1300958395004272, + "learning_rate": 4.7701975525299066e-05, + "loss": 5.6887, + "step": 23129 + }, + { + "epoch": 0.13756066228946617, + "grad_norm": 1.1974701881408691, + "learning_rate": 4.7701779900998325e-05, + "loss": 5.6763, + "step": 23130 + }, + { + "epoch": 0.13756660957274716, + "grad_norm": 1.3675005435943604, + "learning_rate": 4.7701584268772614e-05, + "loss": 5.6558, + "step": 23131 + }, + { + "epoch": 0.13757255685602818, + "grad_norm": 1.3302583694458008, + "learning_rate": 4.770138862862203e-05, + "loss": 5.6915, + "step": 23132 + }, + { + "epoch": 0.13757850413930917, + "grad_norm": 1.3415045738220215, + "learning_rate": 4.770119298054662e-05, + "loss": 5.6922, + "step": 23133 + }, + { + "epoch": 0.13758445142259015, + "grad_norm": 1.229663372039795, + "learning_rate": 4.770099732454646e-05, + "loss": 5.7799, + "step": 23134 + }, + { + "epoch": 0.13759039870587117, + "grad_norm": 1.3245000839233398, + "learning_rate": 4.7700801660621614e-05, + "loss": 5.7848, + "step": 23135 + }, + { + "epoch": 0.13759634598915216, + "grad_norm": 1.2835783958435059, + "learning_rate": 4.770060598877215e-05, + "loss": 5.5999, + "step": 23136 + }, + { + "epoch": 0.13760229327243315, + "grad_norm": 1.9270732402801514, + "learning_rate": 4.770041030899814e-05, + "loss": 4.8701, + "step": 23137 + }, + { + "epoch": 0.13760824055571416, + "grad_norm": 1.8123419284820557, + "learning_rate": 4.7700214621299656e-05, + "loss": 5.3828, + "step": 23138 + }, + { + "epoch": 0.13761418783899515, + "grad_norm": 2.0436434745788574, + "learning_rate": 4.770001892567676e-05, + "loss": 4.6098, + "step": 23139 + }, + { + "epoch": 0.13762013512227614, + "grad_norm": 1.4343012571334839, + "learning_rate": 4.769982322212953e-05, + "loss": 5.5587, + "step": 23140 + }, + { + "epoch": 0.13762608240555715, + "grad_norm": 1.266640067100525, + "learning_rate": 4.769962751065801e-05, + "loss": 5.626, + "step": 23141 + }, + { + "epoch": 0.13763202968883814, + "grad_norm": 1.9386495351791382, + "learning_rate": 4.7699431791262296e-05, + "loss": 4.7212, + "step": 23142 + }, + { + "epoch": 0.13763797697211913, + "grad_norm": 2.270129919052124, + "learning_rate": 4.769923606394244e-05, + "loss": 4.7609, + "step": 23143 + }, + { + "epoch": 0.13764392425540015, + "grad_norm": 2.0305488109588623, + "learning_rate": 4.7699040328698516e-05, + "loss": 4.8083, + "step": 23144 + }, + { + "epoch": 0.13764987153868113, + "grad_norm": 2.1791486740112305, + "learning_rate": 4.769884458553059e-05, + "loss": 4.834, + "step": 23145 + }, + { + "epoch": 0.13765581882196212, + "grad_norm": 2.152580738067627, + "learning_rate": 4.769864883443873e-05, + "loss": 4.5418, + "step": 23146 + }, + { + "epoch": 0.13766176610524314, + "grad_norm": 2.2850470542907715, + "learning_rate": 4.769845307542301e-05, + "loss": 4.9344, + "step": 23147 + }, + { + "epoch": 0.13766771338852413, + "grad_norm": 1.745813012123108, + "learning_rate": 4.76982573084835e-05, + "loss": 4.9631, + "step": 23148 + }, + { + "epoch": 0.13767366067180511, + "grad_norm": 1.5848993062973022, + "learning_rate": 4.769806153362025e-05, + "loss": 5.3936, + "step": 23149 + }, + { + "epoch": 0.1376796079550861, + "grad_norm": 1.5276480913162231, + "learning_rate": 4.7697865750833356e-05, + "loss": 5.7806, + "step": 23150 + }, + { + "epoch": 0.13768555523836712, + "grad_norm": 1.3464304208755493, + "learning_rate": 4.769766996012286e-05, + "loss": 5.5572, + "step": 23151 + }, + { + "epoch": 0.1376915025216481, + "grad_norm": 1.375168800354004, + "learning_rate": 4.769747416148885e-05, + "loss": 5.6109, + "step": 23152 + }, + { + "epoch": 0.1376974498049291, + "grad_norm": 1.3537193536758423, + "learning_rate": 4.769727835493138e-05, + "loss": 5.5257, + "step": 23153 + }, + { + "epoch": 0.1377033970882101, + "grad_norm": 1.6656006574630737, + "learning_rate": 4.769708254045053e-05, + "loss": 5.3327, + "step": 23154 + }, + { + "epoch": 0.1377093443714911, + "grad_norm": 1.6092736721038818, + "learning_rate": 4.769688671804635e-05, + "loss": 5.7785, + "step": 23155 + }, + { + "epoch": 0.1377152916547721, + "grad_norm": 1.5005303621292114, + "learning_rate": 4.7696690887718934e-05, + "loss": 5.4944, + "step": 23156 + }, + { + "epoch": 0.1377212389380531, + "grad_norm": 1.6100717782974243, + "learning_rate": 4.7696495049468336e-05, + "loss": 5.3767, + "step": 23157 + }, + { + "epoch": 0.1377271862213341, + "grad_norm": 1.5637480020523071, + "learning_rate": 4.7696299203294626e-05, + "loss": 5.3981, + "step": 23158 + }, + { + "epoch": 0.13773313350461508, + "grad_norm": 1.6407819986343384, + "learning_rate": 4.769610334919787e-05, + "loss": 5.4328, + "step": 23159 + }, + { + "epoch": 0.1377390807878961, + "grad_norm": 1.8828953504562378, + "learning_rate": 4.7695907487178146e-05, + "loss": 5.5127, + "step": 23160 + }, + { + "epoch": 0.13774502807117708, + "grad_norm": 1.5160561800003052, + "learning_rate": 4.7695711617235506e-05, + "loss": 5.3309, + "step": 23161 + }, + { + "epoch": 0.13775097535445807, + "grad_norm": 1.4901509284973145, + "learning_rate": 4.769551573937003e-05, + "loss": 5.4584, + "step": 23162 + }, + { + "epoch": 0.1377569226377391, + "grad_norm": 1.3983137607574463, + "learning_rate": 4.769531985358179e-05, + "loss": 5.6738, + "step": 23163 + }, + { + "epoch": 0.13776286992102008, + "grad_norm": 1.7664490938186646, + "learning_rate": 4.7695123959870834e-05, + "loss": 5.513, + "step": 23164 + }, + { + "epoch": 0.13776881720430106, + "grad_norm": 1.4650641679763794, + "learning_rate": 4.7694928058237255e-05, + "loss": 4.9959, + "step": 23165 + }, + { + "epoch": 0.13777476448758208, + "grad_norm": 1.5515252351760864, + "learning_rate": 4.7694732148681106e-05, + "loss": 5.1419, + "step": 23166 + }, + { + "epoch": 0.13778071177086307, + "grad_norm": 1.459083914756775, + "learning_rate": 4.769453623120247e-05, + "loss": 5.3639, + "step": 23167 + }, + { + "epoch": 0.13778665905414406, + "grad_norm": 1.6032545566558838, + "learning_rate": 4.76943403058014e-05, + "loss": 5.3822, + "step": 23168 + }, + { + "epoch": 0.13779260633742507, + "grad_norm": 1.5436428785324097, + "learning_rate": 4.769414437247797e-05, + "loss": 5.0313, + "step": 23169 + }, + { + "epoch": 0.13779855362070606, + "grad_norm": 1.2577800750732422, + "learning_rate": 4.769394843123225e-05, + "loss": 4.8907, + "step": 23170 + }, + { + "epoch": 0.13780450090398705, + "grad_norm": 1.4654191732406616, + "learning_rate": 4.769375248206431e-05, + "loss": 5.0346, + "step": 23171 + }, + { + "epoch": 0.13781044818726806, + "grad_norm": 1.9576739072799683, + "learning_rate": 4.769355652497421e-05, + "loss": 5.4, + "step": 23172 + }, + { + "epoch": 0.13781639547054905, + "grad_norm": 1.7060799598693848, + "learning_rate": 4.7693360559962027e-05, + "loss": 4.9668, + "step": 23173 + }, + { + "epoch": 0.13782234275383004, + "grad_norm": 1.4705651998519897, + "learning_rate": 4.769316458702782e-05, + "loss": 5.2053, + "step": 23174 + }, + { + "epoch": 0.13782829003711106, + "grad_norm": 1.806314468383789, + "learning_rate": 4.769296860617167e-05, + "loss": 5.5297, + "step": 23175 + }, + { + "epoch": 0.13783423732039204, + "grad_norm": 1.7741440534591675, + "learning_rate": 4.769277261739364e-05, + "loss": 5.569, + "step": 23176 + }, + { + "epoch": 0.13784018460367303, + "grad_norm": 1.4956278800964355, + "learning_rate": 4.7692576620693796e-05, + "loss": 5.2616, + "step": 23177 + }, + { + "epoch": 0.13784613188695405, + "grad_norm": 1.4668684005737305, + "learning_rate": 4.7692380616072205e-05, + "loss": 5.551, + "step": 23178 + }, + { + "epoch": 0.13785207917023504, + "grad_norm": 1.9172862768173218, + "learning_rate": 4.769218460352894e-05, + "loss": 5.072, + "step": 23179 + }, + { + "epoch": 0.13785802645351602, + "grad_norm": 2.3610761165618896, + "learning_rate": 4.769198858306407e-05, + "loss": 4.5511, + "step": 23180 + }, + { + "epoch": 0.13786397373679704, + "grad_norm": 2.099209785461426, + "learning_rate": 4.769179255467766e-05, + "loss": 5.1829, + "step": 23181 + }, + { + "epoch": 0.13786992102007803, + "grad_norm": 1.8222076892852783, + "learning_rate": 4.7691596518369776e-05, + "loss": 5.1451, + "step": 23182 + }, + { + "epoch": 0.13787586830335902, + "grad_norm": 2.129558563232422, + "learning_rate": 4.769140047414049e-05, + "loss": 4.574, + "step": 23183 + }, + { + "epoch": 0.13788181558664003, + "grad_norm": 2.3188533782958984, + "learning_rate": 4.7691204421989876e-05, + "loss": 4.4604, + "step": 23184 + }, + { + "epoch": 0.13788776286992102, + "grad_norm": 2.2996792793273926, + "learning_rate": 4.7691008361918e-05, + "loss": 4.6119, + "step": 23185 + }, + { + "epoch": 0.137893710153202, + "grad_norm": 2.164652109146118, + "learning_rate": 4.769081229392492e-05, + "loss": 4.6286, + "step": 23186 + }, + { + "epoch": 0.13789965743648303, + "grad_norm": 1.9271842241287231, + "learning_rate": 4.769061621801071e-05, + "loss": 4.947, + "step": 23187 + }, + { + "epoch": 0.137905604719764, + "grad_norm": 1.8559855222702026, + "learning_rate": 4.769042013417545e-05, + "loss": 5.1969, + "step": 23188 + }, + { + "epoch": 0.137911552003045, + "grad_norm": 1.8955408334732056, + "learning_rate": 4.769022404241919e-05, + "loss": 5.0117, + "step": 23189 + }, + { + "epoch": 0.13791749928632602, + "grad_norm": 2.333242177963257, + "learning_rate": 4.769002794274201e-05, + "loss": 4.4839, + "step": 23190 + }, + { + "epoch": 0.137923446569607, + "grad_norm": 1.6732560396194458, + "learning_rate": 4.768983183514397e-05, + "loss": 5.2458, + "step": 23191 + }, + { + "epoch": 0.137929393852888, + "grad_norm": 1.6078556776046753, + "learning_rate": 4.768963571962516e-05, + "loss": 5.616, + "step": 23192 + }, + { + "epoch": 0.137935341136169, + "grad_norm": 1.7516095638275146, + "learning_rate": 4.768943959618562e-05, + "loss": 5.3052, + "step": 23193 + }, + { + "epoch": 0.13794128841945, + "grad_norm": 1.5200318098068237, + "learning_rate": 4.7689243464825425e-05, + "loss": 5.664, + "step": 23194 + }, + { + "epoch": 0.13794723570273099, + "grad_norm": 1.3212077617645264, + "learning_rate": 4.7689047325544664e-05, + "loss": 5.4562, + "step": 23195 + }, + { + "epoch": 0.137953182986012, + "grad_norm": 1.3307675123214722, + "learning_rate": 4.7688851178343386e-05, + "loss": 5.2517, + "step": 23196 + }, + { + "epoch": 0.137959130269293, + "grad_norm": 1.5186207294464111, + "learning_rate": 4.768865502322166e-05, + "loss": 5.654, + "step": 23197 + }, + { + "epoch": 0.13796507755257398, + "grad_norm": 1.6482549905776978, + "learning_rate": 4.7688458860179564e-05, + "loss": 5.3282, + "step": 23198 + }, + { + "epoch": 0.137971024835855, + "grad_norm": 1.4418150186538696, + "learning_rate": 4.768826268921717e-05, + "loss": 5.5913, + "step": 23199 + }, + { + "epoch": 0.13797697211913598, + "grad_norm": 1.5591225624084473, + "learning_rate": 4.768806651033452e-05, + "loss": 5.8459, + "step": 23200 + }, + { + "epoch": 0.13798291940241697, + "grad_norm": 1.3476347923278809, + "learning_rate": 4.768787032353171e-05, + "loss": 5.3597, + "step": 23201 + }, + { + "epoch": 0.137988866685698, + "grad_norm": 1.4543404579162598, + "learning_rate": 4.76876741288088e-05, + "loss": 5.4525, + "step": 23202 + }, + { + "epoch": 0.13799481396897897, + "grad_norm": 1.3845150470733643, + "learning_rate": 4.7687477926165846e-05, + "loss": 5.6559, + "step": 23203 + }, + { + "epoch": 0.13800076125225996, + "grad_norm": 1.303808569908142, + "learning_rate": 4.768728171560294e-05, + "loss": 5.8732, + "step": 23204 + }, + { + "epoch": 0.13800670853554098, + "grad_norm": 1.422867774963379, + "learning_rate": 4.768708549712013e-05, + "loss": 5.217, + "step": 23205 + }, + { + "epoch": 0.13801265581882197, + "grad_norm": 1.558089017868042, + "learning_rate": 4.7686889270717506e-05, + "loss": 5.6403, + "step": 23206 + }, + { + "epoch": 0.13801860310210295, + "grad_norm": 1.5510298013687134, + "learning_rate": 4.7686693036395115e-05, + "loss": 5.6199, + "step": 23207 + }, + { + "epoch": 0.13802455038538394, + "grad_norm": 1.2693150043487549, + "learning_rate": 4.768649679415303e-05, + "loss": 5.7368, + "step": 23208 + }, + { + "epoch": 0.13803049766866496, + "grad_norm": 1.5053805112838745, + "learning_rate": 4.768630054399132e-05, + "loss": 5.4941, + "step": 23209 + }, + { + "epoch": 0.13803644495194595, + "grad_norm": 2.5151054859161377, + "learning_rate": 4.768610428591007e-05, + "loss": 4.5744, + "step": 23210 + }, + { + "epoch": 0.13804239223522694, + "grad_norm": 2.1085267066955566, + "learning_rate": 4.768590801990933e-05, + "loss": 4.5849, + "step": 23211 + }, + { + "epoch": 0.13804833951850795, + "grad_norm": 2.0741498470306396, + "learning_rate": 4.7685711745989174e-05, + "loss": 4.5745, + "step": 23212 + }, + { + "epoch": 0.13805428680178894, + "grad_norm": 2.0066654682159424, + "learning_rate": 4.7685515464149664e-05, + "loss": 4.6388, + "step": 23213 + }, + { + "epoch": 0.13806023408506993, + "grad_norm": 1.9224933385849, + "learning_rate": 4.7685319174390885e-05, + "loss": 4.5382, + "step": 23214 + }, + { + "epoch": 0.13806618136835094, + "grad_norm": 2.2363088130950928, + "learning_rate": 4.7685122876712896e-05, + "loss": 4.5825, + "step": 23215 + }, + { + "epoch": 0.13807212865163193, + "grad_norm": 2.1900362968444824, + "learning_rate": 4.768492657111576e-05, + "loss": 4.5519, + "step": 23216 + }, + { + "epoch": 0.13807807593491292, + "grad_norm": 2.0702250003814697, + "learning_rate": 4.768473025759955e-05, + "loss": 4.5917, + "step": 23217 + }, + { + "epoch": 0.13808402321819394, + "grad_norm": 2.000380277633667, + "learning_rate": 4.768453393616433e-05, + "loss": 4.8847, + "step": 23218 + }, + { + "epoch": 0.13808997050147492, + "grad_norm": 2.0710175037384033, + "learning_rate": 4.768433760681018e-05, + "loss": 4.5455, + "step": 23219 + }, + { + "epoch": 0.1380959177847559, + "grad_norm": 2.1148219108581543, + "learning_rate": 4.7684141269537165e-05, + "loss": 4.5109, + "step": 23220 + }, + { + "epoch": 0.13810186506803693, + "grad_norm": 1.7681657075881958, + "learning_rate": 4.768394492434535e-05, + "loss": 4.8899, + "step": 23221 + }, + { + "epoch": 0.13810781235131792, + "grad_norm": 2.032696008682251, + "learning_rate": 4.76837485712348e-05, + "loss": 5.2375, + "step": 23222 + }, + { + "epoch": 0.1381137596345989, + "grad_norm": 2.0016825199127197, + "learning_rate": 4.7683552210205585e-05, + "loss": 4.9066, + "step": 23223 + }, + { + "epoch": 0.13811970691787992, + "grad_norm": 2.1309103965759277, + "learning_rate": 4.7683355841257784e-05, + "loss": 4.4317, + "step": 23224 + }, + { + "epoch": 0.1381256542011609, + "grad_norm": 1.9037781953811646, + "learning_rate": 4.768315946439145e-05, + "loss": 5.0218, + "step": 23225 + }, + { + "epoch": 0.1381316014844419, + "grad_norm": 2.3080644607543945, + "learning_rate": 4.768296307960666e-05, + "loss": 5.2226, + "step": 23226 + }, + { + "epoch": 0.1381375487677229, + "grad_norm": 2.1073081493377686, + "learning_rate": 4.7682766686903494e-05, + "loss": 5.2403, + "step": 23227 + }, + { + "epoch": 0.1381434960510039, + "grad_norm": 1.7865220308303833, + "learning_rate": 4.768257028628199e-05, + "loss": 5.1642, + "step": 23228 + }, + { + "epoch": 0.1381494433342849, + "grad_norm": 1.7039834260940552, + "learning_rate": 4.768237387774225e-05, + "loss": 5.1943, + "step": 23229 + }, + { + "epoch": 0.1381553906175659, + "grad_norm": 1.714506983757019, + "learning_rate": 4.768217746128432e-05, + "loss": 5.0952, + "step": 23230 + }, + { + "epoch": 0.1381613379008469, + "grad_norm": 1.7183910608291626, + "learning_rate": 4.768198103690827e-05, + "loss": 5.0447, + "step": 23231 + }, + { + "epoch": 0.13816728518412788, + "grad_norm": 1.776077151298523, + "learning_rate": 4.768178460461419e-05, + "loss": 5.1296, + "step": 23232 + }, + { + "epoch": 0.1381732324674089, + "grad_norm": 1.7849907875061035, + "learning_rate": 4.7681588164402124e-05, + "loss": 4.7961, + "step": 23233 + }, + { + "epoch": 0.13817917975068988, + "grad_norm": 1.403860330581665, + "learning_rate": 4.768139171627216e-05, + "loss": 5.4794, + "step": 23234 + }, + { + "epoch": 0.13818512703397087, + "grad_norm": 1.5944229364395142, + "learning_rate": 4.7681195260224344e-05, + "loss": 4.973, + "step": 23235 + }, + { + "epoch": 0.1381910743172519, + "grad_norm": 2.196274518966675, + "learning_rate": 4.7680998796258764e-05, + "loss": 5.1835, + "step": 23236 + }, + { + "epoch": 0.13819702160053288, + "grad_norm": 1.5403459072113037, + "learning_rate": 4.768080232437548e-05, + "loss": 5.828, + "step": 23237 + }, + { + "epoch": 0.13820296888381386, + "grad_norm": 1.9711260795593262, + "learning_rate": 4.768060584457456e-05, + "loss": 5.4937, + "step": 23238 + }, + { + "epoch": 0.13820891616709488, + "grad_norm": 1.6869981288909912, + "learning_rate": 4.7680409356856075e-05, + "loss": 5.3298, + "step": 23239 + }, + { + "epoch": 0.13821486345037587, + "grad_norm": 2.4224069118499756, + "learning_rate": 4.7680212861220096e-05, + "loss": 4.9544, + "step": 23240 + }, + { + "epoch": 0.13822081073365686, + "grad_norm": 1.905261754989624, + "learning_rate": 4.768001635766669e-05, + "loss": 4.852, + "step": 23241 + }, + { + "epoch": 0.13822675801693787, + "grad_norm": 1.7081589698791504, + "learning_rate": 4.7679819846195925e-05, + "loss": 5.2201, + "step": 23242 + }, + { + "epoch": 0.13823270530021886, + "grad_norm": 1.5893620252609253, + "learning_rate": 4.767962332680786e-05, + "loss": 4.9691, + "step": 23243 + }, + { + "epoch": 0.13823865258349985, + "grad_norm": 1.7598754167556763, + "learning_rate": 4.767942679950258e-05, + "loss": 4.9661, + "step": 23244 + }, + { + "epoch": 0.13824459986678087, + "grad_norm": 1.6882308721542358, + "learning_rate": 4.767923026428015e-05, + "loss": 5.3529, + "step": 23245 + }, + { + "epoch": 0.13825054715006185, + "grad_norm": 1.6711715459823608, + "learning_rate": 4.767903372114063e-05, + "loss": 5.3288, + "step": 23246 + }, + { + "epoch": 0.13825649443334284, + "grad_norm": 1.5780813694000244, + "learning_rate": 4.76788371700841e-05, + "loss": 5.5583, + "step": 23247 + }, + { + "epoch": 0.13826244171662386, + "grad_norm": 1.9719429016113281, + "learning_rate": 4.767864061111061e-05, + "loss": 5.2821, + "step": 23248 + }, + { + "epoch": 0.13826838899990485, + "grad_norm": 1.6447231769561768, + "learning_rate": 4.767844404422025e-05, + "loss": 6.0166, + "step": 23249 + }, + { + "epoch": 0.13827433628318583, + "grad_norm": 1.6587456464767456, + "learning_rate": 4.767824746941307e-05, + "loss": 5.4081, + "step": 23250 + }, + { + "epoch": 0.13828028356646685, + "grad_norm": 1.9438105821609497, + "learning_rate": 4.767805088668916e-05, + "loss": 5.4436, + "step": 23251 + }, + { + "epoch": 0.13828623084974784, + "grad_norm": 2.1185503005981445, + "learning_rate": 4.767785429604857e-05, + "loss": 4.8413, + "step": 23252 + }, + { + "epoch": 0.13829217813302883, + "grad_norm": 2.176520347595215, + "learning_rate": 4.767765769749138e-05, + "loss": 4.9092, + "step": 23253 + }, + { + "epoch": 0.13829812541630984, + "grad_norm": 2.020982503890991, + "learning_rate": 4.767746109101765e-05, + "loss": 4.9179, + "step": 23254 + }, + { + "epoch": 0.13830407269959083, + "grad_norm": 1.6086227893829346, + "learning_rate": 4.767726447662746e-05, + "loss": 5.1998, + "step": 23255 + }, + { + "epoch": 0.13831001998287182, + "grad_norm": 1.8750804662704468, + "learning_rate": 4.767706785432087e-05, + "loss": 4.6858, + "step": 23256 + }, + { + "epoch": 0.13831596726615283, + "grad_norm": 1.7748466730117798, + "learning_rate": 4.767687122409794e-05, + "loss": 4.5468, + "step": 23257 + }, + { + "epoch": 0.13832191454943382, + "grad_norm": 1.94595205783844, + "learning_rate": 4.767667458595875e-05, + "loss": 4.6902, + "step": 23258 + }, + { + "epoch": 0.1383278618327148, + "grad_norm": 1.7588400840759277, + "learning_rate": 4.7676477939903375e-05, + "loss": 5.8701, + "step": 23259 + }, + { + "epoch": 0.13833380911599583, + "grad_norm": 1.8222272396087646, + "learning_rate": 4.7676281285931866e-05, + "loss": 4.6879, + "step": 23260 + }, + { + "epoch": 0.13833975639927681, + "grad_norm": 1.7244281768798828, + "learning_rate": 4.767608462404431e-05, + "loss": 5.0215, + "step": 23261 + }, + { + "epoch": 0.1383457036825578, + "grad_norm": 1.5756913423538208, + "learning_rate": 4.767588795424077e-05, + "loss": 5.9537, + "step": 23262 + }, + { + "epoch": 0.13835165096583882, + "grad_norm": 1.6441105604171753, + "learning_rate": 4.767569127652131e-05, + "loss": 5.9245, + "step": 23263 + }, + { + "epoch": 0.1383575982491198, + "grad_norm": 1.5573482513427734, + "learning_rate": 4.767549459088599e-05, + "loss": 5.6705, + "step": 23264 + }, + { + "epoch": 0.1383635455324008, + "grad_norm": 1.65425705909729, + "learning_rate": 4.767529789733489e-05, + "loss": 5.8664, + "step": 23265 + }, + { + "epoch": 0.13836949281568178, + "grad_norm": 1.665283441543579, + "learning_rate": 4.767510119586809e-05, + "loss": 5.7634, + "step": 23266 + }, + { + "epoch": 0.1383754400989628, + "grad_norm": 1.4760838747024536, + "learning_rate": 4.767490448648564e-05, + "loss": 5.7739, + "step": 23267 + }, + { + "epoch": 0.1383813873822438, + "grad_norm": 1.649942398071289, + "learning_rate": 4.7674707769187616e-05, + "loss": 5.7518, + "step": 23268 + }, + { + "epoch": 0.13838733466552477, + "grad_norm": 1.5092672109603882, + "learning_rate": 4.7674511043974084e-05, + "loss": 5.7706, + "step": 23269 + }, + { + "epoch": 0.1383932819488058, + "grad_norm": 2.5008256435394287, + "learning_rate": 4.767431431084512e-05, + "loss": 4.6023, + "step": 23270 + }, + { + "epoch": 0.13839922923208678, + "grad_norm": 2.4018449783325195, + "learning_rate": 4.767411756980078e-05, + "loss": 4.7872, + "step": 23271 + }, + { + "epoch": 0.13840517651536777, + "grad_norm": 1.7928224802017212, + "learning_rate": 4.7673920820841136e-05, + "loss": 5.2731, + "step": 23272 + }, + { + "epoch": 0.13841112379864878, + "grad_norm": 1.844249963760376, + "learning_rate": 4.767372406396627e-05, + "loss": 5.2441, + "step": 23273 + }, + { + "epoch": 0.13841707108192977, + "grad_norm": 2.160876989364624, + "learning_rate": 4.7673527299176236e-05, + "loss": 4.5445, + "step": 23274 + }, + { + "epoch": 0.13842301836521076, + "grad_norm": 1.6312650442123413, + "learning_rate": 4.767333052647112e-05, + "loss": 5.0418, + "step": 23275 + }, + { + "epoch": 0.13842896564849178, + "grad_norm": 1.6567429304122925, + "learning_rate": 4.7673133745850965e-05, + "loss": 5.2882, + "step": 23276 + }, + { + "epoch": 0.13843491293177276, + "grad_norm": 1.8484638929367065, + "learning_rate": 4.767293695731585e-05, + "loss": 5.3432, + "step": 23277 + }, + { + "epoch": 0.13844086021505375, + "grad_norm": 1.8447157144546509, + "learning_rate": 4.767274016086586e-05, + "loss": 5.3307, + "step": 23278 + }, + { + "epoch": 0.13844680749833477, + "grad_norm": 1.6714428663253784, + "learning_rate": 4.767254335650104e-05, + "loss": 5.3053, + "step": 23279 + }, + { + "epoch": 0.13845275478161576, + "grad_norm": 1.7423646450042725, + "learning_rate": 4.7672346544221474e-05, + "loss": 5.3129, + "step": 23280 + }, + { + "epoch": 0.13845870206489674, + "grad_norm": 1.5770469903945923, + "learning_rate": 4.7672149724027224e-05, + "loss": 5.2806, + "step": 23281 + }, + { + "epoch": 0.13846464934817776, + "grad_norm": 1.5982024669647217, + "learning_rate": 4.7671952895918365e-05, + "loss": 5.4873, + "step": 23282 + }, + { + "epoch": 0.13847059663145875, + "grad_norm": 1.9240913391113281, + "learning_rate": 4.767175605989496e-05, + "loss": 5.8309, + "step": 23283 + }, + { + "epoch": 0.13847654391473974, + "grad_norm": 1.612429141998291, + "learning_rate": 4.7671559215957075e-05, + "loss": 5.4479, + "step": 23284 + }, + { + "epoch": 0.13848249119802075, + "grad_norm": 1.5843868255615234, + "learning_rate": 4.7671362364104785e-05, + "loss": 5.5509, + "step": 23285 + }, + { + "epoch": 0.13848843848130174, + "grad_norm": 2.3811614513397217, + "learning_rate": 4.767116550433816e-05, + "loss": 5.4695, + "step": 23286 + }, + { + "epoch": 0.13849438576458273, + "grad_norm": 2.6257996559143066, + "learning_rate": 4.767096863665726e-05, + "loss": 5.0195, + "step": 23287 + }, + { + "epoch": 0.13850033304786374, + "grad_norm": 1.8920071125030518, + "learning_rate": 4.7670771761062164e-05, + "loss": 5.2023, + "step": 23288 + }, + { + "epoch": 0.13850628033114473, + "grad_norm": 1.52253258228302, + "learning_rate": 4.767057487755293e-05, + "loss": 5.6985, + "step": 23289 + }, + { + "epoch": 0.13851222761442572, + "grad_norm": 2.240440845489502, + "learning_rate": 4.767037798612964e-05, + "loss": 5.1073, + "step": 23290 + }, + { + "epoch": 0.13851817489770674, + "grad_norm": 2.127216100692749, + "learning_rate": 4.7670181086792354e-05, + "loss": 5.1885, + "step": 23291 + }, + { + "epoch": 0.13852412218098772, + "grad_norm": 2.128519058227539, + "learning_rate": 4.766998417954114e-05, + "loss": 4.9388, + "step": 23292 + }, + { + "epoch": 0.1385300694642687, + "grad_norm": 1.87863290309906, + "learning_rate": 4.7669787264376066e-05, + "loss": 4.8293, + "step": 23293 + }, + { + "epoch": 0.13853601674754973, + "grad_norm": 2.03975510597229, + "learning_rate": 4.766959034129721e-05, + "loss": 4.9168, + "step": 23294 + }, + { + "epoch": 0.13854196403083072, + "grad_norm": 2.0336341857910156, + "learning_rate": 4.766939341030463e-05, + "loss": 4.9715, + "step": 23295 + }, + { + "epoch": 0.1385479113141117, + "grad_norm": 1.943743348121643, + "learning_rate": 4.7669196471398396e-05, + "loss": 4.7709, + "step": 23296 + }, + { + "epoch": 0.13855385859739272, + "grad_norm": 2.1629462242126465, + "learning_rate": 4.766899952457858e-05, + "loss": 4.7499, + "step": 23297 + }, + { + "epoch": 0.1385598058806737, + "grad_norm": 2.200531005859375, + "learning_rate": 4.7668802569845256e-05, + "loss": 4.8418, + "step": 23298 + }, + { + "epoch": 0.1385657531639547, + "grad_norm": 2.038649797439575, + "learning_rate": 4.766860560719849e-05, + "loss": 5.2351, + "step": 23299 + }, + { + "epoch": 0.1385717004472357, + "grad_norm": 1.8091388940811157, + "learning_rate": 4.766840863663834e-05, + "loss": 5.3526, + "step": 23300 + }, + { + "epoch": 0.1385776477305167, + "grad_norm": 1.9351911544799805, + "learning_rate": 4.7668211658164884e-05, + "loss": 4.813, + "step": 23301 + }, + { + "epoch": 0.1385835950137977, + "grad_norm": 2.0985751152038574, + "learning_rate": 4.766801467177819e-05, + "loss": 4.7762, + "step": 23302 + }, + { + "epoch": 0.1385895422970787, + "grad_norm": 2.023658275604248, + "learning_rate": 4.766781767747833e-05, + "loss": 4.8076, + "step": 23303 + }, + { + "epoch": 0.1385954895803597, + "grad_norm": 1.7464020252227783, + "learning_rate": 4.7667620675265364e-05, + "loss": 5.2537, + "step": 23304 + }, + { + "epoch": 0.13860143686364068, + "grad_norm": 1.7812929153442383, + "learning_rate": 4.7667423665139364e-05, + "loss": 4.8896, + "step": 23305 + }, + { + "epoch": 0.1386073841469217, + "grad_norm": 2.0042948722839355, + "learning_rate": 4.76672266471004e-05, + "loss": 4.7254, + "step": 23306 + }, + { + "epoch": 0.13861333143020269, + "grad_norm": 1.8378963470458984, + "learning_rate": 4.7667029621148554e-05, + "loss": 4.9849, + "step": 23307 + }, + { + "epoch": 0.13861927871348367, + "grad_norm": 2.1476621627807617, + "learning_rate": 4.7666832587283873e-05, + "loss": 4.5167, + "step": 23308 + }, + { + "epoch": 0.1386252259967647, + "grad_norm": 1.8289295434951782, + "learning_rate": 4.7666635545506434e-05, + "loss": 4.8841, + "step": 23309 + }, + { + "epoch": 0.13863117328004568, + "grad_norm": 1.7215977907180786, + "learning_rate": 4.766643849581631e-05, + "loss": 5.0148, + "step": 23310 + }, + { + "epoch": 0.13863712056332667, + "grad_norm": 1.464308261871338, + "learning_rate": 4.7666241438213566e-05, + "loss": 5.2551, + "step": 23311 + }, + { + "epoch": 0.13864306784660768, + "grad_norm": 1.655523657798767, + "learning_rate": 4.766604437269827e-05, + "loss": 5.604, + "step": 23312 + }, + { + "epoch": 0.13864901512988867, + "grad_norm": 1.9533252716064453, + "learning_rate": 4.766584729927049e-05, + "loss": 5.6238, + "step": 23313 + }, + { + "epoch": 0.13865496241316966, + "grad_norm": 1.8174513578414917, + "learning_rate": 4.7665650217930305e-05, + "loss": 5.6806, + "step": 23314 + }, + { + "epoch": 0.13866090969645067, + "grad_norm": 1.58940851688385, + "learning_rate": 4.766545312867776e-05, + "loss": 5.5066, + "step": 23315 + }, + { + "epoch": 0.13866685697973166, + "grad_norm": 1.5862720012664795, + "learning_rate": 4.766525603151295e-05, + "loss": 5.352, + "step": 23316 + }, + { + "epoch": 0.13867280426301265, + "grad_norm": 1.7878305912017822, + "learning_rate": 4.7665058926435934e-05, + "loss": 5.4043, + "step": 23317 + }, + { + "epoch": 0.13867875154629367, + "grad_norm": 1.3984423875808716, + "learning_rate": 4.766486181344678e-05, + "loss": 5.8719, + "step": 23318 + }, + { + "epoch": 0.13868469882957465, + "grad_norm": 1.6912389993667603, + "learning_rate": 4.7664664692545555e-05, + "loss": 5.6587, + "step": 23319 + }, + { + "epoch": 0.13869064611285564, + "grad_norm": 1.593245506286621, + "learning_rate": 4.766446756373233e-05, + "loss": 5.424, + "step": 23320 + }, + { + "epoch": 0.13869659339613666, + "grad_norm": 1.5353487730026245, + "learning_rate": 4.766427042700717e-05, + "loss": 5.7179, + "step": 23321 + }, + { + "epoch": 0.13870254067941765, + "grad_norm": 1.4989358186721802, + "learning_rate": 4.766407328237016e-05, + "loss": 6.1919, + "step": 23322 + }, + { + "epoch": 0.13870848796269863, + "grad_norm": 1.292460322380066, + "learning_rate": 4.766387612982134e-05, + "loss": 5.8265, + "step": 23323 + }, + { + "epoch": 0.13871443524597965, + "grad_norm": 1.4890642166137695, + "learning_rate": 4.766367896936081e-05, + "loss": 5.1671, + "step": 23324 + }, + { + "epoch": 0.13872038252926064, + "grad_norm": 1.7513198852539062, + "learning_rate": 4.766348180098861e-05, + "loss": 4.908, + "step": 23325 + }, + { + "epoch": 0.13872632981254163, + "grad_norm": 1.503311038017273, + "learning_rate": 4.766328462470483e-05, + "loss": 5.661, + "step": 23326 + }, + { + "epoch": 0.13873227709582261, + "grad_norm": 2.333216667175293, + "learning_rate": 4.766308744050953e-05, + "loss": 4.5921, + "step": 23327 + }, + { + "epoch": 0.13873822437910363, + "grad_norm": 2.1495418548583984, + "learning_rate": 4.7662890248402786e-05, + "loss": 4.8017, + "step": 23328 + }, + { + "epoch": 0.13874417166238462, + "grad_norm": 1.4922517538070679, + "learning_rate": 4.766269304838466e-05, + "loss": 5.3407, + "step": 23329 + }, + { + "epoch": 0.1387501189456656, + "grad_norm": 1.5760530233383179, + "learning_rate": 4.7662495840455214e-05, + "loss": 5.1536, + "step": 23330 + }, + { + "epoch": 0.13875606622894662, + "grad_norm": 1.432483434677124, + "learning_rate": 4.7662298624614524e-05, + "loss": 4.405, + "step": 23331 + }, + { + "epoch": 0.1387620135122276, + "grad_norm": 1.5221575498580933, + "learning_rate": 4.766210140086267e-05, + "loss": 4.5132, + "step": 23332 + }, + { + "epoch": 0.1387679607955086, + "grad_norm": 1.7520684003829956, + "learning_rate": 4.76619041691997e-05, + "loss": 4.5229, + "step": 23333 + }, + { + "epoch": 0.13877390807878962, + "grad_norm": 1.8210954666137695, + "learning_rate": 4.76617069296257e-05, + "loss": 4.7207, + "step": 23334 + }, + { + "epoch": 0.1387798553620706, + "grad_norm": 1.5682491064071655, + "learning_rate": 4.7661509682140734e-05, + "loss": 4.5045, + "step": 23335 + }, + { + "epoch": 0.1387858026453516, + "grad_norm": 1.7219401597976685, + "learning_rate": 4.7661312426744865e-05, + "loss": 4.4846, + "step": 23336 + }, + { + "epoch": 0.1387917499286326, + "grad_norm": 1.590681791305542, + "learning_rate": 4.766111516343816e-05, + "loss": 4.2617, + "step": 23337 + }, + { + "epoch": 0.1387976972119136, + "grad_norm": 1.533359408378601, + "learning_rate": 4.76609178922207e-05, + "loss": 4.4746, + "step": 23338 + }, + { + "epoch": 0.13880364449519458, + "grad_norm": 1.5994545221328735, + "learning_rate": 4.7660720613092555e-05, + "loss": 4.5712, + "step": 23339 + }, + { + "epoch": 0.1388095917784756, + "grad_norm": 1.472655177116394, + "learning_rate": 4.766052332605377e-05, + "loss": 4.3592, + "step": 23340 + }, + { + "epoch": 0.1388155390617566, + "grad_norm": 1.5625941753387451, + "learning_rate": 4.7660326031104445e-05, + "loss": 4.2859, + "step": 23341 + }, + { + "epoch": 0.13882148634503758, + "grad_norm": 2.1194114685058594, + "learning_rate": 4.766012872824464e-05, + "loss": 5.0237, + "step": 23342 + }, + { + "epoch": 0.1388274336283186, + "grad_norm": 1.699491262435913, + "learning_rate": 4.7659931417474404e-05, + "loss": 5.4558, + "step": 23343 + }, + { + "epoch": 0.13883338091159958, + "grad_norm": 1.7734466791152954, + "learning_rate": 4.765973409879382e-05, + "loss": 4.5118, + "step": 23344 + }, + { + "epoch": 0.13883932819488057, + "grad_norm": 1.7193443775177002, + "learning_rate": 4.765953677220296e-05, + "loss": 5.7915, + "step": 23345 + }, + { + "epoch": 0.13884527547816158, + "grad_norm": 1.6994706392288208, + "learning_rate": 4.765933943770189e-05, + "loss": 5.2722, + "step": 23346 + }, + { + "epoch": 0.13885122276144257, + "grad_norm": 2.1580300331115723, + "learning_rate": 4.765914209529068e-05, + "loss": 5.2697, + "step": 23347 + }, + { + "epoch": 0.13885717004472356, + "grad_norm": 2.437685012817383, + "learning_rate": 4.765894474496939e-05, + "loss": 5.2533, + "step": 23348 + }, + { + "epoch": 0.13886311732800458, + "grad_norm": 2.2965760231018066, + "learning_rate": 4.7658747386738113e-05, + "loss": 5.3419, + "step": 23349 + }, + { + "epoch": 0.13886906461128556, + "grad_norm": 2.0520517826080322, + "learning_rate": 4.765855002059689e-05, + "loss": 5.1966, + "step": 23350 + }, + { + "epoch": 0.13887501189456655, + "grad_norm": 2.043931484222412, + "learning_rate": 4.76583526465458e-05, + "loss": 5.1984, + "step": 23351 + }, + { + "epoch": 0.13888095917784757, + "grad_norm": 1.9283409118652344, + "learning_rate": 4.765815526458491e-05, + "loss": 4.6806, + "step": 23352 + }, + { + "epoch": 0.13888690646112856, + "grad_norm": 1.8964955806732178, + "learning_rate": 4.76579578747143e-05, + "loss": 4.9367, + "step": 23353 + }, + { + "epoch": 0.13889285374440954, + "grad_norm": 1.8109381198883057, + "learning_rate": 4.765776047693403e-05, + "loss": 4.7777, + "step": 23354 + }, + { + "epoch": 0.13889880102769056, + "grad_norm": 2.0096335411071777, + "learning_rate": 4.765756307124417e-05, + "loss": 4.9217, + "step": 23355 + }, + { + "epoch": 0.13890474831097155, + "grad_norm": 1.8210729360580444, + "learning_rate": 4.765736565764479e-05, + "loss": 4.8393, + "step": 23356 + }, + { + "epoch": 0.13891069559425254, + "grad_norm": 2.1033902168273926, + "learning_rate": 4.7657168236135954e-05, + "loss": 5.043, + "step": 23357 + }, + { + "epoch": 0.13891664287753355, + "grad_norm": 2.0610570907592773, + "learning_rate": 4.7656970806717736e-05, + "loss": 5.0493, + "step": 23358 + }, + { + "epoch": 0.13892259016081454, + "grad_norm": 2.169670343399048, + "learning_rate": 4.765677336939021e-05, + "loss": 5.2321, + "step": 23359 + }, + { + "epoch": 0.13892853744409553, + "grad_norm": 2.198686122894287, + "learning_rate": 4.7656575924153426e-05, + "loss": 5.2698, + "step": 23360 + }, + { + "epoch": 0.13893448472737654, + "grad_norm": 1.9425220489501953, + "learning_rate": 4.7656378471007476e-05, + "loss": 4.9435, + "step": 23361 + }, + { + "epoch": 0.13894043201065753, + "grad_norm": 1.936712384223938, + "learning_rate": 4.765618100995241e-05, + "loss": 4.6584, + "step": 23362 + }, + { + "epoch": 0.13894637929393852, + "grad_norm": 1.7941532135009766, + "learning_rate": 4.765598354098831e-05, + "loss": 4.6791, + "step": 23363 + }, + { + "epoch": 0.13895232657721954, + "grad_norm": 2.0149965286254883, + "learning_rate": 4.765578606411524e-05, + "loss": 5.1019, + "step": 23364 + }, + { + "epoch": 0.13895827386050053, + "grad_norm": 1.9302345514297485, + "learning_rate": 4.7655588579333265e-05, + "loss": 5.1168, + "step": 23365 + }, + { + "epoch": 0.1389642211437815, + "grad_norm": 2.0851333141326904, + "learning_rate": 4.7655391086642465e-05, + "loss": 5.0517, + "step": 23366 + }, + { + "epoch": 0.13897016842706253, + "grad_norm": 1.9221385717391968, + "learning_rate": 4.7655193586042904e-05, + "loss": 5.1486, + "step": 23367 + }, + { + "epoch": 0.13897611571034352, + "grad_norm": 1.9929136037826538, + "learning_rate": 4.765499607753464e-05, + "loss": 5.1288, + "step": 23368 + }, + { + "epoch": 0.1389820629936245, + "grad_norm": 1.8818596601486206, + "learning_rate": 4.765479856111775e-05, + "loss": 4.8252, + "step": 23369 + }, + { + "epoch": 0.13898801027690552, + "grad_norm": 1.748961091041565, + "learning_rate": 4.765460103679231e-05, + "loss": 4.7829, + "step": 23370 + }, + { + "epoch": 0.1389939575601865, + "grad_norm": 1.8021109104156494, + "learning_rate": 4.765440350455838e-05, + "loss": 4.7424, + "step": 23371 + }, + { + "epoch": 0.1389999048434675, + "grad_norm": 2.1486730575561523, + "learning_rate": 4.765420596441603e-05, + "loss": 4.6696, + "step": 23372 + }, + { + "epoch": 0.1390058521267485, + "grad_norm": 1.9908959865570068, + "learning_rate": 4.765400841636534e-05, + "loss": 4.5644, + "step": 23373 + }, + { + "epoch": 0.1390117994100295, + "grad_norm": 2.021198272705078, + "learning_rate": 4.765381086040636e-05, + "loss": 5.2841, + "step": 23374 + }, + { + "epoch": 0.1390177466933105, + "grad_norm": 2.0757644176483154, + "learning_rate": 4.765361329653918e-05, + "loss": 5.0479, + "step": 23375 + }, + { + "epoch": 0.1390236939765915, + "grad_norm": 2.6452016830444336, + "learning_rate": 4.7653415724763844e-05, + "loss": 4.5668, + "step": 23376 + }, + { + "epoch": 0.1390296412598725, + "grad_norm": 1.8536683320999146, + "learning_rate": 4.7653218145080436e-05, + "loss": 4.6049, + "step": 23377 + }, + { + "epoch": 0.13903558854315348, + "grad_norm": 2.1392767429351807, + "learning_rate": 4.765302055748903e-05, + "loss": 4.5307, + "step": 23378 + }, + { + "epoch": 0.1390415358264345, + "grad_norm": 2.0592446327209473, + "learning_rate": 4.765282296198968e-05, + "loss": 4.7421, + "step": 23379 + }, + { + "epoch": 0.1390474831097155, + "grad_norm": 1.9982407093048096, + "learning_rate": 4.765262535858248e-05, + "loss": 4.5699, + "step": 23380 + }, + { + "epoch": 0.13905343039299647, + "grad_norm": 1.6928536891937256, + "learning_rate": 4.765242774726747e-05, + "loss": 5.0689, + "step": 23381 + }, + { + "epoch": 0.1390593776762775, + "grad_norm": 2.1993813514709473, + "learning_rate": 4.765223012804474e-05, + "loss": 4.8268, + "step": 23382 + }, + { + "epoch": 0.13906532495955848, + "grad_norm": 1.711241364479065, + "learning_rate": 4.765203250091434e-05, + "loss": 5.7443, + "step": 23383 + }, + { + "epoch": 0.13907127224283947, + "grad_norm": 1.862398386001587, + "learning_rate": 4.765183486587636e-05, + "loss": 5.3367, + "step": 23384 + }, + { + "epoch": 0.13907721952612045, + "grad_norm": 1.95891273021698, + "learning_rate": 4.765163722293084e-05, + "loss": 5.6618, + "step": 23385 + }, + { + "epoch": 0.13908316680940147, + "grad_norm": 2.362205743789673, + "learning_rate": 4.765143957207789e-05, + "loss": 5.1168, + "step": 23386 + }, + { + "epoch": 0.13908911409268246, + "grad_norm": 1.7440927028656006, + "learning_rate": 4.7651241913317545e-05, + "loss": 4.858, + "step": 23387 + }, + { + "epoch": 0.13909506137596345, + "grad_norm": 1.7432098388671875, + "learning_rate": 4.765104424664989e-05, + "loss": 4.9096, + "step": 23388 + }, + { + "epoch": 0.13910100865924446, + "grad_norm": 1.7505769729614258, + "learning_rate": 4.765084657207498e-05, + "loss": 5.0255, + "step": 23389 + }, + { + "epoch": 0.13910695594252545, + "grad_norm": 1.5105990171432495, + "learning_rate": 4.76506488895929e-05, + "loss": 5.2811, + "step": 23390 + }, + { + "epoch": 0.13911290322580644, + "grad_norm": 1.6876368522644043, + "learning_rate": 4.765045119920372e-05, + "loss": 5.6723, + "step": 23391 + }, + { + "epoch": 0.13911885050908745, + "grad_norm": 1.6542494297027588, + "learning_rate": 4.7650253500907494e-05, + "loss": 5.1409, + "step": 23392 + }, + { + "epoch": 0.13912479779236844, + "grad_norm": 2.0412867069244385, + "learning_rate": 4.76500557947043e-05, + "loss": 4.8772, + "step": 23393 + }, + { + "epoch": 0.13913074507564943, + "grad_norm": 1.8121492862701416, + "learning_rate": 4.76498580805942e-05, + "loss": 5.1079, + "step": 23394 + }, + { + "epoch": 0.13913669235893045, + "grad_norm": 1.576653003692627, + "learning_rate": 4.764966035857727e-05, + "loss": 4.9576, + "step": 23395 + }, + { + "epoch": 0.13914263964221144, + "grad_norm": 1.5891642570495605, + "learning_rate": 4.764946262865358e-05, + "loss": 4.8846, + "step": 23396 + }, + { + "epoch": 0.13914858692549242, + "grad_norm": 1.7079927921295166, + "learning_rate": 4.7649264890823195e-05, + "loss": 5.0182, + "step": 23397 + }, + { + "epoch": 0.13915453420877344, + "grad_norm": 1.6532564163208008, + "learning_rate": 4.764906714508619e-05, + "loss": 4.8068, + "step": 23398 + }, + { + "epoch": 0.13916048149205443, + "grad_norm": 1.5107650756835938, + "learning_rate": 4.764886939144263e-05, + "loss": 5.3482, + "step": 23399 + }, + { + "epoch": 0.13916642877533542, + "grad_norm": 1.666096806526184, + "learning_rate": 4.764867162989258e-05, + "loss": 5.1747, + "step": 23400 + }, + { + "epoch": 0.13917237605861643, + "grad_norm": 1.864372730255127, + "learning_rate": 4.764847386043611e-05, + "loss": 4.3209, + "step": 23401 + }, + { + "epoch": 0.13917832334189742, + "grad_norm": 2.2691080570220947, + "learning_rate": 4.7648276083073295e-05, + "loss": 4.5254, + "step": 23402 + }, + { + "epoch": 0.1391842706251784, + "grad_norm": 2.0673935413360596, + "learning_rate": 4.76480782978042e-05, + "loss": 4.639, + "step": 23403 + }, + { + "epoch": 0.13919021790845942, + "grad_norm": 1.9274605512619019, + "learning_rate": 4.76478805046289e-05, + "loss": 4.579, + "step": 23404 + }, + { + "epoch": 0.1391961651917404, + "grad_norm": 1.5076278448104858, + "learning_rate": 4.7647682703547455e-05, + "loss": 4.9522, + "step": 23405 + }, + { + "epoch": 0.1392021124750214, + "grad_norm": 2.005662202835083, + "learning_rate": 4.7647484894559936e-05, + "loss": 4.3399, + "step": 23406 + }, + { + "epoch": 0.13920805975830242, + "grad_norm": 1.9292556047439575, + "learning_rate": 4.7647287077666414e-05, + "loss": 4.4166, + "step": 23407 + }, + { + "epoch": 0.1392140070415834, + "grad_norm": 1.7474818229675293, + "learning_rate": 4.764708925286696e-05, + "loss": 4.3355, + "step": 23408 + }, + { + "epoch": 0.1392199543248644, + "grad_norm": 1.9833084344863892, + "learning_rate": 4.764689142016164e-05, + "loss": 4.3388, + "step": 23409 + }, + { + "epoch": 0.1392259016081454, + "grad_norm": 1.7962874174118042, + "learning_rate": 4.764669357955053e-05, + "loss": 5.3199, + "step": 23410 + }, + { + "epoch": 0.1392318488914264, + "grad_norm": 1.6865921020507812, + "learning_rate": 4.764649573103368e-05, + "loss": 5.3787, + "step": 23411 + }, + { + "epoch": 0.13923779617470738, + "grad_norm": 1.2966182231903076, + "learning_rate": 4.7646297874611185e-05, + "loss": 5.0989, + "step": 23412 + }, + { + "epoch": 0.1392437434579884, + "grad_norm": 1.732437252998352, + "learning_rate": 4.76461000102831e-05, + "loss": 5.6207, + "step": 23413 + }, + { + "epoch": 0.1392496907412694, + "grad_norm": 1.567841649055481, + "learning_rate": 4.7645902138049494e-05, + "loss": 5.3921, + "step": 23414 + }, + { + "epoch": 0.13925563802455038, + "grad_norm": 1.7841026782989502, + "learning_rate": 4.764570425791043e-05, + "loss": 5.7206, + "step": 23415 + }, + { + "epoch": 0.1392615853078314, + "grad_norm": 2.0582776069641113, + "learning_rate": 4.764550636986599e-05, + "loss": 4.7812, + "step": 23416 + }, + { + "epoch": 0.13926753259111238, + "grad_norm": 1.5891739130020142, + "learning_rate": 4.764530847391624e-05, + "loss": 5.3211, + "step": 23417 + }, + { + "epoch": 0.13927347987439337, + "grad_norm": 1.4662810564041138, + "learning_rate": 4.764511057006125e-05, + "loss": 5.6385, + "step": 23418 + }, + { + "epoch": 0.13927942715767438, + "grad_norm": 1.6601322889328003, + "learning_rate": 4.764491265830108e-05, + "loss": 5.7947, + "step": 23419 + }, + { + "epoch": 0.13928537444095537, + "grad_norm": 1.5726239681243896, + "learning_rate": 4.7644714738635796e-05, + "loss": 5.6488, + "step": 23420 + }, + { + "epoch": 0.13929132172423636, + "grad_norm": 2.0315866470336914, + "learning_rate": 4.7644516811065494e-05, + "loss": 5.3196, + "step": 23421 + }, + { + "epoch": 0.13929726900751738, + "grad_norm": 2.3560190200805664, + "learning_rate": 4.764431887559022e-05, + "loss": 5.0898, + "step": 23422 + }, + { + "epoch": 0.13930321629079837, + "grad_norm": 1.6240613460540771, + "learning_rate": 4.764412093221004e-05, + "loss": 4.9766, + "step": 23423 + }, + { + "epoch": 0.13930916357407935, + "grad_norm": 1.9657840728759766, + "learning_rate": 4.764392298092504e-05, + "loss": 5.5328, + "step": 23424 + }, + { + "epoch": 0.13931511085736037, + "grad_norm": 1.8219939470291138, + "learning_rate": 4.764372502173527e-05, + "loss": 5.3713, + "step": 23425 + }, + { + "epoch": 0.13932105814064136, + "grad_norm": 1.6808767318725586, + "learning_rate": 4.764352705464082e-05, + "loss": 5.4753, + "step": 23426 + }, + { + "epoch": 0.13932700542392235, + "grad_norm": 1.6270160675048828, + "learning_rate": 4.764332907964175e-05, + "loss": 5.6609, + "step": 23427 + }, + { + "epoch": 0.13933295270720336, + "grad_norm": 1.5609904527664185, + "learning_rate": 4.764313109673812e-05, + "loss": 5.6954, + "step": 23428 + }, + { + "epoch": 0.13933889999048435, + "grad_norm": 1.5029795169830322, + "learning_rate": 4.764293310593001e-05, + "loss": 5.6655, + "step": 23429 + }, + { + "epoch": 0.13934484727376534, + "grad_norm": 1.6427209377288818, + "learning_rate": 4.7642735107217484e-05, + "loss": 4.9946, + "step": 23430 + }, + { + "epoch": 0.13935079455704635, + "grad_norm": 1.5815205574035645, + "learning_rate": 4.764253710060062e-05, + "loss": 5.4891, + "step": 23431 + }, + { + "epoch": 0.13935674184032734, + "grad_norm": 1.7551064491271973, + "learning_rate": 4.764233908607947e-05, + "loss": 5.4036, + "step": 23432 + }, + { + "epoch": 0.13936268912360833, + "grad_norm": 1.62980055809021, + "learning_rate": 4.7642141063654114e-05, + "loss": 5.4836, + "step": 23433 + }, + { + "epoch": 0.13936863640688935, + "grad_norm": 1.836366891860962, + "learning_rate": 4.7641943033324634e-05, + "loss": 5.4079, + "step": 23434 + }, + { + "epoch": 0.13937458369017033, + "grad_norm": 1.710744857788086, + "learning_rate": 4.764174499509107e-05, + "loss": 5.2859, + "step": 23435 + }, + { + "epoch": 0.13938053097345132, + "grad_norm": 1.6887309551239014, + "learning_rate": 4.7641546948953515e-05, + "loss": 5.4671, + "step": 23436 + }, + { + "epoch": 0.13938647825673234, + "grad_norm": 1.6997935771942139, + "learning_rate": 4.764134889491203e-05, + "loss": 5.2601, + "step": 23437 + }, + { + "epoch": 0.13939242554001333, + "grad_norm": 1.560526967048645, + "learning_rate": 4.764115083296668e-05, + "loss": 5.795, + "step": 23438 + }, + { + "epoch": 0.13939837282329431, + "grad_norm": 1.4518390893936157, + "learning_rate": 4.7640952763117544e-05, + "loss": 5.3885, + "step": 23439 + }, + { + "epoch": 0.13940432010657533, + "grad_norm": 1.698185920715332, + "learning_rate": 4.7640754685364675e-05, + "loss": 5.053, + "step": 23440 + }, + { + "epoch": 0.13941026738985632, + "grad_norm": 1.7422363758087158, + "learning_rate": 4.764055659970816e-05, + "loss": 5.1586, + "step": 23441 + }, + { + "epoch": 0.1394162146731373, + "grad_norm": 1.7014398574829102, + "learning_rate": 4.7640358506148065e-05, + "loss": 5.2313, + "step": 23442 + }, + { + "epoch": 0.1394221619564183, + "grad_norm": 1.6611777544021606, + "learning_rate": 4.764016040468444e-05, + "loss": 5.1691, + "step": 23443 + }, + { + "epoch": 0.1394281092396993, + "grad_norm": 1.6166971921920776, + "learning_rate": 4.763996229531739e-05, + "loss": 5.2217, + "step": 23444 + }, + { + "epoch": 0.1394340565229803, + "grad_norm": 1.9434369802474976, + "learning_rate": 4.763976417804694e-05, + "loss": 4.4322, + "step": 23445 + }, + { + "epoch": 0.1394400038062613, + "grad_norm": 3.2407455444335938, + "learning_rate": 4.7639566052873197e-05, + "loss": 3.3762, + "step": 23446 + }, + { + "epoch": 0.1394459510895423, + "grad_norm": 1.8475316762924194, + "learning_rate": 4.7639367919796215e-05, + "loss": 5.2435, + "step": 23447 + }, + { + "epoch": 0.1394518983728233, + "grad_norm": 1.7297134399414062, + "learning_rate": 4.763916977881606e-05, + "loss": 5.2485, + "step": 23448 + }, + { + "epoch": 0.13945784565610428, + "grad_norm": 1.720375657081604, + "learning_rate": 4.76389716299328e-05, + "loss": 5.1242, + "step": 23449 + }, + { + "epoch": 0.1394637929393853, + "grad_norm": 1.729045033454895, + "learning_rate": 4.763877347314652e-05, + "loss": 5.0312, + "step": 23450 + }, + { + "epoch": 0.13946974022266628, + "grad_norm": 1.817941427230835, + "learning_rate": 4.7638575308457266e-05, + "loss": 4.5856, + "step": 23451 + }, + { + "epoch": 0.13947568750594727, + "grad_norm": 2.7483971118927, + "learning_rate": 4.763837713586513e-05, + "loss": 3.3044, + "step": 23452 + }, + { + "epoch": 0.1394816347892283, + "grad_norm": 2.3746731281280518, + "learning_rate": 4.763817895537017e-05, + "loss": 3.0149, + "step": 23453 + }, + { + "epoch": 0.13948758207250928, + "grad_norm": 2.6971354484558105, + "learning_rate": 4.763798076697244e-05, + "loss": 3.7174, + "step": 23454 + }, + { + "epoch": 0.13949352935579026, + "grad_norm": 2.457082986831665, + "learning_rate": 4.763778257067205e-05, + "loss": 2.8548, + "step": 23455 + }, + { + "epoch": 0.13949947663907128, + "grad_norm": 2.4862163066864014, + "learning_rate": 4.7637584366469024e-05, + "loss": 2.6084, + "step": 23456 + }, + { + "epoch": 0.13950542392235227, + "grad_norm": 2.847895622253418, + "learning_rate": 4.763738615436346e-05, + "loss": 4.1775, + "step": 23457 + }, + { + "epoch": 0.13951137120563326, + "grad_norm": 2.827467918395996, + "learning_rate": 4.763718793435541e-05, + "loss": 4.0248, + "step": 23458 + }, + { + "epoch": 0.13951731848891427, + "grad_norm": 2.9717519283294678, + "learning_rate": 4.763698970644496e-05, + "loss": 3.8032, + "step": 23459 + }, + { + "epoch": 0.13952326577219526, + "grad_norm": 2.6418726444244385, + "learning_rate": 4.7636791470632166e-05, + "loss": 3.7307, + "step": 23460 + }, + { + "epoch": 0.13952921305547625, + "grad_norm": 2.789552927017212, + "learning_rate": 4.763659322691711e-05, + "loss": 3.458, + "step": 23461 + }, + { + "epoch": 0.13953516033875726, + "grad_norm": 2.3144681453704834, + "learning_rate": 4.7636394975299845e-05, + "loss": 4.1631, + "step": 23462 + }, + { + "epoch": 0.13954110762203825, + "grad_norm": 3.1292171478271484, + "learning_rate": 4.7636196715780454e-05, + "loss": 3.3234, + "step": 23463 + }, + { + "epoch": 0.13954705490531924, + "grad_norm": 3.2646241188049316, + "learning_rate": 4.763599844835899e-05, + "loss": 3.4951, + "step": 23464 + }, + { + "epoch": 0.13955300218860026, + "grad_norm": 3.3047688007354736, + "learning_rate": 4.7635800173035545e-05, + "loss": 3.6349, + "step": 23465 + }, + { + "epoch": 0.13955894947188124, + "grad_norm": 2.6160805225372314, + "learning_rate": 4.763560188981017e-05, + "loss": 3.8286, + "step": 23466 + }, + { + "epoch": 0.13956489675516223, + "grad_norm": 2.5719079971313477, + "learning_rate": 4.763540359868294e-05, + "loss": 3.7716, + "step": 23467 + }, + { + "epoch": 0.13957084403844325, + "grad_norm": 2.6471214294433594, + "learning_rate": 4.763520529965393e-05, + "loss": 3.4606, + "step": 23468 + }, + { + "epoch": 0.13957679132172424, + "grad_norm": 2.581679344177246, + "learning_rate": 4.7635006992723194e-05, + "loss": 3.5469, + "step": 23469 + }, + { + "epoch": 0.13958273860500522, + "grad_norm": 2.3326828479766846, + "learning_rate": 4.763480867789082e-05, + "loss": 3.7371, + "step": 23470 + }, + { + "epoch": 0.13958868588828624, + "grad_norm": 2.46588397026062, + "learning_rate": 4.763461035515686e-05, + "loss": 3.5972, + "step": 23471 + }, + { + "epoch": 0.13959463317156723, + "grad_norm": 2.3971428871154785, + "learning_rate": 4.76344120245214e-05, + "loss": 3.9445, + "step": 23472 + }, + { + "epoch": 0.13960058045484822, + "grad_norm": 1.8938592672348022, + "learning_rate": 4.7634213685984494e-05, + "loss": 5.1934, + "step": 23473 + }, + { + "epoch": 0.13960652773812923, + "grad_norm": 1.4792579412460327, + "learning_rate": 4.763401533954622e-05, + "loss": 5.5867, + "step": 23474 + }, + { + "epoch": 0.13961247502141022, + "grad_norm": 1.9039497375488281, + "learning_rate": 4.763381698520665e-05, + "loss": 4.9615, + "step": 23475 + }, + { + "epoch": 0.1396184223046912, + "grad_norm": 2.2181084156036377, + "learning_rate": 4.7633618622965845e-05, + "loss": 5.107, + "step": 23476 + }, + { + "epoch": 0.13962436958797222, + "grad_norm": 1.618551254272461, + "learning_rate": 4.7633420252823876e-05, + "loss": 4.8326, + "step": 23477 + }, + { + "epoch": 0.1396303168712532, + "grad_norm": 1.7516298294067383, + "learning_rate": 4.763322187478081e-05, + "loss": 5.0812, + "step": 23478 + }, + { + "epoch": 0.1396362641545342, + "grad_norm": 2.385951042175293, + "learning_rate": 4.7633023488836726e-05, + "loss": 4.2155, + "step": 23479 + }, + { + "epoch": 0.13964221143781522, + "grad_norm": 2.1702630519866943, + "learning_rate": 4.7632825094991686e-05, + "loss": 4.1257, + "step": 23480 + }, + { + "epoch": 0.1396481587210962, + "grad_norm": 1.9801292419433594, + "learning_rate": 4.763262669324576e-05, + "loss": 3.7386, + "step": 23481 + }, + { + "epoch": 0.1396541060043772, + "grad_norm": 2.250795602798462, + "learning_rate": 4.7632428283599016e-05, + "loss": 3.7169, + "step": 23482 + }, + { + "epoch": 0.1396600532876582, + "grad_norm": 2.124037027359009, + "learning_rate": 4.763222986605153e-05, + "loss": 3.7271, + "step": 23483 + }, + { + "epoch": 0.1396660005709392, + "grad_norm": 1.7976130247116089, + "learning_rate": 4.763203144060336e-05, + "loss": 3.9943, + "step": 23484 + }, + { + "epoch": 0.13967194785422019, + "grad_norm": 1.8421905040740967, + "learning_rate": 4.763183300725459e-05, + "loss": 4.1526, + "step": 23485 + }, + { + "epoch": 0.1396778951375012, + "grad_norm": 2.166212797164917, + "learning_rate": 4.763163456600527e-05, + "loss": 4.0225, + "step": 23486 + }, + { + "epoch": 0.1396838424207822, + "grad_norm": 2.2913808822631836, + "learning_rate": 4.763143611685549e-05, + "loss": 4.1125, + "step": 23487 + }, + { + "epoch": 0.13968978970406318, + "grad_norm": 2.20432448387146, + "learning_rate": 4.7631237659805307e-05, + "loss": 3.8297, + "step": 23488 + }, + { + "epoch": 0.1396957369873442, + "grad_norm": 2.323784351348877, + "learning_rate": 4.7631039194854785e-05, + "loss": 3.9128, + "step": 23489 + }, + { + "epoch": 0.13970168427062518, + "grad_norm": 2.22320294380188, + "learning_rate": 4.7630840722004014e-05, + "loss": 3.2834, + "step": 23490 + }, + { + "epoch": 0.13970763155390617, + "grad_norm": 2.2063205242156982, + "learning_rate": 4.763064224125304e-05, + "loss": 3.2472, + "step": 23491 + }, + { + "epoch": 0.13971357883718719, + "grad_norm": 2.1124684810638428, + "learning_rate": 4.763044375260195e-05, + "loss": 3.4765, + "step": 23492 + }, + { + "epoch": 0.13971952612046817, + "grad_norm": 2.2450273036956787, + "learning_rate": 4.7630245256050796e-05, + "loss": 3.586, + "step": 23493 + }, + { + "epoch": 0.13972547340374916, + "grad_norm": 2.821563243865967, + "learning_rate": 4.7630046751599665e-05, + "loss": 3.9152, + "step": 23494 + }, + { + "epoch": 0.13973142068703018, + "grad_norm": 2.623655319213867, + "learning_rate": 4.762984823924862e-05, + "loss": 5.2159, + "step": 23495 + }, + { + "epoch": 0.13973736797031117, + "grad_norm": 2.5610146522521973, + "learning_rate": 4.762964971899773e-05, + "loss": 5.0813, + "step": 23496 + }, + { + "epoch": 0.13974331525359215, + "grad_norm": 2.434995651245117, + "learning_rate": 4.7629451190847055e-05, + "loss": 4.651, + "step": 23497 + }, + { + "epoch": 0.13974926253687317, + "grad_norm": 2.0094375610351562, + "learning_rate": 4.7629252654796675e-05, + "loss": 5.6776, + "step": 23498 + }, + { + "epoch": 0.13975520982015416, + "grad_norm": 2.568547248840332, + "learning_rate": 4.7629054110846664e-05, + "loss": 3.2979, + "step": 23499 + }, + { + "epoch": 0.13976115710343515, + "grad_norm": 1.9725669622421265, + "learning_rate": 4.7628855558997074e-05, + "loss": 5.6269, + "step": 23500 + }, + { + "epoch": 0.13976710438671613, + "grad_norm": 1.6308250427246094, + "learning_rate": 4.7628656999247986e-05, + "loss": 5.7476, + "step": 23501 + }, + { + "epoch": 0.13977305166999715, + "grad_norm": 2.4110774993896484, + "learning_rate": 4.762845843159947e-05, + "loss": 4.8208, + "step": 23502 + }, + { + "epoch": 0.13977899895327814, + "grad_norm": 2.9670233726501465, + "learning_rate": 4.762825985605159e-05, + "loss": 3.3216, + "step": 23503 + }, + { + "epoch": 0.13978494623655913, + "grad_norm": 2.9264471530914307, + "learning_rate": 4.762806127260443e-05, + "loss": 3.12, + "step": 23504 + }, + { + "epoch": 0.13979089351984014, + "grad_norm": 2.983513593673706, + "learning_rate": 4.7627862681258037e-05, + "loss": 3.2355, + "step": 23505 + }, + { + "epoch": 0.13979684080312113, + "grad_norm": 2.5023698806762695, + "learning_rate": 4.7627664082012494e-05, + "loss": 3.6619, + "step": 23506 + }, + { + "epoch": 0.13980278808640212, + "grad_norm": 2.691542625427246, + "learning_rate": 4.762746547486786e-05, + "loss": 2.9562, + "step": 23507 + }, + { + "epoch": 0.13980873536968313, + "grad_norm": 2.487741470336914, + "learning_rate": 4.762726685982421e-05, + "loss": 3.6212, + "step": 23508 + }, + { + "epoch": 0.13981468265296412, + "grad_norm": 2.5798730850219727, + "learning_rate": 4.762706823688163e-05, + "loss": 3.6246, + "step": 23509 + }, + { + "epoch": 0.1398206299362451, + "grad_norm": 2.8465988636016846, + "learning_rate": 4.762686960604017e-05, + "loss": 3.3039, + "step": 23510 + }, + { + "epoch": 0.13982657721952613, + "grad_norm": 2.70969820022583, + "learning_rate": 4.7626670967299897e-05, + "loss": 2.3823, + "step": 23511 + }, + { + "epoch": 0.13983252450280712, + "grad_norm": 2.3834662437438965, + "learning_rate": 4.762647232066089e-05, + "loss": 2.8856, + "step": 23512 + }, + { + "epoch": 0.1398384717860881, + "grad_norm": 2.694798231124878, + "learning_rate": 4.762627366612321e-05, + "loss": 4.3653, + "step": 23513 + }, + { + "epoch": 0.13984441906936912, + "grad_norm": 2.6196436882019043, + "learning_rate": 4.7626075003686944e-05, + "loss": 4.5615, + "step": 23514 + }, + { + "epoch": 0.1398503663526501, + "grad_norm": 2.6196036338806152, + "learning_rate": 4.7625876333352136e-05, + "loss": 3.4767, + "step": 23515 + }, + { + "epoch": 0.1398563136359311, + "grad_norm": 2.32704496383667, + "learning_rate": 4.762567765511888e-05, + "loss": 3.7236, + "step": 23516 + }, + { + "epoch": 0.1398622609192121, + "grad_norm": 2.7415919303894043, + "learning_rate": 4.7625478968987226e-05, + "loss": 3.2248, + "step": 23517 + }, + { + "epoch": 0.1398682082024931, + "grad_norm": 2.402270555496216, + "learning_rate": 4.7625280274957254e-05, + "loss": 3.5112, + "step": 23518 + }, + { + "epoch": 0.1398741554857741, + "grad_norm": 2.722087860107422, + "learning_rate": 4.762508157302903e-05, + "loss": 3.5728, + "step": 23519 + }, + { + "epoch": 0.1398801027690551, + "grad_norm": 2.2336719036102295, + "learning_rate": 4.7624882863202626e-05, + "loss": 4.361, + "step": 23520 + }, + { + "epoch": 0.1398860500523361, + "grad_norm": 1.687203288078308, + "learning_rate": 4.7624684145478106e-05, + "loss": 5.2352, + "step": 23521 + }, + { + "epoch": 0.13989199733561708, + "grad_norm": 2.0672800540924072, + "learning_rate": 4.762448541985553e-05, + "loss": 5.0935, + "step": 23522 + }, + { + "epoch": 0.1398979446188981, + "grad_norm": 1.9521383047103882, + "learning_rate": 4.7624286686335e-05, + "loss": 5.1912, + "step": 23523 + }, + { + "epoch": 0.13990389190217908, + "grad_norm": 1.8050906658172607, + "learning_rate": 4.762408794491656e-05, + "loss": 5.2494, + "step": 23524 + }, + { + "epoch": 0.13990983918546007, + "grad_norm": 1.7029122114181519, + "learning_rate": 4.762388919560028e-05, + "loss": 5.2882, + "step": 23525 + }, + { + "epoch": 0.1399157864687411, + "grad_norm": 2.089055299758911, + "learning_rate": 4.7623690438386234e-05, + "loss": 5.1689, + "step": 23526 + }, + { + "epoch": 0.13992173375202208, + "grad_norm": 1.8083282709121704, + "learning_rate": 4.7623491673274503e-05, + "loss": 5.2078, + "step": 23527 + }, + { + "epoch": 0.13992768103530306, + "grad_norm": 1.6455740928649902, + "learning_rate": 4.7623292900265126e-05, + "loss": 4.6492, + "step": 23528 + }, + { + "epoch": 0.13993362831858408, + "grad_norm": 1.7084187269210815, + "learning_rate": 4.76230941193582e-05, + "loss": 4.5537, + "step": 23529 + }, + { + "epoch": 0.13993957560186507, + "grad_norm": 1.5048147439956665, + "learning_rate": 4.762289533055379e-05, + "loss": 4.3823, + "step": 23530 + }, + { + "epoch": 0.13994552288514606, + "grad_norm": 1.6451318264007568, + "learning_rate": 4.762269653385196e-05, + "loss": 4.4546, + "step": 23531 + }, + { + "epoch": 0.13995147016842707, + "grad_norm": 1.4565141201019287, + "learning_rate": 4.762249772925278e-05, + "loss": 4.5148, + "step": 23532 + }, + { + "epoch": 0.13995741745170806, + "grad_norm": 1.4664920568466187, + "learning_rate": 4.7622298916756316e-05, + "loss": 4.4532, + "step": 23533 + }, + { + "epoch": 0.13996336473498905, + "grad_norm": 1.5902373790740967, + "learning_rate": 4.762210009636264e-05, + "loss": 4.4744, + "step": 23534 + }, + { + "epoch": 0.13996931201827006, + "grad_norm": 1.6029250621795654, + "learning_rate": 4.762190126807182e-05, + "loss": 4.4635, + "step": 23535 + }, + { + "epoch": 0.13997525930155105, + "grad_norm": 1.49099862575531, + "learning_rate": 4.7621702431883943e-05, + "loss": 4.4079, + "step": 23536 + }, + { + "epoch": 0.13998120658483204, + "grad_norm": 1.5527629852294922, + "learning_rate": 4.762150358779905e-05, + "loss": 4.4034, + "step": 23537 + }, + { + "epoch": 0.13998715386811306, + "grad_norm": 1.4014298915863037, + "learning_rate": 4.762130473581723e-05, + "loss": 4.5512, + "step": 23538 + }, + { + "epoch": 0.13999310115139404, + "grad_norm": 1.4211797714233398, + "learning_rate": 4.762110587593854e-05, + "loss": 4.3554, + "step": 23539 + }, + { + "epoch": 0.13999904843467503, + "grad_norm": 1.305879831314087, + "learning_rate": 4.762090700816306e-05, + "loss": 4.5469, + "step": 23540 + }, + { + "epoch": 0.14000499571795605, + "grad_norm": 1.6035869121551514, + "learning_rate": 4.762070813249085e-05, + "loss": 4.2506, + "step": 23541 + }, + { + "epoch": 0.14001094300123704, + "grad_norm": 2.48470139503479, + "learning_rate": 4.7620509248922e-05, + "loss": 4.4341, + "step": 23542 + }, + { + "epoch": 0.14001689028451803, + "grad_norm": 2.1328017711639404, + "learning_rate": 4.7620310357456546e-05, + "loss": 4.8064, + "step": 23543 + }, + { + "epoch": 0.14002283756779904, + "grad_norm": 2.631490707397461, + "learning_rate": 4.7620111458094586e-05, + "loss": 4.9828, + "step": 23544 + }, + { + "epoch": 0.14002878485108003, + "grad_norm": 2.4217545986175537, + "learning_rate": 4.761991255083617e-05, + "loss": 3.7975, + "step": 23545 + }, + { + "epoch": 0.14003473213436102, + "grad_norm": 2.1837475299835205, + "learning_rate": 4.7619713635681384e-05, + "loss": 3.7627, + "step": 23546 + }, + { + "epoch": 0.14004067941764203, + "grad_norm": 2.188026189804077, + "learning_rate": 4.7619514712630284e-05, + "loss": 3.6425, + "step": 23547 + }, + { + "epoch": 0.14004662670092302, + "grad_norm": 2.157501697540283, + "learning_rate": 4.761931578168295e-05, + "loss": 3.2671, + "step": 23548 + }, + { + "epoch": 0.140052573984204, + "grad_norm": 2.28362774848938, + "learning_rate": 4.7619116842839446e-05, + "loss": 3.9765, + "step": 23549 + }, + { + "epoch": 0.14005852126748503, + "grad_norm": 2.1072418689727783, + "learning_rate": 4.7618917896099844e-05, + "loss": 3.8694, + "step": 23550 + }, + { + "epoch": 0.140064468550766, + "grad_norm": 2.061612367630005, + "learning_rate": 4.76187189414642e-05, + "loss": 3.6775, + "step": 23551 + }, + { + "epoch": 0.140070415834047, + "grad_norm": 2.153618812561035, + "learning_rate": 4.761851997893261e-05, + "loss": 3.2189, + "step": 23552 + }, + { + "epoch": 0.14007636311732802, + "grad_norm": 2.211912155151367, + "learning_rate": 4.761832100850512e-05, + "loss": 4.0855, + "step": 23553 + }, + { + "epoch": 0.140082310400609, + "grad_norm": 2.109023094177246, + "learning_rate": 4.761812203018181e-05, + "loss": 3.1532, + "step": 23554 + }, + { + "epoch": 0.14008825768389, + "grad_norm": 2.056579113006592, + "learning_rate": 4.7617923043962745e-05, + "loss": 3.3965, + "step": 23555 + }, + { + "epoch": 0.140094204967171, + "grad_norm": 2.6552531719207764, + "learning_rate": 4.761772404984799e-05, + "loss": 4.8136, + "step": 23556 + }, + { + "epoch": 0.140100152250452, + "grad_norm": 2.873891592025757, + "learning_rate": 4.7617525047837634e-05, + "loss": 5.1014, + "step": 23557 + }, + { + "epoch": 0.140106099533733, + "grad_norm": 2.9486472606658936, + "learning_rate": 4.761732603793173e-05, + "loss": 4.9751, + "step": 23558 + }, + { + "epoch": 0.14011204681701397, + "grad_norm": 1.6354721784591675, + "learning_rate": 4.761712702013035e-05, + "loss": 5.6091, + "step": 23559 + }, + { + "epoch": 0.140117994100295, + "grad_norm": 1.766449213027954, + "learning_rate": 4.761692799443357e-05, + "loss": 5.6621, + "step": 23560 + }, + { + "epoch": 0.14012394138357598, + "grad_norm": 1.9253995418548584, + "learning_rate": 4.7616728960841444e-05, + "loss": 5.0477, + "step": 23561 + }, + { + "epoch": 0.14012988866685697, + "grad_norm": 1.5569409132003784, + "learning_rate": 4.761652991935406e-05, + "loss": 5.2989, + "step": 23562 + }, + { + "epoch": 0.14013583595013798, + "grad_norm": 1.395662784576416, + "learning_rate": 4.761633086997147e-05, + "loss": 5.2249, + "step": 23563 + }, + { + "epoch": 0.14014178323341897, + "grad_norm": 1.9045140743255615, + "learning_rate": 4.761613181269376e-05, + "loss": 5.5549, + "step": 23564 + }, + { + "epoch": 0.14014773051669996, + "grad_norm": 2.0041518211364746, + "learning_rate": 4.761593274752099e-05, + "loss": 5.7419, + "step": 23565 + }, + { + "epoch": 0.14015367779998097, + "grad_norm": 1.983040452003479, + "learning_rate": 4.761573367445323e-05, + "loss": 5.761, + "step": 23566 + }, + { + "epoch": 0.14015962508326196, + "grad_norm": 1.6701973676681519, + "learning_rate": 4.761553459349055e-05, + "loss": 5.8376, + "step": 23567 + }, + { + "epoch": 0.14016557236654295, + "grad_norm": 1.3928866386413574, + "learning_rate": 4.761533550463303e-05, + "loss": 5.7623, + "step": 23568 + }, + { + "epoch": 0.14017151964982397, + "grad_norm": 1.5971790552139282, + "learning_rate": 4.761513640788072e-05, + "loss": 5.6896, + "step": 23569 + }, + { + "epoch": 0.14017746693310495, + "grad_norm": 1.655540943145752, + "learning_rate": 4.76149373032337e-05, + "loss": 5.7311, + "step": 23570 + }, + { + "epoch": 0.14018341421638594, + "grad_norm": 1.6018282175064087, + "learning_rate": 4.761473819069204e-05, + "loss": 5.6966, + "step": 23571 + }, + { + "epoch": 0.14018936149966696, + "grad_norm": 2.0446600914001465, + "learning_rate": 4.7614539070255816e-05, + "loss": 4.7235, + "step": 23572 + }, + { + "epoch": 0.14019530878294795, + "grad_norm": 1.6043277978897095, + "learning_rate": 4.761433994192508e-05, + "loss": 5.1602, + "step": 23573 + }, + { + "epoch": 0.14020125606622894, + "grad_norm": 1.7339102029800415, + "learning_rate": 4.761414080569992e-05, + "loss": 4.6082, + "step": 23574 + }, + { + "epoch": 0.14020720334950995, + "grad_norm": 1.9234665632247925, + "learning_rate": 4.761394166158039e-05, + "loss": 5.1365, + "step": 23575 + }, + { + "epoch": 0.14021315063279094, + "grad_norm": 1.7816582918167114, + "learning_rate": 4.7613742509566574e-05, + "loss": 5.4685, + "step": 23576 + }, + { + "epoch": 0.14021909791607193, + "grad_norm": 2.230858564376831, + "learning_rate": 4.7613543349658526e-05, + "loss": 4.0433, + "step": 23577 + }, + { + "epoch": 0.14022504519935294, + "grad_norm": 2.088791847229004, + "learning_rate": 4.761334418185633e-05, + "loss": 4.0262, + "step": 23578 + }, + { + "epoch": 0.14023099248263393, + "grad_norm": 2.2880146503448486, + "learning_rate": 4.761314500616004e-05, + "loss": 3.8526, + "step": 23579 + }, + { + "epoch": 0.14023693976591492, + "grad_norm": 1.428227186203003, + "learning_rate": 4.7612945822569744e-05, + "loss": 5.538, + "step": 23580 + }, + { + "epoch": 0.14024288704919594, + "grad_norm": 1.5487463474273682, + "learning_rate": 4.76127466310855e-05, + "loss": 4.9206, + "step": 23581 + }, + { + "epoch": 0.14024883433247692, + "grad_norm": 1.7598581314086914, + "learning_rate": 4.761254743170738e-05, + "loss": 4.8871, + "step": 23582 + }, + { + "epoch": 0.1402547816157579, + "grad_norm": 1.8421943187713623, + "learning_rate": 4.7612348224435457e-05, + "loss": 4.9177, + "step": 23583 + }, + { + "epoch": 0.14026072889903893, + "grad_norm": 1.9214147329330444, + "learning_rate": 4.761214900926979e-05, + "loss": 5.148, + "step": 23584 + }, + { + "epoch": 0.14026667618231992, + "grad_norm": 1.8675332069396973, + "learning_rate": 4.761194978621045e-05, + "loss": 5.455, + "step": 23585 + }, + { + "epoch": 0.1402726234656009, + "grad_norm": 1.7940279245376587, + "learning_rate": 4.761175055525753e-05, + "loss": 5.3608, + "step": 23586 + }, + { + "epoch": 0.14027857074888192, + "grad_norm": 1.526066541671753, + "learning_rate": 4.761155131641107e-05, + "loss": 5.3672, + "step": 23587 + }, + { + "epoch": 0.1402845180321629, + "grad_norm": 1.7407697439193726, + "learning_rate": 4.761135206967115e-05, + "loss": 5.4809, + "step": 23588 + }, + { + "epoch": 0.1402904653154439, + "grad_norm": 1.8562800884246826, + "learning_rate": 4.761115281503784e-05, + "loss": 5.3086, + "step": 23589 + }, + { + "epoch": 0.1402964125987249, + "grad_norm": 1.7709288597106934, + "learning_rate": 4.7610953552511216e-05, + "loss": 4.8511, + "step": 23590 + }, + { + "epoch": 0.1403023598820059, + "grad_norm": 1.6407638788223267, + "learning_rate": 4.761075428209134e-05, + "loss": 4.7137, + "step": 23591 + }, + { + "epoch": 0.1403083071652869, + "grad_norm": 1.8322784900665283, + "learning_rate": 4.761055500377828e-05, + "loss": 5.1288, + "step": 23592 + }, + { + "epoch": 0.1403142544485679, + "grad_norm": 2.5631179809570312, + "learning_rate": 4.761035571757211e-05, + "loss": 3.9808, + "step": 23593 + }, + { + "epoch": 0.1403202017318489, + "grad_norm": 2.5823936462402344, + "learning_rate": 4.7610156423472895e-05, + "loss": 4.0532, + "step": 23594 + }, + { + "epoch": 0.14032614901512988, + "grad_norm": 3.3013498783111572, + "learning_rate": 4.760995712148072e-05, + "loss": 3.5222, + "step": 23595 + }, + { + "epoch": 0.1403320962984109, + "grad_norm": 2.8877291679382324, + "learning_rate": 4.760975781159563e-05, + "loss": 3.4662, + "step": 23596 + }, + { + "epoch": 0.14033804358169188, + "grad_norm": 2.757053852081299, + "learning_rate": 4.760955849381771e-05, + "loss": 2.9554, + "step": 23597 + }, + { + "epoch": 0.14034399086497287, + "grad_norm": 2.611163854598999, + "learning_rate": 4.760935916814703e-05, + "loss": 3.0722, + "step": 23598 + }, + { + "epoch": 0.1403499381482539, + "grad_norm": 2.5141069889068604, + "learning_rate": 4.760915983458366e-05, + "loss": 2.9377, + "step": 23599 + }, + { + "epoch": 0.14035588543153488, + "grad_norm": 2.88659930229187, + "learning_rate": 4.7608960493127655e-05, + "loss": 2.7086, + "step": 23600 + }, + { + "epoch": 0.14036183271481587, + "grad_norm": 1.4970325231552124, + "learning_rate": 4.7608761143779103e-05, + "loss": 5.279, + "step": 23601 + }, + { + "epoch": 0.14036777999809688, + "grad_norm": 1.883097767829895, + "learning_rate": 4.760856178653806e-05, + "loss": 4.9675, + "step": 23602 + }, + { + "epoch": 0.14037372728137787, + "grad_norm": 1.8045644760131836, + "learning_rate": 4.760836242140461e-05, + "loss": 4.9739, + "step": 23603 + }, + { + "epoch": 0.14037967456465886, + "grad_norm": 2.2752342224121094, + "learning_rate": 4.760816304837881e-05, + "loss": 5.1278, + "step": 23604 + }, + { + "epoch": 0.14038562184793987, + "grad_norm": 1.8345577716827393, + "learning_rate": 4.760796366746074e-05, + "loss": 5.232, + "step": 23605 + }, + { + "epoch": 0.14039156913122086, + "grad_norm": 1.6739290952682495, + "learning_rate": 4.760776427865046e-05, + "loss": 5.1867, + "step": 23606 + }, + { + "epoch": 0.14039751641450185, + "grad_norm": 1.8607251644134521, + "learning_rate": 4.760756488194803e-05, + "loss": 5.1918, + "step": 23607 + }, + { + "epoch": 0.14040346369778287, + "grad_norm": 1.852330207824707, + "learning_rate": 4.760736547735355e-05, + "loss": 5.1462, + "step": 23608 + }, + { + "epoch": 0.14040941098106385, + "grad_norm": 1.738235354423523, + "learning_rate": 4.760716606486706e-05, + "loss": 5.1607, + "step": 23609 + }, + { + "epoch": 0.14041535826434484, + "grad_norm": 1.7101359367370605, + "learning_rate": 4.760696664448865e-05, + "loss": 5.1047, + "step": 23610 + }, + { + "epoch": 0.14042130554762586, + "grad_norm": 1.618538737297058, + "learning_rate": 4.760676721621838e-05, + "loss": 5.034, + "step": 23611 + }, + { + "epoch": 0.14042725283090685, + "grad_norm": 1.5971029996871948, + "learning_rate": 4.760656778005632e-05, + "loss": 5.0689, + "step": 23612 + }, + { + "epoch": 0.14043320011418783, + "grad_norm": 1.7599228620529175, + "learning_rate": 4.760636833600254e-05, + "loss": 5.0584, + "step": 23613 + }, + { + "epoch": 0.14043914739746885, + "grad_norm": 1.7093656063079834, + "learning_rate": 4.7606168884057114e-05, + "loss": 5.0887, + "step": 23614 + }, + { + "epoch": 0.14044509468074984, + "grad_norm": 1.77159583568573, + "learning_rate": 4.760596942422011e-05, + "loss": 4.9885, + "step": 23615 + }, + { + "epoch": 0.14045104196403083, + "grad_norm": 1.6793224811553955, + "learning_rate": 4.7605769956491586e-05, + "loss": 5.7858, + "step": 23616 + }, + { + "epoch": 0.14045698924731181, + "grad_norm": 2.0000784397125244, + "learning_rate": 4.7605570480871624e-05, + "loss": 5.1434, + "step": 23617 + }, + { + "epoch": 0.14046293653059283, + "grad_norm": 1.777692437171936, + "learning_rate": 4.760537099736029e-05, + "loss": 5.237, + "step": 23618 + }, + { + "epoch": 0.14046888381387382, + "grad_norm": 1.7709475755691528, + "learning_rate": 4.760517150595766e-05, + "loss": 5.1844, + "step": 23619 + }, + { + "epoch": 0.1404748310971548, + "grad_norm": 1.5300654172897339, + "learning_rate": 4.76049720066638e-05, + "loss": 5.4657, + "step": 23620 + }, + { + "epoch": 0.14048077838043582, + "grad_norm": 1.5757399797439575, + "learning_rate": 4.7604772499478767e-05, + "loss": 5.7018, + "step": 23621 + }, + { + "epoch": 0.1404867256637168, + "grad_norm": 1.572698712348938, + "learning_rate": 4.760457298440265e-05, + "loss": 5.5974, + "step": 23622 + }, + { + "epoch": 0.1404926729469978, + "grad_norm": 1.7017083168029785, + "learning_rate": 4.760437346143551e-05, + "loss": 5.6591, + "step": 23623 + }, + { + "epoch": 0.14049862023027881, + "grad_norm": 1.496193528175354, + "learning_rate": 4.760417393057741e-05, + "loss": 5.603, + "step": 23624 + }, + { + "epoch": 0.1405045675135598, + "grad_norm": 1.5156760215759277, + "learning_rate": 4.760397439182843e-05, + "loss": 5.5561, + "step": 23625 + }, + { + "epoch": 0.1405105147968408, + "grad_norm": 1.520276665687561, + "learning_rate": 4.760377484518864e-05, + "loss": 5.6208, + "step": 23626 + }, + { + "epoch": 0.1405164620801218, + "grad_norm": 1.6519960165023804, + "learning_rate": 4.760357529065811e-05, + "loss": 5.6191, + "step": 23627 + }, + { + "epoch": 0.1405224093634028, + "grad_norm": 1.6115814447402954, + "learning_rate": 4.760337572823689e-05, + "loss": 5.6622, + "step": 23628 + }, + { + "epoch": 0.14052835664668378, + "grad_norm": 1.6744813919067383, + "learning_rate": 4.760317615792508e-05, + "loss": 4.9525, + "step": 23629 + }, + { + "epoch": 0.1405343039299648, + "grad_norm": 1.8949360847473145, + "learning_rate": 4.7602976579722725e-05, + "loss": 5.2284, + "step": 23630 + }, + { + "epoch": 0.1405402512132458, + "grad_norm": 1.7098066806793213, + "learning_rate": 4.760277699362991e-05, + "loss": 5.6612, + "step": 23631 + }, + { + "epoch": 0.14054619849652678, + "grad_norm": 2.258535861968994, + "learning_rate": 4.76025773996467e-05, + "loss": 5.3049, + "step": 23632 + }, + { + "epoch": 0.1405521457798078, + "grad_norm": 1.713905692100525, + "learning_rate": 4.760237779777316e-05, + "loss": 6.081, + "step": 23633 + }, + { + "epoch": 0.14055809306308878, + "grad_norm": 1.744905710220337, + "learning_rate": 4.760217818800936e-05, + "loss": 5.6269, + "step": 23634 + }, + { + "epoch": 0.14056404034636977, + "grad_norm": 2.032653570175171, + "learning_rate": 4.760197857035538e-05, + "loss": 4.8417, + "step": 23635 + }, + { + "epoch": 0.14056998762965078, + "grad_norm": 1.9457743167877197, + "learning_rate": 4.7601778944811275e-05, + "loss": 4.6145, + "step": 23636 + }, + { + "epoch": 0.14057593491293177, + "grad_norm": 2.0428082942962646, + "learning_rate": 4.760157931137713e-05, + "loss": 4.7341, + "step": 23637 + }, + { + "epoch": 0.14058188219621276, + "grad_norm": 1.8817776441574097, + "learning_rate": 4.7601379670053006e-05, + "loss": 4.4932, + "step": 23638 + }, + { + "epoch": 0.14058782947949378, + "grad_norm": 1.9882752895355225, + "learning_rate": 4.760118002083897e-05, + "loss": 4.5001, + "step": 23639 + }, + { + "epoch": 0.14059377676277476, + "grad_norm": 1.6730908155441284, + "learning_rate": 4.760098036373509e-05, + "loss": 4.2396, + "step": 23640 + }, + { + "epoch": 0.14059972404605575, + "grad_norm": 1.9490888118743896, + "learning_rate": 4.760078069874145e-05, + "loss": 4.2708, + "step": 23641 + }, + { + "epoch": 0.14060567132933677, + "grad_norm": 1.8162645101547241, + "learning_rate": 4.7600581025858114e-05, + "loss": 4.2507, + "step": 23642 + }, + { + "epoch": 0.14061161861261776, + "grad_norm": 1.9260125160217285, + "learning_rate": 4.760038134508514e-05, + "loss": 4.4647, + "step": 23643 + }, + { + "epoch": 0.14061756589589874, + "grad_norm": 1.892685055732727, + "learning_rate": 4.7600181656422616e-05, + "loss": 4.1241, + "step": 23644 + }, + { + "epoch": 0.14062351317917976, + "grad_norm": 1.625123143196106, + "learning_rate": 4.75999819598706e-05, + "loss": 4.3582, + "step": 23645 + }, + { + "epoch": 0.14062946046246075, + "grad_norm": 1.841758131980896, + "learning_rate": 4.759978225542916e-05, + "loss": 4.3403, + "step": 23646 + }, + { + "epoch": 0.14063540774574174, + "grad_norm": 1.8946552276611328, + "learning_rate": 4.759958254309837e-05, + "loss": 4.5008, + "step": 23647 + }, + { + "epoch": 0.14064135502902275, + "grad_norm": 1.7985520362854004, + "learning_rate": 4.75993828228783e-05, + "loss": 4.4869, + "step": 23648 + }, + { + "epoch": 0.14064730231230374, + "grad_norm": 1.823662519454956, + "learning_rate": 4.759918309476902e-05, + "loss": 4.6177, + "step": 23649 + }, + { + "epoch": 0.14065324959558473, + "grad_norm": 1.94038724899292, + "learning_rate": 4.75989833587706e-05, + "loss": 4.4979, + "step": 23650 + }, + { + "epoch": 0.14065919687886574, + "grad_norm": 1.9023078680038452, + "learning_rate": 4.75987836148831e-05, + "loss": 4.3507, + "step": 23651 + }, + { + "epoch": 0.14066514416214673, + "grad_norm": 1.917851448059082, + "learning_rate": 4.7598583863106606e-05, + "loss": 4.1841, + "step": 23652 + }, + { + "epoch": 0.14067109144542772, + "grad_norm": 1.8332593441009521, + "learning_rate": 4.759838410344117e-05, + "loss": 4.4705, + "step": 23653 + }, + { + "epoch": 0.14067703872870874, + "grad_norm": 1.7567338943481445, + "learning_rate": 4.759818433588689e-05, + "loss": 4.5008, + "step": 23654 + }, + { + "epoch": 0.14068298601198972, + "grad_norm": 1.9399288892745972, + "learning_rate": 4.75979845604438e-05, + "loss": 4.3969, + "step": 23655 + }, + { + "epoch": 0.1406889332952707, + "grad_norm": 1.7779430150985718, + "learning_rate": 4.7597784777112e-05, + "loss": 4.3292, + "step": 23656 + }, + { + "epoch": 0.14069488057855173, + "grad_norm": 1.802742600440979, + "learning_rate": 4.759758498589153e-05, + "loss": 5.0038, + "step": 23657 + }, + { + "epoch": 0.14070082786183272, + "grad_norm": 2.5247714519500732, + "learning_rate": 4.759738518678249e-05, + "loss": 5.0153, + "step": 23658 + }, + { + "epoch": 0.1407067751451137, + "grad_norm": 3.0549800395965576, + "learning_rate": 4.759718537978494e-05, + "loss": 4.6653, + "step": 23659 + }, + { + "epoch": 0.14071272242839472, + "grad_norm": 2.7805356979370117, + "learning_rate": 4.7596985564898935e-05, + "loss": 4.4669, + "step": 23660 + }, + { + "epoch": 0.1407186697116757, + "grad_norm": 2.404932737350464, + "learning_rate": 4.759678574212456e-05, + "loss": 4.6932, + "step": 23661 + }, + { + "epoch": 0.1407246169949567, + "grad_norm": 2.2168543338775635, + "learning_rate": 4.7596585911461875e-05, + "loss": 4.397, + "step": 23662 + }, + { + "epoch": 0.1407305642782377, + "grad_norm": 2.423726797103882, + "learning_rate": 4.759638607291097e-05, + "loss": 4.3534, + "step": 23663 + }, + { + "epoch": 0.1407365115615187, + "grad_norm": 2.1283328533172607, + "learning_rate": 4.759618622647188e-05, + "loss": 4.9248, + "step": 23664 + }, + { + "epoch": 0.1407424588447997, + "grad_norm": 1.6989446878433228, + "learning_rate": 4.7595986372144716e-05, + "loss": 5.4656, + "step": 23665 + }, + { + "epoch": 0.1407484061280807, + "grad_norm": 1.7057443857192993, + "learning_rate": 4.759578650992951e-05, + "loss": 5.193, + "step": 23666 + }, + { + "epoch": 0.1407543534113617, + "grad_norm": 2.3968324661254883, + "learning_rate": 4.7595586639826364e-05, + "loss": 5.132, + "step": 23667 + }, + { + "epoch": 0.14076030069464268, + "grad_norm": 1.7770966291427612, + "learning_rate": 4.7595386761835314e-05, + "loss": 4.8487, + "step": 23668 + }, + { + "epoch": 0.1407662479779237, + "grad_norm": 1.8165397644042969, + "learning_rate": 4.759518687595646e-05, + "loss": 4.9981, + "step": 23669 + }, + { + "epoch": 0.14077219526120469, + "grad_norm": 1.4801784753799438, + "learning_rate": 4.759498698218986e-05, + "loss": 5.0204, + "step": 23670 + }, + { + "epoch": 0.14077814254448567, + "grad_norm": 1.6488209962844849, + "learning_rate": 4.759478708053557e-05, + "loss": 4.9349, + "step": 23671 + }, + { + "epoch": 0.1407840898277667, + "grad_norm": 1.5207561254501343, + "learning_rate": 4.759458717099369e-05, + "loss": 4.9986, + "step": 23672 + }, + { + "epoch": 0.14079003711104768, + "grad_norm": 1.5029826164245605, + "learning_rate": 4.7594387253564263e-05, + "loss": 4.9708, + "step": 23673 + }, + { + "epoch": 0.14079598439432867, + "grad_norm": 1.6697144508361816, + "learning_rate": 4.7594187328247375e-05, + "loss": 4.9915, + "step": 23674 + }, + { + "epoch": 0.14080193167760965, + "grad_norm": 1.7437782287597656, + "learning_rate": 4.7593987395043085e-05, + "loss": 5.068, + "step": 23675 + }, + { + "epoch": 0.14080787896089067, + "grad_norm": 1.8639456033706665, + "learning_rate": 4.7593787453951475e-05, + "loss": 4.9861, + "step": 23676 + }, + { + "epoch": 0.14081382624417166, + "grad_norm": 1.7246698141098022, + "learning_rate": 4.75935875049726e-05, + "loss": 4.9547, + "step": 23677 + }, + { + "epoch": 0.14081977352745265, + "grad_norm": 1.764772891998291, + "learning_rate": 4.759338754810654e-05, + "loss": 4.7823, + "step": 23678 + }, + { + "epoch": 0.14082572081073366, + "grad_norm": 1.3609477281570435, + "learning_rate": 4.759318758335336e-05, + "loss": 4.9039, + "step": 23679 + }, + { + "epoch": 0.14083166809401465, + "grad_norm": 1.4477577209472656, + "learning_rate": 4.759298761071313e-05, + "loss": 4.7816, + "step": 23680 + }, + { + "epoch": 0.14083761537729564, + "grad_norm": 1.6295807361602783, + "learning_rate": 4.759278763018592e-05, + "loss": 4.641, + "step": 23681 + }, + { + "epoch": 0.14084356266057665, + "grad_norm": 1.7831028699874878, + "learning_rate": 4.7592587641771806e-05, + "loss": 4.8989, + "step": 23682 + }, + { + "epoch": 0.14084950994385764, + "grad_norm": 1.7806429862976074, + "learning_rate": 4.7592387645470845e-05, + "loss": 4.9344, + "step": 23683 + }, + { + "epoch": 0.14085545722713863, + "grad_norm": 2.0284979343414307, + "learning_rate": 4.759218764128313e-05, + "loss": 5.7399, + "step": 23684 + }, + { + "epoch": 0.14086140451041965, + "grad_norm": 1.853495717048645, + "learning_rate": 4.7591987629208706e-05, + "loss": 4.8495, + "step": 23685 + }, + { + "epoch": 0.14086735179370063, + "grad_norm": 1.6907382011413574, + "learning_rate": 4.759178760924765e-05, + "loss": 4.8365, + "step": 23686 + }, + { + "epoch": 0.14087329907698162, + "grad_norm": 1.7131983041763306, + "learning_rate": 4.7591587581400045e-05, + "loss": 4.8217, + "step": 23687 + }, + { + "epoch": 0.14087924636026264, + "grad_norm": 1.6896579265594482, + "learning_rate": 4.759138754566595e-05, + "loss": 5.4568, + "step": 23688 + }, + { + "epoch": 0.14088519364354363, + "grad_norm": 1.7312794923782349, + "learning_rate": 4.759118750204542e-05, + "loss": 5.7501, + "step": 23689 + }, + { + "epoch": 0.14089114092682462, + "grad_norm": 1.494137167930603, + "learning_rate": 4.759098745053855e-05, + "loss": 5.526, + "step": 23690 + }, + { + "epoch": 0.14089708821010563, + "grad_norm": 2.2159650325775146, + "learning_rate": 4.75907873911454e-05, + "loss": 5.3686, + "step": 23691 + }, + { + "epoch": 0.14090303549338662, + "grad_norm": 2.0564072132110596, + "learning_rate": 4.759058732386603e-05, + "loss": 5.2311, + "step": 23692 + }, + { + "epoch": 0.1409089827766676, + "grad_norm": 2.5233311653137207, + "learning_rate": 4.759038724870053e-05, + "loss": 4.7775, + "step": 23693 + }, + { + "epoch": 0.14091493005994862, + "grad_norm": 2.180325984954834, + "learning_rate": 4.7590187165648956e-05, + "loss": 4.8106, + "step": 23694 + }, + { + "epoch": 0.1409208773432296, + "grad_norm": 2.1391143798828125, + "learning_rate": 4.758998707471138e-05, + "loss": 4.741, + "step": 23695 + }, + { + "epoch": 0.1409268246265106, + "grad_norm": 1.9628124237060547, + "learning_rate": 4.758978697588787e-05, + "loss": 4.7177, + "step": 23696 + }, + { + "epoch": 0.14093277190979162, + "grad_norm": 2.1324729919433594, + "learning_rate": 4.7589586869178506e-05, + "loss": 4.8006, + "step": 23697 + }, + { + "epoch": 0.1409387191930726, + "grad_norm": 1.9791810512542725, + "learning_rate": 4.758938675458335e-05, + "loss": 4.6171, + "step": 23698 + }, + { + "epoch": 0.1409446664763536, + "grad_norm": 1.8566325902938843, + "learning_rate": 4.758918663210247e-05, + "loss": 5.0375, + "step": 23699 + }, + { + "epoch": 0.1409506137596346, + "grad_norm": 2.3218674659729004, + "learning_rate": 4.758898650173593e-05, + "loss": 5.2169, + "step": 23700 + }, + { + "epoch": 0.1409565610429156, + "grad_norm": 2.0162737369537354, + "learning_rate": 4.7588786363483816e-05, + "loss": 4.8988, + "step": 23701 + }, + { + "epoch": 0.14096250832619658, + "grad_norm": 2.1534879207611084, + "learning_rate": 4.7588586217346197e-05, + "loss": 4.9911, + "step": 23702 + }, + { + "epoch": 0.1409684556094776, + "grad_norm": 2.16445255279541, + "learning_rate": 4.7588386063323134e-05, + "loss": 4.9501, + "step": 23703 + }, + { + "epoch": 0.1409744028927586, + "grad_norm": 1.9189707040786743, + "learning_rate": 4.7588185901414684e-05, + "loss": 4.9125, + "step": 23704 + }, + { + "epoch": 0.14098035017603958, + "grad_norm": 2.1000189781188965, + "learning_rate": 4.7587985731620945e-05, + "loss": 5.002, + "step": 23705 + }, + { + "epoch": 0.1409862974593206, + "grad_norm": 2.0911948680877686, + "learning_rate": 4.7587785553941974e-05, + "loss": 5.0206, + "step": 23706 + }, + { + "epoch": 0.14099224474260158, + "grad_norm": 1.9519456624984741, + "learning_rate": 4.758758536837783e-05, + "loss": 4.5715, + "step": 23707 + }, + { + "epoch": 0.14099819202588257, + "grad_norm": 2.1036672592163086, + "learning_rate": 4.75873851749286e-05, + "loss": 4.7427, + "step": 23708 + }, + { + "epoch": 0.14100413930916358, + "grad_norm": 1.6662368774414062, + "learning_rate": 4.7587184973594354e-05, + "loss": 5.1132, + "step": 23709 + }, + { + "epoch": 0.14101008659244457, + "grad_norm": 1.5314775705337524, + "learning_rate": 4.758698476437514e-05, + "loss": 5.6674, + "step": 23710 + }, + { + "epoch": 0.14101603387572556, + "grad_norm": 1.7167651653289795, + "learning_rate": 4.7586784547271056e-05, + "loss": 5.74, + "step": 23711 + }, + { + "epoch": 0.14102198115900658, + "grad_norm": 1.6126611232757568, + "learning_rate": 4.758658432228216e-05, + "loss": 5.7798, + "step": 23712 + }, + { + "epoch": 0.14102792844228756, + "grad_norm": 1.5236903429031372, + "learning_rate": 4.758638408940851e-05, + "loss": 5.3924, + "step": 23713 + }, + { + "epoch": 0.14103387572556855, + "grad_norm": 1.7352653741836548, + "learning_rate": 4.758618384865019e-05, + "loss": 5.3551, + "step": 23714 + }, + { + "epoch": 0.14103982300884957, + "grad_norm": 2.1185758113861084, + "learning_rate": 4.758598360000727e-05, + "loss": 4.5986, + "step": 23715 + }, + { + "epoch": 0.14104577029213056, + "grad_norm": 2.0252137184143066, + "learning_rate": 4.758578334347981e-05, + "loss": 5.5963, + "step": 23716 + }, + { + "epoch": 0.14105171757541154, + "grad_norm": 2.1225454807281494, + "learning_rate": 4.75855830790679e-05, + "loss": 5.1949, + "step": 23717 + }, + { + "epoch": 0.14105766485869256, + "grad_norm": 2.7703025341033936, + "learning_rate": 4.7585382806771585e-05, + "loss": 4.4741, + "step": 23718 + }, + { + "epoch": 0.14106361214197355, + "grad_norm": 1.6570090055465698, + "learning_rate": 4.758518252659094e-05, + "loss": 4.8543, + "step": 23719 + }, + { + "epoch": 0.14106955942525454, + "grad_norm": 1.759743571281433, + "learning_rate": 4.7584982238526053e-05, + "loss": 4.7901, + "step": 23720 + }, + { + "epoch": 0.14107550670853555, + "grad_norm": 1.562591314315796, + "learning_rate": 4.7584781942576976e-05, + "loss": 5.351, + "step": 23721 + }, + { + "epoch": 0.14108145399181654, + "grad_norm": 1.279597520828247, + "learning_rate": 4.758458163874379e-05, + "loss": 6.0303, + "step": 23722 + }, + { + "epoch": 0.14108740127509753, + "grad_norm": 1.3173538446426392, + "learning_rate": 4.758438132702656e-05, + "loss": 6.015, + "step": 23723 + }, + { + "epoch": 0.14109334855837855, + "grad_norm": 1.4862935543060303, + "learning_rate": 4.7584181007425354e-05, + "loss": 5.6649, + "step": 23724 + }, + { + "epoch": 0.14109929584165953, + "grad_norm": 1.8398306369781494, + "learning_rate": 4.7583980679940244e-05, + "loss": 5.3897, + "step": 23725 + }, + { + "epoch": 0.14110524312494052, + "grad_norm": 2.02359676361084, + "learning_rate": 4.758378034457129e-05, + "loss": 5.8195, + "step": 23726 + }, + { + "epoch": 0.14111119040822154, + "grad_norm": 2.131068706512451, + "learning_rate": 4.758358000131858e-05, + "loss": 5.693, + "step": 23727 + }, + { + "epoch": 0.14111713769150253, + "grad_norm": 2.144928455352783, + "learning_rate": 4.7583379650182184e-05, + "loss": 5.4745, + "step": 23728 + }, + { + "epoch": 0.1411230849747835, + "grad_norm": 2.043093681335449, + "learning_rate": 4.758317929116215e-05, + "loss": 5.5877, + "step": 23729 + }, + { + "epoch": 0.14112903225806453, + "grad_norm": 1.7879455089569092, + "learning_rate": 4.758297892425857e-05, + "loss": 5.5822, + "step": 23730 + }, + { + "epoch": 0.14113497954134552, + "grad_norm": 1.6113840341567993, + "learning_rate": 4.7582778549471494e-05, + "loss": 5.2861, + "step": 23731 + }, + { + "epoch": 0.1411409268246265, + "grad_norm": 1.6712645292282104, + "learning_rate": 4.7582578166801015e-05, + "loss": 5.1185, + "step": 23732 + }, + { + "epoch": 0.1411468741079075, + "grad_norm": 1.6905531883239746, + "learning_rate": 4.758237777624719e-05, + "loss": 5.3339, + "step": 23733 + }, + { + "epoch": 0.1411528213911885, + "grad_norm": 2.058136224746704, + "learning_rate": 4.758217737781009e-05, + "loss": 4.6243, + "step": 23734 + }, + { + "epoch": 0.1411587686744695, + "grad_norm": 1.9609389305114746, + "learning_rate": 4.758197697148978e-05, + "loss": 4.7675, + "step": 23735 + }, + { + "epoch": 0.1411647159577505, + "grad_norm": 1.947270154953003, + "learning_rate": 4.758177655728634e-05, + "loss": 4.6854, + "step": 23736 + }, + { + "epoch": 0.1411706632410315, + "grad_norm": 2.0735461711883545, + "learning_rate": 4.7581576135199834e-05, + "loss": 4.9539, + "step": 23737 + }, + { + "epoch": 0.1411766105243125, + "grad_norm": 2.0236589908599854, + "learning_rate": 4.758137570523033e-05, + "loss": 5.0488, + "step": 23738 + }, + { + "epoch": 0.14118255780759348, + "grad_norm": 2.1183953285217285, + "learning_rate": 4.7581175267377906e-05, + "loss": 4.9358, + "step": 23739 + }, + { + "epoch": 0.1411885050908745, + "grad_norm": 2.0142831802368164, + "learning_rate": 4.758097482164262e-05, + "loss": 4.8333, + "step": 23740 + }, + { + "epoch": 0.14119445237415548, + "grad_norm": 2.204681634902954, + "learning_rate": 4.758077436802455e-05, + "loss": 4.8852, + "step": 23741 + }, + { + "epoch": 0.14120039965743647, + "grad_norm": 2.216187000274658, + "learning_rate": 4.7580573906523774e-05, + "loss": 5.0268, + "step": 23742 + }, + { + "epoch": 0.1412063469407175, + "grad_norm": 2.1434781551361084, + "learning_rate": 4.7580373437140343e-05, + "loss": 4.9048, + "step": 23743 + }, + { + "epoch": 0.14121229422399847, + "grad_norm": 1.8260117769241333, + "learning_rate": 4.758017295987435e-05, + "loss": 5.0481, + "step": 23744 + }, + { + "epoch": 0.14121824150727946, + "grad_norm": 2.2184064388275146, + "learning_rate": 4.757997247472584e-05, + "loss": 4.8967, + "step": 23745 + }, + { + "epoch": 0.14122418879056048, + "grad_norm": 1.8644381761550903, + "learning_rate": 4.75797719816949e-05, + "loss": 5.1945, + "step": 23746 + }, + { + "epoch": 0.14123013607384147, + "grad_norm": 2.0591354370117188, + "learning_rate": 4.757957148078159e-05, + "loss": 4.8916, + "step": 23747 + }, + { + "epoch": 0.14123608335712245, + "grad_norm": 2.429004669189453, + "learning_rate": 4.7579370971985986e-05, + "loss": 4.555, + "step": 23748 + }, + { + "epoch": 0.14124203064040347, + "grad_norm": 2.451037883758545, + "learning_rate": 4.757917045530816e-05, + "loss": 4.663, + "step": 23749 + }, + { + "epoch": 0.14124797792368446, + "grad_norm": 1.8227989673614502, + "learning_rate": 4.7578969930748176e-05, + "loss": 5.6976, + "step": 23750 + }, + { + "epoch": 0.14125392520696545, + "grad_norm": 1.8706707954406738, + "learning_rate": 4.757876939830611e-05, + "loss": 6.0974, + "step": 23751 + }, + { + "epoch": 0.14125987249024646, + "grad_norm": 1.7714571952819824, + "learning_rate": 4.7578568857982025e-05, + "loss": 5.5516, + "step": 23752 + }, + { + "epoch": 0.14126581977352745, + "grad_norm": 2.067776679992676, + "learning_rate": 4.7578368309776e-05, + "loss": 5.296, + "step": 23753 + }, + { + "epoch": 0.14127176705680844, + "grad_norm": 1.9231433868408203, + "learning_rate": 4.7578167753688095e-05, + "loss": 5.1286, + "step": 23754 + }, + { + "epoch": 0.14127771434008946, + "grad_norm": 2.0858731269836426, + "learning_rate": 4.7577967189718386e-05, + "loss": 4.717, + "step": 23755 + }, + { + "epoch": 0.14128366162337044, + "grad_norm": 2.173215627670288, + "learning_rate": 4.757776661786694e-05, + "loss": 4.6995, + "step": 23756 + }, + { + "epoch": 0.14128960890665143, + "grad_norm": 2.008244037628174, + "learning_rate": 4.7577566038133834e-05, + "loss": 4.4147, + "step": 23757 + }, + { + "epoch": 0.14129555618993245, + "grad_norm": 1.9767186641693115, + "learning_rate": 4.757736545051913e-05, + "loss": 4.9901, + "step": 23758 + }, + { + "epoch": 0.14130150347321344, + "grad_norm": 1.860136866569519, + "learning_rate": 4.7577164855022905e-05, + "loss": 4.7252, + "step": 23759 + }, + { + "epoch": 0.14130745075649442, + "grad_norm": 1.9243319034576416, + "learning_rate": 4.757696425164522e-05, + "loss": 4.6387, + "step": 23760 + }, + { + "epoch": 0.14131339803977544, + "grad_norm": 1.9811434745788574, + "learning_rate": 4.7576763640386155e-05, + "loss": 4.7365, + "step": 23761 + }, + { + "epoch": 0.14131934532305643, + "grad_norm": 2.1552014350891113, + "learning_rate": 4.757656302124577e-05, + "loss": 4.4764, + "step": 23762 + }, + { + "epoch": 0.14132529260633742, + "grad_norm": 1.8660786151885986, + "learning_rate": 4.757636239422414e-05, + "loss": 4.6108, + "step": 23763 + }, + { + "epoch": 0.14133123988961843, + "grad_norm": 2.0548014640808105, + "learning_rate": 4.757616175932134e-05, + "loss": 4.3871, + "step": 23764 + }, + { + "epoch": 0.14133718717289942, + "grad_norm": 2.107966184616089, + "learning_rate": 4.757596111653743e-05, + "loss": 4.3013, + "step": 23765 + }, + { + "epoch": 0.1413431344561804, + "grad_norm": 2.062649726867676, + "learning_rate": 4.757576046587249e-05, + "loss": 4.3352, + "step": 23766 + }, + { + "epoch": 0.14134908173946142, + "grad_norm": 1.9424866437911987, + "learning_rate": 4.7575559807326584e-05, + "loss": 4.5538, + "step": 23767 + }, + { + "epoch": 0.1413550290227424, + "grad_norm": 1.9787993431091309, + "learning_rate": 4.757535914089978e-05, + "loss": 4.7105, + "step": 23768 + }, + { + "epoch": 0.1413609763060234, + "grad_norm": 2.3590548038482666, + "learning_rate": 4.7575158466592154e-05, + "loss": 4.5962, + "step": 23769 + }, + { + "epoch": 0.14136692358930442, + "grad_norm": 2.3521318435668945, + "learning_rate": 4.757495778440377e-05, + "loss": 4.8107, + "step": 23770 + }, + { + "epoch": 0.1413728708725854, + "grad_norm": 2.079169273376465, + "learning_rate": 4.7574757094334696e-05, + "loss": 4.6617, + "step": 23771 + }, + { + "epoch": 0.1413788181558664, + "grad_norm": 2.020505428314209, + "learning_rate": 4.757455639638502e-05, + "loss": 4.9402, + "step": 23772 + }, + { + "epoch": 0.1413847654391474, + "grad_norm": 1.8023982048034668, + "learning_rate": 4.75743556905548e-05, + "loss": 5.7173, + "step": 23773 + }, + { + "epoch": 0.1413907127224284, + "grad_norm": 1.471612572669983, + "learning_rate": 4.75741549768441e-05, + "loss": 5.6359, + "step": 23774 + }, + { + "epoch": 0.14139666000570938, + "grad_norm": 1.691918969154358, + "learning_rate": 4.7573954255252996e-05, + "loss": 5.6043, + "step": 23775 + }, + { + "epoch": 0.1414026072889904, + "grad_norm": 1.5347981452941895, + "learning_rate": 4.757375352578156e-05, + "loss": 5.9488, + "step": 23776 + }, + { + "epoch": 0.1414085545722714, + "grad_norm": 1.6003544330596924, + "learning_rate": 4.757355278842985e-05, + "loss": 5.4831, + "step": 23777 + }, + { + "epoch": 0.14141450185555238, + "grad_norm": 1.868674397468567, + "learning_rate": 4.757335204319796e-05, + "loss": 5.3372, + "step": 23778 + }, + { + "epoch": 0.1414204491388334, + "grad_norm": 1.827628254890442, + "learning_rate": 4.7573151290085935e-05, + "loss": 5.2977, + "step": 23779 + }, + { + "epoch": 0.14142639642211438, + "grad_norm": 1.80328369140625, + "learning_rate": 4.757295052909386e-05, + "loss": 5.2484, + "step": 23780 + }, + { + "epoch": 0.14143234370539537, + "grad_norm": 1.7244900465011597, + "learning_rate": 4.7572749760221815e-05, + "loss": 5.341, + "step": 23781 + }, + { + "epoch": 0.14143829098867639, + "grad_norm": 1.6203787326812744, + "learning_rate": 4.757254898346984e-05, + "loss": 5.1993, + "step": 23782 + }, + { + "epoch": 0.14144423827195737, + "grad_norm": 1.7411043643951416, + "learning_rate": 4.7572348198838026e-05, + "loss": 5.177, + "step": 23783 + }, + { + "epoch": 0.14145018555523836, + "grad_norm": 1.6770362854003906, + "learning_rate": 4.7572147406326435e-05, + "loss": 5.2169, + "step": 23784 + }, + { + "epoch": 0.14145613283851938, + "grad_norm": 1.6283633708953857, + "learning_rate": 4.7571946605935146e-05, + "loss": 5.1338, + "step": 23785 + }, + { + "epoch": 0.14146208012180037, + "grad_norm": 1.601276159286499, + "learning_rate": 4.7571745797664215e-05, + "loss": 5.0783, + "step": 23786 + }, + { + "epoch": 0.14146802740508135, + "grad_norm": 1.7484774589538574, + "learning_rate": 4.757154498151373e-05, + "loss": 5.106, + "step": 23787 + }, + { + "epoch": 0.14147397468836237, + "grad_norm": 1.8326083421707153, + "learning_rate": 4.7571344157483744e-05, + "loss": 5.0202, + "step": 23788 + }, + { + "epoch": 0.14147992197164336, + "grad_norm": 1.7564448118209839, + "learning_rate": 4.757114332557434e-05, + "loss": 5.0854, + "step": 23789 + }, + { + "epoch": 0.14148586925492435, + "grad_norm": 1.776414394378662, + "learning_rate": 4.757094248578558e-05, + "loss": 5.049, + "step": 23790 + }, + { + "epoch": 0.14149181653820536, + "grad_norm": 1.6053420305252075, + "learning_rate": 4.757074163811754e-05, + "loss": 5.1644, + "step": 23791 + }, + { + "epoch": 0.14149776382148635, + "grad_norm": 1.9419928789138794, + "learning_rate": 4.7570540782570295e-05, + "loss": 5.6868, + "step": 23792 + }, + { + "epoch": 0.14150371110476734, + "grad_norm": 1.8629308938980103, + "learning_rate": 4.757033991914389e-05, + "loss": 5.6614, + "step": 23793 + }, + { + "epoch": 0.14150965838804833, + "grad_norm": 1.745348572731018, + "learning_rate": 4.757013904783842e-05, + "loss": 5.6742, + "step": 23794 + }, + { + "epoch": 0.14151560567132934, + "grad_norm": 1.8093681335449219, + "learning_rate": 4.756993816865396e-05, + "loss": 5.8902, + "step": 23795 + }, + { + "epoch": 0.14152155295461033, + "grad_norm": 1.8000177145004272, + "learning_rate": 4.7569737281590554e-05, + "loss": 5.7025, + "step": 23796 + }, + { + "epoch": 0.14152750023789132, + "grad_norm": 1.7782033681869507, + "learning_rate": 4.756953638664829e-05, + "loss": 5.492, + "step": 23797 + }, + { + "epoch": 0.14153344752117233, + "grad_norm": 1.7651612758636475, + "learning_rate": 4.756933548382723e-05, + "loss": 4.8989, + "step": 23798 + }, + { + "epoch": 0.14153939480445332, + "grad_norm": 2.0286474227905273, + "learning_rate": 4.756913457312745e-05, + "loss": 4.5672, + "step": 23799 + }, + { + "epoch": 0.1415453420877343, + "grad_norm": 2.361325740814209, + "learning_rate": 4.756893365454902e-05, + "loss": 4.6471, + "step": 23800 + }, + { + "epoch": 0.14155128937101533, + "grad_norm": 1.8565771579742432, + "learning_rate": 4.756873272809202e-05, + "loss": 4.589, + "step": 23801 + }, + { + "epoch": 0.14155723665429631, + "grad_norm": 1.895958662033081, + "learning_rate": 4.756853179375649e-05, + "loss": 4.4608, + "step": 23802 + }, + { + "epoch": 0.1415631839375773, + "grad_norm": 2.103283166885376, + "learning_rate": 4.756833085154252e-05, + "loss": 4.3885, + "step": 23803 + }, + { + "epoch": 0.14156913122085832, + "grad_norm": 2.0823607444763184, + "learning_rate": 4.756812990145019e-05, + "loss": 4.307, + "step": 23804 + }, + { + "epoch": 0.1415750785041393, + "grad_norm": 1.852010726928711, + "learning_rate": 4.7567928943479546e-05, + "loss": 4.7289, + "step": 23805 + }, + { + "epoch": 0.1415810257874203, + "grad_norm": 1.6223875284194946, + "learning_rate": 4.7567727977630685e-05, + "loss": 5.5772, + "step": 23806 + }, + { + "epoch": 0.1415869730707013, + "grad_norm": 1.9508872032165527, + "learning_rate": 4.756752700390366e-05, + "loss": 5.3001, + "step": 23807 + }, + { + "epoch": 0.1415929203539823, + "grad_norm": 1.6098484992980957, + "learning_rate": 4.756732602229853e-05, + "loss": 5.2318, + "step": 23808 + }, + { + "epoch": 0.1415988676372633, + "grad_norm": 1.4197050333023071, + "learning_rate": 4.7567125032815394e-05, + "loss": 4.9752, + "step": 23809 + }, + { + "epoch": 0.1416048149205443, + "grad_norm": 1.5325055122375488, + "learning_rate": 4.7566924035454305e-05, + "loss": 5.0156, + "step": 23810 + }, + { + "epoch": 0.1416107622038253, + "grad_norm": 1.7188338041305542, + "learning_rate": 4.7566723030215335e-05, + "loss": 5.3756, + "step": 23811 + }, + { + "epoch": 0.14161670948710628, + "grad_norm": 1.779646396636963, + "learning_rate": 4.756652201709856e-05, + "loss": 5.3844, + "step": 23812 + }, + { + "epoch": 0.1416226567703873, + "grad_norm": 1.913001298904419, + "learning_rate": 4.756632099610404e-05, + "loss": 5.2604, + "step": 23813 + }, + { + "epoch": 0.14162860405366828, + "grad_norm": 1.5379444360733032, + "learning_rate": 4.7566119967231846e-05, + "loss": 5.4184, + "step": 23814 + }, + { + "epoch": 0.14163455133694927, + "grad_norm": 2.5433242321014404, + "learning_rate": 4.756591893048206e-05, + "loss": 5.076, + "step": 23815 + }, + { + "epoch": 0.1416404986202303, + "grad_norm": 2.0431840419769287, + "learning_rate": 4.756571788585474e-05, + "loss": 5.0766, + "step": 23816 + }, + { + "epoch": 0.14164644590351128, + "grad_norm": 2.5103769302368164, + "learning_rate": 4.7565516833349964e-05, + "loss": 4.9539, + "step": 23817 + }, + { + "epoch": 0.14165239318679226, + "grad_norm": 1.563063383102417, + "learning_rate": 4.75653157729678e-05, + "loss": 5.4752, + "step": 23818 + }, + { + "epoch": 0.14165834047007328, + "grad_norm": 1.8695935010910034, + "learning_rate": 4.756511470470832e-05, + "loss": 5.4486, + "step": 23819 + }, + { + "epoch": 0.14166428775335427, + "grad_norm": 2.092947244644165, + "learning_rate": 4.756491362857158e-05, + "loss": 5.1404, + "step": 23820 + }, + { + "epoch": 0.14167023503663526, + "grad_norm": 1.8582149744033813, + "learning_rate": 4.756471254455768e-05, + "loss": 5.0814, + "step": 23821 + }, + { + "epoch": 0.14167618231991627, + "grad_norm": 3.3430545330047607, + "learning_rate": 4.756451145266666e-05, + "loss": 5.2346, + "step": 23822 + }, + { + "epoch": 0.14168212960319726, + "grad_norm": 2.023859977722168, + "learning_rate": 4.75643103528986e-05, + "loss": 5.1639, + "step": 23823 + }, + { + "epoch": 0.14168807688647825, + "grad_norm": 2.0848581790924072, + "learning_rate": 4.756410924525358e-05, + "loss": 5.1314, + "step": 23824 + }, + { + "epoch": 0.14169402416975926, + "grad_norm": 2.2708516120910645, + "learning_rate": 4.7563908129731663e-05, + "loss": 5.1218, + "step": 23825 + }, + { + "epoch": 0.14169997145304025, + "grad_norm": 1.9105170965194702, + "learning_rate": 4.7563707006332905e-05, + "loss": 5.0428, + "step": 23826 + }, + { + "epoch": 0.14170591873632124, + "grad_norm": 1.9914016723632812, + "learning_rate": 4.75635058750574e-05, + "loss": 5.0497, + "step": 23827 + }, + { + "epoch": 0.14171186601960226, + "grad_norm": 1.9820994138717651, + "learning_rate": 4.756330473590521e-05, + "loss": 5.1161, + "step": 23828 + }, + { + "epoch": 0.14171781330288324, + "grad_norm": 1.7676537036895752, + "learning_rate": 4.75631035888764e-05, + "loss": 5.0291, + "step": 23829 + }, + { + "epoch": 0.14172376058616423, + "grad_norm": 1.9614083766937256, + "learning_rate": 4.7562902433971046e-05, + "loss": 5.3574, + "step": 23830 + }, + { + "epoch": 0.14172970786944525, + "grad_norm": 1.4212971925735474, + "learning_rate": 4.756270127118921e-05, + "loss": 5.8053, + "step": 23831 + }, + { + "epoch": 0.14173565515272624, + "grad_norm": 1.6015945672988892, + "learning_rate": 4.7562500100530984e-05, + "loss": 5.9339, + "step": 23832 + }, + { + "epoch": 0.14174160243600722, + "grad_norm": 1.6133309602737427, + "learning_rate": 4.7562298921996405e-05, + "loss": 5.4939, + "step": 23833 + }, + { + "epoch": 0.14174754971928824, + "grad_norm": 1.514958381652832, + "learning_rate": 4.7562097735585565e-05, + "loss": 5.649, + "step": 23834 + }, + { + "epoch": 0.14175349700256923, + "grad_norm": 1.912479281425476, + "learning_rate": 4.756189654129853e-05, + "loss": 5.5304, + "step": 23835 + }, + { + "epoch": 0.14175944428585022, + "grad_norm": 2.149765968322754, + "learning_rate": 4.756169533913538e-05, + "loss": 5.4228, + "step": 23836 + }, + { + "epoch": 0.14176539156913123, + "grad_norm": 1.8468290567398071, + "learning_rate": 4.756149412909616e-05, + "loss": 5.4605, + "step": 23837 + }, + { + "epoch": 0.14177133885241222, + "grad_norm": 1.670300841331482, + "learning_rate": 4.756129291118097e-05, + "loss": 5.4537, + "step": 23838 + }, + { + "epoch": 0.1417772861356932, + "grad_norm": 1.8857238292694092, + "learning_rate": 4.756109168538985e-05, + "loss": 5.2654, + "step": 23839 + }, + { + "epoch": 0.14178323341897422, + "grad_norm": 1.9114692211151123, + "learning_rate": 4.7560890451722894e-05, + "loss": 5.3255, + "step": 23840 + }, + { + "epoch": 0.1417891807022552, + "grad_norm": 1.654356598854065, + "learning_rate": 4.7560689210180164e-05, + "loss": 5.2983, + "step": 23841 + }, + { + "epoch": 0.1417951279855362, + "grad_norm": 1.9302277565002441, + "learning_rate": 4.7560487960761734e-05, + "loss": 5.7902, + "step": 23842 + }, + { + "epoch": 0.14180107526881722, + "grad_norm": 1.8009575605392456, + "learning_rate": 4.7560286703467674e-05, + "loss": 5.7359, + "step": 23843 + }, + { + "epoch": 0.1418070225520982, + "grad_norm": 1.4472894668579102, + "learning_rate": 4.7560085438298043e-05, + "loss": 5.813, + "step": 23844 + }, + { + "epoch": 0.1418129698353792, + "grad_norm": 1.6131559610366821, + "learning_rate": 4.755988416525292e-05, + "loss": 5.7525, + "step": 23845 + }, + { + "epoch": 0.1418189171186602, + "grad_norm": 1.4684244394302368, + "learning_rate": 4.755968288433237e-05, + "loss": 5.7649, + "step": 23846 + }, + { + "epoch": 0.1418248644019412, + "grad_norm": 1.369974970817566, + "learning_rate": 4.755948159553647e-05, + "loss": 5.666, + "step": 23847 + }, + { + "epoch": 0.14183081168522219, + "grad_norm": 1.6687818765640259, + "learning_rate": 4.755928029886529e-05, + "loss": 5.5685, + "step": 23848 + }, + { + "epoch": 0.1418367589685032, + "grad_norm": 2.011798858642578, + "learning_rate": 4.755907899431891e-05, + "loss": 6.0011, + "step": 23849 + }, + { + "epoch": 0.1418427062517842, + "grad_norm": 2.1938908100128174, + "learning_rate": 4.7558877681897376e-05, + "loss": 5.4987, + "step": 23850 + }, + { + "epoch": 0.14184865353506518, + "grad_norm": 1.9103244543075562, + "learning_rate": 4.7558676361600774e-05, + "loss": 5.5061, + "step": 23851 + }, + { + "epoch": 0.14185460081834617, + "grad_norm": 1.850809097290039, + "learning_rate": 4.7558475033429165e-05, + "loss": 5.4346, + "step": 23852 + }, + { + "epoch": 0.14186054810162718, + "grad_norm": 1.6861615180969238, + "learning_rate": 4.755827369738263e-05, + "loss": 5.5082, + "step": 23853 + }, + { + "epoch": 0.14186649538490817, + "grad_norm": 1.532423496246338, + "learning_rate": 4.7558072353461236e-05, + "loss": 5.704, + "step": 23854 + }, + { + "epoch": 0.14187244266818916, + "grad_norm": 1.6446877717971802, + "learning_rate": 4.755787100166506e-05, + "loss": 5.7046, + "step": 23855 + }, + { + "epoch": 0.14187838995147017, + "grad_norm": 1.599294662475586, + "learning_rate": 4.7557669641994144e-05, + "loss": 5.7324, + "step": 23856 + }, + { + "epoch": 0.14188433723475116, + "grad_norm": 1.8838186264038086, + "learning_rate": 4.7557468274448594e-05, + "loss": 5.5496, + "step": 23857 + }, + { + "epoch": 0.14189028451803215, + "grad_norm": 1.8579468727111816, + "learning_rate": 4.7557266899028464e-05, + "loss": 5.6645, + "step": 23858 + }, + { + "epoch": 0.14189623180131317, + "grad_norm": 2.02162766456604, + "learning_rate": 4.7557065515733815e-05, + "loss": 5.7992, + "step": 23859 + }, + { + "epoch": 0.14190217908459415, + "grad_norm": 1.559417486190796, + "learning_rate": 4.755686412456474e-05, + "loss": 5.6176, + "step": 23860 + }, + { + "epoch": 0.14190812636787514, + "grad_norm": 1.5074375867843628, + "learning_rate": 4.755666272552129e-05, + "loss": 5.3933, + "step": 23861 + }, + { + "epoch": 0.14191407365115616, + "grad_norm": 1.521987795829773, + "learning_rate": 4.755646131860354e-05, + "loss": 5.834, + "step": 23862 + }, + { + "epoch": 0.14192002093443715, + "grad_norm": 1.7396782636642456, + "learning_rate": 4.755625990381157e-05, + "loss": 5.149, + "step": 23863 + }, + { + "epoch": 0.14192596821771813, + "grad_norm": 1.7040945291519165, + "learning_rate": 4.755605848114544e-05, + "loss": 5.1569, + "step": 23864 + }, + { + "epoch": 0.14193191550099915, + "grad_norm": 1.7336739301681519, + "learning_rate": 4.7555857050605217e-05, + "loss": 5.1509, + "step": 23865 + }, + { + "epoch": 0.14193786278428014, + "grad_norm": 1.6548901796340942, + "learning_rate": 4.755565561219099e-05, + "loss": 4.9829, + "step": 23866 + }, + { + "epoch": 0.14194381006756113, + "grad_norm": 1.9203529357910156, + "learning_rate": 4.7555454165902804e-05, + "loss": 4.8946, + "step": 23867 + }, + { + "epoch": 0.14194975735084214, + "grad_norm": 1.8711525201797485, + "learning_rate": 4.755525271174074e-05, + "loss": 4.9691, + "step": 23868 + }, + { + "epoch": 0.14195570463412313, + "grad_norm": 1.8115698099136353, + "learning_rate": 4.755505124970488e-05, + "loss": 4.7342, + "step": 23869 + }, + { + "epoch": 0.14196165191740412, + "grad_norm": 1.996324896812439, + "learning_rate": 4.7554849779795284e-05, + "loss": 4.8892, + "step": 23870 + }, + { + "epoch": 0.14196759920068514, + "grad_norm": 1.7132238149642944, + "learning_rate": 4.7554648302012015e-05, + "loss": 4.7785, + "step": 23871 + }, + { + "epoch": 0.14197354648396612, + "grad_norm": 1.8130909204483032, + "learning_rate": 4.755444681635516e-05, + "loss": 4.9106, + "step": 23872 + }, + { + "epoch": 0.1419794937672471, + "grad_norm": 1.8058964014053345, + "learning_rate": 4.755424532282478e-05, + "loss": 4.7486, + "step": 23873 + }, + { + "epoch": 0.14198544105052813, + "grad_norm": 3.171724557876587, + "learning_rate": 4.755404382142094e-05, + "loss": 4.7696, + "step": 23874 + }, + { + "epoch": 0.14199138833380912, + "grad_norm": 1.99362313747406, + "learning_rate": 4.755384231214372e-05, + "loss": 4.6704, + "step": 23875 + }, + { + "epoch": 0.1419973356170901, + "grad_norm": 1.3904173374176025, + "learning_rate": 4.755364079499318e-05, + "loss": 5.6621, + "step": 23876 + }, + { + "epoch": 0.14200328290037112, + "grad_norm": 1.4735981225967407, + "learning_rate": 4.7553439269969415e-05, + "loss": 5.5464, + "step": 23877 + }, + { + "epoch": 0.1420092301836521, + "grad_norm": 1.3085891008377075, + "learning_rate": 4.755323773707246e-05, + "loss": 5.4913, + "step": 23878 + }, + { + "epoch": 0.1420151774669331, + "grad_norm": 1.627657175064087, + "learning_rate": 4.755303619630241e-05, + "loss": 5.4001, + "step": 23879 + }, + { + "epoch": 0.1420211247502141, + "grad_norm": 1.8672151565551758, + "learning_rate": 4.755283464765933e-05, + "loss": 5.5518, + "step": 23880 + }, + { + "epoch": 0.1420270720334951, + "grad_norm": 1.8344969749450684, + "learning_rate": 4.755263309114328e-05, + "loss": 5.2819, + "step": 23881 + }, + { + "epoch": 0.1420330193167761, + "grad_norm": 1.8662999868392944, + "learning_rate": 4.755243152675434e-05, + "loss": 5.3128, + "step": 23882 + }, + { + "epoch": 0.1420389666000571, + "grad_norm": 1.6729795932769775, + "learning_rate": 4.755222995449259e-05, + "loss": 5.1282, + "step": 23883 + }, + { + "epoch": 0.1420449138833381, + "grad_norm": 2.925039529800415, + "learning_rate": 4.7552028374358074e-05, + "loss": 4.9187, + "step": 23884 + }, + { + "epoch": 0.14205086116661908, + "grad_norm": 2.414885997772217, + "learning_rate": 4.755182678635089e-05, + "loss": 5.219, + "step": 23885 + }, + { + "epoch": 0.1420568084499001, + "grad_norm": 1.7273744344711304, + "learning_rate": 4.7551625190471095e-05, + "loss": 5.1296, + "step": 23886 + }, + { + "epoch": 0.14206275573318108, + "grad_norm": 1.691588044166565, + "learning_rate": 4.755142358671876e-05, + "loss": 5.3328, + "step": 23887 + }, + { + "epoch": 0.14206870301646207, + "grad_norm": 1.6644389629364014, + "learning_rate": 4.755122197509395e-05, + "loss": 6.162, + "step": 23888 + }, + { + "epoch": 0.1420746502997431, + "grad_norm": 1.7232459783554077, + "learning_rate": 4.7551020355596744e-05, + "loss": 6.1469, + "step": 23889 + }, + { + "epoch": 0.14208059758302408, + "grad_norm": 1.4883437156677246, + "learning_rate": 4.7550818728227206e-05, + "loss": 6.1803, + "step": 23890 + }, + { + "epoch": 0.14208654486630506, + "grad_norm": 1.4301148653030396, + "learning_rate": 4.7550617092985425e-05, + "loss": 6.0918, + "step": 23891 + }, + { + "epoch": 0.14209249214958608, + "grad_norm": 1.4922714233398438, + "learning_rate": 4.755041544987144e-05, + "loss": 5.8328, + "step": 23892 + }, + { + "epoch": 0.14209843943286707, + "grad_norm": 1.9683314561843872, + "learning_rate": 4.7550213798885345e-05, + "loss": 5.3362, + "step": 23893 + }, + { + "epoch": 0.14210438671614806, + "grad_norm": 1.841512680053711, + "learning_rate": 4.755001214002721e-05, + "loss": 5.1776, + "step": 23894 + }, + { + "epoch": 0.14211033399942907, + "grad_norm": 1.615190863609314, + "learning_rate": 4.7549810473297085e-05, + "loss": 5.4266, + "step": 23895 + }, + { + "epoch": 0.14211628128271006, + "grad_norm": 1.728252649307251, + "learning_rate": 4.7549608798695065e-05, + "loss": 5.5736, + "step": 23896 + }, + { + "epoch": 0.14212222856599105, + "grad_norm": 1.5590336322784424, + "learning_rate": 4.75494071162212e-05, + "loss": 5.4725, + "step": 23897 + }, + { + "epoch": 0.14212817584927206, + "grad_norm": 1.5246217250823975, + "learning_rate": 4.7549205425875585e-05, + "loss": 5.3707, + "step": 23898 + }, + { + "epoch": 0.14213412313255305, + "grad_norm": 1.4803682565689087, + "learning_rate": 4.754900372765826e-05, + "loss": 5.5735, + "step": 23899 + }, + { + "epoch": 0.14214007041583404, + "grad_norm": 1.633510947227478, + "learning_rate": 4.7548802021569315e-05, + "loss": 5.3334, + "step": 23900 + }, + { + "epoch": 0.14214601769911506, + "grad_norm": 1.9321861267089844, + "learning_rate": 4.754860030760882e-05, + "loss": 5.3384, + "step": 23901 + }, + { + "epoch": 0.14215196498239605, + "grad_norm": 1.858965516090393, + "learning_rate": 4.7548398585776844e-05, + "loss": 5.4072, + "step": 23902 + }, + { + "epoch": 0.14215791226567703, + "grad_norm": 1.7266136407852173, + "learning_rate": 4.754819685607345e-05, + "loss": 5.3865, + "step": 23903 + }, + { + "epoch": 0.14216385954895805, + "grad_norm": 1.579783320426941, + "learning_rate": 4.754799511849871e-05, + "loss": 5.3524, + "step": 23904 + }, + { + "epoch": 0.14216980683223904, + "grad_norm": 1.5112273693084717, + "learning_rate": 4.7547793373052704e-05, + "loss": 5.3411, + "step": 23905 + }, + { + "epoch": 0.14217575411552003, + "grad_norm": 1.5031278133392334, + "learning_rate": 4.754759161973549e-05, + "loss": 5.3782, + "step": 23906 + }, + { + "epoch": 0.14218170139880104, + "grad_norm": 1.581784963607788, + "learning_rate": 4.7547389858547155e-05, + "loss": 5.2722, + "step": 23907 + }, + { + "epoch": 0.14218764868208203, + "grad_norm": 1.350386619567871, + "learning_rate": 4.754718808948775e-05, + "loss": 5.5733, + "step": 23908 + }, + { + "epoch": 0.14219359596536302, + "grad_norm": 1.5469433069229126, + "learning_rate": 4.754698631255736e-05, + "loss": 5.7556, + "step": 23909 + }, + { + "epoch": 0.142199543248644, + "grad_norm": 1.5234500169754028, + "learning_rate": 4.754678452775604e-05, + "loss": 5.9086, + "step": 23910 + }, + { + "epoch": 0.14220549053192502, + "grad_norm": 1.4361084699630737, + "learning_rate": 4.754658273508388e-05, + "loss": 5.7659, + "step": 23911 + }, + { + "epoch": 0.142211437815206, + "grad_norm": 1.5128140449523926, + "learning_rate": 4.754638093454094e-05, + "loss": 5.7307, + "step": 23912 + }, + { + "epoch": 0.142217385098487, + "grad_norm": 1.4324685335159302, + "learning_rate": 4.754617912612729e-05, + "loss": 5.4717, + "step": 23913 + }, + { + "epoch": 0.14222333238176801, + "grad_norm": 1.8225339651107788, + "learning_rate": 4.7545977309843004e-05, + "loss": 5.3876, + "step": 23914 + }, + { + "epoch": 0.142229279665049, + "grad_norm": 1.6822171211242676, + "learning_rate": 4.754577548568815e-05, + "loss": 5.5243, + "step": 23915 + }, + { + "epoch": 0.14223522694833, + "grad_norm": 1.7231889963150024, + "learning_rate": 4.754557365366279e-05, + "loss": 5.9398, + "step": 23916 + }, + { + "epoch": 0.142241174231611, + "grad_norm": 1.6815425157546997, + "learning_rate": 4.754537181376702e-05, + "loss": 6.0264, + "step": 23917 + }, + { + "epoch": 0.142247121514892, + "grad_norm": 1.599161148071289, + "learning_rate": 4.754516996600088e-05, + "loss": 6.0783, + "step": 23918 + }, + { + "epoch": 0.14225306879817298, + "grad_norm": 1.565960168838501, + "learning_rate": 4.7544968110364455e-05, + "loss": 6.2248, + "step": 23919 + }, + { + "epoch": 0.142259016081454, + "grad_norm": 1.5778778791427612, + "learning_rate": 4.754476624685782e-05, + "loss": 6.1216, + "step": 23920 + }, + { + "epoch": 0.142264963364735, + "grad_norm": 1.6303963661193848, + "learning_rate": 4.754456437548104e-05, + "loss": 5.9956, + "step": 23921 + }, + { + "epoch": 0.14227091064801597, + "grad_norm": 1.6119714975357056, + "learning_rate": 4.754436249623418e-05, + "loss": 5.4221, + "step": 23922 + }, + { + "epoch": 0.142276857931297, + "grad_norm": 1.9543877840042114, + "learning_rate": 4.754416060911732e-05, + "loss": 5.3631, + "step": 23923 + }, + { + "epoch": 0.14228280521457798, + "grad_norm": 1.90111243724823, + "learning_rate": 4.754395871413052e-05, + "loss": 5.3828, + "step": 23924 + }, + { + "epoch": 0.14228875249785897, + "grad_norm": 1.6575809717178345, + "learning_rate": 4.754375681127386e-05, + "loss": 5.1258, + "step": 23925 + }, + { + "epoch": 0.14229469978113998, + "grad_norm": 1.5518983602523804, + "learning_rate": 4.7543554900547416e-05, + "loss": 5.2144, + "step": 23926 + }, + { + "epoch": 0.14230064706442097, + "grad_norm": 1.604325532913208, + "learning_rate": 4.754335298195124e-05, + "loss": 5.1447, + "step": 23927 + }, + { + "epoch": 0.14230659434770196, + "grad_norm": 1.6287504434585571, + "learning_rate": 4.754315105548542e-05, + "loss": 5.1267, + "step": 23928 + }, + { + "epoch": 0.14231254163098297, + "grad_norm": 1.5111888647079468, + "learning_rate": 4.7542949121150014e-05, + "loss": 5.1122, + "step": 23929 + }, + { + "epoch": 0.14231848891426396, + "grad_norm": 1.4685728549957275, + "learning_rate": 4.75427471789451e-05, + "loss": 5.5366, + "step": 23930 + }, + { + "epoch": 0.14232443619754495, + "grad_norm": 2.1167118549346924, + "learning_rate": 4.754254522887074e-05, + "loss": 5.0426, + "step": 23931 + }, + { + "epoch": 0.14233038348082597, + "grad_norm": 1.7412205934524536, + "learning_rate": 4.754234327092702e-05, + "loss": 5.1454, + "step": 23932 + }, + { + "epoch": 0.14233633076410696, + "grad_norm": 2.290722608566284, + "learning_rate": 4.754214130511399e-05, + "loss": 4.7253, + "step": 23933 + }, + { + "epoch": 0.14234227804738794, + "grad_norm": 2.460817813873291, + "learning_rate": 4.754193933143174e-05, + "loss": 4.762, + "step": 23934 + }, + { + "epoch": 0.14234822533066896, + "grad_norm": 2.2080838680267334, + "learning_rate": 4.754173734988032e-05, + "loss": 4.6405, + "step": 23935 + }, + { + "epoch": 0.14235417261394995, + "grad_norm": 2.475855588912964, + "learning_rate": 4.7541535360459825e-05, + "loss": 4.6213, + "step": 23936 + }, + { + "epoch": 0.14236011989723094, + "grad_norm": 2.1748647689819336, + "learning_rate": 4.754133336317031e-05, + "loss": 4.5461, + "step": 23937 + }, + { + "epoch": 0.14236606718051195, + "grad_norm": 2.1339731216430664, + "learning_rate": 4.754113135801185e-05, + "loss": 4.6366, + "step": 23938 + }, + { + "epoch": 0.14237201446379294, + "grad_norm": 2.142465353012085, + "learning_rate": 4.754092934498451e-05, + "loss": 4.6129, + "step": 23939 + }, + { + "epoch": 0.14237796174707393, + "grad_norm": 2.1925458908081055, + "learning_rate": 4.754072732408836e-05, + "loss": 4.6171, + "step": 23940 + }, + { + "epoch": 0.14238390903035494, + "grad_norm": 2.1470870971679688, + "learning_rate": 4.7540525295323483e-05, + "loss": 4.4577, + "step": 23941 + }, + { + "epoch": 0.14238985631363593, + "grad_norm": 1.7223306894302368, + "learning_rate": 4.754032325868994e-05, + "loss": 5.7355, + "step": 23942 + }, + { + "epoch": 0.14239580359691692, + "grad_norm": 1.8489956855773926, + "learning_rate": 4.7540121214187805e-05, + "loss": 5.9877, + "step": 23943 + }, + { + "epoch": 0.14240175088019794, + "grad_norm": 1.8920329809188843, + "learning_rate": 4.7539919161817134e-05, + "loss": 5.6751, + "step": 23944 + }, + { + "epoch": 0.14240769816347892, + "grad_norm": 1.642392635345459, + "learning_rate": 4.753971710157802e-05, + "loss": 5.3404, + "step": 23945 + }, + { + "epoch": 0.1424136454467599, + "grad_norm": 1.681997537612915, + "learning_rate": 4.753951503347053e-05, + "loss": 5.2964, + "step": 23946 + }, + { + "epoch": 0.14241959273004093, + "grad_norm": 1.767589807510376, + "learning_rate": 4.753931295749472e-05, + "loss": 5.2843, + "step": 23947 + }, + { + "epoch": 0.14242554001332192, + "grad_norm": 1.7100127935409546, + "learning_rate": 4.7539110873650674e-05, + "loss": 5.3869, + "step": 23948 + }, + { + "epoch": 0.1424314872966029, + "grad_norm": 1.5660570859909058, + "learning_rate": 4.7538908781938453e-05, + "loss": 5.3994, + "step": 23949 + }, + { + "epoch": 0.14243743457988392, + "grad_norm": 1.8509501218795776, + "learning_rate": 4.7538706682358124e-05, + "loss": 5.8575, + "step": 23950 + }, + { + "epoch": 0.1424433818631649, + "grad_norm": 1.5773848295211792, + "learning_rate": 4.753850457490978e-05, + "loss": 5.8548, + "step": 23951 + }, + { + "epoch": 0.1424493291464459, + "grad_norm": 1.4020990133285522, + "learning_rate": 4.753830245959347e-05, + "loss": 5.6696, + "step": 23952 + }, + { + "epoch": 0.1424552764297269, + "grad_norm": 1.7756813764572144, + "learning_rate": 4.753810033640928e-05, + "loss": 5.3623, + "step": 23953 + }, + { + "epoch": 0.1424612237130079, + "grad_norm": 1.9046579599380493, + "learning_rate": 4.7537898205357255e-05, + "loss": 5.4078, + "step": 23954 + }, + { + "epoch": 0.1424671709962889, + "grad_norm": 1.6977450847625732, + "learning_rate": 4.753769606643749e-05, + "loss": 5.4418, + "step": 23955 + }, + { + "epoch": 0.1424731182795699, + "grad_norm": 1.6306700706481934, + "learning_rate": 4.753749391965005e-05, + "loss": 5.6299, + "step": 23956 + }, + { + "epoch": 0.1424790655628509, + "grad_norm": 1.8286629915237427, + "learning_rate": 4.7537291764995006e-05, + "loss": 5.7271, + "step": 23957 + }, + { + "epoch": 0.14248501284613188, + "grad_norm": 1.5603896379470825, + "learning_rate": 4.753708960247242e-05, + "loss": 5.645, + "step": 23958 + }, + { + "epoch": 0.1424909601294129, + "grad_norm": 1.6031434535980225, + "learning_rate": 4.7536887432082375e-05, + "loss": 5.6604, + "step": 23959 + }, + { + "epoch": 0.14249690741269389, + "grad_norm": 1.6950321197509766, + "learning_rate": 4.753668525382493e-05, + "loss": 5.7467, + "step": 23960 + }, + { + "epoch": 0.14250285469597487, + "grad_norm": 1.367156744003296, + "learning_rate": 4.753648306770017e-05, + "loss": 5.8554, + "step": 23961 + }, + { + "epoch": 0.1425088019792559, + "grad_norm": 1.6769720315933228, + "learning_rate": 4.753628087370815e-05, + "loss": 5.7408, + "step": 23962 + }, + { + "epoch": 0.14251474926253688, + "grad_norm": 2.3092730045318604, + "learning_rate": 4.753607867184894e-05, + "loss": 4.3284, + "step": 23963 + }, + { + "epoch": 0.14252069654581787, + "grad_norm": 1.8199213743209839, + "learning_rate": 4.753587646212263e-05, + "loss": 4.9928, + "step": 23964 + }, + { + "epoch": 0.14252664382909888, + "grad_norm": 1.5818908214569092, + "learning_rate": 4.753567424452927e-05, + "loss": 5.4382, + "step": 23965 + }, + { + "epoch": 0.14253259111237987, + "grad_norm": 1.6112592220306396, + "learning_rate": 4.753547201906895e-05, + "loss": 5.6344, + "step": 23966 + }, + { + "epoch": 0.14253853839566086, + "grad_norm": 1.530733585357666, + "learning_rate": 4.753526978574172e-05, + "loss": 5.6788, + "step": 23967 + }, + { + "epoch": 0.14254448567894185, + "grad_norm": 1.4186383485794067, + "learning_rate": 4.7535067544547664e-05, + "loss": 5.5129, + "step": 23968 + }, + { + "epoch": 0.14255043296222286, + "grad_norm": 1.3288373947143555, + "learning_rate": 4.753486529548684e-05, + "loss": 5.4413, + "step": 23969 + }, + { + "epoch": 0.14255638024550385, + "grad_norm": 1.3416498899459839, + "learning_rate": 4.7534663038559335e-05, + "loss": 5.6757, + "step": 23970 + }, + { + "epoch": 0.14256232752878484, + "grad_norm": 1.2552043199539185, + "learning_rate": 4.7534460773765215e-05, + "loss": 5.4015, + "step": 23971 + }, + { + "epoch": 0.14256827481206585, + "grad_norm": 1.7393593788146973, + "learning_rate": 4.7534258501104544e-05, + "loss": 5.8824, + "step": 23972 + }, + { + "epoch": 0.14257422209534684, + "grad_norm": 1.5608609914779663, + "learning_rate": 4.75340562205774e-05, + "loss": 5.7623, + "step": 23973 + }, + { + "epoch": 0.14258016937862783, + "grad_norm": 1.484365463256836, + "learning_rate": 4.753385393218384e-05, + "loss": 5.6563, + "step": 23974 + }, + { + "epoch": 0.14258611666190885, + "grad_norm": 1.5432020425796509, + "learning_rate": 4.753365163592395e-05, + "loss": 5.6214, + "step": 23975 + }, + { + "epoch": 0.14259206394518983, + "grad_norm": 1.3963783979415894, + "learning_rate": 4.7533449331797797e-05, + "loss": 5.5315, + "step": 23976 + }, + { + "epoch": 0.14259801122847082, + "grad_norm": 1.778178095817566, + "learning_rate": 4.753324701980545e-05, + "loss": 5.8467, + "step": 23977 + }, + { + "epoch": 0.14260395851175184, + "grad_norm": 1.717940330505371, + "learning_rate": 4.753304469994698e-05, + "loss": 5.6369, + "step": 23978 + }, + { + "epoch": 0.14260990579503283, + "grad_norm": 1.7598493099212646, + "learning_rate": 4.753284237222245e-05, + "loss": 5.2906, + "step": 23979 + }, + { + "epoch": 0.14261585307831381, + "grad_norm": 2.1206471920013428, + "learning_rate": 4.753264003663194e-05, + "loss": 4.5855, + "step": 23980 + }, + { + "epoch": 0.14262180036159483, + "grad_norm": 2.1312971115112305, + "learning_rate": 4.7532437693175525e-05, + "loss": 4.6795, + "step": 23981 + }, + { + "epoch": 0.14262774764487582, + "grad_norm": 2.6566877365112305, + "learning_rate": 4.753223534185326e-05, + "loss": 4.6831, + "step": 23982 + }, + { + "epoch": 0.1426336949281568, + "grad_norm": 2.5692079067230225, + "learning_rate": 4.753203298266523e-05, + "loss": 4.3662, + "step": 23983 + }, + { + "epoch": 0.14263964221143782, + "grad_norm": 2.2617204189300537, + "learning_rate": 4.75318306156115e-05, + "loss": 4.5077, + "step": 23984 + }, + { + "epoch": 0.1426455894947188, + "grad_norm": 2.3445560932159424, + "learning_rate": 4.753162824069214e-05, + "loss": 4.3449, + "step": 23985 + }, + { + "epoch": 0.1426515367779998, + "grad_norm": 2.193120002746582, + "learning_rate": 4.7531425857907216e-05, + "loss": 4.3601, + "step": 23986 + }, + { + "epoch": 0.14265748406128081, + "grad_norm": 2.3515334129333496, + "learning_rate": 4.753122346725681e-05, + "loss": 4.411, + "step": 23987 + }, + { + "epoch": 0.1426634313445618, + "grad_norm": 2.286971092224121, + "learning_rate": 4.7531021068740986e-05, + "loss": 4.4801, + "step": 23988 + }, + { + "epoch": 0.1426693786278428, + "grad_norm": 2.30155873298645, + "learning_rate": 4.7530818662359814e-05, + "loss": 4.4121, + "step": 23989 + }, + { + "epoch": 0.1426753259111238, + "grad_norm": 2.151796340942383, + "learning_rate": 4.7530616248113364e-05, + "loss": 4.4185, + "step": 23990 + }, + { + "epoch": 0.1426812731944048, + "grad_norm": 2.6092782020568848, + "learning_rate": 4.7530413826001706e-05, + "loss": 4.5183, + "step": 23991 + }, + { + "epoch": 0.14268722047768578, + "grad_norm": 2.3881771564483643, + "learning_rate": 4.7530211396024926e-05, + "loss": 4.5246, + "step": 23992 + }, + { + "epoch": 0.1426931677609668, + "grad_norm": 2.921297550201416, + "learning_rate": 4.753000895818307e-05, + "loss": 4.5855, + "step": 23993 + }, + { + "epoch": 0.1426991150442478, + "grad_norm": 2.039461135864258, + "learning_rate": 4.752980651247623e-05, + "loss": 5.3866, + "step": 23994 + }, + { + "epoch": 0.14270506232752878, + "grad_norm": 2.6810874938964844, + "learning_rate": 4.752960405890446e-05, + "loss": 4.3992, + "step": 23995 + }, + { + "epoch": 0.1427110096108098, + "grad_norm": 2.366675615310669, + "learning_rate": 4.752940159746784e-05, + "loss": 4.3981, + "step": 23996 + }, + { + "epoch": 0.14271695689409078, + "grad_norm": 2.446672201156616, + "learning_rate": 4.7529199128166435e-05, + "loss": 4.3428, + "step": 23997 + }, + { + "epoch": 0.14272290417737177, + "grad_norm": 2.686692476272583, + "learning_rate": 4.7528996651000325e-05, + "loss": 4.4006, + "step": 23998 + }, + { + "epoch": 0.14272885146065278, + "grad_norm": 2.577341318130493, + "learning_rate": 4.752879416596957e-05, + "loss": 4.3635, + "step": 23999 + }, + { + "epoch": 0.14273479874393377, + "grad_norm": 2.0183050632476807, + "learning_rate": 4.752859167307425e-05, + "loss": 4.402, + "step": 24000 + }, + { + "epoch": 0.14274074602721476, + "grad_norm": 2.062704563140869, + "learning_rate": 4.7528389172314434e-05, + "loss": 4.3103, + "step": 24001 + }, + { + "epoch": 0.14274669331049578, + "grad_norm": 2.3112356662750244, + "learning_rate": 4.752818666369019e-05, + "loss": 4.5129, + "step": 24002 + }, + { + "epoch": 0.14275264059377676, + "grad_norm": 2.3484156131744385, + "learning_rate": 4.752798414720158e-05, + "loss": 4.2367, + "step": 24003 + }, + { + "epoch": 0.14275858787705775, + "grad_norm": 2.142179250717163, + "learning_rate": 4.752778162284869e-05, + "loss": 4.8016, + "step": 24004 + }, + { + "epoch": 0.14276453516033877, + "grad_norm": 2.076201915740967, + "learning_rate": 4.752757909063158e-05, + "loss": 5.2754, + "step": 24005 + }, + { + "epoch": 0.14277048244361976, + "grad_norm": 1.7873663902282715, + "learning_rate": 4.752737655055033e-05, + "loss": 5.3064, + "step": 24006 + }, + { + "epoch": 0.14277642972690074, + "grad_norm": 1.863776445388794, + "learning_rate": 4.7527174002605e-05, + "loss": 5.045, + "step": 24007 + }, + { + "epoch": 0.14278237701018176, + "grad_norm": 1.9370598793029785, + "learning_rate": 4.752697144679567e-05, + "loss": 5.037, + "step": 24008 + }, + { + "epoch": 0.14278832429346275, + "grad_norm": 1.967492938041687, + "learning_rate": 4.7526768883122405e-05, + "loss": 4.9898, + "step": 24009 + }, + { + "epoch": 0.14279427157674374, + "grad_norm": 1.6309136152267456, + "learning_rate": 4.7526566311585285e-05, + "loss": 5.0752, + "step": 24010 + }, + { + "epoch": 0.14280021886002475, + "grad_norm": 1.6783781051635742, + "learning_rate": 4.7526363732184365e-05, + "loss": 4.7746, + "step": 24011 + }, + { + "epoch": 0.14280616614330574, + "grad_norm": 1.4897167682647705, + "learning_rate": 4.752616114491972e-05, + "loss": 5.1681, + "step": 24012 + }, + { + "epoch": 0.14281211342658673, + "grad_norm": 1.4138036966323853, + "learning_rate": 4.752595854979144e-05, + "loss": 5.351, + "step": 24013 + }, + { + "epoch": 0.14281806070986774, + "grad_norm": 1.4653584957122803, + "learning_rate": 4.7525755946799566e-05, + "loss": 5.1754, + "step": 24014 + }, + { + "epoch": 0.14282400799314873, + "grad_norm": 1.7669284343719482, + "learning_rate": 4.752555333594419e-05, + "loss": 5.2409, + "step": 24015 + }, + { + "epoch": 0.14282995527642972, + "grad_norm": 2.478325366973877, + "learning_rate": 4.752535071722538e-05, + "loss": 5.7027, + "step": 24016 + }, + { + "epoch": 0.14283590255971074, + "grad_norm": 1.3903100490570068, + "learning_rate": 4.75251480906432e-05, + "loss": 5.371, + "step": 24017 + }, + { + "epoch": 0.14284184984299172, + "grad_norm": 1.5938868522644043, + "learning_rate": 4.752494545619772e-05, + "loss": 5.0741, + "step": 24018 + }, + { + "epoch": 0.1428477971262727, + "grad_norm": 1.4633463621139526, + "learning_rate": 4.752474281388901e-05, + "loss": 5.2562, + "step": 24019 + }, + { + "epoch": 0.14285374440955373, + "grad_norm": 1.5575978755950928, + "learning_rate": 4.7524540163717155e-05, + "loss": 5.7142, + "step": 24020 + }, + { + "epoch": 0.14285969169283472, + "grad_norm": 1.857527732849121, + "learning_rate": 4.7524337505682216e-05, + "loss": 5.6595, + "step": 24021 + }, + { + "epoch": 0.1428656389761157, + "grad_norm": 1.6097089052200317, + "learning_rate": 4.752413483978426e-05, + "loss": 5.2562, + "step": 24022 + }, + { + "epoch": 0.14287158625939672, + "grad_norm": 1.8765082359313965, + "learning_rate": 4.752393216602335e-05, + "loss": 4.511, + "step": 24023 + }, + { + "epoch": 0.1428775335426777, + "grad_norm": 1.5626455545425415, + "learning_rate": 4.752372948439959e-05, + "loss": 4.8816, + "step": 24024 + }, + { + "epoch": 0.1428834808259587, + "grad_norm": 1.4234426021575928, + "learning_rate": 4.7523526794913015e-05, + "loss": 5.1271, + "step": 24025 + }, + { + "epoch": 0.14288942810923969, + "grad_norm": 1.4709553718566895, + "learning_rate": 4.7523324097563706e-05, + "loss": 5.2034, + "step": 24026 + }, + { + "epoch": 0.1428953753925207, + "grad_norm": 1.7568445205688477, + "learning_rate": 4.752312139235175e-05, + "loss": 4.7914, + "step": 24027 + }, + { + "epoch": 0.1429013226758017, + "grad_norm": 1.711824893951416, + "learning_rate": 4.752291867927719e-05, + "loss": 4.6601, + "step": 24028 + }, + { + "epoch": 0.14290726995908268, + "grad_norm": 1.6301651000976562, + "learning_rate": 4.752271595834012e-05, + "loss": 4.9326, + "step": 24029 + }, + { + "epoch": 0.1429132172423637, + "grad_norm": 1.5549229383468628, + "learning_rate": 4.752251322954061e-05, + "loss": 5.1706, + "step": 24030 + }, + { + "epoch": 0.14291916452564468, + "grad_norm": 1.5638782978057861, + "learning_rate": 4.752231049287871e-05, + "loss": 4.9079, + "step": 24031 + }, + { + "epoch": 0.14292511180892567, + "grad_norm": 1.6099932193756104, + "learning_rate": 4.752210774835451e-05, + "loss": 4.7565, + "step": 24032 + }, + { + "epoch": 0.14293105909220669, + "grad_norm": 1.5388545989990234, + "learning_rate": 4.752190499596808e-05, + "loss": 4.792, + "step": 24033 + }, + { + "epoch": 0.14293700637548767, + "grad_norm": 1.4083584547042847, + "learning_rate": 4.752170223571948e-05, + "loss": 4.8608, + "step": 24034 + }, + { + "epoch": 0.14294295365876866, + "grad_norm": 1.5718214511871338, + "learning_rate": 4.752149946760879e-05, + "loss": 4.7874, + "step": 24035 + }, + { + "epoch": 0.14294890094204968, + "grad_norm": 1.5951184034347534, + "learning_rate": 4.752129669163607e-05, + "loss": 4.7581, + "step": 24036 + }, + { + "epoch": 0.14295484822533067, + "grad_norm": 1.5525321960449219, + "learning_rate": 4.7521093907801404e-05, + "loss": 4.5684, + "step": 24037 + }, + { + "epoch": 0.14296079550861165, + "grad_norm": 1.6149049997329712, + "learning_rate": 4.7520891116104856e-05, + "loss": 4.4343, + "step": 24038 + }, + { + "epoch": 0.14296674279189267, + "grad_norm": 1.624150037765503, + "learning_rate": 4.752068831654649e-05, + "loss": 4.4697, + "step": 24039 + }, + { + "epoch": 0.14297269007517366, + "grad_norm": 1.3906975984573364, + "learning_rate": 4.75204855091264e-05, + "loss": 4.4062, + "step": 24040 + }, + { + "epoch": 0.14297863735845465, + "grad_norm": 1.6626862287521362, + "learning_rate": 4.7520282693844623e-05, + "loss": 4.9593, + "step": 24041 + }, + { + "epoch": 0.14298458464173566, + "grad_norm": 1.8431484699249268, + "learning_rate": 4.752007987070126e-05, + "loss": 5.3581, + "step": 24042 + }, + { + "epoch": 0.14299053192501665, + "grad_norm": 1.7550246715545654, + "learning_rate": 4.751987703969637e-05, + "loss": 5.3909, + "step": 24043 + }, + { + "epoch": 0.14299647920829764, + "grad_norm": 1.6016278266906738, + "learning_rate": 4.7519674200830015e-05, + "loss": 5.1732, + "step": 24044 + }, + { + "epoch": 0.14300242649157865, + "grad_norm": 1.4594265222549438, + "learning_rate": 4.7519471354102285e-05, + "loss": 5.0859, + "step": 24045 + }, + { + "epoch": 0.14300837377485964, + "grad_norm": 1.7040293216705322, + "learning_rate": 4.751926849951323e-05, + "loss": 5.1476, + "step": 24046 + }, + { + "epoch": 0.14301432105814063, + "grad_norm": 1.4739158153533936, + "learning_rate": 4.7519065637062934e-05, + "loss": 5.3691, + "step": 24047 + }, + { + "epoch": 0.14302026834142165, + "grad_norm": 1.5245054960250854, + "learning_rate": 4.751886276675147e-05, + "loss": 5.4395, + "step": 24048 + }, + { + "epoch": 0.14302621562470264, + "grad_norm": 1.678786039352417, + "learning_rate": 4.75186598885789e-05, + "loss": 4.826, + "step": 24049 + }, + { + "epoch": 0.14303216290798362, + "grad_norm": 1.9114538431167603, + "learning_rate": 4.7518457002545305e-05, + "loss": 5.1483, + "step": 24050 + }, + { + "epoch": 0.14303811019126464, + "grad_norm": 1.5139118432998657, + "learning_rate": 4.751825410865074e-05, + "loss": 5.1349, + "step": 24051 + }, + { + "epoch": 0.14304405747454563, + "grad_norm": 1.4199074506759644, + "learning_rate": 4.7518051206895286e-05, + "loss": 5.0579, + "step": 24052 + }, + { + "epoch": 0.14305000475782662, + "grad_norm": 1.570027470588684, + "learning_rate": 4.751784829727902e-05, + "loss": 4.9915, + "step": 24053 + }, + { + "epoch": 0.14305595204110763, + "grad_norm": 1.476340651512146, + "learning_rate": 4.7517645379802e-05, + "loss": 5.4808, + "step": 24054 + }, + { + "epoch": 0.14306189932438862, + "grad_norm": 1.7526558637619019, + "learning_rate": 4.75174424544643e-05, + "loss": 5.3816, + "step": 24055 + }, + { + "epoch": 0.1430678466076696, + "grad_norm": 1.846692681312561, + "learning_rate": 4.7517239521266e-05, + "loss": 5.6713, + "step": 24056 + }, + { + "epoch": 0.14307379389095062, + "grad_norm": 1.5340349674224854, + "learning_rate": 4.751703658020716e-05, + "loss": 5.6456, + "step": 24057 + }, + { + "epoch": 0.1430797411742316, + "grad_norm": 1.6693123579025269, + "learning_rate": 4.751683363128786e-05, + "loss": 5.5229, + "step": 24058 + }, + { + "epoch": 0.1430856884575126, + "grad_norm": 1.7673590183258057, + "learning_rate": 4.751663067450816e-05, + "loss": 4.9188, + "step": 24059 + }, + { + "epoch": 0.14309163574079362, + "grad_norm": 1.8243883848190308, + "learning_rate": 4.751642770986814e-05, + "loss": 4.5658, + "step": 24060 + }, + { + "epoch": 0.1430975830240746, + "grad_norm": 2.394139051437378, + "learning_rate": 4.7516224737367866e-05, + "loss": 4.101, + "step": 24061 + }, + { + "epoch": 0.1431035303073556, + "grad_norm": 2.0918843746185303, + "learning_rate": 4.7516021757007414e-05, + "loss": 4.03, + "step": 24062 + }, + { + "epoch": 0.1431094775906366, + "grad_norm": 2.129743814468384, + "learning_rate": 4.751581876878685e-05, + "loss": 4.1339, + "step": 24063 + }, + { + "epoch": 0.1431154248739176, + "grad_norm": 2.1546170711517334, + "learning_rate": 4.751561577270624e-05, + "loss": 4.4471, + "step": 24064 + }, + { + "epoch": 0.14312137215719858, + "grad_norm": 1.9738941192626953, + "learning_rate": 4.751541276876567e-05, + "loss": 5.8276, + "step": 24065 + }, + { + "epoch": 0.1431273194404796, + "grad_norm": 1.9925949573516846, + "learning_rate": 4.7515209756965196e-05, + "loss": 5.2116, + "step": 24066 + }, + { + "epoch": 0.1431332667237606, + "grad_norm": 1.761315941810608, + "learning_rate": 4.75150067373049e-05, + "loss": 5.0048, + "step": 24067 + }, + { + "epoch": 0.14313921400704158, + "grad_norm": 1.7744289636611938, + "learning_rate": 4.751480370978485e-05, + "loss": 5.2451, + "step": 24068 + }, + { + "epoch": 0.1431451612903226, + "grad_norm": 1.4490324258804321, + "learning_rate": 4.7514600674405106e-05, + "loss": 5.704, + "step": 24069 + }, + { + "epoch": 0.14315110857360358, + "grad_norm": 1.4389432668685913, + "learning_rate": 4.751439763116575e-05, + "loss": 5.6274, + "step": 24070 + }, + { + "epoch": 0.14315705585688457, + "grad_norm": 2.0219969749450684, + "learning_rate": 4.751419458006685e-05, + "loss": 4.2387, + "step": 24071 + }, + { + "epoch": 0.14316300314016558, + "grad_norm": 1.6722300052642822, + "learning_rate": 4.751399152110848e-05, + "loss": 4.7426, + "step": 24072 + }, + { + "epoch": 0.14316895042344657, + "grad_norm": 1.461065411567688, + "learning_rate": 4.751378845429071e-05, + "loss": 5.4895, + "step": 24073 + }, + { + "epoch": 0.14317489770672756, + "grad_norm": 1.3877815008163452, + "learning_rate": 4.75135853796136e-05, + "loss": 5.6264, + "step": 24074 + }, + { + "epoch": 0.14318084499000858, + "grad_norm": 1.3981953859329224, + "learning_rate": 4.751338229707724e-05, + "loss": 5.4467, + "step": 24075 + }, + { + "epoch": 0.14318679227328956, + "grad_norm": 1.3032608032226562, + "learning_rate": 4.751317920668169e-05, + "loss": 5.5902, + "step": 24076 + }, + { + "epoch": 0.14319273955657055, + "grad_norm": 1.477534532546997, + "learning_rate": 4.751297610842701e-05, + "loss": 5.6286, + "step": 24077 + }, + { + "epoch": 0.14319868683985157, + "grad_norm": 1.5056313276290894, + "learning_rate": 4.75127730023133e-05, + "loss": 5.5233, + "step": 24078 + }, + { + "epoch": 0.14320463412313256, + "grad_norm": 1.6936917304992676, + "learning_rate": 4.75125698883406e-05, + "loss": 4.9877, + "step": 24079 + }, + { + "epoch": 0.14321058140641355, + "grad_norm": 1.5967860221862793, + "learning_rate": 4.7512366766509004e-05, + "loss": 5.1782, + "step": 24080 + }, + { + "epoch": 0.14321652868969456, + "grad_norm": 1.4995664358139038, + "learning_rate": 4.751216363681857e-05, + "loss": 5.3016, + "step": 24081 + }, + { + "epoch": 0.14322247597297555, + "grad_norm": 1.6829060316085815, + "learning_rate": 4.751196049926937e-05, + "loss": 5.228, + "step": 24082 + }, + { + "epoch": 0.14322842325625654, + "grad_norm": 2.151371955871582, + "learning_rate": 4.7511757353861475e-05, + "loss": 5.1807, + "step": 24083 + }, + { + "epoch": 0.14323437053953753, + "grad_norm": 2.1892330646514893, + "learning_rate": 4.751155420059497e-05, + "loss": 5.3542, + "step": 24084 + }, + { + "epoch": 0.14324031782281854, + "grad_norm": 2.0016772747039795, + "learning_rate": 4.75113510394699e-05, + "loss": 4.9516, + "step": 24085 + }, + { + "epoch": 0.14324626510609953, + "grad_norm": 1.8935182094573975, + "learning_rate": 4.751114787048635e-05, + "loss": 5.0342, + "step": 24086 + }, + { + "epoch": 0.14325221238938052, + "grad_norm": 2.004809617996216, + "learning_rate": 4.75109446936444e-05, + "loss": 4.2826, + "step": 24087 + }, + { + "epoch": 0.14325815967266153, + "grad_norm": 1.8340208530426025, + "learning_rate": 4.7510741508944115e-05, + "loss": 4.9323, + "step": 24088 + }, + { + "epoch": 0.14326410695594252, + "grad_norm": 1.769805908203125, + "learning_rate": 4.7510538316385545e-05, + "loss": 5.3595, + "step": 24089 + }, + { + "epoch": 0.1432700542392235, + "grad_norm": 1.5973625183105469, + "learning_rate": 4.75103351159688e-05, + "loss": 5.6195, + "step": 24090 + }, + { + "epoch": 0.14327600152250453, + "grad_norm": 1.5248761177062988, + "learning_rate": 4.751013190769391e-05, + "loss": 5.3578, + "step": 24091 + }, + { + "epoch": 0.14328194880578551, + "grad_norm": 1.5317707061767578, + "learning_rate": 4.750992869156098e-05, + "loss": 5.2791, + "step": 24092 + }, + { + "epoch": 0.1432878960890665, + "grad_norm": 1.9778176546096802, + "learning_rate": 4.750972546757005e-05, + "loss": 5.1077, + "step": 24093 + }, + { + "epoch": 0.14329384337234752, + "grad_norm": 1.7787549495697021, + "learning_rate": 4.750952223572123e-05, + "loss": 5.1073, + "step": 24094 + }, + { + "epoch": 0.1432997906556285, + "grad_norm": 1.6317193508148193, + "learning_rate": 4.750931899601455e-05, + "loss": 5.3686, + "step": 24095 + }, + { + "epoch": 0.1433057379389095, + "grad_norm": 1.7646535634994507, + "learning_rate": 4.7509115748450106e-05, + "loss": 5.4542, + "step": 24096 + }, + { + "epoch": 0.1433116852221905, + "grad_norm": 1.679877519607544, + "learning_rate": 4.750891249302796e-05, + "loss": 5.7126, + "step": 24097 + }, + { + "epoch": 0.1433176325054715, + "grad_norm": 1.3325512409210205, + "learning_rate": 4.750870922974819e-05, + "loss": 5.512, + "step": 24098 + }, + { + "epoch": 0.1433235797887525, + "grad_norm": 1.443447470664978, + "learning_rate": 4.750850595861086e-05, + "loss": 5.4712, + "step": 24099 + }, + { + "epoch": 0.1433295270720335, + "grad_norm": 1.5300956964492798, + "learning_rate": 4.7508302679616044e-05, + "loss": 5.2247, + "step": 24100 + }, + { + "epoch": 0.1433354743553145, + "grad_norm": 1.4438292980194092, + "learning_rate": 4.750809939276381e-05, + "loss": 5.3292, + "step": 24101 + }, + { + "epoch": 0.14334142163859548, + "grad_norm": 1.5861626863479614, + "learning_rate": 4.750789609805423e-05, + "loss": 5.1881, + "step": 24102 + }, + { + "epoch": 0.1433473689218765, + "grad_norm": 1.4352222681045532, + "learning_rate": 4.750769279548738e-05, + "loss": 5.3461, + "step": 24103 + }, + { + "epoch": 0.14335331620515748, + "grad_norm": 1.4064099788665771, + "learning_rate": 4.750748948506332e-05, + "loss": 5.1699, + "step": 24104 + }, + { + "epoch": 0.14335926348843847, + "grad_norm": 1.2421483993530273, + "learning_rate": 4.7507286166782136e-05, + "loss": 5.3811, + "step": 24105 + }, + { + "epoch": 0.1433652107717195, + "grad_norm": 1.430109977722168, + "learning_rate": 4.750708284064389e-05, + "loss": 5.3169, + "step": 24106 + }, + { + "epoch": 0.14337115805500047, + "grad_norm": 1.4107475280761719, + "learning_rate": 4.750687950664865e-05, + "loss": 5.1744, + "step": 24107 + }, + { + "epoch": 0.14337710533828146, + "grad_norm": 1.4888633489608765, + "learning_rate": 4.750667616479649e-05, + "loss": 5.0892, + "step": 24108 + }, + { + "epoch": 0.14338305262156248, + "grad_norm": 1.5325970649719238, + "learning_rate": 4.7506472815087486e-05, + "loss": 4.8421, + "step": 24109 + }, + { + "epoch": 0.14338899990484347, + "grad_norm": 1.806287407875061, + "learning_rate": 4.75062694575217e-05, + "loss": 5.459, + "step": 24110 + }, + { + "epoch": 0.14339494718812446, + "grad_norm": 1.8281558752059937, + "learning_rate": 4.750606609209921e-05, + "loss": 4.7275, + "step": 24111 + }, + { + "epoch": 0.14340089447140547, + "grad_norm": 1.3527547121047974, + "learning_rate": 4.750586271882009e-05, + "loss": 5.4797, + "step": 24112 + }, + { + "epoch": 0.14340684175468646, + "grad_norm": 1.719956874847412, + "learning_rate": 4.75056593376844e-05, + "loss": 5.1069, + "step": 24113 + }, + { + "epoch": 0.14341278903796745, + "grad_norm": 1.484231948852539, + "learning_rate": 4.750545594869222e-05, + "loss": 5.2246, + "step": 24114 + }, + { + "epoch": 0.14341873632124846, + "grad_norm": 1.7525322437286377, + "learning_rate": 4.7505252551843615e-05, + "loss": 5.2036, + "step": 24115 + }, + { + "epoch": 0.14342468360452945, + "grad_norm": 1.6943596601486206, + "learning_rate": 4.7505049147138656e-05, + "loss": 5.6783, + "step": 24116 + }, + { + "epoch": 0.14343063088781044, + "grad_norm": 1.619377851486206, + "learning_rate": 4.750484573457743e-05, + "loss": 5.4861, + "step": 24117 + }, + { + "epoch": 0.14343657817109146, + "grad_norm": 1.9882891178131104, + "learning_rate": 4.750464231415998e-05, + "loss": 5.1085, + "step": 24118 + }, + { + "epoch": 0.14344252545437244, + "grad_norm": 1.4033042192459106, + "learning_rate": 4.75044388858864e-05, + "loss": 5.2776, + "step": 24119 + }, + { + "epoch": 0.14344847273765343, + "grad_norm": 1.2633885145187378, + "learning_rate": 4.750423544975675e-05, + "loss": 5.3406, + "step": 24120 + }, + { + "epoch": 0.14345442002093445, + "grad_norm": 1.4787468910217285, + "learning_rate": 4.7504032005771105e-05, + "loss": 5.5417, + "step": 24121 + }, + { + "epoch": 0.14346036730421544, + "grad_norm": 1.6677738428115845, + "learning_rate": 4.750382855392953e-05, + "loss": 5.39, + "step": 24122 + }, + { + "epoch": 0.14346631458749642, + "grad_norm": 1.6277536153793335, + "learning_rate": 4.750362509423211e-05, + "loss": 5.443, + "step": 24123 + }, + { + "epoch": 0.14347226187077744, + "grad_norm": 1.7157353162765503, + "learning_rate": 4.75034216266789e-05, + "loss": 5.6696, + "step": 24124 + }, + { + "epoch": 0.14347820915405843, + "grad_norm": 1.6321076154708862, + "learning_rate": 4.750321815126998e-05, + "loss": 5.4125, + "step": 24125 + }, + { + "epoch": 0.14348415643733942, + "grad_norm": 1.3769804239273071, + "learning_rate": 4.750301466800542e-05, + "loss": 5.5333, + "step": 24126 + }, + { + "epoch": 0.14349010372062043, + "grad_norm": 1.6320770978927612, + "learning_rate": 4.7502811176885286e-05, + "loss": 5.062, + "step": 24127 + }, + { + "epoch": 0.14349605100390142, + "grad_norm": 1.8570098876953125, + "learning_rate": 4.750260767790966e-05, + "loss": 4.8349, + "step": 24128 + }, + { + "epoch": 0.1435019982871824, + "grad_norm": 1.6399726867675781, + "learning_rate": 4.7502404171078604e-05, + "loss": 5.0899, + "step": 24129 + }, + { + "epoch": 0.14350794557046342, + "grad_norm": 1.6327539682388306, + "learning_rate": 4.7502200656392184e-05, + "loss": 5.4722, + "step": 24130 + }, + { + "epoch": 0.1435138928537444, + "grad_norm": 1.887136697769165, + "learning_rate": 4.750199713385048e-05, + "loss": 5.2569, + "step": 24131 + }, + { + "epoch": 0.1435198401370254, + "grad_norm": 1.8090238571166992, + "learning_rate": 4.750179360345357e-05, + "loss": 5.252, + "step": 24132 + }, + { + "epoch": 0.14352578742030642, + "grad_norm": 1.7913198471069336, + "learning_rate": 4.750159006520152e-05, + "loss": 5.2661, + "step": 24133 + }, + { + "epoch": 0.1435317347035874, + "grad_norm": 2.239309310913086, + "learning_rate": 4.7501386519094385e-05, + "loss": 5.1478, + "step": 24134 + }, + { + "epoch": 0.1435376819868684, + "grad_norm": 2.179140090942383, + "learning_rate": 4.750118296513225e-05, + "loss": 4.9088, + "step": 24135 + }, + { + "epoch": 0.1435436292701494, + "grad_norm": 1.629287838935852, + "learning_rate": 4.7500979403315186e-05, + "loss": 5.0642, + "step": 24136 + }, + { + "epoch": 0.1435495765534304, + "grad_norm": 1.598783254623413, + "learning_rate": 4.750077583364326e-05, + "loss": 5.7616, + "step": 24137 + }, + { + "epoch": 0.14355552383671139, + "grad_norm": 1.792859435081482, + "learning_rate": 4.750057225611656e-05, + "loss": 6.1022, + "step": 24138 + }, + { + "epoch": 0.1435614711199924, + "grad_norm": 1.728210687637329, + "learning_rate": 4.750036867073513e-05, + "loss": 5.904, + "step": 24139 + }, + { + "epoch": 0.1435674184032734, + "grad_norm": 1.9541816711425781, + "learning_rate": 4.7500165077499056e-05, + "loss": 5.3199, + "step": 24140 + }, + { + "epoch": 0.14357336568655438, + "grad_norm": 1.6042431592941284, + "learning_rate": 4.7499961476408405e-05, + "loss": 5.5277, + "step": 24141 + }, + { + "epoch": 0.14357931296983537, + "grad_norm": 1.50521719455719, + "learning_rate": 4.749975786746325e-05, + "loss": 5.4995, + "step": 24142 + }, + { + "epoch": 0.14358526025311638, + "grad_norm": 1.2425066232681274, + "learning_rate": 4.749955425066366e-05, + "loss": 5.6135, + "step": 24143 + }, + { + "epoch": 0.14359120753639737, + "grad_norm": 1.3020912408828735, + "learning_rate": 4.749935062600971e-05, + "loss": 5.5885, + "step": 24144 + }, + { + "epoch": 0.14359715481967836, + "grad_norm": 1.8732852935791016, + "learning_rate": 4.749914699350148e-05, + "loss": 5.3004, + "step": 24145 + }, + { + "epoch": 0.14360310210295937, + "grad_norm": 1.5296770334243774, + "learning_rate": 4.749894335313901e-05, + "loss": 5.5932, + "step": 24146 + }, + { + "epoch": 0.14360904938624036, + "grad_norm": 1.6563706398010254, + "learning_rate": 4.749873970492241e-05, + "loss": 5.4436, + "step": 24147 + }, + { + "epoch": 0.14361499666952135, + "grad_norm": 1.5168625116348267, + "learning_rate": 4.749853604885172e-05, + "loss": 5.5198, + "step": 24148 + }, + { + "epoch": 0.14362094395280237, + "grad_norm": 1.8161656856536865, + "learning_rate": 4.749833238492703e-05, + "loss": 5.3261, + "step": 24149 + }, + { + "epoch": 0.14362689123608335, + "grad_norm": 1.6286919116973877, + "learning_rate": 4.749812871314841e-05, + "loss": 5.3505, + "step": 24150 + }, + { + "epoch": 0.14363283851936434, + "grad_norm": 1.6236040592193604, + "learning_rate": 4.749792503351591e-05, + "loss": 5.4271, + "step": 24151 + }, + { + "epoch": 0.14363878580264536, + "grad_norm": 1.8177775144577026, + "learning_rate": 4.749772134602963e-05, + "loss": 5.2076, + "step": 24152 + }, + { + "epoch": 0.14364473308592635, + "grad_norm": 1.8818564414978027, + "learning_rate": 4.7497517650689616e-05, + "loss": 5.2685, + "step": 24153 + }, + { + "epoch": 0.14365068036920733, + "grad_norm": 1.7166740894317627, + "learning_rate": 4.749731394749596e-05, + "loss": 5.0742, + "step": 24154 + }, + { + "epoch": 0.14365662765248835, + "grad_norm": 1.6446893215179443, + "learning_rate": 4.749711023644873e-05, + "loss": 5.0406, + "step": 24155 + }, + { + "epoch": 0.14366257493576934, + "grad_norm": 1.5812546014785767, + "learning_rate": 4.749690651754798e-05, + "loss": 5.1155, + "step": 24156 + }, + { + "epoch": 0.14366852221905033, + "grad_norm": 1.8002673387527466, + "learning_rate": 4.749670279079379e-05, + "loss": 4.8509, + "step": 24157 + }, + { + "epoch": 0.14367446950233134, + "grad_norm": 1.6835267543792725, + "learning_rate": 4.749649905618624e-05, + "loss": 4.8694, + "step": 24158 + }, + { + "epoch": 0.14368041678561233, + "grad_norm": 1.605454683303833, + "learning_rate": 4.74962953137254e-05, + "loss": 4.926, + "step": 24159 + }, + { + "epoch": 0.14368636406889332, + "grad_norm": 1.6154637336730957, + "learning_rate": 4.749609156341133e-05, + "loss": 5.0548, + "step": 24160 + }, + { + "epoch": 0.14369231135217433, + "grad_norm": 1.7472615242004395, + "learning_rate": 4.74958878052441e-05, + "loss": 5.2218, + "step": 24161 + }, + { + "epoch": 0.14369825863545532, + "grad_norm": 1.80000901222229, + "learning_rate": 4.7495684039223795e-05, + "loss": 5.5268, + "step": 24162 + }, + { + "epoch": 0.1437042059187363, + "grad_norm": 1.6673831939697266, + "learning_rate": 4.749548026535048e-05, + "loss": 4.9823, + "step": 24163 + }, + { + "epoch": 0.14371015320201733, + "grad_norm": 1.5900602340698242, + "learning_rate": 4.749527648362422e-05, + "loss": 4.9122, + "step": 24164 + }, + { + "epoch": 0.14371610048529831, + "grad_norm": 1.538674235343933, + "learning_rate": 4.74950726940451e-05, + "loss": 4.887, + "step": 24165 + }, + { + "epoch": 0.1437220477685793, + "grad_norm": 1.5512803792953491, + "learning_rate": 4.749486889661318e-05, + "loss": 5.106, + "step": 24166 + }, + { + "epoch": 0.14372799505186032, + "grad_norm": 1.6589990854263306, + "learning_rate": 4.7494665091328524e-05, + "loss": 5.1019, + "step": 24167 + }, + { + "epoch": 0.1437339423351413, + "grad_norm": 1.3078352212905884, + "learning_rate": 4.7494461278191225e-05, + "loss": 5.5803, + "step": 24168 + }, + { + "epoch": 0.1437398896184223, + "grad_norm": 1.2839313745498657, + "learning_rate": 4.7494257457201333e-05, + "loss": 5.2538, + "step": 24169 + }, + { + "epoch": 0.1437458369017033, + "grad_norm": 1.6686280965805054, + "learning_rate": 4.749405362835894e-05, + "loss": 4.6737, + "step": 24170 + }, + { + "epoch": 0.1437517841849843, + "grad_norm": 1.6385589838027954, + "learning_rate": 4.7493849791664094e-05, + "loss": 5.3224, + "step": 24171 + }, + { + "epoch": 0.1437577314682653, + "grad_norm": 1.5661671161651611, + "learning_rate": 4.749364594711688e-05, + "loss": 5.4675, + "step": 24172 + }, + { + "epoch": 0.1437636787515463, + "grad_norm": 1.481903314590454, + "learning_rate": 4.749344209471737e-05, + "loss": 5.6801, + "step": 24173 + }, + { + "epoch": 0.1437696260348273, + "grad_norm": 1.6317354440689087, + "learning_rate": 4.749323823446562e-05, + "loss": 5.2531, + "step": 24174 + }, + { + "epoch": 0.14377557331810828, + "grad_norm": 1.7542403936386108, + "learning_rate": 4.749303436636173e-05, + "loss": 4.9242, + "step": 24175 + }, + { + "epoch": 0.1437815206013893, + "grad_norm": 1.7798454761505127, + "learning_rate": 4.7492830490405746e-05, + "loss": 5.0939, + "step": 24176 + }, + { + "epoch": 0.14378746788467028, + "grad_norm": 1.3787469863891602, + "learning_rate": 4.7492626606597744e-05, + "loss": 5.2257, + "step": 24177 + }, + { + "epoch": 0.14379341516795127, + "grad_norm": 1.7178335189819336, + "learning_rate": 4.7492422714937806e-05, + "loss": 4.5083, + "step": 24178 + }, + { + "epoch": 0.1437993624512323, + "grad_norm": 1.559964656829834, + "learning_rate": 4.7492218815425996e-05, + "loss": 5.3788, + "step": 24179 + }, + { + "epoch": 0.14380530973451328, + "grad_norm": 3.269479990005493, + "learning_rate": 4.749201490806238e-05, + "loss": 4.0238, + "step": 24180 + }, + { + "epoch": 0.14381125701779426, + "grad_norm": 1.696169137954712, + "learning_rate": 4.749181099284703e-05, + "loss": 5.7992, + "step": 24181 + }, + { + "epoch": 0.14381720430107528, + "grad_norm": 1.563265085220337, + "learning_rate": 4.749160706978003e-05, + "loss": 5.2459, + "step": 24182 + }, + { + "epoch": 0.14382315158435627, + "grad_norm": 1.6364827156066895, + "learning_rate": 4.7491403138861435e-05, + "loss": 5.2826, + "step": 24183 + }, + { + "epoch": 0.14382909886763726, + "grad_norm": 1.82567298412323, + "learning_rate": 4.749119920009132e-05, + "loss": 4.8079, + "step": 24184 + }, + { + "epoch": 0.14383504615091827, + "grad_norm": 1.3982584476470947, + "learning_rate": 4.7490995253469774e-05, + "loss": 5.4093, + "step": 24185 + }, + { + "epoch": 0.14384099343419926, + "grad_norm": 1.349155068397522, + "learning_rate": 4.749079129899684e-05, + "loss": 5.3707, + "step": 24186 + }, + { + "epoch": 0.14384694071748025, + "grad_norm": 1.4101881980895996, + "learning_rate": 4.749058733667261e-05, + "loss": 4.9554, + "step": 24187 + }, + { + "epoch": 0.14385288800076126, + "grad_norm": 1.1910806894302368, + "learning_rate": 4.749038336649715e-05, + "loss": 5.0658, + "step": 24188 + }, + { + "epoch": 0.14385883528404225, + "grad_norm": 1.5315760374069214, + "learning_rate": 4.749017938847052e-05, + "loss": 5.4716, + "step": 24189 + }, + { + "epoch": 0.14386478256732324, + "grad_norm": 1.1762129068374634, + "learning_rate": 4.7489975402592814e-05, + "loss": 5.6235, + "step": 24190 + }, + { + "epoch": 0.14387072985060426, + "grad_norm": 1.2317709922790527, + "learning_rate": 4.748977140886408e-05, + "loss": 5.8842, + "step": 24191 + }, + { + "epoch": 0.14387667713388524, + "grad_norm": 1.439610481262207, + "learning_rate": 4.7489567407284405e-05, + "loss": 5.4157, + "step": 24192 + }, + { + "epoch": 0.14388262441716623, + "grad_norm": 1.842933177947998, + "learning_rate": 4.7489363397853854e-05, + "loss": 5.1555, + "step": 24193 + }, + { + "epoch": 0.14388857170044725, + "grad_norm": 1.887911081314087, + "learning_rate": 4.748915938057249e-05, + "loss": 5.5591, + "step": 24194 + }, + { + "epoch": 0.14389451898372824, + "grad_norm": 1.7697376012802124, + "learning_rate": 4.7488955355440404e-05, + "loss": 5.5617, + "step": 24195 + }, + { + "epoch": 0.14390046626700922, + "grad_norm": 1.5946240425109863, + "learning_rate": 4.7488751322457655e-05, + "loss": 5.3901, + "step": 24196 + }, + { + "epoch": 0.14390641355029024, + "grad_norm": 1.7462904453277588, + "learning_rate": 4.7488547281624306e-05, + "loss": 5.3187, + "step": 24197 + }, + { + "epoch": 0.14391236083357123, + "grad_norm": 1.7388325929641724, + "learning_rate": 4.7488343232940445e-05, + "loss": 5.0042, + "step": 24198 + }, + { + "epoch": 0.14391830811685222, + "grad_norm": 1.5990902185440063, + "learning_rate": 4.7488139176406135e-05, + "loss": 5.1336, + "step": 24199 + }, + { + "epoch": 0.1439242554001332, + "grad_norm": 1.7063771486282349, + "learning_rate": 4.748793511202145e-05, + "loss": 5.6073, + "step": 24200 + }, + { + "epoch": 0.14393020268341422, + "grad_norm": 1.5042674541473389, + "learning_rate": 4.748773103978645e-05, + "loss": 5.6617, + "step": 24201 + }, + { + "epoch": 0.1439361499666952, + "grad_norm": 1.4366991519927979, + "learning_rate": 4.7487526959701225e-05, + "loss": 5.3679, + "step": 24202 + }, + { + "epoch": 0.1439420972499762, + "grad_norm": 1.571524977684021, + "learning_rate": 4.748732287176584e-05, + "loss": 5.5487, + "step": 24203 + }, + { + "epoch": 0.1439480445332572, + "grad_norm": 1.3584872484207153, + "learning_rate": 4.748711877598036e-05, + "loss": 5.3332, + "step": 24204 + }, + { + "epoch": 0.1439539918165382, + "grad_norm": 1.4718894958496094, + "learning_rate": 4.748691467234484e-05, + "loss": 5.3985, + "step": 24205 + }, + { + "epoch": 0.1439599390998192, + "grad_norm": 1.5978455543518066, + "learning_rate": 4.748671056085939e-05, + "loss": 5.6351, + "step": 24206 + }, + { + "epoch": 0.1439658863831002, + "grad_norm": 2.2037017345428467, + "learning_rate": 4.748650644152406e-05, + "loss": 4.9972, + "step": 24207 + }, + { + "epoch": 0.1439718336663812, + "grad_norm": 1.7493484020233154, + "learning_rate": 4.748630231433891e-05, + "loss": 4.8863, + "step": 24208 + }, + { + "epoch": 0.14397778094966218, + "grad_norm": 1.7967579364776611, + "learning_rate": 4.748609817930405e-05, + "loss": 5.5271, + "step": 24209 + }, + { + "epoch": 0.1439837282329432, + "grad_norm": 1.3049358129501343, + "learning_rate": 4.7485894036419505e-05, + "loss": 5.631, + "step": 24210 + }, + { + "epoch": 0.14398967551622419, + "grad_norm": 2.1333138942718506, + "learning_rate": 4.7485689885685366e-05, + "loss": 4.3777, + "step": 24211 + }, + { + "epoch": 0.14399562279950517, + "grad_norm": 1.7402033805847168, + "learning_rate": 4.748548572710172e-05, + "loss": 5.0069, + "step": 24212 + }, + { + "epoch": 0.1440015700827862, + "grad_norm": 1.5663232803344727, + "learning_rate": 4.748528156066861e-05, + "loss": 5.8514, + "step": 24213 + }, + { + "epoch": 0.14400751736606718, + "grad_norm": 1.5079457759857178, + "learning_rate": 4.748507738638612e-05, + "loss": 5.771, + "step": 24214 + }, + { + "epoch": 0.14401346464934817, + "grad_norm": 1.407939076423645, + "learning_rate": 4.7484873204254334e-05, + "loss": 5.405, + "step": 24215 + }, + { + "epoch": 0.14401941193262918, + "grad_norm": 1.6172797679901123, + "learning_rate": 4.7484669014273296e-05, + "loss": 5.3918, + "step": 24216 + }, + { + "epoch": 0.14402535921591017, + "grad_norm": 1.52508544921875, + "learning_rate": 4.74844648164431e-05, + "loss": 5.3287, + "step": 24217 + }, + { + "epoch": 0.14403130649919116, + "grad_norm": 1.6615005731582642, + "learning_rate": 4.7484260610763806e-05, + "loss": 5.3211, + "step": 24218 + }, + { + "epoch": 0.14403725378247217, + "grad_norm": 1.7896537780761719, + "learning_rate": 4.74840563972355e-05, + "loss": 5.3131, + "step": 24219 + }, + { + "epoch": 0.14404320106575316, + "grad_norm": 1.665890097618103, + "learning_rate": 4.748385217585823e-05, + "loss": 5.4934, + "step": 24220 + }, + { + "epoch": 0.14404914834903415, + "grad_norm": 1.9217110872268677, + "learning_rate": 4.7483647946632085e-05, + "loss": 4.9057, + "step": 24221 + }, + { + "epoch": 0.14405509563231517, + "grad_norm": 1.3658103942871094, + "learning_rate": 4.748344370955713e-05, + "loss": 5.3585, + "step": 24222 + }, + { + "epoch": 0.14406104291559615, + "grad_norm": 1.3099697828292847, + "learning_rate": 4.748323946463343e-05, + "loss": 5.7427, + "step": 24223 + }, + { + "epoch": 0.14406699019887714, + "grad_norm": 1.5619271993637085, + "learning_rate": 4.7483035211861075e-05, + "loss": 5.4217, + "step": 24224 + }, + { + "epoch": 0.14407293748215816, + "grad_norm": 1.6359944343566895, + "learning_rate": 4.748283095124012e-05, + "loss": 5.0194, + "step": 24225 + }, + { + "epoch": 0.14407888476543915, + "grad_norm": 1.5773736238479614, + "learning_rate": 4.748262668277064e-05, + "loss": 5.0422, + "step": 24226 + }, + { + "epoch": 0.14408483204872014, + "grad_norm": 1.4909980297088623, + "learning_rate": 4.748242240645271e-05, + "loss": 5.6089, + "step": 24227 + }, + { + "epoch": 0.14409077933200115, + "grad_norm": 1.3489822149276733, + "learning_rate": 4.74822181222864e-05, + "loss": 5.6137, + "step": 24228 + }, + { + "epoch": 0.14409672661528214, + "grad_norm": 1.3335795402526855, + "learning_rate": 4.748201383027178e-05, + "loss": 5.4704, + "step": 24229 + }, + { + "epoch": 0.14410267389856313, + "grad_norm": 1.2519936561584473, + "learning_rate": 4.748180953040891e-05, + "loss": 5.5211, + "step": 24230 + }, + { + "epoch": 0.14410862118184414, + "grad_norm": 1.3223121166229248, + "learning_rate": 4.748160522269788e-05, + "loss": 5.897, + "step": 24231 + }, + { + "epoch": 0.14411456846512513, + "grad_norm": 1.3471014499664307, + "learning_rate": 4.748140090713876e-05, + "loss": 5.5012, + "step": 24232 + }, + { + "epoch": 0.14412051574840612, + "grad_norm": 1.7432321310043335, + "learning_rate": 4.74811965837316e-05, + "loss": 5.5286, + "step": 24233 + }, + { + "epoch": 0.14412646303168714, + "grad_norm": 1.4858758449554443, + "learning_rate": 4.74809922524765e-05, + "loss": 5.0719, + "step": 24234 + }, + { + "epoch": 0.14413241031496812, + "grad_norm": 1.3750518560409546, + "learning_rate": 4.7480787913373515e-05, + "loss": 5.63, + "step": 24235 + }, + { + "epoch": 0.1441383575982491, + "grad_norm": 1.3795223236083984, + "learning_rate": 4.7480583566422723e-05, + "loss": 5.5985, + "step": 24236 + }, + { + "epoch": 0.14414430488153013, + "grad_norm": 1.5779204368591309, + "learning_rate": 4.7480379211624185e-05, + "loss": 5.4503, + "step": 24237 + }, + { + "epoch": 0.14415025216481112, + "grad_norm": 1.5513705015182495, + "learning_rate": 4.7480174848977974e-05, + "loss": 5.6559, + "step": 24238 + }, + { + "epoch": 0.1441561994480921, + "grad_norm": 1.3171751499176025, + "learning_rate": 4.747997047848417e-05, + "loss": 5.7664, + "step": 24239 + }, + { + "epoch": 0.14416214673137312, + "grad_norm": 1.4049638509750366, + "learning_rate": 4.7479766100142855e-05, + "loss": 5.7167, + "step": 24240 + }, + { + "epoch": 0.1441680940146541, + "grad_norm": 1.5657798051834106, + "learning_rate": 4.747956171395407e-05, + "loss": 5.3544, + "step": 24241 + }, + { + "epoch": 0.1441740412979351, + "grad_norm": 1.7015857696533203, + "learning_rate": 4.747935731991791e-05, + "loss": 5.2192, + "step": 24242 + }, + { + "epoch": 0.1441799885812161, + "grad_norm": 1.396626591682434, + "learning_rate": 4.7479152918034433e-05, + "loss": 5.6169, + "step": 24243 + }, + { + "epoch": 0.1441859358644971, + "grad_norm": 1.5319141149520874, + "learning_rate": 4.7478948508303714e-05, + "loss": 5.5103, + "step": 24244 + }, + { + "epoch": 0.1441918831477781, + "grad_norm": 1.878131628036499, + "learning_rate": 4.747874409072583e-05, + "loss": 5.0926, + "step": 24245 + }, + { + "epoch": 0.1441978304310591, + "grad_norm": 1.3702614307403564, + "learning_rate": 4.7478539665300845e-05, + "loss": 5.5891, + "step": 24246 + }, + { + "epoch": 0.1442037777143401, + "grad_norm": 1.729227066040039, + "learning_rate": 4.7478335232028845e-05, + "loss": 5.4893, + "step": 24247 + }, + { + "epoch": 0.14420972499762108, + "grad_norm": 1.356343150138855, + "learning_rate": 4.747813079090988e-05, + "loss": 5.3913, + "step": 24248 + }, + { + "epoch": 0.1442156722809021, + "grad_norm": 1.6735188961029053, + "learning_rate": 4.7477926341944036e-05, + "loss": 5.1161, + "step": 24249 + }, + { + "epoch": 0.14422161956418308, + "grad_norm": 1.6281756162643433, + "learning_rate": 4.7477721885131376e-05, + "loss": 5.0971, + "step": 24250 + }, + { + "epoch": 0.14422756684746407, + "grad_norm": 1.789338231086731, + "learning_rate": 4.747751742047199e-05, + "loss": 5.0477, + "step": 24251 + }, + { + "epoch": 0.1442335141307451, + "grad_norm": 2.3384926319122314, + "learning_rate": 4.7477312947965915e-05, + "loss": 4.5108, + "step": 24252 + }, + { + "epoch": 0.14423946141402608, + "grad_norm": 2.1642465591430664, + "learning_rate": 4.7477108467613255e-05, + "loss": 4.6503, + "step": 24253 + }, + { + "epoch": 0.14424540869730706, + "grad_norm": 2.0242364406585693, + "learning_rate": 4.747690397941406e-05, + "loss": 4.7346, + "step": 24254 + }, + { + "epoch": 0.14425135598058808, + "grad_norm": 2.543030023574829, + "learning_rate": 4.7476699483368414e-05, + "loss": 4.4076, + "step": 24255 + }, + { + "epoch": 0.14425730326386907, + "grad_norm": 2.274937391281128, + "learning_rate": 4.747649497947638e-05, + "loss": 4.5464, + "step": 24256 + }, + { + "epoch": 0.14426325054715006, + "grad_norm": 2.695321798324585, + "learning_rate": 4.747629046773805e-05, + "loss": 4.5794, + "step": 24257 + }, + { + "epoch": 0.14426919783043107, + "grad_norm": 2.2838776111602783, + "learning_rate": 4.7476085948153465e-05, + "loss": 4.6079, + "step": 24258 + }, + { + "epoch": 0.14427514511371206, + "grad_norm": 2.1405718326568604, + "learning_rate": 4.7475881420722714e-05, + "loss": 4.4428, + "step": 24259 + }, + { + "epoch": 0.14428109239699305, + "grad_norm": 2.17814302444458, + "learning_rate": 4.747567688544586e-05, + "loss": 4.3945, + "step": 24260 + }, + { + "epoch": 0.14428703968027404, + "grad_norm": 2.24731183052063, + "learning_rate": 4.747547234232299e-05, + "loss": 4.4622, + "step": 24261 + }, + { + "epoch": 0.14429298696355505, + "grad_norm": 2.2340478897094727, + "learning_rate": 4.747526779135416e-05, + "loss": 4.3968, + "step": 24262 + }, + { + "epoch": 0.14429893424683604, + "grad_norm": 2.1889898777008057, + "learning_rate": 4.747506323253944e-05, + "loss": 4.4357, + "step": 24263 + }, + { + "epoch": 0.14430488153011703, + "grad_norm": 2.30887770652771, + "learning_rate": 4.747485866587891e-05, + "loss": 4.3798, + "step": 24264 + }, + { + "epoch": 0.14431082881339805, + "grad_norm": 1.8898377418518066, + "learning_rate": 4.7474654091372645e-05, + "loss": 4.759, + "step": 24265 + }, + { + "epoch": 0.14431677609667903, + "grad_norm": 1.8610650300979614, + "learning_rate": 4.747444950902071e-05, + "loss": 5.2619, + "step": 24266 + }, + { + "epoch": 0.14432272337996002, + "grad_norm": 2.0524682998657227, + "learning_rate": 4.747424491882317e-05, + "loss": 5.1975, + "step": 24267 + }, + { + "epoch": 0.14432867066324104, + "grad_norm": 1.9053709506988525, + "learning_rate": 4.7474040320780114e-05, + "loss": 4.9233, + "step": 24268 + }, + { + "epoch": 0.14433461794652203, + "grad_norm": 1.8127448558807373, + "learning_rate": 4.747383571489159e-05, + "loss": 5.4335, + "step": 24269 + }, + { + "epoch": 0.14434056522980301, + "grad_norm": 1.6836609840393066, + "learning_rate": 4.747363110115769e-05, + "loss": 5.3978, + "step": 24270 + }, + { + "epoch": 0.14434651251308403, + "grad_norm": 1.5606380701065063, + "learning_rate": 4.747342647957848e-05, + "loss": 5.4756, + "step": 24271 + }, + { + "epoch": 0.14435245979636502, + "grad_norm": 1.5684814453125, + "learning_rate": 4.747322185015402e-05, + "loss": 5.2942, + "step": 24272 + }, + { + "epoch": 0.144358407079646, + "grad_norm": 1.4253596067428589, + "learning_rate": 4.7473017212884395e-05, + "loss": 5.3061, + "step": 24273 + }, + { + "epoch": 0.14436435436292702, + "grad_norm": 1.5249817371368408, + "learning_rate": 4.747281256776968e-05, + "loss": 5.2824, + "step": 24274 + }, + { + "epoch": 0.144370301646208, + "grad_norm": 1.7111622095108032, + "learning_rate": 4.747260791480992e-05, + "loss": 5.3591, + "step": 24275 + }, + { + "epoch": 0.144376248929489, + "grad_norm": 1.6259697675704956, + "learning_rate": 4.7472403254005216e-05, + "loss": 5.6083, + "step": 24276 + }, + { + "epoch": 0.14438219621277001, + "grad_norm": 1.7138687372207642, + "learning_rate": 4.7472198585355634e-05, + "loss": 5.45, + "step": 24277 + }, + { + "epoch": 0.144388143496051, + "grad_norm": 1.55049729347229, + "learning_rate": 4.7471993908861226e-05, + "loss": 5.413, + "step": 24278 + }, + { + "epoch": 0.144394090779332, + "grad_norm": 1.619774580001831, + "learning_rate": 4.7471789224522086e-05, + "loss": 5.4499, + "step": 24279 + }, + { + "epoch": 0.144400038062613, + "grad_norm": 1.4726954698562622, + "learning_rate": 4.747158453233828e-05, + "loss": 5.3787, + "step": 24280 + }, + { + "epoch": 0.144405985345894, + "grad_norm": 1.5688132047653198, + "learning_rate": 4.7471379832309865e-05, + "loss": 5.0952, + "step": 24281 + }, + { + "epoch": 0.14441193262917498, + "grad_norm": 1.5431749820709229, + "learning_rate": 4.747117512443693e-05, + "loss": 5.4646, + "step": 24282 + }, + { + "epoch": 0.144417879912456, + "grad_norm": 1.5271220207214355, + "learning_rate": 4.747097040871954e-05, + "loss": 4.7074, + "step": 24283 + }, + { + "epoch": 0.144423827195737, + "grad_norm": 1.49335777759552, + "learning_rate": 4.7470765685157765e-05, + "loss": 5.1271, + "step": 24284 + }, + { + "epoch": 0.14442977447901797, + "grad_norm": 1.624834418296814, + "learning_rate": 4.7470560953751675e-05, + "loss": 4.7448, + "step": 24285 + }, + { + "epoch": 0.144435721762299, + "grad_norm": 1.4151476621627808, + "learning_rate": 4.7470356214501355e-05, + "loss": 5.2011, + "step": 24286 + }, + { + "epoch": 0.14444166904557998, + "grad_norm": 1.4529035091400146, + "learning_rate": 4.747015146740685e-05, + "loss": 5.2849, + "step": 24287 + }, + { + "epoch": 0.14444761632886097, + "grad_norm": 1.43472158908844, + "learning_rate": 4.746994671246826e-05, + "loss": 5.2655, + "step": 24288 + }, + { + "epoch": 0.14445356361214198, + "grad_norm": 1.2202403545379639, + "learning_rate": 4.7469741949685645e-05, + "loss": 5.3629, + "step": 24289 + }, + { + "epoch": 0.14445951089542297, + "grad_norm": 1.5001815557479858, + "learning_rate": 4.746953717905906e-05, + "loss": 5.3728, + "step": 24290 + }, + { + "epoch": 0.14446545817870396, + "grad_norm": 1.3214131593704224, + "learning_rate": 4.7469332400588604e-05, + "loss": 5.2877, + "step": 24291 + }, + { + "epoch": 0.14447140546198498, + "grad_norm": 1.5443751811981201, + "learning_rate": 4.7469127614274334e-05, + "loss": 5.2852, + "step": 24292 + }, + { + "epoch": 0.14447735274526596, + "grad_norm": 1.63779616355896, + "learning_rate": 4.746892282011632e-05, + "loss": 5.1985, + "step": 24293 + }, + { + "epoch": 0.14448330002854695, + "grad_norm": 1.4744620323181152, + "learning_rate": 4.7468718018114644e-05, + "loss": 5.4607, + "step": 24294 + }, + { + "epoch": 0.14448924731182797, + "grad_norm": 1.6099250316619873, + "learning_rate": 4.7468513208269366e-05, + "loss": 5.3546, + "step": 24295 + }, + { + "epoch": 0.14449519459510896, + "grad_norm": 1.692960262298584, + "learning_rate": 4.746830839058056e-05, + "loss": 5.2117, + "step": 24296 + }, + { + "epoch": 0.14450114187838994, + "grad_norm": 2.379516363143921, + "learning_rate": 4.746810356504831e-05, + "loss": 4.3924, + "step": 24297 + }, + { + "epoch": 0.14450708916167096, + "grad_norm": 1.5348504781723022, + "learning_rate": 4.7467898731672665e-05, + "loss": 5.556, + "step": 24298 + }, + { + "epoch": 0.14451303644495195, + "grad_norm": 1.65830397605896, + "learning_rate": 4.746769389045371e-05, + "loss": 5.26, + "step": 24299 + }, + { + "epoch": 0.14451898372823294, + "grad_norm": 1.6785426139831543, + "learning_rate": 4.746748904139152e-05, + "loss": 5.6459, + "step": 24300 + }, + { + "epoch": 0.14452493101151395, + "grad_norm": 1.8990434408187866, + "learning_rate": 4.746728418448616e-05, + "loss": 5.7021, + "step": 24301 + }, + { + "epoch": 0.14453087829479494, + "grad_norm": 1.5564841032028198, + "learning_rate": 4.7467079319737706e-05, + "loss": 5.1878, + "step": 24302 + }, + { + "epoch": 0.14453682557807593, + "grad_norm": 1.5522741079330444, + "learning_rate": 4.7466874447146226e-05, + "loss": 5.356, + "step": 24303 + }, + { + "epoch": 0.14454277286135694, + "grad_norm": 1.5835893154144287, + "learning_rate": 4.746666956671179e-05, + "loss": 5.1861, + "step": 24304 + }, + { + "epoch": 0.14454872014463793, + "grad_norm": 1.5514174699783325, + "learning_rate": 4.746646467843447e-05, + "loss": 4.9673, + "step": 24305 + }, + { + "epoch": 0.14455466742791892, + "grad_norm": 1.5682575702667236, + "learning_rate": 4.746625978231435e-05, + "loss": 4.8175, + "step": 24306 + }, + { + "epoch": 0.14456061471119994, + "grad_norm": 1.7369959354400635, + "learning_rate": 4.746605487835148e-05, + "loss": 4.8891, + "step": 24307 + }, + { + "epoch": 0.14456656199448092, + "grad_norm": 1.5230990648269653, + "learning_rate": 4.7465849966545945e-05, + "loss": 4.7425, + "step": 24308 + }, + { + "epoch": 0.1445725092777619, + "grad_norm": 1.696858525276184, + "learning_rate": 4.7465645046897814e-05, + "loss": 5.2665, + "step": 24309 + }, + { + "epoch": 0.14457845656104293, + "grad_norm": 1.3940263986587524, + "learning_rate": 4.7465440119407153e-05, + "loss": 4.9574, + "step": 24310 + }, + { + "epoch": 0.14458440384432392, + "grad_norm": 1.6118072271347046, + "learning_rate": 4.7465235184074046e-05, + "loss": 4.6531, + "step": 24311 + }, + { + "epoch": 0.1445903511276049, + "grad_norm": 1.671982765197754, + "learning_rate": 4.746503024089856e-05, + "loss": 4.6481, + "step": 24312 + }, + { + "epoch": 0.14459629841088592, + "grad_norm": 1.74351167678833, + "learning_rate": 4.746482528988076e-05, + "loss": 4.6964, + "step": 24313 + }, + { + "epoch": 0.1446022456941669, + "grad_norm": 1.8614739179611206, + "learning_rate": 4.746462033102072e-05, + "loss": 4.6784, + "step": 24314 + }, + { + "epoch": 0.1446081929774479, + "grad_norm": 1.4908361434936523, + "learning_rate": 4.746441536431851e-05, + "loss": 4.5367, + "step": 24315 + }, + { + "epoch": 0.1446141402607289, + "grad_norm": 1.6088496446609497, + "learning_rate": 4.746421038977421e-05, + "loss": 4.6425, + "step": 24316 + }, + { + "epoch": 0.1446200875440099, + "grad_norm": 1.674081563949585, + "learning_rate": 4.746400540738789e-05, + "loss": 4.4158, + "step": 24317 + }, + { + "epoch": 0.1446260348272909, + "grad_norm": 1.8151460886001587, + "learning_rate": 4.746380041715961e-05, + "loss": 4.6386, + "step": 24318 + }, + { + "epoch": 0.14463198211057188, + "grad_norm": 1.9019746780395508, + "learning_rate": 4.7463595419089456e-05, + "loss": 5.501, + "step": 24319 + }, + { + "epoch": 0.1446379293938529, + "grad_norm": 1.4574391841888428, + "learning_rate": 4.746339041317749e-05, + "loss": 5.4025, + "step": 24320 + }, + { + "epoch": 0.14464387667713388, + "grad_norm": 1.6762443780899048, + "learning_rate": 4.746318539942378e-05, + "loss": 5.4696, + "step": 24321 + }, + { + "epoch": 0.14464982396041487, + "grad_norm": 1.6373367309570312, + "learning_rate": 4.746298037782841e-05, + "loss": 5.3375, + "step": 24322 + }, + { + "epoch": 0.14465577124369589, + "grad_norm": 2.50252103805542, + "learning_rate": 4.7462775348391455e-05, + "loss": 4.5236, + "step": 24323 + }, + { + "epoch": 0.14466171852697687, + "grad_norm": 2.569896936416626, + "learning_rate": 4.7462570311112965e-05, + "loss": 4.5617, + "step": 24324 + }, + { + "epoch": 0.14466766581025786, + "grad_norm": 2.6712963581085205, + "learning_rate": 4.7462365265993024e-05, + "loss": 4.552, + "step": 24325 + }, + { + "epoch": 0.14467361309353888, + "grad_norm": 2.3401951789855957, + "learning_rate": 4.7462160213031705e-05, + "loss": 4.306, + "step": 24326 + }, + { + "epoch": 0.14467956037681987, + "grad_norm": 2.5915024280548096, + "learning_rate": 4.746195515222908e-05, + "loss": 4.2392, + "step": 24327 + }, + { + "epoch": 0.14468550766010085, + "grad_norm": 1.6202619075775146, + "learning_rate": 4.746175008358522e-05, + "loss": 5.2185, + "step": 24328 + }, + { + "epoch": 0.14469145494338187, + "grad_norm": 1.3534679412841797, + "learning_rate": 4.746154500710019e-05, + "loss": 5.3462, + "step": 24329 + }, + { + "epoch": 0.14469740222666286, + "grad_norm": 1.6344646215438843, + "learning_rate": 4.746133992277407e-05, + "loss": 5.2465, + "step": 24330 + }, + { + "epoch": 0.14470334950994385, + "grad_norm": 1.4203686714172363, + "learning_rate": 4.7461134830606924e-05, + "loss": 5.3623, + "step": 24331 + }, + { + "epoch": 0.14470929679322486, + "grad_norm": 1.3993933200836182, + "learning_rate": 4.7460929730598834e-05, + "loss": 5.3726, + "step": 24332 + }, + { + "epoch": 0.14471524407650585, + "grad_norm": 1.804283618927002, + "learning_rate": 4.746072462274986e-05, + "loss": 4.8216, + "step": 24333 + }, + { + "epoch": 0.14472119135978684, + "grad_norm": 1.5801303386688232, + "learning_rate": 4.746051950706009e-05, + "loss": 5.1036, + "step": 24334 + }, + { + "epoch": 0.14472713864306785, + "grad_norm": 1.525907278060913, + "learning_rate": 4.746031438352957e-05, + "loss": 4.743, + "step": 24335 + }, + { + "epoch": 0.14473308592634884, + "grad_norm": 1.6091197729110718, + "learning_rate": 4.746010925215839e-05, + "loss": 5.347, + "step": 24336 + }, + { + "epoch": 0.14473903320962983, + "grad_norm": 1.6794999837875366, + "learning_rate": 4.7459904112946626e-05, + "loss": 4.7244, + "step": 24337 + }, + { + "epoch": 0.14474498049291085, + "grad_norm": 1.5076384544372559, + "learning_rate": 4.745969896589434e-05, + "loss": 4.591, + "step": 24338 + }, + { + "epoch": 0.14475092777619183, + "grad_norm": 1.6222561597824097, + "learning_rate": 4.74594938110016e-05, + "loss": 4.7175, + "step": 24339 + }, + { + "epoch": 0.14475687505947282, + "grad_norm": 1.6383036375045776, + "learning_rate": 4.745928864826848e-05, + "loss": 5.5165, + "step": 24340 + }, + { + "epoch": 0.14476282234275384, + "grad_norm": 1.4812443256378174, + "learning_rate": 4.745908347769507e-05, + "loss": 5.4227, + "step": 24341 + }, + { + "epoch": 0.14476876962603483, + "grad_norm": 1.4673051834106445, + "learning_rate": 4.7458878299281406e-05, + "loss": 5.1107, + "step": 24342 + }, + { + "epoch": 0.14477471690931581, + "grad_norm": 1.3475501537322998, + "learning_rate": 4.745867311302759e-05, + "loss": 4.9491, + "step": 24343 + }, + { + "epoch": 0.14478066419259683, + "grad_norm": 1.437537670135498, + "learning_rate": 4.745846791893368e-05, + "loss": 4.985, + "step": 24344 + }, + { + "epoch": 0.14478661147587782, + "grad_norm": 1.3732200860977173, + "learning_rate": 4.745826271699975e-05, + "loss": 4.9058, + "step": 24345 + }, + { + "epoch": 0.1447925587591588, + "grad_norm": 1.2727693319320679, + "learning_rate": 4.7458057507225875e-05, + "loss": 4.9508, + "step": 24346 + }, + { + "epoch": 0.14479850604243982, + "grad_norm": 1.0993971824645996, + "learning_rate": 4.7457852289612125e-05, + "loss": 5.125, + "step": 24347 + }, + { + "epoch": 0.1448044533257208, + "grad_norm": 1.325086236000061, + "learning_rate": 4.745764706415857e-05, + "loss": 5.4091, + "step": 24348 + }, + { + "epoch": 0.1448104006090018, + "grad_norm": 1.378989815711975, + "learning_rate": 4.745744183086528e-05, + "loss": 5.472, + "step": 24349 + }, + { + "epoch": 0.14481634789228282, + "grad_norm": 1.347360372543335, + "learning_rate": 4.745723658973233e-05, + "loss": 5.4071, + "step": 24350 + }, + { + "epoch": 0.1448222951755638, + "grad_norm": 1.367734670639038, + "learning_rate": 4.74570313407598e-05, + "loss": 5.3895, + "step": 24351 + }, + { + "epoch": 0.1448282424588448, + "grad_norm": 1.4136337041854858, + "learning_rate": 4.745682608394774e-05, + "loss": 5.4637, + "step": 24352 + }, + { + "epoch": 0.1448341897421258, + "grad_norm": 1.475825548171997, + "learning_rate": 4.745662081929624e-05, + "loss": 5.3135, + "step": 24353 + }, + { + "epoch": 0.1448401370254068, + "grad_norm": 1.6745150089263916, + "learning_rate": 4.745641554680537e-05, + "loss": 4.9959, + "step": 24354 + }, + { + "epoch": 0.14484608430868778, + "grad_norm": 1.7860320806503296, + "learning_rate": 4.7456210266475185e-05, + "loss": 5.054, + "step": 24355 + }, + { + "epoch": 0.1448520315919688, + "grad_norm": 1.456579327583313, + "learning_rate": 4.745600497830577e-05, + "loss": 5.2742, + "step": 24356 + }, + { + "epoch": 0.1448579788752498, + "grad_norm": 1.5492240190505981, + "learning_rate": 4.745579968229721e-05, + "loss": 5.0763, + "step": 24357 + }, + { + "epoch": 0.14486392615853078, + "grad_norm": 1.5172940492630005, + "learning_rate": 4.7455594378449554e-05, + "loss": 5.3951, + "step": 24358 + }, + { + "epoch": 0.1448698734418118, + "grad_norm": 1.5349613428115845, + "learning_rate": 4.7455389066762876e-05, + "loss": 5.5627, + "step": 24359 + }, + { + "epoch": 0.14487582072509278, + "grad_norm": 1.8341866731643677, + "learning_rate": 4.745518374723726e-05, + "loss": 5.2127, + "step": 24360 + }, + { + "epoch": 0.14488176800837377, + "grad_norm": 1.4852558374404907, + "learning_rate": 4.745497841987277e-05, + "loss": 5.2764, + "step": 24361 + }, + { + "epoch": 0.14488771529165478, + "grad_norm": 1.6629048585891724, + "learning_rate": 4.745477308466948e-05, + "loss": 5.0675, + "step": 24362 + }, + { + "epoch": 0.14489366257493577, + "grad_norm": 1.7459721565246582, + "learning_rate": 4.745456774162746e-05, + "loss": 5.0865, + "step": 24363 + }, + { + "epoch": 0.14489960985821676, + "grad_norm": 1.9257551431655884, + "learning_rate": 4.745436239074678e-05, + "loss": 4.9022, + "step": 24364 + }, + { + "epoch": 0.14490555714149778, + "grad_norm": 1.9146925210952759, + "learning_rate": 4.745415703202752e-05, + "loss": 5.3591, + "step": 24365 + }, + { + "epoch": 0.14491150442477876, + "grad_norm": 1.5624557733535767, + "learning_rate": 4.7453951665469744e-05, + "loss": 5.2383, + "step": 24366 + }, + { + "epoch": 0.14491745170805975, + "grad_norm": 1.4265660047531128, + "learning_rate": 4.745374629107352e-05, + "loss": 5.5559, + "step": 24367 + }, + { + "epoch": 0.14492339899134077, + "grad_norm": 2.072206497192383, + "learning_rate": 4.7453540908838934e-05, + "loss": 4.6001, + "step": 24368 + }, + { + "epoch": 0.14492934627462176, + "grad_norm": 2.144535779953003, + "learning_rate": 4.745333551876604e-05, + "loss": 4.6874, + "step": 24369 + }, + { + "epoch": 0.14493529355790274, + "grad_norm": 2.311624050140381, + "learning_rate": 4.745313012085492e-05, + "loss": 5.2824, + "step": 24370 + }, + { + "epoch": 0.14494124084118376, + "grad_norm": 1.6523234844207764, + "learning_rate": 4.745292471510565e-05, + "loss": 5.447, + "step": 24371 + }, + { + "epoch": 0.14494718812446475, + "grad_norm": 1.480470895767212, + "learning_rate": 4.745271930151829e-05, + "loss": 5.4511, + "step": 24372 + }, + { + "epoch": 0.14495313540774574, + "grad_norm": 1.6797810792922974, + "learning_rate": 4.7452513880092917e-05, + "loss": 5.258, + "step": 24373 + }, + { + "epoch": 0.14495908269102675, + "grad_norm": 1.541110873222351, + "learning_rate": 4.7452308450829615e-05, + "loss": 5.4877, + "step": 24374 + }, + { + "epoch": 0.14496502997430774, + "grad_norm": 1.8961621522903442, + "learning_rate": 4.745210301372843e-05, + "loss": 5.5844, + "step": 24375 + }, + { + "epoch": 0.14497097725758873, + "grad_norm": 1.8623143434524536, + "learning_rate": 4.745189756878945e-05, + "loss": 5.6454, + "step": 24376 + }, + { + "epoch": 0.14497692454086972, + "grad_norm": 1.6899166107177734, + "learning_rate": 4.745169211601276e-05, + "loss": 5.3369, + "step": 24377 + }, + { + "epoch": 0.14498287182415073, + "grad_norm": 1.7222342491149902, + "learning_rate": 4.74514866553984e-05, + "loss": 5.5805, + "step": 24378 + }, + { + "epoch": 0.14498881910743172, + "grad_norm": 1.7649940252304077, + "learning_rate": 4.745128118694646e-05, + "loss": 5.5568, + "step": 24379 + }, + { + "epoch": 0.1449947663907127, + "grad_norm": 1.9492725133895874, + "learning_rate": 4.745107571065701e-05, + "loss": 5.2019, + "step": 24380 + }, + { + "epoch": 0.14500071367399373, + "grad_norm": 1.6403963565826416, + "learning_rate": 4.745087022653013e-05, + "loss": 5.0867, + "step": 24381 + }, + { + "epoch": 0.1450066609572747, + "grad_norm": 1.6921762228012085, + "learning_rate": 4.7450664734565875e-05, + "loss": 4.823, + "step": 24382 + }, + { + "epoch": 0.1450126082405557, + "grad_norm": 1.7539616823196411, + "learning_rate": 4.745045923476432e-05, + "loss": 5.0268, + "step": 24383 + }, + { + "epoch": 0.14501855552383672, + "grad_norm": 1.7073962688446045, + "learning_rate": 4.745025372712555e-05, + "loss": 5.3082, + "step": 24384 + }, + { + "epoch": 0.1450245028071177, + "grad_norm": 1.457963228225708, + "learning_rate": 4.7450048211649626e-05, + "loss": 5.1926, + "step": 24385 + }, + { + "epoch": 0.1450304500903987, + "grad_norm": 1.7305623292922974, + "learning_rate": 4.744984268833662e-05, + "loss": 5.3563, + "step": 24386 + }, + { + "epoch": 0.1450363973736797, + "grad_norm": 1.4888592958450317, + "learning_rate": 4.744963715718661e-05, + "loss": 5.3478, + "step": 24387 + }, + { + "epoch": 0.1450423446569607, + "grad_norm": 1.7059639692306519, + "learning_rate": 4.744943161819966e-05, + "loss": 5.3782, + "step": 24388 + }, + { + "epoch": 0.14504829194024169, + "grad_norm": 1.539562463760376, + "learning_rate": 4.744922607137585e-05, + "loss": 5.4259, + "step": 24389 + }, + { + "epoch": 0.1450542392235227, + "grad_norm": 1.6427409648895264, + "learning_rate": 4.7449020516715245e-05, + "loss": 5.2505, + "step": 24390 + }, + { + "epoch": 0.1450601865068037, + "grad_norm": 1.5506988763809204, + "learning_rate": 4.7448814954217916e-05, + "loss": 5.214, + "step": 24391 + }, + { + "epoch": 0.14506613379008468, + "grad_norm": 1.7298414707183838, + "learning_rate": 4.744860938388395e-05, + "loss": 5.4361, + "step": 24392 + }, + { + "epoch": 0.1450720810733657, + "grad_norm": 1.6383203268051147, + "learning_rate": 4.744840380571339e-05, + "loss": 5.2703, + "step": 24393 + }, + { + "epoch": 0.14507802835664668, + "grad_norm": 1.6193071603775024, + "learning_rate": 4.744819821970633e-05, + "loss": 5.1414, + "step": 24394 + }, + { + "epoch": 0.14508397563992767, + "grad_norm": 1.6779429912567139, + "learning_rate": 4.7447992625862835e-05, + "loss": 5.1886, + "step": 24395 + }, + { + "epoch": 0.1450899229232087, + "grad_norm": 1.7938344478607178, + "learning_rate": 4.7447787024182975e-05, + "loss": 5.4733, + "step": 24396 + }, + { + "epoch": 0.14509587020648967, + "grad_norm": 1.7248293161392212, + "learning_rate": 4.7447581414666834e-05, + "loss": 5.484, + "step": 24397 + }, + { + "epoch": 0.14510181748977066, + "grad_norm": 1.7749347686767578, + "learning_rate": 4.744737579731447e-05, + "loss": 5.3896, + "step": 24398 + }, + { + "epoch": 0.14510776477305168, + "grad_norm": 1.6471116542816162, + "learning_rate": 4.744717017212595e-05, + "loss": 5.4622, + "step": 24399 + }, + { + "epoch": 0.14511371205633267, + "grad_norm": 1.7347856760025024, + "learning_rate": 4.7446964539101366e-05, + "loss": 5.5045, + "step": 24400 + }, + { + "epoch": 0.14511965933961365, + "grad_norm": 1.7716923952102661, + "learning_rate": 4.744675889824078e-05, + "loss": 5.3604, + "step": 24401 + }, + { + "epoch": 0.14512560662289467, + "grad_norm": 1.8484928607940674, + "learning_rate": 4.7446553249544253e-05, + "loss": 5.4746, + "step": 24402 + }, + { + "epoch": 0.14513155390617566, + "grad_norm": 1.7685359716415405, + "learning_rate": 4.7446347593011875e-05, + "loss": 5.4244, + "step": 24403 + }, + { + "epoch": 0.14513750118945665, + "grad_norm": 1.6140607595443726, + "learning_rate": 4.74461419286437e-05, + "loss": 5.4337, + "step": 24404 + }, + { + "epoch": 0.14514344847273766, + "grad_norm": 1.8425545692443848, + "learning_rate": 4.744593625643982e-05, + "loss": 4.7602, + "step": 24405 + }, + { + "epoch": 0.14514939575601865, + "grad_norm": 1.7787073850631714, + "learning_rate": 4.7445730576400284e-05, + "loss": 4.7792, + "step": 24406 + }, + { + "epoch": 0.14515534303929964, + "grad_norm": 1.7401658296585083, + "learning_rate": 4.7445524888525185e-05, + "loss": 5.1436, + "step": 24407 + }, + { + "epoch": 0.14516129032258066, + "grad_norm": 1.9028658866882324, + "learning_rate": 4.744531919281457e-05, + "loss": 5.2477, + "step": 24408 + }, + { + "epoch": 0.14516723760586164, + "grad_norm": 1.86017644405365, + "learning_rate": 4.7445113489268544e-05, + "loss": 5.593, + "step": 24409 + }, + { + "epoch": 0.14517318488914263, + "grad_norm": 1.551146149635315, + "learning_rate": 4.744490777788716e-05, + "loss": 5.7147, + "step": 24410 + }, + { + "epoch": 0.14517913217242365, + "grad_norm": 1.689828634262085, + "learning_rate": 4.744470205867048e-05, + "loss": 5.7174, + "step": 24411 + }, + { + "epoch": 0.14518507945570464, + "grad_norm": 1.6940490007400513, + "learning_rate": 4.744449633161859e-05, + "loss": 5.4586, + "step": 24412 + }, + { + "epoch": 0.14519102673898562, + "grad_norm": 1.6582127809524536, + "learning_rate": 4.7444290596731555e-05, + "loss": 5.4499, + "step": 24413 + }, + { + "epoch": 0.14519697402226664, + "grad_norm": 1.5289736986160278, + "learning_rate": 4.7444084854009454e-05, + "loss": 5.3323, + "step": 24414 + }, + { + "epoch": 0.14520292130554763, + "grad_norm": 1.597364068031311, + "learning_rate": 4.744387910345235e-05, + "loss": 5.2472, + "step": 24415 + }, + { + "epoch": 0.14520886858882862, + "grad_norm": 1.567718505859375, + "learning_rate": 4.7443673345060325e-05, + "loss": 5.1505, + "step": 24416 + }, + { + "epoch": 0.14521481587210963, + "grad_norm": 1.6296337842941284, + "learning_rate": 4.7443467578833446e-05, + "loss": 5.5358, + "step": 24417 + }, + { + "epoch": 0.14522076315539062, + "grad_norm": 1.5341614484786987, + "learning_rate": 4.744326180477179e-05, + "loss": 5.4139, + "step": 24418 + }, + { + "epoch": 0.1452267104386716, + "grad_norm": 1.6611801385879517, + "learning_rate": 4.744305602287541e-05, + "loss": 5.3999, + "step": 24419 + }, + { + "epoch": 0.14523265772195262, + "grad_norm": 1.4712778329849243, + "learning_rate": 4.74428502331444e-05, + "loss": 5.5498, + "step": 24420 + }, + { + "epoch": 0.1452386050052336, + "grad_norm": 1.6814862489700317, + "learning_rate": 4.744264443557882e-05, + "loss": 5.3511, + "step": 24421 + }, + { + "epoch": 0.1452445522885146, + "grad_norm": 1.512871265411377, + "learning_rate": 4.7442438630178746e-05, + "loss": 5.2377, + "step": 24422 + }, + { + "epoch": 0.14525049957179562, + "grad_norm": 1.4311738014221191, + "learning_rate": 4.744223281694424e-05, + "loss": 5.49, + "step": 24423 + }, + { + "epoch": 0.1452564468550766, + "grad_norm": 1.4469417333602905, + "learning_rate": 4.744202699587539e-05, + "loss": 5.2427, + "step": 24424 + }, + { + "epoch": 0.1452623941383576, + "grad_norm": 1.4444100856781006, + "learning_rate": 4.744182116697226e-05, + "loss": 5.263, + "step": 24425 + }, + { + "epoch": 0.1452683414216386, + "grad_norm": 1.4034851789474487, + "learning_rate": 4.744161533023492e-05, + "loss": 5.2735, + "step": 24426 + }, + { + "epoch": 0.1452742887049196, + "grad_norm": 1.637856364250183, + "learning_rate": 4.7441409485663444e-05, + "loss": 5.0982, + "step": 24427 + }, + { + "epoch": 0.14528023598820058, + "grad_norm": 1.7255091667175293, + "learning_rate": 4.7441203633257915e-05, + "loss": 4.9104, + "step": 24428 + }, + { + "epoch": 0.1452861832714816, + "grad_norm": 2.115915536880493, + "learning_rate": 4.744099777301838e-05, + "loss": 4.9661, + "step": 24429 + }, + { + "epoch": 0.1452921305547626, + "grad_norm": 1.8747011423110962, + "learning_rate": 4.7440791904944926e-05, + "loss": 5.2122, + "step": 24430 + }, + { + "epoch": 0.14529807783804358, + "grad_norm": 1.7300605773925781, + "learning_rate": 4.744058602903763e-05, + "loss": 5.1689, + "step": 24431 + }, + { + "epoch": 0.1453040251213246, + "grad_norm": 1.4435160160064697, + "learning_rate": 4.744038014529655e-05, + "loss": 5.2636, + "step": 24432 + }, + { + "epoch": 0.14530997240460558, + "grad_norm": 1.6441041231155396, + "learning_rate": 4.744017425372177e-05, + "loss": 5.2737, + "step": 24433 + }, + { + "epoch": 0.14531591968788657, + "grad_norm": 1.5537841320037842, + "learning_rate": 4.743996835431336e-05, + "loss": 5.1661, + "step": 24434 + }, + { + "epoch": 0.14532186697116756, + "grad_norm": 1.5431783199310303, + "learning_rate": 4.743976244707138e-05, + "loss": 5.0257, + "step": 24435 + }, + { + "epoch": 0.14532781425444857, + "grad_norm": 1.6137834787368774, + "learning_rate": 4.7439556531995914e-05, + "loss": 4.9459, + "step": 24436 + }, + { + "epoch": 0.14533376153772956, + "grad_norm": 1.6870076656341553, + "learning_rate": 4.743935060908703e-05, + "loss": 5.0615, + "step": 24437 + }, + { + "epoch": 0.14533970882101055, + "grad_norm": 1.7536146640777588, + "learning_rate": 4.74391446783448e-05, + "loss": 5.041, + "step": 24438 + }, + { + "epoch": 0.14534565610429157, + "grad_norm": 1.8259520530700684, + "learning_rate": 4.7438938739769304e-05, + "loss": 5.0222, + "step": 24439 + }, + { + "epoch": 0.14535160338757255, + "grad_norm": 1.9656455516815186, + "learning_rate": 4.74387327933606e-05, + "loss": 5.3352, + "step": 24440 + }, + { + "epoch": 0.14535755067085354, + "grad_norm": 2.096452236175537, + "learning_rate": 4.743852683911877e-05, + "loss": 5.4241, + "step": 24441 + }, + { + "epoch": 0.14536349795413456, + "grad_norm": 1.6562155485153198, + "learning_rate": 4.743832087704388e-05, + "loss": 6.0049, + "step": 24442 + }, + { + "epoch": 0.14536944523741555, + "grad_norm": 1.538763165473938, + "learning_rate": 4.7438114907136e-05, + "loss": 5.4588, + "step": 24443 + }, + { + "epoch": 0.14537539252069653, + "grad_norm": 1.835303783416748, + "learning_rate": 4.7437908929395216e-05, + "loss": 5.1866, + "step": 24444 + }, + { + "epoch": 0.14538133980397755, + "grad_norm": 1.6841330528259277, + "learning_rate": 4.743770294382158e-05, + "loss": 5.51, + "step": 24445 + }, + { + "epoch": 0.14538728708725854, + "grad_norm": 1.775283694267273, + "learning_rate": 4.743749695041517e-05, + "loss": 5.5482, + "step": 24446 + }, + { + "epoch": 0.14539323437053953, + "grad_norm": 1.5169485807418823, + "learning_rate": 4.7437290949176074e-05, + "loss": 5.5175, + "step": 24447 + }, + { + "epoch": 0.14539918165382054, + "grad_norm": 1.3337781429290771, + "learning_rate": 4.743708494010435e-05, + "loss": 5.8864, + "step": 24448 + }, + { + "epoch": 0.14540512893710153, + "grad_norm": 1.5488650798797607, + "learning_rate": 4.743687892320006e-05, + "loss": 5.9374, + "step": 24449 + }, + { + "epoch": 0.14541107622038252, + "grad_norm": 1.7683097124099731, + "learning_rate": 4.74366728984633e-05, + "loss": 5.2741, + "step": 24450 + }, + { + "epoch": 0.14541702350366353, + "grad_norm": 1.750689148902893, + "learning_rate": 4.743646686589413e-05, + "loss": 5.5179, + "step": 24451 + }, + { + "epoch": 0.14542297078694452, + "grad_norm": 1.8411931991577148, + "learning_rate": 4.7436260825492604e-05, + "loss": 5.2341, + "step": 24452 + }, + { + "epoch": 0.1454289180702255, + "grad_norm": 1.8112800121307373, + "learning_rate": 4.7436054777258824e-05, + "loss": 5.2025, + "step": 24453 + }, + { + "epoch": 0.14543486535350653, + "grad_norm": 1.5593929290771484, + "learning_rate": 4.743584872119285e-05, + "loss": 5.4906, + "step": 24454 + }, + { + "epoch": 0.14544081263678751, + "grad_norm": 1.683072805404663, + "learning_rate": 4.743564265729475e-05, + "loss": 5.279, + "step": 24455 + }, + { + "epoch": 0.1454467599200685, + "grad_norm": 1.6395639181137085, + "learning_rate": 4.74354365855646e-05, + "loss": 5.9672, + "step": 24456 + }, + { + "epoch": 0.14545270720334952, + "grad_norm": 1.5672929286956787, + "learning_rate": 4.743523050600247e-05, + "loss": 5.3588, + "step": 24457 + }, + { + "epoch": 0.1454586544866305, + "grad_norm": 1.7329927682876587, + "learning_rate": 4.7435024418608434e-05, + "loss": 5.1456, + "step": 24458 + }, + { + "epoch": 0.1454646017699115, + "grad_norm": 1.7443114519119263, + "learning_rate": 4.7434818323382554e-05, + "loss": 5.0256, + "step": 24459 + }, + { + "epoch": 0.1454705490531925, + "grad_norm": 1.6770588159561157, + "learning_rate": 4.7434612220324926e-05, + "loss": 5.0028, + "step": 24460 + }, + { + "epoch": 0.1454764963364735, + "grad_norm": 1.7134469747543335, + "learning_rate": 4.74344061094356e-05, + "loss": 5.0299, + "step": 24461 + }, + { + "epoch": 0.1454824436197545, + "grad_norm": 1.55935537815094, + "learning_rate": 4.743419999071465e-05, + "loss": 5.0422, + "step": 24462 + }, + { + "epoch": 0.1454883909030355, + "grad_norm": 1.722185730934143, + "learning_rate": 4.743399386416216e-05, + "loss": 4.9558, + "step": 24463 + }, + { + "epoch": 0.1454943381863165, + "grad_norm": 1.6128919124603271, + "learning_rate": 4.743378772977819e-05, + "loss": 4.903, + "step": 24464 + }, + { + "epoch": 0.14550028546959748, + "grad_norm": 1.6574269533157349, + "learning_rate": 4.7433581587562816e-05, + "loss": 4.9092, + "step": 24465 + }, + { + "epoch": 0.1455062327528785, + "grad_norm": 1.6132055521011353, + "learning_rate": 4.7433375437516116e-05, + "loss": 4.8561, + "step": 24466 + }, + { + "epoch": 0.14551218003615948, + "grad_norm": 1.7846872806549072, + "learning_rate": 4.743316927963814e-05, + "loss": 5.3115, + "step": 24467 + }, + { + "epoch": 0.14551812731944047, + "grad_norm": 1.787424087524414, + "learning_rate": 4.7432963113929e-05, + "loss": 5.2607, + "step": 24468 + }, + { + "epoch": 0.1455240746027215, + "grad_norm": 1.9011743068695068, + "learning_rate": 4.743275694038873e-05, + "loss": 4.989, + "step": 24469 + }, + { + "epoch": 0.14553002188600248, + "grad_norm": 1.7853960990905762, + "learning_rate": 4.7432550759017415e-05, + "loss": 5.066, + "step": 24470 + }, + { + "epoch": 0.14553596916928346, + "grad_norm": 2.131143569946289, + "learning_rate": 4.7432344569815134e-05, + "loss": 5.0322, + "step": 24471 + }, + { + "epoch": 0.14554191645256448, + "grad_norm": 1.7870924472808838, + "learning_rate": 4.743213837278195e-05, + "loss": 4.8767, + "step": 24472 + }, + { + "epoch": 0.14554786373584547, + "grad_norm": 1.8804802894592285, + "learning_rate": 4.743193216791795e-05, + "loss": 5.0155, + "step": 24473 + }, + { + "epoch": 0.14555381101912646, + "grad_norm": 2.4177560806274414, + "learning_rate": 4.7431725955223175e-05, + "loss": 4.6521, + "step": 24474 + }, + { + "epoch": 0.14555975830240747, + "grad_norm": 2.3657360076904297, + "learning_rate": 4.743151973469773e-05, + "loss": 4.5406, + "step": 24475 + }, + { + "epoch": 0.14556570558568846, + "grad_norm": 2.233304977416992, + "learning_rate": 4.743131350634167e-05, + "loss": 4.6725, + "step": 24476 + }, + { + "epoch": 0.14557165286896945, + "grad_norm": 2.314302921295166, + "learning_rate": 4.743110727015506e-05, + "loss": 4.2326, + "step": 24477 + }, + { + "epoch": 0.14557760015225046, + "grad_norm": 2.272599220275879, + "learning_rate": 4.7430901026137996e-05, + "loss": 4.2031, + "step": 24478 + }, + { + "epoch": 0.14558354743553145, + "grad_norm": 1.7667213678359985, + "learning_rate": 4.743069477429053e-05, + "loss": 5.0108, + "step": 24479 + }, + { + "epoch": 0.14558949471881244, + "grad_norm": 2.192775011062622, + "learning_rate": 4.7430488514612746e-05, + "loss": 4.0625, + "step": 24480 + }, + { + "epoch": 0.14559544200209346, + "grad_norm": 2.4205431938171387, + "learning_rate": 4.743028224710471e-05, + "loss": 4.1039, + "step": 24481 + }, + { + "epoch": 0.14560138928537444, + "grad_norm": 2.1844823360443115, + "learning_rate": 4.743007597176649e-05, + "loss": 3.9408, + "step": 24482 + }, + { + "epoch": 0.14560733656865543, + "grad_norm": 2.3235034942626953, + "learning_rate": 4.742986968859816e-05, + "loss": 4.0957, + "step": 24483 + }, + { + "epoch": 0.14561328385193645, + "grad_norm": 2.3802473545074463, + "learning_rate": 4.742966339759979e-05, + "loss": 4.2864, + "step": 24484 + }, + { + "epoch": 0.14561923113521744, + "grad_norm": 2.2253031730651855, + "learning_rate": 4.742945709877147e-05, + "loss": 4.1559, + "step": 24485 + }, + { + "epoch": 0.14562517841849842, + "grad_norm": 2.559008836746216, + "learning_rate": 4.742925079211324e-05, + "loss": 4.0356, + "step": 24486 + }, + { + "epoch": 0.14563112570177944, + "grad_norm": 2.222951889038086, + "learning_rate": 4.7429044477625206e-05, + "loss": 4.0193, + "step": 24487 + }, + { + "epoch": 0.14563707298506043, + "grad_norm": 1.9578197002410889, + "learning_rate": 4.742883815530742e-05, + "loss": 4.8917, + "step": 24488 + }, + { + "epoch": 0.14564302026834142, + "grad_norm": 1.8768174648284912, + "learning_rate": 4.742863182515996e-05, + "loss": 4.8987, + "step": 24489 + }, + { + "epoch": 0.14564896755162243, + "grad_norm": 2.0520718097686768, + "learning_rate": 4.7428425487182895e-05, + "loss": 5.2806, + "step": 24490 + }, + { + "epoch": 0.14565491483490342, + "grad_norm": 1.7171385288238525, + "learning_rate": 4.74282191413763e-05, + "loss": 4.801, + "step": 24491 + }, + { + "epoch": 0.1456608621181844, + "grad_norm": 1.5739022493362427, + "learning_rate": 4.742801278774024e-05, + "loss": 5.5888, + "step": 24492 + }, + { + "epoch": 0.1456668094014654, + "grad_norm": 1.6728390455245972, + "learning_rate": 4.742780642627479e-05, + "loss": 5.0339, + "step": 24493 + }, + { + "epoch": 0.1456727566847464, + "grad_norm": 1.5647993087768555, + "learning_rate": 4.7427600056980035e-05, + "loss": 4.859, + "step": 24494 + }, + { + "epoch": 0.1456787039680274, + "grad_norm": 1.8099721670150757, + "learning_rate": 4.7427393679856026e-05, + "loss": 5.4872, + "step": 24495 + }, + { + "epoch": 0.1456846512513084, + "grad_norm": 1.7053685188293457, + "learning_rate": 4.742718729490285e-05, + "loss": 5.0992, + "step": 24496 + }, + { + "epoch": 0.1456905985345894, + "grad_norm": 1.57960045337677, + "learning_rate": 4.742698090212058e-05, + "loss": 5.3847, + "step": 24497 + }, + { + "epoch": 0.1456965458178704, + "grad_norm": 1.6272963285446167, + "learning_rate": 4.7426774501509275e-05, + "loss": 5.2833, + "step": 24498 + }, + { + "epoch": 0.14570249310115138, + "grad_norm": 1.8782978057861328, + "learning_rate": 4.742656809306902e-05, + "loss": 5.2527, + "step": 24499 + }, + { + "epoch": 0.1457084403844324, + "grad_norm": 1.6581416130065918, + "learning_rate": 4.742636167679988e-05, + "loss": 5.4469, + "step": 24500 + }, + { + "epoch": 0.14571438766771339, + "grad_norm": 1.4809743165969849, + "learning_rate": 4.742615525270193e-05, + "loss": 5.5264, + "step": 24501 + }, + { + "epoch": 0.14572033495099437, + "grad_norm": 1.7145473957061768, + "learning_rate": 4.742594882077523e-05, + "loss": 5.3418, + "step": 24502 + }, + { + "epoch": 0.1457262822342754, + "grad_norm": 1.5335949659347534, + "learning_rate": 4.742574238101988e-05, + "loss": 5.3467, + "step": 24503 + }, + { + "epoch": 0.14573222951755638, + "grad_norm": 1.4682936668395996, + "learning_rate": 4.742553593343593e-05, + "loss": 5.3817, + "step": 24504 + }, + { + "epoch": 0.14573817680083737, + "grad_norm": 1.3231433629989624, + "learning_rate": 4.742532947802345e-05, + "loss": 5.4963, + "step": 24505 + }, + { + "epoch": 0.14574412408411838, + "grad_norm": 1.4141665697097778, + "learning_rate": 4.7425123014782525e-05, + "loss": 5.6261, + "step": 24506 + }, + { + "epoch": 0.14575007136739937, + "grad_norm": 1.5164703130722046, + "learning_rate": 4.742491654371322e-05, + "loss": 5.8411, + "step": 24507 + }, + { + "epoch": 0.14575601865068036, + "grad_norm": 1.309892177581787, + "learning_rate": 4.7424710064815606e-05, + "loss": 5.497, + "step": 24508 + }, + { + "epoch": 0.14576196593396137, + "grad_norm": 1.9315495491027832, + "learning_rate": 4.742450357808976e-05, + "loss": 5.5718, + "step": 24509 + }, + { + "epoch": 0.14576791321724236, + "grad_norm": 1.3881922960281372, + "learning_rate": 4.742429708353575e-05, + "loss": 5.6583, + "step": 24510 + }, + { + "epoch": 0.14577386050052335, + "grad_norm": 1.186221957206726, + "learning_rate": 4.7424090581153654e-05, + "loss": 5.5111, + "step": 24511 + }, + { + "epoch": 0.14577980778380437, + "grad_norm": 1.5839451551437378, + "learning_rate": 4.742388407094354e-05, + "loss": 5.285, + "step": 24512 + }, + { + "epoch": 0.14578575506708535, + "grad_norm": 1.659534215927124, + "learning_rate": 4.7423677552905474e-05, + "loss": 5.2722, + "step": 24513 + }, + { + "epoch": 0.14579170235036634, + "grad_norm": 1.530068278312683, + "learning_rate": 4.742347102703953e-05, + "loss": 5.6943, + "step": 24514 + }, + { + "epoch": 0.14579764963364736, + "grad_norm": 1.966497540473938, + "learning_rate": 4.7423264493345794e-05, + "loss": 5.3509, + "step": 24515 + }, + { + "epoch": 0.14580359691692835, + "grad_norm": 2.2554593086242676, + "learning_rate": 4.7423057951824325e-05, + "loss": 4.8778, + "step": 24516 + }, + { + "epoch": 0.14580954420020933, + "grad_norm": 1.746324062347412, + "learning_rate": 4.7422851402475195e-05, + "loss": 5.2867, + "step": 24517 + }, + { + "epoch": 0.14581549148349035, + "grad_norm": 1.5312012434005737, + "learning_rate": 4.7422644845298484e-05, + "loss": 5.3472, + "step": 24518 + }, + { + "epoch": 0.14582143876677134, + "grad_norm": 1.8742462396621704, + "learning_rate": 4.742243828029426e-05, + "loss": 5.2399, + "step": 24519 + }, + { + "epoch": 0.14582738605005233, + "grad_norm": 1.563302993774414, + "learning_rate": 4.7422231707462585e-05, + "loss": 5.3742, + "step": 24520 + }, + { + "epoch": 0.14583333333333334, + "grad_norm": 1.7737884521484375, + "learning_rate": 4.7422025126803545e-05, + "loss": 5.6674, + "step": 24521 + }, + { + "epoch": 0.14583928061661433, + "grad_norm": 1.9887245893478394, + "learning_rate": 4.742181853831721e-05, + "loss": 5.3851, + "step": 24522 + }, + { + "epoch": 0.14584522789989532, + "grad_norm": 1.773938775062561, + "learning_rate": 4.7421611942003654e-05, + "loss": 5.22, + "step": 24523 + }, + { + "epoch": 0.14585117518317633, + "grad_norm": 1.733723521232605, + "learning_rate": 4.742140533786294e-05, + "loss": 5.0786, + "step": 24524 + }, + { + "epoch": 0.14585712246645732, + "grad_norm": 1.7058782577514648, + "learning_rate": 4.742119872589514e-05, + "loss": 5.214, + "step": 24525 + }, + { + "epoch": 0.1458630697497383, + "grad_norm": 1.7503206729888916, + "learning_rate": 4.742099210610034e-05, + "loss": 5.3132, + "step": 24526 + }, + { + "epoch": 0.14586901703301933, + "grad_norm": 1.9028650522232056, + "learning_rate": 4.7420785478478596e-05, + "loss": 5.3016, + "step": 24527 + }, + { + "epoch": 0.14587496431630032, + "grad_norm": 1.7530872821807861, + "learning_rate": 4.742057884302999e-05, + "loss": 5.199, + "step": 24528 + }, + { + "epoch": 0.1458809115995813, + "grad_norm": 1.8776800632476807, + "learning_rate": 4.7420372199754595e-05, + "loss": 5.0358, + "step": 24529 + }, + { + "epoch": 0.14588685888286232, + "grad_norm": 1.6402316093444824, + "learning_rate": 4.7420165548652474e-05, + "loss": 5.0548, + "step": 24530 + }, + { + "epoch": 0.1458928061661433, + "grad_norm": 1.9277185201644897, + "learning_rate": 4.741995888972371e-05, + "loss": 5.0196, + "step": 24531 + }, + { + "epoch": 0.1458987534494243, + "grad_norm": 1.7798771858215332, + "learning_rate": 4.7419752222968364e-05, + "loss": 5.0015, + "step": 24532 + }, + { + "epoch": 0.1459047007327053, + "grad_norm": 1.6921379566192627, + "learning_rate": 4.741954554838652e-05, + "loss": 5.0044, + "step": 24533 + }, + { + "epoch": 0.1459106480159863, + "grad_norm": 1.5286321640014648, + "learning_rate": 4.741933886597825e-05, + "loss": 5.2836, + "step": 24534 + }, + { + "epoch": 0.1459165952992673, + "grad_norm": 1.5439866781234741, + "learning_rate": 4.741913217574361e-05, + "loss": 5.645, + "step": 24535 + }, + { + "epoch": 0.1459225425825483, + "grad_norm": 1.8537307977676392, + "learning_rate": 4.741892547768269e-05, + "loss": 5.7112, + "step": 24536 + }, + { + "epoch": 0.1459284898658293, + "grad_norm": 1.458747386932373, + "learning_rate": 4.741871877179554e-05, + "loss": 5.3639, + "step": 24537 + }, + { + "epoch": 0.14593443714911028, + "grad_norm": 1.8507471084594727, + "learning_rate": 4.7418512058082255e-05, + "loss": 4.7947, + "step": 24538 + }, + { + "epoch": 0.1459403844323913, + "grad_norm": 1.8104653358459473, + "learning_rate": 4.74183053365429e-05, + "loss": 4.9444, + "step": 24539 + }, + { + "epoch": 0.14594633171567228, + "grad_norm": 1.8392473459243774, + "learning_rate": 4.741809860717755e-05, + "loss": 4.6432, + "step": 24540 + }, + { + "epoch": 0.14595227899895327, + "grad_norm": 1.8322739601135254, + "learning_rate": 4.7417891869986274e-05, + "loss": 4.8165, + "step": 24541 + }, + { + "epoch": 0.1459582262822343, + "grad_norm": 1.7574645280838013, + "learning_rate": 4.741768512496914e-05, + "loss": 4.5592, + "step": 24542 + }, + { + "epoch": 0.14596417356551528, + "grad_norm": 1.6960285902023315, + "learning_rate": 4.7417478372126223e-05, + "loss": 4.8203, + "step": 24543 + }, + { + "epoch": 0.14597012084879626, + "grad_norm": 1.624930739402771, + "learning_rate": 4.741727161145759e-05, + "loss": 4.7056, + "step": 24544 + }, + { + "epoch": 0.14597606813207728, + "grad_norm": 1.6901119947433472, + "learning_rate": 4.741706484296333e-05, + "loss": 4.8837, + "step": 24545 + }, + { + "epoch": 0.14598201541535827, + "grad_norm": 1.6677742004394531, + "learning_rate": 4.74168580666435e-05, + "loss": 5.777, + "step": 24546 + }, + { + "epoch": 0.14598796269863926, + "grad_norm": 1.9622048139572144, + "learning_rate": 4.741665128249818e-05, + "loss": 5.1728, + "step": 24547 + }, + { + "epoch": 0.14599390998192027, + "grad_norm": 2.1024181842803955, + "learning_rate": 4.7416444490527435e-05, + "loss": 5.1417, + "step": 24548 + }, + { + "epoch": 0.14599985726520126, + "grad_norm": 1.9071123600006104, + "learning_rate": 4.7416237690731336e-05, + "loss": 5.1996, + "step": 24549 + }, + { + "epoch": 0.14600580454848225, + "grad_norm": 2.404794931411743, + "learning_rate": 4.741603088310997e-05, + "loss": 5.2283, + "step": 24550 + }, + { + "epoch": 0.14601175183176324, + "grad_norm": 1.6359655857086182, + "learning_rate": 4.74158240676634e-05, + "loss": 5.3233, + "step": 24551 + }, + { + "epoch": 0.14601769911504425, + "grad_norm": 2.5952274799346924, + "learning_rate": 4.7415617244391686e-05, + "loss": 4.9227, + "step": 24552 + }, + { + "epoch": 0.14602364639832524, + "grad_norm": 1.709825038909912, + "learning_rate": 4.7415410413294914e-05, + "loss": 5.2745, + "step": 24553 + }, + { + "epoch": 0.14602959368160623, + "grad_norm": 1.709489345550537, + "learning_rate": 4.741520357437316e-05, + "loss": 5.0694, + "step": 24554 + }, + { + "epoch": 0.14603554096488724, + "grad_norm": 1.6386815309524536, + "learning_rate": 4.7414996727626484e-05, + "loss": 5.1265, + "step": 24555 + }, + { + "epoch": 0.14604148824816823, + "grad_norm": 1.4357349872589111, + "learning_rate": 4.741478987305497e-05, + "loss": 5.149, + "step": 24556 + }, + { + "epoch": 0.14604743553144922, + "grad_norm": 1.951442003250122, + "learning_rate": 4.741458301065868e-05, + "loss": 5.0956, + "step": 24557 + }, + { + "epoch": 0.14605338281473024, + "grad_norm": 2.0688650608062744, + "learning_rate": 4.7414376140437696e-05, + "loss": 4.8894, + "step": 24558 + }, + { + "epoch": 0.14605933009801123, + "grad_norm": 1.6985790729522705, + "learning_rate": 4.741416926239208e-05, + "loss": 4.9548, + "step": 24559 + }, + { + "epoch": 0.1460652773812922, + "grad_norm": 1.5429292917251587, + "learning_rate": 4.7413962376521906e-05, + "loss": 4.9634, + "step": 24560 + }, + { + "epoch": 0.14607122466457323, + "grad_norm": 1.5821011066436768, + "learning_rate": 4.741375548282726e-05, + "loss": 5.3701, + "step": 24561 + }, + { + "epoch": 0.14607717194785422, + "grad_norm": 1.5868496894836426, + "learning_rate": 4.7413548581308196e-05, + "loss": 5.0315, + "step": 24562 + }, + { + "epoch": 0.1460831192311352, + "grad_norm": 1.471294641494751, + "learning_rate": 4.74133416719648e-05, + "loss": 4.9128, + "step": 24563 + }, + { + "epoch": 0.14608906651441622, + "grad_norm": 1.4862011671066284, + "learning_rate": 4.7413134754797126e-05, + "loss": 4.8533, + "step": 24564 + }, + { + "epoch": 0.1460950137976972, + "grad_norm": 1.47359037399292, + "learning_rate": 4.741292782980527e-05, + "loss": 4.8428, + "step": 24565 + }, + { + "epoch": 0.1461009610809782, + "grad_norm": 1.4886908531188965, + "learning_rate": 4.741272089698928e-05, + "loss": 4.8365, + "step": 24566 + }, + { + "epoch": 0.1461069083642592, + "grad_norm": 1.561625599861145, + "learning_rate": 4.741251395634925e-05, + "loss": 4.9553, + "step": 24567 + }, + { + "epoch": 0.1461128556475402, + "grad_norm": 1.5089234113693237, + "learning_rate": 4.741230700788524e-05, + "loss": 4.7997, + "step": 24568 + }, + { + "epoch": 0.1461188029308212, + "grad_norm": 1.5985972881317139, + "learning_rate": 4.741210005159733e-05, + "loss": 4.8006, + "step": 24569 + }, + { + "epoch": 0.1461247502141022, + "grad_norm": 1.5302664041519165, + "learning_rate": 4.741189308748558e-05, + "loss": 4.7809, + "step": 24570 + }, + { + "epoch": 0.1461306974973832, + "grad_norm": 1.5156875848770142, + "learning_rate": 4.7411686115550074e-05, + "loss": 4.6965, + "step": 24571 + }, + { + "epoch": 0.14613664478066418, + "grad_norm": 1.6026439666748047, + "learning_rate": 4.741147913579088e-05, + "loss": 4.9386, + "step": 24572 + }, + { + "epoch": 0.1461425920639452, + "grad_norm": 1.849469542503357, + "learning_rate": 4.7411272148208067e-05, + "loss": 5.7675, + "step": 24573 + }, + { + "epoch": 0.1461485393472262, + "grad_norm": 1.9813694953918457, + "learning_rate": 4.7411065152801716e-05, + "loss": 5.3741, + "step": 24574 + }, + { + "epoch": 0.14615448663050717, + "grad_norm": 2.459035634994507, + "learning_rate": 4.741085814957189e-05, + "loss": 4.6126, + "step": 24575 + }, + { + "epoch": 0.1461604339137882, + "grad_norm": 2.858220100402832, + "learning_rate": 4.741065113851867e-05, + "loss": 4.1891, + "step": 24576 + }, + { + "epoch": 0.14616638119706918, + "grad_norm": 2.2826805114746094, + "learning_rate": 4.741044411964212e-05, + "loss": 4.4009, + "step": 24577 + }, + { + "epoch": 0.14617232848035017, + "grad_norm": 2.0174343585968018, + "learning_rate": 4.741023709294231e-05, + "loss": 4.946, + "step": 24578 + }, + { + "epoch": 0.14617827576363118, + "grad_norm": 2.0307867527008057, + "learning_rate": 4.741003005841932e-05, + "loss": 5.0872, + "step": 24579 + }, + { + "epoch": 0.14618422304691217, + "grad_norm": 2.147662878036499, + "learning_rate": 4.740982301607323e-05, + "loss": 4.648, + "step": 24580 + }, + { + "epoch": 0.14619017033019316, + "grad_norm": 2.7005789279937744, + "learning_rate": 4.740961596590409e-05, + "loss": 5.0555, + "step": 24581 + }, + { + "epoch": 0.14619611761347417, + "grad_norm": 2.3652596473693848, + "learning_rate": 4.740940890791199e-05, + "loss": 4.7969, + "step": 24582 + }, + { + "epoch": 0.14620206489675516, + "grad_norm": 2.5925567150115967, + "learning_rate": 4.7409201842097e-05, + "loss": 4.7544, + "step": 24583 + }, + { + "epoch": 0.14620801218003615, + "grad_norm": 1.9309169054031372, + "learning_rate": 4.740899476845918e-05, + "loss": 5.0901, + "step": 24584 + }, + { + "epoch": 0.14621395946331717, + "grad_norm": 2.6501107215881348, + "learning_rate": 4.740878768699861e-05, + "loss": 5.1449, + "step": 24585 + }, + { + "epoch": 0.14621990674659816, + "grad_norm": 2.3010451793670654, + "learning_rate": 4.7408580597715376e-05, + "loss": 5.276, + "step": 24586 + }, + { + "epoch": 0.14622585402987914, + "grad_norm": 1.8606983423233032, + "learning_rate": 4.740837350060953e-05, + "loss": 5.1453, + "step": 24587 + }, + { + "epoch": 0.14623180131316016, + "grad_norm": 2.0047266483306885, + "learning_rate": 4.740816639568115e-05, + "loss": 4.8976, + "step": 24588 + }, + { + "epoch": 0.14623774859644115, + "grad_norm": 2.4806363582611084, + "learning_rate": 4.740795928293032e-05, + "loss": 4.1182, + "step": 24589 + }, + { + "epoch": 0.14624369587972214, + "grad_norm": 2.560715675354004, + "learning_rate": 4.74077521623571e-05, + "loss": 4.4461, + "step": 24590 + }, + { + "epoch": 0.14624964316300315, + "grad_norm": 2.3709921836853027, + "learning_rate": 4.740754503396156e-05, + "loss": 4.5193, + "step": 24591 + }, + { + "epoch": 0.14625559044628414, + "grad_norm": 2.1095876693725586, + "learning_rate": 4.7407337897743784e-05, + "loss": 4.881, + "step": 24592 + }, + { + "epoch": 0.14626153772956513, + "grad_norm": 1.6448874473571777, + "learning_rate": 4.740713075370383e-05, + "loss": 5.0707, + "step": 24593 + }, + { + "epoch": 0.14626748501284614, + "grad_norm": 1.9237885475158691, + "learning_rate": 4.740692360184178e-05, + "loss": 5.0708, + "step": 24594 + }, + { + "epoch": 0.14627343229612713, + "grad_norm": 1.7685006856918335, + "learning_rate": 4.740671644215771e-05, + "loss": 5.0034, + "step": 24595 + }, + { + "epoch": 0.14627937957940812, + "grad_norm": 1.999850869178772, + "learning_rate": 4.740650927465169e-05, + "loss": 5.1153, + "step": 24596 + }, + { + "epoch": 0.14628532686268914, + "grad_norm": 2.0358314514160156, + "learning_rate": 4.740630209932378e-05, + "loss": 5.0567, + "step": 24597 + }, + { + "epoch": 0.14629127414597012, + "grad_norm": 1.883933424949646, + "learning_rate": 4.740609491617407e-05, + "loss": 5.0562, + "step": 24598 + }, + { + "epoch": 0.1462972214292511, + "grad_norm": 2.0172266960144043, + "learning_rate": 4.740588772520261e-05, + "loss": 5.0597, + "step": 24599 + }, + { + "epoch": 0.14630316871253213, + "grad_norm": 1.798579216003418, + "learning_rate": 4.74056805264095e-05, + "loss": 4.9391, + "step": 24600 + }, + { + "epoch": 0.14630911599581312, + "grad_norm": 1.8433833122253418, + "learning_rate": 4.7405473319794794e-05, + "loss": 5.0088, + "step": 24601 + }, + { + "epoch": 0.1463150632790941, + "grad_norm": 1.7729485034942627, + "learning_rate": 4.7405266105358564e-05, + "loss": 4.8909, + "step": 24602 + }, + { + "epoch": 0.14632101056237512, + "grad_norm": 1.9823477268218994, + "learning_rate": 4.740505888310089e-05, + "loss": 5.0547, + "step": 24603 + }, + { + "epoch": 0.1463269578456561, + "grad_norm": 2.0508856773376465, + "learning_rate": 4.740485165302184e-05, + "loss": 5.0857, + "step": 24604 + }, + { + "epoch": 0.1463329051289371, + "grad_norm": 2.0253899097442627, + "learning_rate": 4.740464441512149e-05, + "loss": 4.9882, + "step": 24605 + }, + { + "epoch": 0.1463388524122181, + "grad_norm": 1.977512001991272, + "learning_rate": 4.740443716939991e-05, + "loss": 4.8881, + "step": 24606 + }, + { + "epoch": 0.1463447996954991, + "grad_norm": 1.8985627889633179, + "learning_rate": 4.7404229915857175e-05, + "loss": 5.0182, + "step": 24607 + }, + { + "epoch": 0.1463507469787801, + "grad_norm": 2.009416103363037, + "learning_rate": 4.7404022654493355e-05, + "loss": 4.7361, + "step": 24608 + }, + { + "epoch": 0.14635669426206108, + "grad_norm": 2.3150322437286377, + "learning_rate": 4.7403815385308514e-05, + "loss": 4.2706, + "step": 24609 + }, + { + "epoch": 0.1463626415453421, + "grad_norm": 2.10493540763855, + "learning_rate": 4.740360810830275e-05, + "loss": 4.2009, + "step": 24610 + }, + { + "epoch": 0.14636858882862308, + "grad_norm": 2.019585132598877, + "learning_rate": 4.7403400823476094e-05, + "loss": 4.2991, + "step": 24611 + }, + { + "epoch": 0.14637453611190407, + "grad_norm": 1.966424584388733, + "learning_rate": 4.740319353082866e-05, + "loss": 5.0383, + "step": 24612 + }, + { + "epoch": 0.14638048339518508, + "grad_norm": 2.048212766647339, + "learning_rate": 4.740298623036049e-05, + "loss": 5.0623, + "step": 24613 + }, + { + "epoch": 0.14638643067846607, + "grad_norm": 2.318051338195801, + "learning_rate": 4.740277892207168e-05, + "loss": 5.7096, + "step": 24614 + }, + { + "epoch": 0.14639237796174706, + "grad_norm": 1.6807061433792114, + "learning_rate": 4.740257160596229e-05, + "loss": 4.9725, + "step": 24615 + }, + { + "epoch": 0.14639832524502808, + "grad_norm": 1.968828558921814, + "learning_rate": 4.7402364282032386e-05, + "loss": 4.9904, + "step": 24616 + }, + { + "epoch": 0.14640427252830907, + "grad_norm": 1.8591229915618896, + "learning_rate": 4.740215695028205e-05, + "loss": 4.9013, + "step": 24617 + }, + { + "epoch": 0.14641021981159005, + "grad_norm": 1.8735779523849487, + "learning_rate": 4.740194961071136e-05, + "loss": 5.0174, + "step": 24618 + }, + { + "epoch": 0.14641616709487107, + "grad_norm": 1.9068244695663452, + "learning_rate": 4.740174226332037e-05, + "loss": 4.9578, + "step": 24619 + }, + { + "epoch": 0.14642211437815206, + "grad_norm": 2.136747360229492, + "learning_rate": 4.740153490810917e-05, + "loss": 4.953, + "step": 24620 + }, + { + "epoch": 0.14642806166143305, + "grad_norm": 2.1197381019592285, + "learning_rate": 4.740132754507782e-05, + "loss": 5.1238, + "step": 24621 + }, + { + "epoch": 0.14643400894471406, + "grad_norm": 1.8754642009735107, + "learning_rate": 4.740112017422641e-05, + "loss": 4.9628, + "step": 24622 + }, + { + "epoch": 0.14643995622799505, + "grad_norm": 1.8816076517105103, + "learning_rate": 4.740091279555499e-05, + "loss": 4.8295, + "step": 24623 + }, + { + "epoch": 0.14644590351127604, + "grad_norm": 1.7956056594848633, + "learning_rate": 4.740070540906365e-05, + "loss": 4.7985, + "step": 24624 + }, + { + "epoch": 0.14645185079455705, + "grad_norm": 2.021692991256714, + "learning_rate": 4.740049801475245e-05, + "loss": 4.9583, + "step": 24625 + }, + { + "epoch": 0.14645779807783804, + "grad_norm": 1.69369637966156, + "learning_rate": 4.7400290612621465e-05, + "loss": 4.9205, + "step": 24626 + }, + { + "epoch": 0.14646374536111903, + "grad_norm": 1.7640669345855713, + "learning_rate": 4.740008320267077e-05, + "loss": 5.0191, + "step": 24627 + }, + { + "epoch": 0.14646969264440005, + "grad_norm": 2.0161068439483643, + "learning_rate": 4.739987578490045e-05, + "loss": 5.1847, + "step": 24628 + }, + { + "epoch": 0.14647563992768103, + "grad_norm": 1.8745818138122559, + "learning_rate": 4.7399668359310555e-05, + "loss": 5.0221, + "step": 24629 + }, + { + "epoch": 0.14648158721096202, + "grad_norm": 1.8857629299163818, + "learning_rate": 4.7399460925901164e-05, + "loss": 5.0957, + "step": 24630 + }, + { + "epoch": 0.14648753449424304, + "grad_norm": 1.7315385341644287, + "learning_rate": 4.739925348467236e-05, + "loss": 5.1935, + "step": 24631 + }, + { + "epoch": 0.14649348177752403, + "grad_norm": 1.968795657157898, + "learning_rate": 4.7399046035624204e-05, + "loss": 5.2074, + "step": 24632 + }, + { + "epoch": 0.14649942906080501, + "grad_norm": 1.889760971069336, + "learning_rate": 4.739883857875677e-05, + "loss": 4.7733, + "step": 24633 + }, + { + "epoch": 0.14650537634408603, + "grad_norm": 1.9310023784637451, + "learning_rate": 4.739863111407013e-05, + "loss": 5.0259, + "step": 24634 + }, + { + "epoch": 0.14651132362736702, + "grad_norm": 1.807829737663269, + "learning_rate": 4.739842364156437e-05, + "loss": 4.8263, + "step": 24635 + }, + { + "epoch": 0.146517270910648, + "grad_norm": 1.8053529262542725, + "learning_rate": 4.739821616123955e-05, + "loss": 4.8213, + "step": 24636 + }, + { + "epoch": 0.14652321819392902, + "grad_norm": 1.9432908296585083, + "learning_rate": 4.739800867309574e-05, + "loss": 4.8625, + "step": 24637 + }, + { + "epoch": 0.14652916547721, + "grad_norm": 1.5960321426391602, + "learning_rate": 4.739780117713302e-05, + "loss": 4.6592, + "step": 24638 + }, + { + "epoch": 0.146535112760491, + "grad_norm": 1.9232900142669678, + "learning_rate": 4.739759367335145e-05, + "loss": 4.8859, + "step": 24639 + }, + { + "epoch": 0.14654106004377201, + "grad_norm": 1.8403369188308716, + "learning_rate": 4.739738616175112e-05, + "loss": 4.7934, + "step": 24640 + }, + { + "epoch": 0.146547007327053, + "grad_norm": 1.6142429113388062, + "learning_rate": 4.7397178642332095e-05, + "loss": 4.7553, + "step": 24641 + }, + { + "epoch": 0.146552954610334, + "grad_norm": 1.7207775115966797, + "learning_rate": 4.7396971115094445e-05, + "loss": 4.5229, + "step": 24642 + }, + { + "epoch": 0.146558901893615, + "grad_norm": 1.651342511177063, + "learning_rate": 4.739676358003824e-05, + "loss": 4.7882, + "step": 24643 + }, + { + "epoch": 0.146564849176896, + "grad_norm": 1.5380842685699463, + "learning_rate": 4.7396556037163556e-05, + "loss": 5.1114, + "step": 24644 + }, + { + "epoch": 0.14657079646017698, + "grad_norm": 1.7868518829345703, + "learning_rate": 4.739634848647047e-05, + "loss": 6.0014, + "step": 24645 + }, + { + "epoch": 0.146576743743458, + "grad_norm": 1.7771759033203125, + "learning_rate": 4.7396140927959045e-05, + "loss": 6.0391, + "step": 24646 + }, + { + "epoch": 0.146582691026739, + "grad_norm": 1.7818456888198853, + "learning_rate": 4.739593336162936e-05, + "loss": 5.431, + "step": 24647 + }, + { + "epoch": 0.14658863831001998, + "grad_norm": 1.6585869789123535, + "learning_rate": 4.7395725787481496e-05, + "loss": 5.4888, + "step": 24648 + }, + { + "epoch": 0.146594585593301, + "grad_norm": 1.448287010192871, + "learning_rate": 4.73955182055155e-05, + "loss": 5.5616, + "step": 24649 + }, + { + "epoch": 0.14660053287658198, + "grad_norm": 1.600519061088562, + "learning_rate": 4.739531061573147e-05, + "loss": 5.4446, + "step": 24650 + }, + { + "epoch": 0.14660648015986297, + "grad_norm": 1.5828067064285278, + "learning_rate": 4.7395103018129464e-05, + "loss": 5.7003, + "step": 24651 + }, + { + "epoch": 0.14661242744314398, + "grad_norm": 2.0968759059906006, + "learning_rate": 4.739489541270956e-05, + "loss": 5.4655, + "step": 24652 + }, + { + "epoch": 0.14661837472642497, + "grad_norm": 2.287879467010498, + "learning_rate": 4.739468779947183e-05, + "loss": 5.182, + "step": 24653 + }, + { + "epoch": 0.14662432200970596, + "grad_norm": 1.9258517026901245, + "learning_rate": 4.7394480178416344e-05, + "loss": 5.6223, + "step": 24654 + }, + { + "epoch": 0.14663026929298698, + "grad_norm": 1.9016472101211548, + "learning_rate": 4.7394272549543183e-05, + "loss": 5.304, + "step": 24655 + }, + { + "epoch": 0.14663621657626796, + "grad_norm": 1.4872523546218872, + "learning_rate": 4.739406491285241e-05, + "loss": 5.4679, + "step": 24656 + }, + { + "epoch": 0.14664216385954895, + "grad_norm": 1.6542940139770508, + "learning_rate": 4.73938572683441e-05, + "loss": 5.4644, + "step": 24657 + }, + { + "epoch": 0.14664811114282997, + "grad_norm": 2.210514545440674, + "learning_rate": 4.739364961601832e-05, + "loss": 4.6455, + "step": 24658 + }, + { + "epoch": 0.14665405842611096, + "grad_norm": 2.3305461406707764, + "learning_rate": 4.739344195587515e-05, + "loss": 4.571, + "step": 24659 + }, + { + "epoch": 0.14666000570939194, + "grad_norm": 2.243680238723755, + "learning_rate": 4.739323428791467e-05, + "loss": 4.5274, + "step": 24660 + }, + { + "epoch": 0.14666595299267296, + "grad_norm": 2.1816461086273193, + "learning_rate": 4.739302661213693e-05, + "loss": 4.4871, + "step": 24661 + }, + { + "epoch": 0.14667190027595395, + "grad_norm": 2.0428659915924072, + "learning_rate": 4.739281892854203e-05, + "loss": 4.3641, + "step": 24662 + }, + { + "epoch": 0.14667784755923494, + "grad_norm": 1.902016043663025, + "learning_rate": 4.739261123713001e-05, + "loss": 4.42, + "step": 24663 + }, + { + "epoch": 0.14668379484251595, + "grad_norm": 2.382110118865967, + "learning_rate": 4.7392403537900974e-05, + "loss": 4.3784, + "step": 24664 + }, + { + "epoch": 0.14668974212579694, + "grad_norm": 2.014251470565796, + "learning_rate": 4.739219583085498e-05, + "loss": 4.583, + "step": 24665 + }, + { + "epoch": 0.14669568940907793, + "grad_norm": 2.268214464187622, + "learning_rate": 4.7391988115992106e-05, + "loss": 4.4803, + "step": 24666 + }, + { + "epoch": 0.14670163669235892, + "grad_norm": 2.19326114654541, + "learning_rate": 4.7391780393312405e-05, + "loss": 4.5751, + "step": 24667 + }, + { + "epoch": 0.14670758397563993, + "grad_norm": 2.1453635692596436, + "learning_rate": 4.739157266281597e-05, + "loss": 4.8723, + "step": 24668 + }, + { + "epoch": 0.14671353125892092, + "grad_norm": 1.788976788520813, + "learning_rate": 4.739136492450288e-05, + "loss": 5.3339, + "step": 24669 + }, + { + "epoch": 0.1467194785422019, + "grad_norm": 2.523129940032959, + "learning_rate": 4.739115717837319e-05, + "loss": 4.314, + "step": 24670 + }, + { + "epoch": 0.14672542582548292, + "grad_norm": 2.2541866302490234, + "learning_rate": 4.739094942442698e-05, + "loss": 4.5228, + "step": 24671 + }, + { + "epoch": 0.1467313731087639, + "grad_norm": 2.5569868087768555, + "learning_rate": 4.739074166266431e-05, + "loss": 4.6268, + "step": 24672 + }, + { + "epoch": 0.1467373203920449, + "grad_norm": 1.9912770986557007, + "learning_rate": 4.739053389308528e-05, + "loss": 4.642, + "step": 24673 + }, + { + "epoch": 0.14674326767532592, + "grad_norm": 1.8588427305221558, + "learning_rate": 4.739032611568993e-05, + "loss": 5.2527, + "step": 24674 + }, + { + "epoch": 0.1467492149586069, + "grad_norm": 1.9020613431930542, + "learning_rate": 4.7390118330478356e-05, + "loss": 5.4926, + "step": 24675 + }, + { + "epoch": 0.1467551622418879, + "grad_norm": 2.319058895111084, + "learning_rate": 4.7389910537450624e-05, + "loss": 5.1275, + "step": 24676 + }, + { + "epoch": 0.1467611095251689, + "grad_norm": 1.7051849365234375, + "learning_rate": 4.7389702736606804e-05, + "loss": 5.599, + "step": 24677 + }, + { + "epoch": 0.1467670568084499, + "grad_norm": 1.7340635061264038, + "learning_rate": 4.738949492794696e-05, + "loss": 5.3359, + "step": 24678 + }, + { + "epoch": 0.14677300409173089, + "grad_norm": 1.5634024143218994, + "learning_rate": 4.738928711147119e-05, + "loss": 5.2585, + "step": 24679 + }, + { + "epoch": 0.1467789513750119, + "grad_norm": 1.559401035308838, + "learning_rate": 4.738907928717955e-05, + "loss": 5.297, + "step": 24680 + }, + { + "epoch": 0.1467848986582929, + "grad_norm": 1.5967936515808105, + "learning_rate": 4.738887145507211e-05, + "loss": 5.2068, + "step": 24681 + }, + { + "epoch": 0.14679084594157388, + "grad_norm": 1.6294320821762085, + "learning_rate": 4.7388663615148945e-05, + "loss": 5.1878, + "step": 24682 + }, + { + "epoch": 0.1467967932248549, + "grad_norm": 1.4520001411437988, + "learning_rate": 4.7388455767410135e-05, + "loss": 5.0777, + "step": 24683 + }, + { + "epoch": 0.14680274050813588, + "grad_norm": 1.3392236232757568, + "learning_rate": 4.738824791185573e-05, + "loss": 5.2396, + "step": 24684 + }, + { + "epoch": 0.14680868779141687, + "grad_norm": 1.467822551727295, + "learning_rate": 4.738804004848584e-05, + "loss": 5.253, + "step": 24685 + }, + { + "epoch": 0.14681463507469789, + "grad_norm": 1.5025224685668945, + "learning_rate": 4.7387832177300504e-05, + "loss": 5.386, + "step": 24686 + }, + { + "epoch": 0.14682058235797887, + "grad_norm": 1.6178737878799438, + "learning_rate": 4.73876242982998e-05, + "loss": 5.2601, + "step": 24687 + }, + { + "epoch": 0.14682652964125986, + "grad_norm": 1.4832427501678467, + "learning_rate": 4.7387416411483825e-05, + "loss": 5.0987, + "step": 24688 + }, + { + "epoch": 0.14683247692454088, + "grad_norm": 1.4726454019546509, + "learning_rate": 4.738720851685263e-05, + "loss": 5.3468, + "step": 24689 + }, + { + "epoch": 0.14683842420782187, + "grad_norm": 1.5659757852554321, + "learning_rate": 4.7387000614406284e-05, + "loss": 5.1591, + "step": 24690 + }, + { + "epoch": 0.14684437149110285, + "grad_norm": 1.7832130193710327, + "learning_rate": 4.7386792704144875e-05, + "loss": 5.126, + "step": 24691 + }, + { + "epoch": 0.14685031877438387, + "grad_norm": 1.6943825483322144, + "learning_rate": 4.738658478606846e-05, + "loss": 5.4705, + "step": 24692 + }, + { + "epoch": 0.14685626605766486, + "grad_norm": 1.4877350330352783, + "learning_rate": 4.738637686017713e-05, + "loss": 5.3479, + "step": 24693 + }, + { + "epoch": 0.14686221334094585, + "grad_norm": 2.306101083755493, + "learning_rate": 4.738616892647094e-05, + "loss": 4.4746, + "step": 24694 + }, + { + "epoch": 0.14686816062422686, + "grad_norm": 2.2277164459228516, + "learning_rate": 4.7385960984949976e-05, + "loss": 4.4995, + "step": 24695 + }, + { + "epoch": 0.14687410790750785, + "grad_norm": 1.535406231880188, + "learning_rate": 4.738575303561429e-05, + "loss": 5.3042, + "step": 24696 + }, + { + "epoch": 0.14688005519078884, + "grad_norm": 1.7974361181259155, + "learning_rate": 4.738554507846398e-05, + "loss": 5.3804, + "step": 24697 + }, + { + "epoch": 0.14688600247406985, + "grad_norm": 1.9455167055130005, + "learning_rate": 4.7385337113499104e-05, + "loss": 4.9782, + "step": 24698 + }, + { + "epoch": 0.14689194975735084, + "grad_norm": 2.486859083175659, + "learning_rate": 4.738512914071974e-05, + "loss": 4.5543, + "step": 24699 + }, + { + "epoch": 0.14689789704063183, + "grad_norm": 2.1134984493255615, + "learning_rate": 4.738492116012596e-05, + "loss": 4.3281, + "step": 24700 + }, + { + "epoch": 0.14690384432391285, + "grad_norm": 2.081852674484253, + "learning_rate": 4.7384713171717833e-05, + "loss": 4.3307, + "step": 24701 + }, + { + "epoch": 0.14690979160719383, + "grad_norm": 2.3121731281280518, + "learning_rate": 4.7384505175495435e-05, + "loss": 4.4791, + "step": 24702 + }, + { + "epoch": 0.14691573889047482, + "grad_norm": 2.0572164058685303, + "learning_rate": 4.738429717145883e-05, + "loss": 4.5165, + "step": 24703 + }, + { + "epoch": 0.14692168617375584, + "grad_norm": 1.7863034009933472, + "learning_rate": 4.7384089159608115e-05, + "loss": 4.8086, + "step": 24704 + }, + { + "epoch": 0.14692763345703683, + "grad_norm": 2.0969200134277344, + "learning_rate": 4.7383881139943335e-05, + "loss": 4.7512, + "step": 24705 + }, + { + "epoch": 0.14693358074031782, + "grad_norm": 1.9164679050445557, + "learning_rate": 4.738367311246458e-05, + "loss": 4.5249, + "step": 24706 + }, + { + "epoch": 0.14693952802359883, + "grad_norm": 1.8215450048446655, + "learning_rate": 4.738346507717191e-05, + "loss": 4.7016, + "step": 24707 + }, + { + "epoch": 0.14694547530687982, + "grad_norm": 1.7830946445465088, + "learning_rate": 4.7383257034065395e-05, + "loss": 4.6173, + "step": 24708 + }, + { + "epoch": 0.1469514225901608, + "grad_norm": 1.7251957654953003, + "learning_rate": 4.7383048983145126e-05, + "loss": 4.9539, + "step": 24709 + }, + { + "epoch": 0.14695736987344182, + "grad_norm": 1.6763554811477661, + "learning_rate": 4.738284092441117e-05, + "loss": 4.8123, + "step": 24710 + }, + { + "epoch": 0.1469633171567228, + "grad_norm": 1.5693418979644775, + "learning_rate": 4.738263285786358e-05, + "loss": 4.6586, + "step": 24711 + }, + { + "epoch": 0.1469692644400038, + "grad_norm": 2.5585360527038574, + "learning_rate": 4.738242478350247e-05, + "loss": 4.2875, + "step": 24712 + }, + { + "epoch": 0.14697521172328482, + "grad_norm": 2.41618275642395, + "learning_rate": 4.738221670132786e-05, + "loss": 4.3448, + "step": 24713 + }, + { + "epoch": 0.1469811590065658, + "grad_norm": 2.233074903488159, + "learning_rate": 4.7382008611339867e-05, + "loss": 5.2453, + "step": 24714 + }, + { + "epoch": 0.1469871062898468, + "grad_norm": 1.7833389043807983, + "learning_rate": 4.738180051353854e-05, + "loss": 4.9964, + "step": 24715 + }, + { + "epoch": 0.1469930535731278, + "grad_norm": 1.7970653772354126, + "learning_rate": 4.738159240792396e-05, + "loss": 4.5124, + "step": 24716 + }, + { + "epoch": 0.1469990008564088, + "grad_norm": 2.1043243408203125, + "learning_rate": 4.738138429449619e-05, + "loss": 4.3681, + "step": 24717 + }, + { + "epoch": 0.14700494813968978, + "grad_norm": 1.5849015712738037, + "learning_rate": 4.738117617325532e-05, + "loss": 4.7756, + "step": 24718 + }, + { + "epoch": 0.1470108954229708, + "grad_norm": 1.5067150592803955, + "learning_rate": 4.73809680442014e-05, + "loss": 4.6255, + "step": 24719 + }, + { + "epoch": 0.1470168427062518, + "grad_norm": 1.5583860874176025, + "learning_rate": 4.7380759907334524e-05, + "loss": 4.7671, + "step": 24720 + }, + { + "epoch": 0.14702278998953278, + "grad_norm": 1.9732975959777832, + "learning_rate": 4.7380551762654755e-05, + "loss": 4.514, + "step": 24721 + }, + { + "epoch": 0.1470287372728138, + "grad_norm": 2.2196953296661377, + "learning_rate": 4.738034361016217e-05, + "loss": 4.2897, + "step": 24722 + }, + { + "epoch": 0.14703468455609478, + "grad_norm": 2.3124115467071533, + "learning_rate": 4.738013544985683e-05, + "loss": 4.2081, + "step": 24723 + }, + { + "epoch": 0.14704063183937577, + "grad_norm": 2.4807839393615723, + "learning_rate": 4.737992728173882e-05, + "loss": 4.5975, + "step": 24724 + }, + { + "epoch": 0.14704657912265678, + "grad_norm": 1.6757773160934448, + "learning_rate": 4.737971910580821e-05, + "loss": 5.6665, + "step": 24725 + }, + { + "epoch": 0.14705252640593777, + "grad_norm": 1.9433516263961792, + "learning_rate": 4.7379510922065074e-05, + "loss": 5.0243, + "step": 24726 + }, + { + "epoch": 0.14705847368921876, + "grad_norm": 2.392778158187866, + "learning_rate": 4.737930273050948e-05, + "loss": 4.7769, + "step": 24727 + }, + { + "epoch": 0.14706442097249975, + "grad_norm": 2.730144739151001, + "learning_rate": 4.73790945311415e-05, + "loss": 4.8214, + "step": 24728 + }, + { + "epoch": 0.14707036825578076, + "grad_norm": 1.9504640102386475, + "learning_rate": 4.7378886323961205e-05, + "loss": 4.8057, + "step": 24729 + }, + { + "epoch": 0.14707631553906175, + "grad_norm": 1.7174079418182373, + "learning_rate": 4.7378678108968675e-05, + "loss": 5.1865, + "step": 24730 + }, + { + "epoch": 0.14708226282234274, + "grad_norm": 2.109645128250122, + "learning_rate": 4.737846988616399e-05, + "loss": 5.1682, + "step": 24731 + }, + { + "epoch": 0.14708821010562376, + "grad_norm": 1.9357048273086548, + "learning_rate": 4.7378261655547204e-05, + "loss": 5.0972, + "step": 24732 + }, + { + "epoch": 0.14709415738890474, + "grad_norm": 1.4660345315933228, + "learning_rate": 4.73780534171184e-05, + "loss": 5.7247, + "step": 24733 + }, + { + "epoch": 0.14710010467218573, + "grad_norm": 1.8927645683288574, + "learning_rate": 4.7377845170877644e-05, + "loss": 5.241, + "step": 24734 + }, + { + "epoch": 0.14710605195546675, + "grad_norm": 1.1164909601211548, + "learning_rate": 4.737763691682502e-05, + "loss": 5.4844, + "step": 24735 + }, + { + "epoch": 0.14711199923874774, + "grad_norm": 1.5676599740982056, + "learning_rate": 4.7377428654960584e-05, + "loss": 5.0659, + "step": 24736 + }, + { + "epoch": 0.14711794652202873, + "grad_norm": 2.404731273651123, + "learning_rate": 4.737722038528443e-05, + "loss": 4.5183, + "step": 24737 + }, + { + "epoch": 0.14712389380530974, + "grad_norm": 1.9689422845840454, + "learning_rate": 4.7377012107796615e-05, + "loss": 4.9564, + "step": 24738 + }, + { + "epoch": 0.14712984108859073, + "grad_norm": 2.320307970046997, + "learning_rate": 4.737680382249721e-05, + "loss": 4.4609, + "step": 24739 + }, + { + "epoch": 0.14713578837187172, + "grad_norm": 1.8649024963378906, + "learning_rate": 4.7376595529386305e-05, + "loss": 4.7436, + "step": 24740 + }, + { + "epoch": 0.14714173565515273, + "grad_norm": 2.112926721572876, + "learning_rate": 4.7376387228463956e-05, + "loss": 4.6949, + "step": 24741 + }, + { + "epoch": 0.14714768293843372, + "grad_norm": 2.237760543823242, + "learning_rate": 4.737617891973024e-05, + "loss": 4.5927, + "step": 24742 + }, + { + "epoch": 0.1471536302217147, + "grad_norm": 2.115577220916748, + "learning_rate": 4.737597060318524e-05, + "loss": 4.4007, + "step": 24743 + }, + { + "epoch": 0.14715957750499573, + "grad_norm": 2.0081801414489746, + "learning_rate": 4.737576227882901e-05, + "loss": 4.3844, + "step": 24744 + }, + { + "epoch": 0.1471655247882767, + "grad_norm": 2.1995346546173096, + "learning_rate": 4.737555394666163e-05, + "loss": 4.4581, + "step": 24745 + }, + { + "epoch": 0.1471714720715577, + "grad_norm": 2.2637784481048584, + "learning_rate": 4.7375345606683184e-05, + "loss": 4.4969, + "step": 24746 + }, + { + "epoch": 0.14717741935483872, + "grad_norm": 2.4739608764648438, + "learning_rate": 4.737513725889373e-05, + "loss": 4.521, + "step": 24747 + }, + { + "epoch": 0.1471833666381197, + "grad_norm": 1.6418421268463135, + "learning_rate": 4.737492890329335e-05, + "loss": 5.1064, + "step": 24748 + }, + { + "epoch": 0.1471893139214007, + "grad_norm": 1.9451549053192139, + "learning_rate": 4.737472053988212e-05, + "loss": 4.6824, + "step": 24749 + }, + { + "epoch": 0.1471952612046817, + "grad_norm": 1.9891009330749512, + "learning_rate": 4.7374512168660094e-05, + "loss": 5.2228, + "step": 24750 + }, + { + "epoch": 0.1472012084879627, + "grad_norm": 2.1582279205322266, + "learning_rate": 4.737430378962736e-05, + "loss": 5.7231, + "step": 24751 + }, + { + "epoch": 0.1472071557712437, + "grad_norm": 1.8569883108139038, + "learning_rate": 4.737409540278399e-05, + "loss": 5.3307, + "step": 24752 + }, + { + "epoch": 0.1472131030545247, + "grad_norm": 1.4937759637832642, + "learning_rate": 4.737388700813006e-05, + "loss": 5.3213, + "step": 24753 + }, + { + "epoch": 0.1472190503378057, + "grad_norm": 1.6692577600479126, + "learning_rate": 4.737367860566563e-05, + "loss": 5.2426, + "step": 24754 + }, + { + "epoch": 0.14722499762108668, + "grad_norm": 2.3550398349761963, + "learning_rate": 4.737347019539078e-05, + "loss": 4.7053, + "step": 24755 + }, + { + "epoch": 0.1472309449043677, + "grad_norm": 2.122601270675659, + "learning_rate": 4.737326177730559e-05, + "loss": 4.9372, + "step": 24756 + }, + { + "epoch": 0.14723689218764868, + "grad_norm": 1.429738163948059, + "learning_rate": 4.737305335141012e-05, + "loss": 4.7637, + "step": 24757 + }, + { + "epoch": 0.14724283947092967, + "grad_norm": 1.6185976266860962, + "learning_rate": 4.7372844917704445e-05, + "loss": 4.6184, + "step": 24758 + }, + { + "epoch": 0.1472487867542107, + "grad_norm": 1.495154619216919, + "learning_rate": 4.737263647618865e-05, + "loss": 4.4256, + "step": 24759 + }, + { + "epoch": 0.14725473403749167, + "grad_norm": 1.366437554359436, + "learning_rate": 4.737242802686279e-05, + "loss": 4.5822, + "step": 24760 + }, + { + "epoch": 0.14726068132077266, + "grad_norm": 2.3462178707122803, + "learning_rate": 4.737221956972695e-05, + "loss": 4.9419, + "step": 24761 + }, + { + "epoch": 0.14726662860405368, + "grad_norm": 2.846083402633667, + "learning_rate": 4.73720111047812e-05, + "loss": 4.6403, + "step": 24762 + }, + { + "epoch": 0.14727257588733467, + "grad_norm": 2.388052463531494, + "learning_rate": 4.7371802632025605e-05, + "loss": 4.5375, + "step": 24763 + }, + { + "epoch": 0.14727852317061566, + "grad_norm": 1.4230948686599731, + "learning_rate": 4.7371594151460254e-05, + "loss": 4.6451, + "step": 24764 + }, + { + "epoch": 0.14728447045389667, + "grad_norm": 1.2602354288101196, + "learning_rate": 4.737138566308521e-05, + "loss": 4.4927, + "step": 24765 + }, + { + "epoch": 0.14729041773717766, + "grad_norm": 1.9645811319351196, + "learning_rate": 4.737117716690054e-05, + "loss": 4.875, + "step": 24766 + }, + { + "epoch": 0.14729636502045865, + "grad_norm": 2.729315757751465, + "learning_rate": 4.7370968662906325e-05, + "loss": 4.0048, + "step": 24767 + }, + { + "epoch": 0.14730231230373966, + "grad_norm": 2.797999382019043, + "learning_rate": 4.7370760151102635e-05, + "loss": 4.3436, + "step": 24768 + }, + { + "epoch": 0.14730825958702065, + "grad_norm": 2.058621406555176, + "learning_rate": 4.737055163148955e-05, + "loss": 4.4137, + "step": 24769 + }, + { + "epoch": 0.14731420687030164, + "grad_norm": 1.9290826320648193, + "learning_rate": 4.737034310406713e-05, + "loss": 4.4751, + "step": 24770 + }, + { + "epoch": 0.14732015415358266, + "grad_norm": 2.316140651702881, + "learning_rate": 4.737013456883546e-05, + "loss": 4.4009, + "step": 24771 + }, + { + "epoch": 0.14732610143686364, + "grad_norm": 2.326529026031494, + "learning_rate": 4.7369926025794606e-05, + "loss": 4.4272, + "step": 24772 + }, + { + "epoch": 0.14733204872014463, + "grad_norm": 2.089818239212036, + "learning_rate": 4.736971747494464e-05, + "loss": 4.4192, + "step": 24773 + }, + { + "epoch": 0.14733799600342565, + "grad_norm": 1.714152455329895, + "learning_rate": 4.736950891628564e-05, + "loss": 5.1404, + "step": 24774 + }, + { + "epoch": 0.14734394328670664, + "grad_norm": 2.01911997795105, + "learning_rate": 4.736930034981767e-05, + "loss": 4.7116, + "step": 24775 + }, + { + "epoch": 0.14734989056998762, + "grad_norm": 2.0275747776031494, + "learning_rate": 4.736909177554081e-05, + "loss": 4.4249, + "step": 24776 + }, + { + "epoch": 0.14735583785326864, + "grad_norm": 1.9515576362609863, + "learning_rate": 4.7368883193455135e-05, + "loss": 4.3968, + "step": 24777 + }, + { + "epoch": 0.14736178513654963, + "grad_norm": 1.6079367399215698, + "learning_rate": 4.736867460356071e-05, + "loss": 4.3927, + "step": 24778 + }, + { + "epoch": 0.14736773241983062, + "grad_norm": 1.856449842453003, + "learning_rate": 4.736846600585761e-05, + "loss": 4.4231, + "step": 24779 + }, + { + "epoch": 0.14737367970311163, + "grad_norm": 1.7405143976211548, + "learning_rate": 4.7368257400345915e-05, + "loss": 5.4894, + "step": 24780 + }, + { + "epoch": 0.14737962698639262, + "grad_norm": 1.6344300508499146, + "learning_rate": 4.736804878702569e-05, + "loss": 5.5489, + "step": 24781 + }, + { + "epoch": 0.1473855742696736, + "grad_norm": 1.693015694618225, + "learning_rate": 4.7367840165897014e-05, + "loss": 5.6432, + "step": 24782 + }, + { + "epoch": 0.14739152155295462, + "grad_norm": 1.5487139225006104, + "learning_rate": 4.736763153695995e-05, + "loss": 4.6316, + "step": 24783 + }, + { + "epoch": 0.1473974688362356, + "grad_norm": 1.5867420434951782, + "learning_rate": 4.736742290021458e-05, + "loss": 4.3782, + "step": 24784 + }, + { + "epoch": 0.1474034161195166, + "grad_norm": 1.7892907857894897, + "learning_rate": 4.736721425566097e-05, + "loss": 4.413, + "step": 24785 + }, + { + "epoch": 0.1474093634027976, + "grad_norm": 1.7791600227355957, + "learning_rate": 4.7367005603299206e-05, + "loss": 4.9471, + "step": 24786 + }, + { + "epoch": 0.1474153106860786, + "grad_norm": 1.5871254205703735, + "learning_rate": 4.736679694312934e-05, + "loss": 5.6475, + "step": 24787 + }, + { + "epoch": 0.1474212579693596, + "grad_norm": 1.5154014825820923, + "learning_rate": 4.7366588275151465e-05, + "loss": 5.6038, + "step": 24788 + }, + { + "epoch": 0.14742720525264058, + "grad_norm": 1.4058479070663452, + "learning_rate": 4.736637959936564e-05, + "loss": 5.4371, + "step": 24789 + }, + { + "epoch": 0.1474331525359216, + "grad_norm": 1.5023268461227417, + "learning_rate": 4.7366170915771946e-05, + "loss": 5.6043, + "step": 24790 + }, + { + "epoch": 0.14743909981920258, + "grad_norm": 1.573081135749817, + "learning_rate": 4.7365962224370445e-05, + "loss": 4.6014, + "step": 24791 + }, + { + "epoch": 0.14744504710248357, + "grad_norm": 1.413909673690796, + "learning_rate": 4.7365753525161225e-05, + "loss": 5.1478, + "step": 24792 + }, + { + "epoch": 0.1474509943857646, + "grad_norm": 1.6636765003204346, + "learning_rate": 4.736554481814435e-05, + "loss": 5.3099, + "step": 24793 + }, + { + "epoch": 0.14745694166904558, + "grad_norm": 1.4575749635696411, + "learning_rate": 4.7365336103319904e-05, + "loss": 4.7067, + "step": 24794 + }, + { + "epoch": 0.14746288895232657, + "grad_norm": 1.4840314388275146, + "learning_rate": 4.736512738068793e-05, + "loss": 5.3591, + "step": 24795 + }, + { + "epoch": 0.14746883623560758, + "grad_norm": 1.8716658353805542, + "learning_rate": 4.736491865024853e-05, + "loss": 4.9905, + "step": 24796 + }, + { + "epoch": 0.14747478351888857, + "grad_norm": 1.5661007165908813, + "learning_rate": 4.736470991200178e-05, + "loss": 5.5725, + "step": 24797 + }, + { + "epoch": 0.14748073080216956, + "grad_norm": 1.7020787000656128, + "learning_rate": 4.736450116594773e-05, + "loss": 4.97, + "step": 24798 + }, + { + "epoch": 0.14748667808545057, + "grad_norm": 1.7010732889175415, + "learning_rate": 4.736429241208646e-05, + "loss": 5.0832, + "step": 24799 + }, + { + "epoch": 0.14749262536873156, + "grad_norm": 2.984389305114746, + "learning_rate": 4.7364083650418057e-05, + "loss": 4.5466, + "step": 24800 + }, + { + "epoch": 0.14749857265201255, + "grad_norm": 1.8300197124481201, + "learning_rate": 4.7363874880942574e-05, + "loss": 4.9772, + "step": 24801 + }, + { + "epoch": 0.14750451993529357, + "grad_norm": 1.685394048690796, + "learning_rate": 4.73636661036601e-05, + "loss": 5.0689, + "step": 24802 + }, + { + "epoch": 0.14751046721857455, + "grad_norm": 1.559996485710144, + "learning_rate": 4.7363457318570695e-05, + "loss": 5.1496, + "step": 24803 + }, + { + "epoch": 0.14751641450185554, + "grad_norm": 1.5654375553131104, + "learning_rate": 4.736324852567444e-05, + "loss": 5.1427, + "step": 24804 + }, + { + "epoch": 0.14752236178513656, + "grad_norm": 2.0388715267181396, + "learning_rate": 4.736303972497141e-05, + "loss": 4.6176, + "step": 24805 + }, + { + "epoch": 0.14752830906841755, + "grad_norm": 2.139695882797241, + "learning_rate": 4.736283091646167e-05, + "loss": 4.7746, + "step": 24806 + }, + { + "epoch": 0.14753425635169853, + "grad_norm": 1.6551018953323364, + "learning_rate": 4.73626221001453e-05, + "loss": 5.3522, + "step": 24807 + }, + { + "epoch": 0.14754020363497955, + "grad_norm": 1.6643954515457153, + "learning_rate": 4.7362413276022364e-05, + "loss": 5.5479, + "step": 24808 + }, + { + "epoch": 0.14754615091826054, + "grad_norm": 1.6942282915115356, + "learning_rate": 4.7362204444092947e-05, + "loss": 5.2971, + "step": 24809 + }, + { + "epoch": 0.14755209820154153, + "grad_norm": 2.1273419857025146, + "learning_rate": 4.736199560435711e-05, + "loss": 5.1465, + "step": 24810 + }, + { + "epoch": 0.14755804548482254, + "grad_norm": 2.1430892944335938, + "learning_rate": 4.736178675681493e-05, + "loss": 4.9944, + "step": 24811 + }, + { + "epoch": 0.14756399276810353, + "grad_norm": 2.1971189975738525, + "learning_rate": 4.736157790146649e-05, + "loss": 5.2348, + "step": 24812 + }, + { + "epoch": 0.14756994005138452, + "grad_norm": 1.7993513345718384, + "learning_rate": 4.7361369038311855e-05, + "loss": 5.0186, + "step": 24813 + }, + { + "epoch": 0.14757588733466553, + "grad_norm": 1.8296352624893188, + "learning_rate": 4.7361160167351085e-05, + "loss": 4.9939, + "step": 24814 + }, + { + "epoch": 0.14758183461794652, + "grad_norm": 1.6994922161102295, + "learning_rate": 4.7360951288584276e-05, + "loss": 5.0838, + "step": 24815 + }, + { + "epoch": 0.1475877819012275, + "grad_norm": 1.8526664972305298, + "learning_rate": 4.736074240201148e-05, + "loss": 4.9977, + "step": 24816 + }, + { + "epoch": 0.14759372918450853, + "grad_norm": 1.6255830526351929, + "learning_rate": 4.736053350763279e-05, + "loss": 5.111, + "step": 24817 + }, + { + "epoch": 0.14759967646778951, + "grad_norm": 1.6871737241744995, + "learning_rate": 4.736032460544826e-05, + "loss": 4.8522, + "step": 24818 + }, + { + "epoch": 0.1476056237510705, + "grad_norm": 1.8430577516555786, + "learning_rate": 4.7360115695457975e-05, + "loss": 4.9312, + "step": 24819 + }, + { + "epoch": 0.14761157103435152, + "grad_norm": 1.6737143993377686, + "learning_rate": 4.735990677766201e-05, + "loss": 4.7894, + "step": 24820 + }, + { + "epoch": 0.1476175183176325, + "grad_norm": 1.648138403892517, + "learning_rate": 4.7359697852060425e-05, + "loss": 4.8173, + "step": 24821 + }, + { + "epoch": 0.1476234656009135, + "grad_norm": 1.8230416774749756, + "learning_rate": 4.73594889186533e-05, + "loss": 5.0618, + "step": 24822 + }, + { + "epoch": 0.1476294128841945, + "grad_norm": 1.928932547569275, + "learning_rate": 4.735927997744072e-05, + "loss": 4.8846, + "step": 24823 + }, + { + "epoch": 0.1476353601674755, + "grad_norm": 1.8593389987945557, + "learning_rate": 4.735907102842273e-05, + "loss": 5.0283, + "step": 24824 + }, + { + "epoch": 0.1476413074507565, + "grad_norm": 1.988168478012085, + "learning_rate": 4.735886207159943e-05, + "loss": 5.0253, + "step": 24825 + }, + { + "epoch": 0.1476472547340375, + "grad_norm": 1.6367772817611694, + "learning_rate": 4.7358653106970885e-05, + "loss": 4.9296, + "step": 24826 + }, + { + "epoch": 0.1476532020173185, + "grad_norm": 1.7799687385559082, + "learning_rate": 4.7358444134537154e-05, + "loss": 4.5257, + "step": 24827 + }, + { + "epoch": 0.14765914930059948, + "grad_norm": 1.8706213235855103, + "learning_rate": 4.735823515429833e-05, + "loss": 4.9739, + "step": 24828 + }, + { + "epoch": 0.1476650965838805, + "grad_norm": 1.7662311792373657, + "learning_rate": 4.7358026166254476e-05, + "loss": 4.9545, + "step": 24829 + }, + { + "epoch": 0.14767104386716148, + "grad_norm": 1.6466079950332642, + "learning_rate": 4.7357817170405664e-05, + "loss": 4.8203, + "step": 24830 + }, + { + "epoch": 0.14767699115044247, + "grad_norm": 1.7296116352081299, + "learning_rate": 4.7357608166751965e-05, + "loss": 4.7575, + "step": 24831 + }, + { + "epoch": 0.1476829384337235, + "grad_norm": 1.6118981838226318, + "learning_rate": 4.735739915529346e-05, + "loss": 4.6546, + "step": 24832 + }, + { + "epoch": 0.14768888571700448, + "grad_norm": 1.7108652591705322, + "learning_rate": 4.735719013603022e-05, + "loss": 5.5278, + "step": 24833 + }, + { + "epoch": 0.14769483300028546, + "grad_norm": 1.583243727684021, + "learning_rate": 4.735698110896232e-05, + "loss": 5.5526, + "step": 24834 + }, + { + "epoch": 0.14770078028356648, + "grad_norm": 1.9354965686798096, + "learning_rate": 4.735677207408982e-05, + "loss": 4.9137, + "step": 24835 + }, + { + "epoch": 0.14770672756684747, + "grad_norm": 2.2551913261413574, + "learning_rate": 4.7356563031412805e-05, + "loss": 5.105, + "step": 24836 + }, + { + "epoch": 0.14771267485012846, + "grad_norm": 1.8324413299560547, + "learning_rate": 4.7356353980931344e-05, + "loss": 5.1002, + "step": 24837 + }, + { + "epoch": 0.14771862213340947, + "grad_norm": 1.7993746995925903, + "learning_rate": 4.7356144922645504e-05, + "loss": 5.0061, + "step": 24838 + }, + { + "epoch": 0.14772456941669046, + "grad_norm": 1.6633015871047974, + "learning_rate": 4.735593585655538e-05, + "loss": 5.6399, + "step": 24839 + }, + { + "epoch": 0.14773051669997145, + "grad_norm": 1.6153156757354736, + "learning_rate": 4.735572678266102e-05, + "loss": 5.845, + "step": 24840 + }, + { + "epoch": 0.14773646398325246, + "grad_norm": 1.5680739879608154, + "learning_rate": 4.7355517700962506e-05, + "loss": 4.9451, + "step": 24841 + }, + { + "epoch": 0.14774241126653345, + "grad_norm": 1.7775828838348389, + "learning_rate": 4.735530861145992e-05, + "loss": 5.3363, + "step": 24842 + }, + { + "epoch": 0.14774835854981444, + "grad_norm": 1.5199836492538452, + "learning_rate": 4.7355099514153316e-05, + "loss": 5.2147, + "step": 24843 + }, + { + "epoch": 0.14775430583309543, + "grad_norm": 1.5332800149917603, + "learning_rate": 4.7354890409042783e-05, + "loss": 5.2439, + "step": 24844 + }, + { + "epoch": 0.14776025311637644, + "grad_norm": 2.0724799633026123, + "learning_rate": 4.735468129612839e-05, + "loss": 5.0292, + "step": 24845 + }, + { + "epoch": 0.14776620039965743, + "grad_norm": 2.5946760177612305, + "learning_rate": 4.73544721754102e-05, + "loss": 4.973, + "step": 24846 + }, + { + "epoch": 0.14777214768293842, + "grad_norm": 1.9194954633712769, + "learning_rate": 4.735426304688831e-05, + "loss": 4.7452, + "step": 24847 + }, + { + "epoch": 0.14777809496621944, + "grad_norm": 1.38433039188385, + "learning_rate": 4.735405391056277e-05, + "loss": 5.5551, + "step": 24848 + }, + { + "epoch": 0.14778404224950042, + "grad_norm": 1.8728227615356445, + "learning_rate": 4.735384476643366e-05, + "loss": 5.3088, + "step": 24849 + }, + { + "epoch": 0.1477899895327814, + "grad_norm": 1.6192907094955444, + "learning_rate": 4.7353635614501054e-05, + "loss": 5.3365, + "step": 24850 + }, + { + "epoch": 0.14779593681606243, + "grad_norm": 1.4671828746795654, + "learning_rate": 4.735342645476503e-05, + "loss": 5.5339, + "step": 24851 + }, + { + "epoch": 0.14780188409934342, + "grad_norm": 1.924024224281311, + "learning_rate": 4.7353217287225646e-05, + "loss": 5.2287, + "step": 24852 + }, + { + "epoch": 0.1478078313826244, + "grad_norm": 1.6585190296173096, + "learning_rate": 4.735300811188299e-05, + "loss": 5.124, + "step": 24853 + }, + { + "epoch": 0.14781377866590542, + "grad_norm": 1.6820423603057861, + "learning_rate": 4.735279892873713e-05, + "loss": 5.4088, + "step": 24854 + }, + { + "epoch": 0.1478197259491864, + "grad_norm": 1.5978790521621704, + "learning_rate": 4.7352589737788134e-05, + "loss": 5.8087, + "step": 24855 + }, + { + "epoch": 0.1478256732324674, + "grad_norm": 1.6521705389022827, + "learning_rate": 4.735238053903609e-05, + "loss": 5.2014, + "step": 24856 + }, + { + "epoch": 0.1478316205157484, + "grad_norm": 1.6667120456695557, + "learning_rate": 4.7352171332481056e-05, + "loss": 5.1015, + "step": 24857 + }, + { + "epoch": 0.1478375677990294, + "grad_norm": 1.7318087816238403, + "learning_rate": 4.735196211812311e-05, + "loss": 5.4063, + "step": 24858 + }, + { + "epoch": 0.1478435150823104, + "grad_norm": 1.7706724405288696, + "learning_rate": 4.735175289596232e-05, + "loss": 5.0941, + "step": 24859 + }, + { + "epoch": 0.1478494623655914, + "grad_norm": 1.5582432746887207, + "learning_rate": 4.7351543665998764e-05, + "loss": 5.2643, + "step": 24860 + }, + { + "epoch": 0.1478554096488724, + "grad_norm": 1.5588469505310059, + "learning_rate": 4.735133442823252e-05, + "loss": 5.5234, + "step": 24861 + }, + { + "epoch": 0.14786135693215338, + "grad_norm": 2.5532615184783936, + "learning_rate": 4.735112518266366e-05, + "loss": 4.5405, + "step": 24862 + }, + { + "epoch": 0.1478673042154344, + "grad_norm": 1.5495831966400146, + "learning_rate": 4.735091592929224e-05, + "loss": 5.5153, + "step": 24863 + }, + { + "epoch": 0.14787325149871539, + "grad_norm": 1.4878839254379272, + "learning_rate": 4.7350706668118356e-05, + "loss": 5.2186, + "step": 24864 + }, + { + "epoch": 0.14787919878199637, + "grad_norm": 1.4914618730545044, + "learning_rate": 4.735049739914207e-05, + "loss": 5.3108, + "step": 24865 + }, + { + "epoch": 0.1478851460652774, + "grad_norm": 1.6413542032241821, + "learning_rate": 4.735028812236345e-05, + "loss": 5.2726, + "step": 24866 + }, + { + "epoch": 0.14789109334855838, + "grad_norm": 1.6650172472000122, + "learning_rate": 4.735007883778259e-05, + "loss": 5.3186, + "step": 24867 + }, + { + "epoch": 0.14789704063183937, + "grad_norm": 1.5289151668548584, + "learning_rate": 4.734986954539954e-05, + "loss": 5.1124, + "step": 24868 + }, + { + "epoch": 0.14790298791512038, + "grad_norm": 1.5151697397232056, + "learning_rate": 4.734966024521438e-05, + "loss": 5.495, + "step": 24869 + }, + { + "epoch": 0.14790893519840137, + "grad_norm": 1.3832122087478638, + "learning_rate": 4.734945093722718e-05, + "loss": 5.426, + "step": 24870 + }, + { + "epoch": 0.14791488248168236, + "grad_norm": 1.6117453575134277, + "learning_rate": 4.7349241621438023e-05, + "loss": 5.2548, + "step": 24871 + }, + { + "epoch": 0.14792082976496337, + "grad_norm": 1.5391991138458252, + "learning_rate": 4.734903229784698e-05, + "loss": 4.7025, + "step": 24872 + }, + { + "epoch": 0.14792677704824436, + "grad_norm": 1.649274468421936, + "learning_rate": 4.734882296645411e-05, + "loss": 5.4152, + "step": 24873 + }, + { + "epoch": 0.14793272433152535, + "grad_norm": 1.7147942781448364, + "learning_rate": 4.734861362725951e-05, + "loss": 5.4865, + "step": 24874 + }, + { + "epoch": 0.14793867161480637, + "grad_norm": 1.4434807300567627, + "learning_rate": 4.734840428026324e-05, + "loss": 5.5211, + "step": 24875 + }, + { + "epoch": 0.14794461889808735, + "grad_norm": 1.4886515140533447, + "learning_rate": 4.7348194925465364e-05, + "loss": 5.197, + "step": 24876 + }, + { + "epoch": 0.14795056618136834, + "grad_norm": 1.3683615922927856, + "learning_rate": 4.734798556286596e-05, + "loss": 4.9886, + "step": 24877 + }, + { + "epoch": 0.14795651346464936, + "grad_norm": 1.4986892938613892, + "learning_rate": 4.734777619246512e-05, + "loss": 5.0067, + "step": 24878 + }, + { + "epoch": 0.14796246074793035, + "grad_norm": 1.8438472747802734, + "learning_rate": 4.734756681426289e-05, + "loss": 5.2865, + "step": 24879 + }, + { + "epoch": 0.14796840803121133, + "grad_norm": 1.710975170135498, + "learning_rate": 4.734735742825935e-05, + "loss": 5.1215, + "step": 24880 + }, + { + "epoch": 0.14797435531449235, + "grad_norm": 2.074619770050049, + "learning_rate": 4.7347148034454594e-05, + "loss": 4.5968, + "step": 24881 + }, + { + "epoch": 0.14798030259777334, + "grad_norm": 2.5662643909454346, + "learning_rate": 4.7346938632848676e-05, + "loss": 4.3404, + "step": 24882 + }, + { + "epoch": 0.14798624988105433, + "grad_norm": 1.6698600053787231, + "learning_rate": 4.7346729223441665e-05, + "loss": 5.2027, + "step": 24883 + }, + { + "epoch": 0.14799219716433534, + "grad_norm": 2.1604435443878174, + "learning_rate": 4.7346519806233644e-05, + "loss": 4.4595, + "step": 24884 + }, + { + "epoch": 0.14799814444761633, + "grad_norm": 2.7507572174072266, + "learning_rate": 4.734631038122469e-05, + "loss": 3.1764, + "step": 24885 + }, + { + "epoch": 0.14800409173089732, + "grad_norm": 2.8016562461853027, + "learning_rate": 4.734610094841487e-05, + "loss": 3.8763, + "step": 24886 + }, + { + "epoch": 0.14801003901417834, + "grad_norm": 2.9202160835266113, + "learning_rate": 4.7345891507804253e-05, + "loss": 3.6681, + "step": 24887 + }, + { + "epoch": 0.14801598629745932, + "grad_norm": 3.071167230606079, + "learning_rate": 4.7345682059392914e-05, + "loss": 3.027, + "step": 24888 + }, + { + "epoch": 0.1480219335807403, + "grad_norm": 2.7173242568969727, + "learning_rate": 4.734547260318093e-05, + "loss": 3.3615, + "step": 24889 + }, + { + "epoch": 0.14802788086402133, + "grad_norm": 2.1972641944885254, + "learning_rate": 4.7345263139168375e-05, + "loss": 4.8097, + "step": 24890 + }, + { + "epoch": 0.14803382814730232, + "grad_norm": 2.031700849533081, + "learning_rate": 4.7345053667355324e-05, + "loss": 5.1153, + "step": 24891 + }, + { + "epoch": 0.1480397754305833, + "grad_norm": 2.627568483352661, + "learning_rate": 4.734484418774183e-05, + "loss": 4.3777, + "step": 24892 + }, + { + "epoch": 0.14804572271386432, + "grad_norm": 2.2821667194366455, + "learning_rate": 4.734463470032799e-05, + "loss": 4.4845, + "step": 24893 + }, + { + "epoch": 0.1480516699971453, + "grad_norm": 1.8525490760803223, + "learning_rate": 4.7344425205113875e-05, + "loss": 5.4187, + "step": 24894 + }, + { + "epoch": 0.1480576172804263, + "grad_norm": 2.0583372116088867, + "learning_rate": 4.7344215702099546e-05, + "loss": 4.4807, + "step": 24895 + }, + { + "epoch": 0.1480635645637073, + "grad_norm": 1.7403303384780884, + "learning_rate": 4.734400619128509e-05, + "loss": 5.5355, + "step": 24896 + }, + { + "epoch": 0.1480695118469883, + "grad_norm": 2.953425645828247, + "learning_rate": 4.734379667267056e-05, + "loss": 4.0136, + "step": 24897 + }, + { + "epoch": 0.1480754591302693, + "grad_norm": 2.8318042755126953, + "learning_rate": 4.7343587146256044e-05, + "loss": 3.5818, + "step": 24898 + }, + { + "epoch": 0.1480814064135503, + "grad_norm": 1.6144517660140991, + "learning_rate": 4.7343377612041615e-05, + "loss": 4.789, + "step": 24899 + }, + { + "epoch": 0.1480873536968313, + "grad_norm": 1.639545202255249, + "learning_rate": 4.734316807002734e-05, + "loss": 5.1812, + "step": 24900 + }, + { + "epoch": 0.14809330098011228, + "grad_norm": 1.7593424320220947, + "learning_rate": 4.734295852021331e-05, + "loss": 5.0547, + "step": 24901 + }, + { + "epoch": 0.14809924826339327, + "grad_norm": 1.6794737577438354, + "learning_rate": 4.734274896259957e-05, + "loss": 5.125, + "step": 24902 + }, + { + "epoch": 0.14810519554667428, + "grad_norm": 1.5941787958145142, + "learning_rate": 4.734253939718621e-05, + "loss": 5.0559, + "step": 24903 + }, + { + "epoch": 0.14811114282995527, + "grad_norm": 1.9701952934265137, + "learning_rate": 4.7342329823973304e-05, + "loss": 4.7468, + "step": 24904 + }, + { + "epoch": 0.14811709011323626, + "grad_norm": 1.8744746446609497, + "learning_rate": 4.734212024296092e-05, + "loss": 5.2544, + "step": 24905 + }, + { + "epoch": 0.14812303739651728, + "grad_norm": 1.5343592166900635, + "learning_rate": 4.734191065414913e-05, + "loss": 5.1794, + "step": 24906 + }, + { + "epoch": 0.14812898467979826, + "grad_norm": 1.509623408317566, + "learning_rate": 4.734170105753801e-05, + "loss": 5.4512, + "step": 24907 + }, + { + "epoch": 0.14813493196307925, + "grad_norm": 1.4235179424285889, + "learning_rate": 4.734149145312764e-05, + "loss": 5.4535, + "step": 24908 + }, + { + "epoch": 0.14814087924636027, + "grad_norm": 1.4011653661727905, + "learning_rate": 4.7341281840918076e-05, + "loss": 5.4248, + "step": 24909 + }, + { + "epoch": 0.14814682652964126, + "grad_norm": 1.3742294311523438, + "learning_rate": 4.734107222090941e-05, + "loss": 5.3076, + "step": 24910 + }, + { + "epoch": 0.14815277381292224, + "grad_norm": 1.4808472394943237, + "learning_rate": 4.73408625931017e-05, + "loss": 5.4432, + "step": 24911 + }, + { + "epoch": 0.14815872109620326, + "grad_norm": 1.3847295045852661, + "learning_rate": 4.734065295749502e-05, + "loss": 5.4678, + "step": 24912 + }, + { + "epoch": 0.14816466837948425, + "grad_norm": 1.4962565898895264, + "learning_rate": 4.734044331408947e-05, + "loss": 5.6803, + "step": 24913 + }, + { + "epoch": 0.14817061566276524, + "grad_norm": 1.7258118391036987, + "learning_rate": 4.734023366288508e-05, + "loss": 4.933, + "step": 24914 + }, + { + "epoch": 0.14817656294604625, + "grad_norm": 1.7875369787216187, + "learning_rate": 4.7340024003881955e-05, + "loss": 4.9978, + "step": 24915 + }, + { + "epoch": 0.14818251022932724, + "grad_norm": 1.5841879844665527, + "learning_rate": 4.733981433708016e-05, + "loss": 5.1718, + "step": 24916 + }, + { + "epoch": 0.14818845751260823, + "grad_norm": 1.4346718788146973, + "learning_rate": 4.733960466247976e-05, + "loss": 4.6579, + "step": 24917 + }, + { + "epoch": 0.14819440479588925, + "grad_norm": 1.4387844800949097, + "learning_rate": 4.7339394980080844e-05, + "loss": 5.012, + "step": 24918 + }, + { + "epoch": 0.14820035207917023, + "grad_norm": 1.7081257104873657, + "learning_rate": 4.733918528988347e-05, + "loss": 5.4316, + "step": 24919 + }, + { + "epoch": 0.14820629936245122, + "grad_norm": 1.7600195407867432, + "learning_rate": 4.733897559188771e-05, + "loss": 5.309, + "step": 24920 + }, + { + "epoch": 0.14821224664573224, + "grad_norm": 1.7399616241455078, + "learning_rate": 4.733876588609366e-05, + "loss": 5.1796, + "step": 24921 + }, + { + "epoch": 0.14821819392901323, + "grad_norm": 1.7843348979949951, + "learning_rate": 4.733855617250137e-05, + "loss": 5.0371, + "step": 24922 + }, + { + "epoch": 0.1482241412122942, + "grad_norm": 1.6706308126449585, + "learning_rate": 4.733834645111092e-05, + "loss": 5.1058, + "step": 24923 + }, + { + "epoch": 0.14823008849557523, + "grad_norm": 2.6056525707244873, + "learning_rate": 4.733813672192239e-05, + "loss": 4.5804, + "step": 24924 + }, + { + "epoch": 0.14823603577885622, + "grad_norm": 1.836887001991272, + "learning_rate": 4.733792698493584e-05, + "loss": 5.0871, + "step": 24925 + }, + { + "epoch": 0.1482419830621372, + "grad_norm": 1.8913605213165283, + "learning_rate": 4.733771724015135e-05, + "loss": 5.4228, + "step": 24926 + }, + { + "epoch": 0.14824793034541822, + "grad_norm": 1.7032699584960938, + "learning_rate": 4.7337507487569e-05, + "loss": 5.5599, + "step": 24927 + }, + { + "epoch": 0.1482538776286992, + "grad_norm": 1.6115164756774902, + "learning_rate": 4.733729772718885e-05, + "loss": 5.5348, + "step": 24928 + }, + { + "epoch": 0.1482598249119802, + "grad_norm": 1.563080906867981, + "learning_rate": 4.733708795901098e-05, + "loss": 5.4334, + "step": 24929 + }, + { + "epoch": 0.14826577219526121, + "grad_norm": 1.6452966928482056, + "learning_rate": 4.733687818303547e-05, + "loss": 5.7378, + "step": 24930 + }, + { + "epoch": 0.1482717194785422, + "grad_norm": 1.602687120437622, + "learning_rate": 4.7336668399262386e-05, + "loss": 5.7311, + "step": 24931 + }, + { + "epoch": 0.1482776667618232, + "grad_norm": 1.6656992435455322, + "learning_rate": 4.73364586076918e-05, + "loss": 5.3285, + "step": 24932 + }, + { + "epoch": 0.1482836140451042, + "grad_norm": 2.0401406288146973, + "learning_rate": 4.7336248808323786e-05, + "loss": 4.9655, + "step": 24933 + }, + { + "epoch": 0.1482895613283852, + "grad_norm": 2.536595582962036, + "learning_rate": 4.733603900115842e-05, + "loss": 4.6622, + "step": 24934 + }, + { + "epoch": 0.14829550861166618, + "grad_norm": 1.5609594583511353, + "learning_rate": 4.7335829186195766e-05, + "loss": 5.2326, + "step": 24935 + }, + { + "epoch": 0.1483014558949472, + "grad_norm": 1.6761829853057861, + "learning_rate": 4.733561936343591e-05, + "loss": 5.4059, + "step": 24936 + }, + { + "epoch": 0.1483074031782282, + "grad_norm": 1.1501821279525757, + "learning_rate": 4.733540953287893e-05, + "loss": 4.8906, + "step": 24937 + }, + { + "epoch": 0.14831335046150917, + "grad_norm": 1.6217314004898071, + "learning_rate": 4.733519969452488e-05, + "loss": 4.8381, + "step": 24938 + }, + { + "epoch": 0.1483192977447902, + "grad_norm": 1.8240901231765747, + "learning_rate": 4.733498984837384e-05, + "loss": 5.4137, + "step": 24939 + }, + { + "epoch": 0.14832524502807118, + "grad_norm": 1.7012525796890259, + "learning_rate": 4.733477999442589e-05, + "loss": 5.4581, + "step": 24940 + }, + { + "epoch": 0.14833119231135217, + "grad_norm": 1.3260048627853394, + "learning_rate": 4.73345701326811e-05, + "loss": 5.6434, + "step": 24941 + }, + { + "epoch": 0.14833713959463318, + "grad_norm": 1.6175122261047363, + "learning_rate": 4.7334360263139536e-05, + "loss": 5.5073, + "step": 24942 + }, + { + "epoch": 0.14834308687791417, + "grad_norm": 1.890405535697937, + "learning_rate": 4.7334150385801276e-05, + "loss": 5.059, + "step": 24943 + }, + { + "epoch": 0.14834903416119516, + "grad_norm": 2.121887683868408, + "learning_rate": 4.733394050066641e-05, + "loss": 4.7292, + "step": 24944 + }, + { + "epoch": 0.14835498144447617, + "grad_norm": 2.054938316345215, + "learning_rate": 4.7333730607734985e-05, + "loss": 4.7551, + "step": 24945 + }, + { + "epoch": 0.14836092872775716, + "grad_norm": 1.853046178817749, + "learning_rate": 4.733352070700708e-05, + "loss": 4.7807, + "step": 24946 + }, + { + "epoch": 0.14836687601103815, + "grad_norm": 1.926611304283142, + "learning_rate": 4.733331079848279e-05, + "loss": 5.026, + "step": 24947 + }, + { + "epoch": 0.14837282329431917, + "grad_norm": 1.9281972646713257, + "learning_rate": 4.7333100882162164e-05, + "loss": 5.0131, + "step": 24948 + }, + { + "epoch": 0.14837877057760016, + "grad_norm": 2.158128499984741, + "learning_rate": 4.733289095804527e-05, + "loss": 4.8987, + "step": 24949 + }, + { + "epoch": 0.14838471786088114, + "grad_norm": 1.9640719890594482, + "learning_rate": 4.7332681026132216e-05, + "loss": 4.868, + "step": 24950 + }, + { + "epoch": 0.14839066514416216, + "grad_norm": 2.0871901512145996, + "learning_rate": 4.7332471086423045e-05, + "loss": 4.8542, + "step": 24951 + }, + { + "epoch": 0.14839661242744315, + "grad_norm": 2.2361068725585938, + "learning_rate": 4.7332261138917836e-05, + "loss": 4.9536, + "step": 24952 + }, + { + "epoch": 0.14840255971072414, + "grad_norm": 2.3177475929260254, + "learning_rate": 4.7332051183616665e-05, + "loss": 4.9228, + "step": 24953 + }, + { + "epoch": 0.14840850699400515, + "grad_norm": 2.0412709712982178, + "learning_rate": 4.733184122051961e-05, + "loss": 4.888, + "step": 24954 + }, + { + "epoch": 0.14841445427728614, + "grad_norm": 1.904599666595459, + "learning_rate": 4.733163124962674e-05, + "loss": 4.842, + "step": 24955 + }, + { + "epoch": 0.14842040156056713, + "grad_norm": 2.3957440853118896, + "learning_rate": 4.733142127093813e-05, + "loss": 4.7589, + "step": 24956 + }, + { + "epoch": 0.14842634884384814, + "grad_norm": 1.966145634651184, + "learning_rate": 4.733121128445384e-05, + "loss": 4.5783, + "step": 24957 + }, + { + "epoch": 0.14843229612712913, + "grad_norm": 2.230134963989258, + "learning_rate": 4.7331001290173966e-05, + "loss": 4.6108, + "step": 24958 + }, + { + "epoch": 0.14843824341041012, + "grad_norm": 1.9063829183578491, + "learning_rate": 4.7330791288098565e-05, + "loss": 4.765, + "step": 24959 + }, + { + "epoch": 0.1484441906936911, + "grad_norm": 2.0853664875030518, + "learning_rate": 4.7330581278227716e-05, + "loss": 6.0523, + "step": 24960 + }, + { + "epoch": 0.14845013797697212, + "grad_norm": 2.0823090076446533, + "learning_rate": 4.7330371260561494e-05, + "loss": 6.1014, + "step": 24961 + }, + { + "epoch": 0.1484560852602531, + "grad_norm": 1.7553062438964844, + "learning_rate": 4.733016123509997e-05, + "loss": 5.5322, + "step": 24962 + }, + { + "epoch": 0.1484620325435341, + "grad_norm": 1.7482306957244873, + "learning_rate": 4.7329951201843217e-05, + "loss": 5.5981, + "step": 24963 + }, + { + "epoch": 0.14846797982681512, + "grad_norm": 1.7615885734558105, + "learning_rate": 4.732974116079131e-05, + "loss": 5.447, + "step": 24964 + }, + { + "epoch": 0.1484739271100961, + "grad_norm": 1.645790696144104, + "learning_rate": 4.732953111194432e-05, + "loss": 5.4439, + "step": 24965 + }, + { + "epoch": 0.1484798743933771, + "grad_norm": 1.8099596500396729, + "learning_rate": 4.7329321055302326e-05, + "loss": 5.1291, + "step": 24966 + }, + { + "epoch": 0.1484858216766581, + "grad_norm": 1.8523690700531006, + "learning_rate": 4.732911099086539e-05, + "loss": 4.9296, + "step": 24967 + }, + { + "epoch": 0.1484917689599391, + "grad_norm": 1.7897992134094238, + "learning_rate": 4.732890091863359e-05, + "loss": 5.1764, + "step": 24968 + }, + { + "epoch": 0.14849771624322008, + "grad_norm": 1.8922818899154663, + "learning_rate": 4.7328690838607e-05, + "loss": 5.1548, + "step": 24969 + }, + { + "epoch": 0.1485036635265011, + "grad_norm": 1.9169872999191284, + "learning_rate": 4.73284807507857e-05, + "loss": 5.0837, + "step": 24970 + }, + { + "epoch": 0.1485096108097821, + "grad_norm": 1.649895429611206, + "learning_rate": 4.732827065516976e-05, + "loss": 5.2689, + "step": 24971 + }, + { + "epoch": 0.14851555809306308, + "grad_norm": 1.638153076171875, + "learning_rate": 4.732806055175925e-05, + "loss": 5.5579, + "step": 24972 + }, + { + "epoch": 0.1485215053763441, + "grad_norm": 1.6101715564727783, + "learning_rate": 4.7327850440554244e-05, + "loss": 5.5632, + "step": 24973 + }, + { + "epoch": 0.14852745265962508, + "grad_norm": 1.5299588441848755, + "learning_rate": 4.7327640321554815e-05, + "loss": 5.6415, + "step": 24974 + }, + { + "epoch": 0.14853339994290607, + "grad_norm": 1.508520245552063, + "learning_rate": 4.732743019476104e-05, + "loss": 5.1519, + "step": 24975 + }, + { + "epoch": 0.14853934722618709, + "grad_norm": 1.760366439819336, + "learning_rate": 4.732722006017299e-05, + "loss": 4.2604, + "step": 24976 + }, + { + "epoch": 0.14854529450946807, + "grad_norm": 1.6827213764190674, + "learning_rate": 4.732700991779073e-05, + "loss": 4.2258, + "step": 24977 + }, + { + "epoch": 0.14855124179274906, + "grad_norm": 1.576389193534851, + "learning_rate": 4.732679976761435e-05, + "loss": 4.2854, + "step": 24978 + }, + { + "epoch": 0.14855718907603008, + "grad_norm": 1.592392921447754, + "learning_rate": 4.732658960964391e-05, + "loss": 4.2775, + "step": 24979 + }, + { + "epoch": 0.14856313635931107, + "grad_norm": 1.6771488189697266, + "learning_rate": 4.7326379443879495e-05, + "loss": 4.3001, + "step": 24980 + }, + { + "epoch": 0.14856908364259205, + "grad_norm": 1.584578037261963, + "learning_rate": 4.732616927032117e-05, + "loss": 4.1592, + "step": 24981 + }, + { + "epoch": 0.14857503092587307, + "grad_norm": 1.7568552494049072, + "learning_rate": 4.732595908896901e-05, + "loss": 4.1514, + "step": 24982 + }, + { + "epoch": 0.14858097820915406, + "grad_norm": 1.6334513425827026, + "learning_rate": 4.732574889982309e-05, + "loss": 4.1319, + "step": 24983 + }, + { + "epoch": 0.14858692549243505, + "grad_norm": 1.7330750226974487, + "learning_rate": 4.732553870288347e-05, + "loss": 4.1036, + "step": 24984 + }, + { + "epoch": 0.14859287277571606, + "grad_norm": 1.7719300985336304, + "learning_rate": 4.732532849815024e-05, + "loss": 5.2043, + "step": 24985 + }, + { + "epoch": 0.14859882005899705, + "grad_norm": 2.9879441261291504, + "learning_rate": 4.732511828562347e-05, + "loss": 3.8784, + "step": 24986 + }, + { + "epoch": 0.14860476734227804, + "grad_norm": 1.9443185329437256, + "learning_rate": 4.732490806530324e-05, + "loss": 5.5898, + "step": 24987 + }, + { + "epoch": 0.14861071462555905, + "grad_norm": 1.800279140472412, + "learning_rate": 4.73246978371896e-05, + "loss": 5.465, + "step": 24988 + }, + { + "epoch": 0.14861666190884004, + "grad_norm": 1.9028568267822266, + "learning_rate": 4.732448760128265e-05, + "loss": 4.8782, + "step": 24989 + }, + { + "epoch": 0.14862260919212103, + "grad_norm": 2.79314923286438, + "learning_rate": 4.732427735758245e-05, + "loss": 4.5421, + "step": 24990 + }, + { + "epoch": 0.14862855647540205, + "grad_norm": 2.4686412811279297, + "learning_rate": 4.7324067106089074e-05, + "loss": 4.4616, + "step": 24991 + }, + { + "epoch": 0.14863450375868303, + "grad_norm": 1.8359897136688232, + "learning_rate": 4.73238568468026e-05, + "loss": 4.8081, + "step": 24992 + }, + { + "epoch": 0.14864045104196402, + "grad_norm": 2.3388144969940186, + "learning_rate": 4.732364657972309e-05, + "loss": 4.527, + "step": 24993 + }, + { + "epoch": 0.14864639832524504, + "grad_norm": 2.888598680496216, + "learning_rate": 4.7323436304850634e-05, + "loss": 4.1855, + "step": 24994 + }, + { + "epoch": 0.14865234560852603, + "grad_norm": 3.1639111042022705, + "learning_rate": 4.7323226022185296e-05, + "loss": 4.0865, + "step": 24995 + }, + { + "epoch": 0.14865829289180701, + "grad_norm": 2.8708033561706543, + "learning_rate": 4.732301573172715e-05, + "loss": 3.8629, + "step": 24996 + }, + { + "epoch": 0.14866424017508803, + "grad_norm": 2.667426347732544, + "learning_rate": 4.732280543347627e-05, + "loss": 4.0511, + "step": 24997 + }, + { + "epoch": 0.14867018745836902, + "grad_norm": 2.5031850337982178, + "learning_rate": 4.7322595127432725e-05, + "loss": 4.2035, + "step": 24998 + }, + { + "epoch": 0.14867613474165, + "grad_norm": 2.4356188774108887, + "learning_rate": 4.7322384813596595e-05, + "loss": 3.8996, + "step": 24999 + }, + { + "epoch": 0.14868208202493102, + "grad_norm": 2.334566354751587, + "learning_rate": 4.732217449196795e-05, + "loss": 4.2353, + "step": 25000 + }, + { + "epoch": 0.148688029308212, + "grad_norm": 2.357844591140747, + "learning_rate": 4.732196416254686e-05, + "loss": 4.3695, + "step": 25001 + }, + { + "epoch": 0.148693976591493, + "grad_norm": 2.4662234783172607, + "learning_rate": 4.7321753825333416e-05, + "loss": 3.9325, + "step": 25002 + }, + { + "epoch": 0.14869992387477401, + "grad_norm": 1.840820074081421, + "learning_rate": 4.7321543480327666e-05, + "loss": 5.1156, + "step": 25003 + }, + { + "epoch": 0.148705871158055, + "grad_norm": 1.9830942153930664, + "learning_rate": 4.73213331275297e-05, + "loss": 4.6774, + "step": 25004 + }, + { + "epoch": 0.148711818441336, + "grad_norm": 1.6185516119003296, + "learning_rate": 4.732112276693959e-05, + "loss": 4.6241, + "step": 25005 + }, + { + "epoch": 0.148717765724617, + "grad_norm": 1.8661324977874756, + "learning_rate": 4.7320912398557403e-05, + "loss": 4.6107, + "step": 25006 + }, + { + "epoch": 0.148723713007898, + "grad_norm": 1.750866174697876, + "learning_rate": 4.7320702022383226e-05, + "loss": 4.7134, + "step": 25007 + }, + { + "epoch": 0.14872966029117898, + "grad_norm": 1.7875406742095947, + "learning_rate": 4.7320491638417105e-05, + "loss": 4.6935, + "step": 25008 + }, + { + "epoch": 0.14873560757446, + "grad_norm": 1.6559946537017822, + "learning_rate": 4.732028124665915e-05, + "loss": 4.7556, + "step": 25009 + }, + { + "epoch": 0.148741554857741, + "grad_norm": 2.075535535812378, + "learning_rate": 4.7320070847109396e-05, + "loss": 4.6646, + "step": 25010 + }, + { + "epoch": 0.14874750214102198, + "grad_norm": 2.1029436588287354, + "learning_rate": 4.731986043976795e-05, + "loss": 5.0169, + "step": 25011 + }, + { + "epoch": 0.148753449424303, + "grad_norm": 1.9193171262741089, + "learning_rate": 4.7319650024634866e-05, + "loss": 5.236, + "step": 25012 + }, + { + "epoch": 0.14875939670758398, + "grad_norm": 1.6295948028564453, + "learning_rate": 4.731943960171022e-05, + "loss": 5.3538, + "step": 25013 + }, + { + "epoch": 0.14876534399086497, + "grad_norm": 1.5699677467346191, + "learning_rate": 4.73192291709941e-05, + "loss": 5.5413, + "step": 25014 + }, + { + "epoch": 0.14877129127414598, + "grad_norm": 2.8893580436706543, + "learning_rate": 4.7319018732486555e-05, + "loss": 4.5995, + "step": 25015 + }, + { + "epoch": 0.14877723855742697, + "grad_norm": 2.366352081298828, + "learning_rate": 4.731880828618768e-05, + "loss": 4.5993, + "step": 25016 + }, + { + "epoch": 0.14878318584070796, + "grad_norm": 2.1206884384155273, + "learning_rate": 4.731859783209753e-05, + "loss": 4.2081, + "step": 25017 + }, + { + "epoch": 0.14878913312398895, + "grad_norm": 2.4171648025512695, + "learning_rate": 4.73183873702162e-05, + "loss": 4.287, + "step": 25018 + }, + { + "epoch": 0.14879508040726996, + "grad_norm": 1.9675270318984985, + "learning_rate": 4.7318176900543744e-05, + "loss": 4.5648, + "step": 25019 + }, + { + "epoch": 0.14880102769055095, + "grad_norm": 1.750753402709961, + "learning_rate": 4.731796642308024e-05, + "loss": 5.6165, + "step": 25020 + }, + { + "epoch": 0.14880697497383194, + "grad_norm": 1.7137641906738281, + "learning_rate": 4.731775593782577e-05, + "loss": 5.1204, + "step": 25021 + }, + { + "epoch": 0.14881292225711296, + "grad_norm": 1.4377870559692383, + "learning_rate": 4.73175454447804e-05, + "loss": 5.4076, + "step": 25022 + }, + { + "epoch": 0.14881886954039394, + "grad_norm": 1.3382959365844727, + "learning_rate": 4.7317334943944204e-05, + "loss": 5.444, + "step": 25023 + }, + { + "epoch": 0.14882481682367493, + "grad_norm": 1.0098121166229248, + "learning_rate": 4.731712443531726e-05, + "loss": 5.2913, + "step": 25024 + }, + { + "epoch": 0.14883076410695595, + "grad_norm": 0.897736132144928, + "learning_rate": 4.7316913918899644e-05, + "loss": 5.2909, + "step": 25025 + }, + { + "epoch": 0.14883671139023694, + "grad_norm": 1.1516233682632446, + "learning_rate": 4.731670339469141e-05, + "loss": 5.3357, + "step": 25026 + }, + { + "epoch": 0.14884265867351792, + "grad_norm": 1.7736589908599854, + "learning_rate": 4.731649286269265e-05, + "loss": 5.1258, + "step": 25027 + }, + { + "epoch": 0.14884860595679894, + "grad_norm": 1.8994569778442383, + "learning_rate": 4.731628232290344e-05, + "loss": 5.5661, + "step": 25028 + }, + { + "epoch": 0.14885455324007993, + "grad_norm": 1.7552026510238647, + "learning_rate": 4.731607177532384e-05, + "loss": 5.3648, + "step": 25029 + }, + { + "epoch": 0.14886050052336092, + "grad_norm": 2.8771791458129883, + "learning_rate": 4.731586121995393e-05, + "loss": 4.6516, + "step": 25030 + }, + { + "epoch": 0.14886644780664193, + "grad_norm": 2.073287010192871, + "learning_rate": 4.731565065679379e-05, + "loss": 4.8374, + "step": 25031 + }, + { + "epoch": 0.14887239508992292, + "grad_norm": 1.6661057472229004, + "learning_rate": 4.7315440085843476e-05, + "loss": 5.0031, + "step": 25032 + }, + { + "epoch": 0.1488783423732039, + "grad_norm": 2.286806106567383, + "learning_rate": 4.7315229507103084e-05, + "loss": 4.3394, + "step": 25033 + }, + { + "epoch": 0.14888428965648492, + "grad_norm": 2.3657538890838623, + "learning_rate": 4.7315018920572666e-05, + "loss": 4.4455, + "step": 25034 + }, + { + "epoch": 0.1488902369397659, + "grad_norm": 2.1653788089752197, + "learning_rate": 4.7314808326252316e-05, + "loss": 4.5676, + "step": 25035 + }, + { + "epoch": 0.1488961842230469, + "grad_norm": 1.853837251663208, + "learning_rate": 4.731459772414208e-05, + "loss": 4.4169, + "step": 25036 + }, + { + "epoch": 0.14890213150632792, + "grad_norm": 2.1202454566955566, + "learning_rate": 4.7314387114242064e-05, + "loss": 4.4917, + "step": 25037 + }, + { + "epoch": 0.1489080787896089, + "grad_norm": 2.1203508377075195, + "learning_rate": 4.731417649655232e-05, + "loss": 4.2212, + "step": 25038 + }, + { + "epoch": 0.1489140260728899, + "grad_norm": 2.220571994781494, + "learning_rate": 4.731396587107293e-05, + "loss": 4.3678, + "step": 25039 + }, + { + "epoch": 0.1489199733561709, + "grad_norm": 1.9346973896026611, + "learning_rate": 4.731375523780397e-05, + "loss": 3.9189, + "step": 25040 + }, + { + "epoch": 0.1489259206394519, + "grad_norm": 2.1453700065612793, + "learning_rate": 4.731354459674549e-05, + "loss": 5.4543, + "step": 25041 + }, + { + "epoch": 0.14893186792273289, + "grad_norm": 2.7248880863189697, + "learning_rate": 4.73133339478976e-05, + "loss": 4.419, + "step": 25042 + }, + { + "epoch": 0.1489378152060139, + "grad_norm": 2.675060510635376, + "learning_rate": 4.731312329126035e-05, + "loss": 4.2858, + "step": 25043 + }, + { + "epoch": 0.1489437624892949, + "grad_norm": 2.5627496242523193, + "learning_rate": 4.731291262683382e-05, + "loss": 4.3065, + "step": 25044 + }, + { + "epoch": 0.14894970977257588, + "grad_norm": 2.238367795944214, + "learning_rate": 4.7312701954618086e-05, + "loss": 4.1853, + "step": 25045 + }, + { + "epoch": 0.1489556570558569, + "grad_norm": 2.144697427749634, + "learning_rate": 4.731249127461321e-05, + "loss": 4.5655, + "step": 25046 + }, + { + "epoch": 0.14896160433913788, + "grad_norm": 1.676389455795288, + "learning_rate": 4.731228058681928e-05, + "loss": 4.8332, + "step": 25047 + }, + { + "epoch": 0.14896755162241887, + "grad_norm": 2.7558321952819824, + "learning_rate": 4.7312069891236364e-05, + "loss": 3.5354, + "step": 25048 + }, + { + "epoch": 0.14897349890569989, + "grad_norm": 1.8224084377288818, + "learning_rate": 4.731185918786453e-05, + "loss": 4.8105, + "step": 25049 + }, + { + "epoch": 0.14897944618898087, + "grad_norm": 1.8380038738250732, + "learning_rate": 4.731164847670386e-05, + "loss": 4.8584, + "step": 25050 + }, + { + "epoch": 0.14898539347226186, + "grad_norm": 1.6260594129562378, + "learning_rate": 4.7311437757754425e-05, + "loss": 4.5548, + "step": 25051 + }, + { + "epoch": 0.14899134075554288, + "grad_norm": 1.490978717803955, + "learning_rate": 4.731122703101629e-05, + "loss": 4.7144, + "step": 25052 + }, + { + "epoch": 0.14899728803882387, + "grad_norm": 2.054363489151001, + "learning_rate": 4.731101629648954e-05, + "loss": 4.9561, + "step": 25053 + }, + { + "epoch": 0.14900323532210485, + "grad_norm": 2.431696891784668, + "learning_rate": 4.7310805554174255e-05, + "loss": 4.6347, + "step": 25054 + }, + { + "epoch": 0.14900918260538587, + "grad_norm": 2.9854423999786377, + "learning_rate": 4.7310594804070485e-05, + "loss": 4.3526, + "step": 25055 + }, + { + "epoch": 0.14901512988866686, + "grad_norm": 2.859827756881714, + "learning_rate": 4.731038404617832e-05, + "loss": 4.3427, + "step": 25056 + }, + { + "epoch": 0.14902107717194785, + "grad_norm": 2.866624593734741, + "learning_rate": 4.731017328049784e-05, + "loss": 4.4747, + "step": 25057 + }, + { + "epoch": 0.14902702445522886, + "grad_norm": 2.0833802223205566, + "learning_rate": 4.730996250702909e-05, + "loss": 4.1979, + "step": 25058 + }, + { + "epoch": 0.14903297173850985, + "grad_norm": 2.095679521560669, + "learning_rate": 4.7309751725772176e-05, + "loss": 4.2466, + "step": 25059 + }, + { + "epoch": 0.14903891902179084, + "grad_norm": 2.3466885089874268, + "learning_rate": 4.730954093672716e-05, + "loss": 4.3074, + "step": 25060 + }, + { + "epoch": 0.14904486630507185, + "grad_norm": 2.1188759803771973, + "learning_rate": 4.730933013989411e-05, + "loss": 4.2482, + "step": 25061 + }, + { + "epoch": 0.14905081358835284, + "grad_norm": 2.1638059616088867, + "learning_rate": 4.73091193352731e-05, + "loss": 4.1506, + "step": 25062 + }, + { + "epoch": 0.14905676087163383, + "grad_norm": 2.035240650177002, + "learning_rate": 4.7308908522864215e-05, + "loss": 4.4322, + "step": 25063 + }, + { + "epoch": 0.14906270815491485, + "grad_norm": 2.375912666320801, + "learning_rate": 4.730869770266751e-05, + "loss": 5.3206, + "step": 25064 + }, + { + "epoch": 0.14906865543819584, + "grad_norm": 1.8899742364883423, + "learning_rate": 4.7308486874683075e-05, + "loss": 5.1336, + "step": 25065 + }, + { + "epoch": 0.14907460272147682, + "grad_norm": 1.7068132162094116, + "learning_rate": 4.730827603891098e-05, + "loss": 5.0085, + "step": 25066 + }, + { + "epoch": 0.14908055000475784, + "grad_norm": 1.737470269203186, + "learning_rate": 4.730806519535129e-05, + "loss": 5.9056, + "step": 25067 + }, + { + "epoch": 0.14908649728803883, + "grad_norm": 1.251652717590332, + "learning_rate": 4.730785434400409e-05, + "loss": 5.3772, + "step": 25068 + }, + { + "epoch": 0.14909244457131982, + "grad_norm": 1.2134002447128296, + "learning_rate": 4.730764348486945e-05, + "loss": 5.4202, + "step": 25069 + }, + { + "epoch": 0.14909839185460083, + "grad_norm": 1.028356671333313, + "learning_rate": 4.730743261794743e-05, + "loss": 5.4883, + "step": 25070 + }, + { + "epoch": 0.14910433913788182, + "grad_norm": 1.3931416273117065, + "learning_rate": 4.730722174323813e-05, + "loss": 5.3274, + "step": 25071 + }, + { + "epoch": 0.1491102864211628, + "grad_norm": 1.2539725303649902, + "learning_rate": 4.7307010860741607e-05, + "loss": 5.2628, + "step": 25072 + }, + { + "epoch": 0.14911623370444382, + "grad_norm": 1.2422703504562378, + "learning_rate": 4.730679997045793e-05, + "loss": 5.1639, + "step": 25073 + }, + { + "epoch": 0.1491221809877248, + "grad_norm": 1.4616423845291138, + "learning_rate": 4.730658907238719e-05, + "loss": 5.0979, + "step": 25074 + }, + { + "epoch": 0.1491281282710058, + "grad_norm": 1.2968275547027588, + "learning_rate": 4.730637816652944e-05, + "loss": 5.0785, + "step": 25075 + }, + { + "epoch": 0.1491340755542868, + "grad_norm": 1.304254412651062, + "learning_rate": 4.730616725288477e-05, + "loss": 5.4885, + "step": 25076 + }, + { + "epoch": 0.1491400228375678, + "grad_norm": 2.3498852252960205, + "learning_rate": 4.730595633145324e-05, + "loss": 5.3064, + "step": 25077 + }, + { + "epoch": 0.1491459701208488, + "grad_norm": 1.7321240901947021, + "learning_rate": 4.730574540223493e-05, + "loss": 5.1844, + "step": 25078 + }, + { + "epoch": 0.14915191740412978, + "grad_norm": 1.903198480606079, + "learning_rate": 4.730553446522993e-05, + "loss": 5.5481, + "step": 25079 + }, + { + "epoch": 0.1491578646874108, + "grad_norm": 1.659658670425415, + "learning_rate": 4.7305323520438285e-05, + "loss": 5.3265, + "step": 25080 + }, + { + "epoch": 0.14916381197069178, + "grad_norm": 1.4510316848754883, + "learning_rate": 4.7305112567860085e-05, + "loss": 5.2607, + "step": 25081 + }, + { + "epoch": 0.14916975925397277, + "grad_norm": 1.5634890794754028, + "learning_rate": 4.73049016074954e-05, + "loss": 5.1961, + "step": 25082 + }, + { + "epoch": 0.1491757065372538, + "grad_norm": 1.5400700569152832, + "learning_rate": 4.730469063934431e-05, + "loss": 5.6441, + "step": 25083 + }, + { + "epoch": 0.14918165382053478, + "grad_norm": 1.814353108406067, + "learning_rate": 4.730447966340688e-05, + "loss": 5.1855, + "step": 25084 + }, + { + "epoch": 0.14918760110381576, + "grad_norm": 2.3644423484802246, + "learning_rate": 4.7304268679683184e-05, + "loss": 4.5312, + "step": 25085 + }, + { + "epoch": 0.14919354838709678, + "grad_norm": 2.6960058212280273, + "learning_rate": 4.73040576881733e-05, + "loss": 4.3128, + "step": 25086 + }, + { + "epoch": 0.14919949567037777, + "grad_norm": 2.50162410736084, + "learning_rate": 4.73038466888773e-05, + "loss": 4.3356, + "step": 25087 + }, + { + "epoch": 0.14920544295365876, + "grad_norm": 1.938988208770752, + "learning_rate": 4.730363568179526e-05, + "loss": 4.6391, + "step": 25088 + }, + { + "epoch": 0.14921139023693977, + "grad_norm": 2.0165152549743652, + "learning_rate": 4.730342466692725e-05, + "loss": 5.3267, + "step": 25089 + }, + { + "epoch": 0.14921733752022076, + "grad_norm": 2.3626153469085693, + "learning_rate": 4.7303213644273345e-05, + "loss": 5.2551, + "step": 25090 + }, + { + "epoch": 0.14922328480350175, + "grad_norm": 2.1070075035095215, + "learning_rate": 4.730300261383361e-05, + "loss": 5.2231, + "step": 25091 + }, + { + "epoch": 0.14922923208678276, + "grad_norm": 1.6806228160858154, + "learning_rate": 4.7302791575608144e-05, + "loss": 5.5844, + "step": 25092 + }, + { + "epoch": 0.14923517937006375, + "grad_norm": 2.149728298187256, + "learning_rate": 4.7302580529596985e-05, + "loss": 4.7185, + "step": 25093 + }, + { + "epoch": 0.14924112665334474, + "grad_norm": 1.93796968460083, + "learning_rate": 4.730236947580024e-05, + "loss": 4.7622, + "step": 25094 + }, + { + "epoch": 0.14924707393662576, + "grad_norm": 1.7360033988952637, + "learning_rate": 4.7302158414217964e-05, + "loss": 4.7068, + "step": 25095 + }, + { + "epoch": 0.14925302121990675, + "grad_norm": 1.712073564529419, + "learning_rate": 4.730194734485023e-05, + "loss": 4.8146, + "step": 25096 + }, + { + "epoch": 0.14925896850318773, + "grad_norm": 1.789083480834961, + "learning_rate": 4.730173626769712e-05, + "loss": 4.774, + "step": 25097 + }, + { + "epoch": 0.14926491578646875, + "grad_norm": 1.9072470664978027, + "learning_rate": 4.730152518275871e-05, + "loss": 4.9099, + "step": 25098 + }, + { + "epoch": 0.14927086306974974, + "grad_norm": 1.7209197282791138, + "learning_rate": 4.730131409003506e-05, + "loss": 4.7141, + "step": 25099 + }, + { + "epoch": 0.14927681035303073, + "grad_norm": 1.8528800010681152, + "learning_rate": 4.730110298952625e-05, + "loss": 4.9741, + "step": 25100 + }, + { + "epoch": 0.14928275763631174, + "grad_norm": 1.9865680932998657, + "learning_rate": 4.7300891881232365e-05, + "loss": 4.9079, + "step": 25101 + }, + { + "epoch": 0.14928870491959273, + "grad_norm": 2.1327319145202637, + "learning_rate": 4.730068076515346e-05, + "loss": 4.9929, + "step": 25102 + }, + { + "epoch": 0.14929465220287372, + "grad_norm": 1.856972336769104, + "learning_rate": 4.730046964128962e-05, + "loss": 4.935, + "step": 25103 + }, + { + "epoch": 0.14930059948615473, + "grad_norm": 1.9982047080993652, + "learning_rate": 4.7300258509640924e-05, + "loss": 5.1254, + "step": 25104 + }, + { + "epoch": 0.14930654676943572, + "grad_norm": 1.866350531578064, + "learning_rate": 4.730004737020743e-05, + "loss": 5.0198, + "step": 25105 + }, + { + "epoch": 0.1493124940527167, + "grad_norm": 1.8669421672821045, + "learning_rate": 4.729983622298922e-05, + "loss": 4.817, + "step": 25106 + }, + { + "epoch": 0.14931844133599773, + "grad_norm": 2.3156704902648926, + "learning_rate": 4.7299625067986366e-05, + "loss": 4.9341, + "step": 25107 + }, + { + "epoch": 0.14932438861927871, + "grad_norm": 2.304932117462158, + "learning_rate": 4.7299413905198956e-05, + "loss": 4.908, + "step": 25108 + }, + { + "epoch": 0.1493303359025597, + "grad_norm": 2.0287182331085205, + "learning_rate": 4.7299202734627035e-05, + "loss": 4.9244, + "step": 25109 + }, + { + "epoch": 0.14933628318584072, + "grad_norm": 2.554980754852295, + "learning_rate": 4.72989915562707e-05, + "loss": 4.7163, + "step": 25110 + }, + { + "epoch": 0.1493422304691217, + "grad_norm": 2.76092791557312, + "learning_rate": 4.7298780370130014e-05, + "loss": 5.293, + "step": 25111 + }, + { + "epoch": 0.1493481777524027, + "grad_norm": 2.203293561935425, + "learning_rate": 4.729856917620506e-05, + "loss": 4.8891, + "step": 25112 + }, + { + "epoch": 0.1493541250356837, + "grad_norm": 2.2550253868103027, + "learning_rate": 4.7298357974495905e-05, + "loss": 5.1578, + "step": 25113 + }, + { + "epoch": 0.1493600723189647, + "grad_norm": 2.41914963722229, + "learning_rate": 4.7298146765002624e-05, + "loss": 5.0363, + "step": 25114 + }, + { + "epoch": 0.1493660196022457, + "grad_norm": 2.058586359024048, + "learning_rate": 4.729793554772528e-05, + "loss": 4.9537, + "step": 25115 + }, + { + "epoch": 0.1493719668855267, + "grad_norm": 2.3880207538604736, + "learning_rate": 4.729772432266397e-05, + "loss": 4.9701, + "step": 25116 + }, + { + "epoch": 0.1493779141688077, + "grad_norm": 2.012542247772217, + "learning_rate": 4.7297513089818745e-05, + "loss": 5.0596, + "step": 25117 + }, + { + "epoch": 0.14938386145208868, + "grad_norm": 1.9091664552688599, + "learning_rate": 4.72973018491897e-05, + "loss": 5.0199, + "step": 25118 + }, + { + "epoch": 0.1493898087353697, + "grad_norm": 1.9325292110443115, + "learning_rate": 4.7297090600776886e-05, + "loss": 4.9442, + "step": 25119 + }, + { + "epoch": 0.14939575601865068, + "grad_norm": 2.106926918029785, + "learning_rate": 4.729687934458039e-05, + "loss": 4.8628, + "step": 25120 + }, + { + "epoch": 0.14940170330193167, + "grad_norm": 1.7365446090698242, + "learning_rate": 4.729666808060029e-05, + "loss": 4.8492, + "step": 25121 + }, + { + "epoch": 0.1494076505852127, + "grad_norm": 1.9125512838363647, + "learning_rate": 4.729645680883665e-05, + "loss": 4.9389, + "step": 25122 + }, + { + "epoch": 0.14941359786849367, + "grad_norm": 2.0423247814178467, + "learning_rate": 4.729624552928954e-05, + "loss": 4.8626, + "step": 25123 + }, + { + "epoch": 0.14941954515177466, + "grad_norm": 1.9502712488174438, + "learning_rate": 4.729603424195905e-05, + "loss": 5.0237, + "step": 25124 + }, + { + "epoch": 0.14942549243505568, + "grad_norm": 2.0014281272888184, + "learning_rate": 4.7295822946845245e-05, + "loss": 4.9913, + "step": 25125 + }, + { + "epoch": 0.14943143971833667, + "grad_norm": 1.9854202270507812, + "learning_rate": 4.7295611643948204e-05, + "loss": 4.9394, + "step": 25126 + }, + { + "epoch": 0.14943738700161766, + "grad_norm": 1.7897859811782837, + "learning_rate": 4.729540033326798e-05, + "loss": 4.9434, + "step": 25127 + }, + { + "epoch": 0.14944333428489867, + "grad_norm": 2.092635154724121, + "learning_rate": 4.7295189014804676e-05, + "loss": 4.9032, + "step": 25128 + }, + { + "epoch": 0.14944928156817966, + "grad_norm": 1.9637115001678467, + "learning_rate": 4.729497768855834e-05, + "loss": 4.7775, + "step": 25129 + }, + { + "epoch": 0.14945522885146065, + "grad_norm": 1.8016657829284668, + "learning_rate": 4.729476635452906e-05, + "loss": 4.791, + "step": 25130 + }, + { + "epoch": 0.14946117613474166, + "grad_norm": 2.326096534729004, + "learning_rate": 4.7294555012716915e-05, + "loss": 5.3299, + "step": 25131 + }, + { + "epoch": 0.14946712341802265, + "grad_norm": 2.1310572624206543, + "learning_rate": 4.7294343663121965e-05, + "loss": 5.1919, + "step": 25132 + }, + { + "epoch": 0.14947307070130364, + "grad_norm": 2.3155853748321533, + "learning_rate": 4.72941323057443e-05, + "loss": 5.0858, + "step": 25133 + }, + { + "epoch": 0.14947901798458466, + "grad_norm": 2.049995183944702, + "learning_rate": 4.729392094058397e-05, + "loss": 5.065, + "step": 25134 + }, + { + "epoch": 0.14948496526786564, + "grad_norm": 1.8955172300338745, + "learning_rate": 4.729370956764107e-05, + "loss": 5.1361, + "step": 25135 + }, + { + "epoch": 0.14949091255114663, + "grad_norm": 3.226020336151123, + "learning_rate": 4.729349818691567e-05, + "loss": 4.7323, + "step": 25136 + }, + { + "epoch": 0.14949685983442762, + "grad_norm": 3.1648058891296387, + "learning_rate": 4.7293286798407833e-05, + "loss": 4.6663, + "step": 25137 + }, + { + "epoch": 0.14950280711770864, + "grad_norm": 2.2341058254241943, + "learning_rate": 4.729307540211764e-05, + "loss": 4.584, + "step": 25138 + }, + { + "epoch": 0.14950875440098962, + "grad_norm": 2.088019609451294, + "learning_rate": 4.729286399804517e-05, + "loss": 4.5618, + "step": 25139 + }, + { + "epoch": 0.1495147016842706, + "grad_norm": 1.8777929544448853, + "learning_rate": 4.729265258619048e-05, + "loss": 5.0011, + "step": 25140 + }, + { + "epoch": 0.14952064896755163, + "grad_norm": 2.080986261367798, + "learning_rate": 4.729244116655366e-05, + "loss": 5.6192, + "step": 25141 + }, + { + "epoch": 0.14952659625083262, + "grad_norm": 1.9895329475402832, + "learning_rate": 4.729222973913479e-05, + "loss": 5.8569, + "step": 25142 + }, + { + "epoch": 0.1495325435341136, + "grad_norm": 2.0990312099456787, + "learning_rate": 4.7292018303933924e-05, + "loss": 5.772, + "step": 25143 + }, + { + "epoch": 0.14953849081739462, + "grad_norm": 1.8530125617980957, + "learning_rate": 4.7291806860951145e-05, + "loss": 5.9042, + "step": 25144 + }, + { + "epoch": 0.1495444381006756, + "grad_norm": 1.7631386518478394, + "learning_rate": 4.7291595410186526e-05, + "loss": 5.7611, + "step": 25145 + }, + { + "epoch": 0.1495503853839566, + "grad_norm": 1.4668217897415161, + "learning_rate": 4.729138395164015e-05, + "loss": 5.4997, + "step": 25146 + }, + { + "epoch": 0.1495563326672376, + "grad_norm": 1.2580885887145996, + "learning_rate": 4.729117248531206e-05, + "loss": 5.5554, + "step": 25147 + }, + { + "epoch": 0.1495622799505186, + "grad_norm": 1.612502932548523, + "learning_rate": 4.7290961011202375e-05, + "loss": 5.0982, + "step": 25148 + }, + { + "epoch": 0.1495682272337996, + "grad_norm": 1.6753286123275757, + "learning_rate": 4.729074952931114e-05, + "loss": 4.9553, + "step": 25149 + }, + { + "epoch": 0.1495741745170806, + "grad_norm": 1.530179738998413, + "learning_rate": 4.729053803963843e-05, + "loss": 4.9314, + "step": 25150 + }, + { + "epoch": 0.1495801218003616, + "grad_norm": 1.5077494382858276, + "learning_rate": 4.729032654218433e-05, + "loss": 5.4957, + "step": 25151 + }, + { + "epoch": 0.14958606908364258, + "grad_norm": 1.6995402574539185, + "learning_rate": 4.72901150369489e-05, + "loss": 5.7406, + "step": 25152 + }, + { + "epoch": 0.1495920163669236, + "grad_norm": 1.4611583948135376, + "learning_rate": 4.728990352393222e-05, + "loss": 5.5632, + "step": 25153 + }, + { + "epoch": 0.14959796365020459, + "grad_norm": 1.775568962097168, + "learning_rate": 4.728969200313437e-05, + "loss": 5.1666, + "step": 25154 + }, + { + "epoch": 0.14960391093348557, + "grad_norm": 1.6890829801559448, + "learning_rate": 4.728948047455541e-05, + "loss": 5.1776, + "step": 25155 + }, + { + "epoch": 0.1496098582167666, + "grad_norm": 1.7455476522445679, + "learning_rate": 4.728926893819544e-05, + "loss": 5.0308, + "step": 25156 + }, + { + "epoch": 0.14961580550004758, + "grad_norm": 2.0798380374908447, + "learning_rate": 4.72890573940545e-05, + "loss": 4.8164, + "step": 25157 + }, + { + "epoch": 0.14962175278332857, + "grad_norm": 2.0280489921569824, + "learning_rate": 4.728884584213269e-05, + "loss": 4.7693, + "step": 25158 + }, + { + "epoch": 0.14962770006660958, + "grad_norm": 1.9629135131835938, + "learning_rate": 4.728863428243008e-05, + "loss": 4.9072, + "step": 25159 + }, + { + "epoch": 0.14963364734989057, + "grad_norm": 2.1143929958343506, + "learning_rate": 4.7288422714946724e-05, + "loss": 4.6828, + "step": 25160 + }, + { + "epoch": 0.14963959463317156, + "grad_norm": 1.9618384838104248, + "learning_rate": 4.7288211139682715e-05, + "loss": 5.0383, + "step": 25161 + }, + { + "epoch": 0.14964554191645257, + "grad_norm": 1.8829975128173828, + "learning_rate": 4.728799955663812e-05, + "loss": 5.5072, + "step": 25162 + }, + { + "epoch": 0.14965148919973356, + "grad_norm": 1.5670249462127686, + "learning_rate": 4.728778796581302e-05, + "loss": 5.1815, + "step": 25163 + }, + { + "epoch": 0.14965743648301455, + "grad_norm": 2.0932981967926025, + "learning_rate": 4.728757636720748e-05, + "loss": 5.0871, + "step": 25164 + }, + { + "epoch": 0.14966338376629557, + "grad_norm": 1.5827875137329102, + "learning_rate": 4.728736476082158e-05, + "loss": 5.0983, + "step": 25165 + }, + { + "epoch": 0.14966933104957655, + "grad_norm": 1.7353198528289795, + "learning_rate": 4.728715314665538e-05, + "loss": 4.8113, + "step": 25166 + }, + { + "epoch": 0.14967527833285754, + "grad_norm": 1.6395387649536133, + "learning_rate": 4.728694152470898e-05, + "loss": 4.9403, + "step": 25167 + }, + { + "epoch": 0.14968122561613856, + "grad_norm": 1.8546936511993408, + "learning_rate": 4.7286729894982434e-05, + "loss": 4.9092, + "step": 25168 + }, + { + "epoch": 0.14968717289941955, + "grad_norm": 1.5432714223861694, + "learning_rate": 4.728651825747582e-05, + "loss": 4.8257, + "step": 25169 + }, + { + "epoch": 0.14969312018270053, + "grad_norm": 1.6309102773666382, + "learning_rate": 4.728630661218921e-05, + "loss": 5.5829, + "step": 25170 + }, + { + "epoch": 0.14969906746598155, + "grad_norm": 1.8060203790664673, + "learning_rate": 4.7286094959122685e-05, + "loss": 5.3099, + "step": 25171 + }, + { + "epoch": 0.14970501474926254, + "grad_norm": 1.8817297220230103, + "learning_rate": 4.728588329827631e-05, + "loss": 5.5393, + "step": 25172 + }, + { + "epoch": 0.14971096203254353, + "grad_norm": 1.806970477104187, + "learning_rate": 4.728567162965017e-05, + "loss": 5.8567, + "step": 25173 + }, + { + "epoch": 0.14971690931582454, + "grad_norm": 1.6101081371307373, + "learning_rate": 4.728545995324433e-05, + "loss": 5.5389, + "step": 25174 + }, + { + "epoch": 0.14972285659910553, + "grad_norm": 1.5525349378585815, + "learning_rate": 4.7285248269058854e-05, + "loss": 5.6075, + "step": 25175 + }, + { + "epoch": 0.14972880388238652, + "grad_norm": 1.543853998184204, + "learning_rate": 4.7285036577093844e-05, + "loss": 5.5287, + "step": 25176 + }, + { + "epoch": 0.14973475116566753, + "grad_norm": 1.5811434984207153, + "learning_rate": 4.728482487734935e-05, + "loss": 5.5584, + "step": 25177 + }, + { + "epoch": 0.14974069844894852, + "grad_norm": 1.2957634925842285, + "learning_rate": 4.728461316982546e-05, + "loss": 5.5264, + "step": 25178 + }, + { + "epoch": 0.1497466457322295, + "grad_norm": 1.3600691556930542, + "learning_rate": 4.728440145452224e-05, + "loss": 5.5781, + "step": 25179 + }, + { + "epoch": 0.14975259301551053, + "grad_norm": 1.3423492908477783, + "learning_rate": 4.7284189731439764e-05, + "loss": 5.535, + "step": 25180 + }, + { + "epoch": 0.14975854029879151, + "grad_norm": 1.5586212873458862, + "learning_rate": 4.7283978000578107e-05, + "loss": 5.0746, + "step": 25181 + }, + { + "epoch": 0.1497644875820725, + "grad_norm": 1.8833614587783813, + "learning_rate": 4.7283766261937346e-05, + "loss": 4.6121, + "step": 25182 + }, + { + "epoch": 0.14977043486535352, + "grad_norm": 1.8890469074249268, + "learning_rate": 4.728355451551755e-05, + "loss": 4.5572, + "step": 25183 + }, + { + "epoch": 0.1497763821486345, + "grad_norm": 1.7143722772598267, + "learning_rate": 4.728334276131879e-05, + "loss": 4.5289, + "step": 25184 + }, + { + "epoch": 0.1497823294319155, + "grad_norm": 1.766708493232727, + "learning_rate": 4.728313099934115e-05, + "loss": 4.6957, + "step": 25185 + }, + { + "epoch": 0.1497882767151965, + "grad_norm": 1.8504046201705933, + "learning_rate": 4.72829192295847e-05, + "loss": 4.8764, + "step": 25186 + }, + { + "epoch": 0.1497942239984775, + "grad_norm": 2.0711238384246826, + "learning_rate": 4.728270745204951e-05, + "loss": 4.9157, + "step": 25187 + }, + { + "epoch": 0.1498001712817585, + "grad_norm": 2.0366387367248535, + "learning_rate": 4.728249566673567e-05, + "loss": 4.9295, + "step": 25188 + }, + { + "epoch": 0.1498061185650395, + "grad_norm": 1.7883682250976562, + "learning_rate": 4.728228387364323e-05, + "loss": 5.1173, + "step": 25189 + }, + { + "epoch": 0.1498120658483205, + "grad_norm": 1.8308504819869995, + "learning_rate": 4.7282072072772276e-05, + "loss": 5.0593, + "step": 25190 + }, + { + "epoch": 0.14981801313160148, + "grad_norm": 1.5662436485290527, + "learning_rate": 4.728186026412288e-05, + "loss": 5.1499, + "step": 25191 + }, + { + "epoch": 0.1498239604148825, + "grad_norm": 1.8079571723937988, + "learning_rate": 4.728164844769511e-05, + "loss": 4.948, + "step": 25192 + }, + { + "epoch": 0.14982990769816348, + "grad_norm": 1.681217908859253, + "learning_rate": 4.728143662348906e-05, + "loss": 5.3433, + "step": 25193 + }, + { + "epoch": 0.14983585498144447, + "grad_norm": 1.5585112571716309, + "learning_rate": 4.7281224791504784e-05, + "loss": 5.6366, + "step": 25194 + }, + { + "epoch": 0.14984180226472546, + "grad_norm": 1.8676329851150513, + "learning_rate": 4.7281012951742364e-05, + "loss": 5.1824, + "step": 25195 + }, + { + "epoch": 0.14984774954800648, + "grad_norm": 2.227149248123169, + "learning_rate": 4.728080110420188e-05, + "loss": 5.0203, + "step": 25196 + }, + { + "epoch": 0.14985369683128746, + "grad_norm": 1.6362202167510986, + "learning_rate": 4.728058924888339e-05, + "loss": 5.1942, + "step": 25197 + }, + { + "epoch": 0.14985964411456845, + "grad_norm": 1.9886643886566162, + "learning_rate": 4.7280377385786976e-05, + "loss": 5.4607, + "step": 25198 + }, + { + "epoch": 0.14986559139784947, + "grad_norm": 1.8965426683425903, + "learning_rate": 4.728016551491271e-05, + "loss": 5.4426, + "step": 25199 + }, + { + "epoch": 0.14987153868113046, + "grad_norm": 1.7106379270553589, + "learning_rate": 4.7279953636260677e-05, + "loss": 5.2894, + "step": 25200 + }, + { + "epoch": 0.14987748596441144, + "grad_norm": 1.5771503448486328, + "learning_rate": 4.727974174983093e-05, + "loss": 5.7972, + "step": 25201 + }, + { + "epoch": 0.14988343324769246, + "grad_norm": 1.4394875764846802, + "learning_rate": 4.727952985562357e-05, + "loss": 5.4622, + "step": 25202 + }, + { + "epoch": 0.14988938053097345, + "grad_norm": 1.421237826347351, + "learning_rate": 4.727931795363864e-05, + "loss": 5.5927, + "step": 25203 + }, + { + "epoch": 0.14989532781425444, + "grad_norm": 1.4579883813858032, + "learning_rate": 4.727910604387624e-05, + "loss": 5.6534, + "step": 25204 + }, + { + "epoch": 0.14990127509753545, + "grad_norm": 1.5861623287200928, + "learning_rate": 4.727889412633644e-05, + "loss": 5.423, + "step": 25205 + }, + { + "epoch": 0.14990722238081644, + "grad_norm": 1.1634724140167236, + "learning_rate": 4.72786822010193e-05, + "loss": 5.5339, + "step": 25206 + }, + { + "epoch": 0.14991316966409743, + "grad_norm": 1.3486993312835693, + "learning_rate": 4.72784702679249e-05, + "loss": 5.572, + "step": 25207 + }, + { + "epoch": 0.14991911694737844, + "grad_norm": 1.1783596277236938, + "learning_rate": 4.727825832705333e-05, + "loss": 5.4949, + "step": 25208 + }, + { + "epoch": 0.14992506423065943, + "grad_norm": 1.405774712562561, + "learning_rate": 4.727804637840464e-05, + "loss": 5.4044, + "step": 25209 + }, + { + "epoch": 0.14993101151394042, + "grad_norm": 1.4211558103561401, + "learning_rate": 4.727783442197891e-05, + "loss": 5.3778, + "step": 25210 + }, + { + "epoch": 0.14993695879722144, + "grad_norm": 1.572511076927185, + "learning_rate": 4.727762245777623e-05, + "loss": 5.4308, + "step": 25211 + }, + { + "epoch": 0.14994290608050242, + "grad_norm": 1.4699571132659912, + "learning_rate": 4.727741048579665e-05, + "loss": 5.3195, + "step": 25212 + }, + { + "epoch": 0.1499488533637834, + "grad_norm": 1.231878399848938, + "learning_rate": 4.727719850604026e-05, + "loss": 5.2663, + "step": 25213 + }, + { + "epoch": 0.14995480064706443, + "grad_norm": 1.3779250383377075, + "learning_rate": 4.7276986518507136e-05, + "loss": 5.1489, + "step": 25214 + }, + { + "epoch": 0.14996074793034542, + "grad_norm": 2.058643341064453, + "learning_rate": 4.7276774523197334e-05, + "loss": 5.4943, + "step": 25215 + }, + { + "epoch": 0.1499666952136264, + "grad_norm": 2.3679542541503906, + "learning_rate": 4.727656252011095e-05, + "loss": 4.688, + "step": 25216 + }, + { + "epoch": 0.14997264249690742, + "grad_norm": 2.2339799404144287, + "learning_rate": 4.727635050924805e-05, + "loss": 5.1016, + "step": 25217 + }, + { + "epoch": 0.1499785897801884, + "grad_norm": 1.536407709121704, + "learning_rate": 4.72761384906087e-05, + "loss": 5.2741, + "step": 25218 + }, + { + "epoch": 0.1499845370634694, + "grad_norm": 1.6192244291305542, + "learning_rate": 4.7275926464192985e-05, + "loss": 5.0808, + "step": 25219 + }, + { + "epoch": 0.1499904843467504, + "grad_norm": 1.6183874607086182, + "learning_rate": 4.727571443000097e-05, + "loss": 5.4735, + "step": 25220 + }, + { + "epoch": 0.1499964316300314, + "grad_norm": 1.5945466756820679, + "learning_rate": 4.7275502388032736e-05, + "loss": 5.7213, + "step": 25221 + }, + { + "epoch": 0.1500023789133124, + "grad_norm": 1.455883264541626, + "learning_rate": 4.727529033828835e-05, + "loss": 5.588, + "step": 25222 + }, + { + "epoch": 0.1500083261965934, + "grad_norm": 1.6111440658569336, + "learning_rate": 4.727507828076789e-05, + "loss": 5.0907, + "step": 25223 + }, + { + "epoch": 0.1500142734798744, + "grad_norm": 1.6382368803024292, + "learning_rate": 4.727486621547144e-05, + "loss": 5.2271, + "step": 25224 + }, + { + "epoch": 0.15002022076315538, + "grad_norm": 1.637136697769165, + "learning_rate": 4.7274654142399056e-05, + "loss": 4.9102, + "step": 25225 + }, + { + "epoch": 0.1500261680464364, + "grad_norm": 1.8395768404006958, + "learning_rate": 4.727444206155082e-05, + "loss": 5.0519, + "step": 25226 + }, + { + "epoch": 0.15003211532971739, + "grad_norm": 1.7471513748168945, + "learning_rate": 4.727422997292681e-05, + "loss": 5.2439, + "step": 25227 + }, + { + "epoch": 0.15003806261299837, + "grad_norm": 2.3117516040802, + "learning_rate": 4.72740178765271e-05, + "loss": 5.1935, + "step": 25228 + }, + { + "epoch": 0.1500440098962794, + "grad_norm": 2.0054478645324707, + "learning_rate": 4.727380577235175e-05, + "loss": 5.2919, + "step": 25229 + }, + { + "epoch": 0.15004995717956038, + "grad_norm": 1.9058947563171387, + "learning_rate": 4.727359366040085e-05, + "loss": 4.8624, + "step": 25230 + }, + { + "epoch": 0.15005590446284137, + "grad_norm": 1.746030569076538, + "learning_rate": 4.727338154067447e-05, + "loss": 4.9731, + "step": 25231 + }, + { + "epoch": 0.15006185174612238, + "grad_norm": 1.693912386894226, + "learning_rate": 4.727316941317268e-05, + "loss": 4.948, + "step": 25232 + }, + { + "epoch": 0.15006779902940337, + "grad_norm": 1.742431640625, + "learning_rate": 4.727295727789556e-05, + "loss": 4.9891, + "step": 25233 + }, + { + "epoch": 0.15007374631268436, + "grad_norm": 2.8610570430755615, + "learning_rate": 4.7272745134843175e-05, + "loss": 3.9769, + "step": 25234 + }, + { + "epoch": 0.15007969359596537, + "grad_norm": 1.6757450103759766, + "learning_rate": 4.72725329840156e-05, + "loss": 5.4376, + "step": 25235 + }, + { + "epoch": 0.15008564087924636, + "grad_norm": 1.6358832120895386, + "learning_rate": 4.727232082541293e-05, + "loss": 5.6665, + "step": 25236 + }, + { + "epoch": 0.15009158816252735, + "grad_norm": 1.8907593488693237, + "learning_rate": 4.727210865903522e-05, + "loss": 5.4225, + "step": 25237 + }, + { + "epoch": 0.15009753544580837, + "grad_norm": 1.5822373628616333, + "learning_rate": 4.727189648488254e-05, + "loss": 5.5356, + "step": 25238 + }, + { + "epoch": 0.15010348272908935, + "grad_norm": 1.626504898071289, + "learning_rate": 4.7271684302954974e-05, + "loss": 5.2066, + "step": 25239 + }, + { + "epoch": 0.15010943001237034, + "grad_norm": 1.7297816276550293, + "learning_rate": 4.727147211325259e-05, + "loss": 5.109, + "step": 25240 + }, + { + "epoch": 0.15011537729565136, + "grad_norm": 1.6709920167922974, + "learning_rate": 4.727125991577547e-05, + "loss": 5.2468, + "step": 25241 + }, + { + "epoch": 0.15012132457893235, + "grad_norm": 1.5390464067459106, + "learning_rate": 4.727104771052368e-05, + "loss": 5.237, + "step": 25242 + }, + { + "epoch": 0.15012727186221334, + "grad_norm": 1.4673635959625244, + "learning_rate": 4.72708354974973e-05, + "loss": 5.2971, + "step": 25243 + }, + { + "epoch": 0.15013321914549435, + "grad_norm": 1.6094917058944702, + "learning_rate": 4.7270623276696394e-05, + "loss": 5.3539, + "step": 25244 + }, + { + "epoch": 0.15013916642877534, + "grad_norm": 1.697434902191162, + "learning_rate": 4.727041104812105e-05, + "loss": 4.9796, + "step": 25245 + }, + { + "epoch": 0.15014511371205633, + "grad_norm": 1.7680538892745972, + "learning_rate": 4.727019881177134e-05, + "loss": 5.0622, + "step": 25246 + }, + { + "epoch": 0.15015106099533734, + "grad_norm": 1.6313658952713013, + "learning_rate": 4.7269986567647324e-05, + "loss": 5.0507, + "step": 25247 + }, + { + "epoch": 0.15015700827861833, + "grad_norm": 1.6400883197784424, + "learning_rate": 4.72697743157491e-05, + "loss": 4.9752, + "step": 25248 + }, + { + "epoch": 0.15016295556189932, + "grad_norm": 1.6866703033447266, + "learning_rate": 4.726956205607671e-05, + "loss": 5.2475, + "step": 25249 + }, + { + "epoch": 0.15016890284518034, + "grad_norm": 1.5988578796386719, + "learning_rate": 4.7269349788630255e-05, + "loss": 4.9963, + "step": 25250 + }, + { + "epoch": 0.15017485012846132, + "grad_norm": 1.8661000728607178, + "learning_rate": 4.7269137513409796e-05, + "loss": 4.7149, + "step": 25251 + }, + { + "epoch": 0.1501807974117423, + "grad_norm": 1.5544322729110718, + "learning_rate": 4.726892523041541e-05, + "loss": 5.0037, + "step": 25252 + }, + { + "epoch": 0.1501867446950233, + "grad_norm": 1.6971745491027832, + "learning_rate": 4.726871293964718e-05, + "loss": 5.1207, + "step": 25253 + }, + { + "epoch": 0.15019269197830432, + "grad_norm": 1.508044958114624, + "learning_rate": 4.726850064110517e-05, + "loss": 5.3578, + "step": 25254 + }, + { + "epoch": 0.1501986392615853, + "grad_norm": 1.7235703468322754, + "learning_rate": 4.726828833478946e-05, + "loss": 5.3506, + "step": 25255 + }, + { + "epoch": 0.1502045865448663, + "grad_norm": 1.7117946147918701, + "learning_rate": 4.726807602070011e-05, + "loss": 5.0023, + "step": 25256 + }, + { + "epoch": 0.1502105338281473, + "grad_norm": 1.6594294309616089, + "learning_rate": 4.726786369883721e-05, + "loss": 4.8674, + "step": 25257 + }, + { + "epoch": 0.1502164811114283, + "grad_norm": 1.7046406269073486, + "learning_rate": 4.7267651369200825e-05, + "loss": 4.9614, + "step": 25258 + }, + { + "epoch": 0.15022242839470928, + "grad_norm": 1.6488447189331055, + "learning_rate": 4.726743903179104e-05, + "loss": 5.0612, + "step": 25259 + }, + { + "epoch": 0.1502283756779903, + "grad_norm": 1.5859414339065552, + "learning_rate": 4.726722668660792e-05, + "loss": 4.9399, + "step": 25260 + }, + { + "epoch": 0.1502343229612713, + "grad_norm": 2.1271414756774902, + "learning_rate": 4.726701433365154e-05, + "loss": 5.0729, + "step": 25261 + }, + { + "epoch": 0.15024027024455228, + "grad_norm": 1.9313926696777344, + "learning_rate": 4.726680197292198e-05, + "loss": 5.271, + "step": 25262 + }, + { + "epoch": 0.1502462175278333, + "grad_norm": 1.933329463005066, + "learning_rate": 4.72665896044193e-05, + "loss": 5.0125, + "step": 25263 + }, + { + "epoch": 0.15025216481111428, + "grad_norm": 1.7074263095855713, + "learning_rate": 4.726637722814359e-05, + "loss": 4.8612, + "step": 25264 + }, + { + "epoch": 0.15025811209439527, + "grad_norm": 2.2242465019226074, + "learning_rate": 4.7266164844094915e-05, + "loss": 4.5163, + "step": 25265 + }, + { + "epoch": 0.15026405937767628, + "grad_norm": 1.5982950925827026, + "learning_rate": 4.726595245227336e-05, + "loss": 5.2747, + "step": 25266 + }, + { + "epoch": 0.15027000666095727, + "grad_norm": 2.0305862426757812, + "learning_rate": 4.726574005267898e-05, + "loss": 4.6378, + "step": 25267 + }, + { + "epoch": 0.15027595394423826, + "grad_norm": 1.7604337930679321, + "learning_rate": 4.726552764531187e-05, + "loss": 5.0755, + "step": 25268 + }, + { + "epoch": 0.15028190122751928, + "grad_norm": 1.9310117959976196, + "learning_rate": 4.7265315230172087e-05, + "loss": 4.5722, + "step": 25269 + }, + { + "epoch": 0.15028784851080026, + "grad_norm": 1.7772380113601685, + "learning_rate": 4.726510280725972e-05, + "loss": 4.8739, + "step": 25270 + }, + { + "epoch": 0.15029379579408125, + "grad_norm": 1.635905385017395, + "learning_rate": 4.7264890376574824e-05, + "loss": 4.8656, + "step": 25271 + }, + { + "epoch": 0.15029974307736227, + "grad_norm": 1.7308213710784912, + "learning_rate": 4.7264677938117496e-05, + "loss": 4.8062, + "step": 25272 + }, + { + "epoch": 0.15030569036064326, + "grad_norm": 1.751625895500183, + "learning_rate": 4.7264465491887786e-05, + "loss": 4.9999, + "step": 25273 + }, + { + "epoch": 0.15031163764392425, + "grad_norm": 1.9022659063339233, + "learning_rate": 4.726425303788579e-05, + "loss": 4.3717, + "step": 25274 + }, + { + "epoch": 0.15031758492720526, + "grad_norm": 1.6903055906295776, + "learning_rate": 4.7264040576111576e-05, + "loss": 4.6601, + "step": 25275 + }, + { + "epoch": 0.15032353221048625, + "grad_norm": 1.7622424364089966, + "learning_rate": 4.726382810656521e-05, + "loss": 4.711, + "step": 25276 + }, + { + "epoch": 0.15032947949376724, + "grad_norm": 1.6687418222427368, + "learning_rate": 4.726361562924678e-05, + "loss": 4.8469, + "step": 25277 + }, + { + "epoch": 0.15033542677704825, + "grad_norm": 1.6430240869522095, + "learning_rate": 4.7263403144156334e-05, + "loss": 4.7209, + "step": 25278 + }, + { + "epoch": 0.15034137406032924, + "grad_norm": 1.8600574731826782, + "learning_rate": 4.726319065129398e-05, + "loss": 4.465, + "step": 25279 + }, + { + "epoch": 0.15034732134361023, + "grad_norm": 1.4847289323806763, + "learning_rate": 4.7262978150659776e-05, + "loss": 5.3048, + "step": 25280 + }, + { + "epoch": 0.15035326862689125, + "grad_norm": 1.5062929391860962, + "learning_rate": 4.726276564225379e-05, + "loss": 5.0202, + "step": 25281 + }, + { + "epoch": 0.15035921591017223, + "grad_norm": 1.999292254447937, + "learning_rate": 4.7262553126076106e-05, + "loss": 4.2882, + "step": 25282 + }, + { + "epoch": 0.15036516319345322, + "grad_norm": 1.7813308238983154, + "learning_rate": 4.7262340602126794e-05, + "loss": 4.7198, + "step": 25283 + }, + { + "epoch": 0.15037111047673424, + "grad_norm": 1.8029576539993286, + "learning_rate": 4.726212807040593e-05, + "loss": 4.9741, + "step": 25284 + }, + { + "epoch": 0.15037705776001523, + "grad_norm": 1.629035472869873, + "learning_rate": 4.726191553091358e-05, + "loss": 5.1917, + "step": 25285 + }, + { + "epoch": 0.15038300504329621, + "grad_norm": 1.54799222946167, + "learning_rate": 4.726170298364983e-05, + "loss": 4.9093, + "step": 25286 + }, + { + "epoch": 0.15038895232657723, + "grad_norm": 1.8892208337783813, + "learning_rate": 4.726149042861475e-05, + "loss": 4.2702, + "step": 25287 + }, + { + "epoch": 0.15039489960985822, + "grad_norm": 1.7078487873077393, + "learning_rate": 4.726127786580842e-05, + "loss": 4.2082, + "step": 25288 + }, + { + "epoch": 0.1504008468931392, + "grad_norm": 1.818529725074768, + "learning_rate": 4.72610652952309e-05, + "loss": 4.5002, + "step": 25289 + }, + { + "epoch": 0.15040679417642022, + "grad_norm": 1.600824236869812, + "learning_rate": 4.726085271688227e-05, + "loss": 4.8372, + "step": 25290 + }, + { + "epoch": 0.1504127414597012, + "grad_norm": 1.6711620092391968, + "learning_rate": 4.726064013076261e-05, + "loss": 4.8079, + "step": 25291 + }, + { + "epoch": 0.1504186887429822, + "grad_norm": 1.7478057146072388, + "learning_rate": 4.7260427536871985e-05, + "loss": 4.7123, + "step": 25292 + }, + { + "epoch": 0.15042463602626321, + "grad_norm": 1.6385493278503418, + "learning_rate": 4.726021493521048e-05, + "loss": 4.8043, + "step": 25293 + }, + { + "epoch": 0.1504305833095442, + "grad_norm": 1.6353743076324463, + "learning_rate": 4.7260002325778165e-05, + "loss": 4.7891, + "step": 25294 + }, + { + "epoch": 0.1504365305928252, + "grad_norm": 1.8076624870300293, + "learning_rate": 4.725978970857511e-05, + "loss": 4.502, + "step": 25295 + }, + { + "epoch": 0.1504424778761062, + "grad_norm": 2.979780673980713, + "learning_rate": 4.72595770836014e-05, + "loss": 3.7136, + "step": 25296 + }, + { + "epoch": 0.1504484251593872, + "grad_norm": 1.698283314704895, + "learning_rate": 4.7259364450857096e-05, + "loss": 4.9292, + "step": 25297 + }, + { + "epoch": 0.15045437244266818, + "grad_norm": 1.577962040901184, + "learning_rate": 4.725915181034228e-05, + "loss": 5.177, + "step": 25298 + }, + { + "epoch": 0.1504603197259492, + "grad_norm": 1.7820360660552979, + "learning_rate": 4.725893916205702e-05, + "loss": 4.6215, + "step": 25299 + }, + { + "epoch": 0.1504662670092302, + "grad_norm": 1.8856147527694702, + "learning_rate": 4.7258726506001396e-05, + "loss": 4.49, + "step": 25300 + }, + { + "epoch": 0.15047221429251117, + "grad_norm": 1.6485686302185059, + "learning_rate": 4.7258513842175475e-05, + "loss": 5.7732, + "step": 25301 + }, + { + "epoch": 0.1504781615757922, + "grad_norm": 2.143477439880371, + "learning_rate": 4.725830117057935e-05, + "loss": 4.8915, + "step": 25302 + }, + { + "epoch": 0.15048410885907318, + "grad_norm": 1.6669731140136719, + "learning_rate": 4.725808849121307e-05, + "loss": 5.1107, + "step": 25303 + }, + { + "epoch": 0.15049005614235417, + "grad_norm": 1.6642520427703857, + "learning_rate": 4.725787580407673e-05, + "loss": 4.5454, + "step": 25304 + }, + { + "epoch": 0.15049600342563518, + "grad_norm": 1.7125663757324219, + "learning_rate": 4.725766310917039e-05, + "loss": 4.7463, + "step": 25305 + }, + { + "epoch": 0.15050195070891617, + "grad_norm": 1.7411010265350342, + "learning_rate": 4.725745040649413e-05, + "loss": 4.643, + "step": 25306 + }, + { + "epoch": 0.15050789799219716, + "grad_norm": 1.8865814208984375, + "learning_rate": 4.725723769604803e-05, + "loss": 4.5555, + "step": 25307 + }, + { + "epoch": 0.15051384527547818, + "grad_norm": 1.6867681741714478, + "learning_rate": 4.725702497783215e-05, + "loss": 4.7334, + "step": 25308 + }, + { + "epoch": 0.15051979255875916, + "grad_norm": 1.5820156335830688, + "learning_rate": 4.7256812251846576e-05, + "loss": 5.5799, + "step": 25309 + }, + { + "epoch": 0.15052573984204015, + "grad_norm": 1.772575855255127, + "learning_rate": 4.725659951809138e-05, + "loss": 5.0303, + "step": 25310 + }, + { + "epoch": 0.15053168712532114, + "grad_norm": 1.7370164394378662, + "learning_rate": 4.725638677656663e-05, + "loss": 4.6378, + "step": 25311 + }, + { + "epoch": 0.15053763440860216, + "grad_norm": 1.6597602367401123, + "learning_rate": 4.725617402727241e-05, + "loss": 4.6918, + "step": 25312 + }, + { + "epoch": 0.15054358169188314, + "grad_norm": 1.6710939407348633, + "learning_rate": 4.725596127020879e-05, + "loss": 4.5664, + "step": 25313 + }, + { + "epoch": 0.15054952897516413, + "grad_norm": 1.7546216249465942, + "learning_rate": 4.725574850537584e-05, + "loss": 4.8903, + "step": 25314 + }, + { + "epoch": 0.15055547625844515, + "grad_norm": 1.8587819337844849, + "learning_rate": 4.725553573277365e-05, + "loss": 4.9894, + "step": 25315 + }, + { + "epoch": 0.15056142354172614, + "grad_norm": 1.3700711727142334, + "learning_rate": 4.725532295240227e-05, + "loss": 5.2452, + "step": 25316 + }, + { + "epoch": 0.15056737082500712, + "grad_norm": 1.7877662181854248, + "learning_rate": 4.725511016426179e-05, + "loss": 4.214, + "step": 25317 + }, + { + "epoch": 0.15057331810828814, + "grad_norm": 1.8162602186203003, + "learning_rate": 4.725489736835228e-05, + "loss": 4.9041, + "step": 25318 + }, + { + "epoch": 0.15057926539156913, + "grad_norm": 1.6758408546447754, + "learning_rate": 4.725468456467381e-05, + "loss": 4.3246, + "step": 25319 + }, + { + "epoch": 0.15058521267485012, + "grad_norm": 1.5553221702575684, + "learning_rate": 4.725447175322647e-05, + "loss": 5.1303, + "step": 25320 + }, + { + "epoch": 0.15059115995813113, + "grad_norm": 1.5233205556869507, + "learning_rate": 4.725425893401032e-05, + "loss": 5.4629, + "step": 25321 + }, + { + "epoch": 0.15059710724141212, + "grad_norm": 1.5840942859649658, + "learning_rate": 4.725404610702544e-05, + "loss": 5.12, + "step": 25322 + }, + { + "epoch": 0.1506030545246931, + "grad_norm": 1.787832260131836, + "learning_rate": 4.72538332722719e-05, + "loss": 5.2794, + "step": 25323 + }, + { + "epoch": 0.15060900180797412, + "grad_norm": 1.725203275680542, + "learning_rate": 4.725362042974978e-05, + "loss": 5.1121, + "step": 25324 + }, + { + "epoch": 0.1506149490912551, + "grad_norm": 1.5242986679077148, + "learning_rate": 4.725340757945914e-05, + "loss": 5.2826, + "step": 25325 + }, + { + "epoch": 0.1506208963745361, + "grad_norm": 1.9072916507720947, + "learning_rate": 4.725319472140007e-05, + "loss": 5.134, + "step": 25326 + }, + { + "epoch": 0.15062684365781712, + "grad_norm": 1.5604580640792847, + "learning_rate": 4.725298185557265e-05, + "loss": 5.1551, + "step": 25327 + }, + { + "epoch": 0.1506327909410981, + "grad_norm": 1.7541977167129517, + "learning_rate": 4.725276898197694e-05, + "loss": 4.6415, + "step": 25328 + }, + { + "epoch": 0.1506387382243791, + "grad_norm": 1.6959171295166016, + "learning_rate": 4.725255610061301e-05, + "loss": 4.9428, + "step": 25329 + }, + { + "epoch": 0.1506446855076601, + "grad_norm": 1.8614954948425293, + "learning_rate": 4.725234321148095e-05, + "loss": 5.2815, + "step": 25330 + }, + { + "epoch": 0.1506506327909411, + "grad_norm": 2.654698610305786, + "learning_rate": 4.725213031458082e-05, + "loss": 4.4367, + "step": 25331 + }, + { + "epoch": 0.15065658007422209, + "grad_norm": 2.4033470153808594, + "learning_rate": 4.7251917409912705e-05, + "loss": 4.6682, + "step": 25332 + }, + { + "epoch": 0.1506625273575031, + "grad_norm": 2.164626121520996, + "learning_rate": 4.725170449747668e-05, + "loss": 4.8865, + "step": 25333 + }, + { + "epoch": 0.1506684746407841, + "grad_norm": 2.046325445175171, + "learning_rate": 4.725149157727281e-05, + "loss": 4.9494, + "step": 25334 + }, + { + "epoch": 0.15067442192406508, + "grad_norm": 1.8939987421035767, + "learning_rate": 4.7251278649301175e-05, + "loss": 4.7641, + "step": 25335 + }, + { + "epoch": 0.1506803692073461, + "grad_norm": 1.6845778226852417, + "learning_rate": 4.725106571356185e-05, + "loss": 4.8831, + "step": 25336 + }, + { + "epoch": 0.15068631649062708, + "grad_norm": 1.7191179990768433, + "learning_rate": 4.7250852770054905e-05, + "loss": 4.9732, + "step": 25337 + }, + { + "epoch": 0.15069226377390807, + "grad_norm": 2.061174154281616, + "learning_rate": 4.725063981878042e-05, + "loss": 4.2263, + "step": 25338 + }, + { + "epoch": 0.15069821105718909, + "grad_norm": 2.3144235610961914, + "learning_rate": 4.7250426859738464e-05, + "loss": 4.2848, + "step": 25339 + }, + { + "epoch": 0.15070415834047007, + "grad_norm": 2.0103487968444824, + "learning_rate": 4.7250213892929115e-05, + "loss": 4.178, + "step": 25340 + }, + { + "epoch": 0.15071010562375106, + "grad_norm": 2.093339443206787, + "learning_rate": 4.725000091835245e-05, + "loss": 4.3689, + "step": 25341 + }, + { + "epoch": 0.15071605290703208, + "grad_norm": 2.085618495941162, + "learning_rate": 4.724978793600853e-05, + "loss": 4.1158, + "step": 25342 + }, + { + "epoch": 0.15072200019031307, + "grad_norm": 2.2095706462860107, + "learning_rate": 4.7249574945897445e-05, + "loss": 4.3338, + "step": 25343 + }, + { + "epoch": 0.15072794747359405, + "grad_norm": 2.169772148132324, + "learning_rate": 4.7249361948019267e-05, + "loss": 4.63, + "step": 25344 + }, + { + "epoch": 0.15073389475687507, + "grad_norm": 2.5633938312530518, + "learning_rate": 4.7249148942374054e-05, + "loss": 4.954, + "step": 25345 + }, + { + "epoch": 0.15073984204015606, + "grad_norm": 2.181420087814331, + "learning_rate": 4.72489359289619e-05, + "loss": 4.5234, + "step": 25346 + }, + { + "epoch": 0.15074578932343705, + "grad_norm": 2.265392541885376, + "learning_rate": 4.724872290778288e-05, + "loss": 4.1063, + "step": 25347 + }, + { + "epoch": 0.15075173660671806, + "grad_norm": 1.8531908988952637, + "learning_rate": 4.7248509878837054e-05, + "loss": 4.7115, + "step": 25348 + }, + { + "epoch": 0.15075768388999905, + "grad_norm": 2.096639633178711, + "learning_rate": 4.724829684212451e-05, + "loss": 4.2179, + "step": 25349 + }, + { + "epoch": 0.15076363117328004, + "grad_norm": 1.99870765209198, + "learning_rate": 4.72480837976453e-05, + "loss": 4.3259, + "step": 25350 + }, + { + "epoch": 0.15076957845656105, + "grad_norm": 2.024890422821045, + "learning_rate": 4.724787074539953e-05, + "loss": 4.1168, + "step": 25351 + }, + { + "epoch": 0.15077552573984204, + "grad_norm": 2.2805378437042236, + "learning_rate": 4.724765768538725e-05, + "loss": 4.3184, + "step": 25352 + }, + { + "epoch": 0.15078147302312303, + "grad_norm": 2.2098236083984375, + "learning_rate": 4.7247444617608535e-05, + "loss": 4.3815, + "step": 25353 + }, + { + "epoch": 0.15078742030640405, + "grad_norm": 2.6324753761291504, + "learning_rate": 4.724723154206348e-05, + "loss": 4.3017, + "step": 25354 + }, + { + "epoch": 0.15079336758968503, + "grad_norm": 3.0926623344421387, + "learning_rate": 4.724701845875215e-05, + "loss": 4.6768, + "step": 25355 + }, + { + "epoch": 0.15079931487296602, + "grad_norm": 2.8633837699890137, + "learning_rate": 4.7246805367674603e-05, + "loss": 4.3765, + "step": 25356 + }, + { + "epoch": 0.15080526215624704, + "grad_norm": 2.4857215881347656, + "learning_rate": 4.7246592268830924e-05, + "loss": 4.3245, + "step": 25357 + }, + { + "epoch": 0.15081120943952803, + "grad_norm": 3.3124706745147705, + "learning_rate": 4.72463791622212e-05, + "loss": 4.1451, + "step": 25358 + }, + { + "epoch": 0.15081715672280901, + "grad_norm": 2.3086657524108887, + "learning_rate": 4.724616604784549e-05, + "loss": 4.5879, + "step": 25359 + }, + { + "epoch": 0.15082310400609003, + "grad_norm": 2.082601308822632, + "learning_rate": 4.724595292570387e-05, + "loss": 5.1047, + "step": 25360 + }, + { + "epoch": 0.15082905128937102, + "grad_norm": 1.6798832416534424, + "learning_rate": 4.7245739795796426e-05, + "loss": 4.7877, + "step": 25361 + }, + { + "epoch": 0.150834998572652, + "grad_norm": 2.76798152923584, + "learning_rate": 4.724552665812322e-05, + "loss": 4.1044, + "step": 25362 + }, + { + "epoch": 0.15084094585593302, + "grad_norm": 2.7487802505493164, + "learning_rate": 4.724531351268433e-05, + "loss": 4.4089, + "step": 25363 + }, + { + "epoch": 0.150846893139214, + "grad_norm": 2.2958571910858154, + "learning_rate": 4.7245100359479833e-05, + "loss": 4.1923, + "step": 25364 + }, + { + "epoch": 0.150852840422495, + "grad_norm": 2.200896978378296, + "learning_rate": 4.7244887198509805e-05, + "loss": 4.3105, + "step": 25365 + }, + { + "epoch": 0.15085878770577602, + "grad_norm": 2.0711123943328857, + "learning_rate": 4.7244674029774307e-05, + "loss": 4.3327, + "step": 25366 + }, + { + "epoch": 0.150864734989057, + "grad_norm": 1.8481465578079224, + "learning_rate": 4.724446085327342e-05, + "loss": 4.7603, + "step": 25367 + }, + { + "epoch": 0.150870682272338, + "grad_norm": 1.5740338563919067, + "learning_rate": 4.7244247669007234e-05, + "loss": 4.7191, + "step": 25368 + }, + { + "epoch": 0.15087662955561898, + "grad_norm": 1.4988723993301392, + "learning_rate": 4.724403447697581e-05, + "loss": 4.6288, + "step": 25369 + }, + { + "epoch": 0.1508825768389, + "grad_norm": 1.8862982988357544, + "learning_rate": 4.7243821277179213e-05, + "loss": 4.6308, + "step": 25370 + }, + { + "epoch": 0.15088852412218098, + "grad_norm": 1.6412887573242188, + "learning_rate": 4.7243608069617534e-05, + "loss": 5.1476, + "step": 25371 + }, + { + "epoch": 0.15089447140546197, + "grad_norm": 1.58519446849823, + "learning_rate": 4.7243394854290847e-05, + "loss": 5.6586, + "step": 25372 + }, + { + "epoch": 0.150900418688743, + "grad_norm": 1.5548374652862549, + "learning_rate": 4.724318163119921e-05, + "loss": 5.4283, + "step": 25373 + }, + { + "epoch": 0.15090636597202398, + "grad_norm": 1.456405758857727, + "learning_rate": 4.724296840034271e-05, + "loss": 5.3778, + "step": 25374 + }, + { + "epoch": 0.15091231325530496, + "grad_norm": 1.2034344673156738, + "learning_rate": 4.7242755161721424e-05, + "loss": 5.1189, + "step": 25375 + }, + { + "epoch": 0.15091826053858598, + "grad_norm": 2.2144997119903564, + "learning_rate": 4.724254191533543e-05, + "loss": 4.7091, + "step": 25376 + }, + { + "epoch": 0.15092420782186697, + "grad_norm": 2.322824239730835, + "learning_rate": 4.7242328661184774e-05, + "loss": 4.3568, + "step": 25377 + }, + { + "epoch": 0.15093015510514796, + "grad_norm": 2.832406997680664, + "learning_rate": 4.7242115399269567e-05, + "loss": 4.156, + "step": 25378 + }, + { + "epoch": 0.15093610238842897, + "grad_norm": 2.5387492179870605, + "learning_rate": 4.724190212958986e-05, + "loss": 4.2464, + "step": 25379 + }, + { + "epoch": 0.15094204967170996, + "grad_norm": 2.3497941493988037, + "learning_rate": 4.724168885214574e-05, + "loss": 4.2937, + "step": 25380 + }, + { + "epoch": 0.15094799695499095, + "grad_norm": 1.9066410064697266, + "learning_rate": 4.724147556693727e-05, + "loss": 4.3862, + "step": 25381 + }, + { + "epoch": 0.15095394423827196, + "grad_norm": 1.981546401977539, + "learning_rate": 4.724126227396454e-05, + "loss": 4.2936, + "step": 25382 + }, + { + "epoch": 0.15095989152155295, + "grad_norm": 1.7924445867538452, + "learning_rate": 4.7241048973227604e-05, + "loss": 5.173, + "step": 25383 + }, + { + "epoch": 0.15096583880483394, + "grad_norm": 1.985730528831482, + "learning_rate": 4.724083566472655e-05, + "loss": 4.6256, + "step": 25384 + }, + { + "epoch": 0.15097178608811496, + "grad_norm": 1.7368820905685425, + "learning_rate": 4.7240622348461457e-05, + "loss": 5.2259, + "step": 25385 + }, + { + "epoch": 0.15097773337139594, + "grad_norm": 1.761334776878357, + "learning_rate": 4.724040902443239e-05, + "loss": 4.8674, + "step": 25386 + }, + { + "epoch": 0.15098368065467693, + "grad_norm": 2.460028886795044, + "learning_rate": 4.724019569263942e-05, + "loss": 4.6597, + "step": 25387 + }, + { + "epoch": 0.15098962793795795, + "grad_norm": 2.524463176727295, + "learning_rate": 4.723998235308263e-05, + "loss": 4.2823, + "step": 25388 + }, + { + "epoch": 0.15099557522123894, + "grad_norm": 2.211486577987671, + "learning_rate": 4.723976900576209e-05, + "loss": 4.2802, + "step": 25389 + }, + { + "epoch": 0.15100152250451992, + "grad_norm": 2.323294162750244, + "learning_rate": 4.723955565067788e-05, + "loss": 4.2044, + "step": 25390 + }, + { + "epoch": 0.15100746978780094, + "grad_norm": 2.0671331882476807, + "learning_rate": 4.723934228783007e-05, + "loss": 4.2368, + "step": 25391 + }, + { + "epoch": 0.15101341707108193, + "grad_norm": 2.4726204872131348, + "learning_rate": 4.723912891721874e-05, + "loss": 3.9728, + "step": 25392 + }, + { + "epoch": 0.15101936435436292, + "grad_norm": 2.278228998184204, + "learning_rate": 4.7238915538843954e-05, + "loss": 4.0742, + "step": 25393 + }, + { + "epoch": 0.15102531163764393, + "grad_norm": 2.3213517665863037, + "learning_rate": 4.7238702152705794e-05, + "loss": 4.2124, + "step": 25394 + }, + { + "epoch": 0.15103125892092492, + "grad_norm": 1.7494871616363525, + "learning_rate": 4.7238488758804334e-05, + "loss": 5.1252, + "step": 25395 + }, + { + "epoch": 0.1510372062042059, + "grad_norm": 1.8289192914962769, + "learning_rate": 4.723827535713965e-05, + "loss": 4.9194, + "step": 25396 + }, + { + "epoch": 0.15104315348748693, + "grad_norm": 1.7058460712432861, + "learning_rate": 4.723806194771181e-05, + "loss": 5.2878, + "step": 25397 + }, + { + "epoch": 0.1510491007707679, + "grad_norm": 2.0224595069885254, + "learning_rate": 4.723784853052089e-05, + "loss": 4.4899, + "step": 25398 + }, + { + "epoch": 0.1510550480540489, + "grad_norm": 2.4246976375579834, + "learning_rate": 4.723763510556697e-05, + "loss": 3.9646, + "step": 25399 + }, + { + "epoch": 0.15106099533732992, + "grad_norm": 2.473158597946167, + "learning_rate": 4.723742167285012e-05, + "loss": 4.1942, + "step": 25400 + }, + { + "epoch": 0.1510669426206109, + "grad_norm": 3.9526100158691406, + "learning_rate": 4.723720823237041e-05, + "loss": 3.6103, + "step": 25401 + }, + { + "epoch": 0.1510728899038919, + "grad_norm": 3.6537516117095947, + "learning_rate": 4.723699478412793e-05, + "loss": 4.2312, + "step": 25402 + }, + { + "epoch": 0.1510788371871729, + "grad_norm": 1.5094470977783203, + "learning_rate": 4.7236781328122745e-05, + "loss": 5.577, + "step": 25403 + }, + { + "epoch": 0.1510847844704539, + "grad_norm": 1.7783223390579224, + "learning_rate": 4.7236567864354924e-05, + "loss": 5.6923, + "step": 25404 + }, + { + "epoch": 0.15109073175373489, + "grad_norm": 1.8453465700149536, + "learning_rate": 4.723635439282455e-05, + "loss": 5.3975, + "step": 25405 + }, + { + "epoch": 0.1510966790370159, + "grad_norm": 1.7783082723617554, + "learning_rate": 4.723614091353169e-05, + "loss": 5.2236, + "step": 25406 + }, + { + "epoch": 0.1511026263202969, + "grad_norm": 1.6507834196090698, + "learning_rate": 4.723592742647643e-05, + "loss": 5.3565, + "step": 25407 + }, + { + "epoch": 0.15110857360357788, + "grad_norm": 1.4875059127807617, + "learning_rate": 4.723571393165883e-05, + "loss": 5.5752, + "step": 25408 + }, + { + "epoch": 0.1511145208868589, + "grad_norm": 1.6694411039352417, + "learning_rate": 4.7235500429078985e-05, + "loss": 5.4707, + "step": 25409 + }, + { + "epoch": 0.15112046817013988, + "grad_norm": 1.7157987356185913, + "learning_rate": 4.723528691873694e-05, + "loss": 5.3777, + "step": 25410 + }, + { + "epoch": 0.15112641545342087, + "grad_norm": 2.611750602722168, + "learning_rate": 4.72350734006328e-05, + "loss": 3.1969, + "step": 25411 + }, + { + "epoch": 0.1511323627367019, + "grad_norm": 2.0207319259643555, + "learning_rate": 4.7234859874766614e-05, + "loss": 4.8871, + "step": 25412 + }, + { + "epoch": 0.15113831001998287, + "grad_norm": 2.598403215408325, + "learning_rate": 4.723464634113847e-05, + "loss": 4.9404, + "step": 25413 + }, + { + "epoch": 0.15114425730326386, + "grad_norm": 1.764269232749939, + "learning_rate": 4.723443279974845e-05, + "loss": 5.2649, + "step": 25414 + }, + { + "epoch": 0.15115020458654488, + "grad_norm": 1.8783745765686035, + "learning_rate": 4.723421925059661e-05, + "loss": 4.8755, + "step": 25415 + }, + { + "epoch": 0.15115615186982587, + "grad_norm": 1.497833251953125, + "learning_rate": 4.7234005693683035e-05, + "loss": 5.0806, + "step": 25416 + }, + { + "epoch": 0.15116209915310685, + "grad_norm": 1.6030247211456299, + "learning_rate": 4.72337921290078e-05, + "loss": 5.0388, + "step": 25417 + }, + { + "epoch": 0.15116804643638787, + "grad_norm": 1.7181298732757568, + "learning_rate": 4.723357855657098e-05, + "loss": 4.8316, + "step": 25418 + }, + { + "epoch": 0.15117399371966886, + "grad_norm": 1.4665559530258179, + "learning_rate": 4.7233364976372644e-05, + "loss": 5.5005, + "step": 25419 + }, + { + "epoch": 0.15117994100294985, + "grad_norm": 3.3794503211975098, + "learning_rate": 4.723315138841287e-05, + "loss": 3.9864, + "step": 25420 + }, + { + "epoch": 0.15118588828623086, + "grad_norm": 1.7290079593658447, + "learning_rate": 4.723293779269173e-05, + "loss": 5.3736, + "step": 25421 + }, + { + "epoch": 0.15119183556951185, + "grad_norm": 1.995943307876587, + "learning_rate": 4.723272418920931e-05, + "loss": 4.8142, + "step": 25422 + }, + { + "epoch": 0.15119778285279284, + "grad_norm": 1.8627694845199585, + "learning_rate": 4.7232510577965674e-05, + "loss": 5.2348, + "step": 25423 + }, + { + "epoch": 0.15120373013607386, + "grad_norm": 1.5469872951507568, + "learning_rate": 4.72322969589609e-05, + "loss": 5.1102, + "step": 25424 + }, + { + "epoch": 0.15120967741935484, + "grad_norm": 1.503350853919983, + "learning_rate": 4.723208333219505e-05, + "loss": 5.2009, + "step": 25425 + }, + { + "epoch": 0.15121562470263583, + "grad_norm": 1.5141102075576782, + "learning_rate": 4.7231869697668214e-05, + "loss": 5.4231, + "step": 25426 + }, + { + "epoch": 0.15122157198591682, + "grad_norm": 1.5022274255752563, + "learning_rate": 4.723165605538046e-05, + "loss": 5.1454, + "step": 25427 + }, + { + "epoch": 0.15122751926919784, + "grad_norm": 1.2774550914764404, + "learning_rate": 4.7231442405331874e-05, + "loss": 5.4048, + "step": 25428 + }, + { + "epoch": 0.15123346655247882, + "grad_norm": 1.4588242769241333, + "learning_rate": 4.723122874752251e-05, + "loss": 5.1466, + "step": 25429 + }, + { + "epoch": 0.1512394138357598, + "grad_norm": 1.6666613817214966, + "learning_rate": 4.7231015081952454e-05, + "loss": 5.6505, + "step": 25430 + }, + { + "epoch": 0.15124536111904083, + "grad_norm": 3.1419155597686768, + "learning_rate": 4.72308014086218e-05, + "loss": 5.1714, + "step": 25431 + }, + { + "epoch": 0.15125130840232182, + "grad_norm": 1.8372479677200317, + "learning_rate": 4.723058772753058e-05, + "loss": 5.3135, + "step": 25432 + }, + { + "epoch": 0.1512572556856028, + "grad_norm": 1.4300392866134644, + "learning_rate": 4.7230374038678895e-05, + "loss": 5.4404, + "step": 25433 + }, + { + "epoch": 0.15126320296888382, + "grad_norm": 1.4411662817001343, + "learning_rate": 4.723016034206682e-05, + "loss": 5.4341, + "step": 25434 + }, + { + "epoch": 0.1512691502521648, + "grad_norm": 1.4989326000213623, + "learning_rate": 4.7229946637694425e-05, + "loss": 5.3632, + "step": 25435 + }, + { + "epoch": 0.1512750975354458, + "grad_norm": 1.2930675745010376, + "learning_rate": 4.7229732925561785e-05, + "loss": 5.1667, + "step": 25436 + }, + { + "epoch": 0.1512810448187268, + "grad_norm": 1.6399480104446411, + "learning_rate": 4.722951920566898e-05, + "loss": 5.0464, + "step": 25437 + }, + { + "epoch": 0.1512869921020078, + "grad_norm": 1.6308560371398926, + "learning_rate": 4.722930547801608e-05, + "loss": 5.416, + "step": 25438 + }, + { + "epoch": 0.1512929393852888, + "grad_norm": 1.8431388139724731, + "learning_rate": 4.722909174260316e-05, + "loss": 5.6069, + "step": 25439 + }, + { + "epoch": 0.1512988866685698, + "grad_norm": 1.964154601097107, + "learning_rate": 4.722887799943028e-05, + "loss": 5.845, + "step": 25440 + }, + { + "epoch": 0.1513048339518508, + "grad_norm": 1.731370210647583, + "learning_rate": 4.722866424849753e-05, + "loss": 5.3155, + "step": 25441 + }, + { + "epoch": 0.15131078123513178, + "grad_norm": 1.9794760942459106, + "learning_rate": 4.7228450489805e-05, + "loss": 4.8395, + "step": 25442 + }, + { + "epoch": 0.1513167285184128, + "grad_norm": 2.016857862472534, + "learning_rate": 4.7228236723352735e-05, + "loss": 4.5546, + "step": 25443 + }, + { + "epoch": 0.15132267580169378, + "grad_norm": 1.9085549116134644, + "learning_rate": 4.722802294914083e-05, + "loss": 4.7848, + "step": 25444 + }, + { + "epoch": 0.15132862308497477, + "grad_norm": 1.5769025087356567, + "learning_rate": 4.7227809167169345e-05, + "loss": 5.1207, + "step": 25445 + }, + { + "epoch": 0.1513345703682558, + "grad_norm": 1.4327126741409302, + "learning_rate": 4.7227595377438364e-05, + "loss": 5.323, + "step": 25446 + }, + { + "epoch": 0.15134051765153678, + "grad_norm": 1.536750316619873, + "learning_rate": 4.722738157994796e-05, + "loss": 4.812, + "step": 25447 + }, + { + "epoch": 0.15134646493481776, + "grad_norm": 1.6312404870986938, + "learning_rate": 4.72271677746982e-05, + "loss": 4.8753, + "step": 25448 + }, + { + "epoch": 0.15135241221809878, + "grad_norm": 1.3323699235916138, + "learning_rate": 4.722695396168917e-05, + "loss": 5.6005, + "step": 25449 + }, + { + "epoch": 0.15135835950137977, + "grad_norm": 1.5522531270980835, + "learning_rate": 4.722674014092094e-05, + "loss": 5.3848, + "step": 25450 + }, + { + "epoch": 0.15136430678466076, + "grad_norm": 1.5421935319900513, + "learning_rate": 4.722652631239358e-05, + "loss": 5.4136, + "step": 25451 + }, + { + "epoch": 0.15137025406794177, + "grad_norm": 1.564570665359497, + "learning_rate": 4.722631247610718e-05, + "loss": 5.3169, + "step": 25452 + }, + { + "epoch": 0.15137620135122276, + "grad_norm": 1.7175198793411255, + "learning_rate": 4.72260986320618e-05, + "loss": 4.5904, + "step": 25453 + }, + { + "epoch": 0.15138214863450375, + "grad_norm": 1.5852707624435425, + "learning_rate": 4.722588478025751e-05, + "loss": 4.8459, + "step": 25454 + }, + { + "epoch": 0.15138809591778477, + "grad_norm": 1.4209281206130981, + "learning_rate": 4.7225670920694404e-05, + "loss": 5.4134, + "step": 25455 + }, + { + "epoch": 0.15139404320106575, + "grad_norm": 1.4841557741165161, + "learning_rate": 4.722545705337254e-05, + "loss": 5.0996, + "step": 25456 + }, + { + "epoch": 0.15139999048434674, + "grad_norm": 1.4958367347717285, + "learning_rate": 4.7225243178292e-05, + "loss": 4.5363, + "step": 25457 + }, + { + "epoch": 0.15140593776762776, + "grad_norm": 1.6424293518066406, + "learning_rate": 4.722502929545286e-05, + "loss": 5.0227, + "step": 25458 + }, + { + "epoch": 0.15141188505090875, + "grad_norm": 1.687121868133545, + "learning_rate": 4.722481540485519e-05, + "loss": 4.9662, + "step": 25459 + }, + { + "epoch": 0.15141783233418973, + "grad_norm": 1.6748243570327759, + "learning_rate": 4.722460150649907e-05, + "loss": 4.4443, + "step": 25460 + }, + { + "epoch": 0.15142377961747075, + "grad_norm": 2.2483417987823486, + "learning_rate": 4.722438760038456e-05, + "loss": 4.8411, + "step": 25461 + }, + { + "epoch": 0.15142972690075174, + "grad_norm": 1.6556822061538696, + "learning_rate": 4.7224173686511754e-05, + "loss": 5.1596, + "step": 25462 + }, + { + "epoch": 0.15143567418403273, + "grad_norm": 1.6137731075286865, + "learning_rate": 4.722395976488072e-05, + "loss": 4.6538, + "step": 25463 + }, + { + "epoch": 0.15144162146731374, + "grad_norm": 1.7086783647537231, + "learning_rate": 4.722374583549153e-05, + "loss": 5.2168, + "step": 25464 + }, + { + "epoch": 0.15144756875059473, + "grad_norm": 1.668527603149414, + "learning_rate": 4.7223531898344256e-05, + "loss": 5.138, + "step": 25465 + }, + { + "epoch": 0.15145351603387572, + "grad_norm": 2.2906320095062256, + "learning_rate": 4.722331795343899e-05, + "loss": 4.6954, + "step": 25466 + }, + { + "epoch": 0.15145946331715673, + "grad_norm": 2.410048246383667, + "learning_rate": 4.722310400077578e-05, + "loss": 4.5377, + "step": 25467 + }, + { + "epoch": 0.15146541060043772, + "grad_norm": 1.7885384559631348, + "learning_rate": 4.722289004035471e-05, + "loss": 4.8978, + "step": 25468 + }, + { + "epoch": 0.1514713578837187, + "grad_norm": 1.5193252563476562, + "learning_rate": 4.7222676072175866e-05, + "loss": 5.2818, + "step": 25469 + }, + { + "epoch": 0.15147730516699973, + "grad_norm": 2.0139195919036865, + "learning_rate": 4.7222462096239314e-05, + "loss": 4.1632, + "step": 25470 + }, + { + "epoch": 0.15148325245028071, + "grad_norm": 2.007025718688965, + "learning_rate": 4.7222248112545133e-05, + "loss": 4.0832, + "step": 25471 + }, + { + "epoch": 0.1514891997335617, + "grad_norm": 2.2270402908325195, + "learning_rate": 4.722203412109339e-05, + "loss": 4.2317, + "step": 25472 + }, + { + "epoch": 0.15149514701684272, + "grad_norm": 2.0418808460235596, + "learning_rate": 4.722182012188417e-05, + "loss": 4.1849, + "step": 25473 + }, + { + "epoch": 0.1515010943001237, + "grad_norm": 2.087785243988037, + "learning_rate": 4.722160611491754e-05, + "loss": 4.1218, + "step": 25474 + }, + { + "epoch": 0.1515070415834047, + "grad_norm": 2.303571939468384, + "learning_rate": 4.7221392100193575e-05, + "loss": 3.9614, + "step": 25475 + }, + { + "epoch": 0.1515129888666857, + "grad_norm": 1.9516772031784058, + "learning_rate": 4.722117807771235e-05, + "loss": 3.9619, + "step": 25476 + }, + { + "epoch": 0.1515189361499667, + "grad_norm": 1.9611634016036987, + "learning_rate": 4.722096404747395e-05, + "loss": 3.9133, + "step": 25477 + }, + { + "epoch": 0.1515248834332477, + "grad_norm": 1.9254827499389648, + "learning_rate": 4.722075000947843e-05, + "loss": 3.877, + "step": 25478 + }, + { + "epoch": 0.1515308307165287, + "grad_norm": 1.803846001625061, + "learning_rate": 4.722053596372588e-05, + "loss": 3.8338, + "step": 25479 + }, + { + "epoch": 0.1515367779998097, + "grad_norm": 1.829439401626587, + "learning_rate": 4.722032191021637e-05, + "loss": 3.8183, + "step": 25480 + }, + { + "epoch": 0.15154272528309068, + "grad_norm": 1.7955585718154907, + "learning_rate": 4.722010784894998e-05, + "loss": 4.6821, + "step": 25481 + }, + { + "epoch": 0.1515486725663717, + "grad_norm": 2.9624781608581543, + "learning_rate": 4.7219893779926775e-05, + "loss": 3.9385, + "step": 25482 + }, + { + "epoch": 0.15155461984965268, + "grad_norm": 1.8687463998794556, + "learning_rate": 4.721967970314684e-05, + "loss": 4.0364, + "step": 25483 + }, + { + "epoch": 0.15156056713293367, + "grad_norm": 1.9090644121170044, + "learning_rate": 4.721946561861024e-05, + "loss": 3.8046, + "step": 25484 + }, + { + "epoch": 0.15156651441621466, + "grad_norm": 1.9757955074310303, + "learning_rate": 4.721925152631706e-05, + "loss": 3.943, + "step": 25485 + }, + { + "epoch": 0.15157246169949568, + "grad_norm": 1.9161666631698608, + "learning_rate": 4.7219037426267356e-05, + "loss": 3.8818, + "step": 25486 + }, + { + "epoch": 0.15157840898277666, + "grad_norm": 1.8484982252120972, + "learning_rate": 4.7218823318461226e-05, + "loss": 4.0713, + "step": 25487 + }, + { + "epoch": 0.15158435626605765, + "grad_norm": 1.6787267923355103, + "learning_rate": 4.7218609202898726e-05, + "loss": 5.7814, + "step": 25488 + }, + { + "epoch": 0.15159030354933867, + "grad_norm": 1.6946018934249878, + "learning_rate": 4.7218395079579946e-05, + "loss": 5.9241, + "step": 25489 + }, + { + "epoch": 0.15159625083261966, + "grad_norm": 1.5210212469100952, + "learning_rate": 4.721818094850495e-05, + "loss": 6.0828, + "step": 25490 + }, + { + "epoch": 0.15160219811590064, + "grad_norm": 1.7792625427246094, + "learning_rate": 4.721796680967382e-05, + "loss": 6.241, + "step": 25491 + }, + { + "epoch": 0.15160814539918166, + "grad_norm": 1.5366078615188599, + "learning_rate": 4.7217752663086626e-05, + "loss": 5.7111, + "step": 25492 + }, + { + "epoch": 0.15161409268246265, + "grad_norm": 1.5193569660186768, + "learning_rate": 4.721753850874344e-05, + "loss": 5.3155, + "step": 25493 + }, + { + "epoch": 0.15162003996574364, + "grad_norm": 1.9060078859329224, + "learning_rate": 4.7217324346644356e-05, + "loss": 5.368, + "step": 25494 + }, + { + "epoch": 0.15162598724902465, + "grad_norm": 1.4217309951782227, + "learning_rate": 4.7217110176789416e-05, + "loss": 5.4781, + "step": 25495 + }, + { + "epoch": 0.15163193453230564, + "grad_norm": 1.561132550239563, + "learning_rate": 4.7216895999178725e-05, + "loss": 5.3316, + "step": 25496 + }, + { + "epoch": 0.15163788181558663, + "grad_norm": 1.397314429283142, + "learning_rate": 4.7216681813812335e-05, + "loss": 5.4047, + "step": 25497 + }, + { + "epoch": 0.15164382909886764, + "grad_norm": 1.3138307332992554, + "learning_rate": 4.7216467620690335e-05, + "loss": 5.3706, + "step": 25498 + }, + { + "epoch": 0.15164977638214863, + "grad_norm": 1.4298443794250488, + "learning_rate": 4.7216253419812794e-05, + "loss": 5.3704, + "step": 25499 + }, + { + "epoch": 0.15165572366542962, + "grad_norm": 1.703792929649353, + "learning_rate": 4.72160392111798e-05, + "loss": 5.2468, + "step": 25500 + }, + { + "epoch": 0.15166167094871064, + "grad_norm": 1.566309928894043, + "learning_rate": 4.72158249947914e-05, + "loss": 5.5153, + "step": 25501 + }, + { + "epoch": 0.15166761823199162, + "grad_norm": 1.3141274452209473, + "learning_rate": 4.721561077064769e-05, + "loss": 5.6254, + "step": 25502 + }, + { + "epoch": 0.1516735655152726, + "grad_norm": 1.4979000091552734, + "learning_rate": 4.721539653874874e-05, + "loss": 5.4936, + "step": 25503 + }, + { + "epoch": 0.15167951279855363, + "grad_norm": 1.694068193435669, + "learning_rate": 4.721518229909463e-05, + "loss": 5.6601, + "step": 25504 + }, + { + "epoch": 0.15168546008183462, + "grad_norm": 1.8887871503829956, + "learning_rate": 4.721496805168543e-05, + "loss": 4.8596, + "step": 25505 + }, + { + "epoch": 0.1516914073651156, + "grad_norm": 2.5169517993927, + "learning_rate": 4.721475379652121e-05, + "loss": 4.0797, + "step": 25506 + }, + { + "epoch": 0.15169735464839662, + "grad_norm": 2.4206509590148926, + "learning_rate": 4.7214539533602046e-05, + "loss": 3.9878, + "step": 25507 + }, + { + "epoch": 0.1517033019316776, + "grad_norm": 2.054685354232788, + "learning_rate": 4.7214325262928013e-05, + "loss": 3.948, + "step": 25508 + }, + { + "epoch": 0.1517092492149586, + "grad_norm": 1.4626624584197998, + "learning_rate": 4.721411098449919e-05, + "loss": 5.4617, + "step": 25509 + }, + { + "epoch": 0.1517151964982396, + "grad_norm": 1.7592542171478271, + "learning_rate": 4.721389669831566e-05, + "loss": 5.4125, + "step": 25510 + }, + { + "epoch": 0.1517211437815206, + "grad_norm": 1.669419288635254, + "learning_rate": 4.721368240437748e-05, + "loss": 5.4718, + "step": 25511 + }, + { + "epoch": 0.1517270910648016, + "grad_norm": 1.0741300582885742, + "learning_rate": 4.721346810268473e-05, + "loss": 5.5668, + "step": 25512 + }, + { + "epoch": 0.1517330383480826, + "grad_norm": 1.41902494430542, + "learning_rate": 4.72132537932375e-05, + "loss": 5.5451, + "step": 25513 + }, + { + "epoch": 0.1517389856313636, + "grad_norm": 1.7693331241607666, + "learning_rate": 4.721303947603584e-05, + "loss": 5.7588, + "step": 25514 + }, + { + "epoch": 0.15174493291464458, + "grad_norm": 1.7695659399032593, + "learning_rate": 4.7212825151079844e-05, + "loss": 5.6659, + "step": 25515 + }, + { + "epoch": 0.1517508801979256, + "grad_norm": 1.5901025533676147, + "learning_rate": 4.7212610818369586e-05, + "loss": 5.3805, + "step": 25516 + }, + { + "epoch": 0.15175682748120659, + "grad_norm": 1.8363381624221802, + "learning_rate": 4.721239647790512e-05, + "loss": 5.808, + "step": 25517 + }, + { + "epoch": 0.15176277476448757, + "grad_norm": 1.7976000308990479, + "learning_rate": 4.721218212968655e-05, + "loss": 5.7034, + "step": 25518 + }, + { + "epoch": 0.1517687220477686, + "grad_norm": 1.7203330993652344, + "learning_rate": 4.721196777371393e-05, + "loss": 5.4174, + "step": 25519 + }, + { + "epoch": 0.15177466933104958, + "grad_norm": 1.6678218841552734, + "learning_rate": 4.7211753409987344e-05, + "loss": 5.4002, + "step": 25520 + }, + { + "epoch": 0.15178061661433057, + "grad_norm": 1.3932818174362183, + "learning_rate": 4.721153903850686e-05, + "loss": 5.7598, + "step": 25521 + }, + { + "epoch": 0.15178656389761158, + "grad_norm": 1.4975392818450928, + "learning_rate": 4.721132465927256e-05, + "loss": 5.2991, + "step": 25522 + }, + { + "epoch": 0.15179251118089257, + "grad_norm": 1.5375689268112183, + "learning_rate": 4.721111027228452e-05, + "loss": 5.7456, + "step": 25523 + }, + { + "epoch": 0.15179845846417356, + "grad_norm": 1.6894830465316772, + "learning_rate": 4.72108958775428e-05, + "loss": 5.1867, + "step": 25524 + }, + { + "epoch": 0.15180440574745457, + "grad_norm": 1.569059133529663, + "learning_rate": 4.72106814750475e-05, + "loss": 5.4544, + "step": 25525 + }, + { + "epoch": 0.15181035303073556, + "grad_norm": 1.5884952545166016, + "learning_rate": 4.721046706479867e-05, + "loss": 5.1496, + "step": 25526 + }, + { + "epoch": 0.15181630031401655, + "grad_norm": 1.552410364151001, + "learning_rate": 4.721025264679639e-05, + "loss": 5.0916, + "step": 25527 + }, + { + "epoch": 0.15182224759729757, + "grad_norm": 1.5972039699554443, + "learning_rate": 4.721003822104076e-05, + "loss": 5.2073, + "step": 25528 + }, + { + "epoch": 0.15182819488057855, + "grad_norm": 1.6742616891860962, + "learning_rate": 4.720982378753182e-05, + "loss": 5.4851, + "step": 25529 + }, + { + "epoch": 0.15183414216385954, + "grad_norm": 1.4974780082702637, + "learning_rate": 4.7209609346269665e-05, + "loss": 5.4444, + "step": 25530 + }, + { + "epoch": 0.15184008944714056, + "grad_norm": 1.5599150657653809, + "learning_rate": 4.7209394897254363e-05, + "loss": 4.8842, + "step": 25531 + }, + { + "epoch": 0.15184603673042155, + "grad_norm": 1.3979945182800293, + "learning_rate": 4.7209180440485986e-05, + "loss": 5.2836, + "step": 25532 + }, + { + "epoch": 0.15185198401370253, + "grad_norm": 1.3515275716781616, + "learning_rate": 4.720896597596462e-05, + "loss": 5.3011, + "step": 25533 + }, + { + "epoch": 0.15185793129698355, + "grad_norm": 1.7592774629592896, + "learning_rate": 4.720875150369034e-05, + "loss": 5.0874, + "step": 25534 + }, + { + "epoch": 0.15186387858026454, + "grad_norm": 1.5977163314819336, + "learning_rate": 4.72085370236632e-05, + "loss": 4.7678, + "step": 25535 + }, + { + "epoch": 0.15186982586354553, + "grad_norm": 1.3309252262115479, + "learning_rate": 4.7208322535883295e-05, + "loss": 4.9821, + "step": 25536 + }, + { + "epoch": 0.15187577314682654, + "grad_norm": 1.5985299348831177, + "learning_rate": 4.720810804035069e-05, + "loss": 5.1845, + "step": 25537 + }, + { + "epoch": 0.15188172043010753, + "grad_norm": 1.6021031141281128, + "learning_rate": 4.7207893537065475e-05, + "loss": 5.1628, + "step": 25538 + }, + { + "epoch": 0.15188766771338852, + "grad_norm": 1.6445283889770508, + "learning_rate": 4.7207679026027704e-05, + "loss": 4.7933, + "step": 25539 + }, + { + "epoch": 0.15189361499666953, + "grad_norm": 1.6480634212493896, + "learning_rate": 4.7207464507237474e-05, + "loss": 4.7912, + "step": 25540 + }, + { + "epoch": 0.15189956227995052, + "grad_norm": 1.7439652681350708, + "learning_rate": 4.720724998069483e-05, + "loss": 4.5412, + "step": 25541 + }, + { + "epoch": 0.1519055095632315, + "grad_norm": 1.5786992311477661, + "learning_rate": 4.720703544639988e-05, + "loss": 4.8873, + "step": 25542 + }, + { + "epoch": 0.1519114568465125, + "grad_norm": 1.3782871961593628, + "learning_rate": 4.7206820904352675e-05, + "loss": 4.5825, + "step": 25543 + }, + { + "epoch": 0.15191740412979352, + "grad_norm": 1.8048298358917236, + "learning_rate": 4.72066063545533e-05, + "loss": 4.746, + "step": 25544 + }, + { + "epoch": 0.1519233514130745, + "grad_norm": 1.4801894426345825, + "learning_rate": 4.7206391797001826e-05, + "loss": 4.8802, + "step": 25545 + }, + { + "epoch": 0.1519292986963555, + "grad_norm": 1.7984564304351807, + "learning_rate": 4.7206177231698333e-05, + "loss": 4.7674, + "step": 25546 + }, + { + "epoch": 0.1519352459796365, + "grad_norm": 1.7244421243667603, + "learning_rate": 4.72059626586429e-05, + "loss": 5.2729, + "step": 25547 + }, + { + "epoch": 0.1519411932629175, + "grad_norm": 1.2454429864883423, + "learning_rate": 4.7205748077835584e-05, + "loss": 4.9657, + "step": 25548 + }, + { + "epoch": 0.15194714054619848, + "grad_norm": 1.5179264545440674, + "learning_rate": 4.720553348927647e-05, + "loss": 5.2248, + "step": 25549 + }, + { + "epoch": 0.1519530878294795, + "grad_norm": 1.6204310655593872, + "learning_rate": 4.7205318892965636e-05, + "loss": 4.7349, + "step": 25550 + }, + { + "epoch": 0.1519590351127605, + "grad_norm": 1.6427180767059326, + "learning_rate": 4.7205104288903156e-05, + "loss": 4.9733, + "step": 25551 + }, + { + "epoch": 0.15196498239604148, + "grad_norm": 1.7110134363174438, + "learning_rate": 4.7204889677089104e-05, + "loss": 5.1714, + "step": 25552 + }, + { + "epoch": 0.1519709296793225, + "grad_norm": 1.6110901832580566, + "learning_rate": 4.7204675057523556e-05, + "loss": 5.409, + "step": 25553 + }, + { + "epoch": 0.15197687696260348, + "grad_norm": 1.7748627662658691, + "learning_rate": 4.720446043020658e-05, + "loss": 5.443, + "step": 25554 + }, + { + "epoch": 0.15198282424588447, + "grad_norm": 1.574576497077942, + "learning_rate": 4.720424579513826e-05, + "loss": 4.9988, + "step": 25555 + }, + { + "epoch": 0.15198877152916548, + "grad_norm": 1.4916949272155762, + "learning_rate": 4.720403115231867e-05, + "loss": 4.9242, + "step": 25556 + }, + { + "epoch": 0.15199471881244647, + "grad_norm": 1.4862215518951416, + "learning_rate": 4.7203816501747875e-05, + "loss": 5.2778, + "step": 25557 + }, + { + "epoch": 0.15200066609572746, + "grad_norm": 1.445859670639038, + "learning_rate": 4.720360184342597e-05, + "loss": 5.6821, + "step": 25558 + }, + { + "epoch": 0.15200661337900848, + "grad_norm": 1.5154931545257568, + "learning_rate": 4.7203387177353006e-05, + "loss": 5.1821, + "step": 25559 + }, + { + "epoch": 0.15201256066228946, + "grad_norm": 1.1950480937957764, + "learning_rate": 4.720317250352907e-05, + "loss": 5.55, + "step": 25560 + }, + { + "epoch": 0.15201850794557045, + "grad_norm": 1.4134416580200195, + "learning_rate": 4.720295782195423e-05, + "loss": 5.7252, + "step": 25561 + }, + { + "epoch": 0.15202445522885147, + "grad_norm": 1.5440611839294434, + "learning_rate": 4.720274313262858e-05, + "loss": 5.5527, + "step": 25562 + }, + { + "epoch": 0.15203040251213246, + "grad_norm": 1.3670108318328857, + "learning_rate": 4.720252843555217e-05, + "loss": 5.459, + "step": 25563 + }, + { + "epoch": 0.15203634979541344, + "grad_norm": 1.4591896533966064, + "learning_rate": 4.7202313730725094e-05, + "loss": 5.4654, + "step": 25564 + }, + { + "epoch": 0.15204229707869446, + "grad_norm": 1.675755500793457, + "learning_rate": 4.7202099018147414e-05, + "loss": 5.4915, + "step": 25565 + }, + { + "epoch": 0.15204824436197545, + "grad_norm": 1.9771230220794678, + "learning_rate": 4.720188429781922e-05, + "loss": 4.8577, + "step": 25566 + }, + { + "epoch": 0.15205419164525644, + "grad_norm": 1.3904792070388794, + "learning_rate": 4.720166956974057e-05, + "loss": 5.4445, + "step": 25567 + }, + { + "epoch": 0.15206013892853745, + "grad_norm": 1.4478521347045898, + "learning_rate": 4.720145483391155e-05, + "loss": 5.1729, + "step": 25568 + }, + { + "epoch": 0.15206608621181844, + "grad_norm": 2.138211250305176, + "learning_rate": 4.720124009033223e-05, + "loss": 4.0202, + "step": 25569 + }, + { + "epoch": 0.15207203349509943, + "grad_norm": 2.1613049507141113, + "learning_rate": 4.720102533900268e-05, + "loss": 4.0708, + "step": 25570 + }, + { + "epoch": 0.15207798077838044, + "grad_norm": 2.3467164039611816, + "learning_rate": 4.7200810579922996e-05, + "loss": 4.0428, + "step": 25571 + }, + { + "epoch": 0.15208392806166143, + "grad_norm": 2.0889739990234375, + "learning_rate": 4.720059581309323e-05, + "loss": 4.1653, + "step": 25572 + }, + { + "epoch": 0.15208987534494242, + "grad_norm": 1.611956238746643, + "learning_rate": 4.720038103851346e-05, + "loss": 5.3328, + "step": 25573 + }, + { + "epoch": 0.15209582262822344, + "grad_norm": 1.3318549394607544, + "learning_rate": 4.7200166256183776e-05, + "loss": 5.4102, + "step": 25574 + }, + { + "epoch": 0.15210176991150443, + "grad_norm": 1.674455165863037, + "learning_rate": 4.7199951466104234e-05, + "loss": 5.21, + "step": 25575 + }, + { + "epoch": 0.1521077171947854, + "grad_norm": 1.4780274629592896, + "learning_rate": 4.7199736668274924e-05, + "loss": 5.3385, + "step": 25576 + }, + { + "epoch": 0.15211366447806643, + "grad_norm": 1.7735114097595215, + "learning_rate": 4.719952186269592e-05, + "loss": 4.8768, + "step": 25577 + }, + { + "epoch": 0.15211961176134742, + "grad_norm": 1.6420248746871948, + "learning_rate": 4.719930704936728e-05, + "loss": 5.2584, + "step": 25578 + }, + { + "epoch": 0.1521255590446284, + "grad_norm": 1.970648169517517, + "learning_rate": 4.71990922282891e-05, + "loss": 4.4764, + "step": 25579 + }, + { + "epoch": 0.15213150632790942, + "grad_norm": 1.4318586587905884, + "learning_rate": 4.719887739946145e-05, + "loss": 5.5169, + "step": 25580 + }, + { + "epoch": 0.1521374536111904, + "grad_norm": 1.7637288570404053, + "learning_rate": 4.719866256288439e-05, + "loss": 5.1493, + "step": 25581 + }, + { + "epoch": 0.1521434008944714, + "grad_norm": 1.7159098386764526, + "learning_rate": 4.719844771855801e-05, + "loss": 5.3964, + "step": 25582 + }, + { + "epoch": 0.1521493481777524, + "grad_norm": 1.6556905508041382, + "learning_rate": 4.719823286648238e-05, + "loss": 5.3116, + "step": 25583 + }, + { + "epoch": 0.1521552954610334, + "grad_norm": 1.5177308320999146, + "learning_rate": 4.7198018006657584e-05, + "loss": 5.8963, + "step": 25584 + }, + { + "epoch": 0.1521612427443144, + "grad_norm": 1.960729718208313, + "learning_rate": 4.719780313908368e-05, + "loss": 5.266, + "step": 25585 + }, + { + "epoch": 0.1521671900275954, + "grad_norm": 1.6893891096115112, + "learning_rate": 4.719758826376076e-05, + "loss": 5.3618, + "step": 25586 + }, + { + "epoch": 0.1521731373108764, + "grad_norm": 1.5606249570846558, + "learning_rate": 4.719737338068889e-05, + "loss": 5.8684, + "step": 25587 + }, + { + "epoch": 0.15217908459415738, + "grad_norm": 1.6435186862945557, + "learning_rate": 4.7197158489868143e-05, + "loss": 4.9082, + "step": 25588 + }, + { + "epoch": 0.1521850318774384, + "grad_norm": 1.9077845811843872, + "learning_rate": 4.71969435912986e-05, + "loss": 4.0132, + "step": 25589 + }, + { + "epoch": 0.1521909791607194, + "grad_norm": 1.4427006244659424, + "learning_rate": 4.719672868498034e-05, + "loss": 5.5848, + "step": 25590 + }, + { + "epoch": 0.15219692644400037, + "grad_norm": 1.671826958656311, + "learning_rate": 4.719651377091342e-05, + "loss": 5.0797, + "step": 25591 + }, + { + "epoch": 0.1522028737272814, + "grad_norm": 1.8073980808258057, + "learning_rate": 4.719629884909793e-05, + "loss": 3.8879, + "step": 25592 + }, + { + "epoch": 0.15220882101056238, + "grad_norm": 1.8267574310302734, + "learning_rate": 4.719608391953394e-05, + "loss": 3.8104, + "step": 25593 + }, + { + "epoch": 0.15221476829384337, + "grad_norm": 1.8598294258117676, + "learning_rate": 4.7195868982221526e-05, + "loss": 3.6587, + "step": 25594 + }, + { + "epoch": 0.15222071557712438, + "grad_norm": 1.705465316772461, + "learning_rate": 4.7195654037160765e-05, + "loss": 3.9886, + "step": 25595 + }, + { + "epoch": 0.15222666286040537, + "grad_norm": 1.8253175020217896, + "learning_rate": 4.7195439084351734e-05, + "loss": 3.9031, + "step": 25596 + }, + { + "epoch": 0.15223261014368636, + "grad_norm": 1.718245506286621, + "learning_rate": 4.71952241237945e-05, + "loss": 4.2814, + "step": 25597 + }, + { + "epoch": 0.15223855742696737, + "grad_norm": 1.7115817070007324, + "learning_rate": 4.719500915548914e-05, + "loss": 4.748, + "step": 25598 + }, + { + "epoch": 0.15224450471024836, + "grad_norm": 1.53532874584198, + "learning_rate": 4.719479417943574e-05, + "loss": 5.499, + "step": 25599 + }, + { + "epoch": 0.15225045199352935, + "grad_norm": 1.854274868965149, + "learning_rate": 4.719457919563436e-05, + "loss": 4.1188, + "step": 25600 + }, + { + "epoch": 0.15225639927681037, + "grad_norm": 2.001619338989258, + "learning_rate": 4.7194364204085085e-05, + "loss": 3.89, + "step": 25601 + }, + { + "epoch": 0.15226234656009136, + "grad_norm": 1.9772802591323853, + "learning_rate": 4.7194149204787986e-05, + "loss": 3.8764, + "step": 25602 + }, + { + "epoch": 0.15226829384337234, + "grad_norm": 1.9361356496810913, + "learning_rate": 4.719393419774314e-05, + "loss": 5.0285, + "step": 25603 + }, + { + "epoch": 0.15227424112665333, + "grad_norm": 1.6824191808700562, + "learning_rate": 4.719371918295061e-05, + "loss": 5.2847, + "step": 25604 + }, + { + "epoch": 0.15228018840993435, + "grad_norm": 2.423736095428467, + "learning_rate": 4.7193504160410495e-05, + "loss": 4.087, + "step": 25605 + }, + { + "epoch": 0.15228613569321534, + "grad_norm": 1.711818814277649, + "learning_rate": 4.719328913012285e-05, + "loss": 5.0702, + "step": 25606 + }, + { + "epoch": 0.15229208297649632, + "grad_norm": 2.406665325164795, + "learning_rate": 4.7193074092087765e-05, + "loss": 4.1674, + "step": 25607 + }, + { + "epoch": 0.15229803025977734, + "grad_norm": 2.0252084732055664, + "learning_rate": 4.71928590463053e-05, + "loss": 3.9202, + "step": 25608 + }, + { + "epoch": 0.15230397754305833, + "grad_norm": 1.6908705234527588, + "learning_rate": 4.7192643992775534e-05, + "loss": 4.5446, + "step": 25609 + }, + { + "epoch": 0.15230992482633932, + "grad_norm": 1.2706576585769653, + "learning_rate": 4.719242893149855e-05, + "loss": 5.6578, + "step": 25610 + }, + { + "epoch": 0.15231587210962033, + "grad_norm": 1.380682349205017, + "learning_rate": 4.719221386247442e-05, + "loss": 5.6256, + "step": 25611 + }, + { + "epoch": 0.15232181939290132, + "grad_norm": 1.6104844808578491, + "learning_rate": 4.7191998785703214e-05, + "loss": 5.5271, + "step": 25612 + }, + { + "epoch": 0.1523277666761823, + "grad_norm": 1.5654959678649902, + "learning_rate": 4.719178370118502e-05, + "loss": 5.0767, + "step": 25613 + }, + { + "epoch": 0.15233371395946332, + "grad_norm": 1.7980438470840454, + "learning_rate": 4.719156860891989e-05, + "loss": 4.6667, + "step": 25614 + }, + { + "epoch": 0.1523396612427443, + "grad_norm": 1.6443228721618652, + "learning_rate": 4.719135350890792e-05, + "loss": 4.2763, + "step": 25615 + }, + { + "epoch": 0.1523456085260253, + "grad_norm": 1.442205548286438, + "learning_rate": 4.719113840114918e-05, + "loss": 5.0442, + "step": 25616 + }, + { + "epoch": 0.15235155580930632, + "grad_norm": 1.5215251445770264, + "learning_rate": 4.719092328564374e-05, + "loss": 5.2175, + "step": 25617 + }, + { + "epoch": 0.1523575030925873, + "grad_norm": 1.4463436603546143, + "learning_rate": 4.7190708162391677e-05, + "loss": 5.6153, + "step": 25618 + }, + { + "epoch": 0.1523634503758683, + "grad_norm": 1.624923825263977, + "learning_rate": 4.719049303139307e-05, + "loss": 5.4211, + "step": 25619 + }, + { + "epoch": 0.1523693976591493, + "grad_norm": 1.5821541547775269, + "learning_rate": 4.719027789264799e-05, + "loss": 5.7905, + "step": 25620 + }, + { + "epoch": 0.1523753449424303, + "grad_norm": 1.6683502197265625, + "learning_rate": 4.719006274615651e-05, + "loss": 5.112, + "step": 25621 + }, + { + "epoch": 0.15238129222571128, + "grad_norm": 1.3617998361587524, + "learning_rate": 4.7189847591918714e-05, + "loss": 5.3799, + "step": 25622 + }, + { + "epoch": 0.1523872395089923, + "grad_norm": 1.5106703042984009, + "learning_rate": 4.718963242993466e-05, + "loss": 4.9833, + "step": 25623 + }, + { + "epoch": 0.1523931867922733, + "grad_norm": 1.7020819187164307, + "learning_rate": 4.718941726020445e-05, + "loss": 4.2403, + "step": 25624 + }, + { + "epoch": 0.15239913407555428, + "grad_norm": 1.5678812265396118, + "learning_rate": 4.7189202082728133e-05, + "loss": 5.0985, + "step": 25625 + }, + { + "epoch": 0.1524050813588353, + "grad_norm": 1.4727619886398315, + "learning_rate": 4.71889868975058e-05, + "loss": 4.9088, + "step": 25626 + }, + { + "epoch": 0.15241102864211628, + "grad_norm": 1.5460275411605835, + "learning_rate": 4.7188771704537515e-05, + "loss": 5.2766, + "step": 25627 + }, + { + "epoch": 0.15241697592539727, + "grad_norm": 1.5763301849365234, + "learning_rate": 4.7188556503823366e-05, + "loss": 4.9134, + "step": 25628 + }, + { + "epoch": 0.15242292320867828, + "grad_norm": 1.8980252742767334, + "learning_rate": 4.718834129536341e-05, + "loss": 4.9331, + "step": 25629 + }, + { + "epoch": 0.15242887049195927, + "grad_norm": 2.768523693084717, + "learning_rate": 4.7188126079157744e-05, + "loss": 4.3952, + "step": 25630 + }, + { + "epoch": 0.15243481777524026, + "grad_norm": 2.6490437984466553, + "learning_rate": 4.718791085520643e-05, + "loss": 4.1387, + "step": 25631 + }, + { + "epoch": 0.15244076505852128, + "grad_norm": 1.806143879890442, + "learning_rate": 4.718769562350955e-05, + "loss": 4.7686, + "step": 25632 + }, + { + "epoch": 0.15244671234180227, + "grad_norm": 1.6871095895767212, + "learning_rate": 4.718748038406717e-05, + "loss": 5.3937, + "step": 25633 + }, + { + "epoch": 0.15245265962508325, + "grad_norm": 2.2100014686584473, + "learning_rate": 4.7187265136879364e-05, + "loss": 4.7869, + "step": 25634 + }, + { + "epoch": 0.15245860690836427, + "grad_norm": 1.978220820426941, + "learning_rate": 4.7187049881946224e-05, + "loss": 4.4701, + "step": 25635 + }, + { + "epoch": 0.15246455419164526, + "grad_norm": 1.8031092882156372, + "learning_rate": 4.718683461926781e-05, + "loss": 4.5107, + "step": 25636 + }, + { + "epoch": 0.15247050147492625, + "grad_norm": 1.795417308807373, + "learning_rate": 4.7186619348844196e-05, + "loss": 5.2659, + "step": 25637 + }, + { + "epoch": 0.15247644875820726, + "grad_norm": 2.3051810264587402, + "learning_rate": 4.718640407067547e-05, + "loss": 4.5413, + "step": 25638 + }, + { + "epoch": 0.15248239604148825, + "grad_norm": 1.983340859413147, + "learning_rate": 4.71861887847617e-05, + "loss": 4.5167, + "step": 25639 + }, + { + "epoch": 0.15248834332476924, + "grad_norm": 1.7354977130889893, + "learning_rate": 4.718597349110295e-05, + "loss": 4.5704, + "step": 25640 + }, + { + "epoch": 0.15249429060805025, + "grad_norm": 1.9091737270355225, + "learning_rate": 4.7185758189699313e-05, + "loss": 4.4381, + "step": 25641 + }, + { + "epoch": 0.15250023789133124, + "grad_norm": 1.8753962516784668, + "learning_rate": 4.718554288055086e-05, + "loss": 4.445, + "step": 25642 + }, + { + "epoch": 0.15250618517461223, + "grad_norm": 1.7315021753311157, + "learning_rate": 4.718532756365765e-05, + "loss": 4.7802, + "step": 25643 + }, + { + "epoch": 0.15251213245789325, + "grad_norm": 1.4017493724822998, + "learning_rate": 4.718511223901979e-05, + "loss": 5.3923, + "step": 25644 + }, + { + "epoch": 0.15251807974117423, + "grad_norm": 1.8367207050323486, + "learning_rate": 4.7184896906637326e-05, + "loss": 4.6229, + "step": 25645 + }, + { + "epoch": 0.15252402702445522, + "grad_norm": 2.3250296115875244, + "learning_rate": 4.718468156651035e-05, + "loss": 4.6332, + "step": 25646 + }, + { + "epoch": 0.15252997430773624, + "grad_norm": 2.047855854034424, + "learning_rate": 4.7184466218638925e-05, + "loss": 4.5316, + "step": 25647 + }, + { + "epoch": 0.15253592159101723, + "grad_norm": 1.9817044734954834, + "learning_rate": 4.7184250863023125e-05, + "loss": 4.3888, + "step": 25648 + }, + { + "epoch": 0.15254186887429821, + "grad_norm": 1.889957308769226, + "learning_rate": 4.718403549966305e-05, + "loss": 4.6436, + "step": 25649 + }, + { + "epoch": 0.15254781615757923, + "grad_norm": 1.4799065589904785, + "learning_rate": 4.718382012855874e-05, + "loss": 4.7965, + "step": 25650 + }, + { + "epoch": 0.15255376344086022, + "grad_norm": 2.046947717666626, + "learning_rate": 4.7183604749710296e-05, + "loss": 4.3206, + "step": 25651 + }, + { + "epoch": 0.1525597107241412, + "grad_norm": 1.970746636390686, + "learning_rate": 4.718338936311778e-05, + "loss": 4.3668, + "step": 25652 + }, + { + "epoch": 0.15256565800742222, + "grad_norm": 1.889931321144104, + "learning_rate": 4.718317396878128e-05, + "loss": 4.3436, + "step": 25653 + }, + { + "epoch": 0.1525716052907032, + "grad_norm": 2.0069503784179688, + "learning_rate": 4.7182958566700865e-05, + "loss": 4.5258, + "step": 25654 + }, + { + "epoch": 0.1525775525739842, + "grad_norm": 2.222224712371826, + "learning_rate": 4.7182743156876596e-05, + "loss": 4.362, + "step": 25655 + }, + { + "epoch": 0.15258349985726521, + "grad_norm": 2.2478747367858887, + "learning_rate": 4.718252773930857e-05, + "loss": 4.7401, + "step": 25656 + }, + { + "epoch": 0.1525894471405462, + "grad_norm": 2.224696636199951, + "learning_rate": 4.718231231399685e-05, + "loss": 4.5413, + "step": 25657 + }, + { + "epoch": 0.1525953944238272, + "grad_norm": 1.9385725259780884, + "learning_rate": 4.718209688094152e-05, + "loss": 4.7279, + "step": 25658 + }, + { + "epoch": 0.1526013417071082, + "grad_norm": 2.030127763748169, + "learning_rate": 4.718188144014264e-05, + "loss": 4.4943, + "step": 25659 + }, + { + "epoch": 0.1526072889903892, + "grad_norm": 2.115994453430176, + "learning_rate": 4.7181665991600296e-05, + "loss": 4.5709, + "step": 25660 + }, + { + "epoch": 0.15261323627367018, + "grad_norm": 1.6957606077194214, + "learning_rate": 4.718145053531456e-05, + "loss": 4.8779, + "step": 25661 + }, + { + "epoch": 0.15261918355695117, + "grad_norm": 1.9567986726760864, + "learning_rate": 4.718123507128551e-05, + "loss": 4.5541, + "step": 25662 + }, + { + "epoch": 0.1526251308402322, + "grad_norm": 2.147771120071411, + "learning_rate": 4.718101959951323e-05, + "loss": 4.5141, + "step": 25663 + }, + { + "epoch": 0.15263107812351318, + "grad_norm": 2.1374590396881104, + "learning_rate": 4.7180804119997774e-05, + "loss": 4.3474, + "step": 25664 + }, + { + "epoch": 0.15263702540679416, + "grad_norm": 2.060826539993286, + "learning_rate": 4.718058863273923e-05, + "loss": 4.4178, + "step": 25665 + }, + { + "epoch": 0.15264297269007518, + "grad_norm": 1.9931002855300903, + "learning_rate": 4.7180373137737673e-05, + "loss": 4.3213, + "step": 25666 + }, + { + "epoch": 0.15264891997335617, + "grad_norm": 1.3702372312545776, + "learning_rate": 4.718015763499318e-05, + "loss": 5.0551, + "step": 25667 + }, + { + "epoch": 0.15265486725663716, + "grad_norm": 1.8524867296218872, + "learning_rate": 4.7179942124505814e-05, + "loss": 5.0618, + "step": 25668 + }, + { + "epoch": 0.15266081453991817, + "grad_norm": 1.876756191253662, + "learning_rate": 4.717972660627567e-05, + "loss": 4.2719, + "step": 25669 + }, + { + "epoch": 0.15266676182319916, + "grad_norm": 2.0334908962249756, + "learning_rate": 4.7179511080302804e-05, + "loss": 4.5764, + "step": 25670 + }, + { + "epoch": 0.15267270910648015, + "grad_norm": 2.554891347885132, + "learning_rate": 4.717929554658731e-05, + "loss": 4.6706, + "step": 25671 + }, + { + "epoch": 0.15267865638976116, + "grad_norm": 2.032592296600342, + "learning_rate": 4.717908000512925e-05, + "loss": 4.9648, + "step": 25672 + }, + { + "epoch": 0.15268460367304215, + "grad_norm": 1.6153349876403809, + "learning_rate": 4.7178864455928696e-05, + "loss": 5.2224, + "step": 25673 + }, + { + "epoch": 0.15269055095632314, + "grad_norm": 2.0942156314849854, + "learning_rate": 4.7178648898985734e-05, + "loss": 4.6427, + "step": 25674 + }, + { + "epoch": 0.15269649823960416, + "grad_norm": 1.9911080598831177, + "learning_rate": 4.717843333430043e-05, + "loss": 4.3348, + "step": 25675 + }, + { + "epoch": 0.15270244552288514, + "grad_norm": 2.017202377319336, + "learning_rate": 4.7178217761872866e-05, + "loss": 4.5306, + "step": 25676 + }, + { + "epoch": 0.15270839280616613, + "grad_norm": 1.9934179782867432, + "learning_rate": 4.7178002181703116e-05, + "loss": 4.7443, + "step": 25677 + }, + { + "epoch": 0.15271434008944715, + "grad_norm": 1.9597182273864746, + "learning_rate": 4.717778659379126e-05, + "loss": 4.5526, + "step": 25678 + }, + { + "epoch": 0.15272028737272814, + "grad_norm": 1.3593907356262207, + "learning_rate": 4.717757099813737e-05, + "loss": 5.5802, + "step": 25679 + }, + { + "epoch": 0.15272623465600912, + "grad_norm": 2.0012102127075195, + "learning_rate": 4.717735539474151e-05, + "loss": 5.0289, + "step": 25680 + }, + { + "epoch": 0.15273218193929014, + "grad_norm": 1.5621830224990845, + "learning_rate": 4.7177139783603765e-05, + "loss": 4.9388, + "step": 25681 + }, + { + "epoch": 0.15273812922257113, + "grad_norm": 1.502643346786499, + "learning_rate": 4.717692416472421e-05, + "loss": 5.3317, + "step": 25682 + }, + { + "epoch": 0.15274407650585212, + "grad_norm": 1.6496142148971558, + "learning_rate": 4.717670853810292e-05, + "loss": 5.9642, + "step": 25683 + }, + { + "epoch": 0.15275002378913313, + "grad_norm": 1.7263692617416382, + "learning_rate": 4.717649290373997e-05, + "loss": 4.9383, + "step": 25684 + }, + { + "epoch": 0.15275597107241412, + "grad_norm": 1.4914296865463257, + "learning_rate": 4.7176277261635434e-05, + "loss": 5.2599, + "step": 25685 + }, + { + "epoch": 0.1527619183556951, + "grad_norm": 1.3947960138320923, + "learning_rate": 4.71760616117894e-05, + "loss": 5.3177, + "step": 25686 + }, + { + "epoch": 0.15276786563897612, + "grad_norm": 1.6703267097473145, + "learning_rate": 4.717584595420192e-05, + "loss": 5.0309, + "step": 25687 + }, + { + "epoch": 0.1527738129222571, + "grad_norm": 1.622600793838501, + "learning_rate": 4.7175630288873083e-05, + "loss": 5.2554, + "step": 25688 + }, + { + "epoch": 0.1527797602055381, + "grad_norm": 1.678843379020691, + "learning_rate": 4.717541461580297e-05, + "loss": 5.012, + "step": 25689 + }, + { + "epoch": 0.15278570748881912, + "grad_norm": 2.2063186168670654, + "learning_rate": 4.717519893499164e-05, + "loss": 4.4479, + "step": 25690 + }, + { + "epoch": 0.1527916547721001, + "grad_norm": 2.0667500495910645, + "learning_rate": 4.717498324643918e-05, + "loss": 4.7081, + "step": 25691 + }, + { + "epoch": 0.1527976020553811, + "grad_norm": 2.192436695098877, + "learning_rate": 4.717476755014566e-05, + "loss": 4.7662, + "step": 25692 + }, + { + "epoch": 0.1528035493386621, + "grad_norm": 1.4742953777313232, + "learning_rate": 4.7174551846111165e-05, + "loss": 5.5788, + "step": 25693 + }, + { + "epoch": 0.1528094966219431, + "grad_norm": 1.7715102434158325, + "learning_rate": 4.7174336134335765e-05, + "loss": 5.203, + "step": 25694 + }, + { + "epoch": 0.15281544390522409, + "grad_norm": 2.406721353530884, + "learning_rate": 4.717412041481952e-05, + "loss": 4.7807, + "step": 25695 + }, + { + "epoch": 0.1528213911885051, + "grad_norm": 1.765756607055664, + "learning_rate": 4.7173904687562525e-05, + "loss": 5.2479, + "step": 25696 + }, + { + "epoch": 0.1528273384717861, + "grad_norm": 1.6135215759277344, + "learning_rate": 4.7173688952564856e-05, + "loss": 5.4787, + "step": 25697 + }, + { + "epoch": 0.15283328575506708, + "grad_norm": 1.5617319345474243, + "learning_rate": 4.7173473209826566e-05, + "loss": 5.02, + "step": 25698 + }, + { + "epoch": 0.1528392330383481, + "grad_norm": 1.4704324007034302, + "learning_rate": 4.7173257459347756e-05, + "loss": 5.1675, + "step": 25699 + }, + { + "epoch": 0.15284518032162908, + "grad_norm": 1.8787862062454224, + "learning_rate": 4.7173041701128496e-05, + "loss": 4.7247, + "step": 25700 + }, + { + "epoch": 0.15285112760491007, + "grad_norm": 3.8647372722625732, + "learning_rate": 4.7172825935168845e-05, + "loss": 3.5335, + "step": 25701 + }, + { + "epoch": 0.15285707488819109, + "grad_norm": 3.6721291542053223, + "learning_rate": 4.717261016146889e-05, + "loss": 2.8843, + "step": 25702 + }, + { + "epoch": 0.15286302217147207, + "grad_norm": 2.0848543643951416, + "learning_rate": 4.717239438002872e-05, + "loss": 4.4863, + "step": 25703 + }, + { + "epoch": 0.15286896945475306, + "grad_norm": 1.7783108949661255, + "learning_rate": 4.717217859084838e-05, + "loss": 5.2903, + "step": 25704 + }, + { + "epoch": 0.15287491673803408, + "grad_norm": 2.006303548812866, + "learning_rate": 4.717196279392797e-05, + "loss": 4.3923, + "step": 25705 + }, + { + "epoch": 0.15288086402131507, + "grad_norm": 2.4214632511138916, + "learning_rate": 4.7171746989267553e-05, + "loss": 3.3506, + "step": 25706 + }, + { + "epoch": 0.15288681130459605, + "grad_norm": 2.8976924419403076, + "learning_rate": 4.7171531176867214e-05, + "loss": 3.2211, + "step": 25707 + }, + { + "epoch": 0.15289275858787707, + "grad_norm": 3.2015345096588135, + "learning_rate": 4.717131535672702e-05, + "loss": 2.8205, + "step": 25708 + }, + { + "epoch": 0.15289870587115806, + "grad_norm": 3.559465169906616, + "learning_rate": 4.7171099528847044e-05, + "loss": 2.8882, + "step": 25709 + }, + { + "epoch": 0.15290465315443905, + "grad_norm": 3.3753960132598877, + "learning_rate": 4.717088369322737e-05, + "loss": 2.6752, + "step": 25710 + }, + { + "epoch": 0.15291060043772006, + "grad_norm": 2.129783868789673, + "learning_rate": 4.717066784986806e-05, + "loss": 3.9983, + "step": 25711 + }, + { + "epoch": 0.15291654772100105, + "grad_norm": 1.797956943511963, + "learning_rate": 4.7170451998769214e-05, + "loss": 4.8075, + "step": 25712 + }, + { + "epoch": 0.15292249500428204, + "grad_norm": 3.3450467586517334, + "learning_rate": 4.717023613993089e-05, + "loss": 4.177, + "step": 25713 + }, + { + "epoch": 0.15292844228756305, + "grad_norm": 2.303511381149292, + "learning_rate": 4.7170020273353164e-05, + "loss": 4.471, + "step": 25714 + }, + { + "epoch": 0.15293438957084404, + "grad_norm": 1.4113452434539795, + "learning_rate": 4.7169804399036105e-05, + "loss": 5.4846, + "step": 25715 + }, + { + "epoch": 0.15294033685412503, + "grad_norm": 1.7091588973999023, + "learning_rate": 4.71695885169798e-05, + "loss": 4.8856, + "step": 25716 + }, + { + "epoch": 0.15294628413740605, + "grad_norm": 2.783010244369507, + "learning_rate": 4.7169372627184326e-05, + "loss": 4.3426, + "step": 25717 + }, + { + "epoch": 0.15295223142068703, + "grad_norm": 1.4658305644989014, + "learning_rate": 4.716915672964975e-05, + "loss": 5.3191, + "step": 25718 + }, + { + "epoch": 0.15295817870396802, + "grad_norm": 1.2862242460250854, + "learning_rate": 4.716894082437615e-05, + "loss": 5.3939, + "step": 25719 + }, + { + "epoch": 0.152964125987249, + "grad_norm": 1.4098438024520874, + "learning_rate": 4.71687249113636e-05, + "loss": 5.4493, + "step": 25720 + }, + { + "epoch": 0.15297007327053003, + "grad_norm": 1.4778176546096802, + "learning_rate": 4.7168508990612183e-05, + "loss": 5.2679, + "step": 25721 + }, + { + "epoch": 0.15297602055381102, + "grad_norm": 1.5448487997055054, + "learning_rate": 4.716829306212196e-05, + "loss": 5.1446, + "step": 25722 + }, + { + "epoch": 0.152981967837092, + "grad_norm": 1.3638159036636353, + "learning_rate": 4.716807712589302e-05, + "loss": 5.1152, + "step": 25723 + }, + { + "epoch": 0.15298791512037302, + "grad_norm": 1.7068208456039429, + "learning_rate": 4.716786118192543e-05, + "loss": 5.1389, + "step": 25724 + }, + { + "epoch": 0.152993862403654, + "grad_norm": 1.8191746473312378, + "learning_rate": 4.716764523021928e-05, + "loss": 5.2305, + "step": 25725 + }, + { + "epoch": 0.152999809686935, + "grad_norm": 1.6970409154891968, + "learning_rate": 4.716742927077462e-05, + "loss": 5.1097, + "step": 25726 + }, + { + "epoch": 0.153005756970216, + "grad_norm": 1.5453951358795166, + "learning_rate": 4.716721330359155e-05, + "loss": 5.2614, + "step": 25727 + }, + { + "epoch": 0.153011704253497, + "grad_norm": 1.5335613489151, + "learning_rate": 4.7166997328670125e-05, + "loss": 4.8482, + "step": 25728 + }, + { + "epoch": 0.153017651536778, + "grad_norm": 1.6566481590270996, + "learning_rate": 4.716678134601044e-05, + "loss": 4.9346, + "step": 25729 + }, + { + "epoch": 0.153023598820059, + "grad_norm": 1.7899013757705688, + "learning_rate": 4.716656535561256e-05, + "loss": 5.0877, + "step": 25730 + }, + { + "epoch": 0.15302954610334, + "grad_norm": 2.1659116744995117, + "learning_rate": 4.716634935747655e-05, + "loss": 4.6431, + "step": 25731 + }, + { + "epoch": 0.15303549338662098, + "grad_norm": 1.914923071861267, + "learning_rate": 4.71661333516025e-05, + "loss": 4.9001, + "step": 25732 + }, + { + "epoch": 0.153041440669902, + "grad_norm": 1.9240248203277588, + "learning_rate": 4.7165917337990495e-05, + "loss": 4.7709, + "step": 25733 + }, + { + "epoch": 0.15304738795318298, + "grad_norm": 1.6446973085403442, + "learning_rate": 4.7165701316640585e-05, + "loss": 4.9816, + "step": 25734 + }, + { + "epoch": 0.15305333523646397, + "grad_norm": 1.7971409559249878, + "learning_rate": 4.716548528755286e-05, + "loss": 5.0082, + "step": 25735 + }, + { + "epoch": 0.153059282519745, + "grad_norm": 1.3862462043762207, + "learning_rate": 4.716526925072739e-05, + "loss": 5.0245, + "step": 25736 + }, + { + "epoch": 0.15306522980302598, + "grad_norm": 2.157005548477173, + "learning_rate": 4.716505320616425e-05, + "loss": 4.573, + "step": 25737 + }, + { + "epoch": 0.15307117708630696, + "grad_norm": 2.4460175037384033, + "learning_rate": 4.716483715386354e-05, + "loss": 4.0872, + "step": 25738 + }, + { + "epoch": 0.15307712436958798, + "grad_norm": 1.7140263319015503, + "learning_rate": 4.7164621093825294e-05, + "loss": 4.5421, + "step": 25739 + }, + { + "epoch": 0.15308307165286897, + "grad_norm": 1.684173583984375, + "learning_rate": 4.7164405026049616e-05, + "loss": 4.5274, + "step": 25740 + }, + { + "epoch": 0.15308901893614996, + "grad_norm": 1.9424148797988892, + "learning_rate": 4.716418895053657e-05, + "loss": 4.2669, + "step": 25741 + }, + { + "epoch": 0.15309496621943097, + "grad_norm": 1.576071858406067, + "learning_rate": 4.716397286728623e-05, + "loss": 4.9536, + "step": 25742 + }, + { + "epoch": 0.15310091350271196, + "grad_norm": 1.8285739421844482, + "learning_rate": 4.7163756776298686e-05, + "loss": 4.9322, + "step": 25743 + }, + { + "epoch": 0.15310686078599295, + "grad_norm": 2.058610200881958, + "learning_rate": 4.7163540677574004e-05, + "loss": 4.4565, + "step": 25744 + }, + { + "epoch": 0.15311280806927396, + "grad_norm": 2.106513261795044, + "learning_rate": 4.716332457111226e-05, + "loss": 4.0534, + "step": 25745 + }, + { + "epoch": 0.15311875535255495, + "grad_norm": 1.821857213973999, + "learning_rate": 4.716310845691351e-05, + "loss": 4.5302, + "step": 25746 + }, + { + "epoch": 0.15312470263583594, + "grad_norm": 1.5679446458816528, + "learning_rate": 4.716289233497787e-05, + "loss": 4.9452, + "step": 25747 + }, + { + "epoch": 0.15313064991911696, + "grad_norm": 1.612362027168274, + "learning_rate": 4.716267620530538e-05, + "loss": 5.0074, + "step": 25748 + }, + { + "epoch": 0.15313659720239794, + "grad_norm": 1.6841483116149902, + "learning_rate": 4.716246006789613e-05, + "loss": 5.0202, + "step": 25749 + }, + { + "epoch": 0.15314254448567893, + "grad_norm": 1.7533215284347534, + "learning_rate": 4.7162243922750196e-05, + "loss": 4.6901, + "step": 25750 + }, + { + "epoch": 0.15314849176895995, + "grad_norm": 2.2937755584716797, + "learning_rate": 4.716202776986766e-05, + "loss": 4.0934, + "step": 25751 + }, + { + "epoch": 0.15315443905224094, + "grad_norm": 2.413012742996216, + "learning_rate": 4.7161811609248576e-05, + "loss": 4.0128, + "step": 25752 + }, + { + "epoch": 0.15316038633552193, + "grad_norm": 2.481255054473877, + "learning_rate": 4.7161595440893035e-05, + "loss": 4.4044, + "step": 25753 + }, + { + "epoch": 0.15316633361880294, + "grad_norm": 1.8999838829040527, + "learning_rate": 4.7161379264801115e-05, + "loss": 4.2328, + "step": 25754 + }, + { + "epoch": 0.15317228090208393, + "grad_norm": 2.3453261852264404, + "learning_rate": 4.7161163080972884e-05, + "loss": 4.283, + "step": 25755 + }, + { + "epoch": 0.15317822818536492, + "grad_norm": 1.6733421087265015, + "learning_rate": 4.716094688940842e-05, + "loss": 4.7254, + "step": 25756 + }, + { + "epoch": 0.15318417546864593, + "grad_norm": 1.5302658081054688, + "learning_rate": 4.7160730690107794e-05, + "loss": 4.9403, + "step": 25757 + }, + { + "epoch": 0.15319012275192692, + "grad_norm": 1.6725687980651855, + "learning_rate": 4.716051448307109e-05, + "loss": 4.699, + "step": 25758 + }, + { + "epoch": 0.1531960700352079, + "grad_norm": 2.067267894744873, + "learning_rate": 4.716029826829839e-05, + "loss": 4.0136, + "step": 25759 + }, + { + "epoch": 0.15320201731848893, + "grad_norm": 2.2834413051605225, + "learning_rate": 4.716008204578975e-05, + "loss": 4.1914, + "step": 25760 + }, + { + "epoch": 0.1532079646017699, + "grad_norm": 1.9917986392974854, + "learning_rate": 4.715986581554524e-05, + "loss": 4.2899, + "step": 25761 + }, + { + "epoch": 0.1532139118850509, + "grad_norm": 1.6681551933288574, + "learning_rate": 4.715964957756497e-05, + "loss": 4.7627, + "step": 25762 + }, + { + "epoch": 0.15321985916833192, + "grad_norm": 2.005560874938965, + "learning_rate": 4.715943333184899e-05, + "loss": 4.1686, + "step": 25763 + }, + { + "epoch": 0.1532258064516129, + "grad_norm": 1.7380902767181396, + "learning_rate": 4.715921707839738e-05, + "loss": 4.4208, + "step": 25764 + }, + { + "epoch": 0.1532317537348939, + "grad_norm": 2.6380422115325928, + "learning_rate": 4.7159000817210205e-05, + "loss": 4.9835, + "step": 25765 + }, + { + "epoch": 0.1532377010181749, + "grad_norm": 2.4079694747924805, + "learning_rate": 4.715878454828757e-05, + "loss": 4.5758, + "step": 25766 + }, + { + "epoch": 0.1532436483014559, + "grad_norm": 1.7469686269760132, + "learning_rate": 4.715856827162952e-05, + "loss": 4.8894, + "step": 25767 + }, + { + "epoch": 0.1532495955847369, + "grad_norm": 1.7569485902786255, + "learning_rate": 4.715835198723615e-05, + "loss": 5.0324, + "step": 25768 + }, + { + "epoch": 0.1532555428680179, + "grad_norm": 1.9182626008987427, + "learning_rate": 4.715813569510752e-05, + "loss": 4.2196, + "step": 25769 + }, + { + "epoch": 0.1532614901512989, + "grad_norm": 1.8836737871170044, + "learning_rate": 4.715791939524372e-05, + "loss": 4.3797, + "step": 25770 + }, + { + "epoch": 0.15326743743457988, + "grad_norm": 1.5073226690292358, + "learning_rate": 4.7157703087644816e-05, + "loss": 4.7137, + "step": 25771 + }, + { + "epoch": 0.1532733847178609, + "grad_norm": 1.764160394668579, + "learning_rate": 4.715748677231089e-05, + "loss": 4.784, + "step": 25772 + }, + { + "epoch": 0.15327933200114188, + "grad_norm": 1.5940345525741577, + "learning_rate": 4.715727044924201e-05, + "loss": 4.7749, + "step": 25773 + }, + { + "epoch": 0.15328527928442287, + "grad_norm": 1.9873480796813965, + "learning_rate": 4.715705411843826e-05, + "loss": 4.7084, + "step": 25774 + }, + { + "epoch": 0.1532912265677039, + "grad_norm": 2.712846279144287, + "learning_rate": 4.715683777989971e-05, + "loss": 4.8726, + "step": 25775 + }, + { + "epoch": 0.15329717385098487, + "grad_norm": 1.9030331373214722, + "learning_rate": 4.7156621433626434e-05, + "loss": 4.6475, + "step": 25776 + }, + { + "epoch": 0.15330312113426586, + "grad_norm": 1.9939697980880737, + "learning_rate": 4.715640507961852e-05, + "loss": 4.8202, + "step": 25777 + }, + { + "epoch": 0.15330906841754685, + "grad_norm": 1.5398924350738525, + "learning_rate": 4.715618871787602e-05, + "loss": 4.9801, + "step": 25778 + }, + { + "epoch": 0.15331501570082787, + "grad_norm": 1.5413012504577637, + "learning_rate": 4.7155972348399034e-05, + "loss": 4.7795, + "step": 25779 + }, + { + "epoch": 0.15332096298410886, + "grad_norm": 1.6835294961929321, + "learning_rate": 4.7155755971187625e-05, + "loss": 4.5937, + "step": 25780 + }, + { + "epoch": 0.15332691026738984, + "grad_norm": 1.4007564783096313, + "learning_rate": 4.715553958624187e-05, + "loss": 5.5904, + "step": 25781 + }, + { + "epoch": 0.15333285755067086, + "grad_norm": 1.6113498210906982, + "learning_rate": 4.715532319356184e-05, + "loss": 5.1083, + "step": 25782 + }, + { + "epoch": 0.15333880483395185, + "grad_norm": 1.9218871593475342, + "learning_rate": 4.715510679314762e-05, + "loss": 4.6371, + "step": 25783 + }, + { + "epoch": 0.15334475211723284, + "grad_norm": 1.4686646461486816, + "learning_rate": 4.715489038499928e-05, + "loss": 5.0536, + "step": 25784 + }, + { + "epoch": 0.15335069940051385, + "grad_norm": 1.5875191688537598, + "learning_rate": 4.71546739691169e-05, + "loss": 4.5976, + "step": 25785 + }, + { + "epoch": 0.15335664668379484, + "grad_norm": 1.5260745286941528, + "learning_rate": 4.7154457545500554e-05, + "loss": 4.6875, + "step": 25786 + }, + { + "epoch": 0.15336259396707583, + "grad_norm": 1.8652924299240112, + "learning_rate": 4.715424111415031e-05, + "loss": 4.846, + "step": 25787 + }, + { + "epoch": 0.15336854125035684, + "grad_norm": 1.3980404138565063, + "learning_rate": 4.715402467506625e-05, + "loss": 5.2552, + "step": 25788 + }, + { + "epoch": 0.15337448853363783, + "grad_norm": 1.6307755708694458, + "learning_rate": 4.715380822824845e-05, + "loss": 5.1316, + "step": 25789 + }, + { + "epoch": 0.15338043581691882, + "grad_norm": 1.9057358503341675, + "learning_rate": 4.715359177369698e-05, + "loss": 4.6232, + "step": 25790 + }, + { + "epoch": 0.15338638310019984, + "grad_norm": 1.260809302330017, + "learning_rate": 4.715337531141193e-05, + "loss": 5.1614, + "step": 25791 + }, + { + "epoch": 0.15339233038348082, + "grad_norm": 2.7115111351013184, + "learning_rate": 4.7153158841393354e-05, + "loss": 3.6292, + "step": 25792 + }, + { + "epoch": 0.1533982776667618, + "grad_norm": 1.296697974205017, + "learning_rate": 4.715294236364135e-05, + "loss": 5.5909, + "step": 25793 + }, + { + "epoch": 0.15340422495004283, + "grad_norm": 1.466179370880127, + "learning_rate": 4.7152725878155975e-05, + "loss": 5.3005, + "step": 25794 + }, + { + "epoch": 0.15341017223332382, + "grad_norm": 1.5478910207748413, + "learning_rate": 4.715250938493732e-05, + "loss": 4.9116, + "step": 25795 + }, + { + "epoch": 0.1534161195166048, + "grad_norm": 1.371853232383728, + "learning_rate": 4.715229288398544e-05, + "loss": 5.2196, + "step": 25796 + }, + { + "epoch": 0.15342206679988582, + "grad_norm": 1.4444376230239868, + "learning_rate": 4.715207637530043e-05, + "loss": 4.9255, + "step": 25797 + }, + { + "epoch": 0.1534280140831668, + "grad_norm": 1.3257986307144165, + "learning_rate": 4.715185985888236e-05, + "loss": 4.9662, + "step": 25798 + }, + { + "epoch": 0.1534339613664478, + "grad_norm": 1.4831913709640503, + "learning_rate": 4.71516433347313e-05, + "loss": 4.9466, + "step": 25799 + }, + { + "epoch": 0.1534399086497288, + "grad_norm": 1.8146830797195435, + "learning_rate": 4.715142680284734e-05, + "loss": 4.711, + "step": 25800 + }, + { + "epoch": 0.1534458559330098, + "grad_norm": 1.73066246509552, + "learning_rate": 4.7151210263230536e-05, + "loss": 4.4107, + "step": 25801 + }, + { + "epoch": 0.1534518032162908, + "grad_norm": 2.014646291732788, + "learning_rate": 4.715099371588098e-05, + "loss": 4.6119, + "step": 25802 + }, + { + "epoch": 0.1534577504995718, + "grad_norm": 2.1739413738250732, + "learning_rate": 4.715077716079874e-05, + "loss": 4.4887, + "step": 25803 + }, + { + "epoch": 0.1534636977828528, + "grad_norm": 1.4722633361816406, + "learning_rate": 4.7150560597983895e-05, + "loss": 5.0312, + "step": 25804 + }, + { + "epoch": 0.15346964506613378, + "grad_norm": 1.654250144958496, + "learning_rate": 4.715034402743651e-05, + "loss": 4.8815, + "step": 25805 + }, + { + "epoch": 0.1534755923494148, + "grad_norm": 1.6598440408706665, + "learning_rate": 4.715012744915668e-05, + "loss": 4.3904, + "step": 25806 + }, + { + "epoch": 0.15348153963269578, + "grad_norm": 1.5754339694976807, + "learning_rate": 4.714991086314445e-05, + "loss": 4.4223, + "step": 25807 + }, + { + "epoch": 0.15348748691597677, + "grad_norm": 1.800657033920288, + "learning_rate": 4.714969426939994e-05, + "loss": 4.5314, + "step": 25808 + }, + { + "epoch": 0.1534934341992578, + "grad_norm": 1.8917250633239746, + "learning_rate": 4.714947766792318e-05, + "loss": 4.4049, + "step": 25809 + }, + { + "epoch": 0.15349938148253878, + "grad_norm": 1.9953207969665527, + "learning_rate": 4.714926105871428e-05, + "loss": 4.3155, + "step": 25810 + }, + { + "epoch": 0.15350532876581977, + "grad_norm": 1.7314120531082153, + "learning_rate": 4.714904444177329e-05, + "loss": 4.3324, + "step": 25811 + }, + { + "epoch": 0.15351127604910078, + "grad_norm": 1.577124834060669, + "learning_rate": 4.7148827817100306e-05, + "loss": 4.6899, + "step": 25812 + }, + { + "epoch": 0.15351722333238177, + "grad_norm": 1.6661646366119385, + "learning_rate": 4.714861118469539e-05, + "loss": 4.9735, + "step": 25813 + }, + { + "epoch": 0.15352317061566276, + "grad_norm": 1.8606276512145996, + "learning_rate": 4.714839454455863e-05, + "loss": 5.2351, + "step": 25814 + }, + { + "epoch": 0.15352911789894377, + "grad_norm": 2.0107643604278564, + "learning_rate": 4.7148177896690085e-05, + "loss": 4.4152, + "step": 25815 + }, + { + "epoch": 0.15353506518222476, + "grad_norm": 1.6447992324829102, + "learning_rate": 4.7147961241089846e-05, + "loss": 4.5391, + "step": 25816 + }, + { + "epoch": 0.15354101246550575, + "grad_norm": 1.6666457653045654, + "learning_rate": 4.714774457775798e-05, + "loss": 4.5104, + "step": 25817 + }, + { + "epoch": 0.15354695974878677, + "grad_norm": 1.7214492559432983, + "learning_rate": 4.714752790669457e-05, + "loss": 5.0634, + "step": 25818 + }, + { + "epoch": 0.15355290703206775, + "grad_norm": 1.5697379112243652, + "learning_rate": 4.714731122789968e-05, + "loss": 4.8279, + "step": 25819 + }, + { + "epoch": 0.15355885431534874, + "grad_norm": 2.531752109527588, + "learning_rate": 4.7147094541373395e-05, + "loss": 3.9172, + "step": 25820 + }, + { + "epoch": 0.15356480159862976, + "grad_norm": 1.5037142038345337, + "learning_rate": 4.714687784711579e-05, + "loss": 4.7534, + "step": 25821 + }, + { + "epoch": 0.15357074888191075, + "grad_norm": 1.5798907279968262, + "learning_rate": 4.714666114512693e-05, + "loss": 4.6779, + "step": 25822 + }, + { + "epoch": 0.15357669616519173, + "grad_norm": 1.5223065614700317, + "learning_rate": 4.714644443540691e-05, + "loss": 4.8612, + "step": 25823 + }, + { + "epoch": 0.15358264344847275, + "grad_norm": 1.7736209630966187, + "learning_rate": 4.714622771795579e-05, + "loss": 4.9765, + "step": 25824 + }, + { + "epoch": 0.15358859073175374, + "grad_norm": 1.5920718908309937, + "learning_rate": 4.714601099277365e-05, + "loss": 5.2479, + "step": 25825 + }, + { + "epoch": 0.15359453801503473, + "grad_norm": 1.7325233221054077, + "learning_rate": 4.7145794259860576e-05, + "loss": 4.9202, + "step": 25826 + }, + { + "epoch": 0.15360048529831574, + "grad_norm": 1.6514594554901123, + "learning_rate": 4.714557751921662e-05, + "loss": 4.9212, + "step": 25827 + }, + { + "epoch": 0.15360643258159673, + "grad_norm": 1.731692910194397, + "learning_rate": 4.714536077084188e-05, + "loss": 4.8916, + "step": 25828 + }, + { + "epoch": 0.15361237986487772, + "grad_norm": 1.7444603443145752, + "learning_rate": 4.714514401473642e-05, + "loss": 4.4659, + "step": 25829 + }, + { + "epoch": 0.15361832714815873, + "grad_norm": 1.7847130298614502, + "learning_rate": 4.714492725090033e-05, + "loss": 4.3516, + "step": 25830 + }, + { + "epoch": 0.15362427443143972, + "grad_norm": 1.6140960454940796, + "learning_rate": 4.714471047933366e-05, + "loss": 4.3894, + "step": 25831 + }, + { + "epoch": 0.1536302217147207, + "grad_norm": 1.5573277473449707, + "learning_rate": 4.714449370003651e-05, + "loss": 5.0749, + "step": 25832 + }, + { + "epoch": 0.15363616899800173, + "grad_norm": 1.7352724075317383, + "learning_rate": 4.7144276913008936e-05, + "loss": 4.6311, + "step": 25833 + }, + { + "epoch": 0.15364211628128271, + "grad_norm": 2.1136815547943115, + "learning_rate": 4.714406011825103e-05, + "loss": 3.9239, + "step": 25834 + }, + { + "epoch": 0.1536480635645637, + "grad_norm": 1.5329402685165405, + "learning_rate": 4.7143843315762856e-05, + "loss": 5.0124, + "step": 25835 + }, + { + "epoch": 0.1536540108478447, + "grad_norm": 1.6305334568023682, + "learning_rate": 4.7143626505544504e-05, + "loss": 5.3047, + "step": 25836 + }, + { + "epoch": 0.1536599581311257, + "grad_norm": 1.6582584381103516, + "learning_rate": 4.714340968759604e-05, + "loss": 4.909, + "step": 25837 + }, + { + "epoch": 0.1536659054144067, + "grad_norm": 1.581274151802063, + "learning_rate": 4.7143192861917536e-05, + "loss": 4.8241, + "step": 25838 + }, + { + "epoch": 0.15367185269768768, + "grad_norm": 1.6180393695831299, + "learning_rate": 4.7142976028509076e-05, + "loss": 4.6608, + "step": 25839 + }, + { + "epoch": 0.1536777999809687, + "grad_norm": 1.8333182334899902, + "learning_rate": 4.714275918737073e-05, + "loss": 5.3005, + "step": 25840 + }, + { + "epoch": 0.1536837472642497, + "grad_norm": 1.6652151346206665, + "learning_rate": 4.714254233850257e-05, + "loss": 4.5989, + "step": 25841 + }, + { + "epoch": 0.15368969454753068, + "grad_norm": 1.7609338760375977, + "learning_rate": 4.714232548190468e-05, + "loss": 5.2105, + "step": 25842 + }, + { + "epoch": 0.1536956418308117, + "grad_norm": 1.6076292991638184, + "learning_rate": 4.714210861757714e-05, + "loss": 5.32, + "step": 25843 + }, + { + "epoch": 0.15370158911409268, + "grad_norm": 1.6114000082015991, + "learning_rate": 4.7141891745520005e-05, + "loss": 5.1365, + "step": 25844 + }, + { + "epoch": 0.15370753639737367, + "grad_norm": 1.9237120151519775, + "learning_rate": 4.714167486573337e-05, + "loss": 4.8821, + "step": 25845 + }, + { + "epoch": 0.15371348368065468, + "grad_norm": 1.7089736461639404, + "learning_rate": 4.7141457978217315e-05, + "loss": 4.8468, + "step": 25846 + }, + { + "epoch": 0.15371943096393567, + "grad_norm": 1.6240943670272827, + "learning_rate": 4.71412410829719e-05, + "loss": 4.9153, + "step": 25847 + }, + { + "epoch": 0.15372537824721666, + "grad_norm": 1.4397730827331543, + "learning_rate": 4.7141024179997205e-05, + "loss": 5.0853, + "step": 25848 + }, + { + "epoch": 0.15373132553049768, + "grad_norm": 1.6480834484100342, + "learning_rate": 4.714080726929331e-05, + "loss": 4.6492, + "step": 25849 + }, + { + "epoch": 0.15373727281377866, + "grad_norm": 1.702221155166626, + "learning_rate": 4.714059035086028e-05, + "loss": 4.5677, + "step": 25850 + }, + { + "epoch": 0.15374322009705965, + "grad_norm": 1.5285601615905762, + "learning_rate": 4.7140373424698206e-05, + "loss": 4.621, + "step": 25851 + }, + { + "epoch": 0.15374916738034067, + "grad_norm": 2.0238354206085205, + "learning_rate": 4.7140156490807156e-05, + "loss": 4.6883, + "step": 25852 + }, + { + "epoch": 0.15375511466362166, + "grad_norm": 2.392547845840454, + "learning_rate": 4.713993954918721e-05, + "loss": 4.7537, + "step": 25853 + }, + { + "epoch": 0.15376106194690264, + "grad_norm": 2.639981746673584, + "learning_rate": 4.713972259983843e-05, + "loss": 3.958, + "step": 25854 + }, + { + "epoch": 0.15376700923018366, + "grad_norm": 2.11757755279541, + "learning_rate": 4.713950564276091e-05, + "loss": 5.0082, + "step": 25855 + }, + { + "epoch": 0.15377295651346465, + "grad_norm": 2.032003879547119, + "learning_rate": 4.713928867795471e-05, + "loss": 4.9212, + "step": 25856 + }, + { + "epoch": 0.15377890379674564, + "grad_norm": 1.7791013717651367, + "learning_rate": 4.713907170541991e-05, + "loss": 4.925, + "step": 25857 + }, + { + "epoch": 0.15378485108002665, + "grad_norm": 1.8376729488372803, + "learning_rate": 4.71388547251566e-05, + "loss": 5.1545, + "step": 25858 + }, + { + "epoch": 0.15379079836330764, + "grad_norm": 1.7532944679260254, + "learning_rate": 4.7138637737164836e-05, + "loss": 5.1329, + "step": 25859 + }, + { + "epoch": 0.15379674564658863, + "grad_norm": 2.4505176544189453, + "learning_rate": 4.7138420741444704e-05, + "loss": 4.8803, + "step": 25860 + }, + { + "epoch": 0.15380269292986964, + "grad_norm": 2.4481520652770996, + "learning_rate": 4.7138203737996283e-05, + "loss": 4.9071, + "step": 25861 + }, + { + "epoch": 0.15380864021315063, + "grad_norm": 1.805619716644287, + "learning_rate": 4.7137986726819636e-05, + "loss": 4.9145, + "step": 25862 + }, + { + "epoch": 0.15381458749643162, + "grad_norm": 1.353178858757019, + "learning_rate": 4.7137769707914856e-05, + "loss": 4.8159, + "step": 25863 + }, + { + "epoch": 0.15382053477971264, + "grad_norm": 2.1220030784606934, + "learning_rate": 4.7137552681282006e-05, + "loss": 4.7573, + "step": 25864 + }, + { + "epoch": 0.15382648206299362, + "grad_norm": 1.7052141427993774, + "learning_rate": 4.713733564692116e-05, + "loss": 5.0372, + "step": 25865 + }, + { + "epoch": 0.1538324293462746, + "grad_norm": 1.5306216478347778, + "learning_rate": 4.71371186048324e-05, + "loss": 5.0694, + "step": 25866 + }, + { + "epoch": 0.15383837662955563, + "grad_norm": 1.5422348976135254, + "learning_rate": 4.713690155501581e-05, + "loss": 5.1864, + "step": 25867 + }, + { + "epoch": 0.15384432391283662, + "grad_norm": 1.5703792572021484, + "learning_rate": 4.7136684497471444e-05, + "loss": 5.1686, + "step": 25868 + }, + { + "epoch": 0.1538502711961176, + "grad_norm": 1.6716407537460327, + "learning_rate": 4.7136467432199396e-05, + "loss": 5.2515, + "step": 25869 + }, + { + "epoch": 0.15385621847939862, + "grad_norm": 1.5796306133270264, + "learning_rate": 4.713625035919974e-05, + "loss": 5.0068, + "step": 25870 + }, + { + "epoch": 0.1538621657626796, + "grad_norm": 1.6445972919464111, + "learning_rate": 4.713603327847254e-05, + "loss": 4.9683, + "step": 25871 + }, + { + "epoch": 0.1538681130459606, + "grad_norm": 1.588665246963501, + "learning_rate": 4.713581619001788e-05, + "loss": 4.9913, + "step": 25872 + }, + { + "epoch": 0.1538740603292416, + "grad_norm": 1.5067355632781982, + "learning_rate": 4.713559909383584e-05, + "loss": 5.1648, + "step": 25873 + }, + { + "epoch": 0.1538800076125226, + "grad_norm": 1.6328977346420288, + "learning_rate": 4.713538198992649e-05, + "loss": 4.9316, + "step": 25874 + }, + { + "epoch": 0.1538859548958036, + "grad_norm": 1.6389905214309692, + "learning_rate": 4.7135164878289903e-05, + "loss": 5.1095, + "step": 25875 + }, + { + "epoch": 0.1538919021790846, + "grad_norm": 1.5004593133926392, + "learning_rate": 4.713494775892616e-05, + "loss": 4.8718, + "step": 25876 + }, + { + "epoch": 0.1538978494623656, + "grad_norm": 1.7928706407546997, + "learning_rate": 4.713473063183534e-05, + "loss": 5.1074, + "step": 25877 + }, + { + "epoch": 0.15390379674564658, + "grad_norm": 1.4132859706878662, + "learning_rate": 4.713451349701751e-05, + "loss": 5.2395, + "step": 25878 + }, + { + "epoch": 0.1539097440289276, + "grad_norm": 1.7291496992111206, + "learning_rate": 4.7134296354472754e-05, + "loss": 5.2648, + "step": 25879 + }, + { + "epoch": 0.15391569131220859, + "grad_norm": 1.6724679470062256, + "learning_rate": 4.713407920420114e-05, + "loss": 5.2074, + "step": 25880 + }, + { + "epoch": 0.15392163859548957, + "grad_norm": 1.5899326801300049, + "learning_rate": 4.713386204620275e-05, + "loss": 5.0018, + "step": 25881 + }, + { + "epoch": 0.1539275858787706, + "grad_norm": 1.5092980861663818, + "learning_rate": 4.7133644880477656e-05, + "loss": 5.2861, + "step": 25882 + }, + { + "epoch": 0.15393353316205158, + "grad_norm": 1.5518758296966553, + "learning_rate": 4.7133427707025935e-05, + "loss": 5.2302, + "step": 25883 + }, + { + "epoch": 0.15393948044533257, + "grad_norm": 1.8629082441329956, + "learning_rate": 4.713321052584766e-05, + "loss": 4.8252, + "step": 25884 + }, + { + "epoch": 0.15394542772861358, + "grad_norm": 1.618132472038269, + "learning_rate": 4.713299333694291e-05, + "loss": 5.0853, + "step": 25885 + }, + { + "epoch": 0.15395137501189457, + "grad_norm": 1.494831919670105, + "learning_rate": 4.713277614031177e-05, + "loss": 5.1517, + "step": 25886 + }, + { + "epoch": 0.15395732229517556, + "grad_norm": 1.6972736120224, + "learning_rate": 4.71325589359543e-05, + "loss": 5.3104, + "step": 25887 + }, + { + "epoch": 0.15396326957845657, + "grad_norm": 1.8251672983169556, + "learning_rate": 4.713234172387058e-05, + "loss": 5.0705, + "step": 25888 + }, + { + "epoch": 0.15396921686173756, + "grad_norm": 1.4835257530212402, + "learning_rate": 4.7132124504060696e-05, + "loss": 4.5481, + "step": 25889 + }, + { + "epoch": 0.15397516414501855, + "grad_norm": 1.447768211364746, + "learning_rate": 4.713190727652471e-05, + "loss": 4.7023, + "step": 25890 + }, + { + "epoch": 0.15398111142829957, + "grad_norm": 1.581663727760315, + "learning_rate": 4.71316900412627e-05, + "loss": 4.5446, + "step": 25891 + }, + { + "epoch": 0.15398705871158055, + "grad_norm": 1.5457055568695068, + "learning_rate": 4.7131472798274754e-05, + "loss": 4.8265, + "step": 25892 + }, + { + "epoch": 0.15399300599486154, + "grad_norm": 1.5043967962265015, + "learning_rate": 4.713125554756093e-05, + "loss": 5.2398, + "step": 25893 + }, + { + "epoch": 0.15399895327814253, + "grad_norm": 1.3700400590896606, + "learning_rate": 4.7131038289121324e-05, + "loss": 4.9516, + "step": 25894 + }, + { + "epoch": 0.15400490056142355, + "grad_norm": 1.4897541999816895, + "learning_rate": 4.713082102295599e-05, + "loss": 4.9884, + "step": 25895 + }, + { + "epoch": 0.15401084784470453, + "grad_norm": 1.560887098312378, + "learning_rate": 4.713060374906503e-05, + "loss": 4.8639, + "step": 25896 + }, + { + "epoch": 0.15401679512798552, + "grad_norm": 1.542069911956787, + "learning_rate": 4.7130386467448495e-05, + "loss": 4.7692, + "step": 25897 + }, + { + "epoch": 0.15402274241126654, + "grad_norm": 1.7924245595932007, + "learning_rate": 4.7130169178106465e-05, + "loss": 4.6172, + "step": 25898 + }, + { + "epoch": 0.15402868969454753, + "grad_norm": 1.4520066976547241, + "learning_rate": 4.7129951881039033e-05, + "loss": 4.9518, + "step": 25899 + }, + { + "epoch": 0.15403463697782852, + "grad_norm": 1.4653339385986328, + "learning_rate": 4.7129734576246255e-05, + "loss": 5.0738, + "step": 25900 + }, + { + "epoch": 0.15404058426110953, + "grad_norm": 1.2604494094848633, + "learning_rate": 4.7129517263728224e-05, + "loss": 5.0677, + "step": 25901 + }, + { + "epoch": 0.15404653154439052, + "grad_norm": 1.4956402778625488, + "learning_rate": 4.7129299943485e-05, + "loss": 5.0547, + "step": 25902 + }, + { + "epoch": 0.1540524788276715, + "grad_norm": 1.3395041227340698, + "learning_rate": 4.712908261551667e-05, + "loss": 4.9042, + "step": 25903 + }, + { + "epoch": 0.15405842611095252, + "grad_norm": 1.4592647552490234, + "learning_rate": 4.7128865279823304e-05, + "loss": 4.8363, + "step": 25904 + }, + { + "epoch": 0.1540643733942335, + "grad_norm": 1.339340329170227, + "learning_rate": 4.712864793640498e-05, + "loss": 4.8916, + "step": 25905 + }, + { + "epoch": 0.1540703206775145, + "grad_norm": 1.5001643896102905, + "learning_rate": 4.7128430585261775e-05, + "loss": 5.1015, + "step": 25906 + }, + { + "epoch": 0.15407626796079552, + "grad_norm": 1.3876299858093262, + "learning_rate": 4.7128213226393756e-05, + "loss": 5.0368, + "step": 25907 + }, + { + "epoch": 0.1540822152440765, + "grad_norm": 1.4904955625534058, + "learning_rate": 4.712799585980101e-05, + "loss": 5.0785, + "step": 25908 + }, + { + "epoch": 0.1540881625273575, + "grad_norm": 1.4284460544586182, + "learning_rate": 4.712777848548362e-05, + "loss": 5.0015, + "step": 25909 + }, + { + "epoch": 0.1540941098106385, + "grad_norm": 1.4823048114776611, + "learning_rate": 4.712756110344164e-05, + "loss": 4.9969, + "step": 25910 + }, + { + "epoch": 0.1541000570939195, + "grad_norm": 1.5989056825637817, + "learning_rate": 4.712734371367516e-05, + "loss": 5.4401, + "step": 25911 + }, + { + "epoch": 0.15410600437720048, + "grad_norm": 1.475415587425232, + "learning_rate": 4.7127126316184256e-05, + "loss": 5.3553, + "step": 25912 + }, + { + "epoch": 0.1541119516604815, + "grad_norm": 1.3556677103042603, + "learning_rate": 4.712690891096899e-05, + "loss": 5.4228, + "step": 25913 + }, + { + "epoch": 0.1541178989437625, + "grad_norm": 1.4386837482452393, + "learning_rate": 4.712669149802946e-05, + "loss": 5.387, + "step": 25914 + }, + { + "epoch": 0.15412384622704348, + "grad_norm": 1.4365500211715698, + "learning_rate": 4.712647407736573e-05, + "loss": 4.8597, + "step": 25915 + }, + { + "epoch": 0.1541297935103245, + "grad_norm": 1.5703059434890747, + "learning_rate": 4.712625664897788e-05, + "loss": 5.2659, + "step": 25916 + }, + { + "epoch": 0.15413574079360548, + "grad_norm": 1.5057390928268433, + "learning_rate": 4.712603921286597e-05, + "loss": 4.9931, + "step": 25917 + }, + { + "epoch": 0.15414168807688647, + "grad_norm": 1.2982683181762695, + "learning_rate": 4.712582176903009e-05, + "loss": 5.5226, + "step": 25918 + }, + { + "epoch": 0.15414763536016748, + "grad_norm": 1.4120944738388062, + "learning_rate": 4.712560431747032e-05, + "loss": 5.4037, + "step": 25919 + }, + { + "epoch": 0.15415358264344847, + "grad_norm": 1.3634661436080933, + "learning_rate": 4.712538685818673e-05, + "loss": 5.521, + "step": 25920 + }, + { + "epoch": 0.15415952992672946, + "grad_norm": 1.3352160453796387, + "learning_rate": 4.7125169391179394e-05, + "loss": 5.2938, + "step": 25921 + }, + { + "epoch": 0.15416547721001048, + "grad_norm": 1.3874114751815796, + "learning_rate": 4.712495191644839e-05, + "loss": 5.272, + "step": 25922 + }, + { + "epoch": 0.15417142449329146, + "grad_norm": 1.5225552320480347, + "learning_rate": 4.712473443399379e-05, + "loss": 5.3211, + "step": 25923 + }, + { + "epoch": 0.15417737177657245, + "grad_norm": 1.4493452310562134, + "learning_rate": 4.712451694381568e-05, + "loss": 5.2799, + "step": 25924 + }, + { + "epoch": 0.15418331905985347, + "grad_norm": 1.3240947723388672, + "learning_rate": 4.712429944591413e-05, + "loss": 5.441, + "step": 25925 + }, + { + "epoch": 0.15418926634313446, + "grad_norm": 1.2881836891174316, + "learning_rate": 4.712408194028921e-05, + "loss": 5.4478, + "step": 25926 + }, + { + "epoch": 0.15419521362641544, + "grad_norm": 1.4163159132003784, + "learning_rate": 4.712386442694101e-05, + "loss": 5.252, + "step": 25927 + }, + { + "epoch": 0.15420116090969646, + "grad_norm": 1.4597609043121338, + "learning_rate": 4.712364690586959e-05, + "loss": 5.4359, + "step": 25928 + }, + { + "epoch": 0.15420710819297745, + "grad_norm": 1.31305992603302, + "learning_rate": 4.7123429377075036e-05, + "loss": 5.3141, + "step": 25929 + }, + { + "epoch": 0.15421305547625844, + "grad_norm": 1.1765657663345337, + "learning_rate": 4.712321184055742e-05, + "loss": 5.1828, + "step": 25930 + }, + { + "epoch": 0.15421900275953945, + "grad_norm": 1.3116487264633179, + "learning_rate": 4.7122994296316824e-05, + "loss": 5.4107, + "step": 25931 + }, + { + "epoch": 0.15422495004282044, + "grad_norm": 1.3636351823806763, + "learning_rate": 4.712277674435331e-05, + "loss": 5.3273, + "step": 25932 + }, + { + "epoch": 0.15423089732610143, + "grad_norm": 1.4326391220092773, + "learning_rate": 4.712255918466697e-05, + "loss": 5.4123, + "step": 25933 + }, + { + "epoch": 0.15423684460938245, + "grad_norm": 1.3996350765228271, + "learning_rate": 4.712234161725788e-05, + "loss": 5.3111, + "step": 25934 + }, + { + "epoch": 0.15424279189266343, + "grad_norm": 1.5358290672302246, + "learning_rate": 4.712212404212609e-05, + "loss": 5.4522, + "step": 25935 + }, + { + "epoch": 0.15424873917594442, + "grad_norm": 1.3900970220565796, + "learning_rate": 4.7121906459271716e-05, + "loss": 5.6671, + "step": 25936 + }, + { + "epoch": 0.15425468645922544, + "grad_norm": 1.5113252401351929, + "learning_rate": 4.71216888686948e-05, + "loss": 5.0736, + "step": 25937 + }, + { + "epoch": 0.15426063374250643, + "grad_norm": 1.434477686882019, + "learning_rate": 4.7121471270395434e-05, + "loss": 5.259, + "step": 25938 + }, + { + "epoch": 0.1542665810257874, + "grad_norm": 1.4467335939407349, + "learning_rate": 4.712125366437369e-05, + "loss": 5.3382, + "step": 25939 + }, + { + "epoch": 0.15427252830906843, + "grad_norm": 1.6080671548843384, + "learning_rate": 4.712103605062965e-05, + "loss": 5.1767, + "step": 25940 + }, + { + "epoch": 0.15427847559234942, + "grad_norm": 1.497689962387085, + "learning_rate": 4.712081842916338e-05, + "loss": 4.884, + "step": 25941 + }, + { + "epoch": 0.1542844228756304, + "grad_norm": 1.691441535949707, + "learning_rate": 4.712060079997496e-05, + "loss": 5.2065, + "step": 25942 + }, + { + "epoch": 0.15429037015891142, + "grad_norm": 1.4759876728057861, + "learning_rate": 4.712038316306447e-05, + "loss": 5.17, + "step": 25943 + }, + { + "epoch": 0.1542963174421924, + "grad_norm": 1.4109833240509033, + "learning_rate": 4.712016551843198e-05, + "loss": 5.1986, + "step": 25944 + }, + { + "epoch": 0.1543022647254734, + "grad_norm": 1.4481924772262573, + "learning_rate": 4.7119947866077566e-05, + "loss": 4.9301, + "step": 25945 + }, + { + "epoch": 0.15430821200875441, + "grad_norm": 1.4721769094467163, + "learning_rate": 4.711973020600131e-05, + "loss": 5.123, + "step": 25946 + }, + { + "epoch": 0.1543141592920354, + "grad_norm": 1.6822638511657715, + "learning_rate": 4.711951253820329e-05, + "loss": 5.122, + "step": 25947 + }, + { + "epoch": 0.1543201065753164, + "grad_norm": 1.6047651767730713, + "learning_rate": 4.711929486268357e-05, + "loss": 5.1417, + "step": 25948 + }, + { + "epoch": 0.1543260538585974, + "grad_norm": 1.4773536920547485, + "learning_rate": 4.711907717944224e-05, + "loss": 4.9562, + "step": 25949 + }, + { + "epoch": 0.1543320011418784, + "grad_norm": 1.4373167753219604, + "learning_rate": 4.711885948847936e-05, + "loss": 5.3515, + "step": 25950 + }, + { + "epoch": 0.15433794842515938, + "grad_norm": 1.4517033100128174, + "learning_rate": 4.711864178979501e-05, + "loss": 5.0668, + "step": 25951 + }, + { + "epoch": 0.15434389570844037, + "grad_norm": 1.7582489252090454, + "learning_rate": 4.711842408338929e-05, + "loss": 4.7104, + "step": 25952 + }, + { + "epoch": 0.1543498429917214, + "grad_norm": 1.6162217855453491, + "learning_rate": 4.711820636926224e-05, + "loss": 4.7747, + "step": 25953 + }, + { + "epoch": 0.15435579027500237, + "grad_norm": 1.7326339483261108, + "learning_rate": 4.711798864741396e-05, + "loss": 4.818, + "step": 25954 + }, + { + "epoch": 0.15436173755828336, + "grad_norm": 1.642146110534668, + "learning_rate": 4.711777091784452e-05, + "loss": 4.7517, + "step": 25955 + }, + { + "epoch": 0.15436768484156438, + "grad_norm": 1.5122802257537842, + "learning_rate": 4.711755318055399e-05, + "loss": 5.0139, + "step": 25956 + }, + { + "epoch": 0.15437363212484537, + "grad_norm": 1.7299772500991821, + "learning_rate": 4.711733543554245e-05, + "loss": 4.9988, + "step": 25957 + }, + { + "epoch": 0.15437957940812636, + "grad_norm": 1.5812711715698242, + "learning_rate": 4.711711768280998e-05, + "loss": 4.7134, + "step": 25958 + }, + { + "epoch": 0.15438552669140737, + "grad_norm": 1.5953545570373535, + "learning_rate": 4.711689992235665e-05, + "loss": 4.9644, + "step": 25959 + }, + { + "epoch": 0.15439147397468836, + "grad_norm": 1.7964719533920288, + "learning_rate": 4.711668215418255e-05, + "loss": 4.8476, + "step": 25960 + }, + { + "epoch": 0.15439742125796935, + "grad_norm": 1.6458512544631958, + "learning_rate": 4.711646437828773e-05, + "loss": 4.8117, + "step": 25961 + }, + { + "epoch": 0.15440336854125036, + "grad_norm": 1.4821311235427856, + "learning_rate": 4.711624659467229e-05, + "loss": 4.8647, + "step": 25962 + }, + { + "epoch": 0.15440931582453135, + "grad_norm": 1.4640769958496094, + "learning_rate": 4.711602880333629e-05, + "loss": 5.0038, + "step": 25963 + }, + { + "epoch": 0.15441526310781234, + "grad_norm": 1.7705153226852417, + "learning_rate": 4.711581100427981e-05, + "loss": 5.12, + "step": 25964 + }, + { + "epoch": 0.15442121039109336, + "grad_norm": 1.7333801984786987, + "learning_rate": 4.711559319750294e-05, + "loss": 4.9785, + "step": 25965 + }, + { + "epoch": 0.15442715767437434, + "grad_norm": 1.6170109510421753, + "learning_rate": 4.711537538300574e-05, + "loss": 4.9764, + "step": 25966 + }, + { + "epoch": 0.15443310495765533, + "grad_norm": 1.4895650148391724, + "learning_rate": 4.7115157560788295e-05, + "loss": 4.5585, + "step": 25967 + }, + { + "epoch": 0.15443905224093635, + "grad_norm": 1.6678147315979004, + "learning_rate": 4.711493973085067e-05, + "loss": 4.7897, + "step": 25968 + }, + { + "epoch": 0.15444499952421734, + "grad_norm": 1.537511944770813, + "learning_rate": 4.7114721893192945e-05, + "loss": 4.8845, + "step": 25969 + }, + { + "epoch": 0.15445094680749832, + "grad_norm": 1.7167041301727295, + "learning_rate": 4.711450404781521e-05, + "loss": 4.9126, + "step": 25970 + }, + { + "epoch": 0.15445689409077934, + "grad_norm": 1.763170599937439, + "learning_rate": 4.711428619471752e-05, + "loss": 4.6864, + "step": 25971 + }, + { + "epoch": 0.15446284137406033, + "grad_norm": 1.4620569944381714, + "learning_rate": 4.7114068333899964e-05, + "loss": 4.744, + "step": 25972 + }, + { + "epoch": 0.15446878865734132, + "grad_norm": 1.6106908321380615, + "learning_rate": 4.711385046536262e-05, + "loss": 5.2037, + "step": 25973 + }, + { + "epoch": 0.15447473594062233, + "grad_norm": 2.173444986343384, + "learning_rate": 4.711363258910556e-05, + "loss": 4.8086, + "step": 25974 + }, + { + "epoch": 0.15448068322390332, + "grad_norm": 2.0350496768951416, + "learning_rate": 4.711341470512885e-05, + "loss": 4.7291, + "step": 25975 + }, + { + "epoch": 0.1544866305071843, + "grad_norm": 1.9148650169372559, + "learning_rate": 4.7113196813432584e-05, + "loss": 4.7627, + "step": 25976 + }, + { + "epoch": 0.15449257779046532, + "grad_norm": 1.9944121837615967, + "learning_rate": 4.711297891401683e-05, + "loss": 4.8124, + "step": 25977 + }, + { + "epoch": 0.1544985250737463, + "grad_norm": 1.515162706375122, + "learning_rate": 4.7112761006881655e-05, + "loss": 4.8781, + "step": 25978 + }, + { + "epoch": 0.1545044723570273, + "grad_norm": 1.7549412250518799, + "learning_rate": 4.711254309202715e-05, + "loss": 4.9173, + "step": 25979 + }, + { + "epoch": 0.15451041964030832, + "grad_norm": 1.5914033651351929, + "learning_rate": 4.711232516945338e-05, + "loss": 5.012, + "step": 25980 + }, + { + "epoch": 0.1545163669235893, + "grad_norm": 1.7436847686767578, + "learning_rate": 4.711210723916043e-05, + "loss": 4.4552, + "step": 25981 + }, + { + "epoch": 0.1545223142068703, + "grad_norm": 1.5679067373275757, + "learning_rate": 4.711188930114837e-05, + "loss": 4.9158, + "step": 25982 + }, + { + "epoch": 0.1545282614901513, + "grad_norm": 1.5164258480072021, + "learning_rate": 4.711167135541727e-05, + "loss": 4.2524, + "step": 25983 + }, + { + "epoch": 0.1545342087734323, + "grad_norm": 1.7215555906295776, + "learning_rate": 4.711145340196723e-05, + "loss": 4.4035, + "step": 25984 + }, + { + "epoch": 0.15454015605671328, + "grad_norm": 1.8671064376831055, + "learning_rate": 4.7111235440798303e-05, + "loss": 4.6875, + "step": 25985 + }, + { + "epoch": 0.1545461033399943, + "grad_norm": 1.760772705078125, + "learning_rate": 4.7111017471910566e-05, + "loss": 4.7645, + "step": 25986 + }, + { + "epoch": 0.1545520506232753, + "grad_norm": 1.8126411437988281, + "learning_rate": 4.7110799495304115e-05, + "loss": 5.1524, + "step": 25987 + }, + { + "epoch": 0.15455799790655628, + "grad_norm": 1.6593974828720093, + "learning_rate": 4.7110581510979e-05, + "loss": 5.1902, + "step": 25988 + }, + { + "epoch": 0.1545639451898373, + "grad_norm": 1.721921443939209, + "learning_rate": 4.711036351893532e-05, + "loss": 5.0316, + "step": 25989 + }, + { + "epoch": 0.15456989247311828, + "grad_norm": 2.030829668045044, + "learning_rate": 4.7110145519173135e-05, + "loss": 4.9087, + "step": 25990 + }, + { + "epoch": 0.15457583975639927, + "grad_norm": 1.6568117141723633, + "learning_rate": 4.710992751169252e-05, + "loss": 4.3814, + "step": 25991 + }, + { + "epoch": 0.15458178703968029, + "grad_norm": 1.667718768119812, + "learning_rate": 4.7109709496493565e-05, + "loss": 4.8191, + "step": 25992 + }, + { + "epoch": 0.15458773432296127, + "grad_norm": 1.6483817100524902, + "learning_rate": 4.710949147357634e-05, + "loss": 5.055, + "step": 25993 + }, + { + "epoch": 0.15459368160624226, + "grad_norm": 1.703580617904663, + "learning_rate": 4.710927344294092e-05, + "loss": 5.0259, + "step": 25994 + }, + { + "epoch": 0.15459962888952328, + "grad_norm": 1.512531042098999, + "learning_rate": 4.710905540458737e-05, + "loss": 5.1221, + "step": 25995 + }, + { + "epoch": 0.15460557617280427, + "grad_norm": 1.4010028839111328, + "learning_rate": 4.710883735851579e-05, + "loss": 5.2263, + "step": 25996 + }, + { + "epoch": 0.15461152345608525, + "grad_norm": 1.694629192352295, + "learning_rate": 4.710861930472624e-05, + "loss": 4.9348, + "step": 25997 + }, + { + "epoch": 0.15461747073936627, + "grad_norm": 1.5974243879318237, + "learning_rate": 4.710840124321879e-05, + "loss": 5.1262, + "step": 25998 + }, + { + "epoch": 0.15462341802264726, + "grad_norm": 1.6333894729614258, + "learning_rate": 4.7108183173993535e-05, + "loss": 4.6557, + "step": 25999 + }, + { + "epoch": 0.15462936530592825, + "grad_norm": 1.660767674446106, + "learning_rate": 4.710796509705054e-05, + "loss": 4.9764, + "step": 26000 + }, + { + "epoch": 0.15463531258920926, + "grad_norm": 1.5514689683914185, + "learning_rate": 4.710774701238989e-05, + "loss": 4.8895, + "step": 26001 + }, + { + "epoch": 0.15464125987249025, + "grad_norm": 1.7753626108169556, + "learning_rate": 4.7107528920011645e-05, + "loss": 5.1251, + "step": 26002 + }, + { + "epoch": 0.15464720715577124, + "grad_norm": 1.5963994264602661, + "learning_rate": 4.7107310819915895e-05, + "loss": 4.9678, + "step": 26003 + }, + { + "epoch": 0.15465315443905225, + "grad_norm": 1.7098819017410278, + "learning_rate": 4.7107092712102706e-05, + "loss": 4.7313, + "step": 26004 + }, + { + "epoch": 0.15465910172233324, + "grad_norm": 1.7636046409606934, + "learning_rate": 4.710687459657216e-05, + "loss": 4.7752, + "step": 26005 + }, + { + "epoch": 0.15466504900561423, + "grad_norm": 1.5514246225357056, + "learning_rate": 4.7106656473324336e-05, + "loss": 4.6835, + "step": 26006 + }, + { + "epoch": 0.15467099628889525, + "grad_norm": 1.6040410995483398, + "learning_rate": 4.7106438342359303e-05, + "loss": 4.8096, + "step": 26007 + }, + { + "epoch": 0.15467694357217623, + "grad_norm": 1.622213363647461, + "learning_rate": 4.7106220203677144e-05, + "loss": 5.0896, + "step": 26008 + }, + { + "epoch": 0.15468289085545722, + "grad_norm": 1.6227675676345825, + "learning_rate": 4.710600205727793e-05, + "loss": 5.0895, + "step": 26009 + }, + { + "epoch": 0.1546888381387382, + "grad_norm": 1.6498078107833862, + "learning_rate": 4.710578390316174e-05, + "loss": 4.8625, + "step": 26010 + }, + { + "epoch": 0.15469478542201923, + "grad_norm": 1.6175272464752197, + "learning_rate": 4.710556574132865e-05, + "loss": 4.9729, + "step": 26011 + }, + { + "epoch": 0.15470073270530021, + "grad_norm": 1.5892902612686157, + "learning_rate": 4.7105347571778735e-05, + "loss": 4.755, + "step": 26012 + }, + { + "epoch": 0.1547066799885812, + "grad_norm": 1.4750880002975464, + "learning_rate": 4.710512939451207e-05, + "loss": 4.7497, + "step": 26013 + }, + { + "epoch": 0.15471262727186222, + "grad_norm": 1.5363775491714478, + "learning_rate": 4.710491120952874e-05, + "loss": 5.1039, + "step": 26014 + }, + { + "epoch": 0.1547185745551432, + "grad_norm": 1.5225108861923218, + "learning_rate": 4.71046930168288e-05, + "loss": 4.782, + "step": 26015 + }, + { + "epoch": 0.1547245218384242, + "grad_norm": 1.6348788738250732, + "learning_rate": 4.7104474816412345e-05, + "loss": 4.9252, + "step": 26016 + }, + { + "epoch": 0.1547304691217052, + "grad_norm": 1.6000639200210571, + "learning_rate": 4.7104256608279454e-05, + "loss": 4.9286, + "step": 26017 + }, + { + "epoch": 0.1547364164049862, + "grad_norm": 1.4785354137420654, + "learning_rate": 4.710403839243018e-05, + "loss": 4.7383, + "step": 26018 + }, + { + "epoch": 0.1547423636882672, + "grad_norm": 1.548176884651184, + "learning_rate": 4.710382016886463e-05, + "loss": 4.7526, + "step": 26019 + }, + { + "epoch": 0.1547483109715482, + "grad_norm": 1.537049651145935, + "learning_rate": 4.710360193758287e-05, + "loss": 4.6532, + "step": 26020 + }, + { + "epoch": 0.1547542582548292, + "grad_norm": 1.4506211280822754, + "learning_rate": 4.710338369858495e-05, + "loss": 5.1028, + "step": 26021 + }, + { + "epoch": 0.15476020553811018, + "grad_norm": 1.4539066553115845, + "learning_rate": 4.710316545187098e-05, + "loss": 5.0396, + "step": 26022 + }, + { + "epoch": 0.1547661528213912, + "grad_norm": 1.408674716949463, + "learning_rate": 4.7102947197441016e-05, + "loss": 5.2779, + "step": 26023 + }, + { + "epoch": 0.15477210010467218, + "grad_norm": 1.5732898712158203, + "learning_rate": 4.710272893529515e-05, + "loss": 5.1519, + "step": 26024 + }, + { + "epoch": 0.15477804738795317, + "grad_norm": 1.5260519981384277, + "learning_rate": 4.710251066543344e-05, + "loss": 5.056, + "step": 26025 + }, + { + "epoch": 0.1547839946712342, + "grad_norm": 1.4518004655838013, + "learning_rate": 4.710229238785598e-05, + "loss": 4.9322, + "step": 26026 + }, + { + "epoch": 0.15478994195451518, + "grad_norm": 1.6032034158706665, + "learning_rate": 4.7102074102562835e-05, + "loss": 5.0368, + "step": 26027 + }, + { + "epoch": 0.15479588923779616, + "grad_norm": 1.6396820545196533, + "learning_rate": 4.7101855809554085e-05, + "loss": 4.4808, + "step": 26028 + }, + { + "epoch": 0.15480183652107718, + "grad_norm": 1.6207085847854614, + "learning_rate": 4.710163750882981e-05, + "loss": 4.5206, + "step": 26029 + }, + { + "epoch": 0.15480778380435817, + "grad_norm": 1.5769189596176147, + "learning_rate": 4.7101419200390073e-05, + "loss": 4.4192, + "step": 26030 + }, + { + "epoch": 0.15481373108763916, + "grad_norm": 1.4689233303070068, + "learning_rate": 4.710120088423496e-05, + "loss": 4.8726, + "step": 26031 + }, + { + "epoch": 0.15481967837092017, + "grad_norm": 1.3557206392288208, + "learning_rate": 4.710098256036455e-05, + "loss": 5.1076, + "step": 26032 + }, + { + "epoch": 0.15482562565420116, + "grad_norm": 1.561497688293457, + "learning_rate": 4.710076422877891e-05, + "loss": 4.6845, + "step": 26033 + }, + { + "epoch": 0.15483157293748215, + "grad_norm": 1.6871447563171387, + "learning_rate": 4.710054588947813e-05, + "loss": 4.8231, + "step": 26034 + }, + { + "epoch": 0.15483752022076316, + "grad_norm": 1.7153793573379517, + "learning_rate": 4.710032754246228e-05, + "loss": 4.767, + "step": 26035 + }, + { + "epoch": 0.15484346750404415, + "grad_norm": 1.6859761476516724, + "learning_rate": 4.710010918773142e-05, + "loss": 4.6774, + "step": 26036 + }, + { + "epoch": 0.15484941478732514, + "grad_norm": 1.4598466157913208, + "learning_rate": 4.709989082528565e-05, + "loss": 4.8141, + "step": 26037 + }, + { + "epoch": 0.15485536207060616, + "grad_norm": 1.572952389717102, + "learning_rate": 4.709967245512504e-05, + "loss": 5.0215, + "step": 26038 + }, + { + "epoch": 0.15486130935388714, + "grad_norm": 1.6656177043914795, + "learning_rate": 4.7099454077249655e-05, + "loss": 4.5755, + "step": 26039 + }, + { + "epoch": 0.15486725663716813, + "grad_norm": 1.4872766733169556, + "learning_rate": 4.709923569165958e-05, + "loss": 4.9086, + "step": 26040 + }, + { + "epoch": 0.15487320392044915, + "grad_norm": 1.603215217590332, + "learning_rate": 4.70990172983549e-05, + "loss": 4.8528, + "step": 26041 + }, + { + "epoch": 0.15487915120373014, + "grad_norm": 1.5077006816864014, + "learning_rate": 4.7098798897335664e-05, + "loss": 4.8544, + "step": 26042 + }, + { + "epoch": 0.15488509848701112, + "grad_norm": 1.515825629234314, + "learning_rate": 4.709858048860197e-05, + "loss": 4.7793, + "step": 26043 + }, + { + "epoch": 0.15489104577029214, + "grad_norm": 1.472776174545288, + "learning_rate": 4.7098362072153904e-05, + "loss": 4.8047, + "step": 26044 + }, + { + "epoch": 0.15489699305357313, + "grad_norm": 1.5982736349105835, + "learning_rate": 4.709814364799151e-05, + "loss": 4.9911, + "step": 26045 + }, + { + "epoch": 0.15490294033685412, + "grad_norm": 1.3136348724365234, + "learning_rate": 4.709792521611489e-05, + "loss": 5.3009, + "step": 26046 + }, + { + "epoch": 0.15490888762013513, + "grad_norm": 1.6178503036499023, + "learning_rate": 4.709770677652412e-05, + "loss": 4.7873, + "step": 26047 + }, + { + "epoch": 0.15491483490341612, + "grad_norm": 1.544202446937561, + "learning_rate": 4.709748832921926e-05, + "loss": 4.645, + "step": 26048 + }, + { + "epoch": 0.1549207821866971, + "grad_norm": 1.359904408454895, + "learning_rate": 4.70972698742004e-05, + "loss": 5.0246, + "step": 26049 + }, + { + "epoch": 0.15492672946997812, + "grad_norm": 1.4320893287658691, + "learning_rate": 4.7097051411467606e-05, + "loss": 5.0227, + "step": 26050 + }, + { + "epoch": 0.1549326767532591, + "grad_norm": 1.7229030132293701, + "learning_rate": 4.7096832941020963e-05, + "loss": 5.2792, + "step": 26051 + }, + { + "epoch": 0.1549386240365401, + "grad_norm": 1.672554850578308, + "learning_rate": 4.709661446286054e-05, + "loss": 4.9227, + "step": 26052 + }, + { + "epoch": 0.15494457131982112, + "grad_norm": 1.5159001350402832, + "learning_rate": 4.709639597698642e-05, + "loss": 4.7464, + "step": 26053 + }, + { + "epoch": 0.1549505186031021, + "grad_norm": 1.5735573768615723, + "learning_rate": 4.7096177483398676e-05, + "loss": 5.2281, + "step": 26054 + }, + { + "epoch": 0.1549564658863831, + "grad_norm": 1.4174078702926636, + "learning_rate": 4.709595898209739e-05, + "loss": 5.138, + "step": 26055 + }, + { + "epoch": 0.1549624131696641, + "grad_norm": 1.3748446702957153, + "learning_rate": 4.7095740473082626e-05, + "loss": 5.2084, + "step": 26056 + }, + { + "epoch": 0.1549683604529451, + "grad_norm": 1.5169907808303833, + "learning_rate": 4.709552195635447e-05, + "loss": 5.3272, + "step": 26057 + }, + { + "epoch": 0.15497430773622609, + "grad_norm": 1.6235400438308716, + "learning_rate": 4.7095303431912994e-05, + "loss": 5.2201, + "step": 26058 + }, + { + "epoch": 0.1549802550195071, + "grad_norm": 1.571418046951294, + "learning_rate": 4.709508489975828e-05, + "loss": 5.3584, + "step": 26059 + }, + { + "epoch": 0.1549862023027881, + "grad_norm": 1.690524697303772, + "learning_rate": 4.70948663598904e-05, + "loss": 5.3091, + "step": 26060 + }, + { + "epoch": 0.15499214958606908, + "grad_norm": 1.6778768301010132, + "learning_rate": 4.7094647812309424e-05, + "loss": 4.8765, + "step": 26061 + }, + { + "epoch": 0.1549980968693501, + "grad_norm": 1.6365214586257935, + "learning_rate": 4.709442925701544e-05, + "loss": 5.4826, + "step": 26062 + }, + { + "epoch": 0.15500404415263108, + "grad_norm": 1.4799535274505615, + "learning_rate": 4.709421069400851e-05, + "loss": 5.5668, + "step": 26063 + }, + { + "epoch": 0.15500999143591207, + "grad_norm": 1.5750006437301636, + "learning_rate": 4.7093992123288734e-05, + "loss": 5.235, + "step": 26064 + }, + { + "epoch": 0.15501593871919309, + "grad_norm": 1.8067607879638672, + "learning_rate": 4.7093773544856165e-05, + "loss": 5.2708, + "step": 26065 + }, + { + "epoch": 0.15502188600247407, + "grad_norm": 1.4780645370483398, + "learning_rate": 4.709355495871088e-05, + "loss": 5.1626, + "step": 26066 + }, + { + "epoch": 0.15502783328575506, + "grad_norm": 1.5702919960021973, + "learning_rate": 4.709333636485298e-05, + "loss": 5.2306, + "step": 26067 + }, + { + "epoch": 0.15503378056903608, + "grad_norm": 1.7658028602600098, + "learning_rate": 4.7093117763282515e-05, + "loss": 4.9352, + "step": 26068 + }, + { + "epoch": 0.15503972785231707, + "grad_norm": 1.69098961353302, + "learning_rate": 4.709289915399957e-05, + "loss": 4.7679, + "step": 26069 + }, + { + "epoch": 0.15504567513559805, + "grad_norm": 1.704026460647583, + "learning_rate": 4.709268053700423e-05, + "loss": 4.6209, + "step": 26070 + }, + { + "epoch": 0.15505162241887904, + "grad_norm": 1.4715653657913208, + "learning_rate": 4.709246191229656e-05, + "loss": 5.1664, + "step": 26071 + }, + { + "epoch": 0.15505756970216006, + "grad_norm": 1.5663673877716064, + "learning_rate": 4.7092243279876634e-05, + "loss": 5.3833, + "step": 26072 + }, + { + "epoch": 0.15506351698544105, + "grad_norm": 1.4647293090820312, + "learning_rate": 4.709202463974454e-05, + "loss": 5.2766, + "step": 26073 + }, + { + "epoch": 0.15506946426872203, + "grad_norm": 1.5950292348861694, + "learning_rate": 4.7091805991900344e-05, + "loss": 5.2686, + "step": 26074 + }, + { + "epoch": 0.15507541155200305, + "grad_norm": 1.593206524848938, + "learning_rate": 4.709158733634413e-05, + "loss": 4.9969, + "step": 26075 + }, + { + "epoch": 0.15508135883528404, + "grad_norm": 1.5884050130844116, + "learning_rate": 4.7091368673075975e-05, + "loss": 4.9804, + "step": 26076 + }, + { + "epoch": 0.15508730611856503, + "grad_norm": 1.5333365201950073, + "learning_rate": 4.709115000209594e-05, + "loss": 4.6808, + "step": 26077 + }, + { + "epoch": 0.15509325340184604, + "grad_norm": 1.4642858505249023, + "learning_rate": 4.7090931323404116e-05, + "loss": 4.6828, + "step": 26078 + }, + { + "epoch": 0.15509920068512703, + "grad_norm": 2.0302491188049316, + "learning_rate": 4.709071263700059e-05, + "loss": 4.5523, + "step": 26079 + }, + { + "epoch": 0.15510514796840802, + "grad_norm": 1.6798481941223145, + "learning_rate": 4.709049394288541e-05, + "loss": 5.1286, + "step": 26080 + }, + { + "epoch": 0.15511109525168904, + "grad_norm": 1.5074591636657715, + "learning_rate": 4.7090275241058676e-05, + "loss": 5.3037, + "step": 26081 + }, + { + "epoch": 0.15511704253497002, + "grad_norm": 1.7001566886901855, + "learning_rate": 4.709005653152044e-05, + "loss": 5.0217, + "step": 26082 + }, + { + "epoch": 0.155122989818251, + "grad_norm": 1.84412682056427, + "learning_rate": 4.708983781427081e-05, + "loss": 4.5579, + "step": 26083 + }, + { + "epoch": 0.15512893710153203, + "grad_norm": 1.770264744758606, + "learning_rate": 4.708961908930984e-05, + "loss": 4.7394, + "step": 26084 + }, + { + "epoch": 0.15513488438481302, + "grad_norm": 1.7658874988555908, + "learning_rate": 4.7089400356637615e-05, + "loss": 4.9278, + "step": 26085 + }, + { + "epoch": 0.155140831668094, + "grad_norm": 1.5701930522918701, + "learning_rate": 4.7089181616254204e-05, + "loss": 4.7227, + "step": 26086 + }, + { + "epoch": 0.15514677895137502, + "grad_norm": 1.5790002346038818, + "learning_rate": 4.708896286815969e-05, + "loss": 4.9207, + "step": 26087 + }, + { + "epoch": 0.155152726234656, + "grad_norm": 1.8411163091659546, + "learning_rate": 4.7088744112354146e-05, + "loss": 3.8647, + "step": 26088 + }, + { + "epoch": 0.155158673517937, + "grad_norm": 1.813536524772644, + "learning_rate": 4.708852534883765e-05, + "loss": 4.1148, + "step": 26089 + }, + { + "epoch": 0.155164620801218, + "grad_norm": 1.6122519969940186, + "learning_rate": 4.708830657761028e-05, + "loss": 4.9749, + "step": 26090 + }, + { + "epoch": 0.155170568084499, + "grad_norm": 1.9105713367462158, + "learning_rate": 4.70880877986721e-05, + "loss": 4.9895, + "step": 26091 + }, + { + "epoch": 0.15517651536778, + "grad_norm": 1.849824070930481, + "learning_rate": 4.7087869012023215e-05, + "loss": 5.5382, + "step": 26092 + }, + { + "epoch": 0.155182462651061, + "grad_norm": 2.346090793609619, + "learning_rate": 4.708765021766367e-05, + "loss": 5.6398, + "step": 26093 + }, + { + "epoch": 0.155188409934342, + "grad_norm": 1.8905435800552368, + "learning_rate": 4.7087431415593555e-05, + "loss": 5.6089, + "step": 26094 + }, + { + "epoch": 0.15519435721762298, + "grad_norm": 1.6987192630767822, + "learning_rate": 4.7087212605812944e-05, + "loss": 5.4127, + "step": 26095 + }, + { + "epoch": 0.155200304500904, + "grad_norm": 1.7915600538253784, + "learning_rate": 4.708699378832193e-05, + "loss": 4.9027, + "step": 26096 + }, + { + "epoch": 0.15520625178418498, + "grad_norm": 1.5736148357391357, + "learning_rate": 4.708677496312056e-05, + "loss": 5.1403, + "step": 26097 + }, + { + "epoch": 0.15521219906746597, + "grad_norm": 1.6473568677902222, + "learning_rate": 4.708655613020893e-05, + "loss": 5.0299, + "step": 26098 + }, + { + "epoch": 0.155218146350747, + "grad_norm": 1.733720064163208, + "learning_rate": 4.708633728958711e-05, + "loss": 5.0153, + "step": 26099 + }, + { + "epoch": 0.15522409363402798, + "grad_norm": 1.842244267463684, + "learning_rate": 4.708611844125518e-05, + "loss": 4.7, + "step": 26100 + }, + { + "epoch": 0.15523004091730896, + "grad_norm": 1.8227342367172241, + "learning_rate": 4.708589958521321e-05, + "loss": 4.4889, + "step": 26101 + }, + { + "epoch": 0.15523598820058998, + "grad_norm": 1.7300339937210083, + "learning_rate": 4.708568072146129e-05, + "loss": 5.0326, + "step": 26102 + }, + { + "epoch": 0.15524193548387097, + "grad_norm": 2.0854434967041016, + "learning_rate": 4.708546184999948e-05, + "loss": 5.6966, + "step": 26103 + }, + { + "epoch": 0.15524788276715196, + "grad_norm": 1.5393275022506714, + "learning_rate": 4.708524297082786e-05, + "loss": 5.5777, + "step": 26104 + }, + { + "epoch": 0.15525383005043297, + "grad_norm": 1.7765403985977173, + "learning_rate": 4.7085024083946514e-05, + "loss": 5.7488, + "step": 26105 + }, + { + "epoch": 0.15525977733371396, + "grad_norm": 1.668286919593811, + "learning_rate": 4.708480518935552e-05, + "loss": 5.3823, + "step": 26106 + }, + { + "epoch": 0.15526572461699495, + "grad_norm": 1.7656164169311523, + "learning_rate": 4.708458628705494e-05, + "loss": 5.1098, + "step": 26107 + }, + { + "epoch": 0.15527167190027596, + "grad_norm": 1.6078004837036133, + "learning_rate": 4.708436737704486e-05, + "loss": 4.8957, + "step": 26108 + }, + { + "epoch": 0.15527761918355695, + "grad_norm": 1.5649595260620117, + "learning_rate": 4.7084148459325364e-05, + "loss": 5.4546, + "step": 26109 + }, + { + "epoch": 0.15528356646683794, + "grad_norm": 1.7555382251739502, + "learning_rate": 4.7083929533896506e-05, + "loss": 5.6428, + "step": 26110 + }, + { + "epoch": 0.15528951375011896, + "grad_norm": 1.7282280921936035, + "learning_rate": 4.708371060075839e-05, + "loss": 5.4197, + "step": 26111 + }, + { + "epoch": 0.15529546103339995, + "grad_norm": 1.8044626712799072, + "learning_rate": 4.708349165991107e-05, + "loss": 5.4676, + "step": 26112 + }, + { + "epoch": 0.15530140831668093, + "grad_norm": 1.6488827466964722, + "learning_rate": 4.7083272711354634e-05, + "loss": 5.2725, + "step": 26113 + }, + { + "epoch": 0.15530735559996195, + "grad_norm": 1.9291478395462036, + "learning_rate": 4.7083053755089155e-05, + "loss": 5.2565, + "step": 26114 + }, + { + "epoch": 0.15531330288324294, + "grad_norm": 1.9248192310333252, + "learning_rate": 4.708283479111471e-05, + "loss": 5.2514, + "step": 26115 + }, + { + "epoch": 0.15531925016652393, + "grad_norm": 1.9327218532562256, + "learning_rate": 4.708261581943137e-05, + "loss": 5.0833, + "step": 26116 + }, + { + "epoch": 0.15532519744980494, + "grad_norm": 1.952842354774475, + "learning_rate": 4.708239684003923e-05, + "loss": 5.0989, + "step": 26117 + }, + { + "epoch": 0.15533114473308593, + "grad_norm": 1.7923991680145264, + "learning_rate": 4.7082177852938344e-05, + "loss": 4.8204, + "step": 26118 + }, + { + "epoch": 0.15533709201636692, + "grad_norm": 1.761819839477539, + "learning_rate": 4.708195885812881e-05, + "loss": 5.1966, + "step": 26119 + }, + { + "epoch": 0.15534303929964793, + "grad_norm": 2.061192035675049, + "learning_rate": 4.7081739855610674e-05, + "loss": 4.7254, + "step": 26120 + }, + { + "epoch": 0.15534898658292892, + "grad_norm": 1.7219372987747192, + "learning_rate": 4.708152084538404e-05, + "loss": 5.008, + "step": 26121 + }, + { + "epoch": 0.1553549338662099, + "grad_norm": 1.836690068244934, + "learning_rate": 4.708130182744898e-05, + "loss": 4.8645, + "step": 26122 + }, + { + "epoch": 0.15536088114949093, + "grad_norm": 1.6488652229309082, + "learning_rate": 4.708108280180556e-05, + "loss": 5.1588, + "step": 26123 + }, + { + "epoch": 0.15536682843277191, + "grad_norm": 1.7643523216247559, + "learning_rate": 4.708086376845386e-05, + "loss": 4.9774, + "step": 26124 + }, + { + "epoch": 0.1553727757160529, + "grad_norm": 1.7396107912063599, + "learning_rate": 4.7080644727393967e-05, + "loss": 5.1542, + "step": 26125 + }, + { + "epoch": 0.15537872299933392, + "grad_norm": 1.723271131515503, + "learning_rate": 4.708042567862594e-05, + "loss": 4.5029, + "step": 26126 + }, + { + "epoch": 0.1553846702826149, + "grad_norm": 1.7824338674545288, + "learning_rate": 4.708020662214987e-05, + "loss": 4.8107, + "step": 26127 + }, + { + "epoch": 0.1553906175658959, + "grad_norm": 1.6587624549865723, + "learning_rate": 4.707998755796582e-05, + "loss": 5.0076, + "step": 26128 + }, + { + "epoch": 0.15539656484917688, + "grad_norm": 1.6058495044708252, + "learning_rate": 4.7079768486073884e-05, + "loss": 4.8512, + "step": 26129 + }, + { + "epoch": 0.1554025121324579, + "grad_norm": 1.6286768913269043, + "learning_rate": 4.707954940647412e-05, + "loss": 5.0587, + "step": 26130 + }, + { + "epoch": 0.1554084594157389, + "grad_norm": 1.5808156728744507, + "learning_rate": 4.707933031916662e-05, + "loss": 5.0254, + "step": 26131 + }, + { + "epoch": 0.15541440669901987, + "grad_norm": 1.7283897399902344, + "learning_rate": 4.707911122415145e-05, + "loss": 5.1255, + "step": 26132 + }, + { + "epoch": 0.1554203539823009, + "grad_norm": 1.9916651248931885, + "learning_rate": 4.70788921214287e-05, + "loss": 4.9384, + "step": 26133 + }, + { + "epoch": 0.15542630126558188, + "grad_norm": 1.5505808591842651, + "learning_rate": 4.7078673010998425e-05, + "loss": 5.0284, + "step": 26134 + }, + { + "epoch": 0.15543224854886287, + "grad_norm": 1.8529605865478516, + "learning_rate": 4.707845389286072e-05, + "loss": 5.1745, + "step": 26135 + }, + { + "epoch": 0.15543819583214388, + "grad_norm": 1.5921772718429565, + "learning_rate": 4.707823476701565e-05, + "loss": 5.1941, + "step": 26136 + }, + { + "epoch": 0.15544414311542487, + "grad_norm": 1.676703691482544, + "learning_rate": 4.70780156334633e-05, + "loss": 4.9678, + "step": 26137 + }, + { + "epoch": 0.15545009039870586, + "grad_norm": 1.5701407194137573, + "learning_rate": 4.707779649220374e-05, + "loss": 4.8332, + "step": 26138 + }, + { + "epoch": 0.15545603768198687, + "grad_norm": 1.4418753385543823, + "learning_rate": 4.707757734323706e-05, + "loss": 4.9294, + "step": 26139 + }, + { + "epoch": 0.15546198496526786, + "grad_norm": 1.4596991539001465, + "learning_rate": 4.707735818656331e-05, + "loss": 4.874, + "step": 26140 + }, + { + "epoch": 0.15546793224854885, + "grad_norm": 1.475049376487732, + "learning_rate": 4.707713902218259e-05, + "loss": 5.0269, + "step": 26141 + }, + { + "epoch": 0.15547387953182987, + "grad_norm": 1.4616882801055908, + "learning_rate": 4.7076919850094966e-05, + "loss": 5.0152, + "step": 26142 + }, + { + "epoch": 0.15547982681511086, + "grad_norm": 1.5477145910263062, + "learning_rate": 4.707670067030052e-05, + "loss": 4.9596, + "step": 26143 + }, + { + "epoch": 0.15548577409839184, + "grad_norm": 1.6296616792678833, + "learning_rate": 4.707648148279933e-05, + "loss": 4.7555, + "step": 26144 + }, + { + "epoch": 0.15549172138167286, + "grad_norm": 2.044677257537842, + "learning_rate": 4.707626228759147e-05, + "loss": 4.2117, + "step": 26145 + }, + { + "epoch": 0.15549766866495385, + "grad_norm": 1.8100709915161133, + "learning_rate": 4.7076043084677e-05, + "loss": 4.5057, + "step": 26146 + }, + { + "epoch": 0.15550361594823484, + "grad_norm": 1.698901653289795, + "learning_rate": 4.7075823874056026e-05, + "loss": 4.6707, + "step": 26147 + }, + { + "epoch": 0.15550956323151585, + "grad_norm": 1.5637656450271606, + "learning_rate": 4.70756046557286e-05, + "loss": 4.871, + "step": 26148 + }, + { + "epoch": 0.15551551051479684, + "grad_norm": 1.5465519428253174, + "learning_rate": 4.707538542969481e-05, + "loss": 4.6844, + "step": 26149 + }, + { + "epoch": 0.15552145779807783, + "grad_norm": 1.6268285512924194, + "learning_rate": 4.7075166195954736e-05, + "loss": 5.046, + "step": 26150 + }, + { + "epoch": 0.15552740508135884, + "grad_norm": 1.6071034669876099, + "learning_rate": 4.707494695450845e-05, + "loss": 4.9576, + "step": 26151 + }, + { + "epoch": 0.15553335236463983, + "grad_norm": 1.4627524614334106, + "learning_rate": 4.707472770535603e-05, + "loss": 5.0786, + "step": 26152 + }, + { + "epoch": 0.15553929964792082, + "grad_norm": 1.7464107275009155, + "learning_rate": 4.707450844849754e-05, + "loss": 5.0383, + "step": 26153 + }, + { + "epoch": 0.15554524693120184, + "grad_norm": 1.7528932094573975, + "learning_rate": 4.7074289183933077e-05, + "loss": 4.7332, + "step": 26154 + }, + { + "epoch": 0.15555119421448282, + "grad_norm": 1.9061720371246338, + "learning_rate": 4.70740699116627e-05, + "loss": 4.5108, + "step": 26155 + }, + { + "epoch": 0.1555571414977638, + "grad_norm": 1.6121511459350586, + "learning_rate": 4.70738506316865e-05, + "loss": 4.9586, + "step": 26156 + }, + { + "epoch": 0.15556308878104483, + "grad_norm": 1.622747778892517, + "learning_rate": 4.707363134400454e-05, + "loss": 5.0985, + "step": 26157 + }, + { + "epoch": 0.15556903606432582, + "grad_norm": 1.4669454097747803, + "learning_rate": 4.707341204861691e-05, + "loss": 4.9397, + "step": 26158 + }, + { + "epoch": 0.1555749833476068, + "grad_norm": 1.4583669900894165, + "learning_rate": 4.707319274552368e-05, + "loss": 5.0822, + "step": 26159 + }, + { + "epoch": 0.15558093063088782, + "grad_norm": 1.9358830451965332, + "learning_rate": 4.707297343472492e-05, + "loss": 4.9557, + "step": 26160 + }, + { + "epoch": 0.1555868779141688, + "grad_norm": 1.7523856163024902, + "learning_rate": 4.707275411622072e-05, + "loss": 4.5959, + "step": 26161 + }, + { + "epoch": 0.1555928251974498, + "grad_norm": 1.7858316898345947, + "learning_rate": 4.707253479001114e-05, + "loss": 5.1765, + "step": 26162 + }, + { + "epoch": 0.1555987724807308, + "grad_norm": 1.7400814294815063, + "learning_rate": 4.707231545609627e-05, + "loss": 5.4312, + "step": 26163 + }, + { + "epoch": 0.1556047197640118, + "grad_norm": 1.6235188245773315, + "learning_rate": 4.7072096114476186e-05, + "loss": 5.1745, + "step": 26164 + }, + { + "epoch": 0.1556106670472928, + "grad_norm": 1.6003834009170532, + "learning_rate": 4.7071876765150963e-05, + "loss": 4.9194, + "step": 26165 + }, + { + "epoch": 0.1556166143305738, + "grad_norm": 1.7427910566329956, + "learning_rate": 4.7071657408120675e-05, + "loss": 5.1942, + "step": 26166 + }, + { + "epoch": 0.1556225616138548, + "grad_norm": 1.5763969421386719, + "learning_rate": 4.7071438043385395e-05, + "loss": 4.9424, + "step": 26167 + }, + { + "epoch": 0.15562850889713578, + "grad_norm": 1.6284310817718506, + "learning_rate": 4.7071218670945206e-05, + "loss": 5.4415, + "step": 26168 + }, + { + "epoch": 0.1556344561804168, + "grad_norm": 1.3858957290649414, + "learning_rate": 4.707099929080019e-05, + "loss": 5.6362, + "step": 26169 + }, + { + "epoch": 0.15564040346369779, + "grad_norm": 1.4326859712600708, + "learning_rate": 4.70707799029504e-05, + "loss": 5.2872, + "step": 26170 + }, + { + "epoch": 0.15564635074697877, + "grad_norm": 1.6624369621276855, + "learning_rate": 4.7070560507395944e-05, + "loss": 5.1741, + "step": 26171 + }, + { + "epoch": 0.1556522980302598, + "grad_norm": 2.4475722312927246, + "learning_rate": 4.707034110413688e-05, + "loss": 4.8206, + "step": 26172 + }, + { + "epoch": 0.15565824531354078, + "grad_norm": 2.2583391666412354, + "learning_rate": 4.707012169317329e-05, + "loss": 4.6716, + "step": 26173 + }, + { + "epoch": 0.15566419259682177, + "grad_norm": 2.161346197128296, + "learning_rate": 4.706990227450524e-05, + "loss": 4.5228, + "step": 26174 + }, + { + "epoch": 0.15567013988010278, + "grad_norm": 1.550593614578247, + "learning_rate": 4.7069682848132815e-05, + "loss": 5.1581, + "step": 26175 + }, + { + "epoch": 0.15567608716338377, + "grad_norm": 1.524939775466919, + "learning_rate": 4.70694634140561e-05, + "loss": 5.6605, + "step": 26176 + }, + { + "epoch": 0.15568203444666476, + "grad_norm": 2.134462833404541, + "learning_rate": 4.7069243972275155e-05, + "loss": 4.9063, + "step": 26177 + }, + { + "epoch": 0.15568798172994577, + "grad_norm": 2.2610831260681152, + "learning_rate": 4.7069024522790075e-05, + "loss": 4.4764, + "step": 26178 + }, + { + "epoch": 0.15569392901322676, + "grad_norm": 2.4277896881103516, + "learning_rate": 4.706880506560092e-05, + "loss": 4.7747, + "step": 26179 + }, + { + "epoch": 0.15569987629650775, + "grad_norm": 2.5465261936187744, + "learning_rate": 4.706858560070777e-05, + "loss": 4.7831, + "step": 26180 + }, + { + "epoch": 0.15570582357978877, + "grad_norm": 2.4795758724212646, + "learning_rate": 4.706836612811071e-05, + "loss": 4.6256, + "step": 26181 + }, + { + "epoch": 0.15571177086306975, + "grad_norm": 2.624998092651367, + "learning_rate": 4.7068146647809805e-05, + "loss": 4.5916, + "step": 26182 + }, + { + "epoch": 0.15571771814635074, + "grad_norm": 2.1440951824188232, + "learning_rate": 4.706792715980515e-05, + "loss": 4.5955, + "step": 26183 + }, + { + "epoch": 0.15572366542963176, + "grad_norm": 2.386084794998169, + "learning_rate": 4.70677076640968e-05, + "loss": 3.9781, + "step": 26184 + }, + { + "epoch": 0.15572961271291275, + "grad_norm": 2.271477699279785, + "learning_rate": 4.7067488160684844e-05, + "loss": 4.3557, + "step": 26185 + }, + { + "epoch": 0.15573555999619373, + "grad_norm": 2.227630853652954, + "learning_rate": 4.706726864956935e-05, + "loss": 4.117, + "step": 26186 + }, + { + "epoch": 0.15574150727947472, + "grad_norm": 2.1777312755584717, + "learning_rate": 4.7067049130750414e-05, + "loss": 4.4695, + "step": 26187 + }, + { + "epoch": 0.15574745456275574, + "grad_norm": 2.131826162338257, + "learning_rate": 4.7066829604228094e-05, + "loss": 4.185, + "step": 26188 + }, + { + "epoch": 0.15575340184603673, + "grad_norm": 1.9766490459442139, + "learning_rate": 4.706661007000246e-05, + "loss": 5.6452, + "step": 26189 + }, + { + "epoch": 0.15575934912931771, + "grad_norm": 2.088787078857422, + "learning_rate": 4.706639052807361e-05, + "loss": 4.6965, + "step": 26190 + }, + { + "epoch": 0.15576529641259873, + "grad_norm": 2.012974262237549, + "learning_rate": 4.7066170978441616e-05, + "loss": 4.4508, + "step": 26191 + }, + { + "epoch": 0.15577124369587972, + "grad_norm": 2.473616123199463, + "learning_rate": 4.706595142110654e-05, + "loss": 4.4842, + "step": 26192 + }, + { + "epoch": 0.1557771909791607, + "grad_norm": 2.5314011573791504, + "learning_rate": 4.7065731856068475e-05, + "loss": 4.5175, + "step": 26193 + }, + { + "epoch": 0.15578313826244172, + "grad_norm": 2.0637693405151367, + "learning_rate": 4.7065512283327484e-05, + "loss": 4.8803, + "step": 26194 + }, + { + "epoch": 0.1557890855457227, + "grad_norm": 2.659450054168701, + "learning_rate": 4.706529270288366e-05, + "loss": 4.7659, + "step": 26195 + }, + { + "epoch": 0.1557950328290037, + "grad_norm": 1.741438865661621, + "learning_rate": 4.706507311473707e-05, + "loss": 5.5987, + "step": 26196 + }, + { + "epoch": 0.15580098011228471, + "grad_norm": 1.621771216392517, + "learning_rate": 4.706485351888778e-05, + "loss": 5.477, + "step": 26197 + }, + { + "epoch": 0.1558069273955657, + "grad_norm": 1.8086066246032715, + "learning_rate": 4.706463391533589e-05, + "loss": 5.4196, + "step": 26198 + }, + { + "epoch": 0.1558128746788467, + "grad_norm": 1.4268287420272827, + "learning_rate": 4.706441430408145e-05, + "loss": 5.8321, + "step": 26199 + }, + { + "epoch": 0.1558188219621277, + "grad_norm": 1.5565332174301147, + "learning_rate": 4.7064194685124564e-05, + "loss": 5.5548, + "step": 26200 + }, + { + "epoch": 0.1558247692454087, + "grad_norm": 1.7371162176132202, + "learning_rate": 4.706397505846529e-05, + "loss": 5.4536, + "step": 26201 + }, + { + "epoch": 0.15583071652868968, + "grad_norm": 1.6265679597854614, + "learning_rate": 4.706375542410371e-05, + "loss": 4.7589, + "step": 26202 + }, + { + "epoch": 0.1558366638119707, + "grad_norm": 1.5395931005477905, + "learning_rate": 4.70635357820399e-05, + "loss": 5.2809, + "step": 26203 + }, + { + "epoch": 0.1558426110952517, + "grad_norm": 1.5577752590179443, + "learning_rate": 4.7063316132273937e-05, + "loss": 5.2526, + "step": 26204 + }, + { + "epoch": 0.15584855837853268, + "grad_norm": 1.3954623937606812, + "learning_rate": 4.706309647480591e-05, + "loss": 5.3674, + "step": 26205 + }, + { + "epoch": 0.1558545056618137, + "grad_norm": 1.7251001596450806, + "learning_rate": 4.706287680963587e-05, + "loss": 5.2069, + "step": 26206 + }, + { + "epoch": 0.15586045294509468, + "grad_norm": 1.8611587285995483, + "learning_rate": 4.706265713676391e-05, + "loss": 5.2805, + "step": 26207 + }, + { + "epoch": 0.15586640022837567, + "grad_norm": 1.5871427059173584, + "learning_rate": 4.706243745619011e-05, + "loss": 5.2921, + "step": 26208 + }, + { + "epoch": 0.15587234751165668, + "grad_norm": 1.6353893280029297, + "learning_rate": 4.706221776791454e-05, + "loss": 5.3425, + "step": 26209 + }, + { + "epoch": 0.15587829479493767, + "grad_norm": 1.6304540634155273, + "learning_rate": 4.7061998071937274e-05, + "loss": 5.3577, + "step": 26210 + }, + { + "epoch": 0.15588424207821866, + "grad_norm": 1.6434270143508911, + "learning_rate": 4.706177836825839e-05, + "loss": 5.4573, + "step": 26211 + }, + { + "epoch": 0.15589018936149968, + "grad_norm": 1.6281068325042725, + "learning_rate": 4.7061558656877976e-05, + "loss": 4.8948, + "step": 26212 + }, + { + "epoch": 0.15589613664478066, + "grad_norm": 1.7287936210632324, + "learning_rate": 4.70613389377961e-05, + "loss": 5.2005, + "step": 26213 + }, + { + "epoch": 0.15590208392806165, + "grad_norm": 1.8355118036270142, + "learning_rate": 4.706111921101283e-05, + "loss": 5.456, + "step": 26214 + }, + { + "epoch": 0.15590803121134267, + "grad_norm": 1.5891990661621094, + "learning_rate": 4.7060899476528253e-05, + "loss": 5.1405, + "step": 26215 + }, + { + "epoch": 0.15591397849462366, + "grad_norm": 1.5852643251419067, + "learning_rate": 4.706067973434244e-05, + "loss": 5.5963, + "step": 26216 + }, + { + "epoch": 0.15591992577790464, + "grad_norm": 2.340528726577759, + "learning_rate": 4.706045998445548e-05, + "loss": 4.6047, + "step": 26217 + }, + { + "epoch": 0.15592587306118566, + "grad_norm": 1.872802495956421, + "learning_rate": 4.706024022686744e-05, + "loss": 4.7129, + "step": 26218 + }, + { + "epoch": 0.15593182034446665, + "grad_norm": 1.6725971698760986, + "learning_rate": 4.706002046157839e-05, + "loss": 5.2416, + "step": 26219 + }, + { + "epoch": 0.15593776762774764, + "grad_norm": 1.6346997022628784, + "learning_rate": 4.705980068858843e-05, + "loss": 5.0625, + "step": 26220 + }, + { + "epoch": 0.15594371491102865, + "grad_norm": 1.8969260454177856, + "learning_rate": 4.705958090789761e-05, + "loss": 4.6915, + "step": 26221 + }, + { + "epoch": 0.15594966219430964, + "grad_norm": 1.6025121212005615, + "learning_rate": 4.705936111950602e-05, + "loss": 4.9978, + "step": 26222 + }, + { + "epoch": 0.15595560947759063, + "grad_norm": 1.406001329421997, + "learning_rate": 4.705914132341374e-05, + "loss": 5.7913, + "step": 26223 + }, + { + "epoch": 0.15596155676087164, + "grad_norm": 2.1708552837371826, + "learning_rate": 4.7058921519620834e-05, + "loss": 5.1468, + "step": 26224 + }, + { + "epoch": 0.15596750404415263, + "grad_norm": 2.216993808746338, + "learning_rate": 4.705870170812739e-05, + "loss": 5.1279, + "step": 26225 + }, + { + "epoch": 0.15597345132743362, + "grad_norm": 1.7173157930374146, + "learning_rate": 4.705848188893348e-05, + "loss": 5.1289, + "step": 26226 + }, + { + "epoch": 0.15597939861071464, + "grad_norm": 1.6096726655960083, + "learning_rate": 4.705826206203918e-05, + "loss": 5.5078, + "step": 26227 + }, + { + "epoch": 0.15598534589399562, + "grad_norm": 1.8224303722381592, + "learning_rate": 4.705804222744458e-05, + "loss": 5.4791, + "step": 26228 + }, + { + "epoch": 0.1559912931772766, + "grad_norm": 1.722948431968689, + "learning_rate": 4.705782238514973e-05, + "loss": 5.1473, + "step": 26229 + }, + { + "epoch": 0.15599724046055763, + "grad_norm": 1.7583675384521484, + "learning_rate": 4.705760253515473e-05, + "loss": 5.5127, + "step": 26230 + }, + { + "epoch": 0.15600318774383862, + "grad_norm": 1.5635607242584229, + "learning_rate": 4.705738267745965e-05, + "loss": 5.417, + "step": 26231 + }, + { + "epoch": 0.1560091350271196, + "grad_norm": 1.570145606994629, + "learning_rate": 4.705716281206456e-05, + "loss": 5.266, + "step": 26232 + }, + { + "epoch": 0.15601508231040062, + "grad_norm": 1.6425197124481201, + "learning_rate": 4.705694293896955e-05, + "loss": 4.7162, + "step": 26233 + }, + { + "epoch": 0.1560210295936816, + "grad_norm": 1.6312974691390991, + "learning_rate": 4.705672305817468e-05, + "loss": 4.8861, + "step": 26234 + }, + { + "epoch": 0.1560269768769626, + "grad_norm": 1.6320679187774658, + "learning_rate": 4.7056503169680046e-05, + "loss": 5.2133, + "step": 26235 + }, + { + "epoch": 0.1560329241602436, + "grad_norm": 1.6294546127319336, + "learning_rate": 4.705628327348571e-05, + "loss": 5.7012, + "step": 26236 + }, + { + "epoch": 0.1560388714435246, + "grad_norm": 1.472088098526001, + "learning_rate": 4.705606336959175e-05, + "loss": 5.404, + "step": 26237 + }, + { + "epoch": 0.1560448187268056, + "grad_norm": 1.5214602947235107, + "learning_rate": 4.705584345799825e-05, + "loss": 5.3916, + "step": 26238 + }, + { + "epoch": 0.1560507660100866, + "grad_norm": 1.45046067237854, + "learning_rate": 4.705562353870528e-05, + "loss": 5.2275, + "step": 26239 + }, + { + "epoch": 0.1560567132933676, + "grad_norm": 1.5730977058410645, + "learning_rate": 4.705540361171292e-05, + "loss": 5.4597, + "step": 26240 + }, + { + "epoch": 0.15606266057664858, + "grad_norm": 1.6403652429580688, + "learning_rate": 4.7055183677021254e-05, + "loss": 4.7476, + "step": 26241 + }, + { + "epoch": 0.1560686078599296, + "grad_norm": 2.0256097316741943, + "learning_rate": 4.705496373463034e-05, + "loss": 4.7275, + "step": 26242 + }, + { + "epoch": 0.15607455514321059, + "grad_norm": 2.1107068061828613, + "learning_rate": 4.7054743784540265e-05, + "loss": 4.7459, + "step": 26243 + }, + { + "epoch": 0.15608050242649157, + "grad_norm": 1.4644510746002197, + "learning_rate": 4.705452382675112e-05, + "loss": 5.3951, + "step": 26244 + }, + { + "epoch": 0.15608644970977256, + "grad_norm": 1.4154125452041626, + "learning_rate": 4.705430386126296e-05, + "loss": 5.5351, + "step": 26245 + }, + { + "epoch": 0.15609239699305358, + "grad_norm": 1.4124795198440552, + "learning_rate": 4.7054083888075875e-05, + "loss": 5.3797, + "step": 26246 + }, + { + "epoch": 0.15609834427633457, + "grad_norm": 1.6197364330291748, + "learning_rate": 4.705386390718993e-05, + "loss": 5.3903, + "step": 26247 + }, + { + "epoch": 0.15610429155961555, + "grad_norm": 1.5693352222442627, + "learning_rate": 4.7053643918605216e-05, + "loss": 5.4997, + "step": 26248 + }, + { + "epoch": 0.15611023884289657, + "grad_norm": 1.4047479629516602, + "learning_rate": 4.70534239223218e-05, + "loss": 5.0258, + "step": 26249 + }, + { + "epoch": 0.15611618612617756, + "grad_norm": 1.7006193399429321, + "learning_rate": 4.705320391833976e-05, + "loss": 4.9798, + "step": 26250 + }, + { + "epoch": 0.15612213340945855, + "grad_norm": 1.7294094562530518, + "learning_rate": 4.705298390665917e-05, + "loss": 5.5811, + "step": 26251 + }, + { + "epoch": 0.15612808069273956, + "grad_norm": 1.4665381908416748, + "learning_rate": 4.705276388728013e-05, + "loss": 5.5117, + "step": 26252 + }, + { + "epoch": 0.15613402797602055, + "grad_norm": 1.4549496173858643, + "learning_rate": 4.705254386020268e-05, + "loss": 5.6141, + "step": 26253 + }, + { + "epoch": 0.15613997525930154, + "grad_norm": 1.4019516706466675, + "learning_rate": 4.705232382542691e-05, + "loss": 5.6525, + "step": 26254 + }, + { + "epoch": 0.15614592254258255, + "grad_norm": 1.3660154342651367, + "learning_rate": 4.705210378295292e-05, + "loss": 5.4377, + "step": 26255 + }, + { + "epoch": 0.15615186982586354, + "grad_norm": 1.5590531826019287, + "learning_rate": 4.7051883732780755e-05, + "loss": 5.5679, + "step": 26256 + }, + { + "epoch": 0.15615781710914453, + "grad_norm": 2.126138687133789, + "learning_rate": 4.7051663674910514e-05, + "loss": 4.8662, + "step": 26257 + }, + { + "epoch": 0.15616376439242555, + "grad_norm": 1.5536115169525146, + "learning_rate": 4.705144360934226e-05, + "loss": 4.97, + "step": 26258 + }, + { + "epoch": 0.15616971167570654, + "grad_norm": 2.0653862953186035, + "learning_rate": 4.705122353607607e-05, + "loss": 4.8683, + "step": 26259 + }, + { + "epoch": 0.15617565895898752, + "grad_norm": 1.872904658317566, + "learning_rate": 4.705100345511204e-05, + "loss": 4.8923, + "step": 26260 + }, + { + "epoch": 0.15618160624226854, + "grad_norm": 2.112368583679199, + "learning_rate": 4.7050783366450224e-05, + "loss": 4.7857, + "step": 26261 + }, + { + "epoch": 0.15618755352554953, + "grad_norm": 1.4000160694122314, + "learning_rate": 4.7050563270090704e-05, + "loss": 5.2055, + "step": 26262 + }, + { + "epoch": 0.15619350080883052, + "grad_norm": 1.4316319227218628, + "learning_rate": 4.705034316603356e-05, + "loss": 5.5257, + "step": 26263 + }, + { + "epoch": 0.15619944809211153, + "grad_norm": 1.4394290447235107, + "learning_rate": 4.705012305427887e-05, + "loss": 5.2702, + "step": 26264 + }, + { + "epoch": 0.15620539537539252, + "grad_norm": 2.0612921714782715, + "learning_rate": 4.704990293482672e-05, + "loss": 4.964, + "step": 26265 + }, + { + "epoch": 0.1562113426586735, + "grad_norm": 1.7573301792144775, + "learning_rate": 4.704968280767716e-05, + "loss": 5.1509, + "step": 26266 + }, + { + "epoch": 0.15621728994195452, + "grad_norm": 1.546891450881958, + "learning_rate": 4.70494626728303e-05, + "loss": 5.3226, + "step": 26267 + }, + { + "epoch": 0.1562232372252355, + "grad_norm": 1.672478437423706, + "learning_rate": 4.7049242530286195e-05, + "loss": 4.998, + "step": 26268 + }, + { + "epoch": 0.1562291845085165, + "grad_norm": 1.943877100944519, + "learning_rate": 4.704902238004492e-05, + "loss": 4.6489, + "step": 26269 + }, + { + "epoch": 0.15623513179179752, + "grad_norm": 2.779040813446045, + "learning_rate": 4.704880222210657e-05, + "loss": 3.8466, + "step": 26270 + }, + { + "epoch": 0.1562410790750785, + "grad_norm": 2.8241045475006104, + "learning_rate": 4.7048582056471205e-05, + "loss": 4.026, + "step": 26271 + }, + { + "epoch": 0.1562470263583595, + "grad_norm": 1.6769524812698364, + "learning_rate": 4.70483618831389e-05, + "loss": 4.6255, + "step": 26272 + }, + { + "epoch": 0.1562529736416405, + "grad_norm": 1.4940049648284912, + "learning_rate": 4.704814170210975e-05, + "loss": 4.7496, + "step": 26273 + }, + { + "epoch": 0.1562589209249215, + "grad_norm": 1.6519593000411987, + "learning_rate": 4.704792151338382e-05, + "loss": 4.7485, + "step": 26274 + }, + { + "epoch": 0.15626486820820248, + "grad_norm": 2.30234956741333, + "learning_rate": 4.704770131696119e-05, + "loss": 4.6089, + "step": 26275 + }, + { + "epoch": 0.1562708154914835, + "grad_norm": 1.6795179843902588, + "learning_rate": 4.704748111284193e-05, + "loss": 5.2412, + "step": 26276 + }, + { + "epoch": 0.1562767627747645, + "grad_norm": 2.194812536239624, + "learning_rate": 4.7047260901026124e-05, + "loss": 5.156, + "step": 26277 + }, + { + "epoch": 0.15628271005804548, + "grad_norm": 2.5557010173797607, + "learning_rate": 4.704704068151385e-05, + "loss": 4.5438, + "step": 26278 + }, + { + "epoch": 0.1562886573413265, + "grad_norm": 1.95830237865448, + "learning_rate": 4.704682045430518e-05, + "loss": 4.6183, + "step": 26279 + }, + { + "epoch": 0.15629460462460748, + "grad_norm": 2.1255557537078857, + "learning_rate": 4.704660021940019e-05, + "loss": 4.5619, + "step": 26280 + }, + { + "epoch": 0.15630055190788847, + "grad_norm": 1.6092948913574219, + "learning_rate": 4.704637997679896e-05, + "loss": 5.64, + "step": 26281 + }, + { + "epoch": 0.15630649919116948, + "grad_norm": 2.1546456813812256, + "learning_rate": 4.704615972650157e-05, + "loss": 4.9573, + "step": 26282 + }, + { + "epoch": 0.15631244647445047, + "grad_norm": 2.154639959335327, + "learning_rate": 4.7045939468508095e-05, + "loss": 4.4704, + "step": 26283 + }, + { + "epoch": 0.15631839375773146, + "grad_norm": 1.819509744644165, + "learning_rate": 4.7045719202818605e-05, + "loss": 4.6245, + "step": 26284 + }, + { + "epoch": 0.15632434104101248, + "grad_norm": 2.337667942047119, + "learning_rate": 4.704549892943318e-05, + "loss": 4.4268, + "step": 26285 + }, + { + "epoch": 0.15633028832429346, + "grad_norm": 2.308842658996582, + "learning_rate": 4.704527864835191e-05, + "loss": 4.7084, + "step": 26286 + }, + { + "epoch": 0.15633623560757445, + "grad_norm": 1.664182424545288, + "learning_rate": 4.704505835957486e-05, + "loss": 5.2576, + "step": 26287 + }, + { + "epoch": 0.15634218289085547, + "grad_norm": 1.7331715822219849, + "learning_rate": 4.7044838063102096e-05, + "loss": 5.3069, + "step": 26288 + }, + { + "epoch": 0.15634813017413646, + "grad_norm": 1.4833427667617798, + "learning_rate": 4.7044617758933714e-05, + "loss": 4.8484, + "step": 26289 + }, + { + "epoch": 0.15635407745741745, + "grad_norm": 2.975609064102173, + "learning_rate": 4.704439744706978e-05, + "loss": 5.5747, + "step": 26290 + }, + { + "epoch": 0.15636002474069846, + "grad_norm": 1.8256950378417969, + "learning_rate": 4.704417712751038e-05, + "loss": 5.2464, + "step": 26291 + }, + { + "epoch": 0.15636597202397945, + "grad_norm": 1.5019065141677856, + "learning_rate": 4.7043956800255585e-05, + "loss": 5.5261, + "step": 26292 + }, + { + "epoch": 0.15637191930726044, + "grad_norm": 1.4906537532806396, + "learning_rate": 4.7043736465305464e-05, + "loss": 5.38, + "step": 26293 + }, + { + "epoch": 0.15637786659054145, + "grad_norm": 1.601969599723816, + "learning_rate": 4.704351612266012e-05, + "loss": 5.2111, + "step": 26294 + }, + { + "epoch": 0.15638381387382244, + "grad_norm": 1.5806862115859985, + "learning_rate": 4.70432957723196e-05, + "loss": 5.5473, + "step": 26295 + }, + { + "epoch": 0.15638976115710343, + "grad_norm": 1.5971914529800415, + "learning_rate": 4.7043075414283986e-05, + "loss": 5.4841, + "step": 26296 + }, + { + "epoch": 0.15639570844038445, + "grad_norm": 1.6458126306533813, + "learning_rate": 4.704285504855337e-05, + "loss": 5.3215, + "step": 26297 + }, + { + "epoch": 0.15640165572366543, + "grad_norm": 1.5553637742996216, + "learning_rate": 4.704263467512782e-05, + "loss": 5.4461, + "step": 26298 + }, + { + "epoch": 0.15640760300694642, + "grad_norm": 1.447519063949585, + "learning_rate": 4.704241429400742e-05, + "loss": 5.3617, + "step": 26299 + }, + { + "epoch": 0.15641355029022744, + "grad_norm": 1.5533196926116943, + "learning_rate": 4.704219390519223e-05, + "loss": 4.8446, + "step": 26300 + }, + { + "epoch": 0.15641949757350843, + "grad_norm": 1.5320333242416382, + "learning_rate": 4.7041973508682344e-05, + "loss": 5.3333, + "step": 26301 + }, + { + "epoch": 0.15642544485678941, + "grad_norm": 1.6192045211791992, + "learning_rate": 4.704175310447784e-05, + "loss": 5.221, + "step": 26302 + }, + { + "epoch": 0.1564313921400704, + "grad_norm": 1.4964373111724854, + "learning_rate": 4.704153269257878e-05, + "loss": 5.3061, + "step": 26303 + }, + { + "epoch": 0.15643733942335142, + "grad_norm": 1.6173138618469238, + "learning_rate": 4.704131227298525e-05, + "loss": 5.3485, + "step": 26304 + }, + { + "epoch": 0.1564432867066324, + "grad_norm": 1.511825680732727, + "learning_rate": 4.704109184569733e-05, + "loss": 5.2024, + "step": 26305 + }, + { + "epoch": 0.1564492339899134, + "grad_norm": 1.5368350744247437, + "learning_rate": 4.704087141071508e-05, + "loss": 5.3867, + "step": 26306 + }, + { + "epoch": 0.1564551812731944, + "grad_norm": 1.612384557723999, + "learning_rate": 4.7040650968038605e-05, + "loss": 5.1923, + "step": 26307 + }, + { + "epoch": 0.1564611285564754, + "grad_norm": 1.5889664888381958, + "learning_rate": 4.704043051766795e-05, + "loss": 5.0457, + "step": 26308 + }, + { + "epoch": 0.1564670758397564, + "grad_norm": 1.5363719463348389, + "learning_rate": 4.704021005960322e-05, + "loss": 5.3852, + "step": 26309 + }, + { + "epoch": 0.1564730231230374, + "grad_norm": 1.5099613666534424, + "learning_rate": 4.703998959384447e-05, + "loss": 5.8659, + "step": 26310 + }, + { + "epoch": 0.1564789704063184, + "grad_norm": 1.5517312288284302, + "learning_rate": 4.70397691203918e-05, + "loss": 6.0298, + "step": 26311 + }, + { + "epoch": 0.15648491768959938, + "grad_norm": 1.616828441619873, + "learning_rate": 4.703954863924527e-05, + "loss": 4.8686, + "step": 26312 + }, + { + "epoch": 0.1564908649728804, + "grad_norm": 1.4939557313919067, + "learning_rate": 4.703932815040496e-05, + "loss": 5.3872, + "step": 26313 + }, + { + "epoch": 0.15649681225616138, + "grad_norm": 1.444994568824768, + "learning_rate": 4.7039107653870954e-05, + "loss": 5.38, + "step": 26314 + }, + { + "epoch": 0.15650275953944237, + "grad_norm": 1.7697070837020874, + "learning_rate": 4.7038887149643304e-05, + "loss": 5.6994, + "step": 26315 + }, + { + "epoch": 0.1565087068227234, + "grad_norm": 1.628763198852539, + "learning_rate": 4.703866663772213e-05, + "loss": 5.5986, + "step": 26316 + }, + { + "epoch": 0.15651465410600438, + "grad_norm": 1.5433357954025269, + "learning_rate": 4.703844611810747e-05, + "loss": 5.5968, + "step": 26317 + }, + { + "epoch": 0.15652060138928536, + "grad_norm": 1.452527403831482, + "learning_rate": 4.7038225590799424e-05, + "loss": 5.5669, + "step": 26318 + }, + { + "epoch": 0.15652654867256638, + "grad_norm": 1.6079583168029785, + "learning_rate": 4.703800505579806e-05, + "loss": 5.2624, + "step": 26319 + }, + { + "epoch": 0.15653249595584737, + "grad_norm": 1.4639090299606323, + "learning_rate": 4.703778451310345e-05, + "loss": 5.4219, + "step": 26320 + }, + { + "epoch": 0.15653844323912836, + "grad_norm": 1.7064789533615112, + "learning_rate": 4.703756396271568e-05, + "loss": 5.055, + "step": 26321 + }, + { + "epoch": 0.15654439052240937, + "grad_norm": 1.596901297569275, + "learning_rate": 4.7037343404634824e-05, + "loss": 6.4061, + "step": 26322 + }, + { + "epoch": 0.15655033780569036, + "grad_norm": 1.4072599411010742, + "learning_rate": 4.703712283886097e-05, + "loss": 5.4348, + "step": 26323 + }, + { + "epoch": 0.15655628508897135, + "grad_norm": 1.4027669429779053, + "learning_rate": 4.703690226539417e-05, + "loss": 5.285, + "step": 26324 + }, + { + "epoch": 0.15656223237225236, + "grad_norm": 1.3492887020111084, + "learning_rate": 4.703668168423452e-05, + "loss": 5.2334, + "step": 26325 + }, + { + "epoch": 0.15656817965553335, + "grad_norm": 1.5650583505630493, + "learning_rate": 4.703646109538209e-05, + "loss": 5.3706, + "step": 26326 + }, + { + "epoch": 0.15657412693881434, + "grad_norm": 1.549395203590393, + "learning_rate": 4.703624049883696e-05, + "loss": 5.3483, + "step": 26327 + }, + { + "epoch": 0.15658007422209536, + "grad_norm": 1.5657979249954224, + "learning_rate": 4.70360198945992e-05, + "loss": 5.2897, + "step": 26328 + }, + { + "epoch": 0.15658602150537634, + "grad_norm": 1.3859858512878418, + "learning_rate": 4.7035799282668906e-05, + "loss": 5.3292, + "step": 26329 + }, + { + "epoch": 0.15659196878865733, + "grad_norm": 1.8330230712890625, + "learning_rate": 4.7035578663046136e-05, + "loss": 5.6592, + "step": 26330 + }, + { + "epoch": 0.15659791607193835, + "grad_norm": 1.6347804069519043, + "learning_rate": 4.703535803573097e-05, + "loss": 5.5734, + "step": 26331 + }, + { + "epoch": 0.15660386335521934, + "grad_norm": 1.615646481513977, + "learning_rate": 4.7035137400723496e-05, + "loss": 5.8483, + "step": 26332 + }, + { + "epoch": 0.15660981063850032, + "grad_norm": 1.7376673221588135, + "learning_rate": 4.703491675802378e-05, + "loss": 5.327, + "step": 26333 + }, + { + "epoch": 0.15661575792178134, + "grad_norm": 2.2167186737060547, + "learning_rate": 4.70346961076319e-05, + "loss": 4.6295, + "step": 26334 + }, + { + "epoch": 0.15662170520506233, + "grad_norm": 1.8190215826034546, + "learning_rate": 4.703447544954794e-05, + "loss": 4.6977, + "step": 26335 + }, + { + "epoch": 0.15662765248834332, + "grad_norm": 1.8056445121765137, + "learning_rate": 4.703425478377197e-05, + "loss": 4.7828, + "step": 26336 + }, + { + "epoch": 0.15663359977162433, + "grad_norm": 1.3003071546554565, + "learning_rate": 4.7034034110304056e-05, + "loss": 5.3244, + "step": 26337 + }, + { + "epoch": 0.15663954705490532, + "grad_norm": 1.5494154691696167, + "learning_rate": 4.703381342914431e-05, + "loss": 5.2614, + "step": 26338 + }, + { + "epoch": 0.1566454943381863, + "grad_norm": 1.4443477392196655, + "learning_rate": 4.703359274029278e-05, + "loss": 5.6987, + "step": 26339 + }, + { + "epoch": 0.15665144162146732, + "grad_norm": 1.6877416372299194, + "learning_rate": 4.703337204374955e-05, + "loss": 5.0908, + "step": 26340 + }, + { + "epoch": 0.1566573889047483, + "grad_norm": 1.7778805494308472, + "learning_rate": 4.703315133951469e-05, + "loss": 5.067, + "step": 26341 + }, + { + "epoch": 0.1566633361880293, + "grad_norm": 1.8032246828079224, + "learning_rate": 4.703293062758829e-05, + "loss": 5.2325, + "step": 26342 + }, + { + "epoch": 0.15666928347131032, + "grad_norm": 1.6244032382965088, + "learning_rate": 4.703270990797042e-05, + "loss": 4.7988, + "step": 26343 + }, + { + "epoch": 0.1566752307545913, + "grad_norm": 2.212272882461548, + "learning_rate": 4.7032489180661154e-05, + "loss": 4.6136, + "step": 26344 + }, + { + "epoch": 0.1566811780378723, + "grad_norm": 1.4413294792175293, + "learning_rate": 4.703226844566059e-05, + "loss": 5.1378, + "step": 26345 + }, + { + "epoch": 0.1566871253211533, + "grad_norm": 1.7251073122024536, + "learning_rate": 4.703204770296877e-05, + "loss": 4.8629, + "step": 26346 + }, + { + "epoch": 0.1566930726044343, + "grad_norm": 1.8171210289001465, + "learning_rate": 4.70318269525858e-05, + "loss": 4.8487, + "step": 26347 + }, + { + "epoch": 0.15669901988771529, + "grad_norm": 1.7784240245819092, + "learning_rate": 4.703160619451175e-05, + "loss": 5.3187, + "step": 26348 + }, + { + "epoch": 0.1567049671709963, + "grad_norm": 1.7092580795288086, + "learning_rate": 4.703138542874669e-05, + "loss": 5.0771, + "step": 26349 + }, + { + "epoch": 0.1567109144542773, + "grad_norm": 1.4181660413742065, + "learning_rate": 4.7031164655290695e-05, + "loss": 5.3487, + "step": 26350 + }, + { + "epoch": 0.15671686173755828, + "grad_norm": 1.6292651891708374, + "learning_rate": 4.703094387414385e-05, + "loss": 5.2079, + "step": 26351 + }, + { + "epoch": 0.1567228090208393, + "grad_norm": 1.5617179870605469, + "learning_rate": 4.703072308530624e-05, + "loss": 5.3438, + "step": 26352 + }, + { + "epoch": 0.15672875630412028, + "grad_norm": 1.8505250215530396, + "learning_rate": 4.703050228877792e-05, + "loss": 5.223, + "step": 26353 + }, + { + "epoch": 0.15673470358740127, + "grad_norm": 1.2503677606582642, + "learning_rate": 4.7030281484558984e-05, + "loss": 4.7168, + "step": 26354 + }, + { + "epoch": 0.15674065087068229, + "grad_norm": 1.4453564882278442, + "learning_rate": 4.70300606726495e-05, + "loss": 5.3493, + "step": 26355 + }, + { + "epoch": 0.15674659815396327, + "grad_norm": 1.305949091911316, + "learning_rate": 4.702983985304956e-05, + "loss": 5.0599, + "step": 26356 + }, + { + "epoch": 0.15675254543724426, + "grad_norm": 2.160369634628296, + "learning_rate": 4.702961902575923e-05, + "loss": 4.2452, + "step": 26357 + }, + { + "epoch": 0.15675849272052528, + "grad_norm": 4.334263324737549, + "learning_rate": 4.7029398190778574e-05, + "loss": 2.7403, + "step": 26358 + }, + { + "epoch": 0.15676444000380627, + "grad_norm": 2.7898688316345215, + "learning_rate": 4.702917734810769e-05, + "loss": 2.7024, + "step": 26359 + }, + { + "epoch": 0.15677038728708725, + "grad_norm": 2.939950466156006, + "learning_rate": 4.702895649774665e-05, + "loss": 2.5659, + "step": 26360 + }, + { + "epoch": 0.15677633457036824, + "grad_norm": 2.2159571647644043, + "learning_rate": 4.702873563969553e-05, + "loss": 4.2729, + "step": 26361 + }, + { + "epoch": 0.15678228185364926, + "grad_norm": 1.4781655073165894, + "learning_rate": 4.7028514773954404e-05, + "loss": 4.7654, + "step": 26362 + }, + { + "epoch": 0.15678822913693025, + "grad_norm": 3.3153202533721924, + "learning_rate": 4.702829390052335e-05, + "loss": 4.055, + "step": 26363 + }, + { + "epoch": 0.15679417642021123, + "grad_norm": 4.366955757141113, + "learning_rate": 4.7028073019402446e-05, + "loss": 2.463, + "step": 26364 + }, + { + "epoch": 0.15680012370349225, + "grad_norm": 3.7748520374298096, + "learning_rate": 4.702785213059177e-05, + "loss": 2.8617, + "step": 26365 + }, + { + "epoch": 0.15680607098677324, + "grad_norm": 3.252652645111084, + "learning_rate": 4.7027631234091394e-05, + "loss": 2.8654, + "step": 26366 + }, + { + "epoch": 0.15681201827005423, + "grad_norm": 3.4591829776763916, + "learning_rate": 4.7027410329901414e-05, + "loss": 3.3268, + "step": 26367 + }, + { + "epoch": 0.15681796555333524, + "grad_norm": 2.971773624420166, + "learning_rate": 4.702718941802188e-05, + "loss": 2.835, + "step": 26368 + }, + { + "epoch": 0.15682391283661623, + "grad_norm": 2.8094983100891113, + "learning_rate": 4.7026968498452884e-05, + "loss": 3.5431, + "step": 26369 + }, + { + "epoch": 0.15682986011989722, + "grad_norm": 3.014570474624634, + "learning_rate": 4.7026747571194496e-05, + "loss": 3.2034, + "step": 26370 + }, + { + "epoch": 0.15683580740317823, + "grad_norm": 3.1913933753967285, + "learning_rate": 4.7026526636246805e-05, + "loss": 2.944, + "step": 26371 + }, + { + "epoch": 0.15684175468645922, + "grad_norm": 3.0981903076171875, + "learning_rate": 4.7026305693609884e-05, + "loss": 3.1399, + "step": 26372 + }, + { + "epoch": 0.1568477019697402, + "grad_norm": 2.7449357509613037, + "learning_rate": 4.70260847432838e-05, + "loss": 2.9713, + "step": 26373 + }, + { + "epoch": 0.15685364925302123, + "grad_norm": 2.5030126571655273, + "learning_rate": 4.7025863785268645e-05, + "loss": 4.1367, + "step": 26374 + }, + { + "epoch": 0.15685959653630221, + "grad_norm": 1.7585763931274414, + "learning_rate": 4.7025642819564476e-05, + "loss": 5.4266, + "step": 26375 + }, + { + "epoch": 0.1568655438195832, + "grad_norm": 1.6513370275497437, + "learning_rate": 4.702542184617139e-05, + "loss": 5.4329, + "step": 26376 + }, + { + "epoch": 0.15687149110286422, + "grad_norm": 1.381144404411316, + "learning_rate": 4.702520086508946e-05, + "loss": 5.2046, + "step": 26377 + }, + { + "epoch": 0.1568774383861452, + "grad_norm": 1.9510244131088257, + "learning_rate": 4.702497987631875e-05, + "loss": 5.365, + "step": 26378 + }, + { + "epoch": 0.1568833856694262, + "grad_norm": 2.6427478790283203, + "learning_rate": 4.702475887985936e-05, + "loss": 4.8551, + "step": 26379 + }, + { + "epoch": 0.1568893329527072, + "grad_norm": 1.9253584146499634, + "learning_rate": 4.702453787571135e-05, + "loss": 4.7738, + "step": 26380 + }, + { + "epoch": 0.1568952802359882, + "grad_norm": 1.9647809267044067, + "learning_rate": 4.7024316863874795e-05, + "loss": 5.0153, + "step": 26381 + }, + { + "epoch": 0.1569012275192692, + "grad_norm": 1.7858566045761108, + "learning_rate": 4.7024095844349786e-05, + "loss": 5.4806, + "step": 26382 + }, + { + "epoch": 0.1569071748025502, + "grad_norm": 1.5491056442260742, + "learning_rate": 4.7023874817136395e-05, + "loss": 5.1898, + "step": 26383 + }, + { + "epoch": 0.1569131220858312, + "grad_norm": 1.4932126998901367, + "learning_rate": 4.702365378223469e-05, + "loss": 5.3636, + "step": 26384 + }, + { + "epoch": 0.15691906936911218, + "grad_norm": 1.5436698198318481, + "learning_rate": 4.702343273964475e-05, + "loss": 5.2469, + "step": 26385 + }, + { + "epoch": 0.1569250166523932, + "grad_norm": 1.9735430479049683, + "learning_rate": 4.7023211689366666e-05, + "loss": 5.111, + "step": 26386 + }, + { + "epoch": 0.15693096393567418, + "grad_norm": 1.4643042087554932, + "learning_rate": 4.70229906314005e-05, + "loss": 4.9215, + "step": 26387 + }, + { + "epoch": 0.15693691121895517, + "grad_norm": 2.3229660987854004, + "learning_rate": 4.7022769565746345e-05, + "loss": 4.7726, + "step": 26388 + }, + { + "epoch": 0.1569428585022362, + "grad_norm": 4.978843688964844, + "learning_rate": 4.7022548492404264e-05, + "loss": 4.1208, + "step": 26389 + }, + { + "epoch": 0.15694880578551718, + "grad_norm": 4.040123462677002, + "learning_rate": 4.702232741137434e-05, + "loss": 4.6272, + "step": 26390 + }, + { + "epoch": 0.15695475306879816, + "grad_norm": 1.6977242231369019, + "learning_rate": 4.7022106322656643e-05, + "loss": 5.0605, + "step": 26391 + }, + { + "epoch": 0.15696070035207918, + "grad_norm": 2.055257558822632, + "learning_rate": 4.702188522625126e-05, + "loss": 4.9685, + "step": 26392 + }, + { + "epoch": 0.15696664763536017, + "grad_norm": 1.5921961069107056, + "learning_rate": 4.7021664122158264e-05, + "loss": 5.1433, + "step": 26393 + }, + { + "epoch": 0.15697259491864116, + "grad_norm": 1.5311743021011353, + "learning_rate": 4.7021443010377734e-05, + "loss": 5.2865, + "step": 26394 + }, + { + "epoch": 0.15697854220192217, + "grad_norm": 1.4683947563171387, + "learning_rate": 4.702122189090975e-05, + "loss": 5.2697, + "step": 26395 + }, + { + "epoch": 0.15698448948520316, + "grad_norm": 1.5425411462783813, + "learning_rate": 4.702100076375438e-05, + "loss": 5.5033, + "step": 26396 + }, + { + "epoch": 0.15699043676848415, + "grad_norm": 1.8671424388885498, + "learning_rate": 4.70207796289117e-05, + "loss": 4.544, + "step": 26397 + }, + { + "epoch": 0.15699638405176516, + "grad_norm": 2.107107400894165, + "learning_rate": 4.70205584863818e-05, + "loss": 4.2386, + "step": 26398 + }, + { + "epoch": 0.15700233133504615, + "grad_norm": 1.6025463342666626, + "learning_rate": 4.7020337336164746e-05, + "loss": 5.742, + "step": 26399 + }, + { + "epoch": 0.15700827861832714, + "grad_norm": 1.4157508611679077, + "learning_rate": 4.702011617826063e-05, + "loss": 6.2568, + "step": 26400 + }, + { + "epoch": 0.15701422590160816, + "grad_norm": 1.4367010593414307, + "learning_rate": 4.701989501266951e-05, + "loss": 6.0992, + "step": 26401 + }, + { + "epoch": 0.15702017318488914, + "grad_norm": 1.7271238565444946, + "learning_rate": 4.7019673839391476e-05, + "loss": 4.9925, + "step": 26402 + }, + { + "epoch": 0.15702612046817013, + "grad_norm": 1.4689936637878418, + "learning_rate": 4.70194526584266e-05, + "loss": 5.1224, + "step": 26403 + }, + { + "epoch": 0.15703206775145115, + "grad_norm": 1.816994071006775, + "learning_rate": 4.701923146977496e-05, + "loss": 4.5333, + "step": 26404 + }, + { + "epoch": 0.15703801503473214, + "grad_norm": 1.6789166927337646, + "learning_rate": 4.7019010273436634e-05, + "loss": 4.9303, + "step": 26405 + }, + { + "epoch": 0.15704396231801313, + "grad_norm": 1.8921838998794556, + "learning_rate": 4.70187890694117e-05, + "loss": 4.3924, + "step": 26406 + }, + { + "epoch": 0.15704990960129414, + "grad_norm": 2.397531270980835, + "learning_rate": 4.701856785770024e-05, + "loss": 3.317, + "step": 26407 + }, + { + "epoch": 0.15705585688457513, + "grad_norm": 2.1896491050720215, + "learning_rate": 4.7018346638302314e-05, + "loss": 4.2621, + "step": 26408 + }, + { + "epoch": 0.15706180416785612, + "grad_norm": 1.5073274374008179, + "learning_rate": 4.7018125411218014e-05, + "loss": 5.238, + "step": 26409 + }, + { + "epoch": 0.15706775145113713, + "grad_norm": 1.672512173652649, + "learning_rate": 4.701790417644741e-05, + "loss": 5.0822, + "step": 26410 + }, + { + "epoch": 0.15707369873441812, + "grad_norm": 1.6251648664474487, + "learning_rate": 4.701768293399059e-05, + "loss": 5.3444, + "step": 26411 + }, + { + "epoch": 0.1570796460176991, + "grad_norm": 1.8805150985717773, + "learning_rate": 4.701746168384763e-05, + "loss": 4.8765, + "step": 26412 + }, + { + "epoch": 0.15708559330098013, + "grad_norm": 1.7325724363327026, + "learning_rate": 4.701724042601859e-05, + "loss": 5.3281, + "step": 26413 + }, + { + "epoch": 0.1570915405842611, + "grad_norm": 1.5105476379394531, + "learning_rate": 4.701701916050357e-05, + "loss": 5.2577, + "step": 26414 + }, + { + "epoch": 0.1570974878675421, + "grad_norm": 1.766034722328186, + "learning_rate": 4.701679788730263e-05, + "loss": 4.8186, + "step": 26415 + }, + { + "epoch": 0.15710343515082312, + "grad_norm": 1.5909993648529053, + "learning_rate": 4.701657660641585e-05, + "loss": 4.9077, + "step": 26416 + }, + { + "epoch": 0.1571093824341041, + "grad_norm": 1.663878083229065, + "learning_rate": 4.7016355317843316e-05, + "loss": 5.3196, + "step": 26417 + }, + { + "epoch": 0.1571153297173851, + "grad_norm": 1.8101507425308228, + "learning_rate": 4.7016134021585095e-05, + "loss": 4.7219, + "step": 26418 + }, + { + "epoch": 0.15712127700066608, + "grad_norm": 1.3929054737091064, + "learning_rate": 4.7015912717641276e-05, + "loss": 5.169, + "step": 26419 + }, + { + "epoch": 0.1571272242839471, + "grad_norm": 1.6896204948425293, + "learning_rate": 4.701569140601192e-05, + "loss": 4.9141, + "step": 26420 + }, + { + "epoch": 0.15713317156722809, + "grad_norm": 2.3035976886749268, + "learning_rate": 4.7015470086697124e-05, + "loss": 4.4289, + "step": 26421 + }, + { + "epoch": 0.15713911885050907, + "grad_norm": 1.8286256790161133, + "learning_rate": 4.701524875969695e-05, + "loss": 4.7177, + "step": 26422 + }, + { + "epoch": 0.1571450661337901, + "grad_norm": 1.7254390716552734, + "learning_rate": 4.701502742501147e-05, + "loss": 3.99, + "step": 26423 + }, + { + "epoch": 0.15715101341707108, + "grad_norm": 1.6733616590499878, + "learning_rate": 4.701480608264078e-05, + "loss": 5.4146, + "step": 26424 + }, + { + "epoch": 0.15715696070035207, + "grad_norm": 2.167525291442871, + "learning_rate": 4.701458473258496e-05, + "loss": 5.751, + "step": 26425 + }, + { + "epoch": 0.15716290798363308, + "grad_norm": 1.5784038305282593, + "learning_rate": 4.7014363374844064e-05, + "loss": 5.2341, + "step": 26426 + }, + { + "epoch": 0.15716885526691407, + "grad_norm": 1.6087944507598877, + "learning_rate": 4.7014142009418176e-05, + "loss": 4.6644, + "step": 26427 + }, + { + "epoch": 0.15717480255019506, + "grad_norm": 2.1396427154541016, + "learning_rate": 4.701392063630739e-05, + "loss": 4.7034, + "step": 26428 + }, + { + "epoch": 0.15718074983347607, + "grad_norm": 2.069359540939331, + "learning_rate": 4.701369925551177e-05, + "loss": 4.1612, + "step": 26429 + }, + { + "epoch": 0.15718669711675706, + "grad_norm": 2.0008041858673096, + "learning_rate": 4.7013477867031385e-05, + "loss": 4.3536, + "step": 26430 + }, + { + "epoch": 0.15719264440003805, + "grad_norm": 1.9997189044952393, + "learning_rate": 4.701325647086633e-05, + "loss": 4.4613, + "step": 26431 + }, + { + "epoch": 0.15719859168331907, + "grad_norm": 1.625603437423706, + "learning_rate": 4.701303506701667e-05, + "loss": 4.63, + "step": 26432 + }, + { + "epoch": 0.15720453896660005, + "grad_norm": 1.5895150899887085, + "learning_rate": 4.701281365548249e-05, + "loss": 4.884, + "step": 26433 + }, + { + "epoch": 0.15721048624988104, + "grad_norm": 1.6569048166275024, + "learning_rate": 4.7012592236263865e-05, + "loss": 4.5834, + "step": 26434 + }, + { + "epoch": 0.15721643353316206, + "grad_norm": 1.9942916631698608, + "learning_rate": 4.7012370809360874e-05, + "loss": 4.8536, + "step": 26435 + }, + { + "epoch": 0.15722238081644305, + "grad_norm": 1.7535972595214844, + "learning_rate": 4.701214937477359e-05, + "loss": 4.9008, + "step": 26436 + }, + { + "epoch": 0.15722832809972404, + "grad_norm": 1.9767074584960938, + "learning_rate": 4.7011927932502085e-05, + "loss": 5.4972, + "step": 26437 + }, + { + "epoch": 0.15723427538300505, + "grad_norm": 1.6117023229599, + "learning_rate": 4.701170648254645e-05, + "loss": 5.2583, + "step": 26438 + }, + { + "epoch": 0.15724022266628604, + "grad_norm": 1.6277034282684326, + "learning_rate": 4.7011485024906754e-05, + "loss": 5.0635, + "step": 26439 + }, + { + "epoch": 0.15724616994956703, + "grad_norm": 1.5075265169143677, + "learning_rate": 4.701126355958308e-05, + "loss": 5.2974, + "step": 26440 + }, + { + "epoch": 0.15725211723284804, + "grad_norm": 1.377233862876892, + "learning_rate": 4.70110420865755e-05, + "loss": 5.0643, + "step": 26441 + }, + { + "epoch": 0.15725806451612903, + "grad_norm": 1.5468838214874268, + "learning_rate": 4.7010820605884085e-05, + "loss": 5.0746, + "step": 26442 + }, + { + "epoch": 0.15726401179941002, + "grad_norm": 1.864901065826416, + "learning_rate": 4.701059911750893e-05, + "loss": 5.0492, + "step": 26443 + }, + { + "epoch": 0.15726995908269104, + "grad_norm": 2.086214542388916, + "learning_rate": 4.70103776214501e-05, + "loss": 4.8566, + "step": 26444 + }, + { + "epoch": 0.15727590636597202, + "grad_norm": 1.571226716041565, + "learning_rate": 4.701015611770767e-05, + "loss": 4.7567, + "step": 26445 + }, + { + "epoch": 0.157281853649253, + "grad_norm": 2.299607753753662, + "learning_rate": 4.7009934606281726e-05, + "loss": 4.8576, + "step": 26446 + }, + { + "epoch": 0.15728780093253403, + "grad_norm": 2.019814968109131, + "learning_rate": 4.7009713087172335e-05, + "loss": 4.6524, + "step": 26447 + }, + { + "epoch": 0.15729374821581502, + "grad_norm": 1.8718371391296387, + "learning_rate": 4.700949156037959e-05, + "loss": 4.6629, + "step": 26448 + }, + { + "epoch": 0.157299695499096, + "grad_norm": 1.9023678302764893, + "learning_rate": 4.700927002590355e-05, + "loss": 4.8558, + "step": 26449 + }, + { + "epoch": 0.15730564278237702, + "grad_norm": 1.8519774675369263, + "learning_rate": 4.700904848374431e-05, + "loss": 4.8498, + "step": 26450 + }, + { + "epoch": 0.157311590065658, + "grad_norm": 2.1003715991973877, + "learning_rate": 4.7008826933901937e-05, + "loss": 4.9443, + "step": 26451 + }, + { + "epoch": 0.157317537348939, + "grad_norm": 1.8350003957748413, + "learning_rate": 4.7008605376376504e-05, + "loss": 4.9194, + "step": 26452 + }, + { + "epoch": 0.15732348463222, + "grad_norm": 1.9740381240844727, + "learning_rate": 4.70083838111681e-05, + "loss": 5.035, + "step": 26453 + }, + { + "epoch": 0.157329431915501, + "grad_norm": 1.8660650253295898, + "learning_rate": 4.700816223827679e-05, + "loss": 4.7712, + "step": 26454 + }, + { + "epoch": 0.157335379198782, + "grad_norm": 2.6117658615112305, + "learning_rate": 4.700794065770266e-05, + "loss": 4.0286, + "step": 26455 + }, + { + "epoch": 0.157341326482063, + "grad_norm": 2.0968191623687744, + "learning_rate": 4.700771906944579e-05, + "loss": 4.505, + "step": 26456 + }, + { + "epoch": 0.157347273765344, + "grad_norm": 2.0062074661254883, + "learning_rate": 4.700749747350624e-05, + "loss": 4.806, + "step": 26457 + }, + { + "epoch": 0.15735322104862498, + "grad_norm": 1.8398696184158325, + "learning_rate": 4.700727586988412e-05, + "loss": 4.799, + "step": 26458 + }, + { + "epoch": 0.157359168331906, + "grad_norm": 1.8096837997436523, + "learning_rate": 4.7007054258579474e-05, + "loss": 5.0503, + "step": 26459 + }, + { + "epoch": 0.15736511561518698, + "grad_norm": 1.735893726348877, + "learning_rate": 4.7006832639592396e-05, + "loss": 5.037, + "step": 26460 + }, + { + "epoch": 0.15737106289846797, + "grad_norm": 1.9189250469207764, + "learning_rate": 4.7006611012922966e-05, + "loss": 5.3352, + "step": 26461 + }, + { + "epoch": 0.157377010181749, + "grad_norm": 2.387317657470703, + "learning_rate": 4.7006389378571246e-05, + "loss": 4.055, + "step": 26462 + }, + { + "epoch": 0.15738295746502998, + "grad_norm": 2.414651870727539, + "learning_rate": 4.7006167736537323e-05, + "loss": 3.7756, + "step": 26463 + }, + { + "epoch": 0.15738890474831096, + "grad_norm": 2.497237205505371, + "learning_rate": 4.700594608682127e-05, + "loss": 3.7823, + "step": 26464 + }, + { + "epoch": 0.15739485203159198, + "grad_norm": 2.2141029834747314, + "learning_rate": 4.700572442942318e-05, + "loss": 4.1131, + "step": 26465 + }, + { + "epoch": 0.15740079931487297, + "grad_norm": 1.8615038394927979, + "learning_rate": 4.700550276434312e-05, + "loss": 4.8686, + "step": 26466 + }, + { + "epoch": 0.15740674659815396, + "grad_norm": 1.7082819938659668, + "learning_rate": 4.700528109158115e-05, + "loss": 5.2237, + "step": 26467 + }, + { + "epoch": 0.15741269388143497, + "grad_norm": 1.8039544820785522, + "learning_rate": 4.700505941113739e-05, + "loss": 4.5243, + "step": 26468 + }, + { + "epoch": 0.15741864116471596, + "grad_norm": 1.874585509300232, + "learning_rate": 4.700483772301187e-05, + "loss": 4.7674, + "step": 26469 + }, + { + "epoch": 0.15742458844799695, + "grad_norm": 2.083904266357422, + "learning_rate": 4.70046160272047e-05, + "loss": 4.8949, + "step": 26470 + }, + { + "epoch": 0.15743053573127797, + "grad_norm": 1.3937793970108032, + "learning_rate": 4.700439432371593e-05, + "loss": 5.6113, + "step": 26471 + }, + { + "epoch": 0.15743648301455895, + "grad_norm": 1.924481987953186, + "learning_rate": 4.700417261254567e-05, + "loss": 5.1439, + "step": 26472 + }, + { + "epoch": 0.15744243029783994, + "grad_norm": 1.6527281999588013, + "learning_rate": 4.700395089369397e-05, + "loss": 5.6962, + "step": 26473 + }, + { + "epoch": 0.15744837758112096, + "grad_norm": 1.5053030252456665, + "learning_rate": 4.700372916716093e-05, + "loss": 4.7299, + "step": 26474 + }, + { + "epoch": 0.15745432486440195, + "grad_norm": 1.2048367261886597, + "learning_rate": 4.7003507432946604e-05, + "loss": 5.5429, + "step": 26475 + }, + { + "epoch": 0.15746027214768293, + "grad_norm": 1.3451159000396729, + "learning_rate": 4.700328569105108e-05, + "loss": 5.5326, + "step": 26476 + }, + { + "epoch": 0.15746621943096392, + "grad_norm": 1.4441956281661987, + "learning_rate": 4.700306394147445e-05, + "loss": 5.5795, + "step": 26477 + }, + { + "epoch": 0.15747216671424494, + "grad_norm": 1.5551849603652954, + "learning_rate": 4.700284218421676e-05, + "loss": 5.2977, + "step": 26478 + }, + { + "epoch": 0.15747811399752593, + "grad_norm": 1.713437795639038, + "learning_rate": 4.7002620419278115e-05, + "loss": 5.242, + "step": 26479 + }, + { + "epoch": 0.15748406128080691, + "grad_norm": 1.4137530326843262, + "learning_rate": 4.7002398646658586e-05, + "loss": 5.2396, + "step": 26480 + }, + { + "epoch": 0.15749000856408793, + "grad_norm": 1.846640706062317, + "learning_rate": 4.700217686635824e-05, + "loss": 4.926, + "step": 26481 + }, + { + "epoch": 0.15749595584736892, + "grad_norm": 2.2699780464172363, + "learning_rate": 4.7001955078377156e-05, + "loss": 3.8352, + "step": 26482 + }, + { + "epoch": 0.1575019031306499, + "grad_norm": 1.959821105003357, + "learning_rate": 4.700173328271543e-05, + "loss": 4.7261, + "step": 26483 + }, + { + "epoch": 0.15750785041393092, + "grad_norm": 1.5478743314743042, + "learning_rate": 4.700151147937312e-05, + "loss": 5.463, + "step": 26484 + }, + { + "epoch": 0.1575137976972119, + "grad_norm": 1.835830807685852, + "learning_rate": 4.7001289668350314e-05, + "loss": 4.9938, + "step": 26485 + }, + { + "epoch": 0.1575197449804929, + "grad_norm": 2.1762354373931885, + "learning_rate": 4.700106784964708e-05, + "loss": 4.0548, + "step": 26486 + }, + { + "epoch": 0.15752569226377391, + "grad_norm": 1.8922265768051147, + "learning_rate": 4.70008460232635e-05, + "loss": 4.1947, + "step": 26487 + }, + { + "epoch": 0.1575316395470549, + "grad_norm": 1.6450932025909424, + "learning_rate": 4.7000624189199646e-05, + "loss": 5.014, + "step": 26488 + }, + { + "epoch": 0.1575375868303359, + "grad_norm": 1.5196298360824585, + "learning_rate": 4.7000402347455616e-05, + "loss": 5.332, + "step": 26489 + }, + { + "epoch": 0.1575435341136169, + "grad_norm": 1.665044903755188, + "learning_rate": 4.700018049803146e-05, + "loss": 4.992, + "step": 26490 + }, + { + "epoch": 0.1575494813968979, + "grad_norm": 1.4281147718429565, + "learning_rate": 4.6999958640927275e-05, + "loss": 4.9014, + "step": 26491 + }, + { + "epoch": 0.15755542868017888, + "grad_norm": 1.4559162855148315, + "learning_rate": 4.6999736776143135e-05, + "loss": 4.9361, + "step": 26492 + }, + { + "epoch": 0.1575613759634599, + "grad_norm": 1.7235175371170044, + "learning_rate": 4.699951490367911e-05, + "loss": 5.2429, + "step": 26493 + }, + { + "epoch": 0.1575673232467409, + "grad_norm": 1.5422228574752808, + "learning_rate": 4.699929302353528e-05, + "loss": 5.5294, + "step": 26494 + }, + { + "epoch": 0.15757327053002188, + "grad_norm": 1.6905406713485718, + "learning_rate": 4.699907113571173e-05, + "loss": 5.0958, + "step": 26495 + }, + { + "epoch": 0.1575792178133029, + "grad_norm": 1.8692830801010132, + "learning_rate": 4.699884924020853e-05, + "loss": 4.7711, + "step": 26496 + }, + { + "epoch": 0.15758516509658388, + "grad_norm": 1.7128182649612427, + "learning_rate": 4.699862733702575e-05, + "loss": 5.344, + "step": 26497 + }, + { + "epoch": 0.15759111237986487, + "grad_norm": 1.7795850038528442, + "learning_rate": 4.6998405426163486e-05, + "loss": 5.044, + "step": 26498 + }, + { + "epoch": 0.15759705966314588, + "grad_norm": 1.8591927289962769, + "learning_rate": 4.6998183507621804e-05, + "loss": 5.7269, + "step": 26499 + }, + { + "epoch": 0.15760300694642687, + "grad_norm": 1.7289692163467407, + "learning_rate": 4.6997961581400785e-05, + "loss": 5.295, + "step": 26500 + }, + { + "epoch": 0.15760895422970786, + "grad_norm": 2.03056001663208, + "learning_rate": 4.699773964750049e-05, + "loss": 4.9402, + "step": 26501 + }, + { + "epoch": 0.15761490151298888, + "grad_norm": 1.7518073320388794, + "learning_rate": 4.699751770592104e-05, + "loss": 4.8934, + "step": 26502 + }, + { + "epoch": 0.15762084879626986, + "grad_norm": 1.7724835872650146, + "learning_rate": 4.6997295756662465e-05, + "loss": 4.6237, + "step": 26503 + }, + { + "epoch": 0.15762679607955085, + "grad_norm": 1.475229263305664, + "learning_rate": 4.699707379972485e-05, + "loss": 5.2655, + "step": 26504 + }, + { + "epoch": 0.15763274336283187, + "grad_norm": 1.4267539978027344, + "learning_rate": 4.69968518351083e-05, + "loss": 5.2016, + "step": 26505 + }, + { + "epoch": 0.15763869064611286, + "grad_norm": 2.1211252212524414, + "learning_rate": 4.699662986281288e-05, + "loss": 4.1632, + "step": 26506 + }, + { + "epoch": 0.15764463792939384, + "grad_norm": 2.0549299716949463, + "learning_rate": 4.699640788283866e-05, + "loss": 4.0886, + "step": 26507 + }, + { + "epoch": 0.15765058521267486, + "grad_norm": 2.210500717163086, + "learning_rate": 4.699618589518572e-05, + "loss": 4.3042, + "step": 26508 + }, + { + "epoch": 0.15765653249595585, + "grad_norm": 2.2884981632232666, + "learning_rate": 4.699596389985413e-05, + "loss": 4.178, + "step": 26509 + }, + { + "epoch": 0.15766247977923684, + "grad_norm": 2.24526047706604, + "learning_rate": 4.699574189684399e-05, + "loss": 4.2319, + "step": 26510 + }, + { + "epoch": 0.15766842706251785, + "grad_norm": 2.401103973388672, + "learning_rate": 4.699551988615535e-05, + "loss": 4.1215, + "step": 26511 + }, + { + "epoch": 0.15767437434579884, + "grad_norm": 2.3012118339538574, + "learning_rate": 4.699529786778831e-05, + "loss": 4.3254, + "step": 26512 + }, + { + "epoch": 0.15768032162907983, + "grad_norm": 1.963396668434143, + "learning_rate": 4.699507584174294e-05, + "loss": 4.4707, + "step": 26513 + }, + { + "epoch": 0.15768626891236084, + "grad_norm": 2.3375425338745117, + "learning_rate": 4.699485380801931e-05, + "loss": 4.2861, + "step": 26514 + }, + { + "epoch": 0.15769221619564183, + "grad_norm": 2.189077377319336, + "learning_rate": 4.699463176661751e-05, + "loss": 4.3273, + "step": 26515 + }, + { + "epoch": 0.15769816347892282, + "grad_norm": 1.8198938369750977, + "learning_rate": 4.699440971753761e-05, + "loss": 4.6847, + "step": 26516 + }, + { + "epoch": 0.15770411076220384, + "grad_norm": 1.646579623222351, + "learning_rate": 4.699418766077969e-05, + "loss": 5.126, + "step": 26517 + }, + { + "epoch": 0.15771005804548482, + "grad_norm": 2.0718090534210205, + "learning_rate": 4.6993965596343825e-05, + "loss": 4.5059, + "step": 26518 + }, + { + "epoch": 0.1577160053287658, + "grad_norm": 1.6022831201553345, + "learning_rate": 4.699374352423009e-05, + "loss": 5.5119, + "step": 26519 + }, + { + "epoch": 0.15772195261204683, + "grad_norm": 1.3838839530944824, + "learning_rate": 4.699352144443857e-05, + "loss": 5.0512, + "step": 26520 + }, + { + "epoch": 0.15772789989532782, + "grad_norm": 1.3122941255569458, + "learning_rate": 4.699329935696934e-05, + "loss": 5.1832, + "step": 26521 + }, + { + "epoch": 0.1577338471786088, + "grad_norm": 1.6332945823669434, + "learning_rate": 4.699307726182247e-05, + "loss": 5.081, + "step": 26522 + }, + { + "epoch": 0.15773979446188982, + "grad_norm": 1.5045149326324463, + "learning_rate": 4.699285515899805e-05, + "loss": 5.2076, + "step": 26523 + }, + { + "epoch": 0.1577457417451708, + "grad_norm": 1.4530036449432373, + "learning_rate": 4.699263304849615e-05, + "loss": 5.3623, + "step": 26524 + }, + { + "epoch": 0.1577516890284518, + "grad_norm": 1.6600695848464966, + "learning_rate": 4.699241093031685e-05, + "loss": 5.5862, + "step": 26525 + }, + { + "epoch": 0.1577576363117328, + "grad_norm": 1.6276617050170898, + "learning_rate": 4.6992188804460225e-05, + "loss": 5.282, + "step": 26526 + }, + { + "epoch": 0.1577635835950138, + "grad_norm": 1.7213892936706543, + "learning_rate": 4.6991966670926355e-05, + "loss": 5.4613, + "step": 26527 + }, + { + "epoch": 0.1577695308782948, + "grad_norm": 1.63749361038208, + "learning_rate": 4.6991744529715316e-05, + "loss": 5.4498, + "step": 26528 + }, + { + "epoch": 0.1577754781615758, + "grad_norm": 1.5182081460952759, + "learning_rate": 4.6991522380827184e-05, + "loss": 5.3962, + "step": 26529 + }, + { + "epoch": 0.1577814254448568, + "grad_norm": 1.6695536375045776, + "learning_rate": 4.699130022426204e-05, + "loss": 5.1221, + "step": 26530 + }, + { + "epoch": 0.15778737272813778, + "grad_norm": 1.4350519180297852, + "learning_rate": 4.6991078060019966e-05, + "loss": 5.319, + "step": 26531 + }, + { + "epoch": 0.1577933200114188, + "grad_norm": 1.2092465162277222, + "learning_rate": 4.699085588810103e-05, + "loss": 5.4316, + "step": 26532 + }, + { + "epoch": 0.15779926729469979, + "grad_norm": 1.474252700805664, + "learning_rate": 4.6990633708505304e-05, + "loss": 5.6559, + "step": 26533 + }, + { + "epoch": 0.15780521457798077, + "grad_norm": 1.6271101236343384, + "learning_rate": 4.699041152123289e-05, + "loss": 5.7491, + "step": 26534 + }, + { + "epoch": 0.1578111618612618, + "grad_norm": 1.6184288263320923, + "learning_rate": 4.699018932628384e-05, + "loss": 5.3195, + "step": 26535 + }, + { + "epoch": 0.15781710914454278, + "grad_norm": 1.3626726865768433, + "learning_rate": 4.698996712365825e-05, + "loss": 5.2913, + "step": 26536 + }, + { + "epoch": 0.15782305642782377, + "grad_norm": 2.3408188819885254, + "learning_rate": 4.6989744913356185e-05, + "loss": 4.774, + "step": 26537 + }, + { + "epoch": 0.15782900371110475, + "grad_norm": 1.500992774963379, + "learning_rate": 4.698952269537773e-05, + "loss": 5.5717, + "step": 26538 + }, + { + "epoch": 0.15783495099438577, + "grad_norm": 1.393517017364502, + "learning_rate": 4.6989300469722955e-05, + "loss": 6.1478, + "step": 26539 + }, + { + "epoch": 0.15784089827766676, + "grad_norm": 1.6048024892807007, + "learning_rate": 4.698907823639195e-05, + "loss": 5.5076, + "step": 26540 + }, + { + "epoch": 0.15784684556094775, + "grad_norm": 1.7231130599975586, + "learning_rate": 4.698885599538478e-05, + "loss": 5.1799, + "step": 26541 + }, + { + "epoch": 0.15785279284422876, + "grad_norm": 1.4809112548828125, + "learning_rate": 4.6988633746701525e-05, + "loss": 5.146, + "step": 26542 + }, + { + "epoch": 0.15785874012750975, + "grad_norm": 1.6530802249908447, + "learning_rate": 4.6988411490342266e-05, + "loss": 5.3245, + "step": 26543 + }, + { + "epoch": 0.15786468741079074, + "grad_norm": 1.5264098644256592, + "learning_rate": 4.6988189226307087e-05, + "loss": 5.3715, + "step": 26544 + }, + { + "epoch": 0.15787063469407175, + "grad_norm": 1.3241318464279175, + "learning_rate": 4.6987966954596054e-05, + "loss": 5.387, + "step": 26545 + }, + { + "epoch": 0.15787658197735274, + "grad_norm": 1.6130857467651367, + "learning_rate": 4.698774467520924e-05, + "loss": 5.2902, + "step": 26546 + }, + { + "epoch": 0.15788252926063373, + "grad_norm": 1.4999042749404907, + "learning_rate": 4.698752238814674e-05, + "loss": 5.2129, + "step": 26547 + }, + { + "epoch": 0.15788847654391475, + "grad_norm": 1.4773963689804077, + "learning_rate": 4.698730009340863e-05, + "loss": 5.7722, + "step": 26548 + }, + { + "epoch": 0.15789442382719573, + "grad_norm": 1.666413426399231, + "learning_rate": 4.698707779099497e-05, + "loss": 5.7418, + "step": 26549 + }, + { + "epoch": 0.15790037111047672, + "grad_norm": 1.4869890213012695, + "learning_rate": 4.698685548090585e-05, + "loss": 4.8418, + "step": 26550 + }, + { + "epoch": 0.15790631839375774, + "grad_norm": 1.6295100450515747, + "learning_rate": 4.698663316314135e-05, + "loss": 4.7722, + "step": 26551 + }, + { + "epoch": 0.15791226567703873, + "grad_norm": 1.5449434518814087, + "learning_rate": 4.698641083770154e-05, + "loss": 5.0621, + "step": 26552 + }, + { + "epoch": 0.15791821296031971, + "grad_norm": 1.6735725402832031, + "learning_rate": 4.6986188504586507e-05, + "loss": 5.5605, + "step": 26553 + }, + { + "epoch": 0.15792416024360073, + "grad_norm": 1.6270878314971924, + "learning_rate": 4.698596616379631e-05, + "loss": 5.279, + "step": 26554 + }, + { + "epoch": 0.15793010752688172, + "grad_norm": 1.6335285902023315, + "learning_rate": 4.698574381533105e-05, + "loss": 5.398, + "step": 26555 + }, + { + "epoch": 0.1579360548101627, + "grad_norm": 2.2176520824432373, + "learning_rate": 4.698552145919079e-05, + "loss": 4.9806, + "step": 26556 + }, + { + "epoch": 0.15794200209344372, + "grad_norm": 1.8645645380020142, + "learning_rate": 4.6985299095375615e-05, + "loss": 5.2633, + "step": 26557 + }, + { + "epoch": 0.1579479493767247, + "grad_norm": 1.708526372909546, + "learning_rate": 4.698507672388559e-05, + "loss": 5.0308, + "step": 26558 + }, + { + "epoch": 0.1579538966600057, + "grad_norm": 2.148980140686035, + "learning_rate": 4.698485434472081e-05, + "loss": 4.5213, + "step": 26559 + }, + { + "epoch": 0.15795984394328672, + "grad_norm": 2.402442693710327, + "learning_rate": 4.6984631957881346e-05, + "loss": 4.4377, + "step": 26560 + }, + { + "epoch": 0.1579657912265677, + "grad_norm": 2.298003911972046, + "learning_rate": 4.698440956336727e-05, + "loss": 4.5809, + "step": 26561 + }, + { + "epoch": 0.1579717385098487, + "grad_norm": 2.53639554977417, + "learning_rate": 4.698418716117867e-05, + "loss": 4.1869, + "step": 26562 + }, + { + "epoch": 0.1579776857931297, + "grad_norm": 2.0686380863189697, + "learning_rate": 4.698396475131561e-05, + "loss": 4.413, + "step": 26563 + }, + { + "epoch": 0.1579836330764107, + "grad_norm": 1.8968595266342163, + "learning_rate": 4.698374233377818e-05, + "loss": 4.9939, + "step": 26564 + }, + { + "epoch": 0.15798958035969168, + "grad_norm": 1.8896044492721558, + "learning_rate": 4.698351990856645e-05, + "loss": 4.6383, + "step": 26565 + }, + { + "epoch": 0.1579955276429727, + "grad_norm": 1.7179672718048096, + "learning_rate": 4.6983297475680496e-05, + "loss": 5.5635, + "step": 26566 + }, + { + "epoch": 0.1580014749262537, + "grad_norm": 1.6506478786468506, + "learning_rate": 4.6983075035120404e-05, + "loss": 5.1821, + "step": 26567 + }, + { + "epoch": 0.15800742220953468, + "grad_norm": 2.180238723754883, + "learning_rate": 4.698285258688625e-05, + "loss": 4.1298, + "step": 26568 + }, + { + "epoch": 0.1580133694928157, + "grad_norm": 2.208676338195801, + "learning_rate": 4.698263013097811e-05, + "loss": 4.3238, + "step": 26569 + }, + { + "epoch": 0.15801931677609668, + "grad_norm": 1.694823145866394, + "learning_rate": 4.6982407667396055e-05, + "loss": 5.3418, + "step": 26570 + }, + { + "epoch": 0.15802526405937767, + "grad_norm": 1.7310692071914673, + "learning_rate": 4.6982185196140174e-05, + "loss": 5.4066, + "step": 26571 + }, + { + "epoch": 0.15803121134265868, + "grad_norm": 2.302055597305298, + "learning_rate": 4.698196271721054e-05, + "loss": 4.1817, + "step": 26572 + }, + { + "epoch": 0.15803715862593967, + "grad_norm": 1.872363567352295, + "learning_rate": 4.698174023060722e-05, + "loss": 4.6733, + "step": 26573 + }, + { + "epoch": 0.15804310590922066, + "grad_norm": 2.134537696838379, + "learning_rate": 4.698151773633032e-05, + "loss": 4.3211, + "step": 26574 + }, + { + "epoch": 0.15804905319250168, + "grad_norm": 2.4381020069122314, + "learning_rate": 4.698129523437989e-05, + "loss": 4.2212, + "step": 26575 + }, + { + "epoch": 0.15805500047578266, + "grad_norm": 1.6739851236343384, + "learning_rate": 4.6981072724756e-05, + "loss": 5.3057, + "step": 26576 + }, + { + "epoch": 0.15806094775906365, + "grad_norm": 1.8092267513275146, + "learning_rate": 4.6980850207458765e-05, + "loss": 4.7359, + "step": 26577 + }, + { + "epoch": 0.15806689504234467, + "grad_norm": 1.6420230865478516, + "learning_rate": 4.6980627682488235e-05, + "loss": 5.086, + "step": 26578 + }, + { + "epoch": 0.15807284232562566, + "grad_norm": 1.8741960525512695, + "learning_rate": 4.6980405149844494e-05, + "loss": 4.7842, + "step": 26579 + }, + { + "epoch": 0.15807878960890664, + "grad_norm": 2.6539900302886963, + "learning_rate": 4.698018260952763e-05, + "loss": 3.809, + "step": 26580 + }, + { + "epoch": 0.15808473689218766, + "grad_norm": 1.8262064456939697, + "learning_rate": 4.69799600615377e-05, + "loss": 4.8959, + "step": 26581 + }, + { + "epoch": 0.15809068417546865, + "grad_norm": 1.7090948820114136, + "learning_rate": 4.6979737505874796e-05, + "loss": 4.7723, + "step": 26582 + }, + { + "epoch": 0.15809663145874964, + "grad_norm": 1.5634857416152954, + "learning_rate": 4.6979514942539e-05, + "loss": 4.7533, + "step": 26583 + }, + { + "epoch": 0.15810257874203065, + "grad_norm": 1.6470197439193726, + "learning_rate": 4.697929237153037e-05, + "loss": 5.2194, + "step": 26584 + }, + { + "epoch": 0.15810852602531164, + "grad_norm": 2.060804605484009, + "learning_rate": 4.697906979284901e-05, + "loss": 4.3637, + "step": 26585 + }, + { + "epoch": 0.15811447330859263, + "grad_norm": 2.065943717956543, + "learning_rate": 4.697884720649498e-05, + "loss": 4.8908, + "step": 26586 + }, + { + "epoch": 0.15812042059187364, + "grad_norm": 1.5104914903640747, + "learning_rate": 4.697862461246836e-05, + "loss": 5.7029, + "step": 26587 + }, + { + "epoch": 0.15812636787515463, + "grad_norm": 1.593296766281128, + "learning_rate": 4.697840201076922e-05, + "loss": 5.7005, + "step": 26588 + }, + { + "epoch": 0.15813231515843562, + "grad_norm": 1.6516765356063843, + "learning_rate": 4.697817940139766e-05, + "loss": 5.3843, + "step": 26589 + }, + { + "epoch": 0.15813826244171664, + "grad_norm": 1.3671473264694214, + "learning_rate": 4.697795678435374e-05, + "loss": 5.4862, + "step": 26590 + }, + { + "epoch": 0.15814420972499763, + "grad_norm": 1.4163672924041748, + "learning_rate": 4.697773415963754e-05, + "loss": 5.4793, + "step": 26591 + }, + { + "epoch": 0.1581501570082786, + "grad_norm": 1.5477086305618286, + "learning_rate": 4.697751152724914e-05, + "loss": 5.2835, + "step": 26592 + }, + { + "epoch": 0.15815610429155963, + "grad_norm": 1.6029425859451294, + "learning_rate": 4.697728888718862e-05, + "loss": 5.3689, + "step": 26593 + }, + { + "epoch": 0.15816205157484062, + "grad_norm": 1.5130633115768433, + "learning_rate": 4.697706623945605e-05, + "loss": 6.1627, + "step": 26594 + }, + { + "epoch": 0.1581679988581216, + "grad_norm": 1.5171791315078735, + "learning_rate": 4.697684358405152e-05, + "loss": 4.9849, + "step": 26595 + }, + { + "epoch": 0.1581739461414026, + "grad_norm": 1.449781894683838, + "learning_rate": 4.69766209209751e-05, + "loss": 5.5273, + "step": 26596 + }, + { + "epoch": 0.1581798934246836, + "grad_norm": 1.430094838142395, + "learning_rate": 4.697639825022687e-05, + "loss": 5.6825, + "step": 26597 + }, + { + "epoch": 0.1581858407079646, + "grad_norm": 1.2635716199874878, + "learning_rate": 4.69761755718069e-05, + "loss": 5.2177, + "step": 26598 + }, + { + "epoch": 0.15819178799124559, + "grad_norm": 2.20355224609375, + "learning_rate": 4.697595288571528e-05, + "loss": 4.6664, + "step": 26599 + }, + { + "epoch": 0.1581977352745266, + "grad_norm": 1.586509108543396, + "learning_rate": 4.6975730191952086e-05, + "loss": 5.056, + "step": 26600 + }, + { + "epoch": 0.1582036825578076, + "grad_norm": 1.4773000478744507, + "learning_rate": 4.697550749051738e-05, + "loss": 5.2931, + "step": 26601 + }, + { + "epoch": 0.15820962984108858, + "grad_norm": 1.4557143449783325, + "learning_rate": 4.697528478141125e-05, + "loss": 4.9378, + "step": 26602 + }, + { + "epoch": 0.1582155771243696, + "grad_norm": 1.5859819650650024, + "learning_rate": 4.697506206463379e-05, + "loss": 5.1998, + "step": 26603 + }, + { + "epoch": 0.15822152440765058, + "grad_norm": 1.5068250894546509, + "learning_rate": 4.697483934018505e-05, + "loss": 5.2748, + "step": 26604 + }, + { + "epoch": 0.15822747169093157, + "grad_norm": 1.5842232704162598, + "learning_rate": 4.697461660806513e-05, + "loss": 5.326, + "step": 26605 + }, + { + "epoch": 0.1582334189742126, + "grad_norm": 1.5164762735366821, + "learning_rate": 4.697439386827409e-05, + "loss": 5.2282, + "step": 26606 + }, + { + "epoch": 0.15823936625749357, + "grad_norm": 1.5359309911727905, + "learning_rate": 4.697417112081203e-05, + "loss": 5.3723, + "step": 26607 + }, + { + "epoch": 0.15824531354077456, + "grad_norm": 1.560502529144287, + "learning_rate": 4.6973948365678996e-05, + "loss": 5.0822, + "step": 26608 + }, + { + "epoch": 0.15825126082405558, + "grad_norm": 1.5915874242782593, + "learning_rate": 4.69737256028751e-05, + "loss": 5.2849, + "step": 26609 + }, + { + "epoch": 0.15825720810733657, + "grad_norm": 1.613585352897644, + "learning_rate": 4.697350283240039e-05, + "loss": 5.1898, + "step": 26610 + }, + { + "epoch": 0.15826315539061755, + "grad_norm": 1.5696673393249512, + "learning_rate": 4.6973280054254966e-05, + "loss": 5.2518, + "step": 26611 + }, + { + "epoch": 0.15826910267389857, + "grad_norm": 1.2109240293502808, + "learning_rate": 4.697305726843889e-05, + "loss": 5.4032, + "step": 26612 + }, + { + "epoch": 0.15827504995717956, + "grad_norm": 1.47042715549469, + "learning_rate": 4.697283447495225e-05, + "loss": 5.1456, + "step": 26613 + }, + { + "epoch": 0.15828099724046055, + "grad_norm": 1.3937478065490723, + "learning_rate": 4.697261167379512e-05, + "loss": 5.3592, + "step": 26614 + }, + { + "epoch": 0.15828694452374156, + "grad_norm": 1.6204369068145752, + "learning_rate": 4.6972388864967574e-05, + "loss": 5.2882, + "step": 26615 + }, + { + "epoch": 0.15829289180702255, + "grad_norm": 1.654252290725708, + "learning_rate": 4.69721660484697e-05, + "loss": 5.2655, + "step": 26616 + }, + { + "epoch": 0.15829883909030354, + "grad_norm": 1.583075761795044, + "learning_rate": 4.6971943224301576e-05, + "loss": 5.097, + "step": 26617 + }, + { + "epoch": 0.15830478637358456, + "grad_norm": 1.3745534420013428, + "learning_rate": 4.697172039246326e-05, + "loss": 5.1911, + "step": 26618 + }, + { + "epoch": 0.15831073365686554, + "grad_norm": 1.662632703781128, + "learning_rate": 4.697149755295485e-05, + "loss": 4.9032, + "step": 26619 + }, + { + "epoch": 0.15831668094014653, + "grad_norm": 1.3548792600631714, + "learning_rate": 4.697127470577642e-05, + "loss": 5.3656, + "step": 26620 + }, + { + "epoch": 0.15832262822342755, + "grad_norm": 1.2697865962982178, + "learning_rate": 4.697105185092804e-05, + "loss": 5.2743, + "step": 26621 + }, + { + "epoch": 0.15832857550670854, + "grad_norm": 1.424477458000183, + "learning_rate": 4.69708289884098e-05, + "loss": 5.1278, + "step": 26622 + }, + { + "epoch": 0.15833452278998952, + "grad_norm": 1.5525426864624023, + "learning_rate": 4.697060611822176e-05, + "loss": 5.2804, + "step": 26623 + }, + { + "epoch": 0.15834047007327054, + "grad_norm": 1.5966732501983643, + "learning_rate": 4.697038324036401e-05, + "loss": 5.3546, + "step": 26624 + }, + { + "epoch": 0.15834641735655153, + "grad_norm": 1.4296703338623047, + "learning_rate": 4.6970160354836634e-05, + "loss": 5.1681, + "step": 26625 + }, + { + "epoch": 0.15835236463983252, + "grad_norm": 1.5928189754486084, + "learning_rate": 4.69699374616397e-05, + "loss": 5.2565, + "step": 26626 + }, + { + "epoch": 0.15835831192311353, + "grad_norm": 1.437814712524414, + "learning_rate": 4.696971456077328e-05, + "loss": 5.1813, + "step": 26627 + }, + { + "epoch": 0.15836425920639452, + "grad_norm": 1.4782744646072388, + "learning_rate": 4.696949165223747e-05, + "loss": 5.365, + "step": 26628 + }, + { + "epoch": 0.1583702064896755, + "grad_norm": 1.5123037099838257, + "learning_rate": 4.696926873603233e-05, + "loss": 5.255, + "step": 26629 + }, + { + "epoch": 0.15837615377295652, + "grad_norm": 1.4208122491836548, + "learning_rate": 4.696904581215795e-05, + "loss": 5.0531, + "step": 26630 + }, + { + "epoch": 0.1583821010562375, + "grad_norm": 1.4333672523498535, + "learning_rate": 4.69688228806144e-05, + "loss": 5.1035, + "step": 26631 + }, + { + "epoch": 0.1583880483395185, + "grad_norm": 1.3645392656326294, + "learning_rate": 4.696859994140176e-05, + "loss": 5.0107, + "step": 26632 + }, + { + "epoch": 0.15839399562279952, + "grad_norm": 1.6100040674209595, + "learning_rate": 4.6968376994520116e-05, + "loss": 5.054, + "step": 26633 + }, + { + "epoch": 0.1583999429060805, + "grad_norm": 1.431036353111267, + "learning_rate": 4.696815403996953e-05, + "loss": 5.4406, + "step": 26634 + }, + { + "epoch": 0.1584058901893615, + "grad_norm": 1.6785353422164917, + "learning_rate": 4.6967931077750096e-05, + "loss": 5.7861, + "step": 26635 + }, + { + "epoch": 0.1584118374726425, + "grad_norm": 1.549333095550537, + "learning_rate": 4.6967708107861876e-05, + "loss": 5.6662, + "step": 26636 + }, + { + "epoch": 0.1584177847559235, + "grad_norm": 1.5669690370559692, + "learning_rate": 4.696748513030496e-05, + "loss": 5.3213, + "step": 26637 + }, + { + "epoch": 0.15842373203920448, + "grad_norm": 1.6420881748199463, + "learning_rate": 4.696726214507942e-05, + "loss": 5.2381, + "step": 26638 + }, + { + "epoch": 0.1584296793224855, + "grad_norm": 1.811171293258667, + "learning_rate": 4.6967039152185345e-05, + "loss": 5.3656, + "step": 26639 + }, + { + "epoch": 0.1584356266057665, + "grad_norm": 1.7578849792480469, + "learning_rate": 4.696681615162279e-05, + "loss": 4.8774, + "step": 26640 + }, + { + "epoch": 0.15844157388904748, + "grad_norm": 2.0880799293518066, + "learning_rate": 4.696659314339185e-05, + "loss": 4.945, + "step": 26641 + }, + { + "epoch": 0.1584475211723285, + "grad_norm": 1.4735814332962036, + "learning_rate": 4.6966370127492603e-05, + "loss": 5.5415, + "step": 26642 + }, + { + "epoch": 0.15845346845560948, + "grad_norm": 1.7141392230987549, + "learning_rate": 4.696614710392512e-05, + "loss": 4.8197, + "step": 26643 + }, + { + "epoch": 0.15845941573889047, + "grad_norm": 1.9631140232086182, + "learning_rate": 4.696592407268949e-05, + "loss": 5.0147, + "step": 26644 + }, + { + "epoch": 0.15846536302217148, + "grad_norm": 2.1569128036499023, + "learning_rate": 4.696570103378577e-05, + "loss": 4.8175, + "step": 26645 + }, + { + "epoch": 0.15847131030545247, + "grad_norm": 2.07602596282959, + "learning_rate": 4.696547798721406e-05, + "loss": 5.0289, + "step": 26646 + }, + { + "epoch": 0.15847725758873346, + "grad_norm": 1.5406705141067505, + "learning_rate": 4.696525493297443e-05, + "loss": 5.1569, + "step": 26647 + }, + { + "epoch": 0.15848320487201448, + "grad_norm": 1.630928635597229, + "learning_rate": 4.696503187106695e-05, + "loss": 5.4698, + "step": 26648 + }, + { + "epoch": 0.15848915215529547, + "grad_norm": 1.5992403030395508, + "learning_rate": 4.696480880149171e-05, + "loss": 5.296, + "step": 26649 + }, + { + "epoch": 0.15849509943857645, + "grad_norm": 1.8908748626708984, + "learning_rate": 4.696458572424878e-05, + "loss": 5.0706, + "step": 26650 + }, + { + "epoch": 0.15850104672185747, + "grad_norm": 2.149810552597046, + "learning_rate": 4.6964362639338236e-05, + "loss": 4.8138, + "step": 26651 + }, + { + "epoch": 0.15850699400513846, + "grad_norm": 2.049520254135132, + "learning_rate": 4.696413954676016e-05, + "loss": 4.9173, + "step": 26652 + }, + { + "epoch": 0.15851294128841945, + "grad_norm": 2.03076434135437, + "learning_rate": 4.6963916446514634e-05, + "loss": 4.889, + "step": 26653 + }, + { + "epoch": 0.15851888857170043, + "grad_norm": 1.8261823654174805, + "learning_rate": 4.696369333860173e-05, + "loss": 4.7856, + "step": 26654 + }, + { + "epoch": 0.15852483585498145, + "grad_norm": 1.864707112312317, + "learning_rate": 4.6963470223021535e-05, + "loss": 4.8419, + "step": 26655 + }, + { + "epoch": 0.15853078313826244, + "grad_norm": 1.9796535968780518, + "learning_rate": 4.696324709977411e-05, + "loss": 4.7506, + "step": 26656 + }, + { + "epoch": 0.15853673042154343, + "grad_norm": 1.9936281442642212, + "learning_rate": 4.696302396885954e-05, + "loss": 4.8773, + "step": 26657 + }, + { + "epoch": 0.15854267770482444, + "grad_norm": 1.790238618850708, + "learning_rate": 4.696280083027791e-05, + "loss": 4.7998, + "step": 26658 + }, + { + "epoch": 0.15854862498810543, + "grad_norm": 1.9320149421691895, + "learning_rate": 4.69625776840293e-05, + "loss": 4.7113, + "step": 26659 + }, + { + "epoch": 0.15855457227138642, + "grad_norm": 1.6032037734985352, + "learning_rate": 4.696235453011377e-05, + "loss": 5.2848, + "step": 26660 + }, + { + "epoch": 0.15856051955466743, + "grad_norm": 1.8947795629501343, + "learning_rate": 4.696213136853141e-05, + "loss": 4.7212, + "step": 26661 + }, + { + "epoch": 0.15856646683794842, + "grad_norm": 2.017988681793213, + "learning_rate": 4.69619081992823e-05, + "loss": 4.8043, + "step": 26662 + }, + { + "epoch": 0.1585724141212294, + "grad_norm": 2.114877223968506, + "learning_rate": 4.696168502236652e-05, + "loss": 4.6626, + "step": 26663 + }, + { + "epoch": 0.15857836140451043, + "grad_norm": 2.029026985168457, + "learning_rate": 4.6961461837784134e-05, + "loss": 4.5315, + "step": 26664 + }, + { + "epoch": 0.15858430868779141, + "grad_norm": 2.052255630493164, + "learning_rate": 4.696123864553523e-05, + "loss": 4.6957, + "step": 26665 + }, + { + "epoch": 0.1585902559710724, + "grad_norm": 1.9599274396896362, + "learning_rate": 4.696101544561989e-05, + "loss": 4.792, + "step": 26666 + }, + { + "epoch": 0.15859620325435342, + "grad_norm": 2.1609420776367188, + "learning_rate": 4.6960792238038184e-05, + "loss": 4.8083, + "step": 26667 + }, + { + "epoch": 0.1586021505376344, + "grad_norm": 2.0834262371063232, + "learning_rate": 4.696056902279019e-05, + "loss": 4.7683, + "step": 26668 + }, + { + "epoch": 0.1586080978209154, + "grad_norm": 2.0544068813323975, + "learning_rate": 4.6960345799875995e-05, + "loss": 4.7, + "step": 26669 + }, + { + "epoch": 0.1586140451041964, + "grad_norm": 2.036548137664795, + "learning_rate": 4.696012256929566e-05, + "loss": 4.5653, + "step": 26670 + }, + { + "epoch": 0.1586199923874774, + "grad_norm": 1.7801802158355713, + "learning_rate": 4.6959899331049276e-05, + "loss": 4.7215, + "step": 26671 + }, + { + "epoch": 0.1586259396707584, + "grad_norm": 2.0025057792663574, + "learning_rate": 4.695967608513692e-05, + "loss": 4.6259, + "step": 26672 + }, + { + "epoch": 0.1586318869540394, + "grad_norm": 2.0719566345214844, + "learning_rate": 4.695945283155867e-05, + "loss": 4.7383, + "step": 26673 + }, + { + "epoch": 0.1586378342373204, + "grad_norm": 2.0565052032470703, + "learning_rate": 4.69592295703146e-05, + "loss": 5.2066, + "step": 26674 + }, + { + "epoch": 0.15864378152060138, + "grad_norm": 1.7758921384811401, + "learning_rate": 4.695900630140479e-05, + "loss": 5.5619, + "step": 26675 + }, + { + "epoch": 0.1586497288038824, + "grad_norm": 1.799654483795166, + "learning_rate": 4.695878302482931e-05, + "loss": 5.0901, + "step": 26676 + }, + { + "epoch": 0.15865567608716338, + "grad_norm": 1.785900592803955, + "learning_rate": 4.695855974058826e-05, + "loss": 4.9323, + "step": 26677 + }, + { + "epoch": 0.15866162337044437, + "grad_norm": 1.9525444507598877, + "learning_rate": 4.695833644868169e-05, + "loss": 4.7603, + "step": 26678 + }, + { + "epoch": 0.1586675706537254, + "grad_norm": 1.9197458028793335, + "learning_rate": 4.69581131491097e-05, + "loss": 4.636, + "step": 26679 + }, + { + "epoch": 0.15867351793700638, + "grad_norm": 2.3043594360351562, + "learning_rate": 4.695788984187236e-05, + "loss": 4.4529, + "step": 26680 + }, + { + "epoch": 0.15867946522028736, + "grad_norm": 1.687930703163147, + "learning_rate": 4.6957666526969744e-05, + "loss": 4.9549, + "step": 26681 + }, + { + "epoch": 0.15868541250356838, + "grad_norm": 1.5754574537277222, + "learning_rate": 4.6957443204401935e-05, + "loss": 5.4364, + "step": 26682 + }, + { + "epoch": 0.15869135978684937, + "grad_norm": 1.5300992727279663, + "learning_rate": 4.6957219874169013e-05, + "loss": 5.3151, + "step": 26683 + }, + { + "epoch": 0.15869730707013036, + "grad_norm": 1.7758506536483765, + "learning_rate": 4.695699653627105e-05, + "loss": 5.2053, + "step": 26684 + }, + { + "epoch": 0.15870325435341137, + "grad_norm": 1.5882158279418945, + "learning_rate": 4.6956773190708116e-05, + "loss": 4.8202, + "step": 26685 + }, + { + "epoch": 0.15870920163669236, + "grad_norm": 1.5649267435073853, + "learning_rate": 4.695654983748031e-05, + "loss": 4.3946, + "step": 26686 + }, + { + "epoch": 0.15871514891997335, + "grad_norm": 1.5999925136566162, + "learning_rate": 4.6956326476587696e-05, + "loss": 4.3512, + "step": 26687 + }, + { + "epoch": 0.15872109620325436, + "grad_norm": 1.699987530708313, + "learning_rate": 4.6956103108030356e-05, + "loss": 4.7479, + "step": 26688 + }, + { + "epoch": 0.15872704348653535, + "grad_norm": 1.4755208492279053, + "learning_rate": 4.695587973180837e-05, + "loss": 5.1206, + "step": 26689 + }, + { + "epoch": 0.15873299076981634, + "grad_norm": 1.7642509937286377, + "learning_rate": 4.6955656347921813e-05, + "loss": 5.3179, + "step": 26690 + }, + { + "epoch": 0.15873893805309736, + "grad_norm": 1.5511635541915894, + "learning_rate": 4.695543295637076e-05, + "loss": 4.4365, + "step": 26691 + }, + { + "epoch": 0.15874488533637834, + "grad_norm": 1.5347273349761963, + "learning_rate": 4.6955209557155286e-05, + "loss": 4.368, + "step": 26692 + }, + { + "epoch": 0.15875083261965933, + "grad_norm": 1.5347685813903809, + "learning_rate": 4.695498615027549e-05, + "loss": 4.2812, + "step": 26693 + }, + { + "epoch": 0.15875677990294035, + "grad_norm": 1.5469902753829956, + "learning_rate": 4.6954762735731425e-05, + "loss": 4.4445, + "step": 26694 + }, + { + "epoch": 0.15876272718622134, + "grad_norm": 1.4887003898620605, + "learning_rate": 4.695453931352318e-05, + "loss": 4.3584, + "step": 26695 + }, + { + "epoch": 0.15876867446950232, + "grad_norm": 1.5207375288009644, + "learning_rate": 4.695431588365084e-05, + "loss": 4.7219, + "step": 26696 + }, + { + "epoch": 0.15877462175278334, + "grad_norm": 1.7801141738891602, + "learning_rate": 4.695409244611447e-05, + "loss": 5.0328, + "step": 26697 + }, + { + "epoch": 0.15878056903606433, + "grad_norm": 1.8171552419662476, + "learning_rate": 4.695386900091415e-05, + "loss": 5.4286, + "step": 26698 + }, + { + "epoch": 0.15878651631934532, + "grad_norm": 1.698379397392273, + "learning_rate": 4.695364554804996e-05, + "loss": 4.7824, + "step": 26699 + }, + { + "epoch": 0.15879246360262633, + "grad_norm": 1.5824103355407715, + "learning_rate": 4.695342208752199e-05, + "loss": 4.2949, + "step": 26700 + }, + { + "epoch": 0.15879841088590732, + "grad_norm": 1.5890088081359863, + "learning_rate": 4.6953198619330295e-05, + "loss": 4.3689, + "step": 26701 + }, + { + "epoch": 0.1588043581691883, + "grad_norm": 1.6158654689788818, + "learning_rate": 4.6952975143474975e-05, + "loss": 4.7294, + "step": 26702 + }, + { + "epoch": 0.15881030545246932, + "grad_norm": 1.5613304376602173, + "learning_rate": 4.695275165995609e-05, + "loss": 5.4067, + "step": 26703 + }, + { + "epoch": 0.1588162527357503, + "grad_norm": 1.5085046291351318, + "learning_rate": 4.695252816877373e-05, + "loss": 5.4355, + "step": 26704 + }, + { + "epoch": 0.1588222000190313, + "grad_norm": 1.6180028915405273, + "learning_rate": 4.695230466992797e-05, + "loss": 5.0541, + "step": 26705 + }, + { + "epoch": 0.15882814730231232, + "grad_norm": 1.8564766645431519, + "learning_rate": 4.695208116341888e-05, + "loss": 5.3307, + "step": 26706 + }, + { + "epoch": 0.1588340945855933, + "grad_norm": 1.762041449546814, + "learning_rate": 4.6951857649246555e-05, + "loss": 5.2526, + "step": 26707 + }, + { + "epoch": 0.1588400418688743, + "grad_norm": 1.5610746145248413, + "learning_rate": 4.695163412741106e-05, + "loss": 5.0561, + "step": 26708 + }, + { + "epoch": 0.1588459891521553, + "grad_norm": 1.6463086605072021, + "learning_rate": 4.695141059791247e-05, + "loss": 4.9357, + "step": 26709 + }, + { + "epoch": 0.1588519364354363, + "grad_norm": 1.794967532157898, + "learning_rate": 4.695118706075088e-05, + "loss": 5.4786, + "step": 26710 + }, + { + "epoch": 0.15885788371871729, + "grad_norm": 1.6720161437988281, + "learning_rate": 4.695096351592635e-05, + "loss": 5.4702, + "step": 26711 + }, + { + "epoch": 0.15886383100199827, + "grad_norm": 1.6844518184661865, + "learning_rate": 4.6950739963438975e-05, + "loss": 5.2407, + "step": 26712 + }, + { + "epoch": 0.1588697782852793, + "grad_norm": 1.7027579545974731, + "learning_rate": 4.695051640328881e-05, + "loss": 5.1027, + "step": 26713 + }, + { + "epoch": 0.15887572556856028, + "grad_norm": 2.385157823562622, + "learning_rate": 4.695029283547595e-05, + "loss": 4.6013, + "step": 26714 + }, + { + "epoch": 0.15888167285184127, + "grad_norm": 1.7393914461135864, + "learning_rate": 4.6950069260000475e-05, + "loss": 4.9536, + "step": 26715 + }, + { + "epoch": 0.15888762013512228, + "grad_norm": 1.5079336166381836, + "learning_rate": 4.694984567686246e-05, + "loss": 4.9043, + "step": 26716 + }, + { + "epoch": 0.15889356741840327, + "grad_norm": 1.3347656726837158, + "learning_rate": 4.694962208606197e-05, + "loss": 5.6922, + "step": 26717 + }, + { + "epoch": 0.15889951470168426, + "grad_norm": 1.8166699409484863, + "learning_rate": 4.6949398487599106e-05, + "loss": 5.3646, + "step": 26718 + }, + { + "epoch": 0.15890546198496527, + "grad_norm": 1.7105693817138672, + "learning_rate": 4.694917488147392e-05, + "loss": 5.3915, + "step": 26719 + }, + { + "epoch": 0.15891140926824626, + "grad_norm": 2.5474836826324463, + "learning_rate": 4.6948951267686514e-05, + "loss": 4.7226, + "step": 26720 + }, + { + "epoch": 0.15891735655152725, + "grad_norm": 2.544551372528076, + "learning_rate": 4.694872764623696e-05, + "loss": 4.5184, + "step": 26721 + }, + { + "epoch": 0.15892330383480827, + "grad_norm": 1.6088052988052368, + "learning_rate": 4.6948504017125316e-05, + "loss": 4.913, + "step": 26722 + }, + { + "epoch": 0.15892925111808925, + "grad_norm": 2.0992431640625, + "learning_rate": 4.6948280380351686e-05, + "loss": 4.7943, + "step": 26723 + }, + { + "epoch": 0.15893519840137024, + "grad_norm": 2.4200751781463623, + "learning_rate": 4.6948056735916135e-05, + "loss": 4.6055, + "step": 26724 + }, + { + "epoch": 0.15894114568465126, + "grad_norm": 1.802924633026123, + "learning_rate": 4.694783308381875e-05, + "loss": 4.2752, + "step": 26725 + }, + { + "epoch": 0.15894709296793225, + "grad_norm": 1.8056386709213257, + "learning_rate": 4.694760942405959e-05, + "loss": 5.0256, + "step": 26726 + }, + { + "epoch": 0.15895304025121323, + "grad_norm": 1.5216751098632812, + "learning_rate": 4.694738575663876e-05, + "loss": 4.9218, + "step": 26727 + }, + { + "epoch": 0.15895898753449425, + "grad_norm": 2.3782224655151367, + "learning_rate": 4.694716208155632e-05, + "loss": 4.7504, + "step": 26728 + }, + { + "epoch": 0.15896493481777524, + "grad_norm": 2.0227694511413574, + "learning_rate": 4.694693839881236e-05, + "loss": 4.5376, + "step": 26729 + }, + { + "epoch": 0.15897088210105623, + "grad_norm": 2.289433240890503, + "learning_rate": 4.694671470840693e-05, + "loss": 4.4428, + "step": 26730 + }, + { + "epoch": 0.15897682938433724, + "grad_norm": 2.2303051948547363, + "learning_rate": 4.694649101034015e-05, + "loss": 4.326, + "step": 26731 + }, + { + "epoch": 0.15898277666761823, + "grad_norm": 1.6835930347442627, + "learning_rate": 4.6946267304612067e-05, + "loss": 4.7231, + "step": 26732 + }, + { + "epoch": 0.15898872395089922, + "grad_norm": 1.6131420135498047, + "learning_rate": 4.694604359122277e-05, + "loss": 5.5532, + "step": 26733 + }, + { + "epoch": 0.15899467123418023, + "grad_norm": 1.4710865020751953, + "learning_rate": 4.6945819870172335e-05, + "loss": 5.3341, + "step": 26734 + }, + { + "epoch": 0.15900061851746122, + "grad_norm": 1.5708924531936646, + "learning_rate": 4.694559614146085e-05, + "loss": 4.9195, + "step": 26735 + }, + { + "epoch": 0.1590065658007422, + "grad_norm": 1.5540367364883423, + "learning_rate": 4.6945372405088374e-05, + "loss": 5.2529, + "step": 26736 + }, + { + "epoch": 0.15901251308402323, + "grad_norm": 1.8328397274017334, + "learning_rate": 4.6945148661054995e-05, + "loss": 5.0446, + "step": 26737 + }, + { + "epoch": 0.15901846036730422, + "grad_norm": 1.9213111400604248, + "learning_rate": 4.694492490936079e-05, + "loss": 4.505, + "step": 26738 + }, + { + "epoch": 0.1590244076505852, + "grad_norm": 1.6417537927627563, + "learning_rate": 4.694470115000584e-05, + "loss": 5.064, + "step": 26739 + }, + { + "epoch": 0.15903035493386622, + "grad_norm": 1.4690046310424805, + "learning_rate": 4.6944477382990224e-05, + "loss": 5.1935, + "step": 26740 + }, + { + "epoch": 0.1590363022171472, + "grad_norm": 1.6286424398422241, + "learning_rate": 4.694425360831402e-05, + "loss": 4.8251, + "step": 26741 + }, + { + "epoch": 0.1590422495004282, + "grad_norm": 1.6581510305404663, + "learning_rate": 4.6944029825977296e-05, + "loss": 4.9166, + "step": 26742 + }, + { + "epoch": 0.1590481967837092, + "grad_norm": 1.4425448179244995, + "learning_rate": 4.694380603598015e-05, + "loss": 4.9857, + "step": 26743 + }, + { + "epoch": 0.1590541440669902, + "grad_norm": 1.6443709135055542, + "learning_rate": 4.694358223832263e-05, + "loss": 4.3642, + "step": 26744 + }, + { + "epoch": 0.1590600913502712, + "grad_norm": 1.8886995315551758, + "learning_rate": 4.6943358433004856e-05, + "loss": 4.2237, + "step": 26745 + }, + { + "epoch": 0.1590660386335522, + "grad_norm": 1.779401421546936, + "learning_rate": 4.6943134620026865e-05, + "loss": 3.8314, + "step": 26746 + }, + { + "epoch": 0.1590719859168332, + "grad_norm": 1.9053362607955933, + "learning_rate": 4.6942910799388755e-05, + "loss": 4.0761, + "step": 26747 + }, + { + "epoch": 0.15907793320011418, + "grad_norm": 1.7256511449813843, + "learning_rate": 4.694268697109061e-05, + "loss": 5.4427, + "step": 26748 + }, + { + "epoch": 0.1590838804833952, + "grad_norm": 1.7450202703475952, + "learning_rate": 4.6942463135132484e-05, + "loss": 4.535, + "step": 26749 + }, + { + "epoch": 0.15908982776667618, + "grad_norm": 1.4825485944747925, + "learning_rate": 4.6942239291514486e-05, + "loss": 4.4373, + "step": 26750 + }, + { + "epoch": 0.15909577504995717, + "grad_norm": 1.5326381921768188, + "learning_rate": 4.6942015440236673e-05, + "loss": 4.3876, + "step": 26751 + }, + { + "epoch": 0.1591017223332382, + "grad_norm": 1.7042746543884277, + "learning_rate": 4.694179158129913e-05, + "loss": 4.6091, + "step": 26752 + }, + { + "epoch": 0.15910766961651918, + "grad_norm": 1.584315299987793, + "learning_rate": 4.6941567714701926e-05, + "loss": 4.5937, + "step": 26753 + }, + { + "epoch": 0.15911361689980016, + "grad_norm": 1.5627310276031494, + "learning_rate": 4.694134384044516e-05, + "loss": 4.719, + "step": 26754 + }, + { + "epoch": 0.15911956418308118, + "grad_norm": 1.726309061050415, + "learning_rate": 4.694111995852889e-05, + "loss": 4.8064, + "step": 26755 + }, + { + "epoch": 0.15912551146636217, + "grad_norm": 1.6186972856521606, + "learning_rate": 4.6940896068953204e-05, + "loss": 5.215, + "step": 26756 + }, + { + "epoch": 0.15913145874964316, + "grad_norm": 1.7018485069274902, + "learning_rate": 4.694067217171818e-05, + "loss": 5.1681, + "step": 26757 + }, + { + "epoch": 0.15913740603292417, + "grad_norm": 1.8986917734146118, + "learning_rate": 4.694044826682389e-05, + "loss": 5.1551, + "step": 26758 + }, + { + "epoch": 0.15914335331620516, + "grad_norm": 1.6398760080337524, + "learning_rate": 4.694022435427042e-05, + "loss": 4.8223, + "step": 26759 + }, + { + "epoch": 0.15914930059948615, + "grad_norm": 1.5714781284332275, + "learning_rate": 4.694000043405784e-05, + "loss": 4.6631, + "step": 26760 + }, + { + "epoch": 0.15915524788276716, + "grad_norm": 1.9300872087478638, + "learning_rate": 4.6939776506186234e-05, + "loss": 4.8107, + "step": 26761 + }, + { + "epoch": 0.15916119516604815, + "grad_norm": 1.8684272766113281, + "learning_rate": 4.6939552570655674e-05, + "loss": 4.9762, + "step": 26762 + }, + { + "epoch": 0.15916714244932914, + "grad_norm": 1.6835062503814697, + "learning_rate": 4.693932862746625e-05, + "loss": 4.8015, + "step": 26763 + }, + { + "epoch": 0.15917308973261016, + "grad_norm": 1.5635250806808472, + "learning_rate": 4.693910467661803e-05, + "loss": 4.7763, + "step": 26764 + }, + { + "epoch": 0.15917903701589114, + "grad_norm": 1.584123134613037, + "learning_rate": 4.69388807181111e-05, + "loss": 4.7093, + "step": 26765 + }, + { + "epoch": 0.15918498429917213, + "grad_norm": 1.597011685371399, + "learning_rate": 4.693865675194553e-05, + "loss": 4.7376, + "step": 26766 + }, + { + "epoch": 0.15919093158245315, + "grad_norm": 1.5018924474716187, + "learning_rate": 4.693843277812141e-05, + "loss": 4.5752, + "step": 26767 + }, + { + "epoch": 0.15919687886573414, + "grad_norm": 1.5398659706115723, + "learning_rate": 4.6938208796638796e-05, + "loss": 4.3835, + "step": 26768 + }, + { + "epoch": 0.15920282614901513, + "grad_norm": 1.753659963607788, + "learning_rate": 4.693798480749778e-05, + "loss": 4.5366, + "step": 26769 + }, + { + "epoch": 0.1592087734322961, + "grad_norm": 1.6807688474655151, + "learning_rate": 4.693776081069845e-05, + "loss": 4.5043, + "step": 26770 + }, + { + "epoch": 0.15921472071557713, + "grad_norm": 1.547088384628296, + "learning_rate": 4.6937536806240865e-05, + "loss": 4.4129, + "step": 26771 + }, + { + "epoch": 0.15922066799885812, + "grad_norm": 1.6225403547286987, + "learning_rate": 4.693731279412512e-05, + "loss": 4.3027, + "step": 26772 + }, + { + "epoch": 0.1592266152821391, + "grad_norm": 1.521183967590332, + "learning_rate": 4.693708877435128e-05, + "loss": 4.3267, + "step": 26773 + }, + { + "epoch": 0.15923256256542012, + "grad_norm": 1.503652572631836, + "learning_rate": 4.693686474691944e-05, + "loss": 4.5069, + "step": 26774 + }, + { + "epoch": 0.1592385098487011, + "grad_norm": 1.3765262365341187, + "learning_rate": 4.693664071182965e-05, + "loss": 4.8385, + "step": 26775 + }, + { + "epoch": 0.1592444571319821, + "grad_norm": 1.552372932434082, + "learning_rate": 4.6936416669082015e-05, + "loss": 4.7109, + "step": 26776 + }, + { + "epoch": 0.1592504044152631, + "grad_norm": 1.5098180770874023, + "learning_rate": 4.693619261867661e-05, + "loss": 4.6682, + "step": 26777 + }, + { + "epoch": 0.1592563516985441, + "grad_norm": 1.7043485641479492, + "learning_rate": 4.69359685606135e-05, + "loss": 4.7291, + "step": 26778 + }, + { + "epoch": 0.1592622989818251, + "grad_norm": 1.342060923576355, + "learning_rate": 4.693574449489277e-05, + "loss": 4.4172, + "step": 26779 + }, + { + "epoch": 0.1592682462651061, + "grad_norm": 1.5385740995407104, + "learning_rate": 4.6935520421514494e-05, + "loss": 4.1767, + "step": 26780 + }, + { + "epoch": 0.1592741935483871, + "grad_norm": 1.3378406763076782, + "learning_rate": 4.6935296340478764e-05, + "loss": 4.419, + "step": 26781 + }, + { + "epoch": 0.15928014083166808, + "grad_norm": 1.5734392404556274, + "learning_rate": 4.693507225178564e-05, + "loss": 4.3342, + "step": 26782 + }, + { + "epoch": 0.1592860881149491, + "grad_norm": 1.9071681499481201, + "learning_rate": 4.6934848155435216e-05, + "loss": 4.4808, + "step": 26783 + }, + { + "epoch": 0.1592920353982301, + "grad_norm": 1.4852991104125977, + "learning_rate": 4.693462405142755e-05, + "loss": 5.2923, + "step": 26784 + }, + { + "epoch": 0.15929798268151107, + "grad_norm": 1.7078371047973633, + "learning_rate": 4.6934399939762746e-05, + "loss": 4.5363, + "step": 26785 + }, + { + "epoch": 0.1593039299647921, + "grad_norm": 1.731362223625183, + "learning_rate": 4.693417582044087e-05, + "loss": 4.3905, + "step": 26786 + }, + { + "epoch": 0.15930987724807308, + "grad_norm": 1.7854750156402588, + "learning_rate": 4.6933951693462e-05, + "loss": 4.6509, + "step": 26787 + }, + { + "epoch": 0.15931582453135407, + "grad_norm": 1.804178237915039, + "learning_rate": 4.69337275588262e-05, + "loss": 4.5157, + "step": 26788 + }, + { + "epoch": 0.15932177181463508, + "grad_norm": 1.9014322757720947, + "learning_rate": 4.693350341653358e-05, + "loss": 4.5673, + "step": 26789 + }, + { + "epoch": 0.15932771909791607, + "grad_norm": 2.1549782752990723, + "learning_rate": 4.693327926658418e-05, + "loss": 4.6754, + "step": 26790 + }, + { + "epoch": 0.15933366638119706, + "grad_norm": 1.9609428644180298, + "learning_rate": 4.693305510897812e-05, + "loss": 4.6832, + "step": 26791 + }, + { + "epoch": 0.15933961366447807, + "grad_norm": 2.0541574954986572, + "learning_rate": 4.693283094371545e-05, + "loss": 4.3928, + "step": 26792 + }, + { + "epoch": 0.15934556094775906, + "grad_norm": 2.151719331741333, + "learning_rate": 4.693260677079625e-05, + "loss": 4.2179, + "step": 26793 + }, + { + "epoch": 0.15935150823104005, + "grad_norm": 1.6300101280212402, + "learning_rate": 4.693238259022062e-05, + "loss": 5.202, + "step": 26794 + }, + { + "epoch": 0.15935745551432107, + "grad_norm": 1.860836148262024, + "learning_rate": 4.69321584019886e-05, + "loss": 4.7327, + "step": 26795 + }, + { + "epoch": 0.15936340279760206, + "grad_norm": 1.7627391815185547, + "learning_rate": 4.6931934206100304e-05, + "loss": 5.0884, + "step": 26796 + }, + { + "epoch": 0.15936935008088304, + "grad_norm": 1.6358652114868164, + "learning_rate": 4.693171000255579e-05, + "loss": 5.1218, + "step": 26797 + }, + { + "epoch": 0.15937529736416406, + "grad_norm": 1.938833475112915, + "learning_rate": 4.693148579135514e-05, + "loss": 5.0097, + "step": 26798 + }, + { + "epoch": 0.15938124464744505, + "grad_norm": 1.6986185312271118, + "learning_rate": 4.6931261572498445e-05, + "loss": 5.0552, + "step": 26799 + }, + { + "epoch": 0.15938719193072604, + "grad_norm": 1.9049108028411865, + "learning_rate": 4.693103734598576e-05, + "loss": 4.5521, + "step": 26800 + }, + { + "epoch": 0.15939313921400705, + "grad_norm": 1.723593831062317, + "learning_rate": 4.693081311181719e-05, + "loss": 4.624, + "step": 26801 + }, + { + "epoch": 0.15939908649728804, + "grad_norm": 1.8977972269058228, + "learning_rate": 4.693058886999279e-05, + "loss": 4.508, + "step": 26802 + }, + { + "epoch": 0.15940503378056903, + "grad_norm": 1.8587881326675415, + "learning_rate": 4.6930364620512656e-05, + "loss": 4.5824, + "step": 26803 + }, + { + "epoch": 0.15941098106385004, + "grad_norm": 2.033412456512451, + "learning_rate": 4.693014036337685e-05, + "loss": 4.2831, + "step": 26804 + }, + { + "epoch": 0.15941692834713103, + "grad_norm": 1.7461220026016235, + "learning_rate": 4.692991609858547e-05, + "loss": 4.3987, + "step": 26805 + }, + { + "epoch": 0.15942287563041202, + "grad_norm": 1.5717246532440186, + "learning_rate": 4.692969182613857e-05, + "loss": 4.4173, + "step": 26806 + }, + { + "epoch": 0.15942882291369304, + "grad_norm": 1.825589656829834, + "learning_rate": 4.692946754603625e-05, + "loss": 4.5616, + "step": 26807 + }, + { + "epoch": 0.15943477019697402, + "grad_norm": 1.5404088497161865, + "learning_rate": 4.6929243258278576e-05, + "loss": 5.393, + "step": 26808 + }, + { + "epoch": 0.159440717480255, + "grad_norm": 2.0158777236938477, + "learning_rate": 4.692901896286563e-05, + "loss": 4.7878, + "step": 26809 + }, + { + "epoch": 0.15944666476353603, + "grad_norm": 2.152909755706787, + "learning_rate": 4.6928794659797494e-05, + "loss": 4.1923, + "step": 26810 + }, + { + "epoch": 0.15945261204681702, + "grad_norm": 2.1839582920074463, + "learning_rate": 4.692857034907423e-05, + "loss": 4.4213, + "step": 26811 + }, + { + "epoch": 0.159458559330098, + "grad_norm": 1.7359018325805664, + "learning_rate": 4.6928346030695934e-05, + "loss": 4.4409, + "step": 26812 + }, + { + "epoch": 0.15946450661337902, + "grad_norm": 1.6525425910949707, + "learning_rate": 4.692812170466269e-05, + "loss": 5.0243, + "step": 26813 + }, + { + "epoch": 0.15947045389666, + "grad_norm": 1.471819519996643, + "learning_rate": 4.692789737097455e-05, + "loss": 5.5855, + "step": 26814 + }, + { + "epoch": 0.159476401179941, + "grad_norm": 1.4903481006622314, + "learning_rate": 4.692767302963162e-05, + "loss": 5.4807, + "step": 26815 + }, + { + "epoch": 0.159482348463222, + "grad_norm": 1.6658556461334229, + "learning_rate": 4.6927448680633954e-05, + "loss": 5.2928, + "step": 26816 + }, + { + "epoch": 0.159488295746503, + "grad_norm": 1.8180750608444214, + "learning_rate": 4.692722432398166e-05, + "loss": 5.0372, + "step": 26817 + }, + { + "epoch": 0.159494243029784, + "grad_norm": 1.4245752096176147, + "learning_rate": 4.692699995967478e-05, + "loss": 4.9285, + "step": 26818 + }, + { + "epoch": 0.159500190313065, + "grad_norm": 1.5879698991775513, + "learning_rate": 4.692677558771342e-05, + "loss": 4.7327, + "step": 26819 + }, + { + "epoch": 0.159506137596346, + "grad_norm": 2.3847367763519287, + "learning_rate": 4.692655120809764e-05, + "loss": 4.0357, + "step": 26820 + }, + { + "epoch": 0.15951208487962698, + "grad_norm": 2.5753002166748047, + "learning_rate": 4.692632682082754e-05, + "loss": 3.9462, + "step": 26821 + }, + { + "epoch": 0.159518032162908, + "grad_norm": 2.6524651050567627, + "learning_rate": 4.6926102425903185e-05, + "loss": 4.1065, + "step": 26822 + }, + { + "epoch": 0.15952397944618898, + "grad_norm": 2.808206558227539, + "learning_rate": 4.692587802332464e-05, + "loss": 4.112, + "step": 26823 + }, + { + "epoch": 0.15952992672946997, + "grad_norm": 1.5214722156524658, + "learning_rate": 4.692565361309201e-05, + "loss": 5.4128, + "step": 26824 + }, + { + "epoch": 0.159535874012751, + "grad_norm": 2.1168901920318604, + "learning_rate": 4.692542919520536e-05, + "loss": 4.1342, + "step": 26825 + }, + { + "epoch": 0.15954182129603198, + "grad_norm": 2.5575170516967773, + "learning_rate": 4.692520476966477e-05, + "loss": 4.0117, + "step": 26826 + }, + { + "epoch": 0.15954776857931297, + "grad_norm": 2.9047164916992188, + "learning_rate": 4.6924980336470314e-05, + "loss": 4.1555, + "step": 26827 + }, + { + "epoch": 0.15955371586259395, + "grad_norm": 2.678936719894409, + "learning_rate": 4.6924755895622076e-05, + "loss": 4.0008, + "step": 26828 + }, + { + "epoch": 0.15955966314587497, + "grad_norm": 2.4771978855133057, + "learning_rate": 4.692453144712014e-05, + "loss": 4.1707, + "step": 26829 + }, + { + "epoch": 0.15956561042915596, + "grad_norm": 2.1536855697631836, + "learning_rate": 4.6924306990964564e-05, + "loss": 4.1883, + "step": 26830 + }, + { + "epoch": 0.15957155771243695, + "grad_norm": 1.8136900663375854, + "learning_rate": 4.692408252715544e-05, + "loss": 4.8374, + "step": 26831 + }, + { + "epoch": 0.15957750499571796, + "grad_norm": 2.4778616428375244, + "learning_rate": 4.692385805569285e-05, + "loss": 3.9603, + "step": 26832 + }, + { + "epoch": 0.15958345227899895, + "grad_norm": 1.9646393060684204, + "learning_rate": 4.692363357657686e-05, + "loss": 4.2872, + "step": 26833 + }, + { + "epoch": 0.15958939956227994, + "grad_norm": 2.0261855125427246, + "learning_rate": 4.6923409089807566e-05, + "loss": 4.2673, + "step": 26834 + }, + { + "epoch": 0.15959534684556095, + "grad_norm": 2.361943244934082, + "learning_rate": 4.692318459538503e-05, + "loss": 3.9284, + "step": 26835 + }, + { + "epoch": 0.15960129412884194, + "grad_norm": 1.9567387104034424, + "learning_rate": 4.6922960093309334e-05, + "loss": 4.366, + "step": 26836 + }, + { + "epoch": 0.15960724141212293, + "grad_norm": 2.046351432800293, + "learning_rate": 4.692273558358057e-05, + "loss": 4.1074, + "step": 26837 + }, + { + "epoch": 0.15961318869540395, + "grad_norm": 1.9861648082733154, + "learning_rate": 4.6922511066198796e-05, + "loss": 4.1299, + "step": 26838 + }, + { + "epoch": 0.15961913597868493, + "grad_norm": 2.061688184738159, + "learning_rate": 4.692228654116411e-05, + "loss": 4.056, + "step": 26839 + }, + { + "epoch": 0.15962508326196592, + "grad_norm": 2.4299874305725098, + "learning_rate": 4.692206200847656e-05, + "loss": 3.8725, + "step": 26840 + }, + { + "epoch": 0.15963103054524694, + "grad_norm": 2.0996625423431396, + "learning_rate": 4.692183746813626e-05, + "loss": 3.9208, + "step": 26841 + }, + { + "epoch": 0.15963697782852793, + "grad_norm": 1.4910566806793213, + "learning_rate": 4.6921612920143276e-05, + "loss": 5.4869, + "step": 26842 + }, + { + "epoch": 0.15964292511180891, + "grad_norm": 2.304666042327881, + "learning_rate": 4.692138836449768e-05, + "loss": 4.3594, + "step": 26843 + }, + { + "epoch": 0.15964887239508993, + "grad_norm": 2.0998356342315674, + "learning_rate": 4.6921163801199553e-05, + "loss": 4.184, + "step": 26844 + }, + { + "epoch": 0.15965481967837092, + "grad_norm": 2.05517315864563, + "learning_rate": 4.692093923024897e-05, + "loss": 4.0709, + "step": 26845 + }, + { + "epoch": 0.1596607669616519, + "grad_norm": 1.7358988523483276, + "learning_rate": 4.692071465164601e-05, + "loss": 4.8628, + "step": 26846 + }, + { + "epoch": 0.15966671424493292, + "grad_norm": 2.173988103866577, + "learning_rate": 4.6920490065390766e-05, + "loss": 4.2944, + "step": 26847 + }, + { + "epoch": 0.1596726615282139, + "grad_norm": 1.41978919506073, + "learning_rate": 4.69202654714833e-05, + "loss": 4.9699, + "step": 26848 + }, + { + "epoch": 0.1596786088114949, + "grad_norm": 1.748255968093872, + "learning_rate": 4.6920040869923695e-05, + "loss": 3.9938, + "step": 26849 + }, + { + "epoch": 0.15968455609477591, + "grad_norm": 1.7858299016952515, + "learning_rate": 4.691981626071204e-05, + "loss": 4.7106, + "step": 26850 + }, + { + "epoch": 0.1596905033780569, + "grad_norm": 1.575324296951294, + "learning_rate": 4.691959164384839e-05, + "loss": 5.4768, + "step": 26851 + }, + { + "epoch": 0.1596964506613379, + "grad_norm": 1.383719801902771, + "learning_rate": 4.691936701933285e-05, + "loss": 5.154, + "step": 26852 + }, + { + "epoch": 0.1597023979446189, + "grad_norm": 1.559497356414795, + "learning_rate": 4.6919142387165476e-05, + "loss": 5.4081, + "step": 26853 + }, + { + "epoch": 0.1597083452278999, + "grad_norm": 2.3833580017089844, + "learning_rate": 4.691891774734636e-05, + "loss": 4.3001, + "step": 26854 + }, + { + "epoch": 0.15971429251118088, + "grad_norm": 1.5790619850158691, + "learning_rate": 4.6918693099875575e-05, + "loss": 5.1468, + "step": 26855 + }, + { + "epoch": 0.1597202397944619, + "grad_norm": 2.088935613632202, + "learning_rate": 4.69184684447532e-05, + "loss": 4.6097, + "step": 26856 + }, + { + "epoch": 0.1597261870777429, + "grad_norm": 1.7923367023468018, + "learning_rate": 4.691824378197931e-05, + "loss": 4.2733, + "step": 26857 + }, + { + "epoch": 0.15973213436102388, + "grad_norm": 1.583054780960083, + "learning_rate": 4.691801911155399e-05, + "loss": 4.7933, + "step": 26858 + }, + { + "epoch": 0.1597380816443049, + "grad_norm": 1.6564888954162598, + "learning_rate": 4.691779443347733e-05, + "loss": 4.6326, + "step": 26859 + }, + { + "epoch": 0.15974402892758588, + "grad_norm": 1.4905378818511963, + "learning_rate": 4.691756974774938e-05, + "loss": 4.8904, + "step": 26860 + }, + { + "epoch": 0.15974997621086687, + "grad_norm": 1.6564618349075317, + "learning_rate": 4.6917345054370234e-05, + "loss": 4.6245, + "step": 26861 + }, + { + "epoch": 0.15975592349414788, + "grad_norm": 1.262850284576416, + "learning_rate": 4.691712035333996e-05, + "loss": 5.584, + "step": 26862 + }, + { + "epoch": 0.15976187077742887, + "grad_norm": 1.54867684841156, + "learning_rate": 4.691689564465867e-05, + "loss": 5.543, + "step": 26863 + }, + { + "epoch": 0.15976781806070986, + "grad_norm": 1.470517635345459, + "learning_rate": 4.69166709283264e-05, + "loss": 5.5524, + "step": 26864 + }, + { + "epoch": 0.15977376534399088, + "grad_norm": 1.5773262977600098, + "learning_rate": 4.6916446204343245e-05, + "loss": 4.9904, + "step": 26865 + }, + { + "epoch": 0.15977971262727186, + "grad_norm": 1.5984915494918823, + "learning_rate": 4.6916221472709295e-05, + "loss": 4.7114, + "step": 26866 + }, + { + "epoch": 0.15978565991055285, + "grad_norm": 1.4829813241958618, + "learning_rate": 4.691599673342462e-05, + "loss": 4.9843, + "step": 26867 + }, + { + "epoch": 0.15979160719383387, + "grad_norm": 1.7312453985214233, + "learning_rate": 4.691577198648929e-05, + "loss": 4.2701, + "step": 26868 + }, + { + "epoch": 0.15979755447711486, + "grad_norm": 1.4807355403900146, + "learning_rate": 4.691554723190339e-05, + "loss": 4.7952, + "step": 26869 + }, + { + "epoch": 0.15980350176039584, + "grad_norm": 1.3604083061218262, + "learning_rate": 4.6915322469667e-05, + "loss": 5.1496, + "step": 26870 + }, + { + "epoch": 0.15980944904367686, + "grad_norm": 1.5444153547286987, + "learning_rate": 4.69150976997802e-05, + "loss": 5.791, + "step": 26871 + }, + { + "epoch": 0.15981539632695785, + "grad_norm": 1.617533564567566, + "learning_rate": 4.691487292224306e-05, + "loss": 5.5533, + "step": 26872 + }, + { + "epoch": 0.15982134361023884, + "grad_norm": 1.5946470499038696, + "learning_rate": 4.691464813705567e-05, + "loss": 5.5958, + "step": 26873 + }, + { + "epoch": 0.15982729089351985, + "grad_norm": 1.862707495689392, + "learning_rate": 4.691442334421809e-05, + "loss": 4.8171, + "step": 26874 + }, + { + "epoch": 0.15983323817680084, + "grad_norm": 1.355368971824646, + "learning_rate": 4.6914198543730425e-05, + "loss": 5.5431, + "step": 26875 + }, + { + "epoch": 0.15983918546008183, + "grad_norm": 1.4658385515213013, + "learning_rate": 4.6913973735592744e-05, + "loss": 5.3588, + "step": 26876 + }, + { + "epoch": 0.15984513274336284, + "grad_norm": 1.4573192596435547, + "learning_rate": 4.6913748919805115e-05, + "loss": 5.5454, + "step": 26877 + }, + { + "epoch": 0.15985108002664383, + "grad_norm": 1.495696783065796, + "learning_rate": 4.691352409636762e-05, + "loss": 5.5131, + "step": 26878 + }, + { + "epoch": 0.15985702730992482, + "grad_norm": 1.474161148071289, + "learning_rate": 4.691329926528034e-05, + "loss": 5.6235, + "step": 26879 + }, + { + "epoch": 0.15986297459320584, + "grad_norm": 1.5069948434829712, + "learning_rate": 4.6913074426543355e-05, + "loss": 5.3926, + "step": 26880 + }, + { + "epoch": 0.15986892187648682, + "grad_norm": 1.4088873863220215, + "learning_rate": 4.691284958015674e-05, + "loss": 5.2991, + "step": 26881 + }, + { + "epoch": 0.1598748691597678, + "grad_norm": 1.483222484588623, + "learning_rate": 4.691262472612058e-05, + "loss": 5.205, + "step": 26882 + }, + { + "epoch": 0.15988081644304883, + "grad_norm": 1.5325754880905151, + "learning_rate": 4.6912399864434953e-05, + "loss": 5.261, + "step": 26883 + }, + { + "epoch": 0.15988676372632982, + "grad_norm": 1.4159071445465088, + "learning_rate": 4.691217499509992e-05, + "loss": 5.2486, + "step": 26884 + }, + { + "epoch": 0.1598927110096108, + "grad_norm": 1.514702320098877, + "learning_rate": 4.6911950118115584e-05, + "loss": 5.332, + "step": 26885 + }, + { + "epoch": 0.1598986582928918, + "grad_norm": 1.757711410522461, + "learning_rate": 4.6911725233482005e-05, + "loss": 4.5752, + "step": 26886 + }, + { + "epoch": 0.1599046055761728, + "grad_norm": 1.6628808975219727, + "learning_rate": 4.691150034119928e-05, + "loss": 4.8776, + "step": 26887 + }, + { + "epoch": 0.1599105528594538, + "grad_norm": 1.6468075513839722, + "learning_rate": 4.691127544126746e-05, + "loss": 4.7613, + "step": 26888 + }, + { + "epoch": 0.15991650014273479, + "grad_norm": 1.603371262550354, + "learning_rate": 4.6911050533686656e-05, + "loss": 4.8145, + "step": 26889 + }, + { + "epoch": 0.1599224474260158, + "grad_norm": 1.4971832036972046, + "learning_rate": 4.6910825618456925e-05, + "loss": 5.5747, + "step": 26890 + }, + { + "epoch": 0.1599283947092968, + "grad_norm": 1.6911252737045288, + "learning_rate": 4.691060069557836e-05, + "loss": 5.5936, + "step": 26891 + }, + { + "epoch": 0.15993434199257778, + "grad_norm": 1.4903403520584106, + "learning_rate": 4.6910375765051016e-05, + "loss": 5.6195, + "step": 26892 + }, + { + "epoch": 0.1599402892758588, + "grad_norm": 1.8719216585159302, + "learning_rate": 4.6910150826874986e-05, + "loss": 4.818, + "step": 26893 + }, + { + "epoch": 0.15994623655913978, + "grad_norm": 1.7679294347763062, + "learning_rate": 4.690992588105036e-05, + "loss": 4.9175, + "step": 26894 + }, + { + "epoch": 0.15995218384242077, + "grad_norm": 1.8319326639175415, + "learning_rate": 4.69097009275772e-05, + "loss": 5.7222, + "step": 26895 + }, + { + "epoch": 0.15995813112570179, + "grad_norm": 1.6714746952056885, + "learning_rate": 4.690947596645559e-05, + "loss": 5.2146, + "step": 26896 + }, + { + "epoch": 0.15996407840898277, + "grad_norm": 1.6124671697616577, + "learning_rate": 4.690925099768561e-05, + "loss": 5.3234, + "step": 26897 + }, + { + "epoch": 0.15997002569226376, + "grad_norm": 1.546627402305603, + "learning_rate": 4.6909026021267336e-05, + "loss": 5.4278, + "step": 26898 + }, + { + "epoch": 0.15997597297554478, + "grad_norm": 1.492988109588623, + "learning_rate": 4.690880103720084e-05, + "loss": 5.5902, + "step": 26899 + }, + { + "epoch": 0.15998192025882577, + "grad_norm": 1.4887235164642334, + "learning_rate": 4.690857604548622e-05, + "loss": 5.5054, + "step": 26900 + }, + { + "epoch": 0.15998786754210675, + "grad_norm": 1.6349844932556152, + "learning_rate": 4.690835104612353e-05, + "loss": 5.4657, + "step": 26901 + }, + { + "epoch": 0.15999381482538777, + "grad_norm": 1.5228698253631592, + "learning_rate": 4.690812603911287e-05, + "loss": 5.3062, + "step": 26902 + }, + { + "epoch": 0.15999976210866876, + "grad_norm": 2.3719773292541504, + "learning_rate": 4.69079010244543e-05, + "loss": 4.3533, + "step": 26903 + }, + { + "epoch": 0.16000570939194975, + "grad_norm": 1.7740064859390259, + "learning_rate": 4.690767600214792e-05, + "loss": 4.8227, + "step": 26904 + }, + { + "epoch": 0.16001165667523076, + "grad_norm": 1.5493906736373901, + "learning_rate": 4.690745097219379e-05, + "loss": 5.2635, + "step": 26905 + }, + { + "epoch": 0.16001760395851175, + "grad_norm": 1.5318504571914673, + "learning_rate": 4.6907225934592e-05, + "loss": 5.1352, + "step": 26906 + }, + { + "epoch": 0.16002355124179274, + "grad_norm": 1.6286877393722534, + "learning_rate": 4.6907000889342626e-05, + "loss": 5.122, + "step": 26907 + }, + { + "epoch": 0.16002949852507375, + "grad_norm": 1.7091056108474731, + "learning_rate": 4.6906775836445735e-05, + "loss": 4.8629, + "step": 26908 + }, + { + "epoch": 0.16003544580835474, + "grad_norm": 1.8141852617263794, + "learning_rate": 4.6906550775901417e-05, + "loss": 5.0909, + "step": 26909 + }, + { + "epoch": 0.16004139309163573, + "grad_norm": 1.5500266551971436, + "learning_rate": 4.690632570770975e-05, + "loss": 5.3479, + "step": 26910 + }, + { + "epoch": 0.16004734037491675, + "grad_norm": 1.6703251600265503, + "learning_rate": 4.690610063187081e-05, + "loss": 5.264, + "step": 26911 + }, + { + "epoch": 0.16005328765819773, + "grad_norm": 1.2872283458709717, + "learning_rate": 4.690587554838468e-05, + "loss": 5.3643, + "step": 26912 + }, + { + "epoch": 0.16005923494147872, + "grad_norm": 1.456085443496704, + "learning_rate": 4.6905650457251435e-05, + "loss": 5.4866, + "step": 26913 + }, + { + "epoch": 0.16006518222475974, + "grad_norm": 1.560021996498108, + "learning_rate": 4.690542535847115e-05, + "loss": 5.3858, + "step": 26914 + }, + { + "epoch": 0.16007112950804073, + "grad_norm": 1.4462066888809204, + "learning_rate": 4.690520025204391e-05, + "loss": 5.2111, + "step": 26915 + }, + { + "epoch": 0.16007707679132172, + "grad_norm": 1.5655597448349, + "learning_rate": 4.6904975137969786e-05, + "loss": 5.2547, + "step": 26916 + }, + { + "epoch": 0.16008302407460273, + "grad_norm": 1.3707412481307983, + "learning_rate": 4.6904750016248865e-05, + "loss": 5.3997, + "step": 26917 + }, + { + "epoch": 0.16008897135788372, + "grad_norm": 1.7030435800552368, + "learning_rate": 4.690452488688123e-05, + "loss": 5.4115, + "step": 26918 + }, + { + "epoch": 0.1600949186411647, + "grad_norm": 1.4965012073516846, + "learning_rate": 4.690429974986694e-05, + "loss": 4.9977, + "step": 26919 + }, + { + "epoch": 0.16010086592444572, + "grad_norm": 1.3461761474609375, + "learning_rate": 4.69040746052061e-05, + "loss": 5.3629, + "step": 26920 + }, + { + "epoch": 0.1601068132077267, + "grad_norm": 1.3323198556900024, + "learning_rate": 4.690384945289875e-05, + "loss": 5.3162, + "step": 26921 + }, + { + "epoch": 0.1601127604910077, + "grad_norm": 1.6808300018310547, + "learning_rate": 4.690362429294501e-05, + "loss": 5.0513, + "step": 26922 + }, + { + "epoch": 0.16011870777428872, + "grad_norm": 1.659193515777588, + "learning_rate": 4.690339912534494e-05, + "loss": 5.2587, + "step": 26923 + }, + { + "epoch": 0.1601246550575697, + "grad_norm": 1.7092478275299072, + "learning_rate": 4.690317395009861e-05, + "loss": 5.1897, + "step": 26924 + }, + { + "epoch": 0.1601306023408507, + "grad_norm": 1.5868886709213257, + "learning_rate": 4.6902948767206115e-05, + "loss": 4.7132, + "step": 26925 + }, + { + "epoch": 0.1601365496241317, + "grad_norm": 1.584676742553711, + "learning_rate": 4.690272357666753e-05, + "loss": 4.8759, + "step": 26926 + }, + { + "epoch": 0.1601424969074127, + "grad_norm": 1.6470085382461548, + "learning_rate": 4.690249837848293e-05, + "loss": 4.9947, + "step": 26927 + }, + { + "epoch": 0.16014844419069368, + "grad_norm": 1.4562335014343262, + "learning_rate": 4.690227317265239e-05, + "loss": 5.1101, + "step": 26928 + }, + { + "epoch": 0.1601543914739747, + "grad_norm": 1.4088939428329468, + "learning_rate": 4.690204795917599e-05, + "loss": 5.3212, + "step": 26929 + }, + { + "epoch": 0.1601603387572557, + "grad_norm": 1.4988348484039307, + "learning_rate": 4.6901822738053816e-05, + "loss": 4.9456, + "step": 26930 + }, + { + "epoch": 0.16016628604053668, + "grad_norm": 1.608365535736084, + "learning_rate": 4.690159750928594e-05, + "loss": 5.082, + "step": 26931 + }, + { + "epoch": 0.1601722333238177, + "grad_norm": 1.5603444576263428, + "learning_rate": 4.6901372272872445e-05, + "loss": 5.4297, + "step": 26932 + }, + { + "epoch": 0.16017818060709868, + "grad_norm": 1.6907488107681274, + "learning_rate": 4.690114702881341e-05, + "loss": 4.9653, + "step": 26933 + }, + { + "epoch": 0.16018412789037967, + "grad_norm": 1.566992998123169, + "learning_rate": 4.69009217771089e-05, + "loss": 5.2261, + "step": 26934 + }, + { + "epoch": 0.16019007517366068, + "grad_norm": 1.4666292667388916, + "learning_rate": 4.690069651775901e-05, + "loss": 5.0251, + "step": 26935 + }, + { + "epoch": 0.16019602245694167, + "grad_norm": 1.5898406505584717, + "learning_rate": 4.690047125076382e-05, + "loss": 5.1041, + "step": 26936 + }, + { + "epoch": 0.16020196974022266, + "grad_norm": 1.3918042182922363, + "learning_rate": 4.6900245976123396e-05, + "loss": 5.3757, + "step": 26937 + }, + { + "epoch": 0.16020791702350368, + "grad_norm": 1.390620231628418, + "learning_rate": 4.690002069383782e-05, + "loss": 5.2667, + "step": 26938 + }, + { + "epoch": 0.16021386430678466, + "grad_norm": 1.4058221578598022, + "learning_rate": 4.6899795403907174e-05, + "loss": 5.8193, + "step": 26939 + }, + { + "epoch": 0.16021981159006565, + "grad_norm": 1.7895981073379517, + "learning_rate": 4.689957010633154e-05, + "loss": 4.9949, + "step": 26940 + }, + { + "epoch": 0.16022575887334667, + "grad_norm": 1.6591132879257202, + "learning_rate": 4.689934480111099e-05, + "loss": 5.0723, + "step": 26941 + }, + { + "epoch": 0.16023170615662766, + "grad_norm": 1.6578445434570312, + "learning_rate": 4.6899119488245605e-05, + "loss": 5.0636, + "step": 26942 + }, + { + "epoch": 0.16023765343990864, + "grad_norm": 1.7342137098312378, + "learning_rate": 4.6898894167735464e-05, + "loss": 4.9476, + "step": 26943 + }, + { + "epoch": 0.16024360072318963, + "grad_norm": 1.7774765491485596, + "learning_rate": 4.689866883958065e-05, + "loss": 5.04, + "step": 26944 + }, + { + "epoch": 0.16024954800647065, + "grad_norm": 1.519485354423523, + "learning_rate": 4.689844350378122e-05, + "loss": 5.353, + "step": 26945 + }, + { + "epoch": 0.16025549528975164, + "grad_norm": 1.7019078731536865, + "learning_rate": 4.6898218160337286e-05, + "loss": 5.2927, + "step": 26946 + }, + { + "epoch": 0.16026144257303263, + "grad_norm": 1.6364177465438843, + "learning_rate": 4.6897992809248903e-05, + "loss": 5.3286, + "step": 26947 + }, + { + "epoch": 0.16026738985631364, + "grad_norm": 1.5034300088882446, + "learning_rate": 4.6897767450516164e-05, + "loss": 5.1647, + "step": 26948 + }, + { + "epoch": 0.16027333713959463, + "grad_norm": 1.4327138662338257, + "learning_rate": 4.6897542084139135e-05, + "loss": 5.1381, + "step": 26949 + }, + { + "epoch": 0.16027928442287562, + "grad_norm": 1.666137456893921, + "learning_rate": 4.68973167101179e-05, + "loss": 4.7333, + "step": 26950 + }, + { + "epoch": 0.16028523170615663, + "grad_norm": 1.6748521327972412, + "learning_rate": 4.689709132845254e-05, + "loss": 4.8698, + "step": 26951 + }, + { + "epoch": 0.16029117898943762, + "grad_norm": 1.7348641157150269, + "learning_rate": 4.689686593914313e-05, + "loss": 5.0501, + "step": 26952 + }, + { + "epoch": 0.1602971262727186, + "grad_norm": 1.6517002582550049, + "learning_rate": 4.689664054218975e-05, + "loss": 4.9992, + "step": 26953 + }, + { + "epoch": 0.16030307355599963, + "grad_norm": 1.9717700481414795, + "learning_rate": 4.689641513759249e-05, + "loss": 4.6581, + "step": 26954 + }, + { + "epoch": 0.1603090208392806, + "grad_norm": 1.9283233880996704, + "learning_rate": 4.68961897253514e-05, + "loss": 4.1993, + "step": 26955 + }, + { + "epoch": 0.1603149681225616, + "grad_norm": 2.814549446105957, + "learning_rate": 4.689596430546659e-05, + "loss": 4.2436, + "step": 26956 + }, + { + "epoch": 0.16032091540584262, + "grad_norm": 1.8716390132904053, + "learning_rate": 4.689573887793811e-05, + "loss": 4.7558, + "step": 26957 + }, + { + "epoch": 0.1603268626891236, + "grad_norm": 1.5305246114730835, + "learning_rate": 4.689551344276607e-05, + "loss": 5.0986, + "step": 26958 + }, + { + "epoch": 0.1603328099724046, + "grad_norm": 1.7304683923721313, + "learning_rate": 4.689528799995052e-05, + "loss": 4.8627, + "step": 26959 + }, + { + "epoch": 0.1603387572556856, + "grad_norm": 1.6693211793899536, + "learning_rate": 4.6895062549491564e-05, + "loss": 4.6759, + "step": 26960 + }, + { + "epoch": 0.1603447045389666, + "grad_norm": 1.6889755725860596, + "learning_rate": 4.6894837091389256e-05, + "loss": 4.6676, + "step": 26961 + }, + { + "epoch": 0.1603506518222476, + "grad_norm": 1.7085540294647217, + "learning_rate": 4.6894611625643695e-05, + "loss": 5.2494, + "step": 26962 + }, + { + "epoch": 0.1603565991055286, + "grad_norm": 1.7167129516601562, + "learning_rate": 4.689438615225494e-05, + "loss": 4.7013, + "step": 26963 + }, + { + "epoch": 0.1603625463888096, + "grad_norm": 1.6896833181381226, + "learning_rate": 4.689416067122309e-05, + "loss": 5.0363, + "step": 26964 + }, + { + "epoch": 0.16036849367209058, + "grad_norm": 1.4529087543487549, + "learning_rate": 4.6893935182548215e-05, + "loss": 5.2665, + "step": 26965 + }, + { + "epoch": 0.1603744409553716, + "grad_norm": 1.630214810371399, + "learning_rate": 4.689370968623039e-05, + "loss": 5.3018, + "step": 26966 + }, + { + "epoch": 0.16038038823865258, + "grad_norm": 1.4638413190841675, + "learning_rate": 4.6893484182269697e-05, + "loss": 5.4105, + "step": 26967 + }, + { + "epoch": 0.16038633552193357, + "grad_norm": 1.7969051599502563, + "learning_rate": 4.689325867066622e-05, + "loss": 5.3511, + "step": 26968 + }, + { + "epoch": 0.1603922828052146, + "grad_norm": 1.65691339969635, + "learning_rate": 4.689303315142003e-05, + "loss": 5.158, + "step": 26969 + }, + { + "epoch": 0.16039823008849557, + "grad_norm": 1.391390085220337, + "learning_rate": 4.689280762453121e-05, + "loss": 5.2721, + "step": 26970 + }, + { + "epoch": 0.16040417737177656, + "grad_norm": 1.699019193649292, + "learning_rate": 4.689258208999983e-05, + "loss": 5.0995, + "step": 26971 + }, + { + "epoch": 0.16041012465505758, + "grad_norm": 1.7829947471618652, + "learning_rate": 4.6892356547825984e-05, + "loss": 4.9086, + "step": 26972 + }, + { + "epoch": 0.16041607193833857, + "grad_norm": 1.7381236553192139, + "learning_rate": 4.689213099800974e-05, + "loss": 4.9298, + "step": 26973 + }, + { + "epoch": 0.16042201922161956, + "grad_norm": 1.273488998413086, + "learning_rate": 4.689190544055118e-05, + "loss": 5.1877, + "step": 26974 + }, + { + "epoch": 0.16042796650490057, + "grad_norm": 1.5737167596817017, + "learning_rate": 4.689167987545038e-05, + "loss": 5.229, + "step": 26975 + }, + { + "epoch": 0.16043391378818156, + "grad_norm": 1.4660385847091675, + "learning_rate": 4.6891454302707414e-05, + "loss": 5.3256, + "step": 26976 + }, + { + "epoch": 0.16043986107146255, + "grad_norm": 1.7380048036575317, + "learning_rate": 4.6891228722322375e-05, + "loss": 4.3369, + "step": 26977 + }, + { + "epoch": 0.16044580835474356, + "grad_norm": 1.686514139175415, + "learning_rate": 4.6891003134295336e-05, + "loss": 4.9901, + "step": 26978 + }, + { + "epoch": 0.16045175563802455, + "grad_norm": 1.8255095481872559, + "learning_rate": 4.689077753862637e-05, + "loss": 4.7844, + "step": 26979 + }, + { + "epoch": 0.16045770292130554, + "grad_norm": 1.7652206420898438, + "learning_rate": 4.689055193531556e-05, + "loss": 5.2592, + "step": 26980 + }, + { + "epoch": 0.16046365020458656, + "grad_norm": 2.122629165649414, + "learning_rate": 4.6890326324362985e-05, + "loss": 4.9435, + "step": 26981 + }, + { + "epoch": 0.16046959748786754, + "grad_norm": 2.0414109230041504, + "learning_rate": 4.689010070576872e-05, + "loss": 4.8267, + "step": 26982 + }, + { + "epoch": 0.16047554477114853, + "grad_norm": 1.8635056018829346, + "learning_rate": 4.6889875079532855e-05, + "loss": 5.0768, + "step": 26983 + }, + { + "epoch": 0.16048149205442955, + "grad_norm": 1.649129033088684, + "learning_rate": 4.688964944565546e-05, + "loss": 5.1536, + "step": 26984 + }, + { + "epoch": 0.16048743933771054, + "grad_norm": 1.6211038827896118, + "learning_rate": 4.688942380413661e-05, + "loss": 5.1866, + "step": 26985 + }, + { + "epoch": 0.16049338662099152, + "grad_norm": 1.862961769104004, + "learning_rate": 4.6889198154976387e-05, + "loss": 4.9439, + "step": 26986 + }, + { + "epoch": 0.16049933390427254, + "grad_norm": 2.02945613861084, + "learning_rate": 4.6888972498174874e-05, + "loss": 4.8791, + "step": 26987 + }, + { + "epoch": 0.16050528118755353, + "grad_norm": 2.434349536895752, + "learning_rate": 4.688874683373215e-05, + "loss": 4.9336, + "step": 26988 + }, + { + "epoch": 0.16051122847083452, + "grad_norm": 1.6819970607757568, + "learning_rate": 4.6888521161648284e-05, + "loss": 4.9917, + "step": 26989 + }, + { + "epoch": 0.16051717575411553, + "grad_norm": 1.7764739990234375, + "learning_rate": 4.688829548192337e-05, + "loss": 5.274, + "step": 26990 + }, + { + "epoch": 0.16052312303739652, + "grad_norm": 1.4962623119354248, + "learning_rate": 4.6888069794557465e-05, + "loss": 5.0699, + "step": 26991 + }, + { + "epoch": 0.1605290703206775, + "grad_norm": 1.7750627994537354, + "learning_rate": 4.688784409955067e-05, + "loss": 4.9197, + "step": 26992 + }, + { + "epoch": 0.16053501760395852, + "grad_norm": 1.7030991315841675, + "learning_rate": 4.6887618396903055e-05, + "loss": 5.1113, + "step": 26993 + }, + { + "epoch": 0.1605409648872395, + "grad_norm": 1.7158962488174438, + "learning_rate": 4.68873926866147e-05, + "loss": 5.2175, + "step": 26994 + }, + { + "epoch": 0.1605469121705205, + "grad_norm": 1.5792635679244995, + "learning_rate": 4.6887166968685684e-05, + "loss": 5.2031, + "step": 26995 + }, + { + "epoch": 0.16055285945380152, + "grad_norm": 1.6441086530685425, + "learning_rate": 4.688694124311607e-05, + "loss": 4.669, + "step": 26996 + }, + { + "epoch": 0.1605588067370825, + "grad_norm": 1.4879902601242065, + "learning_rate": 4.688671550990597e-05, + "loss": 5.2163, + "step": 26997 + }, + { + "epoch": 0.1605647540203635, + "grad_norm": 1.7525761127471924, + "learning_rate": 4.688648976905543e-05, + "loss": 4.6094, + "step": 26998 + }, + { + "epoch": 0.1605707013036445, + "grad_norm": 1.500331163406372, + "learning_rate": 4.6886264020564544e-05, + "loss": 5.0793, + "step": 26999 + }, + { + "epoch": 0.1605766485869255, + "grad_norm": 1.505900502204895, + "learning_rate": 4.688603826443339e-05, + "loss": 4.9562, + "step": 27000 + }, + { + "epoch": 0.16058259587020648, + "grad_norm": 1.558977723121643, + "learning_rate": 4.688581250066205e-05, + "loss": 4.8143, + "step": 27001 + }, + { + "epoch": 0.1605885431534875, + "grad_norm": 1.4914512634277344, + "learning_rate": 4.6885586729250596e-05, + "loss": 4.624, + "step": 27002 + }, + { + "epoch": 0.1605944904367685, + "grad_norm": 1.482251763343811, + "learning_rate": 4.688536095019911e-05, + "loss": 4.87, + "step": 27003 + }, + { + "epoch": 0.16060043772004948, + "grad_norm": 1.4962702989578247, + "learning_rate": 4.688513516350767e-05, + "loss": 5.1775, + "step": 27004 + }, + { + "epoch": 0.16060638500333047, + "grad_norm": 1.71797513961792, + "learning_rate": 4.688490936917636e-05, + "loss": 5.3413, + "step": 27005 + }, + { + "epoch": 0.16061233228661148, + "grad_norm": 1.5410555601119995, + "learning_rate": 4.688468356720525e-05, + "loss": 5.399, + "step": 27006 + }, + { + "epoch": 0.16061827956989247, + "grad_norm": 1.597773551940918, + "learning_rate": 4.6884457757594424e-05, + "loss": 5.4056, + "step": 27007 + }, + { + "epoch": 0.16062422685317346, + "grad_norm": 1.3013349771499634, + "learning_rate": 4.688423194034396e-05, + "loss": 5.6953, + "step": 27008 + }, + { + "epoch": 0.16063017413645447, + "grad_norm": 1.557054877281189, + "learning_rate": 4.6884006115453935e-05, + "loss": 5.078, + "step": 27009 + }, + { + "epoch": 0.16063612141973546, + "grad_norm": 1.5944912433624268, + "learning_rate": 4.688378028292443e-05, + "loss": 5.0212, + "step": 27010 + }, + { + "epoch": 0.16064206870301645, + "grad_norm": 1.45020592212677, + "learning_rate": 4.6883554442755526e-05, + "loss": 4.9653, + "step": 27011 + }, + { + "epoch": 0.16064801598629747, + "grad_norm": 1.7178733348846436, + "learning_rate": 4.68833285949473e-05, + "loss": 5.2027, + "step": 27012 + }, + { + "epoch": 0.16065396326957845, + "grad_norm": 1.574744462966919, + "learning_rate": 4.688310273949983e-05, + "loss": 5.3929, + "step": 27013 + }, + { + "epoch": 0.16065991055285944, + "grad_norm": 1.511526107788086, + "learning_rate": 4.688287687641319e-05, + "loss": 4.9275, + "step": 27014 + }, + { + "epoch": 0.16066585783614046, + "grad_norm": 1.5261460542678833, + "learning_rate": 4.688265100568747e-05, + "loss": 5.193, + "step": 27015 + }, + { + "epoch": 0.16067180511942145, + "grad_norm": 1.3765456676483154, + "learning_rate": 4.688242512732274e-05, + "loss": 5.006, + "step": 27016 + }, + { + "epoch": 0.16067775240270243, + "grad_norm": 1.4258984327316284, + "learning_rate": 4.688219924131908e-05, + "loss": 5.0301, + "step": 27017 + }, + { + "epoch": 0.16068369968598345, + "grad_norm": 1.6083779335021973, + "learning_rate": 4.688197334767657e-05, + "loss": 5.0202, + "step": 27018 + }, + { + "epoch": 0.16068964696926444, + "grad_norm": 1.3578145503997803, + "learning_rate": 4.6881747446395285e-05, + "loss": 5.0357, + "step": 27019 + }, + { + "epoch": 0.16069559425254543, + "grad_norm": 1.5515062808990479, + "learning_rate": 4.6881521537475316e-05, + "loss": 4.7463, + "step": 27020 + }, + { + "epoch": 0.16070154153582644, + "grad_norm": 1.5254274606704712, + "learning_rate": 4.688129562091673e-05, + "loss": 5.0846, + "step": 27021 + }, + { + "epoch": 0.16070748881910743, + "grad_norm": 1.6628260612487793, + "learning_rate": 4.6881069696719615e-05, + "loss": 4.7732, + "step": 27022 + }, + { + "epoch": 0.16071343610238842, + "grad_norm": 1.5955768823623657, + "learning_rate": 4.6880843764884044e-05, + "loss": 4.7582, + "step": 27023 + }, + { + "epoch": 0.16071938338566943, + "grad_norm": 1.4915939569473267, + "learning_rate": 4.6880617825410086e-05, + "loss": 4.7503, + "step": 27024 + }, + { + "epoch": 0.16072533066895042, + "grad_norm": 1.6703109741210938, + "learning_rate": 4.6880391878297836e-05, + "loss": 4.393, + "step": 27025 + }, + { + "epoch": 0.1607312779522314, + "grad_norm": 1.6725270748138428, + "learning_rate": 4.688016592354737e-05, + "loss": 5.2538, + "step": 27026 + }, + { + "epoch": 0.16073722523551243, + "grad_norm": 1.820046305656433, + "learning_rate": 4.687993996115876e-05, + "loss": 4.7337, + "step": 27027 + }, + { + "epoch": 0.16074317251879341, + "grad_norm": 1.7842957973480225, + "learning_rate": 4.6879713991132096e-05, + "loss": 4.8615, + "step": 27028 + }, + { + "epoch": 0.1607491198020744, + "grad_norm": 1.9226150512695312, + "learning_rate": 4.687948801346745e-05, + "loss": 4.3828, + "step": 27029 + }, + { + "epoch": 0.16075506708535542, + "grad_norm": 1.3625149726867676, + "learning_rate": 4.6879262028164895e-05, + "loss": 4.962, + "step": 27030 + }, + { + "epoch": 0.1607610143686364, + "grad_norm": 1.6589162349700928, + "learning_rate": 4.687903603522452e-05, + "loss": 4.373, + "step": 27031 + }, + { + "epoch": 0.1607669616519174, + "grad_norm": 1.5190513134002686, + "learning_rate": 4.6878810034646395e-05, + "loss": 5.3889, + "step": 27032 + }, + { + "epoch": 0.1607729089351984, + "grad_norm": 1.4899837970733643, + "learning_rate": 4.6878584026430604e-05, + "loss": 4.6972, + "step": 27033 + }, + { + "epoch": 0.1607788562184794, + "grad_norm": 1.7779310941696167, + "learning_rate": 4.6878358010577226e-05, + "loss": 5.0265, + "step": 27034 + }, + { + "epoch": 0.1607848035017604, + "grad_norm": 1.7755082845687866, + "learning_rate": 4.687813198708634e-05, + "loss": 4.7129, + "step": 27035 + }, + { + "epoch": 0.1607907507850414, + "grad_norm": 1.986676573753357, + "learning_rate": 4.6877905955958024e-05, + "loss": 4.5315, + "step": 27036 + }, + { + "epoch": 0.1607966980683224, + "grad_norm": 1.727644443511963, + "learning_rate": 4.687767991719235e-05, + "loss": 4.5498, + "step": 27037 + }, + { + "epoch": 0.16080264535160338, + "grad_norm": 1.936285138130188, + "learning_rate": 4.687745387078942e-05, + "loss": 4.2741, + "step": 27038 + }, + { + "epoch": 0.1608085926348844, + "grad_norm": 1.7781955003738403, + "learning_rate": 4.687722781674928e-05, + "loss": 5.0867, + "step": 27039 + }, + { + "epoch": 0.16081453991816538, + "grad_norm": 1.7659040689468384, + "learning_rate": 4.687700175507204e-05, + "loss": 5.2197, + "step": 27040 + }, + { + "epoch": 0.16082048720144637, + "grad_norm": 1.8074475526809692, + "learning_rate": 4.6876775685757755e-05, + "loss": 4.8669, + "step": 27041 + }, + { + "epoch": 0.1608264344847274, + "grad_norm": 1.8640440702438354, + "learning_rate": 4.687654960880652e-05, + "loss": 4.2379, + "step": 27042 + }, + { + "epoch": 0.16083238176800838, + "grad_norm": 2.278597831726074, + "learning_rate": 4.6876323524218405e-05, + "loss": 4.4334, + "step": 27043 + }, + { + "epoch": 0.16083832905128936, + "grad_norm": 1.7002289295196533, + "learning_rate": 4.6876097431993486e-05, + "loss": 4.9251, + "step": 27044 + }, + { + "epoch": 0.16084427633457038, + "grad_norm": 1.626347303390503, + "learning_rate": 4.687587133213186e-05, + "loss": 5.3526, + "step": 27045 + }, + { + "epoch": 0.16085022361785137, + "grad_norm": 1.6184710264205933, + "learning_rate": 4.687564522463358e-05, + "loss": 4.9963, + "step": 27046 + }, + { + "epoch": 0.16085617090113236, + "grad_norm": 1.9560445547103882, + "learning_rate": 4.687541910949874e-05, + "loss": 4.3859, + "step": 27047 + }, + { + "epoch": 0.16086211818441337, + "grad_norm": 1.8181273937225342, + "learning_rate": 4.687519298672743e-05, + "loss": 4.7349, + "step": 27048 + }, + { + "epoch": 0.16086806546769436, + "grad_norm": 1.76878023147583, + "learning_rate": 4.68749668563197e-05, + "loss": 4.6734, + "step": 27049 + }, + { + "epoch": 0.16087401275097535, + "grad_norm": 1.6105148792266846, + "learning_rate": 4.6874740718275655e-05, + "loss": 4.7374, + "step": 27050 + }, + { + "epoch": 0.16087996003425636, + "grad_norm": 1.7216439247131348, + "learning_rate": 4.687451457259536e-05, + "loss": 4.7108, + "step": 27051 + }, + { + "epoch": 0.16088590731753735, + "grad_norm": 1.591200828552246, + "learning_rate": 4.68742884192789e-05, + "loss": 4.8113, + "step": 27052 + }, + { + "epoch": 0.16089185460081834, + "grad_norm": 1.8275965452194214, + "learning_rate": 4.687406225832635e-05, + "loss": 4.765, + "step": 27053 + }, + { + "epoch": 0.16089780188409936, + "grad_norm": 1.796170949935913, + "learning_rate": 4.68738360897378e-05, + "loss": 4.5656, + "step": 27054 + }, + { + "epoch": 0.16090374916738034, + "grad_norm": 1.6721670627593994, + "learning_rate": 4.6873609913513307e-05, + "loss": 4.7761, + "step": 27055 + }, + { + "epoch": 0.16090969645066133, + "grad_norm": 1.577500820159912, + "learning_rate": 4.687338372965296e-05, + "loss": 4.6552, + "step": 27056 + }, + { + "epoch": 0.16091564373394235, + "grad_norm": 1.4649289846420288, + "learning_rate": 4.687315753815685e-05, + "loss": 4.7041, + "step": 27057 + }, + { + "epoch": 0.16092159101722334, + "grad_norm": 1.5088578462600708, + "learning_rate": 4.687293133902505e-05, + "loss": 4.9058, + "step": 27058 + }, + { + "epoch": 0.16092753830050432, + "grad_norm": 1.5987037420272827, + "learning_rate": 4.687270513225763e-05, + "loss": 4.6935, + "step": 27059 + }, + { + "epoch": 0.16093348558378534, + "grad_norm": 1.6780216693878174, + "learning_rate": 4.687247891785468e-05, + "loss": 4.6561, + "step": 27060 + }, + { + "epoch": 0.16093943286706633, + "grad_norm": 1.678200125694275, + "learning_rate": 4.6872252695816265e-05, + "loss": 4.7769, + "step": 27061 + }, + { + "epoch": 0.16094538015034732, + "grad_norm": 1.7499932050704956, + "learning_rate": 4.687202646614248e-05, + "loss": 4.8831, + "step": 27062 + }, + { + "epoch": 0.1609513274336283, + "grad_norm": 1.5174812078475952, + "learning_rate": 4.687180022883339e-05, + "loss": 5.3915, + "step": 27063 + }, + { + "epoch": 0.16095727471690932, + "grad_norm": 1.6853543519973755, + "learning_rate": 4.6871573983889084e-05, + "loss": 5.0194, + "step": 27064 + }, + { + "epoch": 0.1609632220001903, + "grad_norm": 1.590044379234314, + "learning_rate": 4.6871347731309634e-05, + "loss": 4.8239, + "step": 27065 + }, + { + "epoch": 0.1609691692834713, + "grad_norm": 1.6128438711166382, + "learning_rate": 4.6871121471095124e-05, + "loss": 4.418, + "step": 27066 + }, + { + "epoch": 0.1609751165667523, + "grad_norm": 1.5933514833450317, + "learning_rate": 4.6870895203245635e-05, + "loss": 4.5319, + "step": 27067 + }, + { + "epoch": 0.1609810638500333, + "grad_norm": 2.0290753841400146, + "learning_rate": 4.687066892776124e-05, + "loss": 4.2566, + "step": 27068 + }, + { + "epoch": 0.1609870111333143, + "grad_norm": 1.7339308261871338, + "learning_rate": 4.687044264464202e-05, + "loss": 4.7884, + "step": 27069 + }, + { + "epoch": 0.1609929584165953, + "grad_norm": 1.3594622611999512, + "learning_rate": 4.6870216353888056e-05, + "loss": 5.2241, + "step": 27070 + }, + { + "epoch": 0.1609989056998763, + "grad_norm": 1.599043607711792, + "learning_rate": 4.6869990055499424e-05, + "loss": 4.7043, + "step": 27071 + }, + { + "epoch": 0.16100485298315728, + "grad_norm": 1.6405742168426514, + "learning_rate": 4.686976374947621e-05, + "loss": 4.7731, + "step": 27072 + }, + { + "epoch": 0.1610108002664383, + "grad_norm": 1.544199824333191, + "learning_rate": 4.686953743581848e-05, + "loss": 4.3322, + "step": 27073 + }, + { + "epoch": 0.16101674754971929, + "grad_norm": 1.5622215270996094, + "learning_rate": 4.686931111452633e-05, + "loss": 4.4059, + "step": 27074 + }, + { + "epoch": 0.16102269483300027, + "grad_norm": 1.472733497619629, + "learning_rate": 4.6869084785599814e-05, + "loss": 4.5119, + "step": 27075 + }, + { + "epoch": 0.1610286421162813, + "grad_norm": 1.6917856931686401, + "learning_rate": 4.686885844903904e-05, + "loss": 4.4056, + "step": 27076 + }, + { + "epoch": 0.16103458939956228, + "grad_norm": 1.67365300655365, + "learning_rate": 4.6868632104844066e-05, + "loss": 4.6975, + "step": 27077 + }, + { + "epoch": 0.16104053668284327, + "grad_norm": 1.7588708400726318, + "learning_rate": 4.6868405753014974e-05, + "loss": 4.5234, + "step": 27078 + }, + { + "epoch": 0.16104648396612428, + "grad_norm": 1.703722596168518, + "learning_rate": 4.686817939355186e-05, + "loss": 4.8189, + "step": 27079 + }, + { + "epoch": 0.16105243124940527, + "grad_norm": 1.9225337505340576, + "learning_rate": 4.686795302645478e-05, + "loss": 4.6807, + "step": 27080 + }, + { + "epoch": 0.16105837853268626, + "grad_norm": 1.9755665063858032, + "learning_rate": 4.686772665172383e-05, + "loss": 4.6981, + "step": 27081 + }, + { + "epoch": 0.16106432581596727, + "grad_norm": 1.8112698793411255, + "learning_rate": 4.6867500269359084e-05, + "loss": 4.6576, + "step": 27082 + }, + { + "epoch": 0.16107027309924826, + "grad_norm": 1.5739562511444092, + "learning_rate": 4.686727387936062e-05, + "loss": 4.8203, + "step": 27083 + }, + { + "epoch": 0.16107622038252925, + "grad_norm": 1.6816823482513428, + "learning_rate": 4.686704748172851e-05, + "loss": 4.9051, + "step": 27084 + }, + { + "epoch": 0.16108216766581027, + "grad_norm": 1.9315879344940186, + "learning_rate": 4.6866821076462844e-05, + "loss": 4.9205, + "step": 27085 + }, + { + "epoch": 0.16108811494909125, + "grad_norm": 1.9262312650680542, + "learning_rate": 4.686659466356369e-05, + "loss": 4.8491, + "step": 27086 + }, + { + "epoch": 0.16109406223237224, + "grad_norm": 2.244142532348633, + "learning_rate": 4.686636824303114e-05, + "loss": 4.1662, + "step": 27087 + }, + { + "epoch": 0.16110000951565326, + "grad_norm": 1.8732181787490845, + "learning_rate": 4.6866141814865266e-05, + "loss": 4.6906, + "step": 27088 + }, + { + "epoch": 0.16110595679893425, + "grad_norm": 1.7964503765106201, + "learning_rate": 4.686591537906615e-05, + "loss": 4.8282, + "step": 27089 + }, + { + "epoch": 0.16111190408221523, + "grad_norm": 1.828946828842163, + "learning_rate": 4.686568893563387e-05, + "loss": 4.6226, + "step": 27090 + }, + { + "epoch": 0.16111785136549625, + "grad_norm": 1.6230894327163696, + "learning_rate": 4.68654624845685e-05, + "loss": 4.9008, + "step": 27091 + }, + { + "epoch": 0.16112379864877724, + "grad_norm": 1.7094733715057373, + "learning_rate": 4.686523602587012e-05, + "loss": 4.4854, + "step": 27092 + }, + { + "epoch": 0.16112974593205823, + "grad_norm": 1.5419751405715942, + "learning_rate": 4.6865009559538815e-05, + "loss": 4.7452, + "step": 27093 + }, + { + "epoch": 0.16113569321533924, + "grad_norm": 1.7994260787963867, + "learning_rate": 4.686478308557466e-05, + "loss": 4.798, + "step": 27094 + }, + { + "epoch": 0.16114164049862023, + "grad_norm": 1.5732755661010742, + "learning_rate": 4.6864556603977736e-05, + "loss": 5.0714, + "step": 27095 + }, + { + "epoch": 0.16114758778190122, + "grad_norm": 1.7569549083709717, + "learning_rate": 4.686433011474812e-05, + "loss": 5.1888, + "step": 27096 + }, + { + "epoch": 0.16115353506518224, + "grad_norm": 1.5478622913360596, + "learning_rate": 4.6864103617885895e-05, + "loss": 5.1684, + "step": 27097 + }, + { + "epoch": 0.16115948234846322, + "grad_norm": 1.80837082862854, + "learning_rate": 4.6863877113391136e-05, + "loss": 5.0916, + "step": 27098 + }, + { + "epoch": 0.1611654296317442, + "grad_norm": 1.6820951700210571, + "learning_rate": 4.686365060126392e-05, + "loss": 5.0685, + "step": 27099 + }, + { + "epoch": 0.16117137691502523, + "grad_norm": 1.6210129261016846, + "learning_rate": 4.686342408150434e-05, + "loss": 4.591, + "step": 27100 + }, + { + "epoch": 0.16117732419830622, + "grad_norm": 1.7377861738204956, + "learning_rate": 4.6863197554112455e-05, + "loss": 4.7656, + "step": 27101 + }, + { + "epoch": 0.1611832714815872, + "grad_norm": 1.5875985622406006, + "learning_rate": 4.686297101908835e-05, + "loss": 5.003, + "step": 27102 + }, + { + "epoch": 0.16118921876486822, + "grad_norm": 1.6775810718536377, + "learning_rate": 4.686274447643212e-05, + "loss": 5.269, + "step": 27103 + }, + { + "epoch": 0.1611951660481492, + "grad_norm": 1.7519687414169312, + "learning_rate": 4.6862517926143826e-05, + "loss": 5.3185, + "step": 27104 + }, + { + "epoch": 0.1612011133314302, + "grad_norm": 1.6947530508041382, + "learning_rate": 4.6862291368223554e-05, + "loss": 5.0105, + "step": 27105 + }, + { + "epoch": 0.1612070606147112, + "grad_norm": 1.6445891857147217, + "learning_rate": 4.686206480267138e-05, + "loss": 4.6697, + "step": 27106 + }, + { + "epoch": 0.1612130078979922, + "grad_norm": 1.7407753467559814, + "learning_rate": 4.6861838229487385e-05, + "loss": 4.6508, + "step": 27107 + }, + { + "epoch": 0.1612189551812732, + "grad_norm": 1.7013847827911377, + "learning_rate": 4.686161164867164e-05, + "loss": 4.6613, + "step": 27108 + }, + { + "epoch": 0.1612249024645542, + "grad_norm": 1.5500074625015259, + "learning_rate": 4.686138506022425e-05, + "loss": 4.5501, + "step": 27109 + }, + { + "epoch": 0.1612308497478352, + "grad_norm": 1.7138715982437134, + "learning_rate": 4.686115846414526e-05, + "loss": 5.1747, + "step": 27110 + }, + { + "epoch": 0.16123679703111618, + "grad_norm": 1.6952149868011475, + "learning_rate": 4.686093186043478e-05, + "loss": 5.6011, + "step": 27111 + }, + { + "epoch": 0.1612427443143972, + "grad_norm": 1.4229787588119507, + "learning_rate": 4.6860705249092864e-05, + "loss": 5.2581, + "step": 27112 + }, + { + "epoch": 0.16124869159767818, + "grad_norm": 1.5605623722076416, + "learning_rate": 4.68604786301196e-05, + "loss": 4.8483, + "step": 27113 + }, + { + "epoch": 0.16125463888095917, + "grad_norm": 1.7442682981491089, + "learning_rate": 4.686025200351508e-05, + "loss": 5.1217, + "step": 27114 + }, + { + "epoch": 0.1612605861642402, + "grad_norm": 1.8555563688278198, + "learning_rate": 4.6860025369279365e-05, + "loss": 4.8616, + "step": 27115 + }, + { + "epoch": 0.16126653344752118, + "grad_norm": 1.525015115737915, + "learning_rate": 4.685979872741254e-05, + "loss": 5.5315, + "step": 27116 + }, + { + "epoch": 0.16127248073080216, + "grad_norm": 1.656496524810791, + "learning_rate": 4.685957207791468e-05, + "loss": 5.081, + "step": 27117 + }, + { + "epoch": 0.16127842801408318, + "grad_norm": 1.717789649963379, + "learning_rate": 4.685934542078588e-05, + "loss": 5.0375, + "step": 27118 + }, + { + "epoch": 0.16128437529736417, + "grad_norm": 1.4504932165145874, + "learning_rate": 4.6859118756026205e-05, + "loss": 5.5946, + "step": 27119 + }, + { + "epoch": 0.16129032258064516, + "grad_norm": 1.7576172351837158, + "learning_rate": 4.685889208363573e-05, + "loss": 5.2869, + "step": 27120 + }, + { + "epoch": 0.16129626986392614, + "grad_norm": 1.7422624826431274, + "learning_rate": 4.685866540361456e-05, + "loss": 5.5119, + "step": 27121 + }, + { + "epoch": 0.16130221714720716, + "grad_norm": 1.8503597974777222, + "learning_rate": 4.685843871596274e-05, + "loss": 5.2748, + "step": 27122 + }, + { + "epoch": 0.16130816443048815, + "grad_norm": 1.4682457447052002, + "learning_rate": 4.685821202068037e-05, + "loss": 5.1808, + "step": 27123 + }, + { + "epoch": 0.16131411171376914, + "grad_norm": 1.6852977275848389, + "learning_rate": 4.685798531776752e-05, + "loss": 5.0024, + "step": 27124 + }, + { + "epoch": 0.16132005899705015, + "grad_norm": 1.3914788961410522, + "learning_rate": 4.6857758607224275e-05, + "loss": 5.6072, + "step": 27125 + }, + { + "epoch": 0.16132600628033114, + "grad_norm": 1.3304249048233032, + "learning_rate": 4.6857531889050716e-05, + "loss": 5.6519, + "step": 27126 + }, + { + "epoch": 0.16133195356361213, + "grad_norm": 1.4981189966201782, + "learning_rate": 4.6857305163246915e-05, + "loss": 5.377, + "step": 27127 + }, + { + "epoch": 0.16133790084689315, + "grad_norm": 1.6323606967926025, + "learning_rate": 4.685707842981295e-05, + "loss": 5.3525, + "step": 27128 + }, + { + "epoch": 0.16134384813017413, + "grad_norm": 1.7571280002593994, + "learning_rate": 4.685685168874892e-05, + "loss": 5.7243, + "step": 27129 + }, + { + "epoch": 0.16134979541345512, + "grad_norm": 1.4080052375793457, + "learning_rate": 4.685662494005487e-05, + "loss": 5.368, + "step": 27130 + }, + { + "epoch": 0.16135574269673614, + "grad_norm": 1.3173414468765259, + "learning_rate": 4.685639818373091e-05, + "loss": 5.6447, + "step": 27131 + }, + { + "epoch": 0.16136168998001713, + "grad_norm": 1.6236382722854614, + "learning_rate": 4.685617141977711e-05, + "loss": 5.4868, + "step": 27132 + }, + { + "epoch": 0.1613676372632981, + "grad_norm": 1.4955110549926758, + "learning_rate": 4.6855944648193535e-05, + "loss": 5.6484, + "step": 27133 + }, + { + "epoch": 0.16137358454657913, + "grad_norm": 1.408130407333374, + "learning_rate": 4.685571786898028e-05, + "loss": 5.4925, + "step": 27134 + }, + { + "epoch": 0.16137953182986012, + "grad_norm": 1.2188119888305664, + "learning_rate": 4.685549108213742e-05, + "loss": 5.459, + "step": 27135 + }, + { + "epoch": 0.1613854791131411, + "grad_norm": 1.5991405248641968, + "learning_rate": 4.685526428766503e-05, + "loss": 5.3962, + "step": 27136 + }, + { + "epoch": 0.16139142639642212, + "grad_norm": 1.3470097780227661, + "learning_rate": 4.68550374855632e-05, + "loss": 5.4446, + "step": 27137 + }, + { + "epoch": 0.1613973736797031, + "grad_norm": 1.439078450202942, + "learning_rate": 4.685481067583201e-05, + "loss": 5.5934, + "step": 27138 + }, + { + "epoch": 0.1614033209629841, + "grad_norm": 1.5195162296295166, + "learning_rate": 4.6854583858471514e-05, + "loss": 5.5948, + "step": 27139 + }, + { + "epoch": 0.16140926824626511, + "grad_norm": 1.3565785884857178, + "learning_rate": 4.6854357033481815e-05, + "loss": 5.4467, + "step": 27140 + }, + { + "epoch": 0.1614152155295461, + "grad_norm": 1.3213258981704712, + "learning_rate": 4.685413020086299e-05, + "loss": 4.7896, + "step": 27141 + }, + { + "epoch": 0.1614211628128271, + "grad_norm": 1.6580665111541748, + "learning_rate": 4.6853903360615106e-05, + "loss": 4.9581, + "step": 27142 + }, + { + "epoch": 0.1614271100961081, + "grad_norm": 1.5277694463729858, + "learning_rate": 4.685367651273825e-05, + "loss": 5.3508, + "step": 27143 + }, + { + "epoch": 0.1614330573793891, + "grad_norm": 1.6369842290878296, + "learning_rate": 4.685344965723251e-05, + "loss": 5.2761, + "step": 27144 + }, + { + "epoch": 0.16143900466267008, + "grad_norm": 1.6954752206802368, + "learning_rate": 4.685322279409795e-05, + "loss": 5.1258, + "step": 27145 + }, + { + "epoch": 0.1614449519459511, + "grad_norm": 1.5073758363723755, + "learning_rate": 4.6852995923334664e-05, + "loss": 5.2927, + "step": 27146 + }, + { + "epoch": 0.1614508992292321, + "grad_norm": 1.5817281007766724, + "learning_rate": 4.685276904494271e-05, + "loss": 5.208, + "step": 27147 + }, + { + "epoch": 0.16145684651251307, + "grad_norm": 1.4444465637207031, + "learning_rate": 4.685254215892219e-05, + "loss": 5.0568, + "step": 27148 + }, + { + "epoch": 0.1614627937957941, + "grad_norm": 1.6507529020309448, + "learning_rate": 4.6852315265273175e-05, + "loss": 4.4881, + "step": 27149 + }, + { + "epoch": 0.16146874107907508, + "grad_norm": 1.3630253076553345, + "learning_rate": 4.685208836399573e-05, + "loss": 4.4938, + "step": 27150 + }, + { + "epoch": 0.16147468836235607, + "grad_norm": 1.5907013416290283, + "learning_rate": 4.685186145508996e-05, + "loss": 4.6613, + "step": 27151 + }, + { + "epoch": 0.16148063564563708, + "grad_norm": 1.4582465887069702, + "learning_rate": 4.6851634538555925e-05, + "loss": 4.8144, + "step": 27152 + }, + { + "epoch": 0.16148658292891807, + "grad_norm": 1.5481383800506592, + "learning_rate": 4.685140761439371e-05, + "loss": 5.2502, + "step": 27153 + }, + { + "epoch": 0.16149253021219906, + "grad_norm": 1.523826003074646, + "learning_rate": 4.685118068260339e-05, + "loss": 5.6317, + "step": 27154 + }, + { + "epoch": 0.16149847749548008, + "grad_norm": 1.502137541770935, + "learning_rate": 4.6850953743185055e-05, + "loss": 5.12, + "step": 27155 + }, + { + "epoch": 0.16150442477876106, + "grad_norm": 1.5802767276763916, + "learning_rate": 4.6850726796138776e-05, + "loss": 4.8374, + "step": 27156 + }, + { + "epoch": 0.16151037206204205, + "grad_norm": 1.6513301134109497, + "learning_rate": 4.685049984146463e-05, + "loss": 5.0668, + "step": 27157 + }, + { + "epoch": 0.16151631934532307, + "grad_norm": 1.5628081560134888, + "learning_rate": 4.6850272879162714e-05, + "loss": 4.7497, + "step": 27158 + }, + { + "epoch": 0.16152226662860406, + "grad_norm": 1.4100914001464844, + "learning_rate": 4.685004590923308e-05, + "loss": 5.606, + "step": 27159 + }, + { + "epoch": 0.16152821391188504, + "grad_norm": 1.3457648754119873, + "learning_rate": 4.684981893167583e-05, + "loss": 5.5325, + "step": 27160 + }, + { + "epoch": 0.16153416119516606, + "grad_norm": 1.6010215282440186, + "learning_rate": 4.684959194649102e-05, + "loss": 5.5653, + "step": 27161 + }, + { + "epoch": 0.16154010847844705, + "grad_norm": 1.8687788248062134, + "learning_rate": 4.684936495367875e-05, + "loss": 5.2795, + "step": 27162 + }, + { + "epoch": 0.16154605576172804, + "grad_norm": 2.1888749599456787, + "learning_rate": 4.68491379532391e-05, + "loss": 5.0313, + "step": 27163 + }, + { + "epoch": 0.16155200304500905, + "grad_norm": 1.466637372970581, + "learning_rate": 4.684891094517214e-05, + "loss": 5.221, + "step": 27164 + }, + { + "epoch": 0.16155795032829004, + "grad_norm": 1.518754482269287, + "learning_rate": 4.684868392947794e-05, + "loss": 5.037, + "step": 27165 + }, + { + "epoch": 0.16156389761157103, + "grad_norm": 1.550714373588562, + "learning_rate": 4.68484569061566e-05, + "loss": 4.8986, + "step": 27166 + }, + { + "epoch": 0.16156984489485204, + "grad_norm": 1.5226268768310547, + "learning_rate": 4.6848229875208186e-05, + "loss": 5.2425, + "step": 27167 + }, + { + "epoch": 0.16157579217813303, + "grad_norm": 1.4854047298431396, + "learning_rate": 4.684800283663279e-05, + "loss": 5.0766, + "step": 27168 + }, + { + "epoch": 0.16158173946141402, + "grad_norm": 1.6625477075576782, + "learning_rate": 4.684777579043047e-05, + "loss": 5.1967, + "step": 27169 + }, + { + "epoch": 0.16158768674469504, + "grad_norm": 1.409916877746582, + "learning_rate": 4.684754873660132e-05, + "loss": 5.0735, + "step": 27170 + }, + { + "epoch": 0.16159363402797602, + "grad_norm": 1.4444838762283325, + "learning_rate": 4.684732167514542e-05, + "loss": 5.013, + "step": 27171 + }, + { + "epoch": 0.161599581311257, + "grad_norm": 1.5226528644561768, + "learning_rate": 4.684709460606284e-05, + "loss": 4.9328, + "step": 27172 + }, + { + "epoch": 0.16160552859453803, + "grad_norm": 1.7353004217147827, + "learning_rate": 4.6846867529353664e-05, + "loss": 4.9422, + "step": 27173 + }, + { + "epoch": 0.16161147587781902, + "grad_norm": 1.613166093826294, + "learning_rate": 4.6846640445017974e-05, + "loss": 5.0545, + "step": 27174 + }, + { + "epoch": 0.1616174231611, + "grad_norm": 1.7949568033218384, + "learning_rate": 4.684641335305585e-05, + "loss": 4.944, + "step": 27175 + }, + { + "epoch": 0.16162337044438102, + "grad_norm": 1.508563756942749, + "learning_rate": 4.684618625346737e-05, + "loss": 5.2551, + "step": 27176 + }, + { + "epoch": 0.161629317727662, + "grad_norm": 1.6090425252914429, + "learning_rate": 4.6845959146252605e-05, + "loss": 5.0839, + "step": 27177 + }, + { + "epoch": 0.161635265010943, + "grad_norm": 1.6595830917358398, + "learning_rate": 4.6845732031411646e-05, + "loss": 5.2307, + "step": 27178 + }, + { + "epoch": 0.16164121229422398, + "grad_norm": 1.787662386894226, + "learning_rate": 4.684550490894457e-05, + "loss": 5.3956, + "step": 27179 + }, + { + "epoch": 0.161647159577505, + "grad_norm": 1.5315039157867432, + "learning_rate": 4.684527777885145e-05, + "loss": 5.8196, + "step": 27180 + }, + { + "epoch": 0.161653106860786, + "grad_norm": 2.004093647003174, + "learning_rate": 4.684505064113238e-05, + "loss": 4.9922, + "step": 27181 + }, + { + "epoch": 0.16165905414406698, + "grad_norm": 1.655718445777893, + "learning_rate": 4.684482349578742e-05, + "loss": 5.0178, + "step": 27182 + }, + { + "epoch": 0.161665001427348, + "grad_norm": 1.721838116645813, + "learning_rate": 4.6844596342816654e-05, + "loss": 4.8412, + "step": 27183 + }, + { + "epoch": 0.16167094871062898, + "grad_norm": 1.6883397102355957, + "learning_rate": 4.684436918222017e-05, + "loss": 4.602, + "step": 27184 + }, + { + "epoch": 0.16167689599390997, + "grad_norm": 1.4376475811004639, + "learning_rate": 4.6844142013998035e-05, + "loss": 4.7408, + "step": 27185 + }, + { + "epoch": 0.16168284327719099, + "grad_norm": 1.5542229413986206, + "learning_rate": 4.684391483815035e-05, + "loss": 5.384, + "step": 27186 + }, + { + "epoch": 0.16168879056047197, + "grad_norm": 1.4321660995483398, + "learning_rate": 4.6843687654677163e-05, + "loss": 5.3393, + "step": 27187 + }, + { + "epoch": 0.16169473784375296, + "grad_norm": 1.7398759126663208, + "learning_rate": 4.684346046357858e-05, + "loss": 5.2492, + "step": 27188 + }, + { + "epoch": 0.16170068512703398, + "grad_norm": 1.802462100982666, + "learning_rate": 4.684323326485467e-05, + "loss": 5.8437, + "step": 27189 + }, + { + "epoch": 0.16170663241031497, + "grad_norm": 1.5931847095489502, + "learning_rate": 4.684300605850551e-05, + "loss": 5.6417, + "step": 27190 + }, + { + "epoch": 0.16171257969359595, + "grad_norm": 1.6900547742843628, + "learning_rate": 4.684277884453119e-05, + "loss": 4.4741, + "step": 27191 + }, + { + "epoch": 0.16171852697687697, + "grad_norm": 1.5422314405441284, + "learning_rate": 4.684255162293178e-05, + "loss": 4.5219, + "step": 27192 + }, + { + "epoch": 0.16172447426015796, + "grad_norm": 1.816253662109375, + "learning_rate": 4.6842324393707354e-05, + "loss": 4.5676, + "step": 27193 + }, + { + "epoch": 0.16173042154343895, + "grad_norm": 1.3935781717300415, + "learning_rate": 4.6842097156858e-05, + "loss": 5.5091, + "step": 27194 + }, + { + "epoch": 0.16173636882671996, + "grad_norm": 1.7103323936462402, + "learning_rate": 4.6841869912383794e-05, + "loss": 5.3831, + "step": 27195 + }, + { + "epoch": 0.16174231611000095, + "grad_norm": 1.4029678106307983, + "learning_rate": 4.6841642660284826e-05, + "loss": 5.2129, + "step": 27196 + }, + { + "epoch": 0.16174826339328194, + "grad_norm": 1.7814414501190186, + "learning_rate": 4.684141540056116e-05, + "loss": 5.3053, + "step": 27197 + }, + { + "epoch": 0.16175421067656295, + "grad_norm": 1.5314795970916748, + "learning_rate": 4.684118813321288e-05, + "loss": 5.3863, + "step": 27198 + }, + { + "epoch": 0.16176015795984394, + "grad_norm": 1.5359210968017578, + "learning_rate": 4.6840960858240065e-05, + "loss": 5.1326, + "step": 27199 + }, + { + "epoch": 0.16176610524312493, + "grad_norm": 1.5624679327011108, + "learning_rate": 4.68407335756428e-05, + "loss": 4.8275, + "step": 27200 + }, + { + "epoch": 0.16177205252640595, + "grad_norm": 1.4580225944519043, + "learning_rate": 4.6840506285421165e-05, + "loss": 4.8576, + "step": 27201 + }, + { + "epoch": 0.16177799980968693, + "grad_norm": 1.687788724899292, + "learning_rate": 4.684027898757523e-05, + "loss": 4.8731, + "step": 27202 + }, + { + "epoch": 0.16178394709296792, + "grad_norm": 1.882171869277954, + "learning_rate": 4.684005168210508e-05, + "loss": 4.8912, + "step": 27203 + }, + { + "epoch": 0.16178989437624894, + "grad_norm": 1.513374924659729, + "learning_rate": 4.6839824369010795e-05, + "loss": 5.2447, + "step": 27204 + }, + { + "epoch": 0.16179584165952993, + "grad_norm": 1.7321797609329224, + "learning_rate": 4.683959704829245e-05, + "loss": 5.0003, + "step": 27205 + }, + { + "epoch": 0.16180178894281091, + "grad_norm": 1.677239179611206, + "learning_rate": 4.683936971995013e-05, + "loss": 5.4732, + "step": 27206 + }, + { + "epoch": 0.16180773622609193, + "grad_norm": 1.615957498550415, + "learning_rate": 4.6839142383983906e-05, + "loss": 5.4448, + "step": 27207 + }, + { + "epoch": 0.16181368350937292, + "grad_norm": 1.4981861114501953, + "learning_rate": 4.6838915040393874e-05, + "loss": 5.4369, + "step": 27208 + }, + { + "epoch": 0.1618196307926539, + "grad_norm": 1.5658632516860962, + "learning_rate": 4.683868768918009e-05, + "loss": 5.474, + "step": 27209 + }, + { + "epoch": 0.16182557807593492, + "grad_norm": 1.469122052192688, + "learning_rate": 4.6838460330342646e-05, + "loss": 5.3001, + "step": 27210 + }, + { + "epoch": 0.1618315253592159, + "grad_norm": 1.5378628969192505, + "learning_rate": 4.683823296388163e-05, + "loss": 4.8535, + "step": 27211 + }, + { + "epoch": 0.1618374726424969, + "grad_norm": 1.6330792903900146, + "learning_rate": 4.6838005589797106e-05, + "loss": 4.812, + "step": 27212 + }, + { + "epoch": 0.16184341992577791, + "grad_norm": 1.89853036403656, + "learning_rate": 4.683777820808917e-05, + "loss": 5.1666, + "step": 27213 + }, + { + "epoch": 0.1618493672090589, + "grad_norm": 1.5161629915237427, + "learning_rate": 4.683755081875788e-05, + "loss": 5.1444, + "step": 27214 + }, + { + "epoch": 0.1618553144923399, + "grad_norm": 1.6083909273147583, + "learning_rate": 4.683732342180333e-05, + "loss": 5.1403, + "step": 27215 + }, + { + "epoch": 0.1618612617756209, + "grad_norm": 1.5731655359268188, + "learning_rate": 4.68370960172256e-05, + "loss": 5.1038, + "step": 27216 + }, + { + "epoch": 0.1618672090589019, + "grad_norm": 1.8221924304962158, + "learning_rate": 4.6836868605024756e-05, + "loss": 4.8889, + "step": 27217 + }, + { + "epoch": 0.16187315634218288, + "grad_norm": 1.7264484167099, + "learning_rate": 4.683664118520089e-05, + "loss": 5.2846, + "step": 27218 + }, + { + "epoch": 0.1618791036254639, + "grad_norm": 1.6429424285888672, + "learning_rate": 4.683641375775409e-05, + "loss": 5.1433, + "step": 27219 + }, + { + "epoch": 0.1618850509087449, + "grad_norm": 1.6444041728973389, + "learning_rate": 4.683618632268441e-05, + "loss": 5.7116, + "step": 27220 + }, + { + "epoch": 0.16189099819202588, + "grad_norm": 1.595996379852295, + "learning_rate": 4.683595887999195e-05, + "loss": 5.4419, + "step": 27221 + }, + { + "epoch": 0.1618969454753069, + "grad_norm": 1.489001989364624, + "learning_rate": 4.6835731429676776e-05, + "loss": 5.2004, + "step": 27222 + }, + { + "epoch": 0.16190289275858788, + "grad_norm": 1.6208230257034302, + "learning_rate": 4.683550397173898e-05, + "loss": 5.2405, + "step": 27223 + }, + { + "epoch": 0.16190884004186887, + "grad_norm": 1.7584507465362549, + "learning_rate": 4.683527650617863e-05, + "loss": 4.5921, + "step": 27224 + }, + { + "epoch": 0.16191478732514988, + "grad_norm": 1.8459594249725342, + "learning_rate": 4.683504903299581e-05, + "loss": 4.6269, + "step": 27225 + }, + { + "epoch": 0.16192073460843087, + "grad_norm": 2.055671453475952, + "learning_rate": 4.683482155219061e-05, + "loss": 4.8219, + "step": 27226 + }, + { + "epoch": 0.16192668189171186, + "grad_norm": 1.8772468566894531, + "learning_rate": 4.683459406376309e-05, + "loss": 4.9343, + "step": 27227 + }, + { + "epoch": 0.16193262917499288, + "grad_norm": 1.8033567667007446, + "learning_rate": 4.683436656771334e-05, + "loss": 4.5181, + "step": 27228 + }, + { + "epoch": 0.16193857645827386, + "grad_norm": 1.8112131357192993, + "learning_rate": 4.6834139064041436e-05, + "loss": 4.6479, + "step": 27229 + }, + { + "epoch": 0.16194452374155485, + "grad_norm": 1.958721399307251, + "learning_rate": 4.6833911552747466e-05, + "loss": 4.3747, + "step": 27230 + }, + { + "epoch": 0.16195047102483587, + "grad_norm": 1.9740078449249268, + "learning_rate": 4.683368403383151e-05, + "loss": 4.5357, + "step": 27231 + }, + { + "epoch": 0.16195641830811686, + "grad_norm": 1.8071064949035645, + "learning_rate": 4.683345650729362e-05, + "loss": 4.2025, + "step": 27232 + }, + { + "epoch": 0.16196236559139784, + "grad_norm": 2.067153215408325, + "learning_rate": 4.6833228973133914e-05, + "loss": 4.7599, + "step": 27233 + }, + { + "epoch": 0.16196831287467886, + "grad_norm": 2.219170570373535, + "learning_rate": 4.683300143135244e-05, + "loss": 4.8643, + "step": 27234 + }, + { + "epoch": 0.16197426015795985, + "grad_norm": 1.8077818155288696, + "learning_rate": 4.68327738819493e-05, + "loss": 4.9781, + "step": 27235 + }, + { + "epoch": 0.16198020744124084, + "grad_norm": 2.1170096397399902, + "learning_rate": 4.683254632492456e-05, + "loss": 4.5507, + "step": 27236 + }, + { + "epoch": 0.16198615472452182, + "grad_norm": 1.9441372156143188, + "learning_rate": 4.6832318760278306e-05, + "loss": 4.2419, + "step": 27237 + }, + { + "epoch": 0.16199210200780284, + "grad_norm": 2.261038064956665, + "learning_rate": 4.6832091188010615e-05, + "loss": 4.8287, + "step": 27238 + }, + { + "epoch": 0.16199804929108383, + "grad_norm": 1.906253457069397, + "learning_rate": 4.6831863608121565e-05, + "loss": 4.7154, + "step": 27239 + }, + { + "epoch": 0.16200399657436482, + "grad_norm": 1.7181471586227417, + "learning_rate": 4.683163602061124e-05, + "loss": 4.8286, + "step": 27240 + }, + { + "epoch": 0.16200994385764583, + "grad_norm": 1.6163973808288574, + "learning_rate": 4.683140842547971e-05, + "loss": 5.1988, + "step": 27241 + }, + { + "epoch": 0.16201589114092682, + "grad_norm": 1.8723608255386353, + "learning_rate": 4.6831180822727064e-05, + "loss": 4.6135, + "step": 27242 + }, + { + "epoch": 0.1620218384242078, + "grad_norm": 1.557589054107666, + "learning_rate": 4.683095321235338e-05, + "loss": 4.7632, + "step": 27243 + }, + { + "epoch": 0.16202778570748883, + "grad_norm": 1.3284127712249756, + "learning_rate": 4.683072559435873e-05, + "loss": 4.0683, + "step": 27244 + }, + { + "epoch": 0.1620337329907698, + "grad_norm": 1.5295307636260986, + "learning_rate": 4.68304979687432e-05, + "loss": 4.2219, + "step": 27245 + }, + { + "epoch": 0.1620396802740508, + "grad_norm": 2.0153698921203613, + "learning_rate": 4.683027033550687e-05, + "loss": 4.8334, + "step": 27246 + }, + { + "epoch": 0.16204562755733182, + "grad_norm": 1.3090236186981201, + "learning_rate": 4.683004269464983e-05, + "loss": 5.1588, + "step": 27247 + }, + { + "epoch": 0.1620515748406128, + "grad_norm": 1.4936387538909912, + "learning_rate": 4.6829815046172136e-05, + "loss": 5.2226, + "step": 27248 + }, + { + "epoch": 0.1620575221238938, + "grad_norm": 1.6028317213058472, + "learning_rate": 4.682958739007388e-05, + "loss": 5.0174, + "step": 27249 + }, + { + "epoch": 0.1620634694071748, + "grad_norm": 1.221101999282837, + "learning_rate": 4.6829359726355144e-05, + "loss": 5.3307, + "step": 27250 + }, + { + "epoch": 0.1620694166904558, + "grad_norm": 1.348512887954712, + "learning_rate": 4.6829132055016e-05, + "loss": 5.4754, + "step": 27251 + }, + { + "epoch": 0.16207536397373679, + "grad_norm": 1.506373643875122, + "learning_rate": 4.682890437605654e-05, + "loss": 5.0422, + "step": 27252 + }, + { + "epoch": 0.1620813112570178, + "grad_norm": 1.7753325700759888, + "learning_rate": 4.6828676689476825e-05, + "loss": 5.0218, + "step": 27253 + }, + { + "epoch": 0.1620872585402988, + "grad_norm": 1.5221372842788696, + "learning_rate": 4.6828448995276945e-05, + "loss": 5.1423, + "step": 27254 + }, + { + "epoch": 0.16209320582357978, + "grad_norm": 1.7772079706192017, + "learning_rate": 4.682822129345699e-05, + "loss": 4.8782, + "step": 27255 + }, + { + "epoch": 0.1620991531068608, + "grad_norm": 1.495651125907898, + "learning_rate": 4.6827993584017014e-05, + "loss": 5.2042, + "step": 27256 + }, + { + "epoch": 0.16210510039014178, + "grad_norm": 1.5901660919189453, + "learning_rate": 4.682776586695712e-05, + "loss": 5.5121, + "step": 27257 + }, + { + "epoch": 0.16211104767342277, + "grad_norm": 1.7442855834960938, + "learning_rate": 4.6827538142277373e-05, + "loss": 4.9278, + "step": 27258 + }, + { + "epoch": 0.16211699495670379, + "grad_norm": 2.777273416519165, + "learning_rate": 4.682731040997786e-05, + "loss": 4.6258, + "step": 27259 + }, + { + "epoch": 0.16212294223998477, + "grad_norm": 1.8470478057861328, + "learning_rate": 4.6827082670058655e-05, + "loss": 4.87, + "step": 27260 + }, + { + "epoch": 0.16212888952326576, + "grad_norm": 1.545902132987976, + "learning_rate": 4.6826854922519844e-05, + "loss": 4.8776, + "step": 27261 + }, + { + "epoch": 0.16213483680654678, + "grad_norm": 1.5720170736312866, + "learning_rate": 4.682662716736151e-05, + "loss": 4.9046, + "step": 27262 + }, + { + "epoch": 0.16214078408982777, + "grad_norm": 1.6243836879730225, + "learning_rate": 4.682639940458372e-05, + "loss": 5.0243, + "step": 27263 + }, + { + "epoch": 0.16214673137310875, + "grad_norm": 2.738065719604492, + "learning_rate": 4.682617163418656e-05, + "loss": 4.1899, + "step": 27264 + }, + { + "epoch": 0.16215267865638977, + "grad_norm": 4.745233058929443, + "learning_rate": 4.682594385617011e-05, + "loss": 3.0819, + "step": 27265 + }, + { + "epoch": 0.16215862593967076, + "grad_norm": 4.1978936195373535, + "learning_rate": 4.6825716070534444e-05, + "loss": 3.1755, + "step": 27266 + }, + { + "epoch": 0.16216457322295175, + "grad_norm": 2.8367183208465576, + "learning_rate": 4.682548827727965e-05, + "loss": 3.53, + "step": 27267 + }, + { + "epoch": 0.16217052050623276, + "grad_norm": 1.7866027355194092, + "learning_rate": 4.6825260476405805e-05, + "loss": 4.6173, + "step": 27268 + }, + { + "epoch": 0.16217646778951375, + "grad_norm": 1.7661093473434448, + "learning_rate": 4.6825032667912984e-05, + "loss": 5.0541, + "step": 27269 + }, + { + "epoch": 0.16218241507279474, + "grad_norm": 1.9146814346313477, + "learning_rate": 4.682480485180127e-05, + "loss": 4.9121, + "step": 27270 + }, + { + "epoch": 0.16218836235607575, + "grad_norm": 2.8185949325561523, + "learning_rate": 4.682457702807075e-05, + "loss": 2.9822, + "step": 27271 + }, + { + "epoch": 0.16219430963935674, + "grad_norm": 3.360478162765503, + "learning_rate": 4.682434919672148e-05, + "loss": 2.2526, + "step": 27272 + }, + { + "epoch": 0.16220025692263773, + "grad_norm": 3.5563254356384277, + "learning_rate": 4.682412135775357e-05, + "loss": 3.3203, + "step": 27273 + }, + { + "epoch": 0.16220620420591875, + "grad_norm": 2.84264874458313, + "learning_rate": 4.682389351116707e-05, + "loss": 3.1093, + "step": 27274 + }, + { + "epoch": 0.16221215148919974, + "grad_norm": 2.6400508880615234, + "learning_rate": 4.682366565696208e-05, + "loss": 4.1745, + "step": 27275 + }, + { + "epoch": 0.16221809877248072, + "grad_norm": 2.5986385345458984, + "learning_rate": 4.682343779513868e-05, + "loss": 5.5863, + "step": 27276 + }, + { + "epoch": 0.16222404605576174, + "grad_norm": 2.3456249237060547, + "learning_rate": 4.6823209925696945e-05, + "loss": 4.3825, + "step": 27277 + }, + { + "epoch": 0.16222999333904273, + "grad_norm": 1.909117341041565, + "learning_rate": 4.682298204863694e-05, + "loss": 4.9451, + "step": 27278 + }, + { + "epoch": 0.16223594062232372, + "grad_norm": 1.6204262971878052, + "learning_rate": 4.682275416395877e-05, + "loss": 5.0483, + "step": 27279 + }, + { + "epoch": 0.16224188790560473, + "grad_norm": 1.5689494609832764, + "learning_rate": 4.68225262716625e-05, + "loss": 5.0821, + "step": 27280 + }, + { + "epoch": 0.16224783518888572, + "grad_norm": 1.553642749786377, + "learning_rate": 4.682229837174821e-05, + "loss": 5.3247, + "step": 27281 + }, + { + "epoch": 0.1622537824721667, + "grad_norm": 2.1375479698181152, + "learning_rate": 4.682207046421597e-05, + "loss": 4.4596, + "step": 27282 + }, + { + "epoch": 0.16225972975544772, + "grad_norm": 2.6894989013671875, + "learning_rate": 4.682184254906589e-05, + "loss": 4.1466, + "step": 27283 + }, + { + "epoch": 0.1622656770387287, + "grad_norm": 2.0883328914642334, + "learning_rate": 4.6821614626298015e-05, + "loss": 4.1182, + "step": 27284 + }, + { + "epoch": 0.1622716243220097, + "grad_norm": 2.263207197189331, + "learning_rate": 4.6821386695912444e-05, + "loss": 4.1029, + "step": 27285 + }, + { + "epoch": 0.16227757160529072, + "grad_norm": 2.2623839378356934, + "learning_rate": 4.6821158757909255e-05, + "loss": 4.0745, + "step": 27286 + }, + { + "epoch": 0.1622835188885717, + "grad_norm": 1.7428866624832153, + "learning_rate": 4.682093081228852e-05, + "loss": 4.7707, + "step": 27287 + }, + { + "epoch": 0.1622894661718527, + "grad_norm": 2.0418710708618164, + "learning_rate": 4.682070285905033e-05, + "loss": 4.5464, + "step": 27288 + }, + { + "epoch": 0.1622954134551337, + "grad_norm": 2.421755313873291, + "learning_rate": 4.682047489819475e-05, + "loss": 3.9835, + "step": 27289 + }, + { + "epoch": 0.1623013607384147, + "grad_norm": 2.3179736137390137, + "learning_rate": 4.682024692972188e-05, + "loss": 3.8936, + "step": 27290 + }, + { + "epoch": 0.16230730802169568, + "grad_norm": 2.144463300704956, + "learning_rate": 4.682001895363177e-05, + "loss": 4.123, + "step": 27291 + }, + { + "epoch": 0.1623132553049767, + "grad_norm": 1.8054444789886475, + "learning_rate": 4.681979096992454e-05, + "loss": 4.5947, + "step": 27292 + }, + { + "epoch": 0.1623192025882577, + "grad_norm": 1.9559820890426636, + "learning_rate": 4.681956297860023e-05, + "loss": 4.1805, + "step": 27293 + }, + { + "epoch": 0.16232514987153868, + "grad_norm": 2.253756284713745, + "learning_rate": 4.6819334979658934e-05, + "loss": 3.7279, + "step": 27294 + }, + { + "epoch": 0.16233109715481966, + "grad_norm": 2.1193337440490723, + "learning_rate": 4.681910697310074e-05, + "loss": 3.646, + "step": 27295 + }, + { + "epoch": 0.16233704443810068, + "grad_norm": 2.2527666091918945, + "learning_rate": 4.681887895892572e-05, + "loss": 4.0891, + "step": 27296 + }, + { + "epoch": 0.16234299172138167, + "grad_norm": 2.255565643310547, + "learning_rate": 4.681865093713396e-05, + "loss": 3.8497, + "step": 27297 + }, + { + "epoch": 0.16234893900466266, + "grad_norm": 2.3153398036956787, + "learning_rate": 4.681842290772552e-05, + "loss": 3.787, + "step": 27298 + }, + { + "epoch": 0.16235488628794367, + "grad_norm": 2.7600228786468506, + "learning_rate": 4.681819487070051e-05, + "loss": 4.0376, + "step": 27299 + }, + { + "epoch": 0.16236083357122466, + "grad_norm": 1.8102682828903198, + "learning_rate": 4.681796682605898e-05, + "loss": 4.1901, + "step": 27300 + }, + { + "epoch": 0.16236678085450565, + "grad_norm": 2.125884771347046, + "learning_rate": 4.6817738773801035e-05, + "loss": 4.4809, + "step": 27301 + }, + { + "epoch": 0.16237272813778666, + "grad_norm": 2.308034658432007, + "learning_rate": 4.681751071392674e-05, + "loss": 3.7836, + "step": 27302 + }, + { + "epoch": 0.16237867542106765, + "grad_norm": 2.2197370529174805, + "learning_rate": 4.6817282646436166e-05, + "loss": 3.7033, + "step": 27303 + }, + { + "epoch": 0.16238462270434864, + "grad_norm": 1.7763569355010986, + "learning_rate": 4.681705457132942e-05, + "loss": 4.7483, + "step": 27304 + }, + { + "epoch": 0.16239056998762966, + "grad_norm": 2.2781457901000977, + "learning_rate": 4.681682648860656e-05, + "loss": 3.5617, + "step": 27305 + }, + { + "epoch": 0.16239651727091065, + "grad_norm": 2.257497787475586, + "learning_rate": 4.6816598398267664e-05, + "loss": 3.7756, + "step": 27306 + }, + { + "epoch": 0.16240246455419163, + "grad_norm": 2.277405261993408, + "learning_rate": 4.681637030031283e-05, + "loss": 3.6759, + "step": 27307 + }, + { + "epoch": 0.16240841183747265, + "grad_norm": 2.160238265991211, + "learning_rate": 4.681614219474212e-05, + "loss": 3.568, + "step": 27308 + }, + { + "epoch": 0.16241435912075364, + "grad_norm": 2.0068106651306152, + "learning_rate": 4.6815914081555624e-05, + "loss": 3.7039, + "step": 27309 + }, + { + "epoch": 0.16242030640403463, + "grad_norm": 3.0893945693969727, + "learning_rate": 4.681568596075341e-05, + "loss": 3.8708, + "step": 27310 + }, + { + "epoch": 0.16242625368731564, + "grad_norm": 2.5544440746307373, + "learning_rate": 4.681545783233557e-05, + "loss": 3.9529, + "step": 27311 + }, + { + "epoch": 0.16243220097059663, + "grad_norm": 1.7706321477890015, + "learning_rate": 4.681522969630218e-05, + "loss": 6.004, + "step": 27312 + }, + { + "epoch": 0.16243814825387762, + "grad_norm": 2.0155160427093506, + "learning_rate": 4.681500155265332e-05, + "loss": 4.1088, + "step": 27313 + }, + { + "epoch": 0.16244409553715863, + "grad_norm": 2.436854124069214, + "learning_rate": 4.681477340138907e-05, + "loss": 3.7281, + "step": 27314 + }, + { + "epoch": 0.16245004282043962, + "grad_norm": 1.7717199325561523, + "learning_rate": 4.68145452425095e-05, + "loss": 4.7058, + "step": 27315 + }, + { + "epoch": 0.1624559901037206, + "grad_norm": 1.8537521362304688, + "learning_rate": 4.6814317076014705e-05, + "loss": 5.5633, + "step": 27316 + }, + { + "epoch": 0.16246193738700163, + "grad_norm": 1.4485749006271362, + "learning_rate": 4.681408890190475e-05, + "loss": 6.1646, + "step": 27317 + }, + { + "epoch": 0.16246788467028261, + "grad_norm": 1.7619411945343018, + "learning_rate": 4.681386072017973e-05, + "loss": 4.9872, + "step": 27318 + }, + { + "epoch": 0.1624738319535636, + "grad_norm": 1.3868266344070435, + "learning_rate": 4.681363253083971e-05, + "loss": 5.337, + "step": 27319 + }, + { + "epoch": 0.16247977923684462, + "grad_norm": 2.339705467224121, + "learning_rate": 4.681340433388478e-05, + "loss": 4.1131, + "step": 27320 + }, + { + "epoch": 0.1624857265201256, + "grad_norm": 2.4623711109161377, + "learning_rate": 4.681317612931502e-05, + "loss": 4.0167, + "step": 27321 + }, + { + "epoch": 0.1624916738034066, + "grad_norm": 2.06557559967041, + "learning_rate": 4.68129479171305e-05, + "loss": 4.4482, + "step": 27322 + }, + { + "epoch": 0.1624976210866876, + "grad_norm": 1.9864208698272705, + "learning_rate": 4.681271969733131e-05, + "loss": 4.5421, + "step": 27323 + }, + { + "epoch": 0.1625035683699686, + "grad_norm": 2.29591703414917, + "learning_rate": 4.6812491469917516e-05, + "loss": 4.4407, + "step": 27324 + }, + { + "epoch": 0.1625095156532496, + "grad_norm": 1.9640796184539795, + "learning_rate": 4.681226323488921e-05, + "loss": 4.3113, + "step": 27325 + }, + { + "epoch": 0.1625154629365306, + "grad_norm": 1.6823822259902954, + "learning_rate": 4.6812034992246464e-05, + "loss": 5.3048, + "step": 27326 + }, + { + "epoch": 0.1625214102198116, + "grad_norm": 1.7765403985977173, + "learning_rate": 4.681180674198937e-05, + "loss": 4.7484, + "step": 27327 + }, + { + "epoch": 0.16252735750309258, + "grad_norm": 2.8496274948120117, + "learning_rate": 4.6811578484118e-05, + "loss": 2.9788, + "step": 27328 + }, + { + "epoch": 0.1625333047863736, + "grad_norm": 2.600203514099121, + "learning_rate": 4.681135021863243e-05, + "loss": 3.6706, + "step": 27329 + }, + { + "epoch": 0.16253925206965458, + "grad_norm": 2.3449292182922363, + "learning_rate": 4.681112194553274e-05, + "loss": 3.1501, + "step": 27330 + }, + { + "epoch": 0.16254519935293557, + "grad_norm": 2.6009342670440674, + "learning_rate": 4.681089366481902e-05, + "loss": 3.3097, + "step": 27331 + }, + { + "epoch": 0.1625511466362166, + "grad_norm": 2.4977009296417236, + "learning_rate": 4.681066537649134e-05, + "loss": 3.2114, + "step": 27332 + }, + { + "epoch": 0.16255709391949758, + "grad_norm": 1.9522204399108887, + "learning_rate": 4.681043708054978e-05, + "loss": 4.9502, + "step": 27333 + }, + { + "epoch": 0.16256304120277856, + "grad_norm": 1.8254719972610474, + "learning_rate": 4.6810208776994425e-05, + "loss": 5.1497, + "step": 27334 + }, + { + "epoch": 0.16256898848605958, + "grad_norm": 2.9470701217651367, + "learning_rate": 4.680998046582535e-05, + "loss": 3.1034, + "step": 27335 + }, + { + "epoch": 0.16257493576934057, + "grad_norm": 3.033200979232788, + "learning_rate": 4.680975214704263e-05, + "loss": 3.1627, + "step": 27336 + }, + { + "epoch": 0.16258088305262156, + "grad_norm": 2.9590744972229004, + "learning_rate": 4.680952382064636e-05, + "loss": 3.6219, + "step": 27337 + }, + { + "epoch": 0.16258683033590257, + "grad_norm": 1.759320616722107, + "learning_rate": 4.680929548663661e-05, + "loss": 5.0067, + "step": 27338 + }, + { + "epoch": 0.16259277761918356, + "grad_norm": 1.7571178674697876, + "learning_rate": 4.680906714501345e-05, + "loss": 4.9829, + "step": 27339 + }, + { + "epoch": 0.16259872490246455, + "grad_norm": 1.7212225198745728, + "learning_rate": 4.680883879577698e-05, + "loss": 4.854, + "step": 27340 + }, + { + "epoch": 0.16260467218574556, + "grad_norm": 1.732384204864502, + "learning_rate": 4.680861043892727e-05, + "loss": 4.7023, + "step": 27341 + }, + { + "epoch": 0.16261061946902655, + "grad_norm": 1.8678463697433472, + "learning_rate": 4.680838207446439e-05, + "loss": 5.3755, + "step": 27342 + }, + { + "epoch": 0.16261656675230754, + "grad_norm": 1.6973927021026611, + "learning_rate": 4.680815370238843e-05, + "loss": 4.678, + "step": 27343 + }, + { + "epoch": 0.16262251403558856, + "grad_norm": 1.6274856328964233, + "learning_rate": 4.680792532269948e-05, + "loss": 4.8053, + "step": 27344 + }, + { + "epoch": 0.16262846131886954, + "grad_norm": 1.6367913484573364, + "learning_rate": 4.6807696935397604e-05, + "loss": 4.8855, + "step": 27345 + }, + { + "epoch": 0.16263440860215053, + "grad_norm": 1.5021651983261108, + "learning_rate": 4.680746854048288e-05, + "loss": 5.318, + "step": 27346 + }, + { + "epoch": 0.16264035588543155, + "grad_norm": 1.329917073249817, + "learning_rate": 4.68072401379554e-05, + "loss": 6.0137, + "step": 27347 + }, + { + "epoch": 0.16264630316871254, + "grad_norm": 1.6316022872924805, + "learning_rate": 4.680701172781524e-05, + "loss": 5.8787, + "step": 27348 + }, + { + "epoch": 0.16265225045199352, + "grad_norm": 1.640479564666748, + "learning_rate": 4.6806783310062476e-05, + "loss": 5.568, + "step": 27349 + }, + { + "epoch": 0.16265819773527454, + "grad_norm": 1.6600250005722046, + "learning_rate": 4.680655488469718e-05, + "loss": 5.3461, + "step": 27350 + }, + { + "epoch": 0.16266414501855553, + "grad_norm": 1.7950623035430908, + "learning_rate": 4.680632645171945e-05, + "loss": 4.8529, + "step": 27351 + }, + { + "epoch": 0.16267009230183652, + "grad_norm": 1.732972502708435, + "learning_rate": 4.6806098011129356e-05, + "loss": 4.8085, + "step": 27352 + }, + { + "epoch": 0.1626760395851175, + "grad_norm": 1.7508574724197388, + "learning_rate": 4.680586956292698e-05, + "loss": 4.9188, + "step": 27353 + }, + { + "epoch": 0.16268198686839852, + "grad_norm": 1.521814227104187, + "learning_rate": 4.6805641107112395e-05, + "loss": 4.6616, + "step": 27354 + }, + { + "epoch": 0.1626879341516795, + "grad_norm": 1.7594850063323975, + "learning_rate": 4.6805412643685684e-05, + "loss": 4.6634, + "step": 27355 + }, + { + "epoch": 0.1626938814349605, + "grad_norm": 1.5281226634979248, + "learning_rate": 4.6805184172646944e-05, + "loss": 5.0508, + "step": 27356 + }, + { + "epoch": 0.1626998287182415, + "grad_norm": 1.3342808485031128, + "learning_rate": 4.6804955693996225e-05, + "loss": 5.605, + "step": 27357 + }, + { + "epoch": 0.1627057760015225, + "grad_norm": 1.5639429092407227, + "learning_rate": 4.680472720773362e-05, + "loss": 5.0959, + "step": 27358 + }, + { + "epoch": 0.1627117232848035, + "grad_norm": 1.661442756652832, + "learning_rate": 4.680449871385922e-05, + "loss": 4.8981, + "step": 27359 + }, + { + "epoch": 0.1627176705680845, + "grad_norm": 1.601442813873291, + "learning_rate": 4.6804270212373094e-05, + "loss": 4.8313, + "step": 27360 + }, + { + "epoch": 0.1627236178513655, + "grad_norm": 1.5367902517318726, + "learning_rate": 4.6804041703275315e-05, + "loss": 4.8772, + "step": 27361 + }, + { + "epoch": 0.16272956513464648, + "grad_norm": 1.5161237716674805, + "learning_rate": 4.680381318656597e-05, + "loss": 4.7877, + "step": 27362 + }, + { + "epoch": 0.1627355124179275, + "grad_norm": 1.790384292602539, + "learning_rate": 4.680358466224515e-05, + "loss": 5.2596, + "step": 27363 + }, + { + "epoch": 0.16274145970120849, + "grad_norm": 1.6441622972488403, + "learning_rate": 4.6803356130312915e-05, + "loss": 5.3774, + "step": 27364 + }, + { + "epoch": 0.16274740698448947, + "grad_norm": 1.4816210269927979, + "learning_rate": 4.680312759076935e-05, + "loss": 5.4754, + "step": 27365 + }, + { + "epoch": 0.1627533542677705, + "grad_norm": 1.5345895290374756, + "learning_rate": 4.680289904361454e-05, + "loss": 5.2805, + "step": 27366 + }, + { + "epoch": 0.16275930155105148, + "grad_norm": 1.3760472536087036, + "learning_rate": 4.680267048884857e-05, + "loss": 5.327, + "step": 27367 + }, + { + "epoch": 0.16276524883433247, + "grad_norm": 2.4343063831329346, + "learning_rate": 4.680244192647151e-05, + "loss": 4.8059, + "step": 27368 + }, + { + "epoch": 0.16277119611761348, + "grad_norm": 2.8197708129882812, + "learning_rate": 4.6802213356483444e-05, + "loss": 4.1087, + "step": 27369 + }, + { + "epoch": 0.16277714340089447, + "grad_norm": 3.0709099769592285, + "learning_rate": 4.680198477888445e-05, + "loss": 4.1441, + "step": 27370 + }, + { + "epoch": 0.16278309068417546, + "grad_norm": 2.8608505725860596, + "learning_rate": 4.680175619367461e-05, + "loss": 4.3136, + "step": 27371 + }, + { + "epoch": 0.16278903796745647, + "grad_norm": 2.9403672218322754, + "learning_rate": 4.6801527600854e-05, + "loss": 3.903, + "step": 27372 + }, + { + "epoch": 0.16279498525073746, + "grad_norm": 1.7551895380020142, + "learning_rate": 4.6801299000422696e-05, + "loss": 5.0392, + "step": 27373 + }, + { + "epoch": 0.16280093253401845, + "grad_norm": 1.862855076789856, + "learning_rate": 4.680107039238079e-05, + "loss": 4.712, + "step": 27374 + }, + { + "epoch": 0.16280687981729947, + "grad_norm": 1.6673380136489868, + "learning_rate": 4.680084177672835e-05, + "loss": 5.1954, + "step": 27375 + }, + { + "epoch": 0.16281282710058045, + "grad_norm": 1.3807284832000732, + "learning_rate": 4.680061315346547e-05, + "loss": 5.7525, + "step": 27376 + }, + { + "epoch": 0.16281877438386144, + "grad_norm": 1.6106042861938477, + "learning_rate": 4.680038452259222e-05, + "loss": 6.1879, + "step": 27377 + }, + { + "epoch": 0.16282472166714246, + "grad_norm": 1.3592698574066162, + "learning_rate": 4.6800155884108674e-05, + "loss": 5.725, + "step": 27378 + }, + { + "epoch": 0.16283066895042345, + "grad_norm": 1.7938450574874878, + "learning_rate": 4.679992723801493e-05, + "loss": 4.8694, + "step": 27379 + }, + { + "epoch": 0.16283661623370443, + "grad_norm": 2.0678904056549072, + "learning_rate": 4.679969858431105e-05, + "loss": 5.0753, + "step": 27380 + }, + { + "epoch": 0.16284256351698545, + "grad_norm": 2.147873640060425, + "learning_rate": 4.679946992299712e-05, + "loss": 5.2131, + "step": 27381 + }, + { + "epoch": 0.16284851080026644, + "grad_norm": 1.7163617610931396, + "learning_rate": 4.679924125407322e-05, + "loss": 5.2478, + "step": 27382 + }, + { + "epoch": 0.16285445808354743, + "grad_norm": 2.040842056274414, + "learning_rate": 4.679901257753943e-05, + "loss": 5.2402, + "step": 27383 + }, + { + "epoch": 0.16286040536682844, + "grad_norm": 1.8307139873504639, + "learning_rate": 4.6798783893395834e-05, + "loss": 4.5761, + "step": 27384 + }, + { + "epoch": 0.16286635265010943, + "grad_norm": 1.4522336721420288, + "learning_rate": 4.67985552016425e-05, + "loss": 4.7127, + "step": 27385 + }, + { + "epoch": 0.16287229993339042, + "grad_norm": 1.8996527194976807, + "learning_rate": 4.679832650227952e-05, + "loss": 4.7754, + "step": 27386 + }, + { + "epoch": 0.16287824721667143, + "grad_norm": 2.1785221099853516, + "learning_rate": 4.679809779530697e-05, + "loss": 4.9305, + "step": 27387 + }, + { + "epoch": 0.16288419449995242, + "grad_norm": 2.266005754470825, + "learning_rate": 4.679786908072493e-05, + "loss": 5.1013, + "step": 27388 + }, + { + "epoch": 0.1628901417832334, + "grad_norm": 2.08335542678833, + "learning_rate": 4.679764035853348e-05, + "loss": 5.0172, + "step": 27389 + }, + { + "epoch": 0.16289608906651443, + "grad_norm": 2.1042888164520264, + "learning_rate": 4.679741162873269e-05, + "loss": 5.0088, + "step": 27390 + }, + { + "epoch": 0.16290203634979541, + "grad_norm": 2.0641071796417236, + "learning_rate": 4.679718289132266e-05, + "loss": 4.9374, + "step": 27391 + }, + { + "epoch": 0.1629079836330764, + "grad_norm": 1.855651617050171, + "learning_rate": 4.6796954146303454e-05, + "loss": 5.0419, + "step": 27392 + }, + { + "epoch": 0.16291393091635742, + "grad_norm": 1.8837964534759521, + "learning_rate": 4.679672539367516e-05, + "loss": 5.0203, + "step": 27393 + }, + { + "epoch": 0.1629198781996384, + "grad_norm": 1.9748656749725342, + "learning_rate": 4.679649663343785e-05, + "loss": 5.0305, + "step": 27394 + }, + { + "epoch": 0.1629258254829194, + "grad_norm": 2.2613768577575684, + "learning_rate": 4.67962678655916e-05, + "loss": 4.9047, + "step": 27395 + }, + { + "epoch": 0.1629317727662004, + "grad_norm": 1.583208441734314, + "learning_rate": 4.6796039090136514e-05, + "loss": 4.6715, + "step": 27396 + }, + { + "epoch": 0.1629377200494814, + "grad_norm": 1.6698166131973267, + "learning_rate": 4.679581030707265e-05, + "loss": 5.3792, + "step": 27397 + }, + { + "epoch": 0.1629436673327624, + "grad_norm": 1.778937816619873, + "learning_rate": 4.679558151640009e-05, + "loss": 5.682, + "step": 27398 + }, + { + "epoch": 0.1629496146160434, + "grad_norm": 1.7441314458847046, + "learning_rate": 4.679535271811892e-05, + "loss": 5.2928, + "step": 27399 + }, + { + "epoch": 0.1629555618993244, + "grad_norm": 2.2535476684570312, + "learning_rate": 4.679512391222922e-05, + "loss": 4.9041, + "step": 27400 + }, + { + "epoch": 0.16296150918260538, + "grad_norm": 2.237154483795166, + "learning_rate": 4.679489509873106e-05, + "loss": 4.8852, + "step": 27401 + }, + { + "epoch": 0.1629674564658864, + "grad_norm": 1.7429604530334473, + "learning_rate": 4.679466627762454e-05, + "loss": 4.7548, + "step": 27402 + }, + { + "epoch": 0.16297340374916738, + "grad_norm": 2.02030086517334, + "learning_rate": 4.6794437448909723e-05, + "loss": 4.8708, + "step": 27403 + }, + { + "epoch": 0.16297935103244837, + "grad_norm": 1.5148401260375977, + "learning_rate": 4.6794208612586684e-05, + "loss": 4.8774, + "step": 27404 + }, + { + "epoch": 0.1629852983157294, + "grad_norm": 1.9291085004806519, + "learning_rate": 4.679397976865552e-05, + "loss": 4.7936, + "step": 27405 + }, + { + "epoch": 0.16299124559901038, + "grad_norm": 2.0261623859405518, + "learning_rate": 4.67937509171163e-05, + "loss": 4.5639, + "step": 27406 + }, + { + "epoch": 0.16299719288229136, + "grad_norm": 2.1595592498779297, + "learning_rate": 4.679352205796911e-05, + "loss": 4.7767, + "step": 27407 + }, + { + "epoch": 0.16300314016557238, + "grad_norm": 1.7030655145645142, + "learning_rate": 4.679329319121403e-05, + "loss": 4.9251, + "step": 27408 + }, + { + "epoch": 0.16300908744885337, + "grad_norm": 1.5864980220794678, + "learning_rate": 4.679306431685112e-05, + "loss": 5.0048, + "step": 27409 + }, + { + "epoch": 0.16301503473213436, + "grad_norm": 1.695307970046997, + "learning_rate": 4.679283543488049e-05, + "loss": 5.1882, + "step": 27410 + }, + { + "epoch": 0.16302098201541534, + "grad_norm": 1.4839437007904053, + "learning_rate": 4.6792606545302206e-05, + "loss": 5.3838, + "step": 27411 + }, + { + "epoch": 0.16302692929869636, + "grad_norm": 1.883641242980957, + "learning_rate": 4.6792377648116346e-05, + "loss": 4.9213, + "step": 27412 + }, + { + "epoch": 0.16303287658197735, + "grad_norm": 2.2560174465179443, + "learning_rate": 4.6792148743322985e-05, + "loss": 4.2573, + "step": 27413 + }, + { + "epoch": 0.16303882386525834, + "grad_norm": 2.452279567718506, + "learning_rate": 4.6791919830922225e-05, + "loss": 4.526, + "step": 27414 + }, + { + "epoch": 0.16304477114853935, + "grad_norm": 2.429499387741089, + "learning_rate": 4.679169091091412e-05, + "loss": 4.1269, + "step": 27415 + }, + { + "epoch": 0.16305071843182034, + "grad_norm": 1.7020376920700073, + "learning_rate": 4.6791461983298764e-05, + "loss": 5.367, + "step": 27416 + }, + { + "epoch": 0.16305666571510133, + "grad_norm": 1.6802117824554443, + "learning_rate": 4.679123304807623e-05, + "loss": 5.628, + "step": 27417 + }, + { + "epoch": 0.16306261299838234, + "grad_norm": 1.5536737442016602, + "learning_rate": 4.6791004105246606e-05, + "loss": 4.4013, + "step": 27418 + }, + { + "epoch": 0.16306856028166333, + "grad_norm": 1.6626231670379639, + "learning_rate": 4.6790775154809966e-05, + "loss": 5.1377, + "step": 27419 + }, + { + "epoch": 0.16307450756494432, + "grad_norm": 1.4954432249069214, + "learning_rate": 4.6790546196766395e-05, + "loss": 4.8278, + "step": 27420 + }, + { + "epoch": 0.16308045484822534, + "grad_norm": 2.2759921550750732, + "learning_rate": 4.679031723111597e-05, + "loss": 4.0856, + "step": 27421 + }, + { + "epoch": 0.16308640213150633, + "grad_norm": 2.298222541809082, + "learning_rate": 4.679008825785877e-05, + "loss": 4.169, + "step": 27422 + }, + { + "epoch": 0.1630923494147873, + "grad_norm": 2.435786247253418, + "learning_rate": 4.678985927699486e-05, + "loss": 3.9992, + "step": 27423 + }, + { + "epoch": 0.16309829669806833, + "grad_norm": 2.273677110671997, + "learning_rate": 4.678963028852436e-05, + "loss": 3.689, + "step": 27424 + }, + { + "epoch": 0.16310424398134932, + "grad_norm": 2.1706488132476807, + "learning_rate": 4.6789401292447306e-05, + "loss": 3.7752, + "step": 27425 + }, + { + "epoch": 0.1631101912646303, + "grad_norm": 1.7838464975357056, + "learning_rate": 4.6789172288763804e-05, + "loss": 4.863, + "step": 27426 + }, + { + "epoch": 0.16311613854791132, + "grad_norm": 2.0465335845947266, + "learning_rate": 4.678894327747393e-05, + "loss": 4.8415, + "step": 27427 + }, + { + "epoch": 0.1631220858311923, + "grad_norm": 2.5023603439331055, + "learning_rate": 4.678871425857775e-05, + "loss": 3.8268, + "step": 27428 + }, + { + "epoch": 0.1631280331144733, + "grad_norm": 3.1593286991119385, + "learning_rate": 4.6788485232075366e-05, + "loss": 3.8232, + "step": 27429 + }, + { + "epoch": 0.1631339803977543, + "grad_norm": 2.5644307136535645, + "learning_rate": 4.6788256197966847e-05, + "loss": 3.4984, + "step": 27430 + }, + { + "epoch": 0.1631399276810353, + "grad_norm": 2.0135555267333984, + "learning_rate": 4.678802715625227e-05, + "loss": 4.1888, + "step": 27431 + }, + { + "epoch": 0.1631458749643163, + "grad_norm": 2.4584031105041504, + "learning_rate": 4.678779810693171e-05, + "loss": 4.2168, + "step": 27432 + }, + { + "epoch": 0.1631518222475973, + "grad_norm": 3.071559429168701, + "learning_rate": 4.678756905000526e-05, + "loss": 4.191, + "step": 27433 + }, + { + "epoch": 0.1631577695308783, + "grad_norm": 2.8028981685638428, + "learning_rate": 4.6787339985473e-05, + "loss": 3.9579, + "step": 27434 + }, + { + "epoch": 0.16316371681415928, + "grad_norm": 1.8563295602798462, + "learning_rate": 4.6787110913335006e-05, + "loss": 4.7058, + "step": 27435 + }, + { + "epoch": 0.1631696640974403, + "grad_norm": 1.576141357421875, + "learning_rate": 4.678688183359135e-05, + "loss": 5.2126, + "step": 27436 + }, + { + "epoch": 0.16317561138072129, + "grad_norm": 1.715032935142517, + "learning_rate": 4.6786652746242124e-05, + "loss": 5.1945, + "step": 27437 + }, + { + "epoch": 0.16318155866400227, + "grad_norm": 1.5476752519607544, + "learning_rate": 4.67864236512874e-05, + "loss": 5.523, + "step": 27438 + }, + { + "epoch": 0.1631875059472833, + "grad_norm": 1.4861894845962524, + "learning_rate": 4.6786194548727255e-05, + "loss": 5.4119, + "step": 27439 + }, + { + "epoch": 0.16319345323056428, + "grad_norm": 1.3097593784332275, + "learning_rate": 4.6785965438561784e-05, + "loss": 5.4008, + "step": 27440 + }, + { + "epoch": 0.16319940051384527, + "grad_norm": 1.733404517173767, + "learning_rate": 4.678573632079105e-05, + "loss": 4.4261, + "step": 27441 + }, + { + "epoch": 0.16320534779712628, + "grad_norm": 1.4431440830230713, + "learning_rate": 4.678550719541514e-05, + "loss": 3.8523, + "step": 27442 + }, + { + "epoch": 0.16321129508040727, + "grad_norm": 1.5869112014770508, + "learning_rate": 4.678527806243415e-05, + "loss": 5.0346, + "step": 27443 + }, + { + "epoch": 0.16321724236368826, + "grad_norm": 1.7510712146759033, + "learning_rate": 4.6785048921848127e-05, + "loss": 5.2022, + "step": 27444 + }, + { + "epoch": 0.16322318964696927, + "grad_norm": 2.5091726779937744, + "learning_rate": 4.678481977365717e-05, + "loss": 4.3526, + "step": 27445 + }, + { + "epoch": 0.16322913693025026, + "grad_norm": 2.355930805206299, + "learning_rate": 4.6784590617861365e-05, + "loss": 3.9097, + "step": 27446 + }, + { + "epoch": 0.16323508421353125, + "grad_norm": 2.104262113571167, + "learning_rate": 4.678436145446078e-05, + "loss": 3.9491, + "step": 27447 + }, + { + "epoch": 0.16324103149681227, + "grad_norm": 2.6814212799072266, + "learning_rate": 4.678413228345551e-05, + "loss": 3.9986, + "step": 27448 + }, + { + "epoch": 0.16324697878009325, + "grad_norm": 2.017530679702759, + "learning_rate": 4.678390310484561e-05, + "loss": 4.0997, + "step": 27449 + }, + { + "epoch": 0.16325292606337424, + "grad_norm": 2.437260389328003, + "learning_rate": 4.6783673918631175e-05, + "loss": 4.2466, + "step": 27450 + }, + { + "epoch": 0.16325887334665526, + "grad_norm": 2.4225821495056152, + "learning_rate": 4.67834447248123e-05, + "loss": 4.0411, + "step": 27451 + }, + { + "epoch": 0.16326482062993625, + "grad_norm": 1.833397388458252, + "learning_rate": 4.6783215523389035e-05, + "loss": 4.5873, + "step": 27452 + }, + { + "epoch": 0.16327076791321724, + "grad_norm": 1.7432091236114502, + "learning_rate": 4.6782986314361477e-05, + "loss": 5.3351, + "step": 27453 + }, + { + "epoch": 0.16327671519649825, + "grad_norm": 1.8234552145004272, + "learning_rate": 4.6782757097729704e-05, + "loss": 5.3769, + "step": 27454 + }, + { + "epoch": 0.16328266247977924, + "grad_norm": 1.7435389757156372, + "learning_rate": 4.67825278734938e-05, + "loss": 4.6875, + "step": 27455 + }, + { + "epoch": 0.16328860976306023, + "grad_norm": 2.265040874481201, + "learning_rate": 4.678229864165383e-05, + "loss": 4.6138, + "step": 27456 + }, + { + "epoch": 0.16329455704634124, + "grad_norm": 2.105421304702759, + "learning_rate": 4.678206940220989e-05, + "loss": 4.7799, + "step": 27457 + }, + { + "epoch": 0.16330050432962223, + "grad_norm": 1.9669932126998901, + "learning_rate": 4.678184015516206e-05, + "loss": 4.3826, + "step": 27458 + }, + { + "epoch": 0.16330645161290322, + "grad_norm": 2.2020108699798584, + "learning_rate": 4.6781610900510406e-05, + "loss": 4.7784, + "step": 27459 + }, + { + "epoch": 0.16331239889618424, + "grad_norm": 2.0246944427490234, + "learning_rate": 4.678138163825503e-05, + "loss": 4.5324, + "step": 27460 + }, + { + "epoch": 0.16331834617946522, + "grad_norm": 2.0522918701171875, + "learning_rate": 4.678115236839599e-05, + "loss": 4.1903, + "step": 27461 + }, + { + "epoch": 0.1633242934627462, + "grad_norm": 2.0524399280548096, + "learning_rate": 4.678092309093337e-05, + "loss": 4.5542, + "step": 27462 + }, + { + "epoch": 0.16333024074602723, + "grad_norm": 2.0562379360198975, + "learning_rate": 4.678069380586726e-05, + "loss": 4.6572, + "step": 27463 + }, + { + "epoch": 0.16333618802930822, + "grad_norm": 1.931517481803894, + "learning_rate": 4.678046451319774e-05, + "loss": 4.3204, + "step": 27464 + }, + { + "epoch": 0.1633421353125892, + "grad_norm": 1.852124810218811, + "learning_rate": 4.678023521292487e-05, + "loss": 4.5307, + "step": 27465 + }, + { + "epoch": 0.16334808259587022, + "grad_norm": 1.690384030342102, + "learning_rate": 4.6780005905048764e-05, + "loss": 5.1771, + "step": 27466 + }, + { + "epoch": 0.1633540298791512, + "grad_norm": 1.7573405504226685, + "learning_rate": 4.6779776589569466e-05, + "loss": 4.894, + "step": 27467 + }, + { + "epoch": 0.1633599771624322, + "grad_norm": 2.139704942703247, + "learning_rate": 4.677954726648708e-05, + "loss": 4.7212, + "step": 27468 + }, + { + "epoch": 0.1633659244457132, + "grad_norm": 1.9621661901474, + "learning_rate": 4.677931793580168e-05, + "loss": 4.6083, + "step": 27469 + }, + { + "epoch": 0.1633718717289942, + "grad_norm": 1.9202685356140137, + "learning_rate": 4.6779088597513346e-05, + "loss": 5.3296, + "step": 27470 + }, + { + "epoch": 0.1633778190122752, + "grad_norm": 1.6269041299819946, + "learning_rate": 4.677885925162216e-05, + "loss": 5.4541, + "step": 27471 + }, + { + "epoch": 0.16338376629555618, + "grad_norm": 1.928564190864563, + "learning_rate": 4.677862989812819e-05, + "loss": 4.8419, + "step": 27472 + }, + { + "epoch": 0.1633897135788372, + "grad_norm": 2.1393957138061523, + "learning_rate": 4.677840053703153e-05, + "loss": 4.5768, + "step": 27473 + }, + { + "epoch": 0.16339566086211818, + "grad_norm": 2.2332470417022705, + "learning_rate": 4.677817116833225e-05, + "loss": 4.7571, + "step": 27474 + }, + { + "epoch": 0.16340160814539917, + "grad_norm": 1.7523399591445923, + "learning_rate": 4.6777941792030446e-05, + "loss": 5.0372, + "step": 27475 + }, + { + "epoch": 0.16340755542868018, + "grad_norm": 1.5460946559906006, + "learning_rate": 4.677771240812619e-05, + "loss": 5.1194, + "step": 27476 + }, + { + "epoch": 0.16341350271196117, + "grad_norm": 1.6920409202575684, + "learning_rate": 4.677748301661954e-05, + "loss": 5.0852, + "step": 27477 + }, + { + "epoch": 0.16341944999524216, + "grad_norm": 1.5086921453475952, + "learning_rate": 4.677725361751061e-05, + "loss": 5.2414, + "step": 27478 + }, + { + "epoch": 0.16342539727852318, + "grad_norm": 1.4637200832366943, + "learning_rate": 4.6777024210799465e-05, + "loss": 4.9873, + "step": 27479 + }, + { + "epoch": 0.16343134456180416, + "grad_norm": 1.6477910280227661, + "learning_rate": 4.677679479648618e-05, + "loss": 5.2834, + "step": 27480 + }, + { + "epoch": 0.16343729184508515, + "grad_norm": 1.7025471925735474, + "learning_rate": 4.6776565374570844e-05, + "loss": 5.3655, + "step": 27481 + }, + { + "epoch": 0.16344323912836617, + "grad_norm": 1.8360841274261475, + "learning_rate": 4.677633594505354e-05, + "loss": 4.4539, + "step": 27482 + }, + { + "epoch": 0.16344918641164716, + "grad_norm": 2.10629940032959, + "learning_rate": 4.6776106507934336e-05, + "loss": 4.2894, + "step": 27483 + }, + { + "epoch": 0.16345513369492815, + "grad_norm": 1.706100583076477, + "learning_rate": 4.677587706321333e-05, + "loss": 4.7572, + "step": 27484 + }, + { + "epoch": 0.16346108097820916, + "grad_norm": 1.518978238105774, + "learning_rate": 4.677564761089057e-05, + "loss": 5.8137, + "step": 27485 + }, + { + "epoch": 0.16346702826149015, + "grad_norm": 1.903784155845642, + "learning_rate": 4.677541815096617e-05, + "loss": 4.7093, + "step": 27486 + }, + { + "epoch": 0.16347297554477114, + "grad_norm": 1.9231067895889282, + "learning_rate": 4.677518868344019e-05, + "loss": 4.6492, + "step": 27487 + }, + { + "epoch": 0.16347892282805215, + "grad_norm": 1.5489968061447144, + "learning_rate": 4.6774959208312717e-05, + "loss": 5.1375, + "step": 27488 + }, + { + "epoch": 0.16348487011133314, + "grad_norm": 1.6851353645324707, + "learning_rate": 4.677472972558383e-05, + "loss": 5.3354, + "step": 27489 + }, + { + "epoch": 0.16349081739461413, + "grad_norm": 1.6556458473205566, + "learning_rate": 4.6774500235253614e-05, + "loss": 4.4959, + "step": 27490 + }, + { + "epoch": 0.16349676467789515, + "grad_norm": 1.8800296783447266, + "learning_rate": 4.6774270737322145e-05, + "loss": 4.0961, + "step": 27491 + }, + { + "epoch": 0.16350271196117613, + "grad_norm": 1.847226858139038, + "learning_rate": 4.67740412317895e-05, + "loss": 4.0567, + "step": 27492 + }, + { + "epoch": 0.16350865924445712, + "grad_norm": 1.8994855880737305, + "learning_rate": 4.6773811718655766e-05, + "loss": 4.8829, + "step": 27493 + }, + { + "epoch": 0.16351460652773814, + "grad_norm": 1.6551505327224731, + "learning_rate": 4.677358219792102e-05, + "loss": 5.0247, + "step": 27494 + }, + { + "epoch": 0.16352055381101913, + "grad_norm": 1.6510465145111084, + "learning_rate": 4.6773352669585336e-05, + "loss": 5.2324, + "step": 27495 + }, + { + "epoch": 0.16352650109430011, + "grad_norm": 1.851661205291748, + "learning_rate": 4.67731231336488e-05, + "loss": 4.1622, + "step": 27496 + }, + { + "epoch": 0.16353244837758113, + "grad_norm": 1.9479695558547974, + "learning_rate": 4.67728935901115e-05, + "loss": 3.9269, + "step": 27497 + }, + { + "epoch": 0.16353839566086212, + "grad_norm": 1.8207287788391113, + "learning_rate": 4.67726640389735e-05, + "loss": 3.8434, + "step": 27498 + }, + { + "epoch": 0.1635443429441431, + "grad_norm": 1.8698455095291138, + "learning_rate": 4.677243448023489e-05, + "loss": 3.9786, + "step": 27499 + }, + { + "epoch": 0.16355029022742412, + "grad_norm": 1.8257921934127808, + "learning_rate": 4.6772204913895746e-05, + "loss": 3.947, + "step": 27500 + }, + { + "epoch": 0.1635562375107051, + "grad_norm": 1.6152242422103882, + "learning_rate": 4.6771975339956155e-05, + "loss": 4.4898, + "step": 27501 + }, + { + "epoch": 0.1635621847939861, + "grad_norm": 1.956666350364685, + "learning_rate": 4.6771745758416185e-05, + "loss": 4.8584, + "step": 27502 + }, + { + "epoch": 0.16356813207726711, + "grad_norm": 1.8477699756622314, + "learning_rate": 4.677151616927593e-05, + "loss": 5.0331, + "step": 27503 + }, + { + "epoch": 0.1635740793605481, + "grad_norm": 1.705209732055664, + "learning_rate": 4.677128657253545e-05, + "loss": 4.193, + "step": 27504 + }, + { + "epoch": 0.1635800266438291, + "grad_norm": 1.8259029388427734, + "learning_rate": 4.677105696819486e-05, + "loss": 3.8187, + "step": 27505 + }, + { + "epoch": 0.1635859739271101, + "grad_norm": 1.633556604385376, + "learning_rate": 4.677082735625421e-05, + "loss": 3.8045, + "step": 27506 + }, + { + "epoch": 0.1635919212103911, + "grad_norm": 1.7349916696548462, + "learning_rate": 4.677059773671358e-05, + "loss": 4.1425, + "step": 27507 + }, + { + "epoch": 0.16359786849367208, + "grad_norm": 1.8932249546051025, + "learning_rate": 4.677036810957307e-05, + "loss": 4.838, + "step": 27508 + }, + { + "epoch": 0.1636038157769531, + "grad_norm": 1.6211893558502197, + "learning_rate": 4.677013847483275e-05, + "loss": 5.2038, + "step": 27509 + }, + { + "epoch": 0.1636097630602341, + "grad_norm": 1.7109664678573608, + "learning_rate": 4.6769908832492694e-05, + "loss": 4.8308, + "step": 27510 + }, + { + "epoch": 0.16361571034351508, + "grad_norm": 1.603644847869873, + "learning_rate": 4.6769679182553e-05, + "loss": 4.8959, + "step": 27511 + }, + { + "epoch": 0.1636216576267961, + "grad_norm": 1.6871256828308105, + "learning_rate": 4.676944952501372e-05, + "loss": 4.7762, + "step": 27512 + }, + { + "epoch": 0.16362760491007708, + "grad_norm": 1.5820897817611694, + "learning_rate": 4.676921985987496e-05, + "loss": 4.4533, + "step": 27513 + }, + { + "epoch": 0.16363355219335807, + "grad_norm": 1.6850042343139648, + "learning_rate": 4.676899018713678e-05, + "loss": 4.7149, + "step": 27514 + }, + { + "epoch": 0.16363949947663908, + "grad_norm": 1.6211190223693848, + "learning_rate": 4.676876050679928e-05, + "loss": 5.1372, + "step": 27515 + }, + { + "epoch": 0.16364544675992007, + "grad_norm": 1.7970921993255615, + "learning_rate": 4.676853081886252e-05, + "loss": 4.9738, + "step": 27516 + }, + { + "epoch": 0.16365139404320106, + "grad_norm": 1.9819167852401733, + "learning_rate": 4.67683011233266e-05, + "loss": 4.9069, + "step": 27517 + }, + { + "epoch": 0.16365734132648208, + "grad_norm": 1.9208866357803345, + "learning_rate": 4.6768071420191596e-05, + "loss": 4.6224, + "step": 27518 + }, + { + "epoch": 0.16366328860976306, + "grad_norm": 1.4924341440200806, + "learning_rate": 4.676784170945757e-05, + "loss": 4.4268, + "step": 27519 + }, + { + "epoch": 0.16366923589304405, + "grad_norm": 1.5947877168655396, + "learning_rate": 4.676761199112462e-05, + "loss": 4.231, + "step": 27520 + }, + { + "epoch": 0.16367518317632507, + "grad_norm": 1.4336072206497192, + "learning_rate": 4.676738226519283e-05, + "loss": 4.7233, + "step": 27521 + }, + { + "epoch": 0.16368113045960606, + "grad_norm": 1.496932864189148, + "learning_rate": 4.676715253166226e-05, + "loss": 4.2295, + "step": 27522 + }, + { + "epoch": 0.16368707774288704, + "grad_norm": 1.3215701580047607, + "learning_rate": 4.6766922790533005e-05, + "loss": 4.2627, + "step": 27523 + }, + { + "epoch": 0.16369302502616806, + "grad_norm": 1.524957299232483, + "learning_rate": 4.676669304180514e-05, + "loss": 4.5299, + "step": 27524 + }, + { + "epoch": 0.16369897230944905, + "grad_norm": 2.0174505710601807, + "learning_rate": 4.676646328547876e-05, + "loss": 4.8986, + "step": 27525 + }, + { + "epoch": 0.16370491959273004, + "grad_norm": 1.6895251274108887, + "learning_rate": 4.676623352155392e-05, + "loss": 4.6933, + "step": 27526 + }, + { + "epoch": 0.16371086687601105, + "grad_norm": 1.3915743827819824, + "learning_rate": 4.676600375003072e-05, + "loss": 4.3735, + "step": 27527 + }, + { + "epoch": 0.16371681415929204, + "grad_norm": 2.5097527503967285, + "learning_rate": 4.6765773970909224e-05, + "loss": 4.7227, + "step": 27528 + }, + { + "epoch": 0.16372276144257303, + "grad_norm": 1.4059836864471436, + "learning_rate": 4.676554418418953e-05, + "loss": 4.3861, + "step": 27529 + }, + { + "epoch": 0.16372870872585402, + "grad_norm": 1.5270711183547974, + "learning_rate": 4.6765314389871704e-05, + "loss": 4.4302, + "step": 27530 + }, + { + "epoch": 0.16373465600913503, + "grad_norm": 1.8292162418365479, + "learning_rate": 4.676508458795583e-05, + "loss": 4.697, + "step": 27531 + }, + { + "epoch": 0.16374060329241602, + "grad_norm": 1.8712737560272217, + "learning_rate": 4.6764854778442e-05, + "loss": 4.6228, + "step": 27532 + }, + { + "epoch": 0.163746550575697, + "grad_norm": 1.551424503326416, + "learning_rate": 4.6764624961330274e-05, + "loss": 5.1146, + "step": 27533 + }, + { + "epoch": 0.16375249785897802, + "grad_norm": 1.522362232208252, + "learning_rate": 4.6764395136620745e-05, + "loss": 4.8196, + "step": 27534 + }, + { + "epoch": 0.163758445142259, + "grad_norm": 2.196622371673584, + "learning_rate": 4.676416530431349e-05, + "loss": 4.6695, + "step": 27535 + }, + { + "epoch": 0.16376439242554, + "grad_norm": 1.7196024656295776, + "learning_rate": 4.676393546440859e-05, + "loss": 4.3153, + "step": 27536 + }, + { + "epoch": 0.16377033970882102, + "grad_norm": 1.841454267501831, + "learning_rate": 4.676370561690613e-05, + "loss": 3.9704, + "step": 27537 + }, + { + "epoch": 0.163776286992102, + "grad_norm": 1.8239476680755615, + "learning_rate": 4.6763475761806185e-05, + "loss": 3.9419, + "step": 27538 + }, + { + "epoch": 0.163782234275383, + "grad_norm": 1.8012974262237549, + "learning_rate": 4.6763245899108834e-05, + "loss": 3.9246, + "step": 27539 + }, + { + "epoch": 0.163788181558664, + "grad_norm": 1.7155267000198364, + "learning_rate": 4.676301602881415e-05, + "loss": 4.7766, + "step": 27540 + }, + { + "epoch": 0.163794128841945, + "grad_norm": 1.986662745475769, + "learning_rate": 4.676278615092223e-05, + "loss": 4.5932, + "step": 27541 + }, + { + "epoch": 0.16380007612522599, + "grad_norm": 1.7661755084991455, + "learning_rate": 4.676255626543314e-05, + "loss": 4.2295, + "step": 27542 + }, + { + "epoch": 0.163806023408507, + "grad_norm": 1.7953100204467773, + "learning_rate": 4.676232637234698e-05, + "loss": 3.7245, + "step": 27543 + }, + { + "epoch": 0.163811970691788, + "grad_norm": 1.8963271379470825, + "learning_rate": 4.6762096471663805e-05, + "loss": 3.7599, + "step": 27544 + }, + { + "epoch": 0.16381791797506898, + "grad_norm": 1.8365765810012817, + "learning_rate": 4.676186656338371e-05, + "loss": 3.8955, + "step": 27545 + }, + { + "epoch": 0.16382386525835, + "grad_norm": 1.7611230611801147, + "learning_rate": 4.676163664750677e-05, + "loss": 3.7164, + "step": 27546 + }, + { + "epoch": 0.16382981254163098, + "grad_norm": 1.6881484985351562, + "learning_rate": 4.676140672403307e-05, + "loss": 3.905, + "step": 27547 + }, + { + "epoch": 0.16383575982491197, + "grad_norm": 1.655831217765808, + "learning_rate": 4.676117679296269e-05, + "loss": 4.9185, + "step": 27548 + }, + { + "epoch": 0.16384170710819299, + "grad_norm": 1.602988839149475, + "learning_rate": 4.6760946854295707e-05, + "loss": 5.228, + "step": 27549 + }, + { + "epoch": 0.16384765439147397, + "grad_norm": 1.6523774862289429, + "learning_rate": 4.67607169080322e-05, + "loss": 5.2095, + "step": 27550 + }, + { + "epoch": 0.16385360167475496, + "grad_norm": 2.0141515731811523, + "learning_rate": 4.676048695417224e-05, + "loss": 5.2764, + "step": 27551 + }, + { + "epoch": 0.16385954895803598, + "grad_norm": 1.824358344078064, + "learning_rate": 4.676025699271594e-05, + "loss": 4.4083, + "step": 27552 + }, + { + "epoch": 0.16386549624131697, + "grad_norm": 1.90078604221344, + "learning_rate": 4.676002702366334e-05, + "loss": 4.3142, + "step": 27553 + }, + { + "epoch": 0.16387144352459795, + "grad_norm": 2.1593260765075684, + "learning_rate": 4.6759797047014554e-05, + "loss": 4.8884, + "step": 27554 + }, + { + "epoch": 0.16387739080787897, + "grad_norm": 1.6608953475952148, + "learning_rate": 4.675956706276965e-05, + "loss": 5.0272, + "step": 27555 + }, + { + "epoch": 0.16388333809115996, + "grad_norm": 1.6689786911010742, + "learning_rate": 4.67593370709287e-05, + "loss": 4.8278, + "step": 27556 + }, + { + "epoch": 0.16388928537444095, + "grad_norm": 1.5720055103302002, + "learning_rate": 4.675910707149178e-05, + "loss": 4.9288, + "step": 27557 + }, + { + "epoch": 0.16389523265772196, + "grad_norm": 1.6609811782836914, + "learning_rate": 4.675887706445899e-05, + "loss": 4.9233, + "step": 27558 + }, + { + "epoch": 0.16390117994100295, + "grad_norm": 1.7448883056640625, + "learning_rate": 4.6758647049830405e-05, + "loss": 4.8793, + "step": 27559 + }, + { + "epoch": 0.16390712722428394, + "grad_norm": 1.728389859199524, + "learning_rate": 4.6758417027606094e-05, + "loss": 5.2122, + "step": 27560 + }, + { + "epoch": 0.16391307450756495, + "grad_norm": 1.4038145542144775, + "learning_rate": 4.675818699778615e-05, + "loss": 5.1715, + "step": 27561 + }, + { + "epoch": 0.16391902179084594, + "grad_norm": 1.7425341606140137, + "learning_rate": 4.675795696037064e-05, + "loss": 5.3856, + "step": 27562 + }, + { + "epoch": 0.16392496907412693, + "grad_norm": 1.6463298797607422, + "learning_rate": 4.675772691535966e-05, + "loss": 4.8584, + "step": 27563 + }, + { + "epoch": 0.16393091635740795, + "grad_norm": 1.8424142599105835, + "learning_rate": 4.675749686275328e-05, + "loss": 4.7667, + "step": 27564 + }, + { + "epoch": 0.16393686364068893, + "grad_norm": 2.32179594039917, + "learning_rate": 4.675726680255158e-05, + "loss": 4.2014, + "step": 27565 + }, + { + "epoch": 0.16394281092396992, + "grad_norm": 2.380255699157715, + "learning_rate": 4.675703673475464e-05, + "loss": 4.5618, + "step": 27566 + }, + { + "epoch": 0.16394875820725094, + "grad_norm": 1.846535563468933, + "learning_rate": 4.675680665936255e-05, + "loss": 4.9291, + "step": 27567 + }, + { + "epoch": 0.16395470549053193, + "grad_norm": 1.9701546430587769, + "learning_rate": 4.675657657637538e-05, + "loss": 4.4594, + "step": 27568 + }, + { + "epoch": 0.16396065277381291, + "grad_norm": 2.15051007270813, + "learning_rate": 4.675634648579322e-05, + "loss": 4.0397, + "step": 27569 + }, + { + "epoch": 0.16396660005709393, + "grad_norm": 1.7181464433670044, + "learning_rate": 4.6756116387616136e-05, + "loss": 5.0483, + "step": 27570 + }, + { + "epoch": 0.16397254734037492, + "grad_norm": 1.3659751415252686, + "learning_rate": 4.675588628184422e-05, + "loss": 5.0627, + "step": 27571 + }, + { + "epoch": 0.1639784946236559, + "grad_norm": 1.7381535768508911, + "learning_rate": 4.6755656168477553e-05, + "loss": 4.8013, + "step": 27572 + }, + { + "epoch": 0.16398444190693692, + "grad_norm": 1.9152921438217163, + "learning_rate": 4.6755426047516205e-05, + "loss": 4.5437, + "step": 27573 + }, + { + "epoch": 0.1639903891902179, + "grad_norm": 1.449018955230713, + "learning_rate": 4.675519591896026e-05, + "loss": 5.046, + "step": 27574 + }, + { + "epoch": 0.1639963364734989, + "grad_norm": 2.2243831157684326, + "learning_rate": 4.675496578280981e-05, + "loss": 4.0585, + "step": 27575 + }, + { + "epoch": 0.16400228375677992, + "grad_norm": 1.9781684875488281, + "learning_rate": 4.675473563906492e-05, + "loss": 4.6334, + "step": 27576 + }, + { + "epoch": 0.1640082310400609, + "grad_norm": 1.9873735904693604, + "learning_rate": 4.675450548772568e-05, + "loss": 4.6854, + "step": 27577 + }, + { + "epoch": 0.1640141783233419, + "grad_norm": 1.914959192276001, + "learning_rate": 4.675427532879216e-05, + "loss": 4.7866, + "step": 27578 + }, + { + "epoch": 0.1640201256066229, + "grad_norm": 1.8510034084320068, + "learning_rate": 4.675404516226446e-05, + "loss": 4.4274, + "step": 27579 + }, + { + "epoch": 0.1640260728899039, + "grad_norm": 1.726172924041748, + "learning_rate": 4.6753814988142644e-05, + "loss": 4.4166, + "step": 27580 + }, + { + "epoch": 0.16403202017318488, + "grad_norm": 1.7206041812896729, + "learning_rate": 4.6753584806426786e-05, + "loss": 4.3724, + "step": 27581 + }, + { + "epoch": 0.1640379674564659, + "grad_norm": 1.9253183603286743, + "learning_rate": 4.6753354617116987e-05, + "loss": 3.8641, + "step": 27582 + }, + { + "epoch": 0.1640439147397469, + "grad_norm": 1.9023802280426025, + "learning_rate": 4.6753124420213306e-05, + "loss": 4.231, + "step": 27583 + }, + { + "epoch": 0.16404986202302788, + "grad_norm": 2.092531442642212, + "learning_rate": 4.675289421571584e-05, + "loss": 4.4025, + "step": 27584 + }, + { + "epoch": 0.1640558093063089, + "grad_norm": 2.0559768676757812, + "learning_rate": 4.675266400362466e-05, + "loss": 4.4643, + "step": 27585 + }, + { + "epoch": 0.16406175658958988, + "grad_norm": 2.1016385555267334, + "learning_rate": 4.6752433783939855e-05, + "loss": 4.4391, + "step": 27586 + }, + { + "epoch": 0.16406770387287087, + "grad_norm": 2.07698130607605, + "learning_rate": 4.67522035566615e-05, + "loss": 4.483, + "step": 27587 + }, + { + "epoch": 0.16407365115615186, + "grad_norm": 2.172579288482666, + "learning_rate": 4.6751973321789675e-05, + "loss": 4.2118, + "step": 27588 + }, + { + "epoch": 0.16407959843943287, + "grad_norm": 2.1808786392211914, + "learning_rate": 4.675174307932446e-05, + "loss": 4.4722, + "step": 27589 + }, + { + "epoch": 0.16408554572271386, + "grad_norm": 2.163482427597046, + "learning_rate": 4.675151282926593e-05, + "loss": 4.747, + "step": 27590 + }, + { + "epoch": 0.16409149300599485, + "grad_norm": 2.431328773498535, + "learning_rate": 4.675128257161418e-05, + "loss": 4.0239, + "step": 27591 + }, + { + "epoch": 0.16409744028927586, + "grad_norm": 2.2003822326660156, + "learning_rate": 4.675105230636928e-05, + "loss": 4.2945, + "step": 27592 + }, + { + "epoch": 0.16410338757255685, + "grad_norm": 1.8259824514389038, + "learning_rate": 4.675082203353131e-05, + "loss": 4.3246, + "step": 27593 + }, + { + "epoch": 0.16410933485583784, + "grad_norm": 2.02915358543396, + "learning_rate": 4.6750591753100356e-05, + "loss": 5.5056, + "step": 27594 + }, + { + "epoch": 0.16411528213911886, + "grad_norm": 2.2010276317596436, + "learning_rate": 4.675036146507649e-05, + "loss": 5.3688, + "step": 27595 + }, + { + "epoch": 0.16412122942239984, + "grad_norm": 1.8411953449249268, + "learning_rate": 4.6750131169459806e-05, + "loss": 5.6156, + "step": 27596 + }, + { + "epoch": 0.16412717670568083, + "grad_norm": 1.8446851968765259, + "learning_rate": 4.674990086625037e-05, + "loss": 5.1344, + "step": 27597 + }, + { + "epoch": 0.16413312398896185, + "grad_norm": 1.7121134996414185, + "learning_rate": 4.674967055544827e-05, + "loss": 5.164, + "step": 27598 + }, + { + "epoch": 0.16413907127224284, + "grad_norm": 1.68525230884552, + "learning_rate": 4.6749440237053574e-05, + "loss": 4.9757, + "step": 27599 + }, + { + "epoch": 0.16414501855552383, + "grad_norm": 1.9436984062194824, + "learning_rate": 4.6749209911066396e-05, + "loss": 4.4168, + "step": 27600 + }, + { + "epoch": 0.16415096583880484, + "grad_norm": 1.9261338710784912, + "learning_rate": 4.6748979577486774e-05, + "loss": 4.5949, + "step": 27601 + }, + { + "epoch": 0.16415691312208583, + "grad_norm": 1.4877192974090576, + "learning_rate": 4.6748749236314816e-05, + "loss": 5.0274, + "step": 27602 + }, + { + "epoch": 0.16416286040536682, + "grad_norm": 2.030029296875, + "learning_rate": 4.674851888755059e-05, + "loss": 5.3301, + "step": 27603 + }, + { + "epoch": 0.16416880768864783, + "grad_norm": 1.4313018321990967, + "learning_rate": 4.674828853119418e-05, + "loss": 4.9408, + "step": 27604 + }, + { + "epoch": 0.16417475497192882, + "grad_norm": 1.4011638164520264, + "learning_rate": 4.674805816724568e-05, + "loss": 5.2628, + "step": 27605 + }, + { + "epoch": 0.1641807022552098, + "grad_norm": 1.6607071161270142, + "learning_rate": 4.674782779570514e-05, + "loss": 5.0739, + "step": 27606 + }, + { + "epoch": 0.16418664953849083, + "grad_norm": 2.07830810546875, + "learning_rate": 4.674759741657267e-05, + "loss": 4.7624, + "step": 27607 + }, + { + "epoch": 0.1641925968217718, + "grad_norm": 2.402186870574951, + "learning_rate": 4.674736702984833e-05, + "loss": 4.2407, + "step": 27608 + }, + { + "epoch": 0.1641985441050528, + "grad_norm": 2.498345136642456, + "learning_rate": 4.674713663553222e-05, + "loss": 4.2357, + "step": 27609 + }, + { + "epoch": 0.16420449138833382, + "grad_norm": 2.4307384490966797, + "learning_rate": 4.67469062336244e-05, + "loss": 4.4379, + "step": 27610 + }, + { + "epoch": 0.1642104386716148, + "grad_norm": 1.721940279006958, + "learning_rate": 4.6746675824124964e-05, + "loss": 4.4393, + "step": 27611 + }, + { + "epoch": 0.1642163859548958, + "grad_norm": 1.9504097700119019, + "learning_rate": 4.674644540703399e-05, + "loss": 4.753, + "step": 27612 + }, + { + "epoch": 0.1642223332381768, + "grad_norm": 2.2953338623046875, + "learning_rate": 4.674621498235155e-05, + "loss": 4.7865, + "step": 27613 + }, + { + "epoch": 0.1642282805214578, + "grad_norm": 2.291163921356201, + "learning_rate": 4.674598455007773e-05, + "loss": 4.7659, + "step": 27614 + }, + { + "epoch": 0.16423422780473879, + "grad_norm": 2.1821818351745605, + "learning_rate": 4.674575411021262e-05, + "loss": 4.0771, + "step": 27615 + }, + { + "epoch": 0.1642401750880198, + "grad_norm": 2.2602016925811768, + "learning_rate": 4.6745523662756286e-05, + "loss": 4.2426, + "step": 27616 + }, + { + "epoch": 0.1642461223713008, + "grad_norm": 1.443772792816162, + "learning_rate": 4.674529320770882e-05, + "loss": 5.2936, + "step": 27617 + }, + { + "epoch": 0.16425206965458178, + "grad_norm": 2.0360827445983887, + "learning_rate": 4.674506274507029e-05, + "loss": 5.3444, + "step": 27618 + }, + { + "epoch": 0.1642580169378628, + "grad_norm": 1.7705327272415161, + "learning_rate": 4.6744832274840786e-05, + "loss": 5.0619, + "step": 27619 + }, + { + "epoch": 0.16426396422114378, + "grad_norm": 2.3405168056488037, + "learning_rate": 4.6744601797020384e-05, + "loss": 4.0113, + "step": 27620 + }, + { + "epoch": 0.16426991150442477, + "grad_norm": 1.6145120859146118, + "learning_rate": 4.674437131160917e-05, + "loss": 4.87, + "step": 27621 + }, + { + "epoch": 0.1642758587877058, + "grad_norm": 1.7102009057998657, + "learning_rate": 4.674414081860722e-05, + "loss": 5.2878, + "step": 27622 + }, + { + "epoch": 0.16428180607098677, + "grad_norm": 1.5974667072296143, + "learning_rate": 4.674391031801461e-05, + "loss": 5.1225, + "step": 27623 + }, + { + "epoch": 0.16428775335426776, + "grad_norm": 1.7934401035308838, + "learning_rate": 4.674367980983143e-05, + "loss": 5.1496, + "step": 27624 + }, + { + "epoch": 0.16429370063754878, + "grad_norm": 1.625554084777832, + "learning_rate": 4.674344929405775e-05, + "loss": 4.9198, + "step": 27625 + }, + { + "epoch": 0.16429964792082977, + "grad_norm": 1.5650711059570312, + "learning_rate": 4.674321877069366e-05, + "loss": 5.6505, + "step": 27626 + }, + { + "epoch": 0.16430559520411075, + "grad_norm": 1.8613455295562744, + "learning_rate": 4.674298823973924e-05, + "loss": 5.6026, + "step": 27627 + }, + { + "epoch": 0.16431154248739177, + "grad_norm": 1.617720603942871, + "learning_rate": 4.674275770119457e-05, + "loss": 5.4009, + "step": 27628 + }, + { + "epoch": 0.16431748977067276, + "grad_norm": 1.937449336051941, + "learning_rate": 4.6742527155059724e-05, + "loss": 4.8275, + "step": 27629 + }, + { + "epoch": 0.16432343705395375, + "grad_norm": 2.541095733642578, + "learning_rate": 4.674229660133479e-05, + "loss": 4.0442, + "step": 27630 + }, + { + "epoch": 0.16432938433723476, + "grad_norm": 2.760444402694702, + "learning_rate": 4.674206604001984e-05, + "loss": 3.19, + "step": 27631 + }, + { + "epoch": 0.16433533162051575, + "grad_norm": 2.561680316925049, + "learning_rate": 4.674183547111496e-05, + "loss": 3.9053, + "step": 27632 + }, + { + "epoch": 0.16434127890379674, + "grad_norm": 2.6636784076690674, + "learning_rate": 4.6741604894620225e-05, + "loss": 4.1, + "step": 27633 + }, + { + "epoch": 0.16434722618707776, + "grad_norm": 2.010796070098877, + "learning_rate": 4.674137431053573e-05, + "loss": 4.5599, + "step": 27634 + }, + { + "epoch": 0.16435317347035874, + "grad_norm": 2.131115198135376, + "learning_rate": 4.674114371886154e-05, + "loss": 4.2314, + "step": 27635 + }, + { + "epoch": 0.16435912075363973, + "grad_norm": 2.2468631267547607, + "learning_rate": 4.674091311959774e-05, + "loss": 4.3132, + "step": 27636 + }, + { + "epoch": 0.16436506803692075, + "grad_norm": 2.325503349304199, + "learning_rate": 4.674068251274442e-05, + "loss": 3.7301, + "step": 27637 + }, + { + "epoch": 0.16437101532020174, + "grad_norm": 2.631612777709961, + "learning_rate": 4.6740451898301646e-05, + "loss": 3.6578, + "step": 27638 + }, + { + "epoch": 0.16437696260348272, + "grad_norm": 2.2272074222564697, + "learning_rate": 4.67402212762695e-05, + "loss": 4.1707, + "step": 27639 + }, + { + "epoch": 0.16438290988676374, + "grad_norm": 1.6620466709136963, + "learning_rate": 4.673999064664808e-05, + "loss": 5.1998, + "step": 27640 + }, + { + "epoch": 0.16438885717004473, + "grad_norm": 2.39687442779541, + "learning_rate": 4.673976000943745e-05, + "loss": 4.99, + "step": 27641 + }, + { + "epoch": 0.16439480445332572, + "grad_norm": 2.3301122188568115, + "learning_rate": 4.673952936463769e-05, + "loss": 4.7562, + "step": 27642 + }, + { + "epoch": 0.16440075173660673, + "grad_norm": 2.335031509399414, + "learning_rate": 4.6739298712248887e-05, + "loss": 4.6406, + "step": 27643 + }, + { + "epoch": 0.16440669901988772, + "grad_norm": 2.3373608589172363, + "learning_rate": 4.6739068052271115e-05, + "loss": 4.555, + "step": 27644 + }, + { + "epoch": 0.1644126463031687, + "grad_norm": 1.887984037399292, + "learning_rate": 4.6738837384704463e-05, + "loss": 5.0687, + "step": 27645 + }, + { + "epoch": 0.1644185935864497, + "grad_norm": 2.8348052501678467, + "learning_rate": 4.673860670954901e-05, + "loss": 5.0324, + "step": 27646 + }, + { + "epoch": 0.1644245408697307, + "grad_norm": 2.3812403678894043, + "learning_rate": 4.673837602680483e-05, + "loss": 5.1471, + "step": 27647 + }, + { + "epoch": 0.1644304881530117, + "grad_norm": 2.797342300415039, + "learning_rate": 4.673814533647201e-05, + "loss": 4.9506, + "step": 27648 + }, + { + "epoch": 0.1644364354362927, + "grad_norm": 2.2026922702789307, + "learning_rate": 4.673791463855063e-05, + "loss": 4.8893, + "step": 27649 + }, + { + "epoch": 0.1644423827195737, + "grad_norm": 1.6675883531570435, + "learning_rate": 4.6737683933040766e-05, + "loss": 5.247, + "step": 27650 + }, + { + "epoch": 0.1644483300028547, + "grad_norm": 1.771507978439331, + "learning_rate": 4.6737453219942495e-05, + "loss": 5.0371, + "step": 27651 + }, + { + "epoch": 0.16445427728613568, + "grad_norm": 1.753718614578247, + "learning_rate": 4.6737222499255914e-05, + "loss": 4.9673, + "step": 27652 + }, + { + "epoch": 0.1644602245694167, + "grad_norm": 2.460538387298584, + "learning_rate": 4.673699177098109e-05, + "loss": 5.0578, + "step": 27653 + }, + { + "epoch": 0.16446617185269768, + "grad_norm": 2.2908952236175537, + "learning_rate": 4.6736761035118104e-05, + "loss": 4.9473, + "step": 27654 + }, + { + "epoch": 0.16447211913597867, + "grad_norm": 2.1169328689575195, + "learning_rate": 4.673653029166704e-05, + "loss": 4.8466, + "step": 27655 + }, + { + "epoch": 0.1644780664192597, + "grad_norm": 1.8647359609603882, + "learning_rate": 4.673629954062797e-05, + "loss": 4.9256, + "step": 27656 + }, + { + "epoch": 0.16448401370254068, + "grad_norm": 2.2176151275634766, + "learning_rate": 4.6736068782001e-05, + "loss": 5.1344, + "step": 27657 + }, + { + "epoch": 0.16448996098582166, + "grad_norm": 2.300567626953125, + "learning_rate": 4.6735838015786185e-05, + "loss": 4.9018, + "step": 27658 + }, + { + "epoch": 0.16449590826910268, + "grad_norm": 2.458017110824585, + "learning_rate": 4.673560724198361e-05, + "loss": 5.2333, + "step": 27659 + }, + { + "epoch": 0.16450185555238367, + "grad_norm": 2.418851852416992, + "learning_rate": 4.673537646059336e-05, + "loss": 5.0428, + "step": 27660 + }, + { + "epoch": 0.16450780283566466, + "grad_norm": 2.163425922393799, + "learning_rate": 4.673514567161551e-05, + "loss": 5.2115, + "step": 27661 + }, + { + "epoch": 0.16451375011894567, + "grad_norm": 2.171957492828369, + "learning_rate": 4.673491487505015e-05, + "loss": 5.1336, + "step": 27662 + }, + { + "epoch": 0.16451969740222666, + "grad_norm": 1.6024816036224365, + "learning_rate": 4.6734684070897364e-05, + "loss": 5.2832, + "step": 27663 + }, + { + "epoch": 0.16452564468550765, + "grad_norm": 2.581366777420044, + "learning_rate": 4.673445325915722e-05, + "loss": 4.2245, + "step": 27664 + }, + { + "epoch": 0.16453159196878867, + "grad_norm": 2.65466570854187, + "learning_rate": 4.67342224398298e-05, + "loss": 3.8786, + "step": 27665 + }, + { + "epoch": 0.16453753925206965, + "grad_norm": 1.909327745437622, + "learning_rate": 4.673399161291519e-05, + "loss": 5.2398, + "step": 27666 + }, + { + "epoch": 0.16454348653535064, + "grad_norm": 2.0884993076324463, + "learning_rate": 4.673376077841346e-05, + "loss": 4.8081, + "step": 27667 + }, + { + "epoch": 0.16454943381863166, + "grad_norm": 2.1802215576171875, + "learning_rate": 4.67335299363247e-05, + "loss": 4.9251, + "step": 27668 + }, + { + "epoch": 0.16455538110191265, + "grad_norm": 2.281020402908325, + "learning_rate": 4.6733299086648996e-05, + "loss": 4.2682, + "step": 27669 + }, + { + "epoch": 0.16456132838519363, + "grad_norm": 2.34698224067688, + "learning_rate": 4.673306822938642e-05, + "loss": 3.8815, + "step": 27670 + }, + { + "epoch": 0.16456727566847465, + "grad_norm": 2.84965181350708, + "learning_rate": 4.673283736453705e-05, + "loss": 3.8124, + "step": 27671 + }, + { + "epoch": 0.16457322295175564, + "grad_norm": 2.604818344116211, + "learning_rate": 4.673260649210098e-05, + "loss": 3.8991, + "step": 27672 + }, + { + "epoch": 0.16457917023503663, + "grad_norm": 2.5472776889801025, + "learning_rate": 4.673237561207827e-05, + "loss": 3.8003, + "step": 27673 + }, + { + "epoch": 0.16458511751831764, + "grad_norm": 1.9040625095367432, + "learning_rate": 4.673214472446902e-05, + "loss": 4.1075, + "step": 27674 + }, + { + "epoch": 0.16459106480159863, + "grad_norm": 1.5493569374084473, + "learning_rate": 4.6731913829273303e-05, + "loss": 5.5934, + "step": 27675 + }, + { + "epoch": 0.16459701208487962, + "grad_norm": 1.683307409286499, + "learning_rate": 4.67316829264912e-05, + "loss": 5.3139, + "step": 27676 + }, + { + "epoch": 0.16460295936816063, + "grad_norm": 1.5558831691741943, + "learning_rate": 4.673145201612279e-05, + "loss": 5.331, + "step": 27677 + }, + { + "epoch": 0.16460890665144162, + "grad_norm": 1.7119014263153076, + "learning_rate": 4.673122109816815e-05, + "loss": 5.4438, + "step": 27678 + }, + { + "epoch": 0.1646148539347226, + "grad_norm": 2.4084794521331787, + "learning_rate": 4.673099017262737e-05, + "loss": 4.2357, + "step": 27679 + }, + { + "epoch": 0.16462080121800363, + "grad_norm": 1.8065168857574463, + "learning_rate": 4.673075923950053e-05, + "loss": 4.4894, + "step": 27680 + }, + { + "epoch": 0.16462674850128461, + "grad_norm": 1.5240797996520996, + "learning_rate": 4.673052829878769e-05, + "loss": 4.6992, + "step": 27681 + }, + { + "epoch": 0.1646326957845656, + "grad_norm": 1.9197040796279907, + "learning_rate": 4.673029735048896e-05, + "loss": 5.0591, + "step": 27682 + }, + { + "epoch": 0.16463864306784662, + "grad_norm": 1.5522626638412476, + "learning_rate": 4.673006639460441e-05, + "loss": 5.2923, + "step": 27683 + }, + { + "epoch": 0.1646445903511276, + "grad_norm": 1.663277506828308, + "learning_rate": 4.6729835431134115e-05, + "loss": 5.0555, + "step": 27684 + }, + { + "epoch": 0.1646505376344086, + "grad_norm": 1.5276461839675903, + "learning_rate": 4.672960446007816e-05, + "loss": 5.1765, + "step": 27685 + }, + { + "epoch": 0.1646564849176896, + "grad_norm": 1.5308914184570312, + "learning_rate": 4.672937348143662e-05, + "loss": 4.371, + "step": 27686 + }, + { + "epoch": 0.1646624322009706, + "grad_norm": 1.6172471046447754, + "learning_rate": 4.672914249520958e-05, + "loss": 4.8801, + "step": 27687 + }, + { + "epoch": 0.1646683794842516, + "grad_norm": 1.523914098739624, + "learning_rate": 4.6728911501397124e-05, + "loss": 4.3999, + "step": 27688 + }, + { + "epoch": 0.1646743267675326, + "grad_norm": 1.2214871644973755, + "learning_rate": 4.672868049999933e-05, + "loss": 3.7981, + "step": 27689 + }, + { + "epoch": 0.1646802740508136, + "grad_norm": 1.340168833732605, + "learning_rate": 4.672844949101628e-05, + "loss": 4.5471, + "step": 27690 + }, + { + "epoch": 0.16468622133409458, + "grad_norm": 1.8667452335357666, + "learning_rate": 4.672821847444805e-05, + "loss": 4.3881, + "step": 27691 + }, + { + "epoch": 0.1646921686173756, + "grad_norm": 3.047363042831421, + "learning_rate": 4.672798745029472e-05, + "loss": 3.7606, + "step": 27692 + }, + { + "epoch": 0.16469811590065658, + "grad_norm": 1.8616588115692139, + "learning_rate": 4.672775641855638e-05, + "loss": 5.0264, + "step": 27693 + }, + { + "epoch": 0.16470406318393757, + "grad_norm": 1.9045435190200806, + "learning_rate": 4.67275253792331e-05, + "loss": 4.5934, + "step": 27694 + }, + { + "epoch": 0.1647100104672186, + "grad_norm": 1.9803951978683472, + "learning_rate": 4.672729433232497e-05, + "loss": 4.0846, + "step": 27695 + }, + { + "epoch": 0.16471595775049958, + "grad_norm": 1.797312617301941, + "learning_rate": 4.672706327783206e-05, + "loss": 4.5876, + "step": 27696 + }, + { + "epoch": 0.16472190503378056, + "grad_norm": 1.954188585281372, + "learning_rate": 4.672683221575446e-05, + "loss": 4.3985, + "step": 27697 + }, + { + "epoch": 0.16472785231706158, + "grad_norm": 2.246690273284912, + "learning_rate": 4.6726601146092255e-05, + "loss": 4.1058, + "step": 27698 + }, + { + "epoch": 0.16473379960034257, + "grad_norm": 2.160576343536377, + "learning_rate": 4.67263700688455e-05, + "loss": 4.0139, + "step": 27699 + }, + { + "epoch": 0.16473974688362356, + "grad_norm": 2.5650711059570312, + "learning_rate": 4.672613898401431e-05, + "loss": 3.5785, + "step": 27700 + }, + { + "epoch": 0.16474569416690457, + "grad_norm": 2.6694283485412598, + "learning_rate": 4.6725907891598744e-05, + "loss": 3.4553, + "step": 27701 + }, + { + "epoch": 0.16475164145018556, + "grad_norm": 1.8965697288513184, + "learning_rate": 4.672567679159888e-05, + "loss": 3.8723, + "step": 27702 + }, + { + "epoch": 0.16475758873346655, + "grad_norm": 2.0568554401397705, + "learning_rate": 4.6725445684014824e-05, + "loss": 4.6828, + "step": 27703 + }, + { + "epoch": 0.16476353601674754, + "grad_norm": 1.7810505628585815, + "learning_rate": 4.672521456884663e-05, + "loss": 5.5463, + "step": 27704 + }, + { + "epoch": 0.16476948330002855, + "grad_norm": 1.8636524677276611, + "learning_rate": 4.6724983446094385e-05, + "loss": 4.9334, + "step": 27705 + }, + { + "epoch": 0.16477543058330954, + "grad_norm": 2.172565460205078, + "learning_rate": 4.6724752315758174e-05, + "loss": 5.4723, + "step": 27706 + }, + { + "epoch": 0.16478137786659053, + "grad_norm": 2.461881637573242, + "learning_rate": 4.672452117783808e-05, + "loss": 4.6138, + "step": 27707 + }, + { + "epoch": 0.16478732514987154, + "grad_norm": 2.3633780479431152, + "learning_rate": 4.672429003233418e-05, + "loss": 4.2636, + "step": 27708 + }, + { + "epoch": 0.16479327243315253, + "grad_norm": 2.4033286571502686, + "learning_rate": 4.6724058879246546e-05, + "loss": 3.716, + "step": 27709 + }, + { + "epoch": 0.16479921971643352, + "grad_norm": 2.201249599456787, + "learning_rate": 4.672382771857527e-05, + "loss": 4.9046, + "step": 27710 + }, + { + "epoch": 0.16480516699971454, + "grad_norm": 2.0308284759521484, + "learning_rate": 4.672359655032044e-05, + "loss": 4.255, + "step": 27711 + }, + { + "epoch": 0.16481111428299552, + "grad_norm": 2.46120023727417, + "learning_rate": 4.672336537448212e-05, + "loss": 3.7853, + "step": 27712 + }, + { + "epoch": 0.1648170615662765, + "grad_norm": 2.130208969116211, + "learning_rate": 4.6723134191060404e-05, + "loss": 3.8114, + "step": 27713 + }, + { + "epoch": 0.16482300884955753, + "grad_norm": 2.138585329055786, + "learning_rate": 4.672290300005536e-05, + "loss": 4.6266, + "step": 27714 + }, + { + "epoch": 0.16482895613283852, + "grad_norm": 1.8015727996826172, + "learning_rate": 4.6722671801467074e-05, + "loss": 4.2178, + "step": 27715 + }, + { + "epoch": 0.1648349034161195, + "grad_norm": 2.3047871589660645, + "learning_rate": 4.672244059529564e-05, + "loss": 4.258, + "step": 27716 + }, + { + "epoch": 0.16484085069940052, + "grad_norm": 2.022953987121582, + "learning_rate": 4.672220938154111e-05, + "loss": 3.7605, + "step": 27717 + }, + { + "epoch": 0.1648467979826815, + "grad_norm": 2.3721368312835693, + "learning_rate": 4.672197816020358e-05, + "loss": 3.6132, + "step": 27718 + }, + { + "epoch": 0.1648527452659625, + "grad_norm": 1.9578886032104492, + "learning_rate": 4.672174693128314e-05, + "loss": 3.9983, + "step": 27719 + }, + { + "epoch": 0.1648586925492435, + "grad_norm": 2.0287981033325195, + "learning_rate": 4.672151569477987e-05, + "loss": 3.8297, + "step": 27720 + }, + { + "epoch": 0.1648646398325245, + "grad_norm": 2.1453230381011963, + "learning_rate": 4.672128445069383e-05, + "loss": 3.5676, + "step": 27721 + }, + { + "epoch": 0.1648705871158055, + "grad_norm": 2.209982395172119, + "learning_rate": 4.672105319902512e-05, + "loss": 3.6304, + "step": 27722 + }, + { + "epoch": 0.1648765343990865, + "grad_norm": 2.1707348823547363, + "learning_rate": 4.672082193977382e-05, + "loss": 3.679, + "step": 27723 + }, + { + "epoch": 0.1648824816823675, + "grad_norm": 1.9688754081726074, + "learning_rate": 4.672059067293999e-05, + "loss": 4.235, + "step": 27724 + }, + { + "epoch": 0.16488842896564848, + "grad_norm": 1.988599419593811, + "learning_rate": 4.672035939852374e-05, + "loss": 3.8704, + "step": 27725 + }, + { + "epoch": 0.1648943762489295, + "grad_norm": 1.7759329080581665, + "learning_rate": 4.672012811652513e-05, + "loss": 4.5621, + "step": 27726 + }, + { + "epoch": 0.16490032353221049, + "grad_norm": 1.9790258407592773, + "learning_rate": 4.6719896826944255e-05, + "loss": 4.2214, + "step": 27727 + }, + { + "epoch": 0.16490627081549147, + "grad_norm": 1.6736228466033936, + "learning_rate": 4.671966552978118e-05, + "loss": 4.555, + "step": 27728 + }, + { + "epoch": 0.1649122180987725, + "grad_norm": 2.4587225914001465, + "learning_rate": 4.6719434225036e-05, + "loss": 4.4134, + "step": 27729 + }, + { + "epoch": 0.16491816538205348, + "grad_norm": 1.5891488790512085, + "learning_rate": 4.671920291270879e-05, + "loss": 5.4629, + "step": 27730 + }, + { + "epoch": 0.16492411266533447, + "grad_norm": 1.5606093406677246, + "learning_rate": 4.671897159279962e-05, + "loss": 4.5045, + "step": 27731 + }, + { + "epoch": 0.16493005994861548, + "grad_norm": 2.5481436252593994, + "learning_rate": 4.6718740265308595e-05, + "loss": 3.2812, + "step": 27732 + }, + { + "epoch": 0.16493600723189647, + "grad_norm": 2.602802276611328, + "learning_rate": 4.671850893023577e-05, + "loss": 3.082, + "step": 27733 + }, + { + "epoch": 0.16494195451517746, + "grad_norm": 2.3786399364471436, + "learning_rate": 4.6718277587581246e-05, + "loss": 3.5956, + "step": 27734 + }, + { + "epoch": 0.16494790179845847, + "grad_norm": 1.5555487871170044, + "learning_rate": 4.67180462373451e-05, + "loss": 5.2082, + "step": 27735 + }, + { + "epoch": 0.16495384908173946, + "grad_norm": 1.6801286935806274, + "learning_rate": 4.67178148795274e-05, + "loss": 5.3879, + "step": 27736 + }, + { + "epoch": 0.16495979636502045, + "grad_norm": 1.3999351263046265, + "learning_rate": 4.671758351412824e-05, + "loss": 4.9347, + "step": 27737 + }, + { + "epoch": 0.16496574364830147, + "grad_norm": 2.48246693611145, + "learning_rate": 4.6717352141147696e-05, + "loss": 3.5764, + "step": 27738 + }, + { + "epoch": 0.16497169093158245, + "grad_norm": 2.5625791549682617, + "learning_rate": 4.6717120760585844e-05, + "loss": 2.841, + "step": 27739 + }, + { + "epoch": 0.16497763821486344, + "grad_norm": 2.025188684463501, + "learning_rate": 4.6716889372442775e-05, + "loss": 3.9643, + "step": 27740 + }, + { + "epoch": 0.16498358549814446, + "grad_norm": 2.5314674377441406, + "learning_rate": 4.671665797671856e-05, + "loss": 1.9703, + "step": 27741 + }, + { + "epoch": 0.16498953278142545, + "grad_norm": 2.7406599521636963, + "learning_rate": 4.671642657341329e-05, + "loss": 3.0071, + "step": 27742 + }, + { + "epoch": 0.16499548006470643, + "grad_norm": 2.0033769607543945, + "learning_rate": 4.671619516252703e-05, + "loss": 4.5621, + "step": 27743 + }, + { + "epoch": 0.16500142734798745, + "grad_norm": 1.587997555732727, + "learning_rate": 4.6715963744059874e-05, + "loss": 4.9265, + "step": 27744 + }, + { + "epoch": 0.16500737463126844, + "grad_norm": 1.6401697397232056, + "learning_rate": 4.6715732318011905e-05, + "loss": 4.6801, + "step": 27745 + }, + { + "epoch": 0.16501332191454943, + "grad_norm": 2.994272232055664, + "learning_rate": 4.671550088438319e-05, + "loss": 2.0322, + "step": 27746 + }, + { + "epoch": 0.16501926919783044, + "grad_norm": 3.038865089416504, + "learning_rate": 4.671526944317382e-05, + "loss": 2.0138, + "step": 27747 + }, + { + "epoch": 0.16502521648111143, + "grad_norm": 2.9179296493530273, + "learning_rate": 4.671503799438388e-05, + "loss": 3.2955, + "step": 27748 + }, + { + "epoch": 0.16503116376439242, + "grad_norm": 1.7475281953811646, + "learning_rate": 4.6714806538013446e-05, + "loss": 5.4316, + "step": 27749 + }, + { + "epoch": 0.16503711104767343, + "grad_norm": 1.4781032800674438, + "learning_rate": 4.6714575074062596e-05, + "loss": 5.4519, + "step": 27750 + }, + { + "epoch": 0.16504305833095442, + "grad_norm": 1.3684823513031006, + "learning_rate": 4.6714343602531404e-05, + "loss": 5.3335, + "step": 27751 + }, + { + "epoch": 0.1650490056142354, + "grad_norm": 1.6875170469284058, + "learning_rate": 4.6714112123419965e-05, + "loss": 5.0396, + "step": 27752 + }, + { + "epoch": 0.16505495289751643, + "grad_norm": 1.6213173866271973, + "learning_rate": 4.6713880636728346e-05, + "loss": 4.763, + "step": 27753 + }, + { + "epoch": 0.16506090018079742, + "grad_norm": 1.5345633029937744, + "learning_rate": 4.6713649142456644e-05, + "loss": 4.9192, + "step": 27754 + }, + { + "epoch": 0.1650668474640784, + "grad_norm": 1.9773199558258057, + "learning_rate": 4.671341764060493e-05, + "loss": 4.7158, + "step": 27755 + }, + { + "epoch": 0.16507279474735942, + "grad_norm": 1.786027193069458, + "learning_rate": 4.6713186131173284e-05, + "loss": 5.6319, + "step": 27756 + }, + { + "epoch": 0.1650787420306404, + "grad_norm": 1.5743745565414429, + "learning_rate": 4.6712954614161797e-05, + "loss": 5.5466, + "step": 27757 + }, + { + "epoch": 0.1650846893139214, + "grad_norm": 1.6003429889678955, + "learning_rate": 4.671272308957053e-05, + "loss": 5.5166, + "step": 27758 + }, + { + "epoch": 0.1650906365972024, + "grad_norm": 1.567775011062622, + "learning_rate": 4.6712491557399585e-05, + "loss": 5.1731, + "step": 27759 + }, + { + "epoch": 0.1650965838804834, + "grad_norm": 1.7042558193206787, + "learning_rate": 4.671226001764903e-05, + "loss": 4.7854, + "step": 27760 + }, + { + "epoch": 0.1651025311637644, + "grad_norm": 2.414813280105591, + "learning_rate": 4.6712028470318946e-05, + "loss": 3.969, + "step": 27761 + }, + { + "epoch": 0.16510847844704538, + "grad_norm": 2.2361044883728027, + "learning_rate": 4.671179691540942e-05, + "loss": 4.0416, + "step": 27762 + }, + { + "epoch": 0.1651144257303264, + "grad_norm": 1.4998681545257568, + "learning_rate": 4.6711565352920526e-05, + "loss": 4.0769, + "step": 27763 + }, + { + "epoch": 0.16512037301360738, + "grad_norm": 1.8944214582443237, + "learning_rate": 4.6711333782852364e-05, + "loss": 3.9101, + "step": 27764 + }, + { + "epoch": 0.16512632029688837, + "grad_norm": 2.432981252670288, + "learning_rate": 4.671110220520498e-05, + "loss": 3.7838, + "step": 27765 + }, + { + "epoch": 0.16513226758016938, + "grad_norm": 2.3724024295806885, + "learning_rate": 4.6710870619978486e-05, + "loss": 4.0045, + "step": 27766 + }, + { + "epoch": 0.16513821486345037, + "grad_norm": 2.136061429977417, + "learning_rate": 4.671063902717295e-05, + "loss": 4.3335, + "step": 27767 + }, + { + "epoch": 0.16514416214673136, + "grad_norm": 2.263643264770508, + "learning_rate": 4.671040742678845e-05, + "loss": 4.417, + "step": 27768 + }, + { + "epoch": 0.16515010943001238, + "grad_norm": 2.2661242485046387, + "learning_rate": 4.671017581882507e-05, + "loss": 3.9163, + "step": 27769 + }, + { + "epoch": 0.16515605671329336, + "grad_norm": 1.9908580780029297, + "learning_rate": 4.6709944203282905e-05, + "loss": 4.5396, + "step": 27770 + }, + { + "epoch": 0.16516200399657435, + "grad_norm": 1.7676030397415161, + "learning_rate": 4.6709712580162014e-05, + "loss": 4.3445, + "step": 27771 + }, + { + "epoch": 0.16516795127985537, + "grad_norm": 2.308959722518921, + "learning_rate": 4.670948094946248e-05, + "loss": 3.3659, + "step": 27772 + }, + { + "epoch": 0.16517389856313636, + "grad_norm": 2.0675418376922607, + "learning_rate": 4.67092493111844e-05, + "loss": 3.5967, + "step": 27773 + }, + { + "epoch": 0.16517984584641734, + "grad_norm": 2.192368268966675, + "learning_rate": 4.670901766532784e-05, + "loss": 3.7969, + "step": 27774 + }, + { + "epoch": 0.16518579312969836, + "grad_norm": 2.0077974796295166, + "learning_rate": 4.670878601189289e-05, + "loss": 3.43, + "step": 27775 + }, + { + "epoch": 0.16519174041297935, + "grad_norm": 2.169884443283081, + "learning_rate": 4.670855435087963e-05, + "loss": 4.8072, + "step": 27776 + }, + { + "epoch": 0.16519768769626034, + "grad_norm": 2.4910061359405518, + "learning_rate": 4.670832268228813e-05, + "loss": 3.5874, + "step": 27777 + }, + { + "epoch": 0.16520363497954135, + "grad_norm": 2.0694758892059326, + "learning_rate": 4.670809100611848e-05, + "loss": 4.3965, + "step": 27778 + }, + { + "epoch": 0.16520958226282234, + "grad_norm": 1.5337821245193481, + "learning_rate": 4.670785932237076e-05, + "loss": 4.8369, + "step": 27779 + }, + { + "epoch": 0.16521552954610333, + "grad_norm": 1.8797821998596191, + "learning_rate": 4.670762763104506e-05, + "loss": 5.2661, + "step": 27780 + }, + { + "epoch": 0.16522147682938434, + "grad_norm": 1.6902687549591064, + "learning_rate": 4.670739593214144e-05, + "loss": 5.4648, + "step": 27781 + }, + { + "epoch": 0.16522742411266533, + "grad_norm": 1.485190987586975, + "learning_rate": 4.670716422565999e-05, + "loss": 4.9547, + "step": 27782 + }, + { + "epoch": 0.16523337139594632, + "grad_norm": 1.7863634824752808, + "learning_rate": 4.670693251160081e-05, + "loss": 4.9542, + "step": 27783 + }, + { + "epoch": 0.16523931867922734, + "grad_norm": 1.7533354759216309, + "learning_rate": 4.670670078996395e-05, + "loss": 4.7394, + "step": 27784 + }, + { + "epoch": 0.16524526596250833, + "grad_norm": 1.7423986196517944, + "learning_rate": 4.670646906074951e-05, + "loss": 4.8273, + "step": 27785 + }, + { + "epoch": 0.1652512132457893, + "grad_norm": 1.3752869367599487, + "learning_rate": 4.670623732395756e-05, + "loss": 5.0926, + "step": 27786 + }, + { + "epoch": 0.16525716052907033, + "grad_norm": 1.5826581716537476, + "learning_rate": 4.67060055795882e-05, + "loss": 5.167, + "step": 27787 + }, + { + "epoch": 0.16526310781235132, + "grad_norm": 1.5029367208480835, + "learning_rate": 4.6705773827641485e-05, + "loss": 5.145, + "step": 27788 + }, + { + "epoch": 0.1652690550956323, + "grad_norm": 1.720220923423767, + "learning_rate": 4.670554206811751e-05, + "loss": 5.2389, + "step": 27789 + }, + { + "epoch": 0.16527500237891332, + "grad_norm": 1.8043471574783325, + "learning_rate": 4.6705310301016355e-05, + "loss": 5.0942, + "step": 27790 + }, + { + "epoch": 0.1652809496621943, + "grad_norm": 1.7888808250427246, + "learning_rate": 4.670507852633811e-05, + "loss": 5.2764, + "step": 27791 + }, + { + "epoch": 0.1652868969454753, + "grad_norm": 1.6223100423812866, + "learning_rate": 4.6704846744082835e-05, + "loss": 5.2812, + "step": 27792 + }, + { + "epoch": 0.1652928442287563, + "grad_norm": 1.5120409727096558, + "learning_rate": 4.670461495425063e-05, + "loss": 5.2022, + "step": 27793 + }, + { + "epoch": 0.1652987915120373, + "grad_norm": 1.5947920083999634, + "learning_rate": 4.670438315684156e-05, + "loss": 5.2711, + "step": 27794 + }, + { + "epoch": 0.1653047387953183, + "grad_norm": 1.6690993309020996, + "learning_rate": 4.6704151351855716e-05, + "loss": 4.8284, + "step": 27795 + }, + { + "epoch": 0.1653106860785993, + "grad_norm": 1.4904134273529053, + "learning_rate": 4.670391953929318e-05, + "loss": 5.2171, + "step": 27796 + }, + { + "epoch": 0.1653166333618803, + "grad_norm": 1.556333065032959, + "learning_rate": 4.6703687719154034e-05, + "loss": 5.6598, + "step": 27797 + }, + { + "epoch": 0.16532258064516128, + "grad_norm": 1.55083167552948, + "learning_rate": 4.670345589143835e-05, + "loss": 5.5919, + "step": 27798 + }, + { + "epoch": 0.1653285279284423, + "grad_norm": 1.9281244277954102, + "learning_rate": 4.670322405614621e-05, + "loss": 5.3313, + "step": 27799 + }, + { + "epoch": 0.1653344752117233, + "grad_norm": 1.4666374921798706, + "learning_rate": 4.670299221327771e-05, + "loss": 5.0905, + "step": 27800 + }, + { + "epoch": 0.16534042249500427, + "grad_norm": 1.8032478094100952, + "learning_rate": 4.670276036283291e-05, + "loss": 4.9322, + "step": 27801 + }, + { + "epoch": 0.1653463697782853, + "grad_norm": 1.7652195692062378, + "learning_rate": 4.67025285048119e-05, + "loss": 4.6763, + "step": 27802 + }, + { + "epoch": 0.16535231706156628, + "grad_norm": 1.7903348207473755, + "learning_rate": 4.6702296639214766e-05, + "loss": 4.491, + "step": 27803 + }, + { + "epoch": 0.16535826434484727, + "grad_norm": 1.6135162115097046, + "learning_rate": 4.6702064766041584e-05, + "loss": 4.3593, + "step": 27804 + }, + { + "epoch": 0.16536421162812828, + "grad_norm": 1.5779284238815308, + "learning_rate": 4.670183288529243e-05, + "loss": 4.3606, + "step": 27805 + }, + { + "epoch": 0.16537015891140927, + "grad_norm": 1.6469144821166992, + "learning_rate": 4.67016009969674e-05, + "loss": 4.3772, + "step": 27806 + }, + { + "epoch": 0.16537610619469026, + "grad_norm": 2.209540367126465, + "learning_rate": 4.670136910106656e-05, + "loss": 5.1859, + "step": 27807 + }, + { + "epoch": 0.16538205347797127, + "grad_norm": 2.5719592571258545, + "learning_rate": 4.670113719758999e-05, + "loss": 5.1312, + "step": 27808 + }, + { + "epoch": 0.16538800076125226, + "grad_norm": 2.1322646141052246, + "learning_rate": 4.670090528653779e-05, + "loss": 5.5602, + "step": 27809 + }, + { + "epoch": 0.16539394804453325, + "grad_norm": 1.8350342512130737, + "learning_rate": 4.670067336791002e-05, + "loss": 5.6963, + "step": 27810 + }, + { + "epoch": 0.16539989532781427, + "grad_norm": 1.6520220041275024, + "learning_rate": 4.670044144170677e-05, + "loss": 5.8053, + "step": 27811 + }, + { + "epoch": 0.16540584261109526, + "grad_norm": 1.559950590133667, + "learning_rate": 4.670020950792812e-05, + "loss": 5.5382, + "step": 27812 + }, + { + "epoch": 0.16541178989437624, + "grad_norm": 1.7970432043075562, + "learning_rate": 4.669997756657415e-05, + "loss": 4.7823, + "step": 27813 + }, + { + "epoch": 0.16541773717765726, + "grad_norm": 1.8613402843475342, + "learning_rate": 4.6699745617644945e-05, + "loss": 5.3559, + "step": 27814 + }, + { + "epoch": 0.16542368446093825, + "grad_norm": 2.660762310028076, + "learning_rate": 4.669951366114058e-05, + "loss": 4.7255, + "step": 27815 + }, + { + "epoch": 0.16542963174421924, + "grad_norm": 2.8636231422424316, + "learning_rate": 4.669928169706114e-05, + "loss": 4.8591, + "step": 27816 + }, + { + "epoch": 0.16543557902750025, + "grad_norm": 1.6894406080245972, + "learning_rate": 4.669904972540671e-05, + "loss": 5.0576, + "step": 27817 + }, + { + "epoch": 0.16544152631078124, + "grad_norm": 2.539830207824707, + "learning_rate": 4.669881774617736e-05, + "loss": 4.9346, + "step": 27818 + }, + { + "epoch": 0.16544747359406223, + "grad_norm": 2.0870940685272217, + "learning_rate": 4.669858575937318e-05, + "loss": 5.0034, + "step": 27819 + }, + { + "epoch": 0.16545342087734322, + "grad_norm": 1.6307538747787476, + "learning_rate": 4.669835376499425e-05, + "loss": 5.4536, + "step": 27820 + }, + { + "epoch": 0.16545936816062423, + "grad_norm": 1.1906611919403076, + "learning_rate": 4.669812176304064e-05, + "loss": 5.5965, + "step": 27821 + }, + { + "epoch": 0.16546531544390522, + "grad_norm": 1.5987422466278076, + "learning_rate": 4.669788975351245e-05, + "loss": 5.4403, + "step": 27822 + }, + { + "epoch": 0.1654712627271862, + "grad_norm": 2.267430543899536, + "learning_rate": 4.669765773640974e-05, + "loss": 5.0344, + "step": 27823 + }, + { + "epoch": 0.16547721001046722, + "grad_norm": 2.2842605113983154, + "learning_rate": 4.669742571173261e-05, + "loss": 4.481, + "step": 27824 + }, + { + "epoch": 0.1654831572937482, + "grad_norm": 1.5940486192703247, + "learning_rate": 4.6697193679481135e-05, + "loss": 5.1313, + "step": 27825 + }, + { + "epoch": 0.1654891045770292, + "grad_norm": 1.9549680948257446, + "learning_rate": 4.6696961639655386e-05, + "loss": 5.1298, + "step": 27826 + }, + { + "epoch": 0.16549505186031022, + "grad_norm": 2.387866497039795, + "learning_rate": 4.6696729592255454e-05, + "loss": 4.9029, + "step": 27827 + }, + { + "epoch": 0.1655009991435912, + "grad_norm": 1.6883118152618408, + "learning_rate": 4.669649753728142e-05, + "loss": 5.1273, + "step": 27828 + }, + { + "epoch": 0.1655069464268722, + "grad_norm": 1.6538794040679932, + "learning_rate": 4.669626547473336e-05, + "loss": 5.2022, + "step": 27829 + }, + { + "epoch": 0.1655128937101532, + "grad_norm": 1.7652950286865234, + "learning_rate": 4.669603340461136e-05, + "loss": 5.5397, + "step": 27830 + }, + { + "epoch": 0.1655188409934342, + "grad_norm": 1.6421597003936768, + "learning_rate": 4.66958013269155e-05, + "loss": 4.9982, + "step": 27831 + }, + { + "epoch": 0.16552478827671518, + "grad_norm": 1.5024685859680176, + "learning_rate": 4.669556924164586e-05, + "loss": 5.6933, + "step": 27832 + }, + { + "epoch": 0.1655307355599962, + "grad_norm": 1.4680891036987305, + "learning_rate": 4.669533714880252e-05, + "loss": 5.3262, + "step": 27833 + }, + { + "epoch": 0.1655366828432772, + "grad_norm": 1.375623345375061, + "learning_rate": 4.669510504838556e-05, + "loss": 5.2673, + "step": 27834 + }, + { + "epoch": 0.16554263012655818, + "grad_norm": 2.1354503631591797, + "learning_rate": 4.669487294039506e-05, + "loss": 4.2156, + "step": 27835 + }, + { + "epoch": 0.1655485774098392, + "grad_norm": 1.5564913749694824, + "learning_rate": 4.669464082483112e-05, + "loss": 4.7238, + "step": 27836 + }, + { + "epoch": 0.16555452469312018, + "grad_norm": 1.6255192756652832, + "learning_rate": 4.669440870169379e-05, + "loss": 5.6043, + "step": 27837 + }, + { + "epoch": 0.16556047197640117, + "grad_norm": 1.6268353462219238, + "learning_rate": 4.6694176570983174e-05, + "loss": 5.3919, + "step": 27838 + }, + { + "epoch": 0.16556641925968218, + "grad_norm": 1.5626128911972046, + "learning_rate": 4.669394443269933e-05, + "loss": 5.5142, + "step": 27839 + }, + { + "epoch": 0.16557236654296317, + "grad_norm": 1.5001987218856812, + "learning_rate": 4.669371228684237e-05, + "loss": 4.7294, + "step": 27840 + }, + { + "epoch": 0.16557831382624416, + "grad_norm": 1.5922046899795532, + "learning_rate": 4.669348013341235e-05, + "loss": 4.9363, + "step": 27841 + }, + { + "epoch": 0.16558426110952518, + "grad_norm": 1.555086374282837, + "learning_rate": 4.669324797240937e-05, + "loss": 4.6704, + "step": 27842 + }, + { + "epoch": 0.16559020839280617, + "grad_norm": 1.711538553237915, + "learning_rate": 4.66930158038335e-05, + "loss": 4.673, + "step": 27843 + }, + { + "epoch": 0.16559615567608715, + "grad_norm": 1.7905937433242798, + "learning_rate": 4.669278362768481e-05, + "loss": 4.5295, + "step": 27844 + }, + { + "epoch": 0.16560210295936817, + "grad_norm": 1.8714954853057861, + "learning_rate": 4.669255144396341e-05, + "loss": 4.699, + "step": 27845 + }, + { + "epoch": 0.16560805024264916, + "grad_norm": 1.6783734560012817, + "learning_rate": 4.669231925266935e-05, + "loss": 5.5447, + "step": 27846 + }, + { + "epoch": 0.16561399752593015, + "grad_norm": 1.3632158041000366, + "learning_rate": 4.669208705380273e-05, + "loss": 5.5541, + "step": 27847 + }, + { + "epoch": 0.16561994480921116, + "grad_norm": 1.6476699113845825, + "learning_rate": 4.669185484736362e-05, + "loss": 4.5751, + "step": 27848 + }, + { + "epoch": 0.16562589209249215, + "grad_norm": 1.630963921546936, + "learning_rate": 4.669162263335212e-05, + "loss": 5.3621, + "step": 27849 + }, + { + "epoch": 0.16563183937577314, + "grad_norm": 1.4858328104019165, + "learning_rate": 4.66913904117683e-05, + "loss": 5.3973, + "step": 27850 + }, + { + "epoch": 0.16563778665905415, + "grad_norm": 1.7069036960601807, + "learning_rate": 4.669115818261223e-05, + "loss": 5.0102, + "step": 27851 + }, + { + "epoch": 0.16564373394233514, + "grad_norm": 1.4385701417922974, + "learning_rate": 4.6690925945884e-05, + "loss": 5.4805, + "step": 27852 + }, + { + "epoch": 0.16564968122561613, + "grad_norm": 1.6895365715026855, + "learning_rate": 4.66906937015837e-05, + "loss": 4.9834, + "step": 27853 + }, + { + "epoch": 0.16565562850889715, + "grad_norm": 2.1618361473083496, + "learning_rate": 4.66904614497114e-05, + "loss": 4.6309, + "step": 27854 + }, + { + "epoch": 0.16566157579217813, + "grad_norm": 2.331005811691284, + "learning_rate": 4.669022919026718e-05, + "loss": 4.1853, + "step": 27855 + }, + { + "epoch": 0.16566752307545912, + "grad_norm": 1.7161813974380493, + "learning_rate": 4.668999692325113e-05, + "loss": 4.5842, + "step": 27856 + }, + { + "epoch": 0.16567347035874014, + "grad_norm": 2.117947816848755, + "learning_rate": 4.668976464866332e-05, + "loss": 4.6009, + "step": 27857 + }, + { + "epoch": 0.16567941764202113, + "grad_norm": 1.6272234916687012, + "learning_rate": 4.6689532366503846e-05, + "loss": 4.8592, + "step": 27858 + }, + { + "epoch": 0.16568536492530211, + "grad_norm": 1.9852404594421387, + "learning_rate": 4.6689300076772776e-05, + "loss": 4.363, + "step": 27859 + }, + { + "epoch": 0.16569131220858313, + "grad_norm": 1.6235220432281494, + "learning_rate": 4.6689067779470194e-05, + "loss": 4.6625, + "step": 27860 + }, + { + "epoch": 0.16569725949186412, + "grad_norm": 1.7212275266647339, + "learning_rate": 4.668883547459618e-05, + "loss": 4.7013, + "step": 27861 + }, + { + "epoch": 0.1657032067751451, + "grad_norm": 2.5496368408203125, + "learning_rate": 4.6688603162150824e-05, + "loss": 4.0435, + "step": 27862 + }, + { + "epoch": 0.16570915405842612, + "grad_norm": 2.681366443634033, + "learning_rate": 4.66883708421342e-05, + "loss": 4.4567, + "step": 27863 + }, + { + "epoch": 0.1657151013417071, + "grad_norm": 2.2227134704589844, + "learning_rate": 4.668813851454639e-05, + "loss": 4.5467, + "step": 27864 + }, + { + "epoch": 0.1657210486249881, + "grad_norm": 2.413037061691284, + "learning_rate": 4.668790617938748e-05, + "loss": 4.1955, + "step": 27865 + }, + { + "epoch": 0.16572699590826911, + "grad_norm": 2.749058723449707, + "learning_rate": 4.668767383665753e-05, + "loss": 4.1209, + "step": 27866 + }, + { + "epoch": 0.1657329431915501, + "grad_norm": 2.075108528137207, + "learning_rate": 4.668744148635665e-05, + "loss": 4.2322, + "step": 27867 + }, + { + "epoch": 0.1657388904748311, + "grad_norm": 1.7476239204406738, + "learning_rate": 4.66872091284849e-05, + "loss": 4.7075, + "step": 27868 + }, + { + "epoch": 0.1657448377581121, + "grad_norm": 1.7722108364105225, + "learning_rate": 4.6686976763042376e-05, + "loss": 4.7211, + "step": 27869 + }, + { + "epoch": 0.1657507850413931, + "grad_norm": 1.57614266872406, + "learning_rate": 4.668674439002915e-05, + "loss": 4.8495, + "step": 27870 + }, + { + "epoch": 0.16575673232467408, + "grad_norm": 1.5763459205627441, + "learning_rate": 4.6686512009445306e-05, + "loss": 5.1311, + "step": 27871 + }, + { + "epoch": 0.1657626796079551, + "grad_norm": 1.5253850221633911, + "learning_rate": 4.6686279621290925e-05, + "loss": 5.3513, + "step": 27872 + }, + { + "epoch": 0.1657686268912361, + "grad_norm": 1.8837103843688965, + "learning_rate": 4.668604722556609e-05, + "loss": 4.9349, + "step": 27873 + }, + { + "epoch": 0.16577457417451708, + "grad_norm": 1.7190310955047607, + "learning_rate": 4.668581482227087e-05, + "loss": 5.4962, + "step": 27874 + }, + { + "epoch": 0.1657805214577981, + "grad_norm": 1.6501142978668213, + "learning_rate": 4.668558241140537e-05, + "loss": 5.0092, + "step": 27875 + }, + { + "epoch": 0.16578646874107908, + "grad_norm": 2.03367018699646, + "learning_rate": 4.668534999296965e-05, + "loss": 5.2323, + "step": 27876 + }, + { + "epoch": 0.16579241602436007, + "grad_norm": 2.455427885055542, + "learning_rate": 4.66851175669638e-05, + "loss": 4.2927, + "step": 27877 + }, + { + "epoch": 0.16579836330764108, + "grad_norm": 2.443146228790283, + "learning_rate": 4.668488513338789e-05, + "loss": 4.3029, + "step": 27878 + }, + { + "epoch": 0.16580431059092207, + "grad_norm": 2.656646251678467, + "learning_rate": 4.6684652692242026e-05, + "loss": 4.2249, + "step": 27879 + }, + { + "epoch": 0.16581025787420306, + "grad_norm": 2.4562222957611084, + "learning_rate": 4.668442024352626e-05, + "loss": 4.5162, + "step": 27880 + }, + { + "epoch": 0.16581620515748405, + "grad_norm": 2.8980703353881836, + "learning_rate": 4.6684187787240695e-05, + "loss": 4.1083, + "step": 27881 + }, + { + "epoch": 0.16582215244076506, + "grad_norm": 2.5985610485076904, + "learning_rate": 4.668395532338541e-05, + "loss": 4.1557, + "step": 27882 + }, + { + "epoch": 0.16582809972404605, + "grad_norm": 2.4054651260375977, + "learning_rate": 4.6683722851960465e-05, + "loss": 4.2334, + "step": 27883 + }, + { + "epoch": 0.16583404700732704, + "grad_norm": 2.0977237224578857, + "learning_rate": 4.668349037296597e-05, + "loss": 4.5715, + "step": 27884 + }, + { + "epoch": 0.16583999429060806, + "grad_norm": 2.0701677799224854, + "learning_rate": 4.6683257886401985e-05, + "loss": 4.7195, + "step": 27885 + }, + { + "epoch": 0.16584594157388904, + "grad_norm": 1.9294004440307617, + "learning_rate": 4.6683025392268597e-05, + "loss": 4.6521, + "step": 27886 + }, + { + "epoch": 0.16585188885717003, + "grad_norm": 2.1713595390319824, + "learning_rate": 4.66827928905659e-05, + "loss": 4.7052, + "step": 27887 + }, + { + "epoch": 0.16585783614045105, + "grad_norm": 2.835434913635254, + "learning_rate": 4.668256038129395e-05, + "loss": 4.4006, + "step": 27888 + }, + { + "epoch": 0.16586378342373204, + "grad_norm": 2.466986894607544, + "learning_rate": 4.668232786445285e-05, + "loss": 4.3107, + "step": 27889 + }, + { + "epoch": 0.16586973070701302, + "grad_norm": 1.7013013362884521, + "learning_rate": 4.6682095340042675e-05, + "loss": 4.4813, + "step": 27890 + }, + { + "epoch": 0.16587567799029404, + "grad_norm": 1.7486096620559692, + "learning_rate": 4.66818628080635e-05, + "loss": 4.6227, + "step": 27891 + }, + { + "epoch": 0.16588162527357503, + "grad_norm": 1.6579736471176147, + "learning_rate": 4.6681630268515407e-05, + "loss": 4.7124, + "step": 27892 + }, + { + "epoch": 0.16588757255685602, + "grad_norm": 1.5885511636734009, + "learning_rate": 4.668139772139849e-05, + "loss": 4.6244, + "step": 27893 + }, + { + "epoch": 0.16589351984013703, + "grad_norm": 1.5703203678131104, + "learning_rate": 4.668116516671282e-05, + "loss": 4.7233, + "step": 27894 + }, + { + "epoch": 0.16589946712341802, + "grad_norm": 1.6852905750274658, + "learning_rate": 4.668093260445847e-05, + "loss": 5.0091, + "step": 27895 + }, + { + "epoch": 0.165905414406699, + "grad_norm": 1.7425652742385864, + "learning_rate": 4.668070003463554e-05, + "loss": 5.0271, + "step": 27896 + }, + { + "epoch": 0.16591136168998002, + "grad_norm": 1.7271431684494019, + "learning_rate": 4.6680467457244104e-05, + "loss": 4.525, + "step": 27897 + }, + { + "epoch": 0.165917308973261, + "grad_norm": 1.8759088516235352, + "learning_rate": 4.668023487228423e-05, + "loss": 4.4729, + "step": 27898 + }, + { + "epoch": 0.165923256256542, + "grad_norm": 1.5073447227478027, + "learning_rate": 4.668000227975602e-05, + "loss": 4.8768, + "step": 27899 + }, + { + "epoch": 0.16592920353982302, + "grad_norm": 1.3689100742340088, + "learning_rate": 4.667976967965954e-05, + "loss": 5.1424, + "step": 27900 + }, + { + "epoch": 0.165935150823104, + "grad_norm": 1.7475918531417847, + "learning_rate": 4.6679537071994874e-05, + "loss": 4.7103, + "step": 27901 + }, + { + "epoch": 0.165941098106385, + "grad_norm": 1.5559403896331787, + "learning_rate": 4.6679304456762107e-05, + "loss": 5.0524, + "step": 27902 + }, + { + "epoch": 0.165947045389666, + "grad_norm": 1.7627094984054565, + "learning_rate": 4.667907183396132e-05, + "loss": 4.9901, + "step": 27903 + }, + { + "epoch": 0.165952992672947, + "grad_norm": 1.8173136711120605, + "learning_rate": 4.667883920359259e-05, + "loss": 4.6419, + "step": 27904 + }, + { + "epoch": 0.16595893995622799, + "grad_norm": 2.0207037925720215, + "learning_rate": 4.667860656565601e-05, + "loss": 5.2537, + "step": 27905 + }, + { + "epoch": 0.165964887239509, + "grad_norm": 1.6715987920761108, + "learning_rate": 4.6678373920151646e-05, + "loss": 5.0337, + "step": 27906 + }, + { + "epoch": 0.16597083452279, + "grad_norm": 1.6425293684005737, + "learning_rate": 4.667814126707959e-05, + "loss": 5.0065, + "step": 27907 + }, + { + "epoch": 0.16597678180607098, + "grad_norm": 1.8118547201156616, + "learning_rate": 4.667790860643991e-05, + "loss": 4.9293, + "step": 27908 + }, + { + "epoch": 0.165982729089352, + "grad_norm": 1.5994832515716553, + "learning_rate": 4.66776759382327e-05, + "loss": 5.2379, + "step": 27909 + }, + { + "epoch": 0.16598867637263298, + "grad_norm": 1.6475836038589478, + "learning_rate": 4.667744326245804e-05, + "loss": 5.4609, + "step": 27910 + }, + { + "epoch": 0.16599462365591397, + "grad_norm": 1.4168953895568848, + "learning_rate": 4.6677210579116e-05, + "loss": 5.5907, + "step": 27911 + }, + { + "epoch": 0.16600057093919499, + "grad_norm": 1.46638822555542, + "learning_rate": 4.667697788820669e-05, + "loss": 5.4639, + "step": 27912 + }, + { + "epoch": 0.16600651822247597, + "grad_norm": 1.6889718770980835, + "learning_rate": 4.667674518973015e-05, + "loss": 5.4013, + "step": 27913 + }, + { + "epoch": 0.16601246550575696, + "grad_norm": 1.8182064294815063, + "learning_rate": 4.6676512483686495e-05, + "loss": 4.7796, + "step": 27914 + }, + { + "epoch": 0.16601841278903798, + "grad_norm": 1.6663529872894287, + "learning_rate": 4.6676279770075784e-05, + "loss": 4.8987, + "step": 27915 + }, + { + "epoch": 0.16602436007231897, + "grad_norm": 1.762170672416687, + "learning_rate": 4.6676047048898115e-05, + "loss": 4.8513, + "step": 27916 + }, + { + "epoch": 0.16603030735559995, + "grad_norm": 1.6480133533477783, + "learning_rate": 4.6675814320153554e-05, + "loss": 4.7579, + "step": 27917 + }, + { + "epoch": 0.16603625463888097, + "grad_norm": 1.698567271232605, + "learning_rate": 4.66755815838422e-05, + "loss": 4.8489, + "step": 27918 + }, + { + "epoch": 0.16604220192216196, + "grad_norm": 1.5158538818359375, + "learning_rate": 4.667534883996412e-05, + "loss": 4.878, + "step": 27919 + }, + { + "epoch": 0.16604814920544295, + "grad_norm": 2.1120738983154297, + "learning_rate": 4.66751160885194e-05, + "loss": 4.8203, + "step": 27920 + }, + { + "epoch": 0.16605409648872396, + "grad_norm": 1.8523337841033936, + "learning_rate": 4.667488332950812e-05, + "loss": 4.79, + "step": 27921 + }, + { + "epoch": 0.16606004377200495, + "grad_norm": 1.9057866334915161, + "learning_rate": 4.6674650562930364e-05, + "loss": 4.55, + "step": 27922 + }, + { + "epoch": 0.16606599105528594, + "grad_norm": 1.690329670906067, + "learning_rate": 4.6674417788786206e-05, + "loss": 4.8434, + "step": 27923 + }, + { + "epoch": 0.16607193833856695, + "grad_norm": 1.796695590019226, + "learning_rate": 4.667418500707574e-05, + "loss": 4.8883, + "step": 27924 + }, + { + "epoch": 0.16607788562184794, + "grad_norm": 1.9424879550933838, + "learning_rate": 4.6673952217799035e-05, + "loss": 4.2556, + "step": 27925 + }, + { + "epoch": 0.16608383290512893, + "grad_norm": 1.886226773262024, + "learning_rate": 4.6673719420956176e-05, + "loss": 4.5911, + "step": 27926 + }, + { + "epoch": 0.16608978018840995, + "grad_norm": 2.1246280670166016, + "learning_rate": 4.6673486616547254e-05, + "loss": 4.5398, + "step": 27927 + }, + { + "epoch": 0.16609572747169093, + "grad_norm": 2.219155788421631, + "learning_rate": 4.667325380457233e-05, + "loss": 4.6747, + "step": 27928 + }, + { + "epoch": 0.16610167475497192, + "grad_norm": 2.0169975757598877, + "learning_rate": 4.66730209850315e-05, + "loss": 4.7622, + "step": 27929 + }, + { + "epoch": 0.16610762203825294, + "grad_norm": 1.884619116783142, + "learning_rate": 4.667278815792485e-05, + "loss": 5.0192, + "step": 27930 + }, + { + "epoch": 0.16611356932153393, + "grad_norm": 1.8539994955062866, + "learning_rate": 4.6672555323252446e-05, + "loss": 4.2732, + "step": 27931 + }, + { + "epoch": 0.16611951660481492, + "grad_norm": 2.045879364013672, + "learning_rate": 4.667232248101439e-05, + "loss": 3.8245, + "step": 27932 + }, + { + "epoch": 0.16612546388809593, + "grad_norm": 2.005019426345825, + "learning_rate": 4.667208963121073e-05, + "loss": 3.9687, + "step": 27933 + }, + { + "epoch": 0.16613141117137692, + "grad_norm": 1.7998180389404297, + "learning_rate": 4.667185677384158e-05, + "loss": 3.84, + "step": 27934 + }, + { + "epoch": 0.1661373584546579, + "grad_norm": 1.9813350439071655, + "learning_rate": 4.6671623908907e-05, + "loss": 3.7387, + "step": 27935 + }, + { + "epoch": 0.16614330573793892, + "grad_norm": 1.9212778806686401, + "learning_rate": 4.6671391036407086e-05, + "loss": 3.48, + "step": 27936 + }, + { + "epoch": 0.1661492530212199, + "grad_norm": 1.9081000089645386, + "learning_rate": 4.667115815634192e-05, + "loss": 3.4218, + "step": 27937 + }, + { + "epoch": 0.1661552003045009, + "grad_norm": 2.209960699081421, + "learning_rate": 4.667092526871156e-05, + "loss": 3.7272, + "step": 27938 + }, + { + "epoch": 0.1661611475877819, + "grad_norm": 2.3802664279937744, + "learning_rate": 4.6670692373516124e-05, + "loss": 3.6476, + "step": 27939 + }, + { + "epoch": 0.1661670948710629, + "grad_norm": 2.359929323196411, + "learning_rate": 4.667045947075566e-05, + "loss": 3.7406, + "step": 27940 + }, + { + "epoch": 0.1661730421543439, + "grad_norm": 2.242333173751831, + "learning_rate": 4.6670226560430266e-05, + "loss": 3.8315, + "step": 27941 + }, + { + "epoch": 0.16617898943762488, + "grad_norm": 1.7727068662643433, + "learning_rate": 4.6669993642540017e-05, + "loss": 4.6083, + "step": 27942 + }, + { + "epoch": 0.1661849367209059, + "grad_norm": 2.2704246044158936, + "learning_rate": 4.6669760717085e-05, + "loss": 3.947, + "step": 27943 + }, + { + "epoch": 0.16619088400418688, + "grad_norm": 2.550279140472412, + "learning_rate": 4.6669527784065295e-05, + "loss": 3.5335, + "step": 27944 + }, + { + "epoch": 0.16619683128746787, + "grad_norm": 2.455237627029419, + "learning_rate": 4.666929484348097e-05, + "loss": 3.5817, + "step": 27945 + }, + { + "epoch": 0.1662027785707489, + "grad_norm": 1.9026764631271362, + "learning_rate": 4.666906189533213e-05, + "loss": 3.4742, + "step": 27946 + }, + { + "epoch": 0.16620872585402988, + "grad_norm": 1.9334417581558228, + "learning_rate": 4.6668828939618845e-05, + "loss": 3.3938, + "step": 27947 + }, + { + "epoch": 0.16621467313731086, + "grad_norm": 1.9052705764770508, + "learning_rate": 4.666859597634119e-05, + "loss": 4.0506, + "step": 27948 + }, + { + "epoch": 0.16622062042059188, + "grad_norm": 1.702767252922058, + "learning_rate": 4.666836300549926e-05, + "loss": 5.1613, + "step": 27949 + }, + { + "epoch": 0.16622656770387287, + "grad_norm": 2.1399359703063965, + "learning_rate": 4.666813002709312e-05, + "loss": 4.9766, + "step": 27950 + }, + { + "epoch": 0.16623251498715386, + "grad_norm": 2.493435859680176, + "learning_rate": 4.666789704112286e-05, + "loss": 4.2058, + "step": 27951 + }, + { + "epoch": 0.16623846227043487, + "grad_norm": 2.689168930053711, + "learning_rate": 4.666766404758857e-05, + "loss": 3.7151, + "step": 27952 + }, + { + "epoch": 0.16624440955371586, + "grad_norm": 2.172666311264038, + "learning_rate": 4.666743104649031e-05, + "loss": 3.6916, + "step": 27953 + }, + { + "epoch": 0.16625035683699685, + "grad_norm": 1.551274299621582, + "learning_rate": 4.6667198037828173e-05, + "loss": 4.9331, + "step": 27954 + }, + { + "epoch": 0.16625630412027786, + "grad_norm": 1.7849092483520508, + "learning_rate": 4.666696502160226e-05, + "loss": 5.288, + "step": 27955 + }, + { + "epoch": 0.16626225140355885, + "grad_norm": 1.8850775957107544, + "learning_rate": 4.6666731997812614e-05, + "loss": 5.1946, + "step": 27956 + }, + { + "epoch": 0.16626819868683984, + "grad_norm": 1.4710248708724976, + "learning_rate": 4.666649896645934e-05, + "loss": 5.2753, + "step": 27957 + }, + { + "epoch": 0.16627414597012086, + "grad_norm": 1.4987525939941406, + "learning_rate": 4.6666265927542516e-05, + "loss": 5.3751, + "step": 27958 + }, + { + "epoch": 0.16628009325340184, + "grad_norm": 1.5894343852996826, + "learning_rate": 4.666603288106223e-05, + "loss": 5.1087, + "step": 27959 + }, + { + "epoch": 0.16628604053668283, + "grad_norm": 1.491098165512085, + "learning_rate": 4.666579982701855e-05, + "loss": 5.1876, + "step": 27960 + }, + { + "epoch": 0.16629198781996385, + "grad_norm": 1.6574211120605469, + "learning_rate": 4.666556676541156e-05, + "loss": 5.1677, + "step": 27961 + }, + { + "epoch": 0.16629793510324484, + "grad_norm": 1.3962849378585815, + "learning_rate": 4.666533369624135e-05, + "loss": 4.6312, + "step": 27962 + }, + { + "epoch": 0.16630388238652583, + "grad_norm": 1.3819752931594849, + "learning_rate": 4.6665100619507986e-05, + "loss": 5.1794, + "step": 27963 + }, + { + "epoch": 0.16630982966980684, + "grad_norm": 1.392821192741394, + "learning_rate": 4.666486753521157e-05, + "loss": 5.192, + "step": 27964 + }, + { + "epoch": 0.16631577695308783, + "grad_norm": 1.3655375242233276, + "learning_rate": 4.6664634443352176e-05, + "loss": 5.0533, + "step": 27965 + }, + { + "epoch": 0.16632172423636882, + "grad_norm": 1.7046358585357666, + "learning_rate": 4.6664401343929864e-05, + "loss": 4.7244, + "step": 27966 + }, + { + "epoch": 0.16632767151964983, + "grad_norm": 1.8924365043640137, + "learning_rate": 4.6664168236944747e-05, + "loss": 4.8182, + "step": 27967 + }, + { + "epoch": 0.16633361880293082, + "grad_norm": 1.7032650709152222, + "learning_rate": 4.666393512239689e-05, + "loss": 4.6594, + "step": 27968 + }, + { + "epoch": 0.1663395660862118, + "grad_norm": 2.0425281524658203, + "learning_rate": 4.666370200028638e-05, + "loss": 4.0096, + "step": 27969 + }, + { + "epoch": 0.16634551336949283, + "grad_norm": 2.4013113975524902, + "learning_rate": 4.666346887061329e-05, + "loss": 3.7662, + "step": 27970 + }, + { + "epoch": 0.1663514606527738, + "grad_norm": 1.8698662519454956, + "learning_rate": 4.666323573337771e-05, + "loss": 4.2575, + "step": 27971 + }, + { + "epoch": 0.1663574079360548, + "grad_norm": 1.5415421724319458, + "learning_rate": 4.666300258857972e-05, + "loss": 4.739, + "step": 27972 + }, + { + "epoch": 0.16636335521933582, + "grad_norm": 1.79619562625885, + "learning_rate": 4.666276943621939e-05, + "loss": 4.7542, + "step": 27973 + }, + { + "epoch": 0.1663693025026168, + "grad_norm": 1.5592199563980103, + "learning_rate": 4.666253627629682e-05, + "loss": 4.5968, + "step": 27974 + }, + { + "epoch": 0.1663752497858978, + "grad_norm": 1.7237550020217896, + "learning_rate": 4.666230310881208e-05, + "loss": 4.6581, + "step": 27975 + }, + { + "epoch": 0.1663811970691788, + "grad_norm": 1.6247119903564453, + "learning_rate": 4.6662069933765255e-05, + "loss": 4.6803, + "step": 27976 + }, + { + "epoch": 0.1663871443524598, + "grad_norm": 1.6257696151733398, + "learning_rate": 4.666183675115643e-05, + "loss": 4.7591, + "step": 27977 + }, + { + "epoch": 0.1663930916357408, + "grad_norm": 1.6353588104248047, + "learning_rate": 4.666160356098567e-05, + "loss": 4.3821, + "step": 27978 + }, + { + "epoch": 0.1663990389190218, + "grad_norm": 1.7179335355758667, + "learning_rate": 4.666137036325308e-05, + "loss": 4.6386, + "step": 27979 + }, + { + "epoch": 0.1664049862023028, + "grad_norm": 1.6724573373794556, + "learning_rate": 4.6661137157958716e-05, + "loss": 4.596, + "step": 27980 + }, + { + "epoch": 0.16641093348558378, + "grad_norm": 1.8331623077392578, + "learning_rate": 4.666090394510269e-05, + "loss": 4.6706, + "step": 27981 + }, + { + "epoch": 0.1664168807688648, + "grad_norm": 1.5815516710281372, + "learning_rate": 4.666067072468505e-05, + "loss": 4.5764, + "step": 27982 + }, + { + "epoch": 0.16642282805214578, + "grad_norm": 1.6047725677490234, + "learning_rate": 4.66604374967059e-05, + "loss": 4.4228, + "step": 27983 + }, + { + "epoch": 0.16642877533542677, + "grad_norm": 2.057325601577759, + "learning_rate": 4.666020426116531e-05, + "loss": 3.886, + "step": 27984 + }, + { + "epoch": 0.1664347226187078, + "grad_norm": 2.2633588314056396, + "learning_rate": 4.6659971018063375e-05, + "loss": 4.2796, + "step": 27985 + }, + { + "epoch": 0.16644066990198877, + "grad_norm": 1.9848732948303223, + "learning_rate": 4.6659737767400166e-05, + "loss": 4.4349, + "step": 27986 + }, + { + "epoch": 0.16644661718526976, + "grad_norm": 1.8116247653961182, + "learning_rate": 4.6659504509175764e-05, + "loss": 4.5313, + "step": 27987 + }, + { + "epoch": 0.16645256446855078, + "grad_norm": 1.8909553289413452, + "learning_rate": 4.665927124339026e-05, + "loss": 4.5166, + "step": 27988 + }, + { + "epoch": 0.16645851175183177, + "grad_norm": 1.6827013492584229, + "learning_rate": 4.665903797004371e-05, + "loss": 4.7353, + "step": 27989 + }, + { + "epoch": 0.16646445903511276, + "grad_norm": 1.8081045150756836, + "learning_rate": 4.6658804689136227e-05, + "loss": 4.743, + "step": 27990 + }, + { + "epoch": 0.16647040631839377, + "grad_norm": 1.7859995365142822, + "learning_rate": 4.665857140066788e-05, + "loss": 4.6476, + "step": 27991 + }, + { + "epoch": 0.16647635360167476, + "grad_norm": 2.158141613006592, + "learning_rate": 4.665833810463874e-05, + "loss": 4.1541, + "step": 27992 + }, + { + "epoch": 0.16648230088495575, + "grad_norm": 2.059534788131714, + "learning_rate": 4.665810480104891e-05, + "loss": 4.2993, + "step": 27993 + }, + { + "epoch": 0.16648824816823676, + "grad_norm": 2.0945677757263184, + "learning_rate": 4.665787148989845e-05, + "loss": 4.5941, + "step": 27994 + }, + { + "epoch": 0.16649419545151775, + "grad_norm": 1.8792952299118042, + "learning_rate": 4.6657638171187455e-05, + "loss": 4.5735, + "step": 27995 + }, + { + "epoch": 0.16650014273479874, + "grad_norm": 1.7018059492111206, + "learning_rate": 4.665740484491601e-05, + "loss": 4.6591, + "step": 27996 + }, + { + "epoch": 0.16650609001807973, + "grad_norm": 1.6992706060409546, + "learning_rate": 4.6657171511084175e-05, + "loss": 4.512, + "step": 27997 + }, + { + "epoch": 0.16651203730136074, + "grad_norm": 1.7492562532424927, + "learning_rate": 4.6656938169692054e-05, + "loss": 4.6722, + "step": 27998 + }, + { + "epoch": 0.16651798458464173, + "grad_norm": 1.6457120180130005, + "learning_rate": 4.665670482073972e-05, + "loss": 4.5632, + "step": 27999 + }, + { + "epoch": 0.16652393186792272, + "grad_norm": 1.9052523374557495, + "learning_rate": 4.6656471464227246e-05, + "loss": 4.5678, + "step": 28000 + }, + { + "epoch": 0.16652987915120374, + "grad_norm": 1.7932218313217163, + "learning_rate": 4.665623810015473e-05, + "loss": 4.5433, + "step": 28001 + }, + { + "epoch": 0.16653582643448472, + "grad_norm": 1.7252825498580933, + "learning_rate": 4.665600472852224e-05, + "loss": 4.3902, + "step": 28002 + }, + { + "epoch": 0.1665417737177657, + "grad_norm": 1.810628890991211, + "learning_rate": 4.665577134932986e-05, + "loss": 4.242, + "step": 28003 + }, + { + "epoch": 0.16654772100104673, + "grad_norm": 1.7332589626312256, + "learning_rate": 4.6655537962577676e-05, + "loss": 4.2713, + "step": 28004 + }, + { + "epoch": 0.16655366828432772, + "grad_norm": 1.720533847808838, + "learning_rate": 4.6655304568265776e-05, + "loss": 4.3828, + "step": 28005 + }, + { + "epoch": 0.1665596155676087, + "grad_norm": 1.680240511894226, + "learning_rate": 4.665507116639423e-05, + "loss": 4.4578, + "step": 28006 + }, + { + "epoch": 0.16656556285088972, + "grad_norm": 1.6451648473739624, + "learning_rate": 4.665483775696311e-05, + "loss": 4.4493, + "step": 28007 + }, + { + "epoch": 0.1665715101341707, + "grad_norm": 1.8150712251663208, + "learning_rate": 4.665460433997252e-05, + "loss": 4.353, + "step": 28008 + }, + { + "epoch": 0.1665774574174517, + "grad_norm": 1.6858443021774292, + "learning_rate": 4.665437091542253e-05, + "loss": 4.2929, + "step": 28009 + }, + { + "epoch": 0.1665834047007327, + "grad_norm": 1.7269021272659302, + "learning_rate": 4.665413748331322e-05, + "loss": 4.2856, + "step": 28010 + }, + { + "epoch": 0.1665893519840137, + "grad_norm": 1.6517678499221802, + "learning_rate": 4.665390404364468e-05, + "loss": 4.977, + "step": 28011 + }, + { + "epoch": 0.1665952992672947, + "grad_norm": 1.8300232887268066, + "learning_rate": 4.665367059641698e-05, + "loss": 4.3829, + "step": 28012 + }, + { + "epoch": 0.1666012465505757, + "grad_norm": 1.7685927152633667, + "learning_rate": 4.6653437141630215e-05, + "loss": 4.3178, + "step": 28013 + }, + { + "epoch": 0.1666071938338567, + "grad_norm": 1.944615125656128, + "learning_rate": 4.665320367928445e-05, + "loss": 4.2248, + "step": 28014 + }, + { + "epoch": 0.16661314111713768, + "grad_norm": 2.097490072250366, + "learning_rate": 4.6652970209379775e-05, + "loss": 4.2814, + "step": 28015 + }, + { + "epoch": 0.1666190884004187, + "grad_norm": 1.5824095010757446, + "learning_rate": 4.665273673191628e-05, + "loss": 4.2074, + "step": 28016 + }, + { + "epoch": 0.16662503568369968, + "grad_norm": 1.6979020833969116, + "learning_rate": 4.665250324689403e-05, + "loss": 4.3534, + "step": 28017 + }, + { + "epoch": 0.16663098296698067, + "grad_norm": 1.7754404544830322, + "learning_rate": 4.6652269754313125e-05, + "loss": 4.3066, + "step": 28018 + }, + { + "epoch": 0.1666369302502617, + "grad_norm": 1.8645826578140259, + "learning_rate": 4.665203625417363e-05, + "loss": 4.1896, + "step": 28019 + }, + { + "epoch": 0.16664287753354268, + "grad_norm": 1.8967339992523193, + "learning_rate": 4.6651802746475633e-05, + "loss": 4.4092, + "step": 28020 + }, + { + "epoch": 0.16664882481682367, + "grad_norm": 1.76931893825531, + "learning_rate": 4.665156923121922e-05, + "loss": 4.5632, + "step": 28021 + }, + { + "epoch": 0.16665477210010468, + "grad_norm": 2.338927745819092, + "learning_rate": 4.665133570840446e-05, + "loss": 4.2858, + "step": 28022 + }, + { + "epoch": 0.16666071938338567, + "grad_norm": 1.747149109840393, + "learning_rate": 4.665110217803145e-05, + "loss": 4.6505, + "step": 28023 + }, + { + "epoch": 0.16666666666666666, + "grad_norm": 2.8555750846862793, + "learning_rate": 4.6650868640100254e-05, + "loss": 4.4681, + "step": 28024 + }, + { + "epoch": 0.16667261394994767, + "grad_norm": 2.284841299057007, + "learning_rate": 4.665063509461097e-05, + "loss": 3.9607, + "step": 28025 + }, + { + "epoch": 0.16667856123322866, + "grad_norm": 2.51346492767334, + "learning_rate": 4.6650401541563673e-05, + "loss": 3.8373, + "step": 28026 + }, + { + "epoch": 0.16668450851650965, + "grad_norm": 2.33309006690979, + "learning_rate": 4.6650167980958444e-05, + "loss": 3.8783, + "step": 28027 + }, + { + "epoch": 0.16669045579979067, + "grad_norm": 1.886756420135498, + "learning_rate": 4.664993441279536e-05, + "loss": 4.4607, + "step": 28028 + }, + { + "epoch": 0.16669640308307165, + "grad_norm": 1.6356357336044312, + "learning_rate": 4.664970083707452e-05, + "loss": 4.2901, + "step": 28029 + }, + { + "epoch": 0.16670235036635264, + "grad_norm": 2.366969108581543, + "learning_rate": 4.664946725379598e-05, + "loss": 3.7561, + "step": 28030 + }, + { + "epoch": 0.16670829764963366, + "grad_norm": 2.387471914291382, + "learning_rate": 4.664923366295984e-05, + "loss": 3.745, + "step": 28031 + }, + { + "epoch": 0.16671424493291465, + "grad_norm": 2.0741076469421387, + "learning_rate": 4.664900006456617e-05, + "loss": 3.6031, + "step": 28032 + }, + { + "epoch": 0.16672019221619563, + "grad_norm": 2.5359690189361572, + "learning_rate": 4.6648766458615066e-05, + "loss": 3.8495, + "step": 28033 + }, + { + "epoch": 0.16672613949947665, + "grad_norm": 2.423198938369751, + "learning_rate": 4.6648532845106604e-05, + "loss": 3.5664, + "step": 28034 + }, + { + "epoch": 0.16673208678275764, + "grad_norm": 1.7040067911148071, + "learning_rate": 4.664829922404086e-05, + "loss": 4.4474, + "step": 28035 + }, + { + "epoch": 0.16673803406603863, + "grad_norm": 1.8603174686431885, + "learning_rate": 4.664806559541791e-05, + "loss": 4.7263, + "step": 28036 + }, + { + "epoch": 0.16674398134931964, + "grad_norm": 1.7510238885879517, + "learning_rate": 4.664783195923785e-05, + "loss": 4.5566, + "step": 28037 + }, + { + "epoch": 0.16674992863260063, + "grad_norm": 1.6786305904388428, + "learning_rate": 4.6647598315500764e-05, + "loss": 4.5139, + "step": 28038 + }, + { + "epoch": 0.16675587591588162, + "grad_norm": 1.7382848262786865, + "learning_rate": 4.664736466420671e-05, + "loss": 4.5474, + "step": 28039 + }, + { + "epoch": 0.16676182319916263, + "grad_norm": 2.1977128982543945, + "learning_rate": 4.664713100535579e-05, + "loss": 3.8657, + "step": 28040 + }, + { + "epoch": 0.16676777048244362, + "grad_norm": 2.147538185119629, + "learning_rate": 4.664689733894808e-05, + "loss": 4.3258, + "step": 28041 + }, + { + "epoch": 0.1667737177657246, + "grad_norm": 1.6165980100631714, + "learning_rate": 4.6646663664983667e-05, + "loss": 5.6764, + "step": 28042 + }, + { + "epoch": 0.16677966504900563, + "grad_norm": 1.5513676404953003, + "learning_rate": 4.664642998346263e-05, + "loss": 5.286, + "step": 28043 + }, + { + "epoch": 0.16678561233228661, + "grad_norm": 2.4869754314422607, + "learning_rate": 4.664619629438504e-05, + "loss": 3.9925, + "step": 28044 + }, + { + "epoch": 0.1667915596155676, + "grad_norm": 1.9613736867904663, + "learning_rate": 4.6645962597750985e-05, + "loss": 4.3125, + "step": 28045 + }, + { + "epoch": 0.16679750689884862, + "grad_norm": 2.345310688018799, + "learning_rate": 4.664572889356055e-05, + "loss": 4.5785, + "step": 28046 + }, + { + "epoch": 0.1668034541821296, + "grad_norm": 1.6253316402435303, + "learning_rate": 4.664549518181382e-05, + "loss": 4.9924, + "step": 28047 + }, + { + "epoch": 0.1668094014654106, + "grad_norm": 1.7358524799346924, + "learning_rate": 4.664526146251087e-05, + "loss": 4.7523, + "step": 28048 + }, + { + "epoch": 0.1668153487486916, + "grad_norm": 1.6783114671707153, + "learning_rate": 4.664502773565178e-05, + "loss": 5.1525, + "step": 28049 + }, + { + "epoch": 0.1668212960319726, + "grad_norm": 1.102388858795166, + "learning_rate": 4.664479400123663e-05, + "loss": 5.5695, + "step": 28050 + }, + { + "epoch": 0.1668272433152536, + "grad_norm": 1.6548655033111572, + "learning_rate": 4.664456025926551e-05, + "loss": 4.7331, + "step": 28051 + }, + { + "epoch": 0.1668331905985346, + "grad_norm": 1.6468528509140015, + "learning_rate": 4.66443265097385e-05, + "loss": 4.7818, + "step": 28052 + }, + { + "epoch": 0.1668391378818156, + "grad_norm": 1.623849630355835, + "learning_rate": 4.664409275265568e-05, + "loss": 4.9336, + "step": 28053 + }, + { + "epoch": 0.16684508516509658, + "grad_norm": 1.4946188926696777, + "learning_rate": 4.664385898801713e-05, + "loss": 4.9361, + "step": 28054 + }, + { + "epoch": 0.16685103244837757, + "grad_norm": 1.6323179006576538, + "learning_rate": 4.664362521582293e-05, + "loss": 4.9529, + "step": 28055 + }, + { + "epoch": 0.16685697973165858, + "grad_norm": 1.579441785812378, + "learning_rate": 4.6643391436073165e-05, + "loss": 4.5593, + "step": 28056 + }, + { + "epoch": 0.16686292701493957, + "grad_norm": 1.578658103942871, + "learning_rate": 4.664315764876791e-05, + "loss": 4.5736, + "step": 28057 + }, + { + "epoch": 0.16686887429822056, + "grad_norm": 1.661720633506775, + "learning_rate": 4.664292385390726e-05, + "loss": 4.9137, + "step": 28058 + }, + { + "epoch": 0.16687482158150158, + "grad_norm": 1.9020450115203857, + "learning_rate": 4.664269005149129e-05, + "loss": 5.3975, + "step": 28059 + }, + { + "epoch": 0.16688076886478256, + "grad_norm": 2.0544557571411133, + "learning_rate": 4.664245624152007e-05, + "loss": 5.3485, + "step": 28060 + }, + { + "epoch": 0.16688671614806355, + "grad_norm": 1.8861839771270752, + "learning_rate": 4.664222242399371e-05, + "loss": 5.3917, + "step": 28061 + }, + { + "epoch": 0.16689266343134457, + "grad_norm": 1.7115676403045654, + "learning_rate": 4.6641988598912256e-05, + "loss": 5.3101, + "step": 28062 + }, + { + "epoch": 0.16689861071462556, + "grad_norm": 2.6457252502441406, + "learning_rate": 4.6641754766275815e-05, + "loss": 4.8323, + "step": 28063 + }, + { + "epoch": 0.16690455799790654, + "grad_norm": 1.9158306121826172, + "learning_rate": 4.664152092608446e-05, + "loss": 4.902, + "step": 28064 + }, + { + "epoch": 0.16691050528118756, + "grad_norm": 1.5592490434646606, + "learning_rate": 4.664128707833828e-05, + "loss": 4.85, + "step": 28065 + }, + { + "epoch": 0.16691645256446855, + "grad_norm": 1.8784046173095703, + "learning_rate": 4.664105322303734e-05, + "loss": 4.9118, + "step": 28066 + }, + { + "epoch": 0.16692239984774954, + "grad_norm": 1.8043493032455444, + "learning_rate": 4.6640819360181734e-05, + "loss": 4.8248, + "step": 28067 + }, + { + "epoch": 0.16692834713103055, + "grad_norm": 1.925399661064148, + "learning_rate": 4.664058548977155e-05, + "loss": 4.8808, + "step": 28068 + }, + { + "epoch": 0.16693429441431154, + "grad_norm": 2.1420938968658447, + "learning_rate": 4.664035161180686e-05, + "loss": 4.5251, + "step": 28069 + }, + { + "epoch": 0.16694024169759253, + "grad_norm": 1.3386578559875488, + "learning_rate": 4.664011772628774e-05, + "loss": 5.0788, + "step": 28070 + }, + { + "epoch": 0.16694618898087354, + "grad_norm": 1.7500650882720947, + "learning_rate": 4.663988383321427e-05, + "loss": 4.6332, + "step": 28071 + }, + { + "epoch": 0.16695213626415453, + "grad_norm": 1.6339102983474731, + "learning_rate": 4.6639649932586555e-05, + "loss": 4.9342, + "step": 28072 + }, + { + "epoch": 0.16695808354743552, + "grad_norm": 1.634045124053955, + "learning_rate": 4.6639416024404655e-05, + "loss": 4.8166, + "step": 28073 + }, + { + "epoch": 0.16696403083071654, + "grad_norm": 1.6168557405471802, + "learning_rate": 4.663918210866866e-05, + "loss": 4.9086, + "step": 28074 + }, + { + "epoch": 0.16696997811399752, + "grad_norm": 1.7027981281280518, + "learning_rate": 4.663894818537865e-05, + "loss": 4.7404, + "step": 28075 + }, + { + "epoch": 0.1669759253972785, + "grad_norm": 1.621127724647522, + "learning_rate": 4.663871425453471e-05, + "loss": 4.8458, + "step": 28076 + }, + { + "epoch": 0.16698187268055953, + "grad_norm": 1.524674415588379, + "learning_rate": 4.663848031613691e-05, + "loss": 4.8977, + "step": 28077 + }, + { + "epoch": 0.16698781996384052, + "grad_norm": 1.3619705438613892, + "learning_rate": 4.663824637018535e-05, + "loss": 4.6809, + "step": 28078 + }, + { + "epoch": 0.1669937672471215, + "grad_norm": 1.6202057600021362, + "learning_rate": 4.66380124166801e-05, + "loss": 4.2993, + "step": 28079 + }, + { + "epoch": 0.16699971453040252, + "grad_norm": 1.6400598287582397, + "learning_rate": 4.663777845562124e-05, + "loss": 4.498, + "step": 28080 + }, + { + "epoch": 0.1670056618136835, + "grad_norm": 1.6038521528244019, + "learning_rate": 4.663754448700885e-05, + "loss": 4.5864, + "step": 28081 + }, + { + "epoch": 0.1670116090969645, + "grad_norm": 1.6111528873443604, + "learning_rate": 4.663731051084303e-05, + "loss": 4.77, + "step": 28082 + }, + { + "epoch": 0.1670175563802455, + "grad_norm": 1.7978882789611816, + "learning_rate": 4.663707652712384e-05, + "loss": 4.2634, + "step": 28083 + }, + { + "epoch": 0.1670235036635265, + "grad_norm": 1.8533109426498413, + "learning_rate": 4.6636842535851374e-05, + "loss": 4.6601, + "step": 28084 + }, + { + "epoch": 0.1670294509468075, + "grad_norm": 1.776292324066162, + "learning_rate": 4.663660853702571e-05, + "loss": 4.4957, + "step": 28085 + }, + { + "epoch": 0.1670353982300885, + "grad_norm": 2.15081524848938, + "learning_rate": 4.663637453064692e-05, + "loss": 4.2726, + "step": 28086 + }, + { + "epoch": 0.1670413455133695, + "grad_norm": 1.4586591720581055, + "learning_rate": 4.6636140516715104e-05, + "loss": 5.4757, + "step": 28087 + }, + { + "epoch": 0.16704729279665048, + "grad_norm": 1.4819058179855347, + "learning_rate": 4.663590649523033e-05, + "loss": 5.6895, + "step": 28088 + }, + { + "epoch": 0.1670532400799315, + "grad_norm": 1.8194465637207031, + "learning_rate": 4.663567246619269e-05, + "loss": 4.5697, + "step": 28089 + }, + { + "epoch": 0.16705918736321249, + "grad_norm": 1.8187286853790283, + "learning_rate": 4.663543842960226e-05, + "loss": 4.4745, + "step": 28090 + }, + { + "epoch": 0.16706513464649347, + "grad_norm": 1.7815576791763306, + "learning_rate": 4.663520438545912e-05, + "loss": 4.7309, + "step": 28091 + }, + { + "epoch": 0.1670710819297745, + "grad_norm": 1.9799631834030151, + "learning_rate": 4.663497033376335e-05, + "loss": 4.5429, + "step": 28092 + }, + { + "epoch": 0.16707702921305548, + "grad_norm": 1.7019764184951782, + "learning_rate": 4.663473627451504e-05, + "loss": 4.402, + "step": 28093 + }, + { + "epoch": 0.16708297649633647, + "grad_norm": 1.9056285619735718, + "learning_rate": 4.663450220771427e-05, + "loss": 4.3428, + "step": 28094 + }, + { + "epoch": 0.16708892377961748, + "grad_norm": 1.877556562423706, + "learning_rate": 4.663426813336112e-05, + "loss": 4.4579, + "step": 28095 + }, + { + "epoch": 0.16709487106289847, + "grad_norm": 1.6415005922317505, + "learning_rate": 4.663403405145565e-05, + "loss": 5.1392, + "step": 28096 + }, + { + "epoch": 0.16710081834617946, + "grad_norm": 2.0315005779266357, + "learning_rate": 4.663379996199798e-05, + "loss": 4.5666, + "step": 28097 + }, + { + "epoch": 0.16710676562946047, + "grad_norm": 1.744367241859436, + "learning_rate": 4.663356586498817e-05, + "loss": 4.6629, + "step": 28098 + }, + { + "epoch": 0.16711271291274146, + "grad_norm": 1.8645330667495728, + "learning_rate": 4.663333176042631e-05, + "loss": 4.2716, + "step": 28099 + }, + { + "epoch": 0.16711866019602245, + "grad_norm": 1.6384168863296509, + "learning_rate": 4.6633097648312476e-05, + "loss": 4.1565, + "step": 28100 + }, + { + "epoch": 0.16712460747930347, + "grad_norm": 2.0455496311187744, + "learning_rate": 4.663286352864675e-05, + "loss": 4.3342, + "step": 28101 + }, + { + "epoch": 0.16713055476258445, + "grad_norm": 1.689454197883606, + "learning_rate": 4.663262940142921e-05, + "loss": 5.1503, + "step": 28102 + }, + { + "epoch": 0.16713650204586544, + "grad_norm": 1.7138323783874512, + "learning_rate": 4.663239526665995e-05, + "loss": 4.3616, + "step": 28103 + }, + { + "epoch": 0.16714244932914646, + "grad_norm": 2.171147584915161, + "learning_rate": 4.663216112433904e-05, + "loss": 4.3054, + "step": 28104 + }, + { + "epoch": 0.16714839661242745, + "grad_norm": 2.5418312549591064, + "learning_rate": 4.663192697446657e-05, + "loss": 3.387, + "step": 28105 + }, + { + "epoch": 0.16715434389570843, + "grad_norm": 1.5790460109710693, + "learning_rate": 4.6631692817042615e-05, + "loss": 5.2555, + "step": 28106 + }, + { + "epoch": 0.16716029117898945, + "grad_norm": 1.4285277128219604, + "learning_rate": 4.663145865206726e-05, + "loss": 5.2408, + "step": 28107 + }, + { + "epoch": 0.16716623846227044, + "grad_norm": 1.3292522430419922, + "learning_rate": 4.663122447954058e-05, + "loss": 5.1494, + "step": 28108 + }, + { + "epoch": 0.16717218574555143, + "grad_norm": 1.7032718658447266, + "learning_rate": 4.663099029946267e-05, + "loss": 4.7939, + "step": 28109 + }, + { + "epoch": 0.16717813302883244, + "grad_norm": 1.6049028635025024, + "learning_rate": 4.6630756111833605e-05, + "loss": 4.9406, + "step": 28110 + }, + { + "epoch": 0.16718408031211343, + "grad_norm": 1.4805787801742554, + "learning_rate": 4.663052191665347e-05, + "loss": 4.9251, + "step": 28111 + }, + { + "epoch": 0.16719002759539442, + "grad_norm": 1.585306167602539, + "learning_rate": 4.663028771392234e-05, + "loss": 5.3119, + "step": 28112 + }, + { + "epoch": 0.1671959748786754, + "grad_norm": 1.5918222665786743, + "learning_rate": 4.663005350364029e-05, + "loss": 5.1405, + "step": 28113 + }, + { + "epoch": 0.16720192216195642, + "grad_norm": 1.5273454189300537, + "learning_rate": 4.6629819285807426e-05, + "loss": 4.9654, + "step": 28114 + }, + { + "epoch": 0.1672078694452374, + "grad_norm": 2.3424551486968994, + "learning_rate": 4.662958506042381e-05, + "loss": 4.6364, + "step": 28115 + }, + { + "epoch": 0.1672138167285184, + "grad_norm": 1.5244309902191162, + "learning_rate": 4.6629350827489527e-05, + "loss": 5.1469, + "step": 28116 + }, + { + "epoch": 0.16721976401179942, + "grad_norm": 1.6393519639968872, + "learning_rate": 4.662911658700466e-05, + "loss": 5.3803, + "step": 28117 + }, + { + "epoch": 0.1672257112950804, + "grad_norm": 1.6506540775299072, + "learning_rate": 4.662888233896929e-05, + "loss": 5.2188, + "step": 28118 + }, + { + "epoch": 0.1672316585783614, + "grad_norm": 1.481735110282898, + "learning_rate": 4.6628648083383516e-05, + "loss": 5.4692, + "step": 28119 + }, + { + "epoch": 0.1672376058616424, + "grad_norm": 1.5239784717559814, + "learning_rate": 4.662841382024739e-05, + "loss": 5.5937, + "step": 28120 + }, + { + "epoch": 0.1672435531449234, + "grad_norm": 1.7525306940078735, + "learning_rate": 4.662817954956101e-05, + "loss": 5.2913, + "step": 28121 + }, + { + "epoch": 0.16724950042820438, + "grad_norm": 1.5808900594711304, + "learning_rate": 4.662794527132446e-05, + "loss": 4.8341, + "step": 28122 + }, + { + "epoch": 0.1672554477114854, + "grad_norm": 1.7503292560577393, + "learning_rate": 4.662771098553782e-05, + "loss": 4.6066, + "step": 28123 + }, + { + "epoch": 0.1672613949947664, + "grad_norm": 2.0583229064941406, + "learning_rate": 4.662747669220116e-05, + "loss": 4.747, + "step": 28124 + }, + { + "epoch": 0.16726734227804738, + "grad_norm": 1.8209635019302368, + "learning_rate": 4.662724239131458e-05, + "loss": 4.6837, + "step": 28125 + }, + { + "epoch": 0.1672732895613284, + "grad_norm": 1.3161481618881226, + "learning_rate": 4.662700808287815e-05, + "loss": 5.1877, + "step": 28126 + }, + { + "epoch": 0.16727923684460938, + "grad_norm": 1.492100715637207, + "learning_rate": 4.662677376689195e-05, + "loss": 5.0719, + "step": 28127 + }, + { + "epoch": 0.16728518412789037, + "grad_norm": 1.5123339891433716, + "learning_rate": 4.662653944335608e-05, + "loss": 5.2237, + "step": 28128 + }, + { + "epoch": 0.16729113141117138, + "grad_norm": 1.3963336944580078, + "learning_rate": 4.66263051122706e-05, + "loss": 5.5465, + "step": 28129 + }, + { + "epoch": 0.16729707869445237, + "grad_norm": 1.4128196239471436, + "learning_rate": 4.662607077363559e-05, + "loss": 5.4236, + "step": 28130 + }, + { + "epoch": 0.16730302597773336, + "grad_norm": 1.5107556581497192, + "learning_rate": 4.662583642745116e-05, + "loss": 5.411, + "step": 28131 + }, + { + "epoch": 0.16730897326101438, + "grad_norm": 1.4282488822937012, + "learning_rate": 4.662560207371737e-05, + "loss": 5.4301, + "step": 28132 + }, + { + "epoch": 0.16731492054429536, + "grad_norm": 1.7082507610321045, + "learning_rate": 4.6625367712434295e-05, + "loss": 5.2167, + "step": 28133 + }, + { + "epoch": 0.16732086782757635, + "grad_norm": 1.4769392013549805, + "learning_rate": 4.662513334360204e-05, + "loss": 4.8894, + "step": 28134 + }, + { + "epoch": 0.16732681511085737, + "grad_norm": 1.6305506229400635, + "learning_rate": 4.6624898967220664e-05, + "loss": 5.2891, + "step": 28135 + }, + { + "epoch": 0.16733276239413836, + "grad_norm": 1.4358271360397339, + "learning_rate": 4.662466458329027e-05, + "loss": 5.4362, + "step": 28136 + }, + { + "epoch": 0.16733870967741934, + "grad_norm": 1.3945128917694092, + "learning_rate": 4.662443019181092e-05, + "loss": 5.4208, + "step": 28137 + }, + { + "epoch": 0.16734465696070036, + "grad_norm": 1.3432549238204956, + "learning_rate": 4.662419579278271e-05, + "loss": 5.4326, + "step": 28138 + }, + { + "epoch": 0.16735060424398135, + "grad_norm": 1.3106540441513062, + "learning_rate": 4.662396138620571e-05, + "loss": 5.554, + "step": 28139 + }, + { + "epoch": 0.16735655152726234, + "grad_norm": 1.449013590812683, + "learning_rate": 4.662372697208002e-05, + "loss": 5.3896, + "step": 28140 + }, + { + "epoch": 0.16736249881054335, + "grad_norm": 1.2621738910675049, + "learning_rate": 4.66234925504057e-05, + "loss": 5.5235, + "step": 28141 + }, + { + "epoch": 0.16736844609382434, + "grad_norm": 1.5813289880752563, + "learning_rate": 4.6623258121182845e-05, + "loss": 5.4607, + "step": 28142 + }, + { + "epoch": 0.16737439337710533, + "grad_norm": 1.4719443321228027, + "learning_rate": 4.662302368441154e-05, + "loss": 5.2416, + "step": 28143 + }, + { + "epoch": 0.16738034066038635, + "grad_norm": 1.3261717557907104, + "learning_rate": 4.662278924009185e-05, + "loss": 5.2426, + "step": 28144 + }, + { + "epoch": 0.16738628794366733, + "grad_norm": 1.409119725227356, + "learning_rate": 4.6622554788223874e-05, + "loss": 4.8306, + "step": 28145 + }, + { + "epoch": 0.16739223522694832, + "grad_norm": 1.3746771812438965, + "learning_rate": 4.662232032880769e-05, + "loss": 5.3939, + "step": 28146 + }, + { + "epoch": 0.16739818251022934, + "grad_norm": 1.5453044176101685, + "learning_rate": 4.662208586184337e-05, + "loss": 5.2989, + "step": 28147 + }, + { + "epoch": 0.16740412979351033, + "grad_norm": 2.140986919403076, + "learning_rate": 4.6621851387331003e-05, + "loss": 4.7526, + "step": 28148 + }, + { + "epoch": 0.1674100770767913, + "grad_norm": 1.305344820022583, + "learning_rate": 4.662161690527068e-05, + "loss": 5.3339, + "step": 28149 + }, + { + "epoch": 0.16741602436007233, + "grad_norm": 1.200656533241272, + "learning_rate": 4.662138241566247e-05, + "loss": 5.2464, + "step": 28150 + }, + { + "epoch": 0.16742197164335332, + "grad_norm": 1.2441010475158691, + "learning_rate": 4.6621147918506457e-05, + "loss": 5.4545, + "step": 28151 + }, + { + "epoch": 0.1674279189266343, + "grad_norm": 1.6146814823150635, + "learning_rate": 4.662091341380272e-05, + "loss": 4.9968, + "step": 28152 + }, + { + "epoch": 0.16743386620991532, + "grad_norm": 1.2502530813217163, + "learning_rate": 4.6620678901551354e-05, + "loss": 5.3297, + "step": 28153 + }, + { + "epoch": 0.1674398134931963, + "grad_norm": 1.5260026454925537, + "learning_rate": 4.662044438175243e-05, + "loss": 5.2643, + "step": 28154 + }, + { + "epoch": 0.1674457607764773, + "grad_norm": 1.2725012302398682, + "learning_rate": 4.662020985440603e-05, + "loss": 5.4469, + "step": 28155 + }, + { + "epoch": 0.16745170805975831, + "grad_norm": 1.717331051826477, + "learning_rate": 4.661997531951224e-05, + "loss": 5.2711, + "step": 28156 + }, + { + "epoch": 0.1674576553430393, + "grad_norm": 1.6104686260223389, + "learning_rate": 4.661974077707114e-05, + "loss": 5.0773, + "step": 28157 + }, + { + "epoch": 0.1674636026263203, + "grad_norm": 1.568558692932129, + "learning_rate": 4.661950622708281e-05, + "loss": 4.4339, + "step": 28158 + }, + { + "epoch": 0.1674695499096013, + "grad_norm": 1.5101975202560425, + "learning_rate": 4.661927166954734e-05, + "loss": 3.9035, + "step": 28159 + }, + { + "epoch": 0.1674754971928823, + "grad_norm": 1.6529417037963867, + "learning_rate": 4.66190371044648e-05, + "loss": 3.8917, + "step": 28160 + }, + { + "epoch": 0.16748144447616328, + "grad_norm": 1.2637635469436646, + "learning_rate": 4.6618802531835285e-05, + "loss": 5.2091, + "step": 28161 + }, + { + "epoch": 0.1674873917594443, + "grad_norm": 1.4303425550460815, + "learning_rate": 4.661856795165886e-05, + "loss": 5.368, + "step": 28162 + }, + { + "epoch": 0.1674933390427253, + "grad_norm": 1.8119208812713623, + "learning_rate": 4.661833336393562e-05, + "loss": 4.257, + "step": 28163 + }, + { + "epoch": 0.16749928632600627, + "grad_norm": 2.0059077739715576, + "learning_rate": 4.661809876866564e-05, + "loss": 4.225, + "step": 28164 + }, + { + "epoch": 0.1675052336092873, + "grad_norm": 1.87846839427948, + "learning_rate": 4.6617864165849005e-05, + "loss": 4.182, + "step": 28165 + }, + { + "epoch": 0.16751118089256828, + "grad_norm": 1.5655750036239624, + "learning_rate": 4.66176295554858e-05, + "loss": 5.441, + "step": 28166 + }, + { + "epoch": 0.16751712817584927, + "grad_norm": 1.735921025276184, + "learning_rate": 4.661739493757611e-05, + "loss": 5.1415, + "step": 28167 + }, + { + "epoch": 0.16752307545913028, + "grad_norm": 1.6819477081298828, + "learning_rate": 4.661716031212e-05, + "loss": 5.2213, + "step": 28168 + }, + { + "epoch": 0.16752902274241127, + "grad_norm": 1.5038045644760132, + "learning_rate": 4.661692567911756e-05, + "loss": 4.3357, + "step": 28169 + }, + { + "epoch": 0.16753497002569226, + "grad_norm": 1.8683745861053467, + "learning_rate": 4.6616691038568885e-05, + "loss": 4.5498, + "step": 28170 + }, + { + "epoch": 0.16754091730897325, + "grad_norm": 1.6156747341156006, + "learning_rate": 4.661645639047405e-05, + "loss": 4.7422, + "step": 28171 + }, + { + "epoch": 0.16754686459225426, + "grad_norm": 1.8638094663619995, + "learning_rate": 4.661622173483312e-05, + "loss": 4.4363, + "step": 28172 + }, + { + "epoch": 0.16755281187553525, + "grad_norm": 1.800417184829712, + "learning_rate": 4.6615987071646194e-05, + "loss": 4.355, + "step": 28173 + }, + { + "epoch": 0.16755875915881624, + "grad_norm": 1.765234351158142, + "learning_rate": 4.661575240091336e-05, + "loss": 4.3521, + "step": 28174 + }, + { + "epoch": 0.16756470644209726, + "grad_norm": 1.7296849489212036, + "learning_rate": 4.661551772263468e-05, + "loss": 4.8884, + "step": 28175 + }, + { + "epoch": 0.16757065372537824, + "grad_norm": 1.609222650527954, + "learning_rate": 4.661528303681025e-05, + "loss": 4.6088, + "step": 28176 + }, + { + "epoch": 0.16757660100865923, + "grad_norm": 1.910651445388794, + "learning_rate": 4.6615048343440145e-05, + "loss": 4.3531, + "step": 28177 + }, + { + "epoch": 0.16758254829194025, + "grad_norm": 1.6934939622879028, + "learning_rate": 4.6614813642524454e-05, + "loss": 4.1895, + "step": 28178 + }, + { + "epoch": 0.16758849557522124, + "grad_norm": 1.630308985710144, + "learning_rate": 4.6614578934063244e-05, + "loss": 4.5883, + "step": 28179 + }, + { + "epoch": 0.16759444285850222, + "grad_norm": 1.4629896879196167, + "learning_rate": 4.6614344218056624e-05, + "loss": 4.4655, + "step": 28180 + }, + { + "epoch": 0.16760039014178324, + "grad_norm": 1.522980809211731, + "learning_rate": 4.6614109494504654e-05, + "loss": 5.3745, + "step": 28181 + }, + { + "epoch": 0.16760633742506423, + "grad_norm": 1.3758256435394287, + "learning_rate": 4.661387476340742e-05, + "loss": 5.4374, + "step": 28182 + }, + { + "epoch": 0.16761228470834522, + "grad_norm": 1.4767520427703857, + "learning_rate": 4.661364002476501e-05, + "loss": 5.4039, + "step": 28183 + }, + { + "epoch": 0.16761823199162623, + "grad_norm": 1.3167197704315186, + "learning_rate": 4.661340527857749e-05, + "loss": 5.3886, + "step": 28184 + }, + { + "epoch": 0.16762417927490722, + "grad_norm": 1.8137489557266235, + "learning_rate": 4.661317052484496e-05, + "loss": 4.6928, + "step": 28185 + }, + { + "epoch": 0.1676301265581882, + "grad_norm": 1.7553741931915283, + "learning_rate": 4.66129357635675e-05, + "loss": 5.0159, + "step": 28186 + }, + { + "epoch": 0.16763607384146922, + "grad_norm": 1.341352939605713, + "learning_rate": 4.661270099474518e-05, + "loss": 5.4529, + "step": 28187 + }, + { + "epoch": 0.1676420211247502, + "grad_norm": 1.553514003753662, + "learning_rate": 4.661246621837809e-05, + "loss": 5.1907, + "step": 28188 + }, + { + "epoch": 0.1676479684080312, + "grad_norm": 1.4974607229232788, + "learning_rate": 4.661223143446631e-05, + "loss": 5.2143, + "step": 28189 + }, + { + "epoch": 0.16765391569131222, + "grad_norm": 1.5769060850143433, + "learning_rate": 4.661199664300993e-05, + "loss": 5.1265, + "step": 28190 + }, + { + "epoch": 0.1676598629745932, + "grad_norm": 1.4753527641296387, + "learning_rate": 4.6611761844009026e-05, + "loss": 4.974, + "step": 28191 + }, + { + "epoch": 0.1676658102578742, + "grad_norm": 1.5406947135925293, + "learning_rate": 4.661152703746368e-05, + "loss": 4.8269, + "step": 28192 + }, + { + "epoch": 0.1676717575411552, + "grad_norm": 1.864577054977417, + "learning_rate": 4.661129222337397e-05, + "loss": 4.505, + "step": 28193 + }, + { + "epoch": 0.1676777048244362, + "grad_norm": 1.561606526374817, + "learning_rate": 4.6611057401739976e-05, + "loss": 4.6992, + "step": 28194 + }, + { + "epoch": 0.16768365210771718, + "grad_norm": 1.6339094638824463, + "learning_rate": 4.661082257256179e-05, + "loss": 4.8973, + "step": 28195 + }, + { + "epoch": 0.1676895993909982, + "grad_norm": 1.8106483221054077, + "learning_rate": 4.661058773583949e-05, + "loss": 4.5909, + "step": 28196 + }, + { + "epoch": 0.1676955466742792, + "grad_norm": 1.6181379556655884, + "learning_rate": 4.661035289157316e-05, + "loss": 5.225, + "step": 28197 + }, + { + "epoch": 0.16770149395756018, + "grad_norm": 1.8745672702789307, + "learning_rate": 4.6610118039762876e-05, + "loss": 4.6381, + "step": 28198 + }, + { + "epoch": 0.1677074412408412, + "grad_norm": 1.6809148788452148, + "learning_rate": 4.6609883180408717e-05, + "loss": 4.879, + "step": 28199 + }, + { + "epoch": 0.16771338852412218, + "grad_norm": 1.6960088014602661, + "learning_rate": 4.660964831351078e-05, + "loss": 4.8171, + "step": 28200 + }, + { + "epoch": 0.16771933580740317, + "grad_norm": 1.8078324794769287, + "learning_rate": 4.660941343906913e-05, + "loss": 4.4722, + "step": 28201 + }, + { + "epoch": 0.16772528309068419, + "grad_norm": 1.6765756607055664, + "learning_rate": 4.660917855708386e-05, + "loss": 4.3086, + "step": 28202 + }, + { + "epoch": 0.16773123037396517, + "grad_norm": 1.608927845954895, + "learning_rate": 4.660894366755505e-05, + "loss": 4.5967, + "step": 28203 + }, + { + "epoch": 0.16773717765724616, + "grad_norm": 2.0235023498535156, + "learning_rate": 4.660870877048278e-05, + "loss": 4.4936, + "step": 28204 + }, + { + "epoch": 0.16774312494052718, + "grad_norm": 1.6895809173583984, + "learning_rate": 4.660847386586713e-05, + "loss": 4.9949, + "step": 28205 + }, + { + "epoch": 0.16774907222380817, + "grad_norm": 1.6481704711914062, + "learning_rate": 4.660823895370819e-05, + "loss": 5.2061, + "step": 28206 + }, + { + "epoch": 0.16775501950708915, + "grad_norm": 1.5078449249267578, + "learning_rate": 4.660800403400604e-05, + "loss": 5.0231, + "step": 28207 + }, + { + "epoch": 0.16776096679037017, + "grad_norm": 1.6977524757385254, + "learning_rate": 4.660776910676076e-05, + "loss": 4.9922, + "step": 28208 + }, + { + "epoch": 0.16776691407365116, + "grad_norm": 1.826011300086975, + "learning_rate": 4.6607534171972425e-05, + "loss": 4.2673, + "step": 28209 + }, + { + "epoch": 0.16777286135693215, + "grad_norm": 2.544302463531494, + "learning_rate": 4.660729922964112e-05, + "loss": 4.3124, + "step": 28210 + }, + { + "epoch": 0.16777880864021316, + "grad_norm": 1.7719815969467163, + "learning_rate": 4.660706427976693e-05, + "loss": 4.249, + "step": 28211 + }, + { + "epoch": 0.16778475592349415, + "grad_norm": 1.6741911172866821, + "learning_rate": 4.660682932234994e-05, + "loss": 4.3522, + "step": 28212 + }, + { + "epoch": 0.16779070320677514, + "grad_norm": 1.6827515363693237, + "learning_rate": 4.660659435739023e-05, + "loss": 4.3316, + "step": 28213 + }, + { + "epoch": 0.16779665049005615, + "grad_norm": 1.722598671913147, + "learning_rate": 4.6606359384887884e-05, + "loss": 4.3367, + "step": 28214 + }, + { + "epoch": 0.16780259777333714, + "grad_norm": 1.7667568922042847, + "learning_rate": 4.660612440484298e-05, + "loss": 4.2754, + "step": 28215 + }, + { + "epoch": 0.16780854505661813, + "grad_norm": 1.7074247598648071, + "learning_rate": 4.6605889417255596e-05, + "loss": 4.2489, + "step": 28216 + }, + { + "epoch": 0.16781449233989915, + "grad_norm": 1.8784146308898926, + "learning_rate": 4.6605654422125836e-05, + "loss": 4.4672, + "step": 28217 + }, + { + "epoch": 0.16782043962318013, + "grad_norm": 1.909641981124878, + "learning_rate": 4.660541941945374e-05, + "loss": 4.5413, + "step": 28218 + }, + { + "epoch": 0.16782638690646112, + "grad_norm": 1.4848551750183105, + "learning_rate": 4.660518440923943e-05, + "loss": 4.6922, + "step": 28219 + }, + { + "epoch": 0.16783233418974214, + "grad_norm": 1.5976632833480835, + "learning_rate": 4.6604949391482974e-05, + "loss": 4.7525, + "step": 28220 + }, + { + "epoch": 0.16783828147302313, + "grad_norm": 1.609236478805542, + "learning_rate": 4.6604714366184455e-05, + "loss": 5.1537, + "step": 28221 + }, + { + "epoch": 0.16784422875630411, + "grad_norm": 1.4178111553192139, + "learning_rate": 4.660447933334394e-05, + "loss": 5.0935, + "step": 28222 + }, + { + "epoch": 0.16785017603958513, + "grad_norm": 1.7521015405654907, + "learning_rate": 4.660424429296154e-05, + "loss": 4.6712, + "step": 28223 + }, + { + "epoch": 0.16785612332286612, + "grad_norm": 1.8282933235168457, + "learning_rate": 4.660400924503731e-05, + "loss": 5.8207, + "step": 28224 + }, + { + "epoch": 0.1678620706061471, + "grad_norm": 1.5437854528427124, + "learning_rate": 4.6603774189571345e-05, + "loss": 5.751, + "step": 28225 + }, + { + "epoch": 0.16786801788942812, + "grad_norm": 1.723281979560852, + "learning_rate": 4.660353912656373e-05, + "loss": 4.6481, + "step": 28226 + }, + { + "epoch": 0.1678739651727091, + "grad_norm": 1.718805193901062, + "learning_rate": 4.6603304056014545e-05, + "loss": 5.4971, + "step": 28227 + }, + { + "epoch": 0.1678799124559901, + "grad_norm": 1.6174219846725464, + "learning_rate": 4.660306897792387e-05, + "loss": 4.475, + "step": 28228 + }, + { + "epoch": 0.1678858597392711, + "grad_norm": 1.8539583683013916, + "learning_rate": 4.660283389229178e-05, + "loss": 4.3182, + "step": 28229 + }, + { + "epoch": 0.1678918070225521, + "grad_norm": 1.6682637929916382, + "learning_rate": 4.660259879911837e-05, + "loss": 4.5625, + "step": 28230 + }, + { + "epoch": 0.1678977543058331, + "grad_norm": 1.825737714767456, + "learning_rate": 4.660236369840371e-05, + "loss": 4.1975, + "step": 28231 + }, + { + "epoch": 0.16790370158911408, + "grad_norm": 1.6130248308181763, + "learning_rate": 4.6602128590147894e-05, + "loss": 5.6634, + "step": 28232 + }, + { + "epoch": 0.1679096488723951, + "grad_norm": 1.6243139505386353, + "learning_rate": 4.660189347435099e-05, + "loss": 4.972, + "step": 28233 + }, + { + "epoch": 0.16791559615567608, + "grad_norm": 1.5760700702667236, + "learning_rate": 4.66016583510131e-05, + "loss": 4.7272, + "step": 28234 + }, + { + "epoch": 0.16792154343895707, + "grad_norm": 1.2500736713409424, + "learning_rate": 4.660142322013429e-05, + "loss": 4.469, + "step": 28235 + }, + { + "epoch": 0.1679274907222381, + "grad_norm": 1.3888235092163086, + "learning_rate": 4.660118808171464e-05, + "loss": 5.3952, + "step": 28236 + }, + { + "epoch": 0.16793343800551908, + "grad_norm": 1.3789753913879395, + "learning_rate": 4.660095293575424e-05, + "loss": 5.8424, + "step": 28237 + }, + { + "epoch": 0.16793938528880006, + "grad_norm": 1.1890273094177246, + "learning_rate": 4.660071778225317e-05, + "loss": 5.9341, + "step": 28238 + }, + { + "epoch": 0.16794533257208108, + "grad_norm": 1.3315849304199219, + "learning_rate": 4.660048262121152e-05, + "loss": 5.9202, + "step": 28239 + }, + { + "epoch": 0.16795127985536207, + "grad_norm": 1.5866754055023193, + "learning_rate": 4.6600247452629365e-05, + "loss": 5.1867, + "step": 28240 + }, + { + "epoch": 0.16795722713864306, + "grad_norm": 1.842445969581604, + "learning_rate": 4.660001227650678e-05, + "loss": 4.4602, + "step": 28241 + }, + { + "epoch": 0.16796317442192407, + "grad_norm": 1.7466117143630981, + "learning_rate": 4.6599777092843855e-05, + "loss": 4.696, + "step": 28242 + }, + { + "epoch": 0.16796912170520506, + "grad_norm": 1.5599199533462524, + "learning_rate": 4.6599541901640665e-05, + "loss": 4.5027, + "step": 28243 + }, + { + "epoch": 0.16797506898848605, + "grad_norm": 1.3156886100769043, + "learning_rate": 4.6599306702897304e-05, + "loss": 4.2991, + "step": 28244 + }, + { + "epoch": 0.16798101627176706, + "grad_norm": 1.372679352760315, + "learning_rate": 4.659907149661386e-05, + "loss": 4.6257, + "step": 28245 + }, + { + "epoch": 0.16798696355504805, + "grad_norm": 1.599493384361267, + "learning_rate": 4.659883628279039e-05, + "loss": 4.4781, + "step": 28246 + }, + { + "epoch": 0.16799291083832904, + "grad_norm": 1.516619324684143, + "learning_rate": 4.6598601061426986e-05, + "loss": 4.4817, + "step": 28247 + }, + { + "epoch": 0.16799885812161006, + "grad_norm": 1.6319454908370972, + "learning_rate": 4.6598365832523736e-05, + "loss": 4.4314, + "step": 28248 + }, + { + "epoch": 0.16800480540489104, + "grad_norm": 1.5013442039489746, + "learning_rate": 4.6598130596080726e-05, + "loss": 4.3608, + "step": 28249 + }, + { + "epoch": 0.16801075268817203, + "grad_norm": 1.5573625564575195, + "learning_rate": 4.659789535209803e-05, + "loss": 4.38, + "step": 28250 + }, + { + "epoch": 0.16801669997145305, + "grad_norm": 1.5244330167770386, + "learning_rate": 4.659766010057574e-05, + "loss": 4.4152, + "step": 28251 + }, + { + "epoch": 0.16802264725473404, + "grad_norm": 2.792175054550171, + "learning_rate": 4.659742484151391e-05, + "loss": 3.7226, + "step": 28252 + }, + { + "epoch": 0.16802859453801502, + "grad_norm": 2.0370240211486816, + "learning_rate": 4.6597189574912654e-05, + "loss": 4.1552, + "step": 28253 + }, + { + "epoch": 0.16803454182129604, + "grad_norm": 1.6263444423675537, + "learning_rate": 4.6596954300772044e-05, + "loss": 4.7215, + "step": 28254 + }, + { + "epoch": 0.16804048910457703, + "grad_norm": 1.6130170822143555, + "learning_rate": 4.659671901909215e-05, + "loss": 4.5078, + "step": 28255 + }, + { + "epoch": 0.16804643638785802, + "grad_norm": 1.3925176858901978, + "learning_rate": 4.659648372987308e-05, + "loss": 4.6085, + "step": 28256 + }, + { + "epoch": 0.16805238367113903, + "grad_norm": 1.4680298566818237, + "learning_rate": 4.6596248433114886e-05, + "loss": 4.4605, + "step": 28257 + }, + { + "epoch": 0.16805833095442002, + "grad_norm": 1.9639580249786377, + "learning_rate": 4.659601312881767e-05, + "loss": 4.1688, + "step": 28258 + }, + { + "epoch": 0.168064278237701, + "grad_norm": 1.7880107164382935, + "learning_rate": 4.6595777816981515e-05, + "loss": 4.3835, + "step": 28259 + }, + { + "epoch": 0.16807022552098203, + "grad_norm": 1.8420106172561646, + "learning_rate": 4.659554249760649e-05, + "loss": 4.4068, + "step": 28260 + }, + { + "epoch": 0.168076172804263, + "grad_norm": 1.7331891059875488, + "learning_rate": 4.659530717069269e-05, + "loss": 4.2069, + "step": 28261 + }, + { + "epoch": 0.168082120087544, + "grad_norm": 1.6757560968399048, + "learning_rate": 4.659507183624019e-05, + "loss": 4.7915, + "step": 28262 + }, + { + "epoch": 0.16808806737082502, + "grad_norm": 1.6277943849563599, + "learning_rate": 4.6594836494249066e-05, + "loss": 4.431, + "step": 28263 + }, + { + "epoch": 0.168094014654106, + "grad_norm": 1.9865028858184814, + "learning_rate": 4.6594601144719406e-05, + "loss": 4.8244, + "step": 28264 + }, + { + "epoch": 0.168099961937387, + "grad_norm": 1.818390130996704, + "learning_rate": 4.659436578765131e-05, + "loss": 4.7089, + "step": 28265 + }, + { + "epoch": 0.168105909220668, + "grad_norm": 1.3201099634170532, + "learning_rate": 4.6594130423044836e-05, + "loss": 4.8117, + "step": 28266 + }, + { + "epoch": 0.168111856503949, + "grad_norm": 1.7755099534988403, + "learning_rate": 4.6593895050900074e-05, + "loss": 4.4389, + "step": 28267 + }, + { + "epoch": 0.16811780378722999, + "grad_norm": 1.6653193235397339, + "learning_rate": 4.65936596712171e-05, + "loss": 4.3489, + "step": 28268 + }, + { + "epoch": 0.168123751070511, + "grad_norm": 1.4699918031692505, + "learning_rate": 4.6593424283996004e-05, + "loss": 4.935, + "step": 28269 + }, + { + "epoch": 0.168129698353792, + "grad_norm": 1.8290356397628784, + "learning_rate": 4.659318888923687e-05, + "loss": 5.1348, + "step": 28270 + }, + { + "epoch": 0.16813564563707298, + "grad_norm": 1.7782410383224487, + "learning_rate": 4.6592953486939784e-05, + "loss": 5.1601, + "step": 28271 + }, + { + "epoch": 0.168141592920354, + "grad_norm": 1.8384326696395874, + "learning_rate": 4.6592718077104814e-05, + "loss": 4.7923, + "step": 28272 + }, + { + "epoch": 0.16814754020363498, + "grad_norm": 1.6723445653915405, + "learning_rate": 4.659248265973205e-05, + "loss": 4.9497, + "step": 28273 + }, + { + "epoch": 0.16815348748691597, + "grad_norm": 1.4820493459701538, + "learning_rate": 4.6592247234821575e-05, + "loss": 4.3104, + "step": 28274 + }, + { + "epoch": 0.16815943477019699, + "grad_norm": 1.4215086698532104, + "learning_rate": 4.659201180237346e-05, + "loss": 4.5723, + "step": 28275 + }, + { + "epoch": 0.16816538205347797, + "grad_norm": 1.6446219682693481, + "learning_rate": 4.6591776362387804e-05, + "loss": 4.6208, + "step": 28276 + }, + { + "epoch": 0.16817132933675896, + "grad_norm": 1.6352293491363525, + "learning_rate": 4.6591540914864686e-05, + "loss": 5.03, + "step": 28277 + }, + { + "epoch": 0.16817727662003998, + "grad_norm": 1.59463369846344, + "learning_rate": 4.659130545980418e-05, + "loss": 4.5116, + "step": 28278 + }, + { + "epoch": 0.16818322390332097, + "grad_norm": 1.8565449714660645, + "learning_rate": 4.659106999720637e-05, + "loss": 4.4572, + "step": 28279 + }, + { + "epoch": 0.16818917118660195, + "grad_norm": 1.7354021072387695, + "learning_rate": 4.659083452707135e-05, + "loss": 4.9343, + "step": 28280 + }, + { + "epoch": 0.16819511846988297, + "grad_norm": 1.8169907331466675, + "learning_rate": 4.659059904939918e-05, + "loss": 4.6285, + "step": 28281 + }, + { + "epoch": 0.16820106575316396, + "grad_norm": 1.6343300342559814, + "learning_rate": 4.659036356418996e-05, + "loss": 4.6125, + "step": 28282 + }, + { + "epoch": 0.16820701303644495, + "grad_norm": 1.5487629175186157, + "learning_rate": 4.659012807144377e-05, + "loss": 4.5907, + "step": 28283 + }, + { + "epoch": 0.16821296031972596, + "grad_norm": 1.4640655517578125, + "learning_rate": 4.658989257116069e-05, + "loss": 4.4199, + "step": 28284 + }, + { + "epoch": 0.16821890760300695, + "grad_norm": 1.4370266199111938, + "learning_rate": 4.65896570633408e-05, + "loss": 4.5677, + "step": 28285 + }, + { + "epoch": 0.16822485488628794, + "grad_norm": 1.6564301252365112, + "learning_rate": 4.658942154798418e-05, + "loss": 4.5189, + "step": 28286 + }, + { + "epoch": 0.16823080216956893, + "grad_norm": 1.6301320791244507, + "learning_rate": 4.658918602509091e-05, + "loss": 4.9653, + "step": 28287 + }, + { + "epoch": 0.16823674945284994, + "grad_norm": 1.5462539196014404, + "learning_rate": 4.6588950494661096e-05, + "loss": 5.011, + "step": 28288 + }, + { + "epoch": 0.16824269673613093, + "grad_norm": 1.7004579305648804, + "learning_rate": 4.658871495669479e-05, + "loss": 4.7863, + "step": 28289 + }, + { + "epoch": 0.16824864401941192, + "grad_norm": 1.47449791431427, + "learning_rate": 4.658847941119209e-05, + "loss": 4.8344, + "step": 28290 + }, + { + "epoch": 0.16825459130269294, + "grad_norm": 1.7310223579406738, + "learning_rate": 4.658824385815308e-05, + "loss": 4.5996, + "step": 28291 + }, + { + "epoch": 0.16826053858597392, + "grad_norm": 1.5716323852539062, + "learning_rate": 4.658800829757782e-05, + "loss": 4.6623, + "step": 28292 + }, + { + "epoch": 0.1682664858692549, + "grad_norm": 1.8458023071289062, + "learning_rate": 4.6587772729466426e-05, + "loss": 4.8966, + "step": 28293 + }, + { + "epoch": 0.16827243315253593, + "grad_norm": 1.4939119815826416, + "learning_rate": 4.658753715381896e-05, + "loss": 4.9607, + "step": 28294 + }, + { + "epoch": 0.16827838043581692, + "grad_norm": 1.6060224771499634, + "learning_rate": 4.658730157063551e-05, + "loss": 4.9144, + "step": 28295 + }, + { + "epoch": 0.1682843277190979, + "grad_norm": 1.6743205785751343, + "learning_rate": 4.658706597991615e-05, + "loss": 5.1634, + "step": 28296 + }, + { + "epoch": 0.16829027500237892, + "grad_norm": 1.6277934312820435, + "learning_rate": 4.658683038166097e-05, + "loss": 4.5367, + "step": 28297 + }, + { + "epoch": 0.1682962222856599, + "grad_norm": 2.8272674083709717, + "learning_rate": 4.658659477587005e-05, + "loss": 4.5467, + "step": 28298 + }, + { + "epoch": 0.1683021695689409, + "grad_norm": 2.199181318283081, + "learning_rate": 4.658635916254348e-05, + "loss": 4.595, + "step": 28299 + }, + { + "epoch": 0.1683081168522219, + "grad_norm": 1.860811710357666, + "learning_rate": 4.6586123541681324e-05, + "loss": 4.6934, + "step": 28300 + }, + { + "epoch": 0.1683140641355029, + "grad_norm": 1.5959035158157349, + "learning_rate": 4.6585887913283685e-05, + "loss": 4.5346, + "step": 28301 + }, + { + "epoch": 0.1683200114187839, + "grad_norm": 1.503235936164856, + "learning_rate": 4.658565227735063e-05, + "loss": 4.7135, + "step": 28302 + }, + { + "epoch": 0.1683259587020649, + "grad_norm": 1.5272914171218872, + "learning_rate": 4.658541663388225e-05, + "loss": 4.507, + "step": 28303 + }, + { + "epoch": 0.1683319059853459, + "grad_norm": 1.7282012701034546, + "learning_rate": 4.6585180982878615e-05, + "loss": 4.4787, + "step": 28304 + }, + { + "epoch": 0.16833785326862688, + "grad_norm": 1.6522059440612793, + "learning_rate": 4.6584945324339823e-05, + "loss": 4.5825, + "step": 28305 + }, + { + "epoch": 0.1683438005519079, + "grad_norm": 1.3752492666244507, + "learning_rate": 4.6584709658265955e-05, + "loss": 4.7064, + "step": 28306 + }, + { + "epoch": 0.16834974783518888, + "grad_norm": 2.415187358856201, + "learning_rate": 4.6584473984657086e-05, + "loss": 4.1959, + "step": 28307 + }, + { + "epoch": 0.16835569511846987, + "grad_norm": 1.545029640197754, + "learning_rate": 4.6584238303513295e-05, + "loss": 4.426, + "step": 28308 + }, + { + "epoch": 0.1683616424017509, + "grad_norm": 1.6749895811080933, + "learning_rate": 4.6584002614834666e-05, + "loss": 5.19, + "step": 28309 + }, + { + "epoch": 0.16836758968503188, + "grad_norm": 1.5567103624343872, + "learning_rate": 4.65837669186213e-05, + "loss": 4.854, + "step": 28310 + }, + { + "epoch": 0.16837353696831286, + "grad_norm": 1.2138694524765015, + "learning_rate": 4.658353121487324e-05, + "loss": 4.6035, + "step": 28311 + }, + { + "epoch": 0.16837948425159388, + "grad_norm": 1.4592459201812744, + "learning_rate": 4.658329550359061e-05, + "loss": 4.6315, + "step": 28312 + }, + { + "epoch": 0.16838543153487487, + "grad_norm": 1.5305829048156738, + "learning_rate": 4.658305978477348e-05, + "loss": 4.9041, + "step": 28313 + }, + { + "epoch": 0.16839137881815586, + "grad_norm": 2.0584359169006348, + "learning_rate": 4.658282405842191e-05, + "loss": 3.7849, + "step": 28314 + }, + { + "epoch": 0.16839732610143687, + "grad_norm": 3.1896352767944336, + "learning_rate": 4.658258832453601e-05, + "loss": 3.9083, + "step": 28315 + }, + { + "epoch": 0.16840327338471786, + "grad_norm": 2.942909002304077, + "learning_rate": 4.658235258311584e-05, + "loss": 3.6764, + "step": 28316 + }, + { + "epoch": 0.16840922066799885, + "grad_norm": 3.2764618396759033, + "learning_rate": 4.65821168341615e-05, + "loss": 3.8794, + "step": 28317 + }, + { + "epoch": 0.16841516795127986, + "grad_norm": 2.8366522789001465, + "learning_rate": 4.6581881077673074e-05, + "loss": 4.8133, + "step": 28318 + }, + { + "epoch": 0.16842111523456085, + "grad_norm": 1.551155686378479, + "learning_rate": 4.658164531365063e-05, + "loss": 4.7024, + "step": 28319 + }, + { + "epoch": 0.16842706251784184, + "grad_norm": 2.4063937664031982, + "learning_rate": 4.6581409542094255e-05, + "loss": 3.2516, + "step": 28320 + }, + { + "epoch": 0.16843300980112286, + "grad_norm": 2.5758605003356934, + "learning_rate": 4.658117376300404e-05, + "loss": 3.5301, + "step": 28321 + }, + { + "epoch": 0.16843895708440385, + "grad_norm": 2.643880605697632, + "learning_rate": 4.658093797638005e-05, + "loss": 3.2137, + "step": 28322 + }, + { + "epoch": 0.16844490436768483, + "grad_norm": 2.6048755645751953, + "learning_rate": 4.658070218222238e-05, + "loss": 3.3595, + "step": 28323 + }, + { + "epoch": 0.16845085165096585, + "grad_norm": 2.677281141281128, + "learning_rate": 4.6580466380531116e-05, + "loss": 4.0526, + "step": 28324 + }, + { + "epoch": 0.16845679893424684, + "grad_norm": 2.1559438705444336, + "learning_rate": 4.658023057130633e-05, + "loss": 3.6773, + "step": 28325 + }, + { + "epoch": 0.16846274621752783, + "grad_norm": 2.271451711654663, + "learning_rate": 4.6579994754548105e-05, + "loss": 3.3233, + "step": 28326 + }, + { + "epoch": 0.16846869350080884, + "grad_norm": 2.6819088459014893, + "learning_rate": 4.657975893025653e-05, + "loss": 3.0184, + "step": 28327 + }, + { + "epoch": 0.16847464078408983, + "grad_norm": 2.7791247367858887, + "learning_rate": 4.6579523098431686e-05, + "loss": 3.4093, + "step": 28328 + }, + { + "epoch": 0.16848058806737082, + "grad_norm": 2.7528347969055176, + "learning_rate": 4.6579287259073654e-05, + "loss": 3.0479, + "step": 28329 + }, + { + "epoch": 0.16848653535065183, + "grad_norm": 2.3715124130249023, + "learning_rate": 4.657905141218252e-05, + "loss": 3.7365, + "step": 28330 + }, + { + "epoch": 0.16849248263393282, + "grad_norm": 1.9896430969238281, + "learning_rate": 4.657881555775835e-05, + "loss": 4.6336, + "step": 28331 + }, + { + "epoch": 0.1684984299172138, + "grad_norm": 1.6838959455490112, + "learning_rate": 4.657857969580124e-05, + "loss": 4.8033, + "step": 28332 + }, + { + "epoch": 0.16850437720049483, + "grad_norm": 1.7189829349517822, + "learning_rate": 4.6578343826311274e-05, + "loss": 4.721, + "step": 28333 + }, + { + "epoch": 0.16851032448377581, + "grad_norm": 2.3129501342773438, + "learning_rate": 4.657810794928854e-05, + "loss": 3.626, + "step": 28334 + }, + { + "epoch": 0.1685162717670568, + "grad_norm": 3.216485023498535, + "learning_rate": 4.6577872064733094e-05, + "loss": 3.2259, + "step": 28335 + }, + { + "epoch": 0.16852221905033782, + "grad_norm": 2.995213031768799, + "learning_rate": 4.657763617264506e-05, + "loss": 3.2364, + "step": 28336 + }, + { + "epoch": 0.1685281663336188, + "grad_norm": 2.6219449043273926, + "learning_rate": 4.6577400273024474e-05, + "loss": 4.2354, + "step": 28337 + }, + { + "epoch": 0.1685341136168998, + "grad_norm": 1.6310757398605347, + "learning_rate": 4.657716436587145e-05, + "loss": 5.3334, + "step": 28338 + }, + { + "epoch": 0.1685400609001808, + "grad_norm": 2.375399589538574, + "learning_rate": 4.657692845118605e-05, + "loss": 4.4366, + "step": 28339 + }, + { + "epoch": 0.1685460081834618, + "grad_norm": 1.874076247215271, + "learning_rate": 4.657669252896838e-05, + "loss": 5.2293, + "step": 28340 + }, + { + "epoch": 0.1685519554667428, + "grad_norm": 1.8757516145706177, + "learning_rate": 4.657645659921851e-05, + "loss": 4.6433, + "step": 28341 + }, + { + "epoch": 0.1685579027500238, + "grad_norm": 1.6679904460906982, + "learning_rate": 4.6576220661936514e-05, + "loss": 4.591, + "step": 28342 + }, + { + "epoch": 0.1685638500333048, + "grad_norm": 1.5081669092178345, + "learning_rate": 4.6575984717122487e-05, + "loss": 4.9147, + "step": 28343 + }, + { + "epoch": 0.16856979731658578, + "grad_norm": 1.4801992177963257, + "learning_rate": 4.657574876477651e-05, + "loss": 5.3181, + "step": 28344 + }, + { + "epoch": 0.1685757445998668, + "grad_norm": 1.5100293159484863, + "learning_rate": 4.657551280489865e-05, + "loss": 4.6282, + "step": 28345 + }, + { + "epoch": 0.16858169188314778, + "grad_norm": 1.5850365161895752, + "learning_rate": 4.6575276837489016e-05, + "loss": 4.566, + "step": 28346 + }, + { + "epoch": 0.16858763916642877, + "grad_norm": 1.9910119771957397, + "learning_rate": 4.657504086254766e-05, + "loss": 5.1222, + "step": 28347 + }, + { + "epoch": 0.16859358644970976, + "grad_norm": 1.8456346988677979, + "learning_rate": 4.65748048800747e-05, + "loss": 4.7977, + "step": 28348 + }, + { + "epoch": 0.16859953373299078, + "grad_norm": 2.4570720195770264, + "learning_rate": 4.657456889007018e-05, + "loss": 4.6518, + "step": 28349 + }, + { + "epoch": 0.16860548101627176, + "grad_norm": 2.76509952545166, + "learning_rate": 4.657433289253421e-05, + "loss": 4.2894, + "step": 28350 + }, + { + "epoch": 0.16861142829955275, + "grad_norm": 2.61690616607666, + "learning_rate": 4.657409688746686e-05, + "loss": 4.1016, + "step": 28351 + }, + { + "epoch": 0.16861737558283377, + "grad_norm": 2.678689479827881, + "learning_rate": 4.6573860874868214e-05, + "loss": 4.4325, + "step": 28352 + }, + { + "epoch": 0.16862332286611476, + "grad_norm": 2.1475918292999268, + "learning_rate": 4.657362485473836e-05, + "loss": 4.8043, + "step": 28353 + }, + { + "epoch": 0.16862927014939574, + "grad_norm": 1.7649880647659302, + "learning_rate": 4.657338882707738e-05, + "loss": 5.5315, + "step": 28354 + }, + { + "epoch": 0.16863521743267676, + "grad_norm": 2.451415538787842, + "learning_rate": 4.657315279188534e-05, + "loss": 4.4149, + "step": 28355 + }, + { + "epoch": 0.16864116471595775, + "grad_norm": 2.628056764602661, + "learning_rate": 4.657291674916234e-05, + "loss": 3.9996, + "step": 28356 + }, + { + "epoch": 0.16864711199923874, + "grad_norm": 2.5917954444885254, + "learning_rate": 4.657268069890847e-05, + "loss": 4.1523, + "step": 28357 + }, + { + "epoch": 0.16865305928251975, + "grad_norm": 2.5339810848236084, + "learning_rate": 4.657244464112379e-05, + "loss": 4.1835, + "step": 28358 + }, + { + "epoch": 0.16865900656580074, + "grad_norm": 2.5512847900390625, + "learning_rate": 4.657220857580839e-05, + "loss": 4.2205, + "step": 28359 + }, + { + "epoch": 0.16866495384908173, + "grad_norm": 1.9828633069992065, + "learning_rate": 4.657197250296236e-05, + "loss": 4.5812, + "step": 28360 + }, + { + "epoch": 0.16867090113236274, + "grad_norm": 1.9058914184570312, + "learning_rate": 4.657173642258578e-05, + "loss": 4.9579, + "step": 28361 + }, + { + "epoch": 0.16867684841564373, + "grad_norm": 2.473252534866333, + "learning_rate": 4.657150033467872e-05, + "loss": 4.2123, + "step": 28362 + }, + { + "epoch": 0.16868279569892472, + "grad_norm": 2.2516047954559326, + "learning_rate": 4.657126423924128e-05, + "loss": 4.2096, + "step": 28363 + }, + { + "epoch": 0.16868874298220574, + "grad_norm": 2.4706156253814697, + "learning_rate": 4.657102813627353e-05, + "loss": 4.0615, + "step": 28364 + }, + { + "epoch": 0.16869469026548672, + "grad_norm": 2.5827410221099854, + "learning_rate": 4.657079202577556e-05, + "loss": 4.4003, + "step": 28365 + }, + { + "epoch": 0.1687006375487677, + "grad_norm": 1.812254548072815, + "learning_rate": 4.657055590774745e-05, + "loss": 4.7705, + "step": 28366 + }, + { + "epoch": 0.16870658483204873, + "grad_norm": 1.5623784065246582, + "learning_rate": 4.6570319782189284e-05, + "loss": 5.3618, + "step": 28367 + }, + { + "epoch": 0.16871253211532972, + "grad_norm": 1.9756156206130981, + "learning_rate": 4.657008364910114e-05, + "loss": 5.0061, + "step": 28368 + }, + { + "epoch": 0.1687184793986107, + "grad_norm": 2.592015027999878, + "learning_rate": 4.65698475084831e-05, + "loss": 4.771, + "step": 28369 + }, + { + "epoch": 0.16872442668189172, + "grad_norm": 1.7394741773605347, + "learning_rate": 4.656961136033525e-05, + "loss": 5.4057, + "step": 28370 + }, + { + "epoch": 0.1687303739651727, + "grad_norm": 1.712748646736145, + "learning_rate": 4.656937520465767e-05, + "loss": 5.242, + "step": 28371 + }, + { + "epoch": 0.1687363212484537, + "grad_norm": 1.794945240020752, + "learning_rate": 4.6569139041450446e-05, + "loss": 5.1821, + "step": 28372 + }, + { + "epoch": 0.1687422685317347, + "grad_norm": 1.6122878789901733, + "learning_rate": 4.656890287071366e-05, + "loss": 5.3729, + "step": 28373 + }, + { + "epoch": 0.1687482158150157, + "grad_norm": 1.6189091205596924, + "learning_rate": 4.656866669244739e-05, + "loss": 5.5319, + "step": 28374 + }, + { + "epoch": 0.1687541630982967, + "grad_norm": 1.4604097604751587, + "learning_rate": 4.6568430506651715e-05, + "loss": 5.7885, + "step": 28375 + }, + { + "epoch": 0.1687601103815777, + "grad_norm": 1.4060790538787842, + "learning_rate": 4.656819431332673e-05, + "loss": 5.8022, + "step": 28376 + }, + { + "epoch": 0.1687660576648587, + "grad_norm": 1.4350751638412476, + "learning_rate": 4.6567958112472515e-05, + "loss": 5.8437, + "step": 28377 + }, + { + "epoch": 0.16877200494813968, + "grad_norm": 1.572094202041626, + "learning_rate": 4.656772190408914e-05, + "loss": 5.2559, + "step": 28378 + }, + { + "epoch": 0.1687779522314207, + "grad_norm": 1.5529630184173584, + "learning_rate": 4.656748568817671e-05, + "loss": 5.325, + "step": 28379 + }, + { + "epoch": 0.16878389951470169, + "grad_norm": 1.5496705770492554, + "learning_rate": 4.656724946473528e-05, + "loss": 5.2824, + "step": 28380 + }, + { + "epoch": 0.16878984679798267, + "grad_norm": 1.4349329471588135, + "learning_rate": 4.656701323376496e-05, + "loss": 5.3192, + "step": 28381 + }, + { + "epoch": 0.1687957940812637, + "grad_norm": 1.391747236251831, + "learning_rate": 4.6566776995265804e-05, + "loss": 5.2476, + "step": 28382 + }, + { + "epoch": 0.16880174136454468, + "grad_norm": 1.3532518148422241, + "learning_rate": 4.6566540749237916e-05, + "loss": 5.1795, + "step": 28383 + }, + { + "epoch": 0.16880768864782567, + "grad_norm": 1.4906384944915771, + "learning_rate": 4.656630449568137e-05, + "loss": 5.3211, + "step": 28384 + }, + { + "epoch": 0.16881363593110668, + "grad_norm": 1.560478687286377, + "learning_rate": 4.656606823459625e-05, + "loss": 5.2823, + "step": 28385 + }, + { + "epoch": 0.16881958321438767, + "grad_norm": 1.6834107637405396, + "learning_rate": 4.656583196598264e-05, + "loss": 5.206, + "step": 28386 + }, + { + "epoch": 0.16882553049766866, + "grad_norm": 1.4601906538009644, + "learning_rate": 4.656559568984062e-05, + "loss": 5.2269, + "step": 28387 + }, + { + "epoch": 0.16883147778094967, + "grad_norm": 1.7208976745605469, + "learning_rate": 4.656535940617027e-05, + "loss": 5.3731, + "step": 28388 + }, + { + "epoch": 0.16883742506423066, + "grad_norm": 1.6507620811462402, + "learning_rate": 4.656512311497168e-05, + "loss": 5.544, + "step": 28389 + }, + { + "epoch": 0.16884337234751165, + "grad_norm": 1.7269225120544434, + "learning_rate": 4.6564886816244926e-05, + "loss": 5.5757, + "step": 28390 + }, + { + "epoch": 0.16884931963079267, + "grad_norm": 1.8436660766601562, + "learning_rate": 4.6564650509990096e-05, + "loss": 5.2549, + "step": 28391 + }, + { + "epoch": 0.16885526691407365, + "grad_norm": 2.2432281970977783, + "learning_rate": 4.656441419620727e-05, + "loss": 4.788, + "step": 28392 + }, + { + "epoch": 0.16886121419735464, + "grad_norm": 1.6931114196777344, + "learning_rate": 4.656417787489652e-05, + "loss": 4.9039, + "step": 28393 + }, + { + "epoch": 0.16886716148063566, + "grad_norm": 1.6208950281143188, + "learning_rate": 4.656394154605795e-05, + "loss": 5.2821, + "step": 28394 + }, + { + "epoch": 0.16887310876391665, + "grad_norm": 2.725078821182251, + "learning_rate": 4.656370520969162e-05, + "loss": 4.3892, + "step": 28395 + }, + { + "epoch": 0.16887905604719763, + "grad_norm": 3.6109495162963867, + "learning_rate": 4.6563468865797636e-05, + "loss": 4.1935, + "step": 28396 + }, + { + "epoch": 0.16888500333047865, + "grad_norm": 1.9827744960784912, + "learning_rate": 4.656323251437606e-05, + "loss": 5.1187, + "step": 28397 + }, + { + "epoch": 0.16889095061375964, + "grad_norm": 1.8615485429763794, + "learning_rate": 4.6562996155426985e-05, + "loss": 5.6777, + "step": 28398 + }, + { + "epoch": 0.16889689789704063, + "grad_norm": 1.7114287614822388, + "learning_rate": 4.6562759788950484e-05, + "loss": 5.5126, + "step": 28399 + }, + { + "epoch": 0.16890284518032164, + "grad_norm": 1.672108769416809, + "learning_rate": 4.656252341494666e-05, + "loss": 5.2453, + "step": 28400 + }, + { + "epoch": 0.16890879246360263, + "grad_norm": 1.7363505363464355, + "learning_rate": 4.656228703341556e-05, + "loss": 5.1452, + "step": 28401 + }, + { + "epoch": 0.16891473974688362, + "grad_norm": 1.6358929872512817, + "learning_rate": 4.656205064435731e-05, + "loss": 4.7812, + "step": 28402 + }, + { + "epoch": 0.16892068703016463, + "grad_norm": 1.5269345045089722, + "learning_rate": 4.656181424777196e-05, + "loss": 4.9725, + "step": 28403 + }, + { + "epoch": 0.16892663431344562, + "grad_norm": 1.8694361448287964, + "learning_rate": 4.656157784365961e-05, + "loss": 4.8145, + "step": 28404 + }, + { + "epoch": 0.1689325815967266, + "grad_norm": 1.6409978866577148, + "learning_rate": 4.6561341432020335e-05, + "loss": 4.8409, + "step": 28405 + }, + { + "epoch": 0.1689385288800076, + "grad_norm": 1.586323618888855, + "learning_rate": 4.656110501285421e-05, + "loss": 4.9883, + "step": 28406 + }, + { + "epoch": 0.16894447616328861, + "grad_norm": 1.936805009841919, + "learning_rate": 4.656086858616133e-05, + "loss": 4.8728, + "step": 28407 + }, + { + "epoch": 0.1689504234465696, + "grad_norm": 2.4873859882354736, + "learning_rate": 4.656063215194178e-05, + "loss": 4.3402, + "step": 28408 + }, + { + "epoch": 0.1689563707298506, + "grad_norm": 2.295729637145996, + "learning_rate": 4.6560395710195624e-05, + "loss": 4.2334, + "step": 28409 + }, + { + "epoch": 0.1689623180131316, + "grad_norm": 2.2564427852630615, + "learning_rate": 4.6560159260922966e-05, + "loss": 4.6056, + "step": 28410 + }, + { + "epoch": 0.1689682652964126, + "grad_norm": 1.5321199893951416, + "learning_rate": 4.655992280412388e-05, + "loss": 5.7092, + "step": 28411 + }, + { + "epoch": 0.16897421257969358, + "grad_norm": 1.4915989637374878, + "learning_rate": 4.655968633979844e-05, + "loss": 5.5028, + "step": 28412 + }, + { + "epoch": 0.1689801598629746, + "grad_norm": 1.6282528638839722, + "learning_rate": 4.655944986794675e-05, + "loss": 5.405, + "step": 28413 + }, + { + "epoch": 0.1689861071462556, + "grad_norm": 1.5174504518508911, + "learning_rate": 4.6559213388568865e-05, + "loss": 5.2818, + "step": 28414 + }, + { + "epoch": 0.16899205442953658, + "grad_norm": 1.6792948246002197, + "learning_rate": 4.6558976901664885e-05, + "loss": 5.4466, + "step": 28415 + }, + { + "epoch": 0.1689980017128176, + "grad_norm": 1.5633111000061035, + "learning_rate": 4.655874040723489e-05, + "loss": 5.3313, + "step": 28416 + }, + { + "epoch": 0.16900394899609858, + "grad_norm": 1.6550037860870361, + "learning_rate": 4.655850390527896e-05, + "loss": 5.3279, + "step": 28417 + }, + { + "epoch": 0.16900989627937957, + "grad_norm": 1.6670206785202026, + "learning_rate": 4.6558267395797186e-05, + "loss": 5.0354, + "step": 28418 + }, + { + "epoch": 0.16901584356266058, + "grad_norm": 1.577187180519104, + "learning_rate": 4.6558030878789635e-05, + "loss": 4.9382, + "step": 28419 + }, + { + "epoch": 0.16902179084594157, + "grad_norm": 1.5832712650299072, + "learning_rate": 4.65577943542564e-05, + "loss": 5.3036, + "step": 28420 + }, + { + "epoch": 0.16902773812922256, + "grad_norm": 1.4962387084960938, + "learning_rate": 4.655755782219756e-05, + "loss": 5.3586, + "step": 28421 + }, + { + "epoch": 0.16903368541250358, + "grad_norm": 1.2843531370162964, + "learning_rate": 4.655732128261321e-05, + "loss": 5.3972, + "step": 28422 + }, + { + "epoch": 0.16903963269578456, + "grad_norm": 1.1370457410812378, + "learning_rate": 4.6557084735503406e-05, + "loss": 5.2004, + "step": 28423 + }, + { + "epoch": 0.16904557997906555, + "grad_norm": 2.759056329727173, + "learning_rate": 4.655684818086825e-05, + "loss": 4.5741, + "step": 28424 + }, + { + "epoch": 0.16905152726234657, + "grad_norm": 2.7487027645111084, + "learning_rate": 4.655661161870783e-05, + "loss": 4.1308, + "step": 28425 + }, + { + "epoch": 0.16905747454562756, + "grad_norm": 2.479084014892578, + "learning_rate": 4.655637504902221e-05, + "loss": 4.2166, + "step": 28426 + }, + { + "epoch": 0.16906342182890854, + "grad_norm": 2.667968511581421, + "learning_rate": 4.65561384718115e-05, + "loss": 4.1276, + "step": 28427 + }, + { + "epoch": 0.16906936911218956, + "grad_norm": 2.6374669075012207, + "learning_rate": 4.655590188707575e-05, + "loss": 3.7747, + "step": 28428 + }, + { + "epoch": 0.16907531639547055, + "grad_norm": 2.0448408126831055, + "learning_rate": 4.655566529481505e-05, + "loss": 4.7242, + "step": 28429 + }, + { + "epoch": 0.16908126367875154, + "grad_norm": 2.416416645050049, + "learning_rate": 4.65554286950295e-05, + "loss": 4.3241, + "step": 28430 + }, + { + "epoch": 0.16908721096203255, + "grad_norm": 2.018310308456421, + "learning_rate": 4.6555192087719175e-05, + "loss": 4.2137, + "step": 28431 + }, + { + "epoch": 0.16909315824531354, + "grad_norm": 2.2149248123168945, + "learning_rate": 4.655495547288415e-05, + "loss": 4.2518, + "step": 28432 + }, + { + "epoch": 0.16909910552859453, + "grad_norm": 2.190190553665161, + "learning_rate": 4.655471885052452e-05, + "loss": 4.0488, + "step": 28433 + }, + { + "epoch": 0.16910505281187554, + "grad_norm": 2.146759033203125, + "learning_rate": 4.6554482220640347e-05, + "loss": 4.005, + "step": 28434 + }, + { + "epoch": 0.16911100009515653, + "grad_norm": 1.7445921897888184, + "learning_rate": 4.655424558323174e-05, + "loss": 4.5846, + "step": 28435 + }, + { + "epoch": 0.16911694737843752, + "grad_norm": 1.924498200416565, + "learning_rate": 4.655400893829876e-05, + "loss": 4.4729, + "step": 28436 + }, + { + "epoch": 0.16912289466171854, + "grad_norm": 2.297170877456665, + "learning_rate": 4.65537722858415e-05, + "loss": 4.0639, + "step": 28437 + }, + { + "epoch": 0.16912884194499953, + "grad_norm": 2.254561424255371, + "learning_rate": 4.6553535625860044e-05, + "loss": 3.6444, + "step": 28438 + }, + { + "epoch": 0.1691347892282805, + "grad_norm": 2.3372230529785156, + "learning_rate": 4.655329895835447e-05, + "loss": 3.9905, + "step": 28439 + }, + { + "epoch": 0.16914073651156153, + "grad_norm": 2.376207113265991, + "learning_rate": 4.655306228332486e-05, + "loss": 3.9777, + "step": 28440 + }, + { + "epoch": 0.16914668379484252, + "grad_norm": 1.6520785093307495, + "learning_rate": 4.65528256007713e-05, + "loss": 4.9314, + "step": 28441 + }, + { + "epoch": 0.1691526310781235, + "grad_norm": 1.93073308467865, + "learning_rate": 4.6552588910693876e-05, + "loss": 5.1317, + "step": 28442 + }, + { + "epoch": 0.16915857836140452, + "grad_norm": 1.5278276205062866, + "learning_rate": 4.655235221309266e-05, + "loss": 5.2949, + "step": 28443 + }, + { + "epoch": 0.1691645256446855, + "grad_norm": 1.5671179294586182, + "learning_rate": 4.6552115507967744e-05, + "loss": 4.8824, + "step": 28444 + }, + { + "epoch": 0.1691704729279665, + "grad_norm": 1.6631091833114624, + "learning_rate": 4.6551878795319204e-05, + "loss": 4.6696, + "step": 28445 + }, + { + "epoch": 0.1691764202112475, + "grad_norm": 1.9113469123840332, + "learning_rate": 4.655164207514713e-05, + "loss": 4.2842, + "step": 28446 + }, + { + "epoch": 0.1691823674945285, + "grad_norm": 1.8953512907028198, + "learning_rate": 4.655140534745159e-05, + "loss": 5.3818, + "step": 28447 + }, + { + "epoch": 0.1691883147778095, + "grad_norm": 1.7372487783432007, + "learning_rate": 4.6551168612232685e-05, + "loss": 5.2441, + "step": 28448 + }, + { + "epoch": 0.1691942620610905, + "grad_norm": 1.8049054145812988, + "learning_rate": 4.655093186949049e-05, + "loss": 5.2056, + "step": 28449 + }, + { + "epoch": 0.1692002093443715, + "grad_norm": 2.019453763961792, + "learning_rate": 4.6550695119225086e-05, + "loss": 5.4237, + "step": 28450 + }, + { + "epoch": 0.16920615662765248, + "grad_norm": 1.3187928199768066, + "learning_rate": 4.6550458361436554e-05, + "loss": 5.2069, + "step": 28451 + }, + { + "epoch": 0.1692121039109335, + "grad_norm": 2.054603099822998, + "learning_rate": 4.655022159612499e-05, + "loss": 4.4155, + "step": 28452 + }, + { + "epoch": 0.16921805119421449, + "grad_norm": 2.41377854347229, + "learning_rate": 4.6549984823290454e-05, + "loss": 3.613, + "step": 28453 + }, + { + "epoch": 0.16922399847749547, + "grad_norm": 1.9458948373794556, + "learning_rate": 4.654974804293305e-05, + "loss": 3.6051, + "step": 28454 + }, + { + "epoch": 0.1692299457607765, + "grad_norm": 1.7371017932891846, + "learning_rate": 4.6549511255052844e-05, + "loss": 5.1229, + "step": 28455 + }, + { + "epoch": 0.16923589304405748, + "grad_norm": 1.3374329805374146, + "learning_rate": 4.654927445964993e-05, + "loss": 5.7105, + "step": 28456 + }, + { + "epoch": 0.16924184032733847, + "grad_norm": 1.453912377357483, + "learning_rate": 4.654903765672439e-05, + "loss": 5.7225, + "step": 28457 + }, + { + "epoch": 0.16924778761061948, + "grad_norm": 1.984152913093567, + "learning_rate": 4.65488008462763e-05, + "loss": 4.874, + "step": 28458 + }, + { + "epoch": 0.16925373489390047, + "grad_norm": 1.618017554283142, + "learning_rate": 4.6548564028305746e-05, + "loss": 4.6159, + "step": 28459 + }, + { + "epoch": 0.16925968217718146, + "grad_norm": 2.104875087738037, + "learning_rate": 4.654832720281281e-05, + "loss": 3.9827, + "step": 28460 + }, + { + "epoch": 0.16926562946046247, + "grad_norm": 1.9092068672180176, + "learning_rate": 4.654809036979758e-05, + "loss": 3.8551, + "step": 28461 + }, + { + "epoch": 0.16927157674374346, + "grad_norm": 1.6868946552276611, + "learning_rate": 4.6547853529260135e-05, + "loss": 5.6583, + "step": 28462 + }, + { + "epoch": 0.16927752402702445, + "grad_norm": 2.0791547298431396, + "learning_rate": 4.6547616681200544e-05, + "loss": 4.7682, + "step": 28463 + }, + { + "epoch": 0.16928347131030544, + "grad_norm": 2.254826307296753, + "learning_rate": 4.654737982561892e-05, + "loss": 3.7339, + "step": 28464 + }, + { + "epoch": 0.16928941859358645, + "grad_norm": 1.6225947141647339, + "learning_rate": 4.6547142962515314e-05, + "loss": 4.8278, + "step": 28465 + }, + { + "epoch": 0.16929536587686744, + "grad_norm": 1.8425785303115845, + "learning_rate": 4.654690609188983e-05, + "loss": 4.0161, + "step": 28466 + }, + { + "epoch": 0.16930131316014843, + "grad_norm": 1.9367843866348267, + "learning_rate": 4.6546669213742545e-05, + "loss": 3.794, + "step": 28467 + }, + { + "epoch": 0.16930726044342945, + "grad_norm": 1.988096833229065, + "learning_rate": 4.654643232807354e-05, + "loss": 3.7874, + "step": 28468 + }, + { + "epoch": 0.16931320772671044, + "grad_norm": 1.84897780418396, + "learning_rate": 4.6546195434882895e-05, + "loss": 3.8368, + "step": 28469 + }, + { + "epoch": 0.16931915500999142, + "grad_norm": 1.7867851257324219, + "learning_rate": 4.65459585341707e-05, + "loss": 3.7485, + "step": 28470 + }, + { + "epoch": 0.16932510229327244, + "grad_norm": 1.8112739324569702, + "learning_rate": 4.654572162593703e-05, + "loss": 3.7541, + "step": 28471 + }, + { + "epoch": 0.16933104957655343, + "grad_norm": 1.7835328578948975, + "learning_rate": 4.6545484710181974e-05, + "loss": 3.8461, + "step": 28472 + }, + { + "epoch": 0.16933699685983442, + "grad_norm": 1.7823615074157715, + "learning_rate": 4.6545247786905614e-05, + "loss": 3.7878, + "step": 28473 + }, + { + "epoch": 0.16934294414311543, + "grad_norm": 1.8897929191589355, + "learning_rate": 4.654501085610802e-05, + "loss": 3.8613, + "step": 28474 + }, + { + "epoch": 0.16934889142639642, + "grad_norm": 1.9433989524841309, + "learning_rate": 4.654477391778929e-05, + "loss": 3.7189, + "step": 28475 + }, + { + "epoch": 0.1693548387096774, + "grad_norm": 1.688061237335205, + "learning_rate": 4.6544536971949504e-05, + "loss": 4.1471, + "step": 28476 + }, + { + "epoch": 0.16936078599295842, + "grad_norm": 1.9753577709197998, + "learning_rate": 4.654430001858874e-05, + "loss": 4.1729, + "step": 28477 + }, + { + "epoch": 0.1693667332762394, + "grad_norm": 1.6471655368804932, + "learning_rate": 4.654406305770709e-05, + "loss": 5.4232, + "step": 28478 + }, + { + "epoch": 0.1693726805595204, + "grad_norm": 1.5919240713119507, + "learning_rate": 4.6543826089304626e-05, + "loss": 5.6299, + "step": 28479 + }, + { + "epoch": 0.16937862784280142, + "grad_norm": 1.505886435508728, + "learning_rate": 4.6543589113381434e-05, + "loss": 5.472, + "step": 28480 + }, + { + "epoch": 0.1693845751260824, + "grad_norm": 1.3407920598983765, + "learning_rate": 4.65433521299376e-05, + "loss": 5.4519, + "step": 28481 + }, + { + "epoch": 0.1693905224093634, + "grad_norm": 1.785452127456665, + "learning_rate": 4.65431151389732e-05, + "loss": 5.0539, + "step": 28482 + }, + { + "epoch": 0.1693964696926444, + "grad_norm": 1.6076501607894897, + "learning_rate": 4.654287814048833e-05, + "loss": 5.5523, + "step": 28483 + }, + { + "epoch": 0.1694024169759254, + "grad_norm": 1.7751826047897339, + "learning_rate": 4.654264113448306e-05, + "loss": 5.3904, + "step": 28484 + }, + { + "epoch": 0.16940836425920638, + "grad_norm": 2.516270160675049, + "learning_rate": 4.6542404120957465e-05, + "loss": 3.6737, + "step": 28485 + }, + { + "epoch": 0.1694143115424874, + "grad_norm": 2.094210386276245, + "learning_rate": 4.654216709991165e-05, + "loss": 3.3822, + "step": 28486 + }, + { + "epoch": 0.1694202588257684, + "grad_norm": 1.9401110410690308, + "learning_rate": 4.6541930071345685e-05, + "loss": 3.3866, + "step": 28487 + }, + { + "epoch": 0.16942620610904938, + "grad_norm": 1.6965755224227905, + "learning_rate": 4.654169303525966e-05, + "loss": 4.8492, + "step": 28488 + }, + { + "epoch": 0.1694321533923304, + "grad_norm": 2.676941156387329, + "learning_rate": 4.654145599165365e-05, + "loss": 4.4578, + "step": 28489 + }, + { + "epoch": 0.16943810067561138, + "grad_norm": 2.53593111038208, + "learning_rate": 4.654121894052773e-05, + "loss": 3.9574, + "step": 28490 + }, + { + "epoch": 0.16944404795889237, + "grad_norm": 2.355025053024292, + "learning_rate": 4.6540981881882006e-05, + "loss": 4.0911, + "step": 28491 + }, + { + "epoch": 0.16944999524217338, + "grad_norm": 2.2941341400146484, + "learning_rate": 4.654074481571654e-05, + "loss": 4.3186, + "step": 28492 + }, + { + "epoch": 0.16945594252545437, + "grad_norm": 2.2436282634735107, + "learning_rate": 4.654050774203143e-05, + "loss": 4.0785, + "step": 28493 + }, + { + "epoch": 0.16946188980873536, + "grad_norm": 2.8532540798187256, + "learning_rate": 4.6540270660826744e-05, + "loss": 3.2517, + "step": 28494 + }, + { + "epoch": 0.16946783709201638, + "grad_norm": 2.7810893058776855, + "learning_rate": 4.6540033572102575e-05, + "loss": 3.462, + "step": 28495 + }, + { + "epoch": 0.16947378437529736, + "grad_norm": 2.5841453075408936, + "learning_rate": 4.6539796475859004e-05, + "loss": 4.4611, + "step": 28496 + }, + { + "epoch": 0.16947973165857835, + "grad_norm": 2.433039903640747, + "learning_rate": 4.653955937209611e-05, + "loss": 3.7666, + "step": 28497 + }, + { + "epoch": 0.16948567894185937, + "grad_norm": 1.7830419540405273, + "learning_rate": 4.6539322260813984e-05, + "loss": 4.9613, + "step": 28498 + }, + { + "epoch": 0.16949162622514036, + "grad_norm": 1.8452028036117554, + "learning_rate": 4.653908514201269e-05, + "loss": 5.0721, + "step": 28499 + }, + { + "epoch": 0.16949757350842135, + "grad_norm": 1.9641203880310059, + "learning_rate": 4.6538848015692336e-05, + "loss": 4.2726, + "step": 28500 + }, + { + "epoch": 0.16950352079170236, + "grad_norm": 2.1620960235595703, + "learning_rate": 4.6538610881853e-05, + "loss": 3.9638, + "step": 28501 + }, + { + "epoch": 0.16950946807498335, + "grad_norm": 1.977523922920227, + "learning_rate": 4.6538373740494737e-05, + "loss": 4.0448, + "step": 28502 + }, + { + "epoch": 0.16951541535826434, + "grad_norm": 1.7069354057312012, + "learning_rate": 4.653813659161766e-05, + "loss": 4.053, + "step": 28503 + }, + { + "epoch": 0.16952136264154535, + "grad_norm": 1.8894158601760864, + "learning_rate": 4.653789943522184e-05, + "loss": 4.1357, + "step": 28504 + }, + { + "epoch": 0.16952730992482634, + "grad_norm": 1.8103679418563843, + "learning_rate": 4.6537662271307366e-05, + "loss": 3.8426, + "step": 28505 + }, + { + "epoch": 0.16953325720810733, + "grad_norm": 1.6966679096221924, + "learning_rate": 4.653742509987431e-05, + "loss": 3.9686, + "step": 28506 + }, + { + "epoch": 0.16953920449138835, + "grad_norm": 1.8758342266082764, + "learning_rate": 4.653718792092278e-05, + "loss": 3.7168, + "step": 28507 + }, + { + "epoch": 0.16954515177466933, + "grad_norm": 1.738481879234314, + "learning_rate": 4.6536950734452824e-05, + "loss": 4.0376, + "step": 28508 + }, + { + "epoch": 0.16955109905795032, + "grad_norm": 1.8814899921417236, + "learning_rate": 4.653671354046454e-05, + "loss": 3.7981, + "step": 28509 + }, + { + "epoch": 0.16955704634123134, + "grad_norm": 1.7275527715682983, + "learning_rate": 4.653647633895801e-05, + "loss": 3.7576, + "step": 28510 + }, + { + "epoch": 0.16956299362451233, + "grad_norm": 1.5637880563735962, + "learning_rate": 4.6536239129933326e-05, + "loss": 5.5343, + "step": 28511 + }, + { + "epoch": 0.16956894090779331, + "grad_norm": 1.6974562406539917, + "learning_rate": 4.653600191339056e-05, + "loss": 5.9386, + "step": 28512 + }, + { + "epoch": 0.16957488819107433, + "grad_norm": 2.0787951946258545, + "learning_rate": 4.65357646893298e-05, + "loss": 5.6018, + "step": 28513 + }, + { + "epoch": 0.16958083547435532, + "grad_norm": 2.0893337726593018, + "learning_rate": 4.653552745775113e-05, + "loss": 5.5357, + "step": 28514 + }, + { + "epoch": 0.1695867827576363, + "grad_norm": 2.1055009365081787, + "learning_rate": 4.6535290218654624e-05, + "loss": 5.6448, + "step": 28515 + }, + { + "epoch": 0.16959273004091732, + "grad_norm": 2.247347116470337, + "learning_rate": 4.653505297204037e-05, + "loss": 4.0233, + "step": 28516 + }, + { + "epoch": 0.1695986773241983, + "grad_norm": 1.5102436542510986, + "learning_rate": 4.653481571790846e-05, + "loss": 5.1274, + "step": 28517 + }, + { + "epoch": 0.1696046246074793, + "grad_norm": 1.5515743494033813, + "learning_rate": 4.653457845625896e-05, + "loss": 6.1905, + "step": 28518 + }, + { + "epoch": 0.16961057189076031, + "grad_norm": 1.5858293771743774, + "learning_rate": 4.6534341187091965e-05, + "loss": 5.2316, + "step": 28519 + }, + { + "epoch": 0.1696165191740413, + "grad_norm": 3.305469274520874, + "learning_rate": 4.653410391040755e-05, + "loss": 4.022, + "step": 28520 + }, + { + "epoch": 0.1696224664573223, + "grad_norm": 1.6751025915145874, + "learning_rate": 4.6533866626205805e-05, + "loss": 5.2442, + "step": 28521 + }, + { + "epoch": 0.16962841374060328, + "grad_norm": 1.777486801147461, + "learning_rate": 4.653362933448681e-05, + "loss": 5.0407, + "step": 28522 + }, + { + "epoch": 0.1696343610238843, + "grad_norm": 1.5896446704864502, + "learning_rate": 4.653339203525065e-05, + "loss": 4.807, + "step": 28523 + }, + { + "epoch": 0.16964030830716528, + "grad_norm": 1.9087060689926147, + "learning_rate": 4.65331547284974e-05, + "loss": 5.0863, + "step": 28524 + }, + { + "epoch": 0.16964625559044627, + "grad_norm": 1.7064319849014282, + "learning_rate": 4.653291741422715e-05, + "loss": 5.2761, + "step": 28525 + }, + { + "epoch": 0.1696522028737273, + "grad_norm": 1.5838422775268555, + "learning_rate": 4.6532680092439986e-05, + "loss": 5.316, + "step": 28526 + }, + { + "epoch": 0.16965815015700828, + "grad_norm": 1.702512264251709, + "learning_rate": 4.653244276313598e-05, + "loss": 5.2548, + "step": 28527 + }, + { + "epoch": 0.16966409744028926, + "grad_norm": 1.4088670015335083, + "learning_rate": 4.6532205426315215e-05, + "loss": 5.1767, + "step": 28528 + }, + { + "epoch": 0.16967004472357028, + "grad_norm": 2.7728757858276367, + "learning_rate": 4.653196808197779e-05, + "loss": 4.5771, + "step": 28529 + }, + { + "epoch": 0.16967599200685127, + "grad_norm": 2.977949857711792, + "learning_rate": 4.653173073012377e-05, + "loss": 4.2778, + "step": 28530 + }, + { + "epoch": 0.16968193929013226, + "grad_norm": 2.986652374267578, + "learning_rate": 4.6531493370753254e-05, + "loss": 4.1076, + "step": 28531 + }, + { + "epoch": 0.16968788657341327, + "grad_norm": 2.596334934234619, + "learning_rate": 4.6531256003866305e-05, + "loss": 3.6769, + "step": 28532 + }, + { + "epoch": 0.16969383385669426, + "grad_norm": 2.381591796875, + "learning_rate": 4.653101862946303e-05, + "loss": 3.9261, + "step": 28533 + }, + { + "epoch": 0.16969978113997525, + "grad_norm": 2.287313938140869, + "learning_rate": 4.653078124754349e-05, + "loss": 4.4583, + "step": 28534 + }, + { + "epoch": 0.16970572842325626, + "grad_norm": 1.716257929801941, + "learning_rate": 4.6530543858107776e-05, + "loss": 5.1735, + "step": 28535 + }, + { + "epoch": 0.16971167570653725, + "grad_norm": 1.5777500867843628, + "learning_rate": 4.6530306461155976e-05, + "loss": 4.958, + "step": 28536 + }, + { + "epoch": 0.16971762298981824, + "grad_norm": 1.6747970581054688, + "learning_rate": 4.653006905668817e-05, + "loss": 4.6559, + "step": 28537 + }, + { + "epoch": 0.16972357027309926, + "grad_norm": 1.8283017873764038, + "learning_rate": 4.652983164470444e-05, + "loss": 4.4711, + "step": 28538 + }, + { + "epoch": 0.16972951755638024, + "grad_norm": 2.753277063369751, + "learning_rate": 4.652959422520485e-05, + "loss": 3.9467, + "step": 28539 + }, + { + "epoch": 0.16973546483966123, + "grad_norm": 1.993268370628357, + "learning_rate": 4.652935679818952e-05, + "loss": 4.8315, + "step": 28540 + }, + { + "epoch": 0.16974141212294225, + "grad_norm": 1.7056300640106201, + "learning_rate": 4.652911936365851e-05, + "loss": 5.6509, + "step": 28541 + }, + { + "epoch": 0.16974735940622324, + "grad_norm": 1.6653499603271484, + "learning_rate": 4.6528881921611904e-05, + "loss": 5.5002, + "step": 28542 + }, + { + "epoch": 0.16975330668950422, + "grad_norm": 1.5368744134902954, + "learning_rate": 4.6528644472049795e-05, + "loss": 5.0847, + "step": 28543 + }, + { + "epoch": 0.16975925397278524, + "grad_norm": 1.597609043121338, + "learning_rate": 4.6528407014972255e-05, + "loss": 5.4779, + "step": 28544 + }, + { + "epoch": 0.16976520125606623, + "grad_norm": 1.5362802743911743, + "learning_rate": 4.6528169550379364e-05, + "loss": 4.931, + "step": 28545 + }, + { + "epoch": 0.16977114853934722, + "grad_norm": 1.4700133800506592, + "learning_rate": 4.652793207827122e-05, + "loss": 5.6209, + "step": 28546 + }, + { + "epoch": 0.16977709582262823, + "grad_norm": 2.0117483139038086, + "learning_rate": 4.652769459864788e-05, + "loss": 4.7425, + "step": 28547 + }, + { + "epoch": 0.16978304310590922, + "grad_norm": 1.4520665407180786, + "learning_rate": 4.652745711150946e-05, + "loss": 5.135, + "step": 28548 + }, + { + "epoch": 0.1697889903891902, + "grad_norm": 1.5992931127548218, + "learning_rate": 4.6527219616856036e-05, + "loss": 5.2732, + "step": 28549 + }, + { + "epoch": 0.16979493767247122, + "grad_norm": 1.689389944076538, + "learning_rate": 4.6526982114687666e-05, + "loss": 5.1537, + "step": 28550 + }, + { + "epoch": 0.1698008849557522, + "grad_norm": 1.5059309005737305, + "learning_rate": 4.652674460500446e-05, + "loss": 4.9021, + "step": 28551 + }, + { + "epoch": 0.1698068322390332, + "grad_norm": 2.6482186317443848, + "learning_rate": 4.652650708780648e-05, + "loss": 4.9221, + "step": 28552 + }, + { + "epoch": 0.16981277952231422, + "grad_norm": 1.7961699962615967, + "learning_rate": 4.652626956309382e-05, + "loss": 5.3804, + "step": 28553 + }, + { + "epoch": 0.1698187268055952, + "grad_norm": 1.704698085784912, + "learning_rate": 4.652603203086656e-05, + "loss": 5.775, + "step": 28554 + }, + { + "epoch": 0.1698246740888762, + "grad_norm": 1.7374398708343506, + "learning_rate": 4.65257944911248e-05, + "loss": 5.6455, + "step": 28555 + }, + { + "epoch": 0.1698306213721572, + "grad_norm": 1.5410466194152832, + "learning_rate": 4.652555694386859e-05, + "loss": 5.7316, + "step": 28556 + }, + { + "epoch": 0.1698365686554382, + "grad_norm": 1.5294291973114014, + "learning_rate": 4.652531938909804e-05, + "loss": 5.0427, + "step": 28557 + }, + { + "epoch": 0.16984251593871919, + "grad_norm": 2.2420549392700195, + "learning_rate": 4.652508182681322e-05, + "loss": 3.8954, + "step": 28558 + }, + { + "epoch": 0.1698484632220002, + "grad_norm": 1.640631079673767, + "learning_rate": 4.652484425701422e-05, + "loss": 5.2021, + "step": 28559 + }, + { + "epoch": 0.1698544105052812, + "grad_norm": 1.3961762189865112, + "learning_rate": 4.652460667970111e-05, + "loss": 4.6562, + "step": 28560 + }, + { + "epoch": 0.16986035778856218, + "grad_norm": 1.408497929573059, + "learning_rate": 4.6524369094873985e-05, + "loss": 5.2449, + "step": 28561 + }, + { + "epoch": 0.1698663050718432, + "grad_norm": 1.544072151184082, + "learning_rate": 4.6524131502532934e-05, + "loss": 5.1623, + "step": 28562 + }, + { + "epoch": 0.16987225235512418, + "grad_norm": 1.4092038869857788, + "learning_rate": 4.652389390267802e-05, + "loss": 5.1672, + "step": 28563 + }, + { + "epoch": 0.16987819963840517, + "grad_norm": 1.533828616142273, + "learning_rate": 4.6523656295309346e-05, + "loss": 5.1873, + "step": 28564 + }, + { + "epoch": 0.16988414692168619, + "grad_norm": 1.690058946609497, + "learning_rate": 4.6523418680426986e-05, + "loss": 5.1518, + "step": 28565 + }, + { + "epoch": 0.16989009420496717, + "grad_norm": 1.192253828048706, + "learning_rate": 4.652318105803102e-05, + "loss": 5.1708, + "step": 28566 + }, + { + "epoch": 0.16989604148824816, + "grad_norm": 1.6222058534622192, + "learning_rate": 4.6522943428121526e-05, + "loss": 5.2261, + "step": 28567 + }, + { + "epoch": 0.16990198877152918, + "grad_norm": 1.9990545511245728, + "learning_rate": 4.65227057906986e-05, + "loss": 5.1013, + "step": 28568 + }, + { + "epoch": 0.16990793605481017, + "grad_norm": 1.929602861404419, + "learning_rate": 4.652246814576233e-05, + "loss": 4.8618, + "step": 28569 + }, + { + "epoch": 0.16991388333809115, + "grad_norm": 1.3916577100753784, + "learning_rate": 4.6522230493312777e-05, + "loss": 4.929, + "step": 28570 + }, + { + "epoch": 0.16991983062137217, + "grad_norm": 1.7045917510986328, + "learning_rate": 4.6521992833350036e-05, + "loss": 4.925, + "step": 28571 + }, + { + "epoch": 0.16992577790465316, + "grad_norm": 1.68044114112854, + "learning_rate": 4.6521755165874194e-05, + "loss": 5.3032, + "step": 28572 + }, + { + "epoch": 0.16993172518793415, + "grad_norm": 1.747460126876831, + "learning_rate": 4.652151749088533e-05, + "loss": 5.1043, + "step": 28573 + }, + { + "epoch": 0.16993767247121516, + "grad_norm": 1.7225557565689087, + "learning_rate": 4.6521279808383526e-05, + "loss": 4.7359, + "step": 28574 + }, + { + "epoch": 0.16994361975449615, + "grad_norm": 1.9875255823135376, + "learning_rate": 4.652104211836886e-05, + "loss": 3.912, + "step": 28575 + }, + { + "epoch": 0.16994956703777714, + "grad_norm": 1.898094654083252, + "learning_rate": 4.652080442084142e-05, + "loss": 4.012, + "step": 28576 + }, + { + "epoch": 0.16995551432105815, + "grad_norm": 1.8791594505310059, + "learning_rate": 4.65205667158013e-05, + "loss": 3.8007, + "step": 28577 + }, + { + "epoch": 0.16996146160433914, + "grad_norm": 1.85286545753479, + "learning_rate": 4.652032900324857e-05, + "loss": 3.8686, + "step": 28578 + }, + { + "epoch": 0.16996740888762013, + "grad_norm": 1.8084555864334106, + "learning_rate": 4.652009128318331e-05, + "loss": 3.8287, + "step": 28579 + }, + { + "epoch": 0.16997335617090112, + "grad_norm": 1.8365230560302734, + "learning_rate": 4.651985355560562e-05, + "loss": 3.8072, + "step": 28580 + }, + { + "epoch": 0.16997930345418213, + "grad_norm": 1.8318002223968506, + "learning_rate": 4.651961582051555e-05, + "loss": 3.5751, + "step": 28581 + }, + { + "epoch": 0.16998525073746312, + "grad_norm": 2.9217238426208496, + "learning_rate": 4.651937807791322e-05, + "loss": 4.3074, + "step": 28582 + }, + { + "epoch": 0.1699911980207441, + "grad_norm": 1.8495897054672241, + "learning_rate": 4.651914032779869e-05, + "loss": 3.5268, + "step": 28583 + }, + { + "epoch": 0.16999714530402513, + "grad_norm": 1.7885898351669312, + "learning_rate": 4.651890257017206e-05, + "loss": 3.2383, + "step": 28584 + }, + { + "epoch": 0.17000309258730611, + "grad_norm": 1.9159060716629028, + "learning_rate": 4.6518664805033395e-05, + "loss": 3.7259, + "step": 28585 + }, + { + "epoch": 0.1700090398705871, + "grad_norm": 1.733549952507019, + "learning_rate": 4.6518427032382793e-05, + "loss": 5.1259, + "step": 28586 + }, + { + "epoch": 0.17001498715386812, + "grad_norm": 2.508037805557251, + "learning_rate": 4.651818925222033e-05, + "loss": 3.8367, + "step": 28587 + }, + { + "epoch": 0.1700209344371491, + "grad_norm": 2.5397400856018066, + "learning_rate": 4.651795146454608e-05, + "loss": 3.4588, + "step": 28588 + }, + { + "epoch": 0.1700268817204301, + "grad_norm": 2.3859269618988037, + "learning_rate": 4.651771366936015e-05, + "loss": 3.3977, + "step": 28589 + }, + { + "epoch": 0.1700328290037111, + "grad_norm": 1.8520206212997437, + "learning_rate": 4.65174758666626e-05, + "loss": 4.0797, + "step": 28590 + }, + { + "epoch": 0.1700387762869921, + "grad_norm": 2.0465288162231445, + "learning_rate": 4.651723805645352e-05, + "loss": 3.2528, + "step": 28591 + }, + { + "epoch": 0.1700447235702731, + "grad_norm": 2.100496530532837, + "learning_rate": 4.651700023873299e-05, + "loss": 2.9472, + "step": 28592 + }, + { + "epoch": 0.1700506708535541, + "grad_norm": 2.4353413581848145, + "learning_rate": 4.6516762413501106e-05, + "loss": 3.161, + "step": 28593 + }, + { + "epoch": 0.1700566181368351, + "grad_norm": 2.609565019607544, + "learning_rate": 4.651652458075794e-05, + "loss": 3.5234, + "step": 28594 + }, + { + "epoch": 0.17006256542011608, + "grad_norm": 2.2567410469055176, + "learning_rate": 4.651628674050358e-05, + "loss": 3.5863, + "step": 28595 + }, + { + "epoch": 0.1700685127033971, + "grad_norm": 2.6345736980438232, + "learning_rate": 4.6516048892738104e-05, + "loss": 3.5194, + "step": 28596 + }, + { + "epoch": 0.17007445998667808, + "grad_norm": 1.9039238691329956, + "learning_rate": 4.65158110374616e-05, + "loss": 4.0329, + "step": 28597 + }, + { + "epoch": 0.17008040726995907, + "grad_norm": 1.6507738828659058, + "learning_rate": 4.6515573174674143e-05, + "loss": 4.9022, + "step": 28598 + }, + { + "epoch": 0.1700863545532401, + "grad_norm": 1.6945186853408813, + "learning_rate": 4.651533530437583e-05, + "loss": 4.9487, + "step": 28599 + }, + { + "epoch": 0.17009230183652108, + "grad_norm": 1.8337676525115967, + "learning_rate": 4.651509742656673e-05, + "loss": 5.1238, + "step": 28600 + }, + { + "epoch": 0.17009824911980206, + "grad_norm": 1.4968239068984985, + "learning_rate": 4.651485954124694e-05, + "loss": 4.782, + "step": 28601 + }, + { + "epoch": 0.17010419640308308, + "grad_norm": 1.8200058937072754, + "learning_rate": 4.651462164841652e-05, + "loss": 5.3675, + "step": 28602 + }, + { + "epoch": 0.17011014368636407, + "grad_norm": 1.788134217262268, + "learning_rate": 4.6514383748075575e-05, + "loss": 4.6486, + "step": 28603 + }, + { + "epoch": 0.17011609096964506, + "grad_norm": 1.6064730882644653, + "learning_rate": 4.6514145840224184e-05, + "loss": 4.4153, + "step": 28604 + }, + { + "epoch": 0.17012203825292607, + "grad_norm": 1.4705356359481812, + "learning_rate": 4.651390792486242e-05, + "loss": 4.7254, + "step": 28605 + }, + { + "epoch": 0.17012798553620706, + "grad_norm": 1.5670931339263916, + "learning_rate": 4.6513670001990385e-05, + "loss": 5.0288, + "step": 28606 + }, + { + "epoch": 0.17013393281948805, + "grad_norm": 1.9141185283660889, + "learning_rate": 4.651343207160814e-05, + "loss": 5.0111, + "step": 28607 + }, + { + "epoch": 0.17013988010276906, + "grad_norm": 1.485753059387207, + "learning_rate": 4.6513194133715776e-05, + "loss": 5.0013, + "step": 28608 + }, + { + "epoch": 0.17014582738605005, + "grad_norm": 1.6797868013381958, + "learning_rate": 4.651295618831338e-05, + "loss": 5.0576, + "step": 28609 + }, + { + "epoch": 0.17015177466933104, + "grad_norm": 2.6057140827178955, + "learning_rate": 4.651271823540104e-05, + "loss": 3.9116, + "step": 28610 + }, + { + "epoch": 0.17015772195261206, + "grad_norm": 2.83886456489563, + "learning_rate": 4.651248027497883e-05, + "loss": 4.3674, + "step": 28611 + }, + { + "epoch": 0.17016366923589304, + "grad_norm": 2.470137596130371, + "learning_rate": 4.6512242307046834e-05, + "loss": 4.5506, + "step": 28612 + }, + { + "epoch": 0.17016961651917403, + "grad_norm": 2.0518956184387207, + "learning_rate": 4.6512004331605134e-05, + "loss": 4.9991, + "step": 28613 + }, + { + "epoch": 0.17017556380245505, + "grad_norm": 2.012444257736206, + "learning_rate": 4.6511766348653816e-05, + "loss": 4.6678, + "step": 28614 + }, + { + "epoch": 0.17018151108573604, + "grad_norm": 2.152315616607666, + "learning_rate": 4.651152835819297e-05, + "loss": 3.7695, + "step": 28615 + }, + { + "epoch": 0.17018745836901703, + "grad_norm": 2.255277156829834, + "learning_rate": 4.6511290360222664e-05, + "loss": 3.861, + "step": 28616 + }, + { + "epoch": 0.17019340565229804, + "grad_norm": 2.317800998687744, + "learning_rate": 4.651105235474299e-05, + "loss": 3.813, + "step": 28617 + }, + { + "epoch": 0.17019935293557903, + "grad_norm": 2.330914258956909, + "learning_rate": 4.651081434175403e-05, + "loss": 3.6723, + "step": 28618 + }, + { + "epoch": 0.17020530021886002, + "grad_norm": 2.112302541732788, + "learning_rate": 4.651057632125587e-05, + "loss": 3.6212, + "step": 28619 + }, + { + "epoch": 0.17021124750214103, + "grad_norm": 1.9216437339782715, + "learning_rate": 4.651033829324859e-05, + "loss": 4.3208, + "step": 28620 + }, + { + "epoch": 0.17021719478542202, + "grad_norm": 1.9902441501617432, + "learning_rate": 4.651010025773227e-05, + "loss": 4.7577, + "step": 28621 + }, + { + "epoch": 0.170223142068703, + "grad_norm": 1.7886050939559937, + "learning_rate": 4.6509862214707e-05, + "loss": 4.494, + "step": 28622 + }, + { + "epoch": 0.17022908935198403, + "grad_norm": 1.8544505834579468, + "learning_rate": 4.650962416417285e-05, + "loss": 5.4149, + "step": 28623 + }, + { + "epoch": 0.170235036635265, + "grad_norm": 1.682219386100769, + "learning_rate": 4.650938610612992e-05, + "loss": 5.434, + "step": 28624 + }, + { + "epoch": 0.170240983918546, + "grad_norm": 2.096231698989868, + "learning_rate": 4.650914804057829e-05, + "loss": 4.3005, + "step": 28625 + }, + { + "epoch": 0.17024693120182702, + "grad_norm": 2.311213970184326, + "learning_rate": 4.650890996751803e-05, + "loss": 3.7311, + "step": 28626 + }, + { + "epoch": 0.170252878485108, + "grad_norm": 1.9578297138214111, + "learning_rate": 4.650867188694924e-05, + "loss": 4.6696, + "step": 28627 + }, + { + "epoch": 0.170258825768389, + "grad_norm": 2.9123547077178955, + "learning_rate": 4.650843379887199e-05, + "loss": 3.8884, + "step": 28628 + }, + { + "epoch": 0.17026477305167, + "grad_norm": 2.6703314781188965, + "learning_rate": 4.650819570328636e-05, + "loss": 3.9453, + "step": 28629 + }, + { + "epoch": 0.170270720334951, + "grad_norm": 1.7576513290405273, + "learning_rate": 4.6507957600192454e-05, + "loss": 4.8754, + "step": 28630 + }, + { + "epoch": 0.17027666761823199, + "grad_norm": 1.6122910976409912, + "learning_rate": 4.650771948959033e-05, + "loss": 5.0507, + "step": 28631 + }, + { + "epoch": 0.170282614901513, + "grad_norm": 1.5017814636230469, + "learning_rate": 4.650748137148009e-05, + "loss": 4.9571, + "step": 28632 + }, + { + "epoch": 0.170288562184794, + "grad_norm": 1.4443883895874023, + "learning_rate": 4.6507243245861815e-05, + "loss": 4.524, + "step": 28633 + }, + { + "epoch": 0.17029450946807498, + "grad_norm": 1.8001708984375, + "learning_rate": 4.650700511273558e-05, + "loss": 4.8942, + "step": 28634 + }, + { + "epoch": 0.170300456751356, + "grad_norm": 2.039597749710083, + "learning_rate": 4.650676697210147e-05, + "loss": 5.0357, + "step": 28635 + }, + { + "epoch": 0.17030640403463698, + "grad_norm": 1.7828583717346191, + "learning_rate": 4.650652882395957e-05, + "loss": 4.8489, + "step": 28636 + }, + { + "epoch": 0.17031235131791797, + "grad_norm": 2.0128636360168457, + "learning_rate": 4.650629066830996e-05, + "loss": 4.3581, + "step": 28637 + }, + { + "epoch": 0.17031829860119896, + "grad_norm": 1.6843047142028809, + "learning_rate": 4.650605250515273e-05, + "loss": 5.2302, + "step": 28638 + }, + { + "epoch": 0.17032424588447997, + "grad_norm": 1.6175137758255005, + "learning_rate": 4.650581433448796e-05, + "loss": 5.2985, + "step": 28639 + }, + { + "epoch": 0.17033019316776096, + "grad_norm": 1.982064962387085, + "learning_rate": 4.6505576156315734e-05, + "loss": 4.8775, + "step": 28640 + }, + { + "epoch": 0.17033614045104195, + "grad_norm": 1.9722973108291626, + "learning_rate": 4.650533797063613e-05, + "loss": 4.6054, + "step": 28641 + }, + { + "epoch": 0.17034208773432297, + "grad_norm": 2.2383551597595215, + "learning_rate": 4.650509977744923e-05, + "loss": 4.2201, + "step": 28642 + }, + { + "epoch": 0.17034803501760395, + "grad_norm": 1.647186040878296, + "learning_rate": 4.650486157675513e-05, + "loss": 4.8552, + "step": 28643 + }, + { + "epoch": 0.17035398230088494, + "grad_norm": 2.658078193664551, + "learning_rate": 4.650462336855391e-05, + "loss": 4.0346, + "step": 28644 + }, + { + "epoch": 0.17035992958416596, + "grad_norm": 1.9004065990447998, + "learning_rate": 4.650438515284564e-05, + "loss": 4.7588, + "step": 28645 + }, + { + "epoch": 0.17036587686744695, + "grad_norm": 1.6584961414337158, + "learning_rate": 4.650414692963041e-05, + "loss": 5.0345, + "step": 28646 + }, + { + "epoch": 0.17037182415072794, + "grad_norm": 1.6760051250457764, + "learning_rate": 4.650390869890831e-05, + "loss": 5.2614, + "step": 28647 + }, + { + "epoch": 0.17037777143400895, + "grad_norm": 1.538028597831726, + "learning_rate": 4.650367046067942e-05, + "loss": 5.3746, + "step": 28648 + }, + { + "epoch": 0.17038371871728994, + "grad_norm": 1.592532992362976, + "learning_rate": 4.650343221494381e-05, + "loss": 5.2738, + "step": 28649 + }, + { + "epoch": 0.17038966600057093, + "grad_norm": 1.472048044204712, + "learning_rate": 4.650319396170158e-05, + "loss": 5.1399, + "step": 28650 + }, + { + "epoch": 0.17039561328385194, + "grad_norm": 1.570019245147705, + "learning_rate": 4.650295570095281e-05, + "loss": 5.199, + "step": 28651 + }, + { + "epoch": 0.17040156056713293, + "grad_norm": 1.82230806350708, + "learning_rate": 4.6502717432697577e-05, + "loss": 5.1108, + "step": 28652 + }, + { + "epoch": 0.17040750785041392, + "grad_norm": 1.9128144979476929, + "learning_rate": 4.650247915693596e-05, + "loss": 5.1805, + "step": 28653 + }, + { + "epoch": 0.17041345513369494, + "grad_norm": 1.683923363685608, + "learning_rate": 4.650224087366806e-05, + "loss": 5.203, + "step": 28654 + }, + { + "epoch": 0.17041940241697592, + "grad_norm": 1.5329160690307617, + "learning_rate": 4.6502002582893944e-05, + "loss": 4.8658, + "step": 28655 + }, + { + "epoch": 0.1704253497002569, + "grad_norm": 2.3513686656951904, + "learning_rate": 4.65017642846137e-05, + "loss": 4.9593, + "step": 28656 + }, + { + "epoch": 0.17043129698353793, + "grad_norm": 1.7208911180496216, + "learning_rate": 4.650152597882742e-05, + "loss": 5.2315, + "step": 28657 + }, + { + "epoch": 0.17043724426681892, + "grad_norm": 1.7835557460784912, + "learning_rate": 4.650128766553518e-05, + "loss": 5.2212, + "step": 28658 + }, + { + "epoch": 0.1704431915500999, + "grad_norm": 2.004202365875244, + "learning_rate": 4.650104934473705e-05, + "loss": 4.8766, + "step": 28659 + }, + { + "epoch": 0.17044913883338092, + "grad_norm": 1.7374918460845947, + "learning_rate": 4.650081101643314e-05, + "loss": 5.3659, + "step": 28660 + }, + { + "epoch": 0.1704550861166619, + "grad_norm": 1.5580469369888306, + "learning_rate": 4.650057268062351e-05, + "loss": 5.012, + "step": 28661 + }, + { + "epoch": 0.1704610333999429, + "grad_norm": 1.7098673582077026, + "learning_rate": 4.650033433730826e-05, + "loss": 5.0506, + "step": 28662 + }, + { + "epoch": 0.1704669806832239, + "grad_norm": 1.7775324583053589, + "learning_rate": 4.6500095986487454e-05, + "loss": 5.3536, + "step": 28663 + }, + { + "epoch": 0.1704729279665049, + "grad_norm": 1.7413294315338135, + "learning_rate": 4.649985762816119e-05, + "loss": 5.2773, + "step": 28664 + }, + { + "epoch": 0.1704788752497859, + "grad_norm": 1.791043996810913, + "learning_rate": 4.649961926232955e-05, + "loss": 5.1409, + "step": 28665 + }, + { + "epoch": 0.1704848225330669, + "grad_norm": 1.8042404651641846, + "learning_rate": 4.649938088899262e-05, + "loss": 5.3099, + "step": 28666 + }, + { + "epoch": 0.1704907698163479, + "grad_norm": 2.329183340072632, + "learning_rate": 4.649914250815047e-05, + "loss": 4.631, + "step": 28667 + }, + { + "epoch": 0.17049671709962888, + "grad_norm": 2.9833004474639893, + "learning_rate": 4.64989041198032e-05, + "loss": 5.1604, + "step": 28668 + }, + { + "epoch": 0.1705026643829099, + "grad_norm": 3.150871992111206, + "learning_rate": 4.649866572395088e-05, + "loss": 5.0831, + "step": 28669 + }, + { + "epoch": 0.17050861166619088, + "grad_norm": 1.6283338069915771, + "learning_rate": 4.64984273205936e-05, + "loss": 5.1733, + "step": 28670 + }, + { + "epoch": 0.17051455894947187, + "grad_norm": 1.6267815828323364, + "learning_rate": 4.649818890973143e-05, + "loss": 5.3692, + "step": 28671 + }, + { + "epoch": 0.1705205062327529, + "grad_norm": 1.638006567955017, + "learning_rate": 4.649795049136448e-05, + "loss": 5.5058, + "step": 28672 + }, + { + "epoch": 0.17052645351603388, + "grad_norm": 1.605161428451538, + "learning_rate": 4.649771206549281e-05, + "loss": 4.9665, + "step": 28673 + }, + { + "epoch": 0.17053240079931486, + "grad_norm": 1.762798547744751, + "learning_rate": 4.649747363211652e-05, + "loss": 4.6831, + "step": 28674 + }, + { + "epoch": 0.17053834808259588, + "grad_norm": 2.23942494392395, + "learning_rate": 4.649723519123567e-05, + "loss": 4.6154, + "step": 28675 + }, + { + "epoch": 0.17054429536587687, + "grad_norm": 1.6567063331604004, + "learning_rate": 4.649699674285036e-05, + "loss": 5.0949, + "step": 28676 + }, + { + "epoch": 0.17055024264915786, + "grad_norm": 1.4644149541854858, + "learning_rate": 4.649675828696067e-05, + "loss": 5.5432, + "step": 28677 + }, + { + "epoch": 0.17055618993243887, + "grad_norm": 1.7737239599227905, + "learning_rate": 4.6496519823566695e-05, + "loss": 5.0056, + "step": 28678 + }, + { + "epoch": 0.17056213721571986, + "grad_norm": 2.3689754009246826, + "learning_rate": 4.64962813526685e-05, + "loss": 3.7473, + "step": 28679 + }, + { + "epoch": 0.17056808449900085, + "grad_norm": 2.3994569778442383, + "learning_rate": 4.649604287426618e-05, + "loss": 3.7447, + "step": 28680 + }, + { + "epoch": 0.17057403178228187, + "grad_norm": 2.2940452098846436, + "learning_rate": 4.64958043883598e-05, + "loss": 3.623, + "step": 28681 + }, + { + "epoch": 0.17057997906556285, + "grad_norm": 2.1584625244140625, + "learning_rate": 4.6495565894949466e-05, + "loss": 3.5711, + "step": 28682 + }, + { + "epoch": 0.17058592634884384, + "grad_norm": 1.7486004829406738, + "learning_rate": 4.649532739403526e-05, + "loss": 4.4838, + "step": 28683 + }, + { + "epoch": 0.17059187363212486, + "grad_norm": 1.8745564222335815, + "learning_rate": 4.6495088885617245e-05, + "loss": 4.6985, + "step": 28684 + }, + { + "epoch": 0.17059782091540585, + "grad_norm": 1.6774717569351196, + "learning_rate": 4.6494850369695517e-05, + "loss": 4.9845, + "step": 28685 + }, + { + "epoch": 0.17060376819868683, + "grad_norm": 1.6051801443099976, + "learning_rate": 4.649461184627017e-05, + "loss": 5.085, + "step": 28686 + }, + { + "epoch": 0.17060971548196785, + "grad_norm": 1.9558120965957642, + "learning_rate": 4.649437331534126e-05, + "loss": 5.7887, + "step": 28687 + }, + { + "epoch": 0.17061566276524884, + "grad_norm": 2.1222105026245117, + "learning_rate": 4.649413477690889e-05, + "loss": 3.9971, + "step": 28688 + }, + { + "epoch": 0.17062161004852983, + "grad_norm": 2.5469319820404053, + "learning_rate": 4.6493896230973147e-05, + "loss": 3.3402, + "step": 28689 + }, + { + "epoch": 0.17062755733181084, + "grad_norm": 1.747454285621643, + "learning_rate": 4.6493657677534107e-05, + "loss": 4.5433, + "step": 28690 + }, + { + "epoch": 0.17063350461509183, + "grad_norm": 2.327911138534546, + "learning_rate": 4.6493419116591845e-05, + "loss": 5.1279, + "step": 28691 + }, + { + "epoch": 0.17063945189837282, + "grad_norm": 1.96173894405365, + "learning_rate": 4.649318054814646e-05, + "loss": 4.6642, + "step": 28692 + }, + { + "epoch": 0.17064539918165383, + "grad_norm": 2.74940824508667, + "learning_rate": 4.6492941972198026e-05, + "loss": 4.9272, + "step": 28693 + }, + { + "epoch": 0.17065134646493482, + "grad_norm": 2.1249771118164062, + "learning_rate": 4.649270338874663e-05, + "loss": 4.8603, + "step": 28694 + }, + { + "epoch": 0.1706572937482158, + "grad_norm": 1.5566577911376953, + "learning_rate": 4.6492464797792344e-05, + "loss": 5.0004, + "step": 28695 + }, + { + "epoch": 0.1706632410314968, + "grad_norm": 1.5969873666763306, + "learning_rate": 4.649222619933527e-05, + "loss": 5.1347, + "step": 28696 + }, + { + "epoch": 0.17066918831477781, + "grad_norm": 1.894946813583374, + "learning_rate": 4.649198759337548e-05, + "loss": 5.1455, + "step": 28697 + }, + { + "epoch": 0.1706751355980588, + "grad_norm": 1.7214184999465942, + "learning_rate": 4.6491748979913056e-05, + "loss": 5.2916, + "step": 28698 + }, + { + "epoch": 0.1706810828813398, + "grad_norm": 1.8061472177505493, + "learning_rate": 4.649151035894809e-05, + "loss": 4.8581, + "step": 28699 + }, + { + "epoch": 0.1706870301646208, + "grad_norm": 2.3920493125915527, + "learning_rate": 4.649127173048066e-05, + "loss": 4.8851, + "step": 28700 + }, + { + "epoch": 0.1706929774479018, + "grad_norm": 1.7309520244598389, + "learning_rate": 4.649103309451084e-05, + "loss": 4.5377, + "step": 28701 + }, + { + "epoch": 0.17069892473118278, + "grad_norm": 1.757692813873291, + "learning_rate": 4.6490794451038725e-05, + "loss": 4.9765, + "step": 28702 + }, + { + "epoch": 0.1707048720144638, + "grad_norm": 2.2090845108032227, + "learning_rate": 4.64905558000644e-05, + "loss": 4.741, + "step": 28703 + }, + { + "epoch": 0.1707108192977448, + "grad_norm": 1.7464302778244019, + "learning_rate": 4.649031714158794e-05, + "loss": 4.9167, + "step": 28704 + }, + { + "epoch": 0.17071676658102578, + "grad_norm": 1.4639854431152344, + "learning_rate": 4.649007847560944e-05, + "loss": 5.1732, + "step": 28705 + }, + { + "epoch": 0.1707227138643068, + "grad_norm": 1.8633160591125488, + "learning_rate": 4.648983980212896e-05, + "loss": 4.3169, + "step": 28706 + }, + { + "epoch": 0.17072866114758778, + "grad_norm": 1.645669937133789, + "learning_rate": 4.648960112114662e-05, + "loss": 5.3615, + "step": 28707 + }, + { + "epoch": 0.17073460843086877, + "grad_norm": 1.802817702293396, + "learning_rate": 4.648936243266246e-05, + "loss": 4.6081, + "step": 28708 + }, + { + "epoch": 0.17074055571414978, + "grad_norm": 1.6780096292495728, + "learning_rate": 4.648912373667661e-05, + "loss": 4.8164, + "step": 28709 + }, + { + "epoch": 0.17074650299743077, + "grad_norm": 1.6830222606658936, + "learning_rate": 4.648888503318911e-05, + "loss": 5.1217, + "step": 28710 + }, + { + "epoch": 0.17075245028071176, + "grad_norm": 1.9091911315917969, + "learning_rate": 4.648864632220007e-05, + "loss": 4.6718, + "step": 28711 + }, + { + "epoch": 0.17075839756399278, + "grad_norm": 1.7040106058120728, + "learning_rate": 4.6488407603709566e-05, + "loss": 5.3872, + "step": 28712 + }, + { + "epoch": 0.17076434484727376, + "grad_norm": 1.5387471914291382, + "learning_rate": 4.648816887771768e-05, + "loss": 4.999, + "step": 28713 + }, + { + "epoch": 0.17077029213055475, + "grad_norm": 1.6032272577285767, + "learning_rate": 4.648793014422449e-05, + "loss": 5.3291, + "step": 28714 + }, + { + "epoch": 0.17077623941383577, + "grad_norm": 2.1550817489624023, + "learning_rate": 4.6487691403230096e-05, + "loss": 4.4169, + "step": 28715 + }, + { + "epoch": 0.17078218669711676, + "grad_norm": 1.632123589515686, + "learning_rate": 4.648745265473457e-05, + "loss": 4.8016, + "step": 28716 + }, + { + "epoch": 0.17078813398039774, + "grad_norm": 1.9822715520858765, + "learning_rate": 4.6487213898737986e-05, + "loss": 4.8404, + "step": 28717 + }, + { + "epoch": 0.17079408126367876, + "grad_norm": 1.4587271213531494, + "learning_rate": 4.648697513524044e-05, + "loss": 5.195, + "step": 28718 + }, + { + "epoch": 0.17080002854695975, + "grad_norm": 1.4583262205123901, + "learning_rate": 4.648673636424202e-05, + "loss": 5.331, + "step": 28719 + }, + { + "epoch": 0.17080597583024074, + "grad_norm": 1.508599877357483, + "learning_rate": 4.648649758574279e-05, + "loss": 5.3316, + "step": 28720 + }, + { + "epoch": 0.17081192311352175, + "grad_norm": 1.5801657438278198, + "learning_rate": 4.648625879974287e-05, + "loss": 4.9691, + "step": 28721 + }, + { + "epoch": 0.17081787039680274, + "grad_norm": 1.383544921875, + "learning_rate": 4.648602000624229e-05, + "loss": 4.8747, + "step": 28722 + }, + { + "epoch": 0.17082381768008373, + "grad_norm": 1.6122874021530151, + "learning_rate": 4.648578120524118e-05, + "loss": 4.8057, + "step": 28723 + }, + { + "epoch": 0.17082976496336474, + "grad_norm": 1.7532804012298584, + "learning_rate": 4.64855423967396e-05, + "loss": 4.7074, + "step": 28724 + }, + { + "epoch": 0.17083571224664573, + "grad_norm": 1.440300703048706, + "learning_rate": 4.648530358073764e-05, + "loss": 4.6827, + "step": 28725 + }, + { + "epoch": 0.17084165952992672, + "grad_norm": 1.4043488502502441, + "learning_rate": 4.648506475723539e-05, + "loss": 5.1083, + "step": 28726 + }, + { + "epoch": 0.17084760681320774, + "grad_norm": 2.273939609527588, + "learning_rate": 4.6484825926232914e-05, + "loss": 4.3264, + "step": 28727 + }, + { + "epoch": 0.17085355409648872, + "grad_norm": 2.029352903366089, + "learning_rate": 4.6484587087730316e-05, + "loss": 4.2814, + "step": 28728 + }, + { + "epoch": 0.1708595013797697, + "grad_norm": 1.6527879238128662, + "learning_rate": 4.648434824172767e-05, + "loss": 4.6651, + "step": 28729 + }, + { + "epoch": 0.17086544866305073, + "grad_norm": 1.6313071250915527, + "learning_rate": 4.648410938822505e-05, + "loss": 5.202, + "step": 28730 + }, + { + "epoch": 0.17087139594633172, + "grad_norm": 1.706916332244873, + "learning_rate": 4.648387052722256e-05, + "loss": 5.1041, + "step": 28731 + }, + { + "epoch": 0.1708773432296127, + "grad_norm": 1.8511303663253784, + "learning_rate": 4.6483631658720265e-05, + "loss": 4.7474, + "step": 28732 + }, + { + "epoch": 0.17088329051289372, + "grad_norm": 2.102651357650757, + "learning_rate": 4.648339278271826e-05, + "loss": 4.7116, + "step": 28733 + }, + { + "epoch": 0.1708892377961747, + "grad_norm": 1.5868231058120728, + "learning_rate": 4.648315389921662e-05, + "loss": 4.8723, + "step": 28734 + }, + { + "epoch": 0.1708951850794557, + "grad_norm": 1.5616002082824707, + "learning_rate": 4.648291500821544e-05, + "loss": 4.7078, + "step": 28735 + }, + { + "epoch": 0.1709011323627367, + "grad_norm": 1.8076444864273071, + "learning_rate": 4.6482676109714804e-05, + "loss": 4.0856, + "step": 28736 + }, + { + "epoch": 0.1709070796460177, + "grad_norm": 2.5661611557006836, + "learning_rate": 4.6482437203714766e-05, + "loss": 4.0065, + "step": 28737 + }, + { + "epoch": 0.1709130269292987, + "grad_norm": 1.9630448818206787, + "learning_rate": 4.648219829021545e-05, + "loss": 4.3436, + "step": 28738 + }, + { + "epoch": 0.1709189742125797, + "grad_norm": 1.588693618774414, + "learning_rate": 4.648195936921691e-05, + "loss": 4.8528, + "step": 28739 + }, + { + "epoch": 0.1709249214958607, + "grad_norm": 1.6260273456573486, + "learning_rate": 4.6481720440719246e-05, + "loss": 4.9007, + "step": 28740 + }, + { + "epoch": 0.17093086877914168, + "grad_norm": 1.4332720041275024, + "learning_rate": 4.648148150472253e-05, + "loss": 4.6039, + "step": 28741 + }, + { + "epoch": 0.1709368160624227, + "grad_norm": 1.5845040082931519, + "learning_rate": 4.648124256122686e-05, + "loss": 4.6129, + "step": 28742 + }, + { + "epoch": 0.17094276334570369, + "grad_norm": 1.9368457794189453, + "learning_rate": 4.6481003610232296e-05, + "loss": 4.4027, + "step": 28743 + }, + { + "epoch": 0.17094871062898467, + "grad_norm": 2.4336676597595215, + "learning_rate": 4.648076465173894e-05, + "loss": 3.9717, + "step": 28744 + }, + { + "epoch": 0.1709546579122657, + "grad_norm": 2.120758056640625, + "learning_rate": 4.648052568574688e-05, + "loss": 3.4959, + "step": 28745 + }, + { + "epoch": 0.17096060519554668, + "grad_norm": 2.1304919719696045, + "learning_rate": 4.648028671225618e-05, + "loss": 3.6002, + "step": 28746 + }, + { + "epoch": 0.17096655247882767, + "grad_norm": 2.2495477199554443, + "learning_rate": 4.648004773126694e-05, + "loss": 3.8202, + "step": 28747 + }, + { + "epoch": 0.17097249976210868, + "grad_norm": 2.0952799320220947, + "learning_rate": 4.647980874277924e-05, + "loss": 4.0671, + "step": 28748 + }, + { + "epoch": 0.17097844704538967, + "grad_norm": 2.260267972946167, + "learning_rate": 4.6479569746793154e-05, + "loss": 4.004, + "step": 28749 + }, + { + "epoch": 0.17098439432867066, + "grad_norm": 1.6694860458374023, + "learning_rate": 4.647933074330878e-05, + "loss": 4.6784, + "step": 28750 + }, + { + "epoch": 0.17099034161195167, + "grad_norm": 1.8118653297424316, + "learning_rate": 4.647909173232618e-05, + "loss": 4.4819, + "step": 28751 + }, + { + "epoch": 0.17099628889523266, + "grad_norm": 1.6766449213027954, + "learning_rate": 4.647885271384546e-05, + "loss": 4.5391, + "step": 28752 + }, + { + "epoch": 0.17100223617851365, + "grad_norm": 2.1435959339141846, + "learning_rate": 4.6478613687866696e-05, + "loss": 3.5559, + "step": 28753 + }, + { + "epoch": 0.17100818346179464, + "grad_norm": 2.2521913051605225, + "learning_rate": 4.647837465438997e-05, + "loss": 3.434, + "step": 28754 + }, + { + "epoch": 0.17101413074507565, + "grad_norm": 2.012451171875, + "learning_rate": 4.6478135613415366e-05, + "loss": 3.7475, + "step": 28755 + }, + { + "epoch": 0.17102007802835664, + "grad_norm": 2.383465528488159, + "learning_rate": 4.6477896564942956e-05, + "loss": 4.2333, + "step": 28756 + }, + { + "epoch": 0.17102602531163763, + "grad_norm": 2.0753815174102783, + "learning_rate": 4.647765750897284e-05, + "loss": 3.9532, + "step": 28757 + }, + { + "epoch": 0.17103197259491865, + "grad_norm": 2.0559349060058594, + "learning_rate": 4.64774184455051e-05, + "loss": 3.8132, + "step": 28758 + }, + { + "epoch": 0.17103791987819963, + "grad_norm": 2.2562434673309326, + "learning_rate": 4.6477179374539814e-05, + "loss": 3.9445, + "step": 28759 + }, + { + "epoch": 0.17104386716148062, + "grad_norm": 1.9799115657806396, + "learning_rate": 4.6476940296077065e-05, + "loss": 4.0676, + "step": 28760 + }, + { + "epoch": 0.17104981444476164, + "grad_norm": 2.034501552581787, + "learning_rate": 4.6476701210116935e-05, + "loss": 3.5055, + "step": 28761 + }, + { + "epoch": 0.17105576172804263, + "grad_norm": 2.2014403343200684, + "learning_rate": 4.6476462116659514e-05, + "loss": 3.7419, + "step": 28762 + }, + { + "epoch": 0.17106170901132361, + "grad_norm": 2.271733522415161, + "learning_rate": 4.6476223015704875e-05, + "loss": 3.5206, + "step": 28763 + }, + { + "epoch": 0.17106765629460463, + "grad_norm": 2.144587278366089, + "learning_rate": 4.647598390725312e-05, + "loss": 3.4963, + "step": 28764 + }, + { + "epoch": 0.17107360357788562, + "grad_norm": 1.8896453380584717, + "learning_rate": 4.647574479130432e-05, + "loss": 3.6917, + "step": 28765 + }, + { + "epoch": 0.1710795508611666, + "grad_norm": 2.5320651531219482, + "learning_rate": 4.6475505667858556e-05, + "loss": 3.4057, + "step": 28766 + }, + { + "epoch": 0.17108549814444762, + "grad_norm": 2.5660650730133057, + "learning_rate": 4.647526653691591e-05, + "loss": 3.5343, + "step": 28767 + }, + { + "epoch": 0.1710914454277286, + "grad_norm": 2.016521453857422, + "learning_rate": 4.647502739847647e-05, + "loss": 5.0209, + "step": 28768 + }, + { + "epoch": 0.1710973927110096, + "grad_norm": 2.098594903945923, + "learning_rate": 4.6474788252540323e-05, + "loss": 3.4916, + "step": 28769 + }, + { + "epoch": 0.17110333999429062, + "grad_norm": 2.502556562423706, + "learning_rate": 4.6474549099107555e-05, + "loss": 3.6106, + "step": 28770 + }, + { + "epoch": 0.1711092872775716, + "grad_norm": 2.3364086151123047, + "learning_rate": 4.647430993817824e-05, + "loss": 3.6718, + "step": 28771 + }, + { + "epoch": 0.1711152345608526, + "grad_norm": 2.453624963760376, + "learning_rate": 4.647407076975247e-05, + "loss": 4.0256, + "step": 28772 + }, + { + "epoch": 0.1711211818441336, + "grad_norm": 2.250152826309204, + "learning_rate": 4.647383159383031e-05, + "loss": 3.8149, + "step": 28773 + }, + { + "epoch": 0.1711271291274146, + "grad_norm": 2.2971277236938477, + "learning_rate": 4.6473592410411864e-05, + "loss": 4.0557, + "step": 28774 + }, + { + "epoch": 0.17113307641069558, + "grad_norm": 2.2991559505462646, + "learning_rate": 4.647335321949721e-05, + "loss": 3.9136, + "step": 28775 + }, + { + "epoch": 0.1711390236939766, + "grad_norm": 2.220536708831787, + "learning_rate": 4.647311402108643e-05, + "loss": 4.0714, + "step": 28776 + }, + { + "epoch": 0.1711449709772576, + "grad_norm": 2.1241915225982666, + "learning_rate": 4.647287481517961e-05, + "loss": 3.5843, + "step": 28777 + }, + { + "epoch": 0.17115091826053858, + "grad_norm": 2.195129632949829, + "learning_rate": 4.647263560177683e-05, + "loss": 3.5294, + "step": 28778 + }, + { + "epoch": 0.1711568655438196, + "grad_norm": 2.3440191745758057, + "learning_rate": 4.647239638087817e-05, + "loss": 3.6608, + "step": 28779 + }, + { + "epoch": 0.17116281282710058, + "grad_norm": 2.478482246398926, + "learning_rate": 4.6472157152483726e-05, + "loss": 3.8389, + "step": 28780 + }, + { + "epoch": 0.17116876011038157, + "grad_norm": 2.488262414932251, + "learning_rate": 4.647191791659357e-05, + "loss": 3.3664, + "step": 28781 + }, + { + "epoch": 0.17117470739366258, + "grad_norm": 1.9902031421661377, + "learning_rate": 4.6471678673207784e-05, + "loss": 3.4656, + "step": 28782 + }, + { + "epoch": 0.17118065467694357, + "grad_norm": 1.7979692220687866, + "learning_rate": 4.647143942232647e-05, + "loss": 4.1077, + "step": 28783 + }, + { + "epoch": 0.17118660196022456, + "grad_norm": 2.0550832748413086, + "learning_rate": 4.647120016394969e-05, + "loss": 5.0827, + "step": 28784 + }, + { + "epoch": 0.17119254924350558, + "grad_norm": 2.58035945892334, + "learning_rate": 4.647096089807753e-05, + "loss": 3.3431, + "step": 28785 + }, + { + "epoch": 0.17119849652678656, + "grad_norm": 2.9299840927124023, + "learning_rate": 4.647072162471009e-05, + "loss": 4.3467, + "step": 28786 + }, + { + "epoch": 0.17120444381006755, + "grad_norm": 2.9246139526367188, + "learning_rate": 4.6470482343847434e-05, + "loss": 4.5002, + "step": 28787 + }, + { + "epoch": 0.17121039109334857, + "grad_norm": 2.434800148010254, + "learning_rate": 4.647024305548966e-05, + "loss": 4.39, + "step": 28788 + }, + { + "epoch": 0.17121633837662956, + "grad_norm": 2.0700294971466064, + "learning_rate": 4.647000375963685e-05, + "loss": 3.6275, + "step": 28789 + }, + { + "epoch": 0.17122228565991054, + "grad_norm": 2.0739026069641113, + "learning_rate": 4.6469764456289075e-05, + "loss": 3.294, + "step": 28790 + }, + { + "epoch": 0.17122823294319156, + "grad_norm": 2.158195972442627, + "learning_rate": 4.646952514544643e-05, + "loss": 3.0345, + "step": 28791 + }, + { + "epoch": 0.17123418022647255, + "grad_norm": 2.25756573677063, + "learning_rate": 4.6469285827109e-05, + "loss": 3.4395, + "step": 28792 + }, + { + "epoch": 0.17124012750975354, + "grad_norm": 1.756030559539795, + "learning_rate": 4.646904650127686e-05, + "loss": 4.57, + "step": 28793 + }, + { + "epoch": 0.17124607479303455, + "grad_norm": 1.7527079582214355, + "learning_rate": 4.6468807167950096e-05, + "loss": 4.8592, + "step": 28794 + }, + { + "epoch": 0.17125202207631554, + "grad_norm": 2.0758533477783203, + "learning_rate": 4.646856782712879e-05, + "loss": 3.6941, + "step": 28795 + }, + { + "epoch": 0.17125796935959653, + "grad_norm": 1.977253794670105, + "learning_rate": 4.646832847881304e-05, + "loss": 3.3686, + "step": 28796 + }, + { + "epoch": 0.17126391664287755, + "grad_norm": 2.0132908821105957, + "learning_rate": 4.646808912300291e-05, + "loss": 3.3937, + "step": 28797 + }, + { + "epoch": 0.17126986392615853, + "grad_norm": 1.8328338861465454, + "learning_rate": 4.646784975969849e-05, + "loss": 3.4359, + "step": 28798 + }, + { + "epoch": 0.17127581120943952, + "grad_norm": 1.7316343784332275, + "learning_rate": 4.646761038889987e-05, + "loss": 4.062, + "step": 28799 + }, + { + "epoch": 0.17128175849272054, + "grad_norm": 1.98564875125885, + "learning_rate": 4.646737101060713e-05, + "loss": 3.9671, + "step": 28800 + }, + { + "epoch": 0.17128770577600153, + "grad_norm": 1.4254114627838135, + "learning_rate": 4.646713162482035e-05, + "loss": 5.6623, + "step": 28801 + }, + { + "epoch": 0.1712936530592825, + "grad_norm": 1.7182563543319702, + "learning_rate": 4.646689223153962e-05, + "loss": 3.7951, + "step": 28802 + }, + { + "epoch": 0.17129960034256353, + "grad_norm": 1.9816060066223145, + "learning_rate": 4.646665283076502e-05, + "loss": 3.1926, + "step": 28803 + }, + { + "epoch": 0.17130554762584452, + "grad_norm": 1.9026448726654053, + "learning_rate": 4.646641342249663e-05, + "loss": 3.4481, + "step": 28804 + }, + { + "epoch": 0.1713114949091255, + "grad_norm": 1.9280551671981812, + "learning_rate": 4.646617400673453e-05, + "loss": 3.7474, + "step": 28805 + }, + { + "epoch": 0.17131744219240652, + "grad_norm": 1.9468990564346313, + "learning_rate": 4.646593458347882e-05, + "loss": 3.6522, + "step": 28806 + }, + { + "epoch": 0.1713233894756875, + "grad_norm": 1.8785784244537354, + "learning_rate": 4.646569515272957e-05, + "loss": 4.4277, + "step": 28807 + }, + { + "epoch": 0.1713293367589685, + "grad_norm": 2.5380280017852783, + "learning_rate": 4.6465455714486875e-05, + "loss": 4.7558, + "step": 28808 + }, + { + "epoch": 0.1713352840422495, + "grad_norm": 2.311422824859619, + "learning_rate": 4.64652162687508e-05, + "loss": 4.5887, + "step": 28809 + }, + { + "epoch": 0.1713412313255305, + "grad_norm": 2.215386390686035, + "learning_rate": 4.646497681552144e-05, + "loss": 4.6318, + "step": 28810 + }, + { + "epoch": 0.1713471786088115, + "grad_norm": 2.1793322563171387, + "learning_rate": 4.646473735479889e-05, + "loss": 4.8652, + "step": 28811 + }, + { + "epoch": 0.1713531258920925, + "grad_norm": 1.6395008563995361, + "learning_rate": 4.646449788658321e-05, + "loss": 5.1602, + "step": 28812 + }, + { + "epoch": 0.1713590731753735, + "grad_norm": 1.781542181968689, + "learning_rate": 4.646425841087451e-05, + "loss": 5.5992, + "step": 28813 + }, + { + "epoch": 0.17136502045865448, + "grad_norm": 1.7979416847229004, + "learning_rate": 4.6464018927672846e-05, + "loss": 5.4619, + "step": 28814 + }, + { + "epoch": 0.17137096774193547, + "grad_norm": 1.5196144580841064, + "learning_rate": 4.646377943697832e-05, + "loss": 5.5668, + "step": 28815 + }, + { + "epoch": 0.1713769150252165, + "grad_norm": 1.849569320678711, + "learning_rate": 4.6463539938791e-05, + "loss": 5.2762, + "step": 28816 + }, + { + "epoch": 0.17138286230849747, + "grad_norm": 2.4651362895965576, + "learning_rate": 4.6463300433111e-05, + "loss": 4.2121, + "step": 28817 + }, + { + "epoch": 0.17138880959177846, + "grad_norm": 2.2481956481933594, + "learning_rate": 4.646306091993837e-05, + "loss": 4.2369, + "step": 28818 + }, + { + "epoch": 0.17139475687505948, + "grad_norm": 1.5985668897628784, + "learning_rate": 4.646282139927321e-05, + "loss": 5.0238, + "step": 28819 + }, + { + "epoch": 0.17140070415834047, + "grad_norm": 1.5861318111419678, + "learning_rate": 4.64625818711156e-05, + "loss": 4.6181, + "step": 28820 + }, + { + "epoch": 0.17140665144162145, + "grad_norm": 1.5382401943206787, + "learning_rate": 4.646234233546562e-05, + "loss": 4.9682, + "step": 28821 + }, + { + "epoch": 0.17141259872490247, + "grad_norm": 1.604730248451233, + "learning_rate": 4.646210279232337e-05, + "loss": 5.2491, + "step": 28822 + }, + { + "epoch": 0.17141854600818346, + "grad_norm": 1.83149254322052, + "learning_rate": 4.6461863241688914e-05, + "loss": 5.514, + "step": 28823 + }, + { + "epoch": 0.17142449329146445, + "grad_norm": 2.151071786880493, + "learning_rate": 4.6461623683562336e-05, + "loss": 4.6684, + "step": 28824 + }, + { + "epoch": 0.17143044057474546, + "grad_norm": 1.934921145439148, + "learning_rate": 4.646138411794374e-05, + "loss": 4.5529, + "step": 28825 + }, + { + "epoch": 0.17143638785802645, + "grad_norm": 3.118504047393799, + "learning_rate": 4.646114454483319e-05, + "loss": 3.8805, + "step": 28826 + }, + { + "epoch": 0.17144233514130744, + "grad_norm": 2.784353733062744, + "learning_rate": 4.6460904964230776e-05, + "loss": 3.7983, + "step": 28827 + }, + { + "epoch": 0.17144828242458846, + "grad_norm": 2.2608816623687744, + "learning_rate": 4.6460665376136586e-05, + "loss": 4.0043, + "step": 28828 + }, + { + "epoch": 0.17145422970786944, + "grad_norm": 2.0400445461273193, + "learning_rate": 4.6460425780550695e-05, + "loss": 4.3601, + "step": 28829 + }, + { + "epoch": 0.17146017699115043, + "grad_norm": 1.7697999477386475, + "learning_rate": 4.64601861774732e-05, + "loss": 5.0038, + "step": 28830 + }, + { + "epoch": 0.17146612427443145, + "grad_norm": 1.916419267654419, + "learning_rate": 4.645994656690417e-05, + "loss": 3.8579, + "step": 28831 + }, + { + "epoch": 0.17147207155771244, + "grad_norm": 1.8474862575531006, + "learning_rate": 4.6459706948843687e-05, + "loss": 4.528, + "step": 28832 + }, + { + "epoch": 0.17147801884099342, + "grad_norm": 1.532090425491333, + "learning_rate": 4.645946732329185e-05, + "loss": 5.7598, + "step": 28833 + }, + { + "epoch": 0.17148396612427444, + "grad_norm": 1.4666064977645874, + "learning_rate": 4.645922769024873e-05, + "loss": 5.3868, + "step": 28834 + }, + { + "epoch": 0.17148991340755543, + "grad_norm": 1.5077399015426636, + "learning_rate": 4.645898804971442e-05, + "loss": 5.1645, + "step": 28835 + }, + { + "epoch": 0.17149586069083642, + "grad_norm": 1.5031183958053589, + "learning_rate": 4.6458748401689e-05, + "loss": 4.6318, + "step": 28836 + }, + { + "epoch": 0.17150180797411743, + "grad_norm": 1.9876207113265991, + "learning_rate": 4.6458508746172544e-05, + "loss": 3.7609, + "step": 28837 + }, + { + "epoch": 0.17150775525739842, + "grad_norm": 1.9552377462387085, + "learning_rate": 4.6458269083165155e-05, + "loss": 3.7297, + "step": 28838 + }, + { + "epoch": 0.1715137025406794, + "grad_norm": 1.7688027620315552, + "learning_rate": 4.64580294126669e-05, + "loss": 4.2031, + "step": 28839 + }, + { + "epoch": 0.17151964982396042, + "grad_norm": 1.7358896732330322, + "learning_rate": 4.645778973467787e-05, + "loss": 5.3203, + "step": 28840 + }, + { + "epoch": 0.1715255971072414, + "grad_norm": 1.6685024499893188, + "learning_rate": 4.645755004919814e-05, + "loss": 4.1383, + "step": 28841 + }, + { + "epoch": 0.1715315443905224, + "grad_norm": 1.7474262714385986, + "learning_rate": 4.645731035622781e-05, + "loss": 4.3956, + "step": 28842 + }, + { + "epoch": 0.17153749167380342, + "grad_norm": 2.3153438568115234, + "learning_rate": 4.6457070655766956e-05, + "loss": 3.6617, + "step": 28843 + }, + { + "epoch": 0.1715434389570844, + "grad_norm": 1.6651357412338257, + "learning_rate": 4.645683094781565e-05, + "loss": 3.7946, + "step": 28844 + }, + { + "epoch": 0.1715493862403654, + "grad_norm": 1.8230834007263184, + "learning_rate": 4.645659123237399e-05, + "loss": 3.6286, + "step": 28845 + }, + { + "epoch": 0.1715553335236464, + "grad_norm": 1.724862813949585, + "learning_rate": 4.645635150944206e-05, + "loss": 3.8681, + "step": 28846 + }, + { + "epoch": 0.1715612808069274, + "grad_norm": 1.7765378952026367, + "learning_rate": 4.645611177901994e-05, + "loss": 3.9172, + "step": 28847 + }, + { + "epoch": 0.17156722809020838, + "grad_norm": 1.7206759452819824, + "learning_rate": 4.645587204110771e-05, + "loss": 3.8603, + "step": 28848 + }, + { + "epoch": 0.1715731753734894, + "grad_norm": 1.9421840906143188, + "learning_rate": 4.645563229570546e-05, + "loss": 3.5207, + "step": 28849 + }, + { + "epoch": 0.1715791226567704, + "grad_norm": 1.9873075485229492, + "learning_rate": 4.645539254281327e-05, + "loss": 4.0805, + "step": 28850 + }, + { + "epoch": 0.17158506994005138, + "grad_norm": 1.7919063568115234, + "learning_rate": 4.645515278243122e-05, + "loss": 4.1832, + "step": 28851 + }, + { + "epoch": 0.1715910172233324, + "grad_norm": 1.6959470510482788, + "learning_rate": 4.6454913014559395e-05, + "loss": 4.135, + "step": 28852 + }, + { + "epoch": 0.17159696450661338, + "grad_norm": 2.2556352615356445, + "learning_rate": 4.645467323919789e-05, + "loss": 3.9897, + "step": 28853 + }, + { + "epoch": 0.17160291178989437, + "grad_norm": 2.394732713699341, + "learning_rate": 4.645443345634678e-05, + "loss": 4.0581, + "step": 28854 + }, + { + "epoch": 0.17160885907317538, + "grad_norm": 1.7620495557785034, + "learning_rate": 4.6454193666006144e-05, + "loss": 3.6301, + "step": 28855 + }, + { + "epoch": 0.17161480635645637, + "grad_norm": 2.046990394592285, + "learning_rate": 4.645395386817607e-05, + "loss": 3.6809, + "step": 28856 + }, + { + "epoch": 0.17162075363973736, + "grad_norm": 1.8854444026947021, + "learning_rate": 4.6453714062856645e-05, + "loss": 3.8665, + "step": 28857 + }, + { + "epoch": 0.17162670092301838, + "grad_norm": 1.952010989189148, + "learning_rate": 4.645347425004795e-05, + "loss": 3.9584, + "step": 28858 + }, + { + "epoch": 0.17163264820629937, + "grad_norm": 2.7259037494659424, + "learning_rate": 4.645323442975007e-05, + "loss": 4.1483, + "step": 28859 + }, + { + "epoch": 0.17163859548958035, + "grad_norm": 2.6531686782836914, + "learning_rate": 4.645299460196309e-05, + "loss": 4.2874, + "step": 28860 + }, + { + "epoch": 0.17164454277286137, + "grad_norm": 2.204883337020874, + "learning_rate": 4.645275476668708e-05, + "loss": 4.6409, + "step": 28861 + }, + { + "epoch": 0.17165049005614236, + "grad_norm": 1.8465254306793213, + "learning_rate": 4.645251492392214e-05, + "loss": 3.6078, + "step": 28862 + }, + { + "epoch": 0.17165643733942335, + "grad_norm": 1.6021015644073486, + "learning_rate": 4.645227507366835e-05, + "loss": 3.9142, + "step": 28863 + }, + { + "epoch": 0.17166238462270436, + "grad_norm": 1.9014915227890015, + "learning_rate": 4.645203521592579e-05, + "loss": 4.5439, + "step": 28864 + }, + { + "epoch": 0.17166833190598535, + "grad_norm": 2.176541805267334, + "learning_rate": 4.645179535069455e-05, + "loss": 4.0324, + "step": 28865 + }, + { + "epoch": 0.17167427918926634, + "grad_norm": 1.6138490438461304, + "learning_rate": 4.645155547797472e-05, + "loss": 5.2606, + "step": 28866 + }, + { + "epoch": 0.17168022647254735, + "grad_norm": 1.5091575384140015, + "learning_rate": 4.645131559776635e-05, + "loss": 4.8829, + "step": 28867 + }, + { + "epoch": 0.17168617375582834, + "grad_norm": 2.131401777267456, + "learning_rate": 4.645107571006957e-05, + "loss": 5.1779, + "step": 28868 + }, + { + "epoch": 0.17169212103910933, + "grad_norm": 1.871749758720398, + "learning_rate": 4.645083581488443e-05, + "loss": 4.8126, + "step": 28869 + }, + { + "epoch": 0.17169806832239035, + "grad_norm": 1.825909972190857, + "learning_rate": 4.6450595912211026e-05, + "loss": 4.4965, + "step": 28870 + }, + { + "epoch": 0.17170401560567133, + "grad_norm": 1.546570897102356, + "learning_rate": 4.645035600204944e-05, + "loss": 4.8261, + "step": 28871 + }, + { + "epoch": 0.17170996288895232, + "grad_norm": 1.6035295724868774, + "learning_rate": 4.6450116084399753e-05, + "loss": 4.8019, + "step": 28872 + }, + { + "epoch": 0.1717159101722333, + "grad_norm": 1.6257683038711548, + "learning_rate": 4.644987615926206e-05, + "loss": 4.6993, + "step": 28873 + }, + { + "epoch": 0.17172185745551433, + "grad_norm": 1.6006081104278564, + "learning_rate": 4.6449636226636427e-05, + "loss": 4.7575, + "step": 28874 + }, + { + "epoch": 0.17172780473879531, + "grad_norm": 1.9441580772399902, + "learning_rate": 4.6449396286522954e-05, + "loss": 4.4509, + "step": 28875 + }, + { + "epoch": 0.1717337520220763, + "grad_norm": 2.2355899810791016, + "learning_rate": 4.6449156338921716e-05, + "loss": 3.3666, + "step": 28876 + }, + { + "epoch": 0.17173969930535732, + "grad_norm": 1.863898754119873, + "learning_rate": 4.644891638383281e-05, + "loss": 3.4932, + "step": 28877 + }, + { + "epoch": 0.1717456465886383, + "grad_norm": 1.505720615386963, + "learning_rate": 4.64486764212563e-05, + "loss": 4.3892, + "step": 28878 + }, + { + "epoch": 0.1717515938719193, + "grad_norm": 2.197970151901245, + "learning_rate": 4.644843645119228e-05, + "loss": 4.5169, + "step": 28879 + }, + { + "epoch": 0.1717575411552003, + "grad_norm": 2.1132233142852783, + "learning_rate": 4.644819647364082e-05, + "loss": 3.9246, + "step": 28880 + }, + { + "epoch": 0.1717634884384813, + "grad_norm": 2.273036479949951, + "learning_rate": 4.644795648860203e-05, + "loss": 4.0134, + "step": 28881 + }, + { + "epoch": 0.1717694357217623, + "grad_norm": 2.3725993633270264, + "learning_rate": 4.6447716496075975e-05, + "loss": 3.9562, + "step": 28882 + }, + { + "epoch": 0.1717753830050433, + "grad_norm": 1.6925543546676636, + "learning_rate": 4.6447476496062745e-05, + "loss": 5.22, + "step": 28883 + }, + { + "epoch": 0.1717813302883243, + "grad_norm": 1.7216755151748657, + "learning_rate": 4.644723648856243e-05, + "loss": 4.2907, + "step": 28884 + }, + { + "epoch": 0.17178727757160528, + "grad_norm": 1.9896382093429565, + "learning_rate": 4.64469964735751e-05, + "loss": 3.4634, + "step": 28885 + }, + { + "epoch": 0.1717932248548863, + "grad_norm": 1.924800992012024, + "learning_rate": 4.6446756451100844e-05, + "loss": 3.627, + "step": 28886 + }, + { + "epoch": 0.17179917213816728, + "grad_norm": 2.1140928268432617, + "learning_rate": 4.644651642113975e-05, + "loss": 3.8234, + "step": 28887 + }, + { + "epoch": 0.17180511942144827, + "grad_norm": 1.9103795289993286, + "learning_rate": 4.644627638369189e-05, + "loss": 3.7129, + "step": 28888 + }, + { + "epoch": 0.1718110667047293, + "grad_norm": 2.002732038497925, + "learning_rate": 4.6446036338757363e-05, + "loss": 3.741, + "step": 28889 + }, + { + "epoch": 0.17181701398801028, + "grad_norm": 1.6863858699798584, + "learning_rate": 4.644579628633625e-05, + "loss": 4.3454, + "step": 28890 + }, + { + "epoch": 0.17182296127129126, + "grad_norm": 1.5118045806884766, + "learning_rate": 4.6445556226428625e-05, + "loss": 5.1573, + "step": 28891 + }, + { + "epoch": 0.17182890855457228, + "grad_norm": 2.336212158203125, + "learning_rate": 4.644531615903458e-05, + "loss": 3.7499, + "step": 28892 + }, + { + "epoch": 0.17183485583785327, + "grad_norm": 1.5706313848495483, + "learning_rate": 4.6445076084154195e-05, + "loss": 4.5392, + "step": 28893 + }, + { + "epoch": 0.17184080312113426, + "grad_norm": 1.9531837701797485, + "learning_rate": 4.644483600178756e-05, + "loss": 3.72, + "step": 28894 + }, + { + "epoch": 0.17184675040441527, + "grad_norm": 1.652535080909729, + "learning_rate": 4.644459591193475e-05, + "loss": 4.6445, + "step": 28895 + }, + { + "epoch": 0.17185269768769626, + "grad_norm": 1.856799840927124, + "learning_rate": 4.644435581459585e-05, + "loss": 3.6899, + "step": 28896 + }, + { + "epoch": 0.17185864497097725, + "grad_norm": 1.8917557001113892, + "learning_rate": 4.644411570977096e-05, + "loss": 3.7475, + "step": 28897 + }, + { + "epoch": 0.17186459225425826, + "grad_norm": 1.7784960269927979, + "learning_rate": 4.644387559746014e-05, + "loss": 3.6315, + "step": 28898 + }, + { + "epoch": 0.17187053953753925, + "grad_norm": 1.8464044332504272, + "learning_rate": 4.644363547766348e-05, + "loss": 4.0489, + "step": 28899 + }, + { + "epoch": 0.17187648682082024, + "grad_norm": 1.8629194498062134, + "learning_rate": 4.6443395350381084e-05, + "loss": 3.755, + "step": 28900 + }, + { + "epoch": 0.17188243410410126, + "grad_norm": 1.774107813835144, + "learning_rate": 4.644315521561301e-05, + "loss": 3.6051, + "step": 28901 + }, + { + "epoch": 0.17188838138738224, + "grad_norm": 1.6542714834213257, + "learning_rate": 4.644291507335935e-05, + "loss": 3.622, + "step": 28902 + }, + { + "epoch": 0.17189432867066323, + "grad_norm": 1.7980518341064453, + "learning_rate": 4.64426749236202e-05, + "loss": 3.7703, + "step": 28903 + }, + { + "epoch": 0.17190027595394425, + "grad_norm": 1.771996021270752, + "learning_rate": 4.644243476639563e-05, + "loss": 3.8511, + "step": 28904 + }, + { + "epoch": 0.17190622323722524, + "grad_norm": 1.9656630754470825, + "learning_rate": 4.644219460168572e-05, + "loss": 5.0433, + "step": 28905 + }, + { + "epoch": 0.17191217052050622, + "grad_norm": 1.7453303337097168, + "learning_rate": 4.6441954429490564e-05, + "loss": 4.3733, + "step": 28906 + }, + { + "epoch": 0.17191811780378724, + "grad_norm": 1.8528467416763306, + "learning_rate": 4.644171424981025e-05, + "loss": 3.7542, + "step": 28907 + }, + { + "epoch": 0.17192406508706823, + "grad_norm": 1.8916527032852173, + "learning_rate": 4.6441474062644844e-05, + "loss": 3.726, + "step": 28908 + }, + { + "epoch": 0.17193001237034922, + "grad_norm": 1.8707592487335205, + "learning_rate": 4.644123386799445e-05, + "loss": 3.77, + "step": 28909 + }, + { + "epoch": 0.17193595965363023, + "grad_norm": 1.7839124202728271, + "learning_rate": 4.644099366585914e-05, + "loss": 3.8036, + "step": 28910 + }, + { + "epoch": 0.17194190693691122, + "grad_norm": 2.1418814659118652, + "learning_rate": 4.6440753456239e-05, + "loss": 3.83, + "step": 28911 + }, + { + "epoch": 0.1719478542201922, + "grad_norm": 1.7159006595611572, + "learning_rate": 4.644051323913412e-05, + "loss": 3.6423, + "step": 28912 + }, + { + "epoch": 0.17195380150347322, + "grad_norm": 2.0046510696411133, + "learning_rate": 4.644027301454457e-05, + "loss": 3.6761, + "step": 28913 + }, + { + "epoch": 0.1719597487867542, + "grad_norm": 1.8171806335449219, + "learning_rate": 4.6440032782470446e-05, + "loss": 3.6621, + "step": 28914 + }, + { + "epoch": 0.1719656960700352, + "grad_norm": 1.813620924949646, + "learning_rate": 4.6439792542911826e-05, + "loss": 3.6249, + "step": 28915 + }, + { + "epoch": 0.17197164335331622, + "grad_norm": 1.8341031074523926, + "learning_rate": 4.64395522958688e-05, + "loss": 4.1758, + "step": 28916 + }, + { + "epoch": 0.1719775906365972, + "grad_norm": 2.3422980308532715, + "learning_rate": 4.643931204134144e-05, + "loss": 4.0642, + "step": 28917 + }, + { + "epoch": 0.1719835379198782, + "grad_norm": 2.2799339294433594, + "learning_rate": 4.643907177932985e-05, + "loss": 3.5248, + "step": 28918 + }, + { + "epoch": 0.1719894852031592, + "grad_norm": 2.3583829402923584, + "learning_rate": 4.643883150983409e-05, + "loss": 3.4972, + "step": 28919 + }, + { + "epoch": 0.1719954324864402, + "grad_norm": 2.667558431625366, + "learning_rate": 4.6438591232854265e-05, + "loss": 3.3926, + "step": 28920 + }, + { + "epoch": 0.17200137976972119, + "grad_norm": 2.2808713912963867, + "learning_rate": 4.6438350948390444e-05, + "loss": 3.2806, + "step": 28921 + }, + { + "epoch": 0.1720073270530022, + "grad_norm": 2.0563879013061523, + "learning_rate": 4.6438110656442713e-05, + "loss": 4.4691, + "step": 28922 + }, + { + "epoch": 0.1720132743362832, + "grad_norm": 1.8717663288116455, + "learning_rate": 4.643787035701116e-05, + "loss": 4.8282, + "step": 28923 + }, + { + "epoch": 0.17201922161956418, + "grad_norm": 2.2592520713806152, + "learning_rate": 4.643763005009588e-05, + "loss": 3.6768, + "step": 28924 + }, + { + "epoch": 0.1720251689028452, + "grad_norm": 2.2937116622924805, + "learning_rate": 4.643738973569693e-05, + "loss": 3.5727, + "step": 28925 + }, + { + "epoch": 0.17203111618612618, + "grad_norm": 2.3913755416870117, + "learning_rate": 4.643714941381441e-05, + "loss": 3.6011, + "step": 28926 + }, + { + "epoch": 0.17203706346940717, + "grad_norm": 2.3368663787841797, + "learning_rate": 4.643690908444841e-05, + "loss": 3.6664, + "step": 28927 + }, + { + "epoch": 0.17204301075268819, + "grad_norm": 1.4821833372116089, + "learning_rate": 4.6436668747599005e-05, + "loss": 5.495, + "step": 28928 + }, + { + "epoch": 0.17204895803596917, + "grad_norm": 1.8062217235565186, + "learning_rate": 4.643642840326627e-05, + "loss": 4.632, + "step": 28929 + }, + { + "epoch": 0.17205490531925016, + "grad_norm": 2.0992000102996826, + "learning_rate": 4.6436188051450314e-05, + "loss": 4.1965, + "step": 28930 + }, + { + "epoch": 0.17206085260253115, + "grad_norm": 1.6724803447723389, + "learning_rate": 4.6435947692151207e-05, + "loss": 5.1407, + "step": 28931 + }, + { + "epoch": 0.17206679988581217, + "grad_norm": 2.1039113998413086, + "learning_rate": 4.6435707325369024e-05, + "loss": 4.9189, + "step": 28932 + }, + { + "epoch": 0.17207274716909315, + "grad_norm": 1.7378982305526733, + "learning_rate": 4.6435466951103853e-05, + "loss": 5.0936, + "step": 28933 + }, + { + "epoch": 0.17207869445237414, + "grad_norm": 1.7237809896469116, + "learning_rate": 4.643522656935579e-05, + "loss": 5.175, + "step": 28934 + }, + { + "epoch": 0.17208464173565516, + "grad_norm": 1.5770435333251953, + "learning_rate": 4.6434986180124904e-05, + "loss": 5.0878, + "step": 28935 + }, + { + "epoch": 0.17209058901893615, + "grad_norm": 1.5708106756210327, + "learning_rate": 4.6434745783411294e-05, + "loss": 5.185, + "step": 28936 + }, + { + "epoch": 0.17209653630221713, + "grad_norm": 1.840494990348816, + "learning_rate": 4.643450537921503e-05, + "loss": 5.0293, + "step": 28937 + }, + { + "epoch": 0.17210248358549815, + "grad_norm": 1.9380584955215454, + "learning_rate": 4.64342649675362e-05, + "loss": 4.6983, + "step": 28938 + }, + { + "epoch": 0.17210843086877914, + "grad_norm": 1.6215778589248657, + "learning_rate": 4.64340245483749e-05, + "loss": 5.1622, + "step": 28939 + }, + { + "epoch": 0.17211437815206013, + "grad_norm": 2.1743335723876953, + "learning_rate": 4.6433784121731196e-05, + "loss": 4.2748, + "step": 28940 + }, + { + "epoch": 0.17212032543534114, + "grad_norm": 2.269792318344116, + "learning_rate": 4.643354368760517e-05, + "loss": 3.682, + "step": 28941 + }, + { + "epoch": 0.17212627271862213, + "grad_norm": 1.956141471862793, + "learning_rate": 4.643330324599693e-05, + "loss": 4.4543, + "step": 28942 + }, + { + "epoch": 0.17213222000190312, + "grad_norm": 1.5037137269973755, + "learning_rate": 4.6433062796906544e-05, + "loss": 5.4757, + "step": 28943 + }, + { + "epoch": 0.17213816728518413, + "grad_norm": 2.0092952251434326, + "learning_rate": 4.643282234033409e-05, + "loss": 3.9942, + "step": 28944 + }, + { + "epoch": 0.17214411456846512, + "grad_norm": 2.0670738220214844, + "learning_rate": 4.643258187627967e-05, + "loss": 3.2918, + "step": 28945 + }, + { + "epoch": 0.1721500618517461, + "grad_norm": 2.011192560195923, + "learning_rate": 4.643234140474334e-05, + "loss": 3.6096, + "step": 28946 + }, + { + "epoch": 0.17215600913502713, + "grad_norm": 2.221064805984497, + "learning_rate": 4.643210092572522e-05, + "loss": 4.0979, + "step": 28947 + }, + { + "epoch": 0.17216195641830812, + "grad_norm": 2.543839931488037, + "learning_rate": 4.643186043922536e-05, + "loss": 3.8645, + "step": 28948 + }, + { + "epoch": 0.1721679037015891, + "grad_norm": 1.8699936866760254, + "learning_rate": 4.6431619945243866e-05, + "loss": 3.8908, + "step": 28949 + }, + { + "epoch": 0.17217385098487012, + "grad_norm": 1.6603435277938843, + "learning_rate": 4.6431379443780815e-05, + "loss": 4.9394, + "step": 28950 + }, + { + "epoch": 0.1721797982681511, + "grad_norm": 2.0914523601531982, + "learning_rate": 4.643113893483629e-05, + "loss": 3.1328, + "step": 28951 + }, + { + "epoch": 0.1721857455514321, + "grad_norm": 2.469694137573242, + "learning_rate": 4.6430898418410373e-05, + "loss": 3.5583, + "step": 28952 + }, + { + "epoch": 0.1721916928347131, + "grad_norm": 2.5100619792938232, + "learning_rate": 4.643065789450315e-05, + "loss": 3.7234, + "step": 28953 + }, + { + "epoch": 0.1721976401179941, + "grad_norm": 2.565922737121582, + "learning_rate": 4.643041736311471e-05, + "loss": 3.3566, + "step": 28954 + }, + { + "epoch": 0.1722035874012751, + "grad_norm": 2.454882860183716, + "learning_rate": 4.643017682424513e-05, + "loss": 3.6576, + "step": 28955 + }, + { + "epoch": 0.1722095346845561, + "grad_norm": 1.6239404678344727, + "learning_rate": 4.64299362778945e-05, + "loss": 4.7344, + "step": 28956 + }, + { + "epoch": 0.1722154819678371, + "grad_norm": 1.6332730054855347, + "learning_rate": 4.6429695724062906e-05, + "loss": 4.9091, + "step": 28957 + }, + { + "epoch": 0.17222142925111808, + "grad_norm": 1.495293378829956, + "learning_rate": 4.642945516275041e-05, + "loss": 4.7336, + "step": 28958 + }, + { + "epoch": 0.1722273765343991, + "grad_norm": 1.531150460243225, + "learning_rate": 4.6429214593957125e-05, + "loss": 4.7503, + "step": 28959 + }, + { + "epoch": 0.17223332381768008, + "grad_norm": 1.2761198282241821, + "learning_rate": 4.642897401768312e-05, + "loss": 4.6507, + "step": 28960 + }, + { + "epoch": 0.17223927110096107, + "grad_norm": 1.366808295249939, + "learning_rate": 4.642873343392848e-05, + "loss": 4.7195, + "step": 28961 + }, + { + "epoch": 0.1722452183842421, + "grad_norm": 2.072298765182495, + "learning_rate": 4.6428492842693295e-05, + "loss": 4.3342, + "step": 28962 + }, + { + "epoch": 0.17225116566752308, + "grad_norm": 2.4667413234710693, + "learning_rate": 4.642825224397764e-05, + "loss": 3.3579, + "step": 28963 + }, + { + "epoch": 0.17225711295080406, + "grad_norm": 2.5743234157562256, + "learning_rate": 4.64280116377816e-05, + "loss": 3.559, + "step": 28964 + }, + { + "epoch": 0.17226306023408508, + "grad_norm": 2.4581592082977295, + "learning_rate": 4.6427771024105274e-05, + "loss": 3.6332, + "step": 28965 + }, + { + "epoch": 0.17226900751736607, + "grad_norm": 2.156362533569336, + "learning_rate": 4.642753040294873e-05, + "loss": 4.5459, + "step": 28966 + }, + { + "epoch": 0.17227495480064706, + "grad_norm": 2.2250757217407227, + "learning_rate": 4.642728977431205e-05, + "loss": 3.7909, + "step": 28967 + }, + { + "epoch": 0.17228090208392807, + "grad_norm": 2.06371808052063, + "learning_rate": 4.642704913819533e-05, + "loss": 5.3105, + "step": 28968 + }, + { + "epoch": 0.17228684936720906, + "grad_norm": 2.0080556869506836, + "learning_rate": 4.642680849459865e-05, + "loss": 5.2019, + "step": 28969 + }, + { + "epoch": 0.17229279665049005, + "grad_norm": 1.4533225297927856, + "learning_rate": 4.642656784352209e-05, + "loss": 5.3035, + "step": 28970 + }, + { + "epoch": 0.17229874393377106, + "grad_norm": 1.8252445459365845, + "learning_rate": 4.642632718496573e-05, + "loss": 4.5186, + "step": 28971 + }, + { + "epoch": 0.17230469121705205, + "grad_norm": 2.125659465789795, + "learning_rate": 4.642608651892967e-05, + "loss": 4.5968, + "step": 28972 + }, + { + "epoch": 0.17231063850033304, + "grad_norm": 1.7049205303192139, + "learning_rate": 4.6425845845413984e-05, + "loss": 5.2613, + "step": 28973 + }, + { + "epoch": 0.17231658578361406, + "grad_norm": 1.818495512008667, + "learning_rate": 4.642560516441875e-05, + "loss": 4.5706, + "step": 28974 + }, + { + "epoch": 0.17232253306689505, + "grad_norm": 1.4389350414276123, + "learning_rate": 4.6425364475944065e-05, + "loss": 5.3398, + "step": 28975 + }, + { + "epoch": 0.17232848035017603, + "grad_norm": 1.3256508111953735, + "learning_rate": 4.6425123779990005e-05, + "loss": 5.0498, + "step": 28976 + }, + { + "epoch": 0.17233442763345705, + "grad_norm": 1.3190927505493164, + "learning_rate": 4.642488307655666e-05, + "loss": 5.1833, + "step": 28977 + }, + { + "epoch": 0.17234037491673804, + "grad_norm": 1.6174373626708984, + "learning_rate": 4.64246423656441e-05, + "loss": 4.7737, + "step": 28978 + }, + { + "epoch": 0.17234632220001903, + "grad_norm": 1.3956570625305176, + "learning_rate": 4.6424401647252425e-05, + "loss": 5.0439, + "step": 28979 + }, + { + "epoch": 0.17235226948330004, + "grad_norm": 1.3336056470870972, + "learning_rate": 4.642416092138171e-05, + "loss": 5.526, + "step": 28980 + }, + { + "epoch": 0.17235821676658103, + "grad_norm": 1.9870527982711792, + "learning_rate": 4.642392018803204e-05, + "loss": 4.6277, + "step": 28981 + }, + { + "epoch": 0.17236416404986202, + "grad_norm": 1.9504579305648804, + "learning_rate": 4.64236794472035e-05, + "loss": 4.6113, + "step": 28982 + }, + { + "epoch": 0.17237011133314303, + "grad_norm": 1.7667953968048096, + "learning_rate": 4.642343869889618e-05, + "loss": 5.0653, + "step": 28983 + }, + { + "epoch": 0.17237605861642402, + "grad_norm": 1.6792775392532349, + "learning_rate": 4.642319794311016e-05, + "loss": 5.1556, + "step": 28984 + }, + { + "epoch": 0.172382005899705, + "grad_norm": 1.7935463190078735, + "learning_rate": 4.642295717984551e-05, + "loss": 4.4604, + "step": 28985 + }, + { + "epoch": 0.17238795318298603, + "grad_norm": 1.8608596324920654, + "learning_rate": 4.642271640910235e-05, + "loss": 5.1865, + "step": 28986 + }, + { + "epoch": 0.172393900466267, + "grad_norm": 1.7945232391357422, + "learning_rate": 4.642247563088073e-05, + "loss": 4.8413, + "step": 28987 + }, + { + "epoch": 0.172399847749548, + "grad_norm": 1.6362812519073486, + "learning_rate": 4.6422234845180734e-05, + "loss": 5.4072, + "step": 28988 + }, + { + "epoch": 0.172405795032829, + "grad_norm": 1.7283893823623657, + "learning_rate": 4.642199405200247e-05, + "loss": 5.2463, + "step": 28989 + }, + { + "epoch": 0.17241174231611, + "grad_norm": 2.589603900909424, + "learning_rate": 4.6421753251346004e-05, + "loss": 4.0614, + "step": 28990 + }, + { + "epoch": 0.172417689599391, + "grad_norm": 1.785037875175476, + "learning_rate": 4.642151244321143e-05, + "loss": 4.7127, + "step": 28991 + }, + { + "epoch": 0.17242363688267198, + "grad_norm": 1.5093384981155396, + "learning_rate": 4.6421271627598826e-05, + "loss": 5.2746, + "step": 28992 + }, + { + "epoch": 0.172429584165953, + "grad_norm": 1.4697469472885132, + "learning_rate": 4.642103080450828e-05, + "loss": 5.2444, + "step": 28993 + }, + { + "epoch": 0.172435531449234, + "grad_norm": 1.5588436126708984, + "learning_rate": 4.642078997393986e-05, + "loss": 5.3832, + "step": 28994 + }, + { + "epoch": 0.17244147873251497, + "grad_norm": 1.4939788579940796, + "learning_rate": 4.642054913589368e-05, + "loss": 5.5868, + "step": 28995 + }, + { + "epoch": 0.172447426015796, + "grad_norm": 1.8973298072814941, + "learning_rate": 4.6420308290369795e-05, + "loss": 5.3981, + "step": 28996 + }, + { + "epoch": 0.17245337329907698, + "grad_norm": 1.7295379638671875, + "learning_rate": 4.642006743736831e-05, + "loss": 4.8308, + "step": 28997 + }, + { + "epoch": 0.17245932058235797, + "grad_norm": 1.519732117652893, + "learning_rate": 4.641982657688929e-05, + "loss": 5.423, + "step": 28998 + }, + { + "epoch": 0.17246526786563898, + "grad_norm": 1.6511726379394531, + "learning_rate": 4.641958570893284e-05, + "loss": 5.2029, + "step": 28999 + }, + { + "epoch": 0.17247121514891997, + "grad_norm": 1.5355091094970703, + "learning_rate": 4.641934483349903e-05, + "loss": 5.3556, + "step": 29000 + }, + { + "epoch": 0.17247716243220096, + "grad_norm": 1.562451720237732, + "learning_rate": 4.641910395058795e-05, + "loss": 5.3171, + "step": 29001 + }, + { + "epoch": 0.17248310971548197, + "grad_norm": 1.4412742853164673, + "learning_rate": 4.6418863060199684e-05, + "loss": 5.1771, + "step": 29002 + }, + { + "epoch": 0.17248905699876296, + "grad_norm": 1.5048646926879883, + "learning_rate": 4.6418622162334315e-05, + "loss": 5.3242, + "step": 29003 + }, + { + "epoch": 0.17249500428204395, + "grad_norm": 1.4204987287521362, + "learning_rate": 4.641838125699192e-05, + "loss": 5.3281, + "step": 29004 + }, + { + "epoch": 0.17250095156532497, + "grad_norm": 1.5606169700622559, + "learning_rate": 4.641814034417259e-05, + "loss": 5.0594, + "step": 29005 + }, + { + "epoch": 0.17250689884860596, + "grad_norm": 1.5690323114395142, + "learning_rate": 4.641789942387641e-05, + "loss": 5.2602, + "step": 29006 + }, + { + "epoch": 0.17251284613188694, + "grad_norm": 1.4904906749725342, + "learning_rate": 4.641765849610347e-05, + "loss": 5.2554, + "step": 29007 + }, + { + "epoch": 0.17251879341516796, + "grad_norm": 1.8319326639175415, + "learning_rate": 4.641741756085384e-05, + "loss": 4.5856, + "step": 29008 + }, + { + "epoch": 0.17252474069844895, + "grad_norm": 1.984311819076538, + "learning_rate": 4.6417176618127614e-05, + "loss": 5.2343, + "step": 29009 + }, + { + "epoch": 0.17253068798172994, + "grad_norm": 1.8066591024398804, + "learning_rate": 4.6416935667924864e-05, + "loss": 5.6382, + "step": 29010 + }, + { + "epoch": 0.17253663526501095, + "grad_norm": 1.3843746185302734, + "learning_rate": 4.641669471024569e-05, + "loss": 5.4115, + "step": 29011 + }, + { + "epoch": 0.17254258254829194, + "grad_norm": 1.6255708932876587, + "learning_rate": 4.6416453745090164e-05, + "loss": 5.1379, + "step": 29012 + }, + { + "epoch": 0.17254852983157293, + "grad_norm": 1.4723587036132812, + "learning_rate": 4.641621277245838e-05, + "loss": 5.1829, + "step": 29013 + }, + { + "epoch": 0.17255447711485394, + "grad_norm": 1.7830013036727905, + "learning_rate": 4.641597179235042e-05, + "loss": 4.8646, + "step": 29014 + }, + { + "epoch": 0.17256042439813493, + "grad_norm": 1.6139211654663086, + "learning_rate": 4.641573080476636e-05, + "loss": 5.3989, + "step": 29015 + }, + { + "epoch": 0.17256637168141592, + "grad_norm": 2.9187774658203125, + "learning_rate": 4.641548980970629e-05, + "loss": 3.3579, + "step": 29016 + }, + { + "epoch": 0.17257231896469694, + "grad_norm": 1.4265162944793701, + "learning_rate": 4.6415248807170296e-05, + "loss": 5.1783, + "step": 29017 + }, + { + "epoch": 0.17257826624797792, + "grad_norm": 1.3095968961715698, + "learning_rate": 4.641500779715846e-05, + "loss": 5.6357, + "step": 29018 + }, + { + "epoch": 0.1725842135312589, + "grad_norm": 1.3929443359375, + "learning_rate": 4.641476677967087e-05, + "loss": 5.3234, + "step": 29019 + }, + { + "epoch": 0.17259016081453993, + "grad_norm": 1.6466419696807861, + "learning_rate": 4.64145257547076e-05, + "loss": 5.5066, + "step": 29020 + }, + { + "epoch": 0.17259610809782092, + "grad_norm": 1.4895389080047607, + "learning_rate": 4.6414284722268745e-05, + "loss": 5.0983, + "step": 29021 + }, + { + "epoch": 0.1726020553811019, + "grad_norm": 1.6978981494903564, + "learning_rate": 4.641404368235438e-05, + "loss": 5.3724, + "step": 29022 + }, + { + "epoch": 0.17260800266438292, + "grad_norm": 1.7038211822509766, + "learning_rate": 4.641380263496459e-05, + "loss": 5.2525, + "step": 29023 + }, + { + "epoch": 0.1726139499476639, + "grad_norm": 1.4917408227920532, + "learning_rate": 4.641356158009947e-05, + "loss": 4.9793, + "step": 29024 + }, + { + "epoch": 0.1726198972309449, + "grad_norm": 1.6916602849960327, + "learning_rate": 4.6413320517759094e-05, + "loss": 5.0735, + "step": 29025 + }, + { + "epoch": 0.1726258445142259, + "grad_norm": 1.4852558374404907, + "learning_rate": 4.6413079447943556e-05, + "loss": 5.27, + "step": 29026 + }, + { + "epoch": 0.1726317917975069, + "grad_norm": 1.6030479669570923, + "learning_rate": 4.6412838370652925e-05, + "loss": 5.2712, + "step": 29027 + }, + { + "epoch": 0.1726377390807879, + "grad_norm": 1.5208861827850342, + "learning_rate": 4.6412597285887296e-05, + "loss": 5.4238, + "step": 29028 + }, + { + "epoch": 0.1726436863640689, + "grad_norm": 1.8001056909561157, + "learning_rate": 4.6412356193646744e-05, + "loss": 5.433, + "step": 29029 + }, + { + "epoch": 0.1726496336473499, + "grad_norm": 1.570449948310852, + "learning_rate": 4.641211509393136e-05, + "loss": 5.3843, + "step": 29030 + }, + { + "epoch": 0.17265558093063088, + "grad_norm": 1.4007776975631714, + "learning_rate": 4.641187398674124e-05, + "loss": 5.213, + "step": 29031 + }, + { + "epoch": 0.1726615282139119, + "grad_norm": 1.7244693040847778, + "learning_rate": 4.641163287207645e-05, + "loss": 4.342, + "step": 29032 + }, + { + "epoch": 0.17266747549719288, + "grad_norm": 1.752119779586792, + "learning_rate": 4.6411391749937076e-05, + "loss": 5.2256, + "step": 29033 + }, + { + "epoch": 0.17267342278047387, + "grad_norm": 1.7031835317611694, + "learning_rate": 4.6411150620323214e-05, + "loss": 5.3993, + "step": 29034 + }, + { + "epoch": 0.1726793700637549, + "grad_norm": 1.6741119623184204, + "learning_rate": 4.641090948323493e-05, + "loss": 5.3929, + "step": 29035 + }, + { + "epoch": 0.17268531734703588, + "grad_norm": 1.5801132917404175, + "learning_rate": 4.6410668338672326e-05, + "loss": 5.5049, + "step": 29036 + }, + { + "epoch": 0.17269126463031687, + "grad_norm": 1.6885874271392822, + "learning_rate": 4.641042718663548e-05, + "loss": 5.4284, + "step": 29037 + }, + { + "epoch": 0.17269721191359788, + "grad_norm": 2.0031561851501465, + "learning_rate": 4.6410186027124475e-05, + "loss": 5.064, + "step": 29038 + }, + { + "epoch": 0.17270315919687887, + "grad_norm": 1.9345756769180298, + "learning_rate": 4.640994486013939e-05, + "loss": 4.902, + "step": 29039 + }, + { + "epoch": 0.17270910648015986, + "grad_norm": 1.7898815870285034, + "learning_rate": 4.640970368568032e-05, + "loss": 4.576, + "step": 29040 + }, + { + "epoch": 0.17271505376344087, + "grad_norm": 1.7370834350585938, + "learning_rate": 4.640946250374734e-05, + "loss": 4.2676, + "step": 29041 + }, + { + "epoch": 0.17272100104672186, + "grad_norm": 1.3820379972457886, + "learning_rate": 4.640922131434054e-05, + "loss": 4.1509, + "step": 29042 + }, + { + "epoch": 0.17272694833000285, + "grad_norm": 1.507027506828308, + "learning_rate": 4.640898011746e-05, + "loss": 4.8934, + "step": 29043 + }, + { + "epoch": 0.17273289561328387, + "grad_norm": 1.7124078273773193, + "learning_rate": 4.640873891310581e-05, + "loss": 5.0756, + "step": 29044 + }, + { + "epoch": 0.17273884289656485, + "grad_norm": 1.5267462730407715, + "learning_rate": 4.6408497701278045e-05, + "loss": 5.2387, + "step": 29045 + }, + { + "epoch": 0.17274479017984584, + "grad_norm": 1.560703158378601, + "learning_rate": 4.64082564819768e-05, + "loss": 4.8667, + "step": 29046 + }, + { + "epoch": 0.17275073746312683, + "grad_norm": 1.5322329998016357, + "learning_rate": 4.6408015255202145e-05, + "loss": 5.013, + "step": 29047 + }, + { + "epoch": 0.17275668474640785, + "grad_norm": 1.675746202468872, + "learning_rate": 4.640777402095419e-05, + "loss": 4.8509, + "step": 29048 + }, + { + "epoch": 0.17276263202968883, + "grad_norm": 1.6513665914535522, + "learning_rate": 4.640753277923299e-05, + "loss": 4.9737, + "step": 29049 + }, + { + "epoch": 0.17276857931296982, + "grad_norm": 1.7950671911239624, + "learning_rate": 4.640729153003864e-05, + "loss": 4.3243, + "step": 29050 + }, + { + "epoch": 0.17277452659625084, + "grad_norm": 1.7763174772262573, + "learning_rate": 4.6407050273371225e-05, + "loss": 4.3468, + "step": 29051 + }, + { + "epoch": 0.17278047387953183, + "grad_norm": 1.7274105548858643, + "learning_rate": 4.640680900923083e-05, + "loss": 4.3678, + "step": 29052 + }, + { + "epoch": 0.17278642116281281, + "grad_norm": 1.8083571195602417, + "learning_rate": 4.640656773761755e-05, + "loss": 4.0583, + "step": 29053 + }, + { + "epoch": 0.17279236844609383, + "grad_norm": 1.5555697679519653, + "learning_rate": 4.640632645853145e-05, + "loss": 4.9759, + "step": 29054 + }, + { + "epoch": 0.17279831572937482, + "grad_norm": 1.5617389678955078, + "learning_rate": 4.640608517197263e-05, + "loss": 4.9137, + "step": 29055 + }, + { + "epoch": 0.1728042630126558, + "grad_norm": 1.549464225769043, + "learning_rate": 4.640584387794115e-05, + "loss": 5.158, + "step": 29056 + }, + { + "epoch": 0.17281021029593682, + "grad_norm": 1.7087653875350952, + "learning_rate": 4.6405602576437126e-05, + "loss": 5.136, + "step": 29057 + }, + { + "epoch": 0.1728161575792178, + "grad_norm": 1.5118201971054077, + "learning_rate": 4.640536126746062e-05, + "loss": 5.1956, + "step": 29058 + }, + { + "epoch": 0.1728221048624988, + "grad_norm": 1.6387808322906494, + "learning_rate": 4.640511995101173e-05, + "loss": 5.0441, + "step": 29059 + }, + { + "epoch": 0.17282805214577981, + "grad_norm": 1.652024745941162, + "learning_rate": 4.640487862709053e-05, + "loss": 4.9147, + "step": 29060 + }, + { + "epoch": 0.1728339994290608, + "grad_norm": 1.6259782314300537, + "learning_rate": 4.640463729569711e-05, + "loss": 4.2755, + "step": 29061 + }, + { + "epoch": 0.1728399467123418, + "grad_norm": 1.6286218166351318, + "learning_rate": 4.640439595683155e-05, + "loss": 4.6328, + "step": 29062 + }, + { + "epoch": 0.1728458939956228, + "grad_norm": 1.7396693229675293, + "learning_rate": 4.6404154610493934e-05, + "loss": 4.5711, + "step": 29063 + }, + { + "epoch": 0.1728518412789038, + "grad_norm": 1.4926822185516357, + "learning_rate": 4.640391325668435e-05, + "loss": 5.118, + "step": 29064 + }, + { + "epoch": 0.17285778856218478, + "grad_norm": 2.454763650894165, + "learning_rate": 4.6403671895402884e-05, + "loss": 4.817, + "step": 29065 + }, + { + "epoch": 0.1728637358454658, + "grad_norm": 1.6225837469100952, + "learning_rate": 4.640343052664962e-05, + "loss": 4.9953, + "step": 29066 + }, + { + "epoch": 0.1728696831287468, + "grad_norm": 1.8164595365524292, + "learning_rate": 4.640318915042463e-05, + "loss": 4.9384, + "step": 29067 + }, + { + "epoch": 0.17287563041202778, + "grad_norm": 1.4794782400131226, + "learning_rate": 4.640294776672801e-05, + "loss": 5.2635, + "step": 29068 + }, + { + "epoch": 0.1728815776953088, + "grad_norm": 1.6981302499771118, + "learning_rate": 4.640270637555985e-05, + "loss": 5.283, + "step": 29069 + }, + { + "epoch": 0.17288752497858978, + "grad_norm": 1.8669052124023438, + "learning_rate": 4.640246497692022e-05, + "loss": 4.303, + "step": 29070 + }, + { + "epoch": 0.17289347226187077, + "grad_norm": 1.8505442142486572, + "learning_rate": 4.640222357080921e-05, + "loss": 4.6573, + "step": 29071 + }, + { + "epoch": 0.17289941954515178, + "grad_norm": 1.6368263959884644, + "learning_rate": 4.640198215722691e-05, + "loss": 4.5301, + "step": 29072 + }, + { + "epoch": 0.17290536682843277, + "grad_norm": 1.665531039237976, + "learning_rate": 4.640174073617339e-05, + "loss": 5.2184, + "step": 29073 + }, + { + "epoch": 0.17291131411171376, + "grad_norm": 1.663392186164856, + "learning_rate": 4.640149930764875e-05, + "loss": 4.1373, + "step": 29074 + }, + { + "epoch": 0.17291726139499478, + "grad_norm": 1.8580307960510254, + "learning_rate": 4.640125787165307e-05, + "loss": 4.4035, + "step": 29075 + }, + { + "epoch": 0.17292320867827576, + "grad_norm": 1.5936819314956665, + "learning_rate": 4.640101642818643e-05, + "loss": 5.145, + "step": 29076 + }, + { + "epoch": 0.17292915596155675, + "grad_norm": 1.7124170064926147, + "learning_rate": 4.6400774977248915e-05, + "loss": 4.1569, + "step": 29077 + }, + { + "epoch": 0.17293510324483777, + "grad_norm": 2.51955509185791, + "learning_rate": 4.6400533518840614e-05, + "loss": 3.8795, + "step": 29078 + }, + { + "epoch": 0.17294105052811876, + "grad_norm": 1.6238064765930176, + "learning_rate": 4.6400292052961604e-05, + "loss": 5.0575, + "step": 29079 + }, + { + "epoch": 0.17294699781139974, + "grad_norm": 1.7471083402633667, + "learning_rate": 4.6400050579611974e-05, + "loss": 4.1607, + "step": 29080 + }, + { + "epoch": 0.17295294509468076, + "grad_norm": 1.7179365158081055, + "learning_rate": 4.639980909879181e-05, + "loss": 4.2253, + "step": 29081 + }, + { + "epoch": 0.17295889237796175, + "grad_norm": 1.6772149801254272, + "learning_rate": 4.639956761050119e-05, + "loss": 4.0833, + "step": 29082 + }, + { + "epoch": 0.17296483966124274, + "grad_norm": 1.6395635604858398, + "learning_rate": 4.639932611474021e-05, + "loss": 4.3961, + "step": 29083 + }, + { + "epoch": 0.17297078694452375, + "grad_norm": 1.5897985696792603, + "learning_rate": 4.6399084611508935e-05, + "loss": 4.5272, + "step": 29084 + }, + { + "epoch": 0.17297673422780474, + "grad_norm": 1.5276799201965332, + "learning_rate": 4.639884310080746e-05, + "loss": 5.037, + "step": 29085 + }, + { + "epoch": 0.17298268151108573, + "grad_norm": 1.5612523555755615, + "learning_rate": 4.639860158263588e-05, + "loss": 5.2272, + "step": 29086 + }, + { + "epoch": 0.17298862879436674, + "grad_norm": 1.7078372240066528, + "learning_rate": 4.639836005699426e-05, + "loss": 4.2294, + "step": 29087 + }, + { + "epoch": 0.17299457607764773, + "grad_norm": 1.643798828125, + "learning_rate": 4.63981185238827e-05, + "loss": 4.1974, + "step": 29088 + }, + { + "epoch": 0.17300052336092872, + "grad_norm": 1.7256457805633545, + "learning_rate": 4.639787698330128e-05, + "loss": 4.3683, + "step": 29089 + }, + { + "epoch": 0.17300647064420974, + "grad_norm": 1.9199156761169434, + "learning_rate": 4.6397635435250076e-05, + "loss": 4.3005, + "step": 29090 + }, + { + "epoch": 0.17301241792749072, + "grad_norm": 1.927114486694336, + "learning_rate": 4.6397393879729176e-05, + "loss": 3.53, + "step": 29091 + }, + { + "epoch": 0.1730183652107717, + "grad_norm": 1.5402168035507202, + "learning_rate": 4.639715231673868e-05, + "loss": 5.048, + "step": 29092 + }, + { + "epoch": 0.17302431249405273, + "grad_norm": 1.4014962911605835, + "learning_rate": 4.6396910746278646e-05, + "loss": 4.9029, + "step": 29093 + }, + { + "epoch": 0.17303025977733372, + "grad_norm": 1.3504273891448975, + "learning_rate": 4.639666916834918e-05, + "loss": 4.9728, + "step": 29094 + }, + { + "epoch": 0.1730362070606147, + "grad_norm": 1.4277746677398682, + "learning_rate": 4.639642758295035e-05, + "loss": 4.9853, + "step": 29095 + }, + { + "epoch": 0.17304215434389572, + "grad_norm": 1.664764165878296, + "learning_rate": 4.639618599008225e-05, + "loss": 4.9195, + "step": 29096 + }, + { + "epoch": 0.1730481016271767, + "grad_norm": 1.7788653373718262, + "learning_rate": 4.639594438974497e-05, + "loss": 4.6073, + "step": 29097 + }, + { + "epoch": 0.1730540489104577, + "grad_norm": 1.543224573135376, + "learning_rate": 4.639570278193858e-05, + "loss": 4.5988, + "step": 29098 + }, + { + "epoch": 0.1730599961937387, + "grad_norm": 1.8790651559829712, + "learning_rate": 4.639546116666317e-05, + "loss": 4.3982, + "step": 29099 + }, + { + "epoch": 0.1730659434770197, + "grad_norm": 1.6308414936065674, + "learning_rate": 4.639521954391883e-05, + "loss": 4.8477, + "step": 29100 + }, + { + "epoch": 0.1730718907603007, + "grad_norm": 1.7135157585144043, + "learning_rate": 4.639497791370564e-05, + "loss": 5.0111, + "step": 29101 + }, + { + "epoch": 0.1730778380435817, + "grad_norm": 1.9777605533599854, + "learning_rate": 4.639473627602369e-05, + "loss": 5.2615, + "step": 29102 + }, + { + "epoch": 0.1730837853268627, + "grad_norm": 1.8689080476760864, + "learning_rate": 4.639449463087304e-05, + "loss": 5.4032, + "step": 29103 + }, + { + "epoch": 0.17308973261014368, + "grad_norm": 1.8719011545181274, + "learning_rate": 4.6394252978253814e-05, + "loss": 4.7377, + "step": 29104 + }, + { + "epoch": 0.17309567989342467, + "grad_norm": 2.0242390632629395, + "learning_rate": 4.6394011318166066e-05, + "loss": 4.3017, + "step": 29105 + }, + { + "epoch": 0.17310162717670569, + "grad_norm": 1.6117249727249146, + "learning_rate": 4.639376965060989e-05, + "loss": 4.5215, + "step": 29106 + }, + { + "epoch": 0.17310757445998667, + "grad_norm": 1.9272388219833374, + "learning_rate": 4.639352797558536e-05, + "loss": 4.4802, + "step": 29107 + }, + { + "epoch": 0.17311352174326766, + "grad_norm": 1.7987074851989746, + "learning_rate": 4.639328629309259e-05, + "loss": 4.4009, + "step": 29108 + }, + { + "epoch": 0.17311946902654868, + "grad_norm": 1.8932039737701416, + "learning_rate": 4.639304460313163e-05, + "loss": 4.3668, + "step": 29109 + }, + { + "epoch": 0.17312541630982967, + "grad_norm": 2.2508416175842285, + "learning_rate": 4.639280290570258e-05, + "loss": 4.9557, + "step": 29110 + }, + { + "epoch": 0.17313136359311065, + "grad_norm": 2.086383104324341, + "learning_rate": 4.639256120080553e-05, + "loss": 5.0933, + "step": 29111 + }, + { + "epoch": 0.17313731087639167, + "grad_norm": 1.7917490005493164, + "learning_rate": 4.639231948844056e-05, + "loss": 5.2057, + "step": 29112 + }, + { + "epoch": 0.17314325815967266, + "grad_norm": 1.8576172590255737, + "learning_rate": 4.639207776860774e-05, + "loss": 4.4434, + "step": 29113 + }, + { + "epoch": 0.17314920544295365, + "grad_norm": 1.746186375617981, + "learning_rate": 4.639183604130717e-05, + "loss": 4.2003, + "step": 29114 + }, + { + "epoch": 0.17315515272623466, + "grad_norm": 2.03523588180542, + "learning_rate": 4.639159430653894e-05, + "loss": 4.2907, + "step": 29115 + }, + { + "epoch": 0.17316110000951565, + "grad_norm": 2.0713725090026855, + "learning_rate": 4.639135256430312e-05, + "loss": 4.3741, + "step": 29116 + }, + { + "epoch": 0.17316704729279664, + "grad_norm": 2.745671510696411, + "learning_rate": 4.63911108145998e-05, + "loss": 4.6313, + "step": 29117 + }, + { + "epoch": 0.17317299457607765, + "grad_norm": 1.9662394523620605, + "learning_rate": 4.639086905742906e-05, + "loss": 4.2027, + "step": 29118 + }, + { + "epoch": 0.17317894185935864, + "grad_norm": 1.7448909282684326, + "learning_rate": 4.6390627292791e-05, + "loss": 4.9481, + "step": 29119 + }, + { + "epoch": 0.17318488914263963, + "grad_norm": 1.684590458869934, + "learning_rate": 4.639038552068569e-05, + "loss": 4.8794, + "step": 29120 + }, + { + "epoch": 0.17319083642592065, + "grad_norm": 1.8462331295013428, + "learning_rate": 4.639014374111321e-05, + "loss": 3.9728, + "step": 29121 + }, + { + "epoch": 0.17319678370920163, + "grad_norm": 1.9657787084579468, + "learning_rate": 4.638990195407366e-05, + "loss": 4.0798, + "step": 29122 + }, + { + "epoch": 0.17320273099248262, + "grad_norm": 1.7591108083724976, + "learning_rate": 4.638966015956711e-05, + "loss": 3.9714, + "step": 29123 + }, + { + "epoch": 0.17320867827576364, + "grad_norm": 1.6764097213745117, + "learning_rate": 4.638941835759365e-05, + "loss": 4.7804, + "step": 29124 + }, + { + "epoch": 0.17321462555904463, + "grad_norm": 1.7766660451889038, + "learning_rate": 4.638917654815336e-05, + "loss": 4.8408, + "step": 29125 + }, + { + "epoch": 0.17322057284232562, + "grad_norm": 1.7548637390136719, + "learning_rate": 4.638893473124634e-05, + "loss": 4.9905, + "step": 29126 + }, + { + "epoch": 0.17322652012560663, + "grad_norm": 1.933996319770813, + "learning_rate": 4.6388692906872664e-05, + "loss": 4.757, + "step": 29127 + }, + { + "epoch": 0.17323246740888762, + "grad_norm": 1.6957604885101318, + "learning_rate": 4.638845107503241e-05, + "loss": 5.1555, + "step": 29128 + }, + { + "epoch": 0.1732384146921686, + "grad_norm": 1.7500252723693848, + "learning_rate": 4.638820923572567e-05, + "loss": 4.9637, + "step": 29129 + }, + { + "epoch": 0.17324436197544962, + "grad_norm": 1.8749233484268188, + "learning_rate": 4.638796738895253e-05, + "loss": 4.0375, + "step": 29130 + }, + { + "epoch": 0.1732503092587306, + "grad_norm": 2.124462842941284, + "learning_rate": 4.6387725534713066e-05, + "loss": 4.6226, + "step": 29131 + }, + { + "epoch": 0.1732562565420116, + "grad_norm": 1.877875804901123, + "learning_rate": 4.6387483673007375e-05, + "loss": 4.572, + "step": 29132 + }, + { + "epoch": 0.17326220382529262, + "grad_norm": 1.7845820188522339, + "learning_rate": 4.6387241803835535e-05, + "loss": 4.4978, + "step": 29133 + }, + { + "epoch": 0.1732681511085736, + "grad_norm": 1.5177055597305298, + "learning_rate": 4.638699992719762e-05, + "loss": 4.6488, + "step": 29134 + }, + { + "epoch": 0.1732740983918546, + "grad_norm": 1.6078678369522095, + "learning_rate": 4.6386758043093736e-05, + "loss": 4.5668, + "step": 29135 + }, + { + "epoch": 0.1732800456751356, + "grad_norm": 1.640980839729309, + "learning_rate": 4.638651615152395e-05, + "loss": 4.8367, + "step": 29136 + }, + { + "epoch": 0.1732859929584166, + "grad_norm": 1.4911829233169556, + "learning_rate": 4.638627425248835e-05, + "loss": 4.6056, + "step": 29137 + }, + { + "epoch": 0.17329194024169758, + "grad_norm": 1.7402757406234741, + "learning_rate": 4.6386032345987026e-05, + "loss": 4.6695, + "step": 29138 + }, + { + "epoch": 0.1732978875249786, + "grad_norm": 1.7571971416473389, + "learning_rate": 4.638579043202006e-05, + "loss": 4.3587, + "step": 29139 + }, + { + "epoch": 0.1733038348082596, + "grad_norm": 1.9201890230178833, + "learning_rate": 4.6385548510587527e-05, + "loss": 4.6875, + "step": 29140 + }, + { + "epoch": 0.17330978209154058, + "grad_norm": 1.61739182472229, + "learning_rate": 4.638530658168954e-05, + "loss": 4.354, + "step": 29141 + }, + { + "epoch": 0.1733157293748216, + "grad_norm": 1.530254602432251, + "learning_rate": 4.6385064645326144e-05, + "loss": 5.1195, + "step": 29142 + }, + { + "epoch": 0.17332167665810258, + "grad_norm": 1.604181170463562, + "learning_rate": 4.638482270149745e-05, + "loss": 4.5733, + "step": 29143 + }, + { + "epoch": 0.17332762394138357, + "grad_norm": 1.5250577926635742, + "learning_rate": 4.638458075020353e-05, + "loss": 5.0787, + "step": 29144 + }, + { + "epoch": 0.17333357122466458, + "grad_norm": 1.539345383644104, + "learning_rate": 4.638433879144448e-05, + "loss": 4.5644, + "step": 29145 + }, + { + "epoch": 0.17333951850794557, + "grad_norm": 1.4076765775680542, + "learning_rate": 4.6384096825220376e-05, + "loss": 4.8226, + "step": 29146 + }, + { + "epoch": 0.17334546579122656, + "grad_norm": 1.5576672554016113, + "learning_rate": 4.6383854851531304e-05, + "loss": 4.8671, + "step": 29147 + }, + { + "epoch": 0.17335141307450758, + "grad_norm": 1.4902443885803223, + "learning_rate": 4.638361287037735e-05, + "loss": 5.0003, + "step": 29148 + }, + { + "epoch": 0.17335736035778856, + "grad_norm": 1.3985077142715454, + "learning_rate": 4.63833708817586e-05, + "loss": 5.297, + "step": 29149 + }, + { + "epoch": 0.17336330764106955, + "grad_norm": 1.798403263092041, + "learning_rate": 4.638312888567513e-05, + "loss": 4.8625, + "step": 29150 + }, + { + "epoch": 0.17336925492435057, + "grad_norm": 1.5843340158462524, + "learning_rate": 4.638288688212704e-05, + "loss": 4.7577, + "step": 29151 + }, + { + "epoch": 0.17337520220763156, + "grad_norm": 1.5716784000396729, + "learning_rate": 4.63826448711144e-05, + "loss": 5.1091, + "step": 29152 + }, + { + "epoch": 0.17338114949091255, + "grad_norm": 1.7493597269058228, + "learning_rate": 4.6382402852637294e-05, + "loss": 4.8566, + "step": 29153 + }, + { + "epoch": 0.17338709677419356, + "grad_norm": 1.6974579095840454, + "learning_rate": 4.638216082669582e-05, + "loss": 4.8687, + "step": 29154 + }, + { + "epoch": 0.17339304405747455, + "grad_norm": 1.6314281225204468, + "learning_rate": 4.6381918793290055e-05, + "loss": 4.8077, + "step": 29155 + }, + { + "epoch": 0.17339899134075554, + "grad_norm": 1.6575573682785034, + "learning_rate": 4.6381676752420076e-05, + "loss": 4.9225, + "step": 29156 + }, + { + "epoch": 0.17340493862403655, + "grad_norm": 1.4562337398529053, + "learning_rate": 4.638143470408598e-05, + "loss": 5.056, + "step": 29157 + }, + { + "epoch": 0.17341088590731754, + "grad_norm": 1.6989314556121826, + "learning_rate": 4.638119264828784e-05, + "loss": 5.0006, + "step": 29158 + }, + { + "epoch": 0.17341683319059853, + "grad_norm": 1.6114591360092163, + "learning_rate": 4.638095058502575e-05, + "loss": 4.7174, + "step": 29159 + }, + { + "epoch": 0.17342278047387955, + "grad_norm": 1.8833446502685547, + "learning_rate": 4.6380708514299794e-05, + "loss": 4.6826, + "step": 29160 + }, + { + "epoch": 0.17342872775716053, + "grad_norm": 1.8556679487228394, + "learning_rate": 4.638046643611006e-05, + "loss": 4.6246, + "step": 29161 + }, + { + "epoch": 0.17343467504044152, + "grad_norm": 1.8661102056503296, + "learning_rate": 4.6380224350456615e-05, + "loss": 4.4789, + "step": 29162 + }, + { + "epoch": 0.1734406223237225, + "grad_norm": 1.7095074653625488, + "learning_rate": 4.637998225733956e-05, + "loss": 4.923, + "step": 29163 + }, + { + "epoch": 0.17344656960700353, + "grad_norm": 1.34967041015625, + "learning_rate": 4.6379740156758966e-05, + "loss": 4.797, + "step": 29164 + }, + { + "epoch": 0.1734525168902845, + "grad_norm": 1.7319891452789307, + "learning_rate": 4.637949804871493e-05, + "loss": 4.5764, + "step": 29165 + }, + { + "epoch": 0.1734584641735655, + "grad_norm": 1.7644058465957642, + "learning_rate": 4.637925593320754e-05, + "loss": 4.5592, + "step": 29166 + }, + { + "epoch": 0.17346441145684652, + "grad_norm": 1.773938775062561, + "learning_rate": 4.637901381023686e-05, + "loss": 5.0608, + "step": 29167 + }, + { + "epoch": 0.1734703587401275, + "grad_norm": 1.7514781951904297, + "learning_rate": 4.637877167980299e-05, + "loss": 4.6467, + "step": 29168 + }, + { + "epoch": 0.1734763060234085, + "grad_norm": 1.6960844993591309, + "learning_rate": 4.637852954190602e-05, + "loss": 4.4893, + "step": 29169 + }, + { + "epoch": 0.1734822533066895, + "grad_norm": 1.687488317489624, + "learning_rate": 4.6378287396546024e-05, + "loss": 4.5032, + "step": 29170 + }, + { + "epoch": 0.1734882005899705, + "grad_norm": 1.71660315990448, + "learning_rate": 4.6378045243723084e-05, + "loss": 4.9538, + "step": 29171 + }, + { + "epoch": 0.1734941478732515, + "grad_norm": 1.8937394618988037, + "learning_rate": 4.637780308343729e-05, + "loss": 4.6157, + "step": 29172 + }, + { + "epoch": 0.1735000951565325, + "grad_norm": 1.8577438592910767, + "learning_rate": 4.637756091568873e-05, + "loss": 4.5289, + "step": 29173 + }, + { + "epoch": 0.1735060424398135, + "grad_norm": 1.6964426040649414, + "learning_rate": 4.637731874047748e-05, + "loss": 4.6735, + "step": 29174 + }, + { + "epoch": 0.17351198972309448, + "grad_norm": 1.708333134651184, + "learning_rate": 4.637707655780363e-05, + "loss": 4.7042, + "step": 29175 + }, + { + "epoch": 0.1735179370063755, + "grad_norm": 2.0699708461761475, + "learning_rate": 4.637683436766726e-05, + "loss": 4.259, + "step": 29176 + }, + { + "epoch": 0.17352388428965648, + "grad_norm": 1.9782260656356812, + "learning_rate": 4.637659217006846e-05, + "loss": 4.2724, + "step": 29177 + }, + { + "epoch": 0.17352983157293747, + "grad_norm": 1.8892062902450562, + "learning_rate": 4.6376349965007316e-05, + "loss": 4.0619, + "step": 29178 + }, + { + "epoch": 0.1735357788562185, + "grad_norm": 3.4207348823547363, + "learning_rate": 4.637610775248391e-05, + "loss": 4.0752, + "step": 29179 + }, + { + "epoch": 0.17354172613949947, + "grad_norm": 2.4128661155700684, + "learning_rate": 4.6375865532498316e-05, + "loss": 3.7859, + "step": 29180 + }, + { + "epoch": 0.17354767342278046, + "grad_norm": 1.7334697246551514, + "learning_rate": 4.6375623305050635e-05, + "loss": 4.586, + "step": 29181 + }, + { + "epoch": 0.17355362070606148, + "grad_norm": 2.0362465381622314, + "learning_rate": 4.6375381070140946e-05, + "loss": 4.2091, + "step": 29182 + }, + { + "epoch": 0.17355956798934247, + "grad_norm": 1.7851359844207764, + "learning_rate": 4.637513882776933e-05, + "loss": 4.1567, + "step": 29183 + }, + { + "epoch": 0.17356551527262346, + "grad_norm": 1.9078037738800049, + "learning_rate": 4.637489657793588e-05, + "loss": 4.0716, + "step": 29184 + }, + { + "epoch": 0.17357146255590447, + "grad_norm": 1.7366207838058472, + "learning_rate": 4.6374654320640666e-05, + "loss": 4.3262, + "step": 29185 + }, + { + "epoch": 0.17357740983918546, + "grad_norm": 1.8948423862457275, + "learning_rate": 4.6374412055883785e-05, + "loss": 4.1564, + "step": 29186 + }, + { + "epoch": 0.17358335712246645, + "grad_norm": 1.9613217115402222, + "learning_rate": 4.637416978366532e-05, + "loss": 4.1586, + "step": 29187 + }, + { + "epoch": 0.17358930440574746, + "grad_norm": 2.4783365726470947, + "learning_rate": 4.637392750398535e-05, + "loss": 3.6734, + "step": 29188 + }, + { + "epoch": 0.17359525168902845, + "grad_norm": 2.1660149097442627, + "learning_rate": 4.637368521684396e-05, + "loss": 3.7469, + "step": 29189 + }, + { + "epoch": 0.17360119897230944, + "grad_norm": 2.462066650390625, + "learning_rate": 4.637344292224124e-05, + "loss": 3.6566, + "step": 29190 + }, + { + "epoch": 0.17360714625559046, + "grad_norm": 1.8963021039962769, + "learning_rate": 4.637320062017727e-05, + "loss": 4.0244, + "step": 29191 + }, + { + "epoch": 0.17361309353887144, + "grad_norm": 1.9739018678665161, + "learning_rate": 4.6372958310652135e-05, + "loss": 4.1696, + "step": 29192 + }, + { + "epoch": 0.17361904082215243, + "grad_norm": 1.9879587888717651, + "learning_rate": 4.637271599366593e-05, + "loss": 4.7111, + "step": 29193 + }, + { + "epoch": 0.17362498810543345, + "grad_norm": 1.8292521238327026, + "learning_rate": 4.637247366921872e-05, + "loss": 4.6283, + "step": 29194 + }, + { + "epoch": 0.17363093538871444, + "grad_norm": 1.5309460163116455, + "learning_rate": 4.6372231337310605e-05, + "loss": 4.6252, + "step": 29195 + }, + { + "epoch": 0.17363688267199542, + "grad_norm": 1.8792744874954224, + "learning_rate": 4.637198899794167e-05, + "loss": 4.2226, + "step": 29196 + }, + { + "epoch": 0.17364282995527644, + "grad_norm": 2.1824088096618652, + "learning_rate": 4.6371746651111985e-05, + "loss": 4.2028, + "step": 29197 + }, + { + "epoch": 0.17364877723855743, + "grad_norm": 2.0413753986358643, + "learning_rate": 4.637150429682165e-05, + "loss": 4.1982, + "step": 29198 + }, + { + "epoch": 0.17365472452183842, + "grad_norm": 1.6897474527359009, + "learning_rate": 4.637126193507074e-05, + "loss": 4.5085, + "step": 29199 + }, + { + "epoch": 0.17366067180511943, + "grad_norm": 1.6577891111373901, + "learning_rate": 4.637101956585935e-05, + "loss": 4.6212, + "step": 29200 + }, + { + "epoch": 0.17366661908840042, + "grad_norm": 1.6855782270431519, + "learning_rate": 4.637077718918755e-05, + "loss": 4.7156, + "step": 29201 + }, + { + "epoch": 0.1736725663716814, + "grad_norm": 2.017664909362793, + "learning_rate": 4.637053480505543e-05, + "loss": 4.5439, + "step": 29202 + }, + { + "epoch": 0.17367851365496242, + "grad_norm": 1.7421058416366577, + "learning_rate": 4.637029241346309e-05, + "loss": 4.3292, + "step": 29203 + }, + { + "epoch": 0.1736844609382434, + "grad_norm": 1.6741775274276733, + "learning_rate": 4.6370050014410594e-05, + "loss": 4.3136, + "step": 29204 + }, + { + "epoch": 0.1736904082215244, + "grad_norm": 1.9777534008026123, + "learning_rate": 4.636980760789803e-05, + "loss": 4.1499, + "step": 29205 + }, + { + "epoch": 0.17369635550480542, + "grad_norm": 2.133716583251953, + "learning_rate": 4.6369565193925505e-05, + "loss": 4.2251, + "step": 29206 + }, + { + "epoch": 0.1737023027880864, + "grad_norm": 2.047595739364624, + "learning_rate": 4.636932277249306e-05, + "loss": 4.0876, + "step": 29207 + }, + { + "epoch": 0.1737082500713674, + "grad_norm": 1.9693220853805542, + "learning_rate": 4.636908034360082e-05, + "loss": 3.8007, + "step": 29208 + }, + { + "epoch": 0.1737141973546484, + "grad_norm": 1.7148840427398682, + "learning_rate": 4.6368837907248855e-05, + "loss": 4.3048, + "step": 29209 + }, + { + "epoch": 0.1737201446379294, + "grad_norm": 1.4605804681777954, + "learning_rate": 4.6368595463437246e-05, + "loss": 4.402, + "step": 29210 + }, + { + "epoch": 0.17372609192121038, + "grad_norm": 1.8033897876739502, + "learning_rate": 4.636835301216608e-05, + "loss": 4.491, + "step": 29211 + }, + { + "epoch": 0.1737320392044914, + "grad_norm": 1.6581388711929321, + "learning_rate": 4.636811055343545e-05, + "loss": 4.5847, + "step": 29212 + }, + { + "epoch": 0.1737379864877724, + "grad_norm": 1.7046984434127808, + "learning_rate": 4.636786808724542e-05, + "loss": 4.7485, + "step": 29213 + }, + { + "epoch": 0.17374393377105338, + "grad_norm": 1.735479474067688, + "learning_rate": 4.6367625613596096e-05, + "loss": 4.8771, + "step": 29214 + }, + { + "epoch": 0.1737498810543344, + "grad_norm": 1.781473994255066, + "learning_rate": 4.636738313248756e-05, + "loss": 4.4308, + "step": 29215 + }, + { + "epoch": 0.17375582833761538, + "grad_norm": 1.945377230644226, + "learning_rate": 4.636714064391988e-05, + "loss": 3.9839, + "step": 29216 + }, + { + "epoch": 0.17376177562089637, + "grad_norm": 1.9880878925323486, + "learning_rate": 4.6366898147893165e-05, + "loss": 4.1544, + "step": 29217 + }, + { + "epoch": 0.17376772290417739, + "grad_norm": 1.9976726770401, + "learning_rate": 4.6366655644407475e-05, + "loss": 4.4061, + "step": 29218 + }, + { + "epoch": 0.17377367018745837, + "grad_norm": 2.0192174911499023, + "learning_rate": 4.6366413133462915e-05, + "loss": 4.3094, + "step": 29219 + }, + { + "epoch": 0.17377961747073936, + "grad_norm": 1.9302101135253906, + "learning_rate": 4.636617061505956e-05, + "loss": 4.7673, + "step": 29220 + }, + { + "epoch": 0.17378556475402035, + "grad_norm": 1.6863242387771606, + "learning_rate": 4.636592808919749e-05, + "loss": 4.7641, + "step": 29221 + }, + { + "epoch": 0.17379151203730137, + "grad_norm": 1.8345664739608765, + "learning_rate": 4.63656855558768e-05, + "loss": 4.6849, + "step": 29222 + }, + { + "epoch": 0.17379745932058235, + "grad_norm": 1.5179288387298584, + "learning_rate": 4.636544301509756e-05, + "loss": 4.7481, + "step": 29223 + }, + { + "epoch": 0.17380340660386334, + "grad_norm": 1.82593834400177, + "learning_rate": 4.6365200466859876e-05, + "loss": 4.7234, + "step": 29224 + }, + { + "epoch": 0.17380935388714436, + "grad_norm": 1.7959182262420654, + "learning_rate": 4.636495791116382e-05, + "loss": 5.0005, + "step": 29225 + }, + { + "epoch": 0.17381530117042535, + "grad_norm": 2.36141037940979, + "learning_rate": 4.636471534800947e-05, + "loss": 4.1279, + "step": 29226 + }, + { + "epoch": 0.17382124845370633, + "grad_norm": 1.8446800708770752, + "learning_rate": 4.636447277739693e-05, + "loss": 4.3379, + "step": 29227 + }, + { + "epoch": 0.17382719573698735, + "grad_norm": 1.9190828800201416, + "learning_rate": 4.636423019932626e-05, + "loss": 4.3296, + "step": 29228 + }, + { + "epoch": 0.17383314302026834, + "grad_norm": 1.863991379737854, + "learning_rate": 4.636398761379756e-05, + "loss": 4.3733, + "step": 29229 + }, + { + "epoch": 0.17383909030354933, + "grad_norm": 1.7630629539489746, + "learning_rate": 4.636374502081092e-05, + "loss": 4.3829, + "step": 29230 + }, + { + "epoch": 0.17384503758683034, + "grad_norm": 1.554083228111267, + "learning_rate": 4.636350242036642e-05, + "loss": 4.6883, + "step": 29231 + }, + { + "epoch": 0.17385098487011133, + "grad_norm": 1.6765477657318115, + "learning_rate": 4.6363259812464135e-05, + "loss": 4.5129, + "step": 29232 + }, + { + "epoch": 0.17385693215339232, + "grad_norm": 1.6007416248321533, + "learning_rate": 4.636301719710416e-05, + "loss": 4.561, + "step": 29233 + }, + { + "epoch": 0.17386287943667333, + "grad_norm": 1.6795105934143066, + "learning_rate": 4.6362774574286575e-05, + "loss": 4.6389, + "step": 29234 + }, + { + "epoch": 0.17386882671995432, + "grad_norm": 1.6491032838821411, + "learning_rate": 4.6362531944011464e-05, + "loss": 4.3857, + "step": 29235 + }, + { + "epoch": 0.1738747740032353, + "grad_norm": 2.123032569885254, + "learning_rate": 4.636228930627892e-05, + "loss": 3.8423, + "step": 29236 + }, + { + "epoch": 0.17388072128651633, + "grad_norm": 2.0041513442993164, + "learning_rate": 4.636204666108902e-05, + "loss": 3.1621, + "step": 29237 + }, + { + "epoch": 0.17388666856979731, + "grad_norm": 1.6654435396194458, + "learning_rate": 4.636180400844185e-05, + "loss": 4.3272, + "step": 29238 + }, + { + "epoch": 0.1738926158530783, + "grad_norm": 1.553393006324768, + "learning_rate": 4.636156134833749e-05, + "loss": 4.9542, + "step": 29239 + }, + { + "epoch": 0.17389856313635932, + "grad_norm": 1.6511328220367432, + "learning_rate": 4.6361318680776035e-05, + "loss": 5.0055, + "step": 29240 + }, + { + "epoch": 0.1739045104196403, + "grad_norm": 1.8133567571640015, + "learning_rate": 4.6361076005757554e-05, + "loss": 4.4575, + "step": 29241 + }, + { + "epoch": 0.1739104577029213, + "grad_norm": 2.6649341583251953, + "learning_rate": 4.636083332328215e-05, + "loss": 4.1054, + "step": 29242 + }, + { + "epoch": 0.1739164049862023, + "grad_norm": 2.676636219024658, + "learning_rate": 4.63605906333499e-05, + "loss": 3.5847, + "step": 29243 + }, + { + "epoch": 0.1739223522694833, + "grad_norm": 2.376490592956543, + "learning_rate": 4.636034793596089e-05, + "loss": 3.9051, + "step": 29244 + }, + { + "epoch": 0.1739282995527643, + "grad_norm": 1.6567094326019287, + "learning_rate": 4.63601052311152e-05, + "loss": 5.1711, + "step": 29245 + }, + { + "epoch": 0.1739342468360453, + "grad_norm": 1.981115698814392, + "learning_rate": 4.6359862518812924e-05, + "loss": 3.8426, + "step": 29246 + }, + { + "epoch": 0.1739401941193263, + "grad_norm": 1.640690565109253, + "learning_rate": 4.6359619799054136e-05, + "loss": 4.3196, + "step": 29247 + }, + { + "epoch": 0.17394614140260728, + "grad_norm": 1.6027098894119263, + "learning_rate": 4.635937707183892e-05, + "loss": 5.2091, + "step": 29248 + }, + { + "epoch": 0.1739520886858883, + "grad_norm": 1.732526183128357, + "learning_rate": 4.6359134337167375e-05, + "loss": 5.0799, + "step": 29249 + }, + { + "epoch": 0.17395803596916928, + "grad_norm": 1.7720987796783447, + "learning_rate": 4.635889159503957e-05, + "loss": 4.9359, + "step": 29250 + }, + { + "epoch": 0.17396398325245027, + "grad_norm": 1.60392427444458, + "learning_rate": 4.63586488454556e-05, + "loss": 4.8213, + "step": 29251 + }, + { + "epoch": 0.1739699305357313, + "grad_norm": 1.4416741132736206, + "learning_rate": 4.635840608841555e-05, + "loss": 5.1283, + "step": 29252 + }, + { + "epoch": 0.17397587781901228, + "grad_norm": 1.9322450160980225, + "learning_rate": 4.63581633239195e-05, + "loss": 4.4477, + "step": 29253 + }, + { + "epoch": 0.17398182510229326, + "grad_norm": 1.661475658416748, + "learning_rate": 4.635792055196753e-05, + "loss": 4.6993, + "step": 29254 + }, + { + "epoch": 0.17398777238557428, + "grad_norm": 1.7771600484848022, + "learning_rate": 4.635767777255973e-05, + "loss": 4.4883, + "step": 29255 + }, + { + "epoch": 0.17399371966885527, + "grad_norm": 1.8131498098373413, + "learning_rate": 4.635743498569619e-05, + "loss": 3.9214, + "step": 29256 + }, + { + "epoch": 0.17399966695213626, + "grad_norm": 1.6624927520751953, + "learning_rate": 4.635719219137699e-05, + "loss": 4.9492, + "step": 29257 + }, + { + "epoch": 0.17400561423541727, + "grad_norm": 2.7123286724090576, + "learning_rate": 4.6356949389602214e-05, + "loss": 4.7048, + "step": 29258 + }, + { + "epoch": 0.17401156151869826, + "grad_norm": 2.078057050704956, + "learning_rate": 4.6356706580371945e-05, + "loss": 4.5294, + "step": 29259 + }, + { + "epoch": 0.17401750880197925, + "grad_norm": 1.738935947418213, + "learning_rate": 4.6356463763686275e-05, + "loss": 4.7332, + "step": 29260 + }, + { + "epoch": 0.17402345608526026, + "grad_norm": 1.8803629875183105, + "learning_rate": 4.635622093954528e-05, + "loss": 4.9347, + "step": 29261 + }, + { + "epoch": 0.17402940336854125, + "grad_norm": 1.3738025426864624, + "learning_rate": 4.635597810794905e-05, + "loss": 5.4709, + "step": 29262 + }, + { + "epoch": 0.17403535065182224, + "grad_norm": 1.6917965412139893, + "learning_rate": 4.635573526889767e-05, + "loss": 4.5494, + "step": 29263 + }, + { + "epoch": 0.17404129793510326, + "grad_norm": 1.9916536808013916, + "learning_rate": 4.6355492422391226e-05, + "loss": 4.4302, + "step": 29264 + }, + { + "epoch": 0.17404724521838424, + "grad_norm": 1.8959016799926758, + "learning_rate": 4.63552495684298e-05, + "loss": 4.1595, + "step": 29265 + }, + { + "epoch": 0.17405319250166523, + "grad_norm": 1.7730271816253662, + "learning_rate": 4.635500670701347e-05, + "loss": 4.6212, + "step": 29266 + }, + { + "epoch": 0.17405913978494625, + "grad_norm": 1.9785410165786743, + "learning_rate": 4.635476383814233e-05, + "loss": 4.6885, + "step": 29267 + }, + { + "epoch": 0.17406508706822724, + "grad_norm": 1.915924310684204, + "learning_rate": 4.6354520961816475e-05, + "loss": 4.4186, + "step": 29268 + }, + { + "epoch": 0.17407103435150822, + "grad_norm": 1.6227480173110962, + "learning_rate": 4.6354278078035964e-05, + "loss": 4.6483, + "step": 29269 + }, + { + "epoch": 0.17407698163478924, + "grad_norm": 1.6679190397262573, + "learning_rate": 4.635403518680089e-05, + "loss": 4.9393, + "step": 29270 + }, + { + "epoch": 0.17408292891807023, + "grad_norm": 1.3380484580993652, + "learning_rate": 4.6353792288111353e-05, + "loss": 5.1539, + "step": 29271 + }, + { + "epoch": 0.17408887620135122, + "grad_norm": 1.3670740127563477, + "learning_rate": 4.635354938196743e-05, + "loss": 5.1949, + "step": 29272 + }, + { + "epoch": 0.17409482348463223, + "grad_norm": 1.288189172744751, + "learning_rate": 4.63533064683692e-05, + "loss": 5.4657, + "step": 29273 + }, + { + "epoch": 0.17410077076791322, + "grad_norm": 1.4686154127120972, + "learning_rate": 4.635306354731675e-05, + "loss": 5.5222, + "step": 29274 + }, + { + "epoch": 0.1741067180511942, + "grad_norm": 1.4154938459396362, + "learning_rate": 4.635282061881017e-05, + "loss": 5.261, + "step": 29275 + }, + { + "epoch": 0.17411266533447523, + "grad_norm": 1.7723246812820435, + "learning_rate": 4.635257768284953e-05, + "loss": 5.1817, + "step": 29276 + }, + { + "epoch": 0.1741186126177562, + "grad_norm": 1.7621451616287231, + "learning_rate": 4.635233473943494e-05, + "loss": 5.2426, + "step": 29277 + }, + { + "epoch": 0.1741245599010372, + "grad_norm": 1.6899840831756592, + "learning_rate": 4.6352091788566466e-05, + "loss": 4.5392, + "step": 29278 + }, + { + "epoch": 0.17413050718431822, + "grad_norm": 1.7704182863235474, + "learning_rate": 4.6351848830244195e-05, + "loss": 4.4345, + "step": 29279 + }, + { + "epoch": 0.1741364544675992, + "grad_norm": 1.9371867179870605, + "learning_rate": 4.635160586446821e-05, + "loss": 4.1621, + "step": 29280 + }, + { + "epoch": 0.1741424017508802, + "grad_norm": 1.771759271621704, + "learning_rate": 4.63513628912386e-05, + "loss": 4.7001, + "step": 29281 + }, + { + "epoch": 0.17414834903416118, + "grad_norm": 2.212144136428833, + "learning_rate": 4.635111991055546e-05, + "loss": 3.7101, + "step": 29282 + }, + { + "epoch": 0.1741542963174422, + "grad_norm": 2.0476841926574707, + "learning_rate": 4.6350876922418864e-05, + "loss": 3.4412, + "step": 29283 + }, + { + "epoch": 0.17416024360072319, + "grad_norm": 1.849636197090149, + "learning_rate": 4.635063392682889e-05, + "loss": 4.553, + "step": 29284 + }, + { + "epoch": 0.17416619088400417, + "grad_norm": 1.9307612180709839, + "learning_rate": 4.6350390923785634e-05, + "loss": 3.7483, + "step": 29285 + }, + { + "epoch": 0.1741721381672852, + "grad_norm": 1.9862045049667358, + "learning_rate": 4.6350147913289176e-05, + "loss": 4.3754, + "step": 29286 + }, + { + "epoch": 0.17417808545056618, + "grad_norm": 1.7079651355743408, + "learning_rate": 4.63499048953396e-05, + "loss": 5.3671, + "step": 29287 + }, + { + "epoch": 0.17418403273384717, + "grad_norm": 1.8182214498519897, + "learning_rate": 4.6349661869937e-05, + "loss": 5.148, + "step": 29288 + }, + { + "epoch": 0.17418998001712818, + "grad_norm": 1.7571437358856201, + "learning_rate": 4.6349418837081445e-05, + "loss": 4.4641, + "step": 29289 + }, + { + "epoch": 0.17419592730040917, + "grad_norm": 1.6432558298110962, + "learning_rate": 4.6349175796773026e-05, + "loss": 4.6966, + "step": 29290 + }, + { + "epoch": 0.17420187458369016, + "grad_norm": 1.729112148284912, + "learning_rate": 4.634893274901184e-05, + "loss": 4.6125, + "step": 29291 + }, + { + "epoch": 0.17420782186697117, + "grad_norm": 1.5376940965652466, + "learning_rate": 4.6348689693797954e-05, + "loss": 4.4921, + "step": 29292 + }, + { + "epoch": 0.17421376915025216, + "grad_norm": 1.8997972011566162, + "learning_rate": 4.634844663113147e-05, + "loss": 4.4163, + "step": 29293 + }, + { + "epoch": 0.17421971643353315, + "grad_norm": 1.6643134355545044, + "learning_rate": 4.634820356101246e-05, + "loss": 4.7624, + "step": 29294 + }, + { + "epoch": 0.17422566371681417, + "grad_norm": 1.4758678674697876, + "learning_rate": 4.6347960483441013e-05, + "loss": 5.3261, + "step": 29295 + }, + { + "epoch": 0.17423161100009515, + "grad_norm": 1.7518540620803833, + "learning_rate": 4.6347717398417203e-05, + "loss": 4.4916, + "step": 29296 + }, + { + "epoch": 0.17423755828337614, + "grad_norm": 1.6143438816070557, + "learning_rate": 4.634747430594114e-05, + "loss": 4.336, + "step": 29297 + }, + { + "epoch": 0.17424350556665716, + "grad_norm": 1.6077839136123657, + "learning_rate": 4.634723120601289e-05, + "loss": 4.5486, + "step": 29298 + }, + { + "epoch": 0.17424945284993815, + "grad_norm": 1.9146685600280762, + "learning_rate": 4.634698809863254e-05, + "loss": 5.1115, + "step": 29299 + }, + { + "epoch": 0.17425540013321913, + "grad_norm": 1.6625542640686035, + "learning_rate": 4.634674498380018e-05, + "loss": 4.653, + "step": 29300 + }, + { + "epoch": 0.17426134741650015, + "grad_norm": 1.7577245235443115, + "learning_rate": 4.634650186151589e-05, + "loss": 4.8305, + "step": 29301 + }, + { + "epoch": 0.17426729469978114, + "grad_norm": 1.5614792108535767, + "learning_rate": 4.6346258731779755e-05, + "loss": 4.8553, + "step": 29302 + }, + { + "epoch": 0.17427324198306213, + "grad_norm": 1.5734407901763916, + "learning_rate": 4.634601559459186e-05, + "loss": 4.9925, + "step": 29303 + }, + { + "epoch": 0.17427918926634314, + "grad_norm": 1.855974555015564, + "learning_rate": 4.6345772449952293e-05, + "loss": 4.7862, + "step": 29304 + }, + { + "epoch": 0.17428513654962413, + "grad_norm": 2.7702269554138184, + "learning_rate": 4.6345529297861146e-05, + "loss": 3.766, + "step": 29305 + }, + { + "epoch": 0.17429108383290512, + "grad_norm": 2.7763569355010986, + "learning_rate": 4.634528613831848e-05, + "loss": 3.343, + "step": 29306 + }, + { + "epoch": 0.17429703111618614, + "grad_norm": 3.1644763946533203, + "learning_rate": 4.6345042971324406e-05, + "loss": 3.4067, + "step": 29307 + }, + { + "epoch": 0.17430297839946712, + "grad_norm": 2.487724781036377, + "learning_rate": 4.6344799796878996e-05, + "loss": 3.226, + "step": 29308 + }, + { + "epoch": 0.1743089256827481, + "grad_norm": 2.340416431427002, + "learning_rate": 4.634455661498234e-05, + "loss": 3.2899, + "step": 29309 + }, + { + "epoch": 0.17431487296602913, + "grad_norm": 1.6526988744735718, + "learning_rate": 4.634431342563451e-05, + "loss": 4.9105, + "step": 29310 + }, + { + "epoch": 0.17432082024931012, + "grad_norm": 2.876229763031006, + "learning_rate": 4.6344070228835614e-05, + "loss": 3.4319, + "step": 29311 + }, + { + "epoch": 0.1743267675325911, + "grad_norm": 2.176748037338257, + "learning_rate": 4.6343827024585716e-05, + "loss": 3.2444, + "step": 29312 + }, + { + "epoch": 0.17433271481587212, + "grad_norm": 2.6688148975372314, + "learning_rate": 4.6343583812884904e-05, + "loss": 3.3417, + "step": 29313 + }, + { + "epoch": 0.1743386620991531, + "grad_norm": 2.5572376251220703, + "learning_rate": 4.634334059373328e-05, + "loss": 4.0048, + "step": 29314 + }, + { + "epoch": 0.1743446093824341, + "grad_norm": 2.3012688159942627, + "learning_rate": 4.6343097367130905e-05, + "loss": 3.5363, + "step": 29315 + }, + { + "epoch": 0.1743505566657151, + "grad_norm": 1.9870244264602661, + "learning_rate": 4.6342854133077875e-05, + "loss": 4.0843, + "step": 29316 + }, + { + "epoch": 0.1743565039489961, + "grad_norm": 2.538632392883301, + "learning_rate": 4.6342610891574276e-05, + "loss": 3.3337, + "step": 29317 + }, + { + "epoch": 0.1743624512322771, + "grad_norm": 2.8932511806488037, + "learning_rate": 4.63423676426202e-05, + "loss": 3.2887, + "step": 29318 + }, + { + "epoch": 0.1743683985155581, + "grad_norm": 2.55438494682312, + "learning_rate": 4.634212438621572e-05, + "loss": 3.6218, + "step": 29319 + }, + { + "epoch": 0.1743743457988391, + "grad_norm": 2.505047082901001, + "learning_rate": 4.634188112236092e-05, + "loss": 3.182, + "step": 29320 + }, + { + "epoch": 0.17438029308212008, + "grad_norm": 2.8068132400512695, + "learning_rate": 4.63416378510559e-05, + "loss": 3.5654, + "step": 29321 + }, + { + "epoch": 0.1743862403654011, + "grad_norm": 1.9296205043792725, + "learning_rate": 4.6341394572300725e-05, + "loss": 4.492, + "step": 29322 + }, + { + "epoch": 0.17439218764868208, + "grad_norm": 1.6537705659866333, + "learning_rate": 4.63411512860955e-05, + "loss": 5.0017, + "step": 29323 + }, + { + "epoch": 0.17439813493196307, + "grad_norm": 1.8064972162246704, + "learning_rate": 4.634090799244028e-05, + "loss": 4.5991, + "step": 29324 + }, + { + "epoch": 0.1744040822152441, + "grad_norm": 1.7944536209106445, + "learning_rate": 4.634066469133519e-05, + "loss": 4.6755, + "step": 29325 + }, + { + "epoch": 0.17441002949852508, + "grad_norm": 2.222592830657959, + "learning_rate": 4.6340421382780286e-05, + "loss": 4.3161, + "step": 29326 + }, + { + "epoch": 0.17441597678180606, + "grad_norm": 2.1058638095855713, + "learning_rate": 4.634017806677567e-05, + "loss": 4.077, + "step": 29327 + }, + { + "epoch": 0.17442192406508708, + "grad_norm": 1.8931814432144165, + "learning_rate": 4.63399347433214e-05, + "loss": 4.2838, + "step": 29328 + }, + { + "epoch": 0.17442787134836807, + "grad_norm": 1.7035942077636719, + "learning_rate": 4.6339691412417586e-05, + "loss": 4.8623, + "step": 29329 + }, + { + "epoch": 0.17443381863164906, + "grad_norm": 1.7701468467712402, + "learning_rate": 4.6339448074064314e-05, + "loss": 4.9063, + "step": 29330 + }, + { + "epoch": 0.17443976591493007, + "grad_norm": 1.7608574628829956, + "learning_rate": 4.633920472826165e-05, + "loss": 4.42, + "step": 29331 + }, + { + "epoch": 0.17444571319821106, + "grad_norm": 2.5129191875457764, + "learning_rate": 4.633896137500971e-05, + "loss": 3.2521, + "step": 29332 + }, + { + "epoch": 0.17445166048149205, + "grad_norm": 1.477378010749817, + "learning_rate": 4.6338718014308534e-05, + "loss": 5.6654, + "step": 29333 + }, + { + "epoch": 0.17445760776477306, + "grad_norm": 1.6242940425872803, + "learning_rate": 4.633847464615825e-05, + "loss": 5.392, + "step": 29334 + }, + { + "epoch": 0.17446355504805405, + "grad_norm": 1.827919602394104, + "learning_rate": 4.633823127055892e-05, + "loss": 4.8818, + "step": 29335 + }, + { + "epoch": 0.17446950233133504, + "grad_norm": 1.6197007894515991, + "learning_rate": 4.633798788751063e-05, + "loss": 4.983, + "step": 29336 + }, + { + "epoch": 0.17447544961461606, + "grad_norm": 1.703899621963501, + "learning_rate": 4.633774449701347e-05, + "loss": 4.9122, + "step": 29337 + }, + { + "epoch": 0.17448139689789705, + "grad_norm": 1.7812259197235107, + "learning_rate": 4.633750109906753e-05, + "loss": 4.6429, + "step": 29338 + }, + { + "epoch": 0.17448734418117803, + "grad_norm": 1.6351381540298462, + "learning_rate": 4.633725769367288e-05, + "loss": 4.9078, + "step": 29339 + }, + { + "epoch": 0.17449329146445902, + "grad_norm": 1.7403061389923096, + "learning_rate": 4.633701428082962e-05, + "loss": 4.6946, + "step": 29340 + }, + { + "epoch": 0.17449923874774004, + "grad_norm": 1.8006681203842163, + "learning_rate": 4.633677086053783e-05, + "loss": 4.2984, + "step": 29341 + }, + { + "epoch": 0.17450518603102103, + "grad_norm": 1.7105704545974731, + "learning_rate": 4.633652743279759e-05, + "loss": 4.4426, + "step": 29342 + }, + { + "epoch": 0.174511133314302, + "grad_norm": 1.7440415620803833, + "learning_rate": 4.6336283997608984e-05, + "loss": 4.4029, + "step": 29343 + }, + { + "epoch": 0.17451708059758303, + "grad_norm": 1.7197996377944946, + "learning_rate": 4.633604055497211e-05, + "loss": 4.263, + "step": 29344 + }, + { + "epoch": 0.17452302788086402, + "grad_norm": 1.7282319068908691, + "learning_rate": 4.633579710488704e-05, + "loss": 4.546, + "step": 29345 + }, + { + "epoch": 0.174528975164145, + "grad_norm": 1.7449449300765991, + "learning_rate": 4.633555364735387e-05, + "loss": 5.1083, + "step": 29346 + }, + { + "epoch": 0.17453492244742602, + "grad_norm": 1.645507574081421, + "learning_rate": 4.633531018237267e-05, + "loss": 4.1636, + "step": 29347 + }, + { + "epoch": 0.174540869730707, + "grad_norm": 1.671286702156067, + "learning_rate": 4.6335066709943534e-05, + "loss": 4.5991, + "step": 29348 + }, + { + "epoch": 0.174546817013988, + "grad_norm": 1.5074694156646729, + "learning_rate": 4.6334823230066554e-05, + "loss": 4.5064, + "step": 29349 + }, + { + "epoch": 0.17455276429726901, + "grad_norm": 1.7285078763961792, + "learning_rate": 4.63345797427418e-05, + "loss": 4.561, + "step": 29350 + }, + { + "epoch": 0.17455871158055, + "grad_norm": 1.9212089776992798, + "learning_rate": 4.6334336247969376e-05, + "loss": 4.2444, + "step": 29351 + }, + { + "epoch": 0.174564658863831, + "grad_norm": 1.6223878860473633, + "learning_rate": 4.633409274574935e-05, + "loss": 4.8405, + "step": 29352 + }, + { + "epoch": 0.174570606147112, + "grad_norm": 1.7474267482757568, + "learning_rate": 4.6333849236081805e-05, + "loss": 4.5651, + "step": 29353 + }, + { + "epoch": 0.174576553430393, + "grad_norm": 1.6735780239105225, + "learning_rate": 4.6333605718966844e-05, + "loss": 4.1536, + "step": 29354 + }, + { + "epoch": 0.17458250071367398, + "grad_norm": 1.7096998691558838, + "learning_rate": 4.633336219440453e-05, + "loss": 4.4034, + "step": 29355 + }, + { + "epoch": 0.174588447996955, + "grad_norm": 1.7881802320480347, + "learning_rate": 4.633311866239497e-05, + "loss": 4.308, + "step": 29356 + }, + { + "epoch": 0.174594395280236, + "grad_norm": 1.4776397943496704, + "learning_rate": 4.6332875122938236e-05, + "loss": 5.1879, + "step": 29357 + }, + { + "epoch": 0.17460034256351697, + "grad_norm": 1.499626636505127, + "learning_rate": 4.6332631576034414e-05, + "loss": 5.1217, + "step": 29358 + }, + { + "epoch": 0.174606289846798, + "grad_norm": 1.5779564380645752, + "learning_rate": 4.6332388021683594e-05, + "loss": 5.1155, + "step": 29359 + }, + { + "epoch": 0.17461223713007898, + "grad_norm": 1.5778738260269165, + "learning_rate": 4.633214445988585e-05, + "loss": 5.0889, + "step": 29360 + }, + { + "epoch": 0.17461818441335997, + "grad_norm": 1.4342097043991089, + "learning_rate": 4.633190089064128e-05, + "loss": 5.1313, + "step": 29361 + }, + { + "epoch": 0.17462413169664098, + "grad_norm": 1.977306604385376, + "learning_rate": 4.6331657313949975e-05, + "loss": 4.3384, + "step": 29362 + }, + { + "epoch": 0.17463007897992197, + "grad_norm": 1.7359813451766968, + "learning_rate": 4.633141372981199e-05, + "loss": 4.9874, + "step": 29363 + }, + { + "epoch": 0.17463602626320296, + "grad_norm": 1.5922671556472778, + "learning_rate": 4.6331170138227435e-05, + "loss": 5.1194, + "step": 29364 + }, + { + "epoch": 0.17464197354648398, + "grad_norm": 1.8139041662216187, + "learning_rate": 4.63309265391964e-05, + "loss": 5.0046, + "step": 29365 + }, + { + "epoch": 0.17464792082976496, + "grad_norm": 1.6782366037368774, + "learning_rate": 4.633068293271895e-05, + "loss": 5.056, + "step": 29366 + }, + { + "epoch": 0.17465386811304595, + "grad_norm": 1.6051324605941772, + "learning_rate": 4.6330439318795174e-05, + "loss": 5.1002, + "step": 29367 + }, + { + "epoch": 0.17465981539632697, + "grad_norm": 1.6109590530395508, + "learning_rate": 4.633019569742517e-05, + "loss": 4.9802, + "step": 29368 + }, + { + "epoch": 0.17466576267960796, + "grad_norm": 1.5063222646713257, + "learning_rate": 4.6329952068609005e-05, + "loss": 5.5857, + "step": 29369 + }, + { + "epoch": 0.17467170996288894, + "grad_norm": 1.6874276399612427, + "learning_rate": 4.632970843234678e-05, + "loss": 5.161, + "step": 29370 + }, + { + "epoch": 0.17467765724616996, + "grad_norm": 1.8858634233474731, + "learning_rate": 4.6329464788638576e-05, + "loss": 4.6397, + "step": 29371 + }, + { + "epoch": 0.17468360452945095, + "grad_norm": 2.004140853881836, + "learning_rate": 4.632922113748447e-05, + "loss": 4.5306, + "step": 29372 + }, + { + "epoch": 0.17468955181273194, + "grad_norm": 1.278494954109192, + "learning_rate": 4.632897747888456e-05, + "loss": 5.032, + "step": 29373 + }, + { + "epoch": 0.17469549909601295, + "grad_norm": 1.7012786865234375, + "learning_rate": 4.6328733812838925e-05, + "loss": 5.1362, + "step": 29374 + }, + { + "epoch": 0.17470144637929394, + "grad_norm": 1.6155195236206055, + "learning_rate": 4.632849013934765e-05, + "loss": 5.4839, + "step": 29375 + }, + { + "epoch": 0.17470739366257493, + "grad_norm": 1.3312060832977295, + "learning_rate": 4.6328246458410816e-05, + "loss": 5.521, + "step": 29376 + }, + { + "epoch": 0.17471334094585594, + "grad_norm": 1.6347986459732056, + "learning_rate": 4.632800277002851e-05, + "loss": 5.1883, + "step": 29377 + }, + { + "epoch": 0.17471928822913693, + "grad_norm": 1.6213163137435913, + "learning_rate": 4.632775907420082e-05, + "loss": 5.1724, + "step": 29378 + }, + { + "epoch": 0.17472523551241792, + "grad_norm": 2.0514700412750244, + "learning_rate": 4.632751537092783e-05, + "loss": 3.6934, + "step": 29379 + }, + { + "epoch": 0.17473118279569894, + "grad_norm": 1.4713187217712402, + "learning_rate": 4.6327271660209626e-05, + "loss": 4.7456, + "step": 29380 + }, + { + "epoch": 0.17473713007897992, + "grad_norm": 1.5584750175476074, + "learning_rate": 4.6327027942046286e-05, + "loss": 5.0259, + "step": 29381 + }, + { + "epoch": 0.1747430773622609, + "grad_norm": 1.7405140399932861, + "learning_rate": 4.632678421643791e-05, + "loss": 5.1115, + "step": 29382 + }, + { + "epoch": 0.17474902464554193, + "grad_norm": 1.7233058214187622, + "learning_rate": 4.632654048338457e-05, + "loss": 5.3849, + "step": 29383 + }, + { + "epoch": 0.17475497192882292, + "grad_norm": 1.7387725114822388, + "learning_rate": 4.6326296742886356e-05, + "loss": 5.4367, + "step": 29384 + }, + { + "epoch": 0.1747609192121039, + "grad_norm": 1.7022291421890259, + "learning_rate": 4.632605299494335e-05, + "loss": 5.1317, + "step": 29385 + }, + { + "epoch": 0.17476686649538492, + "grad_norm": 1.7683387994766235, + "learning_rate": 4.632580923955564e-05, + "loss": 4.4575, + "step": 29386 + }, + { + "epoch": 0.1747728137786659, + "grad_norm": 1.4611074924468994, + "learning_rate": 4.632556547672331e-05, + "loss": 4.7676, + "step": 29387 + }, + { + "epoch": 0.1747787610619469, + "grad_norm": 1.9123033285140991, + "learning_rate": 4.632532170644644e-05, + "loss": 4.966, + "step": 29388 + }, + { + "epoch": 0.1747847083452279, + "grad_norm": 1.857445478439331, + "learning_rate": 4.632507792872513e-05, + "loss": 4.0338, + "step": 29389 + }, + { + "epoch": 0.1747906556285089, + "grad_norm": 2.620339870452881, + "learning_rate": 4.632483414355945e-05, + "loss": 3.4506, + "step": 29390 + }, + { + "epoch": 0.1747966029117899, + "grad_norm": 2.141939401626587, + "learning_rate": 4.6324590350949494e-05, + "loss": 4.516, + "step": 29391 + }, + { + "epoch": 0.1748025501950709, + "grad_norm": 1.5560227632522583, + "learning_rate": 4.632434655089535e-05, + "loss": 4.8785, + "step": 29392 + }, + { + "epoch": 0.1748084974783519, + "grad_norm": 1.640221357345581, + "learning_rate": 4.632410274339708e-05, + "loss": 4.9614, + "step": 29393 + }, + { + "epoch": 0.17481444476163288, + "grad_norm": 1.6104960441589355, + "learning_rate": 4.63238589284548e-05, + "loss": 4.7536, + "step": 29394 + }, + { + "epoch": 0.1748203920449139, + "grad_norm": 1.599259853363037, + "learning_rate": 4.6323615106068575e-05, + "loss": 5.0939, + "step": 29395 + }, + { + "epoch": 0.17482633932819489, + "grad_norm": 1.630430817604065, + "learning_rate": 4.6323371276238496e-05, + "loss": 4.8851, + "step": 29396 + }, + { + "epoch": 0.17483228661147587, + "grad_norm": 1.6281993389129639, + "learning_rate": 4.632312743896465e-05, + "loss": 4.8152, + "step": 29397 + }, + { + "epoch": 0.17483823389475686, + "grad_norm": 1.7055253982543945, + "learning_rate": 4.632288359424712e-05, + "loss": 4.2515, + "step": 29398 + }, + { + "epoch": 0.17484418117803788, + "grad_norm": 1.739365577697754, + "learning_rate": 4.6322639742085995e-05, + "loss": 4.5137, + "step": 29399 + }, + { + "epoch": 0.17485012846131887, + "grad_norm": 1.7686853408813477, + "learning_rate": 4.632239588248135e-05, + "loss": 5.307, + "step": 29400 + }, + { + "epoch": 0.17485607574459985, + "grad_norm": 1.369730830192566, + "learning_rate": 4.632215201543328e-05, + "loss": 5.3096, + "step": 29401 + }, + { + "epoch": 0.17486202302788087, + "grad_norm": 1.6965676546096802, + "learning_rate": 4.6321908140941874e-05, + "loss": 4.9252, + "step": 29402 + }, + { + "epoch": 0.17486797031116186, + "grad_norm": 1.797540307044983, + "learning_rate": 4.63216642590072e-05, + "loss": 4.4397, + "step": 29403 + }, + { + "epoch": 0.17487391759444285, + "grad_norm": 1.7250994443893433, + "learning_rate": 4.632142036962936e-05, + "loss": 4.4416, + "step": 29404 + }, + { + "epoch": 0.17487986487772386, + "grad_norm": 1.649828314781189, + "learning_rate": 4.632117647280843e-05, + "loss": 4.4497, + "step": 29405 + }, + { + "epoch": 0.17488581216100485, + "grad_norm": 1.7073628902435303, + "learning_rate": 4.632093256854449e-05, + "loss": 4.3074, + "step": 29406 + }, + { + "epoch": 0.17489175944428584, + "grad_norm": 1.6241555213928223, + "learning_rate": 4.632068865683765e-05, + "loss": 4.1219, + "step": 29407 + }, + { + "epoch": 0.17489770672756685, + "grad_norm": 1.356092929840088, + "learning_rate": 4.6320444737687965e-05, + "loss": 4.5548, + "step": 29408 + }, + { + "epoch": 0.17490365401084784, + "grad_norm": 1.5094983577728271, + "learning_rate": 4.632020081109554e-05, + "loss": 5.0598, + "step": 29409 + }, + { + "epoch": 0.17490960129412883, + "grad_norm": 1.596183180809021, + "learning_rate": 4.6319956877060445e-05, + "loss": 5.0795, + "step": 29410 + }, + { + "epoch": 0.17491554857740985, + "grad_norm": 1.7887545824050903, + "learning_rate": 4.6319712935582784e-05, + "loss": 4.9287, + "step": 29411 + }, + { + "epoch": 0.17492149586069083, + "grad_norm": 1.4806302785873413, + "learning_rate": 4.631946898666262e-05, + "loss": 5.0627, + "step": 29412 + }, + { + "epoch": 0.17492744314397182, + "grad_norm": 1.5581897497177124, + "learning_rate": 4.631922503030005e-05, + "loss": 5.2001, + "step": 29413 + }, + { + "epoch": 0.17493339042725284, + "grad_norm": 1.614473819732666, + "learning_rate": 4.631898106649517e-05, + "loss": 4.396, + "step": 29414 + }, + { + "epoch": 0.17493933771053383, + "grad_norm": 1.9394686222076416, + "learning_rate": 4.6318737095248044e-05, + "loss": 3.9614, + "step": 29415 + }, + { + "epoch": 0.17494528499381481, + "grad_norm": 1.6874741315841675, + "learning_rate": 4.631849311655877e-05, + "loss": 4.4714, + "step": 29416 + }, + { + "epoch": 0.17495123227709583, + "grad_norm": 1.8840105533599854, + "learning_rate": 4.6318249130427435e-05, + "loss": 4.51, + "step": 29417 + }, + { + "epoch": 0.17495717956037682, + "grad_norm": 1.7205270528793335, + "learning_rate": 4.631800513685412e-05, + "loss": 4.554, + "step": 29418 + }, + { + "epoch": 0.1749631268436578, + "grad_norm": 1.449798583984375, + "learning_rate": 4.6317761135838896e-05, + "loss": 5.0114, + "step": 29419 + }, + { + "epoch": 0.17496907412693882, + "grad_norm": 1.6449236869812012, + "learning_rate": 4.631751712738187e-05, + "loss": 5.7704, + "step": 29420 + }, + { + "epoch": 0.1749750214102198, + "grad_norm": 1.5362746715545654, + "learning_rate": 4.631727311148312e-05, + "loss": 5.6398, + "step": 29421 + }, + { + "epoch": 0.1749809686935008, + "grad_norm": 1.6383920907974243, + "learning_rate": 4.6317029088142726e-05, + "loss": 5.2901, + "step": 29422 + }, + { + "epoch": 0.17498691597678181, + "grad_norm": 1.8682830333709717, + "learning_rate": 4.631678505736079e-05, + "loss": 4.2822, + "step": 29423 + }, + { + "epoch": 0.1749928632600628, + "grad_norm": 1.9640558958053589, + "learning_rate": 4.631654101913737e-05, + "loss": 4.121, + "step": 29424 + }, + { + "epoch": 0.1749988105433438, + "grad_norm": 1.569744348526001, + "learning_rate": 4.6316296973472576e-05, + "loss": 4.3937, + "step": 29425 + }, + { + "epoch": 0.1750047578266248, + "grad_norm": 1.524356484413147, + "learning_rate": 4.6316052920366475e-05, + "loss": 4.8107, + "step": 29426 + }, + { + "epoch": 0.1750107051099058, + "grad_norm": 1.7055494785308838, + "learning_rate": 4.6315808859819164e-05, + "loss": 4.8751, + "step": 29427 + }, + { + "epoch": 0.17501665239318678, + "grad_norm": 1.683262586593628, + "learning_rate": 4.631556479183072e-05, + "loss": 5.4053, + "step": 29428 + }, + { + "epoch": 0.1750225996764678, + "grad_norm": 1.7124066352844238, + "learning_rate": 4.6315320716401244e-05, + "loss": 5.0109, + "step": 29429 + }, + { + "epoch": 0.1750285469597488, + "grad_norm": 1.6951466798782349, + "learning_rate": 4.63150766335308e-05, + "loss": 5.4747, + "step": 29430 + }, + { + "epoch": 0.17503449424302978, + "grad_norm": 1.5457607507705688, + "learning_rate": 4.631483254321949e-05, + "loss": 4.8729, + "step": 29431 + }, + { + "epoch": 0.1750404415263108, + "grad_norm": 1.5366050004959106, + "learning_rate": 4.6314588445467386e-05, + "loss": 5.0268, + "step": 29432 + }, + { + "epoch": 0.17504638880959178, + "grad_norm": 1.6533615589141846, + "learning_rate": 4.6314344340274573e-05, + "loss": 4.7626, + "step": 29433 + }, + { + "epoch": 0.17505233609287277, + "grad_norm": 1.559486746788025, + "learning_rate": 4.631410022764115e-05, + "loss": 5.0673, + "step": 29434 + }, + { + "epoch": 0.17505828337615378, + "grad_norm": 1.534456729888916, + "learning_rate": 4.63138561075672e-05, + "loss": 5.5142, + "step": 29435 + }, + { + "epoch": 0.17506423065943477, + "grad_norm": 1.641667366027832, + "learning_rate": 4.63136119800528e-05, + "loss": 4.7032, + "step": 29436 + }, + { + "epoch": 0.17507017794271576, + "grad_norm": 1.4128551483154297, + "learning_rate": 4.631336784509803e-05, + "loss": 4.8777, + "step": 29437 + }, + { + "epoch": 0.17507612522599678, + "grad_norm": 1.4912710189819336, + "learning_rate": 4.6313123702703e-05, + "loss": 4.866, + "step": 29438 + }, + { + "epoch": 0.17508207250927776, + "grad_norm": 1.381341576576233, + "learning_rate": 4.631287955286776e-05, + "loss": 4.6116, + "step": 29439 + }, + { + "epoch": 0.17508801979255875, + "grad_norm": 1.4270753860473633, + "learning_rate": 4.631263539559243e-05, + "loss": 5.0519, + "step": 29440 + }, + { + "epoch": 0.17509396707583977, + "grad_norm": 1.4962128400802612, + "learning_rate": 4.6312391230877074e-05, + "loss": 4.6934, + "step": 29441 + }, + { + "epoch": 0.17509991435912076, + "grad_norm": 1.3959366083145142, + "learning_rate": 4.631214705872178e-05, + "loss": 4.9172, + "step": 29442 + }, + { + "epoch": 0.17510586164240174, + "grad_norm": 1.5014355182647705, + "learning_rate": 4.631190287912663e-05, + "loss": 4.8429, + "step": 29443 + }, + { + "epoch": 0.17511180892568276, + "grad_norm": 1.584879994392395, + "learning_rate": 4.631165869209172e-05, + "loss": 5.1186, + "step": 29444 + }, + { + "epoch": 0.17511775620896375, + "grad_norm": 1.6547553539276123, + "learning_rate": 4.6311414497617135e-05, + "loss": 4.9739, + "step": 29445 + }, + { + "epoch": 0.17512370349224474, + "grad_norm": 1.4584704637527466, + "learning_rate": 4.631117029570295e-05, + "loss": 4.927, + "step": 29446 + }, + { + "epoch": 0.17512965077552575, + "grad_norm": 1.5092477798461914, + "learning_rate": 4.631092608634926e-05, + "loss": 4.9163, + "step": 29447 + }, + { + "epoch": 0.17513559805880674, + "grad_norm": 1.466023564338684, + "learning_rate": 4.631068186955614e-05, + "loss": 4.9867, + "step": 29448 + }, + { + "epoch": 0.17514154534208773, + "grad_norm": 1.8561779260635376, + "learning_rate": 4.6310437645323676e-05, + "loss": 4.6118, + "step": 29449 + }, + { + "epoch": 0.17514749262536874, + "grad_norm": 2.27844500541687, + "learning_rate": 4.631019341365197e-05, + "loss": 4.4978, + "step": 29450 + }, + { + "epoch": 0.17515343990864973, + "grad_norm": 1.7874199151992798, + "learning_rate": 4.6309949174541096e-05, + "loss": 3.7357, + "step": 29451 + }, + { + "epoch": 0.17515938719193072, + "grad_norm": 1.6950316429138184, + "learning_rate": 4.6309704927991136e-05, + "loss": 4.1866, + "step": 29452 + }, + { + "epoch": 0.17516533447521174, + "grad_norm": 1.6692928075790405, + "learning_rate": 4.630946067400217e-05, + "loss": 3.9566, + "step": 29453 + }, + { + "epoch": 0.17517128175849273, + "grad_norm": 1.680684208869934, + "learning_rate": 4.63092164125743e-05, + "loss": 4.0473, + "step": 29454 + }, + { + "epoch": 0.1751772290417737, + "grad_norm": 1.7636792659759521, + "learning_rate": 4.6308972143707606e-05, + "loss": 4.161, + "step": 29455 + }, + { + "epoch": 0.1751831763250547, + "grad_norm": 1.7277029752731323, + "learning_rate": 4.6308727867402165e-05, + "loss": 4.6943, + "step": 29456 + }, + { + "epoch": 0.17518912360833572, + "grad_norm": 1.7087599039077759, + "learning_rate": 4.630848358365807e-05, + "loss": 4.9239, + "step": 29457 + }, + { + "epoch": 0.1751950708916167, + "grad_norm": 1.8207015991210938, + "learning_rate": 4.63082392924754e-05, + "loss": 4.8358, + "step": 29458 + }, + { + "epoch": 0.1752010181748977, + "grad_norm": 1.9595861434936523, + "learning_rate": 4.6307994993854245e-05, + "loss": 4.3975, + "step": 29459 + }, + { + "epoch": 0.1752069654581787, + "grad_norm": 2.330233335494995, + "learning_rate": 4.630775068779469e-05, + "loss": 3.9516, + "step": 29460 + }, + { + "epoch": 0.1752129127414597, + "grad_norm": 1.801896572113037, + "learning_rate": 4.630750637429682e-05, + "loss": 4.3272, + "step": 29461 + }, + { + "epoch": 0.17521886002474069, + "grad_norm": 1.8079783916473389, + "learning_rate": 4.630726205336071e-05, + "loss": 4.4698, + "step": 29462 + }, + { + "epoch": 0.1752248073080217, + "grad_norm": 1.7742640972137451, + "learning_rate": 4.6307017724986466e-05, + "loss": 4.5466, + "step": 29463 + }, + { + "epoch": 0.1752307545913027, + "grad_norm": 1.5979267358779907, + "learning_rate": 4.6306773389174154e-05, + "loss": 4.497, + "step": 29464 + }, + { + "epoch": 0.17523670187458368, + "grad_norm": 1.6667109727859497, + "learning_rate": 4.630652904592388e-05, + "loss": 5.338, + "step": 29465 + }, + { + "epoch": 0.1752426491578647, + "grad_norm": 1.5170248746871948, + "learning_rate": 4.63062846952357e-05, + "loss": 4.6994, + "step": 29466 + }, + { + "epoch": 0.17524859644114568, + "grad_norm": 1.597468376159668, + "learning_rate": 4.630604033710974e-05, + "loss": 4.1865, + "step": 29467 + }, + { + "epoch": 0.17525454372442667, + "grad_norm": 1.638096809387207, + "learning_rate": 4.630579597154604e-05, + "loss": 4.2936, + "step": 29468 + }, + { + "epoch": 0.17526049100770769, + "grad_norm": 1.5512175559997559, + "learning_rate": 4.630555159854472e-05, + "loss": 4.6191, + "step": 29469 + }, + { + "epoch": 0.17526643829098867, + "grad_norm": 1.57890784740448, + "learning_rate": 4.630530721810584e-05, + "loss": 4.9381, + "step": 29470 + }, + { + "epoch": 0.17527238557426966, + "grad_norm": 1.7156378030776978, + "learning_rate": 4.63050628302295e-05, + "loss": 5.022, + "step": 29471 + }, + { + "epoch": 0.17527833285755068, + "grad_norm": 1.6688953638076782, + "learning_rate": 4.630481843491579e-05, + "loss": 4.5509, + "step": 29472 + }, + { + "epoch": 0.17528428014083167, + "grad_norm": 1.835450530052185, + "learning_rate": 4.630457403216478e-05, + "loss": 4.6413, + "step": 29473 + }, + { + "epoch": 0.17529022742411265, + "grad_norm": 1.2935006618499756, + "learning_rate": 4.6304329621976574e-05, + "loss": 4.9823, + "step": 29474 + }, + { + "epoch": 0.17529617470739367, + "grad_norm": 2.152981758117676, + "learning_rate": 4.6304085204351234e-05, + "loss": 4.6183, + "step": 29475 + }, + { + "epoch": 0.17530212199067466, + "grad_norm": 1.6258760690689087, + "learning_rate": 4.630384077928886e-05, + "loss": 4.9874, + "step": 29476 + }, + { + "epoch": 0.17530806927395565, + "grad_norm": 1.6755950450897217, + "learning_rate": 4.630359634678954e-05, + "loss": 5.089, + "step": 29477 + }, + { + "epoch": 0.17531401655723666, + "grad_norm": 1.7208611965179443, + "learning_rate": 4.6303351906853355e-05, + "loss": 5.3393, + "step": 29478 + }, + { + "epoch": 0.17531996384051765, + "grad_norm": 1.5461162328720093, + "learning_rate": 4.630310745948039e-05, + "loss": 5.2263, + "step": 29479 + }, + { + "epoch": 0.17532591112379864, + "grad_norm": 1.9592080116271973, + "learning_rate": 4.630286300467073e-05, + "loss": 4.1235, + "step": 29480 + }, + { + "epoch": 0.17533185840707965, + "grad_norm": 1.8409465551376343, + "learning_rate": 4.630261854242446e-05, + "loss": 4.8235, + "step": 29481 + }, + { + "epoch": 0.17533780569036064, + "grad_norm": 1.6198770999908447, + "learning_rate": 4.630237407274166e-05, + "loss": 5.5198, + "step": 29482 + }, + { + "epoch": 0.17534375297364163, + "grad_norm": 1.692572832107544, + "learning_rate": 4.630212959562243e-05, + "loss": 4.8526, + "step": 29483 + }, + { + "epoch": 0.17534970025692265, + "grad_norm": 1.7479051351547241, + "learning_rate": 4.6301885111066847e-05, + "loss": 4.8774, + "step": 29484 + }, + { + "epoch": 0.17535564754020364, + "grad_norm": 2.0946943759918213, + "learning_rate": 4.630164061907499e-05, + "loss": 4.4918, + "step": 29485 + }, + { + "epoch": 0.17536159482348462, + "grad_norm": 1.702415943145752, + "learning_rate": 4.6301396119646954e-05, + "loss": 4.424, + "step": 29486 + }, + { + "epoch": 0.17536754210676564, + "grad_norm": 1.4786335229873657, + "learning_rate": 4.630115161278282e-05, + "loss": 5.5655, + "step": 29487 + }, + { + "epoch": 0.17537348939004663, + "grad_norm": 1.5471251010894775, + "learning_rate": 4.630090709848267e-05, + "loss": 5.2839, + "step": 29488 + }, + { + "epoch": 0.17537943667332762, + "grad_norm": 1.8128043413162231, + "learning_rate": 4.6300662576746595e-05, + "loss": 4.7968, + "step": 29489 + }, + { + "epoch": 0.17538538395660863, + "grad_norm": 1.6280453205108643, + "learning_rate": 4.630041804757469e-05, + "loss": 4.7266, + "step": 29490 + }, + { + "epoch": 0.17539133123988962, + "grad_norm": 1.6138848066329956, + "learning_rate": 4.6300173510967015e-05, + "loss": 4.3718, + "step": 29491 + }, + { + "epoch": 0.1753972785231706, + "grad_norm": 1.6392838954925537, + "learning_rate": 4.6299928966923675e-05, + "loss": 4.7491, + "step": 29492 + }, + { + "epoch": 0.17540322580645162, + "grad_norm": 1.722277283668518, + "learning_rate": 4.629968441544475e-05, + "loss": 4.4053, + "step": 29493 + }, + { + "epoch": 0.1754091730897326, + "grad_norm": 1.4803645610809326, + "learning_rate": 4.629943985653032e-05, + "loss": 4.5624, + "step": 29494 + }, + { + "epoch": 0.1754151203730136, + "grad_norm": 1.696871042251587, + "learning_rate": 4.629919529018048e-05, + "loss": 4.2274, + "step": 29495 + }, + { + "epoch": 0.17542106765629462, + "grad_norm": 2.0104081630706787, + "learning_rate": 4.629895071639531e-05, + "loss": 4.954, + "step": 29496 + }, + { + "epoch": 0.1754270149395756, + "grad_norm": 1.91762113571167, + "learning_rate": 4.62987061351749e-05, + "loss": 4.5869, + "step": 29497 + }, + { + "epoch": 0.1754329622228566, + "grad_norm": 2.0672197341918945, + "learning_rate": 4.629846154651932e-05, + "loss": 4.3838, + "step": 29498 + }, + { + "epoch": 0.1754389095061376, + "grad_norm": 1.9841183423995972, + "learning_rate": 4.629821695042869e-05, + "loss": 5.2067, + "step": 29499 + }, + { + "epoch": 0.1754448567894186, + "grad_norm": 1.850253701210022, + "learning_rate": 4.6297972346903055e-05, + "loss": 4.7302, + "step": 29500 + }, + { + "epoch": 0.17545080407269958, + "grad_norm": 1.4990947246551514, + "learning_rate": 4.629772773594252e-05, + "loss": 4.9005, + "step": 29501 + }, + { + "epoch": 0.1754567513559806, + "grad_norm": 1.5953363180160522, + "learning_rate": 4.629748311754717e-05, + "loss": 4.9025, + "step": 29502 + }, + { + "epoch": 0.1754626986392616, + "grad_norm": 1.5136396884918213, + "learning_rate": 4.6297238491717085e-05, + "loss": 4.835, + "step": 29503 + }, + { + "epoch": 0.17546864592254258, + "grad_norm": 1.7335329055786133, + "learning_rate": 4.6296993858452356e-05, + "loss": 4.7231, + "step": 29504 + }, + { + "epoch": 0.1754745932058236, + "grad_norm": 1.5969070196151733, + "learning_rate": 4.629674921775307e-05, + "loss": 4.7903, + "step": 29505 + }, + { + "epoch": 0.17548054048910458, + "grad_norm": 1.7393018007278442, + "learning_rate": 4.62965045696193e-05, + "loss": 5.2468, + "step": 29506 + }, + { + "epoch": 0.17548648777238557, + "grad_norm": 1.4993494749069214, + "learning_rate": 4.629625991405116e-05, + "loss": 5.0639, + "step": 29507 + }, + { + "epoch": 0.17549243505566658, + "grad_norm": 1.559507966041565, + "learning_rate": 4.62960152510487e-05, + "loss": 5.2718, + "step": 29508 + }, + { + "epoch": 0.17549838233894757, + "grad_norm": 1.6528722047805786, + "learning_rate": 4.629577058061202e-05, + "loss": 5.0881, + "step": 29509 + }, + { + "epoch": 0.17550432962222856, + "grad_norm": 1.5357880592346191, + "learning_rate": 4.629552590274121e-05, + "loss": 4.5841, + "step": 29510 + }, + { + "epoch": 0.17551027690550958, + "grad_norm": 1.7293065786361694, + "learning_rate": 4.629528121743635e-05, + "loss": 4.6718, + "step": 29511 + }, + { + "epoch": 0.17551622418879056, + "grad_norm": 2.699164390563965, + "learning_rate": 4.6295036524697536e-05, + "loss": 4.1491, + "step": 29512 + }, + { + "epoch": 0.17552217147207155, + "grad_norm": 1.5221933126449585, + "learning_rate": 4.629479182452483e-05, + "loss": 4.8606, + "step": 29513 + }, + { + "epoch": 0.17552811875535254, + "grad_norm": 1.5474234819412231, + "learning_rate": 4.629454711691835e-05, + "loss": 4.7198, + "step": 29514 + }, + { + "epoch": 0.17553406603863356, + "grad_norm": 1.5748153924942017, + "learning_rate": 4.629430240187816e-05, + "loss": 4.9429, + "step": 29515 + }, + { + "epoch": 0.17554001332191455, + "grad_norm": 1.5812437534332275, + "learning_rate": 4.629405767940434e-05, + "loss": 4.7219, + "step": 29516 + }, + { + "epoch": 0.17554596060519553, + "grad_norm": 1.572482943534851, + "learning_rate": 4.629381294949698e-05, + "loss": 4.9071, + "step": 29517 + }, + { + "epoch": 0.17555190788847655, + "grad_norm": 1.8683935403823853, + "learning_rate": 4.629356821215618e-05, + "loss": 4.539, + "step": 29518 + }, + { + "epoch": 0.17555785517175754, + "grad_norm": 3.200904607772827, + "learning_rate": 4.629332346738201e-05, + "loss": 4.2734, + "step": 29519 + }, + { + "epoch": 0.17556380245503853, + "grad_norm": 2.051896572113037, + "learning_rate": 4.629307871517457e-05, + "loss": 5.0986, + "step": 29520 + }, + { + "epoch": 0.17556974973831954, + "grad_norm": 1.7927826642990112, + "learning_rate": 4.6292833955533926e-05, + "loss": 4.6581, + "step": 29521 + }, + { + "epoch": 0.17557569702160053, + "grad_norm": 1.6184303760528564, + "learning_rate": 4.629258918846018e-05, + "loss": 4.8106, + "step": 29522 + }, + { + "epoch": 0.17558164430488152, + "grad_norm": 1.4969747066497803, + "learning_rate": 4.62923444139534e-05, + "loss": 5.2787, + "step": 29523 + }, + { + "epoch": 0.17558759158816253, + "grad_norm": 1.471805214881897, + "learning_rate": 4.6292099632013695e-05, + "loss": 5.3599, + "step": 29524 + }, + { + "epoch": 0.17559353887144352, + "grad_norm": 1.3968273401260376, + "learning_rate": 4.629185484264113e-05, + "loss": 4.9754, + "step": 29525 + }, + { + "epoch": 0.1755994861547245, + "grad_norm": 1.627172589302063, + "learning_rate": 4.629161004583581e-05, + "loss": 4.3703, + "step": 29526 + }, + { + "epoch": 0.17560543343800553, + "grad_norm": 1.5334340333938599, + "learning_rate": 4.62913652415978e-05, + "loss": 4.8447, + "step": 29527 + }, + { + "epoch": 0.17561138072128651, + "grad_norm": 1.552454948425293, + "learning_rate": 4.6291120429927194e-05, + "loss": 4.823, + "step": 29528 + }, + { + "epoch": 0.1756173280045675, + "grad_norm": 1.4378019571304321, + "learning_rate": 4.629087561082408e-05, + "loss": 5.019, + "step": 29529 + }, + { + "epoch": 0.17562327528784852, + "grad_norm": 1.513752818107605, + "learning_rate": 4.6290630784288544e-05, + "loss": 4.7146, + "step": 29530 + }, + { + "epoch": 0.1756292225711295, + "grad_norm": 1.5130308866500854, + "learning_rate": 4.629038595032066e-05, + "loss": 4.5687, + "step": 29531 + }, + { + "epoch": 0.1756351698544105, + "grad_norm": 1.6177191734313965, + "learning_rate": 4.6290141108920534e-05, + "loss": 4.49, + "step": 29532 + }, + { + "epoch": 0.1756411171376915, + "grad_norm": 1.6133641004562378, + "learning_rate": 4.628989626008823e-05, + "loss": 4.6966, + "step": 29533 + }, + { + "epoch": 0.1756470644209725, + "grad_norm": 1.5740238428115845, + "learning_rate": 4.628965140382385e-05, + "loss": 4.8149, + "step": 29534 + }, + { + "epoch": 0.1756530117042535, + "grad_norm": 1.4787334203720093, + "learning_rate": 4.6289406540127466e-05, + "loss": 4.7759, + "step": 29535 + }, + { + "epoch": 0.1756589589875345, + "grad_norm": 1.5558816194534302, + "learning_rate": 4.628916166899917e-05, + "loss": 5.0831, + "step": 29536 + }, + { + "epoch": 0.1756649062708155, + "grad_norm": 1.3332229852676392, + "learning_rate": 4.628891679043905e-05, + "loss": 4.9866, + "step": 29537 + }, + { + "epoch": 0.17567085355409648, + "grad_norm": 1.5539603233337402, + "learning_rate": 4.6288671904447195e-05, + "loss": 4.96, + "step": 29538 + }, + { + "epoch": 0.1756768008373775, + "grad_norm": 1.4858051538467407, + "learning_rate": 4.628842701102368e-05, + "loss": 4.9161, + "step": 29539 + }, + { + "epoch": 0.17568274812065848, + "grad_norm": 1.6222684383392334, + "learning_rate": 4.62881821101686e-05, + "loss": 4.9328, + "step": 29540 + }, + { + "epoch": 0.17568869540393947, + "grad_norm": 1.6516577005386353, + "learning_rate": 4.6287937201882025e-05, + "loss": 4.7577, + "step": 29541 + }, + { + "epoch": 0.1756946426872205, + "grad_norm": 1.7349826097488403, + "learning_rate": 4.6287692286164056e-05, + "loss": 4.5927, + "step": 29542 + }, + { + "epoch": 0.17570058997050148, + "grad_norm": 1.4014586210250854, + "learning_rate": 4.6287447363014776e-05, + "loss": 4.8835, + "step": 29543 + }, + { + "epoch": 0.17570653725378246, + "grad_norm": 1.5037766695022583, + "learning_rate": 4.6287202432434265e-05, + "loss": 4.9221, + "step": 29544 + }, + { + "epoch": 0.17571248453706348, + "grad_norm": 1.5138404369354248, + "learning_rate": 4.628695749442261e-05, + "loss": 4.5962, + "step": 29545 + }, + { + "epoch": 0.17571843182034447, + "grad_norm": 1.5634385347366333, + "learning_rate": 4.6286712548979907e-05, + "loss": 5.2178, + "step": 29546 + }, + { + "epoch": 0.17572437910362546, + "grad_norm": 1.6049305200576782, + "learning_rate": 4.628646759610622e-05, + "loss": 5.1726, + "step": 29547 + }, + { + "epoch": 0.17573032638690647, + "grad_norm": 1.6202237606048584, + "learning_rate": 4.628622263580166e-05, + "loss": 4.8598, + "step": 29548 + }, + { + "epoch": 0.17573627367018746, + "grad_norm": 1.4801881313323975, + "learning_rate": 4.628597766806629e-05, + "loss": 4.9164, + "step": 29549 + }, + { + "epoch": 0.17574222095346845, + "grad_norm": 1.5014153718948364, + "learning_rate": 4.628573269290021e-05, + "loss": 4.3787, + "step": 29550 + }, + { + "epoch": 0.17574816823674946, + "grad_norm": 1.5468509197235107, + "learning_rate": 4.62854877103035e-05, + "loss": 4.9178, + "step": 29551 + }, + { + "epoch": 0.17575411552003045, + "grad_norm": 1.4622128009796143, + "learning_rate": 4.628524272027624e-05, + "loss": 4.8219, + "step": 29552 + }, + { + "epoch": 0.17576006280331144, + "grad_norm": 1.6060843467712402, + "learning_rate": 4.628499772281853e-05, + "loss": 4.869, + "step": 29553 + }, + { + "epoch": 0.17576601008659246, + "grad_norm": 1.7407468557357788, + "learning_rate": 4.628475271793044e-05, + "loss": 4.7171, + "step": 29554 + }, + { + "epoch": 0.17577195736987344, + "grad_norm": 1.5435397624969482, + "learning_rate": 4.628450770561207e-05, + "loss": 4.6929, + "step": 29555 + }, + { + "epoch": 0.17577790465315443, + "grad_norm": 1.5211220979690552, + "learning_rate": 4.628426268586349e-05, + "loss": 4.6811, + "step": 29556 + }, + { + "epoch": 0.17578385193643545, + "grad_norm": 1.3432724475860596, + "learning_rate": 4.6284017658684796e-05, + "loss": 4.8499, + "step": 29557 + }, + { + "epoch": 0.17578979921971644, + "grad_norm": 1.6592440605163574, + "learning_rate": 4.628377262407608e-05, + "loss": 4.4278, + "step": 29558 + }, + { + "epoch": 0.17579574650299742, + "grad_norm": 1.5314370393753052, + "learning_rate": 4.6283527582037415e-05, + "loss": 5.0514, + "step": 29559 + }, + { + "epoch": 0.17580169378627844, + "grad_norm": 1.8792412281036377, + "learning_rate": 4.6283282532568884e-05, + "loss": 4.3201, + "step": 29560 + }, + { + "epoch": 0.17580764106955943, + "grad_norm": 1.726537823677063, + "learning_rate": 4.628303747567058e-05, + "loss": 4.4524, + "step": 29561 + }, + { + "epoch": 0.17581358835284042, + "grad_norm": 1.5222519636154175, + "learning_rate": 4.628279241134259e-05, + "loss": 4.7075, + "step": 29562 + }, + { + "epoch": 0.17581953563612143, + "grad_norm": 1.6036890745162964, + "learning_rate": 4.6282547339585e-05, + "loss": 4.6974, + "step": 29563 + }, + { + "epoch": 0.17582548291940242, + "grad_norm": 1.6295074224472046, + "learning_rate": 4.628230226039789e-05, + "loss": 4.4021, + "step": 29564 + }, + { + "epoch": 0.1758314302026834, + "grad_norm": 2.6549839973449707, + "learning_rate": 4.628205717378135e-05, + "loss": 3.8639, + "step": 29565 + }, + { + "epoch": 0.17583737748596442, + "grad_norm": 2.752455234527588, + "learning_rate": 4.628181207973547e-05, + "loss": 3.745, + "step": 29566 + }, + { + "epoch": 0.1758433247692454, + "grad_norm": 2.4327378273010254, + "learning_rate": 4.6281566978260314e-05, + "loss": 3.4675, + "step": 29567 + }, + { + "epoch": 0.1758492720525264, + "grad_norm": 2.2893288135528564, + "learning_rate": 4.628132186935599e-05, + "loss": 3.4223, + "step": 29568 + }, + { + "epoch": 0.17585521933580742, + "grad_norm": 2.6514787673950195, + "learning_rate": 4.628107675302258e-05, + "loss": 3.6378, + "step": 29569 + }, + { + "epoch": 0.1758611666190884, + "grad_norm": 1.501243233680725, + "learning_rate": 4.628083162926016e-05, + "loss": 4.9402, + "step": 29570 + }, + { + "epoch": 0.1758671139023694, + "grad_norm": 2.5400307178497314, + "learning_rate": 4.6280586498068824e-05, + "loss": 3.9097, + "step": 29571 + }, + { + "epoch": 0.17587306118565038, + "grad_norm": 3.0715131759643555, + "learning_rate": 4.628034135944865e-05, + "loss": 3.8084, + "step": 29572 + }, + { + "epoch": 0.1758790084689314, + "grad_norm": 2.320291042327881, + "learning_rate": 4.628009621339974e-05, + "loss": 3.743, + "step": 29573 + }, + { + "epoch": 0.17588495575221239, + "grad_norm": 2.653029441833496, + "learning_rate": 4.627985105992216e-05, + "loss": 3.5106, + "step": 29574 + }, + { + "epoch": 0.17589090303549337, + "grad_norm": 2.5279390811920166, + "learning_rate": 4.6279605899016007e-05, + "loss": 3.6074, + "step": 29575 + }, + { + "epoch": 0.1758968503187744, + "grad_norm": 2.6520915031433105, + "learning_rate": 4.6279360730681364e-05, + "loss": 3.5559, + "step": 29576 + }, + { + "epoch": 0.17590279760205538, + "grad_norm": 1.5509624481201172, + "learning_rate": 4.627911555491831e-05, + "loss": 4.8954, + "step": 29577 + }, + { + "epoch": 0.17590874488533637, + "grad_norm": 2.044759750366211, + "learning_rate": 4.627887037172695e-05, + "loss": 3.7401, + "step": 29578 + }, + { + "epoch": 0.17591469216861738, + "grad_norm": 2.512817144393921, + "learning_rate": 4.6278625181107336e-05, + "loss": 3.3898, + "step": 29579 + }, + { + "epoch": 0.17592063945189837, + "grad_norm": 2.3796133995056152, + "learning_rate": 4.627837998305959e-05, + "loss": 3.5277, + "step": 29580 + }, + { + "epoch": 0.17592658673517936, + "grad_norm": 2.6435763835906982, + "learning_rate": 4.6278134777583774e-05, + "loss": 3.6078, + "step": 29581 + }, + { + "epoch": 0.17593253401846037, + "grad_norm": 1.9326622486114502, + "learning_rate": 4.6277889564679986e-05, + "loss": 4.3017, + "step": 29582 + }, + { + "epoch": 0.17593848130174136, + "grad_norm": 2.0501444339752197, + "learning_rate": 4.62776443443483e-05, + "loss": 4.2909, + "step": 29583 + }, + { + "epoch": 0.17594442858502235, + "grad_norm": 2.1053049564361572, + "learning_rate": 4.6277399116588816e-05, + "loss": 3.4639, + "step": 29584 + }, + { + "epoch": 0.17595037586830337, + "grad_norm": 2.2305474281311035, + "learning_rate": 4.627715388140161e-05, + "loss": 3.6551, + "step": 29585 + }, + { + "epoch": 0.17595632315158435, + "grad_norm": 2.328937292098999, + "learning_rate": 4.6276908638786766e-05, + "loss": 3.2528, + "step": 29586 + }, + { + "epoch": 0.17596227043486534, + "grad_norm": 3.2846357822418213, + "learning_rate": 4.627666338874437e-05, + "loss": 3.7581, + "step": 29587 + }, + { + "epoch": 0.17596821771814636, + "grad_norm": 2.145848512649536, + "learning_rate": 4.627641813127452e-05, + "loss": 3.6736, + "step": 29588 + }, + { + "epoch": 0.17597416500142735, + "grad_norm": 2.367215871810913, + "learning_rate": 4.627617286637729e-05, + "loss": 3.3043, + "step": 29589 + }, + { + "epoch": 0.17598011228470833, + "grad_norm": 2.314913272857666, + "learning_rate": 4.627592759405276e-05, + "loss": 3.3871, + "step": 29590 + }, + { + "epoch": 0.17598605956798935, + "grad_norm": 2.3208961486816406, + "learning_rate": 4.627568231430103e-05, + "loss": 3.3427, + "step": 29591 + }, + { + "epoch": 0.17599200685127034, + "grad_norm": 2.2277936935424805, + "learning_rate": 4.627543702712218e-05, + "loss": 3.4393, + "step": 29592 + }, + { + "epoch": 0.17599795413455133, + "grad_norm": 2.6522443294525146, + "learning_rate": 4.627519173251629e-05, + "loss": 3.4554, + "step": 29593 + }, + { + "epoch": 0.17600390141783234, + "grad_norm": 1.6064810752868652, + "learning_rate": 4.6274946430483454e-05, + "loss": 5.2487, + "step": 29594 + }, + { + "epoch": 0.17600984870111333, + "grad_norm": 2.488597869873047, + "learning_rate": 4.627470112102375e-05, + "loss": 3.8507, + "step": 29595 + }, + { + "epoch": 0.17601579598439432, + "grad_norm": 2.4922280311584473, + "learning_rate": 4.627445580413727e-05, + "loss": 3.901, + "step": 29596 + }, + { + "epoch": 0.17602174326767533, + "grad_norm": 2.5545835494995117, + "learning_rate": 4.62742104798241e-05, + "loss": 3.7327, + "step": 29597 + }, + { + "epoch": 0.17602769055095632, + "grad_norm": 2.674534559249878, + "learning_rate": 4.627396514808432e-05, + "loss": 3.6846, + "step": 29598 + }, + { + "epoch": 0.1760336378342373, + "grad_norm": 2.51946759223938, + "learning_rate": 4.627371980891801e-05, + "loss": 3.504, + "step": 29599 + }, + { + "epoch": 0.17603958511751833, + "grad_norm": 1.584033489227295, + "learning_rate": 4.6273474462325286e-05, + "loss": 4.9813, + "step": 29600 + }, + { + "epoch": 0.17604553240079931, + "grad_norm": 1.5800496339797974, + "learning_rate": 4.6273229108306195e-05, + "loss": 5.6641, + "step": 29601 + }, + { + "epoch": 0.1760514796840803, + "grad_norm": 1.5663219690322876, + "learning_rate": 4.627298374686084e-05, + "loss": 5.6077, + "step": 29602 + }, + { + "epoch": 0.17605742696736132, + "grad_norm": 1.5315394401550293, + "learning_rate": 4.627273837798932e-05, + "loss": 5.3647, + "step": 29603 + }, + { + "epoch": 0.1760633742506423, + "grad_norm": 1.6742242574691772, + "learning_rate": 4.627249300169169e-05, + "loss": 5.2066, + "step": 29604 + }, + { + "epoch": 0.1760693215339233, + "grad_norm": 1.6399402618408203, + "learning_rate": 4.627224761796806e-05, + "loss": 5.0195, + "step": 29605 + }, + { + "epoch": 0.1760752688172043, + "grad_norm": 1.7168047428131104, + "learning_rate": 4.627200222681851e-05, + "loss": 5.3056, + "step": 29606 + }, + { + "epoch": 0.1760812161004853, + "grad_norm": 1.6890738010406494, + "learning_rate": 4.627175682824312e-05, + "loss": 5.1811, + "step": 29607 + }, + { + "epoch": 0.1760871633837663, + "grad_norm": 1.7669142484664917, + "learning_rate": 4.627151142224198e-05, + "loss": 5.2459, + "step": 29608 + }, + { + "epoch": 0.1760931106670473, + "grad_norm": 1.4989925622940063, + "learning_rate": 4.627126600881517e-05, + "loss": 5.092, + "step": 29609 + }, + { + "epoch": 0.1760990579503283, + "grad_norm": 1.4541029930114746, + "learning_rate": 4.627102058796279e-05, + "loss": 5.0705, + "step": 29610 + }, + { + "epoch": 0.17610500523360928, + "grad_norm": 2.039470911026001, + "learning_rate": 4.627077515968492e-05, + "loss": 4.1636, + "step": 29611 + }, + { + "epoch": 0.1761109525168903, + "grad_norm": 3.1738526821136475, + "learning_rate": 4.6270529723981635e-05, + "loss": 2.1184, + "step": 29612 + }, + { + "epoch": 0.17611689980017128, + "grad_norm": 1.7128700017929077, + "learning_rate": 4.6270284280853024e-05, + "loss": 5.7775, + "step": 29613 + }, + { + "epoch": 0.17612284708345227, + "grad_norm": 1.7605071067810059, + "learning_rate": 4.627003883029918e-05, + "loss": 5.6578, + "step": 29614 + }, + { + "epoch": 0.1761287943667333, + "grad_norm": 1.6726125478744507, + "learning_rate": 4.6269793372320186e-05, + "loss": 5.3621, + "step": 29615 + }, + { + "epoch": 0.17613474165001428, + "grad_norm": 1.6924387216567993, + "learning_rate": 4.626954790691612e-05, + "loss": 5.2866, + "step": 29616 + }, + { + "epoch": 0.17614068893329526, + "grad_norm": 1.705000400543213, + "learning_rate": 4.6269302434087085e-05, + "loss": 5.009, + "step": 29617 + }, + { + "epoch": 0.17614663621657628, + "grad_norm": 1.6577481031417847, + "learning_rate": 4.6269056953833157e-05, + "loss": 5.4761, + "step": 29618 + }, + { + "epoch": 0.17615258349985727, + "grad_norm": 1.635854721069336, + "learning_rate": 4.6268811466154415e-05, + "loss": 5.3624, + "step": 29619 + }, + { + "epoch": 0.17615853078313826, + "grad_norm": 1.6608973741531372, + "learning_rate": 4.626856597105095e-05, + "loss": 5.4398, + "step": 29620 + }, + { + "epoch": 0.17616447806641927, + "grad_norm": 1.5028787851333618, + "learning_rate": 4.626832046852285e-05, + "loss": 5.3025, + "step": 29621 + }, + { + "epoch": 0.17617042534970026, + "grad_norm": 2.694622278213501, + "learning_rate": 4.62680749585702e-05, + "loss": 2.389, + "step": 29622 + }, + { + "epoch": 0.17617637263298125, + "grad_norm": 1.6484723091125488, + "learning_rate": 4.6267829441193086e-05, + "loss": 4.871, + "step": 29623 + }, + { + "epoch": 0.17618231991626226, + "grad_norm": 1.6752315759658813, + "learning_rate": 4.626758391639159e-05, + "loss": 5.1089, + "step": 29624 + }, + { + "epoch": 0.17618826719954325, + "grad_norm": 1.8165408372879028, + "learning_rate": 4.62673383841658e-05, + "loss": 5.1408, + "step": 29625 + }, + { + "epoch": 0.17619421448282424, + "grad_norm": 1.7555296421051025, + "learning_rate": 4.6267092844515804e-05, + "loss": 5.2196, + "step": 29626 + }, + { + "epoch": 0.17620016176610526, + "grad_norm": 1.6462376117706299, + "learning_rate": 4.626684729744168e-05, + "loss": 5.2127, + "step": 29627 + }, + { + "epoch": 0.17620610904938624, + "grad_norm": 1.7403783798217773, + "learning_rate": 4.6266601742943526e-05, + "loss": 5.1372, + "step": 29628 + }, + { + "epoch": 0.17621205633266723, + "grad_norm": 2.6064391136169434, + "learning_rate": 4.626635618102142e-05, + "loss": 5.3963, + "step": 29629 + }, + { + "epoch": 0.17621800361594822, + "grad_norm": 1.4826772212982178, + "learning_rate": 4.6266110611675446e-05, + "loss": 5.7049, + "step": 29630 + }, + { + "epoch": 0.17622395089922924, + "grad_norm": 1.685837984085083, + "learning_rate": 4.62658650349057e-05, + "loss": 5.117, + "step": 29631 + }, + { + "epoch": 0.17622989818251023, + "grad_norm": 1.5930708646774292, + "learning_rate": 4.626561945071225e-05, + "loss": 5.1709, + "step": 29632 + }, + { + "epoch": 0.1762358454657912, + "grad_norm": 1.7052996158599854, + "learning_rate": 4.6265373859095197e-05, + "loss": 5.3743, + "step": 29633 + }, + { + "epoch": 0.17624179274907223, + "grad_norm": 1.9218865633010864, + "learning_rate": 4.626512826005462e-05, + "loss": 5.0207, + "step": 29634 + }, + { + "epoch": 0.17624774003235322, + "grad_norm": 2.1410880088806152, + "learning_rate": 4.62648826535906e-05, + "loss": 4.7898, + "step": 29635 + }, + { + "epoch": 0.1762536873156342, + "grad_norm": 3.278724431991577, + "learning_rate": 4.626463703970324e-05, + "loss": 3.7456, + "step": 29636 + }, + { + "epoch": 0.17625963459891522, + "grad_norm": 1.6557966470718384, + "learning_rate": 4.6264391418392615e-05, + "loss": 5.1905, + "step": 29637 + }, + { + "epoch": 0.1762655818821962, + "grad_norm": 1.3662563562393188, + "learning_rate": 4.6264145789658804e-05, + "loss": 5.2232, + "step": 29638 + }, + { + "epoch": 0.1762715291654772, + "grad_norm": 1.5638326406478882, + "learning_rate": 4.62639001535019e-05, + "loss": 5.0933, + "step": 29639 + }, + { + "epoch": 0.1762774764487582, + "grad_norm": 1.81962251663208, + "learning_rate": 4.6263654509921996e-05, + "loss": 4.6625, + "step": 29640 + }, + { + "epoch": 0.1762834237320392, + "grad_norm": 1.5421823263168335, + "learning_rate": 4.626340885891916e-05, + "loss": 5.0372, + "step": 29641 + }, + { + "epoch": 0.1762893710153202, + "grad_norm": 1.8756135702133179, + "learning_rate": 4.626316320049349e-05, + "loss": 5.224, + "step": 29642 + }, + { + "epoch": 0.1762953182986012, + "grad_norm": 1.617411494255066, + "learning_rate": 4.6262917534645076e-05, + "loss": 5.3449, + "step": 29643 + }, + { + "epoch": 0.1763012655818822, + "grad_norm": 1.3965401649475098, + "learning_rate": 4.626267186137399e-05, + "loss": 5.4929, + "step": 29644 + }, + { + "epoch": 0.17630721286516318, + "grad_norm": 1.4743956327438354, + "learning_rate": 4.626242618068033e-05, + "loss": 5.3105, + "step": 29645 + }, + { + "epoch": 0.1763131601484442, + "grad_norm": 1.5603059530258179, + "learning_rate": 4.626218049256417e-05, + "loss": 5.2059, + "step": 29646 + }, + { + "epoch": 0.17631910743172519, + "grad_norm": 1.5562357902526855, + "learning_rate": 4.626193479702561e-05, + "loss": 5.0752, + "step": 29647 + }, + { + "epoch": 0.17632505471500617, + "grad_norm": 1.4330555200576782, + "learning_rate": 4.6261689094064724e-05, + "loss": 5.0991, + "step": 29648 + }, + { + "epoch": 0.1763310019982872, + "grad_norm": 1.636109709739685, + "learning_rate": 4.62614433836816e-05, + "loss": 5.2, + "step": 29649 + }, + { + "epoch": 0.17633694928156818, + "grad_norm": 1.4994865655899048, + "learning_rate": 4.626119766587633e-05, + "loss": 5.4368, + "step": 29650 + }, + { + "epoch": 0.17634289656484917, + "grad_norm": 1.5928007364273071, + "learning_rate": 4.6260951940648996e-05, + "loss": 5.3432, + "step": 29651 + }, + { + "epoch": 0.17634884384813018, + "grad_norm": 2.4773452281951904, + "learning_rate": 4.626070620799968e-05, + "loss": 4.6023, + "step": 29652 + }, + { + "epoch": 0.17635479113141117, + "grad_norm": 1.4862966537475586, + "learning_rate": 4.626046046792847e-05, + "loss": 5.2271, + "step": 29653 + }, + { + "epoch": 0.17636073841469216, + "grad_norm": 1.659691333770752, + "learning_rate": 4.626021472043546e-05, + "loss": 5.1621, + "step": 29654 + }, + { + "epoch": 0.17636668569797317, + "grad_norm": 1.708454966545105, + "learning_rate": 4.625996896552073e-05, + "loss": 4.9272, + "step": 29655 + }, + { + "epoch": 0.17637263298125416, + "grad_norm": 1.7151225805282593, + "learning_rate": 4.625972320318435e-05, + "loss": 5.0272, + "step": 29656 + }, + { + "epoch": 0.17637858026453515, + "grad_norm": 1.635591983795166, + "learning_rate": 4.625947743342644e-05, + "loss": 5.1541, + "step": 29657 + }, + { + "epoch": 0.17638452754781617, + "grad_norm": 1.6878983974456787, + "learning_rate": 4.625923165624705e-05, + "loss": 5.1822, + "step": 29658 + }, + { + "epoch": 0.17639047483109715, + "grad_norm": 1.5905377864837646, + "learning_rate": 4.625898587164628e-05, + "loss": 4.9331, + "step": 29659 + }, + { + "epoch": 0.17639642211437814, + "grad_norm": 1.5988421440124512, + "learning_rate": 4.625874007962423e-05, + "loss": 4.811, + "step": 29660 + }, + { + "epoch": 0.17640236939765916, + "grad_norm": 1.725674033164978, + "learning_rate": 4.625849428018096e-05, + "loss": 4.95, + "step": 29661 + }, + { + "epoch": 0.17640831668094015, + "grad_norm": 1.6319259405136108, + "learning_rate": 4.625824847331658e-05, + "loss": 4.8133, + "step": 29662 + }, + { + "epoch": 0.17641426396422114, + "grad_norm": 1.6534069776535034, + "learning_rate": 4.625800265903116e-05, + "loss": 4.8914, + "step": 29663 + }, + { + "epoch": 0.17642021124750215, + "grad_norm": 1.6242649555206299, + "learning_rate": 4.6257756837324793e-05, + "loss": 5.1348, + "step": 29664 + }, + { + "epoch": 0.17642615853078314, + "grad_norm": 1.59992253780365, + "learning_rate": 4.625751100819757e-05, + "loss": 5.5775, + "step": 29665 + }, + { + "epoch": 0.17643210581406413, + "grad_norm": 1.8516936302185059, + "learning_rate": 4.625726517164956e-05, + "loss": 4.8874, + "step": 29666 + }, + { + "epoch": 0.17643805309734514, + "grad_norm": 2.0659658908843994, + "learning_rate": 4.625701932768086e-05, + "loss": 4.8295, + "step": 29667 + }, + { + "epoch": 0.17644400038062613, + "grad_norm": 1.914340615272522, + "learning_rate": 4.625677347629156e-05, + "loss": 4.8001, + "step": 29668 + }, + { + "epoch": 0.17644994766390712, + "grad_norm": 1.76264226436615, + "learning_rate": 4.6256527617481734e-05, + "loss": 5.0296, + "step": 29669 + }, + { + "epoch": 0.17645589494718814, + "grad_norm": 2.414245367050171, + "learning_rate": 4.625628175125147e-05, + "loss": 4.4596, + "step": 29670 + }, + { + "epoch": 0.17646184223046912, + "grad_norm": 2.4253740310668945, + "learning_rate": 4.625603587760087e-05, + "loss": 4.8557, + "step": 29671 + }, + { + "epoch": 0.1764677895137501, + "grad_norm": 1.5761579275131226, + "learning_rate": 4.6255789996529995e-05, + "loss": 5.3967, + "step": 29672 + }, + { + "epoch": 0.17647373679703113, + "grad_norm": 1.6232905387878418, + "learning_rate": 4.625554410803895e-05, + "loss": 5.2305, + "step": 29673 + }, + { + "epoch": 0.17647968408031212, + "grad_norm": 1.5074714422225952, + "learning_rate": 4.6255298212127806e-05, + "loss": 5.0091, + "step": 29674 + }, + { + "epoch": 0.1764856313635931, + "grad_norm": 1.4851216077804565, + "learning_rate": 4.625505230879667e-05, + "loss": 5.3812, + "step": 29675 + }, + { + "epoch": 0.17649157864687412, + "grad_norm": 1.5750563144683838, + "learning_rate": 4.62548063980456e-05, + "loss": 5.1194, + "step": 29676 + }, + { + "epoch": 0.1764975259301551, + "grad_norm": 1.6650339365005493, + "learning_rate": 4.625456047987471e-05, + "loss": 5.7083, + "step": 29677 + }, + { + "epoch": 0.1765034732134361, + "grad_norm": 1.6024653911590576, + "learning_rate": 4.625431455428407e-05, + "loss": 5.435, + "step": 29678 + }, + { + "epoch": 0.1765094204967171, + "grad_norm": 2.434255361557007, + "learning_rate": 4.625406862127376e-05, + "loss": 4.4856, + "step": 29679 + }, + { + "epoch": 0.1765153677799981, + "grad_norm": 2.248991012573242, + "learning_rate": 4.6253822680843885e-05, + "loss": 4.5724, + "step": 29680 + }, + { + "epoch": 0.1765213150632791, + "grad_norm": 2.187962293624878, + "learning_rate": 4.625357673299451e-05, + "loss": 4.7556, + "step": 29681 + }, + { + "epoch": 0.1765272623465601, + "grad_norm": 1.6530205011367798, + "learning_rate": 4.625333077772574e-05, + "loss": 5.1289, + "step": 29682 + }, + { + "epoch": 0.1765332096298411, + "grad_norm": 1.3826985359191895, + "learning_rate": 4.625308481503765e-05, + "loss": 5.2029, + "step": 29683 + }, + { + "epoch": 0.17653915691312208, + "grad_norm": 1.4573781490325928, + "learning_rate": 4.625283884493032e-05, + "loss": 5.1572, + "step": 29684 + }, + { + "epoch": 0.1765451041964031, + "grad_norm": 1.4935249090194702, + "learning_rate": 4.6252592867403856e-05, + "loss": 5.0828, + "step": 29685 + }, + { + "epoch": 0.17655105147968408, + "grad_norm": 1.6328359842300415, + "learning_rate": 4.625234688245832e-05, + "loss": 5.1604, + "step": 29686 + }, + { + "epoch": 0.17655699876296507, + "grad_norm": 1.4190014600753784, + "learning_rate": 4.6252100890093816e-05, + "loss": 4.9567, + "step": 29687 + }, + { + "epoch": 0.17656294604624606, + "grad_norm": 1.7209579944610596, + "learning_rate": 4.625185489031042e-05, + "loss": 4.412, + "step": 29688 + }, + { + "epoch": 0.17656889332952708, + "grad_norm": 1.5644607543945312, + "learning_rate": 4.625160888310822e-05, + "loss": 4.9651, + "step": 29689 + }, + { + "epoch": 0.17657484061280806, + "grad_norm": 1.498563289642334, + "learning_rate": 4.62513628684873e-05, + "loss": 5.4318, + "step": 29690 + }, + { + "epoch": 0.17658078789608905, + "grad_norm": 1.4302527904510498, + "learning_rate": 4.625111684644776e-05, + "loss": 4.9763, + "step": 29691 + }, + { + "epoch": 0.17658673517937007, + "grad_norm": 1.5234086513519287, + "learning_rate": 4.6250870816989664e-05, + "loss": 4.9747, + "step": 29692 + }, + { + "epoch": 0.17659268246265106, + "grad_norm": 1.611867904663086, + "learning_rate": 4.6250624780113116e-05, + "loss": 4.8275, + "step": 29693 + }, + { + "epoch": 0.17659862974593205, + "grad_norm": 2.0380537509918213, + "learning_rate": 4.625037873581819e-05, + "loss": 5.1795, + "step": 29694 + }, + { + "epoch": 0.17660457702921306, + "grad_norm": 1.433166742324829, + "learning_rate": 4.625013268410498e-05, + "loss": 5.3237, + "step": 29695 + }, + { + "epoch": 0.17661052431249405, + "grad_norm": 1.8627065420150757, + "learning_rate": 4.6249886624973564e-05, + "loss": 5.28, + "step": 29696 + }, + { + "epoch": 0.17661647159577504, + "grad_norm": 1.572050929069519, + "learning_rate": 4.6249640558424036e-05, + "loss": 5.3744, + "step": 29697 + }, + { + "epoch": 0.17662241887905605, + "grad_norm": 3.271996021270752, + "learning_rate": 4.624939448445648e-05, + "loss": 3.856, + "step": 29698 + }, + { + "epoch": 0.17662836616233704, + "grad_norm": 1.7473957538604736, + "learning_rate": 4.624914840307098e-05, + "loss": 4.7745, + "step": 29699 + }, + { + "epoch": 0.17663431344561803, + "grad_norm": 1.5957887172698975, + "learning_rate": 4.62489023142676e-05, + "loss": 5.3401, + "step": 29700 + }, + { + "epoch": 0.17664026072889905, + "grad_norm": 1.519698977470398, + "learning_rate": 4.624865621804647e-05, + "loss": 5.2996, + "step": 29701 + }, + { + "epoch": 0.17664620801218003, + "grad_norm": 1.4777617454528809, + "learning_rate": 4.624841011440765e-05, + "loss": 5.2181, + "step": 29702 + }, + { + "epoch": 0.17665215529546102, + "grad_norm": 1.5206866264343262, + "learning_rate": 4.624816400335123e-05, + "loss": 5.3529, + "step": 29703 + }, + { + "epoch": 0.17665810257874204, + "grad_norm": 1.6352920532226562, + "learning_rate": 4.6247917884877296e-05, + "loss": 5.3274, + "step": 29704 + }, + { + "epoch": 0.17666404986202303, + "grad_norm": 1.572554111480713, + "learning_rate": 4.6247671758985934e-05, + "loss": 5.3941, + "step": 29705 + }, + { + "epoch": 0.17666999714530401, + "grad_norm": 2.0956475734710693, + "learning_rate": 4.624742562567722e-05, + "loss": 4.0032, + "step": 29706 + }, + { + "epoch": 0.17667594442858503, + "grad_norm": 1.382948398590088, + "learning_rate": 4.624717948495126e-05, + "loss": 5.539, + "step": 29707 + }, + { + "epoch": 0.17668189171186602, + "grad_norm": 1.406977653503418, + "learning_rate": 4.6246933336808126e-05, + "loss": 5.5437, + "step": 29708 + }, + { + "epoch": 0.176687838995147, + "grad_norm": 1.6577895879745483, + "learning_rate": 4.62466871812479e-05, + "loss": 5.1155, + "step": 29709 + }, + { + "epoch": 0.17669378627842802, + "grad_norm": 1.9551897048950195, + "learning_rate": 4.624644101827069e-05, + "loss": 4.6531, + "step": 29710 + }, + { + "epoch": 0.176699733561709, + "grad_norm": 2.409532308578491, + "learning_rate": 4.624619484787655e-05, + "loss": 4.5918, + "step": 29711 + }, + { + "epoch": 0.17670568084499, + "grad_norm": 1.8758010864257812, + "learning_rate": 4.6245948670065594e-05, + "loss": 4.9051, + "step": 29712 + }, + { + "epoch": 0.17671162812827101, + "grad_norm": 1.777886152267456, + "learning_rate": 4.6245702484837894e-05, + "loss": 5.1955, + "step": 29713 + }, + { + "epoch": 0.176717575411552, + "grad_norm": 1.6413220167160034, + "learning_rate": 4.624545629219354e-05, + "loss": 4.9031, + "step": 29714 + }, + { + "epoch": 0.176723522694833, + "grad_norm": 1.7025271654129028, + "learning_rate": 4.624521009213262e-05, + "loss": 5.2195, + "step": 29715 + }, + { + "epoch": 0.176729469978114, + "grad_norm": 1.4530411958694458, + "learning_rate": 4.6244963884655204e-05, + "loss": 5.3771, + "step": 29716 + }, + { + "epoch": 0.176735417261395, + "grad_norm": 1.4960378408432007, + "learning_rate": 4.62447176697614e-05, + "loss": 5.1019, + "step": 29717 + }, + { + "epoch": 0.17674136454467598, + "grad_norm": 1.863013505935669, + "learning_rate": 4.624447144745129e-05, + "loss": 4.7721, + "step": 29718 + }, + { + "epoch": 0.176747311827957, + "grad_norm": 1.7837802171707153, + "learning_rate": 4.624422521772495e-05, + "loss": 5.2047, + "step": 29719 + }, + { + "epoch": 0.176753259111238, + "grad_norm": 2.3820879459381104, + "learning_rate": 4.6243978980582456e-05, + "loss": 4.7627, + "step": 29720 + }, + { + "epoch": 0.17675920639451898, + "grad_norm": 2.2981441020965576, + "learning_rate": 4.6243732736023926e-05, + "loss": 4.7149, + "step": 29721 + }, + { + "epoch": 0.1767651536778, + "grad_norm": 1.916215181350708, + "learning_rate": 4.6243486484049426e-05, + "loss": 4.6663, + "step": 29722 + }, + { + "epoch": 0.17677110096108098, + "grad_norm": 1.7512091398239136, + "learning_rate": 4.624324022465904e-05, + "loss": 5.0612, + "step": 29723 + }, + { + "epoch": 0.17677704824436197, + "grad_norm": 1.513918161392212, + "learning_rate": 4.6242993957852855e-05, + "loss": 5.131, + "step": 29724 + }, + { + "epoch": 0.17678299552764298, + "grad_norm": 1.5861341953277588, + "learning_rate": 4.6242747683630966e-05, + "loss": 5.1035, + "step": 29725 + }, + { + "epoch": 0.17678894281092397, + "grad_norm": 1.5094410181045532, + "learning_rate": 4.6242501401993454e-05, + "loss": 5.0484, + "step": 29726 + }, + { + "epoch": 0.17679489009420496, + "grad_norm": 1.5102661848068237, + "learning_rate": 4.6242255112940405e-05, + "loss": 5.0001, + "step": 29727 + }, + { + "epoch": 0.17680083737748598, + "grad_norm": 1.8255689144134521, + "learning_rate": 4.62420088164719e-05, + "loss": 5.4749, + "step": 29728 + }, + { + "epoch": 0.17680678466076696, + "grad_norm": 1.9394241571426392, + "learning_rate": 4.624176251258803e-05, + "loss": 5.3997, + "step": 29729 + }, + { + "epoch": 0.17681273194404795, + "grad_norm": 1.6546714305877686, + "learning_rate": 4.624151620128888e-05, + "loss": 5.396, + "step": 29730 + }, + { + "epoch": 0.17681867922732897, + "grad_norm": 1.55864679813385, + "learning_rate": 4.6241269882574534e-05, + "loss": 5.145, + "step": 29731 + }, + { + "epoch": 0.17682462651060996, + "grad_norm": 1.5503425598144531, + "learning_rate": 4.6241023556445084e-05, + "loss": 5.0982, + "step": 29732 + }, + { + "epoch": 0.17683057379389094, + "grad_norm": 1.6777262687683105, + "learning_rate": 4.624077722290061e-05, + "loss": 4.8005, + "step": 29733 + }, + { + "epoch": 0.17683652107717196, + "grad_norm": 1.4268922805786133, + "learning_rate": 4.62405308819412e-05, + "loss": 5.0045, + "step": 29734 + }, + { + "epoch": 0.17684246836045295, + "grad_norm": 1.7886883020401, + "learning_rate": 4.6240284533566946e-05, + "loss": 4.8464, + "step": 29735 + }, + { + "epoch": 0.17684841564373394, + "grad_norm": 1.5553979873657227, + "learning_rate": 4.624003817777792e-05, + "loss": 5.3561, + "step": 29736 + }, + { + "epoch": 0.17685436292701495, + "grad_norm": 1.508204698562622, + "learning_rate": 4.6239791814574224e-05, + "loss": 5.3903, + "step": 29737 + }, + { + "epoch": 0.17686031021029594, + "grad_norm": 1.3388547897338867, + "learning_rate": 4.623954544395593e-05, + "loss": 5.488, + "step": 29738 + }, + { + "epoch": 0.17686625749357693, + "grad_norm": 1.518465280532837, + "learning_rate": 4.623929906592313e-05, + "loss": 5.4595, + "step": 29739 + }, + { + "epoch": 0.17687220477685794, + "grad_norm": 1.5171095132827759, + "learning_rate": 4.623905268047592e-05, + "loss": 5.6942, + "step": 29740 + }, + { + "epoch": 0.17687815206013893, + "grad_norm": 1.4345729351043701, + "learning_rate": 4.623880628761436e-05, + "loss": 5.598, + "step": 29741 + }, + { + "epoch": 0.17688409934341992, + "grad_norm": 1.3692567348480225, + "learning_rate": 4.623855988733856e-05, + "loss": 5.8299, + "step": 29742 + }, + { + "epoch": 0.17689004662670094, + "grad_norm": 1.6717381477355957, + "learning_rate": 4.62383134796486e-05, + "loss": 4.9299, + "step": 29743 + }, + { + "epoch": 0.17689599390998192, + "grad_norm": 1.6725213527679443, + "learning_rate": 4.6238067064544565e-05, + "loss": 4.8448, + "step": 29744 + }, + { + "epoch": 0.1769019411932629, + "grad_norm": 1.885776400566101, + "learning_rate": 4.623782064202653e-05, + "loss": 4.8159, + "step": 29745 + }, + { + "epoch": 0.17690788847654393, + "grad_norm": 1.7408405542373657, + "learning_rate": 4.6237574212094605e-05, + "loss": 5.3162, + "step": 29746 + }, + { + "epoch": 0.17691383575982492, + "grad_norm": 1.4585955142974854, + "learning_rate": 4.6237327774748856e-05, + "loss": 5.933, + "step": 29747 + }, + { + "epoch": 0.1769197830431059, + "grad_norm": 1.6204352378845215, + "learning_rate": 4.623708132998937e-05, + "loss": 5.4457, + "step": 29748 + }, + { + "epoch": 0.1769257303263869, + "grad_norm": 1.4227222204208374, + "learning_rate": 4.623683487781625e-05, + "loss": 5.387, + "step": 29749 + }, + { + "epoch": 0.1769316776096679, + "grad_norm": 1.4104609489440918, + "learning_rate": 4.623658841822956e-05, + "loss": 5.5075, + "step": 29750 + }, + { + "epoch": 0.1769376248929489, + "grad_norm": 2.1077404022216797, + "learning_rate": 4.6236341951229406e-05, + "loss": 4.343, + "step": 29751 + }, + { + "epoch": 0.17694357217622989, + "grad_norm": 1.820806622505188, + "learning_rate": 4.6236095476815855e-05, + "loss": 4.8388, + "step": 29752 + }, + { + "epoch": 0.1769495194595109, + "grad_norm": 1.6640592813491821, + "learning_rate": 4.623584899498901e-05, + "loss": 5.129, + "step": 29753 + }, + { + "epoch": 0.1769554667427919, + "grad_norm": 1.6439399719238281, + "learning_rate": 4.623560250574894e-05, + "loss": 5.1712, + "step": 29754 + }, + { + "epoch": 0.17696141402607288, + "grad_norm": 1.6510851383209229, + "learning_rate": 4.623535600909575e-05, + "loss": 5.1796, + "step": 29755 + }, + { + "epoch": 0.1769673613093539, + "grad_norm": 1.8089758157730103, + "learning_rate": 4.6235109505029515e-05, + "loss": 4.5897, + "step": 29756 + }, + { + "epoch": 0.17697330859263488, + "grad_norm": 1.734377384185791, + "learning_rate": 4.6234862993550324e-05, + "loss": 5.1078, + "step": 29757 + }, + { + "epoch": 0.17697925587591587, + "grad_norm": 1.7873172760009766, + "learning_rate": 4.623461647465825e-05, + "loss": 5.3811, + "step": 29758 + }, + { + "epoch": 0.17698520315919689, + "grad_norm": 2.1304049491882324, + "learning_rate": 4.623436994835341e-05, + "loss": 4.6419, + "step": 29759 + }, + { + "epoch": 0.17699115044247787, + "grad_norm": 2.734135150909424, + "learning_rate": 4.6234123414635856e-05, + "loss": 4.4103, + "step": 29760 + }, + { + "epoch": 0.17699709772575886, + "grad_norm": 1.9526289701461792, + "learning_rate": 4.6233876873505694e-05, + "loss": 4.4495, + "step": 29761 + }, + { + "epoch": 0.17700304500903988, + "grad_norm": 1.7902294397354126, + "learning_rate": 4.6233630324963004e-05, + "loss": 4.9202, + "step": 29762 + }, + { + "epoch": 0.17700899229232087, + "grad_norm": 2.161142587661743, + "learning_rate": 4.6233383769007874e-05, + "loss": 4.1941, + "step": 29763 + }, + { + "epoch": 0.17701493957560185, + "grad_norm": 2.3652687072753906, + "learning_rate": 4.6233137205640386e-05, + "loss": 4.085, + "step": 29764 + }, + { + "epoch": 0.17702088685888287, + "grad_norm": 2.204157829284668, + "learning_rate": 4.6232890634860635e-05, + "loss": 3.9856, + "step": 29765 + }, + { + "epoch": 0.17702683414216386, + "grad_norm": 2.5543384552001953, + "learning_rate": 4.6232644056668695e-05, + "loss": 4.1421, + "step": 29766 + }, + { + "epoch": 0.17703278142544485, + "grad_norm": 2.0842933654785156, + "learning_rate": 4.623239747106466e-05, + "loss": 3.8326, + "step": 29767 + }, + { + "epoch": 0.17703872870872586, + "grad_norm": 1.953341007232666, + "learning_rate": 4.623215087804862e-05, + "loss": 4.0444, + "step": 29768 + }, + { + "epoch": 0.17704467599200685, + "grad_norm": 2.1980764865875244, + "learning_rate": 4.6231904277620644e-05, + "loss": 4.2192, + "step": 29769 + }, + { + "epoch": 0.17705062327528784, + "grad_norm": 2.225207567214966, + "learning_rate": 4.6231657669780836e-05, + "loss": 4.7365, + "step": 29770 + }, + { + "epoch": 0.17705657055856885, + "grad_norm": 2.128333330154419, + "learning_rate": 4.623141105452928e-05, + "loss": 5.6755, + "step": 29771 + }, + { + "epoch": 0.17706251784184984, + "grad_norm": 1.8886544704437256, + "learning_rate": 4.623116443186605e-05, + "loss": 4.9885, + "step": 29772 + }, + { + "epoch": 0.17706846512513083, + "grad_norm": 3.213632345199585, + "learning_rate": 4.623091780179125e-05, + "loss": 3.1388, + "step": 29773 + }, + { + "epoch": 0.17707441240841185, + "grad_norm": 2.6279642581939697, + "learning_rate": 4.623067116430495e-05, + "loss": 4.1536, + "step": 29774 + }, + { + "epoch": 0.17708035969169283, + "grad_norm": 1.6456087827682495, + "learning_rate": 4.623042451940724e-05, + "loss": 5.1824, + "step": 29775 + }, + { + "epoch": 0.17708630697497382, + "grad_norm": 1.8505003452301025, + "learning_rate": 4.623017786709821e-05, + "loss": 5.1548, + "step": 29776 + }, + { + "epoch": 0.17709225425825484, + "grad_norm": 1.5285630226135254, + "learning_rate": 4.622993120737794e-05, + "loss": 5.1444, + "step": 29777 + }, + { + "epoch": 0.17709820154153583, + "grad_norm": 1.6634210348129272, + "learning_rate": 4.622968454024652e-05, + "loss": 5.3108, + "step": 29778 + }, + { + "epoch": 0.17710414882481681, + "grad_norm": 1.6948342323303223, + "learning_rate": 4.622943786570405e-05, + "loss": 5.0025, + "step": 29779 + }, + { + "epoch": 0.17711009610809783, + "grad_norm": 2.1120948791503906, + "learning_rate": 4.6229191183750594e-05, + "loss": 4.6668, + "step": 29780 + }, + { + "epoch": 0.17711604339137882, + "grad_norm": 5.567571640014648, + "learning_rate": 4.622894449438624e-05, + "loss": 4.7644, + "step": 29781 + }, + { + "epoch": 0.1771219906746598, + "grad_norm": 4.830391883850098, + "learning_rate": 4.622869779761109e-05, + "loss": 4.5086, + "step": 29782 + }, + { + "epoch": 0.17712793795794082, + "grad_norm": 3.956571578979492, + "learning_rate": 4.622845109342522e-05, + "loss": 4.311, + "step": 29783 + }, + { + "epoch": 0.1771338852412218, + "grad_norm": 3.274723529815674, + "learning_rate": 4.622820438182871e-05, + "loss": 4.3097, + "step": 29784 + }, + { + "epoch": 0.1771398325245028, + "grad_norm": 2.478320360183716, + "learning_rate": 4.6227957662821666e-05, + "loss": 4.4818, + "step": 29785 + }, + { + "epoch": 0.17714577980778382, + "grad_norm": 1.271023154258728, + "learning_rate": 4.6227710936404144e-05, + "loss": 5.4578, + "step": 29786 + }, + { + "epoch": 0.1771517270910648, + "grad_norm": 1.687338948249817, + "learning_rate": 4.622746420257626e-05, + "loss": 5.0832, + "step": 29787 + }, + { + "epoch": 0.1771576743743458, + "grad_norm": 1.6693392992019653, + "learning_rate": 4.6227217461338084e-05, + "loss": 5.23, + "step": 29788 + }, + { + "epoch": 0.1771636216576268, + "grad_norm": 1.884928822517395, + "learning_rate": 4.622697071268971e-05, + "loss": 4.4254, + "step": 29789 + }, + { + "epoch": 0.1771695689409078, + "grad_norm": 1.8463094234466553, + "learning_rate": 4.622672395663121e-05, + "loss": 4.3649, + "step": 29790 + }, + { + "epoch": 0.17717551622418878, + "grad_norm": 1.5451326370239258, + "learning_rate": 4.6226477193162685e-05, + "loss": 4.7212, + "step": 29791 + }, + { + "epoch": 0.1771814635074698, + "grad_norm": 1.6390217542648315, + "learning_rate": 4.622623042228422e-05, + "loss": 5.5775, + "step": 29792 + }, + { + "epoch": 0.1771874107907508, + "grad_norm": 1.553244709968567, + "learning_rate": 4.62259836439959e-05, + "loss": 5.5905, + "step": 29793 + }, + { + "epoch": 0.17719335807403178, + "grad_norm": 1.398796558380127, + "learning_rate": 4.62257368582978e-05, + "loss": 5.5597, + "step": 29794 + }, + { + "epoch": 0.1771993053573128, + "grad_norm": 1.6612623929977417, + "learning_rate": 4.622549006519001e-05, + "loss": 4.9175, + "step": 29795 + }, + { + "epoch": 0.17720525264059378, + "grad_norm": 1.7774828672409058, + "learning_rate": 4.622524326467263e-05, + "loss": 5.2457, + "step": 29796 + }, + { + "epoch": 0.17721119992387477, + "grad_norm": 1.447310447692871, + "learning_rate": 4.622499645674574e-05, + "loss": 4.6974, + "step": 29797 + }, + { + "epoch": 0.17721714720715578, + "grad_norm": 1.8368786573410034, + "learning_rate": 4.6224749641409417e-05, + "loss": 4.7698, + "step": 29798 + }, + { + "epoch": 0.17722309449043677, + "grad_norm": 1.7796480655670166, + "learning_rate": 4.622450281866375e-05, + "loss": 5.0171, + "step": 29799 + }, + { + "epoch": 0.17722904177371776, + "grad_norm": 1.584720492362976, + "learning_rate": 4.6224255988508836e-05, + "loss": 5.5296, + "step": 29800 + }, + { + "epoch": 0.17723498905699878, + "grad_norm": 1.7539535760879517, + "learning_rate": 4.622400915094475e-05, + "loss": 5.5441, + "step": 29801 + }, + { + "epoch": 0.17724093634027976, + "grad_norm": 1.608579397201538, + "learning_rate": 4.6223762305971576e-05, + "loss": 5.3746, + "step": 29802 + }, + { + "epoch": 0.17724688362356075, + "grad_norm": 1.7146000862121582, + "learning_rate": 4.622351545358942e-05, + "loss": 5.2776, + "step": 29803 + }, + { + "epoch": 0.17725283090684177, + "grad_norm": 1.741254448890686, + "learning_rate": 4.622326859379834e-05, + "loss": 5.14, + "step": 29804 + }, + { + "epoch": 0.17725877819012276, + "grad_norm": 1.669607162475586, + "learning_rate": 4.6223021726598434e-05, + "loss": 4.9702, + "step": 29805 + }, + { + "epoch": 0.17726472547340374, + "grad_norm": 1.817954659461975, + "learning_rate": 4.62227748519898e-05, + "loss": 5.1888, + "step": 29806 + }, + { + "epoch": 0.17727067275668473, + "grad_norm": 1.7606234550476074, + "learning_rate": 4.6222527969972516e-05, + "loss": 5.1171, + "step": 29807 + }, + { + "epoch": 0.17727662003996575, + "grad_norm": 1.6854933500289917, + "learning_rate": 4.622228108054666e-05, + "loss": 4.9143, + "step": 29808 + }, + { + "epoch": 0.17728256732324674, + "grad_norm": 1.801241159439087, + "learning_rate": 4.622203418371233e-05, + "loss": 4.5452, + "step": 29809 + }, + { + "epoch": 0.17728851460652773, + "grad_norm": 1.7132951021194458, + "learning_rate": 4.6221787279469606e-05, + "loss": 5.6643, + "step": 29810 + }, + { + "epoch": 0.17729446188980874, + "grad_norm": 1.5202804803848267, + "learning_rate": 4.6221540367818576e-05, + "loss": 5.7674, + "step": 29811 + }, + { + "epoch": 0.17730040917308973, + "grad_norm": 1.3772656917572021, + "learning_rate": 4.622129344875932e-05, + "loss": 5.4231, + "step": 29812 + }, + { + "epoch": 0.17730635645637072, + "grad_norm": 1.7075127363204956, + "learning_rate": 4.6221046522291936e-05, + "loss": 5.1009, + "step": 29813 + }, + { + "epoch": 0.17731230373965173, + "grad_norm": 1.6497002840042114, + "learning_rate": 4.622079958841651e-05, + "loss": 5.2202, + "step": 29814 + }, + { + "epoch": 0.17731825102293272, + "grad_norm": 1.796449065208435, + "learning_rate": 4.622055264713311e-05, + "loss": 4.9304, + "step": 29815 + }, + { + "epoch": 0.1773241983062137, + "grad_norm": 1.6709007024765015, + "learning_rate": 4.6220305698441836e-05, + "loss": 4.9885, + "step": 29816 + }, + { + "epoch": 0.17733014558949473, + "grad_norm": 1.4689090251922607, + "learning_rate": 4.622005874234278e-05, + "loss": 4.9051, + "step": 29817 + }, + { + "epoch": 0.1773360928727757, + "grad_norm": 1.7701568603515625, + "learning_rate": 4.621981177883601e-05, + "loss": 4.8309, + "step": 29818 + }, + { + "epoch": 0.1773420401560567, + "grad_norm": 1.6992321014404297, + "learning_rate": 4.621956480792163e-05, + "loss": 4.7161, + "step": 29819 + }, + { + "epoch": 0.17734798743933772, + "grad_norm": 1.7641901969909668, + "learning_rate": 4.6219317829599715e-05, + "loss": 4.5102, + "step": 29820 + }, + { + "epoch": 0.1773539347226187, + "grad_norm": 1.9778741598129272, + "learning_rate": 4.621907084387036e-05, + "loss": 5.0063, + "step": 29821 + }, + { + "epoch": 0.1773598820058997, + "grad_norm": 2.4267444610595703, + "learning_rate": 4.6218823850733636e-05, + "loss": 4.6155, + "step": 29822 + }, + { + "epoch": 0.1773658292891807, + "grad_norm": 1.8586831092834473, + "learning_rate": 4.6218576850189655e-05, + "loss": 5.1348, + "step": 29823 + }, + { + "epoch": 0.1773717765724617, + "grad_norm": 2.0853071212768555, + "learning_rate": 4.621832984223849e-05, + "loss": 4.9064, + "step": 29824 + }, + { + "epoch": 0.17737772385574269, + "grad_norm": 1.9400508403778076, + "learning_rate": 4.6218082826880205e-05, + "loss": 5.0123, + "step": 29825 + }, + { + "epoch": 0.1773836711390237, + "grad_norm": 1.6919422149658203, + "learning_rate": 4.621783580411492e-05, + "loss": 4.8755, + "step": 29826 + }, + { + "epoch": 0.1773896184223047, + "grad_norm": 2.295384407043457, + "learning_rate": 4.621758877394271e-05, + "loss": 3.9202, + "step": 29827 + }, + { + "epoch": 0.17739556570558568, + "grad_norm": 2.417031764984131, + "learning_rate": 4.621734173636365e-05, + "loss": 4.441, + "step": 29828 + }, + { + "epoch": 0.1774015129888667, + "grad_norm": 3.097060203552246, + "learning_rate": 4.6217094691377835e-05, + "loss": 4.6754, + "step": 29829 + }, + { + "epoch": 0.17740746027214768, + "grad_norm": 2.9717020988464355, + "learning_rate": 4.621684763898536e-05, + "loss": 4.7217, + "step": 29830 + }, + { + "epoch": 0.17741340755542867, + "grad_norm": 1.9695039987564087, + "learning_rate": 4.62166005791863e-05, + "loss": 5.0213, + "step": 29831 + }, + { + "epoch": 0.1774193548387097, + "grad_norm": 1.6653083562850952, + "learning_rate": 4.621635351198074e-05, + "loss": 4.7739, + "step": 29832 + }, + { + "epoch": 0.17742530212199067, + "grad_norm": 1.9583450555801392, + "learning_rate": 4.621610643736878e-05, + "loss": 5.0863, + "step": 29833 + }, + { + "epoch": 0.17743124940527166, + "grad_norm": 2.460378646850586, + "learning_rate": 4.621585935535049e-05, + "loss": 4.5889, + "step": 29834 + }, + { + "epoch": 0.17743719668855268, + "grad_norm": 2.478996992111206, + "learning_rate": 4.621561226592596e-05, + "loss": 3.7157, + "step": 29835 + }, + { + "epoch": 0.17744314397183367, + "grad_norm": 2.4852869510650635, + "learning_rate": 4.6215365169095283e-05, + "loss": 3.674, + "step": 29836 + }, + { + "epoch": 0.17744909125511465, + "grad_norm": 3.0013065338134766, + "learning_rate": 4.621511806485853e-05, + "loss": 3.6854, + "step": 29837 + }, + { + "epoch": 0.17745503853839567, + "grad_norm": 2.071744918823242, + "learning_rate": 4.621487095321581e-05, + "loss": 4.2681, + "step": 29838 + }, + { + "epoch": 0.17746098582167666, + "grad_norm": 1.7033419609069824, + "learning_rate": 4.62146238341672e-05, + "loss": 5.4001, + "step": 29839 + }, + { + "epoch": 0.17746693310495765, + "grad_norm": 1.9189993143081665, + "learning_rate": 4.621437670771278e-05, + "loss": 4.9708, + "step": 29840 + }, + { + "epoch": 0.17747288038823866, + "grad_norm": 1.924280047416687, + "learning_rate": 4.621412957385264e-05, + "loss": 4.5928, + "step": 29841 + }, + { + "epoch": 0.17747882767151965, + "grad_norm": 1.6338030099868774, + "learning_rate": 4.621388243258686e-05, + "loss": 4.6546, + "step": 29842 + }, + { + "epoch": 0.17748477495480064, + "grad_norm": 1.6776630878448486, + "learning_rate": 4.621363528391555e-05, + "loss": 5.0897, + "step": 29843 + }, + { + "epoch": 0.17749072223808166, + "grad_norm": 2.56796932220459, + "learning_rate": 4.621338812783877e-05, + "loss": 4.1294, + "step": 29844 + }, + { + "epoch": 0.17749666952136264, + "grad_norm": 2.6277754306793213, + "learning_rate": 4.621314096435661e-05, + "loss": 4.2364, + "step": 29845 + }, + { + "epoch": 0.17750261680464363, + "grad_norm": 2.836585760116577, + "learning_rate": 4.621289379346916e-05, + "loss": 4.4706, + "step": 29846 + }, + { + "epoch": 0.17750856408792465, + "grad_norm": 2.3705074787139893, + "learning_rate": 4.6212646615176514e-05, + "loss": 4.245, + "step": 29847 + }, + { + "epoch": 0.17751451137120564, + "grad_norm": 1.7258014678955078, + "learning_rate": 4.621239942947875e-05, + "loss": 5.338, + "step": 29848 + }, + { + "epoch": 0.17752045865448662, + "grad_norm": 1.5844351053237915, + "learning_rate": 4.621215223637596e-05, + "loss": 4.85, + "step": 29849 + }, + { + "epoch": 0.17752640593776764, + "grad_norm": 1.7583924531936646, + "learning_rate": 4.6211905035868224e-05, + "loss": 4.1059, + "step": 29850 + }, + { + "epoch": 0.17753235322104863, + "grad_norm": 1.7784796953201294, + "learning_rate": 4.621165782795564e-05, + "loss": 4.1206, + "step": 29851 + }, + { + "epoch": 0.17753830050432962, + "grad_norm": 2.0315020084381104, + "learning_rate": 4.6211410612638273e-05, + "loss": 4.8268, + "step": 29852 + }, + { + "epoch": 0.17754424778761063, + "grad_norm": 2.137162923812866, + "learning_rate": 4.621116338991622e-05, + "loss": 4.6874, + "step": 29853 + }, + { + "epoch": 0.17755019507089162, + "grad_norm": 2.5275580883026123, + "learning_rate": 4.621091615978957e-05, + "loss": 4.4036, + "step": 29854 + }, + { + "epoch": 0.1775561423541726, + "grad_norm": 2.170762062072754, + "learning_rate": 4.621066892225842e-05, + "loss": 4.8377, + "step": 29855 + }, + { + "epoch": 0.17756208963745362, + "grad_norm": 1.592443823814392, + "learning_rate": 4.6210421677322833e-05, + "loss": 4.4257, + "step": 29856 + }, + { + "epoch": 0.1775680369207346, + "grad_norm": 1.479036569595337, + "learning_rate": 4.6210174424982914e-05, + "loss": 4.0881, + "step": 29857 + }, + { + "epoch": 0.1775739842040156, + "grad_norm": 1.5338127613067627, + "learning_rate": 4.620992716523874e-05, + "loss": 4.1253, + "step": 29858 + }, + { + "epoch": 0.17757993148729662, + "grad_norm": 1.6100810766220093, + "learning_rate": 4.62096798980904e-05, + "loss": 4.224, + "step": 29859 + }, + { + "epoch": 0.1775858787705776, + "grad_norm": 1.6029894351959229, + "learning_rate": 4.6209432623537984e-05, + "loss": 4.3054, + "step": 29860 + }, + { + "epoch": 0.1775918260538586, + "grad_norm": 1.5900243520736694, + "learning_rate": 4.620918534158157e-05, + "loss": 5.0011, + "step": 29861 + }, + { + "epoch": 0.1775977733371396, + "grad_norm": 1.4453150033950806, + "learning_rate": 4.620893805222124e-05, + "loss": 5.035, + "step": 29862 + }, + { + "epoch": 0.1776037206204206, + "grad_norm": 1.2561450004577637, + "learning_rate": 4.62086907554571e-05, + "loss": 5.0042, + "step": 29863 + }, + { + "epoch": 0.17760966790370158, + "grad_norm": 1.6127535104751587, + "learning_rate": 4.620844345128923e-05, + "loss": 5.1504, + "step": 29864 + }, + { + "epoch": 0.17761561518698257, + "grad_norm": 1.505549430847168, + "learning_rate": 4.6208196139717697e-05, + "loss": 5.2917, + "step": 29865 + }, + { + "epoch": 0.1776215624702636, + "grad_norm": 1.652327537536621, + "learning_rate": 4.620794882074261e-05, + "loss": 4.7241, + "step": 29866 + }, + { + "epoch": 0.17762750975354458, + "grad_norm": 1.750353217124939, + "learning_rate": 4.620770149436405e-05, + "loss": 4.1058, + "step": 29867 + }, + { + "epoch": 0.17763345703682556, + "grad_norm": 1.6184377670288086, + "learning_rate": 4.6207454160582094e-05, + "loss": 4.0415, + "step": 29868 + }, + { + "epoch": 0.17763940432010658, + "grad_norm": 1.493651270866394, + "learning_rate": 4.6207206819396834e-05, + "loss": 4.3537, + "step": 29869 + }, + { + "epoch": 0.17764535160338757, + "grad_norm": 1.4839292764663696, + "learning_rate": 4.6206959470808364e-05, + "loss": 4.7692, + "step": 29870 + }, + { + "epoch": 0.17765129888666856, + "grad_norm": 1.726027488708496, + "learning_rate": 4.620671211481676e-05, + "loss": 4.971, + "step": 29871 + }, + { + "epoch": 0.17765724616994957, + "grad_norm": 1.6284557580947876, + "learning_rate": 4.6206464751422105e-05, + "loss": 4.4246, + "step": 29872 + }, + { + "epoch": 0.17766319345323056, + "grad_norm": 2.2713751792907715, + "learning_rate": 4.6206217380624505e-05, + "loss": 4.3045, + "step": 29873 + }, + { + "epoch": 0.17766914073651155, + "grad_norm": 1.8392630815505981, + "learning_rate": 4.620597000242403e-05, + "loss": 4.1344, + "step": 29874 + }, + { + "epoch": 0.17767508801979257, + "grad_norm": 1.5239953994750977, + "learning_rate": 4.620572261682077e-05, + "loss": 3.9802, + "step": 29875 + }, + { + "epoch": 0.17768103530307355, + "grad_norm": 1.6723328828811646, + "learning_rate": 4.6205475223814804e-05, + "loss": 4.0901, + "step": 29876 + }, + { + "epoch": 0.17768698258635454, + "grad_norm": 1.555239200592041, + "learning_rate": 4.620522782340623e-05, + "loss": 3.9096, + "step": 29877 + }, + { + "epoch": 0.17769292986963556, + "grad_norm": 1.8839585781097412, + "learning_rate": 4.620498041559513e-05, + "loss": 4.9657, + "step": 29878 + }, + { + "epoch": 0.17769887715291655, + "grad_norm": 1.9911398887634277, + "learning_rate": 4.620473300038159e-05, + "loss": 4.5497, + "step": 29879 + }, + { + "epoch": 0.17770482443619753, + "grad_norm": 2.2058022022247314, + "learning_rate": 4.62044855777657e-05, + "loss": 3.7231, + "step": 29880 + }, + { + "epoch": 0.17771077171947855, + "grad_norm": 2.0669283866882324, + "learning_rate": 4.6204238147747535e-05, + "loss": 3.8466, + "step": 29881 + }, + { + "epoch": 0.17771671900275954, + "grad_norm": 2.122668981552124, + "learning_rate": 4.62039907103272e-05, + "loss": 3.5758, + "step": 29882 + }, + { + "epoch": 0.17772266628604053, + "grad_norm": 2.091607093811035, + "learning_rate": 4.6203743265504765e-05, + "loss": 3.3965, + "step": 29883 + }, + { + "epoch": 0.17772861356932154, + "grad_norm": 2.204787492752075, + "learning_rate": 4.620349581328033e-05, + "loss": 4.3546, + "step": 29884 + }, + { + "epoch": 0.17773456085260253, + "grad_norm": 1.5886098146438599, + "learning_rate": 4.620324835365396e-05, + "loss": 5.0842, + "step": 29885 + }, + { + "epoch": 0.17774050813588352, + "grad_norm": 1.6993340253829956, + "learning_rate": 4.6203000886625766e-05, + "loss": 4.8315, + "step": 29886 + }, + { + "epoch": 0.17774645541916453, + "grad_norm": 1.6817113161087036, + "learning_rate": 4.620275341219582e-05, + "loss": 4.9972, + "step": 29887 + }, + { + "epoch": 0.17775240270244552, + "grad_norm": 1.7113308906555176, + "learning_rate": 4.620250593036421e-05, + "loss": 4.8823, + "step": 29888 + }, + { + "epoch": 0.1777583499857265, + "grad_norm": 1.7548478841781616, + "learning_rate": 4.620225844113103e-05, + "loss": 5.121, + "step": 29889 + }, + { + "epoch": 0.17776429726900753, + "grad_norm": 1.8111287355422974, + "learning_rate": 4.6202010944496356e-05, + "loss": 4.8074, + "step": 29890 + }, + { + "epoch": 0.17777024455228851, + "grad_norm": 1.279390573501587, + "learning_rate": 4.620176344046028e-05, + "loss": 4.6303, + "step": 29891 + }, + { + "epoch": 0.1777761918355695, + "grad_norm": 1.2164942026138306, + "learning_rate": 4.620151592902288e-05, + "loss": 4.6222, + "step": 29892 + }, + { + "epoch": 0.17778213911885052, + "grad_norm": 1.5320428609848022, + "learning_rate": 4.620126841018426e-05, + "loss": 4.9938, + "step": 29893 + }, + { + "epoch": 0.1777880864021315, + "grad_norm": 1.5564218759536743, + "learning_rate": 4.620102088394449e-05, + "loss": 4.961, + "step": 29894 + }, + { + "epoch": 0.1777940336854125, + "grad_norm": 1.5532233715057373, + "learning_rate": 4.6200773350303675e-05, + "loss": 4.8086, + "step": 29895 + }, + { + "epoch": 0.1777999809686935, + "grad_norm": 1.9697725772857666, + "learning_rate": 4.620052580926187e-05, + "loss": 4.6753, + "step": 29896 + }, + { + "epoch": 0.1778059282519745, + "grad_norm": 2.0587549209594727, + "learning_rate": 4.62002782608192e-05, + "loss": 5.3824, + "step": 29897 + }, + { + "epoch": 0.1778118755352555, + "grad_norm": 1.5464704036712646, + "learning_rate": 4.620003070497572e-05, + "loss": 5.2827, + "step": 29898 + }, + { + "epoch": 0.1778178228185365, + "grad_norm": 2.052751064300537, + "learning_rate": 4.619978314173152e-05, + "loss": 4.8924, + "step": 29899 + }, + { + "epoch": 0.1778237701018175, + "grad_norm": 1.857614517211914, + "learning_rate": 4.619953557108671e-05, + "loss": 4.9826, + "step": 29900 + }, + { + "epoch": 0.17782971738509848, + "grad_norm": 1.5344221591949463, + "learning_rate": 4.619928799304136e-05, + "loss": 5.0715, + "step": 29901 + }, + { + "epoch": 0.1778356646683795, + "grad_norm": 1.6682283878326416, + "learning_rate": 4.619904040759555e-05, + "loss": 5.5025, + "step": 29902 + }, + { + "epoch": 0.17784161195166048, + "grad_norm": 1.8382456302642822, + "learning_rate": 4.619879281474938e-05, + "loss": 5.0428, + "step": 29903 + }, + { + "epoch": 0.17784755923494147, + "grad_norm": 1.5137388706207275, + "learning_rate": 4.619854521450293e-05, + "loss": 5.1731, + "step": 29904 + }, + { + "epoch": 0.1778535065182225, + "grad_norm": 1.5241427421569824, + "learning_rate": 4.619829760685628e-05, + "loss": 5.11, + "step": 29905 + }, + { + "epoch": 0.17785945380150348, + "grad_norm": 1.6426124572753906, + "learning_rate": 4.6198049991809534e-05, + "loss": 5.0386, + "step": 29906 + }, + { + "epoch": 0.17786540108478446, + "grad_norm": 1.240784764289856, + "learning_rate": 4.6197802369362756e-05, + "loss": 4.9999, + "step": 29907 + }, + { + "epoch": 0.17787134836806548, + "grad_norm": 1.7629567384719849, + "learning_rate": 4.6197554739516054e-05, + "loss": 5.1035, + "step": 29908 + }, + { + "epoch": 0.17787729565134647, + "grad_norm": 1.7833048105239868, + "learning_rate": 4.61973071022695e-05, + "loss": 5.2879, + "step": 29909 + }, + { + "epoch": 0.17788324293462746, + "grad_norm": 1.6848218441009521, + "learning_rate": 4.619705945762318e-05, + "loss": 5.1269, + "step": 29910 + }, + { + "epoch": 0.17788919021790847, + "grad_norm": 1.917606234550476, + "learning_rate": 4.61968118055772e-05, + "loss": 4.422, + "step": 29911 + }, + { + "epoch": 0.17789513750118946, + "grad_norm": 2.092909336090088, + "learning_rate": 4.619656414613162e-05, + "loss": 4.4046, + "step": 29912 + }, + { + "epoch": 0.17790108478447045, + "grad_norm": 1.580072283744812, + "learning_rate": 4.6196316479286547e-05, + "loss": 5.117, + "step": 29913 + }, + { + "epoch": 0.17790703206775146, + "grad_norm": 1.5650675296783447, + "learning_rate": 4.619606880504205e-05, + "loss": 5.0848, + "step": 29914 + }, + { + "epoch": 0.17791297935103245, + "grad_norm": 1.5918974876403809, + "learning_rate": 4.619582112339823e-05, + "loss": 5.108, + "step": 29915 + }, + { + "epoch": 0.17791892663431344, + "grad_norm": 1.6393519639968872, + "learning_rate": 4.619557343435516e-05, + "loss": 5.1883, + "step": 29916 + }, + { + "epoch": 0.17792487391759446, + "grad_norm": 1.6605910062789917, + "learning_rate": 4.619532573791294e-05, + "loss": 5.3422, + "step": 29917 + }, + { + "epoch": 0.17793082120087544, + "grad_norm": 1.618237853050232, + "learning_rate": 4.619507803407166e-05, + "loss": 5.3366, + "step": 29918 + }, + { + "epoch": 0.17793676848415643, + "grad_norm": 1.7383369207382202, + "learning_rate": 4.6194830322831384e-05, + "loss": 5.2423, + "step": 29919 + }, + { + "epoch": 0.17794271576743745, + "grad_norm": 1.7745330333709717, + "learning_rate": 4.619458260419222e-05, + "loss": 5.5013, + "step": 29920 + }, + { + "epoch": 0.17794866305071844, + "grad_norm": 1.64639151096344, + "learning_rate": 4.6194334878154244e-05, + "loss": 5.6739, + "step": 29921 + }, + { + "epoch": 0.17795461033399942, + "grad_norm": 1.6652768850326538, + "learning_rate": 4.619408714471754e-05, + "loss": 5.5507, + "step": 29922 + }, + { + "epoch": 0.1779605576172804, + "grad_norm": 1.8969260454177856, + "learning_rate": 4.61938394038822e-05, + "loss": 4.7228, + "step": 29923 + }, + { + "epoch": 0.17796650490056143, + "grad_norm": 2.7471752166748047, + "learning_rate": 4.619359165564832e-05, + "loss": 3.7551, + "step": 29924 + }, + { + "epoch": 0.17797245218384242, + "grad_norm": 1.68784499168396, + "learning_rate": 4.6193343900015964e-05, + "loss": 4.6853, + "step": 29925 + }, + { + "epoch": 0.1779783994671234, + "grad_norm": 1.6362453699111938, + "learning_rate": 4.619309613698523e-05, + "loss": 4.665, + "step": 29926 + }, + { + "epoch": 0.17798434675040442, + "grad_norm": 1.737727165222168, + "learning_rate": 4.619284836655621e-05, + "loss": 4.9511, + "step": 29927 + }, + { + "epoch": 0.1779902940336854, + "grad_norm": 1.4916706085205078, + "learning_rate": 4.6192600588728985e-05, + "loss": 4.9043, + "step": 29928 + }, + { + "epoch": 0.1779962413169664, + "grad_norm": 1.6925257444381714, + "learning_rate": 4.619235280350365e-05, + "loss": 4.764, + "step": 29929 + }, + { + "epoch": 0.1780021886002474, + "grad_norm": 1.525317668914795, + "learning_rate": 4.619210501088027e-05, + "loss": 4.5491, + "step": 29930 + }, + { + "epoch": 0.1780081358835284, + "grad_norm": 1.771481990814209, + "learning_rate": 4.619185721085895e-05, + "loss": 4.7972, + "step": 29931 + }, + { + "epoch": 0.1780140831668094, + "grad_norm": 2.018819808959961, + "learning_rate": 4.619160940343977e-05, + "loss": 3.8428, + "step": 29932 + }, + { + "epoch": 0.1780200304500904, + "grad_norm": 1.7792484760284424, + "learning_rate": 4.6191361588622825e-05, + "loss": 4.9156, + "step": 29933 + }, + { + "epoch": 0.1780259777333714, + "grad_norm": 1.8811469078063965, + "learning_rate": 4.619111376640819e-05, + "loss": 4.0915, + "step": 29934 + }, + { + "epoch": 0.17803192501665238, + "grad_norm": 1.7818450927734375, + "learning_rate": 4.619086593679596e-05, + "loss": 5.1882, + "step": 29935 + }, + { + "epoch": 0.1780378722999334, + "grad_norm": 1.587109088897705, + "learning_rate": 4.619061809978621e-05, + "loss": 4.8753, + "step": 29936 + }, + { + "epoch": 0.17804381958321439, + "grad_norm": 1.6229913234710693, + "learning_rate": 4.619037025537904e-05, + "loss": 4.5926, + "step": 29937 + }, + { + "epoch": 0.17804976686649537, + "grad_norm": 2.0784964561462402, + "learning_rate": 4.619012240357452e-05, + "loss": 3.6958, + "step": 29938 + }, + { + "epoch": 0.1780557141497764, + "grad_norm": 1.829585313796997, + "learning_rate": 4.6189874544372766e-05, + "loss": 3.5768, + "step": 29939 + }, + { + "epoch": 0.17806166143305738, + "grad_norm": 2.243161201477051, + "learning_rate": 4.6189626677773837e-05, + "loss": 3.6418, + "step": 29940 + }, + { + "epoch": 0.17806760871633837, + "grad_norm": 1.8179738521575928, + "learning_rate": 4.618937880377782e-05, + "loss": 3.6718, + "step": 29941 + }, + { + "epoch": 0.17807355599961938, + "grad_norm": 1.7654396295547485, + "learning_rate": 4.618913092238482e-05, + "loss": 4.4997, + "step": 29942 + }, + { + "epoch": 0.17807950328290037, + "grad_norm": 1.615114688873291, + "learning_rate": 4.6188883033594907e-05, + "loss": 4.7439, + "step": 29943 + }, + { + "epoch": 0.17808545056618136, + "grad_norm": 1.2790718078613281, + "learning_rate": 4.6188635137408174e-05, + "loss": 4.6724, + "step": 29944 + }, + { + "epoch": 0.17809139784946237, + "grad_norm": 1.6814706325531006, + "learning_rate": 4.6188387233824717e-05, + "loss": 4.9715, + "step": 29945 + }, + { + "epoch": 0.17809734513274336, + "grad_norm": 2.3926637172698975, + "learning_rate": 4.61881393228446e-05, + "loss": 3.682, + "step": 29946 + }, + { + "epoch": 0.17810329241602435, + "grad_norm": 1.4340671300888062, + "learning_rate": 4.618789140446793e-05, + "loss": 4.586, + "step": 29947 + }, + { + "epoch": 0.17810923969930537, + "grad_norm": 1.6323633193969727, + "learning_rate": 4.6187643478694784e-05, + "loss": 4.7435, + "step": 29948 + }, + { + "epoch": 0.17811518698258635, + "grad_norm": 1.6034373044967651, + "learning_rate": 4.618739554552526e-05, + "loss": 4.9142, + "step": 29949 + }, + { + "epoch": 0.17812113426586734, + "grad_norm": 1.599575161933899, + "learning_rate": 4.618714760495943e-05, + "loss": 4.7991, + "step": 29950 + }, + { + "epoch": 0.17812708154914836, + "grad_norm": 1.7768034934997559, + "learning_rate": 4.618689965699737e-05, + "loss": 4.9267, + "step": 29951 + }, + { + "epoch": 0.17813302883242935, + "grad_norm": 1.8471229076385498, + "learning_rate": 4.6186651701639195e-05, + "loss": 4.4194, + "step": 29952 + }, + { + "epoch": 0.17813897611571033, + "grad_norm": 2.222182512283325, + "learning_rate": 4.6186403738884984e-05, + "loss": 4.1248, + "step": 29953 + }, + { + "epoch": 0.17814492339899135, + "grad_norm": 2.373452663421631, + "learning_rate": 4.6186155768734806e-05, + "loss": 4.3799, + "step": 29954 + }, + { + "epoch": 0.17815087068227234, + "grad_norm": 2.6431610584259033, + "learning_rate": 4.618590779118877e-05, + "loss": 4.4425, + "step": 29955 + }, + { + "epoch": 0.17815681796555333, + "grad_norm": 2.160435676574707, + "learning_rate": 4.618565980624695e-05, + "loss": 4.3708, + "step": 29956 + }, + { + "epoch": 0.17816276524883434, + "grad_norm": 2.0715856552124023, + "learning_rate": 4.618541181390943e-05, + "loss": 4.7181, + "step": 29957 + }, + { + "epoch": 0.17816871253211533, + "grad_norm": 2.107534408569336, + "learning_rate": 4.618516381417631e-05, + "loss": 3.9446, + "step": 29958 + }, + { + "epoch": 0.17817465981539632, + "grad_norm": 2.215634822845459, + "learning_rate": 4.618491580704766e-05, + "loss": 4.3066, + "step": 29959 + }, + { + "epoch": 0.17818060709867733, + "grad_norm": 1.760855793952942, + "learning_rate": 4.618466779252359e-05, + "loss": 4.757, + "step": 29960 + }, + { + "epoch": 0.17818655438195832, + "grad_norm": 1.6130295991897583, + "learning_rate": 4.618441977060415e-05, + "loss": 5.0813, + "step": 29961 + }, + { + "epoch": 0.1781925016652393, + "grad_norm": 1.4686352014541626, + "learning_rate": 4.6184171741289454e-05, + "loss": 4.5848, + "step": 29962 + }, + { + "epoch": 0.17819844894852033, + "grad_norm": 1.5685728788375854, + "learning_rate": 4.618392370457959e-05, + "loss": 4.5756, + "step": 29963 + }, + { + "epoch": 0.17820439623180132, + "grad_norm": 1.7625272274017334, + "learning_rate": 4.618367566047463e-05, + "loss": 4.4729, + "step": 29964 + }, + { + "epoch": 0.1782103435150823, + "grad_norm": 2.350189685821533, + "learning_rate": 4.618342760897467e-05, + "loss": 4.2178, + "step": 29965 + }, + { + "epoch": 0.17821629079836332, + "grad_norm": 2.462435007095337, + "learning_rate": 4.6183179550079796e-05, + "loss": 4.5618, + "step": 29966 + }, + { + "epoch": 0.1782222380816443, + "grad_norm": 2.354248523712158, + "learning_rate": 4.618293148379009e-05, + "loss": 4.4869, + "step": 29967 + }, + { + "epoch": 0.1782281853649253, + "grad_norm": 2.1047489643096924, + "learning_rate": 4.6182683410105646e-05, + "loss": 4.3849, + "step": 29968 + }, + { + "epoch": 0.1782341326482063, + "grad_norm": 1.859437108039856, + "learning_rate": 4.618243532902655e-05, + "loss": 4.3603, + "step": 29969 + }, + { + "epoch": 0.1782400799314873, + "grad_norm": 2.014723539352417, + "learning_rate": 4.6182187240552875e-05, + "loss": 5.363, + "step": 29970 + }, + { + "epoch": 0.1782460272147683, + "grad_norm": 1.637157917022705, + "learning_rate": 4.618193914468472e-05, + "loss": 5.0457, + "step": 29971 + }, + { + "epoch": 0.1782519744980493, + "grad_norm": 2.200927734375, + "learning_rate": 4.618169104142217e-05, + "loss": 4.9131, + "step": 29972 + }, + { + "epoch": 0.1782579217813303, + "grad_norm": 2.0116817951202393, + "learning_rate": 4.6181442930765305e-05, + "loss": 4.8401, + "step": 29973 + }, + { + "epoch": 0.17826386906461128, + "grad_norm": 1.9755736589431763, + "learning_rate": 4.618119481271422e-05, + "loss": 4.8402, + "step": 29974 + }, + { + "epoch": 0.1782698163478923, + "grad_norm": 1.954923152923584, + "learning_rate": 4.618094668726901e-05, + "loss": 4.7746, + "step": 29975 + }, + { + "epoch": 0.17827576363117328, + "grad_norm": 2.0195765495300293, + "learning_rate": 4.6180698554429737e-05, + "loss": 4.4359, + "step": 29976 + }, + { + "epoch": 0.17828171091445427, + "grad_norm": 1.9346232414245605, + "learning_rate": 4.618045041419651e-05, + "loss": 5.132, + "step": 29977 + }, + { + "epoch": 0.1782876581977353, + "grad_norm": 1.880932331085205, + "learning_rate": 4.6180202266569394e-05, + "loss": 5.26, + "step": 29978 + }, + { + "epoch": 0.17829360548101628, + "grad_norm": 1.8841670751571655, + "learning_rate": 4.6179954111548495e-05, + "loss": 4.7878, + "step": 29979 + }, + { + "epoch": 0.17829955276429726, + "grad_norm": 1.9039348363876343, + "learning_rate": 4.61797059491339e-05, + "loss": 4.8547, + "step": 29980 + }, + { + "epoch": 0.17830550004757825, + "grad_norm": 2.0296382904052734, + "learning_rate": 4.617945777932568e-05, + "loss": 5.0599, + "step": 29981 + }, + { + "epoch": 0.17831144733085927, + "grad_norm": 1.8153882026672363, + "learning_rate": 4.617920960212393e-05, + "loss": 5.0123, + "step": 29982 + }, + { + "epoch": 0.17831739461414026, + "grad_norm": 1.5454435348510742, + "learning_rate": 4.617896141752874e-05, + "loss": 4.4975, + "step": 29983 + }, + { + "epoch": 0.17832334189742124, + "grad_norm": 1.5883069038391113, + "learning_rate": 4.6178713225540196e-05, + "loss": 4.8825, + "step": 29984 + }, + { + "epoch": 0.17832928918070226, + "grad_norm": 1.58603036403656, + "learning_rate": 4.617846502615837e-05, + "loss": 5.3068, + "step": 29985 + }, + { + "epoch": 0.17833523646398325, + "grad_norm": 1.6731973886489868, + "learning_rate": 4.6178216819383374e-05, + "loss": 5.5331, + "step": 29986 + }, + { + "epoch": 0.17834118374726424, + "grad_norm": 1.6074113845825195, + "learning_rate": 4.6177968605215276e-05, + "loss": 5.5162, + "step": 29987 + }, + { + "epoch": 0.17834713103054525, + "grad_norm": 1.4040982723236084, + "learning_rate": 4.6177720383654166e-05, + "loss": 5.3135, + "step": 29988 + }, + { + "epoch": 0.17835307831382624, + "grad_norm": 1.6419864892959595, + "learning_rate": 4.617747215470014e-05, + "loss": 4.9229, + "step": 29989 + }, + { + "epoch": 0.17835902559710723, + "grad_norm": 1.7256529331207275, + "learning_rate": 4.617722391835327e-05, + "loss": 5.0782, + "step": 29990 + }, + { + "epoch": 0.17836497288038825, + "grad_norm": 1.7224550247192383, + "learning_rate": 4.617697567461365e-05, + "loss": 4.8078, + "step": 29991 + }, + { + "epoch": 0.17837092016366923, + "grad_norm": 1.63644278049469, + "learning_rate": 4.617672742348137e-05, + "loss": 5.2103, + "step": 29992 + }, + { + "epoch": 0.17837686744695022, + "grad_norm": 1.9455114603042603, + "learning_rate": 4.617647916495651e-05, + "loss": 5.3372, + "step": 29993 + }, + { + "epoch": 0.17838281473023124, + "grad_norm": 1.6073265075683594, + "learning_rate": 4.6176230899039166e-05, + "loss": 4.4093, + "step": 29994 + }, + { + "epoch": 0.17838876201351223, + "grad_norm": 2.0087218284606934, + "learning_rate": 4.6175982625729405e-05, + "loss": 5.0169, + "step": 29995 + }, + { + "epoch": 0.1783947092967932, + "grad_norm": 2.3341264724731445, + "learning_rate": 4.617573434502734e-05, + "loss": 4.271, + "step": 29996 + }, + { + "epoch": 0.17840065658007423, + "grad_norm": 1.6453101634979248, + "learning_rate": 4.617548605693305e-05, + "loss": 5.0354, + "step": 29997 + }, + { + "epoch": 0.17840660386335522, + "grad_norm": 1.6747314929962158, + "learning_rate": 4.61752377614466e-05, + "loss": 5.0349, + "step": 29998 + }, + { + "epoch": 0.1784125511466362, + "grad_norm": 1.7050796747207642, + "learning_rate": 4.617498945856811e-05, + "loss": 4.753, + "step": 29999 + }, + { + "epoch": 0.17841849842991722, + "grad_norm": 1.7062735557556152, + "learning_rate": 4.617474114829764e-05, + "loss": 5.1345, + "step": 30000 + }, + { + "epoch": 0.1784244457131982, + "grad_norm": 1.802368402481079, + "learning_rate": 4.6174492830635285e-05, + "loss": 4.4919, + "step": 30001 + }, + { + "epoch": 0.1784303929964792, + "grad_norm": 1.7409639358520508, + "learning_rate": 4.6174244505581135e-05, + "loss": 4.9005, + "step": 30002 + }, + { + "epoch": 0.1784363402797602, + "grad_norm": 1.6387557983398438, + "learning_rate": 4.617399617313528e-05, + "loss": 5.3176, + "step": 30003 + }, + { + "epoch": 0.1784422875630412, + "grad_norm": 2.3082478046417236, + "learning_rate": 4.617374783329779e-05, + "loss": 4.0222, + "step": 30004 + }, + { + "epoch": 0.1784482348463222, + "grad_norm": 2.5321269035339355, + "learning_rate": 4.617349948606878e-05, + "loss": 4.4114, + "step": 30005 + }, + { + "epoch": 0.1784541821296032, + "grad_norm": 1.9945601224899292, + "learning_rate": 4.6173251131448305e-05, + "loss": 5.0146, + "step": 30006 + }, + { + "epoch": 0.1784601294128842, + "grad_norm": 1.408103108406067, + "learning_rate": 4.6173002769436474e-05, + "loss": 5.2415, + "step": 30007 + }, + { + "epoch": 0.17846607669616518, + "grad_norm": 2.4887290000915527, + "learning_rate": 4.6172754400033366e-05, + "loss": 4.3658, + "step": 30008 + }, + { + "epoch": 0.1784720239794462, + "grad_norm": 1.4949021339416504, + "learning_rate": 4.617250602323907e-05, + "loss": 5.0803, + "step": 30009 + }, + { + "epoch": 0.1784779712627272, + "grad_norm": 1.5946985483169556, + "learning_rate": 4.617225763905367e-05, + "loss": 4.5409, + "step": 30010 + }, + { + "epoch": 0.17848391854600817, + "grad_norm": 1.618841528892517, + "learning_rate": 4.6172009247477246e-05, + "loss": 4.9119, + "step": 30011 + }, + { + "epoch": 0.1784898658292892, + "grad_norm": 1.4818013906478882, + "learning_rate": 4.61717608485099e-05, + "loss": 5.0194, + "step": 30012 + }, + { + "epoch": 0.17849581311257018, + "grad_norm": 2.2418477535247803, + "learning_rate": 4.617151244215171e-05, + "loss": 3.8263, + "step": 30013 + }, + { + "epoch": 0.17850176039585117, + "grad_norm": 2.023987054824829, + "learning_rate": 4.617126402840277e-05, + "loss": 5.4911, + "step": 30014 + }, + { + "epoch": 0.17850770767913218, + "grad_norm": 1.9841099977493286, + "learning_rate": 4.6171015607263144e-05, + "loss": 5.3778, + "step": 30015 + }, + { + "epoch": 0.17851365496241317, + "grad_norm": 1.896392822265625, + "learning_rate": 4.617076717873295e-05, + "loss": 4.7488, + "step": 30016 + }, + { + "epoch": 0.17851960224569416, + "grad_norm": 1.816318154335022, + "learning_rate": 4.6170518742812255e-05, + "loss": 4.1921, + "step": 30017 + }, + { + "epoch": 0.17852554952897517, + "grad_norm": 1.8096336126327515, + "learning_rate": 4.617027029950115e-05, + "loss": 4.1089, + "step": 30018 + }, + { + "epoch": 0.17853149681225616, + "grad_norm": 2.236724853515625, + "learning_rate": 4.617002184879973e-05, + "loss": 3.7496, + "step": 30019 + }, + { + "epoch": 0.17853744409553715, + "grad_norm": 2.349423885345459, + "learning_rate": 4.616977339070806e-05, + "loss": 3.8509, + "step": 30020 + }, + { + "epoch": 0.17854339137881817, + "grad_norm": 2.2639737129211426, + "learning_rate": 4.616952492522625e-05, + "loss": 3.5414, + "step": 30021 + }, + { + "epoch": 0.17854933866209916, + "grad_norm": 2.1458024978637695, + "learning_rate": 4.6169276452354374e-05, + "loss": 3.9498, + "step": 30022 + }, + { + "epoch": 0.17855528594538014, + "grad_norm": 1.7704306840896606, + "learning_rate": 4.616902797209253e-05, + "loss": 4.4054, + "step": 30023 + }, + { + "epoch": 0.17856123322866116, + "grad_norm": 2.261296033859253, + "learning_rate": 4.616877948444078e-05, + "loss": 4.5599, + "step": 30024 + }, + { + "epoch": 0.17856718051194215, + "grad_norm": 2.4894723892211914, + "learning_rate": 4.616853098939924e-05, + "loss": 3.9811, + "step": 30025 + }, + { + "epoch": 0.17857312779522314, + "grad_norm": 2.1360697746276855, + "learning_rate": 4.616828248696798e-05, + "loss": 4.2385, + "step": 30026 + }, + { + "epoch": 0.17857907507850415, + "grad_norm": 2.185976505279541, + "learning_rate": 4.61680339771471e-05, + "loss": 4.8738, + "step": 30027 + }, + { + "epoch": 0.17858502236178514, + "grad_norm": 2.6245265007019043, + "learning_rate": 4.6167785459936676e-05, + "loss": 4.2775, + "step": 30028 + }, + { + "epoch": 0.17859096964506613, + "grad_norm": 1.9567252397537231, + "learning_rate": 4.616753693533679e-05, + "loss": 4.7655, + "step": 30029 + }, + { + "epoch": 0.17859691692834714, + "grad_norm": 1.832485556602478, + "learning_rate": 4.616728840334754e-05, + "loss": 5.0603, + "step": 30030 + }, + { + "epoch": 0.17860286421162813, + "grad_norm": 1.8482451438903809, + "learning_rate": 4.6167039863969005e-05, + "loss": 4.6997, + "step": 30031 + }, + { + "epoch": 0.17860881149490912, + "grad_norm": 1.7290279865264893, + "learning_rate": 4.616679131720128e-05, + "loss": 5.1665, + "step": 30032 + }, + { + "epoch": 0.17861475877819014, + "grad_norm": 2.0203309059143066, + "learning_rate": 4.616654276304444e-05, + "loss": 4.5307, + "step": 30033 + }, + { + "epoch": 0.17862070606147112, + "grad_norm": 2.3280582427978516, + "learning_rate": 4.616629420149858e-05, + "loss": 4.1431, + "step": 30034 + }, + { + "epoch": 0.1786266533447521, + "grad_norm": 1.745954155921936, + "learning_rate": 4.616604563256379e-05, + "loss": 5.0325, + "step": 30035 + }, + { + "epoch": 0.17863260062803313, + "grad_norm": 1.4526299238204956, + "learning_rate": 4.616579705624016e-05, + "loss": 5.62, + "step": 30036 + }, + { + "epoch": 0.17863854791131412, + "grad_norm": 1.6712018251419067, + "learning_rate": 4.616554847252775e-05, + "loss": 5.0503, + "step": 30037 + }, + { + "epoch": 0.1786444951945951, + "grad_norm": 1.5935488939285278, + "learning_rate": 4.616529988142668e-05, + "loss": 5.1876, + "step": 30038 + }, + { + "epoch": 0.1786504424778761, + "grad_norm": 1.4841454029083252, + "learning_rate": 4.616505128293701e-05, + "loss": 5.5045, + "step": 30039 + }, + { + "epoch": 0.1786563897611571, + "grad_norm": 1.7214070558547974, + "learning_rate": 4.616480267705885e-05, + "loss": 5.1553, + "step": 30040 + }, + { + "epoch": 0.1786623370444381, + "grad_norm": 1.625107765197754, + "learning_rate": 4.6164554063792277e-05, + "loss": 5.0406, + "step": 30041 + }, + { + "epoch": 0.17866828432771908, + "grad_norm": 1.5284959077835083, + "learning_rate": 4.616430544313737e-05, + "loss": 5.7409, + "step": 30042 + }, + { + "epoch": 0.1786742316110001, + "grad_norm": 1.4745396375656128, + "learning_rate": 4.616405681509423e-05, + "loss": 5.9447, + "step": 30043 + }, + { + "epoch": 0.1786801788942811, + "grad_norm": 1.6352115869522095, + "learning_rate": 4.616380817966293e-05, + "loss": 5.405, + "step": 30044 + }, + { + "epoch": 0.17868612617756208, + "grad_norm": 1.5231393575668335, + "learning_rate": 4.616355953684356e-05, + "loss": 5.8282, + "step": 30045 + }, + { + "epoch": 0.1786920734608431, + "grad_norm": 1.5378559827804565, + "learning_rate": 4.6163310886636216e-05, + "loss": 5.4119, + "step": 30046 + }, + { + "epoch": 0.17869802074412408, + "grad_norm": 1.7744802236557007, + "learning_rate": 4.6163062229040976e-05, + "loss": 4.6722, + "step": 30047 + }, + { + "epoch": 0.17870396802740507, + "grad_norm": 1.6727073192596436, + "learning_rate": 4.616281356405793e-05, + "loss": 4.9073, + "step": 30048 + }, + { + "epoch": 0.17870991531068608, + "grad_norm": 1.7499542236328125, + "learning_rate": 4.616256489168717e-05, + "loss": 5.1734, + "step": 30049 + }, + { + "epoch": 0.17871586259396707, + "grad_norm": 1.6163703203201294, + "learning_rate": 4.616231621192877e-05, + "loss": 5.2352, + "step": 30050 + }, + { + "epoch": 0.17872180987724806, + "grad_norm": 1.8054791688919067, + "learning_rate": 4.6162067524782826e-05, + "loss": 4.7604, + "step": 30051 + }, + { + "epoch": 0.17872775716052908, + "grad_norm": 1.5915356874465942, + "learning_rate": 4.616181883024942e-05, + "loss": 4.9966, + "step": 30052 + }, + { + "epoch": 0.17873370444381007, + "grad_norm": 1.6951193809509277, + "learning_rate": 4.616157012832865e-05, + "loss": 5.2401, + "step": 30053 + }, + { + "epoch": 0.17873965172709105, + "grad_norm": 1.6987075805664062, + "learning_rate": 4.6161321419020584e-05, + "loss": 5.212, + "step": 30054 + }, + { + "epoch": 0.17874559901037207, + "grad_norm": 1.8731896877288818, + "learning_rate": 4.616107270232533e-05, + "loss": 5.1996, + "step": 30055 + }, + { + "epoch": 0.17875154629365306, + "grad_norm": 2.243042469024658, + "learning_rate": 4.6160823978242955e-05, + "loss": 5.0162, + "step": 30056 + }, + { + "epoch": 0.17875749357693405, + "grad_norm": 1.7902021408081055, + "learning_rate": 4.616057524677356e-05, + "loss": 5.046, + "step": 30057 + }, + { + "epoch": 0.17876344086021506, + "grad_norm": 1.591950535774231, + "learning_rate": 4.6160326507917225e-05, + "loss": 5.1167, + "step": 30058 + }, + { + "epoch": 0.17876938814349605, + "grad_norm": 1.8238025903701782, + "learning_rate": 4.616007776167404e-05, + "loss": 4.6017, + "step": 30059 + }, + { + "epoch": 0.17877533542677704, + "grad_norm": 1.719621181488037, + "learning_rate": 4.6159829008044086e-05, + "loss": 4.7455, + "step": 30060 + }, + { + "epoch": 0.17878128271005805, + "grad_norm": 1.6752606630325317, + "learning_rate": 4.6159580247027465e-05, + "loss": 4.965, + "step": 30061 + }, + { + "epoch": 0.17878722999333904, + "grad_norm": 1.6750445365905762, + "learning_rate": 4.615933147862424e-05, + "loss": 5.0824, + "step": 30062 + }, + { + "epoch": 0.17879317727662003, + "grad_norm": 1.6843575239181519, + "learning_rate": 4.615908270283452e-05, + "loss": 4.9444, + "step": 30063 + }, + { + "epoch": 0.17879912455990105, + "grad_norm": 1.3926664590835571, + "learning_rate": 4.6158833919658385e-05, + "loss": 5.8095, + "step": 30064 + }, + { + "epoch": 0.17880507184318203, + "grad_norm": 1.8290106058120728, + "learning_rate": 4.615858512909591e-05, + "loss": 5.1141, + "step": 30065 + }, + { + "epoch": 0.17881101912646302, + "grad_norm": 1.7168490886688232, + "learning_rate": 4.61583363311472e-05, + "loss": 4.4733, + "step": 30066 + }, + { + "epoch": 0.17881696640974404, + "grad_norm": 1.5453979969024658, + "learning_rate": 4.615808752581233e-05, + "loss": 4.3735, + "step": 30067 + }, + { + "epoch": 0.17882291369302503, + "grad_norm": 2.376648426055908, + "learning_rate": 4.615783871309139e-05, + "loss": 4.3287, + "step": 30068 + }, + { + "epoch": 0.17882886097630601, + "grad_norm": 1.7454547882080078, + "learning_rate": 4.615758989298447e-05, + "loss": 4.9139, + "step": 30069 + }, + { + "epoch": 0.17883480825958703, + "grad_norm": 2.697049617767334, + "learning_rate": 4.6157341065491644e-05, + "loss": 4.5206, + "step": 30070 + }, + { + "epoch": 0.17884075554286802, + "grad_norm": 2.183265447616577, + "learning_rate": 4.615709223061302e-05, + "loss": 4.5157, + "step": 30071 + }, + { + "epoch": 0.178846702826149, + "grad_norm": 2.572007179260254, + "learning_rate": 4.615684338834867e-05, + "loss": 4.4589, + "step": 30072 + }, + { + "epoch": 0.17885265010943002, + "grad_norm": 2.5697407722473145, + "learning_rate": 4.6156594538698685e-05, + "loss": 4.4577, + "step": 30073 + }, + { + "epoch": 0.178858597392711, + "grad_norm": 1.9497699737548828, + "learning_rate": 4.615634568166315e-05, + "loss": 4.691, + "step": 30074 + }, + { + "epoch": 0.178864544675992, + "grad_norm": 1.5829882621765137, + "learning_rate": 4.6156096817242154e-05, + "loss": 5.3589, + "step": 30075 + }, + { + "epoch": 0.17887049195927301, + "grad_norm": 1.837938904762268, + "learning_rate": 4.6155847945435785e-05, + "loss": 5.1354, + "step": 30076 + }, + { + "epoch": 0.178876439242554, + "grad_norm": 1.7852935791015625, + "learning_rate": 4.615559906624412e-05, + "loss": 5.3022, + "step": 30077 + }, + { + "epoch": 0.178882386525835, + "grad_norm": 1.8897148370742798, + "learning_rate": 4.615535017966726e-05, + "loss": 5.3356, + "step": 30078 + }, + { + "epoch": 0.178888333809116, + "grad_norm": 1.8716245889663696, + "learning_rate": 4.615510128570529e-05, + "loss": 4.8567, + "step": 30079 + }, + { + "epoch": 0.178894281092397, + "grad_norm": 1.5220413208007812, + "learning_rate": 4.6154852384358286e-05, + "loss": 4.9667, + "step": 30080 + }, + { + "epoch": 0.17890022837567798, + "grad_norm": 1.7942893505096436, + "learning_rate": 4.615460347562635e-05, + "loss": 4.7159, + "step": 30081 + }, + { + "epoch": 0.178906175658959, + "grad_norm": 1.6901856660842896, + "learning_rate": 4.615435455950955e-05, + "loss": 4.7713, + "step": 30082 + }, + { + "epoch": 0.17891212294224, + "grad_norm": 2.3212149143218994, + "learning_rate": 4.615410563600799e-05, + "loss": 4.2903, + "step": 30083 + }, + { + "epoch": 0.17891807022552098, + "grad_norm": 2.5704152584075928, + "learning_rate": 4.6153856705121744e-05, + "loss": 4.0653, + "step": 30084 + }, + { + "epoch": 0.178924017508802, + "grad_norm": 2.5747878551483154, + "learning_rate": 4.6153607766850915e-05, + "loss": 4.176, + "step": 30085 + }, + { + "epoch": 0.17892996479208298, + "grad_norm": 2.633906841278076, + "learning_rate": 4.615335882119557e-05, + "loss": 4.2643, + "step": 30086 + }, + { + "epoch": 0.17893591207536397, + "grad_norm": 2.129531145095825, + "learning_rate": 4.615310986815581e-05, + "loss": 4.2056, + "step": 30087 + }, + { + "epoch": 0.17894185935864498, + "grad_norm": 1.725446343421936, + "learning_rate": 4.615286090773172e-05, + "loss": 5.0645, + "step": 30088 + }, + { + "epoch": 0.17894780664192597, + "grad_norm": 1.3943272829055786, + "learning_rate": 4.6152611939923384e-05, + "loss": 4.9102, + "step": 30089 + }, + { + "epoch": 0.17895375392520696, + "grad_norm": 1.5813884735107422, + "learning_rate": 4.615236296473089e-05, + "loss": 4.9637, + "step": 30090 + }, + { + "epoch": 0.17895970120848798, + "grad_norm": 1.795130968093872, + "learning_rate": 4.6152113982154323e-05, + "loss": 4.4949, + "step": 30091 + }, + { + "epoch": 0.17896564849176896, + "grad_norm": 1.626152753829956, + "learning_rate": 4.615186499219377e-05, + "loss": 4.6137, + "step": 30092 + }, + { + "epoch": 0.17897159577504995, + "grad_norm": 1.7427598237991333, + "learning_rate": 4.6151615994849326e-05, + "loss": 4.6726, + "step": 30093 + }, + { + "epoch": 0.17897754305833097, + "grad_norm": 1.6865589618682861, + "learning_rate": 4.6151366990121065e-05, + "loss": 5.307, + "step": 30094 + }, + { + "epoch": 0.17898349034161196, + "grad_norm": 1.4603716135025024, + "learning_rate": 4.615111797800908e-05, + "loss": 5.0782, + "step": 30095 + }, + { + "epoch": 0.17898943762489294, + "grad_norm": 1.6204586029052734, + "learning_rate": 4.615086895851346e-05, + "loss": 4.6813, + "step": 30096 + }, + { + "epoch": 0.17899538490817393, + "grad_norm": 1.6653324365615845, + "learning_rate": 4.615061993163429e-05, + "loss": 4.9387, + "step": 30097 + }, + { + "epoch": 0.17900133219145495, + "grad_norm": 1.4770258665084839, + "learning_rate": 4.6150370897371664e-05, + "loss": 5.3575, + "step": 30098 + }, + { + "epoch": 0.17900727947473594, + "grad_norm": 1.7126123905181885, + "learning_rate": 4.615012185572565e-05, + "loss": 5.2374, + "step": 30099 + }, + { + "epoch": 0.17901322675801692, + "grad_norm": 1.6398087739944458, + "learning_rate": 4.614987280669635e-05, + "loss": 5.3829, + "step": 30100 + }, + { + "epoch": 0.17901917404129794, + "grad_norm": 1.651924729347229, + "learning_rate": 4.6149623750283854e-05, + "loss": 5.2819, + "step": 30101 + }, + { + "epoch": 0.17902512132457893, + "grad_norm": 1.441523790359497, + "learning_rate": 4.6149374686488245e-05, + "loss": 5.1186, + "step": 30102 + }, + { + "epoch": 0.17903106860785992, + "grad_norm": 1.5080089569091797, + "learning_rate": 4.61491256153096e-05, + "loss": 5.4575, + "step": 30103 + }, + { + "epoch": 0.17903701589114093, + "grad_norm": 1.8069994449615479, + "learning_rate": 4.6148876536748017e-05, + "loss": 4.7502, + "step": 30104 + }, + { + "epoch": 0.17904296317442192, + "grad_norm": 1.6729295253753662, + "learning_rate": 4.6148627450803573e-05, + "loss": 5.6077, + "step": 30105 + }, + { + "epoch": 0.1790489104577029, + "grad_norm": 1.703140377998352, + "learning_rate": 4.614837835747637e-05, + "loss": 5.0991, + "step": 30106 + }, + { + "epoch": 0.17905485774098392, + "grad_norm": 1.6417967081069946, + "learning_rate": 4.614812925676648e-05, + "loss": 5.0502, + "step": 30107 + }, + { + "epoch": 0.1790608050242649, + "grad_norm": 1.5912690162658691, + "learning_rate": 4.6147880148674006e-05, + "loss": 5.1349, + "step": 30108 + }, + { + "epoch": 0.1790667523075459, + "grad_norm": 1.4695717096328735, + "learning_rate": 4.6147631033199026e-05, + "loss": 5.1189, + "step": 30109 + }, + { + "epoch": 0.17907269959082692, + "grad_norm": 1.865962266921997, + "learning_rate": 4.614738191034161e-05, + "loss": 4.9236, + "step": 30110 + }, + { + "epoch": 0.1790786468741079, + "grad_norm": 1.6190448999404907, + "learning_rate": 4.614713278010188e-05, + "loss": 5.6018, + "step": 30111 + }, + { + "epoch": 0.1790845941573889, + "grad_norm": 1.6233062744140625, + "learning_rate": 4.614688364247989e-05, + "loss": 5.5866, + "step": 30112 + }, + { + "epoch": 0.1790905414406699, + "grad_norm": 1.8465989828109741, + "learning_rate": 4.614663449747575e-05, + "loss": 4.9602, + "step": 30113 + }, + { + "epoch": 0.1790964887239509, + "grad_norm": 2.509408950805664, + "learning_rate": 4.614638534508954e-05, + "loss": 4.4523, + "step": 30114 + }, + { + "epoch": 0.17910243600723189, + "grad_norm": 2.0963387489318848, + "learning_rate": 4.6146136185321336e-05, + "loss": 4.6302, + "step": 30115 + }, + { + "epoch": 0.1791083832905129, + "grad_norm": 2.2663495540618896, + "learning_rate": 4.614588701817124e-05, + "loss": 4.4798, + "step": 30116 + }, + { + "epoch": 0.1791143305737939, + "grad_norm": 3.107478380203247, + "learning_rate": 4.6145637843639336e-05, + "loss": 4.4151, + "step": 30117 + }, + { + "epoch": 0.17912027785707488, + "grad_norm": 2.5192575454711914, + "learning_rate": 4.614538866172571e-05, + "loss": 4.3541, + "step": 30118 + }, + { + "epoch": 0.1791262251403559, + "grad_norm": 2.0473275184631348, + "learning_rate": 4.614513947243044e-05, + "loss": 4.9031, + "step": 30119 + }, + { + "epoch": 0.17913217242363688, + "grad_norm": 2.1869711875915527, + "learning_rate": 4.6144890275753614e-05, + "loss": 4.4643, + "step": 30120 + }, + { + "epoch": 0.17913811970691787, + "grad_norm": 2.027974843978882, + "learning_rate": 4.614464107169534e-05, + "loss": 4.4752, + "step": 30121 + }, + { + "epoch": 0.17914406699019889, + "grad_norm": 1.6507370471954346, + "learning_rate": 4.614439186025569e-05, + "loss": 5.1633, + "step": 30122 + }, + { + "epoch": 0.17915001427347987, + "grad_norm": 1.7081741094589233, + "learning_rate": 4.614414264143474e-05, + "loss": 5.2202, + "step": 30123 + }, + { + "epoch": 0.17915596155676086, + "grad_norm": 1.6631501913070679, + "learning_rate": 4.61438934152326e-05, + "loss": 5.3157, + "step": 30124 + }, + { + "epoch": 0.17916190884004188, + "grad_norm": 1.4147378206253052, + "learning_rate": 4.6143644181649336e-05, + "loss": 5.4174, + "step": 30125 + }, + { + "epoch": 0.17916785612332287, + "grad_norm": 2.0424649715423584, + "learning_rate": 4.614339494068505e-05, + "loss": 4.5279, + "step": 30126 + }, + { + "epoch": 0.17917380340660385, + "grad_norm": 1.8058947324752808, + "learning_rate": 4.614314569233982e-05, + "loss": 4.7826, + "step": 30127 + }, + { + "epoch": 0.17917975068988487, + "grad_norm": 2.241539478302002, + "learning_rate": 4.6142896436613735e-05, + "loss": 4.6077, + "step": 30128 + }, + { + "epoch": 0.17918569797316586, + "grad_norm": 2.598933696746826, + "learning_rate": 4.614264717350688e-05, + "loss": 4.4264, + "step": 30129 + }, + { + "epoch": 0.17919164525644685, + "grad_norm": 2.471510887145996, + "learning_rate": 4.614239790301935e-05, + "loss": 4.2513, + "step": 30130 + }, + { + "epoch": 0.17919759253972786, + "grad_norm": 2.7215542793273926, + "learning_rate": 4.6142148625151235e-05, + "loss": 4.3288, + "step": 30131 + }, + { + "epoch": 0.17920353982300885, + "grad_norm": 1.9755866527557373, + "learning_rate": 4.61418993399026e-05, + "loss": 4.5703, + "step": 30132 + }, + { + "epoch": 0.17920948710628984, + "grad_norm": 2.450087070465088, + "learning_rate": 4.614165004727356e-05, + "loss": 4.4569, + "step": 30133 + }, + { + "epoch": 0.17921543438957085, + "grad_norm": 1.7952730655670166, + "learning_rate": 4.614140074726419e-05, + "loss": 5.1308, + "step": 30134 + }, + { + "epoch": 0.17922138167285184, + "grad_norm": 1.4159260988235474, + "learning_rate": 4.614115143987456e-05, + "loss": 5.2613, + "step": 30135 + }, + { + "epoch": 0.17922732895613283, + "grad_norm": 1.546238899230957, + "learning_rate": 4.614090212510478e-05, + "loss": 5.2523, + "step": 30136 + }, + { + "epoch": 0.17923327623941385, + "grad_norm": 1.439784288406372, + "learning_rate": 4.614065280295493e-05, + "loss": 5.3594, + "step": 30137 + }, + { + "epoch": 0.17923922352269483, + "grad_norm": 1.421764612197876, + "learning_rate": 4.6140403473425096e-05, + "loss": 5.1889, + "step": 30138 + }, + { + "epoch": 0.17924517080597582, + "grad_norm": 1.5206106901168823, + "learning_rate": 4.614015413651537e-05, + "loss": 5.1252, + "step": 30139 + }, + { + "epoch": 0.17925111808925684, + "grad_norm": 1.8457632064819336, + "learning_rate": 4.613990479222582e-05, + "loss": 5.2534, + "step": 30140 + }, + { + "epoch": 0.17925706537253783, + "grad_norm": 1.5591540336608887, + "learning_rate": 4.613965544055656e-05, + "loss": 5.5879, + "step": 30141 + }, + { + "epoch": 0.17926301265581882, + "grad_norm": 1.6546518802642822, + "learning_rate": 4.613940608150766e-05, + "loss": 5.4731, + "step": 30142 + }, + { + "epoch": 0.17926895993909983, + "grad_norm": 1.7547178268432617, + "learning_rate": 4.613915671507922e-05, + "loss": 5.2478, + "step": 30143 + }, + { + "epoch": 0.17927490722238082, + "grad_norm": 1.7758798599243164, + "learning_rate": 4.613890734127131e-05, + "loss": 5.2217, + "step": 30144 + }, + { + "epoch": 0.1792808545056618, + "grad_norm": 1.7525664567947388, + "learning_rate": 4.613865796008403e-05, + "loss": 5.1914, + "step": 30145 + }, + { + "epoch": 0.17928680178894282, + "grad_norm": 1.907631754875183, + "learning_rate": 4.6138408571517464e-05, + "loss": 5.4735, + "step": 30146 + }, + { + "epoch": 0.1792927490722238, + "grad_norm": 1.658576250076294, + "learning_rate": 4.6138159175571694e-05, + "loss": 4.9081, + "step": 30147 + }, + { + "epoch": 0.1792986963555048, + "grad_norm": 2.537595272064209, + "learning_rate": 4.613790977224681e-05, + "loss": 4.0906, + "step": 30148 + }, + { + "epoch": 0.17930464363878582, + "grad_norm": 2.0535919666290283, + "learning_rate": 4.613766036154291e-05, + "loss": 4.5155, + "step": 30149 + }, + { + "epoch": 0.1793105909220668, + "grad_norm": 1.588181734085083, + "learning_rate": 4.6137410943460056e-05, + "loss": 5.1572, + "step": 30150 + }, + { + "epoch": 0.1793165382053478, + "grad_norm": 1.74554443359375, + "learning_rate": 4.613716151799836e-05, + "loss": 4.9274, + "step": 30151 + }, + { + "epoch": 0.1793224854886288, + "grad_norm": 1.638634204864502, + "learning_rate": 4.61369120851579e-05, + "loss": 5.3069, + "step": 30152 + }, + { + "epoch": 0.1793284327719098, + "grad_norm": 1.5783028602600098, + "learning_rate": 4.613666264493876e-05, + "loss": 4.9436, + "step": 30153 + }, + { + "epoch": 0.17933438005519078, + "grad_norm": 1.7508025169372559, + "learning_rate": 4.613641319734103e-05, + "loss": 5.5884, + "step": 30154 + }, + { + "epoch": 0.17934032733847177, + "grad_norm": 1.8591163158416748, + "learning_rate": 4.6136163742364794e-05, + "loss": 5.4054, + "step": 30155 + }, + { + "epoch": 0.1793462746217528, + "grad_norm": 1.6123576164245605, + "learning_rate": 4.6135914280010144e-05, + "loss": 5.1458, + "step": 30156 + }, + { + "epoch": 0.17935222190503378, + "grad_norm": 3.3494856357574463, + "learning_rate": 4.613566481027716e-05, + "loss": 5.457, + "step": 30157 + }, + { + "epoch": 0.17935816918831476, + "grad_norm": 1.7815282344818115, + "learning_rate": 4.613541533316594e-05, + "loss": 5.619, + "step": 30158 + }, + { + "epoch": 0.17936411647159578, + "grad_norm": 1.8669323921203613, + "learning_rate": 4.6135165848676567e-05, + "loss": 5.2181, + "step": 30159 + }, + { + "epoch": 0.17937006375487677, + "grad_norm": 2.775512218475342, + "learning_rate": 4.613491635680912e-05, + "loss": 5.274, + "step": 30160 + }, + { + "epoch": 0.17937601103815776, + "grad_norm": 1.3478049039840698, + "learning_rate": 4.613466685756369e-05, + "loss": 5.6895, + "step": 30161 + }, + { + "epoch": 0.17938195832143877, + "grad_norm": 1.3616020679473877, + "learning_rate": 4.6134417350940376e-05, + "loss": 5.4167, + "step": 30162 + }, + { + "epoch": 0.17938790560471976, + "grad_norm": 1.6133387088775635, + "learning_rate": 4.613416783693925e-05, + "loss": 5.4037, + "step": 30163 + }, + { + "epoch": 0.17939385288800075, + "grad_norm": 1.5833585262298584, + "learning_rate": 4.61339183155604e-05, + "loss": 5.5905, + "step": 30164 + }, + { + "epoch": 0.17939980017128176, + "grad_norm": 1.5497944355010986, + "learning_rate": 4.613366878680392e-05, + "loss": 5.5072, + "step": 30165 + }, + { + "epoch": 0.17940574745456275, + "grad_norm": 1.4450465440750122, + "learning_rate": 4.6133419250669893e-05, + "loss": 5.391, + "step": 30166 + }, + { + "epoch": 0.17941169473784374, + "grad_norm": 1.4759451150894165, + "learning_rate": 4.6133169707158415e-05, + "loss": 5.4819, + "step": 30167 + }, + { + "epoch": 0.17941764202112476, + "grad_norm": 1.576032280921936, + "learning_rate": 4.613292015626956e-05, + "loss": 5.0684, + "step": 30168 + }, + { + "epoch": 0.17942358930440575, + "grad_norm": 1.3601480722427368, + "learning_rate": 4.613267059800342e-05, + "loss": 5.3811, + "step": 30169 + }, + { + "epoch": 0.17942953658768673, + "grad_norm": 1.4551454782485962, + "learning_rate": 4.6132421032360084e-05, + "loss": 5.3851, + "step": 30170 + }, + { + "epoch": 0.17943548387096775, + "grad_norm": 1.425933837890625, + "learning_rate": 4.613217145933964e-05, + "loss": 5.2831, + "step": 30171 + }, + { + "epoch": 0.17944143115424874, + "grad_norm": 1.53054678440094, + "learning_rate": 4.613192187894218e-05, + "loss": 5.3126, + "step": 30172 + }, + { + "epoch": 0.17944737843752973, + "grad_norm": 1.5513275861740112, + "learning_rate": 4.613167229116777e-05, + "loss": 5.1566, + "step": 30173 + }, + { + "epoch": 0.17945332572081074, + "grad_norm": 1.659415364265442, + "learning_rate": 4.613142269601652e-05, + "loss": 5.2562, + "step": 30174 + }, + { + "epoch": 0.17945927300409173, + "grad_norm": 2.1108832359313965, + "learning_rate": 4.6131173093488506e-05, + "loss": 5.2514, + "step": 30175 + }, + { + "epoch": 0.17946522028737272, + "grad_norm": 2.187035083770752, + "learning_rate": 4.613092348358382e-05, + "loss": 5.0864, + "step": 30176 + }, + { + "epoch": 0.17947116757065373, + "grad_norm": 1.9420459270477295, + "learning_rate": 4.613067386630254e-05, + "loss": 5.1925, + "step": 30177 + }, + { + "epoch": 0.17947711485393472, + "grad_norm": 2.1403605937957764, + "learning_rate": 4.6130424241644765e-05, + "loss": 5.104, + "step": 30178 + }, + { + "epoch": 0.1794830621372157, + "grad_norm": 1.567936897277832, + "learning_rate": 4.6130174609610584e-05, + "loss": 5.2187, + "step": 30179 + }, + { + "epoch": 0.17948900942049673, + "grad_norm": 1.7955834865570068, + "learning_rate": 4.612992497020007e-05, + "loss": 4.9966, + "step": 30180 + }, + { + "epoch": 0.1794949567037777, + "grad_norm": 1.5525354146957397, + "learning_rate": 4.612967532341332e-05, + "loss": 5.4877, + "step": 30181 + }, + { + "epoch": 0.1795009039870587, + "grad_norm": 2.041837692260742, + "learning_rate": 4.6129425669250416e-05, + "loss": 4.1008, + "step": 30182 + }, + { + "epoch": 0.17950685127033972, + "grad_norm": 1.7052921056747437, + "learning_rate": 4.612917600771145e-05, + "loss": 5.0677, + "step": 30183 + }, + { + "epoch": 0.1795127985536207, + "grad_norm": 1.8973312377929688, + "learning_rate": 4.6128926338796505e-05, + "loss": 5.0398, + "step": 30184 + }, + { + "epoch": 0.1795187458369017, + "grad_norm": 1.696648120880127, + "learning_rate": 4.612867666250567e-05, + "loss": 4.9444, + "step": 30185 + }, + { + "epoch": 0.1795246931201827, + "grad_norm": 1.3013113737106323, + "learning_rate": 4.6128426978839034e-05, + "loss": 5.2848, + "step": 30186 + }, + { + "epoch": 0.1795306404034637, + "grad_norm": 1.6199315786361694, + "learning_rate": 4.612817728779668e-05, + "loss": 5.3178, + "step": 30187 + }, + { + "epoch": 0.1795365876867447, + "grad_norm": 1.5402096509933472, + "learning_rate": 4.612792758937871e-05, + "loss": 5.3269, + "step": 30188 + }, + { + "epoch": 0.1795425349700257, + "grad_norm": 1.4913876056671143, + "learning_rate": 4.612767788358518e-05, + "loss": 5.1784, + "step": 30189 + }, + { + "epoch": 0.1795484822533067, + "grad_norm": 1.6965476274490356, + "learning_rate": 4.6127428170416203e-05, + "loss": 4.9591, + "step": 30190 + }, + { + "epoch": 0.17955442953658768, + "grad_norm": 1.71049964427948, + "learning_rate": 4.612717844987186e-05, + "loss": 5.6287, + "step": 30191 + }, + { + "epoch": 0.1795603768198687, + "grad_norm": 1.6330054998397827, + "learning_rate": 4.612692872195224e-05, + "loss": 4.9837, + "step": 30192 + }, + { + "epoch": 0.17956632410314968, + "grad_norm": 1.6912428140640259, + "learning_rate": 4.6126678986657424e-05, + "loss": 5.2847, + "step": 30193 + }, + { + "epoch": 0.17957227138643067, + "grad_norm": 1.4812456369400024, + "learning_rate": 4.61264292439875e-05, + "loss": 5.1037, + "step": 30194 + }, + { + "epoch": 0.1795782186697117, + "grad_norm": 1.8161237239837646, + "learning_rate": 4.612617949394257e-05, + "loss": 5.141, + "step": 30195 + }, + { + "epoch": 0.17958416595299267, + "grad_norm": 1.7827249765396118, + "learning_rate": 4.61259297365227e-05, + "loss": 5.184, + "step": 30196 + }, + { + "epoch": 0.17959011323627366, + "grad_norm": 2.4642884731292725, + "learning_rate": 4.612567997172798e-05, + "loss": 3.7194, + "step": 30197 + }, + { + "epoch": 0.17959606051955468, + "grad_norm": 1.680345892906189, + "learning_rate": 4.6125430199558515e-05, + "loss": 4.8509, + "step": 30198 + }, + { + "epoch": 0.17960200780283567, + "grad_norm": 1.510986566543579, + "learning_rate": 4.612518042001437e-05, + "loss": 5.4374, + "step": 30199 + }, + { + "epoch": 0.17960795508611666, + "grad_norm": 1.333565592765808, + "learning_rate": 4.612493063309565e-05, + "loss": 5.5605, + "step": 30200 + }, + { + "epoch": 0.17961390236939767, + "grad_norm": 1.5686737298965454, + "learning_rate": 4.612468083880244e-05, + "loss": 5.5275, + "step": 30201 + }, + { + "epoch": 0.17961984965267866, + "grad_norm": 1.4697351455688477, + "learning_rate": 4.6124431037134805e-05, + "loss": 5.2846, + "step": 30202 + }, + { + "epoch": 0.17962579693595965, + "grad_norm": 1.553658127784729, + "learning_rate": 4.612418122809286e-05, + "loss": 5.339, + "step": 30203 + }, + { + "epoch": 0.17963174421924066, + "grad_norm": 1.8520125150680542, + "learning_rate": 4.612393141167669e-05, + "loss": 4.5566, + "step": 30204 + }, + { + "epoch": 0.17963769150252165, + "grad_norm": 1.694443702697754, + "learning_rate": 4.6123681587886356e-05, + "loss": 5.094, + "step": 30205 + }, + { + "epoch": 0.17964363878580264, + "grad_norm": 1.6626062393188477, + "learning_rate": 4.612343175672198e-05, + "loss": 5.0872, + "step": 30206 + }, + { + "epoch": 0.17964958606908366, + "grad_norm": 1.8352187871932983, + "learning_rate": 4.612318191818362e-05, + "loss": 4.0188, + "step": 30207 + }, + { + "epoch": 0.17965553335236464, + "grad_norm": 1.8127634525299072, + "learning_rate": 4.6122932072271385e-05, + "loss": 4.3894, + "step": 30208 + }, + { + "epoch": 0.17966148063564563, + "grad_norm": 1.7093063592910767, + "learning_rate": 4.612268221898535e-05, + "loss": 4.9074, + "step": 30209 + }, + { + "epoch": 0.17966742791892665, + "grad_norm": 1.5482558012008667, + "learning_rate": 4.61224323583256e-05, + "loss": 4.9126, + "step": 30210 + }, + { + "epoch": 0.17967337520220764, + "grad_norm": 1.6215821504592896, + "learning_rate": 4.612218249029223e-05, + "loss": 4.9809, + "step": 30211 + }, + { + "epoch": 0.17967932248548862, + "grad_norm": 1.4449799060821533, + "learning_rate": 4.6121932614885324e-05, + "loss": 4.6292, + "step": 30212 + }, + { + "epoch": 0.17968526976876964, + "grad_norm": 1.6439566612243652, + "learning_rate": 4.612168273210496e-05, + "loss": 4.7304, + "step": 30213 + }, + { + "epoch": 0.17969121705205063, + "grad_norm": 1.566293716430664, + "learning_rate": 4.6121432841951254e-05, + "loss": 4.7359, + "step": 30214 + }, + { + "epoch": 0.17969716433533162, + "grad_norm": 1.3864619731903076, + "learning_rate": 4.612118294442426e-05, + "loss": 4.737, + "step": 30215 + }, + { + "epoch": 0.1797031116186126, + "grad_norm": 1.5013184547424316, + "learning_rate": 4.6120933039524087e-05, + "loss": 4.2283, + "step": 30216 + }, + { + "epoch": 0.17970905890189362, + "grad_norm": 1.6304489374160767, + "learning_rate": 4.612068312725081e-05, + "loss": 4.6443, + "step": 30217 + }, + { + "epoch": 0.1797150061851746, + "grad_norm": 1.512584924697876, + "learning_rate": 4.612043320760452e-05, + "loss": 4.6994, + "step": 30218 + }, + { + "epoch": 0.1797209534684556, + "grad_norm": 1.726282000541687, + "learning_rate": 4.612018328058531e-05, + "loss": 4.7797, + "step": 30219 + }, + { + "epoch": 0.1797269007517366, + "grad_norm": 1.9200310707092285, + "learning_rate": 4.611993334619326e-05, + "loss": 4.3433, + "step": 30220 + }, + { + "epoch": 0.1797328480350176, + "grad_norm": 1.6189771890640259, + "learning_rate": 4.611968340442845e-05, + "loss": 4.8367, + "step": 30221 + }, + { + "epoch": 0.1797387953182986, + "grad_norm": 1.6399370431900024, + "learning_rate": 4.6119433455290985e-05, + "loss": 5.169, + "step": 30222 + }, + { + "epoch": 0.1797447426015796, + "grad_norm": 1.8152503967285156, + "learning_rate": 4.611918349878095e-05, + "loss": 5.4301, + "step": 30223 + }, + { + "epoch": 0.1797506898848606, + "grad_norm": 1.5096112489700317, + "learning_rate": 4.611893353489841e-05, + "loss": 5.3012, + "step": 30224 + }, + { + "epoch": 0.17975663716814158, + "grad_norm": 1.5811582803726196, + "learning_rate": 4.611868356364348e-05, + "loss": 4.65, + "step": 30225 + }, + { + "epoch": 0.1797625844514226, + "grad_norm": 1.6519943475723267, + "learning_rate": 4.611843358501624e-05, + "loss": 5.1057, + "step": 30226 + }, + { + "epoch": 0.17976853173470358, + "grad_norm": 1.5644969940185547, + "learning_rate": 4.611818359901676e-05, + "loss": 5.3185, + "step": 30227 + }, + { + "epoch": 0.17977447901798457, + "grad_norm": 1.344948410987854, + "learning_rate": 4.611793360564515e-05, + "loss": 5.2657, + "step": 30228 + }, + { + "epoch": 0.1797804263012656, + "grad_norm": 1.5945618152618408, + "learning_rate": 4.6117683604901485e-05, + "loss": 5.197, + "step": 30229 + }, + { + "epoch": 0.17978637358454658, + "grad_norm": 1.41254460811615, + "learning_rate": 4.6117433596785855e-05, + "loss": 5.1255, + "step": 30230 + }, + { + "epoch": 0.17979232086782757, + "grad_norm": 1.7176563739776611, + "learning_rate": 4.611718358129835e-05, + "loss": 4.9022, + "step": 30231 + }, + { + "epoch": 0.17979826815110858, + "grad_norm": 1.7248926162719727, + "learning_rate": 4.611693355843905e-05, + "loss": 4.9377, + "step": 30232 + }, + { + "epoch": 0.17980421543438957, + "grad_norm": 1.551627516746521, + "learning_rate": 4.611668352820805e-05, + "loss": 5.2027, + "step": 30233 + }, + { + "epoch": 0.17981016271767056, + "grad_norm": 1.5485448837280273, + "learning_rate": 4.6116433490605435e-05, + "loss": 5.2818, + "step": 30234 + }, + { + "epoch": 0.17981611000095157, + "grad_norm": 1.5185739994049072, + "learning_rate": 4.611618344563129e-05, + "loss": 5.2565, + "step": 30235 + }, + { + "epoch": 0.17982205728423256, + "grad_norm": 1.3627973794937134, + "learning_rate": 4.61159333932857e-05, + "loss": 5.227, + "step": 30236 + }, + { + "epoch": 0.17982800456751355, + "grad_norm": 1.1721487045288086, + "learning_rate": 4.611568333356876e-05, + "loss": 5.244, + "step": 30237 + }, + { + "epoch": 0.17983395185079457, + "grad_norm": 1.4845436811447144, + "learning_rate": 4.611543326648055e-05, + "loss": 5.0118, + "step": 30238 + }, + { + "epoch": 0.17983989913407555, + "grad_norm": 1.733625888824463, + "learning_rate": 4.611518319202116e-05, + "loss": 5.2007, + "step": 30239 + }, + { + "epoch": 0.17984584641735654, + "grad_norm": 1.570659875869751, + "learning_rate": 4.611493311019068e-05, + "loss": 5.1015, + "step": 30240 + }, + { + "epoch": 0.17985179370063756, + "grad_norm": 1.5972294807434082, + "learning_rate": 4.611468302098919e-05, + "loss": 5.0667, + "step": 30241 + }, + { + "epoch": 0.17985774098391855, + "grad_norm": 1.5389827489852905, + "learning_rate": 4.611443292441678e-05, + "loss": 5.1393, + "step": 30242 + }, + { + "epoch": 0.17986368826719953, + "grad_norm": 1.778263807296753, + "learning_rate": 4.611418282047355e-05, + "loss": 4.6927, + "step": 30243 + }, + { + "epoch": 0.17986963555048055, + "grad_norm": 1.642376184463501, + "learning_rate": 4.611393270915958e-05, + "loss": 4.9779, + "step": 30244 + }, + { + "epoch": 0.17987558283376154, + "grad_norm": 1.7385129928588867, + "learning_rate": 4.611368259047494e-05, + "loss": 5.0129, + "step": 30245 + }, + { + "epoch": 0.17988153011704253, + "grad_norm": 1.5934865474700928, + "learning_rate": 4.6113432464419734e-05, + "loss": 5.151, + "step": 30246 + }, + { + "epoch": 0.17988747740032354, + "grad_norm": 1.6236854791641235, + "learning_rate": 4.611318233099406e-05, + "loss": 4.915, + "step": 30247 + }, + { + "epoch": 0.17989342468360453, + "grad_norm": 1.553943157196045, + "learning_rate": 4.611293219019798e-05, + "loss": 4.9386, + "step": 30248 + }, + { + "epoch": 0.17989937196688552, + "grad_norm": 1.597655177116394, + "learning_rate": 4.61126820420316e-05, + "loss": 5.0225, + "step": 30249 + }, + { + "epoch": 0.17990531925016653, + "grad_norm": 1.4023799896240234, + "learning_rate": 4.6112431886495e-05, + "loss": 4.8045, + "step": 30250 + }, + { + "epoch": 0.17991126653344752, + "grad_norm": 1.4906047582626343, + "learning_rate": 4.611218172358826e-05, + "loss": 5.0927, + "step": 30251 + }, + { + "epoch": 0.1799172138167285, + "grad_norm": 1.5440434217453003, + "learning_rate": 4.6111931553311486e-05, + "loss": 4.8847, + "step": 30252 + }, + { + "epoch": 0.17992316110000953, + "grad_norm": 1.5937246084213257, + "learning_rate": 4.611168137566475e-05, + "loss": 4.8006, + "step": 30253 + }, + { + "epoch": 0.17992910838329051, + "grad_norm": 1.913120150566101, + "learning_rate": 4.611143119064814e-05, + "loss": 4.3196, + "step": 30254 + }, + { + "epoch": 0.1799350556665715, + "grad_norm": 2.267242908477783, + "learning_rate": 4.6111180998261754e-05, + "loss": 3.9084, + "step": 30255 + }, + { + "epoch": 0.17994100294985252, + "grad_norm": 1.9298279285430908, + "learning_rate": 4.611093079850567e-05, + "loss": 4.8176, + "step": 30256 + }, + { + "epoch": 0.1799469502331335, + "grad_norm": 2.0990922451019287, + "learning_rate": 4.6110680591379977e-05, + "loss": 3.7105, + "step": 30257 + }, + { + "epoch": 0.1799528975164145, + "grad_norm": 2.2702863216400146, + "learning_rate": 4.611043037688477e-05, + "loss": 3.67, + "step": 30258 + }, + { + "epoch": 0.1799588447996955, + "grad_norm": 1.7797553539276123, + "learning_rate": 4.6110180155020124e-05, + "loss": 4.2347, + "step": 30259 + }, + { + "epoch": 0.1799647920829765, + "grad_norm": 1.827901005744934, + "learning_rate": 4.610992992578613e-05, + "loss": 4.8346, + "step": 30260 + }, + { + "epoch": 0.1799707393662575, + "grad_norm": 1.8111793994903564, + "learning_rate": 4.610967968918288e-05, + "loss": 4.429, + "step": 30261 + }, + { + "epoch": 0.1799766866495385, + "grad_norm": 1.7809714078903198, + "learning_rate": 4.610942944521046e-05, + "loss": 4.8362, + "step": 30262 + }, + { + "epoch": 0.1799826339328195, + "grad_norm": 1.7556761503219604, + "learning_rate": 4.610917919386895e-05, + "loss": 4.5426, + "step": 30263 + }, + { + "epoch": 0.17998858121610048, + "grad_norm": 2.094663381576538, + "learning_rate": 4.6108928935158457e-05, + "loss": 3.9912, + "step": 30264 + }, + { + "epoch": 0.1799945284993815, + "grad_norm": 2.4183871746063232, + "learning_rate": 4.610867866907905e-05, + "loss": 3.5367, + "step": 30265 + }, + { + "epoch": 0.18000047578266248, + "grad_norm": 1.9817161560058594, + "learning_rate": 4.610842839563082e-05, + "loss": 4.1249, + "step": 30266 + }, + { + "epoch": 0.18000642306594347, + "grad_norm": 1.8227890729904175, + "learning_rate": 4.610817811481385e-05, + "loss": 4.729, + "step": 30267 + }, + { + "epoch": 0.1800123703492245, + "grad_norm": 1.8719122409820557, + "learning_rate": 4.610792782662824e-05, + "loss": 4.6823, + "step": 30268 + }, + { + "epoch": 0.18001831763250548, + "grad_norm": 1.8727600574493408, + "learning_rate": 4.6107677531074075e-05, + "loss": 4.2555, + "step": 30269 + }, + { + "epoch": 0.18002426491578646, + "grad_norm": 1.526989221572876, + "learning_rate": 4.610742722815143e-05, + "loss": 4.9572, + "step": 30270 + }, + { + "epoch": 0.18003021219906748, + "grad_norm": 1.7702364921569824, + "learning_rate": 4.61071769178604e-05, + "loss": 4.6608, + "step": 30271 + }, + { + "epoch": 0.18003615948234847, + "grad_norm": 1.7519408464431763, + "learning_rate": 4.610692660020107e-05, + "loss": 4.6615, + "step": 30272 + }, + { + "epoch": 0.18004210676562946, + "grad_norm": 1.6772125959396362, + "learning_rate": 4.610667627517354e-05, + "loss": 4.5468, + "step": 30273 + }, + { + "epoch": 0.18004805404891044, + "grad_norm": 1.8781254291534424, + "learning_rate": 4.610642594277788e-05, + "loss": 4.4068, + "step": 30274 + }, + { + "epoch": 0.18005400133219146, + "grad_norm": 1.6861200332641602, + "learning_rate": 4.610617560301419e-05, + "loss": 4.5556, + "step": 30275 + }, + { + "epoch": 0.18005994861547245, + "grad_norm": 1.7441620826721191, + "learning_rate": 4.6105925255882545e-05, + "loss": 4.7047, + "step": 30276 + }, + { + "epoch": 0.18006589589875344, + "grad_norm": 1.5301376581192017, + "learning_rate": 4.6105674901383044e-05, + "loss": 4.7173, + "step": 30277 + }, + { + "epoch": 0.18007184318203445, + "grad_norm": 1.9823702573776245, + "learning_rate": 4.6105424539515765e-05, + "loss": 4.6475, + "step": 30278 + }, + { + "epoch": 0.18007779046531544, + "grad_norm": 1.7281779050827026, + "learning_rate": 4.6105174170280805e-05, + "loss": 4.5832, + "step": 30279 + }, + { + "epoch": 0.18008373774859643, + "grad_norm": 1.5739697217941284, + "learning_rate": 4.610492379367824e-05, + "loss": 4.6732, + "step": 30280 + }, + { + "epoch": 0.18008968503187744, + "grad_norm": 1.771346092224121, + "learning_rate": 4.6104673409708175e-05, + "loss": 4.4008, + "step": 30281 + }, + { + "epoch": 0.18009563231515843, + "grad_norm": 2.198194980621338, + "learning_rate": 4.610442301837068e-05, + "loss": 4.2249, + "step": 30282 + }, + { + "epoch": 0.18010157959843942, + "grad_norm": 1.7576837539672852, + "learning_rate": 4.610417261966585e-05, + "loss": 4.9569, + "step": 30283 + }, + { + "epoch": 0.18010752688172044, + "grad_norm": 1.849458932876587, + "learning_rate": 4.6103922213593775e-05, + "loss": 4.2704, + "step": 30284 + }, + { + "epoch": 0.18011347416500142, + "grad_norm": 1.8416085243225098, + "learning_rate": 4.610367180015454e-05, + "loss": 4.2699, + "step": 30285 + }, + { + "epoch": 0.1801194214482824, + "grad_norm": 1.8305091857910156, + "learning_rate": 4.610342137934822e-05, + "loss": 4.2922, + "step": 30286 + }, + { + "epoch": 0.18012536873156343, + "grad_norm": 2.0292394161224365, + "learning_rate": 4.6103170951174924e-05, + "loss": 4.1851, + "step": 30287 + }, + { + "epoch": 0.18013131601484442, + "grad_norm": 1.7172123193740845, + "learning_rate": 4.610292051563473e-05, + "loss": 4.4749, + "step": 30288 + }, + { + "epoch": 0.1801372632981254, + "grad_norm": 1.787654995918274, + "learning_rate": 4.610267007272772e-05, + "loss": 4.143, + "step": 30289 + }, + { + "epoch": 0.18014321058140642, + "grad_norm": 1.9985861778259277, + "learning_rate": 4.6102419622453985e-05, + "loss": 4.6417, + "step": 30290 + }, + { + "epoch": 0.1801491578646874, + "grad_norm": 1.7196992635726929, + "learning_rate": 4.610216916481361e-05, + "loss": 5.1097, + "step": 30291 + }, + { + "epoch": 0.1801551051479684, + "grad_norm": 1.5344418287277222, + "learning_rate": 4.610191869980669e-05, + "loss": 5.011, + "step": 30292 + }, + { + "epoch": 0.1801610524312494, + "grad_norm": 2.262801170349121, + "learning_rate": 4.610166822743331e-05, + "loss": 4.302, + "step": 30293 + }, + { + "epoch": 0.1801669997145304, + "grad_norm": 1.6699048280715942, + "learning_rate": 4.610141774769355e-05, + "loss": 4.6451, + "step": 30294 + }, + { + "epoch": 0.1801729469978114, + "grad_norm": 1.636252999305725, + "learning_rate": 4.6101167260587506e-05, + "loss": 4.5226, + "step": 30295 + }, + { + "epoch": 0.1801788942810924, + "grad_norm": 1.6654448509216309, + "learning_rate": 4.610091676611527e-05, + "loss": 4.8778, + "step": 30296 + }, + { + "epoch": 0.1801848415643734, + "grad_norm": 1.832134485244751, + "learning_rate": 4.610066626427691e-05, + "loss": 4.8301, + "step": 30297 + }, + { + "epoch": 0.18019078884765438, + "grad_norm": 1.5756455659866333, + "learning_rate": 4.6100415755072536e-05, + "loss": 4.8757, + "step": 30298 + }, + { + "epoch": 0.1801967361309354, + "grad_norm": 1.5991398096084595, + "learning_rate": 4.610016523850222e-05, + "loss": 4.8261, + "step": 30299 + }, + { + "epoch": 0.18020268341421639, + "grad_norm": 1.5322027206420898, + "learning_rate": 4.609991471456605e-05, + "loss": 5.0081, + "step": 30300 + }, + { + "epoch": 0.18020863069749737, + "grad_norm": 1.6513683795928955, + "learning_rate": 4.6099664183264126e-05, + "loss": 4.8251, + "step": 30301 + }, + { + "epoch": 0.1802145779807784, + "grad_norm": 2.100013494491577, + "learning_rate": 4.609941364459652e-05, + "loss": 4.2063, + "step": 30302 + }, + { + "epoch": 0.18022052526405938, + "grad_norm": 1.8772211074829102, + "learning_rate": 4.609916309856333e-05, + "loss": 5.0742, + "step": 30303 + }, + { + "epoch": 0.18022647254734037, + "grad_norm": 1.5628682374954224, + "learning_rate": 4.609891254516464e-05, + "loss": 5.0105, + "step": 30304 + }, + { + "epoch": 0.18023241983062138, + "grad_norm": 1.633851170539856, + "learning_rate": 4.6098661984400535e-05, + "loss": 4.8846, + "step": 30305 + }, + { + "epoch": 0.18023836711390237, + "grad_norm": 1.6528682708740234, + "learning_rate": 4.609841141627111e-05, + "loss": 4.9063, + "step": 30306 + }, + { + "epoch": 0.18024431439718336, + "grad_norm": 1.699247121810913, + "learning_rate": 4.609816084077645e-05, + "loss": 4.5751, + "step": 30307 + }, + { + "epoch": 0.18025026168046437, + "grad_norm": 1.6774038076400757, + "learning_rate": 4.609791025791663e-05, + "loss": 4.5651, + "step": 30308 + }, + { + "epoch": 0.18025620896374536, + "grad_norm": 1.695169448852539, + "learning_rate": 4.609765966769175e-05, + "loss": 4.5995, + "step": 30309 + }, + { + "epoch": 0.18026215624702635, + "grad_norm": 1.851489543914795, + "learning_rate": 4.6097409070101905e-05, + "loss": 4.6826, + "step": 30310 + }, + { + "epoch": 0.18026810353030737, + "grad_norm": 1.683112382888794, + "learning_rate": 4.609715846514716e-05, + "loss": 4.3293, + "step": 30311 + }, + { + "epoch": 0.18027405081358835, + "grad_norm": 1.5318275690078735, + "learning_rate": 4.609690785282762e-05, + "loss": 5.6662, + "step": 30312 + }, + { + "epoch": 0.18027999809686934, + "grad_norm": 2.2105138301849365, + "learning_rate": 4.609665723314337e-05, + "loss": 3.9675, + "step": 30313 + }, + { + "epoch": 0.18028594538015036, + "grad_norm": 1.7841753959655762, + "learning_rate": 4.609640660609449e-05, + "loss": 4.4832, + "step": 30314 + }, + { + "epoch": 0.18029189266343135, + "grad_norm": 1.7051490545272827, + "learning_rate": 4.6096155971681073e-05, + "loss": 4.3786, + "step": 30315 + }, + { + "epoch": 0.18029783994671233, + "grad_norm": 1.798112392425537, + "learning_rate": 4.609590532990321e-05, + "loss": 4.4915, + "step": 30316 + }, + { + "epoch": 0.18030378722999335, + "grad_norm": 1.8255062103271484, + "learning_rate": 4.6095654680760983e-05, + "loss": 4.6701, + "step": 30317 + }, + { + "epoch": 0.18030973451327434, + "grad_norm": 2.376105308532715, + "learning_rate": 4.609540402425448e-05, + "loss": 4.5314, + "step": 30318 + }, + { + "epoch": 0.18031568179655533, + "grad_norm": 1.6199541091918945, + "learning_rate": 4.609515336038379e-05, + "loss": 4.8641, + "step": 30319 + }, + { + "epoch": 0.18032162907983634, + "grad_norm": 1.8655678033828735, + "learning_rate": 4.6094902689149e-05, + "loss": 4.9883, + "step": 30320 + }, + { + "epoch": 0.18032757636311733, + "grad_norm": 1.6049344539642334, + "learning_rate": 4.6094652010550195e-05, + "loss": 5.0508, + "step": 30321 + }, + { + "epoch": 0.18033352364639832, + "grad_norm": 1.4725605249404907, + "learning_rate": 4.6094401324587464e-05, + "loss": 4.9306, + "step": 30322 + }, + { + "epoch": 0.18033947092967934, + "grad_norm": 1.4839946031570435, + "learning_rate": 4.60941506312609e-05, + "loss": 4.85, + "step": 30323 + }, + { + "epoch": 0.18034541821296032, + "grad_norm": 1.54611074924469, + "learning_rate": 4.609389993057058e-05, + "loss": 4.9655, + "step": 30324 + }, + { + "epoch": 0.1803513654962413, + "grad_norm": 1.612251877784729, + "learning_rate": 4.609364922251661e-05, + "loss": 4.4827, + "step": 30325 + }, + { + "epoch": 0.18035731277952233, + "grad_norm": 1.3921014070510864, + "learning_rate": 4.609339850709905e-05, + "loss": 4.609, + "step": 30326 + }, + { + "epoch": 0.18036326006280332, + "grad_norm": 1.7824617624282837, + "learning_rate": 4.6093147784318014e-05, + "loss": 4.7485, + "step": 30327 + }, + { + "epoch": 0.1803692073460843, + "grad_norm": 1.5730568170547485, + "learning_rate": 4.609289705417357e-05, + "loss": 5.1987, + "step": 30328 + }, + { + "epoch": 0.18037515462936532, + "grad_norm": 1.445325493812561, + "learning_rate": 4.6092646316665814e-05, + "loss": 5.0233, + "step": 30329 + }, + { + "epoch": 0.1803811019126463, + "grad_norm": 1.4553011655807495, + "learning_rate": 4.609239557179484e-05, + "loss": 5.1624, + "step": 30330 + }, + { + "epoch": 0.1803870491959273, + "grad_norm": 1.7723554372787476, + "learning_rate": 4.609214481956072e-05, + "loss": 5.3372, + "step": 30331 + }, + { + "epoch": 0.18039299647920828, + "grad_norm": 1.5231170654296875, + "learning_rate": 4.609189405996356e-05, + "loss": 5.3817, + "step": 30332 + }, + { + "epoch": 0.1803989437624893, + "grad_norm": 1.5292000770568848, + "learning_rate": 4.609164329300343e-05, + "loss": 5.1899, + "step": 30333 + }, + { + "epoch": 0.1804048910457703, + "grad_norm": 2.4883272647857666, + "learning_rate": 4.6091392518680424e-05, + "loss": 4.5457, + "step": 30334 + }, + { + "epoch": 0.18041083832905128, + "grad_norm": 2.357412815093994, + "learning_rate": 4.6091141736994635e-05, + "loss": 4.4771, + "step": 30335 + }, + { + "epoch": 0.1804167856123323, + "grad_norm": 1.6708316802978516, + "learning_rate": 4.6090890947946144e-05, + "loss": 4.8245, + "step": 30336 + }, + { + "epoch": 0.18042273289561328, + "grad_norm": 1.7885435819625854, + "learning_rate": 4.6090640151535046e-05, + "loss": 4.7263, + "step": 30337 + }, + { + "epoch": 0.18042868017889427, + "grad_norm": 2.1166250705718994, + "learning_rate": 4.609038934776142e-05, + "loss": 4.8725, + "step": 30338 + }, + { + "epoch": 0.18043462746217528, + "grad_norm": 1.6104192733764648, + "learning_rate": 4.609013853662536e-05, + "loss": 5.0208, + "step": 30339 + }, + { + "epoch": 0.18044057474545627, + "grad_norm": 1.663496494293213, + "learning_rate": 4.6089887718126945e-05, + "loss": 5.0706, + "step": 30340 + }, + { + "epoch": 0.18044652202873726, + "grad_norm": 1.571781873703003, + "learning_rate": 4.608963689226627e-05, + "loss": 5.0953, + "step": 30341 + }, + { + "epoch": 0.18045246931201828, + "grad_norm": 1.6184124946594238, + "learning_rate": 4.6089386059043415e-05, + "loss": 5.0428, + "step": 30342 + }, + { + "epoch": 0.18045841659529926, + "grad_norm": 2.4237656593322754, + "learning_rate": 4.608913521845848e-05, + "loss": 3.7821, + "step": 30343 + }, + { + "epoch": 0.18046436387858025, + "grad_norm": 2.287548065185547, + "learning_rate": 4.6088884370511545e-05, + "loss": 3.7935, + "step": 30344 + }, + { + "epoch": 0.18047031116186127, + "grad_norm": 2.1035749912261963, + "learning_rate": 4.60886335152027e-05, + "loss": 3.6729, + "step": 30345 + }, + { + "epoch": 0.18047625844514226, + "grad_norm": 1.9365202188491821, + "learning_rate": 4.608838265253203e-05, + "loss": 3.5706, + "step": 30346 + }, + { + "epoch": 0.18048220572842325, + "grad_norm": 1.8482760190963745, + "learning_rate": 4.608813178249962e-05, + "loss": 3.7941, + "step": 30347 + }, + { + "epoch": 0.18048815301170426, + "grad_norm": 1.879911184310913, + "learning_rate": 4.608788090510557e-05, + "loss": 4.0947, + "step": 30348 + }, + { + "epoch": 0.18049410029498525, + "grad_norm": 1.9760171175003052, + "learning_rate": 4.608763002034995e-05, + "loss": 3.6721, + "step": 30349 + }, + { + "epoch": 0.18050004757826624, + "grad_norm": 1.85044264793396, + "learning_rate": 4.608737912823286e-05, + "loss": 3.6058, + "step": 30350 + }, + { + "epoch": 0.18050599486154725, + "grad_norm": 1.7919642925262451, + "learning_rate": 4.6087128228754384e-05, + "loss": 3.5611, + "step": 30351 + }, + { + "epoch": 0.18051194214482824, + "grad_norm": 1.933648943901062, + "learning_rate": 4.60868773219146e-05, + "loss": 3.5883, + "step": 30352 + }, + { + "epoch": 0.18051788942810923, + "grad_norm": 1.9025899171829224, + "learning_rate": 4.6086626407713615e-05, + "loss": 3.6201, + "step": 30353 + }, + { + "epoch": 0.18052383671139025, + "grad_norm": 1.9761525392532349, + "learning_rate": 4.608637548615151e-05, + "loss": 3.6038, + "step": 30354 + }, + { + "epoch": 0.18052978399467123, + "grad_norm": 2.008164644241333, + "learning_rate": 4.608612455722836e-05, + "loss": 3.6495, + "step": 30355 + }, + { + "epoch": 0.18053573127795222, + "grad_norm": 1.7661700248718262, + "learning_rate": 4.6085873620944266e-05, + "loss": 3.6006, + "step": 30356 + }, + { + "epoch": 0.18054167856123324, + "grad_norm": 1.872231364250183, + "learning_rate": 4.608562267729931e-05, + "loss": 3.6929, + "step": 30357 + }, + { + "epoch": 0.18054762584451423, + "grad_norm": 1.8716074228286743, + "learning_rate": 4.608537172629358e-05, + "loss": 3.4804, + "step": 30358 + }, + { + "epoch": 0.1805535731277952, + "grad_norm": 1.6453325748443604, + "learning_rate": 4.608512076792717e-05, + "loss": 4.3521, + "step": 30359 + }, + { + "epoch": 0.18055952041107623, + "grad_norm": 1.9353103637695312, + "learning_rate": 4.6084869802200156e-05, + "loss": 3.5408, + "step": 30360 + }, + { + "epoch": 0.18056546769435722, + "grad_norm": 1.854251503944397, + "learning_rate": 4.6084618829112636e-05, + "loss": 3.5502, + "step": 30361 + }, + { + "epoch": 0.1805714149776382, + "grad_norm": 1.8924806118011475, + "learning_rate": 4.608436784866469e-05, + "loss": 3.4984, + "step": 30362 + }, + { + "epoch": 0.18057736226091922, + "grad_norm": 1.876546859741211, + "learning_rate": 4.608411686085641e-05, + "loss": 3.5422, + "step": 30363 + }, + { + "epoch": 0.1805833095442002, + "grad_norm": 1.81404709815979, + "learning_rate": 4.608386586568788e-05, + "loss": 3.528, + "step": 30364 + }, + { + "epoch": 0.1805892568274812, + "grad_norm": 1.6718660593032837, + "learning_rate": 4.60836148631592e-05, + "loss": 4.2733, + "step": 30365 + }, + { + "epoch": 0.18059520411076221, + "grad_norm": 1.8086154460906982, + "learning_rate": 4.6083363853270436e-05, + "loss": 4.2946, + "step": 30366 + }, + { + "epoch": 0.1806011513940432, + "grad_norm": 1.661757230758667, + "learning_rate": 4.6083112836021694e-05, + "loss": 4.6596, + "step": 30367 + }, + { + "epoch": 0.1806070986773242, + "grad_norm": 1.8891844749450684, + "learning_rate": 4.6082861811413056e-05, + "loss": 4.173, + "step": 30368 + }, + { + "epoch": 0.1806130459606052, + "grad_norm": 2.1718995571136475, + "learning_rate": 4.60826107794446e-05, + "loss": 3.6396, + "step": 30369 + }, + { + "epoch": 0.1806189932438862, + "grad_norm": 1.6074626445770264, + "learning_rate": 4.608235974011643e-05, + "loss": 4.2138, + "step": 30370 + }, + { + "epoch": 0.18062494052716718, + "grad_norm": 2.053957223892212, + "learning_rate": 4.608210869342863e-05, + "loss": 3.6579, + "step": 30371 + }, + { + "epoch": 0.1806308878104482, + "grad_norm": 2.0929627418518066, + "learning_rate": 4.6081857639381274e-05, + "loss": 3.7675, + "step": 30372 + }, + { + "epoch": 0.1806368350937292, + "grad_norm": 1.8131572008132935, + "learning_rate": 4.608160657797447e-05, + "loss": 4.482, + "step": 30373 + }, + { + "epoch": 0.18064278237701017, + "grad_norm": 1.8105684518814087, + "learning_rate": 4.608135550920829e-05, + "loss": 4.2737, + "step": 30374 + }, + { + "epoch": 0.1806487296602912, + "grad_norm": 1.7839126586914062, + "learning_rate": 4.608110443308282e-05, + "loss": 4.1166, + "step": 30375 + }, + { + "epoch": 0.18065467694357218, + "grad_norm": 1.7233171463012695, + "learning_rate": 4.6080853349598164e-05, + "loss": 4.1941, + "step": 30376 + }, + { + "epoch": 0.18066062422685317, + "grad_norm": 2.1062052249908447, + "learning_rate": 4.608060225875439e-05, + "loss": 4.5294, + "step": 30377 + }, + { + "epoch": 0.18066657151013418, + "grad_norm": 1.744558572769165, + "learning_rate": 4.6080351160551605e-05, + "loss": 5.3094, + "step": 30378 + }, + { + "epoch": 0.18067251879341517, + "grad_norm": 1.5789061784744263, + "learning_rate": 4.608010005498988e-05, + "loss": 4.7674, + "step": 30379 + }, + { + "epoch": 0.18067846607669616, + "grad_norm": 2.0195188522338867, + "learning_rate": 4.6079848942069316e-05, + "loss": 4.5897, + "step": 30380 + }, + { + "epoch": 0.18068441335997718, + "grad_norm": 1.8995375633239746, + "learning_rate": 4.6079597821789993e-05, + "loss": 3.55, + "step": 30381 + }, + { + "epoch": 0.18069036064325816, + "grad_norm": 1.9370126724243164, + "learning_rate": 4.6079346694152e-05, + "loss": 3.681, + "step": 30382 + }, + { + "epoch": 0.18069630792653915, + "grad_norm": 1.6433509588241577, + "learning_rate": 4.607909555915542e-05, + "loss": 4.6211, + "step": 30383 + }, + { + "epoch": 0.18070225520982017, + "grad_norm": 1.9012796878814697, + "learning_rate": 4.607884441680035e-05, + "loss": 4.5669, + "step": 30384 + }, + { + "epoch": 0.18070820249310116, + "grad_norm": 1.8061003684997559, + "learning_rate": 4.607859326708687e-05, + "loss": 4.4649, + "step": 30385 + }, + { + "epoch": 0.18071414977638214, + "grad_norm": 1.7555569410324097, + "learning_rate": 4.607834211001508e-05, + "loss": 4.4836, + "step": 30386 + }, + { + "epoch": 0.18072009705966316, + "grad_norm": 1.9138058423995972, + "learning_rate": 4.607809094558505e-05, + "loss": 4.5009, + "step": 30387 + }, + { + "epoch": 0.18072604434294415, + "grad_norm": 2.0391855239868164, + "learning_rate": 4.6077839773796874e-05, + "loss": 4.4596, + "step": 30388 + }, + { + "epoch": 0.18073199162622514, + "grad_norm": 2.037545680999756, + "learning_rate": 4.607758859465065e-05, + "loss": 4.5291, + "step": 30389 + }, + { + "epoch": 0.18073793890950612, + "grad_norm": 2.7652394771575928, + "learning_rate": 4.607733740814645e-05, + "loss": 4.2606, + "step": 30390 + }, + { + "epoch": 0.18074388619278714, + "grad_norm": 2.835252285003662, + "learning_rate": 4.607708621428438e-05, + "loss": 4.2674, + "step": 30391 + }, + { + "epoch": 0.18074983347606813, + "grad_norm": 2.889340400695801, + "learning_rate": 4.607683501306451e-05, + "loss": 4.3982, + "step": 30392 + }, + { + "epoch": 0.18075578075934912, + "grad_norm": 1.8587162494659424, + "learning_rate": 4.607658380448693e-05, + "loss": 4.1949, + "step": 30393 + }, + { + "epoch": 0.18076172804263013, + "grad_norm": 2.183932304382324, + "learning_rate": 4.607633258855174e-05, + "loss": 4.4338, + "step": 30394 + }, + { + "epoch": 0.18076767532591112, + "grad_norm": 1.8604317903518677, + "learning_rate": 4.607608136525902e-05, + "loss": 5.2043, + "step": 30395 + }, + { + "epoch": 0.1807736226091921, + "grad_norm": 1.7363629341125488, + "learning_rate": 4.607583013460885e-05, + "loss": 5.1606, + "step": 30396 + }, + { + "epoch": 0.18077956989247312, + "grad_norm": 1.6214736700057983, + "learning_rate": 4.607557889660133e-05, + "loss": 5.2732, + "step": 30397 + }, + { + "epoch": 0.1807855171757541, + "grad_norm": 1.7445697784423828, + "learning_rate": 4.607532765123654e-05, + "loss": 4.7826, + "step": 30398 + }, + { + "epoch": 0.1807914644590351, + "grad_norm": 2.053269147872925, + "learning_rate": 4.607507639851458e-05, + "loss": 4.5327, + "step": 30399 + }, + { + "epoch": 0.18079741174231612, + "grad_norm": 1.63230299949646, + "learning_rate": 4.607482513843552e-05, + "loss": 4.989, + "step": 30400 + }, + { + "epoch": 0.1808033590255971, + "grad_norm": 1.586403489112854, + "learning_rate": 4.607457387099946e-05, + "loss": 5.3342, + "step": 30401 + }, + { + "epoch": 0.1808093063088781, + "grad_norm": 1.43230140209198, + "learning_rate": 4.607432259620648e-05, + "loss": 5.377, + "step": 30402 + }, + { + "epoch": 0.1808152535921591, + "grad_norm": 2.190584182739258, + "learning_rate": 4.6074071314056676e-05, + "loss": 4.9366, + "step": 30403 + }, + { + "epoch": 0.1808212008754401, + "grad_norm": 1.6194654703140259, + "learning_rate": 4.607382002455013e-05, + "loss": 5.2639, + "step": 30404 + }, + { + "epoch": 0.18082714815872108, + "grad_norm": 1.615243911743164, + "learning_rate": 4.607356872768693e-05, + "loss": 4.6323, + "step": 30405 + }, + { + "epoch": 0.1808330954420021, + "grad_norm": 1.5417380332946777, + "learning_rate": 4.607331742346717e-05, + "loss": 4.8193, + "step": 30406 + }, + { + "epoch": 0.1808390427252831, + "grad_norm": 1.5013401508331299, + "learning_rate": 4.607306611189093e-05, + "loss": 5.1733, + "step": 30407 + }, + { + "epoch": 0.18084499000856408, + "grad_norm": 1.2872532606124878, + "learning_rate": 4.60728147929583e-05, + "loss": 5.1983, + "step": 30408 + }, + { + "epoch": 0.1808509372918451, + "grad_norm": 1.4880503416061401, + "learning_rate": 4.607256346666936e-05, + "loss": 5.4417, + "step": 30409 + }, + { + "epoch": 0.18085688457512608, + "grad_norm": 1.2395708560943604, + "learning_rate": 4.607231213302422e-05, + "loss": 5.5189, + "step": 30410 + }, + { + "epoch": 0.18086283185840707, + "grad_norm": 1.7053332328796387, + "learning_rate": 4.607206079202294e-05, + "loss": 5.3116, + "step": 30411 + }, + { + "epoch": 0.18086877914168809, + "grad_norm": 1.5006909370422363, + "learning_rate": 4.607180944366563e-05, + "loss": 5.6907, + "step": 30412 + }, + { + "epoch": 0.18087472642496907, + "grad_norm": 1.489794373512268, + "learning_rate": 4.6071558087952364e-05, + "loss": 5.5739, + "step": 30413 + }, + { + "epoch": 0.18088067370825006, + "grad_norm": 1.5303220748901367, + "learning_rate": 4.607130672488324e-05, + "loss": 5.3727, + "step": 30414 + }, + { + "epoch": 0.18088662099153108, + "grad_norm": 2.531562566757202, + "learning_rate": 4.6071055354458335e-05, + "loss": 4.2266, + "step": 30415 + }, + { + "epoch": 0.18089256827481207, + "grad_norm": 1.5819337368011475, + "learning_rate": 4.6070803976677744e-05, + "loss": 5.01, + "step": 30416 + }, + { + "epoch": 0.18089851555809305, + "grad_norm": 1.4588855504989624, + "learning_rate": 4.607055259154156e-05, + "loss": 5.0615, + "step": 30417 + }, + { + "epoch": 0.18090446284137407, + "grad_norm": 1.7806695699691772, + "learning_rate": 4.607030119904986e-05, + "loss": 5.1481, + "step": 30418 + }, + { + "epoch": 0.18091041012465506, + "grad_norm": 1.37575364112854, + "learning_rate": 4.607004979920273e-05, + "loss": 5.0087, + "step": 30419 + }, + { + "epoch": 0.18091635740793605, + "grad_norm": 1.6504050493240356, + "learning_rate": 4.606979839200027e-05, + "loss": 5.0311, + "step": 30420 + }, + { + "epoch": 0.18092230469121706, + "grad_norm": 1.484144687652588, + "learning_rate": 4.6069546977442556e-05, + "loss": 5.3201, + "step": 30421 + }, + { + "epoch": 0.18092825197449805, + "grad_norm": 1.762091040611267, + "learning_rate": 4.606929555552968e-05, + "loss": 5.1807, + "step": 30422 + }, + { + "epoch": 0.18093419925777904, + "grad_norm": 1.8154287338256836, + "learning_rate": 4.606904412626174e-05, + "loss": 4.8606, + "step": 30423 + }, + { + "epoch": 0.18094014654106005, + "grad_norm": 1.7479325532913208, + "learning_rate": 4.606879268963881e-05, + "loss": 4.8235, + "step": 30424 + }, + { + "epoch": 0.18094609382434104, + "grad_norm": 1.44249427318573, + "learning_rate": 4.6068541245660974e-05, + "loss": 4.7681, + "step": 30425 + }, + { + "epoch": 0.18095204110762203, + "grad_norm": 1.3895748853683472, + "learning_rate": 4.606828979432833e-05, + "loss": 5.1613, + "step": 30426 + }, + { + "epoch": 0.18095798839090305, + "grad_norm": 1.5282186269760132, + "learning_rate": 4.606803833564097e-05, + "loss": 4.8431, + "step": 30427 + }, + { + "epoch": 0.18096393567418403, + "grad_norm": 2.7380192279815674, + "learning_rate": 4.606778686959897e-05, + "loss": 4.7426, + "step": 30428 + }, + { + "epoch": 0.18096988295746502, + "grad_norm": 2.365036725997925, + "learning_rate": 4.6067535396202434e-05, + "loss": 4.9487, + "step": 30429 + }, + { + "epoch": 0.18097583024074604, + "grad_norm": 1.7427470684051514, + "learning_rate": 4.606728391545143e-05, + "loss": 4.8868, + "step": 30430 + }, + { + "epoch": 0.18098177752402703, + "grad_norm": 1.6613335609436035, + "learning_rate": 4.606703242734606e-05, + "loss": 5.0911, + "step": 30431 + }, + { + "epoch": 0.18098772480730801, + "grad_norm": 1.71418297290802, + "learning_rate": 4.60667809318864e-05, + "loss": 4.9135, + "step": 30432 + }, + { + "epoch": 0.18099367209058903, + "grad_norm": 1.4050582647323608, + "learning_rate": 4.6066529429072545e-05, + "loss": 5.1009, + "step": 30433 + }, + { + "epoch": 0.18099961937387002, + "grad_norm": 1.2690151929855347, + "learning_rate": 4.606627791890458e-05, + "loss": 5.5164, + "step": 30434 + }, + { + "epoch": 0.181005566657151, + "grad_norm": 1.6794445514678955, + "learning_rate": 4.60660264013826e-05, + "loss": 5.159, + "step": 30435 + }, + { + "epoch": 0.18101151394043202, + "grad_norm": 1.845813512802124, + "learning_rate": 4.606577487650669e-05, + "loss": 4.6055, + "step": 30436 + }, + { + "epoch": 0.181017461223713, + "grad_norm": 1.6325689554214478, + "learning_rate": 4.6065523344276925e-05, + "loss": 4.7565, + "step": 30437 + }, + { + "epoch": 0.181023408506994, + "grad_norm": 1.64036226272583, + "learning_rate": 4.6065271804693424e-05, + "loss": 5.1105, + "step": 30438 + }, + { + "epoch": 0.18102935579027502, + "grad_norm": 1.5065094232559204, + "learning_rate": 4.6065020257756234e-05, + "loss": 4.7116, + "step": 30439 + }, + { + "epoch": 0.181035303073556, + "grad_norm": 1.8012547492980957, + "learning_rate": 4.6064768703465476e-05, + "loss": 4.3993, + "step": 30440 + }, + { + "epoch": 0.181041250356837, + "grad_norm": 1.5189584493637085, + "learning_rate": 4.606451714182122e-05, + "loss": 4.6601, + "step": 30441 + }, + { + "epoch": 0.181047197640118, + "grad_norm": 1.7323181629180908, + "learning_rate": 4.606426557282356e-05, + "loss": 5.5846, + "step": 30442 + }, + { + "epoch": 0.181053144923399, + "grad_norm": 1.5709025859832764, + "learning_rate": 4.606401399647258e-05, + "loss": 5.1547, + "step": 30443 + }, + { + "epoch": 0.18105909220667998, + "grad_norm": 1.6060830354690552, + "learning_rate": 4.6063762412768365e-05, + "loss": 4.867, + "step": 30444 + }, + { + "epoch": 0.181065039489961, + "grad_norm": 1.4921566247940063, + "learning_rate": 4.606351082171102e-05, + "loss": 5.0458, + "step": 30445 + }, + { + "epoch": 0.181070986773242, + "grad_norm": 1.9008151292800903, + "learning_rate": 4.606325922330062e-05, + "loss": 4.9894, + "step": 30446 + }, + { + "epoch": 0.18107693405652298, + "grad_norm": 2.0366036891937256, + "learning_rate": 4.606300761753724e-05, + "loss": 4.6917, + "step": 30447 + }, + { + "epoch": 0.18108288133980396, + "grad_norm": 1.8549975156784058, + "learning_rate": 4.606275600442099e-05, + "loss": 4.9342, + "step": 30448 + }, + { + "epoch": 0.18108882862308498, + "grad_norm": 1.7794413566589355, + "learning_rate": 4.606250438395196e-05, + "loss": 4.6526, + "step": 30449 + }, + { + "epoch": 0.18109477590636597, + "grad_norm": 1.7541767358779907, + "learning_rate": 4.606225275613021e-05, + "loss": 4.7991, + "step": 30450 + }, + { + "epoch": 0.18110072318964696, + "grad_norm": 2.040306329727173, + "learning_rate": 4.6062001120955854e-05, + "loss": 4.1135, + "step": 30451 + }, + { + "epoch": 0.18110667047292797, + "grad_norm": 2.444293737411499, + "learning_rate": 4.606174947842897e-05, + "loss": 3.4574, + "step": 30452 + }, + { + "epoch": 0.18111261775620896, + "grad_norm": 2.5346062183380127, + "learning_rate": 4.606149782854964e-05, + "loss": 3.2278, + "step": 30453 + }, + { + "epoch": 0.18111856503948995, + "grad_norm": 2.1727371215820312, + "learning_rate": 4.6061246171317975e-05, + "loss": 3.6005, + "step": 30454 + }, + { + "epoch": 0.18112451232277096, + "grad_norm": 1.6244183778762817, + "learning_rate": 4.6060994506734034e-05, + "loss": 4.8594, + "step": 30455 + }, + { + "epoch": 0.18113045960605195, + "grad_norm": 1.6611864566802979, + "learning_rate": 4.606074283479792e-05, + "loss": 4.6606, + "step": 30456 + }, + { + "epoch": 0.18113640688933294, + "grad_norm": 1.9803105592727661, + "learning_rate": 4.606049115550972e-05, + "loss": 4.7584, + "step": 30457 + }, + { + "epoch": 0.18114235417261396, + "grad_norm": 2.047974109649658, + "learning_rate": 4.6060239468869514e-05, + "loss": 4.0147, + "step": 30458 + }, + { + "epoch": 0.18114830145589494, + "grad_norm": 2.57551908493042, + "learning_rate": 4.60599877748774e-05, + "loss": 3.3263, + "step": 30459 + }, + { + "epoch": 0.18115424873917593, + "grad_norm": 2.1633079051971436, + "learning_rate": 4.6059736073533465e-05, + "loss": 3.2757, + "step": 30460 + }, + { + "epoch": 0.18116019602245695, + "grad_norm": 3.3115196228027344, + "learning_rate": 4.605948436483779e-05, + "loss": 3.2996, + "step": 30461 + }, + { + "epoch": 0.18116614330573794, + "grad_norm": 2.717261791229248, + "learning_rate": 4.6059232648790465e-05, + "loss": 3.1929, + "step": 30462 + }, + { + "epoch": 0.18117209058901892, + "grad_norm": 2.1867258548736572, + "learning_rate": 4.6058980925391585e-05, + "loss": 3.3655, + "step": 30463 + }, + { + "epoch": 0.18117803787229994, + "grad_norm": 2.306809186935425, + "learning_rate": 4.6058729194641225e-05, + "loss": 2.9844, + "step": 30464 + }, + { + "epoch": 0.18118398515558093, + "grad_norm": 2.939728260040283, + "learning_rate": 4.6058477456539486e-05, + "loss": 3.158, + "step": 30465 + }, + { + "epoch": 0.18118993243886192, + "grad_norm": 2.996995687484741, + "learning_rate": 4.605822571108646e-05, + "loss": 3.141, + "step": 30466 + }, + { + "epoch": 0.18119587972214293, + "grad_norm": 2.5442357063293457, + "learning_rate": 4.6057973958282205e-05, + "loss": 3.2498, + "step": 30467 + }, + { + "epoch": 0.18120182700542392, + "grad_norm": 2.3496897220611572, + "learning_rate": 4.605772219812684e-05, + "loss": 3.537, + "step": 30468 + }, + { + "epoch": 0.1812077742887049, + "grad_norm": 1.6112096309661865, + "learning_rate": 4.605747043062044e-05, + "loss": 4.8052, + "step": 30469 + }, + { + "epoch": 0.18121372157198593, + "grad_norm": 2.8755533695220947, + "learning_rate": 4.605721865576309e-05, + "loss": 3.8164, + "step": 30470 + }, + { + "epoch": 0.1812196688552669, + "grad_norm": 2.406846046447754, + "learning_rate": 4.605696687355489e-05, + "loss": 3.4058, + "step": 30471 + }, + { + "epoch": 0.1812256161385479, + "grad_norm": 3.146632671356201, + "learning_rate": 4.605671508399592e-05, + "loss": 3.5037, + "step": 30472 + }, + { + "epoch": 0.18123156342182892, + "grad_norm": 2.710477828979492, + "learning_rate": 4.605646328708626e-05, + "loss": 3.1083, + "step": 30473 + }, + { + "epoch": 0.1812375107051099, + "grad_norm": 2.2567665576934814, + "learning_rate": 4.6056211482826e-05, + "loss": 3.2056, + "step": 30474 + }, + { + "epoch": 0.1812434579883909, + "grad_norm": 2.9403610229492188, + "learning_rate": 4.6055959671215256e-05, + "loss": 3.5021, + "step": 30475 + }, + { + "epoch": 0.1812494052716719, + "grad_norm": 2.386746406555176, + "learning_rate": 4.6055707852254085e-05, + "loss": 3.3324, + "step": 30476 + }, + { + "epoch": 0.1812553525549529, + "grad_norm": 1.872837781906128, + "learning_rate": 4.605545602594258e-05, + "loss": 4.0415, + "step": 30477 + }, + { + "epoch": 0.18126129983823389, + "grad_norm": 2.302643060684204, + "learning_rate": 4.605520419228084e-05, + "loss": 3.5739, + "step": 30478 + }, + { + "epoch": 0.1812672471215149, + "grad_norm": 1.8837559223175049, + "learning_rate": 4.6054952351268935e-05, + "loss": 3.8909, + "step": 30479 + }, + { + "epoch": 0.1812731944047959, + "grad_norm": 1.8574949502944946, + "learning_rate": 4.605470050290697e-05, + "loss": 4.6073, + "step": 30480 + }, + { + "epoch": 0.18127914168807688, + "grad_norm": 3.745434522628784, + "learning_rate": 4.605444864719503e-05, + "loss": 4.9296, + "step": 30481 + }, + { + "epoch": 0.1812850889713579, + "grad_norm": 2.209376573562622, + "learning_rate": 4.6054196784133195e-05, + "loss": 5.1083, + "step": 30482 + }, + { + "epoch": 0.18129103625463888, + "grad_norm": 1.746163249015808, + "learning_rate": 4.6053944913721555e-05, + "loss": 4.2706, + "step": 30483 + }, + { + "epoch": 0.18129698353791987, + "grad_norm": 2.2691433429718018, + "learning_rate": 4.6053693035960204e-05, + "loss": 3.8251, + "step": 30484 + }, + { + "epoch": 0.18130293082120089, + "grad_norm": 1.9895451068878174, + "learning_rate": 4.605344115084923e-05, + "loss": 3.8413, + "step": 30485 + }, + { + "epoch": 0.18130887810448187, + "grad_norm": 2.2342569828033447, + "learning_rate": 4.6053189258388706e-05, + "loss": 4.4328, + "step": 30486 + }, + { + "epoch": 0.18131482538776286, + "grad_norm": 1.7602850198745728, + "learning_rate": 4.605293735857874e-05, + "loss": 4.779, + "step": 30487 + }, + { + "epoch": 0.18132077267104388, + "grad_norm": 1.689023494720459, + "learning_rate": 4.6052685451419405e-05, + "loss": 4.7603, + "step": 30488 + }, + { + "epoch": 0.18132671995432487, + "grad_norm": 1.6477890014648438, + "learning_rate": 4.6052433536910804e-05, + "loss": 4.9194, + "step": 30489 + }, + { + "epoch": 0.18133266723760585, + "grad_norm": 1.879791021347046, + "learning_rate": 4.605218161505301e-05, + "loss": 4.9174, + "step": 30490 + }, + { + "epoch": 0.18133861452088687, + "grad_norm": 2.530984878540039, + "learning_rate": 4.605192968584612e-05, + "loss": 3.6623, + "step": 30491 + }, + { + "epoch": 0.18134456180416786, + "grad_norm": 2.555924415588379, + "learning_rate": 4.605167774929022e-05, + "loss": 3.5684, + "step": 30492 + }, + { + "epoch": 0.18135050908744885, + "grad_norm": 2.00748872756958, + "learning_rate": 4.6051425805385394e-05, + "loss": 4.3182, + "step": 30493 + }, + { + "epoch": 0.18135645637072986, + "grad_norm": 1.7455837726593018, + "learning_rate": 4.605117385413174e-05, + "loss": 5.2199, + "step": 30494 + }, + { + "epoch": 0.18136240365401085, + "grad_norm": 1.7002990245819092, + "learning_rate": 4.605092189552932e-05, + "loss": 4.8912, + "step": 30495 + }, + { + "epoch": 0.18136835093729184, + "grad_norm": 1.830411434173584, + "learning_rate": 4.605066992957825e-05, + "loss": 4.5212, + "step": 30496 + }, + { + "epoch": 0.18137429822057285, + "grad_norm": 1.7505379915237427, + "learning_rate": 4.605041795627861e-05, + "loss": 4.5038, + "step": 30497 + }, + { + "epoch": 0.18138024550385384, + "grad_norm": 1.3816022872924805, + "learning_rate": 4.605016597563049e-05, + "loss": 5.1461, + "step": 30498 + }, + { + "epoch": 0.18138619278713483, + "grad_norm": 1.1977434158325195, + "learning_rate": 4.6049913987633976e-05, + "loss": 5.2844, + "step": 30499 + }, + { + "epoch": 0.18139214007041585, + "grad_norm": 1.4711052179336548, + "learning_rate": 4.604966199228915e-05, + "loss": 5.1301, + "step": 30500 + }, + { + "epoch": 0.18139808735369684, + "grad_norm": 1.316135048866272, + "learning_rate": 4.6049409989596105e-05, + "loss": 5.2839, + "step": 30501 + }, + { + "epoch": 0.18140403463697782, + "grad_norm": 1.491049885749817, + "learning_rate": 4.6049157979554926e-05, + "loss": 5.3503, + "step": 30502 + }, + { + "epoch": 0.18140998192025884, + "grad_norm": 1.5653736591339111, + "learning_rate": 4.60489059621657e-05, + "loss": 5.2905, + "step": 30503 + }, + { + "epoch": 0.18141592920353983, + "grad_norm": 1.5193443298339844, + "learning_rate": 4.6048653937428523e-05, + "loss": 5.4668, + "step": 30504 + }, + { + "epoch": 0.18142187648682082, + "grad_norm": 1.5355736017227173, + "learning_rate": 4.604840190534349e-05, + "loss": 5.2235, + "step": 30505 + }, + { + "epoch": 0.1814278237701018, + "grad_norm": 1.6808356046676636, + "learning_rate": 4.604814986591066e-05, + "loss": 5.1193, + "step": 30506 + }, + { + "epoch": 0.18143377105338282, + "grad_norm": 1.5504355430603027, + "learning_rate": 4.6047897819130146e-05, + "loss": 5.4469, + "step": 30507 + }, + { + "epoch": 0.1814397183366638, + "grad_norm": 1.394782304763794, + "learning_rate": 4.604764576500202e-05, + "loss": 5.1401, + "step": 30508 + }, + { + "epoch": 0.1814456656199448, + "grad_norm": 1.9043993949890137, + "learning_rate": 4.6047393703526386e-05, + "loss": 4.1807, + "step": 30509 + }, + { + "epoch": 0.1814516129032258, + "grad_norm": 1.5536892414093018, + "learning_rate": 4.604714163470333e-05, + "loss": 4.8233, + "step": 30510 + }, + { + "epoch": 0.1814575601865068, + "grad_norm": 1.5314890146255493, + "learning_rate": 4.604688955853293e-05, + "loss": 4.8918, + "step": 30511 + }, + { + "epoch": 0.1814635074697878, + "grad_norm": 1.5154199600219727, + "learning_rate": 4.604663747501527e-05, + "loss": 5.2876, + "step": 30512 + }, + { + "epoch": 0.1814694547530688, + "grad_norm": 1.3783801794052124, + "learning_rate": 4.604638538415046e-05, + "loss": 5.437, + "step": 30513 + }, + { + "epoch": 0.1814754020363498, + "grad_norm": 1.850745677947998, + "learning_rate": 4.6046133285938567e-05, + "loss": 4.9178, + "step": 30514 + }, + { + "epoch": 0.18148134931963078, + "grad_norm": 1.5241893529891968, + "learning_rate": 4.604588118037968e-05, + "loss": 5.1389, + "step": 30515 + }, + { + "epoch": 0.1814872966029118, + "grad_norm": 1.4288957118988037, + "learning_rate": 4.60456290674739e-05, + "loss": 5.0731, + "step": 30516 + }, + { + "epoch": 0.18149324388619278, + "grad_norm": 1.7770181894302368, + "learning_rate": 4.604537694722131e-05, + "loss": 4.9011, + "step": 30517 + }, + { + "epoch": 0.18149919116947377, + "grad_norm": 1.6263269186019897, + "learning_rate": 4.6045124819621995e-05, + "loss": 4.5729, + "step": 30518 + }, + { + "epoch": 0.1815051384527548, + "grad_norm": 1.7641338109970093, + "learning_rate": 4.6044872684676044e-05, + "loss": 4.9114, + "step": 30519 + }, + { + "epoch": 0.18151108573603578, + "grad_norm": 2.29036283493042, + "learning_rate": 4.6044620542383546e-05, + "loss": 3.1598, + "step": 30520 + }, + { + "epoch": 0.18151703301931676, + "grad_norm": 3.0936734676361084, + "learning_rate": 4.604436839274459e-05, + "loss": 2.9494, + "step": 30521 + }, + { + "epoch": 0.18152298030259778, + "grad_norm": 2.300161838531494, + "learning_rate": 4.604411623575925e-05, + "loss": 3.103, + "step": 30522 + }, + { + "epoch": 0.18152892758587877, + "grad_norm": 2.440436601638794, + "learning_rate": 4.604386407142764e-05, + "loss": 2.7361, + "step": 30523 + }, + { + "epoch": 0.18153487486915976, + "grad_norm": 2.3842546939849854, + "learning_rate": 4.604361189974983e-05, + "loss": 3.3269, + "step": 30524 + }, + { + "epoch": 0.18154082215244077, + "grad_norm": 2.316323757171631, + "learning_rate": 4.6043359720725916e-05, + "loss": 2.4623, + "step": 30525 + }, + { + "epoch": 0.18154676943572176, + "grad_norm": 2.311478853225708, + "learning_rate": 4.604310753435598e-05, + "loss": 3.3375, + "step": 30526 + }, + { + "epoch": 0.18155271671900275, + "grad_norm": 2.571591854095459, + "learning_rate": 4.604285534064011e-05, + "loss": 3.1362, + "step": 30527 + }, + { + "epoch": 0.18155866400228377, + "grad_norm": 2.753108263015747, + "learning_rate": 4.60426031395784e-05, + "loss": 2.7714, + "step": 30528 + }, + { + "epoch": 0.18156461128556475, + "grad_norm": 2.680237054824829, + "learning_rate": 4.604235093117093e-05, + "loss": 2.7293, + "step": 30529 + }, + { + "epoch": 0.18157055856884574, + "grad_norm": 2.6374194622039795, + "learning_rate": 4.6042098715417795e-05, + "loss": 3.2162, + "step": 30530 + }, + { + "epoch": 0.18157650585212676, + "grad_norm": 2.288968563079834, + "learning_rate": 4.6041846492319086e-05, + "loss": 2.9725, + "step": 30531 + }, + { + "epoch": 0.18158245313540775, + "grad_norm": 2.3108694553375244, + "learning_rate": 4.604159426187488e-05, + "loss": 3.4549, + "step": 30532 + }, + { + "epoch": 0.18158840041868873, + "grad_norm": 2.3923144340515137, + "learning_rate": 4.604134202408528e-05, + "loss": 3.4278, + "step": 30533 + }, + { + "epoch": 0.18159434770196975, + "grad_norm": 2.669036626815796, + "learning_rate": 4.6041089778950355e-05, + "loss": 3.7157, + "step": 30534 + }, + { + "epoch": 0.18160029498525074, + "grad_norm": 2.038989782333374, + "learning_rate": 4.60408375264702e-05, + "loss": 4.6171, + "step": 30535 + }, + { + "epoch": 0.18160624226853173, + "grad_norm": 1.9777814149856567, + "learning_rate": 4.604058526664491e-05, + "loss": 5.1166, + "step": 30536 + }, + { + "epoch": 0.18161218955181274, + "grad_norm": 2.14339280128479, + "learning_rate": 4.604033299947457e-05, + "loss": 4.0872, + "step": 30537 + }, + { + "epoch": 0.18161813683509373, + "grad_norm": 2.5352818965911865, + "learning_rate": 4.604008072495927e-05, + "loss": 3.5657, + "step": 30538 + }, + { + "epoch": 0.18162408411837472, + "grad_norm": 2.4932284355163574, + "learning_rate": 4.603982844309909e-05, + "loss": 3.4923, + "step": 30539 + }, + { + "epoch": 0.18163003140165573, + "grad_norm": 2.817173719406128, + "learning_rate": 4.603957615389413e-05, + "loss": 3.612, + "step": 30540 + }, + { + "epoch": 0.18163597868493672, + "grad_norm": 2.3959133625030518, + "learning_rate": 4.603932385734446e-05, + "loss": 3.4037, + "step": 30541 + }, + { + "epoch": 0.1816419259682177, + "grad_norm": 2.288473129272461, + "learning_rate": 4.6039071553450194e-05, + "loss": 3.1999, + "step": 30542 + }, + { + "epoch": 0.18164787325149873, + "grad_norm": 2.2291407585144043, + "learning_rate": 4.60388192422114e-05, + "loss": 3.2781, + "step": 30543 + }, + { + "epoch": 0.18165382053477971, + "grad_norm": 2.4226462841033936, + "learning_rate": 4.603856692362817e-05, + "loss": 3.3288, + "step": 30544 + }, + { + "epoch": 0.1816597678180607, + "grad_norm": 2.264042377471924, + "learning_rate": 4.6038314597700594e-05, + "loss": 3.4528, + "step": 30545 + }, + { + "epoch": 0.18166571510134172, + "grad_norm": 2.625178813934326, + "learning_rate": 4.6038062264428756e-05, + "loss": 3.2663, + "step": 30546 + }, + { + "epoch": 0.1816716623846227, + "grad_norm": 2.498853921890259, + "learning_rate": 4.603780992381275e-05, + "loss": 3.3232, + "step": 30547 + }, + { + "epoch": 0.1816776096679037, + "grad_norm": 2.1288323402404785, + "learning_rate": 4.603755757585266e-05, + "loss": 3.3592, + "step": 30548 + }, + { + "epoch": 0.1816835569511847, + "grad_norm": 2.363189697265625, + "learning_rate": 4.603730522054858e-05, + "loss": 3.3607, + "step": 30549 + }, + { + "epoch": 0.1816895042344657, + "grad_norm": 2.465437889099121, + "learning_rate": 4.60370528579006e-05, + "loss": 3.2047, + "step": 30550 + }, + { + "epoch": 0.1816954515177467, + "grad_norm": 2.6008546352386475, + "learning_rate": 4.603680048790879e-05, + "loss": 3.3468, + "step": 30551 + }, + { + "epoch": 0.1817013988010277, + "grad_norm": 2.6666195392608643, + "learning_rate": 4.603654811057325e-05, + "loss": 3.7408, + "step": 30552 + }, + { + "epoch": 0.1817073460843087, + "grad_norm": 2.3587095737457275, + "learning_rate": 4.603629572589408e-05, + "loss": 3.5229, + "step": 30553 + }, + { + "epoch": 0.18171329336758968, + "grad_norm": 2.3080029487609863, + "learning_rate": 4.603604333387135e-05, + "loss": 3.0769, + "step": 30554 + }, + { + "epoch": 0.1817192406508707, + "grad_norm": 2.7178757190704346, + "learning_rate": 4.603579093450515e-05, + "loss": 3.2384, + "step": 30555 + }, + { + "epoch": 0.18172518793415168, + "grad_norm": 2.6380956172943115, + "learning_rate": 4.603553852779559e-05, + "loss": 3.0647, + "step": 30556 + }, + { + "epoch": 0.18173113521743267, + "grad_norm": 2.6807405948638916, + "learning_rate": 4.603528611374272e-05, + "loss": 3.5373, + "step": 30557 + }, + { + "epoch": 0.1817370825007137, + "grad_norm": 2.2781288623809814, + "learning_rate": 4.603503369234666e-05, + "loss": 3.18, + "step": 30558 + }, + { + "epoch": 0.18174302978399468, + "grad_norm": 2.6194839477539062, + "learning_rate": 4.6034781263607485e-05, + "loss": 3.2369, + "step": 30559 + }, + { + "epoch": 0.18174897706727566, + "grad_norm": 2.236381769180298, + "learning_rate": 4.603452882752528e-05, + "loss": 3.6477, + "step": 30560 + }, + { + "epoch": 0.18175492435055668, + "grad_norm": 3.2307355403900146, + "learning_rate": 4.603427638410014e-05, + "loss": 2.8661, + "step": 30561 + }, + { + "epoch": 0.18176087163383767, + "grad_norm": 3.1829538345336914, + "learning_rate": 4.603402393333216e-05, + "loss": 3.5902, + "step": 30562 + }, + { + "epoch": 0.18176681891711866, + "grad_norm": 3.2353084087371826, + "learning_rate": 4.603377147522141e-05, + "loss": 3.4499, + "step": 30563 + }, + { + "epoch": 0.18177276620039964, + "grad_norm": 2.7337300777435303, + "learning_rate": 4.6033519009767995e-05, + "loss": 3.0508, + "step": 30564 + }, + { + "epoch": 0.18177871348368066, + "grad_norm": 2.4610583782196045, + "learning_rate": 4.603326653697199e-05, + "loss": 3.372, + "step": 30565 + }, + { + "epoch": 0.18178466076696165, + "grad_norm": 1.5927339792251587, + "learning_rate": 4.603301405683349e-05, + "loss": 5.1742, + "step": 30566 + }, + { + "epoch": 0.18179060805024264, + "grad_norm": 2.8343615531921387, + "learning_rate": 4.6032761569352587e-05, + "loss": 2.8788, + "step": 30567 + }, + { + "epoch": 0.18179655533352365, + "grad_norm": 2.8158621788024902, + "learning_rate": 4.603250907452936e-05, + "loss": 2.9255, + "step": 30568 + }, + { + "epoch": 0.18180250261680464, + "grad_norm": 2.777045488357544, + "learning_rate": 4.60322565723639e-05, + "loss": 3.2428, + "step": 30569 + }, + { + "epoch": 0.18180844990008563, + "grad_norm": 2.668269157409668, + "learning_rate": 4.60320040628563e-05, + "loss": 3.0804, + "step": 30570 + }, + { + "epoch": 0.18181439718336664, + "grad_norm": 2.453457832336426, + "learning_rate": 4.603175154600664e-05, + "loss": 3.0223, + "step": 30571 + }, + { + "epoch": 0.18182034446664763, + "grad_norm": 1.9281212091445923, + "learning_rate": 4.6031499021815014e-05, + "loss": 3.4469, + "step": 30572 + }, + { + "epoch": 0.18182629174992862, + "grad_norm": 1.8291780948638916, + "learning_rate": 4.603124649028152e-05, + "loss": 4.7501, + "step": 30573 + }, + { + "epoch": 0.18183223903320964, + "grad_norm": 1.518445372581482, + "learning_rate": 4.603099395140622e-05, + "loss": 4.9895, + "step": 30574 + }, + { + "epoch": 0.18183818631649062, + "grad_norm": 1.562727928161621, + "learning_rate": 4.603074140518923e-05, + "loss": 5.0106, + "step": 30575 + }, + { + "epoch": 0.1818441335997716, + "grad_norm": 2.01888370513916, + "learning_rate": 4.6030488851630615e-05, + "loss": 4.8952, + "step": 30576 + }, + { + "epoch": 0.18185008088305263, + "grad_norm": 1.2194279432296753, + "learning_rate": 4.6030236290730476e-05, + "loss": 5.1772, + "step": 30577 + }, + { + "epoch": 0.18185602816633362, + "grad_norm": 2.1817402839660645, + "learning_rate": 4.60299837224889e-05, + "loss": 3.7245, + "step": 30578 + }, + { + "epoch": 0.1818619754496146, + "grad_norm": 1.5736979246139526, + "learning_rate": 4.6029731146905975e-05, + "loss": 5.4276, + "step": 30579 + }, + { + "epoch": 0.18186792273289562, + "grad_norm": 1.9954670667648315, + "learning_rate": 4.602947856398179e-05, + "loss": 4.3673, + "step": 30580 + }, + { + "epoch": 0.1818738700161766, + "grad_norm": 1.5366657972335815, + "learning_rate": 4.6029225973716426e-05, + "loss": 4.8274, + "step": 30581 + }, + { + "epoch": 0.1818798172994576, + "grad_norm": 1.5931968688964844, + "learning_rate": 4.602897337610998e-05, + "loss": 5.2114, + "step": 30582 + }, + { + "epoch": 0.1818857645827386, + "grad_norm": 1.6159030199050903, + "learning_rate": 4.6028720771162536e-05, + "loss": 4.5858, + "step": 30583 + }, + { + "epoch": 0.1818917118660196, + "grad_norm": 1.531935214996338, + "learning_rate": 4.602846815887418e-05, + "loss": 5.4222, + "step": 30584 + }, + { + "epoch": 0.1818976591493006, + "grad_norm": 1.7498992681503296, + "learning_rate": 4.6028215539245015e-05, + "loss": 5.406, + "step": 30585 + }, + { + "epoch": 0.1819036064325816, + "grad_norm": 1.5374906063079834, + "learning_rate": 4.60279629122751e-05, + "loss": 5.2197, + "step": 30586 + }, + { + "epoch": 0.1819095537158626, + "grad_norm": 1.4167890548706055, + "learning_rate": 4.6027710277964555e-05, + "loss": 5.5045, + "step": 30587 + }, + { + "epoch": 0.18191550099914358, + "grad_norm": 1.7180233001708984, + "learning_rate": 4.6027457636313446e-05, + "loss": 5.1006, + "step": 30588 + }, + { + "epoch": 0.1819214482824246, + "grad_norm": 1.6115717887878418, + "learning_rate": 4.602720498732187e-05, + "loss": 4.985, + "step": 30589 + }, + { + "epoch": 0.18192739556570559, + "grad_norm": 2.0676872730255127, + "learning_rate": 4.602695233098991e-05, + "loss": 4.838, + "step": 30590 + }, + { + "epoch": 0.18193334284898657, + "grad_norm": 1.924194574356079, + "learning_rate": 4.6026699667317663e-05, + "loss": 4.8063, + "step": 30591 + }, + { + "epoch": 0.1819392901322676, + "grad_norm": 2.717851400375366, + "learning_rate": 4.602644699630521e-05, + "loss": 4.4838, + "step": 30592 + }, + { + "epoch": 0.18194523741554858, + "grad_norm": 1.5828056335449219, + "learning_rate": 4.602619431795264e-05, + "loss": 5.6776, + "step": 30593 + }, + { + "epoch": 0.18195118469882957, + "grad_norm": 2.8755221366882324, + "learning_rate": 4.602594163226005e-05, + "loss": 5.0374, + "step": 30594 + }, + { + "epoch": 0.18195713198211058, + "grad_norm": 1.8692079782485962, + "learning_rate": 4.602568893922752e-05, + "loss": 4.5605, + "step": 30595 + }, + { + "epoch": 0.18196307926539157, + "grad_norm": 1.3632681369781494, + "learning_rate": 4.602543623885513e-05, + "loss": 5.179, + "step": 30596 + }, + { + "epoch": 0.18196902654867256, + "grad_norm": 1.5239547491073608, + "learning_rate": 4.602518353114298e-05, + "loss": 5.0395, + "step": 30597 + }, + { + "epoch": 0.18197497383195357, + "grad_norm": 1.5662137269973755, + "learning_rate": 4.602493081609116e-05, + "loss": 5.0049, + "step": 30598 + }, + { + "epoch": 0.18198092111523456, + "grad_norm": 1.5579825639724731, + "learning_rate": 4.602467809369976e-05, + "loss": 5.1001, + "step": 30599 + }, + { + "epoch": 0.18198686839851555, + "grad_norm": 1.6686931848526, + "learning_rate": 4.6024425363968846e-05, + "loss": 5.1319, + "step": 30600 + }, + { + "epoch": 0.18199281568179657, + "grad_norm": 1.5801063776016235, + "learning_rate": 4.602417262689853e-05, + "loss": 4.971, + "step": 30601 + }, + { + "epoch": 0.18199876296507755, + "grad_norm": 1.508872389793396, + "learning_rate": 4.6023919882488896e-05, + "loss": 4.9048, + "step": 30602 + }, + { + "epoch": 0.18200471024835854, + "grad_norm": 1.772307276725769, + "learning_rate": 4.602366713074003e-05, + "loss": 4.811, + "step": 30603 + }, + { + "epoch": 0.18201065753163956, + "grad_norm": 1.7669419050216675, + "learning_rate": 4.602341437165202e-05, + "loss": 5.1116, + "step": 30604 + }, + { + "epoch": 0.18201660481492055, + "grad_norm": 1.6603509187698364, + "learning_rate": 4.602316160522494e-05, + "loss": 5.2131, + "step": 30605 + }, + { + "epoch": 0.18202255209820153, + "grad_norm": 1.71107816696167, + "learning_rate": 4.60229088314589e-05, + "loss": 4.8201, + "step": 30606 + }, + { + "epoch": 0.18202849938148255, + "grad_norm": 1.6192432641983032, + "learning_rate": 4.602265605035398e-05, + "loss": 5.3336, + "step": 30607 + }, + { + "epoch": 0.18203444666476354, + "grad_norm": 1.3941278457641602, + "learning_rate": 4.602240326191027e-05, + "loss": 5.2017, + "step": 30608 + }, + { + "epoch": 0.18204039394804453, + "grad_norm": 1.7096537351608276, + "learning_rate": 4.602215046612785e-05, + "loss": 4.6411, + "step": 30609 + }, + { + "epoch": 0.18204634123132554, + "grad_norm": 1.819649338722229, + "learning_rate": 4.6021897663006826e-05, + "loss": 4.5691, + "step": 30610 + }, + { + "epoch": 0.18205228851460653, + "grad_norm": 1.976924180984497, + "learning_rate": 4.602164485254726e-05, + "loss": 4.6638, + "step": 30611 + }, + { + "epoch": 0.18205823579788752, + "grad_norm": 1.6236119270324707, + "learning_rate": 4.602139203474927e-05, + "loss": 4.2149, + "step": 30612 + }, + { + "epoch": 0.18206418308116853, + "grad_norm": 1.688239336013794, + "learning_rate": 4.602113920961292e-05, + "loss": 4.2815, + "step": 30613 + }, + { + "epoch": 0.18207013036444952, + "grad_norm": 1.854436993598938, + "learning_rate": 4.60208863771383e-05, + "loss": 4.3709, + "step": 30614 + }, + { + "epoch": 0.1820760776477305, + "grad_norm": 1.7107741832733154, + "learning_rate": 4.6020633537325516e-05, + "loss": 4.6272, + "step": 30615 + }, + { + "epoch": 0.18208202493101153, + "grad_norm": 1.785346508026123, + "learning_rate": 4.6020380690174645e-05, + "loss": 5.2134, + "step": 30616 + }, + { + "epoch": 0.18208797221429252, + "grad_norm": 1.5961878299713135, + "learning_rate": 4.602012783568578e-05, + "loss": 4.8977, + "step": 30617 + }, + { + "epoch": 0.1820939194975735, + "grad_norm": 1.711595892906189, + "learning_rate": 4.6019874973859e-05, + "loss": 3.9303, + "step": 30618 + }, + { + "epoch": 0.18209986678085452, + "grad_norm": 1.432024598121643, + "learning_rate": 4.6019622104694406e-05, + "loss": 4.1765, + "step": 30619 + }, + { + "epoch": 0.1821058140641355, + "grad_norm": 1.624489188194275, + "learning_rate": 4.601936922819207e-05, + "loss": 4.8633, + "step": 30620 + }, + { + "epoch": 0.1821117613474165, + "grad_norm": 1.4783191680908203, + "learning_rate": 4.6019116344352095e-05, + "loss": 4.8363, + "step": 30621 + }, + { + "epoch": 0.1821177086306975, + "grad_norm": 1.564587950706482, + "learning_rate": 4.601886345317456e-05, + "loss": 4.8749, + "step": 30622 + }, + { + "epoch": 0.1821236559139785, + "grad_norm": 1.7457023859024048, + "learning_rate": 4.601861055465956e-05, + "loss": 4.724, + "step": 30623 + }, + { + "epoch": 0.1821296031972595, + "grad_norm": 1.6358530521392822, + "learning_rate": 4.6018357648807175e-05, + "loss": 4.8055, + "step": 30624 + }, + { + "epoch": 0.18213555048054048, + "grad_norm": 1.67806875705719, + "learning_rate": 4.601810473561751e-05, + "loss": 4.7913, + "step": 30625 + }, + { + "epoch": 0.1821414977638215, + "grad_norm": 1.7741279602050781, + "learning_rate": 4.601785181509063e-05, + "loss": 4.056, + "step": 30626 + }, + { + "epoch": 0.18214744504710248, + "grad_norm": 1.6061371564865112, + "learning_rate": 4.601759888722663e-05, + "loss": 4.0635, + "step": 30627 + }, + { + "epoch": 0.18215339233038347, + "grad_norm": 1.329079508781433, + "learning_rate": 4.6017345952025616e-05, + "loss": 5.2524, + "step": 30628 + }, + { + "epoch": 0.18215933961366448, + "grad_norm": 1.6871402263641357, + "learning_rate": 4.601709300948767e-05, + "loss": 4.5095, + "step": 30629 + }, + { + "epoch": 0.18216528689694547, + "grad_norm": 1.7423584461212158, + "learning_rate": 4.6016840059612856e-05, + "loss": 4.2299, + "step": 30630 + }, + { + "epoch": 0.18217123418022646, + "grad_norm": 1.7102059125900269, + "learning_rate": 4.601658710240129e-05, + "loss": 4.3972, + "step": 30631 + }, + { + "epoch": 0.18217718146350748, + "grad_norm": 1.7572731971740723, + "learning_rate": 4.601633413785305e-05, + "loss": 5.0521, + "step": 30632 + }, + { + "epoch": 0.18218312874678846, + "grad_norm": 1.632642388343811, + "learning_rate": 4.6016081165968215e-05, + "loss": 4.9921, + "step": 30633 + }, + { + "epoch": 0.18218907603006945, + "grad_norm": 1.7840354442596436, + "learning_rate": 4.6015828186746896e-05, + "loss": 4.9606, + "step": 30634 + }, + { + "epoch": 0.18219502331335047, + "grad_norm": 1.7963460683822632, + "learning_rate": 4.601557520018917e-05, + "loss": 4.633, + "step": 30635 + }, + { + "epoch": 0.18220097059663146, + "grad_norm": 1.5674350261688232, + "learning_rate": 4.601532220629511e-05, + "loss": 5.1909, + "step": 30636 + }, + { + "epoch": 0.18220691787991244, + "grad_norm": 1.7398990392684937, + "learning_rate": 4.6015069205064835e-05, + "loss": 5.0319, + "step": 30637 + }, + { + "epoch": 0.18221286516319346, + "grad_norm": 1.474489450454712, + "learning_rate": 4.60148161964984e-05, + "loss": 5.0893, + "step": 30638 + }, + { + "epoch": 0.18221881244647445, + "grad_norm": 1.4791532754898071, + "learning_rate": 4.601456318059592e-05, + "loss": 4.6663, + "step": 30639 + }, + { + "epoch": 0.18222475972975544, + "grad_norm": 1.7666285037994385, + "learning_rate": 4.601431015735747e-05, + "loss": 4.297, + "step": 30640 + }, + { + "epoch": 0.18223070701303645, + "grad_norm": 1.7343413829803467, + "learning_rate": 4.601405712678314e-05, + "loss": 4.6023, + "step": 30641 + }, + { + "epoch": 0.18223665429631744, + "grad_norm": 1.87008798122406, + "learning_rate": 4.601380408887302e-05, + "loss": 5.0135, + "step": 30642 + }, + { + "epoch": 0.18224260157959843, + "grad_norm": 1.5589100122451782, + "learning_rate": 4.60135510436272e-05, + "loss": 4.973, + "step": 30643 + }, + { + "epoch": 0.18224854886287944, + "grad_norm": 1.6267797946929932, + "learning_rate": 4.601329799104577e-05, + "loss": 4.9089, + "step": 30644 + }, + { + "epoch": 0.18225449614616043, + "grad_norm": 1.3924577236175537, + "learning_rate": 4.601304493112881e-05, + "loss": 4.8534, + "step": 30645 + }, + { + "epoch": 0.18226044342944142, + "grad_norm": 1.6482142210006714, + "learning_rate": 4.601279186387642e-05, + "loss": 4.8919, + "step": 30646 + }, + { + "epoch": 0.18226639071272244, + "grad_norm": 1.5615832805633545, + "learning_rate": 4.6012538789288676e-05, + "loss": 4.9114, + "step": 30647 + }, + { + "epoch": 0.18227233799600343, + "grad_norm": 1.4806512594223022, + "learning_rate": 4.601228570736566e-05, + "loss": 4.8957, + "step": 30648 + }, + { + "epoch": 0.1822782852792844, + "grad_norm": 1.3537266254425049, + "learning_rate": 4.6012032618107494e-05, + "loss": 4.8277, + "step": 30649 + }, + { + "epoch": 0.18228423256256543, + "grad_norm": 1.5608755350112915, + "learning_rate": 4.601177952151423e-05, + "loss": 4.9707, + "step": 30650 + }, + { + "epoch": 0.18229017984584642, + "grad_norm": 1.6153634786605835, + "learning_rate": 4.601152641758597e-05, + "loss": 5.1415, + "step": 30651 + }, + { + "epoch": 0.1822961271291274, + "grad_norm": 1.7191613912582397, + "learning_rate": 4.601127330632281e-05, + "loss": 5.1045, + "step": 30652 + }, + { + "epoch": 0.18230207441240842, + "grad_norm": 1.668485164642334, + "learning_rate": 4.601102018772483e-05, + "loss": 5.1807, + "step": 30653 + }, + { + "epoch": 0.1823080216956894, + "grad_norm": 1.7589253187179565, + "learning_rate": 4.601076706179212e-05, + "loss": 4.9829, + "step": 30654 + }, + { + "epoch": 0.1823139689789704, + "grad_norm": 1.5183218717575073, + "learning_rate": 4.6010513928524766e-05, + "loss": 4.9434, + "step": 30655 + }, + { + "epoch": 0.1823199162622514, + "grad_norm": 1.5674960613250732, + "learning_rate": 4.601026078792287e-05, + "loss": 4.8959, + "step": 30656 + }, + { + "epoch": 0.1823258635455324, + "grad_norm": 1.8212403059005737, + "learning_rate": 4.60100076399865e-05, + "loss": 4.2204, + "step": 30657 + }, + { + "epoch": 0.1823318108288134, + "grad_norm": 1.7452092170715332, + "learning_rate": 4.600975448471575e-05, + "loss": 4.7962, + "step": 30658 + }, + { + "epoch": 0.1823377581120944, + "grad_norm": 1.5074201822280884, + "learning_rate": 4.6009501322110716e-05, + "loss": 5.202, + "step": 30659 + }, + { + "epoch": 0.1823437053953754, + "grad_norm": 1.5057684183120728, + "learning_rate": 4.600924815217147e-05, + "loss": 5.3574, + "step": 30660 + }, + { + "epoch": 0.18234965267865638, + "grad_norm": 1.5492216348648071, + "learning_rate": 4.600899497489813e-05, + "loss": 5.2738, + "step": 30661 + }, + { + "epoch": 0.1823555999619374, + "grad_norm": 1.65701425075531, + "learning_rate": 4.600874179029076e-05, + "loss": 5.2794, + "step": 30662 + }, + { + "epoch": 0.18236154724521839, + "grad_norm": 2.055145740509033, + "learning_rate": 4.600848859834945e-05, + "loss": 5.1107, + "step": 30663 + }, + { + "epoch": 0.18236749452849937, + "grad_norm": 2.13283634185791, + "learning_rate": 4.600823539907431e-05, + "loss": 4.5388, + "step": 30664 + }, + { + "epoch": 0.1823734418117804, + "grad_norm": 1.6410765647888184, + "learning_rate": 4.6007982192465396e-05, + "loss": 5.5528, + "step": 30665 + }, + { + "epoch": 0.18237938909506138, + "grad_norm": 1.7154231071472168, + "learning_rate": 4.600772897852282e-05, + "loss": 5.2923, + "step": 30666 + }, + { + "epoch": 0.18238533637834237, + "grad_norm": 1.7217670679092407, + "learning_rate": 4.6007475757246656e-05, + "loss": 5.0955, + "step": 30667 + }, + { + "epoch": 0.18239128366162338, + "grad_norm": 1.7309542894363403, + "learning_rate": 4.6007222528637005e-05, + "loss": 5.4714, + "step": 30668 + }, + { + "epoch": 0.18239723094490437, + "grad_norm": 2.1107077598571777, + "learning_rate": 4.6006969292693946e-05, + "loss": 4.1897, + "step": 30669 + }, + { + "epoch": 0.18240317822818536, + "grad_norm": 1.6427122354507446, + "learning_rate": 4.6006716049417574e-05, + "loss": 5.4529, + "step": 30670 + }, + { + "epoch": 0.18240912551146637, + "grad_norm": 1.698148488998413, + "learning_rate": 4.600646279880798e-05, + "loss": 4.844, + "step": 30671 + }, + { + "epoch": 0.18241507279474736, + "grad_norm": 2.373337984085083, + "learning_rate": 4.6006209540865236e-05, + "loss": 4.3026, + "step": 30672 + }, + { + "epoch": 0.18242102007802835, + "grad_norm": 2.3324615955352783, + "learning_rate": 4.6005956275589446e-05, + "loss": 4.1663, + "step": 30673 + }, + { + "epoch": 0.18242696736130937, + "grad_norm": 2.296529531478882, + "learning_rate": 4.6005703002980696e-05, + "loss": 4.3019, + "step": 30674 + }, + { + "epoch": 0.18243291464459035, + "grad_norm": 2.13657808303833, + "learning_rate": 4.6005449723039066e-05, + "loss": 3.9219, + "step": 30675 + }, + { + "epoch": 0.18243886192787134, + "grad_norm": 1.3434631824493408, + "learning_rate": 4.600519643576466e-05, + "loss": 5.5071, + "step": 30676 + }, + { + "epoch": 0.18244480921115236, + "grad_norm": 1.3897916078567505, + "learning_rate": 4.6004943141157554e-05, + "loss": 5.8751, + "step": 30677 + }, + { + "epoch": 0.18245075649443335, + "grad_norm": 1.660503625869751, + "learning_rate": 4.600468983921783e-05, + "loss": 5.3946, + "step": 30678 + }, + { + "epoch": 0.18245670377771434, + "grad_norm": 1.4913995265960693, + "learning_rate": 4.6004436529945596e-05, + "loss": 5.3409, + "step": 30679 + }, + { + "epoch": 0.18246265106099535, + "grad_norm": 1.590925693511963, + "learning_rate": 4.6004183213340924e-05, + "loss": 5.2041, + "step": 30680 + }, + { + "epoch": 0.18246859834427634, + "grad_norm": 1.5279881954193115, + "learning_rate": 4.6003929889403915e-05, + "loss": 4.8881, + "step": 30681 + }, + { + "epoch": 0.18247454562755733, + "grad_norm": 1.558207392692566, + "learning_rate": 4.600367655813464e-05, + "loss": 5.0951, + "step": 30682 + }, + { + "epoch": 0.18248049291083832, + "grad_norm": 1.4454327821731567, + "learning_rate": 4.600342321953322e-05, + "loss": 5.1351, + "step": 30683 + }, + { + "epoch": 0.18248644019411933, + "grad_norm": 1.6854497194290161, + "learning_rate": 4.60031698735997e-05, + "loss": 5.1165, + "step": 30684 + }, + { + "epoch": 0.18249238747740032, + "grad_norm": 2.277977466583252, + "learning_rate": 4.600291652033419e-05, + "loss": 5.2921, + "step": 30685 + }, + { + "epoch": 0.1824983347606813, + "grad_norm": 1.7434666156768799, + "learning_rate": 4.600266315973679e-05, + "loss": 5.1459, + "step": 30686 + }, + { + "epoch": 0.18250428204396232, + "grad_norm": 1.8711892366409302, + "learning_rate": 4.6002409791807575e-05, + "loss": 5.0815, + "step": 30687 + }, + { + "epoch": 0.1825102293272433, + "grad_norm": 1.7337292432785034, + "learning_rate": 4.600215641654664e-05, + "loss": 5.5504, + "step": 30688 + }, + { + "epoch": 0.1825161766105243, + "grad_norm": 1.599770188331604, + "learning_rate": 4.600190303395407e-05, + "loss": 5.4996, + "step": 30689 + }, + { + "epoch": 0.18252212389380532, + "grad_norm": 1.6465502977371216, + "learning_rate": 4.6001649644029945e-05, + "loss": 4.6497, + "step": 30690 + }, + { + "epoch": 0.1825280711770863, + "grad_norm": 1.3220854997634888, + "learning_rate": 4.600139624677436e-05, + "loss": 5.5304, + "step": 30691 + }, + { + "epoch": 0.1825340184603673, + "grad_norm": 1.4749271869659424, + "learning_rate": 4.600114284218741e-05, + "loss": 5.3472, + "step": 30692 + }, + { + "epoch": 0.1825399657436483, + "grad_norm": 1.3068197965621948, + "learning_rate": 4.6000889430269175e-05, + "loss": 5.4445, + "step": 30693 + }, + { + "epoch": 0.1825459130269293, + "grad_norm": 1.3629871606826782, + "learning_rate": 4.600063601101974e-05, + "loss": 5.3633, + "step": 30694 + }, + { + "epoch": 0.18255186031021028, + "grad_norm": 1.565169334411621, + "learning_rate": 4.600038258443921e-05, + "loss": 5.3485, + "step": 30695 + }, + { + "epoch": 0.1825578075934913, + "grad_norm": 1.4095406532287598, + "learning_rate": 4.600012915052766e-05, + "loss": 5.3252, + "step": 30696 + }, + { + "epoch": 0.1825637548767723, + "grad_norm": 1.3348292112350464, + "learning_rate": 4.599987570928518e-05, + "loss": 5.2613, + "step": 30697 + }, + { + "epoch": 0.18256970216005328, + "grad_norm": 1.436431646347046, + "learning_rate": 4.599962226071187e-05, + "loss": 5.2325, + "step": 30698 + }, + { + "epoch": 0.1825756494433343, + "grad_norm": 1.4782795906066895, + "learning_rate": 4.59993688048078e-05, + "loss": 5.2674, + "step": 30699 + }, + { + "epoch": 0.18258159672661528, + "grad_norm": 1.8673319816589355, + "learning_rate": 4.599911534157306e-05, + "loss": 4.9126, + "step": 30700 + }, + { + "epoch": 0.18258754400989627, + "grad_norm": 1.695785403251648, + "learning_rate": 4.599886187100776e-05, + "loss": 5.2728, + "step": 30701 + }, + { + "epoch": 0.18259349129317728, + "grad_norm": 1.6430630683898926, + "learning_rate": 4.599860839311197e-05, + "loss": 5.1289, + "step": 30702 + }, + { + "epoch": 0.18259943857645827, + "grad_norm": 1.4497203826904297, + "learning_rate": 4.599835490788578e-05, + "loss": 5.1118, + "step": 30703 + }, + { + "epoch": 0.18260538585973926, + "grad_norm": 1.6501460075378418, + "learning_rate": 4.599810141532929e-05, + "loss": 5.3177, + "step": 30704 + }, + { + "epoch": 0.18261133314302028, + "grad_norm": 1.6418206691741943, + "learning_rate": 4.5997847915442564e-05, + "loss": 5.3195, + "step": 30705 + }, + { + "epoch": 0.18261728042630127, + "grad_norm": 2.704659938812256, + "learning_rate": 4.599759440822572e-05, + "loss": 4.5861, + "step": 30706 + }, + { + "epoch": 0.18262322770958225, + "grad_norm": 2.0303256511688232, + "learning_rate": 4.5997340893678825e-05, + "loss": 4.7766, + "step": 30707 + }, + { + "epoch": 0.18262917499286327, + "grad_norm": 1.7311177253723145, + "learning_rate": 4.599708737180198e-05, + "loss": 5.2158, + "step": 30708 + }, + { + "epoch": 0.18263512227614426, + "grad_norm": 1.7286479473114014, + "learning_rate": 4.5996833842595264e-05, + "loss": 5.4676, + "step": 30709 + }, + { + "epoch": 0.18264106955942525, + "grad_norm": 1.5010279417037964, + "learning_rate": 4.599658030605877e-05, + "loss": 5.3087, + "step": 30710 + }, + { + "epoch": 0.18264701684270626, + "grad_norm": 1.9836444854736328, + "learning_rate": 4.5996326762192585e-05, + "loss": 5.1255, + "step": 30711 + }, + { + "epoch": 0.18265296412598725, + "grad_norm": 1.8559132814407349, + "learning_rate": 4.599607321099681e-05, + "loss": 5.5439, + "step": 30712 + }, + { + "epoch": 0.18265891140926824, + "grad_norm": 2.781334400177002, + "learning_rate": 4.5995819652471515e-05, + "loss": 4.236, + "step": 30713 + }, + { + "epoch": 0.18266485869254925, + "grad_norm": 1.7039425373077393, + "learning_rate": 4.59955660866168e-05, + "loss": 5.2539, + "step": 30714 + }, + { + "epoch": 0.18267080597583024, + "grad_norm": 1.7565476894378662, + "learning_rate": 4.5995312513432744e-05, + "loss": 5.3456, + "step": 30715 + }, + { + "epoch": 0.18267675325911123, + "grad_norm": 1.8682184219360352, + "learning_rate": 4.599505893291945e-05, + "loss": 5.1608, + "step": 30716 + }, + { + "epoch": 0.18268270054239225, + "grad_norm": 1.6879570484161377, + "learning_rate": 4.599480534507699e-05, + "loss": 5.2518, + "step": 30717 + }, + { + "epoch": 0.18268864782567323, + "grad_norm": 1.6643418073654175, + "learning_rate": 4.599455174990546e-05, + "loss": 5.2634, + "step": 30718 + }, + { + "epoch": 0.18269459510895422, + "grad_norm": 1.6813920736312866, + "learning_rate": 4.599429814740496e-05, + "loss": 5.1519, + "step": 30719 + }, + { + "epoch": 0.18270054239223524, + "grad_norm": 1.4897735118865967, + "learning_rate": 4.599404453757555e-05, + "loss": 5.0287, + "step": 30720 + }, + { + "epoch": 0.18270648967551623, + "grad_norm": 1.6526249647140503, + "learning_rate": 4.599379092041735e-05, + "loss": 5.2445, + "step": 30721 + }, + { + "epoch": 0.18271243695879721, + "grad_norm": 1.635257363319397, + "learning_rate": 4.599353729593043e-05, + "loss": 4.9676, + "step": 30722 + }, + { + "epoch": 0.18271838424207823, + "grad_norm": 2.6660733222961426, + "learning_rate": 4.599328366411488e-05, + "loss": 3.8513, + "step": 30723 + }, + { + "epoch": 0.18272433152535922, + "grad_norm": 2.4976534843444824, + "learning_rate": 4.5993030024970796e-05, + "loss": 3.8516, + "step": 30724 + }, + { + "epoch": 0.1827302788086402, + "grad_norm": 1.936405062675476, + "learning_rate": 4.599277637849826e-05, + "loss": 4.5016, + "step": 30725 + }, + { + "epoch": 0.18273622609192122, + "grad_norm": 1.5889533758163452, + "learning_rate": 4.5992522724697365e-05, + "loss": 5.095, + "step": 30726 + }, + { + "epoch": 0.1827421733752022, + "grad_norm": 1.8294072151184082, + "learning_rate": 4.5992269063568195e-05, + "loss": 5.1108, + "step": 30727 + }, + { + "epoch": 0.1827481206584832, + "grad_norm": 1.8671683073043823, + "learning_rate": 4.5992015395110835e-05, + "loss": 4.9816, + "step": 30728 + }, + { + "epoch": 0.18275406794176421, + "grad_norm": 1.6619024276733398, + "learning_rate": 4.5991761719325386e-05, + "loss": 5.0146, + "step": 30729 + }, + { + "epoch": 0.1827600152250452, + "grad_norm": 1.7837094068527222, + "learning_rate": 4.5991508036211936e-05, + "loss": 5.2135, + "step": 30730 + }, + { + "epoch": 0.1827659625083262, + "grad_norm": 2.0837316513061523, + "learning_rate": 4.5991254345770554e-05, + "loss": 5.4597, + "step": 30731 + }, + { + "epoch": 0.1827719097916072, + "grad_norm": 1.580824851989746, + "learning_rate": 4.5991000648001354e-05, + "loss": 5.0718, + "step": 30732 + }, + { + "epoch": 0.1827778570748882, + "grad_norm": 1.8121145963668823, + "learning_rate": 4.5990746942904404e-05, + "loss": 4.8452, + "step": 30733 + }, + { + "epoch": 0.18278380435816918, + "grad_norm": 1.6485167741775513, + "learning_rate": 4.59904932304798e-05, + "loss": 5.696, + "step": 30734 + }, + { + "epoch": 0.1827897516414502, + "grad_norm": 1.5621600151062012, + "learning_rate": 4.599023951072764e-05, + "loss": 5.5879, + "step": 30735 + }, + { + "epoch": 0.1827956989247312, + "grad_norm": 1.4888461828231812, + "learning_rate": 4.5989985783648006e-05, + "loss": 5.6557, + "step": 30736 + }, + { + "epoch": 0.18280164620801218, + "grad_norm": 1.422515869140625, + "learning_rate": 4.598973204924097e-05, + "loss": 5.3813, + "step": 30737 + }, + { + "epoch": 0.1828075934912932, + "grad_norm": 2.142186403274536, + "learning_rate": 4.598947830750665e-05, + "loss": 4.405, + "step": 30738 + }, + { + "epoch": 0.18281354077457418, + "grad_norm": 1.8209202289581299, + "learning_rate": 4.598922455844511e-05, + "loss": 4.4518, + "step": 30739 + }, + { + "epoch": 0.18281948805785517, + "grad_norm": 1.6258145570755005, + "learning_rate": 4.5988970802056454e-05, + "loss": 5.0344, + "step": 30740 + }, + { + "epoch": 0.18282543534113616, + "grad_norm": 1.7348908185958862, + "learning_rate": 4.5988717038340766e-05, + "loss": 4.9899, + "step": 30741 + }, + { + "epoch": 0.18283138262441717, + "grad_norm": 1.7576826810836792, + "learning_rate": 4.5988463267298134e-05, + "loss": 5.0383, + "step": 30742 + }, + { + "epoch": 0.18283732990769816, + "grad_norm": 1.9962698221206665, + "learning_rate": 4.598820948892864e-05, + "loss": 5.2593, + "step": 30743 + }, + { + "epoch": 0.18284327719097915, + "grad_norm": 2.0773308277130127, + "learning_rate": 4.5987955703232385e-05, + "loss": 5.3262, + "step": 30744 + }, + { + "epoch": 0.18284922447426016, + "grad_norm": 1.3248738050460815, + "learning_rate": 4.5987701910209445e-05, + "loss": 5.1114, + "step": 30745 + }, + { + "epoch": 0.18285517175754115, + "grad_norm": 1.578334927558899, + "learning_rate": 4.598744810985992e-05, + "loss": 4.863, + "step": 30746 + }, + { + "epoch": 0.18286111904082214, + "grad_norm": 1.6194567680358887, + "learning_rate": 4.59871943021839e-05, + "loss": 5.6935, + "step": 30747 + }, + { + "epoch": 0.18286706632410316, + "grad_norm": 1.4933133125305176, + "learning_rate": 4.5986940487181457e-05, + "loss": 5.7171, + "step": 30748 + }, + { + "epoch": 0.18287301360738414, + "grad_norm": 1.9716706275939941, + "learning_rate": 4.5986686664852694e-05, + "loss": 4.8064, + "step": 30749 + }, + { + "epoch": 0.18287896089066513, + "grad_norm": 1.518475890159607, + "learning_rate": 4.598643283519769e-05, + "loss": 4.675, + "step": 30750 + }, + { + "epoch": 0.18288490817394615, + "grad_norm": 1.9535077810287476, + "learning_rate": 4.598617899821655e-05, + "loss": 4.7127, + "step": 30751 + }, + { + "epoch": 0.18289085545722714, + "grad_norm": 1.7789416313171387, + "learning_rate": 4.598592515390934e-05, + "loss": 4.9423, + "step": 30752 + }, + { + "epoch": 0.18289680274050812, + "grad_norm": 1.5687211751937866, + "learning_rate": 4.5985671302276165e-05, + "loss": 5.1696, + "step": 30753 + }, + { + "epoch": 0.18290275002378914, + "grad_norm": 1.5808442831039429, + "learning_rate": 4.598541744331711e-05, + "loss": 4.9582, + "step": 30754 + }, + { + "epoch": 0.18290869730707013, + "grad_norm": 1.7823104858398438, + "learning_rate": 4.5985163577032264e-05, + "loss": 4.8837, + "step": 30755 + }, + { + "epoch": 0.18291464459035112, + "grad_norm": 1.5199090242385864, + "learning_rate": 4.598490970342172e-05, + "loss": 5.6375, + "step": 30756 + }, + { + "epoch": 0.18292059187363213, + "grad_norm": 1.3824554681777954, + "learning_rate": 4.598465582248555e-05, + "loss": 5.2014, + "step": 30757 + }, + { + "epoch": 0.18292653915691312, + "grad_norm": 1.5041509866714478, + "learning_rate": 4.598440193422386e-05, + "loss": 5.7263, + "step": 30758 + }, + { + "epoch": 0.1829324864401941, + "grad_norm": 1.5195960998535156, + "learning_rate": 4.598414803863673e-05, + "loss": 5.672, + "step": 30759 + }, + { + "epoch": 0.18293843372347512, + "grad_norm": 1.3529129028320312, + "learning_rate": 4.5983894135724245e-05, + "loss": 5.4614, + "step": 30760 + }, + { + "epoch": 0.1829443810067561, + "grad_norm": 1.6261144876480103, + "learning_rate": 4.59836402254865e-05, + "loss": 5.2684, + "step": 30761 + }, + { + "epoch": 0.1829503282900371, + "grad_norm": 1.7360271215438843, + "learning_rate": 4.5983386307923594e-05, + "loss": 5.2797, + "step": 30762 + }, + { + "epoch": 0.18295627557331812, + "grad_norm": 1.720116376876831, + "learning_rate": 4.59831323830356e-05, + "loss": 5.0761, + "step": 30763 + }, + { + "epoch": 0.1829622228565991, + "grad_norm": 1.8357081413269043, + "learning_rate": 4.598287845082261e-05, + "loss": 5.0053, + "step": 30764 + }, + { + "epoch": 0.1829681701398801, + "grad_norm": 2.3169431686401367, + "learning_rate": 4.598262451128471e-05, + "loss": 4.3751, + "step": 30765 + }, + { + "epoch": 0.1829741174231611, + "grad_norm": 2.5170505046844482, + "learning_rate": 4.5982370564422e-05, + "loss": 4.6384, + "step": 30766 + }, + { + "epoch": 0.1829800647064421, + "grad_norm": 1.6446155309677124, + "learning_rate": 4.598211661023455e-05, + "loss": 5.4014, + "step": 30767 + }, + { + "epoch": 0.18298601198972309, + "grad_norm": 1.8167470693588257, + "learning_rate": 4.598186264872247e-05, + "loss": 5.2924, + "step": 30768 + }, + { + "epoch": 0.1829919592730041, + "grad_norm": 1.7999087572097778, + "learning_rate": 4.598160867988584e-05, + "loss": 5.1272, + "step": 30769 + }, + { + "epoch": 0.1829979065562851, + "grad_norm": 1.9385194778442383, + "learning_rate": 4.598135470372473e-05, + "loss": 5.2449, + "step": 30770 + }, + { + "epoch": 0.18300385383956608, + "grad_norm": 1.6628237962722778, + "learning_rate": 4.598110072023927e-05, + "loss": 5.1795, + "step": 30771 + }, + { + "epoch": 0.1830098011228471, + "grad_norm": 1.775499701499939, + "learning_rate": 4.598084672942951e-05, + "loss": 4.2424, + "step": 30772 + }, + { + "epoch": 0.18301574840612808, + "grad_norm": 1.7905422449111938, + "learning_rate": 4.5980592731295554e-05, + "loss": 4.1706, + "step": 30773 + }, + { + "epoch": 0.18302169568940907, + "grad_norm": 2.221928834915161, + "learning_rate": 4.598033872583749e-05, + "loss": 4.5438, + "step": 30774 + }, + { + "epoch": 0.18302764297269009, + "grad_norm": 1.609844446182251, + "learning_rate": 4.59800847130554e-05, + "loss": 4.8598, + "step": 30775 + }, + { + "epoch": 0.18303359025597107, + "grad_norm": 1.642585277557373, + "learning_rate": 4.59798306929494e-05, + "loss": 5.3954, + "step": 30776 + }, + { + "epoch": 0.18303953753925206, + "grad_norm": 1.5810272693634033, + "learning_rate": 4.5979576665519543e-05, + "loss": 5.6698, + "step": 30777 + }, + { + "epoch": 0.18304548482253308, + "grad_norm": 1.5354760885238647, + "learning_rate": 4.597932263076593e-05, + "loss": 5.193, + "step": 30778 + }, + { + "epoch": 0.18305143210581407, + "grad_norm": 1.9338527917861938, + "learning_rate": 4.597906858868866e-05, + "loss": 4.9313, + "step": 30779 + }, + { + "epoch": 0.18305737938909505, + "grad_norm": 1.46987783908844, + "learning_rate": 4.5978814539287804e-05, + "loss": 5.164, + "step": 30780 + }, + { + "epoch": 0.18306332667237607, + "grad_norm": 1.794464349746704, + "learning_rate": 4.597856048256348e-05, + "loss": 5.2455, + "step": 30781 + }, + { + "epoch": 0.18306927395565706, + "grad_norm": 2.2967662811279297, + "learning_rate": 4.5978306418515736e-05, + "loss": 3.8119, + "step": 30782 + }, + { + "epoch": 0.18307522123893805, + "grad_norm": 3.0278241634368896, + "learning_rate": 4.59780523471447e-05, + "loss": 2.8503, + "step": 30783 + }, + { + "epoch": 0.18308116852221906, + "grad_norm": 2.2508223056793213, + "learning_rate": 4.597779826845043e-05, + "loss": 3.6492, + "step": 30784 + }, + { + "epoch": 0.18308711580550005, + "grad_norm": 1.6087052822113037, + "learning_rate": 4.597754418243303e-05, + "loss": 5.4596, + "step": 30785 + }, + { + "epoch": 0.18309306308878104, + "grad_norm": 1.630355954170227, + "learning_rate": 4.597729008909258e-05, + "loss": 5.2701, + "step": 30786 + }, + { + "epoch": 0.18309901037206205, + "grad_norm": 2.1090071201324463, + "learning_rate": 4.597703598842919e-05, + "loss": 4.9463, + "step": 30787 + }, + { + "epoch": 0.18310495765534304, + "grad_norm": 1.7964558601379395, + "learning_rate": 4.597678188044292e-05, + "loss": 5.1681, + "step": 30788 + }, + { + "epoch": 0.18311090493862403, + "grad_norm": 1.802701473236084, + "learning_rate": 4.5976527765133884e-05, + "loss": 5.6211, + "step": 30789 + }, + { + "epoch": 0.18311685222190505, + "grad_norm": 2.110750675201416, + "learning_rate": 4.5976273642502146e-05, + "loss": 5.3476, + "step": 30790 + }, + { + "epoch": 0.18312279950518603, + "grad_norm": 1.558624029159546, + "learning_rate": 4.5976019512547816e-05, + "loss": 5.9624, + "step": 30791 + }, + { + "epoch": 0.18312874678846702, + "grad_norm": 2.025865316390991, + "learning_rate": 4.597576537527097e-05, + "loss": 4.7095, + "step": 30792 + }, + { + "epoch": 0.18313469407174804, + "grad_norm": 1.986502766609192, + "learning_rate": 4.59755112306717e-05, + "loss": 5.0208, + "step": 30793 + }, + { + "epoch": 0.18314064135502903, + "grad_norm": 2.3034214973449707, + "learning_rate": 4.59752570787501e-05, + "loss": 5.3155, + "step": 30794 + }, + { + "epoch": 0.18314658863831002, + "grad_norm": 2.048161029815674, + "learning_rate": 4.597500291950626e-05, + "loss": 5.3074, + "step": 30795 + }, + { + "epoch": 0.18315253592159103, + "grad_norm": 1.9678623676300049, + "learning_rate": 4.5974748752940255e-05, + "loss": 5.6205, + "step": 30796 + }, + { + "epoch": 0.18315848320487202, + "grad_norm": 1.9089009761810303, + "learning_rate": 4.597449457905218e-05, + "loss": 5.1992, + "step": 30797 + }, + { + "epoch": 0.183164430488153, + "grad_norm": 1.6243164539337158, + "learning_rate": 4.5974240397842126e-05, + "loss": 5.8933, + "step": 30798 + }, + { + "epoch": 0.183170377771434, + "grad_norm": 1.676802396774292, + "learning_rate": 4.597398620931019e-05, + "loss": 5.2581, + "step": 30799 + }, + { + "epoch": 0.183176325054715, + "grad_norm": 1.8412030935287476, + "learning_rate": 4.5973732013456444e-05, + "loss": 5.3714, + "step": 30800 + }, + { + "epoch": 0.183182272337996, + "grad_norm": 1.725168228149414, + "learning_rate": 4.597347781028099e-05, + "loss": 5.365, + "step": 30801 + }, + { + "epoch": 0.183188219621277, + "grad_norm": 1.681129813194275, + "learning_rate": 4.5973223599783906e-05, + "loss": 5.6418, + "step": 30802 + }, + { + "epoch": 0.183194166904558, + "grad_norm": 2.0006189346313477, + "learning_rate": 4.597296938196529e-05, + "loss": 4.1994, + "step": 30803 + }, + { + "epoch": 0.183200114187839, + "grad_norm": 1.8607888221740723, + "learning_rate": 4.5972715156825225e-05, + "loss": 5.2584, + "step": 30804 + }, + { + "epoch": 0.18320606147111998, + "grad_norm": 1.9822429418563843, + "learning_rate": 4.59724609243638e-05, + "loss": 5.282, + "step": 30805 + }, + { + "epoch": 0.183212008754401, + "grad_norm": 1.6500173807144165, + "learning_rate": 4.597220668458111e-05, + "loss": 5.1869, + "step": 30806 + }, + { + "epoch": 0.18321795603768198, + "grad_norm": 1.6790781021118164, + "learning_rate": 4.597195243747724e-05, + "loss": 5.3556, + "step": 30807 + }, + { + "epoch": 0.18322390332096297, + "grad_norm": 2.0036866664886475, + "learning_rate": 4.597169818305228e-05, + "loss": 5.1406, + "step": 30808 + }, + { + "epoch": 0.183229850604244, + "grad_norm": 1.4782299995422363, + "learning_rate": 4.5971443921306315e-05, + "loss": 5.3893, + "step": 30809 + }, + { + "epoch": 0.18323579788752498, + "grad_norm": 2.174090623855591, + "learning_rate": 4.597118965223942e-05, + "loss": 5.0501, + "step": 30810 + }, + { + "epoch": 0.18324174517080596, + "grad_norm": 2.0031697750091553, + "learning_rate": 4.597093537585172e-05, + "loss": 5.1521, + "step": 30811 + }, + { + "epoch": 0.18324769245408698, + "grad_norm": 2.0772757530212402, + "learning_rate": 4.597068109214328e-05, + "loss": 4.1726, + "step": 30812 + }, + { + "epoch": 0.18325363973736797, + "grad_norm": 2.2878589630126953, + "learning_rate": 4.597042680111418e-05, + "loss": 4.0209, + "step": 30813 + }, + { + "epoch": 0.18325958702064896, + "grad_norm": 1.8026955127716064, + "learning_rate": 4.597017250276453e-05, + "loss": 5.3708, + "step": 30814 + }, + { + "epoch": 0.18326553430392997, + "grad_norm": 2.1650643348693848, + "learning_rate": 4.596991819709441e-05, + "loss": 3.9999, + "step": 30815 + }, + { + "epoch": 0.18327148158721096, + "grad_norm": 2.541799306869507, + "learning_rate": 4.59696638841039e-05, + "loss": 4.6602, + "step": 30816 + }, + { + "epoch": 0.18327742887049195, + "grad_norm": 1.9072203636169434, + "learning_rate": 4.596940956379311e-05, + "loss": 5.3711, + "step": 30817 + }, + { + "epoch": 0.18328337615377296, + "grad_norm": 1.8470267057418823, + "learning_rate": 4.596915523616211e-05, + "loss": 5.4715, + "step": 30818 + }, + { + "epoch": 0.18328932343705395, + "grad_norm": 1.887373924255371, + "learning_rate": 4.596890090121099e-05, + "loss": 5.9223, + "step": 30819 + }, + { + "epoch": 0.18329527072033494, + "grad_norm": 1.7427541017532349, + "learning_rate": 4.596864655893984e-05, + "loss": 5.8105, + "step": 30820 + }, + { + "epoch": 0.18330121800361596, + "grad_norm": 1.5923210382461548, + "learning_rate": 4.5968392209348763e-05, + "loss": 5.1934, + "step": 30821 + }, + { + "epoch": 0.18330716528689694, + "grad_norm": 2.4530539512634277, + "learning_rate": 4.596813785243783e-05, + "loss": 4.6757, + "step": 30822 + }, + { + "epoch": 0.18331311257017793, + "grad_norm": 2.533837080001831, + "learning_rate": 4.596788348820714e-05, + "loss": 4.1553, + "step": 30823 + }, + { + "epoch": 0.18331905985345895, + "grad_norm": 2.394258737564087, + "learning_rate": 4.596762911665678e-05, + "loss": 3.9019, + "step": 30824 + }, + { + "epoch": 0.18332500713673994, + "grad_norm": 1.8879469633102417, + "learning_rate": 4.596737473778684e-05, + "loss": 4.7484, + "step": 30825 + }, + { + "epoch": 0.18333095442002093, + "grad_norm": 1.896796464920044, + "learning_rate": 4.59671203515974e-05, + "loss": 5.5434, + "step": 30826 + }, + { + "epoch": 0.18333690170330194, + "grad_norm": 1.7430917024612427, + "learning_rate": 4.5966865958088555e-05, + "loss": 5.4315, + "step": 30827 + }, + { + "epoch": 0.18334284898658293, + "grad_norm": 2.2284209728240967, + "learning_rate": 4.59666115572604e-05, + "loss": 3.7282, + "step": 30828 + }, + { + "epoch": 0.18334879626986392, + "grad_norm": 2.362053155899048, + "learning_rate": 4.5966357149113005e-05, + "loss": 3.999, + "step": 30829 + }, + { + "epoch": 0.18335474355314493, + "grad_norm": 2.5124330520629883, + "learning_rate": 4.596610273364648e-05, + "loss": 3.9441, + "step": 30830 + }, + { + "epoch": 0.18336069083642592, + "grad_norm": 2.0157835483551025, + "learning_rate": 4.5965848310860906e-05, + "loss": 4.6031, + "step": 30831 + }, + { + "epoch": 0.1833666381197069, + "grad_norm": 2.036010503768921, + "learning_rate": 4.5965593880756365e-05, + "loss": 5.4114, + "step": 30832 + }, + { + "epoch": 0.18337258540298793, + "grad_norm": 1.6221730709075928, + "learning_rate": 4.596533944333296e-05, + "loss": 5.1669, + "step": 30833 + }, + { + "epoch": 0.1833785326862689, + "grad_norm": 1.5751827955245972, + "learning_rate": 4.5965084998590765e-05, + "loss": 5.0151, + "step": 30834 + }, + { + "epoch": 0.1833844799695499, + "grad_norm": 1.7404930591583252, + "learning_rate": 4.596483054652988e-05, + "loss": 5.0466, + "step": 30835 + }, + { + "epoch": 0.18339042725283092, + "grad_norm": 2.216836452484131, + "learning_rate": 4.5964576087150384e-05, + "loss": 3.9343, + "step": 30836 + }, + { + "epoch": 0.1833963745361119, + "grad_norm": 2.5696306228637695, + "learning_rate": 4.596432162045238e-05, + "loss": 3.9757, + "step": 30837 + }, + { + "epoch": 0.1834023218193929, + "grad_norm": 2.1181252002716064, + "learning_rate": 4.596406714643594e-05, + "loss": 3.6056, + "step": 30838 + }, + { + "epoch": 0.1834082691026739, + "grad_norm": 1.6865168809890747, + "learning_rate": 4.596381266510116e-05, + "loss": 4.7002, + "step": 30839 + }, + { + "epoch": 0.1834142163859549, + "grad_norm": 1.8423880338668823, + "learning_rate": 4.596355817644813e-05, + "loss": 5.0851, + "step": 30840 + }, + { + "epoch": 0.18342016366923589, + "grad_norm": 2.2296884059906006, + "learning_rate": 4.5963303680476945e-05, + "loss": 4.5105, + "step": 30841 + }, + { + "epoch": 0.1834261109525169, + "grad_norm": 2.051112413406372, + "learning_rate": 4.596304917718768e-05, + "loss": 4.465, + "step": 30842 + }, + { + "epoch": 0.1834320582357979, + "grad_norm": 1.638643741607666, + "learning_rate": 4.5962794666580435e-05, + "loss": 4.8162, + "step": 30843 + }, + { + "epoch": 0.18343800551907888, + "grad_norm": 1.5052911043167114, + "learning_rate": 4.5962540148655294e-05, + "loss": 5.7588, + "step": 30844 + }, + { + "epoch": 0.1834439528023599, + "grad_norm": 1.9093655347824097, + "learning_rate": 4.596228562341235e-05, + "loss": 4.9756, + "step": 30845 + }, + { + "epoch": 0.18344990008564088, + "grad_norm": 1.8002632856369019, + "learning_rate": 4.596203109085168e-05, + "loss": 5.4573, + "step": 30846 + }, + { + "epoch": 0.18345584736892187, + "grad_norm": 1.6063766479492188, + "learning_rate": 4.596177655097339e-05, + "loss": 5.1171, + "step": 30847 + }, + { + "epoch": 0.1834617946522029, + "grad_norm": 1.9913804531097412, + "learning_rate": 4.5961522003777554e-05, + "loss": 4.6128, + "step": 30848 + }, + { + "epoch": 0.18346774193548387, + "grad_norm": 1.7059962749481201, + "learning_rate": 4.5961267449264276e-05, + "loss": 5.1847, + "step": 30849 + }, + { + "epoch": 0.18347368921876486, + "grad_norm": 1.802331566810608, + "learning_rate": 4.596101288743362e-05, + "loss": 4.8961, + "step": 30850 + }, + { + "epoch": 0.18347963650204588, + "grad_norm": 3.5751075744628906, + "learning_rate": 4.596075831828571e-05, + "loss": 4.4167, + "step": 30851 + }, + { + "epoch": 0.18348558378532687, + "grad_norm": 3.360201597213745, + "learning_rate": 4.59605037418206e-05, + "loss": 4.7809, + "step": 30852 + }, + { + "epoch": 0.18349153106860785, + "grad_norm": 1.7143275737762451, + "learning_rate": 4.5960249158038406e-05, + "loss": 4.853, + "step": 30853 + }, + { + "epoch": 0.18349747835188887, + "grad_norm": 1.6688681840896606, + "learning_rate": 4.59599945669392e-05, + "loss": 5.0618, + "step": 30854 + }, + { + "epoch": 0.18350342563516986, + "grad_norm": 1.5650609731674194, + "learning_rate": 4.595973996852308e-05, + "loss": 4.8802, + "step": 30855 + }, + { + "epoch": 0.18350937291845085, + "grad_norm": 2.0990846157073975, + "learning_rate": 4.595948536279013e-05, + "loss": 4.4201, + "step": 30856 + }, + { + "epoch": 0.18351532020173184, + "grad_norm": 2.3723249435424805, + "learning_rate": 4.595923074974044e-05, + "loss": 4.2342, + "step": 30857 + }, + { + "epoch": 0.18352126748501285, + "grad_norm": 2.24969744682312, + "learning_rate": 4.59589761293741e-05, + "loss": 4.0662, + "step": 30858 + }, + { + "epoch": 0.18352721476829384, + "grad_norm": 2.0236549377441406, + "learning_rate": 4.595872150169119e-05, + "loss": 5.124, + "step": 30859 + }, + { + "epoch": 0.18353316205157483, + "grad_norm": 2.5715887546539307, + "learning_rate": 4.595846686669182e-05, + "loss": 4.1854, + "step": 30860 + }, + { + "epoch": 0.18353910933485584, + "grad_norm": 2.2042219638824463, + "learning_rate": 4.595821222437606e-05, + "loss": 4.55, + "step": 30861 + }, + { + "epoch": 0.18354505661813683, + "grad_norm": 1.5966359376907349, + "learning_rate": 4.5957957574744007e-05, + "loss": 4.9982, + "step": 30862 + }, + { + "epoch": 0.18355100390141782, + "grad_norm": 1.5397683382034302, + "learning_rate": 4.595770291779574e-05, + "loss": 4.568, + "step": 30863 + }, + { + "epoch": 0.18355695118469884, + "grad_norm": 2.3468825817108154, + "learning_rate": 4.595744825353136e-05, + "loss": 3.9617, + "step": 30864 + }, + { + "epoch": 0.18356289846797982, + "grad_norm": 2.3146417140960693, + "learning_rate": 4.595719358195095e-05, + "loss": 4.0914, + "step": 30865 + }, + { + "epoch": 0.1835688457512608, + "grad_norm": 2.2103490829467773, + "learning_rate": 4.59569389030546e-05, + "loss": 4.1443, + "step": 30866 + }, + { + "epoch": 0.18357479303454183, + "grad_norm": 2.2794134616851807, + "learning_rate": 4.59566842168424e-05, + "loss": 4.1926, + "step": 30867 + }, + { + "epoch": 0.18358074031782282, + "grad_norm": 2.3235437870025635, + "learning_rate": 4.595642952331444e-05, + "loss": 4.0462, + "step": 30868 + }, + { + "epoch": 0.1835866876011038, + "grad_norm": 2.440493583679199, + "learning_rate": 4.595617482247081e-05, + "loss": 4.0408, + "step": 30869 + }, + { + "epoch": 0.18359263488438482, + "grad_norm": 2.231560230255127, + "learning_rate": 4.595592011431159e-05, + "loss": 4.095, + "step": 30870 + }, + { + "epoch": 0.1835985821676658, + "grad_norm": 1.8984894752502441, + "learning_rate": 4.5955665398836877e-05, + "loss": 5.1887, + "step": 30871 + }, + { + "epoch": 0.1836045294509468, + "grad_norm": 1.725150465965271, + "learning_rate": 4.5955410676046754e-05, + "loss": 5.0515, + "step": 30872 + }, + { + "epoch": 0.1836104767342278, + "grad_norm": 1.5244455337524414, + "learning_rate": 4.595515594594132e-05, + "loss": 5.0655, + "step": 30873 + }, + { + "epoch": 0.1836164240175088, + "grad_norm": 1.5998716354370117, + "learning_rate": 4.595490120852065e-05, + "loss": 5.3198, + "step": 30874 + }, + { + "epoch": 0.1836223713007898, + "grad_norm": 1.787981390953064, + "learning_rate": 4.595464646378485e-05, + "loss": 4.6043, + "step": 30875 + }, + { + "epoch": 0.1836283185840708, + "grad_norm": 1.4464097023010254, + "learning_rate": 4.595439171173399e-05, + "loss": 4.7063, + "step": 30876 + }, + { + "epoch": 0.1836342658673518, + "grad_norm": 2.4086809158325195, + "learning_rate": 4.5954136952368175e-05, + "loss": 4.1193, + "step": 30877 + }, + { + "epoch": 0.18364021315063278, + "grad_norm": 2.57763671875, + "learning_rate": 4.595388218568748e-05, + "loss": 4.1104, + "step": 30878 + }, + { + "epoch": 0.1836461604339138, + "grad_norm": 2.3610222339630127, + "learning_rate": 4.5953627411692016e-05, + "loss": 3.9965, + "step": 30879 + }, + { + "epoch": 0.18365210771719478, + "grad_norm": 1.8578461408615112, + "learning_rate": 4.5953372630381845e-05, + "loss": 4.6334, + "step": 30880 + }, + { + "epoch": 0.18365805500047577, + "grad_norm": 1.5059680938720703, + "learning_rate": 4.595311784175706e-05, + "loss": 4.4804, + "step": 30881 + }, + { + "epoch": 0.1836640022837568, + "grad_norm": 1.833595871925354, + "learning_rate": 4.595286304581777e-05, + "loss": 4.563, + "step": 30882 + }, + { + "epoch": 0.18366994956703778, + "grad_norm": 1.8078968524932861, + "learning_rate": 4.595260824256405e-05, + "loss": 4.9626, + "step": 30883 + }, + { + "epoch": 0.18367589685031877, + "grad_norm": 1.5788074731826782, + "learning_rate": 4.5952353431996e-05, + "loss": 5.3483, + "step": 30884 + }, + { + "epoch": 0.18368184413359978, + "grad_norm": 1.642112135887146, + "learning_rate": 4.5952098614113684e-05, + "loss": 5.2537, + "step": 30885 + }, + { + "epoch": 0.18368779141688077, + "grad_norm": 1.4819180965423584, + "learning_rate": 4.595184378891722e-05, + "loss": 4.5967, + "step": 30886 + }, + { + "epoch": 0.18369373870016176, + "grad_norm": 1.5278507471084595, + "learning_rate": 4.5951588956406676e-05, + "loss": 4.6367, + "step": 30887 + }, + { + "epoch": 0.18369968598344277, + "grad_norm": 1.7402983903884888, + "learning_rate": 4.595133411658215e-05, + "loss": 4.7334, + "step": 30888 + }, + { + "epoch": 0.18370563326672376, + "grad_norm": 1.892587423324585, + "learning_rate": 4.595107926944373e-05, + "loss": 4.7473, + "step": 30889 + }, + { + "epoch": 0.18371158055000475, + "grad_norm": 1.741618275642395, + "learning_rate": 4.59508244149915e-05, + "loss": 4.5484, + "step": 30890 + }, + { + "epoch": 0.18371752783328577, + "grad_norm": 1.8447742462158203, + "learning_rate": 4.5950569553225565e-05, + "loss": 4.853, + "step": 30891 + }, + { + "epoch": 0.18372347511656675, + "grad_norm": 1.8637365102767944, + "learning_rate": 4.595031468414599e-05, + "loss": 5.2374, + "step": 30892 + }, + { + "epoch": 0.18372942239984774, + "grad_norm": 1.9203366041183472, + "learning_rate": 4.5950059807752886e-05, + "loss": 4.9026, + "step": 30893 + }, + { + "epoch": 0.18373536968312876, + "grad_norm": 1.5132418870925903, + "learning_rate": 4.5949804924046324e-05, + "loss": 4.7941, + "step": 30894 + }, + { + "epoch": 0.18374131696640975, + "grad_norm": 1.567147135734558, + "learning_rate": 4.594955003302641e-05, + "loss": 4.6679, + "step": 30895 + }, + { + "epoch": 0.18374726424969073, + "grad_norm": 1.6055753231048584, + "learning_rate": 4.594929513469322e-05, + "loss": 4.6216, + "step": 30896 + }, + { + "epoch": 0.18375321153297175, + "grad_norm": 1.609041690826416, + "learning_rate": 4.594904022904685e-05, + "loss": 4.6356, + "step": 30897 + }, + { + "epoch": 0.18375915881625274, + "grad_norm": 1.7323532104492188, + "learning_rate": 4.594878531608738e-05, + "loss": 5.333, + "step": 30898 + }, + { + "epoch": 0.18376510609953373, + "grad_norm": 1.7134934663772583, + "learning_rate": 4.5948530395814916e-05, + "loss": 5.3289, + "step": 30899 + }, + { + "epoch": 0.18377105338281474, + "grad_norm": 1.6868717670440674, + "learning_rate": 4.594827546822953e-05, + "loss": 4.7537, + "step": 30900 + }, + { + "epoch": 0.18377700066609573, + "grad_norm": 1.6590864658355713, + "learning_rate": 4.594802053333132e-05, + "loss": 5.8669, + "step": 30901 + }, + { + "epoch": 0.18378294794937672, + "grad_norm": 1.964417576789856, + "learning_rate": 4.594776559112037e-05, + "loss": 5.4957, + "step": 30902 + }, + { + "epoch": 0.18378889523265773, + "grad_norm": 1.68085777759552, + "learning_rate": 4.5947510641596775e-05, + "loss": 5.1391, + "step": 30903 + }, + { + "epoch": 0.18379484251593872, + "grad_norm": 1.7038891315460205, + "learning_rate": 4.5947255684760615e-05, + "loss": 5.1364, + "step": 30904 + }, + { + "epoch": 0.1838007897992197, + "grad_norm": 1.7355235815048218, + "learning_rate": 4.5947000720611985e-05, + "loss": 4.9449, + "step": 30905 + }, + { + "epoch": 0.18380673708250073, + "grad_norm": 1.458635926246643, + "learning_rate": 4.594674574915098e-05, + "loss": 5.0392, + "step": 30906 + }, + { + "epoch": 0.18381268436578171, + "grad_norm": 1.7265875339508057, + "learning_rate": 4.594649077037768e-05, + "loss": 4.9802, + "step": 30907 + }, + { + "epoch": 0.1838186316490627, + "grad_norm": 1.5100198984146118, + "learning_rate": 4.594623578429217e-05, + "loss": 5.0036, + "step": 30908 + }, + { + "epoch": 0.18382457893234372, + "grad_norm": 1.6836403608322144, + "learning_rate": 4.5945980790894553e-05, + "loss": 4.5476, + "step": 30909 + }, + { + "epoch": 0.1838305262156247, + "grad_norm": 1.6595370769500732, + "learning_rate": 4.5945725790184905e-05, + "loss": 4.9626, + "step": 30910 + }, + { + "epoch": 0.1838364734989057, + "grad_norm": 1.6304545402526855, + "learning_rate": 4.594547078216332e-05, + "loss": 5.1261, + "step": 30911 + }, + { + "epoch": 0.1838424207821867, + "grad_norm": 1.6057839393615723, + "learning_rate": 4.5945215766829894e-05, + "loss": 5.2167, + "step": 30912 + }, + { + "epoch": 0.1838483680654677, + "grad_norm": 1.5401513576507568, + "learning_rate": 4.594496074418471e-05, + "loss": 4.8433, + "step": 30913 + }, + { + "epoch": 0.1838543153487487, + "grad_norm": 1.6510026454925537, + "learning_rate": 4.594470571422785e-05, + "loss": 4.407, + "step": 30914 + }, + { + "epoch": 0.18386026263202968, + "grad_norm": 1.5904121398925781, + "learning_rate": 4.5944450676959414e-05, + "loss": 4.7868, + "step": 30915 + }, + { + "epoch": 0.1838662099153107, + "grad_norm": 1.5439600944519043, + "learning_rate": 4.594419563237949e-05, + "loss": 4.9075, + "step": 30916 + }, + { + "epoch": 0.18387215719859168, + "grad_norm": 1.6869488954544067, + "learning_rate": 4.5943940580488154e-05, + "loss": 4.7118, + "step": 30917 + }, + { + "epoch": 0.18387810448187267, + "grad_norm": 1.858880639076233, + "learning_rate": 4.594368552128551e-05, + "loss": 5.226, + "step": 30918 + }, + { + "epoch": 0.18388405176515368, + "grad_norm": 1.7510879039764404, + "learning_rate": 4.5943430454771644e-05, + "loss": 4.8886, + "step": 30919 + }, + { + "epoch": 0.18388999904843467, + "grad_norm": 1.6084439754486084, + "learning_rate": 4.594317538094664e-05, + "loss": 4.7247, + "step": 30920 + }, + { + "epoch": 0.18389594633171566, + "grad_norm": 1.7126952409744263, + "learning_rate": 4.594292029981059e-05, + "loss": 5.2381, + "step": 30921 + }, + { + "epoch": 0.18390189361499668, + "grad_norm": 1.8401120901107788, + "learning_rate": 4.594266521136358e-05, + "loss": 5.2361, + "step": 30922 + }, + { + "epoch": 0.18390784089827766, + "grad_norm": 1.7398508787155151, + "learning_rate": 4.59424101156057e-05, + "loss": 5.016, + "step": 30923 + }, + { + "epoch": 0.18391378818155865, + "grad_norm": 1.9287174940109253, + "learning_rate": 4.5942155012537056e-05, + "loss": 4.8992, + "step": 30924 + }, + { + "epoch": 0.18391973546483967, + "grad_norm": 1.8512134552001953, + "learning_rate": 4.5941899902157715e-05, + "loss": 4.815, + "step": 30925 + }, + { + "epoch": 0.18392568274812066, + "grad_norm": 1.500188946723938, + "learning_rate": 4.594164478446776e-05, + "loss": 4.9531, + "step": 30926 + }, + { + "epoch": 0.18393163003140164, + "grad_norm": 1.597621202468872, + "learning_rate": 4.594138965946731e-05, + "loss": 4.984, + "step": 30927 + }, + { + "epoch": 0.18393757731468266, + "grad_norm": 2.3577587604522705, + "learning_rate": 4.594113452715643e-05, + "loss": 4.5873, + "step": 30928 + }, + { + "epoch": 0.18394352459796365, + "grad_norm": 1.807442545890808, + "learning_rate": 4.594087938753522e-05, + "loss": 4.2157, + "step": 30929 + }, + { + "epoch": 0.18394947188124464, + "grad_norm": 1.7667385339736938, + "learning_rate": 4.594062424060376e-05, + "loss": 4.7323, + "step": 30930 + }, + { + "epoch": 0.18395541916452565, + "grad_norm": 1.7243330478668213, + "learning_rate": 4.5940369086362144e-05, + "loss": 5.2673, + "step": 30931 + }, + { + "epoch": 0.18396136644780664, + "grad_norm": 1.6076741218566895, + "learning_rate": 4.594011392481047e-05, + "loss": 5.2537, + "step": 30932 + }, + { + "epoch": 0.18396731373108763, + "grad_norm": 1.8104612827301025, + "learning_rate": 4.5939858755948806e-05, + "loss": 5.2573, + "step": 30933 + }, + { + "epoch": 0.18397326101436864, + "grad_norm": 1.4915204048156738, + "learning_rate": 4.5939603579777266e-05, + "loss": 5.1661, + "step": 30934 + }, + { + "epoch": 0.18397920829764963, + "grad_norm": 1.6471868753433228, + "learning_rate": 4.593934839629592e-05, + "loss": 4.8264, + "step": 30935 + }, + { + "epoch": 0.18398515558093062, + "grad_norm": 1.6875669956207275, + "learning_rate": 4.593909320550486e-05, + "loss": 5.0788, + "step": 30936 + }, + { + "epoch": 0.18399110286421164, + "grad_norm": 1.9455054998397827, + "learning_rate": 4.5938838007404185e-05, + "loss": 5.022, + "step": 30937 + }, + { + "epoch": 0.18399705014749262, + "grad_norm": 2.0597965717315674, + "learning_rate": 4.593858280199398e-05, + "loss": 4.6885, + "step": 30938 + }, + { + "epoch": 0.1840029974307736, + "grad_norm": 1.8781501054763794, + "learning_rate": 4.5938327589274324e-05, + "loss": 5.0725, + "step": 30939 + }, + { + "epoch": 0.18400894471405463, + "grad_norm": 1.7399587631225586, + "learning_rate": 4.593807236924532e-05, + "loss": 5.0705, + "step": 30940 + }, + { + "epoch": 0.18401489199733562, + "grad_norm": 1.5905550718307495, + "learning_rate": 4.5937817141907054e-05, + "loss": 5.269, + "step": 30941 + }, + { + "epoch": 0.1840208392806166, + "grad_norm": 1.5723954439163208, + "learning_rate": 4.5937561907259604e-05, + "loss": 5.1356, + "step": 30942 + }, + { + "epoch": 0.18402678656389762, + "grad_norm": 1.725982904434204, + "learning_rate": 4.593730666530307e-05, + "loss": 4.9754, + "step": 30943 + }, + { + "epoch": 0.1840327338471786, + "grad_norm": 1.5784368515014648, + "learning_rate": 4.593705141603755e-05, + "loss": 4.8637, + "step": 30944 + }, + { + "epoch": 0.1840386811304596, + "grad_norm": 1.2270019054412842, + "learning_rate": 4.5936796159463106e-05, + "loss": 4.4398, + "step": 30945 + }, + { + "epoch": 0.1840446284137406, + "grad_norm": 1.6701734066009521, + "learning_rate": 4.593654089557985e-05, + "loss": 4.6544, + "step": 30946 + }, + { + "epoch": 0.1840505756970216, + "grad_norm": 1.6493332386016846, + "learning_rate": 4.5936285624387865e-05, + "loss": 4.9398, + "step": 30947 + }, + { + "epoch": 0.1840565229803026, + "grad_norm": 1.6047924757003784, + "learning_rate": 4.5936030345887236e-05, + "loss": 4.6506, + "step": 30948 + }, + { + "epoch": 0.1840624702635836, + "grad_norm": 1.6082524061203003, + "learning_rate": 4.5935775060078055e-05, + "loss": 4.8463, + "step": 30949 + }, + { + "epoch": 0.1840684175468646, + "grad_norm": 1.603140115737915, + "learning_rate": 4.593551976696041e-05, + "loss": 4.9072, + "step": 30950 + }, + { + "epoch": 0.18407436483014558, + "grad_norm": 1.6736758947372437, + "learning_rate": 4.593526446653439e-05, + "loss": 4.8175, + "step": 30951 + }, + { + "epoch": 0.1840803121134266, + "grad_norm": 2.159503221511841, + "learning_rate": 4.593500915880009e-05, + "loss": 5.2822, + "step": 30952 + }, + { + "epoch": 0.18408625939670759, + "grad_norm": 2.116179943084717, + "learning_rate": 4.59347538437576e-05, + "loss": 4.9114, + "step": 30953 + }, + { + "epoch": 0.18409220667998857, + "grad_norm": 2.1627538204193115, + "learning_rate": 4.5934498521407e-05, + "loss": 4.6353, + "step": 30954 + }, + { + "epoch": 0.1840981539632696, + "grad_norm": 1.7306194305419922, + "learning_rate": 4.593424319174838e-05, + "loss": 5.0884, + "step": 30955 + }, + { + "epoch": 0.18410410124655058, + "grad_norm": 1.7881605625152588, + "learning_rate": 4.5933987854781824e-05, + "loss": 5.4829, + "step": 30956 + }, + { + "epoch": 0.18411004852983157, + "grad_norm": 1.6097657680511475, + "learning_rate": 4.5933732510507446e-05, + "loss": 5.4447, + "step": 30957 + }, + { + "epoch": 0.18411599581311258, + "grad_norm": 1.4753258228302002, + "learning_rate": 4.59334771589253e-05, + "loss": 5.4069, + "step": 30958 + }, + { + "epoch": 0.18412194309639357, + "grad_norm": 1.4360363483428955, + "learning_rate": 4.593322180003551e-05, + "loss": 5.3144, + "step": 30959 + }, + { + "epoch": 0.18412789037967456, + "grad_norm": 1.5445841550827026, + "learning_rate": 4.593296643383814e-05, + "loss": 5.3294, + "step": 30960 + }, + { + "epoch": 0.18413383766295557, + "grad_norm": 1.8465672731399536, + "learning_rate": 4.593271106033329e-05, + "loss": 5.2602, + "step": 30961 + }, + { + "epoch": 0.18413978494623656, + "grad_norm": 1.7009365558624268, + "learning_rate": 4.5932455679521046e-05, + "loss": 5.2779, + "step": 30962 + }, + { + "epoch": 0.18414573222951755, + "grad_norm": 1.5198291540145874, + "learning_rate": 4.593220029140149e-05, + "loss": 5.1775, + "step": 30963 + }, + { + "epoch": 0.18415167951279857, + "grad_norm": 1.5233417749404907, + "learning_rate": 4.5931944895974735e-05, + "loss": 5.1338, + "step": 30964 + }, + { + "epoch": 0.18415762679607955, + "grad_norm": 1.3948924541473389, + "learning_rate": 4.593168949324084e-05, + "loss": 5.2121, + "step": 30965 + }, + { + "epoch": 0.18416357407936054, + "grad_norm": 1.596511721611023, + "learning_rate": 4.593143408319992e-05, + "loss": 5.1374, + "step": 30966 + }, + { + "epoch": 0.18416952136264156, + "grad_norm": 1.8476365804672241, + "learning_rate": 4.593117866585205e-05, + "loss": 5.0453, + "step": 30967 + }, + { + "epoch": 0.18417546864592255, + "grad_norm": 1.9178073406219482, + "learning_rate": 4.5930923241197315e-05, + "loss": 5.1195, + "step": 30968 + }, + { + "epoch": 0.18418141592920353, + "grad_norm": 1.8207836151123047, + "learning_rate": 4.593066780923582e-05, + "loss": 4.8808, + "step": 30969 + }, + { + "epoch": 0.18418736321248455, + "grad_norm": 1.556929588317871, + "learning_rate": 4.5930412369967636e-05, + "loss": 5.042, + "step": 30970 + }, + { + "epoch": 0.18419331049576554, + "grad_norm": 1.5927326679229736, + "learning_rate": 4.593015692339286e-05, + "loss": 4.9574, + "step": 30971 + }, + { + "epoch": 0.18419925777904653, + "grad_norm": 1.686204195022583, + "learning_rate": 4.5929901469511594e-05, + "loss": 5.1615, + "step": 30972 + }, + { + "epoch": 0.18420520506232752, + "grad_norm": 1.8560882806777954, + "learning_rate": 4.5929646008323915e-05, + "loss": 5.4144, + "step": 30973 + }, + { + "epoch": 0.18421115234560853, + "grad_norm": 1.9906892776489258, + "learning_rate": 4.59293905398299e-05, + "loss": 5.5249, + "step": 30974 + }, + { + "epoch": 0.18421709962888952, + "grad_norm": 1.8656678199768066, + "learning_rate": 4.592913506402966e-05, + "loss": 5.4574, + "step": 30975 + }, + { + "epoch": 0.1842230469121705, + "grad_norm": 1.5969977378845215, + "learning_rate": 4.592887958092327e-05, + "loss": 5.3052, + "step": 30976 + }, + { + "epoch": 0.18422899419545152, + "grad_norm": 1.8761509656906128, + "learning_rate": 4.592862409051083e-05, + "loss": 5.4617, + "step": 30977 + }, + { + "epoch": 0.1842349414787325, + "grad_norm": 1.7512613534927368, + "learning_rate": 4.592836859279243e-05, + "loss": 5.1404, + "step": 30978 + }, + { + "epoch": 0.1842408887620135, + "grad_norm": 1.9314844608306885, + "learning_rate": 4.592811308776814e-05, + "loss": 5.1451, + "step": 30979 + }, + { + "epoch": 0.18424683604529452, + "grad_norm": 1.7287604808807373, + "learning_rate": 4.592785757543806e-05, + "loss": 4.9971, + "step": 30980 + }, + { + "epoch": 0.1842527833285755, + "grad_norm": 1.5554300546646118, + "learning_rate": 4.592760205580229e-05, + "loss": 4.9128, + "step": 30981 + }, + { + "epoch": 0.1842587306118565, + "grad_norm": 1.5447009801864624, + "learning_rate": 4.5927346528860907e-05, + "loss": 5.0247, + "step": 30982 + }, + { + "epoch": 0.1842646778951375, + "grad_norm": 1.4151129722595215, + "learning_rate": 4.592709099461401e-05, + "loss": 4.9106, + "step": 30983 + }, + { + "epoch": 0.1842706251784185, + "grad_norm": 1.4430291652679443, + "learning_rate": 4.5926835453061665e-05, + "loss": 5.0316, + "step": 30984 + }, + { + "epoch": 0.18427657246169948, + "grad_norm": 2.097165584564209, + "learning_rate": 4.592657990420399e-05, + "loss": 5.14, + "step": 30985 + }, + { + "epoch": 0.1842825197449805, + "grad_norm": 1.9558128118515015, + "learning_rate": 4.592632434804107e-05, + "loss": 4.6043, + "step": 30986 + }, + { + "epoch": 0.1842884670282615, + "grad_norm": 1.8616024255752563, + "learning_rate": 4.5926068784572975e-05, + "loss": 4.8654, + "step": 30987 + }, + { + "epoch": 0.18429441431154248, + "grad_norm": 2.043250560760498, + "learning_rate": 4.5925813213799805e-05, + "loss": 5.0763, + "step": 30988 + }, + { + "epoch": 0.1843003615948235, + "grad_norm": 1.9793142080307007, + "learning_rate": 4.5925557635721654e-05, + "loss": 4.9104, + "step": 30989 + }, + { + "epoch": 0.18430630887810448, + "grad_norm": 1.7368297576904297, + "learning_rate": 4.59253020503386e-05, + "loss": 5.0578, + "step": 30990 + }, + { + "epoch": 0.18431225616138547, + "grad_norm": 2.311291456222534, + "learning_rate": 4.592504645765075e-05, + "loss": 4.7787, + "step": 30991 + }, + { + "epoch": 0.18431820344466648, + "grad_norm": 1.9127613306045532, + "learning_rate": 4.592479085765818e-05, + "loss": 4.7311, + "step": 30992 + }, + { + "epoch": 0.18432415072794747, + "grad_norm": 2.0677103996276855, + "learning_rate": 4.592453525036098e-05, + "loss": 5.073, + "step": 30993 + }, + { + "epoch": 0.18433009801122846, + "grad_norm": 1.6885477304458618, + "learning_rate": 4.592427963575924e-05, + "loss": 4.7878, + "step": 30994 + }, + { + "epoch": 0.18433604529450948, + "grad_norm": 1.7439665794372559, + "learning_rate": 4.592402401385305e-05, + "loss": 4.882, + "step": 30995 + }, + { + "epoch": 0.18434199257779046, + "grad_norm": 2.02858567237854, + "learning_rate": 4.5923768384642494e-05, + "loss": 4.8182, + "step": 30996 + }, + { + "epoch": 0.18434793986107145, + "grad_norm": 2.1561737060546875, + "learning_rate": 4.5923512748127676e-05, + "loss": 4.8795, + "step": 30997 + }, + { + "epoch": 0.18435388714435247, + "grad_norm": 2.319322347640991, + "learning_rate": 4.592325710430867e-05, + "loss": 4.725, + "step": 30998 + }, + { + "epoch": 0.18435983442763346, + "grad_norm": 2.0449020862579346, + "learning_rate": 4.5923001453185575e-05, + "loss": 4.746, + "step": 30999 + }, + { + "epoch": 0.18436578171091444, + "grad_norm": 2.0369932651519775, + "learning_rate": 4.5922745794758475e-05, + "loss": 4.4575, + "step": 31000 + }, + { + "epoch": 0.18437172899419546, + "grad_norm": 2.1663169860839844, + "learning_rate": 4.5922490129027464e-05, + "loss": 4.7254, + "step": 31001 + }, + { + "epoch": 0.18437767627747645, + "grad_norm": 1.6700929403305054, + "learning_rate": 4.5922234455992617e-05, + "loss": 4.6762, + "step": 31002 + }, + { + "epoch": 0.18438362356075744, + "grad_norm": 2.359294891357422, + "learning_rate": 4.592197877565404e-05, + "loss": 4.67, + "step": 31003 + }, + { + "epoch": 0.18438957084403845, + "grad_norm": 1.9069437980651855, + "learning_rate": 4.5921723088011826e-05, + "loss": 4.9545, + "step": 31004 + }, + { + "epoch": 0.18439551812731944, + "grad_norm": 2.373521327972412, + "learning_rate": 4.592146739306604e-05, + "loss": 3.948, + "step": 31005 + }, + { + "epoch": 0.18440146541060043, + "grad_norm": 2.227628469467163, + "learning_rate": 4.592121169081679e-05, + "loss": 4.1342, + "step": 31006 + }, + { + "epoch": 0.18440741269388145, + "grad_norm": 2.1248085498809814, + "learning_rate": 4.592095598126417e-05, + "loss": 4.3805, + "step": 31007 + }, + { + "epoch": 0.18441335997716243, + "grad_norm": 2.362063407897949, + "learning_rate": 4.592070026440825e-05, + "loss": 4.6606, + "step": 31008 + }, + { + "epoch": 0.18441930726044342, + "grad_norm": 2.0881500244140625, + "learning_rate": 4.5920444540249135e-05, + "loss": 4.7613, + "step": 31009 + }, + { + "epoch": 0.18442525454372444, + "grad_norm": 2.026759147644043, + "learning_rate": 4.5920188808786904e-05, + "loss": 4.5697, + "step": 31010 + }, + { + "epoch": 0.18443120182700543, + "grad_norm": 2.4088351726531982, + "learning_rate": 4.5919933070021657e-05, + "loss": 4.1511, + "step": 31011 + }, + { + "epoch": 0.1844371491102864, + "grad_norm": 2.3477118015289307, + "learning_rate": 4.5919677323953474e-05, + "loss": 4.2753, + "step": 31012 + }, + { + "epoch": 0.18444309639356743, + "grad_norm": 2.198819875717163, + "learning_rate": 4.591942157058245e-05, + "loss": 4.4163, + "step": 31013 + }, + { + "epoch": 0.18444904367684842, + "grad_norm": 2.212641477584839, + "learning_rate": 4.591916580990867e-05, + "loss": 4.6979, + "step": 31014 + }, + { + "epoch": 0.1844549909601294, + "grad_norm": 1.924052119255066, + "learning_rate": 4.591891004193223e-05, + "loss": 4.7703, + "step": 31015 + }, + { + "epoch": 0.18446093824341042, + "grad_norm": 2.4676082134246826, + "learning_rate": 4.591865426665321e-05, + "loss": 4.6545, + "step": 31016 + }, + { + "epoch": 0.1844668855266914, + "grad_norm": 2.432497262954712, + "learning_rate": 4.59183984840717e-05, + "loss": 4.671, + "step": 31017 + }, + { + "epoch": 0.1844728328099724, + "grad_norm": 2.691105842590332, + "learning_rate": 4.59181426941878e-05, + "loss": 4.4464, + "step": 31018 + }, + { + "epoch": 0.1844787800932534, + "grad_norm": 2.5249433517456055, + "learning_rate": 4.591788689700159e-05, + "loss": 4.2623, + "step": 31019 + }, + { + "epoch": 0.1844847273765344, + "grad_norm": 2.6374852657318115, + "learning_rate": 4.5917631092513156e-05, + "loss": 3.4994, + "step": 31020 + }, + { + "epoch": 0.1844906746598154, + "grad_norm": 2.6089253425598145, + "learning_rate": 4.591737528072261e-05, + "loss": 3.8228, + "step": 31021 + }, + { + "epoch": 0.1844966219430964, + "grad_norm": 2.5166683197021484, + "learning_rate": 4.591711946163001e-05, + "loss": 3.4982, + "step": 31022 + }, + { + "epoch": 0.1845025692263774, + "grad_norm": 2.3516764640808105, + "learning_rate": 4.591686363523546e-05, + "loss": 3.5029, + "step": 31023 + }, + { + "epoch": 0.18450851650965838, + "grad_norm": 2.5474250316619873, + "learning_rate": 4.591660780153906e-05, + "loss": 3.5499, + "step": 31024 + }, + { + "epoch": 0.1845144637929394, + "grad_norm": 1.7902573347091675, + "learning_rate": 4.591635196054088e-05, + "loss": 4.6401, + "step": 31025 + }, + { + "epoch": 0.1845204110762204, + "grad_norm": 2.301729679107666, + "learning_rate": 4.5916096112241015e-05, + "loss": 5.2124, + "step": 31026 + }, + { + "epoch": 0.18452635835950137, + "grad_norm": 1.9211527109146118, + "learning_rate": 4.591584025663956e-05, + "loss": 5.107, + "step": 31027 + }, + { + "epoch": 0.1845323056427824, + "grad_norm": 2.245776653289795, + "learning_rate": 4.59155843937366e-05, + "loss": 4.1589, + "step": 31028 + }, + { + "epoch": 0.18453825292606338, + "grad_norm": 2.7997524738311768, + "learning_rate": 4.591532852353223e-05, + "loss": 3.6491, + "step": 31029 + }, + { + "epoch": 0.18454420020934437, + "grad_norm": 2.8077120780944824, + "learning_rate": 4.591507264602653e-05, + "loss": 3.3127, + "step": 31030 + }, + { + "epoch": 0.18455014749262535, + "grad_norm": 1.4262480735778809, + "learning_rate": 4.591481676121959e-05, + "loss": 4.9064, + "step": 31031 + }, + { + "epoch": 0.18455609477590637, + "grad_norm": 1.6911439895629883, + "learning_rate": 4.591456086911152e-05, + "loss": 5.1697, + "step": 31032 + }, + { + "epoch": 0.18456204205918736, + "grad_norm": 1.591536045074463, + "learning_rate": 4.591430496970238e-05, + "loss": 5.3011, + "step": 31033 + }, + { + "epoch": 0.18456798934246835, + "grad_norm": 1.1955918073654175, + "learning_rate": 4.591404906299227e-05, + "loss": 5.1465, + "step": 31034 + }, + { + "epoch": 0.18457393662574936, + "grad_norm": 1.6647759675979614, + "learning_rate": 4.5913793148981286e-05, + "loss": 5.0915, + "step": 31035 + }, + { + "epoch": 0.18457988390903035, + "grad_norm": 1.7477984428405762, + "learning_rate": 4.5913537227669515e-05, + "loss": 4.7211, + "step": 31036 + }, + { + "epoch": 0.18458583119231134, + "grad_norm": 1.973645567893982, + "learning_rate": 4.5913281299057045e-05, + "loss": 5.6001, + "step": 31037 + }, + { + "epoch": 0.18459177847559236, + "grad_norm": 1.3707242012023926, + "learning_rate": 4.591302536314396e-05, + "loss": 5.3164, + "step": 31038 + }, + { + "epoch": 0.18459772575887334, + "grad_norm": 1.71157968044281, + "learning_rate": 4.591276941993036e-05, + "loss": 5.2662, + "step": 31039 + }, + { + "epoch": 0.18460367304215433, + "grad_norm": 1.3975422382354736, + "learning_rate": 4.5912513469416315e-05, + "loss": 4.753, + "step": 31040 + }, + { + "epoch": 0.18460962032543535, + "grad_norm": 2.232591152191162, + "learning_rate": 4.5912257511601944e-05, + "loss": 4.7482, + "step": 31041 + }, + { + "epoch": 0.18461556760871634, + "grad_norm": 1.5958985090255737, + "learning_rate": 4.591200154648731e-05, + "loss": 5.0753, + "step": 31042 + }, + { + "epoch": 0.18462151489199732, + "grad_norm": 1.4874860048294067, + "learning_rate": 4.591174557407252e-05, + "loss": 4.9495, + "step": 31043 + }, + { + "epoch": 0.18462746217527834, + "grad_norm": 1.8329155445098877, + "learning_rate": 4.591148959435765e-05, + "loss": 5.3401, + "step": 31044 + }, + { + "epoch": 0.18463340945855933, + "grad_norm": 1.6365807056427002, + "learning_rate": 4.591123360734279e-05, + "loss": 4.8406, + "step": 31045 + }, + { + "epoch": 0.18463935674184032, + "grad_norm": 1.5671586990356445, + "learning_rate": 4.591097761302804e-05, + "loss": 4.9428, + "step": 31046 + }, + { + "epoch": 0.18464530402512133, + "grad_norm": 1.838995099067688, + "learning_rate": 4.5910721611413486e-05, + "loss": 5.2237, + "step": 31047 + }, + { + "epoch": 0.18465125130840232, + "grad_norm": 1.947945475578308, + "learning_rate": 4.5910465602499216e-05, + "loss": 4.4973, + "step": 31048 + }, + { + "epoch": 0.1846571985916833, + "grad_norm": 2.2322356700897217, + "learning_rate": 4.591020958628531e-05, + "loss": 4.3619, + "step": 31049 + }, + { + "epoch": 0.18466314587496432, + "grad_norm": 2.417125940322876, + "learning_rate": 4.590995356277187e-05, + "loss": 4.2789, + "step": 31050 + }, + { + "epoch": 0.1846690931582453, + "grad_norm": 2.1802711486816406, + "learning_rate": 4.590969753195898e-05, + "loss": 4.0677, + "step": 31051 + }, + { + "epoch": 0.1846750404415263, + "grad_norm": 2.1682262420654297, + "learning_rate": 4.590944149384674e-05, + "loss": 3.8951, + "step": 31052 + }, + { + "epoch": 0.18468098772480732, + "grad_norm": 2.156933546066284, + "learning_rate": 4.5909185448435224e-05, + "loss": 4.1987, + "step": 31053 + }, + { + "epoch": 0.1846869350080883, + "grad_norm": 1.6152640581130981, + "learning_rate": 4.5908929395724526e-05, + "loss": 5.3105, + "step": 31054 + }, + { + "epoch": 0.1846928822913693, + "grad_norm": 2.371634006500244, + "learning_rate": 4.5908673335714735e-05, + "loss": 3.9825, + "step": 31055 + }, + { + "epoch": 0.1846988295746503, + "grad_norm": 2.6450035572052, + "learning_rate": 4.5908417268405946e-05, + "loss": 3.9852, + "step": 31056 + }, + { + "epoch": 0.1847047768579313, + "grad_norm": 2.498091459274292, + "learning_rate": 4.590816119379825e-05, + "loss": 4.1961, + "step": 31057 + }, + { + "epoch": 0.18471072414121228, + "grad_norm": 2.2890594005584717, + "learning_rate": 4.590790511189172e-05, + "loss": 4.2483, + "step": 31058 + }, + { + "epoch": 0.1847166714244933, + "grad_norm": 1.9878109693527222, + "learning_rate": 4.590764902268646e-05, + "loss": 4.2698, + "step": 31059 + }, + { + "epoch": 0.1847226187077743, + "grad_norm": 1.937960147857666, + "learning_rate": 4.590739292618256e-05, + "loss": 4.9191, + "step": 31060 + }, + { + "epoch": 0.18472856599105528, + "grad_norm": 2.69293212890625, + "learning_rate": 4.590713682238009e-05, + "loss": 4.3818, + "step": 31061 + }, + { + "epoch": 0.1847345132743363, + "grad_norm": 1.8170857429504395, + "learning_rate": 4.590688071127917e-05, + "loss": 4.8619, + "step": 31062 + }, + { + "epoch": 0.18474046055761728, + "grad_norm": 2.600891590118408, + "learning_rate": 4.590662459287987e-05, + "loss": 5.3781, + "step": 31063 + }, + { + "epoch": 0.18474640784089827, + "grad_norm": 1.780999779701233, + "learning_rate": 4.590636846718228e-05, + "loss": 5.1513, + "step": 31064 + }, + { + "epoch": 0.18475235512417928, + "grad_norm": 1.7034980058670044, + "learning_rate": 4.59061123341865e-05, + "loss": 5.2488, + "step": 31065 + }, + { + "epoch": 0.18475830240746027, + "grad_norm": 1.7700848579406738, + "learning_rate": 4.590585619389261e-05, + "loss": 4.9997, + "step": 31066 + }, + { + "epoch": 0.18476424969074126, + "grad_norm": 1.8884303569793701, + "learning_rate": 4.5905600046300694e-05, + "loss": 4.8398, + "step": 31067 + }, + { + "epoch": 0.18477019697402228, + "grad_norm": 1.7098636627197266, + "learning_rate": 4.590534389141086e-05, + "loss": 4.7416, + "step": 31068 + }, + { + "epoch": 0.18477614425730327, + "grad_norm": 1.5094579458236694, + "learning_rate": 4.590508772922318e-05, + "loss": 4.8252, + "step": 31069 + }, + { + "epoch": 0.18478209154058425, + "grad_norm": 1.5103203058242798, + "learning_rate": 4.590483155973775e-05, + "loss": 4.9903, + "step": 31070 + }, + { + "epoch": 0.18478803882386527, + "grad_norm": 1.6473743915557861, + "learning_rate": 4.590457538295466e-05, + "loss": 4.5804, + "step": 31071 + }, + { + "epoch": 0.18479398610714626, + "grad_norm": 2.5655574798583984, + "learning_rate": 4.5904319198874e-05, + "loss": 4.8887, + "step": 31072 + }, + { + "epoch": 0.18479993339042725, + "grad_norm": 2.067401647567749, + "learning_rate": 4.5904063007495854e-05, + "loss": 4.4422, + "step": 31073 + }, + { + "epoch": 0.18480588067370826, + "grad_norm": 1.9005351066589355, + "learning_rate": 4.590380680882032e-05, + "loss": 4.3074, + "step": 31074 + }, + { + "epoch": 0.18481182795698925, + "grad_norm": 1.704513669013977, + "learning_rate": 4.590355060284748e-05, + "loss": 4.6102, + "step": 31075 + }, + { + "epoch": 0.18481777524027024, + "grad_norm": 1.7560302019119263, + "learning_rate": 4.590329438957743e-05, + "loss": 4.6725, + "step": 31076 + }, + { + "epoch": 0.18482372252355125, + "grad_norm": 1.44539213180542, + "learning_rate": 4.5903038169010254e-05, + "loss": 4.8119, + "step": 31077 + }, + { + "epoch": 0.18482966980683224, + "grad_norm": 1.451361894607544, + "learning_rate": 4.5902781941146045e-05, + "loss": 5.1253, + "step": 31078 + }, + { + "epoch": 0.18483561709011323, + "grad_norm": 1.8367254734039307, + "learning_rate": 4.590252570598489e-05, + "loss": 5.2783, + "step": 31079 + }, + { + "epoch": 0.18484156437339425, + "grad_norm": 1.64362370967865, + "learning_rate": 4.590226946352688e-05, + "loss": 4.8848, + "step": 31080 + }, + { + "epoch": 0.18484751165667523, + "grad_norm": 1.3705360889434814, + "learning_rate": 4.590201321377209e-05, + "loss": 5.0658, + "step": 31081 + }, + { + "epoch": 0.18485345893995622, + "grad_norm": 1.3959661722183228, + "learning_rate": 4.5901756956720645e-05, + "loss": 5.2573, + "step": 31082 + }, + { + "epoch": 0.18485940622323724, + "grad_norm": 2.0968472957611084, + "learning_rate": 4.59015006923726e-05, + "loss": 4.5473, + "step": 31083 + }, + { + "epoch": 0.18486535350651823, + "grad_norm": 1.7659695148468018, + "learning_rate": 4.5901244420728055e-05, + "loss": 4.6719, + "step": 31084 + }, + { + "epoch": 0.18487130078979921, + "grad_norm": 2.2793681621551514, + "learning_rate": 4.59009881417871e-05, + "loss": 4.4078, + "step": 31085 + }, + { + "epoch": 0.18487724807308023, + "grad_norm": 1.7163949012756348, + "learning_rate": 4.590073185554984e-05, + "loss": 4.9258, + "step": 31086 + }, + { + "epoch": 0.18488319535636122, + "grad_norm": 2.0032429695129395, + "learning_rate": 4.5900475562016346e-05, + "loss": 5.2906, + "step": 31087 + }, + { + "epoch": 0.1848891426396422, + "grad_norm": 1.6730388402938843, + "learning_rate": 4.5900219261186706e-05, + "loss": 4.7542, + "step": 31088 + }, + { + "epoch": 0.18489508992292322, + "grad_norm": 2.3741564750671387, + "learning_rate": 4.5899962953061015e-05, + "loss": 4.3728, + "step": 31089 + }, + { + "epoch": 0.1849010372062042, + "grad_norm": 1.950449824333191, + "learning_rate": 4.589970663763937e-05, + "loss": 4.7416, + "step": 31090 + }, + { + "epoch": 0.1849069844894852, + "grad_norm": 1.9121187925338745, + "learning_rate": 4.589945031492185e-05, + "loss": 4.6129, + "step": 31091 + }, + { + "epoch": 0.1849129317727662, + "grad_norm": 2.0481319427490234, + "learning_rate": 4.589919398490854e-05, + "loss": 4.4413, + "step": 31092 + }, + { + "epoch": 0.1849188790560472, + "grad_norm": 2.135030508041382, + "learning_rate": 4.589893764759955e-05, + "loss": 4.3973, + "step": 31093 + }, + { + "epoch": 0.1849248263393282, + "grad_norm": 1.7354028224945068, + "learning_rate": 4.589868130299495e-05, + "loss": 4.8435, + "step": 31094 + }, + { + "epoch": 0.18493077362260918, + "grad_norm": 1.616546630859375, + "learning_rate": 4.5898424951094834e-05, + "loss": 4.889, + "step": 31095 + }, + { + "epoch": 0.1849367209058902, + "grad_norm": 1.718268871307373, + "learning_rate": 4.5898168591899305e-05, + "loss": 5.0764, + "step": 31096 + }, + { + "epoch": 0.18494266818917118, + "grad_norm": 1.5846326351165771, + "learning_rate": 4.589791222540842e-05, + "loss": 5.0193, + "step": 31097 + }, + { + "epoch": 0.18494861547245217, + "grad_norm": 1.6834520101547241, + "learning_rate": 4.589765585162231e-05, + "loss": 4.9781, + "step": 31098 + }, + { + "epoch": 0.1849545627557332, + "grad_norm": 2.0722103118896484, + "learning_rate": 4.5897399470541035e-05, + "loss": 4.499, + "step": 31099 + }, + { + "epoch": 0.18496051003901418, + "grad_norm": 2.0412447452545166, + "learning_rate": 4.589714308216469e-05, + "loss": 4.6282, + "step": 31100 + }, + { + "epoch": 0.18496645732229516, + "grad_norm": 1.5334446430206299, + "learning_rate": 4.589688668649338e-05, + "loss": 4.8552, + "step": 31101 + }, + { + "epoch": 0.18497240460557618, + "grad_norm": 1.6716012954711914, + "learning_rate": 4.589663028352718e-05, + "loss": 4.6907, + "step": 31102 + }, + { + "epoch": 0.18497835188885717, + "grad_norm": 1.5221296548843384, + "learning_rate": 4.589637387326618e-05, + "loss": 4.8665, + "step": 31103 + }, + { + "epoch": 0.18498429917213816, + "grad_norm": 1.4777991771697998, + "learning_rate": 4.589611745571047e-05, + "loss": 4.6618, + "step": 31104 + }, + { + "epoch": 0.18499024645541917, + "grad_norm": 1.5103845596313477, + "learning_rate": 4.589586103086014e-05, + "loss": 4.8099, + "step": 31105 + }, + { + "epoch": 0.18499619373870016, + "grad_norm": 1.7128773927688599, + "learning_rate": 4.5895604598715284e-05, + "loss": 4.5107, + "step": 31106 + }, + { + "epoch": 0.18500214102198115, + "grad_norm": 1.7347596883773804, + "learning_rate": 4.5895348159275986e-05, + "loss": 5.2684, + "step": 31107 + }, + { + "epoch": 0.18500808830526216, + "grad_norm": 1.5564923286437988, + "learning_rate": 4.5895091712542346e-05, + "loss": 4.8463, + "step": 31108 + }, + { + "epoch": 0.18501403558854315, + "grad_norm": 1.6237825155258179, + "learning_rate": 4.589483525851444e-05, + "loss": 4.7388, + "step": 31109 + }, + { + "epoch": 0.18501998287182414, + "grad_norm": 2.2543084621429443, + "learning_rate": 4.5894578797192355e-05, + "loss": 4.5901, + "step": 31110 + }, + { + "epoch": 0.18502593015510516, + "grad_norm": 2.227154016494751, + "learning_rate": 4.58943223285762e-05, + "loss": 4.1078, + "step": 31111 + }, + { + "epoch": 0.18503187743838614, + "grad_norm": 2.371508836746216, + "learning_rate": 4.5894065852666044e-05, + "loss": 3.3593, + "step": 31112 + }, + { + "epoch": 0.18503782472166713, + "grad_norm": 2.228203773498535, + "learning_rate": 4.589380936946199e-05, + "loss": 3.3639, + "step": 31113 + }, + { + "epoch": 0.18504377200494815, + "grad_norm": 1.926233172416687, + "learning_rate": 4.589355287896412e-05, + "loss": 3.5136, + "step": 31114 + }, + { + "epoch": 0.18504971928822914, + "grad_norm": 1.9671204090118408, + "learning_rate": 4.5893296381172537e-05, + "loss": 3.8437, + "step": 31115 + }, + { + "epoch": 0.18505566657151012, + "grad_norm": 1.9354090690612793, + "learning_rate": 4.5893039876087305e-05, + "loss": 3.5759, + "step": 31116 + }, + { + "epoch": 0.18506161385479114, + "grad_norm": 1.5726033449172974, + "learning_rate": 4.589278336370854e-05, + "loss": 4.5015, + "step": 31117 + }, + { + "epoch": 0.18506756113807213, + "grad_norm": 1.7983962297439575, + "learning_rate": 4.5892526844036307e-05, + "loss": 4.7782, + "step": 31118 + }, + { + "epoch": 0.18507350842135312, + "grad_norm": 2.0265634059906006, + "learning_rate": 4.589227031707072e-05, + "loss": 3.7139, + "step": 31119 + }, + { + "epoch": 0.18507945570463413, + "grad_norm": 2.074643135070801, + "learning_rate": 4.589201378281186e-05, + "loss": 3.7737, + "step": 31120 + }, + { + "epoch": 0.18508540298791512, + "grad_norm": 1.91277277469635, + "learning_rate": 4.58917572412598e-05, + "loss": 4.5868, + "step": 31121 + }, + { + "epoch": 0.1850913502711961, + "grad_norm": 1.944737195968628, + "learning_rate": 4.589150069241466e-05, + "loss": 4.1658, + "step": 31122 + }, + { + "epoch": 0.18509729755447712, + "grad_norm": 2.1314923763275146, + "learning_rate": 4.58912441362765e-05, + "loss": 3.8389, + "step": 31123 + }, + { + "epoch": 0.1851032448377581, + "grad_norm": 1.9352933168411255, + "learning_rate": 4.589098757284543e-05, + "loss": 4.6027, + "step": 31124 + }, + { + "epoch": 0.1851091921210391, + "grad_norm": 1.8150495290756226, + "learning_rate": 4.589073100212153e-05, + "loss": 4.6796, + "step": 31125 + }, + { + "epoch": 0.18511513940432012, + "grad_norm": 1.7410165071487427, + "learning_rate": 4.589047442410489e-05, + "loss": 4.8486, + "step": 31126 + }, + { + "epoch": 0.1851210866876011, + "grad_norm": 2.197824716567993, + "learning_rate": 4.58902178387956e-05, + "loss": 4.6977, + "step": 31127 + }, + { + "epoch": 0.1851270339708821, + "grad_norm": 1.9299874305725098, + "learning_rate": 4.588996124619376e-05, + "loss": 4.4558, + "step": 31128 + }, + { + "epoch": 0.1851329812541631, + "grad_norm": 1.6607778072357178, + "learning_rate": 4.5889704646299433e-05, + "loss": 4.562, + "step": 31129 + }, + { + "epoch": 0.1851389285374441, + "grad_norm": 1.7494784593582153, + "learning_rate": 4.588944803911274e-05, + "loss": 4.9075, + "step": 31130 + }, + { + "epoch": 0.18514487582072509, + "grad_norm": 1.6292402744293213, + "learning_rate": 4.588919142463376e-05, + "loss": 5.0776, + "step": 31131 + }, + { + "epoch": 0.1851508231040061, + "grad_norm": 1.9825034141540527, + "learning_rate": 4.588893480286257e-05, + "loss": 4.3945, + "step": 31132 + }, + { + "epoch": 0.1851567703872871, + "grad_norm": 1.7921351194381714, + "learning_rate": 4.588867817379927e-05, + "loss": 4.8753, + "step": 31133 + }, + { + "epoch": 0.18516271767056808, + "grad_norm": 1.920423984527588, + "learning_rate": 4.588842153744395e-05, + "loss": 4.3311, + "step": 31134 + }, + { + "epoch": 0.1851686649538491, + "grad_norm": 2.3672003746032715, + "learning_rate": 4.5888164893796695e-05, + "loss": 4.1878, + "step": 31135 + }, + { + "epoch": 0.18517461223713008, + "grad_norm": 1.6721351146697998, + "learning_rate": 4.5887908242857594e-05, + "loss": 5.3044, + "step": 31136 + }, + { + "epoch": 0.18518055952041107, + "grad_norm": 2.2272095680236816, + "learning_rate": 4.5887651584626745e-05, + "loss": 4.1318, + "step": 31137 + }, + { + "epoch": 0.18518650680369209, + "grad_norm": 2.2360355854034424, + "learning_rate": 4.588739491910424e-05, + "loss": 4.0698, + "step": 31138 + }, + { + "epoch": 0.18519245408697307, + "grad_norm": 1.863351583480835, + "learning_rate": 4.588713824629015e-05, + "loss": 4.3417, + "step": 31139 + }, + { + "epoch": 0.18519840137025406, + "grad_norm": 1.5672686100006104, + "learning_rate": 4.588688156618458e-05, + "loss": 4.9516, + "step": 31140 + }, + { + "epoch": 0.18520434865353508, + "grad_norm": 1.7040348052978516, + "learning_rate": 4.5886624878787624e-05, + "loss": 4.8062, + "step": 31141 + }, + { + "epoch": 0.18521029593681607, + "grad_norm": 1.4039283990859985, + "learning_rate": 4.5886368184099346e-05, + "loss": 5.0631, + "step": 31142 + }, + { + "epoch": 0.18521624322009705, + "grad_norm": 1.474048376083374, + "learning_rate": 4.588611148211986e-05, + "loss": 4.9985, + "step": 31143 + }, + { + "epoch": 0.18522219050337807, + "grad_norm": 1.634386420249939, + "learning_rate": 4.5885854772849254e-05, + "loss": 4.9347, + "step": 31144 + }, + { + "epoch": 0.18522813778665906, + "grad_norm": 1.8768565654754639, + "learning_rate": 4.5885598056287604e-05, + "loss": 4.9722, + "step": 31145 + }, + { + "epoch": 0.18523408506994005, + "grad_norm": 2.405940532684326, + "learning_rate": 4.588534133243501e-05, + "loss": 4.2056, + "step": 31146 + }, + { + "epoch": 0.18524003235322106, + "grad_norm": 1.7994506359100342, + "learning_rate": 4.588508460129156e-05, + "loss": 4.8714, + "step": 31147 + }, + { + "epoch": 0.18524597963650205, + "grad_norm": 1.5454603433609009, + "learning_rate": 4.5884827862857344e-05, + "loss": 5.1527, + "step": 31148 + }, + { + "epoch": 0.18525192691978304, + "grad_norm": 1.4534333944320679, + "learning_rate": 4.5884571117132444e-05, + "loss": 5.0984, + "step": 31149 + }, + { + "epoch": 0.18525787420306403, + "grad_norm": 1.6229337453842163, + "learning_rate": 4.588431436411696e-05, + "loss": 4.3216, + "step": 31150 + }, + { + "epoch": 0.18526382148634504, + "grad_norm": 1.905275583267212, + "learning_rate": 4.588405760381098e-05, + "loss": 4.3979, + "step": 31151 + }, + { + "epoch": 0.18526976876962603, + "grad_norm": 1.85005521774292, + "learning_rate": 4.58838008362146e-05, + "loss": 5.1768, + "step": 31152 + }, + { + "epoch": 0.18527571605290702, + "grad_norm": 1.5412628650665283, + "learning_rate": 4.5883544061327885e-05, + "loss": 5.0367, + "step": 31153 + }, + { + "epoch": 0.18528166333618803, + "grad_norm": 1.4088354110717773, + "learning_rate": 4.588328727915094e-05, + "loss": 5.0446, + "step": 31154 + }, + { + "epoch": 0.18528761061946902, + "grad_norm": 1.4099864959716797, + "learning_rate": 4.5883030489683865e-05, + "loss": 4.8956, + "step": 31155 + }, + { + "epoch": 0.18529355790275, + "grad_norm": 1.5859589576721191, + "learning_rate": 4.588277369292674e-05, + "loss": 5.1765, + "step": 31156 + }, + { + "epoch": 0.18529950518603103, + "grad_norm": 1.9431182146072388, + "learning_rate": 4.588251688887965e-05, + "loss": 4.2741, + "step": 31157 + }, + { + "epoch": 0.18530545246931202, + "grad_norm": 2.457024335861206, + "learning_rate": 4.5882260077542685e-05, + "loss": 4.4239, + "step": 31158 + }, + { + "epoch": 0.185311399752593, + "grad_norm": 2.1999270915985107, + "learning_rate": 4.588200325891595e-05, + "loss": 4.8285, + "step": 31159 + }, + { + "epoch": 0.18531734703587402, + "grad_norm": 2.221158504486084, + "learning_rate": 4.588174643299952e-05, + "loss": 4.6192, + "step": 31160 + }, + { + "epoch": 0.185323294319155, + "grad_norm": 2.167083501815796, + "learning_rate": 4.5881489599793484e-05, + "loss": 4.6133, + "step": 31161 + }, + { + "epoch": 0.185329241602436, + "grad_norm": 2.050466775894165, + "learning_rate": 4.588123275929793e-05, + "loss": 4.3539, + "step": 31162 + }, + { + "epoch": 0.185335188885717, + "grad_norm": 1.8196213245391846, + "learning_rate": 4.588097591151296e-05, + "loss": 4.8754, + "step": 31163 + }, + { + "epoch": 0.185341136168998, + "grad_norm": 2.024564504623413, + "learning_rate": 4.588071905643866e-05, + "loss": 4.8754, + "step": 31164 + }, + { + "epoch": 0.185347083452279, + "grad_norm": 2.1911628246307373, + "learning_rate": 4.5880462194075114e-05, + "loss": 4.7296, + "step": 31165 + }, + { + "epoch": 0.18535303073556, + "grad_norm": 2.6719770431518555, + "learning_rate": 4.588020532442241e-05, + "loss": 4.6449, + "step": 31166 + }, + { + "epoch": 0.185358978018841, + "grad_norm": 1.9585731029510498, + "learning_rate": 4.587994844748065e-05, + "loss": 4.5527, + "step": 31167 + }, + { + "epoch": 0.18536492530212198, + "grad_norm": 1.9000869989395142, + "learning_rate": 4.587969156324991e-05, + "loss": 4.7364, + "step": 31168 + }, + { + "epoch": 0.185370872585403, + "grad_norm": 2.080929756164551, + "learning_rate": 4.587943467173029e-05, + "loss": 4.797, + "step": 31169 + }, + { + "epoch": 0.18537681986868398, + "grad_norm": 2.2646484375, + "learning_rate": 4.5879177772921864e-05, + "loss": 4.6755, + "step": 31170 + }, + { + "epoch": 0.18538276715196497, + "grad_norm": 2.2647855281829834, + "learning_rate": 4.5878920866824746e-05, + "loss": 4.7376, + "step": 31171 + }, + { + "epoch": 0.185388714435246, + "grad_norm": 2.094724655151367, + "learning_rate": 4.5878663953439005e-05, + "loss": 4.4832, + "step": 31172 + }, + { + "epoch": 0.18539466171852698, + "grad_norm": 2.17482852935791, + "learning_rate": 4.587840703276474e-05, + "loss": 4.5812, + "step": 31173 + }, + { + "epoch": 0.18540060900180796, + "grad_norm": 2.3196496963500977, + "learning_rate": 4.5878150104802045e-05, + "loss": 4.5377, + "step": 31174 + }, + { + "epoch": 0.18540655628508898, + "grad_norm": 1.9016317129135132, + "learning_rate": 4.5877893169550996e-05, + "loss": 4.4074, + "step": 31175 + }, + { + "epoch": 0.18541250356836997, + "grad_norm": 2.1237874031066895, + "learning_rate": 4.587763622701169e-05, + "loss": 4.6557, + "step": 31176 + }, + { + "epoch": 0.18541845085165096, + "grad_norm": 1.9775478839874268, + "learning_rate": 4.587737927718422e-05, + "loss": 4.7775, + "step": 31177 + }, + { + "epoch": 0.18542439813493197, + "grad_norm": 1.7758903503417969, + "learning_rate": 4.587712232006868e-05, + "loss": 5.1214, + "step": 31178 + }, + { + "epoch": 0.18543034541821296, + "grad_norm": 2.0964064598083496, + "learning_rate": 4.5876865355665135e-05, + "loss": 4.7776, + "step": 31179 + }, + { + "epoch": 0.18543629270149395, + "grad_norm": 2.201028347015381, + "learning_rate": 4.58766083839737e-05, + "loss": 4.4993, + "step": 31180 + }, + { + "epoch": 0.18544223998477496, + "grad_norm": 1.6263900995254517, + "learning_rate": 4.587635140499446e-05, + "loss": 4.8201, + "step": 31181 + }, + { + "epoch": 0.18544818726805595, + "grad_norm": 1.5977891683578491, + "learning_rate": 4.58760944187275e-05, + "loss": 4.8996, + "step": 31182 + }, + { + "epoch": 0.18545413455133694, + "grad_norm": 1.4332998991012573, + "learning_rate": 4.5875837425172904e-05, + "loss": 4.9172, + "step": 31183 + }, + { + "epoch": 0.18546008183461796, + "grad_norm": 1.5299646854400635, + "learning_rate": 4.5875580424330774e-05, + "loss": 4.7755, + "step": 31184 + }, + { + "epoch": 0.18546602911789895, + "grad_norm": 1.75115966796875, + "learning_rate": 4.58753234162012e-05, + "loss": 5.1949, + "step": 31185 + }, + { + "epoch": 0.18547197640117993, + "grad_norm": 1.7606922388076782, + "learning_rate": 4.587506640078426e-05, + "loss": 4.5256, + "step": 31186 + }, + { + "epoch": 0.18547792368446095, + "grad_norm": 1.8649322986602783, + "learning_rate": 4.5874809378080055e-05, + "loss": 4.4196, + "step": 31187 + }, + { + "epoch": 0.18548387096774194, + "grad_norm": 1.56643807888031, + "learning_rate": 4.587455234808867e-05, + "loss": 4.7597, + "step": 31188 + }, + { + "epoch": 0.18548981825102293, + "grad_norm": 1.4705426692962646, + "learning_rate": 4.587429531081019e-05, + "loss": 4.7919, + "step": 31189 + }, + { + "epoch": 0.18549576553430394, + "grad_norm": 1.472716212272644, + "learning_rate": 4.587403826624471e-05, + "loss": 4.89, + "step": 31190 + }, + { + "epoch": 0.18550171281758493, + "grad_norm": 1.6982768774032593, + "learning_rate": 4.5873781214392315e-05, + "loss": 5.0726, + "step": 31191 + }, + { + "epoch": 0.18550766010086592, + "grad_norm": 1.543344497680664, + "learning_rate": 4.58735241552531e-05, + "loss": 4.9335, + "step": 31192 + }, + { + "epoch": 0.18551360738414693, + "grad_norm": 1.5692951679229736, + "learning_rate": 4.587326708882716e-05, + "loss": 5.0962, + "step": 31193 + }, + { + "epoch": 0.18551955466742792, + "grad_norm": 1.42678701877594, + "learning_rate": 4.587301001511457e-05, + "loss": 4.9869, + "step": 31194 + }, + { + "epoch": 0.1855255019507089, + "grad_norm": 1.5326842069625854, + "learning_rate": 4.5872752934115437e-05, + "loss": 4.9169, + "step": 31195 + }, + { + "epoch": 0.18553144923398993, + "grad_norm": 1.6546173095703125, + "learning_rate": 4.587249584582983e-05, + "loss": 4.8579, + "step": 31196 + }, + { + "epoch": 0.1855373965172709, + "grad_norm": 1.5519356727600098, + "learning_rate": 4.587223875025786e-05, + "loss": 4.8597, + "step": 31197 + }, + { + "epoch": 0.1855433438005519, + "grad_norm": 1.6477192640304565, + "learning_rate": 4.5871981647399606e-05, + "loss": 4.9838, + "step": 31198 + }, + { + "epoch": 0.18554929108383292, + "grad_norm": 1.55699622631073, + "learning_rate": 4.587172453725516e-05, + "loss": 4.6201, + "step": 31199 + }, + { + "epoch": 0.1855552383671139, + "grad_norm": 1.8529999256134033, + "learning_rate": 4.58714674198246e-05, + "loss": 4.8909, + "step": 31200 + }, + { + "epoch": 0.1855611856503949, + "grad_norm": 1.6725835800170898, + "learning_rate": 4.587121029510804e-05, + "loss": 4.9577, + "step": 31201 + }, + { + "epoch": 0.1855671329336759, + "grad_norm": 1.6824015378952026, + "learning_rate": 4.5870953163105545e-05, + "loss": 5.1628, + "step": 31202 + }, + { + "epoch": 0.1855730802169569, + "grad_norm": 1.4448360204696655, + "learning_rate": 4.587069602381722e-05, + "loss": 5.0237, + "step": 31203 + }, + { + "epoch": 0.1855790275002379, + "grad_norm": 1.6337754726409912, + "learning_rate": 4.5870438877243154e-05, + "loss": 4.9626, + "step": 31204 + }, + { + "epoch": 0.1855849747835189, + "grad_norm": 1.80125093460083, + "learning_rate": 4.587018172338343e-05, + "loss": 5.3551, + "step": 31205 + }, + { + "epoch": 0.1855909220667999, + "grad_norm": 1.9253333806991577, + "learning_rate": 4.586992456223814e-05, + "loss": 5.2549, + "step": 31206 + }, + { + "epoch": 0.18559686935008088, + "grad_norm": 1.6633672714233398, + "learning_rate": 4.586966739380738e-05, + "loss": 4.8286, + "step": 31207 + }, + { + "epoch": 0.18560281663336187, + "grad_norm": 1.5835380554199219, + "learning_rate": 4.586941021809124e-05, + "loss": 5.2259, + "step": 31208 + }, + { + "epoch": 0.18560876391664288, + "grad_norm": 1.4492098093032837, + "learning_rate": 4.586915303508979e-05, + "loss": 4.9461, + "step": 31209 + }, + { + "epoch": 0.18561471119992387, + "grad_norm": 1.6649349927902222, + "learning_rate": 4.586889584480314e-05, + "loss": 5.0827, + "step": 31210 + }, + { + "epoch": 0.18562065848320486, + "grad_norm": 1.904850959777832, + "learning_rate": 4.5868638647231374e-05, + "loss": 5.1674, + "step": 31211 + }, + { + "epoch": 0.18562660576648587, + "grad_norm": 1.5173715353012085, + "learning_rate": 4.586838144237458e-05, + "loss": 5.1684, + "step": 31212 + }, + { + "epoch": 0.18563255304976686, + "grad_norm": 1.5624539852142334, + "learning_rate": 4.586812423023285e-05, + "loss": 5.1, + "step": 31213 + }, + { + "epoch": 0.18563850033304785, + "grad_norm": 1.4277743101119995, + "learning_rate": 4.5867867010806275e-05, + "loss": 5.1583, + "step": 31214 + }, + { + "epoch": 0.18564444761632887, + "grad_norm": 1.5859686136245728, + "learning_rate": 4.586760978409494e-05, + "loss": 5.1848, + "step": 31215 + }, + { + "epoch": 0.18565039489960986, + "grad_norm": 1.5678226947784424, + "learning_rate": 4.586735255009895e-05, + "loss": 4.919, + "step": 31216 + }, + { + "epoch": 0.18565634218289084, + "grad_norm": 1.8046095371246338, + "learning_rate": 4.586709530881837e-05, + "loss": 4.741, + "step": 31217 + }, + { + "epoch": 0.18566228946617186, + "grad_norm": 1.8384325504302979, + "learning_rate": 4.586683806025331e-05, + "loss": 4.6733, + "step": 31218 + }, + { + "epoch": 0.18566823674945285, + "grad_norm": 1.7575494050979614, + "learning_rate": 4.586658080440385e-05, + "loss": 4.5463, + "step": 31219 + }, + { + "epoch": 0.18567418403273384, + "grad_norm": 1.5560238361358643, + "learning_rate": 4.586632354127009e-05, + "loss": 4.8716, + "step": 31220 + }, + { + "epoch": 0.18568013131601485, + "grad_norm": 1.6690888404846191, + "learning_rate": 4.586606627085209e-05, + "loss": 5.1506, + "step": 31221 + }, + { + "epoch": 0.18568607859929584, + "grad_norm": 1.8201206922531128, + "learning_rate": 4.5865808993149985e-05, + "loss": 4.9691, + "step": 31222 + }, + { + "epoch": 0.18569202588257683, + "grad_norm": 1.4912810325622559, + "learning_rate": 4.586555170816383e-05, + "loss": 4.707, + "step": 31223 + }, + { + "epoch": 0.18569797316585784, + "grad_norm": 1.8407074213027954, + "learning_rate": 4.5865294415893726e-05, + "loss": 5.3221, + "step": 31224 + }, + { + "epoch": 0.18570392044913883, + "grad_norm": 1.699225664138794, + "learning_rate": 4.586503711633976e-05, + "loss": 4.8717, + "step": 31225 + }, + { + "epoch": 0.18570986773241982, + "grad_norm": 1.76210618019104, + "learning_rate": 4.586477980950203e-05, + "loss": 4.7315, + "step": 31226 + }, + { + "epoch": 0.18571581501570084, + "grad_norm": 1.6527961492538452, + "learning_rate": 4.586452249538063e-05, + "loss": 4.9227, + "step": 31227 + }, + { + "epoch": 0.18572176229898182, + "grad_norm": 1.576004981994629, + "learning_rate": 4.586426517397563e-05, + "loss": 4.8288, + "step": 31228 + }, + { + "epoch": 0.1857277095822628, + "grad_norm": 1.8966927528381348, + "learning_rate": 4.586400784528714e-05, + "loss": 4.6546, + "step": 31229 + }, + { + "epoch": 0.18573365686554383, + "grad_norm": 1.7888445854187012, + "learning_rate": 4.586375050931523e-05, + "loss": 5.3152, + "step": 31230 + }, + { + "epoch": 0.18573960414882482, + "grad_norm": 1.656417965888977, + "learning_rate": 4.586349316606e-05, + "loss": 4.8675, + "step": 31231 + }, + { + "epoch": 0.1857455514321058, + "grad_norm": 1.5206754207611084, + "learning_rate": 4.586323581552155e-05, + "loss": 4.8806, + "step": 31232 + }, + { + "epoch": 0.18575149871538682, + "grad_norm": 1.7659846544265747, + "learning_rate": 4.586297845769995e-05, + "loss": 4.7916, + "step": 31233 + }, + { + "epoch": 0.1857574459986678, + "grad_norm": 1.6599786281585693, + "learning_rate": 4.5862721092595305e-05, + "loss": 4.9745, + "step": 31234 + }, + { + "epoch": 0.1857633932819488, + "grad_norm": 1.5032174587249756, + "learning_rate": 4.5862463720207696e-05, + "loss": 5.0588, + "step": 31235 + }, + { + "epoch": 0.1857693405652298, + "grad_norm": 1.4480514526367188, + "learning_rate": 4.5862206340537215e-05, + "loss": 5.3229, + "step": 31236 + }, + { + "epoch": 0.1857752878485108, + "grad_norm": 2.4662117958068848, + "learning_rate": 4.5861948953583966e-05, + "loss": 4.3971, + "step": 31237 + }, + { + "epoch": 0.1857812351317918, + "grad_norm": 1.864040493965149, + "learning_rate": 4.5861691559348016e-05, + "loss": 4.5105, + "step": 31238 + }, + { + "epoch": 0.1857871824150728, + "grad_norm": 1.6042394638061523, + "learning_rate": 4.5861434157829466e-05, + "loss": 4.9541, + "step": 31239 + }, + { + "epoch": 0.1857931296983538, + "grad_norm": 1.6892380714416504, + "learning_rate": 4.58611767490284e-05, + "loss": 4.8085, + "step": 31240 + }, + { + "epoch": 0.18579907698163478, + "grad_norm": 1.625783920288086, + "learning_rate": 4.586091933294492e-05, + "loss": 4.5476, + "step": 31241 + }, + { + "epoch": 0.1858050242649158, + "grad_norm": 1.3952319622039795, + "learning_rate": 4.5860661909579106e-05, + "loss": 4.493, + "step": 31242 + }, + { + "epoch": 0.18581097154819678, + "grad_norm": 1.6816487312316895, + "learning_rate": 4.586040447893105e-05, + "loss": 4.562, + "step": 31243 + }, + { + "epoch": 0.18581691883147777, + "grad_norm": 1.593903660774231, + "learning_rate": 4.586014704100085e-05, + "loss": 4.4116, + "step": 31244 + }, + { + "epoch": 0.1858228661147588, + "grad_norm": 1.8696283102035522, + "learning_rate": 4.5859889595788577e-05, + "loss": 5.0991, + "step": 31245 + }, + { + "epoch": 0.18582881339803978, + "grad_norm": 1.7519524097442627, + "learning_rate": 4.585963214329434e-05, + "loss": 5.0826, + "step": 31246 + }, + { + "epoch": 0.18583476068132077, + "grad_norm": 1.9816068410873413, + "learning_rate": 4.5859374683518216e-05, + "loss": 5.0646, + "step": 31247 + }, + { + "epoch": 0.18584070796460178, + "grad_norm": 2.6658241748809814, + "learning_rate": 4.58591172164603e-05, + "loss": 4.0542, + "step": 31248 + }, + { + "epoch": 0.18584665524788277, + "grad_norm": 1.6509416103363037, + "learning_rate": 4.585885974212068e-05, + "loss": 5.1681, + "step": 31249 + }, + { + "epoch": 0.18585260253116376, + "grad_norm": 1.7301459312438965, + "learning_rate": 4.585860226049945e-05, + "loss": 4.6656, + "step": 31250 + }, + { + "epoch": 0.18585854981444477, + "grad_norm": 1.643879771232605, + "learning_rate": 4.58583447715967e-05, + "loss": 5.1228, + "step": 31251 + }, + { + "epoch": 0.18586449709772576, + "grad_norm": 1.7651242017745972, + "learning_rate": 4.585808727541252e-05, + "loss": 4.5896, + "step": 31252 + }, + { + "epoch": 0.18587044438100675, + "grad_norm": 1.7697621583938599, + "learning_rate": 4.585782977194699e-05, + "loss": 4.6722, + "step": 31253 + }, + { + "epoch": 0.18587639166428777, + "grad_norm": 1.751751184463501, + "learning_rate": 4.585757226120021e-05, + "loss": 5.1881, + "step": 31254 + }, + { + "epoch": 0.18588233894756875, + "grad_norm": 1.4857293367385864, + "learning_rate": 4.5857314743172265e-05, + "loss": 5.3738, + "step": 31255 + }, + { + "epoch": 0.18588828623084974, + "grad_norm": 1.6109179258346558, + "learning_rate": 4.5857057217863245e-05, + "loss": 5.0916, + "step": 31256 + }, + { + "epoch": 0.18589423351413076, + "grad_norm": 1.9718564748764038, + "learning_rate": 4.5856799685273244e-05, + "loss": 4.8674, + "step": 31257 + }, + { + "epoch": 0.18590018079741175, + "grad_norm": 2.1158852577209473, + "learning_rate": 4.585654214540235e-05, + "loss": 5.0665, + "step": 31258 + }, + { + "epoch": 0.18590612808069273, + "grad_norm": 1.5725470781326294, + "learning_rate": 4.5856284598250656e-05, + "loss": 5.3185, + "step": 31259 + }, + { + "epoch": 0.18591207536397375, + "grad_norm": 2.0950751304626465, + "learning_rate": 4.585602704381825e-05, + "loss": 4.3702, + "step": 31260 + }, + { + "epoch": 0.18591802264725474, + "grad_norm": 1.6896560192108154, + "learning_rate": 4.585576948210522e-05, + "loss": 4.3898, + "step": 31261 + }, + { + "epoch": 0.18592396993053573, + "grad_norm": 1.580668568611145, + "learning_rate": 4.5855511913111646e-05, + "loss": 4.9532, + "step": 31262 + }, + { + "epoch": 0.18592991721381674, + "grad_norm": 1.6650530099868774, + "learning_rate": 4.5855254336837636e-05, + "loss": 4.8532, + "step": 31263 + }, + { + "epoch": 0.18593586449709773, + "grad_norm": 2.025588274002075, + "learning_rate": 4.5854996753283276e-05, + "loss": 5.184, + "step": 31264 + }, + { + "epoch": 0.18594181178037872, + "grad_norm": 2.844881534576416, + "learning_rate": 4.585473916244865e-05, + "loss": 3.9266, + "step": 31265 + }, + { + "epoch": 0.1859477590636597, + "grad_norm": 2.53692626953125, + "learning_rate": 4.585448156433384e-05, + "loss": 4.2536, + "step": 31266 + }, + { + "epoch": 0.18595370634694072, + "grad_norm": 1.666998028755188, + "learning_rate": 4.585422395893896e-05, + "loss": 4.8051, + "step": 31267 + }, + { + "epoch": 0.1859596536302217, + "grad_norm": 1.4353892803192139, + "learning_rate": 4.585396634626408e-05, + "loss": 4.6797, + "step": 31268 + }, + { + "epoch": 0.1859656009135027, + "grad_norm": 1.8029720783233643, + "learning_rate": 4.5853708726309285e-05, + "loss": 4.3908, + "step": 31269 + }, + { + "epoch": 0.18597154819678371, + "grad_norm": 1.8102213144302368, + "learning_rate": 4.585345109907469e-05, + "loss": 4.6718, + "step": 31270 + }, + { + "epoch": 0.1859774954800647, + "grad_norm": 1.8388559818267822, + "learning_rate": 4.585319346456036e-05, + "loss": 4.8911, + "step": 31271 + }, + { + "epoch": 0.1859834427633457, + "grad_norm": 1.69950270652771, + "learning_rate": 4.585293582276641e-05, + "loss": 5.6486, + "step": 31272 + }, + { + "epoch": 0.1859893900466267, + "grad_norm": 1.5373098850250244, + "learning_rate": 4.585267817369291e-05, + "loss": 4.9994, + "step": 31273 + }, + { + "epoch": 0.1859953373299077, + "grad_norm": 1.6866233348846436, + "learning_rate": 4.5852420517339945e-05, + "loss": 4.9179, + "step": 31274 + }, + { + "epoch": 0.18600128461318868, + "grad_norm": 1.7572931051254272, + "learning_rate": 4.5852162853707625e-05, + "loss": 5.2829, + "step": 31275 + }, + { + "epoch": 0.1860072318964697, + "grad_norm": 1.6226259469985962, + "learning_rate": 4.5851905182796026e-05, + "loss": 5.0821, + "step": 31276 + }, + { + "epoch": 0.1860131791797507, + "grad_norm": 1.8315626382827759, + "learning_rate": 4.585164750460525e-05, + "loss": 4.9505, + "step": 31277 + }, + { + "epoch": 0.18601912646303168, + "grad_norm": 1.7805089950561523, + "learning_rate": 4.585138981913537e-05, + "loss": 5.2987, + "step": 31278 + }, + { + "epoch": 0.1860250737463127, + "grad_norm": 1.6392905712127686, + "learning_rate": 4.58511321263865e-05, + "loss": 4.5575, + "step": 31279 + }, + { + "epoch": 0.18603102102959368, + "grad_norm": 2.2516140937805176, + "learning_rate": 4.5850874426358703e-05, + "loss": 4.4565, + "step": 31280 + }, + { + "epoch": 0.18603696831287467, + "grad_norm": 2.3298892974853516, + "learning_rate": 4.585061671905208e-05, + "loss": 4.0626, + "step": 31281 + }, + { + "epoch": 0.18604291559615568, + "grad_norm": 1.3898862600326538, + "learning_rate": 4.585035900446673e-05, + "loss": 5.026, + "step": 31282 + }, + { + "epoch": 0.18604886287943667, + "grad_norm": 1.517095923423767, + "learning_rate": 4.5850101282602723e-05, + "loss": 5.0808, + "step": 31283 + }, + { + "epoch": 0.18605481016271766, + "grad_norm": 1.505425214767456, + "learning_rate": 4.584984355346017e-05, + "loss": 5.0229, + "step": 31284 + }, + { + "epoch": 0.18606075744599868, + "grad_norm": 2.051816701889038, + "learning_rate": 4.584958581703915e-05, + "loss": 4.5713, + "step": 31285 + }, + { + "epoch": 0.18606670472927966, + "grad_norm": 1.7285747528076172, + "learning_rate": 4.5849328073339756e-05, + "loss": 5.0913, + "step": 31286 + }, + { + "epoch": 0.18607265201256065, + "grad_norm": 1.5341402292251587, + "learning_rate": 4.584907032236208e-05, + "loss": 5.0847, + "step": 31287 + }, + { + "epoch": 0.18607859929584167, + "grad_norm": 1.6782925128936768, + "learning_rate": 4.58488125641062e-05, + "loss": 5.1384, + "step": 31288 + }, + { + "epoch": 0.18608454657912266, + "grad_norm": 1.3116849660873413, + "learning_rate": 4.584855479857222e-05, + "loss": 5.1293, + "step": 31289 + }, + { + "epoch": 0.18609049386240364, + "grad_norm": 1.7799185514450073, + "learning_rate": 4.584829702576022e-05, + "loss": 4.9044, + "step": 31290 + }, + { + "epoch": 0.18609644114568466, + "grad_norm": 2.572935104370117, + "learning_rate": 4.5848039245670304e-05, + "loss": 4.1723, + "step": 31291 + }, + { + "epoch": 0.18610238842896565, + "grad_norm": 1.9144190549850464, + "learning_rate": 4.584778145830255e-05, + "loss": 4.5369, + "step": 31292 + }, + { + "epoch": 0.18610833571224664, + "grad_norm": 2.1058690547943115, + "learning_rate": 4.584752366365706e-05, + "loss": 4.2558, + "step": 31293 + }, + { + "epoch": 0.18611428299552765, + "grad_norm": 2.1572718620300293, + "learning_rate": 4.58472658617339e-05, + "loss": 3.7545, + "step": 31294 + }, + { + "epoch": 0.18612023027880864, + "grad_norm": 2.5771355628967285, + "learning_rate": 4.584700805253317e-05, + "loss": 4.04, + "step": 31295 + }, + { + "epoch": 0.18612617756208963, + "grad_norm": 1.5100488662719727, + "learning_rate": 4.584675023605498e-05, + "loss": 4.5827, + "step": 31296 + }, + { + "epoch": 0.18613212484537064, + "grad_norm": 2.33481764793396, + "learning_rate": 4.58464924122994e-05, + "loss": 4.2408, + "step": 31297 + }, + { + "epoch": 0.18613807212865163, + "grad_norm": 2.062885284423828, + "learning_rate": 4.584623458126652e-05, + "loss": 4.196, + "step": 31298 + }, + { + "epoch": 0.18614401941193262, + "grad_norm": 2.3652517795562744, + "learning_rate": 4.584597674295644e-05, + "loss": 3.8771, + "step": 31299 + }, + { + "epoch": 0.18614996669521364, + "grad_norm": 2.5485894680023193, + "learning_rate": 4.584571889736925e-05, + "loss": 4.203, + "step": 31300 + }, + { + "epoch": 0.18615591397849462, + "grad_norm": 2.2650632858276367, + "learning_rate": 4.584546104450502e-05, + "loss": 4.1996, + "step": 31301 + }, + { + "epoch": 0.1861618612617756, + "grad_norm": 2.359175205230713, + "learning_rate": 4.584520318436387e-05, + "loss": 4.029, + "step": 31302 + }, + { + "epoch": 0.18616780854505663, + "grad_norm": 1.8140226602554321, + "learning_rate": 4.5844945316945867e-05, + "loss": 5.2865, + "step": 31303 + }, + { + "epoch": 0.18617375582833762, + "grad_norm": 1.4269286394119263, + "learning_rate": 4.584468744225111e-05, + "loss": 5.4931, + "step": 31304 + }, + { + "epoch": 0.1861797031116186, + "grad_norm": 1.6179120540618896, + "learning_rate": 4.584442956027969e-05, + "loss": 5.9247, + "step": 31305 + }, + { + "epoch": 0.18618565039489962, + "grad_norm": 1.5601685047149658, + "learning_rate": 4.5844171671031696e-05, + "loss": 5.6042, + "step": 31306 + }, + { + "epoch": 0.1861915976781806, + "grad_norm": 1.9319846630096436, + "learning_rate": 4.584391377450721e-05, + "loss": 4.4306, + "step": 31307 + }, + { + "epoch": 0.1861975449614616, + "grad_norm": 1.6437833309173584, + "learning_rate": 4.584365587070634e-05, + "loss": 4.9218, + "step": 31308 + }, + { + "epoch": 0.1862034922447426, + "grad_norm": 1.5281999111175537, + "learning_rate": 4.584339795962915e-05, + "loss": 4.7631, + "step": 31309 + }, + { + "epoch": 0.1862094395280236, + "grad_norm": 1.864122986793518, + "learning_rate": 4.584314004127576e-05, + "loss": 4.5204, + "step": 31310 + }, + { + "epoch": 0.1862153868113046, + "grad_norm": 1.773083209991455, + "learning_rate": 4.5842882115646234e-05, + "loss": 4.8342, + "step": 31311 + }, + { + "epoch": 0.1862213340945856, + "grad_norm": 1.4922150373458862, + "learning_rate": 4.5842624182740676e-05, + "loss": 5.2839, + "step": 31312 + }, + { + "epoch": 0.1862272813778666, + "grad_norm": 2.119924545288086, + "learning_rate": 4.584236624255918e-05, + "loss": 4.1028, + "step": 31313 + }, + { + "epoch": 0.18623322866114758, + "grad_norm": 2.1595730781555176, + "learning_rate": 4.584210829510183e-05, + "loss": 3.5932, + "step": 31314 + }, + { + "epoch": 0.1862391759444286, + "grad_norm": 2.704542875289917, + "learning_rate": 4.5841850340368706e-05, + "loss": 3.715, + "step": 31315 + }, + { + "epoch": 0.18624512322770959, + "grad_norm": 2.3942182064056396, + "learning_rate": 4.584159237835991e-05, + "loss": 3.5308, + "step": 31316 + }, + { + "epoch": 0.18625107051099057, + "grad_norm": 2.023380994796753, + "learning_rate": 4.584133440907553e-05, + "loss": 3.3817, + "step": 31317 + }, + { + "epoch": 0.1862570177942716, + "grad_norm": 2.1290016174316406, + "learning_rate": 4.5841076432515655e-05, + "loss": 3.6297, + "step": 31318 + }, + { + "epoch": 0.18626296507755258, + "grad_norm": 2.054892063140869, + "learning_rate": 4.584081844868038e-05, + "loss": 5.0177, + "step": 31319 + }, + { + "epoch": 0.18626891236083357, + "grad_norm": 2.030423879623413, + "learning_rate": 4.584056045756979e-05, + "loss": 4.1709, + "step": 31320 + }, + { + "epoch": 0.18627485964411458, + "grad_norm": 2.3559436798095703, + "learning_rate": 4.584030245918397e-05, + "loss": 4.2042, + "step": 31321 + }, + { + "epoch": 0.18628080692739557, + "grad_norm": 2.2861175537109375, + "learning_rate": 4.584004445352302e-05, + "loss": 5.2536, + "step": 31322 + }, + { + "epoch": 0.18628675421067656, + "grad_norm": 2.001182794570923, + "learning_rate": 4.5839786440587016e-05, + "loss": 5.2153, + "step": 31323 + }, + { + "epoch": 0.18629270149395755, + "grad_norm": 2.0066721439361572, + "learning_rate": 4.583952842037608e-05, + "loss": 5.3088, + "step": 31324 + }, + { + "epoch": 0.18629864877723856, + "grad_norm": 2.068047285079956, + "learning_rate": 4.583927039289026e-05, + "loss": 4.7806, + "step": 31325 + }, + { + "epoch": 0.18630459606051955, + "grad_norm": 2.7590277194976807, + "learning_rate": 4.5839012358129676e-05, + "loss": 2.992, + "step": 31326 + }, + { + "epoch": 0.18631054334380054, + "grad_norm": 2.9526596069335938, + "learning_rate": 4.58387543160944e-05, + "loss": 3.2839, + "step": 31327 + }, + { + "epoch": 0.18631649062708155, + "grad_norm": 1.7671618461608887, + "learning_rate": 4.5838496266784534e-05, + "loss": 4.8593, + "step": 31328 + }, + { + "epoch": 0.18632243791036254, + "grad_norm": 1.5757657289505005, + "learning_rate": 4.5838238210200165e-05, + "loss": 5.1869, + "step": 31329 + }, + { + "epoch": 0.18632838519364353, + "grad_norm": 1.4810543060302734, + "learning_rate": 4.5837980146341383e-05, + "loss": 4.7026, + "step": 31330 + }, + { + "epoch": 0.18633433247692455, + "grad_norm": 1.845725655555725, + "learning_rate": 4.5837722075208276e-05, + "loss": 4.3001, + "step": 31331 + }, + { + "epoch": 0.18634027976020553, + "grad_norm": 1.7754443883895874, + "learning_rate": 4.583746399680093e-05, + "loss": 5.4637, + "step": 31332 + }, + { + "epoch": 0.18634622704348652, + "grad_norm": 1.6854273080825806, + "learning_rate": 4.583720591111945e-05, + "loss": 5.3277, + "step": 31333 + }, + { + "epoch": 0.18635217432676754, + "grad_norm": 1.672729253768921, + "learning_rate": 4.583694781816392e-05, + "loss": 5.3735, + "step": 31334 + }, + { + "epoch": 0.18635812161004853, + "grad_norm": 1.7456068992614746, + "learning_rate": 4.583668971793442e-05, + "loss": 5.2757, + "step": 31335 + }, + { + "epoch": 0.18636406889332952, + "grad_norm": 2.863888740539551, + "learning_rate": 4.583643161043104e-05, + "loss": 3.96, + "step": 31336 + }, + { + "epoch": 0.18637001617661053, + "grad_norm": 2.82841157913208, + "learning_rate": 4.583617349565389e-05, + "loss": 3.8763, + "step": 31337 + }, + { + "epoch": 0.18637596345989152, + "grad_norm": 2.5859992504119873, + "learning_rate": 4.583591537360304e-05, + "loss": 3.6348, + "step": 31338 + }, + { + "epoch": 0.1863819107431725, + "grad_norm": 2.1093318462371826, + "learning_rate": 4.5835657244278584e-05, + "loss": 4.5981, + "step": 31339 + }, + { + "epoch": 0.18638785802645352, + "grad_norm": 1.672054409980774, + "learning_rate": 4.5835399107680624e-05, + "loss": 5.0322, + "step": 31340 + }, + { + "epoch": 0.1863938053097345, + "grad_norm": 1.9632993936538696, + "learning_rate": 4.583514096380924e-05, + "loss": 5.2158, + "step": 31341 + }, + { + "epoch": 0.1863997525930155, + "grad_norm": 1.7678093910217285, + "learning_rate": 4.583488281266451e-05, + "loss": 5.1891, + "step": 31342 + }, + { + "epoch": 0.18640569987629652, + "grad_norm": 1.7519903182983398, + "learning_rate": 4.583462465424656e-05, + "loss": 4.935, + "step": 31343 + }, + { + "epoch": 0.1864116471595775, + "grad_norm": 2.023782968521118, + "learning_rate": 4.5834366488555434e-05, + "loss": 5.2837, + "step": 31344 + }, + { + "epoch": 0.1864175944428585, + "grad_norm": 1.8081834316253662, + "learning_rate": 4.583410831559126e-05, + "loss": 4.9925, + "step": 31345 + }, + { + "epoch": 0.1864235417261395, + "grad_norm": 1.6684492826461792, + "learning_rate": 4.5833850135354115e-05, + "loss": 4.4966, + "step": 31346 + }, + { + "epoch": 0.1864294890094205, + "grad_norm": 2.077711582183838, + "learning_rate": 4.583359194784409e-05, + "loss": 3.9967, + "step": 31347 + }, + { + "epoch": 0.18643543629270148, + "grad_norm": 1.727041482925415, + "learning_rate": 4.5833333753061266e-05, + "loss": 5.0418, + "step": 31348 + }, + { + "epoch": 0.1864413835759825, + "grad_norm": 1.7768146991729736, + "learning_rate": 4.5833075551005745e-05, + "loss": 5.151, + "step": 31349 + }, + { + "epoch": 0.1864473308592635, + "grad_norm": 1.6089451313018799, + "learning_rate": 4.5832817341677606e-05, + "loss": 5.4846, + "step": 31350 + }, + { + "epoch": 0.18645327814254448, + "grad_norm": 1.5748451948165894, + "learning_rate": 4.583255912507695e-05, + "loss": 5.2612, + "step": 31351 + }, + { + "epoch": 0.1864592254258255, + "grad_norm": 1.5321335792541504, + "learning_rate": 4.583230090120386e-05, + "loss": 5.3703, + "step": 31352 + }, + { + "epoch": 0.18646517270910648, + "grad_norm": 1.5108387470245361, + "learning_rate": 4.5832042670058436e-05, + "loss": 5.7321, + "step": 31353 + }, + { + "epoch": 0.18647111999238747, + "grad_norm": 1.5854402780532837, + "learning_rate": 4.583178443164075e-05, + "loss": 4.8038, + "step": 31354 + }, + { + "epoch": 0.18647706727566848, + "grad_norm": 1.736132025718689, + "learning_rate": 4.583152618595092e-05, + "loss": 4.9117, + "step": 31355 + }, + { + "epoch": 0.18648301455894947, + "grad_norm": 1.6473597288131714, + "learning_rate": 4.583126793298901e-05, + "loss": 5.1955, + "step": 31356 + }, + { + "epoch": 0.18648896184223046, + "grad_norm": 1.777772307395935, + "learning_rate": 4.583100967275512e-05, + "loss": 4.8981, + "step": 31357 + }, + { + "epoch": 0.18649490912551148, + "grad_norm": 1.8098564147949219, + "learning_rate": 4.583075140524934e-05, + "loss": 4.7925, + "step": 31358 + }, + { + "epoch": 0.18650085640879246, + "grad_norm": 1.8321475982666016, + "learning_rate": 4.583049313047175e-05, + "loss": 5.1179, + "step": 31359 + }, + { + "epoch": 0.18650680369207345, + "grad_norm": 1.6555041074752808, + "learning_rate": 4.583023484842246e-05, + "loss": 5.3435, + "step": 31360 + }, + { + "epoch": 0.18651275097535447, + "grad_norm": 1.7198667526245117, + "learning_rate": 4.5829976559101553e-05, + "loss": 4.7813, + "step": 31361 + }, + { + "epoch": 0.18651869825863546, + "grad_norm": 2.30778169631958, + "learning_rate": 4.582971826250911e-05, + "loss": 3.006, + "step": 31362 + }, + { + "epoch": 0.18652464554191645, + "grad_norm": 2.258409261703491, + "learning_rate": 4.582945995864523e-05, + "loss": 2.8892, + "step": 31363 + }, + { + "epoch": 0.18653059282519746, + "grad_norm": 2.3513214588165283, + "learning_rate": 4.582920164751e-05, + "loss": 2.8185, + "step": 31364 + }, + { + "epoch": 0.18653654010847845, + "grad_norm": 2.1013023853302, + "learning_rate": 4.5828943329103513e-05, + "loss": 3.0731, + "step": 31365 + }, + { + "epoch": 0.18654248739175944, + "grad_norm": 1.7189773321151733, + "learning_rate": 4.582868500342586e-05, + "loss": 5.1003, + "step": 31366 + }, + { + "epoch": 0.18654843467504045, + "grad_norm": 2.006357192993164, + "learning_rate": 4.582842667047712e-05, + "loss": 5.4696, + "step": 31367 + }, + { + "epoch": 0.18655438195832144, + "grad_norm": 1.5163938999176025, + "learning_rate": 4.5828168330257396e-05, + "loss": 5.427, + "step": 31368 + }, + { + "epoch": 0.18656032924160243, + "grad_norm": 1.5231959819793701, + "learning_rate": 4.582790998276678e-05, + "loss": 5.3535, + "step": 31369 + }, + { + "epoch": 0.18656627652488345, + "grad_norm": 1.6396427154541016, + "learning_rate": 4.582765162800534e-05, + "loss": 5.0343, + "step": 31370 + }, + { + "epoch": 0.18657222380816443, + "grad_norm": 1.7178908586502075, + "learning_rate": 4.582739326597319e-05, + "loss": 4.675, + "step": 31371 + }, + { + "epoch": 0.18657817109144542, + "grad_norm": 2.420158863067627, + "learning_rate": 4.582713489667042e-05, + "loss": 4.0231, + "step": 31372 + }, + { + "epoch": 0.18658411837472644, + "grad_norm": 1.9106335639953613, + "learning_rate": 4.582687652009711e-05, + "loss": 4.6815, + "step": 31373 + }, + { + "epoch": 0.18659006565800743, + "grad_norm": 1.3372851610183716, + "learning_rate": 4.582661813625334e-05, + "loss": 5.1749, + "step": 31374 + }, + { + "epoch": 0.1865960129412884, + "grad_norm": 1.7023464441299438, + "learning_rate": 4.582635974513923e-05, + "loss": 4.9542, + "step": 31375 + }, + { + "epoch": 0.18660196022456943, + "grad_norm": 2.3644206523895264, + "learning_rate": 4.582610134675483e-05, + "loss": 4.3933, + "step": 31376 + }, + { + "epoch": 0.18660790750785042, + "grad_norm": 1.7321727275848389, + "learning_rate": 4.582584294110027e-05, + "loss": 4.6583, + "step": 31377 + }, + { + "epoch": 0.1866138547911314, + "grad_norm": 1.7842439413070679, + "learning_rate": 4.582558452817563e-05, + "loss": 4.5918, + "step": 31378 + }, + { + "epoch": 0.18661980207441242, + "grad_norm": 1.9122416973114014, + "learning_rate": 4.582532610798098e-05, + "loss": 4.187, + "step": 31379 + }, + { + "epoch": 0.1866257493576934, + "grad_norm": 1.8635472059249878, + "learning_rate": 4.5825067680516427e-05, + "loss": 4.4158, + "step": 31380 + }, + { + "epoch": 0.1866316966409744, + "grad_norm": 1.8706049919128418, + "learning_rate": 4.5824809245782066e-05, + "loss": 4.037, + "step": 31381 + }, + { + "epoch": 0.1866376439242554, + "grad_norm": 1.762373447418213, + "learning_rate": 4.582455080377797e-05, + "loss": 3.9966, + "step": 31382 + }, + { + "epoch": 0.1866435912075364, + "grad_norm": 1.6706191301345825, + "learning_rate": 4.582429235450424e-05, + "loss": 4.2182, + "step": 31383 + }, + { + "epoch": 0.1866495384908174, + "grad_norm": 1.55520498752594, + "learning_rate": 4.582403389796096e-05, + "loss": 5.1072, + "step": 31384 + }, + { + "epoch": 0.18665548577409838, + "grad_norm": 1.5530856847763062, + "learning_rate": 4.582377543414823e-05, + "loss": 5.0972, + "step": 31385 + }, + { + "epoch": 0.1866614330573794, + "grad_norm": 1.747187614440918, + "learning_rate": 4.582351696306614e-05, + "loss": 4.9334, + "step": 31386 + }, + { + "epoch": 0.18666738034066038, + "grad_norm": 1.696406602859497, + "learning_rate": 4.582325848471477e-05, + "loss": 5.6964, + "step": 31387 + }, + { + "epoch": 0.18667332762394137, + "grad_norm": 1.426660418510437, + "learning_rate": 4.5822999999094215e-05, + "loss": 5.3495, + "step": 31388 + }, + { + "epoch": 0.1866792749072224, + "grad_norm": 1.656969428062439, + "learning_rate": 4.582274150620457e-05, + "loss": 4.5178, + "step": 31389 + }, + { + "epoch": 0.18668522219050337, + "grad_norm": 1.558522343635559, + "learning_rate": 4.5822483006045915e-05, + "loss": 4.5923, + "step": 31390 + }, + { + "epoch": 0.18669116947378436, + "grad_norm": 1.76998770236969, + "learning_rate": 4.582222449861835e-05, + "loss": 4.517, + "step": 31391 + }, + { + "epoch": 0.18669711675706538, + "grad_norm": 1.4918303489685059, + "learning_rate": 4.582196598392196e-05, + "loss": 5.4223, + "step": 31392 + }, + { + "epoch": 0.18670306404034637, + "grad_norm": 1.9973161220550537, + "learning_rate": 4.5821707461956836e-05, + "loss": 5.229, + "step": 31393 + }, + { + "epoch": 0.18670901132362736, + "grad_norm": 1.789795994758606, + "learning_rate": 4.582144893272307e-05, + "loss": 5.6042, + "step": 31394 + }, + { + "epoch": 0.18671495860690837, + "grad_norm": 1.5900517702102661, + "learning_rate": 4.5821190396220756e-05, + "loss": 4.8256, + "step": 31395 + }, + { + "epoch": 0.18672090589018936, + "grad_norm": 1.594332218170166, + "learning_rate": 4.582093185244997e-05, + "loss": 5.0181, + "step": 31396 + }, + { + "epoch": 0.18672685317347035, + "grad_norm": 1.881818413734436, + "learning_rate": 4.582067330141082e-05, + "loss": 5.3832, + "step": 31397 + }, + { + "epoch": 0.18673280045675136, + "grad_norm": 2.042795419692993, + "learning_rate": 4.582041474310339e-05, + "loss": 5.0048, + "step": 31398 + }, + { + "epoch": 0.18673874774003235, + "grad_norm": 1.8554868698120117, + "learning_rate": 4.5820156177527764e-05, + "loss": 4.7971, + "step": 31399 + }, + { + "epoch": 0.18674469502331334, + "grad_norm": 1.6183528900146484, + "learning_rate": 4.581989760468404e-05, + "loss": 4.9781, + "step": 31400 + }, + { + "epoch": 0.18675064230659436, + "grad_norm": 2.160238265991211, + "learning_rate": 4.5819639024572295e-05, + "loss": 5.0855, + "step": 31401 + }, + { + "epoch": 0.18675658958987534, + "grad_norm": 2.1129162311553955, + "learning_rate": 4.5819380437192636e-05, + "loss": 5.111, + "step": 31402 + }, + { + "epoch": 0.18676253687315633, + "grad_norm": 1.541813850402832, + "learning_rate": 4.5819121842545144e-05, + "loss": 5.1907, + "step": 31403 + }, + { + "epoch": 0.18676848415643735, + "grad_norm": 1.655600905418396, + "learning_rate": 4.581886324062992e-05, + "loss": 5.6293, + "step": 31404 + }, + { + "epoch": 0.18677443143971834, + "grad_norm": 1.5326381921768188, + "learning_rate": 4.581860463144703e-05, + "loss": 4.9882, + "step": 31405 + }, + { + "epoch": 0.18678037872299932, + "grad_norm": 2.5064444541931152, + "learning_rate": 4.58183460149966e-05, + "loss": 4.3907, + "step": 31406 + }, + { + "epoch": 0.18678632600628034, + "grad_norm": 2.4211840629577637, + "learning_rate": 4.581808739127868e-05, + "loss": 4.6788, + "step": 31407 + }, + { + "epoch": 0.18679227328956133, + "grad_norm": 1.835132122039795, + "learning_rate": 4.581782876029339e-05, + "loss": 4.4737, + "step": 31408 + }, + { + "epoch": 0.18679822057284232, + "grad_norm": 1.724884033203125, + "learning_rate": 4.581757012204082e-05, + "loss": 4.2805, + "step": 31409 + }, + { + "epoch": 0.18680416785612333, + "grad_norm": 1.43998384475708, + "learning_rate": 4.581731147652104e-05, + "loss": 4.9872, + "step": 31410 + }, + { + "epoch": 0.18681011513940432, + "grad_norm": 1.7539047002792358, + "learning_rate": 4.5817052823734155e-05, + "loss": 5.1531, + "step": 31411 + }, + { + "epoch": 0.1868160624226853, + "grad_norm": 1.7996374368667603, + "learning_rate": 4.5816794163680255e-05, + "loss": 4.5348, + "step": 31412 + }, + { + "epoch": 0.18682200970596632, + "grad_norm": 1.9007580280303955, + "learning_rate": 4.5816535496359416e-05, + "loss": 4.5503, + "step": 31413 + }, + { + "epoch": 0.1868279569892473, + "grad_norm": 2.9723873138427734, + "learning_rate": 4.581627682177175e-05, + "loss": 3.6093, + "step": 31414 + }, + { + "epoch": 0.1868339042725283, + "grad_norm": 1.840366244316101, + "learning_rate": 4.581601813991734e-05, + "loss": 4.5359, + "step": 31415 + }, + { + "epoch": 0.18683985155580932, + "grad_norm": 1.7800344228744507, + "learning_rate": 4.5815759450796265e-05, + "loss": 4.7916, + "step": 31416 + }, + { + "epoch": 0.1868457988390903, + "grad_norm": 2.508409261703491, + "learning_rate": 4.581550075440862e-05, + "loss": 3.9651, + "step": 31417 + }, + { + "epoch": 0.1868517461223713, + "grad_norm": 1.4773229360580444, + "learning_rate": 4.581524205075451e-05, + "loss": 5.1962, + "step": 31418 + }, + { + "epoch": 0.1868576934056523, + "grad_norm": 1.7282037734985352, + "learning_rate": 4.5814983339834004e-05, + "loss": 5.0627, + "step": 31419 + }, + { + "epoch": 0.1868636406889333, + "grad_norm": 1.5566262006759644, + "learning_rate": 4.581472462164721e-05, + "loss": 5.0318, + "step": 31420 + }, + { + "epoch": 0.18686958797221428, + "grad_norm": 1.586804986000061, + "learning_rate": 4.581446589619421e-05, + "loss": 5.3587, + "step": 31421 + }, + { + "epoch": 0.1868755352554953, + "grad_norm": 1.626639723777771, + "learning_rate": 4.5814207163475094e-05, + "loss": 5.1839, + "step": 31422 + }, + { + "epoch": 0.1868814825387763, + "grad_norm": 1.9931199550628662, + "learning_rate": 4.581394842348995e-05, + "loss": 4.5328, + "step": 31423 + }, + { + "epoch": 0.18688742982205728, + "grad_norm": 1.5360701084136963, + "learning_rate": 4.581368967623887e-05, + "loss": 6.0491, + "step": 31424 + }, + { + "epoch": 0.1868933771053383, + "grad_norm": 1.7270042896270752, + "learning_rate": 4.5813430921721954e-05, + "loss": 5.4057, + "step": 31425 + }, + { + "epoch": 0.18689932438861928, + "grad_norm": 1.620786190032959, + "learning_rate": 4.5813172159939276e-05, + "loss": 5.1965, + "step": 31426 + }, + { + "epoch": 0.18690527167190027, + "grad_norm": 1.6832870244979858, + "learning_rate": 4.5812913390890945e-05, + "loss": 5.1923, + "step": 31427 + }, + { + "epoch": 0.18691121895518129, + "grad_norm": 1.7056113481521606, + "learning_rate": 4.581265461457703e-05, + "loss": 5.0523, + "step": 31428 + }, + { + "epoch": 0.18691716623846227, + "grad_norm": 1.7429434061050415, + "learning_rate": 4.581239583099763e-05, + "loss": 5.1345, + "step": 31429 + }, + { + "epoch": 0.18692311352174326, + "grad_norm": 1.6870777606964111, + "learning_rate": 4.5812137040152854e-05, + "loss": 5.3135, + "step": 31430 + }, + { + "epoch": 0.18692906080502428, + "grad_norm": 1.7804944515228271, + "learning_rate": 4.581187824204277e-05, + "loss": 5.3752, + "step": 31431 + }, + { + "epoch": 0.18693500808830527, + "grad_norm": 1.5267258882522583, + "learning_rate": 4.5811619436667465e-05, + "loss": 5.5806, + "step": 31432 + }, + { + "epoch": 0.18694095537158625, + "grad_norm": 1.6377745866775513, + "learning_rate": 4.5811360624027045e-05, + "loss": 5.3912, + "step": 31433 + }, + { + "epoch": 0.18694690265486727, + "grad_norm": 1.8628687858581543, + "learning_rate": 4.581110180412159e-05, + "loss": 4.087, + "step": 31434 + }, + { + "epoch": 0.18695284993814826, + "grad_norm": 1.439253568649292, + "learning_rate": 4.58108429769512e-05, + "loss": 5.171, + "step": 31435 + }, + { + "epoch": 0.18695879722142925, + "grad_norm": 1.7017579078674316, + "learning_rate": 4.581058414251596e-05, + "loss": 4.8104, + "step": 31436 + }, + { + "epoch": 0.18696474450471026, + "grad_norm": 1.866621971130371, + "learning_rate": 4.581032530081596e-05, + "loss": 4.979, + "step": 31437 + }, + { + "epoch": 0.18697069178799125, + "grad_norm": 1.5694007873535156, + "learning_rate": 4.581006645185129e-05, + "loss": 5.031, + "step": 31438 + }, + { + "epoch": 0.18697663907127224, + "grad_norm": 1.5056393146514893, + "learning_rate": 4.580980759562203e-05, + "loss": 5.082, + "step": 31439 + }, + { + "epoch": 0.18698258635455323, + "grad_norm": 1.5853091478347778, + "learning_rate": 4.580954873212829e-05, + "loss": 5.0652, + "step": 31440 + }, + { + "epoch": 0.18698853363783424, + "grad_norm": 1.423098087310791, + "learning_rate": 4.580928986137015e-05, + "loss": 5.2198, + "step": 31441 + }, + { + "epoch": 0.18699448092111523, + "grad_norm": 1.8297144174575806, + "learning_rate": 4.580903098334771e-05, + "loss": 4.8045, + "step": 31442 + }, + { + "epoch": 0.18700042820439622, + "grad_norm": 1.4703069925308228, + "learning_rate": 4.580877209806105e-05, + "loss": 4.9772, + "step": 31443 + }, + { + "epoch": 0.18700637548767723, + "grad_norm": 1.6311166286468506, + "learning_rate": 4.580851320551025e-05, + "loss": 5.0265, + "step": 31444 + }, + { + "epoch": 0.18701232277095822, + "grad_norm": 1.5908745527267456, + "learning_rate": 4.5808254305695425e-05, + "loss": 5.6455, + "step": 31445 + }, + { + "epoch": 0.1870182700542392, + "grad_norm": 1.6188886165618896, + "learning_rate": 4.580799539861665e-05, + "loss": 4.9907, + "step": 31446 + }, + { + "epoch": 0.18702421733752023, + "grad_norm": 1.6662514209747314, + "learning_rate": 4.580773648427402e-05, + "loss": 4.599, + "step": 31447 + }, + { + "epoch": 0.18703016462080121, + "grad_norm": 1.7355191707611084, + "learning_rate": 4.5807477562667624e-05, + "loss": 4.721, + "step": 31448 + }, + { + "epoch": 0.1870361119040822, + "grad_norm": 1.6992077827453613, + "learning_rate": 4.580721863379755e-05, + "loss": 4.6429, + "step": 31449 + }, + { + "epoch": 0.18704205918736322, + "grad_norm": 1.8001128435134888, + "learning_rate": 4.580695969766389e-05, + "loss": 4.6414, + "step": 31450 + }, + { + "epoch": 0.1870480064706442, + "grad_norm": 1.691829800605774, + "learning_rate": 4.580670075426674e-05, + "loss": 4.6086, + "step": 31451 + }, + { + "epoch": 0.1870539537539252, + "grad_norm": 1.8028392791748047, + "learning_rate": 4.580644180360618e-05, + "loss": 4.8074, + "step": 31452 + }, + { + "epoch": 0.1870599010372062, + "grad_norm": 1.355403184890747, + "learning_rate": 4.580618284568231e-05, + "loss": 5.077, + "step": 31453 + }, + { + "epoch": 0.1870658483204872, + "grad_norm": 1.6251015663146973, + "learning_rate": 4.580592388049522e-05, + "loss": 4.7268, + "step": 31454 + }, + { + "epoch": 0.1870717956037682, + "grad_norm": 1.8957926034927368, + "learning_rate": 4.580566490804499e-05, + "loss": 4.5649, + "step": 31455 + }, + { + "epoch": 0.1870777428870492, + "grad_norm": 1.628433346748352, + "learning_rate": 4.5805405928331726e-05, + "loss": 4.5964, + "step": 31456 + }, + { + "epoch": 0.1870836901703302, + "grad_norm": 1.7020845413208008, + "learning_rate": 4.58051469413555e-05, + "loss": 4.5698, + "step": 31457 + }, + { + "epoch": 0.18708963745361118, + "grad_norm": 1.6829500198364258, + "learning_rate": 4.580488794711641e-05, + "loss": 4.619, + "step": 31458 + }, + { + "epoch": 0.1870955847368922, + "grad_norm": 1.7393929958343506, + "learning_rate": 4.580462894561456e-05, + "loss": 4.4903, + "step": 31459 + }, + { + "epoch": 0.18710153202017318, + "grad_norm": 1.6554701328277588, + "learning_rate": 4.5804369936850024e-05, + "loss": 5.2823, + "step": 31460 + }, + { + "epoch": 0.18710747930345417, + "grad_norm": 1.4598510265350342, + "learning_rate": 4.58041109208229e-05, + "loss": 5.2572, + "step": 31461 + }, + { + "epoch": 0.1871134265867352, + "grad_norm": 1.5052999258041382, + "learning_rate": 4.5803851897533265e-05, + "loss": 5.1571, + "step": 31462 + }, + { + "epoch": 0.18711937387001618, + "grad_norm": 1.4165245294570923, + "learning_rate": 4.580359286698123e-05, + "loss": 5.0514, + "step": 31463 + }, + { + "epoch": 0.18712532115329716, + "grad_norm": 1.668857455253601, + "learning_rate": 4.5803333829166874e-05, + "loss": 4.7183, + "step": 31464 + }, + { + "epoch": 0.18713126843657818, + "grad_norm": 1.7835750579833984, + "learning_rate": 4.580307478409029e-05, + "loss": 4.1674, + "step": 31465 + }, + { + "epoch": 0.18713721571985917, + "grad_norm": 1.8612866401672363, + "learning_rate": 4.580281573175157e-05, + "loss": 4.245, + "step": 31466 + }, + { + "epoch": 0.18714316300314016, + "grad_norm": 2.1322779655456543, + "learning_rate": 4.58025566721508e-05, + "loss": 4.191, + "step": 31467 + }, + { + "epoch": 0.18714911028642117, + "grad_norm": 1.4032418727874756, + "learning_rate": 4.580229760528807e-05, + "loss": 4.7888, + "step": 31468 + }, + { + "epoch": 0.18715505756970216, + "grad_norm": 1.4955732822418213, + "learning_rate": 4.580203853116347e-05, + "loss": 5.0653, + "step": 31469 + }, + { + "epoch": 0.18716100485298315, + "grad_norm": 1.857201099395752, + "learning_rate": 4.580177944977709e-05, + "loss": 4.9189, + "step": 31470 + }, + { + "epoch": 0.18716695213626416, + "grad_norm": 1.4744160175323486, + "learning_rate": 4.5801520361129034e-05, + "loss": 3.9242, + "step": 31471 + }, + { + "epoch": 0.18717289941954515, + "grad_norm": 1.6050392389297485, + "learning_rate": 4.580126126521938e-05, + "loss": 4.7737, + "step": 31472 + }, + { + "epoch": 0.18717884670282614, + "grad_norm": 1.4203214645385742, + "learning_rate": 4.580100216204822e-05, + "loss": 4.7792, + "step": 31473 + }, + { + "epoch": 0.18718479398610716, + "grad_norm": 1.7042044401168823, + "learning_rate": 4.580074305161565e-05, + "loss": 4.5548, + "step": 31474 + }, + { + "epoch": 0.18719074126938814, + "grad_norm": 1.8733965158462524, + "learning_rate": 4.5800483933921746e-05, + "loss": 4.289, + "step": 31475 + }, + { + "epoch": 0.18719668855266913, + "grad_norm": 1.8629066944122314, + "learning_rate": 4.580022480896661e-05, + "loss": 4.2435, + "step": 31476 + }, + { + "epoch": 0.18720263583595015, + "grad_norm": 1.7233967781066895, + "learning_rate": 4.5799965676750336e-05, + "loss": 4.4444, + "step": 31477 + }, + { + "epoch": 0.18720858311923114, + "grad_norm": 1.6446317434310913, + "learning_rate": 4.5799706537273e-05, + "loss": 4.754, + "step": 31478 + }, + { + "epoch": 0.18721453040251212, + "grad_norm": 1.7049897909164429, + "learning_rate": 4.5799447390534714e-05, + "loss": 4.5082, + "step": 31479 + }, + { + "epoch": 0.18722047768579314, + "grad_norm": 1.6299967765808105, + "learning_rate": 4.579918823653554e-05, + "loss": 4.5914, + "step": 31480 + }, + { + "epoch": 0.18722642496907413, + "grad_norm": 1.862816333770752, + "learning_rate": 4.579892907527559e-05, + "loss": 4.4565, + "step": 31481 + }, + { + "epoch": 0.18723237225235512, + "grad_norm": 1.6829630136489868, + "learning_rate": 4.579866990675495e-05, + "loss": 3.9664, + "step": 31482 + }, + { + "epoch": 0.18723831953563613, + "grad_norm": 1.7739498615264893, + "learning_rate": 4.579841073097372e-05, + "loss": 4.5638, + "step": 31483 + }, + { + "epoch": 0.18724426681891712, + "grad_norm": 1.7989349365234375, + "learning_rate": 4.5798151547931963e-05, + "loss": 4.6418, + "step": 31484 + }, + { + "epoch": 0.1872502141021981, + "grad_norm": 1.6883355379104614, + "learning_rate": 4.5797892357629794e-05, + "loss": 4.6899, + "step": 31485 + }, + { + "epoch": 0.18725616138547913, + "grad_norm": 1.5071123838424683, + "learning_rate": 4.57976331600673e-05, + "loss": 4.605, + "step": 31486 + }, + { + "epoch": 0.1872621086687601, + "grad_norm": 1.6472139358520508, + "learning_rate": 4.579737395524456e-05, + "loss": 4.4949, + "step": 31487 + }, + { + "epoch": 0.1872680559520411, + "grad_norm": 2.729337215423584, + "learning_rate": 4.579711474316167e-05, + "loss": 4.4027, + "step": 31488 + }, + { + "epoch": 0.18727400323532212, + "grad_norm": 1.8999816179275513, + "learning_rate": 4.5796855523818726e-05, + "loss": 4.7577, + "step": 31489 + }, + { + "epoch": 0.1872799505186031, + "grad_norm": 1.6633950471878052, + "learning_rate": 4.5796596297215815e-05, + "loss": 4.3385, + "step": 31490 + }, + { + "epoch": 0.1872858978018841, + "grad_norm": 1.6885244846343994, + "learning_rate": 4.579633706335303e-05, + "loss": 4.4684, + "step": 31491 + }, + { + "epoch": 0.1872918450851651, + "grad_norm": 1.56419837474823, + "learning_rate": 4.579607782223045e-05, + "loss": 4.5609, + "step": 31492 + }, + { + "epoch": 0.1872977923684461, + "grad_norm": 1.6976735591888428, + "learning_rate": 4.579581857384818e-05, + "loss": 4.3122, + "step": 31493 + }, + { + "epoch": 0.18730373965172709, + "grad_norm": 2.019990921020508, + "learning_rate": 4.5795559318206304e-05, + "loss": 4.0644, + "step": 31494 + }, + { + "epoch": 0.1873096869350081, + "grad_norm": 2.4111409187316895, + "learning_rate": 4.5795300055304914e-05, + "loss": 3.8046, + "step": 31495 + }, + { + "epoch": 0.1873156342182891, + "grad_norm": 1.6888504028320312, + "learning_rate": 4.57950407851441e-05, + "loss": 3.7976, + "step": 31496 + }, + { + "epoch": 0.18732158150157008, + "grad_norm": 2.261028528213501, + "learning_rate": 4.579478150772395e-05, + "loss": 3.9696, + "step": 31497 + }, + { + "epoch": 0.18732752878485107, + "grad_norm": 2.104658365249634, + "learning_rate": 4.5794522223044555e-05, + "loss": 3.879, + "step": 31498 + }, + { + "epoch": 0.18733347606813208, + "grad_norm": 2.300837755203247, + "learning_rate": 4.5794262931106015e-05, + "loss": 4.1062, + "step": 31499 + }, + { + "epoch": 0.18733942335141307, + "grad_norm": 2.2843008041381836, + "learning_rate": 4.57940036319084e-05, + "loss": 3.7473, + "step": 31500 + }, + { + "epoch": 0.18734537063469406, + "grad_norm": 2.924936294555664, + "learning_rate": 4.5793744325451826e-05, + "loss": 3.7478, + "step": 31501 + }, + { + "epoch": 0.18735131791797507, + "grad_norm": 2.4981048107147217, + "learning_rate": 4.579348501173636e-05, + "loss": 3.7812, + "step": 31502 + }, + { + "epoch": 0.18735726520125606, + "grad_norm": 2.363129138946533, + "learning_rate": 4.5793225690762106e-05, + "loss": 3.7943, + "step": 31503 + }, + { + "epoch": 0.18736321248453705, + "grad_norm": 2.4851186275482178, + "learning_rate": 4.579296636252915e-05, + "loss": 3.8708, + "step": 31504 + }, + { + "epoch": 0.18736915976781807, + "grad_norm": 2.625079870223999, + "learning_rate": 4.5792707027037595e-05, + "loss": 3.769, + "step": 31505 + }, + { + "epoch": 0.18737510705109905, + "grad_norm": 1.9397916793823242, + "learning_rate": 4.579244768428751e-05, + "loss": 4.3074, + "step": 31506 + }, + { + "epoch": 0.18738105433438004, + "grad_norm": 2.270460605621338, + "learning_rate": 4.5792188334279004e-05, + "loss": 3.8198, + "step": 31507 + }, + { + "epoch": 0.18738700161766106, + "grad_norm": 2.187398910522461, + "learning_rate": 4.579192897701215e-05, + "loss": 3.7374, + "step": 31508 + }, + { + "epoch": 0.18739294890094205, + "grad_norm": 2.3796896934509277, + "learning_rate": 4.579166961248706e-05, + "loss": 3.9178, + "step": 31509 + }, + { + "epoch": 0.18739889618422303, + "grad_norm": 2.440819501876831, + "learning_rate": 4.579141024070381e-05, + "loss": 3.6605, + "step": 31510 + }, + { + "epoch": 0.18740484346750405, + "grad_norm": 2.090683698654175, + "learning_rate": 4.579115086166249e-05, + "loss": 4.0199, + "step": 31511 + }, + { + "epoch": 0.18741079075078504, + "grad_norm": 1.8660192489624023, + "learning_rate": 4.5790891475363195e-05, + "loss": 5.4397, + "step": 31512 + }, + { + "epoch": 0.18741673803406603, + "grad_norm": 1.8933132886886597, + "learning_rate": 4.579063208180601e-05, + "loss": 5.161, + "step": 31513 + }, + { + "epoch": 0.18742268531734704, + "grad_norm": 1.442830204963684, + "learning_rate": 4.5790372680991035e-05, + "loss": 4.9392, + "step": 31514 + }, + { + "epoch": 0.18742863260062803, + "grad_norm": 1.606457233428955, + "learning_rate": 4.5790113272918355e-05, + "loss": 5.1507, + "step": 31515 + }, + { + "epoch": 0.18743457988390902, + "grad_norm": 1.7178606986999512, + "learning_rate": 4.578985385758806e-05, + "loss": 5.1888, + "step": 31516 + }, + { + "epoch": 0.18744052716719004, + "grad_norm": 1.7797423601150513, + "learning_rate": 4.578959443500025e-05, + "loss": 5.1161, + "step": 31517 + }, + { + "epoch": 0.18744647445047102, + "grad_norm": 1.7583237886428833, + "learning_rate": 4.5789335005154996e-05, + "loss": 4.9044, + "step": 31518 + }, + { + "epoch": 0.187452421733752, + "grad_norm": 1.9187301397323608, + "learning_rate": 4.578907556805241e-05, + "loss": 4.8383, + "step": 31519 + }, + { + "epoch": 0.18745836901703303, + "grad_norm": 1.3928438425064087, + "learning_rate": 4.578881612369256e-05, + "loss": 4.6952, + "step": 31520 + }, + { + "epoch": 0.18746431630031402, + "grad_norm": 1.5495777130126953, + "learning_rate": 4.578855667207556e-05, + "loss": 5.093, + "step": 31521 + }, + { + "epoch": 0.187470263583595, + "grad_norm": 2.0939781665802, + "learning_rate": 4.578829721320148e-05, + "loss": 4.4353, + "step": 31522 + }, + { + "epoch": 0.18747621086687602, + "grad_norm": 2.6413023471832275, + "learning_rate": 4.578803774707043e-05, + "loss": 3.7471, + "step": 31523 + }, + { + "epoch": 0.187482158150157, + "grad_norm": 2.237964630126953, + "learning_rate": 4.578777827368249e-05, + "loss": 4.1189, + "step": 31524 + }, + { + "epoch": 0.187488105433438, + "grad_norm": 1.77215576171875, + "learning_rate": 4.5787518793037745e-05, + "loss": 4.5919, + "step": 31525 + }, + { + "epoch": 0.187494052716719, + "grad_norm": 1.7483875751495361, + "learning_rate": 4.5787259305136297e-05, + "loss": 4.7209, + "step": 31526 + }, + { + "epoch": 0.1875, + "grad_norm": 1.7072293758392334, + "learning_rate": 4.578699980997823e-05, + "loss": 4.4651, + "step": 31527 + }, + { + "epoch": 0.187505947283281, + "grad_norm": 1.7075767517089844, + "learning_rate": 4.5786740307563636e-05, + "loss": 4.3471, + "step": 31528 + }, + { + "epoch": 0.187511894566562, + "grad_norm": 2.496588945388794, + "learning_rate": 4.578648079789261e-05, + "loss": 3.6709, + "step": 31529 + }, + { + "epoch": 0.187517841849843, + "grad_norm": 2.438305139541626, + "learning_rate": 4.578622128096522e-05, + "loss": 3.8271, + "step": 31530 + }, + { + "epoch": 0.18752378913312398, + "grad_norm": 2.574528455734253, + "learning_rate": 4.578596175678159e-05, + "loss": 3.7591, + "step": 31531 + }, + { + "epoch": 0.187529736416405, + "grad_norm": 2.3681464195251465, + "learning_rate": 4.5785702225341796e-05, + "loss": 4.214, + "step": 31532 + }, + { + "epoch": 0.18753568369968598, + "grad_norm": 1.5918017625808716, + "learning_rate": 4.578544268664593e-05, + "loss": 5.2159, + "step": 31533 + }, + { + "epoch": 0.18754163098296697, + "grad_norm": 1.9178626537322998, + "learning_rate": 4.5785183140694073e-05, + "loss": 3.9341, + "step": 31534 + }, + { + "epoch": 0.187547578266248, + "grad_norm": 2.1391525268554688, + "learning_rate": 4.578492358748633e-05, + "loss": 3.1456, + "step": 31535 + }, + { + "epoch": 0.18755352554952898, + "grad_norm": 2.421508312225342, + "learning_rate": 4.578466402702278e-05, + "loss": 3.1124, + "step": 31536 + }, + { + "epoch": 0.18755947283280996, + "grad_norm": 2.379535675048828, + "learning_rate": 4.578440445930352e-05, + "loss": 3.2543, + "step": 31537 + }, + { + "epoch": 0.18756542011609098, + "grad_norm": 2.236633777618408, + "learning_rate": 4.578414488432864e-05, + "loss": 3.3216, + "step": 31538 + }, + { + "epoch": 0.18757136739937197, + "grad_norm": 2.082542657852173, + "learning_rate": 4.578388530209823e-05, + "loss": 3.1493, + "step": 31539 + }, + { + "epoch": 0.18757731468265296, + "grad_norm": 2.2979769706726074, + "learning_rate": 4.5783625712612384e-05, + "loss": 3.2585, + "step": 31540 + }, + { + "epoch": 0.18758326196593397, + "grad_norm": 2.1978182792663574, + "learning_rate": 4.5783366115871186e-05, + "loss": 3.2713, + "step": 31541 + }, + { + "epoch": 0.18758920924921496, + "grad_norm": 2.097055435180664, + "learning_rate": 4.578310651187473e-05, + "loss": 3.3176, + "step": 31542 + }, + { + "epoch": 0.18759515653249595, + "grad_norm": 2.2990310192108154, + "learning_rate": 4.57828469006231e-05, + "loss": 3.1615, + "step": 31543 + }, + { + "epoch": 0.18760110381577697, + "grad_norm": 2.353107213973999, + "learning_rate": 4.5782587282116394e-05, + "loss": 3.0828, + "step": 31544 + }, + { + "epoch": 0.18760705109905795, + "grad_norm": 2.156449794769287, + "learning_rate": 4.578232765635471e-05, + "loss": 3.7385, + "step": 31545 + }, + { + "epoch": 0.18761299838233894, + "grad_norm": 1.8776116371154785, + "learning_rate": 4.578206802333812e-05, + "loss": 5.1393, + "step": 31546 + }, + { + "epoch": 0.18761894566561996, + "grad_norm": 1.8295111656188965, + "learning_rate": 4.578180838306674e-05, + "loss": 4.6989, + "step": 31547 + }, + { + "epoch": 0.18762489294890095, + "grad_norm": 1.707702398300171, + "learning_rate": 4.578154873554063e-05, + "loss": 4.6461, + "step": 31548 + }, + { + "epoch": 0.18763084023218193, + "grad_norm": 2.7304489612579346, + "learning_rate": 4.57812890807599e-05, + "loss": 3.7014, + "step": 31549 + }, + { + "epoch": 0.18763678751546295, + "grad_norm": 3.1167895793914795, + "learning_rate": 4.578102941872464e-05, + "loss": 3.9208, + "step": 31550 + }, + { + "epoch": 0.18764273479874394, + "grad_norm": 2.5492351055145264, + "learning_rate": 4.578076974943494e-05, + "loss": 3.769, + "step": 31551 + }, + { + "epoch": 0.18764868208202493, + "grad_norm": 1.8772006034851074, + "learning_rate": 4.578051007289088e-05, + "loss": 4.4789, + "step": 31552 + }, + { + "epoch": 0.18765462936530594, + "grad_norm": 1.7834813594818115, + "learning_rate": 4.578025038909256e-05, + "loss": 4.9667, + "step": 31553 + }, + { + "epoch": 0.18766057664858693, + "grad_norm": 1.9036569595336914, + "learning_rate": 4.5779990698040074e-05, + "loss": 4.4362, + "step": 31554 + }, + { + "epoch": 0.18766652393186792, + "grad_norm": 1.837803602218628, + "learning_rate": 4.5779730999733506e-05, + "loss": 4.7798, + "step": 31555 + }, + { + "epoch": 0.18767247121514893, + "grad_norm": 1.6703819036483765, + "learning_rate": 4.577947129417295e-05, + "loss": 4.8608, + "step": 31556 + }, + { + "epoch": 0.18767841849842992, + "grad_norm": 1.7654380798339844, + "learning_rate": 4.577921158135849e-05, + "loss": 4.6977, + "step": 31557 + }, + { + "epoch": 0.1876843657817109, + "grad_norm": 1.843579649925232, + "learning_rate": 4.577895186129022e-05, + "loss": 4.2761, + "step": 31558 + }, + { + "epoch": 0.1876903130649919, + "grad_norm": 1.7880736589431763, + "learning_rate": 4.577869213396824e-05, + "loss": 4.7802, + "step": 31559 + }, + { + "epoch": 0.18769626034827291, + "grad_norm": 1.5163524150848389, + "learning_rate": 4.577843239939263e-05, + "loss": 4.9608, + "step": 31560 + }, + { + "epoch": 0.1877022076315539, + "grad_norm": 1.6260676383972168, + "learning_rate": 4.5778172657563486e-05, + "loss": 4.8441, + "step": 31561 + }, + { + "epoch": 0.1877081549148349, + "grad_norm": 2.001150369644165, + "learning_rate": 4.57779129084809e-05, + "loss": 4.1881, + "step": 31562 + }, + { + "epoch": 0.1877141021981159, + "grad_norm": 1.6918448209762573, + "learning_rate": 4.577765315214495e-05, + "loss": 4.309, + "step": 31563 + }, + { + "epoch": 0.1877200494813969, + "grad_norm": 1.5819053649902344, + "learning_rate": 4.5777393388555745e-05, + "loss": 4.7125, + "step": 31564 + }, + { + "epoch": 0.18772599676467788, + "grad_norm": 1.521506428718567, + "learning_rate": 4.5777133617713355e-05, + "loss": 4.4762, + "step": 31565 + }, + { + "epoch": 0.1877319440479589, + "grad_norm": 1.608293056488037, + "learning_rate": 4.57768738396179e-05, + "loss": 4.8308, + "step": 31566 + }, + { + "epoch": 0.1877378913312399, + "grad_norm": 1.7008312940597534, + "learning_rate": 4.577661405426943e-05, + "loss": 4.7827, + "step": 31567 + }, + { + "epoch": 0.18774383861452087, + "grad_norm": 1.6263885498046875, + "learning_rate": 4.577635426166807e-05, + "loss": 5.102, + "step": 31568 + }, + { + "epoch": 0.1877497858978019, + "grad_norm": 1.7362202405929565, + "learning_rate": 4.5776094461813903e-05, + "loss": 5.0606, + "step": 31569 + }, + { + "epoch": 0.18775573318108288, + "grad_norm": 1.699578881263733, + "learning_rate": 4.577583465470702e-05, + "loss": 4.5649, + "step": 31570 + }, + { + "epoch": 0.18776168046436387, + "grad_norm": 1.5926166772842407, + "learning_rate": 4.5775574840347504e-05, + "loss": 4.9645, + "step": 31571 + }, + { + "epoch": 0.18776762774764488, + "grad_norm": 1.5831513404846191, + "learning_rate": 4.5775315018735443e-05, + "loss": 5.0697, + "step": 31572 + }, + { + "epoch": 0.18777357503092587, + "grad_norm": 1.4057412147521973, + "learning_rate": 4.5775055189870945e-05, + "loss": 4.9488, + "step": 31573 + }, + { + "epoch": 0.18777952231420686, + "grad_norm": 1.5728765726089478, + "learning_rate": 4.5774795353754075e-05, + "loss": 4.6663, + "step": 31574 + }, + { + "epoch": 0.18778546959748788, + "grad_norm": 1.5813493728637695, + "learning_rate": 4.577453551038495e-05, + "loss": 5.5153, + "step": 31575 + }, + { + "epoch": 0.18779141688076886, + "grad_norm": 1.6821653842926025, + "learning_rate": 4.5774275659763644e-05, + "loss": 5.2125, + "step": 31576 + }, + { + "epoch": 0.18779736416404985, + "grad_norm": 1.6527361869812012, + "learning_rate": 4.577401580189025e-05, + "loss": 5.0845, + "step": 31577 + }, + { + "epoch": 0.18780331144733087, + "grad_norm": 1.7719552516937256, + "learning_rate": 4.5773755936764876e-05, + "loss": 5.0366, + "step": 31578 + }, + { + "epoch": 0.18780925873061186, + "grad_norm": 1.7301576137542725, + "learning_rate": 4.5773496064387576e-05, + "loss": 5.7365, + "step": 31579 + }, + { + "epoch": 0.18781520601389284, + "grad_norm": 1.64248788356781, + "learning_rate": 4.577323618475848e-05, + "loss": 5.1362, + "step": 31580 + }, + { + "epoch": 0.18782115329717386, + "grad_norm": 1.539428472518921, + "learning_rate": 4.5772976297877653e-05, + "loss": 5.2903, + "step": 31581 + }, + { + "epoch": 0.18782710058045485, + "grad_norm": 1.7478768825531006, + "learning_rate": 4.577271640374521e-05, + "loss": 4.9548, + "step": 31582 + }, + { + "epoch": 0.18783304786373584, + "grad_norm": 1.6046321392059326, + "learning_rate": 4.5772456502361216e-05, + "loss": 5.1547, + "step": 31583 + }, + { + "epoch": 0.18783899514701685, + "grad_norm": 1.613788366317749, + "learning_rate": 4.577219659372577e-05, + "loss": 4.7554, + "step": 31584 + }, + { + "epoch": 0.18784494243029784, + "grad_norm": 1.7057472467422485, + "learning_rate": 4.577193667783897e-05, + "loss": 4.8775, + "step": 31585 + }, + { + "epoch": 0.18785088971357883, + "grad_norm": 1.5329315662384033, + "learning_rate": 4.5771676754700896e-05, + "loss": 4.7219, + "step": 31586 + }, + { + "epoch": 0.18785683699685984, + "grad_norm": 1.5598114728927612, + "learning_rate": 4.577141682431164e-05, + "loss": 4.9717, + "step": 31587 + }, + { + "epoch": 0.18786278428014083, + "grad_norm": 1.6115435361862183, + "learning_rate": 4.577115688667131e-05, + "loss": 4.7923, + "step": 31588 + }, + { + "epoch": 0.18786873156342182, + "grad_norm": 1.2374604940414429, + "learning_rate": 4.5770896941779974e-05, + "loss": 5.4604, + "step": 31589 + }, + { + "epoch": 0.18787467884670284, + "grad_norm": 1.4944182634353638, + "learning_rate": 4.577063698963774e-05, + "loss": 5.6087, + "step": 31590 + }, + { + "epoch": 0.18788062612998382, + "grad_norm": 1.454232931137085, + "learning_rate": 4.577037703024468e-05, + "loss": 4.9809, + "step": 31591 + }, + { + "epoch": 0.1878865734132648, + "grad_norm": 1.7529237270355225, + "learning_rate": 4.57701170636009e-05, + "loss": 4.9926, + "step": 31592 + }, + { + "epoch": 0.18789252069654583, + "grad_norm": 1.7798666954040527, + "learning_rate": 4.5769857089706494e-05, + "loss": 4.7364, + "step": 31593 + }, + { + "epoch": 0.18789846797982682, + "grad_norm": 1.6372876167297363, + "learning_rate": 4.576959710856154e-05, + "loss": 4.7, + "step": 31594 + }, + { + "epoch": 0.1879044152631078, + "grad_norm": 1.503820776939392, + "learning_rate": 4.5769337120166135e-05, + "loss": 4.7779, + "step": 31595 + }, + { + "epoch": 0.18791036254638882, + "grad_norm": 1.51885187625885, + "learning_rate": 4.576907712452037e-05, + "loss": 5.1936, + "step": 31596 + }, + { + "epoch": 0.1879163098296698, + "grad_norm": 1.5635126829147339, + "learning_rate": 4.576881712162434e-05, + "loss": 5.4207, + "step": 31597 + }, + { + "epoch": 0.1879222571129508, + "grad_norm": 1.565337061882019, + "learning_rate": 4.576855711147812e-05, + "loss": 5.4425, + "step": 31598 + }, + { + "epoch": 0.1879282043962318, + "grad_norm": 1.7001174688339233, + "learning_rate": 4.576829709408181e-05, + "loss": 5.1692, + "step": 31599 + }, + { + "epoch": 0.1879341516795128, + "grad_norm": 1.720685362815857, + "learning_rate": 4.576803706943551e-05, + "loss": 5.3135, + "step": 31600 + }, + { + "epoch": 0.1879400989627938, + "grad_norm": 1.5667119026184082, + "learning_rate": 4.5767777037539304e-05, + "loss": 5.3522, + "step": 31601 + }, + { + "epoch": 0.1879460462460748, + "grad_norm": 1.7021211385726929, + "learning_rate": 4.576751699839328e-05, + "loss": 5.0113, + "step": 31602 + }, + { + "epoch": 0.1879519935293558, + "grad_norm": 1.6862629652023315, + "learning_rate": 4.5767256951997525e-05, + "loss": 5.2257, + "step": 31603 + }, + { + "epoch": 0.18795794081263678, + "grad_norm": 1.5623557567596436, + "learning_rate": 4.5766996898352146e-05, + "loss": 5.1346, + "step": 31604 + }, + { + "epoch": 0.1879638880959178, + "grad_norm": 1.6088786125183105, + "learning_rate": 4.576673683745721e-05, + "loss": 5.2304, + "step": 31605 + }, + { + "epoch": 0.18796983537919879, + "grad_norm": 1.5381817817687988, + "learning_rate": 4.5766476769312827e-05, + "loss": 5.399, + "step": 31606 + }, + { + "epoch": 0.18797578266247977, + "grad_norm": 1.4870381355285645, + "learning_rate": 4.576621669391908e-05, + "loss": 5.2942, + "step": 31607 + }, + { + "epoch": 0.1879817299457608, + "grad_norm": 1.8326987028121948, + "learning_rate": 4.576595661127606e-05, + "loss": 4.5692, + "step": 31608 + }, + { + "epoch": 0.18798767722904178, + "grad_norm": 1.8177613019943237, + "learning_rate": 4.5765696521383863e-05, + "loss": 4.6066, + "step": 31609 + }, + { + "epoch": 0.18799362451232277, + "grad_norm": 1.6168222427368164, + "learning_rate": 4.576543642424257e-05, + "loss": 5.113, + "step": 31610 + }, + { + "epoch": 0.18799957179560378, + "grad_norm": 1.8792698383331299, + "learning_rate": 4.5765176319852287e-05, + "loss": 4.9994, + "step": 31611 + }, + { + "epoch": 0.18800551907888477, + "grad_norm": 1.4694404602050781, + "learning_rate": 4.576491620821309e-05, + "loss": 4.9587, + "step": 31612 + }, + { + "epoch": 0.18801146636216576, + "grad_norm": 1.4442496299743652, + "learning_rate": 4.576465608932508e-05, + "loss": 5.0864, + "step": 31613 + }, + { + "epoch": 0.18801741364544677, + "grad_norm": 1.923790693283081, + "learning_rate": 4.5764395963188335e-05, + "loss": 4.4928, + "step": 31614 + }, + { + "epoch": 0.18802336092872776, + "grad_norm": 1.8033101558685303, + "learning_rate": 4.5764135829802956e-05, + "loss": 4.5554, + "step": 31615 + }, + { + "epoch": 0.18802930821200875, + "grad_norm": 1.7350363731384277, + "learning_rate": 4.5763875689169034e-05, + "loss": 4.4954, + "step": 31616 + }, + { + "epoch": 0.18803525549528974, + "grad_norm": 1.75509774684906, + "learning_rate": 4.576361554128665e-05, + "loss": 4.3791, + "step": 31617 + }, + { + "epoch": 0.18804120277857075, + "grad_norm": 1.8107062578201294, + "learning_rate": 4.576335538615592e-05, + "loss": 4.0603, + "step": 31618 + }, + { + "epoch": 0.18804715006185174, + "grad_norm": 1.824713110923767, + "learning_rate": 4.57630952237769e-05, + "loss": 4.2995, + "step": 31619 + }, + { + "epoch": 0.18805309734513273, + "grad_norm": 2.6946823596954346, + "learning_rate": 4.57628350541497e-05, + "loss": 4.1015, + "step": 31620 + }, + { + "epoch": 0.18805904462841375, + "grad_norm": 1.6974413394927979, + "learning_rate": 4.576257487727442e-05, + "loss": 4.603, + "step": 31621 + }, + { + "epoch": 0.18806499191169473, + "grad_norm": 2.0421180725097656, + "learning_rate": 4.576231469315113e-05, + "loss": 4.3945, + "step": 31622 + }, + { + "epoch": 0.18807093919497572, + "grad_norm": 1.8003754615783691, + "learning_rate": 4.5762054501779934e-05, + "loss": 4.5459, + "step": 31623 + }, + { + "epoch": 0.18807688647825674, + "grad_norm": 1.7390872240066528, + "learning_rate": 4.576179430316092e-05, + "loss": 4.4821, + "step": 31624 + }, + { + "epoch": 0.18808283376153773, + "grad_norm": 1.8832662105560303, + "learning_rate": 4.5761534097294174e-05, + "loss": 3.8606, + "step": 31625 + }, + { + "epoch": 0.18808878104481871, + "grad_norm": 1.6978578567504883, + "learning_rate": 4.576127388417979e-05, + "loss": 5.0896, + "step": 31626 + }, + { + "epoch": 0.18809472832809973, + "grad_norm": 2.140113592147827, + "learning_rate": 4.5761013663817864e-05, + "loss": 5.2355, + "step": 31627 + }, + { + "epoch": 0.18810067561138072, + "grad_norm": 1.6502524614334106, + "learning_rate": 4.576075343620848e-05, + "loss": 5.5907, + "step": 31628 + }, + { + "epoch": 0.1881066228946617, + "grad_norm": 1.6842014789581299, + "learning_rate": 4.576049320135174e-05, + "loss": 5.1909, + "step": 31629 + }, + { + "epoch": 0.18811257017794272, + "grad_norm": 1.5731878280639648, + "learning_rate": 4.576023295924772e-05, + "loss": 5.2126, + "step": 31630 + }, + { + "epoch": 0.1881185174612237, + "grad_norm": 1.822248101234436, + "learning_rate": 4.5759972709896516e-05, + "loss": 4.8566, + "step": 31631 + }, + { + "epoch": 0.1881244647445047, + "grad_norm": 1.8849093914031982, + "learning_rate": 4.575971245329822e-05, + "loss": 5.0437, + "step": 31632 + }, + { + "epoch": 0.18813041202778572, + "grad_norm": 1.7385406494140625, + "learning_rate": 4.575945218945292e-05, + "loss": 5.1574, + "step": 31633 + }, + { + "epoch": 0.1881363593110667, + "grad_norm": 1.8704962730407715, + "learning_rate": 4.5759191918360713e-05, + "loss": 5.4756, + "step": 31634 + }, + { + "epoch": 0.1881423065943477, + "grad_norm": 1.8415088653564453, + "learning_rate": 4.5758931640021684e-05, + "loss": 5.31, + "step": 31635 + }, + { + "epoch": 0.1881482538776287, + "grad_norm": 1.817290186882019, + "learning_rate": 4.5758671354435936e-05, + "loss": 5.2309, + "step": 31636 + }, + { + "epoch": 0.1881542011609097, + "grad_norm": 1.9851620197296143, + "learning_rate": 4.575841106160354e-05, + "loss": 5.1805, + "step": 31637 + }, + { + "epoch": 0.18816014844419068, + "grad_norm": 2.085020065307617, + "learning_rate": 4.57581507615246e-05, + "loss": 3.9923, + "step": 31638 + }, + { + "epoch": 0.1881660957274717, + "grad_norm": 1.8631166219711304, + "learning_rate": 4.57578904541992e-05, + "loss": 4.1787, + "step": 31639 + }, + { + "epoch": 0.1881720430107527, + "grad_norm": 2.2452220916748047, + "learning_rate": 4.5757630139627445e-05, + "loss": 3.9551, + "step": 31640 + }, + { + "epoch": 0.18817799029403368, + "grad_norm": 1.7852009534835815, + "learning_rate": 4.5757369817809415e-05, + "loss": 4.3387, + "step": 31641 + }, + { + "epoch": 0.1881839375773147, + "grad_norm": 1.7815812826156616, + "learning_rate": 4.5757109488745194e-05, + "loss": 4.3556, + "step": 31642 + }, + { + "epoch": 0.18818988486059568, + "grad_norm": 1.7845134735107422, + "learning_rate": 4.5756849152434884e-05, + "loss": 4.0154, + "step": 31643 + }, + { + "epoch": 0.18819583214387667, + "grad_norm": 2.093745231628418, + "learning_rate": 4.5756588808878574e-05, + "loss": 4.2242, + "step": 31644 + }, + { + "epoch": 0.18820177942715768, + "grad_norm": 1.9645696878433228, + "learning_rate": 4.575632845807635e-05, + "loss": 3.8064, + "step": 31645 + }, + { + "epoch": 0.18820772671043867, + "grad_norm": 2.1012284755706787, + "learning_rate": 4.57560681000283e-05, + "loss": 3.9011, + "step": 31646 + }, + { + "epoch": 0.18821367399371966, + "grad_norm": 1.9608296155929565, + "learning_rate": 4.575580773473454e-05, + "loss": 4.008, + "step": 31647 + }, + { + "epoch": 0.18821962127700068, + "grad_norm": 1.7520424127578735, + "learning_rate": 4.5755547362195125e-05, + "loss": 4.2574, + "step": 31648 + }, + { + "epoch": 0.18822556856028166, + "grad_norm": 1.8842599391937256, + "learning_rate": 4.5755286982410165e-05, + "loss": 4.1908, + "step": 31649 + }, + { + "epoch": 0.18823151584356265, + "grad_norm": 1.8884096145629883, + "learning_rate": 4.575502659537976e-05, + "loss": 4.2132, + "step": 31650 + }, + { + "epoch": 0.18823746312684367, + "grad_norm": 1.7970027923583984, + "learning_rate": 4.575476620110398e-05, + "loss": 4.2381, + "step": 31651 + }, + { + "epoch": 0.18824341041012466, + "grad_norm": 1.8529993295669556, + "learning_rate": 4.5754505799582925e-05, + "loss": 4.1563, + "step": 31652 + }, + { + "epoch": 0.18824935769340564, + "grad_norm": 1.8202285766601562, + "learning_rate": 4.5754245390816685e-05, + "loss": 3.8115, + "step": 31653 + }, + { + "epoch": 0.18825530497668666, + "grad_norm": 1.821083426475525, + "learning_rate": 4.575398497480536e-05, + "loss": 4.3038, + "step": 31654 + }, + { + "epoch": 0.18826125225996765, + "grad_norm": 2.2761406898498535, + "learning_rate": 4.575372455154903e-05, + "loss": 3.286, + "step": 31655 + }, + { + "epoch": 0.18826719954324864, + "grad_norm": 2.224435806274414, + "learning_rate": 4.575346412104779e-05, + "loss": 3.3841, + "step": 31656 + }, + { + "epoch": 0.18827314682652965, + "grad_norm": 2.656628370285034, + "learning_rate": 4.5753203683301725e-05, + "loss": 3.4385, + "step": 31657 + }, + { + "epoch": 0.18827909410981064, + "grad_norm": 2.2864227294921875, + "learning_rate": 4.5752943238310935e-05, + "loss": 3.5027, + "step": 31658 + }, + { + "epoch": 0.18828504139309163, + "grad_norm": 2.571734666824341, + "learning_rate": 4.575268278607551e-05, + "loss": 3.4458, + "step": 31659 + }, + { + "epoch": 0.18829098867637264, + "grad_norm": 2.2151083946228027, + "learning_rate": 4.5752422326595534e-05, + "loss": 3.9343, + "step": 31660 + }, + { + "epoch": 0.18829693595965363, + "grad_norm": 1.8273411989212036, + "learning_rate": 4.57521618598711e-05, + "loss": 4.1698, + "step": 31661 + }, + { + "epoch": 0.18830288324293462, + "grad_norm": 1.4451392889022827, + "learning_rate": 4.57519013859023e-05, + "loss": 5.1803, + "step": 31662 + }, + { + "epoch": 0.18830883052621564, + "grad_norm": 1.5774602890014648, + "learning_rate": 4.5751640904689233e-05, + "loss": 5.5158, + "step": 31663 + }, + { + "epoch": 0.18831477780949663, + "grad_norm": 1.76852548122406, + "learning_rate": 4.575138041623197e-05, + "loss": 4.6102, + "step": 31664 + }, + { + "epoch": 0.1883207250927776, + "grad_norm": 2.1750409603118896, + "learning_rate": 4.575111992053063e-05, + "loss": 4.2259, + "step": 31665 + }, + { + "epoch": 0.18832667237605863, + "grad_norm": 2.2930684089660645, + "learning_rate": 4.575085941758528e-05, + "loss": 3.645, + "step": 31666 + }, + { + "epoch": 0.18833261965933962, + "grad_norm": 1.6000158786773682, + "learning_rate": 4.5750598907396015e-05, + "loss": 4.7112, + "step": 31667 + }, + { + "epoch": 0.1883385669426206, + "grad_norm": 2.21150279045105, + "learning_rate": 4.5750338389962936e-05, + "loss": 2.8276, + "step": 31668 + }, + { + "epoch": 0.18834451422590162, + "grad_norm": 2.080242156982422, + "learning_rate": 4.575007786528613e-05, + "loss": 3.6421, + "step": 31669 + }, + { + "epoch": 0.1883504615091826, + "grad_norm": 1.7053500413894653, + "learning_rate": 4.5749817333365687e-05, + "loss": 5.3526, + "step": 31670 + }, + { + "epoch": 0.1883564087924636, + "grad_norm": 1.4372013807296753, + "learning_rate": 4.574955679420169e-05, + "loss": 5.0641, + "step": 31671 + }, + { + "epoch": 0.1883623560757446, + "grad_norm": 1.6831438541412354, + "learning_rate": 4.5749296247794246e-05, + "loss": 4.7807, + "step": 31672 + }, + { + "epoch": 0.1883683033590256, + "grad_norm": 1.7787952423095703, + "learning_rate": 4.574903569414343e-05, + "loss": 4.1886, + "step": 31673 + }, + { + "epoch": 0.1883742506423066, + "grad_norm": 2.1964874267578125, + "learning_rate": 4.5748775133249345e-05, + "loss": 3.8119, + "step": 31674 + }, + { + "epoch": 0.18838019792558758, + "grad_norm": 1.8583804368972778, + "learning_rate": 4.5748514565112074e-05, + "loss": 4.4153, + "step": 31675 + }, + { + "epoch": 0.1883861452088686, + "grad_norm": 1.8326549530029297, + "learning_rate": 4.574825398973171e-05, + "loss": 4.7196, + "step": 31676 + }, + { + "epoch": 0.18839209249214958, + "grad_norm": 1.685388207435608, + "learning_rate": 4.5747993407108345e-05, + "loss": 4.4115, + "step": 31677 + }, + { + "epoch": 0.18839803977543057, + "grad_norm": 1.5775798559188843, + "learning_rate": 4.574773281724207e-05, + "loss": 4.7152, + "step": 31678 + }, + { + "epoch": 0.1884039870587116, + "grad_norm": 1.4056192636489868, + "learning_rate": 4.574747222013298e-05, + "loss": 5.0494, + "step": 31679 + }, + { + "epoch": 0.18840993434199257, + "grad_norm": 1.5998051166534424, + "learning_rate": 4.574721161578115e-05, + "loss": 5.4125, + "step": 31680 + }, + { + "epoch": 0.18841588162527356, + "grad_norm": 1.418294072151184, + "learning_rate": 4.57469510041867e-05, + "loss": 5.2475, + "step": 31681 + }, + { + "epoch": 0.18842182890855458, + "grad_norm": 2.817990779876709, + "learning_rate": 4.574669038534969e-05, + "loss": 3.9644, + "step": 31682 + }, + { + "epoch": 0.18842777619183557, + "grad_norm": 1.8277714252471924, + "learning_rate": 4.574642975927023e-05, + "loss": 4.7339, + "step": 31683 + }, + { + "epoch": 0.18843372347511655, + "grad_norm": 1.7349371910095215, + "learning_rate": 4.5746169125948406e-05, + "loss": 5.0213, + "step": 31684 + }, + { + "epoch": 0.18843967075839757, + "grad_norm": 1.8414616584777832, + "learning_rate": 4.574590848538431e-05, + "loss": 4.6055, + "step": 31685 + }, + { + "epoch": 0.18844561804167856, + "grad_norm": 1.864438533782959, + "learning_rate": 4.574564783757803e-05, + "loss": 4.5234, + "step": 31686 + }, + { + "epoch": 0.18845156532495955, + "grad_norm": 1.8597543239593506, + "learning_rate": 4.574538718252966e-05, + "loss": 4.6606, + "step": 31687 + }, + { + "epoch": 0.18845751260824056, + "grad_norm": 1.7549642324447632, + "learning_rate": 4.574512652023929e-05, + "loss": 5.1434, + "step": 31688 + }, + { + "epoch": 0.18846345989152155, + "grad_norm": 1.574147343635559, + "learning_rate": 4.574486585070701e-05, + "loss": 5.2176, + "step": 31689 + }, + { + "epoch": 0.18846940717480254, + "grad_norm": 1.7602109909057617, + "learning_rate": 4.5744605173932906e-05, + "loss": 5.2822, + "step": 31690 + }, + { + "epoch": 0.18847535445808355, + "grad_norm": 1.6231430768966675, + "learning_rate": 4.574434448991708e-05, + "loss": 5.3009, + "step": 31691 + }, + { + "epoch": 0.18848130174136454, + "grad_norm": 1.9236938953399658, + "learning_rate": 4.5744083798659615e-05, + "loss": 5.1129, + "step": 31692 + }, + { + "epoch": 0.18848724902464553, + "grad_norm": 1.755083680152893, + "learning_rate": 4.574382310016061e-05, + "loss": 4.6636, + "step": 31693 + }, + { + "epoch": 0.18849319630792655, + "grad_norm": 1.7704771757125854, + "learning_rate": 4.574356239442015e-05, + "loss": 4.7429, + "step": 31694 + }, + { + "epoch": 0.18849914359120754, + "grad_norm": 2.079738140106201, + "learning_rate": 4.574330168143831e-05, + "loss": 4.1216, + "step": 31695 + }, + { + "epoch": 0.18850509087448852, + "grad_norm": 1.823591947555542, + "learning_rate": 4.574304096121522e-05, + "loss": 4.3193, + "step": 31696 + }, + { + "epoch": 0.18851103815776954, + "grad_norm": 1.7429176568984985, + "learning_rate": 4.5742780233750936e-05, + "loss": 4.9425, + "step": 31697 + }, + { + "epoch": 0.18851698544105053, + "grad_norm": 1.4497638940811157, + "learning_rate": 4.5742519499045565e-05, + "loss": 4.9634, + "step": 31698 + }, + { + "epoch": 0.18852293272433152, + "grad_norm": 1.698063850402832, + "learning_rate": 4.57422587570992e-05, + "loss": 4.8983, + "step": 31699 + }, + { + "epoch": 0.18852888000761253, + "grad_norm": 1.638048768043518, + "learning_rate": 4.574199800791192e-05, + "loss": 4.5512, + "step": 31700 + }, + { + "epoch": 0.18853482729089352, + "grad_norm": 1.8207498788833618, + "learning_rate": 4.574173725148383e-05, + "loss": 4.646, + "step": 31701 + }, + { + "epoch": 0.1885407745741745, + "grad_norm": 1.7710716724395752, + "learning_rate": 4.5741476487815006e-05, + "loss": 4.5777, + "step": 31702 + }, + { + "epoch": 0.18854672185745552, + "grad_norm": 2.0382273197174072, + "learning_rate": 4.574121571690555e-05, + "loss": 3.9293, + "step": 31703 + }, + { + "epoch": 0.1885526691407365, + "grad_norm": 1.8165003061294556, + "learning_rate": 4.574095493875555e-05, + "loss": 4.3543, + "step": 31704 + }, + { + "epoch": 0.1885586164240175, + "grad_norm": 1.7196195125579834, + "learning_rate": 4.57406941533651e-05, + "loss": 4.1694, + "step": 31705 + }, + { + "epoch": 0.18856456370729852, + "grad_norm": 1.9387542009353638, + "learning_rate": 4.574043336073428e-05, + "loss": 4.1091, + "step": 31706 + }, + { + "epoch": 0.1885705109905795, + "grad_norm": 1.605260968208313, + "learning_rate": 4.5740172560863194e-05, + "loss": 4.957, + "step": 31707 + }, + { + "epoch": 0.1885764582738605, + "grad_norm": 1.6566966772079468, + "learning_rate": 4.573991175375193e-05, + "loss": 4.9981, + "step": 31708 + }, + { + "epoch": 0.1885824055571415, + "grad_norm": 1.6529970169067383, + "learning_rate": 4.573965093940058e-05, + "loss": 4.9042, + "step": 31709 + }, + { + "epoch": 0.1885883528404225, + "grad_norm": 1.7275558710098267, + "learning_rate": 4.573939011780922e-05, + "loss": 4.4827, + "step": 31710 + }, + { + "epoch": 0.18859430012370348, + "grad_norm": 1.8472176790237427, + "learning_rate": 4.573912928897796e-05, + "loss": 3.9163, + "step": 31711 + }, + { + "epoch": 0.1886002474069845, + "grad_norm": 2.4503021240234375, + "learning_rate": 4.5738868452906886e-05, + "loss": 3.4869, + "step": 31712 + }, + { + "epoch": 0.1886061946902655, + "grad_norm": 2.6473751068115234, + "learning_rate": 4.573860760959608e-05, + "loss": 2.8014, + "step": 31713 + }, + { + "epoch": 0.18861214197354648, + "grad_norm": 2.5864624977111816, + "learning_rate": 4.5738346759045646e-05, + "loss": 3.4543, + "step": 31714 + }, + { + "epoch": 0.1886180892568275, + "grad_norm": 2.3731234073638916, + "learning_rate": 4.5738085901255674e-05, + "loss": 3.2747, + "step": 31715 + }, + { + "epoch": 0.18862403654010848, + "grad_norm": 2.6904380321502686, + "learning_rate": 4.573782503622625e-05, + "loss": 3.3082, + "step": 31716 + }, + { + "epoch": 0.18862998382338947, + "grad_norm": 2.2932288646698, + "learning_rate": 4.573756416395746e-05, + "loss": 3.5169, + "step": 31717 + }, + { + "epoch": 0.18863593110667048, + "grad_norm": 2.7179884910583496, + "learning_rate": 4.573730328444939e-05, + "loss": 3.6862, + "step": 31718 + }, + { + "epoch": 0.18864187838995147, + "grad_norm": 3.153721332550049, + "learning_rate": 4.573704239770216e-05, + "loss": 4.091, + "step": 31719 + }, + { + "epoch": 0.18864782567323246, + "grad_norm": 2.8296713829040527, + "learning_rate": 4.5736781503715844e-05, + "loss": 3.9948, + "step": 31720 + }, + { + "epoch": 0.18865377295651348, + "grad_norm": 2.557539701461792, + "learning_rate": 4.573652060249052e-05, + "loss": 3.7615, + "step": 31721 + }, + { + "epoch": 0.18865972023979447, + "grad_norm": 2.5534744262695312, + "learning_rate": 4.57362596940263e-05, + "loss": 3.1447, + "step": 31722 + }, + { + "epoch": 0.18866566752307545, + "grad_norm": 2.5435099601745605, + "learning_rate": 4.573599877832325e-05, + "loss": 3.0014, + "step": 31723 + }, + { + "epoch": 0.18867161480635647, + "grad_norm": 2.939772605895996, + "learning_rate": 4.573573785538149e-05, + "loss": 4.174, + "step": 31724 + }, + { + "epoch": 0.18867756208963746, + "grad_norm": 2.6355509757995605, + "learning_rate": 4.5735476925201095e-05, + "loss": 3.5901, + "step": 31725 + }, + { + "epoch": 0.18868350937291845, + "grad_norm": 2.7327325344085693, + "learning_rate": 4.5735215987782166e-05, + "loss": 3.6472, + "step": 31726 + }, + { + "epoch": 0.18868945665619946, + "grad_norm": 2.610990285873413, + "learning_rate": 4.5734955043124784e-05, + "loss": 3.8589, + "step": 31727 + }, + { + "epoch": 0.18869540393948045, + "grad_norm": 2.165526866912842, + "learning_rate": 4.573469409122904e-05, + "loss": 3.7147, + "step": 31728 + }, + { + "epoch": 0.18870135122276144, + "grad_norm": 2.7130625247955322, + "learning_rate": 4.573443313209503e-05, + "loss": 3.5475, + "step": 31729 + }, + { + "epoch": 0.18870729850604245, + "grad_norm": 2.849789619445801, + "learning_rate": 4.5734172165722847e-05, + "loss": 2.8693, + "step": 31730 + }, + { + "epoch": 0.18871324578932344, + "grad_norm": 2.405048370361328, + "learning_rate": 4.5733911192112564e-05, + "loss": 2.8687, + "step": 31731 + }, + { + "epoch": 0.18871919307260443, + "grad_norm": 2.5103397369384766, + "learning_rate": 4.5733650211264304e-05, + "loss": 3.4796, + "step": 31732 + }, + { + "epoch": 0.18872514035588542, + "grad_norm": 2.737912654876709, + "learning_rate": 4.573338922317813e-05, + "loss": 3.7991, + "step": 31733 + }, + { + "epoch": 0.18873108763916643, + "grad_norm": 1.9937493801116943, + "learning_rate": 4.573312822785415e-05, + "loss": 4.6986, + "step": 31734 + }, + { + "epoch": 0.18873703492244742, + "grad_norm": 1.9701282978057861, + "learning_rate": 4.5732867225292455e-05, + "loss": 4.6174, + "step": 31735 + }, + { + "epoch": 0.1887429822057284, + "grad_norm": 1.8944740295410156, + "learning_rate": 4.5732606215493116e-05, + "loss": 4.9427, + "step": 31736 + }, + { + "epoch": 0.18874892948900943, + "grad_norm": 1.601288080215454, + "learning_rate": 4.573234519845625e-05, + "loss": 5.2254, + "step": 31737 + }, + { + "epoch": 0.18875487677229041, + "grad_norm": 1.7480894327163696, + "learning_rate": 4.5732084174181936e-05, + "loss": 4.9654, + "step": 31738 + }, + { + "epoch": 0.1887608240555714, + "grad_norm": 2.1990926265716553, + "learning_rate": 4.5731823142670256e-05, + "loss": 4.3537, + "step": 31739 + }, + { + "epoch": 0.18876677133885242, + "grad_norm": 1.7892099618911743, + "learning_rate": 4.573156210392132e-05, + "loss": 4.9544, + "step": 31740 + }, + { + "epoch": 0.1887727186221334, + "grad_norm": 1.6811445951461792, + "learning_rate": 4.57313010579352e-05, + "loss": 4.9715, + "step": 31741 + }, + { + "epoch": 0.1887786659054144, + "grad_norm": 1.6930960416793823, + "learning_rate": 4.5731040004712006e-05, + "loss": 4.8801, + "step": 31742 + }, + { + "epoch": 0.1887846131886954, + "grad_norm": 1.424654245376587, + "learning_rate": 4.573077894425182e-05, + "loss": 5.0213, + "step": 31743 + }, + { + "epoch": 0.1887905604719764, + "grad_norm": 2.3140928745269775, + "learning_rate": 4.573051787655472e-05, + "loss": 4.1046, + "step": 31744 + }, + { + "epoch": 0.1887965077552574, + "grad_norm": 1.9428435564041138, + "learning_rate": 4.573025680162083e-05, + "loss": 4.7393, + "step": 31745 + }, + { + "epoch": 0.1888024550385384, + "grad_norm": 1.5271058082580566, + "learning_rate": 4.572999571945021e-05, + "loss": 4.9096, + "step": 31746 + }, + { + "epoch": 0.1888084023218194, + "grad_norm": 1.8436189889907837, + "learning_rate": 4.5729734630042964e-05, + "loss": 4.5118, + "step": 31747 + }, + { + "epoch": 0.18881434960510038, + "grad_norm": 1.3995059728622437, + "learning_rate": 4.572947353339918e-05, + "loss": 5.3253, + "step": 31748 + }, + { + "epoch": 0.1888202968883814, + "grad_norm": 1.628862738609314, + "learning_rate": 4.572921242951896e-05, + "loss": 5.0558, + "step": 31749 + }, + { + "epoch": 0.18882624417166238, + "grad_norm": 1.7975375652313232, + "learning_rate": 4.572895131840238e-05, + "loss": 4.8898, + "step": 31750 + }, + { + "epoch": 0.18883219145494337, + "grad_norm": 1.7084927558898926, + "learning_rate": 4.572869020004953e-05, + "loss": 5.0099, + "step": 31751 + }, + { + "epoch": 0.1888381387382244, + "grad_norm": 1.518763542175293, + "learning_rate": 4.572842907446052e-05, + "loss": 4.8393, + "step": 31752 + }, + { + "epoch": 0.18884408602150538, + "grad_norm": 1.4125078916549683, + "learning_rate": 4.5728167941635425e-05, + "loss": 4.6985, + "step": 31753 + }, + { + "epoch": 0.18885003330478636, + "grad_norm": 1.5411655902862549, + "learning_rate": 4.572790680157434e-05, + "loss": 4.6634, + "step": 31754 + }, + { + "epoch": 0.18885598058806738, + "grad_norm": 1.5578504800796509, + "learning_rate": 4.572764565427736e-05, + "loss": 4.6396, + "step": 31755 + }, + { + "epoch": 0.18886192787134837, + "grad_norm": 1.4977939128875732, + "learning_rate": 4.572738449974457e-05, + "loss": 4.742, + "step": 31756 + }, + { + "epoch": 0.18886787515462936, + "grad_norm": 1.4126876592636108, + "learning_rate": 4.572712333797606e-05, + "loss": 5.0874, + "step": 31757 + }, + { + "epoch": 0.18887382243791037, + "grad_norm": 1.4195787906646729, + "learning_rate": 4.572686216897194e-05, + "loss": 4.663, + "step": 31758 + }, + { + "epoch": 0.18887976972119136, + "grad_norm": 1.4082183837890625, + "learning_rate": 4.5726600992732274e-05, + "loss": 4.7646, + "step": 31759 + }, + { + "epoch": 0.18888571700447235, + "grad_norm": 1.3189529180526733, + "learning_rate": 4.572633980925717e-05, + "loss": 4.6805, + "step": 31760 + }, + { + "epoch": 0.18889166428775336, + "grad_norm": 1.589459776878357, + "learning_rate": 4.5726078618546707e-05, + "loss": 4.7832, + "step": 31761 + }, + { + "epoch": 0.18889761157103435, + "grad_norm": 1.7522832155227661, + "learning_rate": 4.5725817420600995e-05, + "loss": 4.384, + "step": 31762 + }, + { + "epoch": 0.18890355885431534, + "grad_norm": 1.5586985349655151, + "learning_rate": 4.5725556215420104e-05, + "loss": 4.9273, + "step": 31763 + }, + { + "epoch": 0.18890950613759636, + "grad_norm": 1.8645926713943481, + "learning_rate": 4.572529500300415e-05, + "loss": 4.8043, + "step": 31764 + }, + { + "epoch": 0.18891545342087734, + "grad_norm": 1.742484450340271, + "learning_rate": 4.57250337833532e-05, + "loss": 4.8432, + "step": 31765 + }, + { + "epoch": 0.18892140070415833, + "grad_norm": 1.6944609880447388, + "learning_rate": 4.572477255646736e-05, + "loss": 4.8162, + "step": 31766 + }, + { + "epoch": 0.18892734798743935, + "grad_norm": 1.5811859369277954, + "learning_rate": 4.5724511322346705e-05, + "loss": 4.6349, + "step": 31767 + }, + { + "epoch": 0.18893329527072034, + "grad_norm": 1.6676167249679565, + "learning_rate": 4.572425008099135e-05, + "loss": 4.7742, + "step": 31768 + }, + { + "epoch": 0.18893924255400132, + "grad_norm": 2.0189199447631836, + "learning_rate": 4.5723988832401366e-05, + "loss": 4.5048, + "step": 31769 + }, + { + "epoch": 0.18894518983728234, + "grad_norm": 1.9740796089172363, + "learning_rate": 4.572372757657685e-05, + "loss": 4.7859, + "step": 31770 + }, + { + "epoch": 0.18895113712056333, + "grad_norm": 1.780246615409851, + "learning_rate": 4.57234663135179e-05, + "loss": 4.5585, + "step": 31771 + }, + { + "epoch": 0.18895708440384432, + "grad_norm": 1.744525671005249, + "learning_rate": 4.5723205043224596e-05, + "loss": 4.6835, + "step": 31772 + }, + { + "epoch": 0.18896303168712533, + "grad_norm": 2.0885379314422607, + "learning_rate": 4.572294376569704e-05, + "loss": 4.1156, + "step": 31773 + }, + { + "epoch": 0.18896897897040632, + "grad_norm": 1.7957887649536133, + "learning_rate": 4.572268248093532e-05, + "loss": 4.7995, + "step": 31774 + }, + { + "epoch": 0.1889749262536873, + "grad_norm": 1.7398995161056519, + "learning_rate": 4.572242118893953e-05, + "loss": 4.649, + "step": 31775 + }, + { + "epoch": 0.18898087353696832, + "grad_norm": 1.6801024675369263, + "learning_rate": 4.572215988970974e-05, + "loss": 4.6727, + "step": 31776 + }, + { + "epoch": 0.1889868208202493, + "grad_norm": 1.7167788743972778, + "learning_rate": 4.572189858324607e-05, + "loss": 4.8166, + "step": 31777 + }, + { + "epoch": 0.1889927681035303, + "grad_norm": 1.853050947189331, + "learning_rate": 4.57216372695486e-05, + "loss": 4.6311, + "step": 31778 + }, + { + "epoch": 0.18899871538681132, + "grad_norm": 1.5725040435791016, + "learning_rate": 4.5721375948617416e-05, + "loss": 4.6623, + "step": 31779 + }, + { + "epoch": 0.1890046626700923, + "grad_norm": 1.5537104606628418, + "learning_rate": 4.572111462045261e-05, + "loss": 4.3422, + "step": 31780 + }, + { + "epoch": 0.1890106099533733, + "grad_norm": 1.5853313207626343, + "learning_rate": 4.572085328505429e-05, + "loss": 4.6766, + "step": 31781 + }, + { + "epoch": 0.1890165572366543, + "grad_norm": 1.4046406745910645, + "learning_rate": 4.5720591942422524e-05, + "loss": 4.5923, + "step": 31782 + }, + { + "epoch": 0.1890225045199353, + "grad_norm": 1.8137469291687012, + "learning_rate": 4.5720330592557415e-05, + "loss": 4.4214, + "step": 31783 + }, + { + "epoch": 0.18902845180321629, + "grad_norm": 1.552449107170105, + "learning_rate": 4.572006923545906e-05, + "loss": 4.5052, + "step": 31784 + }, + { + "epoch": 0.1890343990864973, + "grad_norm": 1.7049251794815063, + "learning_rate": 4.571980787112753e-05, + "loss": 4.4893, + "step": 31785 + }, + { + "epoch": 0.1890403463697783, + "grad_norm": 1.8483717441558838, + "learning_rate": 4.5719546499562937e-05, + "loss": 4.7381, + "step": 31786 + }, + { + "epoch": 0.18904629365305928, + "grad_norm": 1.7575819492340088, + "learning_rate": 4.5719285120765363e-05, + "loss": 4.8062, + "step": 31787 + }, + { + "epoch": 0.1890522409363403, + "grad_norm": 1.5546499490737915, + "learning_rate": 4.57190237347349e-05, + "loss": 4.6929, + "step": 31788 + }, + { + "epoch": 0.18905818821962128, + "grad_norm": 1.3272528648376465, + "learning_rate": 4.5718762341471645e-05, + "loss": 4.7948, + "step": 31789 + }, + { + "epoch": 0.18906413550290227, + "grad_norm": 1.6771869659423828, + "learning_rate": 4.571850094097568e-05, + "loss": 5.0822, + "step": 31790 + }, + { + "epoch": 0.18907008278618326, + "grad_norm": 1.6481125354766846, + "learning_rate": 4.57182395332471e-05, + "loss": 4.4177, + "step": 31791 + }, + { + "epoch": 0.18907603006946427, + "grad_norm": 1.8274805545806885, + "learning_rate": 4.5717978118286e-05, + "loss": 4.4042, + "step": 31792 + }, + { + "epoch": 0.18908197735274526, + "grad_norm": 1.6022708415985107, + "learning_rate": 4.5717716696092464e-05, + "loss": 4.3987, + "step": 31793 + }, + { + "epoch": 0.18908792463602625, + "grad_norm": 1.7151497602462769, + "learning_rate": 4.5717455266666586e-05, + "loss": 4.3557, + "step": 31794 + }, + { + "epoch": 0.18909387191930727, + "grad_norm": 1.4646251201629639, + "learning_rate": 4.5717193830008457e-05, + "loss": 4.5261, + "step": 31795 + }, + { + "epoch": 0.18909981920258825, + "grad_norm": 1.6726088523864746, + "learning_rate": 4.571693238611817e-05, + "loss": 4.6804, + "step": 31796 + }, + { + "epoch": 0.18910576648586924, + "grad_norm": 2.105010509490967, + "learning_rate": 4.571667093499583e-05, + "loss": 4.0689, + "step": 31797 + }, + { + "epoch": 0.18911171376915026, + "grad_norm": 1.9176527261734009, + "learning_rate": 4.57164094766415e-05, + "loss": 4.3012, + "step": 31798 + }, + { + "epoch": 0.18911766105243125, + "grad_norm": 1.653403639793396, + "learning_rate": 4.571614801105528e-05, + "loss": 4.7231, + "step": 31799 + }, + { + "epoch": 0.18912360833571223, + "grad_norm": 2.0761914253234863, + "learning_rate": 4.571588653823728e-05, + "loss": 3.5717, + "step": 31800 + }, + { + "epoch": 0.18912955561899325, + "grad_norm": 1.7735234498977661, + "learning_rate": 4.5715625058187574e-05, + "loss": 4.4324, + "step": 31801 + }, + { + "epoch": 0.18913550290227424, + "grad_norm": 1.6627693176269531, + "learning_rate": 4.5715363570906256e-05, + "loss": 4.5788, + "step": 31802 + }, + { + "epoch": 0.18914145018555523, + "grad_norm": 1.888426661491394, + "learning_rate": 4.5715102076393425e-05, + "loss": 4.8467, + "step": 31803 + }, + { + "epoch": 0.18914739746883624, + "grad_norm": 1.6043262481689453, + "learning_rate": 4.5714840574649155e-05, + "loss": 4.974, + "step": 31804 + }, + { + "epoch": 0.18915334475211723, + "grad_norm": 1.7923390865325928, + "learning_rate": 4.5714579065673555e-05, + "loss": 4.9429, + "step": 31805 + }, + { + "epoch": 0.18915929203539822, + "grad_norm": 2.3900370597839355, + "learning_rate": 4.571431754946671e-05, + "loss": 3.9334, + "step": 31806 + }, + { + "epoch": 0.18916523931867923, + "grad_norm": 1.6400319337844849, + "learning_rate": 4.571405602602871e-05, + "loss": 4.6774, + "step": 31807 + }, + { + "epoch": 0.18917118660196022, + "grad_norm": 2.724771738052368, + "learning_rate": 4.571379449535964e-05, + "loss": 3.6422, + "step": 31808 + }, + { + "epoch": 0.1891771338852412, + "grad_norm": 1.6361116170883179, + "learning_rate": 4.5713532957459605e-05, + "loss": 4.7115, + "step": 31809 + }, + { + "epoch": 0.18918308116852223, + "grad_norm": 1.7161823511123657, + "learning_rate": 4.571327141232869e-05, + "loss": 4.6349, + "step": 31810 + }, + { + "epoch": 0.18918902845180322, + "grad_norm": 1.8056199550628662, + "learning_rate": 4.571300985996698e-05, + "loss": 4.3451, + "step": 31811 + }, + { + "epoch": 0.1891949757350842, + "grad_norm": 1.9487394094467163, + "learning_rate": 4.571274830037458e-05, + "loss": 4.3959, + "step": 31812 + }, + { + "epoch": 0.18920092301836522, + "grad_norm": 1.9206527471542358, + "learning_rate": 4.5712486733551574e-05, + "loss": 4.5261, + "step": 31813 + }, + { + "epoch": 0.1892068703016462, + "grad_norm": 1.872023344039917, + "learning_rate": 4.5712225159498046e-05, + "loss": 4.7062, + "step": 31814 + }, + { + "epoch": 0.1892128175849272, + "grad_norm": 2.086467981338501, + "learning_rate": 4.57119635782141e-05, + "loss": 3.7319, + "step": 31815 + }, + { + "epoch": 0.1892187648682082, + "grad_norm": 1.942416787147522, + "learning_rate": 4.571170198969982e-05, + "loss": 4.3991, + "step": 31816 + }, + { + "epoch": 0.1892247121514892, + "grad_norm": 1.8786695003509521, + "learning_rate": 4.5711440393955295e-05, + "loss": 4.9689, + "step": 31817 + }, + { + "epoch": 0.1892306594347702, + "grad_norm": 1.9197100400924683, + "learning_rate": 4.571117879098063e-05, + "loss": 5.0809, + "step": 31818 + }, + { + "epoch": 0.1892366067180512, + "grad_norm": 1.9586657285690308, + "learning_rate": 4.571091718077589e-05, + "loss": 4.5672, + "step": 31819 + }, + { + "epoch": 0.1892425540013322, + "grad_norm": 1.8381383419036865, + "learning_rate": 4.5710655563341196e-05, + "loss": 4.2682, + "step": 31820 + }, + { + "epoch": 0.18924850128461318, + "grad_norm": 1.8966319561004639, + "learning_rate": 4.571039393867662e-05, + "loss": 3.4864, + "step": 31821 + }, + { + "epoch": 0.1892544485678942, + "grad_norm": 1.8893778324127197, + "learning_rate": 4.571013230678226e-05, + "loss": 3.6155, + "step": 31822 + }, + { + "epoch": 0.18926039585117518, + "grad_norm": 1.9687188863754272, + "learning_rate": 4.570987066765821e-05, + "loss": 3.6131, + "step": 31823 + }, + { + "epoch": 0.18926634313445617, + "grad_norm": 1.932376742362976, + "learning_rate": 4.570960902130456e-05, + "loss": 3.6234, + "step": 31824 + }, + { + "epoch": 0.1892722904177372, + "grad_norm": 1.8500068187713623, + "learning_rate": 4.570934736772139e-05, + "loss": 3.5913, + "step": 31825 + }, + { + "epoch": 0.18927823770101818, + "grad_norm": 1.765598177909851, + "learning_rate": 4.570908570690881e-05, + "loss": 3.4619, + "step": 31826 + }, + { + "epoch": 0.18928418498429916, + "grad_norm": 1.8413002490997314, + "learning_rate": 4.57088240388669e-05, + "loss": 3.596, + "step": 31827 + }, + { + "epoch": 0.18929013226758018, + "grad_norm": 1.9876320362091064, + "learning_rate": 4.570856236359575e-05, + "loss": 3.4154, + "step": 31828 + }, + { + "epoch": 0.18929607955086117, + "grad_norm": 1.9374988079071045, + "learning_rate": 4.570830068109546e-05, + "loss": 3.5511, + "step": 31829 + }, + { + "epoch": 0.18930202683414216, + "grad_norm": 1.971796989440918, + "learning_rate": 4.570803899136611e-05, + "loss": 3.4346, + "step": 31830 + }, + { + "epoch": 0.18930797411742317, + "grad_norm": 1.9629862308502197, + "learning_rate": 4.57077772944078e-05, + "loss": 3.7305, + "step": 31831 + }, + { + "epoch": 0.18931392140070416, + "grad_norm": 2.155545234680176, + "learning_rate": 4.5707515590220625e-05, + "loss": 3.9258, + "step": 31832 + }, + { + "epoch": 0.18931986868398515, + "grad_norm": 2.084571123123169, + "learning_rate": 4.5707253878804665e-05, + "loss": 3.5942, + "step": 31833 + }, + { + "epoch": 0.18932581596726616, + "grad_norm": 2.053971529006958, + "learning_rate": 4.570699216016001e-05, + "loss": 3.5606, + "step": 31834 + }, + { + "epoch": 0.18933176325054715, + "grad_norm": 1.9901275634765625, + "learning_rate": 4.570673043428677e-05, + "loss": 3.4579, + "step": 31835 + }, + { + "epoch": 0.18933771053382814, + "grad_norm": 1.846103549003601, + "learning_rate": 4.570646870118502e-05, + "loss": 3.319, + "step": 31836 + }, + { + "epoch": 0.18934365781710916, + "grad_norm": 2.0345115661621094, + "learning_rate": 4.570620696085486e-05, + "loss": 3.4513, + "step": 31837 + }, + { + "epoch": 0.18934960510039014, + "grad_norm": 1.8338862657546997, + "learning_rate": 4.570594521329636e-05, + "loss": 3.5135, + "step": 31838 + }, + { + "epoch": 0.18935555238367113, + "grad_norm": 1.9724763631820679, + "learning_rate": 4.5705683458509646e-05, + "loss": 3.4845, + "step": 31839 + }, + { + "epoch": 0.18936149966695215, + "grad_norm": 1.9579484462738037, + "learning_rate": 4.570542169649479e-05, + "loss": 3.4505, + "step": 31840 + }, + { + "epoch": 0.18936744695023314, + "grad_norm": 1.915587067604065, + "learning_rate": 4.570515992725187e-05, + "loss": 3.4607, + "step": 31841 + }, + { + "epoch": 0.18937339423351413, + "grad_norm": 2.1304988861083984, + "learning_rate": 4.5704898150781004e-05, + "loss": 3.4474, + "step": 31842 + }, + { + "epoch": 0.18937934151679514, + "grad_norm": 1.8973450660705566, + "learning_rate": 4.5704636367082275e-05, + "loss": 3.9997, + "step": 31843 + }, + { + "epoch": 0.18938528880007613, + "grad_norm": 2.0216281414031982, + "learning_rate": 4.570437457615577e-05, + "loss": 3.4405, + "step": 31844 + }, + { + "epoch": 0.18939123608335712, + "grad_norm": 1.8983052968978882, + "learning_rate": 4.5704112778001586e-05, + "loss": 3.5817, + "step": 31845 + }, + { + "epoch": 0.18939718336663813, + "grad_norm": 1.9334758520126343, + "learning_rate": 4.5703850972619796e-05, + "loss": 3.6004, + "step": 31846 + }, + { + "epoch": 0.18940313064991912, + "grad_norm": 1.9281392097473145, + "learning_rate": 4.570358916001052e-05, + "loss": 3.5149, + "step": 31847 + }, + { + "epoch": 0.1894090779332001, + "grad_norm": 2.114772081375122, + "learning_rate": 4.5703327340173826e-05, + "loss": 3.4635, + "step": 31848 + }, + { + "epoch": 0.1894150252164811, + "grad_norm": 1.9354569911956787, + "learning_rate": 4.5703065513109815e-05, + "loss": 3.4728, + "step": 31849 + }, + { + "epoch": 0.1894209724997621, + "grad_norm": 1.9464221000671387, + "learning_rate": 4.570280367881859e-05, + "loss": 3.364, + "step": 31850 + }, + { + "epoch": 0.1894269197830431, + "grad_norm": 1.9195743799209595, + "learning_rate": 4.570254183730021e-05, + "loss": 3.4428, + "step": 31851 + }, + { + "epoch": 0.1894328670663241, + "grad_norm": 1.9214719533920288, + "learning_rate": 4.57022799885548e-05, + "loss": 3.4926, + "step": 31852 + }, + { + "epoch": 0.1894388143496051, + "grad_norm": 1.9174745082855225, + "learning_rate": 4.5702018132582435e-05, + "loss": 3.4202, + "step": 31853 + }, + { + "epoch": 0.1894447616328861, + "grad_norm": 1.8605272769927979, + "learning_rate": 4.57017562693832e-05, + "loss": 3.7625, + "step": 31854 + }, + { + "epoch": 0.18945070891616708, + "grad_norm": 1.8724991083145142, + "learning_rate": 4.5701494398957214e-05, + "loss": 3.9864, + "step": 31855 + }, + { + "epoch": 0.1894566561994481, + "grad_norm": 1.8957018852233887, + "learning_rate": 4.570123252130454e-05, + "loss": 3.4135, + "step": 31856 + }, + { + "epoch": 0.1894626034827291, + "grad_norm": 1.8492218255996704, + "learning_rate": 4.570097063642528e-05, + "loss": 3.4631, + "step": 31857 + }, + { + "epoch": 0.18946855076601007, + "grad_norm": 1.955808401107788, + "learning_rate": 4.570070874431952e-05, + "loss": 3.3076, + "step": 31858 + }, + { + "epoch": 0.1894744980492911, + "grad_norm": 1.8996524810791016, + "learning_rate": 4.570044684498737e-05, + "loss": 3.3948, + "step": 31859 + }, + { + "epoch": 0.18948044533257208, + "grad_norm": 1.89797842502594, + "learning_rate": 4.570018493842889e-05, + "loss": 3.4224, + "step": 31860 + }, + { + "epoch": 0.18948639261585307, + "grad_norm": 1.8745187520980835, + "learning_rate": 4.5699923024644195e-05, + "loss": 3.4154, + "step": 31861 + }, + { + "epoch": 0.18949233989913408, + "grad_norm": 1.9138267040252686, + "learning_rate": 4.569966110363338e-05, + "loss": 3.2955, + "step": 31862 + }, + { + "epoch": 0.18949828718241507, + "grad_norm": 1.970042109489441, + "learning_rate": 4.569939917539652e-05, + "loss": 3.328, + "step": 31863 + }, + { + "epoch": 0.18950423446569606, + "grad_norm": 1.8819622993469238, + "learning_rate": 4.5699137239933716e-05, + "loss": 3.358, + "step": 31864 + }, + { + "epoch": 0.18951018174897707, + "grad_norm": 1.7060779333114624, + "learning_rate": 4.569887529724506e-05, + "loss": 3.3375, + "step": 31865 + }, + { + "epoch": 0.18951612903225806, + "grad_norm": 1.6891839504241943, + "learning_rate": 4.569861334733063e-05, + "loss": 5.0216, + "step": 31866 + }, + { + "epoch": 0.18952207631553905, + "grad_norm": 1.8553059101104736, + "learning_rate": 4.569835139019054e-05, + "loss": 3.5624, + "step": 31867 + }, + { + "epoch": 0.18952802359882007, + "grad_norm": 2.997297763824463, + "learning_rate": 4.569808942582486e-05, + "loss": 4.0015, + "step": 31868 + }, + { + "epoch": 0.18953397088210105, + "grad_norm": 1.9399126768112183, + "learning_rate": 4.56978274542337e-05, + "loss": 3.3589, + "step": 31869 + }, + { + "epoch": 0.18953991816538204, + "grad_norm": 2.066025733947754, + "learning_rate": 4.5697565475417135e-05, + "loss": 3.3023, + "step": 31870 + }, + { + "epoch": 0.18954586544866306, + "grad_norm": 1.9673593044281006, + "learning_rate": 4.5697303489375266e-05, + "loss": 3.1509, + "step": 31871 + }, + { + "epoch": 0.18955181273194405, + "grad_norm": 1.9587528705596924, + "learning_rate": 4.569704149610818e-05, + "loss": 3.4391, + "step": 31872 + }, + { + "epoch": 0.18955776001522504, + "grad_norm": 1.9434003829956055, + "learning_rate": 4.5696779495615974e-05, + "loss": 3.4094, + "step": 31873 + }, + { + "epoch": 0.18956370729850605, + "grad_norm": 2.1255874633789062, + "learning_rate": 4.5696517487898735e-05, + "loss": 3.4675, + "step": 31874 + }, + { + "epoch": 0.18956965458178704, + "grad_norm": 1.951338291168213, + "learning_rate": 4.569625547295655e-05, + "loss": 3.4062, + "step": 31875 + }, + { + "epoch": 0.18957560186506803, + "grad_norm": 1.9367462396621704, + "learning_rate": 4.5695993450789523e-05, + "loss": 3.4532, + "step": 31876 + }, + { + "epoch": 0.18958154914834904, + "grad_norm": 1.9697223901748657, + "learning_rate": 4.5695731421397734e-05, + "loss": 3.3318, + "step": 31877 + }, + { + "epoch": 0.18958749643163003, + "grad_norm": 1.6190401315689087, + "learning_rate": 4.569546938478129e-05, + "loss": 4.3346, + "step": 31878 + }, + { + "epoch": 0.18959344371491102, + "grad_norm": 1.9056912660598755, + "learning_rate": 4.569520734094026e-05, + "loss": 4.6924, + "step": 31879 + }, + { + "epoch": 0.18959939099819204, + "grad_norm": 1.7069121599197388, + "learning_rate": 4.5694945289874744e-05, + "loss": 4.7868, + "step": 31880 + }, + { + "epoch": 0.18960533828147302, + "grad_norm": 1.788473129272461, + "learning_rate": 4.569468323158485e-05, + "loss": 4.4976, + "step": 31881 + }, + { + "epoch": 0.189611285564754, + "grad_norm": 1.6094763278961182, + "learning_rate": 4.569442116607065e-05, + "loss": 4.6732, + "step": 31882 + }, + { + "epoch": 0.18961723284803503, + "grad_norm": 1.8496800661087036, + "learning_rate": 4.569415909333223e-05, + "loss": 4.0664, + "step": 31883 + }, + { + "epoch": 0.18962318013131602, + "grad_norm": 1.9682886600494385, + "learning_rate": 4.5693897013369715e-05, + "loss": 3.4463, + "step": 31884 + }, + { + "epoch": 0.189629127414597, + "grad_norm": 1.6034213304519653, + "learning_rate": 4.569363492618316e-05, + "loss": 4.6424, + "step": 31885 + }, + { + "epoch": 0.18963507469787802, + "grad_norm": 1.7703704833984375, + "learning_rate": 4.5693372831772675e-05, + "loss": 4.5325, + "step": 31886 + }, + { + "epoch": 0.189641021981159, + "grad_norm": 1.7447285652160645, + "learning_rate": 4.569311073013834e-05, + "loss": 4.5242, + "step": 31887 + }, + { + "epoch": 0.18964696926444, + "grad_norm": 1.6660053730010986, + "learning_rate": 4.569284862128026e-05, + "loss": 4.6989, + "step": 31888 + }, + { + "epoch": 0.189652916547721, + "grad_norm": 1.5886887311935425, + "learning_rate": 4.569258650519852e-05, + "loss": 4.6059, + "step": 31889 + }, + { + "epoch": 0.189658863831002, + "grad_norm": 1.530544638633728, + "learning_rate": 4.569232438189321e-05, + "loss": 5.1136, + "step": 31890 + }, + { + "epoch": 0.189664811114283, + "grad_norm": 1.814598560333252, + "learning_rate": 4.569206225136442e-05, + "loss": 4.7673, + "step": 31891 + }, + { + "epoch": 0.189670758397564, + "grad_norm": 1.8687660694122314, + "learning_rate": 4.569180011361225e-05, + "loss": 4.6851, + "step": 31892 + }, + { + "epoch": 0.189676705680845, + "grad_norm": 1.673263669013977, + "learning_rate": 4.569153796863679e-05, + "loss": 4.0894, + "step": 31893 + }, + { + "epoch": 0.18968265296412598, + "grad_norm": 1.7278350591659546, + "learning_rate": 4.569127581643812e-05, + "loss": 4.4977, + "step": 31894 + }, + { + "epoch": 0.189688600247407, + "grad_norm": 1.7827249765396118, + "learning_rate": 4.569101365701635e-05, + "loss": 4.4619, + "step": 31895 + }, + { + "epoch": 0.18969454753068798, + "grad_norm": 1.8192304372787476, + "learning_rate": 4.569075149037155e-05, + "loss": 4.5414, + "step": 31896 + }, + { + "epoch": 0.18970049481396897, + "grad_norm": 1.8950804471969604, + "learning_rate": 4.5690489316503824e-05, + "loss": 4.97, + "step": 31897 + }, + { + "epoch": 0.18970644209725, + "grad_norm": 2.004835605621338, + "learning_rate": 4.5690227135413266e-05, + "loss": 5.0177, + "step": 31898 + }, + { + "epoch": 0.18971238938053098, + "grad_norm": 1.5404337644577026, + "learning_rate": 4.568996494709996e-05, + "loss": 4.8197, + "step": 31899 + }, + { + "epoch": 0.18971833666381197, + "grad_norm": 2.201564073562622, + "learning_rate": 4.5689702751564e-05, + "loss": 4.5455, + "step": 31900 + }, + { + "epoch": 0.18972428394709298, + "grad_norm": 2.286740303039551, + "learning_rate": 4.568944054880549e-05, + "loss": 4.4012, + "step": 31901 + }, + { + "epoch": 0.18973023123037397, + "grad_norm": 1.5117316246032715, + "learning_rate": 4.56891783388245e-05, + "loss": 4.9668, + "step": 31902 + }, + { + "epoch": 0.18973617851365496, + "grad_norm": 1.5675127506256104, + "learning_rate": 4.568891612162113e-05, + "loss": 4.1084, + "step": 31903 + }, + { + "epoch": 0.18974212579693597, + "grad_norm": 1.7535734176635742, + "learning_rate": 4.568865389719548e-05, + "loss": 4.7085, + "step": 31904 + }, + { + "epoch": 0.18974807308021696, + "grad_norm": 1.822056531906128, + "learning_rate": 4.5688391665547624e-05, + "loss": 5.0422, + "step": 31905 + }, + { + "epoch": 0.18975402036349795, + "grad_norm": 2.162489414215088, + "learning_rate": 4.5688129426677675e-05, + "loss": 4.6578, + "step": 31906 + }, + { + "epoch": 0.18975996764677894, + "grad_norm": 1.6846884489059448, + "learning_rate": 4.5687867180585706e-05, + "loss": 4.7004, + "step": 31907 + }, + { + "epoch": 0.18976591493005995, + "grad_norm": 1.652793526649475, + "learning_rate": 4.568760492727182e-05, + "loss": 5.4678, + "step": 31908 + }, + { + "epoch": 0.18977186221334094, + "grad_norm": 1.8100409507751465, + "learning_rate": 4.568734266673611e-05, + "loss": 5.2258, + "step": 31909 + }, + { + "epoch": 0.18977780949662193, + "grad_norm": 1.7055999040603638, + "learning_rate": 4.568708039897865e-05, + "loss": 4.78, + "step": 31910 + }, + { + "epoch": 0.18978375677990295, + "grad_norm": 1.980807900428772, + "learning_rate": 4.568681812399955e-05, + "loss": 5.2426, + "step": 31911 + }, + { + "epoch": 0.18978970406318393, + "grad_norm": 1.5969680547714233, + "learning_rate": 4.56865558417989e-05, + "loss": 4.1172, + "step": 31912 + }, + { + "epoch": 0.18979565134646492, + "grad_norm": 1.5067203044891357, + "learning_rate": 4.5686293552376786e-05, + "loss": 5.0804, + "step": 31913 + }, + { + "epoch": 0.18980159862974594, + "grad_norm": 1.7787573337554932, + "learning_rate": 4.5686031255733295e-05, + "loss": 5.5088, + "step": 31914 + }, + { + "epoch": 0.18980754591302693, + "grad_norm": 1.7157970666885376, + "learning_rate": 4.568576895186853e-05, + "loss": 5.0429, + "step": 31915 + }, + { + "epoch": 0.18981349319630791, + "grad_norm": 1.6739206314086914, + "learning_rate": 4.568550664078257e-05, + "loss": 5.2735, + "step": 31916 + }, + { + "epoch": 0.18981944047958893, + "grad_norm": 1.33918297290802, + "learning_rate": 4.568524432247552e-05, + "loss": 5.3635, + "step": 31917 + }, + { + "epoch": 0.18982538776286992, + "grad_norm": 1.5649034976959229, + "learning_rate": 4.568498199694746e-05, + "loss": 5.4058, + "step": 31918 + }, + { + "epoch": 0.1898313350461509, + "grad_norm": 1.9228683710098267, + "learning_rate": 4.568471966419849e-05, + "loss": 4.801, + "step": 31919 + }, + { + "epoch": 0.18983728232943192, + "grad_norm": 2.49468731880188, + "learning_rate": 4.56844573242287e-05, + "loss": 4.0647, + "step": 31920 + }, + { + "epoch": 0.1898432296127129, + "grad_norm": 1.6524834632873535, + "learning_rate": 4.5684194977038175e-05, + "loss": 4.5686, + "step": 31921 + }, + { + "epoch": 0.1898491768959939, + "grad_norm": 2.149550676345825, + "learning_rate": 4.568393262262701e-05, + "loss": 4.6678, + "step": 31922 + }, + { + "epoch": 0.18985512417927491, + "grad_norm": 1.6779396533966064, + "learning_rate": 4.56836702609953e-05, + "loss": 5.0776, + "step": 31923 + }, + { + "epoch": 0.1898610714625559, + "grad_norm": 1.5314429998397827, + "learning_rate": 4.568340789214314e-05, + "loss": 4.7417, + "step": 31924 + }, + { + "epoch": 0.1898670187458369, + "grad_norm": 2.9489622116088867, + "learning_rate": 4.568314551607061e-05, + "loss": 3.0551, + "step": 31925 + }, + { + "epoch": 0.1898729660291179, + "grad_norm": 2.9115781784057617, + "learning_rate": 4.568288313277781e-05, + "loss": 3.5898, + "step": 31926 + }, + { + "epoch": 0.1898789133123989, + "grad_norm": 2.426448345184326, + "learning_rate": 4.568262074226483e-05, + "loss": 3.763, + "step": 31927 + }, + { + "epoch": 0.18988486059567988, + "grad_norm": 2.463843822479248, + "learning_rate": 4.568235834453176e-05, + "loss": 3.6311, + "step": 31928 + }, + { + "epoch": 0.1898908078789609, + "grad_norm": 2.4178626537323, + "learning_rate": 4.568209593957869e-05, + "loss": 4.0005, + "step": 31929 + }, + { + "epoch": 0.1898967551622419, + "grad_norm": 2.1303043365478516, + "learning_rate": 4.568183352740571e-05, + "loss": 4.7775, + "step": 31930 + }, + { + "epoch": 0.18990270244552288, + "grad_norm": 2.039669990539551, + "learning_rate": 4.568157110801293e-05, + "loss": 4.4208, + "step": 31931 + }, + { + "epoch": 0.1899086497288039, + "grad_norm": 2.6738369464874268, + "learning_rate": 4.568130868140041e-05, + "loss": 3.5947, + "step": 31932 + }, + { + "epoch": 0.18991459701208488, + "grad_norm": 2.8675057888031006, + "learning_rate": 4.5681046247568273e-05, + "loss": 2.9846, + "step": 31933 + }, + { + "epoch": 0.18992054429536587, + "grad_norm": 2.8975415229797363, + "learning_rate": 4.5680783806516595e-05, + "loss": 2.8397, + "step": 31934 + }, + { + "epoch": 0.18992649157864688, + "grad_norm": 2.770543098449707, + "learning_rate": 4.568052135824545e-05, + "loss": 2.5696, + "step": 31935 + }, + { + "epoch": 0.18993243886192787, + "grad_norm": 2.7730634212493896, + "learning_rate": 4.568025890275497e-05, + "loss": 2.6406, + "step": 31936 + }, + { + "epoch": 0.18993838614520886, + "grad_norm": 1.8441970348358154, + "learning_rate": 4.5679996440045216e-05, + "loss": 4.7971, + "step": 31937 + }, + { + "epoch": 0.18994433342848988, + "grad_norm": 2.130506992340088, + "learning_rate": 4.5679733970116293e-05, + "loss": 4.6538, + "step": 31938 + }, + { + "epoch": 0.18995028071177086, + "grad_norm": 3.2489874362945557, + "learning_rate": 4.5679471492968286e-05, + "loss": 3.7206, + "step": 31939 + }, + { + "epoch": 0.18995622799505185, + "grad_norm": 3.3406145572662354, + "learning_rate": 4.567920900860129e-05, + "loss": 3.457, + "step": 31940 + }, + { + "epoch": 0.18996217527833287, + "grad_norm": 3.1997690200805664, + "learning_rate": 4.56789465170154e-05, + "loss": 3.4264, + "step": 31941 + }, + { + "epoch": 0.18996812256161386, + "grad_norm": 3.3533401489257812, + "learning_rate": 4.5678684018210697e-05, + "loss": 3.4097, + "step": 31942 + }, + { + "epoch": 0.18997406984489484, + "grad_norm": 1.8355157375335693, + "learning_rate": 4.5678421512187274e-05, + "loss": 4.9528, + "step": 31943 + }, + { + "epoch": 0.18998001712817586, + "grad_norm": 1.5663495063781738, + "learning_rate": 4.567815899894524e-05, + "loss": 5.2737, + "step": 31944 + }, + { + "epoch": 0.18998596441145685, + "grad_norm": 1.7897334098815918, + "learning_rate": 4.567789647848467e-05, + "loss": 5.2766, + "step": 31945 + }, + { + "epoch": 0.18999191169473784, + "grad_norm": 1.6036760807037354, + "learning_rate": 4.567763395080565e-05, + "loss": 5.1449, + "step": 31946 + }, + { + "epoch": 0.18999785897801885, + "grad_norm": 1.47257661819458, + "learning_rate": 4.56773714159083e-05, + "loss": 5.6391, + "step": 31947 + }, + { + "epoch": 0.19000380626129984, + "grad_norm": 1.5196605920791626, + "learning_rate": 4.567710887379268e-05, + "loss": 5.3161, + "step": 31948 + }, + { + "epoch": 0.19000975354458083, + "grad_norm": 1.9018585681915283, + "learning_rate": 4.56768463244589e-05, + "loss": 4.9531, + "step": 31949 + }, + { + "epoch": 0.19001570082786184, + "grad_norm": 1.56498122215271, + "learning_rate": 4.567658376790704e-05, + "loss": 5.0693, + "step": 31950 + }, + { + "epoch": 0.19002164811114283, + "grad_norm": 1.5698566436767578, + "learning_rate": 4.567632120413721e-05, + "loss": 5.177, + "step": 31951 + }, + { + "epoch": 0.19002759539442382, + "grad_norm": 1.456125259399414, + "learning_rate": 4.5676058633149484e-05, + "loss": 5.145, + "step": 31952 + }, + { + "epoch": 0.19003354267770484, + "grad_norm": 1.3860251903533936, + "learning_rate": 4.5675796054943954e-05, + "loss": 5.24, + "step": 31953 + }, + { + "epoch": 0.19003948996098582, + "grad_norm": 1.4301811456680298, + "learning_rate": 4.567553346952073e-05, + "loss": 5.0656, + "step": 31954 + }, + { + "epoch": 0.1900454372442668, + "grad_norm": 1.8247642517089844, + "learning_rate": 4.567527087687988e-05, + "loss": 4.4608, + "step": 31955 + }, + { + "epoch": 0.19005138452754783, + "grad_norm": 1.5283252000808716, + "learning_rate": 4.567500827702151e-05, + "loss": 4.7095, + "step": 31956 + }, + { + "epoch": 0.19005733181082882, + "grad_norm": 1.5899708271026611, + "learning_rate": 4.567474566994571e-05, + "loss": 5.0809, + "step": 31957 + }, + { + "epoch": 0.1900632790941098, + "grad_norm": 1.4520339965820312, + "learning_rate": 4.567448305565256e-05, + "loss": 5.4298, + "step": 31958 + }, + { + "epoch": 0.19006922637739082, + "grad_norm": 1.658177375793457, + "learning_rate": 4.5674220434142175e-05, + "loss": 5.3595, + "step": 31959 + }, + { + "epoch": 0.1900751736606718, + "grad_norm": 1.4237635135650635, + "learning_rate": 4.5673957805414626e-05, + "loss": 5.2803, + "step": 31960 + }, + { + "epoch": 0.1900811209439528, + "grad_norm": 1.6651546955108643, + "learning_rate": 4.567369516947001e-05, + "loss": 5.2991, + "step": 31961 + }, + { + "epoch": 0.1900870682272338, + "grad_norm": 1.48691987991333, + "learning_rate": 4.5673432526308424e-05, + "loss": 5.4731, + "step": 31962 + }, + { + "epoch": 0.1900930155105148, + "grad_norm": 1.541694164276123, + "learning_rate": 4.5673169875929954e-05, + "loss": 5.7939, + "step": 31963 + }, + { + "epoch": 0.1900989627937958, + "grad_norm": 1.5470638275146484, + "learning_rate": 4.56729072183347e-05, + "loss": 5.5046, + "step": 31964 + }, + { + "epoch": 0.19010491007707678, + "grad_norm": 1.4966381788253784, + "learning_rate": 4.567264455352275e-05, + "loss": 5.5842, + "step": 31965 + }, + { + "epoch": 0.1901108573603578, + "grad_norm": 1.698122262954712, + "learning_rate": 4.5672381881494186e-05, + "loss": 5.3864, + "step": 31966 + }, + { + "epoch": 0.19011680464363878, + "grad_norm": 1.5268694162368774, + "learning_rate": 4.5672119202249104e-05, + "loss": 5.6954, + "step": 31967 + }, + { + "epoch": 0.19012275192691977, + "grad_norm": 1.555295705795288, + "learning_rate": 4.5671856515787606e-05, + "loss": 5.5636, + "step": 31968 + }, + { + "epoch": 0.19012869921020079, + "grad_norm": 2.07952618598938, + "learning_rate": 4.567159382210977e-05, + "loss": 4.9687, + "step": 31969 + }, + { + "epoch": 0.19013464649348177, + "grad_norm": 1.6233636140823364, + "learning_rate": 4.5671331121215696e-05, + "loss": 5.1942, + "step": 31970 + }, + { + "epoch": 0.19014059377676276, + "grad_norm": 1.7472115755081177, + "learning_rate": 4.567106841310548e-05, + "loss": 4.914, + "step": 31971 + }, + { + "epoch": 0.19014654106004378, + "grad_norm": 1.6313844919204712, + "learning_rate": 4.56708056977792e-05, + "loss": 5.0832, + "step": 31972 + }, + { + "epoch": 0.19015248834332477, + "grad_norm": 1.5175881385803223, + "learning_rate": 4.5670542975236965e-05, + "loss": 5.6577, + "step": 31973 + }, + { + "epoch": 0.19015843562660575, + "grad_norm": 1.3097161054611206, + "learning_rate": 4.567028024547885e-05, + "loss": 5.5634, + "step": 31974 + }, + { + "epoch": 0.19016438290988677, + "grad_norm": 1.4037551879882812, + "learning_rate": 4.567001750850495e-05, + "loss": 5.5816, + "step": 31975 + }, + { + "epoch": 0.19017033019316776, + "grad_norm": 1.6129430532455444, + "learning_rate": 4.5669754764315364e-05, + "loss": 5.3667, + "step": 31976 + }, + { + "epoch": 0.19017627747644875, + "grad_norm": 1.309594988822937, + "learning_rate": 4.566949201291018e-05, + "loss": 5.4406, + "step": 31977 + }, + { + "epoch": 0.19018222475972976, + "grad_norm": 1.4947516918182373, + "learning_rate": 4.5669229254289495e-05, + "loss": 4.7619, + "step": 31978 + }, + { + "epoch": 0.19018817204301075, + "grad_norm": 1.9797264337539673, + "learning_rate": 4.5668966488453394e-05, + "loss": 4.7724, + "step": 31979 + }, + { + "epoch": 0.19019411932629174, + "grad_norm": 2.4875199794769287, + "learning_rate": 4.566870371540196e-05, + "loss": 4.6923, + "step": 31980 + }, + { + "epoch": 0.19020006660957275, + "grad_norm": 1.5810712575912476, + "learning_rate": 4.5668440935135305e-05, + "loss": 5.4779, + "step": 31981 + }, + { + "epoch": 0.19020601389285374, + "grad_norm": 1.3546984195709229, + "learning_rate": 4.566817814765351e-05, + "loss": 5.5403, + "step": 31982 + }, + { + "epoch": 0.19021196117613473, + "grad_norm": 1.6855329275131226, + "learning_rate": 4.566791535295666e-05, + "loss": 5.2699, + "step": 31983 + }, + { + "epoch": 0.19021790845941575, + "grad_norm": 1.7854288816452026, + "learning_rate": 4.5667652551044865e-05, + "loss": 5.1219, + "step": 31984 + }, + { + "epoch": 0.19022385574269673, + "grad_norm": 1.589922547340393, + "learning_rate": 4.56673897419182e-05, + "loss": 5.0681, + "step": 31985 + }, + { + "epoch": 0.19022980302597772, + "grad_norm": 1.7833002805709839, + "learning_rate": 4.566712692557677e-05, + "loss": 5.0301, + "step": 31986 + }, + { + "epoch": 0.19023575030925874, + "grad_norm": 1.6957120895385742, + "learning_rate": 4.566686410202064e-05, + "loss": 5.1839, + "step": 31987 + }, + { + "epoch": 0.19024169759253973, + "grad_norm": 1.4636527299880981, + "learning_rate": 4.5666601271249945e-05, + "loss": 4.9968, + "step": 31988 + }, + { + "epoch": 0.19024764487582072, + "grad_norm": 2.6251659393310547, + "learning_rate": 4.566633843326474e-05, + "loss": 4.1913, + "step": 31989 + }, + { + "epoch": 0.19025359215910173, + "grad_norm": 3.022430896759033, + "learning_rate": 4.566607558806513e-05, + "loss": 3.7352, + "step": 31990 + }, + { + "epoch": 0.19025953944238272, + "grad_norm": 2.3903746604919434, + "learning_rate": 4.566581273565122e-05, + "loss": 3.9359, + "step": 31991 + }, + { + "epoch": 0.1902654867256637, + "grad_norm": 1.598952054977417, + "learning_rate": 4.5665549876023076e-05, + "loss": 4.674, + "step": 31992 + }, + { + "epoch": 0.19027143400894472, + "grad_norm": 2.1441328525543213, + "learning_rate": 4.5665287009180796e-05, + "loss": 3.9013, + "step": 31993 + }, + { + "epoch": 0.1902773812922257, + "grad_norm": 1.7473663091659546, + "learning_rate": 4.566502413512449e-05, + "loss": 4.7093, + "step": 31994 + }, + { + "epoch": 0.1902833285755067, + "grad_norm": 1.4611793756484985, + "learning_rate": 4.5664761253854226e-05, + "loss": 5.2113, + "step": 31995 + }, + { + "epoch": 0.19028927585878772, + "grad_norm": 1.7185208797454834, + "learning_rate": 4.566449836537012e-05, + "loss": 4.6621, + "step": 31996 + }, + { + "epoch": 0.1902952231420687, + "grad_norm": 2.710164785385132, + "learning_rate": 4.5664235469672246e-05, + "loss": 3.0121, + "step": 31997 + }, + { + "epoch": 0.1903011704253497, + "grad_norm": 1.7560441493988037, + "learning_rate": 4.5663972566760694e-05, + "loss": 5.158, + "step": 31998 + }, + { + "epoch": 0.1903071177086307, + "grad_norm": 2.3180789947509766, + "learning_rate": 4.566370965663557e-05, + "loss": 3.8676, + "step": 31999 + }, + { + "epoch": 0.1903130649919117, + "grad_norm": 2.8105721473693848, + "learning_rate": 4.5663446739296956e-05, + "loss": 3.9046, + "step": 32000 + }, + { + "epoch": 0.19031901227519268, + "grad_norm": 1.9129337072372437, + "learning_rate": 4.5663183814744946e-05, + "loss": 4.9668, + "step": 32001 + }, + { + "epoch": 0.1903249595584737, + "grad_norm": 2.512235403060913, + "learning_rate": 4.566292088297964e-05, + "loss": 5.2666, + "step": 32002 + }, + { + "epoch": 0.1903309068417547, + "grad_norm": 2.6467642784118652, + "learning_rate": 4.566265794400111e-05, + "loss": 5.1184, + "step": 32003 + }, + { + "epoch": 0.19033685412503568, + "grad_norm": 2.0429317951202393, + "learning_rate": 4.566239499780946e-05, + "loss": 4.6186, + "step": 32004 + }, + { + "epoch": 0.1903428014083167, + "grad_norm": 2.026602029800415, + "learning_rate": 4.566213204440479e-05, + "loss": 2.941, + "step": 32005 + }, + { + "epoch": 0.19034874869159768, + "grad_norm": 2.3440706729888916, + "learning_rate": 4.5661869083787184e-05, + "loss": 2.9276, + "step": 32006 + }, + { + "epoch": 0.19035469597487867, + "grad_norm": 1.5349546670913696, + "learning_rate": 4.566160611595673e-05, + "loss": 5.0741, + "step": 32007 + }, + { + "epoch": 0.19036064325815968, + "grad_norm": 1.4628055095672607, + "learning_rate": 4.566134314091352e-05, + "loss": 5.2766, + "step": 32008 + }, + { + "epoch": 0.19036659054144067, + "grad_norm": 1.768277645111084, + "learning_rate": 4.566108015865765e-05, + "loss": 5.1867, + "step": 32009 + }, + { + "epoch": 0.19037253782472166, + "grad_norm": 1.2792719602584839, + "learning_rate": 4.566081716918921e-05, + "loss": 5.1267, + "step": 32010 + }, + { + "epoch": 0.19037848510800268, + "grad_norm": 1.5036875009536743, + "learning_rate": 4.56605541725083e-05, + "loss": 5.2554, + "step": 32011 + }, + { + "epoch": 0.19038443239128366, + "grad_norm": 1.3368226289749146, + "learning_rate": 4.566029116861499e-05, + "loss": 5.2237, + "step": 32012 + }, + { + "epoch": 0.19039037967456465, + "grad_norm": 1.3333531618118286, + "learning_rate": 4.566002815750939e-05, + "loss": 5.2719, + "step": 32013 + }, + { + "epoch": 0.19039632695784567, + "grad_norm": 1.5420610904693604, + "learning_rate": 4.565976513919159e-05, + "loss": 5.2795, + "step": 32014 + }, + { + "epoch": 0.19040227424112666, + "grad_norm": 1.6228092908859253, + "learning_rate": 4.565950211366168e-05, + "loss": 5.6646, + "step": 32015 + }, + { + "epoch": 0.19040822152440764, + "grad_norm": 1.671846866607666, + "learning_rate": 4.5659239080919746e-05, + "loss": 5.5739, + "step": 32016 + }, + { + "epoch": 0.19041416880768866, + "grad_norm": 1.377333641052246, + "learning_rate": 4.5658976040965893e-05, + "loss": 5.5736, + "step": 32017 + }, + { + "epoch": 0.19042011609096965, + "grad_norm": 1.5366264581680298, + "learning_rate": 4.56587129938002e-05, + "loss": 5.182, + "step": 32018 + }, + { + "epoch": 0.19042606337425064, + "grad_norm": 1.6668857336044312, + "learning_rate": 4.5658449939422765e-05, + "loss": 4.9602, + "step": 32019 + }, + { + "epoch": 0.19043201065753165, + "grad_norm": 1.5666422843933105, + "learning_rate": 4.565818687783368e-05, + "loss": 5.1586, + "step": 32020 + }, + { + "epoch": 0.19043795794081264, + "grad_norm": 1.2899186611175537, + "learning_rate": 4.5657923809033035e-05, + "loss": 5.1735, + "step": 32021 + }, + { + "epoch": 0.19044390522409363, + "grad_norm": 1.7876954078674316, + "learning_rate": 4.565766073302092e-05, + "loss": 5.697, + "step": 32022 + }, + { + "epoch": 0.19044985250737465, + "grad_norm": 1.9547079801559448, + "learning_rate": 4.565739764979743e-05, + "loss": 5.6536, + "step": 32023 + }, + { + "epoch": 0.19045579979065563, + "grad_norm": 2.118058204650879, + "learning_rate": 4.5657134559362655e-05, + "loss": 4.2402, + "step": 32024 + }, + { + "epoch": 0.19046174707393662, + "grad_norm": 1.5718128681182861, + "learning_rate": 4.565687146171669e-05, + "loss": 5.0162, + "step": 32025 + }, + { + "epoch": 0.1904676943572176, + "grad_norm": 1.668532371520996, + "learning_rate": 4.5656608356859624e-05, + "loss": 4.8103, + "step": 32026 + }, + { + "epoch": 0.19047364164049863, + "grad_norm": 1.6259323358535767, + "learning_rate": 4.5656345244791554e-05, + "loss": 5.1993, + "step": 32027 + }, + { + "epoch": 0.1904795889237796, + "grad_norm": 1.5034635066986084, + "learning_rate": 4.565608212551256e-05, + "loss": 5.0077, + "step": 32028 + }, + { + "epoch": 0.1904855362070606, + "grad_norm": 1.434511423110962, + "learning_rate": 4.565581899902274e-05, + "loss": 5.1024, + "step": 32029 + }, + { + "epoch": 0.19049148349034162, + "grad_norm": 1.8334324359893799, + "learning_rate": 4.565555586532219e-05, + "loss": 5.266, + "step": 32030 + }, + { + "epoch": 0.1904974307736226, + "grad_norm": 1.622114896774292, + "learning_rate": 4.5655292724411004e-05, + "loss": 5.066, + "step": 32031 + }, + { + "epoch": 0.1905033780569036, + "grad_norm": 1.6296263933181763, + "learning_rate": 4.565502957628926e-05, + "loss": 4.9455, + "step": 32032 + }, + { + "epoch": 0.1905093253401846, + "grad_norm": 1.504682183265686, + "learning_rate": 4.565476642095706e-05, + "loss": 4.9339, + "step": 32033 + }, + { + "epoch": 0.1905152726234656, + "grad_norm": 1.6852915287017822, + "learning_rate": 4.56545032584145e-05, + "loss": 5.3419, + "step": 32034 + }, + { + "epoch": 0.1905212199067466, + "grad_norm": 1.54212486743927, + "learning_rate": 4.565424008866166e-05, + "loss": 4.9588, + "step": 32035 + }, + { + "epoch": 0.1905271671900276, + "grad_norm": 1.5721091032028198, + "learning_rate": 4.565397691169865e-05, + "loss": 4.9884, + "step": 32036 + }, + { + "epoch": 0.1905331144733086, + "grad_norm": 1.6846574544906616, + "learning_rate": 4.565371372752554e-05, + "loss": 5.0185, + "step": 32037 + }, + { + "epoch": 0.19053906175658958, + "grad_norm": 1.4917422533035278, + "learning_rate": 4.565345053614243e-05, + "loss": 5.0957, + "step": 32038 + }, + { + "epoch": 0.1905450090398706, + "grad_norm": 1.6241521835327148, + "learning_rate": 4.565318733754942e-05, + "loss": 4.9592, + "step": 32039 + }, + { + "epoch": 0.19055095632315158, + "grad_norm": 1.6633590459823608, + "learning_rate": 4.565292413174659e-05, + "loss": 5.1149, + "step": 32040 + }, + { + "epoch": 0.19055690360643257, + "grad_norm": 1.4443227052688599, + "learning_rate": 4.565266091873404e-05, + "loss": 5.3016, + "step": 32041 + }, + { + "epoch": 0.1905628508897136, + "grad_norm": 1.7696523666381836, + "learning_rate": 4.565239769851186e-05, + "loss": 5.3079, + "step": 32042 + }, + { + "epoch": 0.19056879817299457, + "grad_norm": 1.5043975114822388, + "learning_rate": 4.565213447108014e-05, + "loss": 5.2658, + "step": 32043 + }, + { + "epoch": 0.19057474545627556, + "grad_norm": 1.4827871322631836, + "learning_rate": 4.565187123643898e-05, + "loss": 5.2462, + "step": 32044 + }, + { + "epoch": 0.19058069273955658, + "grad_norm": 1.2935054302215576, + "learning_rate": 4.565160799458845e-05, + "loss": 5.2094, + "step": 32045 + }, + { + "epoch": 0.19058664002283757, + "grad_norm": 1.7372486591339111, + "learning_rate": 4.565134474552867e-05, + "loss": 4.6756, + "step": 32046 + }, + { + "epoch": 0.19059258730611855, + "grad_norm": 1.8113619089126587, + "learning_rate": 4.565108148925972e-05, + "loss": 4.3151, + "step": 32047 + }, + { + "epoch": 0.19059853458939957, + "grad_norm": 2.1872177124023438, + "learning_rate": 4.565081822578168e-05, + "loss": 4.2009, + "step": 32048 + }, + { + "epoch": 0.19060448187268056, + "grad_norm": 2.375410556793213, + "learning_rate": 4.565055495509466e-05, + "loss": 4.2222, + "step": 32049 + }, + { + "epoch": 0.19061042915596155, + "grad_norm": 2.441967010498047, + "learning_rate": 4.565029167719874e-05, + "loss": 4.0724, + "step": 32050 + }, + { + "epoch": 0.19061637643924256, + "grad_norm": 1.6841249465942383, + "learning_rate": 4.5650028392094026e-05, + "loss": 4.8674, + "step": 32051 + }, + { + "epoch": 0.19062232372252355, + "grad_norm": 1.6137785911560059, + "learning_rate": 4.564976509978059e-05, + "loss": 5.457, + "step": 32052 + }, + { + "epoch": 0.19062827100580454, + "grad_norm": 1.6453704833984375, + "learning_rate": 4.564950180025854e-05, + "loss": 5.3921, + "step": 32053 + }, + { + "epoch": 0.19063421828908556, + "grad_norm": 1.6670149564743042, + "learning_rate": 4.564923849352796e-05, + "loss": 4.8336, + "step": 32054 + }, + { + "epoch": 0.19064016557236654, + "grad_norm": 1.7052969932556152, + "learning_rate": 4.564897517958895e-05, + "loss": 5.4213, + "step": 32055 + }, + { + "epoch": 0.19064611285564753, + "grad_norm": 1.4670642614364624, + "learning_rate": 4.564871185844159e-05, + "loss": 5.3312, + "step": 32056 + }, + { + "epoch": 0.19065206013892855, + "grad_norm": 1.2755639553070068, + "learning_rate": 4.564844853008598e-05, + "loss": 5.3718, + "step": 32057 + }, + { + "epoch": 0.19065800742220954, + "grad_norm": 1.7414531707763672, + "learning_rate": 4.564818519452221e-05, + "loss": 5.2937, + "step": 32058 + }, + { + "epoch": 0.19066395470549052, + "grad_norm": 1.677453637123108, + "learning_rate": 4.564792185175037e-05, + "loss": 5.3738, + "step": 32059 + }, + { + "epoch": 0.19066990198877154, + "grad_norm": 1.7875051498413086, + "learning_rate": 4.564765850177056e-05, + "loss": 5.082, + "step": 32060 + }, + { + "epoch": 0.19067584927205253, + "grad_norm": 1.8616997003555298, + "learning_rate": 4.564739514458286e-05, + "loss": 4.6828, + "step": 32061 + }, + { + "epoch": 0.19068179655533352, + "grad_norm": 1.7642920017242432, + "learning_rate": 4.564713178018737e-05, + "loss": 4.5539, + "step": 32062 + }, + { + "epoch": 0.19068774383861453, + "grad_norm": 1.6018472909927368, + "learning_rate": 4.5646868408584175e-05, + "loss": 5.2847, + "step": 32063 + }, + { + "epoch": 0.19069369112189552, + "grad_norm": 1.6408270597457886, + "learning_rate": 4.564660502977337e-05, + "loss": 5.3356, + "step": 32064 + }, + { + "epoch": 0.1906996384051765, + "grad_norm": 1.9505417346954346, + "learning_rate": 4.564634164375505e-05, + "loss": 4.6428, + "step": 32065 + }, + { + "epoch": 0.19070558568845752, + "grad_norm": 2.5392873287200928, + "learning_rate": 4.564607825052931e-05, + "loss": 3.5372, + "step": 32066 + }, + { + "epoch": 0.1907115329717385, + "grad_norm": 3.086822509765625, + "learning_rate": 4.564581485009623e-05, + "loss": 4.4435, + "step": 32067 + }, + { + "epoch": 0.1907174802550195, + "grad_norm": 2.6055800914764404, + "learning_rate": 4.564555144245592e-05, + "loss": 4.5866, + "step": 32068 + }, + { + "epoch": 0.19072342753830052, + "grad_norm": 1.843562126159668, + "learning_rate": 4.5645288027608465e-05, + "loss": 5.2971, + "step": 32069 + }, + { + "epoch": 0.1907293748215815, + "grad_norm": 1.7707334756851196, + "learning_rate": 4.5645024605553944e-05, + "loss": 5.6213, + "step": 32070 + }, + { + "epoch": 0.1907353221048625, + "grad_norm": 1.7816311120986938, + "learning_rate": 4.564476117629245e-05, + "loss": 5.2668, + "step": 32071 + }, + { + "epoch": 0.1907412693881435, + "grad_norm": 1.5548468828201294, + "learning_rate": 4.56444977398241e-05, + "loss": 5.2673, + "step": 32072 + }, + { + "epoch": 0.1907472166714245, + "grad_norm": 1.581838607788086, + "learning_rate": 4.5644234296148955e-05, + "loss": 5.0574, + "step": 32073 + }, + { + "epoch": 0.19075316395470548, + "grad_norm": 1.596366047859192, + "learning_rate": 4.564397084526714e-05, + "loss": 5.0789, + "step": 32074 + }, + { + "epoch": 0.1907591112379865, + "grad_norm": 1.4851096868515015, + "learning_rate": 4.564370738717871e-05, + "loss": 5.5059, + "step": 32075 + }, + { + "epoch": 0.1907650585212675, + "grad_norm": 1.7206519842147827, + "learning_rate": 4.564344392188378e-05, + "loss": 5.1603, + "step": 32076 + }, + { + "epoch": 0.19077100580454848, + "grad_norm": 1.7164605855941772, + "learning_rate": 4.5643180449382436e-05, + "loss": 5.2737, + "step": 32077 + }, + { + "epoch": 0.1907769530878295, + "grad_norm": 1.7038495540618896, + "learning_rate": 4.564291696967477e-05, + "loss": 5.3388, + "step": 32078 + }, + { + "epoch": 0.19078290037111048, + "grad_norm": 1.5821107625961304, + "learning_rate": 4.564265348276088e-05, + "loss": 5.3052, + "step": 32079 + }, + { + "epoch": 0.19078884765439147, + "grad_norm": 1.6088097095489502, + "learning_rate": 4.564238998864085e-05, + "loss": 5.373, + "step": 32080 + }, + { + "epoch": 0.19079479493767248, + "grad_norm": 1.8086316585540771, + "learning_rate": 4.564212648731477e-05, + "loss": 5.2465, + "step": 32081 + }, + { + "epoch": 0.19080074222095347, + "grad_norm": 1.8377625942230225, + "learning_rate": 4.5641862978782746e-05, + "loss": 5.4968, + "step": 32082 + }, + { + "epoch": 0.19080668950423446, + "grad_norm": 1.7471630573272705, + "learning_rate": 4.564159946304486e-05, + "loss": 5.2868, + "step": 32083 + }, + { + "epoch": 0.19081263678751545, + "grad_norm": 1.4050308465957642, + "learning_rate": 4.5641335940101196e-05, + "loss": 5.3825, + "step": 32084 + }, + { + "epoch": 0.19081858407079647, + "grad_norm": 1.4709471464157104, + "learning_rate": 4.564107240995187e-05, + "loss": 5.273, + "step": 32085 + }, + { + "epoch": 0.19082453135407745, + "grad_norm": 1.5411866903305054, + "learning_rate": 4.5640808872596944e-05, + "loss": 5.3013, + "step": 32086 + }, + { + "epoch": 0.19083047863735844, + "grad_norm": 1.7834669351577759, + "learning_rate": 4.5640545328036536e-05, + "loss": 4.7055, + "step": 32087 + }, + { + "epoch": 0.19083642592063946, + "grad_norm": 1.9448041915893555, + "learning_rate": 4.564028177627072e-05, + "loss": 4.9035, + "step": 32088 + }, + { + "epoch": 0.19084237320392045, + "grad_norm": 1.5890318155288696, + "learning_rate": 4.5640018217299593e-05, + "loss": 5.2144, + "step": 32089 + }, + { + "epoch": 0.19084832048720143, + "grad_norm": 1.7338842153549194, + "learning_rate": 4.563975465112325e-05, + "loss": 5.435, + "step": 32090 + }, + { + "epoch": 0.19085426777048245, + "grad_norm": 1.6438137292861938, + "learning_rate": 4.5639491077741786e-05, + "loss": 5.39, + "step": 32091 + }, + { + "epoch": 0.19086021505376344, + "grad_norm": 1.5553499460220337, + "learning_rate": 4.563922749715529e-05, + "loss": 5.4373, + "step": 32092 + }, + { + "epoch": 0.19086616233704443, + "grad_norm": 2.2094454765319824, + "learning_rate": 4.5638963909363845e-05, + "loss": 3.9242, + "step": 32093 + }, + { + "epoch": 0.19087210962032544, + "grad_norm": 2.6177477836608887, + "learning_rate": 4.563870031436756e-05, + "loss": 3.6645, + "step": 32094 + }, + { + "epoch": 0.19087805690360643, + "grad_norm": 2.5494978427886963, + "learning_rate": 4.563843671216651e-05, + "loss": 3.6596, + "step": 32095 + }, + { + "epoch": 0.19088400418688742, + "grad_norm": 2.483102798461914, + "learning_rate": 4.56381731027608e-05, + "loss": 3.7689, + "step": 32096 + }, + { + "epoch": 0.19088995147016843, + "grad_norm": 2.2079670429229736, + "learning_rate": 4.563790948615052e-05, + "loss": 3.599, + "step": 32097 + }, + { + "epoch": 0.19089589875344942, + "grad_norm": 2.243823766708374, + "learning_rate": 4.563764586233575e-05, + "loss": 4.1006, + "step": 32098 + }, + { + "epoch": 0.1909018460367304, + "grad_norm": 2.53912091255188, + "learning_rate": 4.5637382231316595e-05, + "loss": 3.9895, + "step": 32099 + }, + { + "epoch": 0.19090779332001143, + "grad_norm": 3.3844449520111084, + "learning_rate": 4.563711859309314e-05, + "loss": 3.0281, + "step": 32100 + }, + { + "epoch": 0.19091374060329241, + "grad_norm": 3.253937244415283, + "learning_rate": 4.563685494766549e-05, + "loss": 2.2503, + "step": 32101 + }, + { + "epoch": 0.1909196878865734, + "grad_norm": 2.649468183517456, + "learning_rate": 4.5636591295033723e-05, + "loss": 2.2933, + "step": 32102 + }, + { + "epoch": 0.19092563516985442, + "grad_norm": 2.899502992630005, + "learning_rate": 4.563632763519793e-05, + "loss": 3.5769, + "step": 32103 + }, + { + "epoch": 0.1909315824531354, + "grad_norm": 2.6918673515319824, + "learning_rate": 4.563606396815821e-05, + "loss": 3.7763, + "step": 32104 + }, + { + "epoch": 0.1909375297364164, + "grad_norm": 2.239382028579712, + "learning_rate": 4.5635800293914654e-05, + "loss": 5.3445, + "step": 32105 + }, + { + "epoch": 0.1909434770196974, + "grad_norm": 3.2034034729003906, + "learning_rate": 4.563553661246736e-05, + "loss": 3.0084, + "step": 32106 + }, + { + "epoch": 0.1909494243029784, + "grad_norm": 3.6095571517944336, + "learning_rate": 4.5635272923816406e-05, + "loss": 3.1999, + "step": 32107 + }, + { + "epoch": 0.1909553715862594, + "grad_norm": 2.625148057937622, + "learning_rate": 4.5635009227961886e-05, + "loss": 2.4227, + "step": 32108 + }, + { + "epoch": 0.1909613188695404, + "grad_norm": 2.8703715801239014, + "learning_rate": 4.563474552490391e-05, + "loss": 2.4958, + "step": 32109 + }, + { + "epoch": 0.1909672661528214, + "grad_norm": 2.1197023391723633, + "learning_rate": 4.563448181464255e-05, + "loss": 4.5549, + "step": 32110 + }, + { + "epoch": 0.19097321343610238, + "grad_norm": 2.1457529067993164, + "learning_rate": 4.563421809717791e-05, + "loss": 5.0002, + "step": 32111 + }, + { + "epoch": 0.1909791607193834, + "grad_norm": 1.7680915594100952, + "learning_rate": 4.563395437251007e-05, + "loss": 5.9225, + "step": 32112 + }, + { + "epoch": 0.19098510800266438, + "grad_norm": 1.8019880056381226, + "learning_rate": 4.5633690640639135e-05, + "loss": 5.5489, + "step": 32113 + }, + { + "epoch": 0.19099105528594537, + "grad_norm": 2.032569408416748, + "learning_rate": 4.563342690156519e-05, + "loss": 4.7542, + "step": 32114 + }, + { + "epoch": 0.1909970025692264, + "grad_norm": 2.230560541152954, + "learning_rate": 4.563316315528834e-05, + "loss": 4.009, + "step": 32115 + }, + { + "epoch": 0.19100294985250738, + "grad_norm": 1.82839834690094, + "learning_rate": 4.563289940180865e-05, + "loss": 4.3989, + "step": 32116 + }, + { + "epoch": 0.19100889713578836, + "grad_norm": 2.581059694290161, + "learning_rate": 4.5632635641126234e-05, + "loss": 3.6806, + "step": 32117 + }, + { + "epoch": 0.19101484441906938, + "grad_norm": 3.031672716140747, + "learning_rate": 4.563237187324118e-05, + "loss": 3.1234, + "step": 32118 + }, + { + "epoch": 0.19102079170235037, + "grad_norm": 2.618824005126953, + "learning_rate": 4.5632108098153576e-05, + "loss": 3.2917, + "step": 32119 + }, + { + "epoch": 0.19102673898563136, + "grad_norm": 2.6035311222076416, + "learning_rate": 4.563184431586351e-05, + "loss": 3.8774, + "step": 32120 + }, + { + "epoch": 0.19103268626891237, + "grad_norm": 2.398284673690796, + "learning_rate": 4.56315805263711e-05, + "loss": 3.6489, + "step": 32121 + }, + { + "epoch": 0.19103863355219336, + "grad_norm": 1.7356419563293457, + "learning_rate": 4.56313167296764e-05, + "loss": 4.4262, + "step": 32122 + }, + { + "epoch": 0.19104458083547435, + "grad_norm": 2.066359281539917, + "learning_rate": 4.5631052925779526e-05, + "loss": 5.0187, + "step": 32123 + }, + { + "epoch": 0.19105052811875536, + "grad_norm": 1.9970104694366455, + "learning_rate": 4.563078911468056e-05, + "loss": 5.2749, + "step": 32124 + }, + { + "epoch": 0.19105647540203635, + "grad_norm": 1.5883231163024902, + "learning_rate": 4.5630525296379604e-05, + "loss": 5.1499, + "step": 32125 + }, + { + "epoch": 0.19106242268531734, + "grad_norm": 1.4975897073745728, + "learning_rate": 4.5630261470876745e-05, + "loss": 5.2505, + "step": 32126 + }, + { + "epoch": 0.19106836996859836, + "grad_norm": 1.7211848497390747, + "learning_rate": 4.562999763817207e-05, + "loss": 5.2076, + "step": 32127 + }, + { + "epoch": 0.19107431725187934, + "grad_norm": 2.7472379207611084, + "learning_rate": 4.562973379826568e-05, + "loss": 3.2447, + "step": 32128 + }, + { + "epoch": 0.19108026453516033, + "grad_norm": 1.8915821313858032, + "learning_rate": 4.5629469951157667e-05, + "loss": 4.7631, + "step": 32129 + }, + { + "epoch": 0.19108621181844135, + "grad_norm": 2.09493350982666, + "learning_rate": 4.562920609684812e-05, + "loss": 5.094, + "step": 32130 + }, + { + "epoch": 0.19109215910172234, + "grad_norm": 2.0612680912017822, + "learning_rate": 4.562894223533712e-05, + "loss": 4.9167, + "step": 32131 + }, + { + "epoch": 0.19109810638500332, + "grad_norm": 2.0020735263824463, + "learning_rate": 4.562867836662478e-05, + "loss": 4.8969, + "step": 32132 + }, + { + "epoch": 0.19110405366828434, + "grad_norm": 1.935276985168457, + "learning_rate": 4.562841449071117e-05, + "loss": 5.1707, + "step": 32133 + }, + { + "epoch": 0.19111000095156533, + "grad_norm": 1.9530506134033203, + "learning_rate": 4.56281506075964e-05, + "loss": 4.84, + "step": 32134 + }, + { + "epoch": 0.19111594823484632, + "grad_norm": 1.9572010040283203, + "learning_rate": 4.5627886717280557e-05, + "loss": 4.5848, + "step": 32135 + }, + { + "epoch": 0.19112189551812733, + "grad_norm": 1.5390974283218384, + "learning_rate": 4.562762281976373e-05, + "loss": 4.8283, + "step": 32136 + }, + { + "epoch": 0.19112784280140832, + "grad_norm": 1.5366077423095703, + "learning_rate": 4.5627358915046015e-05, + "loss": 4.7778, + "step": 32137 + }, + { + "epoch": 0.1911337900846893, + "grad_norm": 2.3752942085266113, + "learning_rate": 4.56270950031275e-05, + "loss": 4.7363, + "step": 32138 + }, + { + "epoch": 0.19113973736797032, + "grad_norm": 2.1061747074127197, + "learning_rate": 4.562683108400828e-05, + "loss": 4.9934, + "step": 32139 + }, + { + "epoch": 0.1911456846512513, + "grad_norm": 1.8647900819778442, + "learning_rate": 4.562656715768844e-05, + "loss": 4.6105, + "step": 32140 + }, + { + "epoch": 0.1911516319345323, + "grad_norm": 3.472999334335327, + "learning_rate": 4.5626303224168085e-05, + "loss": 3.5257, + "step": 32141 + }, + { + "epoch": 0.1911575792178133, + "grad_norm": 3.206305503845215, + "learning_rate": 4.562603928344731e-05, + "loss": 3.2121, + "step": 32142 + }, + { + "epoch": 0.1911635265010943, + "grad_norm": 2.549485683441162, + "learning_rate": 4.562577533552618e-05, + "loss": 4.0823, + "step": 32143 + }, + { + "epoch": 0.1911694737843753, + "grad_norm": 2.314005136489868, + "learning_rate": 4.5625511380404816e-05, + "loss": 4.4944, + "step": 32144 + }, + { + "epoch": 0.19117542106765628, + "grad_norm": 1.9643429517745972, + "learning_rate": 4.562524741808329e-05, + "loss": 4.4158, + "step": 32145 + }, + { + "epoch": 0.1911813683509373, + "grad_norm": 3.2631421089172363, + "learning_rate": 4.562498344856172e-05, + "loss": 2.6561, + "step": 32146 + }, + { + "epoch": 0.19118731563421829, + "grad_norm": 2.985316514968872, + "learning_rate": 4.5624719471840166e-05, + "loss": 2.5877, + "step": 32147 + }, + { + "epoch": 0.19119326291749927, + "grad_norm": 2.864422559738159, + "learning_rate": 4.562445548791874e-05, + "loss": 3.0388, + "step": 32148 + }, + { + "epoch": 0.1911992102007803, + "grad_norm": 3.315479040145874, + "learning_rate": 4.562419149679753e-05, + "loss": 2.1148, + "step": 32149 + }, + { + "epoch": 0.19120515748406128, + "grad_norm": 2.9864017963409424, + "learning_rate": 4.562392749847663e-05, + "loss": 2.0678, + "step": 32150 + }, + { + "epoch": 0.19121110476734227, + "grad_norm": 2.3070502281188965, + "learning_rate": 4.562366349295613e-05, + "loss": 3.8237, + "step": 32151 + }, + { + "epoch": 0.19121705205062328, + "grad_norm": 3.103196620941162, + "learning_rate": 4.562339948023611e-05, + "loss": 2.2358, + "step": 32152 + }, + { + "epoch": 0.19122299933390427, + "grad_norm": 3.050037384033203, + "learning_rate": 4.562313546031669e-05, + "loss": 2.3102, + "step": 32153 + }, + { + "epoch": 0.19122894661718526, + "grad_norm": 3.0590415000915527, + "learning_rate": 4.562287143319794e-05, + "loss": 2.6283, + "step": 32154 + }, + { + "epoch": 0.19123489390046627, + "grad_norm": 3.2240819931030273, + "learning_rate": 4.5622607398879956e-05, + "loss": 1.8508, + "step": 32155 + }, + { + "epoch": 0.19124084118374726, + "grad_norm": 3.0105197429656982, + "learning_rate": 4.562234335736284e-05, + "loss": 2.0124, + "step": 32156 + }, + { + "epoch": 0.19124678846702825, + "grad_norm": 2.1753182411193848, + "learning_rate": 4.562207930864667e-05, + "loss": 4.0, + "step": 32157 + }, + { + "epoch": 0.19125273575030927, + "grad_norm": 1.8794794082641602, + "learning_rate": 4.562181525273155e-05, + "loss": 4.8347, + "step": 32158 + }, + { + "epoch": 0.19125868303359025, + "grad_norm": 1.8856089115142822, + "learning_rate": 4.5621551189617564e-05, + "loss": 4.8426, + "step": 32159 + }, + { + "epoch": 0.19126463031687124, + "grad_norm": 1.557360291481018, + "learning_rate": 4.562128711930481e-05, + "loss": 5.9689, + "step": 32160 + }, + { + "epoch": 0.19127057760015226, + "grad_norm": 1.6035491228103638, + "learning_rate": 4.562102304179338e-05, + "loss": 5.5338, + "step": 32161 + }, + { + "epoch": 0.19127652488343325, + "grad_norm": 1.6492342948913574, + "learning_rate": 4.562075895708335e-05, + "loss": 5.1785, + "step": 32162 + }, + { + "epoch": 0.19128247216671423, + "grad_norm": 1.8426982164382935, + "learning_rate": 4.5620494865174846e-05, + "loss": 4.9595, + "step": 32163 + }, + { + "epoch": 0.19128841944999525, + "grad_norm": 1.4188711643218994, + "learning_rate": 4.562023076606793e-05, + "loss": 5.0459, + "step": 32164 + }, + { + "epoch": 0.19129436673327624, + "grad_norm": 1.7565912008285522, + "learning_rate": 4.5619966659762705e-05, + "loss": 4.7912, + "step": 32165 + }, + { + "epoch": 0.19130031401655723, + "grad_norm": 1.7340164184570312, + "learning_rate": 4.561970254625926e-05, + "loss": 5.7673, + "step": 32166 + }, + { + "epoch": 0.19130626129983824, + "grad_norm": 1.6327145099639893, + "learning_rate": 4.561943842555769e-05, + "loss": 5.7126, + "step": 32167 + }, + { + "epoch": 0.19131220858311923, + "grad_norm": 1.762123703956604, + "learning_rate": 4.56191742976581e-05, + "loss": 4.8158, + "step": 32168 + }, + { + "epoch": 0.19131815586640022, + "grad_norm": 1.8846794366836548, + "learning_rate": 4.561891016256055e-05, + "loss": 4.9619, + "step": 32169 + }, + { + "epoch": 0.19132410314968123, + "grad_norm": 1.7014399766921997, + "learning_rate": 4.5618646020265165e-05, + "loss": 5.3071, + "step": 32170 + }, + { + "epoch": 0.19133005043296222, + "grad_norm": 1.6020660400390625, + "learning_rate": 4.561838187077202e-05, + "loss": 5.1499, + "step": 32171 + }, + { + "epoch": 0.1913359977162432, + "grad_norm": 1.6537081003189087, + "learning_rate": 4.561811771408121e-05, + "loss": 5.0837, + "step": 32172 + }, + { + "epoch": 0.19134194499952423, + "grad_norm": 1.7917143106460571, + "learning_rate": 4.5617853550192826e-05, + "loss": 4.9645, + "step": 32173 + }, + { + "epoch": 0.19134789228280522, + "grad_norm": 1.5786539316177368, + "learning_rate": 4.561758937910696e-05, + "loss": 5.1049, + "step": 32174 + }, + { + "epoch": 0.1913538395660862, + "grad_norm": 1.4922531843185425, + "learning_rate": 4.5617325200823715e-05, + "loss": 5.8555, + "step": 32175 + }, + { + "epoch": 0.19135978684936722, + "grad_norm": 1.5561562776565552, + "learning_rate": 4.561706101534317e-05, + "loss": 5.4443, + "step": 32176 + }, + { + "epoch": 0.1913657341326482, + "grad_norm": 1.5478456020355225, + "learning_rate": 4.5616796822665425e-05, + "loss": 5.1066, + "step": 32177 + }, + { + "epoch": 0.1913716814159292, + "grad_norm": 1.5855305194854736, + "learning_rate": 4.561653262279057e-05, + "loss": 4.898, + "step": 32178 + }, + { + "epoch": 0.1913776286992102, + "grad_norm": 1.540865182876587, + "learning_rate": 4.5616268415718686e-05, + "loss": 5.6622, + "step": 32179 + }, + { + "epoch": 0.1913835759824912, + "grad_norm": 1.6557115316390991, + "learning_rate": 4.561600420144989e-05, + "loss": 5.6751, + "step": 32180 + }, + { + "epoch": 0.1913895232657722, + "grad_norm": 1.9517576694488525, + "learning_rate": 4.561573997998425e-05, + "loss": 5.058, + "step": 32181 + }, + { + "epoch": 0.1913954705490532, + "grad_norm": 1.6348973512649536, + "learning_rate": 4.561547575132188e-05, + "loss": 4.8167, + "step": 32182 + }, + { + "epoch": 0.1914014178323342, + "grad_norm": 1.6906663179397583, + "learning_rate": 4.561521151546284e-05, + "loss": 4.9711, + "step": 32183 + }, + { + "epoch": 0.19140736511561518, + "grad_norm": 2.0812265872955322, + "learning_rate": 4.561494727240726e-05, + "loss": 5.4894, + "step": 32184 + }, + { + "epoch": 0.1914133123988962, + "grad_norm": 1.509619116783142, + "learning_rate": 4.561468302215521e-05, + "loss": 5.646, + "step": 32185 + }, + { + "epoch": 0.19141925968217718, + "grad_norm": 1.4971113204956055, + "learning_rate": 4.561441876470679e-05, + "loss": 5.3408, + "step": 32186 + }, + { + "epoch": 0.19142520696545817, + "grad_norm": 1.7571582794189453, + "learning_rate": 4.5614154500062084e-05, + "loss": 4.5655, + "step": 32187 + }, + { + "epoch": 0.1914311542487392, + "grad_norm": 1.7431119680404663, + "learning_rate": 4.561389022822119e-05, + "loss": 4.6405, + "step": 32188 + }, + { + "epoch": 0.19143710153202018, + "grad_norm": 1.694812297821045, + "learning_rate": 4.56136259491842e-05, + "loss": 4.6362, + "step": 32189 + }, + { + "epoch": 0.19144304881530116, + "grad_norm": 1.6445132493972778, + "learning_rate": 4.5613361662951206e-05, + "loss": 4.5245, + "step": 32190 + }, + { + "epoch": 0.19144899609858218, + "grad_norm": 1.6429940462112427, + "learning_rate": 4.561309736952231e-05, + "loss": 4.3761, + "step": 32191 + }, + { + "epoch": 0.19145494338186317, + "grad_norm": 1.5726126432418823, + "learning_rate": 4.561283306889759e-05, + "loss": 4.4205, + "step": 32192 + }, + { + "epoch": 0.19146089066514416, + "grad_norm": 1.6214704513549805, + "learning_rate": 4.561256876107713e-05, + "loss": 3.8221, + "step": 32193 + }, + { + "epoch": 0.19146683794842517, + "grad_norm": 1.7851347923278809, + "learning_rate": 4.561230444606105e-05, + "loss": 3.9998, + "step": 32194 + }, + { + "epoch": 0.19147278523170616, + "grad_norm": 1.6264827251434326, + "learning_rate": 4.561204012384942e-05, + "loss": 3.8884, + "step": 32195 + }, + { + "epoch": 0.19147873251498715, + "grad_norm": 1.7741085290908813, + "learning_rate": 4.5611775794442346e-05, + "loss": 4.003, + "step": 32196 + }, + { + "epoch": 0.19148467979826816, + "grad_norm": 1.7139368057250977, + "learning_rate": 4.561151145783991e-05, + "loss": 4.7756, + "step": 32197 + }, + { + "epoch": 0.19149062708154915, + "grad_norm": 2.100501775741577, + "learning_rate": 4.56112471140422e-05, + "loss": 4.8871, + "step": 32198 + }, + { + "epoch": 0.19149657436483014, + "grad_norm": 2.702474355697632, + "learning_rate": 4.561098276304933e-05, + "loss": 4.7146, + "step": 32199 + }, + { + "epoch": 0.19150252164811113, + "grad_norm": 2.8174052238464355, + "learning_rate": 4.5610718404861375e-05, + "loss": 4.7004, + "step": 32200 + }, + { + "epoch": 0.19150846893139215, + "grad_norm": 2.7826762199401855, + "learning_rate": 4.561045403947843e-05, + "loss": 4.4944, + "step": 32201 + }, + { + "epoch": 0.19151441621467313, + "grad_norm": 2.7929837703704834, + "learning_rate": 4.5610189666900585e-05, + "loss": 4.5329, + "step": 32202 + }, + { + "epoch": 0.19152036349795412, + "grad_norm": 1.817031741142273, + "learning_rate": 4.560992528712794e-05, + "loss": 4.3808, + "step": 32203 + }, + { + "epoch": 0.19152631078123514, + "grad_norm": 1.8460679054260254, + "learning_rate": 4.5609660900160584e-05, + "loss": 5.214, + "step": 32204 + }, + { + "epoch": 0.19153225806451613, + "grad_norm": 1.5890216827392578, + "learning_rate": 4.5609396505998604e-05, + "loss": 5.8593, + "step": 32205 + }, + { + "epoch": 0.1915382053477971, + "grad_norm": 1.7008686065673828, + "learning_rate": 4.56091321046421e-05, + "loss": 5.0469, + "step": 32206 + }, + { + "epoch": 0.19154415263107813, + "grad_norm": 1.6503829956054688, + "learning_rate": 4.560886769609116e-05, + "loss": 4.8837, + "step": 32207 + }, + { + "epoch": 0.19155009991435912, + "grad_norm": 2.075839042663574, + "learning_rate": 4.560860328034588e-05, + "loss": 4.7162, + "step": 32208 + }, + { + "epoch": 0.1915560471976401, + "grad_norm": 1.648373007774353, + "learning_rate": 4.560833885740635e-05, + "loss": 4.7624, + "step": 32209 + }, + { + "epoch": 0.19156199448092112, + "grad_norm": 1.5314377546310425, + "learning_rate": 4.560807442727265e-05, + "loss": 4.5067, + "step": 32210 + }, + { + "epoch": 0.1915679417642021, + "grad_norm": 1.555922269821167, + "learning_rate": 4.5607809989944894e-05, + "loss": 4.6025, + "step": 32211 + }, + { + "epoch": 0.1915738890474831, + "grad_norm": 1.4894582033157349, + "learning_rate": 4.5607545545423166e-05, + "loss": 5.1213, + "step": 32212 + }, + { + "epoch": 0.19157983633076411, + "grad_norm": 1.6521800756454468, + "learning_rate": 4.5607281093707554e-05, + "loss": 5.2245, + "step": 32213 + }, + { + "epoch": 0.1915857836140451, + "grad_norm": 1.6810579299926758, + "learning_rate": 4.560701663479815e-05, + "loss": 5.1212, + "step": 32214 + }, + { + "epoch": 0.1915917308973261, + "grad_norm": 1.622002124786377, + "learning_rate": 4.5606752168695055e-05, + "loss": 4.6669, + "step": 32215 + }, + { + "epoch": 0.1915976781806071, + "grad_norm": 1.5422900915145874, + "learning_rate": 4.560648769539835e-05, + "loss": 5.0407, + "step": 32216 + }, + { + "epoch": 0.1916036254638881, + "grad_norm": 1.907077431678772, + "learning_rate": 4.560622321490814e-05, + "loss": 4.6231, + "step": 32217 + }, + { + "epoch": 0.19160957274716908, + "grad_norm": 1.7939534187316895, + "learning_rate": 4.560595872722451e-05, + "loss": 5.0616, + "step": 32218 + }, + { + "epoch": 0.1916155200304501, + "grad_norm": 1.643505573272705, + "learning_rate": 4.560569423234755e-05, + "loss": 5.1006, + "step": 32219 + }, + { + "epoch": 0.1916214673137311, + "grad_norm": 1.5651081800460815, + "learning_rate": 4.560542973027735e-05, + "loss": 4.9603, + "step": 32220 + }, + { + "epoch": 0.19162741459701207, + "grad_norm": 1.8621745109558105, + "learning_rate": 4.560516522101401e-05, + "loss": 4.4891, + "step": 32221 + }, + { + "epoch": 0.1916333618802931, + "grad_norm": 1.6403456926345825, + "learning_rate": 4.560490070455762e-05, + "loss": 4.7257, + "step": 32222 + }, + { + "epoch": 0.19163930916357408, + "grad_norm": 1.6413493156433105, + "learning_rate": 4.560463618090828e-05, + "loss": 4.707, + "step": 32223 + }, + { + "epoch": 0.19164525644685507, + "grad_norm": 1.6639118194580078, + "learning_rate": 4.560437165006606e-05, + "loss": 4.8485, + "step": 32224 + }, + { + "epoch": 0.19165120373013608, + "grad_norm": 1.8064452409744263, + "learning_rate": 4.560410711203108e-05, + "loss": 4.568, + "step": 32225 + }, + { + "epoch": 0.19165715101341707, + "grad_norm": 1.610862135887146, + "learning_rate": 4.560384256680341e-05, + "loss": 4.509, + "step": 32226 + }, + { + "epoch": 0.19166309829669806, + "grad_norm": 1.6460599899291992, + "learning_rate": 4.560357801438315e-05, + "loss": 4.5947, + "step": 32227 + }, + { + "epoch": 0.19166904557997907, + "grad_norm": 1.7876906394958496, + "learning_rate": 4.5603313454770404e-05, + "loss": 5.1632, + "step": 32228 + }, + { + "epoch": 0.19167499286326006, + "grad_norm": 1.642273187637329, + "learning_rate": 4.560304888796525e-05, + "loss": 5.3845, + "step": 32229 + }, + { + "epoch": 0.19168094014654105, + "grad_norm": 1.5736563205718994, + "learning_rate": 4.5602784313967784e-05, + "loss": 4.8355, + "step": 32230 + }, + { + "epoch": 0.19168688742982207, + "grad_norm": 1.5724636316299438, + "learning_rate": 4.56025197327781e-05, + "loss": 4.6927, + "step": 32231 + }, + { + "epoch": 0.19169283471310306, + "grad_norm": 1.7795807123184204, + "learning_rate": 4.560225514439628e-05, + "loss": 4.9267, + "step": 32232 + }, + { + "epoch": 0.19169878199638404, + "grad_norm": 1.7334511280059814, + "learning_rate": 4.5601990548822436e-05, + "loss": 4.9678, + "step": 32233 + }, + { + "epoch": 0.19170472927966506, + "grad_norm": 1.7917529344558716, + "learning_rate": 4.560172594605665e-05, + "loss": 4.4641, + "step": 32234 + }, + { + "epoch": 0.19171067656294605, + "grad_norm": 1.7616432905197144, + "learning_rate": 4.560146133609901e-05, + "loss": 4.2135, + "step": 32235 + }, + { + "epoch": 0.19171662384622704, + "grad_norm": 1.7462329864501953, + "learning_rate": 4.5601196718949614e-05, + "loss": 4.2736, + "step": 32236 + }, + { + "epoch": 0.19172257112950805, + "grad_norm": 1.7398426532745361, + "learning_rate": 4.560093209460855e-05, + "loss": 4.5267, + "step": 32237 + }, + { + "epoch": 0.19172851841278904, + "grad_norm": 2.057189464569092, + "learning_rate": 4.5600667463075916e-05, + "loss": 4.4011, + "step": 32238 + }, + { + "epoch": 0.19173446569607003, + "grad_norm": 1.757520318031311, + "learning_rate": 4.560040282435181e-05, + "loss": 4.883, + "step": 32239 + }, + { + "epoch": 0.19174041297935104, + "grad_norm": 1.9065808057785034, + "learning_rate": 4.56001381784363e-05, + "loss": 5.2427, + "step": 32240 + }, + { + "epoch": 0.19174636026263203, + "grad_norm": 1.8721452951431274, + "learning_rate": 4.5599873525329505e-05, + "loss": 4.6069, + "step": 32241 + }, + { + "epoch": 0.19175230754591302, + "grad_norm": 1.7365145683288574, + "learning_rate": 4.559960886503151e-05, + "loss": 4.7842, + "step": 32242 + }, + { + "epoch": 0.19175825482919404, + "grad_norm": 1.9017609357833862, + "learning_rate": 4.559934419754239e-05, + "loss": 4.9601, + "step": 32243 + }, + { + "epoch": 0.19176420211247502, + "grad_norm": 1.5734943151474, + "learning_rate": 4.559907952286226e-05, + "loss": 4.9961, + "step": 32244 + }, + { + "epoch": 0.191770149395756, + "grad_norm": 1.8231675624847412, + "learning_rate": 4.559881484099121e-05, + "loss": 4.8581, + "step": 32245 + }, + { + "epoch": 0.19177609667903703, + "grad_norm": 1.657576560974121, + "learning_rate": 4.559855015192933e-05, + "loss": 4.739, + "step": 32246 + }, + { + "epoch": 0.19178204396231802, + "grad_norm": 1.801846981048584, + "learning_rate": 4.5598285455676694e-05, + "loss": 4.9883, + "step": 32247 + }, + { + "epoch": 0.191787991245599, + "grad_norm": 1.6971237659454346, + "learning_rate": 4.5598020752233414e-05, + "loss": 5.0313, + "step": 32248 + }, + { + "epoch": 0.19179393852888002, + "grad_norm": 1.7070435285568237, + "learning_rate": 4.559775604159958e-05, + "loss": 5.1385, + "step": 32249 + }, + { + "epoch": 0.191799885812161, + "grad_norm": 1.8786835670471191, + "learning_rate": 4.559749132377529e-05, + "loss": 4.5535, + "step": 32250 + }, + { + "epoch": 0.191805833095442, + "grad_norm": 2.4562580585479736, + "learning_rate": 4.559722659876061e-05, + "loss": 4.3224, + "step": 32251 + }, + { + "epoch": 0.191811780378723, + "grad_norm": 2.2160046100616455, + "learning_rate": 4.5596961866555665e-05, + "loss": 4.1118, + "step": 32252 + }, + { + "epoch": 0.191817727662004, + "grad_norm": 1.9994086027145386, + "learning_rate": 4.559669712716053e-05, + "loss": 4.2414, + "step": 32253 + }, + { + "epoch": 0.191823674945285, + "grad_norm": 1.820043921470642, + "learning_rate": 4.55964323805753e-05, + "loss": 4.2737, + "step": 32254 + }, + { + "epoch": 0.191829622228566, + "grad_norm": 1.5855984687805176, + "learning_rate": 4.559616762680008e-05, + "loss": 5.3039, + "step": 32255 + }, + { + "epoch": 0.191835569511847, + "grad_norm": 2.3250484466552734, + "learning_rate": 4.5595902865834924e-05, + "loss": 4.2797, + "step": 32256 + }, + { + "epoch": 0.19184151679512798, + "grad_norm": 2.333883285522461, + "learning_rate": 4.559563809767997e-05, + "loss": 3.8071, + "step": 32257 + }, + { + "epoch": 0.19184746407840897, + "grad_norm": 2.718810796737671, + "learning_rate": 4.559537332233529e-05, + "loss": 3.7749, + "step": 32258 + }, + { + "epoch": 0.19185341136168998, + "grad_norm": 2.366673469543457, + "learning_rate": 4.559510853980098e-05, + "loss": 3.7677, + "step": 32259 + }, + { + "epoch": 0.19185935864497097, + "grad_norm": 2.129708766937256, + "learning_rate": 4.559484375007713e-05, + "loss": 4.2157, + "step": 32260 + }, + { + "epoch": 0.19186530592825196, + "grad_norm": 1.6533547639846802, + "learning_rate": 4.559457895316382e-05, + "loss": 5.3304, + "step": 32261 + }, + { + "epoch": 0.19187125321153298, + "grad_norm": 1.7954961061477661, + "learning_rate": 4.5594314149061166e-05, + "loss": 5.5018, + "step": 32262 + }, + { + "epoch": 0.19187720049481397, + "grad_norm": 1.6768550872802734, + "learning_rate": 4.559404933776925e-05, + "loss": 5.1384, + "step": 32263 + }, + { + "epoch": 0.19188314777809495, + "grad_norm": 1.719743013381958, + "learning_rate": 4.5593784519288165e-05, + "loss": 4.7757, + "step": 32264 + }, + { + "epoch": 0.19188909506137597, + "grad_norm": 1.8016587495803833, + "learning_rate": 4.5593519693618e-05, + "loss": 4.6744, + "step": 32265 + }, + { + "epoch": 0.19189504234465696, + "grad_norm": 2.5645673274993896, + "learning_rate": 4.559325486075885e-05, + "loss": 3.7642, + "step": 32266 + }, + { + "epoch": 0.19190098962793795, + "grad_norm": 1.8583413362503052, + "learning_rate": 4.559299002071081e-05, + "loss": 4.0051, + "step": 32267 + }, + { + "epoch": 0.19190693691121896, + "grad_norm": 1.8633100986480713, + "learning_rate": 4.5592725173473964e-05, + "loss": 4.9233, + "step": 32268 + }, + { + "epoch": 0.19191288419449995, + "grad_norm": 1.5865737199783325, + "learning_rate": 4.5592460319048415e-05, + "loss": 4.9211, + "step": 32269 + }, + { + "epoch": 0.19191883147778094, + "grad_norm": 1.6763908863067627, + "learning_rate": 4.559219545743425e-05, + "loss": 5.0468, + "step": 32270 + }, + { + "epoch": 0.19192477876106195, + "grad_norm": 1.7344523668289185, + "learning_rate": 4.559193058863156e-05, + "loss": 4.9828, + "step": 32271 + }, + { + "epoch": 0.19193072604434294, + "grad_norm": 1.5607928037643433, + "learning_rate": 4.559166571264045e-05, + "loss": 5.0621, + "step": 32272 + }, + { + "epoch": 0.19193667332762393, + "grad_norm": 1.6489591598510742, + "learning_rate": 4.559140082946099e-05, + "loss": 5.075, + "step": 32273 + }, + { + "epoch": 0.19194262061090495, + "grad_norm": 2.018446207046509, + "learning_rate": 4.5591135939093286e-05, + "loss": 4.6623, + "step": 32274 + }, + { + "epoch": 0.19194856789418593, + "grad_norm": 1.7344367504119873, + "learning_rate": 4.559087104153743e-05, + "loss": 4.7845, + "step": 32275 + }, + { + "epoch": 0.19195451517746692, + "grad_norm": 1.705333948135376, + "learning_rate": 4.5590606136793524e-05, + "loss": 4.7642, + "step": 32276 + }, + { + "epoch": 0.19196046246074794, + "grad_norm": 1.784136176109314, + "learning_rate": 4.5590341224861635e-05, + "loss": 4.7684, + "step": 32277 + }, + { + "epoch": 0.19196640974402893, + "grad_norm": 2.4740941524505615, + "learning_rate": 4.559007630574188e-05, + "loss": 4.5288, + "step": 32278 + }, + { + "epoch": 0.19197235702730991, + "grad_norm": 1.569096326828003, + "learning_rate": 4.558981137943434e-05, + "loss": 4.3353, + "step": 32279 + }, + { + "epoch": 0.19197830431059093, + "grad_norm": 1.7482889890670776, + "learning_rate": 4.558954644593911e-05, + "loss": 4.7269, + "step": 32280 + }, + { + "epoch": 0.19198425159387192, + "grad_norm": 1.8372119665145874, + "learning_rate": 4.558928150525628e-05, + "loss": 5.0292, + "step": 32281 + }, + { + "epoch": 0.1919901988771529, + "grad_norm": 1.5978213548660278, + "learning_rate": 4.558901655738594e-05, + "loss": 5.2528, + "step": 32282 + }, + { + "epoch": 0.19199614616043392, + "grad_norm": 1.761826515197754, + "learning_rate": 4.55887516023282e-05, + "loss": 5.1056, + "step": 32283 + }, + { + "epoch": 0.1920020934437149, + "grad_norm": 1.9713767766952515, + "learning_rate": 4.5588486640083134e-05, + "loss": 4.6393, + "step": 32284 + }, + { + "epoch": 0.1920080407269959, + "grad_norm": 2.8644440174102783, + "learning_rate": 4.558822167065084e-05, + "loss": 3.3245, + "step": 32285 + }, + { + "epoch": 0.19201398801027691, + "grad_norm": 2.9571681022644043, + "learning_rate": 4.558795669403141e-05, + "loss": 3.1213, + "step": 32286 + }, + { + "epoch": 0.1920199352935579, + "grad_norm": 2.428436279296875, + "learning_rate": 4.5587691710224935e-05, + "loss": 4.4437, + "step": 32287 + }, + { + "epoch": 0.1920258825768389, + "grad_norm": 2.6146740913391113, + "learning_rate": 4.5587426719231506e-05, + "loss": 3.8897, + "step": 32288 + }, + { + "epoch": 0.1920318298601199, + "grad_norm": 1.8975647687911987, + "learning_rate": 4.5587161721051226e-05, + "loss": 4.214, + "step": 32289 + }, + { + "epoch": 0.1920377771434009, + "grad_norm": 2.0018997192382812, + "learning_rate": 4.558689671568418e-05, + "loss": 4.4845, + "step": 32290 + }, + { + "epoch": 0.19204372442668188, + "grad_norm": 1.9768357276916504, + "learning_rate": 4.558663170313046e-05, + "loss": 3.9529, + "step": 32291 + }, + { + "epoch": 0.1920496717099629, + "grad_norm": 2.3069944381713867, + "learning_rate": 4.558636668339016e-05, + "loss": 4.0115, + "step": 32292 + }, + { + "epoch": 0.1920556189932439, + "grad_norm": 3.4548919200897217, + "learning_rate": 4.5586101656463365e-05, + "loss": 4.1852, + "step": 32293 + }, + { + "epoch": 0.19206156627652488, + "grad_norm": 2.8961174488067627, + "learning_rate": 4.558583662235018e-05, + "loss": 4.1678, + "step": 32294 + }, + { + "epoch": 0.1920675135598059, + "grad_norm": 2.465935468673706, + "learning_rate": 4.558557158105069e-05, + "loss": 4.5003, + "step": 32295 + }, + { + "epoch": 0.19207346084308688, + "grad_norm": 1.8684260845184326, + "learning_rate": 4.5585306532564995e-05, + "loss": 5.1349, + "step": 32296 + }, + { + "epoch": 0.19207940812636787, + "grad_norm": 1.5592044591903687, + "learning_rate": 4.558504147689317e-05, + "loss": 4.959, + "step": 32297 + }, + { + "epoch": 0.19208535540964888, + "grad_norm": 1.5255820751190186, + "learning_rate": 4.5584776414035334e-05, + "loss": 5.3422, + "step": 32298 + }, + { + "epoch": 0.19209130269292987, + "grad_norm": 1.5954620838165283, + "learning_rate": 4.5584511343991566e-05, + "loss": 5.5565, + "step": 32299 + }, + { + "epoch": 0.19209724997621086, + "grad_norm": 1.7753039598464966, + "learning_rate": 4.5584246266761957e-05, + "loss": 4.8505, + "step": 32300 + }, + { + "epoch": 0.19210319725949188, + "grad_norm": 1.7820825576782227, + "learning_rate": 4.5583981182346594e-05, + "loss": 4.8029, + "step": 32301 + }, + { + "epoch": 0.19210914454277286, + "grad_norm": 2.3863165378570557, + "learning_rate": 4.558371609074557e-05, + "loss": 4.3447, + "step": 32302 + }, + { + "epoch": 0.19211509182605385, + "grad_norm": 1.9576959609985352, + "learning_rate": 4.558345099195899e-05, + "loss": 4.5883, + "step": 32303 + }, + { + "epoch": 0.19212103910933487, + "grad_norm": 3.7398250102996826, + "learning_rate": 4.558318588598694e-05, + "loss": 4.473, + "step": 32304 + }, + { + "epoch": 0.19212698639261586, + "grad_norm": 1.9002548456192017, + "learning_rate": 4.5582920772829515e-05, + "loss": 4.4629, + "step": 32305 + }, + { + "epoch": 0.19213293367589684, + "grad_norm": 1.493038535118103, + "learning_rate": 4.55826556524868e-05, + "loss": 5.047, + "step": 32306 + }, + { + "epoch": 0.19213888095917786, + "grad_norm": 1.5841251611709595, + "learning_rate": 4.558239052495889e-05, + "loss": 5.2142, + "step": 32307 + }, + { + "epoch": 0.19214482824245885, + "grad_norm": 1.573392629623413, + "learning_rate": 4.558212539024589e-05, + "loss": 5.0779, + "step": 32308 + }, + { + "epoch": 0.19215077552573984, + "grad_norm": 1.6538444757461548, + "learning_rate": 4.558186024834788e-05, + "loss": 5.0981, + "step": 32309 + }, + { + "epoch": 0.19215672280902085, + "grad_norm": 1.6234486103057861, + "learning_rate": 4.5581595099264954e-05, + "loss": 4.4117, + "step": 32310 + }, + { + "epoch": 0.19216267009230184, + "grad_norm": 1.9895765781402588, + "learning_rate": 4.55813299429972e-05, + "loss": 4.9125, + "step": 32311 + }, + { + "epoch": 0.19216861737558283, + "grad_norm": 1.8990195989608765, + "learning_rate": 4.558106477954473e-05, + "loss": 4.4664, + "step": 32312 + }, + { + "epoch": 0.19217456465886384, + "grad_norm": 2.297137498855591, + "learning_rate": 4.558079960890761e-05, + "loss": 4.1712, + "step": 32313 + }, + { + "epoch": 0.19218051194214483, + "grad_norm": 1.7623494863510132, + "learning_rate": 4.5580534431085955e-05, + "loss": 4.6109, + "step": 32314 + }, + { + "epoch": 0.19218645922542582, + "grad_norm": 1.7693278789520264, + "learning_rate": 4.558026924607984e-05, + "loss": 4.2927, + "step": 32315 + }, + { + "epoch": 0.1921924065087068, + "grad_norm": 1.873820424079895, + "learning_rate": 4.5580004053889366e-05, + "loss": 4.2318, + "step": 32316 + }, + { + "epoch": 0.19219835379198782, + "grad_norm": 2.195477247238159, + "learning_rate": 4.557973885451463e-05, + "loss": 4.0132, + "step": 32317 + }, + { + "epoch": 0.1922043010752688, + "grad_norm": 1.898896336555481, + "learning_rate": 4.5579473647955714e-05, + "loss": 4.2662, + "step": 32318 + }, + { + "epoch": 0.1922102483585498, + "grad_norm": 1.6225823163986206, + "learning_rate": 4.5579208434212725e-05, + "loss": 4.5132, + "step": 32319 + }, + { + "epoch": 0.19221619564183082, + "grad_norm": 1.6939207315444946, + "learning_rate": 4.557894321328574e-05, + "loss": 4.268, + "step": 32320 + }, + { + "epoch": 0.1922221429251118, + "grad_norm": 2.0803871154785156, + "learning_rate": 4.5578677985174854e-05, + "loss": 3.8686, + "step": 32321 + }, + { + "epoch": 0.1922280902083928, + "grad_norm": 2.1536972522735596, + "learning_rate": 4.5578412749880176e-05, + "loss": 4.0257, + "step": 32322 + }, + { + "epoch": 0.1922340374916738, + "grad_norm": 2.5257532596588135, + "learning_rate": 4.5578147507401784e-05, + "loss": 4.1924, + "step": 32323 + }, + { + "epoch": 0.1922399847749548, + "grad_norm": 2.1187551021575928, + "learning_rate": 4.5577882257739766e-05, + "loss": 4.4954, + "step": 32324 + }, + { + "epoch": 0.19224593205823579, + "grad_norm": 1.5388280153274536, + "learning_rate": 4.5577617000894225e-05, + "loss": 4.3945, + "step": 32325 + }, + { + "epoch": 0.1922518793415168, + "grad_norm": 1.9172996282577515, + "learning_rate": 4.5577351736865255e-05, + "loss": 3.8684, + "step": 32326 + }, + { + "epoch": 0.1922578266247978, + "grad_norm": 1.9113341569900513, + "learning_rate": 4.5577086465652944e-05, + "loss": 4.1434, + "step": 32327 + }, + { + "epoch": 0.19226377390807878, + "grad_norm": 2.022688865661621, + "learning_rate": 4.557682118725738e-05, + "loss": 3.8283, + "step": 32328 + }, + { + "epoch": 0.1922697211913598, + "grad_norm": 1.9376680850982666, + "learning_rate": 4.5576555901678665e-05, + "loss": 3.8662, + "step": 32329 + }, + { + "epoch": 0.19227566847464078, + "grad_norm": 1.8243870735168457, + "learning_rate": 4.557629060891688e-05, + "loss": 3.8863, + "step": 32330 + }, + { + "epoch": 0.19228161575792177, + "grad_norm": 2.059737205505371, + "learning_rate": 4.557602530897213e-05, + "loss": 4.0976, + "step": 32331 + }, + { + "epoch": 0.19228756304120279, + "grad_norm": 2.3299999237060547, + "learning_rate": 4.5575760001844494e-05, + "loss": 4.1038, + "step": 32332 + }, + { + "epoch": 0.19229351032448377, + "grad_norm": 2.2769482135772705, + "learning_rate": 4.557549468753408e-05, + "loss": 4.1553, + "step": 32333 + }, + { + "epoch": 0.19229945760776476, + "grad_norm": 1.806193470954895, + "learning_rate": 4.5575229366040975e-05, + "loss": 5.0664, + "step": 32334 + }, + { + "epoch": 0.19230540489104578, + "grad_norm": 1.7117464542388916, + "learning_rate": 4.557496403736527e-05, + "loss": 4.9323, + "step": 32335 + }, + { + "epoch": 0.19231135217432677, + "grad_norm": 1.8777096271514893, + "learning_rate": 4.557469870150706e-05, + "loss": 4.1108, + "step": 32336 + }, + { + "epoch": 0.19231729945760775, + "grad_norm": 2.475425958633423, + "learning_rate": 4.557443335846643e-05, + "loss": 3.5492, + "step": 32337 + }, + { + "epoch": 0.19232324674088877, + "grad_norm": 1.8026037216186523, + "learning_rate": 4.5574168008243474e-05, + "loss": 4.6844, + "step": 32338 + }, + { + "epoch": 0.19232919402416976, + "grad_norm": 1.6337968111038208, + "learning_rate": 4.557390265083829e-05, + "loss": 5.6839, + "step": 32339 + }, + { + "epoch": 0.19233514130745075, + "grad_norm": 1.5935887098312378, + "learning_rate": 4.557363728625098e-05, + "loss": 4.8699, + "step": 32340 + }, + { + "epoch": 0.19234108859073176, + "grad_norm": 1.7847586870193481, + "learning_rate": 4.557337191448161e-05, + "loss": 5.2093, + "step": 32341 + }, + { + "epoch": 0.19234703587401275, + "grad_norm": 1.8020373582839966, + "learning_rate": 4.5573106535530295e-05, + "loss": 4.7497, + "step": 32342 + }, + { + "epoch": 0.19235298315729374, + "grad_norm": 1.8254616260528564, + "learning_rate": 4.557284114939713e-05, + "loss": 4.8339, + "step": 32343 + }, + { + "epoch": 0.19235893044057475, + "grad_norm": 1.8874919414520264, + "learning_rate": 4.5572575756082184e-05, + "loss": 4.9181, + "step": 32344 + }, + { + "epoch": 0.19236487772385574, + "grad_norm": 2.2326526641845703, + "learning_rate": 4.5572310355585574e-05, + "loss": 4.7761, + "step": 32345 + }, + { + "epoch": 0.19237082500713673, + "grad_norm": 1.892404556274414, + "learning_rate": 4.557204494790738e-05, + "loss": 4.6584, + "step": 32346 + }, + { + "epoch": 0.19237677229041775, + "grad_norm": 1.560922622680664, + "learning_rate": 4.55717795330477e-05, + "loss": 5.5746, + "step": 32347 + }, + { + "epoch": 0.19238271957369873, + "grad_norm": 1.8915635347366333, + "learning_rate": 4.5571514111006616e-05, + "loss": 5.3499, + "step": 32348 + }, + { + "epoch": 0.19238866685697972, + "grad_norm": 2.5687575340270996, + "learning_rate": 4.557124868178424e-05, + "loss": 3.8322, + "step": 32349 + }, + { + "epoch": 0.19239461414026074, + "grad_norm": 2.195842742919922, + "learning_rate": 4.557098324538065e-05, + "loss": 3.7871, + "step": 32350 + }, + { + "epoch": 0.19240056142354173, + "grad_norm": 2.0727920532226562, + "learning_rate": 4.557071780179594e-05, + "loss": 4.1199, + "step": 32351 + }, + { + "epoch": 0.19240650870682272, + "grad_norm": 1.653414011001587, + "learning_rate": 4.557045235103021e-05, + "loss": 4.7838, + "step": 32352 + }, + { + "epoch": 0.19241245599010373, + "grad_norm": 2.0768418312072754, + "learning_rate": 4.557018689308354e-05, + "loss": 4.5005, + "step": 32353 + }, + { + "epoch": 0.19241840327338472, + "grad_norm": 2.001793622970581, + "learning_rate": 4.5569921427956034e-05, + "loss": 4.1306, + "step": 32354 + }, + { + "epoch": 0.1924243505566657, + "grad_norm": 2.1622116565704346, + "learning_rate": 4.556965595564778e-05, + "loss": 3.8781, + "step": 32355 + }, + { + "epoch": 0.19243029783994672, + "grad_norm": 2.073871374130249, + "learning_rate": 4.556939047615888e-05, + "loss": 3.8632, + "step": 32356 + }, + { + "epoch": 0.1924362451232277, + "grad_norm": 2.1704652309417725, + "learning_rate": 4.5569124989489404e-05, + "loss": 3.7858, + "step": 32357 + }, + { + "epoch": 0.1924421924065087, + "grad_norm": 1.822009801864624, + "learning_rate": 4.556885949563947e-05, + "loss": 4.1138, + "step": 32358 + }, + { + "epoch": 0.19244813968978972, + "grad_norm": 1.777799367904663, + "learning_rate": 4.556859399460916e-05, + "loss": 4.6384, + "step": 32359 + }, + { + "epoch": 0.1924540869730707, + "grad_norm": 1.926173448562622, + "learning_rate": 4.556832848639855e-05, + "loss": 5.0295, + "step": 32360 + }, + { + "epoch": 0.1924600342563517, + "grad_norm": 1.8721709251403809, + "learning_rate": 4.5568062971007764e-05, + "loss": 5.0287, + "step": 32361 + }, + { + "epoch": 0.1924659815396327, + "grad_norm": 1.7049319744110107, + "learning_rate": 4.556779744843688e-05, + "loss": 4.3578, + "step": 32362 + }, + { + "epoch": 0.1924719288229137, + "grad_norm": 1.873555302619934, + "learning_rate": 4.5567531918685984e-05, + "loss": 3.983, + "step": 32363 + }, + { + "epoch": 0.19247787610619468, + "grad_norm": 2.7111735343933105, + "learning_rate": 4.556726638175518e-05, + "loss": 4.0963, + "step": 32364 + }, + { + "epoch": 0.1924838233894757, + "grad_norm": 2.063129425048828, + "learning_rate": 4.5567000837644555e-05, + "loss": 3.8396, + "step": 32365 + }, + { + "epoch": 0.1924897706727567, + "grad_norm": 2.247694969177246, + "learning_rate": 4.55667352863542e-05, + "loss": 3.8202, + "step": 32366 + }, + { + "epoch": 0.19249571795603768, + "grad_norm": 2.430349349975586, + "learning_rate": 4.556646972788421e-05, + "loss": 3.9607, + "step": 32367 + }, + { + "epoch": 0.1925016652393187, + "grad_norm": 2.3638129234313965, + "learning_rate": 4.556620416223468e-05, + "loss": 4.2398, + "step": 32368 + }, + { + "epoch": 0.19250761252259968, + "grad_norm": 2.057927370071411, + "learning_rate": 4.55659385894057e-05, + "loss": 4.3298, + "step": 32369 + }, + { + "epoch": 0.19251355980588067, + "grad_norm": 1.7141249179840088, + "learning_rate": 4.5565673009397366e-05, + "loss": 4.7427, + "step": 32370 + }, + { + "epoch": 0.19251950708916168, + "grad_norm": 1.7085816860198975, + "learning_rate": 4.556540742220976e-05, + "loss": 4.9883, + "step": 32371 + }, + { + "epoch": 0.19252545437244267, + "grad_norm": 1.463494896888733, + "learning_rate": 4.5565141827842996e-05, + "loss": 4.7302, + "step": 32372 + }, + { + "epoch": 0.19253140165572366, + "grad_norm": 1.647187352180481, + "learning_rate": 4.556487622629714e-05, + "loss": 4.7999, + "step": 32373 + }, + { + "epoch": 0.19253734893900465, + "grad_norm": 1.526756763458252, + "learning_rate": 4.55646106175723e-05, + "loss": 4.8183, + "step": 32374 + }, + { + "epoch": 0.19254329622228566, + "grad_norm": 1.2896729707717896, + "learning_rate": 4.556434500166858e-05, + "loss": 4.6084, + "step": 32375 + }, + { + "epoch": 0.19254924350556665, + "grad_norm": 1.6381428241729736, + "learning_rate": 4.556407937858605e-05, + "loss": 5.186, + "step": 32376 + }, + { + "epoch": 0.19255519078884764, + "grad_norm": 3.1183688640594482, + "learning_rate": 4.5563813748324804e-05, + "loss": 4.0471, + "step": 32377 + }, + { + "epoch": 0.19256113807212866, + "grad_norm": 2.0422890186309814, + "learning_rate": 4.556354811088496e-05, + "loss": 4.7993, + "step": 32378 + }, + { + "epoch": 0.19256708535540965, + "grad_norm": 1.9046860933303833, + "learning_rate": 4.5563282466266574e-05, + "loss": 4.5938, + "step": 32379 + }, + { + "epoch": 0.19257303263869063, + "grad_norm": 1.9312288761138916, + "learning_rate": 4.5563016814469776e-05, + "loss": 4.4985, + "step": 32380 + }, + { + "epoch": 0.19257897992197165, + "grad_norm": 1.828894853591919, + "learning_rate": 4.556275115549464e-05, + "loss": 4.3507, + "step": 32381 + }, + { + "epoch": 0.19258492720525264, + "grad_norm": 1.8356082439422607, + "learning_rate": 4.5562485489341256e-05, + "loss": 4.8413, + "step": 32382 + }, + { + "epoch": 0.19259087448853363, + "grad_norm": 1.6310971975326538, + "learning_rate": 4.5562219816009716e-05, + "loss": 4.9384, + "step": 32383 + }, + { + "epoch": 0.19259682177181464, + "grad_norm": 1.6916502714157104, + "learning_rate": 4.556195413550012e-05, + "loss": 4.7824, + "step": 32384 + }, + { + "epoch": 0.19260276905509563, + "grad_norm": 1.468487024307251, + "learning_rate": 4.556168844781256e-05, + "loss": 4.635, + "step": 32385 + }, + { + "epoch": 0.19260871633837662, + "grad_norm": 1.5585215091705322, + "learning_rate": 4.5561422752947124e-05, + "loss": 4.6277, + "step": 32386 + }, + { + "epoch": 0.19261466362165763, + "grad_norm": 1.7868255376815796, + "learning_rate": 4.556115705090391e-05, + "loss": 4.7244, + "step": 32387 + }, + { + "epoch": 0.19262061090493862, + "grad_norm": 1.7397072315216064, + "learning_rate": 4.556089134168301e-05, + "loss": 4.4101, + "step": 32388 + }, + { + "epoch": 0.1926265581882196, + "grad_norm": 2.364893674850464, + "learning_rate": 4.556062562528452e-05, + "loss": 3.7639, + "step": 32389 + }, + { + "epoch": 0.19263250547150063, + "grad_norm": 2.690023899078369, + "learning_rate": 4.5560359901708524e-05, + "loss": 3.6474, + "step": 32390 + }, + { + "epoch": 0.19263845275478161, + "grad_norm": 2.4105823040008545, + "learning_rate": 4.5560094170955116e-05, + "loss": 3.5294, + "step": 32391 + }, + { + "epoch": 0.1926444000380626, + "grad_norm": 2.0659773349761963, + "learning_rate": 4.5559828433024385e-05, + "loss": 3.7319, + "step": 32392 + }, + { + "epoch": 0.19265034732134362, + "grad_norm": 2.075104236602783, + "learning_rate": 4.5559562687916445e-05, + "loss": 5.1514, + "step": 32393 + }, + { + "epoch": 0.1926562946046246, + "grad_norm": 1.8932725191116333, + "learning_rate": 4.5559296935631365e-05, + "loss": 4.7867, + "step": 32394 + }, + { + "epoch": 0.1926622418879056, + "grad_norm": 2.2901084423065186, + "learning_rate": 4.5559031176169246e-05, + "loss": 3.2693, + "step": 32395 + }, + { + "epoch": 0.1926681891711866, + "grad_norm": 2.2299394607543945, + "learning_rate": 4.555876540953019e-05, + "loss": 3.5186, + "step": 32396 + }, + { + "epoch": 0.1926741364544676, + "grad_norm": 2.254751443862915, + "learning_rate": 4.555849963571427e-05, + "loss": 3.4051, + "step": 32397 + }, + { + "epoch": 0.1926800837377486, + "grad_norm": 1.9714645147323608, + "learning_rate": 4.55582338547216e-05, + "loss": 3.3963, + "step": 32398 + }, + { + "epoch": 0.1926860310210296, + "grad_norm": 2.350437641143799, + "learning_rate": 4.555796806655226e-05, + "loss": 3.3453, + "step": 32399 + }, + { + "epoch": 0.1926919783043106, + "grad_norm": 2.113746166229248, + "learning_rate": 4.555770227120634e-05, + "loss": 3.5508, + "step": 32400 + }, + { + "epoch": 0.19269792558759158, + "grad_norm": 2.558175563812256, + "learning_rate": 4.555743646868395e-05, + "loss": 3.4324, + "step": 32401 + }, + { + "epoch": 0.1927038728708726, + "grad_norm": 2.097472667694092, + "learning_rate": 4.555717065898516e-05, + "loss": 3.4113, + "step": 32402 + }, + { + "epoch": 0.19270982015415358, + "grad_norm": 2.507054567337036, + "learning_rate": 4.555690484211008e-05, + "loss": 4.0971, + "step": 32403 + }, + { + "epoch": 0.19271576743743457, + "grad_norm": 1.6816004514694214, + "learning_rate": 4.5556639018058793e-05, + "loss": 5.3368, + "step": 32404 + }, + { + "epoch": 0.1927217147207156, + "grad_norm": 1.6590732336044312, + "learning_rate": 4.55563731868314e-05, + "loss": 4.9217, + "step": 32405 + }, + { + "epoch": 0.19272766200399657, + "grad_norm": 1.6414915323257446, + "learning_rate": 4.555610734842799e-05, + "loss": 4.0729, + "step": 32406 + }, + { + "epoch": 0.19273360928727756, + "grad_norm": 1.5531092882156372, + "learning_rate": 4.555584150284865e-05, + "loss": 3.8463, + "step": 32407 + }, + { + "epoch": 0.19273955657055858, + "grad_norm": 1.6675087213516235, + "learning_rate": 4.5555575650093484e-05, + "loss": 3.7309, + "step": 32408 + }, + { + "epoch": 0.19274550385383957, + "grad_norm": 1.5836836099624634, + "learning_rate": 4.555530979016257e-05, + "loss": 4.0221, + "step": 32409 + }, + { + "epoch": 0.19275145113712056, + "grad_norm": 2.2653143405914307, + "learning_rate": 4.5555043923056015e-05, + "loss": 4.4883, + "step": 32410 + }, + { + "epoch": 0.19275739842040157, + "grad_norm": 2.1392593383789062, + "learning_rate": 4.555477804877392e-05, + "loss": 4.3895, + "step": 32411 + }, + { + "epoch": 0.19276334570368256, + "grad_norm": 1.947454571723938, + "learning_rate": 4.555451216731634e-05, + "loss": 4.4879, + "step": 32412 + }, + { + "epoch": 0.19276929298696355, + "grad_norm": 1.9660381078720093, + "learning_rate": 4.555424627868341e-05, + "loss": 4.399, + "step": 32413 + }, + { + "epoch": 0.19277524027024456, + "grad_norm": 1.8891009092330933, + "learning_rate": 4.555398038287519e-05, + "loss": 3.8344, + "step": 32414 + }, + { + "epoch": 0.19278118755352555, + "grad_norm": 1.7115179300308228, + "learning_rate": 4.5553714479891804e-05, + "loss": 3.8118, + "step": 32415 + }, + { + "epoch": 0.19278713483680654, + "grad_norm": 2.0297632217407227, + "learning_rate": 4.555344856973332e-05, + "loss": 3.9253, + "step": 32416 + }, + { + "epoch": 0.19279308212008756, + "grad_norm": 1.6160376071929932, + "learning_rate": 4.555318265239984e-05, + "loss": 4.1249, + "step": 32417 + }, + { + "epoch": 0.19279902940336854, + "grad_norm": 1.6909234523773193, + "learning_rate": 4.555291672789146e-05, + "loss": 5.1648, + "step": 32418 + }, + { + "epoch": 0.19280497668664953, + "grad_norm": 1.8374849557876587, + "learning_rate": 4.5552650796208265e-05, + "loss": 5.3176, + "step": 32419 + }, + { + "epoch": 0.19281092396993055, + "grad_norm": 1.8304452896118164, + "learning_rate": 4.555238485735035e-05, + "loss": 4.7696, + "step": 32420 + }, + { + "epoch": 0.19281687125321154, + "grad_norm": 1.974797010421753, + "learning_rate": 4.555211891131782e-05, + "loss": 4.2615, + "step": 32421 + }, + { + "epoch": 0.19282281853649252, + "grad_norm": 1.8388688564300537, + "learning_rate": 4.555185295811075e-05, + "loss": 4.5941, + "step": 32422 + }, + { + "epoch": 0.19282876581977354, + "grad_norm": 1.852777361869812, + "learning_rate": 4.555158699772924e-05, + "loss": 4.379, + "step": 32423 + }, + { + "epoch": 0.19283471310305453, + "grad_norm": 1.903781771659851, + "learning_rate": 4.5551321030173376e-05, + "loss": 5.0642, + "step": 32424 + }, + { + "epoch": 0.19284066038633552, + "grad_norm": 1.830812692642212, + "learning_rate": 4.555105505544327e-05, + "loss": 5.2191, + "step": 32425 + }, + { + "epoch": 0.19284660766961653, + "grad_norm": 1.7071088552474976, + "learning_rate": 4.5550789073539e-05, + "loss": 4.6368, + "step": 32426 + }, + { + "epoch": 0.19285255495289752, + "grad_norm": 1.6677404642105103, + "learning_rate": 4.5550523084460664e-05, + "loss": 4.9055, + "step": 32427 + }, + { + "epoch": 0.1928585022361785, + "grad_norm": 1.7404626607894897, + "learning_rate": 4.555025708820835e-05, + "loss": 4.9614, + "step": 32428 + }, + { + "epoch": 0.19286444951945952, + "grad_norm": 1.6599600315093994, + "learning_rate": 4.554999108478215e-05, + "loss": 4.8015, + "step": 32429 + }, + { + "epoch": 0.1928703968027405, + "grad_norm": 2.5620381832122803, + "learning_rate": 4.554972507418217e-05, + "loss": 3.6728, + "step": 32430 + }, + { + "epoch": 0.1928763440860215, + "grad_norm": 2.435203790664673, + "learning_rate": 4.554945905640848e-05, + "loss": 4.3076, + "step": 32431 + }, + { + "epoch": 0.1928822913693025, + "grad_norm": 2.521820068359375, + "learning_rate": 4.55491930314612e-05, + "loss": 3.5146, + "step": 32432 + }, + { + "epoch": 0.1928882386525835, + "grad_norm": 2.866119861602783, + "learning_rate": 4.55489269993404e-05, + "loss": 2.8378, + "step": 32433 + }, + { + "epoch": 0.1928941859358645, + "grad_norm": 2.237283945083618, + "learning_rate": 4.554866096004619e-05, + "loss": 3.974, + "step": 32434 + }, + { + "epoch": 0.19290013321914548, + "grad_norm": 1.55573308467865, + "learning_rate": 4.5548394913578643e-05, + "loss": 5.4577, + "step": 32435 + }, + { + "epoch": 0.1929060805024265, + "grad_norm": 1.571730613708496, + "learning_rate": 4.554812885993787e-05, + "loss": 4.9712, + "step": 32436 + }, + { + "epoch": 0.19291202778570748, + "grad_norm": 2.1712872982025146, + "learning_rate": 4.554786279912395e-05, + "loss": 4.862, + "step": 32437 + }, + { + "epoch": 0.19291797506898847, + "grad_norm": 2.0782408714294434, + "learning_rate": 4.5547596731137e-05, + "loss": 4.4303, + "step": 32438 + }, + { + "epoch": 0.1929239223522695, + "grad_norm": 1.590576171875, + "learning_rate": 4.554733065597708e-05, + "loss": 4.885, + "step": 32439 + }, + { + "epoch": 0.19292986963555048, + "grad_norm": 1.8684148788452148, + "learning_rate": 4.5547064573644306e-05, + "loss": 5.0804, + "step": 32440 + }, + { + "epoch": 0.19293581691883147, + "grad_norm": 2.731905937194824, + "learning_rate": 4.554679848413876e-05, + "loss": 3.6933, + "step": 32441 + }, + { + "epoch": 0.19294176420211248, + "grad_norm": 3.1184794902801514, + "learning_rate": 4.554653238746055e-05, + "loss": 3.3802, + "step": 32442 + }, + { + "epoch": 0.19294771148539347, + "grad_norm": 1.865108847618103, + "learning_rate": 4.554626628360975e-05, + "loss": 4.882, + "step": 32443 + }, + { + "epoch": 0.19295365876867446, + "grad_norm": 2.3064188957214355, + "learning_rate": 4.554600017258646e-05, + "loss": 4.858, + "step": 32444 + }, + { + "epoch": 0.19295960605195547, + "grad_norm": 2.0467426776885986, + "learning_rate": 4.554573405439078e-05, + "loss": 4.5301, + "step": 32445 + }, + { + "epoch": 0.19296555333523646, + "grad_norm": 1.878140926361084, + "learning_rate": 4.554546792902279e-05, + "loss": 4.7595, + "step": 32446 + }, + { + "epoch": 0.19297150061851745, + "grad_norm": 1.7915738821029663, + "learning_rate": 4.554520179648259e-05, + "loss": 4.4789, + "step": 32447 + }, + { + "epoch": 0.19297744790179847, + "grad_norm": 1.7169902324676514, + "learning_rate": 4.554493565677027e-05, + "loss": 5.3695, + "step": 32448 + }, + { + "epoch": 0.19298339518507945, + "grad_norm": 1.7827154397964478, + "learning_rate": 4.554466950988593e-05, + "loss": 5.4014, + "step": 32449 + }, + { + "epoch": 0.19298934246836044, + "grad_norm": 2.4304897785186768, + "learning_rate": 4.5544403355829656e-05, + "loss": 3.4442, + "step": 32450 + }, + { + "epoch": 0.19299528975164146, + "grad_norm": 2.8224079608917236, + "learning_rate": 4.554413719460154e-05, + "loss": 3.2425, + "step": 32451 + }, + { + "epoch": 0.19300123703492245, + "grad_norm": 2.2338883876800537, + "learning_rate": 4.554387102620169e-05, + "loss": 3.3087, + "step": 32452 + }, + { + "epoch": 0.19300718431820343, + "grad_norm": 1.79100501537323, + "learning_rate": 4.5543604850630174e-05, + "loss": 5.0432, + "step": 32453 + }, + { + "epoch": 0.19301313160148445, + "grad_norm": 1.68960440158844, + "learning_rate": 4.5543338667887104e-05, + "loss": 5.031, + "step": 32454 + }, + { + "epoch": 0.19301907888476544, + "grad_norm": 2.2218265533447266, + "learning_rate": 4.554307247797256e-05, + "loss": 3.4563, + "step": 32455 + }, + { + "epoch": 0.19302502616804643, + "grad_norm": 2.5037896633148193, + "learning_rate": 4.554280628088665e-05, + "loss": 3.4917, + "step": 32456 + }, + { + "epoch": 0.19303097345132744, + "grad_norm": 2.1465871334075928, + "learning_rate": 4.554254007662946e-05, + "loss": 3.3443, + "step": 32457 + }, + { + "epoch": 0.19303692073460843, + "grad_norm": 2.032118320465088, + "learning_rate": 4.554227386520107e-05, + "loss": 3.5841, + "step": 32458 + }, + { + "epoch": 0.19304286801788942, + "grad_norm": 2.566612482070923, + "learning_rate": 4.554200764660159e-05, + "loss": 4.4753, + "step": 32459 + }, + { + "epoch": 0.19304881530117043, + "grad_norm": 3.184678077697754, + "learning_rate": 4.5541741420831105e-05, + "loss": 3.56, + "step": 32460 + }, + { + "epoch": 0.19305476258445142, + "grad_norm": 2.433135986328125, + "learning_rate": 4.554147518788972e-05, + "loss": 3.6974, + "step": 32461 + }, + { + "epoch": 0.1930607098677324, + "grad_norm": 2.502509355545044, + "learning_rate": 4.554120894777751e-05, + "loss": 3.5679, + "step": 32462 + }, + { + "epoch": 0.19306665715101343, + "grad_norm": 2.334136724472046, + "learning_rate": 4.5540942700494585e-05, + "loss": 3.6318, + "step": 32463 + }, + { + "epoch": 0.19307260443429441, + "grad_norm": 2.52958083152771, + "learning_rate": 4.554067644604102e-05, + "loss": 3.372, + "step": 32464 + }, + { + "epoch": 0.1930785517175754, + "grad_norm": 2.6455636024475098, + "learning_rate": 4.554041018441692e-05, + "loss": 3.5264, + "step": 32465 + }, + { + "epoch": 0.19308449900085642, + "grad_norm": 3.023738145828247, + "learning_rate": 4.554014391562237e-05, + "loss": 3.2593, + "step": 32466 + }, + { + "epoch": 0.1930904462841374, + "grad_norm": 2.817189931869507, + "learning_rate": 4.553987763965747e-05, + "loss": 3.199, + "step": 32467 + }, + { + "epoch": 0.1930963935674184, + "grad_norm": 2.676410675048828, + "learning_rate": 4.553961135652232e-05, + "loss": 3.5504, + "step": 32468 + }, + { + "epoch": 0.1931023408506994, + "grad_norm": 2.2987060546875, + "learning_rate": 4.553934506621699e-05, + "loss": 3.4431, + "step": 32469 + }, + { + "epoch": 0.1931082881339804, + "grad_norm": 2.421534538269043, + "learning_rate": 4.5539078768741596e-05, + "loss": 3.6071, + "step": 32470 + }, + { + "epoch": 0.1931142354172614, + "grad_norm": 2.2620744705200195, + "learning_rate": 4.553881246409622e-05, + "loss": 3.5397, + "step": 32471 + }, + { + "epoch": 0.1931201827005424, + "grad_norm": 1.4946821928024292, + "learning_rate": 4.5538546152280956e-05, + "loss": 5.0372, + "step": 32472 + }, + { + "epoch": 0.1931261299838234, + "grad_norm": 2.9421093463897705, + "learning_rate": 4.55382798332959e-05, + "loss": 4.3724, + "step": 32473 + }, + { + "epoch": 0.19313207726710438, + "grad_norm": 2.7655880451202393, + "learning_rate": 4.553801350714114e-05, + "loss": 4.1008, + "step": 32474 + }, + { + "epoch": 0.1931380245503854, + "grad_norm": 2.118710994720459, + "learning_rate": 4.553774717381677e-05, + "loss": 3.538, + "step": 32475 + }, + { + "epoch": 0.19314397183366638, + "grad_norm": 2.1854286193847656, + "learning_rate": 4.5537480833322886e-05, + "loss": 3.5067, + "step": 32476 + }, + { + "epoch": 0.19314991911694737, + "grad_norm": 2.528470039367676, + "learning_rate": 4.553721448565959e-05, + "loss": 3.4925, + "step": 32477 + }, + { + "epoch": 0.1931558664002284, + "grad_norm": 2.235788583755493, + "learning_rate": 4.553694813082695e-05, + "loss": 3.492, + "step": 32478 + }, + { + "epoch": 0.19316181368350938, + "grad_norm": 2.3836355209350586, + "learning_rate": 4.5536681768825076e-05, + "loss": 3.2728, + "step": 32479 + }, + { + "epoch": 0.19316776096679036, + "grad_norm": 2.189574956893921, + "learning_rate": 4.5536415399654066e-05, + "loss": 3.3595, + "step": 32480 + }, + { + "epoch": 0.19317370825007138, + "grad_norm": 2.3354239463806152, + "learning_rate": 4.5536149023314e-05, + "loss": 3.5092, + "step": 32481 + }, + { + "epoch": 0.19317965553335237, + "grad_norm": 2.3847224712371826, + "learning_rate": 4.553588263980498e-05, + "loss": 3.4803, + "step": 32482 + }, + { + "epoch": 0.19318560281663336, + "grad_norm": 2.399078845977783, + "learning_rate": 4.553561624912709e-05, + "loss": 3.5816, + "step": 32483 + }, + { + "epoch": 0.19319155009991437, + "grad_norm": 2.423222064971924, + "learning_rate": 4.553534985128043e-05, + "loss": 3.5522, + "step": 32484 + }, + { + "epoch": 0.19319749738319536, + "grad_norm": 2.08549427986145, + "learning_rate": 4.553508344626509e-05, + "loss": 3.5339, + "step": 32485 + }, + { + "epoch": 0.19320344466647635, + "grad_norm": 1.9425020217895508, + "learning_rate": 4.553481703408118e-05, + "loss": 3.6995, + "step": 32486 + }, + { + "epoch": 0.19320939194975736, + "grad_norm": 1.8637107610702515, + "learning_rate": 4.553455061472876e-05, + "loss": 5.0999, + "step": 32487 + }, + { + "epoch": 0.19321533923303835, + "grad_norm": 1.5940055847167969, + "learning_rate": 4.553428418820794e-05, + "loss": 5.0079, + "step": 32488 + }, + { + "epoch": 0.19322128651631934, + "grad_norm": 1.6175649166107178, + "learning_rate": 4.553401775451882e-05, + "loss": 5.2676, + "step": 32489 + }, + { + "epoch": 0.19322723379960036, + "grad_norm": 1.4351513385772705, + "learning_rate": 4.553375131366149e-05, + "loss": 5.1476, + "step": 32490 + }, + { + "epoch": 0.19323318108288134, + "grad_norm": 1.4783397912979126, + "learning_rate": 4.553348486563603e-05, + "loss": 5.2757, + "step": 32491 + }, + { + "epoch": 0.19323912836616233, + "grad_norm": 1.5456229448318481, + "learning_rate": 4.5533218410442556e-05, + "loss": 5.1947, + "step": 32492 + }, + { + "epoch": 0.19324507564944332, + "grad_norm": 1.7031913995742798, + "learning_rate": 4.553295194808114e-05, + "loss": 4.8797, + "step": 32493 + }, + { + "epoch": 0.19325102293272434, + "grad_norm": 2.224454879760742, + "learning_rate": 4.553268547855188e-05, + "loss": 4.7078, + "step": 32494 + }, + { + "epoch": 0.19325697021600532, + "grad_norm": 2.446502685546875, + "learning_rate": 4.553241900185488e-05, + "loss": 4.085, + "step": 32495 + }, + { + "epoch": 0.1932629174992863, + "grad_norm": 1.8770337104797363, + "learning_rate": 4.553215251799021e-05, + "loss": 5.3268, + "step": 32496 + }, + { + "epoch": 0.19326886478256733, + "grad_norm": 2.499891996383667, + "learning_rate": 4.5531886026957994e-05, + "loss": 4.6427, + "step": 32497 + }, + { + "epoch": 0.19327481206584832, + "grad_norm": 2.7995948791503906, + "learning_rate": 4.553161952875829e-05, + "loss": 3.4704, + "step": 32498 + }, + { + "epoch": 0.1932807593491293, + "grad_norm": 2.4758827686309814, + "learning_rate": 4.553135302339123e-05, + "loss": 3.6428, + "step": 32499 + }, + { + "epoch": 0.19328670663241032, + "grad_norm": 1.7038213014602661, + "learning_rate": 4.553108651085688e-05, + "loss": 5.3407, + "step": 32500 + }, + { + "epoch": 0.1932926539156913, + "grad_norm": 1.8451898097991943, + "learning_rate": 4.5530819991155325e-05, + "loss": 5.1482, + "step": 32501 + }, + { + "epoch": 0.1932986011989723, + "grad_norm": 3.1001696586608887, + "learning_rate": 4.553055346428669e-05, + "loss": 3.8696, + "step": 32502 + }, + { + "epoch": 0.1933045484822533, + "grad_norm": 3.6699612140655518, + "learning_rate": 4.553028693025105e-05, + "loss": 4.1102, + "step": 32503 + }, + { + "epoch": 0.1933104957655343, + "grad_norm": 2.5202810764312744, + "learning_rate": 4.553002038904849e-05, + "loss": 4.1498, + "step": 32504 + }, + { + "epoch": 0.1933164430488153, + "grad_norm": 1.6970324516296387, + "learning_rate": 4.552975384067912e-05, + "loss": 4.9378, + "step": 32505 + }, + { + "epoch": 0.1933223903320963, + "grad_norm": 1.8077476024627686, + "learning_rate": 4.5529487285143026e-05, + "loss": 5.3876, + "step": 32506 + }, + { + "epoch": 0.1933283376153773, + "grad_norm": 1.61594820022583, + "learning_rate": 4.552922072244029e-05, + "loss": 5.316, + "step": 32507 + }, + { + "epoch": 0.19333428489865828, + "grad_norm": 1.7117811441421509, + "learning_rate": 4.552895415257102e-05, + "loss": 5.1787, + "step": 32508 + }, + { + "epoch": 0.1933402321819393, + "grad_norm": 1.8115290403366089, + "learning_rate": 4.5528687575535314e-05, + "loss": 5.347, + "step": 32509 + }, + { + "epoch": 0.19334617946522029, + "grad_norm": 1.6437400579452515, + "learning_rate": 4.552842099133324e-05, + "loss": 5.4069, + "step": 32510 + }, + { + "epoch": 0.19335212674850127, + "grad_norm": 1.8343757390975952, + "learning_rate": 4.5528154399964915e-05, + "loss": 4.6641, + "step": 32511 + }, + { + "epoch": 0.1933580740317823, + "grad_norm": 1.716610312461853, + "learning_rate": 4.552788780143042e-05, + "loss": 5.2652, + "step": 32512 + }, + { + "epoch": 0.19336402131506328, + "grad_norm": 1.73993980884552, + "learning_rate": 4.552762119572985e-05, + "loss": 5.1183, + "step": 32513 + }, + { + "epoch": 0.19336996859834427, + "grad_norm": 1.729629635810852, + "learning_rate": 4.55273545828633e-05, + "loss": 5.1342, + "step": 32514 + }, + { + "epoch": 0.19337591588162528, + "grad_norm": 1.6299128532409668, + "learning_rate": 4.552708796283087e-05, + "loss": 5.1742, + "step": 32515 + }, + { + "epoch": 0.19338186316490627, + "grad_norm": 2.0269429683685303, + "learning_rate": 4.552682133563264e-05, + "loss": 4.0218, + "step": 32516 + }, + { + "epoch": 0.19338781044818726, + "grad_norm": 2.795447826385498, + "learning_rate": 4.552655470126871e-05, + "loss": 3.5319, + "step": 32517 + }, + { + "epoch": 0.19339375773146827, + "grad_norm": 2.5553972721099854, + "learning_rate": 4.552628805973917e-05, + "loss": 3.6563, + "step": 32518 + }, + { + "epoch": 0.19339970501474926, + "grad_norm": 2.5591487884521484, + "learning_rate": 4.552602141104412e-05, + "loss": 3.5933, + "step": 32519 + }, + { + "epoch": 0.19340565229803025, + "grad_norm": 2.5582263469696045, + "learning_rate": 4.552575475518364e-05, + "loss": 3.325, + "step": 32520 + }, + { + "epoch": 0.19341159958131127, + "grad_norm": 1.9097342491149902, + "learning_rate": 4.552548809215784e-05, + "loss": 3.7953, + "step": 32521 + }, + { + "epoch": 0.19341754686459225, + "grad_norm": 1.9214484691619873, + "learning_rate": 4.552522142196679e-05, + "loss": 4.5439, + "step": 32522 + }, + { + "epoch": 0.19342349414787324, + "grad_norm": 2.408139944076538, + "learning_rate": 4.5524954744610614e-05, + "loss": 3.5626, + "step": 32523 + }, + { + "epoch": 0.19342944143115426, + "grad_norm": 2.6337690353393555, + "learning_rate": 4.552468806008938e-05, + "loss": 3.5864, + "step": 32524 + }, + { + "epoch": 0.19343538871443525, + "grad_norm": 2.5147154331207275, + "learning_rate": 4.552442136840319e-05, + "loss": 3.669, + "step": 32525 + }, + { + "epoch": 0.19344133599771623, + "grad_norm": 2.5548198223114014, + "learning_rate": 4.5524154669552136e-05, + "loss": 3.502, + "step": 32526 + }, + { + "epoch": 0.19344728328099725, + "grad_norm": 2.8102235794067383, + "learning_rate": 4.5523887963536316e-05, + "loss": 3.6769, + "step": 32527 + }, + { + "epoch": 0.19345323056427824, + "grad_norm": 2.849118947982788, + "learning_rate": 4.552362125035581e-05, + "loss": 3.3322, + "step": 32528 + }, + { + "epoch": 0.19345917784755923, + "grad_norm": 3.095203161239624, + "learning_rate": 4.552335453001073e-05, + "loss": 2.9217, + "step": 32529 + }, + { + "epoch": 0.19346512513084024, + "grad_norm": 2.7572739124298096, + "learning_rate": 4.5523087802501155e-05, + "loss": 3.0508, + "step": 32530 + }, + { + "epoch": 0.19347107241412123, + "grad_norm": 2.872610330581665, + "learning_rate": 4.5522821067827174e-05, + "loss": 3.501, + "step": 32531 + }, + { + "epoch": 0.19347701969740222, + "grad_norm": 2.6060242652893066, + "learning_rate": 4.5522554325988894e-05, + "loss": 3.0258, + "step": 32532 + }, + { + "epoch": 0.19348296698068324, + "grad_norm": 2.3665926456451416, + "learning_rate": 4.552228757698641e-05, + "loss": 3.551, + "step": 32533 + }, + { + "epoch": 0.19348891426396422, + "grad_norm": 2.1231276988983154, + "learning_rate": 4.55220208208198e-05, + "loss": 5.2629, + "step": 32534 + }, + { + "epoch": 0.1934948615472452, + "grad_norm": 1.810520052909851, + "learning_rate": 4.5521754057489166e-05, + "loss": 5.1596, + "step": 32535 + }, + { + "epoch": 0.19350080883052623, + "grad_norm": 2.078847885131836, + "learning_rate": 4.55214872869946e-05, + "loss": 5.1359, + "step": 32536 + }, + { + "epoch": 0.19350675611380722, + "grad_norm": 1.823213815689087, + "learning_rate": 4.5521220509336194e-05, + "loss": 5.3621, + "step": 32537 + }, + { + "epoch": 0.1935127033970882, + "grad_norm": 1.9431284666061401, + "learning_rate": 4.5520953724514034e-05, + "loss": 5.1437, + "step": 32538 + }, + { + "epoch": 0.19351865068036922, + "grad_norm": 2.144991636276245, + "learning_rate": 4.552068693252823e-05, + "loss": 4.718, + "step": 32539 + }, + { + "epoch": 0.1935245979636502, + "grad_norm": 1.8919559717178345, + "learning_rate": 4.552042013337887e-05, + "loss": 5.3912, + "step": 32540 + }, + { + "epoch": 0.1935305452469312, + "grad_norm": 2.217273473739624, + "learning_rate": 4.552015332706604e-05, + "loss": 5.0869, + "step": 32541 + }, + { + "epoch": 0.1935364925302122, + "grad_norm": 1.923957109451294, + "learning_rate": 4.5519886513589835e-05, + "loss": 4.5278, + "step": 32542 + }, + { + "epoch": 0.1935424398134932, + "grad_norm": 1.3886886835098267, + "learning_rate": 4.551961969295035e-05, + "loss": 5.2535, + "step": 32543 + }, + { + "epoch": 0.1935483870967742, + "grad_norm": 1.6783514022827148, + "learning_rate": 4.551935286514768e-05, + "loss": 5.23, + "step": 32544 + }, + { + "epoch": 0.1935543343800552, + "grad_norm": 2.014293670654297, + "learning_rate": 4.551908603018191e-05, + "loss": 4.3551, + "step": 32545 + }, + { + "epoch": 0.1935602816633362, + "grad_norm": 2.4212424755096436, + "learning_rate": 4.551881918805314e-05, + "loss": 4.1387, + "step": 32546 + }, + { + "epoch": 0.19356622894661718, + "grad_norm": 1.541458010673523, + "learning_rate": 4.5518552338761466e-05, + "loss": 5.1392, + "step": 32547 + }, + { + "epoch": 0.1935721762298982, + "grad_norm": 1.645020604133606, + "learning_rate": 4.551828548230698e-05, + "loss": 4.9932, + "step": 32548 + }, + { + "epoch": 0.19357812351317918, + "grad_norm": 1.833045244216919, + "learning_rate": 4.551801861868977e-05, + "loss": 4.5334, + "step": 32549 + }, + { + "epoch": 0.19358407079646017, + "grad_norm": 1.7046024799346924, + "learning_rate": 4.5517751747909925e-05, + "loss": 4.718, + "step": 32550 + }, + { + "epoch": 0.19359001807974116, + "grad_norm": 1.642063856124878, + "learning_rate": 4.551748486996755e-05, + "loss": 5.3216, + "step": 32551 + }, + { + "epoch": 0.19359596536302218, + "grad_norm": 1.8512293100357056, + "learning_rate": 4.5517217984862736e-05, + "loss": 4.2428, + "step": 32552 + }, + { + "epoch": 0.19360191264630316, + "grad_norm": 1.4418857097625732, + "learning_rate": 4.551695109259557e-05, + "loss": 5.1654, + "step": 32553 + }, + { + "epoch": 0.19360785992958415, + "grad_norm": 1.3343303203582764, + "learning_rate": 4.551668419316615e-05, + "loss": 4.236, + "step": 32554 + }, + { + "epoch": 0.19361380721286517, + "grad_norm": 1.4303019046783447, + "learning_rate": 4.5516417286574567e-05, + "loss": 5.2193, + "step": 32555 + }, + { + "epoch": 0.19361975449614616, + "grad_norm": 2.1168198585510254, + "learning_rate": 4.551615037282091e-05, + "loss": 4.3826, + "step": 32556 + }, + { + "epoch": 0.19362570177942715, + "grad_norm": 1.8565425872802734, + "learning_rate": 4.551588345190528e-05, + "loss": 4.4285, + "step": 32557 + }, + { + "epoch": 0.19363164906270816, + "grad_norm": 1.4676978588104248, + "learning_rate": 4.551561652382777e-05, + "loss": 5.1846, + "step": 32558 + }, + { + "epoch": 0.19363759634598915, + "grad_norm": 1.3559863567352295, + "learning_rate": 4.5515349588588465e-05, + "loss": 5.4113, + "step": 32559 + }, + { + "epoch": 0.19364354362927014, + "grad_norm": 1.5853360891342163, + "learning_rate": 4.551508264618747e-05, + "loss": 5.0185, + "step": 32560 + }, + { + "epoch": 0.19364949091255115, + "grad_norm": 1.5268198251724243, + "learning_rate": 4.551481569662487e-05, + "loss": 5.5332, + "step": 32561 + }, + { + "epoch": 0.19365543819583214, + "grad_norm": 1.873355507850647, + "learning_rate": 4.551454873990075e-05, + "loss": 4.9289, + "step": 32562 + }, + { + "epoch": 0.19366138547911313, + "grad_norm": 1.559546709060669, + "learning_rate": 4.5514281776015225e-05, + "loss": 4.7161, + "step": 32563 + }, + { + "epoch": 0.19366733276239415, + "grad_norm": 1.5937471389770508, + "learning_rate": 4.551401480496837e-05, + "loss": 5.2034, + "step": 32564 + }, + { + "epoch": 0.19367328004567513, + "grad_norm": 1.3408461809158325, + "learning_rate": 4.551374782676029e-05, + "loss": 5.4465, + "step": 32565 + }, + { + "epoch": 0.19367922732895612, + "grad_norm": 1.5724444389343262, + "learning_rate": 4.551348084139107e-05, + "loss": 4.877, + "step": 32566 + }, + { + "epoch": 0.19368517461223714, + "grad_norm": 1.6047838926315308, + "learning_rate": 4.55132138488608e-05, + "loss": 4.8731, + "step": 32567 + }, + { + "epoch": 0.19369112189551813, + "grad_norm": 1.4955263137817383, + "learning_rate": 4.5512946849169586e-05, + "loss": 5.1981, + "step": 32568 + }, + { + "epoch": 0.19369706917879911, + "grad_norm": 1.8634296655654907, + "learning_rate": 4.5512679842317506e-05, + "loss": 4.9656, + "step": 32569 + }, + { + "epoch": 0.19370301646208013, + "grad_norm": 1.7974209785461426, + "learning_rate": 4.551241282830467e-05, + "loss": 5.0912, + "step": 32570 + }, + { + "epoch": 0.19370896374536112, + "grad_norm": 1.4037576913833618, + "learning_rate": 4.551214580713116e-05, + "loss": 5.0925, + "step": 32571 + }, + { + "epoch": 0.1937149110286421, + "grad_norm": 1.3789407014846802, + "learning_rate": 4.551187877879707e-05, + "loss": 4.9709, + "step": 32572 + }, + { + "epoch": 0.19372085831192312, + "grad_norm": 1.8228954076766968, + "learning_rate": 4.5511611743302504e-05, + "loss": 5.0125, + "step": 32573 + }, + { + "epoch": 0.1937268055952041, + "grad_norm": 1.6477984189987183, + "learning_rate": 4.5511344700647534e-05, + "loss": 4.6782, + "step": 32574 + }, + { + "epoch": 0.1937327528784851, + "grad_norm": 1.5533115863800049, + "learning_rate": 4.5511077650832275e-05, + "loss": 5.1811, + "step": 32575 + }, + { + "epoch": 0.19373870016176611, + "grad_norm": 1.4982826709747314, + "learning_rate": 4.5510810593856804e-05, + "loss": 5.4532, + "step": 32576 + }, + { + "epoch": 0.1937446474450471, + "grad_norm": 1.2856029272079468, + "learning_rate": 4.551054352972122e-05, + "loss": 5.3471, + "step": 32577 + }, + { + "epoch": 0.1937505947283281, + "grad_norm": 1.7265102863311768, + "learning_rate": 4.551027645842562e-05, + "loss": 4.7828, + "step": 32578 + }, + { + "epoch": 0.1937565420116091, + "grad_norm": 1.6499298810958862, + "learning_rate": 4.55100093799701e-05, + "loss": 5.0823, + "step": 32579 + }, + { + "epoch": 0.1937624892948901, + "grad_norm": 1.6047910451889038, + "learning_rate": 4.550974229435474e-05, + "loss": 4.8312, + "step": 32580 + }, + { + "epoch": 0.19376843657817108, + "grad_norm": 1.4857003688812256, + "learning_rate": 4.5509475201579645e-05, + "loss": 5.0103, + "step": 32581 + }, + { + "epoch": 0.1937743838614521, + "grad_norm": 1.4241074323654175, + "learning_rate": 4.55092081016449e-05, + "loss": 4.994, + "step": 32582 + }, + { + "epoch": 0.1937803311447331, + "grad_norm": 1.4176710844039917, + "learning_rate": 4.55089409945506e-05, + "loss": 5.0398, + "step": 32583 + }, + { + "epoch": 0.19378627842801407, + "grad_norm": 1.2380996942520142, + "learning_rate": 4.5508673880296846e-05, + "loss": 5.2739, + "step": 32584 + }, + { + "epoch": 0.1937922257112951, + "grad_norm": 1.4685777425765991, + "learning_rate": 4.550840675888373e-05, + "loss": 5.5471, + "step": 32585 + }, + { + "epoch": 0.19379817299457608, + "grad_norm": 2.9389889240264893, + "learning_rate": 4.5508139630311333e-05, + "loss": 4.1122, + "step": 32586 + }, + { + "epoch": 0.19380412027785707, + "grad_norm": 2.7261459827423096, + "learning_rate": 4.550787249457976e-05, + "loss": 4.0543, + "step": 32587 + }, + { + "epoch": 0.19381006756113808, + "grad_norm": 2.227731704711914, + "learning_rate": 4.55076053516891e-05, + "loss": 3.5322, + "step": 32588 + }, + { + "epoch": 0.19381601484441907, + "grad_norm": 2.80881929397583, + "learning_rate": 4.550733820163945e-05, + "loss": 3.7871, + "step": 32589 + }, + { + "epoch": 0.19382196212770006, + "grad_norm": 1.8524302244186401, + "learning_rate": 4.5507071044430894e-05, + "loss": 5.2918, + "step": 32590 + }, + { + "epoch": 0.19382790941098108, + "grad_norm": 1.3771488666534424, + "learning_rate": 4.550680388006353e-05, + "loss": 5.3438, + "step": 32591 + }, + { + "epoch": 0.19383385669426206, + "grad_norm": 2.4697203636169434, + "learning_rate": 4.5506536708537464e-05, + "loss": 4.6906, + "step": 32592 + }, + { + "epoch": 0.19383980397754305, + "grad_norm": 1.4540528059005737, + "learning_rate": 4.550626952985276e-05, + "loss": 5.2692, + "step": 32593 + }, + { + "epoch": 0.19384575126082407, + "grad_norm": 1.4477177858352661, + "learning_rate": 4.550600234400954e-05, + "loss": 5.2064, + "step": 32594 + }, + { + "epoch": 0.19385169854410506, + "grad_norm": 1.533417820930481, + "learning_rate": 4.550573515100789e-05, + "loss": 5.235, + "step": 32595 + }, + { + "epoch": 0.19385764582738604, + "grad_norm": 2.2805964946746826, + "learning_rate": 4.5505467950847904e-05, + "loss": 3.8858, + "step": 32596 + }, + { + "epoch": 0.19386359311066706, + "grad_norm": 2.02608323097229, + "learning_rate": 4.550520074352966e-05, + "loss": 4.1653, + "step": 32597 + }, + { + "epoch": 0.19386954039394805, + "grad_norm": 1.5796022415161133, + "learning_rate": 4.5504933529053264e-05, + "loss": 5.434, + "step": 32598 + }, + { + "epoch": 0.19387548767722904, + "grad_norm": 1.2967963218688965, + "learning_rate": 4.550466630741881e-05, + "loss": 5.5094, + "step": 32599 + }, + { + "epoch": 0.19388143496051005, + "grad_norm": 1.4472898244857788, + "learning_rate": 4.5504399078626384e-05, + "loss": 5.5817, + "step": 32600 + }, + { + "epoch": 0.19388738224379104, + "grad_norm": 1.9111692905426025, + "learning_rate": 4.550413184267609e-05, + "loss": 5.2645, + "step": 32601 + }, + { + "epoch": 0.19389332952707203, + "grad_norm": 2.3285624980926514, + "learning_rate": 4.5503864599568014e-05, + "loss": 4.6221, + "step": 32602 + }, + { + "epoch": 0.19389927681035304, + "grad_norm": 1.628674864768982, + "learning_rate": 4.550359734930225e-05, + "loss": 4.8921, + "step": 32603 + }, + { + "epoch": 0.19390522409363403, + "grad_norm": 1.6365528106689453, + "learning_rate": 4.550333009187889e-05, + "loss": 4.9367, + "step": 32604 + }, + { + "epoch": 0.19391117137691502, + "grad_norm": 1.8951784372329712, + "learning_rate": 4.5503062827298026e-05, + "loss": 4.7174, + "step": 32605 + }, + { + "epoch": 0.19391711866019604, + "grad_norm": 1.7012661695480347, + "learning_rate": 4.550279555555976e-05, + "loss": 5.0791, + "step": 32606 + }, + { + "epoch": 0.19392306594347702, + "grad_norm": 1.5482909679412842, + "learning_rate": 4.550252827666418e-05, + "loss": 5.3443, + "step": 32607 + }, + { + "epoch": 0.193929013226758, + "grad_norm": 1.7100435495376587, + "learning_rate": 4.5502260990611385e-05, + "loss": 5.3665, + "step": 32608 + }, + { + "epoch": 0.193934960510039, + "grad_norm": 1.7301504611968994, + "learning_rate": 4.550199369740146e-05, + "loss": 5.2521, + "step": 32609 + }, + { + "epoch": 0.19394090779332002, + "grad_norm": 1.4652512073516846, + "learning_rate": 4.550172639703449e-05, + "loss": 5.3211, + "step": 32610 + }, + { + "epoch": 0.193946855076601, + "grad_norm": 1.6663676500320435, + "learning_rate": 4.550145908951059e-05, + "loss": 4.8544, + "step": 32611 + }, + { + "epoch": 0.193952802359882, + "grad_norm": 2.0611562728881836, + "learning_rate": 4.550119177482983e-05, + "loss": 4.3234, + "step": 32612 + }, + { + "epoch": 0.193958749643163, + "grad_norm": 1.612246036529541, + "learning_rate": 4.5500924452992335e-05, + "loss": 5.1136, + "step": 32613 + }, + { + "epoch": 0.193964696926444, + "grad_norm": 1.622652530670166, + "learning_rate": 4.550065712399816e-05, + "loss": 4.9411, + "step": 32614 + }, + { + "epoch": 0.19397064420972498, + "grad_norm": 1.7339041233062744, + "learning_rate": 4.5500389787847434e-05, + "loss": 4.7983, + "step": 32615 + }, + { + "epoch": 0.193976591493006, + "grad_norm": 1.4528504610061646, + "learning_rate": 4.550012244454022e-05, + "loss": 5.2277, + "step": 32616 + }, + { + "epoch": 0.193982538776287, + "grad_norm": 1.3635590076446533, + "learning_rate": 4.5499855094076634e-05, + "loss": 5.1081, + "step": 32617 + }, + { + "epoch": 0.19398848605956798, + "grad_norm": 1.7279419898986816, + "learning_rate": 4.549958773645676e-05, + "loss": 5.1649, + "step": 32618 + }, + { + "epoch": 0.193994433342849, + "grad_norm": 1.3046659231185913, + "learning_rate": 4.549932037168069e-05, + "loss": 5.1288, + "step": 32619 + }, + { + "epoch": 0.19400038062612998, + "grad_norm": 1.3744393587112427, + "learning_rate": 4.549905299974852e-05, + "loss": 5.1108, + "step": 32620 + }, + { + "epoch": 0.19400632790941097, + "grad_norm": 1.4627171754837036, + "learning_rate": 4.5498785620660334e-05, + "loss": 5.1959, + "step": 32621 + }, + { + "epoch": 0.19401227519269199, + "grad_norm": 1.3577818870544434, + "learning_rate": 4.549851823441624e-05, + "loss": 5.3846, + "step": 32622 + }, + { + "epoch": 0.19401822247597297, + "grad_norm": 1.5022718906402588, + "learning_rate": 4.5498250841016324e-05, + "loss": 5.1238, + "step": 32623 + }, + { + "epoch": 0.19402416975925396, + "grad_norm": 1.5609856843948364, + "learning_rate": 4.549798344046068e-05, + "loss": 5.0529, + "step": 32624 + }, + { + "epoch": 0.19403011704253498, + "grad_norm": 1.523977279663086, + "learning_rate": 4.5497716032749404e-05, + "loss": 5.1331, + "step": 32625 + }, + { + "epoch": 0.19403606432581597, + "grad_norm": 1.4886420965194702, + "learning_rate": 4.549744861788259e-05, + "loss": 5.1282, + "step": 32626 + }, + { + "epoch": 0.19404201160909695, + "grad_norm": 2.039761781692505, + "learning_rate": 4.549718119586032e-05, + "loss": 3.9558, + "step": 32627 + }, + { + "epoch": 0.19404795889237797, + "grad_norm": 3.320199728012085, + "learning_rate": 4.54969137666827e-05, + "loss": 2.5416, + "step": 32628 + }, + { + "epoch": 0.19405390617565896, + "grad_norm": 2.6014201641082764, + "learning_rate": 4.549664633034982e-05, + "loss": 3.4481, + "step": 32629 + }, + { + "epoch": 0.19405985345893995, + "grad_norm": 2.4465911388397217, + "learning_rate": 4.549637888686177e-05, + "loss": 2.8785, + "step": 32630 + }, + { + "epoch": 0.19406580074222096, + "grad_norm": 2.668752670288086, + "learning_rate": 4.549611143621865e-05, + "loss": 3.2297, + "step": 32631 + }, + { + "epoch": 0.19407174802550195, + "grad_norm": 2.3197052478790283, + "learning_rate": 4.549584397842055e-05, + "loss": 3.6405, + "step": 32632 + }, + { + "epoch": 0.19407769530878294, + "grad_norm": 1.5895888805389404, + "learning_rate": 4.549557651346756e-05, + "loss": 4.9316, + "step": 32633 + }, + { + "epoch": 0.19408364259206395, + "grad_norm": 1.7346808910369873, + "learning_rate": 4.5495309041359774e-05, + "loss": 4.9319, + "step": 32634 + }, + { + "epoch": 0.19408958987534494, + "grad_norm": 1.7234487533569336, + "learning_rate": 4.549504156209729e-05, + "loss": 5.0863, + "step": 32635 + }, + { + "epoch": 0.19409553715862593, + "grad_norm": 2.0025033950805664, + "learning_rate": 4.5494774075680204e-05, + "loss": 5.3009, + "step": 32636 + }, + { + "epoch": 0.19410148444190695, + "grad_norm": 1.7096216678619385, + "learning_rate": 4.549450658210859e-05, + "loss": 5.3195, + "step": 32637 + }, + { + "epoch": 0.19410743172518793, + "grad_norm": 2.219808578491211, + "learning_rate": 4.549423908138257e-05, + "loss": 4.5839, + "step": 32638 + }, + { + "epoch": 0.19411337900846892, + "grad_norm": 1.8379898071289062, + "learning_rate": 4.549397157350221e-05, + "loss": 4.5265, + "step": 32639 + }, + { + "epoch": 0.19411932629174994, + "grad_norm": 2.1760711669921875, + "learning_rate": 4.549370405846762e-05, + "loss": 4.161, + "step": 32640 + }, + { + "epoch": 0.19412527357503093, + "grad_norm": 1.9466861486434937, + "learning_rate": 4.54934365362789e-05, + "loss": 4.6235, + "step": 32641 + }, + { + "epoch": 0.19413122085831191, + "grad_norm": 1.8308939933776855, + "learning_rate": 4.549316900693612e-05, + "loss": 4.6028, + "step": 32642 + }, + { + "epoch": 0.19413716814159293, + "grad_norm": 1.7800029516220093, + "learning_rate": 4.54929014704394e-05, + "loss": 4.721, + "step": 32643 + }, + { + "epoch": 0.19414311542487392, + "grad_norm": 2.0616424083709717, + "learning_rate": 4.5492633926788806e-05, + "loss": 4.4319, + "step": 32644 + }, + { + "epoch": 0.1941490627081549, + "grad_norm": 1.8670353889465332, + "learning_rate": 4.549236637598445e-05, + "loss": 4.2249, + "step": 32645 + }, + { + "epoch": 0.19415500999143592, + "grad_norm": 1.6615244150161743, + "learning_rate": 4.5492098818026425e-05, + "loss": 4.1405, + "step": 32646 + }, + { + "epoch": 0.1941609572747169, + "grad_norm": 1.727453589439392, + "learning_rate": 4.549183125291481e-05, + "loss": 4.2608, + "step": 32647 + }, + { + "epoch": 0.1941669045579979, + "grad_norm": 1.7687768936157227, + "learning_rate": 4.549156368064972e-05, + "loss": 4.2649, + "step": 32648 + }, + { + "epoch": 0.19417285184127892, + "grad_norm": 1.9458198547363281, + "learning_rate": 4.549129610123123e-05, + "loss": 4.6472, + "step": 32649 + }, + { + "epoch": 0.1941787991245599, + "grad_norm": 1.8845311403274536, + "learning_rate": 4.549102851465944e-05, + "loss": 4.5324, + "step": 32650 + }, + { + "epoch": 0.1941847464078409, + "grad_norm": 1.973351240158081, + "learning_rate": 4.549076092093445e-05, + "loss": 4.0662, + "step": 32651 + }, + { + "epoch": 0.1941906936911219, + "grad_norm": 1.5977756977081299, + "learning_rate": 4.549049332005634e-05, + "loss": 4.4065, + "step": 32652 + }, + { + "epoch": 0.1941966409744029, + "grad_norm": 1.5310672521591187, + "learning_rate": 4.5490225712025215e-05, + "loss": 4.3943, + "step": 32653 + }, + { + "epoch": 0.19420258825768388, + "grad_norm": 1.4160810708999634, + "learning_rate": 4.548995809684116e-05, + "loss": 4.4643, + "step": 32654 + }, + { + "epoch": 0.1942085355409649, + "grad_norm": 1.9193739891052246, + "learning_rate": 4.548969047450428e-05, + "loss": 4.5345, + "step": 32655 + }, + { + "epoch": 0.1942144828242459, + "grad_norm": 1.8352816104888916, + "learning_rate": 4.548942284501465e-05, + "loss": 4.6301, + "step": 32656 + }, + { + "epoch": 0.19422043010752688, + "grad_norm": 1.87077796459198, + "learning_rate": 4.5489155208372384e-05, + "loss": 4.4388, + "step": 32657 + }, + { + "epoch": 0.1942263773908079, + "grad_norm": 1.719300389289856, + "learning_rate": 4.5488887564577555e-05, + "loss": 4.5482, + "step": 32658 + }, + { + "epoch": 0.19423232467408888, + "grad_norm": 1.7464433908462524, + "learning_rate": 4.548861991363028e-05, + "loss": 4.5615, + "step": 32659 + }, + { + "epoch": 0.19423827195736987, + "grad_norm": 2.0196592807769775, + "learning_rate": 4.548835225553063e-05, + "loss": 4.7149, + "step": 32660 + }, + { + "epoch": 0.19424421924065088, + "grad_norm": 1.725612998008728, + "learning_rate": 4.548808459027871e-05, + "loss": 4.4756, + "step": 32661 + }, + { + "epoch": 0.19425016652393187, + "grad_norm": 2.12505841255188, + "learning_rate": 4.548781691787461e-05, + "loss": 4.4415, + "step": 32662 + }, + { + "epoch": 0.19425611380721286, + "grad_norm": 1.8461819887161255, + "learning_rate": 4.548754923831843e-05, + "loss": 4.7161, + "step": 32663 + }, + { + "epoch": 0.19426206109049388, + "grad_norm": 1.680954933166504, + "learning_rate": 4.548728155161025e-05, + "loss": 4.3207, + "step": 32664 + }, + { + "epoch": 0.19426800837377486, + "grad_norm": 1.8344814777374268, + "learning_rate": 4.548701385775018e-05, + "loss": 4.0626, + "step": 32665 + }, + { + "epoch": 0.19427395565705585, + "grad_norm": 1.6902893781661987, + "learning_rate": 4.54867461567383e-05, + "loss": 4.7755, + "step": 32666 + }, + { + "epoch": 0.19427990294033684, + "grad_norm": 1.5608021020889282, + "learning_rate": 4.548647844857471e-05, + "loss": 4.8721, + "step": 32667 + }, + { + "epoch": 0.19428585022361786, + "grad_norm": 1.6685339212417603, + "learning_rate": 4.54862107332595e-05, + "loss": 4.6452, + "step": 32668 + }, + { + "epoch": 0.19429179750689884, + "grad_norm": 1.8371236324310303, + "learning_rate": 4.548594301079277e-05, + "loss": 4.4041, + "step": 32669 + }, + { + "epoch": 0.19429774479017983, + "grad_norm": 1.7567802667617798, + "learning_rate": 4.548567528117461e-05, + "loss": 4.4144, + "step": 32670 + }, + { + "epoch": 0.19430369207346085, + "grad_norm": 1.7036006450653076, + "learning_rate": 4.54854075444051e-05, + "loss": 4.5322, + "step": 32671 + }, + { + "epoch": 0.19430963935674184, + "grad_norm": 2.345913887023926, + "learning_rate": 4.5485139800484356e-05, + "loss": 4.0522, + "step": 32672 + }, + { + "epoch": 0.19431558664002282, + "grad_norm": 2.7532145977020264, + "learning_rate": 4.5484872049412465e-05, + "loss": 3.836, + "step": 32673 + }, + { + "epoch": 0.19432153392330384, + "grad_norm": 2.086850166320801, + "learning_rate": 4.548460429118951e-05, + "loss": 4.1181, + "step": 32674 + }, + { + "epoch": 0.19432748120658483, + "grad_norm": 2.241453170776367, + "learning_rate": 4.5484336525815596e-05, + "loss": 3.7434, + "step": 32675 + }, + { + "epoch": 0.19433342848986582, + "grad_norm": 2.2940404415130615, + "learning_rate": 4.548406875329081e-05, + "loss": 3.6265, + "step": 32676 + }, + { + "epoch": 0.19433937577314683, + "grad_norm": 2.1785483360290527, + "learning_rate": 4.5483800973615245e-05, + "loss": 3.4999, + "step": 32677 + }, + { + "epoch": 0.19434532305642782, + "grad_norm": 1.7296667098999023, + "learning_rate": 4.5483533186788996e-05, + "loss": 4.0844, + "step": 32678 + }, + { + "epoch": 0.1943512703397088, + "grad_norm": 1.4519742727279663, + "learning_rate": 4.5483265392812156e-05, + "loss": 4.9145, + "step": 32679 + }, + { + "epoch": 0.19435721762298983, + "grad_norm": 1.702333927154541, + "learning_rate": 4.5482997591684826e-05, + "loss": 4.7128, + "step": 32680 + }, + { + "epoch": 0.1943631649062708, + "grad_norm": 1.63100266456604, + "learning_rate": 4.548272978340709e-05, + "loss": 4.5922, + "step": 32681 + }, + { + "epoch": 0.1943691121895518, + "grad_norm": 1.5324856042861938, + "learning_rate": 4.548246196797904e-05, + "loss": 5.0184, + "step": 32682 + }, + { + "epoch": 0.19437505947283282, + "grad_norm": 1.7499281167984009, + "learning_rate": 4.548219414540078e-05, + "loss": 4.9284, + "step": 32683 + }, + { + "epoch": 0.1943810067561138, + "grad_norm": 1.6011301279067993, + "learning_rate": 4.5481926315672395e-05, + "loss": 4.8825, + "step": 32684 + }, + { + "epoch": 0.1943869540393948, + "grad_norm": 1.8732880353927612, + "learning_rate": 4.5481658478793986e-05, + "loss": 4.7278, + "step": 32685 + }, + { + "epoch": 0.1943929013226758, + "grad_norm": 1.6948355436325073, + "learning_rate": 4.548139063476564e-05, + "loss": 4.647, + "step": 32686 + }, + { + "epoch": 0.1943988486059568, + "grad_norm": 1.7804508209228516, + "learning_rate": 4.548112278358745e-05, + "loss": 4.6336, + "step": 32687 + }, + { + "epoch": 0.19440479588923779, + "grad_norm": 2.1667730808258057, + "learning_rate": 4.548085492525951e-05, + "loss": 4.7747, + "step": 32688 + }, + { + "epoch": 0.1944107431725188, + "grad_norm": 1.5616405010223389, + "learning_rate": 4.5480587059781916e-05, + "loss": 4.8905, + "step": 32689 + }, + { + "epoch": 0.1944166904557998, + "grad_norm": 1.6781290769577026, + "learning_rate": 4.548031918715476e-05, + "loss": 4.5202, + "step": 32690 + }, + { + "epoch": 0.19442263773908078, + "grad_norm": 1.858265995979309, + "learning_rate": 4.548005130737814e-05, + "loss": 4.8571, + "step": 32691 + }, + { + "epoch": 0.1944285850223618, + "grad_norm": 1.4587072134017944, + "learning_rate": 4.5479783420452144e-05, + "loss": 4.8802, + "step": 32692 + }, + { + "epoch": 0.19443453230564278, + "grad_norm": 1.6977627277374268, + "learning_rate": 4.5479515526376866e-05, + "loss": 4.4907, + "step": 32693 + }, + { + "epoch": 0.19444047958892377, + "grad_norm": 1.749733805656433, + "learning_rate": 4.54792476251524e-05, + "loss": 4.4612, + "step": 32694 + }, + { + "epoch": 0.1944464268722048, + "grad_norm": 1.9689366817474365, + "learning_rate": 4.547897971677885e-05, + "loss": 4.222, + "step": 32695 + }, + { + "epoch": 0.19445237415548577, + "grad_norm": 2.6120550632476807, + "learning_rate": 4.547871180125628e-05, + "loss": 3.7294, + "step": 32696 + }, + { + "epoch": 0.19445832143876676, + "grad_norm": 2.5318052768707275, + "learning_rate": 4.547844387858482e-05, + "loss": 3.9223, + "step": 32697 + }, + { + "epoch": 0.19446426872204778, + "grad_norm": 2.4473683834075928, + "learning_rate": 4.547817594876454e-05, + "loss": 3.801, + "step": 32698 + }, + { + "epoch": 0.19447021600532877, + "grad_norm": 1.6112592220306396, + "learning_rate": 4.5477908011795546e-05, + "loss": 4.8024, + "step": 32699 + }, + { + "epoch": 0.19447616328860975, + "grad_norm": 1.7253385782241821, + "learning_rate": 4.5477640067677915e-05, + "loss": 4.6842, + "step": 32700 + }, + { + "epoch": 0.19448211057189077, + "grad_norm": 2.232095718383789, + "learning_rate": 4.547737211641176e-05, + "loss": 3.5904, + "step": 32701 + }, + { + "epoch": 0.19448805785517176, + "grad_norm": 2.517429828643799, + "learning_rate": 4.547710415799716e-05, + "loss": 3.5036, + "step": 32702 + }, + { + "epoch": 0.19449400513845275, + "grad_norm": 2.414701461791992, + "learning_rate": 4.547683619243423e-05, + "loss": 3.6109, + "step": 32703 + }, + { + "epoch": 0.19449995242173376, + "grad_norm": 2.3504600524902344, + "learning_rate": 4.5476568219723027e-05, + "loss": 3.9853, + "step": 32704 + }, + { + "epoch": 0.19450589970501475, + "grad_norm": 2.6596758365631104, + "learning_rate": 4.547630023986368e-05, + "loss": 3.7625, + "step": 32705 + }, + { + "epoch": 0.19451184698829574, + "grad_norm": 2.6922054290771484, + "learning_rate": 4.547603225285626e-05, + "loss": 4.1548, + "step": 32706 + }, + { + "epoch": 0.19451779427157675, + "grad_norm": 2.4801747798919678, + "learning_rate": 4.547576425870087e-05, + "loss": 4.0531, + "step": 32707 + }, + { + "epoch": 0.19452374155485774, + "grad_norm": 2.2931368350982666, + "learning_rate": 4.547549625739761e-05, + "loss": 3.9223, + "step": 32708 + }, + { + "epoch": 0.19452968883813873, + "grad_norm": 2.383759021759033, + "learning_rate": 4.547522824894655e-05, + "loss": 3.6651, + "step": 32709 + }, + { + "epoch": 0.19453563612141975, + "grad_norm": 2.0957138538360596, + "learning_rate": 4.547496023334782e-05, + "loss": 3.3919, + "step": 32710 + }, + { + "epoch": 0.19454158340470074, + "grad_norm": 2.0263047218322754, + "learning_rate": 4.547469221060148e-05, + "loss": 3.8709, + "step": 32711 + }, + { + "epoch": 0.19454753068798172, + "grad_norm": 1.7334496974945068, + "learning_rate": 4.5474424180707634e-05, + "loss": 3.9556, + "step": 32712 + }, + { + "epoch": 0.19455347797126274, + "grad_norm": 1.9237737655639648, + "learning_rate": 4.547415614366639e-05, + "loss": 4.3447, + "step": 32713 + }, + { + "epoch": 0.19455942525454373, + "grad_norm": 1.7965775728225708, + "learning_rate": 4.547388809947782e-05, + "loss": 4.3011, + "step": 32714 + }, + { + "epoch": 0.19456537253782472, + "grad_norm": 2.085796356201172, + "learning_rate": 4.547362004814203e-05, + "loss": 4.1599, + "step": 32715 + }, + { + "epoch": 0.19457131982110573, + "grad_norm": 2.460947275161743, + "learning_rate": 4.547335198965911e-05, + "loss": 3.1574, + "step": 32716 + }, + { + "epoch": 0.19457726710438672, + "grad_norm": 2.9936110973358154, + "learning_rate": 4.547308392402915e-05, + "loss": 3.1313, + "step": 32717 + }, + { + "epoch": 0.1945832143876677, + "grad_norm": 1.802701473236084, + "learning_rate": 4.547281585125225e-05, + "loss": 4.513, + "step": 32718 + }, + { + "epoch": 0.19458916167094872, + "grad_norm": 1.5326614379882812, + "learning_rate": 4.547254777132851e-05, + "loss": 4.6847, + "step": 32719 + }, + { + "epoch": 0.1945951089542297, + "grad_norm": 1.3601535558700562, + "learning_rate": 4.547227968425801e-05, + "loss": 4.8147, + "step": 32720 + }, + { + "epoch": 0.1946010562375107, + "grad_norm": 1.5872572660446167, + "learning_rate": 4.5472011590040845e-05, + "loss": 4.6806, + "step": 32721 + }, + { + "epoch": 0.19460700352079172, + "grad_norm": 1.700873851776123, + "learning_rate": 4.547174348867712e-05, + "loss": 5.0069, + "step": 32722 + }, + { + "epoch": 0.1946129508040727, + "grad_norm": 2.1521737575531006, + "learning_rate": 4.547147538016691e-05, + "loss": 3.8698, + "step": 32723 + }, + { + "epoch": 0.1946188980873537, + "grad_norm": 1.4435259103775024, + "learning_rate": 4.547120726451033e-05, + "loss": 4.5179, + "step": 32724 + }, + { + "epoch": 0.19462484537063468, + "grad_norm": 1.4912521839141846, + "learning_rate": 4.547093914170746e-05, + "loss": 4.9731, + "step": 32725 + }, + { + "epoch": 0.1946307926539157, + "grad_norm": 1.36370050907135, + "learning_rate": 4.5470671011758395e-05, + "loss": 4.6799, + "step": 32726 + }, + { + "epoch": 0.19463673993719668, + "grad_norm": 1.2558645009994507, + "learning_rate": 4.5470402874663226e-05, + "loss": 4.6845, + "step": 32727 + }, + { + "epoch": 0.19464268722047767, + "grad_norm": 1.3222334384918213, + "learning_rate": 4.5470134730422053e-05, + "loss": 4.7477, + "step": 32728 + }, + { + "epoch": 0.1946486345037587, + "grad_norm": 1.6657606363296509, + "learning_rate": 4.546986657903497e-05, + "loss": 4.5929, + "step": 32729 + }, + { + "epoch": 0.19465458178703968, + "grad_norm": 1.6633927822113037, + "learning_rate": 4.546959842050207e-05, + "loss": 4.9297, + "step": 32730 + }, + { + "epoch": 0.19466052907032066, + "grad_norm": 1.549243450164795, + "learning_rate": 4.546933025482344e-05, + "loss": 5.1384, + "step": 32731 + }, + { + "epoch": 0.19466647635360168, + "grad_norm": 1.4809843301773071, + "learning_rate": 4.5469062081999184e-05, + "loss": 5.1748, + "step": 32732 + }, + { + "epoch": 0.19467242363688267, + "grad_norm": 1.3342254161834717, + "learning_rate": 4.546879390202938e-05, + "loss": 5.1949, + "step": 32733 + }, + { + "epoch": 0.19467837092016366, + "grad_norm": 1.558632254600525, + "learning_rate": 4.5468525714914146e-05, + "loss": 4.7441, + "step": 32734 + }, + { + "epoch": 0.19468431820344467, + "grad_norm": 1.9341686964035034, + "learning_rate": 4.546825752065355e-05, + "loss": 4.5898, + "step": 32735 + }, + { + "epoch": 0.19469026548672566, + "grad_norm": 2.236103057861328, + "learning_rate": 4.54679893192477e-05, + "loss": 3.8901, + "step": 32736 + }, + { + "epoch": 0.19469621277000665, + "grad_norm": 1.7344380617141724, + "learning_rate": 4.5467721110696685e-05, + "loss": 5.2491, + "step": 32737 + }, + { + "epoch": 0.19470216005328767, + "grad_norm": 1.894675850868225, + "learning_rate": 4.5467452895000606e-05, + "loss": 4.9341, + "step": 32738 + }, + { + "epoch": 0.19470810733656865, + "grad_norm": 1.3538182973861694, + "learning_rate": 4.5467184672159546e-05, + "loss": 5.1136, + "step": 32739 + }, + { + "epoch": 0.19471405461984964, + "grad_norm": 1.481584072113037, + "learning_rate": 4.54669164421736e-05, + "loss": 4.7057, + "step": 32740 + }, + { + "epoch": 0.19472000190313066, + "grad_norm": 1.7495735883712769, + "learning_rate": 4.546664820504287e-05, + "loss": 4.5717, + "step": 32741 + }, + { + "epoch": 0.19472594918641165, + "grad_norm": 1.5889508724212646, + "learning_rate": 4.546637996076744e-05, + "loss": 4.6641, + "step": 32742 + }, + { + "epoch": 0.19473189646969263, + "grad_norm": 1.6097511053085327, + "learning_rate": 4.5466111709347415e-05, + "loss": 4.8375, + "step": 32743 + }, + { + "epoch": 0.19473784375297365, + "grad_norm": 1.790899634361267, + "learning_rate": 4.5465843450782876e-05, + "loss": 4.8971, + "step": 32744 + }, + { + "epoch": 0.19474379103625464, + "grad_norm": 1.564828872680664, + "learning_rate": 4.546557518507392e-05, + "loss": 4.8392, + "step": 32745 + }, + { + "epoch": 0.19474973831953563, + "grad_norm": 1.536988615989685, + "learning_rate": 4.546530691222065e-05, + "loss": 5.1304, + "step": 32746 + }, + { + "epoch": 0.19475568560281664, + "grad_norm": 1.7013825178146362, + "learning_rate": 4.546503863222315e-05, + "loss": 4.5505, + "step": 32747 + }, + { + "epoch": 0.19476163288609763, + "grad_norm": 1.6183735132217407, + "learning_rate": 4.546477034508152e-05, + "loss": 4.6205, + "step": 32748 + }, + { + "epoch": 0.19476758016937862, + "grad_norm": 1.8244844675064087, + "learning_rate": 4.5464502050795844e-05, + "loss": 4.5857, + "step": 32749 + }, + { + "epoch": 0.19477352745265963, + "grad_norm": 2.38612961769104, + "learning_rate": 4.546423374936623e-05, + "loss": 4.3756, + "step": 32750 + }, + { + "epoch": 0.19477947473594062, + "grad_norm": 2.186896324157715, + "learning_rate": 4.5463965440792755e-05, + "loss": 4.578, + "step": 32751 + }, + { + "epoch": 0.1947854220192216, + "grad_norm": 2.066823959350586, + "learning_rate": 4.546369712507552e-05, + "loss": 4.198, + "step": 32752 + }, + { + "epoch": 0.19479136930250263, + "grad_norm": 2.2022926807403564, + "learning_rate": 4.5463428802214624e-05, + "loss": 3.6525, + "step": 32753 + }, + { + "epoch": 0.19479731658578361, + "grad_norm": 1.8750653266906738, + "learning_rate": 4.546316047221016e-05, + "loss": 4.2666, + "step": 32754 + }, + { + "epoch": 0.1948032638690646, + "grad_norm": 2.1228365898132324, + "learning_rate": 4.5462892135062215e-05, + "loss": 3.4313, + "step": 32755 + }, + { + "epoch": 0.19480921115234562, + "grad_norm": 2.177910804748535, + "learning_rate": 4.546262379077089e-05, + "loss": 3.4433, + "step": 32756 + }, + { + "epoch": 0.1948151584356266, + "grad_norm": 2.1423957347869873, + "learning_rate": 4.546235543933626e-05, + "loss": 3.4016, + "step": 32757 + }, + { + "epoch": 0.1948211057189076, + "grad_norm": 1.9580178260803223, + "learning_rate": 4.546208708075844e-05, + "loss": 3.36, + "step": 32758 + }, + { + "epoch": 0.1948270530021886, + "grad_norm": 2.11665940284729, + "learning_rate": 4.546181871503752e-05, + "loss": 3.7196, + "step": 32759 + }, + { + "epoch": 0.1948330002854696, + "grad_norm": 2.0595879554748535, + "learning_rate": 4.54615503421736e-05, + "loss": 3.6423, + "step": 32760 + }, + { + "epoch": 0.1948389475687506, + "grad_norm": 2.0790436267852783, + "learning_rate": 4.546128196216675e-05, + "loss": 3.3795, + "step": 32761 + }, + { + "epoch": 0.1948448948520316, + "grad_norm": 2.1012542247772217, + "learning_rate": 4.546101357501708e-05, + "loss": 3.3949, + "step": 32762 + }, + { + "epoch": 0.1948508421353126, + "grad_norm": 2.005147933959961, + "learning_rate": 4.5460745180724684e-05, + "loss": 3.34, + "step": 32763 + }, + { + "epoch": 0.19485678941859358, + "grad_norm": 2.040877342224121, + "learning_rate": 4.546047677928965e-05, + "loss": 3.471, + "step": 32764 + }, + { + "epoch": 0.1948627367018746, + "grad_norm": 1.92231285572052, + "learning_rate": 4.5460208370712085e-05, + "loss": 3.6301, + "step": 32765 + }, + { + "epoch": 0.19486868398515558, + "grad_norm": 2.008256435394287, + "learning_rate": 4.545993995499206e-05, + "loss": 3.5078, + "step": 32766 + }, + { + "epoch": 0.19487463126843657, + "grad_norm": 1.9159399271011353, + "learning_rate": 4.545967153212969e-05, + "loss": 3.2598, + "step": 32767 + }, + { + "epoch": 0.1948805785517176, + "grad_norm": 2.008863687515259, + "learning_rate": 4.545940310212505e-05, + "loss": 3.2273, + "step": 32768 + }, + { + "epoch": 0.19488652583499858, + "grad_norm": 2.072593927383423, + "learning_rate": 4.545913466497825e-05, + "loss": 3.245, + "step": 32769 + }, + { + "epoch": 0.19489247311827956, + "grad_norm": 2.1123046875, + "learning_rate": 4.5458866220689386e-05, + "loss": 3.337, + "step": 32770 + }, + { + "epoch": 0.19489842040156058, + "grad_norm": 2.036161422729492, + "learning_rate": 4.5458597769258535e-05, + "loss": 3.4031, + "step": 32771 + }, + { + "epoch": 0.19490436768484157, + "grad_norm": 1.7956360578536987, + "learning_rate": 4.54583293106858e-05, + "loss": 3.3922, + "step": 32772 + }, + { + "epoch": 0.19491031496812256, + "grad_norm": 1.9955687522888184, + "learning_rate": 4.545806084497127e-05, + "loss": 3.398, + "step": 32773 + }, + { + "epoch": 0.19491626225140357, + "grad_norm": 1.9657707214355469, + "learning_rate": 4.545779237211504e-05, + "loss": 3.3049, + "step": 32774 + }, + { + "epoch": 0.19492220953468456, + "grad_norm": 2.042170286178589, + "learning_rate": 4.545752389211722e-05, + "loss": 3.3357, + "step": 32775 + }, + { + "epoch": 0.19492815681796555, + "grad_norm": 1.8873474597930908, + "learning_rate": 4.545725540497787e-05, + "loss": 3.3893, + "step": 32776 + }, + { + "epoch": 0.19493410410124656, + "grad_norm": 2.1129064559936523, + "learning_rate": 4.545698691069712e-05, + "loss": 4.1575, + "step": 32777 + }, + { + "epoch": 0.19494005138452755, + "grad_norm": 1.821212887763977, + "learning_rate": 4.545671840927504e-05, + "loss": 4.0723, + "step": 32778 + }, + { + "epoch": 0.19494599866780854, + "grad_norm": 1.8211898803710938, + "learning_rate": 4.545644990071174e-05, + "loss": 3.9323, + "step": 32779 + }, + { + "epoch": 0.19495194595108956, + "grad_norm": 2.6873621940612793, + "learning_rate": 4.545618138500729e-05, + "loss": 4.0404, + "step": 32780 + }, + { + "epoch": 0.19495789323437054, + "grad_norm": 1.9011731147766113, + "learning_rate": 4.54559128621618e-05, + "loss": 4.6881, + "step": 32781 + }, + { + "epoch": 0.19496384051765153, + "grad_norm": 2.0312952995300293, + "learning_rate": 4.5455644332175374e-05, + "loss": 3.2722, + "step": 32782 + }, + { + "epoch": 0.19496978780093252, + "grad_norm": 1.7892794609069824, + "learning_rate": 4.5455375795048086e-05, + "loss": 3.9904, + "step": 32783 + }, + { + "epoch": 0.19497573508421354, + "grad_norm": 2.0266246795654297, + "learning_rate": 4.545510725078004e-05, + "loss": 4.0798, + "step": 32784 + }, + { + "epoch": 0.19498168236749452, + "grad_norm": 1.9282901287078857, + "learning_rate": 4.545483869937133e-05, + "loss": 3.7749, + "step": 32785 + }, + { + "epoch": 0.1949876296507755, + "grad_norm": 1.7957758903503418, + "learning_rate": 4.545457014082204e-05, + "loss": 3.5671, + "step": 32786 + }, + { + "epoch": 0.19499357693405653, + "grad_norm": 2.084536075592041, + "learning_rate": 4.545430157513227e-05, + "loss": 3.3633, + "step": 32787 + }, + { + "epoch": 0.19499952421733752, + "grad_norm": 1.8941612243652344, + "learning_rate": 4.545403300230212e-05, + "loss": 4.1718, + "step": 32788 + }, + { + "epoch": 0.1950054715006185, + "grad_norm": 2.0162341594696045, + "learning_rate": 4.545376442233168e-05, + "loss": 4.8271, + "step": 32789 + }, + { + "epoch": 0.19501141878389952, + "grad_norm": 2.3421337604522705, + "learning_rate": 4.545349583522104e-05, + "loss": 4.3666, + "step": 32790 + }, + { + "epoch": 0.1950173660671805, + "grad_norm": 2.9880783557891846, + "learning_rate": 4.54532272409703e-05, + "loss": 3.6112, + "step": 32791 + }, + { + "epoch": 0.1950233133504615, + "grad_norm": 2.165861129760742, + "learning_rate": 4.545295863957955e-05, + "loss": 3.9271, + "step": 32792 + }, + { + "epoch": 0.1950292606337425, + "grad_norm": 1.7366812229156494, + "learning_rate": 4.545269003104887e-05, + "loss": 4.4979, + "step": 32793 + }, + { + "epoch": 0.1950352079170235, + "grad_norm": 1.8814879655838013, + "learning_rate": 4.5452421415378374e-05, + "loss": 4.5148, + "step": 32794 + }, + { + "epoch": 0.1950411552003045, + "grad_norm": 1.8868308067321777, + "learning_rate": 4.545215279256815e-05, + "loss": 4.9256, + "step": 32795 + }, + { + "epoch": 0.1950471024835855, + "grad_norm": 2.2767741680145264, + "learning_rate": 4.54518841626183e-05, + "loss": 4.7479, + "step": 32796 + }, + { + "epoch": 0.1950530497668665, + "grad_norm": 1.6899062395095825, + "learning_rate": 4.54516155255289e-05, + "loss": 4.6015, + "step": 32797 + }, + { + "epoch": 0.19505899705014748, + "grad_norm": 2.2540504932403564, + "learning_rate": 4.545134688130005e-05, + "loss": 3.93, + "step": 32798 + }, + { + "epoch": 0.1950649443334285, + "grad_norm": 1.7807284593582153, + "learning_rate": 4.5451078229931846e-05, + "loss": 4.8511, + "step": 32799 + }, + { + "epoch": 0.19507089161670949, + "grad_norm": 1.7292965650558472, + "learning_rate": 4.5450809571424384e-05, + "loss": 5.0537, + "step": 32800 + }, + { + "epoch": 0.19507683889999047, + "grad_norm": 1.7457902431488037, + "learning_rate": 4.545054090577776e-05, + "loss": 4.8452, + "step": 32801 + }, + { + "epoch": 0.1950827861832715, + "grad_norm": 1.799851417541504, + "learning_rate": 4.5450272232992056e-05, + "loss": 4.5091, + "step": 32802 + }, + { + "epoch": 0.19508873346655248, + "grad_norm": 1.6106951236724854, + "learning_rate": 4.545000355306738e-05, + "loss": 5.3681, + "step": 32803 + }, + { + "epoch": 0.19509468074983347, + "grad_norm": 1.6085110902786255, + "learning_rate": 4.5449734866003815e-05, + "loss": 5.3412, + "step": 32804 + }, + { + "epoch": 0.19510062803311448, + "grad_norm": 1.7096600532531738, + "learning_rate": 4.5449466171801456e-05, + "loss": 5.1475, + "step": 32805 + }, + { + "epoch": 0.19510657531639547, + "grad_norm": 1.7066271305084229, + "learning_rate": 4.5449197470460405e-05, + "loss": 4.9972, + "step": 32806 + }, + { + "epoch": 0.19511252259967646, + "grad_norm": 1.731095552444458, + "learning_rate": 4.544892876198075e-05, + "loss": 5.1551, + "step": 32807 + }, + { + "epoch": 0.19511846988295747, + "grad_norm": 1.9543027877807617, + "learning_rate": 4.544866004636259e-05, + "loss": 4.6056, + "step": 32808 + }, + { + "epoch": 0.19512441716623846, + "grad_norm": 1.8629963397979736, + "learning_rate": 4.5448391323606e-05, + "loss": 4.3998, + "step": 32809 + }, + { + "epoch": 0.19513036444951945, + "grad_norm": 2.1942734718322754, + "learning_rate": 4.54481225937111e-05, + "loss": 3.8038, + "step": 32810 + }, + { + "epoch": 0.19513631173280047, + "grad_norm": 2.3515050411224365, + "learning_rate": 4.544785385667797e-05, + "loss": 4.2252, + "step": 32811 + }, + { + "epoch": 0.19514225901608145, + "grad_norm": 1.9163727760314941, + "learning_rate": 4.54475851125067e-05, + "loss": 4.5602, + "step": 32812 + }, + { + "epoch": 0.19514820629936244, + "grad_norm": 1.7677435874938965, + "learning_rate": 4.544731636119739e-05, + "loss": 4.4722, + "step": 32813 + }, + { + "epoch": 0.19515415358264346, + "grad_norm": 2.2984094619750977, + "learning_rate": 4.544704760275014e-05, + "loss": 4.7956, + "step": 32814 + }, + { + "epoch": 0.19516010086592445, + "grad_norm": 1.6422353982925415, + "learning_rate": 4.544677883716503e-05, + "loss": 4.5415, + "step": 32815 + }, + { + "epoch": 0.19516604814920543, + "grad_norm": 1.931949496269226, + "learning_rate": 4.544651006444216e-05, + "loss": 4.22, + "step": 32816 + }, + { + "epoch": 0.19517199543248645, + "grad_norm": 1.7817195653915405, + "learning_rate": 4.5446241284581634e-05, + "loss": 4.501, + "step": 32817 + }, + { + "epoch": 0.19517794271576744, + "grad_norm": 1.7133018970489502, + "learning_rate": 4.544597249758353e-05, + "loss": 5.3803, + "step": 32818 + }, + { + "epoch": 0.19518388999904843, + "grad_norm": 1.8452601432800293, + "learning_rate": 4.544570370344795e-05, + "loss": 4.819, + "step": 32819 + }, + { + "epoch": 0.19518983728232944, + "grad_norm": 1.8337496519088745, + "learning_rate": 4.544543490217498e-05, + "loss": 4.4296, + "step": 32820 + }, + { + "epoch": 0.19519578456561043, + "grad_norm": 1.8516271114349365, + "learning_rate": 4.544516609376472e-05, + "loss": 4.4751, + "step": 32821 + }, + { + "epoch": 0.19520173184889142, + "grad_norm": 1.8621165752410889, + "learning_rate": 4.5444897278217276e-05, + "loss": 4.5967, + "step": 32822 + }, + { + "epoch": 0.19520767913217243, + "grad_norm": 1.7688180208206177, + "learning_rate": 4.5444628455532725e-05, + "loss": 4.4673, + "step": 32823 + }, + { + "epoch": 0.19521362641545342, + "grad_norm": 2.06526517868042, + "learning_rate": 4.5444359625711156e-05, + "loss": 4.4147, + "step": 32824 + }, + { + "epoch": 0.1952195736987344, + "grad_norm": 1.914939045906067, + "learning_rate": 4.544409078875268e-05, + "loss": 4.4997, + "step": 32825 + }, + { + "epoch": 0.19522552098201543, + "grad_norm": 1.7460870742797852, + "learning_rate": 4.544382194465738e-05, + "loss": 4.4229, + "step": 32826 + }, + { + "epoch": 0.19523146826529642, + "grad_norm": 1.801352620124817, + "learning_rate": 4.5443553093425353e-05, + "loss": 4.4512, + "step": 32827 + }, + { + "epoch": 0.1952374155485774, + "grad_norm": 1.7984275817871094, + "learning_rate": 4.544328423505669e-05, + "loss": 4.4493, + "step": 32828 + }, + { + "epoch": 0.19524336283185842, + "grad_norm": 1.717602014541626, + "learning_rate": 4.54430153695515e-05, + "loss": 4.2443, + "step": 32829 + }, + { + "epoch": 0.1952493101151394, + "grad_norm": 1.5940791368484497, + "learning_rate": 4.5442746496909856e-05, + "loss": 5.0274, + "step": 32830 + }, + { + "epoch": 0.1952552573984204, + "grad_norm": 1.8372310400009155, + "learning_rate": 4.544247761713186e-05, + "loss": 4.3161, + "step": 32831 + }, + { + "epoch": 0.1952612046817014, + "grad_norm": 1.54202401638031, + "learning_rate": 4.544220873021761e-05, + "loss": 4.8721, + "step": 32832 + }, + { + "epoch": 0.1952671519649824, + "grad_norm": 1.751818299293518, + "learning_rate": 4.544193983616719e-05, + "loss": 4.7725, + "step": 32833 + }, + { + "epoch": 0.1952730992482634, + "grad_norm": 1.7168902158737183, + "learning_rate": 4.5441670934980704e-05, + "loss": 4.583, + "step": 32834 + }, + { + "epoch": 0.1952790465315444, + "grad_norm": 1.6986567974090576, + "learning_rate": 4.544140202665824e-05, + "loss": 4.6351, + "step": 32835 + }, + { + "epoch": 0.1952849938148254, + "grad_norm": 1.9313454627990723, + "learning_rate": 4.54411331111999e-05, + "loss": 4.5758, + "step": 32836 + }, + { + "epoch": 0.19529094109810638, + "grad_norm": 1.725821614265442, + "learning_rate": 4.544086418860576e-05, + "loss": 4.569, + "step": 32837 + }, + { + "epoch": 0.1952968883813874, + "grad_norm": 1.6717588901519775, + "learning_rate": 4.5440595258875935e-05, + "loss": 4.3477, + "step": 32838 + }, + { + "epoch": 0.19530283566466838, + "grad_norm": 1.7990453243255615, + "learning_rate": 4.544032632201051e-05, + "loss": 4.063, + "step": 32839 + }, + { + "epoch": 0.19530878294794937, + "grad_norm": 1.9088106155395508, + "learning_rate": 4.544005737800957e-05, + "loss": 3.9041, + "step": 32840 + }, + { + "epoch": 0.19531473023123036, + "grad_norm": 1.69024658203125, + "learning_rate": 4.543978842687322e-05, + "loss": 4.6119, + "step": 32841 + }, + { + "epoch": 0.19532067751451138, + "grad_norm": 1.9073139429092407, + "learning_rate": 4.5439519468601553e-05, + "loss": 5.0721, + "step": 32842 + }, + { + "epoch": 0.19532662479779236, + "grad_norm": 1.8873580694198608, + "learning_rate": 4.543925050319466e-05, + "loss": 4.6936, + "step": 32843 + }, + { + "epoch": 0.19533257208107335, + "grad_norm": 1.8995583057403564, + "learning_rate": 4.543898153065264e-05, + "loss": 4.6549, + "step": 32844 + }, + { + "epoch": 0.19533851936435437, + "grad_norm": 1.682868480682373, + "learning_rate": 4.543871255097558e-05, + "loss": 4.7243, + "step": 32845 + }, + { + "epoch": 0.19534446664763536, + "grad_norm": 1.9877535104751587, + "learning_rate": 4.543844356416358e-05, + "loss": 4.8988, + "step": 32846 + }, + { + "epoch": 0.19535041393091634, + "grad_norm": 1.8145191669464111, + "learning_rate": 4.5438174570216716e-05, + "loss": 4.4799, + "step": 32847 + }, + { + "epoch": 0.19535636121419736, + "grad_norm": 2.011967182159424, + "learning_rate": 4.5437905569135106e-05, + "loss": 4.7395, + "step": 32848 + }, + { + "epoch": 0.19536230849747835, + "grad_norm": 2.4764623641967773, + "learning_rate": 4.5437636560918837e-05, + "loss": 4.6929, + "step": 32849 + }, + { + "epoch": 0.19536825578075934, + "grad_norm": 2.0704572200775146, + "learning_rate": 4.5437367545568e-05, + "loss": 4.748, + "step": 32850 + }, + { + "epoch": 0.19537420306404035, + "grad_norm": 2.423081159591675, + "learning_rate": 4.543709852308269e-05, + "loss": 4.4915, + "step": 32851 + }, + { + "epoch": 0.19538015034732134, + "grad_norm": 1.9316084384918213, + "learning_rate": 4.5436829493462995e-05, + "loss": 4.647, + "step": 32852 + }, + { + "epoch": 0.19538609763060233, + "grad_norm": 1.9612782001495361, + "learning_rate": 4.543656045670902e-05, + "loss": 4.5981, + "step": 32853 + }, + { + "epoch": 0.19539204491388334, + "grad_norm": 1.8731095790863037, + "learning_rate": 4.543629141282085e-05, + "loss": 4.7771, + "step": 32854 + }, + { + "epoch": 0.19539799219716433, + "grad_norm": 2.0212156772613525, + "learning_rate": 4.543602236179858e-05, + "loss": 4.825, + "step": 32855 + }, + { + "epoch": 0.19540393948044532, + "grad_norm": 1.7706294059753418, + "learning_rate": 4.5435753303642305e-05, + "loss": 4.5824, + "step": 32856 + }, + { + "epoch": 0.19540988676372634, + "grad_norm": 1.7553569078445435, + "learning_rate": 4.543548423835212e-05, + "loss": 4.5136, + "step": 32857 + }, + { + "epoch": 0.19541583404700733, + "grad_norm": 1.8563607931137085, + "learning_rate": 4.5435215165928126e-05, + "loss": 4.6181, + "step": 32858 + }, + { + "epoch": 0.1954217813302883, + "grad_norm": 1.7440742254257202, + "learning_rate": 4.54349460863704e-05, + "loss": 4.7325, + "step": 32859 + }, + { + "epoch": 0.19542772861356933, + "grad_norm": 1.9677025079727173, + "learning_rate": 4.543467699967905e-05, + "loss": 4.4658, + "step": 32860 + }, + { + "epoch": 0.19543367589685032, + "grad_norm": 1.8595266342163086, + "learning_rate": 4.543440790585417e-05, + "loss": 4.89, + "step": 32861 + }, + { + "epoch": 0.1954396231801313, + "grad_norm": 1.582334280014038, + "learning_rate": 4.543413880489584e-05, + "loss": 5.5308, + "step": 32862 + }, + { + "epoch": 0.19544557046341232, + "grad_norm": 1.445690631866455, + "learning_rate": 4.543386969680417e-05, + "loss": 5.403, + "step": 32863 + }, + { + "epoch": 0.1954515177466933, + "grad_norm": 2.2700390815734863, + "learning_rate": 4.543360058157924e-05, + "loss": 4.45, + "step": 32864 + }, + { + "epoch": 0.1954574650299743, + "grad_norm": 1.6211910247802734, + "learning_rate": 4.543333145922116e-05, + "loss": 5.1182, + "step": 32865 + }, + { + "epoch": 0.1954634123132553, + "grad_norm": 1.6810754537582397, + "learning_rate": 4.543306232973001e-05, + "loss": 4.3004, + "step": 32866 + }, + { + "epoch": 0.1954693595965363, + "grad_norm": 1.7132052183151245, + "learning_rate": 4.5432793193105884e-05, + "loss": 4.1847, + "step": 32867 + }, + { + "epoch": 0.1954753068798173, + "grad_norm": 1.7639859914779663, + "learning_rate": 4.543252404934889e-05, + "loss": 3.8669, + "step": 32868 + }, + { + "epoch": 0.1954812541630983, + "grad_norm": 1.8150482177734375, + "learning_rate": 4.543225489845911e-05, + "loss": 4.6428, + "step": 32869 + }, + { + "epoch": 0.1954872014463793, + "grad_norm": 1.6536823511123657, + "learning_rate": 4.5431985740436636e-05, + "loss": 5.0678, + "step": 32870 + }, + { + "epoch": 0.19549314872966028, + "grad_norm": 1.6762903928756714, + "learning_rate": 4.5431716575281574e-05, + "loss": 4.7561, + "step": 32871 + }, + { + "epoch": 0.1954990960129413, + "grad_norm": 1.7050037384033203, + "learning_rate": 4.543144740299401e-05, + "loss": 4.5412, + "step": 32872 + }, + { + "epoch": 0.1955050432962223, + "grad_norm": 1.747406244277954, + "learning_rate": 4.5431178223574034e-05, + "loss": 4.3635, + "step": 32873 + }, + { + "epoch": 0.19551099057950327, + "grad_norm": 1.8921083211898804, + "learning_rate": 4.5430909037021754e-05, + "loss": 4.2937, + "step": 32874 + }, + { + "epoch": 0.1955169378627843, + "grad_norm": 1.8949475288391113, + "learning_rate": 4.543063984333724e-05, + "loss": 4.3948, + "step": 32875 + }, + { + "epoch": 0.19552288514606528, + "grad_norm": 1.673620343208313, + "learning_rate": 4.543037064252061e-05, + "loss": 4.3489, + "step": 32876 + }, + { + "epoch": 0.19552883242934627, + "grad_norm": 1.3811769485473633, + "learning_rate": 4.543010143457195e-05, + "loss": 5.1095, + "step": 32877 + }, + { + "epoch": 0.19553477971262728, + "grad_norm": 1.7001514434814453, + "learning_rate": 4.5429832219491354e-05, + "loss": 4.3213, + "step": 32878 + }, + { + "epoch": 0.19554072699590827, + "grad_norm": 1.6770343780517578, + "learning_rate": 4.5429562997278905e-05, + "loss": 4.2158, + "step": 32879 + }, + { + "epoch": 0.19554667427918926, + "grad_norm": 1.7170015573501587, + "learning_rate": 4.542929376793472e-05, + "loss": 4.2278, + "step": 32880 + }, + { + "epoch": 0.19555262156247027, + "grad_norm": 1.381479263305664, + "learning_rate": 4.542902453145886e-05, + "loss": 4.084, + "step": 32881 + }, + { + "epoch": 0.19555856884575126, + "grad_norm": 1.6345101594924927, + "learning_rate": 4.542875528785145e-05, + "loss": 4.9881, + "step": 32882 + }, + { + "epoch": 0.19556451612903225, + "grad_norm": 1.6286730766296387, + "learning_rate": 4.542848603711258e-05, + "loss": 4.9177, + "step": 32883 + }, + { + "epoch": 0.19557046341231327, + "grad_norm": 1.800990104675293, + "learning_rate": 4.5428216779242336e-05, + "loss": 4.7879, + "step": 32884 + }, + { + "epoch": 0.19557641069559425, + "grad_norm": 1.6580811738967896, + "learning_rate": 4.54279475142408e-05, + "loss": 4.924, + "step": 32885 + }, + { + "epoch": 0.19558235797887524, + "grad_norm": 1.6891523599624634, + "learning_rate": 4.542767824210808e-05, + "loss": 5.1019, + "step": 32886 + }, + { + "epoch": 0.19558830526215626, + "grad_norm": 1.9298063516616821, + "learning_rate": 4.542740896284428e-05, + "loss": 3.242, + "step": 32887 + }, + { + "epoch": 0.19559425254543725, + "grad_norm": 1.5368505716323853, + "learning_rate": 4.542713967644948e-05, + "loss": 4.5356, + "step": 32888 + }, + { + "epoch": 0.19560019982871824, + "grad_norm": 1.5489712953567505, + "learning_rate": 4.542687038292377e-05, + "loss": 5.2564, + "step": 32889 + }, + { + "epoch": 0.19560614711199925, + "grad_norm": 1.367336392402649, + "learning_rate": 4.5426601082267254e-05, + "loss": 5.2064, + "step": 32890 + }, + { + "epoch": 0.19561209439528024, + "grad_norm": 1.6218714714050293, + "learning_rate": 4.542633177448002e-05, + "loss": 5.1743, + "step": 32891 + }, + { + "epoch": 0.19561804167856123, + "grad_norm": 1.8278539180755615, + "learning_rate": 4.542606245956217e-05, + "loss": 5.2703, + "step": 32892 + }, + { + "epoch": 0.19562398896184224, + "grad_norm": 1.579430341720581, + "learning_rate": 4.542579313751379e-05, + "loss": 5.3688, + "step": 32893 + }, + { + "epoch": 0.19562993624512323, + "grad_norm": 1.5411518812179565, + "learning_rate": 4.542552380833498e-05, + "loss": 4.9046, + "step": 32894 + }, + { + "epoch": 0.19563588352840422, + "grad_norm": 1.7552369832992554, + "learning_rate": 4.542525447202582e-05, + "loss": 4.7059, + "step": 32895 + }, + { + "epoch": 0.19564183081168524, + "grad_norm": 1.7786237001419067, + "learning_rate": 4.5424985128586424e-05, + "loss": 4.6587, + "step": 32896 + }, + { + "epoch": 0.19564777809496622, + "grad_norm": 1.7300502061843872, + "learning_rate": 4.542471577801687e-05, + "loss": 4.8136, + "step": 32897 + }, + { + "epoch": 0.1956537253782472, + "grad_norm": 1.612258791923523, + "learning_rate": 4.542444642031727e-05, + "loss": 4.8126, + "step": 32898 + }, + { + "epoch": 0.19565967266152823, + "grad_norm": 1.6730974912643433, + "learning_rate": 4.5424177055487693e-05, + "loss": 4.6028, + "step": 32899 + }, + { + "epoch": 0.19566561994480922, + "grad_norm": 1.7678196430206299, + "learning_rate": 4.5423907683528256e-05, + "loss": 4.8532, + "step": 32900 + }, + { + "epoch": 0.1956715672280902, + "grad_norm": 1.8062410354614258, + "learning_rate": 4.5423638304439044e-05, + "loss": 4.8629, + "step": 32901 + }, + { + "epoch": 0.1956775145113712, + "grad_norm": 1.6573866605758667, + "learning_rate": 4.5423368918220146e-05, + "loss": 4.6673, + "step": 32902 + }, + { + "epoch": 0.1956834617946522, + "grad_norm": 1.6542189121246338, + "learning_rate": 4.5423099524871656e-05, + "loss": 4.6684, + "step": 32903 + }, + { + "epoch": 0.1956894090779332, + "grad_norm": 1.6946748495101929, + "learning_rate": 4.542283012439369e-05, + "loss": 4.7429, + "step": 32904 + }, + { + "epoch": 0.19569535636121418, + "grad_norm": 1.6120235919952393, + "learning_rate": 4.542256071678631e-05, + "loss": 4.917, + "step": 32905 + }, + { + "epoch": 0.1957013036444952, + "grad_norm": 1.635718822479248, + "learning_rate": 4.5422291302049634e-05, + "loss": 4.9375, + "step": 32906 + }, + { + "epoch": 0.1957072509277762, + "grad_norm": 1.5894275903701782, + "learning_rate": 4.542202188018374e-05, + "loss": 4.8344, + "step": 32907 + }, + { + "epoch": 0.19571319821105718, + "grad_norm": 1.6269679069519043, + "learning_rate": 4.542175245118873e-05, + "loss": 5.1075, + "step": 32908 + }, + { + "epoch": 0.1957191454943382, + "grad_norm": 1.5488533973693848, + "learning_rate": 4.5421483015064703e-05, + "loss": 4.7309, + "step": 32909 + }, + { + "epoch": 0.19572509277761918, + "grad_norm": 1.6821287870407104, + "learning_rate": 4.5421213571811736e-05, + "loss": 5.0394, + "step": 32910 + }, + { + "epoch": 0.19573104006090017, + "grad_norm": 1.798214316368103, + "learning_rate": 4.542094412142995e-05, + "loss": 4.68, + "step": 32911 + }, + { + "epoch": 0.19573698734418118, + "grad_norm": 1.5563793182373047, + "learning_rate": 4.542067466391942e-05, + "loss": 5.062, + "step": 32912 + }, + { + "epoch": 0.19574293462746217, + "grad_norm": 1.9282946586608887, + "learning_rate": 4.542040519928024e-05, + "loss": 4.676, + "step": 32913 + }, + { + "epoch": 0.19574888191074316, + "grad_norm": 1.7567148208618164, + "learning_rate": 4.5420135727512504e-05, + "loss": 4.8652, + "step": 32914 + }, + { + "epoch": 0.19575482919402418, + "grad_norm": 1.8503345251083374, + "learning_rate": 4.54198662486163e-05, + "loss": 4.6167, + "step": 32915 + }, + { + "epoch": 0.19576077647730517, + "grad_norm": 1.3916583061218262, + "learning_rate": 4.5419596762591755e-05, + "loss": 4.9467, + "step": 32916 + }, + { + "epoch": 0.19576672376058615, + "grad_norm": 1.5608447790145874, + "learning_rate": 4.5419327269438925e-05, + "loss": 4.9949, + "step": 32917 + }, + { + "epoch": 0.19577267104386717, + "grad_norm": 1.7178374528884888, + "learning_rate": 4.5419057769157927e-05, + "loss": 5.0352, + "step": 32918 + }, + { + "epoch": 0.19577861832714816, + "grad_norm": 2.536865234375, + "learning_rate": 4.5418788261748834e-05, + "loss": 4.6149, + "step": 32919 + }, + { + "epoch": 0.19578456561042915, + "grad_norm": 2.1935441493988037, + "learning_rate": 4.541851874721176e-05, + "loss": 4.6066, + "step": 32920 + }, + { + "epoch": 0.19579051289371016, + "grad_norm": 1.8435254096984863, + "learning_rate": 4.5418249225546794e-05, + "loss": 4.7086, + "step": 32921 + }, + { + "epoch": 0.19579646017699115, + "grad_norm": 2.078380584716797, + "learning_rate": 4.541797969675403e-05, + "loss": 4.0338, + "step": 32922 + }, + { + "epoch": 0.19580240746027214, + "grad_norm": 1.5899152755737305, + "learning_rate": 4.541771016083356e-05, + "loss": 4.9363, + "step": 32923 + }, + { + "epoch": 0.19580835474355315, + "grad_norm": 1.5216234922409058, + "learning_rate": 4.541744061778547e-05, + "loss": 4.9327, + "step": 32924 + }, + { + "epoch": 0.19581430202683414, + "grad_norm": 1.815258502960205, + "learning_rate": 4.541717106760987e-05, + "loss": 4.8025, + "step": 32925 + }, + { + "epoch": 0.19582024931011513, + "grad_norm": 1.6201529502868652, + "learning_rate": 4.541690151030684e-05, + "loss": 4.6954, + "step": 32926 + }, + { + "epoch": 0.19582619659339615, + "grad_norm": 2.1462132930755615, + "learning_rate": 4.5416631945876494e-05, + "loss": 4.0121, + "step": 32927 + }, + { + "epoch": 0.19583214387667713, + "grad_norm": 1.832979679107666, + "learning_rate": 4.54163623743189e-05, + "loss": 4.1431, + "step": 32928 + }, + { + "epoch": 0.19583809115995812, + "grad_norm": 1.5806697607040405, + "learning_rate": 4.5416092795634167e-05, + "loss": 5.179, + "step": 32929 + }, + { + "epoch": 0.19584403844323914, + "grad_norm": 1.6073065996170044, + "learning_rate": 4.5415823209822397e-05, + "loss": 4.9575, + "step": 32930 + }, + { + "epoch": 0.19584998572652013, + "grad_norm": 1.9261529445648193, + "learning_rate": 4.541555361688366e-05, + "loss": 4.5455, + "step": 32931 + }, + { + "epoch": 0.19585593300980111, + "grad_norm": 1.7330681085586548, + "learning_rate": 4.541528401681807e-05, + "loss": 4.6912, + "step": 32932 + }, + { + "epoch": 0.19586188029308213, + "grad_norm": 1.682050347328186, + "learning_rate": 4.541501440962572e-05, + "loss": 4.7136, + "step": 32933 + }, + { + "epoch": 0.19586782757636312, + "grad_norm": 1.949378252029419, + "learning_rate": 4.541474479530669e-05, + "loss": 4.7249, + "step": 32934 + }, + { + "epoch": 0.1958737748596441, + "grad_norm": 1.828692078590393, + "learning_rate": 4.541447517386109e-05, + "loss": 4.1328, + "step": 32935 + }, + { + "epoch": 0.19587972214292512, + "grad_norm": 1.5934466123580933, + "learning_rate": 4.5414205545289e-05, + "loss": 4.1072, + "step": 32936 + }, + { + "epoch": 0.1958856694262061, + "grad_norm": 1.539414882659912, + "learning_rate": 4.541393590959053e-05, + "loss": 4.9876, + "step": 32937 + }, + { + "epoch": 0.1958916167094871, + "grad_norm": 1.6584326028823853, + "learning_rate": 4.5413666266765765e-05, + "loss": 5.1275, + "step": 32938 + }, + { + "epoch": 0.19589756399276811, + "grad_norm": 1.6987926959991455, + "learning_rate": 4.5413396616814797e-05, + "loss": 4.6132, + "step": 32939 + }, + { + "epoch": 0.1959035112760491, + "grad_norm": 1.741507649421692, + "learning_rate": 4.5413126959737727e-05, + "loss": 4.8676, + "step": 32940 + }, + { + "epoch": 0.1959094585593301, + "grad_norm": 1.660169005393982, + "learning_rate": 4.5412857295534636e-05, + "loss": 4.5619, + "step": 32941 + }, + { + "epoch": 0.1959154058426111, + "grad_norm": 1.785941481590271, + "learning_rate": 4.541258762420564e-05, + "loss": 4.6198, + "step": 32942 + }, + { + "epoch": 0.1959213531258921, + "grad_norm": 1.7086410522460938, + "learning_rate": 4.5412317945750814e-05, + "loss": 4.4215, + "step": 32943 + }, + { + "epoch": 0.19592730040917308, + "grad_norm": 1.8102631568908691, + "learning_rate": 4.541204826017026e-05, + "loss": 4.5202, + "step": 32944 + }, + { + "epoch": 0.1959332476924541, + "grad_norm": 1.8132258653640747, + "learning_rate": 4.541177856746407e-05, + "loss": 4.5079, + "step": 32945 + }, + { + "epoch": 0.1959391949757351, + "grad_norm": 2.1485495567321777, + "learning_rate": 4.541150886763234e-05, + "loss": 4.3483, + "step": 32946 + }, + { + "epoch": 0.19594514225901608, + "grad_norm": 1.4712555408477783, + "learning_rate": 4.541123916067516e-05, + "loss": 4.2898, + "step": 32947 + }, + { + "epoch": 0.1959510895422971, + "grad_norm": 1.8281893730163574, + "learning_rate": 4.541096944659263e-05, + "loss": 4.2012, + "step": 32948 + }, + { + "epoch": 0.19595703682557808, + "grad_norm": 1.8990435600280762, + "learning_rate": 4.541069972538484e-05, + "loss": 3.783, + "step": 32949 + }, + { + "epoch": 0.19596298410885907, + "grad_norm": 1.5741428136825562, + "learning_rate": 4.541042999705189e-05, + "loss": 5.6173, + "step": 32950 + }, + { + "epoch": 0.19596893139214008, + "grad_norm": 1.9983577728271484, + "learning_rate": 4.541016026159387e-05, + "loss": 5.3004, + "step": 32951 + }, + { + "epoch": 0.19597487867542107, + "grad_norm": 1.514825701713562, + "learning_rate": 4.5409890519010866e-05, + "loss": 4.927, + "step": 32952 + }, + { + "epoch": 0.19598082595870206, + "grad_norm": 1.4992481470108032, + "learning_rate": 4.5409620769302985e-05, + "loss": 5.0936, + "step": 32953 + }, + { + "epoch": 0.19598677324198308, + "grad_norm": 1.5728949308395386, + "learning_rate": 4.5409351012470316e-05, + "loss": 5.3866, + "step": 32954 + }, + { + "epoch": 0.19599272052526406, + "grad_norm": 1.4812259674072266, + "learning_rate": 4.5409081248512955e-05, + "loss": 5.3022, + "step": 32955 + }, + { + "epoch": 0.19599866780854505, + "grad_norm": 1.670962929725647, + "learning_rate": 4.540881147743099e-05, + "loss": 4.9182, + "step": 32956 + }, + { + "epoch": 0.19600461509182607, + "grad_norm": 2.0521981716156006, + "learning_rate": 4.540854169922453e-05, + "loss": 3.4115, + "step": 32957 + }, + { + "epoch": 0.19601056237510706, + "grad_norm": 1.4718791246414185, + "learning_rate": 4.5408271913893646e-05, + "loss": 4.712, + "step": 32958 + }, + { + "epoch": 0.19601650965838804, + "grad_norm": 1.6191232204437256, + "learning_rate": 4.540800212143845e-05, + "loss": 5.2245, + "step": 32959 + }, + { + "epoch": 0.19602245694166903, + "grad_norm": 1.2227195501327515, + "learning_rate": 4.540773232185903e-05, + "loss": 5.4206, + "step": 32960 + }, + { + "epoch": 0.19602840422495005, + "grad_norm": 1.417944073677063, + "learning_rate": 4.540746251515549e-05, + "loss": 5.2218, + "step": 32961 + }, + { + "epoch": 0.19603435150823104, + "grad_norm": 1.7467671632766724, + "learning_rate": 4.5407192701327904e-05, + "loss": 3.9432, + "step": 32962 + }, + { + "epoch": 0.19604029879151202, + "grad_norm": 1.7392356395721436, + "learning_rate": 4.5406922880376386e-05, + "loss": 4.4476, + "step": 32963 + }, + { + "epoch": 0.19604624607479304, + "grad_norm": 1.9860972166061401, + "learning_rate": 4.5406653052301017e-05, + "loss": 4.663, + "step": 32964 + }, + { + "epoch": 0.19605219335807403, + "grad_norm": 2.1099915504455566, + "learning_rate": 4.54063832171019e-05, + "loss": 5.0918, + "step": 32965 + }, + { + "epoch": 0.19605814064135502, + "grad_norm": 1.7715723514556885, + "learning_rate": 4.540611337477913e-05, + "loss": 5.0995, + "step": 32966 + }, + { + "epoch": 0.19606408792463603, + "grad_norm": 1.490571141242981, + "learning_rate": 4.5405843525332784e-05, + "loss": 5.2124, + "step": 32967 + }, + { + "epoch": 0.19607003520791702, + "grad_norm": 1.6321748495101929, + "learning_rate": 4.5405573668762975e-05, + "loss": 5.3355, + "step": 32968 + }, + { + "epoch": 0.196075982491198, + "grad_norm": 1.6419252157211304, + "learning_rate": 4.540530380506979e-05, + "loss": 5.004, + "step": 32969 + }, + { + "epoch": 0.19608192977447902, + "grad_norm": 1.8405059576034546, + "learning_rate": 4.5405033934253326e-05, + "loss": 4.7141, + "step": 32970 + }, + { + "epoch": 0.19608787705776, + "grad_norm": 2.000082492828369, + "learning_rate": 4.5404764056313675e-05, + "loss": 4.2939, + "step": 32971 + }, + { + "epoch": 0.196093824341041, + "grad_norm": 2.117877960205078, + "learning_rate": 4.540449417125093e-05, + "loss": 4.3797, + "step": 32972 + }, + { + "epoch": 0.19609977162432202, + "grad_norm": 2.1574883460998535, + "learning_rate": 4.540422427906519e-05, + "loss": 4.1668, + "step": 32973 + }, + { + "epoch": 0.196105718907603, + "grad_norm": 2.0455899238586426, + "learning_rate": 4.5403954379756544e-05, + "loss": 4.5018, + "step": 32974 + }, + { + "epoch": 0.196111666190884, + "grad_norm": 1.5043025016784668, + "learning_rate": 4.540368447332509e-05, + "loss": 5.0477, + "step": 32975 + }, + { + "epoch": 0.196117613474165, + "grad_norm": 2.334475040435791, + "learning_rate": 4.5403414559770917e-05, + "loss": 4.673, + "step": 32976 + }, + { + "epoch": 0.196123560757446, + "grad_norm": 2.1780107021331787, + "learning_rate": 4.540314463909413e-05, + "loss": 4.7737, + "step": 32977 + }, + { + "epoch": 0.19612950804072699, + "grad_norm": 1.887604832649231, + "learning_rate": 4.540287471129481e-05, + "loss": 4.8789, + "step": 32978 + }, + { + "epoch": 0.196135455324008, + "grad_norm": 1.7331857681274414, + "learning_rate": 4.540260477637306e-05, + "loss": 5.079, + "step": 32979 + }, + { + "epoch": 0.196141402607289, + "grad_norm": 1.5630770921707153, + "learning_rate": 4.540233483432896e-05, + "loss": 5.1134, + "step": 32980 + }, + { + "epoch": 0.19614734989056998, + "grad_norm": 2.662470817565918, + "learning_rate": 4.540206488516263e-05, + "loss": 4.0535, + "step": 32981 + }, + { + "epoch": 0.196153297173851, + "grad_norm": 2.1369266510009766, + "learning_rate": 4.5401794928874145e-05, + "loss": 4.3121, + "step": 32982 + }, + { + "epoch": 0.19615924445713198, + "grad_norm": 2.7305498123168945, + "learning_rate": 4.5401524965463604e-05, + "loss": 3.7048, + "step": 32983 + }, + { + "epoch": 0.19616519174041297, + "grad_norm": 1.8726544380187988, + "learning_rate": 4.540125499493111e-05, + "loss": 4.3633, + "step": 32984 + }, + { + "epoch": 0.19617113902369399, + "grad_norm": 1.4531916379928589, + "learning_rate": 4.5400985017276735e-05, + "loss": 4.9491, + "step": 32985 + }, + { + "epoch": 0.19617708630697497, + "grad_norm": 2.7652368545532227, + "learning_rate": 4.5400715032500595e-05, + "loss": 4.4811, + "step": 32986 + }, + { + "epoch": 0.19618303359025596, + "grad_norm": 2.069976568222046, + "learning_rate": 4.540044504060277e-05, + "loss": 4.6528, + "step": 32987 + }, + { + "epoch": 0.19618898087353698, + "grad_norm": 2.0444564819335938, + "learning_rate": 4.540017504158337e-05, + "loss": 4.6799, + "step": 32988 + }, + { + "epoch": 0.19619492815681797, + "grad_norm": 1.6998240947723389, + "learning_rate": 4.5399905035442467e-05, + "loss": 4.8338, + "step": 32989 + }, + { + "epoch": 0.19620087544009895, + "grad_norm": 1.84773588180542, + "learning_rate": 4.5399635022180175e-05, + "loss": 4.7789, + "step": 32990 + }, + { + "epoch": 0.19620682272337997, + "grad_norm": 1.6940088272094727, + "learning_rate": 4.5399365001796586e-05, + "loss": 4.9627, + "step": 32991 + }, + { + "epoch": 0.19621277000666096, + "grad_norm": 1.6246799230575562, + "learning_rate": 4.539909497429178e-05, + "loss": 4.996, + "step": 32992 + }, + { + "epoch": 0.19621871728994195, + "grad_norm": 1.7621272802352905, + "learning_rate": 4.539882493966587e-05, + "loss": 4.9238, + "step": 32993 + }, + { + "epoch": 0.19622466457322296, + "grad_norm": 1.5167536735534668, + "learning_rate": 4.539855489791893e-05, + "loss": 5.2917, + "step": 32994 + }, + { + "epoch": 0.19623061185650395, + "grad_norm": 1.728780746459961, + "learning_rate": 4.539828484905107e-05, + "loss": 5.515, + "step": 32995 + }, + { + "epoch": 0.19623655913978494, + "grad_norm": 1.7221986055374146, + "learning_rate": 4.5398014793062386e-05, + "loss": 4.7529, + "step": 32996 + }, + { + "epoch": 0.19624250642306595, + "grad_norm": 2.0246353149414062, + "learning_rate": 4.5397744729952964e-05, + "loss": 4.4317, + "step": 32997 + }, + { + "epoch": 0.19624845370634694, + "grad_norm": 1.7018826007843018, + "learning_rate": 4.5397474659722896e-05, + "loss": 4.7282, + "step": 32998 + }, + { + "epoch": 0.19625440098962793, + "grad_norm": 1.7294108867645264, + "learning_rate": 4.5397204582372276e-05, + "loss": 4.4915, + "step": 32999 + }, + { + "epoch": 0.19626034827290895, + "grad_norm": 1.8619226217269897, + "learning_rate": 4.539693449790121e-05, + "loss": 4.3318, + "step": 33000 + }, + { + "epoch": 0.19626629555618993, + "grad_norm": 1.7333225011825562, + "learning_rate": 4.5396664406309785e-05, + "loss": 4.371, + "step": 33001 + }, + { + "epoch": 0.19627224283947092, + "grad_norm": 1.843048095703125, + "learning_rate": 4.539639430759809e-05, + "loss": 4.4083, + "step": 33002 + }, + { + "epoch": 0.19627819012275194, + "grad_norm": 1.607068419456482, + "learning_rate": 4.5396124201766226e-05, + "loss": 4.3408, + "step": 33003 + }, + { + "epoch": 0.19628413740603293, + "grad_norm": 1.7347930669784546, + "learning_rate": 4.539585408881429e-05, + "loss": 4.0526, + "step": 33004 + }, + { + "epoch": 0.19629008468931392, + "grad_norm": 1.7406977415084839, + "learning_rate": 4.539558396874237e-05, + "loss": 4.1061, + "step": 33005 + }, + { + "epoch": 0.19629603197259493, + "grad_norm": 1.804682731628418, + "learning_rate": 4.5395313841550555e-05, + "loss": 4.4277, + "step": 33006 + }, + { + "epoch": 0.19630197925587592, + "grad_norm": 1.443662405014038, + "learning_rate": 4.5395043707238954e-05, + "loss": 4.5178, + "step": 33007 + }, + { + "epoch": 0.1963079265391569, + "grad_norm": 1.7877321243286133, + "learning_rate": 4.5394773565807655e-05, + "loss": 4.5201, + "step": 33008 + }, + { + "epoch": 0.19631387382243792, + "grad_norm": 2.0785722732543945, + "learning_rate": 4.539450341725675e-05, + "loss": 4.3904, + "step": 33009 + }, + { + "epoch": 0.1963198211057189, + "grad_norm": 1.7074304819107056, + "learning_rate": 4.5394233261586336e-05, + "loss": 4.8033, + "step": 33010 + }, + { + "epoch": 0.1963257683889999, + "grad_norm": 1.7941499948501587, + "learning_rate": 4.53939630987965e-05, + "loss": 5.0883, + "step": 33011 + }, + { + "epoch": 0.19633171567228092, + "grad_norm": 1.5039217472076416, + "learning_rate": 4.539369292888734e-05, + "loss": 4.5738, + "step": 33012 + }, + { + "epoch": 0.1963376629555619, + "grad_norm": 1.7019708156585693, + "learning_rate": 4.539342275185896e-05, + "loss": 4.2565, + "step": 33013 + }, + { + "epoch": 0.1963436102388429, + "grad_norm": 1.853834867477417, + "learning_rate": 4.539315256771145e-05, + "loss": 4.0714, + "step": 33014 + }, + { + "epoch": 0.1963495575221239, + "grad_norm": 1.663608193397522, + "learning_rate": 4.5392882376444896e-05, + "loss": 4.3966, + "step": 33015 + }, + { + "epoch": 0.1963555048054049, + "grad_norm": 1.6027350425720215, + "learning_rate": 4.539261217805939e-05, + "loss": 4.6439, + "step": 33016 + }, + { + "epoch": 0.19636145208868588, + "grad_norm": 1.6448129415512085, + "learning_rate": 4.539234197255505e-05, + "loss": 4.8542, + "step": 33017 + }, + { + "epoch": 0.19636739937196687, + "grad_norm": 1.5828901529312134, + "learning_rate": 4.539207175993194e-05, + "loss": 4.8388, + "step": 33018 + }, + { + "epoch": 0.1963733466552479, + "grad_norm": 1.5006245374679565, + "learning_rate": 4.5391801540190184e-05, + "loss": 4.8398, + "step": 33019 + }, + { + "epoch": 0.19637929393852888, + "grad_norm": 1.584307312965393, + "learning_rate": 4.5391531313329846e-05, + "loss": 4.6774, + "step": 33020 + }, + { + "epoch": 0.19638524122180986, + "grad_norm": 1.5445975065231323, + "learning_rate": 4.5391261079351036e-05, + "loss": 4.9719, + "step": 33021 + }, + { + "epoch": 0.19639118850509088, + "grad_norm": 1.3577830791473389, + "learning_rate": 4.5390990838253856e-05, + "loss": 4.9825, + "step": 33022 + }, + { + "epoch": 0.19639713578837187, + "grad_norm": 1.3781458139419556, + "learning_rate": 4.539072059003838e-05, + "loss": 4.8167, + "step": 33023 + }, + { + "epoch": 0.19640308307165286, + "grad_norm": 1.602210521697998, + "learning_rate": 4.5390450334704725e-05, + "loss": 4.8044, + "step": 33024 + }, + { + "epoch": 0.19640903035493387, + "grad_norm": 1.4063019752502441, + "learning_rate": 4.539018007225298e-05, + "loss": 4.8966, + "step": 33025 + }, + { + "epoch": 0.19641497763821486, + "grad_norm": 1.4408751726150513, + "learning_rate": 4.538990980268322e-05, + "loss": 5.0514, + "step": 33026 + }, + { + "epoch": 0.19642092492149585, + "grad_norm": 1.548294186592102, + "learning_rate": 4.538963952599555e-05, + "loss": 4.3517, + "step": 33027 + }, + { + "epoch": 0.19642687220477686, + "grad_norm": 1.462956428527832, + "learning_rate": 4.5389369242190075e-05, + "loss": 4.6278, + "step": 33028 + }, + { + "epoch": 0.19643281948805785, + "grad_norm": 1.5403681993484497, + "learning_rate": 4.538909895126689e-05, + "loss": 5.233, + "step": 33029 + }, + { + "epoch": 0.19643876677133884, + "grad_norm": 1.4483433961868286, + "learning_rate": 4.538882865322607e-05, + "loss": 4.8854, + "step": 33030 + }, + { + "epoch": 0.19644471405461986, + "grad_norm": 1.6093387603759766, + "learning_rate": 4.5388558348067725e-05, + "loss": 4.9251, + "step": 33031 + }, + { + "epoch": 0.19645066133790084, + "grad_norm": 1.6804461479187012, + "learning_rate": 4.5388288035791934e-05, + "loss": 4.5588, + "step": 33032 + }, + { + "epoch": 0.19645660862118183, + "grad_norm": 1.799657940864563, + "learning_rate": 4.5388017716398816e-05, + "loss": 4.4804, + "step": 33033 + }, + { + "epoch": 0.19646255590446285, + "grad_norm": 1.4390314817428589, + "learning_rate": 4.538774738988845e-05, + "loss": 4.7733, + "step": 33034 + }, + { + "epoch": 0.19646850318774384, + "grad_norm": 1.8508771657943726, + "learning_rate": 4.538747705626093e-05, + "loss": 4.6182, + "step": 33035 + }, + { + "epoch": 0.19647445047102483, + "grad_norm": 1.6584879159927368, + "learning_rate": 4.538720671551635e-05, + "loss": 4.4793, + "step": 33036 + }, + { + "epoch": 0.19648039775430584, + "grad_norm": 1.6483509540557861, + "learning_rate": 4.538693636765481e-05, + "loss": 4.5021, + "step": 33037 + }, + { + "epoch": 0.19648634503758683, + "grad_norm": 1.6273133754730225, + "learning_rate": 4.53866660126764e-05, + "loss": 4.615, + "step": 33038 + }, + { + "epoch": 0.19649229232086782, + "grad_norm": 1.80341637134552, + "learning_rate": 4.5386395650581215e-05, + "loss": 4.9965, + "step": 33039 + }, + { + "epoch": 0.19649823960414883, + "grad_norm": 1.5780657529830933, + "learning_rate": 4.538612528136935e-05, + "loss": 5.5826, + "step": 33040 + }, + { + "epoch": 0.19650418688742982, + "grad_norm": 1.6564321517944336, + "learning_rate": 4.53858549050409e-05, + "loss": 4.9127, + "step": 33041 + }, + { + "epoch": 0.1965101341707108, + "grad_norm": 1.5042874813079834, + "learning_rate": 4.538558452159596e-05, + "loss": 5.0219, + "step": 33042 + }, + { + "epoch": 0.19651608145399183, + "grad_norm": 1.621291160583496, + "learning_rate": 4.538531413103462e-05, + "loss": 4.8946, + "step": 33043 + }, + { + "epoch": 0.1965220287372728, + "grad_norm": 1.7965176105499268, + "learning_rate": 4.5385043733356976e-05, + "loss": 4.3394, + "step": 33044 + }, + { + "epoch": 0.1965279760205538, + "grad_norm": 1.7505266666412354, + "learning_rate": 4.5384773328563124e-05, + "loss": 4.6966, + "step": 33045 + }, + { + "epoch": 0.19653392330383482, + "grad_norm": 1.4543168544769287, + "learning_rate": 4.538450291665316e-05, + "loss": 5.189, + "step": 33046 + }, + { + "epoch": 0.1965398705871158, + "grad_norm": 1.7490246295928955, + "learning_rate": 4.538423249762718e-05, + "loss": 4.5155, + "step": 33047 + }, + { + "epoch": 0.1965458178703968, + "grad_norm": 1.5133061408996582, + "learning_rate": 4.538396207148528e-05, + "loss": 4.8084, + "step": 33048 + }, + { + "epoch": 0.1965517651536778, + "grad_norm": 1.908988356590271, + "learning_rate": 4.5383691638227534e-05, + "loss": 5.0611, + "step": 33049 + }, + { + "epoch": 0.1965577124369588, + "grad_norm": 1.6494390964508057, + "learning_rate": 4.5383421197854056e-05, + "loss": 5.0496, + "step": 33050 + }, + { + "epoch": 0.1965636597202398, + "grad_norm": 1.5613998174667358, + "learning_rate": 4.5383150750364946e-05, + "loss": 4.6471, + "step": 33051 + }, + { + "epoch": 0.1965696070035208, + "grad_norm": 1.5566452741622925, + "learning_rate": 4.5382880295760284e-05, + "loss": 4.8781, + "step": 33052 + }, + { + "epoch": 0.1965755542868018, + "grad_norm": 1.4820610284805298, + "learning_rate": 4.5382609834040166e-05, + "loss": 4.9933, + "step": 33053 + }, + { + "epoch": 0.19658150157008278, + "grad_norm": 1.6967642307281494, + "learning_rate": 4.5382339365204694e-05, + "loss": 4.981, + "step": 33054 + }, + { + "epoch": 0.1965874488533638, + "grad_norm": 1.6705995798110962, + "learning_rate": 4.538206888925395e-05, + "loss": 4.8135, + "step": 33055 + }, + { + "epoch": 0.19659339613664478, + "grad_norm": 1.6412502527236938, + "learning_rate": 4.5381798406188044e-05, + "loss": 4.3751, + "step": 33056 + }, + { + "epoch": 0.19659934341992577, + "grad_norm": 1.8060193061828613, + "learning_rate": 4.5381527916007063e-05, + "loss": 4.6845, + "step": 33057 + }, + { + "epoch": 0.1966052907032068, + "grad_norm": 1.8145633935928345, + "learning_rate": 4.5381257418711094e-05, + "loss": 4.9956, + "step": 33058 + }, + { + "epoch": 0.19661123798648777, + "grad_norm": 1.7470539808273315, + "learning_rate": 4.538098691430024e-05, + "loss": 4.583, + "step": 33059 + }, + { + "epoch": 0.19661718526976876, + "grad_norm": 1.6165781021118164, + "learning_rate": 4.5380716402774596e-05, + "loss": 4.4499, + "step": 33060 + }, + { + "epoch": 0.19662313255304978, + "grad_norm": 1.5280836820602417, + "learning_rate": 4.538044588413426e-05, + "loss": 4.5728, + "step": 33061 + }, + { + "epoch": 0.19662907983633077, + "grad_norm": 1.750088095664978, + "learning_rate": 4.5380175358379316e-05, + "loss": 4.9698, + "step": 33062 + }, + { + "epoch": 0.19663502711961175, + "grad_norm": 2.1302971839904785, + "learning_rate": 4.537990482550986e-05, + "loss": 4.3426, + "step": 33063 + }, + { + "epoch": 0.19664097440289277, + "grad_norm": 2.5557992458343506, + "learning_rate": 4.5379634285526e-05, + "loss": 4.2919, + "step": 33064 + }, + { + "epoch": 0.19664692168617376, + "grad_norm": 1.9230780601501465, + "learning_rate": 4.5379363738427806e-05, + "loss": 4.4673, + "step": 33065 + }, + { + "epoch": 0.19665286896945475, + "grad_norm": 1.7957717180252075, + "learning_rate": 4.537909318421539e-05, + "loss": 4.0586, + "step": 33066 + }, + { + "epoch": 0.19665881625273576, + "grad_norm": 1.8782682418823242, + "learning_rate": 4.537882262288885e-05, + "loss": 4.449, + "step": 33067 + }, + { + "epoch": 0.19666476353601675, + "grad_norm": 1.7372145652770996, + "learning_rate": 4.5378552054448276e-05, + "loss": 4.8808, + "step": 33068 + }, + { + "epoch": 0.19667071081929774, + "grad_norm": 2.0615148544311523, + "learning_rate": 4.537828147889376e-05, + "loss": 3.8952, + "step": 33069 + }, + { + "epoch": 0.19667665810257876, + "grad_norm": 1.7238409519195557, + "learning_rate": 4.537801089622539e-05, + "loss": 4.489, + "step": 33070 + }, + { + "epoch": 0.19668260538585974, + "grad_norm": 2.1890852451324463, + "learning_rate": 4.537774030644326e-05, + "loss": 4.208, + "step": 33071 + }, + { + "epoch": 0.19668855266914073, + "grad_norm": 2.126760482788086, + "learning_rate": 4.5377469709547485e-05, + "loss": 4.3385, + "step": 33072 + }, + { + "epoch": 0.19669449995242175, + "grad_norm": 1.8360297679901123, + "learning_rate": 4.537719910553814e-05, + "loss": 4.2634, + "step": 33073 + }, + { + "epoch": 0.19670044723570274, + "grad_norm": 1.8070091009140015, + "learning_rate": 4.5376928494415326e-05, + "loss": 4.7106, + "step": 33074 + }, + { + "epoch": 0.19670639451898372, + "grad_norm": 1.9259190559387207, + "learning_rate": 4.537665787617913e-05, + "loss": 4.3962, + "step": 33075 + }, + { + "epoch": 0.1967123418022647, + "grad_norm": 1.8697553873062134, + "learning_rate": 4.5376387250829664e-05, + "loss": 4.4294, + "step": 33076 + }, + { + "epoch": 0.19671828908554573, + "grad_norm": 2.0083229541778564, + "learning_rate": 4.537611661836701e-05, + "loss": 4.3672, + "step": 33077 + }, + { + "epoch": 0.19672423636882672, + "grad_norm": 1.8586071729660034, + "learning_rate": 4.537584597879126e-05, + "loss": 4.2949, + "step": 33078 + }, + { + "epoch": 0.1967301836521077, + "grad_norm": 2.0329997539520264, + "learning_rate": 4.5375575332102514e-05, + "loss": 4.0786, + "step": 33079 + }, + { + "epoch": 0.19673613093538872, + "grad_norm": 1.8664171695709229, + "learning_rate": 4.537530467830087e-05, + "loss": 4.8517, + "step": 33080 + }, + { + "epoch": 0.1967420782186697, + "grad_norm": 1.5570780038833618, + "learning_rate": 4.5375034017386406e-05, + "loss": 4.6582, + "step": 33081 + }, + { + "epoch": 0.1967480255019507, + "grad_norm": 1.5720075368881226, + "learning_rate": 4.537476334935924e-05, + "loss": 4.7118, + "step": 33082 + }, + { + "epoch": 0.1967539727852317, + "grad_norm": 1.888211965560913, + "learning_rate": 4.537449267421945e-05, + "loss": 4.4994, + "step": 33083 + }, + { + "epoch": 0.1967599200685127, + "grad_norm": 1.670282006263733, + "learning_rate": 4.5374221991967136e-05, + "loss": 5.0491, + "step": 33084 + }, + { + "epoch": 0.1967658673517937, + "grad_norm": 1.7795008420944214, + "learning_rate": 4.5373951302602394e-05, + "loss": 4.6723, + "step": 33085 + }, + { + "epoch": 0.1967718146350747, + "grad_norm": 1.8241984844207764, + "learning_rate": 4.537368060612531e-05, + "loss": 4.6965, + "step": 33086 + }, + { + "epoch": 0.1967777619183557, + "grad_norm": 1.6808873414993286, + "learning_rate": 4.537340990253599e-05, + "loss": 4.6631, + "step": 33087 + }, + { + "epoch": 0.19678370920163668, + "grad_norm": 2.037264585494995, + "learning_rate": 4.537313919183451e-05, + "loss": 4.2909, + "step": 33088 + }, + { + "epoch": 0.1967896564849177, + "grad_norm": 1.729772925376892, + "learning_rate": 4.5372868474020996e-05, + "loss": 4.4949, + "step": 33089 + }, + { + "epoch": 0.19679560376819868, + "grad_norm": 1.4116592407226562, + "learning_rate": 4.537259774909551e-05, + "loss": 5.1077, + "step": 33090 + }, + { + "epoch": 0.19680155105147967, + "grad_norm": 1.5582292079925537, + "learning_rate": 4.537232701705817e-05, + "loss": 4.9148, + "step": 33091 + }, + { + "epoch": 0.1968074983347607, + "grad_norm": 1.4327534437179565, + "learning_rate": 4.5372056277909055e-05, + "loss": 5.0867, + "step": 33092 + }, + { + "epoch": 0.19681344561804168, + "grad_norm": 1.9894887208938599, + "learning_rate": 4.537178553164827e-05, + "loss": 4.3336, + "step": 33093 + }, + { + "epoch": 0.19681939290132267, + "grad_norm": 1.8432674407958984, + "learning_rate": 4.5371514778275904e-05, + "loss": 4.325, + "step": 33094 + }, + { + "epoch": 0.19682534018460368, + "grad_norm": 2.4664008617401123, + "learning_rate": 4.537124401779206e-05, + "loss": 4.3697, + "step": 33095 + }, + { + "epoch": 0.19683128746788467, + "grad_norm": 1.7858588695526123, + "learning_rate": 4.537097325019681e-05, + "loss": 4.5536, + "step": 33096 + }, + { + "epoch": 0.19683723475116566, + "grad_norm": 1.5062922239303589, + "learning_rate": 4.537070247549028e-05, + "loss": 4.7737, + "step": 33097 + }, + { + "epoch": 0.19684318203444667, + "grad_norm": 1.8330934047698975, + "learning_rate": 4.537043169367253e-05, + "loss": 4.3142, + "step": 33098 + }, + { + "epoch": 0.19684912931772766, + "grad_norm": 1.6842762231826782, + "learning_rate": 4.5370160904743686e-05, + "loss": 4.1331, + "step": 33099 + }, + { + "epoch": 0.19685507660100865, + "grad_norm": 1.6639212369918823, + "learning_rate": 4.5369890108703824e-05, + "loss": 4.3134, + "step": 33100 + }, + { + "epoch": 0.19686102388428967, + "grad_norm": 1.7178279161453247, + "learning_rate": 4.5369619305553047e-05, + "loss": 4.2592, + "step": 33101 + }, + { + "epoch": 0.19686697116757065, + "grad_norm": 1.693440318107605, + "learning_rate": 4.536934849529144e-05, + "loss": 4.3392, + "step": 33102 + }, + { + "epoch": 0.19687291845085164, + "grad_norm": 1.7481168508529663, + "learning_rate": 4.5369077677919116e-05, + "loss": 5.2228, + "step": 33103 + }, + { + "epoch": 0.19687886573413266, + "grad_norm": 1.4601521492004395, + "learning_rate": 4.5368806853436145e-05, + "loss": 4.7914, + "step": 33104 + }, + { + "epoch": 0.19688481301741365, + "grad_norm": 1.6039336919784546, + "learning_rate": 4.536853602184264e-05, + "loss": 4.3547, + "step": 33105 + }, + { + "epoch": 0.19689076030069463, + "grad_norm": 1.9059422016143799, + "learning_rate": 4.536826518313869e-05, + "loss": 4.1615, + "step": 33106 + }, + { + "epoch": 0.19689670758397565, + "grad_norm": 1.8276565074920654, + "learning_rate": 4.536799433732438e-05, + "loss": 4.3688, + "step": 33107 + }, + { + "epoch": 0.19690265486725664, + "grad_norm": 1.251856803894043, + "learning_rate": 4.5367723484399825e-05, + "loss": 5.0791, + "step": 33108 + }, + { + "epoch": 0.19690860215053763, + "grad_norm": 1.649273157119751, + "learning_rate": 4.5367452624365107e-05, + "loss": 4.6438, + "step": 33109 + }, + { + "epoch": 0.19691454943381864, + "grad_norm": 1.8959378004074097, + "learning_rate": 4.5367181757220326e-05, + "loss": 4.7787, + "step": 33110 + }, + { + "epoch": 0.19692049671709963, + "grad_norm": 1.8937031030654907, + "learning_rate": 4.536691088296556e-05, + "loss": 5.0304, + "step": 33111 + }, + { + "epoch": 0.19692644400038062, + "grad_norm": 1.6562620401382446, + "learning_rate": 4.5366640001600916e-05, + "loss": 4.7406, + "step": 33112 + }, + { + "epoch": 0.19693239128366163, + "grad_norm": 1.491281270980835, + "learning_rate": 4.53663691131265e-05, + "loss": 5.3676, + "step": 33113 + }, + { + "epoch": 0.19693833856694262, + "grad_norm": 1.5142914056777954, + "learning_rate": 4.536609821754239e-05, + "loss": 4.8378, + "step": 33114 + }, + { + "epoch": 0.1969442858502236, + "grad_norm": 1.6782684326171875, + "learning_rate": 4.536582731484868e-05, + "loss": 5.4148, + "step": 33115 + }, + { + "epoch": 0.19695023313350463, + "grad_norm": 1.8408838510513306, + "learning_rate": 4.5365556405045475e-05, + "loss": 5.2331, + "step": 33116 + }, + { + "epoch": 0.19695618041678561, + "grad_norm": 1.7965582609176636, + "learning_rate": 4.536528548813286e-05, + "loss": 4.6984, + "step": 33117 + }, + { + "epoch": 0.1969621277000666, + "grad_norm": 1.7825274467468262, + "learning_rate": 4.536501456411094e-05, + "loss": 4.9418, + "step": 33118 + }, + { + "epoch": 0.19696807498334762, + "grad_norm": 1.957222819328308, + "learning_rate": 4.53647436329798e-05, + "loss": 5.0835, + "step": 33119 + }, + { + "epoch": 0.1969740222666286, + "grad_norm": 1.7899425029754639, + "learning_rate": 4.536447269473954e-05, + "loss": 5.1203, + "step": 33120 + }, + { + "epoch": 0.1969799695499096, + "grad_norm": 1.5552259683609009, + "learning_rate": 4.536420174939025e-05, + "loss": 4.8925, + "step": 33121 + }, + { + "epoch": 0.1969859168331906, + "grad_norm": 1.6164780855178833, + "learning_rate": 4.5363930796932036e-05, + "loss": 4.9743, + "step": 33122 + }, + { + "epoch": 0.1969918641164716, + "grad_norm": 1.7106695175170898, + "learning_rate": 4.536365983736498e-05, + "loss": 4.5289, + "step": 33123 + }, + { + "epoch": 0.1969978113997526, + "grad_norm": 1.6085342168807983, + "learning_rate": 4.5363388870689175e-05, + "loss": 4.9196, + "step": 33124 + }, + { + "epoch": 0.1970037586830336, + "grad_norm": 1.8197940587997437, + "learning_rate": 4.536311789690473e-05, + "loss": 4.1818, + "step": 33125 + }, + { + "epoch": 0.1970097059663146, + "grad_norm": 1.6849818229675293, + "learning_rate": 4.5362846916011724e-05, + "loss": 4.6461, + "step": 33126 + }, + { + "epoch": 0.19701565324959558, + "grad_norm": 1.6120171546936035, + "learning_rate": 4.536257592801026e-05, + "loss": 4.8924, + "step": 33127 + }, + { + "epoch": 0.1970216005328766, + "grad_norm": 1.5898586511611938, + "learning_rate": 4.536230493290043e-05, + "loss": 4.7168, + "step": 33128 + }, + { + "epoch": 0.19702754781615758, + "grad_norm": 1.620339274406433, + "learning_rate": 4.536203393068234e-05, + "loss": 4.9929, + "step": 33129 + }, + { + "epoch": 0.19703349509943857, + "grad_norm": 1.6156890392303467, + "learning_rate": 4.536176292135606e-05, + "loss": 4.7432, + "step": 33130 + }, + { + "epoch": 0.1970394423827196, + "grad_norm": 1.7055577039718628, + "learning_rate": 4.5361491904921704e-05, + "loss": 4.7621, + "step": 33131 + }, + { + "epoch": 0.19704538966600058, + "grad_norm": 1.474246621131897, + "learning_rate": 4.5361220881379364e-05, + "loss": 4.5933, + "step": 33132 + }, + { + "epoch": 0.19705133694928156, + "grad_norm": 1.624182105064392, + "learning_rate": 4.536094985072914e-05, + "loss": 4.518, + "step": 33133 + }, + { + "epoch": 0.19705728423256255, + "grad_norm": 1.8042110204696655, + "learning_rate": 4.5360678812971104e-05, + "loss": 4.9295, + "step": 33134 + }, + { + "epoch": 0.19706323151584357, + "grad_norm": 1.6601067781448364, + "learning_rate": 4.536040776810537e-05, + "loss": 4.9089, + "step": 33135 + }, + { + "epoch": 0.19706917879912456, + "grad_norm": 1.9153242111206055, + "learning_rate": 4.5360136716132026e-05, + "loss": 4.9126, + "step": 33136 + }, + { + "epoch": 0.19707512608240554, + "grad_norm": 1.4137238264083862, + "learning_rate": 4.535986565705118e-05, + "loss": 4.9162, + "step": 33137 + }, + { + "epoch": 0.19708107336568656, + "grad_norm": 1.6325432062149048, + "learning_rate": 4.5359594590862905e-05, + "loss": 4.5288, + "step": 33138 + }, + { + "epoch": 0.19708702064896755, + "grad_norm": 1.8361495733261108, + "learning_rate": 4.5359323517567306e-05, + "loss": 4.3896, + "step": 33139 + }, + { + "epoch": 0.19709296793224854, + "grad_norm": 1.5172821283340454, + "learning_rate": 4.535905243716448e-05, + "loss": 4.8043, + "step": 33140 + }, + { + "epoch": 0.19709891521552955, + "grad_norm": 1.8739675283432007, + "learning_rate": 4.535878134965452e-05, + "loss": 5.7305, + "step": 33141 + }, + { + "epoch": 0.19710486249881054, + "grad_norm": 2.015746831893921, + "learning_rate": 4.535851025503752e-05, + "loss": 5.5957, + "step": 33142 + }, + { + "epoch": 0.19711080978209153, + "grad_norm": 1.5228544473648071, + "learning_rate": 4.535823915331357e-05, + "loss": 5.1121, + "step": 33143 + }, + { + "epoch": 0.19711675706537254, + "grad_norm": 1.8229737281799316, + "learning_rate": 4.5357968044482776e-05, + "loss": 5.3085, + "step": 33144 + }, + { + "epoch": 0.19712270434865353, + "grad_norm": 1.5784951448440552, + "learning_rate": 4.5357696928545215e-05, + "loss": 5.8142, + "step": 33145 + }, + { + "epoch": 0.19712865163193452, + "grad_norm": 1.6534473896026611, + "learning_rate": 4.5357425805501e-05, + "loss": 5.3424, + "step": 33146 + }, + { + "epoch": 0.19713459891521554, + "grad_norm": 2.2714569568634033, + "learning_rate": 4.535715467535022e-05, + "loss": 4.3029, + "step": 33147 + }, + { + "epoch": 0.19714054619849652, + "grad_norm": 1.9718842506408691, + "learning_rate": 4.5356883538092967e-05, + "loss": 4.2328, + "step": 33148 + }, + { + "epoch": 0.1971464934817775, + "grad_norm": 1.6277927160263062, + "learning_rate": 4.5356612393729325e-05, + "loss": 4.4578, + "step": 33149 + }, + { + "epoch": 0.19715244076505853, + "grad_norm": 1.7372039556503296, + "learning_rate": 4.5356341242259416e-05, + "loss": 4.8696, + "step": 33150 + }, + { + "epoch": 0.19715838804833952, + "grad_norm": 1.6973861455917358, + "learning_rate": 4.5356070083683313e-05, + "loss": 4.7767, + "step": 33151 + }, + { + "epoch": 0.1971643353316205, + "grad_norm": 2.262956142425537, + "learning_rate": 4.5355798918001106e-05, + "loss": 3.7523, + "step": 33152 + }, + { + "epoch": 0.19717028261490152, + "grad_norm": 2.558028221130371, + "learning_rate": 4.535552774521291e-05, + "loss": 3.7098, + "step": 33153 + }, + { + "epoch": 0.1971762298981825, + "grad_norm": 2.026247024536133, + "learning_rate": 4.5355256565318804e-05, + "loss": 4.1682, + "step": 33154 + }, + { + "epoch": 0.1971821771814635, + "grad_norm": 1.616791844367981, + "learning_rate": 4.535498537831889e-05, + "loss": 5.0644, + "step": 33155 + }, + { + "epoch": 0.1971881244647445, + "grad_norm": 2.0363311767578125, + "learning_rate": 4.5354714184213274e-05, + "loss": 5.1429, + "step": 33156 + }, + { + "epoch": 0.1971940717480255, + "grad_norm": 1.687772274017334, + "learning_rate": 4.535444298300202e-05, + "loss": 4.8497, + "step": 33157 + }, + { + "epoch": 0.1972000190313065, + "grad_norm": 2.0011701583862305, + "learning_rate": 4.535417177468525e-05, + "loss": 4.5291, + "step": 33158 + }, + { + "epoch": 0.1972059663145875, + "grad_norm": 1.8161364793777466, + "learning_rate": 4.5353900559263044e-05, + "loss": 5.1733, + "step": 33159 + }, + { + "epoch": 0.1972119135978685, + "grad_norm": 1.8265936374664307, + "learning_rate": 4.535362933673551e-05, + "loss": 5.214, + "step": 33160 + }, + { + "epoch": 0.19721786088114948, + "grad_norm": 1.8060880899429321, + "learning_rate": 4.535335810710272e-05, + "loss": 5.1948, + "step": 33161 + }, + { + "epoch": 0.1972238081644305, + "grad_norm": 1.8454203605651855, + "learning_rate": 4.5353086870364795e-05, + "loss": 5.2124, + "step": 33162 + }, + { + "epoch": 0.19722975544771149, + "grad_norm": 1.4462891817092896, + "learning_rate": 4.535281562652181e-05, + "loss": 5.2756, + "step": 33163 + }, + { + "epoch": 0.19723570273099247, + "grad_norm": 1.7189714908599854, + "learning_rate": 4.535254437557387e-05, + "loss": 4.7875, + "step": 33164 + }, + { + "epoch": 0.1972416500142735, + "grad_norm": 1.6477726697921753, + "learning_rate": 4.5352273117521074e-05, + "loss": 4.7295, + "step": 33165 + }, + { + "epoch": 0.19724759729755448, + "grad_norm": 1.5731415748596191, + "learning_rate": 4.535200185236349e-05, + "loss": 5.168, + "step": 33166 + }, + { + "epoch": 0.19725354458083547, + "grad_norm": 1.8423577547073364, + "learning_rate": 4.535173058010125e-05, + "loss": 5.1467, + "step": 33167 + }, + { + "epoch": 0.19725949186411648, + "grad_norm": 1.6812883615493774, + "learning_rate": 4.5351459300734436e-05, + "loss": 5.6017, + "step": 33168 + }, + { + "epoch": 0.19726543914739747, + "grad_norm": 1.5396465063095093, + "learning_rate": 4.535118801426312e-05, + "loss": 5.4747, + "step": 33169 + }, + { + "epoch": 0.19727138643067846, + "grad_norm": 2.3084144592285156, + "learning_rate": 4.535091672068743e-05, + "loss": 4.1229, + "step": 33170 + }, + { + "epoch": 0.19727733371395947, + "grad_norm": 2.0174407958984375, + "learning_rate": 4.535064542000743e-05, + "loss": 5.4288, + "step": 33171 + }, + { + "epoch": 0.19728328099724046, + "grad_norm": 1.5771641731262207, + "learning_rate": 4.535037411222324e-05, + "loss": 5.3271, + "step": 33172 + }, + { + "epoch": 0.19728922828052145, + "grad_norm": 1.7815163135528564, + "learning_rate": 4.5350102797334946e-05, + "loss": 5.7822, + "step": 33173 + }, + { + "epoch": 0.19729517556380247, + "grad_norm": 2.0947186946868896, + "learning_rate": 4.534983147534264e-05, + "loss": 4.737, + "step": 33174 + }, + { + "epoch": 0.19730112284708345, + "grad_norm": 1.4813156127929688, + "learning_rate": 4.534956014624642e-05, + "loss": 4.7643, + "step": 33175 + }, + { + "epoch": 0.19730707013036444, + "grad_norm": 1.9580156803131104, + "learning_rate": 4.534928881004637e-05, + "loss": 4.922, + "step": 33176 + }, + { + "epoch": 0.19731301741364546, + "grad_norm": 1.7537955045700073, + "learning_rate": 4.53490174667426e-05, + "loss": 5.154, + "step": 33177 + }, + { + "epoch": 0.19731896469692645, + "grad_norm": 1.444298267364502, + "learning_rate": 4.534874611633519e-05, + "loss": 5.0872, + "step": 33178 + }, + { + "epoch": 0.19732491198020743, + "grad_norm": 1.4737969636917114, + "learning_rate": 4.5348474758824254e-05, + "loss": 4.9913, + "step": 33179 + }, + { + "epoch": 0.19733085926348845, + "grad_norm": 1.9675285816192627, + "learning_rate": 4.5348203394209874e-05, + "loss": 4.9247, + "step": 33180 + }, + { + "epoch": 0.19733680654676944, + "grad_norm": 1.5464117527008057, + "learning_rate": 4.534793202249215e-05, + "loss": 5.0685, + "step": 33181 + }, + { + "epoch": 0.19734275383005043, + "grad_norm": 1.5645267963409424, + "learning_rate": 4.5347660643671155e-05, + "loss": 5.2382, + "step": 33182 + }, + { + "epoch": 0.19734870111333144, + "grad_norm": 1.5690304040908813, + "learning_rate": 4.534738925774702e-05, + "loss": 5.3408, + "step": 33183 + }, + { + "epoch": 0.19735464839661243, + "grad_norm": 1.8297967910766602, + "learning_rate": 4.534711786471981e-05, + "loss": 5.5102, + "step": 33184 + }, + { + "epoch": 0.19736059567989342, + "grad_norm": 2.669644594192505, + "learning_rate": 4.534684646458964e-05, + "loss": 5.0929, + "step": 33185 + }, + { + "epoch": 0.19736654296317444, + "grad_norm": 2.3194940090179443, + "learning_rate": 4.53465750573566e-05, + "loss": 5.1063, + "step": 33186 + }, + { + "epoch": 0.19737249024645542, + "grad_norm": 1.6241377592086792, + "learning_rate": 4.534630364302077e-05, + "loss": 4.8875, + "step": 33187 + }, + { + "epoch": 0.1973784375297364, + "grad_norm": 1.7001700401306152, + "learning_rate": 4.534603222158226e-05, + "loss": 5.2486, + "step": 33188 + }, + { + "epoch": 0.19738438481301743, + "grad_norm": 1.9493141174316406, + "learning_rate": 4.5345760793041156e-05, + "loss": 4.9473, + "step": 33189 + }, + { + "epoch": 0.19739033209629842, + "grad_norm": 1.5743041038513184, + "learning_rate": 4.534548935739756e-05, + "loss": 4.6756, + "step": 33190 + }, + { + "epoch": 0.1973962793795794, + "grad_norm": 1.6579638719558716, + "learning_rate": 4.5345217914651575e-05, + "loss": 5.2041, + "step": 33191 + }, + { + "epoch": 0.1974022266628604, + "grad_norm": 1.9961597919464111, + "learning_rate": 4.5344946464803264e-05, + "loss": 3.488, + "step": 33192 + }, + { + "epoch": 0.1974081739461414, + "grad_norm": 2.9726107120513916, + "learning_rate": 4.534467500785275e-05, + "loss": 3.443, + "step": 33193 + }, + { + "epoch": 0.1974141212294224, + "grad_norm": 2.6377058029174805, + "learning_rate": 4.5344403543800127e-05, + "loss": 3.6516, + "step": 33194 + }, + { + "epoch": 0.19742006851270338, + "grad_norm": 2.2696948051452637, + "learning_rate": 4.534413207264548e-05, + "loss": 5.2825, + "step": 33195 + }, + { + "epoch": 0.1974260157959844, + "grad_norm": 3.0145084857940674, + "learning_rate": 4.5343860594388906e-05, + "loss": 4.9292, + "step": 33196 + }, + { + "epoch": 0.1974319630792654, + "grad_norm": 1.9451453685760498, + "learning_rate": 4.53435891090305e-05, + "loss": 4.6508, + "step": 33197 + }, + { + "epoch": 0.19743791036254638, + "grad_norm": 1.8160196542739868, + "learning_rate": 4.5343317616570356e-05, + "loss": 4.9695, + "step": 33198 + }, + { + "epoch": 0.1974438576458274, + "grad_norm": 1.522060513496399, + "learning_rate": 4.534304611700858e-05, + "loss": 5.113, + "step": 33199 + }, + { + "epoch": 0.19744980492910838, + "grad_norm": 1.8439273834228516, + "learning_rate": 4.534277461034524e-05, + "loss": 5.3172, + "step": 33200 + }, + { + "epoch": 0.19745575221238937, + "grad_norm": 1.535640001296997, + "learning_rate": 4.5342503096580455e-05, + "loss": 5.1527, + "step": 33201 + }, + { + "epoch": 0.19746169949567038, + "grad_norm": 1.8293157815933228, + "learning_rate": 4.5342231575714314e-05, + "loss": 4.6291, + "step": 33202 + }, + { + "epoch": 0.19746764677895137, + "grad_norm": 1.7495836019515991, + "learning_rate": 4.534196004774691e-05, + "loss": 4.3015, + "step": 33203 + }, + { + "epoch": 0.19747359406223236, + "grad_norm": 1.6512584686279297, + "learning_rate": 4.534168851267834e-05, + "loss": 4.379, + "step": 33204 + }, + { + "epoch": 0.19747954134551338, + "grad_norm": 1.8381824493408203, + "learning_rate": 4.534141697050869e-05, + "loss": 4.3344, + "step": 33205 + }, + { + "epoch": 0.19748548862879436, + "grad_norm": 1.7134857177734375, + "learning_rate": 4.5341145421238065e-05, + "loss": 4.2597, + "step": 33206 + }, + { + "epoch": 0.19749143591207535, + "grad_norm": 1.6309324502944946, + "learning_rate": 4.534087386486656e-05, + "loss": 4.4719, + "step": 33207 + }, + { + "epoch": 0.19749738319535637, + "grad_norm": 1.8172357082366943, + "learning_rate": 4.534060230139426e-05, + "loss": 4.1611, + "step": 33208 + }, + { + "epoch": 0.19750333047863736, + "grad_norm": 1.8709135055541992, + "learning_rate": 4.5340330730821266e-05, + "loss": 4.0607, + "step": 33209 + }, + { + "epoch": 0.19750927776191834, + "grad_norm": 1.7528961896896362, + "learning_rate": 4.534005915314768e-05, + "loss": 4.1092, + "step": 33210 + }, + { + "epoch": 0.19751522504519936, + "grad_norm": 1.7320555448532104, + "learning_rate": 4.533978756837358e-05, + "loss": 4.0203, + "step": 33211 + }, + { + "epoch": 0.19752117232848035, + "grad_norm": 1.8663568496704102, + "learning_rate": 4.533951597649908e-05, + "loss": 4.0895, + "step": 33212 + }, + { + "epoch": 0.19752711961176134, + "grad_norm": 1.8251363039016724, + "learning_rate": 4.533924437752426e-05, + "loss": 4.1971, + "step": 33213 + }, + { + "epoch": 0.19753306689504235, + "grad_norm": 1.9213861227035522, + "learning_rate": 4.533897277144922e-05, + "loss": 5.2216, + "step": 33214 + }, + { + "epoch": 0.19753901417832334, + "grad_norm": 1.9868927001953125, + "learning_rate": 4.533870115827405e-05, + "loss": 5.0257, + "step": 33215 + }, + { + "epoch": 0.19754496146160433, + "grad_norm": 1.8457907438278198, + "learning_rate": 4.5338429537998864e-05, + "loss": 4.9828, + "step": 33216 + }, + { + "epoch": 0.19755090874488535, + "grad_norm": 1.5566126108169556, + "learning_rate": 4.5338157910623724e-05, + "loss": 5.0219, + "step": 33217 + }, + { + "epoch": 0.19755685602816633, + "grad_norm": 1.585070252418518, + "learning_rate": 4.533788627614875e-05, + "loss": 4.8923, + "step": 33218 + }, + { + "epoch": 0.19756280331144732, + "grad_norm": 1.589237928390503, + "learning_rate": 4.5337614634574034e-05, + "loss": 4.973, + "step": 33219 + }, + { + "epoch": 0.19756875059472834, + "grad_norm": 1.6661782264709473, + "learning_rate": 4.533734298589967e-05, + "loss": 5.3401, + "step": 33220 + }, + { + "epoch": 0.19757469787800933, + "grad_norm": 1.6391420364379883, + "learning_rate": 4.533707133012574e-05, + "loss": 5.4286, + "step": 33221 + }, + { + "epoch": 0.1975806451612903, + "grad_norm": 1.8194524049758911, + "learning_rate": 4.533679966725235e-05, + "loss": 5.2341, + "step": 33222 + }, + { + "epoch": 0.19758659244457133, + "grad_norm": 1.606191635131836, + "learning_rate": 4.53365279972796e-05, + "loss": 5.0454, + "step": 33223 + }, + { + "epoch": 0.19759253972785232, + "grad_norm": 1.6205066442489624, + "learning_rate": 4.533625632020757e-05, + "loss": 4.772, + "step": 33224 + }, + { + "epoch": 0.1975984870111333, + "grad_norm": 1.5267688035964966, + "learning_rate": 4.533598463603637e-05, + "loss": 5.0322, + "step": 33225 + }, + { + "epoch": 0.19760443429441432, + "grad_norm": 1.5950186252593994, + "learning_rate": 4.533571294476608e-05, + "loss": 5.1335, + "step": 33226 + }, + { + "epoch": 0.1976103815776953, + "grad_norm": 1.5251303911209106, + "learning_rate": 4.5335441246396814e-05, + "loss": 5.2293, + "step": 33227 + }, + { + "epoch": 0.1976163288609763, + "grad_norm": 1.4677468538284302, + "learning_rate": 4.533516954092865e-05, + "loss": 5.0307, + "step": 33228 + }, + { + "epoch": 0.19762227614425731, + "grad_norm": 1.7119927406311035, + "learning_rate": 4.533489782836169e-05, + "loss": 5.0289, + "step": 33229 + }, + { + "epoch": 0.1976282234275383, + "grad_norm": 1.698115587234497, + "learning_rate": 4.533462610869602e-05, + "loss": 4.7599, + "step": 33230 + }, + { + "epoch": 0.1976341707108193, + "grad_norm": 1.7038428783416748, + "learning_rate": 4.5334354381931755e-05, + "loss": 5.3396, + "step": 33231 + }, + { + "epoch": 0.1976401179941003, + "grad_norm": 2.631863594055176, + "learning_rate": 4.533408264806897e-05, + "loss": 4.3216, + "step": 33232 + }, + { + "epoch": 0.1976460652773813, + "grad_norm": 1.6633570194244385, + "learning_rate": 4.533381090710776e-05, + "loss": 5.5176, + "step": 33233 + }, + { + "epoch": 0.19765201256066228, + "grad_norm": 1.5169641971588135, + "learning_rate": 4.533353915904823e-05, + "loss": 5.5365, + "step": 33234 + }, + { + "epoch": 0.1976579598439433, + "grad_norm": 1.8276102542877197, + "learning_rate": 4.533326740389048e-05, + "loss": 5.1131, + "step": 33235 + }, + { + "epoch": 0.1976639071272243, + "grad_norm": 1.5195908546447754, + "learning_rate": 4.533299564163459e-05, + "loss": 5.0087, + "step": 33236 + }, + { + "epoch": 0.19766985441050527, + "grad_norm": 1.7778794765472412, + "learning_rate": 4.533272387228066e-05, + "loss": 4.638, + "step": 33237 + }, + { + "epoch": 0.1976758016937863, + "grad_norm": 1.6670910120010376, + "learning_rate": 4.533245209582879e-05, + "loss": 5.7267, + "step": 33238 + }, + { + "epoch": 0.19768174897706728, + "grad_norm": 1.620630145072937, + "learning_rate": 4.533218031227907e-05, + "loss": 5.6041, + "step": 33239 + }, + { + "epoch": 0.19768769626034827, + "grad_norm": 1.9409407377243042, + "learning_rate": 4.5331908521631594e-05, + "loss": 5.3162, + "step": 33240 + }, + { + "epoch": 0.19769364354362928, + "grad_norm": 1.670377492904663, + "learning_rate": 4.533163672388646e-05, + "loss": 4.9583, + "step": 33241 + }, + { + "epoch": 0.19769959082691027, + "grad_norm": 1.6396856307983398, + "learning_rate": 4.5331364919043764e-05, + "loss": 5.4872, + "step": 33242 + }, + { + "epoch": 0.19770553811019126, + "grad_norm": 1.6345281600952148, + "learning_rate": 4.5331093107103594e-05, + "loss": 5.1595, + "step": 33243 + }, + { + "epoch": 0.19771148539347227, + "grad_norm": 1.724963903427124, + "learning_rate": 4.533082128806605e-05, + "loss": 4.845, + "step": 33244 + }, + { + "epoch": 0.19771743267675326, + "grad_norm": 1.480117917060852, + "learning_rate": 4.5330549461931223e-05, + "loss": 5.2803, + "step": 33245 + }, + { + "epoch": 0.19772337996003425, + "grad_norm": 1.884554386138916, + "learning_rate": 4.533027762869923e-05, + "loss": 5.307, + "step": 33246 + }, + { + "epoch": 0.19772932724331527, + "grad_norm": 1.9123550653457642, + "learning_rate": 4.5330005788370124e-05, + "loss": 4.961, + "step": 33247 + }, + { + "epoch": 0.19773527452659626, + "grad_norm": 1.6469107866287231, + "learning_rate": 4.532973394094403e-05, + "loss": 4.7209, + "step": 33248 + }, + { + "epoch": 0.19774122180987724, + "grad_norm": 1.5761797428131104, + "learning_rate": 4.532946208642104e-05, + "loss": 5.3125, + "step": 33249 + }, + { + "epoch": 0.19774716909315823, + "grad_norm": 1.590433120727539, + "learning_rate": 4.532919022480124e-05, + "loss": 5.0676, + "step": 33250 + }, + { + "epoch": 0.19775311637643925, + "grad_norm": 1.5116517543792725, + "learning_rate": 4.532891835608474e-05, + "loss": 5.055, + "step": 33251 + }, + { + "epoch": 0.19775906365972024, + "grad_norm": 1.588983178138733, + "learning_rate": 4.5328646480271616e-05, + "loss": 5.0159, + "step": 33252 + }, + { + "epoch": 0.19776501094300122, + "grad_norm": 1.7448828220367432, + "learning_rate": 4.532837459736197e-05, + "loss": 5.2739, + "step": 33253 + }, + { + "epoch": 0.19777095822628224, + "grad_norm": 1.6734801530838013, + "learning_rate": 4.53281027073559e-05, + "loss": 5.0657, + "step": 33254 + }, + { + "epoch": 0.19777690550956323, + "grad_norm": 1.8620131015777588, + "learning_rate": 4.53278308102535e-05, + "loss": 5.2255, + "step": 33255 + }, + { + "epoch": 0.19778285279284422, + "grad_norm": 1.940711259841919, + "learning_rate": 4.532755890605487e-05, + "loss": 5.1548, + "step": 33256 + }, + { + "epoch": 0.19778880007612523, + "grad_norm": 1.5663199424743652, + "learning_rate": 4.532728699476009e-05, + "loss": 5.0682, + "step": 33257 + }, + { + "epoch": 0.19779474735940622, + "grad_norm": 1.5207146406173706, + "learning_rate": 4.532701507636927e-05, + "loss": 5.418, + "step": 33258 + }, + { + "epoch": 0.1978006946426872, + "grad_norm": 1.4254070520401, + "learning_rate": 4.53267431508825e-05, + "loss": 5.4406, + "step": 33259 + }, + { + "epoch": 0.19780664192596822, + "grad_norm": 1.7687779664993286, + "learning_rate": 4.5326471218299873e-05, + "loss": 4.6602, + "step": 33260 + }, + { + "epoch": 0.1978125892092492, + "grad_norm": 1.4113342761993408, + "learning_rate": 4.532619927862148e-05, + "loss": 4.7432, + "step": 33261 + }, + { + "epoch": 0.1978185364925302, + "grad_norm": 1.816176176071167, + "learning_rate": 4.5325927331847424e-05, + "loss": 5.2539, + "step": 33262 + }, + { + "epoch": 0.19782448377581122, + "grad_norm": 1.5218030214309692, + "learning_rate": 4.5325655377977796e-05, + "loss": 4.8937, + "step": 33263 + }, + { + "epoch": 0.1978304310590922, + "grad_norm": 1.8468036651611328, + "learning_rate": 4.532538341701269e-05, + "loss": 5.0516, + "step": 33264 + }, + { + "epoch": 0.1978363783423732, + "grad_norm": 1.8353204727172852, + "learning_rate": 4.532511144895221e-05, + "loss": 4.9425, + "step": 33265 + }, + { + "epoch": 0.1978423256256542, + "grad_norm": 1.6375226974487305, + "learning_rate": 4.532483947379644e-05, + "loss": 4.7085, + "step": 33266 + }, + { + "epoch": 0.1978482729089352, + "grad_norm": 1.9428247213363647, + "learning_rate": 4.532456749154548e-05, + "loss": 4.844, + "step": 33267 + }, + { + "epoch": 0.19785422019221618, + "grad_norm": 3.1580700874328613, + "learning_rate": 4.532429550219941e-05, + "loss": 2.9092, + "step": 33268 + }, + { + "epoch": 0.1978601674754972, + "grad_norm": 1.4939215183258057, + "learning_rate": 4.532402350575835e-05, + "loss": 5.4607, + "step": 33269 + }, + { + "epoch": 0.1978661147587782, + "grad_norm": 1.400402307510376, + "learning_rate": 4.532375150222239e-05, + "loss": 5.3641, + "step": 33270 + }, + { + "epoch": 0.19787206204205918, + "grad_norm": 1.6665794849395752, + "learning_rate": 4.532347949159161e-05, + "loss": 4.5916, + "step": 33271 + }, + { + "epoch": 0.1978780093253402, + "grad_norm": 1.910585641860962, + "learning_rate": 4.532320747386612e-05, + "loss": 5.1528, + "step": 33272 + }, + { + "epoch": 0.19788395660862118, + "grad_norm": 1.7386438846588135, + "learning_rate": 4.5322935449045994e-05, + "loss": 4.8547, + "step": 33273 + }, + { + "epoch": 0.19788990389190217, + "grad_norm": 1.7548339366912842, + "learning_rate": 4.532266341713135e-05, + "loss": 4.9357, + "step": 33274 + }, + { + "epoch": 0.19789585117518319, + "grad_norm": 1.6126796007156372, + "learning_rate": 4.532239137812228e-05, + "loss": 4.8952, + "step": 33275 + }, + { + "epoch": 0.19790179845846417, + "grad_norm": 1.598954677581787, + "learning_rate": 4.5322119332018866e-05, + "loss": 4.6828, + "step": 33276 + }, + { + "epoch": 0.19790774574174516, + "grad_norm": 1.8405553102493286, + "learning_rate": 4.532184727882121e-05, + "loss": 4.5404, + "step": 33277 + }, + { + "epoch": 0.19791369302502618, + "grad_norm": 2.304363965988159, + "learning_rate": 4.5321575218529406e-05, + "loss": 4.3031, + "step": 33278 + }, + { + "epoch": 0.19791964030830717, + "grad_norm": 2.450482130050659, + "learning_rate": 4.532130315114355e-05, + "loss": 4.1944, + "step": 33279 + }, + { + "epoch": 0.19792558759158815, + "grad_norm": 2.3713395595550537, + "learning_rate": 4.532103107666374e-05, + "loss": 4.3168, + "step": 33280 + }, + { + "epoch": 0.19793153487486917, + "grad_norm": 2.1909902095794678, + "learning_rate": 4.5320758995090064e-05, + "loss": 4.4707, + "step": 33281 + }, + { + "epoch": 0.19793748215815016, + "grad_norm": 1.8460273742675781, + "learning_rate": 4.5320486906422624e-05, + "loss": 4.7456, + "step": 33282 + }, + { + "epoch": 0.19794342944143115, + "grad_norm": 1.8075324296951294, + "learning_rate": 4.5320214810661514e-05, + "loss": 4.5133, + "step": 33283 + }, + { + "epoch": 0.19794937672471216, + "grad_norm": 1.9076029062271118, + "learning_rate": 4.531994270780683e-05, + "loss": 4.7133, + "step": 33284 + }, + { + "epoch": 0.19795532400799315, + "grad_norm": 1.7290363311767578, + "learning_rate": 4.531967059785865e-05, + "loss": 4.7903, + "step": 33285 + }, + { + "epoch": 0.19796127129127414, + "grad_norm": 2.2225213050842285, + "learning_rate": 4.53193984808171e-05, + "loss": 4.896, + "step": 33286 + }, + { + "epoch": 0.19796721857455515, + "grad_norm": 1.7413650751113892, + "learning_rate": 4.531912635668224e-05, + "loss": 5.7902, + "step": 33287 + }, + { + "epoch": 0.19797316585783614, + "grad_norm": 1.6677063703536987, + "learning_rate": 4.53188542254542e-05, + "loss": 5.6381, + "step": 33288 + }, + { + "epoch": 0.19797911314111713, + "grad_norm": 1.64964759349823, + "learning_rate": 4.531858208713305e-05, + "loss": 4.8681, + "step": 33289 + }, + { + "epoch": 0.19798506042439815, + "grad_norm": 1.789642333984375, + "learning_rate": 4.531830994171889e-05, + "loss": 4.8042, + "step": 33290 + }, + { + "epoch": 0.19799100770767913, + "grad_norm": 1.920061707496643, + "learning_rate": 4.531803778921182e-05, + "loss": 4.7461, + "step": 33291 + }, + { + "epoch": 0.19799695499096012, + "grad_norm": 1.8320075273513794, + "learning_rate": 4.531776562961194e-05, + "loss": 4.6238, + "step": 33292 + }, + { + "epoch": 0.19800290227424114, + "grad_norm": 1.7324212789535522, + "learning_rate": 4.531749346291933e-05, + "loss": 4.5368, + "step": 33293 + }, + { + "epoch": 0.19800884955752213, + "grad_norm": 2.327019453048706, + "learning_rate": 4.531722128913409e-05, + "loss": 4.1915, + "step": 33294 + }, + { + "epoch": 0.19801479684080311, + "grad_norm": 2.1580569744110107, + "learning_rate": 4.531694910825632e-05, + "loss": 4.2753, + "step": 33295 + }, + { + "epoch": 0.19802074412408413, + "grad_norm": 1.9125664234161377, + "learning_rate": 4.5316676920286125e-05, + "loss": 4.9737, + "step": 33296 + }, + { + "epoch": 0.19802669140736512, + "grad_norm": 2.3731091022491455, + "learning_rate": 4.5316404725223575e-05, + "loss": 5.0, + "step": 33297 + }, + { + "epoch": 0.1980326386906461, + "grad_norm": 2.2052502632141113, + "learning_rate": 4.531613252306879e-05, + "loss": 4.9842, + "step": 33298 + }, + { + "epoch": 0.19803858597392712, + "grad_norm": 1.8605939149856567, + "learning_rate": 4.5315860313821846e-05, + "loss": 4.9534, + "step": 33299 + }, + { + "epoch": 0.1980445332572081, + "grad_norm": 1.9243404865264893, + "learning_rate": 4.531558809748284e-05, + "loss": 4.7275, + "step": 33300 + }, + { + "epoch": 0.1980504805404891, + "grad_norm": 1.8417762517929077, + "learning_rate": 4.531531587405188e-05, + "loss": 4.5768, + "step": 33301 + }, + { + "epoch": 0.19805642782377011, + "grad_norm": 2.7929775714874268, + "learning_rate": 4.531504364352904e-05, + "loss": 3.6382, + "step": 33302 + }, + { + "epoch": 0.1980623751070511, + "grad_norm": 2.665148973464966, + "learning_rate": 4.531477140591444e-05, + "loss": 3.6138, + "step": 33303 + }, + { + "epoch": 0.1980683223903321, + "grad_norm": 2.0774621963500977, + "learning_rate": 4.531449916120816e-05, + "loss": 3.5553, + "step": 33304 + }, + { + "epoch": 0.1980742696736131, + "grad_norm": 1.8317457437515259, + "learning_rate": 4.53142269094103e-05, + "loss": 4.8087, + "step": 33305 + }, + { + "epoch": 0.1980802169568941, + "grad_norm": 1.7544660568237305, + "learning_rate": 4.531395465052095e-05, + "loss": 6.1719, + "step": 33306 + }, + { + "epoch": 0.19808616424017508, + "grad_norm": 3.286212205886841, + "learning_rate": 4.5313682384540216e-05, + "loss": 3.6332, + "step": 33307 + }, + { + "epoch": 0.19809211152345607, + "grad_norm": 3.265216112136841, + "learning_rate": 4.531341011146818e-05, + "loss": 2.1208, + "step": 33308 + }, + { + "epoch": 0.1980980588067371, + "grad_norm": 2.458509683609009, + "learning_rate": 4.531313783130494e-05, + "loss": 3.4689, + "step": 33309 + }, + { + "epoch": 0.19810400609001808, + "grad_norm": 2.342417001724243, + "learning_rate": 4.53128655440506e-05, + "loss": 3.4864, + "step": 33310 + }, + { + "epoch": 0.19810995337329906, + "grad_norm": 2.6172118186950684, + "learning_rate": 4.5312593249705236e-05, + "loss": 3.5505, + "step": 33311 + }, + { + "epoch": 0.19811590065658008, + "grad_norm": 2.6422629356384277, + "learning_rate": 4.5312320948268974e-05, + "loss": 3.7501, + "step": 33312 + }, + { + "epoch": 0.19812184793986107, + "grad_norm": 2.1356923580169678, + "learning_rate": 4.5312048639741875e-05, + "loss": 4.1028, + "step": 33313 + }, + { + "epoch": 0.19812779522314206, + "grad_norm": 1.9619426727294922, + "learning_rate": 4.531177632412406e-05, + "loss": 4.461, + "step": 33314 + }, + { + "epoch": 0.19813374250642307, + "grad_norm": 2.336240768432617, + "learning_rate": 4.531150400141561e-05, + "loss": 3.2982, + "step": 33315 + }, + { + "epoch": 0.19813968978970406, + "grad_norm": 2.5709304809570312, + "learning_rate": 4.531123167161662e-05, + "loss": 3.0417, + "step": 33316 + }, + { + "epoch": 0.19814563707298505, + "grad_norm": 3.9337923526763916, + "learning_rate": 4.531095933472719e-05, + "loss": 2.95, + "step": 33317 + }, + { + "epoch": 0.19815158435626606, + "grad_norm": 2.6982581615448, + "learning_rate": 4.5310686990747416e-05, + "loss": 3.5422, + "step": 33318 + }, + { + "epoch": 0.19815753163954705, + "grad_norm": 2.1642324924468994, + "learning_rate": 4.531041463967738e-05, + "loss": 4.9477, + "step": 33319 + }, + { + "epoch": 0.19816347892282804, + "grad_norm": 1.937697410583496, + "learning_rate": 4.53101422815172e-05, + "loss": 4.7786, + "step": 33320 + }, + { + "epoch": 0.19816942620610906, + "grad_norm": 1.599066138267517, + "learning_rate": 4.530986991626696e-05, + "loss": 5.2083, + "step": 33321 + }, + { + "epoch": 0.19817537348939004, + "grad_norm": 1.5987446308135986, + "learning_rate": 4.530959754392675e-05, + "loss": 5.1838, + "step": 33322 + }, + { + "epoch": 0.19818132077267103, + "grad_norm": 1.5494792461395264, + "learning_rate": 4.530932516449668e-05, + "loss": 5.5557, + "step": 33323 + }, + { + "epoch": 0.19818726805595205, + "grad_norm": 1.662477731704712, + "learning_rate": 4.530905277797682e-05, + "loss": 5.4674, + "step": 33324 + }, + { + "epoch": 0.19819321533923304, + "grad_norm": 1.4203627109527588, + "learning_rate": 4.530878038436729e-05, + "loss": 5.7035, + "step": 33325 + }, + { + "epoch": 0.19819916262251402, + "grad_norm": 1.727128267288208, + "learning_rate": 4.5308507983668165e-05, + "loss": 5.0072, + "step": 33326 + }, + { + "epoch": 0.19820510990579504, + "grad_norm": 1.7568631172180176, + "learning_rate": 4.530823557587955e-05, + "loss": 4.7131, + "step": 33327 + }, + { + "epoch": 0.19821105718907603, + "grad_norm": 1.8544484376907349, + "learning_rate": 4.530796316100155e-05, + "loss": 4.6808, + "step": 33328 + }, + { + "epoch": 0.19821700447235702, + "grad_norm": 1.6898458003997803, + "learning_rate": 4.530769073903424e-05, + "loss": 4.8085, + "step": 33329 + }, + { + "epoch": 0.19822295175563803, + "grad_norm": 2.1594486236572266, + "learning_rate": 4.530741830997773e-05, + "loss": 5.1586, + "step": 33330 + }, + { + "epoch": 0.19822889903891902, + "grad_norm": 1.6536179780960083, + "learning_rate": 4.5307145873832116e-05, + "loss": 5.4879, + "step": 33331 + }, + { + "epoch": 0.1982348463222, + "grad_norm": 1.6635406017303467, + "learning_rate": 4.530687343059748e-05, + "loss": 5.5663, + "step": 33332 + }, + { + "epoch": 0.19824079360548102, + "grad_norm": 1.500622272491455, + "learning_rate": 4.530660098027392e-05, + "loss": 5.3621, + "step": 33333 + }, + { + "epoch": 0.198246740888762, + "grad_norm": 1.6053495407104492, + "learning_rate": 4.530632852286154e-05, + "loss": 4.6813, + "step": 33334 + }, + { + "epoch": 0.198252688172043, + "grad_norm": 2.006056308746338, + "learning_rate": 4.5306056058360424e-05, + "loss": 3.9102, + "step": 33335 + }, + { + "epoch": 0.19825863545532402, + "grad_norm": 1.6927076578140259, + "learning_rate": 4.5305783586770686e-05, + "loss": 5.4046, + "step": 33336 + }, + { + "epoch": 0.198264582738605, + "grad_norm": 1.7682117223739624, + "learning_rate": 4.53055111080924e-05, + "loss": 5.3556, + "step": 33337 + }, + { + "epoch": 0.198270530021886, + "grad_norm": 1.7829780578613281, + "learning_rate": 4.5305238622325676e-05, + "loss": 5.2791, + "step": 33338 + }, + { + "epoch": 0.198276477305167, + "grad_norm": 1.6257526874542236, + "learning_rate": 4.53049661294706e-05, + "loss": 5.2301, + "step": 33339 + }, + { + "epoch": 0.198282424588448, + "grad_norm": 1.6963531970977783, + "learning_rate": 4.530469362952727e-05, + "loss": 5.1795, + "step": 33340 + }, + { + "epoch": 0.19828837187172899, + "grad_norm": 1.9438083171844482, + "learning_rate": 4.5304421122495774e-05, + "loss": 5.0384, + "step": 33341 + }, + { + "epoch": 0.19829431915501, + "grad_norm": 1.8972619771957397, + "learning_rate": 4.530414860837623e-05, + "loss": 5.6679, + "step": 33342 + }, + { + "epoch": 0.198300266438291, + "grad_norm": 1.9090536832809448, + "learning_rate": 4.530387608716871e-05, + "loss": 5.5994, + "step": 33343 + }, + { + "epoch": 0.19830621372157198, + "grad_norm": 1.7110793590545654, + "learning_rate": 4.530360355887331e-05, + "loss": 5.4753, + "step": 33344 + }, + { + "epoch": 0.198312161004853, + "grad_norm": 1.8114757537841797, + "learning_rate": 4.5303331023490136e-05, + "loss": 5.456, + "step": 33345 + }, + { + "epoch": 0.19831810828813398, + "grad_norm": 1.671255111694336, + "learning_rate": 4.530305848101928e-05, + "loss": 5.4312, + "step": 33346 + }, + { + "epoch": 0.19832405557141497, + "grad_norm": 3.440305471420288, + "learning_rate": 4.5302785931460836e-05, + "loss": 2.4397, + "step": 33347 + }, + { + "epoch": 0.19833000285469599, + "grad_norm": 3.13948655128479, + "learning_rate": 4.53025133748149e-05, + "loss": 2.306, + "step": 33348 + }, + { + "epoch": 0.19833595013797697, + "grad_norm": 2.2596566677093506, + "learning_rate": 4.5302240811081566e-05, + "loss": 3.5614, + "step": 33349 + }, + { + "epoch": 0.19834189742125796, + "grad_norm": 1.682121753692627, + "learning_rate": 4.530196824026093e-05, + "loss": 4.9683, + "step": 33350 + }, + { + "epoch": 0.19834784470453898, + "grad_norm": 1.9128234386444092, + "learning_rate": 4.530169566235308e-05, + "loss": 5.4938, + "step": 33351 + }, + { + "epoch": 0.19835379198781997, + "grad_norm": 2.0970449447631836, + "learning_rate": 4.530142307735813e-05, + "loss": 5.2809, + "step": 33352 + }, + { + "epoch": 0.19835973927110095, + "grad_norm": 1.815956711769104, + "learning_rate": 4.5301150485276156e-05, + "loss": 4.8106, + "step": 33353 + }, + { + "epoch": 0.19836568655438197, + "grad_norm": 2.473682403564453, + "learning_rate": 4.5300877886107264e-05, + "loss": 3.9659, + "step": 33354 + }, + { + "epoch": 0.19837163383766296, + "grad_norm": 4.671222686767578, + "learning_rate": 4.530060527985154e-05, + "loss": 2.8541, + "step": 33355 + }, + { + "epoch": 0.19837758112094395, + "grad_norm": 1.7974921464920044, + "learning_rate": 4.530033266650908e-05, + "loss": 5.2783, + "step": 33356 + }, + { + "epoch": 0.19838352840422496, + "grad_norm": 4.036770820617676, + "learning_rate": 4.5300060046079996e-05, + "loss": 3.7766, + "step": 33357 + }, + { + "epoch": 0.19838947568750595, + "grad_norm": 3.522930860519409, + "learning_rate": 4.529978741856436e-05, + "loss": 2.3974, + "step": 33358 + }, + { + "epoch": 0.19839542297078694, + "grad_norm": 2.413550615310669, + "learning_rate": 4.5299514783962285e-05, + "loss": 3.1974, + "step": 33359 + }, + { + "epoch": 0.19840137025406795, + "grad_norm": 1.75148606300354, + "learning_rate": 4.529924214227386e-05, + "loss": 4.9708, + "step": 33360 + }, + { + "epoch": 0.19840731753734894, + "grad_norm": 1.5809080600738525, + "learning_rate": 4.5298969493499165e-05, + "loss": 4.8973, + "step": 33361 + }, + { + "epoch": 0.19841326482062993, + "grad_norm": 1.7478617429733276, + "learning_rate": 4.5298696837638325e-05, + "loss": 5.2721, + "step": 33362 + }, + { + "epoch": 0.19841921210391095, + "grad_norm": 1.6357113122940063, + "learning_rate": 4.5298424174691417e-05, + "loss": 5.1701, + "step": 33363 + }, + { + "epoch": 0.19842515938719194, + "grad_norm": 1.5457570552825928, + "learning_rate": 4.5298151504658536e-05, + "loss": 5.1177, + "step": 33364 + }, + { + "epoch": 0.19843110667047292, + "grad_norm": 2.2305829524993896, + "learning_rate": 4.5297878827539784e-05, + "loss": 4.1489, + "step": 33365 + }, + { + "epoch": 0.19843705395375394, + "grad_norm": 3.119000196456909, + "learning_rate": 4.529760614333525e-05, + "loss": 3.8102, + "step": 33366 + }, + { + "epoch": 0.19844300123703493, + "grad_norm": 2.1986236572265625, + "learning_rate": 4.5297333452045025e-05, + "loss": 4.9004, + "step": 33367 + }, + { + "epoch": 0.19844894852031592, + "grad_norm": 1.6912589073181152, + "learning_rate": 4.5297060753669216e-05, + "loss": 5.6423, + "step": 33368 + }, + { + "epoch": 0.1984548958035969, + "grad_norm": 1.681021809577942, + "learning_rate": 4.5296788048207915e-05, + "loss": 4.3601, + "step": 33369 + }, + { + "epoch": 0.19846084308687792, + "grad_norm": 1.6064156293869019, + "learning_rate": 4.529651533566122e-05, + "loss": 4.565, + "step": 33370 + }, + { + "epoch": 0.1984667903701589, + "grad_norm": 1.5751850605010986, + "learning_rate": 4.5296242616029204e-05, + "loss": 5.108, + "step": 33371 + }, + { + "epoch": 0.1984727376534399, + "grad_norm": 1.5508745908737183, + "learning_rate": 4.5295969889312e-05, + "loss": 5.8937, + "step": 33372 + }, + { + "epoch": 0.1984786849367209, + "grad_norm": 1.5728036165237427, + "learning_rate": 4.5295697155509665e-05, + "loss": 5.6068, + "step": 33373 + }, + { + "epoch": 0.1984846322200019, + "grad_norm": 1.8891894817352295, + "learning_rate": 4.5295424414622315e-05, + "loss": 4.8044, + "step": 33374 + }, + { + "epoch": 0.1984905795032829, + "grad_norm": 2.7243154048919678, + "learning_rate": 4.529515166665005e-05, + "loss": 3.5559, + "step": 33375 + }, + { + "epoch": 0.1984965267865639, + "grad_norm": 2.6664438247680664, + "learning_rate": 4.529487891159295e-05, + "loss": 3.6558, + "step": 33376 + }, + { + "epoch": 0.1985024740698449, + "grad_norm": 1.6460233926773071, + "learning_rate": 4.5294606149451125e-05, + "loss": 4.8593, + "step": 33377 + }, + { + "epoch": 0.19850842135312588, + "grad_norm": 1.710748553276062, + "learning_rate": 4.5294333380224655e-05, + "loss": 5.2335, + "step": 33378 + }, + { + "epoch": 0.1985143686364069, + "grad_norm": 1.633082628250122, + "learning_rate": 4.529406060391365e-05, + "loss": 5.0631, + "step": 33379 + }, + { + "epoch": 0.19852031591968788, + "grad_norm": 1.5868422985076904, + "learning_rate": 4.529378782051819e-05, + "loss": 4.4778, + "step": 33380 + }, + { + "epoch": 0.19852626320296887, + "grad_norm": 1.3042185306549072, + "learning_rate": 4.529351503003838e-05, + "loss": 4.5756, + "step": 33381 + }, + { + "epoch": 0.1985322104862499, + "grad_norm": 1.6254199743270874, + "learning_rate": 4.529324223247432e-05, + "loss": 5.177, + "step": 33382 + }, + { + "epoch": 0.19853815776953088, + "grad_norm": 1.570239782333374, + "learning_rate": 4.529296942782609e-05, + "loss": 5.0367, + "step": 33383 + }, + { + "epoch": 0.19854410505281186, + "grad_norm": 1.8509953022003174, + "learning_rate": 4.52926966160938e-05, + "loss": 4.5187, + "step": 33384 + }, + { + "epoch": 0.19855005233609288, + "grad_norm": 1.6336568593978882, + "learning_rate": 4.529242379727754e-05, + "loss": 5.3582, + "step": 33385 + }, + { + "epoch": 0.19855599961937387, + "grad_norm": 1.587899923324585, + "learning_rate": 4.52921509713774e-05, + "loss": 4.8127, + "step": 33386 + }, + { + "epoch": 0.19856194690265486, + "grad_norm": 1.5488510131835938, + "learning_rate": 4.529187813839349e-05, + "loss": 5.5138, + "step": 33387 + }, + { + "epoch": 0.19856789418593587, + "grad_norm": 1.808288335800171, + "learning_rate": 4.5291605298325884e-05, + "loss": 4.5717, + "step": 33388 + }, + { + "epoch": 0.19857384146921686, + "grad_norm": 1.478675365447998, + "learning_rate": 4.5291332451174687e-05, + "loss": 4.53, + "step": 33389 + }, + { + "epoch": 0.19857978875249785, + "grad_norm": 1.9420822858810425, + "learning_rate": 4.5291059596940004e-05, + "loss": 4.5866, + "step": 33390 + }, + { + "epoch": 0.19858573603577886, + "grad_norm": 2.0074143409729004, + "learning_rate": 4.5290786735621916e-05, + "loss": 4.9823, + "step": 33391 + }, + { + "epoch": 0.19859168331905985, + "grad_norm": 1.8389657735824585, + "learning_rate": 4.529051386722053e-05, + "loss": 5.1662, + "step": 33392 + }, + { + "epoch": 0.19859763060234084, + "grad_norm": 1.6590776443481445, + "learning_rate": 4.5290240991735934e-05, + "loss": 5.5359, + "step": 33393 + }, + { + "epoch": 0.19860357788562186, + "grad_norm": 1.7295751571655273, + "learning_rate": 4.5289968109168216e-05, + "loss": 3.9299, + "step": 33394 + }, + { + "epoch": 0.19860952516890285, + "grad_norm": 1.7071540355682373, + "learning_rate": 4.5289695219517486e-05, + "loss": 3.8308, + "step": 33395 + }, + { + "epoch": 0.19861547245218383, + "grad_norm": 1.7689669132232666, + "learning_rate": 4.528942232278383e-05, + "loss": 3.9113, + "step": 33396 + }, + { + "epoch": 0.19862141973546485, + "grad_norm": 1.9830238819122314, + "learning_rate": 4.5289149418967345e-05, + "loss": 4.1391, + "step": 33397 + }, + { + "epoch": 0.19862736701874584, + "grad_norm": 2.3440747261047363, + "learning_rate": 4.5288876508068136e-05, + "loss": 3.2765, + "step": 33398 + }, + { + "epoch": 0.19863331430202683, + "grad_norm": 1.9929230213165283, + "learning_rate": 4.528860359008629e-05, + "loss": 4.2321, + "step": 33399 + }, + { + "epoch": 0.19863926158530784, + "grad_norm": 1.9815763235092163, + "learning_rate": 4.528833066502189e-05, + "loss": 4.7819, + "step": 33400 + }, + { + "epoch": 0.19864520886858883, + "grad_norm": 1.6043485403060913, + "learning_rate": 4.528805773287506e-05, + "loss": 5.1746, + "step": 33401 + }, + { + "epoch": 0.19865115615186982, + "grad_norm": 1.8365287780761719, + "learning_rate": 4.528778479364586e-05, + "loss": 4.8251, + "step": 33402 + }, + { + "epoch": 0.19865710343515083, + "grad_norm": 1.966765284538269, + "learning_rate": 4.5287511847334416e-05, + "loss": 5.223, + "step": 33403 + }, + { + "epoch": 0.19866305071843182, + "grad_norm": 1.7002321481704712, + "learning_rate": 4.528723889394081e-05, + "loss": 4.4082, + "step": 33404 + }, + { + "epoch": 0.1986689980017128, + "grad_norm": 2.144162893295288, + "learning_rate": 4.528696593346513e-05, + "loss": 4.077, + "step": 33405 + }, + { + "epoch": 0.19867494528499383, + "grad_norm": 1.9121687412261963, + "learning_rate": 4.528669296590749e-05, + "loss": 4.2574, + "step": 33406 + }, + { + "epoch": 0.19868089256827481, + "grad_norm": 1.817332148551941, + "learning_rate": 4.5286419991267966e-05, + "loss": 4.4668, + "step": 33407 + }, + { + "epoch": 0.1986868398515558, + "grad_norm": 2.071458578109741, + "learning_rate": 4.528614700954667e-05, + "loss": 5.1306, + "step": 33408 + }, + { + "epoch": 0.19869278713483682, + "grad_norm": 1.7303532361984253, + "learning_rate": 4.528587402074369e-05, + "loss": 4.905, + "step": 33409 + }, + { + "epoch": 0.1986987344181178, + "grad_norm": 1.7372905015945435, + "learning_rate": 4.528560102485912e-05, + "loss": 5.1694, + "step": 33410 + }, + { + "epoch": 0.1987046817013988, + "grad_norm": 1.7425367832183838, + "learning_rate": 4.528532802189306e-05, + "loss": 3.9595, + "step": 33411 + }, + { + "epoch": 0.1987106289846798, + "grad_norm": 1.6367287635803223, + "learning_rate": 4.528505501184559e-05, + "loss": 3.797, + "step": 33412 + }, + { + "epoch": 0.1987165762679608, + "grad_norm": 1.4426088333129883, + "learning_rate": 4.5284781994716826e-05, + "loss": 4.0362, + "step": 33413 + }, + { + "epoch": 0.1987225235512418, + "grad_norm": 1.4077881574630737, + "learning_rate": 4.528450897050685e-05, + "loss": 4.0144, + "step": 33414 + }, + { + "epoch": 0.1987284708345228, + "grad_norm": 1.437828540802002, + "learning_rate": 4.5284235939215765e-05, + "loss": 4.0895, + "step": 33415 + }, + { + "epoch": 0.1987344181178038, + "grad_norm": 1.3500796556472778, + "learning_rate": 4.5283962900843654e-05, + "loss": 4.1622, + "step": 33416 + }, + { + "epoch": 0.19874036540108478, + "grad_norm": 2.1117026805877686, + "learning_rate": 4.528368985539063e-05, + "loss": 5.0076, + "step": 33417 + }, + { + "epoch": 0.1987463126843658, + "grad_norm": 1.337552547454834, + "learning_rate": 4.528341680285678e-05, + "loss": 5.1327, + "step": 33418 + }, + { + "epoch": 0.19875225996764678, + "grad_norm": 2.3471126556396484, + "learning_rate": 4.5283143743242197e-05, + "loss": 3.321, + "step": 33419 + }, + { + "epoch": 0.19875820725092777, + "grad_norm": 2.268986940383911, + "learning_rate": 4.528287067654697e-05, + "loss": 3.1999, + "step": 33420 + }, + { + "epoch": 0.1987641545342088, + "grad_norm": 1.8402795791625977, + "learning_rate": 4.5282597602771215e-05, + "loss": 4.2179, + "step": 33421 + }, + { + "epoch": 0.19877010181748977, + "grad_norm": 1.7201100587844849, + "learning_rate": 4.528232452191501e-05, + "loss": 4.1152, + "step": 33422 + }, + { + "epoch": 0.19877604910077076, + "grad_norm": 1.4088517427444458, + "learning_rate": 4.528205143397846e-05, + "loss": 4.6504, + "step": 33423 + }, + { + "epoch": 0.19878199638405178, + "grad_norm": 1.7721384763717651, + "learning_rate": 4.5281778338961644e-05, + "loss": 5.1871, + "step": 33424 + }, + { + "epoch": 0.19878794366733277, + "grad_norm": 2.0416159629821777, + "learning_rate": 4.528150523686468e-05, + "loss": 4.2345, + "step": 33425 + }, + { + "epoch": 0.19879389095061376, + "grad_norm": 1.943342924118042, + "learning_rate": 4.528123212768764e-05, + "loss": 5.1177, + "step": 33426 + }, + { + "epoch": 0.19879983823389474, + "grad_norm": 1.8057464361190796, + "learning_rate": 4.528095901143063e-05, + "loss": 4.8638, + "step": 33427 + }, + { + "epoch": 0.19880578551717576, + "grad_norm": 1.9637550115585327, + "learning_rate": 4.5280685888093764e-05, + "loss": 4.5566, + "step": 33428 + }, + { + "epoch": 0.19881173280045675, + "grad_norm": 1.7107211351394653, + "learning_rate": 4.5280412757677104e-05, + "loss": 5.3038, + "step": 33429 + }, + { + "epoch": 0.19881768008373774, + "grad_norm": 1.9364093542099, + "learning_rate": 4.5280139620180773e-05, + "loss": 5.1847, + "step": 33430 + }, + { + "epoch": 0.19882362736701875, + "grad_norm": 1.9583579301834106, + "learning_rate": 4.5279866475604846e-05, + "loss": 5.1695, + "step": 33431 + }, + { + "epoch": 0.19882957465029974, + "grad_norm": 1.931999683380127, + "learning_rate": 4.527959332394943e-05, + "loss": 5.2474, + "step": 33432 + }, + { + "epoch": 0.19883552193358073, + "grad_norm": 1.7884893417358398, + "learning_rate": 4.5279320165214623e-05, + "loss": 5.1989, + "step": 33433 + }, + { + "epoch": 0.19884146921686174, + "grad_norm": 1.706418752670288, + "learning_rate": 4.527904699940051e-05, + "loss": 5.076, + "step": 33434 + }, + { + "epoch": 0.19884741650014273, + "grad_norm": 1.7451330423355103, + "learning_rate": 4.5278773826507195e-05, + "loss": 5.1754, + "step": 33435 + }, + { + "epoch": 0.19885336378342372, + "grad_norm": 1.9312299489974976, + "learning_rate": 4.5278500646534764e-05, + "loss": 4.4978, + "step": 33436 + }, + { + "epoch": 0.19885931106670474, + "grad_norm": 2.42375111579895, + "learning_rate": 4.527822745948332e-05, + "loss": 4.2296, + "step": 33437 + }, + { + "epoch": 0.19886525834998572, + "grad_norm": 1.8249690532684326, + "learning_rate": 4.5277954265352956e-05, + "loss": 4.9476, + "step": 33438 + }, + { + "epoch": 0.1988712056332667, + "grad_norm": 1.886839509010315, + "learning_rate": 4.527768106414377e-05, + "loss": 5.082, + "step": 33439 + }, + { + "epoch": 0.19887715291654773, + "grad_norm": 1.6707491874694824, + "learning_rate": 4.527740785585585e-05, + "loss": 4.887, + "step": 33440 + }, + { + "epoch": 0.19888310019982872, + "grad_norm": 1.8287665843963623, + "learning_rate": 4.5277134640489296e-05, + "loss": 5.2154, + "step": 33441 + }, + { + "epoch": 0.1988890474831097, + "grad_norm": 1.7216829061508179, + "learning_rate": 4.5276861418044214e-05, + "loss": 5.4044, + "step": 33442 + }, + { + "epoch": 0.19889499476639072, + "grad_norm": 1.6618791818618774, + "learning_rate": 4.527658818852068e-05, + "loss": 5.4637, + "step": 33443 + }, + { + "epoch": 0.1989009420496717, + "grad_norm": 1.4226763248443604, + "learning_rate": 4.52763149519188e-05, + "loss": 5.2645, + "step": 33444 + }, + { + "epoch": 0.1989068893329527, + "grad_norm": 1.634399175643921, + "learning_rate": 4.527604170823867e-05, + "loss": 4.8305, + "step": 33445 + }, + { + "epoch": 0.1989128366162337, + "grad_norm": 1.6638933420181274, + "learning_rate": 4.5275768457480384e-05, + "loss": 4.9628, + "step": 33446 + }, + { + "epoch": 0.1989187838995147, + "grad_norm": 1.7389144897460938, + "learning_rate": 4.5275495199644036e-05, + "loss": 5.0198, + "step": 33447 + }, + { + "epoch": 0.1989247311827957, + "grad_norm": 1.6871912479400635, + "learning_rate": 4.527522193472972e-05, + "loss": 4.4683, + "step": 33448 + }, + { + "epoch": 0.1989306784660767, + "grad_norm": 1.8285382986068726, + "learning_rate": 4.527494866273753e-05, + "loss": 5.0369, + "step": 33449 + }, + { + "epoch": 0.1989366257493577, + "grad_norm": 1.6555229425430298, + "learning_rate": 4.527467538366758e-05, + "loss": 5.3217, + "step": 33450 + }, + { + "epoch": 0.19894257303263868, + "grad_norm": 2.1087262630462646, + "learning_rate": 4.5274402097519933e-05, + "loss": 4.3458, + "step": 33451 + }, + { + "epoch": 0.1989485203159197, + "grad_norm": 1.8274654150009155, + "learning_rate": 4.5274128804294705e-05, + "loss": 4.9379, + "step": 33452 + }, + { + "epoch": 0.19895446759920069, + "grad_norm": 2.4635887145996094, + "learning_rate": 4.5273855503991994e-05, + "loss": 4.4695, + "step": 33453 + }, + { + "epoch": 0.19896041488248167, + "grad_norm": 1.6194392442703247, + "learning_rate": 4.527358219661189e-05, + "loss": 4.739, + "step": 33454 + }, + { + "epoch": 0.1989663621657627, + "grad_norm": 1.819419026374817, + "learning_rate": 4.527330888215448e-05, + "loss": 4.7175, + "step": 33455 + }, + { + "epoch": 0.19897230944904368, + "grad_norm": 1.6347033977508545, + "learning_rate": 4.527303556061987e-05, + "loss": 4.9085, + "step": 33456 + }, + { + "epoch": 0.19897825673232467, + "grad_norm": 1.6178079843521118, + "learning_rate": 4.527276223200816e-05, + "loss": 5.0249, + "step": 33457 + }, + { + "epoch": 0.19898420401560568, + "grad_norm": 2.922417163848877, + "learning_rate": 4.5272488896319434e-05, + "loss": 4.4398, + "step": 33458 + }, + { + "epoch": 0.19899015129888667, + "grad_norm": 2.004303455352783, + "learning_rate": 4.527221555355379e-05, + "loss": 4.417, + "step": 33459 + }, + { + "epoch": 0.19899609858216766, + "grad_norm": 1.8364394903182983, + "learning_rate": 4.5271942203711326e-05, + "loss": 4.9475, + "step": 33460 + }, + { + "epoch": 0.19900204586544867, + "grad_norm": 1.5880411863327026, + "learning_rate": 4.5271668846792134e-05, + "loss": 5.062, + "step": 33461 + }, + { + "epoch": 0.19900799314872966, + "grad_norm": 1.5956658124923706, + "learning_rate": 4.5271395482796306e-05, + "loss": 5.0605, + "step": 33462 + }, + { + "epoch": 0.19901394043201065, + "grad_norm": 1.5776112079620361, + "learning_rate": 4.527112211172396e-05, + "loss": 4.8766, + "step": 33463 + }, + { + "epoch": 0.19901988771529167, + "grad_norm": 1.6271411180496216, + "learning_rate": 4.5270848733575166e-05, + "loss": 4.997, + "step": 33464 + }, + { + "epoch": 0.19902583499857265, + "grad_norm": 1.4883582592010498, + "learning_rate": 4.527057534835002e-05, + "loss": 4.9727, + "step": 33465 + }, + { + "epoch": 0.19903178228185364, + "grad_norm": 1.74699866771698, + "learning_rate": 4.527030195604864e-05, + "loss": 4.5348, + "step": 33466 + }, + { + "epoch": 0.19903772956513466, + "grad_norm": 1.9776692390441895, + "learning_rate": 4.52700285566711e-05, + "loss": 4.5182, + "step": 33467 + }, + { + "epoch": 0.19904367684841565, + "grad_norm": 1.4299882650375366, + "learning_rate": 4.52697551502175e-05, + "loss": 5.0135, + "step": 33468 + }, + { + "epoch": 0.19904962413169663, + "grad_norm": 2.07140851020813, + "learning_rate": 4.5269481736687945e-05, + "loss": 4.2703, + "step": 33469 + }, + { + "epoch": 0.19905557141497765, + "grad_norm": 2.40633225440979, + "learning_rate": 4.526920831608252e-05, + "loss": 3.8694, + "step": 33470 + }, + { + "epoch": 0.19906151869825864, + "grad_norm": 1.616706132888794, + "learning_rate": 4.526893488840132e-05, + "loss": 5.2851, + "step": 33471 + }, + { + "epoch": 0.19906746598153963, + "grad_norm": 2.0044960975646973, + "learning_rate": 4.526866145364445e-05, + "loss": 5.5549, + "step": 33472 + }, + { + "epoch": 0.19907341326482064, + "grad_norm": 1.6400177478790283, + "learning_rate": 4.5268388011812e-05, + "loss": 4.9827, + "step": 33473 + }, + { + "epoch": 0.19907936054810163, + "grad_norm": 1.522547960281372, + "learning_rate": 4.526811456290406e-05, + "loss": 4.961, + "step": 33474 + }, + { + "epoch": 0.19908530783138262, + "grad_norm": 2.100389242172241, + "learning_rate": 4.5267841106920724e-05, + "loss": 4.6502, + "step": 33475 + }, + { + "epoch": 0.19909125511466363, + "grad_norm": 1.7050951719284058, + "learning_rate": 4.526756764386211e-05, + "loss": 4.9722, + "step": 33476 + }, + { + "epoch": 0.19909720239794462, + "grad_norm": 2.1410129070281982, + "learning_rate": 4.526729417372829e-05, + "loss": 5.023, + "step": 33477 + }, + { + "epoch": 0.1991031496812256, + "grad_norm": 1.8448638916015625, + "learning_rate": 4.526702069651937e-05, + "loss": 5.2159, + "step": 33478 + }, + { + "epoch": 0.19910909696450663, + "grad_norm": 2.0991101264953613, + "learning_rate": 4.526674721223544e-05, + "loss": 4.1942, + "step": 33479 + }, + { + "epoch": 0.19911504424778761, + "grad_norm": 1.3801043033599854, + "learning_rate": 4.526647372087659e-05, + "loss": 5.1376, + "step": 33480 + }, + { + "epoch": 0.1991209915310686, + "grad_norm": 1.584425926208496, + "learning_rate": 4.526620022244293e-05, + "loss": 4.5196, + "step": 33481 + }, + { + "epoch": 0.19912693881434962, + "grad_norm": 1.665459394454956, + "learning_rate": 4.5265926716934556e-05, + "loss": 5.1194, + "step": 33482 + }, + { + "epoch": 0.1991328860976306, + "grad_norm": 1.5680651664733887, + "learning_rate": 4.526565320435155e-05, + "loss": 4.9814, + "step": 33483 + }, + { + "epoch": 0.1991388333809116, + "grad_norm": 1.9074794054031372, + "learning_rate": 4.526537968469401e-05, + "loss": 5.2678, + "step": 33484 + }, + { + "epoch": 0.19914478066419258, + "grad_norm": 1.5251576900482178, + "learning_rate": 4.526510615796205e-05, + "loss": 5.0521, + "step": 33485 + }, + { + "epoch": 0.1991507279474736, + "grad_norm": 1.5786724090576172, + "learning_rate": 4.526483262415573e-05, + "loss": 4.8732, + "step": 33486 + }, + { + "epoch": 0.1991566752307546, + "grad_norm": 1.6850212812423706, + "learning_rate": 4.5264559083275185e-05, + "loss": 4.8264, + "step": 33487 + }, + { + "epoch": 0.19916262251403558, + "grad_norm": 1.9387089014053345, + "learning_rate": 4.526428553532048e-05, + "loss": 4.6695, + "step": 33488 + }, + { + "epoch": 0.1991685697973166, + "grad_norm": 1.6213630437850952, + "learning_rate": 4.5264011980291724e-05, + "loss": 4.8366, + "step": 33489 + }, + { + "epoch": 0.19917451708059758, + "grad_norm": 1.4320731163024902, + "learning_rate": 4.526373841818901e-05, + "loss": 5.0784, + "step": 33490 + }, + { + "epoch": 0.19918046436387857, + "grad_norm": 1.5601176023483276, + "learning_rate": 4.5263464849012436e-05, + "loss": 4.8712, + "step": 33491 + }, + { + "epoch": 0.19918641164715958, + "grad_norm": 1.610245943069458, + "learning_rate": 4.52631912727621e-05, + "loss": 5.0412, + "step": 33492 + }, + { + "epoch": 0.19919235893044057, + "grad_norm": 1.4566705226898193, + "learning_rate": 4.5262917689438086e-05, + "loss": 4.8381, + "step": 33493 + }, + { + "epoch": 0.19919830621372156, + "grad_norm": 2.0661633014678955, + "learning_rate": 4.52626440990405e-05, + "loss": 4.2362, + "step": 33494 + }, + { + "epoch": 0.19920425349700258, + "grad_norm": 2.077457904815674, + "learning_rate": 4.526237050156944e-05, + "loss": 4.9774, + "step": 33495 + }, + { + "epoch": 0.19921020078028356, + "grad_norm": 1.723219394683838, + "learning_rate": 4.5262096897024985e-05, + "loss": 5.2097, + "step": 33496 + }, + { + "epoch": 0.19921614806356455, + "grad_norm": 1.7461673021316528, + "learning_rate": 4.526182328540725e-05, + "loss": 4.9274, + "step": 33497 + }, + { + "epoch": 0.19922209534684557, + "grad_norm": 2.2677931785583496, + "learning_rate": 4.526154966671632e-05, + "loss": 4.0952, + "step": 33498 + }, + { + "epoch": 0.19922804263012656, + "grad_norm": 3.0971813201904297, + "learning_rate": 4.526127604095229e-05, + "loss": 3.9733, + "step": 33499 + }, + { + "epoch": 0.19923398991340754, + "grad_norm": 3.0695557594299316, + "learning_rate": 4.526100240811526e-05, + "loss": 3.4307, + "step": 33500 + }, + { + "epoch": 0.19923993719668856, + "grad_norm": 2.502638101577759, + "learning_rate": 4.526072876820532e-05, + "loss": 4.0434, + "step": 33501 + }, + { + "epoch": 0.19924588447996955, + "grad_norm": 1.444030523300171, + "learning_rate": 4.5260455121222566e-05, + "loss": 5.0315, + "step": 33502 + }, + { + "epoch": 0.19925183176325054, + "grad_norm": 1.5067824125289917, + "learning_rate": 4.526018146716711e-05, + "loss": 4.7649, + "step": 33503 + }, + { + "epoch": 0.19925777904653155, + "grad_norm": 1.5262528657913208, + "learning_rate": 4.525990780603903e-05, + "loss": 4.9649, + "step": 33504 + }, + { + "epoch": 0.19926372632981254, + "grad_norm": 1.6207854747772217, + "learning_rate": 4.525963413783841e-05, + "loss": 4.7577, + "step": 33505 + }, + { + "epoch": 0.19926967361309353, + "grad_norm": 2.1585114002227783, + "learning_rate": 4.5259360462565377e-05, + "loss": 4.3194, + "step": 33506 + }, + { + "epoch": 0.19927562089637454, + "grad_norm": 3.0893638134002686, + "learning_rate": 4.525908678022001e-05, + "loss": 3.4338, + "step": 33507 + }, + { + "epoch": 0.19928156817965553, + "grad_norm": 3.0618252754211426, + "learning_rate": 4.5258813090802396e-05, + "loss": 3.6044, + "step": 33508 + }, + { + "epoch": 0.19928751546293652, + "grad_norm": 3.0148963928222656, + "learning_rate": 4.525853939431264e-05, + "loss": 3.6999, + "step": 33509 + }, + { + "epoch": 0.19929346274621754, + "grad_norm": 1.7465107440948486, + "learning_rate": 4.5258265690750846e-05, + "loss": 4.6342, + "step": 33510 + }, + { + "epoch": 0.19929941002949852, + "grad_norm": 1.6526566743850708, + "learning_rate": 4.52579919801171e-05, + "loss": 5.1376, + "step": 33511 + }, + { + "epoch": 0.1993053573127795, + "grad_norm": 1.583158254623413, + "learning_rate": 4.525771826241149e-05, + "loss": 4.8617, + "step": 33512 + }, + { + "epoch": 0.19931130459606053, + "grad_norm": 1.6602866649627686, + "learning_rate": 4.5257444537634124e-05, + "loss": 4.5414, + "step": 33513 + }, + { + "epoch": 0.19931725187934152, + "grad_norm": 1.583927035331726, + "learning_rate": 4.5257170805785095e-05, + "loss": 4.8343, + "step": 33514 + }, + { + "epoch": 0.1993231991626225, + "grad_norm": 1.6319681406021118, + "learning_rate": 4.52568970668645e-05, + "loss": 5.2782, + "step": 33515 + }, + { + "epoch": 0.19932914644590352, + "grad_norm": 1.7109445333480835, + "learning_rate": 4.5256623320872424e-05, + "loss": 4.9891, + "step": 33516 + }, + { + "epoch": 0.1993350937291845, + "grad_norm": 1.7144900560379028, + "learning_rate": 4.525634956780897e-05, + "loss": 4.9999, + "step": 33517 + }, + { + "epoch": 0.1993410410124655, + "grad_norm": 1.9427156448364258, + "learning_rate": 4.5256075807674233e-05, + "loss": 5.1138, + "step": 33518 + }, + { + "epoch": 0.1993469882957465, + "grad_norm": 1.6421605348587036, + "learning_rate": 4.525580204046832e-05, + "loss": 4.9218, + "step": 33519 + }, + { + "epoch": 0.1993529355790275, + "grad_norm": 1.7899574041366577, + "learning_rate": 4.52555282661913e-05, + "loss": 4.5285, + "step": 33520 + }, + { + "epoch": 0.1993588828623085, + "grad_norm": 1.706308364868164, + "learning_rate": 4.52552544848433e-05, + "loss": 5.1926, + "step": 33521 + }, + { + "epoch": 0.1993648301455895, + "grad_norm": 2.0579419136047363, + "learning_rate": 4.5254980696424396e-05, + "loss": 4.8708, + "step": 33522 + }, + { + "epoch": 0.1993707774288705, + "grad_norm": 2.4866833686828613, + "learning_rate": 4.5254706900934684e-05, + "loss": 4.33, + "step": 33523 + }, + { + "epoch": 0.19937672471215148, + "grad_norm": 1.4279406070709229, + "learning_rate": 4.525443309837426e-05, + "loss": 4.7774, + "step": 33524 + }, + { + "epoch": 0.1993826719954325, + "grad_norm": 1.9905481338500977, + "learning_rate": 4.525415928874324e-05, + "loss": 5.0473, + "step": 33525 + }, + { + "epoch": 0.19938861927871349, + "grad_norm": 1.6799120903015137, + "learning_rate": 4.525388547204168e-05, + "loss": 4.8557, + "step": 33526 + }, + { + "epoch": 0.19939456656199447, + "grad_norm": 1.8065446615219116, + "learning_rate": 4.525361164826971e-05, + "loss": 5.3703, + "step": 33527 + }, + { + "epoch": 0.1994005138452755, + "grad_norm": 1.5986427068710327, + "learning_rate": 4.525333781742741e-05, + "loss": 5.0066, + "step": 33528 + }, + { + "epoch": 0.19940646112855648, + "grad_norm": 2.090648889541626, + "learning_rate": 4.525306397951488e-05, + "loss": 4.8344, + "step": 33529 + }, + { + "epoch": 0.19941240841183747, + "grad_norm": 1.7685662508010864, + "learning_rate": 4.525279013453221e-05, + "loss": 4.5956, + "step": 33530 + }, + { + "epoch": 0.19941835569511848, + "grad_norm": 1.6398029327392578, + "learning_rate": 4.525251628247951e-05, + "loss": 5.3404, + "step": 33531 + }, + { + "epoch": 0.19942430297839947, + "grad_norm": 1.805405616760254, + "learning_rate": 4.525224242335685e-05, + "loss": 5.2919, + "step": 33532 + }, + { + "epoch": 0.19943025026168046, + "grad_norm": 1.791210651397705, + "learning_rate": 4.525196855716435e-05, + "loss": 5.3577, + "step": 33533 + }, + { + "epoch": 0.19943619754496147, + "grad_norm": 1.7393286228179932, + "learning_rate": 4.52516946839021e-05, + "loss": 5.2954, + "step": 33534 + }, + { + "epoch": 0.19944214482824246, + "grad_norm": 1.9773000478744507, + "learning_rate": 4.525142080357019e-05, + "loss": 4.7362, + "step": 33535 + }, + { + "epoch": 0.19944809211152345, + "grad_norm": 1.4539952278137207, + "learning_rate": 4.5251146916168715e-05, + "loss": 4.9983, + "step": 33536 + }, + { + "epoch": 0.19945403939480447, + "grad_norm": 1.7288161516189575, + "learning_rate": 4.525087302169778e-05, + "loss": 5.1685, + "step": 33537 + }, + { + "epoch": 0.19945998667808545, + "grad_norm": 1.477931261062622, + "learning_rate": 4.525059912015748e-05, + "loss": 4.9073, + "step": 33538 + }, + { + "epoch": 0.19946593396136644, + "grad_norm": 2.294431209564209, + "learning_rate": 4.525032521154789e-05, + "loss": 4.3584, + "step": 33539 + }, + { + "epoch": 0.19947188124464746, + "grad_norm": 1.388110876083374, + "learning_rate": 4.525005129586913e-05, + "loss": 5.2412, + "step": 33540 + }, + { + "epoch": 0.19947782852792845, + "grad_norm": 1.660605788230896, + "learning_rate": 4.5249777373121285e-05, + "loss": 4.7119, + "step": 33541 + }, + { + "epoch": 0.19948377581120944, + "grad_norm": 1.37186861038208, + "learning_rate": 4.524950344330445e-05, + "loss": 5.1161, + "step": 33542 + }, + { + "epoch": 0.19948972309449042, + "grad_norm": 2.0066730976104736, + "learning_rate": 4.5249229506418725e-05, + "loss": 4.9816, + "step": 33543 + }, + { + "epoch": 0.19949567037777144, + "grad_norm": 1.7703311443328857, + "learning_rate": 4.52489555624642e-05, + "loss": 5.0812, + "step": 33544 + }, + { + "epoch": 0.19950161766105243, + "grad_norm": 1.6476131677627563, + "learning_rate": 4.524868161144098e-05, + "loss": 5.1316, + "step": 33545 + }, + { + "epoch": 0.19950756494433342, + "grad_norm": 1.8000843524932861, + "learning_rate": 4.524840765334915e-05, + "loss": 5.1469, + "step": 33546 + }, + { + "epoch": 0.19951351222761443, + "grad_norm": 2.017563581466675, + "learning_rate": 4.524813368818881e-05, + "loss": 5.2712, + "step": 33547 + }, + { + "epoch": 0.19951945951089542, + "grad_norm": 1.9950426816940308, + "learning_rate": 4.524785971596006e-05, + "loss": 5.0172, + "step": 33548 + }, + { + "epoch": 0.1995254067941764, + "grad_norm": 2.131312370300293, + "learning_rate": 4.5247585736662985e-05, + "loss": 4.4589, + "step": 33549 + }, + { + "epoch": 0.19953135407745742, + "grad_norm": 2.1414794921875, + "learning_rate": 4.524731175029769e-05, + "loss": 4.1757, + "step": 33550 + }, + { + "epoch": 0.1995373013607384, + "grad_norm": 1.6311516761779785, + "learning_rate": 4.524703775686426e-05, + "loss": 4.7597, + "step": 33551 + }, + { + "epoch": 0.1995432486440194, + "grad_norm": 1.5711687803268433, + "learning_rate": 4.524676375636281e-05, + "loss": 3.9699, + "step": 33552 + }, + { + "epoch": 0.19954919592730042, + "grad_norm": 2.298886299133301, + "learning_rate": 4.524648974879342e-05, + "loss": 3.5056, + "step": 33553 + }, + { + "epoch": 0.1995551432105814, + "grad_norm": 1.629654049873352, + "learning_rate": 4.5246215734156186e-05, + "loss": 5.0122, + "step": 33554 + }, + { + "epoch": 0.1995610904938624, + "grad_norm": 1.8879503011703491, + "learning_rate": 4.5245941712451215e-05, + "loss": 4.5958, + "step": 33555 + }, + { + "epoch": 0.1995670377771434, + "grad_norm": 1.9116814136505127, + "learning_rate": 4.524566768367859e-05, + "loss": 4.9318, + "step": 33556 + }, + { + "epoch": 0.1995729850604244, + "grad_norm": 1.3296679258346558, + "learning_rate": 4.524539364783841e-05, + "loss": 4.2284, + "step": 33557 + }, + { + "epoch": 0.19957893234370538, + "grad_norm": 1.4925459623336792, + "learning_rate": 4.5245119604930775e-05, + "loss": 5.0751, + "step": 33558 + }, + { + "epoch": 0.1995848796269864, + "grad_norm": 1.524156093597412, + "learning_rate": 4.5244845554955774e-05, + "loss": 3.9814, + "step": 33559 + }, + { + "epoch": 0.1995908269102674, + "grad_norm": 1.7568142414093018, + "learning_rate": 4.524457149791351e-05, + "loss": 3.994, + "step": 33560 + }, + { + "epoch": 0.19959677419354838, + "grad_norm": 1.3893617391586304, + "learning_rate": 4.524429743380407e-05, + "loss": 3.9492, + "step": 33561 + }, + { + "epoch": 0.1996027214768294, + "grad_norm": 1.5644930601119995, + "learning_rate": 4.524402336262756e-05, + "loss": 4.4825, + "step": 33562 + }, + { + "epoch": 0.19960866876011038, + "grad_norm": 1.7970536947250366, + "learning_rate": 4.524374928438407e-05, + "loss": 3.8519, + "step": 33563 + }, + { + "epoch": 0.19961461604339137, + "grad_norm": 1.5144481658935547, + "learning_rate": 4.52434751990737e-05, + "loss": 4.4697, + "step": 33564 + }, + { + "epoch": 0.19962056332667238, + "grad_norm": 1.679702639579773, + "learning_rate": 4.524320110669654e-05, + "loss": 4.8946, + "step": 33565 + }, + { + "epoch": 0.19962651060995337, + "grad_norm": 1.7595206499099731, + "learning_rate": 4.524292700725268e-05, + "loss": 5.0111, + "step": 33566 + }, + { + "epoch": 0.19963245789323436, + "grad_norm": 1.3525060415267944, + "learning_rate": 4.524265290074223e-05, + "loss": 4.9942, + "step": 33567 + }, + { + "epoch": 0.19963840517651538, + "grad_norm": 1.6003968715667725, + "learning_rate": 4.524237878716529e-05, + "loss": 4.4323, + "step": 33568 + }, + { + "epoch": 0.19964435245979636, + "grad_norm": 1.654555082321167, + "learning_rate": 4.524210466652192e-05, + "loss": 3.804, + "step": 33569 + }, + { + "epoch": 0.19965029974307735, + "grad_norm": 1.7716010808944702, + "learning_rate": 4.524183053881226e-05, + "loss": 3.8744, + "step": 33570 + }, + { + "epoch": 0.19965624702635837, + "grad_norm": 1.7306915521621704, + "learning_rate": 4.524155640403638e-05, + "loss": 3.8362, + "step": 33571 + }, + { + "epoch": 0.19966219430963936, + "grad_norm": 1.5759642124176025, + "learning_rate": 4.524128226219438e-05, + "loss": 3.8123, + "step": 33572 + }, + { + "epoch": 0.19966814159292035, + "grad_norm": 1.6143770217895508, + "learning_rate": 4.524100811328636e-05, + "loss": 4.1127, + "step": 33573 + }, + { + "epoch": 0.19967408887620136, + "grad_norm": 1.6612343788146973, + "learning_rate": 4.524073395731241e-05, + "loss": 5.2288, + "step": 33574 + }, + { + "epoch": 0.19968003615948235, + "grad_norm": 2.272780418395996, + "learning_rate": 4.524045979427263e-05, + "loss": 3.6138, + "step": 33575 + }, + { + "epoch": 0.19968598344276334, + "grad_norm": 1.8799057006835938, + "learning_rate": 4.524018562416712e-05, + "loss": 3.7476, + "step": 33576 + }, + { + "epoch": 0.19969193072604435, + "grad_norm": 1.7091578245162964, + "learning_rate": 4.5239911446995966e-05, + "loss": 3.7152, + "step": 33577 + }, + { + "epoch": 0.19969787800932534, + "grad_norm": 1.7033981084823608, + "learning_rate": 4.523963726275926e-05, + "loss": 4.178, + "step": 33578 + }, + { + "epoch": 0.19970382529260633, + "grad_norm": 1.5857266187667847, + "learning_rate": 4.523936307145712e-05, + "loss": 4.2662, + "step": 33579 + }, + { + "epoch": 0.19970977257588735, + "grad_norm": 1.5587173700332642, + "learning_rate": 4.523908887308962e-05, + "loss": 4.7286, + "step": 33580 + }, + { + "epoch": 0.19971571985916833, + "grad_norm": 1.6295536756515503, + "learning_rate": 4.523881466765686e-05, + "loss": 4.7793, + "step": 33581 + }, + { + "epoch": 0.19972166714244932, + "grad_norm": 1.4184001684188843, + "learning_rate": 4.523854045515895e-05, + "loss": 4.4593, + "step": 33582 + }, + { + "epoch": 0.19972761442573034, + "grad_norm": 1.3835517168045044, + "learning_rate": 4.5238266235595964e-05, + "loss": 5.4586, + "step": 33583 + }, + { + "epoch": 0.19973356170901133, + "grad_norm": 2.0789854526519775, + "learning_rate": 4.523799200896801e-05, + "loss": 4.8789, + "step": 33584 + }, + { + "epoch": 0.19973950899229231, + "grad_norm": 1.995231032371521, + "learning_rate": 4.5237717775275184e-05, + "loss": 5.0695, + "step": 33585 + }, + { + "epoch": 0.19974545627557333, + "grad_norm": 1.5125774145126343, + "learning_rate": 4.523744353451758e-05, + "loss": 5.1086, + "step": 33586 + }, + { + "epoch": 0.19975140355885432, + "grad_norm": 1.6523572206497192, + "learning_rate": 4.523716928669529e-05, + "loss": 4.886, + "step": 33587 + }, + { + "epoch": 0.1997573508421353, + "grad_norm": 1.6928048133850098, + "learning_rate": 4.5236895031808425e-05, + "loss": 5.1758, + "step": 33588 + }, + { + "epoch": 0.19976329812541632, + "grad_norm": 1.6727235317230225, + "learning_rate": 4.523662076985706e-05, + "loss": 5.1094, + "step": 33589 + }, + { + "epoch": 0.1997692454086973, + "grad_norm": 1.4500248432159424, + "learning_rate": 4.5236346500841297e-05, + "loss": 4.9389, + "step": 33590 + }, + { + "epoch": 0.1997751926919783, + "grad_norm": 1.9560678005218506, + "learning_rate": 4.523607222476124e-05, + "loss": 4.6893, + "step": 33591 + }, + { + "epoch": 0.19978113997525931, + "grad_norm": 2.10848331451416, + "learning_rate": 4.523579794161697e-05, + "loss": 4.0417, + "step": 33592 + }, + { + "epoch": 0.1997870872585403, + "grad_norm": 2.400477647781372, + "learning_rate": 4.523552365140861e-05, + "loss": 3.408, + "step": 33593 + }, + { + "epoch": 0.1997930345418213, + "grad_norm": 1.886122226715088, + "learning_rate": 4.523524935413622e-05, + "loss": 4.6157, + "step": 33594 + }, + { + "epoch": 0.1997989818251023, + "grad_norm": 1.5088223218917847, + "learning_rate": 4.523497504979992e-05, + "loss": 5.0846, + "step": 33595 + }, + { + "epoch": 0.1998049291083833, + "grad_norm": 1.4798957109451294, + "learning_rate": 4.52347007383998e-05, + "loss": 5.0978, + "step": 33596 + }, + { + "epoch": 0.19981087639166428, + "grad_norm": 1.7828933000564575, + "learning_rate": 4.523442641993596e-05, + "loss": 4.4761, + "step": 33597 + }, + { + "epoch": 0.1998168236749453, + "grad_norm": 2.1810219287872314, + "learning_rate": 4.523415209440848e-05, + "loss": 3.5858, + "step": 33598 + }, + { + "epoch": 0.1998227709582263, + "grad_norm": 2.2807984352111816, + "learning_rate": 4.523387776181747e-05, + "loss": 3.5799, + "step": 33599 + }, + { + "epoch": 0.19982871824150727, + "grad_norm": 2.3635599613189697, + "learning_rate": 4.523360342216303e-05, + "loss": 3.6036, + "step": 33600 + }, + { + "epoch": 0.19983466552478826, + "grad_norm": 2.358201503753662, + "learning_rate": 4.5233329075445244e-05, + "loss": 3.5597, + "step": 33601 + }, + { + "epoch": 0.19984061280806928, + "grad_norm": 2.496837854385376, + "learning_rate": 4.523305472166421e-05, + "loss": 3.4199, + "step": 33602 + }, + { + "epoch": 0.19984656009135027, + "grad_norm": 2.0924534797668457, + "learning_rate": 4.523278036082003e-05, + "loss": 4.3533, + "step": 33603 + }, + { + "epoch": 0.19985250737463126, + "grad_norm": 1.5738506317138672, + "learning_rate": 4.523250599291279e-05, + "loss": 5.2645, + "step": 33604 + }, + { + "epoch": 0.19985845465791227, + "grad_norm": 1.8330590724945068, + "learning_rate": 4.523223161794259e-05, + "loss": 4.6044, + "step": 33605 + }, + { + "epoch": 0.19986440194119326, + "grad_norm": 2.3316526412963867, + "learning_rate": 4.523195723590953e-05, + "loss": 3.661, + "step": 33606 + }, + { + "epoch": 0.19987034922447425, + "grad_norm": 1.7145735025405884, + "learning_rate": 4.52316828468137e-05, + "loss": 4.5833, + "step": 33607 + }, + { + "epoch": 0.19987629650775526, + "grad_norm": 2.236112117767334, + "learning_rate": 4.5231408450655196e-05, + "loss": 5.1055, + "step": 33608 + }, + { + "epoch": 0.19988224379103625, + "grad_norm": 2.227168321609497, + "learning_rate": 4.5231134047434124e-05, + "loss": 4.0876, + "step": 33609 + }, + { + "epoch": 0.19988819107431724, + "grad_norm": 1.6515976190567017, + "learning_rate": 4.523085963715057e-05, + "loss": 4.897, + "step": 33610 + }, + { + "epoch": 0.19989413835759826, + "grad_norm": 1.844726800918579, + "learning_rate": 4.5230585219804636e-05, + "loss": 4.9802, + "step": 33611 + }, + { + "epoch": 0.19990008564087924, + "grad_norm": 1.967348575592041, + "learning_rate": 4.52303107953964e-05, + "loss": 5.0002, + "step": 33612 + }, + { + "epoch": 0.19990603292416023, + "grad_norm": 1.6869394779205322, + "learning_rate": 4.523003636392599e-05, + "loss": 4.5466, + "step": 33613 + }, + { + "epoch": 0.19991198020744125, + "grad_norm": 1.9090338945388794, + "learning_rate": 4.522976192539347e-05, + "loss": 4.4996, + "step": 33614 + }, + { + "epoch": 0.19991792749072224, + "grad_norm": 1.6536940336227417, + "learning_rate": 4.522948747979895e-05, + "loss": 4.7394, + "step": 33615 + }, + { + "epoch": 0.19992387477400322, + "grad_norm": 1.6711348295211792, + "learning_rate": 4.5229213027142526e-05, + "loss": 4.2212, + "step": 33616 + }, + { + "epoch": 0.19992982205728424, + "grad_norm": 1.4655362367630005, + "learning_rate": 4.5228938567424295e-05, + "loss": 4.163, + "step": 33617 + }, + { + "epoch": 0.19993576934056523, + "grad_norm": 1.509748935699463, + "learning_rate": 4.522866410064435e-05, + "loss": 4.6835, + "step": 33618 + }, + { + "epoch": 0.19994171662384622, + "grad_norm": 1.8132991790771484, + "learning_rate": 4.5228389626802794e-05, + "loss": 5.5276, + "step": 33619 + }, + { + "epoch": 0.19994766390712723, + "grad_norm": 2.3421835899353027, + "learning_rate": 4.5228115145899707e-05, + "loss": 3.8201, + "step": 33620 + }, + { + "epoch": 0.19995361119040822, + "grad_norm": 1.4546209573745728, + "learning_rate": 4.52278406579352e-05, + "loss": 5.174, + "step": 33621 + }, + { + "epoch": 0.1999595584736892, + "grad_norm": 1.5802754163742065, + "learning_rate": 4.522756616290935e-05, + "loss": 5.3047, + "step": 33622 + }, + { + "epoch": 0.19996550575697022, + "grad_norm": 1.6700994968414307, + "learning_rate": 4.5227291660822276e-05, + "loss": 5.362, + "step": 33623 + }, + { + "epoch": 0.1999714530402512, + "grad_norm": 1.743464469909668, + "learning_rate": 4.522701715167407e-05, + "loss": 5.1647, + "step": 33624 + }, + { + "epoch": 0.1999774003235322, + "grad_norm": 1.8635927438735962, + "learning_rate": 4.5226742635464805e-05, + "loss": 5.0099, + "step": 33625 + }, + { + "epoch": 0.19998334760681322, + "grad_norm": 1.5073845386505127, + "learning_rate": 4.522646811219461e-05, + "loss": 5.0938, + "step": 33626 + }, + { + "epoch": 0.1999892948900942, + "grad_norm": 1.8857444524765015, + "learning_rate": 4.522619358186355e-05, + "loss": 5.1196, + "step": 33627 + }, + { + "epoch": 0.1999952421733752, + "grad_norm": 1.7090919017791748, + "learning_rate": 4.5225919044471746e-05, + "loss": 5.3662, + "step": 33628 + }, + { + "epoch": 0.2000011894566562, + "grad_norm": 1.6622498035430908, + "learning_rate": 4.522564450001927e-05, + "loss": 5.3546, + "step": 33629 + }, + { + "epoch": 0.2000071367399372, + "grad_norm": 1.5253161191940308, + "learning_rate": 4.522536994850624e-05, + "loss": 5.3692, + "step": 33630 + }, + { + "epoch": 0.20001308402321819, + "grad_norm": 1.6020673513412476, + "learning_rate": 4.522509538993274e-05, + "loss": 5.278, + "step": 33631 + }, + { + "epoch": 0.2000190313064992, + "grad_norm": 1.7955602407455444, + "learning_rate": 4.522482082429887e-05, + "loss": 4.7938, + "step": 33632 + }, + { + "epoch": 0.2000249785897802, + "grad_norm": 1.694838047027588, + "learning_rate": 4.522454625160472e-05, + "loss": 4.5588, + "step": 33633 + }, + { + "epoch": 0.20003092587306118, + "grad_norm": 1.6719664335250854, + "learning_rate": 4.522427167185039e-05, + "loss": 4.6892, + "step": 33634 + }, + { + "epoch": 0.2000368731563422, + "grad_norm": 1.7728748321533203, + "learning_rate": 4.522399708503599e-05, + "loss": 4.9853, + "step": 33635 + }, + { + "epoch": 0.20004282043962318, + "grad_norm": 2.797647476196289, + "learning_rate": 4.522372249116158e-05, + "loss": 4.4462, + "step": 33636 + }, + { + "epoch": 0.20004876772290417, + "grad_norm": 2.5635032653808594, + "learning_rate": 4.522344789022729e-05, + "loss": 4.1496, + "step": 33637 + }, + { + "epoch": 0.20005471500618519, + "grad_norm": 2.256369113922119, + "learning_rate": 4.52231732822332e-05, + "loss": 4.0803, + "step": 33638 + }, + { + "epoch": 0.20006066228946617, + "grad_norm": 3.085843086242676, + "learning_rate": 4.5222898667179404e-05, + "loss": 2.9132, + "step": 33639 + }, + { + "epoch": 0.20006660957274716, + "grad_norm": 1.646597146987915, + "learning_rate": 4.522262404506601e-05, + "loss": 4.2852, + "step": 33640 + }, + { + "epoch": 0.20007255685602818, + "grad_norm": 2.579864740371704, + "learning_rate": 4.5222349415893106e-05, + "loss": 3.5672, + "step": 33641 + }, + { + "epoch": 0.20007850413930917, + "grad_norm": 2.537965774536133, + "learning_rate": 4.5222074779660784e-05, + "loss": 3.3253, + "step": 33642 + }, + { + "epoch": 0.20008445142259015, + "grad_norm": 2.1766700744628906, + "learning_rate": 4.5221800136369155e-05, + "loss": 3.8213, + "step": 33643 + }, + { + "epoch": 0.20009039870587117, + "grad_norm": 1.603519320487976, + "learning_rate": 4.522152548601829e-05, + "loss": 4.7899, + "step": 33644 + }, + { + "epoch": 0.20009634598915216, + "grad_norm": 2.1622631549835205, + "learning_rate": 4.522125082860831e-05, + "loss": 3.0835, + "step": 33645 + }, + { + "epoch": 0.20010229327243315, + "grad_norm": 1.785031795501709, + "learning_rate": 4.522097616413929e-05, + "loss": 4.1423, + "step": 33646 + }, + { + "epoch": 0.20010824055571416, + "grad_norm": 2.3329782485961914, + "learning_rate": 4.522070149261135e-05, + "loss": 3.4989, + "step": 33647 + }, + { + "epoch": 0.20011418783899515, + "grad_norm": 2.6644299030303955, + "learning_rate": 4.5220426814024564e-05, + "loss": 2.7116, + "step": 33648 + }, + { + "epoch": 0.20012013512227614, + "grad_norm": 2.071437358856201, + "learning_rate": 4.522015212837904e-05, + "loss": 4.3735, + "step": 33649 + }, + { + "epoch": 0.20012608240555715, + "grad_norm": 1.4981132745742798, + "learning_rate": 4.521987743567487e-05, + "loss": 5.1696, + "step": 33650 + }, + { + "epoch": 0.20013202968883814, + "grad_norm": 1.7726006507873535, + "learning_rate": 4.521960273591215e-05, + "loss": 4.9392, + "step": 33651 + }, + { + "epoch": 0.20013797697211913, + "grad_norm": 1.9665300846099854, + "learning_rate": 4.5219328029090966e-05, + "loss": 3.5765, + "step": 33652 + }, + { + "epoch": 0.20014392425540015, + "grad_norm": 2.3966944217681885, + "learning_rate": 4.521905331521143e-05, + "loss": 3.5962, + "step": 33653 + }, + { + "epoch": 0.20014987153868113, + "grad_norm": 2.8166298866271973, + "learning_rate": 4.521877859427363e-05, + "loss": 3.5926, + "step": 33654 + }, + { + "epoch": 0.20015581882196212, + "grad_norm": 1.7879718542099, + "learning_rate": 4.521850386627767e-05, + "loss": 4.6034, + "step": 33655 + }, + { + "epoch": 0.20016176610524314, + "grad_norm": 2.0207948684692383, + "learning_rate": 4.521822913122363e-05, + "loss": 5.2371, + "step": 33656 + }, + { + "epoch": 0.20016771338852413, + "grad_norm": 1.6166136264801025, + "learning_rate": 4.5217954389111615e-05, + "loss": 5.6755, + "step": 33657 + }, + { + "epoch": 0.20017366067180511, + "grad_norm": 1.5825445652008057, + "learning_rate": 4.521767963994173e-05, + "loss": 5.5416, + "step": 33658 + }, + { + "epoch": 0.2001796079550861, + "grad_norm": 2.376970052719116, + "learning_rate": 4.521740488371406e-05, + "loss": 4.8429, + "step": 33659 + }, + { + "epoch": 0.20018555523836712, + "grad_norm": 1.9127243757247925, + "learning_rate": 4.52171301204287e-05, + "loss": 5.299, + "step": 33660 + }, + { + "epoch": 0.2001915025216481, + "grad_norm": 2.695713758468628, + "learning_rate": 4.5216855350085745e-05, + "loss": 3.9133, + "step": 33661 + }, + { + "epoch": 0.2001974498049291, + "grad_norm": 1.516388177871704, + "learning_rate": 4.521658057268529e-05, + "loss": 5.4228, + "step": 33662 + }, + { + "epoch": 0.2002033970882101, + "grad_norm": 2.076374053955078, + "learning_rate": 4.521630578822745e-05, + "loss": 4.7299, + "step": 33663 + }, + { + "epoch": 0.2002093443714911, + "grad_norm": 1.7333403825759888, + "learning_rate": 4.52160309967123e-05, + "loss": 5.1839, + "step": 33664 + }, + { + "epoch": 0.2002152916547721, + "grad_norm": 1.616132378578186, + "learning_rate": 4.521575619813995e-05, + "loss": 5.1235, + "step": 33665 + }, + { + "epoch": 0.2002212389380531, + "grad_norm": 1.7438740730285645, + "learning_rate": 4.5215481392510476e-05, + "loss": 5.2177, + "step": 33666 + }, + { + "epoch": 0.2002271862213341, + "grad_norm": 1.4537467956542969, + "learning_rate": 4.521520657982399e-05, + "loss": 4.8443, + "step": 33667 + }, + { + "epoch": 0.20023313350461508, + "grad_norm": 2.2082064151763916, + "learning_rate": 4.521493176008059e-05, + "loss": 4.3767, + "step": 33668 + }, + { + "epoch": 0.2002390807878961, + "grad_norm": 2.066798210144043, + "learning_rate": 4.521465693328036e-05, + "loss": 3.9378, + "step": 33669 + }, + { + "epoch": 0.20024502807117708, + "grad_norm": 2.1056056022644043, + "learning_rate": 4.52143820994234e-05, + "loss": 4.3377, + "step": 33670 + }, + { + "epoch": 0.20025097535445807, + "grad_norm": 2.173313617706299, + "learning_rate": 4.521410725850981e-05, + "loss": 3.8741, + "step": 33671 + }, + { + "epoch": 0.2002569226377391, + "grad_norm": 2.4892916679382324, + "learning_rate": 4.521383241053969e-05, + "loss": 3.9224, + "step": 33672 + }, + { + "epoch": 0.20026286992102008, + "grad_norm": 1.7403076887130737, + "learning_rate": 4.521355755551313e-05, + "loss": 4.6479, + "step": 33673 + }, + { + "epoch": 0.20026881720430106, + "grad_norm": 2.1816036701202393, + "learning_rate": 4.521328269343022e-05, + "loss": 4.3331, + "step": 33674 + }, + { + "epoch": 0.20027476448758208, + "grad_norm": 1.727345585823059, + "learning_rate": 4.521300782429106e-05, + "loss": 4.7984, + "step": 33675 + }, + { + "epoch": 0.20028071177086307, + "grad_norm": 1.813586711883545, + "learning_rate": 4.521273294809575e-05, + "loss": 5.1053, + "step": 33676 + }, + { + "epoch": 0.20028665905414406, + "grad_norm": 1.5746510028839111, + "learning_rate": 4.521245806484439e-05, + "loss": 5.08, + "step": 33677 + }, + { + "epoch": 0.20029260633742507, + "grad_norm": 1.7232789993286133, + "learning_rate": 4.521218317453706e-05, + "loss": 5.1837, + "step": 33678 + }, + { + "epoch": 0.20029855362070606, + "grad_norm": 1.592498540878296, + "learning_rate": 4.521190827717387e-05, + "loss": 4.7453, + "step": 33679 + }, + { + "epoch": 0.20030450090398705, + "grad_norm": 2.30441951751709, + "learning_rate": 4.521163337275492e-05, + "loss": 3.9628, + "step": 33680 + }, + { + "epoch": 0.20031044818726806, + "grad_norm": 1.50408935546875, + "learning_rate": 4.521135846128028e-05, + "loss": 4.7726, + "step": 33681 + }, + { + "epoch": 0.20031639547054905, + "grad_norm": 1.6377472877502441, + "learning_rate": 4.5211083542750074e-05, + "loss": 4.7695, + "step": 33682 + }, + { + "epoch": 0.20032234275383004, + "grad_norm": 1.6713389158248901, + "learning_rate": 4.521080861716439e-05, + "loss": 4.8269, + "step": 33683 + }, + { + "epoch": 0.20032829003711106, + "grad_norm": 1.6516128778457642, + "learning_rate": 4.5210533684523314e-05, + "loss": 4.347, + "step": 33684 + }, + { + "epoch": 0.20033423732039204, + "grad_norm": 2.528104782104492, + "learning_rate": 4.521025874482696e-05, + "loss": 3.0607, + "step": 33685 + }, + { + "epoch": 0.20034018460367303, + "grad_norm": 2.153841495513916, + "learning_rate": 4.520998379807541e-05, + "loss": 3.5991, + "step": 33686 + }, + { + "epoch": 0.20034613188695405, + "grad_norm": 2.264549970626831, + "learning_rate": 4.520970884426876e-05, + "loss": 3.3218, + "step": 33687 + }, + { + "epoch": 0.20035207917023504, + "grad_norm": 2.517428159713745, + "learning_rate": 4.52094338834071e-05, + "loss": 2.7056, + "step": 33688 + }, + { + "epoch": 0.20035802645351602, + "grad_norm": 2.2318918704986572, + "learning_rate": 4.520915891549055e-05, + "loss": 4.1438, + "step": 33689 + }, + { + "epoch": 0.20036397373679704, + "grad_norm": 1.3869786262512207, + "learning_rate": 4.520888394051919e-05, + "loss": 4.7448, + "step": 33690 + }, + { + "epoch": 0.20036992102007803, + "grad_norm": 1.7762783765792847, + "learning_rate": 4.520860895849311e-05, + "loss": 5.587, + "step": 33691 + }, + { + "epoch": 0.20037586830335902, + "grad_norm": 1.5028401613235474, + "learning_rate": 4.520833396941242e-05, + "loss": 5.6461, + "step": 33692 + }, + { + "epoch": 0.20038181558664003, + "grad_norm": 2.091181516647339, + "learning_rate": 4.5208058973277215e-05, + "loss": 4.3468, + "step": 33693 + }, + { + "epoch": 0.20038776286992102, + "grad_norm": 2.1915535926818848, + "learning_rate": 4.520778397008757e-05, + "loss": 3.909, + "step": 33694 + }, + { + "epoch": 0.200393710153202, + "grad_norm": 2.2266931533813477, + "learning_rate": 4.5207508959843606e-05, + "loss": 3.6981, + "step": 33695 + }, + { + "epoch": 0.20039965743648303, + "grad_norm": 1.649043083190918, + "learning_rate": 4.5207233942545406e-05, + "loss": 5.3721, + "step": 33696 + }, + { + "epoch": 0.200405604719764, + "grad_norm": 1.6526726484298706, + "learning_rate": 4.520695891819307e-05, + "loss": 5.0437, + "step": 33697 + }, + { + "epoch": 0.200411552003045, + "grad_norm": 1.662593960762024, + "learning_rate": 4.520668388678669e-05, + "loss": 5.0767, + "step": 33698 + }, + { + "epoch": 0.20041749928632602, + "grad_norm": 2.0036306381225586, + "learning_rate": 4.520640884832638e-05, + "loss": 4.9235, + "step": 33699 + }, + { + "epoch": 0.200423446569607, + "grad_norm": 1.6705793142318726, + "learning_rate": 4.52061338028122e-05, + "loss": 4.9874, + "step": 33700 + }, + { + "epoch": 0.200429393852888, + "grad_norm": 1.6362453699111938, + "learning_rate": 4.520585875024429e-05, + "loss": 4.6005, + "step": 33701 + }, + { + "epoch": 0.200435341136169, + "grad_norm": 1.509127140045166, + "learning_rate": 4.52055836906227e-05, + "loss": 5.0991, + "step": 33702 + }, + { + "epoch": 0.20044128841945, + "grad_norm": 1.5291036367416382, + "learning_rate": 4.520530862394757e-05, + "loss": 5.0673, + "step": 33703 + }, + { + "epoch": 0.20044723570273099, + "grad_norm": 1.4072394371032715, + "learning_rate": 4.5205033550218964e-05, + "loss": 5.2334, + "step": 33704 + }, + { + "epoch": 0.200453182986012, + "grad_norm": 1.7063164710998535, + "learning_rate": 4.520475846943699e-05, + "loss": 5.3813, + "step": 33705 + }, + { + "epoch": 0.200459130269293, + "grad_norm": 1.6799110174179077, + "learning_rate": 4.520448338160175e-05, + "loss": 5.3719, + "step": 33706 + }, + { + "epoch": 0.20046507755257398, + "grad_norm": 1.409774899482727, + "learning_rate": 4.5204208286713326e-05, + "loss": 5.1838, + "step": 33707 + }, + { + "epoch": 0.200471024835855, + "grad_norm": 1.6743974685668945, + "learning_rate": 4.520393318477183e-05, + "loss": 5.1994, + "step": 33708 + }, + { + "epoch": 0.20047697211913598, + "grad_norm": 1.5790249109268188, + "learning_rate": 4.5203658075777344e-05, + "loss": 5.5178, + "step": 33709 + }, + { + "epoch": 0.20048291940241697, + "grad_norm": 1.3018198013305664, + "learning_rate": 4.520338295972997e-05, + "loss": 5.3647, + "step": 33710 + }, + { + "epoch": 0.200488866685698, + "grad_norm": 1.6319355964660645, + "learning_rate": 4.52031078366298e-05, + "loss": 5.499, + "step": 33711 + }, + { + "epoch": 0.20049481396897897, + "grad_norm": 1.6148849725723267, + "learning_rate": 4.520283270647694e-05, + "loss": 5.728, + "step": 33712 + }, + { + "epoch": 0.20050076125225996, + "grad_norm": 1.6749992370605469, + "learning_rate": 4.520255756927147e-05, + "loss": 4.9337, + "step": 33713 + }, + { + "epoch": 0.20050670853554098, + "grad_norm": 1.8952507972717285, + "learning_rate": 4.520228242501351e-05, + "loss": 5.1761, + "step": 33714 + }, + { + "epoch": 0.20051265581882197, + "grad_norm": 1.6296254396438599, + "learning_rate": 4.520200727370314e-05, + "loss": 5.0946, + "step": 33715 + }, + { + "epoch": 0.20051860310210295, + "grad_norm": 1.622511386871338, + "learning_rate": 4.520173211534045e-05, + "loss": 4.9613, + "step": 33716 + }, + { + "epoch": 0.20052455038538394, + "grad_norm": 1.5678802728652954, + "learning_rate": 4.5201456949925547e-05, + "loss": 4.8578, + "step": 33717 + }, + { + "epoch": 0.20053049766866496, + "grad_norm": 1.661635160446167, + "learning_rate": 4.5201181777458526e-05, + "loss": 5.3303, + "step": 33718 + }, + { + "epoch": 0.20053644495194595, + "grad_norm": 1.4430382251739502, + "learning_rate": 4.520090659793948e-05, + "loss": 4.9673, + "step": 33719 + }, + { + "epoch": 0.20054239223522694, + "grad_norm": 1.8783633708953857, + "learning_rate": 4.520063141136851e-05, + "loss": 4.6469, + "step": 33720 + }, + { + "epoch": 0.20054833951850795, + "grad_norm": 1.6063121557235718, + "learning_rate": 4.5200356217745704e-05, + "loss": 4.9283, + "step": 33721 + }, + { + "epoch": 0.20055428680178894, + "grad_norm": 1.7810618877410889, + "learning_rate": 4.520008101707116e-05, + "loss": 4.6507, + "step": 33722 + }, + { + "epoch": 0.20056023408506993, + "grad_norm": 1.9146829843521118, + "learning_rate": 4.519980580934498e-05, + "loss": 4.8663, + "step": 33723 + }, + { + "epoch": 0.20056618136835094, + "grad_norm": 1.5050143003463745, + "learning_rate": 4.519953059456726e-05, + "loss": 4.8159, + "step": 33724 + }, + { + "epoch": 0.20057212865163193, + "grad_norm": 1.4203321933746338, + "learning_rate": 4.519925537273808e-05, + "loss": 5.2271, + "step": 33725 + }, + { + "epoch": 0.20057807593491292, + "grad_norm": 1.7080183029174805, + "learning_rate": 4.519898014385756e-05, + "loss": 4.6244, + "step": 33726 + }, + { + "epoch": 0.20058402321819394, + "grad_norm": 2.466174840927124, + "learning_rate": 4.519870490792578e-05, + "loss": 4.3746, + "step": 33727 + }, + { + "epoch": 0.20058997050147492, + "grad_norm": 1.9741504192352295, + "learning_rate": 4.519842966494284e-05, + "loss": 3.5025, + "step": 33728 + }, + { + "epoch": 0.2005959177847559, + "grad_norm": 1.5923235416412354, + "learning_rate": 4.519815441490884e-05, + "loss": 5.4725, + "step": 33729 + }, + { + "epoch": 0.20060186506803693, + "grad_norm": 1.650692343711853, + "learning_rate": 4.5197879157823874e-05, + "loss": 4.8989, + "step": 33730 + }, + { + "epoch": 0.20060781235131792, + "grad_norm": 2.597038745880127, + "learning_rate": 4.5197603893688034e-05, + "loss": 3.7543, + "step": 33731 + }, + { + "epoch": 0.2006137596345989, + "grad_norm": 2.3229899406433105, + "learning_rate": 4.5197328622501425e-05, + "loss": 3.8646, + "step": 33732 + }, + { + "epoch": 0.20061970691787992, + "grad_norm": 1.6960362195968628, + "learning_rate": 4.519705334426413e-05, + "loss": 5.4444, + "step": 33733 + }, + { + "epoch": 0.2006256542011609, + "grad_norm": 1.6503461599349976, + "learning_rate": 4.5196778058976255e-05, + "loss": 4.9644, + "step": 33734 + }, + { + "epoch": 0.2006316014844419, + "grad_norm": 1.5266268253326416, + "learning_rate": 4.519650276663789e-05, + "loss": 4.9012, + "step": 33735 + }, + { + "epoch": 0.2006375487677229, + "grad_norm": 1.8135932683944702, + "learning_rate": 4.5196227467249144e-05, + "loss": 4.6224, + "step": 33736 + }, + { + "epoch": 0.2006434960510039, + "grad_norm": 1.9666510820388794, + "learning_rate": 4.5195952160810094e-05, + "loss": 4.8198, + "step": 33737 + }, + { + "epoch": 0.2006494433342849, + "grad_norm": 2.169323444366455, + "learning_rate": 4.5195676847320856e-05, + "loss": 4.5872, + "step": 33738 + }, + { + "epoch": 0.2006553906175659, + "grad_norm": 2.2922489643096924, + "learning_rate": 4.5195401526781506e-05, + "loss": 3.3659, + "step": 33739 + }, + { + "epoch": 0.2006613379008469, + "grad_norm": 1.6436244249343872, + "learning_rate": 4.519512619919215e-05, + "loss": 4.824, + "step": 33740 + }, + { + "epoch": 0.20066728518412788, + "grad_norm": 2.283162832260132, + "learning_rate": 4.519485086455289e-05, + "loss": 3.0829, + "step": 33741 + }, + { + "epoch": 0.2006732324674089, + "grad_norm": 1.8069710731506348, + "learning_rate": 4.519457552286381e-05, + "loss": 5.1824, + "step": 33742 + }, + { + "epoch": 0.20067917975068988, + "grad_norm": 1.619968056678772, + "learning_rate": 4.519430017412502e-05, + "loss": 4.9325, + "step": 33743 + }, + { + "epoch": 0.20068512703397087, + "grad_norm": 2.0284674167633057, + "learning_rate": 4.51940248183366e-05, + "loss": 4.6265, + "step": 33744 + }, + { + "epoch": 0.2006910743172519, + "grad_norm": 2.3306424617767334, + "learning_rate": 4.5193749455498664e-05, + "loss": 4.7731, + "step": 33745 + }, + { + "epoch": 0.20069702160053288, + "grad_norm": 2.6167304515838623, + "learning_rate": 4.519347408561129e-05, + "loss": 5.0508, + "step": 33746 + }, + { + "epoch": 0.20070296888381386, + "grad_norm": 1.625686526298523, + "learning_rate": 4.519319870867459e-05, + "loss": 4.6101, + "step": 33747 + }, + { + "epoch": 0.20070891616709488, + "grad_norm": 1.5814995765686035, + "learning_rate": 4.519292332468865e-05, + "loss": 5.195, + "step": 33748 + }, + { + "epoch": 0.20071486345037587, + "grad_norm": 1.4932879209518433, + "learning_rate": 4.5192647933653566e-05, + "loss": 5.2172, + "step": 33749 + }, + { + "epoch": 0.20072081073365686, + "grad_norm": 1.6655844449996948, + "learning_rate": 4.519237253556944e-05, + "loss": 5.0021, + "step": 33750 + }, + { + "epoch": 0.20072675801693787, + "grad_norm": 2.265012264251709, + "learning_rate": 4.519209713043636e-05, + "loss": 4.1171, + "step": 33751 + }, + { + "epoch": 0.20073270530021886, + "grad_norm": 2.524155378341675, + "learning_rate": 4.5191821718254436e-05, + "loss": 3.5122, + "step": 33752 + }, + { + "epoch": 0.20073865258349985, + "grad_norm": 1.620287537574768, + "learning_rate": 4.5191546299023754e-05, + "loss": 4.9341, + "step": 33753 + }, + { + "epoch": 0.20074459986678087, + "grad_norm": 1.6102601289749146, + "learning_rate": 4.519127087274441e-05, + "loss": 5.3899, + "step": 33754 + }, + { + "epoch": 0.20075054715006185, + "grad_norm": 1.5587860345840454, + "learning_rate": 4.51909954394165e-05, + "loss": 4.8153, + "step": 33755 + }, + { + "epoch": 0.20075649443334284, + "grad_norm": 1.8633415699005127, + "learning_rate": 4.5190719999040124e-05, + "loss": 5.0412, + "step": 33756 + }, + { + "epoch": 0.20076244171662386, + "grad_norm": 1.7614189386367798, + "learning_rate": 4.519044455161538e-05, + "loss": 5.299, + "step": 33757 + }, + { + "epoch": 0.20076838899990485, + "grad_norm": 1.6365293264389038, + "learning_rate": 4.5190169097142355e-05, + "loss": 5.2779, + "step": 33758 + }, + { + "epoch": 0.20077433628318583, + "grad_norm": 1.696018099784851, + "learning_rate": 4.518989363562115e-05, + "loss": 4.6489, + "step": 33759 + }, + { + "epoch": 0.20078028356646685, + "grad_norm": 1.7082701921463013, + "learning_rate": 4.5189618167051866e-05, + "loss": 4.9396, + "step": 33760 + }, + { + "epoch": 0.20078623084974784, + "grad_norm": 1.533921241760254, + "learning_rate": 4.518934269143459e-05, + "loss": 5.2822, + "step": 33761 + }, + { + "epoch": 0.20079217813302883, + "grad_norm": 1.9116073846817017, + "learning_rate": 4.518906720876943e-05, + "loss": 5.6835, + "step": 33762 + }, + { + "epoch": 0.20079812541630984, + "grad_norm": 1.6028169393539429, + "learning_rate": 4.5188791719056466e-05, + "loss": 5.6127, + "step": 33763 + }, + { + "epoch": 0.20080407269959083, + "grad_norm": 1.6425648927688599, + "learning_rate": 4.5188516222295814e-05, + "loss": 5.5184, + "step": 33764 + }, + { + "epoch": 0.20081001998287182, + "grad_norm": 1.7046092748641968, + "learning_rate": 4.518824071848755e-05, + "loss": 5.1118, + "step": 33765 + }, + { + "epoch": 0.20081596726615283, + "grad_norm": 1.6231269836425781, + "learning_rate": 4.518796520763179e-05, + "loss": 5.0246, + "step": 33766 + }, + { + "epoch": 0.20082191454943382, + "grad_norm": 1.6386373043060303, + "learning_rate": 4.5187689689728606e-05, + "loss": 5.5888, + "step": 33767 + }, + { + "epoch": 0.2008278618327148, + "grad_norm": 1.582251787185669, + "learning_rate": 4.518741416477812e-05, + "loss": 5.1171, + "step": 33768 + }, + { + "epoch": 0.20083380911599583, + "grad_norm": 1.5769929885864258, + "learning_rate": 4.518713863278041e-05, + "loss": 5.1322, + "step": 33769 + }, + { + "epoch": 0.20083975639927681, + "grad_norm": 1.6422269344329834, + "learning_rate": 4.5186863093735585e-05, + "loss": 4.652, + "step": 33770 + }, + { + "epoch": 0.2008457036825578, + "grad_norm": 1.5146641731262207, + "learning_rate": 4.518658754764373e-05, + "loss": 4.3487, + "step": 33771 + }, + { + "epoch": 0.20085165096583882, + "grad_norm": 1.463438868522644, + "learning_rate": 4.518631199450494e-05, + "loss": 4.2242, + "step": 33772 + }, + { + "epoch": 0.2008575982491198, + "grad_norm": 1.4330111742019653, + "learning_rate": 4.5186036434319324e-05, + "loss": 4.0898, + "step": 33773 + }, + { + "epoch": 0.2008635455324008, + "grad_norm": 1.4045552015304565, + "learning_rate": 4.5185760867086975e-05, + "loss": 4.346, + "step": 33774 + }, + { + "epoch": 0.20086949281568178, + "grad_norm": 1.5829514265060425, + "learning_rate": 4.5185485292807975e-05, + "loss": 4.8288, + "step": 33775 + }, + { + "epoch": 0.2008754400989628, + "grad_norm": 1.363128423690796, + "learning_rate": 4.518520971148244e-05, + "loss": 4.2016, + "step": 33776 + }, + { + "epoch": 0.2008813873822438, + "grad_norm": 1.6097347736358643, + "learning_rate": 4.518493412311045e-05, + "loss": 3.9491, + "step": 33777 + }, + { + "epoch": 0.20088733466552477, + "grad_norm": 1.5099202394485474, + "learning_rate": 4.5184658527692114e-05, + "loss": 3.7755, + "step": 33778 + }, + { + "epoch": 0.2008932819488058, + "grad_norm": 1.602229118347168, + "learning_rate": 4.518438292522752e-05, + "loss": 3.9349, + "step": 33779 + }, + { + "epoch": 0.20089922923208678, + "grad_norm": 1.5963069200515747, + "learning_rate": 4.5184107315716765e-05, + "loss": 4.1626, + "step": 33780 + }, + { + "epoch": 0.20090517651536777, + "grad_norm": 1.5034286975860596, + "learning_rate": 4.518383169915995e-05, + "loss": 4.6347, + "step": 33781 + }, + { + "epoch": 0.20091112379864878, + "grad_norm": 1.4610581398010254, + "learning_rate": 4.518355607555717e-05, + "loss": 4.2356, + "step": 33782 + }, + { + "epoch": 0.20091707108192977, + "grad_norm": 1.468599557876587, + "learning_rate": 4.5183280444908504e-05, + "loss": 3.7972, + "step": 33783 + }, + { + "epoch": 0.20092301836521076, + "grad_norm": 1.265889286994934, + "learning_rate": 4.518300480721408e-05, + "loss": 3.9609, + "step": 33784 + }, + { + "epoch": 0.20092896564849178, + "grad_norm": 1.615130066871643, + "learning_rate": 4.5182729162473967e-05, + "loss": 3.8964, + "step": 33785 + }, + { + "epoch": 0.20093491293177276, + "grad_norm": 1.606234073638916, + "learning_rate": 4.518245351068828e-05, + "loss": 3.6912, + "step": 33786 + }, + { + "epoch": 0.20094086021505375, + "grad_norm": 1.4742984771728516, + "learning_rate": 4.51821778518571e-05, + "loss": 3.7547, + "step": 33787 + }, + { + "epoch": 0.20094680749833477, + "grad_norm": 1.3932676315307617, + "learning_rate": 4.518190218598054e-05, + "loss": 3.922, + "step": 33788 + }, + { + "epoch": 0.20095275478161576, + "grad_norm": 1.5401780605316162, + "learning_rate": 4.518162651305867e-05, + "loss": 3.8633, + "step": 33789 + }, + { + "epoch": 0.20095870206489674, + "grad_norm": 1.9733563661575317, + "learning_rate": 4.5181350833091616e-05, + "loss": 4.2922, + "step": 33790 + }, + { + "epoch": 0.20096464934817776, + "grad_norm": 1.675879955291748, + "learning_rate": 4.5181075146079456e-05, + "loss": 3.6927, + "step": 33791 + }, + { + "epoch": 0.20097059663145875, + "grad_norm": 1.6591668128967285, + "learning_rate": 4.5180799452022294e-05, + "loss": 3.7511, + "step": 33792 + }, + { + "epoch": 0.20097654391473974, + "grad_norm": 1.493525505065918, + "learning_rate": 4.518052375092022e-05, + "loss": 3.9572, + "step": 33793 + }, + { + "epoch": 0.20098249119802075, + "grad_norm": 1.5096441507339478, + "learning_rate": 4.5180248042773344e-05, + "loss": 4.4723, + "step": 33794 + }, + { + "epoch": 0.20098843848130174, + "grad_norm": 1.6198865175247192, + "learning_rate": 4.517997232758174e-05, + "loss": 3.968, + "step": 33795 + }, + { + "epoch": 0.20099438576458273, + "grad_norm": 1.4703052043914795, + "learning_rate": 4.517969660534552e-05, + "loss": 3.962, + "step": 33796 + }, + { + "epoch": 0.20100033304786374, + "grad_norm": 1.6844958066940308, + "learning_rate": 4.5179420876064776e-05, + "loss": 3.9622, + "step": 33797 + }, + { + "epoch": 0.20100628033114473, + "grad_norm": 1.5714399814605713, + "learning_rate": 4.5179145139739605e-05, + "loss": 3.6723, + "step": 33798 + }, + { + "epoch": 0.20101222761442572, + "grad_norm": 1.3336405754089355, + "learning_rate": 4.51788693963701e-05, + "loss": 4.0349, + "step": 33799 + }, + { + "epoch": 0.20101817489770674, + "grad_norm": 2.3248112201690674, + "learning_rate": 4.517859364595637e-05, + "loss": 4.4507, + "step": 33800 + }, + { + "epoch": 0.20102412218098772, + "grad_norm": 1.7180213928222656, + "learning_rate": 4.517831788849849e-05, + "loss": 4.7463, + "step": 33801 + }, + { + "epoch": 0.2010300694642687, + "grad_norm": 1.627234697341919, + "learning_rate": 4.5178042123996565e-05, + "loss": 4.4101, + "step": 33802 + }, + { + "epoch": 0.20103601674754973, + "grad_norm": 1.6691185235977173, + "learning_rate": 4.517776635245071e-05, + "loss": 4.6427, + "step": 33803 + }, + { + "epoch": 0.20104196403083072, + "grad_norm": 1.580978512763977, + "learning_rate": 4.517749057386099e-05, + "loss": 3.8459, + "step": 33804 + }, + { + "epoch": 0.2010479113141117, + "grad_norm": 1.5489826202392578, + "learning_rate": 4.5177214788227526e-05, + "loss": 4.1169, + "step": 33805 + }, + { + "epoch": 0.20105385859739272, + "grad_norm": 2.0057342052459717, + "learning_rate": 4.51769389955504e-05, + "loss": 4.6964, + "step": 33806 + }, + { + "epoch": 0.2010598058806737, + "grad_norm": 1.6826112270355225, + "learning_rate": 4.517666319582972e-05, + "loss": 5.1589, + "step": 33807 + }, + { + "epoch": 0.2010657531639547, + "grad_norm": 1.729201316833496, + "learning_rate": 4.5176387389065564e-05, + "loss": 4.9901, + "step": 33808 + }, + { + "epoch": 0.2010717004472357, + "grad_norm": 1.7442471981048584, + "learning_rate": 4.517611157525805e-05, + "loss": 4.9073, + "step": 33809 + }, + { + "epoch": 0.2010776477305167, + "grad_norm": 1.923149585723877, + "learning_rate": 4.5175835754407256e-05, + "loss": 4.8148, + "step": 33810 + }, + { + "epoch": 0.2010835950137977, + "grad_norm": 2.2062087059020996, + "learning_rate": 4.517555992651329e-05, + "loss": 4.0303, + "step": 33811 + }, + { + "epoch": 0.2010895422970787, + "grad_norm": 1.5704069137573242, + "learning_rate": 4.517528409157624e-05, + "loss": 5.2119, + "step": 33812 + }, + { + "epoch": 0.2010954895803597, + "grad_norm": 1.6825261116027832, + "learning_rate": 4.517500824959621e-05, + "loss": 4.8707, + "step": 33813 + }, + { + "epoch": 0.20110143686364068, + "grad_norm": 1.5696799755096436, + "learning_rate": 4.517473240057329e-05, + "loss": 4.7079, + "step": 33814 + }, + { + "epoch": 0.2011073841469217, + "grad_norm": 1.6693792343139648, + "learning_rate": 4.5174456544507594e-05, + "loss": 5.0667, + "step": 33815 + }, + { + "epoch": 0.20111333143020269, + "grad_norm": 1.5435715913772583, + "learning_rate": 4.517418068139919e-05, + "loss": 5.1521, + "step": 33816 + }, + { + "epoch": 0.20111927871348367, + "grad_norm": 1.5700812339782715, + "learning_rate": 4.517390481124819e-05, + "loss": 5.1064, + "step": 33817 + }, + { + "epoch": 0.2011252259967647, + "grad_norm": 1.550162434577942, + "learning_rate": 4.5173628934054694e-05, + "loss": 4.284, + "step": 33818 + }, + { + "epoch": 0.20113117328004568, + "grad_norm": 1.7881672382354736, + "learning_rate": 4.517335304981878e-05, + "loss": 4.2866, + "step": 33819 + }, + { + "epoch": 0.20113712056332667, + "grad_norm": 1.7079659700393677, + "learning_rate": 4.5173077158540566e-05, + "loss": 4.5039, + "step": 33820 + }, + { + "epoch": 0.20114306784660768, + "grad_norm": 1.5491669178009033, + "learning_rate": 4.517280126022014e-05, + "loss": 4.671, + "step": 33821 + }, + { + "epoch": 0.20114901512988867, + "grad_norm": 1.63919997215271, + "learning_rate": 4.517252535485759e-05, + "loss": 4.7127, + "step": 33822 + }, + { + "epoch": 0.20115496241316966, + "grad_norm": 1.8322843313217163, + "learning_rate": 4.517224944245303e-05, + "loss": 4.6952, + "step": 33823 + }, + { + "epoch": 0.20116090969645067, + "grad_norm": 1.7782399654388428, + "learning_rate": 4.517197352300654e-05, + "loss": 4.6892, + "step": 33824 + }, + { + "epoch": 0.20116685697973166, + "grad_norm": 1.7981961965560913, + "learning_rate": 4.517169759651823e-05, + "loss": 4.5741, + "step": 33825 + }, + { + "epoch": 0.20117280426301265, + "grad_norm": 1.8265764713287354, + "learning_rate": 4.5171421662988175e-05, + "loss": 4.4527, + "step": 33826 + }, + { + "epoch": 0.20117875154629367, + "grad_norm": 1.6261963844299316, + "learning_rate": 4.517114572241649e-05, + "loss": 4.4656, + "step": 33827 + }, + { + "epoch": 0.20118469882957465, + "grad_norm": 1.478434681892395, + "learning_rate": 4.517086977480327e-05, + "loss": 4.674, + "step": 33828 + }, + { + "epoch": 0.20119064611285564, + "grad_norm": 2.420952796936035, + "learning_rate": 4.517059382014861e-05, + "loss": 3.497, + "step": 33829 + }, + { + "epoch": 0.20119659339613666, + "grad_norm": 1.835784912109375, + "learning_rate": 4.51703178584526e-05, + "loss": 4.1869, + "step": 33830 + }, + { + "epoch": 0.20120254067941765, + "grad_norm": 1.6024458408355713, + "learning_rate": 4.517004188971534e-05, + "loss": 4.1086, + "step": 33831 + }, + { + "epoch": 0.20120848796269863, + "grad_norm": 2.3725204467773438, + "learning_rate": 4.516976591393692e-05, + "loss": 3.6407, + "step": 33832 + }, + { + "epoch": 0.20121443524597965, + "grad_norm": 2.743121862411499, + "learning_rate": 4.516948993111746e-05, + "loss": 4.2811, + "step": 33833 + }, + { + "epoch": 0.20122038252926064, + "grad_norm": 1.8155949115753174, + "learning_rate": 4.5169213941257024e-05, + "loss": 4.5768, + "step": 33834 + }, + { + "epoch": 0.20122632981254163, + "grad_norm": 1.7074800729751587, + "learning_rate": 4.516893794435574e-05, + "loss": 4.6348, + "step": 33835 + }, + { + "epoch": 0.20123227709582261, + "grad_norm": 1.7050331830978394, + "learning_rate": 4.516866194041367e-05, + "loss": 4.7784, + "step": 33836 + }, + { + "epoch": 0.20123822437910363, + "grad_norm": 1.6249829530715942, + "learning_rate": 4.516838592943094e-05, + "loss": 4.4591, + "step": 33837 + }, + { + "epoch": 0.20124417166238462, + "grad_norm": 1.6271724700927734, + "learning_rate": 4.516810991140763e-05, + "loss": 4.4689, + "step": 33838 + }, + { + "epoch": 0.2012501189456656, + "grad_norm": 1.787264108657837, + "learning_rate": 4.516783388634385e-05, + "loss": 4.3448, + "step": 33839 + }, + { + "epoch": 0.20125606622894662, + "grad_norm": 1.6502000093460083, + "learning_rate": 4.516755785423967e-05, + "loss": 4.761, + "step": 33840 + }, + { + "epoch": 0.2012620135122276, + "grad_norm": 1.768717885017395, + "learning_rate": 4.5167281815095216e-05, + "loss": 4.6362, + "step": 33841 + }, + { + "epoch": 0.2012679607955086, + "grad_norm": 1.5358744859695435, + "learning_rate": 4.5167005768910573e-05, + "loss": 4.6384, + "step": 33842 + }, + { + "epoch": 0.20127390807878962, + "grad_norm": 1.7373604774475098, + "learning_rate": 4.5166729715685833e-05, + "loss": 4.4974, + "step": 33843 + }, + { + "epoch": 0.2012798553620706, + "grad_norm": 1.7411723136901855, + "learning_rate": 4.51664536554211e-05, + "loss": 4.6346, + "step": 33844 + }, + { + "epoch": 0.2012858026453516, + "grad_norm": 1.4830048084259033, + "learning_rate": 4.516617758811647e-05, + "loss": 4.4465, + "step": 33845 + }, + { + "epoch": 0.2012917499286326, + "grad_norm": 1.7425602674484253, + "learning_rate": 4.5165901513772025e-05, + "loss": 4.3604, + "step": 33846 + }, + { + "epoch": 0.2012976972119136, + "grad_norm": 1.4752614498138428, + "learning_rate": 4.516562543238787e-05, + "loss": 4.4717, + "step": 33847 + }, + { + "epoch": 0.20130364449519458, + "grad_norm": 1.6776503324508667, + "learning_rate": 4.5165349343964115e-05, + "loss": 4.4002, + "step": 33848 + }, + { + "epoch": 0.2013095917784756, + "grad_norm": 2.209038734436035, + "learning_rate": 4.516507324850084e-05, + "loss": 4.381, + "step": 33849 + }, + { + "epoch": 0.2013155390617566, + "grad_norm": 2.257248878479004, + "learning_rate": 4.516479714599814e-05, + "loss": 4.6487, + "step": 33850 + }, + { + "epoch": 0.20132148634503758, + "grad_norm": 1.6058926582336426, + "learning_rate": 4.516452103645613e-05, + "loss": 4.7832, + "step": 33851 + }, + { + "epoch": 0.2013274336283186, + "grad_norm": 2.744135856628418, + "learning_rate": 4.5164244919874885e-05, + "loss": 4.0109, + "step": 33852 + }, + { + "epoch": 0.20133338091159958, + "grad_norm": 1.5897787809371948, + "learning_rate": 4.516396879625451e-05, + "loss": 4.5663, + "step": 33853 + }, + { + "epoch": 0.20133932819488057, + "grad_norm": 1.8678447008132935, + "learning_rate": 4.516369266559511e-05, + "loss": 4.2331, + "step": 33854 + }, + { + "epoch": 0.20134527547816158, + "grad_norm": 2.034632921218872, + "learning_rate": 4.516341652789676e-05, + "loss": 4.3551, + "step": 33855 + }, + { + "epoch": 0.20135122276144257, + "grad_norm": 1.9875417947769165, + "learning_rate": 4.5163140383159586e-05, + "loss": 4.203, + "step": 33856 + }, + { + "epoch": 0.20135717004472356, + "grad_norm": 1.689079999923706, + "learning_rate": 4.516286423138366e-05, + "loss": 4.3866, + "step": 33857 + }, + { + "epoch": 0.20136311732800458, + "grad_norm": 1.6041475534439087, + "learning_rate": 4.516258807256908e-05, + "loss": 4.6978, + "step": 33858 + }, + { + "epoch": 0.20136906461128556, + "grad_norm": 1.5705976486206055, + "learning_rate": 4.516231190671596e-05, + "loss": 4.62, + "step": 33859 + }, + { + "epoch": 0.20137501189456655, + "grad_norm": 1.814264178276062, + "learning_rate": 4.516203573382438e-05, + "loss": 4.3952, + "step": 33860 + }, + { + "epoch": 0.20138095917784757, + "grad_norm": 1.797286868095398, + "learning_rate": 4.516175955389445e-05, + "loss": 4.2057, + "step": 33861 + }, + { + "epoch": 0.20138690646112856, + "grad_norm": 1.5275870561599731, + "learning_rate": 4.516148336692624e-05, + "loss": 4.4812, + "step": 33862 + }, + { + "epoch": 0.20139285374440954, + "grad_norm": 1.4183309078216553, + "learning_rate": 4.5161207172919875e-05, + "loss": 4.6353, + "step": 33863 + }, + { + "epoch": 0.20139880102769056, + "grad_norm": 2.1223907470703125, + "learning_rate": 4.516093097187544e-05, + "loss": 4.6249, + "step": 33864 + }, + { + "epoch": 0.20140474831097155, + "grad_norm": 2.0036821365356445, + "learning_rate": 4.516065476379303e-05, + "loss": 4.9462, + "step": 33865 + }, + { + "epoch": 0.20141069559425254, + "grad_norm": 1.7337446212768555, + "learning_rate": 4.516037854867275e-05, + "loss": 4.7494, + "step": 33866 + }, + { + "epoch": 0.20141664287753355, + "grad_norm": 1.7076916694641113, + "learning_rate": 4.516010232651469e-05, + "loss": 4.0989, + "step": 33867 + }, + { + "epoch": 0.20142259016081454, + "grad_norm": 1.617569088935852, + "learning_rate": 4.5159826097318934e-05, + "loss": 4.2642, + "step": 33868 + }, + { + "epoch": 0.20142853744409553, + "grad_norm": 1.8856641054153442, + "learning_rate": 4.5159549861085604e-05, + "loss": 3.8425, + "step": 33869 + }, + { + "epoch": 0.20143448472737654, + "grad_norm": 2.1982221603393555, + "learning_rate": 4.515927361781478e-05, + "loss": 3.4169, + "step": 33870 + }, + { + "epoch": 0.20144043201065753, + "grad_norm": 2.361307144165039, + "learning_rate": 4.515899736750656e-05, + "loss": 3.1205, + "step": 33871 + }, + { + "epoch": 0.20144637929393852, + "grad_norm": 1.8608986139297485, + "learning_rate": 4.515872111016104e-05, + "loss": 4.3047, + "step": 33872 + }, + { + "epoch": 0.20145232657721954, + "grad_norm": 1.6542391777038574, + "learning_rate": 4.515844484577833e-05, + "loss": 4.7464, + "step": 33873 + }, + { + "epoch": 0.20145827386050053, + "grad_norm": 1.8147705793380737, + "learning_rate": 4.51581685743585e-05, + "loss": 4.2698, + "step": 33874 + }, + { + "epoch": 0.2014642211437815, + "grad_norm": 2.070788621902466, + "learning_rate": 4.515789229590167e-05, + "loss": 4.0678, + "step": 33875 + }, + { + "epoch": 0.20147016842706253, + "grad_norm": 1.9720908403396606, + "learning_rate": 4.5157616010407934e-05, + "loss": 4.0804, + "step": 33876 + }, + { + "epoch": 0.20147611571034352, + "grad_norm": 2.033067464828491, + "learning_rate": 4.5157339717877366e-05, + "loss": 3.8606, + "step": 33877 + }, + { + "epoch": 0.2014820629936245, + "grad_norm": 2.8475182056427, + "learning_rate": 4.5157063418310095e-05, + "loss": 2.9459, + "step": 33878 + }, + { + "epoch": 0.20148801027690552, + "grad_norm": 2.976738929748535, + "learning_rate": 4.5156787111706196e-05, + "loss": 2.9561, + "step": 33879 + }, + { + "epoch": 0.2014939575601865, + "grad_norm": 2.4976749420166016, + "learning_rate": 4.5156510798065764e-05, + "loss": 3.0772, + "step": 33880 + }, + { + "epoch": 0.2014999048434675, + "grad_norm": 1.491884469985962, + "learning_rate": 4.5156234477388914e-05, + "loss": 5.0386, + "step": 33881 + }, + { + "epoch": 0.2015058521267485, + "grad_norm": 1.7481471300125122, + "learning_rate": 4.515595814967573e-05, + "loss": 5.1319, + "step": 33882 + }, + { + "epoch": 0.2015117994100295, + "grad_norm": 1.8939447402954102, + "learning_rate": 4.51556818149263e-05, + "loss": 5.3305, + "step": 33883 + }, + { + "epoch": 0.2015177466933105, + "grad_norm": 1.7944999933242798, + "learning_rate": 4.515540547314073e-05, + "loss": 5.2071, + "step": 33884 + }, + { + "epoch": 0.2015236939765915, + "grad_norm": 1.799474835395813, + "learning_rate": 4.515512912431912e-05, + "loss": 5.1697, + "step": 33885 + }, + { + "epoch": 0.2015296412598725, + "grad_norm": 1.777791976928711, + "learning_rate": 4.515485276846157e-05, + "loss": 4.4441, + "step": 33886 + }, + { + "epoch": 0.20153558854315348, + "grad_norm": 1.6787590980529785, + "learning_rate": 4.5154576405568154e-05, + "loss": 4.5804, + "step": 33887 + }, + { + "epoch": 0.2015415358264345, + "grad_norm": 1.602138876914978, + "learning_rate": 4.515430003563899e-05, + "loss": 4.9907, + "step": 33888 + }, + { + "epoch": 0.2015474831097155, + "grad_norm": 1.4458924531936646, + "learning_rate": 4.515402365867417e-05, + "loss": 5.0, + "step": 33889 + }, + { + "epoch": 0.20155343039299647, + "grad_norm": 1.3784939050674438, + "learning_rate": 4.515374727467379e-05, + "loss": 5.036, + "step": 33890 + }, + { + "epoch": 0.2015593776762775, + "grad_norm": 1.384204626083374, + "learning_rate": 4.515347088363794e-05, + "loss": 4.9079, + "step": 33891 + }, + { + "epoch": 0.20156532495955848, + "grad_norm": 1.595136046409607, + "learning_rate": 4.515319448556673e-05, + "loss": 4.5787, + "step": 33892 + }, + { + "epoch": 0.20157127224283947, + "grad_norm": 1.3380727767944336, + "learning_rate": 4.515291808046024e-05, + "loss": 5.0094, + "step": 33893 + }, + { + "epoch": 0.20157721952612045, + "grad_norm": 1.488208293914795, + "learning_rate": 4.515264166831858e-05, + "loss": 5.0324, + "step": 33894 + }, + { + "epoch": 0.20158316680940147, + "grad_norm": 1.4779205322265625, + "learning_rate": 4.5152365249141835e-05, + "loss": 4.8467, + "step": 33895 + }, + { + "epoch": 0.20158911409268246, + "grad_norm": 1.5820229053497314, + "learning_rate": 4.515208882293011e-05, + "loss": 4.9763, + "step": 33896 + }, + { + "epoch": 0.20159506137596345, + "grad_norm": 1.2227067947387695, + "learning_rate": 4.51518123896835e-05, + "loss": 5.0146, + "step": 33897 + }, + { + "epoch": 0.20160100865924446, + "grad_norm": 1.7960015535354614, + "learning_rate": 4.51515359494021e-05, + "loss": 4.598, + "step": 33898 + }, + { + "epoch": 0.20160695594252545, + "grad_norm": 2.1942708492279053, + "learning_rate": 4.515125950208601e-05, + "loss": 3.9657, + "step": 33899 + }, + { + "epoch": 0.20161290322580644, + "grad_norm": 2.034914016723633, + "learning_rate": 4.5150983047735316e-05, + "loss": 4.5544, + "step": 33900 + }, + { + "epoch": 0.20161885050908745, + "grad_norm": 2.0738654136657715, + "learning_rate": 4.515070658635013e-05, + "loss": 3.9512, + "step": 33901 + }, + { + "epoch": 0.20162479779236844, + "grad_norm": 2.08801007270813, + "learning_rate": 4.515043011793053e-05, + "loss": 3.5748, + "step": 33902 + }, + { + "epoch": 0.20163074507564943, + "grad_norm": 2.220031261444092, + "learning_rate": 4.515015364247664e-05, + "loss": 3.6819, + "step": 33903 + }, + { + "epoch": 0.20163669235893045, + "grad_norm": 2.0764245986938477, + "learning_rate": 4.514987715998853e-05, + "loss": 4.2446, + "step": 33904 + }, + { + "epoch": 0.20164263964221144, + "grad_norm": 2.2536237239837646, + "learning_rate": 4.51496006704663e-05, + "loss": 3.6282, + "step": 33905 + }, + { + "epoch": 0.20164858692549242, + "grad_norm": 1.65569269657135, + "learning_rate": 4.514932417391006e-05, + "loss": 4.477, + "step": 33906 + }, + { + "epoch": 0.20165453420877344, + "grad_norm": 1.6719849109649658, + "learning_rate": 4.51490476703199e-05, + "loss": 4.447, + "step": 33907 + }, + { + "epoch": 0.20166048149205443, + "grad_norm": 1.5780644416809082, + "learning_rate": 4.514877115969591e-05, + "loss": 4.8047, + "step": 33908 + }, + { + "epoch": 0.20166642877533542, + "grad_norm": 1.6983767747879028, + "learning_rate": 4.5148494642038194e-05, + "loss": 4.4235, + "step": 33909 + }, + { + "epoch": 0.20167237605861643, + "grad_norm": 1.9663766622543335, + "learning_rate": 4.514821811734685e-05, + "loss": 4.1565, + "step": 33910 + }, + { + "epoch": 0.20167832334189742, + "grad_norm": 2.1460719108581543, + "learning_rate": 4.5147941585621965e-05, + "loss": 3.8833, + "step": 33911 + }, + { + "epoch": 0.2016842706251784, + "grad_norm": 1.7094260454177856, + "learning_rate": 4.5147665046863655e-05, + "loss": 4.3554, + "step": 33912 + }, + { + "epoch": 0.20169021790845942, + "grad_norm": 2.377586603164673, + "learning_rate": 4.5147388501071984e-05, + "loss": 3.4381, + "step": 33913 + }, + { + "epoch": 0.2016961651917404, + "grad_norm": 1.6335028409957886, + "learning_rate": 4.514711194824708e-05, + "loss": 4.8601, + "step": 33914 + }, + { + "epoch": 0.2017021124750214, + "grad_norm": 2.024763584136963, + "learning_rate": 4.514683538838903e-05, + "loss": 4.5755, + "step": 33915 + }, + { + "epoch": 0.20170805975830242, + "grad_norm": 1.655968427658081, + "learning_rate": 4.514655882149792e-05, + "loss": 4.7452, + "step": 33916 + }, + { + "epoch": 0.2017140070415834, + "grad_norm": 1.5172895193099976, + "learning_rate": 4.5146282247573855e-05, + "loss": 5.0361, + "step": 33917 + }, + { + "epoch": 0.2017199543248644, + "grad_norm": 1.302919626235962, + "learning_rate": 4.514600566661693e-05, + "loss": 5.2368, + "step": 33918 + }, + { + "epoch": 0.2017259016081454, + "grad_norm": 1.4833548069000244, + "learning_rate": 4.514572907862725e-05, + "loss": 5.1284, + "step": 33919 + }, + { + "epoch": 0.2017318488914264, + "grad_norm": 1.5283784866333008, + "learning_rate": 4.514545248360491e-05, + "loss": 5.1214, + "step": 33920 + }, + { + "epoch": 0.20173779617470738, + "grad_norm": 1.643585443496704, + "learning_rate": 4.514517588154998e-05, + "loss": 4.8686, + "step": 33921 + }, + { + "epoch": 0.2017437434579884, + "grad_norm": 1.5718209743499756, + "learning_rate": 4.5144899272462594e-05, + "loss": 4.824, + "step": 33922 + }, + { + "epoch": 0.2017496907412694, + "grad_norm": 1.4388155937194824, + "learning_rate": 4.514462265634283e-05, + "loss": 5.2063, + "step": 33923 + }, + { + "epoch": 0.20175563802455038, + "grad_norm": 1.2471232414245605, + "learning_rate": 4.5144346033190776e-05, + "loss": 5.1597, + "step": 33924 + }, + { + "epoch": 0.2017615853078314, + "grad_norm": 1.626516342163086, + "learning_rate": 4.514406940300655e-05, + "loss": 5.136, + "step": 33925 + }, + { + "epoch": 0.20176753259111238, + "grad_norm": 1.1768821477890015, + "learning_rate": 4.514379276579023e-05, + "loss": 5.1374, + "step": 33926 + }, + { + "epoch": 0.20177347987439337, + "grad_norm": 1.5507917404174805, + "learning_rate": 4.5143516121541926e-05, + "loss": 4.7123, + "step": 33927 + }, + { + "epoch": 0.20177942715767438, + "grad_norm": 1.611994981765747, + "learning_rate": 4.514323947026172e-05, + "loss": 3.8981, + "step": 33928 + }, + { + "epoch": 0.20178537444095537, + "grad_norm": 1.2168185710906982, + "learning_rate": 4.5142962811949724e-05, + "loss": 5.0275, + "step": 33929 + }, + { + "epoch": 0.20179132172423636, + "grad_norm": 1.4680912494659424, + "learning_rate": 4.514268614660603e-05, + "loss": 5.1313, + "step": 33930 + }, + { + "epoch": 0.20179726900751738, + "grad_norm": 1.660117506980896, + "learning_rate": 4.514240947423073e-05, + "loss": 5.1246, + "step": 33931 + }, + { + "epoch": 0.20180321629079837, + "grad_norm": 1.4809633493423462, + "learning_rate": 4.514213279482392e-05, + "loss": 5.2159, + "step": 33932 + }, + { + "epoch": 0.20180916357407935, + "grad_norm": 1.6122835874557495, + "learning_rate": 4.51418561083857e-05, + "loss": 4.674, + "step": 33933 + }, + { + "epoch": 0.20181511085736037, + "grad_norm": 1.5591886043548584, + "learning_rate": 4.5141579414916166e-05, + "loss": 5.2786, + "step": 33934 + }, + { + "epoch": 0.20182105814064136, + "grad_norm": 1.4378422498703003, + "learning_rate": 4.514130271441541e-05, + "loss": 5.3939, + "step": 33935 + }, + { + "epoch": 0.20182700542392235, + "grad_norm": 1.9341799020767212, + "learning_rate": 4.5141026006883543e-05, + "loss": 4.2788, + "step": 33936 + }, + { + "epoch": 0.20183295270720336, + "grad_norm": 1.7629951238632202, + "learning_rate": 4.514074929232065e-05, + "loss": 4.6655, + "step": 33937 + }, + { + "epoch": 0.20183889999048435, + "grad_norm": 2.02024245262146, + "learning_rate": 4.514047257072683e-05, + "loss": 4.1873, + "step": 33938 + }, + { + "epoch": 0.20184484727376534, + "grad_norm": 1.8670521974563599, + "learning_rate": 4.514019584210217e-05, + "loss": 4.762, + "step": 33939 + }, + { + "epoch": 0.20185079455704635, + "grad_norm": 1.35395085811615, + "learning_rate": 4.5139919106446796e-05, + "loss": 4.8601, + "step": 33940 + }, + { + "epoch": 0.20185674184032734, + "grad_norm": 1.3640669584274292, + "learning_rate": 4.5139642363760765e-05, + "loss": 4.6912, + "step": 33941 + }, + { + "epoch": 0.20186268912360833, + "grad_norm": 1.4075101613998413, + "learning_rate": 4.51393656140442e-05, + "loss": 5.0414, + "step": 33942 + }, + { + "epoch": 0.20186863640688935, + "grad_norm": 1.4277760982513428, + "learning_rate": 4.513908885729719e-05, + "loss": 5.2387, + "step": 33943 + }, + { + "epoch": 0.20187458369017033, + "grad_norm": 1.665337085723877, + "learning_rate": 4.5138812093519825e-05, + "loss": 4.6897, + "step": 33944 + }, + { + "epoch": 0.20188053097345132, + "grad_norm": 1.6986275911331177, + "learning_rate": 4.513853532271222e-05, + "loss": 5.1868, + "step": 33945 + }, + { + "epoch": 0.20188647825673234, + "grad_norm": 1.6409507989883423, + "learning_rate": 4.5138258544874455e-05, + "loss": 5.0873, + "step": 33946 + }, + { + "epoch": 0.20189242554001333, + "grad_norm": 1.5691696405410767, + "learning_rate": 4.513798176000663e-05, + "loss": 5.1351, + "step": 33947 + }, + { + "epoch": 0.20189837282329431, + "grad_norm": 1.490713119506836, + "learning_rate": 4.513770496810885e-05, + "loss": 5.1177, + "step": 33948 + }, + { + "epoch": 0.20190432010657533, + "grad_norm": 1.505738377571106, + "learning_rate": 4.51374281691812e-05, + "loss": 5.1999, + "step": 33949 + }, + { + "epoch": 0.20191026738985632, + "grad_norm": 1.6345856189727783, + "learning_rate": 4.5137151363223786e-05, + "loss": 5.1542, + "step": 33950 + }, + { + "epoch": 0.2019162146731373, + "grad_norm": 1.6463525295257568, + "learning_rate": 4.5136874550236696e-05, + "loss": 5.443, + "step": 33951 + }, + { + "epoch": 0.2019221619564183, + "grad_norm": 1.616943359375, + "learning_rate": 4.513659773022003e-05, + "loss": 5.5123, + "step": 33952 + }, + { + "epoch": 0.2019281092396993, + "grad_norm": 1.485422134399414, + "learning_rate": 4.513632090317389e-05, + "loss": 5.2979, + "step": 33953 + }, + { + "epoch": 0.2019340565229803, + "grad_norm": 1.629473328590393, + "learning_rate": 4.513604406909837e-05, + "loss": 5.1169, + "step": 33954 + }, + { + "epoch": 0.2019400038062613, + "grad_norm": 1.6643434762954712, + "learning_rate": 4.513576722799357e-05, + "loss": 4.8612, + "step": 33955 + }, + { + "epoch": 0.2019459510895423, + "grad_norm": 1.694492220878601, + "learning_rate": 4.513549037985957e-05, + "loss": 4.7354, + "step": 33956 + }, + { + "epoch": 0.2019518983728233, + "grad_norm": 1.9222434759140015, + "learning_rate": 4.513521352469648e-05, + "loss": 5.2706, + "step": 33957 + }, + { + "epoch": 0.20195784565610428, + "grad_norm": 1.6370993852615356, + "learning_rate": 4.513493666250439e-05, + "loss": 5.1291, + "step": 33958 + }, + { + "epoch": 0.2019637929393853, + "grad_norm": 1.7546459436416626, + "learning_rate": 4.5134659793283416e-05, + "loss": 5.1042, + "step": 33959 + }, + { + "epoch": 0.20196974022266628, + "grad_norm": 1.6431562900543213, + "learning_rate": 4.513438291703364e-05, + "loss": 4.8202, + "step": 33960 + }, + { + "epoch": 0.20197568750594727, + "grad_norm": 1.6383068561553955, + "learning_rate": 4.513410603375514e-05, + "loss": 5.1127, + "step": 33961 + }, + { + "epoch": 0.2019816347892283, + "grad_norm": 1.5812822580337524, + "learning_rate": 4.513382914344805e-05, + "loss": 5.1321, + "step": 33962 + }, + { + "epoch": 0.20198758207250928, + "grad_norm": 1.462621808052063, + "learning_rate": 4.513355224611244e-05, + "loss": 5.1102, + "step": 33963 + }, + { + "epoch": 0.20199352935579026, + "grad_norm": 1.5409513711929321, + "learning_rate": 4.5133275341748414e-05, + "loss": 5.4329, + "step": 33964 + }, + { + "epoch": 0.20199947663907128, + "grad_norm": 1.2433700561523438, + "learning_rate": 4.513299843035608e-05, + "loss": 4.8886, + "step": 33965 + }, + { + "epoch": 0.20200542392235227, + "grad_norm": 1.6082065105438232, + "learning_rate": 4.513272151193552e-05, + "loss": 4.9496, + "step": 33966 + }, + { + "epoch": 0.20201137120563326, + "grad_norm": 1.6117057800292969, + "learning_rate": 4.513244458648682e-05, + "loss": 4.6895, + "step": 33967 + }, + { + "epoch": 0.20201731848891427, + "grad_norm": 1.5260170698165894, + "learning_rate": 4.513216765401011e-05, + "loss": 4.9767, + "step": 33968 + }, + { + "epoch": 0.20202326577219526, + "grad_norm": 1.6406491994857788, + "learning_rate": 4.513189071450546e-05, + "loss": 5.0693, + "step": 33969 + }, + { + "epoch": 0.20202921305547625, + "grad_norm": 1.4740065336227417, + "learning_rate": 4.5131613767972975e-05, + "loss": 5.2069, + "step": 33970 + }, + { + "epoch": 0.20203516033875726, + "grad_norm": 1.6721255779266357, + "learning_rate": 4.513133681441276e-05, + "loss": 5.0749, + "step": 33971 + }, + { + "epoch": 0.20204110762203825, + "grad_norm": 1.666450023651123, + "learning_rate": 4.513105985382489e-05, + "loss": 5.1417, + "step": 33972 + }, + { + "epoch": 0.20204705490531924, + "grad_norm": 1.6091387271881104, + "learning_rate": 4.5130782886209484e-05, + "loss": 5.0461, + "step": 33973 + }, + { + "epoch": 0.20205300218860026, + "grad_norm": 1.525931715965271, + "learning_rate": 4.5130505911566624e-05, + "loss": 5.3005, + "step": 33974 + }, + { + "epoch": 0.20205894947188124, + "grad_norm": 1.5139743089675903, + "learning_rate": 4.513022892989641e-05, + "loss": 5.2355, + "step": 33975 + }, + { + "epoch": 0.20206489675516223, + "grad_norm": 1.7949497699737549, + "learning_rate": 4.512995194119896e-05, + "loss": 5.0951, + "step": 33976 + }, + { + "epoch": 0.20207084403844325, + "grad_norm": 1.5045291185379028, + "learning_rate": 4.512967494547433e-05, + "loss": 5.158, + "step": 33977 + }, + { + "epoch": 0.20207679132172424, + "grad_norm": 1.7383949756622314, + "learning_rate": 4.512939794272265e-05, + "loss": 5.1696, + "step": 33978 + }, + { + "epoch": 0.20208273860500522, + "grad_norm": 1.7070204019546509, + "learning_rate": 4.5129120932944005e-05, + "loss": 4.844, + "step": 33979 + }, + { + "epoch": 0.20208868588828624, + "grad_norm": 1.4247560501098633, + "learning_rate": 4.512884391613849e-05, + "loss": 5.0324, + "step": 33980 + }, + { + "epoch": 0.20209463317156723, + "grad_norm": 1.5811928510665894, + "learning_rate": 4.5128566892306195e-05, + "loss": 4.8644, + "step": 33981 + }, + { + "epoch": 0.20210058045484822, + "grad_norm": 1.5155131816864014, + "learning_rate": 4.5128289861447235e-05, + "loss": 5.2251, + "step": 33982 + }, + { + "epoch": 0.20210652773812923, + "grad_norm": 1.441920518875122, + "learning_rate": 4.5128012823561697e-05, + "loss": 5.0595, + "step": 33983 + }, + { + "epoch": 0.20211247502141022, + "grad_norm": 1.5248456001281738, + "learning_rate": 4.5127735778649674e-05, + "loss": 4.9004, + "step": 33984 + }, + { + "epoch": 0.2021184223046912, + "grad_norm": 1.614963173866272, + "learning_rate": 4.512745872671126e-05, + "loss": 4.7622, + "step": 33985 + }, + { + "epoch": 0.20212436958797222, + "grad_norm": 1.4812332391738892, + "learning_rate": 4.512718166774657e-05, + "loss": 4.6066, + "step": 33986 + }, + { + "epoch": 0.2021303168712532, + "grad_norm": 1.3561605215072632, + "learning_rate": 4.512690460175568e-05, + "loss": 4.5035, + "step": 33987 + }, + { + "epoch": 0.2021362641545342, + "grad_norm": 1.1321245431900024, + "learning_rate": 4.5126627528738704e-05, + "loss": 4.1359, + "step": 33988 + }, + { + "epoch": 0.20214221143781522, + "grad_norm": 1.7284629344940186, + "learning_rate": 4.512635044869573e-05, + "loss": 4.7861, + "step": 33989 + }, + { + "epoch": 0.2021481587210962, + "grad_norm": 1.4472488164901733, + "learning_rate": 4.512607336162685e-05, + "loss": 5.0742, + "step": 33990 + }, + { + "epoch": 0.2021541060043772, + "grad_norm": 1.425902009010315, + "learning_rate": 4.512579626753216e-05, + "loss": 4.73, + "step": 33991 + }, + { + "epoch": 0.2021600532876582, + "grad_norm": 1.4404271841049194, + "learning_rate": 4.512551916641178e-05, + "loss": 5.0756, + "step": 33992 + }, + { + "epoch": 0.2021660005709392, + "grad_norm": 1.6837798357009888, + "learning_rate": 4.512524205826577e-05, + "loss": 4.7235, + "step": 33993 + }, + { + "epoch": 0.20217194785422019, + "grad_norm": 1.9286775588989258, + "learning_rate": 4.512496494309426e-05, + "loss": 4.9271, + "step": 33994 + }, + { + "epoch": 0.2021778951375012, + "grad_norm": 2.1817314624786377, + "learning_rate": 4.512468782089733e-05, + "loss": 4.5035, + "step": 33995 + }, + { + "epoch": 0.2021838424207822, + "grad_norm": 1.429819107055664, + "learning_rate": 4.512441069167507e-05, + "loss": 4.6988, + "step": 33996 + }, + { + "epoch": 0.20218978970406318, + "grad_norm": 1.3980942964553833, + "learning_rate": 4.512413355542759e-05, + "loss": 5.0089, + "step": 33997 + }, + { + "epoch": 0.2021957369873442, + "grad_norm": 1.4934065341949463, + "learning_rate": 4.512385641215499e-05, + "loss": 5.0263, + "step": 33998 + }, + { + "epoch": 0.20220168427062518, + "grad_norm": 1.3305639028549194, + "learning_rate": 4.5123579261857354e-05, + "loss": 5.1148, + "step": 33999 + }, + { + "epoch": 0.20220763155390617, + "grad_norm": 1.524097204208374, + "learning_rate": 4.512330210453479e-05, + "loss": 4.9961, + "step": 34000 + }, + { + "epoch": 0.20221357883718719, + "grad_norm": 1.5130045413970947, + "learning_rate": 4.512302494018738e-05, + "loss": 5.0517, + "step": 34001 + }, + { + "epoch": 0.20221952612046817, + "grad_norm": 1.4187722206115723, + "learning_rate": 4.512274776881523e-05, + "loss": 5.2811, + "step": 34002 + }, + { + "epoch": 0.20222547340374916, + "grad_norm": 1.3560248613357544, + "learning_rate": 4.5122470590418446e-05, + "loss": 5.1782, + "step": 34003 + }, + { + "epoch": 0.20223142068703018, + "grad_norm": 1.6151503324508667, + "learning_rate": 4.5122193404997115e-05, + "loss": 5.0186, + "step": 34004 + }, + { + "epoch": 0.20223736797031117, + "grad_norm": 1.6382167339324951, + "learning_rate": 4.512191621255133e-05, + "loss": 4.7976, + "step": 34005 + }, + { + "epoch": 0.20224331525359215, + "grad_norm": 1.8903952836990356, + "learning_rate": 4.512163901308118e-05, + "loss": 4.5246, + "step": 34006 + }, + { + "epoch": 0.20224926253687317, + "grad_norm": 1.540955662727356, + "learning_rate": 4.512136180658679e-05, + "loss": 4.7971, + "step": 34007 + }, + { + "epoch": 0.20225520982015416, + "grad_norm": 1.3648852109909058, + "learning_rate": 4.512108459306824e-05, + "loss": 4.9859, + "step": 34008 + }, + { + "epoch": 0.20226115710343515, + "grad_norm": 2.3196678161621094, + "learning_rate": 4.512080737252562e-05, + "loss": 4.4534, + "step": 34009 + }, + { + "epoch": 0.20226710438671613, + "grad_norm": 2.2545480728149414, + "learning_rate": 4.512053014495904e-05, + "loss": 4.256, + "step": 34010 + }, + { + "epoch": 0.20227305166999715, + "grad_norm": 1.7504942417144775, + "learning_rate": 4.512025291036859e-05, + "loss": 5.193, + "step": 34011 + }, + { + "epoch": 0.20227899895327814, + "grad_norm": 1.8206931352615356, + "learning_rate": 4.5119975668754365e-05, + "loss": 4.8621, + "step": 34012 + }, + { + "epoch": 0.20228494623655913, + "grad_norm": 1.5588812828063965, + "learning_rate": 4.5119698420116465e-05, + "loss": 4.8035, + "step": 34013 + }, + { + "epoch": 0.20229089351984014, + "grad_norm": 2.13454532623291, + "learning_rate": 4.511942116445499e-05, + "loss": 4.1108, + "step": 34014 + }, + { + "epoch": 0.20229684080312113, + "grad_norm": 2.353149890899658, + "learning_rate": 4.511914390177002e-05, + "loss": 2.8856, + "step": 34015 + }, + { + "epoch": 0.20230278808640212, + "grad_norm": 1.998806357383728, + "learning_rate": 4.511886663206168e-05, + "loss": 4.9761, + "step": 34016 + }, + { + "epoch": 0.20230873536968313, + "grad_norm": 2.1776490211486816, + "learning_rate": 4.5118589355330045e-05, + "loss": 3.3337, + "step": 34017 + }, + { + "epoch": 0.20231468265296412, + "grad_norm": 1.7580403089523315, + "learning_rate": 4.5118312071575217e-05, + "loss": 4.3142, + "step": 34018 + }, + { + "epoch": 0.2023206299362451, + "grad_norm": 1.6570219993591309, + "learning_rate": 4.51180347807973e-05, + "loss": 4.3844, + "step": 34019 + }, + { + "epoch": 0.20232657721952613, + "grad_norm": 1.995206356048584, + "learning_rate": 4.511775748299638e-05, + "loss": 4.8618, + "step": 34020 + }, + { + "epoch": 0.20233252450280712, + "grad_norm": 3.040178060531616, + "learning_rate": 4.5117480178172555e-05, + "loss": 3.7443, + "step": 34021 + }, + { + "epoch": 0.2023384717860881, + "grad_norm": 1.5867849588394165, + "learning_rate": 4.511720286632593e-05, + "loss": 4.8468, + "step": 34022 + }, + { + "epoch": 0.20234441906936912, + "grad_norm": 1.6994186639785767, + "learning_rate": 4.51169255474566e-05, + "loss": 4.8807, + "step": 34023 + }, + { + "epoch": 0.2023503663526501, + "grad_norm": 1.6643023490905762, + "learning_rate": 4.511664822156465e-05, + "loss": 4.9895, + "step": 34024 + }, + { + "epoch": 0.2023563136359311, + "grad_norm": 2.3625648021698, + "learning_rate": 4.5116370888650195e-05, + "loss": 3.5997, + "step": 34025 + }, + { + "epoch": 0.2023622609192121, + "grad_norm": 2.570551633834839, + "learning_rate": 4.5116093548713324e-05, + "loss": 3.1305, + "step": 34026 + }, + { + "epoch": 0.2023682082024931, + "grad_norm": 2.3419370651245117, + "learning_rate": 4.5115816201754123e-05, + "loss": 3.6007, + "step": 34027 + }, + { + "epoch": 0.2023741554857741, + "grad_norm": 1.8358023166656494, + "learning_rate": 4.511553884777271e-05, + "loss": 4.8102, + "step": 34028 + }, + { + "epoch": 0.2023801027690551, + "grad_norm": 1.8780097961425781, + "learning_rate": 4.511526148676916e-05, + "loss": 4.7336, + "step": 34029 + }, + { + "epoch": 0.2023860500523361, + "grad_norm": 1.93792724609375, + "learning_rate": 4.5114984118743584e-05, + "loss": 4.3768, + "step": 34030 + }, + { + "epoch": 0.20239199733561708, + "grad_norm": 3.4534430503845215, + "learning_rate": 4.511470674369608e-05, + "loss": 3.6955, + "step": 34031 + }, + { + "epoch": 0.2023979446188981, + "grad_norm": 2.6207618713378906, + "learning_rate": 4.511442936162673e-05, + "loss": 3.5511, + "step": 34032 + }, + { + "epoch": 0.20240389190217908, + "grad_norm": 1.6200617551803589, + "learning_rate": 4.5114151972535646e-05, + "loss": 4.5561, + "step": 34033 + }, + { + "epoch": 0.20240983918546007, + "grad_norm": 1.6427030563354492, + "learning_rate": 4.511387457642292e-05, + "loss": 4.9699, + "step": 34034 + }, + { + "epoch": 0.2024157864687411, + "grad_norm": 2.553480863571167, + "learning_rate": 4.511359717328865e-05, + "loss": 3.473, + "step": 34035 + }, + { + "epoch": 0.20242173375202208, + "grad_norm": 2.211226224899292, + "learning_rate": 4.5113319763132924e-05, + "loss": 3.6738, + "step": 34036 + }, + { + "epoch": 0.20242768103530306, + "grad_norm": 2.54076886177063, + "learning_rate": 4.511304234595585e-05, + "loss": 3.5138, + "step": 34037 + }, + { + "epoch": 0.20243362831858408, + "grad_norm": 1.4781157970428467, + "learning_rate": 4.5112764921757524e-05, + "loss": 4.9329, + "step": 34038 + }, + { + "epoch": 0.20243957560186507, + "grad_norm": 2.3486785888671875, + "learning_rate": 4.5112487490538033e-05, + "loss": 3.5019, + "step": 34039 + }, + { + "epoch": 0.20244552288514606, + "grad_norm": 2.678544282913208, + "learning_rate": 4.511221005229748e-05, + "loss": 3.5641, + "step": 34040 + }, + { + "epoch": 0.20245147016842707, + "grad_norm": 2.3444156646728516, + "learning_rate": 4.5111932607035965e-05, + "loss": 3.2577, + "step": 34041 + }, + { + "epoch": 0.20245741745170806, + "grad_norm": 2.382840633392334, + "learning_rate": 4.5111655154753584e-05, + "loss": 3.2974, + "step": 34042 + }, + { + "epoch": 0.20246336473498905, + "grad_norm": 2.189680814743042, + "learning_rate": 4.511137769545043e-05, + "loss": 3.7787, + "step": 34043 + }, + { + "epoch": 0.20246931201827006, + "grad_norm": 2.6685993671417236, + "learning_rate": 4.511110022912661e-05, + "loss": 3.0316, + "step": 34044 + }, + { + "epoch": 0.20247525930155105, + "grad_norm": 2.4069671630859375, + "learning_rate": 4.51108227557822e-05, + "loss": 3.2931, + "step": 34045 + }, + { + "epoch": 0.20248120658483204, + "grad_norm": 2.5283761024475098, + "learning_rate": 4.5110545275417314e-05, + "loss": 3.5309, + "step": 34046 + }, + { + "epoch": 0.20248715386811306, + "grad_norm": 2.511444330215454, + "learning_rate": 4.5110267788032044e-05, + "loss": 3.1936, + "step": 34047 + }, + { + "epoch": 0.20249310115139404, + "grad_norm": 2.352766275405884, + "learning_rate": 4.510999029362649e-05, + "loss": 2.993, + "step": 34048 + }, + { + "epoch": 0.20249904843467503, + "grad_norm": 1.8153971433639526, + "learning_rate": 4.510971279220074e-05, + "loss": 4.5221, + "step": 34049 + }, + { + "epoch": 0.20250499571795605, + "grad_norm": 2.084735155105591, + "learning_rate": 4.510943528375491e-05, + "loss": 4.4174, + "step": 34050 + }, + { + "epoch": 0.20251094300123704, + "grad_norm": 1.5497907400131226, + "learning_rate": 4.510915776828907e-05, + "loss": 4.3793, + "step": 34051 + }, + { + "epoch": 0.20251689028451803, + "grad_norm": 2.8055882453918457, + "learning_rate": 4.510888024580333e-05, + "loss": 2.9198, + "step": 34052 + }, + { + "epoch": 0.20252283756779904, + "grad_norm": 2.464205265045166, + "learning_rate": 4.5108602716297805e-05, + "loss": 2.8499, + "step": 34053 + }, + { + "epoch": 0.20252878485108003, + "grad_norm": 2.158693313598633, + "learning_rate": 4.5108325179772556e-05, + "loss": 4.354, + "step": 34054 + }, + { + "epoch": 0.20253473213436102, + "grad_norm": 1.828605055809021, + "learning_rate": 4.5108047636227715e-05, + "loss": 4.848, + "step": 34055 + }, + { + "epoch": 0.20254067941764203, + "grad_norm": 1.7504563331604004, + "learning_rate": 4.510777008566335e-05, + "loss": 4.8546, + "step": 34056 + }, + { + "epoch": 0.20254662670092302, + "grad_norm": 1.7161656618118286, + "learning_rate": 4.510749252807957e-05, + "loss": 4.8179, + "step": 34057 + }, + { + "epoch": 0.202552573984204, + "grad_norm": 1.610592007637024, + "learning_rate": 4.5107214963476476e-05, + "loss": 4.8187, + "step": 34058 + }, + { + "epoch": 0.20255852126748503, + "grad_norm": 1.55141282081604, + "learning_rate": 4.5106937391854167e-05, + "loss": 4.9095, + "step": 34059 + }, + { + "epoch": 0.202564468550766, + "grad_norm": 1.8562514781951904, + "learning_rate": 4.5106659813212725e-05, + "loss": 4.6624, + "step": 34060 + }, + { + "epoch": 0.202570415834047, + "grad_norm": 2.3251969814300537, + "learning_rate": 4.510638222755226e-05, + "loss": 4.1922, + "step": 34061 + }, + { + "epoch": 0.20257636311732802, + "grad_norm": 1.9926371574401855, + "learning_rate": 4.510610463487286e-05, + "loss": 4.1985, + "step": 34062 + }, + { + "epoch": 0.202582310400609, + "grad_norm": 1.8390743732452393, + "learning_rate": 4.5105827035174634e-05, + "loss": 4.7285, + "step": 34063 + }, + { + "epoch": 0.20258825768389, + "grad_norm": 1.6606966257095337, + "learning_rate": 4.510554942845766e-05, + "loss": 4.9654, + "step": 34064 + }, + { + "epoch": 0.202594204967171, + "grad_norm": 1.6574113368988037, + "learning_rate": 4.510527181472205e-05, + "loss": 5.1899, + "step": 34065 + }, + { + "epoch": 0.202600152250452, + "grad_norm": 2.2811429500579834, + "learning_rate": 4.510499419396791e-05, + "loss": 3.9173, + "step": 34066 + }, + { + "epoch": 0.202606099533733, + "grad_norm": 1.7340202331542969, + "learning_rate": 4.510471656619531e-05, + "loss": 4.7533, + "step": 34067 + }, + { + "epoch": 0.20261204681701397, + "grad_norm": 1.6960166692733765, + "learning_rate": 4.5104438931404366e-05, + "loss": 4.6543, + "step": 34068 + }, + { + "epoch": 0.202617994100295, + "grad_norm": 1.5477968454360962, + "learning_rate": 4.510416128959517e-05, + "loss": 4.913, + "step": 34069 + }, + { + "epoch": 0.20262394138357598, + "grad_norm": 1.810110330581665, + "learning_rate": 4.510388364076782e-05, + "loss": 4.7998, + "step": 34070 + }, + { + "epoch": 0.20262988866685697, + "grad_norm": 2.4154820442199707, + "learning_rate": 4.5103605984922416e-05, + "loss": 3.3427, + "step": 34071 + }, + { + "epoch": 0.20263583595013798, + "grad_norm": 1.464949369430542, + "learning_rate": 4.5103328322059046e-05, + "loss": 4.9067, + "step": 34072 + }, + { + "epoch": 0.20264178323341897, + "grad_norm": 1.944841742515564, + "learning_rate": 4.510305065217781e-05, + "loss": 4.2708, + "step": 34073 + }, + { + "epoch": 0.20264773051669996, + "grad_norm": 1.911776065826416, + "learning_rate": 4.5102772975278805e-05, + "loss": 4.2951, + "step": 34074 + }, + { + "epoch": 0.20265367779998097, + "grad_norm": 2.0011467933654785, + "learning_rate": 4.510249529136213e-05, + "loss": 4.2628, + "step": 34075 + }, + { + "epoch": 0.20265962508326196, + "grad_norm": 1.8548624515533447, + "learning_rate": 4.5102217600427887e-05, + "loss": 3.9783, + "step": 34076 + }, + { + "epoch": 0.20266557236654295, + "grad_norm": 1.7101125717163086, + "learning_rate": 4.510193990247616e-05, + "loss": 4.4064, + "step": 34077 + }, + { + "epoch": 0.20267151964982397, + "grad_norm": 1.4838210344314575, + "learning_rate": 4.510166219750707e-05, + "loss": 5.2525, + "step": 34078 + }, + { + "epoch": 0.20267746693310495, + "grad_norm": 1.4394530057907104, + "learning_rate": 4.510138448552068e-05, + "loss": 5.1155, + "step": 34079 + }, + { + "epoch": 0.20268341421638594, + "grad_norm": 1.5585321187973022, + "learning_rate": 4.510110676651711e-05, + "loss": 5.1436, + "step": 34080 + }, + { + "epoch": 0.20268936149966696, + "grad_norm": 1.3252408504486084, + "learning_rate": 4.510082904049645e-05, + "loss": 5.1799, + "step": 34081 + }, + { + "epoch": 0.20269530878294795, + "grad_norm": 1.5365374088287354, + "learning_rate": 4.51005513074588e-05, + "loss": 4.9753, + "step": 34082 + }, + { + "epoch": 0.20270125606622894, + "grad_norm": 1.5917723178863525, + "learning_rate": 4.510027356740426e-05, + "loss": 4.4666, + "step": 34083 + }, + { + "epoch": 0.20270720334950995, + "grad_norm": 1.5753759145736694, + "learning_rate": 4.509999582033292e-05, + "loss": 5.0185, + "step": 34084 + }, + { + "epoch": 0.20271315063279094, + "grad_norm": 1.7368061542510986, + "learning_rate": 4.5099718066244875e-05, + "loss": 4.8307, + "step": 34085 + }, + { + "epoch": 0.20271909791607193, + "grad_norm": 1.7237951755523682, + "learning_rate": 4.509944030514023e-05, + "loss": 4.736, + "step": 34086 + }, + { + "epoch": 0.20272504519935294, + "grad_norm": 1.337406873703003, + "learning_rate": 4.509916253701907e-05, + "loss": 5.0079, + "step": 34087 + }, + { + "epoch": 0.20273099248263393, + "grad_norm": 1.7189267873764038, + "learning_rate": 4.509888476188151e-05, + "loss": 4.861, + "step": 34088 + }, + { + "epoch": 0.20273693976591492, + "grad_norm": 1.5400909185409546, + "learning_rate": 4.509860697972763e-05, + "loss": 5.025, + "step": 34089 + }, + { + "epoch": 0.20274288704919594, + "grad_norm": 1.5735805034637451, + "learning_rate": 4.509832919055754e-05, + "loss": 5.1001, + "step": 34090 + }, + { + "epoch": 0.20274883433247692, + "grad_norm": 1.5908180475234985, + "learning_rate": 4.5098051394371324e-05, + "loss": 4.0066, + "step": 34091 + }, + { + "epoch": 0.2027547816157579, + "grad_norm": 1.4632060527801514, + "learning_rate": 4.509777359116909e-05, + "loss": 4.9066, + "step": 34092 + }, + { + "epoch": 0.20276072889903893, + "grad_norm": 1.6393321752548218, + "learning_rate": 4.5097495780950926e-05, + "loss": 4.9735, + "step": 34093 + }, + { + "epoch": 0.20276667618231992, + "grad_norm": 1.5407154560089111, + "learning_rate": 4.5097217963716946e-05, + "loss": 5.1568, + "step": 34094 + }, + { + "epoch": 0.2027726234656009, + "grad_norm": 1.3990727663040161, + "learning_rate": 4.509694013946723e-05, + "loss": 5.1463, + "step": 34095 + }, + { + "epoch": 0.20277857074888192, + "grad_norm": 1.9776240587234497, + "learning_rate": 4.509666230820187e-05, + "loss": 4.3307, + "step": 34096 + }, + { + "epoch": 0.2027845180321629, + "grad_norm": 1.737297534942627, + "learning_rate": 4.509638446992098e-05, + "loss": 4.6655, + "step": 34097 + }, + { + "epoch": 0.2027904653154439, + "grad_norm": 1.946708083152771, + "learning_rate": 4.5096106624624647e-05, + "loss": 4.4116, + "step": 34098 + }, + { + "epoch": 0.2027964125987249, + "grad_norm": 1.475724697113037, + "learning_rate": 4.509582877231298e-05, + "loss": 5.2965, + "step": 34099 + }, + { + "epoch": 0.2028023598820059, + "grad_norm": 2.5185513496398926, + "learning_rate": 4.509555091298605e-05, + "loss": 3.333, + "step": 34100 + }, + { + "epoch": 0.2028083071652869, + "grad_norm": 1.4091417789459229, + "learning_rate": 4.5095273046643985e-05, + "loss": 4.9255, + "step": 34101 + }, + { + "epoch": 0.2028142544485679, + "grad_norm": 1.1531707048416138, + "learning_rate": 4.509499517328686e-05, + "loss": 4.8433, + "step": 34102 + }, + { + "epoch": 0.2028202017318489, + "grad_norm": 1.0453072786331177, + "learning_rate": 4.509471729291479e-05, + "loss": 4.6551, + "step": 34103 + }, + { + "epoch": 0.20282614901512988, + "grad_norm": 0.9254010319709778, + "learning_rate": 4.509443940552785e-05, + "loss": 4.619, + "step": 34104 + }, + { + "epoch": 0.2028320962984109, + "grad_norm": 1.067936897277832, + "learning_rate": 4.5094161511126155e-05, + "loss": 4.2688, + "step": 34105 + }, + { + "epoch": 0.20283804358169188, + "grad_norm": 1.2932766675949097, + "learning_rate": 4.50938836097098e-05, + "loss": 4.6168, + "step": 34106 + }, + { + "epoch": 0.20284399086497287, + "grad_norm": 1.522346019744873, + "learning_rate": 4.509360570127887e-05, + "loss": 4.9451, + "step": 34107 + }, + { + "epoch": 0.2028499381482539, + "grad_norm": 1.7008284330368042, + "learning_rate": 4.509332778583347e-05, + "loss": 4.9488, + "step": 34108 + }, + { + "epoch": 0.20285588543153488, + "grad_norm": 1.7703099250793457, + "learning_rate": 4.50930498633737e-05, + "loss": 4.9046, + "step": 34109 + }, + { + "epoch": 0.20286183271481587, + "grad_norm": 1.7272570133209229, + "learning_rate": 4.509277193389965e-05, + "loss": 4.8267, + "step": 34110 + }, + { + "epoch": 0.20286777999809688, + "grad_norm": 1.2832982540130615, + "learning_rate": 4.5092493997411426e-05, + "loss": 4.5058, + "step": 34111 + }, + { + "epoch": 0.20287372728137787, + "grad_norm": 1.063335657119751, + "learning_rate": 4.509221605390912e-05, + "loss": 4.3458, + "step": 34112 + }, + { + "epoch": 0.20287967456465886, + "grad_norm": 0.9658304452896118, + "learning_rate": 4.509193810339283e-05, + "loss": 4.4834, + "step": 34113 + }, + { + "epoch": 0.20288562184793987, + "grad_norm": 1.211989164352417, + "learning_rate": 4.509166014586265e-05, + "loss": 4.3731, + "step": 34114 + }, + { + "epoch": 0.20289156913122086, + "grad_norm": 1.7385507822036743, + "learning_rate": 4.5091382181318675e-05, + "loss": 5.0037, + "step": 34115 + }, + { + "epoch": 0.20289751641450185, + "grad_norm": 1.5110931396484375, + "learning_rate": 4.5091104209761005e-05, + "loss": 5.127, + "step": 34116 + }, + { + "epoch": 0.20290346369778287, + "grad_norm": 1.644289255142212, + "learning_rate": 4.5090826231189745e-05, + "loss": 5.2034, + "step": 34117 + }, + { + "epoch": 0.20290941098106385, + "grad_norm": 1.3904880285263062, + "learning_rate": 4.509054824560498e-05, + "loss": 5.1311, + "step": 34118 + }, + { + "epoch": 0.20291535826434484, + "grad_norm": 1.6756666898727417, + "learning_rate": 4.509027025300682e-05, + "loss": 4.8252, + "step": 34119 + }, + { + "epoch": 0.20292130554762586, + "grad_norm": 1.3861212730407715, + "learning_rate": 4.508999225339534e-05, + "loss": 4.7141, + "step": 34120 + }, + { + "epoch": 0.20292725283090685, + "grad_norm": 1.4065701961517334, + "learning_rate": 4.5089714246770663e-05, + "loss": 4.5625, + "step": 34121 + }, + { + "epoch": 0.20293320011418783, + "grad_norm": 1.336972951889038, + "learning_rate": 4.508943623313288e-05, + "loss": 4.4038, + "step": 34122 + }, + { + "epoch": 0.20293914739746885, + "grad_norm": 1.7632920742034912, + "learning_rate": 4.5089158212482064e-05, + "loss": 5.1773, + "step": 34123 + }, + { + "epoch": 0.20294509468074984, + "grad_norm": 1.5751595497131348, + "learning_rate": 4.508888018481834e-05, + "loss": 4.7093, + "step": 34124 + }, + { + "epoch": 0.20295104196403083, + "grad_norm": 1.4306808710098267, + "learning_rate": 4.5088602150141793e-05, + "loss": 4.8948, + "step": 34125 + }, + { + "epoch": 0.20295698924731181, + "grad_norm": 1.533740758895874, + "learning_rate": 4.5088324108452525e-05, + "loss": 4.8152, + "step": 34126 + }, + { + "epoch": 0.20296293653059283, + "grad_norm": 1.5290772914886475, + "learning_rate": 4.508804605975063e-05, + "loss": 4.6585, + "step": 34127 + }, + { + "epoch": 0.20296888381387382, + "grad_norm": 1.3709888458251953, + "learning_rate": 4.508776800403621e-05, + "loss": 4.7503, + "step": 34128 + }, + { + "epoch": 0.2029748310971548, + "grad_norm": 1.2883923053741455, + "learning_rate": 4.5087489941309356e-05, + "loss": 4.4883, + "step": 34129 + }, + { + "epoch": 0.20298077838043582, + "grad_norm": 1.5060383081436157, + "learning_rate": 4.5087211871570165e-05, + "loss": 4.7866, + "step": 34130 + }, + { + "epoch": 0.2029867256637168, + "grad_norm": 1.5895962715148926, + "learning_rate": 4.5086933794818733e-05, + "loss": 4.8881, + "step": 34131 + }, + { + "epoch": 0.2029926729469978, + "grad_norm": 1.570587396621704, + "learning_rate": 4.5086655711055164e-05, + "loss": 4.7775, + "step": 34132 + }, + { + "epoch": 0.20299862023027881, + "grad_norm": 1.7003437280654907, + "learning_rate": 4.508637762027955e-05, + "loss": 4.9595, + "step": 34133 + }, + { + "epoch": 0.2030045675135598, + "grad_norm": 1.3333162069320679, + "learning_rate": 4.508609952249199e-05, + "loss": 5.1303, + "step": 34134 + }, + { + "epoch": 0.2030105147968408, + "grad_norm": 1.6453673839569092, + "learning_rate": 4.508582141769258e-05, + "loss": 4.1194, + "step": 34135 + }, + { + "epoch": 0.2030164620801218, + "grad_norm": 2.4057064056396484, + "learning_rate": 4.508554330588142e-05, + "loss": 4.0858, + "step": 34136 + }, + { + "epoch": 0.2030224093634028, + "grad_norm": 2.333036184310913, + "learning_rate": 4.508526518705859e-05, + "loss": 4.5886, + "step": 34137 + }, + { + "epoch": 0.20302835664668378, + "grad_norm": 1.5182788372039795, + "learning_rate": 4.5084987061224216e-05, + "loss": 5.316, + "step": 34138 + }, + { + "epoch": 0.2030343039299648, + "grad_norm": 1.2949062585830688, + "learning_rate": 4.5084708928378374e-05, + "loss": 5.1341, + "step": 34139 + }, + { + "epoch": 0.2030402512132458, + "grad_norm": 2.1052892208099365, + "learning_rate": 4.508443078852117e-05, + "loss": 4.5668, + "step": 34140 + }, + { + "epoch": 0.20304619849652678, + "grad_norm": 2.886911153793335, + "learning_rate": 4.50841526416527e-05, + "loss": 3.6236, + "step": 34141 + }, + { + "epoch": 0.2030521457798078, + "grad_norm": 1.5125616788864136, + "learning_rate": 4.5083874487773056e-05, + "loss": 4.6975, + "step": 34142 + }, + { + "epoch": 0.20305809306308878, + "grad_norm": 1.399048089981079, + "learning_rate": 4.5083596326882346e-05, + "loss": 4.2171, + "step": 34143 + }, + { + "epoch": 0.20306404034636977, + "grad_norm": 1.4590729475021362, + "learning_rate": 4.5083318158980656e-05, + "loss": 4.8204, + "step": 34144 + }, + { + "epoch": 0.20306998762965078, + "grad_norm": 1.7433021068572998, + "learning_rate": 4.508303998406809e-05, + "loss": 4.7279, + "step": 34145 + }, + { + "epoch": 0.20307593491293177, + "grad_norm": 1.47339928150177, + "learning_rate": 4.5082761802144736e-05, + "loss": 4.9708, + "step": 34146 + }, + { + "epoch": 0.20308188219621276, + "grad_norm": 2.5525825023651123, + "learning_rate": 4.5082483613210696e-05, + "loss": 4.5048, + "step": 34147 + }, + { + "epoch": 0.20308782947949378, + "grad_norm": 1.897265911102295, + "learning_rate": 4.5082205417266076e-05, + "loss": 4.8667, + "step": 34148 + }, + { + "epoch": 0.20309377676277476, + "grad_norm": 1.443208932876587, + "learning_rate": 4.508192721431096e-05, + "loss": 5.4088, + "step": 34149 + }, + { + "epoch": 0.20309972404605575, + "grad_norm": 2.650792121887207, + "learning_rate": 4.508164900434545e-05, + "loss": 4.1196, + "step": 34150 + }, + { + "epoch": 0.20310567132933677, + "grad_norm": 2.9030683040618896, + "learning_rate": 4.508137078736965e-05, + "loss": 3.1477, + "step": 34151 + }, + { + "epoch": 0.20311161861261776, + "grad_norm": 1.8367629051208496, + "learning_rate": 4.5081092563383645e-05, + "loss": 4.0951, + "step": 34152 + }, + { + "epoch": 0.20311756589589874, + "grad_norm": 1.3655685186386108, + "learning_rate": 4.508081433238754e-05, + "loss": 4.4232, + "step": 34153 + }, + { + "epoch": 0.20312351317917976, + "grad_norm": 1.5286078453063965, + "learning_rate": 4.5080536094381434e-05, + "loss": 4.3104, + "step": 34154 + }, + { + "epoch": 0.20312946046246075, + "grad_norm": 1.593637228012085, + "learning_rate": 4.508025784936542e-05, + "loss": 4.1811, + "step": 34155 + }, + { + "epoch": 0.20313540774574174, + "grad_norm": 1.498099446296692, + "learning_rate": 4.5079979597339586e-05, + "loss": 4.204, + "step": 34156 + }, + { + "epoch": 0.20314135502902275, + "grad_norm": 1.6303921937942505, + "learning_rate": 4.507970133830405e-05, + "loss": 4.3768, + "step": 34157 + }, + { + "epoch": 0.20314730231230374, + "grad_norm": 1.4380861520767212, + "learning_rate": 4.507942307225889e-05, + "loss": 4.4061, + "step": 34158 + }, + { + "epoch": 0.20315324959558473, + "grad_norm": 1.672142744064331, + "learning_rate": 4.5079144799204216e-05, + "loss": 4.8228, + "step": 34159 + }, + { + "epoch": 0.20315919687886574, + "grad_norm": 1.6014958620071411, + "learning_rate": 4.507886651914012e-05, + "loss": 4.7106, + "step": 34160 + }, + { + "epoch": 0.20316514416214673, + "grad_norm": 1.5370984077453613, + "learning_rate": 4.507858823206669e-05, + "loss": 4.8702, + "step": 34161 + }, + { + "epoch": 0.20317109144542772, + "grad_norm": 2.199638605117798, + "learning_rate": 4.507830993798404e-05, + "loss": 3.1011, + "step": 34162 + }, + { + "epoch": 0.20317703872870874, + "grad_norm": 1.298632025718689, + "learning_rate": 4.507803163689226e-05, + "loss": 5.0342, + "step": 34163 + }, + { + "epoch": 0.20318298601198972, + "grad_norm": 1.423470377922058, + "learning_rate": 4.5077753328791446e-05, + "loss": 4.6831, + "step": 34164 + }, + { + "epoch": 0.2031889332952707, + "grad_norm": 1.4942458868026733, + "learning_rate": 4.507747501368169e-05, + "loss": 4.7348, + "step": 34165 + }, + { + "epoch": 0.20319488057855173, + "grad_norm": 1.4068806171417236, + "learning_rate": 4.5077196691563104e-05, + "loss": 4.7561, + "step": 34166 + }, + { + "epoch": 0.20320082786183272, + "grad_norm": 1.4947446584701538, + "learning_rate": 4.5076918362435774e-05, + "loss": 4.6205, + "step": 34167 + }, + { + "epoch": 0.2032067751451137, + "grad_norm": 1.5509511232376099, + "learning_rate": 4.5076640026299794e-05, + "loss": 4.7343, + "step": 34168 + }, + { + "epoch": 0.20321272242839472, + "grad_norm": 1.7500367164611816, + "learning_rate": 4.5076361683155275e-05, + "loss": 4.948, + "step": 34169 + }, + { + "epoch": 0.2032186697116757, + "grad_norm": 1.6232200860977173, + "learning_rate": 4.5076083333002296e-05, + "loss": 5.1769, + "step": 34170 + }, + { + "epoch": 0.2032246169949567, + "grad_norm": 1.635056734085083, + "learning_rate": 4.507580497584097e-05, + "loss": 4.5965, + "step": 34171 + }, + { + "epoch": 0.2032305642782377, + "grad_norm": 1.6716241836547852, + "learning_rate": 4.507552661167138e-05, + "loss": 4.5625, + "step": 34172 + }, + { + "epoch": 0.2032365115615187, + "grad_norm": 1.4650036096572876, + "learning_rate": 4.5075248240493636e-05, + "loss": 4.8342, + "step": 34173 + }, + { + "epoch": 0.2032424588447997, + "grad_norm": 1.595201015472412, + "learning_rate": 4.507496986230784e-05, + "loss": 4.8685, + "step": 34174 + }, + { + "epoch": 0.2032484061280807, + "grad_norm": 1.3592157363891602, + "learning_rate": 4.507469147711406e-05, + "loss": 4.6101, + "step": 34175 + }, + { + "epoch": 0.2032543534113617, + "grad_norm": 1.5765128135681152, + "learning_rate": 4.507441308491242e-05, + "loss": 4.5686, + "step": 34176 + }, + { + "epoch": 0.20326030069464268, + "grad_norm": 1.5563489198684692, + "learning_rate": 4.5074134685703016e-05, + "loss": 5.0364, + "step": 34177 + }, + { + "epoch": 0.2032662479779237, + "grad_norm": 1.671233057975769, + "learning_rate": 4.5073856279485936e-05, + "loss": 4.8082, + "step": 34178 + }, + { + "epoch": 0.20327219526120469, + "grad_norm": 1.494294285774231, + "learning_rate": 4.5073577866261285e-05, + "loss": 5.3636, + "step": 34179 + }, + { + "epoch": 0.20327814254448567, + "grad_norm": 1.52043879032135, + "learning_rate": 4.507329944602915e-05, + "loss": 5.1198, + "step": 34180 + }, + { + "epoch": 0.2032840898277667, + "grad_norm": 2.0095272064208984, + "learning_rate": 4.5073021018789635e-05, + "loss": 4.2995, + "step": 34181 + }, + { + "epoch": 0.20329003711104768, + "grad_norm": 1.7347562313079834, + "learning_rate": 4.507274258454283e-05, + "loss": 4.8881, + "step": 34182 + }, + { + "epoch": 0.20329598439432867, + "grad_norm": 1.5348436832427979, + "learning_rate": 4.5072464143288844e-05, + "loss": 4.6506, + "step": 34183 + }, + { + "epoch": 0.20330193167760965, + "grad_norm": 1.8851455450057983, + "learning_rate": 4.5072185695027766e-05, + "loss": 4.7133, + "step": 34184 + }, + { + "epoch": 0.20330787896089067, + "grad_norm": 1.6985150575637817, + "learning_rate": 4.50719072397597e-05, + "loss": 4.9648, + "step": 34185 + }, + { + "epoch": 0.20331382624417166, + "grad_norm": 2.302384853363037, + "learning_rate": 4.507162877748473e-05, + "loss": 4.061, + "step": 34186 + }, + { + "epoch": 0.20331977352745265, + "grad_norm": 2.0493087768554688, + "learning_rate": 4.507135030820297e-05, + "loss": 4.2728, + "step": 34187 + }, + { + "epoch": 0.20332572081073366, + "grad_norm": 1.9146785736083984, + "learning_rate": 4.5071071831914504e-05, + "loss": 4.6218, + "step": 34188 + }, + { + "epoch": 0.20333166809401465, + "grad_norm": 1.750434160232544, + "learning_rate": 4.507079334861943e-05, + "loss": 4.7743, + "step": 34189 + }, + { + "epoch": 0.20333761537729564, + "grad_norm": 1.74863600730896, + "learning_rate": 4.507051485831786e-05, + "loss": 5.0927, + "step": 34190 + }, + { + "epoch": 0.20334356266057665, + "grad_norm": 1.523288369178772, + "learning_rate": 4.507023636100988e-05, + "loss": 4.9635, + "step": 34191 + }, + { + "epoch": 0.20334950994385764, + "grad_norm": 1.5992393493652344, + "learning_rate": 4.506995785669558e-05, + "loss": 4.9328, + "step": 34192 + }, + { + "epoch": 0.20335545722713863, + "grad_norm": 1.365012764930725, + "learning_rate": 4.5069679345375064e-05, + "loss": 4.9001, + "step": 34193 + }, + { + "epoch": 0.20336140451041965, + "grad_norm": 2.0055863857269287, + "learning_rate": 4.506940082704844e-05, + "loss": 4.8008, + "step": 34194 + }, + { + "epoch": 0.20336735179370063, + "grad_norm": 2.1375856399536133, + "learning_rate": 4.506912230171579e-05, + "loss": 4.1867, + "step": 34195 + }, + { + "epoch": 0.20337329907698162, + "grad_norm": 2.8420791625976562, + "learning_rate": 4.506884376937721e-05, + "loss": 4.1056, + "step": 34196 + }, + { + "epoch": 0.20337924636026264, + "grad_norm": 2.5468852519989014, + "learning_rate": 4.506856523003282e-05, + "loss": 4.1181, + "step": 34197 + }, + { + "epoch": 0.20338519364354363, + "grad_norm": 1.7750333547592163, + "learning_rate": 4.506828668368269e-05, + "loss": 5.1556, + "step": 34198 + }, + { + "epoch": 0.20339114092682462, + "grad_norm": 1.6695621013641357, + "learning_rate": 4.506800813032693e-05, + "loss": 5.8686, + "step": 34199 + }, + { + "epoch": 0.20339708821010563, + "grad_norm": 1.627415418624878, + "learning_rate": 4.506772956996563e-05, + "loss": 5.079, + "step": 34200 + }, + { + "epoch": 0.20340303549338662, + "grad_norm": 1.6603140830993652, + "learning_rate": 4.50674510025989e-05, + "loss": 4.8437, + "step": 34201 + }, + { + "epoch": 0.2034089827766676, + "grad_norm": 1.7012217044830322, + "learning_rate": 4.5067172428226835e-05, + "loss": 5.4832, + "step": 34202 + }, + { + "epoch": 0.20341493005994862, + "grad_norm": 1.4697450399398804, + "learning_rate": 4.506689384684952e-05, + "loss": 5.5992, + "step": 34203 + }, + { + "epoch": 0.2034208773432296, + "grad_norm": 1.4704852104187012, + "learning_rate": 4.506661525846706e-05, + "loss": 5.1104, + "step": 34204 + }, + { + "epoch": 0.2034268246265106, + "grad_norm": 1.6582788228988647, + "learning_rate": 4.5066336663079554e-05, + "loss": 4.9394, + "step": 34205 + }, + { + "epoch": 0.20343277190979162, + "grad_norm": 1.7526438236236572, + "learning_rate": 4.50660580606871e-05, + "loss": 5.2653, + "step": 34206 + }, + { + "epoch": 0.2034387191930726, + "grad_norm": 1.5664905309677124, + "learning_rate": 4.506577945128978e-05, + "loss": 5.5802, + "step": 34207 + }, + { + "epoch": 0.2034446664763536, + "grad_norm": 1.4898643493652344, + "learning_rate": 4.506550083488772e-05, + "loss": 5.606, + "step": 34208 + }, + { + "epoch": 0.2034506137596346, + "grad_norm": 1.4939732551574707, + "learning_rate": 4.5065222211480996e-05, + "loss": 5.4784, + "step": 34209 + }, + { + "epoch": 0.2034565610429156, + "grad_norm": 1.7237370014190674, + "learning_rate": 4.5064943581069705e-05, + "loss": 5.0776, + "step": 34210 + }, + { + "epoch": 0.20346250832619658, + "grad_norm": 1.7304513454437256, + "learning_rate": 4.506466494365395e-05, + "loss": 4.688, + "step": 34211 + }, + { + "epoch": 0.2034684556094776, + "grad_norm": 1.7174402475357056, + "learning_rate": 4.5064386299233826e-05, + "loss": 4.977, + "step": 34212 + }, + { + "epoch": 0.2034744028927586, + "grad_norm": 1.7457705736160278, + "learning_rate": 4.5064107647809436e-05, + "loss": 5.553, + "step": 34213 + }, + { + "epoch": 0.20348035017603958, + "grad_norm": 1.737239956855774, + "learning_rate": 4.5063828989380876e-05, + "loss": 5.4613, + "step": 34214 + }, + { + "epoch": 0.2034862974593206, + "grad_norm": 1.6310393810272217, + "learning_rate": 4.506355032394824e-05, + "loss": 5.51, + "step": 34215 + }, + { + "epoch": 0.20349224474260158, + "grad_norm": 1.660376787185669, + "learning_rate": 4.506327165151162e-05, + "loss": 5.5371, + "step": 34216 + }, + { + "epoch": 0.20349819202588257, + "grad_norm": 1.5626025199890137, + "learning_rate": 4.506299297207113e-05, + "loss": 5.4861, + "step": 34217 + }, + { + "epoch": 0.20350413930916358, + "grad_norm": 1.654665470123291, + "learning_rate": 4.506271428562685e-05, + "loss": 4.5907, + "step": 34218 + }, + { + "epoch": 0.20351008659244457, + "grad_norm": 1.474399447441101, + "learning_rate": 4.506243559217887e-05, + "loss": 5.1529, + "step": 34219 + }, + { + "epoch": 0.20351603387572556, + "grad_norm": 1.4964390993118286, + "learning_rate": 4.506215689172733e-05, + "loss": 5.1153, + "step": 34220 + }, + { + "epoch": 0.20352198115900658, + "grad_norm": 1.9598966836929321, + "learning_rate": 4.506187818427228e-05, + "loss": 4.1903, + "step": 34221 + }, + { + "epoch": 0.20352792844228756, + "grad_norm": 1.8703410625457764, + "learning_rate": 4.506159946981383e-05, + "loss": 4.0715, + "step": 34222 + }, + { + "epoch": 0.20353387572556855, + "grad_norm": 1.718729019165039, + "learning_rate": 4.50613207483521e-05, + "loss": 4.9715, + "step": 34223 + }, + { + "epoch": 0.20353982300884957, + "grad_norm": 1.7516825199127197, + "learning_rate": 4.506104201988716e-05, + "loss": 4.2259, + "step": 34224 + }, + { + "epoch": 0.20354577029213056, + "grad_norm": 1.7814204692840576, + "learning_rate": 4.5060763284419114e-05, + "loss": 5.0681, + "step": 34225 + }, + { + "epoch": 0.20355171757541154, + "grad_norm": 2.5831916332244873, + "learning_rate": 4.506048454194807e-05, + "loss": 4.0293, + "step": 34226 + }, + { + "epoch": 0.20355766485869256, + "grad_norm": 1.8654228448867798, + "learning_rate": 4.506020579247412e-05, + "loss": 4.5101, + "step": 34227 + }, + { + "epoch": 0.20356361214197355, + "grad_norm": 1.6361619234085083, + "learning_rate": 4.5059927035997354e-05, + "loss": 5.4405, + "step": 34228 + }, + { + "epoch": 0.20356955942525454, + "grad_norm": 1.5046677589416504, + "learning_rate": 4.505964827251787e-05, + "loss": 4.8682, + "step": 34229 + }, + { + "epoch": 0.20357550670853555, + "grad_norm": 1.6146504878997803, + "learning_rate": 4.505936950203578e-05, + "loss": 4.345, + "step": 34230 + }, + { + "epoch": 0.20358145399181654, + "grad_norm": 1.7138882875442505, + "learning_rate": 4.5059090724551166e-05, + "loss": 5.3332, + "step": 34231 + }, + { + "epoch": 0.20358740127509753, + "grad_norm": 1.7118430137634277, + "learning_rate": 4.505881194006413e-05, + "loss": 4.8239, + "step": 34232 + }, + { + "epoch": 0.20359334855837855, + "grad_norm": 1.4503207206726074, + "learning_rate": 4.505853314857477e-05, + "loss": 4.7619, + "step": 34233 + }, + { + "epoch": 0.20359929584165953, + "grad_norm": 1.5013114213943481, + "learning_rate": 4.5058254350083185e-05, + "loss": 5.1288, + "step": 34234 + }, + { + "epoch": 0.20360524312494052, + "grad_norm": 1.5356587171554565, + "learning_rate": 4.505797554458947e-05, + "loss": 5.051, + "step": 34235 + }, + { + "epoch": 0.20361119040822154, + "grad_norm": 1.7051646709442139, + "learning_rate": 4.5057696732093724e-05, + "loss": 4.7767, + "step": 34236 + }, + { + "epoch": 0.20361713769150253, + "grad_norm": 2.3335628509521484, + "learning_rate": 4.505741791259605e-05, + "loss": 4.9362, + "step": 34237 + }, + { + "epoch": 0.2036230849747835, + "grad_norm": 1.9061404466629028, + "learning_rate": 4.505713908609653e-05, + "loss": 4.9848, + "step": 34238 + }, + { + "epoch": 0.20362903225806453, + "grad_norm": 1.7989264726638794, + "learning_rate": 4.505686025259527e-05, + "loss": 4.9515, + "step": 34239 + }, + { + "epoch": 0.20363497954134552, + "grad_norm": 2.2343575954437256, + "learning_rate": 4.505658141209237e-05, + "loss": 4.6851, + "step": 34240 + }, + { + "epoch": 0.2036409268246265, + "grad_norm": 1.568610668182373, + "learning_rate": 4.5056302564587924e-05, + "loss": 5.0318, + "step": 34241 + }, + { + "epoch": 0.2036468741079075, + "grad_norm": 1.4426900148391724, + "learning_rate": 4.505602371008203e-05, + "loss": 5.2873, + "step": 34242 + }, + { + "epoch": 0.2036528213911885, + "grad_norm": 1.5718400478363037, + "learning_rate": 4.505574484857478e-05, + "loss": 5.181, + "step": 34243 + }, + { + "epoch": 0.2036587686744695, + "grad_norm": 1.4871337413787842, + "learning_rate": 4.505546598006628e-05, + "loss": 5.275, + "step": 34244 + }, + { + "epoch": 0.2036647159577505, + "grad_norm": 1.4343833923339844, + "learning_rate": 4.505518710455663e-05, + "loss": 5.0945, + "step": 34245 + }, + { + "epoch": 0.2036706632410315, + "grad_norm": 1.514282464981079, + "learning_rate": 4.5054908222045916e-05, + "loss": 5.0487, + "step": 34246 + }, + { + "epoch": 0.2036766105243125, + "grad_norm": 1.5430774688720703, + "learning_rate": 4.5054629332534246e-05, + "loss": 5.0369, + "step": 34247 + }, + { + "epoch": 0.20368255780759348, + "grad_norm": 1.7679804563522339, + "learning_rate": 4.5054350436021706e-05, + "loss": 5.5694, + "step": 34248 + }, + { + "epoch": 0.2036885050908745, + "grad_norm": 1.6152211427688599, + "learning_rate": 4.5054071532508404e-05, + "loss": 5.6343, + "step": 34249 + }, + { + "epoch": 0.20369445237415548, + "grad_norm": 1.6899724006652832, + "learning_rate": 4.505379262199442e-05, + "loss": 5.3575, + "step": 34250 + }, + { + "epoch": 0.20370039965743647, + "grad_norm": 1.8305284976959229, + "learning_rate": 4.505351370447988e-05, + "loss": 5.3694, + "step": 34251 + }, + { + "epoch": 0.2037063469407175, + "grad_norm": 1.396485686302185, + "learning_rate": 4.505323477996486e-05, + "loss": 5.0583, + "step": 34252 + }, + { + "epoch": 0.20371229422399847, + "grad_norm": 1.662598967552185, + "learning_rate": 4.5052955848449465e-05, + "loss": 5.1148, + "step": 34253 + }, + { + "epoch": 0.20371824150727946, + "grad_norm": 1.6300253868103027, + "learning_rate": 4.505267690993378e-05, + "loss": 5.5773, + "step": 34254 + }, + { + "epoch": 0.20372418879056048, + "grad_norm": 1.7322368621826172, + "learning_rate": 4.5052397964417925e-05, + "loss": 5.0738, + "step": 34255 + }, + { + "epoch": 0.20373013607384147, + "grad_norm": 1.4914497137069702, + "learning_rate": 4.5052119011901986e-05, + "loss": 5.1192, + "step": 34256 + }, + { + "epoch": 0.20373608335712245, + "grad_norm": 1.4011354446411133, + "learning_rate": 4.5051840052386044e-05, + "loss": 5.1526, + "step": 34257 + }, + { + "epoch": 0.20374203064040347, + "grad_norm": 1.4619200229644775, + "learning_rate": 4.505156108587022e-05, + "loss": 4.9555, + "step": 34258 + }, + { + "epoch": 0.20374797792368446, + "grad_norm": 1.4376040697097778, + "learning_rate": 4.505128211235461e-05, + "loss": 4.9896, + "step": 34259 + }, + { + "epoch": 0.20375392520696545, + "grad_norm": 1.5649752616882324, + "learning_rate": 4.50510031318393e-05, + "loss": 4.9176, + "step": 34260 + }, + { + "epoch": 0.20375987249024646, + "grad_norm": 1.7832107543945312, + "learning_rate": 4.505072414432439e-05, + "loss": 4.696, + "step": 34261 + }, + { + "epoch": 0.20376581977352745, + "grad_norm": 1.9836961030960083, + "learning_rate": 4.505044514980998e-05, + "loss": 4.6756, + "step": 34262 + }, + { + "epoch": 0.20377176705680844, + "grad_norm": 2.0420374870300293, + "learning_rate": 4.505016614829617e-05, + "loss": 4.991, + "step": 34263 + }, + { + "epoch": 0.20377771434008946, + "grad_norm": 1.666096806526184, + "learning_rate": 4.504988713978305e-05, + "loss": 5.0536, + "step": 34264 + }, + { + "epoch": 0.20378366162337044, + "grad_norm": 1.5408387184143066, + "learning_rate": 4.504960812427072e-05, + "loss": 4.9794, + "step": 34265 + }, + { + "epoch": 0.20378960890665143, + "grad_norm": 2.0508735179901123, + "learning_rate": 4.504932910175929e-05, + "loss": 3.8745, + "step": 34266 + }, + { + "epoch": 0.20379555618993245, + "grad_norm": 2.475095272064209, + "learning_rate": 4.504905007224883e-05, + "loss": 3.7332, + "step": 34267 + }, + { + "epoch": 0.20380150347321344, + "grad_norm": 1.6301664113998413, + "learning_rate": 4.5048771035739466e-05, + "loss": 4.8958, + "step": 34268 + }, + { + "epoch": 0.20380745075649442, + "grad_norm": 1.7478148937225342, + "learning_rate": 4.504849199223128e-05, + "loss": 5.4425, + "step": 34269 + }, + { + "epoch": 0.20381339803977544, + "grad_norm": 1.4645951986312866, + "learning_rate": 4.504821294172438e-05, + "loss": 5.0187, + "step": 34270 + }, + { + "epoch": 0.20381934532305643, + "grad_norm": 1.511397123336792, + "learning_rate": 4.504793388421884e-05, + "loss": 5.0823, + "step": 34271 + }, + { + "epoch": 0.20382529260633742, + "grad_norm": 1.5013232231140137, + "learning_rate": 4.504765481971478e-05, + "loss": 4.9601, + "step": 34272 + }, + { + "epoch": 0.20383123988961843, + "grad_norm": 1.3130029439926147, + "learning_rate": 4.504737574821229e-05, + "loss": 4.9463, + "step": 34273 + }, + { + "epoch": 0.20383718717289942, + "grad_norm": 1.2741039991378784, + "learning_rate": 4.504709666971147e-05, + "loss": 4.9866, + "step": 34274 + }, + { + "epoch": 0.2038431344561804, + "grad_norm": 1.6717923879623413, + "learning_rate": 4.504681758421242e-05, + "loss": 4.8065, + "step": 34275 + }, + { + "epoch": 0.20384908173946142, + "grad_norm": 2.5650248527526855, + "learning_rate": 4.504653849171523e-05, + "loss": 3.2156, + "step": 34276 + }, + { + "epoch": 0.2038550290227424, + "grad_norm": 2.1047005653381348, + "learning_rate": 4.5046259392220006e-05, + "loss": 4.2701, + "step": 34277 + }, + { + "epoch": 0.2038609763060234, + "grad_norm": 1.4460844993591309, + "learning_rate": 4.504598028572683e-05, + "loss": 4.1266, + "step": 34278 + }, + { + "epoch": 0.20386692358930442, + "grad_norm": 1.375220537185669, + "learning_rate": 4.504570117223581e-05, + "loss": 4.6078, + "step": 34279 + }, + { + "epoch": 0.2038728708725854, + "grad_norm": 1.5132031440734863, + "learning_rate": 4.5045422051747046e-05, + "loss": 4.9891, + "step": 34280 + }, + { + "epoch": 0.2038788181558664, + "grad_norm": 1.6141597032546997, + "learning_rate": 4.5045142924260636e-05, + "loss": 4.9529, + "step": 34281 + }, + { + "epoch": 0.2038847654391474, + "grad_norm": 2.2230634689331055, + "learning_rate": 4.504486378977667e-05, + "loss": 4.6287, + "step": 34282 + }, + { + "epoch": 0.2038907127224284, + "grad_norm": 2.391753673553467, + "learning_rate": 4.504458464829525e-05, + "loss": 4.429, + "step": 34283 + }, + { + "epoch": 0.20389666000570938, + "grad_norm": 2.246250867843628, + "learning_rate": 4.504430549981647e-05, + "loss": 4.1301, + "step": 34284 + }, + { + "epoch": 0.2039026072889904, + "grad_norm": 2.15234375, + "learning_rate": 4.504402634434043e-05, + "loss": 3.9196, + "step": 34285 + }, + { + "epoch": 0.2039085545722714, + "grad_norm": 1.6975746154785156, + "learning_rate": 4.504374718186723e-05, + "loss": 5.5317, + "step": 34286 + }, + { + "epoch": 0.20391450185555238, + "grad_norm": 2.212271213531494, + "learning_rate": 4.504346801239696e-05, + "loss": 4.0733, + "step": 34287 + }, + { + "epoch": 0.2039204491388334, + "grad_norm": 1.9130991697311401, + "learning_rate": 4.504318883592973e-05, + "loss": 4.8775, + "step": 34288 + }, + { + "epoch": 0.20392639642211438, + "grad_norm": 1.4307633638381958, + "learning_rate": 4.5042909652465624e-05, + "loss": 4.4828, + "step": 34289 + }, + { + "epoch": 0.20393234370539537, + "grad_norm": 1.3600475788116455, + "learning_rate": 4.504263046200475e-05, + "loss": 4.2668, + "step": 34290 + }, + { + "epoch": 0.20393829098867639, + "grad_norm": 2.528594493865967, + "learning_rate": 4.50423512645472e-05, + "loss": 3.4961, + "step": 34291 + }, + { + "epoch": 0.20394423827195737, + "grad_norm": 2.440265655517578, + "learning_rate": 4.504207206009307e-05, + "loss": 3.3638, + "step": 34292 + }, + { + "epoch": 0.20395018555523836, + "grad_norm": 2.088148832321167, + "learning_rate": 4.5041792848642463e-05, + "loss": 4.19, + "step": 34293 + }, + { + "epoch": 0.20395613283851938, + "grad_norm": 1.9497579336166382, + "learning_rate": 4.504151363019546e-05, + "loss": 4.4115, + "step": 34294 + }, + { + "epoch": 0.20396208012180037, + "grad_norm": 2.358234405517578, + "learning_rate": 4.5041234404752185e-05, + "loss": 4.0831, + "step": 34295 + }, + { + "epoch": 0.20396802740508135, + "grad_norm": 2.400301456451416, + "learning_rate": 4.504095517231273e-05, + "loss": 3.8786, + "step": 34296 + }, + { + "epoch": 0.20397397468836237, + "grad_norm": 2.4365954399108887, + "learning_rate": 4.504067593287717e-05, + "loss": 4.0625, + "step": 34297 + }, + { + "epoch": 0.20397992197164336, + "grad_norm": 1.4779819250106812, + "learning_rate": 4.5040396686445616e-05, + "loss": 5.7232, + "step": 34298 + }, + { + "epoch": 0.20398586925492435, + "grad_norm": 1.4149293899536133, + "learning_rate": 4.504011743301817e-05, + "loss": 5.8752, + "step": 34299 + }, + { + "epoch": 0.20399181653820536, + "grad_norm": 1.4655671119689941, + "learning_rate": 4.5039838172594936e-05, + "loss": 5.1771, + "step": 34300 + }, + { + "epoch": 0.20399776382148635, + "grad_norm": 1.5849196910858154, + "learning_rate": 4.503955890517599e-05, + "loss": 5.6272, + "step": 34301 + }, + { + "epoch": 0.20400371110476734, + "grad_norm": 1.4127448797225952, + "learning_rate": 4.5039279630761445e-05, + "loss": 5.5198, + "step": 34302 + }, + { + "epoch": 0.20400965838804833, + "grad_norm": 2.142515182495117, + "learning_rate": 4.503900034935139e-05, + "loss": 4.8905, + "step": 34303 + }, + { + "epoch": 0.20401560567132934, + "grad_norm": 1.9965078830718994, + "learning_rate": 4.5038721060945935e-05, + "loss": 4.3118, + "step": 34304 + }, + { + "epoch": 0.20402155295461033, + "grad_norm": 1.8355085849761963, + "learning_rate": 4.5038441765545164e-05, + "loss": 4.5507, + "step": 34305 + }, + { + "epoch": 0.20402750023789132, + "grad_norm": 1.819510817527771, + "learning_rate": 4.503816246314918e-05, + "loss": 4.7781, + "step": 34306 + }, + { + "epoch": 0.20403344752117233, + "grad_norm": 1.8253278732299805, + "learning_rate": 4.503788315375809e-05, + "loss": 4.7338, + "step": 34307 + }, + { + "epoch": 0.20403939480445332, + "grad_norm": 1.6907480955123901, + "learning_rate": 4.5037603837371966e-05, + "loss": 4.7709, + "step": 34308 + }, + { + "epoch": 0.2040453420877343, + "grad_norm": 1.4460554122924805, + "learning_rate": 4.503732451399093e-05, + "loss": 5.3781, + "step": 34309 + }, + { + "epoch": 0.20405128937101533, + "grad_norm": 1.8433641195297241, + "learning_rate": 4.503704518361507e-05, + "loss": 5.5734, + "step": 34310 + }, + { + "epoch": 0.20405723665429631, + "grad_norm": 1.6964929103851318, + "learning_rate": 4.503676584624449e-05, + "loss": 5.3381, + "step": 34311 + }, + { + "epoch": 0.2040631839375773, + "grad_norm": 1.965718388557434, + "learning_rate": 4.503648650187927e-05, + "loss": 5.0361, + "step": 34312 + }, + { + "epoch": 0.20406913122085832, + "grad_norm": 1.9891307353973389, + "learning_rate": 4.503620715051953e-05, + "loss": 5.0337, + "step": 34313 + }, + { + "epoch": 0.2040750785041393, + "grad_norm": 1.799054741859436, + "learning_rate": 4.503592779216536e-05, + "loss": 4.4287, + "step": 34314 + }, + { + "epoch": 0.2040810257874203, + "grad_norm": 1.7559428215026855, + "learning_rate": 4.503564842681684e-05, + "loss": 4.4849, + "step": 34315 + }, + { + "epoch": 0.2040869730707013, + "grad_norm": 1.7106789350509644, + "learning_rate": 4.503536905447409e-05, + "loss": 4.5868, + "step": 34316 + }, + { + "epoch": 0.2040929203539823, + "grad_norm": 1.664260745048523, + "learning_rate": 4.50350896751372e-05, + "loss": 5.0689, + "step": 34317 + }, + { + "epoch": 0.2040988676372633, + "grad_norm": 1.566235065460205, + "learning_rate": 4.503481028880627e-05, + "loss": 5.37, + "step": 34318 + }, + { + "epoch": 0.2041048149205443, + "grad_norm": 1.839880108833313, + "learning_rate": 4.503453089548139e-05, + "loss": 5.3544, + "step": 34319 + }, + { + "epoch": 0.2041107622038253, + "grad_norm": 1.5123977661132812, + "learning_rate": 4.5034251495162663e-05, + "loss": 5.0838, + "step": 34320 + }, + { + "epoch": 0.20411670948710628, + "grad_norm": 1.642776608467102, + "learning_rate": 4.5033972087850184e-05, + "loss": 4.7068, + "step": 34321 + }, + { + "epoch": 0.2041226567703873, + "grad_norm": 1.6237605810165405, + "learning_rate": 4.503369267354406e-05, + "loss": 5.2005, + "step": 34322 + }, + { + "epoch": 0.20412860405366828, + "grad_norm": 2.285550355911255, + "learning_rate": 4.503341325224437e-05, + "loss": 4.728, + "step": 34323 + }, + { + "epoch": 0.20413455133694927, + "grad_norm": 3.8627207279205322, + "learning_rate": 4.5033133823951236e-05, + "loss": 3.6855, + "step": 34324 + }, + { + "epoch": 0.2041404986202303, + "grad_norm": 3.054490566253662, + "learning_rate": 4.503285438866473e-05, + "loss": 3.8219, + "step": 34325 + }, + { + "epoch": 0.20414644590351128, + "grad_norm": 2.7683627605438232, + "learning_rate": 4.503257494638497e-05, + "loss": 4.096, + "step": 34326 + }, + { + "epoch": 0.20415239318679226, + "grad_norm": 2.6042439937591553, + "learning_rate": 4.5032295497112035e-05, + "loss": 3.442, + "step": 34327 + }, + { + "epoch": 0.20415834047007328, + "grad_norm": 2.1823248863220215, + "learning_rate": 4.5032016040846045e-05, + "loss": 4.541, + "step": 34328 + }, + { + "epoch": 0.20416428775335427, + "grad_norm": 2.897273540496826, + "learning_rate": 4.5031736577587074e-05, + "loss": 4.0124, + "step": 34329 + }, + { + "epoch": 0.20417023503663526, + "grad_norm": 1.747259259223938, + "learning_rate": 4.503145710733524e-05, + "loss": 4.8034, + "step": 34330 + }, + { + "epoch": 0.20417618231991627, + "grad_norm": 1.7826976776123047, + "learning_rate": 4.5031177630090625e-05, + "loss": 5.2103, + "step": 34331 + }, + { + "epoch": 0.20418212960319726, + "grad_norm": 1.7653011083602905, + "learning_rate": 4.503089814585333e-05, + "loss": 5.1752, + "step": 34332 + }, + { + "epoch": 0.20418807688647825, + "grad_norm": 1.8423312902450562, + "learning_rate": 4.503061865462346e-05, + "loss": 5.1152, + "step": 34333 + }, + { + "epoch": 0.20419402416975926, + "grad_norm": 1.7056430578231812, + "learning_rate": 4.503033915640111e-05, + "loss": 5.1135, + "step": 34334 + }, + { + "epoch": 0.20419997145304025, + "grad_norm": 1.9776579141616821, + "learning_rate": 4.5030059651186376e-05, + "loss": 4.9217, + "step": 34335 + }, + { + "epoch": 0.20420591873632124, + "grad_norm": 1.475510835647583, + "learning_rate": 4.502978013897935e-05, + "loss": 4.7542, + "step": 34336 + }, + { + "epoch": 0.20421186601960226, + "grad_norm": 1.5682835578918457, + "learning_rate": 4.502950061978014e-05, + "loss": 5.1662, + "step": 34337 + }, + { + "epoch": 0.20421781330288324, + "grad_norm": 2.6880135536193848, + "learning_rate": 4.5029221093588836e-05, + "loss": 3.9135, + "step": 34338 + }, + { + "epoch": 0.20422376058616423, + "grad_norm": 2.1446547508239746, + "learning_rate": 4.502894156040553e-05, + "loss": 3.9488, + "step": 34339 + }, + { + "epoch": 0.20422970786944525, + "grad_norm": 1.6459128856658936, + "learning_rate": 4.502866202023034e-05, + "loss": 5.1773, + "step": 34340 + }, + { + "epoch": 0.20423565515272624, + "grad_norm": 1.842704176902771, + "learning_rate": 4.502838247306335e-05, + "loss": 5.0428, + "step": 34341 + }, + { + "epoch": 0.20424160243600722, + "grad_norm": 1.9853084087371826, + "learning_rate": 4.5028102918904644e-05, + "loss": 4.87, + "step": 34342 + }, + { + "epoch": 0.20424754971928824, + "grad_norm": 1.943145990371704, + "learning_rate": 4.502782335775435e-05, + "loss": 4.8169, + "step": 34343 + }, + { + "epoch": 0.20425349700256923, + "grad_norm": 1.7412112951278687, + "learning_rate": 4.502754378961255e-05, + "loss": 4.8134, + "step": 34344 + }, + { + "epoch": 0.20425944428585022, + "grad_norm": 1.7240549325942993, + "learning_rate": 4.502726421447933e-05, + "loss": 5.0885, + "step": 34345 + }, + { + "epoch": 0.20426539156913123, + "grad_norm": 1.4919542074203491, + "learning_rate": 4.502698463235481e-05, + "loss": 5.2307, + "step": 34346 + }, + { + "epoch": 0.20427133885241222, + "grad_norm": 1.32732093334198, + "learning_rate": 4.502670504323907e-05, + "loss": 5.1218, + "step": 34347 + }, + { + "epoch": 0.2042772861356932, + "grad_norm": 1.7667738199234009, + "learning_rate": 4.5026425447132214e-05, + "loss": 4.0633, + "step": 34348 + }, + { + "epoch": 0.20428323341897422, + "grad_norm": 1.4684362411499023, + "learning_rate": 4.502614584403434e-05, + "loss": 5.2084, + "step": 34349 + }, + { + "epoch": 0.2042891807022552, + "grad_norm": 1.3652414083480835, + "learning_rate": 4.5025866233945546e-05, + "loss": 5.2852, + "step": 34350 + }, + { + "epoch": 0.2042951279855362, + "grad_norm": 1.6385377645492554, + "learning_rate": 4.5025586616865926e-05, + "loss": 5.103, + "step": 34351 + }, + { + "epoch": 0.20430107526881722, + "grad_norm": 1.8744497299194336, + "learning_rate": 4.502530699279559e-05, + "loss": 4.42, + "step": 34352 + }, + { + "epoch": 0.2043070225520982, + "grad_norm": 1.8791557550430298, + "learning_rate": 4.502502736173462e-05, + "loss": 4.3255, + "step": 34353 + }, + { + "epoch": 0.2043129698353792, + "grad_norm": 1.8308615684509277, + "learning_rate": 4.502474772368312e-05, + "loss": 4.3123, + "step": 34354 + }, + { + "epoch": 0.2043189171186602, + "grad_norm": 1.897897481918335, + "learning_rate": 4.502446807864118e-05, + "loss": 4.5253, + "step": 34355 + }, + { + "epoch": 0.2043248644019412, + "grad_norm": 1.822041392326355, + "learning_rate": 4.502418842660892e-05, + "loss": 5.4352, + "step": 34356 + }, + { + "epoch": 0.20433081168522219, + "grad_norm": 1.7441822290420532, + "learning_rate": 4.5023908767586416e-05, + "loss": 5.2117, + "step": 34357 + }, + { + "epoch": 0.2043367589685032, + "grad_norm": 1.9075031280517578, + "learning_rate": 4.502362910157377e-05, + "loss": 5.1509, + "step": 34358 + }, + { + "epoch": 0.2043427062517842, + "grad_norm": 1.7022250890731812, + "learning_rate": 4.502334942857108e-05, + "loss": 5.2759, + "step": 34359 + }, + { + "epoch": 0.20434865353506518, + "grad_norm": 1.8610200881958008, + "learning_rate": 4.502306974857845e-05, + "loss": 5.0043, + "step": 34360 + }, + { + "epoch": 0.20435460081834617, + "grad_norm": 1.7256522178649902, + "learning_rate": 4.5022790061595976e-05, + "loss": 5.1169, + "step": 34361 + }, + { + "epoch": 0.20436054810162718, + "grad_norm": 1.9288054704666138, + "learning_rate": 4.502251036762375e-05, + "loss": 5.0786, + "step": 34362 + }, + { + "epoch": 0.20436649538490817, + "grad_norm": 1.950032353401184, + "learning_rate": 4.502223066666187e-05, + "loss": 4.6047, + "step": 34363 + }, + { + "epoch": 0.20437244266818916, + "grad_norm": 1.7432233095169067, + "learning_rate": 4.502195095871044e-05, + "loss": 4.7526, + "step": 34364 + }, + { + "epoch": 0.20437838995147017, + "grad_norm": 2.0420267581939697, + "learning_rate": 4.502167124376955e-05, + "loss": 4.995, + "step": 34365 + }, + { + "epoch": 0.20438433723475116, + "grad_norm": 1.6214263439178467, + "learning_rate": 4.5021391521839304e-05, + "loss": 4.8635, + "step": 34366 + }, + { + "epoch": 0.20439028451803215, + "grad_norm": 1.4966545104980469, + "learning_rate": 4.50211117929198e-05, + "loss": 5.0103, + "step": 34367 + }, + { + "epoch": 0.20439623180131317, + "grad_norm": 1.501697063446045, + "learning_rate": 4.5020832057011127e-05, + "loss": 5.2083, + "step": 34368 + }, + { + "epoch": 0.20440217908459415, + "grad_norm": 1.6379048824310303, + "learning_rate": 4.502055231411339e-05, + "loss": 4.6216, + "step": 34369 + }, + { + "epoch": 0.20440812636787514, + "grad_norm": 1.9041169881820679, + "learning_rate": 4.502027256422668e-05, + "loss": 4.3796, + "step": 34370 + }, + { + "epoch": 0.20441407365115616, + "grad_norm": 1.7477339506149292, + "learning_rate": 4.501999280735111e-05, + "loss": 4.6934, + "step": 34371 + }, + { + "epoch": 0.20442002093443715, + "grad_norm": 1.5829856395721436, + "learning_rate": 4.501971304348676e-05, + "loss": 5.2587, + "step": 34372 + }, + { + "epoch": 0.20442596821771813, + "grad_norm": 1.445803165435791, + "learning_rate": 4.501943327263374e-05, + "loss": 5.1847, + "step": 34373 + }, + { + "epoch": 0.20443191550099915, + "grad_norm": 1.5141373872756958, + "learning_rate": 4.5019153494792144e-05, + "loss": 5.3348, + "step": 34374 + }, + { + "epoch": 0.20443786278428014, + "grad_norm": 1.970505714416504, + "learning_rate": 4.501887370996206e-05, + "loss": 3.957, + "step": 34375 + }, + { + "epoch": 0.20444381006756113, + "grad_norm": 1.7028234004974365, + "learning_rate": 4.5018593918143596e-05, + "loss": 4.4819, + "step": 34376 + }, + { + "epoch": 0.20444975735084214, + "grad_norm": 1.9567445516586304, + "learning_rate": 4.501831411933685e-05, + "loss": 4.3555, + "step": 34377 + }, + { + "epoch": 0.20445570463412313, + "grad_norm": 1.6176704168319702, + "learning_rate": 4.5018034313541925e-05, + "loss": 4.7144, + "step": 34378 + }, + { + "epoch": 0.20446165191740412, + "grad_norm": 1.7398934364318848, + "learning_rate": 4.50177545007589e-05, + "loss": 4.2907, + "step": 34379 + }, + { + "epoch": 0.20446759920068514, + "grad_norm": 1.7774847745895386, + "learning_rate": 4.501747468098789e-05, + "loss": 4.3437, + "step": 34380 + }, + { + "epoch": 0.20447354648396612, + "grad_norm": 1.7353404760360718, + "learning_rate": 4.5017194854228984e-05, + "loss": 4.796, + "step": 34381 + }, + { + "epoch": 0.2044794937672471, + "grad_norm": 1.7452480792999268, + "learning_rate": 4.501691502048227e-05, + "loss": 4.655, + "step": 34382 + }, + { + "epoch": 0.20448544105052813, + "grad_norm": 1.8226715326309204, + "learning_rate": 4.501663517974788e-05, + "loss": 3.9734, + "step": 34383 + }, + { + "epoch": 0.20449138833380912, + "grad_norm": 1.74259352684021, + "learning_rate": 4.501635533202587e-05, + "loss": 4.319, + "step": 34384 + }, + { + "epoch": 0.2044973356170901, + "grad_norm": 1.9349931478500366, + "learning_rate": 4.501607547731637e-05, + "loss": 4.29, + "step": 34385 + }, + { + "epoch": 0.20450328290037112, + "grad_norm": 1.7214958667755127, + "learning_rate": 4.501579561561946e-05, + "loss": 4.3099, + "step": 34386 + }, + { + "epoch": 0.2045092301836521, + "grad_norm": 1.6682885885238647, + "learning_rate": 4.5015515746935246e-05, + "loss": 4.2436, + "step": 34387 + }, + { + "epoch": 0.2045151774669331, + "grad_norm": 1.7586250305175781, + "learning_rate": 4.5015235871263813e-05, + "loss": 4.4776, + "step": 34388 + }, + { + "epoch": 0.2045211247502141, + "grad_norm": 2.00175142288208, + "learning_rate": 4.501495598860528e-05, + "loss": 3.932, + "step": 34389 + }, + { + "epoch": 0.2045270720334951, + "grad_norm": 2.066532611846924, + "learning_rate": 4.501467609895973e-05, + "loss": 4.3484, + "step": 34390 + }, + { + "epoch": 0.2045330193167761, + "grad_norm": 2.031602621078491, + "learning_rate": 4.501439620232726e-05, + "loss": 4.4164, + "step": 34391 + }, + { + "epoch": 0.2045389666000571, + "grad_norm": 1.8264626264572144, + "learning_rate": 4.5014116298707975e-05, + "loss": 4.6198, + "step": 34392 + }, + { + "epoch": 0.2045449138833381, + "grad_norm": 1.8924089670181274, + "learning_rate": 4.501383638810196e-05, + "loss": 4.5491, + "step": 34393 + }, + { + "epoch": 0.20455086116661908, + "grad_norm": 1.8219579458236694, + "learning_rate": 4.501355647050933e-05, + "loss": 4.3047, + "step": 34394 + }, + { + "epoch": 0.2045568084499001, + "grad_norm": 1.65507173538208, + "learning_rate": 4.501327654593017e-05, + "loss": 4.3469, + "step": 34395 + }, + { + "epoch": 0.20456275573318108, + "grad_norm": 1.838722586631775, + "learning_rate": 4.5012996614364584e-05, + "loss": 4.4339, + "step": 34396 + }, + { + "epoch": 0.20456870301646207, + "grad_norm": 1.591939926147461, + "learning_rate": 4.5012716675812664e-05, + "loss": 4.3333, + "step": 34397 + }, + { + "epoch": 0.2045746502997431, + "grad_norm": 1.6647378206253052, + "learning_rate": 4.501243673027452e-05, + "loss": 4.4846, + "step": 34398 + }, + { + "epoch": 0.20458059758302408, + "grad_norm": 1.6199638843536377, + "learning_rate": 4.501215677775023e-05, + "loss": 4.5612, + "step": 34399 + }, + { + "epoch": 0.20458654486630506, + "grad_norm": 1.6180731058120728, + "learning_rate": 4.501187681823991e-05, + "loss": 4.5946, + "step": 34400 + }, + { + "epoch": 0.20459249214958608, + "grad_norm": 1.7323843240737915, + "learning_rate": 4.501159685174365e-05, + "loss": 4.7182, + "step": 34401 + }, + { + "epoch": 0.20459843943286707, + "grad_norm": 1.6931322813034058, + "learning_rate": 4.5011316878261545e-05, + "loss": 4.621, + "step": 34402 + }, + { + "epoch": 0.20460438671614806, + "grad_norm": 1.8126243352890015, + "learning_rate": 4.50110368977937e-05, + "loss": 4.3956, + "step": 34403 + }, + { + "epoch": 0.20461033399942907, + "grad_norm": 1.9750713109970093, + "learning_rate": 4.50107569103402e-05, + "loss": 4.3139, + "step": 34404 + }, + { + "epoch": 0.20461628128271006, + "grad_norm": 1.7892979383468628, + "learning_rate": 4.5010476915901164e-05, + "loss": 4.8248, + "step": 34405 + }, + { + "epoch": 0.20462222856599105, + "grad_norm": 1.7640190124511719, + "learning_rate": 4.501019691447667e-05, + "loss": 4.6024, + "step": 34406 + }, + { + "epoch": 0.20462817584927206, + "grad_norm": 1.7281399965286255, + "learning_rate": 4.500991690606682e-05, + "loss": 4.5826, + "step": 34407 + }, + { + "epoch": 0.20463412313255305, + "grad_norm": 1.7226693630218506, + "learning_rate": 4.500963689067173e-05, + "loss": 4.4828, + "step": 34408 + }, + { + "epoch": 0.20464007041583404, + "grad_norm": 1.9858465194702148, + "learning_rate": 4.5009356868291464e-05, + "loss": 4.4441, + "step": 34409 + }, + { + "epoch": 0.20464601769911506, + "grad_norm": 2.349806547164917, + "learning_rate": 4.500907683892615e-05, + "loss": 3.6723, + "step": 34410 + }, + { + "epoch": 0.20465196498239605, + "grad_norm": 2.7743988037109375, + "learning_rate": 4.500879680257587e-05, + "loss": 3.2242, + "step": 34411 + }, + { + "epoch": 0.20465791226567703, + "grad_norm": 3.5897703170776367, + "learning_rate": 4.5008516759240725e-05, + "loss": 3.8626, + "step": 34412 + }, + { + "epoch": 0.20466385954895805, + "grad_norm": 3.0878939628601074, + "learning_rate": 4.5008236708920816e-05, + "loss": 4.1804, + "step": 34413 + }, + { + "epoch": 0.20466980683223904, + "grad_norm": 2.4697554111480713, + "learning_rate": 4.500795665161623e-05, + "loss": 3.9456, + "step": 34414 + }, + { + "epoch": 0.20467575411552003, + "grad_norm": 1.8446409702301025, + "learning_rate": 4.500767658732708e-05, + "loss": 4.5432, + "step": 34415 + }, + { + "epoch": 0.20468170139880104, + "grad_norm": 2.3806347846984863, + "learning_rate": 4.5007396516053455e-05, + "loss": 4.9125, + "step": 34416 + }, + { + "epoch": 0.20468764868208203, + "grad_norm": 2.268231153488159, + "learning_rate": 4.500711643779546e-05, + "loss": 4.2447, + "step": 34417 + }, + { + "epoch": 0.20469359596536302, + "grad_norm": 2.3919286727905273, + "learning_rate": 4.500683635255318e-05, + "loss": 3.3776, + "step": 34418 + }, + { + "epoch": 0.204699543248644, + "grad_norm": 2.2718050479888916, + "learning_rate": 4.500655626032673e-05, + "loss": 3.3755, + "step": 34419 + }, + { + "epoch": 0.20470549053192502, + "grad_norm": 1.688220739364624, + "learning_rate": 4.500627616111619e-05, + "loss": 4.782, + "step": 34420 + }, + { + "epoch": 0.204711437815206, + "grad_norm": 1.78768789768219, + "learning_rate": 4.500599605492166e-05, + "loss": 4.8989, + "step": 34421 + }, + { + "epoch": 0.204717385098487, + "grad_norm": 2.1640124320983887, + "learning_rate": 4.5005715941743255e-05, + "loss": 4.5965, + "step": 34422 + }, + { + "epoch": 0.20472333238176801, + "grad_norm": 1.9868354797363281, + "learning_rate": 4.5005435821581054e-05, + "loss": 4.6511, + "step": 34423 + }, + { + "epoch": 0.204729279665049, + "grad_norm": 2.1577179431915283, + "learning_rate": 4.500515569443516e-05, + "loss": 4.6006, + "step": 34424 + }, + { + "epoch": 0.20473522694833, + "grad_norm": 2.440462589263916, + "learning_rate": 4.500487556030568e-05, + "loss": 4.4556, + "step": 34425 + }, + { + "epoch": 0.204741174231611, + "grad_norm": 2.115889072418213, + "learning_rate": 4.500459541919271e-05, + "loss": 4.6232, + "step": 34426 + }, + { + "epoch": 0.204747121514892, + "grad_norm": 2.1383614540100098, + "learning_rate": 4.500431527109633e-05, + "loss": 4.709, + "step": 34427 + }, + { + "epoch": 0.20475306879817298, + "grad_norm": 2.339848041534424, + "learning_rate": 4.500403511601665e-05, + "loss": 4.6746, + "step": 34428 + }, + { + "epoch": 0.204759016081454, + "grad_norm": 2.115816831588745, + "learning_rate": 4.500375495395378e-05, + "loss": 4.7467, + "step": 34429 + }, + { + "epoch": 0.204764963364735, + "grad_norm": 2.461951494216919, + "learning_rate": 4.5003474784907796e-05, + "loss": 4.6304, + "step": 34430 + }, + { + "epoch": 0.20477091064801597, + "grad_norm": 2.1366448402404785, + "learning_rate": 4.5003194608878806e-05, + "loss": 4.7499, + "step": 34431 + }, + { + "epoch": 0.204776857931297, + "grad_norm": 1.7789580821990967, + "learning_rate": 4.5002914425866916e-05, + "loss": 4.491, + "step": 34432 + }, + { + "epoch": 0.20478280521457798, + "grad_norm": 1.984432578086853, + "learning_rate": 4.50026342358722e-05, + "loss": 4.4704, + "step": 34433 + }, + { + "epoch": 0.20478875249785897, + "grad_norm": 2.3361284732818604, + "learning_rate": 4.500235403889479e-05, + "loss": 4.5737, + "step": 34434 + }, + { + "epoch": 0.20479469978113998, + "grad_norm": 2.1250100135803223, + "learning_rate": 4.500207383493475e-05, + "loss": 4.3715, + "step": 34435 + }, + { + "epoch": 0.20480064706442097, + "grad_norm": 1.9171262979507446, + "learning_rate": 4.500179362399219e-05, + "loss": 4.5543, + "step": 34436 + }, + { + "epoch": 0.20480659434770196, + "grad_norm": 1.9183216094970703, + "learning_rate": 4.5001513406067224e-05, + "loss": 4.6798, + "step": 34437 + }, + { + "epoch": 0.20481254163098297, + "grad_norm": 2.0602195262908936, + "learning_rate": 4.500123318115993e-05, + "loss": 4.2771, + "step": 34438 + }, + { + "epoch": 0.20481848891426396, + "grad_norm": 1.8789846897125244, + "learning_rate": 4.5000952949270414e-05, + "loss": 3.779, + "step": 34439 + }, + { + "epoch": 0.20482443619754495, + "grad_norm": 1.7185890674591064, + "learning_rate": 4.5000672710398775e-05, + "loss": 3.7439, + "step": 34440 + }, + { + "epoch": 0.20483038348082597, + "grad_norm": 1.758641004562378, + "learning_rate": 4.50003924645451e-05, + "loss": 3.8585, + "step": 34441 + }, + { + "epoch": 0.20483633076410696, + "grad_norm": 1.8966751098632812, + "learning_rate": 4.50001122117095e-05, + "loss": 4.1861, + "step": 34442 + }, + { + "epoch": 0.20484227804738794, + "grad_norm": 2.1946234703063965, + "learning_rate": 4.4999831951892056e-05, + "loss": 4.2264, + "step": 34443 + }, + { + "epoch": 0.20484822533066896, + "grad_norm": 1.9412094354629517, + "learning_rate": 4.4999551685092893e-05, + "loss": 4.5887, + "step": 34444 + }, + { + "epoch": 0.20485417261394995, + "grad_norm": 1.4762779474258423, + "learning_rate": 4.4999271411312086e-05, + "loss": 4.7418, + "step": 34445 + }, + { + "epoch": 0.20486011989723094, + "grad_norm": 1.520828127861023, + "learning_rate": 4.499899113054974e-05, + "loss": 4.6643, + "step": 34446 + }, + { + "epoch": 0.20486606718051195, + "grad_norm": 1.7773675918579102, + "learning_rate": 4.4998710842805955e-05, + "loss": 4.8711, + "step": 34447 + }, + { + "epoch": 0.20487201446379294, + "grad_norm": 1.8981387615203857, + "learning_rate": 4.499843054808082e-05, + "loss": 4.4935, + "step": 34448 + }, + { + "epoch": 0.20487796174707393, + "grad_norm": 2.329200029373169, + "learning_rate": 4.499815024637445e-05, + "loss": 4.5589, + "step": 34449 + }, + { + "epoch": 0.20488390903035494, + "grad_norm": 3.6857268810272217, + "learning_rate": 4.4997869937686926e-05, + "loss": 3.6962, + "step": 34450 + }, + { + "epoch": 0.20488985631363593, + "grad_norm": 2.840702533721924, + "learning_rate": 4.4997589622018354e-05, + "loss": 4.3877, + "step": 34451 + }, + { + "epoch": 0.20489580359691692, + "grad_norm": 2.9141292572021484, + "learning_rate": 4.499730929936883e-05, + "loss": 4.1279, + "step": 34452 + }, + { + "epoch": 0.20490175088019794, + "grad_norm": 2.270629405975342, + "learning_rate": 4.4997028969738456e-05, + "loss": 3.9257, + "step": 34453 + }, + { + "epoch": 0.20490769816347892, + "grad_norm": 1.5963208675384521, + "learning_rate": 4.499674863312732e-05, + "loss": 4.7587, + "step": 34454 + }, + { + "epoch": 0.2049136454467599, + "grad_norm": 1.8655664920806885, + "learning_rate": 4.499646828953552e-05, + "loss": 5.1353, + "step": 34455 + }, + { + "epoch": 0.20491959273004093, + "grad_norm": 1.582878589630127, + "learning_rate": 4.499618793896317e-05, + "loss": 5.1033, + "step": 34456 + }, + { + "epoch": 0.20492554001332192, + "grad_norm": 1.8107730150222778, + "learning_rate": 4.499590758141035e-05, + "loss": 4.9176, + "step": 34457 + }, + { + "epoch": 0.2049314872966029, + "grad_norm": 1.7597856521606445, + "learning_rate": 4.499562721687717e-05, + "loss": 5.1321, + "step": 34458 + }, + { + "epoch": 0.20493743457988392, + "grad_norm": 2.249377489089966, + "learning_rate": 4.4995346845363715e-05, + "loss": 3.7656, + "step": 34459 + }, + { + "epoch": 0.2049433818631649, + "grad_norm": 2.7382209300994873, + "learning_rate": 4.4995066466870106e-05, + "loss": 3.8454, + "step": 34460 + }, + { + "epoch": 0.2049493291464459, + "grad_norm": 1.8879845142364502, + "learning_rate": 4.499478608139641e-05, + "loss": 5.0165, + "step": 34461 + }, + { + "epoch": 0.2049552764297269, + "grad_norm": 1.58770751953125, + "learning_rate": 4.4994505688942757e-05, + "loss": 5.0374, + "step": 34462 + }, + { + "epoch": 0.2049612237130079, + "grad_norm": 1.4101433753967285, + "learning_rate": 4.499422528950922e-05, + "loss": 4.8346, + "step": 34463 + }, + { + "epoch": 0.2049671709962889, + "grad_norm": 1.5214909315109253, + "learning_rate": 4.49939448830959e-05, + "loss": 5.0864, + "step": 34464 + }, + { + "epoch": 0.2049731182795699, + "grad_norm": 1.444889783859253, + "learning_rate": 4.4993664469702914e-05, + "loss": 5.2856, + "step": 34465 + }, + { + "epoch": 0.2049790655628509, + "grad_norm": 1.7581872940063477, + "learning_rate": 4.4993384049330336e-05, + "loss": 4.9138, + "step": 34466 + }, + { + "epoch": 0.20498501284613188, + "grad_norm": 1.5535180568695068, + "learning_rate": 4.4993103621978275e-05, + "loss": 4.6956, + "step": 34467 + }, + { + "epoch": 0.2049909601294129, + "grad_norm": 1.653976559638977, + "learning_rate": 4.4992823187646824e-05, + "loss": 4.6417, + "step": 34468 + }, + { + "epoch": 0.20499690741269389, + "grad_norm": 1.23905611038208, + "learning_rate": 4.499254274633609e-05, + "loss": 4.7466, + "step": 34469 + }, + { + "epoch": 0.20500285469597487, + "grad_norm": 1.6507742404937744, + "learning_rate": 4.499226229804617e-05, + "loss": 4.6249, + "step": 34470 + }, + { + "epoch": 0.2050088019792559, + "grad_norm": 1.6019675731658936, + "learning_rate": 4.4991981842777156e-05, + "loss": 4.7097, + "step": 34471 + }, + { + "epoch": 0.20501474926253688, + "grad_norm": 2.959336519241333, + "learning_rate": 4.4991701380529146e-05, + "loss": 3.2528, + "step": 34472 + }, + { + "epoch": 0.20502069654581787, + "grad_norm": 1.8020458221435547, + "learning_rate": 4.499142091130224e-05, + "loss": 4.6609, + "step": 34473 + }, + { + "epoch": 0.20502664382909888, + "grad_norm": 1.6824336051940918, + "learning_rate": 4.499114043509653e-05, + "loss": 4.5646, + "step": 34474 + }, + { + "epoch": 0.20503259111237987, + "grad_norm": 1.5131100416183472, + "learning_rate": 4.4990859951912124e-05, + "loss": 4.6802, + "step": 34475 + }, + { + "epoch": 0.20503853839566086, + "grad_norm": 1.7208993434906006, + "learning_rate": 4.499057946174911e-05, + "loss": 4.7112, + "step": 34476 + }, + { + "epoch": 0.20504448567894185, + "grad_norm": 1.701457142829895, + "learning_rate": 4.49902989646076e-05, + "loss": 4.6948, + "step": 34477 + }, + { + "epoch": 0.20505043296222286, + "grad_norm": 1.5838974714279175, + "learning_rate": 4.4990018460487683e-05, + "loss": 4.6722, + "step": 34478 + }, + { + "epoch": 0.20505638024550385, + "grad_norm": 1.7028863430023193, + "learning_rate": 4.4989737949389454e-05, + "loss": 4.4673, + "step": 34479 + }, + { + "epoch": 0.20506232752878484, + "grad_norm": 1.5195289850234985, + "learning_rate": 4.498945743131302e-05, + "loss": 4.5802, + "step": 34480 + }, + { + "epoch": 0.20506827481206585, + "grad_norm": 1.2915374040603638, + "learning_rate": 4.498917690625847e-05, + "loss": 4.4341, + "step": 34481 + }, + { + "epoch": 0.20507422209534684, + "grad_norm": 1.3090250492095947, + "learning_rate": 4.49888963742259e-05, + "loss": 4.7476, + "step": 34482 + }, + { + "epoch": 0.20508016937862783, + "grad_norm": 1.5086677074432373, + "learning_rate": 4.498861583521541e-05, + "loss": 4.5027, + "step": 34483 + }, + { + "epoch": 0.20508611666190885, + "grad_norm": 1.279787540435791, + "learning_rate": 4.4988335289227104e-05, + "loss": 4.6363, + "step": 34484 + }, + { + "epoch": 0.20509206394518983, + "grad_norm": 1.5406779050827026, + "learning_rate": 4.498805473626107e-05, + "loss": 4.834, + "step": 34485 + }, + { + "epoch": 0.20509801122847082, + "grad_norm": 1.5772720575332642, + "learning_rate": 4.4987774176317435e-05, + "loss": 4.5374, + "step": 34486 + }, + { + "epoch": 0.20510395851175184, + "grad_norm": 1.5194119215011597, + "learning_rate": 4.498749360939626e-05, + "loss": 4.5295, + "step": 34487 + }, + { + "epoch": 0.20510990579503283, + "grad_norm": 1.4598057270050049, + "learning_rate": 4.498721303549766e-05, + "loss": 4.45, + "step": 34488 + }, + { + "epoch": 0.20511585307831381, + "grad_norm": 1.4096565246582031, + "learning_rate": 4.498693245462172e-05, + "loss": 4.5477, + "step": 34489 + }, + { + "epoch": 0.20512180036159483, + "grad_norm": 1.7675260305404663, + "learning_rate": 4.498665186676856e-05, + "loss": 4.599, + "step": 34490 + }, + { + "epoch": 0.20512774764487582, + "grad_norm": 1.5095406770706177, + "learning_rate": 4.498637127193826e-05, + "loss": 4.9472, + "step": 34491 + }, + { + "epoch": 0.2051336949281568, + "grad_norm": 1.389299750328064, + "learning_rate": 4.498609067013093e-05, + "loss": 4.625, + "step": 34492 + }, + { + "epoch": 0.20513964221143782, + "grad_norm": 1.573198676109314, + "learning_rate": 4.498581006134666e-05, + "loss": 4.5716, + "step": 34493 + }, + { + "epoch": 0.2051455894947188, + "grad_norm": 1.655664324760437, + "learning_rate": 4.498552944558554e-05, + "loss": 4.6848, + "step": 34494 + }, + { + "epoch": 0.2051515367779998, + "grad_norm": 1.6053812503814697, + "learning_rate": 4.498524882284769e-05, + "loss": 4.6676, + "step": 34495 + }, + { + "epoch": 0.20515748406128081, + "grad_norm": 1.6616476774215698, + "learning_rate": 4.49849681931332e-05, + "loss": 5.0148, + "step": 34496 + }, + { + "epoch": 0.2051634313445618, + "grad_norm": 2.0143654346466064, + "learning_rate": 4.4984687556442155e-05, + "loss": 4.4989, + "step": 34497 + }, + { + "epoch": 0.2051693786278428, + "grad_norm": 1.833949089050293, + "learning_rate": 4.498440691277467e-05, + "loss": 4.3259, + "step": 34498 + }, + { + "epoch": 0.2051753259111238, + "grad_norm": 1.5047730207443237, + "learning_rate": 4.4984126262130825e-05, + "loss": 4.6325, + "step": 34499 + }, + { + "epoch": 0.2051812731944048, + "grad_norm": 1.6812539100646973, + "learning_rate": 4.4983845604510736e-05, + "loss": 4.3358, + "step": 34500 + }, + { + "epoch": 0.20518722047768578, + "grad_norm": 1.4969497919082642, + "learning_rate": 4.498356493991449e-05, + "loss": 4.7152, + "step": 34501 + }, + { + "epoch": 0.2051931677609668, + "grad_norm": 1.9464809894561768, + "learning_rate": 4.498328426834218e-05, + "loss": 4.7395, + "step": 34502 + }, + { + "epoch": 0.2051991150442478, + "grad_norm": 1.6289222240447998, + "learning_rate": 4.498300358979393e-05, + "loss": 4.9731, + "step": 34503 + }, + { + "epoch": 0.20520506232752878, + "grad_norm": 1.6166993379592896, + "learning_rate": 4.49827229042698e-05, + "loss": 5.0576, + "step": 34504 + }, + { + "epoch": 0.2052110096108098, + "grad_norm": 1.5953607559204102, + "learning_rate": 4.4982442211769926e-05, + "loss": 5.2747, + "step": 34505 + }, + { + "epoch": 0.20521695689409078, + "grad_norm": 1.8515969514846802, + "learning_rate": 4.4982161512294374e-05, + "loss": 5.2296, + "step": 34506 + }, + { + "epoch": 0.20522290417737177, + "grad_norm": 1.594019889831543, + "learning_rate": 4.498188080584326e-05, + "loss": 5.1967, + "step": 34507 + }, + { + "epoch": 0.20522885146065278, + "grad_norm": 1.6203807592391968, + "learning_rate": 4.498160009241668e-05, + "loss": 4.9829, + "step": 34508 + }, + { + "epoch": 0.20523479874393377, + "grad_norm": 1.5720133781433105, + "learning_rate": 4.498131937201473e-05, + "loss": 4.8464, + "step": 34509 + }, + { + "epoch": 0.20524074602721476, + "grad_norm": 1.3742187023162842, + "learning_rate": 4.498103864463751e-05, + "loss": 4.8129, + "step": 34510 + }, + { + "epoch": 0.20524669331049578, + "grad_norm": 1.432989478111267, + "learning_rate": 4.498075791028511e-05, + "loss": 4.7693, + "step": 34511 + }, + { + "epoch": 0.20525264059377676, + "grad_norm": 1.66667640209198, + "learning_rate": 4.498047716895763e-05, + "loss": 4.9305, + "step": 34512 + }, + { + "epoch": 0.20525858787705775, + "grad_norm": 1.795190691947937, + "learning_rate": 4.4980196420655185e-05, + "loss": 4.6456, + "step": 34513 + }, + { + "epoch": 0.20526453516033877, + "grad_norm": 2.04221510887146, + "learning_rate": 4.4979915665377855e-05, + "loss": 5.2033, + "step": 34514 + }, + { + "epoch": 0.20527048244361976, + "grad_norm": 2.103423595428467, + "learning_rate": 4.497963490312574e-05, + "loss": 5.1652, + "step": 34515 + }, + { + "epoch": 0.20527642972690074, + "grad_norm": 1.7535483837127686, + "learning_rate": 4.497935413389894e-05, + "loss": 4.9614, + "step": 34516 + }, + { + "epoch": 0.20528237701018176, + "grad_norm": 1.7065552473068237, + "learning_rate": 4.497907335769756e-05, + "loss": 4.8857, + "step": 34517 + }, + { + "epoch": 0.20528832429346275, + "grad_norm": 1.8153327703475952, + "learning_rate": 4.497879257452168e-05, + "loss": 4.9539, + "step": 34518 + }, + { + "epoch": 0.20529427157674374, + "grad_norm": 3.195288896560669, + "learning_rate": 4.497851178437142e-05, + "loss": 2.5544, + "step": 34519 + }, + { + "epoch": 0.20530021886002475, + "grad_norm": 2.209017276763916, + "learning_rate": 4.497823098724687e-05, + "loss": 3.4065, + "step": 34520 + }, + { + "epoch": 0.20530616614330574, + "grad_norm": 1.9605618715286255, + "learning_rate": 4.497795018314812e-05, + "loss": 4.4923, + "step": 34521 + }, + { + "epoch": 0.20531211342658673, + "grad_norm": 1.7516566514968872, + "learning_rate": 4.497766937207528e-05, + "loss": 4.3444, + "step": 34522 + }, + { + "epoch": 0.20531806070986774, + "grad_norm": 1.7743666172027588, + "learning_rate": 4.497738855402843e-05, + "loss": 4.8758, + "step": 34523 + }, + { + "epoch": 0.20532400799314873, + "grad_norm": 1.9033406972885132, + "learning_rate": 4.497710772900769e-05, + "loss": 4.8746, + "step": 34524 + }, + { + "epoch": 0.20532995527642972, + "grad_norm": 1.7820247411727905, + "learning_rate": 4.497682689701315e-05, + "loss": 4.4355, + "step": 34525 + }, + { + "epoch": 0.20533590255971074, + "grad_norm": 1.929594874382019, + "learning_rate": 4.49765460580449e-05, + "loss": 4.4736, + "step": 34526 + }, + { + "epoch": 0.20534184984299172, + "grad_norm": 1.8177223205566406, + "learning_rate": 4.497626521210304e-05, + "loss": 4.3102, + "step": 34527 + }, + { + "epoch": 0.2053477971262727, + "grad_norm": 1.7658437490463257, + "learning_rate": 4.497598435918768e-05, + "loss": 4.3177, + "step": 34528 + }, + { + "epoch": 0.20535374440955373, + "grad_norm": 2.5987002849578857, + "learning_rate": 4.497570349929891e-05, + "loss": 3.2716, + "step": 34529 + }, + { + "epoch": 0.20535969169283472, + "grad_norm": 2.0609161853790283, + "learning_rate": 4.497542263243683e-05, + "loss": 3.2153, + "step": 34530 + }, + { + "epoch": 0.2053656389761157, + "grad_norm": 1.6488370895385742, + "learning_rate": 4.497514175860153e-05, + "loss": 4.782, + "step": 34531 + }, + { + "epoch": 0.20537158625939672, + "grad_norm": 1.972416877746582, + "learning_rate": 4.497486087779312e-05, + "loss": 5.144, + "step": 34532 + }, + { + "epoch": 0.2053775335426777, + "grad_norm": 1.8627492189407349, + "learning_rate": 4.497457999001169e-05, + "loss": 5.2683, + "step": 34533 + }, + { + "epoch": 0.2053834808259587, + "grad_norm": 2.3711135387420654, + "learning_rate": 4.497429909525733e-05, + "loss": 3.4938, + "step": 34534 + }, + { + "epoch": 0.20538942810923969, + "grad_norm": 1.9512462615966797, + "learning_rate": 4.4974018193530165e-05, + "loss": 4.6567, + "step": 34535 + }, + { + "epoch": 0.2053953753925207, + "grad_norm": 2.878932476043701, + "learning_rate": 4.497373728483027e-05, + "loss": 3.7388, + "step": 34536 + }, + { + "epoch": 0.2054013226758017, + "grad_norm": 1.8661671876907349, + "learning_rate": 4.497345636915775e-05, + "loss": 4.3109, + "step": 34537 + }, + { + "epoch": 0.20540726995908268, + "grad_norm": 1.4568744897842407, + "learning_rate": 4.49731754465127e-05, + "loss": 4.6756, + "step": 34538 + }, + { + "epoch": 0.2054132172423637, + "grad_norm": 1.2677067518234253, + "learning_rate": 4.497289451689522e-05, + "loss": 4.5082, + "step": 34539 + }, + { + "epoch": 0.20541916452564468, + "grad_norm": 1.466506004333496, + "learning_rate": 4.497261358030542e-05, + "loss": 4.5668, + "step": 34540 + }, + { + "epoch": 0.20542511180892567, + "grad_norm": 1.3032081127166748, + "learning_rate": 4.497233263674338e-05, + "loss": 4.5761, + "step": 34541 + }, + { + "epoch": 0.20543105909220669, + "grad_norm": 1.3104262351989746, + "learning_rate": 4.49720516862092e-05, + "loss": 4.555, + "step": 34542 + }, + { + "epoch": 0.20543700637548767, + "grad_norm": 1.2286089658737183, + "learning_rate": 4.497177072870299e-05, + "loss": 4.5533, + "step": 34543 + }, + { + "epoch": 0.20544295365876866, + "grad_norm": 1.5584557056427002, + "learning_rate": 4.4971489764224836e-05, + "loss": 4.4975, + "step": 34544 + }, + { + "epoch": 0.20544890094204968, + "grad_norm": 1.4209648370742798, + "learning_rate": 4.497120879277485e-05, + "loss": 4.5484, + "step": 34545 + }, + { + "epoch": 0.20545484822533067, + "grad_norm": 1.4486362934112549, + "learning_rate": 4.4970927814353116e-05, + "loss": 4.5861, + "step": 34546 + }, + { + "epoch": 0.20546079550861165, + "grad_norm": 1.3003538846969604, + "learning_rate": 4.4970646828959736e-05, + "loss": 4.6317, + "step": 34547 + }, + { + "epoch": 0.20546674279189267, + "grad_norm": 1.624047040939331, + "learning_rate": 4.497036583659481e-05, + "loss": 4.8953, + "step": 34548 + }, + { + "epoch": 0.20547269007517366, + "grad_norm": 1.9646494388580322, + "learning_rate": 4.497008483725844e-05, + "loss": 4.8024, + "step": 34549 + }, + { + "epoch": 0.20547863735845465, + "grad_norm": 2.3605401515960693, + "learning_rate": 4.496980383095071e-05, + "loss": 5.2422, + "step": 34550 + }, + { + "epoch": 0.20548458464173566, + "grad_norm": 2.1505236625671387, + "learning_rate": 4.496952281767174e-05, + "loss": 4.0803, + "step": 34551 + }, + { + "epoch": 0.20549053192501665, + "grad_norm": 1.591495156288147, + "learning_rate": 4.49692417974216e-05, + "loss": 4.2671, + "step": 34552 + }, + { + "epoch": 0.20549647920829764, + "grad_norm": 1.3606783151626587, + "learning_rate": 4.496896077020042e-05, + "loss": 4.4897, + "step": 34553 + }, + { + "epoch": 0.20550242649157865, + "grad_norm": 2.085550308227539, + "learning_rate": 4.496867973600827e-05, + "loss": 5.0765, + "step": 34554 + }, + { + "epoch": 0.20550837377485964, + "grad_norm": 2.3612008094787598, + "learning_rate": 4.496839869484527e-05, + "loss": 5.0725, + "step": 34555 + }, + { + "epoch": 0.20551432105814063, + "grad_norm": 2.228905439376831, + "learning_rate": 4.4968117646711506e-05, + "loss": 5.139, + "step": 34556 + }, + { + "epoch": 0.20552026834142165, + "grad_norm": 1.8952064514160156, + "learning_rate": 4.496783659160707e-05, + "loss": 5.0891, + "step": 34557 + }, + { + "epoch": 0.20552621562470264, + "grad_norm": 1.8927900791168213, + "learning_rate": 4.496755552953208e-05, + "loss": 4.9674, + "step": 34558 + }, + { + "epoch": 0.20553216290798362, + "grad_norm": 1.776397466659546, + "learning_rate": 4.496727446048662e-05, + "loss": 4.9904, + "step": 34559 + }, + { + "epoch": 0.20553811019126464, + "grad_norm": 1.8404251337051392, + "learning_rate": 4.496699338447078e-05, + "loss": 4.92, + "step": 34560 + }, + { + "epoch": 0.20554405747454563, + "grad_norm": 1.8732088804244995, + "learning_rate": 4.4966712301484685e-05, + "loss": 5.0232, + "step": 34561 + }, + { + "epoch": 0.20555000475782662, + "grad_norm": 1.7764228582382202, + "learning_rate": 4.49664312115284e-05, + "loss": 4.9615, + "step": 34562 + }, + { + "epoch": 0.20555595204110763, + "grad_norm": 1.8939924240112305, + "learning_rate": 4.496615011460206e-05, + "loss": 4.9276, + "step": 34563 + }, + { + "epoch": 0.20556189932438862, + "grad_norm": 1.7696598768234253, + "learning_rate": 4.496586901070573e-05, + "loss": 4.8614, + "step": 34564 + }, + { + "epoch": 0.2055678466076696, + "grad_norm": 1.7702951431274414, + "learning_rate": 4.496558789983952e-05, + "loss": 4.8973, + "step": 34565 + }, + { + "epoch": 0.20557379389095062, + "grad_norm": 1.6990092992782593, + "learning_rate": 4.4965306782003535e-05, + "loss": 4.8359, + "step": 34566 + }, + { + "epoch": 0.2055797411742316, + "grad_norm": 1.6071676015853882, + "learning_rate": 4.496502565719787e-05, + "loss": 4.9214, + "step": 34567 + }, + { + "epoch": 0.2055856884575126, + "grad_norm": 1.7637958526611328, + "learning_rate": 4.4964744525422615e-05, + "loss": 4.8521, + "step": 34568 + }, + { + "epoch": 0.20559163574079362, + "grad_norm": 1.728930115699768, + "learning_rate": 4.496446338667788e-05, + "loss": 4.7967, + "step": 34569 + }, + { + "epoch": 0.2055975830240746, + "grad_norm": 1.597816824913025, + "learning_rate": 4.4964182240963746e-05, + "loss": 4.8614, + "step": 34570 + }, + { + "epoch": 0.2056035303073556, + "grad_norm": 1.9850479364395142, + "learning_rate": 4.496390108828033e-05, + "loss": 4.4582, + "step": 34571 + }, + { + "epoch": 0.2056094775906366, + "grad_norm": 2.1062679290771484, + "learning_rate": 4.496361992862772e-05, + "loss": 5.2632, + "step": 34572 + }, + { + "epoch": 0.2056154248739176, + "grad_norm": 1.8323718309402466, + "learning_rate": 4.496333876200602e-05, + "loss": 5.1667, + "step": 34573 + }, + { + "epoch": 0.20562137215719858, + "grad_norm": 1.64948308467865, + "learning_rate": 4.496305758841532e-05, + "loss": 5.0581, + "step": 34574 + }, + { + "epoch": 0.2056273194404796, + "grad_norm": 1.5759135484695435, + "learning_rate": 4.496277640785573e-05, + "loss": 5.0197, + "step": 34575 + }, + { + "epoch": 0.2056332667237606, + "grad_norm": 1.7933998107910156, + "learning_rate": 4.496249522032734e-05, + "loss": 4.5411, + "step": 34576 + }, + { + "epoch": 0.20563921400704158, + "grad_norm": 2.153806447982788, + "learning_rate": 4.496221402583024e-05, + "loss": 4.3102, + "step": 34577 + }, + { + "epoch": 0.2056451612903226, + "grad_norm": 2.2217137813568115, + "learning_rate": 4.496193282436455e-05, + "loss": 3.6145, + "step": 34578 + }, + { + "epoch": 0.20565110857360358, + "grad_norm": 2.1557652950286865, + "learning_rate": 4.496165161593035e-05, + "loss": 4.4684, + "step": 34579 + }, + { + "epoch": 0.20565705585688457, + "grad_norm": 1.7254935503005981, + "learning_rate": 4.496137040052773e-05, + "loss": 5.1948, + "step": 34580 + }, + { + "epoch": 0.20566300314016558, + "grad_norm": 1.663953185081482, + "learning_rate": 4.496108917815682e-05, + "loss": 5.0254, + "step": 34581 + }, + { + "epoch": 0.20566895042344657, + "grad_norm": 1.463458776473999, + "learning_rate": 4.4960807948817695e-05, + "loss": 4.8313, + "step": 34582 + }, + { + "epoch": 0.20567489770672756, + "grad_norm": 2.062659740447998, + "learning_rate": 4.4960526712510456e-05, + "loss": 4.5435, + "step": 34583 + }, + { + "epoch": 0.20568084499000858, + "grad_norm": 2.5706655979156494, + "learning_rate": 4.4960245469235206e-05, + "loss": 3.9599, + "step": 34584 + }, + { + "epoch": 0.20568679227328956, + "grad_norm": 1.9307024478912354, + "learning_rate": 4.495996421899204e-05, + "loss": 4.274, + "step": 34585 + }, + { + "epoch": 0.20569273955657055, + "grad_norm": 1.5417910814285278, + "learning_rate": 4.4959682961781056e-05, + "loss": 4.8377, + "step": 34586 + }, + { + "epoch": 0.20569868683985157, + "grad_norm": 1.7265223264694214, + "learning_rate": 4.4959401697602354e-05, + "loss": 4.7724, + "step": 34587 + }, + { + "epoch": 0.20570463412313256, + "grad_norm": 1.8895970582962036, + "learning_rate": 4.4959120426456026e-05, + "loss": 4.2129, + "step": 34588 + }, + { + "epoch": 0.20571058140641355, + "grad_norm": 1.9905476570129395, + "learning_rate": 4.495883914834218e-05, + "loss": 4.8612, + "step": 34589 + }, + { + "epoch": 0.20571652868969456, + "grad_norm": 1.8988146781921387, + "learning_rate": 4.495855786326091e-05, + "loss": 4.8655, + "step": 34590 + }, + { + "epoch": 0.20572247597297555, + "grad_norm": 1.9545416831970215, + "learning_rate": 4.495827657121231e-05, + "loss": 4.724, + "step": 34591 + }, + { + "epoch": 0.20572842325625654, + "grad_norm": 1.9015052318572998, + "learning_rate": 4.495799527219648e-05, + "loss": 4.8043, + "step": 34592 + }, + { + "epoch": 0.20573437053953753, + "grad_norm": 1.8014155626296997, + "learning_rate": 4.495771396621353e-05, + "loss": 4.6953, + "step": 34593 + }, + { + "epoch": 0.20574031782281854, + "grad_norm": 1.8659149408340454, + "learning_rate": 4.495743265326354e-05, + "loss": 4.5499, + "step": 34594 + }, + { + "epoch": 0.20574626510609953, + "grad_norm": 1.817817211151123, + "learning_rate": 4.495715133334662e-05, + "loss": 4.8158, + "step": 34595 + }, + { + "epoch": 0.20575221238938052, + "grad_norm": 2.685556411743164, + "learning_rate": 4.495687000646286e-05, + "loss": 3.3173, + "step": 34596 + }, + { + "epoch": 0.20575815967266153, + "grad_norm": 1.909069299697876, + "learning_rate": 4.495658867261237e-05, + "loss": 4.7473, + "step": 34597 + }, + { + "epoch": 0.20576410695594252, + "grad_norm": 1.9138617515563965, + "learning_rate": 4.495630733179524e-05, + "loss": 4.7425, + "step": 34598 + }, + { + "epoch": 0.2057700542392235, + "grad_norm": 1.8828593492507935, + "learning_rate": 4.495602598401156e-05, + "loss": 4.6867, + "step": 34599 + }, + { + "epoch": 0.20577600152250453, + "grad_norm": 1.8820278644561768, + "learning_rate": 4.4955744629261455e-05, + "loss": 4.6925, + "step": 34600 + }, + { + "epoch": 0.20578194880578551, + "grad_norm": 1.833011269569397, + "learning_rate": 4.495546326754499e-05, + "loss": 4.6976, + "step": 34601 + }, + { + "epoch": 0.2057878960890665, + "grad_norm": 1.8647735118865967, + "learning_rate": 4.4955181898862284e-05, + "loss": 4.7463, + "step": 34602 + }, + { + "epoch": 0.20579384337234752, + "grad_norm": 1.8595027923583984, + "learning_rate": 4.495490052321343e-05, + "loss": 4.8303, + "step": 34603 + }, + { + "epoch": 0.2057997906556285, + "grad_norm": 1.90599524974823, + "learning_rate": 4.495461914059853e-05, + "loss": 4.5668, + "step": 34604 + }, + { + "epoch": 0.2058057379389095, + "grad_norm": 1.754970908164978, + "learning_rate": 4.4954337751017675e-05, + "loss": 4.4404, + "step": 34605 + }, + { + "epoch": 0.2058116852221905, + "grad_norm": 1.9905904531478882, + "learning_rate": 4.4954056354470966e-05, + "loss": 4.2951, + "step": 34606 + }, + { + "epoch": 0.2058176325054715, + "grad_norm": 1.8667941093444824, + "learning_rate": 4.49537749509585e-05, + "loss": 4.5369, + "step": 34607 + }, + { + "epoch": 0.2058235797887525, + "grad_norm": 1.883657455444336, + "learning_rate": 4.4953493540480384e-05, + "loss": 4.751, + "step": 34608 + }, + { + "epoch": 0.2058295270720335, + "grad_norm": 1.8843159675598145, + "learning_rate": 4.49532121230367e-05, + "loss": 4.7321, + "step": 34609 + }, + { + "epoch": 0.2058354743553145, + "grad_norm": 1.9638304710388184, + "learning_rate": 4.495293069862756e-05, + "loss": 4.7074, + "step": 34610 + }, + { + "epoch": 0.20584142163859548, + "grad_norm": 1.865575909614563, + "learning_rate": 4.4952649267253065e-05, + "loss": 4.7712, + "step": 34611 + }, + { + "epoch": 0.2058473689218765, + "grad_norm": 1.781821608543396, + "learning_rate": 4.49523678289133e-05, + "loss": 5.0408, + "step": 34612 + }, + { + "epoch": 0.20585331620515748, + "grad_norm": 1.908469796180725, + "learning_rate": 4.495208638360837e-05, + "loss": 5.2425, + "step": 34613 + }, + { + "epoch": 0.20585926348843847, + "grad_norm": 1.6282795667648315, + "learning_rate": 4.495180493133837e-05, + "loss": 4.9834, + "step": 34614 + }, + { + "epoch": 0.2058652107717195, + "grad_norm": 1.9788955450057983, + "learning_rate": 4.495152347210341e-05, + "loss": 4.3966, + "step": 34615 + }, + { + "epoch": 0.20587115805500047, + "grad_norm": 1.7629202604293823, + "learning_rate": 4.495124200590357e-05, + "loss": 4.8765, + "step": 34616 + }, + { + "epoch": 0.20587710533828146, + "grad_norm": 1.6397231817245483, + "learning_rate": 4.495096053273896e-05, + "loss": 4.8986, + "step": 34617 + }, + { + "epoch": 0.20588305262156248, + "grad_norm": 2.045825481414795, + "learning_rate": 4.4950679052609676e-05, + "loss": 4.2573, + "step": 34618 + }, + { + "epoch": 0.20588899990484347, + "grad_norm": 2.2558441162109375, + "learning_rate": 4.495039756551582e-05, + "loss": 4.7439, + "step": 34619 + }, + { + "epoch": 0.20589494718812446, + "grad_norm": 1.8919719457626343, + "learning_rate": 4.495011607145748e-05, + "loss": 4.2334, + "step": 34620 + }, + { + "epoch": 0.20590089447140547, + "grad_norm": 1.94011652469635, + "learning_rate": 4.4949834570434766e-05, + "loss": 4.5599, + "step": 34621 + }, + { + "epoch": 0.20590684175468646, + "grad_norm": 1.7179418802261353, + "learning_rate": 4.494955306244777e-05, + "loss": 4.3644, + "step": 34622 + }, + { + "epoch": 0.20591278903796745, + "grad_norm": 1.7644224166870117, + "learning_rate": 4.494927154749659e-05, + "loss": 4.5259, + "step": 34623 + }, + { + "epoch": 0.20591873632124846, + "grad_norm": 1.7699881792068481, + "learning_rate": 4.494899002558133e-05, + "loss": 4.3367, + "step": 34624 + }, + { + "epoch": 0.20592468360452945, + "grad_norm": 1.8351449966430664, + "learning_rate": 4.494870849670207e-05, + "loss": 4.3265, + "step": 34625 + }, + { + "epoch": 0.20593063088781044, + "grad_norm": 1.7717596292495728, + "learning_rate": 4.494842696085894e-05, + "loss": 4.6112, + "step": 34626 + }, + { + "epoch": 0.20593657817109146, + "grad_norm": 1.813578486442566, + "learning_rate": 4.494814541805201e-05, + "loss": 4.2206, + "step": 34627 + }, + { + "epoch": 0.20594252545437244, + "grad_norm": 1.9783309698104858, + "learning_rate": 4.494786386828139e-05, + "loss": 4.3013, + "step": 34628 + }, + { + "epoch": 0.20594847273765343, + "grad_norm": 2.3097033500671387, + "learning_rate": 4.4947582311547175e-05, + "loss": 2.6195, + "step": 34629 + }, + { + "epoch": 0.20595442002093445, + "grad_norm": 2.7069907188415527, + "learning_rate": 4.4947300747849464e-05, + "loss": 2.0729, + "step": 34630 + }, + { + "epoch": 0.20596036730421544, + "grad_norm": 3.0851471424102783, + "learning_rate": 4.494701917718836e-05, + "loss": 3.1709, + "step": 34631 + }, + { + "epoch": 0.20596631458749642, + "grad_norm": 2.7397348880767822, + "learning_rate": 4.494673759956396e-05, + "loss": 2.7993, + "step": 34632 + }, + { + "epoch": 0.20597226187077744, + "grad_norm": 1.8076826333999634, + "learning_rate": 4.4946456014976356e-05, + "loss": 3.9575, + "step": 34633 + }, + { + "epoch": 0.20597820915405843, + "grad_norm": 1.8170053958892822, + "learning_rate": 4.494617442342565e-05, + "loss": 4.4885, + "step": 34634 + }, + { + "epoch": 0.20598415643733942, + "grad_norm": 2.324014186859131, + "learning_rate": 4.494589282491194e-05, + "loss": 1.5578, + "step": 34635 + }, + { + "epoch": 0.20599010372062043, + "grad_norm": 3.2078003883361816, + "learning_rate": 4.4945611219435326e-05, + "loss": 2.1456, + "step": 34636 + }, + { + "epoch": 0.20599605100390142, + "grad_norm": 2.683720827102661, + "learning_rate": 4.49453296069959e-05, + "loss": 2.3518, + "step": 34637 + }, + { + "epoch": 0.2060019982871824, + "grad_norm": 2.6722137928009033, + "learning_rate": 4.494504798759378e-05, + "loss": 2.8547, + "step": 34638 + }, + { + "epoch": 0.20600794557046342, + "grad_norm": 2.827223539352417, + "learning_rate": 4.494476636122904e-05, + "loss": 2.045, + "step": 34639 + }, + { + "epoch": 0.2060138928537444, + "grad_norm": 1.9061033725738525, + "learning_rate": 4.494448472790179e-05, + "loss": 4.0358, + "step": 34640 + }, + { + "epoch": 0.2060198401370254, + "grad_norm": 1.6916943788528442, + "learning_rate": 4.4944203087612125e-05, + "loss": 4.9196, + "step": 34641 + }, + { + "epoch": 0.20602578742030642, + "grad_norm": 1.6783428192138672, + "learning_rate": 4.494392144036014e-05, + "loss": 5.0467, + "step": 34642 + }, + { + "epoch": 0.2060317347035874, + "grad_norm": 2.8061981201171875, + "learning_rate": 4.494363978614594e-05, + "loss": 3.4047, + "step": 34643 + }, + { + "epoch": 0.2060376819868684, + "grad_norm": 2.3837738037109375, + "learning_rate": 4.4943358124969634e-05, + "loss": 3.3825, + "step": 34644 + }, + { + "epoch": 0.2060436292701494, + "grad_norm": 1.9662126302719116, + "learning_rate": 4.49430764568313e-05, + "loss": 5.0622, + "step": 34645 + }, + { + "epoch": 0.2060495765534304, + "grad_norm": 1.7634485960006714, + "learning_rate": 4.494279478173103e-05, + "loss": 5.2412, + "step": 34646 + }, + { + "epoch": 0.20605552383671139, + "grad_norm": 1.5668162107467651, + "learning_rate": 4.494251309966896e-05, + "loss": 5.1638, + "step": 34647 + }, + { + "epoch": 0.2060614711199924, + "grad_norm": 1.904152512550354, + "learning_rate": 4.4942231410645154e-05, + "loss": 4.5813, + "step": 34648 + }, + { + "epoch": 0.2060674184032734, + "grad_norm": 1.5555822849273682, + "learning_rate": 4.4941949714659714e-05, + "loss": 4.9484, + "step": 34649 + }, + { + "epoch": 0.20607336568655438, + "grad_norm": 1.6825793981552124, + "learning_rate": 4.4941668011712754e-05, + "loss": 4.9136, + "step": 34650 + }, + { + "epoch": 0.20607931296983537, + "grad_norm": 1.7734719514846802, + "learning_rate": 4.494138630180437e-05, + "loss": 4.5992, + "step": 34651 + }, + { + "epoch": 0.20608526025311638, + "grad_norm": 1.7022820711135864, + "learning_rate": 4.494110458493464e-05, + "loss": 4.5084, + "step": 34652 + }, + { + "epoch": 0.20609120753639737, + "grad_norm": 1.6979751586914062, + "learning_rate": 4.4940822861103684e-05, + "loss": 5.0131, + "step": 34653 + }, + { + "epoch": 0.20609715481967836, + "grad_norm": 2.1307291984558105, + "learning_rate": 4.494054113031159e-05, + "loss": 4.6225, + "step": 34654 + }, + { + "epoch": 0.20610310210295937, + "grad_norm": 1.5449354648590088, + "learning_rate": 4.494025939255846e-05, + "loss": 4.8424, + "step": 34655 + }, + { + "epoch": 0.20610904938624036, + "grad_norm": 1.623788833618164, + "learning_rate": 4.493997764784439e-05, + "loss": 4.9259, + "step": 34656 + }, + { + "epoch": 0.20611499666952135, + "grad_norm": 1.5342544317245483, + "learning_rate": 4.4939695896169483e-05, + "loss": 4.7405, + "step": 34657 + }, + { + "epoch": 0.20612094395280237, + "grad_norm": 2.769473075866699, + "learning_rate": 4.493941413753383e-05, + "loss": 3.4611, + "step": 34658 + }, + { + "epoch": 0.20612689123608335, + "grad_norm": 1.6174215078353882, + "learning_rate": 4.493913237193753e-05, + "loss": 4.5891, + "step": 34659 + }, + { + "epoch": 0.20613283851936434, + "grad_norm": 1.6780261993408203, + "learning_rate": 4.49388505993807e-05, + "loss": 4.5571, + "step": 34660 + }, + { + "epoch": 0.20613878580264536, + "grad_norm": 1.503455638885498, + "learning_rate": 4.493856881986341e-05, + "loss": 4.7737, + "step": 34661 + }, + { + "epoch": 0.20614473308592635, + "grad_norm": 1.4152953624725342, + "learning_rate": 4.493828703338577e-05, + "loss": 4.2979, + "step": 34662 + }, + { + "epoch": 0.20615068036920733, + "grad_norm": 1.6800228357315063, + "learning_rate": 4.4938005239947896e-05, + "loss": 4.4647, + "step": 34663 + }, + { + "epoch": 0.20615662765248835, + "grad_norm": 1.8049077987670898, + "learning_rate": 4.4937723439549857e-05, + "loss": 4.7413, + "step": 34664 + }, + { + "epoch": 0.20616257493576934, + "grad_norm": 1.651145577430725, + "learning_rate": 4.493744163219177e-05, + "loss": 5.3694, + "step": 34665 + }, + { + "epoch": 0.20616852221905033, + "grad_norm": 1.7472602128982544, + "learning_rate": 4.493715981787372e-05, + "loss": 4.83, + "step": 34666 + }, + { + "epoch": 0.20617446950233134, + "grad_norm": 1.6628087759017944, + "learning_rate": 4.493687799659583e-05, + "loss": 4.6301, + "step": 34667 + }, + { + "epoch": 0.20618041678561233, + "grad_norm": 1.5734481811523438, + "learning_rate": 4.493659616835816e-05, + "loss": 4.577, + "step": 34668 + }, + { + "epoch": 0.20618636406889332, + "grad_norm": 1.6595523357391357, + "learning_rate": 4.4936314333160844e-05, + "loss": 4.5683, + "step": 34669 + }, + { + "epoch": 0.20619231135217433, + "grad_norm": 1.8388851881027222, + "learning_rate": 4.4936032491003965e-05, + "loss": 4.9936, + "step": 34670 + }, + { + "epoch": 0.20619825863545532, + "grad_norm": 2.1026453971862793, + "learning_rate": 4.493575064188762e-05, + "loss": 5.5576, + "step": 34671 + }, + { + "epoch": 0.2062042059187363, + "grad_norm": 1.7481777667999268, + "learning_rate": 4.4935468785811916e-05, + "loss": 5.2993, + "step": 34672 + }, + { + "epoch": 0.20621015320201733, + "grad_norm": 1.7570959329605103, + "learning_rate": 4.493518692277694e-05, + "loss": 5.2391, + "step": 34673 + }, + { + "epoch": 0.20621610048529831, + "grad_norm": 1.5971003770828247, + "learning_rate": 4.49349050527828e-05, + "loss": 5.2646, + "step": 34674 + }, + { + "epoch": 0.2062220477685793, + "grad_norm": 1.7364708185195923, + "learning_rate": 4.493462317582959e-05, + "loss": 4.7422, + "step": 34675 + }, + { + "epoch": 0.20622799505186032, + "grad_norm": 3.195284128189087, + "learning_rate": 4.493434129191741e-05, + "loss": 3.3892, + "step": 34676 + }, + { + "epoch": 0.2062339423351413, + "grad_norm": 2.5080318450927734, + "learning_rate": 4.493405940104636e-05, + "loss": 4.6867, + "step": 34677 + }, + { + "epoch": 0.2062398896184223, + "grad_norm": 1.9916131496429443, + "learning_rate": 4.4933777503216525e-05, + "loss": 4.8544, + "step": 34678 + }, + { + "epoch": 0.2062458369017033, + "grad_norm": 3.0195977687835693, + "learning_rate": 4.4933495598428024e-05, + "loss": 2.5483, + "step": 34679 + }, + { + "epoch": 0.2062517841849843, + "grad_norm": 2.745537042617798, + "learning_rate": 4.4933213686680944e-05, + "loss": 3.6633, + "step": 34680 + }, + { + "epoch": 0.2062577314682653, + "grad_norm": 2.856989860534668, + "learning_rate": 4.493293176797538e-05, + "loss": 4.8289, + "step": 34681 + }, + { + "epoch": 0.2062636787515463, + "grad_norm": 4.457959175109863, + "learning_rate": 4.4932649842311435e-05, + "loss": 4.8733, + "step": 34682 + }, + { + "epoch": 0.2062696260348273, + "grad_norm": 2.740654230117798, + "learning_rate": 4.493236790968921e-05, + "loss": 4.4471, + "step": 34683 + }, + { + "epoch": 0.20627557331810828, + "grad_norm": 1.9697023630142212, + "learning_rate": 4.493208597010881e-05, + "loss": 4.4508, + "step": 34684 + }, + { + "epoch": 0.2062815206013893, + "grad_norm": 2.9418060779571533, + "learning_rate": 4.493180402357031e-05, + "loss": 4.0835, + "step": 34685 + }, + { + "epoch": 0.20628746788467028, + "grad_norm": 2.65594744682312, + "learning_rate": 4.493152207007383e-05, + "loss": 3.7277, + "step": 34686 + }, + { + "epoch": 0.20629341516795127, + "grad_norm": 2.2085533142089844, + "learning_rate": 4.4931240109619464e-05, + "loss": 3.4425, + "step": 34687 + }, + { + "epoch": 0.2062993624512323, + "grad_norm": 2.824538230895996, + "learning_rate": 4.493095814220731e-05, + "loss": 4.2863, + "step": 34688 + }, + { + "epoch": 0.20630530973451328, + "grad_norm": 1.7613264322280884, + "learning_rate": 4.493067616783746e-05, + "loss": 4.3662, + "step": 34689 + }, + { + "epoch": 0.20631125701779426, + "grad_norm": 2.478560209274292, + "learning_rate": 4.493039418651002e-05, + "loss": 4.4783, + "step": 34690 + }, + { + "epoch": 0.20631720430107528, + "grad_norm": 2.1893646717071533, + "learning_rate": 4.493011219822508e-05, + "loss": 4.6626, + "step": 34691 + }, + { + "epoch": 0.20632315158435627, + "grad_norm": 2.2104086875915527, + "learning_rate": 4.4929830202982745e-05, + "loss": 4.2291, + "step": 34692 + }, + { + "epoch": 0.20632909886763726, + "grad_norm": 2.0803580284118652, + "learning_rate": 4.492954820078312e-05, + "loss": 3.3411, + "step": 34693 + }, + { + "epoch": 0.20633504615091827, + "grad_norm": 2.582167387008667, + "learning_rate": 4.492926619162629e-05, + "loss": 3.3111, + "step": 34694 + }, + { + "epoch": 0.20634099343419926, + "grad_norm": 2.3166258335113525, + "learning_rate": 4.4928984175512354e-05, + "loss": 3.4107, + "step": 34695 + }, + { + "epoch": 0.20634694071748025, + "grad_norm": 2.4472901821136475, + "learning_rate": 4.492870215244142e-05, + "loss": 3.6275, + "step": 34696 + }, + { + "epoch": 0.20635288800076126, + "grad_norm": 2.252182960510254, + "learning_rate": 4.4928420122413584e-05, + "loss": 3.8503, + "step": 34697 + }, + { + "epoch": 0.20635883528404225, + "grad_norm": 1.9681614637374878, + "learning_rate": 4.4928138085428936e-05, + "loss": 3.9637, + "step": 34698 + }, + { + "epoch": 0.20636478256732324, + "grad_norm": 2.356943130493164, + "learning_rate": 4.4927856041487586e-05, + "loss": 3.6805, + "step": 34699 + }, + { + "epoch": 0.20637072985060426, + "grad_norm": 2.424372673034668, + "learning_rate": 4.4927573990589636e-05, + "loss": 3.3821, + "step": 34700 + }, + { + "epoch": 0.20637667713388524, + "grad_norm": 2.569279432296753, + "learning_rate": 4.492729193273516e-05, + "loss": 3.494, + "step": 34701 + }, + { + "epoch": 0.20638262441716623, + "grad_norm": 2.154430627822876, + "learning_rate": 4.492700986792427e-05, + "loss": 3.4451, + "step": 34702 + }, + { + "epoch": 0.20638857170044725, + "grad_norm": 2.0662691593170166, + "learning_rate": 4.4926727796157084e-05, + "loss": 3.5951, + "step": 34703 + }, + { + "epoch": 0.20639451898372824, + "grad_norm": 1.9252958297729492, + "learning_rate": 4.4926445717433674e-05, + "loss": 4.6183, + "step": 34704 + }, + { + "epoch": 0.20640046626700922, + "grad_norm": 1.816887378692627, + "learning_rate": 4.4926163631754146e-05, + "loss": 5.0774, + "step": 34705 + }, + { + "epoch": 0.20640641355029024, + "grad_norm": 1.710744023323059, + "learning_rate": 4.4925881539118606e-05, + "loss": 5.0954, + "step": 34706 + }, + { + "epoch": 0.20641236083357123, + "grad_norm": 1.6318283081054688, + "learning_rate": 4.4925599439527144e-05, + "loss": 4.4587, + "step": 34707 + }, + { + "epoch": 0.20641830811685222, + "grad_norm": 1.6551474332809448, + "learning_rate": 4.4925317332979854e-05, + "loss": 4.2257, + "step": 34708 + }, + { + "epoch": 0.2064242554001332, + "grad_norm": 2.1529228687286377, + "learning_rate": 4.492503521947685e-05, + "loss": 4.0616, + "step": 34709 + }, + { + "epoch": 0.20643020268341422, + "grad_norm": 1.7376173734664917, + "learning_rate": 4.4924753099018225e-05, + "loss": 4.9182, + "step": 34710 + }, + { + "epoch": 0.2064361499666952, + "grad_norm": 1.8715314865112305, + "learning_rate": 4.492447097160407e-05, + "loss": 4.7435, + "step": 34711 + }, + { + "epoch": 0.2064420972499762, + "grad_norm": 1.711469292640686, + "learning_rate": 4.4924188837234483e-05, + "loss": 4.5403, + "step": 34712 + }, + { + "epoch": 0.2064480445332572, + "grad_norm": 1.5449049472808838, + "learning_rate": 4.492390669590957e-05, + "loss": 4.4102, + "step": 34713 + }, + { + "epoch": 0.2064539918165382, + "grad_norm": 1.852977991104126, + "learning_rate": 4.492362454762943e-05, + "loss": 4.394, + "step": 34714 + }, + { + "epoch": 0.2064599390998192, + "grad_norm": 1.880318284034729, + "learning_rate": 4.492334239239416e-05, + "loss": 4.3825, + "step": 34715 + }, + { + "epoch": 0.2064658863831002, + "grad_norm": 1.7306921482086182, + "learning_rate": 4.492306023020385e-05, + "loss": 4.2197, + "step": 34716 + }, + { + "epoch": 0.2064718336663812, + "grad_norm": 1.637911081314087, + "learning_rate": 4.492277806105861e-05, + "loss": 4.3934, + "step": 34717 + }, + { + "epoch": 0.20647778094966218, + "grad_norm": 1.708601713180542, + "learning_rate": 4.492249588495854e-05, + "loss": 4.3054, + "step": 34718 + }, + { + "epoch": 0.2064837282329432, + "grad_norm": 1.9779586791992188, + "learning_rate": 4.492221370190373e-05, + "loss": 4.2863, + "step": 34719 + }, + { + "epoch": 0.20648967551622419, + "grad_norm": 2.324136734008789, + "learning_rate": 4.492193151189427e-05, + "loss": 3.4542, + "step": 34720 + }, + { + "epoch": 0.20649562279950517, + "grad_norm": 2.028463125228882, + "learning_rate": 4.492164931493028e-05, + "loss": 4.1182, + "step": 34721 + }, + { + "epoch": 0.2065015700827862, + "grad_norm": 2.0588998794555664, + "learning_rate": 4.492136711101185e-05, + "loss": 4.8401, + "step": 34722 + }, + { + "epoch": 0.20650751736606718, + "grad_norm": 1.6144108772277832, + "learning_rate": 4.492108490013906e-05, + "loss": 4.666, + "step": 34723 + }, + { + "epoch": 0.20651346464934817, + "grad_norm": 2.0475502014160156, + "learning_rate": 4.4920802682312047e-05, + "loss": 4.8169, + "step": 34724 + }, + { + "epoch": 0.20651941193262918, + "grad_norm": 2.2128946781158447, + "learning_rate": 4.492052045753088e-05, + "loss": 4.6387, + "step": 34725 + }, + { + "epoch": 0.20652535921591017, + "grad_norm": 1.4781862497329712, + "learning_rate": 4.4920238225795654e-05, + "loss": 4.9999, + "step": 34726 + }, + { + "epoch": 0.20653130649919116, + "grad_norm": 1.5465887784957886, + "learning_rate": 4.491995598710649e-05, + "loss": 4.9641, + "step": 34727 + }, + { + "epoch": 0.20653725378247217, + "grad_norm": 1.8786133527755737, + "learning_rate": 4.491967374146347e-05, + "loss": 4.4062, + "step": 34728 + }, + { + "epoch": 0.20654320106575316, + "grad_norm": 1.8625175952911377, + "learning_rate": 4.49193914888667e-05, + "loss": 4.7282, + "step": 34729 + }, + { + "epoch": 0.20654914834903415, + "grad_norm": 1.958048939704895, + "learning_rate": 4.4919109229316274e-05, + "loss": 4.8336, + "step": 34730 + }, + { + "epoch": 0.20655509563231517, + "grad_norm": 1.724219799041748, + "learning_rate": 4.49188269628123e-05, + "loss": 4.3932, + "step": 34731 + }, + { + "epoch": 0.20656104291559615, + "grad_norm": 1.767488718032837, + "learning_rate": 4.491854468935486e-05, + "loss": 4.8139, + "step": 34732 + }, + { + "epoch": 0.20656699019887714, + "grad_norm": 1.734523892402649, + "learning_rate": 4.491826240894407e-05, + "loss": 4.5022, + "step": 34733 + }, + { + "epoch": 0.20657293748215816, + "grad_norm": 1.702898383140564, + "learning_rate": 4.491798012158002e-05, + "loss": 5.2369, + "step": 34734 + }, + { + "epoch": 0.20657888476543915, + "grad_norm": 1.6671706438064575, + "learning_rate": 4.4917697827262795e-05, + "loss": 5.3677, + "step": 34735 + }, + { + "epoch": 0.20658483204872014, + "grad_norm": 1.6979637145996094, + "learning_rate": 4.4917415525992524e-05, + "loss": 5.3411, + "step": 34736 + }, + { + "epoch": 0.20659077933200115, + "grad_norm": 1.7467466592788696, + "learning_rate": 4.491713321776928e-05, + "loss": 4.5008, + "step": 34737 + }, + { + "epoch": 0.20659672661528214, + "grad_norm": 1.5151604413986206, + "learning_rate": 4.491685090259318e-05, + "loss": 5.1486, + "step": 34738 + }, + { + "epoch": 0.20660267389856313, + "grad_norm": 1.8055251836776733, + "learning_rate": 4.49165685804643e-05, + "loss": 4.9527, + "step": 34739 + }, + { + "epoch": 0.20660862118184414, + "grad_norm": 1.7542595863342285, + "learning_rate": 4.4916286251382754e-05, + "loss": 4.9497, + "step": 34740 + }, + { + "epoch": 0.20661456846512513, + "grad_norm": 1.7868531942367554, + "learning_rate": 4.4916003915348645e-05, + "loss": 4.8182, + "step": 34741 + }, + { + "epoch": 0.20662051574840612, + "grad_norm": 1.7146828174591064, + "learning_rate": 4.491572157236206e-05, + "loss": 4.4512, + "step": 34742 + }, + { + "epoch": 0.20662646303168714, + "grad_norm": 1.6494626998901367, + "learning_rate": 4.491543922242311e-05, + "loss": 4.4872, + "step": 34743 + }, + { + "epoch": 0.20663241031496812, + "grad_norm": 1.9803482294082642, + "learning_rate": 4.4915156865531875e-05, + "loss": 4.8061, + "step": 34744 + }, + { + "epoch": 0.2066383575982491, + "grad_norm": 2.0528030395507812, + "learning_rate": 4.4914874501688475e-05, + "loss": 4.4076, + "step": 34745 + }, + { + "epoch": 0.20664430488153013, + "grad_norm": 1.5636694431304932, + "learning_rate": 4.491459213089299e-05, + "loss": 5.0246, + "step": 34746 + }, + { + "epoch": 0.20665025216481112, + "grad_norm": 2.5548834800720215, + "learning_rate": 4.4914309753145534e-05, + "loss": 3.7054, + "step": 34747 + }, + { + "epoch": 0.2066561994480921, + "grad_norm": 2.4566895961761475, + "learning_rate": 4.491402736844619e-05, + "loss": 3.5679, + "step": 34748 + }, + { + "epoch": 0.20666214673137312, + "grad_norm": 1.9277645349502563, + "learning_rate": 4.491374497679507e-05, + "loss": 4.3468, + "step": 34749 + }, + { + "epoch": 0.2066680940146541, + "grad_norm": 1.8425731658935547, + "learning_rate": 4.4913462578192265e-05, + "loss": 4.792, + "step": 34750 + }, + { + "epoch": 0.2066740412979351, + "grad_norm": 1.7215994596481323, + "learning_rate": 4.491318017263788e-05, + "loss": 5.2611, + "step": 34751 + }, + { + "epoch": 0.2066799885812161, + "grad_norm": 1.879885196685791, + "learning_rate": 4.491289776013201e-05, + "loss": 5.0435, + "step": 34752 + }, + { + "epoch": 0.2066859358644971, + "grad_norm": 2.316704511642456, + "learning_rate": 4.491261534067475e-05, + "loss": 4.434, + "step": 34753 + }, + { + "epoch": 0.2066918831477781, + "grad_norm": 2.6675474643707275, + "learning_rate": 4.4912332914266195e-05, + "loss": 4.4805, + "step": 34754 + }, + { + "epoch": 0.2066978304310591, + "grad_norm": 2.7434020042419434, + "learning_rate": 4.4912050480906455e-05, + "loss": 3.8732, + "step": 34755 + }, + { + "epoch": 0.2067037777143401, + "grad_norm": 2.0465853214263916, + "learning_rate": 4.4911768040595624e-05, + "loss": 3.4234, + "step": 34756 + }, + { + "epoch": 0.20670972499762108, + "grad_norm": 2.282705307006836, + "learning_rate": 4.4911485593333804e-05, + "loss": 3.4257, + "step": 34757 + }, + { + "epoch": 0.2067156722809021, + "grad_norm": 2.1085431575775146, + "learning_rate": 4.491120313912109e-05, + "loss": 3.1277, + "step": 34758 + }, + { + "epoch": 0.20672161956418308, + "grad_norm": 2.307992935180664, + "learning_rate": 4.491092067795758e-05, + "loss": 2.9563, + "step": 34759 + }, + { + "epoch": 0.20672756684746407, + "grad_norm": 1.7869884967803955, + "learning_rate": 4.491063820984337e-05, + "loss": 3.4671, + "step": 34760 + }, + { + "epoch": 0.2067335141307451, + "grad_norm": 1.573107361793518, + "learning_rate": 4.4910355734778564e-05, + "loss": 4.6225, + "step": 34761 + }, + { + "epoch": 0.20673946141402608, + "grad_norm": 1.8124967813491821, + "learning_rate": 4.491007325276326e-05, + "loss": 4.8808, + "step": 34762 + }, + { + "epoch": 0.20674540869730706, + "grad_norm": 2.266270875930786, + "learning_rate": 4.4909790763797555e-05, + "loss": 4.1334, + "step": 34763 + }, + { + "epoch": 0.20675135598058808, + "grad_norm": 2.0331921577453613, + "learning_rate": 4.4909508267881545e-05, + "loss": 4.9498, + "step": 34764 + }, + { + "epoch": 0.20675730326386907, + "grad_norm": 1.7160965204238892, + "learning_rate": 4.4909225765015325e-05, + "loss": 4.5748, + "step": 34765 + }, + { + "epoch": 0.20676325054715006, + "grad_norm": 1.5300441980361938, + "learning_rate": 4.490894325519901e-05, + "loss": 4.2806, + "step": 34766 + }, + { + "epoch": 0.20676919783043107, + "grad_norm": 2.390836477279663, + "learning_rate": 4.490866073843269e-05, + "loss": 3.4529, + "step": 34767 + }, + { + "epoch": 0.20677514511371206, + "grad_norm": 1.91972017288208, + "learning_rate": 4.4908378214716454e-05, + "loss": 4.4385, + "step": 34768 + }, + { + "epoch": 0.20678109239699305, + "grad_norm": 1.836112141609192, + "learning_rate": 4.4908095684050416e-05, + "loss": 4.6575, + "step": 34769 + }, + { + "epoch": 0.20678703968027404, + "grad_norm": 1.7108503580093384, + "learning_rate": 4.490781314643466e-05, + "loss": 4.4053, + "step": 34770 + }, + { + "epoch": 0.20679298696355505, + "grad_norm": 1.6383551359176636, + "learning_rate": 4.49075306018693e-05, + "loss": 4.5103, + "step": 34771 + }, + { + "epoch": 0.20679893424683604, + "grad_norm": 1.7861992120742798, + "learning_rate": 4.490724805035442e-05, + "loss": 4.5834, + "step": 34772 + }, + { + "epoch": 0.20680488153011703, + "grad_norm": 1.6550997495651245, + "learning_rate": 4.490696549189014e-05, + "loss": 4.4976, + "step": 34773 + }, + { + "epoch": 0.20681082881339805, + "grad_norm": 1.8998942375183105, + "learning_rate": 4.4906682926476525e-05, + "loss": 4.5288, + "step": 34774 + }, + { + "epoch": 0.20681677609667903, + "grad_norm": 1.920011281967163, + "learning_rate": 4.4906400354113705e-05, + "loss": 4.4106, + "step": 34775 + }, + { + "epoch": 0.20682272337996002, + "grad_norm": 1.5240533351898193, + "learning_rate": 4.490611777480176e-05, + "loss": 4.868, + "step": 34776 + }, + { + "epoch": 0.20682867066324104, + "grad_norm": 1.8516569137573242, + "learning_rate": 4.49058351885408e-05, + "loss": 4.6862, + "step": 34777 + }, + { + "epoch": 0.20683461794652203, + "grad_norm": 1.8184990882873535, + "learning_rate": 4.4905552595330915e-05, + "loss": 4.5043, + "step": 34778 + }, + { + "epoch": 0.20684056522980301, + "grad_norm": 1.9880046844482422, + "learning_rate": 4.490526999517221e-05, + "loss": 4.2611, + "step": 34779 + }, + { + "epoch": 0.20684651251308403, + "grad_norm": 2.5457332134246826, + "learning_rate": 4.490498738806478e-05, + "loss": 4.0233, + "step": 34780 + }, + { + "epoch": 0.20685245979636502, + "grad_norm": 2.4234964847564697, + "learning_rate": 4.490470477400872e-05, + "loss": 3.9144, + "step": 34781 + }, + { + "epoch": 0.206858407079646, + "grad_norm": 2.0977954864501953, + "learning_rate": 4.490442215300413e-05, + "loss": 3.8256, + "step": 34782 + }, + { + "epoch": 0.20686435436292702, + "grad_norm": 2.3387715816497803, + "learning_rate": 4.490413952505113e-05, + "loss": 3.7419, + "step": 34783 + }, + { + "epoch": 0.206870301646208, + "grad_norm": 1.8677074909210205, + "learning_rate": 4.490385689014978e-05, + "loss": 5.0754, + "step": 34784 + }, + { + "epoch": 0.206876248929489, + "grad_norm": 1.5382182598114014, + "learning_rate": 4.490357424830021e-05, + "loss": 5.3484, + "step": 34785 + }, + { + "epoch": 0.20688219621277001, + "grad_norm": 1.6211512088775635, + "learning_rate": 4.4903291599502506e-05, + "loss": 5.2409, + "step": 34786 + }, + { + "epoch": 0.206888143496051, + "grad_norm": 1.8651448488235474, + "learning_rate": 4.4903008943756766e-05, + "loss": 4.7752, + "step": 34787 + }, + { + "epoch": 0.206894090779332, + "grad_norm": 1.579422950744629, + "learning_rate": 4.490272628106309e-05, + "loss": 4.899, + "step": 34788 + }, + { + "epoch": 0.206900038062613, + "grad_norm": 1.7237675189971924, + "learning_rate": 4.490244361142159e-05, + "loss": 4.9186, + "step": 34789 + }, + { + "epoch": 0.206905985345894, + "grad_norm": 2.424854040145874, + "learning_rate": 4.490216093483234e-05, + "loss": 3.9027, + "step": 34790 + }, + { + "epoch": 0.20691193262917498, + "grad_norm": 1.656636357307434, + "learning_rate": 4.490187825129546e-05, + "loss": 4.4577, + "step": 34791 + }, + { + "epoch": 0.206917879912456, + "grad_norm": 2.7975332736968994, + "learning_rate": 4.490159556081103e-05, + "loss": 4.2677, + "step": 34792 + }, + { + "epoch": 0.206923827195737, + "grad_norm": 2.6634609699249268, + "learning_rate": 4.490131286337916e-05, + "loss": 3.5967, + "step": 34793 + }, + { + "epoch": 0.20692977447901797, + "grad_norm": 2.820051431655884, + "learning_rate": 4.4901030158999954e-05, + "loss": 3.791, + "step": 34794 + }, + { + "epoch": 0.206935721762299, + "grad_norm": 1.9154092073440552, + "learning_rate": 4.4900747447673505e-05, + "loss": 4.2114, + "step": 34795 + }, + { + "epoch": 0.20694166904557998, + "grad_norm": 1.6924352645874023, + "learning_rate": 4.490046472939991e-05, + "loss": 5.4559, + "step": 34796 + }, + { + "epoch": 0.20694761632886097, + "grad_norm": 2.0808238983154297, + "learning_rate": 4.490018200417926e-05, + "loss": 4.0044, + "step": 34797 + }, + { + "epoch": 0.20695356361214198, + "grad_norm": 3.8569533824920654, + "learning_rate": 4.489989927201167e-05, + "loss": 3.6529, + "step": 34798 + }, + { + "epoch": 0.20695951089542297, + "grad_norm": 2.5783863067626953, + "learning_rate": 4.489961653289723e-05, + "loss": 3.3469, + "step": 34799 + }, + { + "epoch": 0.20696545817870396, + "grad_norm": 2.322880268096924, + "learning_rate": 4.4899333786836026e-05, + "loss": 3.5252, + "step": 34800 + }, + { + "epoch": 0.20697140546198498, + "grad_norm": 1.4952900409698486, + "learning_rate": 4.489905103382819e-05, + "loss": 4.9481, + "step": 34801 + }, + { + "epoch": 0.20697735274526596, + "grad_norm": 1.5042228698730469, + "learning_rate": 4.4898768273873796e-05, + "loss": 5.0064, + "step": 34802 + }, + { + "epoch": 0.20698330002854695, + "grad_norm": 2.2395477294921875, + "learning_rate": 4.4898485506972945e-05, + "loss": 4.0019, + "step": 34803 + }, + { + "epoch": 0.20698924731182797, + "grad_norm": 2.6849710941314697, + "learning_rate": 4.489820273312573e-05, + "loss": 3.6374, + "step": 34804 + }, + { + "epoch": 0.20699519459510896, + "grad_norm": 2.534201145172119, + "learning_rate": 4.489791995233227e-05, + "loss": 3.7995, + "step": 34805 + }, + { + "epoch": 0.20700114187838994, + "grad_norm": 2.291923761367798, + "learning_rate": 4.489763716459264e-05, + "loss": 3.876, + "step": 34806 + }, + { + "epoch": 0.20700708916167096, + "grad_norm": 2.2157461643218994, + "learning_rate": 4.489735436990696e-05, + "loss": 4.0497, + "step": 34807 + }, + { + "epoch": 0.20701303644495195, + "grad_norm": 2.394935369491577, + "learning_rate": 4.489707156827532e-05, + "loss": 3.4041, + "step": 34808 + }, + { + "epoch": 0.20701898372823294, + "grad_norm": 2.634643077850342, + "learning_rate": 4.4896788759697813e-05, + "loss": 3.4985, + "step": 34809 + }, + { + "epoch": 0.20702493101151395, + "grad_norm": 2.609468460083008, + "learning_rate": 4.489650594417454e-05, + "loss": 3.2843, + "step": 34810 + }, + { + "epoch": 0.20703087829479494, + "grad_norm": 2.5767226219177246, + "learning_rate": 4.4896223121705606e-05, + "loss": 3.3413, + "step": 34811 + }, + { + "epoch": 0.20703682557807593, + "grad_norm": 2.39313006401062, + "learning_rate": 4.489594029229111e-05, + "loss": 3.358, + "step": 34812 + }, + { + "epoch": 0.20704277286135694, + "grad_norm": 2.763227701187134, + "learning_rate": 4.489565745593114e-05, + "loss": 3.6319, + "step": 34813 + }, + { + "epoch": 0.20704872014463793, + "grad_norm": 1.9068472385406494, + "learning_rate": 4.489537461262581e-05, + "loss": 4.35, + "step": 34814 + }, + { + "epoch": 0.20705466742791892, + "grad_norm": 2.774386405944824, + "learning_rate": 4.48950917623752e-05, + "loss": 4.5764, + "step": 34815 + }, + { + "epoch": 0.20706061471119994, + "grad_norm": 2.7725729942321777, + "learning_rate": 4.4894808905179426e-05, + "loss": 4.8665, + "step": 34816 + }, + { + "epoch": 0.20706656199448092, + "grad_norm": 1.7243051528930664, + "learning_rate": 4.4894526041038577e-05, + "loss": 4.5846, + "step": 34817 + }, + { + "epoch": 0.2070725092777619, + "grad_norm": 2.355294704437256, + "learning_rate": 4.4894243169952755e-05, + "loss": 3.4419, + "step": 34818 + }, + { + "epoch": 0.20707845656104293, + "grad_norm": 2.7653069496154785, + "learning_rate": 4.489396029192206e-05, + "loss": 3.8239, + "step": 34819 + }, + { + "epoch": 0.20708440384432392, + "grad_norm": 2.699720621109009, + "learning_rate": 4.489367740694659e-05, + "loss": 4.3421, + "step": 34820 + }, + { + "epoch": 0.2070903511276049, + "grad_norm": 2.5409398078918457, + "learning_rate": 4.489339451502644e-05, + "loss": 4.4411, + "step": 34821 + }, + { + "epoch": 0.20709629841088592, + "grad_norm": 2.486370801925659, + "learning_rate": 4.489311161616171e-05, + "loss": 4.1267, + "step": 34822 + }, + { + "epoch": 0.2071022456941669, + "grad_norm": 1.9662883281707764, + "learning_rate": 4.48928287103525e-05, + "loss": 4.0986, + "step": 34823 + }, + { + "epoch": 0.2071081929774479, + "grad_norm": 1.8960779905319214, + "learning_rate": 4.489254579759891e-05, + "loss": 4.8072, + "step": 34824 + }, + { + "epoch": 0.2071141402607289, + "grad_norm": 1.8817890882492065, + "learning_rate": 4.4892262877901044e-05, + "loss": 4.5285, + "step": 34825 + }, + { + "epoch": 0.2071200875440099, + "grad_norm": 2.148820400238037, + "learning_rate": 4.489197995125899e-05, + "loss": 4.5258, + "step": 34826 + }, + { + "epoch": 0.2071260348272909, + "grad_norm": 2.0745046138763428, + "learning_rate": 4.489169701767285e-05, + "loss": 4.6216, + "step": 34827 + }, + { + "epoch": 0.20713198211057188, + "grad_norm": 1.9720550775527954, + "learning_rate": 4.4891414077142726e-05, + "loss": 4.3668, + "step": 34828 + }, + { + "epoch": 0.2071379293938529, + "grad_norm": 2.2304906845092773, + "learning_rate": 4.489113112966871e-05, + "loss": 4.5013, + "step": 34829 + }, + { + "epoch": 0.20714387667713388, + "grad_norm": 2.174670934677124, + "learning_rate": 4.489084817525091e-05, + "loss": 4.5277, + "step": 34830 + }, + { + "epoch": 0.20714982396041487, + "grad_norm": 2.458003044128418, + "learning_rate": 4.489056521388942e-05, + "loss": 4.4647, + "step": 34831 + }, + { + "epoch": 0.20715577124369589, + "grad_norm": 2.281400203704834, + "learning_rate": 4.489028224558434e-05, + "loss": 4.5083, + "step": 34832 + }, + { + "epoch": 0.20716171852697687, + "grad_norm": 2.4862747192382812, + "learning_rate": 4.4889999270335765e-05, + "loss": 4.7163, + "step": 34833 + }, + { + "epoch": 0.20716766581025786, + "grad_norm": 2.276209592819214, + "learning_rate": 4.48897162881438e-05, + "loss": 4.8147, + "step": 34834 + }, + { + "epoch": 0.20717361309353888, + "grad_norm": 2.0201053619384766, + "learning_rate": 4.488943329900854e-05, + "loss": 4.5599, + "step": 34835 + }, + { + "epoch": 0.20717956037681987, + "grad_norm": 2.284170389175415, + "learning_rate": 4.4889150302930085e-05, + "loss": 4.5729, + "step": 34836 + }, + { + "epoch": 0.20718550766010085, + "grad_norm": 1.691230297088623, + "learning_rate": 4.488886729990853e-05, + "loss": 4.3631, + "step": 34837 + }, + { + "epoch": 0.20719145494338187, + "grad_norm": 2.024777412414551, + "learning_rate": 4.488858428994398e-05, + "loss": 4.3278, + "step": 34838 + }, + { + "epoch": 0.20719740222666286, + "grad_norm": 2.0853986740112305, + "learning_rate": 4.488830127303653e-05, + "loss": 4.533, + "step": 34839 + }, + { + "epoch": 0.20720334950994385, + "grad_norm": 2.2168142795562744, + "learning_rate": 4.488801824918627e-05, + "loss": 4.5766, + "step": 34840 + }, + { + "epoch": 0.20720929679322486, + "grad_norm": 2.369561195373535, + "learning_rate": 4.488773521839332e-05, + "loss": 4.4252, + "step": 34841 + }, + { + "epoch": 0.20721524407650585, + "grad_norm": 1.9899331331253052, + "learning_rate": 4.4887452180657764e-05, + "loss": 4.5878, + "step": 34842 + }, + { + "epoch": 0.20722119135978684, + "grad_norm": 1.9128245115280151, + "learning_rate": 4.48871691359797e-05, + "loss": 4.5696, + "step": 34843 + }, + { + "epoch": 0.20722713864306785, + "grad_norm": 1.8677480220794678, + "learning_rate": 4.488688608435924e-05, + "loss": 4.4652, + "step": 34844 + }, + { + "epoch": 0.20723308592634884, + "grad_norm": 2.1576309204101562, + "learning_rate": 4.488660302579647e-05, + "loss": 4.2685, + "step": 34845 + }, + { + "epoch": 0.20723903320962983, + "grad_norm": 1.9897032976150513, + "learning_rate": 4.488631996029149e-05, + "loss": 4.3295, + "step": 34846 + }, + { + "epoch": 0.20724498049291085, + "grad_norm": 1.9403741359710693, + "learning_rate": 4.488603688784439e-05, + "loss": 4.4789, + "step": 34847 + }, + { + "epoch": 0.20725092777619183, + "grad_norm": 2.024747610092163, + "learning_rate": 4.48857538084553e-05, + "loss": 4.4787, + "step": 34848 + }, + { + "epoch": 0.20725687505947282, + "grad_norm": 1.6366159915924072, + "learning_rate": 4.488547072212429e-05, + "loss": 4.6898, + "step": 34849 + }, + { + "epoch": 0.20726282234275384, + "grad_norm": 2.3541810512542725, + "learning_rate": 4.488518762885147e-05, + "loss": 3.8412, + "step": 34850 + }, + { + "epoch": 0.20726876962603483, + "grad_norm": 2.3278443813323975, + "learning_rate": 4.4884904528636934e-05, + "loss": 4.4837, + "step": 34851 + }, + { + "epoch": 0.20727471690931581, + "grad_norm": 2.1795244216918945, + "learning_rate": 4.488462142148078e-05, + "loss": 4.4418, + "step": 34852 + }, + { + "epoch": 0.20728066419259683, + "grad_norm": 2.204218626022339, + "learning_rate": 4.488433830738312e-05, + "loss": 4.4315, + "step": 34853 + }, + { + "epoch": 0.20728661147587782, + "grad_norm": 2.254626750946045, + "learning_rate": 4.488405518634403e-05, + "loss": 4.6286, + "step": 34854 + }, + { + "epoch": 0.2072925587591588, + "grad_norm": 2.035433530807495, + "learning_rate": 4.4883772058363635e-05, + "loss": 4.6472, + "step": 34855 + }, + { + "epoch": 0.20729850604243982, + "grad_norm": 1.5613362789154053, + "learning_rate": 4.4883488923442006e-05, + "loss": 5.4295, + "step": 34856 + }, + { + "epoch": 0.2073044533257208, + "grad_norm": 2.2521488666534424, + "learning_rate": 4.488320578157927e-05, + "loss": 4.5495, + "step": 34857 + }, + { + "epoch": 0.2073104006090018, + "grad_norm": 2.360024929046631, + "learning_rate": 4.4882922632775506e-05, + "loss": 4.0133, + "step": 34858 + }, + { + "epoch": 0.20731634789228282, + "grad_norm": 2.220082998275757, + "learning_rate": 4.488263947703082e-05, + "loss": 4.4248, + "step": 34859 + }, + { + "epoch": 0.2073222951755638, + "grad_norm": 2.175050735473633, + "learning_rate": 4.4882356314345306e-05, + "loss": 4.6056, + "step": 34860 + }, + { + "epoch": 0.2073282424588448, + "grad_norm": 2.203740358352661, + "learning_rate": 4.488207314471907e-05, + "loss": 4.8203, + "step": 34861 + }, + { + "epoch": 0.2073341897421258, + "grad_norm": 1.7499996423721313, + "learning_rate": 4.488178996815221e-05, + "loss": 4.5105, + "step": 34862 + }, + { + "epoch": 0.2073401370254068, + "grad_norm": 1.7292070388793945, + "learning_rate": 4.488150678464482e-05, + "loss": 4.2345, + "step": 34863 + }, + { + "epoch": 0.20734608430868778, + "grad_norm": 1.886146903038025, + "learning_rate": 4.488122359419701e-05, + "loss": 4.2784, + "step": 34864 + }, + { + "epoch": 0.2073520315919688, + "grad_norm": 1.5068321228027344, + "learning_rate": 4.4880940396808856e-05, + "loss": 5.0306, + "step": 34865 + }, + { + "epoch": 0.2073579788752498, + "grad_norm": 1.4977796077728271, + "learning_rate": 4.488065719248048e-05, + "loss": 4.4446, + "step": 34866 + }, + { + "epoch": 0.20736392615853078, + "grad_norm": 1.4082682132720947, + "learning_rate": 4.488037398121197e-05, + "loss": 4.9294, + "step": 34867 + }, + { + "epoch": 0.2073698734418118, + "grad_norm": 1.662846565246582, + "learning_rate": 4.488009076300343e-05, + "loss": 5.0529, + "step": 34868 + }, + { + "epoch": 0.20737582072509278, + "grad_norm": 1.5319976806640625, + "learning_rate": 4.487980753785495e-05, + "loss": 5.0529, + "step": 34869 + }, + { + "epoch": 0.20738176800837377, + "grad_norm": 1.417098879814148, + "learning_rate": 4.487952430576664e-05, + "loss": 4.9239, + "step": 34870 + }, + { + "epoch": 0.20738771529165478, + "grad_norm": 1.549066424369812, + "learning_rate": 4.487924106673859e-05, + "loss": 5.0236, + "step": 34871 + }, + { + "epoch": 0.20739366257493577, + "grad_norm": 1.6567977666854858, + "learning_rate": 4.48789578207709e-05, + "loss": 4.5281, + "step": 34872 + }, + { + "epoch": 0.20739960985821676, + "grad_norm": 1.597029209136963, + "learning_rate": 4.487867456786367e-05, + "loss": 4.7681, + "step": 34873 + }, + { + "epoch": 0.20740555714149778, + "grad_norm": 1.635974645614624, + "learning_rate": 4.487839130801701e-05, + "loss": 4.8318, + "step": 34874 + }, + { + "epoch": 0.20741150442477876, + "grad_norm": 1.685660481452942, + "learning_rate": 4.4878108041231e-05, + "loss": 4.9574, + "step": 34875 + }, + { + "epoch": 0.20741745170805975, + "grad_norm": 1.482374668121338, + "learning_rate": 4.487782476750575e-05, + "loss": 4.8813, + "step": 34876 + }, + { + "epoch": 0.20742339899134077, + "grad_norm": 1.5559237003326416, + "learning_rate": 4.487754148684136e-05, + "loss": 4.9915, + "step": 34877 + }, + { + "epoch": 0.20742934627462176, + "grad_norm": 1.431868314743042, + "learning_rate": 4.487725819923792e-05, + "loss": 4.4923, + "step": 34878 + }, + { + "epoch": 0.20743529355790274, + "grad_norm": 1.550361156463623, + "learning_rate": 4.4876974904695535e-05, + "loss": 4.9491, + "step": 34879 + }, + { + "epoch": 0.20744124084118376, + "grad_norm": 1.580848217010498, + "learning_rate": 4.487669160321431e-05, + "loss": 4.7541, + "step": 34880 + }, + { + "epoch": 0.20744718812446475, + "grad_norm": 1.7145969867706299, + "learning_rate": 4.487640829479433e-05, + "loss": 5.0173, + "step": 34881 + }, + { + "epoch": 0.20745313540774574, + "grad_norm": 1.5619465112686157, + "learning_rate": 4.4876124979435704e-05, + "loss": 4.7407, + "step": 34882 + }, + { + "epoch": 0.20745908269102675, + "grad_norm": 1.6751627922058105, + "learning_rate": 4.487584165713853e-05, + "loss": 4.4732, + "step": 34883 + }, + { + "epoch": 0.20746502997430774, + "grad_norm": 1.599061131477356, + "learning_rate": 4.4875558327902906e-05, + "loss": 4.6993, + "step": 34884 + }, + { + "epoch": 0.20747097725758873, + "grad_norm": 1.4041860103607178, + "learning_rate": 4.4875274991728925e-05, + "loss": 4.6427, + "step": 34885 + }, + { + "epoch": 0.20747692454086972, + "grad_norm": 1.539746642112732, + "learning_rate": 4.4874991648616694e-05, + "loss": 4.781, + "step": 34886 + }, + { + "epoch": 0.20748287182415073, + "grad_norm": 1.5050103664398193, + "learning_rate": 4.487470829856631e-05, + "loss": 4.7492, + "step": 34887 + }, + { + "epoch": 0.20748881910743172, + "grad_norm": 1.6688284873962402, + "learning_rate": 4.487442494157786e-05, + "loss": 4.3758, + "step": 34888 + }, + { + "epoch": 0.2074947663907127, + "grad_norm": 1.5119291543960571, + "learning_rate": 4.4874141577651476e-05, + "loss": 4.9482, + "step": 34889 + }, + { + "epoch": 0.20750071367399373, + "grad_norm": 1.5905176401138306, + "learning_rate": 4.487385820678722e-05, + "loss": 5.2721, + "step": 34890 + }, + { + "epoch": 0.2075066609572747, + "grad_norm": 1.6275291442871094, + "learning_rate": 4.48735748289852e-05, + "loss": 4.5295, + "step": 34891 + }, + { + "epoch": 0.2075126082405557, + "grad_norm": 1.5032380819320679, + "learning_rate": 4.487329144424552e-05, + "loss": 4.8084, + "step": 34892 + }, + { + "epoch": 0.20751855552383672, + "grad_norm": 1.4824553728103638, + "learning_rate": 4.48730080525683e-05, + "loss": 4.772, + "step": 34893 + }, + { + "epoch": 0.2075245028071177, + "grad_norm": 1.5292681455612183, + "learning_rate": 4.48727246539536e-05, + "loss": 4.6488, + "step": 34894 + }, + { + "epoch": 0.2075304500903987, + "grad_norm": 1.4371155500411987, + "learning_rate": 4.487244124840154e-05, + "loss": 4.9293, + "step": 34895 + }, + { + "epoch": 0.2075363973736797, + "grad_norm": 1.7150744199752808, + "learning_rate": 4.487215783591222e-05, + "loss": 4.8491, + "step": 34896 + }, + { + "epoch": 0.2075423446569607, + "grad_norm": 1.7894489765167236, + "learning_rate": 4.487187441648573e-05, + "loss": 4.2539, + "step": 34897 + }, + { + "epoch": 0.20754829194024169, + "grad_norm": 2.7374889850616455, + "learning_rate": 4.4871590990122174e-05, + "loss": 3.205, + "step": 34898 + }, + { + "epoch": 0.2075542392235227, + "grad_norm": 3.2529096603393555, + "learning_rate": 4.487130755682165e-05, + "loss": 2.6124, + "step": 34899 + }, + { + "epoch": 0.2075601865068037, + "grad_norm": 1.6190886497497559, + "learning_rate": 4.487102411658426e-05, + "loss": 4.8742, + "step": 34900 + }, + { + "epoch": 0.20756613379008468, + "grad_norm": 1.7997056245803833, + "learning_rate": 4.4870740669410104e-05, + "loss": 4.9073, + "step": 34901 + }, + { + "epoch": 0.2075720810733657, + "grad_norm": 1.6690300703048706, + "learning_rate": 4.4870457215299274e-05, + "loss": 4.5691, + "step": 34902 + }, + { + "epoch": 0.20757802835664668, + "grad_norm": 1.5387898683547974, + "learning_rate": 4.4870173754251874e-05, + "loss": 4.5575, + "step": 34903 + }, + { + "epoch": 0.20758397563992767, + "grad_norm": 1.6400445699691772, + "learning_rate": 4.4869890286268006e-05, + "loss": 4.7597, + "step": 34904 + }, + { + "epoch": 0.2075899229232087, + "grad_norm": 1.5093486309051514, + "learning_rate": 4.4869606811347766e-05, + "loss": 5.3104, + "step": 34905 + }, + { + "epoch": 0.20759587020648967, + "grad_norm": 1.6980483531951904, + "learning_rate": 4.486932332949124e-05, + "loss": 4.5805, + "step": 34906 + }, + { + "epoch": 0.20760181748977066, + "grad_norm": 1.4920854568481445, + "learning_rate": 4.4869039840698544e-05, + "loss": 5.2039, + "step": 34907 + }, + { + "epoch": 0.20760776477305168, + "grad_norm": 1.7514317035675049, + "learning_rate": 4.486875634496977e-05, + "loss": 4.5796, + "step": 34908 + }, + { + "epoch": 0.20761371205633267, + "grad_norm": 1.739810585975647, + "learning_rate": 4.486847284230502e-05, + "loss": 4.6184, + "step": 34909 + }, + { + "epoch": 0.20761965933961365, + "grad_norm": 1.4580451250076294, + "learning_rate": 4.486818933270439e-05, + "loss": 4.8263, + "step": 34910 + }, + { + "epoch": 0.20762560662289467, + "grad_norm": 1.5463128089904785, + "learning_rate": 4.4867905816167986e-05, + "loss": 4.5738, + "step": 34911 + }, + { + "epoch": 0.20763155390617566, + "grad_norm": 1.5785971879959106, + "learning_rate": 4.48676222926959e-05, + "loss": 4.7776, + "step": 34912 + }, + { + "epoch": 0.20763750118945665, + "grad_norm": 2.756115674972534, + "learning_rate": 4.486733876228823e-05, + "loss": 4.1323, + "step": 34913 + }, + { + "epoch": 0.20764344847273766, + "grad_norm": 2.10459041595459, + "learning_rate": 4.4867055224945076e-05, + "loss": 4.9213, + "step": 34914 + }, + { + "epoch": 0.20764939575601865, + "grad_norm": 2.2832987308502197, + "learning_rate": 4.4866771680666545e-05, + "loss": 4.8089, + "step": 34915 + }, + { + "epoch": 0.20765534303929964, + "grad_norm": 1.6514110565185547, + "learning_rate": 4.4866488129452725e-05, + "loss": 5.107, + "step": 34916 + }, + { + "epoch": 0.20766129032258066, + "grad_norm": 1.623475193977356, + "learning_rate": 4.486620457130371e-05, + "loss": 4.46, + "step": 34917 + }, + { + "epoch": 0.20766723760586164, + "grad_norm": 1.4691333770751953, + "learning_rate": 4.486592100621961e-05, + "loss": 4.7703, + "step": 34918 + }, + { + "epoch": 0.20767318488914263, + "grad_norm": 1.6401385068893433, + "learning_rate": 4.4865637434200535e-05, + "loss": 4.7759, + "step": 34919 + }, + { + "epoch": 0.20767913217242365, + "grad_norm": 1.4848181009292603, + "learning_rate": 4.486535385524656e-05, + "loss": 5.0626, + "step": 34920 + }, + { + "epoch": 0.20768507945570464, + "grad_norm": 1.5136383771896362, + "learning_rate": 4.48650702693578e-05, + "loss": 5.0347, + "step": 34921 + }, + { + "epoch": 0.20769102673898562, + "grad_norm": 1.4435847997665405, + "learning_rate": 4.4864786676534356e-05, + "loss": 5.1182, + "step": 34922 + }, + { + "epoch": 0.20769697402226664, + "grad_norm": 1.5047261714935303, + "learning_rate": 4.486450307677631e-05, + "loss": 5.6022, + "step": 34923 + }, + { + "epoch": 0.20770292130554763, + "grad_norm": 1.491112232208252, + "learning_rate": 4.486421947008378e-05, + "loss": 5.312, + "step": 34924 + }, + { + "epoch": 0.20770886858882862, + "grad_norm": 1.816465139389038, + "learning_rate": 4.4863935856456844e-05, + "loss": 5.0285, + "step": 34925 + }, + { + "epoch": 0.20771481587210963, + "grad_norm": 1.6388663053512573, + "learning_rate": 4.4863652235895624e-05, + "loss": 4.9933, + "step": 34926 + }, + { + "epoch": 0.20772076315539062, + "grad_norm": 1.7427036762237549, + "learning_rate": 4.4863368608400206e-05, + "loss": 4.7291, + "step": 34927 + }, + { + "epoch": 0.2077267104386716, + "grad_norm": 1.8048992156982422, + "learning_rate": 4.4863084973970684e-05, + "loss": 4.4709, + "step": 34928 + }, + { + "epoch": 0.20773265772195262, + "grad_norm": 1.848111629486084, + "learning_rate": 4.4862801332607175e-05, + "loss": 5.4746, + "step": 34929 + }, + { + "epoch": 0.2077386050052336, + "grad_norm": 1.9766002893447876, + "learning_rate": 4.486251768430976e-05, + "loss": 4.6229, + "step": 34930 + }, + { + "epoch": 0.2077445522885146, + "grad_norm": 2.060670852661133, + "learning_rate": 4.4862234029078545e-05, + "loss": 5.1253, + "step": 34931 + }, + { + "epoch": 0.20775049957179562, + "grad_norm": 1.8634072542190552, + "learning_rate": 4.486195036691363e-05, + "loss": 4.9958, + "step": 34932 + }, + { + "epoch": 0.2077564468550766, + "grad_norm": 1.8860241174697876, + "learning_rate": 4.4861666697815115e-05, + "loss": 5.2148, + "step": 34933 + }, + { + "epoch": 0.2077623941383576, + "grad_norm": 1.6811100244522095, + "learning_rate": 4.4861383021783096e-05, + "loss": 4.9268, + "step": 34934 + }, + { + "epoch": 0.2077683414216386, + "grad_norm": 2.6467933654785156, + "learning_rate": 4.486109933881767e-05, + "loss": 4.2397, + "step": 34935 + }, + { + "epoch": 0.2077742887049196, + "grad_norm": 2.065701484680176, + "learning_rate": 4.486081564891895e-05, + "loss": 4.3548, + "step": 34936 + }, + { + "epoch": 0.20778023598820058, + "grad_norm": 1.5673136711120605, + "learning_rate": 4.4860531952087015e-05, + "loss": 5.1111, + "step": 34937 + }, + { + "epoch": 0.2077861832714816, + "grad_norm": 1.7255089282989502, + "learning_rate": 4.486024824832198e-05, + "loss": 5.2419, + "step": 34938 + }, + { + "epoch": 0.2077921305547626, + "grad_norm": 1.5966598987579346, + "learning_rate": 4.485996453762393e-05, + "loss": 5.3367, + "step": 34939 + }, + { + "epoch": 0.20779807783804358, + "grad_norm": 1.687177062034607, + "learning_rate": 4.485968081999298e-05, + "loss": 4.8, + "step": 34940 + }, + { + "epoch": 0.2078040251213246, + "grad_norm": 1.4911394119262695, + "learning_rate": 4.485939709542921e-05, + "loss": 4.6558, + "step": 34941 + }, + { + "epoch": 0.20780997240460558, + "grad_norm": 1.705232858657837, + "learning_rate": 4.4859113363932735e-05, + "loss": 4.5196, + "step": 34942 + }, + { + "epoch": 0.20781591968788657, + "grad_norm": 2.290523052215576, + "learning_rate": 4.4858829625503654e-05, + "loss": 4.8916, + "step": 34943 + }, + { + "epoch": 0.20782186697116756, + "grad_norm": 1.3084735870361328, + "learning_rate": 4.4858545880142056e-05, + "loss": 4.8922, + "step": 34944 + }, + { + "epoch": 0.20782781425444857, + "grad_norm": 1.6424446105957031, + "learning_rate": 4.485826212784805e-05, + "loss": 4.4804, + "step": 34945 + }, + { + "epoch": 0.20783376153772956, + "grad_norm": 1.5474185943603516, + "learning_rate": 4.485797836862172e-05, + "loss": 4.7975, + "step": 34946 + }, + { + "epoch": 0.20783970882101055, + "grad_norm": 1.5839738845825195, + "learning_rate": 4.485769460246319e-05, + "loss": 4.6328, + "step": 34947 + }, + { + "epoch": 0.20784565610429157, + "grad_norm": 1.7342357635498047, + "learning_rate": 4.485741082937252e-05, + "loss": 4.2369, + "step": 34948 + }, + { + "epoch": 0.20785160338757255, + "grad_norm": 1.6092901229858398, + "learning_rate": 4.4857127049349856e-05, + "loss": 4.4599, + "step": 34949 + }, + { + "epoch": 0.20785755067085354, + "grad_norm": 1.6004117727279663, + "learning_rate": 4.485684326239527e-05, + "loss": 5.1366, + "step": 34950 + }, + { + "epoch": 0.20786349795413456, + "grad_norm": 1.75442373752594, + "learning_rate": 4.485655946850885e-05, + "loss": 4.1551, + "step": 34951 + }, + { + "epoch": 0.20786944523741555, + "grad_norm": 1.4843125343322754, + "learning_rate": 4.485627566769073e-05, + "loss": 5.0695, + "step": 34952 + }, + { + "epoch": 0.20787539252069653, + "grad_norm": 1.3438167572021484, + "learning_rate": 4.485599185994097e-05, + "loss": 4.9293, + "step": 34953 + }, + { + "epoch": 0.20788133980397755, + "grad_norm": 1.9467015266418457, + "learning_rate": 4.48557080452597e-05, + "loss": 4.5889, + "step": 34954 + }, + { + "epoch": 0.20788728708725854, + "grad_norm": 1.8029512166976929, + "learning_rate": 4.4855424223647005e-05, + "loss": 4.9904, + "step": 34955 + }, + { + "epoch": 0.20789323437053953, + "grad_norm": 1.4334784746170044, + "learning_rate": 4.485514039510299e-05, + "loss": 4.3459, + "step": 34956 + }, + { + "epoch": 0.20789918165382054, + "grad_norm": 1.6164671182632446, + "learning_rate": 4.4854856559627746e-05, + "loss": 5.3017, + "step": 34957 + }, + { + "epoch": 0.20790512893710153, + "grad_norm": 1.6465972661972046, + "learning_rate": 4.485457271722138e-05, + "loss": 4.353, + "step": 34958 + }, + { + "epoch": 0.20791107622038252, + "grad_norm": 1.5343241691589355, + "learning_rate": 4.485428886788399e-05, + "loss": 5.112, + "step": 34959 + }, + { + "epoch": 0.20791702350366353, + "grad_norm": 1.5733743906021118, + "learning_rate": 4.485400501161566e-05, + "loss": 5.0707, + "step": 34960 + }, + { + "epoch": 0.20792297078694452, + "grad_norm": 1.1540693044662476, + "learning_rate": 4.4853721148416515e-05, + "loss": 5.0026, + "step": 34961 + }, + { + "epoch": 0.2079289180702255, + "grad_norm": 1.4453891515731812, + "learning_rate": 4.485343727828664e-05, + "loss": 4.2512, + "step": 34962 + }, + { + "epoch": 0.20793486535350653, + "grad_norm": 1.5326753854751587, + "learning_rate": 4.4853153401226135e-05, + "loss": 4.804, + "step": 34963 + }, + { + "epoch": 0.20794081263678751, + "grad_norm": 1.5941990613937378, + "learning_rate": 4.48528695172351e-05, + "loss": 5.1824, + "step": 34964 + }, + { + "epoch": 0.2079467599200685, + "grad_norm": 1.634143590927124, + "learning_rate": 4.485258562631363e-05, + "loss": 4.6857, + "step": 34965 + }, + { + "epoch": 0.20795270720334952, + "grad_norm": 1.5137478113174438, + "learning_rate": 4.485230172846182e-05, + "loss": 5.3183, + "step": 34966 + }, + { + "epoch": 0.2079586544866305, + "grad_norm": 1.673877477645874, + "learning_rate": 4.4852017823679785e-05, + "loss": 4.7819, + "step": 34967 + }, + { + "epoch": 0.2079646017699115, + "grad_norm": 1.770723581314087, + "learning_rate": 4.485173391196761e-05, + "loss": 4.0521, + "step": 34968 + }, + { + "epoch": 0.2079705490531925, + "grad_norm": 1.598290205001831, + "learning_rate": 4.485144999332541e-05, + "loss": 5.344, + "step": 34969 + }, + { + "epoch": 0.2079764963364735, + "grad_norm": 1.3610836267471313, + "learning_rate": 4.4851166067753266e-05, + "loss": 5.0096, + "step": 34970 + }, + { + "epoch": 0.2079824436197545, + "grad_norm": 1.4452751874923706, + "learning_rate": 4.485088213525129e-05, + "loss": 4.2761, + "step": 34971 + }, + { + "epoch": 0.2079883909030355, + "grad_norm": 1.6192525625228882, + "learning_rate": 4.485059819581957e-05, + "loss": 5.2101, + "step": 34972 + }, + { + "epoch": 0.2079943381863165, + "grad_norm": 1.6380634307861328, + "learning_rate": 4.4850314249458215e-05, + "loss": 5.1043, + "step": 34973 + }, + { + "epoch": 0.20800028546959748, + "grad_norm": 1.61093008518219, + "learning_rate": 4.485003029616732e-05, + "loss": 5.0412, + "step": 34974 + }, + { + "epoch": 0.2080062327528785, + "grad_norm": 2.0800046920776367, + "learning_rate": 4.4849746335946986e-05, + "loss": 4.2104, + "step": 34975 + }, + { + "epoch": 0.20801218003615948, + "grad_norm": 1.797250747680664, + "learning_rate": 4.484946236879731e-05, + "loss": 4.6008, + "step": 34976 + }, + { + "epoch": 0.20801812731944047, + "grad_norm": 1.5170632600784302, + "learning_rate": 4.4849178394718394e-05, + "loss": 5.3258, + "step": 34977 + }, + { + "epoch": 0.2080240746027215, + "grad_norm": 1.777762532234192, + "learning_rate": 4.4848894413710326e-05, + "loss": 4.4505, + "step": 34978 + }, + { + "epoch": 0.20803002188600248, + "grad_norm": 1.9124006032943726, + "learning_rate": 4.4848610425773224e-05, + "loss": 4.5747, + "step": 34979 + }, + { + "epoch": 0.20803596916928346, + "grad_norm": 1.5491348505020142, + "learning_rate": 4.4848326430907175e-05, + "loss": 5.6545, + "step": 34980 + }, + { + "epoch": 0.20804191645256448, + "grad_norm": 1.9779603481292725, + "learning_rate": 4.484804242911228e-05, + "loss": 4.5576, + "step": 34981 + }, + { + "epoch": 0.20804786373584547, + "grad_norm": 1.4585378170013428, + "learning_rate": 4.484775842038863e-05, + "loss": 4.989, + "step": 34982 + }, + { + "epoch": 0.20805381101912646, + "grad_norm": 1.6832143068313599, + "learning_rate": 4.4847474404736346e-05, + "loss": 4.9199, + "step": 34983 + }, + { + "epoch": 0.20805975830240747, + "grad_norm": 1.6539632081985474, + "learning_rate": 4.48471903821555e-05, + "loss": 4.3962, + "step": 34984 + }, + { + "epoch": 0.20806570558568846, + "grad_norm": 1.4862840175628662, + "learning_rate": 4.484690635264622e-05, + "loss": 4.9105, + "step": 34985 + }, + { + "epoch": 0.20807165286896945, + "grad_norm": 1.5097556114196777, + "learning_rate": 4.484662231620857e-05, + "loss": 4.8813, + "step": 34986 + }, + { + "epoch": 0.20807760015225046, + "grad_norm": 1.749756932258606, + "learning_rate": 4.484633827284269e-05, + "loss": 4.871, + "step": 34987 + }, + { + "epoch": 0.20808354743553145, + "grad_norm": 1.7925949096679688, + "learning_rate": 4.484605422254865e-05, + "loss": 4.4457, + "step": 34988 + }, + { + "epoch": 0.20808949471881244, + "grad_norm": 2.0869626998901367, + "learning_rate": 4.4845770165326555e-05, + "loss": 4.1676, + "step": 34989 + }, + { + "epoch": 0.20809544200209346, + "grad_norm": 1.7017414569854736, + "learning_rate": 4.484548610117651e-05, + "loss": 4.6058, + "step": 34990 + }, + { + "epoch": 0.20810138928537444, + "grad_norm": 1.696089506149292, + "learning_rate": 4.484520203009861e-05, + "loss": 4.4662, + "step": 34991 + }, + { + "epoch": 0.20810733656865543, + "grad_norm": 1.7537122964859009, + "learning_rate": 4.484491795209296e-05, + "loss": 4.4703, + "step": 34992 + }, + { + "epoch": 0.20811328385193645, + "grad_norm": 1.5926291942596436, + "learning_rate": 4.484463386715965e-05, + "loss": 4.7058, + "step": 34993 + }, + { + "epoch": 0.20811923113521744, + "grad_norm": 1.554070234298706, + "learning_rate": 4.484434977529878e-05, + "loss": 4.4741, + "step": 34994 + }, + { + "epoch": 0.20812517841849842, + "grad_norm": 1.7016302347183228, + "learning_rate": 4.484406567651045e-05, + "loss": 4.3942, + "step": 34995 + }, + { + "epoch": 0.20813112570177944, + "grad_norm": 2.093773603439331, + "learning_rate": 4.484378157079477e-05, + "loss": 3.8448, + "step": 34996 + }, + { + "epoch": 0.20813707298506043, + "grad_norm": 1.7667289972305298, + "learning_rate": 4.484349745815183e-05, + "loss": 4.0643, + "step": 34997 + }, + { + "epoch": 0.20814302026834142, + "grad_norm": 1.6492136716842651, + "learning_rate": 4.484321333858172e-05, + "loss": 4.2844, + "step": 34998 + }, + { + "epoch": 0.20814896755162243, + "grad_norm": 1.8489280939102173, + "learning_rate": 4.484292921208456e-05, + "loss": 4.2982, + "step": 34999 + }, + { + "epoch": 0.20815491483490342, + "grad_norm": 1.7901874780654907, + "learning_rate": 4.484264507866043e-05, + "loss": 4.3482, + "step": 35000 + }, + { + "epoch": 0.2081608621181844, + "grad_norm": 3.0331995487213135, + "learning_rate": 4.484236093830945e-05, + "loss": 4.3448, + "step": 35001 + }, + { + "epoch": 0.2081668094014654, + "grad_norm": 3.182864189147949, + "learning_rate": 4.484207679103169e-05, + "loss": 4.604, + "step": 35002 + }, + { + "epoch": 0.2081727566847464, + "grad_norm": 2.4753639698028564, + "learning_rate": 4.484179263682729e-05, + "loss": 4.4595, + "step": 35003 + }, + { + "epoch": 0.2081787039680274, + "grad_norm": 1.9478391408920288, + "learning_rate": 4.48415084756963e-05, + "loss": 4.1304, + "step": 35004 + }, + { + "epoch": 0.2081846512513084, + "grad_norm": 1.8722947835922241, + "learning_rate": 4.4841224307638856e-05, + "loss": 4.4465, + "step": 35005 + }, + { + "epoch": 0.2081905985345894, + "grad_norm": 1.7963893413543701, + "learning_rate": 4.4840940132655045e-05, + "loss": 4.4874, + "step": 35006 + }, + { + "epoch": 0.2081965458178704, + "grad_norm": 2.4044625759124756, + "learning_rate": 4.4840655950744965e-05, + "loss": 4.3432, + "step": 35007 + }, + { + "epoch": 0.20820249310115138, + "grad_norm": 2.240295648574829, + "learning_rate": 4.484037176190872e-05, + "loss": 4.4625, + "step": 35008 + }, + { + "epoch": 0.2082084403844324, + "grad_norm": 2.3064870834350586, + "learning_rate": 4.48400875661464e-05, + "loss": 4.3748, + "step": 35009 + }, + { + "epoch": 0.20821438766771339, + "grad_norm": 2.2277655601501465, + "learning_rate": 4.483980336345812e-05, + "loss": 4.2621, + "step": 35010 + }, + { + "epoch": 0.20822033495099437, + "grad_norm": 2.3768885135650635, + "learning_rate": 4.483951915384396e-05, + "loss": 4.3309, + "step": 35011 + }, + { + "epoch": 0.2082262822342754, + "grad_norm": 2.423457384109497, + "learning_rate": 4.4839234937304036e-05, + "loss": 4.4512, + "step": 35012 + }, + { + "epoch": 0.20823222951755638, + "grad_norm": 2.166076421737671, + "learning_rate": 4.483895071383843e-05, + "loss": 4.3357, + "step": 35013 + }, + { + "epoch": 0.20823817680083737, + "grad_norm": 2.4752864837646484, + "learning_rate": 4.483866648344727e-05, + "loss": 4.3325, + "step": 35014 + }, + { + "epoch": 0.20824412408411838, + "grad_norm": 2.4272568225860596, + "learning_rate": 4.483838224613062e-05, + "loss": 4.3429, + "step": 35015 + }, + { + "epoch": 0.20825007136739937, + "grad_norm": 2.4228246212005615, + "learning_rate": 4.48380980018886e-05, + "loss": 4.2242, + "step": 35016 + }, + { + "epoch": 0.20825601865068036, + "grad_norm": 2.2205100059509277, + "learning_rate": 4.4837813750721305e-05, + "loss": 4.3772, + "step": 35017 + }, + { + "epoch": 0.20826196593396137, + "grad_norm": 2.3196351528167725, + "learning_rate": 4.483752949262884e-05, + "loss": 4.1687, + "step": 35018 + }, + { + "epoch": 0.20826791321724236, + "grad_norm": 2.2172744274139404, + "learning_rate": 4.483724522761129e-05, + "loss": 4.2437, + "step": 35019 + }, + { + "epoch": 0.20827386050052335, + "grad_norm": 1.8818265199661255, + "learning_rate": 4.4836960955668773e-05, + "loss": 4.1817, + "step": 35020 + }, + { + "epoch": 0.20827980778380437, + "grad_norm": 2.141326904296875, + "learning_rate": 4.483667667680137e-05, + "loss": 4.2353, + "step": 35021 + }, + { + "epoch": 0.20828575506708535, + "grad_norm": 2.064363956451416, + "learning_rate": 4.483639239100919e-05, + "loss": 4.0952, + "step": 35022 + }, + { + "epoch": 0.20829170235036634, + "grad_norm": 1.8391005992889404, + "learning_rate": 4.483610809829232e-05, + "loss": 4.3724, + "step": 35023 + }, + { + "epoch": 0.20829764963364736, + "grad_norm": 2.879714250564575, + "learning_rate": 4.4835823798650884e-05, + "loss": 3.7298, + "step": 35024 + }, + { + "epoch": 0.20830359691692835, + "grad_norm": 2.702657699584961, + "learning_rate": 4.483553949208496e-05, + "loss": 3.5413, + "step": 35025 + }, + { + "epoch": 0.20830954420020933, + "grad_norm": 2.232855796813965, + "learning_rate": 4.483525517859466e-05, + "loss": 4.5523, + "step": 35026 + }, + { + "epoch": 0.20831549148349035, + "grad_norm": 2.239912748336792, + "learning_rate": 4.483497085818007e-05, + "loss": 4.139, + "step": 35027 + }, + { + "epoch": 0.20832143876677134, + "grad_norm": 2.0883944034576416, + "learning_rate": 4.4834686530841296e-05, + "loss": 4.1629, + "step": 35028 + }, + { + "epoch": 0.20832738605005233, + "grad_norm": 1.9795372486114502, + "learning_rate": 4.483440219657845e-05, + "loss": 4.485, + "step": 35029 + }, + { + "epoch": 0.20833333333333334, + "grad_norm": 1.6449062824249268, + "learning_rate": 4.48341178553916e-05, + "loss": 4.457, + "step": 35030 + }, + { + "epoch": 0.20833928061661433, + "grad_norm": 2.1899871826171875, + "learning_rate": 4.4833833507280884e-05, + "loss": 4.7317, + "step": 35031 + }, + { + "epoch": 0.20834522789989532, + "grad_norm": 1.852655291557312, + "learning_rate": 4.483354915224637e-05, + "loss": 4.8662, + "step": 35032 + }, + { + "epoch": 0.20835117518317633, + "grad_norm": 1.612601399421692, + "learning_rate": 4.4833264790288175e-05, + "loss": 4.4567, + "step": 35033 + }, + { + "epoch": 0.20835712246645732, + "grad_norm": 1.7933584451675415, + "learning_rate": 4.483298042140639e-05, + "loss": 4.8182, + "step": 35034 + }, + { + "epoch": 0.2083630697497383, + "grad_norm": 1.7479445934295654, + "learning_rate": 4.483269604560111e-05, + "loss": 4.571, + "step": 35035 + }, + { + "epoch": 0.20836901703301933, + "grad_norm": 1.9602153301239014, + "learning_rate": 4.4832411662872445e-05, + "loss": 4.4672, + "step": 35036 + }, + { + "epoch": 0.20837496431630032, + "grad_norm": 2.1408305168151855, + "learning_rate": 4.4832127273220494e-05, + "loss": 4.8864, + "step": 35037 + }, + { + "epoch": 0.2083809115995813, + "grad_norm": 2.346951961517334, + "learning_rate": 4.483184287664535e-05, + "loss": 4.5457, + "step": 35038 + }, + { + "epoch": 0.20838685888286232, + "grad_norm": 2.623779535293579, + "learning_rate": 4.483155847314712e-05, + "loss": 4.4477, + "step": 35039 + }, + { + "epoch": 0.2083928061661433, + "grad_norm": 2.211894989013672, + "learning_rate": 4.483127406272588e-05, + "loss": 3.9478, + "step": 35040 + }, + { + "epoch": 0.2083987534494243, + "grad_norm": 2.5383923053741455, + "learning_rate": 4.483098964538176e-05, + "loss": 3.6751, + "step": 35041 + }, + { + "epoch": 0.2084047007327053, + "grad_norm": 1.7223814725875854, + "learning_rate": 4.4830705221114845e-05, + "loss": 4.9108, + "step": 35042 + }, + { + "epoch": 0.2084106480159863, + "grad_norm": 1.842361330986023, + "learning_rate": 4.483042078992524e-05, + "loss": 5.1871, + "step": 35043 + }, + { + "epoch": 0.2084165952992673, + "grad_norm": 1.7734646797180176, + "learning_rate": 4.4830136351813035e-05, + "loss": 4.9155, + "step": 35044 + }, + { + "epoch": 0.2084225425825483, + "grad_norm": 1.9333977699279785, + "learning_rate": 4.482985190677834e-05, + "loss": 4.6412, + "step": 35045 + }, + { + "epoch": 0.2084284898658293, + "grad_norm": 1.7834563255310059, + "learning_rate": 4.4829567454821244e-05, + "loss": 4.7411, + "step": 35046 + }, + { + "epoch": 0.20843443714911028, + "grad_norm": 1.4860880374908447, + "learning_rate": 4.4829282995941846e-05, + "loss": 4.899, + "step": 35047 + }, + { + "epoch": 0.2084403844323913, + "grad_norm": 3.1144561767578125, + "learning_rate": 4.482899853014025e-05, + "loss": 3.0548, + "step": 35048 + }, + { + "epoch": 0.20844633171567228, + "grad_norm": 2.9812357425689697, + "learning_rate": 4.482871405741657e-05, + "loss": 2.8871, + "step": 35049 + }, + { + "epoch": 0.20845227899895327, + "grad_norm": 2.895320177078247, + "learning_rate": 4.4828429577770876e-05, + "loss": 2.1233, + "step": 35050 + }, + { + "epoch": 0.2084582262822343, + "grad_norm": 2.7188355922698975, + "learning_rate": 4.482814509120329e-05, + "loss": 2.3401, + "step": 35051 + }, + { + "epoch": 0.20846417356551528, + "grad_norm": 2.81449818611145, + "learning_rate": 4.48278605977139e-05, + "loss": 2.8756, + "step": 35052 + }, + { + "epoch": 0.20847012084879626, + "grad_norm": 2.326000452041626, + "learning_rate": 4.482757609730281e-05, + "loss": 3.6878, + "step": 35053 + }, + { + "epoch": 0.20847606813207728, + "grad_norm": 2.1207025051116943, + "learning_rate": 4.482729158997011e-05, + "loss": 4.064, + "step": 35054 + }, + { + "epoch": 0.20848201541535827, + "grad_norm": 1.9843616485595703, + "learning_rate": 4.482700707571592e-05, + "loss": 4.6437, + "step": 35055 + }, + { + "epoch": 0.20848796269863926, + "grad_norm": 1.9951170682907104, + "learning_rate": 4.4826722554540316e-05, + "loss": 4.7161, + "step": 35056 + }, + { + "epoch": 0.20849390998192027, + "grad_norm": 1.8895988464355469, + "learning_rate": 4.482643802644342e-05, + "loss": 4.8795, + "step": 35057 + }, + { + "epoch": 0.20849985726520126, + "grad_norm": 1.9991610050201416, + "learning_rate": 4.4826153491425307e-05, + "loss": 4.45, + "step": 35058 + }, + { + "epoch": 0.20850580454848225, + "grad_norm": 3.778761386871338, + "learning_rate": 4.4825868949486095e-05, + "loss": 3.1729, + "step": 35059 + }, + { + "epoch": 0.20851175183176324, + "grad_norm": 3.2395191192626953, + "learning_rate": 4.482558440062587e-05, + "loss": 3.8024, + "step": 35060 + }, + { + "epoch": 0.20851769911504425, + "grad_norm": 3.4706618785858154, + "learning_rate": 4.4825299844844746e-05, + "loss": 3.5835, + "step": 35061 + }, + { + "epoch": 0.20852364639832524, + "grad_norm": 3.5210063457489014, + "learning_rate": 4.482501528214282e-05, + "loss": 2.6244, + "step": 35062 + }, + { + "epoch": 0.20852959368160623, + "grad_norm": 3.2918505668640137, + "learning_rate": 4.482473071252018e-05, + "loss": 2.6852, + "step": 35063 + }, + { + "epoch": 0.20853554096488724, + "grad_norm": 2.6687605381011963, + "learning_rate": 4.4824446135976926e-05, + "loss": 1.4744, + "step": 35064 + }, + { + "epoch": 0.20854148824816823, + "grad_norm": 3.270942211151123, + "learning_rate": 4.4824161552513164e-05, + "loss": 2.2422, + "step": 35065 + }, + { + "epoch": 0.20854743553144922, + "grad_norm": 3.1928775310516357, + "learning_rate": 4.4823876962128994e-05, + "loss": 2.8081, + "step": 35066 + }, + { + "epoch": 0.20855338281473024, + "grad_norm": 3.082271099090576, + "learning_rate": 4.482359236482452e-05, + "loss": 2.5842, + "step": 35067 + }, + { + "epoch": 0.20855933009801123, + "grad_norm": 2.944580316543579, + "learning_rate": 4.482330776059983e-05, + "loss": 2.9251, + "step": 35068 + }, + { + "epoch": 0.2085652773812922, + "grad_norm": 3.051842451095581, + "learning_rate": 4.4823023149455024e-05, + "loss": 2.4925, + "step": 35069 + }, + { + "epoch": 0.20857122466457323, + "grad_norm": 2.8280301094055176, + "learning_rate": 4.482273853139021e-05, + "loss": 2.982, + "step": 35070 + }, + { + "epoch": 0.20857717194785422, + "grad_norm": 1.844641089439392, + "learning_rate": 4.482245390640548e-05, + "loss": 5.0901, + "step": 35071 + }, + { + "epoch": 0.2085831192311352, + "grad_norm": 1.7683004140853882, + "learning_rate": 4.4822169274500936e-05, + "loss": 5.0017, + "step": 35072 + }, + { + "epoch": 0.20858906651441622, + "grad_norm": 1.6485508680343628, + "learning_rate": 4.482188463567668e-05, + "loss": 4.7149, + "step": 35073 + }, + { + "epoch": 0.2085950137976972, + "grad_norm": 1.6834670305252075, + "learning_rate": 4.482159998993281e-05, + "loss": 4.898, + "step": 35074 + }, + { + "epoch": 0.2086009610809782, + "grad_norm": 1.6922709941864014, + "learning_rate": 4.482131533726942e-05, + "loss": 4.647, + "step": 35075 + }, + { + "epoch": 0.2086069083642592, + "grad_norm": 2.076922655105591, + "learning_rate": 4.482103067768662e-05, + "loss": 4.0892, + "step": 35076 + }, + { + "epoch": 0.2086128556475402, + "grad_norm": 2.6704981327056885, + "learning_rate": 4.4820746011184496e-05, + "loss": 4.3682, + "step": 35077 + }, + { + "epoch": 0.2086188029308212, + "grad_norm": 2.4579737186431885, + "learning_rate": 4.482046133776316e-05, + "loss": 4.1968, + "step": 35078 + }, + { + "epoch": 0.2086247502141022, + "grad_norm": 2.7950711250305176, + "learning_rate": 4.4820176657422693e-05, + "loss": 4.2779, + "step": 35079 + }, + { + "epoch": 0.2086306974973832, + "grad_norm": 2.204728364944458, + "learning_rate": 4.4819891970163216e-05, + "loss": 4.3001, + "step": 35080 + }, + { + "epoch": 0.20863664478066418, + "grad_norm": 1.8583356142044067, + "learning_rate": 4.4819607275984835e-05, + "loss": 4.3775, + "step": 35081 + }, + { + "epoch": 0.2086425920639452, + "grad_norm": 1.8849300146102905, + "learning_rate": 4.481932257488761e-05, + "loss": 4.6873, + "step": 35082 + }, + { + "epoch": 0.2086485393472262, + "grad_norm": 1.940974235534668, + "learning_rate": 4.481903786687167e-05, + "loss": 4.2494, + "step": 35083 + }, + { + "epoch": 0.20865448663050717, + "grad_norm": 2.0488009452819824, + "learning_rate": 4.481875315193712e-05, + "loss": 4.1133, + "step": 35084 + }, + { + "epoch": 0.2086604339137882, + "grad_norm": 1.903907060623169, + "learning_rate": 4.481846843008404e-05, + "loss": 4.3153, + "step": 35085 + }, + { + "epoch": 0.20866638119706918, + "grad_norm": 2.271176815032959, + "learning_rate": 4.4818183701312534e-05, + "loss": 4.1729, + "step": 35086 + }, + { + "epoch": 0.20867232848035017, + "grad_norm": 1.9775025844573975, + "learning_rate": 4.481789896562271e-05, + "loss": 4.4771, + "step": 35087 + }, + { + "epoch": 0.20867827576363118, + "grad_norm": 2.0481247901916504, + "learning_rate": 4.481761422301466e-05, + "loss": 4.3511, + "step": 35088 + }, + { + "epoch": 0.20868422304691217, + "grad_norm": 1.8747283220291138, + "learning_rate": 4.481732947348849e-05, + "loss": 4.4928, + "step": 35089 + }, + { + "epoch": 0.20869017033019316, + "grad_norm": 1.950616478919983, + "learning_rate": 4.4817044717044285e-05, + "loss": 4.3065, + "step": 35090 + }, + { + "epoch": 0.20869611761347417, + "grad_norm": 2.182492256164551, + "learning_rate": 4.481675995368216e-05, + "loss": 5.4017, + "step": 35091 + }, + { + "epoch": 0.20870206489675516, + "grad_norm": 1.7582393884658813, + "learning_rate": 4.4816475183402215e-05, + "loss": 3.7494, + "step": 35092 + }, + { + "epoch": 0.20870801218003615, + "grad_norm": 1.568738579750061, + "learning_rate": 4.481619040620454e-05, + "loss": 5.4418, + "step": 35093 + }, + { + "epoch": 0.20871395946331717, + "grad_norm": 2.5985677242279053, + "learning_rate": 4.4815905622089226e-05, + "loss": 4.8025, + "step": 35094 + }, + { + "epoch": 0.20871990674659816, + "grad_norm": 3.3421452045440674, + "learning_rate": 4.48156208310564e-05, + "loss": 4.5642, + "step": 35095 + }, + { + "epoch": 0.20872585402987914, + "grad_norm": 3.004498243331909, + "learning_rate": 4.4815336033106137e-05, + "loss": 4.0177, + "step": 35096 + }, + { + "epoch": 0.20873180131316016, + "grad_norm": 3.208674192428589, + "learning_rate": 4.481505122823855e-05, + "loss": 3.7116, + "step": 35097 + }, + { + "epoch": 0.20873774859644115, + "grad_norm": 2.824521064758301, + "learning_rate": 4.481476641645373e-05, + "loss": 4.1009, + "step": 35098 + }, + { + "epoch": 0.20874369587972214, + "grad_norm": 2.6789655685424805, + "learning_rate": 4.481448159775178e-05, + "loss": 3.7663, + "step": 35099 + }, + { + "epoch": 0.20874964316300315, + "grad_norm": 2.2127678394317627, + "learning_rate": 4.4814196772132796e-05, + "loss": 4.2904, + "step": 35100 + }, + { + "epoch": 0.20875559044628414, + "grad_norm": 2.5038135051727295, + "learning_rate": 4.481391193959689e-05, + "loss": 4.494, + "step": 35101 + }, + { + "epoch": 0.20876153772956513, + "grad_norm": 2.7562382221221924, + "learning_rate": 4.481362710014414e-05, + "loss": 4.677, + "step": 35102 + }, + { + "epoch": 0.20876748501284614, + "grad_norm": 2.437014579772949, + "learning_rate": 4.481334225377466e-05, + "loss": 4.451, + "step": 35103 + }, + { + "epoch": 0.20877343229612713, + "grad_norm": 2.1886918544769287, + "learning_rate": 4.481305740048856e-05, + "loss": 4.5499, + "step": 35104 + }, + { + "epoch": 0.20877937957940812, + "grad_norm": 2.4093780517578125, + "learning_rate": 4.481277254028591e-05, + "loss": 4.4421, + "step": 35105 + }, + { + "epoch": 0.20878532686268914, + "grad_norm": 2.2635338306427, + "learning_rate": 4.4812487673166834e-05, + "loss": 4.2136, + "step": 35106 + }, + { + "epoch": 0.20879127414597012, + "grad_norm": 2.370861291885376, + "learning_rate": 4.481220279913142e-05, + "loss": 3.9111, + "step": 35107 + }, + { + "epoch": 0.2087972214292511, + "grad_norm": 2.5577683448791504, + "learning_rate": 4.481191791817977e-05, + "loss": 3.6726, + "step": 35108 + }, + { + "epoch": 0.20880316871253213, + "grad_norm": 2.295682430267334, + "learning_rate": 4.481163303031199e-05, + "loss": 4.3153, + "step": 35109 + }, + { + "epoch": 0.20880911599581312, + "grad_norm": 2.155688762664795, + "learning_rate": 4.4811348135528165e-05, + "loss": 4.3882, + "step": 35110 + }, + { + "epoch": 0.2088150632790941, + "grad_norm": 1.9954904317855835, + "learning_rate": 4.481106323382841e-05, + "loss": 4.3073, + "step": 35111 + }, + { + "epoch": 0.20882101056237512, + "grad_norm": 2.2071473598480225, + "learning_rate": 4.481077832521282e-05, + "loss": 4.0741, + "step": 35112 + }, + { + "epoch": 0.2088269578456561, + "grad_norm": 2.506493330001831, + "learning_rate": 4.4810493409681486e-05, + "loss": 3.9014, + "step": 35113 + }, + { + "epoch": 0.2088329051289371, + "grad_norm": 2.4370062351226807, + "learning_rate": 4.4810208487234515e-05, + "loss": 4.2152, + "step": 35114 + }, + { + "epoch": 0.2088388524122181, + "grad_norm": 2.3963093757629395, + "learning_rate": 4.4809923557872e-05, + "loss": 3.9049, + "step": 35115 + }, + { + "epoch": 0.2088447996954991, + "grad_norm": 2.477271556854248, + "learning_rate": 4.4809638621594054e-05, + "loss": 3.8888, + "step": 35116 + }, + { + "epoch": 0.2088507469787801, + "grad_norm": 2.3511276245117188, + "learning_rate": 4.480935367840076e-05, + "loss": 4.0679, + "step": 35117 + }, + { + "epoch": 0.20885669426206108, + "grad_norm": 2.7005770206451416, + "learning_rate": 4.480906872829223e-05, + "loss": 4.3311, + "step": 35118 + }, + { + "epoch": 0.2088626415453421, + "grad_norm": 2.63441801071167, + "learning_rate": 4.480878377126856e-05, + "loss": 4.4622, + "step": 35119 + }, + { + "epoch": 0.20886858882862308, + "grad_norm": 2.249758005142212, + "learning_rate": 4.480849880732985e-05, + "loss": 4.2341, + "step": 35120 + }, + { + "epoch": 0.20887453611190407, + "grad_norm": 2.5190210342407227, + "learning_rate": 4.480821383647619e-05, + "loss": 4.0408, + "step": 35121 + }, + { + "epoch": 0.20888048339518508, + "grad_norm": 2.223970890045166, + "learning_rate": 4.4807928858707696e-05, + "loss": 4.2521, + "step": 35122 + }, + { + "epoch": 0.20888643067846607, + "grad_norm": 2.336270570755005, + "learning_rate": 4.480764387402445e-05, + "loss": 4.2149, + "step": 35123 + }, + { + "epoch": 0.20889237796174706, + "grad_norm": 2.7396438121795654, + "learning_rate": 4.4807358882426564e-05, + "loss": 4.0704, + "step": 35124 + }, + { + "epoch": 0.20889832524502808, + "grad_norm": 2.5378661155700684, + "learning_rate": 4.480707388391413e-05, + "loss": 3.8361, + "step": 35125 + }, + { + "epoch": 0.20890427252830907, + "grad_norm": 2.328519582748413, + "learning_rate": 4.480678887848726e-05, + "loss": 4.2595, + "step": 35126 + }, + { + "epoch": 0.20891021981159005, + "grad_norm": 2.6499722003936768, + "learning_rate": 4.4806503866146036e-05, + "loss": 4.1182, + "step": 35127 + }, + { + "epoch": 0.20891616709487107, + "grad_norm": 2.24397349357605, + "learning_rate": 4.480621884689057e-05, + "loss": 4.1824, + "step": 35128 + }, + { + "epoch": 0.20892211437815206, + "grad_norm": 2.074115514755249, + "learning_rate": 4.480593382072096e-05, + "loss": 4.2477, + "step": 35129 + }, + { + "epoch": 0.20892806166143305, + "grad_norm": 2.1369383335113525, + "learning_rate": 4.48056487876373e-05, + "loss": 4.138, + "step": 35130 + }, + { + "epoch": 0.20893400894471406, + "grad_norm": 2.288029193878174, + "learning_rate": 4.48053637476397e-05, + "loss": 4.2829, + "step": 35131 + }, + { + "epoch": 0.20893995622799505, + "grad_norm": 2.116546869277954, + "learning_rate": 4.4805078700728235e-05, + "loss": 3.9769, + "step": 35132 + }, + { + "epoch": 0.20894590351127604, + "grad_norm": 2.4647371768951416, + "learning_rate": 4.480479364690303e-05, + "loss": 3.9284, + "step": 35133 + }, + { + "epoch": 0.20895185079455705, + "grad_norm": 2.188466787338257, + "learning_rate": 4.4804508586164184e-05, + "loss": 4.2483, + "step": 35134 + }, + { + "epoch": 0.20895779807783804, + "grad_norm": 2.6960582733154297, + "learning_rate": 4.480422351851178e-05, + "loss": 4.5028, + "step": 35135 + }, + { + "epoch": 0.20896374536111903, + "grad_norm": 2.6402602195739746, + "learning_rate": 4.480393844394592e-05, + "loss": 4.4322, + "step": 35136 + }, + { + "epoch": 0.20896969264440005, + "grad_norm": 2.3040831089019775, + "learning_rate": 4.480365336246673e-05, + "loss": 4.6079, + "step": 35137 + }, + { + "epoch": 0.20897563992768103, + "grad_norm": 2.240013360977173, + "learning_rate": 4.480336827407427e-05, + "loss": 4.4865, + "step": 35138 + }, + { + "epoch": 0.20898158721096202, + "grad_norm": 2.362314462661743, + "learning_rate": 4.4803083178768667e-05, + "loss": 4.0822, + "step": 35139 + }, + { + "epoch": 0.20898753449424304, + "grad_norm": 2.440065622329712, + "learning_rate": 4.480279807655001e-05, + "loss": 4.6024, + "step": 35140 + }, + { + "epoch": 0.20899348177752403, + "grad_norm": 2.482828378677368, + "learning_rate": 4.480251296741841e-05, + "loss": 4.4708, + "step": 35141 + }, + { + "epoch": 0.20899942906080501, + "grad_norm": 2.3551008701324463, + "learning_rate": 4.480222785137395e-05, + "loss": 4.4908, + "step": 35142 + }, + { + "epoch": 0.20900537634408603, + "grad_norm": 2.2475407123565674, + "learning_rate": 4.4801942728416734e-05, + "loss": 4.5739, + "step": 35143 + }, + { + "epoch": 0.20901132362736702, + "grad_norm": 2.2544219493865967, + "learning_rate": 4.4801657598546865e-05, + "loss": 4.6347, + "step": 35144 + }, + { + "epoch": 0.209017270910648, + "grad_norm": 1.9965347051620483, + "learning_rate": 4.480137246176445e-05, + "loss": 4.5584, + "step": 35145 + }, + { + "epoch": 0.20902321819392902, + "grad_norm": 2.303065061569214, + "learning_rate": 4.480108731806957e-05, + "loss": 4.5769, + "step": 35146 + }, + { + "epoch": 0.20902916547721, + "grad_norm": 2.274526596069336, + "learning_rate": 4.480080216746234e-05, + "loss": 4.5248, + "step": 35147 + }, + { + "epoch": 0.209035112760491, + "grad_norm": 2.3997044563293457, + "learning_rate": 4.480051700994286e-05, + "loss": 4.713, + "step": 35148 + }, + { + "epoch": 0.20904106004377201, + "grad_norm": 2.116888999938965, + "learning_rate": 4.480023184551121e-05, + "loss": 4.4275, + "step": 35149 + }, + { + "epoch": 0.209047007327053, + "grad_norm": 2.7580020427703857, + "learning_rate": 4.479994667416751e-05, + "loss": 4.0885, + "step": 35150 + }, + { + "epoch": 0.209052954610334, + "grad_norm": 2.0336437225341797, + "learning_rate": 4.479966149591186e-05, + "loss": 4.1945, + "step": 35151 + }, + { + "epoch": 0.209058901893615, + "grad_norm": 2.166522979736328, + "learning_rate": 4.479937631074435e-05, + "loss": 4.3606, + "step": 35152 + }, + { + "epoch": 0.209064849176896, + "grad_norm": 2.0222151279449463, + "learning_rate": 4.479909111866507e-05, + "loss": 4.2598, + "step": 35153 + }, + { + "epoch": 0.20907079646017698, + "grad_norm": 2.1556873321533203, + "learning_rate": 4.479880591967415e-05, + "loss": 4.2919, + "step": 35154 + }, + { + "epoch": 0.209076743743458, + "grad_norm": 1.9412826299667358, + "learning_rate": 4.4798520713771655e-05, + "loss": 4.3147, + "step": 35155 + }, + { + "epoch": 0.209082691026739, + "grad_norm": 2.263427972793579, + "learning_rate": 4.479823550095771e-05, + "loss": 4.2697, + "step": 35156 + }, + { + "epoch": 0.20908863831001998, + "grad_norm": 2.105473756790161, + "learning_rate": 4.4797950281232405e-05, + "loss": 4.2734, + "step": 35157 + }, + { + "epoch": 0.209094585593301, + "grad_norm": 2.31563138961792, + "learning_rate": 4.479766505459584e-05, + "loss": 4.3769, + "step": 35158 + }, + { + "epoch": 0.20910053287658198, + "grad_norm": 2.2249670028686523, + "learning_rate": 4.479737982104811e-05, + "loss": 4.358, + "step": 35159 + }, + { + "epoch": 0.20910648015986297, + "grad_norm": 1.9306457042694092, + "learning_rate": 4.479709458058933e-05, + "loss": 4.5871, + "step": 35160 + }, + { + "epoch": 0.20911242744314398, + "grad_norm": 2.4048049449920654, + "learning_rate": 4.479680933321958e-05, + "loss": 4.164, + "step": 35161 + }, + { + "epoch": 0.20911837472642497, + "grad_norm": 2.171954393386841, + "learning_rate": 4.4796524078938974e-05, + "loss": 4.3285, + "step": 35162 + }, + { + "epoch": 0.20912432200970596, + "grad_norm": 2.1672539710998535, + "learning_rate": 4.47962388177476e-05, + "loss": 3.9887, + "step": 35163 + }, + { + "epoch": 0.20913026929298698, + "grad_norm": 2.4803264141082764, + "learning_rate": 4.479595354964556e-05, + "loss": 4.2461, + "step": 35164 + }, + { + "epoch": 0.20913621657626796, + "grad_norm": 2.385725975036621, + "learning_rate": 4.4795668274632965e-05, + "loss": 4.3727, + "step": 35165 + }, + { + "epoch": 0.20914216385954895, + "grad_norm": 2.17445707321167, + "learning_rate": 4.4795382992709914e-05, + "loss": 4.386, + "step": 35166 + }, + { + "epoch": 0.20914811114282997, + "grad_norm": 2.45085072517395, + "learning_rate": 4.4795097703876484e-05, + "loss": 4.3429, + "step": 35167 + }, + { + "epoch": 0.20915405842611096, + "grad_norm": 2.0739786624908447, + "learning_rate": 4.4794812408132796e-05, + "loss": 4.2034, + "step": 35168 + }, + { + "epoch": 0.20916000570939194, + "grad_norm": 2.2545764446258545, + "learning_rate": 4.4794527105478946e-05, + "loss": 4.3227, + "step": 35169 + }, + { + "epoch": 0.20916595299267296, + "grad_norm": 2.1697545051574707, + "learning_rate": 4.479424179591503e-05, + "loss": 4.4496, + "step": 35170 + }, + { + "epoch": 0.20917190027595395, + "grad_norm": 2.493567943572998, + "learning_rate": 4.4793956479441144e-05, + "loss": 4.2328, + "step": 35171 + }, + { + "epoch": 0.20917784755923494, + "grad_norm": 2.3742611408233643, + "learning_rate": 4.4793671156057396e-05, + "loss": 4.3438, + "step": 35172 + }, + { + "epoch": 0.20918379484251595, + "grad_norm": 2.3343236446380615, + "learning_rate": 4.4793385825763885e-05, + "loss": 4.1543, + "step": 35173 + }, + { + "epoch": 0.20918974212579694, + "grad_norm": 2.553321599960327, + "learning_rate": 4.47931004885607e-05, + "loss": 4.1855, + "step": 35174 + }, + { + "epoch": 0.20919568940907793, + "grad_norm": 2.2950751781463623, + "learning_rate": 4.4792815144447954e-05, + "loss": 4.2327, + "step": 35175 + }, + { + "epoch": 0.20920163669235892, + "grad_norm": 2.211557388305664, + "learning_rate": 4.4792529793425744e-05, + "loss": 4.2977, + "step": 35176 + }, + { + "epoch": 0.20920758397563993, + "grad_norm": 2.1329169273376465, + "learning_rate": 4.479224443549416e-05, + "loss": 4.0576, + "step": 35177 + }, + { + "epoch": 0.20921353125892092, + "grad_norm": 2.223177194595337, + "learning_rate": 4.4791959070653304e-05, + "loss": 4.2397, + "step": 35178 + }, + { + "epoch": 0.2092194785422019, + "grad_norm": 2.0334205627441406, + "learning_rate": 4.479167369890328e-05, + "loss": 4.3574, + "step": 35179 + }, + { + "epoch": 0.20922542582548292, + "grad_norm": 2.3178441524505615, + "learning_rate": 4.47913883202442e-05, + "loss": 4.1552, + "step": 35180 + }, + { + "epoch": 0.2092313731087639, + "grad_norm": 2.5519886016845703, + "learning_rate": 4.479110293467614e-05, + "loss": 4.0306, + "step": 35181 + }, + { + "epoch": 0.2092373203920449, + "grad_norm": 2.1643712520599365, + "learning_rate": 4.479081754219922e-05, + "loss": 4.2537, + "step": 35182 + }, + { + "epoch": 0.20924326767532592, + "grad_norm": 2.2243382930755615, + "learning_rate": 4.479053214281352e-05, + "loss": 4.271, + "step": 35183 + }, + { + "epoch": 0.2092492149586069, + "grad_norm": 2.1648247241973877, + "learning_rate": 4.4790246736519145e-05, + "loss": 4.1709, + "step": 35184 + }, + { + "epoch": 0.2092551622418879, + "grad_norm": 2.4812254905700684, + "learning_rate": 4.478996132331621e-05, + "loss": 3.6349, + "step": 35185 + }, + { + "epoch": 0.2092611095251689, + "grad_norm": 1.966122031211853, + "learning_rate": 4.4789675903204805e-05, + "loss": 4.0373, + "step": 35186 + }, + { + "epoch": 0.2092670568084499, + "grad_norm": 2.027426242828369, + "learning_rate": 4.4789390476185024e-05, + "loss": 4.005, + "step": 35187 + }, + { + "epoch": 0.20927300409173089, + "grad_norm": 2.40824818611145, + "learning_rate": 4.478910504225697e-05, + "loss": 3.7404, + "step": 35188 + }, + { + "epoch": 0.2092789513750119, + "grad_norm": 2.3484485149383545, + "learning_rate": 4.478881960142075e-05, + "loss": 4.4463, + "step": 35189 + }, + { + "epoch": 0.2092848986582929, + "grad_norm": 2.3539352416992188, + "learning_rate": 4.4788534153676455e-05, + "loss": 4.2863, + "step": 35190 + }, + { + "epoch": 0.20929084594157388, + "grad_norm": 2.364746570587158, + "learning_rate": 4.478824869902418e-05, + "loss": 4.1668, + "step": 35191 + }, + { + "epoch": 0.2092967932248549, + "grad_norm": 1.9087117910385132, + "learning_rate": 4.478796323746404e-05, + "loss": 4.5418, + "step": 35192 + }, + { + "epoch": 0.20930274050813588, + "grad_norm": 1.5683953762054443, + "learning_rate": 4.478767776899612e-05, + "loss": 4.593, + "step": 35193 + }, + { + "epoch": 0.20930868779141687, + "grad_norm": 1.957962155342102, + "learning_rate": 4.478739229362053e-05, + "loss": 4.4794, + "step": 35194 + }, + { + "epoch": 0.20931463507469789, + "grad_norm": 2.0151965618133545, + "learning_rate": 4.478710681133737e-05, + "loss": 4.4208, + "step": 35195 + }, + { + "epoch": 0.20932058235797887, + "grad_norm": 2.0356502532958984, + "learning_rate": 4.4786821322146735e-05, + "loss": 4.3885, + "step": 35196 + }, + { + "epoch": 0.20932652964125986, + "grad_norm": 1.8693149089813232, + "learning_rate": 4.4786535826048714e-05, + "loss": 4.1047, + "step": 35197 + }, + { + "epoch": 0.20933247692454088, + "grad_norm": 1.9223167896270752, + "learning_rate": 4.478625032304343e-05, + "loss": 4.5433, + "step": 35198 + }, + { + "epoch": 0.20933842420782187, + "grad_norm": 1.6214507818222046, + "learning_rate": 4.4785964813130964e-05, + "loss": 4.5297, + "step": 35199 + }, + { + "epoch": 0.20934437149110285, + "grad_norm": 1.4718947410583496, + "learning_rate": 4.478567929631142e-05, + "loss": 4.2412, + "step": 35200 + }, + { + "epoch": 0.20935031877438387, + "grad_norm": 1.8548481464385986, + "learning_rate": 4.478539377258491e-05, + "loss": 4.67, + "step": 35201 + }, + { + "epoch": 0.20935626605766486, + "grad_norm": 1.70490300655365, + "learning_rate": 4.478510824195151e-05, + "loss": 4.484, + "step": 35202 + }, + { + "epoch": 0.20936221334094585, + "grad_norm": 1.9421411752700806, + "learning_rate": 4.478482270441135e-05, + "loss": 4.3252, + "step": 35203 + }, + { + "epoch": 0.20936816062422686, + "grad_norm": 1.811452865600586, + "learning_rate": 4.47845371599645e-05, + "loss": 4.462, + "step": 35204 + }, + { + "epoch": 0.20937410790750785, + "grad_norm": 1.5066571235656738, + "learning_rate": 4.478425160861107e-05, + "loss": 4.8129, + "step": 35205 + }, + { + "epoch": 0.20938005519078884, + "grad_norm": 1.7884474992752075, + "learning_rate": 4.478396605035117e-05, + "loss": 4.5078, + "step": 35206 + }, + { + "epoch": 0.20938600247406985, + "grad_norm": 1.5454497337341309, + "learning_rate": 4.4783680485184885e-05, + "loss": 4.3778, + "step": 35207 + }, + { + "epoch": 0.20939194975735084, + "grad_norm": 1.5749461650848389, + "learning_rate": 4.478339491311233e-05, + "loss": 4.6328, + "step": 35208 + }, + { + "epoch": 0.20939789704063183, + "grad_norm": 1.6245211362838745, + "learning_rate": 4.478310933413359e-05, + "loss": 4.7798, + "step": 35209 + }, + { + "epoch": 0.20940384432391285, + "grad_norm": 1.6137746572494507, + "learning_rate": 4.478282374824878e-05, + "loss": 4.2092, + "step": 35210 + }, + { + "epoch": 0.20940979160719383, + "grad_norm": 1.6076363325119019, + "learning_rate": 4.478253815545798e-05, + "loss": 4.0179, + "step": 35211 + }, + { + "epoch": 0.20941573889047482, + "grad_norm": 2.2339091300964355, + "learning_rate": 4.4782252555761304e-05, + "loss": 4.5163, + "step": 35212 + }, + { + "epoch": 0.20942168617375584, + "grad_norm": 2.1667139530181885, + "learning_rate": 4.478196694915885e-05, + "loss": 4.623, + "step": 35213 + }, + { + "epoch": 0.20942763345703683, + "grad_norm": 1.8673685789108276, + "learning_rate": 4.478168133565071e-05, + "loss": 4.5131, + "step": 35214 + }, + { + "epoch": 0.20943358074031782, + "grad_norm": 1.7350704669952393, + "learning_rate": 4.4781395715237e-05, + "loss": 4.6335, + "step": 35215 + }, + { + "epoch": 0.20943952802359883, + "grad_norm": 2.038442611694336, + "learning_rate": 4.4781110087917796e-05, + "loss": 4.071, + "step": 35216 + }, + { + "epoch": 0.20944547530687982, + "grad_norm": 1.670720100402832, + "learning_rate": 4.478082445369322e-05, + "loss": 4.8153, + "step": 35217 + }, + { + "epoch": 0.2094514225901608, + "grad_norm": 1.5883606672286987, + "learning_rate": 4.4780538812563355e-05, + "loss": 4.6545, + "step": 35218 + }, + { + "epoch": 0.20945736987344182, + "grad_norm": 1.758099913597107, + "learning_rate": 4.478025316452832e-05, + "loss": 5.1267, + "step": 35219 + }, + { + "epoch": 0.2094633171567228, + "grad_norm": 1.836985468864441, + "learning_rate": 4.477996750958819e-05, + "loss": 4.9632, + "step": 35220 + }, + { + "epoch": 0.2094692644400038, + "grad_norm": 1.5703904628753662, + "learning_rate": 4.477968184774308e-05, + "loss": 5.1253, + "step": 35221 + }, + { + "epoch": 0.20947521172328482, + "grad_norm": 1.717297911643982, + "learning_rate": 4.4779396178993094e-05, + "loss": 4.1864, + "step": 35222 + }, + { + "epoch": 0.2094811590065658, + "grad_norm": 1.8805084228515625, + "learning_rate": 4.4779110503338325e-05, + "loss": 4.2417, + "step": 35223 + }, + { + "epoch": 0.2094871062898468, + "grad_norm": 2.0293681621551514, + "learning_rate": 4.477882482077887e-05, + "loss": 4.1028, + "step": 35224 + }, + { + "epoch": 0.2094930535731278, + "grad_norm": 1.868170142173767, + "learning_rate": 4.477853913131483e-05, + "loss": 3.8102, + "step": 35225 + }, + { + "epoch": 0.2094990008564088, + "grad_norm": 1.7893959283828735, + "learning_rate": 4.4778253434946305e-05, + "loss": 4.4876, + "step": 35226 + }, + { + "epoch": 0.20950494813968978, + "grad_norm": 1.837123990058899, + "learning_rate": 4.47779677316734e-05, + "loss": 4.3673, + "step": 35227 + }, + { + "epoch": 0.2095108954229708, + "grad_norm": 2.12733793258667, + "learning_rate": 4.477768202149621e-05, + "loss": 4.2554, + "step": 35228 + }, + { + "epoch": 0.2095168427062518, + "grad_norm": 2.033757209777832, + "learning_rate": 4.477739630441484e-05, + "loss": 4.4083, + "step": 35229 + }, + { + "epoch": 0.20952278998953278, + "grad_norm": 2.0243184566497803, + "learning_rate": 4.477711058042938e-05, + "loss": 4.5907, + "step": 35230 + }, + { + "epoch": 0.2095287372728138, + "grad_norm": 2.0109145641326904, + "learning_rate": 4.4776824849539935e-05, + "loss": 4.1614, + "step": 35231 + }, + { + "epoch": 0.20953468455609478, + "grad_norm": 1.9397257566452026, + "learning_rate": 4.4776539111746604e-05, + "loss": 4.2821, + "step": 35232 + }, + { + "epoch": 0.20954063183937577, + "grad_norm": 1.6508504152297974, + "learning_rate": 4.4776253367049495e-05, + "loss": 4.3756, + "step": 35233 + }, + { + "epoch": 0.20954657912265678, + "grad_norm": 2.013890266418457, + "learning_rate": 4.477596761544869e-05, + "loss": 3.8501, + "step": 35234 + }, + { + "epoch": 0.20955252640593777, + "grad_norm": 1.892042875289917, + "learning_rate": 4.47756818569443e-05, + "loss": 4.1742, + "step": 35235 + }, + { + "epoch": 0.20955847368921876, + "grad_norm": 1.5260576009750366, + "learning_rate": 4.4775396091536425e-05, + "loss": 4.7992, + "step": 35236 + }, + { + "epoch": 0.20956442097249975, + "grad_norm": 1.4659627676010132, + "learning_rate": 4.477511031922517e-05, + "loss": 5.1104, + "step": 35237 + }, + { + "epoch": 0.20957036825578076, + "grad_norm": 1.4066425561904907, + "learning_rate": 4.4774824540010625e-05, + "loss": 5.0196, + "step": 35238 + }, + { + "epoch": 0.20957631553906175, + "grad_norm": 2.000969409942627, + "learning_rate": 4.477453875389289e-05, + "loss": 4.4638, + "step": 35239 + }, + { + "epoch": 0.20958226282234274, + "grad_norm": 1.9423243999481201, + "learning_rate": 4.4774252960872066e-05, + "loss": 4.5488, + "step": 35240 + }, + { + "epoch": 0.20958821010562376, + "grad_norm": 1.9737645387649536, + "learning_rate": 4.477396716094826e-05, + "loss": 4.5551, + "step": 35241 + }, + { + "epoch": 0.20959415738890474, + "grad_norm": 2.2152135372161865, + "learning_rate": 4.477368135412157e-05, + "loss": 4.1769, + "step": 35242 + }, + { + "epoch": 0.20960010467218573, + "grad_norm": 1.604505181312561, + "learning_rate": 4.4773395540392086e-05, + "loss": 4.2283, + "step": 35243 + }, + { + "epoch": 0.20960605195546675, + "grad_norm": 1.5856635570526123, + "learning_rate": 4.477310971975991e-05, + "loss": 4.4282, + "step": 35244 + }, + { + "epoch": 0.20961199923874774, + "grad_norm": 1.7525242567062378, + "learning_rate": 4.4772823892225146e-05, + "loss": 4.4311, + "step": 35245 + }, + { + "epoch": 0.20961794652202873, + "grad_norm": 1.9557976722717285, + "learning_rate": 4.4772538057787904e-05, + "loss": 4.4978, + "step": 35246 + }, + { + "epoch": 0.20962389380530974, + "grad_norm": 1.9850143194198608, + "learning_rate": 4.4772252216448265e-05, + "loss": 4.1881, + "step": 35247 + }, + { + "epoch": 0.20962984108859073, + "grad_norm": 2.2965569496154785, + "learning_rate": 4.4771966368206345e-05, + "loss": 4.4171, + "step": 35248 + }, + { + "epoch": 0.20963578837187172, + "grad_norm": 2.1682398319244385, + "learning_rate": 4.4771680513062224e-05, + "loss": 4.2601, + "step": 35249 + }, + { + "epoch": 0.20964173565515273, + "grad_norm": 2.0759825706481934, + "learning_rate": 4.477139465101602e-05, + "loss": 4.5485, + "step": 35250 + }, + { + "epoch": 0.20964768293843372, + "grad_norm": 1.9251832962036133, + "learning_rate": 4.477110878206783e-05, + "loss": 4.4246, + "step": 35251 + }, + { + "epoch": 0.2096536302217147, + "grad_norm": 1.963021993637085, + "learning_rate": 4.477082290621774e-05, + "loss": 4.5177, + "step": 35252 + }, + { + "epoch": 0.20965957750499573, + "grad_norm": 1.748746633529663, + "learning_rate": 4.4770537023465864e-05, + "loss": 4.7079, + "step": 35253 + }, + { + "epoch": 0.2096655247882767, + "grad_norm": 1.9013080596923828, + "learning_rate": 4.47702511338123e-05, + "loss": 4.6707, + "step": 35254 + }, + { + "epoch": 0.2096714720715577, + "grad_norm": 1.957573413848877, + "learning_rate": 4.476996523725715e-05, + "loss": 4.2977, + "step": 35255 + }, + { + "epoch": 0.20967741935483872, + "grad_norm": 1.7000887393951416, + "learning_rate": 4.476967933380051e-05, + "loss": 4.4874, + "step": 35256 + }, + { + "epoch": 0.2096833666381197, + "grad_norm": 1.8152967691421509, + "learning_rate": 4.476939342344246e-05, + "loss": 4.216, + "step": 35257 + }, + { + "epoch": 0.2096893139214007, + "grad_norm": 1.932236671447754, + "learning_rate": 4.476910750618314e-05, + "loss": 4.4371, + "step": 35258 + }, + { + "epoch": 0.2096952612046817, + "grad_norm": 2.0386545658111572, + "learning_rate": 4.4768821582022625e-05, + "loss": 4.4598, + "step": 35259 + }, + { + "epoch": 0.2097012084879627, + "grad_norm": 1.8710325956344604, + "learning_rate": 4.4768535650961014e-05, + "loss": 4.2673, + "step": 35260 + }, + { + "epoch": 0.2097071557712437, + "grad_norm": 1.9699058532714844, + "learning_rate": 4.476824971299841e-05, + "loss": 4.1835, + "step": 35261 + }, + { + "epoch": 0.2097131030545247, + "grad_norm": 1.6277472972869873, + "learning_rate": 4.476796376813492e-05, + "loss": 5.0931, + "step": 35262 + }, + { + "epoch": 0.2097190503378057, + "grad_norm": 1.709804892539978, + "learning_rate": 4.476767781637064e-05, + "loss": 5.1094, + "step": 35263 + }, + { + "epoch": 0.20972499762108668, + "grad_norm": 1.692352056503296, + "learning_rate": 4.4767391857705654e-05, + "loss": 5.0337, + "step": 35264 + }, + { + "epoch": 0.2097309449043677, + "grad_norm": 1.619791030883789, + "learning_rate": 4.476710589214009e-05, + "loss": 5.044, + "step": 35265 + }, + { + "epoch": 0.20973689218764868, + "grad_norm": 1.5870261192321777, + "learning_rate": 4.4766819919674024e-05, + "loss": 4.9422, + "step": 35266 + }, + { + "epoch": 0.20974283947092967, + "grad_norm": 1.756438136100769, + "learning_rate": 4.4766533940307574e-05, + "loss": 4.9927, + "step": 35267 + }, + { + "epoch": 0.2097487867542107, + "grad_norm": 2.050565004348755, + "learning_rate": 4.476624795404082e-05, + "loss": 5.0727, + "step": 35268 + }, + { + "epoch": 0.20975473403749167, + "grad_norm": 1.7845942974090576, + "learning_rate": 4.476596196087388e-05, + "loss": 4.9263, + "step": 35269 + }, + { + "epoch": 0.20976068132077266, + "grad_norm": 1.4715898036956787, + "learning_rate": 4.476567596080685e-05, + "loss": 5.2048, + "step": 35270 + }, + { + "epoch": 0.20976662860405368, + "grad_norm": 1.4592971801757812, + "learning_rate": 4.4765389953839824e-05, + "loss": 5.5331, + "step": 35271 + }, + { + "epoch": 0.20977257588733467, + "grad_norm": 1.4662095308303833, + "learning_rate": 4.476510393997291e-05, + "loss": 5.398, + "step": 35272 + }, + { + "epoch": 0.20977852317061566, + "grad_norm": 1.626869559288025, + "learning_rate": 4.4764817919206194e-05, + "loss": 4.9234, + "step": 35273 + }, + { + "epoch": 0.20978447045389667, + "grad_norm": 1.792540431022644, + "learning_rate": 4.4764531891539785e-05, + "loss": 5.1534, + "step": 35274 + }, + { + "epoch": 0.20979041773717766, + "grad_norm": 1.702528715133667, + "learning_rate": 4.4764245856973784e-05, + "loss": 5.1792, + "step": 35275 + }, + { + "epoch": 0.20979636502045865, + "grad_norm": 1.5575789213180542, + "learning_rate": 4.476395981550829e-05, + "loss": 5.3819, + "step": 35276 + }, + { + "epoch": 0.20980231230373966, + "grad_norm": 1.7874699831008911, + "learning_rate": 4.47636737671434e-05, + "loss": 5.4195, + "step": 35277 + }, + { + "epoch": 0.20980825958702065, + "grad_norm": 1.5927484035491943, + "learning_rate": 4.4763387711879215e-05, + "loss": 4.7251, + "step": 35278 + }, + { + "epoch": 0.20981420687030164, + "grad_norm": 1.572074294090271, + "learning_rate": 4.476310164971584e-05, + "loss": 5.6226, + "step": 35279 + }, + { + "epoch": 0.20982015415358266, + "grad_norm": 1.5262272357940674, + "learning_rate": 4.476281558065336e-05, + "loss": 5.2816, + "step": 35280 + }, + { + "epoch": 0.20982610143686364, + "grad_norm": 1.8157652616500854, + "learning_rate": 4.47625295046919e-05, + "loss": 5.198, + "step": 35281 + }, + { + "epoch": 0.20983204872014463, + "grad_norm": 1.550521969795227, + "learning_rate": 4.4762243421831536e-05, + "loss": 4.96, + "step": 35282 + }, + { + "epoch": 0.20983799600342565, + "grad_norm": 1.6990987062454224, + "learning_rate": 4.476195733207238e-05, + "loss": 4.6078, + "step": 35283 + }, + { + "epoch": 0.20984394328670664, + "grad_norm": 1.61250901222229, + "learning_rate": 4.476167123541453e-05, + "loss": 4.6525, + "step": 35284 + }, + { + "epoch": 0.20984989056998762, + "grad_norm": 1.630925178527832, + "learning_rate": 4.476138513185808e-05, + "loss": 5.1194, + "step": 35285 + }, + { + "epoch": 0.20985583785326864, + "grad_norm": 1.3319804668426514, + "learning_rate": 4.476109902140313e-05, + "loss": 5.4476, + "step": 35286 + }, + { + "epoch": 0.20986178513654963, + "grad_norm": 1.4637563228607178, + "learning_rate": 4.47608129040498e-05, + "loss": 5.4099, + "step": 35287 + }, + { + "epoch": 0.20986773241983062, + "grad_norm": 1.3934221267700195, + "learning_rate": 4.476052677979816e-05, + "loss": 5.3829, + "step": 35288 + }, + { + "epoch": 0.20987367970311163, + "grad_norm": 1.43252432346344, + "learning_rate": 4.476024064864833e-05, + "loss": 5.3713, + "step": 35289 + }, + { + "epoch": 0.20987962698639262, + "grad_norm": 1.6679284572601318, + "learning_rate": 4.4759954510600404e-05, + "loss": 5.382, + "step": 35290 + }, + { + "epoch": 0.2098855742696736, + "grad_norm": 1.7102776765823364, + "learning_rate": 4.4759668365654484e-05, + "loss": 5.5006, + "step": 35291 + }, + { + "epoch": 0.20989152155295462, + "grad_norm": 1.8174772262573242, + "learning_rate": 4.4759382213810667e-05, + "loss": 5.2365, + "step": 35292 + }, + { + "epoch": 0.2098974688362356, + "grad_norm": 1.3059958219528198, + "learning_rate": 4.475909605506905e-05, + "loss": 5.5279, + "step": 35293 + }, + { + "epoch": 0.2099034161195166, + "grad_norm": 1.3822412490844727, + "learning_rate": 4.4758809889429745e-05, + "loss": 4.8987, + "step": 35294 + }, + { + "epoch": 0.2099093634027976, + "grad_norm": 1.7537777423858643, + "learning_rate": 4.475852371689284e-05, + "loss": 5.4948, + "step": 35295 + }, + { + "epoch": 0.2099153106860786, + "grad_norm": 1.6818406581878662, + "learning_rate": 4.475823753745843e-05, + "loss": 5.2707, + "step": 35296 + }, + { + "epoch": 0.2099212579693596, + "grad_norm": 1.5473475456237793, + "learning_rate": 4.475795135112663e-05, + "loss": 5.246, + "step": 35297 + }, + { + "epoch": 0.20992720525264058, + "grad_norm": 1.9564220905303955, + "learning_rate": 4.475766515789753e-05, + "loss": 5.0882, + "step": 35298 + }, + { + "epoch": 0.2099331525359216, + "grad_norm": 1.9098862409591675, + "learning_rate": 4.475737895777124e-05, + "loss": 4.8618, + "step": 35299 + }, + { + "epoch": 0.20993909981920258, + "grad_norm": 1.313395380973816, + "learning_rate": 4.4757092750747856e-05, + "loss": 5.102, + "step": 35300 + }, + { + "epoch": 0.20994504710248357, + "grad_norm": 1.600813865661621, + "learning_rate": 4.475680653682747e-05, + "loss": 4.5547, + "step": 35301 + }, + { + "epoch": 0.2099509943857646, + "grad_norm": 1.5113312005996704, + "learning_rate": 4.4756520316010183e-05, + "loss": 5.0377, + "step": 35302 + }, + { + "epoch": 0.20995694166904558, + "grad_norm": 1.413179874420166, + "learning_rate": 4.47562340882961e-05, + "loss": 4.1832, + "step": 35303 + }, + { + "epoch": 0.20996288895232657, + "grad_norm": 1.613791584968567, + "learning_rate": 4.475594785368532e-05, + "loss": 4.2055, + "step": 35304 + }, + { + "epoch": 0.20996883623560758, + "grad_norm": 1.739326000213623, + "learning_rate": 4.475566161217795e-05, + "loss": 4.7967, + "step": 35305 + }, + { + "epoch": 0.20997478351888857, + "grad_norm": 1.3964680433273315, + "learning_rate": 4.4755375363774074e-05, + "loss": 4.814, + "step": 35306 + }, + { + "epoch": 0.20998073080216956, + "grad_norm": 1.4224399328231812, + "learning_rate": 4.47550891084738e-05, + "loss": 4.8103, + "step": 35307 + }, + { + "epoch": 0.20998667808545057, + "grad_norm": 1.8177698850631714, + "learning_rate": 4.475480284627723e-05, + "loss": 4.7559, + "step": 35308 + }, + { + "epoch": 0.20999262536873156, + "grad_norm": 2.1109840869903564, + "learning_rate": 4.475451657718447e-05, + "loss": 4.196, + "step": 35309 + }, + { + "epoch": 0.20999857265201255, + "grad_norm": 2.44920015335083, + "learning_rate": 4.47542303011956e-05, + "loss": 4.4954, + "step": 35310 + }, + { + "epoch": 0.21000451993529357, + "grad_norm": 1.8589818477630615, + "learning_rate": 4.4753944018310744e-05, + "loss": 4.7965, + "step": 35311 + }, + { + "epoch": 0.21001046721857455, + "grad_norm": 1.8963855504989624, + "learning_rate": 4.475365772852998e-05, + "loss": 5.0571, + "step": 35312 + }, + { + "epoch": 0.21001641450185554, + "grad_norm": 1.857127070426941, + "learning_rate": 4.475337143185343e-05, + "loss": 5.4925, + "step": 35313 + }, + { + "epoch": 0.21002236178513656, + "grad_norm": 1.6797605752944946, + "learning_rate": 4.475308512828117e-05, + "loss": 5.0388, + "step": 35314 + }, + { + "epoch": 0.21002830906841755, + "grad_norm": 1.6497384309768677, + "learning_rate": 4.4752798817813317e-05, + "loss": 5.1263, + "step": 35315 + }, + { + "epoch": 0.21003425635169853, + "grad_norm": 1.8872557878494263, + "learning_rate": 4.475251250044997e-05, + "loss": 4.5841, + "step": 35316 + }, + { + "epoch": 0.21004020363497955, + "grad_norm": 1.843013048171997, + "learning_rate": 4.475222617619122e-05, + "loss": 4.9794, + "step": 35317 + }, + { + "epoch": 0.21004615091826054, + "grad_norm": 1.5779093503952026, + "learning_rate": 4.475193984503717e-05, + "loss": 5.1123, + "step": 35318 + }, + { + "epoch": 0.21005209820154153, + "grad_norm": 1.647068738937378, + "learning_rate": 4.475165350698793e-05, + "loss": 5.0647, + "step": 35319 + }, + { + "epoch": 0.21005804548482254, + "grad_norm": 1.9030619859695435, + "learning_rate": 4.475136716204359e-05, + "loss": 4.638, + "step": 35320 + }, + { + "epoch": 0.21006399276810353, + "grad_norm": 1.9144114255905151, + "learning_rate": 4.4751080810204244e-05, + "loss": 4.8637, + "step": 35321 + }, + { + "epoch": 0.21006994005138452, + "grad_norm": 2.0055301189422607, + "learning_rate": 4.475079445147e-05, + "loss": 4.7652, + "step": 35322 + }, + { + "epoch": 0.21007588733466553, + "grad_norm": 2.0406198501586914, + "learning_rate": 4.475050808584097e-05, + "loss": 4.5856, + "step": 35323 + }, + { + "epoch": 0.21008183461794652, + "grad_norm": 1.781522274017334, + "learning_rate": 4.475022171331723e-05, + "loss": 5.1992, + "step": 35324 + }, + { + "epoch": 0.2100877819012275, + "grad_norm": 1.5984790325164795, + "learning_rate": 4.4749935333898896e-05, + "loss": 5.1308, + "step": 35325 + }, + { + "epoch": 0.21009372918450853, + "grad_norm": 1.5800871849060059, + "learning_rate": 4.4749648947586065e-05, + "loss": 5.1936, + "step": 35326 + }, + { + "epoch": 0.21009967646778951, + "grad_norm": 1.6657015085220337, + "learning_rate": 4.4749362554378834e-05, + "loss": 5.1147, + "step": 35327 + }, + { + "epoch": 0.2101056237510705, + "grad_norm": 1.6115208864212036, + "learning_rate": 4.47490761542773e-05, + "loss": 5.0284, + "step": 35328 + }, + { + "epoch": 0.21011157103435152, + "grad_norm": 1.7555902004241943, + "learning_rate": 4.4748789747281574e-05, + "loss": 5.0457, + "step": 35329 + }, + { + "epoch": 0.2101175183176325, + "grad_norm": 1.597569227218628, + "learning_rate": 4.474850333339174e-05, + "loss": 4.39, + "step": 35330 + }, + { + "epoch": 0.2101234656009135, + "grad_norm": 1.4470419883728027, + "learning_rate": 4.4748216912607924e-05, + "loss": 4.7876, + "step": 35331 + }, + { + "epoch": 0.2101294128841945, + "grad_norm": 1.2878272533416748, + "learning_rate": 4.47479304849302e-05, + "loss": 5.1986, + "step": 35332 + }, + { + "epoch": 0.2101353601674755, + "grad_norm": 1.3939337730407715, + "learning_rate": 4.4747644050358676e-05, + "loss": 4.9916, + "step": 35333 + }, + { + "epoch": 0.2101413074507565, + "grad_norm": 1.404179573059082, + "learning_rate": 4.4747357608893456e-05, + "loss": 4.8303, + "step": 35334 + }, + { + "epoch": 0.2101472547340375, + "grad_norm": 1.5711162090301514, + "learning_rate": 4.4747071160534635e-05, + "loss": 4.7675, + "step": 35335 + }, + { + "epoch": 0.2101532020173185, + "grad_norm": 1.5045690536499023, + "learning_rate": 4.474678470528232e-05, + "loss": 4.7171, + "step": 35336 + }, + { + "epoch": 0.21015914930059948, + "grad_norm": 1.6560767889022827, + "learning_rate": 4.47464982431366e-05, + "loss": 4.7026, + "step": 35337 + }, + { + "epoch": 0.2101650965838805, + "grad_norm": 1.5439120531082153, + "learning_rate": 4.474621177409759e-05, + "loss": 4.4902, + "step": 35338 + }, + { + "epoch": 0.21017104386716148, + "grad_norm": 1.718030571937561, + "learning_rate": 4.474592529816538e-05, + "loss": 4.6819, + "step": 35339 + }, + { + "epoch": 0.21017699115044247, + "grad_norm": 1.6997952461242676, + "learning_rate": 4.474563881534006e-05, + "loss": 4.5594, + "step": 35340 + }, + { + "epoch": 0.2101829384337235, + "grad_norm": 1.8977982997894287, + "learning_rate": 4.474535232562176e-05, + "loss": 5.0617, + "step": 35341 + }, + { + "epoch": 0.21018888571700448, + "grad_norm": 1.6481338739395142, + "learning_rate": 4.474506582901054e-05, + "loss": 4.8686, + "step": 35342 + }, + { + "epoch": 0.21019483300028546, + "grad_norm": 1.7469749450683594, + "learning_rate": 4.474477932550654e-05, + "loss": 5.0062, + "step": 35343 + }, + { + "epoch": 0.21020078028356648, + "grad_norm": 1.6420084238052368, + "learning_rate": 4.474449281510984e-05, + "loss": 5.0616, + "step": 35344 + }, + { + "epoch": 0.21020672756684747, + "grad_norm": 1.5826870203018188, + "learning_rate": 4.474420629782053e-05, + "loss": 4.7822, + "step": 35345 + }, + { + "epoch": 0.21021267485012846, + "grad_norm": 1.6704856157302856, + "learning_rate": 4.4743919773638724e-05, + "loss": 4.6443, + "step": 35346 + }, + { + "epoch": 0.21021862213340947, + "grad_norm": 1.573956847190857, + "learning_rate": 4.474363324256453e-05, + "loss": 4.5345, + "step": 35347 + }, + { + "epoch": 0.21022456941669046, + "grad_norm": 1.736695408821106, + "learning_rate": 4.474334670459802e-05, + "loss": 4.7278, + "step": 35348 + }, + { + "epoch": 0.21023051669997145, + "grad_norm": 1.8054871559143066, + "learning_rate": 4.474306015973933e-05, + "loss": 4.8814, + "step": 35349 + }, + { + "epoch": 0.21023646398325246, + "grad_norm": 1.9570103883743286, + "learning_rate": 4.474277360798853e-05, + "loss": 4.4891, + "step": 35350 + }, + { + "epoch": 0.21024241126653345, + "grad_norm": 1.7210701704025269, + "learning_rate": 4.474248704934574e-05, + "loss": 5.5031, + "step": 35351 + }, + { + "epoch": 0.21024835854981444, + "grad_norm": 1.6097183227539062, + "learning_rate": 4.474220048381105e-05, + "loss": 5.2063, + "step": 35352 + }, + { + "epoch": 0.21025430583309543, + "grad_norm": 1.2283453941345215, + "learning_rate": 4.4741913911384556e-05, + "loss": 5.4689, + "step": 35353 + }, + { + "epoch": 0.21026025311637644, + "grad_norm": 1.5359746217727661, + "learning_rate": 4.4741627332066364e-05, + "loss": 5.0208, + "step": 35354 + }, + { + "epoch": 0.21026620039965743, + "grad_norm": 1.7506155967712402, + "learning_rate": 4.474134074585658e-05, + "loss": 4.8214, + "step": 35355 + }, + { + "epoch": 0.21027214768293842, + "grad_norm": 1.6390902996063232, + "learning_rate": 4.474105415275529e-05, + "loss": 4.8685, + "step": 35356 + }, + { + "epoch": 0.21027809496621944, + "grad_norm": 1.7952314615249634, + "learning_rate": 4.474076755276261e-05, + "loss": 4.671, + "step": 35357 + }, + { + "epoch": 0.21028404224950042, + "grad_norm": 1.6228652000427246, + "learning_rate": 4.4740480945878624e-05, + "loss": 5.0547, + "step": 35358 + }, + { + "epoch": 0.2102899895327814, + "grad_norm": 1.460041880607605, + "learning_rate": 4.4740194332103444e-05, + "loss": 5.0881, + "step": 35359 + }, + { + "epoch": 0.21029593681606243, + "grad_norm": 1.5461219549179077, + "learning_rate": 4.4739907711437176e-05, + "loss": 4.7493, + "step": 35360 + }, + { + "epoch": 0.21030188409934342, + "grad_norm": 1.6176092624664307, + "learning_rate": 4.4739621083879896e-05, + "loss": 4.797, + "step": 35361 + }, + { + "epoch": 0.2103078313826244, + "grad_norm": 1.599272608757019, + "learning_rate": 4.4739334449431725e-05, + "loss": 4.8203, + "step": 35362 + }, + { + "epoch": 0.21031377866590542, + "grad_norm": 1.6934388875961304, + "learning_rate": 4.473904780809276e-05, + "loss": 4.8852, + "step": 35363 + }, + { + "epoch": 0.2103197259491864, + "grad_norm": 1.7300370931625366, + "learning_rate": 4.473876115986308e-05, + "loss": 4.8187, + "step": 35364 + }, + { + "epoch": 0.2103256732324674, + "grad_norm": 1.8484524488449097, + "learning_rate": 4.473847450474282e-05, + "loss": 4.8067, + "step": 35365 + }, + { + "epoch": 0.2103316205157484, + "grad_norm": 1.6827832460403442, + "learning_rate": 4.473818784273206e-05, + "loss": 4.8757, + "step": 35366 + }, + { + "epoch": 0.2103375677990294, + "grad_norm": 2.2498104572296143, + "learning_rate": 4.47379011738309e-05, + "loss": 3.5606, + "step": 35367 + }, + { + "epoch": 0.2103435150823104, + "grad_norm": 2.074948787689209, + "learning_rate": 4.473761449803944e-05, + "loss": 4.1696, + "step": 35368 + }, + { + "epoch": 0.2103494623655914, + "grad_norm": 1.5519061088562012, + "learning_rate": 4.473732781535778e-05, + "loss": 4.9684, + "step": 35369 + }, + { + "epoch": 0.2103554096488724, + "grad_norm": 1.718672752380371, + "learning_rate": 4.473704112578603e-05, + "loss": 5.1141, + "step": 35370 + }, + { + "epoch": 0.21036135693215338, + "grad_norm": 2.0321526527404785, + "learning_rate": 4.4736754429324276e-05, + "loss": 5.4429, + "step": 35371 + }, + { + "epoch": 0.2103673042154344, + "grad_norm": 1.5147876739501953, + "learning_rate": 4.473646772597263e-05, + "loss": 5.0556, + "step": 35372 + }, + { + "epoch": 0.21037325149871539, + "grad_norm": 1.7505944967269897, + "learning_rate": 4.4736181015731186e-05, + "loss": 4.3422, + "step": 35373 + }, + { + "epoch": 0.21037919878199637, + "grad_norm": 1.6300780773162842, + "learning_rate": 4.473589429860004e-05, + "loss": 4.8597, + "step": 35374 + }, + { + "epoch": 0.2103851460652774, + "grad_norm": 1.5492527484893799, + "learning_rate": 4.473560757457931e-05, + "loss": 4.9378, + "step": 35375 + }, + { + "epoch": 0.21039109334855838, + "grad_norm": 1.7544493675231934, + "learning_rate": 4.4735320843669074e-05, + "loss": 4.8018, + "step": 35376 + }, + { + "epoch": 0.21039704063183937, + "grad_norm": 1.588906168937683, + "learning_rate": 4.4735034105869446e-05, + "loss": 4.9619, + "step": 35377 + }, + { + "epoch": 0.21040298791512038, + "grad_norm": 1.677171230316162, + "learning_rate": 4.473474736118052e-05, + "loss": 4.8541, + "step": 35378 + }, + { + "epoch": 0.21040893519840137, + "grad_norm": 1.5742454528808594, + "learning_rate": 4.473446060960239e-05, + "loss": 4.8147, + "step": 35379 + }, + { + "epoch": 0.21041488248168236, + "grad_norm": 1.5566039085388184, + "learning_rate": 4.473417385113518e-05, + "loss": 4.7967, + "step": 35380 + }, + { + "epoch": 0.21042082976496337, + "grad_norm": 1.377108097076416, + "learning_rate": 4.473388708577896e-05, + "loss": 4.7101, + "step": 35381 + }, + { + "epoch": 0.21042677704824436, + "grad_norm": 1.8263981342315674, + "learning_rate": 4.473360031353384e-05, + "loss": 4.6103, + "step": 35382 + }, + { + "epoch": 0.21043272433152535, + "grad_norm": 1.547569990158081, + "learning_rate": 4.4733313534399934e-05, + "loss": 4.7909, + "step": 35383 + }, + { + "epoch": 0.21043867161480637, + "grad_norm": 1.49032461643219, + "learning_rate": 4.473302674837733e-05, + "loss": 4.7885, + "step": 35384 + }, + { + "epoch": 0.21044461889808735, + "grad_norm": 1.6592745780944824, + "learning_rate": 4.473273995546613e-05, + "loss": 4.8221, + "step": 35385 + }, + { + "epoch": 0.21045056618136834, + "grad_norm": 1.3907108306884766, + "learning_rate": 4.473245315566644e-05, + "loss": 4.8192, + "step": 35386 + }, + { + "epoch": 0.21045651346464936, + "grad_norm": 1.4064911603927612, + "learning_rate": 4.4732166348978345e-05, + "loss": 4.6388, + "step": 35387 + }, + { + "epoch": 0.21046246074793035, + "grad_norm": 1.567564845085144, + "learning_rate": 4.473187953540196e-05, + "loss": 4.7417, + "step": 35388 + }, + { + "epoch": 0.21046840803121133, + "grad_norm": 1.6142017841339111, + "learning_rate": 4.4731592714937375e-05, + "loss": 4.8069, + "step": 35389 + }, + { + "epoch": 0.21047435531449235, + "grad_norm": 1.7662934064865112, + "learning_rate": 4.4731305887584694e-05, + "loss": 4.7536, + "step": 35390 + }, + { + "epoch": 0.21048030259777334, + "grad_norm": 1.4565373659133911, + "learning_rate": 4.473101905334403e-05, + "loss": 4.6508, + "step": 35391 + }, + { + "epoch": 0.21048624988105433, + "grad_norm": 1.4265527725219727, + "learning_rate": 4.473073221221546e-05, + "loss": 4.7285, + "step": 35392 + }, + { + "epoch": 0.21049219716433534, + "grad_norm": 1.4948612451553345, + "learning_rate": 4.47304453641991e-05, + "loss": 4.6359, + "step": 35393 + }, + { + "epoch": 0.21049814444761633, + "grad_norm": 1.4874624013900757, + "learning_rate": 4.473015850929504e-05, + "loss": 4.6045, + "step": 35394 + }, + { + "epoch": 0.21050409173089732, + "grad_norm": 1.5556808710098267, + "learning_rate": 4.4729871647503394e-05, + "loss": 4.6039, + "step": 35395 + }, + { + "epoch": 0.21051003901417834, + "grad_norm": 1.529366374015808, + "learning_rate": 4.4729584778824246e-05, + "loss": 4.6944, + "step": 35396 + }, + { + "epoch": 0.21051598629745932, + "grad_norm": 1.514224648475647, + "learning_rate": 4.4729297903257704e-05, + "loss": 4.6967, + "step": 35397 + }, + { + "epoch": 0.2105219335807403, + "grad_norm": 1.601287841796875, + "learning_rate": 4.4729011020803876e-05, + "loss": 4.8392, + "step": 35398 + }, + { + "epoch": 0.21052788086402133, + "grad_norm": 1.6562103033065796, + "learning_rate": 4.4728724131462844e-05, + "loss": 4.5464, + "step": 35399 + }, + { + "epoch": 0.21053382814730232, + "grad_norm": 1.7306544780731201, + "learning_rate": 4.472843723523472e-05, + "loss": 4.7439, + "step": 35400 + }, + { + "epoch": 0.2105397754305833, + "grad_norm": 2.2626922130584717, + "learning_rate": 4.472815033211961e-05, + "loss": 3.9945, + "step": 35401 + }, + { + "epoch": 0.21054572271386432, + "grad_norm": 1.5204706192016602, + "learning_rate": 4.4727863422117597e-05, + "loss": 5.2211, + "step": 35402 + }, + { + "epoch": 0.2105516699971453, + "grad_norm": 2.4926559925079346, + "learning_rate": 4.4727576505228796e-05, + "loss": 4.4308, + "step": 35403 + }, + { + "epoch": 0.2105576172804263, + "grad_norm": 2.4240057468414307, + "learning_rate": 4.47272895814533e-05, + "loss": 4.5483, + "step": 35404 + }, + { + "epoch": 0.2105635645637073, + "grad_norm": 2.342590570449829, + "learning_rate": 4.4727002650791215e-05, + "loss": 4.4737, + "step": 35405 + }, + { + "epoch": 0.2105695118469883, + "grad_norm": 2.181455373764038, + "learning_rate": 4.472671571324264e-05, + "loss": 4.3658, + "step": 35406 + }, + { + "epoch": 0.2105754591302693, + "grad_norm": 1.8659454584121704, + "learning_rate": 4.472642876880766e-05, + "loss": 4.3592, + "step": 35407 + }, + { + "epoch": 0.2105814064135503, + "grad_norm": 2.0775270462036133, + "learning_rate": 4.47261418174864e-05, + "loss": 4.5404, + "step": 35408 + }, + { + "epoch": 0.2105873536968313, + "grad_norm": 2.4261813163757324, + "learning_rate": 4.4725854859278935e-05, + "loss": 4.5228, + "step": 35409 + }, + { + "epoch": 0.21059330098011228, + "grad_norm": 1.8874660730361938, + "learning_rate": 4.472556789418539e-05, + "loss": 4.68, + "step": 35410 + }, + { + "epoch": 0.21059924826339327, + "grad_norm": 1.7671655416488647, + "learning_rate": 4.472528092220585e-05, + "loss": 4.9307, + "step": 35411 + }, + { + "epoch": 0.21060519554667428, + "grad_norm": 2.424664258956909, + "learning_rate": 4.472499394334041e-05, + "loss": 3.6838, + "step": 35412 + }, + { + "epoch": 0.21061114282995527, + "grad_norm": 2.5734341144561768, + "learning_rate": 4.472470695758919e-05, + "loss": 3.6457, + "step": 35413 + }, + { + "epoch": 0.21061709011323626, + "grad_norm": 2.7775492668151855, + "learning_rate": 4.4724419964952267e-05, + "loss": 3.6932, + "step": 35414 + }, + { + "epoch": 0.21062303739651728, + "grad_norm": 2.4683339595794678, + "learning_rate": 4.4724132965429764e-05, + "loss": 3.3679, + "step": 35415 + }, + { + "epoch": 0.21062898467979826, + "grad_norm": 2.478834629058838, + "learning_rate": 4.472384595902176e-05, + "loss": 3.5933, + "step": 35416 + }, + { + "epoch": 0.21063493196307925, + "grad_norm": 2.095451831817627, + "learning_rate": 4.472355894572837e-05, + "loss": 4.1527, + "step": 35417 + }, + { + "epoch": 0.21064087924636027, + "grad_norm": 1.720664381980896, + "learning_rate": 4.472327192554969e-05, + "loss": 5.2839, + "step": 35418 + }, + { + "epoch": 0.21064682652964126, + "grad_norm": 1.7128255367279053, + "learning_rate": 4.472298489848582e-05, + "loss": 5.2786, + "step": 35419 + }, + { + "epoch": 0.21065277381292224, + "grad_norm": 2.405431032180786, + "learning_rate": 4.472269786453686e-05, + "loss": 4.0237, + "step": 35420 + }, + { + "epoch": 0.21065872109620326, + "grad_norm": 2.2845146656036377, + "learning_rate": 4.472241082370291e-05, + "loss": 4.6355, + "step": 35421 + }, + { + "epoch": 0.21066466837948425, + "grad_norm": 1.5968048572540283, + "learning_rate": 4.4722123775984074e-05, + "loss": 5.0121, + "step": 35422 + }, + { + "epoch": 0.21067061566276524, + "grad_norm": 1.874489188194275, + "learning_rate": 4.472183672138044e-05, + "loss": 4.9313, + "step": 35423 + }, + { + "epoch": 0.21067656294604625, + "grad_norm": 1.8483302593231201, + "learning_rate": 4.472154965989211e-05, + "loss": 4.9481, + "step": 35424 + }, + { + "epoch": 0.21068251022932724, + "grad_norm": 1.7580935955047607, + "learning_rate": 4.47212625915192e-05, + "loss": 5.0041, + "step": 35425 + }, + { + "epoch": 0.21068845751260823, + "grad_norm": 2.090477705001831, + "learning_rate": 4.472097551626181e-05, + "loss": 4.5731, + "step": 35426 + }, + { + "epoch": 0.21069440479588925, + "grad_norm": 1.7280174493789673, + "learning_rate": 4.472068843412002e-05, + "loss": 4.7637, + "step": 35427 + }, + { + "epoch": 0.21070035207917023, + "grad_norm": 1.6075327396392822, + "learning_rate": 4.4720401345093944e-05, + "loss": 5.6445, + "step": 35428 + }, + { + "epoch": 0.21070629936245122, + "grad_norm": 1.6671342849731445, + "learning_rate": 4.472011424918367e-05, + "loss": 5.1241, + "step": 35429 + }, + { + "epoch": 0.21071224664573224, + "grad_norm": 1.8082268238067627, + "learning_rate": 4.471982714638933e-05, + "loss": 4.5997, + "step": 35430 + }, + { + "epoch": 0.21071819392901323, + "grad_norm": 1.6478972434997559, + "learning_rate": 4.4719540036710984e-05, + "loss": 4.8913, + "step": 35431 + }, + { + "epoch": 0.2107241412122942, + "grad_norm": 1.7590205669403076, + "learning_rate": 4.4719252920148756e-05, + "loss": 5.0692, + "step": 35432 + }, + { + "epoch": 0.21073008849557523, + "grad_norm": 1.618296504020691, + "learning_rate": 4.471896579670274e-05, + "loss": 5.036, + "step": 35433 + }, + { + "epoch": 0.21073603577885622, + "grad_norm": 1.7069618701934814, + "learning_rate": 4.471867866637304e-05, + "loss": 4.7965, + "step": 35434 + }, + { + "epoch": 0.2107419830621372, + "grad_norm": 1.7504613399505615, + "learning_rate": 4.471839152915975e-05, + "loss": 4.7929, + "step": 35435 + }, + { + "epoch": 0.21074793034541822, + "grad_norm": 2.9193410873413086, + "learning_rate": 4.471810438506297e-05, + "loss": 3.469, + "step": 35436 + }, + { + "epoch": 0.2107538776286992, + "grad_norm": 3.223361015319824, + "learning_rate": 4.471781723408281e-05, + "loss": 3.3836, + "step": 35437 + }, + { + "epoch": 0.2107598249119802, + "grad_norm": 2.4114229679107666, + "learning_rate": 4.471753007621936e-05, + "loss": 3.9834, + "step": 35438 + }, + { + "epoch": 0.21076577219526121, + "grad_norm": 1.8739566802978516, + "learning_rate": 4.471724291147272e-05, + "loss": 4.1878, + "step": 35439 + }, + { + "epoch": 0.2107717194785422, + "grad_norm": 1.6680757999420166, + "learning_rate": 4.4716955739843004e-05, + "loss": 4.4736, + "step": 35440 + }, + { + "epoch": 0.2107776667618232, + "grad_norm": 1.5248615741729736, + "learning_rate": 4.47166685613303e-05, + "loss": 5.1447, + "step": 35441 + }, + { + "epoch": 0.2107836140451042, + "grad_norm": 1.8997430801391602, + "learning_rate": 4.47163813759347e-05, + "loss": 5.1902, + "step": 35442 + }, + { + "epoch": 0.2107895613283852, + "grad_norm": 1.822595477104187, + "learning_rate": 4.471609418365632e-05, + "loss": 5.119, + "step": 35443 + }, + { + "epoch": 0.21079550861166618, + "grad_norm": 1.5157179832458496, + "learning_rate": 4.471580698449526e-05, + "loss": 5.9801, + "step": 35444 + }, + { + "epoch": 0.2108014558949472, + "grad_norm": 1.7875192165374756, + "learning_rate": 4.471551977845162e-05, + "loss": 5.1399, + "step": 35445 + }, + { + "epoch": 0.2108074031782282, + "grad_norm": 1.8765265941619873, + "learning_rate": 4.471523256552549e-05, + "loss": 5.2492, + "step": 35446 + }, + { + "epoch": 0.21081335046150917, + "grad_norm": 1.514116644859314, + "learning_rate": 4.4714945345716976e-05, + "loss": 5.1928, + "step": 35447 + }, + { + "epoch": 0.2108192977447902, + "grad_norm": 1.7573895454406738, + "learning_rate": 4.471465811902617e-05, + "loss": 5.1439, + "step": 35448 + }, + { + "epoch": 0.21082524502807118, + "grad_norm": 1.641224980354309, + "learning_rate": 4.471437088545319e-05, + "loss": 4.8997, + "step": 35449 + }, + { + "epoch": 0.21083119231135217, + "grad_norm": 4.0163116455078125, + "learning_rate": 4.4714083644998126e-05, + "loss": 3.5777, + "step": 35450 + }, + { + "epoch": 0.21083713959463318, + "grad_norm": 3.2716312408447266, + "learning_rate": 4.471379639766108e-05, + "loss": 3.2344, + "step": 35451 + }, + { + "epoch": 0.21084308687791417, + "grad_norm": 2.3019282817840576, + "learning_rate": 4.4713509143442146e-05, + "loss": 4.5853, + "step": 35452 + }, + { + "epoch": 0.21084903416119516, + "grad_norm": 1.856331467628479, + "learning_rate": 4.4713221882341436e-05, + "loss": 4.6279, + "step": 35453 + }, + { + "epoch": 0.21085498144447617, + "grad_norm": 2.480015277862549, + "learning_rate": 4.471293461435904e-05, + "loss": 3.1665, + "step": 35454 + }, + { + "epoch": 0.21086092872775716, + "grad_norm": 2.5631415843963623, + "learning_rate": 4.471264733949506e-05, + "loss": 3.739, + "step": 35455 + }, + { + "epoch": 0.21086687601103815, + "grad_norm": 2.8379833698272705, + "learning_rate": 4.47123600577496e-05, + "loss": 2.5041, + "step": 35456 + }, + { + "epoch": 0.21087282329431917, + "grad_norm": 2.9057741165161133, + "learning_rate": 4.471207276912276e-05, + "loss": 4.1469, + "step": 35457 + }, + { + "epoch": 0.21087877057760016, + "grad_norm": 1.9106336832046509, + "learning_rate": 4.4711785473614644e-05, + "loss": 5.3251, + "step": 35458 + }, + { + "epoch": 0.21088471786088114, + "grad_norm": 1.7996464967727661, + "learning_rate": 4.471149817122534e-05, + "loss": 5.5621, + "step": 35459 + }, + { + "epoch": 0.21089066514416216, + "grad_norm": 2.341482162475586, + "learning_rate": 4.471121086195496e-05, + "loss": 4.8165, + "step": 35460 + }, + { + "epoch": 0.21089661242744315, + "grad_norm": 2.984218120574951, + "learning_rate": 4.47109235458036e-05, + "loss": 2.3554, + "step": 35461 + }, + { + "epoch": 0.21090255971072414, + "grad_norm": 3.3478970527648926, + "learning_rate": 4.471063622277135e-05, + "loss": 2.5034, + "step": 35462 + }, + { + "epoch": 0.21090850699400515, + "grad_norm": 2.904313325881958, + "learning_rate": 4.4710348892858333e-05, + "loss": 3.3472, + "step": 35463 + }, + { + "epoch": 0.21091445427728614, + "grad_norm": 1.8072670698165894, + "learning_rate": 4.471006155606463e-05, + "loss": 4.8444, + "step": 35464 + }, + { + "epoch": 0.21092040156056713, + "grad_norm": 3.2260665893554688, + "learning_rate": 4.470977421239035e-05, + "loss": 3.1718, + "step": 35465 + }, + { + "epoch": 0.21092634884384814, + "grad_norm": 1.743060827255249, + "learning_rate": 4.470948686183559e-05, + "loss": 5.0387, + "step": 35466 + }, + { + "epoch": 0.21093229612712913, + "grad_norm": 1.634989857673645, + "learning_rate": 4.4709199504400456e-05, + "loss": 4.993, + "step": 35467 + }, + { + "epoch": 0.21093824341041012, + "grad_norm": 1.6594475507736206, + "learning_rate": 4.470891214008505e-05, + "loss": 4.5849, + "step": 35468 + }, + { + "epoch": 0.2109441906936911, + "grad_norm": 1.6074466705322266, + "learning_rate": 4.470862476888945e-05, + "loss": 4.996, + "step": 35469 + }, + { + "epoch": 0.21095013797697212, + "grad_norm": 1.9464056491851807, + "learning_rate": 4.470833739081378e-05, + "loss": 4.5604, + "step": 35470 + }, + { + "epoch": 0.2109560852602531, + "grad_norm": 1.9392175674438477, + "learning_rate": 4.470805000585814e-05, + "loss": 4.8605, + "step": 35471 + }, + { + "epoch": 0.2109620325435341, + "grad_norm": 1.7574516534805298, + "learning_rate": 4.470776261402262e-05, + "loss": 4.8513, + "step": 35472 + }, + { + "epoch": 0.21096797982681512, + "grad_norm": 1.8646680116653442, + "learning_rate": 4.4707475215307315e-05, + "loss": 4.8125, + "step": 35473 + }, + { + "epoch": 0.2109739271100961, + "grad_norm": 1.7200084924697876, + "learning_rate": 4.4707187809712346e-05, + "loss": 4.7822, + "step": 35474 + }, + { + "epoch": 0.2109798743933771, + "grad_norm": 1.8245753049850464, + "learning_rate": 4.4706900397237795e-05, + "loss": 4.4128, + "step": 35475 + }, + { + "epoch": 0.2109858216766581, + "grad_norm": 1.5685904026031494, + "learning_rate": 4.4706612977883765e-05, + "loss": 5.1459, + "step": 35476 + }, + { + "epoch": 0.2109917689599391, + "grad_norm": 1.5470824241638184, + "learning_rate": 4.4706325551650364e-05, + "loss": 4.6915, + "step": 35477 + }, + { + "epoch": 0.21099771624322008, + "grad_norm": 1.3199049234390259, + "learning_rate": 4.470603811853769e-05, + "loss": 4.6157, + "step": 35478 + }, + { + "epoch": 0.2110036635265011, + "grad_norm": 1.665404200553894, + "learning_rate": 4.470575067854584e-05, + "loss": 4.8574, + "step": 35479 + }, + { + "epoch": 0.2110096108097821, + "grad_norm": 1.8050642013549805, + "learning_rate": 4.4705463231674915e-05, + "loss": 4.7309, + "step": 35480 + }, + { + "epoch": 0.21101555809306308, + "grad_norm": 1.8453636169433594, + "learning_rate": 4.4705175777925025e-05, + "loss": 4.6349, + "step": 35481 + }, + { + "epoch": 0.2110215053763441, + "grad_norm": 1.7850289344787598, + "learning_rate": 4.470488831729625e-05, + "loss": 4.6913, + "step": 35482 + }, + { + "epoch": 0.21102745265962508, + "grad_norm": 1.808980107307434, + "learning_rate": 4.4704600849788703e-05, + "loss": 4.6751, + "step": 35483 + }, + { + "epoch": 0.21103339994290607, + "grad_norm": 1.6603264808654785, + "learning_rate": 4.470431337540249e-05, + "loss": 4.5178, + "step": 35484 + }, + { + "epoch": 0.21103934722618709, + "grad_norm": 1.672696590423584, + "learning_rate": 4.47040258941377e-05, + "loss": 5.1297, + "step": 35485 + }, + { + "epoch": 0.21104529450946807, + "grad_norm": 1.8498941659927368, + "learning_rate": 4.4703738405994446e-05, + "loss": 4.4831, + "step": 35486 + }, + { + "epoch": 0.21105124179274906, + "grad_norm": 2.02712345123291, + "learning_rate": 4.470345091097281e-05, + "loss": 5.1251, + "step": 35487 + }, + { + "epoch": 0.21105718907603008, + "grad_norm": 1.5441256761550903, + "learning_rate": 4.470316340907291e-05, + "loss": 5.1557, + "step": 35488 + }, + { + "epoch": 0.21106313635931107, + "grad_norm": 1.5917513370513916, + "learning_rate": 4.470287590029483e-05, + "loss": 4.7467, + "step": 35489 + }, + { + "epoch": 0.21106908364259205, + "grad_norm": 1.693744421005249, + "learning_rate": 4.4702588384638686e-05, + "loss": 5.2251, + "step": 35490 + }, + { + "epoch": 0.21107503092587307, + "grad_norm": 1.5168530941009521, + "learning_rate": 4.470230086210457e-05, + "loss": 5.1613, + "step": 35491 + }, + { + "epoch": 0.21108097820915406, + "grad_norm": 1.5303258895874023, + "learning_rate": 4.4702013332692596e-05, + "loss": 5.0635, + "step": 35492 + }, + { + "epoch": 0.21108692549243505, + "grad_norm": 1.5826016664505005, + "learning_rate": 4.470172579640284e-05, + "loss": 4.6436, + "step": 35493 + }, + { + "epoch": 0.21109287277571606, + "grad_norm": 1.8279229402542114, + "learning_rate": 4.470143825323542e-05, + "loss": 4.772, + "step": 35494 + }, + { + "epoch": 0.21109882005899705, + "grad_norm": 1.7597702741622925, + "learning_rate": 4.4701150703190423e-05, + "loss": 5.0296, + "step": 35495 + }, + { + "epoch": 0.21110476734227804, + "grad_norm": 1.5488911867141724, + "learning_rate": 4.470086314626797e-05, + "loss": 4.9258, + "step": 35496 + }, + { + "epoch": 0.21111071462555905, + "grad_norm": 1.6315878629684448, + "learning_rate": 4.470057558246814e-05, + "loss": 4.5243, + "step": 35497 + }, + { + "epoch": 0.21111666190884004, + "grad_norm": 2.2954094409942627, + "learning_rate": 4.470028801179105e-05, + "loss": 4.2517, + "step": 35498 + }, + { + "epoch": 0.21112260919212103, + "grad_norm": 2.7295327186584473, + "learning_rate": 4.470000043423679e-05, + "loss": 4.2955, + "step": 35499 + }, + { + "epoch": 0.21112855647540205, + "grad_norm": 2.555826425552368, + "learning_rate": 4.469971284980546e-05, + "loss": 4.7908, + "step": 35500 + }, + { + "epoch": 0.21113450375868303, + "grad_norm": 1.5622174739837646, + "learning_rate": 4.4699425258497165e-05, + "loss": 5.0972, + "step": 35501 + }, + { + "epoch": 0.21114045104196402, + "grad_norm": 1.6253665685653687, + "learning_rate": 4.469913766031201e-05, + "loss": 4.5942, + "step": 35502 + }, + { + "epoch": 0.21114639832524504, + "grad_norm": 1.5531213283538818, + "learning_rate": 4.469885005525008e-05, + "loss": 4.9864, + "step": 35503 + }, + { + "epoch": 0.21115234560852603, + "grad_norm": 1.544110655784607, + "learning_rate": 4.4698562443311487e-05, + "loss": 4.7724, + "step": 35504 + }, + { + "epoch": 0.21115829289180701, + "grad_norm": 1.592146396636963, + "learning_rate": 4.4698274824496335e-05, + "loss": 4.9874, + "step": 35505 + }, + { + "epoch": 0.21116424017508803, + "grad_norm": 1.7738112211227417, + "learning_rate": 4.4697987198804713e-05, + "loss": 5.1586, + "step": 35506 + }, + { + "epoch": 0.21117018745836902, + "grad_norm": 1.7359950542449951, + "learning_rate": 4.4697699566236736e-05, + "loss": 4.6817, + "step": 35507 + }, + { + "epoch": 0.21117613474165, + "grad_norm": 1.5513485670089722, + "learning_rate": 4.469741192679249e-05, + "loss": 5.0546, + "step": 35508 + }, + { + "epoch": 0.21118208202493102, + "grad_norm": 1.6848827600479126, + "learning_rate": 4.469712428047208e-05, + "loss": 4.8273, + "step": 35509 + }, + { + "epoch": 0.211188029308212, + "grad_norm": 1.9015002250671387, + "learning_rate": 4.469683662727561e-05, + "loss": 4.5408, + "step": 35510 + }, + { + "epoch": 0.211193976591493, + "grad_norm": 1.6639163494110107, + "learning_rate": 4.469654896720317e-05, + "loss": 5.4344, + "step": 35511 + }, + { + "epoch": 0.21119992387477401, + "grad_norm": 1.6011817455291748, + "learning_rate": 4.469626130025488e-05, + "loss": 4.8481, + "step": 35512 + }, + { + "epoch": 0.211205871158055, + "grad_norm": 1.4600160121917725, + "learning_rate": 4.469597362643082e-05, + "loss": 4.9088, + "step": 35513 + }, + { + "epoch": 0.211211818441336, + "grad_norm": 1.4780898094177246, + "learning_rate": 4.46956859457311e-05, + "loss": 5.0465, + "step": 35514 + }, + { + "epoch": 0.211217765724617, + "grad_norm": 1.4310654401779175, + "learning_rate": 4.469539825815582e-05, + "loss": 4.8735, + "step": 35515 + }, + { + "epoch": 0.211223713007898, + "grad_norm": 1.7487471103668213, + "learning_rate": 4.469511056370508e-05, + "loss": 5.0721, + "step": 35516 + }, + { + "epoch": 0.21122966029117898, + "grad_norm": 1.745934009552002, + "learning_rate": 4.469482286237898e-05, + "loss": 5.0724, + "step": 35517 + }, + { + "epoch": 0.21123560757446, + "grad_norm": 1.542649745941162, + "learning_rate": 4.469453515417763e-05, + "loss": 5.294, + "step": 35518 + }, + { + "epoch": 0.211241554857741, + "grad_norm": 1.6778455972671509, + "learning_rate": 4.469424743910111e-05, + "loss": 4.7619, + "step": 35519 + }, + { + "epoch": 0.21124750214102198, + "grad_norm": 1.6462548971176147, + "learning_rate": 4.4693959717149536e-05, + "loss": 4.7533, + "step": 35520 + }, + { + "epoch": 0.211253449424303, + "grad_norm": 1.833646297454834, + "learning_rate": 4.4693671988323006e-05, + "loss": 4.3012, + "step": 35521 + }, + { + "epoch": 0.21125939670758398, + "grad_norm": 1.5945441722869873, + "learning_rate": 4.469338425262162e-05, + "loss": 4.9917, + "step": 35522 + }, + { + "epoch": 0.21126534399086497, + "grad_norm": 2.1458442211151123, + "learning_rate": 4.469309651004547e-05, + "loss": 4.2472, + "step": 35523 + }, + { + "epoch": 0.21127129127414598, + "grad_norm": 2.0150773525238037, + "learning_rate": 4.469280876059467e-05, + "loss": 4.101, + "step": 35524 + }, + { + "epoch": 0.21127723855742697, + "grad_norm": 1.8790959119796753, + "learning_rate": 4.469252100426931e-05, + "loss": 4.1637, + "step": 35525 + }, + { + "epoch": 0.21128318584070796, + "grad_norm": 1.9778228998184204, + "learning_rate": 4.469223324106949e-05, + "loss": 4.2768, + "step": 35526 + }, + { + "epoch": 0.21128913312398895, + "grad_norm": 2.055441379547119, + "learning_rate": 4.469194547099532e-05, + "loss": 4.1131, + "step": 35527 + }, + { + "epoch": 0.21129508040726996, + "grad_norm": 1.8175396919250488, + "learning_rate": 4.46916576940469e-05, + "loss": 4.5419, + "step": 35528 + }, + { + "epoch": 0.21130102769055095, + "grad_norm": 2.1261353492736816, + "learning_rate": 4.4691369910224315e-05, + "loss": 4.2994, + "step": 35529 + }, + { + "epoch": 0.21130697497383194, + "grad_norm": 1.6003457307815552, + "learning_rate": 4.4691082119527686e-05, + "loss": 4.2859, + "step": 35530 + }, + { + "epoch": 0.21131292225711296, + "grad_norm": 2.1611742973327637, + "learning_rate": 4.4690794321957094e-05, + "loss": 4.0373, + "step": 35531 + }, + { + "epoch": 0.21131886954039394, + "grad_norm": 1.887533187866211, + "learning_rate": 4.469050651751266e-05, + "loss": 4.3426, + "step": 35532 + }, + { + "epoch": 0.21132481682367493, + "grad_norm": 1.844598650932312, + "learning_rate": 4.469021870619447e-05, + "loss": 4.3078, + "step": 35533 + }, + { + "epoch": 0.21133076410695595, + "grad_norm": 1.7349529266357422, + "learning_rate": 4.4689930888002626e-05, + "loss": 4.4804, + "step": 35534 + }, + { + "epoch": 0.21133671139023694, + "grad_norm": 1.7875169515609741, + "learning_rate": 4.468964306293723e-05, + "loss": 4.1911, + "step": 35535 + }, + { + "epoch": 0.21134265867351792, + "grad_norm": 2.0172529220581055, + "learning_rate": 4.468935523099838e-05, + "loss": 4.3372, + "step": 35536 + }, + { + "epoch": 0.21134860595679894, + "grad_norm": 1.7885106801986694, + "learning_rate": 4.468906739218619e-05, + "loss": 4.429, + "step": 35537 + }, + { + "epoch": 0.21135455324007993, + "grad_norm": 1.7366465330123901, + "learning_rate": 4.468877954650074e-05, + "loss": 4.1948, + "step": 35538 + }, + { + "epoch": 0.21136050052336092, + "grad_norm": 1.6380743980407715, + "learning_rate": 4.468849169394215e-05, + "loss": 4.2948, + "step": 35539 + }, + { + "epoch": 0.21136644780664193, + "grad_norm": 1.6585488319396973, + "learning_rate": 4.46882038345105e-05, + "loss": 4.4046, + "step": 35540 + }, + { + "epoch": 0.21137239508992292, + "grad_norm": 1.6397299766540527, + "learning_rate": 4.468791596820591e-05, + "loss": 4.3406, + "step": 35541 + }, + { + "epoch": 0.2113783423732039, + "grad_norm": 1.8481812477111816, + "learning_rate": 4.468762809502847e-05, + "loss": 4.4447, + "step": 35542 + }, + { + "epoch": 0.21138428965648492, + "grad_norm": 1.6186330318450928, + "learning_rate": 4.468734021497828e-05, + "loss": 4.4204, + "step": 35543 + }, + { + "epoch": 0.2113902369397659, + "grad_norm": 1.7163970470428467, + "learning_rate": 4.4687052328055444e-05, + "loss": 4.1462, + "step": 35544 + }, + { + "epoch": 0.2113961842230469, + "grad_norm": 1.6585257053375244, + "learning_rate": 4.468676443426006e-05, + "loss": 4.4792, + "step": 35545 + }, + { + "epoch": 0.21140213150632792, + "grad_norm": 1.6501747369766235, + "learning_rate": 4.468647653359223e-05, + "loss": 4.3497, + "step": 35546 + }, + { + "epoch": 0.2114080787896089, + "grad_norm": 1.636633038520813, + "learning_rate": 4.468618862605205e-05, + "loss": 4.3438, + "step": 35547 + }, + { + "epoch": 0.2114140260728899, + "grad_norm": 1.6481387615203857, + "learning_rate": 4.468590071163964e-05, + "loss": 4.6326, + "step": 35548 + }, + { + "epoch": 0.2114199733561709, + "grad_norm": 1.4565008878707886, + "learning_rate": 4.468561279035508e-05, + "loss": 4.4197, + "step": 35549 + }, + { + "epoch": 0.2114259206394519, + "grad_norm": 1.7687804698944092, + "learning_rate": 4.4685324862198465e-05, + "loss": 4.0757, + "step": 35550 + }, + { + "epoch": 0.21143186792273289, + "grad_norm": 1.5998481512069702, + "learning_rate": 4.468503692716991e-05, + "loss": 4.2909, + "step": 35551 + }, + { + "epoch": 0.2114378152060139, + "grad_norm": 1.8555128574371338, + "learning_rate": 4.468474898526952e-05, + "loss": 4.6799, + "step": 35552 + }, + { + "epoch": 0.2114437624892949, + "grad_norm": 1.8145393133163452, + "learning_rate": 4.4684461036497385e-05, + "loss": 4.8618, + "step": 35553 + }, + { + "epoch": 0.21144970977257588, + "grad_norm": 1.6641209125518799, + "learning_rate": 4.4684173080853606e-05, + "loss": 4.9345, + "step": 35554 + }, + { + "epoch": 0.2114556570558569, + "grad_norm": 1.7130677700042725, + "learning_rate": 4.468388511833828e-05, + "loss": 4.4058, + "step": 35555 + }, + { + "epoch": 0.21146160433913788, + "grad_norm": 1.589738368988037, + "learning_rate": 4.4683597148951515e-05, + "loss": 4.4582, + "step": 35556 + }, + { + "epoch": 0.21146755162241887, + "grad_norm": 1.6870765686035156, + "learning_rate": 4.468330917269342e-05, + "loss": 4.2733, + "step": 35557 + }, + { + "epoch": 0.21147349890569989, + "grad_norm": 1.8612738847732544, + "learning_rate": 4.468302118956408e-05, + "loss": 4.4758, + "step": 35558 + }, + { + "epoch": 0.21147944618898087, + "grad_norm": 1.7577272653579712, + "learning_rate": 4.46827331995636e-05, + "loss": 4.2751, + "step": 35559 + }, + { + "epoch": 0.21148539347226186, + "grad_norm": 2.018022298812866, + "learning_rate": 4.468244520269208e-05, + "loss": 4.0733, + "step": 35560 + }, + { + "epoch": 0.21149134075554288, + "grad_norm": 2.8763539791107178, + "learning_rate": 4.4682157198949614e-05, + "loss": 3.7829, + "step": 35561 + }, + { + "epoch": 0.21149728803882387, + "grad_norm": 2.9004275798797607, + "learning_rate": 4.4681869188336324e-05, + "loss": 3.8345, + "step": 35562 + }, + { + "epoch": 0.21150323532210485, + "grad_norm": 2.5729360580444336, + "learning_rate": 4.4681581170852296e-05, + "loss": 3.088, + "step": 35563 + }, + { + "epoch": 0.21150918260538587, + "grad_norm": 1.752673864364624, + "learning_rate": 4.468129314649762e-05, + "loss": 4.4127, + "step": 35564 + }, + { + "epoch": 0.21151512988866686, + "grad_norm": 1.3503072261810303, + "learning_rate": 4.468100511527241e-05, + "loss": 5.1179, + "step": 35565 + }, + { + "epoch": 0.21152107717194785, + "grad_norm": 1.5340571403503418, + "learning_rate": 4.468071707717677e-05, + "loss": 5.0112, + "step": 35566 + }, + { + "epoch": 0.21152702445522886, + "grad_norm": 2.2375614643096924, + "learning_rate": 4.46804290322108e-05, + "loss": 3.5679, + "step": 35567 + }, + { + "epoch": 0.21153297173850985, + "grad_norm": 2.288891077041626, + "learning_rate": 4.4680140980374584e-05, + "loss": 3.3486, + "step": 35568 + }, + { + "epoch": 0.21153891902179084, + "grad_norm": 2.604292154312134, + "learning_rate": 4.467985292166824e-05, + "loss": 3.5723, + "step": 35569 + }, + { + "epoch": 0.21154486630507185, + "grad_norm": 2.132558822631836, + "learning_rate": 4.467956485609186e-05, + "loss": 3.6876, + "step": 35570 + }, + { + "epoch": 0.21155081358835284, + "grad_norm": 1.9742835760116577, + "learning_rate": 4.4679276783645554e-05, + "loss": 3.5008, + "step": 35571 + }, + { + "epoch": 0.21155676087163383, + "grad_norm": 2.3548946380615234, + "learning_rate": 4.467898870432941e-05, + "loss": 4.3595, + "step": 35572 + }, + { + "epoch": 0.21156270815491485, + "grad_norm": 2.1973586082458496, + "learning_rate": 4.467870061814353e-05, + "loss": 4.5111, + "step": 35573 + }, + { + "epoch": 0.21156865543819584, + "grad_norm": 2.1064834594726562, + "learning_rate": 4.4678412525088025e-05, + "loss": 3.7621, + "step": 35574 + }, + { + "epoch": 0.21157460272147682, + "grad_norm": 2.0585405826568604, + "learning_rate": 4.467812442516299e-05, + "loss": 3.9767, + "step": 35575 + }, + { + "epoch": 0.21158055000475784, + "grad_norm": 2.481163501739502, + "learning_rate": 4.467783631836853e-05, + "loss": 4.3745, + "step": 35576 + }, + { + "epoch": 0.21158649728803883, + "grad_norm": 2.713836193084717, + "learning_rate": 4.4677548204704734e-05, + "loss": 4.2229, + "step": 35577 + }, + { + "epoch": 0.21159244457131982, + "grad_norm": 2.270063877105713, + "learning_rate": 4.46772600841717e-05, + "loss": 4.5278, + "step": 35578 + }, + { + "epoch": 0.21159839185460083, + "grad_norm": 2.3832831382751465, + "learning_rate": 4.4676971956769555e-05, + "loss": 4.1892, + "step": 35579 + }, + { + "epoch": 0.21160433913788182, + "grad_norm": 2.1676185131073, + "learning_rate": 4.467668382249837e-05, + "loss": 4.2975, + "step": 35580 + }, + { + "epoch": 0.2116102864211628, + "grad_norm": 2.134890556335449, + "learning_rate": 4.467639568135826e-05, + "loss": 4.3765, + "step": 35581 + }, + { + "epoch": 0.21161623370444382, + "grad_norm": 2.3156919479370117, + "learning_rate": 4.4676107533349335e-05, + "loss": 4.087, + "step": 35582 + }, + { + "epoch": 0.2116221809877248, + "grad_norm": 2.341125965118408, + "learning_rate": 4.467581937847167e-05, + "loss": 4.1806, + "step": 35583 + }, + { + "epoch": 0.2116281282710058, + "grad_norm": 2.721090793609619, + "learning_rate": 4.467553121672539e-05, + "loss": 4.2418, + "step": 35584 + }, + { + "epoch": 0.2116340755542868, + "grad_norm": 2.1176726818084717, + "learning_rate": 4.467524304811058e-05, + "loss": 4.0049, + "step": 35585 + }, + { + "epoch": 0.2116400228375678, + "grad_norm": 2.553966760635376, + "learning_rate": 4.4674954872627345e-05, + "loss": 4.095, + "step": 35586 + }, + { + "epoch": 0.2116459701208488, + "grad_norm": 2.35528826713562, + "learning_rate": 4.467466669027579e-05, + "loss": 4.2416, + "step": 35587 + }, + { + "epoch": 0.21165191740412978, + "grad_norm": 1.906132459640503, + "learning_rate": 4.467437850105601e-05, + "loss": 4.9423, + "step": 35588 + }, + { + "epoch": 0.2116578646874108, + "grad_norm": 2.400595188140869, + "learning_rate": 4.4674090304968106e-05, + "loss": 4.0596, + "step": 35589 + }, + { + "epoch": 0.21166381197069178, + "grad_norm": 2.12864089012146, + "learning_rate": 4.467380210201218e-05, + "loss": 4.1793, + "step": 35590 + }, + { + "epoch": 0.21166975925397277, + "grad_norm": 2.3407888412475586, + "learning_rate": 4.4673513892188335e-05, + "loss": 3.7511, + "step": 35591 + }, + { + "epoch": 0.2116757065372538, + "grad_norm": 2.0563061237335205, + "learning_rate": 4.467322567549667e-05, + "loss": 4.2335, + "step": 35592 + }, + { + "epoch": 0.21168165382053478, + "grad_norm": 1.9491883516311646, + "learning_rate": 4.467293745193729e-05, + "loss": 4.1301, + "step": 35593 + }, + { + "epoch": 0.21168760110381576, + "grad_norm": 2.4588730335235596, + "learning_rate": 4.467264922151028e-05, + "loss": 4.0841, + "step": 35594 + }, + { + "epoch": 0.21169354838709678, + "grad_norm": 1.9393937587738037, + "learning_rate": 4.467236098421576e-05, + "loss": 4.2677, + "step": 35595 + }, + { + "epoch": 0.21169949567037777, + "grad_norm": 2.00981068611145, + "learning_rate": 4.4672072740053816e-05, + "loss": 4.1659, + "step": 35596 + }, + { + "epoch": 0.21170544295365876, + "grad_norm": 1.8913508653640747, + "learning_rate": 4.467178448902456e-05, + "loss": 3.9792, + "step": 35597 + }, + { + "epoch": 0.21171139023693977, + "grad_norm": 2.168665647506714, + "learning_rate": 4.467149623112809e-05, + "loss": 3.6157, + "step": 35598 + }, + { + "epoch": 0.21171733752022076, + "grad_norm": 2.5305583477020264, + "learning_rate": 4.467120796636449e-05, + "loss": 4.0156, + "step": 35599 + }, + { + "epoch": 0.21172328480350175, + "grad_norm": 2.593087911605835, + "learning_rate": 4.467091969473389e-05, + "loss": 3.9066, + "step": 35600 + }, + { + "epoch": 0.21172923208678276, + "grad_norm": 1.93959641456604, + "learning_rate": 4.4670631416236365e-05, + "loss": 4.0216, + "step": 35601 + }, + { + "epoch": 0.21173517937006375, + "grad_norm": 4.1372246742248535, + "learning_rate": 4.467034313087203e-05, + "loss": 3.5619, + "step": 35602 + }, + { + "epoch": 0.21174112665334474, + "grad_norm": 3.2538001537323, + "learning_rate": 4.4670054838640984e-05, + "loss": 3.2081, + "step": 35603 + }, + { + "epoch": 0.21174707393662576, + "grad_norm": 4.0510640144348145, + "learning_rate": 4.466976653954332e-05, + "loss": 2.6592, + "step": 35604 + }, + { + "epoch": 0.21175302121990675, + "grad_norm": 3.2859723567962646, + "learning_rate": 4.4669478233579143e-05, + "loss": 2.4745, + "step": 35605 + }, + { + "epoch": 0.21175896850318773, + "grad_norm": 2.0423004627227783, + "learning_rate": 4.466918992074856e-05, + "loss": 4.7596, + "step": 35606 + }, + { + "epoch": 0.21176491578646875, + "grad_norm": 4.604837417602539, + "learning_rate": 4.4668901601051663e-05, + "loss": 4.6869, + "step": 35607 + }, + { + "epoch": 0.21177086306974974, + "grad_norm": 4.67194938659668, + "learning_rate": 4.466861327448856e-05, + "loss": 4.217, + "step": 35608 + }, + { + "epoch": 0.21177681035303073, + "grad_norm": 3.2619986534118652, + "learning_rate": 4.466832494105934e-05, + "loss": 2.6688, + "step": 35609 + }, + { + "epoch": 0.21178275763631174, + "grad_norm": 3.189119577407837, + "learning_rate": 4.466803660076411e-05, + "loss": 3.1766, + "step": 35610 + }, + { + "epoch": 0.21178870491959273, + "grad_norm": 3.00148344039917, + "learning_rate": 4.4667748253602976e-05, + "loss": 2.9854, + "step": 35611 + }, + { + "epoch": 0.21179465220287372, + "grad_norm": 3.20414662361145, + "learning_rate": 4.4667459899576034e-05, + "loss": 3.4441, + "step": 35612 + }, + { + "epoch": 0.21180059948615473, + "grad_norm": 2.8852174282073975, + "learning_rate": 4.466717153868338e-05, + "loss": 3.3384, + "step": 35613 + }, + { + "epoch": 0.21180654676943572, + "grad_norm": 3.3265509605407715, + "learning_rate": 4.466688317092513e-05, + "loss": 4.3809, + "step": 35614 + }, + { + "epoch": 0.2118124940527167, + "grad_norm": 1.7819219827651978, + "learning_rate": 4.4666594796301366e-05, + "loss": 5.4982, + "step": 35615 + }, + { + "epoch": 0.21181844133599773, + "grad_norm": 2.841721296310425, + "learning_rate": 4.46663064148122e-05, + "loss": 2.1118, + "step": 35616 + }, + { + "epoch": 0.21182438861927871, + "grad_norm": 2.0219855308532715, + "learning_rate": 4.466601802645773e-05, + "loss": 4.4131, + "step": 35617 + }, + { + "epoch": 0.2118303359025597, + "grad_norm": 1.6084177494049072, + "learning_rate": 4.466572963123805e-05, + "loss": 5.1337, + "step": 35618 + }, + { + "epoch": 0.21183628318584072, + "grad_norm": 1.998936414718628, + "learning_rate": 4.4665441229153285e-05, + "loss": 4.7807, + "step": 35619 + }, + { + "epoch": 0.2118422304691217, + "grad_norm": 2.4785871505737305, + "learning_rate": 4.46651528202035e-05, + "loss": 5.2531, + "step": 35620 + }, + { + "epoch": 0.2118481777524027, + "grad_norm": 1.9801669120788574, + "learning_rate": 4.466486440438882e-05, + "loss": 5.3602, + "step": 35621 + }, + { + "epoch": 0.2118541250356837, + "grad_norm": 1.637373924255371, + "learning_rate": 4.4664575981709333e-05, + "loss": 5.2314, + "step": 35622 + }, + { + "epoch": 0.2118600723189647, + "grad_norm": 1.5949249267578125, + "learning_rate": 4.466428755216515e-05, + "loss": 5.2677, + "step": 35623 + }, + { + "epoch": 0.2118660196022457, + "grad_norm": 1.5839226245880127, + "learning_rate": 4.466399911575637e-05, + "loss": 5.2145, + "step": 35624 + }, + { + "epoch": 0.2118719668855267, + "grad_norm": 1.6766635179519653, + "learning_rate": 4.4663710672483084e-05, + "loss": 4.9226, + "step": 35625 + }, + { + "epoch": 0.2118779141688077, + "grad_norm": 1.5780537128448486, + "learning_rate": 4.466342222234541e-05, + "loss": 5.1708, + "step": 35626 + }, + { + "epoch": 0.21188386145208868, + "grad_norm": 1.5924153327941895, + "learning_rate": 4.4663133765343436e-05, + "loss": 5.1272, + "step": 35627 + }, + { + "epoch": 0.2118898087353697, + "grad_norm": 1.7102172374725342, + "learning_rate": 4.466284530147725e-05, + "loss": 5.0985, + "step": 35628 + }, + { + "epoch": 0.21189575601865068, + "grad_norm": 1.7256853580474854, + "learning_rate": 4.4662556830746985e-05, + "loss": 5.2144, + "step": 35629 + }, + { + "epoch": 0.21190170330193167, + "grad_norm": 1.5665667057037354, + "learning_rate": 4.466226835315272e-05, + "loss": 5.2794, + "step": 35630 + }, + { + "epoch": 0.2119076505852127, + "grad_norm": 1.538317322731018, + "learning_rate": 4.466197986869456e-05, + "loss": 5.0972, + "step": 35631 + }, + { + "epoch": 0.21191359786849367, + "grad_norm": 1.5625393390655518, + "learning_rate": 4.466169137737261e-05, + "loss": 4.7858, + "step": 35632 + }, + { + "epoch": 0.21191954515177466, + "grad_norm": 1.8737174272537231, + "learning_rate": 4.466140287918695e-05, + "loss": 4.6701, + "step": 35633 + }, + { + "epoch": 0.21192549243505568, + "grad_norm": 1.9811254739761353, + "learning_rate": 4.4661114374137716e-05, + "loss": 4.2716, + "step": 35634 + }, + { + "epoch": 0.21193143971833667, + "grad_norm": 1.6436641216278076, + "learning_rate": 4.4660825862224984e-05, + "loss": 4.6396, + "step": 35635 + }, + { + "epoch": 0.21193738700161766, + "grad_norm": 1.5466450452804565, + "learning_rate": 4.466053734344886e-05, + "loss": 4.5979, + "step": 35636 + }, + { + "epoch": 0.21194333428489867, + "grad_norm": 1.7120367288589478, + "learning_rate": 4.4660248817809444e-05, + "loss": 4.8051, + "step": 35637 + }, + { + "epoch": 0.21194928156817966, + "grad_norm": 1.540959119796753, + "learning_rate": 4.4659960285306846e-05, + "loss": 4.9112, + "step": 35638 + }, + { + "epoch": 0.21195522885146065, + "grad_norm": 1.5579898357391357, + "learning_rate": 4.4659671745941147e-05, + "loss": 4.7432, + "step": 35639 + }, + { + "epoch": 0.21196117613474166, + "grad_norm": 1.3252559900283813, + "learning_rate": 4.465938319971247e-05, + "loss": 4.5407, + "step": 35640 + }, + { + "epoch": 0.21196712341802265, + "grad_norm": 1.4672505855560303, + "learning_rate": 4.4659094646620904e-05, + "loss": 4.4972, + "step": 35641 + }, + { + "epoch": 0.21197307070130364, + "grad_norm": 1.603417158126831, + "learning_rate": 4.4658806086666544e-05, + "loss": 4.6884, + "step": 35642 + }, + { + "epoch": 0.21197901798458466, + "grad_norm": 1.8993263244628906, + "learning_rate": 4.465851751984951e-05, + "loss": 4.8931, + "step": 35643 + }, + { + "epoch": 0.21198496526786564, + "grad_norm": 1.5901163816452026, + "learning_rate": 4.4658228946169875e-05, + "loss": 4.6111, + "step": 35644 + }, + { + "epoch": 0.21199091255114663, + "grad_norm": 1.392621397972107, + "learning_rate": 4.465794036562776e-05, + "loss": 4.4493, + "step": 35645 + }, + { + "epoch": 0.21199685983442762, + "grad_norm": 1.6505818367004395, + "learning_rate": 4.465765177822327e-05, + "loss": 4.931, + "step": 35646 + }, + { + "epoch": 0.21200280711770864, + "grad_norm": 2.01570463180542, + "learning_rate": 4.465736318395649e-05, + "loss": 4.3823, + "step": 35647 + }, + { + "epoch": 0.21200875440098962, + "grad_norm": 2.1474528312683105, + "learning_rate": 4.465707458282753e-05, + "loss": 4.3524, + "step": 35648 + }, + { + "epoch": 0.2120147016842706, + "grad_norm": 1.5785243511199951, + "learning_rate": 4.465678597483649e-05, + "loss": 4.287, + "step": 35649 + }, + { + "epoch": 0.21202064896755163, + "grad_norm": 1.863834023475647, + "learning_rate": 4.465649735998346e-05, + "loss": 5.2325, + "step": 35650 + }, + { + "epoch": 0.21202659625083262, + "grad_norm": 1.8547208309173584, + "learning_rate": 4.465620873826856e-05, + "loss": 5.1475, + "step": 35651 + }, + { + "epoch": 0.2120325435341136, + "grad_norm": 1.5947805643081665, + "learning_rate": 4.465592010969187e-05, + "loss": 5.3976, + "step": 35652 + }, + { + "epoch": 0.21203849081739462, + "grad_norm": 1.9001067876815796, + "learning_rate": 4.4655631474253515e-05, + "loss": 5.492, + "step": 35653 + }, + { + "epoch": 0.2120444381006756, + "grad_norm": 1.6406006813049316, + "learning_rate": 4.465534283195357e-05, + "loss": 4.5243, + "step": 35654 + }, + { + "epoch": 0.2120503853839566, + "grad_norm": 1.9014918804168701, + "learning_rate": 4.4655054182792156e-05, + "loss": 4.794, + "step": 35655 + }, + { + "epoch": 0.2120563326672376, + "grad_norm": 1.647063970565796, + "learning_rate": 4.4654765526769365e-05, + "loss": 4.9163, + "step": 35656 + }, + { + "epoch": 0.2120622799505186, + "grad_norm": 1.5540443658828735, + "learning_rate": 4.4654476863885296e-05, + "loss": 5.0264, + "step": 35657 + }, + { + "epoch": 0.2120682272337996, + "grad_norm": 1.7547403573989868, + "learning_rate": 4.465418819414005e-05, + "loss": 4.8722, + "step": 35658 + }, + { + "epoch": 0.2120741745170806, + "grad_norm": 1.6932998895645142, + "learning_rate": 4.4653899517533736e-05, + "loss": 4.9104, + "step": 35659 + }, + { + "epoch": 0.2120801218003616, + "grad_norm": 1.6615930795669556, + "learning_rate": 4.465361083406645e-05, + "loss": 4.9179, + "step": 35660 + }, + { + "epoch": 0.21208606908364258, + "grad_norm": 2.116122007369995, + "learning_rate": 4.465332214373828e-05, + "loss": 4.0842, + "step": 35661 + }, + { + "epoch": 0.2120920163669236, + "grad_norm": 1.4886269569396973, + "learning_rate": 4.465303344654935e-05, + "loss": 4.8633, + "step": 35662 + }, + { + "epoch": 0.21209796365020459, + "grad_norm": 1.5376653671264648, + "learning_rate": 4.4652744742499744e-05, + "loss": 4.9968, + "step": 35663 + }, + { + "epoch": 0.21210391093348557, + "grad_norm": 1.5422334671020508, + "learning_rate": 4.4652456031589565e-05, + "loss": 4.6542, + "step": 35664 + }, + { + "epoch": 0.2121098582167666, + "grad_norm": 1.8304499387741089, + "learning_rate": 4.465216731381891e-05, + "loss": 5.2773, + "step": 35665 + }, + { + "epoch": 0.21211580550004758, + "grad_norm": 1.8360862731933594, + "learning_rate": 4.4651878589187904e-05, + "loss": 5.1749, + "step": 35666 + }, + { + "epoch": 0.21212175278332857, + "grad_norm": 2.1891777515411377, + "learning_rate": 4.465158985769662e-05, + "loss": 4.5996, + "step": 35667 + }, + { + "epoch": 0.21212770006660958, + "grad_norm": 1.8620492219924927, + "learning_rate": 4.4651301119345174e-05, + "loss": 5.1286, + "step": 35668 + }, + { + "epoch": 0.21213364734989057, + "grad_norm": 1.7725592851638794, + "learning_rate": 4.465101237413366e-05, + "loss": 3.8595, + "step": 35669 + }, + { + "epoch": 0.21213959463317156, + "grad_norm": 3.5651681423187256, + "learning_rate": 4.4650723622062174e-05, + "loss": 2.1436, + "step": 35670 + }, + { + "epoch": 0.21214554191645257, + "grad_norm": 2.6675519943237305, + "learning_rate": 4.465043486313083e-05, + "loss": 2.6111, + "step": 35671 + }, + { + "epoch": 0.21215148919973356, + "grad_norm": 2.8939945697784424, + "learning_rate": 4.4650146097339726e-05, + "loss": 1.951, + "step": 35672 + }, + { + "epoch": 0.21215743648301455, + "grad_norm": 2.7901999950408936, + "learning_rate": 4.464985732468895e-05, + "loss": 2.5639, + "step": 35673 + }, + { + "epoch": 0.21216338376629557, + "grad_norm": 2.7896947860717773, + "learning_rate": 4.464956854517862e-05, + "loss": 2.4701, + "step": 35674 + }, + { + "epoch": 0.21216933104957655, + "grad_norm": 3.1296167373657227, + "learning_rate": 4.464927975880882e-05, + "loss": 2.8541, + "step": 35675 + }, + { + "epoch": 0.21217527833285754, + "grad_norm": 2.6969821453094482, + "learning_rate": 4.4648990965579665e-05, + "loss": 2.5398, + "step": 35676 + }, + { + "epoch": 0.21218122561613856, + "grad_norm": 2.5808277130126953, + "learning_rate": 4.4648702165491255e-05, + "loss": 2.0513, + "step": 35677 + }, + { + "epoch": 0.21218717289941955, + "grad_norm": 2.433685064315796, + "learning_rate": 4.464841335854367e-05, + "loss": 4.4278, + "step": 35678 + }, + { + "epoch": 0.21219312018270053, + "grad_norm": 2.1320486068725586, + "learning_rate": 4.464812454473705e-05, + "loss": 4.12, + "step": 35679 + }, + { + "epoch": 0.21219906746598155, + "grad_norm": 2.456299304962158, + "learning_rate": 4.464783572407145e-05, + "loss": 4.5267, + "step": 35680 + }, + { + "epoch": 0.21220501474926254, + "grad_norm": 2.1469194889068604, + "learning_rate": 4.464754689654701e-05, + "loss": 4.3901, + "step": 35681 + }, + { + "epoch": 0.21221096203254353, + "grad_norm": 2.1196210384368896, + "learning_rate": 4.46472580621638e-05, + "loss": 4.4883, + "step": 35682 + }, + { + "epoch": 0.21221690931582454, + "grad_norm": 1.8412578105926514, + "learning_rate": 4.464696922092195e-05, + "loss": 4.3989, + "step": 35683 + }, + { + "epoch": 0.21222285659910553, + "grad_norm": 1.8631144762039185, + "learning_rate": 4.464668037282154e-05, + "loss": 4.3183, + "step": 35684 + }, + { + "epoch": 0.21222880388238652, + "grad_norm": 2.0931034088134766, + "learning_rate": 4.464639151786267e-05, + "loss": 4.4251, + "step": 35685 + }, + { + "epoch": 0.21223475116566753, + "grad_norm": 2.132053852081299, + "learning_rate": 4.464610265604546e-05, + "loss": 4.1585, + "step": 35686 + }, + { + "epoch": 0.21224069844894852, + "grad_norm": 2.145237445831299, + "learning_rate": 4.464581378736999e-05, + "loss": 4.5089, + "step": 35687 + }, + { + "epoch": 0.2122466457322295, + "grad_norm": 1.8298094272613525, + "learning_rate": 4.464552491183637e-05, + "loss": 4.3018, + "step": 35688 + }, + { + "epoch": 0.21225259301551053, + "grad_norm": 1.7321758270263672, + "learning_rate": 4.4645236029444704e-05, + "loss": 4.3424, + "step": 35689 + }, + { + "epoch": 0.21225854029879151, + "grad_norm": 1.5509285926818848, + "learning_rate": 4.464494714019508e-05, + "loss": 4.7001, + "step": 35690 + }, + { + "epoch": 0.2122644875820725, + "grad_norm": 1.5042033195495605, + "learning_rate": 4.464465824408762e-05, + "loss": 4.7668, + "step": 35691 + }, + { + "epoch": 0.21227043486535352, + "grad_norm": 1.405381202697754, + "learning_rate": 4.4644369341122405e-05, + "loss": 4.5569, + "step": 35692 + }, + { + "epoch": 0.2122763821486345, + "grad_norm": 2.495974540710449, + "learning_rate": 4.464408043129955e-05, + "loss": 3.6424, + "step": 35693 + }, + { + "epoch": 0.2122823294319155, + "grad_norm": 1.930151104927063, + "learning_rate": 4.4643791514619146e-05, + "loss": 4.7363, + "step": 35694 + }, + { + "epoch": 0.2122882767151965, + "grad_norm": 2.1979784965515137, + "learning_rate": 4.46435025910813e-05, + "loss": 4.4973, + "step": 35695 + }, + { + "epoch": 0.2122942239984775, + "grad_norm": 1.9017161130905151, + "learning_rate": 4.46432136606861e-05, + "loss": 5.0348, + "step": 35696 + }, + { + "epoch": 0.2123001712817585, + "grad_norm": 2.4398694038391113, + "learning_rate": 4.464292472343367e-05, + "loss": 5.0757, + "step": 35697 + }, + { + "epoch": 0.2123061185650395, + "grad_norm": 2.061084508895874, + "learning_rate": 4.464263577932409e-05, + "loss": 4.9802, + "step": 35698 + }, + { + "epoch": 0.2123120658483205, + "grad_norm": 2.277392864227295, + "learning_rate": 4.4642346828357474e-05, + "loss": 4.7231, + "step": 35699 + }, + { + "epoch": 0.21231801313160148, + "grad_norm": 2.2129130363464355, + "learning_rate": 4.464205787053391e-05, + "loss": 4.7154, + "step": 35700 + }, + { + "epoch": 0.2123239604148825, + "grad_norm": 1.9063429832458496, + "learning_rate": 4.4641768905853506e-05, + "loss": 5.1813, + "step": 35701 + }, + { + "epoch": 0.21232990769816348, + "grad_norm": 1.896718978881836, + "learning_rate": 4.464147993431638e-05, + "loss": 4.9855, + "step": 35702 + }, + { + "epoch": 0.21233585498144447, + "grad_norm": 1.7391164302825928, + "learning_rate": 4.46411909559226e-05, + "loss": 4.9448, + "step": 35703 + }, + { + "epoch": 0.21234180226472546, + "grad_norm": 1.8338813781738281, + "learning_rate": 4.464090197067229e-05, + "loss": 4.9424, + "step": 35704 + }, + { + "epoch": 0.21234774954800648, + "grad_norm": 1.8616620302200317, + "learning_rate": 4.4640612978565536e-05, + "loss": 4.9748, + "step": 35705 + }, + { + "epoch": 0.21235369683128746, + "grad_norm": 1.8451703786849976, + "learning_rate": 4.4640323979602456e-05, + "loss": 4.8245, + "step": 35706 + }, + { + "epoch": 0.21235964411456845, + "grad_norm": 1.9476757049560547, + "learning_rate": 4.464003497378314e-05, + "loss": 4.4859, + "step": 35707 + }, + { + "epoch": 0.21236559139784947, + "grad_norm": 1.9685393571853638, + "learning_rate": 4.463974596110769e-05, + "loss": 4.8838, + "step": 35708 + }, + { + "epoch": 0.21237153868113046, + "grad_norm": 1.8450342416763306, + "learning_rate": 4.463945694157621e-05, + "loss": 4.8105, + "step": 35709 + }, + { + "epoch": 0.21237748596441144, + "grad_norm": 1.8277257680892944, + "learning_rate": 4.463916791518879e-05, + "loss": 4.7422, + "step": 35710 + }, + { + "epoch": 0.21238343324769246, + "grad_norm": 2.5418896675109863, + "learning_rate": 4.463887888194555e-05, + "loss": 3.8398, + "step": 35711 + }, + { + "epoch": 0.21238938053097345, + "grad_norm": 2.198882818222046, + "learning_rate": 4.4638589841846564e-05, + "loss": 4.2856, + "step": 35712 + }, + { + "epoch": 0.21239532781425444, + "grad_norm": 1.7862573862075806, + "learning_rate": 4.463830079489196e-05, + "loss": 4.6743, + "step": 35713 + }, + { + "epoch": 0.21240127509753545, + "grad_norm": 2.3604986667633057, + "learning_rate": 4.463801174108183e-05, + "loss": 3.2922, + "step": 35714 + }, + { + "epoch": 0.21240722238081644, + "grad_norm": 2.7802772521972656, + "learning_rate": 4.463772268041627e-05, + "loss": 3.8924, + "step": 35715 + }, + { + "epoch": 0.21241316966409743, + "grad_norm": 2.6005308628082275, + "learning_rate": 4.46374336128954e-05, + "loss": 3.9048, + "step": 35716 + }, + { + "epoch": 0.21241911694737844, + "grad_norm": 2.9707534313201904, + "learning_rate": 4.463714453851928e-05, + "loss": 4.0335, + "step": 35717 + }, + { + "epoch": 0.21242506423065943, + "grad_norm": 2.741023302078247, + "learning_rate": 4.4636855457288046e-05, + "loss": 3.7947, + "step": 35718 + }, + { + "epoch": 0.21243101151394042, + "grad_norm": 3.775846242904663, + "learning_rate": 4.463656636920179e-05, + "loss": 3.8971, + "step": 35719 + }, + { + "epoch": 0.21243695879722144, + "grad_norm": 2.2663304805755615, + "learning_rate": 4.463627727426061e-05, + "loss": 3.6525, + "step": 35720 + }, + { + "epoch": 0.21244290608050242, + "grad_norm": 2.3076207637786865, + "learning_rate": 4.463598817246461e-05, + "loss": 3.502, + "step": 35721 + }, + { + "epoch": 0.2124488533637834, + "grad_norm": 2.273998260498047, + "learning_rate": 4.4635699063813884e-05, + "loss": 3.5203, + "step": 35722 + }, + { + "epoch": 0.21245480064706443, + "grad_norm": 2.7163095474243164, + "learning_rate": 4.463540994830855e-05, + "loss": 3.9886, + "step": 35723 + }, + { + "epoch": 0.21246074793034542, + "grad_norm": 2.482473850250244, + "learning_rate": 4.463512082594868e-05, + "loss": 3.552, + "step": 35724 + }, + { + "epoch": 0.2124666952136264, + "grad_norm": 1.8834370374679565, + "learning_rate": 4.4634831696734404e-05, + "loss": 4.1334, + "step": 35725 + }, + { + "epoch": 0.21247264249690742, + "grad_norm": 2.005268096923828, + "learning_rate": 4.463454256066581e-05, + "loss": 5.1996, + "step": 35726 + }, + { + "epoch": 0.2124785897801884, + "grad_norm": 1.4959584474563599, + "learning_rate": 4.4634253417743e-05, + "loss": 4.7173, + "step": 35727 + }, + { + "epoch": 0.2124845370634694, + "grad_norm": 1.490785002708435, + "learning_rate": 4.463396426796608e-05, + "loss": 4.6401, + "step": 35728 + }, + { + "epoch": 0.2124904843467504, + "grad_norm": 1.722306251525879, + "learning_rate": 4.463367511133513e-05, + "loss": 4.9166, + "step": 35729 + }, + { + "epoch": 0.2124964316300314, + "grad_norm": 1.8705493211746216, + "learning_rate": 4.463338594785028e-05, + "loss": 4.3841, + "step": 35730 + }, + { + "epoch": 0.2125023789133124, + "grad_norm": 1.6293779611587524, + "learning_rate": 4.4633096777511614e-05, + "loss": 4.7904, + "step": 35731 + }, + { + "epoch": 0.2125083261965934, + "grad_norm": 1.474142074584961, + "learning_rate": 4.4632807600319236e-05, + "loss": 4.7202, + "step": 35732 + }, + { + "epoch": 0.2125142734798744, + "grad_norm": 1.8260791301727295, + "learning_rate": 4.463251841627325e-05, + "loss": 4.7537, + "step": 35733 + }, + { + "epoch": 0.21252022076315538, + "grad_norm": 2.335918664932251, + "learning_rate": 4.463222922537376e-05, + "loss": 4.5175, + "step": 35734 + }, + { + "epoch": 0.2125261680464364, + "grad_norm": 2.1824939250946045, + "learning_rate": 4.463194002762084e-05, + "loss": 4.8903, + "step": 35735 + }, + { + "epoch": 0.21253211532971739, + "grad_norm": 2.128995656967163, + "learning_rate": 4.4631650823014635e-05, + "loss": 4.8174, + "step": 35736 + }, + { + "epoch": 0.21253806261299837, + "grad_norm": 2.001495122909546, + "learning_rate": 4.4631361611555214e-05, + "loss": 4.7711, + "step": 35737 + }, + { + "epoch": 0.2125440098962794, + "grad_norm": 1.9211745262145996, + "learning_rate": 4.463107239324269e-05, + "loss": 4.6105, + "step": 35738 + }, + { + "epoch": 0.21254995717956038, + "grad_norm": 1.8648548126220703, + "learning_rate": 4.463078316807716e-05, + "loss": 4.6635, + "step": 35739 + }, + { + "epoch": 0.21255590446284137, + "grad_norm": 1.7322161197662354, + "learning_rate": 4.4630493936058726e-05, + "loss": 4.7696, + "step": 35740 + }, + { + "epoch": 0.21256185174612238, + "grad_norm": 1.7490285634994507, + "learning_rate": 4.4630204697187495e-05, + "loss": 4.5781, + "step": 35741 + }, + { + "epoch": 0.21256779902940337, + "grad_norm": 2.229279041290283, + "learning_rate": 4.462991545146355e-05, + "loss": 4.9855, + "step": 35742 + }, + { + "epoch": 0.21257374631268436, + "grad_norm": 1.8897117376327515, + "learning_rate": 4.462962619888702e-05, + "loss": 4.7951, + "step": 35743 + }, + { + "epoch": 0.21257969359596537, + "grad_norm": 1.908650279045105, + "learning_rate": 4.4629336939457986e-05, + "loss": 5.3459, + "step": 35744 + }, + { + "epoch": 0.21258564087924636, + "grad_norm": 1.9401918649673462, + "learning_rate": 4.462904767317655e-05, + "loss": 5.2774, + "step": 35745 + }, + { + "epoch": 0.21259158816252735, + "grad_norm": 1.8013694286346436, + "learning_rate": 4.462875840004281e-05, + "loss": 5.1422, + "step": 35746 + }, + { + "epoch": 0.21259753544580837, + "grad_norm": 1.7482459545135498, + "learning_rate": 4.462846912005688e-05, + "loss": 4.9011, + "step": 35747 + }, + { + "epoch": 0.21260348272908935, + "grad_norm": 2.0190155506134033, + "learning_rate": 4.462817983321885e-05, + "loss": 5.1047, + "step": 35748 + }, + { + "epoch": 0.21260943001237034, + "grad_norm": 1.6946347951889038, + "learning_rate": 4.4627890539528836e-05, + "loss": 4.5423, + "step": 35749 + }, + { + "epoch": 0.21261537729565136, + "grad_norm": 1.8933213949203491, + "learning_rate": 4.462760123898692e-05, + "loss": 4.7896, + "step": 35750 + }, + { + "epoch": 0.21262132457893235, + "grad_norm": 1.4609590768814087, + "learning_rate": 4.462731193159321e-05, + "loss": 4.6521, + "step": 35751 + }, + { + "epoch": 0.21262727186221334, + "grad_norm": 1.5144892930984497, + "learning_rate": 4.462702261734781e-05, + "loss": 4.6135, + "step": 35752 + }, + { + "epoch": 0.21263321914549435, + "grad_norm": 1.5038193464279175, + "learning_rate": 4.4626733296250825e-05, + "loss": 4.642, + "step": 35753 + }, + { + "epoch": 0.21263916642877534, + "grad_norm": 1.3743622303009033, + "learning_rate": 4.4626443968302344e-05, + "loss": 4.5229, + "step": 35754 + }, + { + "epoch": 0.21264511371205633, + "grad_norm": 1.39356529712677, + "learning_rate": 4.462615463350247e-05, + "loss": 4.5139, + "step": 35755 + }, + { + "epoch": 0.21265106099533734, + "grad_norm": 1.4335349798202515, + "learning_rate": 4.462586529185132e-05, + "loss": 4.5719, + "step": 35756 + }, + { + "epoch": 0.21265700827861833, + "grad_norm": 1.6316946744918823, + "learning_rate": 4.4625575943348976e-05, + "loss": 4.6748, + "step": 35757 + }, + { + "epoch": 0.21266295556189932, + "grad_norm": 1.7913111448287964, + "learning_rate": 4.462528658799554e-05, + "loss": 4.4836, + "step": 35758 + }, + { + "epoch": 0.21266890284518034, + "grad_norm": 1.8246521949768066, + "learning_rate": 4.462499722579113e-05, + "loss": 5.1016, + "step": 35759 + }, + { + "epoch": 0.21267485012846132, + "grad_norm": 1.8918733596801758, + "learning_rate": 4.462470785673583e-05, + "loss": 5.2359, + "step": 35760 + }, + { + "epoch": 0.2126807974117423, + "grad_norm": 1.6759446859359741, + "learning_rate": 4.4624418480829754e-05, + "loss": 5.0853, + "step": 35761 + }, + { + "epoch": 0.2126867446950233, + "grad_norm": 2.1670455932617188, + "learning_rate": 4.462412909807299e-05, + "loss": 4.4643, + "step": 35762 + }, + { + "epoch": 0.21269269197830432, + "grad_norm": 1.9109561443328857, + "learning_rate": 4.4623839708465646e-05, + "loss": 4.695, + "step": 35763 + }, + { + "epoch": 0.2126986392615853, + "grad_norm": 2.0584371089935303, + "learning_rate": 4.462355031200782e-05, + "loss": 4.5412, + "step": 35764 + }, + { + "epoch": 0.2127045865448663, + "grad_norm": 2.116912364959717, + "learning_rate": 4.462326090869963e-05, + "loss": 4.4798, + "step": 35765 + }, + { + "epoch": 0.2127105338281473, + "grad_norm": 1.6834105253219604, + "learning_rate": 4.4622971498541147e-05, + "loss": 4.1607, + "step": 35766 + }, + { + "epoch": 0.2127164811114283, + "grad_norm": 1.9433541297912598, + "learning_rate": 4.4622682081532484e-05, + "loss": 4.2535, + "step": 35767 + }, + { + "epoch": 0.21272242839470928, + "grad_norm": 1.8354408740997314, + "learning_rate": 4.462239265767376e-05, + "loss": 4.1845, + "step": 35768 + }, + { + "epoch": 0.2127283756779903, + "grad_norm": 1.801477074623108, + "learning_rate": 4.462210322696505e-05, + "loss": 4.1377, + "step": 35769 + }, + { + "epoch": 0.2127343229612713, + "grad_norm": 1.6852128505706787, + "learning_rate": 4.462181378940647e-05, + "loss": 4.0879, + "step": 35770 + }, + { + "epoch": 0.21274027024455228, + "grad_norm": 1.8251643180847168, + "learning_rate": 4.4621524344998124e-05, + "loss": 4.1113, + "step": 35771 + }, + { + "epoch": 0.2127462175278333, + "grad_norm": 2.3179166316986084, + "learning_rate": 4.462123489374009e-05, + "loss": 3.7295, + "step": 35772 + }, + { + "epoch": 0.21275216481111428, + "grad_norm": 3.6453943252563477, + "learning_rate": 4.46209454356325e-05, + "loss": 2.9047, + "step": 35773 + }, + { + "epoch": 0.21275811209439527, + "grad_norm": 1.855807900428772, + "learning_rate": 4.462065597067544e-05, + "loss": 4.4095, + "step": 35774 + }, + { + "epoch": 0.21276405937767628, + "grad_norm": 2.9473495483398438, + "learning_rate": 4.4620366498869e-05, + "loss": 2.6528, + "step": 35775 + }, + { + "epoch": 0.21277000666095727, + "grad_norm": 2.007720947265625, + "learning_rate": 4.462007702021331e-05, + "loss": 4.7348, + "step": 35776 + }, + { + "epoch": 0.21277595394423826, + "grad_norm": 2.2951998710632324, + "learning_rate": 4.461978753470845e-05, + "loss": 4.5622, + "step": 35777 + }, + { + "epoch": 0.21278190122751928, + "grad_norm": 1.5362045764923096, + "learning_rate": 4.461949804235451e-05, + "loss": 4.9606, + "step": 35778 + }, + { + "epoch": 0.21278784851080026, + "grad_norm": 1.6383750438690186, + "learning_rate": 4.461920854315162e-05, + "loss": 4.6241, + "step": 35779 + }, + { + "epoch": 0.21279379579408125, + "grad_norm": 2.050675630569458, + "learning_rate": 4.461891903709986e-05, + "loss": 5.3119, + "step": 35780 + }, + { + "epoch": 0.21279974307736227, + "grad_norm": 1.8561534881591797, + "learning_rate": 4.461862952419934e-05, + "loss": 4.8164, + "step": 35781 + }, + { + "epoch": 0.21280569036064326, + "grad_norm": 1.5282032489776611, + "learning_rate": 4.4618340004450164e-05, + "loss": 4.9202, + "step": 35782 + }, + { + "epoch": 0.21281163764392425, + "grad_norm": 1.5900583267211914, + "learning_rate": 4.4618050477852426e-05, + "loss": 4.9648, + "step": 35783 + }, + { + "epoch": 0.21281758492720526, + "grad_norm": 1.5663514137268066, + "learning_rate": 4.461776094440623e-05, + "loss": 4.7691, + "step": 35784 + }, + { + "epoch": 0.21282353221048625, + "grad_norm": 1.5911515951156616, + "learning_rate": 4.4617471404111665e-05, + "loss": 4.8285, + "step": 35785 + }, + { + "epoch": 0.21282947949376724, + "grad_norm": 1.5293818712234497, + "learning_rate": 4.461718185696886e-05, + "loss": 4.7698, + "step": 35786 + }, + { + "epoch": 0.21283542677704825, + "grad_norm": 1.4674770832061768, + "learning_rate": 4.4616892302977886e-05, + "loss": 4.7565, + "step": 35787 + }, + { + "epoch": 0.21284137406032924, + "grad_norm": 1.7607558965682983, + "learning_rate": 4.461660274213887e-05, + "loss": 4.7961, + "step": 35788 + }, + { + "epoch": 0.21284732134361023, + "grad_norm": 1.4412648677825928, + "learning_rate": 4.461631317445189e-05, + "loss": 4.8314, + "step": 35789 + }, + { + "epoch": 0.21285326862689125, + "grad_norm": 1.4143060445785522, + "learning_rate": 4.461602359991706e-05, + "loss": 4.8198, + "step": 35790 + }, + { + "epoch": 0.21285921591017223, + "grad_norm": 1.4745891094207764, + "learning_rate": 4.4615734018534484e-05, + "loss": 4.8315, + "step": 35791 + }, + { + "epoch": 0.21286516319345322, + "grad_norm": 1.4099732637405396, + "learning_rate": 4.461544443030426e-05, + "loss": 4.8005, + "step": 35792 + }, + { + "epoch": 0.21287111047673424, + "grad_norm": 1.5773065090179443, + "learning_rate": 4.4615154835226474e-05, + "loss": 4.7124, + "step": 35793 + }, + { + "epoch": 0.21287705776001523, + "grad_norm": 1.6054891347885132, + "learning_rate": 4.461486523330125e-05, + "loss": 5.3169, + "step": 35794 + }, + { + "epoch": 0.21288300504329621, + "grad_norm": 1.9800649881362915, + "learning_rate": 4.461457562452868e-05, + "loss": 4.5722, + "step": 35795 + }, + { + "epoch": 0.21288895232657723, + "grad_norm": 1.9030554294586182, + "learning_rate": 4.4614286008908854e-05, + "loss": 4.5221, + "step": 35796 + }, + { + "epoch": 0.21289489960985822, + "grad_norm": 1.8483407497406006, + "learning_rate": 4.4613996386441895e-05, + "loss": 4.7036, + "step": 35797 + }, + { + "epoch": 0.2129008468931392, + "grad_norm": 1.7602765560150146, + "learning_rate": 4.461370675712788e-05, + "loss": 4.5447, + "step": 35798 + }, + { + "epoch": 0.21290679417642022, + "grad_norm": 1.4833706617355347, + "learning_rate": 4.461341712096694e-05, + "loss": 4.6214, + "step": 35799 + }, + { + "epoch": 0.2129127414597012, + "grad_norm": 1.4221755266189575, + "learning_rate": 4.4613127477959146e-05, + "loss": 4.7195, + "step": 35800 + }, + { + "epoch": 0.2129186887429822, + "grad_norm": 1.5532176494598389, + "learning_rate": 4.4612837828104616e-05, + "loss": 4.466, + "step": 35801 + }, + { + "epoch": 0.21292463602626321, + "grad_norm": 1.4992002248764038, + "learning_rate": 4.4612548171403444e-05, + "loss": 4.454, + "step": 35802 + }, + { + "epoch": 0.2129305833095442, + "grad_norm": 1.8023090362548828, + "learning_rate": 4.461225850785574e-05, + "loss": 4.542, + "step": 35803 + }, + { + "epoch": 0.2129365305928252, + "grad_norm": 1.6397573947906494, + "learning_rate": 4.4611968837461595e-05, + "loss": 4.552, + "step": 35804 + }, + { + "epoch": 0.2129424778761062, + "grad_norm": 1.6525506973266602, + "learning_rate": 4.461167916022111e-05, + "loss": 4.5412, + "step": 35805 + }, + { + "epoch": 0.2129484251593872, + "grad_norm": 1.623678207397461, + "learning_rate": 4.46113894761344e-05, + "loss": 4.5214, + "step": 35806 + }, + { + "epoch": 0.21295437244266818, + "grad_norm": 1.4363800287246704, + "learning_rate": 4.461109978520155e-05, + "loss": 4.8244, + "step": 35807 + }, + { + "epoch": 0.2129603197259492, + "grad_norm": 1.6746747493743896, + "learning_rate": 4.461081008742267e-05, + "loss": 4.6021, + "step": 35808 + }, + { + "epoch": 0.2129662670092302, + "grad_norm": 1.4615259170532227, + "learning_rate": 4.4610520382797856e-05, + "loss": 4.5655, + "step": 35809 + }, + { + "epoch": 0.21297221429251117, + "grad_norm": 1.4905229806900024, + "learning_rate": 4.4610230671327215e-05, + "loss": 4.6959, + "step": 35810 + }, + { + "epoch": 0.2129781615757922, + "grad_norm": 2.2128641605377197, + "learning_rate": 4.460994095301084e-05, + "loss": 4.5274, + "step": 35811 + }, + { + "epoch": 0.21298410885907318, + "grad_norm": 1.7724326848983765, + "learning_rate": 4.460965122784885e-05, + "loss": 5.253, + "step": 35812 + }, + { + "epoch": 0.21299005614235417, + "grad_norm": 1.8824642896652222, + "learning_rate": 4.460936149584132e-05, + "loss": 5.3778, + "step": 35813 + }, + { + "epoch": 0.21299600342563518, + "grad_norm": 1.788230538368225, + "learning_rate": 4.460907175698837e-05, + "loss": 5.2686, + "step": 35814 + }, + { + "epoch": 0.21300195070891617, + "grad_norm": 1.6524558067321777, + "learning_rate": 4.46087820112901e-05, + "loss": 5.2969, + "step": 35815 + }, + { + "epoch": 0.21300789799219716, + "grad_norm": 1.5646259784698486, + "learning_rate": 4.460849225874659e-05, + "loss": 5.1971, + "step": 35816 + }, + { + "epoch": 0.21301384527547818, + "grad_norm": 2.1101200580596924, + "learning_rate": 4.460820249935798e-05, + "loss": 4.8999, + "step": 35817 + }, + { + "epoch": 0.21301979255875916, + "grad_norm": 1.7931146621704102, + "learning_rate": 4.460791273312433e-05, + "loss": 5.7204, + "step": 35818 + }, + { + "epoch": 0.21302573984204015, + "grad_norm": 1.9258630275726318, + "learning_rate": 4.460762296004577e-05, + "loss": 5.4044, + "step": 35819 + }, + { + "epoch": 0.21303168712532114, + "grad_norm": 3.792379140853882, + "learning_rate": 4.460733318012239e-05, + "loss": 2.9617, + "step": 35820 + }, + { + "epoch": 0.21303763440860216, + "grad_norm": 3.076469659805298, + "learning_rate": 4.46070433933543e-05, + "loss": 4.0408, + "step": 35821 + }, + { + "epoch": 0.21304358169188314, + "grad_norm": 3.011936902999878, + "learning_rate": 4.460675359974158e-05, + "loss": 2.7702, + "step": 35822 + }, + { + "epoch": 0.21304952897516413, + "grad_norm": 3.1061980724334717, + "learning_rate": 4.460646379928435e-05, + "loss": 2.0251, + "step": 35823 + }, + { + "epoch": 0.21305547625844515, + "grad_norm": 2.8176026344299316, + "learning_rate": 4.460617399198271e-05, + "loss": 3.023, + "step": 35824 + }, + { + "epoch": 0.21306142354172614, + "grad_norm": 3.274871826171875, + "learning_rate": 4.460588417783675e-05, + "loss": 3.5582, + "step": 35825 + }, + { + "epoch": 0.21306737082500712, + "grad_norm": 2.003629684448242, + "learning_rate": 4.4605594356846594e-05, + "loss": 3.5854, + "step": 35826 + }, + { + "epoch": 0.21307331810828814, + "grad_norm": 1.5609272718429565, + "learning_rate": 4.460530452901231e-05, + "loss": 4.5643, + "step": 35827 + }, + { + "epoch": 0.21307926539156913, + "grad_norm": 1.874121904373169, + "learning_rate": 4.4605014694334024e-05, + "loss": 4.8483, + "step": 35828 + }, + { + "epoch": 0.21308521267485012, + "grad_norm": 1.758209228515625, + "learning_rate": 4.460472485281183e-05, + "loss": 4.9385, + "step": 35829 + }, + { + "epoch": 0.21309115995813113, + "grad_norm": 1.755028486251831, + "learning_rate": 4.4604435004445824e-05, + "loss": 5.0858, + "step": 35830 + }, + { + "epoch": 0.21309710724141212, + "grad_norm": 1.6154873371124268, + "learning_rate": 4.460414514923612e-05, + "loss": 4.2717, + "step": 35831 + }, + { + "epoch": 0.2131030545246931, + "grad_norm": 1.7695956230163574, + "learning_rate": 4.4603855287182806e-05, + "loss": 3.6908, + "step": 35832 + }, + { + "epoch": 0.21310900180797412, + "grad_norm": 1.7642066478729248, + "learning_rate": 4.4603565418285996e-05, + "loss": 3.6801, + "step": 35833 + }, + { + "epoch": 0.2131149490912551, + "grad_norm": 1.4951072931289673, + "learning_rate": 4.460327554254578e-05, + "loss": 3.7037, + "step": 35834 + }, + { + "epoch": 0.2131208963745361, + "grad_norm": 1.7861125469207764, + "learning_rate": 4.460298565996226e-05, + "loss": 3.7424, + "step": 35835 + }, + { + "epoch": 0.21312684365781712, + "grad_norm": 2.4425766468048096, + "learning_rate": 4.4602695770535544e-05, + "loss": 4.6082, + "step": 35836 + }, + { + "epoch": 0.2131327909410981, + "grad_norm": 1.7068989276885986, + "learning_rate": 4.460240587426572e-05, + "loss": 3.7878, + "step": 35837 + }, + { + "epoch": 0.2131387382243791, + "grad_norm": 1.6994092464447021, + "learning_rate": 4.4602115971152905e-05, + "loss": 3.781, + "step": 35838 + }, + { + "epoch": 0.2131446855076601, + "grad_norm": 1.6545926332473755, + "learning_rate": 4.46018260611972e-05, + "loss": 3.7755, + "step": 35839 + }, + { + "epoch": 0.2131506327909411, + "grad_norm": 1.654785394668579, + "learning_rate": 4.4601536144398695e-05, + "loss": 3.6985, + "step": 35840 + }, + { + "epoch": 0.21315658007422209, + "grad_norm": 2.636845588684082, + "learning_rate": 4.460124622075749e-05, + "loss": 4.2517, + "step": 35841 + }, + { + "epoch": 0.2131625273575031, + "grad_norm": 2.595813751220703, + "learning_rate": 4.46009562902737e-05, + "loss": 4.2488, + "step": 35842 + }, + { + "epoch": 0.2131684746407841, + "grad_norm": 2.3771018981933594, + "learning_rate": 4.4600666352947416e-05, + "loss": 4.1553, + "step": 35843 + }, + { + "epoch": 0.21317442192406508, + "grad_norm": 2.2217776775360107, + "learning_rate": 4.4600376408778746e-05, + "loss": 4.1849, + "step": 35844 + }, + { + "epoch": 0.2131803692073461, + "grad_norm": 2.8838157653808594, + "learning_rate": 4.4600086457767784e-05, + "loss": 4.0867, + "step": 35845 + }, + { + "epoch": 0.21318631649062708, + "grad_norm": 2.105971574783325, + "learning_rate": 4.459979649991464e-05, + "loss": 4.3621, + "step": 35846 + }, + { + "epoch": 0.21319226377390807, + "grad_norm": 2.231476306915283, + "learning_rate": 4.45995065352194e-05, + "loss": 4.2008, + "step": 35847 + }, + { + "epoch": 0.21319821105718909, + "grad_norm": 2.104140281677246, + "learning_rate": 4.459921656368218e-05, + "loss": 3.914, + "step": 35848 + }, + { + "epoch": 0.21320415834047007, + "grad_norm": 2.1466448307037354, + "learning_rate": 4.459892658530307e-05, + "loss": 4.1908, + "step": 35849 + }, + { + "epoch": 0.21321010562375106, + "grad_norm": 2.4501988887786865, + "learning_rate": 4.459863660008218e-05, + "loss": 4.3681, + "step": 35850 + }, + { + "epoch": 0.21321605290703208, + "grad_norm": 2.4947612285614014, + "learning_rate": 4.459834660801961e-05, + "loss": 4.2168, + "step": 35851 + }, + { + "epoch": 0.21322200019031307, + "grad_norm": 2.237306594848633, + "learning_rate": 4.459805660911546e-05, + "loss": 4.2742, + "step": 35852 + }, + { + "epoch": 0.21322794747359405, + "grad_norm": 2.4983670711517334, + "learning_rate": 4.4597766603369834e-05, + "loss": 4.204, + "step": 35853 + }, + { + "epoch": 0.21323389475687507, + "grad_norm": 2.3211803436279297, + "learning_rate": 4.459747659078283e-05, + "loss": 4.1786, + "step": 35854 + }, + { + "epoch": 0.21323984204015606, + "grad_norm": 2.4706544876098633, + "learning_rate": 4.4597186571354544e-05, + "loss": 4.3427, + "step": 35855 + }, + { + "epoch": 0.21324578932343705, + "grad_norm": 2.552676200866699, + "learning_rate": 4.4596896545085084e-05, + "loss": 4.3238, + "step": 35856 + }, + { + "epoch": 0.21325173660671806, + "grad_norm": 2.366426467895508, + "learning_rate": 4.459660651197455e-05, + "loss": 4.4553, + "step": 35857 + }, + { + "epoch": 0.21325768388999905, + "grad_norm": 2.8086371421813965, + "learning_rate": 4.4596316472023044e-05, + "loss": 4.2748, + "step": 35858 + }, + { + "epoch": 0.21326363117328004, + "grad_norm": 2.2683427333831787, + "learning_rate": 4.459602642523067e-05, + "loss": 4.1705, + "step": 35859 + }, + { + "epoch": 0.21326957845656105, + "grad_norm": 2.0883960723876953, + "learning_rate": 4.459573637159752e-05, + "loss": 4.2445, + "step": 35860 + }, + { + "epoch": 0.21327552573984204, + "grad_norm": 2.2819952964782715, + "learning_rate": 4.45954463111237e-05, + "loss": 4.4289, + "step": 35861 + }, + { + "epoch": 0.21328147302312303, + "grad_norm": 2.1826071739196777, + "learning_rate": 4.459515624380932e-05, + "loss": 4.1867, + "step": 35862 + }, + { + "epoch": 0.21328742030640405, + "grad_norm": 2.458500623703003, + "learning_rate": 4.459486616965447e-05, + "loss": 4.2723, + "step": 35863 + }, + { + "epoch": 0.21329336758968503, + "grad_norm": 2.137686252593994, + "learning_rate": 4.459457608865925e-05, + "loss": 4.2249, + "step": 35864 + }, + { + "epoch": 0.21329931487296602, + "grad_norm": 1.9973599910736084, + "learning_rate": 4.459428600082377e-05, + "loss": 4.2212, + "step": 35865 + }, + { + "epoch": 0.21330526215624704, + "grad_norm": 2.1852917671203613, + "learning_rate": 4.459399590614813e-05, + "loss": 4.1206, + "step": 35866 + }, + { + "epoch": 0.21331120943952803, + "grad_norm": 2.2127127647399902, + "learning_rate": 4.459370580463242e-05, + "loss": 4.2551, + "step": 35867 + }, + { + "epoch": 0.21331715672280901, + "grad_norm": 2.2800424098968506, + "learning_rate": 4.459341569627675e-05, + "loss": 4.142, + "step": 35868 + }, + { + "epoch": 0.21332310400609003, + "grad_norm": 2.3102056980133057, + "learning_rate": 4.459312558108123e-05, + "loss": 4.1007, + "step": 35869 + }, + { + "epoch": 0.21332905128937102, + "grad_norm": 2.5117461681365967, + "learning_rate": 4.459283545904595e-05, + "loss": 3.9965, + "step": 35870 + }, + { + "epoch": 0.213334998572652, + "grad_norm": 2.34240460395813, + "learning_rate": 4.459254533017101e-05, + "loss": 4.0532, + "step": 35871 + }, + { + "epoch": 0.21334094585593302, + "grad_norm": 2.803379774093628, + "learning_rate": 4.459225519445652e-05, + "loss": 3.9698, + "step": 35872 + }, + { + "epoch": 0.213346893139214, + "grad_norm": 2.3026621341705322, + "learning_rate": 4.4591965051902574e-05, + "loss": 4.1482, + "step": 35873 + }, + { + "epoch": 0.213352840422495, + "grad_norm": 1.9966895580291748, + "learning_rate": 4.459167490250927e-05, + "loss": 4.3117, + "step": 35874 + }, + { + "epoch": 0.21335878770577602, + "grad_norm": 3.08508563041687, + "learning_rate": 4.459138474627672e-05, + "loss": 3.4137, + "step": 35875 + }, + { + "epoch": 0.213364734989057, + "grad_norm": 3.1647846698760986, + "learning_rate": 4.459109458320502e-05, + "loss": 3.6346, + "step": 35876 + }, + { + "epoch": 0.213370682272338, + "grad_norm": 2.066763162612915, + "learning_rate": 4.459080441329426e-05, + "loss": 4.5963, + "step": 35877 + }, + { + "epoch": 0.21337662955561898, + "grad_norm": 1.815376877784729, + "learning_rate": 4.4590514236544567e-05, + "loss": 4.8914, + "step": 35878 + }, + { + "epoch": 0.2133825768389, + "grad_norm": 2.9240071773529053, + "learning_rate": 4.459022405295602e-05, + "loss": 3.5932, + "step": 35879 + }, + { + "epoch": 0.21338852412218098, + "grad_norm": 2.881493330001831, + "learning_rate": 4.458993386252874e-05, + "loss": 3.2032, + "step": 35880 + }, + { + "epoch": 0.21339447140546197, + "grad_norm": 2.6276941299438477, + "learning_rate": 4.45896436652628e-05, + "loss": 3.2093, + "step": 35881 + }, + { + "epoch": 0.213400418688743, + "grad_norm": 2.8940045833587646, + "learning_rate": 4.4589353461158335e-05, + "loss": 4.2197, + "step": 35882 + }, + { + "epoch": 0.21340636597202398, + "grad_norm": 2.8076045513153076, + "learning_rate": 4.458906325021541e-05, + "loss": 3.1704, + "step": 35883 + }, + { + "epoch": 0.21341231325530496, + "grad_norm": 2.643134117126465, + "learning_rate": 4.458877303243416e-05, + "loss": 3.3753, + "step": 35884 + }, + { + "epoch": 0.21341826053858598, + "grad_norm": 2.9000542163848877, + "learning_rate": 4.458848280781467e-05, + "loss": 3.6017, + "step": 35885 + }, + { + "epoch": 0.21342420782186697, + "grad_norm": 2.967768907546997, + "learning_rate": 4.4588192576357036e-05, + "loss": 3.2405, + "step": 35886 + }, + { + "epoch": 0.21343015510514796, + "grad_norm": 2.1402149200439453, + "learning_rate": 4.458790233806137e-05, + "loss": 4.0389, + "step": 35887 + }, + { + "epoch": 0.21343610238842897, + "grad_norm": 1.6989480257034302, + "learning_rate": 4.4587612092927774e-05, + "loss": 4.1777, + "step": 35888 + }, + { + "epoch": 0.21344204967170996, + "grad_norm": 1.930235743522644, + "learning_rate": 4.4587321840956336e-05, + "loss": 4.243, + "step": 35889 + }, + { + "epoch": 0.21344799695499095, + "grad_norm": 1.681248426437378, + "learning_rate": 4.4587031582147174e-05, + "loss": 4.3933, + "step": 35890 + }, + { + "epoch": 0.21345394423827196, + "grad_norm": 1.8064907789230347, + "learning_rate": 4.458674131650038e-05, + "loss": 5.1875, + "step": 35891 + }, + { + "epoch": 0.21345989152155295, + "grad_norm": 1.755428671836853, + "learning_rate": 4.458645104401605e-05, + "loss": 4.5157, + "step": 35892 + }, + { + "epoch": 0.21346583880483394, + "grad_norm": 1.8016186952590942, + "learning_rate": 4.45861607646943e-05, + "loss": 4.1718, + "step": 35893 + }, + { + "epoch": 0.21347178608811496, + "grad_norm": 1.8302110433578491, + "learning_rate": 4.458587047853522e-05, + "loss": 4.2413, + "step": 35894 + }, + { + "epoch": 0.21347773337139594, + "grad_norm": 1.8271868228912354, + "learning_rate": 4.458558018553892e-05, + "loss": 4.2252, + "step": 35895 + }, + { + "epoch": 0.21348368065467693, + "grad_norm": 1.774984359741211, + "learning_rate": 4.4585289885705495e-05, + "loss": 4.1691, + "step": 35896 + }, + { + "epoch": 0.21348962793795795, + "grad_norm": 1.678552269935608, + "learning_rate": 4.458499957903505e-05, + "loss": 4.4492, + "step": 35897 + }, + { + "epoch": 0.21349557522123894, + "grad_norm": 1.810869812965393, + "learning_rate": 4.458470926552767e-05, + "loss": 4.302, + "step": 35898 + }, + { + "epoch": 0.21350152250451992, + "grad_norm": 1.6837462186813354, + "learning_rate": 4.458441894518348e-05, + "loss": 4.2768, + "step": 35899 + }, + { + "epoch": 0.21350746978780094, + "grad_norm": 1.5244439840316772, + "learning_rate": 4.458412861800257e-05, + "loss": 4.4128, + "step": 35900 + }, + { + "epoch": 0.21351341707108193, + "grad_norm": 1.6847096681594849, + "learning_rate": 4.4583838283985043e-05, + "loss": 4.0461, + "step": 35901 + }, + { + "epoch": 0.21351936435436292, + "grad_norm": 1.9673925638198853, + "learning_rate": 4.458354794313101e-05, + "loss": 4.11, + "step": 35902 + }, + { + "epoch": 0.21352531163764393, + "grad_norm": 1.8580288887023926, + "learning_rate": 4.458325759544055e-05, + "loss": 4.294, + "step": 35903 + }, + { + "epoch": 0.21353125892092492, + "grad_norm": 1.7768099308013916, + "learning_rate": 4.458296724091379e-05, + "loss": 4.358, + "step": 35904 + }, + { + "epoch": 0.2135372062042059, + "grad_norm": 1.7895678281784058, + "learning_rate": 4.45826768795508e-05, + "loss": 4.3601, + "step": 35905 + }, + { + "epoch": 0.21354315348748693, + "grad_norm": 1.8596075773239136, + "learning_rate": 4.4582386511351714e-05, + "loss": 4.8139, + "step": 35906 + }, + { + "epoch": 0.2135491007707679, + "grad_norm": 1.6231931447982788, + "learning_rate": 4.4582096136316614e-05, + "loss": 5.1909, + "step": 35907 + }, + { + "epoch": 0.2135550480540489, + "grad_norm": 1.7343584299087524, + "learning_rate": 4.458180575444561e-05, + "loss": 4.6373, + "step": 35908 + }, + { + "epoch": 0.21356099533732992, + "grad_norm": 1.5508018732070923, + "learning_rate": 4.45815153657388e-05, + "loss": 4.7082, + "step": 35909 + }, + { + "epoch": 0.2135669426206109, + "grad_norm": 1.599245548248291, + "learning_rate": 4.458122497019628e-05, + "loss": 4.8541, + "step": 35910 + }, + { + "epoch": 0.2135728899038919, + "grad_norm": 1.624755859375, + "learning_rate": 4.4580934567818164e-05, + "loss": 5.0132, + "step": 35911 + }, + { + "epoch": 0.2135788371871729, + "grad_norm": 1.705117106437683, + "learning_rate": 4.458064415860454e-05, + "loss": 5.1333, + "step": 35912 + }, + { + "epoch": 0.2135847844704539, + "grad_norm": 1.5134979486465454, + "learning_rate": 4.4580353742555515e-05, + "loss": 4.9791, + "step": 35913 + }, + { + "epoch": 0.21359073175373489, + "grad_norm": 1.5711792707443237, + "learning_rate": 4.458006331967119e-05, + "loss": 4.8364, + "step": 35914 + }, + { + "epoch": 0.2135966790370159, + "grad_norm": 1.481801986694336, + "learning_rate": 4.457977288995168e-05, + "loss": 4.897, + "step": 35915 + }, + { + "epoch": 0.2136026263202969, + "grad_norm": 1.4712084531784058, + "learning_rate": 4.457948245339706e-05, + "loss": 4.7349, + "step": 35916 + }, + { + "epoch": 0.21360857360357788, + "grad_norm": 1.7659448385238647, + "learning_rate": 4.457919201000745e-05, + "loss": 5.0048, + "step": 35917 + }, + { + "epoch": 0.2136145208868589, + "grad_norm": 1.469093918800354, + "learning_rate": 4.4578901559782944e-05, + "loss": 5.0344, + "step": 35918 + }, + { + "epoch": 0.21362046817013988, + "grad_norm": 1.451316237449646, + "learning_rate": 4.457861110272365e-05, + "loss": 4.7897, + "step": 35919 + }, + { + "epoch": 0.21362641545342087, + "grad_norm": 1.4311710596084595, + "learning_rate": 4.4578320638829655e-05, + "loss": 4.8051, + "step": 35920 + }, + { + "epoch": 0.2136323627367019, + "grad_norm": 1.1662811040878296, + "learning_rate": 4.457803016810108e-05, + "loss": 5.0022, + "step": 35921 + }, + { + "epoch": 0.21363831001998287, + "grad_norm": 1.424172282218933, + "learning_rate": 4.457773969053801e-05, + "loss": 4.921, + "step": 35922 + }, + { + "epoch": 0.21364425730326386, + "grad_norm": 1.396257758140564, + "learning_rate": 4.4577449206140564e-05, + "loss": 5.1584, + "step": 35923 + }, + { + "epoch": 0.21365020458654488, + "grad_norm": 1.4995923042297363, + "learning_rate": 4.457715871490882e-05, + "loss": 5.0498, + "step": 35924 + }, + { + "epoch": 0.21365615186982587, + "grad_norm": 1.8978110551834106, + "learning_rate": 4.4576868216842904e-05, + "loss": 4.8733, + "step": 35925 + }, + { + "epoch": 0.21366209915310685, + "grad_norm": 1.7578836679458618, + "learning_rate": 4.45765777119429e-05, + "loss": 4.9622, + "step": 35926 + }, + { + "epoch": 0.21366804643638787, + "grad_norm": 1.5519132614135742, + "learning_rate": 4.457628720020891e-05, + "loss": 4.8019, + "step": 35927 + }, + { + "epoch": 0.21367399371966886, + "grad_norm": 2.0452818870544434, + "learning_rate": 4.4575996681641054e-05, + "loss": 4.9459, + "step": 35928 + }, + { + "epoch": 0.21367994100294985, + "grad_norm": 1.705398678779602, + "learning_rate": 4.4575706156239405e-05, + "loss": 5.2987, + "step": 35929 + }, + { + "epoch": 0.21368588828623086, + "grad_norm": 1.7086260318756104, + "learning_rate": 4.457541562400409e-05, + "loss": 4.7203, + "step": 35930 + }, + { + "epoch": 0.21369183556951185, + "grad_norm": 1.47868812084198, + "learning_rate": 4.4575125084935186e-05, + "loss": 5.1416, + "step": 35931 + }, + { + "epoch": 0.21369778285279284, + "grad_norm": 1.5206907987594604, + "learning_rate": 4.4574834539032826e-05, + "loss": 5.2395, + "step": 35932 + }, + { + "epoch": 0.21370373013607386, + "grad_norm": 1.540887713432312, + "learning_rate": 4.457454398629708e-05, + "loss": 5.071, + "step": 35933 + }, + { + "epoch": 0.21370967741935484, + "grad_norm": 1.546628713607788, + "learning_rate": 4.4574253426728066e-05, + "loss": 5.1932, + "step": 35934 + }, + { + "epoch": 0.21371562470263583, + "grad_norm": 1.783042550086975, + "learning_rate": 4.457396286032589e-05, + "loss": 5.0621, + "step": 35935 + }, + { + "epoch": 0.21372157198591682, + "grad_norm": 1.8897498846054077, + "learning_rate": 4.4573672287090637e-05, + "loss": 4.7175, + "step": 35936 + }, + { + "epoch": 0.21372751926919784, + "grad_norm": 1.9781421422958374, + "learning_rate": 4.457338170702242e-05, + "loss": 4.7669, + "step": 35937 + }, + { + "epoch": 0.21373346655247882, + "grad_norm": 1.9311988353729248, + "learning_rate": 4.457309112012134e-05, + "loss": 5.278, + "step": 35938 + }, + { + "epoch": 0.2137394138357598, + "grad_norm": 1.77422034740448, + "learning_rate": 4.4572800526387495e-05, + "loss": 4.733, + "step": 35939 + }, + { + "epoch": 0.21374536111904083, + "grad_norm": 1.7609598636627197, + "learning_rate": 4.457250992582098e-05, + "loss": 4.5193, + "step": 35940 + }, + { + "epoch": 0.21375130840232182, + "grad_norm": 1.7665215730667114, + "learning_rate": 4.4572219318421916e-05, + "loss": 4.4299, + "step": 35941 + }, + { + "epoch": 0.2137572556856028, + "grad_norm": 1.8990488052368164, + "learning_rate": 4.4571928704190384e-05, + "loss": 4.395, + "step": 35942 + }, + { + "epoch": 0.21376320296888382, + "grad_norm": 1.751051425933838, + "learning_rate": 4.45716380831265e-05, + "loss": 4.5123, + "step": 35943 + }, + { + "epoch": 0.2137691502521648, + "grad_norm": 2.0244052410125732, + "learning_rate": 4.4571347455230356e-05, + "loss": 5.1076, + "step": 35944 + }, + { + "epoch": 0.2137750975354458, + "grad_norm": 1.783740520477295, + "learning_rate": 4.4571056820502056e-05, + "loss": 5.2825, + "step": 35945 + }, + { + "epoch": 0.2137810448187268, + "grad_norm": 1.5837445259094238, + "learning_rate": 4.4570766178941704e-05, + "loss": 5.2201, + "step": 35946 + }, + { + "epoch": 0.2137869921020078, + "grad_norm": 1.6283888816833496, + "learning_rate": 4.4570475530549394e-05, + "loss": 5.2683, + "step": 35947 + }, + { + "epoch": 0.2137929393852888, + "grad_norm": 1.8020168542861938, + "learning_rate": 4.4570184875325235e-05, + "loss": 5.0123, + "step": 35948 + }, + { + "epoch": 0.2137988866685698, + "grad_norm": 1.8496737480163574, + "learning_rate": 4.4569894213269335e-05, + "loss": 4.9256, + "step": 35949 + }, + { + "epoch": 0.2138048339518508, + "grad_norm": 1.6990101337432861, + "learning_rate": 4.456960354438178e-05, + "loss": 4.5923, + "step": 35950 + }, + { + "epoch": 0.21381078123513178, + "grad_norm": 2.2053756713867188, + "learning_rate": 4.4569312868662686e-05, + "loss": 4.8017, + "step": 35951 + }, + { + "epoch": 0.2138167285184128, + "grad_norm": 1.7671394348144531, + "learning_rate": 4.456902218611214e-05, + "loss": 4.7598, + "step": 35952 + }, + { + "epoch": 0.21382267580169378, + "grad_norm": 1.6970982551574707, + "learning_rate": 4.456873149673025e-05, + "loss": 4.7862, + "step": 35953 + }, + { + "epoch": 0.21382862308497477, + "grad_norm": 1.9007402658462524, + "learning_rate": 4.456844080051712e-05, + "loss": 4.7001, + "step": 35954 + }, + { + "epoch": 0.2138345703682558, + "grad_norm": 1.4777690172195435, + "learning_rate": 4.4568150097472846e-05, + "loss": 4.6834, + "step": 35955 + }, + { + "epoch": 0.21384051765153678, + "grad_norm": 1.5025019645690918, + "learning_rate": 4.4567859387597545e-05, + "loss": 4.7556, + "step": 35956 + }, + { + "epoch": 0.21384646493481776, + "grad_norm": 1.7506788969039917, + "learning_rate": 4.456756867089129e-05, + "loss": 4.7992, + "step": 35957 + }, + { + "epoch": 0.21385241221809878, + "grad_norm": 1.5796469449996948, + "learning_rate": 4.4567277947354215e-05, + "loss": 4.6379, + "step": 35958 + }, + { + "epoch": 0.21385835950137977, + "grad_norm": 1.9620283842086792, + "learning_rate": 4.456698721698639e-05, + "loss": 4.6841, + "step": 35959 + }, + { + "epoch": 0.21386430678466076, + "grad_norm": 2.463955879211426, + "learning_rate": 4.456669647978794e-05, + "loss": 4.2983, + "step": 35960 + }, + { + "epoch": 0.21387025406794177, + "grad_norm": 1.619341492652893, + "learning_rate": 4.456640573575896e-05, + "loss": 4.6181, + "step": 35961 + }, + { + "epoch": 0.21387620135122276, + "grad_norm": 1.8905354738235474, + "learning_rate": 4.456611498489955e-05, + "loss": 5.2374, + "step": 35962 + }, + { + "epoch": 0.21388214863450375, + "grad_norm": 1.745563268661499, + "learning_rate": 4.456582422720981e-05, + "loss": 5.1386, + "step": 35963 + }, + { + "epoch": 0.21388809591778477, + "grad_norm": 2.177422046661377, + "learning_rate": 4.456553346268983e-05, + "loss": 4.2874, + "step": 35964 + }, + { + "epoch": 0.21389404320106575, + "grad_norm": 2.8467772006988525, + "learning_rate": 4.4565242691339746e-05, + "loss": 3.5243, + "step": 35965 + }, + { + "epoch": 0.21389999048434674, + "grad_norm": 2.371044635772705, + "learning_rate": 4.456495191315963e-05, + "loss": 3.4414, + "step": 35966 + }, + { + "epoch": 0.21390593776762776, + "grad_norm": 2.501070737838745, + "learning_rate": 4.456466112814959e-05, + "loss": 3.2322, + "step": 35967 + }, + { + "epoch": 0.21391188505090875, + "grad_norm": 2.228771448135376, + "learning_rate": 4.456437033630972e-05, + "loss": 3.1681, + "step": 35968 + }, + { + "epoch": 0.21391783233418973, + "grad_norm": 2.327960729598999, + "learning_rate": 4.456407953764015e-05, + "loss": 3.8851, + "step": 35969 + }, + { + "epoch": 0.21392377961747075, + "grad_norm": 1.7042254209518433, + "learning_rate": 4.456378873214094e-05, + "loss": 4.8426, + "step": 35970 + }, + { + "epoch": 0.21392972690075174, + "grad_norm": 3.039768695831299, + "learning_rate": 4.456349791981223e-05, + "loss": 3.2656, + "step": 35971 + }, + { + "epoch": 0.21393567418403273, + "grad_norm": 4.14467191696167, + "learning_rate": 4.45632071006541e-05, + "loss": 2.8295, + "step": 35972 + }, + { + "epoch": 0.21394162146731374, + "grad_norm": 3.6734323501586914, + "learning_rate": 4.456291627466665e-05, + "loss": 2.6344, + "step": 35973 + }, + { + "epoch": 0.21394756875059473, + "grad_norm": 3.536844491958618, + "learning_rate": 4.4562625441849994e-05, + "loss": 2.5706, + "step": 35974 + }, + { + "epoch": 0.21395351603387572, + "grad_norm": 2.9864909648895264, + "learning_rate": 4.456233460220423e-05, + "loss": 2.7415, + "step": 35975 + }, + { + "epoch": 0.21395946331715673, + "grad_norm": 3.0675835609436035, + "learning_rate": 4.456204375572945e-05, + "loss": 3.1035, + "step": 35976 + }, + { + "epoch": 0.21396541060043772, + "grad_norm": 3.9746904373168945, + "learning_rate": 4.456175290242577e-05, + "loss": 3.0007, + "step": 35977 + }, + { + "epoch": 0.2139713578837187, + "grad_norm": 5.928574562072754, + "learning_rate": 4.456146204229328e-05, + "loss": 2.6625, + "step": 35978 + }, + { + "epoch": 0.21397730516699973, + "grad_norm": 4.5435309410095215, + "learning_rate": 4.45611711753321e-05, + "loss": 2.2242, + "step": 35979 + }, + { + "epoch": 0.21398325245028071, + "grad_norm": 4.220280647277832, + "learning_rate": 4.4560880301542293e-05, + "loss": 2.2737, + "step": 35980 + }, + { + "epoch": 0.2139891997335617, + "grad_norm": 4.68203592300415, + "learning_rate": 4.4560589420923995e-05, + "loss": 2.8215, + "step": 35981 + }, + { + "epoch": 0.21399514701684272, + "grad_norm": 4.116830348968506, + "learning_rate": 4.4560298533477304e-05, + "loss": 2.6675, + "step": 35982 + }, + { + "epoch": 0.2140010943001237, + "grad_norm": 3.708685874938965, + "learning_rate": 4.456000763920231e-05, + "loss": 2.6958, + "step": 35983 + }, + { + "epoch": 0.2140070415834047, + "grad_norm": 3.6551620960235596, + "learning_rate": 4.455971673809912e-05, + "loss": 2.498, + "step": 35984 + }, + { + "epoch": 0.2140129888666857, + "grad_norm": 3.9570088386535645, + "learning_rate": 4.455942583016783e-05, + "loss": 2.1696, + "step": 35985 + }, + { + "epoch": 0.2140189361499667, + "grad_norm": 3.8783881664276123, + "learning_rate": 4.455913491540855e-05, + "loss": 3.3353, + "step": 35986 + }, + { + "epoch": 0.2140248834332477, + "grad_norm": 3.2755517959594727, + "learning_rate": 4.4558843993821374e-05, + "loss": 3.4415, + "step": 35987 + }, + { + "epoch": 0.2140308307165287, + "grad_norm": 3.13720965385437, + "learning_rate": 4.4558553065406416e-05, + "loss": 2.4986, + "step": 35988 + }, + { + "epoch": 0.2140367779998097, + "grad_norm": 3.374295473098755, + "learning_rate": 4.455826213016376e-05, + "loss": 1.6225, + "step": 35989 + }, + { + "epoch": 0.21404272528309068, + "grad_norm": 2.5789332389831543, + "learning_rate": 4.455797118809353e-05, + "loss": 1.8221, + "step": 35990 + }, + { + "epoch": 0.2140486725663717, + "grad_norm": 1.7907307147979736, + "learning_rate": 4.45576802391958e-05, + "loss": 5.1448, + "step": 35991 + }, + { + "epoch": 0.21405461984965268, + "grad_norm": 1.7972261905670166, + "learning_rate": 4.4557389283470694e-05, + "loss": 5.1784, + "step": 35992 + }, + { + "epoch": 0.21406056713293367, + "grad_norm": 1.7984882593154907, + "learning_rate": 4.455709832091831e-05, + "loss": 5.1594, + "step": 35993 + }, + { + "epoch": 0.21406651441621466, + "grad_norm": 1.9921157360076904, + "learning_rate": 4.455680735153873e-05, + "loss": 5.0676, + "step": 35994 + }, + { + "epoch": 0.21407246169949568, + "grad_norm": 2.3399744033813477, + "learning_rate": 4.455651637533208e-05, + "loss": 4.9908, + "step": 35995 + }, + { + "epoch": 0.21407840898277666, + "grad_norm": 2.1457231044769287, + "learning_rate": 4.455622539229846e-05, + "loss": 4.7422, + "step": 35996 + }, + { + "epoch": 0.21408435626605765, + "grad_norm": 2.49147629737854, + "learning_rate": 4.455593440243795e-05, + "loss": 4.384, + "step": 35997 + }, + { + "epoch": 0.21409030354933867, + "grad_norm": 3.087649345397949, + "learning_rate": 4.455564340575067e-05, + "loss": 4.1671, + "step": 35998 + }, + { + "epoch": 0.21409625083261966, + "grad_norm": 2.1485769748687744, + "learning_rate": 4.4555352402236715e-05, + "loss": 4.7317, + "step": 35999 + }, + { + "epoch": 0.21410219811590064, + "grad_norm": 1.8602612018585205, + "learning_rate": 4.455506139189619e-05, + "loss": 4.6042, + "step": 36000 + }, + { + "epoch": 0.21410814539918166, + "grad_norm": 2.006908655166626, + "learning_rate": 4.45547703747292e-05, + "loss": 4.5201, + "step": 36001 + }, + { + "epoch": 0.21411409268246265, + "grad_norm": 1.8241304159164429, + "learning_rate": 4.4554479350735836e-05, + "loss": 4.5176, + "step": 36002 + }, + { + "epoch": 0.21412003996574364, + "grad_norm": 1.69816255569458, + "learning_rate": 4.455418831991621e-05, + "loss": 4.4631, + "step": 36003 + }, + { + "epoch": 0.21412598724902465, + "grad_norm": 1.821662425994873, + "learning_rate": 4.4553897282270415e-05, + "loss": 4.3294, + "step": 36004 + }, + { + "epoch": 0.21413193453230564, + "grad_norm": 1.6003782749176025, + "learning_rate": 4.455360623779855e-05, + "loss": 4.9333, + "step": 36005 + }, + { + "epoch": 0.21413788181558663, + "grad_norm": 1.675066351890564, + "learning_rate": 4.455331518650073e-05, + "loss": 5.4692, + "step": 36006 + }, + { + "epoch": 0.21414382909886764, + "grad_norm": 1.792047142982483, + "learning_rate": 4.455302412837705e-05, + "loss": 4.8869, + "step": 36007 + }, + { + "epoch": 0.21414977638214863, + "grad_norm": 2.5633020401000977, + "learning_rate": 4.455273306342762e-05, + "loss": 3.7999, + "step": 36008 + }, + { + "epoch": 0.21415572366542962, + "grad_norm": 2.494217872619629, + "learning_rate": 4.455244199165252e-05, + "loss": 3.7959, + "step": 36009 + }, + { + "epoch": 0.21416167094871064, + "grad_norm": 2.7078194618225098, + "learning_rate": 4.4552150913051874e-05, + "loss": 4.072, + "step": 36010 + }, + { + "epoch": 0.21416761823199162, + "grad_norm": 2.8123793601989746, + "learning_rate": 4.4551859827625766e-05, + "loss": 3.8776, + "step": 36011 + }, + { + "epoch": 0.2141735655152726, + "grad_norm": 2.320986270904541, + "learning_rate": 4.455156873537431e-05, + "loss": 3.9132, + "step": 36012 + }, + { + "epoch": 0.21417951279855363, + "grad_norm": 2.0167579650878906, + "learning_rate": 4.455127763629759e-05, + "loss": 4.1274, + "step": 36013 + }, + { + "epoch": 0.21418546008183462, + "grad_norm": 1.4378185272216797, + "learning_rate": 4.4550986530395744e-05, + "loss": 5.4604, + "step": 36014 + }, + { + "epoch": 0.2141914073651156, + "grad_norm": 1.6383719444274902, + "learning_rate": 4.4550695417668836e-05, + "loss": 4.9049, + "step": 36015 + }, + { + "epoch": 0.21419735464839662, + "grad_norm": 2.096296548843384, + "learning_rate": 4.455040429811699e-05, + "loss": 4.8674, + "step": 36016 + }, + { + "epoch": 0.2142033019316776, + "grad_norm": 3.1170642375946045, + "learning_rate": 4.4550113171740295e-05, + "loss": 3.7861, + "step": 36017 + }, + { + "epoch": 0.2142092492149586, + "grad_norm": 2.940704822540283, + "learning_rate": 4.454982203853886e-05, + "loss": 3.4928, + "step": 36018 + }, + { + "epoch": 0.2142151964982396, + "grad_norm": 2.2609074115753174, + "learning_rate": 4.4549530898512784e-05, + "loss": 4.6711, + "step": 36019 + }, + { + "epoch": 0.2142211437815206, + "grad_norm": 1.8117674589157104, + "learning_rate": 4.454923975166216e-05, + "loss": 4.8602, + "step": 36020 + }, + { + "epoch": 0.2142270910648016, + "grad_norm": 2.5625758171081543, + "learning_rate": 4.454894859798711e-05, + "loss": 4.5147, + "step": 36021 + }, + { + "epoch": 0.2142330383480826, + "grad_norm": 2.0803611278533936, + "learning_rate": 4.454865743748772e-05, + "loss": 4.1966, + "step": 36022 + }, + { + "epoch": 0.2142389856313636, + "grad_norm": 2.294630527496338, + "learning_rate": 4.454836627016409e-05, + "loss": 4.3761, + "step": 36023 + }, + { + "epoch": 0.21424493291464458, + "grad_norm": 2.7524173259735107, + "learning_rate": 4.454807509601633e-05, + "loss": 4.1234, + "step": 36024 + }, + { + "epoch": 0.2142508801979256, + "grad_norm": 2.5124104022979736, + "learning_rate": 4.454778391504454e-05, + "loss": 4.0255, + "step": 36025 + }, + { + "epoch": 0.21425682748120659, + "grad_norm": 2.565599203109741, + "learning_rate": 4.4547492727248826e-05, + "loss": 4.0547, + "step": 36026 + }, + { + "epoch": 0.21426277476448757, + "grad_norm": 2.376383066177368, + "learning_rate": 4.454720153262928e-05, + "loss": 4.0174, + "step": 36027 + }, + { + "epoch": 0.2142687220477686, + "grad_norm": 2.1378703117370605, + "learning_rate": 4.454691033118601e-05, + "loss": 3.9006, + "step": 36028 + }, + { + "epoch": 0.21427466933104958, + "grad_norm": 2.2482197284698486, + "learning_rate": 4.454661912291911e-05, + "loss": 3.9213, + "step": 36029 + }, + { + "epoch": 0.21428061661433057, + "grad_norm": 2.2354705333709717, + "learning_rate": 4.454632790782869e-05, + "loss": 3.9075, + "step": 36030 + }, + { + "epoch": 0.21428656389761158, + "grad_norm": 2.235800266265869, + "learning_rate": 4.454603668591485e-05, + "loss": 3.8096, + "step": 36031 + }, + { + "epoch": 0.21429251118089257, + "grad_norm": 2.11818528175354, + "learning_rate": 4.454574545717769e-05, + "loss": 4.4192, + "step": 36032 + }, + { + "epoch": 0.21429845846417356, + "grad_norm": 1.4688904285430908, + "learning_rate": 4.454545422161731e-05, + "loss": 5.1884, + "step": 36033 + }, + { + "epoch": 0.21430440574745457, + "grad_norm": 1.7121012210845947, + "learning_rate": 4.4545162979233815e-05, + "loss": 4.9336, + "step": 36034 + }, + { + "epoch": 0.21431035303073556, + "grad_norm": 1.6486331224441528, + "learning_rate": 4.454487173002731e-05, + "loss": 5.097, + "step": 36035 + }, + { + "epoch": 0.21431630031401655, + "grad_norm": 1.6203352212905884, + "learning_rate": 4.454458047399789e-05, + "loss": 4.97, + "step": 36036 + }, + { + "epoch": 0.21432224759729757, + "grad_norm": 2.226794719696045, + "learning_rate": 4.454428921114565e-05, + "loss": 3.9059, + "step": 36037 + }, + { + "epoch": 0.21432819488057855, + "grad_norm": 2.4930450916290283, + "learning_rate": 4.4543997941470715e-05, + "loss": 2.5628, + "step": 36038 + }, + { + "epoch": 0.21433414216385954, + "grad_norm": 2.5009028911590576, + "learning_rate": 4.4543706664973164e-05, + "loss": 3.21, + "step": 36039 + }, + { + "epoch": 0.21434008944714056, + "grad_norm": 2.357839822769165, + "learning_rate": 4.454341538165311e-05, + "loss": 2.7942, + "step": 36040 + }, + { + "epoch": 0.21434603673042155, + "grad_norm": 2.6789627075195312, + "learning_rate": 4.4543124091510644e-05, + "loss": 2.74, + "step": 36041 + }, + { + "epoch": 0.21435198401370253, + "grad_norm": 2.7236430644989014, + "learning_rate": 4.4542832794545884e-05, + "loss": 2.952, + "step": 36042 + }, + { + "epoch": 0.21435793129698355, + "grad_norm": 2.7634379863739014, + "learning_rate": 4.4542541490758924e-05, + "loss": 2.9004, + "step": 36043 + }, + { + "epoch": 0.21436387858026454, + "grad_norm": 2.463062047958374, + "learning_rate": 4.454225018014986e-05, + "loss": 2.7224, + "step": 36044 + }, + { + "epoch": 0.21436982586354553, + "grad_norm": 2.313150405883789, + "learning_rate": 4.45419588627188e-05, + "loss": 2.6462, + "step": 36045 + }, + { + "epoch": 0.21437577314682654, + "grad_norm": 2.3792331218719482, + "learning_rate": 4.4541667538465835e-05, + "loss": 2.682, + "step": 36046 + }, + { + "epoch": 0.21438172043010753, + "grad_norm": 2.3990492820739746, + "learning_rate": 4.454137620739109e-05, + "loss": 2.8759, + "step": 36047 + }, + { + "epoch": 0.21438766771338852, + "grad_norm": 2.345261335372925, + "learning_rate": 4.4541084869494644e-05, + "loss": 3.1526, + "step": 36048 + }, + { + "epoch": 0.21439361499666953, + "grad_norm": 2.0746614933013916, + "learning_rate": 4.454079352477661e-05, + "loss": 5.2981, + "step": 36049 + }, + { + "epoch": 0.21439956227995052, + "grad_norm": 1.9888861179351807, + "learning_rate": 4.4540502173237086e-05, + "loss": 5.1718, + "step": 36050 + }, + { + "epoch": 0.2144055095632315, + "grad_norm": 1.7000229358673096, + "learning_rate": 4.454021081487618e-05, + "loss": 5.0606, + "step": 36051 + }, + { + "epoch": 0.2144114568465125, + "grad_norm": 1.5641193389892578, + "learning_rate": 4.453991944969398e-05, + "loss": 5.4505, + "step": 36052 + }, + { + "epoch": 0.21441740412979352, + "grad_norm": 1.5807493925094604, + "learning_rate": 4.45396280776906e-05, + "loss": 5.0804, + "step": 36053 + }, + { + "epoch": 0.2144233514130745, + "grad_norm": 1.5401573181152344, + "learning_rate": 4.4539336698866143e-05, + "loss": 5.2455, + "step": 36054 + }, + { + "epoch": 0.2144292986963555, + "grad_norm": 1.6141964197158813, + "learning_rate": 4.453904531322069e-05, + "loss": 4.8019, + "step": 36055 + }, + { + "epoch": 0.2144352459796365, + "grad_norm": 1.8339582681655884, + "learning_rate": 4.453875392075437e-05, + "loss": 4.8166, + "step": 36056 + }, + { + "epoch": 0.2144411932629175, + "grad_norm": 1.4061498641967773, + "learning_rate": 4.453846252146727e-05, + "loss": 5.1891, + "step": 36057 + }, + { + "epoch": 0.21444714054619848, + "grad_norm": 1.4515758752822876, + "learning_rate": 4.45381711153595e-05, + "loss": 5.1881, + "step": 36058 + }, + { + "epoch": 0.2144530878294795, + "grad_norm": 1.8617783784866333, + "learning_rate": 4.453787970243115e-05, + "loss": 4.2562, + "step": 36059 + }, + { + "epoch": 0.2144590351127605, + "grad_norm": 1.7045130729675293, + "learning_rate": 4.453758828268233e-05, + "loss": 4.7226, + "step": 36060 + }, + { + "epoch": 0.21446498239604148, + "grad_norm": 1.6791839599609375, + "learning_rate": 4.4537296856113134e-05, + "loss": 5.6172, + "step": 36061 + }, + { + "epoch": 0.2144709296793225, + "grad_norm": 1.605233073234558, + "learning_rate": 4.4537005422723676e-05, + "loss": 5.4356, + "step": 36062 + }, + { + "epoch": 0.21447687696260348, + "grad_norm": 1.3953781127929688, + "learning_rate": 4.453671398251406e-05, + "loss": 5.3499, + "step": 36063 + }, + { + "epoch": 0.21448282424588447, + "grad_norm": 1.6507956981658936, + "learning_rate": 4.453642253548436e-05, + "loss": 5.4016, + "step": 36064 + }, + { + "epoch": 0.21448877152916548, + "grad_norm": 1.700950264930725, + "learning_rate": 4.4536131081634705e-05, + "loss": 5.0299, + "step": 36065 + }, + { + "epoch": 0.21449471881244647, + "grad_norm": 1.6992217302322388, + "learning_rate": 4.4535839620965195e-05, + "loss": 5.1387, + "step": 36066 + }, + { + "epoch": 0.21450066609572746, + "grad_norm": 1.5594868659973145, + "learning_rate": 4.453554815347592e-05, + "loss": 5.0754, + "step": 36067 + }, + { + "epoch": 0.21450661337900848, + "grad_norm": 1.28568434715271, + "learning_rate": 4.453525667916698e-05, + "loss": 5.1151, + "step": 36068 + }, + { + "epoch": 0.21451256066228946, + "grad_norm": 1.5007739067077637, + "learning_rate": 4.453496519803849e-05, + "loss": 5.1162, + "step": 36069 + }, + { + "epoch": 0.21451850794557045, + "grad_norm": 1.3101109266281128, + "learning_rate": 4.453467371009055e-05, + "loss": 5.0668, + "step": 36070 + }, + { + "epoch": 0.21452445522885147, + "grad_norm": 1.4987660646438599, + "learning_rate": 4.453438221532325e-05, + "loss": 5.0398, + "step": 36071 + }, + { + "epoch": 0.21453040251213246, + "grad_norm": 1.7213561534881592, + "learning_rate": 4.45340907137367e-05, + "loss": 5.0728, + "step": 36072 + }, + { + "epoch": 0.21453634979541344, + "grad_norm": 1.9010142087936401, + "learning_rate": 4.4533799205331006e-05, + "loss": 4.9607, + "step": 36073 + }, + { + "epoch": 0.21454229707869446, + "grad_norm": 1.9126904010772705, + "learning_rate": 4.453350769010626e-05, + "loss": 4.6572, + "step": 36074 + }, + { + "epoch": 0.21454824436197545, + "grad_norm": 2.3189423084259033, + "learning_rate": 4.453321616806257e-05, + "loss": 4.2961, + "step": 36075 + }, + { + "epoch": 0.21455419164525644, + "grad_norm": 2.047769784927368, + "learning_rate": 4.453292463920004e-05, + "loss": 5.2367, + "step": 36076 + }, + { + "epoch": 0.21456013892853745, + "grad_norm": 1.8531605005264282, + "learning_rate": 4.453263310351876e-05, + "loss": 5.0521, + "step": 36077 + }, + { + "epoch": 0.21456608621181844, + "grad_norm": 1.3894751071929932, + "learning_rate": 4.453234156101884e-05, + "loss": 5.238, + "step": 36078 + }, + { + "epoch": 0.21457203349509943, + "grad_norm": 1.6113433837890625, + "learning_rate": 4.453205001170039e-05, + "loss": 4.9134, + "step": 36079 + }, + { + "epoch": 0.21457798077838044, + "grad_norm": 1.8081282377243042, + "learning_rate": 4.4531758455563495e-05, + "loss": 4.9779, + "step": 36080 + }, + { + "epoch": 0.21458392806166143, + "grad_norm": 2.274998188018799, + "learning_rate": 4.4531466892608266e-05, + "loss": 4.3828, + "step": 36081 + }, + { + "epoch": 0.21458987534494242, + "grad_norm": 1.9097249507904053, + "learning_rate": 4.453117532283481e-05, + "loss": 4.5971, + "step": 36082 + }, + { + "epoch": 0.21459582262822344, + "grad_norm": 2.342449903488159, + "learning_rate": 4.4530883746243214e-05, + "loss": 4.5699, + "step": 36083 + }, + { + "epoch": 0.21460176991150443, + "grad_norm": 1.9449174404144287, + "learning_rate": 4.453059216283358e-05, + "loss": 4.0389, + "step": 36084 + }, + { + "epoch": 0.2146077171947854, + "grad_norm": 1.6288878917694092, + "learning_rate": 4.453030057260604e-05, + "loss": 5.2517, + "step": 36085 + }, + { + "epoch": 0.21461366447806643, + "grad_norm": 1.4354645013809204, + "learning_rate": 4.453000897556066e-05, + "loss": 5.1874, + "step": 36086 + }, + { + "epoch": 0.21461961176134742, + "grad_norm": 1.6599136590957642, + "learning_rate": 4.452971737169756e-05, + "loss": 5.1117, + "step": 36087 + }, + { + "epoch": 0.2146255590446284, + "grad_norm": 1.649203896522522, + "learning_rate": 4.4529425761016835e-05, + "loss": 5.2197, + "step": 36088 + }, + { + "epoch": 0.21463150632790942, + "grad_norm": 2.038905143737793, + "learning_rate": 4.452913414351859e-05, + "loss": 5.0925, + "step": 36089 + }, + { + "epoch": 0.2146374536111904, + "grad_norm": 2.200108289718628, + "learning_rate": 4.452884251920293e-05, + "loss": 5.0225, + "step": 36090 + }, + { + "epoch": 0.2146434008944714, + "grad_norm": 1.9698771238327026, + "learning_rate": 4.452855088806995e-05, + "loss": 4.7777, + "step": 36091 + }, + { + "epoch": 0.2146493481777524, + "grad_norm": 1.687897801399231, + "learning_rate": 4.4528259250119756e-05, + "loss": 5.0942, + "step": 36092 + }, + { + "epoch": 0.2146552954610334, + "grad_norm": 1.311324954032898, + "learning_rate": 4.4527967605352446e-05, + "loss": 5.2022, + "step": 36093 + }, + { + "epoch": 0.2146612427443144, + "grad_norm": 1.464908480644226, + "learning_rate": 4.452767595376812e-05, + "loss": 5.1902, + "step": 36094 + }, + { + "epoch": 0.2146671900275954, + "grad_norm": 1.609305500984192, + "learning_rate": 4.4527384295366893e-05, + "loss": 4.7921, + "step": 36095 + }, + { + "epoch": 0.2146731373108764, + "grad_norm": 1.8921763896942139, + "learning_rate": 4.4527092630148854e-05, + "loss": 4.1943, + "step": 36096 + }, + { + "epoch": 0.21467908459415738, + "grad_norm": 1.436725378036499, + "learning_rate": 4.452680095811411e-05, + "loss": 4.7097, + "step": 36097 + }, + { + "epoch": 0.2146850318774384, + "grad_norm": 1.8407703638076782, + "learning_rate": 4.4526509279262764e-05, + "loss": 4.3972, + "step": 36098 + }, + { + "epoch": 0.2146909791607194, + "grad_norm": 1.5586193799972534, + "learning_rate": 4.45262175935949e-05, + "loss": 4.9978, + "step": 36099 + }, + { + "epoch": 0.21469692644400037, + "grad_norm": 1.5589431524276733, + "learning_rate": 4.4525925901110656e-05, + "loss": 5.113, + "step": 36100 + }, + { + "epoch": 0.2147028737272814, + "grad_norm": 1.633216142654419, + "learning_rate": 4.45256342018101e-05, + "loss": 4.0823, + "step": 36101 + }, + { + "epoch": 0.21470882101056238, + "grad_norm": 1.530907392501831, + "learning_rate": 4.452534249569335e-05, + "loss": 4.3349, + "step": 36102 + }, + { + "epoch": 0.21471476829384337, + "grad_norm": 1.7243280410766602, + "learning_rate": 4.45250507827605e-05, + "loss": 5.4634, + "step": 36103 + }, + { + "epoch": 0.21472071557712438, + "grad_norm": 1.7470494508743286, + "learning_rate": 4.452475906301167e-05, + "loss": 5.3959, + "step": 36104 + }, + { + "epoch": 0.21472666286040537, + "grad_norm": 1.5940369367599487, + "learning_rate": 4.452446733644694e-05, + "loss": 5.3744, + "step": 36105 + }, + { + "epoch": 0.21473261014368636, + "grad_norm": 1.5966911315917969, + "learning_rate": 4.452417560306642e-05, + "loss": 5.3308, + "step": 36106 + }, + { + "epoch": 0.21473855742696737, + "grad_norm": 1.665995478630066, + "learning_rate": 4.452388386287021e-05, + "loss": 5.5209, + "step": 36107 + }, + { + "epoch": 0.21474450471024836, + "grad_norm": 1.476784110069275, + "learning_rate": 4.452359211585841e-05, + "loss": 5.3837, + "step": 36108 + }, + { + "epoch": 0.21475045199352935, + "grad_norm": 1.5506277084350586, + "learning_rate": 4.452330036203114e-05, + "loss": 5.4229, + "step": 36109 + }, + { + "epoch": 0.21475639927681037, + "grad_norm": 1.3492239713668823, + "learning_rate": 4.4523008601388475e-05, + "loss": 5.4111, + "step": 36110 + }, + { + "epoch": 0.21476234656009136, + "grad_norm": 1.4536131620407104, + "learning_rate": 4.452271683393053e-05, + "loss": 5.4376, + "step": 36111 + }, + { + "epoch": 0.21476829384337234, + "grad_norm": 1.5479294061660767, + "learning_rate": 4.452242505965741e-05, + "loss": 4.9195, + "step": 36112 + }, + { + "epoch": 0.21477424112665333, + "grad_norm": 1.7930655479431152, + "learning_rate": 4.452213327856922e-05, + "loss": 4.9069, + "step": 36113 + }, + { + "epoch": 0.21478018840993435, + "grad_norm": 2.004514455795288, + "learning_rate": 4.452184149066605e-05, + "loss": 4.7419, + "step": 36114 + }, + { + "epoch": 0.21478613569321534, + "grad_norm": 1.8205454349517822, + "learning_rate": 4.4521549695948004e-05, + "loss": 5.0988, + "step": 36115 + }, + { + "epoch": 0.21479208297649632, + "grad_norm": 1.8684512376785278, + "learning_rate": 4.4521257894415183e-05, + "loss": 5.1889, + "step": 36116 + }, + { + "epoch": 0.21479803025977734, + "grad_norm": 1.5708959102630615, + "learning_rate": 4.45209660860677e-05, + "loss": 5.1908, + "step": 36117 + }, + { + "epoch": 0.21480397754305833, + "grad_norm": 1.7478984594345093, + "learning_rate": 4.452067427090565e-05, + "loss": 5.4033, + "step": 36118 + }, + { + "epoch": 0.21480992482633932, + "grad_norm": 1.8025048971176147, + "learning_rate": 4.452038244892913e-05, + "loss": 5.4858, + "step": 36119 + }, + { + "epoch": 0.21481587210962033, + "grad_norm": 1.5002171993255615, + "learning_rate": 4.4520090620138245e-05, + "loss": 5.6049, + "step": 36120 + }, + { + "epoch": 0.21482181939290132, + "grad_norm": 1.793312668800354, + "learning_rate": 4.45197987845331e-05, + "loss": 5.0971, + "step": 36121 + }, + { + "epoch": 0.2148277666761823, + "grad_norm": 1.6286466121673584, + "learning_rate": 4.45195069421138e-05, + "loss": 5.0552, + "step": 36122 + }, + { + "epoch": 0.21483371395946332, + "grad_norm": 2.28002667427063, + "learning_rate": 4.451921509288043e-05, + "loss": 4.136, + "step": 36123 + }, + { + "epoch": 0.2148396612427443, + "grad_norm": 1.5719590187072754, + "learning_rate": 4.451892323683311e-05, + "loss": 5.3065, + "step": 36124 + }, + { + "epoch": 0.2148456085260253, + "grad_norm": 1.5892250537872314, + "learning_rate": 4.451863137397193e-05, + "loss": 5.4593, + "step": 36125 + }, + { + "epoch": 0.21485155580930632, + "grad_norm": 1.6752700805664062, + "learning_rate": 4.4518339504297013e-05, + "loss": 5.2985, + "step": 36126 + }, + { + "epoch": 0.2148575030925873, + "grad_norm": 1.8093560934066772, + "learning_rate": 4.451804762780843e-05, + "loss": 4.9665, + "step": 36127 + }, + { + "epoch": 0.2148634503758683, + "grad_norm": 1.5342146158218384, + "learning_rate": 4.4517755744506303e-05, + "loss": 4.9299, + "step": 36128 + }, + { + "epoch": 0.2148693976591493, + "grad_norm": 1.5814716815948486, + "learning_rate": 4.451746385439074e-05, + "loss": 5.0069, + "step": 36129 + }, + { + "epoch": 0.2148753449424303, + "grad_norm": 1.4868812561035156, + "learning_rate": 4.4517171957461814e-05, + "loss": 4.9328, + "step": 36130 + }, + { + "epoch": 0.21488129222571128, + "grad_norm": 1.4403107166290283, + "learning_rate": 4.4516880053719655e-05, + "loss": 5.1021, + "step": 36131 + }, + { + "epoch": 0.2148872395089923, + "grad_norm": 2.2056379318237305, + "learning_rate": 4.451658814316435e-05, + "loss": 4.652, + "step": 36132 + }, + { + "epoch": 0.2148931867922733, + "grad_norm": 1.6643704175949097, + "learning_rate": 4.451629622579601e-05, + "loss": 5.6042, + "step": 36133 + }, + { + "epoch": 0.21489913407555428, + "grad_norm": 1.6085230112075806, + "learning_rate": 4.4516004301614734e-05, + "loss": 5.2156, + "step": 36134 + }, + { + "epoch": 0.2149050813588353, + "grad_norm": 1.741129755973816, + "learning_rate": 4.451571237062062e-05, + "loss": 4.7964, + "step": 36135 + }, + { + "epoch": 0.21491102864211628, + "grad_norm": 1.5676339864730835, + "learning_rate": 4.451542043281377e-05, + "loss": 5.1581, + "step": 36136 + }, + { + "epoch": 0.21491697592539727, + "grad_norm": 1.5741878747940063, + "learning_rate": 4.451512848819429e-05, + "loss": 5.1789, + "step": 36137 + }, + { + "epoch": 0.21492292320867828, + "grad_norm": 1.6025103330612183, + "learning_rate": 4.4514836536762286e-05, + "loss": 4.9795, + "step": 36138 + }, + { + "epoch": 0.21492887049195927, + "grad_norm": 1.680410385131836, + "learning_rate": 4.451454457851785e-05, + "loss": 5.0603, + "step": 36139 + }, + { + "epoch": 0.21493481777524026, + "grad_norm": 2.5844266414642334, + "learning_rate": 4.451425261346108e-05, + "loss": 3.9414, + "step": 36140 + }, + { + "epoch": 0.21494076505852128, + "grad_norm": 1.7749565839767456, + "learning_rate": 4.45139606415921e-05, + "loss": 4.5331, + "step": 36141 + }, + { + "epoch": 0.21494671234180227, + "grad_norm": 1.769710898399353, + "learning_rate": 4.451366866291098e-05, + "loss": 4.7675, + "step": 36142 + }, + { + "epoch": 0.21495265962508325, + "grad_norm": 1.5556137561798096, + "learning_rate": 4.451337667741785e-05, + "loss": 4.9267, + "step": 36143 + }, + { + "epoch": 0.21495860690836427, + "grad_norm": 1.4366059303283691, + "learning_rate": 4.45130846851128e-05, + "loss": 5.5483, + "step": 36144 + }, + { + "epoch": 0.21496455419164526, + "grad_norm": 1.356587529182434, + "learning_rate": 4.451279268599594e-05, + "loss": 5.5553, + "step": 36145 + }, + { + "epoch": 0.21497050147492625, + "grad_norm": 1.7132307291030884, + "learning_rate": 4.451250068006736e-05, + "loss": 4.4766, + "step": 36146 + }, + { + "epoch": 0.21497644875820726, + "grad_norm": 1.420796513557434, + "learning_rate": 4.4512208667327175e-05, + "loss": 5.422, + "step": 36147 + }, + { + "epoch": 0.21498239604148825, + "grad_norm": 1.9435526132583618, + "learning_rate": 4.4511916647775474e-05, + "loss": 4.3675, + "step": 36148 + }, + { + "epoch": 0.21498834332476924, + "grad_norm": 2.1586434841156006, + "learning_rate": 4.451162462141236e-05, + "loss": 4.3742, + "step": 36149 + }, + { + "epoch": 0.21499429060805025, + "grad_norm": 1.7578691244125366, + "learning_rate": 4.451133258823795e-05, + "loss": 4.7244, + "step": 36150 + }, + { + "epoch": 0.21500023789133124, + "grad_norm": 1.5594449043273926, + "learning_rate": 4.4511040548252325e-05, + "loss": 4.8837, + "step": 36151 + }, + { + "epoch": 0.21500618517461223, + "grad_norm": 1.5725610256195068, + "learning_rate": 4.45107485014556e-05, + "loss": 4.272, + "step": 36152 + }, + { + "epoch": 0.21501213245789325, + "grad_norm": 1.5220437049865723, + "learning_rate": 4.451045644784788e-05, + "loss": 4.3584, + "step": 36153 + }, + { + "epoch": 0.21501807974117423, + "grad_norm": 1.3363945484161377, + "learning_rate": 4.451016438742925e-05, + "loss": 5.0875, + "step": 36154 + }, + { + "epoch": 0.21502402702445522, + "grad_norm": 1.5395842790603638, + "learning_rate": 4.450987232019984e-05, + "loss": 4.9718, + "step": 36155 + }, + { + "epoch": 0.21502997430773624, + "grad_norm": 1.7091704607009888, + "learning_rate": 4.450958024615972e-05, + "loss": 4.882, + "step": 36156 + }, + { + "epoch": 0.21503592159101723, + "grad_norm": 2.3344812393188477, + "learning_rate": 4.4509288165309015e-05, + "loss": 4.2766, + "step": 36157 + }, + { + "epoch": 0.21504186887429821, + "grad_norm": 1.9190376996994019, + "learning_rate": 4.450899607764782e-05, + "loss": 4.2321, + "step": 36158 + }, + { + "epoch": 0.21504781615757923, + "grad_norm": 1.8463904857635498, + "learning_rate": 4.450870398317623e-05, + "loss": 4.4709, + "step": 36159 + }, + { + "epoch": 0.21505376344086022, + "grad_norm": 1.4867302179336548, + "learning_rate": 4.4508411881894356e-05, + "loss": 4.5178, + "step": 36160 + }, + { + "epoch": 0.2150597107241412, + "grad_norm": 1.4961345195770264, + "learning_rate": 4.45081197738023e-05, + "loss": 4.6105, + "step": 36161 + }, + { + "epoch": 0.21506565800742222, + "grad_norm": 1.3281563520431519, + "learning_rate": 4.450782765890016e-05, + "loss": 4.6637, + "step": 36162 + }, + { + "epoch": 0.2150716052907032, + "grad_norm": 1.4934107065200806, + "learning_rate": 4.450753553718803e-05, + "loss": 4.4112, + "step": 36163 + }, + { + "epoch": 0.2150775525739842, + "grad_norm": 1.3608429431915283, + "learning_rate": 4.450724340866603e-05, + "loss": 4.8015, + "step": 36164 + }, + { + "epoch": 0.21508349985726521, + "grad_norm": 1.5349289178848267, + "learning_rate": 4.450695127333425e-05, + "loss": 4.7622, + "step": 36165 + }, + { + "epoch": 0.2150894471405462, + "grad_norm": 1.5071897506713867, + "learning_rate": 4.450665913119279e-05, + "loss": 4.4918, + "step": 36166 + }, + { + "epoch": 0.2150953944238272, + "grad_norm": 1.6646260023117065, + "learning_rate": 4.4506366982241766e-05, + "loss": 4.625, + "step": 36167 + }, + { + "epoch": 0.2151013417071082, + "grad_norm": 1.5301088094711304, + "learning_rate": 4.450607482648127e-05, + "loss": 4.4959, + "step": 36168 + }, + { + "epoch": 0.2151072889903892, + "grad_norm": 1.5578876733779907, + "learning_rate": 4.45057826639114e-05, + "loss": 4.5457, + "step": 36169 + }, + { + "epoch": 0.21511323627367018, + "grad_norm": 1.5869579315185547, + "learning_rate": 4.450549049453227e-05, + "loss": 4.6881, + "step": 36170 + }, + { + "epoch": 0.21511918355695117, + "grad_norm": 1.5819709300994873, + "learning_rate": 4.450519831834397e-05, + "loss": 4.5569, + "step": 36171 + }, + { + "epoch": 0.2151251308402322, + "grad_norm": 1.6285146474838257, + "learning_rate": 4.45049061353466e-05, + "loss": 4.2395, + "step": 36172 + }, + { + "epoch": 0.21513107812351318, + "grad_norm": 1.5318942070007324, + "learning_rate": 4.450461394554027e-05, + "loss": 4.314, + "step": 36173 + }, + { + "epoch": 0.21513702540679416, + "grad_norm": 1.5344517230987549, + "learning_rate": 4.450432174892509e-05, + "loss": 4.2087, + "step": 36174 + }, + { + "epoch": 0.21514297269007518, + "grad_norm": 1.4494154453277588, + "learning_rate": 4.450402954550115e-05, + "loss": 4.3346, + "step": 36175 + }, + { + "epoch": 0.21514891997335617, + "grad_norm": 1.575466513633728, + "learning_rate": 4.450373733526855e-05, + "loss": 4.6291, + "step": 36176 + }, + { + "epoch": 0.21515486725663716, + "grad_norm": 1.6240023374557495, + "learning_rate": 4.4503445118227396e-05, + "loss": 4.6026, + "step": 36177 + }, + { + "epoch": 0.21516081453991817, + "grad_norm": 1.355594515800476, + "learning_rate": 4.45031528943778e-05, + "loss": 4.2619, + "step": 36178 + }, + { + "epoch": 0.21516676182319916, + "grad_norm": 1.5286892652511597, + "learning_rate": 4.4502860663719846e-05, + "loss": 4.2924, + "step": 36179 + }, + { + "epoch": 0.21517270910648015, + "grad_norm": 1.399736762046814, + "learning_rate": 4.4502568426253646e-05, + "loss": 4.3781, + "step": 36180 + }, + { + "epoch": 0.21517865638976116, + "grad_norm": 1.585708498954773, + "learning_rate": 4.4502276181979306e-05, + "loss": 5.5202, + "step": 36181 + }, + { + "epoch": 0.21518460367304215, + "grad_norm": 1.4541770219802856, + "learning_rate": 4.4501983930896916e-05, + "loss": 5.2694, + "step": 36182 + }, + { + "epoch": 0.21519055095632314, + "grad_norm": 1.3780940771102905, + "learning_rate": 4.4501691673006596e-05, + "loss": 5.1998, + "step": 36183 + }, + { + "epoch": 0.21519649823960416, + "grad_norm": 1.5186759233474731, + "learning_rate": 4.450139940830843e-05, + "loss": 4.7699, + "step": 36184 + }, + { + "epoch": 0.21520244552288514, + "grad_norm": 1.7143422365188599, + "learning_rate": 4.450110713680252e-05, + "loss": 5.135, + "step": 36185 + }, + { + "epoch": 0.21520839280616613, + "grad_norm": 1.5604811906814575, + "learning_rate": 4.4500814858488984e-05, + "loss": 5.2164, + "step": 36186 + }, + { + "epoch": 0.21521434008944715, + "grad_norm": 1.1931864023208618, + "learning_rate": 4.450052257336792e-05, + "loss": 5.1427, + "step": 36187 + }, + { + "epoch": 0.21522028737272814, + "grad_norm": 1.7609212398529053, + "learning_rate": 4.4500230281439416e-05, + "loss": 4.6423, + "step": 36188 + }, + { + "epoch": 0.21522623465600912, + "grad_norm": 1.8664861917495728, + "learning_rate": 4.4499937982703575e-05, + "loss": 4.6855, + "step": 36189 + }, + { + "epoch": 0.21523218193929014, + "grad_norm": 1.6264115571975708, + "learning_rate": 4.449964567716052e-05, + "loss": 4.7096, + "step": 36190 + }, + { + "epoch": 0.21523812922257113, + "grad_norm": 1.592661738395691, + "learning_rate": 4.4499353364810335e-05, + "loss": 4.612, + "step": 36191 + }, + { + "epoch": 0.21524407650585212, + "grad_norm": 1.600019097328186, + "learning_rate": 4.449906104565313e-05, + "loss": 4.835, + "step": 36192 + }, + { + "epoch": 0.21525002378913313, + "grad_norm": 1.672973394393921, + "learning_rate": 4.449876871968901e-05, + "loss": 4.8086, + "step": 36193 + }, + { + "epoch": 0.21525597107241412, + "grad_norm": 1.5232312679290771, + "learning_rate": 4.4498476386918066e-05, + "loss": 5.0278, + "step": 36194 + }, + { + "epoch": 0.2152619183556951, + "grad_norm": 1.459900975227356, + "learning_rate": 4.4498184047340404e-05, + "loss": 4.8873, + "step": 36195 + }, + { + "epoch": 0.21526786563897612, + "grad_norm": 1.6688652038574219, + "learning_rate": 4.449789170095612e-05, + "loss": 5.1418, + "step": 36196 + }, + { + "epoch": 0.2152738129222571, + "grad_norm": 1.9095449447631836, + "learning_rate": 4.449759934776533e-05, + "loss": 4.1144, + "step": 36197 + }, + { + "epoch": 0.2152797602055381, + "grad_norm": 1.5863621234893799, + "learning_rate": 4.449730698776814e-05, + "loss": 4.6239, + "step": 36198 + }, + { + "epoch": 0.21528570748881912, + "grad_norm": 1.4681096076965332, + "learning_rate": 4.449701462096463e-05, + "loss": 4.9615, + "step": 36199 + }, + { + "epoch": 0.2152916547721001, + "grad_norm": 1.204664707183838, + "learning_rate": 4.4496722247354916e-05, + "loss": 5.1881, + "step": 36200 + }, + { + "epoch": 0.2152976020553811, + "grad_norm": 1.3521478176116943, + "learning_rate": 4.449642986693909e-05, + "loss": 4.891, + "step": 36201 + }, + { + "epoch": 0.2153035493386621, + "grad_norm": 1.3406485319137573, + "learning_rate": 4.4496137479717276e-05, + "loss": 4.7794, + "step": 36202 + }, + { + "epoch": 0.2153094966219431, + "grad_norm": 1.1206368207931519, + "learning_rate": 4.449584508568955e-05, + "loss": 4.4378, + "step": 36203 + }, + { + "epoch": 0.21531544390522409, + "grad_norm": 1.1885775327682495, + "learning_rate": 4.449555268485603e-05, + "loss": 4.6944, + "step": 36204 + }, + { + "epoch": 0.2153213911885051, + "grad_norm": 1.4947532415390015, + "learning_rate": 4.449526027721682e-05, + "loss": 4.3875, + "step": 36205 + }, + { + "epoch": 0.2153273384717861, + "grad_norm": 1.6429933309555054, + "learning_rate": 4.449496786277201e-05, + "loss": 4.1325, + "step": 36206 + }, + { + "epoch": 0.21533328575506708, + "grad_norm": 1.6001614332199097, + "learning_rate": 4.449467544152171e-05, + "loss": 4.9123, + "step": 36207 + }, + { + "epoch": 0.2153392330383481, + "grad_norm": 2.3580222129821777, + "learning_rate": 4.449438301346602e-05, + "loss": 4.5931, + "step": 36208 + }, + { + "epoch": 0.21534518032162908, + "grad_norm": 1.3925138711929321, + "learning_rate": 4.449409057860504e-05, + "loss": 5.0114, + "step": 36209 + }, + { + "epoch": 0.21535112760491007, + "grad_norm": 1.1549257040023804, + "learning_rate": 4.449379813693888e-05, + "loss": 4.8184, + "step": 36210 + }, + { + "epoch": 0.21535707488819109, + "grad_norm": 1.3624850511550903, + "learning_rate": 4.449350568846763e-05, + "loss": 4.6685, + "step": 36211 + }, + { + "epoch": 0.21536302217147207, + "grad_norm": 1.8626717329025269, + "learning_rate": 4.44932132331914e-05, + "loss": 4.2869, + "step": 36212 + }, + { + "epoch": 0.21536896945475306, + "grad_norm": 2.0599145889282227, + "learning_rate": 4.449292077111029e-05, + "loss": 4.0511, + "step": 36213 + }, + { + "epoch": 0.21537491673803408, + "grad_norm": 1.567862629890442, + "learning_rate": 4.44926283022244e-05, + "loss": 4.8801, + "step": 36214 + }, + { + "epoch": 0.21538086402131507, + "grad_norm": 1.5523948669433594, + "learning_rate": 4.4492335826533845e-05, + "loss": 4.7761, + "step": 36215 + }, + { + "epoch": 0.21538681130459605, + "grad_norm": 1.818768858909607, + "learning_rate": 4.449204334403871e-05, + "loss": 4.3282, + "step": 36216 + }, + { + "epoch": 0.21539275858787707, + "grad_norm": 1.7761987447738647, + "learning_rate": 4.449175085473911e-05, + "loss": 4.636, + "step": 36217 + }, + { + "epoch": 0.21539870587115806, + "grad_norm": 1.20720636844635, + "learning_rate": 4.449145835863513e-05, + "loss": 4.7549, + "step": 36218 + }, + { + "epoch": 0.21540465315443905, + "grad_norm": 1.3905788660049438, + "learning_rate": 4.449116585572689e-05, + "loss": 4.8965, + "step": 36219 + }, + { + "epoch": 0.21541060043772006, + "grad_norm": 1.7040412425994873, + "learning_rate": 4.449087334601448e-05, + "loss": 4.8117, + "step": 36220 + }, + { + "epoch": 0.21541654772100105, + "grad_norm": 1.7664754390716553, + "learning_rate": 4.449058082949802e-05, + "loss": 5.2555, + "step": 36221 + }, + { + "epoch": 0.21542249500428204, + "grad_norm": 1.8236404657363892, + "learning_rate": 4.449028830617759e-05, + "loss": 5.212, + "step": 36222 + }, + { + "epoch": 0.21542844228756305, + "grad_norm": 1.6441102027893066, + "learning_rate": 4.44899957760533e-05, + "loss": 5.3216, + "step": 36223 + }, + { + "epoch": 0.21543438957084404, + "grad_norm": 1.6687595844268799, + "learning_rate": 4.4489703239125265e-05, + "loss": 4.9983, + "step": 36224 + }, + { + "epoch": 0.21544033685412503, + "grad_norm": 1.7582825422286987, + "learning_rate": 4.448941069539356e-05, + "loss": 4.4715, + "step": 36225 + }, + { + "epoch": 0.21544628413740605, + "grad_norm": 1.653625726699829, + "learning_rate": 4.4489118144858324e-05, + "loss": 4.5712, + "step": 36226 + }, + { + "epoch": 0.21545223142068703, + "grad_norm": 1.70182466506958, + "learning_rate": 4.448882558751962e-05, + "loss": 4.4521, + "step": 36227 + }, + { + "epoch": 0.21545817870396802, + "grad_norm": 1.3696340322494507, + "learning_rate": 4.448853302337758e-05, + "loss": 5.0353, + "step": 36228 + }, + { + "epoch": 0.215464125987249, + "grad_norm": 1.3940473794937134, + "learning_rate": 4.448824045243228e-05, + "loss": 5.0052, + "step": 36229 + }, + { + "epoch": 0.21547007327053003, + "grad_norm": 1.5669533014297485, + "learning_rate": 4.448794787468384e-05, + "loss": 4.8972, + "step": 36230 + }, + { + "epoch": 0.21547602055381102, + "grad_norm": 1.5695853233337402, + "learning_rate": 4.448765529013237e-05, + "loss": 4.9789, + "step": 36231 + }, + { + "epoch": 0.215481967837092, + "grad_norm": 1.5337707996368408, + "learning_rate": 4.4487362698777956e-05, + "loss": 4.5533, + "step": 36232 + }, + { + "epoch": 0.21548791512037302, + "grad_norm": 1.8555629253387451, + "learning_rate": 4.4487070100620705e-05, + "loss": 4.2116, + "step": 36233 + }, + { + "epoch": 0.215493862403654, + "grad_norm": 1.8653151988983154, + "learning_rate": 4.4486777495660715e-05, + "loss": 4.6188, + "step": 36234 + }, + { + "epoch": 0.215499809686935, + "grad_norm": 1.7039722204208374, + "learning_rate": 4.4486484883898096e-05, + "loss": 4.9626, + "step": 36235 + }, + { + "epoch": 0.215505756970216, + "grad_norm": 1.7743360996246338, + "learning_rate": 4.448619226533295e-05, + "loss": 4.7605, + "step": 36236 + }, + { + "epoch": 0.215511704253497, + "grad_norm": 1.6669758558273315, + "learning_rate": 4.4485899639965366e-05, + "loss": 4.6322, + "step": 36237 + }, + { + "epoch": 0.215517651536778, + "grad_norm": 1.4342900514602661, + "learning_rate": 4.448560700779546e-05, + "loss": 5.2187, + "step": 36238 + }, + { + "epoch": 0.215523598820059, + "grad_norm": 1.6044615507125854, + "learning_rate": 4.448531436882333e-05, + "loss": 4.8211, + "step": 36239 + }, + { + "epoch": 0.21552954610334, + "grad_norm": 1.783548355102539, + "learning_rate": 4.448502172304908e-05, + "loss": 5.0975, + "step": 36240 + }, + { + "epoch": 0.21553549338662098, + "grad_norm": 1.6044108867645264, + "learning_rate": 4.4484729070472806e-05, + "loss": 4.8102, + "step": 36241 + }, + { + "epoch": 0.215541440669902, + "grad_norm": 1.4885926246643066, + "learning_rate": 4.448443641109462e-05, + "loss": 5.0961, + "step": 36242 + }, + { + "epoch": 0.21554738795318298, + "grad_norm": 1.358034610748291, + "learning_rate": 4.448414374491462e-05, + "loss": 5.0771, + "step": 36243 + }, + { + "epoch": 0.21555333523646397, + "grad_norm": 1.6421840190887451, + "learning_rate": 4.44838510719329e-05, + "loss": 4.769, + "step": 36244 + }, + { + "epoch": 0.215559282519745, + "grad_norm": 1.5576809644699097, + "learning_rate": 4.448355839214958e-05, + "loss": 4.9235, + "step": 36245 + }, + { + "epoch": 0.21556522980302598, + "grad_norm": 1.6670345067977905, + "learning_rate": 4.4483265705564736e-05, + "loss": 4.4313, + "step": 36246 + }, + { + "epoch": 0.21557117708630696, + "grad_norm": 1.5631513595581055, + "learning_rate": 4.44829730121785e-05, + "loss": 4.7763, + "step": 36247 + }, + { + "epoch": 0.21557712436958798, + "grad_norm": 1.5368024110794067, + "learning_rate": 4.4482680311990944e-05, + "loss": 4.8561, + "step": 36248 + }, + { + "epoch": 0.21558307165286897, + "grad_norm": 1.5789357423782349, + "learning_rate": 4.44823876050022e-05, + "loss": 4.9685, + "step": 36249 + }, + { + "epoch": 0.21558901893614996, + "grad_norm": 1.771773338317871, + "learning_rate": 4.4482094891212345e-05, + "loss": 4.7449, + "step": 36250 + }, + { + "epoch": 0.21559496621943097, + "grad_norm": 1.5449539422988892, + "learning_rate": 4.4481802170621496e-05, + "loss": 5.161, + "step": 36251 + }, + { + "epoch": 0.21560091350271196, + "grad_norm": 1.5021382570266724, + "learning_rate": 4.448150944322975e-05, + "loss": 5.3135, + "step": 36252 + }, + { + "epoch": 0.21560686078599295, + "grad_norm": 1.404275894165039, + "learning_rate": 4.448121670903721e-05, + "loss": 5.0408, + "step": 36253 + }, + { + "epoch": 0.21561280806927396, + "grad_norm": 1.5224852561950684, + "learning_rate": 4.448092396804398e-05, + "loss": 4.8239, + "step": 36254 + }, + { + "epoch": 0.21561875535255495, + "grad_norm": 1.8339958190917969, + "learning_rate": 4.4480631220250156e-05, + "loss": 4.8889, + "step": 36255 + }, + { + "epoch": 0.21562470263583594, + "grad_norm": 1.7479435205459595, + "learning_rate": 4.448033846565586e-05, + "loss": 4.9801, + "step": 36256 + }, + { + "epoch": 0.21563064991911696, + "grad_norm": 1.8542855978012085, + "learning_rate": 4.4480045704261164e-05, + "loss": 3.7599, + "step": 36257 + }, + { + "epoch": 0.21563659720239794, + "grad_norm": 1.655678629875183, + "learning_rate": 4.447975293606619e-05, + "loss": 4.702, + "step": 36258 + }, + { + "epoch": 0.21564254448567893, + "grad_norm": 1.811626672744751, + "learning_rate": 4.447946016107104e-05, + "loss": 4.8734, + "step": 36259 + }, + { + "epoch": 0.21564849176895995, + "grad_norm": 1.7445614337921143, + "learning_rate": 4.4479167379275796e-05, + "loss": 4.6599, + "step": 36260 + }, + { + "epoch": 0.21565443905224094, + "grad_norm": 2.8761303424835205, + "learning_rate": 4.447887459068059e-05, + "loss": 3.4399, + "step": 36261 + }, + { + "epoch": 0.21566038633552193, + "grad_norm": 1.9224152565002441, + "learning_rate": 4.4478581795285515e-05, + "loss": 4.2702, + "step": 36262 + }, + { + "epoch": 0.21566633361880294, + "grad_norm": 1.9684844017028809, + "learning_rate": 4.447828899309066e-05, + "loss": 4.2957, + "step": 36263 + }, + { + "epoch": 0.21567228090208393, + "grad_norm": 1.8181499242782593, + "learning_rate": 4.4477996184096125e-05, + "loss": 4.7013, + "step": 36264 + }, + { + "epoch": 0.21567822818536492, + "grad_norm": 1.7259873151779175, + "learning_rate": 4.447770336830204e-05, + "loss": 4.5011, + "step": 36265 + }, + { + "epoch": 0.21568417546864593, + "grad_norm": 1.6804701089859009, + "learning_rate": 4.447741054570849e-05, + "loss": 4.3317, + "step": 36266 + }, + { + "epoch": 0.21569012275192692, + "grad_norm": 1.349643588066101, + "learning_rate": 4.4477117716315565e-05, + "loss": 5.0237, + "step": 36267 + }, + { + "epoch": 0.2156960700352079, + "grad_norm": 1.9172464609146118, + "learning_rate": 4.447682488012338e-05, + "loss": 4.6157, + "step": 36268 + }, + { + "epoch": 0.21570201731848893, + "grad_norm": 1.5372661352157593, + "learning_rate": 4.447653203713205e-05, + "loss": 4.966, + "step": 36269 + }, + { + "epoch": 0.2157079646017699, + "grad_norm": 2.078467845916748, + "learning_rate": 4.447623918734165e-05, + "loss": 4.3186, + "step": 36270 + }, + { + "epoch": 0.2157139118850509, + "grad_norm": 2.3255176544189453, + "learning_rate": 4.44759463307523e-05, + "loss": 2.9728, + "step": 36271 + }, + { + "epoch": 0.21571985916833192, + "grad_norm": 1.7872892618179321, + "learning_rate": 4.4475653467364106e-05, + "loss": 4.2326, + "step": 36272 + }, + { + "epoch": 0.2157258064516129, + "grad_norm": 1.6925581693649292, + "learning_rate": 4.447536059717715e-05, + "loss": 4.6084, + "step": 36273 + }, + { + "epoch": 0.2157317537348939, + "grad_norm": 1.6806141138076782, + "learning_rate": 4.447506772019155e-05, + "loss": 4.7579, + "step": 36274 + }, + { + "epoch": 0.2157377010181749, + "grad_norm": 2.586641788482666, + "learning_rate": 4.447477483640742e-05, + "loss": 3.3903, + "step": 36275 + }, + { + "epoch": 0.2157436483014559, + "grad_norm": 2.487593173980713, + "learning_rate": 4.447448194582483e-05, + "loss": 4.3451, + "step": 36276 + }, + { + "epoch": 0.2157495955847369, + "grad_norm": 1.8467118740081787, + "learning_rate": 4.4474189048443907e-05, + "loss": 4.7606, + "step": 36277 + }, + { + "epoch": 0.2157555428680179, + "grad_norm": 1.3377431631088257, + "learning_rate": 4.447389614426475e-05, + "loss": 4.9565, + "step": 36278 + }, + { + "epoch": 0.2157614901512989, + "grad_norm": 1.932654857635498, + "learning_rate": 4.4473603233287445e-05, + "loss": 4.5205, + "step": 36279 + }, + { + "epoch": 0.21576743743457988, + "grad_norm": 1.7796809673309326, + "learning_rate": 4.4473310315512116e-05, + "loss": 4.3455, + "step": 36280 + }, + { + "epoch": 0.2157733847178609, + "grad_norm": 1.90752112865448, + "learning_rate": 4.4473017390938854e-05, + "loss": 4.3992, + "step": 36281 + }, + { + "epoch": 0.21577933200114188, + "grad_norm": 1.6692253351211548, + "learning_rate": 4.447272445956776e-05, + "loss": 4.6441, + "step": 36282 + }, + { + "epoch": 0.21578527928442287, + "grad_norm": 2.3043060302734375, + "learning_rate": 4.447243152139894e-05, + "loss": 3.1886, + "step": 36283 + }, + { + "epoch": 0.2157912265677039, + "grad_norm": 1.9240410327911377, + "learning_rate": 4.44721385764325e-05, + "loss": 4.2297, + "step": 36284 + }, + { + "epoch": 0.21579717385098487, + "grad_norm": 2.9011518955230713, + "learning_rate": 4.447184562466853e-05, + "loss": 2.8793, + "step": 36285 + }, + { + "epoch": 0.21580312113426586, + "grad_norm": 1.9503211975097656, + "learning_rate": 4.447155266610714e-05, + "loss": 4.9535, + "step": 36286 + }, + { + "epoch": 0.21580906841754685, + "grad_norm": 1.957065463066101, + "learning_rate": 4.4471259700748436e-05, + "loss": 4.0936, + "step": 36287 + }, + { + "epoch": 0.21581501570082787, + "grad_norm": 2.4622087478637695, + "learning_rate": 4.4470966728592515e-05, + "loss": 3.0493, + "step": 36288 + }, + { + "epoch": 0.21582096298410886, + "grad_norm": 2.5405967235565186, + "learning_rate": 4.447067374963948e-05, + "loss": 3.3334, + "step": 36289 + }, + { + "epoch": 0.21582691026738984, + "grad_norm": 2.6690690517425537, + "learning_rate": 4.447038076388944e-05, + "loss": 3.5309, + "step": 36290 + }, + { + "epoch": 0.21583285755067086, + "grad_norm": 2.7902510166168213, + "learning_rate": 4.447008777134248e-05, + "loss": 3.5418, + "step": 36291 + }, + { + "epoch": 0.21583880483395185, + "grad_norm": 2.6572537422180176, + "learning_rate": 4.4469794771998726e-05, + "loss": 3.4522, + "step": 36292 + }, + { + "epoch": 0.21584475211723284, + "grad_norm": 2.5804686546325684, + "learning_rate": 4.446950176585826e-05, + "loss": 3.5556, + "step": 36293 + }, + { + "epoch": 0.21585069940051385, + "grad_norm": 2.6215248107910156, + "learning_rate": 4.4469208752921196e-05, + "loss": 3.6112, + "step": 36294 + }, + { + "epoch": 0.21585664668379484, + "grad_norm": 2.5278232097625732, + "learning_rate": 4.4468915733187624e-05, + "loss": 3.4373, + "step": 36295 + }, + { + "epoch": 0.21586259396707583, + "grad_norm": 2.2777929306030273, + "learning_rate": 4.446862270665766e-05, + "loss": 3.0397, + "step": 36296 + }, + { + "epoch": 0.21586854125035684, + "grad_norm": 2.003936529159546, + "learning_rate": 4.446832967333141e-05, + "loss": 3.7177, + "step": 36297 + }, + { + "epoch": 0.21587448853363783, + "grad_norm": 2.218179941177368, + "learning_rate": 4.446803663320895e-05, + "loss": 4.398, + "step": 36298 + }, + { + "epoch": 0.21588043581691882, + "grad_norm": 2.1191961765289307, + "learning_rate": 4.4467743586290414e-05, + "loss": 4.3687, + "step": 36299 + }, + { + "epoch": 0.21588638310019984, + "grad_norm": 2.0627639293670654, + "learning_rate": 4.446745053257588e-05, + "loss": 4.4121, + "step": 36300 + }, + { + "epoch": 0.21589233038348082, + "grad_norm": 2.177537441253662, + "learning_rate": 4.446715747206547e-05, + "loss": 4.7772, + "step": 36301 + }, + { + "epoch": 0.2158982776667618, + "grad_norm": 2.4316155910491943, + "learning_rate": 4.446686440475927e-05, + "loss": 3.606, + "step": 36302 + }, + { + "epoch": 0.21590422495004283, + "grad_norm": 1.6192671060562134, + "learning_rate": 4.446657133065739e-05, + "loss": 4.9919, + "step": 36303 + }, + { + "epoch": 0.21591017223332382, + "grad_norm": 1.7824963331222534, + "learning_rate": 4.446627824975993e-05, + "loss": 5.085, + "step": 36304 + }, + { + "epoch": 0.2159161195166048, + "grad_norm": 2.347855806350708, + "learning_rate": 4.446598516206699e-05, + "loss": 3.9342, + "step": 36305 + }, + { + "epoch": 0.21592206679988582, + "grad_norm": 2.2459559440612793, + "learning_rate": 4.446569206757868e-05, + "loss": 3.7066, + "step": 36306 + }, + { + "epoch": 0.2159280140831668, + "grad_norm": 1.8832706212997437, + "learning_rate": 4.44653989662951e-05, + "loss": 4.3872, + "step": 36307 + }, + { + "epoch": 0.2159339613664478, + "grad_norm": 1.6729106903076172, + "learning_rate": 4.4465105858216346e-05, + "loss": 4.5845, + "step": 36308 + }, + { + "epoch": 0.2159399086497288, + "grad_norm": 1.494909644126892, + "learning_rate": 4.446481274334253e-05, + "loss": 5.0, + "step": 36309 + }, + { + "epoch": 0.2159458559330098, + "grad_norm": 1.655707597732544, + "learning_rate": 4.446451962167375e-05, + "loss": 5.1941, + "step": 36310 + }, + { + "epoch": 0.2159518032162908, + "grad_norm": 2.25812029838562, + "learning_rate": 4.4464226493210105e-05, + "loss": 4.5174, + "step": 36311 + }, + { + "epoch": 0.2159577504995718, + "grad_norm": 1.9949771165847778, + "learning_rate": 4.4463933357951695e-05, + "loss": 4.6311, + "step": 36312 + }, + { + "epoch": 0.2159636977828528, + "grad_norm": 1.69150710105896, + "learning_rate": 4.446364021589863e-05, + "loss": 4.9013, + "step": 36313 + }, + { + "epoch": 0.21596964506613378, + "grad_norm": 2.227994680404663, + "learning_rate": 4.4463347067051006e-05, + "loss": 4.3162, + "step": 36314 + }, + { + "epoch": 0.2159755923494148, + "grad_norm": 3.0076286792755127, + "learning_rate": 4.446305391140894e-05, + "loss": 4.0106, + "step": 36315 + }, + { + "epoch": 0.21598153963269578, + "grad_norm": 2.24741268157959, + "learning_rate": 4.4462760748972507e-05, + "loss": 3.759, + "step": 36316 + }, + { + "epoch": 0.21598748691597677, + "grad_norm": 1.5488991737365723, + "learning_rate": 4.4462467579741834e-05, + "loss": 4.5564, + "step": 36317 + }, + { + "epoch": 0.2159934341992578, + "grad_norm": 1.7913551330566406, + "learning_rate": 4.4462174403717016e-05, + "loss": 4.8823, + "step": 36318 + }, + { + "epoch": 0.21599938148253878, + "grad_norm": 2.324786901473999, + "learning_rate": 4.446188122089815e-05, + "loss": 4.1834, + "step": 36319 + }, + { + "epoch": 0.21600532876581977, + "grad_norm": 1.3889487981796265, + "learning_rate": 4.446158803128534e-05, + "loss": 5.1393, + "step": 36320 + }, + { + "epoch": 0.21601127604910078, + "grad_norm": 1.303863525390625, + "learning_rate": 4.44612948348787e-05, + "loss": 5.0815, + "step": 36321 + }, + { + "epoch": 0.21601722333238177, + "grad_norm": 1.2250717878341675, + "learning_rate": 4.446100163167831e-05, + "loss": 5.4439, + "step": 36322 + }, + { + "epoch": 0.21602317061566276, + "grad_norm": 1.3837891817092896, + "learning_rate": 4.4460708421684295e-05, + "loss": 5.0406, + "step": 36323 + }, + { + "epoch": 0.21602911789894377, + "grad_norm": 1.6228313446044922, + "learning_rate": 4.446041520489675e-05, + "loss": 5.0236, + "step": 36324 + }, + { + "epoch": 0.21603506518222476, + "grad_norm": 1.707972764968872, + "learning_rate": 4.446012198131577e-05, + "loss": 4.6587, + "step": 36325 + }, + { + "epoch": 0.21604101246550575, + "grad_norm": 1.5421570539474487, + "learning_rate": 4.4459828750941465e-05, + "loss": 4.8926, + "step": 36326 + }, + { + "epoch": 0.21604695974878677, + "grad_norm": 1.5230952501296997, + "learning_rate": 4.445953551377393e-05, + "loss": 5.042, + "step": 36327 + }, + { + "epoch": 0.21605290703206775, + "grad_norm": 1.3272488117218018, + "learning_rate": 4.445924226981327e-05, + "loss": 5.1375, + "step": 36328 + }, + { + "epoch": 0.21605885431534874, + "grad_norm": 1.1550372838974, + "learning_rate": 4.4458949019059606e-05, + "loss": 5.2326, + "step": 36329 + }, + { + "epoch": 0.21606480159862976, + "grad_norm": 1.3413779735565186, + "learning_rate": 4.445865576151301e-05, + "loss": 4.9879, + "step": 36330 + }, + { + "epoch": 0.21607074888191075, + "grad_norm": 1.5402988195419312, + "learning_rate": 4.44583624971736e-05, + "loss": 5.1719, + "step": 36331 + }, + { + "epoch": 0.21607669616519173, + "grad_norm": 2.013479471206665, + "learning_rate": 4.445806922604148e-05, + "loss": 4.5107, + "step": 36332 + }, + { + "epoch": 0.21608264344847275, + "grad_norm": 1.3441870212554932, + "learning_rate": 4.445777594811674e-05, + "loss": 4.706, + "step": 36333 + }, + { + "epoch": 0.21608859073175374, + "grad_norm": 1.5314089059829712, + "learning_rate": 4.44574826633995e-05, + "loss": 4.737, + "step": 36334 + }, + { + "epoch": 0.21609453801503473, + "grad_norm": 1.3800076246261597, + "learning_rate": 4.445718937188985e-05, + "loss": 4.9501, + "step": 36335 + }, + { + "epoch": 0.21610048529831574, + "grad_norm": 1.5042531490325928, + "learning_rate": 4.4456896073587905e-05, + "loss": 4.8638, + "step": 36336 + }, + { + "epoch": 0.21610643258159673, + "grad_norm": 1.4311203956604004, + "learning_rate": 4.445660276849375e-05, + "loss": 4.8713, + "step": 36337 + }, + { + "epoch": 0.21611237986487772, + "grad_norm": 1.5277742147445679, + "learning_rate": 4.44563094566075e-05, + "loss": 4.9963, + "step": 36338 + }, + { + "epoch": 0.21611832714815873, + "grad_norm": 1.7784839868545532, + "learning_rate": 4.4456016137929246e-05, + "loss": 5.0083, + "step": 36339 + }, + { + "epoch": 0.21612427443143972, + "grad_norm": 1.3861591815948486, + "learning_rate": 4.4455722812459104e-05, + "loss": 4.8264, + "step": 36340 + }, + { + "epoch": 0.2161302217147207, + "grad_norm": 1.4573569297790527, + "learning_rate": 4.445542948019717e-05, + "loss": 4.8561, + "step": 36341 + }, + { + "epoch": 0.21613616899800173, + "grad_norm": 1.3556313514709473, + "learning_rate": 4.445513614114355e-05, + "loss": 4.8997, + "step": 36342 + }, + { + "epoch": 0.21614211628128271, + "grad_norm": 1.5516074895858765, + "learning_rate": 4.445484279529834e-05, + "loss": 4.8283, + "step": 36343 + }, + { + "epoch": 0.2161480635645637, + "grad_norm": 1.4483047723770142, + "learning_rate": 4.445454944266164e-05, + "loss": 4.9132, + "step": 36344 + }, + { + "epoch": 0.2161540108478447, + "grad_norm": 1.6741615533828735, + "learning_rate": 4.4454256083233556e-05, + "loss": 4.6512, + "step": 36345 + }, + { + "epoch": 0.2161599581311257, + "grad_norm": 1.598311424255371, + "learning_rate": 4.445396271701421e-05, + "loss": 4.9215, + "step": 36346 + }, + { + "epoch": 0.2161659054144067, + "grad_norm": 1.5425868034362793, + "learning_rate": 4.445366934400367e-05, + "loss": 4.8261, + "step": 36347 + }, + { + "epoch": 0.21617185269768768, + "grad_norm": 1.6026711463928223, + "learning_rate": 4.445337596420206e-05, + "loss": 4.8442, + "step": 36348 + }, + { + "epoch": 0.2161777999809687, + "grad_norm": 1.524340271949768, + "learning_rate": 4.4453082577609474e-05, + "loss": 4.9961, + "step": 36349 + }, + { + "epoch": 0.2161837472642497, + "grad_norm": 1.5798773765563965, + "learning_rate": 4.445278918422602e-05, + "loss": 4.9096, + "step": 36350 + }, + { + "epoch": 0.21618969454753068, + "grad_norm": 1.4556652307510376, + "learning_rate": 4.44524957840518e-05, + "loss": 4.8589, + "step": 36351 + }, + { + "epoch": 0.2161956418308117, + "grad_norm": 1.522506833076477, + "learning_rate": 4.445220237708692e-05, + "loss": 4.8721, + "step": 36352 + }, + { + "epoch": 0.21620158911409268, + "grad_norm": 1.347317099571228, + "learning_rate": 4.445190896333147e-05, + "loss": 4.9497, + "step": 36353 + }, + { + "epoch": 0.21620753639737367, + "grad_norm": 1.5334205627441406, + "learning_rate": 4.445161554278556e-05, + "loss": 4.9534, + "step": 36354 + }, + { + "epoch": 0.21621348368065468, + "grad_norm": 1.5388821363449097, + "learning_rate": 4.445132211544929e-05, + "loss": 4.8753, + "step": 36355 + }, + { + "epoch": 0.21621943096393567, + "grad_norm": 1.5709154605865479, + "learning_rate": 4.4451028681322764e-05, + "loss": 4.8397, + "step": 36356 + }, + { + "epoch": 0.21622537824721666, + "grad_norm": 1.835668683052063, + "learning_rate": 4.445073524040609e-05, + "loss": 5.2878, + "step": 36357 + }, + { + "epoch": 0.21623132553049768, + "grad_norm": 1.3644315004348755, + "learning_rate": 4.445044179269936e-05, + "loss": 4.6898, + "step": 36358 + }, + { + "epoch": 0.21623727281377866, + "grad_norm": 2.2211451530456543, + "learning_rate": 4.445014833820269e-05, + "loss": 4.1424, + "step": 36359 + }, + { + "epoch": 0.21624322009705965, + "grad_norm": 1.6837176084518433, + "learning_rate": 4.444985487691617e-05, + "loss": 5.2938, + "step": 36360 + }, + { + "epoch": 0.21624916738034067, + "grad_norm": 1.6217468976974487, + "learning_rate": 4.44495614088399e-05, + "loss": 4.6109, + "step": 36361 + }, + { + "epoch": 0.21625511466362166, + "grad_norm": 1.295923113822937, + "learning_rate": 4.4449267933974e-05, + "loss": 4.9983, + "step": 36362 + }, + { + "epoch": 0.21626106194690264, + "grad_norm": 1.8383874893188477, + "learning_rate": 4.444897445231855e-05, + "loss": 4.2286, + "step": 36363 + }, + { + "epoch": 0.21626700923018366, + "grad_norm": 1.6389504671096802, + "learning_rate": 4.4448680963873674e-05, + "loss": 5.3462, + "step": 36364 + }, + { + "epoch": 0.21627295651346465, + "grad_norm": 1.5260887145996094, + "learning_rate": 4.444838746863946e-05, + "loss": 4.7547, + "step": 36365 + }, + { + "epoch": 0.21627890379674564, + "grad_norm": 1.573678970336914, + "learning_rate": 4.4448093966616015e-05, + "loss": 4.4435, + "step": 36366 + }, + { + "epoch": 0.21628485108002665, + "grad_norm": 1.3941434621810913, + "learning_rate": 4.4447800457803444e-05, + "loss": 4.4243, + "step": 36367 + }, + { + "epoch": 0.21629079836330764, + "grad_norm": 1.664817214012146, + "learning_rate": 4.444750694220184e-05, + "loss": 4.4748, + "step": 36368 + }, + { + "epoch": 0.21629674564658863, + "grad_norm": 1.423172116279602, + "learning_rate": 4.444721341981132e-05, + "loss": 4.5332, + "step": 36369 + }, + { + "epoch": 0.21630269292986964, + "grad_norm": 1.7631560564041138, + "learning_rate": 4.444691989063198e-05, + "loss": 5.0456, + "step": 36370 + }, + { + "epoch": 0.21630864021315063, + "grad_norm": 1.3937678337097168, + "learning_rate": 4.444662635466391e-05, + "loss": 5.1116, + "step": 36371 + }, + { + "epoch": 0.21631458749643162, + "grad_norm": 1.5468742847442627, + "learning_rate": 4.444633281190723e-05, + "loss": 5.0551, + "step": 36372 + }, + { + "epoch": 0.21632053477971264, + "grad_norm": 1.6004170179367065, + "learning_rate": 4.444603926236204e-05, + "loss": 4.7766, + "step": 36373 + }, + { + "epoch": 0.21632648206299362, + "grad_norm": 1.4662137031555176, + "learning_rate": 4.444574570602843e-05, + "loss": 4.9473, + "step": 36374 + }, + { + "epoch": 0.2163324293462746, + "grad_norm": 1.4400924444198608, + "learning_rate": 4.4445452142906515e-05, + "loss": 4.9529, + "step": 36375 + }, + { + "epoch": 0.21633837662955563, + "grad_norm": 1.3921599388122559, + "learning_rate": 4.44451585729964e-05, + "loss": 5.5826, + "step": 36376 + }, + { + "epoch": 0.21634432391283662, + "grad_norm": 1.650146722793579, + "learning_rate": 4.444486499629818e-05, + "loss": 4.3338, + "step": 36377 + }, + { + "epoch": 0.2163502711961176, + "grad_norm": 1.5027433633804321, + "learning_rate": 4.4444571412811954e-05, + "loss": 4.5485, + "step": 36378 + }, + { + "epoch": 0.21635621847939862, + "grad_norm": 1.3315762281417847, + "learning_rate": 4.4444277822537826e-05, + "loss": 5.277, + "step": 36379 + }, + { + "epoch": 0.2163621657626796, + "grad_norm": 1.5802031755447388, + "learning_rate": 4.44439842254759e-05, + "loss": 4.9875, + "step": 36380 + }, + { + "epoch": 0.2163681130459606, + "grad_norm": 1.4244681596755981, + "learning_rate": 4.444369062162629e-05, + "loss": 5.0073, + "step": 36381 + }, + { + "epoch": 0.2163740603292416, + "grad_norm": 1.5206032991409302, + "learning_rate": 4.444339701098909e-05, + "loss": 4.8693, + "step": 36382 + }, + { + "epoch": 0.2163800076125226, + "grad_norm": 1.3556402921676636, + "learning_rate": 4.444310339356439e-05, + "loss": 4.8651, + "step": 36383 + }, + { + "epoch": 0.2163859548958036, + "grad_norm": 1.7892037630081177, + "learning_rate": 4.44428097693523e-05, + "loss": 4.5375, + "step": 36384 + }, + { + "epoch": 0.2163919021790846, + "grad_norm": 1.5106563568115234, + "learning_rate": 4.444251613835294e-05, + "loss": 5.0685, + "step": 36385 + }, + { + "epoch": 0.2163978494623656, + "grad_norm": 1.341135859489441, + "learning_rate": 4.444222250056639e-05, + "loss": 5.1778, + "step": 36386 + }, + { + "epoch": 0.21640379674564658, + "grad_norm": 1.385373592376709, + "learning_rate": 4.444192885599276e-05, + "loss": 4.8729, + "step": 36387 + }, + { + "epoch": 0.2164097440289276, + "grad_norm": 1.454485535621643, + "learning_rate": 4.4441635204632156e-05, + "loss": 5.0157, + "step": 36388 + }, + { + "epoch": 0.21641569131220859, + "grad_norm": 1.7790766954421997, + "learning_rate": 4.444134154648468e-05, + "loss": 4.4941, + "step": 36389 + }, + { + "epoch": 0.21642163859548957, + "grad_norm": 1.628504991531372, + "learning_rate": 4.444104788155043e-05, + "loss": 4.8383, + "step": 36390 + }, + { + "epoch": 0.2164275858787706, + "grad_norm": 1.4350956678390503, + "learning_rate": 4.444075420982951e-05, + "loss": 5.241, + "step": 36391 + }, + { + "epoch": 0.21643353316205158, + "grad_norm": 1.3836671113967896, + "learning_rate": 4.444046053132202e-05, + "loss": 4.9542, + "step": 36392 + }, + { + "epoch": 0.21643948044533257, + "grad_norm": 1.3507336378097534, + "learning_rate": 4.4440166846028084e-05, + "loss": 5.1188, + "step": 36393 + }, + { + "epoch": 0.21644542772861358, + "grad_norm": 1.6130249500274658, + "learning_rate": 4.443987315394778e-05, + "loss": 4.6271, + "step": 36394 + }, + { + "epoch": 0.21645137501189457, + "grad_norm": 1.2183295488357544, + "learning_rate": 4.443957945508121e-05, + "loss": 5.0471, + "step": 36395 + }, + { + "epoch": 0.21645732229517556, + "grad_norm": 1.1758854389190674, + "learning_rate": 4.443928574942848e-05, + "loss": 4.9255, + "step": 36396 + }, + { + "epoch": 0.21646326957845657, + "grad_norm": 1.2792357206344604, + "learning_rate": 4.44389920369897e-05, + "loss": 4.9221, + "step": 36397 + }, + { + "epoch": 0.21646921686173756, + "grad_norm": 1.5022220611572266, + "learning_rate": 4.443869831776497e-05, + "loss": 5.1525, + "step": 36398 + }, + { + "epoch": 0.21647516414501855, + "grad_norm": 1.5304787158966064, + "learning_rate": 4.443840459175439e-05, + "loss": 5.3113, + "step": 36399 + }, + { + "epoch": 0.21648111142829957, + "grad_norm": 1.3236007690429688, + "learning_rate": 4.443811085895807e-05, + "loss": 5.0398, + "step": 36400 + }, + { + "epoch": 0.21648705871158055, + "grad_norm": 1.2853519916534424, + "learning_rate": 4.44378171193761e-05, + "loss": 5.2451, + "step": 36401 + }, + { + "epoch": 0.21649300599486154, + "grad_norm": 1.410645842552185, + "learning_rate": 4.443752337300859e-05, + "loss": 4.9105, + "step": 36402 + }, + { + "epoch": 0.21649895327814253, + "grad_norm": 2.0519766807556152, + "learning_rate": 4.443722961985564e-05, + "loss": 3.9625, + "step": 36403 + }, + { + "epoch": 0.21650490056142355, + "grad_norm": 1.5357091426849365, + "learning_rate": 4.443693585991736e-05, + "loss": 4.7008, + "step": 36404 + }, + { + "epoch": 0.21651084784470453, + "grad_norm": 1.5789777040481567, + "learning_rate": 4.443664209319383e-05, + "loss": 5.0572, + "step": 36405 + }, + { + "epoch": 0.21651679512798552, + "grad_norm": 1.5537595748901367, + "learning_rate": 4.443634831968519e-05, + "loss": 4.5723, + "step": 36406 + }, + { + "epoch": 0.21652274241126654, + "grad_norm": 1.5900410413742065, + "learning_rate": 4.4436054539391516e-05, + "loss": 4.9849, + "step": 36407 + }, + { + "epoch": 0.21652868969454753, + "grad_norm": 1.1238914728164673, + "learning_rate": 4.443576075231291e-05, + "loss": 4.66, + "step": 36408 + }, + { + "epoch": 0.21653463697782852, + "grad_norm": 1.427838921546936, + "learning_rate": 4.4435466958449485e-05, + "loss": 4.7631, + "step": 36409 + }, + { + "epoch": 0.21654058426110953, + "grad_norm": 1.6186624765396118, + "learning_rate": 4.4435173157801334e-05, + "loss": 5.1209, + "step": 36410 + }, + { + "epoch": 0.21654653154439052, + "grad_norm": 1.5863722562789917, + "learning_rate": 4.443487935036857e-05, + "loss": 5.0902, + "step": 36411 + }, + { + "epoch": 0.2165524788276715, + "grad_norm": 1.7387241125106812, + "learning_rate": 4.443458553615129e-05, + "loss": 5.1191, + "step": 36412 + }, + { + "epoch": 0.21655842611095252, + "grad_norm": 1.605112910270691, + "learning_rate": 4.4434291715149603e-05, + "loss": 4.8986, + "step": 36413 + }, + { + "epoch": 0.2165643733942335, + "grad_norm": 1.6139943599700928, + "learning_rate": 4.4433997887363595e-05, + "loss": 4.9041, + "step": 36414 + }, + { + "epoch": 0.2165703206775145, + "grad_norm": 1.6797585487365723, + "learning_rate": 4.443370405279338e-05, + "loss": 4.8667, + "step": 36415 + }, + { + "epoch": 0.21657626796079552, + "grad_norm": 1.4826325178146362, + "learning_rate": 4.443341021143906e-05, + "loss": 4.9097, + "step": 36416 + }, + { + "epoch": 0.2165822152440765, + "grad_norm": 1.4120008945465088, + "learning_rate": 4.443311636330074e-05, + "loss": 4.842, + "step": 36417 + }, + { + "epoch": 0.2165881625273575, + "grad_norm": 1.6395269632339478, + "learning_rate": 4.443282250837852e-05, + "loss": 4.798, + "step": 36418 + }, + { + "epoch": 0.2165941098106385, + "grad_norm": 1.6432803869247437, + "learning_rate": 4.44325286466725e-05, + "loss": 4.9683, + "step": 36419 + }, + { + "epoch": 0.2166000570939195, + "grad_norm": 1.3104444742202759, + "learning_rate": 4.443223477818279e-05, + "loss": 5.0913, + "step": 36420 + }, + { + "epoch": 0.21660600437720048, + "grad_norm": 1.674682855606079, + "learning_rate": 4.443194090290949e-05, + "loss": 3.7327, + "step": 36421 + }, + { + "epoch": 0.2166119516604815, + "grad_norm": 1.6492141485214233, + "learning_rate": 4.443164702085269e-05, + "loss": 4.5528, + "step": 36422 + }, + { + "epoch": 0.2166178989437625, + "grad_norm": 1.6470035314559937, + "learning_rate": 4.443135313201251e-05, + "loss": 5.1476, + "step": 36423 + }, + { + "epoch": 0.21662384622704348, + "grad_norm": 1.5849100351333618, + "learning_rate": 4.443105923638904e-05, + "loss": 4.9662, + "step": 36424 + }, + { + "epoch": 0.2166297935103245, + "grad_norm": 1.444566249847412, + "learning_rate": 4.443076533398239e-05, + "loss": 4.9241, + "step": 36425 + }, + { + "epoch": 0.21663574079360548, + "grad_norm": 1.5952868461608887, + "learning_rate": 4.443047142479266e-05, + "loss": 4.766, + "step": 36426 + }, + { + "epoch": 0.21664168807688647, + "grad_norm": 1.3955894708633423, + "learning_rate": 4.443017750881996e-05, + "loss": 5.1851, + "step": 36427 + }, + { + "epoch": 0.21664763536016748, + "grad_norm": 1.377500057220459, + "learning_rate": 4.442988358606438e-05, + "loss": 4.8027, + "step": 36428 + }, + { + "epoch": 0.21665358264344847, + "grad_norm": 1.313023328781128, + "learning_rate": 4.4429589656526024e-05, + "loss": 4.825, + "step": 36429 + }, + { + "epoch": 0.21665952992672946, + "grad_norm": 1.479194164276123, + "learning_rate": 4.442929572020501e-05, + "loss": 4.9135, + "step": 36430 + }, + { + "epoch": 0.21666547721001048, + "grad_norm": 1.3001906871795654, + "learning_rate": 4.442900177710142e-05, + "loss": 4.9644, + "step": 36431 + }, + { + "epoch": 0.21667142449329146, + "grad_norm": 1.6930853128433228, + "learning_rate": 4.4428707827215374e-05, + "loss": 5.0808, + "step": 36432 + }, + { + "epoch": 0.21667737177657245, + "grad_norm": 1.3844190835952759, + "learning_rate": 4.442841387054696e-05, + "loss": 4.7619, + "step": 36433 + }, + { + "epoch": 0.21668331905985347, + "grad_norm": 1.3809784650802612, + "learning_rate": 4.4428119907096285e-05, + "loss": 4.7743, + "step": 36434 + }, + { + "epoch": 0.21668926634313446, + "grad_norm": 1.5848809480667114, + "learning_rate": 4.4427825936863465e-05, + "loss": 5.2092, + "step": 36435 + }, + { + "epoch": 0.21669521362641544, + "grad_norm": 1.2051990032196045, + "learning_rate": 4.442753195984859e-05, + "loss": 5.3504, + "step": 36436 + }, + { + "epoch": 0.21670116090969646, + "grad_norm": 1.4225530624389648, + "learning_rate": 4.4427237976051754e-05, + "loss": 5.5421, + "step": 36437 + }, + { + "epoch": 0.21670710819297745, + "grad_norm": 1.548554539680481, + "learning_rate": 4.442694398547308e-05, + "loss": 5.0913, + "step": 36438 + }, + { + "epoch": 0.21671305547625844, + "grad_norm": 1.8550792932510376, + "learning_rate": 4.4426649988112654e-05, + "loss": 5.1924, + "step": 36439 + }, + { + "epoch": 0.21671900275953945, + "grad_norm": 1.9623850584030151, + "learning_rate": 4.442635598397059e-05, + "loss": 4.5524, + "step": 36440 + }, + { + "epoch": 0.21672495004282044, + "grad_norm": 1.6146697998046875, + "learning_rate": 4.442606197304698e-05, + "loss": 5.2329, + "step": 36441 + }, + { + "epoch": 0.21673089732610143, + "grad_norm": 3.1403307914733887, + "learning_rate": 4.442576795534193e-05, + "loss": 3.5127, + "step": 36442 + }, + { + "epoch": 0.21673684460938245, + "grad_norm": 1.3766248226165771, + "learning_rate": 4.4425473930855554e-05, + "loss": 4.8523, + "step": 36443 + }, + { + "epoch": 0.21674279189266343, + "grad_norm": 1.4641730785369873, + "learning_rate": 4.4425179899587945e-05, + "loss": 4.8117, + "step": 36444 + }, + { + "epoch": 0.21674873917594442, + "grad_norm": 1.4823542833328247, + "learning_rate": 4.442488586153921e-05, + "loss": 4.7413, + "step": 36445 + }, + { + "epoch": 0.21675468645922544, + "grad_norm": 1.401440143585205, + "learning_rate": 4.4424591816709436e-05, + "loss": 4.7065, + "step": 36446 + }, + { + "epoch": 0.21676063374250643, + "grad_norm": 1.3161439895629883, + "learning_rate": 4.4424297765098745e-05, + "loss": 4.9648, + "step": 36447 + }, + { + "epoch": 0.2167665810257874, + "grad_norm": 1.7631399631500244, + "learning_rate": 4.442400370670723e-05, + "loss": 4.9201, + "step": 36448 + }, + { + "epoch": 0.21677252830906843, + "grad_norm": 1.556435227394104, + "learning_rate": 4.4423709641535e-05, + "loss": 5.1891, + "step": 36449 + }, + { + "epoch": 0.21677847559234942, + "grad_norm": 1.5537375211715698, + "learning_rate": 4.442341556958215e-05, + "loss": 5.1704, + "step": 36450 + }, + { + "epoch": 0.2167844228756304, + "grad_norm": 1.9972381591796875, + "learning_rate": 4.4423121490848785e-05, + "loss": 4.9716, + "step": 36451 + }, + { + "epoch": 0.21679037015891142, + "grad_norm": 1.3904248476028442, + "learning_rate": 4.442282740533501e-05, + "loss": 5.1136, + "step": 36452 + }, + { + "epoch": 0.2167963174421924, + "grad_norm": 1.5099358558654785, + "learning_rate": 4.442253331304093e-05, + "loss": 4.9462, + "step": 36453 + }, + { + "epoch": 0.2168022647254734, + "grad_norm": 1.4884952306747437, + "learning_rate": 4.4422239213966645e-05, + "loss": 4.9833, + "step": 36454 + }, + { + "epoch": 0.21680821200875441, + "grad_norm": 1.6606906652450562, + "learning_rate": 4.442194510811225e-05, + "loss": 4.9137, + "step": 36455 + }, + { + "epoch": 0.2168141592920354, + "grad_norm": 2.3339645862579346, + "learning_rate": 4.442165099547786e-05, + "loss": 4.6693, + "step": 36456 + }, + { + "epoch": 0.2168201065753164, + "grad_norm": 1.3238904476165771, + "learning_rate": 4.4421356876063566e-05, + "loss": 4.9108, + "step": 36457 + }, + { + "epoch": 0.2168260538585974, + "grad_norm": 1.689540982246399, + "learning_rate": 4.442106274986949e-05, + "loss": 5.0264, + "step": 36458 + }, + { + "epoch": 0.2168320011418784, + "grad_norm": 1.574047327041626, + "learning_rate": 4.4420768616895714e-05, + "loss": 5.0985, + "step": 36459 + }, + { + "epoch": 0.21683794842515938, + "grad_norm": 1.422987699508667, + "learning_rate": 4.442047447714234e-05, + "loss": 4.98, + "step": 36460 + }, + { + "epoch": 0.21684389570844037, + "grad_norm": 1.5349971055984497, + "learning_rate": 4.442018033060949e-05, + "loss": 4.9211, + "step": 36461 + }, + { + "epoch": 0.2168498429917214, + "grad_norm": 1.6502734422683716, + "learning_rate": 4.441988617729726e-05, + "loss": 5.2832, + "step": 36462 + }, + { + "epoch": 0.21685579027500237, + "grad_norm": 1.5576223134994507, + "learning_rate": 4.4419592017205735e-05, + "loss": 4.7322, + "step": 36463 + }, + { + "epoch": 0.21686173755828336, + "grad_norm": 1.554739236831665, + "learning_rate": 4.4419297850335036e-05, + "loss": 4.9784, + "step": 36464 + }, + { + "epoch": 0.21686768484156438, + "grad_norm": 1.572361946105957, + "learning_rate": 4.441900367668526e-05, + "loss": 4.631, + "step": 36465 + }, + { + "epoch": 0.21687363212484537, + "grad_norm": 1.6870968341827393, + "learning_rate": 4.441870949625652e-05, + "loss": 4.6154, + "step": 36466 + }, + { + "epoch": 0.21687957940812636, + "grad_norm": 1.7353061437606812, + "learning_rate": 4.441841530904889e-05, + "loss": 5.0098, + "step": 36467 + }, + { + "epoch": 0.21688552669140737, + "grad_norm": 1.622704267501831, + "learning_rate": 4.4418121115062506e-05, + "loss": 5.1513, + "step": 36468 + }, + { + "epoch": 0.21689147397468836, + "grad_norm": 1.66656494140625, + "learning_rate": 4.441782691429746e-05, + "loss": 5.1693, + "step": 36469 + }, + { + "epoch": 0.21689742125796935, + "grad_norm": 1.3424537181854248, + "learning_rate": 4.441753270675384e-05, + "loss": 5.17, + "step": 36470 + }, + { + "epoch": 0.21690336854125036, + "grad_norm": 1.405543327331543, + "learning_rate": 4.441723849243177e-05, + "loss": 5.0782, + "step": 36471 + }, + { + "epoch": 0.21690931582453135, + "grad_norm": 1.5939109325408936, + "learning_rate": 4.441694427133133e-05, + "loss": 4.4947, + "step": 36472 + }, + { + "epoch": 0.21691526310781234, + "grad_norm": 1.6071003675460815, + "learning_rate": 4.441665004345265e-05, + "loss": 4.7007, + "step": 36473 + }, + { + "epoch": 0.21692121039109336, + "grad_norm": 1.5466821193695068, + "learning_rate": 4.441635580879581e-05, + "loss": 4.6536, + "step": 36474 + }, + { + "epoch": 0.21692715767437434, + "grad_norm": 1.6554591655731201, + "learning_rate": 4.441606156736092e-05, + "loss": 4.4594, + "step": 36475 + }, + { + "epoch": 0.21693310495765533, + "grad_norm": 1.5430635213851929, + "learning_rate": 4.441576731914808e-05, + "loss": 4.6767, + "step": 36476 + }, + { + "epoch": 0.21693905224093635, + "grad_norm": 1.5925291776657104, + "learning_rate": 4.441547306415741e-05, + "loss": 4.8824, + "step": 36477 + }, + { + "epoch": 0.21694499952421734, + "grad_norm": 1.6011813879013062, + "learning_rate": 4.4415178802388986e-05, + "loss": 4.8297, + "step": 36478 + }, + { + "epoch": 0.21695094680749832, + "grad_norm": 1.3895270824432373, + "learning_rate": 4.4414884533842925e-05, + "loss": 5.1119, + "step": 36479 + }, + { + "epoch": 0.21695689409077934, + "grad_norm": 1.9162174463272095, + "learning_rate": 4.4414590258519334e-05, + "loss": 4.4094, + "step": 36480 + }, + { + "epoch": 0.21696284137406033, + "grad_norm": 2.446828603744507, + "learning_rate": 4.4414295976418306e-05, + "loss": 4.2579, + "step": 36481 + }, + { + "epoch": 0.21696878865734132, + "grad_norm": 1.7227983474731445, + "learning_rate": 4.441400168753995e-05, + "loss": 4.8391, + "step": 36482 + }, + { + "epoch": 0.21697473594062233, + "grad_norm": 1.6229579448699951, + "learning_rate": 4.4413707391884364e-05, + "loss": 4.8293, + "step": 36483 + }, + { + "epoch": 0.21698068322390332, + "grad_norm": 1.9420546293258667, + "learning_rate": 4.441341308945165e-05, + "loss": 3.5347, + "step": 36484 + }, + { + "epoch": 0.2169866305071843, + "grad_norm": 1.9547382593154907, + "learning_rate": 4.4413118780241925e-05, + "loss": 4.3297, + "step": 36485 + }, + { + "epoch": 0.21699257779046532, + "grad_norm": 1.977729082107544, + "learning_rate": 4.441282446425528e-05, + "loss": 4.1395, + "step": 36486 + }, + { + "epoch": 0.2169985250737463, + "grad_norm": 1.5033422708511353, + "learning_rate": 4.441253014149181e-05, + "loss": 5.0993, + "step": 36487 + }, + { + "epoch": 0.2170044723570273, + "grad_norm": 1.4688711166381836, + "learning_rate": 4.441223581195163e-05, + "loss": 5.0349, + "step": 36488 + }, + { + "epoch": 0.21701041964030832, + "grad_norm": 2.3826143741607666, + "learning_rate": 4.4411941475634844e-05, + "loss": 4.3747, + "step": 36489 + }, + { + "epoch": 0.2170163669235893, + "grad_norm": 2.2079734802246094, + "learning_rate": 4.441164713254154e-05, + "loss": 3.9939, + "step": 36490 + }, + { + "epoch": 0.2170223142068703, + "grad_norm": 3.0292141437530518, + "learning_rate": 4.4411352782671835e-05, + "loss": 3.793, + "step": 36491 + }, + { + "epoch": 0.2170282614901513, + "grad_norm": 2.8700766563415527, + "learning_rate": 4.441105842602583e-05, + "loss": 3.8592, + "step": 36492 + }, + { + "epoch": 0.2170342087734323, + "grad_norm": 2.866060972213745, + "learning_rate": 4.4410764062603616e-05, + "loss": 4.7633, + "step": 36493 + }, + { + "epoch": 0.21704015605671328, + "grad_norm": 1.6045869588851929, + "learning_rate": 4.4410469692405314e-05, + "loss": 4.7084, + "step": 36494 + }, + { + "epoch": 0.2170461033399943, + "grad_norm": 1.9329087734222412, + "learning_rate": 4.441017531543101e-05, + "loss": 4.041, + "step": 36495 + }, + { + "epoch": 0.2170520506232753, + "grad_norm": 1.7166354656219482, + "learning_rate": 4.440988093168083e-05, + "loss": 4.5453, + "step": 36496 + }, + { + "epoch": 0.21705799790655628, + "grad_norm": 1.5979840755462646, + "learning_rate": 4.4409586541154846e-05, + "loss": 4.461, + "step": 36497 + }, + { + "epoch": 0.2170639451898373, + "grad_norm": 1.6630820035934448, + "learning_rate": 4.4409292143853184e-05, + "loss": 4.6191, + "step": 36498 + }, + { + "epoch": 0.21706989247311828, + "grad_norm": 2.3144614696502686, + "learning_rate": 4.440899773977593e-05, + "loss": 3.9898, + "step": 36499 + }, + { + "epoch": 0.21707583975639927, + "grad_norm": 1.5565422773361206, + "learning_rate": 4.4408703328923204e-05, + "loss": 4.4901, + "step": 36500 + }, + { + "epoch": 0.21708178703968029, + "grad_norm": 1.6009669303894043, + "learning_rate": 4.4408408911295096e-05, + "loss": 4.2009, + "step": 36501 + }, + { + "epoch": 0.21708773432296127, + "grad_norm": 1.558400273323059, + "learning_rate": 4.4408114486891713e-05, + "loss": 4.5287, + "step": 36502 + }, + { + "epoch": 0.21709368160624226, + "grad_norm": 1.758833885192871, + "learning_rate": 4.440782005571316e-05, + "loss": 4.5359, + "step": 36503 + }, + { + "epoch": 0.21709962888952328, + "grad_norm": 1.6637458801269531, + "learning_rate": 4.440752561775953e-05, + "loss": 4.4195, + "step": 36504 + }, + { + "epoch": 0.21710557617280427, + "grad_norm": 1.729332685470581, + "learning_rate": 4.440723117303094e-05, + "loss": 4.8236, + "step": 36505 + }, + { + "epoch": 0.21711152345608525, + "grad_norm": 1.6611237525939941, + "learning_rate": 4.440693672152749e-05, + "loss": 4.7444, + "step": 36506 + }, + { + "epoch": 0.21711747073936627, + "grad_norm": 1.6037817001342773, + "learning_rate": 4.440664226324927e-05, + "loss": 4.9678, + "step": 36507 + }, + { + "epoch": 0.21712341802264726, + "grad_norm": 1.5919970273971558, + "learning_rate": 4.44063477981964e-05, + "loss": 4.618, + "step": 36508 + }, + { + "epoch": 0.21712936530592825, + "grad_norm": 1.7502778768539429, + "learning_rate": 4.440605332636897e-05, + "loss": 4.6642, + "step": 36509 + }, + { + "epoch": 0.21713531258920926, + "grad_norm": 2.1507225036621094, + "learning_rate": 4.440575884776709e-05, + "loss": 4.5249, + "step": 36510 + }, + { + "epoch": 0.21714125987249025, + "grad_norm": 1.4958648681640625, + "learning_rate": 4.440546436239085e-05, + "loss": 4.6402, + "step": 36511 + }, + { + "epoch": 0.21714720715577124, + "grad_norm": 2.740826368331909, + "learning_rate": 4.440516987024037e-05, + "loss": 2.2294, + "step": 36512 + }, + { + "epoch": 0.21715315443905225, + "grad_norm": 3.044275999069214, + "learning_rate": 4.440487537131575e-05, + "loss": 1.9464, + "step": 36513 + }, + { + "epoch": 0.21715910172233324, + "grad_norm": 2.7739100456237793, + "learning_rate": 4.4404580865617084e-05, + "loss": 1.6097, + "step": 36514 + }, + { + "epoch": 0.21716504900561423, + "grad_norm": 2.5852739810943604, + "learning_rate": 4.4404286353144474e-05, + "loss": 1.306, + "step": 36515 + }, + { + "epoch": 0.21717099628889525, + "grad_norm": 2.5203051567077637, + "learning_rate": 4.440399183389804e-05, + "loss": 1.4329, + "step": 36516 + }, + { + "epoch": 0.21717694357217623, + "grad_norm": 2.085726022720337, + "learning_rate": 4.440369730787787e-05, + "loss": 4.2591, + "step": 36517 + }, + { + "epoch": 0.21718289085545722, + "grad_norm": 1.939746379852295, + "learning_rate": 4.4403402775084066e-05, + "loss": 4.5084, + "step": 36518 + }, + { + "epoch": 0.2171888381387382, + "grad_norm": 1.800667405128479, + "learning_rate": 4.4403108235516735e-05, + "loss": 4.5063, + "step": 36519 + }, + { + "epoch": 0.21719478542201923, + "grad_norm": 2.342440128326416, + "learning_rate": 4.440281368917598e-05, + "loss": 3.8184, + "step": 36520 + }, + { + "epoch": 0.21720073270530021, + "grad_norm": 2.439656972885132, + "learning_rate": 4.4402519136061897e-05, + "loss": 2.9155, + "step": 36521 + }, + { + "epoch": 0.2172066799885812, + "grad_norm": 2.338834047317505, + "learning_rate": 4.44022245761746e-05, + "loss": 2.8786, + "step": 36522 + }, + { + "epoch": 0.21721262727186222, + "grad_norm": 3.4408295154571533, + "learning_rate": 4.4401930009514194e-05, + "loss": 2.6139, + "step": 36523 + }, + { + "epoch": 0.2172185745551432, + "grad_norm": 1.776883602142334, + "learning_rate": 4.440163543608077e-05, + "loss": 4.7785, + "step": 36524 + }, + { + "epoch": 0.2172245218384242, + "grad_norm": 1.5086807012557983, + "learning_rate": 4.440134085587443e-05, + "loss": 4.5959, + "step": 36525 + }, + { + "epoch": 0.2172304691217052, + "grad_norm": 2.188570499420166, + "learning_rate": 4.440104626889529e-05, + "loss": 4.1522, + "step": 36526 + }, + { + "epoch": 0.2172364164049862, + "grad_norm": 2.371351718902588, + "learning_rate": 4.4400751675143436e-05, + "loss": 5.1309, + "step": 36527 + }, + { + "epoch": 0.2172423636882672, + "grad_norm": 2.145080804824829, + "learning_rate": 4.4400457074618987e-05, + "loss": 3.662, + "step": 36528 + }, + { + "epoch": 0.2172483109715482, + "grad_norm": 1.631962776184082, + "learning_rate": 4.4400162467322034e-05, + "loss": 4.1333, + "step": 36529 + }, + { + "epoch": 0.2172542582548292, + "grad_norm": 2.0072202682495117, + "learning_rate": 4.439986785325269e-05, + "loss": 4.9659, + "step": 36530 + }, + { + "epoch": 0.21726020553811018, + "grad_norm": 1.7635982036590576, + "learning_rate": 4.439957323241105e-05, + "loss": 4.8684, + "step": 36531 + }, + { + "epoch": 0.2172661528213912, + "grad_norm": 1.6722582578659058, + "learning_rate": 4.4399278604797225e-05, + "loss": 4.7648, + "step": 36532 + }, + { + "epoch": 0.21727210010467218, + "grad_norm": 1.4029744863510132, + "learning_rate": 4.43989839704113e-05, + "loss": 4.8315, + "step": 36533 + }, + { + "epoch": 0.21727804738795317, + "grad_norm": 1.951603651046753, + "learning_rate": 4.43986893292534e-05, + "loss": 5.0957, + "step": 36534 + }, + { + "epoch": 0.2172839946712342, + "grad_norm": 1.844542145729065, + "learning_rate": 4.439839468132362e-05, + "loss": 4.767, + "step": 36535 + }, + { + "epoch": 0.21728994195451518, + "grad_norm": 1.7336167097091675, + "learning_rate": 4.4398100026622053e-05, + "loss": 4.3439, + "step": 36536 + }, + { + "epoch": 0.21729588923779616, + "grad_norm": 2.220607280731201, + "learning_rate": 4.439780536514881e-05, + "loss": 4.6749, + "step": 36537 + }, + { + "epoch": 0.21730183652107718, + "grad_norm": 1.6565178632736206, + "learning_rate": 4.4397510696904e-05, + "loss": 4.5169, + "step": 36538 + }, + { + "epoch": 0.21730778380435817, + "grad_norm": 1.898890495300293, + "learning_rate": 4.439721602188771e-05, + "loss": 4.5224, + "step": 36539 + }, + { + "epoch": 0.21731373108763916, + "grad_norm": 1.4901927709579468, + "learning_rate": 4.439692134010006e-05, + "loss": 4.4407, + "step": 36540 + }, + { + "epoch": 0.21731967837092017, + "grad_norm": 1.7129950523376465, + "learning_rate": 4.4396626651541144e-05, + "loss": 4.7336, + "step": 36541 + }, + { + "epoch": 0.21732562565420116, + "grad_norm": 1.7661831378936768, + "learning_rate": 4.439633195621107e-05, + "loss": 4.7136, + "step": 36542 + }, + { + "epoch": 0.21733157293748215, + "grad_norm": 1.171346664428711, + "learning_rate": 4.4396037254109926e-05, + "loss": 5.3083, + "step": 36543 + }, + { + "epoch": 0.21733752022076316, + "grad_norm": 1.7209404706954956, + "learning_rate": 4.439574254523783e-05, + "loss": 5.1993, + "step": 36544 + }, + { + "epoch": 0.21734346750404415, + "grad_norm": 1.5561963319778442, + "learning_rate": 4.4395447829594884e-05, + "loss": 4.9057, + "step": 36545 + }, + { + "epoch": 0.21734941478732514, + "grad_norm": 2.2409908771514893, + "learning_rate": 4.439515310718119e-05, + "loss": 3.9832, + "step": 36546 + }, + { + "epoch": 0.21735536207060616, + "grad_norm": 1.7077785730361938, + "learning_rate": 4.4394858377996844e-05, + "loss": 4.7167, + "step": 36547 + }, + { + "epoch": 0.21736130935388714, + "grad_norm": 2.373032569885254, + "learning_rate": 4.439456364204195e-05, + "loss": 3.1113, + "step": 36548 + }, + { + "epoch": 0.21736725663716813, + "grad_norm": 2.4456026554107666, + "learning_rate": 4.439426889931662e-05, + "loss": 2.9454, + "step": 36549 + }, + { + "epoch": 0.21737320392044915, + "grad_norm": 2.256770133972168, + "learning_rate": 4.439397414982095e-05, + "loss": 2.8073, + "step": 36550 + }, + { + "epoch": 0.21737915120373014, + "grad_norm": 2.642658233642578, + "learning_rate": 4.4393679393555045e-05, + "loss": 2.8812, + "step": 36551 + }, + { + "epoch": 0.21738509848701112, + "grad_norm": 2.680724859237671, + "learning_rate": 4.439338463051901e-05, + "loss": 3.0972, + "step": 36552 + }, + { + "epoch": 0.21739104577029214, + "grad_norm": 1.975277304649353, + "learning_rate": 4.439308986071293e-05, + "loss": 3.9978, + "step": 36553 + }, + { + "epoch": 0.21739699305357313, + "grad_norm": 1.7531930208206177, + "learning_rate": 4.4392795084136934e-05, + "loss": 4.764, + "step": 36554 + }, + { + "epoch": 0.21740294033685412, + "grad_norm": 1.7737468481063843, + "learning_rate": 4.4392500300791116e-05, + "loss": 4.5489, + "step": 36555 + }, + { + "epoch": 0.21740888762013513, + "grad_norm": 1.6888933181762695, + "learning_rate": 4.439220551067557e-05, + "loss": 4.4714, + "step": 36556 + }, + { + "epoch": 0.21741483490341612, + "grad_norm": 2.4339828491210938, + "learning_rate": 4.439191071379041e-05, + "loss": 3.4193, + "step": 36557 + }, + { + "epoch": 0.2174207821866971, + "grad_norm": 2.4790961742401123, + "learning_rate": 4.4391615910135734e-05, + "loss": 3.3395, + "step": 36558 + }, + { + "epoch": 0.21742672946997812, + "grad_norm": 4.457415580749512, + "learning_rate": 4.439132109971164e-05, + "loss": 3.7631, + "step": 36559 + }, + { + "epoch": 0.2174326767532591, + "grad_norm": 4.010805130004883, + "learning_rate": 4.4391026282518245e-05, + "loss": 4.032, + "step": 36560 + }, + { + "epoch": 0.2174386240365401, + "grad_norm": 1.6782433986663818, + "learning_rate": 4.439073145855563e-05, + "loss": 3.9182, + "step": 36561 + }, + { + "epoch": 0.21744457131982112, + "grad_norm": 1.3384885787963867, + "learning_rate": 4.4390436627823924e-05, + "loss": 4.0683, + "step": 36562 + }, + { + "epoch": 0.2174505186031021, + "grad_norm": 1.5583269596099854, + "learning_rate": 4.439014179032321e-05, + "loss": 3.9236, + "step": 36563 + }, + { + "epoch": 0.2174564658863831, + "grad_norm": 1.5254034996032715, + "learning_rate": 4.4389846946053605e-05, + "loss": 3.8808, + "step": 36564 + }, + { + "epoch": 0.2174624131696641, + "grad_norm": 1.4916062355041504, + "learning_rate": 4.43895520950152e-05, + "loss": 4.0083, + "step": 36565 + }, + { + "epoch": 0.2174683604529451, + "grad_norm": 1.4041520357131958, + "learning_rate": 4.4389257237208095e-05, + "loss": 3.9174, + "step": 36566 + }, + { + "epoch": 0.21747430773622609, + "grad_norm": 1.4670138359069824, + "learning_rate": 4.438896237263241e-05, + "loss": 3.8039, + "step": 36567 + }, + { + "epoch": 0.2174802550195071, + "grad_norm": 1.4441214799880981, + "learning_rate": 4.4388667501288234e-05, + "loss": 3.8671, + "step": 36568 + }, + { + "epoch": 0.2174862023027881, + "grad_norm": 1.3265955448150635, + "learning_rate": 4.438837262317568e-05, + "loss": 3.8891, + "step": 36569 + }, + { + "epoch": 0.21749214958606908, + "grad_norm": 1.4868719577789307, + "learning_rate": 4.438807773829484e-05, + "loss": 3.8434, + "step": 36570 + }, + { + "epoch": 0.2174980968693501, + "grad_norm": 1.3975825309753418, + "learning_rate": 4.4387782846645826e-05, + "loss": 3.8614, + "step": 36571 + }, + { + "epoch": 0.21750404415263108, + "grad_norm": 1.457074522972107, + "learning_rate": 4.438748794822873e-05, + "loss": 4.0471, + "step": 36572 + }, + { + "epoch": 0.21750999143591207, + "grad_norm": 1.4638450145721436, + "learning_rate": 4.4387193043043675e-05, + "loss": 4.049, + "step": 36573 + }, + { + "epoch": 0.21751593871919309, + "grad_norm": 1.5106879472732544, + "learning_rate": 4.4386898131090744e-05, + "loss": 4.0879, + "step": 36574 + }, + { + "epoch": 0.21752188600247407, + "grad_norm": 1.399688720703125, + "learning_rate": 4.438660321237004e-05, + "loss": 3.9919, + "step": 36575 + }, + { + "epoch": 0.21752783328575506, + "grad_norm": 1.3647470474243164, + "learning_rate": 4.4386308286881685e-05, + "loss": 4.1388, + "step": 36576 + }, + { + "epoch": 0.21753378056903608, + "grad_norm": 1.491940975189209, + "learning_rate": 4.438601335462577e-05, + "loss": 3.9717, + "step": 36577 + }, + { + "epoch": 0.21753972785231707, + "grad_norm": 1.417490839958191, + "learning_rate": 4.438571841560239e-05, + "loss": 4.0151, + "step": 36578 + }, + { + "epoch": 0.21754567513559805, + "grad_norm": 1.3765039443969727, + "learning_rate": 4.438542346981166e-05, + "loss": 4.2077, + "step": 36579 + }, + { + "epoch": 0.21755162241887904, + "grad_norm": 3.1067752838134766, + "learning_rate": 4.438512851725368e-05, + "loss": 4.5656, + "step": 36580 + }, + { + "epoch": 0.21755756970216006, + "grad_norm": 1.3396860361099243, + "learning_rate": 4.4384833557928553e-05, + "loss": 3.9936, + "step": 36581 + }, + { + "epoch": 0.21756351698544105, + "grad_norm": 1.3664778470993042, + "learning_rate": 4.438453859183637e-05, + "loss": 3.9645, + "step": 36582 + }, + { + "epoch": 0.21756946426872203, + "grad_norm": 1.8627538681030273, + "learning_rate": 4.438424361897725e-05, + "loss": 4.3642, + "step": 36583 + }, + { + "epoch": 0.21757541155200305, + "grad_norm": 1.9578425884246826, + "learning_rate": 4.43839486393513e-05, + "loss": 3.8721, + "step": 36584 + }, + { + "epoch": 0.21758135883528404, + "grad_norm": 2.2273995876312256, + "learning_rate": 4.438365365295861e-05, + "loss": 3.5968, + "step": 36585 + }, + { + "epoch": 0.21758730611856503, + "grad_norm": 1.7115248441696167, + "learning_rate": 4.438335865979928e-05, + "loss": 5.0829, + "step": 36586 + }, + { + "epoch": 0.21759325340184604, + "grad_norm": 1.6734553575515747, + "learning_rate": 4.4383063659873426e-05, + "loss": 5.0234, + "step": 36587 + }, + { + "epoch": 0.21759920068512703, + "grad_norm": 1.8772227764129639, + "learning_rate": 4.438276865318114e-05, + "loss": 5.0525, + "step": 36588 + }, + { + "epoch": 0.21760514796840802, + "grad_norm": 2.0212247371673584, + "learning_rate": 4.438247363972253e-05, + "loss": 3.7799, + "step": 36589 + }, + { + "epoch": 0.21761109525168904, + "grad_norm": 2.225306987762451, + "learning_rate": 4.43821786194977e-05, + "loss": 3.6704, + "step": 36590 + }, + { + "epoch": 0.21761704253497002, + "grad_norm": 2.1250686645507812, + "learning_rate": 4.438188359250676e-05, + "loss": 3.5848, + "step": 36591 + }, + { + "epoch": 0.217622989818251, + "grad_norm": 2.0238659381866455, + "learning_rate": 4.43815885587498e-05, + "loss": 3.5389, + "step": 36592 + }, + { + "epoch": 0.21762893710153203, + "grad_norm": 2.118901014328003, + "learning_rate": 4.438129351822692e-05, + "loss": 3.6547, + "step": 36593 + }, + { + "epoch": 0.21763488438481302, + "grad_norm": 1.7090996503829956, + "learning_rate": 4.438099847093824e-05, + "loss": 4.2832, + "step": 36594 + }, + { + "epoch": 0.217640831668094, + "grad_norm": 2.008913993835449, + "learning_rate": 4.438070341688385e-05, + "loss": 3.5492, + "step": 36595 + }, + { + "epoch": 0.21764677895137502, + "grad_norm": 1.9323056936264038, + "learning_rate": 4.438040835606385e-05, + "loss": 3.769, + "step": 36596 + }, + { + "epoch": 0.217652726234656, + "grad_norm": 2.1152114868164062, + "learning_rate": 4.438011328847836e-05, + "loss": 3.5528, + "step": 36597 + }, + { + "epoch": 0.217658673517937, + "grad_norm": 2.274790048599243, + "learning_rate": 4.437981821412746e-05, + "loss": 3.5154, + "step": 36598 + }, + { + "epoch": 0.217664620801218, + "grad_norm": 1.9242825508117676, + "learning_rate": 4.437952313301128e-05, + "loss": 3.5931, + "step": 36599 + }, + { + "epoch": 0.217670568084499, + "grad_norm": 1.7859545946121216, + "learning_rate": 4.437922804512991e-05, + "loss": 4.3912, + "step": 36600 + }, + { + "epoch": 0.21767651536778, + "grad_norm": 2.2939436435699463, + "learning_rate": 4.437893295048344e-05, + "loss": 3.3681, + "step": 36601 + }, + { + "epoch": 0.217682462651061, + "grad_norm": 2.1420838832855225, + "learning_rate": 4.437863784907199e-05, + "loss": 3.576, + "step": 36602 + }, + { + "epoch": 0.217688409934342, + "grad_norm": 2.0002729892730713, + "learning_rate": 4.4378342740895656e-05, + "loss": 3.4117, + "step": 36603 + }, + { + "epoch": 0.21769435721762298, + "grad_norm": 2.1192610263824463, + "learning_rate": 4.4378047625954546e-05, + "loss": 3.3726, + "step": 36604 + }, + { + "epoch": 0.217700304500904, + "grad_norm": 2.092013359069824, + "learning_rate": 4.437775250424876e-05, + "loss": 3.4378, + "step": 36605 + }, + { + "epoch": 0.21770625178418498, + "grad_norm": 1.9066373109817505, + "learning_rate": 4.43774573757784e-05, + "loss": 4.4583, + "step": 36606 + }, + { + "epoch": 0.21771219906746597, + "grad_norm": 1.9746664762496948, + "learning_rate": 4.437716224054357e-05, + "loss": 4.5363, + "step": 36607 + }, + { + "epoch": 0.217718146350747, + "grad_norm": 2.223649263381958, + "learning_rate": 4.4376867098544364e-05, + "loss": 4.7908, + "step": 36608 + }, + { + "epoch": 0.21772409363402798, + "grad_norm": 2.136047840118408, + "learning_rate": 4.43765719497809e-05, + "loss": 4.792, + "step": 36609 + }, + { + "epoch": 0.21773004091730896, + "grad_norm": 1.7172813415527344, + "learning_rate": 4.437627679425328e-05, + "loss": 4.7565, + "step": 36610 + }, + { + "epoch": 0.21773598820058998, + "grad_norm": 2.0952224731445312, + "learning_rate": 4.43759816319616e-05, + "loss": 5.0642, + "step": 36611 + }, + { + "epoch": 0.21774193548387097, + "grad_norm": 1.7799891233444214, + "learning_rate": 4.437568646290596e-05, + "loss": 5.0625, + "step": 36612 + }, + { + "epoch": 0.21774788276715196, + "grad_norm": 1.6467608213424683, + "learning_rate": 4.437539128708647e-05, + "loss": 5.0044, + "step": 36613 + }, + { + "epoch": 0.21775383005043297, + "grad_norm": 1.7772294282913208, + "learning_rate": 4.4375096104503236e-05, + "loss": 4.9322, + "step": 36614 + }, + { + "epoch": 0.21775977733371396, + "grad_norm": 1.634451985359192, + "learning_rate": 4.437480091515635e-05, + "loss": 4.7343, + "step": 36615 + }, + { + "epoch": 0.21776572461699495, + "grad_norm": 1.9060132503509521, + "learning_rate": 4.4374505719045924e-05, + "loss": 5.1219, + "step": 36616 + }, + { + "epoch": 0.21777167190027596, + "grad_norm": 1.6871609687805176, + "learning_rate": 4.437421051617205e-05, + "loss": 4.9647, + "step": 36617 + }, + { + "epoch": 0.21777761918355695, + "grad_norm": 1.503361463546753, + "learning_rate": 4.4373915306534854e-05, + "loss": 5.1347, + "step": 36618 + }, + { + "epoch": 0.21778356646683794, + "grad_norm": 1.5180853605270386, + "learning_rate": 4.4373620090134415e-05, + "loss": 5.1036, + "step": 36619 + }, + { + "epoch": 0.21778951375011896, + "grad_norm": 1.4979952573776245, + "learning_rate": 4.437332486697085e-05, + "loss": 4.7413, + "step": 36620 + }, + { + "epoch": 0.21779546103339995, + "grad_norm": 1.615602731704712, + "learning_rate": 4.437302963704425e-05, + "loss": 5.0448, + "step": 36621 + }, + { + "epoch": 0.21780140831668093, + "grad_norm": 1.4078423976898193, + "learning_rate": 4.437273440035473e-05, + "loss": 5.3924, + "step": 36622 + }, + { + "epoch": 0.21780735559996195, + "grad_norm": 1.4002037048339844, + "learning_rate": 4.437243915690239e-05, + "loss": 5.1455, + "step": 36623 + }, + { + "epoch": 0.21781330288324294, + "grad_norm": 1.3948068618774414, + "learning_rate": 4.4372143906687336e-05, + "loss": 5.186, + "step": 36624 + }, + { + "epoch": 0.21781925016652393, + "grad_norm": 1.7832770347595215, + "learning_rate": 4.4371848649709655e-05, + "loss": 5.2705, + "step": 36625 + }, + { + "epoch": 0.21782519744980494, + "grad_norm": 1.6136474609375, + "learning_rate": 4.437155338596948e-05, + "loss": 5.3217, + "step": 36626 + }, + { + "epoch": 0.21783114473308593, + "grad_norm": 1.701969027519226, + "learning_rate": 4.437125811546687e-05, + "loss": 5.3073, + "step": 36627 + }, + { + "epoch": 0.21783709201636692, + "grad_norm": 1.6547741889953613, + "learning_rate": 4.437096283820198e-05, + "loss": 5.3173, + "step": 36628 + }, + { + "epoch": 0.21784303929964793, + "grad_norm": 1.7442517280578613, + "learning_rate": 4.437066755417487e-05, + "loss": 5.6794, + "step": 36629 + }, + { + "epoch": 0.21784898658292892, + "grad_norm": 1.4823875427246094, + "learning_rate": 4.437037226338566e-05, + "loss": 5.6783, + "step": 36630 + }, + { + "epoch": 0.2178549338662099, + "grad_norm": 1.6044561862945557, + "learning_rate": 4.437007696583446e-05, + "loss": 4.1659, + "step": 36631 + }, + { + "epoch": 0.21786088114949093, + "grad_norm": 1.591406226158142, + "learning_rate": 4.436978166152137e-05, + "loss": 5.0407, + "step": 36632 + }, + { + "epoch": 0.21786682843277191, + "grad_norm": 1.8867915868759155, + "learning_rate": 4.436948635044648e-05, + "loss": 5.2259, + "step": 36633 + }, + { + "epoch": 0.2178727757160529, + "grad_norm": 1.458233118057251, + "learning_rate": 4.4369191032609905e-05, + "loss": 5.1809, + "step": 36634 + }, + { + "epoch": 0.21787872299933392, + "grad_norm": 1.5315808057785034, + "learning_rate": 4.436889570801175e-05, + "loss": 5.5395, + "step": 36635 + }, + { + "epoch": 0.2178846702826149, + "grad_norm": 1.4725675582885742, + "learning_rate": 4.436860037665211e-05, + "loss": 4.6842, + "step": 36636 + }, + { + "epoch": 0.2178906175658959, + "grad_norm": 1.55316162109375, + "learning_rate": 4.4368305038531095e-05, + "loss": 4.8684, + "step": 36637 + }, + { + "epoch": 0.21789656484917688, + "grad_norm": 1.693979024887085, + "learning_rate": 4.43680096936488e-05, + "loss": 5.1906, + "step": 36638 + }, + { + "epoch": 0.2179025121324579, + "grad_norm": 1.5667452812194824, + "learning_rate": 4.436771434200534e-05, + "loss": 4.9815, + "step": 36639 + }, + { + "epoch": 0.2179084594157389, + "grad_norm": 1.8694840669631958, + "learning_rate": 4.436741898360081e-05, + "loss": 4.6853, + "step": 36640 + }, + { + "epoch": 0.21791440669901987, + "grad_norm": 2.4778950214385986, + "learning_rate": 4.43671236184353e-05, + "loss": 5.0781, + "step": 36641 + }, + { + "epoch": 0.2179203539823009, + "grad_norm": 2.2444334030151367, + "learning_rate": 4.436682824650894e-05, + "loss": 4.4174, + "step": 36642 + }, + { + "epoch": 0.21792630126558188, + "grad_norm": 2.237233877182007, + "learning_rate": 4.4366532867821816e-05, + "loss": 5.1541, + "step": 36643 + }, + { + "epoch": 0.21793224854886287, + "grad_norm": 1.7043414115905762, + "learning_rate": 4.436623748237404e-05, + "loss": 5.0169, + "step": 36644 + }, + { + "epoch": 0.21793819583214388, + "grad_norm": 2.007829427719116, + "learning_rate": 4.4365942090165705e-05, + "loss": 4.0678, + "step": 36645 + }, + { + "epoch": 0.21794414311542487, + "grad_norm": 3.07391357421875, + "learning_rate": 4.4365646691196923e-05, + "loss": 2.7182, + "step": 36646 + }, + { + "epoch": 0.21795009039870586, + "grad_norm": 1.5059764385223389, + "learning_rate": 4.4365351285467796e-05, + "loss": 5.1282, + "step": 36647 + }, + { + "epoch": 0.21795603768198687, + "grad_norm": 2.457430839538574, + "learning_rate": 4.4365055872978424e-05, + "loss": 3.8752, + "step": 36648 + }, + { + "epoch": 0.21796198496526786, + "grad_norm": 2.5790088176727295, + "learning_rate": 4.43647604537289e-05, + "loss": 3.3587, + "step": 36649 + }, + { + "epoch": 0.21796793224854885, + "grad_norm": 2.324841260910034, + "learning_rate": 4.4364465027719346e-05, + "loss": 3.1681, + "step": 36650 + }, + { + "epoch": 0.21797387953182987, + "grad_norm": 2.64815092086792, + "learning_rate": 4.436416959494987e-05, + "loss": 2.97, + "step": 36651 + }, + { + "epoch": 0.21797982681511086, + "grad_norm": 1.9580894708633423, + "learning_rate": 4.436387415542055e-05, + "loss": 3.4777, + "step": 36652 + }, + { + "epoch": 0.21798577409839184, + "grad_norm": 2.4816932678222656, + "learning_rate": 4.43635787091315e-05, + "loss": 4.0422, + "step": 36653 + }, + { + "epoch": 0.21799172138167286, + "grad_norm": 2.4435534477233887, + "learning_rate": 4.436328325608283e-05, + "loss": 4.4932, + "step": 36654 + }, + { + "epoch": 0.21799766866495385, + "grad_norm": 1.6799771785736084, + "learning_rate": 4.436298779627463e-05, + "loss": 5.1482, + "step": 36655 + }, + { + "epoch": 0.21800361594823484, + "grad_norm": 1.5511815547943115, + "learning_rate": 4.436269232970702e-05, + "loss": 5.0648, + "step": 36656 + }, + { + "epoch": 0.21800956323151585, + "grad_norm": 1.6648553609848022, + "learning_rate": 4.436239685638008e-05, + "loss": 4.9462, + "step": 36657 + }, + { + "epoch": 0.21801551051479684, + "grad_norm": 1.543513298034668, + "learning_rate": 4.436210137629394e-05, + "loss": 4.8705, + "step": 36658 + }, + { + "epoch": 0.21802145779807783, + "grad_norm": 1.799896478652954, + "learning_rate": 4.436180588944869e-05, + "loss": 4.5315, + "step": 36659 + }, + { + "epoch": 0.21802740508135884, + "grad_norm": 1.4603382349014282, + "learning_rate": 4.4361510395844435e-05, + "loss": 5.2586, + "step": 36660 + }, + { + "epoch": 0.21803335236463983, + "grad_norm": 1.6055859327316284, + "learning_rate": 4.4361214895481277e-05, + "loss": 5.2098, + "step": 36661 + }, + { + "epoch": 0.21803929964792082, + "grad_norm": 1.6412562131881714, + "learning_rate": 4.436091938835931e-05, + "loss": 4.6264, + "step": 36662 + }, + { + "epoch": 0.21804524693120184, + "grad_norm": 1.660805106163025, + "learning_rate": 4.436062387447866e-05, + "loss": 4.0578, + "step": 36663 + }, + { + "epoch": 0.21805119421448282, + "grad_norm": 2.2433223724365234, + "learning_rate": 4.43603283538394e-05, + "loss": 4.6199, + "step": 36664 + }, + { + "epoch": 0.2180571414977638, + "grad_norm": 1.8339132070541382, + "learning_rate": 4.4360032826441654e-05, + "loss": 4.2583, + "step": 36665 + }, + { + "epoch": 0.21806308878104483, + "grad_norm": 1.7962074279785156, + "learning_rate": 4.435973729228553e-05, + "loss": 4.4523, + "step": 36666 + }, + { + "epoch": 0.21806903606432582, + "grad_norm": 1.5702697038650513, + "learning_rate": 4.435944175137111e-05, + "loss": 4.7028, + "step": 36667 + }, + { + "epoch": 0.2180749833476068, + "grad_norm": 1.853299617767334, + "learning_rate": 4.435914620369852e-05, + "loss": 4.4249, + "step": 36668 + }, + { + "epoch": 0.21808093063088782, + "grad_norm": 1.8155862092971802, + "learning_rate": 4.435885064926785e-05, + "loss": 5.0144, + "step": 36669 + }, + { + "epoch": 0.2180868779141688, + "grad_norm": 1.5878140926361084, + "learning_rate": 4.435855508807919e-05, + "loss": 4.848, + "step": 36670 + }, + { + "epoch": 0.2180928251974498, + "grad_norm": 1.7106904983520508, + "learning_rate": 4.435825952013267e-05, + "loss": 4.9348, + "step": 36671 + }, + { + "epoch": 0.2180987724807308, + "grad_norm": 1.9532265663146973, + "learning_rate": 4.435796394542839e-05, + "loss": 5.4063, + "step": 36672 + }, + { + "epoch": 0.2181047197640118, + "grad_norm": 1.745326042175293, + "learning_rate": 4.435766836396643e-05, + "loss": 5.1079, + "step": 36673 + }, + { + "epoch": 0.2181106670472928, + "grad_norm": 1.9781886339187622, + "learning_rate": 4.435737277574692e-05, + "loss": 4.7416, + "step": 36674 + }, + { + "epoch": 0.2181166143305738, + "grad_norm": 1.6559377908706665, + "learning_rate": 4.435707718076994e-05, + "loss": 4.7594, + "step": 36675 + }, + { + "epoch": 0.2181225616138548, + "grad_norm": 1.8455668687820435, + "learning_rate": 4.435678157903561e-05, + "loss": 5.3747, + "step": 36676 + }, + { + "epoch": 0.21812850889713578, + "grad_norm": 1.821624994277954, + "learning_rate": 4.4356485970544026e-05, + "loss": 5.559, + "step": 36677 + }, + { + "epoch": 0.2181344561804168, + "grad_norm": 1.5654447078704834, + "learning_rate": 4.4356190355295295e-05, + "loss": 4.9508, + "step": 36678 + }, + { + "epoch": 0.21814040346369779, + "grad_norm": 1.6591967344284058, + "learning_rate": 4.435589473328952e-05, + "loss": 4.6972, + "step": 36679 + }, + { + "epoch": 0.21814635074697877, + "grad_norm": 1.548817753791809, + "learning_rate": 4.43555991045268e-05, + "loss": 4.7788, + "step": 36680 + }, + { + "epoch": 0.2181522980302598, + "grad_norm": 1.8624293804168701, + "learning_rate": 4.435530346900724e-05, + "loss": 5.0808, + "step": 36681 + }, + { + "epoch": 0.21815824531354078, + "grad_norm": 1.5332111120224, + "learning_rate": 4.435500782673094e-05, + "loss": 4.9267, + "step": 36682 + }, + { + "epoch": 0.21816419259682177, + "grad_norm": 1.3629741668701172, + "learning_rate": 4.435471217769801e-05, + "loss": 4.7387, + "step": 36683 + }, + { + "epoch": 0.21817013988010278, + "grad_norm": 1.6867749691009521, + "learning_rate": 4.4354416521908546e-05, + "loss": 4.9142, + "step": 36684 + }, + { + "epoch": 0.21817608716338377, + "grad_norm": 1.4487711191177368, + "learning_rate": 4.435412085936266e-05, + "loss": 5.7823, + "step": 36685 + }, + { + "epoch": 0.21818203444666476, + "grad_norm": 1.579579472541809, + "learning_rate": 4.435382519006045e-05, + "loss": 5.6165, + "step": 36686 + }, + { + "epoch": 0.21818798172994577, + "grad_norm": 1.6927639245986938, + "learning_rate": 4.435352951400202e-05, + "loss": 4.7974, + "step": 36687 + }, + { + "epoch": 0.21819392901322676, + "grad_norm": 1.4306942224502563, + "learning_rate": 4.4353233831187466e-05, + "loss": 5.0413, + "step": 36688 + }, + { + "epoch": 0.21819987629650775, + "grad_norm": 1.5357601642608643, + "learning_rate": 4.4352938141616906e-05, + "loss": 4.9543, + "step": 36689 + }, + { + "epoch": 0.21820582357978877, + "grad_norm": 1.5910861492156982, + "learning_rate": 4.435264244529044e-05, + "loss": 5.3783, + "step": 36690 + }, + { + "epoch": 0.21821177086306975, + "grad_norm": 1.5916094779968262, + "learning_rate": 4.435234674220816e-05, + "loss": 5.7495, + "step": 36691 + }, + { + "epoch": 0.21821771814635074, + "grad_norm": 1.7037458419799805, + "learning_rate": 4.4352051032370175e-05, + "loss": 4.8695, + "step": 36692 + }, + { + "epoch": 0.21822366542963176, + "grad_norm": 1.7072062492370605, + "learning_rate": 4.435175531577659e-05, + "loss": 4.7444, + "step": 36693 + }, + { + "epoch": 0.21822961271291275, + "grad_norm": 1.3686712980270386, + "learning_rate": 4.43514595924275e-05, + "loss": 4.9637, + "step": 36694 + }, + { + "epoch": 0.21823555999619373, + "grad_norm": 1.5178147554397583, + "learning_rate": 4.435116386232302e-05, + "loss": 5.0143, + "step": 36695 + }, + { + "epoch": 0.21824150727947472, + "grad_norm": 1.7472846508026123, + "learning_rate": 4.4350868125463254e-05, + "loss": 5.0185, + "step": 36696 + }, + { + "epoch": 0.21824745456275574, + "grad_norm": 1.6126089096069336, + "learning_rate": 4.43505723818483e-05, + "loss": 5.2427, + "step": 36697 + }, + { + "epoch": 0.21825340184603673, + "grad_norm": 1.8243975639343262, + "learning_rate": 4.435027663147825e-05, + "loss": 4.7768, + "step": 36698 + }, + { + "epoch": 0.21825934912931771, + "grad_norm": 1.7908824682235718, + "learning_rate": 4.434998087435323e-05, + "loss": 5.1686, + "step": 36699 + }, + { + "epoch": 0.21826529641259873, + "grad_norm": 1.4360270500183105, + "learning_rate": 4.434968511047333e-05, + "loss": 4.9374, + "step": 36700 + }, + { + "epoch": 0.21827124369587972, + "grad_norm": 2.2321109771728516, + "learning_rate": 4.434938933983864e-05, + "loss": 4.9715, + "step": 36701 + }, + { + "epoch": 0.2182771909791607, + "grad_norm": 1.8325153589248657, + "learning_rate": 4.43490935624493e-05, + "loss": 4.8856, + "step": 36702 + }, + { + "epoch": 0.21828313826244172, + "grad_norm": 1.6531904935836792, + "learning_rate": 4.434879777830538e-05, + "loss": 4.5969, + "step": 36703 + }, + { + "epoch": 0.2182890855457227, + "grad_norm": 2.921745777130127, + "learning_rate": 4.434850198740699e-05, + "loss": 4.6352, + "step": 36704 + }, + { + "epoch": 0.2182950328290037, + "grad_norm": 1.4636532068252563, + "learning_rate": 4.434820618975425e-05, + "loss": 5.6227, + "step": 36705 + }, + { + "epoch": 0.21830098011228471, + "grad_norm": 1.5883086919784546, + "learning_rate": 4.434791038534724e-05, + "loss": 5.5488, + "step": 36706 + }, + { + "epoch": 0.2183069273955657, + "grad_norm": 1.7048338651657104, + "learning_rate": 4.434761457418608e-05, + "loss": 4.8811, + "step": 36707 + }, + { + "epoch": 0.2183128746788467, + "grad_norm": 3.260709524154663, + "learning_rate": 4.4347318756270864e-05, + "loss": 3.5482, + "step": 36708 + }, + { + "epoch": 0.2183188219621277, + "grad_norm": 3.1779022216796875, + "learning_rate": 4.43470229316017e-05, + "loss": 3.4502, + "step": 36709 + }, + { + "epoch": 0.2183247692454087, + "grad_norm": 2.1265110969543457, + "learning_rate": 4.4346727100178696e-05, + "loss": 4.5825, + "step": 36710 + }, + { + "epoch": 0.21833071652868968, + "grad_norm": 1.619808316230774, + "learning_rate": 4.434643126200194e-05, + "loss": 5.3028, + "step": 36711 + }, + { + "epoch": 0.2183366638119707, + "grad_norm": 3.3339853286743164, + "learning_rate": 4.434613541707156e-05, + "loss": 3.2459, + "step": 36712 + }, + { + "epoch": 0.2183426110952517, + "grad_norm": 3.406125068664551, + "learning_rate": 4.4345839565387626e-05, + "loss": 2.936, + "step": 36713 + }, + { + "epoch": 0.21834855837853268, + "grad_norm": 2.45475697517395, + "learning_rate": 4.4345543706950274e-05, + "loss": 3.0101, + "step": 36714 + }, + { + "epoch": 0.2183545056618137, + "grad_norm": 2.70192551612854, + "learning_rate": 4.434524784175958e-05, + "loss": 3.2301, + "step": 36715 + }, + { + "epoch": 0.21836045294509468, + "grad_norm": 2.4062118530273438, + "learning_rate": 4.434495196981567e-05, + "loss": 2.8497, + "step": 36716 + }, + { + "epoch": 0.21836640022837567, + "grad_norm": 2.3117856979370117, + "learning_rate": 4.434465609111863e-05, + "loss": 2.8549, + "step": 36717 + }, + { + "epoch": 0.21837234751165668, + "grad_norm": 2.4423859119415283, + "learning_rate": 4.434436020566857e-05, + "loss": 2.849, + "step": 36718 + }, + { + "epoch": 0.21837829479493767, + "grad_norm": 2.377615213394165, + "learning_rate": 4.43440643134656e-05, + "loss": 2.6986, + "step": 36719 + }, + { + "epoch": 0.21838424207821866, + "grad_norm": 2.4111328125, + "learning_rate": 4.434376841450981e-05, + "loss": 2.7673, + "step": 36720 + }, + { + "epoch": 0.21839018936149968, + "grad_norm": 2.4199485778808594, + "learning_rate": 4.434347250880132e-05, + "loss": 3.4974, + "step": 36721 + }, + { + "epoch": 0.21839613664478066, + "grad_norm": 2.2651684284210205, + "learning_rate": 4.434317659634022e-05, + "loss": 3.073, + "step": 36722 + }, + { + "epoch": 0.21840208392806165, + "grad_norm": 2.3775131702423096, + "learning_rate": 4.4342880677126606e-05, + "loss": 3.1214, + "step": 36723 + }, + { + "epoch": 0.21840803121134267, + "grad_norm": 2.3316352367401123, + "learning_rate": 4.434258475116061e-05, + "loss": 3.1617, + "step": 36724 + }, + { + "epoch": 0.21841397849462366, + "grad_norm": 2.312774181365967, + "learning_rate": 4.43422888184423e-05, + "loss": 2.7227, + "step": 36725 + }, + { + "epoch": 0.21841992577790464, + "grad_norm": 2.354297161102295, + "learning_rate": 4.434199287897181e-05, + "loss": 3.3812, + "step": 36726 + }, + { + "epoch": 0.21842587306118566, + "grad_norm": 1.8631865978240967, + "learning_rate": 4.4341696932749224e-05, + "loss": 3.9094, + "step": 36727 + }, + { + "epoch": 0.21843182034446665, + "grad_norm": 2.2943942546844482, + "learning_rate": 4.434140097977465e-05, + "loss": 2.5553, + "step": 36728 + }, + { + "epoch": 0.21843776762774764, + "grad_norm": 2.2607760429382324, + "learning_rate": 4.43411050200482e-05, + "loss": 3.1659, + "step": 36729 + }, + { + "epoch": 0.21844371491102865, + "grad_norm": 1.8395904302597046, + "learning_rate": 4.434080905356997e-05, + "loss": 4.2747, + "step": 36730 + }, + { + "epoch": 0.21844966219430964, + "grad_norm": 2.3018381595611572, + "learning_rate": 4.4340513080340054e-05, + "loss": 3.8753, + "step": 36731 + }, + { + "epoch": 0.21845560947759063, + "grad_norm": 2.3045477867126465, + "learning_rate": 4.434021710035857e-05, + "loss": 3.9674, + "step": 36732 + }, + { + "epoch": 0.21846155676087164, + "grad_norm": 1.9292182922363281, + "learning_rate": 4.433992111362562e-05, + "loss": 4.1323, + "step": 36733 + }, + { + "epoch": 0.21846750404415263, + "grad_norm": 1.6731642484664917, + "learning_rate": 4.433962512014129e-05, + "loss": 4.743, + "step": 36734 + }, + { + "epoch": 0.21847345132743362, + "grad_norm": 2.1067628860473633, + "learning_rate": 4.433932911990571e-05, + "loss": 4.3337, + "step": 36735 + }, + { + "epoch": 0.21847939861071464, + "grad_norm": 3.1354286670684814, + "learning_rate": 4.4339033112918966e-05, + "loss": 4.1579, + "step": 36736 + }, + { + "epoch": 0.21848534589399562, + "grad_norm": 2.441835403442383, + "learning_rate": 4.433873709918116e-05, + "loss": 4.0169, + "step": 36737 + }, + { + "epoch": 0.2184912931772766, + "grad_norm": 2.3072402477264404, + "learning_rate": 4.433844107869241e-05, + "loss": 3.9989, + "step": 36738 + }, + { + "epoch": 0.21849724046055763, + "grad_norm": 2.370055913925171, + "learning_rate": 4.4338145051452804e-05, + "loss": 3.8085, + "step": 36739 + }, + { + "epoch": 0.21850318774383862, + "grad_norm": 1.6106902360916138, + "learning_rate": 4.433784901746245e-05, + "loss": 5.171, + "step": 36740 + }, + { + "epoch": 0.2185091350271196, + "grad_norm": 1.7052632570266724, + "learning_rate": 4.4337552976721466e-05, + "loss": 5.4133, + "step": 36741 + }, + { + "epoch": 0.21851508231040062, + "grad_norm": 1.4769154787063599, + "learning_rate": 4.4337256929229925e-05, + "loss": 5.3987, + "step": 36742 + }, + { + "epoch": 0.2185210295936816, + "grad_norm": 1.929015874862671, + "learning_rate": 4.433696087498795e-05, + "loss": 4.5028, + "step": 36743 + }, + { + "epoch": 0.2185269768769626, + "grad_norm": 3.284090042114258, + "learning_rate": 4.4336664813995654e-05, + "loss": 3.7967, + "step": 36744 + }, + { + "epoch": 0.2185329241602436, + "grad_norm": 1.7083535194396973, + "learning_rate": 4.433636874625312e-05, + "loss": 5.4123, + "step": 36745 + }, + { + "epoch": 0.2185388714435246, + "grad_norm": 1.9117016792297363, + "learning_rate": 4.433607267176045e-05, + "loss": 4.685, + "step": 36746 + }, + { + "epoch": 0.2185448187268056, + "grad_norm": 1.509709119796753, + "learning_rate": 4.433577659051777e-05, + "loss": 4.7325, + "step": 36747 + }, + { + "epoch": 0.2185507660100866, + "grad_norm": 1.774267554283142, + "learning_rate": 4.4335480502525174e-05, + "loss": 4.6373, + "step": 36748 + }, + { + "epoch": 0.2185567132933676, + "grad_norm": 1.41673743724823, + "learning_rate": 4.433518440778275e-05, + "loss": 4.6033, + "step": 36749 + }, + { + "epoch": 0.21856266057664858, + "grad_norm": 1.881105661392212, + "learning_rate": 4.433488830629061e-05, + "loss": 4.5465, + "step": 36750 + }, + { + "epoch": 0.2185686078599296, + "grad_norm": 2.3924362659454346, + "learning_rate": 4.433459219804887e-05, + "loss": 4.5526, + "step": 36751 + }, + { + "epoch": 0.21857455514321059, + "grad_norm": 1.4470226764678955, + "learning_rate": 4.433429608305763e-05, + "loss": 5.0721, + "step": 36752 + }, + { + "epoch": 0.21858050242649157, + "grad_norm": 1.72877836227417, + "learning_rate": 4.4333999961316974e-05, + "loss": 5.0966, + "step": 36753 + }, + { + "epoch": 0.21858644970977256, + "grad_norm": 1.8247528076171875, + "learning_rate": 4.4333703832827026e-05, + "loss": 4.9581, + "step": 36754 + }, + { + "epoch": 0.21859239699305358, + "grad_norm": 1.702294945716858, + "learning_rate": 4.433340769758787e-05, + "loss": 4.7263, + "step": 36755 + }, + { + "epoch": 0.21859834427633457, + "grad_norm": 1.863827109336853, + "learning_rate": 4.433311155559963e-05, + "loss": 4.6154, + "step": 36756 + }, + { + "epoch": 0.21860429155961555, + "grad_norm": 1.580384373664856, + "learning_rate": 4.43328154068624e-05, + "loss": 4.5494, + "step": 36757 + }, + { + "epoch": 0.21861023884289657, + "grad_norm": 1.8330835103988647, + "learning_rate": 4.433251925137628e-05, + "loss": 4.8742, + "step": 36758 + }, + { + "epoch": 0.21861618612617756, + "grad_norm": 1.6677405834197998, + "learning_rate": 4.433222308914138e-05, + "loss": 5.2166, + "step": 36759 + }, + { + "epoch": 0.21862213340945855, + "grad_norm": 2.5527493953704834, + "learning_rate": 4.433192692015781e-05, + "loss": 4.1831, + "step": 36760 + }, + { + "epoch": 0.21862808069273956, + "grad_norm": 1.9266340732574463, + "learning_rate": 4.433163074442564e-05, + "loss": 4.869, + "step": 36761 + }, + { + "epoch": 0.21863402797602055, + "grad_norm": 1.7809799909591675, + "learning_rate": 4.433133456194502e-05, + "loss": 4.6188, + "step": 36762 + }, + { + "epoch": 0.21863997525930154, + "grad_norm": 1.6126807928085327, + "learning_rate": 4.433103837271603e-05, + "loss": 4.7458, + "step": 36763 + }, + { + "epoch": 0.21864592254258255, + "grad_norm": 1.7361348867416382, + "learning_rate": 4.433074217673876e-05, + "loss": 5.255, + "step": 36764 + }, + { + "epoch": 0.21865186982586354, + "grad_norm": 2.3801043033599854, + "learning_rate": 4.433044597401333e-05, + "loss": 4.1598, + "step": 36765 + }, + { + "epoch": 0.21865781710914453, + "grad_norm": 2.6629974842071533, + "learning_rate": 4.433014976453985e-05, + "loss": 4.5282, + "step": 36766 + }, + { + "epoch": 0.21866376439242555, + "grad_norm": 2.1977686882019043, + "learning_rate": 4.432985354831841e-05, + "loss": 3.5004, + "step": 36767 + }, + { + "epoch": 0.21866971167570654, + "grad_norm": 2.285147190093994, + "learning_rate": 4.432955732534912e-05, + "loss": 3.5626, + "step": 36768 + }, + { + "epoch": 0.21867565895898752, + "grad_norm": 1.8421472311019897, + "learning_rate": 4.432926109563208e-05, + "loss": 5.2143, + "step": 36769 + }, + { + "epoch": 0.21868160624226854, + "grad_norm": 1.6054788827896118, + "learning_rate": 4.4328964859167396e-05, + "loss": 5.2803, + "step": 36770 + }, + { + "epoch": 0.21868755352554953, + "grad_norm": 2.29986834526062, + "learning_rate": 4.432866861595517e-05, + "loss": 3.5438, + "step": 36771 + }, + { + "epoch": 0.21869350080883052, + "grad_norm": 2.4712657928466797, + "learning_rate": 4.43283723659955e-05, + "loss": 2.8448, + "step": 36772 + }, + { + "epoch": 0.21869944809211153, + "grad_norm": 2.8239145278930664, + "learning_rate": 4.43280761092885e-05, + "loss": 2.1695, + "step": 36773 + }, + { + "epoch": 0.21870539537539252, + "grad_norm": 2.5487799644470215, + "learning_rate": 4.432777984583427e-05, + "loss": 2.4765, + "step": 36774 + }, + { + "epoch": 0.2187113426586735, + "grad_norm": 2.319502353668213, + "learning_rate": 4.432748357563291e-05, + "loss": 3.483, + "step": 36775 + }, + { + "epoch": 0.21871728994195452, + "grad_norm": 2.333451747894287, + "learning_rate": 4.432718729868453e-05, + "loss": 3.1763, + "step": 36776 + }, + { + "epoch": 0.2187232372252355, + "grad_norm": 2.6187281608581543, + "learning_rate": 4.4326891014989216e-05, + "loss": 1.9022, + "step": 36777 + }, + { + "epoch": 0.2187291845085165, + "grad_norm": 2.4931774139404297, + "learning_rate": 4.432659472454709e-05, + "loss": 1.7533, + "step": 36778 + }, + { + "epoch": 0.21873513179179752, + "grad_norm": 2.9427437782287598, + "learning_rate": 4.4326298427358246e-05, + "loss": 1.5369, + "step": 36779 + }, + { + "epoch": 0.2187410790750785, + "grad_norm": 2.4648377895355225, + "learning_rate": 4.43260021234228e-05, + "loss": 1.7569, + "step": 36780 + }, + { + "epoch": 0.2187470263583595, + "grad_norm": 2.9976446628570557, + "learning_rate": 4.432570581274084e-05, + "loss": 1.672, + "step": 36781 + }, + { + "epoch": 0.2187529736416405, + "grad_norm": 3.176912307739258, + "learning_rate": 4.4325409495312476e-05, + "loss": 1.8063, + "step": 36782 + }, + { + "epoch": 0.2187589209249215, + "grad_norm": 2.667128801345825, + "learning_rate": 4.432511317113781e-05, + "loss": 2.672, + "step": 36783 + }, + { + "epoch": 0.21876486820820248, + "grad_norm": 2.843445301055908, + "learning_rate": 4.432481684021695e-05, + "loss": 3.2058, + "step": 36784 + }, + { + "epoch": 0.2187708154914835, + "grad_norm": 2.820849657058716, + "learning_rate": 4.432452050255e-05, + "loss": 3.3282, + "step": 36785 + }, + { + "epoch": 0.2187767627747645, + "grad_norm": 2.657222032546997, + "learning_rate": 4.432422415813705e-05, + "loss": 3.3505, + "step": 36786 + }, + { + "epoch": 0.21878271005804548, + "grad_norm": 2.6856789588928223, + "learning_rate": 4.4323927806978214e-05, + "loss": 3.3864, + "step": 36787 + }, + { + "epoch": 0.2187886573413265, + "grad_norm": 2.786029815673828, + "learning_rate": 4.432363144907361e-05, + "loss": 3.4048, + "step": 36788 + }, + { + "epoch": 0.21879460462460748, + "grad_norm": 2.5305304527282715, + "learning_rate": 4.4323335084423305e-05, + "loss": 3.2925, + "step": 36789 + }, + { + "epoch": 0.21880055190788847, + "grad_norm": 2.462794065475464, + "learning_rate": 4.432303871302743e-05, + "loss": 3.2506, + "step": 36790 + }, + { + "epoch": 0.21880649919116948, + "grad_norm": 2.3902087211608887, + "learning_rate": 4.4322742334886094e-05, + "loss": 3.5342, + "step": 36791 + }, + { + "epoch": 0.21881244647445047, + "grad_norm": 2.16796875, + "learning_rate": 4.432244594999937e-05, + "loss": 4.4603, + "step": 36792 + }, + { + "epoch": 0.21881839375773146, + "grad_norm": 1.8874982595443726, + "learning_rate": 4.432214955836739e-05, + "loss": 4.3329, + "step": 36793 + }, + { + "epoch": 0.21882434104101248, + "grad_norm": 2.055091381072998, + "learning_rate": 4.4321853159990244e-05, + "loss": 4.5713, + "step": 36794 + }, + { + "epoch": 0.21883028832429346, + "grad_norm": 2.253117322921753, + "learning_rate": 4.432155675486804e-05, + "loss": 4.151, + "step": 36795 + }, + { + "epoch": 0.21883623560757445, + "grad_norm": 2.199066638946533, + "learning_rate": 4.432126034300088e-05, + "loss": 4.5785, + "step": 36796 + }, + { + "epoch": 0.21884218289085547, + "grad_norm": 1.666224718093872, + "learning_rate": 4.432096392438887e-05, + "loss": 4.5748, + "step": 36797 + }, + { + "epoch": 0.21884813017413646, + "grad_norm": 2.1748523712158203, + "learning_rate": 4.432066749903211e-05, + "loss": 4.0945, + "step": 36798 + }, + { + "epoch": 0.21885407745741745, + "grad_norm": 2.570986747741699, + "learning_rate": 4.43203710669307e-05, + "loss": 3.7925, + "step": 36799 + }, + { + "epoch": 0.21886002474069846, + "grad_norm": 2.303675651550293, + "learning_rate": 4.4320074628084754e-05, + "loss": 3.7916, + "step": 36800 + }, + { + "epoch": 0.21886597202397945, + "grad_norm": 2.1665382385253906, + "learning_rate": 4.431977818249436e-05, + "loss": 3.9575, + "step": 36801 + }, + { + "epoch": 0.21887191930726044, + "grad_norm": 2.1685996055603027, + "learning_rate": 4.431948173015964e-05, + "loss": 3.8452, + "step": 36802 + }, + { + "epoch": 0.21887786659054145, + "grad_norm": 2.4096124172210693, + "learning_rate": 4.431918527108069e-05, + "loss": 3.8829, + "step": 36803 + }, + { + "epoch": 0.21888381387382244, + "grad_norm": 2.1310126781463623, + "learning_rate": 4.431888880525761e-05, + "loss": 3.5221, + "step": 36804 + }, + { + "epoch": 0.21888976115710343, + "grad_norm": 2.318202018737793, + "learning_rate": 4.4318592332690504e-05, + "loss": 3.7006, + "step": 36805 + }, + { + "epoch": 0.21889570844038445, + "grad_norm": 2.4116504192352295, + "learning_rate": 4.431829585337948e-05, + "loss": 3.4329, + "step": 36806 + }, + { + "epoch": 0.21890165572366543, + "grad_norm": 2.2227671146392822, + "learning_rate": 4.4317999367324635e-05, + "loss": 3.4883, + "step": 36807 + }, + { + "epoch": 0.21890760300694642, + "grad_norm": 2.3181447982788086, + "learning_rate": 4.431770287452608e-05, + "loss": 3.4717, + "step": 36808 + }, + { + "epoch": 0.21891355029022744, + "grad_norm": 1.950046420097351, + "learning_rate": 4.4317406374983905e-05, + "loss": 4.4156, + "step": 36809 + }, + { + "epoch": 0.21891949757350843, + "grad_norm": 3.2731096744537354, + "learning_rate": 4.431710986869823e-05, + "loss": 3.3211, + "step": 36810 + }, + { + "epoch": 0.21892544485678941, + "grad_norm": 2.419877767562866, + "learning_rate": 4.431681335566915e-05, + "loss": 3.4889, + "step": 36811 + }, + { + "epoch": 0.2189313921400704, + "grad_norm": 2.390082359313965, + "learning_rate": 4.4316516835896773e-05, + "loss": 3.4228, + "step": 36812 + }, + { + "epoch": 0.21893733942335142, + "grad_norm": 2.5118305683135986, + "learning_rate": 4.43162203093812e-05, + "loss": 3.8365, + "step": 36813 + }, + { + "epoch": 0.2189432867066324, + "grad_norm": 2.055748701095581, + "learning_rate": 4.4315923776122524e-05, + "loss": 5.0624, + "step": 36814 + }, + { + "epoch": 0.2189492339899134, + "grad_norm": 2.4033682346343994, + "learning_rate": 4.431562723612087e-05, + "loss": 5.4134, + "step": 36815 + }, + { + "epoch": 0.2189551812731944, + "grad_norm": 1.550174593925476, + "learning_rate": 4.4315330689376325e-05, + "loss": 5.0781, + "step": 36816 + }, + { + "epoch": 0.2189611285564754, + "grad_norm": 1.4097830057144165, + "learning_rate": 4.4315034135889e-05, + "loss": 5.2124, + "step": 36817 + }, + { + "epoch": 0.2189670758397564, + "grad_norm": 1.503827691078186, + "learning_rate": 4.431473757565899e-05, + "loss": 5.3189, + "step": 36818 + }, + { + "epoch": 0.2189730231230374, + "grad_norm": 1.8725134134292603, + "learning_rate": 4.4314441008686414e-05, + "loss": 4.9859, + "step": 36819 + }, + { + "epoch": 0.2189789704063184, + "grad_norm": 1.5328760147094727, + "learning_rate": 4.431414443497136e-05, + "loss": 5.2791, + "step": 36820 + }, + { + "epoch": 0.21898491768959938, + "grad_norm": 1.5473660230636597, + "learning_rate": 4.431384785451395e-05, + "loss": 5.4368, + "step": 36821 + }, + { + "epoch": 0.2189908649728804, + "grad_norm": 1.6382627487182617, + "learning_rate": 4.4313551267314255e-05, + "loss": 5.4049, + "step": 36822 + }, + { + "epoch": 0.21899681225616138, + "grad_norm": 1.6156213283538818, + "learning_rate": 4.4313254673372405e-05, + "loss": 5.4171, + "step": 36823 + }, + { + "epoch": 0.21900275953944237, + "grad_norm": 1.3826895952224731, + "learning_rate": 4.4312958072688504e-05, + "loss": 5.2223, + "step": 36824 + }, + { + "epoch": 0.2190087068227234, + "grad_norm": 5.227144241333008, + "learning_rate": 4.431266146526265e-05, + "loss": 3.7088, + "step": 36825 + }, + { + "epoch": 0.21901465410600438, + "grad_norm": 3.769723415374756, + "learning_rate": 4.431236485109493e-05, + "loss": 4.6714, + "step": 36826 + }, + { + "epoch": 0.21902060138928536, + "grad_norm": 4.917707443237305, + "learning_rate": 4.431206823018548e-05, + "loss": 4.1719, + "step": 36827 + }, + { + "epoch": 0.21902654867256638, + "grad_norm": 2.7149879932403564, + "learning_rate": 4.431177160253438e-05, + "loss": 4.5361, + "step": 36828 + }, + { + "epoch": 0.21903249595584737, + "grad_norm": 4.330033779144287, + "learning_rate": 4.4311474968141745e-05, + "loss": 2.9403, + "step": 36829 + }, + { + "epoch": 0.21903844323912836, + "grad_norm": 4.110903263092041, + "learning_rate": 4.4311178327007664e-05, + "loss": 3.2446, + "step": 36830 + }, + { + "epoch": 0.21904439052240937, + "grad_norm": 3.4869606494903564, + "learning_rate": 4.431088167913225e-05, + "loss": 2.7637, + "step": 36831 + }, + { + "epoch": 0.21905033780569036, + "grad_norm": 3.579864263534546, + "learning_rate": 4.4310585024515615e-05, + "loss": 2.8165, + "step": 36832 + }, + { + "epoch": 0.21905628508897135, + "grad_norm": 1.7594797611236572, + "learning_rate": 4.431028836315786e-05, + "loss": 5.0774, + "step": 36833 + }, + { + "epoch": 0.21906223237225236, + "grad_norm": 1.7493889331817627, + "learning_rate": 4.430999169505907e-05, + "loss": 5.3393, + "step": 36834 + }, + { + "epoch": 0.21906817965553335, + "grad_norm": 1.4827722311019897, + "learning_rate": 4.430969502021937e-05, + "loss": 5.7201, + "step": 36835 + }, + { + "epoch": 0.21907412693881434, + "grad_norm": 1.6048434972763062, + "learning_rate": 4.430939833863884e-05, + "loss": 4.9929, + "step": 36836 + }, + { + "epoch": 0.21908007422209536, + "grad_norm": 3.3946571350097656, + "learning_rate": 4.430910165031761e-05, + "loss": 2.6245, + "step": 36837 + }, + { + "epoch": 0.21908602150537634, + "grad_norm": 1.9543274641036987, + "learning_rate": 4.4308804955255775e-05, + "loss": 4.876, + "step": 36838 + }, + { + "epoch": 0.21909196878865733, + "grad_norm": 1.7576123476028442, + "learning_rate": 4.4308508253453426e-05, + "loss": 4.8688, + "step": 36839 + }, + { + "epoch": 0.21909791607193835, + "grad_norm": 1.5310838222503662, + "learning_rate": 4.430821154491069e-05, + "loss": 4.8038, + "step": 36840 + }, + { + "epoch": 0.21910386335521934, + "grad_norm": 1.8425617218017578, + "learning_rate": 4.430791482962765e-05, + "loss": 4.4143, + "step": 36841 + }, + { + "epoch": 0.21910981063850032, + "grad_norm": 1.6769896745681763, + "learning_rate": 4.430761810760441e-05, + "loss": 4.7782, + "step": 36842 + }, + { + "epoch": 0.21911575792178134, + "grad_norm": 2.571417808532715, + "learning_rate": 4.430732137884109e-05, + "loss": 3.6217, + "step": 36843 + }, + { + "epoch": 0.21912170520506233, + "grad_norm": 1.8328107595443726, + "learning_rate": 4.430702464333777e-05, + "loss": 4.4206, + "step": 36844 + }, + { + "epoch": 0.21912765248834332, + "grad_norm": 1.6589614152908325, + "learning_rate": 4.430672790109458e-05, + "loss": 4.7387, + "step": 36845 + }, + { + "epoch": 0.21913359977162433, + "grad_norm": 2.539179801940918, + "learning_rate": 4.4306431152111604e-05, + "loss": 3.394, + "step": 36846 + }, + { + "epoch": 0.21913954705490532, + "grad_norm": 2.6338255405426025, + "learning_rate": 4.430613439638896e-05, + "loss": 3.477, + "step": 36847 + }, + { + "epoch": 0.2191454943381863, + "grad_norm": 2.6075761318206787, + "learning_rate": 4.430583763392674e-05, + "loss": 3.4526, + "step": 36848 + }, + { + "epoch": 0.21915144162146732, + "grad_norm": 2.717876672744751, + "learning_rate": 4.430554086472505e-05, + "loss": 3.5442, + "step": 36849 + }, + { + "epoch": 0.2191573889047483, + "grad_norm": 1.6439937353134155, + "learning_rate": 4.430524408878399e-05, + "loss": 4.4737, + "step": 36850 + }, + { + "epoch": 0.2191633361880293, + "grad_norm": 2.434872627258301, + "learning_rate": 4.430494730610368e-05, + "loss": 3.5731, + "step": 36851 + }, + { + "epoch": 0.21916928347131032, + "grad_norm": 2.7367117404937744, + "learning_rate": 4.43046505166842e-05, + "loss": 3.5388, + "step": 36852 + }, + { + "epoch": 0.2191752307545913, + "grad_norm": 2.5835742950439453, + "learning_rate": 4.430435372052568e-05, + "loss": 3.8811, + "step": 36853 + }, + { + "epoch": 0.2191811780378723, + "grad_norm": 1.5144959688186646, + "learning_rate": 4.43040569176282e-05, + "loss": 5.0131, + "step": 36854 + }, + { + "epoch": 0.2191871253211533, + "grad_norm": 1.4497205018997192, + "learning_rate": 4.4303760107991874e-05, + "loss": 5.0003, + "step": 36855 + }, + { + "epoch": 0.2191930726044343, + "grad_norm": 1.6756725311279297, + "learning_rate": 4.430346329161681e-05, + "loss": 5.0879, + "step": 36856 + }, + { + "epoch": 0.21919901988771529, + "grad_norm": 1.5844732522964478, + "learning_rate": 4.43031664685031e-05, + "loss": 5.0556, + "step": 36857 + }, + { + "epoch": 0.2192049671709963, + "grad_norm": 1.3791886568069458, + "learning_rate": 4.4302869638650856e-05, + "loss": 4.6868, + "step": 36858 + }, + { + "epoch": 0.2192109144542773, + "grad_norm": 1.4763437509536743, + "learning_rate": 4.430257280206018e-05, + "loss": 4.3887, + "step": 36859 + }, + { + "epoch": 0.21921686173755828, + "grad_norm": 1.5181909799575806, + "learning_rate": 4.430227595873118e-05, + "loss": 4.6743, + "step": 36860 + }, + { + "epoch": 0.2192228090208393, + "grad_norm": 1.728208065032959, + "learning_rate": 4.430197910866395e-05, + "loss": 4.1249, + "step": 36861 + }, + { + "epoch": 0.21922875630412028, + "grad_norm": 1.6396219730377197, + "learning_rate": 4.430168225185859e-05, + "loss": 4.8028, + "step": 36862 + }, + { + "epoch": 0.21923470358740127, + "grad_norm": 1.736222505569458, + "learning_rate": 4.430138538831523e-05, + "loss": 4.9611, + "step": 36863 + }, + { + "epoch": 0.21924065087068229, + "grad_norm": 1.7069100141525269, + "learning_rate": 4.430108851803394e-05, + "loss": 5.2723, + "step": 36864 + }, + { + "epoch": 0.21924659815396327, + "grad_norm": 1.4066749811172485, + "learning_rate": 4.430079164101485e-05, + "loss": 4.8875, + "step": 36865 + }, + { + "epoch": 0.21925254543724426, + "grad_norm": 2.0646684169769287, + "learning_rate": 4.430049475725805e-05, + "loss": 4.4612, + "step": 36866 + }, + { + "epoch": 0.21925849272052528, + "grad_norm": 1.9822113513946533, + "learning_rate": 4.430019786676365e-05, + "loss": 4.3854, + "step": 36867 + }, + { + "epoch": 0.21926444000380627, + "grad_norm": 2.068380355834961, + "learning_rate": 4.429990096953174e-05, + "loss": 3.472, + "step": 36868 + }, + { + "epoch": 0.21927038728708725, + "grad_norm": 2.1574547290802, + "learning_rate": 4.429960406556244e-05, + "loss": 3.3961, + "step": 36869 + }, + { + "epoch": 0.21927633457036824, + "grad_norm": 2.2769057750701904, + "learning_rate": 4.4299307154855855e-05, + "loss": 3.4244, + "step": 36870 + }, + { + "epoch": 0.21928228185364926, + "grad_norm": 2.169564962387085, + "learning_rate": 4.429901023741207e-05, + "loss": 3.3977, + "step": 36871 + }, + { + "epoch": 0.21928822913693025, + "grad_norm": 1.7176889181137085, + "learning_rate": 4.42987133132312e-05, + "loss": 4.2225, + "step": 36872 + }, + { + "epoch": 0.21929417642021123, + "grad_norm": 1.5642355680465698, + "learning_rate": 4.4298416382313355e-05, + "loss": 4.8421, + "step": 36873 + }, + { + "epoch": 0.21930012370349225, + "grad_norm": 2.2092230319976807, + "learning_rate": 4.4298119444658633e-05, + "loss": 3.436, + "step": 36874 + }, + { + "epoch": 0.21930607098677324, + "grad_norm": 1.5236022472381592, + "learning_rate": 4.4297822500267127e-05, + "loss": 3.7891, + "step": 36875 + }, + { + "epoch": 0.21931201827005423, + "grad_norm": 1.6356561183929443, + "learning_rate": 4.4297525549138963e-05, + "loss": 4.4118, + "step": 36876 + }, + { + "epoch": 0.21931796555333524, + "grad_norm": 1.5913872718811035, + "learning_rate": 4.4297228591274225e-05, + "loss": 4.0968, + "step": 36877 + }, + { + "epoch": 0.21932391283661623, + "grad_norm": 1.8036432266235352, + "learning_rate": 4.429693162667302e-05, + "loss": 4.9122, + "step": 36878 + }, + { + "epoch": 0.21932986011989722, + "grad_norm": 1.545316457748413, + "learning_rate": 4.4296634655335464e-05, + "loss": 4.8599, + "step": 36879 + }, + { + "epoch": 0.21933580740317823, + "grad_norm": 1.493503451347351, + "learning_rate": 4.429633767726165e-05, + "loss": 4.0066, + "step": 36880 + }, + { + "epoch": 0.21934175468645922, + "grad_norm": 1.5063025951385498, + "learning_rate": 4.429604069245168e-05, + "loss": 3.7343, + "step": 36881 + }, + { + "epoch": 0.2193477019697402, + "grad_norm": 1.4364261627197266, + "learning_rate": 4.429574370090567e-05, + "loss": 4.0285, + "step": 36882 + }, + { + "epoch": 0.21935364925302123, + "grad_norm": 1.497621774673462, + "learning_rate": 4.429544670262371e-05, + "loss": 4.2485, + "step": 36883 + }, + { + "epoch": 0.21935959653630221, + "grad_norm": 1.4784116744995117, + "learning_rate": 4.429514969760591e-05, + "loss": 4.0498, + "step": 36884 + }, + { + "epoch": 0.2193655438195832, + "grad_norm": 1.4145901203155518, + "learning_rate": 4.4294852685852366e-05, + "loss": 3.9571, + "step": 36885 + }, + { + "epoch": 0.21937149110286422, + "grad_norm": 1.4398488998413086, + "learning_rate": 4.429455566736319e-05, + "loss": 4.0692, + "step": 36886 + }, + { + "epoch": 0.2193774383861452, + "grad_norm": 1.5177552700042725, + "learning_rate": 4.4294258642138495e-05, + "loss": 4.0562, + "step": 36887 + }, + { + "epoch": 0.2193833856694262, + "grad_norm": 1.36580491065979, + "learning_rate": 4.429396161017836e-05, + "loss": 4.0386, + "step": 36888 + }, + { + "epoch": 0.2193893329527072, + "grad_norm": 1.3633042573928833, + "learning_rate": 4.429366457148291e-05, + "loss": 3.8754, + "step": 36889 + }, + { + "epoch": 0.2193952802359882, + "grad_norm": 2.18786883354187, + "learning_rate": 4.4293367526052246e-05, + "loss": 3.5764, + "step": 36890 + }, + { + "epoch": 0.2194012275192692, + "grad_norm": 1.3502684831619263, + "learning_rate": 4.4293070473886456e-05, + "loss": 3.9478, + "step": 36891 + }, + { + "epoch": 0.2194071748025502, + "grad_norm": 1.330976128578186, + "learning_rate": 4.4292773414985656e-05, + "loss": 4.0364, + "step": 36892 + }, + { + "epoch": 0.2194131220858312, + "grad_norm": 1.3216843605041504, + "learning_rate": 4.4292476349349955e-05, + "loss": 3.9309, + "step": 36893 + }, + { + "epoch": 0.21941906936911218, + "grad_norm": 1.4405933618545532, + "learning_rate": 4.429217927697944e-05, + "loss": 3.6253, + "step": 36894 + }, + { + "epoch": 0.2194250166523932, + "grad_norm": 2.24751877784729, + "learning_rate": 4.4291882197874234e-05, + "loss": 3.5166, + "step": 36895 + }, + { + "epoch": 0.21943096393567418, + "grad_norm": 1.3963483572006226, + "learning_rate": 4.4291585112034426e-05, + "loss": 4.0243, + "step": 36896 + }, + { + "epoch": 0.21943691121895517, + "grad_norm": 2.135277509689331, + "learning_rate": 4.429128801946012e-05, + "loss": 3.5312, + "step": 36897 + }, + { + "epoch": 0.2194428585022362, + "grad_norm": 2.387871026992798, + "learning_rate": 4.429099092015144e-05, + "loss": 3.4618, + "step": 36898 + }, + { + "epoch": 0.21944880578551718, + "grad_norm": 2.3422837257385254, + "learning_rate": 4.4290693814108465e-05, + "loss": 3.5615, + "step": 36899 + }, + { + "epoch": 0.21945475306879816, + "grad_norm": 2.3634982109069824, + "learning_rate": 4.429039670133131e-05, + "loss": 3.3284, + "step": 36900 + }, + { + "epoch": 0.21946070035207918, + "grad_norm": 2.0091755390167236, + "learning_rate": 4.429009958182007e-05, + "loss": 4.2969, + "step": 36901 + }, + { + "epoch": 0.21946664763536017, + "grad_norm": 1.939206600189209, + "learning_rate": 4.428980245557486e-05, + "loss": 5.2382, + "step": 36902 + }, + { + "epoch": 0.21947259491864116, + "grad_norm": 1.8103601932525635, + "learning_rate": 4.428950532259578e-05, + "loss": 5.3107, + "step": 36903 + }, + { + "epoch": 0.21947854220192217, + "grad_norm": 1.8780220746994019, + "learning_rate": 4.4289208182882936e-05, + "loss": 4.026, + "step": 36904 + }, + { + "epoch": 0.21948448948520316, + "grad_norm": 2.2127411365509033, + "learning_rate": 4.428891103643642e-05, + "loss": 3.2846, + "step": 36905 + }, + { + "epoch": 0.21949043676848415, + "grad_norm": 1.7850244045257568, + "learning_rate": 4.4288613883256356e-05, + "loss": 4.898, + "step": 36906 + }, + { + "epoch": 0.21949638405176516, + "grad_norm": 1.9830641746520996, + "learning_rate": 4.4288316723342824e-05, + "loss": 4.7774, + "step": 36907 + }, + { + "epoch": 0.21950233133504615, + "grad_norm": 1.9393038749694824, + "learning_rate": 4.428801955669595e-05, + "loss": 5.2107, + "step": 36908 + }, + { + "epoch": 0.21950827861832714, + "grad_norm": 1.9476639032363892, + "learning_rate": 4.428772238331582e-05, + "loss": 5.0619, + "step": 36909 + }, + { + "epoch": 0.21951422590160816, + "grad_norm": 2.0159640312194824, + "learning_rate": 4.428742520320255e-05, + "loss": 5.0202, + "step": 36910 + }, + { + "epoch": 0.21952017318488914, + "grad_norm": 1.6942940950393677, + "learning_rate": 4.428712801635624e-05, + "loss": 4.934, + "step": 36911 + }, + { + "epoch": 0.21952612046817013, + "grad_norm": 2.129357099533081, + "learning_rate": 4.428683082277699e-05, + "loss": 4.5209, + "step": 36912 + }, + { + "epoch": 0.21953206775145115, + "grad_norm": 1.7726565599441528, + "learning_rate": 4.428653362246491e-05, + "loss": 5.0013, + "step": 36913 + }, + { + "epoch": 0.21953801503473214, + "grad_norm": 1.8441473245620728, + "learning_rate": 4.4286236415420094e-05, + "loss": 4.4107, + "step": 36914 + }, + { + "epoch": 0.21954396231801313, + "grad_norm": 2.1780929565429688, + "learning_rate": 4.428593920164266e-05, + "loss": 4.7304, + "step": 36915 + }, + { + "epoch": 0.21954990960129414, + "grad_norm": 1.6376374959945679, + "learning_rate": 4.42856419811327e-05, + "loss": 5.1897, + "step": 36916 + }, + { + "epoch": 0.21955585688457513, + "grad_norm": 1.9564027786254883, + "learning_rate": 4.4285344753890326e-05, + "loss": 4.7024, + "step": 36917 + }, + { + "epoch": 0.21956180416785612, + "grad_norm": 1.499194860458374, + "learning_rate": 4.428504751991562e-05, + "loss": 5.0983, + "step": 36918 + }, + { + "epoch": 0.21956775145113713, + "grad_norm": 1.8426028490066528, + "learning_rate": 4.428475027920873e-05, + "loss": 5.1254, + "step": 36919 + }, + { + "epoch": 0.21957369873441812, + "grad_norm": 1.5901544094085693, + "learning_rate": 4.428445303176971e-05, + "loss": 5.1756, + "step": 36920 + }, + { + "epoch": 0.2195796460176991, + "grad_norm": 1.766211748123169, + "learning_rate": 4.4284155777598704e-05, + "loss": 5.256, + "step": 36921 + }, + { + "epoch": 0.21958559330098013, + "grad_norm": 1.6671624183654785, + "learning_rate": 4.4283858516695786e-05, + "loss": 5.2203, + "step": 36922 + }, + { + "epoch": 0.2195915405842611, + "grad_norm": 1.5648390054702759, + "learning_rate": 4.428356124906108e-05, + "loss": 5.0774, + "step": 36923 + }, + { + "epoch": 0.2195974878675421, + "grad_norm": 1.8170748949050903, + "learning_rate": 4.428326397469468e-05, + "loss": 5.1123, + "step": 36924 + }, + { + "epoch": 0.21960343515082312, + "grad_norm": 1.5695691108703613, + "learning_rate": 4.4282966693596686e-05, + "loss": 5.0659, + "step": 36925 + }, + { + "epoch": 0.2196093824341041, + "grad_norm": 1.7228821516036987, + "learning_rate": 4.428266940576721e-05, + "loss": 4.9237, + "step": 36926 + }, + { + "epoch": 0.2196153297173851, + "grad_norm": 1.7535570859909058, + "learning_rate": 4.428237211120636e-05, + "loss": 5.0222, + "step": 36927 + }, + { + "epoch": 0.21962127700066608, + "grad_norm": 1.8773938417434692, + "learning_rate": 4.428207480991422e-05, + "loss": 4.942, + "step": 36928 + }, + { + "epoch": 0.2196272242839471, + "grad_norm": 1.834860920906067, + "learning_rate": 4.428177750189092e-05, + "loss": 4.9838, + "step": 36929 + }, + { + "epoch": 0.21963317156722809, + "grad_norm": 1.5257384777069092, + "learning_rate": 4.4281480187136546e-05, + "loss": 5.0234, + "step": 36930 + }, + { + "epoch": 0.21963911885050907, + "grad_norm": 1.86570143699646, + "learning_rate": 4.42811828656512e-05, + "loss": 4.7741, + "step": 36931 + }, + { + "epoch": 0.2196450661337901, + "grad_norm": 1.9325884580612183, + "learning_rate": 4.4280885537435e-05, + "loss": 5.0504, + "step": 36932 + }, + { + "epoch": 0.21965101341707108, + "grad_norm": 1.9406647682189941, + "learning_rate": 4.428058820248804e-05, + "loss": 4.919, + "step": 36933 + }, + { + "epoch": 0.21965696070035207, + "grad_norm": 2.314631223678589, + "learning_rate": 4.428029086081043e-05, + "loss": 4.321, + "step": 36934 + }, + { + "epoch": 0.21966290798363308, + "grad_norm": 3.108458995819092, + "learning_rate": 4.4279993512402265e-05, + "loss": 4.5806, + "step": 36935 + }, + { + "epoch": 0.21966885526691407, + "grad_norm": 2.192230463027954, + "learning_rate": 4.427969615726366e-05, + "loss": 4.6003, + "step": 36936 + }, + { + "epoch": 0.21967480255019506, + "grad_norm": 1.8830350637435913, + "learning_rate": 4.42793987953947e-05, + "loss": 4.2452, + "step": 36937 + }, + { + "epoch": 0.21968074983347607, + "grad_norm": 1.664759874343872, + "learning_rate": 4.427910142679551e-05, + "loss": 4.7462, + "step": 36938 + }, + { + "epoch": 0.21968669711675706, + "grad_norm": 1.638677716255188, + "learning_rate": 4.427880405146618e-05, + "loss": 4.7584, + "step": 36939 + }, + { + "epoch": 0.21969264440003805, + "grad_norm": 1.658952236175537, + "learning_rate": 4.427850666940683e-05, + "loss": 5.0226, + "step": 36940 + }, + { + "epoch": 0.21969859168331907, + "grad_norm": 1.8079904317855835, + "learning_rate": 4.427820928061754e-05, + "loss": 4.8951, + "step": 36941 + }, + { + "epoch": 0.21970453896660005, + "grad_norm": 1.5243951082229614, + "learning_rate": 4.427791188509843e-05, + "loss": 4.4953, + "step": 36942 + }, + { + "epoch": 0.21971048624988104, + "grad_norm": 1.5899109840393066, + "learning_rate": 4.42776144828496e-05, + "loss": 4.5787, + "step": 36943 + }, + { + "epoch": 0.21971643353316206, + "grad_norm": 1.787782073020935, + "learning_rate": 4.4277317073871156e-05, + "loss": 4.7603, + "step": 36944 + }, + { + "epoch": 0.21972238081644305, + "grad_norm": 2.6901321411132812, + "learning_rate": 4.4277019658163196e-05, + "loss": 4.2848, + "step": 36945 + }, + { + "epoch": 0.21972832809972404, + "grad_norm": 2.159574508666992, + "learning_rate": 4.427672223572583e-05, + "loss": 4.4136, + "step": 36946 + }, + { + "epoch": 0.21973427538300505, + "grad_norm": 1.6421222686767578, + "learning_rate": 4.427642480655916e-05, + "loss": 4.9688, + "step": 36947 + }, + { + "epoch": 0.21974022266628604, + "grad_norm": 1.9344775676727295, + "learning_rate": 4.4276127370663286e-05, + "loss": 4.4278, + "step": 36948 + }, + { + "epoch": 0.21974616994956703, + "grad_norm": 2.361130714416504, + "learning_rate": 4.427582992803831e-05, + "loss": 4.095, + "step": 36949 + }, + { + "epoch": 0.21975211723284804, + "grad_norm": 3.1432952880859375, + "learning_rate": 4.4275532478684354e-05, + "loss": 3.8301, + "step": 36950 + }, + { + "epoch": 0.21975806451612903, + "grad_norm": 2.9474925994873047, + "learning_rate": 4.4275235022601504e-05, + "loss": 3.3343, + "step": 36951 + }, + { + "epoch": 0.21976401179941002, + "grad_norm": 2.163156032562256, + "learning_rate": 4.427493755978987e-05, + "loss": 3.549, + "step": 36952 + }, + { + "epoch": 0.21976995908269104, + "grad_norm": 2.9386045932769775, + "learning_rate": 4.427464009024955e-05, + "loss": 3.5566, + "step": 36953 + }, + { + "epoch": 0.21977590636597202, + "grad_norm": 2.1227951049804688, + "learning_rate": 4.427434261398066e-05, + "loss": 4.3692, + "step": 36954 + }, + { + "epoch": 0.219781853649253, + "grad_norm": 2.999837875366211, + "learning_rate": 4.427404513098329e-05, + "loss": 3.6306, + "step": 36955 + }, + { + "epoch": 0.21978780093253403, + "grad_norm": 2.0129096508026123, + "learning_rate": 4.4273747641257546e-05, + "loss": 4.3312, + "step": 36956 + }, + { + "epoch": 0.21979374821581502, + "grad_norm": 1.743152141571045, + "learning_rate": 4.427345014480354e-05, + "loss": 5.2778, + "step": 36957 + }, + { + "epoch": 0.219799695499096, + "grad_norm": 1.6184766292572021, + "learning_rate": 4.4273152641621376e-05, + "loss": 4.9119, + "step": 36958 + }, + { + "epoch": 0.21980564278237702, + "grad_norm": 1.468083381652832, + "learning_rate": 4.427285513171115e-05, + "loss": 4.7192, + "step": 36959 + }, + { + "epoch": 0.219811590065658, + "grad_norm": 1.6739065647125244, + "learning_rate": 4.427255761507297e-05, + "loss": 5.0034, + "step": 36960 + }, + { + "epoch": 0.219817537348939, + "grad_norm": 1.8841058015823364, + "learning_rate": 4.427226009170693e-05, + "loss": 4.2297, + "step": 36961 + }, + { + "epoch": 0.21982348463222, + "grad_norm": 1.512008786201477, + "learning_rate": 4.4271962561613156e-05, + "loss": 4.9728, + "step": 36962 + }, + { + "epoch": 0.219829431915501, + "grad_norm": 1.4150924682617188, + "learning_rate": 4.427166502479173e-05, + "loss": 4.9096, + "step": 36963 + }, + { + "epoch": 0.219835379198782, + "grad_norm": 1.4463622570037842, + "learning_rate": 4.427136748124277e-05, + "loss": 4.8228, + "step": 36964 + }, + { + "epoch": 0.219841326482063, + "grad_norm": 1.7630513906478882, + "learning_rate": 4.427106993096638e-05, + "loss": 4.4025, + "step": 36965 + }, + { + "epoch": 0.219847273765344, + "grad_norm": 1.438467025756836, + "learning_rate": 4.427077237396265e-05, + "loss": 4.9638, + "step": 36966 + }, + { + "epoch": 0.21985322104862498, + "grad_norm": 1.6267468929290771, + "learning_rate": 4.42704748102317e-05, + "loss": 4.562, + "step": 36967 + }, + { + "epoch": 0.219859168331906, + "grad_norm": 1.7281938791275024, + "learning_rate": 4.427017723977361e-05, + "loss": 4.663, + "step": 36968 + }, + { + "epoch": 0.21986511561518698, + "grad_norm": 1.7493743896484375, + "learning_rate": 4.426987966258852e-05, + "loss": 4.9956, + "step": 36969 + }, + { + "epoch": 0.21987106289846797, + "grad_norm": 2.017735719680786, + "learning_rate": 4.4269582078676504e-05, + "loss": 4.6434, + "step": 36970 + }, + { + "epoch": 0.219877010181749, + "grad_norm": 1.9060436487197876, + "learning_rate": 4.426928448803768e-05, + "loss": 5.2025, + "step": 36971 + }, + { + "epoch": 0.21988295746502998, + "grad_norm": 1.8023102283477783, + "learning_rate": 4.426898689067214e-05, + "loss": 5.2327, + "step": 36972 + }, + { + "epoch": 0.21988890474831096, + "grad_norm": 1.7578123807907104, + "learning_rate": 4.426868928658e-05, + "loss": 4.853, + "step": 36973 + }, + { + "epoch": 0.21989485203159198, + "grad_norm": 1.8165631294250488, + "learning_rate": 4.426839167576137e-05, + "loss": 4.7873, + "step": 36974 + }, + { + "epoch": 0.21990079931487297, + "grad_norm": 2.621605396270752, + "learning_rate": 4.4268094058216325e-05, + "loss": 3.4732, + "step": 36975 + }, + { + "epoch": 0.21990674659815396, + "grad_norm": 2.3086864948272705, + "learning_rate": 4.4267796433945e-05, + "loss": 3.6192, + "step": 36976 + }, + { + "epoch": 0.21991269388143497, + "grad_norm": 2.730029582977295, + "learning_rate": 4.426749880294748e-05, + "loss": 3.824, + "step": 36977 + }, + { + "epoch": 0.21991864116471596, + "grad_norm": 2.5434679985046387, + "learning_rate": 4.4267201165223885e-05, + "loss": 4.0274, + "step": 36978 + }, + { + "epoch": 0.21992458844799695, + "grad_norm": 1.646130919456482, + "learning_rate": 4.42669035207743e-05, + "loss": 5.3633, + "step": 36979 + }, + { + "epoch": 0.21993053573127797, + "grad_norm": 2.096497058868408, + "learning_rate": 4.426660586959884e-05, + "loss": 5.1765, + "step": 36980 + }, + { + "epoch": 0.21993648301455895, + "grad_norm": 1.6388026475906372, + "learning_rate": 4.4266308211697605e-05, + "loss": 5.0053, + "step": 36981 + }, + { + "epoch": 0.21994243029783994, + "grad_norm": 1.3540456295013428, + "learning_rate": 4.42660105470707e-05, + "loss": 4.9745, + "step": 36982 + }, + { + "epoch": 0.21994837758112096, + "grad_norm": 1.547318696975708, + "learning_rate": 4.426571287571824e-05, + "loss": 4.7473, + "step": 36983 + }, + { + "epoch": 0.21995432486440195, + "grad_norm": 1.4012900590896606, + "learning_rate": 4.426541519764031e-05, + "loss": 4.8873, + "step": 36984 + }, + { + "epoch": 0.21996027214768293, + "grad_norm": 1.3376727104187012, + "learning_rate": 4.4265117512837023e-05, + "loss": 4.8311, + "step": 36985 + }, + { + "epoch": 0.21996621943096392, + "grad_norm": 1.3277151584625244, + "learning_rate": 4.4264819821308484e-05, + "loss": 4.6289, + "step": 36986 + }, + { + "epoch": 0.21997216671424494, + "grad_norm": 1.7237157821655273, + "learning_rate": 4.4264522123054795e-05, + "loss": 4.7096, + "step": 36987 + }, + { + "epoch": 0.21997811399752593, + "grad_norm": 2.6192526817321777, + "learning_rate": 4.4264224418076063e-05, + "loss": 3.8533, + "step": 36988 + }, + { + "epoch": 0.21998406128080691, + "grad_norm": 2.7556326389312744, + "learning_rate": 4.4263926706372385e-05, + "loss": 3.1077, + "step": 36989 + }, + { + "epoch": 0.21999000856408793, + "grad_norm": 2.582603931427002, + "learning_rate": 4.4263628987943875e-05, + "loss": 3.3059, + "step": 36990 + }, + { + "epoch": 0.21999595584736892, + "grad_norm": 2.8123531341552734, + "learning_rate": 4.426333126279062e-05, + "loss": 3.2447, + "step": 36991 + }, + { + "epoch": 0.2200019031306499, + "grad_norm": 2.7251224517822266, + "learning_rate": 4.4263033530912746e-05, + "loss": 3.6572, + "step": 36992 + }, + { + "epoch": 0.22000785041393092, + "grad_norm": 2.870852470397949, + "learning_rate": 4.426273579231034e-05, + "loss": 3.5223, + "step": 36993 + }, + { + "epoch": 0.2200137976972119, + "grad_norm": 2.64644718170166, + "learning_rate": 4.426243804698351e-05, + "loss": 3.1218, + "step": 36994 + }, + { + "epoch": 0.2200197449804929, + "grad_norm": 2.7472634315490723, + "learning_rate": 4.426214029493237e-05, + "loss": 3.3861, + "step": 36995 + }, + { + "epoch": 0.22002569226377391, + "grad_norm": 2.274350643157959, + "learning_rate": 4.4261842536157014e-05, + "loss": 3.8112, + "step": 36996 + }, + { + "epoch": 0.2200316395470549, + "grad_norm": 1.6150776147842407, + "learning_rate": 4.426154477065755e-05, + "loss": 4.7959, + "step": 36997 + }, + { + "epoch": 0.2200375868303359, + "grad_norm": 1.7377833127975464, + "learning_rate": 4.426124699843407e-05, + "loss": 4.5754, + "step": 36998 + }, + { + "epoch": 0.2200435341136169, + "grad_norm": 1.8310593366622925, + "learning_rate": 4.426094921948669e-05, + "loss": 4.8721, + "step": 36999 + }, + { + "epoch": 0.2200494813968979, + "grad_norm": 1.8585983514785767, + "learning_rate": 4.426065143381552e-05, + "loss": 4.8439, + "step": 37000 + }, + { + "epoch": 0.22005542868017888, + "grad_norm": 1.8049155473709106, + "learning_rate": 4.426035364142065e-05, + "loss": 4.2048, + "step": 37001 + }, + { + "epoch": 0.2200613759634599, + "grad_norm": 2.056905746459961, + "learning_rate": 4.426005584230219e-05, + "loss": 4.16, + "step": 37002 + }, + { + "epoch": 0.2200673232467409, + "grad_norm": 1.85906982421875, + "learning_rate": 4.425975803646024e-05, + "loss": 4.9968, + "step": 37003 + }, + { + "epoch": 0.22007327053002188, + "grad_norm": 1.816170573234558, + "learning_rate": 4.4259460223894914e-05, + "loss": 4.9686, + "step": 37004 + }, + { + "epoch": 0.2200792178133029, + "grad_norm": 1.6196742057800293, + "learning_rate": 4.42591624046063e-05, + "loss": 4.9311, + "step": 37005 + }, + { + "epoch": 0.22008516509658388, + "grad_norm": 1.6003400087356567, + "learning_rate": 4.4258864578594524e-05, + "loss": 4.8954, + "step": 37006 + }, + { + "epoch": 0.22009111237986487, + "grad_norm": 1.6523009538650513, + "learning_rate": 4.425856674585967e-05, + "loss": 4.9859, + "step": 37007 + }, + { + "epoch": 0.22009705966314588, + "grad_norm": 1.8884902000427246, + "learning_rate": 4.425826890640185e-05, + "loss": 4.8798, + "step": 37008 + }, + { + "epoch": 0.22010300694642687, + "grad_norm": 1.7629953622817993, + "learning_rate": 4.425797106022117e-05, + "loss": 4.7541, + "step": 37009 + }, + { + "epoch": 0.22010895422970786, + "grad_norm": 1.7024192810058594, + "learning_rate": 4.425767320731773e-05, + "loss": 4.45, + "step": 37010 + }, + { + "epoch": 0.22011490151298888, + "grad_norm": 1.8037065267562866, + "learning_rate": 4.4257375347691635e-05, + "loss": 5.3712, + "step": 37011 + }, + { + "epoch": 0.22012084879626986, + "grad_norm": 1.6815311908721924, + "learning_rate": 4.4257077481342976e-05, + "loss": 5.1085, + "step": 37012 + }, + { + "epoch": 0.22012679607955085, + "grad_norm": 1.4821339845657349, + "learning_rate": 4.425677960827189e-05, + "loss": 5.1132, + "step": 37013 + }, + { + "epoch": 0.22013274336283187, + "grad_norm": 1.645802617073059, + "learning_rate": 4.425648172847845e-05, + "loss": 5.274, + "step": 37014 + }, + { + "epoch": 0.22013869064611286, + "grad_norm": 1.838371753692627, + "learning_rate": 4.4256183841962776e-05, + "loss": 4.7118, + "step": 37015 + }, + { + "epoch": 0.22014463792939384, + "grad_norm": 1.8690693378448486, + "learning_rate": 4.425588594872497e-05, + "loss": 5.0579, + "step": 37016 + }, + { + "epoch": 0.22015058521267486, + "grad_norm": 1.6576164960861206, + "learning_rate": 4.425558804876513e-05, + "loss": 4.6789, + "step": 37017 + }, + { + "epoch": 0.22015653249595585, + "grad_norm": 1.6403518915176392, + "learning_rate": 4.425529014208336e-05, + "loss": 4.2318, + "step": 37018 + }, + { + "epoch": 0.22016247977923684, + "grad_norm": 2.0635440349578857, + "learning_rate": 4.4254992228679774e-05, + "loss": 4.8183, + "step": 37019 + }, + { + "epoch": 0.22016842706251785, + "grad_norm": 1.6289507150650024, + "learning_rate": 4.425469430855446e-05, + "loss": 4.8729, + "step": 37020 + }, + { + "epoch": 0.22017437434579884, + "grad_norm": 1.8985059261322021, + "learning_rate": 4.4254396381707534e-05, + "loss": 5.0734, + "step": 37021 + }, + { + "epoch": 0.22018032162907983, + "grad_norm": 1.793545126914978, + "learning_rate": 4.4254098448139106e-05, + "loss": 4.9311, + "step": 37022 + }, + { + "epoch": 0.22018626891236084, + "grad_norm": 1.7476612329483032, + "learning_rate": 4.4253800507849256e-05, + "loss": 4.9602, + "step": 37023 + }, + { + "epoch": 0.22019221619564183, + "grad_norm": 1.7672104835510254, + "learning_rate": 4.425350256083811e-05, + "loss": 5.0557, + "step": 37024 + }, + { + "epoch": 0.22019816347892282, + "grad_norm": 1.7356926202774048, + "learning_rate": 4.425320460710577e-05, + "loss": 4.9895, + "step": 37025 + }, + { + "epoch": 0.22020411076220384, + "grad_norm": 1.6702399253845215, + "learning_rate": 4.425290664665233e-05, + "loss": 5.5446, + "step": 37026 + }, + { + "epoch": 0.22021005804548482, + "grad_norm": 1.6615018844604492, + "learning_rate": 4.42526086794779e-05, + "loss": 4.8189, + "step": 37027 + }, + { + "epoch": 0.2202160053287658, + "grad_norm": 1.7490131855010986, + "learning_rate": 4.425231070558259e-05, + "loss": 4.3846, + "step": 37028 + }, + { + "epoch": 0.22022195261204683, + "grad_norm": 1.4904981851577759, + "learning_rate": 4.425201272496648e-05, + "loss": 4.9885, + "step": 37029 + }, + { + "epoch": 0.22022789989532782, + "grad_norm": 1.7829434871673584, + "learning_rate": 4.425171473762971e-05, + "loss": 4.6263, + "step": 37030 + }, + { + "epoch": 0.2202338471786088, + "grad_norm": 2.5793685913085938, + "learning_rate": 4.4251416743572364e-05, + "loss": 4.0243, + "step": 37031 + }, + { + "epoch": 0.22023979446188982, + "grad_norm": 1.8999863862991333, + "learning_rate": 4.4251118742794535e-05, + "loss": 5.3582, + "step": 37032 + }, + { + "epoch": 0.2202457417451708, + "grad_norm": 1.8360003232955933, + "learning_rate": 4.4250820735296347e-05, + "loss": 5.1397, + "step": 37033 + }, + { + "epoch": 0.2202516890284518, + "grad_norm": 1.742761492729187, + "learning_rate": 4.4250522721077894e-05, + "loss": 5.149, + "step": 37034 + }, + { + "epoch": 0.2202576363117328, + "grad_norm": 1.606214165687561, + "learning_rate": 4.425022470013928e-05, + "loss": 5.0941, + "step": 37035 + }, + { + "epoch": 0.2202635835950138, + "grad_norm": 2.075307607650757, + "learning_rate": 4.4249926672480615e-05, + "loss": 3.9427, + "step": 37036 + }, + { + "epoch": 0.2202695308782948, + "grad_norm": 3.00763201713562, + "learning_rate": 4.4249628638102005e-05, + "loss": 3.4657, + "step": 37037 + }, + { + "epoch": 0.2202754781615758, + "grad_norm": 2.541367292404175, + "learning_rate": 4.424933059700354e-05, + "loss": 3.3816, + "step": 37038 + }, + { + "epoch": 0.2202814254448568, + "grad_norm": 2.394335985183716, + "learning_rate": 4.4249032549185335e-05, + "loss": 3.0965, + "step": 37039 + }, + { + "epoch": 0.22028737272813778, + "grad_norm": 2.1029319763183594, + "learning_rate": 4.424873449464749e-05, + "loss": 3.0838, + "step": 37040 + }, + { + "epoch": 0.2202933200114188, + "grad_norm": 2.285747766494751, + "learning_rate": 4.424843643339011e-05, + "loss": 3.4436, + "step": 37041 + }, + { + "epoch": 0.22029926729469979, + "grad_norm": 2.01776123046875, + "learning_rate": 4.4248138365413305e-05, + "loss": 3.355, + "step": 37042 + }, + { + "epoch": 0.22030521457798077, + "grad_norm": 2.2943174839019775, + "learning_rate": 4.424784029071717e-05, + "loss": 3.1579, + "step": 37043 + }, + { + "epoch": 0.2203111618612618, + "grad_norm": 2.418867826461792, + "learning_rate": 4.4247542209301815e-05, + "loss": 3.0624, + "step": 37044 + }, + { + "epoch": 0.22031710914454278, + "grad_norm": 1.8208024501800537, + "learning_rate": 4.424724412116734e-05, + "loss": 4.4339, + "step": 37045 + }, + { + "epoch": 0.22032305642782377, + "grad_norm": 1.88398277759552, + "learning_rate": 4.424694602631385e-05, + "loss": 4.0576, + "step": 37046 + }, + { + "epoch": 0.22032900371110475, + "grad_norm": 1.6278939247131348, + "learning_rate": 4.424664792474145e-05, + "loss": 3.7719, + "step": 37047 + }, + { + "epoch": 0.22033495099438577, + "grad_norm": 1.5962262153625488, + "learning_rate": 4.424634981645025e-05, + "loss": 3.8542, + "step": 37048 + }, + { + "epoch": 0.22034089827766676, + "grad_norm": 1.5399160385131836, + "learning_rate": 4.424605170144034e-05, + "loss": 3.7271, + "step": 37049 + }, + { + "epoch": 0.22034684556094775, + "grad_norm": 1.6625111103057861, + "learning_rate": 4.4245753579711837e-05, + "loss": 3.7294, + "step": 37050 + }, + { + "epoch": 0.22035279284422876, + "grad_norm": 1.5418323278427124, + "learning_rate": 4.424545545126484e-05, + "loss": 3.7352, + "step": 37051 + }, + { + "epoch": 0.22035874012750975, + "grad_norm": 1.566845417022705, + "learning_rate": 4.424515731609945e-05, + "loss": 3.681, + "step": 37052 + }, + { + "epoch": 0.22036468741079074, + "grad_norm": 1.4394952058792114, + "learning_rate": 4.424485917421578e-05, + "loss": 3.8459, + "step": 37053 + }, + { + "epoch": 0.22037063469407175, + "grad_norm": 1.5573005676269531, + "learning_rate": 4.4244561025613924e-05, + "loss": 3.6516, + "step": 37054 + }, + { + "epoch": 0.22037658197735274, + "grad_norm": 1.4430670738220215, + "learning_rate": 4.424426287029399e-05, + "loss": 3.8846, + "step": 37055 + }, + { + "epoch": 0.22038252926063373, + "grad_norm": 1.496464490890503, + "learning_rate": 4.4243964708256086e-05, + "loss": 3.6196, + "step": 37056 + }, + { + "epoch": 0.22038847654391475, + "grad_norm": 1.4447180032730103, + "learning_rate": 4.4243666539500314e-05, + "loss": 3.6695, + "step": 37057 + }, + { + "epoch": 0.22039442382719573, + "grad_norm": 1.4226678609848022, + "learning_rate": 4.424336836402677e-05, + "loss": 3.9564, + "step": 37058 + }, + { + "epoch": 0.22040037111047672, + "grad_norm": 1.4498428106307983, + "learning_rate": 4.4243070181835566e-05, + "loss": 3.8076, + "step": 37059 + }, + { + "epoch": 0.22040631839375774, + "grad_norm": 1.4278359413146973, + "learning_rate": 4.4242771992926813e-05, + "loss": 3.7534, + "step": 37060 + }, + { + "epoch": 0.22041226567703873, + "grad_norm": 1.357118010520935, + "learning_rate": 4.42424737973006e-05, + "loss": 3.6754, + "step": 37061 + }, + { + "epoch": 0.22041821296031971, + "grad_norm": 1.3253631591796875, + "learning_rate": 4.424217559495704e-05, + "loss": 3.6916, + "step": 37062 + }, + { + "epoch": 0.22042416024360073, + "grad_norm": 1.3829140663146973, + "learning_rate": 4.4241877385896235e-05, + "loss": 3.8588, + "step": 37063 + }, + { + "epoch": 0.22043010752688172, + "grad_norm": 1.528243899345398, + "learning_rate": 4.424157917011829e-05, + "loss": 4.7923, + "step": 37064 + }, + { + "epoch": 0.2204360548101627, + "grad_norm": 1.7748266458511353, + "learning_rate": 4.424128094762331e-05, + "loss": 4.7928, + "step": 37065 + }, + { + "epoch": 0.22044200209344372, + "grad_norm": 1.7042765617370605, + "learning_rate": 4.42409827184114e-05, + "loss": 5.0612, + "step": 37066 + }, + { + "epoch": 0.2204479493767247, + "grad_norm": 1.7830193042755127, + "learning_rate": 4.424068448248265e-05, + "loss": 5.2614, + "step": 37067 + }, + { + "epoch": 0.2204538966600057, + "grad_norm": 1.6546859741210938, + "learning_rate": 4.424038623983718e-05, + "loss": 4.7816, + "step": 37068 + }, + { + "epoch": 0.22045984394328672, + "grad_norm": 1.5960372686386108, + "learning_rate": 4.42400879904751e-05, + "loss": 3.7849, + "step": 37069 + }, + { + "epoch": 0.2204657912265677, + "grad_norm": 1.5768903493881226, + "learning_rate": 4.42397897343965e-05, + "loss": 3.7381, + "step": 37070 + }, + { + "epoch": 0.2204717385098487, + "grad_norm": 1.6076819896697998, + "learning_rate": 4.423949147160148e-05, + "loss": 3.787, + "step": 37071 + }, + { + "epoch": 0.2204776857931297, + "grad_norm": 1.5903054475784302, + "learning_rate": 4.4239193202090165e-05, + "loss": 3.8237, + "step": 37072 + }, + { + "epoch": 0.2204836330764107, + "grad_norm": 1.6422667503356934, + "learning_rate": 4.423889492586264e-05, + "loss": 3.727, + "step": 37073 + }, + { + "epoch": 0.22048958035969168, + "grad_norm": 1.488077163696289, + "learning_rate": 4.423859664291901e-05, + "loss": 3.5784, + "step": 37074 + }, + { + "epoch": 0.2204955276429727, + "grad_norm": 1.885972499847412, + "learning_rate": 4.423829835325939e-05, + "loss": 4.8389, + "step": 37075 + }, + { + "epoch": 0.2205014749262537, + "grad_norm": 1.730106234550476, + "learning_rate": 4.423800005688388e-05, + "loss": 5.2459, + "step": 37076 + }, + { + "epoch": 0.22050742220953468, + "grad_norm": 1.8794136047363281, + "learning_rate": 4.4237701753792585e-05, + "loss": 5.1238, + "step": 37077 + }, + { + "epoch": 0.2205133694928157, + "grad_norm": 1.6428134441375732, + "learning_rate": 4.42374034439856e-05, + "loss": 4.8715, + "step": 37078 + }, + { + "epoch": 0.22051931677609668, + "grad_norm": 2.0203256607055664, + "learning_rate": 4.423710512746304e-05, + "loss": 3.9266, + "step": 37079 + }, + { + "epoch": 0.22052526405937767, + "grad_norm": 2.743778705596924, + "learning_rate": 4.4236806804225006e-05, + "loss": 3.575, + "step": 37080 + }, + { + "epoch": 0.22053121134265868, + "grad_norm": 2.646286725997925, + "learning_rate": 4.42365084742716e-05, + "loss": 3.2283, + "step": 37081 + }, + { + "epoch": 0.22053715862593967, + "grad_norm": 2.472491502761841, + "learning_rate": 4.423621013760293e-05, + "loss": 3.5303, + "step": 37082 + }, + { + "epoch": 0.22054310590922066, + "grad_norm": 2.5591135025024414, + "learning_rate": 4.423591179421909e-05, + "loss": 3.4337, + "step": 37083 + }, + { + "epoch": 0.22054905319250168, + "grad_norm": 2.592282772064209, + "learning_rate": 4.423561344412021e-05, + "loss": 2.904, + "step": 37084 + }, + { + "epoch": 0.22055500047578266, + "grad_norm": 2.4524197578430176, + "learning_rate": 4.423531508730635e-05, + "loss": 3.3817, + "step": 37085 + }, + { + "epoch": 0.22056094775906365, + "grad_norm": 2.398085594177246, + "learning_rate": 4.4235016723777656e-05, + "loss": 3.2741, + "step": 37086 + }, + { + "epoch": 0.22056689504234467, + "grad_norm": 2.337100028991699, + "learning_rate": 4.423471835353422e-05, + "loss": 3.5931, + "step": 37087 + }, + { + "epoch": 0.22057284232562566, + "grad_norm": 2.589341402053833, + "learning_rate": 4.4234419976576137e-05, + "loss": 3.4136, + "step": 37088 + }, + { + "epoch": 0.22057878960890664, + "grad_norm": 2.466911792755127, + "learning_rate": 4.4234121592903515e-05, + "loss": 3.5329, + "step": 37089 + }, + { + "epoch": 0.22058473689218766, + "grad_norm": 1.5576742887496948, + "learning_rate": 4.423382320251646e-05, + "loss": 4.388, + "step": 37090 + }, + { + "epoch": 0.22059068417546865, + "grad_norm": 1.3827784061431885, + "learning_rate": 4.423352480541508e-05, + "loss": 4.9481, + "step": 37091 + }, + { + "epoch": 0.22059663145874964, + "grad_norm": 1.4498347043991089, + "learning_rate": 4.423322640159947e-05, + "loss": 5.3971, + "step": 37092 + }, + { + "epoch": 0.22060257874203065, + "grad_norm": 1.6802235841751099, + "learning_rate": 4.423292799106974e-05, + "loss": 5.2876, + "step": 37093 + }, + { + "epoch": 0.22060852602531164, + "grad_norm": 1.9586135149002075, + "learning_rate": 4.4232629573826e-05, + "loss": 4.9878, + "step": 37094 + }, + { + "epoch": 0.22061447330859263, + "grad_norm": 1.6873750686645508, + "learning_rate": 4.423233114986834e-05, + "loss": 4.9648, + "step": 37095 + }, + { + "epoch": 0.22062042059187364, + "grad_norm": 1.708585500717163, + "learning_rate": 4.423203271919688e-05, + "loss": 4.9923, + "step": 37096 + }, + { + "epoch": 0.22062636787515463, + "grad_norm": 1.4877114295959473, + "learning_rate": 4.423173428181171e-05, + "loss": 5.2307, + "step": 37097 + }, + { + "epoch": 0.22063231515843562, + "grad_norm": 1.485723614692688, + "learning_rate": 4.423143583771294e-05, + "loss": 4.2279, + "step": 37098 + }, + { + "epoch": 0.22063826244171664, + "grad_norm": 1.5309786796569824, + "learning_rate": 4.423113738690068e-05, + "loss": 4.9294, + "step": 37099 + }, + { + "epoch": 0.22064420972499763, + "grad_norm": 1.7999223470687866, + "learning_rate": 4.4230838929375027e-05, + "loss": 4.7005, + "step": 37100 + }, + { + "epoch": 0.2206501570082786, + "grad_norm": 1.5340321063995361, + "learning_rate": 4.423054046513608e-05, + "loss": 4.7574, + "step": 37101 + }, + { + "epoch": 0.22065610429155963, + "grad_norm": 1.641364574432373, + "learning_rate": 4.423024199418396e-05, + "loss": 5.6716, + "step": 37102 + }, + { + "epoch": 0.22066205157484062, + "grad_norm": 1.973738431930542, + "learning_rate": 4.422994351651875e-05, + "loss": 5.1761, + "step": 37103 + }, + { + "epoch": 0.2206679988581216, + "grad_norm": 2.4407904148101807, + "learning_rate": 4.4229645032140574e-05, + "loss": 3.4594, + "step": 37104 + }, + { + "epoch": 0.2206739461414026, + "grad_norm": 2.4867618083953857, + "learning_rate": 4.4229346541049525e-05, + "loss": 3.3473, + "step": 37105 + }, + { + "epoch": 0.2206798934246836, + "grad_norm": 2.196946144104004, + "learning_rate": 4.422904804324571e-05, + "loss": 3.3534, + "step": 37106 + }, + { + "epoch": 0.2206858407079646, + "grad_norm": 1.8784695863723755, + "learning_rate": 4.422874953872923e-05, + "loss": 4.8781, + "step": 37107 + }, + { + "epoch": 0.22069178799124559, + "grad_norm": 1.9972692728042603, + "learning_rate": 4.4228451027500196e-05, + "loss": 4.244, + "step": 37108 + }, + { + "epoch": 0.2206977352745266, + "grad_norm": 1.573676586151123, + "learning_rate": 4.4228152509558704e-05, + "loss": 4.7457, + "step": 37109 + }, + { + "epoch": 0.2207036825578076, + "grad_norm": 1.4594526290893555, + "learning_rate": 4.422785398490487e-05, + "loss": 4.6177, + "step": 37110 + }, + { + "epoch": 0.22070962984108858, + "grad_norm": 1.6894400119781494, + "learning_rate": 4.4227555453538784e-05, + "loss": 4.9041, + "step": 37111 + }, + { + "epoch": 0.2207155771243696, + "grad_norm": 1.7728346586227417, + "learning_rate": 4.422725691546056e-05, + "loss": 4.9142, + "step": 37112 + }, + { + "epoch": 0.22072152440765058, + "grad_norm": 1.8219943046569824, + "learning_rate": 4.42269583706703e-05, + "loss": 5.0506, + "step": 37113 + }, + { + "epoch": 0.22072747169093157, + "grad_norm": 1.455910086631775, + "learning_rate": 4.422665981916811e-05, + "loss": 5.5078, + "step": 37114 + }, + { + "epoch": 0.2207334189742126, + "grad_norm": 1.5510808229446411, + "learning_rate": 4.422636126095409e-05, + "loss": 4.6677, + "step": 37115 + }, + { + "epoch": 0.22073936625749357, + "grad_norm": 2.251783609390259, + "learning_rate": 4.4226062696028334e-05, + "loss": 3.674, + "step": 37116 + }, + { + "epoch": 0.22074531354077456, + "grad_norm": 2.697303295135498, + "learning_rate": 4.422576412439098e-05, + "loss": 2.9135, + "step": 37117 + }, + { + "epoch": 0.22075126082405558, + "grad_norm": 2.8013083934783936, + "learning_rate": 4.4225465546042096e-05, + "loss": 2.7409, + "step": 37118 + }, + { + "epoch": 0.22075720810733657, + "grad_norm": 2.6208369731903076, + "learning_rate": 4.42251669609818e-05, + "loss": 3.1612, + "step": 37119 + }, + { + "epoch": 0.22076315539061755, + "grad_norm": 2.6360249519348145, + "learning_rate": 4.42248683692102e-05, + "loss": 3.3315, + "step": 37120 + }, + { + "epoch": 0.22076910267389857, + "grad_norm": 2.7215638160705566, + "learning_rate": 4.42245697707274e-05, + "loss": 3.0673, + "step": 37121 + }, + { + "epoch": 0.22077504995717956, + "grad_norm": 2.5598907470703125, + "learning_rate": 4.422427116553349e-05, + "loss": 2.9353, + "step": 37122 + }, + { + "epoch": 0.22078099724046055, + "grad_norm": 2.8053741455078125, + "learning_rate": 4.42239725536286e-05, + "loss": 3.1066, + "step": 37123 + }, + { + "epoch": 0.22078694452374156, + "grad_norm": 2.600266933441162, + "learning_rate": 4.422367393501281e-05, + "loss": 2.7168, + "step": 37124 + }, + { + "epoch": 0.22079289180702255, + "grad_norm": 2.486725091934204, + "learning_rate": 4.422337530968624e-05, + "loss": 2.6892, + "step": 37125 + }, + { + "epoch": 0.22079883909030354, + "grad_norm": 2.9207983016967773, + "learning_rate": 4.422307667764899e-05, + "loss": 2.3465, + "step": 37126 + }, + { + "epoch": 0.22080478637358456, + "grad_norm": 2.5730931758880615, + "learning_rate": 4.422277803890116e-05, + "loss": 2.3872, + "step": 37127 + }, + { + "epoch": 0.22081073365686554, + "grad_norm": 2.5988657474517822, + "learning_rate": 4.422247939344285e-05, + "loss": 3.0179, + "step": 37128 + }, + { + "epoch": 0.22081668094014653, + "grad_norm": 2.7020468711853027, + "learning_rate": 4.422218074127418e-05, + "loss": 2.1756, + "step": 37129 + }, + { + "epoch": 0.22082262822342755, + "grad_norm": 2.697157144546509, + "learning_rate": 4.422188208239524e-05, + "loss": 2.1083, + "step": 37130 + }, + { + "epoch": 0.22082857550670854, + "grad_norm": 2.241924285888672, + "learning_rate": 4.422158341680614e-05, + "loss": 3.8524, + "step": 37131 + }, + { + "epoch": 0.22083452278998952, + "grad_norm": 1.8899613618850708, + "learning_rate": 4.422128474450699e-05, + "loss": 4.1308, + "step": 37132 + }, + { + "epoch": 0.22084047007327054, + "grad_norm": 1.6082144975662231, + "learning_rate": 4.4220986065497884e-05, + "loss": 5.5055, + "step": 37133 + }, + { + "epoch": 0.22084641735655153, + "grad_norm": 1.8562133312225342, + "learning_rate": 4.4220687379778924e-05, + "loss": 5.2945, + "step": 37134 + }, + { + "epoch": 0.22085236463983252, + "grad_norm": 1.9147547483444214, + "learning_rate": 4.422038868735022e-05, + "loss": 5.5291, + "step": 37135 + }, + { + "epoch": 0.22085831192311353, + "grad_norm": 1.6603139638900757, + "learning_rate": 4.422008998821189e-05, + "loss": 5.4428, + "step": 37136 + }, + { + "epoch": 0.22086425920639452, + "grad_norm": 1.6215204000473022, + "learning_rate": 4.421979128236401e-05, + "loss": 5.343, + "step": 37137 + }, + { + "epoch": 0.2208702064896755, + "grad_norm": 1.7748491764068604, + "learning_rate": 4.421949256980671e-05, + "loss": 5.1708, + "step": 37138 + }, + { + "epoch": 0.22087615377295652, + "grad_norm": 1.7499853372573853, + "learning_rate": 4.421919385054008e-05, + "loss": 5.3233, + "step": 37139 + }, + { + "epoch": 0.2208821010562375, + "grad_norm": 1.7295137643814087, + "learning_rate": 4.421889512456423e-05, + "loss": 5.3818, + "step": 37140 + }, + { + "epoch": 0.2208880483395185, + "grad_norm": 1.611734390258789, + "learning_rate": 4.4218596391879264e-05, + "loss": 5.1509, + "step": 37141 + }, + { + "epoch": 0.22089399562279952, + "grad_norm": 1.661490797996521, + "learning_rate": 4.421829765248528e-05, + "loss": 5.1012, + "step": 37142 + }, + { + "epoch": 0.2208999429060805, + "grad_norm": 1.471693992614746, + "learning_rate": 4.4217998906382395e-05, + "loss": 5.2618, + "step": 37143 + }, + { + "epoch": 0.2209058901893615, + "grad_norm": 1.8665975332260132, + "learning_rate": 4.4217700153570694e-05, + "loss": 5.0267, + "step": 37144 + }, + { + "epoch": 0.2209118374726425, + "grad_norm": 1.7666631937026978, + "learning_rate": 4.42174013940503e-05, + "loss": 5.1182, + "step": 37145 + }, + { + "epoch": 0.2209177847559235, + "grad_norm": 1.6956653594970703, + "learning_rate": 4.421710262782131e-05, + "loss": 5.1901, + "step": 37146 + }, + { + "epoch": 0.22092373203920448, + "grad_norm": 1.8353193998336792, + "learning_rate": 4.4216803854883826e-05, + "loss": 5.2775, + "step": 37147 + }, + { + "epoch": 0.2209296793224855, + "grad_norm": 1.8415271043777466, + "learning_rate": 4.4216505075237945e-05, + "loss": 5.1925, + "step": 37148 + }, + { + "epoch": 0.2209356266057665, + "grad_norm": 1.7486032247543335, + "learning_rate": 4.4216206288883794e-05, + "loss": 5.1695, + "step": 37149 + }, + { + "epoch": 0.22094157388904748, + "grad_norm": 1.6249213218688965, + "learning_rate": 4.421590749582146e-05, + "loss": 5.3091, + "step": 37150 + }, + { + "epoch": 0.2209475211723285, + "grad_norm": 1.5573538541793823, + "learning_rate": 4.4215608696051045e-05, + "loss": 5.2265, + "step": 37151 + }, + { + "epoch": 0.22095346845560948, + "grad_norm": 1.4847667217254639, + "learning_rate": 4.421530988957267e-05, + "loss": 5.2365, + "step": 37152 + }, + { + "epoch": 0.22095941573889047, + "grad_norm": 1.5954338312149048, + "learning_rate": 4.421501107638643e-05, + "loss": 5.0707, + "step": 37153 + }, + { + "epoch": 0.22096536302217148, + "grad_norm": 1.6025676727294922, + "learning_rate": 4.421471225649242e-05, + "loss": 5.1383, + "step": 37154 + }, + { + "epoch": 0.22097131030545247, + "grad_norm": 2.082498550415039, + "learning_rate": 4.421441342989075e-05, + "loss": 4.5216, + "step": 37155 + }, + { + "epoch": 0.22097725758873346, + "grad_norm": 2.626246690750122, + "learning_rate": 4.421411459658153e-05, + "loss": 3.4209, + "step": 37156 + }, + { + "epoch": 0.22098320487201448, + "grad_norm": 3.0174107551574707, + "learning_rate": 4.421381575656486e-05, + "loss": 3.3856, + "step": 37157 + }, + { + "epoch": 0.22098915215529547, + "grad_norm": 2.355088472366333, + "learning_rate": 4.421351690984084e-05, + "loss": 3.1049, + "step": 37158 + }, + { + "epoch": 0.22099509943857645, + "grad_norm": 2.104196310043335, + "learning_rate": 4.42132180564096e-05, + "loss": 3.2537, + "step": 37159 + }, + { + "epoch": 0.22100104672185747, + "grad_norm": 2.216770648956299, + "learning_rate": 4.4212919196271205e-05, + "loss": 3.1694, + "step": 37160 + }, + { + "epoch": 0.22100699400513846, + "grad_norm": 2.238762617111206, + "learning_rate": 4.421262032942579e-05, + "loss": 3.128, + "step": 37161 + }, + { + "epoch": 0.22101294128841945, + "grad_norm": 3.443631410598755, + "learning_rate": 4.421232145587344e-05, + "loss": 2.4831, + "step": 37162 + }, + { + "epoch": 0.22101888857170043, + "grad_norm": 3.6234066486358643, + "learning_rate": 4.421202257561427e-05, + "loss": 2.0318, + "step": 37163 + }, + { + "epoch": 0.22102483585498145, + "grad_norm": 3.405298948287964, + "learning_rate": 4.421172368864838e-05, + "loss": 2.1004, + "step": 37164 + }, + { + "epoch": 0.22103078313826244, + "grad_norm": 3.220759391784668, + "learning_rate": 4.4211424794975875e-05, + "loss": 2.2621, + "step": 37165 + }, + { + "epoch": 0.22103673042154343, + "grad_norm": 3.005004644393921, + "learning_rate": 4.4211125894596865e-05, + "loss": 2.1246, + "step": 37166 + }, + { + "epoch": 0.22104267770482444, + "grad_norm": 3.2884764671325684, + "learning_rate": 4.421082698751144e-05, + "loss": 2.0425, + "step": 37167 + }, + { + "epoch": 0.22104862498810543, + "grad_norm": 2.9863510131835938, + "learning_rate": 4.4210528073719727e-05, + "loss": 1.8929, + "step": 37168 + }, + { + "epoch": 0.22105457227138642, + "grad_norm": 2.6555914878845215, + "learning_rate": 4.421022915322181e-05, + "loss": 1.4331, + "step": 37169 + }, + { + "epoch": 0.22106051955466743, + "grad_norm": 2.6842329502105713, + "learning_rate": 4.4209930226017807e-05, + "loss": 1.6554, + "step": 37170 + }, + { + "epoch": 0.22106646683794842, + "grad_norm": 2.9295549392700195, + "learning_rate": 4.42096312921078e-05, + "loss": 3.4228, + "step": 37171 + }, + { + "epoch": 0.2210724141212294, + "grad_norm": 2.749258279800415, + "learning_rate": 4.420933235149192e-05, + "loss": 3.4164, + "step": 37172 + }, + { + "epoch": 0.22107836140451043, + "grad_norm": 2.6917507648468018, + "learning_rate": 4.420903340417026e-05, + "loss": 3.5214, + "step": 37173 + }, + { + "epoch": 0.22108430868779141, + "grad_norm": 2.447829484939575, + "learning_rate": 4.420873445014292e-05, + "loss": 3.428, + "step": 37174 + }, + { + "epoch": 0.2210902559710724, + "grad_norm": 1.9641824960708618, + "learning_rate": 4.420843548941002e-05, + "loss": 3.3165, + "step": 37175 + }, + { + "epoch": 0.22109620325435342, + "grad_norm": 2.1110525131225586, + "learning_rate": 4.4208136521971646e-05, + "loss": 3.434, + "step": 37176 + }, + { + "epoch": 0.2211021505376344, + "grad_norm": 1.9869229793548584, + "learning_rate": 4.4207837547827905e-05, + "loss": 4.4427, + "step": 37177 + }, + { + "epoch": 0.2211080978209154, + "grad_norm": 1.9617522954940796, + "learning_rate": 4.4207538566978915e-05, + "loss": 5.4059, + "step": 37178 + }, + { + "epoch": 0.2211140451041964, + "grad_norm": 2.4324228763580322, + "learning_rate": 4.420723957942477e-05, + "loss": 5.1003, + "step": 37179 + }, + { + "epoch": 0.2211199923874774, + "grad_norm": 1.8642364740371704, + "learning_rate": 4.420694058516557e-05, + "loss": 4.9225, + "step": 37180 + }, + { + "epoch": 0.2211259396707584, + "grad_norm": 1.636181116104126, + "learning_rate": 4.420664158420143e-05, + "loss": 4.8789, + "step": 37181 + }, + { + "epoch": 0.2211318869540394, + "grad_norm": 1.830492377281189, + "learning_rate": 4.420634257653245e-05, + "loss": 4.6706, + "step": 37182 + }, + { + "epoch": 0.2211378342373204, + "grad_norm": 1.7127333879470825, + "learning_rate": 4.420604356215874e-05, + "loss": 4.5773, + "step": 37183 + }, + { + "epoch": 0.22114378152060138, + "grad_norm": 1.882017731666565, + "learning_rate": 4.4205744541080394e-05, + "loss": 4.6778, + "step": 37184 + }, + { + "epoch": 0.2211497288038824, + "grad_norm": 1.7298130989074707, + "learning_rate": 4.420544551329752e-05, + "loss": 4.5224, + "step": 37185 + }, + { + "epoch": 0.22115567608716338, + "grad_norm": 1.6471002101898193, + "learning_rate": 4.420514647881022e-05, + "loss": 4.4796, + "step": 37186 + }, + { + "epoch": 0.22116162337044437, + "grad_norm": 1.6308108568191528, + "learning_rate": 4.420484743761861e-05, + "loss": 4.4928, + "step": 37187 + }, + { + "epoch": 0.2211675706537254, + "grad_norm": 1.8822065591812134, + "learning_rate": 4.420454838972278e-05, + "loss": 4.4417, + "step": 37188 + }, + { + "epoch": 0.22117351793700638, + "grad_norm": 1.8454277515411377, + "learning_rate": 4.420424933512284e-05, + "loss": 4.3258, + "step": 37189 + }, + { + "epoch": 0.22117946522028736, + "grad_norm": 1.7358025312423706, + "learning_rate": 4.42039502738189e-05, + "loss": 4.9944, + "step": 37190 + }, + { + "epoch": 0.22118541250356838, + "grad_norm": 1.7481547594070435, + "learning_rate": 4.420365120581106e-05, + "loss": 4.9499, + "step": 37191 + }, + { + "epoch": 0.22119135978684937, + "grad_norm": 2.411710500717163, + "learning_rate": 4.4203352131099416e-05, + "loss": 4.6692, + "step": 37192 + }, + { + "epoch": 0.22119730707013036, + "grad_norm": 1.3139026165008545, + "learning_rate": 4.420305304968408e-05, + "loss": 5.2405, + "step": 37193 + }, + { + "epoch": 0.22120325435341137, + "grad_norm": 2.4839398860931396, + "learning_rate": 4.4202753961565166e-05, + "loss": 4.192, + "step": 37194 + }, + { + "epoch": 0.22120920163669236, + "grad_norm": 2.9722938537597656, + "learning_rate": 4.4202454866742763e-05, + "loss": 3.9029, + "step": 37195 + }, + { + "epoch": 0.22121514891997335, + "grad_norm": 2.6020498275756836, + "learning_rate": 4.4202155765216976e-05, + "loss": 3.9087, + "step": 37196 + }, + { + "epoch": 0.22122109620325436, + "grad_norm": 2.354983329772949, + "learning_rate": 4.4201856656987926e-05, + "loss": 3.8118, + "step": 37197 + }, + { + "epoch": 0.22122704348653535, + "grad_norm": 2.4077634811401367, + "learning_rate": 4.42015575420557e-05, + "loss": 3.9617, + "step": 37198 + }, + { + "epoch": 0.22123299076981634, + "grad_norm": 2.160682201385498, + "learning_rate": 4.420125842042041e-05, + "loss": 3.8235, + "step": 37199 + }, + { + "epoch": 0.22123893805309736, + "grad_norm": 2.4441070556640625, + "learning_rate": 4.420095929208217e-05, + "loss": 4.5287, + "step": 37200 + }, + { + "epoch": 0.22124488533637834, + "grad_norm": 1.7624927759170532, + "learning_rate": 4.420066015704105e-05, + "loss": 5.0133, + "step": 37201 + }, + { + "epoch": 0.22125083261965933, + "grad_norm": 1.8188538551330566, + "learning_rate": 4.4200361015297196e-05, + "loss": 5.1234, + "step": 37202 + }, + { + "epoch": 0.22125677990294035, + "grad_norm": 1.8965922594070435, + "learning_rate": 4.420006186685069e-05, + "loss": 4.9296, + "step": 37203 + }, + { + "epoch": 0.22126272718622134, + "grad_norm": 1.711780309677124, + "learning_rate": 4.4199762711701646e-05, + "loss": 4.7628, + "step": 37204 + }, + { + "epoch": 0.22126867446950232, + "grad_norm": 2.455254077911377, + "learning_rate": 4.419946354985015e-05, + "loss": 3.7198, + "step": 37205 + }, + { + "epoch": 0.22127462175278334, + "grad_norm": 2.212334156036377, + "learning_rate": 4.419916438129632e-05, + "loss": 3.9509, + "step": 37206 + }, + { + "epoch": 0.22128056903606433, + "grad_norm": 2.515057325363159, + "learning_rate": 4.4198865206040275e-05, + "loss": 4.1821, + "step": 37207 + }, + { + "epoch": 0.22128651631934532, + "grad_norm": 2.1646876335144043, + "learning_rate": 4.4198566024082096e-05, + "loss": 3.7462, + "step": 37208 + }, + { + "epoch": 0.22129246360262633, + "grad_norm": 2.334415912628174, + "learning_rate": 4.4198266835421894e-05, + "loss": 3.8333, + "step": 37209 + }, + { + "epoch": 0.22129841088590732, + "grad_norm": 3.074552536010742, + "learning_rate": 4.419796764005978e-05, + "loss": 4.0844, + "step": 37210 + }, + { + "epoch": 0.2213043581691883, + "grad_norm": 1.8268340826034546, + "learning_rate": 4.419766843799585e-05, + "loss": 4.7415, + "step": 37211 + }, + { + "epoch": 0.22131030545246932, + "grad_norm": 2.2268385887145996, + "learning_rate": 4.419736922923021e-05, + "loss": 4.114, + "step": 37212 + }, + { + "epoch": 0.2213162527357503, + "grad_norm": 1.7792022228240967, + "learning_rate": 4.419707001376297e-05, + "loss": 4.9278, + "step": 37213 + }, + { + "epoch": 0.2213222000190313, + "grad_norm": 3.0060274600982666, + "learning_rate": 4.4196770791594236e-05, + "loss": 2.7797, + "step": 37214 + }, + { + "epoch": 0.22132814730231232, + "grad_norm": 2.738672971725464, + "learning_rate": 4.41964715627241e-05, + "loss": 3.0921, + "step": 37215 + }, + { + "epoch": 0.2213340945855933, + "grad_norm": 1.313151240348816, + "learning_rate": 4.419617232715267e-05, + "loss": 5.1684, + "step": 37216 + }, + { + "epoch": 0.2213400418688743, + "grad_norm": 1.3445955514907837, + "learning_rate": 4.419587308488007e-05, + "loss": 5.3816, + "step": 37217 + }, + { + "epoch": 0.2213459891521553, + "grad_norm": 1.6494323015213013, + "learning_rate": 4.419557383590638e-05, + "loss": 4.7054, + "step": 37218 + }, + { + "epoch": 0.2213519364354363, + "grad_norm": 1.4395712614059448, + "learning_rate": 4.419527458023171e-05, + "loss": 4.5004, + "step": 37219 + }, + { + "epoch": 0.22135788371871729, + "grad_norm": 1.4600639343261719, + "learning_rate": 4.419497531785617e-05, + "loss": 4.6431, + "step": 37220 + }, + { + "epoch": 0.22136383100199827, + "grad_norm": 1.544190764427185, + "learning_rate": 4.419467604877987e-05, + "loss": 4.933, + "step": 37221 + }, + { + "epoch": 0.2213697782852793, + "grad_norm": 1.767937421798706, + "learning_rate": 4.41943767730029e-05, + "loss": 4.8382, + "step": 37222 + }, + { + "epoch": 0.22137572556856028, + "grad_norm": 1.9524779319763184, + "learning_rate": 4.4194077490525373e-05, + "loss": 4.7597, + "step": 37223 + }, + { + "epoch": 0.22138167285184127, + "grad_norm": 1.614169955253601, + "learning_rate": 4.419377820134739e-05, + "loss": 4.9252, + "step": 37224 + }, + { + "epoch": 0.22138762013512228, + "grad_norm": 1.4424415826797485, + "learning_rate": 4.419347890546907e-05, + "loss": 5.3474, + "step": 37225 + }, + { + "epoch": 0.22139356741840327, + "grad_norm": 1.42082941532135, + "learning_rate": 4.419317960289049e-05, + "loss": 5.084, + "step": 37226 + }, + { + "epoch": 0.22139951470168426, + "grad_norm": 1.8457419872283936, + "learning_rate": 4.4192880293611774e-05, + "loss": 5.062, + "step": 37227 + }, + { + "epoch": 0.22140546198496527, + "grad_norm": 1.5780494213104248, + "learning_rate": 4.419258097763301e-05, + "loss": 4.9313, + "step": 37228 + }, + { + "epoch": 0.22141140926824626, + "grad_norm": 1.3056610822677612, + "learning_rate": 4.419228165495433e-05, + "loss": 4.8709, + "step": 37229 + }, + { + "epoch": 0.22141735655152725, + "grad_norm": 1.1669378280639648, + "learning_rate": 4.419198232557582e-05, + "loss": 4.871, + "step": 37230 + }, + { + "epoch": 0.22142330383480827, + "grad_norm": 1.4716078042984009, + "learning_rate": 4.4191682989497584e-05, + "loss": 4.9545, + "step": 37231 + }, + { + "epoch": 0.22142925111808925, + "grad_norm": 2.2933619022369385, + "learning_rate": 4.419138364671973e-05, + "loss": 4.8166, + "step": 37232 + }, + { + "epoch": 0.22143519840137024, + "grad_norm": 1.8404078483581543, + "learning_rate": 4.419108429724236e-05, + "loss": 5.0332, + "step": 37233 + }, + { + "epoch": 0.22144114568465126, + "grad_norm": 1.7566367387771606, + "learning_rate": 4.419078494106559e-05, + "loss": 5.1599, + "step": 37234 + }, + { + "epoch": 0.22144709296793225, + "grad_norm": 1.758940577507019, + "learning_rate": 4.419048557818951e-05, + "loss": 4.9106, + "step": 37235 + }, + { + "epoch": 0.22145304025121323, + "grad_norm": 1.363818645477295, + "learning_rate": 4.4190186208614224e-05, + "loss": 5.6113, + "step": 37236 + }, + { + "epoch": 0.22145898753449425, + "grad_norm": 1.3969568014144897, + "learning_rate": 4.4189886832339857e-05, + "loss": 5.5177, + "step": 37237 + }, + { + "epoch": 0.22146493481777524, + "grad_norm": 1.6910165548324585, + "learning_rate": 4.418958744936648e-05, + "loss": 4.9269, + "step": 37238 + }, + { + "epoch": 0.22147088210105623, + "grad_norm": 1.5654246807098389, + "learning_rate": 4.418928805969423e-05, + "loss": 5.0792, + "step": 37239 + }, + { + "epoch": 0.22147682938433724, + "grad_norm": 1.5806190967559814, + "learning_rate": 4.41889886633232e-05, + "loss": 4.8614, + "step": 37240 + }, + { + "epoch": 0.22148277666761823, + "grad_norm": 2.206296443939209, + "learning_rate": 4.418868926025347e-05, + "loss": 4.1954, + "step": 37241 + }, + { + "epoch": 0.22148872395089922, + "grad_norm": 1.6986488103866577, + "learning_rate": 4.418838985048519e-05, + "loss": 5.2841, + "step": 37242 + }, + { + "epoch": 0.22149467123418023, + "grad_norm": 1.815557599067688, + "learning_rate": 4.418809043401843e-05, + "loss": 5.0666, + "step": 37243 + }, + { + "epoch": 0.22150061851746122, + "grad_norm": 1.6166787147521973, + "learning_rate": 4.418779101085331e-05, + "loss": 5.0519, + "step": 37244 + }, + { + "epoch": 0.2215065658007422, + "grad_norm": 1.575146198272705, + "learning_rate": 4.4187491580989926e-05, + "loss": 4.9857, + "step": 37245 + }, + { + "epoch": 0.22151251308402323, + "grad_norm": 1.4937185049057007, + "learning_rate": 4.4187192144428395e-05, + "loss": 5.3216, + "step": 37246 + }, + { + "epoch": 0.22151846036730422, + "grad_norm": 1.5050480365753174, + "learning_rate": 4.4186892701168805e-05, + "loss": 5.5195, + "step": 37247 + }, + { + "epoch": 0.2215244076505852, + "grad_norm": 1.4928349256515503, + "learning_rate": 4.418659325121128e-05, + "loss": 5.1624, + "step": 37248 + }, + { + "epoch": 0.22153035493386622, + "grad_norm": 1.5524920225143433, + "learning_rate": 4.4186293794555904e-05, + "loss": 4.8779, + "step": 37249 + }, + { + "epoch": 0.2215363022171472, + "grad_norm": 2.0788793563842773, + "learning_rate": 4.4185994331202795e-05, + "loss": 5.0675, + "step": 37250 + }, + { + "epoch": 0.2215422495004282, + "grad_norm": 1.9269020557403564, + "learning_rate": 4.418569486115205e-05, + "loss": 4.9713, + "step": 37251 + }, + { + "epoch": 0.2215481967837092, + "grad_norm": 1.3561869859695435, + "learning_rate": 4.4185395384403784e-05, + "loss": 4.9614, + "step": 37252 + }, + { + "epoch": 0.2215541440669902, + "grad_norm": 1.6398110389709473, + "learning_rate": 4.4185095900958085e-05, + "loss": 5.0054, + "step": 37253 + }, + { + "epoch": 0.2215600913502712, + "grad_norm": 1.5096663236618042, + "learning_rate": 4.418479641081507e-05, + "loss": 5.2334, + "step": 37254 + }, + { + "epoch": 0.2215660386335522, + "grad_norm": 1.9203683137893677, + "learning_rate": 4.418449691397485e-05, + "loss": 4.7206, + "step": 37255 + }, + { + "epoch": 0.2215719859168332, + "grad_norm": 1.8168144226074219, + "learning_rate": 4.4184197410437514e-05, + "loss": 4.6005, + "step": 37256 + }, + { + "epoch": 0.22157793320011418, + "grad_norm": 1.7423299551010132, + "learning_rate": 4.4183897900203164e-05, + "loss": 4.8605, + "step": 37257 + }, + { + "epoch": 0.2215838804833952, + "grad_norm": 1.7743721008300781, + "learning_rate": 4.418359838327193e-05, + "loss": 4.8841, + "step": 37258 + }, + { + "epoch": 0.22158982776667618, + "grad_norm": 1.8115425109863281, + "learning_rate": 4.418329885964389e-05, + "loss": 4.7159, + "step": 37259 + }, + { + "epoch": 0.22159577504995717, + "grad_norm": 1.4087785482406616, + "learning_rate": 4.418299932931916e-05, + "loss": 4.8462, + "step": 37260 + }, + { + "epoch": 0.2216017223332382, + "grad_norm": 1.7165182828903198, + "learning_rate": 4.4182699792297844e-05, + "loss": 4.66, + "step": 37261 + }, + { + "epoch": 0.22160766961651918, + "grad_norm": 1.7734102010726929, + "learning_rate": 4.418240024858004e-05, + "loss": 5.1589, + "step": 37262 + }, + { + "epoch": 0.22161361689980016, + "grad_norm": 1.6220389604568481, + "learning_rate": 4.418210069816586e-05, + "loss": 5.4969, + "step": 37263 + }, + { + "epoch": 0.22161956418308118, + "grad_norm": 1.607691764831543, + "learning_rate": 4.4181801141055415e-05, + "loss": 5.2169, + "step": 37264 + }, + { + "epoch": 0.22162551146636217, + "grad_norm": 2.094848871231079, + "learning_rate": 4.418150157724879e-05, + "loss": 5.2492, + "step": 37265 + }, + { + "epoch": 0.22163145874964316, + "grad_norm": 1.8658332824707031, + "learning_rate": 4.418120200674611e-05, + "loss": 5.299, + "step": 37266 + }, + { + "epoch": 0.22163740603292417, + "grad_norm": 1.4364315271377563, + "learning_rate": 4.418090242954748e-05, + "loss": 5.0549, + "step": 37267 + }, + { + "epoch": 0.22164335331620516, + "grad_norm": 1.4865174293518066, + "learning_rate": 4.4180602845652975e-05, + "loss": 4.9178, + "step": 37268 + }, + { + "epoch": 0.22164930059948615, + "grad_norm": 1.584671974182129, + "learning_rate": 4.4180303255062724e-05, + "loss": 5.0584, + "step": 37269 + }, + { + "epoch": 0.22165524788276716, + "grad_norm": 1.6680519580841064, + "learning_rate": 4.4180003657776834e-05, + "loss": 5.4479, + "step": 37270 + }, + { + "epoch": 0.22166119516604815, + "grad_norm": 2.0023248195648193, + "learning_rate": 4.41797040537954e-05, + "loss": 3.9446, + "step": 37271 + }, + { + "epoch": 0.22166714244932914, + "grad_norm": 2.2941033840179443, + "learning_rate": 4.4179404443118534e-05, + "loss": 2.83, + "step": 37272 + }, + { + "epoch": 0.22167308973261016, + "grad_norm": 2.8651883602142334, + "learning_rate": 4.4179104825746335e-05, + "loss": 3.4193, + "step": 37273 + }, + { + "epoch": 0.22167903701589114, + "grad_norm": 2.1556551456451416, + "learning_rate": 4.4178805201678895e-05, + "loss": 4.4954, + "step": 37274 + }, + { + "epoch": 0.22168498429917213, + "grad_norm": 2.3173985481262207, + "learning_rate": 4.417850557091635e-05, + "loss": 5.131, + "step": 37275 + }, + { + "epoch": 0.22169093158245315, + "grad_norm": 1.8110771179199219, + "learning_rate": 4.417820593345878e-05, + "loss": 5.0599, + "step": 37276 + }, + { + "epoch": 0.22169687886573414, + "grad_norm": 1.6023890972137451, + "learning_rate": 4.417790628930629e-05, + "loss": 4.9864, + "step": 37277 + }, + { + "epoch": 0.22170282614901513, + "grad_norm": 1.5635809898376465, + "learning_rate": 4.4177606638459004e-05, + "loss": 5.6305, + "step": 37278 + }, + { + "epoch": 0.2217087734322961, + "grad_norm": 1.7081363201141357, + "learning_rate": 4.4177306980917e-05, + "loss": 4.4753, + "step": 37279 + }, + { + "epoch": 0.22171472071557713, + "grad_norm": 1.7905879020690918, + "learning_rate": 4.4177007316680404e-05, + "loss": 4.9362, + "step": 37280 + }, + { + "epoch": 0.22172066799885812, + "grad_norm": 1.5954350233078003, + "learning_rate": 4.4176707645749316e-05, + "loss": 5.0016, + "step": 37281 + }, + { + "epoch": 0.2217266152821391, + "grad_norm": 1.5420632362365723, + "learning_rate": 4.4176407968123834e-05, + "loss": 5.1209, + "step": 37282 + }, + { + "epoch": 0.22173256256542012, + "grad_norm": 1.4477598667144775, + "learning_rate": 4.417610828380406e-05, + "loss": 5.4703, + "step": 37283 + }, + { + "epoch": 0.2217385098487011, + "grad_norm": 1.7971065044403076, + "learning_rate": 4.417580859279011e-05, + "loss": 4.1857, + "step": 37284 + }, + { + "epoch": 0.2217444571319821, + "grad_norm": 1.6404802799224854, + "learning_rate": 4.417550889508208e-05, + "loss": 4.6683, + "step": 37285 + }, + { + "epoch": 0.2217504044152631, + "grad_norm": 1.6057367324829102, + "learning_rate": 4.417520919068009e-05, + "loss": 4.7164, + "step": 37286 + }, + { + "epoch": 0.2217563516985441, + "grad_norm": 1.6254706382751465, + "learning_rate": 4.4174909479584214e-05, + "loss": 4.9378, + "step": 37287 + }, + { + "epoch": 0.2217622989818251, + "grad_norm": 1.501516342163086, + "learning_rate": 4.417460976179459e-05, + "loss": 5.6384, + "step": 37288 + }, + { + "epoch": 0.2217682462651061, + "grad_norm": 1.5623992681503296, + "learning_rate": 4.417431003731131e-05, + "loss": 5.4863, + "step": 37289 + }, + { + "epoch": 0.2217741935483871, + "grad_norm": 1.533334493637085, + "learning_rate": 4.417401030613446e-05, + "loss": 5.4763, + "step": 37290 + }, + { + "epoch": 0.22178014083166808, + "grad_norm": 1.5613082647323608, + "learning_rate": 4.417371056826417e-05, + "loss": 5.5083, + "step": 37291 + }, + { + "epoch": 0.2217860881149491, + "grad_norm": 1.5319432020187378, + "learning_rate": 4.417341082370054e-05, + "loss": 5.4524, + "step": 37292 + }, + { + "epoch": 0.2217920353982301, + "grad_norm": 1.9295907020568848, + "learning_rate": 4.417311107244366e-05, + "loss": 5.2326, + "step": 37293 + }, + { + "epoch": 0.22179798268151107, + "grad_norm": 1.6446950435638428, + "learning_rate": 4.417281131449366e-05, + "loss": 5.2152, + "step": 37294 + }, + { + "epoch": 0.2218039299647921, + "grad_norm": 1.6639310121536255, + "learning_rate": 4.417251154985062e-05, + "loss": 5.2117, + "step": 37295 + }, + { + "epoch": 0.22180987724807308, + "grad_norm": 1.6263519525527954, + "learning_rate": 4.417221177851466e-05, + "loss": 5.2871, + "step": 37296 + }, + { + "epoch": 0.22181582453135407, + "grad_norm": 1.3505241870880127, + "learning_rate": 4.4171912000485874e-05, + "loss": 5.4459, + "step": 37297 + }, + { + "epoch": 0.22182177181463508, + "grad_norm": 1.5780766010284424, + "learning_rate": 4.4171612215764366e-05, + "loss": 5.4587, + "step": 37298 + }, + { + "epoch": 0.22182771909791607, + "grad_norm": 1.377548336982727, + "learning_rate": 4.4171312424350253e-05, + "loss": 5.0177, + "step": 37299 + }, + { + "epoch": 0.22183366638119706, + "grad_norm": 1.283535361289978, + "learning_rate": 4.417101262624363e-05, + "loss": 4.9686, + "step": 37300 + }, + { + "epoch": 0.22183961366447807, + "grad_norm": 1.591565489768982, + "learning_rate": 4.4170712821444604e-05, + "loss": 4.9452, + "step": 37301 + }, + { + "epoch": 0.22184556094775906, + "grad_norm": 1.7594454288482666, + "learning_rate": 4.417041300995329e-05, + "loss": 4.8756, + "step": 37302 + }, + { + "epoch": 0.22185150823104005, + "grad_norm": 1.743808388710022, + "learning_rate": 4.417011319176977e-05, + "loss": 4.8011, + "step": 37303 + }, + { + "epoch": 0.22185745551432107, + "grad_norm": 1.5689365863800049, + "learning_rate": 4.416981336689417e-05, + "loss": 4.4118, + "step": 37304 + }, + { + "epoch": 0.22186340279760206, + "grad_norm": 2.2633965015411377, + "learning_rate": 4.4169513535326585e-05, + "loss": 4.8006, + "step": 37305 + }, + { + "epoch": 0.22186935008088304, + "grad_norm": 2.0904433727264404, + "learning_rate": 4.416921369706712e-05, + "loss": 3.6457, + "step": 37306 + }, + { + "epoch": 0.22187529736416406, + "grad_norm": 2.4290525913238525, + "learning_rate": 4.4168913852115876e-05, + "loss": 3.5459, + "step": 37307 + }, + { + "epoch": 0.22188124464744505, + "grad_norm": 2.113612413406372, + "learning_rate": 4.416861400047297e-05, + "loss": 3.6094, + "step": 37308 + }, + { + "epoch": 0.22188719193072604, + "grad_norm": 1.9198821783065796, + "learning_rate": 4.416831414213849e-05, + "loss": 3.6966, + "step": 37309 + }, + { + "epoch": 0.22189313921400705, + "grad_norm": 2.143109083175659, + "learning_rate": 4.4168014277112554e-05, + "loss": 3.36, + "step": 37310 + }, + { + "epoch": 0.22189908649728804, + "grad_norm": 2.0741262435913086, + "learning_rate": 4.4167714405395267e-05, + "loss": 3.8931, + "step": 37311 + }, + { + "epoch": 0.22190503378056903, + "grad_norm": 1.7945109605789185, + "learning_rate": 4.416741452698673e-05, + "loss": 5.0497, + "step": 37312 + }, + { + "epoch": 0.22191098106385004, + "grad_norm": 1.7045809030532837, + "learning_rate": 4.4167114641887033e-05, + "loss": 5.1705, + "step": 37313 + }, + { + "epoch": 0.22191692834713103, + "grad_norm": 2.1909990310668945, + "learning_rate": 4.4166814750096305e-05, + "loss": 4.0491, + "step": 37314 + }, + { + "epoch": 0.22192287563041202, + "grad_norm": 1.9659631252288818, + "learning_rate": 4.416651485161464e-05, + "loss": 4.8632, + "step": 37315 + }, + { + "epoch": 0.22192882291369304, + "grad_norm": 2.046928644180298, + "learning_rate": 4.416621494644214e-05, + "loss": 4.8176, + "step": 37316 + }, + { + "epoch": 0.22193477019697402, + "grad_norm": 1.9225927591323853, + "learning_rate": 4.416591503457891e-05, + "loss": 5.1519, + "step": 37317 + }, + { + "epoch": 0.221940717480255, + "grad_norm": 1.9699875116348267, + "learning_rate": 4.416561511602506e-05, + "loss": 4.6214, + "step": 37318 + }, + { + "epoch": 0.22194666476353603, + "grad_norm": 1.7100906372070312, + "learning_rate": 4.416531519078069e-05, + "loss": 4.6123, + "step": 37319 + }, + { + "epoch": 0.22195261204681702, + "grad_norm": 2.0231447219848633, + "learning_rate": 4.416501525884591e-05, + "loss": 4.7937, + "step": 37320 + }, + { + "epoch": 0.221958559330098, + "grad_norm": 1.9513578414916992, + "learning_rate": 4.4164715320220814e-05, + "loss": 5.5448, + "step": 37321 + }, + { + "epoch": 0.22196450661337902, + "grad_norm": 1.7066813707351685, + "learning_rate": 4.416441537490552e-05, + "loss": 5.2223, + "step": 37322 + }, + { + "epoch": 0.22197045389666, + "grad_norm": 2.141442060470581, + "learning_rate": 4.416411542290013e-05, + "loss": 4.6248, + "step": 37323 + }, + { + "epoch": 0.221976401179941, + "grad_norm": 2.92130970954895, + "learning_rate": 4.416381546420474e-05, + "loss": 3.9375, + "step": 37324 + }, + { + "epoch": 0.221982348463222, + "grad_norm": 1.8970509767532349, + "learning_rate": 4.4163515498819464e-05, + "loss": 4.4692, + "step": 37325 + }, + { + "epoch": 0.221988295746503, + "grad_norm": 1.6135637760162354, + "learning_rate": 4.41632155267444e-05, + "loss": 4.2313, + "step": 37326 + }, + { + "epoch": 0.221994243029784, + "grad_norm": 1.5715364217758179, + "learning_rate": 4.4162915547979655e-05, + "loss": 4.7554, + "step": 37327 + }, + { + "epoch": 0.222000190313065, + "grad_norm": 2.162321090698242, + "learning_rate": 4.416261556252533e-05, + "loss": 4.9361, + "step": 37328 + }, + { + "epoch": 0.222006137596346, + "grad_norm": 1.5135966539382935, + "learning_rate": 4.416231557038154e-05, + "loss": 5.1464, + "step": 37329 + }, + { + "epoch": 0.22201208487962698, + "grad_norm": 1.588383436203003, + "learning_rate": 4.416201557154838e-05, + "loss": 5.0772, + "step": 37330 + }, + { + "epoch": 0.222018032162908, + "grad_norm": 1.5293753147125244, + "learning_rate": 4.416171556602596e-05, + "loss": 4.5423, + "step": 37331 + }, + { + "epoch": 0.22202397944618898, + "grad_norm": 1.4758036136627197, + "learning_rate": 4.416141555381439e-05, + "loss": 5.0102, + "step": 37332 + }, + { + "epoch": 0.22202992672946997, + "grad_norm": 1.5266300439834595, + "learning_rate": 4.4161115534913755e-05, + "loss": 4.7217, + "step": 37333 + }, + { + "epoch": 0.222035874012751, + "grad_norm": 1.528539776802063, + "learning_rate": 4.4160815509324184e-05, + "loss": 4.7979, + "step": 37334 + }, + { + "epoch": 0.22204182129603198, + "grad_norm": 1.58788001537323, + "learning_rate": 4.4160515477045764e-05, + "loss": 4.4164, + "step": 37335 + }, + { + "epoch": 0.22204776857931297, + "grad_norm": 1.5381730794906616, + "learning_rate": 4.416021543807861e-05, + "loss": 4.9237, + "step": 37336 + }, + { + "epoch": 0.22205371586259395, + "grad_norm": 1.800580620765686, + "learning_rate": 4.4159915392422814e-05, + "loss": 4.7554, + "step": 37337 + }, + { + "epoch": 0.22205966314587497, + "grad_norm": 1.5559518337249756, + "learning_rate": 4.4159615340078495e-05, + "loss": 4.8953, + "step": 37338 + }, + { + "epoch": 0.22206561042915596, + "grad_norm": 1.968245029449463, + "learning_rate": 4.415931528104575e-05, + "loss": 4.4203, + "step": 37339 + }, + { + "epoch": 0.22207155771243695, + "grad_norm": 1.6635748147964478, + "learning_rate": 4.4159015215324696e-05, + "loss": 4.3885, + "step": 37340 + }, + { + "epoch": 0.22207750499571796, + "grad_norm": 1.7728335857391357, + "learning_rate": 4.415871514291542e-05, + "loss": 3.8601, + "step": 37341 + }, + { + "epoch": 0.22208345227899895, + "grad_norm": 1.3408424854278564, + "learning_rate": 4.4158415063818025e-05, + "loss": 3.8574, + "step": 37342 + }, + { + "epoch": 0.22208939956227994, + "grad_norm": 1.610424280166626, + "learning_rate": 4.415811497803264e-05, + "loss": 4.7664, + "step": 37343 + }, + { + "epoch": 0.22209534684556095, + "grad_norm": 1.6959972381591797, + "learning_rate": 4.415781488555935e-05, + "loss": 4.9161, + "step": 37344 + }, + { + "epoch": 0.22210129412884194, + "grad_norm": 1.5437208414077759, + "learning_rate": 4.415751478639826e-05, + "loss": 4.9545, + "step": 37345 + }, + { + "epoch": 0.22210724141212293, + "grad_norm": 1.6301335096359253, + "learning_rate": 4.4157214680549485e-05, + "loss": 4.4827, + "step": 37346 + }, + { + "epoch": 0.22211318869540395, + "grad_norm": 1.9365746974945068, + "learning_rate": 4.415691456801313e-05, + "loss": 4.0574, + "step": 37347 + }, + { + "epoch": 0.22211913597868493, + "grad_norm": 1.971279501914978, + "learning_rate": 4.415661444878928e-05, + "loss": 3.8612, + "step": 37348 + }, + { + "epoch": 0.22212508326196592, + "grad_norm": 1.8018229007720947, + "learning_rate": 4.4156314322878064e-05, + "loss": 4.1403, + "step": 37349 + }, + { + "epoch": 0.22213103054524694, + "grad_norm": 1.7496007680892944, + "learning_rate": 4.4156014190279576e-05, + "loss": 4.2662, + "step": 37350 + }, + { + "epoch": 0.22213697782852793, + "grad_norm": 2.0904650688171387, + "learning_rate": 4.415571405099391e-05, + "loss": 4.4694, + "step": 37351 + }, + { + "epoch": 0.22214292511180891, + "grad_norm": 1.7494895458221436, + "learning_rate": 4.41554139050212e-05, + "loss": 4.4199, + "step": 37352 + }, + { + "epoch": 0.22214887239508993, + "grad_norm": 1.7014201879501343, + "learning_rate": 4.415511375236152e-05, + "loss": 4.5209, + "step": 37353 + }, + { + "epoch": 0.22215481967837092, + "grad_norm": 1.5451538562774658, + "learning_rate": 4.415481359301499e-05, + "loss": 4.2152, + "step": 37354 + }, + { + "epoch": 0.2221607669616519, + "grad_norm": 1.3573757410049438, + "learning_rate": 4.4154513426981714e-05, + "loss": 3.6105, + "step": 37355 + }, + { + "epoch": 0.22216671424493292, + "grad_norm": 1.498342752456665, + "learning_rate": 4.41542132542618e-05, + "loss": 3.9755, + "step": 37356 + }, + { + "epoch": 0.2221726615282139, + "grad_norm": 1.7153942584991455, + "learning_rate": 4.4153913074855344e-05, + "loss": 4.0452, + "step": 37357 + }, + { + "epoch": 0.2221786088114949, + "grad_norm": 1.6392310857772827, + "learning_rate": 4.4153612888762455e-05, + "loss": 4.6287, + "step": 37358 + }, + { + "epoch": 0.22218455609477591, + "grad_norm": 1.8928215503692627, + "learning_rate": 4.415331269598324e-05, + "loss": 4.587, + "step": 37359 + }, + { + "epoch": 0.2221905033780569, + "grad_norm": 1.5934067964553833, + "learning_rate": 4.415301249651779e-05, + "loss": 4.626, + "step": 37360 + }, + { + "epoch": 0.2221964506613379, + "grad_norm": 1.2099053859710693, + "learning_rate": 4.415271229036623e-05, + "loss": 4.7737, + "step": 37361 + }, + { + "epoch": 0.2222023979446189, + "grad_norm": 1.5044233798980713, + "learning_rate": 4.415241207752866e-05, + "loss": 4.94, + "step": 37362 + }, + { + "epoch": 0.2222083452278999, + "grad_norm": 1.8237147331237793, + "learning_rate": 4.415211185800517e-05, + "loss": 4.2119, + "step": 37363 + }, + { + "epoch": 0.22221429251118088, + "grad_norm": 1.3939549922943115, + "learning_rate": 4.415181163179589e-05, + "loss": 5.1684, + "step": 37364 + }, + { + "epoch": 0.2222202397944619, + "grad_norm": 1.8115434646606445, + "learning_rate": 4.41515113989009e-05, + "loss": 4.5456, + "step": 37365 + }, + { + "epoch": 0.2222261870777429, + "grad_norm": 1.6453301906585693, + "learning_rate": 4.415121115932031e-05, + "loss": 4.6794, + "step": 37366 + }, + { + "epoch": 0.22223213436102388, + "grad_norm": 1.6238987445831299, + "learning_rate": 4.4150910913054244e-05, + "loss": 4.7563, + "step": 37367 + }, + { + "epoch": 0.2222380816443049, + "grad_norm": 1.9033849239349365, + "learning_rate": 4.415061066010279e-05, + "loss": 4.5263, + "step": 37368 + }, + { + "epoch": 0.22224402892758588, + "grad_norm": 1.6006360054016113, + "learning_rate": 4.415031040046605e-05, + "loss": 4.7523, + "step": 37369 + }, + { + "epoch": 0.22224997621086687, + "grad_norm": 1.5096614360809326, + "learning_rate": 4.415001013414414e-05, + "loss": 4.872, + "step": 37370 + }, + { + "epoch": 0.22225592349414788, + "grad_norm": 1.7860287427902222, + "learning_rate": 4.414970986113716e-05, + "loss": 4.4742, + "step": 37371 + }, + { + "epoch": 0.22226187077742887, + "grad_norm": 1.6604384183883667, + "learning_rate": 4.41494095814452e-05, + "loss": 4.7356, + "step": 37372 + }, + { + "epoch": 0.22226781806070986, + "grad_norm": 1.7952136993408203, + "learning_rate": 4.414910929506839e-05, + "loss": 4.5291, + "step": 37373 + }, + { + "epoch": 0.22227376534399088, + "grad_norm": 1.5343014001846313, + "learning_rate": 4.414880900200682e-05, + "loss": 4.6148, + "step": 37374 + }, + { + "epoch": 0.22227971262727186, + "grad_norm": 1.661389946937561, + "learning_rate": 4.4148508702260605e-05, + "loss": 4.071, + "step": 37375 + }, + { + "epoch": 0.22228565991055285, + "grad_norm": 1.4415347576141357, + "learning_rate": 4.414820839582984e-05, + "loss": 4.4066, + "step": 37376 + }, + { + "epoch": 0.22229160719383387, + "grad_norm": 1.6499462127685547, + "learning_rate": 4.414790808271464e-05, + "loss": 4.3395, + "step": 37377 + }, + { + "epoch": 0.22229755447711486, + "grad_norm": 1.5493072271347046, + "learning_rate": 4.414760776291509e-05, + "loss": 4.3965, + "step": 37378 + }, + { + "epoch": 0.22230350176039584, + "grad_norm": 1.5924429893493652, + "learning_rate": 4.4147307436431316e-05, + "loss": 4.4357, + "step": 37379 + }, + { + "epoch": 0.22230944904367686, + "grad_norm": 1.7015823125839233, + "learning_rate": 4.4147007103263415e-05, + "loss": 4.2155, + "step": 37380 + }, + { + "epoch": 0.22231539632695785, + "grad_norm": 1.7009806632995605, + "learning_rate": 4.414670676341149e-05, + "loss": 4.6103, + "step": 37381 + }, + { + "epoch": 0.22232134361023884, + "grad_norm": 1.572592854499817, + "learning_rate": 4.414640641687564e-05, + "loss": 4.6888, + "step": 37382 + }, + { + "epoch": 0.22232729089351985, + "grad_norm": 1.8123164176940918, + "learning_rate": 4.414610606365599e-05, + "loss": 4.3521, + "step": 37383 + }, + { + "epoch": 0.22233323817680084, + "grad_norm": 1.926174521446228, + "learning_rate": 4.414580570375262e-05, + "loss": 4.2927, + "step": 37384 + }, + { + "epoch": 0.22233918546008183, + "grad_norm": 1.7663146257400513, + "learning_rate": 4.414550533716566e-05, + "loss": 4.4666, + "step": 37385 + }, + { + "epoch": 0.22234513274336284, + "grad_norm": 2.544118881225586, + "learning_rate": 4.414520496389519e-05, + "loss": 2.8958, + "step": 37386 + }, + { + "epoch": 0.22235108002664383, + "grad_norm": 1.5476171970367432, + "learning_rate": 4.414490458394134e-05, + "loss": 4.5871, + "step": 37387 + }, + { + "epoch": 0.22235702730992482, + "grad_norm": 1.7039881944656372, + "learning_rate": 4.414460419730419e-05, + "loss": 4.8665, + "step": 37388 + }, + { + "epoch": 0.22236297459320584, + "grad_norm": 1.6667733192443848, + "learning_rate": 4.414430380398386e-05, + "loss": 4.4097, + "step": 37389 + }, + { + "epoch": 0.22236892187648682, + "grad_norm": 1.839328408241272, + "learning_rate": 4.414400340398045e-05, + "loss": 4.3882, + "step": 37390 + }, + { + "epoch": 0.2223748691597678, + "grad_norm": 1.97493314743042, + "learning_rate": 4.4143702997294066e-05, + "loss": 4.1716, + "step": 37391 + }, + { + "epoch": 0.22238081644304883, + "grad_norm": 1.4891178607940674, + "learning_rate": 4.414340258392482e-05, + "loss": 4.6106, + "step": 37392 + }, + { + "epoch": 0.22238676372632982, + "grad_norm": 1.6419004201889038, + "learning_rate": 4.414310216387281e-05, + "loss": 4.6838, + "step": 37393 + }, + { + "epoch": 0.2223927110096108, + "grad_norm": 1.5355687141418457, + "learning_rate": 4.414280173713813e-05, + "loss": 4.5691, + "step": 37394 + }, + { + "epoch": 0.2223986582928918, + "grad_norm": 1.6541396379470825, + "learning_rate": 4.4142501303720904e-05, + "loss": 4.7063, + "step": 37395 + }, + { + "epoch": 0.2224046055761728, + "grad_norm": 1.459181308746338, + "learning_rate": 4.4142200863621226e-05, + "loss": 4.9176, + "step": 37396 + }, + { + "epoch": 0.2224105528594538, + "grad_norm": 1.7777023315429688, + "learning_rate": 4.4141900416839196e-05, + "loss": 4.4156, + "step": 37397 + }, + { + "epoch": 0.22241650014273479, + "grad_norm": 2.0304362773895264, + "learning_rate": 4.4141599963374944e-05, + "loss": 4.2872, + "step": 37398 + }, + { + "epoch": 0.2224224474260158, + "grad_norm": 1.6349958181381226, + "learning_rate": 4.414129950322854e-05, + "loss": 4.6948, + "step": 37399 + }, + { + "epoch": 0.2224283947092968, + "grad_norm": 2.002707004547119, + "learning_rate": 4.4140999036400116e-05, + "loss": 4.0864, + "step": 37400 + }, + { + "epoch": 0.22243434199257778, + "grad_norm": 1.1629236936569214, + "learning_rate": 4.4140698562889765e-05, + "loss": 5.0643, + "step": 37401 + }, + { + "epoch": 0.2224402892758588, + "grad_norm": 1.2741730213165283, + "learning_rate": 4.414039808269759e-05, + "loss": 4.8496, + "step": 37402 + }, + { + "epoch": 0.22244623655913978, + "grad_norm": 1.4503839015960693, + "learning_rate": 4.41400975958237e-05, + "loss": 5.0413, + "step": 37403 + }, + { + "epoch": 0.22245218384242077, + "grad_norm": 1.4399816989898682, + "learning_rate": 4.413979710226821e-05, + "loss": 5.1163, + "step": 37404 + }, + { + "epoch": 0.22245813112570179, + "grad_norm": 1.431563138961792, + "learning_rate": 4.4139496602031204e-05, + "loss": 4.866, + "step": 37405 + }, + { + "epoch": 0.22246407840898277, + "grad_norm": 1.7766673564910889, + "learning_rate": 4.41391960951128e-05, + "loss": 5.1027, + "step": 37406 + }, + { + "epoch": 0.22247002569226376, + "grad_norm": 1.546772837638855, + "learning_rate": 4.41388955815131e-05, + "loss": 4.9627, + "step": 37407 + }, + { + "epoch": 0.22247597297554478, + "grad_norm": 1.5983178615570068, + "learning_rate": 4.413859506123221e-05, + "loss": 5.0733, + "step": 37408 + }, + { + "epoch": 0.22248192025882577, + "grad_norm": 1.9304602146148682, + "learning_rate": 4.4138294534270234e-05, + "loss": 4.9825, + "step": 37409 + }, + { + "epoch": 0.22248786754210675, + "grad_norm": 1.861481785774231, + "learning_rate": 4.413799400062728e-05, + "loss": 4.7403, + "step": 37410 + }, + { + "epoch": 0.22249381482538777, + "grad_norm": 1.5608370304107666, + "learning_rate": 4.413769346030345e-05, + "loss": 5.0014, + "step": 37411 + }, + { + "epoch": 0.22249976210866876, + "grad_norm": 1.7520523071289062, + "learning_rate": 4.413739291329884e-05, + "loss": 4.8831, + "step": 37412 + }, + { + "epoch": 0.22250570939194975, + "grad_norm": 1.58255136013031, + "learning_rate": 4.413709235961358e-05, + "loss": 4.8975, + "step": 37413 + }, + { + "epoch": 0.22251165667523076, + "grad_norm": 1.6198471784591675, + "learning_rate": 4.413679179924774e-05, + "loss": 4.9793, + "step": 37414 + }, + { + "epoch": 0.22251760395851175, + "grad_norm": 1.5712491273880005, + "learning_rate": 4.4136491232201454e-05, + "loss": 5.107, + "step": 37415 + }, + { + "epoch": 0.22252355124179274, + "grad_norm": 1.3748947381973267, + "learning_rate": 4.413619065847482e-05, + "loss": 5.0109, + "step": 37416 + }, + { + "epoch": 0.22252949852507375, + "grad_norm": 1.292171597480774, + "learning_rate": 4.4135890078067935e-05, + "loss": 4.7851, + "step": 37417 + }, + { + "epoch": 0.22253544580835474, + "grad_norm": 1.2264519929885864, + "learning_rate": 4.413558949098091e-05, + "loss": 4.7857, + "step": 37418 + }, + { + "epoch": 0.22254139309163573, + "grad_norm": 1.3294142484664917, + "learning_rate": 4.413528889721385e-05, + "loss": 4.6828, + "step": 37419 + }, + { + "epoch": 0.22254734037491675, + "grad_norm": 1.415412425994873, + "learning_rate": 4.413498829676685e-05, + "loss": 4.7359, + "step": 37420 + }, + { + "epoch": 0.22255328765819773, + "grad_norm": 1.5666321516036987, + "learning_rate": 4.4134687689640016e-05, + "loss": 4.8958, + "step": 37421 + }, + { + "epoch": 0.22255923494147872, + "grad_norm": 1.5865098237991333, + "learning_rate": 4.4134387075833484e-05, + "loss": 4.7827, + "step": 37422 + }, + { + "epoch": 0.22256518222475974, + "grad_norm": 1.4719741344451904, + "learning_rate": 4.4134086455347325e-05, + "loss": 4.673, + "step": 37423 + }, + { + "epoch": 0.22257112950804073, + "grad_norm": 1.574626088142395, + "learning_rate": 4.413378582818165e-05, + "loss": 4.6705, + "step": 37424 + }, + { + "epoch": 0.22257707679132172, + "grad_norm": 1.4533343315124512, + "learning_rate": 4.413348519433657e-05, + "loss": 4.5065, + "step": 37425 + }, + { + "epoch": 0.22258302407460273, + "grad_norm": 1.488586187362671, + "learning_rate": 4.413318455381219e-05, + "loss": 4.5226, + "step": 37426 + }, + { + "epoch": 0.22258897135788372, + "grad_norm": 1.7579782009124756, + "learning_rate": 4.4132883906608616e-05, + "loss": 4.6338, + "step": 37427 + }, + { + "epoch": 0.2225949186411647, + "grad_norm": 1.8849931955337524, + "learning_rate": 4.413258325272594e-05, + "loss": 4.6397, + "step": 37428 + }, + { + "epoch": 0.22260086592444572, + "grad_norm": 1.6533501148223877, + "learning_rate": 4.4132282592164286e-05, + "loss": 5.01, + "step": 37429 + }, + { + "epoch": 0.2226068132077267, + "grad_norm": 1.648901104927063, + "learning_rate": 4.4131981924923744e-05, + "loss": 4.9204, + "step": 37430 + }, + { + "epoch": 0.2226127604910077, + "grad_norm": 1.7030214071273804, + "learning_rate": 4.413168125100443e-05, + "loss": 4.839, + "step": 37431 + }, + { + "epoch": 0.22261870777428872, + "grad_norm": 1.5959028005599976, + "learning_rate": 4.413138057040644e-05, + "loss": 4.8709, + "step": 37432 + }, + { + "epoch": 0.2226246550575697, + "grad_norm": 1.9714707136154175, + "learning_rate": 4.413107988312988e-05, + "loss": 4.7504, + "step": 37433 + }, + { + "epoch": 0.2226306023408507, + "grad_norm": 1.6847248077392578, + "learning_rate": 4.4130779189174865e-05, + "loss": 4.8802, + "step": 37434 + }, + { + "epoch": 0.2226365496241317, + "grad_norm": 1.743908405303955, + "learning_rate": 4.4130478488541486e-05, + "loss": 4.8833, + "step": 37435 + }, + { + "epoch": 0.2226424969074127, + "grad_norm": 1.3226217031478882, + "learning_rate": 4.4130177781229855e-05, + "loss": 4.9731, + "step": 37436 + }, + { + "epoch": 0.22264844419069368, + "grad_norm": 1.6143287420272827, + "learning_rate": 4.412987706724008e-05, + "loss": 4.8707, + "step": 37437 + }, + { + "epoch": 0.2226543914739747, + "grad_norm": 1.3847980499267578, + "learning_rate": 4.4129576346572264e-05, + "loss": 5.3281, + "step": 37438 + }, + { + "epoch": 0.2226603387572557, + "grad_norm": 1.5923258066177368, + "learning_rate": 4.412927561922651e-05, + "loss": 4.7546, + "step": 37439 + }, + { + "epoch": 0.22266628604053668, + "grad_norm": 1.4530616998672485, + "learning_rate": 4.4128974885202914e-05, + "loss": 4.5942, + "step": 37440 + }, + { + "epoch": 0.2226722333238177, + "grad_norm": 1.5023390054702759, + "learning_rate": 4.4128674144501604e-05, + "loss": 4.7337, + "step": 37441 + }, + { + "epoch": 0.22267818060709868, + "grad_norm": 2.181118965148926, + "learning_rate": 4.4128373397122665e-05, + "loss": 3.9208, + "step": 37442 + }, + { + "epoch": 0.22268412789037967, + "grad_norm": 1.8951972723007202, + "learning_rate": 4.412807264306621e-05, + "loss": 3.7684, + "step": 37443 + }, + { + "epoch": 0.22269007517366068, + "grad_norm": 1.570377230644226, + "learning_rate": 4.412777188233234e-05, + "loss": 5.1016, + "step": 37444 + }, + { + "epoch": 0.22269602245694167, + "grad_norm": 1.399253487586975, + "learning_rate": 4.412747111492116e-05, + "loss": 5.2181, + "step": 37445 + }, + { + "epoch": 0.22270196974022266, + "grad_norm": 1.4472614526748657, + "learning_rate": 4.412717034083279e-05, + "loss": 4.9678, + "step": 37446 + }, + { + "epoch": 0.22270791702350368, + "grad_norm": 1.7032182216644287, + "learning_rate": 4.412686956006731e-05, + "loss": 5.055, + "step": 37447 + }, + { + "epoch": 0.22271386430678466, + "grad_norm": 2.5398552417755127, + "learning_rate": 4.412656877262484e-05, + "loss": 4.0731, + "step": 37448 + }, + { + "epoch": 0.22271981159006565, + "grad_norm": 1.3599528074264526, + "learning_rate": 4.4126267978505486e-05, + "loss": 5.2592, + "step": 37449 + }, + { + "epoch": 0.22272575887334667, + "grad_norm": 1.395141839981079, + "learning_rate": 4.412596717770935e-05, + "loss": 5.4062, + "step": 37450 + }, + { + "epoch": 0.22273170615662766, + "grad_norm": 1.623476505279541, + "learning_rate": 4.4125666370236526e-05, + "loss": 4.8599, + "step": 37451 + }, + { + "epoch": 0.22273765343990864, + "grad_norm": 1.533883810043335, + "learning_rate": 4.412536555608714e-05, + "loss": 4.8718, + "step": 37452 + }, + { + "epoch": 0.22274360072318963, + "grad_norm": 1.4520567655563354, + "learning_rate": 4.412506473526128e-05, + "loss": 5.0113, + "step": 37453 + }, + { + "epoch": 0.22274954800647065, + "grad_norm": 1.4977203607559204, + "learning_rate": 4.4124763907759064e-05, + "loss": 4.7799, + "step": 37454 + }, + { + "epoch": 0.22275549528975164, + "grad_norm": 2.2048726081848145, + "learning_rate": 4.412446307358059e-05, + "loss": 4.2383, + "step": 37455 + }, + { + "epoch": 0.22276144257303263, + "grad_norm": 1.8190462589263916, + "learning_rate": 4.4124162232725964e-05, + "loss": 4.8038, + "step": 37456 + }, + { + "epoch": 0.22276738985631364, + "grad_norm": 1.6494126319885254, + "learning_rate": 4.4123861385195286e-05, + "loss": 4.8618, + "step": 37457 + }, + { + "epoch": 0.22277333713959463, + "grad_norm": 1.6867988109588623, + "learning_rate": 4.412356053098866e-05, + "loss": 4.924, + "step": 37458 + }, + { + "epoch": 0.22277928442287562, + "grad_norm": 2.111293077468872, + "learning_rate": 4.412325967010621e-05, + "loss": 3.7083, + "step": 37459 + }, + { + "epoch": 0.22278523170615663, + "grad_norm": 1.785895586013794, + "learning_rate": 4.412295880254802e-05, + "loss": 4.6681, + "step": 37460 + }, + { + "epoch": 0.22279117898943762, + "grad_norm": 2.56091570854187, + "learning_rate": 4.41226579283142e-05, + "loss": 3.0985, + "step": 37461 + }, + { + "epoch": 0.2227971262727186, + "grad_norm": 1.961890459060669, + "learning_rate": 4.412235704740487e-05, + "loss": 4.2298, + "step": 37462 + }, + { + "epoch": 0.22280307355599963, + "grad_norm": 1.4484755992889404, + "learning_rate": 4.4122056159820116e-05, + "loss": 4.8922, + "step": 37463 + }, + { + "epoch": 0.2228090208392806, + "grad_norm": 1.5370919704437256, + "learning_rate": 4.412175526556004e-05, + "loss": 4.8014, + "step": 37464 + }, + { + "epoch": 0.2228149681225616, + "grad_norm": 1.1378029584884644, + "learning_rate": 4.412145436462477e-05, + "loss": 5.0578, + "step": 37465 + }, + { + "epoch": 0.22282091540584262, + "grad_norm": 1.4581009149551392, + "learning_rate": 4.412115345701439e-05, + "loss": 5.0006, + "step": 37466 + }, + { + "epoch": 0.2228268626891236, + "grad_norm": 1.5039770603179932, + "learning_rate": 4.412085254272902e-05, + "loss": 4.8977, + "step": 37467 + }, + { + "epoch": 0.2228328099724046, + "grad_norm": 2.168529510498047, + "learning_rate": 4.412055162176875e-05, + "loss": 3.9599, + "step": 37468 + }, + { + "epoch": 0.2228387572556856, + "grad_norm": 2.1273956298828125, + "learning_rate": 4.41202506941337e-05, + "loss": 4.1872, + "step": 37469 + }, + { + "epoch": 0.2228447045389666, + "grad_norm": 2.2555415630340576, + "learning_rate": 4.4119949759823965e-05, + "loss": 3.5768, + "step": 37470 + }, + { + "epoch": 0.2228506518222476, + "grad_norm": 1.4979069232940674, + "learning_rate": 4.411964881883965e-05, + "loss": 5.0065, + "step": 37471 + }, + { + "epoch": 0.2228565991055286, + "grad_norm": 1.27516508102417, + "learning_rate": 4.4119347871180865e-05, + "loss": 5.1013, + "step": 37472 + }, + { + "epoch": 0.2228625463888096, + "grad_norm": 1.738444209098816, + "learning_rate": 4.4119046916847715e-05, + "loss": 4.526, + "step": 37473 + }, + { + "epoch": 0.22286849367209058, + "grad_norm": 1.953614592552185, + "learning_rate": 4.4118745955840304e-05, + "loss": 5.1593, + "step": 37474 + }, + { + "epoch": 0.2228744409553716, + "grad_norm": 1.550534725189209, + "learning_rate": 4.411844498815873e-05, + "loss": 5.1234, + "step": 37475 + }, + { + "epoch": 0.22288038823865258, + "grad_norm": 1.301795244216919, + "learning_rate": 4.411814401380311e-05, + "loss": 4.9212, + "step": 37476 + }, + { + "epoch": 0.22288633552193357, + "grad_norm": 1.4100189208984375, + "learning_rate": 4.4117843032773545e-05, + "loss": 4.8568, + "step": 37477 + }, + { + "epoch": 0.2228922828052146, + "grad_norm": 1.6080713272094727, + "learning_rate": 4.4117542045070136e-05, + "loss": 4.8908, + "step": 37478 + }, + { + "epoch": 0.22289823008849557, + "grad_norm": 1.619407296180725, + "learning_rate": 4.411724105069299e-05, + "loss": 5.0473, + "step": 37479 + }, + { + "epoch": 0.22290417737177656, + "grad_norm": 2.0852749347686768, + "learning_rate": 4.411694004964221e-05, + "loss": 4.4932, + "step": 37480 + }, + { + "epoch": 0.22291012465505758, + "grad_norm": 1.6893035173416138, + "learning_rate": 4.411663904191791e-05, + "loss": 4.5006, + "step": 37481 + }, + { + "epoch": 0.22291607193833857, + "grad_norm": 1.794718861579895, + "learning_rate": 4.411633802752019e-05, + "loss": 4.4382, + "step": 37482 + }, + { + "epoch": 0.22292201922161956, + "grad_norm": 1.9049642086029053, + "learning_rate": 4.411603700644914e-05, + "loss": 4.2267, + "step": 37483 + }, + { + "epoch": 0.22292796650490057, + "grad_norm": 1.7459529638290405, + "learning_rate": 4.4115735978704894e-05, + "loss": 4.3071, + "step": 37484 + }, + { + "epoch": 0.22293391378818156, + "grad_norm": 2.4059667587280273, + "learning_rate": 4.4115434944287536e-05, + "loss": 3.3393, + "step": 37485 + }, + { + "epoch": 0.22293986107146255, + "grad_norm": 1.8413442373275757, + "learning_rate": 4.411513390319718e-05, + "loss": 4.3703, + "step": 37486 + }, + { + "epoch": 0.22294580835474356, + "grad_norm": 1.9206432104110718, + "learning_rate": 4.4114832855433916e-05, + "loss": 4.3611, + "step": 37487 + }, + { + "epoch": 0.22295175563802455, + "grad_norm": 1.8674482107162476, + "learning_rate": 4.4114531800997876e-05, + "loss": 3.9913, + "step": 37488 + }, + { + "epoch": 0.22295770292130554, + "grad_norm": 1.7336639165878296, + "learning_rate": 4.411423073988915e-05, + "loss": 4.26, + "step": 37489 + }, + { + "epoch": 0.22296365020458656, + "grad_norm": 2.2799072265625, + "learning_rate": 4.4113929672107834e-05, + "loss": 3.5076, + "step": 37490 + }, + { + "epoch": 0.22296959748786754, + "grad_norm": 1.8522865772247314, + "learning_rate": 4.411362859765405e-05, + "loss": 3.9865, + "step": 37491 + }, + { + "epoch": 0.22297554477114853, + "grad_norm": 2.023106098175049, + "learning_rate": 4.411332751652789e-05, + "loss": 4.3329, + "step": 37492 + }, + { + "epoch": 0.22298149205442955, + "grad_norm": 2.047266721725464, + "learning_rate": 4.4113026428729474e-05, + "loss": 3.833, + "step": 37493 + }, + { + "epoch": 0.22298743933771054, + "grad_norm": 3.233015775680542, + "learning_rate": 4.411272533425889e-05, + "loss": 2.4502, + "step": 37494 + }, + { + "epoch": 0.22299338662099152, + "grad_norm": 2.6674745082855225, + "learning_rate": 4.4112424233116254e-05, + "loss": 2.4631, + "step": 37495 + }, + { + "epoch": 0.22299933390427254, + "grad_norm": 2.8339457511901855, + "learning_rate": 4.411212312530167e-05, + "loss": 2.3195, + "step": 37496 + }, + { + "epoch": 0.22300528118755353, + "grad_norm": 2.9914252758026123, + "learning_rate": 4.411182201081524e-05, + "loss": 1.8622, + "step": 37497 + }, + { + "epoch": 0.22301122847083452, + "grad_norm": 2.7792508602142334, + "learning_rate": 4.411152088965706e-05, + "loss": 2.671, + "step": 37498 + }, + { + "epoch": 0.22301717575411553, + "grad_norm": 3.0347492694854736, + "learning_rate": 4.411121976182726e-05, + "loss": 1.9535, + "step": 37499 + }, + { + "epoch": 0.22302312303739652, + "grad_norm": 4.229783535003662, + "learning_rate": 4.4110918627325924e-05, + "loss": 2.3643, + "step": 37500 + }, + { + "epoch": 0.2230290703206775, + "grad_norm": 4.008993625640869, + "learning_rate": 4.411061748615317e-05, + "loss": 2.4496, + "step": 37501 + }, + { + "epoch": 0.22303501760395852, + "grad_norm": 2.544724702835083, + "learning_rate": 4.4110316338309086e-05, + "loss": 3.348, + "step": 37502 + }, + { + "epoch": 0.2230409648872395, + "grad_norm": 2.404447078704834, + "learning_rate": 4.4110015183793794e-05, + "loss": 3.4087, + "step": 37503 + }, + { + "epoch": 0.2230469121705205, + "grad_norm": 1.6754000186920166, + "learning_rate": 4.41097140226074e-05, + "loss": 5.1169, + "step": 37504 + }, + { + "epoch": 0.22305285945380152, + "grad_norm": 1.438940167427063, + "learning_rate": 4.4109412854749994e-05, + "loss": 5.1149, + "step": 37505 + }, + { + "epoch": 0.2230588067370825, + "grad_norm": 1.4823182821273804, + "learning_rate": 4.4109111680221685e-05, + "loss": 5.3555, + "step": 37506 + }, + { + "epoch": 0.2230647540203635, + "grad_norm": 1.63999342918396, + "learning_rate": 4.410881049902259e-05, + "loss": 5.1622, + "step": 37507 + }, + { + "epoch": 0.2230707013036445, + "grad_norm": 1.972383737564087, + "learning_rate": 4.410850931115281e-05, + "loss": 5.1285, + "step": 37508 + }, + { + "epoch": 0.2230766485869255, + "grad_norm": 1.6305196285247803, + "learning_rate": 4.4108208116612436e-05, + "loss": 5.0795, + "step": 37509 + }, + { + "epoch": 0.22308259587020648, + "grad_norm": 2.873812675476074, + "learning_rate": 4.41079069154016e-05, + "loss": 2.7287, + "step": 37510 + }, + { + "epoch": 0.2230885431534875, + "grad_norm": 2.0002241134643555, + "learning_rate": 4.410760570752037e-05, + "loss": 4.9091, + "step": 37511 + }, + { + "epoch": 0.2230944904367685, + "grad_norm": 1.6034547090530396, + "learning_rate": 4.410730449296889e-05, + "loss": 4.7845, + "step": 37512 + }, + { + "epoch": 0.22310043772004948, + "grad_norm": 1.55063796043396, + "learning_rate": 4.4107003271747236e-05, + "loss": 5.4072, + "step": 37513 + }, + { + "epoch": 0.22310638500333047, + "grad_norm": 1.4179991483688354, + "learning_rate": 4.410670204385553e-05, + "loss": 4.5219, + "step": 37514 + }, + { + "epoch": 0.22311233228661148, + "grad_norm": 1.62294340133667, + "learning_rate": 4.410640080929388e-05, + "loss": 4.7374, + "step": 37515 + }, + { + "epoch": 0.22311827956989247, + "grad_norm": 1.6114813089370728, + "learning_rate": 4.4106099568062367e-05, + "loss": 4.7302, + "step": 37516 + }, + { + "epoch": 0.22312422685317346, + "grad_norm": 1.6104267835617065, + "learning_rate": 4.4105798320161115e-05, + "loss": 4.7967, + "step": 37517 + }, + { + "epoch": 0.22313017413645447, + "grad_norm": 1.6183431148529053, + "learning_rate": 4.410549706559023e-05, + "loss": 4.5677, + "step": 37518 + }, + { + "epoch": 0.22313612141973546, + "grad_norm": 1.3311508893966675, + "learning_rate": 4.410519580434982e-05, + "loss": 4.5629, + "step": 37519 + }, + { + "epoch": 0.22314206870301645, + "grad_norm": 1.5924433469772339, + "learning_rate": 4.4104894536439974e-05, + "loss": 4.4574, + "step": 37520 + }, + { + "epoch": 0.22314801598629747, + "grad_norm": 2.075273275375366, + "learning_rate": 4.410459326186081e-05, + "loss": 3.9978, + "step": 37521 + }, + { + "epoch": 0.22315396326957845, + "grad_norm": 1.531011700630188, + "learning_rate": 4.410429198061243e-05, + "loss": 4.3236, + "step": 37522 + }, + { + "epoch": 0.22315991055285944, + "grad_norm": 1.6738545894622803, + "learning_rate": 4.410399069269494e-05, + "loss": 4.0581, + "step": 37523 + }, + { + "epoch": 0.22316585783614046, + "grad_norm": 1.5245554447174072, + "learning_rate": 4.410368939810844e-05, + "loss": 3.9841, + "step": 37524 + }, + { + "epoch": 0.22317180511942145, + "grad_norm": 1.6631666421890259, + "learning_rate": 4.4103388096853036e-05, + "loss": 4.336, + "step": 37525 + }, + { + "epoch": 0.22317775240270243, + "grad_norm": 1.869472622871399, + "learning_rate": 4.410308678892885e-05, + "loss": 4.8101, + "step": 37526 + }, + { + "epoch": 0.22318369968598345, + "grad_norm": 1.6422040462493896, + "learning_rate": 4.410278547433596e-05, + "loss": 4.8477, + "step": 37527 + }, + { + "epoch": 0.22318964696926444, + "grad_norm": 1.7507972717285156, + "learning_rate": 4.4102484153074496e-05, + "loss": 4.8798, + "step": 37528 + }, + { + "epoch": 0.22319559425254543, + "grad_norm": 1.578799843788147, + "learning_rate": 4.410218282514454e-05, + "loss": 4.6632, + "step": 37529 + }, + { + "epoch": 0.22320154153582644, + "grad_norm": 1.505683422088623, + "learning_rate": 4.410188149054623e-05, + "loss": 4.9293, + "step": 37530 + }, + { + "epoch": 0.22320748881910743, + "grad_norm": 2.2628307342529297, + "learning_rate": 4.410158014927963e-05, + "loss": 4.4323, + "step": 37531 + }, + { + "epoch": 0.22321343610238842, + "grad_norm": 1.7342091798782349, + "learning_rate": 4.4101278801344875e-05, + "loss": 3.8814, + "step": 37532 + }, + { + "epoch": 0.22321938338566943, + "grad_norm": 1.491938829421997, + "learning_rate": 4.4100977446742057e-05, + "loss": 5.0043, + "step": 37533 + }, + { + "epoch": 0.22322533066895042, + "grad_norm": 1.63412606716156, + "learning_rate": 4.4100676085471286e-05, + "loss": 4.9129, + "step": 37534 + }, + { + "epoch": 0.2232312779522314, + "grad_norm": 2.06516695022583, + "learning_rate": 4.4100374717532666e-05, + "loss": 4.1505, + "step": 37535 + }, + { + "epoch": 0.22323722523551243, + "grad_norm": 1.8857531547546387, + "learning_rate": 4.4100073342926304e-05, + "loss": 3.4167, + "step": 37536 + }, + { + "epoch": 0.22324317251879341, + "grad_norm": 1.8068853616714478, + "learning_rate": 4.409977196165231e-05, + "loss": 4.5034, + "step": 37537 + }, + { + "epoch": 0.2232491198020744, + "grad_norm": 1.6763041019439697, + "learning_rate": 4.4099470573710775e-05, + "loss": 4.7625, + "step": 37538 + }, + { + "epoch": 0.22325506708535542, + "grad_norm": 1.791761040687561, + "learning_rate": 4.409916917910181e-05, + "loss": 4.538, + "step": 37539 + }, + { + "epoch": 0.2232610143686364, + "grad_norm": 1.8424322605133057, + "learning_rate": 4.4098867777825526e-05, + "loss": 3.8683, + "step": 37540 + }, + { + "epoch": 0.2232669616519174, + "grad_norm": 2.100870370864868, + "learning_rate": 4.409856636988203e-05, + "loss": 4.2261, + "step": 37541 + }, + { + "epoch": 0.2232729089351984, + "grad_norm": 1.9232033491134644, + "learning_rate": 4.409826495527142e-05, + "loss": 4.8151, + "step": 37542 + }, + { + "epoch": 0.2232788562184794, + "grad_norm": 1.5741419792175293, + "learning_rate": 4.409796353399379e-05, + "loss": 4.9854, + "step": 37543 + }, + { + "epoch": 0.2232848035017604, + "grad_norm": 3.201462507247925, + "learning_rate": 4.4097662106049276e-05, + "loss": 2.4049, + "step": 37544 + }, + { + "epoch": 0.2232907507850414, + "grad_norm": 1.882828950881958, + "learning_rate": 4.4097360671437955e-05, + "loss": 3.4542, + "step": 37545 + }, + { + "epoch": 0.2232966980683224, + "grad_norm": 2.2759206295013428, + "learning_rate": 4.409705923015994e-05, + "loss": 3.3867, + "step": 37546 + }, + { + "epoch": 0.22330264535160338, + "grad_norm": 1.891266107559204, + "learning_rate": 4.409675778221535e-05, + "loss": 3.7851, + "step": 37547 + }, + { + "epoch": 0.2233085926348844, + "grad_norm": 1.5764224529266357, + "learning_rate": 4.409645632760427e-05, + "loss": 4.6758, + "step": 37548 + }, + { + "epoch": 0.22331453991816538, + "grad_norm": 1.5599266290664673, + "learning_rate": 4.409615486632681e-05, + "loss": 4.9474, + "step": 37549 + }, + { + "epoch": 0.22332048720144637, + "grad_norm": 1.725459098815918, + "learning_rate": 4.409585339838309e-05, + "loss": 5.1261, + "step": 37550 + }, + { + "epoch": 0.2233264344847274, + "grad_norm": 1.3846008777618408, + "learning_rate": 4.40955519237732e-05, + "loss": 5.7283, + "step": 37551 + }, + { + "epoch": 0.22333238176800838, + "grad_norm": 1.507738471031189, + "learning_rate": 4.409525044249726e-05, + "loss": 5.6287, + "step": 37552 + }, + { + "epoch": 0.22333832905128936, + "grad_norm": 1.7530328035354614, + "learning_rate": 4.409494895455535e-05, + "loss": 4.9708, + "step": 37553 + }, + { + "epoch": 0.22334427633457038, + "grad_norm": 1.8431001901626587, + "learning_rate": 4.40946474599476e-05, + "loss": 5.145, + "step": 37554 + }, + { + "epoch": 0.22335022361785137, + "grad_norm": 1.5956116914749146, + "learning_rate": 4.409434595867411e-05, + "loss": 5.0481, + "step": 37555 + }, + { + "epoch": 0.22335617090113236, + "grad_norm": 1.5909093618392944, + "learning_rate": 4.409404445073497e-05, + "loss": 4.9317, + "step": 37556 + }, + { + "epoch": 0.22336211818441337, + "grad_norm": 1.5663658380508423, + "learning_rate": 4.40937429361303e-05, + "loss": 4.6503, + "step": 37557 + }, + { + "epoch": 0.22336806546769436, + "grad_norm": 1.413888692855835, + "learning_rate": 4.40934414148602e-05, + "loss": 4.9422, + "step": 37558 + }, + { + "epoch": 0.22337401275097535, + "grad_norm": 1.8675824403762817, + "learning_rate": 4.4093139886924784e-05, + "loss": 5.2428, + "step": 37559 + }, + { + "epoch": 0.22337996003425636, + "grad_norm": 1.8924356698989868, + "learning_rate": 4.409283835232415e-05, + "loss": 4.7687, + "step": 37560 + }, + { + "epoch": 0.22338590731753735, + "grad_norm": 1.582779049873352, + "learning_rate": 4.409253681105839e-05, + "loss": 4.9511, + "step": 37561 + }, + { + "epoch": 0.22339185460081834, + "grad_norm": 1.267486572265625, + "learning_rate": 4.4092235263127634e-05, + "loss": 5.1539, + "step": 37562 + }, + { + "epoch": 0.22339780188409936, + "grad_norm": 1.5250635147094727, + "learning_rate": 4.4091933708531975e-05, + "loss": 5.1331, + "step": 37563 + }, + { + "epoch": 0.22340374916738034, + "grad_norm": 1.4498111009597778, + "learning_rate": 4.409163214727152e-05, + "loss": 5.0517, + "step": 37564 + }, + { + "epoch": 0.22340969645066133, + "grad_norm": 1.1852492094039917, + "learning_rate": 4.409133057934637e-05, + "loss": 5.0065, + "step": 37565 + }, + { + "epoch": 0.22341564373394235, + "grad_norm": 1.3684885501861572, + "learning_rate": 4.409102900475663e-05, + "loss": 5.7824, + "step": 37566 + }, + { + "epoch": 0.22342159101722334, + "grad_norm": 1.485929250717163, + "learning_rate": 4.409072742350242e-05, + "loss": 5.1218, + "step": 37567 + }, + { + "epoch": 0.22342753830050432, + "grad_norm": 1.6653156280517578, + "learning_rate": 4.409042583558383e-05, + "loss": 5.1565, + "step": 37568 + }, + { + "epoch": 0.22343348558378534, + "grad_norm": 1.5432231426239014, + "learning_rate": 4.4090124241000964e-05, + "loss": 5.1212, + "step": 37569 + }, + { + "epoch": 0.22343943286706633, + "grad_norm": 1.2738758325576782, + "learning_rate": 4.408982263975394e-05, + "loss": 4.9139, + "step": 37570 + }, + { + "epoch": 0.22344538015034732, + "grad_norm": 1.5660128593444824, + "learning_rate": 4.408952103184285e-05, + "loss": 5.1038, + "step": 37571 + }, + { + "epoch": 0.2234513274336283, + "grad_norm": 1.8557360172271729, + "learning_rate": 4.408921941726781e-05, + "loss": 3.7227, + "step": 37572 + }, + { + "epoch": 0.22345727471690932, + "grad_norm": 1.9540084600448608, + "learning_rate": 4.408891779602892e-05, + "loss": 4.9764, + "step": 37573 + }, + { + "epoch": 0.2234632220001903, + "grad_norm": 2.0382845401763916, + "learning_rate": 4.408861616812628e-05, + "loss": 3.7373, + "step": 37574 + }, + { + "epoch": 0.2234691692834713, + "grad_norm": 1.766674518585205, + "learning_rate": 4.4088314533560014e-05, + "loss": 3.8062, + "step": 37575 + }, + { + "epoch": 0.2234751165667523, + "grad_norm": 1.572192668914795, + "learning_rate": 4.4088012892330204e-05, + "loss": 3.8451, + "step": 37576 + }, + { + "epoch": 0.2234810638500333, + "grad_norm": 1.5495448112487793, + "learning_rate": 4.408771124443697e-05, + "loss": 4.217, + "step": 37577 + }, + { + "epoch": 0.2234870111333143, + "grad_norm": 1.5044076442718506, + "learning_rate": 4.408740958988041e-05, + "loss": 4.4927, + "step": 37578 + }, + { + "epoch": 0.2234929584165953, + "grad_norm": 1.9189188480377197, + "learning_rate": 4.408710792866064e-05, + "loss": 4.3213, + "step": 37579 + }, + { + "epoch": 0.2234989056998763, + "grad_norm": 1.7820667028427124, + "learning_rate": 4.4086806260777744e-05, + "loss": 4.7006, + "step": 37580 + }, + { + "epoch": 0.22350485298315728, + "grad_norm": 1.445273518562317, + "learning_rate": 4.408650458623186e-05, + "loss": 5.3059, + "step": 37581 + }, + { + "epoch": 0.2235108002664383, + "grad_norm": 1.649367094039917, + "learning_rate": 4.408620290502306e-05, + "loss": 5.2157, + "step": 37582 + }, + { + "epoch": 0.22351674754971929, + "grad_norm": 1.475437879562378, + "learning_rate": 4.408590121715147e-05, + "loss": 5.1782, + "step": 37583 + }, + { + "epoch": 0.22352269483300027, + "grad_norm": 1.7894258499145508, + "learning_rate": 4.408559952261718e-05, + "loss": 4.3475, + "step": 37584 + }, + { + "epoch": 0.2235286421162813, + "grad_norm": 1.75069260597229, + "learning_rate": 4.408529782142031e-05, + "loss": 4.5146, + "step": 37585 + }, + { + "epoch": 0.22353458939956228, + "grad_norm": 1.7912527322769165, + "learning_rate": 4.4084996113560967e-05, + "loss": 5.0784, + "step": 37586 + }, + { + "epoch": 0.22354053668284327, + "grad_norm": 1.85751211643219, + "learning_rate": 4.408469439903924e-05, + "loss": 4.4988, + "step": 37587 + }, + { + "epoch": 0.22354648396612428, + "grad_norm": 1.9540380239486694, + "learning_rate": 4.4084392677855245e-05, + "loss": 4.9008, + "step": 37588 + }, + { + "epoch": 0.22355243124940527, + "grad_norm": 1.750579595565796, + "learning_rate": 4.4084090950009094e-05, + "loss": 4.7877, + "step": 37589 + }, + { + "epoch": 0.22355837853268626, + "grad_norm": 1.8129644393920898, + "learning_rate": 4.408378921550088e-05, + "loss": 4.4876, + "step": 37590 + }, + { + "epoch": 0.22356432581596727, + "grad_norm": 1.6545523405075073, + "learning_rate": 4.40834874743307e-05, + "loss": 4.9386, + "step": 37591 + }, + { + "epoch": 0.22357027309924826, + "grad_norm": 1.6962411403656006, + "learning_rate": 4.4083185726498686e-05, + "loss": 4.8634, + "step": 37592 + }, + { + "epoch": 0.22357622038252925, + "grad_norm": 1.7945470809936523, + "learning_rate": 4.408288397200492e-05, + "loss": 4.8463, + "step": 37593 + }, + { + "epoch": 0.22358216766581027, + "grad_norm": 2.5073330402374268, + "learning_rate": 4.408258221084952e-05, + "loss": 3.1183, + "step": 37594 + }, + { + "epoch": 0.22358811494909125, + "grad_norm": 1.7957638502120972, + "learning_rate": 4.408228044303259e-05, + "loss": 4.1243, + "step": 37595 + }, + { + "epoch": 0.22359406223237224, + "grad_norm": 1.9337985515594482, + "learning_rate": 4.408197866855424e-05, + "loss": 4.6066, + "step": 37596 + }, + { + "epoch": 0.22360000951565326, + "grad_norm": 1.7588727474212646, + "learning_rate": 4.4081676887414555e-05, + "loss": 4.7197, + "step": 37597 + }, + { + "epoch": 0.22360595679893425, + "grad_norm": 1.607006311416626, + "learning_rate": 4.4081375099613656e-05, + "loss": 5.0925, + "step": 37598 + }, + { + "epoch": 0.22361190408221523, + "grad_norm": 1.4255245923995972, + "learning_rate": 4.408107330515165e-05, + "loss": 5.1365, + "step": 37599 + }, + { + "epoch": 0.22361785136549625, + "grad_norm": 1.2731602191925049, + "learning_rate": 4.4080771504028636e-05, + "loss": 5.3427, + "step": 37600 + }, + { + "epoch": 0.22362379864877724, + "grad_norm": 1.595390796661377, + "learning_rate": 4.408046969624472e-05, + "loss": 4.7735, + "step": 37601 + }, + { + "epoch": 0.22362974593205823, + "grad_norm": 1.7148345708847046, + "learning_rate": 4.408016788180002e-05, + "loss": 4.4927, + "step": 37602 + }, + { + "epoch": 0.22363569321533924, + "grad_norm": 1.810319423675537, + "learning_rate": 4.407986606069462e-05, + "loss": 4.799, + "step": 37603 + }, + { + "epoch": 0.22364164049862023, + "grad_norm": 2.190213680267334, + "learning_rate": 4.407956423292863e-05, + "loss": 4.0821, + "step": 37604 + }, + { + "epoch": 0.22364758778190122, + "grad_norm": 1.6705280542373657, + "learning_rate": 4.4079262398502174e-05, + "loss": 5.2685, + "step": 37605 + }, + { + "epoch": 0.22365353506518224, + "grad_norm": 1.8062423467636108, + "learning_rate": 4.407896055741534e-05, + "loss": 4.3002, + "step": 37606 + }, + { + "epoch": 0.22365948234846322, + "grad_norm": 1.9707059860229492, + "learning_rate": 4.407865870966824e-05, + "loss": 5.299, + "step": 37607 + }, + { + "epoch": 0.2236654296317442, + "grad_norm": 1.9427824020385742, + "learning_rate": 4.407835685526097e-05, + "loss": 4.2061, + "step": 37608 + }, + { + "epoch": 0.22367137691502523, + "grad_norm": 2.1432387828826904, + "learning_rate": 4.4078054994193654e-05, + "loss": 3.4683, + "step": 37609 + }, + { + "epoch": 0.22367732419830622, + "grad_norm": 2.56776762008667, + "learning_rate": 4.4077753126466374e-05, + "loss": 4.3432, + "step": 37610 + }, + { + "epoch": 0.2236832714815872, + "grad_norm": 2.1716179847717285, + "learning_rate": 4.407745125207926e-05, + "loss": 3.9791, + "step": 37611 + }, + { + "epoch": 0.22368921876486822, + "grad_norm": 1.9081957340240479, + "learning_rate": 4.4077149371032394e-05, + "loss": 4.2926, + "step": 37612 + }, + { + "epoch": 0.2236951660481492, + "grad_norm": 2.1505470275878906, + "learning_rate": 4.4076847483325903e-05, + "loss": 4.0981, + "step": 37613 + }, + { + "epoch": 0.2237011133314302, + "grad_norm": 2.2152764797210693, + "learning_rate": 4.407654558895987e-05, + "loss": 3.5981, + "step": 37614 + }, + { + "epoch": 0.2237070606147112, + "grad_norm": 2.2406694889068604, + "learning_rate": 4.407624368793442e-05, + "loss": 4.1125, + "step": 37615 + }, + { + "epoch": 0.2237130078979922, + "grad_norm": 1.7292073965072632, + "learning_rate": 4.4075941780249646e-05, + "loss": 4.4267, + "step": 37616 + }, + { + "epoch": 0.2237189551812732, + "grad_norm": 2.047788381576538, + "learning_rate": 4.407563986590566e-05, + "loss": 4.4591, + "step": 37617 + }, + { + "epoch": 0.2237249024645542, + "grad_norm": 2.3711559772491455, + "learning_rate": 4.407533794490256e-05, + "loss": 3.2147, + "step": 37618 + }, + { + "epoch": 0.2237308497478352, + "grad_norm": 2.232598304748535, + "learning_rate": 4.407503601724047e-05, + "loss": 3.1629, + "step": 37619 + }, + { + "epoch": 0.22373679703111618, + "grad_norm": 2.0596656799316406, + "learning_rate": 4.407473408291946e-05, + "loss": 3.2901, + "step": 37620 + }, + { + "epoch": 0.2237427443143972, + "grad_norm": 1.884080171585083, + "learning_rate": 4.407443214193968e-05, + "loss": 3.4791, + "step": 37621 + }, + { + "epoch": 0.22374869159767818, + "grad_norm": 1.9116895198822021, + "learning_rate": 4.40741301943012e-05, + "loss": 3.4078, + "step": 37622 + }, + { + "epoch": 0.22375463888095917, + "grad_norm": 1.9203144311904907, + "learning_rate": 4.4073828240004144e-05, + "loss": 3.2653, + "step": 37623 + }, + { + "epoch": 0.2237605861642402, + "grad_norm": 1.8317451477050781, + "learning_rate": 4.4073526279048616e-05, + "loss": 3.2768, + "step": 37624 + }, + { + "epoch": 0.22376653344752118, + "grad_norm": 1.9589619636535645, + "learning_rate": 4.4073224311434705e-05, + "loss": 3.3164, + "step": 37625 + }, + { + "epoch": 0.22377248073080216, + "grad_norm": 1.97921621799469, + "learning_rate": 4.407292233716254e-05, + "loss": 3.2702, + "step": 37626 + }, + { + "epoch": 0.22377842801408318, + "grad_norm": 1.758956789970398, + "learning_rate": 4.4072620356232205e-05, + "loss": 3.3953, + "step": 37627 + }, + { + "epoch": 0.22378437529736417, + "grad_norm": 1.907297968864441, + "learning_rate": 4.407231836864382e-05, + "loss": 3.2948, + "step": 37628 + }, + { + "epoch": 0.22379032258064516, + "grad_norm": 1.9227485656738281, + "learning_rate": 4.4072016374397485e-05, + "loss": 3.2674, + "step": 37629 + }, + { + "epoch": 0.22379626986392614, + "grad_norm": 2.019357442855835, + "learning_rate": 4.40717143734933e-05, + "loss": 3.349, + "step": 37630 + }, + { + "epoch": 0.22380221714720716, + "grad_norm": 2.0891315937042236, + "learning_rate": 4.4071412365931385e-05, + "loss": 3.1821, + "step": 37631 + }, + { + "epoch": 0.22380816443048815, + "grad_norm": 2.0140082836151123, + "learning_rate": 4.407111035171184e-05, + "loss": 3.3252, + "step": 37632 + }, + { + "epoch": 0.22381411171376914, + "grad_norm": 1.9410052299499512, + "learning_rate": 4.407080833083476e-05, + "loss": 3.3448, + "step": 37633 + }, + { + "epoch": 0.22382005899705015, + "grad_norm": 1.7290452718734741, + "learning_rate": 4.4070506303300266e-05, + "loss": 4.56, + "step": 37634 + }, + { + "epoch": 0.22382600628033114, + "grad_norm": 1.9766449928283691, + "learning_rate": 4.407020426910844e-05, + "loss": 3.263, + "step": 37635 + }, + { + "epoch": 0.22383195356361213, + "grad_norm": 1.8865668773651123, + "learning_rate": 4.4069902228259416e-05, + "loss": 3.4893, + "step": 37636 + }, + { + "epoch": 0.22383790084689315, + "grad_norm": 1.8951842784881592, + "learning_rate": 4.4069600180753276e-05, + "loss": 3.3569, + "step": 37637 + }, + { + "epoch": 0.22384384813017413, + "grad_norm": 1.8201243877410889, + "learning_rate": 4.406929812659014e-05, + "loss": 3.1592, + "step": 37638 + }, + { + "epoch": 0.22384979541345512, + "grad_norm": 1.917259931564331, + "learning_rate": 4.406899606577012e-05, + "loss": 3.2789, + "step": 37639 + }, + { + "epoch": 0.22385574269673614, + "grad_norm": 1.907003402709961, + "learning_rate": 4.40686939982933e-05, + "loss": 3.4995, + "step": 37640 + }, + { + "epoch": 0.22386168998001713, + "grad_norm": 1.937944769859314, + "learning_rate": 4.406839192415979e-05, + "loss": 3.8192, + "step": 37641 + }, + { + "epoch": 0.2238676372632981, + "grad_norm": 2.027071237564087, + "learning_rate": 4.406808984336971e-05, + "loss": 4.5019, + "step": 37642 + }, + { + "epoch": 0.22387358454657913, + "grad_norm": 2.0646047592163086, + "learning_rate": 4.406778775592316e-05, + "loss": 4.6154, + "step": 37643 + }, + { + "epoch": 0.22387953182986012, + "grad_norm": 2.3060433864593506, + "learning_rate": 4.406748566182023e-05, + "loss": 4.4522, + "step": 37644 + }, + { + "epoch": 0.2238854791131411, + "grad_norm": 2.676363706588745, + "learning_rate": 4.406718356106105e-05, + "loss": 4.5443, + "step": 37645 + }, + { + "epoch": 0.22389142639642212, + "grad_norm": 2.4247331619262695, + "learning_rate": 4.406688145364571e-05, + "loss": 4.4311, + "step": 37646 + }, + { + "epoch": 0.2238973736797031, + "grad_norm": 2.202650785446167, + "learning_rate": 4.406657933957431e-05, + "loss": 4.1083, + "step": 37647 + }, + { + "epoch": 0.2239033209629841, + "grad_norm": 2.003657579421997, + "learning_rate": 4.406627721884697e-05, + "loss": 4.5928, + "step": 37648 + }, + { + "epoch": 0.22390926824626511, + "grad_norm": 1.8342602252960205, + "learning_rate": 4.4065975091463796e-05, + "loss": 4.4013, + "step": 37649 + }, + { + "epoch": 0.2239152155295461, + "grad_norm": 2.3087284564971924, + "learning_rate": 4.406567295742488e-05, + "loss": 3.9403, + "step": 37650 + }, + { + "epoch": 0.2239211628128271, + "grad_norm": 2.0790791511535645, + "learning_rate": 4.406537081673034e-05, + "loss": 4.5265, + "step": 37651 + }, + { + "epoch": 0.2239271100961081, + "grad_norm": 1.8618361949920654, + "learning_rate": 4.4065068669380274e-05, + "loss": 4.0404, + "step": 37652 + }, + { + "epoch": 0.2239330573793891, + "grad_norm": 2.4421863555908203, + "learning_rate": 4.406476651537478e-05, + "loss": 4.1454, + "step": 37653 + }, + { + "epoch": 0.22393900466267008, + "grad_norm": 2.2863211631774902, + "learning_rate": 4.4064464354713986e-05, + "loss": 4.0519, + "step": 37654 + }, + { + "epoch": 0.2239449519459511, + "grad_norm": 2.191511392593384, + "learning_rate": 4.406416218739798e-05, + "loss": 4.0931, + "step": 37655 + }, + { + "epoch": 0.2239508992292321, + "grad_norm": 2.0519556999206543, + "learning_rate": 4.406386001342687e-05, + "loss": 4.5592, + "step": 37656 + }, + { + "epoch": 0.22395684651251307, + "grad_norm": 2.1067867279052734, + "learning_rate": 4.406355783280076e-05, + "loss": 3.9904, + "step": 37657 + }, + { + "epoch": 0.2239627937957941, + "grad_norm": 2.265929937362671, + "learning_rate": 4.406325564551977e-05, + "loss": 4.0379, + "step": 37658 + }, + { + "epoch": 0.22396874107907508, + "grad_norm": 2.3162310123443604, + "learning_rate": 4.406295345158399e-05, + "loss": 4.1419, + "step": 37659 + }, + { + "epoch": 0.22397468836235607, + "grad_norm": 2.2604496479034424, + "learning_rate": 4.406265125099353e-05, + "loss": 4.0614, + "step": 37660 + }, + { + "epoch": 0.22398063564563708, + "grad_norm": 2.4090497493743896, + "learning_rate": 4.406234904374849e-05, + "loss": 4.0914, + "step": 37661 + }, + { + "epoch": 0.22398658292891807, + "grad_norm": 2.472126007080078, + "learning_rate": 4.406204682984898e-05, + "loss": 3.7101, + "step": 37662 + }, + { + "epoch": 0.22399253021219906, + "grad_norm": 2.605400323867798, + "learning_rate": 4.406174460929511e-05, + "loss": 4.0506, + "step": 37663 + }, + { + "epoch": 0.22399847749548008, + "grad_norm": 2.2989494800567627, + "learning_rate": 4.406144238208698e-05, + "loss": 4.1113, + "step": 37664 + }, + { + "epoch": 0.22400442477876106, + "grad_norm": 2.052351713180542, + "learning_rate": 4.406114014822471e-05, + "loss": 4.2539, + "step": 37665 + }, + { + "epoch": 0.22401037206204205, + "grad_norm": 2.4834420680999756, + "learning_rate": 4.4060837907708375e-05, + "loss": 4.5214, + "step": 37666 + }, + { + "epoch": 0.22401631934532307, + "grad_norm": 2.3077211380004883, + "learning_rate": 4.406053566053811e-05, + "loss": 4.3625, + "step": 37667 + }, + { + "epoch": 0.22402226662860406, + "grad_norm": 2.109318971633911, + "learning_rate": 4.4060233406714e-05, + "loss": 3.9841, + "step": 37668 + }, + { + "epoch": 0.22402821391188504, + "grad_norm": 2.35363507270813, + "learning_rate": 4.4059931146236165e-05, + "loss": 4.0543, + "step": 37669 + }, + { + "epoch": 0.22403416119516606, + "grad_norm": 2.142488718032837, + "learning_rate": 4.405962887910471e-05, + "loss": 4.4115, + "step": 37670 + }, + { + "epoch": 0.22404010847844705, + "grad_norm": 1.740767240524292, + "learning_rate": 4.405932660531973e-05, + "loss": 4.2541, + "step": 37671 + }, + { + "epoch": 0.22404605576172804, + "grad_norm": 2.355954885482788, + "learning_rate": 4.4059024324881335e-05, + "loss": 4.145, + "step": 37672 + }, + { + "epoch": 0.22405200304500905, + "grad_norm": 2.334618091583252, + "learning_rate": 4.4058722037789635e-05, + "loss": 3.9459, + "step": 37673 + }, + { + "epoch": 0.22405795032829004, + "grad_norm": 2.15889048576355, + "learning_rate": 4.405841974404473e-05, + "loss": 3.9556, + "step": 37674 + }, + { + "epoch": 0.22406389761157103, + "grad_norm": 2.0851681232452393, + "learning_rate": 4.4058117443646724e-05, + "loss": 3.7941, + "step": 37675 + }, + { + "epoch": 0.22406984489485204, + "grad_norm": 2.1203784942626953, + "learning_rate": 4.405781513659572e-05, + "loss": 3.4324, + "step": 37676 + }, + { + "epoch": 0.22407579217813303, + "grad_norm": 2.550434112548828, + "learning_rate": 4.405751282289185e-05, + "loss": 4.2083, + "step": 37677 + }, + { + "epoch": 0.22408173946141402, + "grad_norm": 2.1875874996185303, + "learning_rate": 4.4057210502535184e-05, + "loss": 4.2803, + "step": 37678 + }, + { + "epoch": 0.22408768674469504, + "grad_norm": 1.5319013595581055, + "learning_rate": 4.4056908175525844e-05, + "loss": 5.1222, + "step": 37679 + }, + { + "epoch": 0.22409363402797602, + "grad_norm": 1.5734094381332397, + "learning_rate": 4.405660584186394e-05, + "loss": 4.7175, + "step": 37680 + }, + { + "epoch": 0.224099581311257, + "grad_norm": 2.2757771015167236, + "learning_rate": 4.405630350154957e-05, + "loss": 4.0468, + "step": 37681 + }, + { + "epoch": 0.22410552859453803, + "grad_norm": 1.579903483390808, + "learning_rate": 4.405600115458284e-05, + "loss": 5.1165, + "step": 37682 + }, + { + "epoch": 0.22411147587781902, + "grad_norm": 1.4931329488754272, + "learning_rate": 4.4055698800963855e-05, + "loss": 5.1076, + "step": 37683 + }, + { + "epoch": 0.2241174231611, + "grad_norm": 1.4989633560180664, + "learning_rate": 4.4055396440692724e-05, + "loss": 5.1507, + "step": 37684 + }, + { + "epoch": 0.22412337044438102, + "grad_norm": 1.249399185180664, + "learning_rate": 4.405509407376955e-05, + "loss": 5.0259, + "step": 37685 + }, + { + "epoch": 0.224129317727662, + "grad_norm": 1.4521582126617432, + "learning_rate": 4.4054791700194445e-05, + "loss": 4.5992, + "step": 37686 + }, + { + "epoch": 0.224135265010943, + "grad_norm": 1.8319655656814575, + "learning_rate": 4.405448931996751e-05, + "loss": 4.1182, + "step": 37687 + }, + { + "epoch": 0.22414121229422398, + "grad_norm": 1.6663216352462769, + "learning_rate": 4.4054186933088836e-05, + "loss": 4.8555, + "step": 37688 + }, + { + "epoch": 0.224147159577505, + "grad_norm": 1.56107759475708, + "learning_rate": 4.4053884539558556e-05, + "loss": 5.1188, + "step": 37689 + }, + { + "epoch": 0.224153106860786, + "grad_norm": 1.94844388961792, + "learning_rate": 4.4053582139376756e-05, + "loss": 5.0103, + "step": 37690 + }, + { + "epoch": 0.22415905414406698, + "grad_norm": 2.3126590251922607, + "learning_rate": 4.4053279732543546e-05, + "loss": 5.3361, + "step": 37691 + }, + { + "epoch": 0.224165001427348, + "grad_norm": 2.051386833190918, + "learning_rate": 4.405297731905903e-05, + "loss": 4.8889, + "step": 37692 + }, + { + "epoch": 0.22417094871062898, + "grad_norm": 2.7010273933410645, + "learning_rate": 4.405267489892333e-05, + "loss": 3.7776, + "step": 37693 + }, + { + "epoch": 0.22417689599390997, + "grad_norm": 2.7000935077667236, + "learning_rate": 4.4052372472136526e-05, + "loss": 3.6693, + "step": 37694 + }, + { + "epoch": 0.22418284327719099, + "grad_norm": 2.795950174331665, + "learning_rate": 4.405207003869874e-05, + "loss": 4.1973, + "step": 37695 + }, + { + "epoch": 0.22418879056047197, + "grad_norm": 1.8645349740982056, + "learning_rate": 4.405176759861007e-05, + "loss": 4.5066, + "step": 37696 + }, + { + "epoch": 0.22419473784375296, + "grad_norm": 1.8882391452789307, + "learning_rate": 4.405146515187063e-05, + "loss": 4.4669, + "step": 37697 + }, + { + "epoch": 0.22420068512703398, + "grad_norm": 1.7081935405731201, + "learning_rate": 4.405116269848051e-05, + "loss": 4.4768, + "step": 37698 + }, + { + "epoch": 0.22420663241031497, + "grad_norm": 2.3097574710845947, + "learning_rate": 4.405086023843984e-05, + "loss": 4.5242, + "step": 37699 + }, + { + "epoch": 0.22421257969359595, + "grad_norm": 1.777329921722412, + "learning_rate": 4.40505577717487e-05, + "loss": 4.4065, + "step": 37700 + }, + { + "epoch": 0.22421852697687697, + "grad_norm": 1.9584839344024658, + "learning_rate": 4.405025529840721e-05, + "loss": 4.3524, + "step": 37701 + }, + { + "epoch": 0.22422447426015796, + "grad_norm": 2.2504661083221436, + "learning_rate": 4.4049952818415474e-05, + "loss": 4.4163, + "step": 37702 + }, + { + "epoch": 0.22423042154343895, + "grad_norm": 2.2781872749328613, + "learning_rate": 4.404965033177359e-05, + "loss": 4.8339, + "step": 37703 + }, + { + "epoch": 0.22423636882671996, + "grad_norm": 1.6593425273895264, + "learning_rate": 4.404934783848169e-05, + "loss": 5.1117, + "step": 37704 + }, + { + "epoch": 0.22424231611000095, + "grad_norm": 1.8437799215316772, + "learning_rate": 4.404904533853984e-05, + "loss": 4.5249, + "step": 37705 + }, + { + "epoch": 0.22424826339328194, + "grad_norm": 1.8311305046081543, + "learning_rate": 4.404874283194818e-05, + "loss": 4.3728, + "step": 37706 + }, + { + "epoch": 0.22425421067656295, + "grad_norm": 1.7205126285552979, + "learning_rate": 4.4048440318706784e-05, + "loss": 4.3997, + "step": 37707 + }, + { + "epoch": 0.22426015795984394, + "grad_norm": 1.6154394149780273, + "learning_rate": 4.4048137798815784e-05, + "loss": 4.7922, + "step": 37708 + }, + { + "epoch": 0.22426610524312493, + "grad_norm": 1.695842981338501, + "learning_rate": 4.4047835272275276e-05, + "loss": 4.3493, + "step": 37709 + }, + { + "epoch": 0.22427205252640595, + "grad_norm": 1.6647610664367676, + "learning_rate": 4.404753273908536e-05, + "loss": 4.216, + "step": 37710 + }, + { + "epoch": 0.22427799980968693, + "grad_norm": 1.771431803703308, + "learning_rate": 4.404723019924615e-05, + "loss": 3.4842, + "step": 37711 + }, + { + "epoch": 0.22428394709296792, + "grad_norm": 1.923303484916687, + "learning_rate": 4.4046927652757756e-05, + "loss": 3.6944, + "step": 37712 + }, + { + "epoch": 0.22428989437624894, + "grad_norm": 1.8749626874923706, + "learning_rate": 4.4046625099620264e-05, + "loss": 3.9616, + "step": 37713 + }, + { + "epoch": 0.22429584165952993, + "grad_norm": 2.2243165969848633, + "learning_rate": 4.4046322539833795e-05, + "loss": 3.1859, + "step": 37714 + }, + { + "epoch": 0.22430178894281091, + "grad_norm": 1.9024165868759155, + "learning_rate": 4.4046019973398455e-05, + "loss": 4.1793, + "step": 37715 + }, + { + "epoch": 0.22430773622609193, + "grad_norm": 1.6997089385986328, + "learning_rate": 4.404571740031435e-05, + "loss": 4.0071, + "step": 37716 + }, + { + "epoch": 0.22431368350937292, + "grad_norm": 1.8075357675552368, + "learning_rate": 4.4045414820581574e-05, + "loss": 3.8646, + "step": 37717 + }, + { + "epoch": 0.2243196307926539, + "grad_norm": 1.9021435976028442, + "learning_rate": 4.404511223420024e-05, + "loss": 3.8813, + "step": 37718 + }, + { + "epoch": 0.22432557807593492, + "grad_norm": 1.7350317239761353, + "learning_rate": 4.404480964117046e-05, + "loss": 4.0871, + "step": 37719 + }, + { + "epoch": 0.2243315253592159, + "grad_norm": 2.2858777046203613, + "learning_rate": 4.4044507041492337e-05, + "loss": 3.489, + "step": 37720 + }, + { + "epoch": 0.2243374726424969, + "grad_norm": 2.2867650985717773, + "learning_rate": 4.404420443516596e-05, + "loss": 2.6582, + "step": 37721 + }, + { + "epoch": 0.22434341992577791, + "grad_norm": 1.6754149198532104, + "learning_rate": 4.404390182219146e-05, + "loss": 4.8954, + "step": 37722 + }, + { + "epoch": 0.2243493672090589, + "grad_norm": 3.31197190284729, + "learning_rate": 4.404359920256892e-05, + "loss": 1.4784, + "step": 37723 + }, + { + "epoch": 0.2243553144923399, + "grad_norm": 2.2913246154785156, + "learning_rate": 4.4043296576298464e-05, + "loss": 3.6803, + "step": 37724 + }, + { + "epoch": 0.2243612617756209, + "grad_norm": 1.5901116132736206, + "learning_rate": 4.404299394338019e-05, + "loss": 4.5641, + "step": 37725 + }, + { + "epoch": 0.2243672090589019, + "grad_norm": 1.5679881572723389, + "learning_rate": 4.4042691303814204e-05, + "loss": 4.6357, + "step": 37726 + }, + { + "epoch": 0.22437315634218288, + "grad_norm": 1.945236325263977, + "learning_rate": 4.4042388657600606e-05, + "loss": 3.9994, + "step": 37727 + }, + { + "epoch": 0.2243791036254639, + "grad_norm": 1.9577616453170776, + "learning_rate": 4.404208600473951e-05, + "loss": 3.7444, + "step": 37728 + }, + { + "epoch": 0.2243850509087449, + "grad_norm": 1.7644386291503906, + "learning_rate": 4.404178334523102e-05, + "loss": 3.6799, + "step": 37729 + }, + { + "epoch": 0.22439099819202588, + "grad_norm": 1.9280447959899902, + "learning_rate": 4.4041480679075245e-05, + "loss": 4.2304, + "step": 37730 + }, + { + "epoch": 0.2243969454753069, + "grad_norm": 2.3529813289642334, + "learning_rate": 4.404117800627228e-05, + "loss": 4.1772, + "step": 37731 + }, + { + "epoch": 0.22440289275858788, + "grad_norm": 2.0188229084014893, + "learning_rate": 4.404087532682223e-05, + "loss": 3.7716, + "step": 37732 + }, + { + "epoch": 0.22440884004186887, + "grad_norm": 2.0751125812530518, + "learning_rate": 4.4040572640725215e-05, + "loss": 3.5754, + "step": 37733 + }, + { + "epoch": 0.22441478732514988, + "grad_norm": 2.1539642810821533, + "learning_rate": 4.4040269947981325e-05, + "loss": 3.1711, + "step": 37734 + }, + { + "epoch": 0.22442073460843087, + "grad_norm": 2.405971050262451, + "learning_rate": 4.403996724859069e-05, + "loss": 3.9476, + "step": 37735 + }, + { + "epoch": 0.22442668189171186, + "grad_norm": 2.26133131980896, + "learning_rate": 4.403966454255338e-05, + "loss": 3.2927, + "step": 37736 + }, + { + "epoch": 0.22443262917499288, + "grad_norm": 2.0596282482147217, + "learning_rate": 4.403936182986953e-05, + "loss": 4.1827, + "step": 37737 + }, + { + "epoch": 0.22443857645827386, + "grad_norm": 2.0279719829559326, + "learning_rate": 4.403905911053924e-05, + "loss": 4.2153, + "step": 37738 + }, + { + "epoch": 0.22444452374155485, + "grad_norm": 2.963252067565918, + "learning_rate": 4.4038756384562596e-05, + "loss": 1.2135, + "step": 37739 + }, + { + "epoch": 0.22445047102483587, + "grad_norm": 3.4959542751312256, + "learning_rate": 4.403845365193974e-05, + "loss": 1.5318, + "step": 37740 + }, + { + "epoch": 0.22445641830811686, + "grad_norm": 3.365992784500122, + "learning_rate": 4.403815091267074e-05, + "loss": 1.4584, + "step": 37741 + }, + { + "epoch": 0.22446236559139784, + "grad_norm": 4.033193588256836, + "learning_rate": 4.403784816675572e-05, + "loss": 1.7688, + "step": 37742 + }, + { + "epoch": 0.22446831287467886, + "grad_norm": 3.2633559703826904, + "learning_rate": 4.4037545414194784e-05, + "loss": 1.3533, + "step": 37743 + }, + { + "epoch": 0.22447426015795985, + "grad_norm": 4.7632598876953125, + "learning_rate": 4.403724265498804e-05, + "loss": 1.6567, + "step": 37744 + }, + { + "epoch": 0.22448020744124084, + "grad_norm": 1.7756397724151611, + "learning_rate": 4.40369398891356e-05, + "loss": 4.1304, + "step": 37745 + }, + { + "epoch": 0.22448615472452182, + "grad_norm": 2.0562822818756104, + "learning_rate": 4.403663711663755e-05, + "loss": 4.2739, + "step": 37746 + }, + { + "epoch": 0.22449210200780284, + "grad_norm": 1.9821317195892334, + "learning_rate": 4.4036334337494007e-05, + "loss": 4.1306, + "step": 37747 + }, + { + "epoch": 0.22449804929108383, + "grad_norm": 1.877347469329834, + "learning_rate": 4.403603155170508e-05, + "loss": 4.5766, + "step": 37748 + }, + { + "epoch": 0.22450399657436482, + "grad_norm": 1.9222016334533691, + "learning_rate": 4.403572875927087e-05, + "loss": 4.3211, + "step": 37749 + }, + { + "epoch": 0.22450994385764583, + "grad_norm": 2.12162709236145, + "learning_rate": 4.403542596019148e-05, + "loss": 4.6083, + "step": 37750 + }, + { + "epoch": 0.22451589114092682, + "grad_norm": 1.482627272605896, + "learning_rate": 4.403512315446702e-05, + "loss": 4.6339, + "step": 37751 + }, + { + "epoch": 0.2245218384242078, + "grad_norm": 1.8798069953918457, + "learning_rate": 4.40348203420976e-05, + "loss": 4.365, + "step": 37752 + }, + { + "epoch": 0.22452778570748883, + "grad_norm": 1.9546221494674683, + "learning_rate": 4.403451752308332e-05, + "loss": 4.4222, + "step": 37753 + }, + { + "epoch": 0.2245337329907698, + "grad_norm": 2.216580629348755, + "learning_rate": 4.403421469742428e-05, + "loss": 3.6172, + "step": 37754 + }, + { + "epoch": 0.2245396802740508, + "grad_norm": 1.872523307800293, + "learning_rate": 4.4033911865120606e-05, + "loss": 4.1351, + "step": 37755 + }, + { + "epoch": 0.22454562755733182, + "grad_norm": 1.892671823501587, + "learning_rate": 4.403360902617238e-05, + "loss": 4.4622, + "step": 37756 + }, + { + "epoch": 0.2245515748406128, + "grad_norm": 1.8072375059127808, + "learning_rate": 4.4033306180579713e-05, + "loss": 4.1739, + "step": 37757 + }, + { + "epoch": 0.2245575221238938, + "grad_norm": 1.6658414602279663, + "learning_rate": 4.4033003328342725e-05, + "loss": 4.4121, + "step": 37758 + }, + { + "epoch": 0.2245634694071748, + "grad_norm": 1.9384973049163818, + "learning_rate": 4.403270046946151e-05, + "loss": 4.3525, + "step": 37759 + }, + { + "epoch": 0.2245694166904558, + "grad_norm": 1.798912763595581, + "learning_rate": 4.403239760393617e-05, + "loss": 4.0399, + "step": 37760 + }, + { + "epoch": 0.22457536397373679, + "grad_norm": 2.1004838943481445, + "learning_rate": 4.4032094731766825e-05, + "loss": 4.6759, + "step": 37761 + }, + { + "epoch": 0.2245813112570178, + "grad_norm": 2.2671115398406982, + "learning_rate": 4.403179185295357e-05, + "loss": 4.5474, + "step": 37762 + }, + { + "epoch": 0.2245872585402988, + "grad_norm": 2.3091773986816406, + "learning_rate": 4.403148896749651e-05, + "loss": 4.0547, + "step": 37763 + }, + { + "epoch": 0.22459320582357978, + "grad_norm": 1.733040452003479, + "learning_rate": 4.403118607539576e-05, + "loss": 4.3636, + "step": 37764 + }, + { + "epoch": 0.2245991531068608, + "grad_norm": 1.7878620624542236, + "learning_rate": 4.403088317665142e-05, + "loss": 4.1853, + "step": 37765 + }, + { + "epoch": 0.22460510039014178, + "grad_norm": 1.7317149639129639, + "learning_rate": 4.4030580271263586e-05, + "loss": 4.5654, + "step": 37766 + }, + { + "epoch": 0.22461104767342277, + "grad_norm": 1.7622241973876953, + "learning_rate": 4.403027735923237e-05, + "loss": 4.4526, + "step": 37767 + }, + { + "epoch": 0.22461699495670379, + "grad_norm": 1.70356023311615, + "learning_rate": 4.4029974440557895e-05, + "loss": 4.328, + "step": 37768 + }, + { + "epoch": 0.22462294223998477, + "grad_norm": 1.8185654878616333, + "learning_rate": 4.4029671515240245e-05, + "loss": 4.2525, + "step": 37769 + }, + { + "epoch": 0.22462888952326576, + "grad_norm": 1.8527708053588867, + "learning_rate": 4.402936858327953e-05, + "loss": 4.125, + "step": 37770 + }, + { + "epoch": 0.22463483680654678, + "grad_norm": 1.809019923210144, + "learning_rate": 4.402906564467587e-05, + "loss": 4.0711, + "step": 37771 + }, + { + "epoch": 0.22464078408982777, + "grad_norm": 1.8672319650650024, + "learning_rate": 4.402876269942935e-05, + "loss": 3.9145, + "step": 37772 + }, + { + "epoch": 0.22464673137310875, + "grad_norm": 2.5570287704467773, + "learning_rate": 4.4028459747540086e-05, + "loss": 4.1233, + "step": 37773 + }, + { + "epoch": 0.22465267865638977, + "grad_norm": 1.6708779335021973, + "learning_rate": 4.402815678900819e-05, + "loss": 4.4108, + "step": 37774 + }, + { + "epoch": 0.22465862593967076, + "grad_norm": 2.625601053237915, + "learning_rate": 4.4027853823833755e-05, + "loss": 4.6374, + "step": 37775 + }, + { + "epoch": 0.22466457322295175, + "grad_norm": 2.5341086387634277, + "learning_rate": 4.40275508520169e-05, + "loss": 4.4189, + "step": 37776 + }, + { + "epoch": 0.22467052050623276, + "grad_norm": 2.4302070140838623, + "learning_rate": 4.402724787355771e-05, + "loss": 4.4121, + "step": 37777 + }, + { + "epoch": 0.22467646778951375, + "grad_norm": 2.4907209873199463, + "learning_rate": 4.402694488845631e-05, + "loss": 4.3042, + "step": 37778 + }, + { + "epoch": 0.22468241507279474, + "grad_norm": 2.352330446243286, + "learning_rate": 4.402664189671281e-05, + "loss": 4.4724, + "step": 37779 + }, + { + "epoch": 0.22468836235607575, + "grad_norm": 2.715082883834839, + "learning_rate": 4.40263388983273e-05, + "loss": 4.3558, + "step": 37780 + }, + { + "epoch": 0.22469430963935674, + "grad_norm": 1.915778398513794, + "learning_rate": 4.402603589329989e-05, + "loss": 4.186, + "step": 37781 + }, + { + "epoch": 0.22470025692263773, + "grad_norm": 1.6563055515289307, + "learning_rate": 4.402573288163069e-05, + "loss": 4.5796, + "step": 37782 + }, + { + "epoch": 0.22470620420591875, + "grad_norm": 1.877414584159851, + "learning_rate": 4.40254298633198e-05, + "loss": 5.0202, + "step": 37783 + }, + { + "epoch": 0.22471215148919974, + "grad_norm": 1.7423501014709473, + "learning_rate": 4.402512683836732e-05, + "loss": 5.1002, + "step": 37784 + }, + { + "epoch": 0.22471809877248072, + "grad_norm": 1.7512094974517822, + "learning_rate": 4.402482380677338e-05, + "loss": 4.8946, + "step": 37785 + }, + { + "epoch": 0.22472404605576174, + "grad_norm": 1.822348713874817, + "learning_rate": 4.402452076853807e-05, + "loss": 4.2904, + "step": 37786 + }, + { + "epoch": 0.22472999333904273, + "grad_norm": 2.1487886905670166, + "learning_rate": 4.4024217723661485e-05, + "loss": 3.6705, + "step": 37787 + }, + { + "epoch": 0.22473594062232372, + "grad_norm": 1.9224172830581665, + "learning_rate": 4.402391467214375e-05, + "loss": 3.8513, + "step": 37788 + }, + { + "epoch": 0.22474188790560473, + "grad_norm": 2.0339977741241455, + "learning_rate": 4.4023611613984964e-05, + "loss": 4.1311, + "step": 37789 + }, + { + "epoch": 0.22474783518888572, + "grad_norm": 1.9256433248519897, + "learning_rate": 4.402330854918523e-05, + "loss": 4.3357, + "step": 37790 + }, + { + "epoch": 0.2247537824721667, + "grad_norm": 1.8157620429992676, + "learning_rate": 4.402300547774465e-05, + "loss": 4.2782, + "step": 37791 + }, + { + "epoch": 0.22475972975544772, + "grad_norm": 2.068574905395508, + "learning_rate": 4.402270239966334e-05, + "loss": 4.2032, + "step": 37792 + }, + { + "epoch": 0.2247656770387287, + "grad_norm": 1.9213577508926392, + "learning_rate": 4.40223993149414e-05, + "loss": 4.3021, + "step": 37793 + }, + { + "epoch": 0.2247716243220097, + "grad_norm": 2.1965863704681396, + "learning_rate": 4.402209622357894e-05, + "loss": 4.3002, + "step": 37794 + }, + { + "epoch": 0.22477757160529072, + "grad_norm": 1.570603847503662, + "learning_rate": 4.402179312557606e-05, + "loss": 4.4322, + "step": 37795 + }, + { + "epoch": 0.2247835188885717, + "grad_norm": 1.6555372476577759, + "learning_rate": 4.402149002093288e-05, + "loss": 4.3265, + "step": 37796 + }, + { + "epoch": 0.2247894661718527, + "grad_norm": 1.740679144859314, + "learning_rate": 4.402118690964948e-05, + "loss": 4.2542, + "step": 37797 + }, + { + "epoch": 0.2247954134551337, + "grad_norm": 1.4893536567687988, + "learning_rate": 4.402088379172598e-05, + "loss": 5.4392, + "step": 37798 + }, + { + "epoch": 0.2248013607384147, + "grad_norm": 1.5444672107696533, + "learning_rate": 4.4020580667162494e-05, + "loss": 5.3137, + "step": 37799 + }, + { + "epoch": 0.22480730802169568, + "grad_norm": 1.5143893957138062, + "learning_rate": 4.402027753595911e-05, + "loss": 5.2554, + "step": 37800 + }, + { + "epoch": 0.2248132553049767, + "grad_norm": 1.4042882919311523, + "learning_rate": 4.401997439811595e-05, + "loss": 5.2739, + "step": 37801 + }, + { + "epoch": 0.2248192025882577, + "grad_norm": 1.6029880046844482, + "learning_rate": 4.401967125363311e-05, + "loss": 5.2905, + "step": 37802 + }, + { + "epoch": 0.22482514987153868, + "grad_norm": 1.688639760017395, + "learning_rate": 4.4019368102510705e-05, + "loss": 4.5715, + "step": 37803 + }, + { + "epoch": 0.22483109715481966, + "grad_norm": 1.2697008848190308, + "learning_rate": 4.401906494474883e-05, + "loss": 5.3288, + "step": 37804 + }, + { + "epoch": 0.22483704443810068, + "grad_norm": 1.3962997198104858, + "learning_rate": 4.401876178034761e-05, + "loss": 5.2057, + "step": 37805 + }, + { + "epoch": 0.22484299172138167, + "grad_norm": 1.2445080280303955, + "learning_rate": 4.4018458609307124e-05, + "loss": 5.1935, + "step": 37806 + }, + { + "epoch": 0.22484893900466266, + "grad_norm": 1.4291088581085205, + "learning_rate": 4.401815543162749e-05, + "loss": 5.162, + "step": 37807 + }, + { + "epoch": 0.22485488628794367, + "grad_norm": 1.387148380279541, + "learning_rate": 4.401785224730881e-05, + "loss": 5.1254, + "step": 37808 + }, + { + "epoch": 0.22486083357122466, + "grad_norm": 1.575493574142456, + "learning_rate": 4.40175490563512e-05, + "loss": 4.5117, + "step": 37809 + }, + { + "epoch": 0.22486678085450565, + "grad_norm": 1.5773591995239258, + "learning_rate": 4.4017245858754764e-05, + "loss": 4.9031, + "step": 37810 + }, + { + "epoch": 0.22487272813778666, + "grad_norm": 1.6985411643981934, + "learning_rate": 4.40169426545196e-05, + "loss": 5.3724, + "step": 37811 + }, + { + "epoch": 0.22487867542106765, + "grad_norm": 1.6448116302490234, + "learning_rate": 4.401663944364581e-05, + "loss": 5.2448, + "step": 37812 + }, + { + "epoch": 0.22488462270434864, + "grad_norm": 1.6919282674789429, + "learning_rate": 4.4016336226133524e-05, + "loss": 5.5246, + "step": 37813 + }, + { + "epoch": 0.22489056998762966, + "grad_norm": 1.7464653253555298, + "learning_rate": 4.4016033001982827e-05, + "loss": 4.8954, + "step": 37814 + }, + { + "epoch": 0.22489651727091065, + "grad_norm": 1.6324712038040161, + "learning_rate": 4.401572977119382e-05, + "loss": 4.5895, + "step": 37815 + }, + { + "epoch": 0.22490246455419163, + "grad_norm": 1.9962471723556519, + "learning_rate": 4.4015426533766624e-05, + "loss": 3.9179, + "step": 37816 + }, + { + "epoch": 0.22490841183747265, + "grad_norm": 2.127303123474121, + "learning_rate": 4.401512328970134e-05, + "loss": 3.9693, + "step": 37817 + }, + { + "epoch": 0.22491435912075364, + "grad_norm": 1.8885246515274048, + "learning_rate": 4.401482003899807e-05, + "loss": 3.9877, + "step": 37818 + }, + { + "epoch": 0.22492030640403463, + "grad_norm": 1.7301980257034302, + "learning_rate": 4.4014516781656926e-05, + "loss": 4.4553, + "step": 37819 + }, + { + "epoch": 0.22492625368731564, + "grad_norm": 2.191305160522461, + "learning_rate": 4.401421351767801e-05, + "loss": 3.3981, + "step": 37820 + }, + { + "epoch": 0.22493220097059663, + "grad_norm": 2.289350986480713, + "learning_rate": 4.401391024706142e-05, + "loss": 3.7283, + "step": 37821 + }, + { + "epoch": 0.22493814825387762, + "grad_norm": 3.6579511165618896, + "learning_rate": 4.401360696980729e-05, + "loss": 1.4967, + "step": 37822 + }, + { + "epoch": 0.22494409553715863, + "grad_norm": 3.803406238555908, + "learning_rate": 4.401330368591568e-05, + "loss": 2.8816, + "step": 37823 + }, + { + "epoch": 0.22495004282043962, + "grad_norm": 3.4346766471862793, + "learning_rate": 4.401300039538675e-05, + "loss": 3.2975, + "step": 37824 + }, + { + "epoch": 0.2249559901037206, + "grad_norm": 3.123645305633545, + "learning_rate": 4.4012697098220556e-05, + "loss": 2.6538, + "step": 37825 + }, + { + "epoch": 0.22496193738700163, + "grad_norm": 1.8864881992340088, + "learning_rate": 4.401239379441724e-05, + "loss": 4.2615, + "step": 37826 + }, + { + "epoch": 0.22496788467028261, + "grad_norm": 2.685556173324585, + "learning_rate": 4.401209048397688e-05, + "loss": 3.6858, + "step": 37827 + }, + { + "epoch": 0.2249738319535636, + "grad_norm": 3.324258327484131, + "learning_rate": 4.401178716689961e-05, + "loss": 2.6127, + "step": 37828 + }, + { + "epoch": 0.22497977923684462, + "grad_norm": 3.6253082752227783, + "learning_rate": 4.401148384318551e-05, + "loss": 2.171, + "step": 37829 + }, + { + "epoch": 0.2249857265201256, + "grad_norm": 3.318803071975708, + "learning_rate": 4.4011180512834704e-05, + "loss": 2.4737, + "step": 37830 + }, + { + "epoch": 0.2249916738034066, + "grad_norm": 3.84256911277771, + "learning_rate": 4.401087717584729e-05, + "loss": 3.2811, + "step": 37831 + }, + { + "epoch": 0.2249976210866876, + "grad_norm": 2.7051305770874023, + "learning_rate": 4.401057383222338e-05, + "loss": 3.1816, + "step": 37832 + }, + { + "epoch": 0.2250035683699686, + "grad_norm": 2.243999719619751, + "learning_rate": 4.401027048196307e-05, + "loss": 3.9582, + "step": 37833 + }, + { + "epoch": 0.2250095156532496, + "grad_norm": 2.9261839389801025, + "learning_rate": 4.4009967125066465e-05, + "loss": 1.8198, + "step": 37834 + }, + { + "epoch": 0.2250154629365306, + "grad_norm": 2.6644344329833984, + "learning_rate": 4.4009663761533684e-05, + "loss": 1.0773, + "step": 37835 + }, + { + "epoch": 0.2250214102198116, + "grad_norm": 2.9495484828948975, + "learning_rate": 4.400936039136483e-05, + "loss": 1.5906, + "step": 37836 + }, + { + "epoch": 0.22502735750309258, + "grad_norm": 3.0252106189727783, + "learning_rate": 4.4009057014559996e-05, + "loss": 1.721, + "step": 37837 + }, + { + "epoch": 0.2250333047863736, + "grad_norm": 3.145016670227051, + "learning_rate": 4.4008753631119305e-05, + "loss": 1.8387, + "step": 37838 + }, + { + "epoch": 0.22503925206965458, + "grad_norm": 2.9610512256622314, + "learning_rate": 4.400845024104284e-05, + "loss": 2.0492, + "step": 37839 + }, + { + "epoch": 0.22504519935293557, + "grad_norm": 2.962660789489746, + "learning_rate": 4.400814684433073e-05, + "loss": 1.168, + "step": 37840 + }, + { + "epoch": 0.2250511466362166, + "grad_norm": 3.3425393104553223, + "learning_rate": 4.400784344098308e-05, + "loss": 1.576, + "step": 37841 + }, + { + "epoch": 0.22505709391949758, + "grad_norm": 3.3425233364105225, + "learning_rate": 4.400754003099998e-05, + "loss": 1.6533, + "step": 37842 + }, + { + "epoch": 0.22506304120277856, + "grad_norm": 3.656737804412842, + "learning_rate": 4.4007236614381545e-05, + "loss": 1.3451, + "step": 37843 + }, + { + "epoch": 0.22506898848605958, + "grad_norm": 2.0164568424224854, + "learning_rate": 4.400693319112788e-05, + "loss": 3.6496, + "step": 37844 + }, + { + "epoch": 0.22507493576934057, + "grad_norm": 3.6576480865478516, + "learning_rate": 4.400662976123909e-05, + "loss": 1.5672, + "step": 37845 + }, + { + "epoch": 0.22508088305262156, + "grad_norm": 3.4933855533599854, + "learning_rate": 4.400632632471529e-05, + "loss": 1.3534, + "step": 37846 + }, + { + "epoch": 0.22508683033590257, + "grad_norm": 4.428929805755615, + "learning_rate": 4.400602288155657e-05, + "loss": 2.669, + "step": 37847 + }, + { + "epoch": 0.22509277761918356, + "grad_norm": 3.86712908744812, + "learning_rate": 4.400571943176304e-05, + "loss": 2.175, + "step": 37848 + }, + { + "epoch": 0.22509872490246455, + "grad_norm": 3.457106113433838, + "learning_rate": 4.400541597533482e-05, + "loss": 1.996, + "step": 37849 + }, + { + "epoch": 0.22510467218574556, + "grad_norm": 3.4103970527648926, + "learning_rate": 4.4005112512272e-05, + "loss": 1.8448, + "step": 37850 + }, + { + "epoch": 0.22511061946902655, + "grad_norm": 4.167159557342529, + "learning_rate": 4.400480904257469e-05, + "loss": 2.3319, + "step": 37851 + }, + { + "epoch": 0.22511656675230754, + "grad_norm": 3.852609395980835, + "learning_rate": 4.4004505566243e-05, + "loss": 2.1329, + "step": 37852 + }, + { + "epoch": 0.22512251403558856, + "grad_norm": 3.090017795562744, + "learning_rate": 4.4004202083277034e-05, + "loss": 1.6722, + "step": 37853 + }, + { + "epoch": 0.22512846131886954, + "grad_norm": 2.2390940189361572, + "learning_rate": 4.40038985936769e-05, + "loss": 0.7566, + "step": 37854 + }, + { + "epoch": 0.22513440860215053, + "grad_norm": 2.376133680343628, + "learning_rate": 4.40035950974427e-05, + "loss": 2.1932, + "step": 37855 + }, + { + "epoch": 0.22514035588543155, + "grad_norm": 3.3748831748962402, + "learning_rate": 4.400329159457453e-05, + "loss": 3.3727, + "step": 37856 + }, + { + "epoch": 0.22514630316871254, + "grad_norm": 4.395442962646484, + "learning_rate": 4.400298808507252e-05, + "loss": 3.0625, + "step": 37857 + }, + { + "epoch": 0.22515225045199352, + "grad_norm": 3.7643630504608154, + "learning_rate": 4.400268456893676e-05, + "loss": 2.2564, + "step": 37858 + }, + { + "epoch": 0.22515819773527454, + "grad_norm": 4.105278491973877, + "learning_rate": 4.400238104616736e-05, + "loss": 2.267, + "step": 37859 + }, + { + "epoch": 0.22516414501855553, + "grad_norm": 3.456455945968628, + "learning_rate": 4.400207751676442e-05, + "loss": 1.8341, + "step": 37860 + }, + { + "epoch": 0.22517009230183652, + "grad_norm": 3.196443796157837, + "learning_rate": 4.4001773980728054e-05, + "loss": 2.3685, + "step": 37861 + }, + { + "epoch": 0.2251760395851175, + "grad_norm": 3.148589611053467, + "learning_rate": 4.400147043805837e-05, + "loss": 1.5025, + "step": 37862 + }, + { + "epoch": 0.22518198686839852, + "grad_norm": 3.0841641426086426, + "learning_rate": 4.400116688875546e-05, + "loss": 1.4879, + "step": 37863 + }, + { + "epoch": 0.2251879341516795, + "grad_norm": 3.2474453449249268, + "learning_rate": 4.4000863332819445e-05, + "loss": 1.2444, + "step": 37864 + }, + { + "epoch": 0.2251938814349605, + "grad_norm": 3.0410406589508057, + "learning_rate": 4.400055977025043e-05, + "loss": 1.9955, + "step": 37865 + }, + { + "epoch": 0.2251998287182415, + "grad_norm": 3.0968708992004395, + "learning_rate": 4.400025620104851e-05, + "loss": 2.2284, + "step": 37866 + }, + { + "epoch": 0.2252057760015225, + "grad_norm": 2.2545621395111084, + "learning_rate": 4.399995262521379e-05, + "loss": 3.7539, + "step": 37867 + }, + { + "epoch": 0.2252117232848035, + "grad_norm": 3.4662656784057617, + "learning_rate": 4.399964904274639e-05, + "loss": 2.8159, + "step": 37868 + }, + { + "epoch": 0.2252176705680845, + "grad_norm": 3.1281726360321045, + "learning_rate": 4.3999345453646405e-05, + "loss": 1.7505, + "step": 37869 + }, + { + "epoch": 0.2252236178513655, + "grad_norm": 2.7577738761901855, + "learning_rate": 4.399904185791395e-05, + "loss": 1.6282, + "step": 37870 + }, + { + "epoch": 0.22522956513464648, + "grad_norm": 2.8311750888824463, + "learning_rate": 4.399873825554912e-05, + "loss": 1.6367, + "step": 37871 + }, + { + "epoch": 0.2252355124179275, + "grad_norm": 2.771963596343994, + "learning_rate": 4.3998434646552034e-05, + "loss": 2.1686, + "step": 37872 + }, + { + "epoch": 0.22524145970120849, + "grad_norm": 1.8751553297042847, + "learning_rate": 4.399813103092279e-05, + "loss": 5.0479, + "step": 37873 + }, + { + "epoch": 0.22524740698448947, + "grad_norm": 2.1598060131073, + "learning_rate": 4.399782740866148e-05, + "loss": 4.2219, + "step": 37874 + }, + { + "epoch": 0.2252533542677705, + "grad_norm": 3.289924383163452, + "learning_rate": 4.399752377976825e-05, + "loss": 2.0413, + "step": 37875 + }, + { + "epoch": 0.22525930155105148, + "grad_norm": 3.788972854614258, + "learning_rate": 4.399722014424316e-05, + "loss": 1.941, + "step": 37876 + }, + { + "epoch": 0.22526524883433247, + "grad_norm": 3.5833699703216553, + "learning_rate": 4.3996916502086344e-05, + "loss": 1.8684, + "step": 37877 + }, + { + "epoch": 0.22527119611761348, + "grad_norm": 2.6145071983337402, + "learning_rate": 4.3996612853297906e-05, + "loss": 3.2526, + "step": 37878 + }, + { + "epoch": 0.22527714340089447, + "grad_norm": 1.8506813049316406, + "learning_rate": 4.399630919787794e-05, + "loss": 4.7183, + "step": 37879 + }, + { + "epoch": 0.22528309068417546, + "grad_norm": 1.6560643911361694, + "learning_rate": 4.3996005535826555e-05, + "loss": 5.2118, + "step": 37880 + }, + { + "epoch": 0.22528903796745647, + "grad_norm": 1.5506187677383423, + "learning_rate": 4.3995701867143867e-05, + "loss": 5.2523, + "step": 37881 + }, + { + "epoch": 0.22529498525073746, + "grad_norm": 1.529112458229065, + "learning_rate": 4.3995398191829974e-05, + "loss": 4.7377, + "step": 37882 + }, + { + "epoch": 0.22530093253401845, + "grad_norm": 1.6341055631637573, + "learning_rate": 4.3995094509884986e-05, + "loss": 5.492, + "step": 37883 + }, + { + "epoch": 0.22530687981729947, + "grad_norm": 1.5386645793914795, + "learning_rate": 4.3994790821309004e-05, + "loss": 5.343, + "step": 37884 + }, + { + "epoch": 0.22531282710058045, + "grad_norm": 1.8904048204421997, + "learning_rate": 4.3994487126102137e-05, + "loss": 4.376, + "step": 37885 + }, + { + "epoch": 0.22531877438386144, + "grad_norm": 1.6142657995224, + "learning_rate": 4.39941834242645e-05, + "loss": 5.1632, + "step": 37886 + }, + { + "epoch": 0.22532472166714246, + "grad_norm": 1.5371025800704956, + "learning_rate": 4.399387971579618e-05, + "loss": 5.0741, + "step": 37887 + }, + { + "epoch": 0.22533066895042345, + "grad_norm": 1.6636183261871338, + "learning_rate": 4.399357600069729e-05, + "loss": 5.0503, + "step": 37888 + }, + { + "epoch": 0.22533661623370443, + "grad_norm": 1.4598065614700317, + "learning_rate": 4.3993272278967944e-05, + "loss": 4.9753, + "step": 37889 + }, + { + "epoch": 0.22534256351698545, + "grad_norm": 1.5426924228668213, + "learning_rate": 4.399296855060824e-05, + "loss": 5.1221, + "step": 37890 + }, + { + "epoch": 0.22534851080026644, + "grad_norm": 1.2707856893539429, + "learning_rate": 4.399266481561829e-05, + "loss": 4.883, + "step": 37891 + }, + { + "epoch": 0.22535445808354743, + "grad_norm": 1.5364930629730225, + "learning_rate": 4.39923610739982e-05, + "loss": 4.9607, + "step": 37892 + }, + { + "epoch": 0.22536040536682844, + "grad_norm": 1.5102486610412598, + "learning_rate": 4.3992057325748066e-05, + "loss": 4.9234, + "step": 37893 + }, + { + "epoch": 0.22536635265010943, + "grad_norm": 1.3505035638809204, + "learning_rate": 4.3991753570868e-05, + "loss": 4.9517, + "step": 37894 + }, + { + "epoch": 0.22537229993339042, + "grad_norm": 1.6128617525100708, + "learning_rate": 4.3991449809358115e-05, + "loss": 4.9662, + "step": 37895 + }, + { + "epoch": 0.22537824721667143, + "grad_norm": 1.4386210441589355, + "learning_rate": 4.399114604121851e-05, + "loss": 4.7074, + "step": 37896 + }, + { + "epoch": 0.22538419449995242, + "grad_norm": 1.473679780960083, + "learning_rate": 4.399084226644929e-05, + "loss": 4.4737, + "step": 37897 + }, + { + "epoch": 0.2253901417832334, + "grad_norm": 1.3751991987228394, + "learning_rate": 4.399053848505057e-05, + "loss": 4.9829, + "step": 37898 + }, + { + "epoch": 0.22539608906651443, + "grad_norm": 1.4539976119995117, + "learning_rate": 4.3990234697022434e-05, + "loss": 4.9506, + "step": 37899 + }, + { + "epoch": 0.22540203634979541, + "grad_norm": 1.3100522756576538, + "learning_rate": 4.3989930902365015e-05, + "loss": 4.9638, + "step": 37900 + }, + { + "epoch": 0.2254079836330764, + "grad_norm": 1.3466485738754272, + "learning_rate": 4.3989627101078404e-05, + "loss": 4.8384, + "step": 37901 + }, + { + "epoch": 0.22541393091635742, + "grad_norm": 1.5398622751235962, + "learning_rate": 4.398932329316271e-05, + "loss": 4.9108, + "step": 37902 + }, + { + "epoch": 0.2254198781996384, + "grad_norm": 1.3253413438796997, + "learning_rate": 4.398901947861804e-05, + "loss": 4.929, + "step": 37903 + }, + { + "epoch": 0.2254258254829194, + "grad_norm": 1.4795877933502197, + "learning_rate": 4.39887156574445e-05, + "loss": 4.9455, + "step": 37904 + }, + { + "epoch": 0.2254317727662004, + "grad_norm": 1.6711117029190063, + "learning_rate": 4.398841182964219e-05, + "loss": 4.4684, + "step": 37905 + }, + { + "epoch": 0.2254377200494814, + "grad_norm": 1.282360553741455, + "learning_rate": 4.398810799521124e-05, + "loss": 4.9634, + "step": 37906 + }, + { + "epoch": 0.2254436673327624, + "grad_norm": 1.8916454315185547, + "learning_rate": 4.3987804154151726e-05, + "loss": 5.3391, + "step": 37907 + }, + { + "epoch": 0.2254496146160434, + "grad_norm": 1.8332359790802002, + "learning_rate": 4.3987500306463756e-05, + "loss": 4.7668, + "step": 37908 + }, + { + "epoch": 0.2254555618993244, + "grad_norm": 1.565968632698059, + "learning_rate": 4.398719645214746e-05, + "loss": 4.9305, + "step": 37909 + }, + { + "epoch": 0.22546150918260538, + "grad_norm": 1.66316819190979, + "learning_rate": 4.3986892591202924e-05, + "loss": 5.8043, + "step": 37910 + }, + { + "epoch": 0.2254674564658864, + "grad_norm": 1.5474905967712402, + "learning_rate": 4.398658872363026e-05, + "loss": 5.4856, + "step": 37911 + }, + { + "epoch": 0.22547340374916738, + "grad_norm": 1.5161161422729492, + "learning_rate": 4.398628484942957e-05, + "loss": 5.4491, + "step": 37912 + }, + { + "epoch": 0.22547935103244837, + "grad_norm": 1.4109158515930176, + "learning_rate": 4.398598096860097e-05, + "loss": 5.5486, + "step": 37913 + }, + { + "epoch": 0.2254852983157294, + "grad_norm": 2.6624763011932373, + "learning_rate": 4.398567708114456e-05, + "loss": 4.3903, + "step": 37914 + }, + { + "epoch": 0.22549124559901038, + "grad_norm": 2.456807851791382, + "learning_rate": 4.3985373187060444e-05, + "loss": 4.9181, + "step": 37915 + }, + { + "epoch": 0.22549719288229136, + "grad_norm": 1.541363000869751, + "learning_rate": 4.398506928634873e-05, + "loss": 5.6465, + "step": 37916 + }, + { + "epoch": 0.22550314016557238, + "grad_norm": 2.3970277309417725, + "learning_rate": 4.398476537900953e-05, + "loss": 3.8429, + "step": 37917 + }, + { + "epoch": 0.22550908744885337, + "grad_norm": 1.6470190286636353, + "learning_rate": 4.3984461465042933e-05, + "loss": 5.011, + "step": 37918 + }, + { + "epoch": 0.22551503473213436, + "grad_norm": 1.6763241291046143, + "learning_rate": 4.3984157544449076e-05, + "loss": 5.3823, + "step": 37919 + }, + { + "epoch": 0.22552098201541534, + "grad_norm": 1.6084785461425781, + "learning_rate": 4.398385361722803e-05, + "loss": 5.6275, + "step": 37920 + }, + { + "epoch": 0.22552692929869636, + "grad_norm": 1.4290287494659424, + "learning_rate": 4.398354968337992e-05, + "loss": 5.453, + "step": 37921 + }, + { + "epoch": 0.22553287658197735, + "grad_norm": 1.7292128801345825, + "learning_rate": 4.398324574290485e-05, + "loss": 4.4189, + "step": 37922 + }, + { + "epoch": 0.22553882386525834, + "grad_norm": 1.5806313753128052, + "learning_rate": 4.3982941795802925e-05, + "loss": 4.7038, + "step": 37923 + }, + { + "epoch": 0.22554477114853935, + "grad_norm": 1.7047100067138672, + "learning_rate": 4.398263784207425e-05, + "loss": 5.1124, + "step": 37924 + }, + { + "epoch": 0.22555071843182034, + "grad_norm": 1.7951979637145996, + "learning_rate": 4.398233388171893e-05, + "loss": 5.0583, + "step": 37925 + }, + { + "epoch": 0.22555666571510133, + "grad_norm": 1.7077093124389648, + "learning_rate": 4.3982029914737076e-05, + "loss": 5.0506, + "step": 37926 + }, + { + "epoch": 0.22556261299838234, + "grad_norm": 1.767303228378296, + "learning_rate": 4.3981725941128795e-05, + "loss": 5.3591, + "step": 37927 + }, + { + "epoch": 0.22556856028166333, + "grad_norm": 1.5521531105041504, + "learning_rate": 4.398142196089419e-05, + "loss": 5.2546, + "step": 37928 + }, + { + "epoch": 0.22557450756494432, + "grad_norm": 1.7760071754455566, + "learning_rate": 4.398111797403336e-05, + "loss": 4.2981, + "step": 37929 + }, + { + "epoch": 0.22558045484822534, + "grad_norm": 2.609588146209717, + "learning_rate": 4.398081398054642e-05, + "loss": 3.4733, + "step": 37930 + }, + { + "epoch": 0.22558640213150633, + "grad_norm": 2.9639456272125244, + "learning_rate": 4.3980509980433475e-05, + "loss": 2.1202, + "step": 37931 + }, + { + "epoch": 0.2255923494147873, + "grad_norm": 2.8901517391204834, + "learning_rate": 4.398020597369463e-05, + "loss": 2.3376, + "step": 37932 + }, + { + "epoch": 0.22559829669806833, + "grad_norm": 2.5380642414093018, + "learning_rate": 4.397990196032999e-05, + "loss": 2.2578, + "step": 37933 + }, + { + "epoch": 0.22560424398134932, + "grad_norm": 2.1316301822662354, + "learning_rate": 4.3979597940339664e-05, + "loss": 3.9766, + "step": 37934 + }, + { + "epoch": 0.2256101912646303, + "grad_norm": 1.7142231464385986, + "learning_rate": 4.3979293913723756e-05, + "loss": 4.3819, + "step": 37935 + }, + { + "epoch": 0.22561613854791132, + "grad_norm": 2.933591365814209, + "learning_rate": 4.397898988048238e-05, + "loss": 2.63, + "step": 37936 + }, + { + "epoch": 0.2256220858311923, + "grad_norm": 3.323190927505493, + "learning_rate": 4.397868584061562e-05, + "loss": 2.1169, + "step": 37937 + }, + { + "epoch": 0.2256280331144733, + "grad_norm": 2.8052923679351807, + "learning_rate": 4.3978381794123604e-05, + "loss": 1.8126, + "step": 37938 + }, + { + "epoch": 0.2256339803977543, + "grad_norm": 2.9329421520233154, + "learning_rate": 4.397807774100643e-05, + "loss": 2.0345, + "step": 37939 + }, + { + "epoch": 0.2256399276810353, + "grad_norm": 2.487644910812378, + "learning_rate": 4.3977773681264206e-05, + "loss": 2.0859, + "step": 37940 + }, + { + "epoch": 0.2256458749643163, + "grad_norm": 2.733116388320923, + "learning_rate": 4.3977469614897035e-05, + "loss": 3.8859, + "step": 37941 + }, + { + "epoch": 0.2256518222475973, + "grad_norm": 1.7498195171356201, + "learning_rate": 4.397716554190503e-05, + "loss": 4.8557, + "step": 37942 + }, + { + "epoch": 0.2256577695308783, + "grad_norm": 1.721623420715332, + "learning_rate": 4.397686146228829e-05, + "loss": 4.3423, + "step": 37943 + }, + { + "epoch": 0.22566371681415928, + "grad_norm": 1.6414244174957275, + "learning_rate": 4.397655737604692e-05, + "loss": 4.7495, + "step": 37944 + }, + { + "epoch": 0.2256696640974403, + "grad_norm": 1.493431806564331, + "learning_rate": 4.397625328318104e-05, + "loss": 4.8656, + "step": 37945 + }, + { + "epoch": 0.22567561138072129, + "grad_norm": 1.7861814498901367, + "learning_rate": 4.397594918369074e-05, + "loss": 4.9698, + "step": 37946 + }, + { + "epoch": 0.22568155866400227, + "grad_norm": 1.5873513221740723, + "learning_rate": 4.397564507757613e-05, + "loss": 4.822, + "step": 37947 + }, + { + "epoch": 0.2256875059472833, + "grad_norm": 1.4830878973007202, + "learning_rate": 4.397534096483732e-05, + "loss": 5.19, + "step": 37948 + }, + { + "epoch": 0.22569345323056428, + "grad_norm": 1.7204992771148682, + "learning_rate": 4.397503684547442e-05, + "loss": 5.1106, + "step": 37949 + }, + { + "epoch": 0.22569940051384527, + "grad_norm": 1.515528917312622, + "learning_rate": 4.3974732719487524e-05, + "loss": 5.2533, + "step": 37950 + }, + { + "epoch": 0.22570534779712628, + "grad_norm": 1.438448190689087, + "learning_rate": 4.3974428586876746e-05, + "loss": 5.2196, + "step": 37951 + }, + { + "epoch": 0.22571129508040727, + "grad_norm": 1.884359359741211, + "learning_rate": 4.397412444764219e-05, + "loss": 4.695, + "step": 37952 + }, + { + "epoch": 0.22571724236368826, + "grad_norm": 1.5510938167572021, + "learning_rate": 4.3973820301783965e-05, + "loss": 5.4799, + "step": 37953 + }, + { + "epoch": 0.22572318964696927, + "grad_norm": 1.3831311464309692, + "learning_rate": 4.397351614930217e-05, + "loss": 5.3733, + "step": 37954 + }, + { + "epoch": 0.22572913693025026, + "grad_norm": 1.7165378332138062, + "learning_rate": 4.397321199019693e-05, + "loss": 5.0023, + "step": 37955 + }, + { + "epoch": 0.22573508421353125, + "grad_norm": 1.287805199623108, + "learning_rate": 4.397290782446832e-05, + "loss": 5.0996, + "step": 37956 + }, + { + "epoch": 0.22574103149681227, + "grad_norm": 1.1919018030166626, + "learning_rate": 4.397260365211648e-05, + "loss": 5.4756, + "step": 37957 + }, + { + "epoch": 0.22574697878009325, + "grad_norm": 1.3540643453598022, + "learning_rate": 4.3972299473141494e-05, + "loss": 5.4296, + "step": 37958 + }, + { + "epoch": 0.22575292606337424, + "grad_norm": 1.3627208471298218, + "learning_rate": 4.397199528754348e-05, + "loss": 5.4029, + "step": 37959 + }, + { + "epoch": 0.22575887334665526, + "grad_norm": 1.599832534790039, + "learning_rate": 4.3971691095322526e-05, + "loss": 4.4081, + "step": 37960 + }, + { + "epoch": 0.22576482062993625, + "grad_norm": 1.4021176099777222, + "learning_rate": 4.397138689647876e-05, + "loss": 5.1567, + "step": 37961 + }, + { + "epoch": 0.22577076791321724, + "grad_norm": 1.4261466264724731, + "learning_rate": 4.3971082691012264e-05, + "loss": 5.1062, + "step": 37962 + }, + { + "epoch": 0.22577671519649825, + "grad_norm": 1.5316903591156006, + "learning_rate": 4.397077847892318e-05, + "loss": 5.1274, + "step": 37963 + }, + { + "epoch": 0.22578266247977924, + "grad_norm": 1.535687804222107, + "learning_rate": 4.397047426021158e-05, + "loss": 5.1904, + "step": 37964 + }, + { + "epoch": 0.22578860976306023, + "grad_norm": 1.502808928489685, + "learning_rate": 4.397017003487759e-05, + "loss": 5.0989, + "step": 37965 + }, + { + "epoch": 0.22579455704634124, + "grad_norm": 1.6442323923110962, + "learning_rate": 4.3969865802921306e-05, + "loss": 4.6965, + "step": 37966 + }, + { + "epoch": 0.22580050432962223, + "grad_norm": 1.5118539333343506, + "learning_rate": 4.3969561564342836e-05, + "loss": 5.1223, + "step": 37967 + }, + { + "epoch": 0.22580645161290322, + "grad_norm": 1.9736100435256958, + "learning_rate": 4.396925731914229e-05, + "loss": 5.608, + "step": 37968 + }, + { + "epoch": 0.22581239889618424, + "grad_norm": 1.6377054452896118, + "learning_rate": 4.3968953067319777e-05, + "loss": 5.5701, + "step": 37969 + }, + { + "epoch": 0.22581834617946522, + "grad_norm": 1.588903784751892, + "learning_rate": 4.396864880887539e-05, + "loss": 4.9223, + "step": 37970 + }, + { + "epoch": 0.2258242934627462, + "grad_norm": 1.8022443056106567, + "learning_rate": 4.396834454380925e-05, + "loss": 4.6535, + "step": 37971 + }, + { + "epoch": 0.22583024074602723, + "grad_norm": 1.7296810150146484, + "learning_rate": 4.3968040272121456e-05, + "loss": 4.9426, + "step": 37972 + }, + { + "epoch": 0.22583618802930822, + "grad_norm": 1.7298763990402222, + "learning_rate": 4.3967735993812114e-05, + "loss": 5.2988, + "step": 37973 + }, + { + "epoch": 0.2258421353125892, + "grad_norm": 1.6843442916870117, + "learning_rate": 4.396743170888133e-05, + "loss": 5.0539, + "step": 37974 + }, + { + "epoch": 0.22584808259587022, + "grad_norm": 1.5394823551177979, + "learning_rate": 4.396712741732921e-05, + "loss": 5.0311, + "step": 37975 + }, + { + "epoch": 0.2258540298791512, + "grad_norm": 1.322674036026001, + "learning_rate": 4.396682311915586e-05, + "loss": 4.8644, + "step": 37976 + }, + { + "epoch": 0.2258599771624322, + "grad_norm": 1.8961514234542847, + "learning_rate": 4.39665188143614e-05, + "loss": 4.2571, + "step": 37977 + }, + { + "epoch": 0.2258659244457132, + "grad_norm": 1.3595390319824219, + "learning_rate": 4.3966214502945915e-05, + "loss": 4.7363, + "step": 37978 + }, + { + "epoch": 0.2258718717289942, + "grad_norm": 1.3047689199447632, + "learning_rate": 4.396591018490953e-05, + "loss": 4.8923, + "step": 37979 + }, + { + "epoch": 0.2258778190122752, + "grad_norm": 1.4182853698730469, + "learning_rate": 4.396560586025233e-05, + "loss": 4.883, + "step": 37980 + }, + { + "epoch": 0.22588376629555618, + "grad_norm": 1.3140445947647095, + "learning_rate": 4.3965301528974434e-05, + "loss": 4.7423, + "step": 37981 + }, + { + "epoch": 0.2258897135788372, + "grad_norm": 1.4918787479400635, + "learning_rate": 4.396499719107595e-05, + "loss": 4.6526, + "step": 37982 + }, + { + "epoch": 0.22589566086211818, + "grad_norm": 1.5447934865951538, + "learning_rate": 4.396469284655699e-05, + "loss": 4.7521, + "step": 37983 + }, + { + "epoch": 0.22590160814539917, + "grad_norm": 1.2894394397735596, + "learning_rate": 4.396438849541764e-05, + "loss": 4.8453, + "step": 37984 + }, + { + "epoch": 0.22590755542868018, + "grad_norm": 1.7194790840148926, + "learning_rate": 4.396408413765802e-05, + "loss": 4.7401, + "step": 37985 + }, + { + "epoch": 0.22591350271196117, + "grad_norm": 1.5272841453552246, + "learning_rate": 4.3963779773278234e-05, + "loss": 4.652, + "step": 37986 + }, + { + "epoch": 0.22591944999524216, + "grad_norm": 1.9356179237365723, + "learning_rate": 4.3963475402278395e-05, + "loss": 4.2807, + "step": 37987 + }, + { + "epoch": 0.22592539727852318, + "grad_norm": 1.6426055431365967, + "learning_rate": 4.3963171024658586e-05, + "loss": 4.6845, + "step": 37988 + }, + { + "epoch": 0.22593134456180416, + "grad_norm": 1.666062593460083, + "learning_rate": 4.396286664041895e-05, + "loss": 4.8995, + "step": 37989 + }, + { + "epoch": 0.22593729184508515, + "grad_norm": 1.625488042831421, + "learning_rate": 4.3962562249559556e-05, + "loss": 4.9241, + "step": 37990 + }, + { + "epoch": 0.22594323912836617, + "grad_norm": 1.6422685384750366, + "learning_rate": 4.3962257852080545e-05, + "loss": 4.7108, + "step": 37991 + }, + { + "epoch": 0.22594918641164716, + "grad_norm": 1.582587718963623, + "learning_rate": 4.3961953447982e-05, + "loss": 4.7298, + "step": 37992 + }, + { + "epoch": 0.22595513369492815, + "grad_norm": 1.6675734519958496, + "learning_rate": 4.3961649037264025e-05, + "loss": 4.6544, + "step": 37993 + }, + { + "epoch": 0.22596108097820916, + "grad_norm": 1.6701778173446655, + "learning_rate": 4.3961344619926733e-05, + "loss": 4.1497, + "step": 37994 + }, + { + "epoch": 0.22596702826149015, + "grad_norm": 1.668684959411621, + "learning_rate": 4.396104019597024e-05, + "loss": 4.7531, + "step": 37995 + }, + { + "epoch": 0.22597297554477114, + "grad_norm": 1.6830800771713257, + "learning_rate": 4.396073576539465e-05, + "loss": 4.8988, + "step": 37996 + }, + { + "epoch": 0.22597892282805215, + "grad_norm": 1.5785146951675415, + "learning_rate": 4.3960431328200044e-05, + "loss": 4.868, + "step": 37997 + }, + { + "epoch": 0.22598487011133314, + "grad_norm": 1.5874582529067993, + "learning_rate": 4.396012688438656e-05, + "loss": 4.9942, + "step": 37998 + }, + { + "epoch": 0.22599081739461413, + "grad_norm": 1.4960954189300537, + "learning_rate": 4.395982243395429e-05, + "loss": 4.9711, + "step": 37999 + }, + { + "epoch": 0.22599676467789515, + "grad_norm": 1.6689503192901611, + "learning_rate": 4.3959517976903344e-05, + "loss": 4.6986, + "step": 38000 + }, + { + "epoch": 0.22600271196117613, + "grad_norm": 1.583571195602417, + "learning_rate": 4.3959213513233824e-05, + "loss": 4.9802, + "step": 38001 + }, + { + "epoch": 0.22600865924445712, + "grad_norm": 1.3584336042404175, + "learning_rate": 4.3958909042945826e-05, + "loss": 4.7263, + "step": 38002 + }, + { + "epoch": 0.22601460652773814, + "grad_norm": 1.5296635627746582, + "learning_rate": 4.3958604566039485e-05, + "loss": 4.7501, + "step": 38003 + }, + { + "epoch": 0.22602055381101913, + "grad_norm": 1.541918158531189, + "learning_rate": 4.395830008251489e-05, + "loss": 4.7631, + "step": 38004 + }, + { + "epoch": 0.22602650109430011, + "grad_norm": 1.5141816139221191, + "learning_rate": 4.395799559237214e-05, + "loss": 4.5206, + "step": 38005 + }, + { + "epoch": 0.22603244837758113, + "grad_norm": 2.4596874713897705, + "learning_rate": 4.395769109561136e-05, + "loss": 4.5737, + "step": 38006 + }, + { + "epoch": 0.22603839566086212, + "grad_norm": 1.8154200315475464, + "learning_rate": 4.395738659223264e-05, + "loss": 4.3293, + "step": 38007 + }, + { + "epoch": 0.2260443429441431, + "grad_norm": 1.490979790687561, + "learning_rate": 4.395708208223609e-05, + "loss": 4.7954, + "step": 38008 + }, + { + "epoch": 0.22605029022742412, + "grad_norm": 1.4490966796875, + "learning_rate": 4.395677756562182e-05, + "loss": 4.8062, + "step": 38009 + }, + { + "epoch": 0.2260562375107051, + "grad_norm": 1.597187876701355, + "learning_rate": 4.395647304238993e-05, + "loss": 4.7911, + "step": 38010 + }, + { + "epoch": 0.2260621847939861, + "grad_norm": 1.6224030256271362, + "learning_rate": 4.395616851254054e-05, + "loss": 4.8604, + "step": 38011 + }, + { + "epoch": 0.22606813207726711, + "grad_norm": 1.626530408859253, + "learning_rate": 4.3955863976073744e-05, + "loss": 4.7964, + "step": 38012 + }, + { + "epoch": 0.2260740793605481, + "grad_norm": 1.4275909662246704, + "learning_rate": 4.3955559432989654e-05, + "loss": 4.5712, + "step": 38013 + }, + { + "epoch": 0.2260800266438291, + "grad_norm": 1.3585506677627563, + "learning_rate": 4.3955254883288366e-05, + "loss": 4.5507, + "step": 38014 + }, + { + "epoch": 0.2260859739271101, + "grad_norm": 1.594292402267456, + "learning_rate": 4.395495032697e-05, + "loss": 4.664, + "step": 38015 + }, + { + "epoch": 0.2260919212103911, + "grad_norm": 1.5161908864974976, + "learning_rate": 4.3954645764034666e-05, + "loss": 4.6313, + "step": 38016 + }, + { + "epoch": 0.22609786849367208, + "grad_norm": 1.3704510927200317, + "learning_rate": 4.395434119448244e-05, + "loss": 4.6362, + "step": 38017 + }, + { + "epoch": 0.2261038157769531, + "grad_norm": 1.5801879167556763, + "learning_rate": 4.395403661831346e-05, + "loss": 4.5719, + "step": 38018 + }, + { + "epoch": 0.2261097630602341, + "grad_norm": 1.8262200355529785, + "learning_rate": 4.395373203552783e-05, + "loss": 4.9934, + "step": 38019 + }, + { + "epoch": 0.22611571034351508, + "grad_norm": 1.606871485710144, + "learning_rate": 4.395342744612564e-05, + "loss": 5.4006, + "step": 38020 + }, + { + "epoch": 0.2261216576267961, + "grad_norm": 1.888592004776001, + "learning_rate": 4.395312285010701e-05, + "loss": 5.181, + "step": 38021 + }, + { + "epoch": 0.22612760491007708, + "grad_norm": 1.703467845916748, + "learning_rate": 4.3952818247472025e-05, + "loss": 4.9956, + "step": 38022 + }, + { + "epoch": 0.22613355219335807, + "grad_norm": 1.8185619115829468, + "learning_rate": 4.3952513638220825e-05, + "loss": 4.193, + "step": 38023 + }, + { + "epoch": 0.22613949947663908, + "grad_norm": 1.9167721271514893, + "learning_rate": 4.395220902235349e-05, + "loss": 4.0476, + "step": 38024 + }, + { + "epoch": 0.22614544675992007, + "grad_norm": 1.4851292371749878, + "learning_rate": 4.395190439987014e-05, + "loss": 4.0727, + "step": 38025 + }, + { + "epoch": 0.22615139404320106, + "grad_norm": 2.399094581604004, + "learning_rate": 4.395159977077087e-05, + "loss": 4.1274, + "step": 38026 + }, + { + "epoch": 0.22615734132648208, + "grad_norm": 1.73356032371521, + "learning_rate": 4.395129513505579e-05, + "loss": 3.9943, + "step": 38027 + }, + { + "epoch": 0.22616328860976306, + "grad_norm": 1.5265543460845947, + "learning_rate": 4.395099049272501e-05, + "loss": 5.212, + "step": 38028 + }, + { + "epoch": 0.22616923589304405, + "grad_norm": 1.3660756349563599, + "learning_rate": 4.395068584377864e-05, + "loss": 5.3658, + "step": 38029 + }, + { + "epoch": 0.22617518317632507, + "grad_norm": 1.8043396472930908, + "learning_rate": 4.3950381188216786e-05, + "loss": 4.5968, + "step": 38030 + }, + { + "epoch": 0.22618113045960606, + "grad_norm": 1.3216902017593384, + "learning_rate": 4.395007652603954e-05, + "loss": 5.4593, + "step": 38031 + }, + { + "epoch": 0.22618707774288704, + "grad_norm": 1.3674670457839966, + "learning_rate": 4.394977185724701e-05, + "loss": 5.3019, + "step": 38032 + }, + { + "epoch": 0.22619302502616806, + "grad_norm": 1.3079349994659424, + "learning_rate": 4.3949467181839325e-05, + "loss": 5.3165, + "step": 38033 + }, + { + "epoch": 0.22619897230944905, + "grad_norm": 1.3201943635940552, + "learning_rate": 4.394916249981658e-05, + "loss": 5.1214, + "step": 38034 + }, + { + "epoch": 0.22620491959273004, + "grad_norm": 1.5240484476089478, + "learning_rate": 4.394885781117887e-05, + "loss": 4.6877, + "step": 38035 + }, + { + "epoch": 0.22621086687601105, + "grad_norm": 1.624353289604187, + "learning_rate": 4.3948553115926304e-05, + "loss": 4.9886, + "step": 38036 + }, + { + "epoch": 0.22621681415929204, + "grad_norm": 1.5491869449615479, + "learning_rate": 4.3948248414059004e-05, + "loss": 5.4594, + "step": 38037 + }, + { + "epoch": 0.22622276144257303, + "grad_norm": 1.5223517417907715, + "learning_rate": 4.394794370557706e-05, + "loss": 5.3304, + "step": 38038 + }, + { + "epoch": 0.22622870872585402, + "grad_norm": 1.1592520475387573, + "learning_rate": 4.394763899048059e-05, + "loss": 4.9766, + "step": 38039 + }, + { + "epoch": 0.22623465600913503, + "grad_norm": 1.6094484329223633, + "learning_rate": 4.394733426876969e-05, + "loss": 4.6776, + "step": 38040 + }, + { + "epoch": 0.22624060329241602, + "grad_norm": 1.5425121784210205, + "learning_rate": 4.394702954044447e-05, + "loss": 5.4675, + "step": 38041 + }, + { + "epoch": 0.226246550575697, + "grad_norm": 1.905106544494629, + "learning_rate": 4.3946724805505046e-05, + "loss": 4.2846, + "step": 38042 + }, + { + "epoch": 0.22625249785897802, + "grad_norm": 1.6905035972595215, + "learning_rate": 4.394642006395151e-05, + "loss": 5.0805, + "step": 38043 + }, + { + "epoch": 0.226258445142259, + "grad_norm": 1.8033732175827026, + "learning_rate": 4.3946115315783976e-05, + "loss": 4.765, + "step": 38044 + }, + { + "epoch": 0.22626439242554, + "grad_norm": 1.8025847673416138, + "learning_rate": 4.394581056100255e-05, + "loss": 5.1845, + "step": 38045 + }, + { + "epoch": 0.22627033970882102, + "grad_norm": 1.5593371391296387, + "learning_rate": 4.394550579960734e-05, + "loss": 4.9863, + "step": 38046 + }, + { + "epoch": 0.226276286992102, + "grad_norm": 1.5229204893112183, + "learning_rate": 4.394520103159844e-05, + "loss": 4.9358, + "step": 38047 + }, + { + "epoch": 0.226282234275383, + "grad_norm": 2.367879629135132, + "learning_rate": 4.3944896256975975e-05, + "loss": 3.7138, + "step": 38048 + }, + { + "epoch": 0.226288181558664, + "grad_norm": 2.867363452911377, + "learning_rate": 4.394459147574004e-05, + "loss": 2.8982, + "step": 38049 + }, + { + "epoch": 0.226294128841945, + "grad_norm": 2.827266216278076, + "learning_rate": 4.394428668789074e-05, + "loss": 2.8304, + "step": 38050 + }, + { + "epoch": 0.22630007612522599, + "grad_norm": 2.5809528827667236, + "learning_rate": 4.394398189342819e-05, + "loss": 3.289, + "step": 38051 + }, + { + "epoch": 0.226306023408507, + "grad_norm": 2.0330867767333984, + "learning_rate": 4.3943677092352485e-05, + "loss": 4.059, + "step": 38052 + }, + { + "epoch": 0.226311970691788, + "grad_norm": 2.52701997756958, + "learning_rate": 4.3943372284663745e-05, + "loss": 3.8987, + "step": 38053 + }, + { + "epoch": 0.22631791797506898, + "grad_norm": 2.8097949028015137, + "learning_rate": 4.3943067470362064e-05, + "loss": 3.6962, + "step": 38054 + }, + { + "epoch": 0.22632386525835, + "grad_norm": 2.9416728019714355, + "learning_rate": 4.394276264944757e-05, + "loss": 3.5676, + "step": 38055 + }, + { + "epoch": 0.22632981254163098, + "grad_norm": 2.539630174636841, + "learning_rate": 4.394245782192033e-05, + "loss": 3.1762, + "step": 38056 + }, + { + "epoch": 0.22633575982491197, + "grad_norm": 2.8900463581085205, + "learning_rate": 4.3942152987780485e-05, + "loss": 3.4844, + "step": 38057 + }, + { + "epoch": 0.22634170710819299, + "grad_norm": 2.119063377380371, + "learning_rate": 4.394184814702813e-05, + "loss": 4.0965, + "step": 38058 + }, + { + "epoch": 0.22634765439147397, + "grad_norm": 2.483552932739258, + "learning_rate": 4.394154329966337e-05, + "loss": 3.4461, + "step": 38059 + }, + { + "epoch": 0.22635360167475496, + "grad_norm": 2.3346595764160156, + "learning_rate": 4.394123844568632e-05, + "loss": 3.4765, + "step": 38060 + }, + { + "epoch": 0.22635954895803598, + "grad_norm": 2.0471270084381104, + "learning_rate": 4.394093358509706e-05, + "loss": 4.2491, + "step": 38061 + }, + { + "epoch": 0.22636549624131697, + "grad_norm": 1.5037319660186768, + "learning_rate": 4.3940628717895735e-05, + "loss": 5.1251, + "step": 38062 + }, + { + "epoch": 0.22637144352459795, + "grad_norm": 1.6269645690917969, + "learning_rate": 4.3940323844082426e-05, + "loss": 5.2124, + "step": 38063 + }, + { + "epoch": 0.22637739080787897, + "grad_norm": 2.6097071170806885, + "learning_rate": 4.3940018963657246e-05, + "loss": 3.2288, + "step": 38064 + }, + { + "epoch": 0.22638333809115996, + "grad_norm": 1.9301677942276, + "learning_rate": 4.39397140766203e-05, + "loss": 4.0461, + "step": 38065 + }, + { + "epoch": 0.22638928537444095, + "grad_norm": 1.5494807958602905, + "learning_rate": 4.39394091829717e-05, + "loss": 4.7948, + "step": 38066 + }, + { + "epoch": 0.22639523265772196, + "grad_norm": 1.5757453441619873, + "learning_rate": 4.393910428271154e-05, + "loss": 4.7356, + "step": 38067 + }, + { + "epoch": 0.22640117994100295, + "grad_norm": 1.5312561988830566, + "learning_rate": 4.393879937583994e-05, + "loss": 4.5272, + "step": 38068 + }, + { + "epoch": 0.22640712722428394, + "grad_norm": 1.6051007509231567, + "learning_rate": 4.3938494462356996e-05, + "loss": 4.8203, + "step": 38069 + }, + { + "epoch": 0.22641307450756495, + "grad_norm": 1.6052272319793701, + "learning_rate": 4.3938189542262824e-05, + "loss": 4.7911, + "step": 38070 + }, + { + "epoch": 0.22641902179084594, + "grad_norm": 1.5340666770935059, + "learning_rate": 4.3937884615557526e-05, + "loss": 5.0973, + "step": 38071 + }, + { + "epoch": 0.22642496907412693, + "grad_norm": 1.798746109008789, + "learning_rate": 4.3937579682241204e-05, + "loss": 5.0053, + "step": 38072 + }, + { + "epoch": 0.22643091635740795, + "grad_norm": 1.636568307876587, + "learning_rate": 4.393727474231397e-05, + "loss": 5.0263, + "step": 38073 + }, + { + "epoch": 0.22643686364068893, + "grad_norm": 1.5352871417999268, + "learning_rate": 4.393696979577593e-05, + "loss": 4.991, + "step": 38074 + }, + { + "epoch": 0.22644281092396992, + "grad_norm": 1.6464602947235107, + "learning_rate": 4.3936664842627194e-05, + "loss": 4.8768, + "step": 38075 + }, + { + "epoch": 0.22644875820725094, + "grad_norm": 1.8451437950134277, + "learning_rate": 4.393635988286786e-05, + "loss": 4.2802, + "step": 38076 + }, + { + "epoch": 0.22645470549053193, + "grad_norm": 1.7521929740905762, + "learning_rate": 4.393605491649804e-05, + "loss": 5.1212, + "step": 38077 + }, + { + "epoch": 0.22646065277381291, + "grad_norm": 1.8951425552368164, + "learning_rate": 4.3935749943517834e-05, + "loss": 5.2989, + "step": 38078 + }, + { + "epoch": 0.22646660005709393, + "grad_norm": 1.7104054689407349, + "learning_rate": 4.393544496392735e-05, + "loss": 4.3321, + "step": 38079 + }, + { + "epoch": 0.22647254734037492, + "grad_norm": 1.6039187908172607, + "learning_rate": 4.39351399777267e-05, + "loss": 5.1924, + "step": 38080 + }, + { + "epoch": 0.2264784946236559, + "grad_norm": 1.9055510759353638, + "learning_rate": 4.3934834984916004e-05, + "loss": 4.526, + "step": 38081 + }, + { + "epoch": 0.22648444190693692, + "grad_norm": 1.8152254819869995, + "learning_rate": 4.393452998549534e-05, + "loss": 4.6742, + "step": 38082 + }, + { + "epoch": 0.2264903891902179, + "grad_norm": 2.2788617610931396, + "learning_rate": 4.393422497946482e-05, + "loss": 3.7998, + "step": 38083 + }, + { + "epoch": 0.2264963364734989, + "grad_norm": 2.635610342025757, + "learning_rate": 4.393391996682456e-05, + "loss": 2.8398, + "step": 38084 + }, + { + "epoch": 0.22650228375677992, + "grad_norm": 2.5307788848876953, + "learning_rate": 4.393361494757468e-05, + "loss": 2.9038, + "step": 38085 + }, + { + "epoch": 0.2265082310400609, + "grad_norm": 2.791041612625122, + "learning_rate": 4.393330992171526e-05, + "loss": 2.8516, + "step": 38086 + }, + { + "epoch": 0.2265141783233419, + "grad_norm": 1.9251587390899658, + "learning_rate": 4.393300488924642e-05, + "loss": 3.5744, + "step": 38087 + }, + { + "epoch": 0.2265201256066229, + "grad_norm": 2.3512256145477295, + "learning_rate": 4.3932699850168254e-05, + "loss": 3.1597, + "step": 38088 + }, + { + "epoch": 0.2265260728899039, + "grad_norm": 2.621535539627075, + "learning_rate": 4.3932394804480895e-05, + "loss": 2.7866, + "step": 38089 + }, + { + "epoch": 0.22653202017318488, + "grad_norm": 2.6089329719543457, + "learning_rate": 4.3932089752184416e-05, + "loss": 2.9403, + "step": 38090 + }, + { + "epoch": 0.2265379674564659, + "grad_norm": 3.0819029808044434, + "learning_rate": 4.393178469327895e-05, + "loss": 2.8429, + "step": 38091 + }, + { + "epoch": 0.2265439147397469, + "grad_norm": 2.8655409812927246, + "learning_rate": 4.3931479627764585e-05, + "loss": 3.2155, + "step": 38092 + }, + { + "epoch": 0.22654986202302788, + "grad_norm": 2.843642473220825, + "learning_rate": 4.393117455564145e-05, + "loss": 2.8215, + "step": 38093 + }, + { + "epoch": 0.2265558093063089, + "grad_norm": 2.7354695796966553, + "learning_rate": 4.393086947690963e-05, + "loss": 3.461, + "step": 38094 + }, + { + "epoch": 0.22656175658958988, + "grad_norm": 2.8338818550109863, + "learning_rate": 4.393056439156923e-05, + "loss": 2.9972, + "step": 38095 + }, + { + "epoch": 0.22656770387287087, + "grad_norm": 2.944443464279175, + "learning_rate": 4.393025929962037e-05, + "loss": 2.1083, + "step": 38096 + }, + { + "epoch": 0.22657365115615186, + "grad_norm": 1.9833786487579346, + "learning_rate": 4.392995420106316e-05, + "loss": 2.2152, + "step": 38097 + }, + { + "epoch": 0.22657959843943287, + "grad_norm": 3.4136886596679688, + "learning_rate": 4.39296490958977e-05, + "loss": 2.3506, + "step": 38098 + }, + { + "epoch": 0.22658554572271386, + "grad_norm": 3.2970211505889893, + "learning_rate": 4.392934398412408e-05, + "loss": 2.144, + "step": 38099 + }, + { + "epoch": 0.22659149300599485, + "grad_norm": 2.7194559574127197, + "learning_rate": 4.392903886574243e-05, + "loss": 1.1954, + "step": 38100 + }, + { + "epoch": 0.22659744028927586, + "grad_norm": 2.454838991165161, + "learning_rate": 4.392873374075286e-05, + "loss": 0.8838, + "step": 38101 + }, + { + "epoch": 0.22660338757255685, + "grad_norm": 4.048086166381836, + "learning_rate": 4.392842860915545e-05, + "loss": 3.6415, + "step": 38102 + }, + { + "epoch": 0.22660933485583784, + "grad_norm": 2.8631486892700195, + "learning_rate": 4.392812347095032e-05, + "loss": 2.3429, + "step": 38103 + }, + { + "epoch": 0.22661528213911886, + "grad_norm": 3.0390071868896484, + "learning_rate": 4.392781832613758e-05, + "loss": 3.7672, + "step": 38104 + }, + { + "epoch": 0.22662122942239984, + "grad_norm": 3.5983376502990723, + "learning_rate": 4.392751317471734e-05, + "loss": 3.4215, + "step": 38105 + }, + { + "epoch": 0.22662717670568083, + "grad_norm": 2.0240063667297363, + "learning_rate": 4.3927208016689704e-05, + "loss": 4.8848, + "step": 38106 + }, + { + "epoch": 0.22663312398896185, + "grad_norm": 1.9214985370635986, + "learning_rate": 4.392690285205476e-05, + "loss": 5.3785, + "step": 38107 + }, + { + "epoch": 0.22663907127224284, + "grad_norm": 1.5077999830245972, + "learning_rate": 4.3926597680812644e-05, + "loss": 5.3017, + "step": 38108 + }, + { + "epoch": 0.22664501855552383, + "grad_norm": 1.9332648515701294, + "learning_rate": 4.392629250296344e-05, + "loss": 4.5846, + "step": 38109 + }, + { + "epoch": 0.22665096583880484, + "grad_norm": 1.925647258758545, + "learning_rate": 4.392598731850727e-05, + "loss": 4.1814, + "step": 38110 + }, + { + "epoch": 0.22665691312208583, + "grad_norm": 1.6901360750198364, + "learning_rate": 4.392568212744422e-05, + "loss": 5.4319, + "step": 38111 + }, + { + "epoch": 0.22666286040536682, + "grad_norm": 1.7555872201919556, + "learning_rate": 4.392537692977442e-05, + "loss": 5.2164, + "step": 38112 + }, + { + "epoch": 0.22666880768864783, + "grad_norm": 2.7938854694366455, + "learning_rate": 4.392507172549797e-05, + "loss": 3.4674, + "step": 38113 + }, + { + "epoch": 0.22667475497192882, + "grad_norm": 2.125744342803955, + "learning_rate": 4.3924766514614966e-05, + "loss": 3.5049, + "step": 38114 + }, + { + "epoch": 0.2266807022552098, + "grad_norm": 4.356492042541504, + "learning_rate": 4.392446129712552e-05, + "loss": 1.4819, + "step": 38115 + }, + { + "epoch": 0.22668664953849083, + "grad_norm": 2.790475845336914, + "learning_rate": 4.3924156073029746e-05, + "loss": 2.8582, + "step": 38116 + }, + { + "epoch": 0.2266925968217718, + "grad_norm": 1.6480002403259277, + "learning_rate": 4.392385084232774e-05, + "loss": 4.897, + "step": 38117 + }, + { + "epoch": 0.2266985441050528, + "grad_norm": 1.56727135181427, + "learning_rate": 4.392354560501962e-05, + "loss": 4.431, + "step": 38118 + }, + { + "epoch": 0.22670449138833382, + "grad_norm": 2.931684970855713, + "learning_rate": 4.3923240361105476e-05, + "loss": 1.9593, + "step": 38119 + }, + { + "epoch": 0.2267104386716148, + "grad_norm": 2.1170263290405273, + "learning_rate": 4.3922935110585425e-05, + "loss": 4.5195, + "step": 38120 + }, + { + "epoch": 0.2267163859548958, + "grad_norm": 2.30966854095459, + "learning_rate": 4.392262985345959e-05, + "loss": 4.8419, + "step": 38121 + }, + { + "epoch": 0.2267223332381768, + "grad_norm": 1.6240100860595703, + "learning_rate": 4.3922324589728045e-05, + "loss": 4.95, + "step": 38122 + }, + { + "epoch": 0.2267282805214578, + "grad_norm": 2.074064254760742, + "learning_rate": 4.392201931939091e-05, + "loss": 4.658, + "step": 38123 + }, + { + "epoch": 0.22673422780473879, + "grad_norm": 2.826097011566162, + "learning_rate": 4.3921714042448306e-05, + "loss": 4.3729, + "step": 38124 + }, + { + "epoch": 0.2267401750880198, + "grad_norm": 2.2940187454223633, + "learning_rate": 4.392140875890032e-05, + "loss": 4.7278, + "step": 38125 + }, + { + "epoch": 0.2267461223713008, + "grad_norm": 2.0838935375213623, + "learning_rate": 4.392110346874706e-05, + "loss": 4.8679, + "step": 38126 + }, + { + "epoch": 0.22675206965458178, + "grad_norm": 2.258232593536377, + "learning_rate": 4.392079817198864e-05, + "loss": 4.3999, + "step": 38127 + }, + { + "epoch": 0.2267580169378628, + "grad_norm": 1.7443938255310059, + "learning_rate": 4.392049286862517e-05, + "loss": 4.5488, + "step": 38128 + }, + { + "epoch": 0.22676396422114378, + "grad_norm": 1.6828581094741821, + "learning_rate": 4.392018755865674e-05, + "loss": 4.7467, + "step": 38129 + }, + { + "epoch": 0.22676991150442477, + "grad_norm": 2.271589994430542, + "learning_rate": 4.391988224208349e-05, + "loss": 4.4422, + "step": 38130 + }, + { + "epoch": 0.2267758587877058, + "grad_norm": 3.1619069576263428, + "learning_rate": 4.3919576918905495e-05, + "loss": 4.1281, + "step": 38131 + }, + { + "epoch": 0.22678180607098677, + "grad_norm": 2.716963529586792, + "learning_rate": 4.391927158912286e-05, + "loss": 4.0466, + "step": 38132 + }, + { + "epoch": 0.22678775335426776, + "grad_norm": 2.644996166229248, + "learning_rate": 4.391896625273572e-05, + "loss": 4.0363, + "step": 38133 + }, + { + "epoch": 0.22679370063754878, + "grad_norm": 2.2756507396698, + "learning_rate": 4.391866090974415e-05, + "loss": 4.0494, + "step": 38134 + }, + { + "epoch": 0.22679964792082977, + "grad_norm": 2.7238388061523438, + "learning_rate": 4.3918355560148275e-05, + "loss": 3.9859, + "step": 38135 + }, + { + "epoch": 0.22680559520411075, + "grad_norm": 2.129974842071533, + "learning_rate": 4.39180502039482e-05, + "loss": 3.8574, + "step": 38136 + }, + { + "epoch": 0.22681154248739177, + "grad_norm": 2.485243558883667, + "learning_rate": 4.3917744841144024e-05, + "loss": 3.6808, + "step": 38137 + }, + { + "epoch": 0.22681748977067276, + "grad_norm": 2.380950927734375, + "learning_rate": 4.3917439471735865e-05, + "loss": 3.6885, + "step": 38138 + }, + { + "epoch": 0.22682343705395375, + "grad_norm": 2.5191190242767334, + "learning_rate": 4.391713409572382e-05, + "loss": 3.3835, + "step": 38139 + }, + { + "epoch": 0.22682938433723476, + "grad_norm": 2.4364421367645264, + "learning_rate": 4.3916828713108e-05, + "loss": 3.6194, + "step": 38140 + }, + { + "epoch": 0.22683533162051575, + "grad_norm": 2.982856512069702, + "learning_rate": 4.391652332388851e-05, + "loss": 3.4352, + "step": 38141 + }, + { + "epoch": 0.22684127890379674, + "grad_norm": 3.9172885417938232, + "learning_rate": 4.391621792806546e-05, + "loss": 2.2642, + "step": 38142 + }, + { + "epoch": 0.22684722618707776, + "grad_norm": 2.8479931354522705, + "learning_rate": 4.3915912525638944e-05, + "loss": 3.3206, + "step": 38143 + }, + { + "epoch": 0.22685317347035874, + "grad_norm": 1.826278567314148, + "learning_rate": 4.391560711660909e-05, + "loss": 4.1807, + "step": 38144 + }, + { + "epoch": 0.22685912075363973, + "grad_norm": 2.9397122859954834, + "learning_rate": 4.3915301700975985e-05, + "loss": 3.0936, + "step": 38145 + }, + { + "epoch": 0.22686506803692075, + "grad_norm": 3.786853313446045, + "learning_rate": 4.391499627873975e-05, + "loss": 2.7473, + "step": 38146 + }, + { + "epoch": 0.22687101532020174, + "grad_norm": 3.610485792160034, + "learning_rate": 4.391469084990049e-05, + "loss": 2.7369, + "step": 38147 + }, + { + "epoch": 0.22687696260348272, + "grad_norm": 3.4012134075164795, + "learning_rate": 4.391438541445829e-05, + "loss": 2.8883, + "step": 38148 + }, + { + "epoch": 0.22688290988676374, + "grad_norm": 2.813325881958008, + "learning_rate": 4.3914079972413294e-05, + "loss": 4.0143, + "step": 38149 + }, + { + "epoch": 0.22688885717004473, + "grad_norm": 1.4918662309646606, + "learning_rate": 4.391377452376557e-05, + "loss": 5.3134, + "step": 38150 + }, + { + "epoch": 0.22689480445332572, + "grad_norm": 2.268364906311035, + "learning_rate": 4.391346906851524e-05, + "loss": 4.001, + "step": 38151 + }, + { + "epoch": 0.22690075173660673, + "grad_norm": 2.87863826751709, + "learning_rate": 4.3913163606662436e-05, + "loss": 2.2376, + "step": 38152 + }, + { + "epoch": 0.22690669901988772, + "grad_norm": 2.252005100250244, + "learning_rate": 4.3912858138207225e-05, + "loss": 2.9713, + "step": 38153 + }, + { + "epoch": 0.2269126463031687, + "grad_norm": 1.636132836341858, + "learning_rate": 4.391255266314973e-05, + "loss": 4.9184, + "step": 38154 + }, + { + "epoch": 0.2269185935864497, + "grad_norm": 1.5946236848831177, + "learning_rate": 4.391224718149007e-05, + "loss": 4.8575, + "step": 38155 + }, + { + "epoch": 0.2269245408697307, + "grad_norm": 1.6881130933761597, + "learning_rate": 4.391194169322833e-05, + "loss": 5.0459, + "step": 38156 + }, + { + "epoch": 0.2269304881530117, + "grad_norm": 1.6183825731277466, + "learning_rate": 4.3911636198364627e-05, + "loss": 4.854, + "step": 38157 + }, + { + "epoch": 0.2269364354362927, + "grad_norm": 1.6183366775512695, + "learning_rate": 4.391133069689907e-05, + "loss": 4.4129, + "step": 38158 + }, + { + "epoch": 0.2269423827195737, + "grad_norm": 1.561302900314331, + "learning_rate": 4.391102518883177e-05, + "loss": 5.1313, + "step": 38159 + }, + { + "epoch": 0.2269483300028547, + "grad_norm": 1.5048106908798218, + "learning_rate": 4.391071967416282e-05, + "loss": 4.7604, + "step": 38160 + }, + { + "epoch": 0.22695427728613568, + "grad_norm": 1.5101484060287476, + "learning_rate": 4.3910414152892324e-05, + "loss": 5.1585, + "step": 38161 + }, + { + "epoch": 0.2269602245694167, + "grad_norm": 1.4586681127548218, + "learning_rate": 4.391010862502041e-05, + "loss": 4.821, + "step": 38162 + }, + { + "epoch": 0.22696617185269768, + "grad_norm": 1.507530927658081, + "learning_rate": 4.3909803090547164e-05, + "loss": 4.8342, + "step": 38163 + }, + { + "epoch": 0.22697211913597867, + "grad_norm": 1.4657213687896729, + "learning_rate": 4.390949754947271e-05, + "loss": 5.2703, + "step": 38164 + }, + { + "epoch": 0.2269780664192597, + "grad_norm": 1.2653342485427856, + "learning_rate": 4.390919200179714e-05, + "loss": 5.6871, + "step": 38165 + }, + { + "epoch": 0.22698401370254068, + "grad_norm": 1.371896505355835, + "learning_rate": 4.3908886447520565e-05, + "loss": 5.7635, + "step": 38166 + }, + { + "epoch": 0.22698996098582166, + "grad_norm": 1.38970947265625, + "learning_rate": 4.390858088664309e-05, + "loss": 5.5381, + "step": 38167 + }, + { + "epoch": 0.22699590826910268, + "grad_norm": 1.649964451789856, + "learning_rate": 4.390827531916484e-05, + "loss": 4.0239, + "step": 38168 + }, + { + "epoch": 0.22700185555238367, + "grad_norm": 1.8974918127059937, + "learning_rate": 4.390796974508589e-05, + "loss": 4.1992, + "step": 38169 + }, + { + "epoch": 0.22700780283566466, + "grad_norm": 1.6561975479125977, + "learning_rate": 4.390766416440637e-05, + "loss": 5.049, + "step": 38170 + }, + { + "epoch": 0.22701375011894567, + "grad_norm": 1.6487138271331787, + "learning_rate": 4.390735857712638e-05, + "loss": 4.1484, + "step": 38171 + }, + { + "epoch": 0.22701969740222666, + "grad_norm": 1.9046709537506104, + "learning_rate": 4.3907052983246024e-05, + "loss": 4.2686, + "step": 38172 + }, + { + "epoch": 0.22702564468550765, + "grad_norm": 1.9332301616668701, + "learning_rate": 4.390674738276542e-05, + "loss": 4.2826, + "step": 38173 + }, + { + "epoch": 0.22703159196878867, + "grad_norm": 1.6239649057388306, + "learning_rate": 4.3906441775684654e-05, + "loss": 3.9571, + "step": 38174 + }, + { + "epoch": 0.22703753925206965, + "grad_norm": 1.541169285774231, + "learning_rate": 4.3906136162003855e-05, + "loss": 4.5711, + "step": 38175 + }, + { + "epoch": 0.22704348653535064, + "grad_norm": 1.7647597789764404, + "learning_rate": 4.390583054172311e-05, + "loss": 4.8254, + "step": 38176 + }, + { + "epoch": 0.22704943381863166, + "grad_norm": 1.7445447444915771, + "learning_rate": 4.3905524914842536e-05, + "loss": 4.2946, + "step": 38177 + }, + { + "epoch": 0.22705538110191265, + "grad_norm": 1.5402419567108154, + "learning_rate": 4.390521928136224e-05, + "loss": 3.6745, + "step": 38178 + }, + { + "epoch": 0.22706132838519363, + "grad_norm": 1.652811884880066, + "learning_rate": 4.390491364128233e-05, + "loss": 3.652, + "step": 38179 + }, + { + "epoch": 0.22706727566847465, + "grad_norm": 1.7374279499053955, + "learning_rate": 4.390460799460291e-05, + "loss": 3.859, + "step": 38180 + }, + { + "epoch": 0.22707322295175564, + "grad_norm": 1.6328239440917969, + "learning_rate": 4.390430234132408e-05, + "loss": 3.5833, + "step": 38181 + }, + { + "epoch": 0.22707917023503663, + "grad_norm": 1.8972036838531494, + "learning_rate": 4.390399668144596e-05, + "loss": 3.6967, + "step": 38182 + }, + { + "epoch": 0.22708511751831764, + "grad_norm": 1.620894432067871, + "learning_rate": 4.390369101496865e-05, + "loss": 3.6897, + "step": 38183 + }, + { + "epoch": 0.22709106480159863, + "grad_norm": 1.7540589570999146, + "learning_rate": 4.3903385341892255e-05, + "loss": 3.7719, + "step": 38184 + }, + { + "epoch": 0.22709701208487962, + "grad_norm": 1.9118608236312866, + "learning_rate": 4.390307966221688e-05, + "loss": 4.484, + "step": 38185 + }, + { + "epoch": 0.22710295936816063, + "grad_norm": 1.9014246463775635, + "learning_rate": 4.390277397594264e-05, + "loss": 4.0257, + "step": 38186 + }, + { + "epoch": 0.22710890665144162, + "grad_norm": 2.3183906078338623, + "learning_rate": 4.3902468283069644e-05, + "loss": 3.8989, + "step": 38187 + }, + { + "epoch": 0.2271148539347226, + "grad_norm": 2.24916934967041, + "learning_rate": 4.390216258359798e-05, + "loss": 4.0533, + "step": 38188 + }, + { + "epoch": 0.22712080121800363, + "grad_norm": 2.352621555328369, + "learning_rate": 4.390185687752777e-05, + "loss": 3.9287, + "step": 38189 + }, + { + "epoch": 0.22712674850128461, + "grad_norm": 2.341827154159546, + "learning_rate": 4.390155116485911e-05, + "loss": 4.0099, + "step": 38190 + }, + { + "epoch": 0.2271326957845656, + "grad_norm": 1.7769436836242676, + "learning_rate": 4.3901245445592126e-05, + "loss": 4.5457, + "step": 38191 + }, + { + "epoch": 0.22713864306784662, + "grad_norm": 1.550155520439148, + "learning_rate": 4.390093971972691e-05, + "loss": 4.5249, + "step": 38192 + }, + { + "epoch": 0.2271445903511276, + "grad_norm": 1.677330493927002, + "learning_rate": 4.390063398726356e-05, + "loss": 5.328, + "step": 38193 + }, + { + "epoch": 0.2271505376344086, + "grad_norm": 1.6579426527023315, + "learning_rate": 4.39003282482022e-05, + "loss": 5.6862, + "step": 38194 + }, + { + "epoch": 0.2271564849176896, + "grad_norm": 1.6280534267425537, + "learning_rate": 4.3900022502542937e-05, + "loss": 4.9508, + "step": 38195 + }, + { + "epoch": 0.2271624322009706, + "grad_norm": 1.763191819190979, + "learning_rate": 4.389971675028587e-05, + "loss": 4.379, + "step": 38196 + }, + { + "epoch": 0.2271683794842516, + "grad_norm": 1.6606146097183228, + "learning_rate": 4.38994109914311e-05, + "loss": 4.5277, + "step": 38197 + }, + { + "epoch": 0.2271743267675326, + "grad_norm": 1.5619972944259644, + "learning_rate": 4.3899105225978756e-05, + "loss": 4.8777, + "step": 38198 + }, + { + "epoch": 0.2271802740508136, + "grad_norm": 1.8204611539840698, + "learning_rate": 4.389879945392891e-05, + "loss": 4.5978, + "step": 38199 + }, + { + "epoch": 0.22718622133409458, + "grad_norm": 1.9288609027862549, + "learning_rate": 4.38984936752817e-05, + "loss": 4.2724, + "step": 38200 + }, + { + "epoch": 0.2271921686173756, + "grad_norm": 1.7813549041748047, + "learning_rate": 4.389818789003722e-05, + "loss": 4.5287, + "step": 38201 + }, + { + "epoch": 0.22719811590065658, + "grad_norm": 1.5712015628814697, + "learning_rate": 4.389788209819558e-05, + "loss": 4.9682, + "step": 38202 + }, + { + "epoch": 0.22720406318393757, + "grad_norm": 1.4615222215652466, + "learning_rate": 4.3897576299756875e-05, + "loss": 4.6088, + "step": 38203 + }, + { + "epoch": 0.2272100104672186, + "grad_norm": 1.755202293395996, + "learning_rate": 4.389727049472123e-05, + "loss": 4.2747, + "step": 38204 + }, + { + "epoch": 0.22721595775049958, + "grad_norm": 2.1010637283325195, + "learning_rate": 4.3896964683088736e-05, + "loss": 4.1787, + "step": 38205 + }, + { + "epoch": 0.22722190503378056, + "grad_norm": 1.8035705089569092, + "learning_rate": 4.3896658864859516e-05, + "loss": 4.1858, + "step": 38206 + }, + { + "epoch": 0.22722785231706158, + "grad_norm": 1.9293487071990967, + "learning_rate": 4.3896353040033664e-05, + "loss": 4.3255, + "step": 38207 + }, + { + "epoch": 0.22723379960034257, + "grad_norm": 2.041259527206421, + "learning_rate": 4.389604720861128e-05, + "loss": 4.223, + "step": 38208 + }, + { + "epoch": 0.22723974688362356, + "grad_norm": 1.932900309562683, + "learning_rate": 4.389574137059249e-05, + "loss": 4.4073, + "step": 38209 + }, + { + "epoch": 0.22724569416690457, + "grad_norm": 1.8900656700134277, + "learning_rate": 4.38954355259774e-05, + "loss": 4.6547, + "step": 38210 + }, + { + "epoch": 0.22725164145018556, + "grad_norm": 1.7553825378417969, + "learning_rate": 4.38951296747661e-05, + "loss": 4.5234, + "step": 38211 + }, + { + "epoch": 0.22725758873346655, + "grad_norm": 1.724029779434204, + "learning_rate": 4.3894823816958705e-05, + "loss": 4.1834, + "step": 38212 + }, + { + "epoch": 0.22726353601674754, + "grad_norm": 1.871026873588562, + "learning_rate": 4.389451795255533e-05, + "loss": 4.1964, + "step": 38213 + }, + { + "epoch": 0.22726948330002855, + "grad_norm": 1.8918932676315308, + "learning_rate": 4.389421208155606e-05, + "loss": 4.2043, + "step": 38214 + }, + { + "epoch": 0.22727543058330954, + "grad_norm": 1.5714900493621826, + "learning_rate": 4.389390620396102e-05, + "loss": 4.5157, + "step": 38215 + }, + { + "epoch": 0.22728137786659053, + "grad_norm": 1.7289087772369385, + "learning_rate": 4.389360031977032e-05, + "loss": 4.8163, + "step": 38216 + }, + { + "epoch": 0.22728732514987154, + "grad_norm": 1.7521272897720337, + "learning_rate": 4.389329442898406e-05, + "loss": 4.6082, + "step": 38217 + }, + { + "epoch": 0.22729327243315253, + "grad_norm": 1.6807644367218018, + "learning_rate": 4.389298853160234e-05, + "loss": 4.7829, + "step": 38218 + }, + { + "epoch": 0.22729921971643352, + "grad_norm": 1.5811291933059692, + "learning_rate": 4.389268262762527e-05, + "loss": 4.2259, + "step": 38219 + }, + { + "epoch": 0.22730516699971454, + "grad_norm": 1.6493738889694214, + "learning_rate": 4.3892376717052964e-05, + "loss": 4.7039, + "step": 38220 + }, + { + "epoch": 0.22731111428299552, + "grad_norm": 1.739897608757019, + "learning_rate": 4.389207079988552e-05, + "loss": 4.2059, + "step": 38221 + }, + { + "epoch": 0.2273170615662765, + "grad_norm": 1.8707149028778076, + "learning_rate": 4.389176487612306e-05, + "loss": 4.0741, + "step": 38222 + }, + { + "epoch": 0.22732300884955753, + "grad_norm": 1.7619922161102295, + "learning_rate": 4.389145894576567e-05, + "loss": 4.796, + "step": 38223 + }, + { + "epoch": 0.22732895613283852, + "grad_norm": 1.7651190757751465, + "learning_rate": 4.389115300881347e-05, + "loss": 4.5949, + "step": 38224 + }, + { + "epoch": 0.2273349034161195, + "grad_norm": 1.6352847814559937, + "learning_rate": 4.3890847065266564e-05, + "loss": 4.8217, + "step": 38225 + }, + { + "epoch": 0.22734085069940052, + "grad_norm": 2.0873641967773438, + "learning_rate": 4.389054111512506e-05, + "loss": 4.0475, + "step": 38226 + }, + { + "epoch": 0.2273467979826815, + "grad_norm": 2.021920919418335, + "learning_rate": 4.389023515838906e-05, + "loss": 4.086, + "step": 38227 + }, + { + "epoch": 0.2273527452659625, + "grad_norm": 2.1906380653381348, + "learning_rate": 4.388992919505868e-05, + "loss": 3.4877, + "step": 38228 + }, + { + "epoch": 0.2273586925492435, + "grad_norm": 2.515861749649048, + "learning_rate": 4.3889623225134016e-05, + "loss": 3.6833, + "step": 38229 + }, + { + "epoch": 0.2273646398325245, + "grad_norm": 2.3089938163757324, + "learning_rate": 4.388931724861518e-05, + "loss": 3.4421, + "step": 38230 + }, + { + "epoch": 0.2273705871158055, + "grad_norm": 2.258147716522217, + "learning_rate": 4.388901126550228e-05, + "loss": 3.6366, + "step": 38231 + }, + { + "epoch": 0.2273765343990865, + "grad_norm": 1.763493299484253, + "learning_rate": 4.388870527579542e-05, + "loss": 4.0498, + "step": 38232 + }, + { + "epoch": 0.2273824816823675, + "grad_norm": 2.237896680831909, + "learning_rate": 4.3888399279494705e-05, + "loss": 3.534, + "step": 38233 + }, + { + "epoch": 0.22738842896564848, + "grad_norm": 2.1409060955047607, + "learning_rate": 4.3888093276600254e-05, + "loss": 3.4942, + "step": 38234 + }, + { + "epoch": 0.2273943762489295, + "grad_norm": 2.0923609733581543, + "learning_rate": 4.388778726711216e-05, + "loss": 3.5556, + "step": 38235 + }, + { + "epoch": 0.22740032353221049, + "grad_norm": 2.1950254440307617, + "learning_rate": 4.3887481251030524e-05, + "loss": 3.5715, + "step": 38236 + }, + { + "epoch": 0.22740627081549147, + "grad_norm": 2.0914371013641357, + "learning_rate": 4.388717522835548e-05, + "loss": 3.6515, + "step": 38237 + }, + { + "epoch": 0.2274122180987725, + "grad_norm": 2.1977272033691406, + "learning_rate": 4.388686919908711e-05, + "loss": 3.7544, + "step": 38238 + }, + { + "epoch": 0.22741816538205348, + "grad_norm": 2.4383692741394043, + "learning_rate": 4.388656316322553e-05, + "loss": 3.569, + "step": 38239 + }, + { + "epoch": 0.22742411266533447, + "grad_norm": 2.297513484954834, + "learning_rate": 4.388625712077085e-05, + "loss": 3.6434, + "step": 38240 + }, + { + "epoch": 0.22743005994861548, + "grad_norm": 2.3670594692230225, + "learning_rate": 4.388595107172317e-05, + "loss": 3.7064, + "step": 38241 + }, + { + "epoch": 0.22743600723189647, + "grad_norm": 1.7966325283050537, + "learning_rate": 4.3885645016082596e-05, + "loss": 4.2793, + "step": 38242 + }, + { + "epoch": 0.22744195451517746, + "grad_norm": 1.7755250930786133, + "learning_rate": 4.3885338953849245e-05, + "loss": 5.1079, + "step": 38243 + }, + { + "epoch": 0.22744790179845847, + "grad_norm": 1.92861008644104, + "learning_rate": 4.388503288502321e-05, + "loss": 3.8551, + "step": 38244 + }, + { + "epoch": 0.22745384908173946, + "grad_norm": 1.834679365158081, + "learning_rate": 4.38847268096046e-05, + "loss": 4.9659, + "step": 38245 + }, + { + "epoch": 0.22745979636502045, + "grad_norm": 2.094355821609497, + "learning_rate": 4.388442072759355e-05, + "loss": 4.5049, + "step": 38246 + }, + { + "epoch": 0.22746574364830147, + "grad_norm": 1.6657506227493286, + "learning_rate": 4.388411463899012e-05, + "loss": 4.6536, + "step": 38247 + }, + { + "epoch": 0.22747169093158245, + "grad_norm": 2.0281357765197754, + "learning_rate": 4.388380854379445e-05, + "loss": 4.9256, + "step": 38248 + }, + { + "epoch": 0.22747763821486344, + "grad_norm": 1.6166311502456665, + "learning_rate": 4.388350244200664e-05, + "loss": 4.7357, + "step": 38249 + }, + { + "epoch": 0.22748358549814446, + "grad_norm": 1.9163943529129028, + "learning_rate": 4.38831963336268e-05, + "loss": 4.3306, + "step": 38250 + }, + { + "epoch": 0.22748953278142545, + "grad_norm": 1.8184092044830322, + "learning_rate": 4.3882890218655015e-05, + "loss": 4.4076, + "step": 38251 + }, + { + "epoch": 0.22749548006470643, + "grad_norm": 1.776785135269165, + "learning_rate": 4.388258409709142e-05, + "loss": 5.2, + "step": 38252 + }, + { + "epoch": 0.22750142734798745, + "grad_norm": 1.422700047492981, + "learning_rate": 4.38822779689361e-05, + "loss": 5.3973, + "step": 38253 + }, + { + "epoch": 0.22750737463126844, + "grad_norm": 1.5290848016738892, + "learning_rate": 4.388197183418918e-05, + "loss": 4.6603, + "step": 38254 + }, + { + "epoch": 0.22751332191454943, + "grad_norm": 2.919466972351074, + "learning_rate": 4.388166569285076e-05, + "loss": 4.2282, + "step": 38255 + }, + { + "epoch": 0.22751926919783044, + "grad_norm": 1.4154354333877563, + "learning_rate": 4.3881359544920945e-05, + "loss": 4.9397, + "step": 38256 + }, + { + "epoch": 0.22752521648111143, + "grad_norm": 1.3635774850845337, + "learning_rate": 4.388105339039984e-05, + "loss": 5.5165, + "step": 38257 + }, + { + "epoch": 0.22753116376439242, + "grad_norm": 1.314709186553955, + "learning_rate": 4.388074722928755e-05, + "loss": 5.652, + "step": 38258 + }, + { + "epoch": 0.22753711104767343, + "grad_norm": 1.4117316007614136, + "learning_rate": 4.3880441061584194e-05, + "loss": 5.5875, + "step": 38259 + }, + { + "epoch": 0.22754305833095442, + "grad_norm": 1.605210304260254, + "learning_rate": 4.388013488728986e-05, + "loss": 4.96, + "step": 38260 + }, + { + "epoch": 0.2275490056142354, + "grad_norm": 1.8368546962738037, + "learning_rate": 4.387982870640468e-05, + "loss": 4.722, + "step": 38261 + }, + { + "epoch": 0.22755495289751643, + "grad_norm": 1.5915447473526, + "learning_rate": 4.387952251892874e-05, + "loss": 4.7962, + "step": 38262 + }, + { + "epoch": 0.22756090018079742, + "grad_norm": 1.6231772899627686, + "learning_rate": 4.387921632486215e-05, + "loss": 4.1642, + "step": 38263 + }, + { + "epoch": 0.2275668474640784, + "grad_norm": 1.6547075510025024, + "learning_rate": 4.3878910124205034e-05, + "loss": 4.8041, + "step": 38264 + }, + { + "epoch": 0.22757279474735942, + "grad_norm": 2.497774600982666, + "learning_rate": 4.3878603916957474e-05, + "loss": 4.0578, + "step": 38265 + }, + { + "epoch": 0.2275787420306404, + "grad_norm": 5.403087139129639, + "learning_rate": 4.3878297703119584e-05, + "loss": 2.4185, + "step": 38266 + }, + { + "epoch": 0.2275846893139214, + "grad_norm": 2.073349952697754, + "learning_rate": 4.387799148269148e-05, + "loss": 3.2784, + "step": 38267 + }, + { + "epoch": 0.2275906365972024, + "grad_norm": 2.289799451828003, + "learning_rate": 4.387768525567327e-05, + "loss": 4.2342, + "step": 38268 + }, + { + "epoch": 0.2275965838804834, + "grad_norm": 2.4968581199645996, + "learning_rate": 4.3877379022065056e-05, + "loss": 4.2145, + "step": 38269 + }, + { + "epoch": 0.2276025311637644, + "grad_norm": 2.31097674369812, + "learning_rate": 4.387707278186693e-05, + "loss": 4.2879, + "step": 38270 + }, + { + "epoch": 0.22760847844704538, + "grad_norm": 3.108427047729492, + "learning_rate": 4.387676653507903e-05, + "loss": 2.3614, + "step": 38271 + }, + { + "epoch": 0.2276144257303264, + "grad_norm": 2.122161388397217, + "learning_rate": 4.387646028170144e-05, + "loss": 3.6171, + "step": 38272 + }, + { + "epoch": 0.22762037301360738, + "grad_norm": 1.9830756187438965, + "learning_rate": 4.387615402173427e-05, + "loss": 3.752, + "step": 38273 + }, + { + "epoch": 0.22762632029688837, + "grad_norm": 2.001715660095215, + "learning_rate": 4.387584775517763e-05, + "loss": 3.7583, + "step": 38274 + }, + { + "epoch": 0.22763226758016938, + "grad_norm": 2.0779619216918945, + "learning_rate": 4.3875541482031626e-05, + "loss": 3.8147, + "step": 38275 + }, + { + "epoch": 0.22763821486345037, + "grad_norm": 1.901292324066162, + "learning_rate": 4.387523520229637e-05, + "loss": 3.9703, + "step": 38276 + }, + { + "epoch": 0.22764416214673136, + "grad_norm": 1.7687041759490967, + "learning_rate": 4.3874928915971955e-05, + "loss": 5.009, + "step": 38277 + }, + { + "epoch": 0.22765010943001238, + "grad_norm": 1.8333301544189453, + "learning_rate": 4.3874622623058505e-05, + "loss": 3.7311, + "step": 38278 + }, + { + "epoch": 0.22765605671329336, + "grad_norm": 2.1521081924438477, + "learning_rate": 4.387431632355612e-05, + "loss": 3.5839, + "step": 38279 + }, + { + "epoch": 0.22766200399657435, + "grad_norm": 1.766525387763977, + "learning_rate": 4.38740100174649e-05, + "loss": 3.9625, + "step": 38280 + }, + { + "epoch": 0.22766795127985537, + "grad_norm": 1.5381993055343628, + "learning_rate": 4.3873703704784966e-05, + "loss": 4.7761, + "step": 38281 + }, + { + "epoch": 0.22767389856313636, + "grad_norm": 1.351083517074585, + "learning_rate": 4.387339738551641e-05, + "loss": 5.2671, + "step": 38282 + }, + { + "epoch": 0.22767984584641734, + "grad_norm": 1.9041895866394043, + "learning_rate": 4.387309105965935e-05, + "loss": 4.7621, + "step": 38283 + }, + { + "epoch": 0.22768579312969836, + "grad_norm": 1.7635126113891602, + "learning_rate": 4.387278472721389e-05, + "loss": 4.5413, + "step": 38284 + }, + { + "epoch": 0.22769174041297935, + "grad_norm": 2.1425886154174805, + "learning_rate": 4.3872478388180126e-05, + "loss": 3.5661, + "step": 38285 + }, + { + "epoch": 0.22769768769626034, + "grad_norm": 2.4249305725097656, + "learning_rate": 4.387217204255819e-05, + "loss": 3.2622, + "step": 38286 + }, + { + "epoch": 0.22770363497954135, + "grad_norm": 2.1183717250823975, + "learning_rate": 4.387186569034816e-05, + "loss": 3.4106, + "step": 38287 + }, + { + "epoch": 0.22770958226282234, + "grad_norm": 2.123342990875244, + "learning_rate": 4.3871559331550166e-05, + "loss": 3.7708, + "step": 38288 + }, + { + "epoch": 0.22771552954610333, + "grad_norm": 1.565640926361084, + "learning_rate": 4.38712529661643e-05, + "loss": 4.6837, + "step": 38289 + }, + { + "epoch": 0.22772147682938434, + "grad_norm": 1.516226887702942, + "learning_rate": 4.387094659419068e-05, + "loss": 4.8717, + "step": 38290 + }, + { + "epoch": 0.22772742411266533, + "grad_norm": 1.859891653060913, + "learning_rate": 4.38706402156294e-05, + "loss": 4.4244, + "step": 38291 + }, + { + "epoch": 0.22773337139594632, + "grad_norm": 1.8023730516433716, + "learning_rate": 4.387033383048058e-05, + "loss": 4.3214, + "step": 38292 + }, + { + "epoch": 0.22773931867922734, + "grad_norm": 1.324349045753479, + "learning_rate": 4.387002743874432e-05, + "loss": 4.8791, + "step": 38293 + }, + { + "epoch": 0.22774526596250833, + "grad_norm": 1.505937933921814, + "learning_rate": 4.386972104042073e-05, + "loss": 4.4301, + "step": 38294 + }, + { + "epoch": 0.2277512132457893, + "grad_norm": 1.7081881761550903, + "learning_rate": 4.386941463550992e-05, + "loss": 3.9549, + "step": 38295 + }, + { + "epoch": 0.22775716052907033, + "grad_norm": 1.7600693702697754, + "learning_rate": 4.3869108224011976e-05, + "loss": 3.9081, + "step": 38296 + }, + { + "epoch": 0.22776310781235132, + "grad_norm": 1.8432573080062866, + "learning_rate": 4.386880180592703e-05, + "loss": 3.823, + "step": 38297 + }, + { + "epoch": 0.2277690550956323, + "grad_norm": 1.917293667793274, + "learning_rate": 4.386849538125519e-05, + "loss": 3.6595, + "step": 38298 + }, + { + "epoch": 0.22777500237891332, + "grad_norm": 1.771728754043579, + "learning_rate": 4.386818894999654e-05, + "loss": 3.7419, + "step": 38299 + }, + { + "epoch": 0.2277809496621943, + "grad_norm": 1.9334973096847534, + "learning_rate": 4.3867882512151205e-05, + "loss": 3.6426, + "step": 38300 + }, + { + "epoch": 0.2277868969454753, + "grad_norm": 1.738030195236206, + "learning_rate": 4.386757606771929e-05, + "loss": 3.8732, + "step": 38301 + }, + { + "epoch": 0.2277928442287563, + "grad_norm": 1.9789965152740479, + "learning_rate": 4.38672696167009e-05, + "loss": 3.982, + "step": 38302 + }, + { + "epoch": 0.2277987915120373, + "grad_norm": 1.9081391096115112, + "learning_rate": 4.386696315909613e-05, + "loss": 3.9094, + "step": 38303 + }, + { + "epoch": 0.2278047387953183, + "grad_norm": 2.4843461513519287, + "learning_rate": 4.386665669490511e-05, + "loss": 3.8854, + "step": 38304 + }, + { + "epoch": 0.2278106860785993, + "grad_norm": 1.8012548685073853, + "learning_rate": 4.386635022412793e-05, + "loss": 3.9795, + "step": 38305 + }, + { + "epoch": 0.2278166333618803, + "grad_norm": 2.046868324279785, + "learning_rate": 4.38660437467647e-05, + "loss": 3.9579, + "step": 38306 + }, + { + "epoch": 0.22782258064516128, + "grad_norm": 1.9416875839233398, + "learning_rate": 4.386573726281553e-05, + "loss": 3.8472, + "step": 38307 + }, + { + "epoch": 0.2278285279284423, + "grad_norm": 1.9911282062530518, + "learning_rate": 4.386543077228053e-05, + "loss": 3.7991, + "step": 38308 + }, + { + "epoch": 0.2278344752117233, + "grad_norm": 1.767100214958191, + "learning_rate": 4.3865124275159805e-05, + "loss": 3.9515, + "step": 38309 + }, + { + "epoch": 0.22784042249500427, + "grad_norm": 1.7764270305633545, + "learning_rate": 4.386481777145345e-05, + "loss": 3.8708, + "step": 38310 + }, + { + "epoch": 0.2278463697782853, + "grad_norm": 2.0385255813598633, + "learning_rate": 4.386451126116159e-05, + "loss": 3.8093, + "step": 38311 + }, + { + "epoch": 0.22785231706156628, + "grad_norm": 4.206143379211426, + "learning_rate": 4.3864204744284324e-05, + "loss": 2.8644, + "step": 38312 + }, + { + "epoch": 0.22785826434484727, + "grad_norm": 4.548903465270996, + "learning_rate": 4.3863898220821753e-05, + "loss": 1.5476, + "step": 38313 + }, + { + "epoch": 0.22786421162812828, + "grad_norm": 5.352540969848633, + "learning_rate": 4.3863591690773996e-05, + "loss": 1.7165, + "step": 38314 + }, + { + "epoch": 0.22787015891140927, + "grad_norm": 4.13737154006958, + "learning_rate": 4.3863285154141155e-05, + "loss": 1.6609, + "step": 38315 + }, + { + "epoch": 0.22787610619469026, + "grad_norm": 2.086291790008545, + "learning_rate": 4.3862978610923336e-05, + "loss": 4.7187, + "step": 38316 + }, + { + "epoch": 0.22788205347797127, + "grad_norm": 3.0734682083129883, + "learning_rate": 4.3862672061120637e-05, + "loss": 1.2077, + "step": 38317 + }, + { + "epoch": 0.22788800076125226, + "grad_norm": 3.8052728176116943, + "learning_rate": 4.3862365504733184e-05, + "loss": 1.4626, + "step": 38318 + }, + { + "epoch": 0.22789394804453325, + "grad_norm": 3.8481943607330322, + "learning_rate": 4.3862058941761066e-05, + "loss": 1.015, + "step": 38319 + }, + { + "epoch": 0.22789989532781427, + "grad_norm": 3.951799154281616, + "learning_rate": 4.38617523722044e-05, + "loss": 1.2529, + "step": 38320 + }, + { + "epoch": 0.22790584261109526, + "grad_norm": 3.9880523681640625, + "learning_rate": 4.386144579606329e-05, + "loss": 1.4974, + "step": 38321 + }, + { + "epoch": 0.22791178989437624, + "grad_norm": 3.225381851196289, + "learning_rate": 4.386113921333785e-05, + "loss": 1.3402, + "step": 38322 + }, + { + "epoch": 0.22791773717765726, + "grad_norm": 3.4442830085754395, + "learning_rate": 4.386083262402818e-05, + "loss": 1.7391, + "step": 38323 + }, + { + "epoch": 0.22792368446093825, + "grad_norm": 3.1190624237060547, + "learning_rate": 4.3860526028134385e-05, + "loss": 1.328, + "step": 38324 + }, + { + "epoch": 0.22792963174421924, + "grad_norm": 4.021055698394775, + "learning_rate": 4.3860219425656576e-05, + "loss": 2.5143, + "step": 38325 + }, + { + "epoch": 0.22793557902750025, + "grad_norm": 3.794240951538086, + "learning_rate": 4.385991281659486e-05, + "loss": 2.1306, + "step": 38326 + }, + { + "epoch": 0.22794152631078124, + "grad_norm": 3.4017739295959473, + "learning_rate": 4.385960620094934e-05, + "loss": 1.7635, + "step": 38327 + }, + { + "epoch": 0.22794747359406223, + "grad_norm": 3.104942560195923, + "learning_rate": 4.3859299578720136e-05, + "loss": 1.3225, + "step": 38328 + }, + { + "epoch": 0.22795342087734322, + "grad_norm": 1.854576826095581, + "learning_rate": 4.385899294990734e-05, + "loss": 3.7617, + "step": 38329 + }, + { + "epoch": 0.22795936816062423, + "grad_norm": 1.875930666923523, + "learning_rate": 4.385868631451107e-05, + "loss": 4.7611, + "step": 38330 + }, + { + "epoch": 0.22796531544390522, + "grad_norm": 1.7999051809310913, + "learning_rate": 4.385837967253142e-05, + "loss": 4.874, + "step": 38331 + }, + { + "epoch": 0.2279712627271862, + "grad_norm": 1.803697109222412, + "learning_rate": 4.3858073023968504e-05, + "loss": 4.9271, + "step": 38332 + }, + { + "epoch": 0.22797721001046722, + "grad_norm": 2.002218008041382, + "learning_rate": 4.385776636882243e-05, + "loss": 4.8422, + "step": 38333 + }, + { + "epoch": 0.2279831572937482, + "grad_norm": 1.7723742723464966, + "learning_rate": 4.38574597070933e-05, + "loss": 5.1484, + "step": 38334 + }, + { + "epoch": 0.2279891045770292, + "grad_norm": 1.6561287641525269, + "learning_rate": 4.3857153038781236e-05, + "loss": 5.1151, + "step": 38335 + }, + { + "epoch": 0.22799505186031022, + "grad_norm": 1.9022784233093262, + "learning_rate": 4.3856846363886326e-05, + "loss": 5.3096, + "step": 38336 + }, + { + "epoch": 0.2280009991435912, + "grad_norm": 1.7410986423492432, + "learning_rate": 4.3856539682408695e-05, + "loss": 4.9187, + "step": 38337 + }, + { + "epoch": 0.2280069464268722, + "grad_norm": 1.9505523443222046, + "learning_rate": 4.385623299434843e-05, + "loss": 4.1309, + "step": 38338 + }, + { + "epoch": 0.2280128937101532, + "grad_norm": 2.1584455966949463, + "learning_rate": 4.385592629970566e-05, + "loss": 4.5914, + "step": 38339 + }, + { + "epoch": 0.2280188409934342, + "grad_norm": 1.7182157039642334, + "learning_rate": 4.3855619598480466e-05, + "loss": 4.7145, + "step": 38340 + }, + { + "epoch": 0.22802478827671518, + "grad_norm": 1.7989410161972046, + "learning_rate": 4.3855312890672985e-05, + "loss": 4.6119, + "step": 38341 + }, + { + "epoch": 0.2280307355599962, + "grad_norm": 1.574983835220337, + "learning_rate": 4.385500617628331e-05, + "loss": 4.6344, + "step": 38342 + }, + { + "epoch": 0.2280366828432772, + "grad_norm": 1.7368310689926147, + "learning_rate": 4.385469945531153e-05, + "loss": 4.2213, + "step": 38343 + }, + { + "epoch": 0.22804263012655818, + "grad_norm": 1.7998409271240234, + "learning_rate": 4.385439272775779e-05, + "loss": 4.5357, + "step": 38344 + }, + { + "epoch": 0.2280485774098392, + "grad_norm": 1.6426572799682617, + "learning_rate": 4.385408599362216e-05, + "loss": 4.8873, + "step": 38345 + }, + { + "epoch": 0.22805452469312018, + "grad_norm": 1.6232562065124512, + "learning_rate": 4.385377925290477e-05, + "loss": 4.8186, + "step": 38346 + }, + { + "epoch": 0.22806047197640117, + "grad_norm": 1.5165815353393555, + "learning_rate": 4.385347250560572e-05, + "loss": 4.9799, + "step": 38347 + }, + { + "epoch": 0.22806641925968218, + "grad_norm": 1.58846116065979, + "learning_rate": 4.385316575172512e-05, + "loss": 4.6993, + "step": 38348 + }, + { + "epoch": 0.22807236654296317, + "grad_norm": 2.4419918060302734, + "learning_rate": 4.385285899126307e-05, + "loss": 3.9719, + "step": 38349 + }, + { + "epoch": 0.22807831382624416, + "grad_norm": 2.2855114936828613, + "learning_rate": 4.385255222421968e-05, + "loss": 3.4413, + "step": 38350 + }, + { + "epoch": 0.22808426110952518, + "grad_norm": 2.426321268081665, + "learning_rate": 4.385224545059506e-05, + "loss": 3.5723, + "step": 38351 + }, + { + "epoch": 0.22809020839280617, + "grad_norm": 2.4065475463867188, + "learning_rate": 4.3851938670389325e-05, + "loss": 3.8233, + "step": 38352 + }, + { + "epoch": 0.22809615567608715, + "grad_norm": 1.5618879795074463, + "learning_rate": 4.385163188360256e-05, + "loss": 5.2795, + "step": 38353 + }, + { + "epoch": 0.22810210295936817, + "grad_norm": 2.3342907428741455, + "learning_rate": 4.3851325090234894e-05, + "loss": 3.4984, + "step": 38354 + }, + { + "epoch": 0.22810805024264916, + "grad_norm": 2.280735731124878, + "learning_rate": 4.3851018290286424e-05, + "loss": 3.9731, + "step": 38355 + }, + { + "epoch": 0.22811399752593015, + "grad_norm": 2.4038596153259277, + "learning_rate": 4.3850711483757254e-05, + "loss": 3.4351, + "step": 38356 + }, + { + "epoch": 0.22811994480921116, + "grad_norm": 2.344742774963379, + "learning_rate": 4.3850404670647505e-05, + "loss": 3.4398, + "step": 38357 + }, + { + "epoch": 0.22812589209249215, + "grad_norm": 2.090485095977783, + "learning_rate": 4.385009785095726e-05, + "loss": 3.5157, + "step": 38358 + }, + { + "epoch": 0.22813183937577314, + "grad_norm": 1.652655005455017, + "learning_rate": 4.384979102468666e-05, + "loss": 4.728, + "step": 38359 + }, + { + "epoch": 0.22813778665905415, + "grad_norm": 2.0470733642578125, + "learning_rate": 4.3849484191835775e-05, + "loss": 4.3696, + "step": 38360 + }, + { + "epoch": 0.22814373394233514, + "grad_norm": 1.6509379148483276, + "learning_rate": 4.3849177352404735e-05, + "loss": 5.0528, + "step": 38361 + }, + { + "epoch": 0.22814968122561613, + "grad_norm": 1.8258634805679321, + "learning_rate": 4.3848870506393644e-05, + "loss": 4.995, + "step": 38362 + }, + { + "epoch": 0.22815562850889715, + "grad_norm": 1.6283469200134277, + "learning_rate": 4.384856365380261e-05, + "loss": 4.9513, + "step": 38363 + }, + { + "epoch": 0.22816157579217813, + "grad_norm": 1.6566721200942993, + "learning_rate": 4.3848256794631726e-05, + "loss": 4.6471, + "step": 38364 + }, + { + "epoch": 0.22816752307545912, + "grad_norm": 1.37803316116333, + "learning_rate": 4.3847949928881116e-05, + "loss": 4.7292, + "step": 38365 + }, + { + "epoch": 0.22817347035874014, + "grad_norm": 1.688583254814148, + "learning_rate": 4.384764305655089e-05, + "loss": 4.9555, + "step": 38366 + }, + { + "epoch": 0.22817941764202113, + "grad_norm": 1.525038480758667, + "learning_rate": 4.384733617764113e-05, + "loss": 4.4668, + "step": 38367 + }, + { + "epoch": 0.22818536492530211, + "grad_norm": 1.6260348558425903, + "learning_rate": 4.384702929215198e-05, + "loss": 4.3515, + "step": 38368 + }, + { + "epoch": 0.22819131220858313, + "grad_norm": 2.6498796939849854, + "learning_rate": 4.3846722400083505e-05, + "loss": 4.6671, + "step": 38369 + }, + { + "epoch": 0.22819725949186412, + "grad_norm": 1.8406044244766235, + "learning_rate": 4.384641550143584e-05, + "loss": 4.8844, + "step": 38370 + }, + { + "epoch": 0.2282032067751451, + "grad_norm": 1.9134811162948608, + "learning_rate": 4.38461085962091e-05, + "loss": 3.916, + "step": 38371 + }, + { + "epoch": 0.22820915405842612, + "grad_norm": 2.211760997772217, + "learning_rate": 4.384580168440337e-05, + "loss": 4.1463, + "step": 38372 + }, + { + "epoch": 0.2282151013417071, + "grad_norm": 1.8475337028503418, + "learning_rate": 4.384549476601876e-05, + "loss": 4.6829, + "step": 38373 + }, + { + "epoch": 0.2282210486249881, + "grad_norm": 1.9539520740509033, + "learning_rate": 4.384518784105539e-05, + "loss": 4.3289, + "step": 38374 + }, + { + "epoch": 0.22822699590826911, + "grad_norm": 1.8555973768234253, + "learning_rate": 4.384488090951335e-05, + "loss": 4.3754, + "step": 38375 + }, + { + "epoch": 0.2282329431915501, + "grad_norm": 1.7652846574783325, + "learning_rate": 4.384457397139277e-05, + "loss": 4.7864, + "step": 38376 + }, + { + "epoch": 0.2282388904748311, + "grad_norm": 1.609988808631897, + "learning_rate": 4.3844267026693734e-05, + "loss": 4.378, + "step": 38377 + }, + { + "epoch": 0.2282448377581121, + "grad_norm": 1.6453213691711426, + "learning_rate": 4.3843960075416364e-05, + "loss": 4.3171, + "step": 38378 + }, + { + "epoch": 0.2282507850413931, + "grad_norm": 1.6950875520706177, + "learning_rate": 4.384365311756076e-05, + "loss": 4.2858, + "step": 38379 + }, + { + "epoch": 0.22825673232467408, + "grad_norm": 1.496107816696167, + "learning_rate": 4.384334615312703e-05, + "loss": 4.5498, + "step": 38380 + }, + { + "epoch": 0.2282626796079551, + "grad_norm": 1.898187279701233, + "learning_rate": 4.384303918211529e-05, + "loss": 4.3318, + "step": 38381 + }, + { + "epoch": 0.2282686268912361, + "grad_norm": 1.9541302919387817, + "learning_rate": 4.3842732204525626e-05, + "loss": 4.3783, + "step": 38382 + }, + { + "epoch": 0.22827457417451708, + "grad_norm": 1.8452259302139282, + "learning_rate": 4.384242522035817e-05, + "loss": 4.1647, + "step": 38383 + }, + { + "epoch": 0.2282805214577981, + "grad_norm": 1.7965500354766846, + "learning_rate": 4.3842118229613015e-05, + "loss": 4.2006, + "step": 38384 + }, + { + "epoch": 0.22828646874107908, + "grad_norm": 1.7789390087127686, + "learning_rate": 4.384181123229027e-05, + "loss": 4.3562, + "step": 38385 + }, + { + "epoch": 0.22829241602436007, + "grad_norm": 1.790845274925232, + "learning_rate": 4.384150422839005e-05, + "loss": 4.236, + "step": 38386 + }, + { + "epoch": 0.22829836330764108, + "grad_norm": 2.035334825515747, + "learning_rate": 4.384119721791244e-05, + "loss": 4.2813, + "step": 38387 + }, + { + "epoch": 0.22830431059092207, + "grad_norm": 1.4204626083374023, + "learning_rate": 4.3840890200857575e-05, + "loss": 4.3353, + "step": 38388 + }, + { + "epoch": 0.22831025787420306, + "grad_norm": 1.9478641748428345, + "learning_rate": 4.3840583177225546e-05, + "loss": 4.2326, + "step": 38389 + }, + { + "epoch": 0.22831620515748405, + "grad_norm": 1.750410556793213, + "learning_rate": 4.384027614701647e-05, + "loss": 4.1916, + "step": 38390 + }, + { + "epoch": 0.22832215244076506, + "grad_norm": 2.1178817749023438, + "learning_rate": 4.3839969110230444e-05, + "loss": 4.0682, + "step": 38391 + }, + { + "epoch": 0.22832809972404605, + "grad_norm": 1.7404820919036865, + "learning_rate": 4.3839662066867575e-05, + "loss": 3.9515, + "step": 38392 + }, + { + "epoch": 0.22833404700732704, + "grad_norm": 1.667843222618103, + "learning_rate": 4.383935501692798e-05, + "loss": 4.7055, + "step": 38393 + }, + { + "epoch": 0.22833999429060806, + "grad_norm": 1.9728176593780518, + "learning_rate": 4.383904796041176e-05, + "loss": 5.0967, + "step": 38394 + }, + { + "epoch": 0.22834594157388904, + "grad_norm": 2.2353971004486084, + "learning_rate": 4.383874089731902e-05, + "loss": 4.6252, + "step": 38395 + }, + { + "epoch": 0.22835188885717003, + "grad_norm": 2.1327226161956787, + "learning_rate": 4.383843382764988e-05, + "loss": 4.4956, + "step": 38396 + }, + { + "epoch": 0.22835783614045105, + "grad_norm": 1.6246529817581177, + "learning_rate": 4.3838126751404426e-05, + "loss": 4.8295, + "step": 38397 + }, + { + "epoch": 0.22836378342373204, + "grad_norm": 1.6082868576049805, + "learning_rate": 4.383781966858278e-05, + "loss": 4.799, + "step": 38398 + }, + { + "epoch": 0.22836973070701302, + "grad_norm": 1.6632347106933594, + "learning_rate": 4.383751257918505e-05, + "loss": 4.465, + "step": 38399 + }, + { + "epoch": 0.22837567799029404, + "grad_norm": 1.6943600177764893, + "learning_rate": 4.383720548321133e-05, + "loss": 4.5799, + "step": 38400 + }, + { + "epoch": 0.22838162527357503, + "grad_norm": 1.6693763732910156, + "learning_rate": 4.3836898380661744e-05, + "loss": 4.3767, + "step": 38401 + }, + { + "epoch": 0.22838757255685602, + "grad_norm": 1.7295639514923096, + "learning_rate": 4.383659127153639e-05, + "loss": 4.3434, + "step": 38402 + }, + { + "epoch": 0.22839351984013703, + "grad_norm": 1.5898070335388184, + "learning_rate": 4.3836284155835375e-05, + "loss": 4.6257, + "step": 38403 + }, + { + "epoch": 0.22839946712341802, + "grad_norm": 1.638995885848999, + "learning_rate": 4.3835977033558804e-05, + "loss": 4.3646, + "step": 38404 + }, + { + "epoch": 0.228405414406699, + "grad_norm": 1.7454984188079834, + "learning_rate": 4.38356699047068e-05, + "loss": 4.4236, + "step": 38405 + }, + { + "epoch": 0.22841136168998002, + "grad_norm": 1.4987882375717163, + "learning_rate": 4.383536276927945e-05, + "loss": 4.9411, + "step": 38406 + }, + { + "epoch": 0.228417308973261, + "grad_norm": 1.7034831047058105, + "learning_rate": 4.383505562727687e-05, + "loss": 4.5394, + "step": 38407 + }, + { + "epoch": 0.228423256256542, + "grad_norm": 2.3442165851593018, + "learning_rate": 4.383474847869916e-05, + "loss": 3.9469, + "step": 38408 + }, + { + "epoch": 0.22842920353982302, + "grad_norm": 2.1856658458709717, + "learning_rate": 4.383444132354645e-05, + "loss": 3.8203, + "step": 38409 + }, + { + "epoch": 0.228435150823104, + "grad_norm": 2.810123920440674, + "learning_rate": 4.383413416181882e-05, + "loss": 4.2282, + "step": 38410 + }, + { + "epoch": 0.228441098106385, + "grad_norm": 2.2498672008514404, + "learning_rate": 4.3833826993516384e-05, + "loss": 3.9329, + "step": 38411 + }, + { + "epoch": 0.228447045389666, + "grad_norm": 2.1014022827148438, + "learning_rate": 4.383351981863926e-05, + "loss": 4.0302, + "step": 38412 + }, + { + "epoch": 0.228452992672947, + "grad_norm": 2.276716709136963, + "learning_rate": 4.3833212637187547e-05, + "loss": 4.0824, + "step": 38413 + }, + { + "epoch": 0.22845893995622799, + "grad_norm": 2.0157651901245117, + "learning_rate": 4.383290544916136e-05, + "loss": 4.3826, + "step": 38414 + }, + { + "epoch": 0.228464887239509, + "grad_norm": 1.7391401529312134, + "learning_rate": 4.3832598254560796e-05, + "loss": 4.5819, + "step": 38415 + }, + { + "epoch": 0.22847083452279, + "grad_norm": 1.7038103342056274, + "learning_rate": 4.3832291053385965e-05, + "loss": 3.7208, + "step": 38416 + }, + { + "epoch": 0.22847678180607098, + "grad_norm": 1.7914259433746338, + "learning_rate": 4.383198384563698e-05, + "loss": 3.755, + "step": 38417 + }, + { + "epoch": 0.228482729089352, + "grad_norm": 1.7458772659301758, + "learning_rate": 4.383167663131394e-05, + "loss": 3.7681, + "step": 38418 + }, + { + "epoch": 0.22848867637263298, + "grad_norm": 1.546152949333191, + "learning_rate": 4.383136941041696e-05, + "loss": 3.5506, + "step": 38419 + }, + { + "epoch": 0.22849462365591397, + "grad_norm": 1.6328538656234741, + "learning_rate": 4.383106218294614e-05, + "loss": 3.7039, + "step": 38420 + }, + { + "epoch": 0.22850057093919499, + "grad_norm": 1.8553338050842285, + "learning_rate": 4.383075494890159e-05, + "loss": 4.023, + "step": 38421 + }, + { + "epoch": 0.22850651822247597, + "grad_norm": 2.007420301437378, + "learning_rate": 4.3830447708283416e-05, + "loss": 4.2306, + "step": 38422 + }, + { + "epoch": 0.22851246550575696, + "grad_norm": 1.7648934125900269, + "learning_rate": 4.383014046109173e-05, + "loss": 4.0914, + "step": 38423 + }, + { + "epoch": 0.22851841278903798, + "grad_norm": 1.8605449199676514, + "learning_rate": 4.3829833207326644e-05, + "loss": 4.1892, + "step": 38424 + }, + { + "epoch": 0.22852436007231897, + "grad_norm": 1.7090728282928467, + "learning_rate": 4.382952594698825e-05, + "loss": 4.3818, + "step": 38425 + }, + { + "epoch": 0.22853030735559995, + "grad_norm": 1.8204621076583862, + "learning_rate": 4.382921868007666e-05, + "loss": 4.1313, + "step": 38426 + }, + { + "epoch": 0.22853625463888097, + "grad_norm": 2.0081419944763184, + "learning_rate": 4.382891140659199e-05, + "loss": 4.0119, + "step": 38427 + }, + { + "epoch": 0.22854220192216196, + "grad_norm": 2.096193790435791, + "learning_rate": 4.382860412653434e-05, + "loss": 4.4627, + "step": 38428 + }, + { + "epoch": 0.22854814920544295, + "grad_norm": 2.220578908920288, + "learning_rate": 4.3828296839903814e-05, + "loss": 4.1242, + "step": 38429 + }, + { + "epoch": 0.22855409648872396, + "grad_norm": 1.925628662109375, + "learning_rate": 4.3827989546700534e-05, + "loss": 4.0027, + "step": 38430 + }, + { + "epoch": 0.22856004377200495, + "grad_norm": 1.6829743385314941, + "learning_rate": 4.382768224692459e-05, + "loss": 5.0908, + "step": 38431 + }, + { + "epoch": 0.22856599105528594, + "grad_norm": 1.726646065711975, + "learning_rate": 4.38273749405761e-05, + "loss": 5.75, + "step": 38432 + }, + { + "epoch": 0.22857193833856695, + "grad_norm": 1.7120121717453003, + "learning_rate": 4.3827067627655164e-05, + "loss": 5.6595, + "step": 38433 + }, + { + "epoch": 0.22857788562184794, + "grad_norm": 1.7972990274429321, + "learning_rate": 4.38267603081619e-05, + "loss": 4.8448, + "step": 38434 + }, + { + "epoch": 0.22858383290512893, + "grad_norm": 2.042712450027466, + "learning_rate": 4.38264529820964e-05, + "loss": 4.8918, + "step": 38435 + }, + { + "epoch": 0.22858978018840995, + "grad_norm": 1.766764760017395, + "learning_rate": 4.3826145649458786e-05, + "loss": 4.9263, + "step": 38436 + }, + { + "epoch": 0.22859572747169093, + "grad_norm": 1.5604772567749023, + "learning_rate": 4.3825838310249155e-05, + "loss": 5.3605, + "step": 38437 + }, + { + "epoch": 0.22860167475497192, + "grad_norm": 1.8508424758911133, + "learning_rate": 4.3825530964467626e-05, + "loss": 5.2559, + "step": 38438 + }, + { + "epoch": 0.22860762203825294, + "grad_norm": 1.8071962594985962, + "learning_rate": 4.382522361211429e-05, + "loss": 5.1945, + "step": 38439 + }, + { + "epoch": 0.22861356932153393, + "grad_norm": 1.7534548044204712, + "learning_rate": 4.3824916253189266e-05, + "loss": 4.7507, + "step": 38440 + }, + { + "epoch": 0.22861951660481492, + "grad_norm": 1.6226413249969482, + "learning_rate": 4.3824608887692666e-05, + "loss": 5.1489, + "step": 38441 + }, + { + "epoch": 0.22862546388809593, + "grad_norm": 1.7093009948730469, + "learning_rate": 4.382430151562458e-05, + "loss": 5.6064, + "step": 38442 + }, + { + "epoch": 0.22863141117137692, + "grad_norm": 1.6506770849227905, + "learning_rate": 4.3823994136985126e-05, + "loss": 5.335, + "step": 38443 + }, + { + "epoch": 0.2286373584546579, + "grad_norm": 1.7119227647781372, + "learning_rate": 4.382368675177441e-05, + "loss": 4.4665, + "step": 38444 + }, + { + "epoch": 0.22864330573793892, + "grad_norm": 1.745902419090271, + "learning_rate": 4.382337935999254e-05, + "loss": 4.9238, + "step": 38445 + }, + { + "epoch": 0.2286492530212199, + "grad_norm": 1.7766257524490356, + "learning_rate": 4.382307196163962e-05, + "loss": 5.1864, + "step": 38446 + }, + { + "epoch": 0.2286552003045009, + "grad_norm": 1.6583263874053955, + "learning_rate": 4.382276455671577e-05, + "loss": 5.117, + "step": 38447 + }, + { + "epoch": 0.2286611475877819, + "grad_norm": 1.6775164604187012, + "learning_rate": 4.382245714522107e-05, + "loss": 4.8822, + "step": 38448 + }, + { + "epoch": 0.2286670948710629, + "grad_norm": 1.9329807758331299, + "learning_rate": 4.3822149727155656e-05, + "loss": 4.7758, + "step": 38449 + }, + { + "epoch": 0.2286730421543439, + "grad_norm": 1.7038854360580444, + "learning_rate": 4.382184230251963e-05, + "loss": 4.8293, + "step": 38450 + }, + { + "epoch": 0.22867898943762488, + "grad_norm": 1.7462607622146606, + "learning_rate": 4.3821534871313086e-05, + "loss": 4.6184, + "step": 38451 + }, + { + "epoch": 0.2286849367209059, + "grad_norm": 1.8756234645843506, + "learning_rate": 4.3821227433536135e-05, + "loss": 5.0585, + "step": 38452 + }, + { + "epoch": 0.22869088400418688, + "grad_norm": 1.8120574951171875, + "learning_rate": 4.382091998918889e-05, + "loss": 4.9257, + "step": 38453 + }, + { + "epoch": 0.22869683128746787, + "grad_norm": 1.696677327156067, + "learning_rate": 4.3820612538271465e-05, + "loss": 4.8921, + "step": 38454 + }, + { + "epoch": 0.2287027785707489, + "grad_norm": 1.588819980621338, + "learning_rate": 4.382030508078395e-05, + "loss": 5.5639, + "step": 38455 + }, + { + "epoch": 0.22870872585402988, + "grad_norm": 2.115058660507202, + "learning_rate": 4.381999761672646e-05, + "loss": 4.3399, + "step": 38456 + }, + { + "epoch": 0.22871467313731086, + "grad_norm": 3.0161054134368896, + "learning_rate": 4.3819690146099105e-05, + "loss": 2.4075, + "step": 38457 + }, + { + "epoch": 0.22872062042059188, + "grad_norm": 3.0567798614501953, + "learning_rate": 4.3819382668902e-05, + "loss": 2.6078, + "step": 38458 + }, + { + "epoch": 0.22872656770387287, + "grad_norm": 3.0818064212799072, + "learning_rate": 4.3819075185135226e-05, + "loss": 2.2381, + "step": 38459 + }, + { + "epoch": 0.22873251498715386, + "grad_norm": 2.7542152404785156, + "learning_rate": 4.381876769479892e-05, + "loss": 2.6451, + "step": 38460 + }, + { + "epoch": 0.22873846227043487, + "grad_norm": 2.485642671585083, + "learning_rate": 4.381846019789317e-05, + "loss": 3.3389, + "step": 38461 + }, + { + "epoch": 0.22874440955371586, + "grad_norm": 2.538452625274658, + "learning_rate": 4.381815269441809e-05, + "loss": 4.7806, + "step": 38462 + }, + { + "epoch": 0.22875035683699685, + "grad_norm": 2.311481475830078, + "learning_rate": 4.381784518437379e-05, + "loss": 4.7911, + "step": 38463 + }, + { + "epoch": 0.22875630412027786, + "grad_norm": 2.447681188583374, + "learning_rate": 4.3817537667760375e-05, + "loss": 3.5121, + "step": 38464 + }, + { + "epoch": 0.22876225140355885, + "grad_norm": 2.6825411319732666, + "learning_rate": 4.381723014457795e-05, + "loss": 3.9006, + "step": 38465 + }, + { + "epoch": 0.22876819868683984, + "grad_norm": 2.31955885887146, + "learning_rate": 4.381692261482663e-05, + "loss": 4.0621, + "step": 38466 + }, + { + "epoch": 0.22877414597012086, + "grad_norm": 2.573988199234009, + "learning_rate": 4.3816615078506514e-05, + "loss": 4.0674, + "step": 38467 + }, + { + "epoch": 0.22878009325340184, + "grad_norm": 2.437645673751831, + "learning_rate": 4.381630753561771e-05, + "loss": 3.8, + "step": 38468 + }, + { + "epoch": 0.22878604053668283, + "grad_norm": 1.8713231086730957, + "learning_rate": 4.381599998616033e-05, + "loss": 4.5789, + "step": 38469 + }, + { + "epoch": 0.22879198781996385, + "grad_norm": 1.6233890056610107, + "learning_rate": 4.381569243013448e-05, + "loss": 4.5076, + "step": 38470 + }, + { + "epoch": 0.22879793510324484, + "grad_norm": 1.7696006298065186, + "learning_rate": 4.3815384867540255e-05, + "loss": 4.2695, + "step": 38471 + }, + { + "epoch": 0.22880388238652583, + "grad_norm": 1.7355118989944458, + "learning_rate": 4.381507729837778e-05, + "loss": 4.5847, + "step": 38472 + }, + { + "epoch": 0.22880982966980684, + "grad_norm": 1.544387936592102, + "learning_rate": 4.381476972264716e-05, + "loss": 4.9015, + "step": 38473 + }, + { + "epoch": 0.22881577695308783, + "grad_norm": 1.5550681352615356, + "learning_rate": 4.38144621403485e-05, + "loss": 4.6908, + "step": 38474 + }, + { + "epoch": 0.22882172423636882, + "grad_norm": 1.700878620147705, + "learning_rate": 4.38141545514819e-05, + "loss": 4.7589, + "step": 38475 + }, + { + "epoch": 0.22882767151964983, + "grad_norm": 2.258042812347412, + "learning_rate": 4.381384695604748e-05, + "loss": 4.6368, + "step": 38476 + }, + { + "epoch": 0.22883361880293082, + "grad_norm": 2.191384792327881, + "learning_rate": 4.381353935404533e-05, + "loss": 4.666, + "step": 38477 + }, + { + "epoch": 0.2288395660862118, + "grad_norm": 2.2790069580078125, + "learning_rate": 4.381323174547557e-05, + "loss": 4.7982, + "step": 38478 + }, + { + "epoch": 0.22884551336949283, + "grad_norm": 2.4165420532226562, + "learning_rate": 4.381292413033831e-05, + "loss": 4.8295, + "step": 38479 + }, + { + "epoch": 0.2288514606527738, + "grad_norm": 2.2812304496765137, + "learning_rate": 4.3812616508633654e-05, + "loss": 4.6471, + "step": 38480 + }, + { + "epoch": 0.2288574079360548, + "grad_norm": 2.199652671813965, + "learning_rate": 4.38123088803617e-05, + "loss": 4.7415, + "step": 38481 + }, + { + "epoch": 0.22886335521933582, + "grad_norm": 2.2370831966400146, + "learning_rate": 4.381200124552257e-05, + "loss": 4.6689, + "step": 38482 + }, + { + "epoch": 0.2288693025026168, + "grad_norm": 2.2006328105926514, + "learning_rate": 4.381169360411636e-05, + "loss": 4.7032, + "step": 38483 + }, + { + "epoch": 0.2288752497858978, + "grad_norm": 2.2597954273223877, + "learning_rate": 4.381138595614318e-05, + "loss": 4.6799, + "step": 38484 + }, + { + "epoch": 0.2288811970691788, + "grad_norm": 1.9300129413604736, + "learning_rate": 4.381107830160315e-05, + "loss": 4.8595, + "step": 38485 + }, + { + "epoch": 0.2288871443524598, + "grad_norm": 1.6042248010635376, + "learning_rate": 4.381077064049636e-05, + "loss": 4.8479, + "step": 38486 + }, + { + "epoch": 0.2288930916357408, + "grad_norm": 1.685781478881836, + "learning_rate": 4.381046297282292e-05, + "loss": 4.9784, + "step": 38487 + }, + { + "epoch": 0.2288990389190218, + "grad_norm": 2.3685178756713867, + "learning_rate": 4.3810155298582956e-05, + "loss": 2.661, + "step": 38488 + }, + { + "epoch": 0.2289049862023028, + "grad_norm": 2.790250301361084, + "learning_rate": 4.3809847617776545e-05, + "loss": 1.3592, + "step": 38489 + }, + { + "epoch": 0.22891093348558378, + "grad_norm": 2.53934645652771, + "learning_rate": 4.380953993040382e-05, + "loss": 1.2262, + "step": 38490 + }, + { + "epoch": 0.2289168807688648, + "grad_norm": 2.5149827003479004, + "learning_rate": 4.3809232236464875e-05, + "loss": 1.2036, + "step": 38491 + }, + { + "epoch": 0.22892282805214578, + "grad_norm": 2.73903489112854, + "learning_rate": 4.3808924535959825e-05, + "loss": 1.3444, + "step": 38492 + }, + { + "epoch": 0.22892877533542677, + "grad_norm": 2.8913257122039795, + "learning_rate": 4.380861682888877e-05, + "loss": 1.6929, + "step": 38493 + }, + { + "epoch": 0.2289347226187078, + "grad_norm": 2.765486717224121, + "learning_rate": 4.380830911525182e-05, + "loss": 1.1883, + "step": 38494 + }, + { + "epoch": 0.22894066990198877, + "grad_norm": 3.039799928665161, + "learning_rate": 4.380800139504909e-05, + "loss": 1.2903, + "step": 38495 + }, + { + "epoch": 0.22894661718526976, + "grad_norm": 2.9481117725372314, + "learning_rate": 4.3807693668280676e-05, + "loss": 1.3223, + "step": 38496 + }, + { + "epoch": 0.22895256446855078, + "grad_norm": 2.532102346420288, + "learning_rate": 4.3807385934946696e-05, + "loss": 0.8985, + "step": 38497 + }, + { + "epoch": 0.22895851175183177, + "grad_norm": 3.0393545627593994, + "learning_rate": 4.380707819504725e-05, + "loss": 1.4591, + "step": 38498 + }, + { + "epoch": 0.22896445903511276, + "grad_norm": 3.1826932430267334, + "learning_rate": 4.3806770448582435e-05, + "loss": 1.106, + "step": 38499 + }, + { + "epoch": 0.22897040631839377, + "grad_norm": 3.0488319396972656, + "learning_rate": 4.3806462695552385e-05, + "loss": 1.0806, + "step": 38500 + }, + { + "epoch": 0.22897635360167476, + "grad_norm": 2.639357566833496, + "learning_rate": 4.380615493595719e-05, + "loss": 1.4651, + "step": 38501 + }, + { + "epoch": 0.22898230088495575, + "grad_norm": 1.8026386499404907, + "learning_rate": 4.380584716979696e-05, + "loss": 5.0124, + "step": 38502 + }, + { + "epoch": 0.22898824816823676, + "grad_norm": 2.951977014541626, + "learning_rate": 4.3805539397071806e-05, + "loss": 3.8272, + "step": 38503 + }, + { + "epoch": 0.22899419545151775, + "grad_norm": 2.755704402923584, + "learning_rate": 4.3805231617781823e-05, + "loss": 3.3427, + "step": 38504 + }, + { + "epoch": 0.22900014273479874, + "grad_norm": 2.2187037467956543, + "learning_rate": 4.380492383192714e-05, + "loss": 3.7645, + "step": 38505 + }, + { + "epoch": 0.22900609001807973, + "grad_norm": 1.8150204420089722, + "learning_rate": 4.380461603950784e-05, + "loss": 5.5314, + "step": 38506 + }, + { + "epoch": 0.22901203730136074, + "grad_norm": 2.308591604232788, + "learning_rate": 4.380430824052405e-05, + "loss": 3.7683, + "step": 38507 + }, + { + "epoch": 0.22901798458464173, + "grad_norm": 2.0516018867492676, + "learning_rate": 4.380400043497587e-05, + "loss": 3.6923, + "step": 38508 + }, + { + "epoch": 0.22902393186792272, + "grad_norm": 1.8416396379470825, + "learning_rate": 4.380369262286341e-05, + "loss": 4.9084, + "step": 38509 + }, + { + "epoch": 0.22902987915120374, + "grad_norm": 1.7685813903808594, + "learning_rate": 4.380338480418677e-05, + "loss": 4.8024, + "step": 38510 + }, + { + "epoch": 0.22903582643448472, + "grad_norm": 2.066200017929077, + "learning_rate": 4.3803076978946066e-05, + "loss": 3.3572, + "step": 38511 + }, + { + "epoch": 0.2290417737177657, + "grad_norm": 6.249925136566162, + "learning_rate": 4.3802769147141395e-05, + "loss": 2.0614, + "step": 38512 + }, + { + "epoch": 0.22904772100104673, + "grad_norm": 3.487518310546875, + "learning_rate": 4.380246130877288e-05, + "loss": 1.6695, + "step": 38513 + }, + { + "epoch": 0.22905366828432772, + "grad_norm": 2.93048095703125, + "learning_rate": 4.3802153463840626e-05, + "loss": 1.3208, + "step": 38514 + }, + { + "epoch": 0.2290596155676087, + "grad_norm": 2.9948956966400146, + "learning_rate": 4.3801845612344716e-05, + "loss": 1.256, + "step": 38515 + }, + { + "epoch": 0.22906556285088972, + "grad_norm": 2.7932517528533936, + "learning_rate": 4.380153775428529e-05, + "loss": 2.3507, + "step": 38516 + }, + { + "epoch": 0.2290715101341707, + "grad_norm": 2.4867453575134277, + "learning_rate": 4.380122988966244e-05, + "loss": 2.8348, + "step": 38517 + }, + { + "epoch": 0.2290774574174517, + "grad_norm": 2.4838972091674805, + "learning_rate": 4.380092201847627e-05, + "loss": 2.7133, + "step": 38518 + }, + { + "epoch": 0.2290834047007327, + "grad_norm": 2.3084802627563477, + "learning_rate": 4.3800614140726894e-05, + "loss": 2.9998, + "step": 38519 + }, + { + "epoch": 0.2290893519840137, + "grad_norm": 2.5051803588867188, + "learning_rate": 4.3800306256414415e-05, + "loss": 2.8024, + "step": 38520 + }, + { + "epoch": 0.2290952992672947, + "grad_norm": 2.5238940715789795, + "learning_rate": 4.3799998365538954e-05, + "loss": 2.722, + "step": 38521 + }, + { + "epoch": 0.2291012465505757, + "grad_norm": 2.2784957885742188, + "learning_rate": 4.3799690468100594e-05, + "loss": 3.4847, + "step": 38522 + }, + { + "epoch": 0.2291071938338567, + "grad_norm": 1.9627010822296143, + "learning_rate": 4.3799382564099464e-05, + "loss": 5.611, + "step": 38523 + }, + { + "epoch": 0.22911314111713768, + "grad_norm": 6.111119747161865, + "learning_rate": 4.3799074653535664e-05, + "loss": 3.582, + "step": 38524 + }, + { + "epoch": 0.2291190884004187, + "grad_norm": 7.286647319793701, + "learning_rate": 4.3798766736409304e-05, + "loss": 3.8205, + "step": 38525 + }, + { + "epoch": 0.22912503568369968, + "grad_norm": 4.92535400390625, + "learning_rate": 4.379845881272048e-05, + "loss": 3.4899, + "step": 38526 + }, + { + "epoch": 0.22913098296698067, + "grad_norm": 4.405367851257324, + "learning_rate": 4.3798150882469316e-05, + "loss": 3.1935, + "step": 38527 + }, + { + "epoch": 0.2291369302502617, + "grad_norm": 2.9681317806243896, + "learning_rate": 4.379784294565591e-05, + "loss": 3.5363, + "step": 38528 + }, + { + "epoch": 0.22914287753354268, + "grad_norm": 2.0128326416015625, + "learning_rate": 4.3797535002280366e-05, + "loss": 5.0034, + "step": 38529 + }, + { + "epoch": 0.22914882481682367, + "grad_norm": 1.7648944854736328, + "learning_rate": 4.37972270523428e-05, + "loss": 5.3278, + "step": 38530 + }, + { + "epoch": 0.22915477210010468, + "grad_norm": 1.7553101778030396, + "learning_rate": 4.379691909584332e-05, + "loss": 5.2114, + "step": 38531 + }, + { + "epoch": 0.22916071938338567, + "grad_norm": 1.5683118104934692, + "learning_rate": 4.379661113278203e-05, + "loss": 5.0773, + "step": 38532 + }, + { + "epoch": 0.22916666666666666, + "grad_norm": 1.6441692113876343, + "learning_rate": 4.3796303163159025e-05, + "loss": 5.1614, + "step": 38533 + }, + { + "epoch": 0.22917261394994767, + "grad_norm": 1.4447158575057983, + "learning_rate": 4.379599518697444e-05, + "loss": 5.0567, + "step": 38534 + }, + { + "epoch": 0.22917856123322866, + "grad_norm": 1.6365786790847778, + "learning_rate": 4.379568720422836e-05, + "loss": 5.0077, + "step": 38535 + }, + { + "epoch": 0.22918450851650965, + "grad_norm": 1.7679840326309204, + "learning_rate": 4.3795379214920895e-05, + "loss": 5.3002, + "step": 38536 + }, + { + "epoch": 0.22919045579979067, + "grad_norm": 1.586530327796936, + "learning_rate": 4.379507121905217e-05, + "loss": 5.0253, + "step": 38537 + }, + { + "epoch": 0.22919640308307165, + "grad_norm": 1.5012983083724976, + "learning_rate": 4.379476321662227e-05, + "loss": 5.1246, + "step": 38538 + }, + { + "epoch": 0.22920235036635264, + "grad_norm": 1.649339199066162, + "learning_rate": 4.3794455207631315e-05, + "loss": 5.0695, + "step": 38539 + }, + { + "epoch": 0.22920829764963366, + "grad_norm": 1.5892829895019531, + "learning_rate": 4.379414719207941e-05, + "loss": 4.7051, + "step": 38540 + }, + { + "epoch": 0.22921424493291465, + "grad_norm": 1.3112465143203735, + "learning_rate": 4.379383916996666e-05, + "loss": 5.3511, + "step": 38541 + }, + { + "epoch": 0.22922019221619563, + "grad_norm": 1.2667590379714966, + "learning_rate": 4.3793531141293185e-05, + "loss": 5.3804, + "step": 38542 + }, + { + "epoch": 0.22922613949947665, + "grad_norm": 2.032801389694214, + "learning_rate": 4.3793223106059064e-05, + "loss": 5.1612, + "step": 38543 + }, + { + "epoch": 0.22923208678275764, + "grad_norm": 1.8813502788543701, + "learning_rate": 4.379291506426444e-05, + "loss": 5.1347, + "step": 38544 + }, + { + "epoch": 0.22923803406603863, + "grad_norm": 1.8237574100494385, + "learning_rate": 4.379260701590939e-05, + "loss": 4.978, + "step": 38545 + }, + { + "epoch": 0.22924398134931964, + "grad_norm": 1.621106743812561, + "learning_rate": 4.3792298960994046e-05, + "loss": 4.9011, + "step": 38546 + }, + { + "epoch": 0.22924992863260063, + "grad_norm": 1.3890458345413208, + "learning_rate": 4.3791990899518506e-05, + "loss": 4.8795, + "step": 38547 + }, + { + "epoch": 0.22925587591588162, + "grad_norm": 1.5003517866134644, + "learning_rate": 4.379168283148287e-05, + "loss": 4.8412, + "step": 38548 + }, + { + "epoch": 0.22926182319916263, + "grad_norm": 1.7542293071746826, + "learning_rate": 4.379137475688725e-05, + "loss": 4.6803, + "step": 38549 + }, + { + "epoch": 0.22926777048244362, + "grad_norm": 1.7646663188934326, + "learning_rate": 4.379106667573176e-05, + "loss": 5.1501, + "step": 38550 + }, + { + "epoch": 0.2292737177657246, + "grad_norm": 2.451752185821533, + "learning_rate": 4.37907585880165e-05, + "loss": 4.2773, + "step": 38551 + }, + { + "epoch": 0.22927966504900563, + "grad_norm": 2.2721564769744873, + "learning_rate": 4.379045049374158e-05, + "loss": 4.1409, + "step": 38552 + }, + { + "epoch": 0.22928561233228661, + "grad_norm": 2.1969313621520996, + "learning_rate": 4.379014239290711e-05, + "loss": 4.303, + "step": 38553 + }, + { + "epoch": 0.2292915596155676, + "grad_norm": 2.17642879486084, + "learning_rate": 4.3789834285513195e-05, + "loss": 4.8686, + "step": 38554 + }, + { + "epoch": 0.22929750689884862, + "grad_norm": 1.5282912254333496, + "learning_rate": 4.378952617155994e-05, + "loss": 5.4506, + "step": 38555 + }, + { + "epoch": 0.2293034541821296, + "grad_norm": 1.6670398712158203, + "learning_rate": 4.378921805104746e-05, + "loss": 5.2819, + "step": 38556 + }, + { + "epoch": 0.2293094014654106, + "grad_norm": 1.834071159362793, + "learning_rate": 4.378890992397585e-05, + "loss": 4.9508, + "step": 38557 + }, + { + "epoch": 0.2293153487486916, + "grad_norm": 2.6882967948913574, + "learning_rate": 4.378860179034524e-05, + "loss": 4.546, + "step": 38558 + }, + { + "epoch": 0.2293212960319726, + "grad_norm": 2.468848705291748, + "learning_rate": 4.378829365015571e-05, + "loss": 4.6806, + "step": 38559 + }, + { + "epoch": 0.2293272433152536, + "grad_norm": 1.7329350709915161, + "learning_rate": 4.378798550340739e-05, + "loss": 4.4552, + "step": 38560 + }, + { + "epoch": 0.2293331905985346, + "grad_norm": 1.4378310441970825, + "learning_rate": 4.378767735010037e-05, + "loss": 5.2996, + "step": 38561 + }, + { + "epoch": 0.2293391378818156, + "grad_norm": 1.4221470355987549, + "learning_rate": 4.3787369190234766e-05, + "loss": 5.2644, + "step": 38562 + }, + { + "epoch": 0.22934508516509658, + "grad_norm": 1.4187413454055786, + "learning_rate": 4.378706102381069e-05, + "loss": 5.3766, + "step": 38563 + }, + { + "epoch": 0.22935103244837757, + "grad_norm": 2.0169026851654053, + "learning_rate": 4.378675285082825e-05, + "loss": 4.565, + "step": 38564 + }, + { + "epoch": 0.22935697973165858, + "grad_norm": 1.4599847793579102, + "learning_rate": 4.378644467128754e-05, + "loss": 5.1849, + "step": 38565 + }, + { + "epoch": 0.22936292701493957, + "grad_norm": 1.2419488430023193, + "learning_rate": 4.378613648518868e-05, + "loss": 4.9225, + "step": 38566 + }, + { + "epoch": 0.22936887429822056, + "grad_norm": 1.7379957437515259, + "learning_rate": 4.378582829253177e-05, + "loss": 3.9957, + "step": 38567 + }, + { + "epoch": 0.22937482158150158, + "grad_norm": 1.5496753454208374, + "learning_rate": 4.3785520093316926e-05, + "loss": 3.9657, + "step": 38568 + }, + { + "epoch": 0.22938076886478256, + "grad_norm": 1.5953773260116577, + "learning_rate": 4.3785211887544255e-05, + "loss": 4.1013, + "step": 38569 + }, + { + "epoch": 0.22938671614806355, + "grad_norm": 2.0474889278411865, + "learning_rate": 4.378490367521385e-05, + "loss": 3.9464, + "step": 38570 + }, + { + "epoch": 0.22939266343134457, + "grad_norm": 2.4065568447113037, + "learning_rate": 4.378459545632584e-05, + "loss": 3.8675, + "step": 38571 + }, + { + "epoch": 0.22939861071462556, + "grad_norm": 1.6409612894058228, + "learning_rate": 4.378428723088031e-05, + "loss": 5.0453, + "step": 38572 + }, + { + "epoch": 0.22940455799790654, + "grad_norm": 1.7439429759979248, + "learning_rate": 4.3783978998877385e-05, + "loss": 4.9018, + "step": 38573 + }, + { + "epoch": 0.22941050528118756, + "grad_norm": 1.6023813486099243, + "learning_rate": 4.3783670760317174e-05, + "loss": 5.1573, + "step": 38574 + }, + { + "epoch": 0.22941645256446855, + "grad_norm": 1.7142831087112427, + "learning_rate": 4.378336251519977e-05, + "loss": 5.0621, + "step": 38575 + }, + { + "epoch": 0.22942239984774954, + "grad_norm": 1.5479685068130493, + "learning_rate": 4.3783054263525284e-05, + "loss": 4.871, + "step": 38576 + }, + { + "epoch": 0.22942834713103055, + "grad_norm": 1.2989557981491089, + "learning_rate": 4.3782746005293837e-05, + "loss": 4.9569, + "step": 38577 + }, + { + "epoch": 0.22943429441431154, + "grad_norm": 1.3901426792144775, + "learning_rate": 4.378243774050552e-05, + "loss": 5.0305, + "step": 38578 + }, + { + "epoch": 0.22944024169759253, + "grad_norm": 1.2557393312454224, + "learning_rate": 4.378212946916045e-05, + "loss": 5.0187, + "step": 38579 + }, + { + "epoch": 0.22944618898087354, + "grad_norm": 1.7015252113342285, + "learning_rate": 4.378182119125874e-05, + "loss": 4.7364, + "step": 38580 + }, + { + "epoch": 0.22945213626415453, + "grad_norm": 1.5928224325180054, + "learning_rate": 4.378151290680048e-05, + "loss": 4.7833, + "step": 38581 + }, + { + "epoch": 0.22945808354743552, + "grad_norm": 1.5460565090179443, + "learning_rate": 4.3781204615785795e-05, + "loss": 4.8068, + "step": 38582 + }, + { + "epoch": 0.22946403083071654, + "grad_norm": 1.4052801132202148, + "learning_rate": 4.378089631821478e-05, + "loss": 4.8026, + "step": 38583 + }, + { + "epoch": 0.22946997811399752, + "grad_norm": 1.4528011083602905, + "learning_rate": 4.378058801408755e-05, + "loss": 4.8313, + "step": 38584 + }, + { + "epoch": 0.2294759253972785, + "grad_norm": 1.4408035278320312, + "learning_rate": 4.3780279703404214e-05, + "loss": 4.8099, + "step": 38585 + }, + { + "epoch": 0.22948187268055953, + "grad_norm": 1.5796035528182983, + "learning_rate": 4.3779971386164874e-05, + "loss": 4.9001, + "step": 38586 + }, + { + "epoch": 0.22948781996384052, + "grad_norm": 1.4678764343261719, + "learning_rate": 4.377966306236965e-05, + "loss": 4.7479, + "step": 38587 + }, + { + "epoch": 0.2294937672471215, + "grad_norm": 1.553661584854126, + "learning_rate": 4.377935473201862e-05, + "loss": 4.7, + "step": 38588 + }, + { + "epoch": 0.22949971453040252, + "grad_norm": 1.5192108154296875, + "learning_rate": 4.377904639511192e-05, + "loss": 4.7572, + "step": 38589 + }, + { + "epoch": 0.2295056618136835, + "grad_norm": 1.4428250789642334, + "learning_rate": 4.3778738051649656e-05, + "loss": 4.6175, + "step": 38590 + }, + { + "epoch": 0.2295116090969645, + "grad_norm": 1.589508056640625, + "learning_rate": 4.3778429701631915e-05, + "loss": 4.6856, + "step": 38591 + }, + { + "epoch": 0.2295175563802455, + "grad_norm": 1.4473097324371338, + "learning_rate": 4.377812134505883e-05, + "loss": 4.7122, + "step": 38592 + }, + { + "epoch": 0.2295235036635265, + "grad_norm": 1.5862704515457153, + "learning_rate": 4.377781298193049e-05, + "loss": 4.6001, + "step": 38593 + }, + { + "epoch": 0.2295294509468075, + "grad_norm": 1.5695568323135376, + "learning_rate": 4.377750461224702e-05, + "loss": 4.5452, + "step": 38594 + }, + { + "epoch": 0.2295353982300885, + "grad_norm": 1.5204521417617798, + "learning_rate": 4.3777196236008506e-05, + "loss": 4.7466, + "step": 38595 + }, + { + "epoch": 0.2295413455133695, + "grad_norm": 1.4180539846420288, + "learning_rate": 4.377688785321507e-05, + "loss": 4.583, + "step": 38596 + }, + { + "epoch": 0.22954729279665048, + "grad_norm": 1.6071792840957642, + "learning_rate": 4.377657946386682e-05, + "loss": 5.0753, + "step": 38597 + }, + { + "epoch": 0.2295532400799315, + "grad_norm": 1.9293522834777832, + "learning_rate": 4.3776271067963854e-05, + "loss": 5.2675, + "step": 38598 + }, + { + "epoch": 0.22955918736321249, + "grad_norm": 1.8490512371063232, + "learning_rate": 4.377596266550629e-05, + "loss": 5.1265, + "step": 38599 + }, + { + "epoch": 0.22956513464649347, + "grad_norm": 1.7723667621612549, + "learning_rate": 4.3775654256494223e-05, + "loss": 4.8487, + "step": 38600 + }, + { + "epoch": 0.2295710819297745, + "grad_norm": 1.6557354927062988, + "learning_rate": 4.377534584092778e-05, + "loss": 4.986, + "step": 38601 + }, + { + "epoch": 0.22957702921305548, + "grad_norm": 1.9539614915847778, + "learning_rate": 4.377503741880705e-05, + "loss": 3.9877, + "step": 38602 + }, + { + "epoch": 0.22958297649633647, + "grad_norm": 1.7153127193450928, + "learning_rate": 4.377472899013216e-05, + "loss": 4.1697, + "step": 38603 + }, + { + "epoch": 0.22958892377961748, + "grad_norm": 1.461155652999878, + "learning_rate": 4.37744205549032e-05, + "loss": 4.7448, + "step": 38604 + }, + { + "epoch": 0.22959487106289847, + "grad_norm": 1.764017105102539, + "learning_rate": 4.377411211312027e-05, + "loss": 4.9356, + "step": 38605 + }, + { + "epoch": 0.22960081834617946, + "grad_norm": 1.632539987564087, + "learning_rate": 4.377380366478351e-05, + "loss": 5.008, + "step": 38606 + }, + { + "epoch": 0.22960676562946047, + "grad_norm": 1.527861475944519, + "learning_rate": 4.3773495209893e-05, + "loss": 5.057, + "step": 38607 + }, + { + "epoch": 0.22961271291274146, + "grad_norm": 1.5240938663482666, + "learning_rate": 4.377318674844886e-05, + "loss": 4.9921, + "step": 38608 + }, + { + "epoch": 0.22961866019602245, + "grad_norm": 1.7693698406219482, + "learning_rate": 4.377287828045119e-05, + "loss": 4.7684, + "step": 38609 + }, + { + "epoch": 0.22962460747930347, + "grad_norm": 1.8743075132369995, + "learning_rate": 4.377256980590011e-05, + "loss": 4.9417, + "step": 38610 + }, + { + "epoch": 0.22963055476258445, + "grad_norm": 1.7317231893539429, + "learning_rate": 4.377226132479571e-05, + "loss": 5.4715, + "step": 38611 + }, + { + "epoch": 0.22963650204586544, + "grad_norm": 1.7278715372085571, + "learning_rate": 4.377195283713812e-05, + "loss": 5.4851, + "step": 38612 + }, + { + "epoch": 0.22964244932914646, + "grad_norm": 1.6612467765808105, + "learning_rate": 4.377164434292743e-05, + "loss": 5.6315, + "step": 38613 + }, + { + "epoch": 0.22964839661242745, + "grad_norm": 1.584639549255371, + "learning_rate": 4.377133584216375e-05, + "loss": 5.4698, + "step": 38614 + }, + { + "epoch": 0.22965434389570843, + "grad_norm": 1.4986923933029175, + "learning_rate": 4.377102733484719e-05, + "loss": 4.8665, + "step": 38615 + }, + { + "epoch": 0.22966029117898945, + "grad_norm": 1.7153973579406738, + "learning_rate": 4.377071882097786e-05, + "loss": 4.8084, + "step": 38616 + }, + { + "epoch": 0.22966623846227044, + "grad_norm": 1.4490904808044434, + "learning_rate": 4.377041030055586e-05, + "loss": 5.0178, + "step": 38617 + }, + { + "epoch": 0.22967218574555143, + "grad_norm": 1.4377447366714478, + "learning_rate": 4.377010177358132e-05, + "loss": 5.4579, + "step": 38618 + }, + { + "epoch": 0.22967813302883244, + "grad_norm": 1.5415947437286377, + "learning_rate": 4.376979324005432e-05, + "loss": 5.3875, + "step": 38619 + }, + { + "epoch": 0.22968408031211343, + "grad_norm": 1.3691684007644653, + "learning_rate": 4.3769484699974974e-05, + "loss": 5.4255, + "step": 38620 + }, + { + "epoch": 0.22969002759539442, + "grad_norm": 1.26857328414917, + "learning_rate": 4.3769176153343404e-05, + "loss": 5.4, + "step": 38621 + }, + { + "epoch": 0.2296959748786754, + "grad_norm": 1.210349440574646, + "learning_rate": 4.376886760015971e-05, + "loss": 5.4278, + "step": 38622 + }, + { + "epoch": 0.22970192216195642, + "grad_norm": 1.2842780351638794, + "learning_rate": 4.376855904042399e-05, + "loss": 5.3804, + "step": 38623 + }, + { + "epoch": 0.2297078694452374, + "grad_norm": 1.165520429611206, + "learning_rate": 4.3768250474136364e-05, + "loss": 5.3628, + "step": 38624 + }, + { + "epoch": 0.2297138167285184, + "grad_norm": 1.4088956117630005, + "learning_rate": 4.376794190129694e-05, + "loss": 5.3132, + "step": 38625 + }, + { + "epoch": 0.22971976401179942, + "grad_norm": 1.6567013263702393, + "learning_rate": 4.376763332190581e-05, + "loss": 5.3518, + "step": 38626 + }, + { + "epoch": 0.2297257112950804, + "grad_norm": 1.6979491710662842, + "learning_rate": 4.37673247359631e-05, + "loss": 5.1141, + "step": 38627 + }, + { + "epoch": 0.2297316585783614, + "grad_norm": 1.5642043352127075, + "learning_rate": 4.3767016143468916e-05, + "loss": 5.0352, + "step": 38628 + }, + { + "epoch": 0.2297376058616424, + "grad_norm": 2.544403076171875, + "learning_rate": 4.3766707544423354e-05, + "loss": 4.7447, + "step": 38629 + }, + { + "epoch": 0.2297435531449234, + "grad_norm": 4.404366493225098, + "learning_rate": 4.376639893882653e-05, + "loss": 3.9793, + "step": 38630 + }, + { + "epoch": 0.22974950042820438, + "grad_norm": 2.5350682735443115, + "learning_rate": 4.3766090326678545e-05, + "loss": 4.7711, + "step": 38631 + }, + { + "epoch": 0.2297554477114854, + "grad_norm": 2.2797350883483887, + "learning_rate": 4.376578170797952e-05, + "loss": 4.9254, + "step": 38632 + }, + { + "epoch": 0.2297613949947664, + "grad_norm": 1.9921247959136963, + "learning_rate": 4.3765473082729544e-05, + "loss": 4.9098, + "step": 38633 + }, + { + "epoch": 0.22976734227804738, + "grad_norm": 2.9996330738067627, + "learning_rate": 4.3765164450928744e-05, + "loss": 2.6739, + "step": 38634 + }, + { + "epoch": 0.2297732895613284, + "grad_norm": 2.8430988788604736, + "learning_rate": 4.376485581257722e-05, + "loss": 2.8912, + "step": 38635 + }, + { + "epoch": 0.22977923684460938, + "grad_norm": 1.6286189556121826, + "learning_rate": 4.376454716767508e-05, + "loss": 5.0859, + "step": 38636 + }, + { + "epoch": 0.22978518412789037, + "grad_norm": 2.431854248046875, + "learning_rate": 4.376423851622242e-05, + "loss": 3.4044, + "step": 38637 + }, + { + "epoch": 0.22979113141117138, + "grad_norm": 2.651479482650757, + "learning_rate": 4.376392985821937e-05, + "loss": 3.5497, + "step": 38638 + }, + { + "epoch": 0.22979707869445237, + "grad_norm": 2.285003185272217, + "learning_rate": 4.376362119366601e-05, + "loss": 3.5145, + "step": 38639 + }, + { + "epoch": 0.22980302597773336, + "grad_norm": 2.3178820610046387, + "learning_rate": 4.3763312522562483e-05, + "loss": 3.3635, + "step": 38640 + }, + { + "epoch": 0.22980897326101438, + "grad_norm": 1.6898589134216309, + "learning_rate": 4.376300384490887e-05, + "loss": 4.376, + "step": 38641 + }, + { + "epoch": 0.22981492054429536, + "grad_norm": 1.586006999015808, + "learning_rate": 4.376269516070528e-05, + "loss": 5.0728, + "step": 38642 + }, + { + "epoch": 0.22982086782757635, + "grad_norm": 1.542847752571106, + "learning_rate": 4.376238646995183e-05, + "loss": 5.0111, + "step": 38643 + }, + { + "epoch": 0.22982681511085737, + "grad_norm": 1.7499191761016846, + "learning_rate": 4.376207777264863e-05, + "loss": 5.2223, + "step": 38644 + }, + { + "epoch": 0.22983276239413836, + "grad_norm": 1.7189160585403442, + "learning_rate": 4.376176906879578e-05, + "loss": 4.733, + "step": 38645 + }, + { + "epoch": 0.22983870967741934, + "grad_norm": 1.8861480951309204, + "learning_rate": 4.3761460358393384e-05, + "loss": 4.5083, + "step": 38646 + }, + { + "epoch": 0.22984465696070036, + "grad_norm": 2.286637783050537, + "learning_rate": 4.376115164144157e-05, + "loss": 5.2219, + "step": 38647 + }, + { + "epoch": 0.22985060424398135, + "grad_norm": 1.623051404953003, + "learning_rate": 4.376084291794042e-05, + "loss": 5.4874, + "step": 38648 + }, + { + "epoch": 0.22985655152726234, + "grad_norm": 1.6066075563430786, + "learning_rate": 4.3760534187890056e-05, + "loss": 5.3106, + "step": 38649 + }, + { + "epoch": 0.22986249881054335, + "grad_norm": 1.554235816001892, + "learning_rate": 4.376022545129058e-05, + "loss": 5.118, + "step": 38650 + }, + { + "epoch": 0.22986844609382434, + "grad_norm": 1.5623208284378052, + "learning_rate": 4.375991670814211e-05, + "loss": 5.1645, + "step": 38651 + }, + { + "epoch": 0.22987439337710533, + "grad_norm": 1.68990159034729, + "learning_rate": 4.375960795844474e-05, + "loss": 5.1204, + "step": 38652 + }, + { + "epoch": 0.22988034066038635, + "grad_norm": 2.6499814987182617, + "learning_rate": 4.3759299202198586e-05, + "loss": 3.8581, + "step": 38653 + }, + { + "epoch": 0.22988628794366733, + "grad_norm": 1.704318642616272, + "learning_rate": 4.375899043940376e-05, + "loss": 4.6711, + "step": 38654 + }, + { + "epoch": 0.22989223522694832, + "grad_norm": 1.5703479051589966, + "learning_rate": 4.375868167006036e-05, + "loss": 4.7781, + "step": 38655 + }, + { + "epoch": 0.22989818251022934, + "grad_norm": 1.4788894653320312, + "learning_rate": 4.3758372894168496e-05, + "loss": 5.6499, + "step": 38656 + }, + { + "epoch": 0.22990412979351033, + "grad_norm": 1.5116046667099, + "learning_rate": 4.3758064111728286e-05, + "loss": 5.6359, + "step": 38657 + }, + { + "epoch": 0.2299100770767913, + "grad_norm": 1.4817161560058594, + "learning_rate": 4.375775532273982e-05, + "loss": 5.4063, + "step": 38658 + }, + { + "epoch": 0.22991602436007233, + "grad_norm": 1.7036468982696533, + "learning_rate": 4.375744652720323e-05, + "loss": 5.4815, + "step": 38659 + }, + { + "epoch": 0.22992197164335332, + "grad_norm": 1.6927287578582764, + "learning_rate": 4.375713772511859e-05, + "loss": 4.5907, + "step": 38660 + }, + { + "epoch": 0.2299279189266343, + "grad_norm": 1.5966923236846924, + "learning_rate": 4.3756828916486035e-05, + "loss": 4.4788, + "step": 38661 + }, + { + "epoch": 0.22993386620991532, + "grad_norm": 1.7133103609085083, + "learning_rate": 4.375652010130567e-05, + "loss": 5.1275, + "step": 38662 + }, + { + "epoch": 0.2299398134931963, + "grad_norm": 1.5442966222763062, + "learning_rate": 4.37562112795776e-05, + "loss": 5.0381, + "step": 38663 + }, + { + "epoch": 0.2299457607764773, + "grad_norm": 1.5008933544158936, + "learning_rate": 4.3755902451301924e-05, + "loss": 5.1612, + "step": 38664 + }, + { + "epoch": 0.22995170805975831, + "grad_norm": 1.7190736532211304, + "learning_rate": 4.375559361647875e-05, + "loss": 5.0969, + "step": 38665 + }, + { + "epoch": 0.2299576553430393, + "grad_norm": 1.5287115573883057, + "learning_rate": 4.3755284775108195e-05, + "loss": 5.1347, + "step": 38666 + }, + { + "epoch": 0.2299636026263203, + "grad_norm": 1.6138951778411865, + "learning_rate": 4.375497592719037e-05, + "loss": 5.0052, + "step": 38667 + }, + { + "epoch": 0.2299695499096013, + "grad_norm": 2.543833017349243, + "learning_rate": 4.3754667072725374e-05, + "loss": 3.6834, + "step": 38668 + }, + { + "epoch": 0.2299754971928823, + "grad_norm": 3.930359363555908, + "learning_rate": 4.375435821171332e-05, + "loss": 1.4384, + "step": 38669 + }, + { + "epoch": 0.22998144447616328, + "grad_norm": 3.1169984340667725, + "learning_rate": 4.375404934415431e-05, + "loss": 1.8578, + "step": 38670 + }, + { + "epoch": 0.2299873917594443, + "grad_norm": 4.075973033905029, + "learning_rate": 4.3753740470048456e-05, + "loss": 1.2247, + "step": 38671 + }, + { + "epoch": 0.2299933390427253, + "grad_norm": 2.6648190021514893, + "learning_rate": 4.375343158939587e-05, + "loss": 1.079, + "step": 38672 + }, + { + "epoch": 0.22999928632600627, + "grad_norm": 2.488210916519165, + "learning_rate": 4.375312270219665e-05, + "loss": 1.0368, + "step": 38673 + }, + { + "epoch": 0.2300052336092873, + "grad_norm": 3.6227705478668213, + "learning_rate": 4.3752813808450896e-05, + "loss": 1.4294, + "step": 38674 + }, + { + "epoch": 0.23001118089256828, + "grad_norm": 4.386226654052734, + "learning_rate": 4.3752504908158744e-05, + "loss": 2.0833, + "step": 38675 + }, + { + "epoch": 0.23001712817584927, + "grad_norm": 3.764488697052002, + "learning_rate": 4.375219600132029e-05, + "loss": 1.6208, + "step": 38676 + }, + { + "epoch": 0.23002307545913028, + "grad_norm": 2.4590156078338623, + "learning_rate": 4.375188708793563e-05, + "loss": 0.9148, + "step": 38677 + }, + { + "epoch": 0.23002902274241127, + "grad_norm": 4.069763660430908, + "learning_rate": 4.3751578168004875e-05, + "loss": 2.3122, + "step": 38678 + }, + { + "epoch": 0.23003497002569226, + "grad_norm": 5.092566967010498, + "learning_rate": 4.375126924152815e-05, + "loss": 1.8317, + "step": 38679 + }, + { + "epoch": 0.23004091730897325, + "grad_norm": 3.4340686798095703, + "learning_rate": 4.375096030850554e-05, + "loss": 1.6576, + "step": 38680 + }, + { + "epoch": 0.23004686459225426, + "grad_norm": 2.129098653793335, + "learning_rate": 4.375065136893717e-05, + "loss": 4.0079, + "step": 38681 + }, + { + "epoch": 0.23005281187553525, + "grad_norm": 2.533949851989746, + "learning_rate": 4.3750342422823135e-05, + "loss": 4.1513, + "step": 38682 + }, + { + "epoch": 0.23005875915881624, + "grad_norm": 1.7268353700637817, + "learning_rate": 4.375003347016356e-05, + "loss": 4.2263, + "step": 38683 + }, + { + "epoch": 0.23006470644209726, + "grad_norm": 1.9843735694885254, + "learning_rate": 4.3749724510958535e-05, + "loss": 4.9357, + "step": 38684 + }, + { + "epoch": 0.23007065372537824, + "grad_norm": 1.983786702156067, + "learning_rate": 4.374941554520817e-05, + "loss": 4.569, + "step": 38685 + }, + { + "epoch": 0.23007660100865923, + "grad_norm": 1.839876651763916, + "learning_rate": 4.374910657291258e-05, + "loss": 4.8968, + "step": 38686 + }, + { + "epoch": 0.23008254829194025, + "grad_norm": 1.591556429862976, + "learning_rate": 4.374879759407188e-05, + "loss": 4.809, + "step": 38687 + }, + { + "epoch": 0.23008849557522124, + "grad_norm": 1.8136730194091797, + "learning_rate": 4.374848860868615e-05, + "loss": 5.2378, + "step": 38688 + }, + { + "epoch": 0.23009444285850222, + "grad_norm": 1.4922617673873901, + "learning_rate": 4.374817961675553e-05, + "loss": 5.4901, + "step": 38689 + }, + { + "epoch": 0.23010039014178324, + "grad_norm": 1.5398420095443726, + "learning_rate": 4.374787061828012e-05, + "loss": 5.277, + "step": 38690 + }, + { + "epoch": 0.23010633742506423, + "grad_norm": 1.469018578529358, + "learning_rate": 4.3747561613260013e-05, + "loss": 5.2663, + "step": 38691 + }, + { + "epoch": 0.23011228470834522, + "grad_norm": 1.3555761575698853, + "learning_rate": 4.374725260169533e-05, + "loss": 5.3116, + "step": 38692 + }, + { + "epoch": 0.23011823199162623, + "grad_norm": 1.5147504806518555, + "learning_rate": 4.3746943583586175e-05, + "loss": 5.262, + "step": 38693 + }, + { + "epoch": 0.23012417927490722, + "grad_norm": 1.3536839485168457, + "learning_rate": 4.3746634558932646e-05, + "loss": 5.1616, + "step": 38694 + }, + { + "epoch": 0.2301301265581882, + "grad_norm": 1.3796242475509644, + "learning_rate": 4.374632552773487e-05, + "loss": 5.1077, + "step": 38695 + }, + { + "epoch": 0.23013607384146922, + "grad_norm": 1.4209028482437134, + "learning_rate": 4.374601648999295e-05, + "loss": 5.2542, + "step": 38696 + }, + { + "epoch": 0.2301420211247502, + "grad_norm": 1.4143218994140625, + "learning_rate": 4.374570744570697e-05, + "loss": 5.026, + "step": 38697 + }, + { + "epoch": 0.2301479684080312, + "grad_norm": 1.1739543676376343, + "learning_rate": 4.3745398394877074e-05, + "loss": 5.0517, + "step": 38698 + }, + { + "epoch": 0.23015391569131222, + "grad_norm": 1.769179105758667, + "learning_rate": 4.374508933750335e-05, + "loss": 5.3818, + "step": 38699 + }, + { + "epoch": 0.2301598629745932, + "grad_norm": 1.3909661769866943, + "learning_rate": 4.37447802735859e-05, + "loss": 4.9604, + "step": 38700 + }, + { + "epoch": 0.2301658102578742, + "grad_norm": 1.6927801370620728, + "learning_rate": 4.374447120312486e-05, + "loss": 4.9461, + "step": 38701 + }, + { + "epoch": 0.2301717575411552, + "grad_norm": 1.735437273979187, + "learning_rate": 4.37441621261203e-05, + "loss": 4.9774, + "step": 38702 + }, + { + "epoch": 0.2301777048244362, + "grad_norm": 1.732106328010559, + "learning_rate": 4.3743853042572355e-05, + "loss": 4.7804, + "step": 38703 + }, + { + "epoch": 0.23018365210771718, + "grad_norm": 1.686942219734192, + "learning_rate": 4.3743543952481126e-05, + "loss": 4.8964, + "step": 38704 + }, + { + "epoch": 0.2301895993909982, + "grad_norm": 1.6482768058776855, + "learning_rate": 4.3743234855846716e-05, + "loss": 4.8313, + "step": 38705 + }, + { + "epoch": 0.2301955466742792, + "grad_norm": 1.6313527822494507, + "learning_rate": 4.374292575266924e-05, + "loss": 4.9976, + "step": 38706 + }, + { + "epoch": 0.23020149395756018, + "grad_norm": 1.6183964014053345, + "learning_rate": 4.3742616642948796e-05, + "loss": 5.4597, + "step": 38707 + }, + { + "epoch": 0.2302074412408412, + "grad_norm": 1.8126240968704224, + "learning_rate": 4.37423075266855e-05, + "loss": 5.1256, + "step": 38708 + }, + { + "epoch": 0.23021338852412218, + "grad_norm": 1.8139927387237549, + "learning_rate": 4.374199840387946e-05, + "loss": 5.239, + "step": 38709 + }, + { + "epoch": 0.23021933580740317, + "grad_norm": 1.6314213275909424, + "learning_rate": 4.374168927453078e-05, + "loss": 5.0193, + "step": 38710 + }, + { + "epoch": 0.23022528309068419, + "grad_norm": 1.8324249982833862, + "learning_rate": 4.3741380138639574e-05, + "loss": 4.9343, + "step": 38711 + }, + { + "epoch": 0.23023123037396517, + "grad_norm": 1.4922306537628174, + "learning_rate": 4.3741070996205944e-05, + "loss": 4.9846, + "step": 38712 + }, + { + "epoch": 0.23023717765724616, + "grad_norm": 1.712268352508545, + "learning_rate": 4.374076184723e-05, + "loss": 4.5509, + "step": 38713 + }, + { + "epoch": 0.23024312494052718, + "grad_norm": 1.872574806213379, + "learning_rate": 4.374045269171185e-05, + "loss": 4.6417, + "step": 38714 + }, + { + "epoch": 0.23024907222380817, + "grad_norm": 1.7834235429763794, + "learning_rate": 4.37401435296516e-05, + "loss": 4.4271, + "step": 38715 + }, + { + "epoch": 0.23025501950708915, + "grad_norm": 2.986769437789917, + "learning_rate": 4.3739834361049356e-05, + "loss": 1.9502, + "step": 38716 + }, + { + "epoch": 0.23026096679037017, + "grad_norm": 1.4724233150482178, + "learning_rate": 4.373952518590524e-05, + "loss": 4.7217, + "step": 38717 + }, + { + "epoch": 0.23026691407365116, + "grad_norm": 1.3835045099258423, + "learning_rate": 4.373921600421934e-05, + "loss": 4.5464, + "step": 38718 + }, + { + "epoch": 0.23027286135693215, + "grad_norm": 1.334065318107605, + "learning_rate": 4.373890681599178e-05, + "loss": 4.553, + "step": 38719 + }, + { + "epoch": 0.23027880864021316, + "grad_norm": 1.5014736652374268, + "learning_rate": 4.373859762122266e-05, + "loss": 4.638, + "step": 38720 + }, + { + "epoch": 0.23028475592349415, + "grad_norm": 1.645545244216919, + "learning_rate": 4.373828841991208e-05, + "loss": 4.4559, + "step": 38721 + }, + { + "epoch": 0.23029070320677514, + "grad_norm": 1.8344944715499878, + "learning_rate": 4.373797921206016e-05, + "loss": 4.0921, + "step": 38722 + }, + { + "epoch": 0.23029665049005615, + "grad_norm": 1.7468883991241455, + "learning_rate": 4.373766999766701e-05, + "loss": 4.4421, + "step": 38723 + }, + { + "epoch": 0.23030259777333714, + "grad_norm": 1.9306972026824951, + "learning_rate": 4.3737360776732726e-05, + "loss": 4.0814, + "step": 38724 + }, + { + "epoch": 0.23030854505661813, + "grad_norm": 1.600232481956482, + "learning_rate": 4.3737051549257435e-05, + "loss": 4.1041, + "step": 38725 + }, + { + "epoch": 0.23031449233989915, + "grad_norm": 1.8883020877838135, + "learning_rate": 4.373674231524123e-05, + "loss": 3.8982, + "step": 38726 + }, + { + "epoch": 0.23032043962318013, + "grad_norm": 1.8079781532287598, + "learning_rate": 4.373643307468421e-05, + "loss": 3.8882, + "step": 38727 + }, + { + "epoch": 0.23032638690646112, + "grad_norm": 1.4945000410079956, + "learning_rate": 4.3736123827586506e-05, + "loss": 4.7151, + "step": 38728 + }, + { + "epoch": 0.23033233418974214, + "grad_norm": 2.077974319458008, + "learning_rate": 4.373581457394821e-05, + "loss": 4.167, + "step": 38729 + }, + { + "epoch": 0.23033828147302313, + "grad_norm": 1.9432013034820557, + "learning_rate": 4.3735505313769446e-05, + "loss": 4.0149, + "step": 38730 + }, + { + "epoch": 0.23034422875630411, + "grad_norm": 2.184173107147217, + "learning_rate": 4.37351960470503e-05, + "loss": 4.3668, + "step": 38731 + }, + { + "epoch": 0.23035017603958513, + "grad_norm": 3.3158299922943115, + "learning_rate": 4.3734886773790885e-05, + "loss": 3.8251, + "step": 38732 + }, + { + "epoch": 0.23035612332286612, + "grad_norm": 2.032789468765259, + "learning_rate": 4.3734577493991326e-05, + "loss": 3.38, + "step": 38733 + }, + { + "epoch": 0.2303620706061471, + "grad_norm": 2.079367160797119, + "learning_rate": 4.3734268207651704e-05, + "loss": 3.5598, + "step": 38734 + }, + { + "epoch": 0.23036801788942812, + "grad_norm": 2.6133997440338135, + "learning_rate": 4.373395891477216e-05, + "loss": 3.5329, + "step": 38735 + }, + { + "epoch": 0.2303739651727091, + "grad_norm": 1.6688917875289917, + "learning_rate": 4.373364961535278e-05, + "loss": 4.4199, + "step": 38736 + }, + { + "epoch": 0.2303799124559901, + "grad_norm": 1.7220234870910645, + "learning_rate": 4.373334030939367e-05, + "loss": 4.3154, + "step": 38737 + }, + { + "epoch": 0.2303858597392711, + "grad_norm": 1.7266229391098022, + "learning_rate": 4.3733030996894954e-05, + "loss": 4.5222, + "step": 38738 + }, + { + "epoch": 0.2303918070225521, + "grad_norm": 1.653295636177063, + "learning_rate": 4.373272167785672e-05, + "loss": 4.4132, + "step": 38739 + }, + { + "epoch": 0.2303977543058331, + "grad_norm": 1.6252208948135376, + "learning_rate": 4.373241235227909e-05, + "loss": 4.4224, + "step": 38740 + }, + { + "epoch": 0.23040370158911408, + "grad_norm": 1.8031059503555298, + "learning_rate": 4.373210302016217e-05, + "loss": 4.3607, + "step": 38741 + }, + { + "epoch": 0.2304096488723951, + "grad_norm": 1.5618962049484253, + "learning_rate": 4.3731793681506075e-05, + "loss": 4.3514, + "step": 38742 + }, + { + "epoch": 0.23041559615567608, + "grad_norm": 1.519148588180542, + "learning_rate": 4.373148433631089e-05, + "loss": 4.7573, + "step": 38743 + }, + { + "epoch": 0.23042154343895707, + "grad_norm": 1.2487481832504272, + "learning_rate": 4.373117498457675e-05, + "loss": 4.6323, + "step": 38744 + }, + { + "epoch": 0.2304274907222381, + "grad_norm": 1.4358158111572266, + "learning_rate": 4.373086562630374e-05, + "loss": 4.5298, + "step": 38745 + }, + { + "epoch": 0.23043343800551908, + "grad_norm": 1.5527933835983276, + "learning_rate": 4.373055626149198e-05, + "loss": 4.5041, + "step": 38746 + }, + { + "epoch": 0.23043938528880006, + "grad_norm": 1.6646860837936401, + "learning_rate": 4.373024689014158e-05, + "loss": 4.5401, + "step": 38747 + }, + { + "epoch": 0.23044533257208108, + "grad_norm": 1.8804651498794556, + "learning_rate": 4.372993751225264e-05, + "loss": 4.3742, + "step": 38748 + }, + { + "epoch": 0.23045127985536207, + "grad_norm": 1.8429540395736694, + "learning_rate": 4.372962812782527e-05, + "loss": 4.2555, + "step": 38749 + }, + { + "epoch": 0.23045722713864306, + "grad_norm": 1.473212480545044, + "learning_rate": 4.372931873685959e-05, + "loss": 5.2383, + "step": 38750 + }, + { + "epoch": 0.23046317442192407, + "grad_norm": 1.712003231048584, + "learning_rate": 4.372900933935569e-05, + "loss": 4.3601, + "step": 38751 + }, + { + "epoch": 0.23046912170520506, + "grad_norm": 1.7229880094528198, + "learning_rate": 4.3728699935313687e-05, + "loss": 4.5068, + "step": 38752 + }, + { + "epoch": 0.23047506898848605, + "grad_norm": 1.782601237297058, + "learning_rate": 4.37283905247337e-05, + "loss": 4.1018, + "step": 38753 + }, + { + "epoch": 0.23048101627176706, + "grad_norm": 2.0935075283050537, + "learning_rate": 4.3728081107615814e-05, + "loss": 3.8254, + "step": 38754 + }, + { + "epoch": 0.23048696355504805, + "grad_norm": 1.812392234802246, + "learning_rate": 4.372777168396015e-05, + "loss": 4.1183, + "step": 38755 + }, + { + "epoch": 0.23049291083832904, + "grad_norm": 1.8439925909042358, + "learning_rate": 4.3727462253766816e-05, + "loss": 4.3258, + "step": 38756 + }, + { + "epoch": 0.23049885812161006, + "grad_norm": 1.7139822244644165, + "learning_rate": 4.372715281703592e-05, + "loss": 4.1782, + "step": 38757 + }, + { + "epoch": 0.23050480540489104, + "grad_norm": 1.7018375396728516, + "learning_rate": 4.372684337376756e-05, + "loss": 4.3934, + "step": 38758 + }, + { + "epoch": 0.23051075268817203, + "grad_norm": 1.8785852193832397, + "learning_rate": 4.3726533923961854e-05, + "loss": 4.0242, + "step": 38759 + }, + { + "epoch": 0.23051669997145305, + "grad_norm": 1.8708945512771606, + "learning_rate": 4.372622446761891e-05, + "loss": 4.2316, + "step": 38760 + }, + { + "epoch": 0.23052264725473404, + "grad_norm": 1.5988284349441528, + "learning_rate": 4.3725915004738846e-05, + "loss": 4.2701, + "step": 38761 + }, + { + "epoch": 0.23052859453801502, + "grad_norm": 1.6898179054260254, + "learning_rate": 4.372560553532175e-05, + "loss": 4.1239, + "step": 38762 + }, + { + "epoch": 0.23053454182129604, + "grad_norm": 2.0045530796051025, + "learning_rate": 4.3725296059367735e-05, + "loss": 3.9543, + "step": 38763 + }, + { + "epoch": 0.23054048910457703, + "grad_norm": 2.071551561355591, + "learning_rate": 4.372498657687691e-05, + "loss": 4.0462, + "step": 38764 + }, + { + "epoch": 0.23054643638785802, + "grad_norm": 2.0475335121154785, + "learning_rate": 4.3724677087849394e-05, + "loss": 4.2556, + "step": 38765 + }, + { + "epoch": 0.23055238367113903, + "grad_norm": 1.7819331884384155, + "learning_rate": 4.372436759228529e-05, + "loss": 4.4156, + "step": 38766 + }, + { + "epoch": 0.23055833095442002, + "grad_norm": 1.7513604164123535, + "learning_rate": 4.372405809018469e-05, + "loss": 4.6457, + "step": 38767 + }, + { + "epoch": 0.230564278237701, + "grad_norm": 1.6513689756393433, + "learning_rate": 4.372374858154773e-05, + "loss": 4.8633, + "step": 38768 + }, + { + "epoch": 0.23057022552098203, + "grad_norm": 1.607474684715271, + "learning_rate": 4.372343906637449e-05, + "loss": 4.9432, + "step": 38769 + }, + { + "epoch": 0.230576172804263, + "grad_norm": 1.624382734298706, + "learning_rate": 4.3723129544665097e-05, + "loss": 5.0585, + "step": 38770 + }, + { + "epoch": 0.230582120087544, + "grad_norm": 1.963037133216858, + "learning_rate": 4.3722820016419655e-05, + "loss": 4.6664, + "step": 38771 + }, + { + "epoch": 0.23058806737082502, + "grad_norm": 2.7608580589294434, + "learning_rate": 4.372251048163827e-05, + "loss": 4.6565, + "step": 38772 + }, + { + "epoch": 0.230594014654106, + "grad_norm": 2.3045732975006104, + "learning_rate": 4.372220094032104e-05, + "loss": 4.3961, + "step": 38773 + }, + { + "epoch": 0.230599961937387, + "grad_norm": 1.717413067817688, + "learning_rate": 4.372189139246809e-05, + "loss": 4.6134, + "step": 38774 + }, + { + "epoch": 0.230605909220668, + "grad_norm": 1.5653536319732666, + "learning_rate": 4.372158183807952e-05, + "loss": 4.6429, + "step": 38775 + }, + { + "epoch": 0.230611856503949, + "grad_norm": 1.5364784002304077, + "learning_rate": 4.372127227715544e-05, + "loss": 4.8575, + "step": 38776 + }, + { + "epoch": 0.23061780378722999, + "grad_norm": 1.5625269412994385, + "learning_rate": 4.372096270969595e-05, + "loss": 4.7371, + "step": 38777 + }, + { + "epoch": 0.230623751070511, + "grad_norm": 1.5205355882644653, + "learning_rate": 4.3720653135701185e-05, + "loss": 4.6553, + "step": 38778 + }, + { + "epoch": 0.230629698353792, + "grad_norm": 1.3826833963394165, + "learning_rate": 4.372034355517122e-05, + "loss": 4.5907, + "step": 38779 + }, + { + "epoch": 0.23063564563707298, + "grad_norm": 1.336030125617981, + "learning_rate": 4.3720033968106175e-05, + "loss": 4.4755, + "step": 38780 + }, + { + "epoch": 0.230641592920354, + "grad_norm": 1.2729898691177368, + "learning_rate": 4.371972437450616e-05, + "loss": 4.4421, + "step": 38781 + }, + { + "epoch": 0.23064754020363498, + "grad_norm": 1.8722045421600342, + "learning_rate": 4.371941477437128e-05, + "loss": 4.3859, + "step": 38782 + }, + { + "epoch": 0.23065348748691597, + "grad_norm": 1.5908498764038086, + "learning_rate": 4.3719105167701654e-05, + "loss": 4.4019, + "step": 38783 + }, + { + "epoch": 0.23065943477019699, + "grad_norm": 2.3267743587493896, + "learning_rate": 4.3718795554497385e-05, + "loss": 3.559, + "step": 38784 + }, + { + "epoch": 0.23066538205347797, + "grad_norm": 2.3056678771972656, + "learning_rate": 4.371848593475856e-05, + "loss": 3.5686, + "step": 38785 + }, + { + "epoch": 0.23067132933675896, + "grad_norm": 1.8726112842559814, + "learning_rate": 4.371817630848532e-05, + "loss": 5.0275, + "step": 38786 + }, + { + "epoch": 0.23067727662003998, + "grad_norm": 1.9857810735702515, + "learning_rate": 4.371786667567775e-05, + "loss": 4.9584, + "step": 38787 + }, + { + "epoch": 0.23068322390332097, + "grad_norm": 1.8844271898269653, + "learning_rate": 4.371755703633598e-05, + "loss": 4.0207, + "step": 38788 + }, + { + "epoch": 0.23068917118660195, + "grad_norm": 2.8062679767608643, + "learning_rate": 4.3717247390460095e-05, + "loss": 4.2182, + "step": 38789 + }, + { + "epoch": 0.23069511846988297, + "grad_norm": 2.765418529510498, + "learning_rate": 4.37169377380502e-05, + "loss": 4.2685, + "step": 38790 + }, + { + "epoch": 0.23070106575316396, + "grad_norm": 1.712620735168457, + "learning_rate": 4.371662807910643e-05, + "loss": 4.6177, + "step": 38791 + }, + { + "epoch": 0.23070701303644495, + "grad_norm": 1.7626475095748901, + "learning_rate": 4.371631841362888e-05, + "loss": 4.8108, + "step": 38792 + }, + { + "epoch": 0.23071296031972596, + "grad_norm": 1.7972310781478882, + "learning_rate": 4.371600874161765e-05, + "loss": 4.5635, + "step": 38793 + }, + { + "epoch": 0.23071890760300695, + "grad_norm": 2.1518964767456055, + "learning_rate": 4.3715699063072854e-05, + "loss": 5.0052, + "step": 38794 + }, + { + "epoch": 0.23072485488628794, + "grad_norm": 2.6726882457733154, + "learning_rate": 4.371538937799461e-05, + "loss": 3.2855, + "step": 38795 + }, + { + "epoch": 0.23073080216956893, + "grad_norm": 2.3370792865753174, + "learning_rate": 4.371507968638301e-05, + "loss": 3.358, + "step": 38796 + }, + { + "epoch": 0.23073674945284994, + "grad_norm": 1.9931254386901855, + "learning_rate": 4.371476998823817e-05, + "loss": 4.6237, + "step": 38797 + }, + { + "epoch": 0.23074269673613093, + "grad_norm": 1.9586458206176758, + "learning_rate": 4.371446028356019e-05, + "loss": 4.5883, + "step": 38798 + }, + { + "epoch": 0.23074864401941192, + "grad_norm": 2.060645341873169, + "learning_rate": 4.3714150572349194e-05, + "loss": 4.637, + "step": 38799 + }, + { + "epoch": 0.23075459130269294, + "grad_norm": 1.6580359935760498, + "learning_rate": 4.3713840854605284e-05, + "loss": 4.4867, + "step": 38800 + }, + { + "epoch": 0.23076053858597392, + "grad_norm": 1.6574506759643555, + "learning_rate": 4.3713531130328554e-05, + "loss": 4.5072, + "step": 38801 + }, + { + "epoch": 0.2307664858692549, + "grad_norm": 1.7564977407455444, + "learning_rate": 4.371322139951913e-05, + "loss": 4.4847, + "step": 38802 + }, + { + "epoch": 0.23077243315253593, + "grad_norm": 1.671775221824646, + "learning_rate": 4.371291166217712e-05, + "loss": 4.4913, + "step": 38803 + }, + { + "epoch": 0.23077838043581692, + "grad_norm": 1.7091946601867676, + "learning_rate": 4.371260191830261e-05, + "loss": 4.4429, + "step": 38804 + }, + { + "epoch": 0.2307843277190979, + "grad_norm": 1.5660812854766846, + "learning_rate": 4.371229216789574e-05, + "loss": 4.3089, + "step": 38805 + }, + { + "epoch": 0.23079027500237892, + "grad_norm": 1.6085116863250732, + "learning_rate": 4.3711982410956596e-05, + "loss": 4.4015, + "step": 38806 + }, + { + "epoch": 0.2307962222856599, + "grad_norm": 1.703705072402954, + "learning_rate": 4.3711672647485294e-05, + "loss": 4.1894, + "step": 38807 + }, + { + "epoch": 0.2308021695689409, + "grad_norm": 1.681896686553955, + "learning_rate": 4.371136287748193e-05, + "loss": 4.3658, + "step": 38808 + }, + { + "epoch": 0.2308081168522219, + "grad_norm": 1.5659677982330322, + "learning_rate": 4.371105310094664e-05, + "loss": 4.3422, + "step": 38809 + }, + { + "epoch": 0.2308140641355029, + "grad_norm": 1.5186078548431396, + "learning_rate": 4.3710743317879504e-05, + "loss": 4.0806, + "step": 38810 + }, + { + "epoch": 0.2308200114187839, + "grad_norm": 1.5934349298477173, + "learning_rate": 4.3710433528280644e-05, + "loss": 4.2978, + "step": 38811 + }, + { + "epoch": 0.2308259587020649, + "grad_norm": 1.951122522354126, + "learning_rate": 4.371012373215016e-05, + "loss": 5.2274, + "step": 38812 + }, + { + "epoch": 0.2308319059853459, + "grad_norm": 1.8681738376617432, + "learning_rate": 4.370981392948817e-05, + "loss": 4.4869, + "step": 38813 + }, + { + "epoch": 0.23083785326862688, + "grad_norm": 1.753472924232483, + "learning_rate": 4.3709504120294775e-05, + "loss": 4.3416, + "step": 38814 + }, + { + "epoch": 0.2308438005519079, + "grad_norm": 1.6463114023208618, + "learning_rate": 4.370919430457009e-05, + "loss": 4.3267, + "step": 38815 + }, + { + "epoch": 0.23084974783518888, + "grad_norm": 1.634188175201416, + "learning_rate": 4.3708884482314215e-05, + "loss": 4.2345, + "step": 38816 + }, + { + "epoch": 0.23085569511846987, + "grad_norm": 1.6019244194030762, + "learning_rate": 4.370857465352726e-05, + "loss": 4.249, + "step": 38817 + }, + { + "epoch": 0.2308616424017509, + "grad_norm": 1.555641531944275, + "learning_rate": 4.3708264818209335e-05, + "loss": 4.3064, + "step": 38818 + }, + { + "epoch": 0.23086758968503188, + "grad_norm": 1.6986194849014282, + "learning_rate": 4.3707954976360556e-05, + "loss": 4.2284, + "step": 38819 + }, + { + "epoch": 0.23087353696831286, + "grad_norm": 1.656322956085205, + "learning_rate": 4.370764512798101e-05, + "loss": 4.351, + "step": 38820 + }, + { + "epoch": 0.23087948425159388, + "grad_norm": 1.7241042852401733, + "learning_rate": 4.370733527307083e-05, + "loss": 4.3169, + "step": 38821 + }, + { + "epoch": 0.23088543153487487, + "grad_norm": 1.7275463342666626, + "learning_rate": 4.370702541163011e-05, + "loss": 4.4002, + "step": 38822 + }, + { + "epoch": 0.23089137881815586, + "grad_norm": 1.549770712852478, + "learning_rate": 4.3706715543658957e-05, + "loss": 4.0978, + "step": 38823 + }, + { + "epoch": 0.23089732610143687, + "grad_norm": 1.6060540676116943, + "learning_rate": 4.370640566915748e-05, + "loss": 4.435, + "step": 38824 + }, + { + "epoch": 0.23090327338471786, + "grad_norm": 1.6758986711502075, + "learning_rate": 4.3706095788125795e-05, + "loss": 4.4059, + "step": 38825 + }, + { + "epoch": 0.23090922066799885, + "grad_norm": 1.7185044288635254, + "learning_rate": 4.3705785900564e-05, + "loss": 4.3024, + "step": 38826 + }, + { + "epoch": 0.23091516795127986, + "grad_norm": 1.5720844268798828, + "learning_rate": 4.370547600647222e-05, + "loss": 4.4142, + "step": 38827 + }, + { + "epoch": 0.23092111523456085, + "grad_norm": 1.5926580429077148, + "learning_rate": 4.370516610585054e-05, + "loss": 4.3386, + "step": 38828 + }, + { + "epoch": 0.23092706251784184, + "grad_norm": 1.4387127161026, + "learning_rate": 4.3704856198699085e-05, + "loss": 4.2481, + "step": 38829 + }, + { + "epoch": 0.23093300980112286, + "grad_norm": 1.7872234582901, + "learning_rate": 4.3704546285017954e-05, + "loss": 4.1025, + "step": 38830 + }, + { + "epoch": 0.23093895708440385, + "grad_norm": 1.7201859951019287, + "learning_rate": 4.3704236364807264e-05, + "loss": 4.1159, + "step": 38831 + }, + { + "epoch": 0.23094490436768483, + "grad_norm": 1.7127646207809448, + "learning_rate": 4.370392643806712e-05, + "loss": 4.2173, + "step": 38832 + }, + { + "epoch": 0.23095085165096585, + "grad_norm": 1.982006549835205, + "learning_rate": 4.370361650479763e-05, + "loss": 4.1575, + "step": 38833 + }, + { + "epoch": 0.23095679893424684, + "grad_norm": 1.5777769088745117, + "learning_rate": 4.370330656499889e-05, + "loss": 4.0942, + "step": 38834 + }, + { + "epoch": 0.23096274621752783, + "grad_norm": 2.2118375301361084, + "learning_rate": 4.370299661867103e-05, + "loss": 3.961, + "step": 38835 + }, + { + "epoch": 0.23096869350080884, + "grad_norm": 2.4417662620544434, + "learning_rate": 4.3702686665814144e-05, + "loss": 3.5483, + "step": 38836 + }, + { + "epoch": 0.23097464078408983, + "grad_norm": 2.306452751159668, + "learning_rate": 4.3702376706428335e-05, + "loss": 4.0667, + "step": 38837 + }, + { + "epoch": 0.23098058806737082, + "grad_norm": 2.5894603729248047, + "learning_rate": 4.3702066740513726e-05, + "loss": 3.7923, + "step": 38838 + }, + { + "epoch": 0.23098653535065183, + "grad_norm": 2.077296257019043, + "learning_rate": 4.370175676807042e-05, + "loss": 4.1861, + "step": 38839 + }, + { + "epoch": 0.23099248263393282, + "grad_norm": 2.2528553009033203, + "learning_rate": 4.3701446789098523e-05, + "loss": 4.1747, + "step": 38840 + }, + { + "epoch": 0.2309984299172138, + "grad_norm": 1.9800342321395874, + "learning_rate": 4.370113680359814e-05, + "loss": 4.1947, + "step": 38841 + }, + { + "epoch": 0.23100437720049483, + "grad_norm": 1.7648961544036865, + "learning_rate": 4.370082681156939e-05, + "loss": 4.5052, + "step": 38842 + }, + { + "epoch": 0.23101032448377581, + "grad_norm": 1.8483437299728394, + "learning_rate": 4.3700516813012374e-05, + "loss": 4.157, + "step": 38843 + }, + { + "epoch": 0.2310162717670568, + "grad_norm": 2.423189163208008, + "learning_rate": 4.370020680792719e-05, + "loss": 3.8596, + "step": 38844 + }, + { + "epoch": 0.23102221905033782, + "grad_norm": 2.211770534515381, + "learning_rate": 4.369989679631397e-05, + "loss": 3.6305, + "step": 38845 + }, + { + "epoch": 0.2310281663336188, + "grad_norm": 1.9826966524124146, + "learning_rate": 4.36995867781728e-05, + "loss": 4.0012, + "step": 38846 + }, + { + "epoch": 0.2310341136168998, + "grad_norm": 1.9183090925216675, + "learning_rate": 4.3699276753503804e-05, + "loss": 4.1553, + "step": 38847 + }, + { + "epoch": 0.2310400609001808, + "grad_norm": 1.7809723615646362, + "learning_rate": 4.3698966722307085e-05, + "loss": 4.8814, + "step": 38848 + }, + { + "epoch": 0.2310460081834618, + "grad_norm": 1.6132829189300537, + "learning_rate": 4.369865668458274e-05, + "loss": 5.2379, + "step": 38849 + }, + { + "epoch": 0.2310519554667428, + "grad_norm": 1.4619427919387817, + "learning_rate": 4.36983466403309e-05, + "loss": 5.0469, + "step": 38850 + }, + { + "epoch": 0.2310579027500238, + "grad_norm": 1.6037229299545288, + "learning_rate": 4.369803658955165e-05, + "loss": 4.6583, + "step": 38851 + }, + { + "epoch": 0.2310638500333048, + "grad_norm": 1.3536498546600342, + "learning_rate": 4.369772653224512e-05, + "loss": 5.2745, + "step": 38852 + }, + { + "epoch": 0.23106979731658578, + "grad_norm": 1.6615324020385742, + "learning_rate": 4.36974164684114e-05, + "loss": 4.6271, + "step": 38853 + }, + { + "epoch": 0.2310757445998668, + "grad_norm": 1.6488821506500244, + "learning_rate": 4.36971063980506e-05, + "loss": 4.7926, + "step": 38854 + }, + { + "epoch": 0.23108169188314778, + "grad_norm": 1.3780089616775513, + "learning_rate": 4.3696796321162836e-05, + "loss": 4.5641, + "step": 38855 + }, + { + "epoch": 0.23108763916642877, + "grad_norm": 1.5264968872070312, + "learning_rate": 4.3696486237748215e-05, + "loss": 4.7264, + "step": 38856 + }, + { + "epoch": 0.23109358644970976, + "grad_norm": 1.828169822692871, + "learning_rate": 4.369617614780685e-05, + "loss": 4.441, + "step": 38857 + }, + { + "epoch": 0.23109953373299078, + "grad_norm": 1.3571844100952148, + "learning_rate": 4.369586605133883e-05, + "loss": 4.861, + "step": 38858 + }, + { + "epoch": 0.23110548101627176, + "grad_norm": 1.5678229331970215, + "learning_rate": 4.369555594834429e-05, + "loss": 4.6285, + "step": 38859 + }, + { + "epoch": 0.23111142829955275, + "grad_norm": 1.6185591220855713, + "learning_rate": 4.369524583882332e-05, + "loss": 4.7623, + "step": 38860 + }, + { + "epoch": 0.23111737558283377, + "grad_norm": 1.7087242603302002, + "learning_rate": 4.369493572277603e-05, + "loss": 4.7057, + "step": 38861 + }, + { + "epoch": 0.23112332286611476, + "grad_norm": 1.356367588043213, + "learning_rate": 4.3694625600202534e-05, + "loss": 4.8264, + "step": 38862 + }, + { + "epoch": 0.23112927014939574, + "grad_norm": 1.3614306449890137, + "learning_rate": 4.3694315471102934e-05, + "loss": 4.5589, + "step": 38863 + }, + { + "epoch": 0.23113521743267676, + "grad_norm": 2.0256147384643555, + "learning_rate": 4.369400533547734e-05, + "loss": 4.1103, + "step": 38864 + }, + { + "epoch": 0.23114116471595775, + "grad_norm": 1.8039603233337402, + "learning_rate": 4.369369519332586e-05, + "loss": 4.522, + "step": 38865 + }, + { + "epoch": 0.23114711199923874, + "grad_norm": 1.923120141029358, + "learning_rate": 4.3693385044648614e-05, + "loss": 4.6754, + "step": 38866 + }, + { + "epoch": 0.23115305928251975, + "grad_norm": 1.618260145187378, + "learning_rate": 4.3693074889445695e-05, + "loss": 4.7447, + "step": 38867 + }, + { + "epoch": 0.23115900656580074, + "grad_norm": 1.4669636487960815, + "learning_rate": 4.3692764727717214e-05, + "loss": 5.0623, + "step": 38868 + }, + { + "epoch": 0.23116495384908173, + "grad_norm": 1.5794733762741089, + "learning_rate": 4.3692454559463286e-05, + "loss": 4.8538, + "step": 38869 + }, + { + "epoch": 0.23117090113236274, + "grad_norm": 1.8218353986740112, + "learning_rate": 4.369214438468402e-05, + "loss": 4.4285, + "step": 38870 + }, + { + "epoch": 0.23117684841564373, + "grad_norm": 1.5657826662063599, + "learning_rate": 4.369183420337951e-05, + "loss": 4.7327, + "step": 38871 + }, + { + "epoch": 0.23118279569892472, + "grad_norm": 1.5812371969223022, + "learning_rate": 4.369152401554988e-05, + "loss": 4.7874, + "step": 38872 + }, + { + "epoch": 0.23118874298220574, + "grad_norm": 1.9417638778686523, + "learning_rate": 4.369121382119523e-05, + "loss": 4.9785, + "step": 38873 + }, + { + "epoch": 0.23119469026548672, + "grad_norm": 1.6311239004135132, + "learning_rate": 4.369090362031567e-05, + "loss": 4.5811, + "step": 38874 + }, + { + "epoch": 0.2312006375487677, + "grad_norm": 1.3812321424484253, + "learning_rate": 4.369059341291131e-05, + "loss": 5.1751, + "step": 38875 + }, + { + "epoch": 0.23120658483204873, + "grad_norm": 2.401395082473755, + "learning_rate": 4.3690283198982253e-05, + "loss": 4.0534, + "step": 38876 + }, + { + "epoch": 0.23121253211532972, + "grad_norm": 1.577271819114685, + "learning_rate": 4.368997297852861e-05, + "loss": 5.0016, + "step": 38877 + }, + { + "epoch": 0.2312184793986107, + "grad_norm": 1.571954607963562, + "learning_rate": 4.36896627515505e-05, + "loss": 5.1936, + "step": 38878 + }, + { + "epoch": 0.23122442668189172, + "grad_norm": 1.5858561992645264, + "learning_rate": 4.368935251804801e-05, + "loss": 4.3015, + "step": 38879 + }, + { + "epoch": 0.2312303739651727, + "grad_norm": 1.5386252403259277, + "learning_rate": 4.368904227802127e-05, + "loss": 4.541, + "step": 38880 + }, + { + "epoch": 0.2312363212484537, + "grad_norm": 1.5563592910766602, + "learning_rate": 4.368873203147037e-05, + "loss": 4.676, + "step": 38881 + }, + { + "epoch": 0.2312422685317347, + "grad_norm": 1.441646933555603, + "learning_rate": 4.368842177839544e-05, + "loss": 4.9754, + "step": 38882 + }, + { + "epoch": 0.2312482158150157, + "grad_norm": 1.9202433824539185, + "learning_rate": 4.3688111518796556e-05, + "loss": 4.3441, + "step": 38883 + }, + { + "epoch": 0.2312541630982967, + "grad_norm": 1.5717604160308838, + "learning_rate": 4.368780125267387e-05, + "loss": 4.4186, + "step": 38884 + }, + { + "epoch": 0.2312601103815777, + "grad_norm": 1.633315920829773, + "learning_rate": 4.3687490980027444e-05, + "loss": 4.6497, + "step": 38885 + }, + { + "epoch": 0.2312660576648587, + "grad_norm": 1.545074462890625, + "learning_rate": 4.368718070085741e-05, + "loss": 4.8076, + "step": 38886 + }, + { + "epoch": 0.23127200494813968, + "grad_norm": 2.005859851837158, + "learning_rate": 4.368687041516388e-05, + "loss": 4.2126, + "step": 38887 + }, + { + "epoch": 0.2312779522314207, + "grad_norm": 1.443214774131775, + "learning_rate": 4.368656012294696e-05, + "loss": 4.8105, + "step": 38888 + }, + { + "epoch": 0.23128389951470169, + "grad_norm": 1.7497129440307617, + "learning_rate": 4.368624982420675e-05, + "loss": 4.5052, + "step": 38889 + }, + { + "epoch": 0.23128984679798267, + "grad_norm": 1.630719780921936, + "learning_rate": 4.368593951894336e-05, + "loss": 4.8466, + "step": 38890 + }, + { + "epoch": 0.2312957940812637, + "grad_norm": 1.871222734451294, + "learning_rate": 4.368562920715692e-05, + "loss": 4.8048, + "step": 38891 + }, + { + "epoch": 0.23130174136454468, + "grad_norm": 1.5791672468185425, + "learning_rate": 4.36853188888475e-05, + "loss": 4.7656, + "step": 38892 + }, + { + "epoch": 0.23130768864782567, + "grad_norm": 1.7593334913253784, + "learning_rate": 4.368500856401523e-05, + "loss": 4.2775, + "step": 38893 + }, + { + "epoch": 0.23131363593110668, + "grad_norm": 1.6288632154464722, + "learning_rate": 4.3684698232660225e-05, + "loss": 5.005, + "step": 38894 + }, + { + "epoch": 0.23131958321438767, + "grad_norm": 1.7398391962051392, + "learning_rate": 4.368438789478258e-05, + "loss": 4.4744, + "step": 38895 + }, + { + "epoch": 0.23132553049766866, + "grad_norm": 1.9362190961837769, + "learning_rate": 4.3684077550382407e-05, + "loss": 4.3996, + "step": 38896 + }, + { + "epoch": 0.23133147778094967, + "grad_norm": 1.8506320714950562, + "learning_rate": 4.3683767199459826e-05, + "loss": 5.2249, + "step": 38897 + }, + { + "epoch": 0.23133742506423066, + "grad_norm": 1.5073530673980713, + "learning_rate": 4.3683456842014916e-05, + "loss": 5.1084, + "step": 38898 + }, + { + "epoch": 0.23134337234751165, + "grad_norm": 1.642914056777954, + "learning_rate": 4.368314647804782e-05, + "loss": 5.0734, + "step": 38899 + }, + { + "epoch": 0.23134931963079267, + "grad_norm": 1.613958477973938, + "learning_rate": 4.368283610755862e-05, + "loss": 4.9509, + "step": 38900 + }, + { + "epoch": 0.23135526691407365, + "grad_norm": 1.6028896570205688, + "learning_rate": 4.368252573054744e-05, + "loss": 4.7896, + "step": 38901 + }, + { + "epoch": 0.23136121419735464, + "grad_norm": 1.6952170133590698, + "learning_rate": 4.368221534701439e-05, + "loss": 4.7555, + "step": 38902 + }, + { + "epoch": 0.23136716148063566, + "grad_norm": 1.5203158855438232, + "learning_rate": 4.3681904956959565e-05, + "loss": 4.7926, + "step": 38903 + }, + { + "epoch": 0.23137310876391665, + "grad_norm": 1.4692882299423218, + "learning_rate": 4.3681594560383075e-05, + "loss": 4.7705, + "step": 38904 + }, + { + "epoch": 0.23137905604719763, + "grad_norm": 1.6336463689804077, + "learning_rate": 4.368128415728504e-05, + "loss": 4.8079, + "step": 38905 + }, + { + "epoch": 0.23138500333047865, + "grad_norm": 1.5728296041488647, + "learning_rate": 4.368097374766556e-05, + "loss": 4.8961, + "step": 38906 + }, + { + "epoch": 0.23139095061375964, + "grad_norm": 1.7239409685134888, + "learning_rate": 4.368066333152474e-05, + "loss": 4.4861, + "step": 38907 + }, + { + "epoch": 0.23139689789704063, + "grad_norm": 1.3485053777694702, + "learning_rate": 4.3680352908862705e-05, + "loss": 4.8358, + "step": 38908 + }, + { + "epoch": 0.23140284518032164, + "grad_norm": 1.6450964212417603, + "learning_rate": 4.3680042479679546e-05, + "loss": 4.623, + "step": 38909 + }, + { + "epoch": 0.23140879246360263, + "grad_norm": 1.6974247694015503, + "learning_rate": 4.367973204397537e-05, + "loss": 4.3432, + "step": 38910 + }, + { + "epoch": 0.23141473974688362, + "grad_norm": 1.710485577583313, + "learning_rate": 4.36794216017503e-05, + "loss": 4.4643, + "step": 38911 + }, + { + "epoch": 0.23142068703016463, + "grad_norm": 1.679517388343811, + "learning_rate": 4.367911115300444e-05, + "loss": 4.6119, + "step": 38912 + }, + { + "epoch": 0.23142663431344562, + "grad_norm": 2.93080997467041, + "learning_rate": 4.367880069773789e-05, + "loss": 3.169, + "step": 38913 + }, + { + "epoch": 0.2314325815967266, + "grad_norm": 2.643122911453247, + "learning_rate": 4.367849023595076e-05, + "loss": 2.9466, + "step": 38914 + }, + { + "epoch": 0.2314385288800076, + "grad_norm": 2.2763819694519043, + "learning_rate": 4.367817976764317e-05, + "loss": 3.5133, + "step": 38915 + }, + { + "epoch": 0.23144447616328861, + "grad_norm": 2.1890201568603516, + "learning_rate": 4.367786929281522e-05, + "loss": 4.2592, + "step": 38916 + }, + { + "epoch": 0.2314504234465696, + "grad_norm": 1.3603123426437378, + "learning_rate": 4.367755881146701e-05, + "loss": 4.8718, + "step": 38917 + }, + { + "epoch": 0.2314563707298506, + "grad_norm": 1.6598271131515503, + "learning_rate": 4.367724832359867e-05, + "loss": 4.5127, + "step": 38918 + }, + { + "epoch": 0.2314623180131316, + "grad_norm": 1.445361614227295, + "learning_rate": 4.367693782921029e-05, + "loss": 4.9478, + "step": 38919 + }, + { + "epoch": 0.2314682652964126, + "grad_norm": 1.606594443321228, + "learning_rate": 4.3676627328301976e-05, + "loss": 5.0595, + "step": 38920 + }, + { + "epoch": 0.23147421257969358, + "grad_norm": 1.5703539848327637, + "learning_rate": 4.367631682087385e-05, + "loss": 5.0988, + "step": 38921 + }, + { + "epoch": 0.2314801598629746, + "grad_norm": 1.5487500429153442, + "learning_rate": 4.3676006306926024e-05, + "loss": 4.929, + "step": 38922 + }, + { + "epoch": 0.2314861071462556, + "grad_norm": 1.4153128862380981, + "learning_rate": 4.3675695786458584e-05, + "loss": 4.9945, + "step": 38923 + }, + { + "epoch": 0.23149205442953658, + "grad_norm": 1.2444169521331787, + "learning_rate": 4.3675385259471655e-05, + "loss": 4.9692, + "step": 38924 + }, + { + "epoch": 0.2314980017128176, + "grad_norm": 1.5902601480484009, + "learning_rate": 4.367507472596535e-05, + "loss": 5.0088, + "step": 38925 + }, + { + "epoch": 0.23150394899609858, + "grad_norm": 1.5130057334899902, + "learning_rate": 4.3674764185939763e-05, + "loss": 5.0361, + "step": 38926 + }, + { + "epoch": 0.23150989627937957, + "grad_norm": 1.6979068517684937, + "learning_rate": 4.3674453639395005e-05, + "loss": 4.8099, + "step": 38927 + }, + { + "epoch": 0.23151584356266058, + "grad_norm": 1.6237205266952515, + "learning_rate": 4.36741430863312e-05, + "loss": 4.8471, + "step": 38928 + }, + { + "epoch": 0.23152179084594157, + "grad_norm": 1.7361104488372803, + "learning_rate": 4.3673832526748434e-05, + "loss": 4.4522, + "step": 38929 + }, + { + "epoch": 0.23152773812922256, + "grad_norm": 1.6142919063568115, + "learning_rate": 4.3673521960646824e-05, + "loss": 4.8079, + "step": 38930 + }, + { + "epoch": 0.23153368541250358, + "grad_norm": 1.5747629404067993, + "learning_rate": 4.367321138802649e-05, + "loss": 4.9752, + "step": 38931 + }, + { + "epoch": 0.23153963269578456, + "grad_norm": 1.5985512733459473, + "learning_rate": 4.3672900808887516e-05, + "loss": 4.9448, + "step": 38932 + }, + { + "epoch": 0.23154557997906555, + "grad_norm": 1.5115282535552979, + "learning_rate": 4.3672590223230036e-05, + "loss": 4.7868, + "step": 38933 + }, + { + "epoch": 0.23155152726234657, + "grad_norm": 1.294195532798767, + "learning_rate": 4.367227963105415e-05, + "loss": 4.9308, + "step": 38934 + }, + { + "epoch": 0.23155747454562756, + "grad_norm": 1.7012540102005005, + "learning_rate": 4.367196903235996e-05, + "loss": 4.4029, + "step": 38935 + }, + { + "epoch": 0.23156342182890854, + "grad_norm": 1.611894130706787, + "learning_rate": 4.3671658427147584e-05, + "loss": 5.2145, + "step": 38936 + }, + { + "epoch": 0.23156936911218956, + "grad_norm": 1.9200711250305176, + "learning_rate": 4.3671347815417116e-05, + "loss": 4.2015, + "step": 38937 + }, + { + "epoch": 0.23157531639547055, + "grad_norm": 1.3902099132537842, + "learning_rate": 4.367103719716868e-05, + "loss": 4.6965, + "step": 38938 + }, + { + "epoch": 0.23158126367875154, + "grad_norm": 1.5783464908599854, + "learning_rate": 4.3670726572402375e-05, + "loss": 4.7771, + "step": 38939 + }, + { + "epoch": 0.23158721096203255, + "grad_norm": 1.519589900970459, + "learning_rate": 4.367041594111831e-05, + "loss": 4.434, + "step": 38940 + }, + { + "epoch": 0.23159315824531354, + "grad_norm": 1.6310521364212036, + "learning_rate": 4.36701053033166e-05, + "loss": 4.4936, + "step": 38941 + }, + { + "epoch": 0.23159910552859453, + "grad_norm": 1.7462193965911865, + "learning_rate": 4.366979465899734e-05, + "loss": 4.6498, + "step": 38942 + }, + { + "epoch": 0.23160505281187554, + "grad_norm": 1.9061944484710693, + "learning_rate": 4.366948400816066e-05, + "loss": 4.3522, + "step": 38943 + }, + { + "epoch": 0.23161100009515653, + "grad_norm": 1.6201283931732178, + "learning_rate": 4.3669173350806655e-05, + "loss": 4.2227, + "step": 38944 + }, + { + "epoch": 0.23161694737843752, + "grad_norm": 1.670607566833496, + "learning_rate": 4.366886268693543e-05, + "loss": 4.815, + "step": 38945 + }, + { + "epoch": 0.23162289466171854, + "grad_norm": 1.6773320436477661, + "learning_rate": 4.36685520165471e-05, + "loss": 4.6114, + "step": 38946 + }, + { + "epoch": 0.23162884194499953, + "grad_norm": 1.6075963973999023, + "learning_rate": 4.366824133964177e-05, + "loss": 4.8761, + "step": 38947 + }, + { + "epoch": 0.2316347892282805, + "grad_norm": 1.6519663333892822, + "learning_rate": 4.366793065621955e-05, + "loss": 4.9682, + "step": 38948 + }, + { + "epoch": 0.23164073651156153, + "grad_norm": 1.391345500946045, + "learning_rate": 4.366761996628054e-05, + "loss": 4.986, + "step": 38949 + }, + { + "epoch": 0.23164668379484252, + "grad_norm": 1.5131144523620605, + "learning_rate": 4.366730926982487e-05, + "loss": 4.9342, + "step": 38950 + }, + { + "epoch": 0.2316526310781235, + "grad_norm": 1.61806058883667, + "learning_rate": 4.366699856685263e-05, + "loss": 4.9785, + "step": 38951 + }, + { + "epoch": 0.23165857836140452, + "grad_norm": 1.5832924842834473, + "learning_rate": 4.366668785736393e-05, + "loss": 5.0118, + "step": 38952 + }, + { + "epoch": 0.2316645256446855, + "grad_norm": 1.4623991250991821, + "learning_rate": 4.3666377141358885e-05, + "loss": 4.9817, + "step": 38953 + }, + { + "epoch": 0.2316704729279665, + "grad_norm": 1.470866322517395, + "learning_rate": 4.3666066418837605e-05, + "loss": 4.9762, + "step": 38954 + }, + { + "epoch": 0.2316764202112475, + "grad_norm": 1.5453153848648071, + "learning_rate": 4.3665755689800195e-05, + "loss": 4.6325, + "step": 38955 + }, + { + "epoch": 0.2316823674945285, + "grad_norm": 1.753753900527954, + "learning_rate": 4.366544495424675e-05, + "loss": 4.62, + "step": 38956 + }, + { + "epoch": 0.2316883147778095, + "grad_norm": 1.5852446556091309, + "learning_rate": 4.36651342121774e-05, + "loss": 4.4564, + "step": 38957 + }, + { + "epoch": 0.2316942620610905, + "grad_norm": 1.5405995845794678, + "learning_rate": 4.3664823463592244e-05, + "loss": 4.4978, + "step": 38958 + }, + { + "epoch": 0.2317002093443715, + "grad_norm": 1.674048662185669, + "learning_rate": 4.366451270849139e-05, + "loss": 4.4858, + "step": 38959 + }, + { + "epoch": 0.23170615662765248, + "grad_norm": 1.6053109169006348, + "learning_rate": 4.366420194687495e-05, + "loss": 4.4447, + "step": 38960 + }, + { + "epoch": 0.2317121039109335, + "grad_norm": 1.3716999292373657, + "learning_rate": 4.366389117874302e-05, + "loss": 4.8299, + "step": 38961 + }, + { + "epoch": 0.23171805119421449, + "grad_norm": 1.6440811157226562, + "learning_rate": 4.366358040409573e-05, + "loss": 4.6378, + "step": 38962 + }, + { + "epoch": 0.23172399847749547, + "grad_norm": 1.8842734098434448, + "learning_rate": 4.366326962293317e-05, + "loss": 4.3705, + "step": 38963 + }, + { + "epoch": 0.2317299457607765, + "grad_norm": 1.5826550722122192, + "learning_rate": 4.3662958835255466e-05, + "loss": 4.4738, + "step": 38964 + }, + { + "epoch": 0.23173589304405748, + "grad_norm": 1.4358820915222168, + "learning_rate": 4.36626480410627e-05, + "loss": 4.9931, + "step": 38965 + }, + { + "epoch": 0.23174184032733847, + "grad_norm": 1.7191013097763062, + "learning_rate": 4.3662337240355e-05, + "loss": 4.9418, + "step": 38966 + }, + { + "epoch": 0.23174778761061948, + "grad_norm": 1.8441758155822754, + "learning_rate": 4.366202643313249e-05, + "loss": 3.6016, + "step": 38967 + }, + { + "epoch": 0.23175373489390047, + "grad_norm": 1.8906590938568115, + "learning_rate": 4.366171561939524e-05, + "loss": 4.0659, + "step": 38968 + }, + { + "epoch": 0.23175968217718146, + "grad_norm": 1.9503329992294312, + "learning_rate": 4.366140479914338e-05, + "loss": 4.4378, + "step": 38969 + }, + { + "epoch": 0.23176562946046247, + "grad_norm": 1.487377643585205, + "learning_rate": 4.366109397237702e-05, + "loss": 5.0203, + "step": 38970 + }, + { + "epoch": 0.23177157674374346, + "grad_norm": 1.49003005027771, + "learning_rate": 4.366078313909626e-05, + "loss": 5.0222, + "step": 38971 + }, + { + "epoch": 0.23177752402702445, + "grad_norm": 1.5293556451797485, + "learning_rate": 4.3660472299301216e-05, + "loss": 4.9737, + "step": 38972 + }, + { + "epoch": 0.23178347131030544, + "grad_norm": 1.6720876693725586, + "learning_rate": 4.3660161452992e-05, + "loss": 4.9417, + "step": 38973 + }, + { + "epoch": 0.23178941859358645, + "grad_norm": 1.5357182025909424, + "learning_rate": 4.3659850600168713e-05, + "loss": 4.95, + "step": 38974 + }, + { + "epoch": 0.23179536587686744, + "grad_norm": 1.4877994060516357, + "learning_rate": 4.365953974083146e-05, + "loss": 4.8377, + "step": 38975 + }, + { + "epoch": 0.23180131316014843, + "grad_norm": 1.477420687675476, + "learning_rate": 4.365922887498035e-05, + "loss": 4.8592, + "step": 38976 + }, + { + "epoch": 0.23180726044342945, + "grad_norm": 1.5136134624481201, + "learning_rate": 4.365891800261551e-05, + "loss": 5.1658, + "step": 38977 + }, + { + "epoch": 0.23181320772671044, + "grad_norm": 1.9418365955352783, + "learning_rate": 4.365860712373702e-05, + "loss": 4.8201, + "step": 38978 + }, + { + "epoch": 0.23181915500999142, + "grad_norm": 1.5137678384780884, + "learning_rate": 4.3658296238345006e-05, + "loss": 5.4408, + "step": 38979 + }, + { + "epoch": 0.23182510229327244, + "grad_norm": 1.629871129989624, + "learning_rate": 4.3657985346439586e-05, + "loss": 4.6527, + "step": 38980 + }, + { + "epoch": 0.23183104957655343, + "grad_norm": 1.397018313407898, + "learning_rate": 4.365767444802085e-05, + "loss": 4.8008, + "step": 38981 + }, + { + "epoch": 0.23183699685983442, + "grad_norm": 1.5167710781097412, + "learning_rate": 4.36573635430889e-05, + "loss": 4.8846, + "step": 38982 + }, + { + "epoch": 0.23184294414311543, + "grad_norm": 1.3749078512191772, + "learning_rate": 4.365705263164387e-05, + "loss": 4.9298, + "step": 38983 + }, + { + "epoch": 0.23184889142639642, + "grad_norm": 1.4250109195709229, + "learning_rate": 4.3656741713685855e-05, + "loss": 4.8517, + "step": 38984 + }, + { + "epoch": 0.2318548387096774, + "grad_norm": 1.4567049741744995, + "learning_rate": 4.365643078921496e-05, + "loss": 4.8569, + "step": 38985 + }, + { + "epoch": 0.23186078599295842, + "grad_norm": 1.3662344217300415, + "learning_rate": 4.36561198582313e-05, + "loss": 4.992, + "step": 38986 + }, + { + "epoch": 0.2318667332762394, + "grad_norm": 1.3737530708312988, + "learning_rate": 4.3655808920734976e-05, + "loss": 4.7912, + "step": 38987 + }, + { + "epoch": 0.2318726805595204, + "grad_norm": 1.371505618095398, + "learning_rate": 4.365549797672611e-05, + "loss": 4.7928, + "step": 38988 + }, + { + "epoch": 0.23187862784280142, + "grad_norm": 1.371051549911499, + "learning_rate": 4.36551870262048e-05, + "loss": 5.1194, + "step": 38989 + }, + { + "epoch": 0.2318845751260824, + "grad_norm": 1.409606695175171, + "learning_rate": 4.365487606917116e-05, + "loss": 5.6715, + "step": 38990 + }, + { + "epoch": 0.2318905224093634, + "grad_norm": 1.4108288288116455, + "learning_rate": 4.365456510562529e-05, + "loss": 5.7418, + "step": 38991 + }, + { + "epoch": 0.2318964696926444, + "grad_norm": 1.3956706523895264, + "learning_rate": 4.36542541355673e-05, + "loss": 5.3216, + "step": 38992 + }, + { + "epoch": 0.2319024169759254, + "grad_norm": 1.6888619661331177, + "learning_rate": 4.365394315899731e-05, + "loss": 4.2571, + "step": 38993 + }, + { + "epoch": 0.23190836425920638, + "grad_norm": 1.6837373971939087, + "learning_rate": 4.365363217591542e-05, + "loss": 4.5193, + "step": 38994 + }, + { + "epoch": 0.2319143115424874, + "grad_norm": 1.6545789241790771, + "learning_rate": 4.365332118632174e-05, + "loss": 4.265, + "step": 38995 + }, + { + "epoch": 0.2319202588257684, + "grad_norm": 1.6533746719360352, + "learning_rate": 4.365301019021638e-05, + "loss": 4.1409, + "step": 38996 + }, + { + "epoch": 0.23192620610904938, + "grad_norm": 1.5293077230453491, + "learning_rate": 4.365269918759944e-05, + "loss": 4.2099, + "step": 38997 + }, + { + "epoch": 0.2319321533923304, + "grad_norm": 1.5665626525878906, + "learning_rate": 4.365238817847104e-05, + "loss": 5.0483, + "step": 38998 + }, + { + "epoch": 0.23193810067561138, + "grad_norm": 1.9797513484954834, + "learning_rate": 4.365207716283128e-05, + "loss": 4.2643, + "step": 38999 + }, + { + "epoch": 0.23194404795889237, + "grad_norm": 1.7830750942230225, + "learning_rate": 4.365176614068028e-05, + "loss": 4.0168, + "step": 39000 + }, + { + "epoch": 0.23194999524217338, + "grad_norm": 1.7557772397994995, + "learning_rate": 4.365145511201813e-05, + "loss": 4.0527, + "step": 39001 + }, + { + "epoch": 0.23195594252545437, + "grad_norm": 1.6213778257369995, + "learning_rate": 4.3651144076844963e-05, + "loss": 4.2034, + "step": 39002 + }, + { + "epoch": 0.23196188980873536, + "grad_norm": 1.683821439743042, + "learning_rate": 4.365083303516087e-05, + "loss": 3.9285, + "step": 39003 + }, + { + "epoch": 0.23196783709201638, + "grad_norm": 1.7961101531982422, + "learning_rate": 4.3650521986965964e-05, + "loss": 4.4137, + "step": 39004 + }, + { + "epoch": 0.23197378437529736, + "grad_norm": 1.6424694061279297, + "learning_rate": 4.365021093226035e-05, + "loss": 3.97, + "step": 39005 + }, + { + "epoch": 0.23197973165857835, + "grad_norm": 1.7446128129959106, + "learning_rate": 4.3649899871044143e-05, + "loss": 4.1543, + "step": 39006 + }, + { + "epoch": 0.23198567894185937, + "grad_norm": 1.7776148319244385, + "learning_rate": 4.3649588803317445e-05, + "loss": 4.4003, + "step": 39007 + }, + { + "epoch": 0.23199162622514036, + "grad_norm": 1.6425302028656006, + "learning_rate": 4.3649277729080376e-05, + "loss": 4.2777, + "step": 39008 + }, + { + "epoch": 0.23199757350842135, + "grad_norm": 1.7701246738433838, + "learning_rate": 4.364896664833302e-05, + "loss": 3.98, + "step": 39009 + }, + { + "epoch": 0.23200352079170236, + "grad_norm": 1.7276018857955933, + "learning_rate": 4.364865556107552e-05, + "loss": 3.9601, + "step": 39010 + }, + { + "epoch": 0.23200946807498335, + "grad_norm": 1.7356696128845215, + "learning_rate": 4.364834446730796e-05, + "loss": 4.1327, + "step": 39011 + }, + { + "epoch": 0.23201541535826434, + "grad_norm": 1.790169358253479, + "learning_rate": 4.364803336703046e-05, + "loss": 4.2021, + "step": 39012 + }, + { + "epoch": 0.23202136264154535, + "grad_norm": 2.118116617202759, + "learning_rate": 4.364772226024312e-05, + "loss": 3.3244, + "step": 39013 + }, + { + "epoch": 0.23202730992482634, + "grad_norm": 1.806924819946289, + "learning_rate": 4.364741114694605e-05, + "loss": 3.9069, + "step": 39014 + }, + { + "epoch": 0.23203325720810733, + "grad_norm": 1.7346820831298828, + "learning_rate": 4.364710002713937e-05, + "loss": 3.8036, + "step": 39015 + }, + { + "epoch": 0.23203920449138835, + "grad_norm": 1.5005122423171997, + "learning_rate": 4.364678890082317e-05, + "loss": 4.5573, + "step": 39016 + }, + { + "epoch": 0.23204515177466933, + "grad_norm": 1.723801612854004, + "learning_rate": 4.364647776799757e-05, + "loss": 4.2554, + "step": 39017 + }, + { + "epoch": 0.23205109905795032, + "grad_norm": 1.719353199005127, + "learning_rate": 4.3646166628662686e-05, + "loss": 4.1124, + "step": 39018 + }, + { + "epoch": 0.23205704634123134, + "grad_norm": 1.8241521120071411, + "learning_rate": 4.364585548281861e-05, + "loss": 3.8887, + "step": 39019 + }, + { + "epoch": 0.23206299362451233, + "grad_norm": 1.7841458320617676, + "learning_rate": 4.364554433046546e-05, + "loss": 4.0111, + "step": 39020 + }, + { + "epoch": 0.23206894090779331, + "grad_norm": 1.7524330615997314, + "learning_rate": 4.364523317160335e-05, + "loss": 4.2447, + "step": 39021 + }, + { + "epoch": 0.23207488819107433, + "grad_norm": 1.8006513118743896, + "learning_rate": 4.3644922006232366e-05, + "loss": 3.9111, + "step": 39022 + }, + { + "epoch": 0.23208083547435532, + "grad_norm": 1.8408151865005493, + "learning_rate": 4.3644610834352654e-05, + "loss": 3.7837, + "step": 39023 + }, + { + "epoch": 0.2320867827576363, + "grad_norm": 1.7600802183151245, + "learning_rate": 4.3644299655964285e-05, + "loss": 3.8076, + "step": 39024 + }, + { + "epoch": 0.23209273004091732, + "grad_norm": 1.5894376039505005, + "learning_rate": 4.364398847106739e-05, + "loss": 3.9888, + "step": 39025 + }, + { + "epoch": 0.2320986773241983, + "grad_norm": 1.9288008213043213, + "learning_rate": 4.3643677279662063e-05, + "loss": 3.7298, + "step": 39026 + }, + { + "epoch": 0.2321046246074793, + "grad_norm": 1.9717549085617065, + "learning_rate": 4.364336608174843e-05, + "loss": 3.7533, + "step": 39027 + }, + { + "epoch": 0.23211057189076031, + "grad_norm": 1.8057029247283936, + "learning_rate": 4.364305487732659e-05, + "loss": 3.6243, + "step": 39028 + }, + { + "epoch": 0.2321165191740413, + "grad_norm": 1.6260126829147339, + "learning_rate": 4.3642743666396645e-05, + "loss": 4.2258, + "step": 39029 + }, + { + "epoch": 0.2321224664573223, + "grad_norm": 1.8007012605667114, + "learning_rate": 4.3642432448958716e-05, + "loss": 3.975, + "step": 39030 + }, + { + "epoch": 0.23212841374060328, + "grad_norm": 1.9593114852905273, + "learning_rate": 4.364212122501291e-05, + "loss": 3.7982, + "step": 39031 + }, + { + "epoch": 0.2321343610238843, + "grad_norm": 1.8035818338394165, + "learning_rate": 4.3641809994559325e-05, + "loss": 3.8347, + "step": 39032 + }, + { + "epoch": 0.23214030830716528, + "grad_norm": 1.7887778282165527, + "learning_rate": 4.364149875759808e-05, + "loss": 4.0232, + "step": 39033 + }, + { + "epoch": 0.23214625559044627, + "grad_norm": 2.112762451171875, + "learning_rate": 4.3641187514129276e-05, + "loss": 3.4156, + "step": 39034 + }, + { + "epoch": 0.2321522028737273, + "grad_norm": 1.6227257251739502, + "learning_rate": 4.364087626415304e-05, + "loss": 3.6872, + "step": 39035 + }, + { + "epoch": 0.23215815015700828, + "grad_norm": 1.7840327024459839, + "learning_rate": 4.364056500766945e-05, + "loss": 3.9829, + "step": 39036 + }, + { + "epoch": 0.23216409744028926, + "grad_norm": 1.7417759895324707, + "learning_rate": 4.364025374467864e-05, + "loss": 3.9742, + "step": 39037 + }, + { + "epoch": 0.23217004472357028, + "grad_norm": 1.6508119106292725, + "learning_rate": 4.36399424751807e-05, + "loss": 3.9325, + "step": 39038 + }, + { + "epoch": 0.23217599200685127, + "grad_norm": 1.8196989297866821, + "learning_rate": 4.3639631199175765e-05, + "loss": 4.0765, + "step": 39039 + }, + { + "epoch": 0.23218193929013226, + "grad_norm": 1.6848537921905518, + "learning_rate": 4.363931991666392e-05, + "loss": 3.926, + "step": 39040 + }, + { + "epoch": 0.23218788657341327, + "grad_norm": 1.689846158027649, + "learning_rate": 4.3639008627645283e-05, + "loss": 4.0929, + "step": 39041 + }, + { + "epoch": 0.23219383385669426, + "grad_norm": 1.9357181787490845, + "learning_rate": 4.3638697332119956e-05, + "loss": 3.4246, + "step": 39042 + }, + { + "epoch": 0.23219978113997525, + "grad_norm": 1.8697538375854492, + "learning_rate": 4.3638386030088054e-05, + "loss": 3.7608, + "step": 39043 + }, + { + "epoch": 0.23220572842325626, + "grad_norm": 2.1151657104492188, + "learning_rate": 4.3638074721549685e-05, + "loss": 3.2884, + "step": 39044 + }, + { + "epoch": 0.23221167570653725, + "grad_norm": 1.979285717010498, + "learning_rate": 4.363776340650495e-05, + "loss": 3.4265, + "step": 39045 + }, + { + "epoch": 0.23221762298981824, + "grad_norm": 1.954432487487793, + "learning_rate": 4.3637452084953975e-05, + "loss": 3.5171, + "step": 39046 + }, + { + "epoch": 0.23222357027309926, + "grad_norm": 2.206760883331299, + "learning_rate": 4.3637140756896856e-05, + "loss": 3.2805, + "step": 39047 + }, + { + "epoch": 0.23222951755638024, + "grad_norm": 1.9962438344955444, + "learning_rate": 4.363682942233369e-05, + "loss": 3.5797, + "step": 39048 + }, + { + "epoch": 0.23223546483966123, + "grad_norm": 1.9898980855941772, + "learning_rate": 4.3636518081264616e-05, + "loss": 3.1749, + "step": 39049 + }, + { + "epoch": 0.23224141212294225, + "grad_norm": 2.0162951946258545, + "learning_rate": 4.3636206733689724e-05, + "loss": 2.8138, + "step": 39050 + }, + { + "epoch": 0.23224735940622324, + "grad_norm": 2.061389923095703, + "learning_rate": 4.363589537960912e-05, + "loss": 2.8714, + "step": 39051 + }, + { + "epoch": 0.23225330668950422, + "grad_norm": 1.993212342262268, + "learning_rate": 4.363558401902292e-05, + "loss": 2.9435, + "step": 39052 + }, + { + "epoch": 0.23225925397278524, + "grad_norm": 2.071394443511963, + "learning_rate": 4.363527265193122e-05, + "loss": 2.893, + "step": 39053 + }, + { + "epoch": 0.23226520125606623, + "grad_norm": 2.181269407272339, + "learning_rate": 4.363496127833415e-05, + "loss": 2.8135, + "step": 39054 + }, + { + "epoch": 0.23227114853934722, + "grad_norm": 2.2280220985412598, + "learning_rate": 4.3634649898231804e-05, + "loss": 3.0349, + "step": 39055 + }, + { + "epoch": 0.23227709582262823, + "grad_norm": 2.2977817058563232, + "learning_rate": 4.36343385116243e-05, + "loss": 3.0387, + "step": 39056 + }, + { + "epoch": 0.23228304310590922, + "grad_norm": 2.1697254180908203, + "learning_rate": 4.363402711851173e-05, + "loss": 3.1483, + "step": 39057 + }, + { + "epoch": 0.2322889903891902, + "grad_norm": 2.1386520862579346, + "learning_rate": 4.3633715718894226e-05, + "loss": 3.0296, + "step": 39058 + }, + { + "epoch": 0.23229493767247122, + "grad_norm": 2.0731868743896484, + "learning_rate": 4.3633404312771875e-05, + "loss": 3.0588, + "step": 39059 + }, + { + "epoch": 0.2323008849557522, + "grad_norm": 1.5297818183898926, + "learning_rate": 4.36330929001448e-05, + "loss": 4.8685, + "step": 39060 + }, + { + "epoch": 0.2323068322390332, + "grad_norm": 2.4762682914733887, + "learning_rate": 4.3632781481013105e-05, + "loss": 3.4948, + "step": 39061 + }, + { + "epoch": 0.23231277952231422, + "grad_norm": 2.357487201690674, + "learning_rate": 4.36324700553769e-05, + "loss": 3.7714, + "step": 39062 + }, + { + "epoch": 0.2323187268055952, + "grad_norm": 1.713942527770996, + "learning_rate": 4.363215862323628e-05, + "loss": 4.0288, + "step": 39063 + }, + { + "epoch": 0.2323246740888762, + "grad_norm": 2.204071283340454, + "learning_rate": 4.3631847184591376e-05, + "loss": 3.2584, + "step": 39064 + }, + { + "epoch": 0.2323306213721572, + "grad_norm": 1.647165060043335, + "learning_rate": 4.363153573944229e-05, + "loss": 4.762, + "step": 39065 + }, + { + "epoch": 0.2323365686554382, + "grad_norm": 2.2899770736694336, + "learning_rate": 4.3631224287789116e-05, + "loss": 4.5968, + "step": 39066 + }, + { + "epoch": 0.23234251593871919, + "grad_norm": 2.3352129459381104, + "learning_rate": 4.3630912829631986e-05, + "loss": 4.6158, + "step": 39067 + }, + { + "epoch": 0.2323484632220002, + "grad_norm": 2.2160227298736572, + "learning_rate": 4.363060136497099e-05, + "loss": 4.5646, + "step": 39068 + }, + { + "epoch": 0.2323544105052812, + "grad_norm": 1.5986429452896118, + "learning_rate": 4.363028989380625e-05, + "loss": 5.1399, + "step": 39069 + }, + { + "epoch": 0.23236035778856218, + "grad_norm": 1.4894126653671265, + "learning_rate": 4.362997841613786e-05, + "loss": 4.8201, + "step": 39070 + }, + { + "epoch": 0.2323663050718432, + "grad_norm": 2.388699531555176, + "learning_rate": 4.362966693196594e-05, + "loss": 4.1083, + "step": 39071 + }, + { + "epoch": 0.23237225235512418, + "grad_norm": 2.0085203647613525, + "learning_rate": 4.3629355441290596e-05, + "loss": 4.317, + "step": 39072 + }, + { + "epoch": 0.23237819963840517, + "grad_norm": 2.012711763381958, + "learning_rate": 4.362904394411194e-05, + "loss": 4.4285, + "step": 39073 + }, + { + "epoch": 0.23238414692168619, + "grad_norm": 2.2238574028015137, + "learning_rate": 4.362873244043007e-05, + "loss": 4.5044, + "step": 39074 + }, + { + "epoch": 0.23239009420496717, + "grad_norm": 2.2293858528137207, + "learning_rate": 4.3628420930245103e-05, + "loss": 4.5223, + "step": 39075 + }, + { + "epoch": 0.23239604148824816, + "grad_norm": 1.8992894887924194, + "learning_rate": 4.362810941355715e-05, + "loss": 4.9021, + "step": 39076 + }, + { + "epoch": 0.23240198877152918, + "grad_norm": 1.51563560962677, + "learning_rate": 4.362779789036632e-05, + "loss": 4.5958, + "step": 39077 + }, + { + "epoch": 0.23240793605481017, + "grad_norm": 2.2260918617248535, + "learning_rate": 4.362748636067272e-05, + "loss": 4.3224, + "step": 39078 + }, + { + "epoch": 0.23241388333809115, + "grad_norm": 1.7403556108474731, + "learning_rate": 4.362717482447645e-05, + "loss": 4.2088, + "step": 39079 + }, + { + "epoch": 0.23241983062137217, + "grad_norm": 1.914040207862854, + "learning_rate": 4.362686328177762e-05, + "loss": 4.3207, + "step": 39080 + }, + { + "epoch": 0.23242577790465316, + "grad_norm": 1.7242257595062256, + "learning_rate": 4.3626551732576346e-05, + "loss": 4.5082, + "step": 39081 + }, + { + "epoch": 0.23243172518793415, + "grad_norm": 1.698724627494812, + "learning_rate": 4.3626240176872746e-05, + "loss": 4.6776, + "step": 39082 + }, + { + "epoch": 0.23243767247121516, + "grad_norm": 1.7210109233856201, + "learning_rate": 4.362592861466691e-05, + "loss": 4.3961, + "step": 39083 + }, + { + "epoch": 0.23244361975449615, + "grad_norm": 2.1987760066986084, + "learning_rate": 4.362561704595896e-05, + "loss": 4.1219, + "step": 39084 + }, + { + "epoch": 0.23244956703777714, + "grad_norm": 1.7698177099227905, + "learning_rate": 4.3625305470749e-05, + "loss": 4.5081, + "step": 39085 + }, + { + "epoch": 0.23245551432105815, + "grad_norm": 1.9433329105377197, + "learning_rate": 4.362499388903713e-05, + "loss": 4.6824, + "step": 39086 + }, + { + "epoch": 0.23246146160433914, + "grad_norm": 1.6914910078048706, + "learning_rate": 4.3624682300823473e-05, + "loss": 4.572, + "step": 39087 + }, + { + "epoch": 0.23246740888762013, + "grad_norm": 1.7068865299224854, + "learning_rate": 4.362437070610813e-05, + "loss": 4.4017, + "step": 39088 + }, + { + "epoch": 0.23247335617090112, + "grad_norm": 1.7159522771835327, + "learning_rate": 4.3624059104891216e-05, + "loss": 4.6159, + "step": 39089 + }, + { + "epoch": 0.23247930345418213, + "grad_norm": 1.7849717140197754, + "learning_rate": 4.362374749717283e-05, + "loss": 4.8789, + "step": 39090 + }, + { + "epoch": 0.23248525073746312, + "grad_norm": 2.447394847869873, + "learning_rate": 4.362343588295309e-05, + "loss": 3.2631, + "step": 39091 + }, + { + "epoch": 0.2324911980207441, + "grad_norm": 2.2207345962524414, + "learning_rate": 4.36231242622321e-05, + "loss": 3.4677, + "step": 39092 + }, + { + "epoch": 0.23249714530402513, + "grad_norm": 2.4080615043640137, + "learning_rate": 4.3622812635009967e-05, + "loss": 3.3182, + "step": 39093 + }, + { + "epoch": 0.23250309258730611, + "grad_norm": 2.1918601989746094, + "learning_rate": 4.3622501001286806e-05, + "loss": 3.3297, + "step": 39094 + }, + { + "epoch": 0.2325090398705871, + "grad_norm": 2.7159063816070557, + "learning_rate": 4.362218936106272e-05, + "loss": 3.4462, + "step": 39095 + }, + { + "epoch": 0.23251498715386812, + "grad_norm": 2.3878097534179688, + "learning_rate": 4.362187771433782e-05, + "loss": 3.2406, + "step": 39096 + }, + { + "epoch": 0.2325209344371491, + "grad_norm": 2.8980376720428467, + "learning_rate": 4.362156606111222e-05, + "loss": 3.4155, + "step": 39097 + }, + { + "epoch": 0.2325268817204301, + "grad_norm": 2.0443594455718994, + "learning_rate": 4.362125440138601e-05, + "loss": 3.4641, + "step": 39098 + }, + { + "epoch": 0.2325328290037111, + "grad_norm": 2.1212189197540283, + "learning_rate": 4.362094273515933e-05, + "loss": 4.1194, + "step": 39099 + }, + { + "epoch": 0.2325387762869921, + "grad_norm": 2.186098575592041, + "learning_rate": 4.362063106243226e-05, + "loss": 4.1914, + "step": 39100 + }, + { + "epoch": 0.2325447235702731, + "grad_norm": 1.6000093221664429, + "learning_rate": 4.362031938320492e-05, + "loss": 4.987, + "step": 39101 + }, + { + "epoch": 0.2325506708535541, + "grad_norm": 1.4070879220962524, + "learning_rate": 4.362000769747743e-05, + "loss": 5.2052, + "step": 39102 + }, + { + "epoch": 0.2325566181368351, + "grad_norm": 1.739212989807129, + "learning_rate": 4.361969600524988e-05, + "loss": 4.398, + "step": 39103 + }, + { + "epoch": 0.23256256542011608, + "grad_norm": 2.334226369857788, + "learning_rate": 4.361938430652238e-05, + "loss": 3.6885, + "step": 39104 + }, + { + "epoch": 0.2325685127033971, + "grad_norm": 1.7967642545700073, + "learning_rate": 4.361907260129505e-05, + "loss": 4.3779, + "step": 39105 + }, + { + "epoch": 0.23257445998667808, + "grad_norm": 1.9032526016235352, + "learning_rate": 4.3618760889568e-05, + "loss": 4.4896, + "step": 39106 + }, + { + "epoch": 0.23258040726995907, + "grad_norm": 1.9198237657546997, + "learning_rate": 4.3618449171341324e-05, + "loss": 4.3165, + "step": 39107 + }, + { + "epoch": 0.2325863545532401, + "grad_norm": 2.0512235164642334, + "learning_rate": 4.3618137446615146e-05, + "loss": 4.025, + "step": 39108 + }, + { + "epoch": 0.23259230183652108, + "grad_norm": 1.6357065439224243, + "learning_rate": 4.361782571538957e-05, + "loss": 4.2988, + "step": 39109 + }, + { + "epoch": 0.23259824911980206, + "grad_norm": 2.0023303031921387, + "learning_rate": 4.3617513977664695e-05, + "loss": 3.7465, + "step": 39110 + }, + { + "epoch": 0.23260419640308308, + "grad_norm": 2.0384459495544434, + "learning_rate": 4.3617202233440646e-05, + "loss": 3.9628, + "step": 39111 + }, + { + "epoch": 0.23261014368636407, + "grad_norm": 2.0617830753326416, + "learning_rate": 4.361689048271752e-05, + "loss": 4.2029, + "step": 39112 + }, + { + "epoch": 0.23261609096964506, + "grad_norm": 2.023827314376831, + "learning_rate": 4.361657872549544e-05, + "loss": 4.1771, + "step": 39113 + }, + { + "epoch": 0.23262203825292607, + "grad_norm": 1.7333523035049438, + "learning_rate": 4.361626696177449e-05, + "loss": 4.4811, + "step": 39114 + }, + { + "epoch": 0.23262798553620706, + "grad_norm": 1.7693727016448975, + "learning_rate": 4.36159551915548e-05, + "loss": 4.4168, + "step": 39115 + }, + { + "epoch": 0.23263393281948805, + "grad_norm": 1.8410542011260986, + "learning_rate": 4.3615643414836474e-05, + "loss": 4.3063, + "step": 39116 + }, + { + "epoch": 0.23263988010276906, + "grad_norm": 1.760138750076294, + "learning_rate": 4.3615331631619615e-05, + "loss": 4.4036, + "step": 39117 + }, + { + "epoch": 0.23264582738605005, + "grad_norm": 1.694985270500183, + "learning_rate": 4.361501984190434e-05, + "loss": 4.4535, + "step": 39118 + }, + { + "epoch": 0.23265177466933104, + "grad_norm": 1.8033300638198853, + "learning_rate": 4.361470804569075e-05, + "loss": 4.5579, + "step": 39119 + }, + { + "epoch": 0.23265772195261206, + "grad_norm": 1.8068331480026245, + "learning_rate": 4.361439624297896e-05, + "loss": 4.1694, + "step": 39120 + }, + { + "epoch": 0.23266366923589304, + "grad_norm": 1.616873025894165, + "learning_rate": 4.361408443376908e-05, + "loss": 4.4546, + "step": 39121 + }, + { + "epoch": 0.23266961651917403, + "grad_norm": 1.3989241123199463, + "learning_rate": 4.361377261806121e-05, + "loss": 5.0763, + "step": 39122 + }, + { + "epoch": 0.23267556380245505, + "grad_norm": 1.6721818447113037, + "learning_rate": 4.361346079585546e-05, + "loss": 4.9419, + "step": 39123 + }, + { + "epoch": 0.23268151108573604, + "grad_norm": 1.5129653215408325, + "learning_rate": 4.361314896715195e-05, + "loss": 5.3604, + "step": 39124 + }, + { + "epoch": 0.23268745836901703, + "grad_norm": 1.467267394065857, + "learning_rate": 4.3612837131950784e-05, + "loss": 5.3583, + "step": 39125 + }, + { + "epoch": 0.23269340565229804, + "grad_norm": 1.2865101099014282, + "learning_rate": 4.3612525290252066e-05, + "loss": 5.3188, + "step": 39126 + }, + { + "epoch": 0.23269935293557903, + "grad_norm": 1.2326202392578125, + "learning_rate": 4.36122134420559e-05, + "loss": 5.2643, + "step": 39127 + }, + { + "epoch": 0.23270530021886002, + "grad_norm": 1.490971565246582, + "learning_rate": 4.361190158736242e-05, + "loss": 5.0676, + "step": 39128 + }, + { + "epoch": 0.23271124750214103, + "grad_norm": 1.7493031024932861, + "learning_rate": 4.361158972617171e-05, + "loss": 4.5299, + "step": 39129 + }, + { + "epoch": 0.23271719478542202, + "grad_norm": 1.8924753665924072, + "learning_rate": 4.361127785848388e-05, + "loss": 4.4365, + "step": 39130 + }, + { + "epoch": 0.232723142068703, + "grad_norm": 1.9791909456253052, + "learning_rate": 4.3610965984299045e-05, + "loss": 3.9715, + "step": 39131 + }, + { + "epoch": 0.23272908935198403, + "grad_norm": 2.780217170715332, + "learning_rate": 4.3610654103617323e-05, + "loss": 3.503, + "step": 39132 + }, + { + "epoch": 0.232735036635265, + "grad_norm": 1.5088809728622437, + "learning_rate": 4.361034221643881e-05, + "loss": 5.2044, + "step": 39133 + }, + { + "epoch": 0.232740983918546, + "grad_norm": 1.5319719314575195, + "learning_rate": 4.361003032276362e-05, + "loss": 5.2335, + "step": 39134 + }, + { + "epoch": 0.23274693120182702, + "grad_norm": 1.5246729850769043, + "learning_rate": 4.360971842259186e-05, + "loss": 5.1811, + "step": 39135 + }, + { + "epoch": 0.232752878485108, + "grad_norm": 1.5429359674453735, + "learning_rate": 4.3609406515923634e-05, + "loss": 5.5938, + "step": 39136 + }, + { + "epoch": 0.232758825768389, + "grad_norm": 1.4963136911392212, + "learning_rate": 4.360909460275906e-05, + "loss": 5.6445, + "step": 39137 + }, + { + "epoch": 0.23276477305167, + "grad_norm": 1.2514957189559937, + "learning_rate": 4.360878268309825e-05, + "loss": 5.3839, + "step": 39138 + }, + { + "epoch": 0.232770720334951, + "grad_norm": 1.644110918045044, + "learning_rate": 4.36084707569413e-05, + "loss": 4.8794, + "step": 39139 + }, + { + "epoch": 0.23277666761823199, + "grad_norm": 1.8288604021072388, + "learning_rate": 4.360815882428832e-05, + "loss": 4.5039, + "step": 39140 + }, + { + "epoch": 0.232782614901513, + "grad_norm": 2.1350958347320557, + "learning_rate": 4.3607846885139434e-05, + "loss": 3.9829, + "step": 39141 + }, + { + "epoch": 0.232788562184794, + "grad_norm": 2.164173126220703, + "learning_rate": 4.360753493949473e-05, + "loss": 4.2228, + "step": 39142 + }, + { + "epoch": 0.23279450946807498, + "grad_norm": 2.1720077991485596, + "learning_rate": 4.3607222987354335e-05, + "loss": 4.222, + "step": 39143 + }, + { + "epoch": 0.232800456751356, + "grad_norm": 2.344161033630371, + "learning_rate": 4.360691102871835e-05, + "loss": 4.1798, + "step": 39144 + }, + { + "epoch": 0.23280640403463698, + "grad_norm": 2.174213409423828, + "learning_rate": 4.3606599063586886e-05, + "loss": 4.1144, + "step": 39145 + }, + { + "epoch": 0.23281235131791797, + "grad_norm": 2.1762256622314453, + "learning_rate": 4.360628709196005e-05, + "loss": 4.3624, + "step": 39146 + }, + { + "epoch": 0.23281829860119896, + "grad_norm": 1.9964789152145386, + "learning_rate": 4.360597511383795e-05, + "loss": 4.3162, + "step": 39147 + }, + { + "epoch": 0.23282424588447997, + "grad_norm": 2.0995092391967773, + "learning_rate": 4.36056631292207e-05, + "loss": 4.3076, + "step": 39148 + }, + { + "epoch": 0.23283019316776096, + "grad_norm": 1.65229070186615, + "learning_rate": 4.36053511381084e-05, + "loss": 5.1219, + "step": 39149 + }, + { + "epoch": 0.23283614045104195, + "grad_norm": 1.7693278789520264, + "learning_rate": 4.360503914050116e-05, + "loss": 5.0181, + "step": 39150 + }, + { + "epoch": 0.23284208773432297, + "grad_norm": 1.9694136381149292, + "learning_rate": 4.3604727136399105e-05, + "loss": 4.4619, + "step": 39151 + }, + { + "epoch": 0.23284803501760395, + "grad_norm": 2.2055177688598633, + "learning_rate": 4.360441512580232e-05, + "loss": 4.2537, + "step": 39152 + }, + { + "epoch": 0.23285398230088494, + "grad_norm": 1.8748949766159058, + "learning_rate": 4.360410310871094e-05, + "loss": 4.4831, + "step": 39153 + }, + { + "epoch": 0.23285992958416596, + "grad_norm": 1.6037344932556152, + "learning_rate": 4.360379108512504e-05, + "loss": 5.0677, + "step": 39154 + }, + { + "epoch": 0.23286587686744695, + "grad_norm": 1.627509355545044, + "learning_rate": 4.3603479055044774e-05, + "loss": 4.7995, + "step": 39155 + }, + { + "epoch": 0.23287182415072794, + "grad_norm": 1.7337157726287842, + "learning_rate": 4.36031670184702e-05, + "loss": 5.0422, + "step": 39156 + }, + { + "epoch": 0.23287777143400895, + "grad_norm": 1.6469885110855103, + "learning_rate": 4.360285497540148e-05, + "loss": 4.7927, + "step": 39157 + }, + { + "epoch": 0.23288371871728994, + "grad_norm": 1.5350927114486694, + "learning_rate": 4.360254292583867e-05, + "loss": 4.7906, + "step": 39158 + }, + { + "epoch": 0.23288966600057093, + "grad_norm": 1.4045746326446533, + "learning_rate": 4.360223086978191e-05, + "loss": 4.6738, + "step": 39159 + }, + { + "epoch": 0.23289561328385194, + "grad_norm": 1.599743127822876, + "learning_rate": 4.360191880723131e-05, + "loss": 5.1313, + "step": 39160 + }, + { + "epoch": 0.23290156056713293, + "grad_norm": 1.453123927116394, + "learning_rate": 4.360160673818697e-05, + "loss": 5.1246, + "step": 39161 + }, + { + "epoch": 0.23290750785041392, + "grad_norm": 1.555281639099121, + "learning_rate": 4.360129466264901e-05, + "loss": 5.1001, + "step": 39162 + }, + { + "epoch": 0.23291345513369494, + "grad_norm": 1.3762109279632568, + "learning_rate": 4.360098258061752e-05, + "loss": 5.156, + "step": 39163 + }, + { + "epoch": 0.23291940241697592, + "grad_norm": 1.4758714437484741, + "learning_rate": 4.360067049209262e-05, + "loss": 5.0616, + "step": 39164 + }, + { + "epoch": 0.2329253497002569, + "grad_norm": 1.4332698583602905, + "learning_rate": 4.360035839707442e-05, + "loss": 5.0706, + "step": 39165 + }, + { + "epoch": 0.23293129698353793, + "grad_norm": 1.5097830295562744, + "learning_rate": 4.360004629556302e-05, + "loss": 4.9428, + "step": 39166 + }, + { + "epoch": 0.23293724426681892, + "grad_norm": 1.6742161512374878, + "learning_rate": 4.359973418755855e-05, + "loss": 4.8498, + "step": 39167 + }, + { + "epoch": 0.2329431915500999, + "grad_norm": 1.6985595226287842, + "learning_rate": 4.359942207306109e-05, + "loss": 4.7057, + "step": 39168 + }, + { + "epoch": 0.23294913883338092, + "grad_norm": 1.560068964958191, + "learning_rate": 4.359910995207078e-05, + "loss": 4.9408, + "step": 39169 + }, + { + "epoch": 0.2329550861166619, + "grad_norm": 1.4312219619750977, + "learning_rate": 4.35987978245877e-05, + "loss": 5.0214, + "step": 39170 + }, + { + "epoch": 0.2329610333999429, + "grad_norm": 1.3766241073608398, + "learning_rate": 4.359848569061198e-05, + "loss": 4.8688, + "step": 39171 + }, + { + "epoch": 0.2329669806832239, + "grad_norm": 1.4122978448867798, + "learning_rate": 4.359817355014371e-05, + "loss": 5.0583, + "step": 39172 + }, + { + "epoch": 0.2329729279665049, + "grad_norm": 1.368056297302246, + "learning_rate": 4.359786140318302e-05, + "loss": 4.9295, + "step": 39173 + }, + { + "epoch": 0.2329788752497859, + "grad_norm": 1.5163987874984741, + "learning_rate": 4.3597549249730003e-05, + "loss": 4.9142, + "step": 39174 + }, + { + "epoch": 0.2329848225330669, + "grad_norm": 1.3942031860351562, + "learning_rate": 4.359723708978478e-05, + "loss": 4.9853, + "step": 39175 + }, + { + "epoch": 0.2329907698163479, + "grad_norm": 1.6139392852783203, + "learning_rate": 4.3596924923347446e-05, + "loss": 5.0879, + "step": 39176 + }, + { + "epoch": 0.23299671709962888, + "grad_norm": 1.4891060590744019, + "learning_rate": 4.359661275041812e-05, + "loss": 5.4327, + "step": 39177 + }, + { + "epoch": 0.2330026643829099, + "grad_norm": 1.6758902072906494, + "learning_rate": 4.3596300570996905e-05, + "loss": 4.9565, + "step": 39178 + }, + { + "epoch": 0.23300861166619088, + "grad_norm": 1.9034432172775269, + "learning_rate": 4.359598838508392e-05, + "loss": 3.8357, + "step": 39179 + }, + { + "epoch": 0.23301455894947187, + "grad_norm": 1.4876021146774292, + "learning_rate": 4.359567619267927e-05, + "loss": 5.1076, + "step": 39180 + }, + { + "epoch": 0.2330205062327529, + "grad_norm": 1.623687744140625, + "learning_rate": 4.359536399378306e-05, + "loss": 4.9354, + "step": 39181 + }, + { + "epoch": 0.23302645351603388, + "grad_norm": 1.6403672695159912, + "learning_rate": 4.359505178839539e-05, + "loss": 4.8971, + "step": 39182 + }, + { + "epoch": 0.23303240079931486, + "grad_norm": 1.4088045358657837, + "learning_rate": 4.359473957651639e-05, + "loss": 4.696, + "step": 39183 + }, + { + "epoch": 0.23303834808259588, + "grad_norm": 1.4575159549713135, + "learning_rate": 4.3594427358146155e-05, + "loss": 5.0652, + "step": 39184 + }, + { + "epoch": 0.23304429536587687, + "grad_norm": 1.4742602109909058, + "learning_rate": 4.35941151332848e-05, + "loss": 4.663, + "step": 39185 + }, + { + "epoch": 0.23305024264915786, + "grad_norm": 1.45940101146698, + "learning_rate": 4.3593802901932434e-05, + "loss": 4.8016, + "step": 39186 + }, + { + "epoch": 0.23305618993243887, + "grad_norm": 1.4725275039672852, + "learning_rate": 4.359349066408915e-05, + "loss": 5.4615, + "step": 39187 + }, + { + "epoch": 0.23306213721571986, + "grad_norm": 1.3877811431884766, + "learning_rate": 4.3593178419755086e-05, + "loss": 5.362, + "step": 39188 + }, + { + "epoch": 0.23306808449900085, + "grad_norm": 1.5631160736083984, + "learning_rate": 4.359286616893033e-05, + "loss": 4.6152, + "step": 39189 + }, + { + "epoch": 0.23307403178228187, + "grad_norm": 1.55159592628479, + "learning_rate": 4.3592553911615e-05, + "loss": 5.1536, + "step": 39190 + }, + { + "epoch": 0.23307997906556285, + "grad_norm": 1.3627732992172241, + "learning_rate": 4.35922416478092e-05, + "loss": 5.3487, + "step": 39191 + }, + { + "epoch": 0.23308592634884384, + "grad_norm": 1.2899950742721558, + "learning_rate": 4.359192937751303e-05, + "loss": 5.4265, + "step": 39192 + }, + { + "epoch": 0.23309187363212486, + "grad_norm": 1.3154022693634033, + "learning_rate": 4.359161710072662e-05, + "loss": 5.2734, + "step": 39193 + }, + { + "epoch": 0.23309782091540585, + "grad_norm": 1.618874430656433, + "learning_rate": 4.3591304817450066e-05, + "loss": 5.0206, + "step": 39194 + }, + { + "epoch": 0.23310376819868683, + "grad_norm": 1.5921950340270996, + "learning_rate": 4.359099252768348e-05, + "loss": 4.6433, + "step": 39195 + }, + { + "epoch": 0.23310971548196785, + "grad_norm": 1.7619107961654663, + "learning_rate": 4.359068023142697e-05, + "loss": 4.5171, + "step": 39196 + }, + { + "epoch": 0.23311566276524884, + "grad_norm": 1.6703497171401978, + "learning_rate": 4.359036792868064e-05, + "loss": 4.5936, + "step": 39197 + }, + { + "epoch": 0.23312161004852983, + "grad_norm": 1.5407284498214722, + "learning_rate": 4.359005561944461e-05, + "loss": 4.7157, + "step": 39198 + }, + { + "epoch": 0.23312755733181084, + "grad_norm": 1.6638575792312622, + "learning_rate": 4.358974330371899e-05, + "loss": 4.3966, + "step": 39199 + }, + { + "epoch": 0.23313350461509183, + "grad_norm": 1.5696804523468018, + "learning_rate": 4.3589430981503875e-05, + "loss": 4.6406, + "step": 39200 + }, + { + "epoch": 0.23313945189837282, + "grad_norm": 1.6106598377227783, + "learning_rate": 4.358911865279939e-05, + "loss": 4.9825, + "step": 39201 + }, + { + "epoch": 0.23314539918165383, + "grad_norm": 1.5994102954864502, + "learning_rate": 4.3588806317605624e-05, + "loss": 5.1257, + "step": 39202 + }, + { + "epoch": 0.23315134646493482, + "grad_norm": 1.5106816291809082, + "learning_rate": 4.3588493975922704e-05, + "loss": 4.9696, + "step": 39203 + }, + { + "epoch": 0.2331572937482158, + "grad_norm": 1.8663996458053589, + "learning_rate": 4.358818162775073e-05, + "loss": 4.9723, + "step": 39204 + }, + { + "epoch": 0.2331632410314968, + "grad_norm": 1.5792741775512695, + "learning_rate": 4.3587869273089824e-05, + "loss": 5.6912, + "step": 39205 + }, + { + "epoch": 0.23316918831477781, + "grad_norm": 2.0187623500823975, + "learning_rate": 4.358755691194007e-05, + "loss": 4.0808, + "step": 39206 + }, + { + "epoch": 0.2331751355980588, + "grad_norm": 1.7841250896453857, + "learning_rate": 4.35872445443016e-05, + "loss": 4.8503, + "step": 39207 + }, + { + "epoch": 0.2331810828813398, + "grad_norm": 1.5512248277664185, + "learning_rate": 4.358693217017451e-05, + "loss": 4.7434, + "step": 39208 + }, + { + "epoch": 0.2331870301646208, + "grad_norm": 1.6511754989624023, + "learning_rate": 4.358661978955892e-05, + "loss": 4.7894, + "step": 39209 + }, + { + "epoch": 0.2331929774479018, + "grad_norm": 1.4584643840789795, + "learning_rate": 4.358630740245493e-05, + "loss": 4.4343, + "step": 39210 + }, + { + "epoch": 0.23319892473118278, + "grad_norm": 1.4316320419311523, + "learning_rate": 4.3585995008862654e-05, + "loss": 5.2399, + "step": 39211 + }, + { + "epoch": 0.2332048720144638, + "grad_norm": 1.9914770126342773, + "learning_rate": 4.35856826087822e-05, + "loss": 4.037, + "step": 39212 + }, + { + "epoch": 0.2332108192977448, + "grad_norm": 2.455686569213867, + "learning_rate": 4.358537020221367e-05, + "loss": 3.4472, + "step": 39213 + }, + { + "epoch": 0.23321676658102578, + "grad_norm": 2.050672769546509, + "learning_rate": 4.358505778915718e-05, + "loss": 3.5811, + "step": 39214 + }, + { + "epoch": 0.2332227138643068, + "grad_norm": 1.9716814756393433, + "learning_rate": 4.358474536961285e-05, + "loss": 4.2366, + "step": 39215 + }, + { + "epoch": 0.23322866114758778, + "grad_norm": 1.5395363569259644, + "learning_rate": 4.358443294358077e-05, + "loss": 4.7607, + "step": 39216 + }, + { + "epoch": 0.23323460843086877, + "grad_norm": 1.3587455749511719, + "learning_rate": 4.3584120511061045e-05, + "loss": 4.88, + "step": 39217 + }, + { + "epoch": 0.23324055571414978, + "grad_norm": 1.9777147769927979, + "learning_rate": 4.358380807205381e-05, + "loss": 4.0743, + "step": 39218 + }, + { + "epoch": 0.23324650299743077, + "grad_norm": 2.6593072414398193, + "learning_rate": 4.3583495626559155e-05, + "loss": 3.4124, + "step": 39219 + }, + { + "epoch": 0.23325245028071176, + "grad_norm": 2.5443968772888184, + "learning_rate": 4.358318317457719e-05, + "loss": 3.6349, + "step": 39220 + }, + { + "epoch": 0.23325839756399278, + "grad_norm": 1.8876529932022095, + "learning_rate": 4.358287071610804e-05, + "loss": 4.0327, + "step": 39221 + }, + { + "epoch": 0.23326434484727376, + "grad_norm": 2.037179708480835, + "learning_rate": 4.358255825115179e-05, + "loss": 3.7585, + "step": 39222 + }, + { + "epoch": 0.23327029213055475, + "grad_norm": 2.7959325313568115, + "learning_rate": 4.358224577970857e-05, + "loss": 3.5884, + "step": 39223 + }, + { + "epoch": 0.23327623941383577, + "grad_norm": 1.7984586954116821, + "learning_rate": 4.3581933301778466e-05, + "loss": 4.3233, + "step": 39224 + }, + { + "epoch": 0.23328218669711676, + "grad_norm": 1.8435425758361816, + "learning_rate": 4.358162081736161e-05, + "loss": 3.8402, + "step": 39225 + }, + { + "epoch": 0.23328813398039774, + "grad_norm": 1.887158989906311, + "learning_rate": 4.35813083264581e-05, + "loss": 4.4948, + "step": 39226 + }, + { + "epoch": 0.23329408126367876, + "grad_norm": 1.758460283279419, + "learning_rate": 4.358099582906806e-05, + "loss": 4.501, + "step": 39227 + }, + { + "epoch": 0.23330002854695975, + "grad_norm": 1.8516051769256592, + "learning_rate": 4.3580683325191576e-05, + "loss": 4.2182, + "step": 39228 + }, + { + "epoch": 0.23330597583024074, + "grad_norm": 1.7806837558746338, + "learning_rate": 4.3580370814828766e-05, + "loss": 4.108, + "step": 39229 + }, + { + "epoch": 0.23331192311352175, + "grad_norm": 1.9674243927001953, + "learning_rate": 4.358005829797974e-05, + "loss": 3.8963, + "step": 39230 + }, + { + "epoch": 0.23331787039680274, + "grad_norm": 1.6735156774520874, + "learning_rate": 4.357974577464461e-05, + "loss": 4.2549, + "step": 39231 + }, + { + "epoch": 0.23332381768008373, + "grad_norm": 1.656535029411316, + "learning_rate": 4.357943324482348e-05, + "loss": 4.5354, + "step": 39232 + }, + { + "epoch": 0.23332976496336474, + "grad_norm": 1.5299835205078125, + "learning_rate": 4.357912070851646e-05, + "loss": 4.4048, + "step": 39233 + }, + { + "epoch": 0.23333571224664573, + "grad_norm": 1.5637716054916382, + "learning_rate": 4.357880816572366e-05, + "loss": 4.3426, + "step": 39234 + }, + { + "epoch": 0.23334165952992672, + "grad_norm": 1.5826610326766968, + "learning_rate": 4.3578495616445205e-05, + "loss": 4.0884, + "step": 39235 + }, + { + "epoch": 0.23334760681320774, + "grad_norm": 1.7511299848556519, + "learning_rate": 4.357818306068117e-05, + "loss": 3.9539, + "step": 39236 + }, + { + "epoch": 0.23335355409648872, + "grad_norm": 2.037144422531128, + "learning_rate": 4.357787049843169e-05, + "loss": 3.8867, + "step": 39237 + }, + { + "epoch": 0.2333595013797697, + "grad_norm": 1.641832947731018, + "learning_rate": 4.3577557929696875e-05, + "loss": 4.6045, + "step": 39238 + }, + { + "epoch": 0.23336544866305073, + "grad_norm": 1.5627217292785645, + "learning_rate": 4.357724535447682e-05, + "loss": 5.0244, + "step": 39239 + }, + { + "epoch": 0.23337139594633172, + "grad_norm": 1.6410858631134033, + "learning_rate": 4.357693277277164e-05, + "loss": 5.1096, + "step": 39240 + }, + { + "epoch": 0.2333773432296127, + "grad_norm": 1.3919554948806763, + "learning_rate": 4.357662018458145e-05, + "loss": 5.0005, + "step": 39241 + }, + { + "epoch": 0.23338329051289372, + "grad_norm": 1.5992203950881958, + "learning_rate": 4.357630758990634e-05, + "loss": 4.9081, + "step": 39242 + }, + { + "epoch": 0.2333892377961747, + "grad_norm": 1.3488589525222778, + "learning_rate": 4.3575994988746446e-05, + "loss": 4.9424, + "step": 39243 + }, + { + "epoch": 0.2333951850794557, + "grad_norm": 1.512978196144104, + "learning_rate": 4.357568238110186e-05, + "loss": 4.9572, + "step": 39244 + }, + { + "epoch": 0.2334011323627367, + "grad_norm": 1.5400868654251099, + "learning_rate": 4.35753697669727e-05, + "loss": 4.9061, + "step": 39245 + }, + { + "epoch": 0.2334070796460177, + "grad_norm": 1.6540729999542236, + "learning_rate": 4.3575057146359065e-05, + "loss": 5.0591, + "step": 39246 + }, + { + "epoch": 0.2334130269292987, + "grad_norm": 1.5842341184616089, + "learning_rate": 4.357474451926107e-05, + "loss": 4.7289, + "step": 39247 + }, + { + "epoch": 0.2334189742125797, + "grad_norm": 1.6024073362350464, + "learning_rate": 4.3574431885678824e-05, + "loss": 4.7724, + "step": 39248 + }, + { + "epoch": 0.2334249214958607, + "grad_norm": 1.4475951194763184, + "learning_rate": 4.3574119245612435e-05, + "loss": 4.9537, + "step": 39249 + }, + { + "epoch": 0.23343086877914168, + "grad_norm": 1.5023313760757446, + "learning_rate": 4.3573806599062015e-05, + "loss": 4.7418, + "step": 39250 + }, + { + "epoch": 0.2334368160624227, + "grad_norm": 1.4264357089996338, + "learning_rate": 4.357349394602767e-05, + "loss": 4.8347, + "step": 39251 + }, + { + "epoch": 0.23344276334570369, + "grad_norm": 1.4076826572418213, + "learning_rate": 4.357318128650951e-05, + "loss": 4.875, + "step": 39252 + }, + { + "epoch": 0.23344871062898467, + "grad_norm": 1.416060447692871, + "learning_rate": 4.357286862050764e-05, + "loss": 4.856, + "step": 39253 + }, + { + "epoch": 0.2334546579122657, + "grad_norm": 1.4814610481262207, + "learning_rate": 4.357255594802218e-05, + "loss": 4.8171, + "step": 39254 + }, + { + "epoch": 0.23346060519554668, + "grad_norm": 1.5342730283737183, + "learning_rate": 4.357224326905323e-05, + "loss": 4.8825, + "step": 39255 + }, + { + "epoch": 0.23346655247882767, + "grad_norm": 1.477780818939209, + "learning_rate": 4.357193058360091e-05, + "loss": 4.7599, + "step": 39256 + }, + { + "epoch": 0.23347249976210868, + "grad_norm": 1.4768816232681274, + "learning_rate": 4.35716178916653e-05, + "loss": 4.8948, + "step": 39257 + }, + { + "epoch": 0.23347844704538967, + "grad_norm": 1.5315289497375488, + "learning_rate": 4.357130519324655e-05, + "loss": 4.808, + "step": 39258 + }, + { + "epoch": 0.23348439432867066, + "grad_norm": 1.530362606048584, + "learning_rate": 4.357099248834474e-05, + "loss": 4.7999, + "step": 39259 + }, + { + "epoch": 0.23349034161195167, + "grad_norm": 1.496216058731079, + "learning_rate": 4.3570679776959994e-05, + "loss": 4.8051, + "step": 39260 + }, + { + "epoch": 0.23349628889523266, + "grad_norm": 1.5579525232315063, + "learning_rate": 4.357036705909241e-05, + "loss": 4.7486, + "step": 39261 + }, + { + "epoch": 0.23350223617851365, + "grad_norm": 1.4911562204360962, + "learning_rate": 4.357005433474211e-05, + "loss": 4.7698, + "step": 39262 + }, + { + "epoch": 0.23350818346179464, + "grad_norm": 1.5691003799438477, + "learning_rate": 4.3569741603909185e-05, + "loss": 4.8152, + "step": 39263 + }, + { + "epoch": 0.23351413074507565, + "grad_norm": 1.4888197183609009, + "learning_rate": 4.3569428866593764e-05, + "loss": 4.8731, + "step": 39264 + }, + { + "epoch": 0.23352007802835664, + "grad_norm": 1.3809503316879272, + "learning_rate": 4.356911612279594e-05, + "loss": 4.8566, + "step": 39265 + }, + { + "epoch": 0.23352602531163763, + "grad_norm": 1.520364761352539, + "learning_rate": 4.356880337251584e-05, + "loss": 4.919, + "step": 39266 + }, + { + "epoch": 0.23353197259491865, + "grad_norm": 1.36786949634552, + "learning_rate": 4.3568490615753556e-05, + "loss": 4.9626, + "step": 39267 + }, + { + "epoch": 0.23353791987819963, + "grad_norm": 1.5701583623886108, + "learning_rate": 4.35681778525092e-05, + "loss": 4.989, + "step": 39268 + }, + { + "epoch": 0.23354386716148062, + "grad_norm": 1.272887945175171, + "learning_rate": 4.3567865082782886e-05, + "loss": 4.9161, + "step": 39269 + }, + { + "epoch": 0.23354981444476164, + "grad_norm": 1.4108949899673462, + "learning_rate": 4.3567552306574734e-05, + "loss": 4.9134, + "step": 39270 + }, + { + "epoch": 0.23355576172804263, + "grad_norm": 1.3697681427001953, + "learning_rate": 4.356723952388483e-05, + "loss": 4.8394, + "step": 39271 + }, + { + "epoch": 0.23356170901132361, + "grad_norm": 1.5496501922607422, + "learning_rate": 4.356692673471329e-05, + "loss": 4.7908, + "step": 39272 + }, + { + "epoch": 0.23356765629460463, + "grad_norm": 1.1896272897720337, + "learning_rate": 4.3566613939060244e-05, + "loss": 5.0675, + "step": 39273 + }, + { + "epoch": 0.23357360357788562, + "grad_norm": 1.4039702415466309, + "learning_rate": 4.356630113692577e-05, + "loss": 4.8312, + "step": 39274 + }, + { + "epoch": 0.2335795508611666, + "grad_norm": 1.480469822883606, + "learning_rate": 4.3565988328309994e-05, + "loss": 4.8627, + "step": 39275 + }, + { + "epoch": 0.23358549814444762, + "grad_norm": 1.5712021589279175, + "learning_rate": 4.356567551321303e-05, + "loss": 4.9239, + "step": 39276 + }, + { + "epoch": 0.2335914454277286, + "grad_norm": 1.4389845132827759, + "learning_rate": 4.356536269163497e-05, + "loss": 4.8025, + "step": 39277 + }, + { + "epoch": 0.2335973927110096, + "grad_norm": 1.3848538398742676, + "learning_rate": 4.356504986357595e-05, + "loss": 4.7643, + "step": 39278 + }, + { + "epoch": 0.23360333999429062, + "grad_norm": 1.4079018831253052, + "learning_rate": 4.356473702903605e-05, + "loss": 4.8716, + "step": 39279 + }, + { + "epoch": 0.2336092872775716, + "grad_norm": 1.4892137050628662, + "learning_rate": 4.356442418801539e-05, + "loss": 4.9984, + "step": 39280 + }, + { + "epoch": 0.2336152345608526, + "grad_norm": 1.305431604385376, + "learning_rate": 4.356411134051409e-05, + "loss": 4.8958, + "step": 39281 + }, + { + "epoch": 0.2336211818441336, + "grad_norm": 1.395667552947998, + "learning_rate": 4.356379848653225e-05, + "loss": 4.7811, + "step": 39282 + }, + { + "epoch": 0.2336271291274146, + "grad_norm": 1.485810399055481, + "learning_rate": 4.356348562606998e-05, + "loss": 4.7185, + "step": 39283 + }, + { + "epoch": 0.23363307641069558, + "grad_norm": 1.3166253566741943, + "learning_rate": 4.356317275912739e-05, + "loss": 4.6181, + "step": 39284 + }, + { + "epoch": 0.2336390236939766, + "grad_norm": 1.387439489364624, + "learning_rate": 4.356285988570458e-05, + "loss": 4.6781, + "step": 39285 + }, + { + "epoch": 0.2336449709772576, + "grad_norm": 1.4118428230285645, + "learning_rate": 4.356254700580167e-05, + "loss": 4.767, + "step": 39286 + }, + { + "epoch": 0.23365091826053858, + "grad_norm": 1.4941892623901367, + "learning_rate": 4.356223411941877e-05, + "loss": 4.7696, + "step": 39287 + }, + { + "epoch": 0.2336568655438196, + "grad_norm": 1.5000381469726562, + "learning_rate": 4.356192122655599e-05, + "loss": 4.7611, + "step": 39288 + }, + { + "epoch": 0.23366281282710058, + "grad_norm": 1.4702129364013672, + "learning_rate": 4.3561608327213424e-05, + "loss": 4.6394, + "step": 39289 + }, + { + "epoch": 0.23366876011038157, + "grad_norm": 1.28829824924469, + "learning_rate": 4.35612954213912e-05, + "loss": 4.7623, + "step": 39290 + }, + { + "epoch": 0.23367470739366258, + "grad_norm": 1.360546350479126, + "learning_rate": 4.3560982509089416e-05, + "loss": 4.8757, + "step": 39291 + }, + { + "epoch": 0.23368065467694357, + "grad_norm": 1.375982403755188, + "learning_rate": 4.3560669590308184e-05, + "loss": 4.8164, + "step": 39292 + }, + { + "epoch": 0.23368660196022456, + "grad_norm": 1.4468094110488892, + "learning_rate": 4.356035666504762e-05, + "loss": 4.8452, + "step": 39293 + }, + { + "epoch": 0.23369254924350558, + "grad_norm": 1.5260746479034424, + "learning_rate": 4.356004373330782e-05, + "loss": 4.6426, + "step": 39294 + }, + { + "epoch": 0.23369849652678656, + "grad_norm": 1.4539531469345093, + "learning_rate": 4.355973079508891e-05, + "loss": 4.4602, + "step": 39295 + }, + { + "epoch": 0.23370444381006755, + "grad_norm": 2.001392126083374, + "learning_rate": 4.3559417850390974e-05, + "loss": 4.1524, + "step": 39296 + }, + { + "epoch": 0.23371039109334857, + "grad_norm": 2.0820767879486084, + "learning_rate": 4.355910489921415e-05, + "loss": 4.1542, + "step": 39297 + }, + { + "epoch": 0.23371633837662956, + "grad_norm": 2.4654488563537598, + "learning_rate": 4.355879194155853e-05, + "loss": 3.8269, + "step": 39298 + }, + { + "epoch": 0.23372228565991054, + "grad_norm": 2.231116533279419, + "learning_rate": 4.355847897742423e-05, + "loss": 3.6304, + "step": 39299 + }, + { + "epoch": 0.23372823294319156, + "grad_norm": 2.2285022735595703, + "learning_rate": 4.355816600681135e-05, + "loss": 4.0732, + "step": 39300 + }, + { + "epoch": 0.23373418022647255, + "grad_norm": 1.592862606048584, + "learning_rate": 4.355785302972001e-05, + "loss": 4.5091, + "step": 39301 + }, + { + "epoch": 0.23374012750975354, + "grad_norm": 1.5630459785461426, + "learning_rate": 4.355754004615032e-05, + "loss": 4.8329, + "step": 39302 + }, + { + "epoch": 0.23374607479303455, + "grad_norm": 1.876397967338562, + "learning_rate": 4.355722705610238e-05, + "loss": 4.5441, + "step": 39303 + }, + { + "epoch": 0.23375202207631554, + "grad_norm": 1.4842675924301147, + "learning_rate": 4.35569140595763e-05, + "loss": 4.6346, + "step": 39304 + }, + { + "epoch": 0.23375796935959653, + "grad_norm": 1.9447697401046753, + "learning_rate": 4.35566010565722e-05, + "loss": 4.4493, + "step": 39305 + }, + { + "epoch": 0.23376391664287755, + "grad_norm": 1.618638277053833, + "learning_rate": 4.355628804709019e-05, + "loss": 4.2402, + "step": 39306 + }, + { + "epoch": 0.23376986392615853, + "grad_norm": 1.5623420476913452, + "learning_rate": 4.355597503113035e-05, + "loss": 4.2127, + "step": 39307 + }, + { + "epoch": 0.23377581120943952, + "grad_norm": 1.6455943584442139, + "learning_rate": 4.3555662008692824e-05, + "loss": 4.9097, + "step": 39308 + }, + { + "epoch": 0.23378175849272054, + "grad_norm": 1.5201420783996582, + "learning_rate": 4.355534897977771e-05, + "loss": 3.9863, + "step": 39309 + }, + { + "epoch": 0.23378770577600153, + "grad_norm": 1.621320128440857, + "learning_rate": 4.355503594438511e-05, + "loss": 4.1855, + "step": 39310 + }, + { + "epoch": 0.2337936530592825, + "grad_norm": 1.6072403192520142, + "learning_rate": 4.355472290251514e-05, + "loss": 4.1234, + "step": 39311 + }, + { + "epoch": 0.23379960034256353, + "grad_norm": 1.7549769878387451, + "learning_rate": 4.355440985416791e-05, + "loss": 3.9104, + "step": 39312 + }, + { + "epoch": 0.23380554762584452, + "grad_norm": 1.7547187805175781, + "learning_rate": 4.355409679934352e-05, + "loss": 5.3788, + "step": 39313 + }, + { + "epoch": 0.2338114949091255, + "grad_norm": 1.6373289823532104, + "learning_rate": 4.35537837380421e-05, + "loss": 4.2316, + "step": 39314 + }, + { + "epoch": 0.23381744219240652, + "grad_norm": 1.8216767311096191, + "learning_rate": 4.355347067026374e-05, + "loss": 3.9499, + "step": 39315 + }, + { + "epoch": 0.2338233894756875, + "grad_norm": 1.811559796333313, + "learning_rate": 4.3553157596008544e-05, + "loss": 3.6429, + "step": 39316 + }, + { + "epoch": 0.2338293367589685, + "grad_norm": 1.8670121431350708, + "learning_rate": 4.3552844515276645e-05, + "loss": 3.5439, + "step": 39317 + }, + { + "epoch": 0.2338352840422495, + "grad_norm": 1.8059818744659424, + "learning_rate": 4.3552531428068136e-05, + "loss": 3.6974, + "step": 39318 + }, + { + "epoch": 0.2338412313255305, + "grad_norm": 1.666537880897522, + "learning_rate": 4.3552218334383124e-05, + "loss": 3.8927, + "step": 39319 + }, + { + "epoch": 0.2338471786088115, + "grad_norm": 1.8872041702270508, + "learning_rate": 4.355190523422173e-05, + "loss": 3.6902, + "step": 39320 + }, + { + "epoch": 0.2338531258920925, + "grad_norm": 1.8386341333389282, + "learning_rate": 4.355159212758406e-05, + "loss": 3.7288, + "step": 39321 + }, + { + "epoch": 0.2338590731753735, + "grad_norm": 1.7837934494018555, + "learning_rate": 4.355127901447022e-05, + "loss": 3.8012, + "step": 39322 + }, + { + "epoch": 0.23386502045865448, + "grad_norm": 1.7940096855163574, + "learning_rate": 4.3550965894880315e-05, + "loss": 3.5241, + "step": 39323 + }, + { + "epoch": 0.23387096774193547, + "grad_norm": 1.997152328491211, + "learning_rate": 4.355065276881447e-05, + "loss": 3.6414, + "step": 39324 + }, + { + "epoch": 0.2338769150252165, + "grad_norm": 1.6963967084884644, + "learning_rate": 4.3550339636272775e-05, + "loss": 3.6445, + "step": 39325 + }, + { + "epoch": 0.23388286230849747, + "grad_norm": 2.0039589405059814, + "learning_rate": 4.3550026497255346e-05, + "loss": 3.7224, + "step": 39326 + }, + { + "epoch": 0.23388880959177846, + "grad_norm": 1.8233304023742676, + "learning_rate": 4.35497133517623e-05, + "loss": 3.6538, + "step": 39327 + }, + { + "epoch": 0.23389475687505948, + "grad_norm": 1.8335343599319458, + "learning_rate": 4.354940019979374e-05, + "loss": 3.614, + "step": 39328 + }, + { + "epoch": 0.23390070415834047, + "grad_norm": 1.7205346822738647, + "learning_rate": 4.3549087041349774e-05, + "loss": 3.7804, + "step": 39329 + }, + { + "epoch": 0.23390665144162145, + "grad_norm": 1.8699969053268433, + "learning_rate": 4.354877387643052e-05, + "loss": 3.7193, + "step": 39330 + }, + { + "epoch": 0.23391259872490247, + "grad_norm": 1.853049397468567, + "learning_rate": 4.354846070503607e-05, + "loss": 3.7105, + "step": 39331 + }, + { + "epoch": 0.23391854600818346, + "grad_norm": 1.8523236513137817, + "learning_rate": 4.354814752716655e-05, + "loss": 3.6334, + "step": 39332 + }, + { + "epoch": 0.23392449329146445, + "grad_norm": 1.8499833345413208, + "learning_rate": 4.354783434282206e-05, + "loss": 3.8866, + "step": 39333 + }, + { + "epoch": 0.23393044057474546, + "grad_norm": 1.9343851804733276, + "learning_rate": 4.354752115200271e-05, + "loss": 3.6149, + "step": 39334 + }, + { + "epoch": 0.23393638785802645, + "grad_norm": 2.0238258838653564, + "learning_rate": 4.3547207954708624e-05, + "loss": 3.7493, + "step": 39335 + }, + { + "epoch": 0.23394233514130744, + "grad_norm": 1.860219120979309, + "learning_rate": 4.354689475093989e-05, + "loss": 3.6736, + "step": 39336 + }, + { + "epoch": 0.23394828242458846, + "grad_norm": 1.8583927154541016, + "learning_rate": 4.3546581540696626e-05, + "loss": 3.7352, + "step": 39337 + }, + { + "epoch": 0.23395422970786944, + "grad_norm": 1.753945231437683, + "learning_rate": 4.354626832397895e-05, + "loss": 3.5608, + "step": 39338 + }, + { + "epoch": 0.23396017699115043, + "grad_norm": 1.8373562097549438, + "learning_rate": 4.3545955100786956e-05, + "loss": 3.5984, + "step": 39339 + }, + { + "epoch": 0.23396612427443145, + "grad_norm": 1.7085545063018799, + "learning_rate": 4.354564187112076e-05, + "loss": 3.5978, + "step": 39340 + }, + { + "epoch": 0.23397207155771244, + "grad_norm": 1.6872609853744507, + "learning_rate": 4.354532863498047e-05, + "loss": 3.6985, + "step": 39341 + }, + { + "epoch": 0.23397801884099342, + "grad_norm": 1.8018085956573486, + "learning_rate": 4.3545015392366205e-05, + "loss": 3.5865, + "step": 39342 + }, + { + "epoch": 0.23398396612427444, + "grad_norm": 1.8761341571807861, + "learning_rate": 4.354470214327807e-05, + "loss": 3.6066, + "step": 39343 + }, + { + "epoch": 0.23398991340755543, + "grad_norm": 1.9360228776931763, + "learning_rate": 4.3544388887716167e-05, + "loss": 3.5594, + "step": 39344 + }, + { + "epoch": 0.23399586069083642, + "grad_norm": 1.6296970844268799, + "learning_rate": 4.3544075625680606e-05, + "loss": 3.4926, + "step": 39345 + }, + { + "epoch": 0.23400180797411743, + "grad_norm": 1.6063154935836792, + "learning_rate": 4.3543762357171504e-05, + "loss": 3.5129, + "step": 39346 + }, + { + "epoch": 0.23400775525739842, + "grad_norm": 1.8837140798568726, + "learning_rate": 4.354344908218896e-05, + "loss": 3.6908, + "step": 39347 + }, + { + "epoch": 0.2340137025406794, + "grad_norm": 1.7227901220321655, + "learning_rate": 4.354313580073309e-05, + "loss": 3.3831, + "step": 39348 + }, + { + "epoch": 0.23401964982396042, + "grad_norm": 1.6574761867523193, + "learning_rate": 4.3542822512804005e-05, + "loss": 3.5345, + "step": 39349 + }, + { + "epoch": 0.2340255971072414, + "grad_norm": 1.5883370637893677, + "learning_rate": 4.354250921840182e-05, + "loss": 4.9003, + "step": 39350 + }, + { + "epoch": 0.2340315443905224, + "grad_norm": 1.8118661642074585, + "learning_rate": 4.3542195917526625e-05, + "loss": 4.7851, + "step": 39351 + }, + { + "epoch": 0.23403749167380342, + "grad_norm": 1.4651440382003784, + "learning_rate": 4.354188261017855e-05, + "loss": 4.9024, + "step": 39352 + }, + { + "epoch": 0.2340434389570844, + "grad_norm": 1.6591970920562744, + "learning_rate": 4.3541569296357684e-05, + "loss": 4.523, + "step": 39353 + }, + { + "epoch": 0.2340493862403654, + "grad_norm": 1.9845739603042603, + "learning_rate": 4.354125597606415e-05, + "loss": 4.6265, + "step": 39354 + }, + { + "epoch": 0.2340553335236464, + "grad_norm": 1.547351360321045, + "learning_rate": 4.354094264929807e-05, + "loss": 4.7699, + "step": 39355 + }, + { + "epoch": 0.2340612808069274, + "grad_norm": 1.6275198459625244, + "learning_rate": 4.354062931605952e-05, + "loss": 4.8717, + "step": 39356 + }, + { + "epoch": 0.23406722809020838, + "grad_norm": 1.460517168045044, + "learning_rate": 4.354031597634864e-05, + "loss": 4.6653, + "step": 39357 + }, + { + "epoch": 0.2340731753734894, + "grad_norm": 1.3596324920654297, + "learning_rate": 4.354000263016552e-05, + "loss": 4.8387, + "step": 39358 + }, + { + "epoch": 0.2340791226567704, + "grad_norm": 1.639746904373169, + "learning_rate": 4.353968927751029e-05, + "loss": 4.5672, + "step": 39359 + }, + { + "epoch": 0.23408506994005138, + "grad_norm": 1.4255754947662354, + "learning_rate": 4.353937591838303e-05, + "loss": 4.6677, + "step": 39360 + }, + { + "epoch": 0.2340910172233324, + "grad_norm": 1.8433109521865845, + "learning_rate": 4.353906255278387e-05, + "loss": 4.4957, + "step": 39361 + }, + { + "epoch": 0.23409696450661338, + "grad_norm": 3.0114545822143555, + "learning_rate": 4.353874918071292e-05, + "loss": 4.411, + "step": 39362 + }, + { + "epoch": 0.23410291178989437, + "grad_norm": 2.885272264480591, + "learning_rate": 4.353843580217028e-05, + "loss": 4.1936, + "step": 39363 + }, + { + "epoch": 0.23410885907317538, + "grad_norm": 2.6946053504943848, + "learning_rate": 4.3538122417156065e-05, + "loss": 4.0882, + "step": 39364 + }, + { + "epoch": 0.23411480635645637, + "grad_norm": 1.7744818925857544, + "learning_rate": 4.3537809025670384e-05, + "loss": 5.003, + "step": 39365 + }, + { + "epoch": 0.23412075363973736, + "grad_norm": 1.6615489721298218, + "learning_rate": 4.353749562771334e-05, + "loss": 5.0962, + "step": 39366 + }, + { + "epoch": 0.23412670092301838, + "grad_norm": 1.8033353090286255, + "learning_rate": 4.353718222328506e-05, + "loss": 4.6759, + "step": 39367 + }, + { + "epoch": 0.23413264820629937, + "grad_norm": 1.5590002536773682, + "learning_rate": 4.3536868812385626e-05, + "loss": 4.7335, + "step": 39368 + }, + { + "epoch": 0.23413859548958035, + "grad_norm": 1.649537444114685, + "learning_rate": 4.3536555395015166e-05, + "loss": 4.7863, + "step": 39369 + }, + { + "epoch": 0.23414454277286137, + "grad_norm": 1.8537696599960327, + "learning_rate": 4.353624197117379e-05, + "loss": 4.89, + "step": 39370 + }, + { + "epoch": 0.23415049005614236, + "grad_norm": 1.4917176961898804, + "learning_rate": 4.35359285408616e-05, + "loss": 5.4885, + "step": 39371 + }, + { + "epoch": 0.23415643733942335, + "grad_norm": 1.3445696830749512, + "learning_rate": 4.353561510407872e-05, + "loss": 5.0474, + "step": 39372 + }, + { + "epoch": 0.23416238462270436, + "grad_norm": 1.8747609853744507, + "learning_rate": 4.3535301660825235e-05, + "loss": 4.9231, + "step": 39373 + }, + { + "epoch": 0.23416833190598535, + "grad_norm": 1.5974386930465698, + "learning_rate": 4.353498821110127e-05, + "loss": 4.4127, + "step": 39374 + }, + { + "epoch": 0.23417427918926634, + "grad_norm": 1.4220181703567505, + "learning_rate": 4.353467475490694e-05, + "loss": 4.4635, + "step": 39375 + }, + { + "epoch": 0.23418022647254735, + "grad_norm": 1.5561754703521729, + "learning_rate": 4.353436129224233e-05, + "loss": 4.7118, + "step": 39376 + }, + { + "epoch": 0.23418617375582834, + "grad_norm": 1.7198326587677002, + "learning_rate": 4.353404782310758e-05, + "loss": 4.9206, + "step": 39377 + }, + { + "epoch": 0.23419212103910933, + "grad_norm": 1.6239429712295532, + "learning_rate": 4.353373434750279e-05, + "loss": 4.9794, + "step": 39378 + }, + { + "epoch": 0.23419806832239035, + "grad_norm": 1.6422456502914429, + "learning_rate": 4.353342086542806e-05, + "loss": 5.5298, + "step": 39379 + }, + { + "epoch": 0.23420401560567133, + "grad_norm": 1.455367922782898, + "learning_rate": 4.353310737688349e-05, + "loss": 5.1242, + "step": 39380 + }, + { + "epoch": 0.23420996288895232, + "grad_norm": 1.3635815382003784, + "learning_rate": 4.3532793881869215e-05, + "loss": 5.1464, + "step": 39381 + }, + { + "epoch": 0.2342159101722333, + "grad_norm": 1.425964593887329, + "learning_rate": 4.3532480380385335e-05, + "loss": 5.1384, + "step": 39382 + }, + { + "epoch": 0.23422185745551433, + "grad_norm": 1.7653659582138062, + "learning_rate": 4.353216687243196e-05, + "loss": 5.2893, + "step": 39383 + }, + { + "epoch": 0.23422780473879531, + "grad_norm": 1.5686538219451904, + "learning_rate": 4.353185335800919e-05, + "loss": 4.9431, + "step": 39384 + }, + { + "epoch": 0.2342337520220763, + "grad_norm": 1.7350513935089111, + "learning_rate": 4.353153983711714e-05, + "loss": 5.0707, + "step": 39385 + }, + { + "epoch": 0.23423969930535732, + "grad_norm": 1.449817419052124, + "learning_rate": 4.353122630975593e-05, + "loss": 5.1242, + "step": 39386 + }, + { + "epoch": 0.2342456465886383, + "grad_norm": 1.4347769021987915, + "learning_rate": 4.3530912775925656e-05, + "loss": 4.9507, + "step": 39387 + }, + { + "epoch": 0.2342515938719193, + "grad_norm": 1.5073580741882324, + "learning_rate": 4.353059923562643e-05, + "loss": 4.9784, + "step": 39388 + }, + { + "epoch": 0.2342575411552003, + "grad_norm": 1.5459281206130981, + "learning_rate": 4.3530285688858365e-05, + "loss": 5.2701, + "step": 39389 + }, + { + "epoch": 0.2342634884384813, + "grad_norm": 1.6322553157806396, + "learning_rate": 4.3529972135621564e-05, + "loss": 5.5123, + "step": 39390 + }, + { + "epoch": 0.2342694357217623, + "grad_norm": 1.8771675825119019, + "learning_rate": 4.352965857591614e-05, + "loss": 4.513, + "step": 39391 + }, + { + "epoch": 0.2342753830050433, + "grad_norm": 1.77946138381958, + "learning_rate": 4.352934500974222e-05, + "loss": 4.8122, + "step": 39392 + }, + { + "epoch": 0.2342813302883243, + "grad_norm": 1.5952132940292358, + "learning_rate": 4.352903143709988e-05, + "loss": 4.9388, + "step": 39393 + }, + { + "epoch": 0.23428727757160528, + "grad_norm": 1.7433884143829346, + "learning_rate": 4.352871785798925e-05, + "loss": 4.7188, + "step": 39394 + }, + { + "epoch": 0.2342932248548863, + "grad_norm": 1.8550909757614136, + "learning_rate": 4.3528404272410443e-05, + "loss": 4.8925, + "step": 39395 + }, + { + "epoch": 0.23429917213816728, + "grad_norm": 1.7110971212387085, + "learning_rate": 4.3528090680363555e-05, + "loss": 4.9685, + "step": 39396 + }, + { + "epoch": 0.23430511942144827, + "grad_norm": 1.8549284934997559, + "learning_rate": 4.352777708184871e-05, + "loss": 4.728, + "step": 39397 + }, + { + "epoch": 0.2343110667047293, + "grad_norm": 1.521715521812439, + "learning_rate": 4.3527463476865995e-05, + "loss": 4.41, + "step": 39398 + }, + { + "epoch": 0.23431701398801028, + "grad_norm": 1.7038367986679077, + "learning_rate": 4.352714986541555e-05, + "loss": 4.301, + "step": 39399 + }, + { + "epoch": 0.23432296127129126, + "grad_norm": 1.6658827066421509, + "learning_rate": 4.352683624749746e-05, + "loss": 5.3613, + "step": 39400 + }, + { + "epoch": 0.23432890855457228, + "grad_norm": 1.850461483001709, + "learning_rate": 4.3526522623111843e-05, + "loss": 5.0851, + "step": 39401 + }, + { + "epoch": 0.23433485583785327, + "grad_norm": 1.600386142730713, + "learning_rate": 4.352620899225881e-05, + "loss": 4.8299, + "step": 39402 + }, + { + "epoch": 0.23434080312113426, + "grad_norm": 1.6664706468582153, + "learning_rate": 4.352589535493846e-05, + "loss": 5.2305, + "step": 39403 + }, + { + "epoch": 0.23434675040441527, + "grad_norm": 1.576553225517273, + "learning_rate": 4.352558171115092e-05, + "loss": 5.5279, + "step": 39404 + }, + { + "epoch": 0.23435269768769626, + "grad_norm": 1.6637064218521118, + "learning_rate": 4.3525268060896296e-05, + "loss": 5.5165, + "step": 39405 + }, + { + "epoch": 0.23435864497097725, + "grad_norm": 1.6022828817367554, + "learning_rate": 4.352495440417468e-05, + "loss": 4.6025, + "step": 39406 + }, + { + "epoch": 0.23436459225425826, + "grad_norm": 2.1038920879364014, + "learning_rate": 4.352464074098621e-05, + "loss": 4.2571, + "step": 39407 + }, + { + "epoch": 0.23437053953753925, + "grad_norm": 1.9733129739761353, + "learning_rate": 4.352432707133096e-05, + "loss": 5.0075, + "step": 39408 + }, + { + "epoch": 0.23437648682082024, + "grad_norm": 3.435161590576172, + "learning_rate": 4.3524013395209074e-05, + "loss": 4.0373, + "step": 39409 + }, + { + "epoch": 0.23438243410410126, + "grad_norm": 3.5970253944396973, + "learning_rate": 4.352369971262064e-05, + "loss": 3.7217, + "step": 39410 + }, + { + "epoch": 0.23438838138738224, + "grad_norm": 3.314128875732422, + "learning_rate": 4.352338602356577e-05, + "loss": 3.5539, + "step": 39411 + }, + { + "epoch": 0.23439432867066323, + "grad_norm": 1.9025384187698364, + "learning_rate": 4.352307232804459e-05, + "loss": 4.6692, + "step": 39412 + }, + { + "epoch": 0.23440027595394425, + "grad_norm": 1.7677268981933594, + "learning_rate": 4.3522758626057184e-05, + "loss": 5.0206, + "step": 39413 + }, + { + "epoch": 0.23440622323722524, + "grad_norm": 1.8457081317901611, + "learning_rate": 4.3522444917603676e-05, + "loss": 4.7544, + "step": 39414 + }, + { + "epoch": 0.23441217052050622, + "grad_norm": 2.1236960887908936, + "learning_rate": 4.352213120268418e-05, + "loss": 3.549, + "step": 39415 + }, + { + "epoch": 0.23441811780378724, + "grad_norm": 2.5540764331817627, + "learning_rate": 4.35218174812988e-05, + "loss": 3.3407, + "step": 39416 + }, + { + "epoch": 0.23442406508706823, + "grad_norm": 2.1707522869110107, + "learning_rate": 4.352150375344763e-05, + "loss": 3.3798, + "step": 39417 + }, + { + "epoch": 0.23443001237034922, + "grad_norm": 1.5410895347595215, + "learning_rate": 4.352119001913081e-05, + "loss": 3.7162, + "step": 39418 + }, + { + "epoch": 0.23443595965363023, + "grad_norm": 1.5335618257522583, + "learning_rate": 4.352087627834843e-05, + "loss": 4.6438, + "step": 39419 + }, + { + "epoch": 0.23444190693691122, + "grad_norm": 1.642561435699463, + "learning_rate": 4.352056253110061e-05, + "loss": 4.8075, + "step": 39420 + }, + { + "epoch": 0.2344478542201922, + "grad_norm": 1.8755661249160767, + "learning_rate": 4.352024877738744e-05, + "loss": 4.8355, + "step": 39421 + }, + { + "epoch": 0.23445380150347322, + "grad_norm": 1.9932796955108643, + "learning_rate": 4.3519935017209045e-05, + "loss": 4.8492, + "step": 39422 + }, + { + "epoch": 0.2344597487867542, + "grad_norm": 1.5432254076004028, + "learning_rate": 4.351962125056553e-05, + "loss": 4.8187, + "step": 39423 + }, + { + "epoch": 0.2344656960700352, + "grad_norm": 1.849226951599121, + "learning_rate": 4.351930747745702e-05, + "loss": 4.3849, + "step": 39424 + }, + { + "epoch": 0.23447164335331622, + "grad_norm": 1.8147178888320923, + "learning_rate": 4.35189936978836e-05, + "loss": 4.4224, + "step": 39425 + }, + { + "epoch": 0.2344775906365972, + "grad_norm": 1.742662787437439, + "learning_rate": 4.35186799118454e-05, + "loss": 4.2055, + "step": 39426 + }, + { + "epoch": 0.2344835379198782, + "grad_norm": 1.741877794265747, + "learning_rate": 4.3518366119342504e-05, + "loss": 4.2056, + "step": 39427 + }, + { + "epoch": 0.2344894852031592, + "grad_norm": 1.7838786840438843, + "learning_rate": 4.3518052320375055e-05, + "loss": 5.4981, + "step": 39428 + }, + { + "epoch": 0.2344954324864402, + "grad_norm": 1.5131925344467163, + "learning_rate": 4.351773851494313e-05, + "loss": 5.1125, + "step": 39429 + }, + { + "epoch": 0.23450137976972119, + "grad_norm": 1.7058484554290771, + "learning_rate": 4.3517424703046864e-05, + "loss": 4.7788, + "step": 39430 + }, + { + "epoch": 0.2345073270530022, + "grad_norm": 1.871307134628296, + "learning_rate": 4.351711088468635e-05, + "loss": 4.5781, + "step": 39431 + }, + { + "epoch": 0.2345132743362832, + "grad_norm": 1.476925253868103, + "learning_rate": 4.351679705986171e-05, + "loss": 4.7101, + "step": 39432 + }, + { + "epoch": 0.23451922161956418, + "grad_norm": 1.629453182220459, + "learning_rate": 4.351648322857304e-05, + "loss": 4.4409, + "step": 39433 + }, + { + "epoch": 0.2345251689028452, + "grad_norm": 1.6458929777145386, + "learning_rate": 4.351616939082047e-05, + "loss": 4.6916, + "step": 39434 + }, + { + "epoch": 0.23453111618612618, + "grad_norm": 1.5822285413742065, + "learning_rate": 4.351585554660409e-05, + "loss": 4.6374, + "step": 39435 + }, + { + "epoch": 0.23453706346940717, + "grad_norm": 1.6070079803466797, + "learning_rate": 4.351554169592401e-05, + "loss": 5.3531, + "step": 39436 + }, + { + "epoch": 0.23454301075268819, + "grad_norm": 1.527276635169983, + "learning_rate": 4.351522783878035e-05, + "loss": 5.5012, + "step": 39437 + }, + { + "epoch": 0.23454895803596917, + "grad_norm": 1.8088719844818115, + "learning_rate": 4.3514913975173224e-05, + "loss": 4.6404, + "step": 39438 + }, + { + "epoch": 0.23455490531925016, + "grad_norm": 1.815889596939087, + "learning_rate": 4.351460010510272e-05, + "loss": 4.4818, + "step": 39439 + }, + { + "epoch": 0.23456085260253115, + "grad_norm": 1.7771681547164917, + "learning_rate": 4.351428622856897e-05, + "loss": 4.3622, + "step": 39440 + }, + { + "epoch": 0.23456679988581217, + "grad_norm": 1.737944483757019, + "learning_rate": 4.351397234557206e-05, + "loss": 4.4178, + "step": 39441 + }, + { + "epoch": 0.23457274716909315, + "grad_norm": 1.5377593040466309, + "learning_rate": 4.3513658456112126e-05, + "loss": 4.4063, + "step": 39442 + }, + { + "epoch": 0.23457869445237414, + "grad_norm": 1.757392168045044, + "learning_rate": 4.351334456018926e-05, + "loss": 4.4072, + "step": 39443 + }, + { + "epoch": 0.23458464173565516, + "grad_norm": 1.8958923816680908, + "learning_rate": 4.3513030657803575e-05, + "loss": 4.4878, + "step": 39444 + }, + { + "epoch": 0.23459058901893615, + "grad_norm": 1.893446683883667, + "learning_rate": 4.351271674895518e-05, + "loss": 4.2177, + "step": 39445 + }, + { + "epoch": 0.23459653630221713, + "grad_norm": 1.8133530616760254, + "learning_rate": 4.35124028336442e-05, + "loss": 4.31, + "step": 39446 + }, + { + "epoch": 0.23460248358549815, + "grad_norm": 1.6795728206634521, + "learning_rate": 4.3512088911870724e-05, + "loss": 4.4789, + "step": 39447 + }, + { + "epoch": 0.23460843086877914, + "grad_norm": 1.7759149074554443, + "learning_rate": 4.351177498363487e-05, + "loss": 4.8038, + "step": 39448 + }, + { + "epoch": 0.23461437815206013, + "grad_norm": 1.7960399389266968, + "learning_rate": 4.3511461048936744e-05, + "loss": 4.9761, + "step": 39449 + }, + { + "epoch": 0.23462032543534114, + "grad_norm": 1.7421083450317383, + "learning_rate": 4.3511147107776465e-05, + "loss": 4.2165, + "step": 39450 + }, + { + "epoch": 0.23462627271862213, + "grad_norm": 1.9104562997817993, + "learning_rate": 4.3510833160154124e-05, + "loss": 4.3507, + "step": 39451 + }, + { + "epoch": 0.23463222000190312, + "grad_norm": 1.6309458017349243, + "learning_rate": 4.351051920606985e-05, + "loss": 5.0622, + "step": 39452 + }, + { + "epoch": 0.23463816728518413, + "grad_norm": 1.7124251127243042, + "learning_rate": 4.3510205245523744e-05, + "loss": 4.9691, + "step": 39453 + }, + { + "epoch": 0.23464411456846512, + "grad_norm": 1.589788556098938, + "learning_rate": 4.3509891278515916e-05, + "loss": 5.0264, + "step": 39454 + }, + { + "epoch": 0.2346500618517461, + "grad_norm": 1.7935584783554077, + "learning_rate": 4.350957730504648e-05, + "loss": 4.9153, + "step": 39455 + }, + { + "epoch": 0.23465600913502713, + "grad_norm": 1.4762649536132812, + "learning_rate": 4.350926332511554e-05, + "loss": 4.9941, + "step": 39456 + }, + { + "epoch": 0.23466195641830812, + "grad_norm": 1.5399127006530762, + "learning_rate": 4.35089493387232e-05, + "loss": 4.8539, + "step": 39457 + }, + { + "epoch": 0.2346679037015891, + "grad_norm": 1.581641674041748, + "learning_rate": 4.350863534586958e-05, + "loss": 4.8709, + "step": 39458 + }, + { + "epoch": 0.23467385098487012, + "grad_norm": 1.7906439304351807, + "learning_rate": 4.350832134655479e-05, + "loss": 4.6924, + "step": 39459 + }, + { + "epoch": 0.2346797982681511, + "grad_norm": 1.7961326837539673, + "learning_rate": 4.3508007340778936e-05, + "loss": 4.7806, + "step": 39460 + }, + { + "epoch": 0.2346857455514321, + "grad_norm": 1.387488842010498, + "learning_rate": 4.350769332854212e-05, + "loss": 5.3746, + "step": 39461 + }, + { + "epoch": 0.2346916928347131, + "grad_norm": 1.3593922853469849, + "learning_rate": 4.3507379309844475e-05, + "loss": 5.0035, + "step": 39462 + }, + { + "epoch": 0.2346976401179941, + "grad_norm": 1.7202558517456055, + "learning_rate": 4.3507065284686086e-05, + "loss": 5.0444, + "step": 39463 + }, + { + "epoch": 0.2347035874012751, + "grad_norm": 1.5007917881011963, + "learning_rate": 4.3506751253067066e-05, + "loss": 4.7126, + "step": 39464 + }, + { + "epoch": 0.2347095346845561, + "grad_norm": 1.666621208190918, + "learning_rate": 4.350643721498754e-05, + "loss": 4.4873, + "step": 39465 + }, + { + "epoch": 0.2347154819678371, + "grad_norm": 2.67529296875, + "learning_rate": 4.350612317044761e-05, + "loss": 4.3927, + "step": 39466 + }, + { + "epoch": 0.23472142925111808, + "grad_norm": 2.530919313430786, + "learning_rate": 4.350580911944737e-05, + "loss": 3.8997, + "step": 39467 + }, + { + "epoch": 0.2347273765343991, + "grad_norm": 2.31793212890625, + "learning_rate": 4.3505495061986954e-05, + "loss": 4.007, + "step": 39468 + }, + { + "epoch": 0.23473332381768008, + "grad_norm": 1.7136186361312866, + "learning_rate": 4.350518099806646e-05, + "loss": 5.1124, + "step": 39469 + }, + { + "epoch": 0.23473927110096107, + "grad_norm": 1.7256274223327637, + "learning_rate": 4.3504866927686e-05, + "loss": 3.8912, + "step": 39470 + }, + { + "epoch": 0.2347452183842421, + "grad_norm": 1.7337493896484375, + "learning_rate": 4.350455285084568e-05, + "loss": 4.2857, + "step": 39471 + }, + { + "epoch": 0.23475116566752308, + "grad_norm": 1.6132410764694214, + "learning_rate": 4.350423876754561e-05, + "loss": 4.9954, + "step": 39472 + }, + { + "epoch": 0.23475711295080406, + "grad_norm": 1.4260412454605103, + "learning_rate": 4.35039246777859e-05, + "loss": 5.1, + "step": 39473 + }, + { + "epoch": 0.23476306023408508, + "grad_norm": 1.6036821603775024, + "learning_rate": 4.3503610581566664e-05, + "loss": 5.2813, + "step": 39474 + }, + { + "epoch": 0.23476900751736607, + "grad_norm": 1.385947823524475, + "learning_rate": 4.350329647888801e-05, + "loss": 4.882, + "step": 39475 + }, + { + "epoch": 0.23477495480064706, + "grad_norm": 1.7176967859268188, + "learning_rate": 4.3502982369750045e-05, + "loss": 4.4356, + "step": 39476 + }, + { + "epoch": 0.23478090208392807, + "grad_norm": 1.7131975889205933, + "learning_rate": 4.350266825415288e-05, + "loss": 4.4235, + "step": 39477 + }, + { + "epoch": 0.23478684936720906, + "grad_norm": 1.6809394359588623, + "learning_rate": 4.3502354132096624e-05, + "loss": 4.7656, + "step": 39478 + }, + { + "epoch": 0.23479279665049005, + "grad_norm": 1.5170011520385742, + "learning_rate": 4.3502040003581385e-05, + "loss": 5.1865, + "step": 39479 + }, + { + "epoch": 0.23479874393377106, + "grad_norm": 1.4915732145309448, + "learning_rate": 4.350172586860728e-05, + "loss": 5.6408, + "step": 39480 + }, + { + "epoch": 0.23480469121705205, + "grad_norm": 1.4026539325714111, + "learning_rate": 4.3501411727174404e-05, + "loss": 5.6604, + "step": 39481 + }, + { + "epoch": 0.23481063850033304, + "grad_norm": 1.4482790231704712, + "learning_rate": 4.350109757928289e-05, + "loss": 4.9783, + "step": 39482 + }, + { + "epoch": 0.23481658578361406, + "grad_norm": 1.2592450380325317, + "learning_rate": 4.350078342493282e-05, + "loss": 4.7902, + "step": 39483 + }, + { + "epoch": 0.23482253306689505, + "grad_norm": 1.7065273523330688, + "learning_rate": 4.350046926412433e-05, + "loss": 4.4604, + "step": 39484 + }, + { + "epoch": 0.23482848035017603, + "grad_norm": 2.8723208904266357, + "learning_rate": 4.350015509685752e-05, + "loss": 3.297, + "step": 39485 + }, + { + "epoch": 0.23483442763345705, + "grad_norm": 2.372694253921509, + "learning_rate": 4.3499840923132484e-05, + "loss": 3.8019, + "step": 39486 + }, + { + "epoch": 0.23484037491673804, + "grad_norm": 1.394163727760315, + "learning_rate": 4.3499526742949346e-05, + "loss": 5.1203, + "step": 39487 + }, + { + "epoch": 0.23484632220001903, + "grad_norm": 1.5372754335403442, + "learning_rate": 4.349921255630822e-05, + "loss": 4.6551, + "step": 39488 + }, + { + "epoch": 0.23485226948330004, + "grad_norm": 1.8942519426345825, + "learning_rate": 4.349889836320921e-05, + "loss": 3.5038, + "step": 39489 + }, + { + "epoch": 0.23485821676658103, + "grad_norm": 2.2233126163482666, + "learning_rate": 4.3498584163652424e-05, + "loss": 3.0863, + "step": 39490 + }, + { + "epoch": 0.23486416404986202, + "grad_norm": 2.37809419631958, + "learning_rate": 4.349826995763797e-05, + "loss": 3.0556, + "step": 39491 + }, + { + "epoch": 0.23487011133314303, + "grad_norm": 2.368769645690918, + "learning_rate": 4.3497955745165966e-05, + "loss": 2.8985, + "step": 39492 + }, + { + "epoch": 0.23487605861642402, + "grad_norm": 1.9883050918579102, + "learning_rate": 4.349764152623652e-05, + "loss": 3.425, + "step": 39493 + }, + { + "epoch": 0.234882005899705, + "grad_norm": 1.586185097694397, + "learning_rate": 4.349732730084973e-05, + "loss": 5.1936, + "step": 39494 + }, + { + "epoch": 0.23488795318298603, + "grad_norm": 2.2893435955047607, + "learning_rate": 4.3497013069005724e-05, + "loss": 2.8021, + "step": 39495 + }, + { + "epoch": 0.234893900466267, + "grad_norm": 2.5505330562591553, + "learning_rate": 4.349669883070459e-05, + "loss": 2.9212, + "step": 39496 + }, + { + "epoch": 0.234899847749548, + "grad_norm": 2.5210187435150146, + "learning_rate": 4.3496384585946455e-05, + "loss": 2.9037, + "step": 39497 + }, + { + "epoch": 0.234905795032829, + "grad_norm": 2.4007105827331543, + "learning_rate": 4.3496070334731425e-05, + "loss": 2.7009, + "step": 39498 + }, + { + "epoch": 0.23491174231611, + "grad_norm": 2.381126642227173, + "learning_rate": 4.34957560770596e-05, + "loss": 2.5355, + "step": 39499 + }, + { + "epoch": 0.234917689599391, + "grad_norm": 2.500800848007202, + "learning_rate": 4.349544181293111e-05, + "loss": 2.6994, + "step": 39500 + }, + { + "epoch": 0.23492363688267198, + "grad_norm": 2.550818681716919, + "learning_rate": 4.3495127542346045e-05, + "loss": 2.7628, + "step": 39501 + }, + { + "epoch": 0.234929584165953, + "grad_norm": 2.438868522644043, + "learning_rate": 4.349481326530453e-05, + "loss": 2.7773, + "step": 39502 + }, + { + "epoch": 0.234935531449234, + "grad_norm": 2.3863131999969482, + "learning_rate": 4.349449898180665e-05, + "loss": 2.565, + "step": 39503 + }, + { + "epoch": 0.23494147873251497, + "grad_norm": 2.595072031021118, + "learning_rate": 4.3494184691852544e-05, + "loss": 2.6882, + "step": 39504 + }, + { + "epoch": 0.234947426015796, + "grad_norm": 2.2971251010894775, + "learning_rate": 4.349387039544231e-05, + "loss": 2.635, + "step": 39505 + }, + { + "epoch": 0.23495337329907698, + "grad_norm": 2.3225739002227783, + "learning_rate": 4.349355609257605e-05, + "loss": 3.3878, + "step": 39506 + }, + { + "epoch": 0.23495932058235797, + "grad_norm": 2.412611722946167, + "learning_rate": 4.349324178325389e-05, + "loss": 3.0068, + "step": 39507 + }, + { + "epoch": 0.23496526786563898, + "grad_norm": 1.950431227684021, + "learning_rate": 4.3492927467475924e-05, + "loss": 3.9725, + "step": 39508 + }, + { + "epoch": 0.23497121514891997, + "grad_norm": 1.5830776691436768, + "learning_rate": 4.3492613145242264e-05, + "loss": 5.1513, + "step": 39509 + }, + { + "epoch": 0.23497716243220096, + "grad_norm": 1.5284711122512817, + "learning_rate": 4.3492298816553034e-05, + "loss": 5.1141, + "step": 39510 + }, + { + "epoch": 0.23498310971548197, + "grad_norm": 2.3426167964935303, + "learning_rate": 4.349198448140833e-05, + "loss": 2.8968, + "step": 39511 + }, + { + "epoch": 0.23498905699876296, + "grad_norm": 1.8177212476730347, + "learning_rate": 4.3491670139808263e-05, + "loss": 5.201, + "step": 39512 + }, + { + "epoch": 0.23499500428204395, + "grad_norm": 1.8724275827407837, + "learning_rate": 4.349135579175294e-05, + "loss": 5.1527, + "step": 39513 + }, + { + "epoch": 0.23500095156532497, + "grad_norm": 1.576641321182251, + "learning_rate": 4.3491041437242486e-05, + "loss": 5.0082, + "step": 39514 + }, + { + "epoch": 0.23500689884860596, + "grad_norm": 1.6473275423049927, + "learning_rate": 4.349072707627699e-05, + "loss": 5.0796, + "step": 39515 + }, + { + "epoch": 0.23501284613188694, + "grad_norm": 1.5497204065322876, + "learning_rate": 4.3490412708856584e-05, + "loss": 4.8808, + "step": 39516 + }, + { + "epoch": 0.23501879341516796, + "grad_norm": 1.9269403219223022, + "learning_rate": 4.349009833498136e-05, + "loss": 5.0166, + "step": 39517 + }, + { + "epoch": 0.23502474069844895, + "grad_norm": 1.795958161354065, + "learning_rate": 4.348978395465143e-05, + "loss": 4.9874, + "step": 39518 + }, + { + "epoch": 0.23503068798172994, + "grad_norm": 1.7254250049591064, + "learning_rate": 4.348946956786691e-05, + "loss": 4.793, + "step": 39519 + }, + { + "epoch": 0.23503663526501095, + "grad_norm": 1.8075964450836182, + "learning_rate": 4.348915517462791e-05, + "loss": 4.9552, + "step": 39520 + }, + { + "epoch": 0.23504258254829194, + "grad_norm": 1.6607565879821777, + "learning_rate": 4.348884077493454e-05, + "loss": 4.9673, + "step": 39521 + }, + { + "epoch": 0.23504852983157293, + "grad_norm": 2.0513784885406494, + "learning_rate": 4.3488526368786897e-05, + "loss": 4.9962, + "step": 39522 + }, + { + "epoch": 0.23505447711485394, + "grad_norm": 1.448609709739685, + "learning_rate": 4.348821195618511e-05, + "loss": 5.2576, + "step": 39523 + }, + { + "epoch": 0.23506042439813493, + "grad_norm": 1.7218849658966064, + "learning_rate": 4.3487897537129266e-05, + "loss": 5.0945, + "step": 39524 + }, + { + "epoch": 0.23506637168141592, + "grad_norm": 1.8158671855926514, + "learning_rate": 4.34875831116195e-05, + "loss": 5.1053, + "step": 39525 + }, + { + "epoch": 0.23507231896469694, + "grad_norm": 1.6465450525283813, + "learning_rate": 4.348726867965591e-05, + "loss": 4.9159, + "step": 39526 + }, + { + "epoch": 0.23507826624797792, + "grad_norm": 1.6938047409057617, + "learning_rate": 4.3486954241238595e-05, + "loss": 5.0443, + "step": 39527 + }, + { + "epoch": 0.2350842135312589, + "grad_norm": 1.5726323127746582, + "learning_rate": 4.348663979636768e-05, + "loss": 4.7019, + "step": 39528 + }, + { + "epoch": 0.23509016081453993, + "grad_norm": 1.506906509399414, + "learning_rate": 4.3486325345043275e-05, + "loss": 5.5914, + "step": 39529 + }, + { + "epoch": 0.23509610809782092, + "grad_norm": 1.6261730194091797, + "learning_rate": 4.3486010887265485e-05, + "loss": 5.9064, + "step": 39530 + }, + { + "epoch": 0.2351020553811019, + "grad_norm": 1.477356195449829, + "learning_rate": 4.3485696423034415e-05, + "loss": 5.748, + "step": 39531 + }, + { + "epoch": 0.23510800266438292, + "grad_norm": 1.7038002014160156, + "learning_rate": 4.348538195235018e-05, + "loss": 5.2618, + "step": 39532 + }, + { + "epoch": 0.2351139499476639, + "grad_norm": 1.9998583793640137, + "learning_rate": 4.348506747521289e-05, + "loss": 4.4769, + "step": 39533 + }, + { + "epoch": 0.2351198972309449, + "grad_norm": 1.7400646209716797, + "learning_rate": 4.348475299162266e-05, + "loss": 4.6152, + "step": 39534 + }, + { + "epoch": 0.2351258445142259, + "grad_norm": 1.4792178869247437, + "learning_rate": 4.348443850157958e-05, + "loss": 4.9018, + "step": 39535 + }, + { + "epoch": 0.2351317917975069, + "grad_norm": 1.8028171062469482, + "learning_rate": 4.348412400508378e-05, + "loss": 4.9801, + "step": 39536 + }, + { + "epoch": 0.2351377390807879, + "grad_norm": 1.7561883926391602, + "learning_rate": 4.3483809502135365e-05, + "loss": 4.9372, + "step": 39537 + }, + { + "epoch": 0.2351436863640689, + "grad_norm": 2.0931129455566406, + "learning_rate": 4.348349499273444e-05, + "loss": 4.8422, + "step": 39538 + }, + { + "epoch": 0.2351496336473499, + "grad_norm": 2.22172474861145, + "learning_rate": 4.3483180476881124e-05, + "loss": 3.7755, + "step": 39539 + }, + { + "epoch": 0.23515558093063088, + "grad_norm": 2.2393369674682617, + "learning_rate": 4.348286595457552e-05, + "loss": 3.6435, + "step": 39540 + }, + { + "epoch": 0.2351615282139119, + "grad_norm": 2.1963789463043213, + "learning_rate": 4.3482551425817735e-05, + "loss": 4.0413, + "step": 39541 + }, + { + "epoch": 0.23516747549719288, + "grad_norm": 1.6470162868499756, + "learning_rate": 4.348223689060788e-05, + "loss": 5.1944, + "step": 39542 + }, + { + "epoch": 0.23517342278047387, + "grad_norm": 1.440127968788147, + "learning_rate": 4.3481922348946066e-05, + "loss": 5.0694, + "step": 39543 + }, + { + "epoch": 0.2351793700637549, + "grad_norm": 1.7956537008285522, + "learning_rate": 4.3481607800832416e-05, + "loss": 5.466, + "step": 39544 + }, + { + "epoch": 0.23518531734703588, + "grad_norm": 1.5988211631774902, + "learning_rate": 4.348129324626702e-05, + "loss": 5.3832, + "step": 39545 + }, + { + "epoch": 0.23519126463031687, + "grad_norm": 1.7652308940887451, + "learning_rate": 4.348097868524999e-05, + "loss": 5.1408, + "step": 39546 + }, + { + "epoch": 0.23519721191359788, + "grad_norm": 1.96556556224823, + "learning_rate": 4.348066411778144e-05, + "loss": 5.1057, + "step": 39547 + }, + { + "epoch": 0.23520315919687887, + "grad_norm": 1.6262363195419312, + "learning_rate": 4.348034954386149e-05, + "loss": 5.1536, + "step": 39548 + }, + { + "epoch": 0.23520910648015986, + "grad_norm": 1.9983383417129517, + "learning_rate": 4.3480034963490244e-05, + "loss": 4.4115, + "step": 39549 + }, + { + "epoch": 0.23521505376344087, + "grad_norm": 1.8249067068099976, + "learning_rate": 4.34797203766678e-05, + "loss": 4.7155, + "step": 39550 + }, + { + "epoch": 0.23522100104672186, + "grad_norm": 1.874074935913086, + "learning_rate": 4.347940578339428e-05, + "loss": 5.1159, + "step": 39551 + }, + { + "epoch": 0.23522694833000285, + "grad_norm": 1.861391305923462, + "learning_rate": 4.347909118366978e-05, + "loss": 4.6431, + "step": 39552 + }, + { + "epoch": 0.23523289561328387, + "grad_norm": 2.319910764694214, + "learning_rate": 4.347877657749444e-05, + "loss": 3.926, + "step": 39553 + }, + { + "epoch": 0.23523884289656485, + "grad_norm": 2.0125155448913574, + "learning_rate": 4.3478461964868336e-05, + "loss": 4.3153, + "step": 39554 + }, + { + "epoch": 0.23524479017984584, + "grad_norm": 2.236830234527588, + "learning_rate": 4.34781473457916e-05, + "loss": 5.1541, + "step": 39555 + }, + { + "epoch": 0.23525073746312683, + "grad_norm": 1.7119864225387573, + "learning_rate": 4.347783272026432e-05, + "loss": 5.0258, + "step": 39556 + }, + { + "epoch": 0.23525668474640785, + "grad_norm": 1.8801114559173584, + "learning_rate": 4.347751808828664e-05, + "loss": 5.0044, + "step": 39557 + }, + { + "epoch": 0.23526263202968883, + "grad_norm": 1.584053874015808, + "learning_rate": 4.347720344985863e-05, + "loss": 5.1926, + "step": 39558 + }, + { + "epoch": 0.23526857931296982, + "grad_norm": 2.160999059677124, + "learning_rate": 4.347688880498043e-05, + "loss": 4.1634, + "step": 39559 + }, + { + "epoch": 0.23527452659625084, + "grad_norm": 1.8408249616622925, + "learning_rate": 4.347657415365214e-05, + "loss": 4.5682, + "step": 39560 + }, + { + "epoch": 0.23528047387953183, + "grad_norm": 2.1002492904663086, + "learning_rate": 4.347625949587386e-05, + "loss": 5.0053, + "step": 39561 + }, + { + "epoch": 0.23528642116281281, + "grad_norm": 1.8429839611053467, + "learning_rate": 4.3475944831645715e-05, + "loss": 4.7472, + "step": 39562 + }, + { + "epoch": 0.23529236844609383, + "grad_norm": 1.9861998558044434, + "learning_rate": 4.3475630160967807e-05, + "loss": 3.7446, + "step": 39563 + }, + { + "epoch": 0.23529831572937482, + "grad_norm": 1.7070415019989014, + "learning_rate": 4.347531548384025e-05, + "loss": 4.8468, + "step": 39564 + }, + { + "epoch": 0.2353042630126558, + "grad_norm": 1.8060168027877808, + "learning_rate": 4.347500080026314e-05, + "loss": 5.0668, + "step": 39565 + }, + { + "epoch": 0.23531021029593682, + "grad_norm": 1.8883711099624634, + "learning_rate": 4.3474686110236616e-05, + "loss": 4.8557, + "step": 39566 + }, + { + "epoch": 0.2353161575792178, + "grad_norm": 2.1879305839538574, + "learning_rate": 4.347437141376076e-05, + "loss": 3.8515, + "step": 39567 + }, + { + "epoch": 0.2353221048624988, + "grad_norm": 1.619714379310608, + "learning_rate": 4.347405671083569e-05, + "loss": 5.1808, + "step": 39568 + }, + { + "epoch": 0.23532805214577981, + "grad_norm": 1.4702348709106445, + "learning_rate": 4.347374200146152e-05, + "loss": 5.0461, + "step": 39569 + }, + { + "epoch": 0.2353339994290608, + "grad_norm": 2.2921080589294434, + "learning_rate": 4.347342728563836e-05, + "loss": 4.0864, + "step": 39570 + }, + { + "epoch": 0.2353399467123418, + "grad_norm": 1.6588904857635498, + "learning_rate": 4.347311256336631e-05, + "loss": 5.0127, + "step": 39571 + }, + { + "epoch": 0.2353458939956228, + "grad_norm": 1.6477138996124268, + "learning_rate": 4.34727978346455e-05, + "loss": 4.7629, + "step": 39572 + }, + { + "epoch": 0.2353518412789038, + "grad_norm": 1.5137696266174316, + "learning_rate": 4.347248309947601e-05, + "loss": 4.9401, + "step": 39573 + }, + { + "epoch": 0.23535778856218478, + "grad_norm": 1.4825645685195923, + "learning_rate": 4.347216835785798e-05, + "loss": 5.9218, + "step": 39574 + }, + { + "epoch": 0.2353637358454658, + "grad_norm": 2.803783655166626, + "learning_rate": 4.3471853609791505e-05, + "loss": 2.9973, + "step": 39575 + }, + { + "epoch": 0.2353696831287468, + "grad_norm": 2.7221133708953857, + "learning_rate": 4.3471538855276695e-05, + "loss": 3.0297, + "step": 39576 + }, + { + "epoch": 0.23537563041202778, + "grad_norm": 2.5941596031188965, + "learning_rate": 4.347122409431367e-05, + "loss": 3.7102, + "step": 39577 + }, + { + "epoch": 0.2353815776953088, + "grad_norm": 2.1006662845611572, + "learning_rate": 4.347090932690252e-05, + "loss": 4.0978, + "step": 39578 + }, + { + "epoch": 0.23538752497858978, + "grad_norm": 2.5507373809814453, + "learning_rate": 4.347059455304337e-05, + "loss": 3.0659, + "step": 39579 + }, + { + "epoch": 0.23539347226187077, + "grad_norm": 2.7012178897857666, + "learning_rate": 4.3470279772736325e-05, + "loss": 2.9277, + "step": 39580 + }, + { + "epoch": 0.23539941954515178, + "grad_norm": 2.677558422088623, + "learning_rate": 4.3469964985981496e-05, + "loss": 2.7663, + "step": 39581 + }, + { + "epoch": 0.23540536682843277, + "grad_norm": 3.0192692279815674, + "learning_rate": 4.346965019277899e-05, + "loss": 2.9613, + "step": 39582 + }, + { + "epoch": 0.23541131411171376, + "grad_norm": 2.792264461517334, + "learning_rate": 4.346933539312893e-05, + "loss": 3.248, + "step": 39583 + }, + { + "epoch": 0.23541726139499478, + "grad_norm": 2.1806282997131348, + "learning_rate": 4.346902058703141e-05, + "loss": 3.8647, + "step": 39584 + }, + { + "epoch": 0.23542320867827576, + "grad_norm": 2.3454999923706055, + "learning_rate": 4.346870577448655e-05, + "loss": 3.9804, + "step": 39585 + }, + { + "epoch": 0.23542915596155675, + "grad_norm": 2.145684003829956, + "learning_rate": 4.346839095549445e-05, + "loss": 4.7823, + "step": 39586 + }, + { + "epoch": 0.23543510324483777, + "grad_norm": 2.0232717990875244, + "learning_rate": 4.346807613005523e-05, + "loss": 4.9594, + "step": 39587 + }, + { + "epoch": 0.23544105052811876, + "grad_norm": 2.0676941871643066, + "learning_rate": 4.3467761298168985e-05, + "loss": 4.2215, + "step": 39588 + }, + { + "epoch": 0.23544699781139974, + "grad_norm": 2.122058391571045, + "learning_rate": 4.346744645983584e-05, + "loss": 3.8307, + "step": 39589 + }, + { + "epoch": 0.23545294509468076, + "grad_norm": 1.8523790836334229, + "learning_rate": 4.3467131615055906e-05, + "loss": 4.9136, + "step": 39590 + }, + { + "epoch": 0.23545889237796175, + "grad_norm": 1.6085901260375977, + "learning_rate": 4.346681676382928e-05, + "loss": 5.096, + "step": 39591 + }, + { + "epoch": 0.23546483966124274, + "grad_norm": 1.5861530303955078, + "learning_rate": 4.346650190615609e-05, + "loss": 4.8874, + "step": 39592 + }, + { + "epoch": 0.23547078694452375, + "grad_norm": 2.204979181289673, + "learning_rate": 4.346618704203642e-05, + "loss": 3.8377, + "step": 39593 + }, + { + "epoch": 0.23547673422780474, + "grad_norm": 2.2924487590789795, + "learning_rate": 4.3465872171470406e-05, + "loss": 3.8936, + "step": 39594 + }, + { + "epoch": 0.23548268151108573, + "grad_norm": 2.0946760177612305, + "learning_rate": 4.346555729445815e-05, + "loss": 3.7599, + "step": 39595 + }, + { + "epoch": 0.23548862879436674, + "grad_norm": 1.7681735754013062, + "learning_rate": 4.346524241099975e-05, + "loss": 5.0385, + "step": 39596 + }, + { + "epoch": 0.23549457607764773, + "grad_norm": 1.4723299741744995, + "learning_rate": 4.3464927521095325e-05, + "loss": 5.105, + "step": 39597 + }, + { + "epoch": 0.23550052336092872, + "grad_norm": 1.6298372745513916, + "learning_rate": 4.346461262474499e-05, + "loss": 5.3164, + "step": 39598 + }, + { + "epoch": 0.23550647064420974, + "grad_norm": 1.8214343786239624, + "learning_rate": 4.346429772194884e-05, + "loss": 4.1696, + "step": 39599 + }, + { + "epoch": 0.23551241792749072, + "grad_norm": 1.6282157897949219, + "learning_rate": 4.3463982812707005e-05, + "loss": 5.2517, + "step": 39600 + }, + { + "epoch": 0.2355183652107717, + "grad_norm": 1.5538759231567383, + "learning_rate": 4.346366789701958e-05, + "loss": 4.8288, + "step": 39601 + }, + { + "epoch": 0.23552431249405273, + "grad_norm": 1.349043846130371, + "learning_rate": 4.346335297488668e-05, + "loss": 5.6924, + "step": 39602 + }, + { + "epoch": 0.23553025977733372, + "grad_norm": 1.605543851852417, + "learning_rate": 4.3463038046308414e-05, + "loss": 4.5651, + "step": 39603 + }, + { + "epoch": 0.2355362070606147, + "grad_norm": 1.511016607284546, + "learning_rate": 4.3462723111284885e-05, + "loss": 5.1926, + "step": 39604 + }, + { + "epoch": 0.23554215434389572, + "grad_norm": 1.6200916767120361, + "learning_rate": 4.346240816981622e-05, + "loss": 5.1843, + "step": 39605 + }, + { + "epoch": 0.2355481016271767, + "grad_norm": 1.439429521560669, + "learning_rate": 4.346209322190252e-05, + "loss": 4.5373, + "step": 39606 + }, + { + "epoch": 0.2355540489104577, + "grad_norm": 1.8972511291503906, + "learning_rate": 4.3461778267543885e-05, + "loss": 4.389, + "step": 39607 + }, + { + "epoch": 0.2355599961937387, + "grad_norm": 1.8233554363250732, + "learning_rate": 4.346146330674044e-05, + "loss": 4.141, + "step": 39608 + }, + { + "epoch": 0.2355659434770197, + "grad_norm": 1.9290480613708496, + "learning_rate": 4.3461148339492283e-05, + "loss": 4.7057, + "step": 39609 + }, + { + "epoch": 0.2355718907603007, + "grad_norm": 1.6103575229644775, + "learning_rate": 4.346083336579953e-05, + "loss": 5.2774, + "step": 39610 + }, + { + "epoch": 0.2355778380435817, + "grad_norm": 1.5069211721420288, + "learning_rate": 4.3460518385662305e-05, + "loss": 5.0212, + "step": 39611 + }, + { + "epoch": 0.2355837853268627, + "grad_norm": 1.6505247354507446, + "learning_rate": 4.3460203399080694e-05, + "loss": 4.371, + "step": 39612 + }, + { + "epoch": 0.23558973261014368, + "grad_norm": 1.5396970510482788, + "learning_rate": 4.3459888406054816e-05, + "loss": 4.3237, + "step": 39613 + }, + { + "epoch": 0.23559567989342467, + "grad_norm": 1.6712186336517334, + "learning_rate": 4.3459573406584786e-05, + "loss": 4.3366, + "step": 39614 + }, + { + "epoch": 0.23560162717670569, + "grad_norm": 1.9782822132110596, + "learning_rate": 4.3459258400670705e-05, + "loss": 4.6864, + "step": 39615 + }, + { + "epoch": 0.23560757445998667, + "grad_norm": 2.00964617729187, + "learning_rate": 4.345894338831269e-05, + "loss": 4.5942, + "step": 39616 + }, + { + "epoch": 0.23561352174326766, + "grad_norm": 1.8180450201034546, + "learning_rate": 4.3458628369510846e-05, + "loss": 4.5192, + "step": 39617 + }, + { + "epoch": 0.23561946902654868, + "grad_norm": 1.7940545082092285, + "learning_rate": 4.345831334426529e-05, + "loss": 4.8344, + "step": 39618 + }, + { + "epoch": 0.23562541630982967, + "grad_norm": 1.7346389293670654, + "learning_rate": 4.345799831257612e-05, + "loss": 4.4739, + "step": 39619 + }, + { + "epoch": 0.23563136359311065, + "grad_norm": 1.5963873863220215, + "learning_rate": 4.345768327444346e-05, + "loss": 5.6441, + "step": 39620 + }, + { + "epoch": 0.23563731087639167, + "grad_norm": 1.9023709297180176, + "learning_rate": 4.345736822986741e-05, + "loss": 4.774, + "step": 39621 + }, + { + "epoch": 0.23564325815967266, + "grad_norm": 1.6928259134292603, + "learning_rate": 4.3457053178848085e-05, + "loss": 4.508, + "step": 39622 + }, + { + "epoch": 0.23564920544295365, + "grad_norm": 2.6728034019470215, + "learning_rate": 4.345673812138559e-05, + "loss": 4.2878, + "step": 39623 + }, + { + "epoch": 0.23565515272623466, + "grad_norm": 3.916083574295044, + "learning_rate": 4.3456423057480045e-05, + "loss": 3.7933, + "step": 39624 + }, + { + "epoch": 0.23566110000951565, + "grad_norm": 4.121766567230225, + "learning_rate": 4.3456107987131544e-05, + "loss": 3.7085, + "step": 39625 + }, + { + "epoch": 0.23566704729279664, + "grad_norm": 3.7750494480133057, + "learning_rate": 4.345579291034021e-05, + "loss": 3.3967, + "step": 39626 + }, + { + "epoch": 0.23567299457607765, + "grad_norm": 3.047283887863159, + "learning_rate": 4.345547782710616e-05, + "loss": 3.7059, + "step": 39627 + }, + { + "epoch": 0.23567894185935864, + "grad_norm": 2.731980562210083, + "learning_rate": 4.3455162737429486e-05, + "loss": 3.4824, + "step": 39628 + }, + { + "epoch": 0.23568488914263963, + "grad_norm": 3.2570066452026367, + "learning_rate": 4.3454847641310306e-05, + "loss": 3.7479, + "step": 39629 + }, + { + "epoch": 0.23569083642592065, + "grad_norm": 3.029534339904785, + "learning_rate": 4.345453253874873e-05, + "loss": 3.4877, + "step": 39630 + }, + { + "epoch": 0.23569678370920163, + "grad_norm": 2.8466241359710693, + "learning_rate": 4.345421742974486e-05, + "loss": 3.3035, + "step": 39631 + }, + { + "epoch": 0.23570273099248262, + "grad_norm": 2.638864517211914, + "learning_rate": 4.345390231429882e-05, + "loss": 3.285, + "step": 39632 + }, + { + "epoch": 0.23570867827576364, + "grad_norm": 2.597590446472168, + "learning_rate": 4.345358719241071e-05, + "loss": 3.3196, + "step": 39633 + }, + { + "epoch": 0.23571462555904463, + "grad_norm": 2.5003716945648193, + "learning_rate": 4.345327206408064e-05, + "loss": 3.2357, + "step": 39634 + }, + { + "epoch": 0.23572057284232562, + "grad_norm": 2.7217302322387695, + "learning_rate": 4.345295692930873e-05, + "loss": 3.2867, + "step": 39635 + }, + { + "epoch": 0.23572652012560663, + "grad_norm": 2.5010008811950684, + "learning_rate": 4.3452641788095084e-05, + "loss": 3.1874, + "step": 39636 + }, + { + "epoch": 0.23573246740888762, + "grad_norm": 2.312774658203125, + "learning_rate": 4.345232664043981e-05, + "loss": 3.4379, + "step": 39637 + }, + { + "epoch": 0.2357384146921686, + "grad_norm": 1.739977478981018, + "learning_rate": 4.3452011486343015e-05, + "loss": 4.4238, + "step": 39638 + }, + { + "epoch": 0.23574436197544962, + "grad_norm": 1.7827277183532715, + "learning_rate": 4.3451696325804825e-05, + "loss": 4.8541, + "step": 39639 + }, + { + "epoch": 0.2357503092587306, + "grad_norm": 1.7119590044021606, + "learning_rate": 4.345138115882533e-05, + "loss": 4.7584, + "step": 39640 + }, + { + "epoch": 0.2357562565420116, + "grad_norm": 1.6340469121932983, + "learning_rate": 4.345106598540465e-05, + "loss": 4.7786, + "step": 39641 + }, + { + "epoch": 0.23576220382529262, + "grad_norm": 1.9640412330627441, + "learning_rate": 4.345075080554289e-05, + "loss": 4.8616, + "step": 39642 + }, + { + "epoch": 0.2357681511085736, + "grad_norm": 1.6392191648483276, + "learning_rate": 4.345043561924016e-05, + "loss": 4.5769, + "step": 39643 + }, + { + "epoch": 0.2357740983918546, + "grad_norm": 1.7787988185882568, + "learning_rate": 4.345012042649658e-05, + "loss": 4.5608, + "step": 39644 + }, + { + "epoch": 0.2357800456751356, + "grad_norm": 1.6990470886230469, + "learning_rate": 4.3449805227312254e-05, + "loss": 4.7891, + "step": 39645 + }, + { + "epoch": 0.2357859929584166, + "grad_norm": 1.9699727296829224, + "learning_rate": 4.3449490021687287e-05, + "loss": 4.1808, + "step": 39646 + }, + { + "epoch": 0.23579194024169758, + "grad_norm": 1.6719646453857422, + "learning_rate": 4.3449174809621805e-05, + "loss": 4.6185, + "step": 39647 + }, + { + "epoch": 0.2357978875249786, + "grad_norm": 2.851654052734375, + "learning_rate": 4.34488595911159e-05, + "loss": 4.1725, + "step": 39648 + }, + { + "epoch": 0.2358038348082596, + "grad_norm": 1.8505662679672241, + "learning_rate": 4.344854436616968e-05, + "loss": 4.053, + "step": 39649 + }, + { + "epoch": 0.23580978209154058, + "grad_norm": 1.769942045211792, + "learning_rate": 4.344822913478328e-05, + "loss": 4.2291, + "step": 39650 + }, + { + "epoch": 0.2358157293748216, + "grad_norm": 1.834368109703064, + "learning_rate": 4.344791389695678e-05, + "loss": 5.0813, + "step": 39651 + }, + { + "epoch": 0.23582167665810258, + "grad_norm": 1.6249654293060303, + "learning_rate": 4.344759865269031e-05, + "loss": 5.1234, + "step": 39652 + }, + { + "epoch": 0.23582762394138357, + "grad_norm": 1.7159777879714966, + "learning_rate": 4.3447283401983975e-05, + "loss": 4.9201, + "step": 39653 + }, + { + "epoch": 0.23583357122466458, + "grad_norm": 1.4406479597091675, + "learning_rate": 4.344696814483788e-05, + "loss": 4.9575, + "step": 39654 + }, + { + "epoch": 0.23583951850794557, + "grad_norm": 1.507131814956665, + "learning_rate": 4.3446652881252144e-05, + "loss": 5.0084, + "step": 39655 + }, + { + "epoch": 0.23584546579122656, + "grad_norm": 1.706028699874878, + "learning_rate": 4.344633761122687e-05, + "loss": 4.5784, + "step": 39656 + }, + { + "epoch": 0.23585141307450758, + "grad_norm": 1.5523202419281006, + "learning_rate": 4.344602233476217e-05, + "loss": 4.5634, + "step": 39657 + }, + { + "epoch": 0.23585736035778856, + "grad_norm": 1.5952317714691162, + "learning_rate": 4.344570705185815e-05, + "loss": 4.7964, + "step": 39658 + }, + { + "epoch": 0.23586330764106955, + "grad_norm": 1.7145177125930786, + "learning_rate": 4.3445391762514934e-05, + "loss": 4.8856, + "step": 39659 + }, + { + "epoch": 0.23586925492435057, + "grad_norm": 1.5456407070159912, + "learning_rate": 4.344507646673261e-05, + "loss": 4.9571, + "step": 39660 + }, + { + "epoch": 0.23587520220763156, + "grad_norm": 1.6312416791915894, + "learning_rate": 4.34447611645113e-05, + "loss": 4.6212, + "step": 39661 + }, + { + "epoch": 0.23588114949091255, + "grad_norm": 1.6123836040496826, + "learning_rate": 4.3444445855851126e-05, + "loss": 4.7611, + "step": 39662 + }, + { + "epoch": 0.23588709677419356, + "grad_norm": 1.5431879758834839, + "learning_rate": 4.3444130540752185e-05, + "loss": 4.5731, + "step": 39663 + }, + { + "epoch": 0.23589304405747455, + "grad_norm": 1.378326654434204, + "learning_rate": 4.344381521921458e-05, + "loss": 4.4747, + "step": 39664 + }, + { + "epoch": 0.23589899134075554, + "grad_norm": 1.6408112049102783, + "learning_rate": 4.344349989123844e-05, + "loss": 4.9272, + "step": 39665 + }, + { + "epoch": 0.23590493862403655, + "grad_norm": 1.6013070344924927, + "learning_rate": 4.3443184556823854e-05, + "loss": 4.3682, + "step": 39666 + }, + { + "epoch": 0.23591088590731754, + "grad_norm": 1.744828462600708, + "learning_rate": 4.344286921597095e-05, + "loss": 4.1018, + "step": 39667 + }, + { + "epoch": 0.23591683319059853, + "grad_norm": 1.688822627067566, + "learning_rate": 4.344255386867983e-05, + "loss": 4.6118, + "step": 39668 + }, + { + "epoch": 0.23592278047387955, + "grad_norm": 1.7823905944824219, + "learning_rate": 4.34422385149506e-05, + "loss": 4.6959, + "step": 39669 + }, + { + "epoch": 0.23592872775716053, + "grad_norm": 1.740500807762146, + "learning_rate": 4.344192315478338e-05, + "loss": 4.8013, + "step": 39670 + }, + { + "epoch": 0.23593467504044152, + "grad_norm": 1.7970569133758545, + "learning_rate": 4.344160778817827e-05, + "loss": 4.8883, + "step": 39671 + }, + { + "epoch": 0.2359406223237225, + "grad_norm": 1.6192377805709839, + "learning_rate": 4.344129241513539e-05, + "loss": 4.8943, + "step": 39672 + }, + { + "epoch": 0.23594656960700353, + "grad_norm": 1.698466420173645, + "learning_rate": 4.344097703565485e-05, + "loss": 4.5755, + "step": 39673 + }, + { + "epoch": 0.2359525168902845, + "grad_norm": 1.6837269067764282, + "learning_rate": 4.344066164973675e-05, + "loss": 4.7297, + "step": 39674 + }, + { + "epoch": 0.2359584641735655, + "grad_norm": 1.6579086780548096, + "learning_rate": 4.3440346257381204e-05, + "loss": 4.9919, + "step": 39675 + }, + { + "epoch": 0.23596441145684652, + "grad_norm": 1.6303355693817139, + "learning_rate": 4.344003085858833e-05, + "loss": 5.1976, + "step": 39676 + }, + { + "epoch": 0.2359703587401275, + "grad_norm": 2.267923355102539, + "learning_rate": 4.343971545335822e-05, + "loss": 4.204, + "step": 39677 + }, + { + "epoch": 0.2359763060234085, + "grad_norm": 1.9635370969772339, + "learning_rate": 4.3439400041691005e-05, + "loss": 5.0505, + "step": 39678 + }, + { + "epoch": 0.2359822533066895, + "grad_norm": 1.9441829919815063, + "learning_rate": 4.343908462358679e-05, + "loss": 4.5925, + "step": 39679 + }, + { + "epoch": 0.2359882005899705, + "grad_norm": 1.6971355676651, + "learning_rate": 4.343876919904567e-05, + "loss": 4.8558, + "step": 39680 + }, + { + "epoch": 0.2359941478732515, + "grad_norm": 1.5483894348144531, + "learning_rate": 4.343845376806777e-05, + "loss": 4.6838, + "step": 39681 + }, + { + "epoch": 0.2360000951565325, + "grad_norm": 1.6236921548843384, + "learning_rate": 4.34381383306532e-05, + "loss": 5.047, + "step": 39682 + }, + { + "epoch": 0.2360060424398135, + "grad_norm": 1.7564657926559448, + "learning_rate": 4.343782288680206e-05, + "loss": 4.6391, + "step": 39683 + }, + { + "epoch": 0.23601198972309448, + "grad_norm": 1.7207711935043335, + "learning_rate": 4.343750743651448e-05, + "loss": 5.0933, + "step": 39684 + }, + { + "epoch": 0.2360179370063755, + "grad_norm": 1.5402401685714722, + "learning_rate": 4.343719197979054e-05, + "loss": 5.0135, + "step": 39685 + }, + { + "epoch": 0.23602388428965648, + "grad_norm": 1.732639193534851, + "learning_rate": 4.343687651663038e-05, + "loss": 4.9194, + "step": 39686 + }, + { + "epoch": 0.23602983157293747, + "grad_norm": 1.5156683921813965, + "learning_rate": 4.343656104703408e-05, + "loss": 4.9513, + "step": 39687 + }, + { + "epoch": 0.2360357788562185, + "grad_norm": 1.4682366847991943, + "learning_rate": 4.343624557100178e-05, + "loss": 4.959, + "step": 39688 + }, + { + "epoch": 0.23604172613949947, + "grad_norm": 1.4715865850448608, + "learning_rate": 4.3435930088533575e-05, + "loss": 4.9556, + "step": 39689 + }, + { + "epoch": 0.23604767342278046, + "grad_norm": 1.731086015701294, + "learning_rate": 4.343561459962958e-05, + "loss": 4.8839, + "step": 39690 + }, + { + "epoch": 0.23605362070606148, + "grad_norm": 1.482099175453186, + "learning_rate": 4.34352991042899e-05, + "loss": 4.9354, + "step": 39691 + }, + { + "epoch": 0.23605956798934247, + "grad_norm": 1.880521535873413, + "learning_rate": 4.343498360251465e-05, + "loss": 4.9111, + "step": 39692 + }, + { + "epoch": 0.23606551527262346, + "grad_norm": 1.752835988998413, + "learning_rate": 4.343466809430393e-05, + "loss": 4.9135, + "step": 39693 + }, + { + "epoch": 0.23607146255590447, + "grad_norm": 1.6565130949020386, + "learning_rate": 4.3434352579657864e-05, + "loss": 4.8368, + "step": 39694 + }, + { + "epoch": 0.23607740983918546, + "grad_norm": 1.6498968601226807, + "learning_rate": 4.3434037058576556e-05, + "loss": 4.9898, + "step": 39695 + }, + { + "epoch": 0.23608335712246645, + "grad_norm": 1.846377968788147, + "learning_rate": 4.3433721531060115e-05, + "loss": 4.7886, + "step": 39696 + }, + { + "epoch": 0.23608930440574746, + "grad_norm": 1.7037628889083862, + "learning_rate": 4.343340599710865e-05, + "loss": 4.9677, + "step": 39697 + }, + { + "epoch": 0.23609525168902845, + "grad_norm": 1.7007973194122314, + "learning_rate": 4.3433090456722275e-05, + "loss": 4.8799, + "step": 39698 + }, + { + "epoch": 0.23610119897230944, + "grad_norm": 1.7689772844314575, + "learning_rate": 4.34327749099011e-05, + "loss": 4.8756, + "step": 39699 + }, + { + "epoch": 0.23610714625559046, + "grad_norm": 1.5716655254364014, + "learning_rate": 4.343245935664523e-05, + "loss": 4.886, + "step": 39700 + }, + { + "epoch": 0.23611309353887144, + "grad_norm": 1.6320520639419556, + "learning_rate": 4.3432143796954785e-05, + "loss": 4.9558, + "step": 39701 + }, + { + "epoch": 0.23611904082215243, + "grad_norm": 1.616171956062317, + "learning_rate": 4.3431828230829864e-05, + "loss": 4.8599, + "step": 39702 + }, + { + "epoch": 0.23612498810543345, + "grad_norm": 1.6095540523529053, + "learning_rate": 4.343151265827058e-05, + "loss": 4.752, + "step": 39703 + }, + { + "epoch": 0.23613093538871444, + "grad_norm": 1.4963219165802002, + "learning_rate": 4.343119707927705e-05, + "loss": 4.7252, + "step": 39704 + }, + { + "epoch": 0.23613688267199542, + "grad_norm": 1.5957183837890625, + "learning_rate": 4.343088149384938e-05, + "loss": 4.5582, + "step": 39705 + }, + { + "epoch": 0.23614282995527644, + "grad_norm": 1.5361696481704712, + "learning_rate": 4.3430565901987674e-05, + "loss": 4.7854, + "step": 39706 + }, + { + "epoch": 0.23614877723855743, + "grad_norm": 1.5576621294021606, + "learning_rate": 4.3430250303692055e-05, + "loss": 4.7568, + "step": 39707 + }, + { + "epoch": 0.23615472452183842, + "grad_norm": 1.5435431003570557, + "learning_rate": 4.3429934698962625e-05, + "loss": 4.5362, + "step": 39708 + }, + { + "epoch": 0.23616067180511943, + "grad_norm": 1.5170698165893555, + "learning_rate": 4.342961908779949e-05, + "loss": 4.6373, + "step": 39709 + }, + { + "epoch": 0.23616661908840042, + "grad_norm": 1.588207721710205, + "learning_rate": 4.342930347020277e-05, + "loss": 4.7936, + "step": 39710 + }, + { + "epoch": 0.2361725663716814, + "grad_norm": 1.751194715499878, + "learning_rate": 4.342898784617257e-05, + "loss": 4.5122, + "step": 39711 + }, + { + "epoch": 0.23617851365496242, + "grad_norm": 1.8367148637771606, + "learning_rate": 4.3428672215709e-05, + "loss": 4.8803, + "step": 39712 + }, + { + "epoch": 0.2361844609382434, + "grad_norm": 1.6473870277404785, + "learning_rate": 4.342835657881217e-05, + "loss": 4.8684, + "step": 39713 + }, + { + "epoch": 0.2361904082215244, + "grad_norm": 1.5764669179916382, + "learning_rate": 4.3428040935482195e-05, + "loss": 4.8413, + "step": 39714 + }, + { + "epoch": 0.23619635550480542, + "grad_norm": 1.715018391609192, + "learning_rate": 4.342772528571918e-05, + "loss": 4.7669, + "step": 39715 + }, + { + "epoch": 0.2362023027880864, + "grad_norm": 1.5597290992736816, + "learning_rate": 4.342740962952323e-05, + "loss": 4.6682, + "step": 39716 + }, + { + "epoch": 0.2362082500713674, + "grad_norm": 1.5768378973007202, + "learning_rate": 4.342709396689447e-05, + "loss": 4.8165, + "step": 39717 + }, + { + "epoch": 0.2362141973546484, + "grad_norm": 1.661075234413147, + "learning_rate": 4.3426778297832995e-05, + "loss": 4.5506, + "step": 39718 + }, + { + "epoch": 0.2362201446379294, + "grad_norm": 1.584989070892334, + "learning_rate": 4.342646262233892e-05, + "loss": 4.8426, + "step": 39719 + }, + { + "epoch": 0.23622609192121038, + "grad_norm": 1.5978823900222778, + "learning_rate": 4.342614694041237e-05, + "loss": 4.7612, + "step": 39720 + }, + { + "epoch": 0.2362320392044914, + "grad_norm": 1.4043251276016235, + "learning_rate": 4.3425831252053436e-05, + "loss": 4.6123, + "step": 39721 + }, + { + "epoch": 0.2362379864877724, + "grad_norm": 1.534112572669983, + "learning_rate": 4.342551555726223e-05, + "loss": 5.0231, + "step": 39722 + }, + { + "epoch": 0.23624393377105338, + "grad_norm": 1.4414738416671753, + "learning_rate": 4.342519985603887e-05, + "loss": 4.7686, + "step": 39723 + }, + { + "epoch": 0.2362498810543344, + "grad_norm": 1.5299564599990845, + "learning_rate": 4.342488414838346e-05, + "loss": 4.7504, + "step": 39724 + }, + { + "epoch": 0.23625582833761538, + "grad_norm": 1.8014576435089111, + "learning_rate": 4.342456843429612e-05, + "loss": 4.5203, + "step": 39725 + }, + { + "epoch": 0.23626177562089637, + "grad_norm": 1.3567272424697876, + "learning_rate": 4.342425271377695e-05, + "loss": 4.638, + "step": 39726 + }, + { + "epoch": 0.23626772290417739, + "grad_norm": 1.448305606842041, + "learning_rate": 4.342393698682607e-05, + "loss": 4.5339, + "step": 39727 + }, + { + "epoch": 0.23627367018745837, + "grad_norm": 1.2754170894622803, + "learning_rate": 4.3423621253443574e-05, + "loss": 4.5913, + "step": 39728 + }, + { + "epoch": 0.23627961747073936, + "grad_norm": 1.2582231760025024, + "learning_rate": 4.342330551362959e-05, + "loss": 4.5565, + "step": 39729 + }, + { + "epoch": 0.23628556475402035, + "grad_norm": 1.4124832153320312, + "learning_rate": 4.3422989767384214e-05, + "loss": 4.6425, + "step": 39730 + }, + { + "epoch": 0.23629151203730137, + "grad_norm": 1.6686687469482422, + "learning_rate": 4.3422674014707564e-05, + "loss": 4.7609, + "step": 39731 + }, + { + "epoch": 0.23629745932058235, + "grad_norm": 1.5445233583450317, + "learning_rate": 4.342235825559975e-05, + "loss": 4.5896, + "step": 39732 + }, + { + "epoch": 0.23630340660386334, + "grad_norm": 1.6400084495544434, + "learning_rate": 4.3422042490060887e-05, + "loss": 4.5642, + "step": 39733 + }, + { + "epoch": 0.23630935388714436, + "grad_norm": 1.735404133796692, + "learning_rate": 4.342172671809107e-05, + "loss": 4.5955, + "step": 39734 + }, + { + "epoch": 0.23631530117042535, + "grad_norm": 1.5258411169052124, + "learning_rate": 4.342141093969042e-05, + "loss": 4.5038, + "step": 39735 + }, + { + "epoch": 0.23632124845370633, + "grad_norm": 1.6231786012649536, + "learning_rate": 4.3421095154859046e-05, + "loss": 4.8466, + "step": 39736 + }, + { + "epoch": 0.23632719573698735, + "grad_norm": 1.498099446296692, + "learning_rate": 4.342077936359706e-05, + "loss": 4.7716, + "step": 39737 + }, + { + "epoch": 0.23633314302026834, + "grad_norm": 1.6102790832519531, + "learning_rate": 4.3420463565904576e-05, + "loss": 4.5928, + "step": 39738 + }, + { + "epoch": 0.23633909030354933, + "grad_norm": 1.6059402227401733, + "learning_rate": 4.342014776178169e-05, + "loss": 4.6719, + "step": 39739 + }, + { + "epoch": 0.23634503758683034, + "grad_norm": 1.6634807586669922, + "learning_rate": 4.341983195122853e-05, + "loss": 4.3684, + "step": 39740 + }, + { + "epoch": 0.23635098487011133, + "grad_norm": 1.5562560558319092, + "learning_rate": 4.3419516134245185e-05, + "loss": 4.3445, + "step": 39741 + }, + { + "epoch": 0.23635693215339232, + "grad_norm": 1.7897953987121582, + "learning_rate": 4.3419200310831784e-05, + "loss": 4.829, + "step": 39742 + }, + { + "epoch": 0.23636287943667333, + "grad_norm": 1.5983352661132812, + "learning_rate": 4.341888448098843e-05, + "loss": 5.426, + "step": 39743 + }, + { + "epoch": 0.23636882671995432, + "grad_norm": 1.7689006328582764, + "learning_rate": 4.341856864471523e-05, + "loss": 5.0936, + "step": 39744 + }, + { + "epoch": 0.2363747740032353, + "grad_norm": 1.8115812540054321, + "learning_rate": 4.34182528020123e-05, + "loss": 4.7046, + "step": 39745 + }, + { + "epoch": 0.23638072128651633, + "grad_norm": 1.6517319679260254, + "learning_rate": 4.3417936952879745e-05, + "loss": 4.9471, + "step": 39746 + }, + { + "epoch": 0.23638666856979731, + "grad_norm": 1.9833317995071411, + "learning_rate": 4.341762109731768e-05, + "loss": 4.259, + "step": 39747 + }, + { + "epoch": 0.2363926158530783, + "grad_norm": 1.6605212688446045, + "learning_rate": 4.341730523532622e-05, + "loss": 4.6667, + "step": 39748 + }, + { + "epoch": 0.23639856313635932, + "grad_norm": 1.8738734722137451, + "learning_rate": 4.341698936690547e-05, + "loss": 4.5505, + "step": 39749 + }, + { + "epoch": 0.2364045104196403, + "grad_norm": 1.7416300773620605, + "learning_rate": 4.3416673492055534e-05, + "loss": 4.5346, + "step": 39750 + }, + { + "epoch": 0.2364104577029213, + "grad_norm": 1.7728749513626099, + "learning_rate": 4.341635761077653e-05, + "loss": 5.125, + "step": 39751 + }, + { + "epoch": 0.2364164049862023, + "grad_norm": 1.8593645095825195, + "learning_rate": 4.3416041723068555e-05, + "loss": 4.9069, + "step": 39752 + }, + { + "epoch": 0.2364223522694833, + "grad_norm": 1.8321951627731323, + "learning_rate": 4.341572582893174e-05, + "loss": 4.4018, + "step": 39753 + }, + { + "epoch": 0.2364282995527643, + "grad_norm": 2.6567561626434326, + "learning_rate": 4.341540992836619e-05, + "loss": 4.1624, + "step": 39754 + }, + { + "epoch": 0.2364342468360453, + "grad_norm": 2.788621425628662, + "learning_rate": 4.3415094021372004e-05, + "loss": 3.9062, + "step": 39755 + }, + { + "epoch": 0.2364401941193263, + "grad_norm": 3.0069475173950195, + "learning_rate": 4.34147781079493e-05, + "loss": 3.9894, + "step": 39756 + }, + { + "epoch": 0.23644614140260728, + "grad_norm": 2.6209957599639893, + "learning_rate": 4.3414462188098186e-05, + "loss": 3.9002, + "step": 39757 + }, + { + "epoch": 0.2364520886858883, + "grad_norm": 2.197650194168091, + "learning_rate": 4.341414626181878e-05, + "loss": 3.8847, + "step": 39758 + }, + { + "epoch": 0.23645803596916928, + "grad_norm": 2.5086872577667236, + "learning_rate": 4.341383032911117e-05, + "loss": 3.7872, + "step": 39759 + }, + { + "epoch": 0.23646398325245027, + "grad_norm": 2.1555652618408203, + "learning_rate": 4.34135143899755e-05, + "loss": 3.4424, + "step": 39760 + }, + { + "epoch": 0.2364699305357313, + "grad_norm": 2.588146924972534, + "learning_rate": 4.341319844441185e-05, + "loss": 3.5797, + "step": 39761 + }, + { + "epoch": 0.23647587781901228, + "grad_norm": 2.394775867462158, + "learning_rate": 4.341288249242035e-05, + "loss": 4.1704, + "step": 39762 + }, + { + "epoch": 0.23648182510229326, + "grad_norm": 2.6768877506256104, + "learning_rate": 4.34125665340011e-05, + "loss": 3.926, + "step": 39763 + }, + { + "epoch": 0.23648777238557428, + "grad_norm": 2.8283958435058594, + "learning_rate": 4.341225056915421e-05, + "loss": 4.2181, + "step": 39764 + }, + { + "epoch": 0.23649371966885527, + "grad_norm": 2.7577545642852783, + "learning_rate": 4.34119345978798e-05, + "loss": 4.0358, + "step": 39765 + }, + { + "epoch": 0.23649966695213626, + "grad_norm": 2.6630911827087402, + "learning_rate": 4.341161862017797e-05, + "loss": 3.9368, + "step": 39766 + }, + { + "epoch": 0.23650561423541727, + "grad_norm": 3.082396984100342, + "learning_rate": 4.341130263604883e-05, + "loss": 3.5809, + "step": 39767 + }, + { + "epoch": 0.23651156151869826, + "grad_norm": 2.6369762420654297, + "learning_rate": 4.34109866454925e-05, + "loss": 3.657, + "step": 39768 + }, + { + "epoch": 0.23651750880197925, + "grad_norm": 2.8537046909332275, + "learning_rate": 4.341067064850909e-05, + "loss": 3.8611, + "step": 39769 + }, + { + "epoch": 0.23652345608526026, + "grad_norm": 2.628800392150879, + "learning_rate": 4.34103546450987e-05, + "loss": 3.6899, + "step": 39770 + }, + { + "epoch": 0.23652940336854125, + "grad_norm": 2.121014356613159, + "learning_rate": 4.341003863526144e-05, + "loss": 3.8184, + "step": 39771 + }, + { + "epoch": 0.23653535065182224, + "grad_norm": 2.651318073272705, + "learning_rate": 4.340972261899743e-05, + "loss": 3.9339, + "step": 39772 + }, + { + "epoch": 0.23654129793510326, + "grad_norm": 2.6411614418029785, + "learning_rate": 4.340940659630678e-05, + "loss": 3.9456, + "step": 39773 + }, + { + "epoch": 0.23654724521838424, + "grad_norm": 2.308238983154297, + "learning_rate": 4.340909056718959e-05, + "loss": 3.9969, + "step": 39774 + }, + { + "epoch": 0.23655319250166523, + "grad_norm": 3.0689187049865723, + "learning_rate": 4.340877453164599e-05, + "loss": 3.7694, + "step": 39775 + }, + { + "epoch": 0.23655913978494625, + "grad_norm": 2.6818394660949707, + "learning_rate": 4.3408458489676064e-05, + "loss": 3.487, + "step": 39776 + }, + { + "epoch": 0.23656508706822724, + "grad_norm": 2.5647828578948975, + "learning_rate": 4.340814244127993e-05, + "loss": 4.0624, + "step": 39777 + }, + { + "epoch": 0.23657103435150822, + "grad_norm": 2.7362442016601562, + "learning_rate": 4.340782638645772e-05, + "loss": 3.9097, + "step": 39778 + }, + { + "epoch": 0.23657698163478924, + "grad_norm": 2.7237389087677, + "learning_rate": 4.340751032520952e-05, + "loss": 3.6611, + "step": 39779 + }, + { + "epoch": 0.23658292891807023, + "grad_norm": 2.5460264682769775, + "learning_rate": 4.3407194257535445e-05, + "loss": 3.8765, + "step": 39780 + }, + { + "epoch": 0.23658887620135122, + "grad_norm": 2.6930301189422607, + "learning_rate": 4.340687818343561e-05, + "loss": 3.5759, + "step": 39781 + }, + { + "epoch": 0.23659482348463223, + "grad_norm": 2.4678304195404053, + "learning_rate": 4.340656210291013e-05, + "loss": 3.8256, + "step": 39782 + }, + { + "epoch": 0.23660077076791322, + "grad_norm": 2.215928316116333, + "learning_rate": 4.34062460159591e-05, + "loss": 3.8329, + "step": 39783 + }, + { + "epoch": 0.2366067180511942, + "grad_norm": 2.689182758331299, + "learning_rate": 4.3405929922582645e-05, + "loss": 3.5107, + "step": 39784 + }, + { + "epoch": 0.23661266533447523, + "grad_norm": 2.366183042526245, + "learning_rate": 4.3405613822780875e-05, + "loss": 3.6994, + "step": 39785 + }, + { + "epoch": 0.2366186126177562, + "grad_norm": 2.7422029972076416, + "learning_rate": 4.3405297716553886e-05, + "loss": 3.8842, + "step": 39786 + }, + { + "epoch": 0.2366245599010372, + "grad_norm": 2.643587589263916, + "learning_rate": 4.34049816039018e-05, + "loss": 3.8722, + "step": 39787 + }, + { + "epoch": 0.23663050718431822, + "grad_norm": 2.4537999629974365, + "learning_rate": 4.340466548482473e-05, + "loss": 3.9502, + "step": 39788 + }, + { + "epoch": 0.2366364544675992, + "grad_norm": 2.574368953704834, + "learning_rate": 4.340434935932277e-05, + "loss": 3.6153, + "step": 39789 + }, + { + "epoch": 0.2366424017508802, + "grad_norm": 2.779705762863159, + "learning_rate": 4.3404033227396054e-05, + "loss": 3.8353, + "step": 39790 + }, + { + "epoch": 0.23664834903416118, + "grad_norm": 2.8180508613586426, + "learning_rate": 4.340371708904468e-05, + "loss": 3.3763, + "step": 39791 + }, + { + "epoch": 0.2366542963174422, + "grad_norm": 2.660182476043701, + "learning_rate": 4.340340094426876e-05, + "loss": 3.6168, + "step": 39792 + }, + { + "epoch": 0.23666024360072319, + "grad_norm": 2.663090229034424, + "learning_rate": 4.340308479306839e-05, + "loss": 3.7262, + "step": 39793 + }, + { + "epoch": 0.23666619088400417, + "grad_norm": 2.3928894996643066, + "learning_rate": 4.34027686354437e-05, + "loss": 3.8692, + "step": 39794 + }, + { + "epoch": 0.2366721381672852, + "grad_norm": 2.654780149459839, + "learning_rate": 4.340245247139479e-05, + "loss": 3.7075, + "step": 39795 + }, + { + "epoch": 0.23667808545056618, + "grad_norm": 2.345092296600342, + "learning_rate": 4.340213630092178e-05, + "loss": 3.7682, + "step": 39796 + }, + { + "epoch": 0.23668403273384717, + "grad_norm": 2.4375827312469482, + "learning_rate": 4.3401820124024776e-05, + "loss": 3.6996, + "step": 39797 + }, + { + "epoch": 0.23668998001712818, + "grad_norm": 2.407268762588501, + "learning_rate": 4.340150394070388e-05, + "loss": 3.8321, + "step": 39798 + }, + { + "epoch": 0.23669592730040917, + "grad_norm": 2.3417975902557373, + "learning_rate": 4.3401187750959216e-05, + "loss": 4.0033, + "step": 39799 + }, + { + "epoch": 0.23670187458369016, + "grad_norm": 2.420501708984375, + "learning_rate": 4.340087155479089e-05, + "loss": 3.739, + "step": 39800 + }, + { + "epoch": 0.23670782186697117, + "grad_norm": 1.9763052463531494, + "learning_rate": 4.3400555352199e-05, + "loss": 4.7612, + "step": 39801 + }, + { + "epoch": 0.23671376915025216, + "grad_norm": 1.9385654926300049, + "learning_rate": 4.340023914318367e-05, + "loss": 4.9685, + "step": 39802 + }, + { + "epoch": 0.23671971643353315, + "grad_norm": 2.449619770050049, + "learning_rate": 4.339992292774501e-05, + "loss": 3.6381, + "step": 39803 + }, + { + "epoch": 0.23672566371681417, + "grad_norm": 1.925249457359314, + "learning_rate": 4.339960670588312e-05, + "loss": 5.0143, + "step": 39804 + }, + { + "epoch": 0.23673161100009515, + "grad_norm": 2.3256402015686035, + "learning_rate": 4.339929047759812e-05, + "loss": 3.7777, + "step": 39805 + }, + { + "epoch": 0.23673755828337614, + "grad_norm": 2.0616559982299805, + "learning_rate": 4.3398974242890124e-05, + "loss": 4.2865, + "step": 39806 + }, + { + "epoch": 0.23674350556665716, + "grad_norm": 2.7752761840820312, + "learning_rate": 4.339865800175923e-05, + "loss": 3.3378, + "step": 39807 + }, + { + "epoch": 0.23674945284993815, + "grad_norm": 2.9076433181762695, + "learning_rate": 4.339834175420555e-05, + "loss": 3.3096, + "step": 39808 + }, + { + "epoch": 0.23675540013321913, + "grad_norm": 2.4606168270111084, + "learning_rate": 4.339802550022921e-05, + "loss": 3.26, + "step": 39809 + }, + { + "epoch": 0.23676134741650015, + "grad_norm": 2.631068468093872, + "learning_rate": 4.3397709239830295e-05, + "loss": 3.31, + "step": 39810 + }, + { + "epoch": 0.23676729469978114, + "grad_norm": 2.1262693405151367, + "learning_rate": 4.339739297300894e-05, + "loss": 4.0044, + "step": 39811 + }, + { + "epoch": 0.23677324198306213, + "grad_norm": 1.440590262413025, + "learning_rate": 4.339707669976525e-05, + "loss": 4.6205, + "step": 39812 + }, + { + "epoch": 0.23677918926634314, + "grad_norm": 1.6795618534088135, + "learning_rate": 4.339676042009933e-05, + "loss": 4.7781, + "step": 39813 + }, + { + "epoch": 0.23678513654962413, + "grad_norm": 1.5972740650177002, + "learning_rate": 4.3396444134011275e-05, + "loss": 4.8949, + "step": 39814 + }, + { + "epoch": 0.23679108383290512, + "grad_norm": 1.659780502319336, + "learning_rate": 4.339612784150122e-05, + "loss": 4.9178, + "step": 39815 + }, + { + "epoch": 0.23679703111618614, + "grad_norm": 1.523654818534851, + "learning_rate": 4.339581154256928e-05, + "loss": 4.9818, + "step": 39816 + }, + { + "epoch": 0.23680297839946712, + "grad_norm": 1.7680178880691528, + "learning_rate": 4.3395495237215535e-05, + "loss": 4.7116, + "step": 39817 + }, + { + "epoch": 0.2368089256827481, + "grad_norm": 2.39174222946167, + "learning_rate": 4.339517892544012e-05, + "loss": 3.4239, + "step": 39818 + }, + { + "epoch": 0.23681487296602913, + "grad_norm": 2.550715923309326, + "learning_rate": 4.3394862607243134e-05, + "loss": 3.31, + "step": 39819 + }, + { + "epoch": 0.23682082024931012, + "grad_norm": 2.13712215423584, + "learning_rate": 4.339454628262469e-05, + "loss": 3.4468, + "step": 39820 + }, + { + "epoch": 0.2368267675325911, + "grad_norm": 1.8183554410934448, + "learning_rate": 4.3394229951584905e-05, + "loss": 4.6941, + "step": 39821 + }, + { + "epoch": 0.23683271481587212, + "grad_norm": 2.161360740661621, + "learning_rate": 4.3393913614123885e-05, + "loss": 3.2488, + "step": 39822 + }, + { + "epoch": 0.2368386620991531, + "grad_norm": 2.250622510910034, + "learning_rate": 4.339359727024174e-05, + "loss": 2.9575, + "step": 39823 + }, + { + "epoch": 0.2368446093824341, + "grad_norm": 1.722509503364563, + "learning_rate": 4.339328091993857e-05, + "loss": 4.599, + "step": 39824 + }, + { + "epoch": 0.2368505566657151, + "grad_norm": 1.7138597965240479, + "learning_rate": 4.339296456321451e-05, + "loss": 4.8187, + "step": 39825 + }, + { + "epoch": 0.2368565039489961, + "grad_norm": 2.0628795623779297, + "learning_rate": 4.339264820006965e-05, + "loss": 4.4325, + "step": 39826 + }, + { + "epoch": 0.2368624512322771, + "grad_norm": 2.7999677658081055, + "learning_rate": 4.339233183050411e-05, + "loss": 3.9543, + "step": 39827 + }, + { + "epoch": 0.2368683985155581, + "grad_norm": 2.9181196689605713, + "learning_rate": 4.339201545451799e-05, + "loss": 4.0597, + "step": 39828 + }, + { + "epoch": 0.2368743457988391, + "grad_norm": 1.9542845487594604, + "learning_rate": 4.3391699072111415e-05, + "loss": 4.9241, + "step": 39829 + }, + { + "epoch": 0.23688029308212008, + "grad_norm": 2.2268269062042236, + "learning_rate": 4.339138268328448e-05, + "loss": 4.1254, + "step": 39830 + }, + { + "epoch": 0.2368862403654011, + "grad_norm": 2.4981327056884766, + "learning_rate": 4.339106628803731e-05, + "loss": 3.7606, + "step": 39831 + }, + { + "epoch": 0.23689218764868208, + "grad_norm": 2.370508909225464, + "learning_rate": 4.339074988637001e-05, + "loss": 3.6986, + "step": 39832 + }, + { + "epoch": 0.23689813493196307, + "grad_norm": 2.0736162662506104, + "learning_rate": 4.339043347828268e-05, + "loss": 4.1281, + "step": 39833 + }, + { + "epoch": 0.2369040822152441, + "grad_norm": 1.6644777059555054, + "learning_rate": 4.339011706377545e-05, + "loss": 4.5578, + "step": 39834 + }, + { + "epoch": 0.23691002949852508, + "grad_norm": 1.5516061782836914, + "learning_rate": 4.338980064284841e-05, + "loss": 4.5777, + "step": 39835 + }, + { + "epoch": 0.23691597678180606, + "grad_norm": 2.0236034393310547, + "learning_rate": 4.338948421550169e-05, + "loss": 4.5546, + "step": 39836 + }, + { + "epoch": 0.23692192406508708, + "grad_norm": 1.72527277469635, + "learning_rate": 4.3389167781735385e-05, + "loss": 4.6173, + "step": 39837 + }, + { + "epoch": 0.23692787134836807, + "grad_norm": 1.8590136766433716, + "learning_rate": 4.338885134154961e-05, + "loss": 4.3471, + "step": 39838 + }, + { + "epoch": 0.23693381863164906, + "grad_norm": 1.7647113800048828, + "learning_rate": 4.3388534894944484e-05, + "loss": 3.9139, + "step": 39839 + }, + { + "epoch": 0.23693976591493007, + "grad_norm": 2.5895087718963623, + "learning_rate": 4.33882184419201e-05, + "loss": 1.952, + "step": 39840 + }, + { + "epoch": 0.23694571319821106, + "grad_norm": 1.8349545001983643, + "learning_rate": 4.3387901982476586e-05, + "loss": 4.2027, + "step": 39841 + }, + { + "epoch": 0.23695166048149205, + "grad_norm": 2.599876880645752, + "learning_rate": 4.338758551661405e-05, + "loss": 1.963, + "step": 39842 + }, + { + "epoch": 0.23695760776477306, + "grad_norm": 2.0699830055236816, + "learning_rate": 4.338726904433259e-05, + "loss": 2.6252, + "step": 39843 + }, + { + "epoch": 0.23696355504805405, + "grad_norm": 2.20517635345459, + "learning_rate": 4.338695256563233e-05, + "loss": 1.1781, + "step": 39844 + }, + { + "epoch": 0.23696950233133504, + "grad_norm": 2.3173837661743164, + "learning_rate": 4.338663608051337e-05, + "loss": 0.9283, + "step": 39845 + }, + { + "epoch": 0.23697544961461606, + "grad_norm": 2.2324037551879883, + "learning_rate": 4.3386319588975836e-05, + "loss": 1.5728, + "step": 39846 + }, + { + "epoch": 0.23698139689789705, + "grad_norm": 1.6736335754394531, + "learning_rate": 4.338600309101981e-05, + "loss": 4.7163, + "step": 39847 + }, + { + "epoch": 0.23698734418117803, + "grad_norm": 1.7723100185394287, + "learning_rate": 4.338568658664543e-05, + "loss": 4.3372, + "step": 39848 + }, + { + "epoch": 0.23699329146445902, + "grad_norm": 2.2402212619781494, + "learning_rate": 4.33853700758528e-05, + "loss": 5.344, + "step": 39849 + }, + { + "epoch": 0.23699923874774004, + "grad_norm": 1.8501451015472412, + "learning_rate": 4.338505355864202e-05, + "loss": 5.0796, + "step": 39850 + }, + { + "epoch": 0.23700518603102103, + "grad_norm": 2.0170962810516357, + "learning_rate": 4.338473703501321e-05, + "loss": 4.6974, + "step": 39851 + }, + { + "epoch": 0.237011133314302, + "grad_norm": 1.8159914016723633, + "learning_rate": 4.338442050496648e-05, + "loss": 4.7729, + "step": 39852 + }, + { + "epoch": 0.23701708059758303, + "grad_norm": 1.6395304203033447, + "learning_rate": 4.338410396850194e-05, + "loss": 4.4962, + "step": 39853 + }, + { + "epoch": 0.23702302788086402, + "grad_norm": 1.4548068046569824, + "learning_rate": 4.338378742561969e-05, + "loss": 4.5226, + "step": 39854 + }, + { + "epoch": 0.237028975164145, + "grad_norm": 1.401894450187683, + "learning_rate": 4.3383470876319864e-05, + "loss": 4.4299, + "step": 39855 + }, + { + "epoch": 0.23703492244742602, + "grad_norm": 1.4954034090042114, + "learning_rate": 4.3383154320602556e-05, + "loss": 4.4422, + "step": 39856 + }, + { + "epoch": 0.237040869730707, + "grad_norm": 1.4662593603134155, + "learning_rate": 4.338283775846786e-05, + "loss": 4.4502, + "step": 39857 + }, + { + "epoch": 0.237046817013988, + "grad_norm": 1.4960722923278809, + "learning_rate": 4.3382521189915924e-05, + "loss": 4.2204, + "step": 39858 + }, + { + "epoch": 0.23705276429726901, + "grad_norm": 1.5249221324920654, + "learning_rate": 4.3382204614946845e-05, + "loss": 4.326, + "step": 39859 + }, + { + "epoch": 0.23705871158055, + "grad_norm": 1.4498658180236816, + "learning_rate": 4.338188803356071e-05, + "loss": 4.3848, + "step": 39860 + }, + { + "epoch": 0.237064658863831, + "grad_norm": 1.4516419172286987, + "learning_rate": 4.338157144575766e-05, + "loss": 4.3933, + "step": 39861 + }, + { + "epoch": 0.237070606147112, + "grad_norm": 1.4814791679382324, + "learning_rate": 4.338125485153779e-05, + "loss": 4.2899, + "step": 39862 + }, + { + "epoch": 0.237076553430393, + "grad_norm": 1.4463040828704834, + "learning_rate": 4.3380938250901216e-05, + "loss": 4.3085, + "step": 39863 + }, + { + "epoch": 0.23708250071367398, + "grad_norm": 1.3938040733337402, + "learning_rate": 4.338062164384804e-05, + "loss": 4.3688, + "step": 39864 + }, + { + "epoch": 0.237088447996955, + "grad_norm": 1.382372260093689, + "learning_rate": 4.3380305030378385e-05, + "loss": 4.2662, + "step": 39865 + }, + { + "epoch": 0.237094395280236, + "grad_norm": 1.8836345672607422, + "learning_rate": 4.337998841049235e-05, + "loss": 4.5557, + "step": 39866 + }, + { + "epoch": 0.23710034256351697, + "grad_norm": 1.430780053138733, + "learning_rate": 4.3379671784190056e-05, + "loss": 4.3896, + "step": 39867 + }, + { + "epoch": 0.237106289846798, + "grad_norm": 1.4985473155975342, + "learning_rate": 4.3379355151471606e-05, + "loss": 4.3321, + "step": 39868 + }, + { + "epoch": 0.23711223713007898, + "grad_norm": 1.4790635108947754, + "learning_rate": 4.337903851233711e-05, + "loss": 4.4599, + "step": 39869 + }, + { + "epoch": 0.23711818441335997, + "grad_norm": 1.7420741319656372, + "learning_rate": 4.337872186678669e-05, + "loss": 4.5288, + "step": 39870 + }, + { + "epoch": 0.23712413169664098, + "grad_norm": 2.139042615890503, + "learning_rate": 4.337840521482044e-05, + "loss": 4.4184, + "step": 39871 + }, + { + "epoch": 0.23713007897992197, + "grad_norm": 2.679811954498291, + "learning_rate": 4.337808855643848e-05, + "loss": 4.491, + "step": 39872 + }, + { + "epoch": 0.23713602626320296, + "grad_norm": 2.33467173576355, + "learning_rate": 4.3377771891640925e-05, + "loss": 4.4055, + "step": 39873 + }, + { + "epoch": 0.23714197354648398, + "grad_norm": 2.19638991355896, + "learning_rate": 4.3377455220427876e-05, + "loss": 4.4909, + "step": 39874 + }, + { + "epoch": 0.23714792082976496, + "grad_norm": 2.1641952991485596, + "learning_rate": 4.337713854279945e-05, + "loss": 4.3292, + "step": 39875 + }, + { + "epoch": 0.23715386811304595, + "grad_norm": 2.2148971557617188, + "learning_rate": 4.3376821858755746e-05, + "loss": 4.2625, + "step": 39876 + }, + { + "epoch": 0.23715981539632697, + "grad_norm": 2.1260080337524414, + "learning_rate": 4.337650516829689e-05, + "loss": 4.3795, + "step": 39877 + }, + { + "epoch": 0.23716576267960796, + "grad_norm": 1.9064221382141113, + "learning_rate": 4.3376188471422984e-05, + "loss": 4.28, + "step": 39878 + }, + { + "epoch": 0.23717170996288894, + "grad_norm": 1.8643522262573242, + "learning_rate": 4.337587176813414e-05, + "loss": 4.2244, + "step": 39879 + }, + { + "epoch": 0.23717765724616996, + "grad_norm": 2.170990467071533, + "learning_rate": 4.337555505843047e-05, + "loss": 4.2965, + "step": 39880 + }, + { + "epoch": 0.23718360452945095, + "grad_norm": 1.8632001876831055, + "learning_rate": 4.3375238342312084e-05, + "loss": 4.3296, + "step": 39881 + }, + { + "epoch": 0.23718955181273194, + "grad_norm": 1.8718262910842896, + "learning_rate": 4.33749216197791e-05, + "loss": 4.2258, + "step": 39882 + }, + { + "epoch": 0.23719549909601295, + "grad_norm": 1.9377762079238892, + "learning_rate": 4.3374604890831605e-05, + "loss": 4.1339, + "step": 39883 + }, + { + "epoch": 0.23720144637929394, + "grad_norm": 1.8045750856399536, + "learning_rate": 4.3374288155469736e-05, + "loss": 4.1913, + "step": 39884 + }, + { + "epoch": 0.23720739366257493, + "grad_norm": 2.4247703552246094, + "learning_rate": 4.3373971413693584e-05, + "loss": 4.4062, + "step": 39885 + }, + { + "epoch": 0.23721334094585594, + "grad_norm": 2.441964864730835, + "learning_rate": 4.337365466550328e-05, + "loss": 4.3062, + "step": 39886 + }, + { + "epoch": 0.23721928822913693, + "grad_norm": 2.0665531158447266, + "learning_rate": 4.3373337910898914e-05, + "loss": 4.1877, + "step": 39887 + }, + { + "epoch": 0.23722523551241792, + "grad_norm": 1.751538872718811, + "learning_rate": 4.3373021149880614e-05, + "loss": 4.0803, + "step": 39888 + }, + { + "epoch": 0.23723118279569894, + "grad_norm": 3.0823750495910645, + "learning_rate": 4.337270438244847e-05, + "loss": 3.9962, + "step": 39889 + }, + { + "epoch": 0.23723713007897992, + "grad_norm": 1.7145901918411255, + "learning_rate": 4.337238760860261e-05, + "loss": 4.9209, + "step": 39890 + }, + { + "epoch": 0.2372430773622609, + "grad_norm": 1.8586928844451904, + "learning_rate": 4.337207082834315e-05, + "loss": 4.7399, + "step": 39891 + }, + { + "epoch": 0.23724902464554193, + "grad_norm": 1.9576743841171265, + "learning_rate": 4.337175404167018e-05, + "loss": 4.8797, + "step": 39892 + }, + { + "epoch": 0.23725497192882292, + "grad_norm": 1.6683032512664795, + "learning_rate": 4.337143724858381e-05, + "loss": 5.1349, + "step": 39893 + }, + { + "epoch": 0.2372609192121039, + "grad_norm": 1.7969902753829956, + "learning_rate": 4.337112044908418e-05, + "loss": 5.1658, + "step": 39894 + }, + { + "epoch": 0.23726686649538492, + "grad_norm": 2.2213234901428223, + "learning_rate": 4.337080364317137e-05, + "loss": 4.9434, + "step": 39895 + }, + { + "epoch": 0.2372728137786659, + "grad_norm": 2.1538355350494385, + "learning_rate": 4.3370486830845507e-05, + "loss": 4.5511, + "step": 39896 + }, + { + "epoch": 0.2372787610619469, + "grad_norm": 2.237603187561035, + "learning_rate": 4.3370170012106694e-05, + "loss": 3.7551, + "step": 39897 + }, + { + "epoch": 0.2372847083452279, + "grad_norm": 3.0955090522766113, + "learning_rate": 4.336985318695505e-05, + "loss": 2.503, + "step": 39898 + }, + { + "epoch": 0.2372906556285089, + "grad_norm": 1.9793435335159302, + "learning_rate": 4.3369536355390675e-05, + "loss": 4.4538, + "step": 39899 + }, + { + "epoch": 0.2372966029117899, + "grad_norm": 2.1285853385925293, + "learning_rate": 4.3369219517413684e-05, + "loss": 4.3584, + "step": 39900 + }, + { + "epoch": 0.2373025501950709, + "grad_norm": 1.7009873390197754, + "learning_rate": 4.3368902673024194e-05, + "loss": 4.6289, + "step": 39901 + }, + { + "epoch": 0.2373084974783519, + "grad_norm": 1.7879126071929932, + "learning_rate": 4.3368585822222304e-05, + "loss": 4.2106, + "step": 39902 + }, + { + "epoch": 0.23731444476163288, + "grad_norm": 1.7139616012573242, + "learning_rate": 4.336826896500814e-05, + "loss": 4.5923, + "step": 39903 + }, + { + "epoch": 0.2373203920449139, + "grad_norm": 1.5922623872756958, + "learning_rate": 4.336795210138179e-05, + "loss": 4.4593, + "step": 39904 + }, + { + "epoch": 0.23732633932819489, + "grad_norm": 1.6713234186172485, + "learning_rate": 4.3367635231343384e-05, + "loss": 4.2166, + "step": 39905 + }, + { + "epoch": 0.23733228661147587, + "grad_norm": 1.633577585220337, + "learning_rate": 4.3367318354893025e-05, + "loss": 4.4896, + "step": 39906 + }, + { + "epoch": 0.23733823389475686, + "grad_norm": 1.6591612100601196, + "learning_rate": 4.3367001472030824e-05, + "loss": 4.173, + "step": 39907 + }, + { + "epoch": 0.23734418117803788, + "grad_norm": 1.8667633533477783, + "learning_rate": 4.3366684582756895e-05, + "loss": 4.2637, + "step": 39908 + }, + { + "epoch": 0.23735012846131887, + "grad_norm": 1.6186610460281372, + "learning_rate": 4.3366367687071346e-05, + "loss": 4.3127, + "step": 39909 + }, + { + "epoch": 0.23735607574459985, + "grad_norm": 1.8370599746704102, + "learning_rate": 4.336605078497429e-05, + "loss": 4.4424, + "step": 39910 + }, + { + "epoch": 0.23736202302788087, + "grad_norm": 1.812067985534668, + "learning_rate": 4.336573387646583e-05, + "loss": 5.2419, + "step": 39911 + }, + { + "epoch": 0.23736797031116186, + "grad_norm": 2.028104066848755, + "learning_rate": 4.336541696154608e-05, + "loss": 4.8321, + "step": 39912 + }, + { + "epoch": 0.23737391759444285, + "grad_norm": 2.516324996948242, + "learning_rate": 4.336510004021516e-05, + "loss": 3.8673, + "step": 39913 + }, + { + "epoch": 0.23737986487772386, + "grad_norm": 2.5107903480529785, + "learning_rate": 4.336478311247317e-05, + "loss": 3.7475, + "step": 39914 + }, + { + "epoch": 0.23738581216100485, + "grad_norm": 2.4185755252838135, + "learning_rate": 4.336446617832023e-05, + "loss": 3.7617, + "step": 39915 + }, + { + "epoch": 0.23739175944428584, + "grad_norm": 1.93293297290802, + "learning_rate": 4.336414923775644e-05, + "loss": 4.3185, + "step": 39916 + }, + { + "epoch": 0.23739770672756685, + "grad_norm": 1.7484050989151, + "learning_rate": 4.336383229078191e-05, + "loss": 5.1168, + "step": 39917 + }, + { + "epoch": 0.23740365401084784, + "grad_norm": 1.5135313272476196, + "learning_rate": 4.336351533739676e-05, + "loss": 4.9183, + "step": 39918 + }, + { + "epoch": 0.23740960129412883, + "grad_norm": 1.8860149383544922, + "learning_rate": 4.33631983776011e-05, + "loss": 5.0206, + "step": 39919 + }, + { + "epoch": 0.23741554857740985, + "grad_norm": 1.641844391822815, + "learning_rate": 4.336288141139503e-05, + "loss": 4.6539, + "step": 39920 + }, + { + "epoch": 0.23742149586069083, + "grad_norm": 1.7509504556655884, + "learning_rate": 4.336256443877867e-05, + "loss": 4.6623, + "step": 39921 + }, + { + "epoch": 0.23742744314397182, + "grad_norm": 1.7655612230300903, + "learning_rate": 4.3362247459752135e-05, + "loss": 5.5298, + "step": 39922 + }, + { + "epoch": 0.23743339042725284, + "grad_norm": 2.3678815364837646, + "learning_rate": 4.3361930474315524e-05, + "loss": 3.7642, + "step": 39923 + }, + { + "epoch": 0.23743933771053383, + "grad_norm": 3.0474207401275635, + "learning_rate": 4.3361613482468954e-05, + "loss": 3.3554, + "step": 39924 + }, + { + "epoch": 0.23744528499381481, + "grad_norm": 3.052656412124634, + "learning_rate": 4.3361296484212534e-05, + "loss": 3.2537, + "step": 39925 + }, + { + "epoch": 0.23745123227709583, + "grad_norm": 3.0903141498565674, + "learning_rate": 4.336097947954637e-05, + "loss": 3.2136, + "step": 39926 + }, + { + "epoch": 0.23745717956037682, + "grad_norm": 2.6233386993408203, + "learning_rate": 4.336066246847058e-05, + "loss": 3.3688, + "step": 39927 + }, + { + "epoch": 0.2374631268436578, + "grad_norm": 3.0395944118499756, + "learning_rate": 4.336034545098528e-05, + "loss": 3.4725, + "step": 39928 + }, + { + "epoch": 0.23746907412693882, + "grad_norm": 2.7053802013397217, + "learning_rate": 4.336002842709057e-05, + "loss": 3.299, + "step": 39929 + }, + { + "epoch": 0.2374750214102198, + "grad_norm": 2.8455517292022705, + "learning_rate": 4.3359711396786554e-05, + "loss": 3.2357, + "step": 39930 + }, + { + "epoch": 0.2374809686935008, + "grad_norm": 2.790203332901001, + "learning_rate": 4.335939436007336e-05, + "loss": 3.0324, + "step": 39931 + }, + { + "epoch": 0.23748691597678181, + "grad_norm": 2.6323273181915283, + "learning_rate": 4.3359077316951096e-05, + "loss": 3.3338, + "step": 39932 + }, + { + "epoch": 0.2374928632600628, + "grad_norm": 1.6055479049682617, + "learning_rate": 4.335876026741986e-05, + "loss": 4.5655, + "step": 39933 + }, + { + "epoch": 0.2374988105433438, + "grad_norm": 2.0111827850341797, + "learning_rate": 4.335844321147978e-05, + "loss": 5.0737, + "step": 39934 + }, + { + "epoch": 0.2375047578266248, + "grad_norm": 1.6341081857681274, + "learning_rate": 4.3358126149130944e-05, + "loss": 5.0941, + "step": 39935 + }, + { + "epoch": 0.2375107051099058, + "grad_norm": 1.9143885374069214, + "learning_rate": 4.3357809080373484e-05, + "loss": 4.1743, + "step": 39936 + }, + { + "epoch": 0.23751665239318678, + "grad_norm": 1.6839019060134888, + "learning_rate": 4.33574920052075e-05, + "loss": 4.6477, + "step": 39937 + }, + { + "epoch": 0.2375225996764678, + "grad_norm": 1.8571311235427856, + "learning_rate": 4.335717492363311e-05, + "loss": 4.8121, + "step": 39938 + }, + { + "epoch": 0.2375285469597488, + "grad_norm": 1.5011353492736816, + "learning_rate": 4.335685783565041e-05, + "loss": 4.7521, + "step": 39939 + }, + { + "epoch": 0.23753449424302978, + "grad_norm": 2.686401844024658, + "learning_rate": 4.335654074125953e-05, + "loss": 3.4226, + "step": 39940 + }, + { + "epoch": 0.2375404415263108, + "grad_norm": 3.0526058673858643, + "learning_rate": 4.335622364046057e-05, + "loss": 3.235, + "step": 39941 + }, + { + "epoch": 0.23754638880959178, + "grad_norm": 3.0678353309631348, + "learning_rate": 4.3355906533253636e-05, + "loss": 3.3255, + "step": 39942 + }, + { + "epoch": 0.23755233609287277, + "grad_norm": 2.445336103439331, + "learning_rate": 4.335558941963885e-05, + "loss": 3.5664, + "step": 39943 + }, + { + "epoch": 0.23755828337615378, + "grad_norm": 2.646639823913574, + "learning_rate": 4.3355272299616314e-05, + "loss": 3.4817, + "step": 39944 + }, + { + "epoch": 0.23756423065943477, + "grad_norm": 2.4064605236053467, + "learning_rate": 4.335495517318614e-05, + "loss": 3.326, + "step": 39945 + }, + { + "epoch": 0.23757017794271576, + "grad_norm": 2.666252613067627, + "learning_rate": 4.335463804034845e-05, + "loss": 2.9059, + "step": 39946 + }, + { + "epoch": 0.23757612522599678, + "grad_norm": 2.5794105529785156, + "learning_rate": 4.3354320901103344e-05, + "loss": 3.6414, + "step": 39947 + }, + { + "epoch": 0.23758207250927776, + "grad_norm": 2.137204885482788, + "learning_rate": 4.3354003755450925e-05, + "loss": 4.2312, + "step": 39948 + }, + { + "epoch": 0.23758801979255875, + "grad_norm": 2.2329344749450684, + "learning_rate": 4.335368660339132e-05, + "loss": 4.108, + "step": 39949 + }, + { + "epoch": 0.23759396707583977, + "grad_norm": 2.0138213634490967, + "learning_rate": 4.335336944492463e-05, + "loss": 4.0529, + "step": 39950 + }, + { + "epoch": 0.23759991435912076, + "grad_norm": 2.558507204055786, + "learning_rate": 4.335305228005097e-05, + "loss": 3.784, + "step": 39951 + }, + { + "epoch": 0.23760586164240174, + "grad_norm": 2.3928165435791016, + "learning_rate": 4.335273510877045e-05, + "loss": 3.8229, + "step": 39952 + }, + { + "epoch": 0.23761180892568276, + "grad_norm": 2.2186508178710938, + "learning_rate": 4.335241793108318e-05, + "loss": 3.7454, + "step": 39953 + }, + { + "epoch": 0.23761775620896375, + "grad_norm": 2.49245285987854, + "learning_rate": 4.3352100746989264e-05, + "loss": 3.5622, + "step": 39954 + }, + { + "epoch": 0.23762370349224474, + "grad_norm": 2.2493436336517334, + "learning_rate": 4.335178355648882e-05, + "loss": 4.0303, + "step": 39955 + }, + { + "epoch": 0.23762965077552575, + "grad_norm": 2.332967519760132, + "learning_rate": 4.335146635958197e-05, + "loss": 3.5922, + "step": 39956 + }, + { + "epoch": 0.23763559805880674, + "grad_norm": 2.505335569381714, + "learning_rate": 4.33511491562688e-05, + "loss": 3.7643, + "step": 39957 + }, + { + "epoch": 0.23764154534208773, + "grad_norm": 2.923208713531494, + "learning_rate": 4.335083194654944e-05, + "loss": 3.5873, + "step": 39958 + }, + { + "epoch": 0.23764749262536874, + "grad_norm": 2.361135244369507, + "learning_rate": 4.3350514730424e-05, + "loss": 3.877, + "step": 39959 + }, + { + "epoch": 0.23765343990864973, + "grad_norm": 2.3764545917510986, + "learning_rate": 4.335019750789257e-05, + "loss": 3.8729, + "step": 39960 + }, + { + "epoch": 0.23765938719193072, + "grad_norm": 2.5335628986358643, + "learning_rate": 4.334988027895528e-05, + "loss": 3.7511, + "step": 39961 + }, + { + "epoch": 0.23766533447521174, + "grad_norm": 2.3174009323120117, + "learning_rate": 4.334956304361224e-05, + "loss": 3.575, + "step": 39962 + }, + { + "epoch": 0.23767128175849273, + "grad_norm": 2.284850597381592, + "learning_rate": 4.334924580186356e-05, + "loss": 3.6594, + "step": 39963 + }, + { + "epoch": 0.2376772290417737, + "grad_norm": 2.640793561935425, + "learning_rate": 4.3348928553709345e-05, + "loss": 3.7082, + "step": 39964 + }, + { + "epoch": 0.2376831763250547, + "grad_norm": 2.5589759349823, + "learning_rate": 4.33486112991497e-05, + "loss": 3.461, + "step": 39965 + }, + { + "epoch": 0.23768912360833572, + "grad_norm": 2.692124605178833, + "learning_rate": 4.334829403818476e-05, + "loss": 3.6977, + "step": 39966 + }, + { + "epoch": 0.2376950708916167, + "grad_norm": 2.029341220855713, + "learning_rate": 4.3347976770814605e-05, + "loss": 4.5998, + "step": 39967 + }, + { + "epoch": 0.2377010181748977, + "grad_norm": 2.0593783855438232, + "learning_rate": 4.3347659497039373e-05, + "loss": 4.7179, + "step": 39968 + }, + { + "epoch": 0.2377069654581787, + "grad_norm": 1.934889793395996, + "learning_rate": 4.3347342216859156e-05, + "loss": 4.5182, + "step": 39969 + }, + { + "epoch": 0.2377129127414597, + "grad_norm": 1.9339655637741089, + "learning_rate": 4.334702493027407e-05, + "loss": 4.9809, + "step": 39970 + }, + { + "epoch": 0.23771886002474069, + "grad_norm": 1.7704025506973267, + "learning_rate": 4.3346707637284234e-05, + "loss": 4.657, + "step": 39971 + }, + { + "epoch": 0.2377248073080217, + "grad_norm": 1.9846539497375488, + "learning_rate": 4.3346390337889745e-05, + "loss": 4.9341, + "step": 39972 + }, + { + "epoch": 0.2377307545913027, + "grad_norm": 1.8515028953552246, + "learning_rate": 4.3346073032090725e-05, + "loss": 4.6859, + "step": 39973 + }, + { + "epoch": 0.23773670187458368, + "grad_norm": 1.6280958652496338, + "learning_rate": 4.334575571988728e-05, + "loss": 5.1042, + "step": 39974 + }, + { + "epoch": 0.2377426491578647, + "grad_norm": 2.0795865058898926, + "learning_rate": 4.334543840127952e-05, + "loss": 4.9098, + "step": 39975 + }, + { + "epoch": 0.23774859644114568, + "grad_norm": 1.8528962135314941, + "learning_rate": 4.334512107626756e-05, + "loss": 4.892, + "step": 39976 + }, + { + "epoch": 0.23775454372442667, + "grad_norm": 1.7945277690887451, + "learning_rate": 4.33448037448515e-05, + "loss": 4.9786, + "step": 39977 + }, + { + "epoch": 0.23776049100770769, + "grad_norm": 1.6035569906234741, + "learning_rate": 4.334448640703147e-05, + "loss": 4.7732, + "step": 39978 + }, + { + "epoch": 0.23776643829098867, + "grad_norm": 1.4391299486160278, + "learning_rate": 4.334416906280756e-05, + "loss": 4.9093, + "step": 39979 + }, + { + "epoch": 0.23777238557426966, + "grad_norm": 1.7167659997940063, + "learning_rate": 4.33438517121799e-05, + "loss": 4.9274, + "step": 39980 + }, + { + "epoch": 0.23777833285755068, + "grad_norm": 1.7619572877883911, + "learning_rate": 4.334353435514857e-05, + "loss": 4.4422, + "step": 39981 + }, + { + "epoch": 0.23778428014083167, + "grad_norm": 1.413558840751648, + "learning_rate": 4.334321699171372e-05, + "loss": 4.3499, + "step": 39982 + }, + { + "epoch": 0.23779022742411265, + "grad_norm": 1.6296491622924805, + "learning_rate": 4.334289962187544e-05, + "loss": 4.6277, + "step": 39983 + }, + { + "epoch": 0.23779617470739367, + "grad_norm": 1.4150809049606323, + "learning_rate": 4.334258224563384e-05, + "loss": 4.5372, + "step": 39984 + }, + { + "epoch": 0.23780212199067466, + "grad_norm": 1.6175013780593872, + "learning_rate": 4.334226486298904e-05, + "loss": 4.9062, + "step": 39985 + }, + { + "epoch": 0.23780806927395565, + "grad_norm": 1.5687006711959839, + "learning_rate": 4.334194747394114e-05, + "loss": 4.9876, + "step": 39986 + }, + { + "epoch": 0.23781401655723666, + "grad_norm": 1.5041331052780151, + "learning_rate": 4.3341630078490254e-05, + "loss": 4.8733, + "step": 39987 + }, + { + "epoch": 0.23781996384051765, + "grad_norm": 1.4065840244293213, + "learning_rate": 4.334131267663649e-05, + "loss": 5.0474, + "step": 39988 + }, + { + "epoch": 0.23782591112379864, + "grad_norm": 1.4845675230026245, + "learning_rate": 4.334099526837997e-05, + "loss": 5.2594, + "step": 39989 + }, + { + "epoch": 0.23783185840707965, + "grad_norm": 1.597825050354004, + "learning_rate": 4.33406778537208e-05, + "loss": 4.6358, + "step": 39990 + }, + { + "epoch": 0.23783780569036064, + "grad_norm": 1.440100073814392, + "learning_rate": 4.334036043265909e-05, + "loss": 4.3406, + "step": 39991 + }, + { + "epoch": 0.23784375297364163, + "grad_norm": 1.7690058946609497, + "learning_rate": 4.334004300519494e-05, + "loss": 5.0523, + "step": 39992 + }, + { + "epoch": 0.23784970025692265, + "grad_norm": 1.6725897789001465, + "learning_rate": 4.333972557132848e-05, + "loss": 4.3085, + "step": 39993 + }, + { + "epoch": 0.23785564754020364, + "grad_norm": 1.5345070362091064, + "learning_rate": 4.333940813105981e-05, + "loss": 4.8832, + "step": 39994 + }, + { + "epoch": 0.23786159482348462, + "grad_norm": 1.42098069190979, + "learning_rate": 4.333909068438904e-05, + "loss": 4.3638, + "step": 39995 + }, + { + "epoch": 0.23786754210676564, + "grad_norm": 1.3835852146148682, + "learning_rate": 4.333877323131628e-05, + "loss": 4.4911, + "step": 39996 + }, + { + "epoch": 0.23787348939004663, + "grad_norm": 1.519081950187683, + "learning_rate": 4.3338455771841645e-05, + "loss": 4.4199, + "step": 39997 + }, + { + "epoch": 0.23787943667332762, + "grad_norm": 1.619184970855713, + "learning_rate": 4.333813830596525e-05, + "loss": 4.9006, + "step": 39998 + }, + { + "epoch": 0.23788538395660863, + "grad_norm": 1.7782379388809204, + "learning_rate": 4.333782083368719e-05, + "loss": 4.9271, + "step": 39999 + }, + { + "epoch": 0.23789133123988962, + "grad_norm": 1.3998247385025024, + "learning_rate": 4.33375033550076e-05, + "loss": 4.9854, + "step": 40000 + }, + { + "epoch": 0.2378972785231706, + "grad_norm": 1.5742835998535156, + "learning_rate": 4.333718586992657e-05, + "loss": 4.8919, + "step": 40001 + }, + { + "epoch": 0.23790322580645162, + "grad_norm": 1.4051644802093506, + "learning_rate": 4.333686837844422e-05, + "loss": 4.6338, + "step": 40002 + }, + { + "epoch": 0.2379091730897326, + "grad_norm": 1.5271607637405396, + "learning_rate": 4.333655088056065e-05, + "loss": 4.8598, + "step": 40003 + }, + { + "epoch": 0.2379151203730136, + "grad_norm": 1.592507004737854, + "learning_rate": 4.3336233376275986e-05, + "loss": 4.8012, + "step": 40004 + }, + { + "epoch": 0.23792106765629462, + "grad_norm": 1.5740526914596558, + "learning_rate": 4.3335915865590335e-05, + "loss": 4.7146, + "step": 40005 + }, + { + "epoch": 0.2379270149395756, + "grad_norm": 1.5120583772659302, + "learning_rate": 4.3335598348503805e-05, + "loss": 4.877, + "step": 40006 + }, + { + "epoch": 0.2379329622228566, + "grad_norm": 1.8907583951950073, + "learning_rate": 4.33352808250165e-05, + "loss": 4.6501, + "step": 40007 + }, + { + "epoch": 0.2379389095061376, + "grad_norm": 2.0611941814422607, + "learning_rate": 4.333496329512854e-05, + "loss": 4.5835, + "step": 40008 + }, + { + "epoch": 0.2379448567894186, + "grad_norm": 2.002488374710083, + "learning_rate": 4.333464575884004e-05, + "loss": 4.7173, + "step": 40009 + }, + { + "epoch": 0.23795080407269958, + "grad_norm": 1.9386903047561646, + "learning_rate": 4.33343282161511e-05, + "loss": 4.7096, + "step": 40010 + }, + { + "epoch": 0.2379567513559806, + "grad_norm": 2.2310421466827393, + "learning_rate": 4.3334010667061834e-05, + "loss": 5.0001, + "step": 40011 + }, + { + "epoch": 0.2379626986392616, + "grad_norm": 2.2596018314361572, + "learning_rate": 4.3333693111572354e-05, + "loss": 4.4366, + "step": 40012 + }, + { + "epoch": 0.23796864592254258, + "grad_norm": 2.055948495864868, + "learning_rate": 4.3333375549682776e-05, + "loss": 4.6691, + "step": 40013 + }, + { + "epoch": 0.2379745932058236, + "grad_norm": 1.7055249214172363, + "learning_rate": 4.33330579813932e-05, + "loss": 4.3938, + "step": 40014 + }, + { + "epoch": 0.23798054048910458, + "grad_norm": 2.071561574935913, + "learning_rate": 4.3332740406703746e-05, + "loss": 4.4613, + "step": 40015 + }, + { + "epoch": 0.23798648777238557, + "grad_norm": 2.3190622329711914, + "learning_rate": 4.3332422825614525e-05, + "loss": 3.6661, + "step": 40016 + }, + { + "epoch": 0.23799243505566658, + "grad_norm": 1.5481516122817993, + "learning_rate": 4.333210523812564e-05, + "loss": 4.7479, + "step": 40017 + }, + { + "epoch": 0.23799838233894757, + "grad_norm": 2.146573781967163, + "learning_rate": 4.333178764423721e-05, + "loss": 4.1624, + "step": 40018 + }, + { + "epoch": 0.23800432962222856, + "grad_norm": 1.9364041090011597, + "learning_rate": 4.333147004394934e-05, + "loss": 3.9933, + "step": 40019 + }, + { + "epoch": 0.23801027690550958, + "grad_norm": 1.8397248983383179, + "learning_rate": 4.333115243726214e-05, + "loss": 4.2996, + "step": 40020 + }, + { + "epoch": 0.23801622418879056, + "grad_norm": 1.4978671073913574, + "learning_rate": 4.333083482417573e-05, + "loss": 4.2617, + "step": 40021 + }, + { + "epoch": 0.23802217147207155, + "grad_norm": 1.7238423824310303, + "learning_rate": 4.3330517204690216e-05, + "loss": 4.7362, + "step": 40022 + }, + { + "epoch": 0.23802811875535254, + "grad_norm": 1.2663583755493164, + "learning_rate": 4.3330199578805705e-05, + "loss": 4.2942, + "step": 40023 + }, + { + "epoch": 0.23803406603863356, + "grad_norm": 1.5082536935806274, + "learning_rate": 4.332988194652231e-05, + "loss": 4.5709, + "step": 40024 + }, + { + "epoch": 0.23804001332191455, + "grad_norm": 1.4612239599227905, + "learning_rate": 4.332956430784014e-05, + "loss": 4.9265, + "step": 40025 + }, + { + "epoch": 0.23804596060519553, + "grad_norm": 1.694326400756836, + "learning_rate": 4.3329246662759314e-05, + "loss": 4.8142, + "step": 40026 + }, + { + "epoch": 0.23805190788847655, + "grad_norm": 1.7012838125228882, + "learning_rate": 4.3328929011279935e-05, + "loss": 4.4969, + "step": 40027 + }, + { + "epoch": 0.23805785517175754, + "grad_norm": 1.7286322116851807, + "learning_rate": 4.3328611353402116e-05, + "loss": 4.5914, + "step": 40028 + }, + { + "epoch": 0.23806380245503853, + "grad_norm": 1.7502204179763794, + "learning_rate": 4.3328293689125966e-05, + "loss": 4.5044, + "step": 40029 + }, + { + "epoch": 0.23806974973831954, + "grad_norm": 1.6707364320755005, + "learning_rate": 4.33279760184516e-05, + "loss": 4.6059, + "step": 40030 + }, + { + "epoch": 0.23807569702160053, + "grad_norm": 1.8060898780822754, + "learning_rate": 4.332765834137913e-05, + "loss": 4.465, + "step": 40031 + }, + { + "epoch": 0.23808164430488152, + "grad_norm": 1.6567474603652954, + "learning_rate": 4.3327340657908654e-05, + "loss": 4.5328, + "step": 40032 + }, + { + "epoch": 0.23808759158816253, + "grad_norm": 1.4679162502288818, + "learning_rate": 4.3327022968040305e-05, + "loss": 4.4821, + "step": 40033 + }, + { + "epoch": 0.23809353887144352, + "grad_norm": 2.05436372756958, + "learning_rate": 4.3326705271774174e-05, + "loss": 4.5437, + "step": 40034 + }, + { + "epoch": 0.2380994861547245, + "grad_norm": 1.8400733470916748, + "learning_rate": 4.332638756911038e-05, + "loss": 4.5604, + "step": 40035 + }, + { + "epoch": 0.23810543343800553, + "grad_norm": 1.5162451267242432, + "learning_rate": 4.332606986004903e-05, + "loss": 4.3718, + "step": 40036 + }, + { + "epoch": 0.23811138072128651, + "grad_norm": 2.398054838180542, + "learning_rate": 4.3325752144590245e-05, + "loss": 4.0524, + "step": 40037 + }, + { + "epoch": 0.2381173280045675, + "grad_norm": 2.91900372505188, + "learning_rate": 4.332543442273414e-05, + "loss": 3.5671, + "step": 40038 + }, + { + "epoch": 0.23812327528784852, + "grad_norm": 3.0178847312927246, + "learning_rate": 4.3325116694480796e-05, + "loss": 3.1679, + "step": 40039 + }, + { + "epoch": 0.2381292225711295, + "grad_norm": 3.197305679321289, + "learning_rate": 4.332479895983035e-05, + "loss": 3.2881, + "step": 40040 + }, + { + "epoch": 0.2381351698544105, + "grad_norm": 2.9422004222869873, + "learning_rate": 4.332448121878291e-05, + "loss": 3.4425, + "step": 40041 + }, + { + "epoch": 0.2381411171376915, + "grad_norm": 2.550016164779663, + "learning_rate": 4.332416347133858e-05, + "loss": 3.1234, + "step": 40042 + }, + { + "epoch": 0.2381470644209725, + "grad_norm": 3.0863475799560547, + "learning_rate": 4.332384571749747e-05, + "loss": 3.3676, + "step": 40043 + }, + { + "epoch": 0.2381530117042535, + "grad_norm": 2.9887166023254395, + "learning_rate": 4.332352795725969e-05, + "loss": 3.255, + "step": 40044 + }, + { + "epoch": 0.2381589589875345, + "grad_norm": 2.4473114013671875, + "learning_rate": 4.332321019062537e-05, + "loss": 3.4324, + "step": 40045 + }, + { + "epoch": 0.2381649062708155, + "grad_norm": 2.2765254974365234, + "learning_rate": 4.33228924175946e-05, + "loss": 4.1904, + "step": 40046 + }, + { + "epoch": 0.23817085355409648, + "grad_norm": 1.8198360204696655, + "learning_rate": 4.332257463816749e-05, + "loss": 5.3108, + "step": 40047 + }, + { + "epoch": 0.2381768008373775, + "grad_norm": 2.0592474937438965, + "learning_rate": 4.3322256852344166e-05, + "loss": 4.6324, + "step": 40048 + }, + { + "epoch": 0.23818274812065848, + "grad_norm": 1.931580901145935, + "learning_rate": 4.332193906012474e-05, + "loss": 4.6619, + "step": 40049 + }, + { + "epoch": 0.23818869540393947, + "grad_norm": 2.0462899208068848, + "learning_rate": 4.3321621261509306e-05, + "loss": 4.8194, + "step": 40050 + }, + { + "epoch": 0.2381946426872205, + "grad_norm": 2.037546396255493, + "learning_rate": 4.332130345649798e-05, + "loss": 4.4479, + "step": 40051 + }, + { + "epoch": 0.23820058997050148, + "grad_norm": 1.5595238208770752, + "learning_rate": 4.332098564509087e-05, + "loss": 4.6994, + "step": 40052 + }, + { + "epoch": 0.23820653725378246, + "grad_norm": 1.9503674507141113, + "learning_rate": 4.332066782728811e-05, + "loss": 4.4081, + "step": 40053 + }, + { + "epoch": 0.23821248453706348, + "grad_norm": 1.7241960763931274, + "learning_rate": 4.332035000308978e-05, + "loss": 4.3859, + "step": 40054 + }, + { + "epoch": 0.23821843182034447, + "grad_norm": 1.6559135913848877, + "learning_rate": 4.3320032172496016e-05, + "loss": 4.3564, + "step": 40055 + }, + { + "epoch": 0.23822437910362546, + "grad_norm": 1.6500284671783447, + "learning_rate": 4.3319714335506914e-05, + "loss": 4.3654, + "step": 40056 + }, + { + "epoch": 0.23823032638690647, + "grad_norm": 1.5794562101364136, + "learning_rate": 4.331939649212259e-05, + "loss": 4.5338, + "step": 40057 + }, + { + "epoch": 0.23823627367018746, + "grad_norm": 1.6785868406295776, + "learning_rate": 4.331907864234315e-05, + "loss": 4.2581, + "step": 40058 + }, + { + "epoch": 0.23824222095346845, + "grad_norm": 1.7870590686798096, + "learning_rate": 4.331876078616871e-05, + "loss": 4.4147, + "step": 40059 + }, + { + "epoch": 0.23824816823674946, + "grad_norm": 1.5093984603881836, + "learning_rate": 4.331844292359939e-05, + "loss": 4.3236, + "step": 40060 + }, + { + "epoch": 0.23825411552003045, + "grad_norm": 1.8252619504928589, + "learning_rate": 4.3318125054635275e-05, + "loss": 4.3121, + "step": 40061 + }, + { + "epoch": 0.23826006280331144, + "grad_norm": 1.6924525499343872, + "learning_rate": 4.33178071792765e-05, + "loss": 4.3311, + "step": 40062 + }, + { + "epoch": 0.23826601008659246, + "grad_norm": 1.5435535907745361, + "learning_rate": 4.331748929752317e-05, + "loss": 4.0738, + "step": 40063 + }, + { + "epoch": 0.23827195736987344, + "grad_norm": 1.8041505813598633, + "learning_rate": 4.331717140937539e-05, + "loss": 3.8226, + "step": 40064 + }, + { + "epoch": 0.23827790465315443, + "grad_norm": 1.6551004648208618, + "learning_rate": 4.3316853514833276e-05, + "loss": 4.0668, + "step": 40065 + }, + { + "epoch": 0.23828385193643545, + "grad_norm": 1.5097968578338623, + "learning_rate": 4.3316535613896934e-05, + "loss": 4.234, + "step": 40066 + }, + { + "epoch": 0.23828979921971644, + "grad_norm": 1.7234817743301392, + "learning_rate": 4.331621770656648e-05, + "loss": 4.0366, + "step": 40067 + }, + { + "epoch": 0.23829574650299742, + "grad_norm": 1.79156494140625, + "learning_rate": 4.331589979284203e-05, + "loss": 4.0189, + "step": 40068 + }, + { + "epoch": 0.23830169378627844, + "grad_norm": 1.6798564195632935, + "learning_rate": 4.3315581872723684e-05, + "loss": 4.0849, + "step": 40069 + }, + { + "epoch": 0.23830764106955943, + "grad_norm": 1.832800030708313, + "learning_rate": 4.3315263946211555e-05, + "loss": 4.1124, + "step": 40070 + }, + { + "epoch": 0.23831358835284042, + "grad_norm": 1.976760983467102, + "learning_rate": 4.3314946013305756e-05, + "loss": 4.2555, + "step": 40071 + }, + { + "epoch": 0.23831953563612143, + "grad_norm": 2.0142438411712646, + "learning_rate": 4.3314628074006404e-05, + "loss": 4.2094, + "step": 40072 + }, + { + "epoch": 0.23832548291940242, + "grad_norm": 2.068956136703491, + "learning_rate": 4.33143101283136e-05, + "loss": 4.9876, + "step": 40073 + }, + { + "epoch": 0.2383314302026834, + "grad_norm": 1.8501027822494507, + "learning_rate": 4.3313992176227465e-05, + "loss": 5.0769, + "step": 40074 + }, + { + "epoch": 0.23833737748596442, + "grad_norm": 1.8420876264572144, + "learning_rate": 4.33136742177481e-05, + "loss": 5.0828, + "step": 40075 + }, + { + "epoch": 0.2383433247692454, + "grad_norm": 1.8233364820480347, + "learning_rate": 4.331335625287562e-05, + "loss": 4.7292, + "step": 40076 + }, + { + "epoch": 0.2383492720525264, + "grad_norm": 1.6087543964385986, + "learning_rate": 4.331303828161014e-05, + "loss": 4.4481, + "step": 40077 + }, + { + "epoch": 0.23835521933580742, + "grad_norm": 1.5220482349395752, + "learning_rate": 4.331272030395177e-05, + "loss": 4.8301, + "step": 40078 + }, + { + "epoch": 0.2383611666190884, + "grad_norm": 1.5099562406539917, + "learning_rate": 4.331240231990061e-05, + "loss": 5.0819, + "step": 40079 + }, + { + "epoch": 0.2383671139023694, + "grad_norm": 1.813644528388977, + "learning_rate": 4.331208432945678e-05, + "loss": 5.299, + "step": 40080 + }, + { + "epoch": 0.23837306118565038, + "grad_norm": 1.924375057220459, + "learning_rate": 4.3311766332620394e-05, + "loss": 4.7681, + "step": 40081 + }, + { + "epoch": 0.2383790084689314, + "grad_norm": 1.7753795385360718, + "learning_rate": 4.331144832939156e-05, + "loss": 4.754, + "step": 40082 + }, + { + "epoch": 0.23838495575221239, + "grad_norm": 2.1708898544311523, + "learning_rate": 4.331113031977039e-05, + "loss": 3.8251, + "step": 40083 + }, + { + "epoch": 0.23839090303549337, + "grad_norm": 2.2168617248535156, + "learning_rate": 4.3310812303757e-05, + "loss": 3.7123, + "step": 40084 + }, + { + "epoch": 0.2383968503187744, + "grad_norm": 2.437830924987793, + "learning_rate": 4.3310494281351475e-05, + "loss": 3.7366, + "step": 40085 + }, + { + "epoch": 0.23840279760205538, + "grad_norm": 2.3509836196899414, + "learning_rate": 4.3310176252553964e-05, + "loss": 3.8105, + "step": 40086 + }, + { + "epoch": 0.23840874488533637, + "grad_norm": 2.418760061264038, + "learning_rate": 4.330985821736455e-05, + "loss": 3.8426, + "step": 40087 + }, + { + "epoch": 0.23841469216861738, + "grad_norm": 2.029452085494995, + "learning_rate": 4.330954017578336e-05, + "loss": 3.6936, + "step": 40088 + }, + { + "epoch": 0.23842063945189837, + "grad_norm": 2.286858558654785, + "learning_rate": 4.33092221278105e-05, + "loss": 3.7361, + "step": 40089 + }, + { + "epoch": 0.23842658673517936, + "grad_norm": 2.1391983032226562, + "learning_rate": 4.330890407344606e-05, + "loss": 3.8871, + "step": 40090 + }, + { + "epoch": 0.23843253401846037, + "grad_norm": 1.741774082183838, + "learning_rate": 4.3308586012690196e-05, + "loss": 4.8544, + "step": 40091 + }, + { + "epoch": 0.23843848130174136, + "grad_norm": 1.854085922241211, + "learning_rate": 4.330826794554298e-05, + "loss": 4.8906, + "step": 40092 + }, + { + "epoch": 0.23844442858502235, + "grad_norm": 1.841774582862854, + "learning_rate": 4.3307949872004536e-05, + "loss": 5.0489, + "step": 40093 + }, + { + "epoch": 0.23845037586830337, + "grad_norm": 1.5245493650436401, + "learning_rate": 4.3307631792074986e-05, + "loss": 4.842, + "step": 40094 + }, + { + "epoch": 0.23845632315158435, + "grad_norm": 1.8230993747711182, + "learning_rate": 4.3307313705754425e-05, + "loss": 4.0699, + "step": 40095 + }, + { + "epoch": 0.23846227043486534, + "grad_norm": 2.567065954208374, + "learning_rate": 4.3306995613042966e-05, + "loss": 3.8251, + "step": 40096 + }, + { + "epoch": 0.23846821771814636, + "grad_norm": 2.490562915802002, + "learning_rate": 4.330667751394073e-05, + "loss": 3.8139, + "step": 40097 + }, + { + "epoch": 0.23847416500142735, + "grad_norm": 2.2854549884796143, + "learning_rate": 4.330635940844782e-05, + "loss": 3.8938, + "step": 40098 + }, + { + "epoch": 0.23848011228470833, + "grad_norm": 2.383312463760376, + "learning_rate": 4.330604129656435e-05, + "loss": 3.7447, + "step": 40099 + }, + { + "epoch": 0.23848605956798935, + "grad_norm": 2.2034518718719482, + "learning_rate": 4.330572317829043e-05, + "loss": 3.7163, + "step": 40100 + }, + { + "epoch": 0.23849200685127034, + "grad_norm": 2.3206586837768555, + "learning_rate": 4.330540505362617e-05, + "loss": 3.7997, + "step": 40101 + }, + { + "epoch": 0.23849795413455133, + "grad_norm": 2.3994975090026855, + "learning_rate": 4.330508692257168e-05, + "loss": 3.5833, + "step": 40102 + }, + { + "epoch": 0.23850390141783234, + "grad_norm": 2.430852174758911, + "learning_rate": 4.330476878512708e-05, + "loss": 3.7725, + "step": 40103 + }, + { + "epoch": 0.23850984870111333, + "grad_norm": 2.181950092315674, + "learning_rate": 4.330445064129247e-05, + "loss": 3.7733, + "step": 40104 + }, + { + "epoch": 0.23851579598439432, + "grad_norm": 2.597951889038086, + "learning_rate": 4.330413249106796e-05, + "loss": 3.1029, + "step": 40105 + }, + { + "epoch": 0.23852174326767533, + "grad_norm": 2.3912487030029297, + "learning_rate": 4.330381433445367e-05, + "loss": 3.5213, + "step": 40106 + }, + { + "epoch": 0.23852769055095632, + "grad_norm": 1.650768518447876, + "learning_rate": 4.330349617144971e-05, + "loss": 4.745, + "step": 40107 + }, + { + "epoch": 0.2385336378342373, + "grad_norm": 1.4997506141662598, + "learning_rate": 4.330317800205619e-05, + "loss": 4.6794, + "step": 40108 + }, + { + "epoch": 0.23853958511751833, + "grad_norm": 1.3891698122024536, + "learning_rate": 4.330285982627322e-05, + "loss": 4.704, + "step": 40109 + }, + { + "epoch": 0.23854553240079931, + "grad_norm": 1.5003094673156738, + "learning_rate": 4.3302541644100915e-05, + "loss": 5.1165, + "step": 40110 + }, + { + "epoch": 0.2385514796840803, + "grad_norm": 1.5402982234954834, + "learning_rate": 4.330222345553937e-05, + "loss": 5.1335, + "step": 40111 + }, + { + "epoch": 0.23855742696736132, + "grad_norm": 2.2450990676879883, + "learning_rate": 4.330190526058872e-05, + "loss": 4.4541, + "step": 40112 + }, + { + "epoch": 0.2385633742506423, + "grad_norm": 2.956296920776367, + "learning_rate": 4.330158705924905e-05, + "loss": 3.8851, + "step": 40113 + }, + { + "epoch": 0.2385693215339233, + "grad_norm": 1.7868328094482422, + "learning_rate": 4.330126885152049e-05, + "loss": 4.6332, + "step": 40114 + }, + { + "epoch": 0.2385752688172043, + "grad_norm": 2.269697427749634, + "learning_rate": 4.330095063740315e-05, + "loss": 3.9059, + "step": 40115 + }, + { + "epoch": 0.2385812161004853, + "grad_norm": 1.6447689533233643, + "learning_rate": 4.330063241689713e-05, + "loss": 4.3145, + "step": 40116 + }, + { + "epoch": 0.2385871633837663, + "grad_norm": 2.1394059658050537, + "learning_rate": 4.330031419000256e-05, + "loss": 4.4816, + "step": 40117 + }, + { + "epoch": 0.2385931106670473, + "grad_norm": 1.8869203329086304, + "learning_rate": 4.329999595671953e-05, + "loss": 4.2623, + "step": 40118 + }, + { + "epoch": 0.2385990579503283, + "grad_norm": 2.2256832122802734, + "learning_rate": 4.3299677717048164e-05, + "loss": 3.5135, + "step": 40119 + }, + { + "epoch": 0.23860500523360928, + "grad_norm": 2.431140184402466, + "learning_rate": 4.329935947098857e-05, + "loss": 3.5405, + "step": 40120 + }, + { + "epoch": 0.2386109525168903, + "grad_norm": 1.703413724899292, + "learning_rate": 4.329904121854086e-05, + "loss": 4.0305, + "step": 40121 + }, + { + "epoch": 0.23861689980017128, + "grad_norm": 1.944323182106018, + "learning_rate": 4.329872295970514e-05, + "loss": 4.8757, + "step": 40122 + }, + { + "epoch": 0.23862284708345227, + "grad_norm": 2.000685453414917, + "learning_rate": 4.3298404694481524e-05, + "loss": 5.0184, + "step": 40123 + }, + { + "epoch": 0.2386287943667333, + "grad_norm": 2.024580955505371, + "learning_rate": 4.329808642287013e-05, + "loss": 5.024, + "step": 40124 + }, + { + "epoch": 0.23863474165001428, + "grad_norm": 2.07774019241333, + "learning_rate": 4.329776814487106e-05, + "loss": 4.9439, + "step": 40125 + }, + { + "epoch": 0.23864068893329526, + "grad_norm": 1.8376563787460327, + "learning_rate": 4.329744986048443e-05, + "loss": 4.8507, + "step": 40126 + }, + { + "epoch": 0.23864663621657628, + "grad_norm": 1.7948901653289795, + "learning_rate": 4.329713156971034e-05, + "loss": 5.2397, + "step": 40127 + }, + { + "epoch": 0.23865258349985727, + "grad_norm": 1.8104145526885986, + "learning_rate": 4.329681327254892e-05, + "loss": 5.1029, + "step": 40128 + }, + { + "epoch": 0.23865853078313826, + "grad_norm": 1.7246794700622559, + "learning_rate": 4.329649496900027e-05, + "loss": 4.9139, + "step": 40129 + }, + { + "epoch": 0.23866447806641927, + "grad_norm": 1.7831335067749023, + "learning_rate": 4.32961766590645e-05, + "loss": 5.3007, + "step": 40130 + }, + { + "epoch": 0.23867042534970026, + "grad_norm": 3.468231678009033, + "learning_rate": 4.329585834274172e-05, + "loss": 3.8966, + "step": 40131 + }, + { + "epoch": 0.23867637263298125, + "grad_norm": 3.5706474781036377, + "learning_rate": 4.329554002003205e-05, + "loss": 3.7617, + "step": 40132 + }, + { + "epoch": 0.23868231991626226, + "grad_norm": 2.587364673614502, + "learning_rate": 4.3295221690935606e-05, + "loss": 3.4487, + "step": 40133 + }, + { + "epoch": 0.23868826719954325, + "grad_norm": 2.6896677017211914, + "learning_rate": 4.3294903355452476e-05, + "loss": 3.5369, + "step": 40134 + }, + { + "epoch": 0.23869421448282424, + "grad_norm": 2.134666919708252, + "learning_rate": 4.3294585013582786e-05, + "loss": 3.2653, + "step": 40135 + }, + { + "epoch": 0.23870016176610526, + "grad_norm": 1.7431190013885498, + "learning_rate": 4.329426666532665e-05, + "loss": 4.6138, + "step": 40136 + }, + { + "epoch": 0.23870610904938624, + "grad_norm": 1.8424272537231445, + "learning_rate": 4.329394831068417e-05, + "loss": 5.1224, + "step": 40137 + }, + { + "epoch": 0.23871205633266723, + "grad_norm": 1.7081012725830078, + "learning_rate": 4.329362994965546e-05, + "loss": 5.0746, + "step": 40138 + }, + { + "epoch": 0.23871800361594822, + "grad_norm": 1.5560182332992554, + "learning_rate": 4.3293311582240634e-05, + "loss": 4.9513, + "step": 40139 + }, + { + "epoch": 0.23872395089922924, + "grad_norm": 1.5948264598846436, + "learning_rate": 4.3292993208439814e-05, + "loss": 4.9205, + "step": 40140 + }, + { + "epoch": 0.23872989818251023, + "grad_norm": 1.946006417274475, + "learning_rate": 4.329267482825309e-05, + "loss": 5.4552, + "step": 40141 + }, + { + "epoch": 0.2387358454657912, + "grad_norm": 1.7580515146255493, + "learning_rate": 4.329235644168058e-05, + "loss": 5.6614, + "step": 40142 + }, + { + "epoch": 0.23874179274907223, + "grad_norm": 1.667794942855835, + "learning_rate": 4.3292038048722394e-05, + "loss": 4.7174, + "step": 40143 + }, + { + "epoch": 0.23874774003235322, + "grad_norm": 1.6075485944747925, + "learning_rate": 4.3291719649378645e-05, + "loss": 5.0176, + "step": 40144 + }, + { + "epoch": 0.2387536873156342, + "grad_norm": 1.8009587526321411, + "learning_rate": 4.3291401243649454e-05, + "loss": 5.3401, + "step": 40145 + }, + { + "epoch": 0.23875963459891522, + "grad_norm": 1.9497926235198975, + "learning_rate": 4.329108283153492e-05, + "loss": 4.6112, + "step": 40146 + }, + { + "epoch": 0.2387655818821962, + "grad_norm": 2.408142566680908, + "learning_rate": 4.329076441303516e-05, + "loss": 4.2315, + "step": 40147 + }, + { + "epoch": 0.2387715291654772, + "grad_norm": 1.6906830072402954, + "learning_rate": 4.329044598815029e-05, + "loss": 4.9044, + "step": 40148 + }, + { + "epoch": 0.2387774764487582, + "grad_norm": 2.0913238525390625, + "learning_rate": 4.32901275568804e-05, + "loss": 4.3751, + "step": 40149 + }, + { + "epoch": 0.2387834237320392, + "grad_norm": 1.6698623895645142, + "learning_rate": 4.3289809119225625e-05, + "loss": 4.8674, + "step": 40150 + }, + { + "epoch": 0.2387893710153202, + "grad_norm": 1.8850531578063965, + "learning_rate": 4.328949067518606e-05, + "loss": 4.8449, + "step": 40151 + }, + { + "epoch": 0.2387953182986012, + "grad_norm": 1.7283515930175781, + "learning_rate": 4.3289172224761835e-05, + "loss": 5.0853, + "step": 40152 + }, + { + "epoch": 0.2388012655818822, + "grad_norm": 1.38634192943573, + "learning_rate": 4.328885376795303e-05, + "loss": 5.2427, + "step": 40153 + }, + { + "epoch": 0.23880721286516318, + "grad_norm": 1.601338267326355, + "learning_rate": 4.328853530475979e-05, + "loss": 5.5275, + "step": 40154 + }, + { + "epoch": 0.2388131601484442, + "grad_norm": 1.5904576778411865, + "learning_rate": 4.3288216835182205e-05, + "loss": 5.7852, + "step": 40155 + }, + { + "epoch": 0.23881910743172519, + "grad_norm": 1.5887131690979004, + "learning_rate": 4.328789835922039e-05, + "loss": 5.8099, + "step": 40156 + }, + { + "epoch": 0.23882505471500617, + "grad_norm": 1.6383719444274902, + "learning_rate": 4.328757987687447e-05, + "loss": 5.3932, + "step": 40157 + }, + { + "epoch": 0.2388310019982872, + "grad_norm": 1.5981476306915283, + "learning_rate": 4.328726138814454e-05, + "loss": 5.2322, + "step": 40158 + }, + { + "epoch": 0.23883694928156818, + "grad_norm": 1.7827104330062866, + "learning_rate": 4.3286942893030714e-05, + "loss": 5.0454, + "step": 40159 + }, + { + "epoch": 0.23884289656484917, + "grad_norm": 1.7469054460525513, + "learning_rate": 4.3286624391533105e-05, + "loss": 4.7948, + "step": 40160 + }, + { + "epoch": 0.23884884384813018, + "grad_norm": 1.6620041131973267, + "learning_rate": 4.328630588365182e-05, + "loss": 5.0628, + "step": 40161 + }, + { + "epoch": 0.23885479113141117, + "grad_norm": 1.6000419855117798, + "learning_rate": 4.3285987369386985e-05, + "loss": 4.5915, + "step": 40162 + }, + { + "epoch": 0.23886073841469216, + "grad_norm": 1.590676188468933, + "learning_rate": 4.328566884873869e-05, + "loss": 4.8309, + "step": 40163 + }, + { + "epoch": 0.23886668569797317, + "grad_norm": 1.6045578718185425, + "learning_rate": 4.3285350321707064e-05, + "loss": 4.9628, + "step": 40164 + }, + { + "epoch": 0.23887263298125416, + "grad_norm": 2.0469465255737305, + "learning_rate": 4.32850317882922e-05, + "loss": 4.6688, + "step": 40165 + }, + { + "epoch": 0.23887858026453515, + "grad_norm": 2.214313507080078, + "learning_rate": 4.328471324849424e-05, + "loss": 4.5173, + "step": 40166 + }, + { + "epoch": 0.23888452754781617, + "grad_norm": 2.107633352279663, + "learning_rate": 4.328439470231326e-05, + "loss": 5.0816, + "step": 40167 + }, + { + "epoch": 0.23889047483109715, + "grad_norm": 2.0030016899108887, + "learning_rate": 4.328407614974939e-05, + "loss": 4.8087, + "step": 40168 + }, + { + "epoch": 0.23889642211437814, + "grad_norm": 2.017014980316162, + "learning_rate": 4.328375759080274e-05, + "loss": 5.0027, + "step": 40169 + }, + { + "epoch": 0.23890236939765916, + "grad_norm": 1.8261082172393799, + "learning_rate": 4.328343902547342e-05, + "loss": 4.934, + "step": 40170 + }, + { + "epoch": 0.23890831668094015, + "grad_norm": 1.8473634719848633, + "learning_rate": 4.3283120453761534e-05, + "loss": 4.8915, + "step": 40171 + }, + { + "epoch": 0.23891426396422114, + "grad_norm": 1.5712432861328125, + "learning_rate": 4.32828018756672e-05, + "loss": 4.8155, + "step": 40172 + }, + { + "epoch": 0.23892021124750215, + "grad_norm": 1.6202071905136108, + "learning_rate": 4.328248329119053e-05, + "loss": 4.9866, + "step": 40173 + }, + { + "epoch": 0.23892615853078314, + "grad_norm": 1.7991951704025269, + "learning_rate": 4.328216470033164e-05, + "loss": 5.0245, + "step": 40174 + }, + { + "epoch": 0.23893210581406413, + "grad_norm": 1.5664708614349365, + "learning_rate": 4.3281846103090634e-05, + "loss": 4.9553, + "step": 40175 + }, + { + "epoch": 0.23893805309734514, + "grad_norm": 1.5590832233428955, + "learning_rate": 4.3281527499467624e-05, + "loss": 4.6687, + "step": 40176 + }, + { + "epoch": 0.23894400038062613, + "grad_norm": 1.696151614189148, + "learning_rate": 4.3281208889462715e-05, + "loss": 5.1766, + "step": 40177 + }, + { + "epoch": 0.23894994766390712, + "grad_norm": 1.5222536325454712, + "learning_rate": 4.3280890273076036e-05, + "loss": 5.344, + "step": 40178 + }, + { + "epoch": 0.23895589494718814, + "grad_norm": 3.0764362812042236, + "learning_rate": 4.3280571650307676e-05, + "loss": 4.0438, + "step": 40179 + }, + { + "epoch": 0.23896184223046912, + "grad_norm": 2.8695647716522217, + "learning_rate": 4.328025302115776e-05, + "loss": 4.135, + "step": 40180 + }, + { + "epoch": 0.2389677895137501, + "grad_norm": 1.716057538986206, + "learning_rate": 4.32799343856264e-05, + "loss": 5.1753, + "step": 40181 + }, + { + "epoch": 0.23897373679703113, + "grad_norm": 1.7035515308380127, + "learning_rate": 4.3279615743713704e-05, + "loss": 5.2037, + "step": 40182 + }, + { + "epoch": 0.23897968408031212, + "grad_norm": 1.6288505792617798, + "learning_rate": 4.327929709541978e-05, + "loss": 4.9865, + "step": 40183 + }, + { + "epoch": 0.2389856313635931, + "grad_norm": 1.8112092018127441, + "learning_rate": 4.327897844074474e-05, + "loss": 5.527, + "step": 40184 + }, + { + "epoch": 0.23899157864687412, + "grad_norm": 1.5201170444488525, + "learning_rate": 4.327865977968871e-05, + "loss": 5.2219, + "step": 40185 + }, + { + "epoch": 0.2389975259301551, + "grad_norm": 1.4934576749801636, + "learning_rate": 4.3278341112251775e-05, + "loss": 4.9885, + "step": 40186 + }, + { + "epoch": 0.2390034732134361, + "grad_norm": 1.6058255434036255, + "learning_rate": 4.327802243843406e-05, + "loss": 4.9511, + "step": 40187 + }, + { + "epoch": 0.2390094204967171, + "grad_norm": 1.599021553993225, + "learning_rate": 4.327770375823569e-05, + "loss": 5.0246, + "step": 40188 + }, + { + "epoch": 0.2390153677799981, + "grad_norm": 1.3563846349716187, + "learning_rate": 4.327738507165675e-05, + "loss": 5.0033, + "step": 40189 + }, + { + "epoch": 0.2390213150632791, + "grad_norm": 1.9109387397766113, + "learning_rate": 4.3277066378697365e-05, + "loss": 4.4247, + "step": 40190 + }, + { + "epoch": 0.2390272623465601, + "grad_norm": 1.3799777030944824, + "learning_rate": 4.3276747679357646e-05, + "loss": 4.9291, + "step": 40191 + }, + { + "epoch": 0.2390332096298411, + "grad_norm": 1.5303558111190796, + "learning_rate": 4.32764289736377e-05, + "loss": 4.7506, + "step": 40192 + }, + { + "epoch": 0.23903915691312208, + "grad_norm": 1.6117104291915894, + "learning_rate": 4.327611026153765e-05, + "loss": 4.8273, + "step": 40193 + }, + { + "epoch": 0.2390451041964031, + "grad_norm": 1.6165852546691895, + "learning_rate": 4.3275791543057595e-05, + "loss": 4.4733, + "step": 40194 + }, + { + "epoch": 0.23905105147968408, + "grad_norm": 2.1129150390625, + "learning_rate": 4.327547281819765e-05, + "loss": 4.0824, + "step": 40195 + }, + { + "epoch": 0.23905699876296507, + "grad_norm": 2.5206809043884277, + "learning_rate": 4.327515408695792e-05, + "loss": 4.2373, + "step": 40196 + }, + { + "epoch": 0.23906294604624606, + "grad_norm": 2.991682767868042, + "learning_rate": 4.3274835349338536e-05, + "loss": 4.3935, + "step": 40197 + }, + { + "epoch": 0.23906889332952708, + "grad_norm": 2.2124128341674805, + "learning_rate": 4.327451660533959e-05, + "loss": 3.8134, + "step": 40198 + }, + { + "epoch": 0.23907484061280806, + "grad_norm": 1.643195390701294, + "learning_rate": 4.3274197854961194e-05, + "loss": 4.2748, + "step": 40199 + }, + { + "epoch": 0.23908078789608905, + "grad_norm": 1.4482327699661255, + "learning_rate": 4.3273879098203463e-05, + "loss": 4.7637, + "step": 40200 + }, + { + "epoch": 0.23908673517937007, + "grad_norm": 1.4993923902511597, + "learning_rate": 4.327356033506652e-05, + "loss": 5.0844, + "step": 40201 + }, + { + "epoch": 0.23909268246265106, + "grad_norm": 1.472908616065979, + "learning_rate": 4.327324156555046e-05, + "loss": 5.5644, + "step": 40202 + }, + { + "epoch": 0.23909862974593205, + "grad_norm": 1.4157438278198242, + "learning_rate": 4.32729227896554e-05, + "loss": 5.4812, + "step": 40203 + }, + { + "epoch": 0.23910457702921306, + "grad_norm": 1.5970433950424194, + "learning_rate": 4.327260400738145e-05, + "loss": 5.5038, + "step": 40204 + }, + { + "epoch": 0.23911052431249405, + "grad_norm": 1.4509187936782837, + "learning_rate": 4.327228521872872e-05, + "loss": 5.4633, + "step": 40205 + }, + { + "epoch": 0.23911647159577504, + "grad_norm": 1.6185815334320068, + "learning_rate": 4.327196642369733e-05, + "loss": 4.5747, + "step": 40206 + }, + { + "epoch": 0.23912241887905605, + "grad_norm": 1.7226256132125854, + "learning_rate": 4.327164762228738e-05, + "loss": 4.5997, + "step": 40207 + }, + { + "epoch": 0.23912836616233704, + "grad_norm": 1.9961953163146973, + "learning_rate": 4.327132881449899e-05, + "loss": 4.2045, + "step": 40208 + }, + { + "epoch": 0.23913431344561803, + "grad_norm": 1.505850911140442, + "learning_rate": 4.3271010000332274e-05, + "loss": 5.1053, + "step": 40209 + }, + { + "epoch": 0.23914026072889905, + "grad_norm": 1.7519837617874146, + "learning_rate": 4.3270691179787325e-05, + "loss": 5.2226, + "step": 40210 + }, + { + "epoch": 0.23914620801218003, + "grad_norm": 1.6657217741012573, + "learning_rate": 4.327037235286427e-05, + "loss": 4.6223, + "step": 40211 + }, + { + "epoch": 0.23915215529546102, + "grad_norm": 1.4809834957122803, + "learning_rate": 4.327005351956322e-05, + "loss": 5.457, + "step": 40212 + }, + { + "epoch": 0.23915810257874204, + "grad_norm": 1.3957144021987915, + "learning_rate": 4.3269734679884287e-05, + "loss": 5.435, + "step": 40213 + }, + { + "epoch": 0.23916404986202303, + "grad_norm": 1.3809940814971924, + "learning_rate": 4.326941583382758e-05, + "loss": 5.3506, + "step": 40214 + }, + { + "epoch": 0.23916999714530401, + "grad_norm": 1.4588847160339355, + "learning_rate": 4.32690969813932e-05, + "loss": 5.3814, + "step": 40215 + }, + { + "epoch": 0.23917594442858503, + "grad_norm": 1.6800473928451538, + "learning_rate": 4.326877812258127e-05, + "loss": 4.1795, + "step": 40216 + }, + { + "epoch": 0.23918189171186602, + "grad_norm": 1.7868841886520386, + "learning_rate": 4.326845925739189e-05, + "loss": 4.1081, + "step": 40217 + }, + { + "epoch": 0.239187838995147, + "grad_norm": 1.7348510026931763, + "learning_rate": 4.32681403858252e-05, + "loss": 4.1702, + "step": 40218 + }, + { + "epoch": 0.23919378627842802, + "grad_norm": 1.6819672584533691, + "learning_rate": 4.326782150788127e-05, + "loss": 4.2444, + "step": 40219 + }, + { + "epoch": 0.239199733561709, + "grad_norm": 1.7941629886627197, + "learning_rate": 4.326750262356024e-05, + "loss": 4.4136, + "step": 40220 + }, + { + "epoch": 0.23920568084499, + "grad_norm": 1.7884337902069092, + "learning_rate": 4.3267183732862215e-05, + "loss": 4.852, + "step": 40221 + }, + { + "epoch": 0.23921162812827101, + "grad_norm": 1.6406954526901245, + "learning_rate": 4.32668648357873e-05, + "loss": 5.3795, + "step": 40222 + }, + { + "epoch": 0.239217575411552, + "grad_norm": 1.7505568265914917, + "learning_rate": 4.326654593233562e-05, + "loss": 5.2565, + "step": 40223 + }, + { + "epoch": 0.239223522694833, + "grad_norm": 1.4968135356903076, + "learning_rate": 4.3266227022507275e-05, + "loss": 5.0566, + "step": 40224 + }, + { + "epoch": 0.239229469978114, + "grad_norm": 1.7073537111282349, + "learning_rate": 4.3265908106302374e-05, + "loss": 4.866, + "step": 40225 + }, + { + "epoch": 0.239235417261395, + "grad_norm": 2.1224329471588135, + "learning_rate": 4.326558918372103e-05, + "loss": 4.2514, + "step": 40226 + }, + { + "epoch": 0.23924136454467598, + "grad_norm": 2.044198513031006, + "learning_rate": 4.3265270254763365e-05, + "loss": 4.2316, + "step": 40227 + }, + { + "epoch": 0.239247311827957, + "grad_norm": 1.8667954206466675, + "learning_rate": 4.326495131942948e-05, + "loss": 4.2186, + "step": 40228 + }, + { + "epoch": 0.239253259111238, + "grad_norm": 1.8041352033615112, + "learning_rate": 4.3264632377719496e-05, + "loss": 4.3103, + "step": 40229 + }, + { + "epoch": 0.23925920639451898, + "grad_norm": 1.6333084106445312, + "learning_rate": 4.326431342963351e-05, + "loss": 4.7349, + "step": 40230 + }, + { + "epoch": 0.2392651536778, + "grad_norm": 2.1168692111968994, + "learning_rate": 4.326399447517164e-05, + "loss": 4.7575, + "step": 40231 + }, + { + "epoch": 0.23927110096108098, + "grad_norm": 1.759518027305603, + "learning_rate": 4.3263675514334e-05, + "loss": 4.5206, + "step": 40232 + }, + { + "epoch": 0.23927704824436197, + "grad_norm": 1.8432530164718628, + "learning_rate": 4.32633565471207e-05, + "loss": 5.0177, + "step": 40233 + }, + { + "epoch": 0.23928299552764298, + "grad_norm": 1.865638256072998, + "learning_rate": 4.326303757353185e-05, + "loss": 5.0656, + "step": 40234 + }, + { + "epoch": 0.23928894281092397, + "grad_norm": 1.6412417888641357, + "learning_rate": 4.326271859356757e-05, + "loss": 5.0272, + "step": 40235 + }, + { + "epoch": 0.23929489009420496, + "grad_norm": 1.5613924264907837, + "learning_rate": 4.326239960722796e-05, + "loss": 4.0963, + "step": 40236 + }, + { + "epoch": 0.23930083737748598, + "grad_norm": 1.4786453247070312, + "learning_rate": 4.3262080614513126e-05, + "loss": 4.6872, + "step": 40237 + }, + { + "epoch": 0.23930678466076696, + "grad_norm": 1.5714530944824219, + "learning_rate": 4.32617616154232e-05, + "loss": 4.9956, + "step": 40238 + }, + { + "epoch": 0.23931273194404795, + "grad_norm": 1.9535592794418335, + "learning_rate": 4.326144260995827e-05, + "loss": 4.1646, + "step": 40239 + }, + { + "epoch": 0.23931867922732897, + "grad_norm": 3.2320947647094727, + "learning_rate": 4.3261123598118466e-05, + "loss": 3.4972, + "step": 40240 + }, + { + "epoch": 0.23932462651060996, + "grad_norm": 3.898958683013916, + "learning_rate": 4.326080457990389e-05, + "loss": 3.7067, + "step": 40241 + }, + { + "epoch": 0.23933057379389094, + "grad_norm": 3.3669137954711914, + "learning_rate": 4.326048555531466e-05, + "loss": 3.5045, + "step": 40242 + }, + { + "epoch": 0.23933652107717196, + "grad_norm": 2.947758436203003, + "learning_rate": 4.3260166524350884e-05, + "loss": 3.4174, + "step": 40243 + }, + { + "epoch": 0.23934246836045295, + "grad_norm": 2.9454586505889893, + "learning_rate": 4.325984748701267e-05, + "loss": 3.352, + "step": 40244 + }, + { + "epoch": 0.23934841564373394, + "grad_norm": 2.492859363555908, + "learning_rate": 4.325952844330013e-05, + "loss": 3.2367, + "step": 40245 + }, + { + "epoch": 0.23935436292701495, + "grad_norm": 1.7303574085235596, + "learning_rate": 4.325920939321338e-05, + "loss": 4.547, + "step": 40246 + }, + { + "epoch": 0.23936031021029594, + "grad_norm": 1.6265568733215332, + "learning_rate": 4.325889033675252e-05, + "loss": 4.9546, + "step": 40247 + }, + { + "epoch": 0.23936625749357693, + "grad_norm": 1.9568085670471191, + "learning_rate": 4.325857127391768e-05, + "loss": 4.9857, + "step": 40248 + }, + { + "epoch": 0.23937220477685794, + "grad_norm": 1.6795035600662231, + "learning_rate": 4.325825220470896e-05, + "loss": 4.8708, + "step": 40249 + }, + { + "epoch": 0.23937815206013893, + "grad_norm": 1.5937986373901367, + "learning_rate": 4.325793312912647e-05, + "loss": 4.9685, + "step": 40250 + }, + { + "epoch": 0.23938409934341992, + "grad_norm": 1.582703948020935, + "learning_rate": 4.3257614047170325e-05, + "loss": 4.9875, + "step": 40251 + }, + { + "epoch": 0.23939004662670094, + "grad_norm": 1.5072987079620361, + "learning_rate": 4.325729495884063e-05, + "loss": 4.6901, + "step": 40252 + }, + { + "epoch": 0.23939599390998192, + "grad_norm": 1.802185297012329, + "learning_rate": 4.325697586413752e-05, + "loss": 4.3054, + "step": 40253 + }, + { + "epoch": 0.2394019411932629, + "grad_norm": 2.0097274780273438, + "learning_rate": 4.325665676306107e-05, + "loss": 4.9268, + "step": 40254 + }, + { + "epoch": 0.23940788847654393, + "grad_norm": 1.5677427053451538, + "learning_rate": 4.325633765561142e-05, + "loss": 4.7458, + "step": 40255 + }, + { + "epoch": 0.23941383575982492, + "grad_norm": 1.8183891773223877, + "learning_rate": 4.325601854178867e-05, + "loss": 4.8506, + "step": 40256 + }, + { + "epoch": 0.2394197830431059, + "grad_norm": 1.5347108840942383, + "learning_rate": 4.325569942159292e-05, + "loss": 4.944, + "step": 40257 + }, + { + "epoch": 0.2394257303263869, + "grad_norm": 1.5523120164871216, + "learning_rate": 4.3255380295024304e-05, + "loss": 5.4315, + "step": 40258 + }, + { + "epoch": 0.2394316776096679, + "grad_norm": 1.6976925134658813, + "learning_rate": 4.3255061162082924e-05, + "loss": 5.1673, + "step": 40259 + }, + { + "epoch": 0.2394376248929489, + "grad_norm": 1.5445364713668823, + "learning_rate": 4.325474202276889e-05, + "loss": 5.1083, + "step": 40260 + }, + { + "epoch": 0.23944357217622989, + "grad_norm": 2.715904474258423, + "learning_rate": 4.325442287708231e-05, + "loss": 3.218, + "step": 40261 + }, + { + "epoch": 0.2394495194595109, + "grad_norm": 2.14359450340271, + "learning_rate": 4.325410372502331e-05, + "loss": 3.8361, + "step": 40262 + }, + { + "epoch": 0.2394554667427919, + "grad_norm": 1.5560270547866821, + "learning_rate": 4.325378456659198e-05, + "loss": 5.0651, + "step": 40263 + }, + { + "epoch": 0.23946141402607288, + "grad_norm": 1.5449919700622559, + "learning_rate": 4.325346540178845e-05, + "loss": 5.0358, + "step": 40264 + }, + { + "epoch": 0.2394673613093539, + "grad_norm": 1.4158093929290771, + "learning_rate": 4.3253146230612824e-05, + "loss": 5.0739, + "step": 40265 + }, + { + "epoch": 0.23947330859263488, + "grad_norm": 1.3598387241363525, + "learning_rate": 4.3252827053065206e-05, + "loss": 4.9613, + "step": 40266 + }, + { + "epoch": 0.23947925587591587, + "grad_norm": 1.7238245010375977, + "learning_rate": 4.325250786914572e-05, + "loss": 4.3289, + "step": 40267 + }, + { + "epoch": 0.23948520315919689, + "grad_norm": 1.9285497665405273, + "learning_rate": 4.325218867885447e-05, + "loss": 4.4479, + "step": 40268 + }, + { + "epoch": 0.23949115044247787, + "grad_norm": 1.9772894382476807, + "learning_rate": 4.325186948219157e-05, + "loss": 4.9901, + "step": 40269 + }, + { + "epoch": 0.23949709772575886, + "grad_norm": 1.8530864715576172, + "learning_rate": 4.325155027915713e-05, + "loss": 5.0925, + "step": 40270 + }, + { + "epoch": 0.23950304500903988, + "grad_norm": 1.7781996726989746, + "learning_rate": 4.325123106975126e-05, + "loss": 4.9495, + "step": 40271 + }, + { + "epoch": 0.23950899229232087, + "grad_norm": 1.7261242866516113, + "learning_rate": 4.3250911853974084e-05, + "loss": 4.7727, + "step": 40272 + }, + { + "epoch": 0.23951493957560185, + "grad_norm": 1.7483747005462646, + "learning_rate": 4.3250592631825694e-05, + "loss": 5.1011, + "step": 40273 + }, + { + "epoch": 0.23952088685888287, + "grad_norm": 1.506209373474121, + "learning_rate": 4.3250273403306214e-05, + "loss": 4.9651, + "step": 40274 + }, + { + "epoch": 0.23952683414216386, + "grad_norm": 1.691672444343567, + "learning_rate": 4.324995416841575e-05, + "loss": 4.9805, + "step": 40275 + }, + { + "epoch": 0.23953278142544485, + "grad_norm": 1.9487658739089966, + "learning_rate": 4.324963492715442e-05, + "loss": 4.9689, + "step": 40276 + }, + { + "epoch": 0.23953872870872586, + "grad_norm": 1.5965826511383057, + "learning_rate": 4.324931567952233e-05, + "loss": 5.0126, + "step": 40277 + }, + { + "epoch": 0.23954467599200685, + "grad_norm": 1.42833411693573, + "learning_rate": 4.3248996425519586e-05, + "loss": 5.1352, + "step": 40278 + }, + { + "epoch": 0.23955062327528784, + "grad_norm": 1.537211298942566, + "learning_rate": 4.3248677165146314e-05, + "loss": 4.9506, + "step": 40279 + }, + { + "epoch": 0.23955657055856885, + "grad_norm": 1.4943971633911133, + "learning_rate": 4.324835789840261e-05, + "loss": 5.0453, + "step": 40280 + }, + { + "epoch": 0.23956251784184984, + "grad_norm": 1.9677119255065918, + "learning_rate": 4.324803862528859e-05, + "loss": 5.3472, + "step": 40281 + }, + { + "epoch": 0.23956846512513083, + "grad_norm": 1.81831955909729, + "learning_rate": 4.324771934580438e-05, + "loss": 5.2047, + "step": 40282 + }, + { + "epoch": 0.23957441240841185, + "grad_norm": 1.6789662837982178, + "learning_rate": 4.3247400059950073e-05, + "loss": 5.0798, + "step": 40283 + }, + { + "epoch": 0.23958035969169283, + "grad_norm": 1.661208987236023, + "learning_rate": 4.324708076772579e-05, + "loss": 5.118, + "step": 40284 + }, + { + "epoch": 0.23958630697497382, + "grad_norm": 1.294905662536621, + "learning_rate": 4.324676146913164e-05, + "loss": 5.1212, + "step": 40285 + }, + { + "epoch": 0.23959225425825484, + "grad_norm": 1.696243405342102, + "learning_rate": 4.3246442164167726e-05, + "loss": 5.1118, + "step": 40286 + }, + { + "epoch": 0.23959820154153583, + "grad_norm": 1.5048704147338867, + "learning_rate": 4.324612285283417e-05, + "loss": 5.1757, + "step": 40287 + }, + { + "epoch": 0.23960414882481681, + "grad_norm": 1.519316554069519, + "learning_rate": 4.324580353513109e-05, + "loss": 5.2542, + "step": 40288 + }, + { + "epoch": 0.23961009610809783, + "grad_norm": 1.6322966814041138, + "learning_rate": 4.324548421105858e-05, + "loss": 5.2125, + "step": 40289 + }, + { + "epoch": 0.23961604339137882, + "grad_norm": 1.4820412397384644, + "learning_rate": 4.324516488061676e-05, + "loss": 5.3719, + "step": 40290 + }, + { + "epoch": 0.2396219906746598, + "grad_norm": 1.5021928548812866, + "learning_rate": 4.324484554380575e-05, + "loss": 5.528, + "step": 40291 + }, + { + "epoch": 0.23962793795794082, + "grad_norm": 1.6218903064727783, + "learning_rate": 4.324452620062565e-05, + "loss": 5.3625, + "step": 40292 + }, + { + "epoch": 0.2396338852412218, + "grad_norm": 1.7261576652526855, + "learning_rate": 4.3244206851076565e-05, + "loss": 4.9929, + "step": 40293 + }, + { + "epoch": 0.2396398325245028, + "grad_norm": 1.6539899110794067, + "learning_rate": 4.324388749515863e-05, + "loss": 5.1397, + "step": 40294 + }, + { + "epoch": 0.23964577980778382, + "grad_norm": 1.4695965051651, + "learning_rate": 4.324356813287193e-05, + "loss": 5.175, + "step": 40295 + }, + { + "epoch": 0.2396517270910648, + "grad_norm": 1.7163585424423218, + "learning_rate": 4.3243248764216596e-05, + "loss": 5.0859, + "step": 40296 + }, + { + "epoch": 0.2396576743743458, + "grad_norm": 1.6545008420944214, + "learning_rate": 4.324292938919273e-05, + "loss": 4.7158, + "step": 40297 + }, + { + "epoch": 0.2396636216576268, + "grad_norm": 1.4070156812667847, + "learning_rate": 4.324261000780044e-05, + "loss": 5.2543, + "step": 40298 + }, + { + "epoch": 0.2396695689409078, + "grad_norm": 1.4725805521011353, + "learning_rate": 4.324229062003985e-05, + "loss": 5.1846, + "step": 40299 + }, + { + "epoch": 0.23967551622418878, + "grad_norm": 1.7243998050689697, + "learning_rate": 4.324197122591107e-05, + "loss": 4.4778, + "step": 40300 + }, + { + "epoch": 0.2396814635074698, + "grad_norm": 1.4315868616104126, + "learning_rate": 4.3241651825414195e-05, + "loss": 4.7094, + "step": 40301 + }, + { + "epoch": 0.2396874107907508, + "grad_norm": 2.0608582496643066, + "learning_rate": 4.324133241854936e-05, + "loss": 3.8051, + "step": 40302 + }, + { + "epoch": 0.23969335807403178, + "grad_norm": 2.12819766998291, + "learning_rate": 4.3241013005316653e-05, + "loss": 3.7401, + "step": 40303 + }, + { + "epoch": 0.2396993053573128, + "grad_norm": 2.0762104988098145, + "learning_rate": 4.3240693585716204e-05, + "loss": 3.9509, + "step": 40304 + }, + { + "epoch": 0.23970525264059378, + "grad_norm": 2.0686657428741455, + "learning_rate": 4.324037415974811e-05, + "loss": 3.9504, + "step": 40305 + }, + { + "epoch": 0.23971119992387477, + "grad_norm": 2.2275407314300537, + "learning_rate": 4.3240054727412495e-05, + "loss": 3.9295, + "step": 40306 + }, + { + "epoch": 0.23971714720715578, + "grad_norm": 1.760735034942627, + "learning_rate": 4.3239735288709474e-05, + "loss": 4.9376, + "step": 40307 + }, + { + "epoch": 0.23972309449043677, + "grad_norm": 1.6495243310928345, + "learning_rate": 4.323941584363913e-05, + "loss": 5.2341, + "step": 40308 + }, + { + "epoch": 0.23972904177371776, + "grad_norm": 1.5035258531570435, + "learning_rate": 4.3239096392201604e-05, + "loss": 5.1652, + "step": 40309 + }, + { + "epoch": 0.23973498905699878, + "grad_norm": 1.5091159343719482, + "learning_rate": 4.323877693439701e-05, + "loss": 4.848, + "step": 40310 + }, + { + "epoch": 0.23974093634027976, + "grad_norm": 1.5886348485946655, + "learning_rate": 4.3238457470225426e-05, + "loss": 4.511, + "step": 40311 + }, + { + "epoch": 0.23974688362356075, + "grad_norm": 1.7226413488388062, + "learning_rate": 4.3238137999687004e-05, + "loss": 5.3836, + "step": 40312 + }, + { + "epoch": 0.23975283090684177, + "grad_norm": 1.6806071996688843, + "learning_rate": 4.323781852278183e-05, + "loss": 5.1204, + "step": 40313 + }, + { + "epoch": 0.23975877819012276, + "grad_norm": 1.68580961227417, + "learning_rate": 4.323749903951001e-05, + "loss": 5.0952, + "step": 40314 + }, + { + "epoch": 0.23976472547340374, + "grad_norm": 1.492282509803772, + "learning_rate": 4.3237179549871674e-05, + "loss": 5.1607, + "step": 40315 + }, + { + "epoch": 0.23977067275668473, + "grad_norm": 1.5844674110412598, + "learning_rate": 4.3236860053866935e-05, + "loss": 5.3982, + "step": 40316 + }, + { + "epoch": 0.23977662003996575, + "grad_norm": 1.5444083213806152, + "learning_rate": 4.3236540551495896e-05, + "loss": 4.9807, + "step": 40317 + }, + { + "epoch": 0.23978256732324674, + "grad_norm": 1.6543152332305908, + "learning_rate": 4.3236221042758666e-05, + "loss": 4.9649, + "step": 40318 + }, + { + "epoch": 0.23978851460652773, + "grad_norm": 1.5323008298873901, + "learning_rate": 4.323590152765535e-05, + "loss": 4.9223, + "step": 40319 + }, + { + "epoch": 0.23979446188980874, + "grad_norm": 1.7489529848098755, + "learning_rate": 4.323558200618608e-05, + "loss": 5.016, + "step": 40320 + }, + { + "epoch": 0.23980040917308973, + "grad_norm": 1.7873151302337646, + "learning_rate": 4.323526247835096e-05, + "loss": 4.8219, + "step": 40321 + }, + { + "epoch": 0.23980635645637072, + "grad_norm": 1.684981107711792, + "learning_rate": 4.323494294415009e-05, + "loss": 5.4266, + "step": 40322 + }, + { + "epoch": 0.23981230373965173, + "grad_norm": 1.5989137887954712, + "learning_rate": 4.32346234035836e-05, + "loss": 5.5433, + "step": 40323 + }, + { + "epoch": 0.23981825102293272, + "grad_norm": 1.4995635747909546, + "learning_rate": 4.323430385665158e-05, + "loss": 5.3998, + "step": 40324 + }, + { + "epoch": 0.2398241983062137, + "grad_norm": 1.6312997341156006, + "learning_rate": 4.323398430335415e-05, + "loss": 5.2043, + "step": 40325 + }, + { + "epoch": 0.23983014558949473, + "grad_norm": 1.4109045267105103, + "learning_rate": 4.323366474369144e-05, + "loss": 4.4155, + "step": 40326 + }, + { + "epoch": 0.2398360928727757, + "grad_norm": 1.6372476816177368, + "learning_rate": 4.323334517766353e-05, + "loss": 4.6252, + "step": 40327 + }, + { + "epoch": 0.2398420401560567, + "grad_norm": 1.748339295387268, + "learning_rate": 4.323302560527055e-05, + "loss": 4.6485, + "step": 40328 + }, + { + "epoch": 0.23984798743933772, + "grad_norm": 1.6974200010299683, + "learning_rate": 4.323270602651262e-05, + "loss": 4.2769, + "step": 40329 + }, + { + "epoch": 0.2398539347226187, + "grad_norm": 1.8469691276550293, + "learning_rate": 4.3232386441389836e-05, + "loss": 4.1596, + "step": 40330 + }, + { + "epoch": 0.2398598820058997, + "grad_norm": 1.839586853981018, + "learning_rate": 4.323206684990231e-05, + "loss": 4.458, + "step": 40331 + }, + { + "epoch": 0.2398658292891807, + "grad_norm": 1.697096347808838, + "learning_rate": 4.323174725205016e-05, + "loss": 5.0793, + "step": 40332 + }, + { + "epoch": 0.2398717765724617, + "grad_norm": 1.5891002416610718, + "learning_rate": 4.32314276478335e-05, + "loss": 5.2175, + "step": 40333 + }, + { + "epoch": 0.23987772385574269, + "grad_norm": 1.5909650325775146, + "learning_rate": 4.323110803725243e-05, + "loss": 4.9373, + "step": 40334 + }, + { + "epoch": 0.2398836711390237, + "grad_norm": 1.6762615442276, + "learning_rate": 4.323078842030707e-05, + "loss": 4.7523, + "step": 40335 + }, + { + "epoch": 0.2398896184223047, + "grad_norm": 1.9155479669570923, + "learning_rate": 4.323046879699754e-05, + "loss": 4.5514, + "step": 40336 + }, + { + "epoch": 0.23989556570558568, + "grad_norm": 1.4528714418411255, + "learning_rate": 4.323014916732393e-05, + "loss": 4.4861, + "step": 40337 + }, + { + "epoch": 0.2399015129888667, + "grad_norm": 1.7272597551345825, + "learning_rate": 4.322982953128636e-05, + "loss": 4.3826, + "step": 40338 + }, + { + "epoch": 0.23990746027214768, + "grad_norm": 1.4838794469833374, + "learning_rate": 4.3229509888884955e-05, + "loss": 4.1539, + "step": 40339 + }, + { + "epoch": 0.23991340755542867, + "grad_norm": 2.291356325149536, + "learning_rate": 4.322919024011981e-05, + "loss": 3.5434, + "step": 40340 + }, + { + "epoch": 0.2399193548387097, + "grad_norm": 1.5825977325439453, + "learning_rate": 4.322887058499105e-05, + "loss": 3.8833, + "step": 40341 + }, + { + "epoch": 0.23992530212199067, + "grad_norm": 1.6502342224121094, + "learning_rate": 4.322855092349878e-05, + "loss": 4.4821, + "step": 40342 + }, + { + "epoch": 0.23993124940527166, + "grad_norm": 1.6558442115783691, + "learning_rate": 4.32282312556431e-05, + "loss": 4.6801, + "step": 40343 + }, + { + "epoch": 0.23993719668855268, + "grad_norm": 1.7078803777694702, + "learning_rate": 4.322791158142414e-05, + "loss": 4.7247, + "step": 40344 + }, + { + "epoch": 0.23994314397183367, + "grad_norm": 1.3958712816238403, + "learning_rate": 4.3227591900842004e-05, + "loss": 4.6096, + "step": 40345 + }, + { + "epoch": 0.23994909125511465, + "grad_norm": 1.7768142223358154, + "learning_rate": 4.322727221389681e-05, + "loss": 4.6575, + "step": 40346 + }, + { + "epoch": 0.23995503853839567, + "grad_norm": 2.5947060585021973, + "learning_rate": 4.322695252058865e-05, + "loss": 3.6103, + "step": 40347 + }, + { + "epoch": 0.23996098582167666, + "grad_norm": 1.591221809387207, + "learning_rate": 4.322663282091766e-05, + "loss": 5.065, + "step": 40348 + }, + { + "epoch": 0.23996693310495765, + "grad_norm": 1.579350233078003, + "learning_rate": 4.322631311488394e-05, + "loss": 4.6013, + "step": 40349 + }, + { + "epoch": 0.23997288038823866, + "grad_norm": 1.2807130813598633, + "learning_rate": 4.32259934024876e-05, + "loss": 4.4429, + "step": 40350 + }, + { + "epoch": 0.23997882767151965, + "grad_norm": 1.6630812883377075, + "learning_rate": 4.322567368372875e-05, + "loss": 3.6766, + "step": 40351 + }, + { + "epoch": 0.23998477495480064, + "grad_norm": 1.6853035688400269, + "learning_rate": 4.3225353958607514e-05, + "loss": 3.8294, + "step": 40352 + }, + { + "epoch": 0.23999072223808166, + "grad_norm": 1.7000439167022705, + "learning_rate": 4.3225034227123984e-05, + "loss": 3.5902, + "step": 40353 + }, + { + "epoch": 0.23999666952136264, + "grad_norm": 1.6415079832077026, + "learning_rate": 4.3224714489278294e-05, + "loss": 4.0772, + "step": 40354 + }, + { + "epoch": 0.24000261680464363, + "grad_norm": 1.5755211114883423, + "learning_rate": 4.322439474507054e-05, + "loss": 4.3386, + "step": 40355 + }, + { + "epoch": 0.24000856408792465, + "grad_norm": 1.5829955339431763, + "learning_rate": 4.322407499450084e-05, + "loss": 4.5635, + "step": 40356 + }, + { + "epoch": 0.24001451137120564, + "grad_norm": 1.6148037910461426, + "learning_rate": 4.32237552375693e-05, + "loss": 4.6671, + "step": 40357 + }, + { + "epoch": 0.24002045865448662, + "grad_norm": 1.870450496673584, + "learning_rate": 4.3223435474276034e-05, + "loss": 4.6389, + "step": 40358 + }, + { + "epoch": 0.24002640593776764, + "grad_norm": 1.4417701959609985, + "learning_rate": 4.3223115704621166e-05, + "loss": 4.64, + "step": 40359 + }, + { + "epoch": 0.24003235322104863, + "grad_norm": 1.608487844467163, + "learning_rate": 4.3222795928604783e-05, + "loss": 4.525, + "step": 40360 + }, + { + "epoch": 0.24003830050432962, + "grad_norm": 1.6008473634719849, + "learning_rate": 4.3222476146227014e-05, + "loss": 4.593, + "step": 40361 + }, + { + "epoch": 0.24004424778761063, + "grad_norm": 1.8708992004394531, + "learning_rate": 4.322215635748797e-05, + "loss": 4.4889, + "step": 40362 + }, + { + "epoch": 0.24005019507089162, + "grad_norm": 1.626694917678833, + "learning_rate": 4.3221836562387754e-05, + "loss": 4.5172, + "step": 40363 + }, + { + "epoch": 0.2400561423541726, + "grad_norm": 1.6392260789871216, + "learning_rate": 4.3221516760926495e-05, + "loss": 4.6536, + "step": 40364 + }, + { + "epoch": 0.24006208963745362, + "grad_norm": 1.6445531845092773, + "learning_rate": 4.322119695310428e-05, + "loss": 4.3441, + "step": 40365 + }, + { + "epoch": 0.2400680369207346, + "grad_norm": 1.6682546138763428, + "learning_rate": 4.322087713892124e-05, + "loss": 4.1559, + "step": 40366 + }, + { + "epoch": 0.2400739842040156, + "grad_norm": 1.798362374305725, + "learning_rate": 4.322055731837747e-05, + "loss": 3.9415, + "step": 40367 + }, + { + "epoch": 0.24007993148729662, + "grad_norm": 1.76096773147583, + "learning_rate": 4.322023749147311e-05, + "loss": 3.9653, + "step": 40368 + }, + { + "epoch": 0.2400858787705776, + "grad_norm": 1.718976616859436, + "learning_rate": 4.3219917658208245e-05, + "loss": 3.9423, + "step": 40369 + }, + { + "epoch": 0.2400918260538586, + "grad_norm": 1.6822394132614136, + "learning_rate": 4.321959781858299e-05, + "loss": 3.908, + "step": 40370 + }, + { + "epoch": 0.2400977733371396, + "grad_norm": 1.7151468992233276, + "learning_rate": 4.321927797259747e-05, + "loss": 3.724, + "step": 40371 + }, + { + "epoch": 0.2401037206204206, + "grad_norm": 1.6571736335754395, + "learning_rate": 4.3218958120251776e-05, + "loss": 3.823, + "step": 40372 + }, + { + "epoch": 0.24010966790370158, + "grad_norm": 1.6070542335510254, + "learning_rate": 4.321863826154604e-05, + "loss": 4.2388, + "step": 40373 + }, + { + "epoch": 0.24011561518698257, + "grad_norm": 1.48780357837677, + "learning_rate": 4.321831839648037e-05, + "loss": 5.0254, + "step": 40374 + }, + { + "epoch": 0.2401215624702636, + "grad_norm": 1.5892256498336792, + "learning_rate": 4.321799852505487e-05, + "loss": 4.5356, + "step": 40375 + }, + { + "epoch": 0.24012750975354458, + "grad_norm": 1.5002117156982422, + "learning_rate": 4.321767864726965e-05, + "loss": 4.4253, + "step": 40376 + }, + { + "epoch": 0.24013345703682556, + "grad_norm": 1.4205338954925537, + "learning_rate": 4.321735876312483e-05, + "loss": 4.4213, + "step": 40377 + }, + { + "epoch": 0.24013940432010658, + "grad_norm": 1.438152551651001, + "learning_rate": 4.321703887262052e-05, + "loss": 4.5803, + "step": 40378 + }, + { + "epoch": 0.24014535160338757, + "grad_norm": 1.6766862869262695, + "learning_rate": 4.321671897575682e-05, + "loss": 4.7262, + "step": 40379 + }, + { + "epoch": 0.24015129888666856, + "grad_norm": 2.6488420963287354, + "learning_rate": 4.321639907253386e-05, + "loss": 4.0233, + "step": 40380 + }, + { + "epoch": 0.24015724616994957, + "grad_norm": 2.596217393875122, + "learning_rate": 4.321607916295175e-05, + "loss": 4.4409, + "step": 40381 + }, + { + "epoch": 0.24016319345323056, + "grad_norm": 1.4639328718185425, + "learning_rate": 4.3215759247010584e-05, + "loss": 4.9335, + "step": 40382 + }, + { + "epoch": 0.24016914073651155, + "grad_norm": 1.5613906383514404, + "learning_rate": 4.321543932471049e-05, + "loss": 4.7059, + "step": 40383 + }, + { + "epoch": 0.24017508801979257, + "grad_norm": 1.8699570894241333, + "learning_rate": 4.321511939605157e-05, + "loss": 4.4891, + "step": 40384 + }, + { + "epoch": 0.24018103530307355, + "grad_norm": 1.6762683391571045, + "learning_rate": 4.3214799461033944e-05, + "loss": 4.8633, + "step": 40385 + }, + { + "epoch": 0.24018698258635454, + "grad_norm": 1.8963688611984253, + "learning_rate": 4.321447951965772e-05, + "loss": 4.1533, + "step": 40386 + }, + { + "epoch": 0.24019292986963556, + "grad_norm": 1.736709713935852, + "learning_rate": 4.321415957192301e-05, + "loss": 4.2898, + "step": 40387 + }, + { + "epoch": 0.24019887715291655, + "grad_norm": 1.591309666633606, + "learning_rate": 4.321383961782992e-05, + "loss": 4.6393, + "step": 40388 + }, + { + "epoch": 0.24020482443619753, + "grad_norm": 1.7925829887390137, + "learning_rate": 4.321351965737857e-05, + "loss": 4.3578, + "step": 40389 + }, + { + "epoch": 0.24021077171947855, + "grad_norm": 1.8293836116790771, + "learning_rate": 4.321319969056907e-05, + "loss": 4.7523, + "step": 40390 + }, + { + "epoch": 0.24021671900275954, + "grad_norm": 2.7002294063568115, + "learning_rate": 4.3212879717401524e-05, + "loss": 4.2177, + "step": 40391 + }, + { + "epoch": 0.24022266628604053, + "grad_norm": 1.939072847366333, + "learning_rate": 4.321255973787606e-05, + "loss": 4.1021, + "step": 40392 + }, + { + "epoch": 0.24022861356932154, + "grad_norm": 2.0426032543182373, + "learning_rate": 4.321223975199277e-05, + "loss": 4.4963, + "step": 40393 + }, + { + "epoch": 0.24023456085260253, + "grad_norm": 2.3409979343414307, + "learning_rate": 4.3211919759751776e-05, + "loss": 4.2259, + "step": 40394 + }, + { + "epoch": 0.24024050813588352, + "grad_norm": 1.4650465250015259, + "learning_rate": 4.3211599761153194e-05, + "loss": 4.7195, + "step": 40395 + }, + { + "epoch": 0.24024645541916453, + "grad_norm": 1.739654779434204, + "learning_rate": 4.3211279756197134e-05, + "loss": 4.1926, + "step": 40396 + }, + { + "epoch": 0.24025240270244552, + "grad_norm": 1.7344506978988647, + "learning_rate": 4.321095974488369e-05, + "loss": 4.6745, + "step": 40397 + }, + { + "epoch": 0.2402583499857265, + "grad_norm": 1.7249184846878052, + "learning_rate": 4.3210639727213e-05, + "loss": 4.9107, + "step": 40398 + }, + { + "epoch": 0.24026429726900753, + "grad_norm": 1.5357941389083862, + "learning_rate": 4.3210319703185164e-05, + "loss": 4.4739, + "step": 40399 + }, + { + "epoch": 0.24027024455228851, + "grad_norm": 1.5572863817214966, + "learning_rate": 4.320999967280029e-05, + "loss": 5.054, + "step": 40400 + }, + { + "epoch": 0.2402761918355695, + "grad_norm": 1.5883063077926636, + "learning_rate": 4.320967963605849e-05, + "loss": 4.6538, + "step": 40401 + }, + { + "epoch": 0.24028213911885052, + "grad_norm": 1.9498145580291748, + "learning_rate": 4.3209359592959887e-05, + "loss": 4.3147, + "step": 40402 + }, + { + "epoch": 0.2402880864021315, + "grad_norm": 1.701850175857544, + "learning_rate": 4.320903954350458e-05, + "loss": 4.415, + "step": 40403 + }, + { + "epoch": 0.2402940336854125, + "grad_norm": 1.5937882661819458, + "learning_rate": 4.320871948769268e-05, + "loss": 4.9554, + "step": 40404 + }, + { + "epoch": 0.2402999809686935, + "grad_norm": 1.6359901428222656, + "learning_rate": 4.3208399425524316e-05, + "loss": 4.8231, + "step": 40405 + }, + { + "epoch": 0.2403059282519745, + "grad_norm": 1.5297181606292725, + "learning_rate": 4.320807935699957e-05, + "loss": 4.9592, + "step": 40406 + }, + { + "epoch": 0.2403118755352555, + "grad_norm": 1.4518460035324097, + "learning_rate": 4.320775928211859e-05, + "loss": 5.1742, + "step": 40407 + }, + { + "epoch": 0.2403178228185365, + "grad_norm": 2.228965997695923, + "learning_rate": 4.320743920088147e-05, + "loss": 3.7957, + "step": 40408 + }, + { + "epoch": 0.2403237701018175, + "grad_norm": 1.480629563331604, + "learning_rate": 4.3207119113288306e-05, + "loss": 4.8929, + "step": 40409 + }, + { + "epoch": 0.24032971738509848, + "grad_norm": 1.5657299757003784, + "learning_rate": 4.320679901933924e-05, + "loss": 4.9747, + "step": 40410 + }, + { + "epoch": 0.2403356646683795, + "grad_norm": 1.7055550813674927, + "learning_rate": 4.320647891903436e-05, + "loss": 4.8292, + "step": 40411 + }, + { + "epoch": 0.24034161195166048, + "grad_norm": 1.3745795488357544, + "learning_rate": 4.320615881237379e-05, + "loss": 4.7452, + "step": 40412 + }, + { + "epoch": 0.24034755923494147, + "grad_norm": 1.5124225616455078, + "learning_rate": 4.3205838699357635e-05, + "loss": 4.0596, + "step": 40413 + }, + { + "epoch": 0.2403535065182225, + "grad_norm": 1.5361684560775757, + "learning_rate": 4.3205518579986004e-05, + "loss": 4.6563, + "step": 40414 + }, + { + "epoch": 0.24035945380150348, + "grad_norm": 1.850304365158081, + "learning_rate": 4.320519845425902e-05, + "loss": 4.2293, + "step": 40415 + }, + { + "epoch": 0.24036540108478446, + "grad_norm": 2.629342794418335, + "learning_rate": 4.32048783221768e-05, + "loss": 3.0815, + "step": 40416 + }, + { + "epoch": 0.24037134836806548, + "grad_norm": 2.5012240409851074, + "learning_rate": 4.320455818373943e-05, + "loss": 3.1449, + "step": 40417 + }, + { + "epoch": 0.24037729565134647, + "grad_norm": 2.0380454063415527, + "learning_rate": 4.320423803894704e-05, + "loss": 4.0501, + "step": 40418 + }, + { + "epoch": 0.24038324293462746, + "grad_norm": 1.3713030815124512, + "learning_rate": 4.3203917887799736e-05, + "loss": 4.9033, + "step": 40419 + }, + { + "epoch": 0.24038919021790847, + "grad_norm": 1.3717049360275269, + "learning_rate": 4.320359773029764e-05, + "loss": 4.4798, + "step": 40420 + }, + { + "epoch": 0.24039513750118946, + "grad_norm": 1.362446665763855, + "learning_rate": 4.320327756644085e-05, + "loss": 4.5595, + "step": 40421 + }, + { + "epoch": 0.24040108478447045, + "grad_norm": 1.645869255065918, + "learning_rate": 4.320295739622949e-05, + "loss": 4.3585, + "step": 40422 + }, + { + "epoch": 0.24040703206775146, + "grad_norm": 1.503118872642517, + "learning_rate": 4.320263721966367e-05, + "loss": 4.6053, + "step": 40423 + }, + { + "epoch": 0.24041297935103245, + "grad_norm": 1.7341761589050293, + "learning_rate": 4.320231703674348e-05, + "loss": 4.717, + "step": 40424 + }, + { + "epoch": 0.24041892663431344, + "grad_norm": 1.5121181011199951, + "learning_rate": 4.320199684746906e-05, + "loss": 4.5785, + "step": 40425 + }, + { + "epoch": 0.24042487391759446, + "grad_norm": 1.7739570140838623, + "learning_rate": 4.320167665184052e-05, + "loss": 4.2038, + "step": 40426 + }, + { + "epoch": 0.24043082120087544, + "grad_norm": 1.622102975845337, + "learning_rate": 4.320135644985794e-05, + "loss": 4.4672, + "step": 40427 + }, + { + "epoch": 0.24043676848415643, + "grad_norm": 1.571637511253357, + "learning_rate": 4.320103624152148e-05, + "loss": 4.1832, + "step": 40428 + }, + { + "epoch": 0.24044271576743745, + "grad_norm": 1.5558611154556274, + "learning_rate": 4.320071602683121e-05, + "loss": 4.529, + "step": 40429 + }, + { + "epoch": 0.24044866305071844, + "grad_norm": 1.4468823671340942, + "learning_rate": 4.320039580578726e-05, + "loss": 4.641, + "step": 40430 + }, + { + "epoch": 0.24045461033399942, + "grad_norm": 1.4183605909347534, + "learning_rate": 4.3200075578389745e-05, + "loss": 4.7441, + "step": 40431 + }, + { + "epoch": 0.2404605576172804, + "grad_norm": 1.4913756847381592, + "learning_rate": 4.319975534463876e-05, + "loss": 4.9144, + "step": 40432 + }, + { + "epoch": 0.24046650490056143, + "grad_norm": 1.6980154514312744, + "learning_rate": 4.3199435104534444e-05, + "loss": 4.4817, + "step": 40433 + }, + { + "epoch": 0.24047245218384242, + "grad_norm": 1.3807332515716553, + "learning_rate": 4.3199114858076884e-05, + "loss": 4.4927, + "step": 40434 + }, + { + "epoch": 0.2404783994671234, + "grad_norm": 1.4912723302841187, + "learning_rate": 4.3198794605266204e-05, + "loss": 4.4839, + "step": 40435 + }, + { + "epoch": 0.24048434675040442, + "grad_norm": 1.5393762588500977, + "learning_rate": 4.319847434610251e-05, + "loss": 4.6246, + "step": 40436 + }, + { + "epoch": 0.2404902940336854, + "grad_norm": 1.5918090343475342, + "learning_rate": 4.319815408058592e-05, + "loss": 4.7018, + "step": 40437 + }, + { + "epoch": 0.2404962413169664, + "grad_norm": 1.4923152923583984, + "learning_rate": 4.3197833808716536e-05, + "loss": 4.8207, + "step": 40438 + }, + { + "epoch": 0.2405021886002474, + "grad_norm": 1.5670098066329956, + "learning_rate": 4.319751353049448e-05, + "loss": 4.6249, + "step": 40439 + }, + { + "epoch": 0.2405081358835284, + "grad_norm": 1.7240232229232788, + "learning_rate": 4.319719324591986e-05, + "loss": 4.1793, + "step": 40440 + }, + { + "epoch": 0.2405140831668094, + "grad_norm": 1.6572893857955933, + "learning_rate": 4.3196872954992795e-05, + "loss": 4.8213, + "step": 40441 + }, + { + "epoch": 0.2405200304500904, + "grad_norm": 1.5851125717163086, + "learning_rate": 4.319655265771339e-05, + "loss": 4.5309, + "step": 40442 + }, + { + "epoch": 0.2405259777333714, + "grad_norm": 1.7426246404647827, + "learning_rate": 4.3196232354081746e-05, + "loss": 4.1475, + "step": 40443 + }, + { + "epoch": 0.24053192501665238, + "grad_norm": 1.4856244325637817, + "learning_rate": 4.319591204409799e-05, + "loss": 4.8924, + "step": 40444 + }, + { + "epoch": 0.2405378722999334, + "grad_norm": 1.5679676532745361, + "learning_rate": 4.319559172776222e-05, + "loss": 3.9395, + "step": 40445 + }, + { + "epoch": 0.24054381958321439, + "grad_norm": 1.6583149433135986, + "learning_rate": 4.3195271405074564e-05, + "loss": 3.6517, + "step": 40446 + }, + { + "epoch": 0.24054976686649537, + "grad_norm": 1.4699217081069946, + "learning_rate": 4.3194951076035134e-05, + "loss": 4.8774, + "step": 40447 + }, + { + "epoch": 0.2405557141497764, + "grad_norm": 1.4718286991119385, + "learning_rate": 4.319463074064403e-05, + "loss": 5.0944, + "step": 40448 + }, + { + "epoch": 0.24056166143305738, + "grad_norm": 1.8027266263961792, + "learning_rate": 4.3194310398901366e-05, + "loss": 4.8006, + "step": 40449 + }, + { + "epoch": 0.24056760871633837, + "grad_norm": 1.8137298822402954, + "learning_rate": 4.319399005080725e-05, + "loss": 4.3831, + "step": 40450 + }, + { + "epoch": 0.24057355599961938, + "grad_norm": 1.7204256057739258, + "learning_rate": 4.319366969636181e-05, + "loss": 4.6236, + "step": 40451 + }, + { + "epoch": 0.24057950328290037, + "grad_norm": 1.5995945930480957, + "learning_rate": 4.3193349335565146e-05, + "loss": 4.8069, + "step": 40452 + }, + { + "epoch": 0.24058545056618136, + "grad_norm": 1.726744294166565, + "learning_rate": 4.319302896841737e-05, + "loss": 4.3986, + "step": 40453 + }, + { + "epoch": 0.24059139784946237, + "grad_norm": 1.816493034362793, + "learning_rate": 4.31927085949186e-05, + "loss": 4.3756, + "step": 40454 + }, + { + "epoch": 0.24059734513274336, + "grad_norm": 1.4733374118804932, + "learning_rate": 4.319238821506894e-05, + "loss": 5.1779, + "step": 40455 + }, + { + "epoch": 0.24060329241602435, + "grad_norm": 1.761458158493042, + "learning_rate": 4.31920678288685e-05, + "loss": 4.7514, + "step": 40456 + }, + { + "epoch": 0.24060923969930537, + "grad_norm": 1.6330347061157227, + "learning_rate": 4.319174743631741e-05, + "loss": 4.86, + "step": 40457 + }, + { + "epoch": 0.24061518698258635, + "grad_norm": 1.720502495765686, + "learning_rate": 4.319142703741576e-05, + "loss": 4.8738, + "step": 40458 + }, + { + "epoch": 0.24062113426586734, + "grad_norm": 1.8455824851989746, + "learning_rate": 4.319110663216367e-05, + "loss": 4.7941, + "step": 40459 + }, + { + "epoch": 0.24062708154914836, + "grad_norm": 1.6219062805175781, + "learning_rate": 4.3190786220561255e-05, + "loss": 5.104, + "step": 40460 + }, + { + "epoch": 0.24063302883242935, + "grad_norm": 1.6036477088928223, + "learning_rate": 4.319046580260862e-05, + "loss": 4.7218, + "step": 40461 + }, + { + "epoch": 0.24063897611571033, + "grad_norm": 1.6729902029037476, + "learning_rate": 4.319014537830589e-05, + "loss": 4.7354, + "step": 40462 + }, + { + "epoch": 0.24064492339899135, + "grad_norm": 1.7297226190567017, + "learning_rate": 4.318982494765316e-05, + "loss": 4.6462, + "step": 40463 + }, + { + "epoch": 0.24065087068227234, + "grad_norm": 1.5223788022994995, + "learning_rate": 4.318950451065055e-05, + "loss": 4.7275, + "step": 40464 + }, + { + "epoch": 0.24065681796555333, + "grad_norm": 1.5921378135681152, + "learning_rate": 4.318918406729818e-05, + "loss": 4.6989, + "step": 40465 + }, + { + "epoch": 0.24066276524883434, + "grad_norm": 1.527626395225525, + "learning_rate": 4.3188863617596145e-05, + "loss": 5.0686, + "step": 40466 + }, + { + "epoch": 0.24066871253211533, + "grad_norm": 1.5577787160873413, + "learning_rate": 4.318854316154457e-05, + "loss": 4.8386, + "step": 40467 + }, + { + "epoch": 0.24067465981539632, + "grad_norm": 2.178595542907715, + "learning_rate": 4.3188222699143564e-05, + "loss": 3.7511, + "step": 40468 + }, + { + "epoch": 0.24068060709867733, + "grad_norm": 2.2041494846343994, + "learning_rate": 4.318790223039324e-05, + "loss": 3.7051, + "step": 40469 + }, + { + "epoch": 0.24068655438195832, + "grad_norm": 1.6436792612075806, + "learning_rate": 4.3187581755293696e-05, + "loss": 4.337, + "step": 40470 + }, + { + "epoch": 0.2406925016652393, + "grad_norm": 1.721710205078125, + "learning_rate": 4.3187261273845066e-05, + "loss": 4.6658, + "step": 40471 + }, + { + "epoch": 0.24069844894852033, + "grad_norm": 1.6310043334960938, + "learning_rate": 4.318694078604745e-05, + "loss": 4.6236, + "step": 40472 + }, + { + "epoch": 0.24070439623180132, + "grad_norm": 1.4949095249176025, + "learning_rate": 4.3186620291900956e-05, + "loss": 4.6444, + "step": 40473 + }, + { + "epoch": 0.2407103435150823, + "grad_norm": 1.5411988496780396, + "learning_rate": 4.3186299791405706e-05, + "loss": 4.4757, + "step": 40474 + }, + { + "epoch": 0.24071629079836332, + "grad_norm": 1.7229669094085693, + "learning_rate": 4.3185979284561795e-05, + "loss": 4.6702, + "step": 40475 + }, + { + "epoch": 0.2407222380816443, + "grad_norm": 1.5889441967010498, + "learning_rate": 4.318565877136935e-05, + "loss": 4.8576, + "step": 40476 + }, + { + "epoch": 0.2407281853649253, + "grad_norm": 1.9724572896957397, + "learning_rate": 4.318533825182849e-05, + "loss": 4.1747, + "step": 40477 + }, + { + "epoch": 0.2407341326482063, + "grad_norm": 1.6053643226623535, + "learning_rate": 4.3185017725939315e-05, + "loss": 4.7893, + "step": 40478 + }, + { + "epoch": 0.2407400799314873, + "grad_norm": 1.7052903175354004, + "learning_rate": 4.3184697193701926e-05, + "loss": 5.1706, + "step": 40479 + }, + { + "epoch": 0.2407460272147683, + "grad_norm": 1.5560256242752075, + "learning_rate": 4.318437665511645e-05, + "loss": 4.9775, + "step": 40480 + }, + { + "epoch": 0.2407519744980493, + "grad_norm": 1.7095351219177246, + "learning_rate": 4.318405611018301e-05, + "loss": 4.9416, + "step": 40481 + }, + { + "epoch": 0.2407579217813303, + "grad_norm": 1.6378494501113892, + "learning_rate": 4.3183735558901694e-05, + "loss": 5.0545, + "step": 40482 + }, + { + "epoch": 0.24076386906461128, + "grad_norm": 1.2932690382003784, + "learning_rate": 4.318341500127262e-05, + "loss": 5.3011, + "step": 40483 + }, + { + "epoch": 0.2407698163478923, + "grad_norm": 1.654869556427002, + "learning_rate": 4.318309443729591e-05, + "loss": 4.5987, + "step": 40484 + }, + { + "epoch": 0.24077576363117328, + "grad_norm": 1.5318657159805298, + "learning_rate": 4.318277386697167e-05, + "loss": 4.9222, + "step": 40485 + }, + { + "epoch": 0.24078171091445427, + "grad_norm": 1.8937638998031616, + "learning_rate": 4.3182453290300004e-05, + "loss": 4.4559, + "step": 40486 + }, + { + "epoch": 0.2407876581977353, + "grad_norm": 1.5016818046569824, + "learning_rate": 4.3182132707281044e-05, + "loss": 4.3193, + "step": 40487 + }, + { + "epoch": 0.24079360548101628, + "grad_norm": 1.7896372079849243, + "learning_rate": 4.3181812117914874e-05, + "loss": 4.4757, + "step": 40488 + }, + { + "epoch": 0.24079955276429726, + "grad_norm": 1.5730087757110596, + "learning_rate": 4.3181491522201635e-05, + "loss": 4.4021, + "step": 40489 + }, + { + "epoch": 0.24080550004757825, + "grad_norm": 1.8039954900741577, + "learning_rate": 4.3181170920141415e-05, + "loss": 4.9458, + "step": 40490 + }, + { + "epoch": 0.24081144733085927, + "grad_norm": 1.6354478597640991, + "learning_rate": 4.3180850311734344e-05, + "loss": 4.1364, + "step": 40491 + }, + { + "epoch": 0.24081739461414026, + "grad_norm": 1.6513251066207886, + "learning_rate": 4.318052969698052e-05, + "loss": 4.3631, + "step": 40492 + }, + { + "epoch": 0.24082334189742124, + "grad_norm": 1.3984806537628174, + "learning_rate": 4.3180209075880066e-05, + "loss": 4.9405, + "step": 40493 + }, + { + "epoch": 0.24082928918070226, + "grad_norm": 1.6394158601760864, + "learning_rate": 4.317988844843308e-05, + "loss": 4.6277, + "step": 40494 + }, + { + "epoch": 0.24083523646398325, + "grad_norm": 1.8858355283737183, + "learning_rate": 4.317956781463969e-05, + "loss": 3.9912, + "step": 40495 + }, + { + "epoch": 0.24084118374726424, + "grad_norm": 1.5893759727478027, + "learning_rate": 4.3179247174500005e-05, + "loss": 4.7179, + "step": 40496 + }, + { + "epoch": 0.24084713103054525, + "grad_norm": 1.5338923931121826, + "learning_rate": 4.317892652801413e-05, + "loss": 4.7104, + "step": 40497 + }, + { + "epoch": 0.24085307831382624, + "grad_norm": 1.3246487379074097, + "learning_rate": 4.317860587518217e-05, + "loss": 4.9648, + "step": 40498 + }, + { + "epoch": 0.24085902559710723, + "grad_norm": 1.5209506750106812, + "learning_rate": 4.3178285216004256e-05, + "loss": 4.8658, + "step": 40499 + }, + { + "epoch": 0.24086497288038825, + "grad_norm": 1.277113914489746, + "learning_rate": 4.3177964550480486e-05, + "loss": 4.9573, + "step": 40500 + }, + { + "epoch": 0.24087092016366923, + "grad_norm": 1.3813836574554443, + "learning_rate": 4.3177643878610975e-05, + "loss": 5.1197, + "step": 40501 + }, + { + "epoch": 0.24087686744695022, + "grad_norm": 1.4858213663101196, + "learning_rate": 4.317732320039584e-05, + "loss": 4.5417, + "step": 40502 + }, + { + "epoch": 0.24088281473023124, + "grad_norm": 1.3848856687545776, + "learning_rate": 4.3177002515835186e-05, + "loss": 4.9915, + "step": 40503 + }, + { + "epoch": 0.24088876201351223, + "grad_norm": 1.4599226713180542, + "learning_rate": 4.317668182492913e-05, + "loss": 4.8854, + "step": 40504 + }, + { + "epoch": 0.2408947092967932, + "grad_norm": 1.62251615524292, + "learning_rate": 4.317636112767779e-05, + "loss": 4.7812, + "step": 40505 + }, + { + "epoch": 0.24090065658007423, + "grad_norm": 1.4397335052490234, + "learning_rate": 4.317604042408126e-05, + "loss": 4.9095, + "step": 40506 + }, + { + "epoch": 0.24090660386335522, + "grad_norm": 1.5139696598052979, + "learning_rate": 4.3175719714139664e-05, + "loss": 5.111, + "step": 40507 + }, + { + "epoch": 0.2409125511466362, + "grad_norm": 1.4132105112075806, + "learning_rate": 4.3175398997853114e-05, + "loss": 4.8558, + "step": 40508 + }, + { + "epoch": 0.24091849842991722, + "grad_norm": 1.4312713146209717, + "learning_rate": 4.317507827522172e-05, + "loss": 4.8636, + "step": 40509 + }, + { + "epoch": 0.2409244457131982, + "grad_norm": 1.4198713302612305, + "learning_rate": 4.317475754624559e-05, + "loss": 4.8175, + "step": 40510 + }, + { + "epoch": 0.2409303929964792, + "grad_norm": 1.3795651197433472, + "learning_rate": 4.3174436810924844e-05, + "loss": 4.9265, + "step": 40511 + }, + { + "epoch": 0.2409363402797602, + "grad_norm": 1.554825782775879, + "learning_rate": 4.317411606925959e-05, + "loss": 4.9263, + "step": 40512 + }, + { + "epoch": 0.2409422875630412, + "grad_norm": 1.2066971063613892, + "learning_rate": 4.317379532124993e-05, + "loss": 5.0776, + "step": 40513 + }, + { + "epoch": 0.2409482348463222, + "grad_norm": 1.3147616386413574, + "learning_rate": 4.3173474566896e-05, + "loss": 4.7452, + "step": 40514 + }, + { + "epoch": 0.2409541821296032, + "grad_norm": 1.411190390586853, + "learning_rate": 4.3173153806197895e-05, + "loss": 4.5922, + "step": 40515 + }, + { + "epoch": 0.2409601294128842, + "grad_norm": 1.7078477144241333, + "learning_rate": 4.317283303915572e-05, + "loss": 4.641, + "step": 40516 + }, + { + "epoch": 0.24096607669616518, + "grad_norm": 1.7670397758483887, + "learning_rate": 4.317251226576961e-05, + "loss": 4.5015, + "step": 40517 + }, + { + "epoch": 0.2409720239794462, + "grad_norm": 1.6643019914627075, + "learning_rate": 4.317219148603966e-05, + "loss": 4.4542, + "step": 40518 + }, + { + "epoch": 0.2409779712627272, + "grad_norm": 1.3657439947128296, + "learning_rate": 4.3171870699965975e-05, + "loss": 4.512, + "step": 40519 + }, + { + "epoch": 0.24098391854600817, + "grad_norm": 1.5493052005767822, + "learning_rate": 4.317154990754869e-05, + "loss": 4.4967, + "step": 40520 + }, + { + "epoch": 0.2409898658292892, + "grad_norm": 1.6112905740737915, + "learning_rate": 4.31712291087879e-05, + "loss": 4.262, + "step": 40521 + }, + { + "epoch": 0.24099581311257018, + "grad_norm": 2.0505878925323486, + "learning_rate": 4.317090830368372e-05, + "loss": 4.9236, + "step": 40522 + }, + { + "epoch": 0.24100176039585117, + "grad_norm": 2.3958640098571777, + "learning_rate": 4.317058749223627e-05, + "loss": 5.0165, + "step": 40523 + }, + { + "epoch": 0.24100770767913218, + "grad_norm": 2.0765507221221924, + "learning_rate": 4.317026667444565e-05, + "loss": 4.9959, + "step": 40524 + }, + { + "epoch": 0.24101365496241317, + "grad_norm": 1.4790958166122437, + "learning_rate": 4.316994585031198e-05, + "loss": 5.119, + "step": 40525 + }, + { + "epoch": 0.24101960224569416, + "grad_norm": 1.4160449504852295, + "learning_rate": 4.3169625019835366e-05, + "loss": 5.1832, + "step": 40526 + }, + { + "epoch": 0.24102554952897517, + "grad_norm": 1.6198077201843262, + "learning_rate": 4.3169304183015927e-05, + "loss": 5.0807, + "step": 40527 + }, + { + "epoch": 0.24103149681225616, + "grad_norm": 1.8107160329818726, + "learning_rate": 4.3168983339853776e-05, + "loss": 4.8454, + "step": 40528 + }, + { + "epoch": 0.24103744409553715, + "grad_norm": 1.5346699953079224, + "learning_rate": 4.316866249034901e-05, + "loss": 5.1825, + "step": 40529 + }, + { + "epoch": 0.24104339137881817, + "grad_norm": 1.5611132383346558, + "learning_rate": 4.316834163450176e-05, + "loss": 5.273, + "step": 40530 + }, + { + "epoch": 0.24104933866209916, + "grad_norm": 1.5221023559570312, + "learning_rate": 4.3168020772312124e-05, + "loss": 5.0283, + "step": 40531 + }, + { + "epoch": 0.24105528594538014, + "grad_norm": 1.441698670387268, + "learning_rate": 4.3167699903780225e-05, + "loss": 4.9875, + "step": 40532 + }, + { + "epoch": 0.24106123322866116, + "grad_norm": 1.7530490159988403, + "learning_rate": 4.3167379028906163e-05, + "loss": 4.6516, + "step": 40533 + }, + { + "epoch": 0.24106718051194215, + "grad_norm": 1.7354764938354492, + "learning_rate": 4.316705814769007e-05, + "loss": 4.9458, + "step": 40534 + }, + { + "epoch": 0.24107312779522314, + "grad_norm": 1.8037127256393433, + "learning_rate": 4.316673726013203e-05, + "loss": 4.7518, + "step": 40535 + }, + { + "epoch": 0.24107907507850415, + "grad_norm": 2.5831401348114014, + "learning_rate": 4.3166416366232175e-05, + "loss": 3.5643, + "step": 40536 + }, + { + "epoch": 0.24108502236178514, + "grad_norm": 2.215723991394043, + "learning_rate": 4.316609546599061e-05, + "loss": 3.7712, + "step": 40537 + }, + { + "epoch": 0.24109096964506613, + "grad_norm": 1.8091305494308472, + "learning_rate": 4.316577455940745e-05, + "loss": 4.4812, + "step": 40538 + }, + { + "epoch": 0.24109691692834714, + "grad_norm": 2.0324342250823975, + "learning_rate": 4.31654536464828e-05, + "loss": 3.9794, + "step": 40539 + }, + { + "epoch": 0.24110286421162813, + "grad_norm": 2.8087878227233887, + "learning_rate": 4.3165132727216795e-05, + "loss": 3.6354, + "step": 40540 + }, + { + "epoch": 0.24110881149490912, + "grad_norm": 2.4571850299835205, + "learning_rate": 4.316481180160952e-05, + "loss": 4.3712, + "step": 40541 + }, + { + "epoch": 0.24111475877819014, + "grad_norm": 1.7085291147232056, + "learning_rate": 4.316449086966109e-05, + "loss": 3.9684, + "step": 40542 + }, + { + "epoch": 0.24112070606147112, + "grad_norm": 2.2950375080108643, + "learning_rate": 4.3164169931371636e-05, + "loss": 3.298, + "step": 40543 + }, + { + "epoch": 0.2411266533447521, + "grad_norm": 2.7846076488494873, + "learning_rate": 4.316384898674125e-05, + "loss": 3.4768, + "step": 40544 + }, + { + "epoch": 0.24113260062803313, + "grad_norm": 2.7075061798095703, + "learning_rate": 4.316352803577005e-05, + "loss": 3.6938, + "step": 40545 + }, + { + "epoch": 0.24113854791131412, + "grad_norm": 2.694310426712036, + "learning_rate": 4.3163207078458154e-05, + "loss": 3.4619, + "step": 40546 + }, + { + "epoch": 0.2411444951945951, + "grad_norm": 2.5002973079681396, + "learning_rate": 4.316288611480567e-05, + "loss": 3.4072, + "step": 40547 + }, + { + "epoch": 0.2411504424778761, + "grad_norm": 1.715087890625, + "learning_rate": 4.316256514481271e-05, + "loss": 4.3693, + "step": 40548 + }, + { + "epoch": 0.2411563897611571, + "grad_norm": 1.7517629861831665, + "learning_rate": 4.3162244168479385e-05, + "loss": 3.6944, + "step": 40549 + }, + { + "epoch": 0.2411623370444381, + "grad_norm": 2.1850292682647705, + "learning_rate": 4.3161923185805805e-05, + "loss": 3.5392, + "step": 40550 + }, + { + "epoch": 0.24116828432771908, + "grad_norm": 2.2902944087982178, + "learning_rate": 4.3161602196792095e-05, + "loss": 3.7953, + "step": 40551 + }, + { + "epoch": 0.2411742316110001, + "grad_norm": 1.864230751991272, + "learning_rate": 4.316128120143835e-05, + "loss": 5.0214, + "step": 40552 + }, + { + "epoch": 0.2411801788942811, + "grad_norm": 1.505345344543457, + "learning_rate": 4.3160960199744694e-05, + "loss": 5.0611, + "step": 40553 + }, + { + "epoch": 0.24118612617756208, + "grad_norm": 1.7585827112197876, + "learning_rate": 4.316063919171123e-05, + "loss": 5.326, + "step": 40554 + }, + { + "epoch": 0.2411920734608431, + "grad_norm": 1.6775903701782227, + "learning_rate": 4.3160318177338074e-05, + "loss": 4.3297, + "step": 40555 + }, + { + "epoch": 0.24119802074412408, + "grad_norm": 1.7692220211029053, + "learning_rate": 4.315999715662534e-05, + "loss": 4.1937, + "step": 40556 + }, + { + "epoch": 0.24120396802740507, + "grad_norm": 1.8271069526672363, + "learning_rate": 4.3159676129573144e-05, + "loss": 4.1663, + "step": 40557 + }, + { + "epoch": 0.24120991531068608, + "grad_norm": 1.7876747846603394, + "learning_rate": 4.315935509618158e-05, + "loss": 4.246, + "step": 40558 + }, + { + "epoch": 0.24121586259396707, + "grad_norm": 1.5747913122177124, + "learning_rate": 4.3159034056450786e-05, + "loss": 4.4188, + "step": 40559 + }, + { + "epoch": 0.24122180987724806, + "grad_norm": 1.5375590324401855, + "learning_rate": 4.3158713010380856e-05, + "loss": 4.7125, + "step": 40560 + }, + { + "epoch": 0.24122775716052908, + "grad_norm": 1.6270637512207031, + "learning_rate": 4.3158391957971914e-05, + "loss": 4.3451, + "step": 40561 + }, + { + "epoch": 0.24123370444381007, + "grad_norm": 1.79709792137146, + "learning_rate": 4.315807089922405e-05, + "loss": 4.5274, + "step": 40562 + }, + { + "epoch": 0.24123965172709105, + "grad_norm": 1.265010118484497, + "learning_rate": 4.3157749834137395e-05, + "loss": 5.1969, + "step": 40563 + }, + { + "epoch": 0.24124559901037207, + "grad_norm": 1.3156484365463257, + "learning_rate": 4.315742876271207e-05, + "loss": 5.5351, + "step": 40564 + }, + { + "epoch": 0.24125154629365306, + "grad_norm": 1.3786804676055908, + "learning_rate": 4.3157107684948164e-05, + "loss": 5.2817, + "step": 40565 + }, + { + "epoch": 0.24125749357693405, + "grad_norm": 1.4848926067352295, + "learning_rate": 4.31567866008458e-05, + "loss": 5.4798, + "step": 40566 + }, + { + "epoch": 0.24126344086021506, + "grad_norm": 1.669960618019104, + "learning_rate": 4.315646551040508e-05, + "loss": 4.687, + "step": 40567 + }, + { + "epoch": 0.24126938814349605, + "grad_norm": 1.9244717359542847, + "learning_rate": 4.315614441362614e-05, + "loss": 3.8258, + "step": 40568 + }, + { + "epoch": 0.24127533542677704, + "grad_norm": 2.9237818717956543, + "learning_rate": 4.315582331050907e-05, + "loss": 4.1739, + "step": 40569 + }, + { + "epoch": 0.24128128271005805, + "grad_norm": 3.028883934020996, + "learning_rate": 4.3155502201054e-05, + "loss": 4.2054, + "step": 40570 + }, + { + "epoch": 0.24128722999333904, + "grad_norm": 3.0935630798339844, + "learning_rate": 4.315518108526102e-05, + "loss": 4.2096, + "step": 40571 + }, + { + "epoch": 0.24129317727662003, + "grad_norm": 1.7454851865768433, + "learning_rate": 4.3154859963130255e-05, + "loss": 4.6698, + "step": 40572 + }, + { + "epoch": 0.24129912455990105, + "grad_norm": 1.3664960861206055, + "learning_rate": 4.315453883466182e-05, + "loss": 4.8047, + "step": 40573 + }, + { + "epoch": 0.24130507184318203, + "grad_norm": 1.2120872735977173, + "learning_rate": 4.315421769985582e-05, + "loss": 4.7992, + "step": 40574 + }, + { + "epoch": 0.24131101912646302, + "grad_norm": 1.3705421686172485, + "learning_rate": 4.315389655871237e-05, + "loss": 4.7221, + "step": 40575 + }, + { + "epoch": 0.24131696640974404, + "grad_norm": 1.5563099384307861, + "learning_rate": 4.315357541123159e-05, + "loss": 4.9588, + "step": 40576 + }, + { + "epoch": 0.24132291369302503, + "grad_norm": 1.4895544052124023, + "learning_rate": 4.315325425741358e-05, + "loss": 5.4245, + "step": 40577 + }, + { + "epoch": 0.24132886097630601, + "grad_norm": 1.4575716257095337, + "learning_rate": 4.315293309725845e-05, + "loss": 5.1502, + "step": 40578 + }, + { + "epoch": 0.24133480825958703, + "grad_norm": 1.4507086277008057, + "learning_rate": 4.315261193076632e-05, + "loss": 5.0112, + "step": 40579 + }, + { + "epoch": 0.24134075554286802, + "grad_norm": 1.5704593658447266, + "learning_rate": 4.315229075793731e-05, + "loss": 4.8826, + "step": 40580 + }, + { + "epoch": 0.241346702826149, + "grad_norm": 1.5441209077835083, + "learning_rate": 4.315196957877151e-05, + "loss": 4.9448, + "step": 40581 + }, + { + "epoch": 0.24135265010943002, + "grad_norm": 1.5289744138717651, + "learning_rate": 4.315164839326906e-05, + "loss": 4.8663, + "step": 40582 + }, + { + "epoch": 0.241358597392711, + "grad_norm": 1.610520362854004, + "learning_rate": 4.315132720143005e-05, + "loss": 4.865, + "step": 40583 + }, + { + "epoch": 0.241364544675992, + "grad_norm": 1.4503530263900757, + "learning_rate": 4.3151006003254596e-05, + "loss": 4.9839, + "step": 40584 + }, + { + "epoch": 0.24137049195927301, + "grad_norm": 1.581776738166809, + "learning_rate": 4.3150684798742816e-05, + "loss": 4.87, + "step": 40585 + }, + { + "epoch": 0.241376439242554, + "grad_norm": 1.5973701477050781, + "learning_rate": 4.3150363587894816e-05, + "loss": 4.4253, + "step": 40586 + }, + { + "epoch": 0.241382386525835, + "grad_norm": 1.4599217176437378, + "learning_rate": 4.315004237071072e-05, + "loss": 5.2499, + "step": 40587 + }, + { + "epoch": 0.241388333809116, + "grad_norm": 1.8869445323944092, + "learning_rate": 4.3149721147190625e-05, + "loss": 4.6595, + "step": 40588 + }, + { + "epoch": 0.241394281092397, + "grad_norm": 1.5399142503738403, + "learning_rate": 4.314939991733465e-05, + "loss": 4.8666, + "step": 40589 + }, + { + "epoch": 0.24140022837567798, + "grad_norm": 1.2901896238327026, + "learning_rate": 4.314907868114291e-05, + "loss": 4.8569, + "step": 40590 + }, + { + "epoch": 0.241406175658959, + "grad_norm": 1.568609356880188, + "learning_rate": 4.314875743861551e-05, + "loss": 4.5609, + "step": 40591 + }, + { + "epoch": 0.24141212294224, + "grad_norm": 1.5415786504745483, + "learning_rate": 4.314843618975257e-05, + "loss": 4.7633, + "step": 40592 + }, + { + "epoch": 0.24141807022552098, + "grad_norm": 1.4981814622879028, + "learning_rate": 4.31481149345542e-05, + "loss": 4.9199, + "step": 40593 + }, + { + "epoch": 0.241424017508802, + "grad_norm": 1.4273128509521484, + "learning_rate": 4.3147793673020506e-05, + "loss": 5.0187, + "step": 40594 + }, + { + "epoch": 0.24142996479208298, + "grad_norm": 1.4518111944198608, + "learning_rate": 4.31474724051516e-05, + "loss": 4.9031, + "step": 40595 + }, + { + "epoch": 0.24143591207536397, + "grad_norm": 1.65366792678833, + "learning_rate": 4.314715113094761e-05, + "loss": 4.1667, + "step": 40596 + }, + { + "epoch": 0.24144185935864498, + "grad_norm": 1.696748971939087, + "learning_rate": 4.3146829850408635e-05, + "loss": 4.4033, + "step": 40597 + }, + { + "epoch": 0.24144780664192597, + "grad_norm": 1.4657249450683594, + "learning_rate": 4.314650856353478e-05, + "loss": 4.729, + "step": 40598 + }, + { + "epoch": 0.24145375392520696, + "grad_norm": 1.462398886680603, + "learning_rate": 4.3146187270326176e-05, + "loss": 4.3848, + "step": 40599 + }, + { + "epoch": 0.24145970120848798, + "grad_norm": 1.436286211013794, + "learning_rate": 4.314586597078292e-05, + "loss": 4.2975, + "step": 40600 + }, + { + "epoch": 0.24146564849176896, + "grad_norm": 1.5341529846191406, + "learning_rate": 4.314554466490514e-05, + "loss": 4.3528, + "step": 40601 + }, + { + "epoch": 0.24147159577504995, + "grad_norm": 1.5852245092391968, + "learning_rate": 4.314522335269292e-05, + "loss": 4.4556, + "step": 40602 + }, + { + "epoch": 0.24147754305833097, + "grad_norm": 1.539803385734558, + "learning_rate": 4.3144902034146405e-05, + "loss": 4.4091, + "step": 40603 + }, + { + "epoch": 0.24148349034161196, + "grad_norm": 1.5505869388580322, + "learning_rate": 4.314458070926569e-05, + "loss": 4.66, + "step": 40604 + }, + { + "epoch": 0.24148943762489294, + "grad_norm": 1.4669098854064941, + "learning_rate": 4.314425937805088e-05, + "loss": 4.4381, + "step": 40605 + }, + { + "epoch": 0.24149538490817393, + "grad_norm": 1.4475486278533936, + "learning_rate": 4.314393804050211e-05, + "loss": 4.2177, + "step": 40606 + }, + { + "epoch": 0.24150133219145495, + "grad_norm": 1.45993173122406, + "learning_rate": 4.3143616696619474e-05, + "loss": 4.6229, + "step": 40607 + }, + { + "epoch": 0.24150727947473594, + "grad_norm": 2.277418613433838, + "learning_rate": 4.314329534640308e-05, + "loss": 4.5984, + "step": 40608 + }, + { + "epoch": 0.24151322675801692, + "grad_norm": 1.7645221948623657, + "learning_rate": 4.314297398985305e-05, + "loss": 4.7838, + "step": 40609 + }, + { + "epoch": 0.24151917404129794, + "grad_norm": 1.837776780128479, + "learning_rate": 4.31426526269695e-05, + "loss": 4.5681, + "step": 40610 + }, + { + "epoch": 0.24152512132457893, + "grad_norm": 1.534732699394226, + "learning_rate": 4.3142331257752546e-05, + "loss": 4.948, + "step": 40611 + }, + { + "epoch": 0.24153106860785992, + "grad_norm": 1.4155583381652832, + "learning_rate": 4.314200988220228e-05, + "loss": 5.1041, + "step": 40612 + }, + { + "epoch": 0.24153701589114093, + "grad_norm": 1.383591651916504, + "learning_rate": 4.3141688500318836e-05, + "loss": 5.1692, + "step": 40613 + }, + { + "epoch": 0.24154296317442192, + "grad_norm": 1.4934002161026, + "learning_rate": 4.3141367112102306e-05, + "loss": 4.8285, + "step": 40614 + }, + { + "epoch": 0.2415489104577029, + "grad_norm": 1.2760400772094727, + "learning_rate": 4.3141045717552817e-05, + "loss": 5.1332, + "step": 40615 + }, + { + "epoch": 0.24155485774098392, + "grad_norm": 1.488547682762146, + "learning_rate": 4.3140724316670475e-05, + "loss": 5.0669, + "step": 40616 + }, + { + "epoch": 0.2415608050242649, + "grad_norm": 1.6372644901275635, + "learning_rate": 4.314040290945539e-05, + "loss": 5.1044, + "step": 40617 + }, + { + "epoch": 0.2415667523075459, + "grad_norm": 1.4470680952072144, + "learning_rate": 4.314008149590768e-05, + "loss": 4.9669, + "step": 40618 + }, + { + "epoch": 0.24157269959082692, + "grad_norm": 1.306457281112671, + "learning_rate": 4.3139760076027455e-05, + "loss": 5.4537, + "step": 40619 + }, + { + "epoch": 0.2415786468741079, + "grad_norm": 1.4715678691864014, + "learning_rate": 4.313943864981484e-05, + "loss": 5.0089, + "step": 40620 + }, + { + "epoch": 0.2415845941573889, + "grad_norm": 1.4831035137176514, + "learning_rate": 4.3139117217269916e-05, + "loss": 4.9291, + "step": 40621 + }, + { + "epoch": 0.2415905414406699, + "grad_norm": 1.1923593282699585, + "learning_rate": 4.313879577839282e-05, + "loss": 5.2527, + "step": 40622 + }, + { + "epoch": 0.2415964887239509, + "grad_norm": 1.61532723903656, + "learning_rate": 4.313847433318365e-05, + "loss": 4.4126, + "step": 40623 + }, + { + "epoch": 0.24160243600723189, + "grad_norm": 2.1270103454589844, + "learning_rate": 4.313815288164254e-05, + "loss": 4.1042, + "step": 40624 + }, + { + "epoch": 0.2416083832905129, + "grad_norm": 1.6079933643341064, + "learning_rate": 4.3137831423769584e-05, + "loss": 4.8034, + "step": 40625 + }, + { + "epoch": 0.2416143305737939, + "grad_norm": 1.1989984512329102, + "learning_rate": 4.3137509959564896e-05, + "loss": 4.684, + "step": 40626 + }, + { + "epoch": 0.24162027785707488, + "grad_norm": 1.316107988357544, + "learning_rate": 4.313718848902859e-05, + "loss": 5.0338, + "step": 40627 + }, + { + "epoch": 0.2416262251403559, + "grad_norm": 1.0591543912887573, + "learning_rate": 4.3136867012160784e-05, + "loss": 5.0675, + "step": 40628 + }, + { + "epoch": 0.24163217242363688, + "grad_norm": 1.7170097827911377, + "learning_rate": 4.313654552896158e-05, + "loss": 5.1597, + "step": 40629 + }, + { + "epoch": 0.24163811970691787, + "grad_norm": 1.5194562673568726, + "learning_rate": 4.313622403943109e-05, + "loss": 5.0904, + "step": 40630 + }, + { + "epoch": 0.24164406699019889, + "grad_norm": 1.5801897048950195, + "learning_rate": 4.313590254356945e-05, + "loss": 4.4032, + "step": 40631 + }, + { + "epoch": 0.24165001427347987, + "grad_norm": 1.294654369354248, + "learning_rate": 4.3135581041376735e-05, + "loss": 4.9908, + "step": 40632 + }, + { + "epoch": 0.24165596155676086, + "grad_norm": 1.1565773487091064, + "learning_rate": 4.313525953285309e-05, + "loss": 5.1604, + "step": 40633 + }, + { + "epoch": 0.24166190884004188, + "grad_norm": 1.5131183862686157, + "learning_rate": 4.3134938017998606e-05, + "loss": 5.0017, + "step": 40634 + }, + { + "epoch": 0.24166785612332287, + "grad_norm": 1.2678660154342651, + "learning_rate": 4.3134616496813395e-05, + "loss": 4.9457, + "step": 40635 + }, + { + "epoch": 0.24167380340660385, + "grad_norm": 1.2688038349151611, + "learning_rate": 4.313429496929759e-05, + "loss": 4.917, + "step": 40636 + }, + { + "epoch": 0.24167975068988487, + "grad_norm": 2.0157265663146973, + "learning_rate": 4.313397343545128e-05, + "loss": 4.1208, + "step": 40637 + }, + { + "epoch": 0.24168569797316586, + "grad_norm": 2.595851182937622, + "learning_rate": 4.3133651895274593e-05, + "loss": 3.9493, + "step": 40638 + }, + { + "epoch": 0.24169164525644685, + "grad_norm": 2.5459067821502686, + "learning_rate": 4.313333034876764e-05, + "loss": 3.5545, + "step": 40639 + }, + { + "epoch": 0.24169759253972786, + "grad_norm": 2.4376139640808105, + "learning_rate": 4.313300879593052e-05, + "loss": 3.7562, + "step": 40640 + }, + { + "epoch": 0.24170353982300885, + "grad_norm": 2.503230571746826, + "learning_rate": 4.313268723676336e-05, + "loss": 3.8789, + "step": 40641 + }, + { + "epoch": 0.24170948710628984, + "grad_norm": 2.7403228282928467, + "learning_rate": 4.313236567126626e-05, + "loss": 3.876, + "step": 40642 + }, + { + "epoch": 0.24171543438957085, + "grad_norm": 2.327927827835083, + "learning_rate": 4.313204409943934e-05, + "loss": 4.1905, + "step": 40643 + }, + { + "epoch": 0.24172138167285184, + "grad_norm": 2.128430128097534, + "learning_rate": 4.3131722521282716e-05, + "loss": 4.1648, + "step": 40644 + }, + { + "epoch": 0.24172732895613283, + "grad_norm": 1.8504353761672974, + "learning_rate": 4.313140093679649e-05, + "loss": 4.081, + "step": 40645 + }, + { + "epoch": 0.24173327623941385, + "grad_norm": 1.6212143898010254, + "learning_rate": 4.313107934598078e-05, + "loss": 3.9973, + "step": 40646 + }, + { + "epoch": 0.24173922352269483, + "grad_norm": 1.7473843097686768, + "learning_rate": 4.313075774883569e-05, + "loss": 4.081, + "step": 40647 + }, + { + "epoch": 0.24174517080597582, + "grad_norm": 1.7053524255752563, + "learning_rate": 4.313043614536135e-05, + "loss": 4.1126, + "step": 40648 + }, + { + "epoch": 0.24175111808925684, + "grad_norm": 1.8111735582351685, + "learning_rate": 4.3130114535557866e-05, + "loss": 3.9482, + "step": 40649 + }, + { + "epoch": 0.24175706537253783, + "grad_norm": 1.8270765542984009, + "learning_rate": 4.3129792919425336e-05, + "loss": 3.9344, + "step": 40650 + }, + { + "epoch": 0.24176301265581882, + "grad_norm": 1.79839026927948, + "learning_rate": 4.312947129696389e-05, + "loss": 3.8781, + "step": 40651 + }, + { + "epoch": 0.24176895993909983, + "grad_norm": 1.7094690799713135, + "learning_rate": 4.312914966817363e-05, + "loss": 3.9407, + "step": 40652 + }, + { + "epoch": 0.24177490722238082, + "grad_norm": 1.6647990942001343, + "learning_rate": 4.312882803305467e-05, + "loss": 3.8724, + "step": 40653 + }, + { + "epoch": 0.2417808545056618, + "grad_norm": 2.143502712249756, + "learning_rate": 4.3128506391607125e-05, + "loss": 4.0214, + "step": 40654 + }, + { + "epoch": 0.24178680178894282, + "grad_norm": 1.6539605855941772, + "learning_rate": 4.3128184743831105e-05, + "loss": 4.4245, + "step": 40655 + }, + { + "epoch": 0.2417927490722238, + "grad_norm": 2.0731682777404785, + "learning_rate": 4.3127863089726725e-05, + "loss": 4.0422, + "step": 40656 + }, + { + "epoch": 0.2417986963555048, + "grad_norm": 1.5883654356002808, + "learning_rate": 4.3127541429294094e-05, + "loss": 4.386, + "step": 40657 + }, + { + "epoch": 0.24180464363878582, + "grad_norm": 1.9233680963516235, + "learning_rate": 4.312721976253332e-05, + "loss": 3.7945, + "step": 40658 + }, + { + "epoch": 0.2418105909220668, + "grad_norm": 1.8956714868545532, + "learning_rate": 4.312689808944453e-05, + "loss": 3.5475, + "step": 40659 + }, + { + "epoch": 0.2418165382053478, + "grad_norm": 1.8800183534622192, + "learning_rate": 4.312657641002782e-05, + "loss": 3.6183, + "step": 40660 + }, + { + "epoch": 0.2418224854886288, + "grad_norm": 1.6430262327194214, + "learning_rate": 4.312625472428331e-05, + "loss": 3.9247, + "step": 40661 + }, + { + "epoch": 0.2418284327719098, + "grad_norm": 1.8633005619049072, + "learning_rate": 4.312593303221112e-05, + "loss": 4.0027, + "step": 40662 + }, + { + "epoch": 0.24183438005519078, + "grad_norm": 1.7102001905441284, + "learning_rate": 4.3125611333811344e-05, + "loss": 3.8616, + "step": 40663 + }, + { + "epoch": 0.24184032733847177, + "grad_norm": 1.690869688987732, + "learning_rate": 4.312528962908411e-05, + "loss": 4.02, + "step": 40664 + }, + { + "epoch": 0.2418462746217528, + "grad_norm": 1.5433909893035889, + "learning_rate": 4.312496791802952e-05, + "loss": 4.0232, + "step": 40665 + }, + { + "epoch": 0.24185222190503378, + "grad_norm": 1.6856790781021118, + "learning_rate": 4.31246462006477e-05, + "loss": 4.332, + "step": 40666 + }, + { + "epoch": 0.24185816918831476, + "grad_norm": 1.7443509101867676, + "learning_rate": 4.312432447693875e-05, + "loss": 4.2948, + "step": 40667 + }, + { + "epoch": 0.24186411647159578, + "grad_norm": 1.7759877443313599, + "learning_rate": 4.312400274690278e-05, + "loss": 4.1299, + "step": 40668 + }, + { + "epoch": 0.24187006375487677, + "grad_norm": 1.7922500371932983, + "learning_rate": 4.3123681010539905e-05, + "loss": 3.7013, + "step": 40669 + }, + { + "epoch": 0.24187601103815776, + "grad_norm": 1.860171914100647, + "learning_rate": 4.312335926785025e-05, + "loss": 3.488, + "step": 40670 + }, + { + "epoch": 0.24188195832143877, + "grad_norm": 1.9903833866119385, + "learning_rate": 4.312303751883392e-05, + "loss": 3.2618, + "step": 40671 + }, + { + "epoch": 0.24188790560471976, + "grad_norm": 1.7031559944152832, + "learning_rate": 4.312271576349102e-05, + "loss": 4.4311, + "step": 40672 + }, + { + "epoch": 0.24189385288800075, + "grad_norm": 2.0829532146453857, + "learning_rate": 4.312239400182166e-05, + "loss": 3.2503, + "step": 40673 + }, + { + "epoch": 0.24189980017128176, + "grad_norm": 1.8687355518341064, + "learning_rate": 4.312207223382597e-05, + "loss": 3.3861, + "step": 40674 + }, + { + "epoch": 0.24190574745456275, + "grad_norm": 1.899161696434021, + "learning_rate": 4.312175045950404e-05, + "loss": 3.3442, + "step": 40675 + }, + { + "epoch": 0.24191169473784374, + "grad_norm": 1.7589390277862549, + "learning_rate": 4.3121428678856e-05, + "loss": 3.3777, + "step": 40676 + }, + { + "epoch": 0.24191764202112476, + "grad_norm": 1.9061682224273682, + "learning_rate": 4.312110689188196e-05, + "loss": 3.3444, + "step": 40677 + }, + { + "epoch": 0.24192358930440575, + "grad_norm": 1.851563811302185, + "learning_rate": 4.3120785098582026e-05, + "loss": 3.4659, + "step": 40678 + }, + { + "epoch": 0.24192953658768673, + "grad_norm": 1.6008967161178589, + "learning_rate": 4.3120463298956315e-05, + "loss": 4.3622, + "step": 40679 + }, + { + "epoch": 0.24193548387096775, + "grad_norm": 1.5306041240692139, + "learning_rate": 4.3120141493004935e-05, + "loss": 4.4683, + "step": 40680 + }, + { + "epoch": 0.24194143115424874, + "grad_norm": 2.1796019077301025, + "learning_rate": 4.3119819680728e-05, + "loss": 3.2448, + "step": 40681 + }, + { + "epoch": 0.24194737843752973, + "grad_norm": 2.085318088531494, + "learning_rate": 4.311949786212562e-05, + "loss": 3.327, + "step": 40682 + }, + { + "epoch": 0.24195332572081074, + "grad_norm": 1.8397084474563599, + "learning_rate": 4.311917603719792e-05, + "loss": 3.412, + "step": 40683 + }, + { + "epoch": 0.24195927300409173, + "grad_norm": 1.7967350482940674, + "learning_rate": 4.3118854205945e-05, + "loss": 3.4321, + "step": 40684 + }, + { + "epoch": 0.24196522028737272, + "grad_norm": 1.8356339931488037, + "learning_rate": 4.311853236836697e-05, + "loss": 3.3935, + "step": 40685 + }, + { + "epoch": 0.24197116757065373, + "grad_norm": 1.835762619972229, + "learning_rate": 4.3118210524463953e-05, + "loss": 3.4281, + "step": 40686 + }, + { + "epoch": 0.24197711485393472, + "grad_norm": 2.0315866470336914, + "learning_rate": 4.3117888674236054e-05, + "loss": 3.2427, + "step": 40687 + }, + { + "epoch": 0.2419830621372157, + "grad_norm": 1.9090626239776611, + "learning_rate": 4.311756681768339e-05, + "loss": 3.4709, + "step": 40688 + }, + { + "epoch": 0.24198900942049673, + "grad_norm": 1.7753013372421265, + "learning_rate": 4.311724495480607e-05, + "loss": 3.4062, + "step": 40689 + }, + { + "epoch": 0.2419949567037777, + "grad_norm": 1.8800350427627563, + "learning_rate": 4.31169230856042e-05, + "loss": 3.2989, + "step": 40690 + }, + { + "epoch": 0.2420009039870587, + "grad_norm": 1.7852457761764526, + "learning_rate": 4.311660121007791e-05, + "loss": 3.2725, + "step": 40691 + }, + { + "epoch": 0.24200685127033972, + "grad_norm": 1.9425667524337769, + "learning_rate": 4.3116279328227295e-05, + "loss": 3.4108, + "step": 40692 + }, + { + "epoch": 0.2420127985536207, + "grad_norm": 1.9320729970932007, + "learning_rate": 4.311595744005247e-05, + "loss": 3.3323, + "step": 40693 + }, + { + "epoch": 0.2420187458369017, + "grad_norm": 1.8617607355117798, + "learning_rate": 4.311563554555356e-05, + "loss": 3.5877, + "step": 40694 + }, + { + "epoch": 0.2420246931201827, + "grad_norm": 1.741678237915039, + "learning_rate": 4.311531364473066e-05, + "loss": 4.5812, + "step": 40695 + }, + { + "epoch": 0.2420306404034637, + "grad_norm": 1.6345674991607666, + "learning_rate": 4.31149917375839e-05, + "loss": 4.885, + "step": 40696 + }, + { + "epoch": 0.2420365876867447, + "grad_norm": 2.021026134490967, + "learning_rate": 4.311466982411337e-05, + "loss": 3.9399, + "step": 40697 + }, + { + "epoch": 0.2420425349700257, + "grad_norm": 2.0129010677337646, + "learning_rate": 4.3114347904319205e-05, + "loss": 4.5695, + "step": 40698 + }, + { + "epoch": 0.2420484822533067, + "grad_norm": 1.8386090993881226, + "learning_rate": 4.311402597820151e-05, + "loss": 4.209, + "step": 40699 + }, + { + "epoch": 0.24205442953658768, + "grad_norm": 1.555873990058899, + "learning_rate": 4.31137040457604e-05, + "loss": 4.6194, + "step": 40700 + }, + { + "epoch": 0.2420603768198687, + "grad_norm": 1.5462002754211426, + "learning_rate": 4.311338210699598e-05, + "loss": 4.9506, + "step": 40701 + }, + { + "epoch": 0.24206632410314968, + "grad_norm": 1.6088194847106934, + "learning_rate": 4.311306016190836e-05, + "loss": 4.9103, + "step": 40702 + }, + { + "epoch": 0.24207227138643067, + "grad_norm": 1.7072161436080933, + "learning_rate": 4.311273821049766e-05, + "loss": 4.3935, + "step": 40703 + }, + { + "epoch": 0.2420782186697117, + "grad_norm": 1.6076041460037231, + "learning_rate": 4.311241625276399e-05, + "loss": 4.3345, + "step": 40704 + }, + { + "epoch": 0.24208416595299267, + "grad_norm": 1.5828070640563965, + "learning_rate": 4.311209428870747e-05, + "loss": 4.5313, + "step": 40705 + }, + { + "epoch": 0.24209011323627366, + "grad_norm": 2.251319169998169, + "learning_rate": 4.3111772318328195e-05, + "loss": 4.6689, + "step": 40706 + }, + { + "epoch": 0.24209606051955468, + "grad_norm": 2.2358388900756836, + "learning_rate": 4.311145034162629e-05, + "loss": 4.0881, + "step": 40707 + }, + { + "epoch": 0.24210200780283567, + "grad_norm": 1.9755709171295166, + "learning_rate": 4.3111128358601865e-05, + "loss": 3.9149, + "step": 40708 + }, + { + "epoch": 0.24210795508611666, + "grad_norm": 1.5188883543014526, + "learning_rate": 4.311080636925504e-05, + "loss": 4.522, + "step": 40709 + }, + { + "epoch": 0.24211390236939767, + "grad_norm": 1.8588414192199707, + "learning_rate": 4.311048437358591e-05, + "loss": 3.1784, + "step": 40710 + }, + { + "epoch": 0.24211984965267866, + "grad_norm": 1.7585194110870361, + "learning_rate": 4.3110162371594596e-05, + "loss": 4.4094, + "step": 40711 + }, + { + "epoch": 0.24212579693595965, + "grad_norm": 1.9771790504455566, + "learning_rate": 4.310984036328122e-05, + "loss": 4.4354, + "step": 40712 + }, + { + "epoch": 0.24213174421924066, + "grad_norm": 1.675470232963562, + "learning_rate": 4.3109518348645877e-05, + "loss": 4.7392, + "step": 40713 + }, + { + "epoch": 0.24213769150252165, + "grad_norm": 1.7909470796585083, + "learning_rate": 4.3109196327688695e-05, + "loss": 4.561, + "step": 40714 + }, + { + "epoch": 0.24214363878580264, + "grad_norm": 1.6594129800796509, + "learning_rate": 4.3108874300409774e-05, + "loss": 4.4911, + "step": 40715 + }, + { + "epoch": 0.24214958606908366, + "grad_norm": 1.9344260692596436, + "learning_rate": 4.310855226680923e-05, + "loss": 4.4969, + "step": 40716 + }, + { + "epoch": 0.24215553335236464, + "grad_norm": 1.7312724590301514, + "learning_rate": 4.310823022688719e-05, + "loss": 4.5528, + "step": 40717 + }, + { + "epoch": 0.24216148063564563, + "grad_norm": 1.7015362977981567, + "learning_rate": 4.310790818064374e-05, + "loss": 4.5264, + "step": 40718 + }, + { + "epoch": 0.24216742791892665, + "grad_norm": 1.8012820482254028, + "learning_rate": 4.3107586128079014e-05, + "loss": 4.4733, + "step": 40719 + }, + { + "epoch": 0.24217337520220764, + "grad_norm": 1.592873454093933, + "learning_rate": 4.310726406919312e-05, + "loss": 4.4979, + "step": 40720 + }, + { + "epoch": 0.24217932248548862, + "grad_norm": 1.5152897834777832, + "learning_rate": 4.3106942003986165e-05, + "loss": 4.6018, + "step": 40721 + }, + { + "epoch": 0.24218526976876964, + "grad_norm": 1.631734013557434, + "learning_rate": 4.3106619932458256e-05, + "loss": 4.492, + "step": 40722 + }, + { + "epoch": 0.24219121705205063, + "grad_norm": 1.702994465827942, + "learning_rate": 4.310629785460952e-05, + "loss": 4.4727, + "step": 40723 + }, + { + "epoch": 0.24219716433533162, + "grad_norm": 1.507614016532898, + "learning_rate": 4.310597577044006e-05, + "loss": 4.4504, + "step": 40724 + }, + { + "epoch": 0.2422031116186126, + "grad_norm": 1.6884573698043823, + "learning_rate": 4.3105653679949994e-05, + "loss": 4.5977, + "step": 40725 + }, + { + "epoch": 0.24220905890189362, + "grad_norm": 1.5742093324661255, + "learning_rate": 4.310533158313943e-05, + "loss": 4.6582, + "step": 40726 + }, + { + "epoch": 0.2422150061851746, + "grad_norm": 1.4618597030639648, + "learning_rate": 4.3105009480008475e-05, + "loss": 4.7719, + "step": 40727 + }, + { + "epoch": 0.2422209534684556, + "grad_norm": 1.4904793500900269, + "learning_rate": 4.3104687370557255e-05, + "loss": 4.8699, + "step": 40728 + }, + { + "epoch": 0.2422269007517366, + "grad_norm": 1.3156102895736694, + "learning_rate": 4.3104365254785875e-05, + "loss": 4.7017, + "step": 40729 + }, + { + "epoch": 0.2422328480350176, + "grad_norm": 1.6596823930740356, + "learning_rate": 4.310404313269445e-05, + "loss": 4.4918, + "step": 40730 + }, + { + "epoch": 0.2422387953182986, + "grad_norm": 1.3937008380889893, + "learning_rate": 4.310372100428309e-05, + "loss": 4.7379, + "step": 40731 + }, + { + "epoch": 0.2422447426015796, + "grad_norm": 1.7472684383392334, + "learning_rate": 4.31033988695519e-05, + "loss": 4.3028, + "step": 40732 + }, + { + "epoch": 0.2422506898848606, + "grad_norm": 2.593435764312744, + "learning_rate": 4.310307672850101e-05, + "loss": 3.1148, + "step": 40733 + }, + { + "epoch": 0.24225663716814158, + "grad_norm": 1.9588080644607544, + "learning_rate": 4.3102754581130526e-05, + "loss": 4.4143, + "step": 40734 + }, + { + "epoch": 0.2422625844514226, + "grad_norm": 2.0162808895111084, + "learning_rate": 4.310243242744055e-05, + "loss": 4.1027, + "step": 40735 + }, + { + "epoch": 0.24226853173470358, + "grad_norm": 1.7647767066955566, + "learning_rate": 4.3102110267431194e-05, + "loss": 4.5093, + "step": 40736 + }, + { + "epoch": 0.24227447901798457, + "grad_norm": 1.8591192960739136, + "learning_rate": 4.310178810110259e-05, + "loss": 4.4871, + "step": 40737 + }, + { + "epoch": 0.2422804263012656, + "grad_norm": 1.6977115869522095, + "learning_rate": 4.310146592845484e-05, + "loss": 4.6907, + "step": 40738 + }, + { + "epoch": 0.24228637358454658, + "grad_norm": 1.7207955121994019, + "learning_rate": 4.310114374948805e-05, + "loss": 4.4077, + "step": 40739 + }, + { + "epoch": 0.24229232086782757, + "grad_norm": 1.894718050956726, + "learning_rate": 4.310082156420234e-05, + "loss": 4.1866, + "step": 40740 + }, + { + "epoch": 0.24229826815110858, + "grad_norm": 1.8595994710922241, + "learning_rate": 4.310049937259782e-05, + "loss": 4.637, + "step": 40741 + }, + { + "epoch": 0.24230421543438957, + "grad_norm": 1.78658127784729, + "learning_rate": 4.310017717467461e-05, + "loss": 4.8195, + "step": 40742 + }, + { + "epoch": 0.24231016271767056, + "grad_norm": 1.8761820793151855, + "learning_rate": 4.30998549704328e-05, + "loss": 3.9211, + "step": 40743 + }, + { + "epoch": 0.24231611000095157, + "grad_norm": 1.7521884441375732, + "learning_rate": 4.309953275987252e-05, + "loss": 4.425, + "step": 40744 + }, + { + "epoch": 0.24232205728423256, + "grad_norm": 1.562634825706482, + "learning_rate": 4.309921054299389e-05, + "loss": 4.7597, + "step": 40745 + }, + { + "epoch": 0.24232800456751355, + "grad_norm": 1.3659818172454834, + "learning_rate": 4.309888831979701e-05, + "loss": 5.0314, + "step": 40746 + }, + { + "epoch": 0.24233395185079457, + "grad_norm": 1.9272608757019043, + "learning_rate": 4.309856609028199e-05, + "loss": 4.9025, + "step": 40747 + }, + { + "epoch": 0.24233989913407555, + "grad_norm": 1.6547073125839233, + "learning_rate": 4.309824385444895e-05, + "loss": 4.8865, + "step": 40748 + }, + { + "epoch": 0.24234584641735654, + "grad_norm": 1.6992721557617188, + "learning_rate": 4.3097921612298e-05, + "loss": 4.6187, + "step": 40749 + }, + { + "epoch": 0.24235179370063756, + "grad_norm": 1.6967442035675049, + "learning_rate": 4.3097599363829256e-05, + "loss": 4.8685, + "step": 40750 + }, + { + "epoch": 0.24235774098391855, + "grad_norm": 1.4530926942825317, + "learning_rate": 4.309727710904282e-05, + "loss": 4.3482, + "step": 40751 + }, + { + "epoch": 0.24236368826719953, + "grad_norm": 1.6114997863769531, + "learning_rate": 4.309695484793882e-05, + "loss": 4.7868, + "step": 40752 + }, + { + "epoch": 0.24236963555048055, + "grad_norm": 1.5402837991714478, + "learning_rate": 4.309663258051735e-05, + "loss": 4.7623, + "step": 40753 + }, + { + "epoch": 0.24237558283376154, + "grad_norm": 1.8134127855300903, + "learning_rate": 4.3096310306778545e-05, + "loss": 4.6236, + "step": 40754 + }, + { + "epoch": 0.24238153011704253, + "grad_norm": 1.5305649042129517, + "learning_rate": 4.309598802672249e-05, + "loss": 4.6248, + "step": 40755 + }, + { + "epoch": 0.24238747740032354, + "grad_norm": 1.9484304189682007, + "learning_rate": 4.309566574034932e-05, + "loss": 4.393, + "step": 40756 + }, + { + "epoch": 0.24239342468360453, + "grad_norm": 1.9508004188537598, + "learning_rate": 4.309534344765914e-05, + "loss": 4.2379, + "step": 40757 + }, + { + "epoch": 0.24239937196688552, + "grad_norm": 2.072239875793457, + "learning_rate": 4.3095021148652064e-05, + "loss": 3.797, + "step": 40758 + }, + { + "epoch": 0.24240531925016653, + "grad_norm": 1.7128373384475708, + "learning_rate": 4.309469884332821e-05, + "loss": 4.4688, + "step": 40759 + }, + { + "epoch": 0.24241126653344752, + "grad_norm": 2.1416311264038086, + "learning_rate": 4.309437653168767e-05, + "loss": 3.8853, + "step": 40760 + }, + { + "epoch": 0.2424172138167285, + "grad_norm": 2.521324634552002, + "learning_rate": 4.3094054213730576e-05, + "loss": 3.9263, + "step": 40761 + }, + { + "epoch": 0.24242316110000953, + "grad_norm": 2.163778781890869, + "learning_rate": 4.3093731889457035e-05, + "loss": 3.4404, + "step": 40762 + }, + { + "epoch": 0.24242910838329051, + "grad_norm": 2.0350229740142822, + "learning_rate": 4.309340955886716e-05, + "loss": 3.4868, + "step": 40763 + }, + { + "epoch": 0.2424350556665715, + "grad_norm": 2.596527099609375, + "learning_rate": 4.309308722196106e-05, + "loss": 3.7857, + "step": 40764 + }, + { + "epoch": 0.24244100294985252, + "grad_norm": 1.5718177556991577, + "learning_rate": 4.309276487873885e-05, + "loss": 4.9721, + "step": 40765 + }, + { + "epoch": 0.2424469502331335, + "grad_norm": 1.5584903955459595, + "learning_rate": 4.309244252920064e-05, + "loss": 4.6593, + "step": 40766 + }, + { + "epoch": 0.2424528975164145, + "grad_norm": 2.2689459323883057, + "learning_rate": 4.309212017334655e-05, + "loss": 3.2329, + "step": 40767 + }, + { + "epoch": 0.2424588447996955, + "grad_norm": 1.8709869384765625, + "learning_rate": 4.309179781117669e-05, + "loss": 4.1958, + "step": 40768 + }, + { + "epoch": 0.2424647920829765, + "grad_norm": 2.162081718444824, + "learning_rate": 4.309147544269116e-05, + "loss": 3.5044, + "step": 40769 + }, + { + "epoch": 0.2424707393662575, + "grad_norm": 2.3666305541992188, + "learning_rate": 4.30911530678901e-05, + "loss": 3.1745, + "step": 40770 + }, + { + "epoch": 0.2424766866495385, + "grad_norm": 2.37850284576416, + "learning_rate": 4.3090830686773595e-05, + "loss": 3.7441, + "step": 40771 + }, + { + "epoch": 0.2424826339328195, + "grad_norm": 2.2422964572906494, + "learning_rate": 4.309050829934177e-05, + "loss": 3.7426, + "step": 40772 + }, + { + "epoch": 0.24248858121610048, + "grad_norm": 2.015289068222046, + "learning_rate": 4.3090185905594734e-05, + "loss": 3.9919, + "step": 40773 + }, + { + "epoch": 0.2424945284993815, + "grad_norm": 1.7223831415176392, + "learning_rate": 4.30898635055326e-05, + "loss": 4.7788, + "step": 40774 + }, + { + "epoch": 0.24250047578266248, + "grad_norm": 1.9557386636734009, + "learning_rate": 4.3089541099155486e-05, + "loss": 4.2601, + "step": 40775 + }, + { + "epoch": 0.24250642306594347, + "grad_norm": 1.9925966262817383, + "learning_rate": 4.3089218686463497e-05, + "loss": 3.8858, + "step": 40776 + }, + { + "epoch": 0.2425123703492245, + "grad_norm": 1.899418830871582, + "learning_rate": 4.308889626745675e-05, + "loss": 4.2539, + "step": 40777 + }, + { + "epoch": 0.24251831763250548, + "grad_norm": 1.804815649986267, + "learning_rate": 4.308857384213535e-05, + "loss": 4.5311, + "step": 40778 + }, + { + "epoch": 0.24252426491578646, + "grad_norm": 1.8750076293945312, + "learning_rate": 4.308825141049941e-05, + "loss": 4.1808, + "step": 40779 + }, + { + "epoch": 0.24253021219906748, + "grad_norm": 2.0163345336914062, + "learning_rate": 4.308792897254907e-05, + "loss": 3.7091, + "step": 40780 + }, + { + "epoch": 0.24253615948234847, + "grad_norm": 2.029501438140869, + "learning_rate": 4.308760652828441e-05, + "loss": 4.0403, + "step": 40781 + }, + { + "epoch": 0.24254210676562946, + "grad_norm": 1.7323468923568726, + "learning_rate": 4.308728407770555e-05, + "loss": 4.1641, + "step": 40782 + }, + { + "epoch": 0.24254805404891044, + "grad_norm": 2.0379345417022705, + "learning_rate": 4.308696162081261e-05, + "loss": 3.8785, + "step": 40783 + }, + { + "epoch": 0.24255400133219146, + "grad_norm": 2.1508617401123047, + "learning_rate": 4.3086639157605694e-05, + "loss": 3.9855, + "step": 40784 + }, + { + "epoch": 0.24255994861547245, + "grad_norm": 2.17386794090271, + "learning_rate": 4.308631668808493e-05, + "loss": 4.2589, + "step": 40785 + }, + { + "epoch": 0.24256589589875344, + "grad_norm": 2.3214833736419678, + "learning_rate": 4.30859942122504e-05, + "loss": 3.8631, + "step": 40786 + }, + { + "epoch": 0.24257184318203445, + "grad_norm": 2.1896212100982666, + "learning_rate": 4.308567173010226e-05, + "loss": 4.2428, + "step": 40787 + }, + { + "epoch": 0.24257779046531544, + "grad_norm": 2.0620832443237305, + "learning_rate": 4.308534924164058e-05, + "loss": 4.1131, + "step": 40788 + }, + { + "epoch": 0.24258373774859643, + "grad_norm": 1.9905093908309937, + "learning_rate": 4.30850267468655e-05, + "loss": 4.2066, + "step": 40789 + }, + { + "epoch": 0.24258968503187744, + "grad_norm": 1.6120655536651611, + "learning_rate": 4.308470424577712e-05, + "loss": 4.8749, + "step": 40790 + }, + { + "epoch": 0.24259563231515843, + "grad_norm": 1.4304571151733398, + "learning_rate": 4.308438173837556e-05, + "loss": 5.1001, + "step": 40791 + }, + { + "epoch": 0.24260157959843942, + "grad_norm": 1.6846857070922852, + "learning_rate": 4.308405922466093e-05, + "loss": 5.4681, + "step": 40792 + }, + { + "epoch": 0.24260752688172044, + "grad_norm": 1.762327790260315, + "learning_rate": 4.308373670463334e-05, + "loss": 4.7038, + "step": 40793 + }, + { + "epoch": 0.24261347416500142, + "grad_norm": 2.5008389949798584, + "learning_rate": 4.30834141782929e-05, + "loss": 3.8586, + "step": 40794 + }, + { + "epoch": 0.2426194214482824, + "grad_norm": 1.596795678138733, + "learning_rate": 4.308309164563974e-05, + "loss": 3.5037, + "step": 40795 + }, + { + "epoch": 0.24262536873156343, + "grad_norm": 1.756638765335083, + "learning_rate": 4.308276910667395e-05, + "loss": 4.386, + "step": 40796 + }, + { + "epoch": 0.24263131601484442, + "grad_norm": 2.369478225708008, + "learning_rate": 4.308244656139565e-05, + "loss": 3.3374, + "step": 40797 + }, + { + "epoch": 0.2426372632981254, + "grad_norm": 1.7718374729156494, + "learning_rate": 4.308212400980496e-05, + "loss": 4.7393, + "step": 40798 + }, + { + "epoch": 0.24264321058140642, + "grad_norm": 1.5686911344528198, + "learning_rate": 4.3081801451901985e-05, + "loss": 4.8103, + "step": 40799 + }, + { + "epoch": 0.2426491578646874, + "grad_norm": 1.5892243385314941, + "learning_rate": 4.308147888768684e-05, + "loss": 4.8997, + "step": 40800 + }, + { + "epoch": 0.2426551051479684, + "grad_norm": 1.4071136713027954, + "learning_rate": 4.308115631715964e-05, + "loss": 4.7905, + "step": 40801 + }, + { + "epoch": 0.2426610524312494, + "grad_norm": 1.5548112392425537, + "learning_rate": 4.308083374032049e-05, + "loss": 4.556, + "step": 40802 + }, + { + "epoch": 0.2426669997145304, + "grad_norm": 1.567017912864685, + "learning_rate": 4.308051115716951e-05, + "loss": 4.586, + "step": 40803 + }, + { + "epoch": 0.2426729469978114, + "grad_norm": 1.601426601409912, + "learning_rate": 4.308018856770681e-05, + "loss": 4.7066, + "step": 40804 + }, + { + "epoch": 0.2426788942810924, + "grad_norm": 1.3991650342941284, + "learning_rate": 4.307986597193251e-05, + "loss": 4.8651, + "step": 40805 + }, + { + "epoch": 0.2426848415643734, + "grad_norm": 1.6117945909500122, + "learning_rate": 4.307954336984671e-05, + "loss": 4.693, + "step": 40806 + }, + { + "epoch": 0.24269078884765438, + "grad_norm": 1.317292332649231, + "learning_rate": 4.3079220761449525e-05, + "loss": 4.7344, + "step": 40807 + }, + { + "epoch": 0.2426967361309354, + "grad_norm": 1.6683956384658813, + "learning_rate": 4.307889814674108e-05, + "loss": 4.7349, + "step": 40808 + }, + { + "epoch": 0.24270268341421639, + "grad_norm": 1.4652185440063477, + "learning_rate": 4.3078575525721464e-05, + "loss": 4.9087, + "step": 40809 + }, + { + "epoch": 0.24270863069749737, + "grad_norm": 1.6410892009735107, + "learning_rate": 4.307825289839082e-05, + "loss": 4.8851, + "step": 40810 + }, + { + "epoch": 0.2427145779807784, + "grad_norm": 1.4221254587173462, + "learning_rate": 4.3077930264749226e-05, + "loss": 4.7336, + "step": 40811 + }, + { + "epoch": 0.24272052526405938, + "grad_norm": 1.9018584489822388, + "learning_rate": 4.3077607624796826e-05, + "loss": 4.236, + "step": 40812 + }, + { + "epoch": 0.24272647254734037, + "grad_norm": 1.5978950262069702, + "learning_rate": 4.307728497853372e-05, + "loss": 4.8736, + "step": 40813 + }, + { + "epoch": 0.24273241983062138, + "grad_norm": 1.6080654859542847, + "learning_rate": 4.3076962325960013e-05, + "loss": 4.5117, + "step": 40814 + }, + { + "epoch": 0.24273836711390237, + "grad_norm": 1.6557093858718872, + "learning_rate": 4.307663966707584e-05, + "loss": 4.7839, + "step": 40815 + }, + { + "epoch": 0.24274431439718336, + "grad_norm": 1.2807927131652832, + "learning_rate": 4.307631700188129e-05, + "loss": 4.9058, + "step": 40816 + }, + { + "epoch": 0.24275026168046437, + "grad_norm": 1.4476991891860962, + "learning_rate": 4.3075994330376476e-05, + "loss": 4.9427, + "step": 40817 + }, + { + "epoch": 0.24275620896374536, + "grad_norm": 1.4097175598144531, + "learning_rate": 4.307567165256153e-05, + "loss": 4.7172, + "step": 40818 + }, + { + "epoch": 0.24276215624702635, + "grad_norm": 1.538185477256775, + "learning_rate": 4.307534896843655e-05, + "loss": 4.8644, + "step": 40819 + }, + { + "epoch": 0.24276810353030737, + "grad_norm": 1.3202414512634277, + "learning_rate": 4.307502627800165e-05, + "loss": 4.5133, + "step": 40820 + }, + { + "epoch": 0.24277405081358835, + "grad_norm": 1.3869314193725586, + "learning_rate": 4.307470358125695e-05, + "loss": 4.9297, + "step": 40821 + }, + { + "epoch": 0.24277999809686934, + "grad_norm": 1.7689937353134155, + "learning_rate": 4.307438087820256e-05, + "loss": 4.262, + "step": 40822 + }, + { + "epoch": 0.24278594538015036, + "grad_norm": 1.5636351108551025, + "learning_rate": 4.307405816883858e-05, + "loss": 4.441, + "step": 40823 + }, + { + "epoch": 0.24279189266343135, + "grad_norm": 1.4583852291107178, + "learning_rate": 4.3073735453165144e-05, + "loss": 4.2958, + "step": 40824 + }, + { + "epoch": 0.24279783994671233, + "grad_norm": 1.514542818069458, + "learning_rate": 4.307341273118235e-05, + "loss": 4.346, + "step": 40825 + }, + { + "epoch": 0.24280378722999335, + "grad_norm": 1.7213584184646606, + "learning_rate": 4.307309000289031e-05, + "loss": 4.2734, + "step": 40826 + }, + { + "epoch": 0.24280973451327434, + "grad_norm": 1.3767199516296387, + "learning_rate": 4.307276726828915e-05, + "loss": 4.0775, + "step": 40827 + }, + { + "epoch": 0.24281568179655533, + "grad_norm": 1.5295010805130005, + "learning_rate": 4.307244452737896e-05, + "loss": 4.4634, + "step": 40828 + }, + { + "epoch": 0.24282162907983634, + "grad_norm": 1.9133466482162476, + "learning_rate": 4.307212178015988e-05, + "loss": 4.1071, + "step": 40829 + }, + { + "epoch": 0.24282757636311733, + "grad_norm": 1.738234519958496, + "learning_rate": 4.3071799026632e-05, + "loss": 4.2347, + "step": 40830 + }, + { + "epoch": 0.24283352364639832, + "grad_norm": 1.7086889743804932, + "learning_rate": 4.307147626679545e-05, + "loss": 4.5607, + "step": 40831 + }, + { + "epoch": 0.24283947092967934, + "grad_norm": 1.5271501541137695, + "learning_rate": 4.307115350065033e-05, + "loss": 4.5501, + "step": 40832 + }, + { + "epoch": 0.24284541821296032, + "grad_norm": 1.739466905593872, + "learning_rate": 4.3070830728196755e-05, + "loss": 4.1087, + "step": 40833 + }, + { + "epoch": 0.2428513654962413, + "grad_norm": 1.8605539798736572, + "learning_rate": 4.307050794943484e-05, + "loss": 4.2108, + "step": 40834 + }, + { + "epoch": 0.24285731277952233, + "grad_norm": 2.0834100246429443, + "learning_rate": 4.3070185164364696e-05, + "loss": 4.3993, + "step": 40835 + }, + { + "epoch": 0.24286326006280332, + "grad_norm": 1.6742836236953735, + "learning_rate": 4.306986237298644e-05, + "loss": 4.543, + "step": 40836 + }, + { + "epoch": 0.2428692073460843, + "grad_norm": 1.8829210996627808, + "learning_rate": 4.306953957530018e-05, + "loss": 4.136, + "step": 40837 + }, + { + "epoch": 0.24287515462936532, + "grad_norm": 1.3959993124008179, + "learning_rate": 4.3069216771306035e-05, + "loss": 4.6114, + "step": 40838 + }, + { + "epoch": 0.2428811019126463, + "grad_norm": 1.4523507356643677, + "learning_rate": 4.306889396100411e-05, + "loss": 4.3831, + "step": 40839 + }, + { + "epoch": 0.2428870491959273, + "grad_norm": 1.5585583448410034, + "learning_rate": 4.306857114439452e-05, + "loss": 4.2833, + "step": 40840 + }, + { + "epoch": 0.24289299647920828, + "grad_norm": 1.4281262159347534, + "learning_rate": 4.306824832147738e-05, + "loss": 4.4033, + "step": 40841 + }, + { + "epoch": 0.2428989437624893, + "grad_norm": 1.4685243368148804, + "learning_rate": 4.30679254922528e-05, + "loss": 4.3966, + "step": 40842 + }, + { + "epoch": 0.2429048910457703, + "grad_norm": 1.3135545253753662, + "learning_rate": 4.306760265672089e-05, + "loss": 4.4072, + "step": 40843 + }, + { + "epoch": 0.24291083832905128, + "grad_norm": 1.3892806768417358, + "learning_rate": 4.3067279814881775e-05, + "loss": 4.3949, + "step": 40844 + }, + { + "epoch": 0.2429167856123323, + "grad_norm": 1.5712388753890991, + "learning_rate": 4.306695696673556e-05, + "loss": 4.2461, + "step": 40845 + }, + { + "epoch": 0.24292273289561328, + "grad_norm": 1.5378739833831787, + "learning_rate": 4.306663411228235e-05, + "loss": 4.2986, + "step": 40846 + }, + { + "epoch": 0.24292868017889427, + "grad_norm": 1.6112825870513916, + "learning_rate": 4.306631125152226e-05, + "loss": 4.5469, + "step": 40847 + }, + { + "epoch": 0.24293462746217528, + "grad_norm": 1.5549256801605225, + "learning_rate": 4.3065988384455425e-05, + "loss": 4.2573, + "step": 40848 + }, + { + "epoch": 0.24294057474545627, + "grad_norm": 1.3730874061584473, + "learning_rate": 4.306566551108192e-05, + "loss": 4.3374, + "step": 40849 + }, + { + "epoch": 0.24294652202873726, + "grad_norm": 1.5148937702178955, + "learning_rate": 4.30653426314019e-05, + "loss": 4.3192, + "step": 40850 + }, + { + "epoch": 0.24295246931201828, + "grad_norm": 1.3946799039840698, + "learning_rate": 4.3065019745415435e-05, + "loss": 4.2775, + "step": 40851 + }, + { + "epoch": 0.24295841659529926, + "grad_norm": 1.4700652360916138, + "learning_rate": 4.3064696853122664e-05, + "loss": 4.2381, + "step": 40852 + }, + { + "epoch": 0.24296436387858025, + "grad_norm": 1.466031551361084, + "learning_rate": 4.30643739545237e-05, + "loss": 4.3163, + "step": 40853 + }, + { + "epoch": 0.24297031116186127, + "grad_norm": 1.526807188987732, + "learning_rate": 4.306405104961864e-05, + "loss": 4.3044, + "step": 40854 + }, + { + "epoch": 0.24297625844514226, + "grad_norm": 1.6413366794586182, + "learning_rate": 4.306372813840761e-05, + "loss": 4.4896, + "step": 40855 + }, + { + "epoch": 0.24298220572842325, + "grad_norm": 1.5746127367019653, + "learning_rate": 4.306340522089072e-05, + "loss": 4.6458, + "step": 40856 + }, + { + "epoch": 0.24298815301170426, + "grad_norm": 1.5127861499786377, + "learning_rate": 4.306308229706809e-05, + "loss": 4.3737, + "step": 40857 + }, + { + "epoch": 0.24299410029498525, + "grad_norm": 1.7377697229385376, + "learning_rate": 4.306275936693982e-05, + "loss": 4.4064, + "step": 40858 + }, + { + "epoch": 0.24300004757826624, + "grad_norm": 1.739450454711914, + "learning_rate": 4.306243643050602e-05, + "loss": 4.5044, + "step": 40859 + }, + { + "epoch": 0.24300599486154725, + "grad_norm": 1.7389893531799316, + "learning_rate": 4.306211348776682e-05, + "loss": 4.3369, + "step": 40860 + }, + { + "epoch": 0.24301194214482824, + "grad_norm": 1.4828088283538818, + "learning_rate": 4.3061790538722313e-05, + "loss": 4.4732, + "step": 40861 + }, + { + "epoch": 0.24301788942810923, + "grad_norm": 1.746364951133728, + "learning_rate": 4.306146758337263e-05, + "loss": 3.9872, + "step": 40862 + }, + { + "epoch": 0.24302383671139025, + "grad_norm": 1.4443856477737427, + "learning_rate": 4.306114462171786e-05, + "loss": 4.3709, + "step": 40863 + }, + { + "epoch": 0.24302978399467123, + "grad_norm": 1.6344696283340454, + "learning_rate": 4.306082165375815e-05, + "loss": 4.2683, + "step": 40864 + }, + { + "epoch": 0.24303573127795222, + "grad_norm": 1.5531675815582275, + "learning_rate": 4.306049867949359e-05, + "loss": 4.3741, + "step": 40865 + }, + { + "epoch": 0.24304167856123324, + "grad_norm": 1.3848797082901, + "learning_rate": 4.3060175698924286e-05, + "loss": 4.5003, + "step": 40866 + }, + { + "epoch": 0.24304762584451423, + "grad_norm": 1.5593068599700928, + "learning_rate": 4.3059852712050366e-05, + "loss": 4.3469, + "step": 40867 + }, + { + "epoch": 0.2430535731277952, + "grad_norm": 1.4839234352111816, + "learning_rate": 4.3059529718871945e-05, + "loss": 4.6884, + "step": 40868 + }, + { + "epoch": 0.24305952041107623, + "grad_norm": 1.812975287437439, + "learning_rate": 4.305920671938912e-05, + "loss": 4.8147, + "step": 40869 + }, + { + "epoch": 0.24306546769435722, + "grad_norm": 1.7608894109725952, + "learning_rate": 4.305888371360202e-05, + "loss": 4.2904, + "step": 40870 + }, + { + "epoch": 0.2430714149776382, + "grad_norm": 1.8927644491195679, + "learning_rate": 4.305856070151074e-05, + "loss": 4.2368, + "step": 40871 + }, + { + "epoch": 0.24307736226091922, + "grad_norm": 1.4928992986679077, + "learning_rate": 4.305823768311542e-05, + "loss": 4.8483, + "step": 40872 + }, + { + "epoch": 0.2430833095442002, + "grad_norm": 1.7606216669082642, + "learning_rate": 4.305791465841614e-05, + "loss": 4.2566, + "step": 40873 + }, + { + "epoch": 0.2430892568274812, + "grad_norm": 1.942518711090088, + "learning_rate": 4.305759162741303e-05, + "loss": 3.7995, + "step": 40874 + }, + { + "epoch": 0.24309520411076221, + "grad_norm": 1.5128263235092163, + "learning_rate": 4.305726859010621e-05, + "loss": 5.0252, + "step": 40875 + }, + { + "epoch": 0.2431011513940432, + "grad_norm": 2.000913381576538, + "learning_rate": 4.305694554649577e-05, + "loss": 3.8943, + "step": 40876 + }, + { + "epoch": 0.2431070986773242, + "grad_norm": 2.054539680480957, + "learning_rate": 4.305662249658186e-05, + "loss": 3.3014, + "step": 40877 + }, + { + "epoch": 0.2431130459606052, + "grad_norm": 1.960503101348877, + "learning_rate": 4.305629944036455e-05, + "loss": 4.253, + "step": 40878 + }, + { + "epoch": 0.2431189932438862, + "grad_norm": 1.772520899772644, + "learning_rate": 4.305597637784398e-05, + "loss": 5.2153, + "step": 40879 + }, + { + "epoch": 0.24312494052716718, + "grad_norm": 2.1986074447631836, + "learning_rate": 4.305565330902025e-05, + "loss": 3.8033, + "step": 40880 + }, + { + "epoch": 0.2431308878104482, + "grad_norm": 3.0522027015686035, + "learning_rate": 4.305533023389349e-05, + "loss": 1.9647, + "step": 40881 + }, + { + "epoch": 0.2431368350937292, + "grad_norm": 2.5987842082977295, + "learning_rate": 4.305500715246378e-05, + "loss": 2.347, + "step": 40882 + }, + { + "epoch": 0.24314278237701017, + "grad_norm": 2.5366225242614746, + "learning_rate": 4.305468406473127e-05, + "loss": 2.13, + "step": 40883 + }, + { + "epoch": 0.2431487296602912, + "grad_norm": 2.426287889480591, + "learning_rate": 4.305436097069605e-05, + "loss": 2.0357, + "step": 40884 + }, + { + "epoch": 0.24315467694357218, + "grad_norm": 1.8614312410354614, + "learning_rate": 4.305403787035824e-05, + "loss": 4.3234, + "step": 40885 + }, + { + "epoch": 0.24316062422685317, + "grad_norm": 2.250182628631592, + "learning_rate": 4.3053714763717956e-05, + "loss": 2.2716, + "step": 40886 + }, + { + "epoch": 0.24316657151013418, + "grad_norm": 2.617781400680542, + "learning_rate": 4.3053391650775304e-05, + "loss": 1.779, + "step": 40887 + }, + { + "epoch": 0.24317251879341517, + "grad_norm": 2.729217052459717, + "learning_rate": 4.30530685315304e-05, + "loss": 2.001, + "step": 40888 + }, + { + "epoch": 0.24317846607669616, + "grad_norm": 2.7897894382476807, + "learning_rate": 4.305274540598335e-05, + "loss": 1.6962, + "step": 40889 + }, + { + "epoch": 0.24318441335997718, + "grad_norm": 2.9909181594848633, + "learning_rate": 4.305242227413429e-05, + "loss": 2.0023, + "step": 40890 + }, + { + "epoch": 0.24319036064325816, + "grad_norm": 2.3317973613739014, + "learning_rate": 4.30520991359833e-05, + "loss": 3.5866, + "step": 40891 + }, + { + "epoch": 0.24319630792653915, + "grad_norm": 2.1457417011260986, + "learning_rate": 4.3051775991530505e-05, + "loss": 4.4999, + "step": 40892 + }, + { + "epoch": 0.24320225520982017, + "grad_norm": 2.0349507331848145, + "learning_rate": 4.305145284077604e-05, + "loss": 4.6699, + "step": 40893 + }, + { + "epoch": 0.24320820249310116, + "grad_norm": 1.5677685737609863, + "learning_rate": 4.305112968371998e-05, + "loss": 5.3208, + "step": 40894 + }, + { + "epoch": 0.24321414977638214, + "grad_norm": 2.100574016571045, + "learning_rate": 4.3050806520362475e-05, + "loss": 4.5239, + "step": 40895 + }, + { + "epoch": 0.24322009705966316, + "grad_norm": 1.463118076324463, + "learning_rate": 4.305048335070361e-05, + "loss": 5.3208, + "step": 40896 + }, + { + "epoch": 0.24322604434294415, + "grad_norm": 2.0108590126037598, + "learning_rate": 4.30501601747435e-05, + "loss": 4.2856, + "step": 40897 + }, + { + "epoch": 0.24323199162622514, + "grad_norm": 2.1134378910064697, + "learning_rate": 4.304983699248228e-05, + "loss": 4.6191, + "step": 40898 + }, + { + "epoch": 0.24323793890950612, + "grad_norm": 1.4506118297576904, + "learning_rate": 4.3049513803920045e-05, + "loss": 5.36, + "step": 40899 + }, + { + "epoch": 0.24324388619278714, + "grad_norm": 1.3808585405349731, + "learning_rate": 4.3049190609056913e-05, + "loss": 5.1801, + "step": 40900 + }, + { + "epoch": 0.24324983347606813, + "grad_norm": 1.3783729076385498, + "learning_rate": 4.304886740789298e-05, + "loss": 5.3289, + "step": 40901 + }, + { + "epoch": 0.24325578075934912, + "grad_norm": 2.7179758548736572, + "learning_rate": 4.3048544200428385e-05, + "loss": 4.3577, + "step": 40902 + }, + { + "epoch": 0.24326172804263013, + "grad_norm": 1.3101098537445068, + "learning_rate": 4.304822098666323e-05, + "loss": 5.3251, + "step": 40903 + }, + { + "epoch": 0.24326767532591112, + "grad_norm": 1.3166999816894531, + "learning_rate": 4.304789776659763e-05, + "loss": 5.2887, + "step": 40904 + }, + { + "epoch": 0.2432736226091921, + "grad_norm": 1.4472644329071045, + "learning_rate": 4.304757454023169e-05, + "loss": 5.0836, + "step": 40905 + }, + { + "epoch": 0.24327956989247312, + "grad_norm": 1.396707534790039, + "learning_rate": 4.3047251307565533e-05, + "loss": 5.2682, + "step": 40906 + }, + { + "epoch": 0.2432855171757541, + "grad_norm": 1.5540151596069336, + "learning_rate": 4.3046928068599267e-05, + "loss": 5.5187, + "step": 40907 + }, + { + "epoch": 0.2432914644590351, + "grad_norm": 1.4554765224456787, + "learning_rate": 4.3046604823333006e-05, + "loss": 5.3479, + "step": 40908 + }, + { + "epoch": 0.24329741174231612, + "grad_norm": 1.5977081060409546, + "learning_rate": 4.304628157176685e-05, + "loss": 4.6994, + "step": 40909 + }, + { + "epoch": 0.2433033590255971, + "grad_norm": 2.1193137168884277, + "learning_rate": 4.3045958313900936e-05, + "loss": 4.5746, + "step": 40910 + }, + { + "epoch": 0.2433093063088781, + "grad_norm": 1.2054013013839722, + "learning_rate": 4.304563504973536e-05, + "loss": 5.3998, + "step": 40911 + }, + { + "epoch": 0.2433152535921591, + "grad_norm": 1.1461800336837769, + "learning_rate": 4.304531177927024e-05, + "loss": 5.4345, + "step": 40912 + }, + { + "epoch": 0.2433212008754401, + "grad_norm": 1.4586174488067627, + "learning_rate": 4.304498850250568e-05, + "loss": 5.3653, + "step": 40913 + }, + { + "epoch": 0.24332714815872108, + "grad_norm": 1.3150430917739868, + "learning_rate": 4.304466521944182e-05, + "loss": 5.3609, + "step": 40914 + }, + { + "epoch": 0.2433330954420021, + "grad_norm": 1.5476224422454834, + "learning_rate": 4.304434193007874e-05, + "loss": 5.3171, + "step": 40915 + }, + { + "epoch": 0.2433390427252831, + "grad_norm": 1.1344034671783447, + "learning_rate": 4.3044018634416564e-05, + "loss": 5.4425, + "step": 40916 + }, + { + "epoch": 0.24334499000856408, + "grad_norm": 1.305803656578064, + "learning_rate": 4.304369533245541e-05, + "loss": 5.3628, + "step": 40917 + }, + { + "epoch": 0.2433509372918451, + "grad_norm": 1.464320421218872, + "learning_rate": 4.304337202419539e-05, + "loss": 5.3284, + "step": 40918 + }, + { + "epoch": 0.24335688457512608, + "grad_norm": 2.0982282161712646, + "learning_rate": 4.304304870963662e-05, + "loss": 4.578, + "step": 40919 + }, + { + "epoch": 0.24336283185840707, + "grad_norm": 1.8409801721572876, + "learning_rate": 4.30427253887792e-05, + "loss": 4.4535, + "step": 40920 + }, + { + "epoch": 0.24336877914168809, + "grad_norm": 1.6202764511108398, + "learning_rate": 4.304240206162326e-05, + "loss": 4.8971, + "step": 40921 + }, + { + "epoch": 0.24337472642496907, + "grad_norm": 1.5874160528182983, + "learning_rate": 4.3042078728168894e-05, + "loss": 4.7857, + "step": 40922 + }, + { + "epoch": 0.24338067370825006, + "grad_norm": 1.6703484058380127, + "learning_rate": 4.304175538841623e-05, + "loss": 5.0078, + "step": 40923 + }, + { + "epoch": 0.24338662099153108, + "grad_norm": 1.3898638486862183, + "learning_rate": 4.3041432042365375e-05, + "loss": 5.0381, + "step": 40924 + }, + { + "epoch": 0.24339256827481207, + "grad_norm": 2.649547576904297, + "learning_rate": 4.304110869001644e-05, + "loss": 4.4229, + "step": 40925 + }, + { + "epoch": 0.24339851555809305, + "grad_norm": 1.6595269441604614, + "learning_rate": 4.304078533136955e-05, + "loss": 4.7262, + "step": 40926 + }, + { + "epoch": 0.24340446284137407, + "grad_norm": 1.7074862718582153, + "learning_rate": 4.30404619664248e-05, + "loss": 4.6733, + "step": 40927 + }, + { + "epoch": 0.24341041012465506, + "grad_norm": 1.5717021226882935, + "learning_rate": 4.304013859518231e-05, + "loss": 4.8695, + "step": 40928 + }, + { + "epoch": 0.24341635740793605, + "grad_norm": 1.719994068145752, + "learning_rate": 4.30398152176422e-05, + "loss": 4.6263, + "step": 40929 + }, + { + "epoch": 0.24342230469121706, + "grad_norm": 2.0325846672058105, + "learning_rate": 4.3039491833804565e-05, + "loss": 4.8415, + "step": 40930 + }, + { + "epoch": 0.24342825197449805, + "grad_norm": 1.7203433513641357, + "learning_rate": 4.303916844366954e-05, + "loss": 5.0702, + "step": 40931 + }, + { + "epoch": 0.24343419925777904, + "grad_norm": 1.5848060846328735, + "learning_rate": 4.3038845047237234e-05, + "loss": 5.0175, + "step": 40932 + }, + { + "epoch": 0.24344014654106005, + "grad_norm": 1.815674066543579, + "learning_rate": 4.303852164450774e-05, + "loss": 4.9468, + "step": 40933 + }, + { + "epoch": 0.24344609382434104, + "grad_norm": 1.570547103881836, + "learning_rate": 4.303819823548119e-05, + "loss": 5.1829, + "step": 40934 + }, + { + "epoch": 0.24345204110762203, + "grad_norm": 1.5470898151397705, + "learning_rate": 4.303787482015769e-05, + "loss": 5.1505, + "step": 40935 + }, + { + "epoch": 0.24345798839090305, + "grad_norm": 1.8365795612335205, + "learning_rate": 4.303755139853736e-05, + "loss": 4.841, + "step": 40936 + }, + { + "epoch": 0.24346393567418403, + "grad_norm": 1.8081260919570923, + "learning_rate": 4.3037227970620295e-05, + "loss": 4.3994, + "step": 40937 + }, + { + "epoch": 0.24346988295746502, + "grad_norm": 1.5101466178894043, + "learning_rate": 4.303690453640663e-05, + "loss": 4.6279, + "step": 40938 + }, + { + "epoch": 0.24347583024074604, + "grad_norm": 2.1658167839050293, + "learning_rate": 4.3036581095896464e-05, + "loss": 4.0401, + "step": 40939 + }, + { + "epoch": 0.24348177752402703, + "grad_norm": 3.9648211002349854, + "learning_rate": 4.3036257649089915e-05, + "loss": 3.4328, + "step": 40940 + }, + { + "epoch": 0.24348772480730801, + "grad_norm": 2.311387062072754, + "learning_rate": 4.303593419598709e-05, + "loss": 4.6111, + "step": 40941 + }, + { + "epoch": 0.24349367209058903, + "grad_norm": 2.0188567638397217, + "learning_rate": 4.303561073658812e-05, + "loss": 4.9447, + "step": 40942 + }, + { + "epoch": 0.24349961937387002, + "grad_norm": 1.6500351428985596, + "learning_rate": 4.303528727089309e-05, + "loss": 4.9736, + "step": 40943 + }, + { + "epoch": 0.243505566657151, + "grad_norm": 1.8556336164474487, + "learning_rate": 4.303496379890214e-05, + "loss": 4.9585, + "step": 40944 + }, + { + "epoch": 0.24351151394043202, + "grad_norm": 1.5998358726501465, + "learning_rate": 4.303464032061536e-05, + "loss": 5.3284, + "step": 40945 + }, + { + "epoch": 0.243517461223713, + "grad_norm": 1.5853228569030762, + "learning_rate": 4.303431683603288e-05, + "loss": 5.3262, + "step": 40946 + }, + { + "epoch": 0.243523408506994, + "grad_norm": 1.3080404996871948, + "learning_rate": 4.30339933451548e-05, + "loss": 5.3692, + "step": 40947 + }, + { + "epoch": 0.24352935579027502, + "grad_norm": 1.581566333770752, + "learning_rate": 4.303366984798125e-05, + "loss": 4.7888, + "step": 40948 + }, + { + "epoch": 0.243535303073556, + "grad_norm": 1.4258782863616943, + "learning_rate": 4.303334634451233e-05, + "loss": 4.95, + "step": 40949 + }, + { + "epoch": 0.243541250356837, + "grad_norm": 1.8425894975662231, + "learning_rate": 4.3033022834748143e-05, + "loss": 4.2642, + "step": 40950 + }, + { + "epoch": 0.243547197640118, + "grad_norm": 1.984083652496338, + "learning_rate": 4.3032699318688824e-05, + "loss": 4.4491, + "step": 40951 + }, + { + "epoch": 0.243553144923399, + "grad_norm": 1.8594402074813843, + "learning_rate": 4.303237579633447e-05, + "loss": 4.3912, + "step": 40952 + }, + { + "epoch": 0.24355909220667998, + "grad_norm": 1.9003740549087524, + "learning_rate": 4.3032052267685206e-05, + "loss": 4.6963, + "step": 40953 + }, + { + "epoch": 0.243565039489961, + "grad_norm": 1.7233352661132812, + "learning_rate": 4.303172873274114e-05, + "loss": 4.5202, + "step": 40954 + }, + { + "epoch": 0.243570986773242, + "grad_norm": 1.5012717247009277, + "learning_rate": 4.303140519150237e-05, + "loss": 4.443, + "step": 40955 + }, + { + "epoch": 0.24357693405652298, + "grad_norm": 1.4498696327209473, + "learning_rate": 4.303108164396904e-05, + "loss": 5.344, + "step": 40956 + }, + { + "epoch": 0.24358288133980396, + "grad_norm": 1.7039128541946411, + "learning_rate": 4.303075809014124e-05, + "loss": 4.6563, + "step": 40957 + }, + { + "epoch": 0.24358882862308498, + "grad_norm": 1.9477099180221558, + "learning_rate": 4.303043453001909e-05, + "loss": 4.3009, + "step": 40958 + }, + { + "epoch": 0.24359477590636597, + "grad_norm": 1.7088162899017334, + "learning_rate": 4.30301109636027e-05, + "loss": 4.2408, + "step": 40959 + }, + { + "epoch": 0.24360072318964696, + "grad_norm": 1.8010348081588745, + "learning_rate": 4.302978739089218e-05, + "loss": 4.4174, + "step": 40960 + }, + { + "epoch": 0.24360667047292797, + "grad_norm": 1.6640205383300781, + "learning_rate": 4.302946381188765e-05, + "loss": 4.2926, + "step": 40961 + }, + { + "epoch": 0.24361261775620896, + "grad_norm": 1.5833102464675903, + "learning_rate": 4.3029140226589215e-05, + "loss": 4.7418, + "step": 40962 + }, + { + "epoch": 0.24361856503948995, + "grad_norm": 1.4376726150512695, + "learning_rate": 4.3028816634997006e-05, + "loss": 4.9325, + "step": 40963 + }, + { + "epoch": 0.24362451232277096, + "grad_norm": 1.400916576385498, + "learning_rate": 4.302849303711112e-05, + "loss": 5.0494, + "step": 40964 + }, + { + "epoch": 0.24363045960605195, + "grad_norm": 1.5311365127563477, + "learning_rate": 4.302816943293166e-05, + "loss": 5.3073, + "step": 40965 + }, + { + "epoch": 0.24363640688933294, + "grad_norm": 1.15052330493927, + "learning_rate": 4.3027845822458764e-05, + "loss": 5.3031, + "step": 40966 + }, + { + "epoch": 0.24364235417261396, + "grad_norm": 1.6844408512115479, + "learning_rate": 4.302752220569254e-05, + "loss": 4.0691, + "step": 40967 + }, + { + "epoch": 0.24364830145589494, + "grad_norm": 2.608067274093628, + "learning_rate": 4.3027198582633085e-05, + "loss": 4.113, + "step": 40968 + }, + { + "epoch": 0.24365424873917593, + "grad_norm": 2.823824167251587, + "learning_rate": 4.302687495328052e-05, + "loss": 3.8541, + "step": 40969 + }, + { + "epoch": 0.24366019602245695, + "grad_norm": 2.93241810798645, + "learning_rate": 4.302655131763497e-05, + "loss": 3.4994, + "step": 40970 + }, + { + "epoch": 0.24366614330573794, + "grad_norm": 1.753179907798767, + "learning_rate": 4.302622767569652e-05, + "loss": 4.3206, + "step": 40971 + }, + { + "epoch": 0.24367209058901892, + "grad_norm": 1.4714876413345337, + "learning_rate": 4.3025904027465316e-05, + "loss": 4.8053, + "step": 40972 + }, + { + "epoch": 0.24367803787229994, + "grad_norm": 2.551591396331787, + "learning_rate": 4.3025580372941446e-05, + "loss": 4.1185, + "step": 40973 + }, + { + "epoch": 0.24368398515558093, + "grad_norm": 3.168712854385376, + "learning_rate": 4.302525671212503e-05, + "loss": 3.7754, + "step": 40974 + }, + { + "epoch": 0.24368993243886192, + "grad_norm": 2.867417097091675, + "learning_rate": 4.302493304501619e-05, + "loss": 3.6598, + "step": 40975 + }, + { + "epoch": 0.24369587972214293, + "grad_norm": 2.5059595108032227, + "learning_rate": 4.3024609371615024e-05, + "loss": 3.7137, + "step": 40976 + }, + { + "epoch": 0.24370182700542392, + "grad_norm": 2.6387035846710205, + "learning_rate": 4.3024285691921665e-05, + "loss": 3.9535, + "step": 40977 + }, + { + "epoch": 0.2437077742887049, + "grad_norm": 2.29276442527771, + "learning_rate": 4.30239620059362e-05, + "loss": 3.64, + "step": 40978 + }, + { + "epoch": 0.24371372157198593, + "grad_norm": 2.1491353511810303, + "learning_rate": 4.302363831365876e-05, + "loss": 4.0323, + "step": 40979 + }, + { + "epoch": 0.2437196688552669, + "grad_norm": 2.386383056640625, + "learning_rate": 4.3023314615089465e-05, + "loss": 3.5426, + "step": 40980 + }, + { + "epoch": 0.2437256161385479, + "grad_norm": 2.6178345680236816, + "learning_rate": 4.302299091022841e-05, + "loss": 3.6789, + "step": 40981 + }, + { + "epoch": 0.24373156342182892, + "grad_norm": 1.862566351890564, + "learning_rate": 4.3022667199075714e-05, + "loss": 4.334, + "step": 40982 + }, + { + "epoch": 0.2437375107051099, + "grad_norm": 2.606733798980713, + "learning_rate": 4.3022343481631486e-05, + "loss": 3.6383, + "step": 40983 + }, + { + "epoch": 0.2437434579883909, + "grad_norm": 2.6810405254364014, + "learning_rate": 4.3022019757895856e-05, + "loss": 3.8941, + "step": 40984 + }, + { + "epoch": 0.2437494052716719, + "grad_norm": 2.0065436363220215, + "learning_rate": 4.302169602786892e-05, + "loss": 4.3215, + "step": 40985 + }, + { + "epoch": 0.2437553525549529, + "grad_norm": 2.058779239654541, + "learning_rate": 4.30213722915508e-05, + "loss": 3.9057, + "step": 40986 + }, + { + "epoch": 0.24376129983823389, + "grad_norm": 2.527791976928711, + "learning_rate": 4.30210485489416e-05, + "loss": 3.6897, + "step": 40987 + }, + { + "epoch": 0.2437672471215149, + "grad_norm": 2.6768147945404053, + "learning_rate": 4.3020724800041435e-05, + "loss": 3.5104, + "step": 40988 + }, + { + "epoch": 0.2437731944047959, + "grad_norm": 2.2928082942962646, + "learning_rate": 4.3020401044850423e-05, + "loss": 3.8977, + "step": 40989 + }, + { + "epoch": 0.24377914168807688, + "grad_norm": 2.3013970851898193, + "learning_rate": 4.3020077283368674e-05, + "loss": 3.5069, + "step": 40990 + }, + { + "epoch": 0.2437850889713579, + "grad_norm": 2.6524245738983154, + "learning_rate": 4.301975351559631e-05, + "loss": 3.428, + "step": 40991 + }, + { + "epoch": 0.24379103625463888, + "grad_norm": 2.988168716430664, + "learning_rate": 4.301942974153344e-05, + "loss": 3.2827, + "step": 40992 + }, + { + "epoch": 0.24379698353791987, + "grad_norm": 2.6804091930389404, + "learning_rate": 4.3019105961180154e-05, + "loss": 3.5121, + "step": 40993 + }, + { + "epoch": 0.24380293082120089, + "grad_norm": 2.727897882461548, + "learning_rate": 4.30187821745366e-05, + "loss": 3.1645, + "step": 40994 + }, + { + "epoch": 0.24380887810448187, + "grad_norm": 2.7669837474823, + "learning_rate": 4.3018458381602864e-05, + "loss": 2.8175, + "step": 40995 + }, + { + "epoch": 0.24381482538776286, + "grad_norm": 2.3155932426452637, + "learning_rate": 4.3018134582379075e-05, + "loss": 3.8946, + "step": 40996 + }, + { + "epoch": 0.24382077267104388, + "grad_norm": 2.9237122535705566, + "learning_rate": 4.301781077686535e-05, + "loss": 2.8126, + "step": 40997 + }, + { + "epoch": 0.24382671995432487, + "grad_norm": 2.927705764770508, + "learning_rate": 4.301748696506178e-05, + "loss": 2.9585, + "step": 40998 + }, + { + "epoch": 0.24383266723760585, + "grad_norm": 2.6318113803863525, + "learning_rate": 4.3017163146968494e-05, + "loss": 3.2235, + "step": 40999 + }, + { + "epoch": 0.24383861452088687, + "grad_norm": 2.7591280937194824, + "learning_rate": 4.301683932258561e-05, + "loss": 3.1535, + "step": 41000 + }, + { + "epoch": 0.24384456180416786, + "grad_norm": 2.8677873611450195, + "learning_rate": 4.301651549191324e-05, + "loss": 3.0601, + "step": 41001 + }, + { + "epoch": 0.24385050908744885, + "grad_norm": 2.8618898391723633, + "learning_rate": 4.3016191654951474e-05, + "loss": 3.7775, + "step": 41002 + }, + { + "epoch": 0.24385645637072986, + "grad_norm": 3.121734619140625, + "learning_rate": 4.301586781170045e-05, + "loss": 3.6666, + "step": 41003 + }, + { + "epoch": 0.24386240365401085, + "grad_norm": 2.842517852783203, + "learning_rate": 4.301554396216026e-05, + "loss": 3.5266, + "step": 41004 + }, + { + "epoch": 0.24386835093729184, + "grad_norm": 2.9145760536193848, + "learning_rate": 4.3015220106331043e-05, + "loss": 3.7376, + "step": 41005 + }, + { + "epoch": 0.24387429822057285, + "grad_norm": 3.010977029800415, + "learning_rate": 4.3014896244212897e-05, + "loss": 3.7787, + "step": 41006 + }, + { + "epoch": 0.24388024550385384, + "grad_norm": 2.6671133041381836, + "learning_rate": 4.301457237580594e-05, + "loss": 3.4427, + "step": 41007 + }, + { + "epoch": 0.24388619278713483, + "grad_norm": 2.4177465438842773, + "learning_rate": 4.301424850111028e-05, + "loss": 3.6345, + "step": 41008 + }, + { + "epoch": 0.24389214007041585, + "grad_norm": 2.2710280418395996, + "learning_rate": 4.3013924620126025e-05, + "loss": 3.6763, + "step": 41009 + }, + { + "epoch": 0.24389808735369684, + "grad_norm": 2.5731492042541504, + "learning_rate": 4.3013600732853297e-05, + "loss": 3.6626, + "step": 41010 + }, + { + "epoch": 0.24390403463697782, + "grad_norm": 2.4105541706085205, + "learning_rate": 4.3013276839292216e-05, + "loss": 3.7997, + "step": 41011 + }, + { + "epoch": 0.24390998192025884, + "grad_norm": 2.3912150859832764, + "learning_rate": 4.301295293944287e-05, + "loss": 3.7265, + "step": 41012 + }, + { + "epoch": 0.24391592920353983, + "grad_norm": 2.4968008995056152, + "learning_rate": 4.3012629033305405e-05, + "loss": 3.4953, + "step": 41013 + }, + { + "epoch": 0.24392187648682082, + "grad_norm": 2.232212543487549, + "learning_rate": 4.301230512087991e-05, + "loss": 4.112, + "step": 41014 + }, + { + "epoch": 0.2439278237701018, + "grad_norm": 1.6712101697921753, + "learning_rate": 4.3011981202166506e-05, + "loss": 4.7613, + "step": 41015 + }, + { + "epoch": 0.24393377105338282, + "grad_norm": 1.5058718919754028, + "learning_rate": 4.3011657277165304e-05, + "loss": 4.9787, + "step": 41016 + }, + { + "epoch": 0.2439397183366638, + "grad_norm": 2.5461862087249756, + "learning_rate": 4.301133334587643e-05, + "loss": 4.9995, + "step": 41017 + }, + { + "epoch": 0.2439456656199448, + "grad_norm": 1.8090366125106812, + "learning_rate": 4.301100940829997e-05, + "loss": 4.8258, + "step": 41018 + }, + { + "epoch": 0.2439516129032258, + "grad_norm": 1.7511998414993286, + "learning_rate": 4.3010685464436056e-05, + "loss": 4.6605, + "step": 41019 + }, + { + "epoch": 0.2439575601865068, + "grad_norm": 1.6499818563461304, + "learning_rate": 4.30103615142848e-05, + "loss": 5.0107, + "step": 41020 + }, + { + "epoch": 0.2439635074697878, + "grad_norm": 1.4957252740859985, + "learning_rate": 4.3010037557846315e-05, + "loss": 4.7193, + "step": 41021 + }, + { + "epoch": 0.2439694547530688, + "grad_norm": 1.5488169193267822, + "learning_rate": 4.300971359512071e-05, + "loss": 4.672, + "step": 41022 + }, + { + "epoch": 0.2439754020363498, + "grad_norm": 1.4313315153121948, + "learning_rate": 4.30093896261081e-05, + "loss": 4.7678, + "step": 41023 + }, + { + "epoch": 0.24398134931963078, + "grad_norm": 1.651253581047058, + "learning_rate": 4.30090656508086e-05, + "loss": 4.9307, + "step": 41024 + }, + { + "epoch": 0.2439872966029118, + "grad_norm": 1.526384949684143, + "learning_rate": 4.300874166922232e-05, + "loss": 4.609, + "step": 41025 + }, + { + "epoch": 0.24399324388619278, + "grad_norm": 1.4737952947616577, + "learning_rate": 4.3008417681349365e-05, + "loss": 5.0853, + "step": 41026 + }, + { + "epoch": 0.24399919116947377, + "grad_norm": 1.6411961317062378, + "learning_rate": 4.300809368718987e-05, + "loss": 5.073, + "step": 41027 + }, + { + "epoch": 0.2440051384527548, + "grad_norm": 1.672352910041809, + "learning_rate": 4.300776968674393e-05, + "loss": 4.3503, + "step": 41028 + }, + { + "epoch": 0.24401108573603578, + "grad_norm": 1.7454437017440796, + "learning_rate": 4.3007445680011664e-05, + "loss": 4.7877, + "step": 41029 + }, + { + "epoch": 0.24401703301931676, + "grad_norm": 1.877962350845337, + "learning_rate": 4.300712166699319e-05, + "loss": 5.1702, + "step": 41030 + }, + { + "epoch": 0.24402298030259778, + "grad_norm": 1.8893183469772339, + "learning_rate": 4.300679764768861e-05, + "loss": 4.7888, + "step": 41031 + }, + { + "epoch": 0.24402892758587877, + "grad_norm": 1.3907794952392578, + "learning_rate": 4.3006473622098044e-05, + "loss": 5.2973, + "step": 41032 + }, + { + "epoch": 0.24403487486915976, + "grad_norm": 1.6076308488845825, + "learning_rate": 4.3006149590221606e-05, + "loss": 4.9623, + "step": 41033 + }, + { + "epoch": 0.24404082215244077, + "grad_norm": 2.513042449951172, + "learning_rate": 4.300582555205941e-05, + "loss": 3.8306, + "step": 41034 + }, + { + "epoch": 0.24404676943572176, + "grad_norm": 2.4783194065093994, + "learning_rate": 4.3005501507611554e-05, + "loss": 3.8139, + "step": 41035 + }, + { + "epoch": 0.24405271671900275, + "grad_norm": 1.9703311920166016, + "learning_rate": 4.300517745687818e-05, + "loss": 4.4589, + "step": 41036 + }, + { + "epoch": 0.24405866400228377, + "grad_norm": 1.7620080709457397, + "learning_rate": 4.3004853399859376e-05, + "loss": 4.6103, + "step": 41037 + }, + { + "epoch": 0.24406461128556475, + "grad_norm": 2.643313407897949, + "learning_rate": 4.300452933655526e-05, + "loss": 3.655, + "step": 41038 + }, + { + "epoch": 0.24407055856884574, + "grad_norm": 2.428417444229126, + "learning_rate": 4.300420526696595e-05, + "loss": 3.5739, + "step": 41039 + }, + { + "epoch": 0.24407650585212676, + "grad_norm": 2.639162063598633, + "learning_rate": 4.3003881191091566e-05, + "loss": 3.8378, + "step": 41040 + }, + { + "epoch": 0.24408245313540775, + "grad_norm": 2.0753536224365234, + "learning_rate": 4.300355710893221e-05, + "loss": 3.7645, + "step": 41041 + }, + { + "epoch": 0.24408840041868873, + "grad_norm": 2.1097209453582764, + "learning_rate": 4.300323302048799e-05, + "loss": 3.8043, + "step": 41042 + }, + { + "epoch": 0.24409434770196975, + "grad_norm": 2.293515682220459, + "learning_rate": 4.300290892575903e-05, + "loss": 3.8089, + "step": 41043 + }, + { + "epoch": 0.24410029498525074, + "grad_norm": 1.7682806253433228, + "learning_rate": 4.3002584824745454e-05, + "loss": 4.5613, + "step": 41044 + }, + { + "epoch": 0.24410624226853173, + "grad_norm": 2.427494764328003, + "learning_rate": 4.300226071744735e-05, + "loss": 3.5672, + "step": 41045 + }, + { + "epoch": 0.24411218955181274, + "grad_norm": 2.6539034843444824, + "learning_rate": 4.300193660386484e-05, + "loss": 3.6894, + "step": 41046 + }, + { + "epoch": 0.24411813683509373, + "grad_norm": 4.131994247436523, + "learning_rate": 4.3001612483998046e-05, + "loss": 3.7901, + "step": 41047 + }, + { + "epoch": 0.24412408411837472, + "grad_norm": 2.2330989837646484, + "learning_rate": 4.300128835784708e-05, + "loss": 3.6282, + "step": 41048 + }, + { + "epoch": 0.24413003140165573, + "grad_norm": 2.315194606781006, + "learning_rate": 4.300096422541204e-05, + "loss": 3.6663, + "step": 41049 + }, + { + "epoch": 0.24413597868493672, + "grad_norm": 2.495371103286743, + "learning_rate": 4.300064008669305e-05, + "loss": 3.8228, + "step": 41050 + }, + { + "epoch": 0.2441419259682177, + "grad_norm": 2.223499298095703, + "learning_rate": 4.300031594169023e-05, + "loss": 3.845, + "step": 41051 + }, + { + "epoch": 0.24414787325149873, + "grad_norm": 2.948883295059204, + "learning_rate": 4.2999991790403676e-05, + "loss": 3.8526, + "step": 41052 + }, + { + "epoch": 0.24415382053477971, + "grad_norm": 1.9984116554260254, + "learning_rate": 4.299966763283352e-05, + "loss": 3.6063, + "step": 41053 + }, + { + "epoch": 0.2441597678180607, + "grad_norm": 2.2907629013061523, + "learning_rate": 4.299934346897986e-05, + "loss": 3.5943, + "step": 41054 + }, + { + "epoch": 0.24416571510134172, + "grad_norm": 2.324422597885132, + "learning_rate": 4.299901929884282e-05, + "loss": 3.6081, + "step": 41055 + }, + { + "epoch": 0.2441716623846227, + "grad_norm": 2.293504476547241, + "learning_rate": 4.29986951224225e-05, + "loss": 3.5047, + "step": 41056 + }, + { + "epoch": 0.2441776096679037, + "grad_norm": 1.9256181716918945, + "learning_rate": 4.299837093971903e-05, + "loss": 3.7635, + "step": 41057 + }, + { + "epoch": 0.2441835569511847, + "grad_norm": 2.576510190963745, + "learning_rate": 4.2998046750732514e-05, + "loss": 3.7126, + "step": 41058 + }, + { + "epoch": 0.2441895042344657, + "grad_norm": 3.809711456298828, + "learning_rate": 4.299772255546306e-05, + "loss": 3.6119, + "step": 41059 + }, + { + "epoch": 0.2441954515177467, + "grad_norm": 1.6247094869613647, + "learning_rate": 4.2997398353910786e-05, + "loss": 4.4132, + "step": 41060 + }, + { + "epoch": 0.2442013988010277, + "grad_norm": 1.4020733833312988, + "learning_rate": 4.299707414607581e-05, + "loss": 4.9133, + "step": 41061 + }, + { + "epoch": 0.2442073460843087, + "grad_norm": 1.979539394378662, + "learning_rate": 4.299674993195825e-05, + "loss": 4.8097, + "step": 41062 + }, + { + "epoch": 0.24421329336758968, + "grad_norm": 1.7198399305343628, + "learning_rate": 4.2996425711558195e-05, + "loss": 4.7149, + "step": 41063 + }, + { + "epoch": 0.2442192406508707, + "grad_norm": 1.6382774114608765, + "learning_rate": 4.299610148487579e-05, + "loss": 4.8164, + "step": 41064 + }, + { + "epoch": 0.24422518793415168, + "grad_norm": 1.3646199703216553, + "learning_rate": 4.299577725191112e-05, + "loss": 4.8313, + "step": 41065 + }, + { + "epoch": 0.24423113521743267, + "grad_norm": 1.543046236038208, + "learning_rate": 4.299545301266431e-05, + "loss": 4.2541, + "step": 41066 + }, + { + "epoch": 0.2442370825007137, + "grad_norm": 1.6106550693511963, + "learning_rate": 4.299512876713548e-05, + "loss": 4.6502, + "step": 41067 + }, + { + "epoch": 0.24424302978399468, + "grad_norm": 1.499096393585205, + "learning_rate": 4.299480451532473e-05, + "loss": 4.7382, + "step": 41068 + }, + { + "epoch": 0.24424897706727566, + "grad_norm": 1.539903163909912, + "learning_rate": 4.2994480257232185e-05, + "loss": 4.6095, + "step": 41069 + }, + { + "epoch": 0.24425492435055668, + "grad_norm": 1.5277403593063354, + "learning_rate": 4.299415599285795e-05, + "loss": 4.6175, + "step": 41070 + }, + { + "epoch": 0.24426087163383767, + "grad_norm": 1.6708862781524658, + "learning_rate": 4.299383172220214e-05, + "loss": 4.6582, + "step": 41071 + }, + { + "epoch": 0.24426681891711866, + "grad_norm": 1.517624020576477, + "learning_rate": 4.299350744526487e-05, + "loss": 4.4372, + "step": 41072 + }, + { + "epoch": 0.24427276620039964, + "grad_norm": 1.5195921659469604, + "learning_rate": 4.299318316204626e-05, + "loss": 4.6724, + "step": 41073 + }, + { + "epoch": 0.24427871348368066, + "grad_norm": 1.7260321378707886, + "learning_rate": 4.2992858872546404e-05, + "loss": 4.9461, + "step": 41074 + }, + { + "epoch": 0.24428466076696165, + "grad_norm": 1.9820029735565186, + "learning_rate": 4.2992534576765434e-05, + "loss": 5.0441, + "step": 41075 + }, + { + "epoch": 0.24429060805024264, + "grad_norm": 1.6406270265579224, + "learning_rate": 4.299221027470345e-05, + "loss": 5.2864, + "step": 41076 + }, + { + "epoch": 0.24429655533352365, + "grad_norm": 1.6725775003433228, + "learning_rate": 4.299188596636058e-05, + "loss": 4.5714, + "step": 41077 + }, + { + "epoch": 0.24430250261680464, + "grad_norm": 1.6156655550003052, + "learning_rate": 4.299156165173692e-05, + "loss": 4.9557, + "step": 41078 + }, + { + "epoch": 0.24430844990008563, + "grad_norm": 1.8281142711639404, + "learning_rate": 4.299123733083259e-05, + "loss": 3.9674, + "step": 41079 + }, + { + "epoch": 0.24431439718336664, + "grad_norm": 1.776897668838501, + "learning_rate": 4.2990913003647714e-05, + "loss": 3.9115, + "step": 41080 + }, + { + "epoch": 0.24432034446664763, + "grad_norm": 1.8234760761260986, + "learning_rate": 4.299058867018239e-05, + "loss": 5.0246, + "step": 41081 + }, + { + "epoch": 0.24432629174992862, + "grad_norm": 1.7198151350021362, + "learning_rate": 4.2990264330436744e-05, + "loss": 5.2425, + "step": 41082 + }, + { + "epoch": 0.24433223903320964, + "grad_norm": 1.5141315460205078, + "learning_rate": 4.2989939984410876e-05, + "loss": 4.6319, + "step": 41083 + }, + { + "epoch": 0.24433818631649062, + "grad_norm": 1.9050178527832031, + "learning_rate": 4.298961563210491e-05, + "loss": 4.5006, + "step": 41084 + }, + { + "epoch": 0.2443441335997716, + "grad_norm": 1.6235878467559814, + "learning_rate": 4.298929127351895e-05, + "loss": 4.5223, + "step": 41085 + }, + { + "epoch": 0.24435008088305263, + "grad_norm": 1.6890273094177246, + "learning_rate": 4.2988966908653115e-05, + "loss": 5.1169, + "step": 41086 + }, + { + "epoch": 0.24435602816633362, + "grad_norm": 1.7842737436294556, + "learning_rate": 4.2988642537507523e-05, + "loss": 4.7057, + "step": 41087 + }, + { + "epoch": 0.2443619754496146, + "grad_norm": 1.6248615980148315, + "learning_rate": 4.298831816008228e-05, + "loss": 4.7137, + "step": 41088 + }, + { + "epoch": 0.24436792273289562, + "grad_norm": 1.478296160697937, + "learning_rate": 4.29879937763775e-05, + "loss": 4.4864, + "step": 41089 + }, + { + "epoch": 0.2443738700161766, + "grad_norm": 1.2408661842346191, + "learning_rate": 4.298766938639329e-05, + "loss": 4.4853, + "step": 41090 + }, + { + "epoch": 0.2443798172994576, + "grad_norm": 1.9558448791503906, + "learning_rate": 4.298734499012979e-05, + "loss": 5.0121, + "step": 41091 + }, + { + "epoch": 0.2443857645827386, + "grad_norm": 1.5483293533325195, + "learning_rate": 4.2987020587587076e-05, + "loss": 5.1048, + "step": 41092 + }, + { + "epoch": 0.2443917118660196, + "grad_norm": 1.7755616903305054, + "learning_rate": 4.298669617876528e-05, + "loss": 5.106, + "step": 41093 + }, + { + "epoch": 0.2443976591493006, + "grad_norm": 1.4265565872192383, + "learning_rate": 4.2986371763664515e-05, + "loss": 5.113, + "step": 41094 + }, + { + "epoch": 0.2444036064325816, + "grad_norm": 1.5687288045883179, + "learning_rate": 4.29860473422849e-05, + "loss": 5.352, + "step": 41095 + }, + { + "epoch": 0.2444095537158626, + "grad_norm": 2.025449275970459, + "learning_rate": 4.298572291462653e-05, + "loss": 4.1789, + "step": 41096 + }, + { + "epoch": 0.24441550099914358, + "grad_norm": 2.0055124759674072, + "learning_rate": 4.298539848068955e-05, + "loss": 4.2927, + "step": 41097 + }, + { + "epoch": 0.2444214482824246, + "grad_norm": 1.5463171005249023, + "learning_rate": 4.298507404047404e-05, + "loss": 4.9065, + "step": 41098 + }, + { + "epoch": 0.24442739556570559, + "grad_norm": 1.6055642366409302, + "learning_rate": 4.298474959398012e-05, + "loss": 4.7739, + "step": 41099 + }, + { + "epoch": 0.24443334284898657, + "grad_norm": 2.0380098819732666, + "learning_rate": 4.2984425141207924e-05, + "loss": 4.0613, + "step": 41100 + }, + { + "epoch": 0.2444392901322676, + "grad_norm": 2.0383105278015137, + "learning_rate": 4.2984100682157544e-05, + "loss": 3.9511, + "step": 41101 + }, + { + "epoch": 0.24444523741554858, + "grad_norm": 1.4175503253936768, + "learning_rate": 4.29837762168291e-05, + "loss": 4.9093, + "step": 41102 + }, + { + "epoch": 0.24445118469882957, + "grad_norm": 1.5565396547317505, + "learning_rate": 4.29834517452227e-05, + "loss": 4.8711, + "step": 41103 + }, + { + "epoch": 0.24445713198211058, + "grad_norm": 1.5805503129959106, + "learning_rate": 4.298312726733847e-05, + "loss": 4.3712, + "step": 41104 + }, + { + "epoch": 0.24446307926539157, + "grad_norm": 1.700554370880127, + "learning_rate": 4.2982802783176526e-05, + "loss": 4.6637, + "step": 41105 + }, + { + "epoch": 0.24446902654867256, + "grad_norm": 1.6254773139953613, + "learning_rate": 4.2982478292736955e-05, + "loss": 4.6652, + "step": 41106 + }, + { + "epoch": 0.24447497383195357, + "grad_norm": 1.711637020111084, + "learning_rate": 4.2982153796019895e-05, + "loss": 4.8535, + "step": 41107 + }, + { + "epoch": 0.24448092111523456, + "grad_norm": 2.0624165534973145, + "learning_rate": 4.2981829293025446e-05, + "loss": 3.7016, + "step": 41108 + }, + { + "epoch": 0.24448686839851555, + "grad_norm": 2.5427539348602295, + "learning_rate": 4.298150478375373e-05, + "loss": 3.7576, + "step": 41109 + }, + { + "epoch": 0.24449281568179657, + "grad_norm": 1.6543644666671753, + "learning_rate": 4.298118026820485e-05, + "loss": 4.6616, + "step": 41110 + }, + { + "epoch": 0.24449876296507755, + "grad_norm": 1.8081955909729004, + "learning_rate": 4.2980855746378935e-05, + "loss": 4.4641, + "step": 41111 + }, + { + "epoch": 0.24450471024835854, + "grad_norm": 2.1449787616729736, + "learning_rate": 4.298053121827608e-05, + "loss": 3.6515, + "step": 41112 + }, + { + "epoch": 0.24451065753163956, + "grad_norm": 2.9826364517211914, + "learning_rate": 4.298020668389641e-05, + "loss": 3.7221, + "step": 41113 + }, + { + "epoch": 0.24451660481492055, + "grad_norm": 2.5362765789031982, + "learning_rate": 4.297988214324004e-05, + "loss": 3.8037, + "step": 41114 + }, + { + "epoch": 0.24452255209820153, + "grad_norm": 1.859877347946167, + "learning_rate": 4.2979557596307075e-05, + "loss": 4.6756, + "step": 41115 + }, + { + "epoch": 0.24452849938148255, + "grad_norm": 1.7453864812850952, + "learning_rate": 4.297923304309763e-05, + "loss": 5.0898, + "step": 41116 + }, + { + "epoch": 0.24453444666476354, + "grad_norm": 1.6057218313217163, + "learning_rate": 4.2978908483611826e-05, + "loss": 4.6394, + "step": 41117 + }, + { + "epoch": 0.24454039394804453, + "grad_norm": 2.2967329025268555, + "learning_rate": 4.297858391784977e-05, + "loss": 3.4872, + "step": 41118 + }, + { + "epoch": 0.24454634123132554, + "grad_norm": 2.3595829010009766, + "learning_rate": 4.297825934581158e-05, + "loss": 3.3766, + "step": 41119 + }, + { + "epoch": 0.24455228851460653, + "grad_norm": 1.9303618669509888, + "learning_rate": 4.2977934767497353e-05, + "loss": 4.4959, + "step": 41120 + }, + { + "epoch": 0.24455823579788752, + "grad_norm": 2.6735446453094482, + "learning_rate": 4.2977610182907225e-05, + "loss": 3.3082, + "step": 41121 + }, + { + "epoch": 0.24456418308116853, + "grad_norm": 2.5268654823303223, + "learning_rate": 4.297728559204129e-05, + "loss": 3.5869, + "step": 41122 + }, + { + "epoch": 0.24457013036444952, + "grad_norm": 2.3503427505493164, + "learning_rate": 4.297696099489968e-05, + "loss": 3.5166, + "step": 41123 + }, + { + "epoch": 0.2445760776477305, + "grad_norm": 2.3436567783355713, + "learning_rate": 4.29766363914825e-05, + "loss": 4.489, + "step": 41124 + }, + { + "epoch": 0.24458202493101153, + "grad_norm": 1.9524338245391846, + "learning_rate": 4.2976311781789856e-05, + "loss": 4.6862, + "step": 41125 + }, + { + "epoch": 0.24458797221429252, + "grad_norm": 1.9234352111816406, + "learning_rate": 4.2975987165821866e-05, + "loss": 4.6067, + "step": 41126 + }, + { + "epoch": 0.2445939194975735, + "grad_norm": 1.8987208604812622, + "learning_rate": 4.297566254357864e-05, + "loss": 4.27, + "step": 41127 + }, + { + "epoch": 0.24459986678085452, + "grad_norm": 1.7181370258331299, + "learning_rate": 4.297533791506031e-05, + "loss": 4.5791, + "step": 41128 + }, + { + "epoch": 0.2446058140641355, + "grad_norm": 2.0697638988494873, + "learning_rate": 4.297501328026696e-05, + "loss": 5.2166, + "step": 41129 + }, + { + "epoch": 0.2446117613474165, + "grad_norm": 1.705910563468933, + "learning_rate": 4.297468863919874e-05, + "loss": 5.0293, + "step": 41130 + }, + { + "epoch": 0.2446177086306975, + "grad_norm": 1.7740181684494019, + "learning_rate": 4.297436399185572e-05, + "loss": 5.1075, + "step": 41131 + }, + { + "epoch": 0.2446236559139785, + "grad_norm": 1.762485146522522, + "learning_rate": 4.297403933823805e-05, + "loss": 5.104, + "step": 41132 + }, + { + "epoch": 0.2446296031972595, + "grad_norm": 1.5525120496749878, + "learning_rate": 4.2973714678345824e-05, + "loss": 4.9474, + "step": 41133 + }, + { + "epoch": 0.24463555048054048, + "grad_norm": 1.7745859622955322, + "learning_rate": 4.297339001217915e-05, + "loss": 4.5411, + "step": 41134 + }, + { + "epoch": 0.2446414977638215, + "grad_norm": 1.636746883392334, + "learning_rate": 4.297306533973816e-05, + "loss": 4.9474, + "step": 41135 + }, + { + "epoch": 0.24464744504710248, + "grad_norm": 1.5808149576187134, + "learning_rate": 4.297274066102296e-05, + "loss": 5.0419, + "step": 41136 + }, + { + "epoch": 0.24465339233038347, + "grad_norm": 1.6532201766967773, + "learning_rate": 4.2972415976033656e-05, + "loss": 4.4803, + "step": 41137 + }, + { + "epoch": 0.24465933961366448, + "grad_norm": 1.6433192491531372, + "learning_rate": 4.297209128477038e-05, + "loss": 4.7654, + "step": 41138 + }, + { + "epoch": 0.24466528689694547, + "grad_norm": 1.8247480392456055, + "learning_rate": 4.297176658723322e-05, + "loss": 4.2026, + "step": 41139 + }, + { + "epoch": 0.24467123418022646, + "grad_norm": 1.7517924308776855, + "learning_rate": 4.29714418834223e-05, + "loss": 4.103, + "step": 41140 + }, + { + "epoch": 0.24467718146350748, + "grad_norm": 2.2277400493621826, + "learning_rate": 4.297111717333774e-05, + "loss": 3.7755, + "step": 41141 + }, + { + "epoch": 0.24468312874678846, + "grad_norm": 3.088578224182129, + "learning_rate": 4.297079245697965e-05, + "loss": 2.7939, + "step": 41142 + }, + { + "epoch": 0.24468907603006945, + "grad_norm": 2.8365633487701416, + "learning_rate": 4.2970467734348144e-05, + "loss": 2.8626, + "step": 41143 + }, + { + "epoch": 0.24469502331335047, + "grad_norm": 2.259674072265625, + "learning_rate": 4.297014300544333e-05, + "loss": 3.9811, + "step": 41144 + }, + { + "epoch": 0.24470097059663146, + "grad_norm": 2.3700835704803467, + "learning_rate": 4.296981827026533e-05, + "loss": 3.9624, + "step": 41145 + }, + { + "epoch": 0.24470691787991244, + "grad_norm": 2.8020901679992676, + "learning_rate": 4.296949352881424e-05, + "loss": 2.6127, + "step": 41146 + }, + { + "epoch": 0.24471286516319346, + "grad_norm": 2.688469886779785, + "learning_rate": 4.296916878109019e-05, + "loss": 2.8875, + "step": 41147 + }, + { + "epoch": 0.24471881244647445, + "grad_norm": 2.0712220668792725, + "learning_rate": 4.2968844027093306e-05, + "loss": 4.1022, + "step": 41148 + }, + { + "epoch": 0.24472475972975544, + "grad_norm": 2.0175349712371826, + "learning_rate": 4.2968519266823665e-05, + "loss": 4.9499, + "step": 41149 + }, + { + "epoch": 0.24473070701303645, + "grad_norm": 1.7208139896392822, + "learning_rate": 4.29681945002814e-05, + "loss": 5.0939, + "step": 41150 + }, + { + "epoch": 0.24473665429631744, + "grad_norm": 2.666321039199829, + "learning_rate": 4.296786972746664e-05, + "loss": 3.1336, + "step": 41151 + }, + { + "epoch": 0.24474260157959843, + "grad_norm": 2.14564847946167, + "learning_rate": 4.296754494837946e-05, + "loss": 4.2202, + "step": 41152 + }, + { + "epoch": 0.24474854886287944, + "grad_norm": 2.0214006900787354, + "learning_rate": 4.2967220163020014e-05, + "loss": 4.9139, + "step": 41153 + }, + { + "epoch": 0.24475449614616043, + "grad_norm": 1.6933860778808594, + "learning_rate": 4.296689537138838e-05, + "loss": 4.8497, + "step": 41154 + }, + { + "epoch": 0.24476044342944142, + "grad_norm": 1.5670782327651978, + "learning_rate": 4.29665705734847e-05, + "loss": 4.8261, + "step": 41155 + }, + { + "epoch": 0.24476639071272244, + "grad_norm": 1.4112871885299683, + "learning_rate": 4.2966245769309074e-05, + "loss": 4.794, + "step": 41156 + }, + { + "epoch": 0.24477233799600343, + "grad_norm": 1.522514820098877, + "learning_rate": 4.296592095886162e-05, + "loss": 5.2487, + "step": 41157 + }, + { + "epoch": 0.2447782852792844, + "grad_norm": 1.5901944637298584, + "learning_rate": 4.296559614214245e-05, + "loss": 4.2704, + "step": 41158 + }, + { + "epoch": 0.24478423256256543, + "grad_norm": 1.517966866493225, + "learning_rate": 4.296527131915167e-05, + "loss": 5.1282, + "step": 41159 + }, + { + "epoch": 0.24479017984584642, + "grad_norm": 1.5397552251815796, + "learning_rate": 4.2964946489889405e-05, + "loss": 5.0766, + "step": 41160 + }, + { + "epoch": 0.2447961271291274, + "grad_norm": 1.578361988067627, + "learning_rate": 4.296462165435575e-05, + "loss": 4.8612, + "step": 41161 + }, + { + "epoch": 0.24480207441240842, + "grad_norm": 1.5172598361968994, + "learning_rate": 4.296429681255084e-05, + "loss": 4.7173, + "step": 41162 + }, + { + "epoch": 0.2448080216956894, + "grad_norm": 1.7074819803237915, + "learning_rate": 4.296397196447478e-05, + "loss": 4.9219, + "step": 41163 + }, + { + "epoch": 0.2448139689789704, + "grad_norm": 1.5568503141403198, + "learning_rate": 4.2963647110127685e-05, + "loss": 5.2043, + "step": 41164 + }, + { + "epoch": 0.2448199162622514, + "grad_norm": 1.6374152898788452, + "learning_rate": 4.296332224950967e-05, + "loss": 4.3301, + "step": 41165 + }, + { + "epoch": 0.2448258635455324, + "grad_norm": 1.7414122819900513, + "learning_rate": 4.296299738262084e-05, + "loss": 4.3918, + "step": 41166 + }, + { + "epoch": 0.2448318108288134, + "grad_norm": 1.8078655004501343, + "learning_rate": 4.296267250946131e-05, + "loss": 4.2435, + "step": 41167 + }, + { + "epoch": 0.2448377581120944, + "grad_norm": 1.7082903385162354, + "learning_rate": 4.29623476300312e-05, + "loss": 4.7491, + "step": 41168 + }, + { + "epoch": 0.2448437053953754, + "grad_norm": 1.9262185096740723, + "learning_rate": 4.2962022744330616e-05, + "loss": 3.9733, + "step": 41169 + }, + { + "epoch": 0.24484965267865638, + "grad_norm": 1.732582688331604, + "learning_rate": 4.296169785235968e-05, + "loss": 4.0969, + "step": 41170 + }, + { + "epoch": 0.2448555999619374, + "grad_norm": 1.594199776649475, + "learning_rate": 4.296137295411849e-05, + "loss": 4.5611, + "step": 41171 + }, + { + "epoch": 0.24486154724521839, + "grad_norm": 1.7887487411499023, + "learning_rate": 4.296104804960719e-05, + "loss": 4.1133, + "step": 41172 + }, + { + "epoch": 0.24486749452849937, + "grad_norm": 1.6945092678070068, + "learning_rate": 4.296072313882586e-05, + "loss": 4.6036, + "step": 41173 + }, + { + "epoch": 0.2448734418117804, + "grad_norm": 1.6527233123779297, + "learning_rate": 4.2960398221774636e-05, + "loss": 4.5933, + "step": 41174 + }, + { + "epoch": 0.24487938909506138, + "grad_norm": 1.656940221786499, + "learning_rate": 4.2960073298453606e-05, + "loss": 4.4874, + "step": 41175 + }, + { + "epoch": 0.24488533637834237, + "grad_norm": 1.6104899644851685, + "learning_rate": 4.295974836886292e-05, + "loss": 4.6909, + "step": 41176 + }, + { + "epoch": 0.24489128366162338, + "grad_norm": 1.838976263999939, + "learning_rate": 4.295942343300266e-05, + "loss": 4.603, + "step": 41177 + }, + { + "epoch": 0.24489723094490437, + "grad_norm": 1.7401745319366455, + "learning_rate": 4.295909849087295e-05, + "loss": 4.6184, + "step": 41178 + }, + { + "epoch": 0.24490317822818536, + "grad_norm": 1.6933304071426392, + "learning_rate": 4.295877354247391e-05, + "loss": 4.9282, + "step": 41179 + }, + { + "epoch": 0.24490912551146637, + "grad_norm": 1.808008074760437, + "learning_rate": 4.2958448587805646e-05, + "loss": 4.9925, + "step": 41180 + }, + { + "epoch": 0.24491507279474736, + "grad_norm": 1.6490589380264282, + "learning_rate": 4.295812362686826e-05, + "loss": 4.573, + "step": 41181 + }, + { + "epoch": 0.24492102007802835, + "grad_norm": 2.7031307220458984, + "learning_rate": 4.29577986596619e-05, + "loss": 3.9662, + "step": 41182 + }, + { + "epoch": 0.24492696736130937, + "grad_norm": 2.4806251525878906, + "learning_rate": 4.2957473686186645e-05, + "loss": 3.8522, + "step": 41183 + }, + { + "epoch": 0.24493291464459035, + "grad_norm": 1.9338045120239258, + "learning_rate": 4.295714870644263e-05, + "loss": 4.2029, + "step": 41184 + }, + { + "epoch": 0.24493886192787134, + "grad_norm": 1.6402406692504883, + "learning_rate": 4.295682372042995e-05, + "loss": 4.8524, + "step": 41185 + }, + { + "epoch": 0.24494480921115236, + "grad_norm": 1.8141027688980103, + "learning_rate": 4.295649872814874e-05, + "loss": 4.5026, + "step": 41186 + }, + { + "epoch": 0.24495075649443335, + "grad_norm": 2.163184642791748, + "learning_rate": 4.295617372959909e-05, + "loss": 3.8722, + "step": 41187 + }, + { + "epoch": 0.24495670377771434, + "grad_norm": 3.1402690410614014, + "learning_rate": 4.295584872478113e-05, + "loss": 3.2739, + "step": 41188 + }, + { + "epoch": 0.24496265106099535, + "grad_norm": 3.160930871963501, + "learning_rate": 4.295552371369497e-05, + "loss": 3.3447, + "step": 41189 + }, + { + "epoch": 0.24496859834427634, + "grad_norm": 1.699285626411438, + "learning_rate": 4.295519869634072e-05, + "loss": 5.3937, + "step": 41190 + }, + { + "epoch": 0.24497454562755733, + "grad_norm": 1.9642579555511475, + "learning_rate": 4.29548736727185e-05, + "loss": 4.7502, + "step": 41191 + }, + { + "epoch": 0.24498049291083832, + "grad_norm": 1.8282638788223267, + "learning_rate": 4.295454864282841e-05, + "loss": 4.9233, + "step": 41192 + }, + { + "epoch": 0.24498644019411933, + "grad_norm": 1.6737186908721924, + "learning_rate": 4.295422360667058e-05, + "loss": 4.2996, + "step": 41193 + }, + { + "epoch": 0.24499238747740032, + "grad_norm": 1.8413697481155396, + "learning_rate": 4.295389856424511e-05, + "loss": 4.137, + "step": 41194 + }, + { + "epoch": 0.2449983347606813, + "grad_norm": 1.6254045963287354, + "learning_rate": 4.2953573515552134e-05, + "loss": 4.0804, + "step": 41195 + }, + { + "epoch": 0.24500428204396232, + "grad_norm": 1.445175290107727, + "learning_rate": 4.295324846059174e-05, + "loss": 4.5626, + "step": 41196 + }, + { + "epoch": 0.2450102293272433, + "grad_norm": 1.5746971368789673, + "learning_rate": 4.295292339936405e-05, + "loss": 4.2401, + "step": 41197 + }, + { + "epoch": 0.2450161766105243, + "grad_norm": 1.673425316810608, + "learning_rate": 4.295259833186919e-05, + "loss": 3.976, + "step": 41198 + }, + { + "epoch": 0.24502212389380532, + "grad_norm": 1.570590853691101, + "learning_rate": 4.295227325810725e-05, + "loss": 4.8022, + "step": 41199 + }, + { + "epoch": 0.2450280711770863, + "grad_norm": 1.7124768495559692, + "learning_rate": 4.295194817807836e-05, + "loss": 4.1595, + "step": 41200 + }, + { + "epoch": 0.2450340184603673, + "grad_norm": 2.6111247539520264, + "learning_rate": 4.295162309178264e-05, + "loss": 2.9296, + "step": 41201 + }, + { + "epoch": 0.2450399657436483, + "grad_norm": 2.5340325832366943, + "learning_rate": 4.2951297999220185e-05, + "loss": 3.3475, + "step": 41202 + }, + { + "epoch": 0.2450459130269293, + "grad_norm": 2.5804336071014404, + "learning_rate": 4.2950972900391116e-05, + "loss": 3.2054, + "step": 41203 + }, + { + "epoch": 0.24505186031021028, + "grad_norm": 1.7329760789871216, + "learning_rate": 4.2950647795295555e-05, + "loss": 4.5483, + "step": 41204 + }, + { + "epoch": 0.2450578075934913, + "grad_norm": 1.752955436706543, + "learning_rate": 4.295032268393361e-05, + "loss": 4.8503, + "step": 41205 + }, + { + "epoch": 0.2450637548767723, + "grad_norm": 1.7768440246582031, + "learning_rate": 4.294999756630539e-05, + "loss": 4.762, + "step": 41206 + }, + { + "epoch": 0.24506970216005328, + "grad_norm": 1.7080912590026855, + "learning_rate": 4.2949672442411004e-05, + "loss": 4.9721, + "step": 41207 + }, + { + "epoch": 0.2450756494433343, + "grad_norm": 2.3687589168548584, + "learning_rate": 4.294934731225058e-05, + "loss": 3.3681, + "step": 41208 + }, + { + "epoch": 0.24508159672661528, + "grad_norm": 1.9023398160934448, + "learning_rate": 4.2949022175824224e-05, + "loss": 4.504, + "step": 41209 + }, + { + "epoch": 0.24508754400989627, + "grad_norm": 1.710294246673584, + "learning_rate": 4.294869703313205e-05, + "loss": 4.6654, + "step": 41210 + }, + { + "epoch": 0.24509349129317728, + "grad_norm": 1.5453555583953857, + "learning_rate": 4.294837188417417e-05, + "loss": 4.6644, + "step": 41211 + }, + { + "epoch": 0.24509943857645827, + "grad_norm": 1.6260664463043213, + "learning_rate": 4.2948046728950695e-05, + "loss": 4.6555, + "step": 41212 + }, + { + "epoch": 0.24510538585973926, + "grad_norm": 1.7364970445632935, + "learning_rate": 4.294772156746174e-05, + "loss": 4.1288, + "step": 41213 + }, + { + "epoch": 0.24511133314302028, + "grad_norm": 2.3027360439300537, + "learning_rate": 4.294739639970743e-05, + "loss": 3.432, + "step": 41214 + }, + { + "epoch": 0.24511728042630127, + "grad_norm": 1.415194034576416, + "learning_rate": 4.294707122568787e-05, + "loss": 4.2648, + "step": 41215 + }, + { + "epoch": 0.24512322770958225, + "grad_norm": 1.568647027015686, + "learning_rate": 4.294674604540316e-05, + "loss": 4.4638, + "step": 41216 + }, + { + "epoch": 0.24512917499286327, + "grad_norm": 1.791579246520996, + "learning_rate": 4.294642085885344e-05, + "loss": 4.6169, + "step": 41217 + }, + { + "epoch": 0.24513512227614426, + "grad_norm": 1.7615410089492798, + "learning_rate": 4.29460956660388e-05, + "loss": 4.7849, + "step": 41218 + }, + { + "epoch": 0.24514106955942525, + "grad_norm": 1.5849919319152832, + "learning_rate": 4.294577046695938e-05, + "loss": 4.5799, + "step": 41219 + }, + { + "epoch": 0.24514701684270626, + "grad_norm": 1.8950798511505127, + "learning_rate": 4.294544526161526e-05, + "loss": 4.6733, + "step": 41220 + }, + { + "epoch": 0.24515296412598725, + "grad_norm": 1.6297848224639893, + "learning_rate": 4.294512005000657e-05, + "loss": 4.7923, + "step": 41221 + }, + { + "epoch": 0.24515891140926824, + "grad_norm": 1.6687774658203125, + "learning_rate": 4.294479483213344e-05, + "loss": 3.5378, + "step": 41222 + }, + { + "epoch": 0.24516485869254925, + "grad_norm": 1.6851519346237183, + "learning_rate": 4.294446960799595e-05, + "loss": 4.1377, + "step": 41223 + }, + { + "epoch": 0.24517080597583024, + "grad_norm": 1.6944013833999634, + "learning_rate": 4.2944144377594245e-05, + "loss": 4.5677, + "step": 41224 + }, + { + "epoch": 0.24517675325911123, + "grad_norm": 1.8697381019592285, + "learning_rate": 4.294381914092841e-05, + "loss": 3.9056, + "step": 41225 + }, + { + "epoch": 0.24518270054239225, + "grad_norm": 2.144491195678711, + "learning_rate": 4.294349389799859e-05, + "loss": 3.4198, + "step": 41226 + }, + { + "epoch": 0.24518864782567323, + "grad_norm": 1.7533553838729858, + "learning_rate": 4.294316864880487e-05, + "loss": 4.1167, + "step": 41227 + }, + { + "epoch": 0.24519459510895422, + "grad_norm": 2.3705077171325684, + "learning_rate": 4.294284339334738e-05, + "loss": 3.1281, + "step": 41228 + }, + { + "epoch": 0.24520054239223524, + "grad_norm": 2.389765977859497, + "learning_rate": 4.294251813162622e-05, + "loss": 3.2898, + "step": 41229 + }, + { + "epoch": 0.24520648967551623, + "grad_norm": 2.664384603500366, + "learning_rate": 4.294219286364152e-05, + "loss": 4.3431, + "step": 41230 + }, + { + "epoch": 0.24521243695879721, + "grad_norm": 2.107266664505005, + "learning_rate": 4.294186758939339e-05, + "loss": 3.1668, + "step": 41231 + }, + { + "epoch": 0.24521838424207823, + "grad_norm": 2.450221538543701, + "learning_rate": 4.294154230888193e-05, + "loss": 3.0391, + "step": 41232 + }, + { + "epoch": 0.24522433152535922, + "grad_norm": 1.7757872343063354, + "learning_rate": 4.294121702210726e-05, + "loss": 4.2076, + "step": 41233 + }, + { + "epoch": 0.2452302788086402, + "grad_norm": 1.6536731719970703, + "learning_rate": 4.294089172906951e-05, + "loss": 4.2851, + "step": 41234 + }, + { + "epoch": 0.24523622609192122, + "grad_norm": 2.4021427631378174, + "learning_rate": 4.294056642976878e-05, + "loss": 3.1836, + "step": 41235 + }, + { + "epoch": 0.2452421733752022, + "grad_norm": 2.1811892986297607, + "learning_rate": 4.294024112420517e-05, + "loss": 3.3722, + "step": 41236 + }, + { + "epoch": 0.2452481206584832, + "grad_norm": 1.7215348482131958, + "learning_rate": 4.2939915812378817e-05, + "loss": 4.5578, + "step": 41237 + }, + { + "epoch": 0.24525406794176421, + "grad_norm": 2.4095611572265625, + "learning_rate": 4.293959049428982e-05, + "loss": 4.6109, + "step": 41238 + }, + { + "epoch": 0.2452600152250452, + "grad_norm": 1.6738545894622803, + "learning_rate": 4.2939265169938304e-05, + "loss": 4.6163, + "step": 41239 + }, + { + "epoch": 0.2452659625083262, + "grad_norm": 1.9711627960205078, + "learning_rate": 4.293893983932437e-05, + "loss": 4.088, + "step": 41240 + }, + { + "epoch": 0.2452719097916072, + "grad_norm": 1.4845963716506958, + "learning_rate": 4.293861450244814e-05, + "loss": 4.7406, + "step": 41241 + }, + { + "epoch": 0.2452778570748882, + "grad_norm": 1.7822811603546143, + "learning_rate": 4.293828915930972e-05, + "loss": 5.3728, + "step": 41242 + }, + { + "epoch": 0.24528380435816918, + "grad_norm": 1.8018428087234497, + "learning_rate": 4.2937963809909235e-05, + "loss": 4.7761, + "step": 41243 + }, + { + "epoch": 0.2452897516414502, + "grad_norm": 1.61797297000885, + "learning_rate": 4.2937638454246786e-05, + "loss": 4.5249, + "step": 41244 + }, + { + "epoch": 0.2452956989247312, + "grad_norm": 1.7524518966674805, + "learning_rate": 4.2937313092322494e-05, + "loss": 4.8264, + "step": 41245 + }, + { + "epoch": 0.24530164620801218, + "grad_norm": 1.4540188312530518, + "learning_rate": 4.293698772413648e-05, + "loss": 4.7962, + "step": 41246 + }, + { + "epoch": 0.2453075934912932, + "grad_norm": 1.9985069036483765, + "learning_rate": 4.293666234968884e-05, + "loss": 4.045, + "step": 41247 + }, + { + "epoch": 0.24531354077457418, + "grad_norm": 1.6976054906845093, + "learning_rate": 4.29363369689797e-05, + "loss": 4.9937, + "step": 41248 + }, + { + "epoch": 0.24531948805785517, + "grad_norm": 1.9082869291305542, + "learning_rate": 4.293601158200917e-05, + "loss": 4.6684, + "step": 41249 + }, + { + "epoch": 0.24532543534113616, + "grad_norm": 1.8951133489608765, + "learning_rate": 4.2935686188777364e-05, + "loss": 4.086, + "step": 41250 + }, + { + "epoch": 0.24533138262441717, + "grad_norm": 1.3001377582550049, + "learning_rate": 4.2935360789284395e-05, + "loss": 4.8662, + "step": 41251 + }, + { + "epoch": 0.24533732990769816, + "grad_norm": 1.3994601964950562, + "learning_rate": 4.2935035383530376e-05, + "loss": 4.6469, + "step": 41252 + }, + { + "epoch": 0.24534327719097915, + "grad_norm": 1.4965636730194092, + "learning_rate": 4.293470997151542e-05, + "loss": 4.6322, + "step": 41253 + }, + { + "epoch": 0.24534922447426016, + "grad_norm": 1.3269574642181396, + "learning_rate": 4.293438455323964e-05, + "loss": 4.5695, + "step": 41254 + }, + { + "epoch": 0.24535517175754115, + "grad_norm": 1.2129048109054565, + "learning_rate": 4.293405912870316e-05, + "loss": 4.5086, + "step": 41255 + }, + { + "epoch": 0.24536111904082214, + "grad_norm": 1.4762561321258545, + "learning_rate": 4.293373369790608e-05, + "loss": 4.5776, + "step": 41256 + }, + { + "epoch": 0.24536706632410316, + "grad_norm": 1.5371118783950806, + "learning_rate": 4.293340826084852e-05, + "loss": 4.5818, + "step": 41257 + }, + { + "epoch": 0.24537301360738414, + "grad_norm": 1.5850943326950073, + "learning_rate": 4.293308281753059e-05, + "loss": 4.506, + "step": 41258 + }, + { + "epoch": 0.24537896089066513, + "grad_norm": 1.2904220819473267, + "learning_rate": 4.2932757367952406e-05, + "loss": 4.3578, + "step": 41259 + }, + { + "epoch": 0.24538490817394615, + "grad_norm": 1.271513819694519, + "learning_rate": 4.293243191211408e-05, + "loss": 4.4341, + "step": 41260 + }, + { + "epoch": 0.24539085545722714, + "grad_norm": 1.8632032871246338, + "learning_rate": 4.293210645001573e-05, + "loss": 4.8761, + "step": 41261 + }, + { + "epoch": 0.24539680274050812, + "grad_norm": 1.8458746671676636, + "learning_rate": 4.293178098165747e-05, + "loss": 4.6665, + "step": 41262 + }, + { + "epoch": 0.24540275002378914, + "grad_norm": 1.5924400091171265, + "learning_rate": 4.293145550703941e-05, + "loss": 4.8206, + "step": 41263 + }, + { + "epoch": 0.24540869730707013, + "grad_norm": 1.5775995254516602, + "learning_rate": 4.293113002616166e-05, + "loss": 4.845, + "step": 41264 + }, + { + "epoch": 0.24541464459035112, + "grad_norm": 1.3876579999923706, + "learning_rate": 4.293080453902434e-05, + "loss": 4.3575, + "step": 41265 + }, + { + "epoch": 0.24542059187363213, + "grad_norm": 1.7224400043487549, + "learning_rate": 4.293047904562756e-05, + "loss": 4.597, + "step": 41266 + }, + { + "epoch": 0.24542653915691312, + "grad_norm": 1.4577008485794067, + "learning_rate": 4.293015354597144e-05, + "loss": 4.5443, + "step": 41267 + }, + { + "epoch": 0.2454324864401941, + "grad_norm": 1.761185646057129, + "learning_rate": 4.292982804005608e-05, + "loss": 4.8578, + "step": 41268 + }, + { + "epoch": 0.24543843372347512, + "grad_norm": 1.770370364189148, + "learning_rate": 4.292950252788161e-05, + "loss": 4.7744, + "step": 41269 + }, + { + "epoch": 0.2454443810067561, + "grad_norm": 1.9236057996749878, + "learning_rate": 4.292917700944813e-05, + "loss": 3.7888, + "step": 41270 + }, + { + "epoch": 0.2454503282900371, + "grad_norm": 1.9749789237976074, + "learning_rate": 4.2928851484755764e-05, + "loss": 3.0964, + "step": 41271 + }, + { + "epoch": 0.24545627557331812, + "grad_norm": 2.01385498046875, + "learning_rate": 4.2928525953804624e-05, + "loss": 3.2013, + "step": 41272 + }, + { + "epoch": 0.2454622228565991, + "grad_norm": 2.055846691131592, + "learning_rate": 4.292820041659481e-05, + "loss": 3.1101, + "step": 41273 + }, + { + "epoch": 0.2454681701398801, + "grad_norm": 2.270141363143921, + "learning_rate": 4.2927874873126456e-05, + "loss": 3.0158, + "step": 41274 + }, + { + "epoch": 0.2454741174231611, + "grad_norm": 2.3147029876708984, + "learning_rate": 4.2927549323399665e-05, + "loss": 3.8985, + "step": 41275 + }, + { + "epoch": 0.2454800647064421, + "grad_norm": 2.0988235473632812, + "learning_rate": 4.292722376741455e-05, + "loss": 3.5468, + "step": 41276 + }, + { + "epoch": 0.24548601198972309, + "grad_norm": 1.8630890846252441, + "learning_rate": 4.2926898205171226e-05, + "loss": 3.6264, + "step": 41277 + }, + { + "epoch": 0.2454919592730041, + "grad_norm": 1.7082550525665283, + "learning_rate": 4.29265726366698e-05, + "loss": 3.3496, + "step": 41278 + }, + { + "epoch": 0.2454979065562851, + "grad_norm": 1.707007646560669, + "learning_rate": 4.292624706191041e-05, + "loss": 3.6066, + "step": 41279 + }, + { + "epoch": 0.24550385383956608, + "grad_norm": 1.660125494003296, + "learning_rate": 4.2925921480893136e-05, + "loss": 3.4233, + "step": 41280 + }, + { + "epoch": 0.2455098011228471, + "grad_norm": 1.701245903968811, + "learning_rate": 4.2925595893618114e-05, + "loss": 3.4428, + "step": 41281 + }, + { + "epoch": 0.24551574840612808, + "grad_norm": 1.5797131061553955, + "learning_rate": 4.292527030008545e-05, + "loss": 3.4532, + "step": 41282 + }, + { + "epoch": 0.24552169568940907, + "grad_norm": 1.6474331617355347, + "learning_rate": 4.292494470029526e-05, + "loss": 3.3225, + "step": 41283 + }, + { + "epoch": 0.24552764297269009, + "grad_norm": 1.8327745199203491, + "learning_rate": 4.2924619094247666e-05, + "loss": 3.4901, + "step": 41284 + }, + { + "epoch": 0.24553359025597107, + "grad_norm": 1.5839154720306396, + "learning_rate": 4.292429348194276e-05, + "loss": 3.5175, + "step": 41285 + }, + { + "epoch": 0.24553953753925206, + "grad_norm": 1.614549160003662, + "learning_rate": 4.292396786338068e-05, + "loss": 3.3579, + "step": 41286 + }, + { + "epoch": 0.24554548482253308, + "grad_norm": 1.7751861810684204, + "learning_rate": 4.2923642238561515e-05, + "loss": 3.3254, + "step": 41287 + }, + { + "epoch": 0.24555143210581407, + "grad_norm": 1.7855550050735474, + "learning_rate": 4.2923316607485396e-05, + "loss": 4.3098, + "step": 41288 + }, + { + "epoch": 0.24555737938909505, + "grad_norm": 1.803556203842163, + "learning_rate": 4.292299097015244e-05, + "loss": 4.646, + "step": 41289 + }, + { + "epoch": 0.24556332667237607, + "grad_norm": 1.922265887260437, + "learning_rate": 4.292266532656275e-05, + "loss": 4.3824, + "step": 41290 + }, + { + "epoch": 0.24556927395565706, + "grad_norm": 1.5468477010726929, + "learning_rate": 4.292233967671644e-05, + "loss": 4.8942, + "step": 41291 + }, + { + "epoch": 0.24557522123893805, + "grad_norm": 2.930257558822632, + "learning_rate": 4.292201402061362e-05, + "loss": 3.9007, + "step": 41292 + }, + { + "epoch": 0.24558116852221906, + "grad_norm": 1.744093656539917, + "learning_rate": 4.292168835825442e-05, + "loss": 4.7998, + "step": 41293 + }, + { + "epoch": 0.24558711580550005, + "grad_norm": 1.8667901754379272, + "learning_rate": 4.292136268963894e-05, + "loss": 3.7838, + "step": 41294 + }, + { + "epoch": 0.24559306308878104, + "grad_norm": 1.863940715789795, + "learning_rate": 4.2921037014767293e-05, + "loss": 3.575, + "step": 41295 + }, + { + "epoch": 0.24559901037206205, + "grad_norm": 1.942530632019043, + "learning_rate": 4.2920711333639604e-05, + "loss": 3.7234, + "step": 41296 + }, + { + "epoch": 0.24560495765534304, + "grad_norm": 1.8463658094406128, + "learning_rate": 4.292038564625598e-05, + "loss": 3.8096, + "step": 41297 + }, + { + "epoch": 0.24561090493862403, + "grad_norm": 1.7197407484054565, + "learning_rate": 4.292005995261653e-05, + "loss": 3.4733, + "step": 41298 + }, + { + "epoch": 0.24561685222190505, + "grad_norm": 1.6365114450454712, + "learning_rate": 4.2919734252721375e-05, + "loss": 3.5513, + "step": 41299 + }, + { + "epoch": 0.24562279950518603, + "grad_norm": 1.886258840560913, + "learning_rate": 4.2919408546570625e-05, + "loss": 3.3417, + "step": 41300 + }, + { + "epoch": 0.24562874678846702, + "grad_norm": 1.886362910270691, + "learning_rate": 4.2919082834164394e-05, + "loss": 4.1528, + "step": 41301 + }, + { + "epoch": 0.24563469407174804, + "grad_norm": 1.4107470512390137, + "learning_rate": 4.29187571155028e-05, + "loss": 4.6982, + "step": 41302 + }, + { + "epoch": 0.24564064135502903, + "grad_norm": 2.3657891750335693, + "learning_rate": 4.2918431390585944e-05, + "loss": 3.697, + "step": 41303 + }, + { + "epoch": 0.24564658863831002, + "grad_norm": 2.734924554824829, + "learning_rate": 4.291810565941396e-05, + "loss": 2.953, + "step": 41304 + }, + { + "epoch": 0.24565253592159103, + "grad_norm": 2.3490958213806152, + "learning_rate": 4.291777992198695e-05, + "loss": 3.0114, + "step": 41305 + }, + { + "epoch": 0.24565848320487202, + "grad_norm": 2.4048731327056885, + "learning_rate": 4.2917454178305014e-05, + "loss": 2.9227, + "step": 41306 + }, + { + "epoch": 0.245664430488153, + "grad_norm": 2.707777500152588, + "learning_rate": 4.291712842836829e-05, + "loss": 2.9642, + "step": 41307 + }, + { + "epoch": 0.245670377771434, + "grad_norm": 2.707149028778076, + "learning_rate": 4.291680267217689e-05, + "loss": 2.8176, + "step": 41308 + }, + { + "epoch": 0.245676325054715, + "grad_norm": 1.9388954639434814, + "learning_rate": 4.2916476909730905e-05, + "loss": 3.9478, + "step": 41309 + }, + { + "epoch": 0.245682272337996, + "grad_norm": 2.0490567684173584, + "learning_rate": 4.291615114103047e-05, + "loss": 5.1891, + "step": 41310 + }, + { + "epoch": 0.245688219621277, + "grad_norm": 1.7325481176376343, + "learning_rate": 4.291582536607569e-05, + "loss": 5.0308, + "step": 41311 + }, + { + "epoch": 0.245694166904558, + "grad_norm": 1.8592088222503662, + "learning_rate": 4.291549958486668e-05, + "loss": 4.311, + "step": 41312 + }, + { + "epoch": 0.245700114187839, + "grad_norm": 1.66029953956604, + "learning_rate": 4.291517379740355e-05, + "loss": 4.1044, + "step": 41313 + }, + { + "epoch": 0.24570606147111998, + "grad_norm": 1.7004597187042236, + "learning_rate": 4.2914848003686425e-05, + "loss": 3.9538, + "step": 41314 + }, + { + "epoch": 0.245712008754401, + "grad_norm": 1.6518200635910034, + "learning_rate": 4.291452220371541e-05, + "loss": 4.1111, + "step": 41315 + }, + { + "epoch": 0.24571795603768198, + "grad_norm": 1.7953903675079346, + "learning_rate": 4.2914196397490626e-05, + "loss": 4.0902, + "step": 41316 + }, + { + "epoch": 0.24572390332096297, + "grad_norm": 2.02091121673584, + "learning_rate": 4.2913870585012176e-05, + "loss": 4.3198, + "step": 41317 + }, + { + "epoch": 0.245729850604244, + "grad_norm": 3.080003023147583, + "learning_rate": 4.2913544766280176e-05, + "loss": 3.5521, + "step": 41318 + }, + { + "epoch": 0.24573579788752498, + "grad_norm": 2.626922607421875, + "learning_rate": 4.291321894129474e-05, + "loss": 3.1989, + "step": 41319 + }, + { + "epoch": 0.24574174517080596, + "grad_norm": 2.9860098361968994, + "learning_rate": 4.291289311005599e-05, + "loss": 3.0656, + "step": 41320 + }, + { + "epoch": 0.24574769245408698, + "grad_norm": 3.361588954925537, + "learning_rate": 4.291256727256403e-05, + "loss": 3.4866, + "step": 41321 + }, + { + "epoch": 0.24575363973736797, + "grad_norm": 2.7741544246673584, + "learning_rate": 4.291224142881898e-05, + "loss": 3.3805, + "step": 41322 + }, + { + "epoch": 0.24575958702064896, + "grad_norm": 2.5784263610839844, + "learning_rate": 4.291191557882095e-05, + "loss": 3.2413, + "step": 41323 + }, + { + "epoch": 0.24576553430392997, + "grad_norm": 2.6534743309020996, + "learning_rate": 4.291158972257006e-05, + "loss": 3.2255, + "step": 41324 + }, + { + "epoch": 0.24577148158721096, + "grad_norm": 2.7462055683135986, + "learning_rate": 4.291126386006642e-05, + "loss": 2.9782, + "step": 41325 + }, + { + "epoch": 0.24577742887049195, + "grad_norm": 2.5625321865081787, + "learning_rate": 4.291093799131013e-05, + "loss": 2.968, + "step": 41326 + }, + { + "epoch": 0.24578337615377296, + "grad_norm": 2.6306958198547363, + "learning_rate": 4.291061211630133e-05, + "loss": 3.1492, + "step": 41327 + }, + { + "epoch": 0.24578932343705395, + "grad_norm": 2.235661268234253, + "learning_rate": 4.291028623504012e-05, + "loss": 3.2123, + "step": 41328 + }, + { + "epoch": 0.24579527072033494, + "grad_norm": 2.3616068363189697, + "learning_rate": 4.2909960347526605e-05, + "loss": 3.0685, + "step": 41329 + }, + { + "epoch": 0.24580121800361596, + "grad_norm": 1.7451351881027222, + "learning_rate": 4.2909634453760916e-05, + "loss": 4.9747, + "step": 41330 + }, + { + "epoch": 0.24580716528689694, + "grad_norm": 1.7776366472244263, + "learning_rate": 4.2909308553743156e-05, + "loss": 4.6492, + "step": 41331 + }, + { + "epoch": 0.24581311257017793, + "grad_norm": 2.358022451400757, + "learning_rate": 4.290898264747344e-05, + "loss": 3.5503, + "step": 41332 + }, + { + "epoch": 0.24581905985345895, + "grad_norm": 2.1874308586120605, + "learning_rate": 4.2908656734951885e-05, + "loss": 3.4157, + "step": 41333 + }, + { + "epoch": 0.24582500713673994, + "grad_norm": 1.5560458898544312, + "learning_rate": 4.29083308161786e-05, + "loss": 4.4183, + "step": 41334 + }, + { + "epoch": 0.24583095442002093, + "grad_norm": 1.5695600509643555, + "learning_rate": 4.290800489115371e-05, + "loss": 4.5047, + "step": 41335 + }, + { + "epoch": 0.24583690170330194, + "grad_norm": 1.5378185510635376, + "learning_rate": 4.2907678959877315e-05, + "loss": 5.0204, + "step": 41336 + }, + { + "epoch": 0.24584284898658293, + "grad_norm": 2.545009136199951, + "learning_rate": 4.290735302234953e-05, + "loss": 4.6532, + "step": 41337 + }, + { + "epoch": 0.24584879626986392, + "grad_norm": 2.2433645725250244, + "learning_rate": 4.290702707857048e-05, + "loss": 4.3877, + "step": 41338 + }, + { + "epoch": 0.24585474355314493, + "grad_norm": 1.910528540611267, + "learning_rate": 4.290670112854027e-05, + "loss": 4.6029, + "step": 41339 + }, + { + "epoch": 0.24586069083642592, + "grad_norm": 1.5221855640411377, + "learning_rate": 4.290637517225901e-05, + "loss": 4.4521, + "step": 41340 + }, + { + "epoch": 0.2458666381197069, + "grad_norm": 1.6966966390609741, + "learning_rate": 4.290604920972682e-05, + "loss": 4.4465, + "step": 41341 + }, + { + "epoch": 0.24587258540298793, + "grad_norm": 1.9209175109863281, + "learning_rate": 4.290572324094382e-05, + "loss": 4.1481, + "step": 41342 + }, + { + "epoch": 0.2458785326862689, + "grad_norm": 1.9980040788650513, + "learning_rate": 4.290539726591011e-05, + "loss": 4.1367, + "step": 41343 + }, + { + "epoch": 0.2458844799695499, + "grad_norm": 2.0062034130096436, + "learning_rate": 4.290507128462582e-05, + "loss": 4.0393, + "step": 41344 + }, + { + "epoch": 0.24589042725283092, + "grad_norm": 1.5066325664520264, + "learning_rate": 4.290474529709105e-05, + "loss": 4.0303, + "step": 41345 + }, + { + "epoch": 0.2458963745361119, + "grad_norm": 1.6619290113449097, + "learning_rate": 4.290441930330592e-05, + "loss": 4.2671, + "step": 41346 + }, + { + "epoch": 0.2459023218193929, + "grad_norm": 1.5947566032409668, + "learning_rate": 4.290409330327054e-05, + "loss": 4.0346, + "step": 41347 + }, + { + "epoch": 0.2459082691026739, + "grad_norm": 1.7103302478790283, + "learning_rate": 4.290376729698502e-05, + "loss": 4.184, + "step": 41348 + }, + { + "epoch": 0.2459142163859549, + "grad_norm": 1.9664976596832275, + "learning_rate": 4.2903441284449485e-05, + "loss": 5.2688, + "step": 41349 + }, + { + "epoch": 0.24592016366923589, + "grad_norm": 1.7853831052780151, + "learning_rate": 4.290311526566405e-05, + "loss": 5.1327, + "step": 41350 + }, + { + "epoch": 0.2459261109525169, + "grad_norm": 1.9614903926849365, + "learning_rate": 4.2902789240628815e-05, + "loss": 5.1291, + "step": 41351 + }, + { + "epoch": 0.2459320582357979, + "grad_norm": 1.7202919721603394, + "learning_rate": 4.2902463209343905e-05, + "loss": 5.0616, + "step": 41352 + }, + { + "epoch": 0.24593800551907888, + "grad_norm": 1.4817719459533691, + "learning_rate": 4.2902137171809424e-05, + "loss": 5.1021, + "step": 41353 + }, + { + "epoch": 0.2459439528023599, + "grad_norm": 3.3103199005126953, + "learning_rate": 4.29018111280255e-05, + "loss": 4.0005, + "step": 41354 + }, + { + "epoch": 0.24594990008564088, + "grad_norm": 1.43356192111969, + "learning_rate": 4.2901485077992234e-05, + "loss": 5.2535, + "step": 41355 + }, + { + "epoch": 0.24595584736892187, + "grad_norm": 1.6131337881088257, + "learning_rate": 4.290115902170975e-05, + "loss": 5.384, + "step": 41356 + }, + { + "epoch": 0.2459617946522029, + "grad_norm": 1.5717452764511108, + "learning_rate": 4.290083295917815e-05, + "loss": 5.2935, + "step": 41357 + }, + { + "epoch": 0.24596774193548387, + "grad_norm": 1.7218366861343384, + "learning_rate": 4.2900506890397555e-05, + "loss": 4.809, + "step": 41358 + }, + { + "epoch": 0.24597368921876486, + "grad_norm": 1.8395469188690186, + "learning_rate": 4.2900180815368076e-05, + "loss": 4.7941, + "step": 41359 + }, + { + "epoch": 0.24597963650204588, + "grad_norm": 1.3671373128890991, + "learning_rate": 4.289985473408983e-05, + "loss": 4.8297, + "step": 41360 + }, + { + "epoch": 0.24598558378532687, + "grad_norm": 1.6784130334854126, + "learning_rate": 4.289952864656294e-05, + "loss": 4.9523, + "step": 41361 + }, + { + "epoch": 0.24599153106860785, + "grad_norm": 1.7584325075149536, + "learning_rate": 4.2899202552787494e-05, + "loss": 4.8671, + "step": 41362 + }, + { + "epoch": 0.24599747835188887, + "grad_norm": 1.695522427558899, + "learning_rate": 4.2898876452763625e-05, + "loss": 4.7995, + "step": 41363 + }, + { + "epoch": 0.24600342563516986, + "grad_norm": 1.725178599357605, + "learning_rate": 4.289855034649145e-05, + "loss": 4.7757, + "step": 41364 + }, + { + "epoch": 0.24600937291845085, + "grad_norm": 1.5508530139923096, + "learning_rate": 4.2898224233971074e-05, + "loss": 4.7327, + "step": 41365 + }, + { + "epoch": 0.24601532020173184, + "grad_norm": 1.4109222888946533, + "learning_rate": 4.289789811520261e-05, + "loss": 4.6937, + "step": 41366 + }, + { + "epoch": 0.24602126748501285, + "grad_norm": 1.657230257987976, + "learning_rate": 4.289757199018617e-05, + "loss": 4.723, + "step": 41367 + }, + { + "epoch": 0.24602721476829384, + "grad_norm": 1.6058872938156128, + "learning_rate": 4.289724585892188e-05, + "loss": 4.6303, + "step": 41368 + }, + { + "epoch": 0.24603316205157483, + "grad_norm": 1.4774922132492065, + "learning_rate": 4.289691972140985e-05, + "loss": 4.5082, + "step": 41369 + }, + { + "epoch": 0.24603910933485584, + "grad_norm": 1.7104802131652832, + "learning_rate": 4.289659357765018e-05, + "loss": 5.4016, + "step": 41370 + }, + { + "epoch": 0.24604505661813683, + "grad_norm": 1.724193811416626, + "learning_rate": 4.2896267427643e-05, + "loss": 5.0259, + "step": 41371 + }, + { + "epoch": 0.24605100390141782, + "grad_norm": 2.5920002460479736, + "learning_rate": 4.289594127138842e-05, + "loss": 3.3145, + "step": 41372 + }, + { + "epoch": 0.24605695118469884, + "grad_norm": 2.7469708919525146, + "learning_rate": 4.2895615108886544e-05, + "loss": 2.8388, + "step": 41373 + }, + { + "epoch": 0.24606289846797982, + "grad_norm": 2.5402188301086426, + "learning_rate": 4.2895288940137504e-05, + "loss": 3.0461, + "step": 41374 + }, + { + "epoch": 0.2460688457512608, + "grad_norm": 2.536111831665039, + "learning_rate": 4.289496276514139e-05, + "loss": 3.0833, + "step": 41375 + }, + { + "epoch": 0.24607479303454183, + "grad_norm": 2.442873954772949, + "learning_rate": 4.289463658389834e-05, + "loss": 2.8886, + "step": 41376 + }, + { + "epoch": 0.24608074031782282, + "grad_norm": 2.245288610458374, + "learning_rate": 4.289431039640845e-05, + "loss": 2.9276, + "step": 41377 + }, + { + "epoch": 0.2460866876011038, + "grad_norm": 2.2979931831359863, + "learning_rate": 4.289398420267185e-05, + "loss": 2.9733, + "step": 41378 + }, + { + "epoch": 0.24609263488438482, + "grad_norm": 1.9695994853973389, + "learning_rate": 4.289365800268864e-05, + "loss": 4.0488, + "step": 41379 + }, + { + "epoch": 0.2460985821676658, + "grad_norm": 1.661198377609253, + "learning_rate": 4.289333179645894e-05, + "loss": 5.0065, + "step": 41380 + }, + { + "epoch": 0.2461045294509468, + "grad_norm": 2.5014381408691406, + "learning_rate": 4.289300558398286e-05, + "loss": 3.5984, + "step": 41381 + }, + { + "epoch": 0.2461104767342278, + "grad_norm": 2.25174617767334, + "learning_rate": 4.289267936526052e-05, + "loss": 3.6635, + "step": 41382 + }, + { + "epoch": 0.2461164240175088, + "grad_norm": 1.797683835029602, + "learning_rate": 4.289235314029203e-05, + "loss": 4.5333, + "step": 41383 + }, + { + "epoch": 0.2461223713007898, + "grad_norm": 1.9048019647598267, + "learning_rate": 4.28920269090775e-05, + "loss": 4.5948, + "step": 41384 + }, + { + "epoch": 0.2461283185840708, + "grad_norm": 1.9738743305206299, + "learning_rate": 4.2891700671617053e-05, + "loss": 4.7325, + "step": 41385 + }, + { + "epoch": 0.2461342658673518, + "grad_norm": 1.8170987367630005, + "learning_rate": 4.2891374427910795e-05, + "loss": 4.5803, + "step": 41386 + }, + { + "epoch": 0.24614021315063278, + "grad_norm": 1.8157132863998413, + "learning_rate": 4.289104817795885e-05, + "loss": 4.7475, + "step": 41387 + }, + { + "epoch": 0.2461461604339138, + "grad_norm": 1.8674625158309937, + "learning_rate": 4.2890721921761314e-05, + "loss": 4.928, + "step": 41388 + }, + { + "epoch": 0.24615210771719478, + "grad_norm": 2.205418586730957, + "learning_rate": 4.289039565931832e-05, + "loss": 4.0812, + "step": 41389 + }, + { + "epoch": 0.24615805500047577, + "grad_norm": 2.510709524154663, + "learning_rate": 4.289006939062997e-05, + "loss": 3.4493, + "step": 41390 + }, + { + "epoch": 0.2461640022837568, + "grad_norm": 2.3947231769561768, + "learning_rate": 4.288974311569639e-05, + "loss": 3.3019, + "step": 41391 + }, + { + "epoch": 0.24616994956703778, + "grad_norm": 2.386324644088745, + "learning_rate": 4.288941683451767e-05, + "loss": 3.6027, + "step": 41392 + }, + { + "epoch": 0.24617589685031877, + "grad_norm": 2.362227439880371, + "learning_rate": 4.2889090547093954e-05, + "loss": 3.4731, + "step": 41393 + }, + { + "epoch": 0.24618184413359978, + "grad_norm": 2.3975284099578857, + "learning_rate": 4.288876425342533e-05, + "loss": 3.2023, + "step": 41394 + }, + { + "epoch": 0.24618779141688077, + "grad_norm": 1.9983441829681396, + "learning_rate": 4.288843795351193e-05, + "loss": 3.2785, + "step": 41395 + }, + { + "epoch": 0.24619373870016176, + "grad_norm": 1.6496403217315674, + "learning_rate": 4.2888111647353865e-05, + "loss": 4.4661, + "step": 41396 + }, + { + "epoch": 0.24619968598344277, + "grad_norm": 1.6973501443862915, + "learning_rate": 4.2887785334951236e-05, + "loss": 5.0356, + "step": 41397 + }, + { + "epoch": 0.24620563326672376, + "grad_norm": 2.176926612854004, + "learning_rate": 4.2887459016304165e-05, + "loss": 4.3611, + "step": 41398 + }, + { + "epoch": 0.24621158055000475, + "grad_norm": 2.118968963623047, + "learning_rate": 4.2887132691412773e-05, + "loss": 3.8369, + "step": 41399 + }, + { + "epoch": 0.24621752783328577, + "grad_norm": 1.7359727621078491, + "learning_rate": 4.2886806360277164e-05, + "loss": 4.6305, + "step": 41400 + }, + { + "epoch": 0.24622347511656675, + "grad_norm": 1.839827299118042, + "learning_rate": 4.2886480022897464e-05, + "loss": 4.4252, + "step": 41401 + }, + { + "epoch": 0.24622942239984774, + "grad_norm": 1.6231486797332764, + "learning_rate": 4.288615367927376e-05, + "loss": 4.5911, + "step": 41402 + }, + { + "epoch": 0.24623536968312876, + "grad_norm": 1.8057701587677002, + "learning_rate": 4.288582732940621e-05, + "loss": 4.3309, + "step": 41403 + }, + { + "epoch": 0.24624131696640975, + "grad_norm": 1.5501629114151, + "learning_rate": 4.2885500973294885e-05, + "loss": 4.4666, + "step": 41404 + }, + { + "epoch": 0.24624726424969073, + "grad_norm": 1.6201649904251099, + "learning_rate": 4.2885174610939924e-05, + "loss": 4.8269, + "step": 41405 + }, + { + "epoch": 0.24625321153297175, + "grad_norm": 2.763577938079834, + "learning_rate": 4.288484824234142e-05, + "loss": 3.5564, + "step": 41406 + }, + { + "epoch": 0.24625915881625274, + "grad_norm": 2.48594069480896, + "learning_rate": 4.2884521867499516e-05, + "loss": 3.6224, + "step": 41407 + }, + { + "epoch": 0.24626510609953373, + "grad_norm": 2.29768705368042, + "learning_rate": 4.2884195486414306e-05, + "loss": 3.4872, + "step": 41408 + }, + { + "epoch": 0.24627105338281474, + "grad_norm": 2.11868953704834, + "learning_rate": 4.288386909908591e-05, + "loss": 3.4964, + "step": 41409 + }, + { + "epoch": 0.24627700066609573, + "grad_norm": 1.7269035577774048, + "learning_rate": 4.2883542705514435e-05, + "loss": 4.7874, + "step": 41410 + }, + { + "epoch": 0.24628294794937672, + "grad_norm": 1.8122918605804443, + "learning_rate": 4.2883216305700005e-05, + "loss": 4.9118, + "step": 41411 + }, + { + "epoch": 0.24628889523265773, + "grad_norm": 1.7068716287612915, + "learning_rate": 4.288288989964272e-05, + "loss": 4.8488, + "step": 41412 + }, + { + "epoch": 0.24629484251593872, + "grad_norm": 1.4457144737243652, + "learning_rate": 4.288256348734271e-05, + "loss": 4.8033, + "step": 41413 + }, + { + "epoch": 0.2463007897992197, + "grad_norm": 1.4766167402267456, + "learning_rate": 4.288223706880008e-05, + "loss": 4.8193, + "step": 41414 + }, + { + "epoch": 0.24630673708250073, + "grad_norm": 1.603278636932373, + "learning_rate": 4.288191064401495e-05, + "loss": 5.2585, + "step": 41415 + }, + { + "epoch": 0.24631268436578171, + "grad_norm": 2.253368854522705, + "learning_rate": 4.2881584212987426e-05, + "loss": 4.5337, + "step": 41416 + }, + { + "epoch": 0.2463186316490627, + "grad_norm": 1.6935124397277832, + "learning_rate": 4.288125777571762e-05, + "loss": 5.6289, + "step": 41417 + }, + { + "epoch": 0.24632457893234372, + "grad_norm": 1.6933794021606445, + "learning_rate": 4.288093133220566e-05, + "loss": 4.8644, + "step": 41418 + }, + { + "epoch": 0.2463305262156247, + "grad_norm": 1.584424376487732, + "learning_rate": 4.2880604882451645e-05, + "loss": 5.4481, + "step": 41419 + }, + { + "epoch": 0.2463364734989057, + "grad_norm": 1.3035473823547363, + "learning_rate": 4.28802784264557e-05, + "loss": 5.2701, + "step": 41420 + }, + { + "epoch": 0.2463424207821867, + "grad_norm": 1.519955039024353, + "learning_rate": 4.287995196421793e-05, + "loss": 5.4217, + "step": 41421 + }, + { + "epoch": 0.2463483680654677, + "grad_norm": 1.4247819185256958, + "learning_rate": 4.2879625495738464e-05, + "loss": 5.6054, + "step": 41422 + }, + { + "epoch": 0.2463543153487487, + "grad_norm": 1.3893578052520752, + "learning_rate": 4.28792990210174e-05, + "loss": 5.117, + "step": 41423 + }, + { + "epoch": 0.24636026263202968, + "grad_norm": 1.9416782855987549, + "learning_rate": 4.2878972540054855e-05, + "loss": 4.1911, + "step": 41424 + }, + { + "epoch": 0.2463662099153107, + "grad_norm": 1.6644415855407715, + "learning_rate": 4.2878646052850945e-05, + "loss": 4.678, + "step": 41425 + }, + { + "epoch": 0.24637215719859168, + "grad_norm": 1.641124963760376, + "learning_rate": 4.287831955940579e-05, + "loss": 4.6987, + "step": 41426 + }, + { + "epoch": 0.24637810448187267, + "grad_norm": 1.5957715511322021, + "learning_rate": 4.287799305971949e-05, + "loss": 5.2159, + "step": 41427 + }, + { + "epoch": 0.24638405176515368, + "grad_norm": 1.818450927734375, + "learning_rate": 4.287766655379217e-05, + "loss": 5.0238, + "step": 41428 + }, + { + "epoch": 0.24638999904843467, + "grad_norm": 1.7083373069763184, + "learning_rate": 4.287734004162395e-05, + "loss": 5.0811, + "step": 41429 + }, + { + "epoch": 0.24639594633171566, + "grad_norm": 1.7492380142211914, + "learning_rate": 4.287701352321493e-05, + "loss": 5.0496, + "step": 41430 + }, + { + "epoch": 0.24640189361499668, + "grad_norm": 1.647814393043518, + "learning_rate": 4.287668699856523e-05, + "loss": 4.8725, + "step": 41431 + }, + { + "epoch": 0.24640784089827766, + "grad_norm": 1.8636233806610107, + "learning_rate": 4.2876360467674956e-05, + "loss": 5.1098, + "step": 41432 + }, + { + "epoch": 0.24641378818155865, + "grad_norm": 1.7179633378982544, + "learning_rate": 4.2876033930544234e-05, + "loss": 5.0283, + "step": 41433 + }, + { + "epoch": 0.24641973546483967, + "grad_norm": 1.5947309732437134, + "learning_rate": 4.2875707387173176e-05, + "loss": 4.9338, + "step": 41434 + }, + { + "epoch": 0.24642568274812066, + "grad_norm": 1.7582926750183105, + "learning_rate": 4.2875380837561884e-05, + "loss": 4.8757, + "step": 41435 + }, + { + "epoch": 0.24643163003140164, + "grad_norm": 1.5903277397155762, + "learning_rate": 4.287505428171049e-05, + "loss": 5.1738, + "step": 41436 + }, + { + "epoch": 0.24643757731468266, + "grad_norm": 1.8649073839187622, + "learning_rate": 4.28747277196191e-05, + "loss": 4.4861, + "step": 41437 + }, + { + "epoch": 0.24644352459796365, + "grad_norm": 2.3210461139678955, + "learning_rate": 4.287440115128782e-05, + "loss": 4.9202, + "step": 41438 + }, + { + "epoch": 0.24644947188124464, + "grad_norm": 1.5786616802215576, + "learning_rate": 4.287407457671678e-05, + "loss": 4.4372, + "step": 41439 + }, + { + "epoch": 0.24645541916452565, + "grad_norm": 1.4731504917144775, + "learning_rate": 4.287374799590608e-05, + "loss": 4.7791, + "step": 41440 + }, + { + "epoch": 0.24646136644780664, + "grad_norm": 1.8809871673583984, + "learning_rate": 4.2873421408855844e-05, + "loss": 4.9031, + "step": 41441 + }, + { + "epoch": 0.24646731373108763, + "grad_norm": 1.6248157024383545, + "learning_rate": 4.287309481556617e-05, + "loss": 4.7323, + "step": 41442 + }, + { + "epoch": 0.24647326101436864, + "grad_norm": 1.5632293224334717, + "learning_rate": 4.2872768216037196e-05, + "loss": 4.8241, + "step": 41443 + }, + { + "epoch": 0.24647920829764963, + "grad_norm": 1.551485300064087, + "learning_rate": 4.2872441610269016e-05, + "loss": 4.7961, + "step": 41444 + }, + { + "epoch": 0.24648515558093062, + "grad_norm": 1.51264488697052, + "learning_rate": 4.287211499826175e-05, + "loss": 4.7999, + "step": 41445 + }, + { + "epoch": 0.24649110286421164, + "grad_norm": 1.5302774906158447, + "learning_rate": 4.2871788380015523e-05, + "loss": 4.8487, + "step": 41446 + }, + { + "epoch": 0.24649705014749262, + "grad_norm": 1.3063452243804932, + "learning_rate": 4.287146175553043e-05, + "loss": 4.8119, + "step": 41447 + }, + { + "epoch": 0.2465029974307736, + "grad_norm": 1.5256316661834717, + "learning_rate": 4.287113512480659e-05, + "loss": 4.6335, + "step": 41448 + }, + { + "epoch": 0.24650894471405463, + "grad_norm": 1.6915894746780396, + "learning_rate": 4.287080848784414e-05, + "loss": 4.8614, + "step": 41449 + }, + { + "epoch": 0.24651489199733562, + "grad_norm": 1.7030476331710815, + "learning_rate": 4.287048184464316e-05, + "loss": 4.8858, + "step": 41450 + }, + { + "epoch": 0.2465208392806166, + "grad_norm": 1.638736367225647, + "learning_rate": 4.287015519520378e-05, + "loss": 4.6359, + "step": 41451 + }, + { + "epoch": 0.24652678656389762, + "grad_norm": 2.3777272701263428, + "learning_rate": 4.286982853952612e-05, + "loss": 4.4567, + "step": 41452 + }, + { + "epoch": 0.2465327338471786, + "grad_norm": 1.6538316011428833, + "learning_rate": 4.2869501877610284e-05, + "loss": 4.8817, + "step": 41453 + }, + { + "epoch": 0.2465386811304596, + "grad_norm": 1.6781548261642456, + "learning_rate": 4.286917520945639e-05, + "loss": 4.9174, + "step": 41454 + }, + { + "epoch": 0.2465446284137406, + "grad_norm": 1.6529505252838135, + "learning_rate": 4.286884853506455e-05, + "loss": 5.3633, + "step": 41455 + }, + { + "epoch": 0.2465505756970216, + "grad_norm": 3.0862789154052734, + "learning_rate": 4.286852185443488e-05, + "loss": 4.3292, + "step": 41456 + }, + { + "epoch": 0.2465565229803026, + "grad_norm": 2.499046564102173, + "learning_rate": 4.2868195167567495e-05, + "loss": 4.299, + "step": 41457 + }, + { + "epoch": 0.2465624702635836, + "grad_norm": 1.6349997520446777, + "learning_rate": 4.2867868474462506e-05, + "loss": 5.1009, + "step": 41458 + }, + { + "epoch": 0.2465684175468646, + "grad_norm": 1.59446120262146, + "learning_rate": 4.286754177512003e-05, + "loss": 5.0926, + "step": 41459 + }, + { + "epoch": 0.24657436483014558, + "grad_norm": 1.61570143699646, + "learning_rate": 4.286721506954018e-05, + "loss": 4.7639, + "step": 41460 + }, + { + "epoch": 0.2465803121134266, + "grad_norm": 1.542502760887146, + "learning_rate": 4.286688835772307e-05, + "loss": 5.1672, + "step": 41461 + }, + { + "epoch": 0.24658625939670759, + "grad_norm": 1.364316701889038, + "learning_rate": 4.286656163966881e-05, + "loss": 5.2594, + "step": 41462 + }, + { + "epoch": 0.24659220667998857, + "grad_norm": 1.3938337564468384, + "learning_rate": 4.286623491537752e-05, + "loss": 5.292, + "step": 41463 + }, + { + "epoch": 0.2465981539632696, + "grad_norm": 1.7389088869094849, + "learning_rate": 4.2865908184849316e-05, + "loss": 4.9581, + "step": 41464 + }, + { + "epoch": 0.24660410124655058, + "grad_norm": 1.5333000421524048, + "learning_rate": 4.2865581448084305e-05, + "loss": 4.9295, + "step": 41465 + }, + { + "epoch": 0.24661004852983157, + "grad_norm": 1.631771206855774, + "learning_rate": 4.2865254705082604e-05, + "loss": 5.0161, + "step": 41466 + }, + { + "epoch": 0.24661599581311258, + "grad_norm": 1.563915729522705, + "learning_rate": 4.286492795584433e-05, + "loss": 4.9929, + "step": 41467 + }, + { + "epoch": 0.24662194309639357, + "grad_norm": 1.4208866357803345, + "learning_rate": 4.286460120036959e-05, + "loss": 4.9965, + "step": 41468 + }, + { + "epoch": 0.24662789037967456, + "grad_norm": 1.558342456817627, + "learning_rate": 4.28642744386585e-05, + "loss": 4.8266, + "step": 41469 + }, + { + "epoch": 0.24663383766295557, + "grad_norm": 1.4343364238739014, + "learning_rate": 4.2863947670711184e-05, + "loss": 4.9031, + "step": 41470 + }, + { + "epoch": 0.24663978494623656, + "grad_norm": 1.6914136409759521, + "learning_rate": 4.286362089652775e-05, + "loss": 3.9444, + "step": 41471 + }, + { + "epoch": 0.24664573222951755, + "grad_norm": 1.875369906425476, + "learning_rate": 4.28632941161083e-05, + "loss": 4.002, + "step": 41472 + }, + { + "epoch": 0.24665167951279857, + "grad_norm": 1.7706148624420166, + "learning_rate": 4.2862967329452966e-05, + "loss": 4.2848, + "step": 41473 + }, + { + "epoch": 0.24665762679607955, + "grad_norm": 1.762385368347168, + "learning_rate": 4.286264053656185e-05, + "loss": 4.1696, + "step": 41474 + }, + { + "epoch": 0.24666357407936054, + "grad_norm": 1.8044275045394897, + "learning_rate": 4.286231373743507e-05, + "loss": 3.9788, + "step": 41475 + }, + { + "epoch": 0.24666952136264156, + "grad_norm": 1.6572545766830444, + "learning_rate": 4.286198693207275e-05, + "loss": 4.1219, + "step": 41476 + }, + { + "epoch": 0.24667546864592255, + "grad_norm": 1.5549501180648804, + "learning_rate": 4.286166012047499e-05, + "loss": 4.7072, + "step": 41477 + }, + { + "epoch": 0.24668141592920353, + "grad_norm": 1.7132916450500488, + "learning_rate": 4.2861333302641903e-05, + "loss": 4.6447, + "step": 41478 + }, + { + "epoch": 0.24668736321248455, + "grad_norm": 1.6320955753326416, + "learning_rate": 4.286100647857362e-05, + "loss": 4.7856, + "step": 41479 + }, + { + "epoch": 0.24669331049576554, + "grad_norm": 1.4338788986206055, + "learning_rate": 4.286067964827023e-05, + "loss": 5.6327, + "step": 41480 + }, + { + "epoch": 0.24669925777904653, + "grad_norm": 1.4106473922729492, + "learning_rate": 4.2860352811731876e-05, + "loss": 4.7096, + "step": 41481 + }, + { + "epoch": 0.24670520506232752, + "grad_norm": 1.5406266450881958, + "learning_rate": 4.286002596895865e-05, + "loss": 5.3074, + "step": 41482 + }, + { + "epoch": 0.24671115234560853, + "grad_norm": 2.390526533126831, + "learning_rate": 4.285969911995067e-05, + "loss": 3.6973, + "step": 41483 + }, + { + "epoch": 0.24671709962888952, + "grad_norm": 2.3091604709625244, + "learning_rate": 4.285937226470806e-05, + "loss": 3.6778, + "step": 41484 + }, + { + "epoch": 0.2467230469121705, + "grad_norm": 2.1337802410125732, + "learning_rate": 4.285904540323092e-05, + "loss": 4.2962, + "step": 41485 + }, + { + "epoch": 0.24672899419545152, + "grad_norm": 1.5539207458496094, + "learning_rate": 4.285871853551938e-05, + "loss": 4.9943, + "step": 41486 + }, + { + "epoch": 0.2467349414787325, + "grad_norm": 1.4843453168869019, + "learning_rate": 4.2858391661573543e-05, + "loss": 4.9896, + "step": 41487 + }, + { + "epoch": 0.2467408887620135, + "grad_norm": 1.6902412176132202, + "learning_rate": 4.285806478139353e-05, + "loss": 4.9135, + "step": 41488 + }, + { + "epoch": 0.24674683604529452, + "grad_norm": 1.2611852884292603, + "learning_rate": 4.2857737894979446e-05, + "loss": 4.9237, + "step": 41489 + }, + { + "epoch": 0.2467527833285755, + "grad_norm": 1.5044881105422974, + "learning_rate": 4.285741100233141e-05, + "loss": 4.7655, + "step": 41490 + }, + { + "epoch": 0.2467587306118565, + "grad_norm": 1.9322335720062256, + "learning_rate": 4.285708410344954e-05, + "loss": 4.7752, + "step": 41491 + }, + { + "epoch": 0.2467646778951375, + "grad_norm": 2.077342987060547, + "learning_rate": 4.285675719833394e-05, + "loss": 4.2554, + "step": 41492 + }, + { + "epoch": 0.2467706251784185, + "grad_norm": 1.7955752611160278, + "learning_rate": 4.285643028698474e-05, + "loss": 4.4103, + "step": 41493 + }, + { + "epoch": 0.24677657246169948, + "grad_norm": 1.9531134366989136, + "learning_rate": 4.285610336940203e-05, + "loss": 4.5822, + "step": 41494 + }, + { + "epoch": 0.2467825197449805, + "grad_norm": 1.9599354267120361, + "learning_rate": 4.285577644558595e-05, + "loss": 4.4888, + "step": 41495 + }, + { + "epoch": 0.2467884670282615, + "grad_norm": 1.8767081499099731, + "learning_rate": 4.2855449515536605e-05, + "loss": 4.7169, + "step": 41496 + }, + { + "epoch": 0.24679441431154248, + "grad_norm": 1.9004005193710327, + "learning_rate": 4.28551225792541e-05, + "loss": 4.4506, + "step": 41497 + }, + { + "epoch": 0.2468003615948235, + "grad_norm": 1.701491117477417, + "learning_rate": 4.285479563673857e-05, + "loss": 4.6447, + "step": 41498 + }, + { + "epoch": 0.24680630887810448, + "grad_norm": 1.9039287567138672, + "learning_rate": 4.28544686879901e-05, + "loss": 4.5204, + "step": 41499 + }, + { + "epoch": 0.24681225616138547, + "grad_norm": 2.1572320461273193, + "learning_rate": 4.285414173300882e-05, + "loss": 4.5656, + "step": 41500 + }, + { + "epoch": 0.24681820344466648, + "grad_norm": 1.9507417678833008, + "learning_rate": 4.285381477179485e-05, + "loss": 5.0572, + "step": 41501 + }, + { + "epoch": 0.24682415072794747, + "grad_norm": 1.7995469570159912, + "learning_rate": 4.285348780434829e-05, + "loss": 4.9742, + "step": 41502 + }, + { + "epoch": 0.24683009801122846, + "grad_norm": 1.8193576335906982, + "learning_rate": 4.285316083066927e-05, + "loss": 4.9325, + "step": 41503 + }, + { + "epoch": 0.24683604529450948, + "grad_norm": 1.404066801071167, + "learning_rate": 4.285283385075789e-05, + "loss": 5.3406, + "step": 41504 + }, + { + "epoch": 0.24684199257779046, + "grad_norm": 1.780181646347046, + "learning_rate": 4.2852506864614275e-05, + "loss": 5.0612, + "step": 41505 + }, + { + "epoch": 0.24684793986107145, + "grad_norm": 1.6208029985427856, + "learning_rate": 4.285217987223853e-05, + "loss": 4.661, + "step": 41506 + }, + { + "epoch": 0.24685388714435247, + "grad_norm": 1.6438084840774536, + "learning_rate": 4.285185287363078e-05, + "loss": 4.9083, + "step": 41507 + }, + { + "epoch": 0.24685983442763346, + "grad_norm": 1.4165291786193848, + "learning_rate": 4.2851525868791126e-05, + "loss": 5.123, + "step": 41508 + }, + { + "epoch": 0.24686578171091444, + "grad_norm": 3.2440237998962402, + "learning_rate": 4.285119885771969e-05, + "loss": 3.7803, + "step": 41509 + }, + { + "epoch": 0.24687172899419546, + "grad_norm": 2.2748913764953613, + "learning_rate": 4.285087184041659e-05, + "loss": 4.3417, + "step": 41510 + }, + { + "epoch": 0.24687767627747645, + "grad_norm": 1.5412343740463257, + "learning_rate": 4.2850544816881935e-05, + "loss": 5.0292, + "step": 41511 + }, + { + "epoch": 0.24688362356075744, + "grad_norm": 1.4306635856628418, + "learning_rate": 4.2850217787115834e-05, + "loss": 5.3081, + "step": 41512 + }, + { + "epoch": 0.24688957084403845, + "grad_norm": 1.3997128009796143, + "learning_rate": 4.28498907511184e-05, + "loss": 5.1089, + "step": 41513 + }, + { + "epoch": 0.24689551812731944, + "grad_norm": 2.91410493850708, + "learning_rate": 4.284956370888977e-05, + "loss": 3.5597, + "step": 41514 + }, + { + "epoch": 0.24690146541060043, + "grad_norm": 2.4513397216796875, + "learning_rate": 4.2849236660430025e-05, + "loss": 3.2308, + "step": 41515 + }, + { + "epoch": 0.24690741269388145, + "grad_norm": 1.8147131204605103, + "learning_rate": 4.284890960573931e-05, + "loss": 4.3899, + "step": 41516 + }, + { + "epoch": 0.24691335997716243, + "grad_norm": 1.472812294960022, + "learning_rate": 4.2848582544817717e-05, + "loss": 4.6822, + "step": 41517 + }, + { + "epoch": 0.24691930726044342, + "grad_norm": 1.528937816619873, + "learning_rate": 4.2848255477665363e-05, + "loss": 5.1308, + "step": 41518 + }, + { + "epoch": 0.24692525454372444, + "grad_norm": 1.4871689081192017, + "learning_rate": 4.284792840428238e-05, + "loss": 5.1752, + "step": 41519 + }, + { + "epoch": 0.24693120182700543, + "grad_norm": 2.3771328926086426, + "learning_rate": 4.284760132466886e-05, + "loss": 4.5928, + "step": 41520 + }, + { + "epoch": 0.2469371491102864, + "grad_norm": 1.5277022123336792, + "learning_rate": 4.284727423882494e-05, + "loss": 5.1997, + "step": 41521 + }, + { + "epoch": 0.24694309639356743, + "grad_norm": 1.5140366554260254, + "learning_rate": 4.284694714675071e-05, + "loss": 5.0481, + "step": 41522 + }, + { + "epoch": 0.24694904367684842, + "grad_norm": 1.285230040550232, + "learning_rate": 4.284662004844629e-05, + "loss": 5.2826, + "step": 41523 + }, + { + "epoch": 0.2469549909601294, + "grad_norm": 1.2479149103164673, + "learning_rate": 4.2846292943911814e-05, + "loss": 5.3551, + "step": 41524 + }, + { + "epoch": 0.24696093824341042, + "grad_norm": 1.2818599939346313, + "learning_rate": 4.2845965833147374e-05, + "loss": 5.3186, + "step": 41525 + }, + { + "epoch": 0.2469668855266914, + "grad_norm": 1.2601613998413086, + "learning_rate": 4.284563871615309e-05, + "loss": 5.3166, + "step": 41526 + }, + { + "epoch": 0.2469728328099724, + "grad_norm": 1.189115285873413, + "learning_rate": 4.2845311592929085e-05, + "loss": 5.4736, + "step": 41527 + }, + { + "epoch": 0.2469787800932534, + "grad_norm": 1.4777158498764038, + "learning_rate": 4.284498446347546e-05, + "loss": 5.3298, + "step": 41528 + }, + { + "epoch": 0.2469847273765344, + "grad_norm": 1.6576899290084839, + "learning_rate": 4.2844657327792334e-05, + "loss": 4.9878, + "step": 41529 + }, + { + "epoch": 0.2469906746598154, + "grad_norm": 1.626532793045044, + "learning_rate": 4.284433018587982e-05, + "loss": 4.8777, + "step": 41530 + }, + { + "epoch": 0.2469966219430964, + "grad_norm": 1.5477149486541748, + "learning_rate": 4.284400303773805e-05, + "loss": 5.4019, + "step": 41531 + }, + { + "epoch": 0.2470025692263774, + "grad_norm": 1.4073281288146973, + "learning_rate": 4.28436758833671e-05, + "loss": 5.2885, + "step": 41532 + }, + { + "epoch": 0.24700851650965838, + "grad_norm": 1.4984304904937744, + "learning_rate": 4.284334872276712e-05, + "loss": 5.1378, + "step": 41533 + }, + { + "epoch": 0.2470144637929394, + "grad_norm": 1.4340741634368896, + "learning_rate": 4.284302155593821e-05, + "loss": 4.8074, + "step": 41534 + }, + { + "epoch": 0.2470204110762204, + "grad_norm": 1.3787440061569214, + "learning_rate": 4.284269438288049e-05, + "loss": 5.4027, + "step": 41535 + }, + { + "epoch": 0.24702635835950137, + "grad_norm": 1.7872439622879028, + "learning_rate": 4.2842367203594064e-05, + "loss": 4.5259, + "step": 41536 + }, + { + "epoch": 0.2470323056427824, + "grad_norm": 1.481523036956787, + "learning_rate": 4.2842040018079054e-05, + "loss": 5.1352, + "step": 41537 + }, + { + "epoch": 0.24703825292606338, + "grad_norm": 1.6009521484375, + "learning_rate": 4.284171282633557e-05, + "loss": 4.5765, + "step": 41538 + }, + { + "epoch": 0.24704420020934437, + "grad_norm": 2.60427188873291, + "learning_rate": 4.2841385628363726e-05, + "loss": 2.074, + "step": 41539 + }, + { + "epoch": 0.24705014749262535, + "grad_norm": 1.923537015914917, + "learning_rate": 4.284105842416365e-05, + "loss": 4.0844, + "step": 41540 + }, + { + "epoch": 0.24705609477590637, + "grad_norm": 2.742117166519165, + "learning_rate": 4.284073121373544e-05, + "loss": 2.2021, + "step": 41541 + }, + { + "epoch": 0.24706204205918736, + "grad_norm": 2.8925631046295166, + "learning_rate": 4.284040399707921e-05, + "loss": 1.9525, + "step": 41542 + }, + { + "epoch": 0.24706798934246835, + "grad_norm": 2.7862610816955566, + "learning_rate": 4.2840076774195084e-05, + "loss": 2.0348, + "step": 41543 + }, + { + "epoch": 0.24707393662574936, + "grad_norm": 2.685729503631592, + "learning_rate": 4.283974954508317e-05, + "loss": 2.0564, + "step": 41544 + }, + { + "epoch": 0.24707988390903035, + "grad_norm": 2.6670422554016113, + "learning_rate": 4.283942230974358e-05, + "loss": 2.1483, + "step": 41545 + }, + { + "epoch": 0.24708583119231134, + "grad_norm": 2.7850704193115234, + "learning_rate": 4.2839095068176436e-05, + "loss": 1.9493, + "step": 41546 + }, + { + "epoch": 0.24709177847559236, + "grad_norm": 2.7873125076293945, + "learning_rate": 4.2838767820381845e-05, + "loss": 2.3616, + "step": 41547 + }, + { + "epoch": 0.24709772575887334, + "grad_norm": 2.7237610816955566, + "learning_rate": 4.283844056635993e-05, + "loss": 2.0008, + "step": 41548 + }, + { + "epoch": 0.24710367304215433, + "grad_norm": 2.8882663249969482, + "learning_rate": 4.28381133061108e-05, + "loss": 1.6532, + "step": 41549 + }, + { + "epoch": 0.24710962032543535, + "grad_norm": 2.8746354579925537, + "learning_rate": 4.283778603963456e-05, + "loss": 1.7217, + "step": 41550 + }, + { + "epoch": 0.24711556760871634, + "grad_norm": 2.6724801063537598, + "learning_rate": 4.2837458766931335e-05, + "loss": 1.3737, + "step": 41551 + }, + { + "epoch": 0.24712151489199732, + "grad_norm": 2.7616825103759766, + "learning_rate": 4.283713148800125e-05, + "loss": 1.3785, + "step": 41552 + }, + { + "epoch": 0.24712746217527834, + "grad_norm": 3.0792760848999023, + "learning_rate": 4.2836804202844395e-05, + "loss": 1.5478, + "step": 41553 + }, + { + "epoch": 0.24713340945855933, + "grad_norm": 3.2739858627319336, + "learning_rate": 4.283647691146089e-05, + "loss": 2.0787, + "step": 41554 + }, + { + "epoch": 0.24713935674184032, + "grad_norm": 3.249685525894165, + "learning_rate": 4.283614961385087e-05, + "loss": 2.2913, + "step": 41555 + }, + { + "epoch": 0.24714530402512133, + "grad_norm": 3.302060604095459, + "learning_rate": 4.2835822310014426e-05, + "loss": 2.0036, + "step": 41556 + }, + { + "epoch": 0.24715125130840232, + "grad_norm": 3.252377986907959, + "learning_rate": 4.2835494999951685e-05, + "loss": 1.8167, + "step": 41557 + }, + { + "epoch": 0.2471571985916833, + "grad_norm": 2.0271360874176025, + "learning_rate": 4.283516768366276e-05, + "loss": 5.063, + "step": 41558 + }, + { + "epoch": 0.24716314587496432, + "grad_norm": 2.049788236618042, + "learning_rate": 4.283484036114775e-05, + "loss": 5.5499, + "step": 41559 + }, + { + "epoch": 0.2471690931582453, + "grad_norm": 1.741096019744873, + "learning_rate": 4.283451303240679e-05, + "loss": 5.5656, + "step": 41560 + }, + { + "epoch": 0.2471750404415263, + "grad_norm": 1.63157320022583, + "learning_rate": 4.2834185697439986e-05, + "loss": 5.3934, + "step": 41561 + }, + { + "epoch": 0.24718098772480732, + "grad_norm": 1.5827828645706177, + "learning_rate": 4.283385835624745e-05, + "loss": 5.5183, + "step": 41562 + }, + { + "epoch": 0.2471869350080883, + "grad_norm": 1.8760567903518677, + "learning_rate": 4.2833531008829296e-05, + "loss": 5.1018, + "step": 41563 + }, + { + "epoch": 0.2471928822913693, + "grad_norm": 1.9263092279434204, + "learning_rate": 4.2833203655185646e-05, + "loss": 5.1563, + "step": 41564 + }, + { + "epoch": 0.2471988295746503, + "grad_norm": 1.824190616607666, + "learning_rate": 4.28328762953166e-05, + "loss": 5.094, + "step": 41565 + }, + { + "epoch": 0.2472047768579313, + "grad_norm": 1.4991658926010132, + "learning_rate": 4.2832548929222294e-05, + "loss": 5.0506, + "step": 41566 + }, + { + "epoch": 0.24721072414121228, + "grad_norm": 1.518554449081421, + "learning_rate": 4.283222155690282e-05, + "loss": 4.7055, + "step": 41567 + }, + { + "epoch": 0.2472166714244933, + "grad_norm": 1.7035754919052124, + "learning_rate": 4.283189417835831e-05, + "loss": 4.8926, + "step": 41568 + }, + { + "epoch": 0.2472226187077743, + "grad_norm": 1.5655994415283203, + "learning_rate": 4.283156679358886e-05, + "loss": 4.9501, + "step": 41569 + }, + { + "epoch": 0.24722856599105528, + "grad_norm": 1.5640186071395874, + "learning_rate": 4.28312394025946e-05, + "loss": 5.0312, + "step": 41570 + }, + { + "epoch": 0.2472345132743363, + "grad_norm": 1.5380947589874268, + "learning_rate": 4.283091200537563e-05, + "loss": 4.873, + "step": 41571 + }, + { + "epoch": 0.24724046055761728, + "grad_norm": 1.569738507270813, + "learning_rate": 4.283058460193209e-05, + "loss": 4.7878, + "step": 41572 + }, + { + "epoch": 0.24724640784089827, + "grad_norm": 1.5573902130126953, + "learning_rate": 4.283025719226406e-05, + "loss": 4.9733, + "step": 41573 + }, + { + "epoch": 0.24725235512417928, + "grad_norm": 1.5197664499282837, + "learning_rate": 4.282992977637168e-05, + "loss": 4.6537, + "step": 41574 + }, + { + "epoch": 0.24725830240746027, + "grad_norm": 1.5234429836273193, + "learning_rate": 4.282960235425506e-05, + "loss": 4.8128, + "step": 41575 + }, + { + "epoch": 0.24726424969074126, + "grad_norm": 1.315214991569519, + "learning_rate": 4.2829274925914295e-05, + "loss": 4.9767, + "step": 41576 + }, + { + "epoch": 0.24727019697402228, + "grad_norm": 1.6761963367462158, + "learning_rate": 4.282894749134953e-05, + "loss": 5.6203, + "step": 41577 + }, + { + "epoch": 0.24727614425730327, + "grad_norm": 1.5430665016174316, + "learning_rate": 4.2828620050560853e-05, + "loss": 5.5069, + "step": 41578 + }, + { + "epoch": 0.24728209154058425, + "grad_norm": 1.8086758852005005, + "learning_rate": 4.2828292603548396e-05, + "loss": 5.2278, + "step": 41579 + }, + { + "epoch": 0.24728803882386527, + "grad_norm": 1.7571301460266113, + "learning_rate": 4.2827965150312266e-05, + "loss": 5.3366, + "step": 41580 + }, + { + "epoch": 0.24729398610714626, + "grad_norm": 3.0991084575653076, + "learning_rate": 4.282763769085257e-05, + "loss": 3.3195, + "step": 41581 + }, + { + "epoch": 0.24729993339042725, + "grad_norm": 2.703636884689331, + "learning_rate": 4.2827310225169435e-05, + "loss": 3.3058, + "step": 41582 + }, + { + "epoch": 0.24730588067370826, + "grad_norm": 2.7449722290039062, + "learning_rate": 4.282698275326297e-05, + "loss": 3.2995, + "step": 41583 + }, + { + "epoch": 0.24731182795698925, + "grad_norm": 2.5195131301879883, + "learning_rate": 4.2826655275133294e-05, + "loss": 3.3007, + "step": 41584 + }, + { + "epoch": 0.24731777524027024, + "grad_norm": 2.277855396270752, + "learning_rate": 4.282632779078051e-05, + "loss": 3.3027, + "step": 41585 + }, + { + "epoch": 0.24732372252355125, + "grad_norm": 2.4499454498291016, + "learning_rate": 4.2826000300204746e-05, + "loss": 3.6651, + "step": 41586 + }, + { + "epoch": 0.24732966980683224, + "grad_norm": 2.6884753704071045, + "learning_rate": 4.2825672803406104e-05, + "loss": 4.4218, + "step": 41587 + }, + { + "epoch": 0.24733561709011323, + "grad_norm": 1.8385380506515503, + "learning_rate": 4.28253453003847e-05, + "loss": 4.3016, + "step": 41588 + }, + { + "epoch": 0.24734156437339425, + "grad_norm": 1.8261674642562866, + "learning_rate": 4.282501779114066e-05, + "loss": 4.4568, + "step": 41589 + }, + { + "epoch": 0.24734751165667523, + "grad_norm": 1.8640961647033691, + "learning_rate": 4.2824690275674085e-05, + "loss": 4.5102, + "step": 41590 + }, + { + "epoch": 0.24735345893995622, + "grad_norm": 1.8842672109603882, + "learning_rate": 4.2824362753985096e-05, + "loss": 4.1822, + "step": 41591 + }, + { + "epoch": 0.24735940622323724, + "grad_norm": 1.590097427368164, + "learning_rate": 4.282403522607381e-05, + "loss": 4.722, + "step": 41592 + }, + { + "epoch": 0.24736535350651823, + "grad_norm": 1.4966411590576172, + "learning_rate": 4.282370769194033e-05, + "loss": 4.7253, + "step": 41593 + }, + { + "epoch": 0.24737130078979921, + "grad_norm": 1.392972469329834, + "learning_rate": 4.2823380151584785e-05, + "loss": 4.871, + "step": 41594 + }, + { + "epoch": 0.24737724807308023, + "grad_norm": 1.571126103401184, + "learning_rate": 4.282305260500728e-05, + "loss": 4.4781, + "step": 41595 + }, + { + "epoch": 0.24738319535636122, + "grad_norm": 1.620725393295288, + "learning_rate": 4.2822725052207934e-05, + "loss": 4.6088, + "step": 41596 + }, + { + "epoch": 0.2473891426396422, + "grad_norm": 1.1969114542007446, + "learning_rate": 4.282239749318685e-05, + "loss": 5.0126, + "step": 41597 + }, + { + "epoch": 0.24739508992292322, + "grad_norm": 1.4958524703979492, + "learning_rate": 4.282206992794416e-05, + "loss": 4.7766, + "step": 41598 + }, + { + "epoch": 0.2474010372062042, + "grad_norm": 1.7994697093963623, + "learning_rate": 4.282174235647997e-05, + "loss": 4.4989, + "step": 41599 + }, + { + "epoch": 0.2474069844894852, + "grad_norm": 1.4291449785232544, + "learning_rate": 4.282141477879439e-05, + "loss": 4.7013, + "step": 41600 + }, + { + "epoch": 0.2474129317727662, + "grad_norm": 1.178472876548767, + "learning_rate": 4.282108719488753e-05, + "loss": 4.667, + "step": 41601 + }, + { + "epoch": 0.2474188790560472, + "grad_norm": 1.8116931915283203, + "learning_rate": 4.282075960475953e-05, + "loss": 4.3325, + "step": 41602 + }, + { + "epoch": 0.2474248263393282, + "grad_norm": 1.8909903764724731, + "learning_rate": 4.282043200841047e-05, + "loss": 4.5865, + "step": 41603 + }, + { + "epoch": 0.24743077362260918, + "grad_norm": 2.23349928855896, + "learning_rate": 4.282010440584049e-05, + "loss": 3.6082, + "step": 41604 + }, + { + "epoch": 0.2474367209058902, + "grad_norm": 2.037947654724121, + "learning_rate": 4.28197767970497e-05, + "loss": 4.504, + "step": 41605 + }, + { + "epoch": 0.24744266818917118, + "grad_norm": 1.537375569343567, + "learning_rate": 4.281944918203821e-05, + "loss": 4.46, + "step": 41606 + }, + { + "epoch": 0.24744861547245217, + "grad_norm": 1.8853685855865479, + "learning_rate": 4.2819121560806124e-05, + "loss": 4.6, + "step": 41607 + }, + { + "epoch": 0.2474545627557332, + "grad_norm": 1.7587653398513794, + "learning_rate": 4.281879393335357e-05, + "loss": 4.5117, + "step": 41608 + }, + { + "epoch": 0.24746051003901418, + "grad_norm": 1.6597636938095093, + "learning_rate": 4.281846629968066e-05, + "loss": 4.4246, + "step": 41609 + }, + { + "epoch": 0.24746645732229516, + "grad_norm": 2.143099308013916, + "learning_rate": 4.281813865978751e-05, + "loss": 3.8686, + "step": 41610 + }, + { + "epoch": 0.24747240460557618, + "grad_norm": 1.4232978820800781, + "learning_rate": 4.2817811013674234e-05, + "loss": 4.3431, + "step": 41611 + }, + { + "epoch": 0.24747835188885717, + "grad_norm": 1.5772578716278076, + "learning_rate": 4.2817483361340936e-05, + "loss": 4.936, + "step": 41612 + }, + { + "epoch": 0.24748429917213816, + "grad_norm": 1.5004979372024536, + "learning_rate": 4.281715570278775e-05, + "loss": 4.2743, + "step": 41613 + }, + { + "epoch": 0.24749024645541917, + "grad_norm": 1.6273713111877441, + "learning_rate": 4.2816828038014774e-05, + "loss": 4.2775, + "step": 41614 + }, + { + "epoch": 0.24749619373870016, + "grad_norm": 2.0540378093719482, + "learning_rate": 4.281650036702212e-05, + "loss": 4.2464, + "step": 41615 + }, + { + "epoch": 0.24750214102198115, + "grad_norm": 2.0336031913757324, + "learning_rate": 4.281617268980992e-05, + "loss": 4.1819, + "step": 41616 + }, + { + "epoch": 0.24750808830526216, + "grad_norm": 1.5847617387771606, + "learning_rate": 4.2815845006378274e-05, + "loss": 4.4881, + "step": 41617 + }, + { + "epoch": 0.24751403558854315, + "grad_norm": 1.501085638999939, + "learning_rate": 4.28155173167273e-05, + "loss": 4.0969, + "step": 41618 + }, + { + "epoch": 0.24751998287182414, + "grad_norm": 1.613006830215454, + "learning_rate": 4.281518962085711e-05, + "loss": 4.1363, + "step": 41619 + }, + { + "epoch": 0.24752593015510516, + "grad_norm": 1.8100777864456177, + "learning_rate": 4.281486191876783e-05, + "loss": 4.1612, + "step": 41620 + }, + { + "epoch": 0.24753187743838614, + "grad_norm": 1.7178977727890015, + "learning_rate": 4.2814534210459553e-05, + "loss": 4.2334, + "step": 41621 + }, + { + "epoch": 0.24753782472166713, + "grad_norm": 1.4852858781814575, + "learning_rate": 4.281420649593242e-05, + "loss": 4.1164, + "step": 41622 + }, + { + "epoch": 0.24754377200494815, + "grad_norm": 1.5761849880218506, + "learning_rate": 4.281387877518652e-05, + "loss": 4.0891, + "step": 41623 + }, + { + "epoch": 0.24754971928822914, + "grad_norm": 1.6079566478729248, + "learning_rate": 4.281355104822199e-05, + "loss": 4.0689, + "step": 41624 + }, + { + "epoch": 0.24755566657151012, + "grad_norm": 1.7425400018692017, + "learning_rate": 4.2813223315038924e-05, + "loss": 4.1025, + "step": 41625 + }, + { + "epoch": 0.24756161385479114, + "grad_norm": 1.5375959873199463, + "learning_rate": 4.2812895575637456e-05, + "loss": 3.9993, + "step": 41626 + }, + { + "epoch": 0.24756756113807213, + "grad_norm": 1.3419246673583984, + "learning_rate": 4.281256783001768e-05, + "loss": 4.4141, + "step": 41627 + }, + { + "epoch": 0.24757350842135312, + "grad_norm": 1.3607546091079712, + "learning_rate": 4.2812240078179724e-05, + "loss": 4.6126, + "step": 41628 + }, + { + "epoch": 0.24757945570463413, + "grad_norm": 1.3211710453033447, + "learning_rate": 4.28119123201237e-05, + "loss": 4.7076, + "step": 41629 + }, + { + "epoch": 0.24758540298791512, + "grad_norm": 1.444361686706543, + "learning_rate": 4.2811584555849717e-05, + "loss": 4.3535, + "step": 41630 + }, + { + "epoch": 0.2475913502711961, + "grad_norm": 1.7592238187789917, + "learning_rate": 4.28112567853579e-05, + "loss": 4.2072, + "step": 41631 + }, + { + "epoch": 0.24759729755447712, + "grad_norm": 1.9219753742218018, + "learning_rate": 4.281092900864836e-05, + "loss": 3.8195, + "step": 41632 + }, + { + "epoch": 0.2476032448377581, + "grad_norm": 1.7093669176101685, + "learning_rate": 4.28106012257212e-05, + "loss": 3.9143, + "step": 41633 + }, + { + "epoch": 0.2476091921210391, + "grad_norm": 2.4754652976989746, + "learning_rate": 4.281027343657654e-05, + "loss": 4.2308, + "step": 41634 + }, + { + "epoch": 0.24761513940432012, + "grad_norm": 2.131906270980835, + "learning_rate": 4.280994564121451e-05, + "loss": 4.0484, + "step": 41635 + }, + { + "epoch": 0.2476210866876011, + "grad_norm": 1.3441541194915771, + "learning_rate": 4.280961783963521e-05, + "loss": 4.8571, + "step": 41636 + }, + { + "epoch": 0.2476270339708821, + "grad_norm": 1.744143009185791, + "learning_rate": 4.2809290031838745e-05, + "loss": 4.7186, + "step": 41637 + }, + { + "epoch": 0.2476329812541631, + "grad_norm": 1.6974855661392212, + "learning_rate": 4.2808962217825255e-05, + "loss": 4.5878, + "step": 41638 + }, + { + "epoch": 0.2476389285374441, + "grad_norm": 1.4456303119659424, + "learning_rate": 4.280863439759483e-05, + "loss": 4.8338, + "step": 41639 + }, + { + "epoch": 0.24764487582072509, + "grad_norm": 1.236769437789917, + "learning_rate": 4.28083065711476e-05, + "loss": 4.7715, + "step": 41640 + }, + { + "epoch": 0.2476508231040061, + "grad_norm": 1.6453012228012085, + "learning_rate": 4.280797873848367e-05, + "loss": 4.2757, + "step": 41641 + }, + { + "epoch": 0.2476567703872871, + "grad_norm": 1.6874072551727295, + "learning_rate": 4.280765089960316e-05, + "loss": 4.3971, + "step": 41642 + }, + { + "epoch": 0.24766271767056808, + "grad_norm": 1.7028913497924805, + "learning_rate": 4.2807323054506186e-05, + "loss": 4.3724, + "step": 41643 + }, + { + "epoch": 0.2476686649538491, + "grad_norm": 1.5302543640136719, + "learning_rate": 4.2806995203192855e-05, + "loss": 4.2759, + "step": 41644 + }, + { + "epoch": 0.24767461223713008, + "grad_norm": 1.4613587856292725, + "learning_rate": 4.280666734566329e-05, + "loss": 4.9211, + "step": 41645 + }, + { + "epoch": 0.24768055952041107, + "grad_norm": 1.5691584348678589, + "learning_rate": 4.2806339481917603e-05, + "loss": 4.4679, + "step": 41646 + }, + { + "epoch": 0.24768650680369209, + "grad_norm": 1.7756667137145996, + "learning_rate": 4.280601161195591e-05, + "loss": 4.174, + "step": 41647 + }, + { + "epoch": 0.24769245408697307, + "grad_norm": 1.728438138961792, + "learning_rate": 4.280568373577831e-05, + "loss": 4.3694, + "step": 41648 + }, + { + "epoch": 0.24769840137025406, + "grad_norm": 1.5720018148422241, + "learning_rate": 4.2805355853384934e-05, + "loss": 4.8773, + "step": 41649 + }, + { + "epoch": 0.24770434865353508, + "grad_norm": 1.6362850666046143, + "learning_rate": 4.28050279647759e-05, + "loss": 4.6421, + "step": 41650 + }, + { + "epoch": 0.24771029593681607, + "grad_norm": 1.7506542205810547, + "learning_rate": 4.28047000699513e-05, + "loss": 4.4961, + "step": 41651 + }, + { + "epoch": 0.24771624322009705, + "grad_norm": 1.814595341682434, + "learning_rate": 4.2804372168911275e-05, + "loss": 4.5943, + "step": 41652 + }, + { + "epoch": 0.24772219050337807, + "grad_norm": 1.674403429031372, + "learning_rate": 4.280404426165593e-05, + "loss": 4.512, + "step": 41653 + }, + { + "epoch": 0.24772813778665906, + "grad_norm": 1.5866286754608154, + "learning_rate": 4.280371634818537e-05, + "loss": 5.148, + "step": 41654 + }, + { + "epoch": 0.24773408506994005, + "grad_norm": 1.5019527673721313, + "learning_rate": 4.2803388428499716e-05, + "loss": 4.7288, + "step": 41655 + }, + { + "epoch": 0.24774003235322106, + "grad_norm": 1.624301791191101, + "learning_rate": 4.280306050259909e-05, + "loss": 4.2905, + "step": 41656 + }, + { + "epoch": 0.24774597963650205, + "grad_norm": 1.6445655822753906, + "learning_rate": 4.280273257048359e-05, + "loss": 4.5807, + "step": 41657 + }, + { + "epoch": 0.24775192691978304, + "grad_norm": 1.6700493097305298, + "learning_rate": 4.2802404632153346e-05, + "loss": 4.6052, + "step": 41658 + }, + { + "epoch": 0.24775787420306403, + "grad_norm": 1.5200526714324951, + "learning_rate": 4.280207668760847e-05, + "loss": 4.1582, + "step": 41659 + }, + { + "epoch": 0.24776382148634504, + "grad_norm": 1.4779654741287231, + "learning_rate": 4.280174873684907e-05, + "loss": 4.0057, + "step": 41660 + }, + { + "epoch": 0.24776976876962603, + "grad_norm": 1.4804577827453613, + "learning_rate": 4.280142077987526e-05, + "loss": 4.4818, + "step": 41661 + }, + { + "epoch": 0.24777571605290702, + "grad_norm": 1.4826716184616089, + "learning_rate": 4.2801092816687156e-05, + "loss": 4.5406, + "step": 41662 + }, + { + "epoch": 0.24778166333618803, + "grad_norm": 1.4799234867095947, + "learning_rate": 4.280076484728488e-05, + "loss": 4.4078, + "step": 41663 + }, + { + "epoch": 0.24778761061946902, + "grad_norm": 1.9700071811676025, + "learning_rate": 4.280043687166854e-05, + "loss": 4.3113, + "step": 41664 + }, + { + "epoch": 0.24779355790275, + "grad_norm": 1.7328786849975586, + "learning_rate": 4.2800108889838244e-05, + "loss": 4.2737, + "step": 41665 + }, + { + "epoch": 0.24779950518603103, + "grad_norm": 1.6574100255966187, + "learning_rate": 4.2799780901794124e-05, + "loss": 4.4579, + "step": 41666 + }, + { + "epoch": 0.24780545246931202, + "grad_norm": 1.5007452964782715, + "learning_rate": 4.279945290753628e-05, + "loss": 4.3976, + "step": 41667 + }, + { + "epoch": 0.247811399752593, + "grad_norm": 1.502923846244812, + "learning_rate": 4.279912490706483e-05, + "loss": 4.4302, + "step": 41668 + }, + { + "epoch": 0.24781734703587402, + "grad_norm": 1.560198426246643, + "learning_rate": 4.279879690037989e-05, + "loss": 4.9795, + "step": 41669 + }, + { + "epoch": 0.247823294319155, + "grad_norm": 1.6574666500091553, + "learning_rate": 4.279846888748158e-05, + "loss": 4.3246, + "step": 41670 + }, + { + "epoch": 0.247829241602436, + "grad_norm": 1.7317218780517578, + "learning_rate": 4.279814086837e-05, + "loss": 4.4425, + "step": 41671 + }, + { + "epoch": 0.247835188885717, + "grad_norm": 1.8932639360427856, + "learning_rate": 4.279781284304528e-05, + "loss": 4.1565, + "step": 41672 + }, + { + "epoch": 0.247841136168998, + "grad_norm": 2.7030253410339355, + "learning_rate": 4.279748481150751e-05, + "loss": 3.3813, + "step": 41673 + }, + { + "epoch": 0.247847083452279, + "grad_norm": 1.8590961694717407, + "learning_rate": 4.279715677375684e-05, + "loss": 4.3073, + "step": 41674 + }, + { + "epoch": 0.24785303073556, + "grad_norm": 1.984152913093567, + "learning_rate": 4.279682872979337e-05, + "loss": 3.6732, + "step": 41675 + }, + { + "epoch": 0.247858978018841, + "grad_norm": 2.0756382942199707, + "learning_rate": 4.27965006796172e-05, + "loss": 3.9057, + "step": 41676 + }, + { + "epoch": 0.24786492530212198, + "grad_norm": 1.8423583507537842, + "learning_rate": 4.279617262322846e-05, + "loss": 3.9269, + "step": 41677 + }, + { + "epoch": 0.247870872585403, + "grad_norm": 1.588869571685791, + "learning_rate": 4.279584456062726e-05, + "loss": 4.716, + "step": 41678 + }, + { + "epoch": 0.24787681986868398, + "grad_norm": 1.4872053861618042, + "learning_rate": 4.279551649181371e-05, + "loss": 5.0216, + "step": 41679 + }, + { + "epoch": 0.24788276715196497, + "grad_norm": 1.3518850803375244, + "learning_rate": 4.279518841678793e-05, + "loss": 4.8039, + "step": 41680 + }, + { + "epoch": 0.247888714435246, + "grad_norm": 1.7091758251190186, + "learning_rate": 4.279486033555004e-05, + "loss": 4.4062, + "step": 41681 + }, + { + "epoch": 0.24789466171852698, + "grad_norm": 1.6548142433166504, + "learning_rate": 4.279453224810014e-05, + "loss": 4.8385, + "step": 41682 + }, + { + "epoch": 0.24790060900180796, + "grad_norm": 1.2776515483856201, + "learning_rate": 4.2794204154438354e-05, + "loss": 5.0074, + "step": 41683 + }, + { + "epoch": 0.24790655628508898, + "grad_norm": 1.3731651306152344, + "learning_rate": 4.2793876054564797e-05, + "loss": 5.0161, + "step": 41684 + }, + { + "epoch": 0.24791250356836997, + "grad_norm": 1.1943491697311401, + "learning_rate": 4.279354794847958e-05, + "loss": 5.0404, + "step": 41685 + }, + { + "epoch": 0.24791845085165096, + "grad_norm": 1.5101165771484375, + "learning_rate": 4.279321983618282e-05, + "loss": 4.6056, + "step": 41686 + }, + { + "epoch": 0.24792439813493197, + "grad_norm": 1.6762828826904297, + "learning_rate": 4.2792891717674635e-05, + "loss": 4.1686, + "step": 41687 + }, + { + "epoch": 0.24793034541821296, + "grad_norm": 1.3679381608963013, + "learning_rate": 4.2792563592955135e-05, + "loss": 4.596, + "step": 41688 + }, + { + "epoch": 0.24793629270149395, + "grad_norm": 1.5756072998046875, + "learning_rate": 4.279223546202443e-05, + "loss": 4.7621, + "step": 41689 + }, + { + "epoch": 0.24794223998477496, + "grad_norm": 1.492398977279663, + "learning_rate": 4.279190732488264e-05, + "loss": 5.258, + "step": 41690 + }, + { + "epoch": 0.24794818726805595, + "grad_norm": 1.6215550899505615, + "learning_rate": 4.279157918152988e-05, + "loss": 5.1277, + "step": 41691 + }, + { + "epoch": 0.24795413455133694, + "grad_norm": 1.661486268043518, + "learning_rate": 4.279125103196627e-05, + "loss": 4.6897, + "step": 41692 + }, + { + "epoch": 0.24796008183461796, + "grad_norm": 1.3000272512435913, + "learning_rate": 4.2790922876191905e-05, + "loss": 4.7736, + "step": 41693 + }, + { + "epoch": 0.24796602911789895, + "grad_norm": 1.834218144416809, + "learning_rate": 4.279059471420692e-05, + "loss": 4.7911, + "step": 41694 + }, + { + "epoch": 0.24797197640117993, + "grad_norm": 1.4950984716415405, + "learning_rate": 4.279026654601143e-05, + "loss": 4.8048, + "step": 41695 + }, + { + "epoch": 0.24797792368446095, + "grad_norm": 1.8602609634399414, + "learning_rate": 4.278993837160553e-05, + "loss": 4.9673, + "step": 41696 + }, + { + "epoch": 0.24798387096774194, + "grad_norm": 1.9887093305587769, + "learning_rate": 4.278961019098935e-05, + "loss": 4.2284, + "step": 41697 + }, + { + "epoch": 0.24798981825102293, + "grad_norm": 1.4452109336853027, + "learning_rate": 4.2789282004163e-05, + "loss": 4.3978, + "step": 41698 + }, + { + "epoch": 0.24799576553430394, + "grad_norm": 1.405418038368225, + "learning_rate": 4.2788953811126595e-05, + "loss": 4.6529, + "step": 41699 + }, + { + "epoch": 0.24800171281758493, + "grad_norm": 1.3983663320541382, + "learning_rate": 4.278862561188025e-05, + "loss": 4.8499, + "step": 41700 + }, + { + "epoch": 0.24800766010086592, + "grad_norm": 1.8862656354904175, + "learning_rate": 4.2788297406424084e-05, + "loss": 4.6191, + "step": 41701 + }, + { + "epoch": 0.24801360738414693, + "grad_norm": 1.7237356901168823, + "learning_rate": 4.2787969194758204e-05, + "loss": 4.6822, + "step": 41702 + }, + { + "epoch": 0.24801955466742792, + "grad_norm": 1.6588315963745117, + "learning_rate": 4.2787640976882725e-05, + "loss": 5.5256, + "step": 41703 + }, + { + "epoch": 0.2480255019507089, + "grad_norm": 1.4179097414016724, + "learning_rate": 4.278731275279777e-05, + "loss": 4.5208, + "step": 41704 + }, + { + "epoch": 0.24803144923398993, + "grad_norm": 1.4965027570724487, + "learning_rate": 4.2786984522503446e-05, + "loss": 4.3522, + "step": 41705 + }, + { + "epoch": 0.2480373965172709, + "grad_norm": 1.4985955953598022, + "learning_rate": 4.278665628599987e-05, + "loss": 4.5982, + "step": 41706 + }, + { + "epoch": 0.2480433438005519, + "grad_norm": 1.2907823324203491, + "learning_rate": 4.278632804328715e-05, + "loss": 4.7208, + "step": 41707 + }, + { + "epoch": 0.24804929108383292, + "grad_norm": 1.383506178855896, + "learning_rate": 4.278599979436542e-05, + "loss": 4.5654, + "step": 41708 + }, + { + "epoch": 0.2480552383671139, + "grad_norm": 1.4668956995010376, + "learning_rate": 4.278567153923477e-05, + "loss": 4.8213, + "step": 41709 + }, + { + "epoch": 0.2480611856503949, + "grad_norm": 1.4509391784667969, + "learning_rate": 4.278534327789533e-05, + "loss": 4.9181, + "step": 41710 + }, + { + "epoch": 0.2480671329336759, + "grad_norm": 1.4143444299697876, + "learning_rate": 4.2785015010347204e-05, + "loss": 4.8814, + "step": 41711 + }, + { + "epoch": 0.2480730802169569, + "grad_norm": 1.1586863994598389, + "learning_rate": 4.2784686736590516e-05, + "loss": 4.7088, + "step": 41712 + }, + { + "epoch": 0.2480790275002379, + "grad_norm": 1.4085332155227661, + "learning_rate": 4.278435845662539e-05, + "loss": 4.9168, + "step": 41713 + }, + { + "epoch": 0.2480849747835189, + "grad_norm": 1.3023967742919922, + "learning_rate": 4.278403017045191e-05, + "loss": 5.0307, + "step": 41714 + }, + { + "epoch": 0.2480909220667999, + "grad_norm": 1.664656162261963, + "learning_rate": 4.278370187807022e-05, + "loss": 4.6702, + "step": 41715 + }, + { + "epoch": 0.24809686935008088, + "grad_norm": 1.323513388633728, + "learning_rate": 4.2783373579480414e-05, + "loss": 4.7793, + "step": 41716 + }, + { + "epoch": 0.24810281663336187, + "grad_norm": 1.4263231754302979, + "learning_rate": 4.2783045274682624e-05, + "loss": 4.7926, + "step": 41717 + }, + { + "epoch": 0.24810876391664288, + "grad_norm": 1.2926543951034546, + "learning_rate": 4.278271696367695e-05, + "loss": 4.5648, + "step": 41718 + }, + { + "epoch": 0.24811471119992387, + "grad_norm": 1.1890965700149536, + "learning_rate": 4.278238864646352e-05, + "loss": 4.7734, + "step": 41719 + }, + { + "epoch": 0.24812065848320486, + "grad_norm": 1.5598522424697876, + "learning_rate": 4.278206032304244e-05, + "loss": 4.8744, + "step": 41720 + }, + { + "epoch": 0.24812660576648587, + "grad_norm": 1.6330052614212036, + "learning_rate": 4.278173199341383e-05, + "loss": 4.2649, + "step": 41721 + }, + { + "epoch": 0.24813255304976686, + "grad_norm": 1.7198753356933594, + "learning_rate": 4.278140365757779e-05, + "loss": 4.1124, + "step": 41722 + }, + { + "epoch": 0.24813850033304785, + "grad_norm": 1.6383026838302612, + "learning_rate": 4.278107531553446e-05, + "loss": 4.3254, + "step": 41723 + }, + { + "epoch": 0.24814444761632887, + "grad_norm": 1.7641968727111816, + "learning_rate": 4.2780746967283925e-05, + "loss": 4.1278, + "step": 41724 + }, + { + "epoch": 0.24815039489960986, + "grad_norm": 1.7090308666229248, + "learning_rate": 4.278041861282632e-05, + "loss": 4.4871, + "step": 41725 + }, + { + "epoch": 0.24815634218289084, + "grad_norm": 1.6807724237442017, + "learning_rate": 4.278009025216176e-05, + "loss": 4.5155, + "step": 41726 + }, + { + "epoch": 0.24816228946617186, + "grad_norm": 1.7505130767822266, + "learning_rate": 4.2779761885290356e-05, + "loss": 4.6876, + "step": 41727 + }, + { + "epoch": 0.24816823674945285, + "grad_norm": 1.6450802087783813, + "learning_rate": 4.277943351221222e-05, + "loss": 4.3668, + "step": 41728 + }, + { + "epoch": 0.24817418403273384, + "grad_norm": 1.6309759616851807, + "learning_rate": 4.2779105132927454e-05, + "loss": 4.5795, + "step": 41729 + }, + { + "epoch": 0.24818013131601485, + "grad_norm": 1.63779878616333, + "learning_rate": 4.27787767474362e-05, + "loss": 4.256, + "step": 41730 + }, + { + "epoch": 0.24818607859929584, + "grad_norm": 1.4453445672988892, + "learning_rate": 4.277844835573855e-05, + "loss": 4.6407, + "step": 41731 + }, + { + "epoch": 0.24819202588257683, + "grad_norm": 1.6876344680786133, + "learning_rate": 4.2778119957834637e-05, + "loss": 4.936, + "step": 41732 + }, + { + "epoch": 0.24819797316585784, + "grad_norm": 1.564404010772705, + "learning_rate": 4.277779155372456e-05, + "loss": 4.7456, + "step": 41733 + }, + { + "epoch": 0.24820392044913883, + "grad_norm": 1.6725449562072754, + "learning_rate": 4.2777463143408445e-05, + "loss": 4.8372, + "step": 41734 + }, + { + "epoch": 0.24820986773241982, + "grad_norm": 1.5560435056686401, + "learning_rate": 4.277713472688639e-05, + "loss": 4.659, + "step": 41735 + }, + { + "epoch": 0.24821581501570084, + "grad_norm": 1.619262933731079, + "learning_rate": 4.277680630415854e-05, + "loss": 4.6715, + "step": 41736 + }, + { + "epoch": 0.24822176229898182, + "grad_norm": 1.3820923566818237, + "learning_rate": 4.277647787522497e-05, + "loss": 4.9512, + "step": 41737 + }, + { + "epoch": 0.2482277095822628, + "grad_norm": 1.6360106468200684, + "learning_rate": 4.277614944008582e-05, + "loss": 4.6624, + "step": 41738 + }, + { + "epoch": 0.24823365686554383, + "grad_norm": 1.4565874338150024, + "learning_rate": 4.277582099874121e-05, + "loss": 4.6084, + "step": 41739 + }, + { + "epoch": 0.24823960414882482, + "grad_norm": 1.657044529914856, + "learning_rate": 4.277549255119123e-05, + "loss": 4.9972, + "step": 41740 + }, + { + "epoch": 0.2482455514321058, + "grad_norm": 1.4542521238327026, + "learning_rate": 4.277516409743602e-05, + "loss": 4.8854, + "step": 41741 + }, + { + "epoch": 0.24825149871538682, + "grad_norm": 1.624891757965088, + "learning_rate": 4.2774835637475686e-05, + "loss": 4.6667, + "step": 41742 + }, + { + "epoch": 0.2482574459986678, + "grad_norm": 1.8339580297470093, + "learning_rate": 4.277450717131033e-05, + "loss": 4.2172, + "step": 41743 + }, + { + "epoch": 0.2482633932819488, + "grad_norm": 1.7660454511642456, + "learning_rate": 4.277417869894008e-05, + "loss": 3.9742, + "step": 41744 + }, + { + "epoch": 0.2482693405652298, + "grad_norm": 1.709517240524292, + "learning_rate": 4.277385022036505e-05, + "loss": 4.2192, + "step": 41745 + }, + { + "epoch": 0.2482752878485108, + "grad_norm": 1.7910828590393066, + "learning_rate": 4.277352173558536e-05, + "loss": 4.3698, + "step": 41746 + }, + { + "epoch": 0.2482812351317918, + "grad_norm": 1.7170581817626953, + "learning_rate": 4.2773193244601106e-05, + "loss": 4.6799, + "step": 41747 + }, + { + "epoch": 0.2482871824150728, + "grad_norm": 1.6852309703826904, + "learning_rate": 4.277286474741242e-05, + "loss": 4.681, + "step": 41748 + }, + { + "epoch": 0.2482931296983538, + "grad_norm": 2.0396511554718018, + "learning_rate": 4.2772536244019414e-05, + "loss": 3.9377, + "step": 41749 + }, + { + "epoch": 0.24829907698163478, + "grad_norm": 1.6816411018371582, + "learning_rate": 4.277220773442219e-05, + "loss": 4.7135, + "step": 41750 + }, + { + "epoch": 0.2483050242649158, + "grad_norm": 1.798041582107544, + "learning_rate": 4.277187921862088e-05, + "loss": 4.2819, + "step": 41751 + }, + { + "epoch": 0.24831097154819678, + "grad_norm": 1.7594702243804932, + "learning_rate": 4.277155069661558e-05, + "loss": 4.5073, + "step": 41752 + }, + { + "epoch": 0.24831691883147777, + "grad_norm": 1.8875921964645386, + "learning_rate": 4.277122216840642e-05, + "loss": 3.9771, + "step": 41753 + }, + { + "epoch": 0.2483228661147588, + "grad_norm": 1.4810000658035278, + "learning_rate": 4.277089363399352e-05, + "loss": 4.4956, + "step": 41754 + }, + { + "epoch": 0.24832881339803978, + "grad_norm": 1.3316317796707153, + "learning_rate": 4.277056509337697e-05, + "loss": 4.9093, + "step": 41755 + }, + { + "epoch": 0.24833476068132077, + "grad_norm": 1.672965407371521, + "learning_rate": 4.277023654655691e-05, + "loss": 4.763, + "step": 41756 + }, + { + "epoch": 0.24834070796460178, + "grad_norm": 1.4626654386520386, + "learning_rate": 4.276990799353344e-05, + "loss": 5.0571, + "step": 41757 + }, + { + "epoch": 0.24834665524788277, + "grad_norm": 1.7332490682601929, + "learning_rate": 4.276957943430667e-05, + "loss": 4.6251, + "step": 41758 + }, + { + "epoch": 0.24835260253116376, + "grad_norm": 1.743257761001587, + "learning_rate": 4.2769250868876735e-05, + "loss": 4.2858, + "step": 41759 + }, + { + "epoch": 0.24835854981444477, + "grad_norm": 1.7251859903335571, + "learning_rate": 4.276892229724374e-05, + "loss": 4.4493, + "step": 41760 + }, + { + "epoch": 0.24836449709772576, + "grad_norm": 1.7114126682281494, + "learning_rate": 4.276859371940779e-05, + "loss": 4.2853, + "step": 41761 + }, + { + "epoch": 0.24837044438100675, + "grad_norm": 1.6168044805526733, + "learning_rate": 4.276826513536901e-05, + "loss": 4.5718, + "step": 41762 + }, + { + "epoch": 0.24837639166428777, + "grad_norm": 1.6046549081802368, + "learning_rate": 4.276793654512751e-05, + "loss": 4.8878, + "step": 41763 + }, + { + "epoch": 0.24838233894756875, + "grad_norm": 1.6358529329299927, + "learning_rate": 4.2767607948683406e-05, + "loss": 4.8358, + "step": 41764 + }, + { + "epoch": 0.24838828623084974, + "grad_norm": 1.5713015794754028, + "learning_rate": 4.276727934603683e-05, + "loss": 4.9079, + "step": 41765 + }, + { + "epoch": 0.24839423351413076, + "grad_norm": 1.4701517820358276, + "learning_rate": 4.276695073718786e-05, + "loss": 4.5652, + "step": 41766 + }, + { + "epoch": 0.24840018079741175, + "grad_norm": 1.5550785064697266, + "learning_rate": 4.2766622122136633e-05, + "loss": 4.3251, + "step": 41767 + }, + { + "epoch": 0.24840612808069273, + "grad_norm": 1.654237985610962, + "learning_rate": 4.276629350088327e-05, + "loss": 4.3713, + "step": 41768 + }, + { + "epoch": 0.24841207536397375, + "grad_norm": 1.6649044752120972, + "learning_rate": 4.2765964873427875e-05, + "loss": 4.2443, + "step": 41769 + }, + { + "epoch": 0.24841802264725474, + "grad_norm": 1.7000197172164917, + "learning_rate": 4.2765636239770566e-05, + "loss": 4.7086, + "step": 41770 + }, + { + "epoch": 0.24842396993053573, + "grad_norm": 1.6534442901611328, + "learning_rate": 4.276530759991145e-05, + "loss": 5.0483, + "step": 41771 + }, + { + "epoch": 0.24842991721381674, + "grad_norm": 1.6658438444137573, + "learning_rate": 4.276497895385066e-05, + "loss": 5.1887, + "step": 41772 + }, + { + "epoch": 0.24843586449709773, + "grad_norm": 1.4516960382461548, + "learning_rate": 4.2764650301588294e-05, + "loss": 5.2065, + "step": 41773 + }, + { + "epoch": 0.24844181178037872, + "grad_norm": 1.5533652305603027, + "learning_rate": 4.2764321643124474e-05, + "loss": 4.6192, + "step": 41774 + }, + { + "epoch": 0.2484477590636597, + "grad_norm": 2.237070083618164, + "learning_rate": 4.276399297845931e-05, + "loss": 3.3388, + "step": 41775 + }, + { + "epoch": 0.24845370634694072, + "grad_norm": 1.5573471784591675, + "learning_rate": 4.276366430759292e-05, + "loss": 3.8328, + "step": 41776 + }, + { + "epoch": 0.2484596536302217, + "grad_norm": 1.4539844989776611, + "learning_rate": 4.276333563052542e-05, + "loss": 3.8692, + "step": 41777 + }, + { + "epoch": 0.2484656009135027, + "grad_norm": 1.8052716255187988, + "learning_rate": 4.2763006947256915e-05, + "loss": 4.3728, + "step": 41778 + }, + { + "epoch": 0.24847154819678371, + "grad_norm": 1.8088088035583496, + "learning_rate": 4.2762678257787535e-05, + "loss": 5.0608, + "step": 41779 + }, + { + "epoch": 0.2484774954800647, + "grad_norm": 1.5384361743927002, + "learning_rate": 4.2762349562117386e-05, + "loss": 4.9009, + "step": 41780 + }, + { + "epoch": 0.2484834427633457, + "grad_norm": 1.6384252309799194, + "learning_rate": 4.276202086024659e-05, + "loss": 4.9818, + "step": 41781 + }, + { + "epoch": 0.2484893900466267, + "grad_norm": 1.6779409646987915, + "learning_rate": 4.276169215217525e-05, + "loss": 4.8874, + "step": 41782 + }, + { + "epoch": 0.2484953373299077, + "grad_norm": 1.5710731744766235, + "learning_rate": 4.276136343790349e-05, + "loss": 5.0753, + "step": 41783 + }, + { + "epoch": 0.24850128461318868, + "grad_norm": 1.555067777633667, + "learning_rate": 4.2761034717431417e-05, + "loss": 4.7908, + "step": 41784 + }, + { + "epoch": 0.2485072318964697, + "grad_norm": 1.5200762748718262, + "learning_rate": 4.276070599075915e-05, + "loss": 4.735, + "step": 41785 + }, + { + "epoch": 0.2485131791797507, + "grad_norm": 1.5547939538955688, + "learning_rate": 4.2760377257886806e-05, + "loss": 4.7182, + "step": 41786 + }, + { + "epoch": 0.24851912646303168, + "grad_norm": 1.4693087339401245, + "learning_rate": 4.27600485188145e-05, + "loss": 4.7592, + "step": 41787 + }, + { + "epoch": 0.2485250737463127, + "grad_norm": 1.887117862701416, + "learning_rate": 4.2759719773542345e-05, + "loss": 4.7165, + "step": 41788 + }, + { + "epoch": 0.24853102102959368, + "grad_norm": 1.4935531616210938, + "learning_rate": 4.275939102207046e-05, + "loss": 5.0007, + "step": 41789 + }, + { + "epoch": 0.24853696831287467, + "grad_norm": 1.5121372938156128, + "learning_rate": 4.275906226439894e-05, + "loss": 5.1388, + "step": 41790 + }, + { + "epoch": 0.24854291559615568, + "grad_norm": 1.3200914859771729, + "learning_rate": 4.275873350052793e-05, + "loss": 5.0297, + "step": 41791 + }, + { + "epoch": 0.24854886287943667, + "grad_norm": 1.4220794439315796, + "learning_rate": 4.275840473045752e-05, + "loss": 4.9737, + "step": 41792 + }, + { + "epoch": 0.24855481016271766, + "grad_norm": 1.4793764352798462, + "learning_rate": 4.275807595418784e-05, + "loss": 4.9998, + "step": 41793 + }, + { + "epoch": 0.24856075744599868, + "grad_norm": 1.6136835813522339, + "learning_rate": 4.275774717171899e-05, + "loss": 5.2556, + "step": 41794 + }, + { + "epoch": 0.24856670472927966, + "grad_norm": 1.7071197032928467, + "learning_rate": 4.27574183830511e-05, + "loss": 4.9799, + "step": 41795 + }, + { + "epoch": 0.24857265201256065, + "grad_norm": 1.5223671197891235, + "learning_rate": 4.275708958818429e-05, + "loss": 4.8399, + "step": 41796 + }, + { + "epoch": 0.24857859929584167, + "grad_norm": 1.2538731098175049, + "learning_rate": 4.275676078711864e-05, + "loss": 4.8047, + "step": 41797 + }, + { + "epoch": 0.24858454657912266, + "grad_norm": 1.5250166654586792, + "learning_rate": 4.275643197985431e-05, + "loss": 4.892, + "step": 41798 + }, + { + "epoch": 0.24859049386240364, + "grad_norm": 1.3885079622268677, + "learning_rate": 4.275610316639138e-05, + "loss": 4.8102, + "step": 41799 + }, + { + "epoch": 0.24859644114568466, + "grad_norm": 1.3203582763671875, + "learning_rate": 4.275577434672998e-05, + "loss": 4.8241, + "step": 41800 + }, + { + "epoch": 0.24860238842896565, + "grad_norm": 1.3166993856430054, + "learning_rate": 4.275544552087022e-05, + "loss": 4.7102, + "step": 41801 + }, + { + "epoch": 0.24860833571224664, + "grad_norm": 1.415543794631958, + "learning_rate": 4.2755116688812225e-05, + "loss": 4.8779, + "step": 41802 + }, + { + "epoch": 0.24861428299552765, + "grad_norm": 1.6693099737167358, + "learning_rate": 4.27547878505561e-05, + "loss": 5.0447, + "step": 41803 + }, + { + "epoch": 0.24862023027880864, + "grad_norm": 1.6733319759368896, + "learning_rate": 4.275445900610196e-05, + "loss": 4.703, + "step": 41804 + }, + { + "epoch": 0.24862617756208963, + "grad_norm": 1.666702389717102, + "learning_rate": 4.2754130155449926e-05, + "loss": 4.9924, + "step": 41805 + }, + { + "epoch": 0.24863212484537064, + "grad_norm": 1.5713837146759033, + "learning_rate": 4.275380129860011e-05, + "loss": 5.2833, + "step": 41806 + }, + { + "epoch": 0.24863807212865163, + "grad_norm": 1.6866449117660522, + "learning_rate": 4.275347243555261e-05, + "loss": 5.25, + "step": 41807 + }, + { + "epoch": 0.24864401941193262, + "grad_norm": 1.4337143898010254, + "learning_rate": 4.2753143566307573e-05, + "loss": 5.0664, + "step": 41808 + }, + { + "epoch": 0.24864996669521364, + "grad_norm": 1.4095462560653687, + "learning_rate": 4.275281469086509e-05, + "loss": 4.984, + "step": 41809 + }, + { + "epoch": 0.24865591397849462, + "grad_norm": 1.3389499187469482, + "learning_rate": 4.2752485809225286e-05, + "loss": 5.0437, + "step": 41810 + }, + { + "epoch": 0.2486618612617756, + "grad_norm": 1.6648070812225342, + "learning_rate": 4.2752156921388264e-05, + "loss": 4.7178, + "step": 41811 + }, + { + "epoch": 0.24866780854505663, + "grad_norm": 1.6354150772094727, + "learning_rate": 4.275182802735416e-05, + "loss": 4.6446, + "step": 41812 + }, + { + "epoch": 0.24867375582833762, + "grad_norm": 1.5120378732681274, + "learning_rate": 4.275149912712306e-05, + "loss": 5.091, + "step": 41813 + }, + { + "epoch": 0.2486797031116186, + "grad_norm": 1.5787734985351562, + "learning_rate": 4.2751170220695115e-05, + "loss": 5.303, + "step": 41814 + }, + { + "epoch": 0.24868565039489962, + "grad_norm": 1.4155902862548828, + "learning_rate": 4.275084130807041e-05, + "loss": 5.2274, + "step": 41815 + }, + { + "epoch": 0.2486915976781806, + "grad_norm": 1.3568220138549805, + "learning_rate": 4.275051238924907e-05, + "loss": 5.1496, + "step": 41816 + }, + { + "epoch": 0.2486975449614616, + "grad_norm": 1.859883427619934, + "learning_rate": 4.275018346423121e-05, + "loss": 4.2534, + "step": 41817 + }, + { + "epoch": 0.2487034922447426, + "grad_norm": 2.287292242050171, + "learning_rate": 4.274985453301694e-05, + "loss": 3.3315, + "step": 41818 + }, + { + "epoch": 0.2487094395280236, + "grad_norm": 2.6090712547302246, + "learning_rate": 4.2749525595606385e-05, + "loss": 3.5919, + "step": 41819 + }, + { + "epoch": 0.2487153868113046, + "grad_norm": 1.632788896560669, + "learning_rate": 4.274919665199964e-05, + "loss": 4.9194, + "step": 41820 + }, + { + "epoch": 0.2487213340945856, + "grad_norm": 1.3882077932357788, + "learning_rate": 4.2748867702196855e-05, + "loss": 5.1649, + "step": 41821 + }, + { + "epoch": 0.2487272813778666, + "grad_norm": 2.0455174446105957, + "learning_rate": 4.2748538746198116e-05, + "loss": 3.7828, + "step": 41822 + }, + { + "epoch": 0.24873322866114758, + "grad_norm": 1.4556447267532349, + "learning_rate": 4.274820978400354e-05, + "loss": 5.1036, + "step": 41823 + }, + { + "epoch": 0.2487391759444286, + "grad_norm": 1.6253867149353027, + "learning_rate": 4.274788081561325e-05, + "loss": 4.4603, + "step": 41824 + }, + { + "epoch": 0.24874512322770959, + "grad_norm": 2.4729366302490234, + "learning_rate": 4.274755184102736e-05, + "loss": 3.297, + "step": 41825 + }, + { + "epoch": 0.24875107051099057, + "grad_norm": 2.1666860580444336, + "learning_rate": 4.274722286024599e-05, + "loss": 3.2082, + "step": 41826 + }, + { + "epoch": 0.2487570177942716, + "grad_norm": 2.0941646099090576, + "learning_rate": 4.2746893873269234e-05, + "loss": 2.543, + "step": 41827 + }, + { + "epoch": 0.24876296507755258, + "grad_norm": 2.035738229751587, + "learning_rate": 4.274656488009723e-05, + "loss": 3.0068, + "step": 41828 + }, + { + "epoch": 0.24876891236083357, + "grad_norm": 2.7828142642974854, + "learning_rate": 4.2746235880730076e-05, + "loss": 3.1992, + "step": 41829 + }, + { + "epoch": 0.24877485964411458, + "grad_norm": 2.3640835285186768, + "learning_rate": 4.2745906875167895e-05, + "loss": 3.5248, + "step": 41830 + }, + { + "epoch": 0.24878080692739557, + "grad_norm": 2.4107437133789062, + "learning_rate": 4.274557786341081e-05, + "loss": 3.1192, + "step": 41831 + }, + { + "epoch": 0.24878675421067656, + "grad_norm": 1.6559900045394897, + "learning_rate": 4.274524884545892e-05, + "loss": 4.9137, + "step": 41832 + }, + { + "epoch": 0.24879270149395755, + "grad_norm": 1.7239961624145508, + "learning_rate": 4.274491982131235e-05, + "loss": 4.6582, + "step": 41833 + }, + { + "epoch": 0.24879864877723856, + "grad_norm": 1.6645326614379883, + "learning_rate": 4.274459079097121e-05, + "loss": 4.4348, + "step": 41834 + }, + { + "epoch": 0.24880459606051955, + "grad_norm": 1.4783381223678589, + "learning_rate": 4.2744261754435614e-05, + "loss": 5.0516, + "step": 41835 + }, + { + "epoch": 0.24881054334380054, + "grad_norm": 1.3773260116577148, + "learning_rate": 4.274393271170568e-05, + "loss": 5.1634, + "step": 41836 + }, + { + "epoch": 0.24881649062708155, + "grad_norm": 1.7531635761260986, + "learning_rate": 4.2743603662781526e-05, + "loss": 4.589, + "step": 41837 + }, + { + "epoch": 0.24882243791036254, + "grad_norm": 1.396453619003296, + "learning_rate": 4.274327460766326e-05, + "loss": 5.1252, + "step": 41838 + }, + { + "epoch": 0.24882838519364353, + "grad_norm": 1.3288476467132568, + "learning_rate": 4.274294554635101e-05, + "loss": 4.9922, + "step": 41839 + }, + { + "epoch": 0.24883433247692455, + "grad_norm": 1.6296675205230713, + "learning_rate": 4.274261647884488e-05, + "loss": 5.1394, + "step": 41840 + }, + { + "epoch": 0.24884027976020553, + "grad_norm": 1.7111517190933228, + "learning_rate": 4.2742287405144976e-05, + "loss": 4.7482, + "step": 41841 + }, + { + "epoch": 0.24884622704348652, + "grad_norm": 1.5978788137435913, + "learning_rate": 4.2741958325251426e-05, + "loss": 4.8781, + "step": 41842 + }, + { + "epoch": 0.24885217432676754, + "grad_norm": 1.399695634841919, + "learning_rate": 4.274162923916434e-05, + "loss": 4.7524, + "step": 41843 + }, + { + "epoch": 0.24885812161004853, + "grad_norm": 1.6449240446090698, + "learning_rate": 4.274130014688383e-05, + "loss": 4.633, + "step": 41844 + }, + { + "epoch": 0.24886406889332952, + "grad_norm": 1.4799854755401611, + "learning_rate": 4.274097104841003e-05, + "loss": 4.7642, + "step": 41845 + }, + { + "epoch": 0.24887001617661053, + "grad_norm": 1.5485951900482178, + "learning_rate": 4.274064194374303e-05, + "loss": 4.4262, + "step": 41846 + }, + { + "epoch": 0.24887596345989152, + "grad_norm": 1.6339895725250244, + "learning_rate": 4.274031283288296e-05, + "loss": 4.5101, + "step": 41847 + }, + { + "epoch": 0.2488819107431725, + "grad_norm": 1.5045832395553589, + "learning_rate": 4.273998371582992e-05, + "loss": 4.4667, + "step": 41848 + }, + { + "epoch": 0.24888785802645352, + "grad_norm": 1.4841949939727783, + "learning_rate": 4.273965459258405e-05, + "loss": 4.4464, + "step": 41849 + }, + { + "epoch": 0.2488938053097345, + "grad_norm": 1.6251416206359863, + "learning_rate": 4.273932546314544e-05, + "loss": 4.5534, + "step": 41850 + }, + { + "epoch": 0.2488997525930155, + "grad_norm": 1.4797073602676392, + "learning_rate": 4.273899632751422e-05, + "loss": 5.0323, + "step": 41851 + }, + { + "epoch": 0.24890569987629652, + "grad_norm": 1.3413639068603516, + "learning_rate": 4.2738667185690494e-05, + "loss": 4.8894, + "step": 41852 + }, + { + "epoch": 0.2489116471595775, + "grad_norm": 1.6203540563583374, + "learning_rate": 4.273833803767439e-05, + "loss": 4.7845, + "step": 41853 + }, + { + "epoch": 0.2489175944428585, + "grad_norm": 1.6643965244293213, + "learning_rate": 4.273800888346601e-05, + "loss": 4.7247, + "step": 41854 + }, + { + "epoch": 0.2489235417261395, + "grad_norm": 1.48214590549469, + "learning_rate": 4.273767972306547e-05, + "loss": 5.0028, + "step": 41855 + }, + { + "epoch": 0.2489294890094205, + "grad_norm": 1.5234065055847168, + "learning_rate": 4.27373505564729e-05, + "loss": 5.2908, + "step": 41856 + }, + { + "epoch": 0.24893543629270148, + "grad_norm": 1.4000482559204102, + "learning_rate": 4.27370213836884e-05, + "loss": 5.3256, + "step": 41857 + }, + { + "epoch": 0.2489413835759825, + "grad_norm": 2.1062707901000977, + "learning_rate": 4.273669220471208e-05, + "loss": 5.2864, + "step": 41858 + }, + { + "epoch": 0.2489473308592635, + "grad_norm": 1.7664251327514648, + "learning_rate": 4.273636301954407e-05, + "loss": 5.0748, + "step": 41859 + }, + { + "epoch": 0.24895327814254448, + "grad_norm": 1.6302720308303833, + "learning_rate": 4.273603382818449e-05, + "loss": 4.657, + "step": 41860 + }, + { + "epoch": 0.2489592254258255, + "grad_norm": 1.262400507926941, + "learning_rate": 4.273570463063343e-05, + "loss": 5.2303, + "step": 41861 + }, + { + "epoch": 0.24896517270910648, + "grad_norm": 1.2864395380020142, + "learning_rate": 4.273537542689102e-05, + "loss": 5.0949, + "step": 41862 + }, + { + "epoch": 0.24897111999238747, + "grad_norm": 1.7176706790924072, + "learning_rate": 4.2735046216957375e-05, + "loss": 4.3825, + "step": 41863 + }, + { + "epoch": 0.24897706727566848, + "grad_norm": 1.6844499111175537, + "learning_rate": 4.2734717000832616e-05, + "loss": 4.4183, + "step": 41864 + }, + { + "epoch": 0.24898301455894947, + "grad_norm": 1.2050845623016357, + "learning_rate": 4.2734387778516845e-05, + "loss": 4.8328, + "step": 41865 + }, + { + "epoch": 0.24898896184223046, + "grad_norm": 1.4561463594436646, + "learning_rate": 4.273405855001018e-05, + "loss": 4.8338, + "step": 41866 + }, + { + "epoch": 0.24899490912551148, + "grad_norm": 1.5499346256256104, + "learning_rate": 4.2733729315312744e-05, + "loss": 4.819, + "step": 41867 + }, + { + "epoch": 0.24900085640879246, + "grad_norm": 1.0368562936782837, + "learning_rate": 4.273340007442464e-05, + "loss": 4.7931, + "step": 41868 + }, + { + "epoch": 0.24900680369207345, + "grad_norm": 1.3501802682876587, + "learning_rate": 4.273307082734599e-05, + "loss": 4.6919, + "step": 41869 + }, + { + "epoch": 0.24901275097535447, + "grad_norm": 1.1897794008255005, + "learning_rate": 4.2732741574076916e-05, + "loss": 4.912, + "step": 41870 + }, + { + "epoch": 0.24901869825863546, + "grad_norm": 1.3097944259643555, + "learning_rate": 4.273241231461752e-05, + "loss": 5.2349, + "step": 41871 + }, + { + "epoch": 0.24902464554191645, + "grad_norm": 1.7714036703109741, + "learning_rate": 4.273208304896792e-05, + "loss": 5.4615, + "step": 41872 + }, + { + "epoch": 0.24903059282519746, + "grad_norm": 1.5608513355255127, + "learning_rate": 4.2731753777128234e-05, + "loss": 4.7093, + "step": 41873 + }, + { + "epoch": 0.24903654010847845, + "grad_norm": 1.3599073886871338, + "learning_rate": 4.273142449909857e-05, + "loss": 4.6813, + "step": 41874 + }, + { + "epoch": 0.24904248739175944, + "grad_norm": 1.1686546802520752, + "learning_rate": 4.273109521487906e-05, + "loss": 4.6943, + "step": 41875 + }, + { + "epoch": 0.24904843467504045, + "grad_norm": 1.7033603191375732, + "learning_rate": 4.273076592446981e-05, + "loss": 4.4448, + "step": 41876 + }, + { + "epoch": 0.24905438195832144, + "grad_norm": 1.771150827407837, + "learning_rate": 4.2730436627870916e-05, + "loss": 4.0168, + "step": 41877 + }, + { + "epoch": 0.24906032924160243, + "grad_norm": 2.1038219928741455, + "learning_rate": 4.273010732508252e-05, + "loss": 3.8277, + "step": 41878 + }, + { + "epoch": 0.24906627652488345, + "grad_norm": 1.944787621498108, + "learning_rate": 4.272977801610473e-05, + "loss": 3.8826, + "step": 41879 + }, + { + "epoch": 0.24907222380816443, + "grad_norm": 1.669796347618103, + "learning_rate": 4.2729448700937656e-05, + "loss": 3.8119, + "step": 41880 + }, + { + "epoch": 0.24907817109144542, + "grad_norm": 2.007327079772949, + "learning_rate": 4.272911937958141e-05, + "loss": 3.7147, + "step": 41881 + }, + { + "epoch": 0.24908411837472644, + "grad_norm": 1.5439294576644897, + "learning_rate": 4.2728790052036116e-05, + "loss": 5.2234, + "step": 41882 + }, + { + "epoch": 0.24909006565800743, + "grad_norm": 1.9356173276901245, + "learning_rate": 4.272846071830189e-05, + "loss": 3.4341, + "step": 41883 + }, + { + "epoch": 0.2490960129412884, + "grad_norm": 1.8506463766098022, + "learning_rate": 4.2728131378378826e-05, + "loss": 3.5799, + "step": 41884 + }, + { + "epoch": 0.24910196022456943, + "grad_norm": 1.9116549491882324, + "learning_rate": 4.272780203226707e-05, + "loss": 3.7388, + "step": 41885 + }, + { + "epoch": 0.24910790750785042, + "grad_norm": 1.8231347799301147, + "learning_rate": 4.272747267996671e-05, + "loss": 3.655, + "step": 41886 + }, + { + "epoch": 0.2491138547911314, + "grad_norm": 1.919110894203186, + "learning_rate": 4.272714332147788e-05, + "loss": 3.565, + "step": 41887 + }, + { + "epoch": 0.24911980207441242, + "grad_norm": 1.7250345945358276, + "learning_rate": 4.272681395680068e-05, + "loss": 3.6458, + "step": 41888 + }, + { + "epoch": 0.2491257493576934, + "grad_norm": 1.8194400072097778, + "learning_rate": 4.272648458593524e-05, + "loss": 3.516, + "step": 41889 + }, + { + "epoch": 0.2491316966409744, + "grad_norm": 1.9018691778182983, + "learning_rate": 4.272615520888166e-05, + "loss": 3.4639, + "step": 41890 + }, + { + "epoch": 0.2491376439242554, + "grad_norm": 1.783469557762146, + "learning_rate": 4.272582582564007e-05, + "loss": 3.4894, + "step": 41891 + }, + { + "epoch": 0.2491435912075364, + "grad_norm": 2.4507062435150146, + "learning_rate": 4.2725496436210575e-05, + "loss": 3.8091, + "step": 41892 + }, + { + "epoch": 0.2491495384908174, + "grad_norm": 1.7215583324432373, + "learning_rate": 4.272516704059329e-05, + "loss": 3.5433, + "step": 41893 + }, + { + "epoch": 0.24915548577409838, + "grad_norm": 1.862190842628479, + "learning_rate": 4.2724837638788334e-05, + "loss": 3.7669, + "step": 41894 + }, + { + "epoch": 0.2491614330573794, + "grad_norm": 1.9380687475204468, + "learning_rate": 4.272450823079582e-05, + "loss": 3.5056, + "step": 41895 + }, + { + "epoch": 0.24916738034066038, + "grad_norm": 1.9195544719696045, + "learning_rate": 4.2724178816615865e-05, + "loss": 3.5728, + "step": 41896 + }, + { + "epoch": 0.24917332762394137, + "grad_norm": 1.8414493799209595, + "learning_rate": 4.272384939624858e-05, + "loss": 3.6718, + "step": 41897 + }, + { + "epoch": 0.2491792749072224, + "grad_norm": 1.601354956626892, + "learning_rate": 4.272351996969408e-05, + "loss": 3.7175, + "step": 41898 + }, + { + "epoch": 0.24918522219050337, + "grad_norm": 1.6822525262832642, + "learning_rate": 4.2723190536952485e-05, + "loss": 3.8649, + "step": 41899 + }, + { + "epoch": 0.24919116947378436, + "grad_norm": 1.6870274543762207, + "learning_rate": 4.27228610980239e-05, + "loss": 5.1115, + "step": 41900 + }, + { + "epoch": 0.24919711675706538, + "grad_norm": 1.7757606506347656, + "learning_rate": 4.272253165290846e-05, + "loss": 3.6391, + "step": 41901 + }, + { + "epoch": 0.24920306404034637, + "grad_norm": 1.661707878112793, + "learning_rate": 4.2722202201606264e-05, + "loss": 3.8096, + "step": 41902 + }, + { + "epoch": 0.24920901132362736, + "grad_norm": 1.847411870956421, + "learning_rate": 4.272187274411743e-05, + "loss": 3.5457, + "step": 41903 + }, + { + "epoch": 0.24921495860690837, + "grad_norm": 1.8748857975006104, + "learning_rate": 4.2721543280442063e-05, + "loss": 3.6347, + "step": 41904 + }, + { + "epoch": 0.24922090589018936, + "grad_norm": 1.7488704919815063, + "learning_rate": 4.2721213810580304e-05, + "loss": 3.6073, + "step": 41905 + }, + { + "epoch": 0.24922685317347035, + "grad_norm": 1.6388825178146362, + "learning_rate": 4.272088433453224e-05, + "loss": 3.5956, + "step": 41906 + }, + { + "epoch": 0.24923280045675136, + "grad_norm": 1.6146447658538818, + "learning_rate": 4.272055485229801e-05, + "loss": 3.4853, + "step": 41907 + }, + { + "epoch": 0.24923874774003235, + "grad_norm": 1.8111872673034668, + "learning_rate": 4.272022536387771e-05, + "loss": 3.3789, + "step": 41908 + }, + { + "epoch": 0.24924469502331334, + "grad_norm": 1.8881454467773438, + "learning_rate": 4.271989586927146e-05, + "loss": 3.4572, + "step": 41909 + }, + { + "epoch": 0.24925064230659436, + "grad_norm": 1.7342952489852905, + "learning_rate": 4.271956636847938e-05, + "loss": 3.493, + "step": 41910 + }, + { + "epoch": 0.24925658958987534, + "grad_norm": 1.5744178295135498, + "learning_rate": 4.2719236861501586e-05, + "loss": 3.602, + "step": 41911 + }, + { + "epoch": 0.24926253687315633, + "grad_norm": 1.8583720922470093, + "learning_rate": 4.271890734833819e-05, + "loss": 4.6376, + "step": 41912 + }, + { + "epoch": 0.24926848415643735, + "grad_norm": 1.8117424249649048, + "learning_rate": 4.27185778289893e-05, + "loss": 3.557, + "step": 41913 + }, + { + "epoch": 0.24927443143971834, + "grad_norm": 2.052053928375244, + "learning_rate": 4.271824830345504e-05, + "loss": 3.6966, + "step": 41914 + }, + { + "epoch": 0.24928037872299932, + "grad_norm": 1.7351378202438354, + "learning_rate": 4.271791877173552e-05, + "loss": 3.4319, + "step": 41915 + }, + { + "epoch": 0.24928632600628034, + "grad_norm": 1.7777881622314453, + "learning_rate": 4.271758923383086e-05, + "loss": 3.414, + "step": 41916 + }, + { + "epoch": 0.24929227328956133, + "grad_norm": 1.6943188905715942, + "learning_rate": 4.271725968974117e-05, + "loss": 3.5698, + "step": 41917 + }, + { + "epoch": 0.24929822057284232, + "grad_norm": 1.9173147678375244, + "learning_rate": 4.271693013946658e-05, + "loss": 3.2204, + "step": 41918 + }, + { + "epoch": 0.24930416785612333, + "grad_norm": 1.7989587783813477, + "learning_rate": 4.2716600583007184e-05, + "loss": 3.3528, + "step": 41919 + }, + { + "epoch": 0.24931011513940432, + "grad_norm": 1.69678795337677, + "learning_rate": 4.27162710203631e-05, + "loss": 3.5693, + "step": 41920 + }, + { + "epoch": 0.2493160624226853, + "grad_norm": 1.872271180152893, + "learning_rate": 4.271594145153446e-05, + "loss": 3.4395, + "step": 41921 + }, + { + "epoch": 0.24932200970596632, + "grad_norm": 1.6961418390274048, + "learning_rate": 4.271561187652136e-05, + "loss": 3.4208, + "step": 41922 + }, + { + "epoch": 0.2493279569892473, + "grad_norm": 1.722124695777893, + "learning_rate": 4.2715282295323924e-05, + "loss": 3.484, + "step": 41923 + }, + { + "epoch": 0.2493339042725283, + "grad_norm": 1.6818629503250122, + "learning_rate": 4.2714952707942266e-05, + "loss": 3.7308, + "step": 41924 + }, + { + "epoch": 0.24933985155580932, + "grad_norm": 1.7677801847457886, + "learning_rate": 4.271462311437651e-05, + "loss": 3.3223, + "step": 41925 + }, + { + "epoch": 0.2493457988390903, + "grad_norm": 1.8028302192687988, + "learning_rate": 4.2714293514626745e-05, + "loss": 3.3936, + "step": 41926 + }, + { + "epoch": 0.2493517461223713, + "grad_norm": 1.6981196403503418, + "learning_rate": 4.2713963908693114e-05, + "loss": 3.2983, + "step": 41927 + }, + { + "epoch": 0.2493576934056523, + "grad_norm": 1.5057685375213623, + "learning_rate": 4.271363429657572e-05, + "loss": 3.5299, + "step": 41928 + }, + { + "epoch": 0.2493636406889333, + "grad_norm": 1.7220723628997803, + "learning_rate": 4.271330467827468e-05, + "loss": 2.9646, + "step": 41929 + }, + { + "epoch": 0.24936958797221428, + "grad_norm": 1.8191083669662476, + "learning_rate": 4.271297505379011e-05, + "loss": 3.3575, + "step": 41930 + }, + { + "epoch": 0.2493755352554953, + "grad_norm": 1.831428050994873, + "learning_rate": 4.271264542312212e-05, + "loss": 3.4435, + "step": 41931 + }, + { + "epoch": 0.2493814825387763, + "grad_norm": 1.9449255466461182, + "learning_rate": 4.271231578627083e-05, + "loss": 3.4892, + "step": 41932 + }, + { + "epoch": 0.24938742982205728, + "grad_norm": 1.9572843313217163, + "learning_rate": 4.271198614323635e-05, + "loss": 3.4599, + "step": 41933 + }, + { + "epoch": 0.2493933771053383, + "grad_norm": 1.8705164194107056, + "learning_rate": 4.2711656494018806e-05, + "loss": 3.4144, + "step": 41934 + }, + { + "epoch": 0.24939932438861928, + "grad_norm": 2.046175241470337, + "learning_rate": 4.2711326838618295e-05, + "loss": 3.5113, + "step": 41935 + }, + { + "epoch": 0.24940527167190027, + "grad_norm": 1.9441622495651245, + "learning_rate": 4.271099717703495e-05, + "loss": 3.5757, + "step": 41936 + }, + { + "epoch": 0.24941121895518129, + "grad_norm": 1.9650280475616455, + "learning_rate": 4.271066750926888e-05, + "loss": 3.4573, + "step": 41937 + }, + { + "epoch": 0.24941716623846227, + "grad_norm": 2.037990093231201, + "learning_rate": 4.2710337835320194e-05, + "loss": 3.2876, + "step": 41938 + }, + { + "epoch": 0.24942311352174326, + "grad_norm": 1.8767162561416626, + "learning_rate": 4.271000815518902e-05, + "loss": 3.4425, + "step": 41939 + }, + { + "epoch": 0.24942906080502428, + "grad_norm": 1.734762191772461, + "learning_rate": 4.2709678468875456e-05, + "loss": 3.6552, + "step": 41940 + }, + { + "epoch": 0.24943500808830527, + "grad_norm": 1.9267102479934692, + "learning_rate": 4.270934877637963e-05, + "loss": 3.5253, + "step": 41941 + }, + { + "epoch": 0.24944095537158625, + "grad_norm": 1.939378023147583, + "learning_rate": 4.270901907770165e-05, + "loss": 3.4638, + "step": 41942 + }, + { + "epoch": 0.24944690265486727, + "grad_norm": 1.6441813707351685, + "learning_rate": 4.270868937284164e-05, + "loss": 3.6354, + "step": 41943 + }, + { + "epoch": 0.24945284993814826, + "grad_norm": 1.706351637840271, + "learning_rate": 4.270835966179971e-05, + "loss": 3.3666, + "step": 41944 + }, + { + "epoch": 0.24945879722142925, + "grad_norm": 1.8754465579986572, + "learning_rate": 4.2708029944575965e-05, + "loss": 3.4919, + "step": 41945 + }, + { + "epoch": 0.24946474450471026, + "grad_norm": 1.766389012336731, + "learning_rate": 4.2707700221170546e-05, + "loss": 3.5696, + "step": 41946 + }, + { + "epoch": 0.24947069178799125, + "grad_norm": 1.633712887763977, + "learning_rate": 4.2707370491583535e-05, + "loss": 4.0483, + "step": 41947 + }, + { + "epoch": 0.24947663907127224, + "grad_norm": 2.067017078399658, + "learning_rate": 4.2707040755815074e-05, + "loss": 3.9583, + "step": 41948 + }, + { + "epoch": 0.24948258635455323, + "grad_norm": 1.7265002727508545, + "learning_rate": 4.2706711013865264e-05, + "loss": 3.9791, + "step": 41949 + }, + { + "epoch": 0.24948853363783424, + "grad_norm": 1.6950510740280151, + "learning_rate": 4.270638126573423e-05, + "loss": 4.0367, + "step": 41950 + }, + { + "epoch": 0.24949448092111523, + "grad_norm": 2.1413466930389404, + "learning_rate": 4.2706051511422075e-05, + "loss": 3.7057, + "step": 41951 + }, + { + "epoch": 0.24950042820439622, + "grad_norm": 2.090174436569214, + "learning_rate": 4.270572175092892e-05, + "loss": 3.5776, + "step": 41952 + }, + { + "epoch": 0.24950637548767723, + "grad_norm": 1.92526113986969, + "learning_rate": 4.270539198425488e-05, + "loss": 3.8576, + "step": 41953 + }, + { + "epoch": 0.24951232277095822, + "grad_norm": 1.4453459978103638, + "learning_rate": 4.270506221140008e-05, + "loss": 4.8279, + "step": 41954 + }, + { + "epoch": 0.2495182700542392, + "grad_norm": 1.9510153532028198, + "learning_rate": 4.270473243236461e-05, + "loss": 3.9956, + "step": 41955 + }, + { + "epoch": 0.24952421733752023, + "grad_norm": 1.9541722536087036, + "learning_rate": 4.270440264714861e-05, + "loss": 3.5111, + "step": 41956 + }, + { + "epoch": 0.24953016462080121, + "grad_norm": 1.621846318244934, + "learning_rate": 4.270407285575219e-05, + "loss": 4.394, + "step": 41957 + }, + { + "epoch": 0.2495361119040822, + "grad_norm": 1.8765959739685059, + "learning_rate": 4.270374305817546e-05, + "loss": 3.8144, + "step": 41958 + }, + { + "epoch": 0.24954205918736322, + "grad_norm": 1.7183656692504883, + "learning_rate": 4.270341325441853e-05, + "loss": 4.8727, + "step": 41959 + }, + { + "epoch": 0.2495480064706442, + "grad_norm": 1.6017158031463623, + "learning_rate": 4.270308344448152e-05, + "loss": 5.3048, + "step": 41960 + }, + { + "epoch": 0.2495539537539252, + "grad_norm": 1.6048821210861206, + "learning_rate": 4.270275362836455e-05, + "loss": 4.9251, + "step": 41961 + }, + { + "epoch": 0.2495599010372062, + "grad_norm": 1.6261155605316162, + "learning_rate": 4.270242380606774e-05, + "loss": 4.5564, + "step": 41962 + }, + { + "epoch": 0.2495658483204872, + "grad_norm": 1.5177656412124634, + "learning_rate": 4.2702093977591184e-05, + "loss": 5.1959, + "step": 41963 + }, + { + "epoch": 0.2495717956037682, + "grad_norm": 1.4645196199417114, + "learning_rate": 4.270176414293502e-05, + "loss": 5.0972, + "step": 41964 + }, + { + "epoch": 0.2495777428870492, + "grad_norm": 2.7350735664367676, + "learning_rate": 4.270143430209934e-05, + "loss": 3.9562, + "step": 41965 + }, + { + "epoch": 0.2495836901703302, + "grad_norm": 1.7712966203689575, + "learning_rate": 4.270110445508428e-05, + "loss": 3.9755, + "step": 41966 + }, + { + "epoch": 0.24958963745361118, + "grad_norm": 1.6321653127670288, + "learning_rate": 4.270077460188995e-05, + "loss": 4.6292, + "step": 41967 + }, + { + "epoch": 0.2495955847368922, + "grad_norm": 1.2106133699417114, + "learning_rate": 4.270044474251646e-05, + "loss": 5.431, + "step": 41968 + }, + { + "epoch": 0.24960153202017318, + "grad_norm": 1.8054934740066528, + "learning_rate": 4.270011487696393e-05, + "loss": 3.9998, + "step": 41969 + }, + { + "epoch": 0.24960747930345417, + "grad_norm": 2.054405450820923, + "learning_rate": 4.269978500523247e-05, + "loss": 3.5762, + "step": 41970 + }, + { + "epoch": 0.2496134265867352, + "grad_norm": 2.2991819381713867, + "learning_rate": 4.2699455127322195e-05, + "loss": 3.0228, + "step": 41971 + }, + { + "epoch": 0.24961937387001618, + "grad_norm": 2.240530014038086, + "learning_rate": 4.269912524323323e-05, + "loss": 3.0045, + "step": 41972 + }, + { + "epoch": 0.24962532115329716, + "grad_norm": 1.8001304864883423, + "learning_rate": 4.2698795352965675e-05, + "loss": 4.0653, + "step": 41973 + }, + { + "epoch": 0.24963126843657818, + "grad_norm": 1.63943350315094, + "learning_rate": 4.269846545651966e-05, + "loss": 4.783, + "step": 41974 + }, + { + "epoch": 0.24963721571985917, + "grad_norm": 2.408764600753784, + "learning_rate": 4.269813555389529e-05, + "loss": 3.1798, + "step": 41975 + }, + { + "epoch": 0.24964316300314016, + "grad_norm": 2.0278191566467285, + "learning_rate": 4.269780564509268e-05, + "loss": 3.2102, + "step": 41976 + }, + { + "epoch": 0.24964911028642117, + "grad_norm": 2.367107391357422, + "learning_rate": 4.2697475730111955e-05, + "loss": 3.1621, + "step": 41977 + }, + { + "epoch": 0.24965505756970216, + "grad_norm": 2.4795262813568115, + "learning_rate": 4.2697145808953224e-05, + "loss": 3.203, + "step": 41978 + }, + { + "epoch": 0.24966100485298315, + "grad_norm": 2.1575825214385986, + "learning_rate": 4.26968158816166e-05, + "loss": 3.1548, + "step": 41979 + }, + { + "epoch": 0.24966695213626416, + "grad_norm": 2.2089765071868896, + "learning_rate": 4.2696485948102205e-05, + "loss": 3.0741, + "step": 41980 + }, + { + "epoch": 0.24967289941954515, + "grad_norm": 2.030851364135742, + "learning_rate": 4.269615600841015e-05, + "loss": 3.3582, + "step": 41981 + }, + { + "epoch": 0.24967884670282614, + "grad_norm": 1.8664506673812866, + "learning_rate": 4.2695826062540545e-05, + "loss": 4.7618, + "step": 41982 + }, + { + "epoch": 0.24968479398610716, + "grad_norm": 2.0165164470672607, + "learning_rate": 4.2695496110493504e-05, + "loss": 3.8343, + "step": 41983 + }, + { + "epoch": 0.24969074126938814, + "grad_norm": 1.9938582181930542, + "learning_rate": 4.269516615226916e-05, + "loss": 3.7492, + "step": 41984 + }, + { + "epoch": 0.24969668855266913, + "grad_norm": 1.3306089639663696, + "learning_rate": 4.2694836187867607e-05, + "loss": 4.838, + "step": 41985 + }, + { + "epoch": 0.24970263583595015, + "grad_norm": 1.563686490058899, + "learning_rate": 4.2694506217288975e-05, + "loss": 4.9128, + "step": 41986 + }, + { + "epoch": 0.24970858311923114, + "grad_norm": 1.8690180778503418, + "learning_rate": 4.269417624053337e-05, + "loss": 4.1879, + "step": 41987 + }, + { + "epoch": 0.24971453040251212, + "grad_norm": 1.797075629234314, + "learning_rate": 4.2693846257600914e-05, + "loss": 4.2136, + "step": 41988 + }, + { + "epoch": 0.24972047768579314, + "grad_norm": 1.7833689451217651, + "learning_rate": 4.2693516268491715e-05, + "loss": 4.5013, + "step": 41989 + }, + { + "epoch": 0.24972642496907413, + "grad_norm": 1.386513113975525, + "learning_rate": 4.26931862732059e-05, + "loss": 4.714, + "step": 41990 + }, + { + "epoch": 0.24973237225235512, + "grad_norm": 1.8435289859771729, + "learning_rate": 4.269285627174358e-05, + "loss": 4.298, + "step": 41991 + }, + { + "epoch": 0.24973831953563613, + "grad_norm": 1.7148083448410034, + "learning_rate": 4.2692526264104855e-05, + "loss": 4.3501, + "step": 41992 + }, + { + "epoch": 0.24974426681891712, + "grad_norm": 2.1411547660827637, + "learning_rate": 4.2692196250289854e-05, + "loss": 3.7185, + "step": 41993 + }, + { + "epoch": 0.2497502141021981, + "grad_norm": 2.058345079421997, + "learning_rate": 4.2691866230298694e-05, + "loss": 3.8084, + "step": 41994 + }, + { + "epoch": 0.24975616138547913, + "grad_norm": 2.2572317123413086, + "learning_rate": 4.2691536204131486e-05, + "loss": 3.2308, + "step": 41995 + }, + { + "epoch": 0.2497621086687601, + "grad_norm": 1.9866859912872314, + "learning_rate": 4.2691206171788336e-05, + "loss": 3.3118, + "step": 41996 + }, + { + "epoch": 0.2497680559520411, + "grad_norm": 1.7559192180633545, + "learning_rate": 4.269087613326938e-05, + "loss": 3.8865, + "step": 41997 + }, + { + "epoch": 0.24977400323532212, + "grad_norm": 2.048192262649536, + "learning_rate": 4.269054608857472e-05, + "loss": 3.7505, + "step": 41998 + }, + { + "epoch": 0.2497799505186031, + "grad_norm": 1.8734562397003174, + "learning_rate": 4.269021603770447e-05, + "loss": 3.3218, + "step": 41999 + }, + { + "epoch": 0.2497858978018841, + "grad_norm": 1.73810613155365, + "learning_rate": 4.268988598065875e-05, + "loss": 4.0167, + "step": 42000 + }, + { + "epoch": 0.2497918450851651, + "grad_norm": 2.1857659816741943, + "learning_rate": 4.268955591743767e-05, + "loss": 3.4139, + "step": 42001 + }, + { + "epoch": 0.2497977923684461, + "grad_norm": 2.1950178146362305, + "learning_rate": 4.2689225848041356e-05, + "loss": 4.1516, + "step": 42002 + }, + { + "epoch": 0.24980373965172709, + "grad_norm": 1.830642819404602, + "learning_rate": 4.268889577246991e-05, + "loss": 3.8779, + "step": 42003 + }, + { + "epoch": 0.2498096869350081, + "grad_norm": 1.5347234010696411, + "learning_rate": 4.268856569072346e-05, + "loss": 4.4696, + "step": 42004 + }, + { + "epoch": 0.2498156342182891, + "grad_norm": 1.4467964172363281, + "learning_rate": 4.268823560280211e-05, + "loss": 4.5637, + "step": 42005 + }, + { + "epoch": 0.24982158150157008, + "grad_norm": 1.7737587690353394, + "learning_rate": 4.2687905508705974e-05, + "loss": 4.1176, + "step": 42006 + }, + { + "epoch": 0.24982752878485107, + "grad_norm": 1.5177428722381592, + "learning_rate": 4.268757540843517e-05, + "loss": 4.7145, + "step": 42007 + }, + { + "epoch": 0.24983347606813208, + "grad_norm": 1.7481968402862549, + "learning_rate": 4.2687245301989834e-05, + "loss": 3.9384, + "step": 42008 + }, + { + "epoch": 0.24983942335141307, + "grad_norm": 1.6744271516799927, + "learning_rate": 4.268691518937005e-05, + "loss": 4.6336, + "step": 42009 + }, + { + "epoch": 0.24984537063469406, + "grad_norm": 1.5110716819763184, + "learning_rate": 4.2686585070575955e-05, + "loss": 4.831, + "step": 42010 + }, + { + "epoch": 0.24985131791797507, + "grad_norm": 1.51256263256073, + "learning_rate": 4.268625494560765e-05, + "loss": 4.7713, + "step": 42011 + }, + { + "epoch": 0.24985726520125606, + "grad_norm": 2.0537006855010986, + "learning_rate": 4.268592481446526e-05, + "loss": 3.7941, + "step": 42012 + }, + { + "epoch": 0.24986321248453705, + "grad_norm": 1.9153227806091309, + "learning_rate": 4.268559467714889e-05, + "loss": 4.4509, + "step": 42013 + }, + { + "epoch": 0.24986915976781807, + "grad_norm": 1.644529938697815, + "learning_rate": 4.268526453365867e-05, + "loss": 4.6224, + "step": 42014 + }, + { + "epoch": 0.24987510705109905, + "grad_norm": 1.649513602256775, + "learning_rate": 4.2684934383994704e-05, + "loss": 4.3099, + "step": 42015 + }, + { + "epoch": 0.24988105433438004, + "grad_norm": 1.5516279935836792, + "learning_rate": 4.2684604228157105e-05, + "loss": 4.5685, + "step": 42016 + }, + { + "epoch": 0.24988700161766106, + "grad_norm": 1.5081168413162231, + "learning_rate": 4.2684274066146e-05, + "loss": 4.3069, + "step": 42017 + }, + { + "epoch": 0.24989294890094205, + "grad_norm": 1.398681640625, + "learning_rate": 4.2683943897961485e-05, + "loss": 4.455, + "step": 42018 + }, + { + "epoch": 0.24989889618422303, + "grad_norm": 1.4521502256393433, + "learning_rate": 4.268361372360371e-05, + "loss": 4.2511, + "step": 42019 + }, + { + "epoch": 0.24990484346750405, + "grad_norm": 1.5764449834823608, + "learning_rate": 4.268328354307275e-05, + "loss": 4.5032, + "step": 42020 + }, + { + "epoch": 0.24991079075078504, + "grad_norm": 1.9262803792953491, + "learning_rate": 4.2682953356368746e-05, + "loss": 4.6235, + "step": 42021 + }, + { + "epoch": 0.24991673803406603, + "grad_norm": 1.482417345046997, + "learning_rate": 4.26826231634918e-05, + "loss": 4.7804, + "step": 42022 + }, + { + "epoch": 0.24992268531734704, + "grad_norm": 1.8463369607925415, + "learning_rate": 4.268229296444204e-05, + "loss": 5.177, + "step": 42023 + }, + { + "epoch": 0.24992863260062803, + "grad_norm": 1.6022560596466064, + "learning_rate": 4.268196275921957e-05, + "loss": 4.6181, + "step": 42024 + }, + { + "epoch": 0.24993457988390902, + "grad_norm": 1.3762532472610474, + "learning_rate": 4.268163254782451e-05, + "loss": 4.6078, + "step": 42025 + }, + { + "epoch": 0.24994052716719004, + "grad_norm": 1.5606143474578857, + "learning_rate": 4.2681302330256973e-05, + "loss": 4.5704, + "step": 42026 + }, + { + "epoch": 0.24994647445047102, + "grad_norm": 1.5168718099594116, + "learning_rate": 4.268097210651707e-05, + "loss": 4.7955, + "step": 42027 + }, + { + "epoch": 0.249952421733752, + "grad_norm": 1.7735203504562378, + "learning_rate": 4.268064187660494e-05, + "loss": 4.0775, + "step": 42028 + }, + { + "epoch": 0.24995836901703303, + "grad_norm": 1.357080101966858, + "learning_rate": 4.2680311640520665e-05, + "loss": 4.6047, + "step": 42029 + }, + { + "epoch": 0.24996431630031402, + "grad_norm": 2.0982635021209717, + "learning_rate": 4.267998139826439e-05, + "loss": 4.6932, + "step": 42030 + }, + { + "epoch": 0.249970263583595, + "grad_norm": 1.7636460065841675, + "learning_rate": 4.26796511498362e-05, + "loss": 4.6443, + "step": 42031 + }, + { + "epoch": 0.24997621086687602, + "grad_norm": 1.4254063367843628, + "learning_rate": 4.267932089523624e-05, + "loss": 5.0191, + "step": 42032 + }, + { + "epoch": 0.249982158150157, + "grad_norm": 1.5380072593688965, + "learning_rate": 4.2678990634464596e-05, + "loss": 4.4603, + "step": 42033 + }, + { + "epoch": 0.249988105433438, + "grad_norm": 1.6421701908111572, + "learning_rate": 4.2678660367521415e-05, + "loss": 4.6337, + "step": 42034 + }, + { + "epoch": 0.249994052716719, + "grad_norm": 1.4354192018508911, + "learning_rate": 4.267833009440679e-05, + "loss": 4.529, + "step": 42035 + }, + { + "epoch": 0.25, + "grad_norm": 1.528843879699707, + "learning_rate": 4.2677999815120836e-05, + "loss": 4.7061, + "step": 42036 + }, + { + "epoch": 0.250005947283281, + "grad_norm": 1.5022860765457153, + "learning_rate": 4.267766952966369e-05, + "loss": 4.584, + "step": 42037 + }, + { + "epoch": 0.250011894566562, + "grad_norm": 1.5518206357955933, + "learning_rate": 4.267733923803544e-05, + "loss": 4.2487, + "step": 42038 + }, + { + "epoch": 0.250017841849843, + "grad_norm": 1.6720249652862549, + "learning_rate": 4.2677008940236224e-05, + "loss": 4.2693, + "step": 42039 + }, + { + "epoch": 0.250023789133124, + "grad_norm": 1.9149385690689087, + "learning_rate": 4.267667863626614e-05, + "loss": 4.1177, + "step": 42040 + }, + { + "epoch": 0.25002973641640497, + "grad_norm": 1.8054261207580566, + "learning_rate": 4.2676348326125304e-05, + "loss": 2.3348, + "step": 42041 + }, + { + "epoch": 0.250035683699686, + "grad_norm": 1.350380301475525, + "learning_rate": 4.267601800981385e-05, + "loss": 4.5777, + "step": 42042 + }, + { + "epoch": 0.250041630982967, + "grad_norm": 1.3127453327178955, + "learning_rate": 4.267568768733187e-05, + "loss": 4.5939, + "step": 42043 + }, + { + "epoch": 0.25004757826624796, + "grad_norm": 2.013850688934326, + "learning_rate": 4.2675357358679493e-05, + "loss": 4.1044, + "step": 42044 + }, + { + "epoch": 0.250053525549529, + "grad_norm": 2.967409133911133, + "learning_rate": 4.267502702385684e-05, + "loss": 2.9048, + "step": 42045 + }, + { + "epoch": 0.25005947283281, + "grad_norm": 2.284320592880249, + "learning_rate": 4.267469668286401e-05, + "loss": 3.569, + "step": 42046 + }, + { + "epoch": 0.25006542011609095, + "grad_norm": 1.3512487411499023, + "learning_rate": 4.267436633570112e-05, + "loss": 4.4494, + "step": 42047 + }, + { + "epoch": 0.25007136739937197, + "grad_norm": 1.6354233026504517, + "learning_rate": 4.26740359823683e-05, + "loss": 4.3988, + "step": 42048 + }, + { + "epoch": 0.250077314682653, + "grad_norm": 1.5703768730163574, + "learning_rate": 4.2673705622865655e-05, + "loss": 4.4087, + "step": 42049 + }, + { + "epoch": 0.25008326196593395, + "grad_norm": 1.5815764665603638, + "learning_rate": 4.26733752571933e-05, + "loss": 3.9309, + "step": 42050 + }, + { + "epoch": 0.25008920924921496, + "grad_norm": 1.8369239568710327, + "learning_rate": 4.267304488535136e-05, + "loss": 4.2202, + "step": 42051 + }, + { + "epoch": 0.250095156532496, + "grad_norm": 1.6301475763320923, + "learning_rate": 4.267271450733994e-05, + "loss": 4.1156, + "step": 42052 + }, + { + "epoch": 0.25010110381577694, + "grad_norm": 1.7693965435028076, + "learning_rate": 4.267238412315915e-05, + "loss": 4.7073, + "step": 42053 + }, + { + "epoch": 0.25010705109905795, + "grad_norm": 1.7500617504119873, + "learning_rate": 4.267205373280912e-05, + "loss": 4.5884, + "step": 42054 + }, + { + "epoch": 0.25011299838233897, + "grad_norm": 1.4837998151779175, + "learning_rate": 4.267172333628995e-05, + "loss": 4.6624, + "step": 42055 + }, + { + "epoch": 0.25011894566561993, + "grad_norm": 1.9075064659118652, + "learning_rate": 4.267139293360177e-05, + "loss": 4.5834, + "step": 42056 + }, + { + "epoch": 0.25012489294890095, + "grad_norm": 1.6215492486953735, + "learning_rate": 4.26710625247447e-05, + "loss": 4.6217, + "step": 42057 + }, + { + "epoch": 0.25013084023218196, + "grad_norm": 1.433763027191162, + "learning_rate": 4.2670732109718826e-05, + "loss": 4.4596, + "step": 42058 + }, + { + "epoch": 0.2501367875154629, + "grad_norm": 1.525185465812683, + "learning_rate": 4.267040168852429e-05, + "loss": 4.3899, + "step": 42059 + }, + { + "epoch": 0.25014273479874394, + "grad_norm": 1.7129361629486084, + "learning_rate": 4.267007126116121e-05, + "loss": 4.5663, + "step": 42060 + }, + { + "epoch": 0.25014868208202495, + "grad_norm": 1.5235825777053833, + "learning_rate": 4.266974082762967e-05, + "loss": 4.649, + "step": 42061 + }, + { + "epoch": 0.2501546293653059, + "grad_norm": 1.44434654712677, + "learning_rate": 4.2669410387929823e-05, + "loss": 4.5442, + "step": 42062 + }, + { + "epoch": 0.25016057664858693, + "grad_norm": 1.4174541234970093, + "learning_rate": 4.266907994206176e-05, + "loss": 4.5513, + "step": 42063 + }, + { + "epoch": 0.25016652393186795, + "grad_norm": 1.7328169345855713, + "learning_rate": 4.26687494900256e-05, + "loss": 4.5108, + "step": 42064 + }, + { + "epoch": 0.2501724712151489, + "grad_norm": 1.5344183444976807, + "learning_rate": 4.266841903182147e-05, + "loss": 4.3182, + "step": 42065 + }, + { + "epoch": 0.2501784184984299, + "grad_norm": 1.5908042192459106, + "learning_rate": 4.266808856744947e-05, + "loss": 4.474, + "step": 42066 + }, + { + "epoch": 0.25018436578171094, + "grad_norm": 1.3672078847885132, + "learning_rate": 4.266775809690973e-05, + "loss": 4.4647, + "step": 42067 + }, + { + "epoch": 0.2501903130649919, + "grad_norm": 1.7701510190963745, + "learning_rate": 4.266742762020235e-05, + "loss": 4.3699, + "step": 42068 + }, + { + "epoch": 0.2501962603482729, + "grad_norm": 1.456886887550354, + "learning_rate": 4.266709713732746e-05, + "loss": 4.4679, + "step": 42069 + }, + { + "epoch": 0.25020220763155393, + "grad_norm": 1.5491695404052734, + "learning_rate": 4.2666766648285174e-05, + "loss": 4.3308, + "step": 42070 + }, + { + "epoch": 0.2502081549148349, + "grad_norm": 1.707772970199585, + "learning_rate": 4.266643615307559e-05, + "loss": 4.4662, + "step": 42071 + }, + { + "epoch": 0.2502141021981159, + "grad_norm": 1.6241666078567505, + "learning_rate": 4.266610565169884e-05, + "loss": 4.4731, + "step": 42072 + }, + { + "epoch": 0.2502200494813969, + "grad_norm": 1.611916184425354, + "learning_rate": 4.266577514415504e-05, + "loss": 4.2956, + "step": 42073 + }, + { + "epoch": 0.2502259967646779, + "grad_norm": 1.7912410497665405, + "learning_rate": 4.26654446304443e-05, + "loss": 4.4148, + "step": 42074 + }, + { + "epoch": 0.2502319440479589, + "grad_norm": 1.4658359289169312, + "learning_rate": 4.266511411056673e-05, + "loss": 4.4398, + "step": 42075 + }, + { + "epoch": 0.2502378913312399, + "grad_norm": 1.44290030002594, + "learning_rate": 4.2664783584522446e-05, + "loss": 4.3622, + "step": 42076 + }, + { + "epoch": 0.2502438386145209, + "grad_norm": 1.5542776584625244, + "learning_rate": 4.266445305231158e-05, + "loss": 4.3709, + "step": 42077 + }, + { + "epoch": 0.2502497858978019, + "grad_norm": 1.4779601097106934, + "learning_rate": 4.2664122513934236e-05, + "loss": 4.4298, + "step": 42078 + }, + { + "epoch": 0.2502557331810829, + "grad_norm": 1.459057092666626, + "learning_rate": 4.266379196939052e-05, + "loss": 4.3654, + "step": 42079 + }, + { + "epoch": 0.25026168046436387, + "grad_norm": 2.0005850791931152, + "learning_rate": 4.266346141868057e-05, + "loss": 3.7136, + "step": 42080 + }, + { + "epoch": 0.2502676277476449, + "grad_norm": 1.5165681838989258, + "learning_rate": 4.266313086180448e-05, + "loss": 4.2271, + "step": 42081 + }, + { + "epoch": 0.2502735750309259, + "grad_norm": 2.0624310970306396, + "learning_rate": 4.266280029876237e-05, + "loss": 4.2353, + "step": 42082 + }, + { + "epoch": 0.25027952231420686, + "grad_norm": 1.7126058340072632, + "learning_rate": 4.2662469729554365e-05, + "loss": 4.2384, + "step": 42083 + }, + { + "epoch": 0.2502854695974879, + "grad_norm": 2.071988344192505, + "learning_rate": 4.266213915418057e-05, + "loss": 3.5715, + "step": 42084 + }, + { + "epoch": 0.2502914168807689, + "grad_norm": 2.084437131881714, + "learning_rate": 4.266180857264111e-05, + "loss": 3.3741, + "step": 42085 + }, + { + "epoch": 0.25029736416404985, + "grad_norm": 2.3012595176696777, + "learning_rate": 4.2661477984936086e-05, + "loss": 3.7379, + "step": 42086 + }, + { + "epoch": 0.25030331144733087, + "grad_norm": 2.5668399333953857, + "learning_rate": 4.266114739106564e-05, + "loss": 3.6191, + "step": 42087 + }, + { + "epoch": 0.2503092587306119, + "grad_norm": 1.4688360691070557, + "learning_rate": 4.266081679102985e-05, + "loss": 3.89, + "step": 42088 + }, + { + "epoch": 0.25031520601389284, + "grad_norm": 1.6073497533798218, + "learning_rate": 4.266048618482886e-05, + "loss": 3.8644, + "step": 42089 + }, + { + "epoch": 0.25032115329717386, + "grad_norm": 2.1671597957611084, + "learning_rate": 4.266015557246278e-05, + "loss": 4.1577, + "step": 42090 + }, + { + "epoch": 0.2503271005804549, + "grad_norm": 1.8815988302230835, + "learning_rate": 4.265982495393172e-05, + "loss": 4.4337, + "step": 42091 + }, + { + "epoch": 0.25033304786373584, + "grad_norm": 1.6217626333236694, + "learning_rate": 4.2659494329235795e-05, + "loss": 4.5618, + "step": 42092 + }, + { + "epoch": 0.25033899514701685, + "grad_norm": 1.5449515581130981, + "learning_rate": 4.265916369837512e-05, + "loss": 4.7727, + "step": 42093 + }, + { + "epoch": 0.25034494243029787, + "grad_norm": 1.5954989194869995, + "learning_rate": 4.265883306134982e-05, + "loss": 4.5698, + "step": 42094 + }, + { + "epoch": 0.25035088971357883, + "grad_norm": 2.05464768409729, + "learning_rate": 4.265850241816e-05, + "loss": 4.508, + "step": 42095 + }, + { + "epoch": 0.25035683699685984, + "grad_norm": 1.6314268112182617, + "learning_rate": 4.265817176880578e-05, + "loss": 4.4186, + "step": 42096 + }, + { + "epoch": 0.2503627842801408, + "grad_norm": 1.8411312103271484, + "learning_rate": 4.265784111328728e-05, + "loss": 4.2244, + "step": 42097 + }, + { + "epoch": 0.2503687315634218, + "grad_norm": 2.144435405731201, + "learning_rate": 4.26575104516046e-05, + "loss": 4.5806, + "step": 42098 + }, + { + "epoch": 0.25037467884670284, + "grad_norm": 1.7587811946868896, + "learning_rate": 4.265717978375787e-05, + "loss": 4.2473, + "step": 42099 + }, + { + "epoch": 0.2503806261299838, + "grad_norm": 1.9845777750015259, + "learning_rate": 4.26568491097472e-05, + "loss": 4.4811, + "step": 42100 + }, + { + "epoch": 0.2503865734132648, + "grad_norm": 1.6762166023254395, + "learning_rate": 4.265651842957271e-05, + "loss": 4.5717, + "step": 42101 + }, + { + "epoch": 0.25039252069654583, + "grad_norm": 2.034278392791748, + "learning_rate": 4.265618774323451e-05, + "loss": 4.4307, + "step": 42102 + }, + { + "epoch": 0.2503984679798268, + "grad_norm": 1.592279314994812, + "learning_rate": 4.2655857050732715e-05, + "loss": 4.5204, + "step": 42103 + }, + { + "epoch": 0.2504044152631078, + "grad_norm": 1.3252283334732056, + "learning_rate": 4.265552635206745e-05, + "loss": 4.6936, + "step": 42104 + }, + { + "epoch": 0.2504103625463888, + "grad_norm": 1.6764671802520752, + "learning_rate": 4.265519564723882e-05, + "loss": 4.3694, + "step": 42105 + }, + { + "epoch": 0.2504163098296698, + "grad_norm": 1.5191102027893066, + "learning_rate": 4.265486493624694e-05, + "loss": 4.802, + "step": 42106 + }, + { + "epoch": 0.2504222571129508, + "grad_norm": 1.558970332145691, + "learning_rate": 4.265453421909193e-05, + "loss": 4.6372, + "step": 42107 + }, + { + "epoch": 0.2504282043962318, + "grad_norm": 1.7597107887268066, + "learning_rate": 4.2654203495773904e-05, + "loss": 4.5093, + "step": 42108 + }, + { + "epoch": 0.2504341516795128, + "grad_norm": 1.9210811853408813, + "learning_rate": 4.265387276629298e-05, + "loss": 4.2764, + "step": 42109 + }, + { + "epoch": 0.2504400989627938, + "grad_norm": 1.5814526081085205, + "learning_rate": 4.265354203064927e-05, + "loss": 4.251, + "step": 42110 + }, + { + "epoch": 0.2504460462460748, + "grad_norm": 1.8115946054458618, + "learning_rate": 4.265321128884289e-05, + "loss": 4.2888, + "step": 42111 + }, + { + "epoch": 0.25045199352935577, + "grad_norm": 1.5294297933578491, + "learning_rate": 4.265288054087396e-05, + "loss": 4.2098, + "step": 42112 + }, + { + "epoch": 0.2504579408126368, + "grad_norm": 1.7386729717254639, + "learning_rate": 4.265254978674259e-05, + "loss": 4.0718, + "step": 42113 + }, + { + "epoch": 0.2504638880959178, + "grad_norm": 1.5904954671859741, + "learning_rate": 4.265221902644889e-05, + "loss": 4.0554, + "step": 42114 + }, + { + "epoch": 0.25046983537919876, + "grad_norm": 1.462743878364563, + "learning_rate": 4.265188825999299e-05, + "loss": 3.8362, + "step": 42115 + }, + { + "epoch": 0.2504757826624798, + "grad_norm": 1.4958914518356323, + "learning_rate": 4.2651557487375005e-05, + "loss": 3.7237, + "step": 42116 + }, + { + "epoch": 0.2504817299457608, + "grad_norm": 1.617293119430542, + "learning_rate": 4.2651226708595035e-05, + "loss": 3.6459, + "step": 42117 + }, + { + "epoch": 0.25048767722904175, + "grad_norm": 1.6562820672988892, + "learning_rate": 4.2650895923653205e-05, + "loss": 3.8475, + "step": 42118 + }, + { + "epoch": 0.25049362451232277, + "grad_norm": 1.307482123374939, + "learning_rate": 4.265056513254963e-05, + "loss": 3.9954, + "step": 42119 + }, + { + "epoch": 0.2504995717956038, + "grad_norm": 1.614328145980835, + "learning_rate": 4.265023433528442e-05, + "loss": 3.6821, + "step": 42120 + }, + { + "epoch": 0.25050551907888474, + "grad_norm": 1.6071339845657349, + "learning_rate": 4.26499035318577e-05, + "loss": 3.7577, + "step": 42121 + }, + { + "epoch": 0.25051146636216576, + "grad_norm": 1.4299596548080444, + "learning_rate": 4.264957272226959e-05, + "loss": 3.7763, + "step": 42122 + }, + { + "epoch": 0.2505174136454468, + "grad_norm": 1.5733373165130615, + "learning_rate": 4.264924190652019e-05, + "loss": 3.7481, + "step": 42123 + }, + { + "epoch": 0.25052336092872773, + "grad_norm": 1.534377932548523, + "learning_rate": 4.2648911084609614e-05, + "loss": 3.663, + "step": 42124 + }, + { + "epoch": 0.25052930821200875, + "grad_norm": 1.7360037565231323, + "learning_rate": 4.2648580256538e-05, + "loss": 3.8282, + "step": 42125 + }, + { + "epoch": 0.25053525549528977, + "grad_norm": 1.7501121759414673, + "learning_rate": 4.264824942230543e-05, + "loss": 3.963, + "step": 42126 + }, + { + "epoch": 0.2505412027785707, + "grad_norm": 1.5419350862503052, + "learning_rate": 4.264791858191205e-05, + "loss": 4.0473, + "step": 42127 + }, + { + "epoch": 0.25054715006185174, + "grad_norm": 1.4971565008163452, + "learning_rate": 4.264758773535796e-05, + "loss": 3.8914, + "step": 42128 + }, + { + "epoch": 0.25055309734513276, + "grad_norm": 1.5687803030014038, + "learning_rate": 4.264725688264329e-05, + "loss": 3.8356, + "step": 42129 + }, + { + "epoch": 0.2505590446284137, + "grad_norm": 1.7192223072052002, + "learning_rate": 4.2646926023768135e-05, + "loss": 4.0895, + "step": 42130 + }, + { + "epoch": 0.25056499191169473, + "grad_norm": 1.727156162261963, + "learning_rate": 4.2646595158732625e-05, + "loss": 4.7627, + "step": 42131 + }, + { + "epoch": 0.25057093919497575, + "grad_norm": 1.6877679824829102, + "learning_rate": 4.264626428753687e-05, + "loss": 5.005, + "step": 42132 + }, + { + "epoch": 0.2505768864782567, + "grad_norm": 1.3795956373214722, + "learning_rate": 4.2645933410180985e-05, + "loss": 4.6297, + "step": 42133 + }, + { + "epoch": 0.2505828337615377, + "grad_norm": 1.704483151435852, + "learning_rate": 4.2645602526665086e-05, + "loss": 5.4002, + "step": 42134 + }, + { + "epoch": 0.25058878104481874, + "grad_norm": 1.6167612075805664, + "learning_rate": 4.264527163698929e-05, + "loss": 5.0479, + "step": 42135 + }, + { + "epoch": 0.2505947283280997, + "grad_norm": 2.043224811553955, + "learning_rate": 4.264494074115372e-05, + "loss": 4.2093, + "step": 42136 + }, + { + "epoch": 0.2506006756113807, + "grad_norm": 1.465388536453247, + "learning_rate": 4.264460983915847e-05, + "loss": 5.3243, + "step": 42137 + }, + { + "epoch": 0.25060662289466173, + "grad_norm": 1.9489941596984863, + "learning_rate": 4.264427893100368e-05, + "loss": 4.3789, + "step": 42138 + }, + { + "epoch": 0.2506125701779427, + "grad_norm": 2.0791471004486084, + "learning_rate": 4.264394801668945e-05, + "loss": 3.8678, + "step": 42139 + }, + { + "epoch": 0.2506185174612237, + "grad_norm": 1.805111289024353, + "learning_rate": 4.2643617096215895e-05, + "loss": 4.6286, + "step": 42140 + }, + { + "epoch": 0.2506244647445047, + "grad_norm": 1.6936253309249878, + "learning_rate": 4.264328616958314e-05, + "loss": 4.9935, + "step": 42141 + }, + { + "epoch": 0.2506304120277857, + "grad_norm": 1.9786884784698486, + "learning_rate": 4.2642955236791294e-05, + "loss": 4.4806, + "step": 42142 + }, + { + "epoch": 0.2506363593110667, + "grad_norm": 1.6515127420425415, + "learning_rate": 4.2642624297840484e-05, + "loss": 4.896, + "step": 42143 + }, + { + "epoch": 0.2506423065943477, + "grad_norm": 1.5793496370315552, + "learning_rate": 4.26422933527308e-05, + "loss": 4.9196, + "step": 42144 + }, + { + "epoch": 0.2506482538776287, + "grad_norm": 1.5564234256744385, + "learning_rate": 4.264196240146239e-05, + "loss": 4.1682, + "step": 42145 + }, + { + "epoch": 0.2506542011609097, + "grad_norm": 1.424389123916626, + "learning_rate": 4.2641631444035345e-05, + "loss": 4.1293, + "step": 42146 + }, + { + "epoch": 0.2506601484441907, + "grad_norm": 1.909745216369629, + "learning_rate": 4.264130048044979e-05, + "loss": 4.2544, + "step": 42147 + }, + { + "epoch": 0.25066609572747167, + "grad_norm": 3.716282844543457, + "learning_rate": 4.264096951070583e-05, + "loss": 4.3144, + "step": 42148 + }, + { + "epoch": 0.2506720430107527, + "grad_norm": 2.5402214527130127, + "learning_rate": 4.26406385348036e-05, + "loss": 3.7083, + "step": 42149 + }, + { + "epoch": 0.2506779902940337, + "grad_norm": 2.350353240966797, + "learning_rate": 4.2640307552743206e-05, + "loss": 3.7031, + "step": 42150 + }, + { + "epoch": 0.25068393757731466, + "grad_norm": 2.772550344467163, + "learning_rate": 4.2639976564524765e-05, + "loss": 3.8104, + "step": 42151 + }, + { + "epoch": 0.2506898848605957, + "grad_norm": 2.059577703475952, + "learning_rate": 4.263964557014839e-05, + "loss": 3.7054, + "step": 42152 + }, + { + "epoch": 0.2506958321438767, + "grad_norm": 2.032790422439575, + "learning_rate": 4.263931456961419e-05, + "loss": 4.0354, + "step": 42153 + }, + { + "epoch": 0.25070177942715766, + "grad_norm": 1.601123332977295, + "learning_rate": 4.2638983562922284e-05, + "loss": 4.7986, + "step": 42154 + }, + { + "epoch": 0.25070772671043867, + "grad_norm": 2.1980252265930176, + "learning_rate": 4.26386525500728e-05, + "loss": 4.5324, + "step": 42155 + }, + { + "epoch": 0.2507136739937197, + "grad_norm": 2.581681251525879, + "learning_rate": 4.2638321531065845e-05, + "loss": 3.8675, + "step": 42156 + }, + { + "epoch": 0.25071962127700065, + "grad_norm": 2.3103926181793213, + "learning_rate": 4.2637990505901534e-05, + "loss": 3.8985, + "step": 42157 + }, + { + "epoch": 0.25072556856028166, + "grad_norm": 2.6509687900543213, + "learning_rate": 4.263765947457998e-05, + "loss": 3.8234, + "step": 42158 + }, + { + "epoch": 0.2507315158435627, + "grad_norm": 2.092885971069336, + "learning_rate": 4.26373284371013e-05, + "loss": 3.8692, + "step": 42159 + }, + { + "epoch": 0.25073746312684364, + "grad_norm": 1.6061232089996338, + "learning_rate": 4.263699739346561e-05, + "loss": 4.7274, + "step": 42160 + }, + { + "epoch": 0.25074341041012466, + "grad_norm": 2.443000316619873, + "learning_rate": 4.263666634367303e-05, + "loss": 4.219, + "step": 42161 + }, + { + "epoch": 0.2507493576934057, + "grad_norm": 2.089714765548706, + "learning_rate": 4.2636335287723675e-05, + "loss": 4.703, + "step": 42162 + }, + { + "epoch": 0.25075530497668663, + "grad_norm": 1.6264740228652954, + "learning_rate": 4.2636004225617655e-05, + "loss": 4.7095, + "step": 42163 + }, + { + "epoch": 0.25076125225996765, + "grad_norm": 1.652134895324707, + "learning_rate": 4.263567315735508e-05, + "loss": 5.2302, + "step": 42164 + }, + { + "epoch": 0.25076719954324866, + "grad_norm": 1.6445026397705078, + "learning_rate": 4.263534208293608e-05, + "loss": 5.0313, + "step": 42165 + }, + { + "epoch": 0.2507731468265296, + "grad_norm": 1.6038885116577148, + "learning_rate": 4.2635011002360767e-05, + "loss": 4.8616, + "step": 42166 + }, + { + "epoch": 0.25077909410981064, + "grad_norm": 1.3363405466079712, + "learning_rate": 4.2634679915629256e-05, + "loss": 5.2041, + "step": 42167 + }, + { + "epoch": 0.25078504139309166, + "grad_norm": 1.4973708391189575, + "learning_rate": 4.263434882274165e-05, + "loss": 5.0672, + "step": 42168 + }, + { + "epoch": 0.2507909886763726, + "grad_norm": 2.285611152648926, + "learning_rate": 4.2634017723698084e-05, + "loss": 4.4075, + "step": 42169 + }, + { + "epoch": 0.25079693595965363, + "grad_norm": 2.313666582107544, + "learning_rate": 4.263368661849866e-05, + "loss": 3.7644, + "step": 42170 + }, + { + "epoch": 0.25080288324293465, + "grad_norm": 1.2857494354248047, + "learning_rate": 4.26333555071435e-05, + "loss": 4.9426, + "step": 42171 + }, + { + "epoch": 0.2508088305262156, + "grad_norm": 1.4575355052947998, + "learning_rate": 4.263302438963271e-05, + "loss": 4.7574, + "step": 42172 + }, + { + "epoch": 0.2508147778094966, + "grad_norm": 1.2783634662628174, + "learning_rate": 4.263269326596643e-05, + "loss": 4.8722, + "step": 42173 + }, + { + "epoch": 0.25082072509277764, + "grad_norm": 1.39179527759552, + "learning_rate": 4.263236213614475e-05, + "loss": 4.8145, + "step": 42174 + }, + { + "epoch": 0.2508266723760586, + "grad_norm": 1.3608152866363525, + "learning_rate": 4.263203100016779e-05, + "loss": 4.8968, + "step": 42175 + }, + { + "epoch": 0.2508326196593396, + "grad_norm": 1.4043036699295044, + "learning_rate": 4.263169985803568e-05, + "loss": 4.8275, + "step": 42176 + }, + { + "epoch": 0.25083856694262063, + "grad_norm": 1.534237027168274, + "learning_rate": 4.263136870974852e-05, + "loss": 4.7976, + "step": 42177 + }, + { + "epoch": 0.2508445142259016, + "grad_norm": 1.3425095081329346, + "learning_rate": 4.263103755530643e-05, + "loss": 4.8976, + "step": 42178 + }, + { + "epoch": 0.2508504615091826, + "grad_norm": 1.3370463848114014, + "learning_rate": 4.263070639470953e-05, + "loss": 4.8768, + "step": 42179 + }, + { + "epoch": 0.2508564087924636, + "grad_norm": 1.4002089500427246, + "learning_rate": 4.263037522795793e-05, + "loss": 4.8731, + "step": 42180 + }, + { + "epoch": 0.2508623560757446, + "grad_norm": 1.4456000328063965, + "learning_rate": 4.263004405505175e-05, + "loss": 4.7337, + "step": 42181 + }, + { + "epoch": 0.2508683033590256, + "grad_norm": 1.5707948207855225, + "learning_rate": 4.26297128759911e-05, + "loss": 4.8188, + "step": 42182 + }, + { + "epoch": 0.2508742506423066, + "grad_norm": 1.3954793214797974, + "learning_rate": 4.262938169077611e-05, + "loss": 4.7276, + "step": 42183 + }, + { + "epoch": 0.2508801979255876, + "grad_norm": 1.3676509857177734, + "learning_rate": 4.2629050499406876e-05, + "loss": 4.7902, + "step": 42184 + }, + { + "epoch": 0.2508861452088686, + "grad_norm": 1.4125148057937622, + "learning_rate": 4.2628719301883524e-05, + "loss": 4.7988, + "step": 42185 + }, + { + "epoch": 0.2508920924921496, + "grad_norm": 1.3077661991119385, + "learning_rate": 4.262838809820616e-05, + "loss": 4.828, + "step": 42186 + }, + { + "epoch": 0.25089803977543057, + "grad_norm": 1.3228203058242798, + "learning_rate": 4.262805688837492e-05, + "loss": 4.8118, + "step": 42187 + }, + { + "epoch": 0.2509039870587116, + "grad_norm": 1.31549870967865, + "learning_rate": 4.2627725672389905e-05, + "loss": 4.7873, + "step": 42188 + }, + { + "epoch": 0.2509099343419926, + "grad_norm": 1.3714364767074585, + "learning_rate": 4.262739445025123e-05, + "loss": 4.7144, + "step": 42189 + }, + { + "epoch": 0.25091588162527356, + "grad_norm": 1.4840455055236816, + "learning_rate": 4.262706322195901e-05, + "loss": 4.6474, + "step": 42190 + }, + { + "epoch": 0.2509218289085546, + "grad_norm": 1.1778860092163086, + "learning_rate": 4.2626731987513376e-05, + "loss": 4.8507, + "step": 42191 + }, + { + "epoch": 0.2509277761918356, + "grad_norm": 1.4688011407852173, + "learning_rate": 4.262640074691443e-05, + "loss": 4.7676, + "step": 42192 + }, + { + "epoch": 0.25093372347511655, + "grad_norm": 1.3755977153778076, + "learning_rate": 4.262606950016228e-05, + "loss": 4.7572, + "step": 42193 + }, + { + "epoch": 0.25093967075839757, + "grad_norm": 1.3589004278182983, + "learning_rate": 4.262573824725706e-05, + "loss": 4.7052, + "step": 42194 + }, + { + "epoch": 0.2509456180416786, + "grad_norm": 1.4021984338760376, + "learning_rate": 4.2625406988198865e-05, + "loss": 4.7197, + "step": 42195 + }, + { + "epoch": 0.25095156532495955, + "grad_norm": 1.428701639175415, + "learning_rate": 4.2625075722987836e-05, + "loss": 4.7006, + "step": 42196 + }, + { + "epoch": 0.25095751260824056, + "grad_norm": 1.4698964357376099, + "learning_rate": 4.262474445162407e-05, + "loss": 4.6469, + "step": 42197 + }, + { + "epoch": 0.2509634598915216, + "grad_norm": 1.265511155128479, + "learning_rate": 4.262441317410769e-05, + "loss": 4.6119, + "step": 42198 + }, + { + "epoch": 0.25096940717480254, + "grad_norm": 1.3775991201400757, + "learning_rate": 4.2624081890438804e-05, + "loss": 4.8126, + "step": 42199 + }, + { + "epoch": 0.25097535445808355, + "grad_norm": 1.3639841079711914, + "learning_rate": 4.262375060061754e-05, + "loss": 5.1488, + "step": 42200 + }, + { + "epoch": 0.25098130174136457, + "grad_norm": 1.5410248041152954, + "learning_rate": 4.2623419304644e-05, + "loss": 4.8318, + "step": 42201 + }, + { + "epoch": 0.25098724902464553, + "grad_norm": 1.3717808723449707, + "learning_rate": 4.2623088002518306e-05, + "loss": 4.7982, + "step": 42202 + }, + { + "epoch": 0.25099319630792655, + "grad_norm": 1.3667842149734497, + "learning_rate": 4.2622756694240585e-05, + "loss": 4.7716, + "step": 42203 + }, + { + "epoch": 0.25099914359120756, + "grad_norm": 1.3935704231262207, + "learning_rate": 4.2622425379810924e-05, + "loss": 4.7903, + "step": 42204 + }, + { + "epoch": 0.2510050908744885, + "grad_norm": 1.5518251657485962, + "learning_rate": 4.2622094059229473e-05, + "loss": 4.8826, + "step": 42205 + }, + { + "epoch": 0.25101103815776954, + "grad_norm": 1.3695787191390991, + "learning_rate": 4.262176273249633e-05, + "loss": 4.7791, + "step": 42206 + }, + { + "epoch": 0.25101698544105056, + "grad_norm": 1.414014458656311, + "learning_rate": 4.26214313996116e-05, + "loss": 4.8473, + "step": 42207 + }, + { + "epoch": 0.2510229327243315, + "grad_norm": 1.4037078619003296, + "learning_rate": 4.2621100060575416e-05, + "loss": 4.7894, + "step": 42208 + }, + { + "epoch": 0.25102888000761253, + "grad_norm": 1.3612172603607178, + "learning_rate": 4.2620768715387896e-05, + "loss": 4.8292, + "step": 42209 + }, + { + "epoch": 0.25103482729089355, + "grad_norm": 1.3169488906860352, + "learning_rate": 4.262043736404914e-05, + "loss": 4.8003, + "step": 42210 + }, + { + "epoch": 0.2510407745741745, + "grad_norm": 1.2903103828430176, + "learning_rate": 4.2620106006559266e-05, + "loss": 4.7412, + "step": 42211 + }, + { + "epoch": 0.2510467218574555, + "grad_norm": 1.9887945652008057, + "learning_rate": 4.26197746429184e-05, + "loss": 5.0426, + "step": 42212 + }, + { + "epoch": 0.2510526691407365, + "grad_norm": 1.4946104288101196, + "learning_rate": 4.261944327312665e-05, + "loss": 4.713, + "step": 42213 + }, + { + "epoch": 0.2510586164240175, + "grad_norm": 1.5016955137252808, + "learning_rate": 4.2619111897184145e-05, + "loss": 4.7583, + "step": 42214 + }, + { + "epoch": 0.2510645637072985, + "grad_norm": 1.9494094848632812, + "learning_rate": 4.2618780515090986e-05, + "loss": 4.7309, + "step": 42215 + }, + { + "epoch": 0.2510705109905795, + "grad_norm": 1.740973711013794, + "learning_rate": 4.261844912684728e-05, + "loss": 5.0331, + "step": 42216 + }, + { + "epoch": 0.2510764582738605, + "grad_norm": 1.5565954446792603, + "learning_rate": 4.2618117732453165e-05, + "loss": 4.7532, + "step": 42217 + }, + { + "epoch": 0.2510824055571415, + "grad_norm": 1.826711893081665, + "learning_rate": 4.261778633190875e-05, + "loss": 5.3567, + "step": 42218 + }, + { + "epoch": 0.25108835284042247, + "grad_norm": 1.800734519958496, + "learning_rate": 4.2617454925214147e-05, + "loss": 5.3804, + "step": 42219 + }, + { + "epoch": 0.2510943001237035, + "grad_norm": 1.6805843114852905, + "learning_rate": 4.261712351236947e-05, + "loss": 5.4218, + "step": 42220 + }, + { + "epoch": 0.2511002474069845, + "grad_norm": 1.653990387916565, + "learning_rate": 4.261679209337484e-05, + "loss": 4.8638, + "step": 42221 + }, + { + "epoch": 0.25110619469026546, + "grad_norm": 1.6348603963851929, + "learning_rate": 4.261646066823036e-05, + "loss": 4.8519, + "step": 42222 + }, + { + "epoch": 0.2511121419735465, + "grad_norm": 1.6120195388793945, + "learning_rate": 4.261612923693617e-05, + "loss": 5.2828, + "step": 42223 + }, + { + "epoch": 0.2511180892568275, + "grad_norm": 1.8813776969909668, + "learning_rate": 4.261579779949236e-05, + "loss": 5.0558, + "step": 42224 + }, + { + "epoch": 0.25112403654010845, + "grad_norm": 1.6883058547973633, + "learning_rate": 4.2615466355899066e-05, + "loss": 5.1081, + "step": 42225 + }, + { + "epoch": 0.25112998382338947, + "grad_norm": 1.956989049911499, + "learning_rate": 4.261513490615639e-05, + "loss": 4.7026, + "step": 42226 + }, + { + "epoch": 0.2511359311066705, + "grad_norm": 2.011707305908203, + "learning_rate": 4.261480345026445e-05, + "loss": 4.7518, + "step": 42227 + }, + { + "epoch": 0.25114187838995145, + "grad_norm": 1.9115012884140015, + "learning_rate": 4.261447198822337e-05, + "loss": 4.6957, + "step": 42228 + }, + { + "epoch": 0.25114782567323246, + "grad_norm": 1.8363462686538696, + "learning_rate": 4.2614140520033254e-05, + "loss": 4.6248, + "step": 42229 + }, + { + "epoch": 0.2511537729565135, + "grad_norm": 1.718764066696167, + "learning_rate": 4.261380904569423e-05, + "loss": 4.7508, + "step": 42230 + }, + { + "epoch": 0.25115972023979444, + "grad_norm": 1.7057690620422363, + "learning_rate": 4.26134775652064e-05, + "loss": 4.6195, + "step": 42231 + }, + { + "epoch": 0.25116566752307545, + "grad_norm": 1.7434269189834595, + "learning_rate": 4.261314607856989e-05, + "loss": 4.3864, + "step": 42232 + }, + { + "epoch": 0.25117161480635647, + "grad_norm": 1.8253660202026367, + "learning_rate": 4.261281458578481e-05, + "loss": 4.5725, + "step": 42233 + }, + { + "epoch": 0.25117756208963743, + "grad_norm": 2.0239713191986084, + "learning_rate": 4.261248308685129e-05, + "loss": 4.6443, + "step": 42234 + }, + { + "epoch": 0.25118350937291845, + "grad_norm": 1.7808157205581665, + "learning_rate": 4.261215158176942e-05, + "loss": 4.31, + "step": 42235 + }, + { + "epoch": 0.25118945665619946, + "grad_norm": 1.7399829626083374, + "learning_rate": 4.261182007053933e-05, + "loss": 5.0601, + "step": 42236 + }, + { + "epoch": 0.2511954039394804, + "grad_norm": 1.7569636106491089, + "learning_rate": 4.2611488553161145e-05, + "loss": 4.5969, + "step": 42237 + }, + { + "epoch": 0.25120135122276144, + "grad_norm": 1.63919198513031, + "learning_rate": 4.261115702963497e-05, + "loss": 4.4188, + "step": 42238 + }, + { + "epoch": 0.25120729850604245, + "grad_norm": 1.4322408437728882, + "learning_rate": 4.2610825499960916e-05, + "loss": 4.849, + "step": 42239 + }, + { + "epoch": 0.2512132457893234, + "grad_norm": 1.7055526971817017, + "learning_rate": 4.261049396413911e-05, + "loss": 4.4286, + "step": 42240 + }, + { + "epoch": 0.25121919307260443, + "grad_norm": 1.7443459033966064, + "learning_rate": 4.261016242216966e-05, + "loss": 4.6569, + "step": 42241 + }, + { + "epoch": 0.25122514035588545, + "grad_norm": 2.401498556137085, + "learning_rate": 4.2609830874052684e-05, + "loss": 4.1224, + "step": 42242 + }, + { + "epoch": 0.2512310876391664, + "grad_norm": 2.233206272125244, + "learning_rate": 4.2609499319788294e-05, + "loss": 3.9756, + "step": 42243 + }, + { + "epoch": 0.2512370349224474, + "grad_norm": 2.2802693843841553, + "learning_rate": 4.260916775937662e-05, + "loss": 4.2114, + "step": 42244 + }, + { + "epoch": 0.25124298220572844, + "grad_norm": 1.7859820127487183, + "learning_rate": 4.2608836192817756e-05, + "loss": 5.6588, + "step": 42245 + }, + { + "epoch": 0.2512489294890094, + "grad_norm": 2.029710531234741, + "learning_rate": 4.260850462011183e-05, + "loss": 5.785, + "step": 42246 + }, + { + "epoch": 0.2512548767722904, + "grad_norm": 1.708290696144104, + "learning_rate": 4.260817304125897e-05, + "loss": 5.0338, + "step": 42247 + }, + { + "epoch": 0.25126082405557143, + "grad_norm": 2.1795923709869385, + "learning_rate": 4.2607841456259265e-05, + "loss": 3.7574, + "step": 42248 + }, + { + "epoch": 0.2512667713388524, + "grad_norm": 2.0711874961853027, + "learning_rate": 4.260750986511285e-05, + "loss": 3.7574, + "step": 42249 + }, + { + "epoch": 0.2512727186221334, + "grad_norm": 2.09904408454895, + "learning_rate": 4.2607178267819845e-05, + "loss": 4.0686, + "step": 42250 + }, + { + "epoch": 0.2512786659054144, + "grad_norm": 1.7023870944976807, + "learning_rate": 4.260684666438034e-05, + "loss": 4.3634, + "step": 42251 + }, + { + "epoch": 0.2512846131886954, + "grad_norm": 1.4160494804382324, + "learning_rate": 4.2606515054794474e-05, + "loss": 4.7699, + "step": 42252 + }, + { + "epoch": 0.2512905604719764, + "grad_norm": 1.5604162216186523, + "learning_rate": 4.260618343906235e-05, + "loss": 4.8067, + "step": 42253 + }, + { + "epoch": 0.2512965077552574, + "grad_norm": 1.7139579057693481, + "learning_rate": 4.26058518171841e-05, + "loss": 5.025, + "step": 42254 + }, + { + "epoch": 0.2513024550385384, + "grad_norm": 1.576891303062439, + "learning_rate": 4.2605520189159817e-05, + "loss": 5.3247, + "step": 42255 + }, + { + "epoch": 0.2513084023218194, + "grad_norm": 1.5426198244094849, + "learning_rate": 4.260518855498964e-05, + "loss": 4.2285, + "step": 42256 + }, + { + "epoch": 0.2513143496051004, + "grad_norm": 1.4877163171768188, + "learning_rate": 4.260485691467366e-05, + "loss": 3.7871, + "step": 42257 + }, + { + "epoch": 0.25132029688838137, + "grad_norm": 1.3813968896865845, + "learning_rate": 4.260452526821202e-05, + "loss": 4.5351, + "step": 42258 + }, + { + "epoch": 0.2513262441716624, + "grad_norm": 1.4487627744674683, + "learning_rate": 4.260419361560481e-05, + "loss": 3.9632, + "step": 42259 + }, + { + "epoch": 0.2513321914549434, + "grad_norm": 1.4642643928527832, + "learning_rate": 4.260386195685218e-05, + "loss": 3.9356, + "step": 42260 + }, + { + "epoch": 0.25133813873822436, + "grad_norm": 1.4104883670806885, + "learning_rate": 4.26035302919542e-05, + "loss": 3.7553, + "step": 42261 + }, + { + "epoch": 0.2513440860215054, + "grad_norm": 1.447296142578125, + "learning_rate": 4.2603198620911024e-05, + "loss": 3.7585, + "step": 42262 + }, + { + "epoch": 0.2513500333047864, + "grad_norm": 1.3973095417022705, + "learning_rate": 4.260286694372274e-05, + "loss": 4.074, + "step": 42263 + }, + { + "epoch": 0.25135598058806735, + "grad_norm": 1.4217946529388428, + "learning_rate": 4.260253526038949e-05, + "loss": 3.8131, + "step": 42264 + }, + { + "epoch": 0.25136192787134837, + "grad_norm": 1.4482734203338623, + "learning_rate": 4.260220357091137e-05, + "loss": 3.9344, + "step": 42265 + }, + { + "epoch": 0.2513678751546294, + "grad_norm": 1.4450335502624512, + "learning_rate": 4.26018718752885e-05, + "loss": 3.8789, + "step": 42266 + }, + { + "epoch": 0.25137382243791034, + "grad_norm": 1.5608181953430176, + "learning_rate": 4.2601540173521e-05, + "loss": 3.6978, + "step": 42267 + }, + { + "epoch": 0.25137976972119136, + "grad_norm": 1.2641302347183228, + "learning_rate": 4.2601208465608986e-05, + "loss": 3.9232, + "step": 42268 + }, + { + "epoch": 0.2513857170044724, + "grad_norm": 1.3807915449142456, + "learning_rate": 4.260087675155257e-05, + "loss": 3.9125, + "step": 42269 + }, + { + "epoch": 0.25139166428775334, + "grad_norm": 1.414483666419983, + "learning_rate": 4.260054503135187e-05, + "loss": 3.656, + "step": 42270 + }, + { + "epoch": 0.25139761157103435, + "grad_norm": 1.485327124595642, + "learning_rate": 4.260021330500701e-05, + "loss": 3.8683, + "step": 42271 + }, + { + "epoch": 0.25140355885431537, + "grad_norm": 1.4255670309066772, + "learning_rate": 4.259988157251809e-05, + "loss": 3.8991, + "step": 42272 + }, + { + "epoch": 0.25140950613759633, + "grad_norm": 1.3821054697036743, + "learning_rate": 4.259954983388523e-05, + "loss": 3.7831, + "step": 42273 + }, + { + "epoch": 0.25141545342087734, + "grad_norm": 1.5175960063934326, + "learning_rate": 4.259921808910855e-05, + "loss": 3.857, + "step": 42274 + }, + { + "epoch": 0.25142140070415836, + "grad_norm": 1.4302997589111328, + "learning_rate": 4.259888633818817e-05, + "loss": 3.8767, + "step": 42275 + }, + { + "epoch": 0.2514273479874393, + "grad_norm": 1.502303123474121, + "learning_rate": 4.2598554581124196e-05, + "loss": 3.9402, + "step": 42276 + }, + { + "epoch": 0.25143329527072034, + "grad_norm": 1.441290259361267, + "learning_rate": 4.2598222817916754e-05, + "loss": 3.76, + "step": 42277 + }, + { + "epoch": 0.25143924255400135, + "grad_norm": 1.3729798793792725, + "learning_rate": 4.2597891048565945e-05, + "loss": 3.8809, + "step": 42278 + }, + { + "epoch": 0.2514451898372823, + "grad_norm": 1.3272294998168945, + "learning_rate": 4.2597559273071904e-05, + "loss": 3.7201, + "step": 42279 + }, + { + "epoch": 0.25145113712056333, + "grad_norm": 1.3483726978302002, + "learning_rate": 4.259722749143473e-05, + "loss": 3.9442, + "step": 42280 + }, + { + "epoch": 0.25145708440384434, + "grad_norm": 1.268142580986023, + "learning_rate": 4.259689570365455e-05, + "loss": 4.0606, + "step": 42281 + }, + { + "epoch": 0.2514630316871253, + "grad_norm": 1.3267277479171753, + "learning_rate": 4.2596563909731475e-05, + "loss": 4.9879, + "step": 42282 + }, + { + "epoch": 0.2514689789704063, + "grad_norm": 1.4602577686309814, + "learning_rate": 4.259623210966562e-05, + "loss": 4.9398, + "step": 42283 + }, + { + "epoch": 0.25147492625368734, + "grad_norm": 1.8150848150253296, + "learning_rate": 4.2595900303457105e-05, + "loss": 5.1686, + "step": 42284 + }, + { + "epoch": 0.2514808735369683, + "grad_norm": 1.194347858428955, + "learning_rate": 4.259556849110604e-05, + "loss": 4.7966, + "step": 42285 + }, + { + "epoch": 0.2514868208202493, + "grad_norm": 1.2956757545471191, + "learning_rate": 4.259523667261254e-05, + "loss": 4.6158, + "step": 42286 + }, + { + "epoch": 0.25149276810353033, + "grad_norm": 1.3853676319122314, + "learning_rate": 4.259490484797673e-05, + "loss": 4.8407, + "step": 42287 + }, + { + "epoch": 0.2514987153868113, + "grad_norm": 1.5076686143875122, + "learning_rate": 4.259457301719871e-05, + "loss": 4.9458, + "step": 42288 + }, + { + "epoch": 0.2515046626700923, + "grad_norm": 1.39446222782135, + "learning_rate": 4.259424118027863e-05, + "loss": 4.6302, + "step": 42289 + }, + { + "epoch": 0.2515106099533733, + "grad_norm": 1.7547638416290283, + "learning_rate": 4.2593909337216566e-05, + "loss": 5.2932, + "step": 42290 + }, + { + "epoch": 0.2515165572366543, + "grad_norm": 1.5107359886169434, + "learning_rate": 4.259357748801265e-05, + "loss": 5.545, + "step": 42291 + }, + { + "epoch": 0.2515225045199353, + "grad_norm": 2.244978427886963, + "learning_rate": 4.2593245632667e-05, + "loss": 4.0349, + "step": 42292 + }, + { + "epoch": 0.2515284518032163, + "grad_norm": 2.604090690612793, + "learning_rate": 4.259291377117973e-05, + "loss": 4.1475, + "step": 42293 + }, + { + "epoch": 0.2515343990864973, + "grad_norm": 2.3909788131713867, + "learning_rate": 4.259258190355095e-05, + "loss": 3.8321, + "step": 42294 + }, + { + "epoch": 0.2515403463697783, + "grad_norm": 2.13722825050354, + "learning_rate": 4.259225002978079e-05, + "loss": 3.7263, + "step": 42295 + }, + { + "epoch": 0.2515462936530593, + "grad_norm": 1.9325134754180908, + "learning_rate": 4.2591918149869345e-05, + "loss": 3.5505, + "step": 42296 + }, + { + "epoch": 0.25155224093634027, + "grad_norm": 2.2680022716522217, + "learning_rate": 4.2591586263816755e-05, + "loss": 3.6316, + "step": 42297 + }, + { + "epoch": 0.2515581882196213, + "grad_norm": 2.8128905296325684, + "learning_rate": 4.259125437162312e-05, + "loss": 3.9502, + "step": 42298 + }, + { + "epoch": 0.2515641355029023, + "grad_norm": 2.6826255321502686, + "learning_rate": 4.259092247328856e-05, + "loss": 3.7742, + "step": 42299 + }, + { + "epoch": 0.25157008278618326, + "grad_norm": 2.1351256370544434, + "learning_rate": 4.2590590568813196e-05, + "loss": 3.5036, + "step": 42300 + }, + { + "epoch": 0.2515760300694643, + "grad_norm": 2.5705363750457764, + "learning_rate": 4.259025865819713e-05, + "loss": 3.5843, + "step": 42301 + }, + { + "epoch": 0.2515819773527453, + "grad_norm": 2.2728681564331055, + "learning_rate": 4.2589926741440487e-05, + "loss": 3.194, + "step": 42302 + }, + { + "epoch": 0.25158792463602625, + "grad_norm": 2.8511147499084473, + "learning_rate": 4.258959481854339e-05, + "loss": 3.1104, + "step": 42303 + }, + { + "epoch": 0.25159387191930727, + "grad_norm": 2.645820140838623, + "learning_rate": 4.258926288950594e-05, + "loss": 3.62, + "step": 42304 + }, + { + "epoch": 0.2515998192025883, + "grad_norm": 1.9722230434417725, + "learning_rate": 4.258893095432827e-05, + "loss": 4.011, + "step": 42305 + }, + { + "epoch": 0.25160576648586924, + "grad_norm": 1.966966152191162, + "learning_rate": 4.258859901301048e-05, + "loss": 4.0235, + "step": 42306 + }, + { + "epoch": 0.25161171376915026, + "grad_norm": 2.7009634971618652, + "learning_rate": 4.258826706555268e-05, + "loss": 3.9512, + "step": 42307 + }, + { + "epoch": 0.2516176610524313, + "grad_norm": 1.726420283317566, + "learning_rate": 4.2587935111955016e-05, + "loss": 5.176, + "step": 42308 + }, + { + "epoch": 0.25162360833571223, + "grad_norm": 1.6425206661224365, + "learning_rate": 4.258760315221757e-05, + "loss": 5.1907, + "step": 42309 + }, + { + "epoch": 0.25162955561899325, + "grad_norm": 1.7863664627075195, + "learning_rate": 4.2587271186340484e-05, + "loss": 5.0012, + "step": 42310 + }, + { + "epoch": 0.25163550290227427, + "grad_norm": 1.7763088941574097, + "learning_rate": 4.258693921432386e-05, + "loss": 4.8241, + "step": 42311 + }, + { + "epoch": 0.2516414501855552, + "grad_norm": 1.7003499269485474, + "learning_rate": 4.258660723616782e-05, + "loss": 4.6754, + "step": 42312 + }, + { + "epoch": 0.25164739746883624, + "grad_norm": 1.672153115272522, + "learning_rate": 4.258627525187248e-05, + "loss": 4.6459, + "step": 42313 + }, + { + "epoch": 0.25165334475211726, + "grad_norm": 1.6353453397750854, + "learning_rate": 4.2585943261437945e-05, + "loss": 4.8672, + "step": 42314 + }, + { + "epoch": 0.2516592920353982, + "grad_norm": 1.47422456741333, + "learning_rate": 4.2585611264864334e-05, + "loss": 5.1915, + "step": 42315 + }, + { + "epoch": 0.25166523931867923, + "grad_norm": 1.586459994316101, + "learning_rate": 4.258527926215178e-05, + "loss": 5.2223, + "step": 42316 + }, + { + "epoch": 0.25167118660196025, + "grad_norm": 1.6699520349502563, + "learning_rate": 4.258494725330038e-05, + "loss": 4.3497, + "step": 42317 + }, + { + "epoch": 0.2516771338852412, + "grad_norm": 1.61715567111969, + "learning_rate": 4.258461523831026e-05, + "loss": 4.8146, + "step": 42318 + }, + { + "epoch": 0.2516830811685222, + "grad_norm": 1.6011897325515747, + "learning_rate": 4.258428321718153e-05, + "loss": 5.3881, + "step": 42319 + }, + { + "epoch": 0.25168902845180324, + "grad_norm": 1.6565579175949097, + "learning_rate": 4.258395118991431e-05, + "loss": 5.2416, + "step": 42320 + }, + { + "epoch": 0.2516949757350842, + "grad_norm": 1.497970461845398, + "learning_rate": 4.258361915650871e-05, + "loss": 5.0704, + "step": 42321 + }, + { + "epoch": 0.2517009230183652, + "grad_norm": 1.5135687589645386, + "learning_rate": 4.258328711696486e-05, + "loss": 5.0225, + "step": 42322 + }, + { + "epoch": 0.25170687030164623, + "grad_norm": 1.5391225814819336, + "learning_rate": 4.2582955071282856e-05, + "loss": 4.8095, + "step": 42323 + }, + { + "epoch": 0.2517128175849272, + "grad_norm": 2.266392230987549, + "learning_rate": 4.258262301946283e-05, + "loss": 3.5784, + "step": 42324 + }, + { + "epoch": 0.2517187648682082, + "grad_norm": 2.237560749053955, + "learning_rate": 4.258229096150489e-05, + "loss": 3.4793, + "step": 42325 + }, + { + "epoch": 0.2517247121514892, + "grad_norm": 2.3638131618499756, + "learning_rate": 4.258195889740916e-05, + "loss": 3.2846, + "step": 42326 + }, + { + "epoch": 0.2517306594347702, + "grad_norm": 1.9989261627197266, + "learning_rate": 4.2581626827175734e-05, + "loss": 4.0283, + "step": 42327 + }, + { + "epoch": 0.2517366067180512, + "grad_norm": 1.6162580251693726, + "learning_rate": 4.2581294750804755e-05, + "loss": 5.0362, + "step": 42328 + }, + { + "epoch": 0.25174255400133216, + "grad_norm": 1.9527994394302368, + "learning_rate": 4.258096266829633e-05, + "loss": 3.6603, + "step": 42329 + }, + { + "epoch": 0.2517485012846132, + "grad_norm": 2.0064406394958496, + "learning_rate": 4.258063057965057e-05, + "loss": 3.5238, + "step": 42330 + }, + { + "epoch": 0.2517544485678942, + "grad_norm": 1.9845999479293823, + "learning_rate": 4.258029848486759e-05, + "loss": 3.4397, + "step": 42331 + }, + { + "epoch": 0.25176039585117516, + "grad_norm": 1.7945690155029297, + "learning_rate": 4.257996638394751e-05, + "loss": 3.5046, + "step": 42332 + }, + { + "epoch": 0.25176634313445617, + "grad_norm": 1.7447595596313477, + "learning_rate": 4.257963427689046e-05, + "loss": 4.9361, + "step": 42333 + }, + { + "epoch": 0.2517722904177372, + "grad_norm": 1.589492917060852, + "learning_rate": 4.257930216369652e-05, + "loss": 5.1128, + "step": 42334 + }, + { + "epoch": 0.25177823770101815, + "grad_norm": 1.4836667776107788, + "learning_rate": 4.2578970044365835e-05, + "loss": 4.953, + "step": 42335 + }, + { + "epoch": 0.25178418498429916, + "grad_norm": 1.458371877670288, + "learning_rate": 4.2578637918898516e-05, + "loss": 5.0778, + "step": 42336 + }, + { + "epoch": 0.2517901322675802, + "grad_norm": 1.4877443313598633, + "learning_rate": 4.257830578729467e-05, + "loss": 4.9147, + "step": 42337 + }, + { + "epoch": 0.25179607955086114, + "grad_norm": 1.3295477628707886, + "learning_rate": 4.257797364955442e-05, + "loss": 4.9695, + "step": 42338 + }, + { + "epoch": 0.25180202683414216, + "grad_norm": 1.2827715873718262, + "learning_rate": 4.257764150567789e-05, + "loss": 5.0666, + "step": 42339 + }, + { + "epoch": 0.2518079741174232, + "grad_norm": 1.7956442832946777, + "learning_rate": 4.257730935566518e-05, + "loss": 4.2923, + "step": 42340 + }, + { + "epoch": 0.25181392140070413, + "grad_norm": 2.2688310146331787, + "learning_rate": 4.257697719951641e-05, + "loss": 3.2734, + "step": 42341 + }, + { + "epoch": 0.25181986868398515, + "grad_norm": 2.436635971069336, + "learning_rate": 4.257664503723171e-05, + "loss": 3.4666, + "step": 42342 + }, + { + "epoch": 0.25182581596726616, + "grad_norm": 1.9986000061035156, + "learning_rate": 4.2576312868811174e-05, + "loss": 3.9229, + "step": 42343 + }, + { + "epoch": 0.2518317632505471, + "grad_norm": 1.5124166011810303, + "learning_rate": 4.2575980694254935e-05, + "loss": 4.9557, + "step": 42344 + }, + { + "epoch": 0.25183771053382814, + "grad_norm": 1.8567988872528076, + "learning_rate": 4.25756485135631e-05, + "loss": 4.1164, + "step": 42345 + }, + { + "epoch": 0.25184365781710916, + "grad_norm": 1.8096282482147217, + "learning_rate": 4.2575316326735786e-05, + "loss": 4.7459, + "step": 42346 + }, + { + "epoch": 0.2518496051003901, + "grad_norm": 2.2504680156707764, + "learning_rate": 4.257498413377311e-05, + "loss": 3.2659, + "step": 42347 + }, + { + "epoch": 0.25185555238367113, + "grad_norm": 2.292250156402588, + "learning_rate": 4.257465193467519e-05, + "loss": 3.2556, + "step": 42348 + }, + { + "epoch": 0.25186149966695215, + "grad_norm": 2.154960870742798, + "learning_rate": 4.257431972944215e-05, + "loss": 3.2162, + "step": 42349 + }, + { + "epoch": 0.2518674469502331, + "grad_norm": 2.0289480686187744, + "learning_rate": 4.257398751807408e-05, + "loss": 3.3831, + "step": 42350 + }, + { + "epoch": 0.2518733942335141, + "grad_norm": 2.337229013442993, + "learning_rate": 4.2573655300571124e-05, + "loss": 3.3659, + "step": 42351 + }, + { + "epoch": 0.25187934151679514, + "grad_norm": 2.126600503921509, + "learning_rate": 4.257332307693338e-05, + "loss": 4.5654, + "step": 42352 + }, + { + "epoch": 0.2518852888000761, + "grad_norm": 2.2384402751922607, + "learning_rate": 4.257299084716098e-05, + "loss": 3.7087, + "step": 42353 + }, + { + "epoch": 0.2518912360833571, + "grad_norm": 1.6807070970535278, + "learning_rate": 4.2572658611254027e-05, + "loss": 5.1142, + "step": 42354 + }, + { + "epoch": 0.25189718336663813, + "grad_norm": 1.499300241470337, + "learning_rate": 4.257232636921263e-05, + "loss": 4.9737, + "step": 42355 + }, + { + "epoch": 0.2519031306499191, + "grad_norm": 2.5552096366882324, + "learning_rate": 4.257199412103692e-05, + "loss": 3.2399, + "step": 42356 + }, + { + "epoch": 0.2519090779332001, + "grad_norm": 2.461867332458496, + "learning_rate": 4.2571661866727017e-05, + "loss": 2.9925, + "step": 42357 + }, + { + "epoch": 0.2519150252164811, + "grad_norm": 2.3984034061431885, + "learning_rate": 4.257132960628302e-05, + "loss": 2.8908, + "step": 42358 + }, + { + "epoch": 0.2519209724997621, + "grad_norm": 2.6801137924194336, + "learning_rate": 4.257099733970505e-05, + "loss": 2.8465, + "step": 42359 + }, + { + "epoch": 0.2519269197830431, + "grad_norm": 2.9234890937805176, + "learning_rate": 4.257066506699324e-05, + "loss": 3.0245, + "step": 42360 + }, + { + "epoch": 0.2519328670663241, + "grad_norm": 2.592780351638794, + "learning_rate": 4.257033278814768e-05, + "loss": 2.9193, + "step": 42361 + }, + { + "epoch": 0.2519388143496051, + "grad_norm": 2.276496648788452, + "learning_rate": 4.2570000503168505e-05, + "loss": 3.047, + "step": 42362 + }, + { + "epoch": 0.2519447616328861, + "grad_norm": 2.5146477222442627, + "learning_rate": 4.256966821205582e-05, + "loss": 3.0099, + "step": 42363 + }, + { + "epoch": 0.2519507089161671, + "grad_norm": 1.896694540977478, + "learning_rate": 4.256933591480975e-05, + "loss": 3.5191, + "step": 42364 + }, + { + "epoch": 0.25195665619944807, + "grad_norm": 2.476254463195801, + "learning_rate": 4.25690036114304e-05, + "loss": 3.2063, + "step": 42365 + }, + { + "epoch": 0.2519626034827291, + "grad_norm": 1.825239896774292, + "learning_rate": 4.25686713019179e-05, + "loss": 3.6603, + "step": 42366 + }, + { + "epoch": 0.2519685507660101, + "grad_norm": 2.0071303844451904, + "learning_rate": 4.256833898627236e-05, + "loss": 3.3556, + "step": 42367 + }, + { + "epoch": 0.25197449804929106, + "grad_norm": 1.9497332572937012, + "learning_rate": 4.2568006664493884e-05, + "loss": 3.3209, + "step": 42368 + }, + { + "epoch": 0.2519804453325721, + "grad_norm": 1.826940894126892, + "learning_rate": 4.256767433658261e-05, + "loss": 3.2457, + "step": 42369 + }, + { + "epoch": 0.2519863926158531, + "grad_norm": 1.7894257307052612, + "learning_rate": 4.256734200253863e-05, + "loss": 3.2811, + "step": 42370 + }, + { + "epoch": 0.25199233989913405, + "grad_norm": 1.893983244895935, + "learning_rate": 4.256700966236208e-05, + "loss": 2.8749, + "step": 42371 + }, + { + "epoch": 0.25199828718241507, + "grad_norm": 1.7583115100860596, + "learning_rate": 4.256667731605307e-05, + "loss": 3.1035, + "step": 42372 + }, + { + "epoch": 0.2520042344656961, + "grad_norm": 2.0725173950195312, + "learning_rate": 4.2566344963611704e-05, + "loss": 3.1571, + "step": 42373 + }, + { + "epoch": 0.25201018174897705, + "grad_norm": 1.9925744533538818, + "learning_rate": 4.256601260503812e-05, + "loss": 2.9709, + "step": 42374 + }, + { + "epoch": 0.25201612903225806, + "grad_norm": 2.0076467990875244, + "learning_rate": 4.2565680240332416e-05, + "loss": 3.2379, + "step": 42375 + }, + { + "epoch": 0.2520220763155391, + "grad_norm": 1.9381129741668701, + "learning_rate": 4.256534786949472e-05, + "loss": 4.0664, + "step": 42376 + }, + { + "epoch": 0.25202802359882004, + "grad_norm": 2.2271857261657715, + "learning_rate": 4.256501549252513e-05, + "loss": 4.2355, + "step": 42377 + }, + { + "epoch": 0.25203397088210105, + "grad_norm": 1.8720593452453613, + "learning_rate": 4.256468310942379e-05, + "loss": 4.1503, + "step": 42378 + }, + { + "epoch": 0.25203991816538207, + "grad_norm": 1.7730683088302612, + "learning_rate": 4.256435072019079e-05, + "loss": 4.6919, + "step": 42379 + }, + { + "epoch": 0.25204586544866303, + "grad_norm": 2.6914446353912354, + "learning_rate": 4.256401832482626e-05, + "loss": 4.4065, + "step": 42380 + }, + { + "epoch": 0.25205181273194405, + "grad_norm": 1.845653772354126, + "learning_rate": 4.256368592333032e-05, + "loss": 4.7895, + "step": 42381 + }, + { + "epoch": 0.25205776001522506, + "grad_norm": 1.5524494647979736, + "learning_rate": 4.256335351570307e-05, + "loss": 4.6486, + "step": 42382 + }, + { + "epoch": 0.252063707298506, + "grad_norm": 1.849144458770752, + "learning_rate": 4.256302110194463e-05, + "loss": 4.7298, + "step": 42383 + }, + { + "epoch": 0.25206965458178704, + "grad_norm": 1.5291454792022705, + "learning_rate": 4.256268868205513e-05, + "loss": 4.6848, + "step": 42384 + }, + { + "epoch": 0.25207560186506806, + "grad_norm": 1.5987776517868042, + "learning_rate": 4.256235625603468e-05, + "loss": 4.6988, + "step": 42385 + }, + { + "epoch": 0.252081549148349, + "grad_norm": 2.7424044609069824, + "learning_rate": 4.256202382388339e-05, + "loss": 3.3129, + "step": 42386 + }, + { + "epoch": 0.25208749643163003, + "grad_norm": 3.104961395263672, + "learning_rate": 4.256169138560138e-05, + "loss": 3.6556, + "step": 42387 + }, + { + "epoch": 0.25209344371491105, + "grad_norm": 1.573859691619873, + "learning_rate": 4.256135894118876e-05, + "loss": 5.044, + "step": 42388 + }, + { + "epoch": 0.252099390998192, + "grad_norm": 1.5963214635849, + "learning_rate": 4.256102649064564e-05, + "loss": 4.9279, + "step": 42389 + }, + { + "epoch": 0.252105338281473, + "grad_norm": 2.802353858947754, + "learning_rate": 4.256069403397217e-05, + "loss": 3.1048, + "step": 42390 + }, + { + "epoch": 0.25211128556475404, + "grad_norm": 2.601747989654541, + "learning_rate": 4.2560361571168424e-05, + "loss": 2.8991, + "step": 42391 + }, + { + "epoch": 0.252117232848035, + "grad_norm": 2.742805004119873, + "learning_rate": 4.256002910223455e-05, + "loss": 2.7119, + "step": 42392 + }, + { + "epoch": 0.252123180131316, + "grad_norm": 1.951058268547058, + "learning_rate": 4.2559696627170646e-05, + "loss": 3.8559, + "step": 42393 + }, + { + "epoch": 0.25212912741459703, + "grad_norm": 2.280738353729248, + "learning_rate": 4.255936414597684e-05, + "loss": 3.6677, + "step": 42394 + }, + { + "epoch": 0.252135074697878, + "grad_norm": 2.843604803085327, + "learning_rate": 4.255903165865323e-05, + "loss": 2.589, + "step": 42395 + }, + { + "epoch": 0.252141021981159, + "grad_norm": 2.6617918014526367, + "learning_rate": 4.255869916519995e-05, + "loss": 2.9321, + "step": 42396 + }, + { + "epoch": 0.25214696926444, + "grad_norm": 1.456131100654602, + "learning_rate": 4.2558366665617114e-05, + "loss": 4.6274, + "step": 42397 + }, + { + "epoch": 0.252152916547721, + "grad_norm": 1.4423390626907349, + "learning_rate": 4.2558034159904826e-05, + "loss": 4.4176, + "step": 42398 + }, + { + "epoch": 0.252158863831002, + "grad_norm": 1.3807919025421143, + "learning_rate": 4.255770164806321e-05, + "loss": 4.3683, + "step": 42399 + }, + { + "epoch": 0.252164811114283, + "grad_norm": 1.9001542329788208, + "learning_rate": 4.2557369130092384e-05, + "loss": 4.8731, + "step": 42400 + }, + { + "epoch": 0.252170758397564, + "grad_norm": 1.7233256101608276, + "learning_rate": 4.255703660599246e-05, + "loss": 5.0439, + "step": 42401 + }, + { + "epoch": 0.252176705680845, + "grad_norm": 1.6880770921707153, + "learning_rate": 4.255670407576356e-05, + "loss": 5.047, + "step": 42402 + }, + { + "epoch": 0.252182652964126, + "grad_norm": 1.6298986673355103, + "learning_rate": 4.255637153940579e-05, + "loss": 4.7798, + "step": 42403 + }, + { + "epoch": 0.25218860024740697, + "grad_norm": 1.5940220355987549, + "learning_rate": 4.255603899691928e-05, + "loss": 5.0696, + "step": 42404 + }, + { + "epoch": 0.252194547530688, + "grad_norm": 1.5237146615982056, + "learning_rate": 4.2555706448304133e-05, + "loss": 4.4781, + "step": 42405 + }, + { + "epoch": 0.252200494813969, + "grad_norm": 1.5851911306381226, + "learning_rate": 4.255537389356047e-05, + "loss": 4.8684, + "step": 42406 + }, + { + "epoch": 0.25220644209724996, + "grad_norm": 1.3572262525558472, + "learning_rate": 4.2555041332688406e-05, + "loss": 5.1869, + "step": 42407 + }, + { + "epoch": 0.252212389380531, + "grad_norm": 1.6163116693496704, + "learning_rate": 4.2554708765688067e-05, + "loss": 4.9661, + "step": 42408 + }, + { + "epoch": 0.252218336663812, + "grad_norm": 1.5014004707336426, + "learning_rate": 4.255437619255955e-05, + "loss": 5.1834, + "step": 42409 + }, + { + "epoch": 0.25222428394709295, + "grad_norm": 1.544518232345581, + "learning_rate": 4.2554043613302985e-05, + "loss": 5.2196, + "step": 42410 + }, + { + "epoch": 0.25223023123037397, + "grad_norm": 1.4306154251098633, + "learning_rate": 4.255371102791849e-05, + "loss": 5.4943, + "step": 42411 + }, + { + "epoch": 0.252236178513655, + "grad_norm": 1.4807231426239014, + "learning_rate": 4.255337843640617e-05, + "loss": 4.6582, + "step": 42412 + }, + { + "epoch": 0.25224212579693595, + "grad_norm": 1.8054877519607544, + "learning_rate": 4.255304583876615e-05, + "loss": 4.2748, + "step": 42413 + }, + { + "epoch": 0.25224807308021696, + "grad_norm": 1.4608259201049805, + "learning_rate": 4.255271323499854e-05, + "loss": 5.12, + "step": 42414 + }, + { + "epoch": 0.252254020363498, + "grad_norm": 1.4595166444778442, + "learning_rate": 4.255238062510346e-05, + "loss": 5.3055, + "step": 42415 + }, + { + "epoch": 0.25225996764677894, + "grad_norm": 1.573690414428711, + "learning_rate": 4.2552048009081026e-05, + "loss": 5.2568, + "step": 42416 + }, + { + "epoch": 0.25226591493005995, + "grad_norm": 2.601102828979492, + "learning_rate": 4.2551715386931354e-05, + "loss": 3.7512, + "step": 42417 + }, + { + "epoch": 0.25227186221334097, + "grad_norm": 1.918142557144165, + "learning_rate": 4.255138275865456e-05, + "loss": 4.9331, + "step": 42418 + }, + { + "epoch": 0.25227780949662193, + "grad_norm": 1.4744904041290283, + "learning_rate": 4.255105012425076e-05, + "loss": 5.5836, + "step": 42419 + }, + { + "epoch": 0.25228375677990295, + "grad_norm": 1.853691816329956, + "learning_rate": 4.255071748372007e-05, + "loss": 4.2478, + "step": 42420 + }, + { + "epoch": 0.25228970406318396, + "grad_norm": 2.372061252593994, + "learning_rate": 4.25503848370626e-05, + "loss": 3.4274, + "step": 42421 + }, + { + "epoch": 0.2522956513464649, + "grad_norm": 2.235891580581665, + "learning_rate": 4.2550052184278475e-05, + "loss": 3.6335, + "step": 42422 + }, + { + "epoch": 0.25230159862974594, + "grad_norm": 1.547494888305664, + "learning_rate": 4.254971952536782e-05, + "loss": 5.1368, + "step": 42423 + }, + { + "epoch": 0.25230754591302695, + "grad_norm": 1.5865620374679565, + "learning_rate": 4.254938686033072e-05, + "loss": 4.6857, + "step": 42424 + }, + { + "epoch": 0.2523134931963079, + "grad_norm": 1.6528912782669067, + "learning_rate": 4.2549054189167316e-05, + "loss": 5.085, + "step": 42425 + }, + { + "epoch": 0.25231944047958893, + "grad_norm": 1.4850986003875732, + "learning_rate": 4.254872151187772e-05, + "loss": 4.7344, + "step": 42426 + }, + { + "epoch": 0.25232538776286995, + "grad_norm": 1.3711305856704712, + "learning_rate": 4.254838882846205e-05, + "loss": 4.9147, + "step": 42427 + }, + { + "epoch": 0.2523313350461509, + "grad_norm": 1.8215047121047974, + "learning_rate": 4.2548056138920417e-05, + "loss": 4.4929, + "step": 42428 + }, + { + "epoch": 0.2523372823294319, + "grad_norm": 2.1080965995788574, + "learning_rate": 4.254772344325294e-05, + "loss": 4.4021, + "step": 42429 + }, + { + "epoch": 0.25234322961271294, + "grad_norm": 1.871265172958374, + "learning_rate": 4.254739074145973e-05, + "loss": 4.5034, + "step": 42430 + }, + { + "epoch": 0.2523491768959939, + "grad_norm": 1.9266678094863892, + "learning_rate": 4.254705803354091e-05, + "loss": 4.4875, + "step": 42431 + }, + { + "epoch": 0.2523551241792749, + "grad_norm": 2.7231884002685547, + "learning_rate": 4.254672531949659e-05, + "loss": 3.5947, + "step": 42432 + }, + { + "epoch": 0.25236107146255593, + "grad_norm": 2.3819639682769775, + "learning_rate": 4.25463925993269e-05, + "loss": 3.5379, + "step": 42433 + }, + { + "epoch": 0.2523670187458369, + "grad_norm": 2.0479509830474854, + "learning_rate": 4.254605987303193e-05, + "loss": 3.4373, + "step": 42434 + }, + { + "epoch": 0.2523729660291179, + "grad_norm": 1.9716160297393799, + "learning_rate": 4.254572714061182e-05, + "loss": 3.6241, + "step": 42435 + }, + { + "epoch": 0.2523789133123989, + "grad_norm": 1.8995682001113892, + "learning_rate": 4.254539440206668e-05, + "loss": 3.4088, + "step": 42436 + }, + { + "epoch": 0.2523848605956799, + "grad_norm": 2.1230528354644775, + "learning_rate": 4.254506165739662e-05, + "loss": 3.3049, + "step": 42437 + }, + { + "epoch": 0.2523908078789609, + "grad_norm": 1.9821257591247559, + "learning_rate": 4.254472890660177e-05, + "loss": 3.3462, + "step": 42438 + }, + { + "epoch": 0.2523967551622419, + "grad_norm": 1.9816112518310547, + "learning_rate": 4.254439614968222e-05, + "loss": 3.5349, + "step": 42439 + }, + { + "epoch": 0.2524027024455229, + "grad_norm": 2.0097389221191406, + "learning_rate": 4.254406338663811e-05, + "loss": 3.2505, + "step": 42440 + }, + { + "epoch": 0.2524086497288039, + "grad_norm": 1.8473939895629883, + "learning_rate": 4.2543730617469555e-05, + "loss": 3.3096, + "step": 42441 + }, + { + "epoch": 0.2524145970120849, + "grad_norm": 1.795224666595459, + "learning_rate": 4.254339784217666e-05, + "loss": 3.4072, + "step": 42442 + }, + { + "epoch": 0.25242054429536587, + "grad_norm": 2.621525526046753, + "learning_rate": 4.2543065060759544e-05, + "loss": 2.6127, + "step": 42443 + }, + { + "epoch": 0.2524264915786469, + "grad_norm": 2.5257089138031006, + "learning_rate": 4.2542732273218334e-05, + "loss": 2.5166, + "step": 42444 + }, + { + "epoch": 0.25243243886192784, + "grad_norm": 2.290695905685425, + "learning_rate": 4.254239947955312e-05, + "loss": 2.8484, + "step": 42445 + }, + { + "epoch": 0.25243838614520886, + "grad_norm": 2.0568196773529053, + "learning_rate": 4.254206667976405e-05, + "loss": 4.2651, + "step": 42446 + }, + { + "epoch": 0.2524443334284899, + "grad_norm": 2.104011058807373, + "learning_rate": 4.2541733873851227e-05, + "loss": 3.6843, + "step": 42447 + }, + { + "epoch": 0.25245028071177084, + "grad_norm": 2.910224437713623, + "learning_rate": 4.254140106181476e-05, + "loss": 3.3772, + "step": 42448 + }, + { + "epoch": 0.25245622799505185, + "grad_norm": 2.693826198577881, + "learning_rate": 4.254106824365477e-05, + "loss": 3.5004, + "step": 42449 + }, + { + "epoch": 0.25246217527833287, + "grad_norm": 2.637441396713257, + "learning_rate": 4.254073541937137e-05, + "loss": 3.5446, + "step": 42450 + }, + { + "epoch": 0.25246812256161383, + "grad_norm": 2.530426263809204, + "learning_rate": 4.254040258896469e-05, + "loss": 3.8279, + "step": 42451 + }, + { + "epoch": 0.25247406984489484, + "grad_norm": 2.6893649101257324, + "learning_rate": 4.2540069752434835e-05, + "loss": 3.4944, + "step": 42452 + }, + { + "epoch": 0.25248001712817586, + "grad_norm": 2.4873671531677246, + "learning_rate": 4.253973690978192e-05, + "loss": 3.41, + "step": 42453 + }, + { + "epoch": 0.2524859644114568, + "grad_norm": 2.5949559211730957, + "learning_rate": 4.253940406100606e-05, + "loss": 3.2942, + "step": 42454 + }, + { + "epoch": 0.25249191169473784, + "grad_norm": 2.3007960319519043, + "learning_rate": 4.253907120610738e-05, + "loss": 3.4055, + "step": 42455 + }, + { + "epoch": 0.25249785897801885, + "grad_norm": 2.1987087726593018, + "learning_rate": 4.2538738345086e-05, + "loss": 3.1182, + "step": 42456 + }, + { + "epoch": 0.2525038062612998, + "grad_norm": 2.2741310596466064, + "learning_rate": 4.253840547794201e-05, + "loss": 3.5611, + "step": 42457 + }, + { + "epoch": 0.25250975354458083, + "grad_norm": 2.0958199501037598, + "learning_rate": 4.253807260467556e-05, + "loss": 3.4078, + "step": 42458 + }, + { + "epoch": 0.25251570082786184, + "grad_norm": 2.5133697986602783, + "learning_rate": 4.253773972528674e-05, + "loss": 3.889, + "step": 42459 + }, + { + "epoch": 0.2525216481111428, + "grad_norm": 2.137808084487915, + "learning_rate": 4.2537406839775684e-05, + "loss": 3.3046, + "step": 42460 + }, + { + "epoch": 0.2525275953944238, + "grad_norm": 2.241925001144409, + "learning_rate": 4.2537073948142493e-05, + "loss": 3.5808, + "step": 42461 + }, + { + "epoch": 0.25253354267770484, + "grad_norm": 2.470928192138672, + "learning_rate": 4.2536741050387294e-05, + "loss": 3.454, + "step": 42462 + }, + { + "epoch": 0.2525394899609858, + "grad_norm": 2.1945853233337402, + "learning_rate": 4.25364081465102e-05, + "loss": 3.2031, + "step": 42463 + }, + { + "epoch": 0.2525454372442668, + "grad_norm": 2.241377592086792, + "learning_rate": 4.253607523651133e-05, + "loss": 2.9057, + "step": 42464 + }, + { + "epoch": 0.25255138452754783, + "grad_norm": 2.178135633468628, + "learning_rate": 4.253574232039079e-05, + "loss": 3.3497, + "step": 42465 + }, + { + "epoch": 0.2525573318108288, + "grad_norm": 1.9755403995513916, + "learning_rate": 4.253540939814871e-05, + "loss": 3.1946, + "step": 42466 + }, + { + "epoch": 0.2525632790941098, + "grad_norm": 1.7415516376495361, + "learning_rate": 4.2535076469785194e-05, + "loss": 4.8258, + "step": 42467 + }, + { + "epoch": 0.2525692263773908, + "grad_norm": 1.5476256608963013, + "learning_rate": 4.253474353530037e-05, + "loss": 4.9239, + "step": 42468 + }, + { + "epoch": 0.2525751736606718, + "grad_norm": 1.9755531549453735, + "learning_rate": 4.253441059469434e-05, + "loss": 4.1113, + "step": 42469 + }, + { + "epoch": 0.2525811209439528, + "grad_norm": 1.559035062789917, + "learning_rate": 4.253407764796724e-05, + "loss": 5.1112, + "step": 42470 + }, + { + "epoch": 0.2525870682272338, + "grad_norm": 2.7660059928894043, + "learning_rate": 4.253374469511917e-05, + "loss": 3.5828, + "step": 42471 + }, + { + "epoch": 0.2525930155105148, + "grad_norm": 1.9223004579544067, + "learning_rate": 4.253341173615025e-05, + "loss": 3.7234, + "step": 42472 + }, + { + "epoch": 0.2525989627937958, + "grad_norm": 1.4881011247634888, + "learning_rate": 4.25330787710606e-05, + "loss": 4.5227, + "step": 42473 + }, + { + "epoch": 0.2526049100770768, + "grad_norm": 1.5777751207351685, + "learning_rate": 4.253274579985033e-05, + "loss": 4.576, + "step": 42474 + }, + { + "epoch": 0.25261085736035777, + "grad_norm": 1.5549952983856201, + "learning_rate": 4.253241282251955e-05, + "loss": 4.4684, + "step": 42475 + }, + { + "epoch": 0.2526168046436388, + "grad_norm": 1.4972749948501587, + "learning_rate": 4.25320798390684e-05, + "loss": 4.8804, + "step": 42476 + }, + { + "epoch": 0.2526227519269198, + "grad_norm": 1.3800556659698486, + "learning_rate": 4.253174684949698e-05, + "loss": 4.6603, + "step": 42477 + }, + { + "epoch": 0.25262869921020076, + "grad_norm": 1.151010274887085, + "learning_rate": 4.25314138538054e-05, + "loss": 4.2479, + "step": 42478 + }, + { + "epoch": 0.2526346464934818, + "grad_norm": 1.432078242301941, + "learning_rate": 4.253108085199379e-05, + "loss": 4.6991, + "step": 42479 + }, + { + "epoch": 0.2526405937767628, + "grad_norm": 1.7505486011505127, + "learning_rate": 4.2530747844062265e-05, + "loss": 4.8565, + "step": 42480 + }, + { + "epoch": 0.25264654106004375, + "grad_norm": 1.6795767545700073, + "learning_rate": 4.2530414830010926e-05, + "loss": 4.8696, + "step": 42481 + }, + { + "epoch": 0.25265248834332477, + "grad_norm": 1.7523690462112427, + "learning_rate": 4.2530081809839916e-05, + "loss": 3.8997, + "step": 42482 + }, + { + "epoch": 0.2526584356266058, + "grad_norm": 2.191622257232666, + "learning_rate": 4.2529748783549324e-05, + "loss": 3.3839, + "step": 42483 + }, + { + "epoch": 0.25266438290988674, + "grad_norm": 2.2652409076690674, + "learning_rate": 4.252941575113928e-05, + "loss": 3.3621, + "step": 42484 + }, + { + "epoch": 0.25267033019316776, + "grad_norm": 2.235106945037842, + "learning_rate": 4.25290827126099e-05, + "loss": 3.308, + "step": 42485 + }, + { + "epoch": 0.2526762774764488, + "grad_norm": 2.3401544094085693, + "learning_rate": 4.25287496679613e-05, + "loss": 3.3699, + "step": 42486 + }, + { + "epoch": 0.25268222475972973, + "grad_norm": 3.289886236190796, + "learning_rate": 4.252841661719359e-05, + "loss": 3.434, + "step": 42487 + }, + { + "epoch": 0.25268817204301075, + "grad_norm": 2.4178500175476074, + "learning_rate": 4.2528083560306884e-05, + "loss": 3.3936, + "step": 42488 + }, + { + "epoch": 0.25269411932629177, + "grad_norm": 2.2965221405029297, + "learning_rate": 4.2527750497301323e-05, + "loss": 3.2787, + "step": 42489 + }, + { + "epoch": 0.2527000666095727, + "grad_norm": 3.2637665271759033, + "learning_rate": 4.252741742817699e-05, + "loss": 2.8806, + "step": 42490 + }, + { + "epoch": 0.25270601389285374, + "grad_norm": 2.6283178329467773, + "learning_rate": 4.252708435293403e-05, + "loss": 3.0053, + "step": 42491 + }, + { + "epoch": 0.25271196117613476, + "grad_norm": 2.382254123687744, + "learning_rate": 4.252675127157253e-05, + "loss": 3.2284, + "step": 42492 + }, + { + "epoch": 0.2527179084594157, + "grad_norm": 2.2079687118530273, + "learning_rate": 4.252641818409263e-05, + "loss": 3.3179, + "step": 42493 + }, + { + "epoch": 0.25272385574269673, + "grad_norm": 2.2829689979553223, + "learning_rate": 4.2526085090494444e-05, + "loss": 3.2623, + "step": 42494 + }, + { + "epoch": 0.25272980302597775, + "grad_norm": 2.4345366954803467, + "learning_rate": 4.2525751990778075e-05, + "loss": 3.3064, + "step": 42495 + }, + { + "epoch": 0.2527357503092587, + "grad_norm": 2.4733173847198486, + "learning_rate": 4.252541888494365e-05, + "loss": 3.2664, + "step": 42496 + }, + { + "epoch": 0.2527416975925397, + "grad_norm": 2.167036533355713, + "learning_rate": 4.2525085772991286e-05, + "loss": 4.0621, + "step": 42497 + }, + { + "epoch": 0.25274764487582074, + "grad_norm": 2.1195971965789795, + "learning_rate": 4.252475265492109e-05, + "loss": 4.7048, + "step": 42498 + }, + { + "epoch": 0.2527535921591017, + "grad_norm": 2.8585541248321533, + "learning_rate": 4.252441953073319e-05, + "loss": 3.5919, + "step": 42499 + }, + { + "epoch": 0.2527595394423827, + "grad_norm": 2.360858678817749, + "learning_rate": 4.252408640042769e-05, + "loss": 4.4677, + "step": 42500 + }, + { + "epoch": 0.25276548672566373, + "grad_norm": 1.5480945110321045, + "learning_rate": 4.252375326400471e-05, + "loss": 4.8004, + "step": 42501 + }, + { + "epoch": 0.2527714340089447, + "grad_norm": 1.6938588619232178, + "learning_rate": 4.252342012146438e-05, + "loss": 4.8569, + "step": 42502 + }, + { + "epoch": 0.2527773812922257, + "grad_norm": 1.608335256576538, + "learning_rate": 4.25230869728068e-05, + "loss": 4.8393, + "step": 42503 + }, + { + "epoch": 0.2527833285755067, + "grad_norm": 1.4042285680770874, + "learning_rate": 4.252275381803208e-05, + "loss": 4.826, + "step": 42504 + }, + { + "epoch": 0.2527892758587877, + "grad_norm": 1.3554788827896118, + "learning_rate": 4.252242065714036e-05, + "loss": 4.5882, + "step": 42505 + }, + { + "epoch": 0.2527952231420687, + "grad_norm": 1.6296404600143433, + "learning_rate": 4.252208749013175e-05, + "loss": 4.7762, + "step": 42506 + }, + { + "epoch": 0.2528011704253497, + "grad_norm": 1.8994042873382568, + "learning_rate": 4.252175431700635e-05, + "loss": 4.7527, + "step": 42507 + }, + { + "epoch": 0.2528071177086307, + "grad_norm": 1.9139682054519653, + "learning_rate": 4.2521421137764285e-05, + "loss": 4.388, + "step": 42508 + }, + { + "epoch": 0.2528130649919117, + "grad_norm": 2.0092434883117676, + "learning_rate": 4.2521087952405675e-05, + "loss": 4.7794, + "step": 42509 + }, + { + "epoch": 0.2528190122751927, + "grad_norm": 1.9171737432479858, + "learning_rate": 4.2520754760930634e-05, + "loss": 4.4662, + "step": 42510 + }, + { + "epoch": 0.25282495955847367, + "grad_norm": 1.655983567237854, + "learning_rate": 4.252042156333928e-05, + "loss": 4.8947, + "step": 42511 + }, + { + "epoch": 0.2528309068417547, + "grad_norm": 1.4871597290039062, + "learning_rate": 4.2520088359631724e-05, + "loss": 5.0391, + "step": 42512 + }, + { + "epoch": 0.2528368541250357, + "grad_norm": 1.5848089456558228, + "learning_rate": 4.251975514980809e-05, + "loss": 4.8699, + "step": 42513 + }, + { + "epoch": 0.25284280140831666, + "grad_norm": 1.4499403238296509, + "learning_rate": 4.25194219338685e-05, + "loss": 4.777, + "step": 42514 + }, + { + "epoch": 0.2528487486915977, + "grad_norm": 1.3256508111953735, + "learning_rate": 4.2519088711813046e-05, + "loss": 4.7451, + "step": 42515 + }, + { + "epoch": 0.2528546959748787, + "grad_norm": 1.279702067375183, + "learning_rate": 4.251875548364187e-05, + "loss": 4.8232, + "step": 42516 + }, + { + "epoch": 0.25286064325815966, + "grad_norm": 1.7813013792037964, + "learning_rate": 4.2518422249355064e-05, + "loss": 4.5963, + "step": 42517 + }, + { + "epoch": 0.2528665905414407, + "grad_norm": 1.5852144956588745, + "learning_rate": 4.2518089008952764e-05, + "loss": 4.9773, + "step": 42518 + }, + { + "epoch": 0.2528725378247217, + "grad_norm": 1.6487088203430176, + "learning_rate": 4.251775576243508e-05, + "loss": 5.4712, + "step": 42519 + }, + { + "epoch": 0.25287848510800265, + "grad_norm": 1.7761071920394897, + "learning_rate": 4.2517422509802135e-05, + "loss": 4.6892, + "step": 42520 + }, + { + "epoch": 0.25288443239128366, + "grad_norm": 1.573292851448059, + "learning_rate": 4.251708925105403e-05, + "loss": 4.866, + "step": 42521 + }, + { + "epoch": 0.2528903796745647, + "grad_norm": 1.6427650451660156, + "learning_rate": 4.2516755986190895e-05, + "loss": 4.8201, + "step": 42522 + }, + { + "epoch": 0.25289632695784564, + "grad_norm": 1.8255342245101929, + "learning_rate": 4.251642271521285e-05, + "loss": 4.5685, + "step": 42523 + }, + { + "epoch": 0.25290227424112666, + "grad_norm": 1.5587304830551147, + "learning_rate": 4.251608943811999e-05, + "loss": 4.7972, + "step": 42524 + }, + { + "epoch": 0.2529082215244077, + "grad_norm": 1.4785772562026978, + "learning_rate": 4.251575615491244e-05, + "loss": 4.6395, + "step": 42525 + }, + { + "epoch": 0.25291416880768863, + "grad_norm": 1.3406004905700684, + "learning_rate": 4.251542286559033e-05, + "loss": 4.5403, + "step": 42526 + }, + { + "epoch": 0.25292011609096965, + "grad_norm": 2.720374584197998, + "learning_rate": 4.2515089570153766e-05, + "loss": 4.4205, + "step": 42527 + }, + { + "epoch": 0.25292606337425066, + "grad_norm": 3.176406145095825, + "learning_rate": 4.251475626860286e-05, + "loss": 4.5482, + "step": 42528 + }, + { + "epoch": 0.2529320106575316, + "grad_norm": 3.334611654281616, + "learning_rate": 4.251442296093774e-05, + "loss": 4.6271, + "step": 42529 + }, + { + "epoch": 0.25293795794081264, + "grad_norm": 2.3106868267059326, + "learning_rate": 4.251408964715852e-05, + "loss": 4.7845, + "step": 42530 + }, + { + "epoch": 0.25294390522409366, + "grad_norm": 2.0996861457824707, + "learning_rate": 4.25137563272653e-05, + "loss": 4.94, + "step": 42531 + }, + { + "epoch": 0.2529498525073746, + "grad_norm": 1.6225746870040894, + "learning_rate": 4.251342300125821e-05, + "loss": 5.0029, + "step": 42532 + }, + { + "epoch": 0.25295579979065563, + "grad_norm": 2.4678306579589844, + "learning_rate": 4.2513089669137374e-05, + "loss": 4.4738, + "step": 42533 + }, + { + "epoch": 0.25296174707393665, + "grad_norm": 2.3553192615509033, + "learning_rate": 4.25127563309029e-05, + "loss": 3.9229, + "step": 42534 + }, + { + "epoch": 0.2529676943572176, + "grad_norm": 1.5579992532730103, + "learning_rate": 4.251242298655489e-05, + "loss": 5.1221, + "step": 42535 + }, + { + "epoch": 0.2529736416404986, + "grad_norm": 1.9477174282073975, + "learning_rate": 4.2512089636093485e-05, + "loss": 4.0463, + "step": 42536 + }, + { + "epoch": 0.25297958892377964, + "grad_norm": 2.5758211612701416, + "learning_rate": 4.251175627951879e-05, + "loss": 3.003, + "step": 42537 + }, + { + "epoch": 0.2529855362070606, + "grad_norm": 2.3936142921447754, + "learning_rate": 4.2511422916830916e-05, + "loss": 2.744, + "step": 42538 + }, + { + "epoch": 0.2529914834903416, + "grad_norm": 2.2958381175994873, + "learning_rate": 4.2511089548029995e-05, + "loss": 2.686, + "step": 42539 + }, + { + "epoch": 0.25299743077362263, + "grad_norm": 2.279111385345459, + "learning_rate": 4.2510756173116125e-05, + "loss": 2.7398, + "step": 42540 + }, + { + "epoch": 0.2530033780569036, + "grad_norm": 2.147298574447632, + "learning_rate": 4.2510422792089436e-05, + "loss": 3.0212, + "step": 42541 + }, + { + "epoch": 0.2530093253401846, + "grad_norm": 1.580404281616211, + "learning_rate": 4.2510089404950035e-05, + "loss": 4.7861, + "step": 42542 + }, + { + "epoch": 0.2530152726234656, + "grad_norm": 1.8899710178375244, + "learning_rate": 4.2509756011698044e-05, + "loss": 4.3639, + "step": 42543 + }, + { + "epoch": 0.2530212199067466, + "grad_norm": 2.1244473457336426, + "learning_rate": 4.250942261233358e-05, + "loss": 4.2631, + "step": 42544 + }, + { + "epoch": 0.2530271671900276, + "grad_norm": 1.8744895458221436, + "learning_rate": 4.2509089206856755e-05, + "loss": 4.2686, + "step": 42545 + }, + { + "epoch": 0.2530331144733086, + "grad_norm": 2.142871618270874, + "learning_rate": 4.2508755795267686e-05, + "loss": 5.3293, + "step": 42546 + }, + { + "epoch": 0.2530390617565896, + "grad_norm": 2.0173709392547607, + "learning_rate": 4.250842237756649e-05, + "loss": 5.244, + "step": 42547 + }, + { + "epoch": 0.2530450090398706, + "grad_norm": 1.8850864171981812, + "learning_rate": 4.250808895375329e-05, + "loss": 5.1886, + "step": 42548 + }, + { + "epoch": 0.2530509563231516, + "grad_norm": 1.6481382846832275, + "learning_rate": 4.25077555238282e-05, + "loss": 4.9516, + "step": 42549 + }, + { + "epoch": 0.25305690360643257, + "grad_norm": 1.7065858840942383, + "learning_rate": 4.2507422087791325e-05, + "loss": 4.9445, + "step": 42550 + }, + { + "epoch": 0.2530628508897136, + "grad_norm": 1.8216016292572021, + "learning_rate": 4.250708864564279e-05, + "loss": 4.8446, + "step": 42551 + }, + { + "epoch": 0.2530687981729946, + "grad_norm": 2.345737934112549, + "learning_rate": 4.250675519738272e-05, + "loss": 4.4317, + "step": 42552 + }, + { + "epoch": 0.25307474545627556, + "grad_norm": 2.747398853302002, + "learning_rate": 4.250642174301122e-05, + "loss": 4.5337, + "step": 42553 + }, + { + "epoch": 0.2530806927395566, + "grad_norm": 2.019347667694092, + "learning_rate": 4.2506088282528405e-05, + "loss": 4.518, + "step": 42554 + }, + { + "epoch": 0.2530866400228376, + "grad_norm": 1.7942535877227783, + "learning_rate": 4.250575481593439e-05, + "loss": 4.6254, + "step": 42555 + }, + { + "epoch": 0.25309258730611855, + "grad_norm": 2.0116257667541504, + "learning_rate": 4.250542134322931e-05, + "loss": 4.4959, + "step": 42556 + }, + { + "epoch": 0.25309853458939957, + "grad_norm": 1.791983723640442, + "learning_rate": 4.250508786441326e-05, + "loss": 4.5039, + "step": 42557 + }, + { + "epoch": 0.2531044818726806, + "grad_norm": 1.6015015840530396, + "learning_rate": 4.250475437948637e-05, + "loss": 4.552, + "step": 42558 + }, + { + "epoch": 0.25311042915596155, + "grad_norm": 1.5469990968704224, + "learning_rate": 4.2504420888448746e-05, + "loss": 4.6645, + "step": 42559 + }, + { + "epoch": 0.25311637643924256, + "grad_norm": 1.906301498413086, + "learning_rate": 4.2504087391300514e-05, + "loss": 4.6274, + "step": 42560 + }, + { + "epoch": 0.2531223237225236, + "grad_norm": 1.8907713890075684, + "learning_rate": 4.2503753888041786e-05, + "loss": 4.6035, + "step": 42561 + }, + { + "epoch": 0.25312827100580454, + "grad_norm": 1.6200282573699951, + "learning_rate": 4.250342037867267e-05, + "loss": 4.8408, + "step": 42562 + }, + { + "epoch": 0.25313421828908556, + "grad_norm": 1.9498435258865356, + "learning_rate": 4.25030868631933e-05, + "loss": 4.5159, + "step": 42563 + }, + { + "epoch": 0.2531401655723665, + "grad_norm": 1.8380780220031738, + "learning_rate": 4.250275334160378e-05, + "loss": 4.4993, + "step": 42564 + }, + { + "epoch": 0.25314611285564753, + "grad_norm": 1.676686406135559, + "learning_rate": 4.250241981390423e-05, + "loss": 4.5457, + "step": 42565 + }, + { + "epoch": 0.25315206013892855, + "grad_norm": 1.618243932723999, + "learning_rate": 4.2502086280094764e-05, + "loss": 5.2673, + "step": 42566 + }, + { + "epoch": 0.2531580074222095, + "grad_norm": 1.8503484725952148, + "learning_rate": 4.25017527401755e-05, + "loss": 4.7984, + "step": 42567 + }, + { + "epoch": 0.2531639547054905, + "grad_norm": 1.6484590768814087, + "learning_rate": 4.250141919414656e-05, + "loss": 5.0665, + "step": 42568 + }, + { + "epoch": 0.25316990198877154, + "grad_norm": 1.5873533487319946, + "learning_rate": 4.250108564200805e-05, + "loss": 4.8822, + "step": 42569 + }, + { + "epoch": 0.2531758492720525, + "grad_norm": 2.021108388900757, + "learning_rate": 4.250075208376009e-05, + "loss": 4.4823, + "step": 42570 + }, + { + "epoch": 0.2531817965553335, + "grad_norm": 1.7180837392807007, + "learning_rate": 4.2500418519402805e-05, + "loss": 5.1688, + "step": 42571 + }, + { + "epoch": 0.25318774383861453, + "grad_norm": 1.7250596284866333, + "learning_rate": 4.25000849489363e-05, + "loss": 4.8874, + "step": 42572 + }, + { + "epoch": 0.2531936911218955, + "grad_norm": 1.6275099515914917, + "learning_rate": 4.24997513723607e-05, + "loss": 4.7809, + "step": 42573 + }, + { + "epoch": 0.2531996384051765, + "grad_norm": 1.586955189704895, + "learning_rate": 4.249941778967612e-05, + "loss": 4.6629, + "step": 42574 + }, + { + "epoch": 0.2532055856884575, + "grad_norm": 1.6709104776382446, + "learning_rate": 4.249908420088266e-05, + "loss": 4.6403, + "step": 42575 + }, + { + "epoch": 0.2532115329717385, + "grad_norm": 1.6892684698104858, + "learning_rate": 4.249875060598047e-05, + "loss": 5.0257, + "step": 42576 + }, + { + "epoch": 0.2532174802550195, + "grad_norm": 1.4856452941894531, + "learning_rate": 4.2498417004969637e-05, + "loss": 5.1931, + "step": 42577 + }, + { + "epoch": 0.2532234275383005, + "grad_norm": 1.978171467781067, + "learning_rate": 4.2498083397850283e-05, + "loss": 4.7726, + "step": 42578 + }, + { + "epoch": 0.2532293748215815, + "grad_norm": 1.859830617904663, + "learning_rate": 4.2497749784622535e-05, + "loss": 4.9147, + "step": 42579 + }, + { + "epoch": 0.2532353221048625, + "grad_norm": 1.7917394638061523, + "learning_rate": 4.249741616528651e-05, + "loss": 4.8351, + "step": 42580 + }, + { + "epoch": 0.2532412693881435, + "grad_norm": 1.436583161354065, + "learning_rate": 4.24970825398423e-05, + "loss": 4.8081, + "step": 42581 + }, + { + "epoch": 0.25324721667142447, + "grad_norm": 1.6560750007629395, + "learning_rate": 4.249674890829005e-05, + "loss": 4.6613, + "step": 42582 + }, + { + "epoch": 0.2532531639547055, + "grad_norm": 1.7613948583602905, + "learning_rate": 4.249641527062987e-05, + "loss": 4.6875, + "step": 42583 + }, + { + "epoch": 0.2532591112379865, + "grad_norm": 1.5537686347961426, + "learning_rate": 4.249608162686186e-05, + "loss": 4.7465, + "step": 42584 + }, + { + "epoch": 0.25326505852126746, + "grad_norm": 1.8550828695297241, + "learning_rate": 4.2495747976986154e-05, + "loss": 4.6628, + "step": 42585 + }, + { + "epoch": 0.2532710058045485, + "grad_norm": 1.4868930578231812, + "learning_rate": 4.2495414321002866e-05, + "loss": 4.7069, + "step": 42586 + }, + { + "epoch": 0.2532769530878295, + "grad_norm": 1.4797731637954712, + "learning_rate": 4.249508065891211e-05, + "loss": 4.8393, + "step": 42587 + }, + { + "epoch": 0.25328290037111045, + "grad_norm": 1.6670103073120117, + "learning_rate": 4.2494746990714005e-05, + "loss": 4.5247, + "step": 42588 + }, + { + "epoch": 0.25328884765439147, + "grad_norm": 1.5516252517700195, + "learning_rate": 4.249441331640866e-05, + "loss": 4.713, + "step": 42589 + }, + { + "epoch": 0.2532947949376725, + "grad_norm": 1.6883665323257446, + "learning_rate": 4.24940796359962e-05, + "loss": 4.7604, + "step": 42590 + }, + { + "epoch": 0.25330074222095345, + "grad_norm": 1.7170480489730835, + "learning_rate": 4.249374594947673e-05, + "loss": 4.8235, + "step": 42591 + }, + { + "epoch": 0.25330668950423446, + "grad_norm": 1.5353639125823975, + "learning_rate": 4.2493412256850376e-05, + "loss": 4.6567, + "step": 42592 + }, + { + "epoch": 0.2533126367875155, + "grad_norm": 1.405881404876709, + "learning_rate": 4.249307855811725e-05, + "loss": 4.7168, + "step": 42593 + }, + { + "epoch": 0.25331858407079644, + "grad_norm": 1.6876722574234009, + "learning_rate": 4.249274485327748e-05, + "loss": 4.5179, + "step": 42594 + }, + { + "epoch": 0.25332453135407745, + "grad_norm": 1.808559775352478, + "learning_rate": 4.2492411142331164e-05, + "loss": 5.2088, + "step": 42595 + }, + { + "epoch": 0.25333047863735847, + "grad_norm": 1.7939791679382324, + "learning_rate": 4.2492077425278434e-05, + "loss": 5.1593, + "step": 42596 + }, + { + "epoch": 0.25333642592063943, + "grad_norm": 1.8243491649627686, + "learning_rate": 4.24917437021194e-05, + "loss": 5.2049, + "step": 42597 + }, + { + "epoch": 0.25334237320392045, + "grad_norm": 2.1501033306121826, + "learning_rate": 4.249140997285418e-05, + "loss": 3.4541, + "step": 42598 + }, + { + "epoch": 0.25334832048720146, + "grad_norm": 1.7840768098831177, + "learning_rate": 4.249107623748288e-05, + "loss": 4.07, + "step": 42599 + }, + { + "epoch": 0.2533542677704824, + "grad_norm": 1.6957765817642212, + "learning_rate": 4.249074249600564e-05, + "loss": 4.1273, + "step": 42600 + }, + { + "epoch": 0.25336021505376344, + "grad_norm": 1.6441603899002075, + "learning_rate": 4.2490408748422555e-05, + "loss": 4.1511, + "step": 42601 + }, + { + "epoch": 0.25336616233704445, + "grad_norm": 1.6739991903305054, + "learning_rate": 4.249007499473375e-05, + "loss": 4.8187, + "step": 42602 + }, + { + "epoch": 0.2533721096203254, + "grad_norm": 1.5142462253570557, + "learning_rate": 4.248974123493934e-05, + "loss": 4.3656, + "step": 42603 + }, + { + "epoch": 0.25337805690360643, + "grad_norm": 1.4762675762176514, + "learning_rate": 4.2489407469039445e-05, + "loss": 4.5738, + "step": 42604 + }, + { + "epoch": 0.25338400418688745, + "grad_norm": 1.6087926626205444, + "learning_rate": 4.248907369703418e-05, + "loss": 4.2183, + "step": 42605 + }, + { + "epoch": 0.2533899514701684, + "grad_norm": 2.1500778198242188, + "learning_rate": 4.248873991892365e-05, + "loss": 3.8103, + "step": 42606 + }, + { + "epoch": 0.2533958987534494, + "grad_norm": 2.37050724029541, + "learning_rate": 4.248840613470799e-05, + "loss": 3.4346, + "step": 42607 + }, + { + "epoch": 0.25340184603673044, + "grad_norm": 2.65510892868042, + "learning_rate": 4.24880723443873e-05, + "loss": 3.369, + "step": 42608 + }, + { + "epoch": 0.2534077933200114, + "grad_norm": 2.3285646438598633, + "learning_rate": 4.2487738547961714e-05, + "loss": 3.2826, + "step": 42609 + }, + { + "epoch": 0.2534137406032924, + "grad_norm": 2.327310085296631, + "learning_rate": 4.248740474543134e-05, + "loss": 3.1819, + "step": 42610 + }, + { + "epoch": 0.25341968788657343, + "grad_norm": 2.2286763191223145, + "learning_rate": 4.248707093679628e-05, + "loss": 3.3033, + "step": 42611 + }, + { + "epoch": 0.2534256351698544, + "grad_norm": 1.8175380229949951, + "learning_rate": 4.248673712205668e-05, + "loss": 4.1151, + "step": 42612 + }, + { + "epoch": 0.2534315824531354, + "grad_norm": 1.787113904953003, + "learning_rate": 4.248640330121263e-05, + "loss": 4.3068, + "step": 42613 + }, + { + "epoch": 0.2534375297364164, + "grad_norm": 2.8377792835235596, + "learning_rate": 4.2486069474264266e-05, + "loss": 3.2816, + "step": 42614 + }, + { + "epoch": 0.2534434770196974, + "grad_norm": 2.6886258125305176, + "learning_rate": 4.248573564121169e-05, + "loss": 3.5237, + "step": 42615 + }, + { + "epoch": 0.2534494243029784, + "grad_norm": 2.278836250305176, + "learning_rate": 4.2485401802055024e-05, + "loss": 3.2107, + "step": 42616 + }, + { + "epoch": 0.2534553715862594, + "grad_norm": 2.1735024452209473, + "learning_rate": 4.248506795679439e-05, + "loss": 3.5181, + "step": 42617 + }, + { + "epoch": 0.2534613188695404, + "grad_norm": 2.087636709213257, + "learning_rate": 4.248473410542989e-05, + "loss": 3.3997, + "step": 42618 + }, + { + "epoch": 0.2534672661528214, + "grad_norm": 2.3116719722747803, + "learning_rate": 4.248440024796166e-05, + "loss": 2.9808, + "step": 42619 + }, + { + "epoch": 0.2534732134361024, + "grad_norm": 2.581827163696289, + "learning_rate": 4.2484066384389795e-05, + "loss": 2.9991, + "step": 42620 + }, + { + "epoch": 0.25347916071938337, + "grad_norm": 2.61434268951416, + "learning_rate": 4.248373251471443e-05, + "loss": 3.2072, + "step": 42621 + }, + { + "epoch": 0.2534851080026644, + "grad_norm": 2.502610445022583, + "learning_rate": 4.248339863893568e-05, + "loss": 3.5486, + "step": 42622 + }, + { + "epoch": 0.2534910552859454, + "grad_norm": 2.405918836593628, + "learning_rate": 4.2483064757053655e-05, + "loss": 3.0745, + "step": 42623 + }, + { + "epoch": 0.25349700256922636, + "grad_norm": 1.6593096256256104, + "learning_rate": 4.248273086906846e-05, + "loss": 5.6345, + "step": 42624 + }, + { + "epoch": 0.2535029498525074, + "grad_norm": 1.8466362953186035, + "learning_rate": 4.248239697498024e-05, + "loss": 4.0068, + "step": 42625 + }, + { + "epoch": 0.2535088971357884, + "grad_norm": 1.911318302154541, + "learning_rate": 4.248206307478909e-05, + "loss": 4.4992, + "step": 42626 + }, + { + "epoch": 0.25351484441906935, + "grad_norm": 2.6518044471740723, + "learning_rate": 4.2481729168495124e-05, + "loss": 3.4995, + "step": 42627 + }, + { + "epoch": 0.25352079170235037, + "grad_norm": 3.160210371017456, + "learning_rate": 4.2481395256098476e-05, + "loss": 2.8131, + "step": 42628 + }, + { + "epoch": 0.2535267389856314, + "grad_norm": 3.1896450519561768, + "learning_rate": 4.2481061337599254e-05, + "loss": 2.7946, + "step": 42629 + }, + { + "epoch": 0.25353268626891234, + "grad_norm": 2.6981887817382812, + "learning_rate": 4.248072741299757e-05, + "loss": 2.6004, + "step": 42630 + }, + { + "epoch": 0.25353863355219336, + "grad_norm": 2.9441938400268555, + "learning_rate": 4.248039348229355e-05, + "loss": 2.6603, + "step": 42631 + }, + { + "epoch": 0.2535445808354744, + "grad_norm": 2.5899014472961426, + "learning_rate": 4.248005954548731e-05, + "loss": 2.4422, + "step": 42632 + }, + { + "epoch": 0.25355052811875534, + "grad_norm": 2.690110206604004, + "learning_rate": 4.247972560257895e-05, + "loss": 2.507, + "step": 42633 + }, + { + "epoch": 0.25355647540203635, + "grad_norm": 2.5802059173583984, + "learning_rate": 4.2479391653568603e-05, + "loss": 2.575, + "step": 42634 + }, + { + "epoch": 0.25356242268531737, + "grad_norm": 2.0235345363616943, + "learning_rate": 4.2479057698456375e-05, + "loss": 4.034, + "step": 42635 + }, + { + "epoch": 0.25356836996859833, + "grad_norm": 2.2500898838043213, + "learning_rate": 4.2478723737242387e-05, + "loss": 3.5131, + "step": 42636 + }, + { + "epoch": 0.25357431725187934, + "grad_norm": 2.231600761413574, + "learning_rate": 4.247838976992677e-05, + "loss": 3.3792, + "step": 42637 + }, + { + "epoch": 0.25358026453516036, + "grad_norm": 2.043940544128418, + "learning_rate": 4.247805579650962e-05, + "loss": 3.2422, + "step": 42638 + }, + { + "epoch": 0.2535862118184413, + "grad_norm": 1.8581470251083374, + "learning_rate": 4.247772181699106e-05, + "loss": 3.2392, + "step": 42639 + }, + { + "epoch": 0.25359215910172234, + "grad_norm": 1.8958791494369507, + "learning_rate": 4.247738783137122e-05, + "loss": 3.3189, + "step": 42640 + }, + { + "epoch": 0.25359810638500335, + "grad_norm": 1.8973298072814941, + "learning_rate": 4.2477053839650186e-05, + "loss": 3.2994, + "step": 42641 + }, + { + "epoch": 0.2536040536682843, + "grad_norm": 1.8182251453399658, + "learning_rate": 4.2476719841828104e-05, + "loss": 3.2506, + "step": 42642 + }, + { + "epoch": 0.25361000095156533, + "grad_norm": 1.9134297370910645, + "learning_rate": 4.247638583790508e-05, + "loss": 3.3197, + "step": 42643 + }, + { + "epoch": 0.25361594823484634, + "grad_norm": 1.9987361431121826, + "learning_rate": 4.2476051827881225e-05, + "loss": 3.2988, + "step": 42644 + }, + { + "epoch": 0.2536218955181273, + "grad_norm": 1.7997865676879883, + "learning_rate": 4.2475717811756665e-05, + "loss": 3.4833, + "step": 42645 + }, + { + "epoch": 0.2536278428014083, + "grad_norm": 1.8120460510253906, + "learning_rate": 4.2475383789531516e-05, + "loss": 3.4602, + "step": 42646 + }, + { + "epoch": 0.25363379008468934, + "grad_norm": 1.835911750793457, + "learning_rate": 4.247504976120589e-05, + "loss": 3.3893, + "step": 42647 + }, + { + "epoch": 0.2536397373679703, + "grad_norm": 1.796010136604309, + "learning_rate": 4.247471572677989e-05, + "loss": 3.3781, + "step": 42648 + }, + { + "epoch": 0.2536456846512513, + "grad_norm": 1.8301551342010498, + "learning_rate": 4.247438168625366e-05, + "loss": 3.7588, + "step": 42649 + }, + { + "epoch": 0.25365163193453233, + "grad_norm": 2.099531888961792, + "learning_rate": 4.24740476396273e-05, + "loss": 4.4443, + "step": 42650 + }, + { + "epoch": 0.2536575792178133, + "grad_norm": 2.012749195098877, + "learning_rate": 4.247371358690093e-05, + "loss": 4.5643, + "step": 42651 + }, + { + "epoch": 0.2536635265010943, + "grad_norm": 1.754624843597412, + "learning_rate": 4.2473379528074676e-05, + "loss": 4.3702, + "step": 42652 + }, + { + "epoch": 0.2536694737843753, + "grad_norm": 1.6757663488388062, + "learning_rate": 4.2473045463148645e-05, + "loss": 4.3166, + "step": 42653 + }, + { + "epoch": 0.2536754210676563, + "grad_norm": 1.8429919481277466, + "learning_rate": 4.2472711392122947e-05, + "loss": 4.4779, + "step": 42654 + }, + { + "epoch": 0.2536813683509373, + "grad_norm": 1.5572214126586914, + "learning_rate": 4.24723773149977e-05, + "loss": 4.417, + "step": 42655 + }, + { + "epoch": 0.2536873156342183, + "grad_norm": 1.6131296157836914, + "learning_rate": 4.247204323177304e-05, + "loss": 4.7333, + "step": 42656 + }, + { + "epoch": 0.2536932629174993, + "grad_norm": 1.4000458717346191, + "learning_rate": 4.247170914244906e-05, + "loss": 4.516, + "step": 42657 + }, + { + "epoch": 0.2536992102007803, + "grad_norm": 1.9727498292922974, + "learning_rate": 4.247137504702589e-05, + "loss": 4.3962, + "step": 42658 + }, + { + "epoch": 0.2537051574840613, + "grad_norm": 1.534562587738037, + "learning_rate": 4.247104094550365e-05, + "loss": 5.3564, + "step": 42659 + }, + { + "epoch": 0.25371110476734227, + "grad_norm": 1.5723401308059692, + "learning_rate": 4.2470706837882446e-05, + "loss": 4.9789, + "step": 42660 + }, + { + "epoch": 0.2537170520506233, + "grad_norm": 1.9463605880737305, + "learning_rate": 4.24703727241624e-05, + "loss": 4.2523, + "step": 42661 + }, + { + "epoch": 0.2537229993339043, + "grad_norm": 2.341726779937744, + "learning_rate": 4.247003860434362e-05, + "loss": 3.5043, + "step": 42662 + }, + { + "epoch": 0.25372894661718526, + "grad_norm": 2.2063679695129395, + "learning_rate": 4.2469704478426233e-05, + "loss": 3.1857, + "step": 42663 + }, + { + "epoch": 0.2537348939004663, + "grad_norm": 2.2621538639068604, + "learning_rate": 4.246937034641036e-05, + "loss": 3.2143, + "step": 42664 + }, + { + "epoch": 0.2537408411837473, + "grad_norm": 2.133404016494751, + "learning_rate": 4.246903620829611e-05, + "loss": 4.0347, + "step": 42665 + }, + { + "epoch": 0.25374678846702825, + "grad_norm": 1.5305193662643433, + "learning_rate": 4.2468702064083586e-05, + "loss": 5.1865, + "step": 42666 + }, + { + "epoch": 0.25375273575030927, + "grad_norm": 1.5581790208816528, + "learning_rate": 4.2468367913772934e-05, + "loss": 5.2932, + "step": 42667 + }, + { + "epoch": 0.2537586830335903, + "grad_norm": 1.7165672779083252, + "learning_rate": 4.246803375736425e-05, + "loss": 5.4158, + "step": 42668 + }, + { + "epoch": 0.25376463031687124, + "grad_norm": 1.5029596090316772, + "learning_rate": 4.246769959485764e-05, + "loss": 5.3813, + "step": 42669 + }, + { + "epoch": 0.25377057760015226, + "grad_norm": 1.4712028503417969, + "learning_rate": 4.246736542625326e-05, + "loss": 5.4072, + "step": 42670 + }, + { + "epoch": 0.2537765248834333, + "grad_norm": 1.6006098985671997, + "learning_rate": 4.24670312515512e-05, + "loss": 5.3419, + "step": 42671 + }, + { + "epoch": 0.25378247216671423, + "grad_norm": 1.721290111541748, + "learning_rate": 4.2466697070751574e-05, + "loss": 5.1645, + "step": 42672 + }, + { + "epoch": 0.25378841944999525, + "grad_norm": 1.6121009588241577, + "learning_rate": 4.2466362883854506e-05, + "loss": 5.1316, + "step": 42673 + }, + { + "epoch": 0.25379436673327627, + "grad_norm": 1.363467812538147, + "learning_rate": 4.24660286908601e-05, + "loss": 5.5796, + "step": 42674 + }, + { + "epoch": 0.2538003140165572, + "grad_norm": 1.5062321424484253, + "learning_rate": 4.2465694491768504e-05, + "loss": 5.4835, + "step": 42675 + }, + { + "epoch": 0.25380626129983824, + "grad_norm": 1.763193964958191, + "learning_rate": 4.24653602865798e-05, + "loss": 4.8117, + "step": 42676 + }, + { + "epoch": 0.25381220858311926, + "grad_norm": 1.7404911518096924, + "learning_rate": 4.2465026075294126e-05, + "loss": 4.8563, + "step": 42677 + }, + { + "epoch": 0.2538181558664002, + "grad_norm": 1.3482542037963867, + "learning_rate": 4.246469185791159e-05, + "loss": 4.878, + "step": 42678 + }, + { + "epoch": 0.25382410314968123, + "grad_norm": 1.5077338218688965, + "learning_rate": 4.246435763443231e-05, + "loss": 4.8613, + "step": 42679 + }, + { + "epoch": 0.2538300504329622, + "grad_norm": 1.503841757774353, + "learning_rate": 4.24640234048564e-05, + "loss": 4.9849, + "step": 42680 + }, + { + "epoch": 0.2538359977162432, + "grad_norm": 1.6105703115463257, + "learning_rate": 4.2463689169183984e-05, + "loss": 4.9247, + "step": 42681 + }, + { + "epoch": 0.2538419449995242, + "grad_norm": 1.6417720317840576, + "learning_rate": 4.246335492741518e-05, + "loss": 4.8256, + "step": 42682 + }, + { + "epoch": 0.2538478922828052, + "grad_norm": 1.4955658912658691, + "learning_rate": 4.246302067955009e-05, + "loss": 4.9588, + "step": 42683 + }, + { + "epoch": 0.2538538395660862, + "grad_norm": 1.552788257598877, + "learning_rate": 4.2462686425588836e-05, + "loss": 5.0579, + "step": 42684 + }, + { + "epoch": 0.2538597868493672, + "grad_norm": 1.9682592153549194, + "learning_rate": 4.2462352165531546e-05, + "loss": 4.1407, + "step": 42685 + }, + { + "epoch": 0.2538657341326482, + "grad_norm": 3.035689115524292, + "learning_rate": 4.246201789937833e-05, + "loss": 2.9328, + "step": 42686 + }, + { + "epoch": 0.2538716814159292, + "grad_norm": 2.8879168033599854, + "learning_rate": 4.24616836271293e-05, + "loss": 2.877, + "step": 42687 + }, + { + "epoch": 0.2538776286992102, + "grad_norm": 2.6074929237365723, + "learning_rate": 4.2461349348784587e-05, + "loss": 2.7149, + "step": 42688 + }, + { + "epoch": 0.25388357598249117, + "grad_norm": 2.538850784301758, + "learning_rate": 4.246101506434428e-05, + "loss": 2.7963, + "step": 42689 + }, + { + "epoch": 0.2538895232657722, + "grad_norm": 2.521348237991333, + "learning_rate": 4.2460680773808525e-05, + "loss": 3.1103, + "step": 42690 + }, + { + "epoch": 0.2538954705490532, + "grad_norm": 2.6903038024902344, + "learning_rate": 4.2460346477177424e-05, + "loss": 3.0654, + "step": 42691 + }, + { + "epoch": 0.25390141783233416, + "grad_norm": 2.676309585571289, + "learning_rate": 4.2460012174451094e-05, + "loss": 2.8576, + "step": 42692 + }, + { + "epoch": 0.2539073651156152, + "grad_norm": 2.7831082344055176, + "learning_rate": 4.2459677865629664e-05, + "loss": 3.1207, + "step": 42693 + }, + { + "epoch": 0.2539133123988962, + "grad_norm": 2.8206958770751953, + "learning_rate": 4.245934355071323e-05, + "loss": 2.6306, + "step": 42694 + }, + { + "epoch": 0.25391925968217716, + "grad_norm": 2.6332650184631348, + "learning_rate": 4.2459009229701924e-05, + "loss": 2.8751, + "step": 42695 + }, + { + "epoch": 0.2539252069654582, + "grad_norm": 1.8874300718307495, + "learning_rate": 4.2458674902595865e-05, + "loss": 4.5213, + "step": 42696 + }, + { + "epoch": 0.2539311542487392, + "grad_norm": 1.6376129388809204, + "learning_rate": 4.2458340569395147e-05, + "loss": 4.9723, + "step": 42697 + }, + { + "epoch": 0.25393710153202015, + "grad_norm": 1.5270167589187622, + "learning_rate": 4.245800623009991e-05, + "loss": 4.9661, + "step": 42698 + }, + { + "epoch": 0.25394304881530116, + "grad_norm": 1.5328229665756226, + "learning_rate": 4.245767188471027e-05, + "loss": 4.6954, + "step": 42699 + }, + { + "epoch": 0.2539489960985822, + "grad_norm": 1.5438541173934937, + "learning_rate": 4.245733753322633e-05, + "loss": 5.0825, + "step": 42700 + }, + { + "epoch": 0.25395494338186314, + "grad_norm": 1.3641287088394165, + "learning_rate": 4.245700317564821e-05, + "loss": 5.7029, + "step": 42701 + }, + { + "epoch": 0.25396089066514416, + "grad_norm": 1.5299535989761353, + "learning_rate": 4.2456668811976045e-05, + "loss": 4.8341, + "step": 42702 + }, + { + "epoch": 0.2539668379484252, + "grad_norm": 1.427050232887268, + "learning_rate": 4.245633444220992e-05, + "loss": 4.8113, + "step": 42703 + }, + { + "epoch": 0.25397278523170613, + "grad_norm": 1.4848414659500122, + "learning_rate": 4.2456000066349976e-05, + "loss": 4.8599, + "step": 42704 + }, + { + "epoch": 0.25397873251498715, + "grad_norm": 1.5817290544509888, + "learning_rate": 4.245566568439633e-05, + "loss": 5.1289, + "step": 42705 + }, + { + "epoch": 0.25398467979826816, + "grad_norm": 1.943488597869873, + "learning_rate": 4.2455331296349084e-05, + "loss": 5.302, + "step": 42706 + }, + { + "epoch": 0.2539906270815491, + "grad_norm": 2.132399082183838, + "learning_rate": 4.2454996902208363e-05, + "loss": 5.3135, + "step": 42707 + }, + { + "epoch": 0.25399657436483014, + "grad_norm": 1.9411309957504272, + "learning_rate": 4.2454662501974284e-05, + "loss": 5.3548, + "step": 42708 + }, + { + "epoch": 0.25400252164811116, + "grad_norm": 2.0834579467773438, + "learning_rate": 4.245432809564696e-05, + "loss": 3.4557, + "step": 42709 + }, + { + "epoch": 0.2540084689313921, + "grad_norm": 2.0068488121032715, + "learning_rate": 4.245399368322651e-05, + "loss": 3.2146, + "step": 42710 + }, + { + "epoch": 0.25401441621467313, + "grad_norm": 1.8891756534576416, + "learning_rate": 4.245365926471306e-05, + "loss": 3.2437, + "step": 42711 + }, + { + "epoch": 0.25402036349795415, + "grad_norm": 1.9212063550949097, + "learning_rate": 4.245332484010671e-05, + "loss": 3.0635, + "step": 42712 + }, + { + "epoch": 0.2540263107812351, + "grad_norm": 1.8484230041503906, + "learning_rate": 4.245299040940758e-05, + "loss": 2.9479, + "step": 42713 + }, + { + "epoch": 0.2540322580645161, + "grad_norm": 1.8337727785110474, + "learning_rate": 4.2452655972615805e-05, + "loss": 3.1404, + "step": 42714 + }, + { + "epoch": 0.25403820534779714, + "grad_norm": 1.890859842300415, + "learning_rate": 4.2452321529731475e-05, + "loss": 3.0689, + "step": 42715 + }, + { + "epoch": 0.2540441526310781, + "grad_norm": 2.071683645248413, + "learning_rate": 4.245198708075473e-05, + "loss": 2.9632, + "step": 42716 + }, + { + "epoch": 0.2540500999143591, + "grad_norm": 2.116879940032959, + "learning_rate": 4.2451652625685675e-05, + "loss": 3.3957, + "step": 42717 + }, + { + "epoch": 0.25405604719764013, + "grad_norm": 1.9206409454345703, + "learning_rate": 4.245131816452442e-05, + "loss": 4.8377, + "step": 42718 + }, + { + "epoch": 0.2540619944809211, + "grad_norm": 1.9834216833114624, + "learning_rate": 4.2450983697271096e-05, + "loss": 3.1973, + "step": 42719 + }, + { + "epoch": 0.2540679417642021, + "grad_norm": 1.9708844423294067, + "learning_rate": 4.2450649223925815e-05, + "loss": 3.1048, + "step": 42720 + }, + { + "epoch": 0.2540738890474831, + "grad_norm": 1.872748613357544, + "learning_rate": 4.2450314744488686e-05, + "loss": 3.5175, + "step": 42721 + }, + { + "epoch": 0.2540798363307641, + "grad_norm": 2.1473896503448486, + "learning_rate": 4.244998025895984e-05, + "loss": 3.1265, + "step": 42722 + }, + { + "epoch": 0.2540857836140451, + "grad_norm": 1.969822883605957, + "learning_rate": 4.244964576733938e-05, + "loss": 3.0091, + "step": 42723 + }, + { + "epoch": 0.2540917308973261, + "grad_norm": 2.346958875656128, + "learning_rate": 4.244931126962744e-05, + "loss": 3.0155, + "step": 42724 + }, + { + "epoch": 0.2540976781806071, + "grad_norm": 2.7399938106536865, + "learning_rate": 4.244897676582411e-05, + "loss": 2.8983, + "step": 42725 + }, + { + "epoch": 0.2541036254638881, + "grad_norm": 2.916187047958374, + "learning_rate": 4.2448642255929535e-05, + "loss": 2.5157, + "step": 42726 + }, + { + "epoch": 0.2541095727471691, + "grad_norm": 3.213395118713379, + "learning_rate": 4.2448307739943816e-05, + "loss": 2.8312, + "step": 42727 + }, + { + "epoch": 0.25411552003045007, + "grad_norm": 2.626457452774048, + "learning_rate": 4.244797321786707e-05, + "loss": 3.0202, + "step": 42728 + }, + { + "epoch": 0.2541214673137311, + "grad_norm": 2.366530179977417, + "learning_rate": 4.244763868969942e-05, + "loss": 2.9616, + "step": 42729 + }, + { + "epoch": 0.2541274145970121, + "grad_norm": 2.5884974002838135, + "learning_rate": 4.244730415544098e-05, + "loss": 2.7768, + "step": 42730 + }, + { + "epoch": 0.25413336188029306, + "grad_norm": 2.6333789825439453, + "learning_rate": 4.2446969615091864e-05, + "loss": 2.6431, + "step": 42731 + }, + { + "epoch": 0.2541393091635741, + "grad_norm": 2.9141335487365723, + "learning_rate": 4.2446635068652193e-05, + "loss": 2.787, + "step": 42732 + }, + { + "epoch": 0.2541452564468551, + "grad_norm": 1.967450499534607, + "learning_rate": 4.2446300516122085e-05, + "loss": 5.1618, + "step": 42733 + }, + { + "epoch": 0.25415120373013605, + "grad_norm": 1.599597454071045, + "learning_rate": 4.244596595750164e-05, + "loss": 5.2181, + "step": 42734 + }, + { + "epoch": 0.25415715101341707, + "grad_norm": 1.6861554384231567, + "learning_rate": 4.244563139279101e-05, + "loss": 4.6825, + "step": 42735 + }, + { + "epoch": 0.2541630982966981, + "grad_norm": 2.498715400695801, + "learning_rate": 4.244529682199027e-05, + "loss": 4.1577, + "step": 42736 + }, + { + "epoch": 0.25416904557997905, + "grad_norm": 1.9850361347198486, + "learning_rate": 4.244496224509956e-05, + "loss": 4.9835, + "step": 42737 + }, + { + "epoch": 0.25417499286326006, + "grad_norm": 2.6186370849609375, + "learning_rate": 4.2444627662119005e-05, + "loss": 3.235, + "step": 42738 + }, + { + "epoch": 0.2541809401465411, + "grad_norm": 2.5452895164489746, + "learning_rate": 4.24442930730487e-05, + "loss": 3.1766, + "step": 42739 + }, + { + "epoch": 0.25418688742982204, + "grad_norm": 2.739374876022339, + "learning_rate": 4.2443958477888786e-05, + "loss": 3.0646, + "step": 42740 + }, + { + "epoch": 0.25419283471310306, + "grad_norm": 2.2580180168151855, + "learning_rate": 4.244362387663936e-05, + "loss": 2.9561, + "step": 42741 + }, + { + "epoch": 0.25419878199638407, + "grad_norm": 2.3250229358673096, + "learning_rate": 4.244328926930054e-05, + "loss": 3.2013, + "step": 42742 + }, + { + "epoch": 0.25420472927966503, + "grad_norm": 2.087367296218872, + "learning_rate": 4.2442954655872445e-05, + "loss": 3.2908, + "step": 42743 + }, + { + "epoch": 0.25421067656294605, + "grad_norm": 2.095390558242798, + "learning_rate": 4.244262003635521e-05, + "loss": 3.2589, + "step": 42744 + }, + { + "epoch": 0.25421662384622706, + "grad_norm": 2.4969546794891357, + "learning_rate": 4.244228541074893e-05, + "loss": 3.3509, + "step": 42745 + }, + { + "epoch": 0.254222571129508, + "grad_norm": 2.0570015907287598, + "learning_rate": 4.2441950779053716e-05, + "loss": 3.1941, + "step": 42746 + }, + { + "epoch": 0.25422851841278904, + "grad_norm": 2.1485908031463623, + "learning_rate": 4.2441616141269714e-05, + "loss": 3.2251, + "step": 42747 + }, + { + "epoch": 0.25423446569607006, + "grad_norm": 1.9246056079864502, + "learning_rate": 4.244128149739701e-05, + "loss": 3.3326, + "step": 42748 + }, + { + "epoch": 0.254240412979351, + "grad_norm": 2.0402774810791016, + "learning_rate": 4.244094684743575e-05, + "loss": 3.1603, + "step": 42749 + }, + { + "epoch": 0.25424636026263203, + "grad_norm": 1.9111288785934448, + "learning_rate": 4.2440612191386025e-05, + "loss": 3.1179, + "step": 42750 + }, + { + "epoch": 0.25425230754591305, + "grad_norm": 2.1031599044799805, + "learning_rate": 4.244027752924796e-05, + "loss": 3.1387, + "step": 42751 + }, + { + "epoch": 0.254258254829194, + "grad_norm": 2.016507148742676, + "learning_rate": 4.243994286102168e-05, + "loss": 3.2276, + "step": 42752 + }, + { + "epoch": 0.254264202112475, + "grad_norm": 2.1872031688690186, + "learning_rate": 4.24396081867073e-05, + "loss": 3.1944, + "step": 42753 + }, + { + "epoch": 0.25427014939575604, + "grad_norm": 1.757365107536316, + "learning_rate": 4.243927350630492e-05, + "loss": 4.1217, + "step": 42754 + }, + { + "epoch": 0.254276096679037, + "grad_norm": 1.7136365175247192, + "learning_rate": 4.2438938819814684e-05, + "loss": 5.0177, + "step": 42755 + }, + { + "epoch": 0.254282043962318, + "grad_norm": 1.7748982906341553, + "learning_rate": 4.243860412723669e-05, + "loss": 4.9788, + "step": 42756 + }, + { + "epoch": 0.25428799124559903, + "grad_norm": 1.418142318725586, + "learning_rate": 4.2438269428571056e-05, + "loss": 4.9127, + "step": 42757 + }, + { + "epoch": 0.25429393852888, + "grad_norm": 1.6478784084320068, + "learning_rate": 4.2437934723817906e-05, + "loss": 4.9848, + "step": 42758 + }, + { + "epoch": 0.254299885812161, + "grad_norm": 1.6177592277526855, + "learning_rate": 4.243760001297735e-05, + "loss": 4.8533, + "step": 42759 + }, + { + "epoch": 0.254305833095442, + "grad_norm": 1.8947839736938477, + "learning_rate": 4.243726529604951e-05, + "loss": 4.9163, + "step": 42760 + }, + { + "epoch": 0.254311780378723, + "grad_norm": 2.0188169479370117, + "learning_rate": 4.24369305730345e-05, + "loss": 4.0653, + "step": 42761 + }, + { + "epoch": 0.254317727662004, + "grad_norm": 2.4203720092773438, + "learning_rate": 4.243659584393244e-05, + "loss": 3.6917, + "step": 42762 + }, + { + "epoch": 0.254323674945285, + "grad_norm": 2.0568482875823975, + "learning_rate": 4.2436261108743444e-05, + "loss": 3.593, + "step": 42763 + }, + { + "epoch": 0.254329622228566, + "grad_norm": 1.997248888015747, + "learning_rate": 4.243592636746763e-05, + "loss": 3.482, + "step": 42764 + }, + { + "epoch": 0.254335569511847, + "grad_norm": 1.9005250930786133, + "learning_rate": 4.243559162010511e-05, + "loss": 3.4571, + "step": 42765 + }, + { + "epoch": 0.254341516795128, + "grad_norm": 1.7676974534988403, + "learning_rate": 4.243525686665601e-05, + "loss": 3.3561, + "step": 42766 + }, + { + "epoch": 0.25434746407840897, + "grad_norm": 1.8042411804199219, + "learning_rate": 4.243492210712043e-05, + "loss": 3.3382, + "step": 42767 + }, + { + "epoch": 0.25435341136169, + "grad_norm": 1.907562017440796, + "learning_rate": 4.243458734149851e-05, + "loss": 3.5174, + "step": 42768 + }, + { + "epoch": 0.254359358644971, + "grad_norm": 1.742384672164917, + "learning_rate": 4.243425256979035e-05, + "loss": 3.4399, + "step": 42769 + }, + { + "epoch": 0.25436530592825196, + "grad_norm": 1.7647852897644043, + "learning_rate": 4.243391779199608e-05, + "loss": 4.2769, + "step": 42770 + }, + { + "epoch": 0.254371253211533, + "grad_norm": 2.254493236541748, + "learning_rate": 4.243358300811581e-05, + "loss": 4.7744, + "step": 42771 + }, + { + "epoch": 0.254377200494814, + "grad_norm": 3.4410202503204346, + "learning_rate": 4.2433248218149646e-05, + "loss": 3.6043, + "step": 42772 + }, + { + "epoch": 0.25438314777809495, + "grad_norm": 2.8561770915985107, + "learning_rate": 4.243291342209773e-05, + "loss": 3.4328, + "step": 42773 + }, + { + "epoch": 0.25438909506137597, + "grad_norm": 2.5998551845550537, + "learning_rate": 4.243257861996015e-05, + "loss": 3.5425, + "step": 42774 + }, + { + "epoch": 0.254395042344657, + "grad_norm": 2.3973851203918457, + "learning_rate": 4.2432243811737045e-05, + "loss": 3.2125, + "step": 42775 + }, + { + "epoch": 0.25440098962793795, + "grad_norm": 2.258265256881714, + "learning_rate": 4.243190899742852e-05, + "loss": 3.2857, + "step": 42776 + }, + { + "epoch": 0.25440693691121896, + "grad_norm": 2.3596723079681396, + "learning_rate": 4.2431574177034695e-05, + "loss": 3.2964, + "step": 42777 + }, + { + "epoch": 0.2544128841945, + "grad_norm": 2.4012296199798584, + "learning_rate": 4.243123935055569e-05, + "loss": 3.2297, + "step": 42778 + }, + { + "epoch": 0.25441883147778094, + "grad_norm": 1.7272783517837524, + "learning_rate": 4.243090451799162e-05, + "loss": 3.9215, + "step": 42779 + }, + { + "epoch": 0.25442477876106195, + "grad_norm": 1.5712306499481201, + "learning_rate": 4.2430569679342604e-05, + "loss": 4.7925, + "step": 42780 + }, + { + "epoch": 0.25443072604434297, + "grad_norm": 1.6643586158752441, + "learning_rate": 4.243023483460875e-05, + "loss": 5.2133, + "step": 42781 + }, + { + "epoch": 0.25443667332762393, + "grad_norm": 1.6288481950759888, + "learning_rate": 4.242989998379018e-05, + "loss": 5.0755, + "step": 42782 + }, + { + "epoch": 0.25444262061090495, + "grad_norm": 1.974412202835083, + "learning_rate": 4.2429565126887015e-05, + "loss": 4.8579, + "step": 42783 + }, + { + "epoch": 0.25444856789418596, + "grad_norm": 1.5399186611175537, + "learning_rate": 4.242923026389937e-05, + "loss": 5.3397, + "step": 42784 + }, + { + "epoch": 0.2544545151774669, + "grad_norm": 1.5505890846252441, + "learning_rate": 4.242889539482736e-05, + "loss": 5.0366, + "step": 42785 + }, + { + "epoch": 0.25446046246074794, + "grad_norm": 1.58431077003479, + "learning_rate": 4.2428560519671104e-05, + "loss": 4.9397, + "step": 42786 + }, + { + "epoch": 0.25446640974402895, + "grad_norm": 1.5791058540344238, + "learning_rate": 4.242822563843071e-05, + "loss": 4.5032, + "step": 42787 + }, + { + "epoch": 0.2544723570273099, + "grad_norm": 1.4889925718307495, + "learning_rate": 4.242789075110632e-05, + "loss": 4.603, + "step": 42788 + }, + { + "epoch": 0.25447830431059093, + "grad_norm": 1.7387279272079468, + "learning_rate": 4.242755585769802e-05, + "loss": 5.0065, + "step": 42789 + }, + { + "epoch": 0.25448425159387195, + "grad_norm": 1.6512243747711182, + "learning_rate": 4.242722095820594e-05, + "loss": 4.7733, + "step": 42790 + }, + { + "epoch": 0.2544901988771529, + "grad_norm": 1.7325931787490845, + "learning_rate": 4.2426886052630196e-05, + "loss": 4.5567, + "step": 42791 + }, + { + "epoch": 0.2544961461604339, + "grad_norm": 3.029825448989868, + "learning_rate": 4.242655114097091e-05, + "loss": 3.1722, + "step": 42792 + }, + { + "epoch": 0.25450209344371494, + "grad_norm": 2.675048828125, + "learning_rate": 4.24262162232282e-05, + "loss": 2.8238, + "step": 42793 + }, + { + "epoch": 0.2545080407269959, + "grad_norm": 2.4639089107513428, + "learning_rate": 4.2425881299402163e-05, + "loss": 2.9579, + "step": 42794 + }, + { + "epoch": 0.2545139880102769, + "grad_norm": 2.3679380416870117, + "learning_rate": 4.2425546369492944e-05, + "loss": 3.359, + "step": 42795 + }, + { + "epoch": 0.2545199352935579, + "grad_norm": 2.76708722114563, + "learning_rate": 4.2425211433500646e-05, + "loss": 3.5057, + "step": 42796 + }, + { + "epoch": 0.2545258825768389, + "grad_norm": 2.3757643699645996, + "learning_rate": 4.242487649142538e-05, + "loss": 3.2107, + "step": 42797 + }, + { + "epoch": 0.2545318298601199, + "grad_norm": 2.750659227371216, + "learning_rate": 4.242454154326727e-05, + "loss": 2.9027, + "step": 42798 + }, + { + "epoch": 0.25453777714340087, + "grad_norm": 2.3816795349121094, + "learning_rate": 4.2424206589026436e-05, + "loss": 3.0768, + "step": 42799 + }, + { + "epoch": 0.2545437244266819, + "grad_norm": 2.239981174468994, + "learning_rate": 4.2423871628703e-05, + "loss": 2.9757, + "step": 42800 + }, + { + "epoch": 0.2545496717099629, + "grad_norm": 2.4425148963928223, + "learning_rate": 4.242353666229706e-05, + "loss": 2.795, + "step": 42801 + }, + { + "epoch": 0.25455561899324386, + "grad_norm": 2.3390276432037354, + "learning_rate": 4.2423201689808745e-05, + "loss": 2.9859, + "step": 42802 + }, + { + "epoch": 0.2545615662765249, + "grad_norm": 2.3578922748565674, + "learning_rate": 4.2422866711238174e-05, + "loss": 3.2095, + "step": 42803 + }, + { + "epoch": 0.2545675135598059, + "grad_norm": 2.4439854621887207, + "learning_rate": 4.242253172658546e-05, + "loss": 2.9952, + "step": 42804 + }, + { + "epoch": 0.25457346084308685, + "grad_norm": 2.3026540279388428, + "learning_rate": 4.242219673585071e-05, + "loss": 2.9317, + "step": 42805 + }, + { + "epoch": 0.25457940812636787, + "grad_norm": 2.460914134979248, + "learning_rate": 4.2421861739034066e-05, + "loss": 3.2437, + "step": 42806 + }, + { + "epoch": 0.2545853554096489, + "grad_norm": 2.576829671859741, + "learning_rate": 4.242152673613562e-05, + "loss": 3.0656, + "step": 42807 + }, + { + "epoch": 0.25459130269292984, + "grad_norm": 2.504059076309204, + "learning_rate": 4.242119172715549e-05, + "loss": 3.2045, + "step": 42808 + }, + { + "epoch": 0.25459724997621086, + "grad_norm": 2.9600090980529785, + "learning_rate": 4.242085671209382e-05, + "loss": 3.2902, + "step": 42809 + }, + { + "epoch": 0.2546031972594919, + "grad_norm": 2.346127510070801, + "learning_rate": 4.24205216909507e-05, + "loss": 3.122, + "step": 42810 + }, + { + "epoch": 0.25460914454277284, + "grad_norm": 2.311306953430176, + "learning_rate": 4.2420186663726255e-05, + "loss": 2.803, + "step": 42811 + }, + { + "epoch": 0.25461509182605385, + "grad_norm": 2.6364426612854004, + "learning_rate": 4.241985163042062e-05, + "loss": 2.8998, + "step": 42812 + }, + { + "epoch": 0.25462103910933487, + "grad_norm": 2.3891239166259766, + "learning_rate": 4.241951659103387e-05, + "loss": 3.1831, + "step": 42813 + }, + { + "epoch": 0.25462698639261583, + "grad_norm": 2.3192813396453857, + "learning_rate": 4.241918154556616e-05, + "loss": 2.8678, + "step": 42814 + }, + { + "epoch": 0.25463293367589684, + "grad_norm": 2.1162426471710205, + "learning_rate": 4.2418846494017594e-05, + "loss": 2.9311, + "step": 42815 + }, + { + "epoch": 0.25463888095917786, + "grad_norm": 2.7260122299194336, + "learning_rate": 4.241851143638829e-05, + "loss": 2.9015, + "step": 42816 + }, + { + "epoch": 0.2546448282424588, + "grad_norm": 2.538696527481079, + "learning_rate": 4.241817637267835e-05, + "loss": 3.5787, + "step": 42817 + }, + { + "epoch": 0.25465077552573984, + "grad_norm": 2.389418840408325, + "learning_rate": 4.2417841302887914e-05, + "loss": 3.3325, + "step": 42818 + }, + { + "epoch": 0.25465672280902085, + "grad_norm": 2.284156084060669, + "learning_rate": 4.241750622701709e-05, + "loss": 2.8368, + "step": 42819 + }, + { + "epoch": 0.2546626700923018, + "grad_norm": 2.426504135131836, + "learning_rate": 4.241717114506599e-05, + "loss": 3.2719, + "step": 42820 + }, + { + "epoch": 0.25466861737558283, + "grad_norm": 2.3729910850524902, + "learning_rate": 4.241683605703475e-05, + "loss": 3.252, + "step": 42821 + }, + { + "epoch": 0.25467456465886384, + "grad_norm": 1.9809880256652832, + "learning_rate": 4.2416500962923456e-05, + "loss": 3.016, + "step": 42822 + }, + { + "epoch": 0.2546805119421448, + "grad_norm": 2.9466865062713623, + "learning_rate": 4.241616586273225e-05, + "loss": 3.857, + "step": 42823 + }, + { + "epoch": 0.2546864592254258, + "grad_norm": 2.121504306793213, + "learning_rate": 4.2415830756461236e-05, + "loss": 4.939, + "step": 42824 + }, + { + "epoch": 0.25469240650870684, + "grad_norm": 1.7552341222763062, + "learning_rate": 4.241549564411054e-05, + "loss": 5.1757, + "step": 42825 + }, + { + "epoch": 0.2546983537919878, + "grad_norm": 1.5379441976547241, + "learning_rate": 4.241516052568027e-05, + "loss": 5.0392, + "step": 42826 + }, + { + "epoch": 0.2547043010752688, + "grad_norm": 1.675937294960022, + "learning_rate": 4.2414825401170546e-05, + "loss": 4.7444, + "step": 42827 + }, + { + "epoch": 0.25471024835854983, + "grad_norm": 1.6102116107940674, + "learning_rate": 4.2414490270581495e-05, + "loss": 4.6721, + "step": 42828 + }, + { + "epoch": 0.2547161956418308, + "grad_norm": 1.6356509923934937, + "learning_rate": 4.2414155133913214e-05, + "loss": 4.781, + "step": 42829 + }, + { + "epoch": 0.2547221429251118, + "grad_norm": 1.439266562461853, + "learning_rate": 4.2413819991165845e-05, + "loss": 4.7864, + "step": 42830 + }, + { + "epoch": 0.2547280902083928, + "grad_norm": 1.3949726819992065, + "learning_rate": 4.2413484842339476e-05, + "loss": 5.0144, + "step": 42831 + }, + { + "epoch": 0.2547340374916738, + "grad_norm": 1.3451765775680542, + "learning_rate": 4.241314968743425e-05, + "loss": 5.0162, + "step": 42832 + }, + { + "epoch": 0.2547399847749548, + "grad_norm": 1.5919439792633057, + "learning_rate": 4.2412814526450275e-05, + "loss": 4.8159, + "step": 42833 + }, + { + "epoch": 0.2547459320582358, + "grad_norm": 1.42246675491333, + "learning_rate": 4.241247935938766e-05, + "loss": 5.0204, + "step": 42834 + }, + { + "epoch": 0.2547518793415168, + "grad_norm": 1.7013001441955566, + "learning_rate": 4.2412144186246526e-05, + "loss": 4.443, + "step": 42835 + }, + { + "epoch": 0.2547578266247978, + "grad_norm": 1.5632095336914062, + "learning_rate": 4.2411809007026996e-05, + "loss": 5.1668, + "step": 42836 + }, + { + "epoch": 0.2547637739080788, + "grad_norm": 1.3716801404953003, + "learning_rate": 4.2411473821729185e-05, + "loss": 4.8851, + "step": 42837 + }, + { + "epoch": 0.25476972119135977, + "grad_norm": 1.715590238571167, + "learning_rate": 4.24111386303532e-05, + "loss": 5.1323, + "step": 42838 + }, + { + "epoch": 0.2547756684746408, + "grad_norm": 1.8547813892364502, + "learning_rate": 4.2410803432899185e-05, + "loss": 4.0504, + "step": 42839 + }, + { + "epoch": 0.2547816157579218, + "grad_norm": 2.0663695335388184, + "learning_rate": 4.2410468229367214e-05, + "loss": 3.7099, + "step": 42840 + }, + { + "epoch": 0.25478756304120276, + "grad_norm": 2.688699960708618, + "learning_rate": 4.241013301975745e-05, + "loss": 3.7881, + "step": 42841 + }, + { + "epoch": 0.2547935103244838, + "grad_norm": 1.4516850709915161, + "learning_rate": 4.240979780406998e-05, + "loss": 4.8262, + "step": 42842 + }, + { + "epoch": 0.2547994576077648, + "grad_norm": 2.254209280014038, + "learning_rate": 4.2409462582304926e-05, + "loss": 3.7218, + "step": 42843 + }, + { + "epoch": 0.25480540489104575, + "grad_norm": 2.555819034576416, + "learning_rate": 4.2409127354462405e-05, + "loss": 3.8632, + "step": 42844 + }, + { + "epoch": 0.25481135217432677, + "grad_norm": 2.581888437271118, + "learning_rate": 4.240879212054255e-05, + "loss": 3.8653, + "step": 42845 + }, + { + "epoch": 0.2548172994576078, + "grad_norm": 2.4291422367095947, + "learning_rate": 4.240845688054546e-05, + "loss": 3.7951, + "step": 42846 + }, + { + "epoch": 0.25482324674088874, + "grad_norm": 2.1990966796875, + "learning_rate": 4.240812163447125e-05, + "loss": 3.7175, + "step": 42847 + }, + { + "epoch": 0.25482919402416976, + "grad_norm": 2.394774913787842, + "learning_rate": 4.2407786382320056e-05, + "loss": 3.7646, + "step": 42848 + }, + { + "epoch": 0.2548351413074508, + "grad_norm": 2.372478723526001, + "learning_rate": 4.240745112409198e-05, + "loss": 3.6992, + "step": 42849 + }, + { + "epoch": 0.25484108859073173, + "grad_norm": 2.565598249435425, + "learning_rate": 4.2407115859787146e-05, + "loss": 3.7881, + "step": 42850 + }, + { + "epoch": 0.25484703587401275, + "grad_norm": 1.8435368537902832, + "learning_rate": 4.240678058940567e-05, + "loss": 4.8524, + "step": 42851 + }, + { + "epoch": 0.25485298315729377, + "grad_norm": 1.6270239353179932, + "learning_rate": 4.240644531294765e-05, + "loss": 5.3825, + "step": 42852 + }, + { + "epoch": 0.2548589304405747, + "grad_norm": 1.534239649772644, + "learning_rate": 4.2406110030413234e-05, + "loss": 5.2838, + "step": 42853 + }, + { + "epoch": 0.25486487772385574, + "grad_norm": 1.5531002283096313, + "learning_rate": 4.2405774741802516e-05, + "loss": 4.532, + "step": 42854 + }, + { + "epoch": 0.25487082500713676, + "grad_norm": 1.6149439811706543, + "learning_rate": 4.240543944711563e-05, + "loss": 4.7218, + "step": 42855 + }, + { + "epoch": 0.2548767722904177, + "grad_norm": 1.7873821258544922, + "learning_rate": 4.240510414635268e-05, + "loss": 4.6547, + "step": 42856 + }, + { + "epoch": 0.25488271957369873, + "grad_norm": 2.8095626831054688, + "learning_rate": 4.240476883951379e-05, + "loss": 4.7335, + "step": 42857 + }, + { + "epoch": 0.25488866685697975, + "grad_norm": 1.747715950012207, + "learning_rate": 4.2404433526599076e-05, + "loss": 4.6874, + "step": 42858 + }, + { + "epoch": 0.2548946141402607, + "grad_norm": 2.5131938457489014, + "learning_rate": 4.2404098207608654e-05, + "loss": 3.6086, + "step": 42859 + }, + { + "epoch": 0.2549005614235417, + "grad_norm": 1.9979506731033325, + "learning_rate": 4.240376288254264e-05, + "loss": 4.2312, + "step": 42860 + }, + { + "epoch": 0.25490650870682274, + "grad_norm": 1.40987229347229, + "learning_rate": 4.240342755140115e-05, + "loss": 5.3377, + "step": 42861 + }, + { + "epoch": 0.2549124559901037, + "grad_norm": 1.7415529489517212, + "learning_rate": 4.24030922141843e-05, + "loss": 4.8075, + "step": 42862 + }, + { + "epoch": 0.2549184032733847, + "grad_norm": 1.360360026359558, + "learning_rate": 4.2402756870892225e-05, + "loss": 4.9963, + "step": 42863 + }, + { + "epoch": 0.25492435055666574, + "grad_norm": 1.6450083255767822, + "learning_rate": 4.240242152152501e-05, + "loss": 4.8999, + "step": 42864 + }, + { + "epoch": 0.2549302978399467, + "grad_norm": 1.4342448711395264, + "learning_rate": 4.24020861660828e-05, + "loss": 4.8488, + "step": 42865 + }, + { + "epoch": 0.2549362451232277, + "grad_norm": 1.4736297130584717, + "learning_rate": 4.2401750804565705e-05, + "loss": 4.844, + "step": 42866 + }, + { + "epoch": 0.2549421924065087, + "grad_norm": 1.6522016525268555, + "learning_rate": 4.2401415436973826e-05, + "loss": 5.1027, + "step": 42867 + }, + { + "epoch": 0.2549481396897897, + "grad_norm": 1.620509147644043, + "learning_rate": 4.2401080063307305e-05, + "loss": 5.0294, + "step": 42868 + }, + { + "epoch": 0.2549540869730707, + "grad_norm": 1.40915846824646, + "learning_rate": 4.240074468356624e-05, + "loss": 4.8482, + "step": 42869 + }, + { + "epoch": 0.2549600342563517, + "grad_norm": 1.5177733898162842, + "learning_rate": 4.2400409297750756e-05, + "loss": 4.6464, + "step": 42870 + }, + { + "epoch": 0.2549659815396327, + "grad_norm": 1.6618454456329346, + "learning_rate": 4.2400073905860965e-05, + "loss": 4.4485, + "step": 42871 + }, + { + "epoch": 0.2549719288229137, + "grad_norm": 1.6977269649505615, + "learning_rate": 4.2399738507897e-05, + "loss": 5.2843, + "step": 42872 + }, + { + "epoch": 0.2549778761061947, + "grad_norm": 1.5427062511444092, + "learning_rate": 4.2399403103858946e-05, + "loss": 5.0165, + "step": 42873 + }, + { + "epoch": 0.2549838233894757, + "grad_norm": 1.577762484550476, + "learning_rate": 4.2399067693746955e-05, + "loss": 4.8376, + "step": 42874 + }, + { + "epoch": 0.2549897706727567, + "grad_norm": 1.4877510070800781, + "learning_rate": 4.239873227756113e-05, + "loss": 4.9332, + "step": 42875 + }, + { + "epoch": 0.2549957179560377, + "grad_norm": 1.5041389465332031, + "learning_rate": 4.239839685530158e-05, + "loss": 4.8976, + "step": 42876 + }, + { + "epoch": 0.25500166523931866, + "grad_norm": 1.379104733467102, + "learning_rate": 4.2398061426968434e-05, + "loss": 4.7886, + "step": 42877 + }, + { + "epoch": 0.2550076125225997, + "grad_norm": 1.5060315132141113, + "learning_rate": 4.2397725992561804e-05, + "loss": 4.8738, + "step": 42878 + }, + { + "epoch": 0.2550135598058807, + "grad_norm": 1.2979097366333008, + "learning_rate": 4.2397390552081805e-05, + "loss": 4.8131, + "step": 42879 + }, + { + "epoch": 0.25501950708916166, + "grad_norm": 1.549382209777832, + "learning_rate": 4.239705510552856e-05, + "loss": 4.8359, + "step": 42880 + }, + { + "epoch": 0.2550254543724427, + "grad_norm": 2.0422356128692627, + "learning_rate": 4.239671965290218e-05, + "loss": 5.3039, + "step": 42881 + }, + { + "epoch": 0.2550314016557237, + "grad_norm": 1.4753718376159668, + "learning_rate": 4.239638419420279e-05, + "loss": 5.4668, + "step": 42882 + }, + { + "epoch": 0.25503734893900465, + "grad_norm": 1.55765700340271, + "learning_rate": 4.2396048729430496e-05, + "loss": 5.1111, + "step": 42883 + }, + { + "epoch": 0.25504329622228566, + "grad_norm": 2.207946538925171, + "learning_rate": 4.2395713258585424e-05, + "loss": 4.1843, + "step": 42884 + }, + { + "epoch": 0.2550492435055667, + "grad_norm": 2.2537567615509033, + "learning_rate": 4.239537778166769e-05, + "loss": 4.6661, + "step": 42885 + }, + { + "epoch": 0.25505519078884764, + "grad_norm": 1.4334690570831299, + "learning_rate": 4.23950422986774e-05, + "loss": 5.3634, + "step": 42886 + }, + { + "epoch": 0.25506113807212866, + "grad_norm": 1.6390823125839233, + "learning_rate": 4.239470680961469e-05, + "loss": 5.0077, + "step": 42887 + }, + { + "epoch": 0.2550670853554097, + "grad_norm": 1.7837761640548706, + "learning_rate": 4.2394371314479666e-05, + "loss": 4.7343, + "step": 42888 + }, + { + "epoch": 0.25507303263869063, + "grad_norm": 1.6694730520248413, + "learning_rate": 4.239403581327245e-05, + "loss": 4.5067, + "step": 42889 + }, + { + "epoch": 0.25507897992197165, + "grad_norm": 1.611169457435608, + "learning_rate": 4.2393700305993146e-05, + "loss": 5.1524, + "step": 42890 + }, + { + "epoch": 0.25508492720525267, + "grad_norm": 1.4928163290023804, + "learning_rate": 4.2393364792641887e-05, + "loss": 5.4591, + "step": 42891 + }, + { + "epoch": 0.2550908744885336, + "grad_norm": 1.4692052602767944, + "learning_rate": 4.239302927321878e-05, + "loss": 5.4909, + "step": 42892 + }, + { + "epoch": 0.25509682177181464, + "grad_norm": 1.8263428211212158, + "learning_rate": 4.239269374772396e-05, + "loss": 4.5741, + "step": 42893 + }, + { + "epoch": 0.25510276905509566, + "grad_norm": 1.403737187385559, + "learning_rate": 4.239235821615751e-05, + "loss": 4.8038, + "step": 42894 + }, + { + "epoch": 0.2551087163383766, + "grad_norm": 2.17471981048584, + "learning_rate": 4.2392022678519575e-05, + "loss": 4.2877, + "step": 42895 + }, + { + "epoch": 0.25511466362165763, + "grad_norm": 1.6368629932403564, + "learning_rate": 4.2391687134810275e-05, + "loss": 4.7603, + "step": 42896 + }, + { + "epoch": 0.25512061090493865, + "grad_norm": 1.9061046838760376, + "learning_rate": 4.239135158502971e-05, + "loss": 4.956, + "step": 42897 + }, + { + "epoch": 0.2551265581882196, + "grad_norm": 2.1279733180999756, + "learning_rate": 4.2391016029178e-05, + "loss": 3.9189, + "step": 42898 + }, + { + "epoch": 0.2551325054715006, + "grad_norm": 2.299910068511963, + "learning_rate": 4.239068046725527e-05, + "loss": 3.9025, + "step": 42899 + }, + { + "epoch": 0.25513845275478164, + "grad_norm": 2.734386444091797, + "learning_rate": 4.239034489926162e-05, + "loss": 3.6964, + "step": 42900 + }, + { + "epoch": 0.2551444000380626, + "grad_norm": 2.1284496784210205, + "learning_rate": 4.23900093251972e-05, + "loss": 4.2739, + "step": 42901 + }, + { + "epoch": 0.2551503473213436, + "grad_norm": 1.9406425952911377, + "learning_rate": 4.23896737450621e-05, + "loss": 4.9075, + "step": 42902 + }, + { + "epoch": 0.25515629460462463, + "grad_norm": 1.5792365074157715, + "learning_rate": 4.2389338158856436e-05, + "loss": 5.1761, + "step": 42903 + }, + { + "epoch": 0.2551622418879056, + "grad_norm": 1.7873964309692383, + "learning_rate": 4.238900256658035e-05, + "loss": 4.6316, + "step": 42904 + }, + { + "epoch": 0.2551681891711866, + "grad_norm": 1.6098829507827759, + "learning_rate": 4.238866696823394e-05, + "loss": 5.2159, + "step": 42905 + }, + { + "epoch": 0.2551741364544676, + "grad_norm": 1.257751226425171, + "learning_rate": 4.2388331363817314e-05, + "loss": 5.1277, + "step": 42906 + }, + { + "epoch": 0.2551800837377486, + "grad_norm": 1.3469421863555908, + "learning_rate": 4.23879957533306e-05, + "loss": 5.0715, + "step": 42907 + }, + { + "epoch": 0.2551860310210296, + "grad_norm": 1.4890174865722656, + "learning_rate": 4.238766013677393e-05, + "loss": 4.9803, + "step": 42908 + }, + { + "epoch": 0.2551919783043106, + "grad_norm": 1.6360969543457031, + "learning_rate": 4.23873245141474e-05, + "loss": 4.2758, + "step": 42909 + }, + { + "epoch": 0.2551979255875916, + "grad_norm": 1.5495041608810425, + "learning_rate": 4.2386988885451136e-05, + "loss": 4.2313, + "step": 42910 + }, + { + "epoch": 0.2552038728708726, + "grad_norm": 1.5025399923324585, + "learning_rate": 4.238665325068525e-05, + "loss": 4.2612, + "step": 42911 + }, + { + "epoch": 0.25520982015415355, + "grad_norm": 1.7930890321731567, + "learning_rate": 4.238631760984987e-05, + "loss": 3.9816, + "step": 42912 + }, + { + "epoch": 0.25521576743743457, + "grad_norm": 2.158691644668579, + "learning_rate": 4.23859819629451e-05, + "loss": 4.3204, + "step": 42913 + }, + { + "epoch": 0.2552217147207156, + "grad_norm": 1.496211051940918, + "learning_rate": 4.238564630997107e-05, + "loss": 4.9884, + "step": 42914 + }, + { + "epoch": 0.25522766200399655, + "grad_norm": 1.51675283908844, + "learning_rate": 4.238531065092789e-05, + "loss": 5.1829, + "step": 42915 + }, + { + "epoch": 0.25523360928727756, + "grad_norm": 1.6707005500793457, + "learning_rate": 4.2384974985815674e-05, + "loss": 4.5603, + "step": 42916 + }, + { + "epoch": 0.2552395565705586, + "grad_norm": 1.7133342027664185, + "learning_rate": 4.238463931463454e-05, + "loss": 4.8164, + "step": 42917 + }, + { + "epoch": 0.25524550385383954, + "grad_norm": 2.003431797027588, + "learning_rate": 4.238430363738461e-05, + "loss": 4.8138, + "step": 42918 + }, + { + "epoch": 0.25525145113712056, + "grad_norm": 1.4825133085250854, + "learning_rate": 4.2383967954066005e-05, + "loss": 4.6925, + "step": 42919 + }, + { + "epoch": 0.25525739842040157, + "grad_norm": 1.4926689863204956, + "learning_rate": 4.238363226467883e-05, + "loss": 5.0215, + "step": 42920 + }, + { + "epoch": 0.25526334570368253, + "grad_norm": 1.6124340295791626, + "learning_rate": 4.238329656922321e-05, + "loss": 4.6719, + "step": 42921 + }, + { + "epoch": 0.25526929298696355, + "grad_norm": 1.6046967506408691, + "learning_rate": 4.2382960867699265e-05, + "loss": 4.789, + "step": 42922 + }, + { + "epoch": 0.25527524027024456, + "grad_norm": 1.3441354036331177, + "learning_rate": 4.23826251601071e-05, + "loss": 4.6958, + "step": 42923 + }, + { + "epoch": 0.2552811875535255, + "grad_norm": 1.377679467201233, + "learning_rate": 4.238228944644684e-05, + "loss": 4.7425, + "step": 42924 + }, + { + "epoch": 0.25528713483680654, + "grad_norm": 1.2678924798965454, + "learning_rate": 4.238195372671861e-05, + "loss": 4.6579, + "step": 42925 + }, + { + "epoch": 0.25529308212008756, + "grad_norm": 1.3505538702011108, + "learning_rate": 4.238161800092252e-05, + "loss": 4.6046, + "step": 42926 + }, + { + "epoch": 0.2552990294033685, + "grad_norm": 2.390437126159668, + "learning_rate": 4.238128226905868e-05, + "loss": 3.7909, + "step": 42927 + }, + { + "epoch": 0.25530497668664953, + "grad_norm": 2.8989269733428955, + "learning_rate": 4.238094653112722e-05, + "loss": 3.2905, + "step": 42928 + }, + { + "epoch": 0.25531092396993055, + "grad_norm": 2.3931877613067627, + "learning_rate": 4.238061078712824e-05, + "loss": 3.3508, + "step": 42929 + }, + { + "epoch": 0.2553168712532115, + "grad_norm": 2.2671902179718018, + "learning_rate": 4.2380275037061876e-05, + "loss": 3.2134, + "step": 42930 + }, + { + "epoch": 0.2553228185364925, + "grad_norm": 2.1988470554351807, + "learning_rate": 4.2379939280928246e-05, + "loss": 3.0276, + "step": 42931 + }, + { + "epoch": 0.25532876581977354, + "grad_norm": 2.0057432651519775, + "learning_rate": 4.237960351872745e-05, + "loss": 3.8259, + "step": 42932 + }, + { + "epoch": 0.2553347131030545, + "grad_norm": 2.174008369445801, + "learning_rate": 4.23792677504596e-05, + "loss": 4.2264, + "step": 42933 + }, + { + "epoch": 0.2553406603863355, + "grad_norm": 1.9741261005401611, + "learning_rate": 4.2378931976124854e-05, + "loss": 4.2854, + "step": 42934 + }, + { + "epoch": 0.25534660766961653, + "grad_norm": 2.0072884559631348, + "learning_rate": 4.237859619572328e-05, + "loss": 4.3977, + "step": 42935 + }, + { + "epoch": 0.2553525549528975, + "grad_norm": 1.895383358001709, + "learning_rate": 4.237826040925503e-05, + "loss": 4.2944, + "step": 42936 + }, + { + "epoch": 0.2553585022361785, + "grad_norm": 1.622261643409729, + "learning_rate": 4.237792461672021e-05, + "loss": 4.6483, + "step": 42937 + }, + { + "epoch": 0.2553644495194595, + "grad_norm": 1.6638612747192383, + "learning_rate": 4.2377588818118935e-05, + "loss": 5.2137, + "step": 42938 + }, + { + "epoch": 0.2553703968027405, + "grad_norm": 1.6736805438995361, + "learning_rate": 4.237725301345132e-05, + "loss": 4.9517, + "step": 42939 + }, + { + "epoch": 0.2553763440860215, + "grad_norm": 1.4034185409545898, + "learning_rate": 4.237691720271749e-05, + "loss": 4.1907, + "step": 42940 + }, + { + "epoch": 0.2553822913693025, + "grad_norm": 1.4335837364196777, + "learning_rate": 4.2376581385917547e-05, + "loss": 4.6375, + "step": 42941 + }, + { + "epoch": 0.2553882386525835, + "grad_norm": 1.8043371438980103, + "learning_rate": 4.237624556305163e-05, + "loss": 4.7377, + "step": 42942 + }, + { + "epoch": 0.2553941859358645, + "grad_norm": 1.6251717805862427, + "learning_rate": 4.237590973411984e-05, + "loss": 4.4102, + "step": 42943 + }, + { + "epoch": 0.2554001332191455, + "grad_norm": 1.650655746459961, + "learning_rate": 4.23755738991223e-05, + "loss": 4.2391, + "step": 42944 + }, + { + "epoch": 0.25540608050242647, + "grad_norm": 2.1757442951202393, + "learning_rate": 4.237523805805913e-05, + "loss": 4.5531, + "step": 42945 + }, + { + "epoch": 0.2554120277857075, + "grad_norm": 2.209822177886963, + "learning_rate": 4.237490221093044e-05, + "loss": 4.3982, + "step": 42946 + }, + { + "epoch": 0.2554179750689885, + "grad_norm": 1.9945921897888184, + "learning_rate": 4.2374566357736355e-05, + "loss": 4.2501, + "step": 42947 + }, + { + "epoch": 0.25542392235226946, + "grad_norm": 1.9562445878982544, + "learning_rate": 4.2374230498476986e-05, + "loss": 4.2565, + "step": 42948 + }, + { + "epoch": 0.2554298696355505, + "grad_norm": 1.63339364528656, + "learning_rate": 4.237389463315245e-05, + "loss": 4.2725, + "step": 42949 + }, + { + "epoch": 0.2554358169188315, + "grad_norm": 1.5598825216293335, + "learning_rate": 4.237355876176287e-05, + "loss": 4.6487, + "step": 42950 + }, + { + "epoch": 0.25544176420211245, + "grad_norm": 1.7433812618255615, + "learning_rate": 4.2373222884308356e-05, + "loss": 4.4382, + "step": 42951 + }, + { + "epoch": 0.25544771148539347, + "grad_norm": 1.8628782033920288, + "learning_rate": 4.2372887000789035e-05, + "loss": 4.1494, + "step": 42952 + }, + { + "epoch": 0.2554536587686745, + "grad_norm": 1.9606530666351318, + "learning_rate": 4.237255111120502e-05, + "loss": 4.1578, + "step": 42953 + }, + { + "epoch": 0.25545960605195545, + "grad_norm": 1.7285792827606201, + "learning_rate": 4.237221521555642e-05, + "loss": 4.0696, + "step": 42954 + }, + { + "epoch": 0.25546555333523646, + "grad_norm": 1.7164204120635986, + "learning_rate": 4.237187931384337e-05, + "loss": 4.3176, + "step": 42955 + }, + { + "epoch": 0.2554715006185175, + "grad_norm": 1.5506356954574585, + "learning_rate": 4.237154340606596e-05, + "loss": 4.5388, + "step": 42956 + }, + { + "epoch": 0.25547744790179844, + "grad_norm": 1.5675361156463623, + "learning_rate": 4.237120749222433e-05, + "loss": 4.4863, + "step": 42957 + }, + { + "epoch": 0.25548339518507945, + "grad_norm": 1.471785545349121, + "learning_rate": 4.237087157231859e-05, + "loss": 5.1864, + "step": 42958 + }, + { + "epoch": 0.25548934246836047, + "grad_norm": 1.5832899808883667, + "learning_rate": 4.2370535646348866e-05, + "loss": 5.8484, + "step": 42959 + }, + { + "epoch": 0.25549528975164143, + "grad_norm": 1.7636146545410156, + "learning_rate": 4.2370199714315265e-05, + "loss": 4.9841, + "step": 42960 + }, + { + "epoch": 0.25550123703492245, + "grad_norm": 1.790609359741211, + "learning_rate": 4.2369863776217904e-05, + "loss": 4.3233, + "step": 42961 + }, + { + "epoch": 0.25550718431820346, + "grad_norm": 1.6654504537582397, + "learning_rate": 4.2369527832056905e-05, + "loss": 4.601, + "step": 42962 + }, + { + "epoch": 0.2555131316014844, + "grad_norm": 1.2872564792633057, + "learning_rate": 4.2369191881832384e-05, + "loss": 4.4561, + "step": 42963 + }, + { + "epoch": 0.25551907888476544, + "grad_norm": 1.5974862575531006, + "learning_rate": 4.236885592554446e-05, + "loss": 4.4616, + "step": 42964 + }, + { + "epoch": 0.25552502616804645, + "grad_norm": 1.4415959119796753, + "learning_rate": 4.2368519963193234e-05, + "loss": 4.733, + "step": 42965 + }, + { + "epoch": 0.2555309734513274, + "grad_norm": 1.7395740747451782, + "learning_rate": 4.236818399477885e-05, + "loss": 4.3526, + "step": 42966 + }, + { + "epoch": 0.25553692073460843, + "grad_norm": 1.516814112663269, + "learning_rate": 4.236784802030141e-05, + "loss": 5.0013, + "step": 42967 + }, + { + "epoch": 0.25554286801788945, + "grad_norm": 1.9528799057006836, + "learning_rate": 4.236751203976103e-05, + "loss": 4.5031, + "step": 42968 + }, + { + "epoch": 0.2555488153011704, + "grad_norm": 1.5492702722549438, + "learning_rate": 4.2367176053157834e-05, + "loss": 4.6474, + "step": 42969 + }, + { + "epoch": 0.2555547625844514, + "grad_norm": 1.4582209587097168, + "learning_rate": 4.236684006049194e-05, + "loss": 4.7654, + "step": 42970 + }, + { + "epoch": 0.25556070986773244, + "grad_norm": 1.577636480331421, + "learning_rate": 4.236650406176346e-05, + "loss": 4.988, + "step": 42971 + }, + { + "epoch": 0.2555666571510134, + "grad_norm": 1.4271653890609741, + "learning_rate": 4.23661680569725e-05, + "loss": 4.6651, + "step": 42972 + }, + { + "epoch": 0.2555726044342944, + "grad_norm": 2.391796112060547, + "learning_rate": 4.236583204611921e-05, + "loss": 4.2467, + "step": 42973 + }, + { + "epoch": 0.25557855171757543, + "grad_norm": 1.742438554763794, + "learning_rate": 4.236549602920367e-05, + "loss": 4.2432, + "step": 42974 + }, + { + "epoch": 0.2555844990008564, + "grad_norm": 1.5662610530853271, + "learning_rate": 4.236516000622602e-05, + "loss": 4.7442, + "step": 42975 + }, + { + "epoch": 0.2555904462841374, + "grad_norm": 1.697512149810791, + "learning_rate": 4.2364823977186384e-05, + "loss": 4.2553, + "step": 42976 + }, + { + "epoch": 0.2555963935674184, + "grad_norm": 2.1971137523651123, + "learning_rate": 4.236448794208485e-05, + "loss": 3.9478, + "step": 42977 + }, + { + "epoch": 0.2556023408506994, + "grad_norm": 1.6382701396942139, + "learning_rate": 4.2364151900921566e-05, + "loss": 4.3603, + "step": 42978 + }, + { + "epoch": 0.2556082881339804, + "grad_norm": 1.6407127380371094, + "learning_rate": 4.2363815853696634e-05, + "loss": 4.3639, + "step": 42979 + }, + { + "epoch": 0.2556142354172614, + "grad_norm": 1.481398344039917, + "learning_rate": 4.2363479800410164e-05, + "loss": 4.6001, + "step": 42980 + }, + { + "epoch": 0.2556201827005424, + "grad_norm": 1.483715534210205, + "learning_rate": 4.2363143741062294e-05, + "loss": 4.5856, + "step": 42981 + }, + { + "epoch": 0.2556261299838234, + "grad_norm": 1.3939441442489624, + "learning_rate": 4.236280767565313e-05, + "loss": 4.456, + "step": 42982 + }, + { + "epoch": 0.2556320772671044, + "grad_norm": 1.6363621950149536, + "learning_rate": 4.2362471604182774e-05, + "loss": 4.5924, + "step": 42983 + }, + { + "epoch": 0.25563802455038537, + "grad_norm": 1.5880547761917114, + "learning_rate": 4.236213552665137e-05, + "loss": 4.9882, + "step": 42984 + }, + { + "epoch": 0.2556439718336664, + "grad_norm": 1.5102934837341309, + "learning_rate": 4.2361799443059023e-05, + "loss": 4.8473, + "step": 42985 + }, + { + "epoch": 0.2556499191169474, + "grad_norm": 1.6375333070755005, + "learning_rate": 4.236146335340585e-05, + "loss": 4.8496, + "step": 42986 + }, + { + "epoch": 0.25565586640022836, + "grad_norm": 1.367787480354309, + "learning_rate": 4.236112725769197e-05, + "loss": 4.7681, + "step": 42987 + }, + { + "epoch": 0.2556618136835094, + "grad_norm": 1.4229615926742554, + "learning_rate": 4.23607911559175e-05, + "loss": 4.9309, + "step": 42988 + }, + { + "epoch": 0.2556677609667904, + "grad_norm": 1.3737068176269531, + "learning_rate": 4.2360455048082556e-05, + "loss": 4.7508, + "step": 42989 + }, + { + "epoch": 0.25567370825007135, + "grad_norm": 1.3718249797821045, + "learning_rate": 4.2360118934187254e-05, + "loss": 4.6246, + "step": 42990 + }, + { + "epoch": 0.25567965553335237, + "grad_norm": 1.6098353862762451, + "learning_rate": 4.2359782814231716e-05, + "loss": 4.8015, + "step": 42991 + }, + { + "epoch": 0.2556856028166334, + "grad_norm": 1.5109996795654297, + "learning_rate": 4.2359446688216064e-05, + "loss": 4.6226, + "step": 42992 + }, + { + "epoch": 0.25569155009991434, + "grad_norm": 1.3449816703796387, + "learning_rate": 4.23591105561404e-05, + "loss": 3.845, + "step": 42993 + }, + { + "epoch": 0.25569749738319536, + "grad_norm": 1.6198228597640991, + "learning_rate": 4.235877441800485e-05, + "loss": 4.0501, + "step": 42994 + }, + { + "epoch": 0.2557034446664764, + "grad_norm": 1.3672032356262207, + "learning_rate": 4.235843827380954e-05, + "loss": 4.3991, + "step": 42995 + }, + { + "epoch": 0.25570939194975734, + "grad_norm": 1.4542659521102905, + "learning_rate": 4.2358102123554564e-05, + "loss": 4.2151, + "step": 42996 + }, + { + "epoch": 0.25571533923303835, + "grad_norm": 1.43723726272583, + "learning_rate": 4.235776596724007e-05, + "loss": 3.9346, + "step": 42997 + }, + { + "epoch": 0.25572128651631937, + "grad_norm": 1.626293420791626, + "learning_rate": 4.235742980486615e-05, + "loss": 3.9756, + "step": 42998 + }, + { + "epoch": 0.25572723379960033, + "grad_norm": 1.4869940280914307, + "learning_rate": 4.235709363643293e-05, + "loss": 4.7645, + "step": 42999 + }, + { + "epoch": 0.25573318108288134, + "grad_norm": 1.3130428791046143, + "learning_rate": 4.2356757461940534e-05, + "loss": 4.973, + "step": 43000 + }, + { + "epoch": 0.25573912836616236, + "grad_norm": 1.4443066120147705, + "learning_rate": 4.235642128138907e-05, + "loss": 5.0007, + "step": 43001 + }, + { + "epoch": 0.2557450756494433, + "grad_norm": 1.3307623863220215, + "learning_rate": 4.2356085094778655e-05, + "loss": 5.0853, + "step": 43002 + }, + { + "epoch": 0.25575102293272434, + "grad_norm": 1.4218367338180542, + "learning_rate": 4.235574890210942e-05, + "loss": 5.0412, + "step": 43003 + }, + { + "epoch": 0.25575697021600535, + "grad_norm": 1.560186505317688, + "learning_rate": 4.2355412703381456e-05, + "loss": 4.8382, + "step": 43004 + }, + { + "epoch": 0.2557629174992863, + "grad_norm": 1.506420373916626, + "learning_rate": 4.235507649859491e-05, + "loss": 4.8596, + "step": 43005 + }, + { + "epoch": 0.25576886478256733, + "grad_norm": 1.6403411626815796, + "learning_rate": 4.2354740287749885e-05, + "loss": 5.0205, + "step": 43006 + }, + { + "epoch": 0.25577481206584834, + "grad_norm": 1.3661879301071167, + "learning_rate": 4.235440407084649e-05, + "loss": 4.8756, + "step": 43007 + }, + { + "epoch": 0.2557807593491293, + "grad_norm": 1.5292987823486328, + "learning_rate": 4.235406784788486e-05, + "loss": 4.7258, + "step": 43008 + }, + { + "epoch": 0.2557867066324103, + "grad_norm": 1.6307228803634644, + "learning_rate": 4.2353731618865106e-05, + "loss": 4.1473, + "step": 43009 + }, + { + "epoch": 0.25579265391569134, + "grad_norm": 1.974385380744934, + "learning_rate": 4.235339538378734e-05, + "loss": 4.1572, + "step": 43010 + }, + { + "epoch": 0.2557986011989723, + "grad_norm": 1.7650569677352905, + "learning_rate": 4.2353059142651686e-05, + "loss": 4.4362, + "step": 43011 + }, + { + "epoch": 0.2558045484822533, + "grad_norm": 1.9455785751342773, + "learning_rate": 4.235272289545825e-05, + "loss": 3.9795, + "step": 43012 + }, + { + "epoch": 0.25581049576553433, + "grad_norm": 1.799568772315979, + "learning_rate": 4.2352386642207167e-05, + "loss": 3.8603, + "step": 43013 + }, + { + "epoch": 0.2558164430488153, + "grad_norm": 1.8050156831741333, + "learning_rate": 4.235205038289854e-05, + "loss": 3.9744, + "step": 43014 + }, + { + "epoch": 0.2558223903320963, + "grad_norm": 1.7686059474945068, + "learning_rate": 4.235171411753249e-05, + "loss": 3.9633, + "step": 43015 + }, + { + "epoch": 0.2558283376153773, + "grad_norm": 1.7363677024841309, + "learning_rate": 4.235137784610914e-05, + "loss": 3.8763, + "step": 43016 + }, + { + "epoch": 0.2558342848986583, + "grad_norm": 1.686257004737854, + "learning_rate": 4.23510415686286e-05, + "loss": 3.9619, + "step": 43017 + }, + { + "epoch": 0.2558402321819393, + "grad_norm": 1.2227449417114258, + "learning_rate": 4.2350705285090996e-05, + "loss": 4.8171, + "step": 43018 + }, + { + "epoch": 0.2558461794652203, + "grad_norm": 1.3346046209335327, + "learning_rate": 4.2350368995496436e-05, + "loss": 5.8711, + "step": 43019 + }, + { + "epoch": 0.2558521267485013, + "grad_norm": 1.5756354331970215, + "learning_rate": 4.235003269984504e-05, + "loss": 4.9404, + "step": 43020 + }, + { + "epoch": 0.2558580740317823, + "grad_norm": 1.4645862579345703, + "learning_rate": 4.234969639813693e-05, + "loss": 5.1545, + "step": 43021 + }, + { + "epoch": 0.2558640213150633, + "grad_norm": 1.4182645082473755, + "learning_rate": 4.234936009037222e-05, + "loss": 5.0089, + "step": 43022 + }, + { + "epoch": 0.25586996859834427, + "grad_norm": 1.57442045211792, + "learning_rate": 4.234902377655103e-05, + "loss": 4.8821, + "step": 43023 + }, + { + "epoch": 0.2558759158816253, + "grad_norm": 2.0422308444976807, + "learning_rate": 4.234868745667347e-05, + "loss": 3.9853, + "step": 43024 + }, + { + "epoch": 0.2558818631649063, + "grad_norm": 1.972324013710022, + "learning_rate": 4.234835113073966e-05, + "loss": 4.0511, + "step": 43025 + }, + { + "epoch": 0.25588781044818726, + "grad_norm": 1.707190990447998, + "learning_rate": 4.2348014798749725e-05, + "loss": 3.9732, + "step": 43026 + }, + { + "epoch": 0.2558937577314683, + "grad_norm": 1.6809606552124023, + "learning_rate": 4.2347678460703776e-05, + "loss": 4.0588, + "step": 43027 + }, + { + "epoch": 0.2558997050147493, + "grad_norm": 2.1431753635406494, + "learning_rate": 4.234734211660194e-05, + "loss": 4.1794, + "step": 43028 + }, + { + "epoch": 0.25590565229803025, + "grad_norm": 1.3636233806610107, + "learning_rate": 4.2347005766444315e-05, + "loss": 4.0609, + "step": 43029 + }, + { + "epoch": 0.25591159958131127, + "grad_norm": 1.5193006992340088, + "learning_rate": 4.2346669410231034e-05, + "loss": 4.3395, + "step": 43030 + }, + { + "epoch": 0.2559175468645922, + "grad_norm": 1.5848474502563477, + "learning_rate": 4.2346333047962206e-05, + "loss": 4.277, + "step": 43031 + }, + { + "epoch": 0.25592349414787324, + "grad_norm": 1.6371947526931763, + "learning_rate": 4.234599667963796e-05, + "loss": 4.8179, + "step": 43032 + }, + { + "epoch": 0.25592944143115426, + "grad_norm": 1.652442216873169, + "learning_rate": 4.2345660305258396e-05, + "loss": 5.43, + "step": 43033 + }, + { + "epoch": 0.2559353887144352, + "grad_norm": 2.298644542694092, + "learning_rate": 4.2345323924823646e-05, + "loss": 4.4751, + "step": 43034 + }, + { + "epoch": 0.25594133599771623, + "grad_norm": 2.433499574661255, + "learning_rate": 4.234498753833382e-05, + "loss": 4.2119, + "step": 43035 + }, + { + "epoch": 0.25594728328099725, + "grad_norm": 2.096745729446411, + "learning_rate": 4.234465114578904e-05, + "loss": 4.3741, + "step": 43036 + }, + { + "epoch": 0.2559532305642782, + "grad_norm": 2.1208436489105225, + "learning_rate": 4.2344314747189425e-05, + "loss": 4.3675, + "step": 43037 + }, + { + "epoch": 0.2559591778475592, + "grad_norm": 1.960035800933838, + "learning_rate": 4.234397834253508e-05, + "loss": 4.7118, + "step": 43038 + }, + { + "epoch": 0.25596512513084024, + "grad_norm": 1.9775071144104004, + "learning_rate": 4.234364193182615e-05, + "loss": 4.6115, + "step": 43039 + }, + { + "epoch": 0.2559710724141212, + "grad_norm": 1.9097189903259277, + "learning_rate": 4.2343305515062716e-05, + "loss": 4.2635, + "step": 43040 + }, + { + "epoch": 0.2559770196974022, + "grad_norm": 1.738128662109375, + "learning_rate": 4.234296909224492e-05, + "loss": 4.4808, + "step": 43041 + }, + { + "epoch": 0.25598296698068324, + "grad_norm": 1.745830774307251, + "learning_rate": 4.2342632663372864e-05, + "loss": 4.5024, + "step": 43042 + }, + { + "epoch": 0.2559889142639642, + "grad_norm": 1.7696079015731812, + "learning_rate": 4.234229622844669e-05, + "loss": 4.4244, + "step": 43043 + }, + { + "epoch": 0.2559948615472452, + "grad_norm": 1.5932589769363403, + "learning_rate": 4.234195978746649e-05, + "loss": 4.4273, + "step": 43044 + }, + { + "epoch": 0.2560008088305262, + "grad_norm": 1.4973605871200562, + "learning_rate": 4.234162334043239e-05, + "loss": 4.3911, + "step": 43045 + }, + { + "epoch": 0.2560067561138072, + "grad_norm": 1.4542737007141113, + "learning_rate": 4.234128688734451e-05, + "loss": 4.4597, + "step": 43046 + }, + { + "epoch": 0.2560127033970882, + "grad_norm": 1.7038471698760986, + "learning_rate": 4.234095042820296e-05, + "loss": 3.8551, + "step": 43047 + }, + { + "epoch": 0.2560186506803692, + "grad_norm": 1.649430274963379, + "learning_rate": 4.234061396300787e-05, + "loss": 4.0982, + "step": 43048 + }, + { + "epoch": 0.2560245979636502, + "grad_norm": 1.7206834554672241, + "learning_rate": 4.234027749175935e-05, + "loss": 4.0143, + "step": 43049 + }, + { + "epoch": 0.2560305452469312, + "grad_norm": 1.6867774724960327, + "learning_rate": 4.233994101445752e-05, + "loss": 4.1288, + "step": 43050 + }, + { + "epoch": 0.2560364925302122, + "grad_norm": 1.6311057806015015, + "learning_rate": 4.2339604531102493e-05, + "loss": 4.2241, + "step": 43051 + }, + { + "epoch": 0.2560424398134932, + "grad_norm": 1.3798069953918457, + "learning_rate": 4.233926804169439e-05, + "loss": 4.4376, + "step": 43052 + }, + { + "epoch": 0.2560483870967742, + "grad_norm": 1.6904020309448242, + "learning_rate": 4.233893154623333e-05, + "loss": 3.5427, + "step": 43053 + }, + { + "epoch": 0.2560543343800552, + "grad_norm": 1.6020249128341675, + "learning_rate": 4.233859504471943e-05, + "loss": 4.9388, + "step": 43054 + }, + { + "epoch": 0.25606028166333616, + "grad_norm": 1.6302107572555542, + "learning_rate": 4.23382585371528e-05, + "loss": 4.9386, + "step": 43055 + }, + { + "epoch": 0.2560662289466172, + "grad_norm": 1.773834466934204, + "learning_rate": 4.2337922023533564e-05, + "loss": 5.2741, + "step": 43056 + }, + { + "epoch": 0.2560721762298982, + "grad_norm": 1.6756560802459717, + "learning_rate": 4.233758550386184e-05, + "loss": 5.6138, + "step": 43057 + }, + { + "epoch": 0.25607812351317916, + "grad_norm": 1.8410719633102417, + "learning_rate": 4.233724897813775e-05, + "loss": 4.2566, + "step": 43058 + }, + { + "epoch": 0.2560840707964602, + "grad_norm": 1.8338381052017212, + "learning_rate": 4.233691244636139e-05, + "loss": 4.7592, + "step": 43059 + }, + { + "epoch": 0.2560900180797412, + "grad_norm": 1.7630482912063599, + "learning_rate": 4.233657590853291e-05, + "loss": 4.9416, + "step": 43060 + }, + { + "epoch": 0.25609596536302215, + "grad_norm": 1.3655109405517578, + "learning_rate": 4.2336239364652396e-05, + "loss": 4.8122, + "step": 43061 + }, + { + "epoch": 0.25610191264630316, + "grad_norm": 1.57066011428833, + "learning_rate": 4.233590281471998e-05, + "loss": 4.3536, + "step": 43062 + }, + { + "epoch": 0.2561078599295842, + "grad_norm": 1.531220555305481, + "learning_rate": 4.233556625873579e-05, + "loss": 4.2934, + "step": 43063 + }, + { + "epoch": 0.25611380721286514, + "grad_norm": 1.89093017578125, + "learning_rate": 4.233522969669993e-05, + "loss": 4.3546, + "step": 43064 + }, + { + "epoch": 0.25611975449614616, + "grad_norm": 1.5941638946533203, + "learning_rate": 4.233489312861252e-05, + "loss": 4.4679, + "step": 43065 + }, + { + "epoch": 0.2561257017794272, + "grad_norm": 1.6822439432144165, + "learning_rate": 4.233455655447368e-05, + "loss": 4.5079, + "step": 43066 + }, + { + "epoch": 0.25613164906270813, + "grad_norm": 1.6822086572647095, + "learning_rate": 4.233421997428352e-05, + "loss": 4.329, + "step": 43067 + }, + { + "epoch": 0.25613759634598915, + "grad_norm": 1.4634648561477661, + "learning_rate": 4.233388338804217e-05, + "loss": 4.4318, + "step": 43068 + }, + { + "epoch": 0.25614354362927017, + "grad_norm": 1.7368948459625244, + "learning_rate": 4.2333546795749734e-05, + "loss": 4.1261, + "step": 43069 + }, + { + "epoch": 0.2561494909125511, + "grad_norm": 1.6513512134552002, + "learning_rate": 4.233321019740634e-05, + "loss": 3.7218, + "step": 43070 + }, + { + "epoch": 0.25615543819583214, + "grad_norm": 1.4913064241409302, + "learning_rate": 4.23328735930121e-05, + "loss": 4.6571, + "step": 43071 + }, + { + "epoch": 0.25616138547911316, + "grad_norm": 1.557802438735962, + "learning_rate": 4.233253698256713e-05, + "loss": 4.3658, + "step": 43072 + }, + { + "epoch": 0.2561673327623941, + "grad_norm": 1.5025599002838135, + "learning_rate": 4.233220036607155e-05, + "loss": 4.4935, + "step": 43073 + }, + { + "epoch": 0.25617328004567513, + "grad_norm": 1.6365175247192383, + "learning_rate": 4.233186374352548e-05, + "loss": 4.7049, + "step": 43074 + }, + { + "epoch": 0.25617922732895615, + "grad_norm": 1.5349626541137695, + "learning_rate": 4.233152711492903e-05, + "loss": 4.2829, + "step": 43075 + }, + { + "epoch": 0.2561851746122371, + "grad_norm": 1.5812784433364868, + "learning_rate": 4.2331190480282336e-05, + "loss": 4.2931, + "step": 43076 + }, + { + "epoch": 0.2561911218955181, + "grad_norm": 1.693792462348938, + "learning_rate": 4.233085383958549e-05, + "loss": 4.3754, + "step": 43077 + }, + { + "epoch": 0.25619706917879914, + "grad_norm": 1.279063105583191, + "learning_rate": 4.233051719283863e-05, + "loss": 4.9319, + "step": 43078 + }, + { + "epoch": 0.2562030164620801, + "grad_norm": 1.8365424871444702, + "learning_rate": 4.233018054004186e-05, + "loss": 3.8051, + "step": 43079 + }, + { + "epoch": 0.2562089637453611, + "grad_norm": 1.8287618160247803, + "learning_rate": 4.232984388119531e-05, + "loss": 4.0867, + "step": 43080 + }, + { + "epoch": 0.25621491102864213, + "grad_norm": 1.7833629846572876, + "learning_rate": 4.232950721629908e-05, + "loss": 4.2847, + "step": 43081 + }, + { + "epoch": 0.2562208583119231, + "grad_norm": 1.6823785305023193, + "learning_rate": 4.2329170545353306e-05, + "loss": 4.2403, + "step": 43082 + }, + { + "epoch": 0.2562268055952041, + "grad_norm": 1.5821982622146606, + "learning_rate": 4.23288338683581e-05, + "loss": 4.406, + "step": 43083 + }, + { + "epoch": 0.2562327528784851, + "grad_norm": 1.6936614513397217, + "learning_rate": 4.232849718531357e-05, + "loss": 4.4036, + "step": 43084 + }, + { + "epoch": 0.2562387001617661, + "grad_norm": 1.3656049966812134, + "learning_rate": 4.2328160496219835e-05, + "loss": 5.0881, + "step": 43085 + }, + { + "epoch": 0.2562446474450471, + "grad_norm": 1.5634727478027344, + "learning_rate": 4.232782380107704e-05, + "loss": 4.505, + "step": 43086 + }, + { + "epoch": 0.2562505947283281, + "grad_norm": 1.5538195371627808, + "learning_rate": 4.232748709988526e-05, + "loss": 4.2491, + "step": 43087 + }, + { + "epoch": 0.2562565420116091, + "grad_norm": 1.4724664688110352, + "learning_rate": 4.2327150392644644e-05, + "loss": 4.931, + "step": 43088 + }, + { + "epoch": 0.2562624892948901, + "grad_norm": 1.2705706357955933, + "learning_rate": 4.232681367935529e-05, + "loss": 5.3793, + "step": 43089 + }, + { + "epoch": 0.2562684365781711, + "grad_norm": 1.529640555381775, + "learning_rate": 4.2326476960017334e-05, + "loss": 4.2528, + "step": 43090 + }, + { + "epoch": 0.25627438386145207, + "grad_norm": 1.5609214305877686, + "learning_rate": 4.232614023463088e-05, + "loss": 4.943, + "step": 43091 + }, + { + "epoch": 0.2562803311447331, + "grad_norm": 1.6164865493774414, + "learning_rate": 4.232580350319605e-05, + "loss": 5.1223, + "step": 43092 + }, + { + "epoch": 0.2562862784280141, + "grad_norm": 1.6765780448913574, + "learning_rate": 4.232546676571296e-05, + "loss": 3.3336, + "step": 43093 + }, + { + "epoch": 0.25629222571129506, + "grad_norm": 1.8657526969909668, + "learning_rate": 4.232513002218173e-05, + "loss": 3.7427, + "step": 43094 + }, + { + "epoch": 0.2562981729945761, + "grad_norm": 1.639125943183899, + "learning_rate": 4.2324793272602476e-05, + "loss": 4.1894, + "step": 43095 + }, + { + "epoch": 0.2563041202778571, + "grad_norm": 1.7900320291519165, + "learning_rate": 4.232445651697531e-05, + "loss": 4.2716, + "step": 43096 + }, + { + "epoch": 0.25631006756113806, + "grad_norm": 1.6650129556655884, + "learning_rate": 4.2324119755300365e-05, + "loss": 4.3341, + "step": 43097 + }, + { + "epoch": 0.25631601484441907, + "grad_norm": 2.0144588947296143, + "learning_rate": 4.232378298757774e-05, + "loss": 3.8112, + "step": 43098 + }, + { + "epoch": 0.2563219621277001, + "grad_norm": 1.5504724979400635, + "learning_rate": 4.232344621380756e-05, + "loss": 4.3494, + "step": 43099 + }, + { + "epoch": 0.25632790941098105, + "grad_norm": 1.8729650974273682, + "learning_rate": 4.232310943398995e-05, + "loss": 4.4983, + "step": 43100 + }, + { + "epoch": 0.25633385669426206, + "grad_norm": 1.261535882949829, + "learning_rate": 4.2322772648125024e-05, + "loss": 5.3851, + "step": 43101 + }, + { + "epoch": 0.2563398039775431, + "grad_norm": 1.4299671649932861, + "learning_rate": 4.2322435856212896e-05, + "loss": 5.0575, + "step": 43102 + }, + { + "epoch": 0.25634575126082404, + "grad_norm": 1.6319053173065186, + "learning_rate": 4.232209905825368e-05, + "loss": 5.2558, + "step": 43103 + }, + { + "epoch": 0.25635169854410506, + "grad_norm": 1.4645347595214844, + "learning_rate": 4.23217622542475e-05, + "loss": 5.0387, + "step": 43104 + }, + { + "epoch": 0.25635764582738607, + "grad_norm": 1.443571925163269, + "learning_rate": 4.232142544419447e-05, + "loss": 4.6575, + "step": 43105 + }, + { + "epoch": 0.25636359311066703, + "grad_norm": 1.662807822227478, + "learning_rate": 4.232108862809471e-05, + "loss": 4.6509, + "step": 43106 + }, + { + "epoch": 0.25636954039394805, + "grad_norm": 1.6740074157714844, + "learning_rate": 4.232075180594834e-05, + "loss": 3.967, + "step": 43107 + }, + { + "epoch": 0.25637548767722906, + "grad_norm": 1.6887696981430054, + "learning_rate": 4.2320414977755475e-05, + "loss": 4.9417, + "step": 43108 + }, + { + "epoch": 0.25638143496051, + "grad_norm": 1.2795442342758179, + "learning_rate": 4.232007814351623e-05, + "loss": 5.0844, + "step": 43109 + }, + { + "epoch": 0.25638738224379104, + "grad_norm": 1.4696543216705322, + "learning_rate": 4.231974130323072e-05, + "loss": 5.0683, + "step": 43110 + }, + { + "epoch": 0.25639332952707206, + "grad_norm": 1.5746068954467773, + "learning_rate": 4.231940445689907e-05, + "loss": 4.9257, + "step": 43111 + }, + { + "epoch": 0.256399276810353, + "grad_norm": 1.380847454071045, + "learning_rate": 4.23190676045214e-05, + "loss": 5.0572, + "step": 43112 + }, + { + "epoch": 0.25640522409363403, + "grad_norm": 1.4553128480911255, + "learning_rate": 4.231873074609782e-05, + "loss": 4.4129, + "step": 43113 + }, + { + "epoch": 0.25641117137691505, + "grad_norm": 1.4296677112579346, + "learning_rate": 4.231839388162845e-05, + "loss": 4.3708, + "step": 43114 + }, + { + "epoch": 0.256417118660196, + "grad_norm": 1.2771967649459839, + "learning_rate": 4.23180570111134e-05, + "loss": 4.5955, + "step": 43115 + }, + { + "epoch": 0.256423065943477, + "grad_norm": 1.4211199283599854, + "learning_rate": 4.231772013455281e-05, + "loss": 4.6043, + "step": 43116 + }, + { + "epoch": 0.25642901322675804, + "grad_norm": 1.8074533939361572, + "learning_rate": 4.2317383251946775e-05, + "loss": 3.3828, + "step": 43117 + }, + { + "epoch": 0.256434960510039, + "grad_norm": 1.5118231773376465, + "learning_rate": 4.2317046363295407e-05, + "loss": 3.9694, + "step": 43118 + }, + { + "epoch": 0.25644090779332, + "grad_norm": 1.3408869504928589, + "learning_rate": 4.231670946859886e-05, + "loss": 4.2928, + "step": 43119 + }, + { + "epoch": 0.25644685507660103, + "grad_norm": 1.2333532571792603, + "learning_rate": 4.2316372567857214e-05, + "loss": 4.5457, + "step": 43120 + }, + { + "epoch": 0.256452802359882, + "grad_norm": 1.2678139209747314, + "learning_rate": 4.2316035661070606e-05, + "loss": 4.7376, + "step": 43121 + }, + { + "epoch": 0.256458749643163, + "grad_norm": 1.341230034828186, + "learning_rate": 4.231569874823915e-05, + "loss": 4.4601, + "step": 43122 + }, + { + "epoch": 0.256464696926444, + "grad_norm": 2.2130229473114014, + "learning_rate": 4.231536182936296e-05, + "loss": 3.1232, + "step": 43123 + }, + { + "epoch": 0.256470644209725, + "grad_norm": 1.635401725769043, + "learning_rate": 4.231502490444216e-05, + "loss": 4.5444, + "step": 43124 + }, + { + "epoch": 0.256476591493006, + "grad_norm": 1.5828396081924438, + "learning_rate": 4.231468797347686e-05, + "loss": 5.0282, + "step": 43125 + }, + { + "epoch": 0.256482538776287, + "grad_norm": 1.5375844240188599, + "learning_rate": 4.231435103646718e-05, + "loss": 4.9248, + "step": 43126 + }, + { + "epoch": 0.256488486059568, + "grad_norm": 1.9251611232757568, + "learning_rate": 4.2314014093413246e-05, + "loss": 3.9548, + "step": 43127 + }, + { + "epoch": 0.256494433342849, + "grad_norm": 1.5522249937057495, + "learning_rate": 4.231367714431516e-05, + "loss": 4.7037, + "step": 43128 + }, + { + "epoch": 0.25650038062613, + "grad_norm": 1.2368069887161255, + "learning_rate": 4.2313340189173055e-05, + "loss": 5.0249, + "step": 43129 + }, + { + "epoch": 0.25650632790941097, + "grad_norm": 1.5798066854476929, + "learning_rate": 4.231300322798704e-05, + "loss": 5.1824, + "step": 43130 + }, + { + "epoch": 0.256512275192692, + "grad_norm": 1.7944278717041016, + "learning_rate": 4.231266626075723e-05, + "loss": 3.9667, + "step": 43131 + }, + { + "epoch": 0.256518222475973, + "grad_norm": 1.7134262323379517, + "learning_rate": 4.2312329287483745e-05, + "loss": 3.3349, + "step": 43132 + }, + { + "epoch": 0.25652416975925396, + "grad_norm": 1.4317337274551392, + "learning_rate": 4.2311992308166716e-05, + "loss": 4.5869, + "step": 43133 + }, + { + "epoch": 0.256530117042535, + "grad_norm": 1.326095700263977, + "learning_rate": 4.231165532280624e-05, + "loss": 4.1624, + "step": 43134 + }, + { + "epoch": 0.256536064325816, + "grad_norm": 1.912487506866455, + "learning_rate": 4.231131833140245e-05, + "loss": 4.0551, + "step": 43135 + }, + { + "epoch": 0.25654201160909695, + "grad_norm": 1.4460991621017456, + "learning_rate": 4.2310981333955455e-05, + "loss": 4.4333, + "step": 43136 + }, + { + "epoch": 0.25654795889237797, + "grad_norm": 1.608791708946228, + "learning_rate": 4.231064433046538e-05, + "loss": 4.0884, + "step": 43137 + }, + { + "epoch": 0.256553906175659, + "grad_norm": 1.5375332832336426, + "learning_rate": 4.2310307320932327e-05, + "loss": 4.1706, + "step": 43138 + }, + { + "epoch": 0.25655985345893995, + "grad_norm": 1.5566539764404297, + "learning_rate": 4.2309970305356425e-05, + "loss": 4.5494, + "step": 43139 + }, + { + "epoch": 0.25656580074222096, + "grad_norm": 1.827623963356018, + "learning_rate": 4.2309633283737805e-05, + "loss": 4.2033, + "step": 43140 + }, + { + "epoch": 0.256571748025502, + "grad_norm": 1.6951141357421875, + "learning_rate": 4.230929625607656e-05, + "loss": 4.0754, + "step": 43141 + }, + { + "epoch": 0.25657769530878294, + "grad_norm": 1.7523956298828125, + "learning_rate": 4.230895922237282e-05, + "loss": 3.9005, + "step": 43142 + }, + { + "epoch": 0.25658364259206395, + "grad_norm": 1.572729468345642, + "learning_rate": 4.23086221826267e-05, + "loss": 4.2947, + "step": 43143 + }, + { + "epoch": 0.25658958987534497, + "grad_norm": 1.8115313053131104, + "learning_rate": 4.2308285136838323e-05, + "loss": 3.7993, + "step": 43144 + }, + { + "epoch": 0.25659553715862593, + "grad_norm": 1.3805307149887085, + "learning_rate": 4.23079480850078e-05, + "loss": 4.6046, + "step": 43145 + }, + { + "epoch": 0.25660148444190695, + "grad_norm": 1.3980140686035156, + "learning_rate": 4.2307611027135254e-05, + "loss": 5.0077, + "step": 43146 + }, + { + "epoch": 0.2566074317251879, + "grad_norm": 1.4530518054962158, + "learning_rate": 4.2307273963220796e-05, + "loss": 5.0739, + "step": 43147 + }, + { + "epoch": 0.2566133790084689, + "grad_norm": 1.430055022239685, + "learning_rate": 4.2306936893264546e-05, + "loss": 4.9562, + "step": 43148 + }, + { + "epoch": 0.25661932629174994, + "grad_norm": 1.2699596881866455, + "learning_rate": 4.230659981726663e-05, + "loss": 5.0366, + "step": 43149 + }, + { + "epoch": 0.2566252735750309, + "grad_norm": 1.2998731136322021, + "learning_rate": 4.2306262735227154e-05, + "loss": 4.9707, + "step": 43150 + }, + { + "epoch": 0.2566312208583119, + "grad_norm": 1.1286110877990723, + "learning_rate": 4.230592564714624e-05, + "loss": 4.8796, + "step": 43151 + }, + { + "epoch": 0.25663716814159293, + "grad_norm": 1.8929100036621094, + "learning_rate": 4.2305588553024014e-05, + "loss": 4.2822, + "step": 43152 + }, + { + "epoch": 0.2566431154248739, + "grad_norm": 1.373321294784546, + "learning_rate": 4.230525145286057e-05, + "loss": 4.8825, + "step": 43153 + }, + { + "epoch": 0.2566490627081549, + "grad_norm": 1.9584448337554932, + "learning_rate": 4.2304914346656054e-05, + "loss": 5.1444, + "step": 43154 + }, + { + "epoch": 0.2566550099914359, + "grad_norm": 1.3918389081954956, + "learning_rate": 4.2304577234410566e-05, + "loss": 4.6077, + "step": 43155 + }, + { + "epoch": 0.2566609572747169, + "grad_norm": 1.5585544109344482, + "learning_rate": 4.2304240116124236e-05, + "loss": 4.5059, + "step": 43156 + }, + { + "epoch": 0.2566669045579979, + "grad_norm": 1.4481827020645142, + "learning_rate": 4.2303902991797165e-05, + "loss": 5.1569, + "step": 43157 + }, + { + "epoch": 0.2566728518412789, + "grad_norm": 1.4634238481521606, + "learning_rate": 4.230356586142948e-05, + "loss": 4.9278, + "step": 43158 + }, + { + "epoch": 0.2566787991245599, + "grad_norm": 1.8230242729187012, + "learning_rate": 4.230322872502131e-05, + "loss": 4.6858, + "step": 43159 + }, + { + "epoch": 0.2566847464078409, + "grad_norm": 1.4537755250930786, + "learning_rate": 4.2302891582572754e-05, + "loss": 4.8858, + "step": 43160 + }, + { + "epoch": 0.2566906936911219, + "grad_norm": 1.3659029006958008, + "learning_rate": 4.230255443408394e-05, + "loss": 5.1746, + "step": 43161 + }, + { + "epoch": 0.25669664097440287, + "grad_norm": 1.9517377614974976, + "learning_rate": 4.230221727955498e-05, + "loss": 4.7308, + "step": 43162 + }, + { + "epoch": 0.2567025882576839, + "grad_norm": 1.2879133224487305, + "learning_rate": 4.2301880118986006e-05, + "loss": 5.007, + "step": 43163 + }, + { + "epoch": 0.2567085355409649, + "grad_norm": 1.3308732509613037, + "learning_rate": 4.23015429523771e-05, + "loss": 4.6422, + "step": 43164 + }, + { + "epoch": 0.25671448282424586, + "grad_norm": 1.4256603717803955, + "learning_rate": 4.230120577972843e-05, + "loss": 4.8982, + "step": 43165 + }, + { + "epoch": 0.2567204301075269, + "grad_norm": 1.4734355211257935, + "learning_rate": 4.230086860104008e-05, + "loss": 4.8368, + "step": 43166 + }, + { + "epoch": 0.2567263773908079, + "grad_norm": 1.369103193283081, + "learning_rate": 4.2300531416312164e-05, + "loss": 5.0286, + "step": 43167 + }, + { + "epoch": 0.25673232467408885, + "grad_norm": 1.2995558977127075, + "learning_rate": 4.230019422554482e-05, + "loss": 4.9908, + "step": 43168 + }, + { + "epoch": 0.25673827195736987, + "grad_norm": 1.5617388486862183, + "learning_rate": 4.2299857028738155e-05, + "loss": 5.0924, + "step": 43169 + }, + { + "epoch": 0.2567442192406509, + "grad_norm": 1.6256508827209473, + "learning_rate": 4.2299519825892295e-05, + "loss": 4.6485, + "step": 43170 + }, + { + "epoch": 0.25675016652393184, + "grad_norm": 1.5643647909164429, + "learning_rate": 4.229918261700734e-05, + "loss": 4.7556, + "step": 43171 + }, + { + "epoch": 0.25675611380721286, + "grad_norm": 1.6305166482925415, + "learning_rate": 4.229884540208343e-05, + "loss": 4.7683, + "step": 43172 + }, + { + "epoch": 0.2567620610904939, + "grad_norm": 1.9377024173736572, + "learning_rate": 4.2298508181120666e-05, + "loss": 5.3033, + "step": 43173 + }, + { + "epoch": 0.25676800837377484, + "grad_norm": 1.5644497871398926, + "learning_rate": 4.229817095411917e-05, + "loss": 4.9073, + "step": 43174 + }, + { + "epoch": 0.25677395565705585, + "grad_norm": 1.505139946937561, + "learning_rate": 4.2297833721079064e-05, + "loss": 5.2425, + "step": 43175 + }, + { + "epoch": 0.25677990294033687, + "grad_norm": 1.4733290672302246, + "learning_rate": 4.229749648200046e-05, + "loss": 4.9922, + "step": 43176 + }, + { + "epoch": 0.25678585022361783, + "grad_norm": 1.6022508144378662, + "learning_rate": 4.229715923688349e-05, + "loss": 4.8273, + "step": 43177 + }, + { + "epoch": 0.25679179750689884, + "grad_norm": 1.482138752937317, + "learning_rate": 4.229682198572825e-05, + "loss": 5.208, + "step": 43178 + }, + { + "epoch": 0.25679774479017986, + "grad_norm": 1.471617341041565, + "learning_rate": 4.229648472853487e-05, + "loss": 4.7155, + "step": 43179 + }, + { + "epoch": 0.2568036920734608, + "grad_norm": 1.4491442441940308, + "learning_rate": 4.229614746530346e-05, + "loss": 4.8412, + "step": 43180 + }, + { + "epoch": 0.25680963935674184, + "grad_norm": 1.3009063005447388, + "learning_rate": 4.229581019603415e-05, + "loss": 5.0575, + "step": 43181 + }, + { + "epoch": 0.25681558664002285, + "grad_norm": 1.385319709777832, + "learning_rate": 4.2295472920727055e-05, + "loss": 4.816, + "step": 43182 + }, + { + "epoch": 0.2568215339233038, + "grad_norm": 1.3707847595214844, + "learning_rate": 4.229513563938228e-05, + "loss": 4.9508, + "step": 43183 + }, + { + "epoch": 0.25682748120658483, + "grad_norm": 1.4195948839187622, + "learning_rate": 4.2294798351999966e-05, + "loss": 5.2487, + "step": 43184 + }, + { + "epoch": 0.25683342848986584, + "grad_norm": 1.6506091356277466, + "learning_rate": 4.22944610585802e-05, + "loss": 4.845, + "step": 43185 + }, + { + "epoch": 0.2568393757731468, + "grad_norm": 2.40161395072937, + "learning_rate": 4.2294123759123126e-05, + "loss": 3.5648, + "step": 43186 + }, + { + "epoch": 0.2568453230564278, + "grad_norm": 1.6847604513168335, + "learning_rate": 4.229378645362885e-05, + "loss": 5.0318, + "step": 43187 + }, + { + "epoch": 0.25685127033970884, + "grad_norm": 1.3881254196166992, + "learning_rate": 4.229344914209749e-05, + "loss": 5.0661, + "step": 43188 + }, + { + "epoch": 0.2568572176229898, + "grad_norm": 1.4622201919555664, + "learning_rate": 4.2293111824529166e-05, + "loss": 5.0312, + "step": 43189 + }, + { + "epoch": 0.2568631649062708, + "grad_norm": 1.3677910566329956, + "learning_rate": 4.2292774500924e-05, + "loss": 5.1842, + "step": 43190 + }, + { + "epoch": 0.25686911218955183, + "grad_norm": 1.5241538286209106, + "learning_rate": 4.22924371712821e-05, + "loss": 4.1594, + "step": 43191 + }, + { + "epoch": 0.2568750594728328, + "grad_norm": 1.5977345705032349, + "learning_rate": 4.229209983560359e-05, + "loss": 4.272, + "step": 43192 + }, + { + "epoch": 0.2568810067561138, + "grad_norm": 1.4661868810653687, + "learning_rate": 4.2291762493888584e-05, + "loss": 4.0345, + "step": 43193 + }, + { + "epoch": 0.2568869540393948, + "grad_norm": 1.3094950914382935, + "learning_rate": 4.229142514613721e-05, + "loss": 4.6915, + "step": 43194 + }, + { + "epoch": 0.2568929013226758, + "grad_norm": 1.896764874458313, + "learning_rate": 4.229108779234957e-05, + "loss": 4.4629, + "step": 43195 + }, + { + "epoch": 0.2568988486059568, + "grad_norm": 1.9571253061294556, + "learning_rate": 4.2290750432525796e-05, + "loss": 4.1855, + "step": 43196 + }, + { + "epoch": 0.2569047958892378, + "grad_norm": 1.9055061340332031, + "learning_rate": 4.2290413066666e-05, + "loss": 4.1953, + "step": 43197 + }, + { + "epoch": 0.2569107431725188, + "grad_norm": 1.8839290142059326, + "learning_rate": 4.229007569477029e-05, + "loss": 4.1121, + "step": 43198 + }, + { + "epoch": 0.2569166904557998, + "grad_norm": 1.9361969232559204, + "learning_rate": 4.2289738316838806e-05, + "loss": 3.9952, + "step": 43199 + }, + { + "epoch": 0.2569226377390808, + "grad_norm": 1.6735652685165405, + "learning_rate": 4.228940093287164e-05, + "loss": 4.3438, + "step": 43200 + }, + { + "epoch": 0.25692858502236177, + "grad_norm": 1.8988773822784424, + "learning_rate": 4.228906354286893e-05, + "loss": 4.5012, + "step": 43201 + }, + { + "epoch": 0.2569345323056428, + "grad_norm": 1.4583948850631714, + "learning_rate": 4.2288726146830795e-05, + "loss": 4.34, + "step": 43202 + }, + { + "epoch": 0.2569404795889238, + "grad_norm": 1.7715948820114136, + "learning_rate": 4.228838874475733e-05, + "loss": 4.782, + "step": 43203 + }, + { + "epoch": 0.25694642687220476, + "grad_norm": 1.7171437740325928, + "learning_rate": 4.228805133664867e-05, + "loss": 4.4417, + "step": 43204 + }, + { + "epoch": 0.2569523741554858, + "grad_norm": 1.4371916055679321, + "learning_rate": 4.228771392250493e-05, + "loss": 5.0357, + "step": 43205 + }, + { + "epoch": 0.2569583214387668, + "grad_norm": 1.5498114824295044, + "learning_rate": 4.2287376502326234e-05, + "loss": 4.6623, + "step": 43206 + }, + { + "epoch": 0.25696426872204775, + "grad_norm": 1.4082818031311035, + "learning_rate": 4.228703907611269e-05, + "loss": 5.0939, + "step": 43207 + }, + { + "epoch": 0.25697021600532877, + "grad_norm": 1.4305419921875, + "learning_rate": 4.2286701643864416e-05, + "loss": 5.0629, + "step": 43208 + }, + { + "epoch": 0.2569761632886098, + "grad_norm": 1.2979556322097778, + "learning_rate": 4.2286364205581534e-05, + "loss": 5.106, + "step": 43209 + }, + { + "epoch": 0.25698211057189074, + "grad_norm": 1.275685429573059, + "learning_rate": 4.2286026761264166e-05, + "loss": 4.7557, + "step": 43210 + }, + { + "epoch": 0.25698805785517176, + "grad_norm": 1.3694913387298584, + "learning_rate": 4.2285689310912426e-05, + "loss": 5.2708, + "step": 43211 + }, + { + "epoch": 0.2569940051384528, + "grad_norm": 1.2320306301116943, + "learning_rate": 4.2285351854526423e-05, + "loss": 5.0665, + "step": 43212 + }, + { + "epoch": 0.25699995242173373, + "grad_norm": 1.5341854095458984, + "learning_rate": 4.228501439210628e-05, + "loss": 5.1946, + "step": 43213 + }, + { + "epoch": 0.25700589970501475, + "grad_norm": 1.8960272073745728, + "learning_rate": 4.2284676923652125e-05, + "loss": 4.5715, + "step": 43214 + }, + { + "epoch": 0.25701184698829577, + "grad_norm": 1.4854930639266968, + "learning_rate": 4.2284339449164067e-05, + "loss": 5.0219, + "step": 43215 + }, + { + "epoch": 0.2570177942715767, + "grad_norm": 1.3975600004196167, + "learning_rate": 4.228400196864222e-05, + "loss": 5.0916, + "step": 43216 + }, + { + "epoch": 0.25702374155485774, + "grad_norm": 1.5578317642211914, + "learning_rate": 4.228366448208671e-05, + "loss": 4.9308, + "step": 43217 + }, + { + "epoch": 0.25702968883813876, + "grad_norm": 2.199573278427124, + "learning_rate": 4.228332698949765e-05, + "loss": 3.9354, + "step": 43218 + }, + { + "epoch": 0.2570356361214197, + "grad_norm": 2.0980987548828125, + "learning_rate": 4.228298949087516e-05, + "loss": 3.6333, + "step": 43219 + }, + { + "epoch": 0.25704158340470074, + "grad_norm": 1.4530901908874512, + "learning_rate": 4.228265198621936e-05, + "loss": 5.0876, + "step": 43220 + }, + { + "epoch": 0.25704753068798175, + "grad_norm": 1.2363462448120117, + "learning_rate": 4.228231447553035e-05, + "loss": 5.107, + "step": 43221 + }, + { + "epoch": 0.2570534779712627, + "grad_norm": 1.4726265668869019, + "learning_rate": 4.2281976958808276e-05, + "loss": 5.1866, + "step": 43222 + }, + { + "epoch": 0.2570594252545437, + "grad_norm": 1.497187852859497, + "learning_rate": 4.2281639436053244e-05, + "loss": 5.0903, + "step": 43223 + }, + { + "epoch": 0.25706537253782474, + "grad_norm": 1.609535574913025, + "learning_rate": 4.228130190726537e-05, + "loss": 4.8737, + "step": 43224 + }, + { + "epoch": 0.2570713198211057, + "grad_norm": 1.3542191982269287, + "learning_rate": 4.228096437244476e-05, + "loss": 5.0888, + "step": 43225 + }, + { + "epoch": 0.2570772671043867, + "grad_norm": 1.6935125589370728, + "learning_rate": 4.228062683159156e-05, + "loss": 4.5861, + "step": 43226 + }, + { + "epoch": 0.25708321438766774, + "grad_norm": 1.4494273662567139, + "learning_rate": 4.228028928470586e-05, + "loss": 4.8438, + "step": 43227 + }, + { + "epoch": 0.2570891616709487, + "grad_norm": 1.59843909740448, + "learning_rate": 4.2279951731787794e-05, + "loss": 5.2755, + "step": 43228 + }, + { + "epoch": 0.2570951089542297, + "grad_norm": 2.059173345565796, + "learning_rate": 4.227961417283748e-05, + "loss": 4.8519, + "step": 43229 + }, + { + "epoch": 0.25710105623751073, + "grad_norm": 1.7140424251556396, + "learning_rate": 4.227927660785502e-05, + "loss": 4.8418, + "step": 43230 + }, + { + "epoch": 0.2571070035207917, + "grad_norm": 1.7206095457077026, + "learning_rate": 4.2278939036840545e-05, + "loss": 5.1713, + "step": 43231 + }, + { + "epoch": 0.2571129508040727, + "grad_norm": 1.9964714050292969, + "learning_rate": 4.2278601459794185e-05, + "loss": 3.7697, + "step": 43232 + }, + { + "epoch": 0.2571188980873537, + "grad_norm": 2.4650697708129883, + "learning_rate": 4.227826387671603e-05, + "loss": 3.116, + "step": 43233 + }, + { + "epoch": 0.2571248453706347, + "grad_norm": 1.4508239030838013, + "learning_rate": 4.227792628760621e-05, + "loss": 4.7236, + "step": 43234 + }, + { + "epoch": 0.2571307926539157, + "grad_norm": 1.6110681295394897, + "learning_rate": 4.2277588692464854e-05, + "loss": 4.193, + "step": 43235 + }, + { + "epoch": 0.2571367399371967, + "grad_norm": 1.5010842084884644, + "learning_rate": 4.227725109129206e-05, + "loss": 4.2579, + "step": 43236 + }, + { + "epoch": 0.2571426872204777, + "grad_norm": 1.5093326568603516, + "learning_rate": 4.2276913484087966e-05, + "loss": 4.7765, + "step": 43237 + }, + { + "epoch": 0.2571486345037587, + "grad_norm": 1.5133007764816284, + "learning_rate": 4.227657587085267e-05, + "loss": 5.0535, + "step": 43238 + }, + { + "epoch": 0.2571545817870397, + "grad_norm": 1.6125152111053467, + "learning_rate": 4.227623825158631e-05, + "loss": 5.0559, + "step": 43239 + }, + { + "epoch": 0.25716052907032066, + "grad_norm": 1.876983642578125, + "learning_rate": 4.227590062628899e-05, + "loss": 4.2287, + "step": 43240 + }, + { + "epoch": 0.2571664763536017, + "grad_norm": 1.7906239032745361, + "learning_rate": 4.2275562994960836e-05, + "loss": 3.8665, + "step": 43241 + }, + { + "epoch": 0.2571724236368827, + "grad_norm": 1.7727525234222412, + "learning_rate": 4.2275225357601955e-05, + "loss": 4.061, + "step": 43242 + }, + { + "epoch": 0.25717837092016366, + "grad_norm": 1.6999188661575317, + "learning_rate": 4.227488771421248e-05, + "loss": 4.0218, + "step": 43243 + }, + { + "epoch": 0.2571843182034447, + "grad_norm": 1.9035230875015259, + "learning_rate": 4.2274550064792505e-05, + "loss": 3.5294, + "step": 43244 + }, + { + "epoch": 0.2571902654867257, + "grad_norm": 1.9648165702819824, + "learning_rate": 4.227421240934217e-05, + "loss": 3.4873, + "step": 43245 + }, + { + "epoch": 0.25719621277000665, + "grad_norm": 2.1814818382263184, + "learning_rate": 4.227387474786159e-05, + "loss": 3.4061, + "step": 43246 + }, + { + "epoch": 0.25720216005328767, + "grad_norm": 1.6438370943069458, + "learning_rate": 4.2273537080350876e-05, + "loss": 4.5244, + "step": 43247 + }, + { + "epoch": 0.2572081073365687, + "grad_norm": 1.3943599462509155, + "learning_rate": 4.227319940681015e-05, + "loss": 5.3853, + "step": 43248 + }, + { + "epoch": 0.25721405461984964, + "grad_norm": 1.7308249473571777, + "learning_rate": 4.2272861727239523e-05, + "loss": 5.3466, + "step": 43249 + }, + { + "epoch": 0.25722000190313066, + "grad_norm": 2.6011159420013428, + "learning_rate": 4.227252404163913e-05, + "loss": 3.7412, + "step": 43250 + }, + { + "epoch": 0.2572259491864117, + "grad_norm": 1.985224723815918, + "learning_rate": 4.2272186350009064e-05, + "loss": 3.6594, + "step": 43251 + }, + { + "epoch": 0.25723189646969263, + "grad_norm": 1.749206304550171, + "learning_rate": 4.227184865234946e-05, + "loss": 4.1535, + "step": 43252 + }, + { + "epoch": 0.25723784375297365, + "grad_norm": 2.1999223232269287, + "learning_rate": 4.227151094866044e-05, + "loss": 3.6557, + "step": 43253 + }, + { + "epoch": 0.25724379103625467, + "grad_norm": 1.619663953781128, + "learning_rate": 4.22711732389421e-05, + "loss": 4.3421, + "step": 43254 + }, + { + "epoch": 0.2572497383195356, + "grad_norm": 1.6855885982513428, + "learning_rate": 4.2270835523194583e-05, + "loss": 4.1513, + "step": 43255 + }, + { + "epoch": 0.25725568560281664, + "grad_norm": 2.0317888259887695, + "learning_rate": 4.227049780141799e-05, + "loss": 4.2924, + "step": 43256 + }, + { + "epoch": 0.25726163288609766, + "grad_norm": 1.6957508325576782, + "learning_rate": 4.227016007361245e-05, + "loss": 4.2607, + "step": 43257 + }, + { + "epoch": 0.2572675801693786, + "grad_norm": 1.645423173904419, + "learning_rate": 4.226982233977808e-05, + "loss": 4.8695, + "step": 43258 + }, + { + "epoch": 0.25727352745265963, + "grad_norm": 2.2041473388671875, + "learning_rate": 4.2269484599914976e-05, + "loss": 2.9163, + "step": 43259 + }, + { + "epoch": 0.25727947473594065, + "grad_norm": 1.2540580034255981, + "learning_rate": 4.226914685402329e-05, + "loss": 5.1203, + "step": 43260 + }, + { + "epoch": 0.2572854220192216, + "grad_norm": 1.5821120738983154, + "learning_rate": 4.2268809102103116e-05, + "loss": 4.5349, + "step": 43261 + }, + { + "epoch": 0.2572913693025026, + "grad_norm": 1.9403797388076782, + "learning_rate": 4.226847134415458e-05, + "loss": 3.1325, + "step": 43262 + }, + { + "epoch": 0.2572973165857836, + "grad_norm": 2.370906352996826, + "learning_rate": 4.22681335801778e-05, + "loss": 3.2792, + "step": 43263 + }, + { + "epoch": 0.2573032638690646, + "grad_norm": 2.7376670837402344, + "learning_rate": 4.2267795810172894e-05, + "loss": 2.8926, + "step": 43264 + }, + { + "epoch": 0.2573092111523456, + "grad_norm": 2.2706377506256104, + "learning_rate": 4.2267458034139975e-05, + "loss": 3.106, + "step": 43265 + }, + { + "epoch": 0.2573151584356266, + "grad_norm": 2.307666778564453, + "learning_rate": 4.226712025207916e-05, + "loss": 2.9405, + "step": 43266 + }, + { + "epoch": 0.2573211057189076, + "grad_norm": 2.3371567726135254, + "learning_rate": 4.2266782463990585e-05, + "loss": 3.073, + "step": 43267 + }, + { + "epoch": 0.2573270530021886, + "grad_norm": 2.1351914405822754, + "learning_rate": 4.226644466987435e-05, + "loss": 2.99, + "step": 43268 + }, + { + "epoch": 0.25733300028546957, + "grad_norm": 2.5197086334228516, + "learning_rate": 4.226610686973057e-05, + "loss": 3.0658, + "step": 43269 + }, + { + "epoch": 0.2573389475687506, + "grad_norm": 1.9839684963226318, + "learning_rate": 4.226576906355938e-05, + "loss": 3.569, + "step": 43270 + }, + { + "epoch": 0.2573448948520316, + "grad_norm": 1.9971115589141846, + "learning_rate": 4.2265431251360885e-05, + "loss": 3.3695, + "step": 43271 + }, + { + "epoch": 0.25735084213531256, + "grad_norm": 2.6981942653656006, + "learning_rate": 4.226509343313521e-05, + "loss": 3.189, + "step": 43272 + }, + { + "epoch": 0.2573567894185936, + "grad_norm": 2.3094723224639893, + "learning_rate": 4.226475560888246e-05, + "loss": 2.9784, + "step": 43273 + }, + { + "epoch": 0.2573627367018746, + "grad_norm": 2.3319153785705566, + "learning_rate": 4.226441777860277e-05, + "loss": 3.0117, + "step": 43274 + }, + { + "epoch": 0.25736868398515556, + "grad_norm": 2.5115654468536377, + "learning_rate": 4.226407994229624e-05, + "loss": 3.0624, + "step": 43275 + }, + { + "epoch": 0.25737463126843657, + "grad_norm": 1.8657209873199463, + "learning_rate": 4.226374209996301e-05, + "loss": 3.9099, + "step": 43276 + }, + { + "epoch": 0.2573805785517176, + "grad_norm": 1.996996283531189, + "learning_rate": 4.226340425160318e-05, + "loss": 3.077, + "step": 43277 + }, + { + "epoch": 0.25738652583499855, + "grad_norm": 2.1062278747558594, + "learning_rate": 4.226306639721688e-05, + "loss": 3.1485, + "step": 43278 + }, + { + "epoch": 0.25739247311827956, + "grad_norm": 2.51224422454834, + "learning_rate": 4.226272853680422e-05, + "loss": 3.0816, + "step": 43279 + }, + { + "epoch": 0.2573984204015606, + "grad_norm": 2.223710536956787, + "learning_rate": 4.226239067036531e-05, + "loss": 2.8685, + "step": 43280 + }, + { + "epoch": 0.25740436768484154, + "grad_norm": 1.7438758611679077, + "learning_rate": 4.2262052797900285e-05, + "loss": 4.0409, + "step": 43281 + }, + { + "epoch": 0.25741031496812256, + "grad_norm": 1.7250003814697266, + "learning_rate": 4.226171491940926e-05, + "loss": 3.7053, + "step": 43282 + }, + { + "epoch": 0.25741626225140357, + "grad_norm": 1.7706702947616577, + "learning_rate": 4.226137703489234e-05, + "loss": 3.8165, + "step": 43283 + }, + { + "epoch": 0.25742220953468453, + "grad_norm": 1.736322045326233, + "learning_rate": 4.2261039144349655e-05, + "loss": 3.7993, + "step": 43284 + }, + { + "epoch": 0.25742815681796555, + "grad_norm": 1.7486449480056763, + "learning_rate": 4.226070124778132e-05, + "loss": 3.6148, + "step": 43285 + }, + { + "epoch": 0.25743410410124656, + "grad_norm": 1.7472317218780518, + "learning_rate": 4.226036334518745e-05, + "loss": 3.7453, + "step": 43286 + }, + { + "epoch": 0.2574400513845275, + "grad_norm": 1.709546446800232, + "learning_rate": 4.2260025436568174e-05, + "loss": 3.7611, + "step": 43287 + }, + { + "epoch": 0.25744599866780854, + "grad_norm": 1.6224968433380127, + "learning_rate": 4.22596875219236e-05, + "loss": 3.7477, + "step": 43288 + }, + { + "epoch": 0.25745194595108956, + "grad_norm": 2.009153127670288, + "learning_rate": 4.225934960125384e-05, + "loss": 3.7414, + "step": 43289 + }, + { + "epoch": 0.2574578932343705, + "grad_norm": 1.929237723350525, + "learning_rate": 4.225901167455902e-05, + "loss": 3.336, + "step": 43290 + }, + { + "epoch": 0.25746384051765153, + "grad_norm": 1.7390176057815552, + "learning_rate": 4.2258673741839263e-05, + "loss": 3.6894, + "step": 43291 + }, + { + "epoch": 0.25746978780093255, + "grad_norm": 2.3558826446533203, + "learning_rate": 4.2258335803094676e-05, + "loss": 3.6438, + "step": 43292 + }, + { + "epoch": 0.2574757350842135, + "grad_norm": 2.258578300476074, + "learning_rate": 4.225799785832538e-05, + "loss": 3.3834, + "step": 43293 + }, + { + "epoch": 0.2574816823674945, + "grad_norm": 1.728671908378601, + "learning_rate": 4.2257659907531505e-05, + "loss": 3.8268, + "step": 43294 + }, + { + "epoch": 0.25748762965077554, + "grad_norm": 1.7048178911209106, + "learning_rate": 4.225732195071315e-05, + "loss": 3.8888, + "step": 43295 + }, + { + "epoch": 0.2574935769340565, + "grad_norm": 1.8538347482681274, + "learning_rate": 4.225698398787045e-05, + "loss": 3.8268, + "step": 43296 + }, + { + "epoch": 0.2574995242173375, + "grad_norm": 1.7856497764587402, + "learning_rate": 4.2256646019003506e-05, + "loss": 3.7566, + "step": 43297 + }, + { + "epoch": 0.25750547150061853, + "grad_norm": 2.0070464611053467, + "learning_rate": 4.225630804411246e-05, + "loss": 3.5748, + "step": 43298 + }, + { + "epoch": 0.2575114187838995, + "grad_norm": 2.048724889755249, + "learning_rate": 4.22559700631974e-05, + "loss": 3.1367, + "step": 43299 + }, + { + "epoch": 0.2575173660671805, + "grad_norm": 1.817712664604187, + "learning_rate": 4.225563207625847e-05, + "loss": 3.1732, + "step": 43300 + }, + { + "epoch": 0.2575233133504615, + "grad_norm": 2.0463626384735107, + "learning_rate": 4.225529408329577e-05, + "loss": 2.8685, + "step": 43301 + }, + { + "epoch": 0.2575292606337425, + "grad_norm": 1.6576095819473267, + "learning_rate": 4.225495608430942e-05, + "loss": 3.5344, + "step": 43302 + }, + { + "epoch": 0.2575352079170235, + "grad_norm": 2.170841932296753, + "learning_rate": 4.225461807929956e-05, + "loss": 3.6123, + "step": 43303 + }, + { + "epoch": 0.2575411552003045, + "grad_norm": 2.0660924911499023, + "learning_rate": 4.225428006826627e-05, + "loss": 3.4349, + "step": 43304 + }, + { + "epoch": 0.2575471024835855, + "grad_norm": 1.9692975282669067, + "learning_rate": 4.225394205120971e-05, + "loss": 3.5184, + "step": 43305 + }, + { + "epoch": 0.2575530497668665, + "grad_norm": 1.9471170902252197, + "learning_rate": 4.225360402812997e-05, + "loss": 3.5546, + "step": 43306 + }, + { + "epoch": 0.2575589970501475, + "grad_norm": 2.0945026874542236, + "learning_rate": 4.225326599902717e-05, + "loss": 3.4062, + "step": 43307 + }, + { + "epoch": 0.25756494433342847, + "grad_norm": 1.861249327659607, + "learning_rate": 4.2252927963901434e-05, + "loss": 3.4066, + "step": 43308 + }, + { + "epoch": 0.2575708916167095, + "grad_norm": 1.8911516666412354, + "learning_rate": 4.225258992275288e-05, + "loss": 3.3581, + "step": 43309 + }, + { + "epoch": 0.2575768388999905, + "grad_norm": 1.8019460439682007, + "learning_rate": 4.2252251875581625e-05, + "loss": 4.2912, + "step": 43310 + }, + { + "epoch": 0.25758278618327146, + "grad_norm": 1.7808783054351807, + "learning_rate": 4.225191382238778e-05, + "loss": 4.2538, + "step": 43311 + }, + { + "epoch": 0.2575887334665525, + "grad_norm": 1.8653813600540161, + "learning_rate": 4.225157576317149e-05, + "loss": 4.139, + "step": 43312 + }, + { + "epoch": 0.2575946807498335, + "grad_norm": 1.7058701515197754, + "learning_rate": 4.2251237697932835e-05, + "loss": 4.1534, + "step": 43313 + }, + { + "epoch": 0.25760062803311445, + "grad_norm": 1.8248401880264282, + "learning_rate": 4.2250899626671955e-05, + "loss": 4.229, + "step": 43314 + }, + { + "epoch": 0.25760657531639547, + "grad_norm": 1.7566015720367432, + "learning_rate": 4.225056154938897e-05, + "loss": 3.9251, + "step": 43315 + }, + { + "epoch": 0.2576125225996765, + "grad_norm": 1.8038471937179565, + "learning_rate": 4.2250223466083986e-05, + "loss": 3.5071, + "step": 43316 + }, + { + "epoch": 0.25761846988295745, + "grad_norm": 2.1854422092437744, + "learning_rate": 4.2249885376757134e-05, + "loss": 3.258, + "step": 43317 + }, + { + "epoch": 0.25762441716623846, + "grad_norm": 2.1822922229766846, + "learning_rate": 4.224954728140852e-05, + "loss": 3.3648, + "step": 43318 + }, + { + "epoch": 0.2576303644495195, + "grad_norm": 2.556295156478882, + "learning_rate": 4.224920918003826e-05, + "loss": 3.6528, + "step": 43319 + }, + { + "epoch": 0.25763631173280044, + "grad_norm": 2.8911261558532715, + "learning_rate": 4.224887107264649e-05, + "loss": 3.3745, + "step": 43320 + }, + { + "epoch": 0.25764225901608145, + "grad_norm": 1.919585108757019, + "learning_rate": 4.2248532959233325e-05, + "loss": 3.5155, + "step": 43321 + }, + { + "epoch": 0.25764820629936247, + "grad_norm": 2.260694980621338, + "learning_rate": 4.224819483979886e-05, + "loss": 3.3132, + "step": 43322 + }, + { + "epoch": 0.25765415358264343, + "grad_norm": 2.48893666267395, + "learning_rate": 4.224785671434323e-05, + "loss": 3.4205, + "step": 43323 + }, + { + "epoch": 0.25766010086592445, + "grad_norm": 2.8506858348846436, + "learning_rate": 4.224751858286656e-05, + "loss": 3.3402, + "step": 43324 + }, + { + "epoch": 0.25766604814920546, + "grad_norm": 2.6919894218444824, + "learning_rate": 4.224718044536895e-05, + "loss": 3.4701, + "step": 43325 + }, + { + "epoch": 0.2576719954324864, + "grad_norm": 2.6547420024871826, + "learning_rate": 4.224684230185053e-05, + "loss": 3.6512, + "step": 43326 + }, + { + "epoch": 0.25767794271576744, + "grad_norm": 2.0187129974365234, + "learning_rate": 4.2246504152311425e-05, + "loss": 3.8933, + "step": 43327 + }, + { + "epoch": 0.25768388999904845, + "grad_norm": 1.7470301389694214, + "learning_rate": 4.224616599675173e-05, + "loss": 4.347, + "step": 43328 + }, + { + "epoch": 0.2576898372823294, + "grad_norm": 1.9760146141052246, + "learning_rate": 4.224582783517158e-05, + "loss": 4.0767, + "step": 43329 + }, + { + "epoch": 0.25769578456561043, + "grad_norm": 2.2625625133514404, + "learning_rate": 4.22454896675711e-05, + "loss": 3.1935, + "step": 43330 + }, + { + "epoch": 0.25770173184889145, + "grad_norm": 2.4731624126434326, + "learning_rate": 4.2245151493950386e-05, + "loss": 2.9998, + "step": 43331 + }, + { + "epoch": 0.2577076791321724, + "grad_norm": 2.1505284309387207, + "learning_rate": 4.2244813314309574e-05, + "loss": 3.0019, + "step": 43332 + }, + { + "epoch": 0.2577136264154534, + "grad_norm": 1.743402123451233, + "learning_rate": 4.224447512864878e-05, + "loss": 3.5488, + "step": 43333 + }, + { + "epoch": 0.25771957369873444, + "grad_norm": 2.0020081996917725, + "learning_rate": 4.22441369369681e-05, + "loss": 4.19, + "step": 43334 + }, + { + "epoch": 0.2577255209820154, + "grad_norm": 1.805166482925415, + "learning_rate": 4.224379873926768e-05, + "loss": 4.5243, + "step": 43335 + }, + { + "epoch": 0.2577314682652964, + "grad_norm": 2.0471789836883545, + "learning_rate": 4.224346053554763e-05, + "loss": 4.2426, + "step": 43336 + }, + { + "epoch": 0.25773741554857743, + "grad_norm": 1.8476710319519043, + "learning_rate": 4.224312232580807e-05, + "loss": 3.8633, + "step": 43337 + }, + { + "epoch": 0.2577433628318584, + "grad_norm": 1.8500791788101196, + "learning_rate": 4.2242784110049114e-05, + "loss": 4.4488, + "step": 43338 + }, + { + "epoch": 0.2577493101151394, + "grad_norm": 1.904895544052124, + "learning_rate": 4.224244588827088e-05, + "loss": 3.243, + "step": 43339 + }, + { + "epoch": 0.2577552573984204, + "grad_norm": 2.128445863723755, + "learning_rate": 4.224210766047347e-05, + "loss": 3.0504, + "step": 43340 + }, + { + "epoch": 0.2577612046817014, + "grad_norm": 2.2312495708465576, + "learning_rate": 4.2241769426657036e-05, + "loss": 2.8462, + "step": 43341 + }, + { + "epoch": 0.2577671519649824, + "grad_norm": 1.8640774488449097, + "learning_rate": 4.224143118682168e-05, + "loss": 2.9009, + "step": 43342 + }, + { + "epoch": 0.2577730992482634, + "grad_norm": 1.64212965965271, + "learning_rate": 4.224109294096751e-05, + "loss": 4.8844, + "step": 43343 + }, + { + "epoch": 0.2577790465315444, + "grad_norm": 1.6742535829544067, + "learning_rate": 4.2240754689094647e-05, + "loss": 3.377, + "step": 43344 + }, + { + "epoch": 0.2577849938148254, + "grad_norm": 1.207567811012268, + "learning_rate": 4.224041643120322e-05, + "loss": 5.1677, + "step": 43345 + }, + { + "epoch": 0.2577909410981064, + "grad_norm": 1.4349946975708008, + "learning_rate": 4.224007816729335e-05, + "loss": 4.3715, + "step": 43346 + }, + { + "epoch": 0.25779688838138737, + "grad_norm": 1.8728833198547363, + "learning_rate": 4.223973989736514e-05, + "loss": 3.7373, + "step": 43347 + }, + { + "epoch": 0.2578028356646684, + "grad_norm": 1.7028321027755737, + "learning_rate": 4.223940162141872e-05, + "loss": 3.7906, + "step": 43348 + }, + { + "epoch": 0.2578087829479494, + "grad_norm": 1.8328166007995605, + "learning_rate": 4.2239063339454185e-05, + "loss": 4.5205, + "step": 43349 + }, + { + "epoch": 0.25781473023123036, + "grad_norm": 2.241309404373169, + "learning_rate": 4.2238725051471686e-05, + "loss": 3.6683, + "step": 43350 + }, + { + "epoch": 0.2578206775145114, + "grad_norm": 2.539138078689575, + "learning_rate": 4.223838675747133e-05, + "loss": 2.8771, + "step": 43351 + }, + { + "epoch": 0.2578266247977924, + "grad_norm": 2.5086073875427246, + "learning_rate": 4.223804845745323e-05, + "loss": 2.9798, + "step": 43352 + }, + { + "epoch": 0.25783257208107335, + "grad_norm": 2.562700033187866, + "learning_rate": 4.22377101514175e-05, + "loss": 3.0597, + "step": 43353 + }, + { + "epoch": 0.25783851936435437, + "grad_norm": 1.702529788017273, + "learning_rate": 4.223737183936426e-05, + "loss": 4.2368, + "step": 43354 + }, + { + "epoch": 0.2578444666476354, + "grad_norm": 1.228440284729004, + "learning_rate": 4.223703352129363e-05, + "loss": 5.1617, + "step": 43355 + }, + { + "epoch": 0.25785041393091634, + "grad_norm": 2.19476318359375, + "learning_rate": 4.2236695197205745e-05, + "loss": 4.1553, + "step": 43356 + }, + { + "epoch": 0.25785636121419736, + "grad_norm": 1.8752772808074951, + "learning_rate": 4.22363568671007e-05, + "loss": 4.7758, + "step": 43357 + }, + { + "epoch": 0.2578623084974784, + "grad_norm": 1.9239437580108643, + "learning_rate": 4.223601853097862e-05, + "loss": 3.6839, + "step": 43358 + }, + { + "epoch": 0.25786825578075934, + "grad_norm": 1.397732138633728, + "learning_rate": 4.2235680188839624e-05, + "loss": 4.6496, + "step": 43359 + }, + { + "epoch": 0.25787420306404035, + "grad_norm": 1.7359789609909058, + "learning_rate": 4.223534184068383e-05, + "loss": 4.0183, + "step": 43360 + }, + { + "epoch": 0.25788015034732137, + "grad_norm": 1.455500841140747, + "learning_rate": 4.223500348651135e-05, + "loss": 4.4836, + "step": 43361 + }, + { + "epoch": 0.25788609763060233, + "grad_norm": 1.4980499744415283, + "learning_rate": 4.2234665126322314e-05, + "loss": 4.6871, + "step": 43362 + }, + { + "epoch": 0.25789204491388334, + "grad_norm": 1.3940479755401611, + "learning_rate": 4.223432676011684e-05, + "loss": 4.5961, + "step": 43363 + }, + { + "epoch": 0.25789799219716436, + "grad_norm": 1.6948307752609253, + "learning_rate": 4.223398838789503e-05, + "loss": 4.2234, + "step": 43364 + }, + { + "epoch": 0.2579039394804453, + "grad_norm": 1.6055898666381836, + "learning_rate": 4.223365000965702e-05, + "loss": 4.3019, + "step": 43365 + }, + { + "epoch": 0.25790988676372634, + "grad_norm": 1.7240407466888428, + "learning_rate": 4.223331162540292e-05, + "loss": 4.2087, + "step": 43366 + }, + { + "epoch": 0.25791583404700735, + "grad_norm": 1.498740792274475, + "learning_rate": 4.223297323513284e-05, + "loss": 4.2515, + "step": 43367 + }, + { + "epoch": 0.2579217813302883, + "grad_norm": 1.6280021667480469, + "learning_rate": 4.2232634838846915e-05, + "loss": 4.7121, + "step": 43368 + }, + { + "epoch": 0.25792772861356933, + "grad_norm": 1.695422649383545, + "learning_rate": 4.223229643654526e-05, + "loss": 4.1561, + "step": 43369 + }, + { + "epoch": 0.25793367589685035, + "grad_norm": 1.600670337677002, + "learning_rate": 4.2231958028227986e-05, + "loss": 4.311, + "step": 43370 + }, + { + "epoch": 0.2579396231801313, + "grad_norm": 1.5950666666030884, + "learning_rate": 4.2231619613895215e-05, + "loss": 4.2716, + "step": 43371 + }, + { + "epoch": 0.2579455704634123, + "grad_norm": 1.4690937995910645, + "learning_rate": 4.2231281193547054e-05, + "loss": 4.668, + "step": 43372 + }, + { + "epoch": 0.25795151774669334, + "grad_norm": 1.642809271812439, + "learning_rate": 4.2230942767183637e-05, + "loss": 4.7371, + "step": 43373 + }, + { + "epoch": 0.2579574650299743, + "grad_norm": 1.94060218334198, + "learning_rate": 4.223060433480508e-05, + "loss": 4.5958, + "step": 43374 + }, + { + "epoch": 0.2579634123132553, + "grad_norm": 1.948462963104248, + "learning_rate": 4.223026589641149e-05, + "loss": 4.5583, + "step": 43375 + }, + { + "epoch": 0.25796935959653633, + "grad_norm": 1.6549667119979858, + "learning_rate": 4.222992745200299e-05, + "loss": 4.6127, + "step": 43376 + }, + { + "epoch": 0.2579753068798173, + "grad_norm": 1.6783031225204468, + "learning_rate": 4.2229589001579706e-05, + "loss": 4.2983, + "step": 43377 + }, + { + "epoch": 0.2579812541630983, + "grad_norm": 1.8058072328567505, + "learning_rate": 4.2229250545141754e-05, + "loss": 4.4397, + "step": 43378 + }, + { + "epoch": 0.25798720144637927, + "grad_norm": 1.7903434038162231, + "learning_rate": 4.2228912082689245e-05, + "loss": 4.7296, + "step": 43379 + }, + { + "epoch": 0.2579931487296603, + "grad_norm": 1.561268925666809, + "learning_rate": 4.22285736142223e-05, + "loss": 4.643, + "step": 43380 + }, + { + "epoch": 0.2579990960129413, + "grad_norm": 2.0092406272888184, + "learning_rate": 4.2228235139741035e-05, + "loss": 3.9516, + "step": 43381 + }, + { + "epoch": 0.25800504329622226, + "grad_norm": 1.9187169075012207, + "learning_rate": 4.222789665924557e-05, + "loss": 4.0659, + "step": 43382 + }, + { + "epoch": 0.2580109905795033, + "grad_norm": 1.8158773183822632, + "learning_rate": 4.222755817273603e-05, + "loss": 4.6159, + "step": 43383 + }, + { + "epoch": 0.2580169378627843, + "grad_norm": 1.8136028051376343, + "learning_rate": 4.2227219680212525e-05, + "loss": 4.8663, + "step": 43384 + }, + { + "epoch": 0.25802288514606525, + "grad_norm": 1.5933421850204468, + "learning_rate": 4.222688118167518e-05, + "loss": 4.3133, + "step": 43385 + }, + { + "epoch": 0.25802883242934627, + "grad_norm": 1.7087833881378174, + "learning_rate": 4.22265426771241e-05, + "loss": 4.3916, + "step": 43386 + }, + { + "epoch": 0.2580347797126273, + "grad_norm": 2.0734188556671143, + "learning_rate": 4.222620416655942e-05, + "loss": 4.0027, + "step": 43387 + }, + { + "epoch": 0.25804072699590824, + "grad_norm": 1.8769564628601074, + "learning_rate": 4.2225865649981245e-05, + "loss": 4.092, + "step": 43388 + }, + { + "epoch": 0.25804667427918926, + "grad_norm": 1.6172574758529663, + "learning_rate": 4.22255271273897e-05, + "loss": 4.2249, + "step": 43389 + }, + { + "epoch": 0.2580526215624703, + "grad_norm": 2.1016736030578613, + "learning_rate": 4.22251885987849e-05, + "loss": 4.607, + "step": 43390 + }, + { + "epoch": 0.25805856884575123, + "grad_norm": 1.743820071220398, + "learning_rate": 4.222485006416697e-05, + "loss": 4.4059, + "step": 43391 + }, + { + "epoch": 0.25806451612903225, + "grad_norm": 1.9316726922988892, + "learning_rate": 4.222451152353601e-05, + "loss": 4.2161, + "step": 43392 + }, + { + "epoch": 0.25807046341231327, + "grad_norm": 1.8563159704208374, + "learning_rate": 4.222417297689217e-05, + "loss": 4.4311, + "step": 43393 + }, + { + "epoch": 0.2580764106955942, + "grad_norm": 1.8751715421676636, + "learning_rate": 4.222383442423554e-05, + "loss": 4.2312, + "step": 43394 + }, + { + "epoch": 0.25808235797887524, + "grad_norm": 1.9330755472183228, + "learning_rate": 4.2223495865566244e-05, + "loss": 4.1115, + "step": 43395 + }, + { + "epoch": 0.25808830526215626, + "grad_norm": 1.739328145980835, + "learning_rate": 4.222315730088441e-05, + "loss": 4.2409, + "step": 43396 + }, + { + "epoch": 0.2580942525454372, + "grad_norm": 1.8452540636062622, + "learning_rate": 4.222281873019014e-05, + "loss": 4.1949, + "step": 43397 + }, + { + "epoch": 0.25810019982871824, + "grad_norm": 1.9402611255645752, + "learning_rate": 4.2222480153483566e-05, + "loss": 4.3308, + "step": 43398 + }, + { + "epoch": 0.25810614711199925, + "grad_norm": 1.252078890800476, + "learning_rate": 4.2222141570764806e-05, + "loss": 4.84, + "step": 43399 + }, + { + "epoch": 0.2581120943952802, + "grad_norm": 1.629943609237671, + "learning_rate": 4.222180298203397e-05, + "loss": 4.2728, + "step": 43400 + }, + { + "epoch": 0.2581180416785612, + "grad_norm": 1.9061886072158813, + "learning_rate": 4.222146438729119e-05, + "loss": 4.3395, + "step": 43401 + }, + { + "epoch": 0.25812398896184224, + "grad_norm": 2.328352451324463, + "learning_rate": 4.222112578653656e-05, + "loss": 3.7654, + "step": 43402 + }, + { + "epoch": 0.2581299362451232, + "grad_norm": 1.7604764699935913, + "learning_rate": 4.222078717977022e-05, + "loss": 4.4427, + "step": 43403 + }, + { + "epoch": 0.2581358835284042, + "grad_norm": 1.5162769556045532, + "learning_rate": 4.2220448566992286e-05, + "loss": 4.8248, + "step": 43404 + }, + { + "epoch": 0.25814183081168524, + "grad_norm": 1.5359110832214355, + "learning_rate": 4.2220109948202865e-05, + "loss": 4.8252, + "step": 43405 + }, + { + "epoch": 0.2581477780949662, + "grad_norm": 1.8130043745040894, + "learning_rate": 4.221977132340208e-05, + "loss": 4.4175, + "step": 43406 + }, + { + "epoch": 0.2581537253782472, + "grad_norm": 1.543370246887207, + "learning_rate": 4.2219432692590054e-05, + "loss": 4.6298, + "step": 43407 + }, + { + "epoch": 0.25815967266152823, + "grad_norm": 1.608904480934143, + "learning_rate": 4.221909405576691e-05, + "loss": 4.4185, + "step": 43408 + }, + { + "epoch": 0.2581656199448092, + "grad_norm": 1.6779776811599731, + "learning_rate": 4.221875541293274e-05, + "loss": 4.0762, + "step": 43409 + }, + { + "epoch": 0.2581715672280902, + "grad_norm": 1.7395251989364624, + "learning_rate": 4.22184167640877e-05, + "loss": 4.2612, + "step": 43410 + }, + { + "epoch": 0.2581775145113712, + "grad_norm": 1.8426402807235718, + "learning_rate": 4.221807810923187e-05, + "loss": 4.4147, + "step": 43411 + }, + { + "epoch": 0.2581834617946522, + "grad_norm": 1.7401176691055298, + "learning_rate": 4.22177394483654e-05, + "loss": 4.326, + "step": 43412 + }, + { + "epoch": 0.2581894090779332, + "grad_norm": 2.09885835647583, + "learning_rate": 4.221740078148839e-05, + "loss": 4.0842, + "step": 43413 + }, + { + "epoch": 0.2581953563612142, + "grad_norm": 1.718242883682251, + "learning_rate": 4.221706210860096e-05, + "loss": 4.0513, + "step": 43414 + }, + { + "epoch": 0.2582013036444952, + "grad_norm": 2.1970937252044678, + "learning_rate": 4.221672342970324e-05, + "loss": 4.4889, + "step": 43415 + }, + { + "epoch": 0.2582072509277762, + "grad_norm": 1.4324294328689575, + "learning_rate": 4.221638474479533e-05, + "loss": 4.4515, + "step": 43416 + }, + { + "epoch": 0.2582131982110572, + "grad_norm": 1.831308364868164, + "learning_rate": 4.221604605387737e-05, + "loss": 4.5556, + "step": 43417 + }, + { + "epoch": 0.25821914549433816, + "grad_norm": 1.90105402469635, + "learning_rate": 4.221570735694945e-05, + "loss": 4.7206, + "step": 43418 + }, + { + "epoch": 0.2582250927776192, + "grad_norm": 2.1148085594177246, + "learning_rate": 4.2215368654011725e-05, + "loss": 4.5548, + "step": 43419 + }, + { + "epoch": 0.2582310400609002, + "grad_norm": 1.6271718740463257, + "learning_rate": 4.221502994506428e-05, + "loss": 4.8675, + "step": 43420 + }, + { + "epoch": 0.25823698734418116, + "grad_norm": 1.3636060953140259, + "learning_rate": 4.221469123010724e-05, + "loss": 4.8212, + "step": 43421 + }, + { + "epoch": 0.2582429346274622, + "grad_norm": 1.558492660522461, + "learning_rate": 4.221435250914074e-05, + "loss": 4.6823, + "step": 43422 + }, + { + "epoch": 0.2582488819107432, + "grad_norm": 1.7085415124893188, + "learning_rate": 4.221401378216488e-05, + "loss": 4.546, + "step": 43423 + }, + { + "epoch": 0.25825482919402415, + "grad_norm": 1.9768619537353516, + "learning_rate": 4.2213675049179794e-05, + "loss": 3.9392, + "step": 43424 + }, + { + "epoch": 0.25826077647730517, + "grad_norm": 2.0775978565216064, + "learning_rate": 4.221333631018559e-05, + "loss": 3.2508, + "step": 43425 + }, + { + "epoch": 0.2582667237605862, + "grad_norm": 2.0933282375335693, + "learning_rate": 4.2212997565182384e-05, + "loss": 2.9118, + "step": 43426 + }, + { + "epoch": 0.25827267104386714, + "grad_norm": 2.248816967010498, + "learning_rate": 4.22126588141703e-05, + "loss": 3.7372, + "step": 43427 + }, + { + "epoch": 0.25827861832714816, + "grad_norm": 2.4197070598602295, + "learning_rate": 4.221232005714946e-05, + "loss": 3.6892, + "step": 43428 + }, + { + "epoch": 0.2582845656104292, + "grad_norm": 2.433349370956421, + "learning_rate": 4.221198129411996e-05, + "loss": 3.3577, + "step": 43429 + }, + { + "epoch": 0.25829051289371013, + "grad_norm": 2.4574573040008545, + "learning_rate": 4.2211642525081954e-05, + "loss": 3.5768, + "step": 43430 + }, + { + "epoch": 0.25829646017699115, + "grad_norm": 2.359774351119995, + "learning_rate": 4.2211303750035534e-05, + "loss": 3.9991, + "step": 43431 + }, + { + "epoch": 0.25830240746027217, + "grad_norm": 1.4036859273910522, + "learning_rate": 4.2210964968980824e-05, + "loss": 4.6994, + "step": 43432 + }, + { + "epoch": 0.2583083547435531, + "grad_norm": 1.687827706336975, + "learning_rate": 4.221062618191795e-05, + "loss": 4.1956, + "step": 43433 + }, + { + "epoch": 0.25831430202683414, + "grad_norm": 1.6159050464630127, + "learning_rate": 4.221028738884701e-05, + "loss": 4.4683, + "step": 43434 + }, + { + "epoch": 0.25832024931011516, + "grad_norm": 1.5456722974777222, + "learning_rate": 4.220994858976815e-05, + "loss": 4.6592, + "step": 43435 + }, + { + "epoch": 0.2583261965933961, + "grad_norm": 1.7457486391067505, + "learning_rate": 4.220960978468147e-05, + "loss": 4.2377, + "step": 43436 + }, + { + "epoch": 0.25833214387667713, + "grad_norm": 1.6378988027572632, + "learning_rate": 4.2209270973587104e-05, + "loss": 4.588, + "step": 43437 + }, + { + "epoch": 0.25833809115995815, + "grad_norm": 1.7123868465423584, + "learning_rate": 4.2208932156485145e-05, + "loss": 4.3455, + "step": 43438 + }, + { + "epoch": 0.2583440384432391, + "grad_norm": 1.78044855594635, + "learning_rate": 4.2208593333375724e-05, + "loss": 4.2574, + "step": 43439 + }, + { + "epoch": 0.2583499857265201, + "grad_norm": 1.9464309215545654, + "learning_rate": 4.220825450425897e-05, + "loss": 4.0239, + "step": 43440 + }, + { + "epoch": 0.25835593300980114, + "grad_norm": 1.7884212732315063, + "learning_rate": 4.220791566913499e-05, + "loss": 4.1301, + "step": 43441 + }, + { + "epoch": 0.2583618802930821, + "grad_norm": 1.626954197883606, + "learning_rate": 4.2207576828003905e-05, + "loss": 4.3971, + "step": 43442 + }, + { + "epoch": 0.2583678275763631, + "grad_norm": 1.8232189416885376, + "learning_rate": 4.220723798086583e-05, + "loss": 4.0822, + "step": 43443 + }, + { + "epoch": 0.25837377485964413, + "grad_norm": 2.0403873920440674, + "learning_rate": 4.220689912772089e-05, + "loss": 3.9189, + "step": 43444 + }, + { + "epoch": 0.2583797221429251, + "grad_norm": 1.736331582069397, + "learning_rate": 4.220656026856919e-05, + "loss": 4.183, + "step": 43445 + }, + { + "epoch": 0.2583856694262061, + "grad_norm": 1.7627276182174683, + "learning_rate": 4.2206221403410866e-05, + "loss": 4.4585, + "step": 43446 + }, + { + "epoch": 0.2583916167094871, + "grad_norm": 1.60670006275177, + "learning_rate": 4.220588253224602e-05, + "loss": 4.5829, + "step": 43447 + }, + { + "epoch": 0.2583975639927681, + "grad_norm": 1.628663182258606, + "learning_rate": 4.220554365507479e-05, + "loss": 4.3981, + "step": 43448 + }, + { + "epoch": 0.2584035112760491, + "grad_norm": 1.9794678688049316, + "learning_rate": 4.220520477189728e-05, + "loss": 4.5196, + "step": 43449 + }, + { + "epoch": 0.2584094585593301, + "grad_norm": 1.932854413986206, + "learning_rate": 4.2204865882713604e-05, + "loss": 4.653, + "step": 43450 + }, + { + "epoch": 0.2584154058426111, + "grad_norm": 1.7190862894058228, + "learning_rate": 4.220452698752389e-05, + "loss": 5.0905, + "step": 43451 + }, + { + "epoch": 0.2584213531258921, + "grad_norm": 1.5871145725250244, + "learning_rate": 4.2204188086328255e-05, + "loss": 4.6589, + "step": 43452 + }, + { + "epoch": 0.2584273004091731, + "grad_norm": 2.0079994201660156, + "learning_rate": 4.220384917912681e-05, + "loss": 3.881, + "step": 43453 + }, + { + "epoch": 0.25843324769245407, + "grad_norm": 1.7606770992279053, + "learning_rate": 4.220351026591969e-05, + "loss": 4.4818, + "step": 43454 + }, + { + "epoch": 0.2584391949757351, + "grad_norm": 1.4636285305023193, + "learning_rate": 4.2203171346706994e-05, + "loss": 5.3602, + "step": 43455 + }, + { + "epoch": 0.2584451422590161, + "grad_norm": 1.3749452829360962, + "learning_rate": 4.220283242148885e-05, + "loss": 5.2526, + "step": 43456 + }, + { + "epoch": 0.25845108954229706, + "grad_norm": 1.547986388206482, + "learning_rate": 4.220249349026537e-05, + "loss": 5.2876, + "step": 43457 + }, + { + "epoch": 0.2584570368255781, + "grad_norm": 1.6168211698532104, + "learning_rate": 4.220215455303669e-05, + "loss": 4.7648, + "step": 43458 + }, + { + "epoch": 0.2584629841088591, + "grad_norm": 1.586313009262085, + "learning_rate": 4.2201815609802906e-05, + "loss": 4.6921, + "step": 43459 + }, + { + "epoch": 0.25846893139214006, + "grad_norm": 1.3717976808547974, + "learning_rate": 4.2201476660564146e-05, + "loss": 5.1019, + "step": 43460 + }, + { + "epoch": 0.25847487867542107, + "grad_norm": 1.6613398790359497, + "learning_rate": 4.220113770532053e-05, + "loss": 4.7737, + "step": 43461 + }, + { + "epoch": 0.2584808259587021, + "grad_norm": 1.814727783203125, + "learning_rate": 4.220079874407218e-05, + "loss": 3.9091, + "step": 43462 + }, + { + "epoch": 0.25848677324198305, + "grad_norm": 1.7494934797286987, + "learning_rate": 4.22004597768192e-05, + "loss": 4.6581, + "step": 43463 + }, + { + "epoch": 0.25849272052526406, + "grad_norm": 1.6119458675384521, + "learning_rate": 4.220012080356172e-05, + "loss": 4.2396, + "step": 43464 + }, + { + "epoch": 0.2584986678085451, + "grad_norm": 1.503002643585205, + "learning_rate": 4.2199781824299854e-05, + "loss": 4.3899, + "step": 43465 + }, + { + "epoch": 0.25850461509182604, + "grad_norm": 1.7611950635910034, + "learning_rate": 4.219944283903372e-05, + "loss": 5.1231, + "step": 43466 + }, + { + "epoch": 0.25851056237510706, + "grad_norm": 1.5735167264938354, + "learning_rate": 4.219910384776345e-05, + "loss": 5.2808, + "step": 43467 + }, + { + "epoch": 0.25851650965838807, + "grad_norm": 1.6485339403152466, + "learning_rate": 4.2198764850489135e-05, + "loss": 5.279, + "step": 43468 + }, + { + "epoch": 0.25852245694166903, + "grad_norm": 1.404492974281311, + "learning_rate": 4.219842584721092e-05, + "loss": 5.2422, + "step": 43469 + }, + { + "epoch": 0.25852840422495005, + "grad_norm": 1.23466956615448, + "learning_rate": 4.2198086837928915e-05, + "loss": 5.1592, + "step": 43470 + }, + { + "epoch": 0.25853435150823106, + "grad_norm": 1.430601716041565, + "learning_rate": 4.219774782264323e-05, + "loss": 5.0983, + "step": 43471 + }, + { + "epoch": 0.258540298791512, + "grad_norm": 1.4110273122787476, + "learning_rate": 4.2197408801353984e-05, + "loss": 5.2564, + "step": 43472 + }, + { + "epoch": 0.25854624607479304, + "grad_norm": 1.4095579385757446, + "learning_rate": 4.21970697740613e-05, + "loss": 5.2232, + "step": 43473 + }, + { + "epoch": 0.25855219335807406, + "grad_norm": 1.3536268472671509, + "learning_rate": 4.21967307407653e-05, + "loss": 5.0875, + "step": 43474 + }, + { + "epoch": 0.258558140641355, + "grad_norm": 1.4623044729232788, + "learning_rate": 4.219639170146611e-05, + "loss": 5.2252, + "step": 43475 + }, + { + "epoch": 0.25856408792463603, + "grad_norm": 1.254266619682312, + "learning_rate": 4.2196052656163826e-05, + "loss": 5.2961, + "step": 43476 + }, + { + "epoch": 0.25857003520791705, + "grad_norm": 1.9226226806640625, + "learning_rate": 4.219571360485858e-05, + "loss": 4.2126, + "step": 43477 + }, + { + "epoch": 0.258575982491198, + "grad_norm": 1.4564887285232544, + "learning_rate": 4.2195374547550484e-05, + "loss": 5.3427, + "step": 43478 + }, + { + "epoch": 0.258581929774479, + "grad_norm": 1.3654704093933105, + "learning_rate": 4.219503548423966e-05, + "loss": 5.2675, + "step": 43479 + }, + { + "epoch": 0.25858787705776004, + "grad_norm": 1.389340877532959, + "learning_rate": 4.219469641492624e-05, + "loss": 4.9074, + "step": 43480 + }, + { + "epoch": 0.258593824341041, + "grad_norm": 1.511040449142456, + "learning_rate": 4.219435733961031e-05, + "loss": 5.1753, + "step": 43481 + }, + { + "epoch": 0.258599771624322, + "grad_norm": 1.7021970748901367, + "learning_rate": 4.2194018258292023e-05, + "loss": 5.0146, + "step": 43482 + }, + { + "epoch": 0.25860571890760303, + "grad_norm": 1.523794174194336, + "learning_rate": 4.219367917097148e-05, + "loss": 4.552, + "step": 43483 + }, + { + "epoch": 0.258611666190884, + "grad_norm": 1.8338403701782227, + "learning_rate": 4.219334007764879e-05, + "loss": 4.531, + "step": 43484 + }, + { + "epoch": 0.258617613474165, + "grad_norm": 1.5908867120742798, + "learning_rate": 4.219300097832409e-05, + "loss": 4.8325, + "step": 43485 + }, + { + "epoch": 0.258623560757446, + "grad_norm": 1.6518182754516602, + "learning_rate": 4.2192661872997495e-05, + "loss": 4.9789, + "step": 43486 + }, + { + "epoch": 0.258629508040727, + "grad_norm": 1.5878726243972778, + "learning_rate": 4.219232276166911e-05, + "loss": 4.8405, + "step": 43487 + }, + { + "epoch": 0.258635455324008, + "grad_norm": 2.1013569831848145, + "learning_rate": 4.219198364433907e-05, + "loss": 3.8274, + "step": 43488 + }, + { + "epoch": 0.258641402607289, + "grad_norm": 2.3177928924560547, + "learning_rate": 4.2191644521007484e-05, + "loss": 4.311, + "step": 43489 + }, + { + "epoch": 0.25864734989057, + "grad_norm": 1.7503350973129272, + "learning_rate": 4.2191305391674476e-05, + "loss": 4.6374, + "step": 43490 + }, + { + "epoch": 0.258653297173851, + "grad_norm": 2.8047239780426025, + "learning_rate": 4.219096625634016e-05, + "loss": 4.608, + "step": 43491 + }, + { + "epoch": 0.258659244457132, + "grad_norm": 2.370485544204712, + "learning_rate": 4.219062711500464e-05, + "loss": 4.6716, + "step": 43492 + }, + { + "epoch": 0.25866519174041297, + "grad_norm": 1.4450424909591675, + "learning_rate": 4.219028796766807e-05, + "loss": 5.0533, + "step": 43493 + }, + { + "epoch": 0.258671139023694, + "grad_norm": 1.6102733612060547, + "learning_rate": 4.2189948814330535e-05, + "loss": 4.7405, + "step": 43494 + }, + { + "epoch": 0.258677086306975, + "grad_norm": 1.8024570941925049, + "learning_rate": 4.218960965499218e-05, + "loss": 4.5802, + "step": 43495 + }, + { + "epoch": 0.25868303359025596, + "grad_norm": 1.766394019126892, + "learning_rate": 4.21892704896531e-05, + "loss": 4.5544, + "step": 43496 + }, + { + "epoch": 0.258688980873537, + "grad_norm": 1.311376690864563, + "learning_rate": 4.2188931318313426e-05, + "loss": 5.2545, + "step": 43497 + }, + { + "epoch": 0.25869492815681794, + "grad_norm": 1.5241717100143433, + "learning_rate": 4.218859214097327e-05, + "loss": 5.2676, + "step": 43498 + }, + { + "epoch": 0.25870087544009895, + "grad_norm": 1.3403328657150269, + "learning_rate": 4.2188252957632756e-05, + "loss": 5.3019, + "step": 43499 + }, + { + "epoch": 0.25870682272337997, + "grad_norm": 1.2095310688018799, + "learning_rate": 4.2187913768292004e-05, + "loss": 5.2215, + "step": 43500 + }, + { + "epoch": 0.25871277000666093, + "grad_norm": 1.2385238409042358, + "learning_rate": 4.218757457295113e-05, + "loss": 5.1297, + "step": 43501 + }, + { + "epoch": 0.25871871728994195, + "grad_norm": 1.396173357963562, + "learning_rate": 4.2187235371610244e-05, + "loss": 5.1275, + "step": 43502 + }, + { + "epoch": 0.25872466457322296, + "grad_norm": 1.4472711086273193, + "learning_rate": 4.2186896164269473e-05, + "loss": 5.1652, + "step": 43503 + }, + { + "epoch": 0.2587306118565039, + "grad_norm": 1.5361378192901611, + "learning_rate": 4.218655695092894e-05, + "loss": 5.1961, + "step": 43504 + }, + { + "epoch": 0.25873655913978494, + "grad_norm": 1.4499026536941528, + "learning_rate": 4.218621773158875e-05, + "loss": 4.3755, + "step": 43505 + }, + { + "epoch": 0.25874250642306595, + "grad_norm": 1.9081149101257324, + "learning_rate": 4.218587850624903e-05, + "loss": 4.6822, + "step": 43506 + }, + { + "epoch": 0.2587484537063469, + "grad_norm": 1.7490051984786987, + "learning_rate": 4.21855392749099e-05, + "loss": 4.9423, + "step": 43507 + }, + { + "epoch": 0.25875440098962793, + "grad_norm": 1.4787182807922363, + "learning_rate": 4.218520003757148e-05, + "loss": 5.0207, + "step": 43508 + }, + { + "epoch": 0.25876034827290895, + "grad_norm": 1.5725704431533813, + "learning_rate": 4.218486079423388e-05, + "loss": 5.189, + "step": 43509 + }, + { + "epoch": 0.2587662955561899, + "grad_norm": 2.0944018363952637, + "learning_rate": 4.218452154489722e-05, + "loss": 5.2323, + "step": 43510 + }, + { + "epoch": 0.2587722428394709, + "grad_norm": 1.8133397102355957, + "learning_rate": 4.218418228956163e-05, + "loss": 4.0283, + "step": 43511 + }, + { + "epoch": 0.25877819012275194, + "grad_norm": 1.5718897581100464, + "learning_rate": 4.218384302822721e-05, + "loss": 5.5057, + "step": 43512 + }, + { + "epoch": 0.2587841374060329, + "grad_norm": 1.6020581722259521, + "learning_rate": 4.218350376089409e-05, + "loss": 5.4354, + "step": 43513 + }, + { + "epoch": 0.2587900846893139, + "grad_norm": 1.7575736045837402, + "learning_rate": 4.218316448756239e-05, + "loss": 5.2237, + "step": 43514 + }, + { + "epoch": 0.25879603197259493, + "grad_norm": 1.7071092128753662, + "learning_rate": 4.218282520823222e-05, + "loss": 4.8575, + "step": 43515 + }, + { + "epoch": 0.2588019792558759, + "grad_norm": 1.4955140352249146, + "learning_rate": 4.2182485922903705e-05, + "loss": 4.8867, + "step": 43516 + }, + { + "epoch": 0.2588079265391569, + "grad_norm": 1.539275050163269, + "learning_rate": 4.218214663157696e-05, + "loss": 4.6438, + "step": 43517 + }, + { + "epoch": 0.2588138738224379, + "grad_norm": 1.5487936735153198, + "learning_rate": 4.218180733425211e-05, + "loss": 5.0964, + "step": 43518 + }, + { + "epoch": 0.2588198211057189, + "grad_norm": 1.4182380437850952, + "learning_rate": 4.218146803092927e-05, + "loss": 5.1198, + "step": 43519 + }, + { + "epoch": 0.2588257683889999, + "grad_norm": 1.3292893171310425, + "learning_rate": 4.2181128721608555e-05, + "loss": 5.0804, + "step": 43520 + }, + { + "epoch": 0.2588317156722809, + "grad_norm": 1.5781522989273071, + "learning_rate": 4.218078940629008e-05, + "loss": 4.8536, + "step": 43521 + }, + { + "epoch": 0.2588376629555619, + "grad_norm": 1.8098161220550537, + "learning_rate": 4.218045008497396e-05, + "loss": 4.7922, + "step": 43522 + }, + { + "epoch": 0.2588436102388429, + "grad_norm": 1.658117413520813, + "learning_rate": 4.2180110757660346e-05, + "loss": 4.1216, + "step": 43523 + }, + { + "epoch": 0.2588495575221239, + "grad_norm": 1.7896106243133545, + "learning_rate": 4.2179771424349323e-05, + "loss": 4.1603, + "step": 43524 + }, + { + "epoch": 0.25885550480540487, + "grad_norm": 1.8870915174484253, + "learning_rate": 4.2179432085041016e-05, + "loss": 4.3535, + "step": 43525 + }, + { + "epoch": 0.2588614520886859, + "grad_norm": 1.8355402946472168, + "learning_rate": 4.2179092739735546e-05, + "loss": 4.2863, + "step": 43526 + }, + { + "epoch": 0.2588673993719669, + "grad_norm": 1.5476624965667725, + "learning_rate": 4.2178753388433035e-05, + "loss": 4.4037, + "step": 43527 + }, + { + "epoch": 0.25887334665524786, + "grad_norm": 1.6192187070846558, + "learning_rate": 4.21784140311336e-05, + "loss": 4.4177, + "step": 43528 + }, + { + "epoch": 0.2588792939385289, + "grad_norm": 1.5931448936462402, + "learning_rate": 4.217807466783736e-05, + "loss": 4.1762, + "step": 43529 + }, + { + "epoch": 0.2588852412218099, + "grad_norm": 1.7006281614303589, + "learning_rate": 4.217773529854443e-05, + "loss": 4.3782, + "step": 43530 + }, + { + "epoch": 0.25889118850509085, + "grad_norm": 1.648154854774475, + "learning_rate": 4.217739592325493e-05, + "loss": 4.2508, + "step": 43531 + }, + { + "epoch": 0.25889713578837187, + "grad_norm": 1.6257084608078003, + "learning_rate": 4.2177056541968976e-05, + "loss": 4.0759, + "step": 43532 + }, + { + "epoch": 0.2589030830716529, + "grad_norm": 1.5800609588623047, + "learning_rate": 4.217671715468669e-05, + "loss": 4.1733, + "step": 43533 + }, + { + "epoch": 0.25890903035493384, + "grad_norm": 1.5987197160720825, + "learning_rate": 4.2176377761408184e-05, + "loss": 4.2616, + "step": 43534 + }, + { + "epoch": 0.25891497763821486, + "grad_norm": 1.9651854038238525, + "learning_rate": 4.217603836213359e-05, + "loss": 4.1646, + "step": 43535 + }, + { + "epoch": 0.2589209249214959, + "grad_norm": 1.5349578857421875, + "learning_rate": 4.217569895686301e-05, + "loss": 5.0326, + "step": 43536 + }, + { + "epoch": 0.25892687220477684, + "grad_norm": 1.569021463394165, + "learning_rate": 4.2175359545596585e-05, + "loss": 5.3457, + "step": 43537 + }, + { + "epoch": 0.25893281948805785, + "grad_norm": 1.886405348777771, + "learning_rate": 4.2175020128334406e-05, + "loss": 5.2578, + "step": 43538 + }, + { + "epoch": 0.25893876677133887, + "grad_norm": 1.6710288524627686, + "learning_rate": 4.217468070507661e-05, + "loss": 4.2893, + "step": 43539 + }, + { + "epoch": 0.25894471405461983, + "grad_norm": 1.803294062614441, + "learning_rate": 4.217434127582331e-05, + "loss": 4.0889, + "step": 43540 + }, + { + "epoch": 0.25895066133790084, + "grad_norm": 1.5589473247528076, + "learning_rate": 4.217400184057463e-05, + "loss": 4.4069, + "step": 43541 + }, + { + "epoch": 0.25895660862118186, + "grad_norm": 1.7611786127090454, + "learning_rate": 4.2173662399330676e-05, + "loss": 4.3062, + "step": 43542 + }, + { + "epoch": 0.2589625559044628, + "grad_norm": 1.7417824268341064, + "learning_rate": 4.2173322952091574e-05, + "loss": 4.1893, + "step": 43543 + }, + { + "epoch": 0.25896850318774384, + "grad_norm": 1.8695626258850098, + "learning_rate": 4.217298349885745e-05, + "loss": 4.1494, + "step": 43544 + }, + { + "epoch": 0.25897445047102485, + "grad_norm": 1.8539936542510986, + "learning_rate": 4.2172644039628406e-05, + "loss": 4.1103, + "step": 43545 + }, + { + "epoch": 0.2589803977543058, + "grad_norm": 1.6804437637329102, + "learning_rate": 4.217230457440458e-05, + "loss": 4.2133, + "step": 43546 + }, + { + "epoch": 0.25898634503758683, + "grad_norm": 1.764915943145752, + "learning_rate": 4.2171965103186075e-05, + "loss": 4.1305, + "step": 43547 + }, + { + "epoch": 0.25899229232086785, + "grad_norm": 1.9837608337402344, + "learning_rate": 4.2171625625973004e-05, + "loss": 4.1788, + "step": 43548 + }, + { + "epoch": 0.2589982396041488, + "grad_norm": 1.8293904066085815, + "learning_rate": 4.21712861427655e-05, + "loss": 3.9959, + "step": 43549 + }, + { + "epoch": 0.2590041868874298, + "grad_norm": 1.7542941570281982, + "learning_rate": 4.217094665356369e-05, + "loss": 3.9396, + "step": 43550 + }, + { + "epoch": 0.25901013417071084, + "grad_norm": 1.745635986328125, + "learning_rate": 4.217060715836767e-05, + "loss": 4.0095, + "step": 43551 + }, + { + "epoch": 0.2590160814539918, + "grad_norm": 1.6854296922683716, + "learning_rate": 4.217026765717757e-05, + "loss": 4.2763, + "step": 43552 + }, + { + "epoch": 0.2590220287372728, + "grad_norm": 1.8780038356781006, + "learning_rate": 4.216992814999351e-05, + "loss": 4.0878, + "step": 43553 + }, + { + "epoch": 0.25902797602055383, + "grad_norm": 1.7932052612304688, + "learning_rate": 4.21695886368156e-05, + "loss": 4.2142, + "step": 43554 + }, + { + "epoch": 0.2590339233038348, + "grad_norm": 2.0965373516082764, + "learning_rate": 4.216924911764396e-05, + "loss": 4.1074, + "step": 43555 + }, + { + "epoch": 0.2590398705871158, + "grad_norm": 2.024886131286621, + "learning_rate": 4.216890959247873e-05, + "loss": 3.9733, + "step": 43556 + }, + { + "epoch": 0.2590458178703968, + "grad_norm": 1.7944450378417969, + "learning_rate": 4.2168570061319994e-05, + "loss": 3.9898, + "step": 43557 + }, + { + "epoch": 0.2590517651536778, + "grad_norm": 1.459204077720642, + "learning_rate": 4.21682305241679e-05, + "loss": 4.4935, + "step": 43558 + }, + { + "epoch": 0.2590577124369588, + "grad_norm": 2.4131886959075928, + "learning_rate": 4.216789098102255e-05, + "loss": 3.9719, + "step": 43559 + }, + { + "epoch": 0.2590636597202398, + "grad_norm": 2.145207405090332, + "learning_rate": 4.216755143188405e-05, + "loss": 4.0188, + "step": 43560 + }, + { + "epoch": 0.2590696070035208, + "grad_norm": 1.9670813083648682, + "learning_rate": 4.216721187675256e-05, + "loss": 3.7985, + "step": 43561 + }, + { + "epoch": 0.2590755542868018, + "grad_norm": 1.6982098817825317, + "learning_rate": 4.216687231562816e-05, + "loss": 3.8662, + "step": 43562 + }, + { + "epoch": 0.2590815015700828, + "grad_norm": 1.8614575862884521, + "learning_rate": 4.216653274851099e-05, + "loss": 3.7156, + "step": 43563 + }, + { + "epoch": 0.25908744885336377, + "grad_norm": 1.7926217317581177, + "learning_rate": 4.2166193175401154e-05, + "loss": 3.8764, + "step": 43564 + }, + { + "epoch": 0.2590933961366448, + "grad_norm": 1.7686653137207031, + "learning_rate": 4.2165853596298774e-05, + "loss": 3.9954, + "step": 43565 + }, + { + "epoch": 0.2590993434199258, + "grad_norm": 1.782195806503296, + "learning_rate": 4.216551401120398e-05, + "loss": 3.9894, + "step": 43566 + }, + { + "epoch": 0.25910529070320676, + "grad_norm": 1.7924538850784302, + "learning_rate": 4.216517442011688e-05, + "loss": 3.9695, + "step": 43567 + }, + { + "epoch": 0.2591112379864878, + "grad_norm": 1.6568260192871094, + "learning_rate": 4.21648348230376e-05, + "loss": 3.966, + "step": 43568 + }, + { + "epoch": 0.2591171852697688, + "grad_norm": 1.7667843103408813, + "learning_rate": 4.216449521996625e-05, + "loss": 4.3729, + "step": 43569 + }, + { + "epoch": 0.25912313255304975, + "grad_norm": 1.8628977537155151, + "learning_rate": 4.2164155610902946e-05, + "loss": 4.1325, + "step": 43570 + }, + { + "epoch": 0.25912907983633077, + "grad_norm": 2.250692129135132, + "learning_rate": 4.216381599584782e-05, + "loss": 3.9909, + "step": 43571 + }, + { + "epoch": 0.2591350271196118, + "grad_norm": 2.053028106689453, + "learning_rate": 4.216347637480098e-05, + "loss": 3.937, + "step": 43572 + }, + { + "epoch": 0.25914097440289274, + "grad_norm": 2.2938973903656006, + "learning_rate": 4.2163136747762544e-05, + "loss": 4.1443, + "step": 43573 + }, + { + "epoch": 0.25914692168617376, + "grad_norm": 2.1817948818206787, + "learning_rate": 4.2162797114732646e-05, + "loss": 3.8868, + "step": 43574 + }, + { + "epoch": 0.2591528689694548, + "grad_norm": 1.7872873544692993, + "learning_rate": 4.216245747571138e-05, + "loss": 3.8975, + "step": 43575 + }, + { + "epoch": 0.25915881625273574, + "grad_norm": 1.8986769914627075, + "learning_rate": 4.216211783069888e-05, + "loss": 3.8121, + "step": 43576 + }, + { + "epoch": 0.25916476353601675, + "grad_norm": 2.033832550048828, + "learning_rate": 4.216177817969527e-05, + "loss": 3.8638, + "step": 43577 + }, + { + "epoch": 0.25917071081929777, + "grad_norm": 1.6750317811965942, + "learning_rate": 4.2161438522700654e-05, + "loss": 3.9407, + "step": 43578 + }, + { + "epoch": 0.2591766581025787, + "grad_norm": 1.6709991693496704, + "learning_rate": 4.216109885971515e-05, + "loss": 4.0518, + "step": 43579 + }, + { + "epoch": 0.25918260538585974, + "grad_norm": 1.7736555337905884, + "learning_rate": 4.2160759190738896e-05, + "loss": 3.8276, + "step": 43580 + }, + { + "epoch": 0.25918855266914076, + "grad_norm": 1.5770437717437744, + "learning_rate": 4.2160419515772e-05, + "loss": 4.7641, + "step": 43581 + }, + { + "epoch": 0.2591944999524217, + "grad_norm": 1.774965524673462, + "learning_rate": 4.216007983481457e-05, + "loss": 5.2038, + "step": 43582 + }, + { + "epoch": 0.25920044723570274, + "grad_norm": 1.6962759494781494, + "learning_rate": 4.2159740147866736e-05, + "loss": 5.2847, + "step": 43583 + }, + { + "epoch": 0.25920639451898375, + "grad_norm": 1.694316029548645, + "learning_rate": 4.215940045492861e-05, + "loss": 3.8762, + "step": 43584 + }, + { + "epoch": 0.2592123418022647, + "grad_norm": 1.754571557044983, + "learning_rate": 4.215906075600032e-05, + "loss": 3.7589, + "step": 43585 + }, + { + "epoch": 0.25921828908554573, + "grad_norm": 1.8617408275604248, + "learning_rate": 4.2158721051081986e-05, + "loss": 4.7452, + "step": 43586 + }, + { + "epoch": 0.25922423636882674, + "grad_norm": 1.7356998920440674, + "learning_rate": 4.2158381340173704e-05, + "loss": 4.4337, + "step": 43587 + }, + { + "epoch": 0.2592301836521077, + "grad_norm": 1.7501081228256226, + "learning_rate": 4.215804162327561e-05, + "loss": 4.7445, + "step": 43588 + }, + { + "epoch": 0.2592361309353887, + "grad_norm": 1.776637315750122, + "learning_rate": 4.215770190038784e-05, + "loss": 4.7991, + "step": 43589 + }, + { + "epoch": 0.25924207821866974, + "grad_norm": 1.5631941556930542, + "learning_rate": 4.215736217151047e-05, + "loss": 4.8362, + "step": 43590 + }, + { + "epoch": 0.2592480255019507, + "grad_norm": 1.7912548780441284, + "learning_rate": 4.215702243664366e-05, + "loss": 4.5902, + "step": 43591 + }, + { + "epoch": 0.2592539727852317, + "grad_norm": 1.3836278915405273, + "learning_rate": 4.21566826957875e-05, + "loss": 4.8491, + "step": 43592 + }, + { + "epoch": 0.25925992006851273, + "grad_norm": 1.3787510395050049, + "learning_rate": 4.215634294894212e-05, + "loss": 4.8103, + "step": 43593 + }, + { + "epoch": 0.2592658673517937, + "grad_norm": 1.6407415866851807, + "learning_rate": 4.215600319610764e-05, + "loss": 4.9311, + "step": 43594 + }, + { + "epoch": 0.2592718146350747, + "grad_norm": 1.57007896900177, + "learning_rate": 4.215566343728418e-05, + "loss": 4.7047, + "step": 43595 + }, + { + "epoch": 0.2592777619183557, + "grad_norm": 1.8817800283432007, + "learning_rate": 4.215532367247186e-05, + "loss": 4.6079, + "step": 43596 + }, + { + "epoch": 0.2592837092016367, + "grad_norm": 1.562889575958252, + "learning_rate": 4.215498390167078e-05, + "loss": 5.0101, + "step": 43597 + }, + { + "epoch": 0.2592896564849177, + "grad_norm": 1.62864351272583, + "learning_rate": 4.215464412488108e-05, + "loss": 4.5876, + "step": 43598 + }, + { + "epoch": 0.2592956037681987, + "grad_norm": 1.5303740501403809, + "learning_rate": 4.215430434210287e-05, + "loss": 4.9113, + "step": 43599 + }, + { + "epoch": 0.2593015510514797, + "grad_norm": 1.7238622903823853, + "learning_rate": 4.2153964553336265e-05, + "loss": 4.4306, + "step": 43600 + }, + { + "epoch": 0.2593074983347607, + "grad_norm": 1.4936445951461792, + "learning_rate": 4.21536247585814e-05, + "loss": 4.9322, + "step": 43601 + }, + { + "epoch": 0.2593134456180417, + "grad_norm": 1.6320984363555908, + "learning_rate": 4.2153284957838367e-05, + "loss": 4.9043, + "step": 43602 + }, + { + "epoch": 0.25931939290132267, + "grad_norm": 1.8814071416854858, + "learning_rate": 4.21529451511073e-05, + "loss": 4.0384, + "step": 43603 + }, + { + "epoch": 0.2593253401846037, + "grad_norm": 1.5768373012542725, + "learning_rate": 4.215260533838833e-05, + "loss": 4.5735, + "step": 43604 + }, + { + "epoch": 0.2593312874678847, + "grad_norm": 1.5696864128112793, + "learning_rate": 4.215226551968156e-05, + "loss": 4.3154, + "step": 43605 + }, + { + "epoch": 0.25933723475116566, + "grad_norm": 2.4774317741394043, + "learning_rate": 4.2151925694987105e-05, + "loss": 3.758, + "step": 43606 + }, + { + "epoch": 0.2593431820344467, + "grad_norm": 2.331195116043091, + "learning_rate": 4.215158586430509e-05, + "loss": 4.0144, + "step": 43607 + }, + { + "epoch": 0.2593491293177277, + "grad_norm": 2.352433681488037, + "learning_rate": 4.215124602763564e-05, + "loss": 3.9787, + "step": 43608 + }, + { + "epoch": 0.25935507660100865, + "grad_norm": 2.1282148361206055, + "learning_rate": 4.215090618497887e-05, + "loss": 3.9029, + "step": 43609 + }, + { + "epoch": 0.25936102388428967, + "grad_norm": 1.967679500579834, + "learning_rate": 4.2150566336334885e-05, + "loss": 3.7754, + "step": 43610 + }, + { + "epoch": 0.2593669711675707, + "grad_norm": 2.3253588676452637, + "learning_rate": 4.2150226481703815e-05, + "loss": 3.8364, + "step": 43611 + }, + { + "epoch": 0.25937291845085164, + "grad_norm": 2.0784995555877686, + "learning_rate": 4.214988662108578e-05, + "loss": 4.1333, + "step": 43612 + }, + { + "epoch": 0.25937886573413266, + "grad_norm": 1.9548943042755127, + "learning_rate": 4.214954675448091e-05, + "loss": 4.3586, + "step": 43613 + }, + { + "epoch": 0.2593848130174136, + "grad_norm": 2.0792782306671143, + "learning_rate": 4.214920688188929e-05, + "loss": 3.9939, + "step": 43614 + }, + { + "epoch": 0.25939076030069463, + "grad_norm": 2.1791350841522217, + "learning_rate": 4.2148867003311074e-05, + "loss": 3.9069, + "step": 43615 + }, + { + "epoch": 0.25939670758397565, + "grad_norm": 2.242664337158203, + "learning_rate": 4.214852711874635e-05, + "loss": 4.2853, + "step": 43616 + }, + { + "epoch": 0.2594026548672566, + "grad_norm": 1.7135580778121948, + "learning_rate": 4.2148187228195265e-05, + "loss": 4.135, + "step": 43617 + }, + { + "epoch": 0.2594086021505376, + "grad_norm": 2.0211315155029297, + "learning_rate": 4.2147847331657924e-05, + "loss": 4.1603, + "step": 43618 + }, + { + "epoch": 0.25941454943381864, + "grad_norm": 2.0321481227874756, + "learning_rate": 4.2147507429134445e-05, + "loss": 3.4679, + "step": 43619 + }, + { + "epoch": 0.2594204967170996, + "grad_norm": 2.029299259185791, + "learning_rate": 4.214716752062495e-05, + "loss": 3.1943, + "step": 43620 + }, + { + "epoch": 0.2594264440003806, + "grad_norm": 1.7095369100570679, + "learning_rate": 4.2146827606129555e-05, + "loss": 3.8634, + "step": 43621 + }, + { + "epoch": 0.25943239128366163, + "grad_norm": 1.7681798934936523, + "learning_rate": 4.2146487685648375e-05, + "loss": 3.7176, + "step": 43622 + }, + { + "epoch": 0.2594383385669426, + "grad_norm": 2.1918227672576904, + "learning_rate": 4.214614775918153e-05, + "loss": 3.3413, + "step": 43623 + }, + { + "epoch": 0.2594442858502236, + "grad_norm": 2.2707924842834473, + "learning_rate": 4.214580782672916e-05, + "loss": 3.629, + "step": 43624 + }, + { + "epoch": 0.2594502331335046, + "grad_norm": 2.106889486312866, + "learning_rate": 4.214546788829135e-05, + "loss": 3.5067, + "step": 43625 + }, + { + "epoch": 0.2594561804167856, + "grad_norm": 2.324676513671875, + "learning_rate": 4.214512794386823e-05, + "loss": 3.8558, + "step": 43626 + }, + { + "epoch": 0.2594621277000666, + "grad_norm": 2.2028746604919434, + "learning_rate": 4.214478799345995e-05, + "loss": 3.6737, + "step": 43627 + }, + { + "epoch": 0.2594680749833476, + "grad_norm": 2.2514851093292236, + "learning_rate": 4.2144448037066576e-05, + "loss": 3.6251, + "step": 43628 + }, + { + "epoch": 0.2594740222666286, + "grad_norm": 2.4968016147613525, + "learning_rate": 4.214410807468826e-05, + "loss": 3.6778, + "step": 43629 + }, + { + "epoch": 0.2594799695499096, + "grad_norm": 2.334904909133911, + "learning_rate": 4.214376810632511e-05, + "loss": 3.271, + "step": 43630 + }, + { + "epoch": 0.2594859168331906, + "grad_norm": 2.111959457397461, + "learning_rate": 4.2143428131977244e-05, + "loss": 3.3424, + "step": 43631 + }, + { + "epoch": 0.25949186411647157, + "grad_norm": 2.0344126224517822, + "learning_rate": 4.21430881516448e-05, + "loss": 3.2893, + "step": 43632 + }, + { + "epoch": 0.2594978113997526, + "grad_norm": 1.4427379369735718, + "learning_rate": 4.214274816532787e-05, + "loss": 4.1771, + "step": 43633 + }, + { + "epoch": 0.2595037586830336, + "grad_norm": 2.1719114780426025, + "learning_rate": 4.2142408173026585e-05, + "loss": 3.3125, + "step": 43634 + }, + { + "epoch": 0.25950970596631456, + "grad_norm": 2.852724552154541, + "learning_rate": 4.214206817474106e-05, + "loss": 3.5465, + "step": 43635 + }, + { + "epoch": 0.2595156532495956, + "grad_norm": 2.8317763805389404, + "learning_rate": 4.2141728170471425e-05, + "loss": 4.033, + "step": 43636 + }, + { + "epoch": 0.2595216005328766, + "grad_norm": 2.5915002822875977, + "learning_rate": 4.2141388160217776e-05, + "loss": 3.683, + "step": 43637 + }, + { + "epoch": 0.25952754781615756, + "grad_norm": 2.9431540966033936, + "learning_rate": 4.214104814398025e-05, + "loss": 3.7083, + "step": 43638 + }, + { + "epoch": 0.25953349509943857, + "grad_norm": 2.7508184909820557, + "learning_rate": 4.214070812175897e-05, + "loss": 3.6067, + "step": 43639 + }, + { + "epoch": 0.2595394423827196, + "grad_norm": 2.2551815509796143, + "learning_rate": 4.2140368093554037e-05, + "loss": 3.7017, + "step": 43640 + }, + { + "epoch": 0.25954538966600055, + "grad_norm": 2.1251437664031982, + "learning_rate": 4.214002805936559e-05, + "loss": 3.6996, + "step": 43641 + }, + { + "epoch": 0.25955133694928156, + "grad_norm": 2.2045071125030518, + "learning_rate": 4.2139688019193726e-05, + "loss": 3.2238, + "step": 43642 + }, + { + "epoch": 0.2595572842325626, + "grad_norm": 1.4603911638259888, + "learning_rate": 4.213934797303858e-05, + "loss": 3.7919, + "step": 43643 + }, + { + "epoch": 0.25956323151584354, + "grad_norm": 1.6459100246429443, + "learning_rate": 4.2139007920900255e-05, + "loss": 3.797, + "step": 43644 + }, + { + "epoch": 0.25956917879912456, + "grad_norm": 1.922767162322998, + "learning_rate": 4.213866786277889e-05, + "loss": 5.0645, + "step": 43645 + }, + { + "epoch": 0.25957512608240557, + "grad_norm": 1.6992758512496948, + "learning_rate": 4.213832779867458e-05, + "loss": 5.2533, + "step": 43646 + }, + { + "epoch": 0.25958107336568653, + "grad_norm": 1.6604686975479126, + "learning_rate": 4.213798772858747e-05, + "loss": 4.915, + "step": 43647 + }, + { + "epoch": 0.25958702064896755, + "grad_norm": 2.1406044960021973, + "learning_rate": 4.213764765251766e-05, + "loss": 4.9198, + "step": 43648 + }, + { + "epoch": 0.25959296793224856, + "grad_norm": 1.5920265913009644, + "learning_rate": 4.213730757046528e-05, + "loss": 5.2211, + "step": 43649 + }, + { + "epoch": 0.2595989152155295, + "grad_norm": 1.344879388809204, + "learning_rate": 4.213696748243044e-05, + "loss": 5.1717, + "step": 43650 + }, + { + "epoch": 0.25960486249881054, + "grad_norm": 1.610145926475525, + "learning_rate": 4.213662738841325e-05, + "loss": 4.8863, + "step": 43651 + }, + { + "epoch": 0.25961080978209156, + "grad_norm": 1.7724106311798096, + "learning_rate": 4.2136287288413854e-05, + "loss": 4.7862, + "step": 43652 + }, + { + "epoch": 0.2596167570653725, + "grad_norm": 1.7007393836975098, + "learning_rate": 4.2135947182432356e-05, + "loss": 4.7812, + "step": 43653 + }, + { + "epoch": 0.25962270434865353, + "grad_norm": 1.512393593788147, + "learning_rate": 4.2135607070468875e-05, + "loss": 4.6225, + "step": 43654 + }, + { + "epoch": 0.25962865163193455, + "grad_norm": 1.5738706588745117, + "learning_rate": 4.2135266952523525e-05, + "loss": 4.562, + "step": 43655 + }, + { + "epoch": 0.2596345989152155, + "grad_norm": 1.752463459968567, + "learning_rate": 4.2134926828596435e-05, + "loss": 4.6895, + "step": 43656 + }, + { + "epoch": 0.2596405461984965, + "grad_norm": 1.6982327699661255, + "learning_rate": 4.213458669868772e-05, + "loss": 4.9289, + "step": 43657 + }, + { + "epoch": 0.25964649348177754, + "grad_norm": 1.4321925640106201, + "learning_rate": 4.213424656279749e-05, + "loss": 4.6239, + "step": 43658 + }, + { + "epoch": 0.2596524407650585, + "grad_norm": 1.5791555643081665, + "learning_rate": 4.213390642092588e-05, + "loss": 4.5527, + "step": 43659 + }, + { + "epoch": 0.2596583880483395, + "grad_norm": 1.6857123374938965, + "learning_rate": 4.2133566273073e-05, + "loss": 4.7893, + "step": 43660 + }, + { + "epoch": 0.25966433533162053, + "grad_norm": 1.5595061779022217, + "learning_rate": 4.2133226119238965e-05, + "loss": 4.382, + "step": 43661 + }, + { + "epoch": 0.2596702826149015, + "grad_norm": 3.4526281356811523, + "learning_rate": 4.2132885959423896e-05, + "loss": 1.7968, + "step": 43662 + }, + { + "epoch": 0.2596762298981825, + "grad_norm": 2.181797504425049, + "learning_rate": 4.213254579362792e-05, + "loss": 3.3448, + "step": 43663 + }, + { + "epoch": 0.2596821771814635, + "grad_norm": 3.2300848960876465, + "learning_rate": 4.2132205621851146e-05, + "loss": 1.4692, + "step": 43664 + }, + { + "epoch": 0.2596881244647445, + "grad_norm": 2.849907875061035, + "learning_rate": 4.2131865444093696e-05, + "loss": 1.414, + "step": 43665 + }, + { + "epoch": 0.2596940717480255, + "grad_norm": 2.633863687515259, + "learning_rate": 4.2131525260355685e-05, + "loss": 1.2473, + "step": 43666 + }, + { + "epoch": 0.2597000190313065, + "grad_norm": 2.2491796016693115, + "learning_rate": 4.213118507063725e-05, + "loss": 1.8509, + "step": 43667 + }, + { + "epoch": 0.2597059663145875, + "grad_norm": 1.9353699684143066, + "learning_rate": 4.213084487493848e-05, + "loss": 3.9734, + "step": 43668 + }, + { + "epoch": 0.2597119135978685, + "grad_norm": 1.7159816026687622, + "learning_rate": 4.213050467325952e-05, + "loss": 5.1665, + "step": 43669 + }, + { + "epoch": 0.2597178608811495, + "grad_norm": 1.5517643690109253, + "learning_rate": 4.2130164465600465e-05, + "loss": 5.5657, + "step": 43670 + }, + { + "epoch": 0.25972380816443047, + "grad_norm": 2.460841178894043, + "learning_rate": 4.212982425196145e-05, + "loss": 3.9037, + "step": 43671 + }, + { + "epoch": 0.2597297554477115, + "grad_norm": 3.088391065597534, + "learning_rate": 4.21294840323426e-05, + "loss": 3.8775, + "step": 43672 + }, + { + "epoch": 0.2597357027309925, + "grad_norm": 3.022916316986084, + "learning_rate": 4.2129143806744015e-05, + "loss": 3.457, + "step": 43673 + }, + { + "epoch": 0.25974165001427346, + "grad_norm": 2.435349702835083, + "learning_rate": 4.212880357516582e-05, + "loss": 3.6603, + "step": 43674 + }, + { + "epoch": 0.2597475972975545, + "grad_norm": 2.296440601348877, + "learning_rate": 4.212846333760815e-05, + "loss": 3.3424, + "step": 43675 + }, + { + "epoch": 0.2597535445808355, + "grad_norm": 2.369565010070801, + "learning_rate": 4.21281230940711e-05, + "loss": 3.7657, + "step": 43676 + }, + { + "epoch": 0.25975949186411645, + "grad_norm": 2.115079641342163, + "learning_rate": 4.2127782844554804e-05, + "loss": 4.4485, + "step": 43677 + }, + { + "epoch": 0.25976543914739747, + "grad_norm": 2.719297170639038, + "learning_rate": 4.2127442589059374e-05, + "loss": 3.977, + "step": 43678 + }, + { + "epoch": 0.2597713864306785, + "grad_norm": 2.200578451156616, + "learning_rate": 4.2127102327584935e-05, + "loss": 3.8632, + "step": 43679 + }, + { + "epoch": 0.25977733371395945, + "grad_norm": 2.363403797149658, + "learning_rate": 4.212676206013159e-05, + "loss": 3.3055, + "step": 43680 + }, + { + "epoch": 0.25978328099724046, + "grad_norm": 2.286637306213379, + "learning_rate": 4.2126421786699476e-05, + "loss": 3.2768, + "step": 43681 + }, + { + "epoch": 0.2597892282805215, + "grad_norm": 2.6021573543548584, + "learning_rate": 4.212608150728871e-05, + "loss": 3.2071, + "step": 43682 + }, + { + "epoch": 0.25979517556380244, + "grad_norm": 2.3232452869415283, + "learning_rate": 4.21257412218994e-05, + "loss": 3.0491, + "step": 43683 + }, + { + "epoch": 0.25980112284708345, + "grad_norm": 2.2177515029907227, + "learning_rate": 4.212540093053167e-05, + "loss": 3.134, + "step": 43684 + }, + { + "epoch": 0.25980707013036447, + "grad_norm": 2.1753644943237305, + "learning_rate": 4.212506063318564e-05, + "loss": 2.9988, + "step": 43685 + }, + { + "epoch": 0.25981301741364543, + "grad_norm": 1.8257017135620117, + "learning_rate": 4.212472032986143e-05, + "loss": 4.059, + "step": 43686 + }, + { + "epoch": 0.25981896469692645, + "grad_norm": 1.8626933097839355, + "learning_rate": 4.212438002055916e-05, + "loss": 5.0625, + "step": 43687 + }, + { + "epoch": 0.25982491198020746, + "grad_norm": 1.682747721672058, + "learning_rate": 4.212403970527894e-05, + "loss": 4.8714, + "step": 43688 + }, + { + "epoch": 0.2598308592634884, + "grad_norm": 2.1634573936462402, + "learning_rate": 4.21236993840209e-05, + "loss": 2.7932, + "step": 43689 + }, + { + "epoch": 0.25983680654676944, + "grad_norm": 2.1826813220977783, + "learning_rate": 4.212335905678515e-05, + "loss": 2.9916, + "step": 43690 + }, + { + "epoch": 0.25984275383005045, + "grad_norm": 2.0527384281158447, + "learning_rate": 4.212301872357181e-05, + "loss": 3.5239, + "step": 43691 + }, + { + "epoch": 0.2598487011133314, + "grad_norm": 2.4769909381866455, + "learning_rate": 4.2122678384381006e-05, + "loss": 3.6413, + "step": 43692 + }, + { + "epoch": 0.25985464839661243, + "grad_norm": 2.3631722927093506, + "learning_rate": 4.212233803921285e-05, + "loss": 3.5735, + "step": 43693 + }, + { + "epoch": 0.25986059567989345, + "grad_norm": 2.2326080799102783, + "learning_rate": 4.212199768806746e-05, + "loss": 3.6424, + "step": 43694 + }, + { + "epoch": 0.2598665429631744, + "grad_norm": 2.1511552333831787, + "learning_rate": 4.2121657330944964e-05, + "loss": 3.0825, + "step": 43695 + }, + { + "epoch": 0.2598724902464554, + "grad_norm": 2.4460346698760986, + "learning_rate": 4.212131696784547e-05, + "loss": 2.9605, + "step": 43696 + }, + { + "epoch": 0.25987843752973644, + "grad_norm": 1.6751445531845093, + "learning_rate": 4.21209765987691e-05, + "loss": 4.1174, + "step": 43697 + }, + { + "epoch": 0.2598843848130174, + "grad_norm": 1.377069115638733, + "learning_rate": 4.2120636223715973e-05, + "loss": 4.244, + "step": 43698 + }, + { + "epoch": 0.2598903320962984, + "grad_norm": 2.1393306255340576, + "learning_rate": 4.212029584268622e-05, + "loss": 2.9108, + "step": 43699 + }, + { + "epoch": 0.25989627937957943, + "grad_norm": 2.0861568450927734, + "learning_rate": 4.211995545567994e-05, + "loss": 2.8975, + "step": 43700 + }, + { + "epoch": 0.2599022266628604, + "grad_norm": 2.4001259803771973, + "learning_rate": 4.2119615062697253e-05, + "loss": 3.1248, + "step": 43701 + }, + { + "epoch": 0.2599081739461414, + "grad_norm": 2.6331357955932617, + "learning_rate": 4.2119274663738294e-05, + "loss": 4.277, + "step": 43702 + }, + { + "epoch": 0.2599141212294224, + "grad_norm": 1.5544040203094482, + "learning_rate": 4.211893425880317e-05, + "loss": 5.0113, + "step": 43703 + }, + { + "epoch": 0.2599200685127034, + "grad_norm": 2.3288755416870117, + "learning_rate": 4.211859384789201e-05, + "loss": 3.1044, + "step": 43704 + }, + { + "epoch": 0.2599260157959844, + "grad_norm": 3.5167014598846436, + "learning_rate": 4.211825343100492e-05, + "loss": 3.3934, + "step": 43705 + }, + { + "epoch": 0.2599319630792654, + "grad_norm": 1.8692322969436646, + "learning_rate": 4.211791300814202e-05, + "loss": 4.689, + "step": 43706 + }, + { + "epoch": 0.2599379103625464, + "grad_norm": 1.5348780155181885, + "learning_rate": 4.2117572579303446e-05, + "loss": 5.8488, + "step": 43707 + }, + { + "epoch": 0.2599438576458274, + "grad_norm": 2.0329549312591553, + "learning_rate": 4.211723214448929e-05, + "loss": 4.3408, + "step": 43708 + }, + { + "epoch": 0.2599498049291084, + "grad_norm": 2.3132266998291016, + "learning_rate": 4.211689170369969e-05, + "loss": 3.8393, + "step": 43709 + }, + { + "epoch": 0.25995575221238937, + "grad_norm": 1.9871792793273926, + "learning_rate": 4.211655125693477e-05, + "loss": 4.9565, + "step": 43710 + }, + { + "epoch": 0.2599616994956704, + "grad_norm": 1.9368735551834106, + "learning_rate": 4.211621080419463e-05, + "loss": 3.6609, + "step": 43711 + }, + { + "epoch": 0.2599676467789514, + "grad_norm": 1.5552960634231567, + "learning_rate": 4.211587034547939e-05, + "loss": 4.8183, + "step": 43712 + }, + { + "epoch": 0.25997359406223236, + "grad_norm": 1.7051454782485962, + "learning_rate": 4.211552988078918e-05, + "loss": 4.8822, + "step": 43713 + }, + { + "epoch": 0.2599795413455134, + "grad_norm": 2.176149606704712, + "learning_rate": 4.2115189410124133e-05, + "loss": 3.2095, + "step": 43714 + }, + { + "epoch": 0.2599854886287944, + "grad_norm": 2.0081160068511963, + "learning_rate": 4.211484893348433e-05, + "loss": 3.5666, + "step": 43715 + }, + { + "epoch": 0.25999143591207535, + "grad_norm": 2.3614957332611084, + "learning_rate": 4.211450845086992e-05, + "loss": 2.5875, + "step": 43716 + }, + { + "epoch": 0.25999738319535637, + "grad_norm": 2.5878303050994873, + "learning_rate": 4.211416796228101e-05, + "loss": 2.3247, + "step": 43717 + }, + { + "epoch": 0.2600033304786374, + "grad_norm": 2.6374716758728027, + "learning_rate": 4.211382746771772e-05, + "loss": 2.044, + "step": 43718 + }, + { + "epoch": 0.26000927776191834, + "grad_norm": 2.2762982845306396, + "learning_rate": 4.211348696718017e-05, + "loss": 2.121, + "step": 43719 + }, + { + "epoch": 0.26001522504519936, + "grad_norm": 2.3474221229553223, + "learning_rate": 4.211314646066848e-05, + "loss": 2.061, + "step": 43720 + }, + { + "epoch": 0.2600211723284804, + "grad_norm": 2.354339122772217, + "learning_rate": 4.2112805948182764e-05, + "loss": 2.2322, + "step": 43721 + }, + { + "epoch": 0.26002711961176134, + "grad_norm": 2.5821359157562256, + "learning_rate": 4.211246542972314e-05, + "loss": 2.0971, + "step": 43722 + }, + { + "epoch": 0.26003306689504235, + "grad_norm": 2.8099939823150635, + "learning_rate": 4.211212490528974e-05, + "loss": 1.9378, + "step": 43723 + }, + { + "epoch": 0.26003901417832337, + "grad_norm": 2.1069459915161133, + "learning_rate": 4.211178437488267e-05, + "loss": 4.7, + "step": 43724 + }, + { + "epoch": 0.26004496146160433, + "grad_norm": 2.3739068508148193, + "learning_rate": 4.2111443838502054e-05, + "loss": 4.4947, + "step": 43725 + }, + { + "epoch": 0.26005090874488535, + "grad_norm": 1.8083316087722778, + "learning_rate": 4.211110329614801e-05, + "loss": 4.9472, + "step": 43726 + }, + { + "epoch": 0.26005685602816636, + "grad_norm": 2.360046625137329, + "learning_rate": 4.2110762747820655e-05, + "loss": 3.62, + "step": 43727 + }, + { + "epoch": 0.2600628033114473, + "grad_norm": 1.7155252695083618, + "learning_rate": 4.211042219352011e-05, + "loss": 4.5742, + "step": 43728 + }, + { + "epoch": 0.26006875059472834, + "grad_norm": 1.7128796577453613, + "learning_rate": 4.21100816332465e-05, + "loss": 4.6703, + "step": 43729 + }, + { + "epoch": 0.2600746978780093, + "grad_norm": 1.6975926160812378, + "learning_rate": 4.210974106699993e-05, + "loss": 4.9525, + "step": 43730 + }, + { + "epoch": 0.2600806451612903, + "grad_norm": 1.753231167793274, + "learning_rate": 4.210940049478053e-05, + "loss": 5.0193, + "step": 43731 + }, + { + "epoch": 0.26008659244457133, + "grad_norm": 1.5660897493362427, + "learning_rate": 4.2109059916588414e-05, + "loss": 4.9803, + "step": 43732 + }, + { + "epoch": 0.2600925397278523, + "grad_norm": 1.7302523851394653, + "learning_rate": 4.2108719332423695e-05, + "loss": 4.7854, + "step": 43733 + }, + { + "epoch": 0.2600984870111333, + "grad_norm": 1.5534487962722778, + "learning_rate": 4.21083787422865e-05, + "loss": 5.3298, + "step": 43734 + }, + { + "epoch": 0.2601044342944143, + "grad_norm": 1.5700414180755615, + "learning_rate": 4.2108038146176956e-05, + "loss": 5.4738, + "step": 43735 + }, + { + "epoch": 0.2601103815776953, + "grad_norm": 1.4145196676254272, + "learning_rate": 4.2107697544095166e-05, + "loss": 5.0322, + "step": 43736 + }, + { + "epoch": 0.2601163288609763, + "grad_norm": 1.388768196105957, + "learning_rate": 4.210735693604126e-05, + "loss": 5.1312, + "step": 43737 + }, + { + "epoch": 0.2601222761442573, + "grad_norm": 1.6465142965316772, + "learning_rate": 4.210701632201535e-05, + "loss": 5.1582, + "step": 43738 + }, + { + "epoch": 0.2601282234275383, + "grad_norm": 1.451076865196228, + "learning_rate": 4.210667570201755e-05, + "loss": 4.9172, + "step": 43739 + }, + { + "epoch": 0.2601341707108193, + "grad_norm": 4.92765998840332, + "learning_rate": 4.2106335076047996e-05, + "loss": 4.2151, + "step": 43740 + }, + { + "epoch": 0.2601401179941003, + "grad_norm": 5.801846504211426, + "learning_rate": 4.21059944441068e-05, + "loss": 3.994, + "step": 43741 + }, + { + "epoch": 0.26014606527738127, + "grad_norm": 5.827388763427734, + "learning_rate": 4.210565380619407e-05, + "loss": 3.5739, + "step": 43742 + }, + { + "epoch": 0.2601520125606623, + "grad_norm": 2.1075680255889893, + "learning_rate": 4.210531316230993e-05, + "loss": 4.7096, + "step": 43743 + }, + { + "epoch": 0.2601579598439433, + "grad_norm": 5.036509037017822, + "learning_rate": 4.210497251245451e-05, + "loss": 3.5512, + "step": 43744 + }, + { + "epoch": 0.26016390712722426, + "grad_norm": 3.6639130115509033, + "learning_rate": 4.210463185662792e-05, + "loss": 3.107, + "step": 43745 + }, + { + "epoch": 0.2601698544105053, + "grad_norm": 3.2976036071777344, + "learning_rate": 4.2104291194830275e-05, + "loss": 3.6668, + "step": 43746 + }, + { + "epoch": 0.2601758016937863, + "grad_norm": 3.5384552478790283, + "learning_rate": 4.21039505270617e-05, + "loss": 2.8666, + "step": 43747 + }, + { + "epoch": 0.26018174897706725, + "grad_norm": 3.216059446334839, + "learning_rate": 4.2103609853322315e-05, + "loss": 3.0724, + "step": 43748 + }, + { + "epoch": 0.26018769626034827, + "grad_norm": 1.756908655166626, + "learning_rate": 4.210326917361224e-05, + "loss": 5.0017, + "step": 43749 + }, + { + "epoch": 0.2601936435436293, + "grad_norm": 1.7715728282928467, + "learning_rate": 4.2102928487931584e-05, + "loss": 5.1715, + "step": 43750 + }, + { + "epoch": 0.26019959082691024, + "grad_norm": 1.5197134017944336, + "learning_rate": 4.210258779628047e-05, + "loss": 4.8331, + "step": 43751 + }, + { + "epoch": 0.26020553811019126, + "grad_norm": 1.820982813835144, + "learning_rate": 4.210224709865902e-05, + "loss": 4.7245, + "step": 43752 + }, + { + "epoch": 0.2602114853934723, + "grad_norm": 2.652844190597534, + "learning_rate": 4.210190639506736e-05, + "loss": 3.6996, + "step": 43753 + }, + { + "epoch": 0.26021743267675324, + "grad_norm": 1.923531413078308, + "learning_rate": 4.2101565685505595e-05, + "loss": 4.3191, + "step": 43754 + }, + { + "epoch": 0.26022337996003425, + "grad_norm": 1.6866570711135864, + "learning_rate": 4.210122496997386e-05, + "loss": 4.1298, + "step": 43755 + }, + { + "epoch": 0.26022932724331527, + "grad_norm": 1.6367350816726685, + "learning_rate": 4.210088424847225e-05, + "loss": 4.2635, + "step": 43756 + }, + { + "epoch": 0.2602352745265962, + "grad_norm": 1.697527527809143, + "learning_rate": 4.2100543521000905e-05, + "loss": 4.9115, + "step": 43757 + }, + { + "epoch": 0.26024122180987724, + "grad_norm": 1.659485101699829, + "learning_rate": 4.210020278755994e-05, + "loss": 5.1207, + "step": 43758 + }, + { + "epoch": 0.26024716909315826, + "grad_norm": 2.436439275741577, + "learning_rate": 4.209986204814946e-05, + "loss": 4.4074, + "step": 43759 + }, + { + "epoch": 0.2602531163764392, + "grad_norm": 2.6073365211486816, + "learning_rate": 4.2099521302769606e-05, + "loss": 4.2211, + "step": 43760 + }, + { + "epoch": 0.26025906365972024, + "grad_norm": 2.4823858737945557, + "learning_rate": 4.209918055142048e-05, + "loss": 4.158, + "step": 43761 + }, + { + "epoch": 0.26026501094300125, + "grad_norm": 2.478266477584839, + "learning_rate": 4.209883979410221e-05, + "loss": 4.0581, + "step": 43762 + }, + { + "epoch": 0.2602709582262822, + "grad_norm": 2.3521065711975098, + "learning_rate": 4.209849903081491e-05, + "loss": 4.0461, + "step": 43763 + }, + { + "epoch": 0.26027690550956323, + "grad_norm": 1.9846135377883911, + "learning_rate": 4.20981582615587e-05, + "loss": 4.3878, + "step": 43764 + }, + { + "epoch": 0.26028285279284424, + "grad_norm": 2.1699838638305664, + "learning_rate": 4.20978174863337e-05, + "loss": 4.2783, + "step": 43765 + }, + { + "epoch": 0.2602888000761252, + "grad_norm": 2.1066083908081055, + "learning_rate": 4.209747670514002e-05, + "loss": 4.2301, + "step": 43766 + }, + { + "epoch": 0.2602947473594062, + "grad_norm": 2.583462715148926, + "learning_rate": 4.20971359179778e-05, + "loss": 3.967, + "step": 43767 + }, + { + "epoch": 0.26030069464268724, + "grad_norm": 2.425373077392578, + "learning_rate": 4.209679512484715e-05, + "loss": 4.1313, + "step": 43768 + }, + { + "epoch": 0.2603066419259682, + "grad_norm": 2.325225830078125, + "learning_rate": 4.209645432574818e-05, + "loss": 4.1912, + "step": 43769 + }, + { + "epoch": 0.2603125892092492, + "grad_norm": 2.2580695152282715, + "learning_rate": 4.209611352068101e-05, + "loss": 3.9999, + "step": 43770 + }, + { + "epoch": 0.26031853649253023, + "grad_norm": 1.9659509658813477, + "learning_rate": 4.209577270964576e-05, + "loss": 4.375, + "step": 43771 + }, + { + "epoch": 0.2603244837758112, + "grad_norm": 2.080805778503418, + "learning_rate": 4.209543189264256e-05, + "loss": 4.9262, + "step": 43772 + }, + { + "epoch": 0.2603304310590922, + "grad_norm": 2.4444894790649414, + "learning_rate": 4.209509106967152e-05, + "loss": 4.1877, + "step": 43773 + }, + { + "epoch": 0.2603363783423732, + "grad_norm": 2.184331178665161, + "learning_rate": 4.209475024073276e-05, + "loss": 4.0013, + "step": 43774 + }, + { + "epoch": 0.2603423256256542, + "grad_norm": 2.173593282699585, + "learning_rate": 4.20944094058264e-05, + "loss": 4.1496, + "step": 43775 + }, + { + "epoch": 0.2603482729089352, + "grad_norm": 2.2656874656677246, + "learning_rate": 4.209406856495256e-05, + "loss": 3.9767, + "step": 43776 + }, + { + "epoch": 0.2603542201922162, + "grad_norm": 2.214384078979492, + "learning_rate": 4.2093727718111355e-05, + "loss": 3.8653, + "step": 43777 + }, + { + "epoch": 0.2603601674754972, + "grad_norm": 1.719295859336853, + "learning_rate": 4.209338686530291e-05, + "loss": 4.5318, + "step": 43778 + }, + { + "epoch": 0.2603661147587782, + "grad_norm": 2.5395655632019043, + "learning_rate": 4.209304600652733e-05, + "loss": 4.1579, + "step": 43779 + }, + { + "epoch": 0.2603720620420592, + "grad_norm": 2.6289501190185547, + "learning_rate": 4.209270514178475e-05, + "loss": 3.8592, + "step": 43780 + }, + { + "epoch": 0.26037800932534017, + "grad_norm": 1.8439676761627197, + "learning_rate": 4.209236427107528e-05, + "loss": 4.4369, + "step": 43781 + }, + { + "epoch": 0.2603839566086212, + "grad_norm": 1.5355761051177979, + "learning_rate": 4.209202339439905e-05, + "loss": 5.1627, + "step": 43782 + }, + { + "epoch": 0.2603899038919022, + "grad_norm": 2.112711191177368, + "learning_rate": 4.209168251175617e-05, + "loss": 4.3452, + "step": 43783 + }, + { + "epoch": 0.26039585117518316, + "grad_norm": 1.5635485649108887, + "learning_rate": 4.209134162314675e-05, + "loss": 4.6171, + "step": 43784 + }, + { + "epoch": 0.2604017984584642, + "grad_norm": 1.8448772430419922, + "learning_rate": 4.209100072857093e-05, + "loss": 4.8536, + "step": 43785 + }, + { + "epoch": 0.2604077457417452, + "grad_norm": 1.7786424160003662, + "learning_rate": 4.2090659828028816e-05, + "loss": 5.274, + "step": 43786 + }, + { + "epoch": 0.26041369302502615, + "grad_norm": 1.5822620391845703, + "learning_rate": 4.209031892152053e-05, + "loss": 4.6732, + "step": 43787 + }, + { + "epoch": 0.26041964030830717, + "grad_norm": 1.780371904373169, + "learning_rate": 4.208997800904619e-05, + "loss": 4.4453, + "step": 43788 + }, + { + "epoch": 0.2604255875915882, + "grad_norm": 2.6257293224334717, + "learning_rate": 4.2089637090605914e-05, + "loss": 4.0128, + "step": 43789 + }, + { + "epoch": 0.26043153487486914, + "grad_norm": 2.9115796089172363, + "learning_rate": 4.2089296166199824e-05, + "loss": 3.8934, + "step": 43790 + }, + { + "epoch": 0.26043748215815016, + "grad_norm": 2.3657307624816895, + "learning_rate": 4.208895523582803e-05, + "loss": 3.999, + "step": 43791 + }, + { + "epoch": 0.2604434294414312, + "grad_norm": 1.7664525508880615, + "learning_rate": 4.2088614299490664e-05, + "loss": 4.2614, + "step": 43792 + }, + { + "epoch": 0.26044937672471213, + "grad_norm": 2.1703991889953613, + "learning_rate": 4.208827335718783e-05, + "loss": 3.8303, + "step": 43793 + }, + { + "epoch": 0.26045532400799315, + "grad_norm": 2.117560386657715, + "learning_rate": 4.208793240891967e-05, + "loss": 3.6489, + "step": 43794 + }, + { + "epoch": 0.26046127129127417, + "grad_norm": 2.260103225708008, + "learning_rate": 4.2087591454686285e-05, + "loss": 3.7058, + "step": 43795 + }, + { + "epoch": 0.2604672185745551, + "grad_norm": 2.047292470932007, + "learning_rate": 4.20872504944878e-05, + "loss": 3.9868, + "step": 43796 + }, + { + "epoch": 0.26047316585783614, + "grad_norm": 1.6423842906951904, + "learning_rate": 4.208690952832433e-05, + "loss": 4.6769, + "step": 43797 + }, + { + "epoch": 0.26047911314111716, + "grad_norm": 1.676830768585205, + "learning_rate": 4.2086568556195996e-05, + "loss": 4.6606, + "step": 43798 + }, + { + "epoch": 0.2604850604243981, + "grad_norm": 1.72567617893219, + "learning_rate": 4.208622757810292e-05, + "loss": 5.113, + "step": 43799 + }, + { + "epoch": 0.26049100770767913, + "grad_norm": 1.5860447883605957, + "learning_rate": 4.2085886594045224e-05, + "loss": 5.4806, + "step": 43800 + }, + { + "epoch": 0.26049695499096015, + "grad_norm": 1.5377877950668335, + "learning_rate": 4.208554560402301e-05, + "loss": 5.0165, + "step": 43801 + }, + { + "epoch": 0.2605029022742411, + "grad_norm": 1.9787931442260742, + "learning_rate": 4.2085204608036414e-05, + "loss": 3.1313, + "step": 43802 + }, + { + "epoch": 0.2605088495575221, + "grad_norm": 1.6187163591384888, + "learning_rate": 4.208486360608555e-05, + "loss": 4.1075, + "step": 43803 + }, + { + "epoch": 0.26051479684080314, + "grad_norm": 1.5007753372192383, + "learning_rate": 4.208452259817053e-05, + "loss": 5.0715, + "step": 43804 + }, + { + "epoch": 0.2605207441240841, + "grad_norm": 2.5438923835754395, + "learning_rate": 4.208418158429148e-05, + "loss": 3.749, + "step": 43805 + }, + { + "epoch": 0.2605266914073651, + "grad_norm": 1.7396166324615479, + "learning_rate": 4.208384056444853e-05, + "loss": 4.6432, + "step": 43806 + }, + { + "epoch": 0.26053263869064613, + "grad_norm": 1.7303906679153442, + "learning_rate": 4.208349953864177e-05, + "loss": 4.5198, + "step": 43807 + }, + { + "epoch": 0.2605385859739271, + "grad_norm": 2.299602746963501, + "learning_rate": 4.2083158506871355e-05, + "loss": 3.7481, + "step": 43808 + }, + { + "epoch": 0.2605445332572081, + "grad_norm": 2.0325980186462402, + "learning_rate": 4.208281746913738e-05, + "loss": 3.6237, + "step": 43809 + }, + { + "epoch": 0.2605504805404891, + "grad_norm": 2.0476815700531006, + "learning_rate": 4.208247642543997e-05, + "loss": 3.297, + "step": 43810 + }, + { + "epoch": 0.2605564278237701, + "grad_norm": 2.013732433319092, + "learning_rate": 4.208213537577924e-05, + "loss": 3.3159, + "step": 43811 + }, + { + "epoch": 0.2605623751070511, + "grad_norm": 1.9484293460845947, + "learning_rate": 4.208179432015532e-05, + "loss": 3.6085, + "step": 43812 + }, + { + "epoch": 0.2605683223903321, + "grad_norm": 2.1379404067993164, + "learning_rate": 4.208145325856832e-05, + "loss": 3.0376, + "step": 43813 + }, + { + "epoch": 0.2605742696736131, + "grad_norm": 2.0082638263702393, + "learning_rate": 4.208111219101834e-05, + "loss": 3.3974, + "step": 43814 + }, + { + "epoch": 0.2605802169568941, + "grad_norm": 2.130889654159546, + "learning_rate": 4.2080771117505546e-05, + "loss": 3.4759, + "step": 43815 + }, + { + "epoch": 0.2605861642401751, + "grad_norm": 2.8863978385925293, + "learning_rate": 4.208043003803003e-05, + "loss": 4.3619, + "step": 43816 + }, + { + "epoch": 0.26059211152345607, + "grad_norm": 1.5749999284744263, + "learning_rate": 4.20800889525919e-05, + "loss": 4.5485, + "step": 43817 + }, + { + "epoch": 0.2605980588067371, + "grad_norm": 1.4774787425994873, + "learning_rate": 4.20797478611913e-05, + "loss": 4.7951, + "step": 43818 + }, + { + "epoch": 0.2606040060900181, + "grad_norm": 1.6331696510314941, + "learning_rate": 4.207940676382833e-05, + "loss": 4.8081, + "step": 43819 + }, + { + "epoch": 0.26060995337329906, + "grad_norm": 1.5321431159973145, + "learning_rate": 4.207906566050311e-05, + "loss": 4.8551, + "step": 43820 + }, + { + "epoch": 0.2606159006565801, + "grad_norm": 1.5664358139038086, + "learning_rate": 4.207872455121578e-05, + "loss": 4.8759, + "step": 43821 + }, + { + "epoch": 0.2606218479398611, + "grad_norm": 1.3907297849655151, + "learning_rate": 4.207838343596643e-05, + "loss": 4.7356, + "step": 43822 + }, + { + "epoch": 0.26062779522314206, + "grad_norm": 1.7805851697921753, + "learning_rate": 4.2078042314755194e-05, + "loss": 4.7503, + "step": 43823 + }, + { + "epoch": 0.26063374250642307, + "grad_norm": 2.3875648975372314, + "learning_rate": 4.20777011875822e-05, + "loss": 3.8172, + "step": 43824 + }, + { + "epoch": 0.2606396897897041, + "grad_norm": 2.2289814949035645, + "learning_rate": 4.2077360054447544e-05, + "loss": 3.0347, + "step": 43825 + }, + { + "epoch": 0.26064563707298505, + "grad_norm": 2.19435453414917, + "learning_rate": 4.2077018915351365e-05, + "loss": 3.2414, + "step": 43826 + }, + { + "epoch": 0.26065158435626606, + "grad_norm": 1.8712583780288696, + "learning_rate": 4.207667777029377e-05, + "loss": 3.6313, + "step": 43827 + }, + { + "epoch": 0.2606575316395471, + "grad_norm": 1.8495346307754517, + "learning_rate": 4.207633661927489e-05, + "loss": 3.8391, + "step": 43828 + }, + { + "epoch": 0.26066347892282804, + "grad_norm": 2.178816795349121, + "learning_rate": 4.207599546229483e-05, + "loss": 3.2323, + "step": 43829 + }, + { + "epoch": 0.26066942620610906, + "grad_norm": 2.3887369632720947, + "learning_rate": 4.207565429935373e-05, + "loss": 2.9307, + "step": 43830 + }, + { + "epoch": 0.26067537348939007, + "grad_norm": 2.2321536540985107, + "learning_rate": 4.2075313130451675e-05, + "loss": 3.6505, + "step": 43831 + }, + { + "epoch": 0.26068132077267103, + "grad_norm": 2.3202054500579834, + "learning_rate": 4.207497195558882e-05, + "loss": 3.5978, + "step": 43832 + }, + { + "epoch": 0.26068726805595205, + "grad_norm": 2.3106048107147217, + "learning_rate": 4.207463077476527e-05, + "loss": 3.6602, + "step": 43833 + }, + { + "epoch": 0.26069321533923306, + "grad_norm": 1.9130654335021973, + "learning_rate": 4.207428958798113e-05, + "loss": 4.8536, + "step": 43834 + }, + { + "epoch": 0.260699162622514, + "grad_norm": 2.1980602741241455, + "learning_rate": 4.207394839523654e-05, + "loss": 3.3797, + "step": 43835 + }, + { + "epoch": 0.26070510990579504, + "grad_norm": 1.928426742553711, + "learning_rate": 4.2073607196531616e-05, + "loss": 3.8108, + "step": 43836 + }, + { + "epoch": 0.26071105718907606, + "grad_norm": 1.960654854774475, + "learning_rate": 4.207326599186646e-05, + "loss": 3.345, + "step": 43837 + }, + { + "epoch": 0.260717004472357, + "grad_norm": 3.0166497230529785, + "learning_rate": 4.2072924781241215e-05, + "loss": 3.6514, + "step": 43838 + }, + { + "epoch": 0.26072295175563803, + "grad_norm": 2.2432823181152344, + "learning_rate": 4.207258356465598e-05, + "loss": 3.3611, + "step": 43839 + }, + { + "epoch": 0.26072889903891905, + "grad_norm": 2.1097803115844727, + "learning_rate": 4.207224234211089e-05, + "loss": 3.0116, + "step": 43840 + }, + { + "epoch": 0.2607348463222, + "grad_norm": 2.332404613494873, + "learning_rate": 4.2071901113606047e-05, + "loss": 3.1982, + "step": 43841 + }, + { + "epoch": 0.260740793605481, + "grad_norm": 2.250012159347534, + "learning_rate": 4.207155987914159e-05, + "loss": 3.1531, + "step": 43842 + }, + { + "epoch": 0.26074674088876204, + "grad_norm": 1.9700276851654053, + "learning_rate": 4.207121863871762e-05, + "loss": 2.9021, + "step": 43843 + }, + { + "epoch": 0.260752688172043, + "grad_norm": 2.3083548545837402, + "learning_rate": 4.207087739233427e-05, + "loss": 3.0583, + "step": 43844 + }, + { + "epoch": 0.260758635455324, + "grad_norm": 1.9300483465194702, + "learning_rate": 4.2070536139991644e-05, + "loss": 2.9899, + "step": 43845 + }, + { + "epoch": 0.260764582738605, + "grad_norm": 1.6486116647720337, + "learning_rate": 4.2070194881689876e-05, + "loss": 4.5022, + "step": 43846 + }, + { + "epoch": 0.260770530021886, + "grad_norm": 2.1554980278015137, + "learning_rate": 4.206985361742908e-05, + "loss": 4.518, + "step": 43847 + }, + { + "epoch": 0.260776477305167, + "grad_norm": 1.7218091487884521, + "learning_rate": 4.2069512347209377e-05, + "loss": 5.2807, + "step": 43848 + }, + { + "epoch": 0.26078242458844797, + "grad_norm": 1.4285657405853271, + "learning_rate": 4.2069171071030876e-05, + "loss": 5.2486, + "step": 43849 + }, + { + "epoch": 0.260788371871729, + "grad_norm": 1.8890986442565918, + "learning_rate": 4.2068829788893716e-05, + "loss": 4.3536, + "step": 43850 + }, + { + "epoch": 0.26079431915501, + "grad_norm": 1.6282395124435425, + "learning_rate": 4.206848850079799e-05, + "loss": 4.6917, + "step": 43851 + }, + { + "epoch": 0.26080026643829096, + "grad_norm": 2.2156565189361572, + "learning_rate": 4.2068147206743836e-05, + "loss": 4.008, + "step": 43852 + }, + { + "epoch": 0.260806213721572, + "grad_norm": 2.854062080383301, + "learning_rate": 4.206780590673137e-05, + "loss": 2.3916, + "step": 43853 + }, + { + "epoch": 0.260812161004853, + "grad_norm": 2.4396989345550537, + "learning_rate": 4.206746460076071e-05, + "loss": 3.4256, + "step": 43854 + }, + { + "epoch": 0.26081810828813395, + "grad_norm": 1.7372397184371948, + "learning_rate": 4.206712328883197e-05, + "loss": 4.6217, + "step": 43855 + }, + { + "epoch": 0.26082405557141497, + "grad_norm": 1.9950940608978271, + "learning_rate": 4.206678197094528e-05, + "loss": 4.1448, + "step": 43856 + }, + { + "epoch": 0.260830002854696, + "grad_norm": 2.4391424655914307, + "learning_rate": 4.2066440647100744e-05, + "loss": 2.297, + "step": 43857 + }, + { + "epoch": 0.26083595013797695, + "grad_norm": 2.6013128757476807, + "learning_rate": 4.20660993172985e-05, + "loss": 2.5795, + "step": 43858 + }, + { + "epoch": 0.26084189742125796, + "grad_norm": 2.5399587154388428, + "learning_rate": 4.206575798153866e-05, + "loss": 2.3713, + "step": 43859 + }, + { + "epoch": 0.260847844704539, + "grad_norm": 1.5722734928131104, + "learning_rate": 4.206541663982132e-05, + "loss": 4.9223, + "step": 43860 + }, + { + "epoch": 0.26085379198781994, + "grad_norm": 1.85478675365448, + "learning_rate": 4.2065075292146634e-05, + "loss": 5.0371, + "step": 43861 + }, + { + "epoch": 0.26085973927110095, + "grad_norm": 1.5921869277954102, + "learning_rate": 4.206473393851471e-05, + "loss": 4.9353, + "step": 43862 + }, + { + "epoch": 0.26086568655438197, + "grad_norm": 2.4613542556762695, + "learning_rate": 4.206439257892565e-05, + "loss": 3.7802, + "step": 43863 + }, + { + "epoch": 0.26087163383766293, + "grad_norm": 1.5190463066101074, + "learning_rate": 4.2064051213379595e-05, + "loss": 4.8565, + "step": 43864 + }, + { + "epoch": 0.26087758112094395, + "grad_norm": 1.6301043033599854, + "learning_rate": 4.206370984187666e-05, + "loss": 4.6807, + "step": 43865 + }, + { + "epoch": 0.26088352840422496, + "grad_norm": 1.2854225635528564, + "learning_rate": 4.206336846441695e-05, + "loss": 4.6531, + "step": 43866 + }, + { + "epoch": 0.2608894756875059, + "grad_norm": 1.557374119758606, + "learning_rate": 4.20630270810006e-05, + "loss": 4.5066, + "step": 43867 + }, + { + "epoch": 0.26089542297078694, + "grad_norm": 1.5259637832641602, + "learning_rate": 4.206268569162773e-05, + "loss": 4.8869, + "step": 43868 + }, + { + "epoch": 0.26090137025406795, + "grad_norm": 1.3940881490707397, + "learning_rate": 4.2062344296298446e-05, + "loss": 4.7398, + "step": 43869 + }, + { + "epoch": 0.2609073175373489, + "grad_norm": 2.021683931350708, + "learning_rate": 4.206200289501287e-05, + "loss": 4.3939, + "step": 43870 + }, + { + "epoch": 0.26091326482062993, + "grad_norm": 1.8435275554656982, + "learning_rate": 4.206166148777113e-05, + "loss": 3.949, + "step": 43871 + }, + { + "epoch": 0.26091921210391095, + "grad_norm": 1.5161638259887695, + "learning_rate": 4.206132007457334e-05, + "loss": 5.0271, + "step": 43872 + }, + { + "epoch": 0.2609251593871919, + "grad_norm": 1.5707337856292725, + "learning_rate": 4.2060978655419616e-05, + "loss": 5.3548, + "step": 43873 + }, + { + "epoch": 0.2609311066704729, + "grad_norm": 1.4160536527633667, + "learning_rate": 4.2060637230310084e-05, + "loss": 5.0665, + "step": 43874 + }, + { + "epoch": 0.26093705395375394, + "grad_norm": 1.8128427267074585, + "learning_rate": 4.206029579924487e-05, + "loss": 4.5791, + "step": 43875 + }, + { + "epoch": 0.2609430012370349, + "grad_norm": 1.8588122129440308, + "learning_rate": 4.2059954362224066e-05, + "loss": 5.0793, + "step": 43876 + }, + { + "epoch": 0.2609489485203159, + "grad_norm": 1.8820576667785645, + "learning_rate": 4.205961291924782e-05, + "loss": 4.7094, + "step": 43877 + }, + { + "epoch": 0.26095489580359693, + "grad_norm": 1.6647305488586426, + "learning_rate": 4.205927147031623e-05, + "loss": 4.7829, + "step": 43878 + }, + { + "epoch": 0.2609608430868779, + "grad_norm": 1.5867985486984253, + "learning_rate": 4.2058930015429424e-05, + "loss": 4.5999, + "step": 43879 + }, + { + "epoch": 0.2609667903701589, + "grad_norm": 2.1225719451904297, + "learning_rate": 4.205858855458753e-05, + "loss": 3.3069, + "step": 43880 + }, + { + "epoch": 0.2609727376534399, + "grad_norm": 1.9657559394836426, + "learning_rate": 4.205824708779066e-05, + "loss": 3.6601, + "step": 43881 + }, + { + "epoch": 0.2609786849367209, + "grad_norm": 1.6870774030685425, + "learning_rate": 4.205790561503892e-05, + "loss": 4.8419, + "step": 43882 + }, + { + "epoch": 0.2609846322200019, + "grad_norm": 1.8193249702453613, + "learning_rate": 4.205756413633245e-05, + "loss": 3.2216, + "step": 43883 + }, + { + "epoch": 0.2609905795032829, + "grad_norm": 1.923021912574768, + "learning_rate": 4.205722265167136e-05, + "loss": 2.9374, + "step": 43884 + }, + { + "epoch": 0.2609965267865639, + "grad_norm": 1.9798295497894287, + "learning_rate": 4.205688116105577e-05, + "loss": 2.8323, + "step": 43885 + }, + { + "epoch": 0.2610024740698449, + "grad_norm": 2.069042682647705, + "learning_rate": 4.205653966448579e-05, + "loss": 3.2655, + "step": 43886 + }, + { + "epoch": 0.2610084213531259, + "grad_norm": 1.8947601318359375, + "learning_rate": 4.205619816196156e-05, + "loss": 2.8884, + "step": 43887 + }, + { + "epoch": 0.26101436863640687, + "grad_norm": 2.0481157302856445, + "learning_rate": 4.2055856653483186e-05, + "loss": 2.7379, + "step": 43888 + }, + { + "epoch": 0.2610203159196879, + "grad_norm": 2.0647590160369873, + "learning_rate": 4.205551513905079e-05, + "loss": 2.9569, + "step": 43889 + }, + { + "epoch": 0.2610262632029689, + "grad_norm": 2.4229438304901123, + "learning_rate": 4.2055173618664485e-05, + "loss": 3.109, + "step": 43890 + }, + { + "epoch": 0.26103221048624986, + "grad_norm": 2.089310646057129, + "learning_rate": 4.20548320923244e-05, + "loss": 2.6263, + "step": 43891 + }, + { + "epoch": 0.2610381577695309, + "grad_norm": 2.0113818645477295, + "learning_rate": 4.205449056003065e-05, + "loss": 2.8855, + "step": 43892 + }, + { + "epoch": 0.2610441050528119, + "grad_norm": 2.0283143520355225, + "learning_rate": 4.205414902178334e-05, + "loss": 3.7071, + "step": 43893 + }, + { + "epoch": 0.26105005233609285, + "grad_norm": 2.313258647918701, + "learning_rate": 4.205380747758261e-05, + "loss": 3.0416, + "step": 43894 + }, + { + "epoch": 0.26105599961937387, + "grad_norm": 2.422583818435669, + "learning_rate": 4.205346592742858e-05, + "loss": 3.1554, + "step": 43895 + }, + { + "epoch": 0.2610619469026549, + "grad_norm": 1.9249426126480103, + "learning_rate": 4.205312437132135e-05, + "loss": 4.0574, + "step": 43896 + }, + { + "epoch": 0.26106789418593584, + "grad_norm": 1.800457239151001, + "learning_rate": 4.205278280926106e-05, + "loss": 4.5929, + "step": 43897 + }, + { + "epoch": 0.26107384146921686, + "grad_norm": 1.8653388023376465, + "learning_rate": 4.2052441241247814e-05, + "loss": 4.7119, + "step": 43898 + }, + { + "epoch": 0.2610797887524979, + "grad_norm": 2.304799795150757, + "learning_rate": 4.205209966728174e-05, + "loss": 3.3567, + "step": 43899 + }, + { + "epoch": 0.26108573603577884, + "grad_norm": 1.7579888105392456, + "learning_rate": 4.205175808736295e-05, + "loss": 4.6017, + "step": 43900 + }, + { + "epoch": 0.26109168331905985, + "grad_norm": 1.787979245185852, + "learning_rate": 4.205141650149157e-05, + "loss": 4.614, + "step": 43901 + }, + { + "epoch": 0.26109763060234087, + "grad_norm": 1.6716340780258179, + "learning_rate": 4.2051074909667724e-05, + "loss": 4.5721, + "step": 43902 + }, + { + "epoch": 0.26110357788562183, + "grad_norm": 1.7041585445404053, + "learning_rate": 4.205073331189151e-05, + "loss": 4.5765, + "step": 43903 + }, + { + "epoch": 0.26110952516890285, + "grad_norm": 2.274689197540283, + "learning_rate": 4.205039170816307e-05, + "loss": 4.5065, + "step": 43904 + }, + { + "epoch": 0.26111547245218386, + "grad_norm": 1.749114751815796, + "learning_rate": 4.205005009848251e-05, + "loss": 4.6173, + "step": 43905 + }, + { + "epoch": 0.2611214197354648, + "grad_norm": 1.5340276956558228, + "learning_rate": 4.204970848284996e-05, + "loss": 4.1062, + "step": 43906 + }, + { + "epoch": 0.26112736701874584, + "grad_norm": 1.4719023704528809, + "learning_rate": 4.2049366861265525e-05, + "loss": 3.8905, + "step": 43907 + }, + { + "epoch": 0.26113331430202685, + "grad_norm": 1.465850591659546, + "learning_rate": 4.2049025233729335e-05, + "loss": 3.8567, + "step": 43908 + }, + { + "epoch": 0.2611392615853078, + "grad_norm": 1.8070909976959229, + "learning_rate": 4.204868360024151e-05, + "loss": 4.6226, + "step": 43909 + }, + { + "epoch": 0.26114520886858883, + "grad_norm": 1.5699423551559448, + "learning_rate": 4.2048341960802165e-05, + "loss": 3.7377, + "step": 43910 + }, + { + "epoch": 0.26115115615186985, + "grad_norm": 1.4250156879425049, + "learning_rate": 4.2048000315411415e-05, + "loss": 3.6397, + "step": 43911 + }, + { + "epoch": 0.2611571034351508, + "grad_norm": 1.437283992767334, + "learning_rate": 4.204765866406939e-05, + "loss": 3.9927, + "step": 43912 + }, + { + "epoch": 0.2611630507184318, + "grad_norm": 1.393722414970398, + "learning_rate": 4.20473170067762e-05, + "loss": 3.5361, + "step": 43913 + }, + { + "epoch": 0.26116899800171284, + "grad_norm": 1.5683196783065796, + "learning_rate": 4.204697534353197e-05, + "loss": 3.5249, + "step": 43914 + }, + { + "epoch": 0.2611749452849938, + "grad_norm": 1.4112268686294556, + "learning_rate": 4.204663367433681e-05, + "loss": 3.7224, + "step": 43915 + }, + { + "epoch": 0.2611808925682748, + "grad_norm": 1.5035923719406128, + "learning_rate": 4.2046291999190856e-05, + "loss": 3.6233, + "step": 43916 + }, + { + "epoch": 0.26118683985155583, + "grad_norm": 1.5066548585891724, + "learning_rate": 4.204595031809421e-05, + "loss": 3.7876, + "step": 43917 + }, + { + "epoch": 0.2611927871348368, + "grad_norm": 1.63347327709198, + "learning_rate": 4.2045608631047004e-05, + "loss": 3.5837, + "step": 43918 + }, + { + "epoch": 0.2611987344181178, + "grad_norm": 1.5301264524459839, + "learning_rate": 4.204526693804936e-05, + "loss": 3.496, + "step": 43919 + }, + { + "epoch": 0.2612046817013988, + "grad_norm": 1.5526230335235596, + "learning_rate": 4.204492523910137e-05, + "loss": 3.6315, + "step": 43920 + }, + { + "epoch": 0.2612106289846798, + "grad_norm": 1.525733232498169, + "learning_rate": 4.204458353420318e-05, + "loss": 3.541, + "step": 43921 + }, + { + "epoch": 0.2612165762679608, + "grad_norm": 1.5471832752227783, + "learning_rate": 4.2044241823354905e-05, + "loss": 3.5783, + "step": 43922 + }, + { + "epoch": 0.2612225235512418, + "grad_norm": 1.6249003410339355, + "learning_rate": 4.204390010655667e-05, + "loss": 3.3877, + "step": 43923 + }, + { + "epoch": 0.2612284708345228, + "grad_norm": 1.4975247383117676, + "learning_rate": 4.204355838380857e-05, + "loss": 3.596, + "step": 43924 + }, + { + "epoch": 0.2612344181178038, + "grad_norm": 1.6215808391571045, + "learning_rate": 4.204321665511075e-05, + "loss": 3.309, + "step": 43925 + }, + { + "epoch": 0.2612403654010848, + "grad_norm": 1.5863890647888184, + "learning_rate": 4.204287492046332e-05, + "loss": 3.3916, + "step": 43926 + }, + { + "epoch": 0.26124631268436577, + "grad_norm": 1.5267432928085327, + "learning_rate": 4.204253317986638e-05, + "loss": 3.6247, + "step": 43927 + }, + { + "epoch": 0.2612522599676468, + "grad_norm": 1.6551170349121094, + "learning_rate": 4.204219143332009e-05, + "loss": 3.6105, + "step": 43928 + }, + { + "epoch": 0.2612582072509278, + "grad_norm": 1.6001818180084229, + "learning_rate": 4.204184968082453e-05, + "loss": 3.4658, + "step": 43929 + }, + { + "epoch": 0.26126415453420876, + "grad_norm": 1.565783143043518, + "learning_rate": 4.2041507922379855e-05, + "loss": 3.7963, + "step": 43930 + }, + { + "epoch": 0.2612701018174898, + "grad_norm": 1.412794828414917, + "learning_rate": 4.204116615798615e-05, + "loss": 3.5528, + "step": 43931 + }, + { + "epoch": 0.2612760491007708, + "grad_norm": 1.5581347942352295, + "learning_rate": 4.2040824387643556e-05, + "loss": 3.4484, + "step": 43932 + }, + { + "epoch": 0.26128199638405175, + "grad_norm": 1.90574312210083, + "learning_rate": 4.2040482611352185e-05, + "loss": 4.1196, + "step": 43933 + }, + { + "epoch": 0.26128794366733277, + "grad_norm": 1.60520339012146, + "learning_rate": 4.204014082911216e-05, + "loss": 4.1148, + "step": 43934 + }, + { + "epoch": 0.2612938909506138, + "grad_norm": 1.5585938692092896, + "learning_rate": 4.203979904092359e-05, + "loss": 3.4158, + "step": 43935 + }, + { + "epoch": 0.26129983823389474, + "grad_norm": 1.6262990236282349, + "learning_rate": 4.203945724678661e-05, + "loss": 3.4676, + "step": 43936 + }, + { + "epoch": 0.26130578551717576, + "grad_norm": 1.6183390617370605, + "learning_rate": 4.203911544670134e-05, + "loss": 3.4177, + "step": 43937 + }, + { + "epoch": 0.2613117328004568, + "grad_norm": 1.4017186164855957, + "learning_rate": 4.203877364066787e-05, + "loss": 3.5616, + "step": 43938 + }, + { + "epoch": 0.26131768008373774, + "grad_norm": 1.4669580459594727, + "learning_rate": 4.2038431828686355e-05, + "loss": 3.3555, + "step": 43939 + }, + { + "epoch": 0.26132362736701875, + "grad_norm": 1.584997534751892, + "learning_rate": 4.203809001075689e-05, + "loss": 3.3528, + "step": 43940 + }, + { + "epoch": 0.26132957465029977, + "grad_norm": 1.6068940162658691, + "learning_rate": 4.203774818687961e-05, + "loss": 3.3479, + "step": 43941 + }, + { + "epoch": 0.26133552193358073, + "grad_norm": 1.5392388105392456, + "learning_rate": 4.203740635705463e-05, + "loss": 3.5802, + "step": 43942 + }, + { + "epoch": 0.26134146921686174, + "grad_norm": 1.5036492347717285, + "learning_rate": 4.2037064521282066e-05, + "loss": 3.5218, + "step": 43943 + }, + { + "epoch": 0.26134741650014276, + "grad_norm": 1.4162819385528564, + "learning_rate": 4.2036722679562035e-05, + "loss": 3.6952, + "step": 43944 + }, + { + "epoch": 0.2613533637834237, + "grad_norm": 1.196006178855896, + "learning_rate": 4.203638083189466e-05, + "loss": 3.8799, + "step": 43945 + }, + { + "epoch": 0.26135931106670474, + "grad_norm": 1.8164929151535034, + "learning_rate": 4.2036038978280065e-05, + "loss": 4.3211, + "step": 43946 + }, + { + "epoch": 0.26136525834998575, + "grad_norm": 2.5604140758514404, + "learning_rate": 4.203569711871836e-05, + "loss": 4.2261, + "step": 43947 + }, + { + "epoch": 0.2613712056332667, + "grad_norm": 1.8493188619613647, + "learning_rate": 4.2035355253209675e-05, + "loss": 4.2201, + "step": 43948 + }, + { + "epoch": 0.26137715291654773, + "grad_norm": 1.8889459371566772, + "learning_rate": 4.2035013381754126e-05, + "loss": 4.2093, + "step": 43949 + }, + { + "epoch": 0.26138310019982874, + "grad_norm": 1.8021023273468018, + "learning_rate": 4.2034671504351816e-05, + "loss": 4.5704, + "step": 43950 + }, + { + "epoch": 0.2613890474831097, + "grad_norm": 1.7852281332015991, + "learning_rate": 4.2034329621002894e-05, + "loss": 4.4172, + "step": 43951 + }, + { + "epoch": 0.2613949947663907, + "grad_norm": 1.687654733657837, + "learning_rate": 4.2033987731707455e-05, + "loss": 4.858, + "step": 43952 + }, + { + "epoch": 0.26140094204967174, + "grad_norm": 1.8646222352981567, + "learning_rate": 4.203364583646563e-05, + "loss": 3.9498, + "step": 43953 + }, + { + "epoch": 0.2614068893329527, + "grad_norm": 2.1273350715637207, + "learning_rate": 4.2033303935277535e-05, + "loss": 3.115, + "step": 43954 + }, + { + "epoch": 0.2614128366162337, + "grad_norm": 1.9874173402786255, + "learning_rate": 4.2032962028143285e-05, + "loss": 3.1408, + "step": 43955 + }, + { + "epoch": 0.26141878389951473, + "grad_norm": 1.9708490371704102, + "learning_rate": 4.2032620115063005e-05, + "loss": 2.9456, + "step": 43956 + }, + { + "epoch": 0.2614247311827957, + "grad_norm": 1.8360321521759033, + "learning_rate": 4.203227819603682e-05, + "loss": 3.7938, + "step": 43957 + }, + { + "epoch": 0.2614306784660767, + "grad_norm": 1.788934350013733, + "learning_rate": 4.203193627106483e-05, + "loss": 4.7568, + "step": 43958 + }, + { + "epoch": 0.2614366257493577, + "grad_norm": 1.6715306043624878, + "learning_rate": 4.203159434014718e-05, + "loss": 4.6878, + "step": 43959 + }, + { + "epoch": 0.2614425730326387, + "grad_norm": 1.7724926471710205, + "learning_rate": 4.2031252403283974e-05, + "loss": 4.2026, + "step": 43960 + }, + { + "epoch": 0.2614485203159197, + "grad_norm": 1.4331222772598267, + "learning_rate": 4.2030910460475325e-05, + "loss": 5.1872, + "step": 43961 + }, + { + "epoch": 0.2614544675992007, + "grad_norm": 1.5300036668777466, + "learning_rate": 4.203056851172137e-05, + "loss": 5.1307, + "step": 43962 + }, + { + "epoch": 0.2614604148824817, + "grad_norm": 2.108797311782837, + "learning_rate": 4.203022655702221e-05, + "loss": 3.7719, + "step": 43963 + }, + { + "epoch": 0.2614663621657627, + "grad_norm": 1.623107671737671, + "learning_rate": 4.202988459637799e-05, + "loss": 4.778, + "step": 43964 + }, + { + "epoch": 0.26147230944904365, + "grad_norm": 1.6697406768798828, + "learning_rate": 4.20295426297888e-05, + "loss": 4.2759, + "step": 43965 + }, + { + "epoch": 0.26147825673232467, + "grad_norm": 1.6436759233474731, + "learning_rate": 4.2029200657254776e-05, + "loss": 4.4209, + "step": 43966 + }, + { + "epoch": 0.2614842040156057, + "grad_norm": 1.646278977394104, + "learning_rate": 4.202885867877604e-05, + "loss": 4.842, + "step": 43967 + }, + { + "epoch": 0.26149015129888664, + "grad_norm": 1.7414830923080444, + "learning_rate": 4.202851669435269e-05, + "loss": 4.8518, + "step": 43968 + }, + { + "epoch": 0.26149609858216766, + "grad_norm": 1.4670186042785645, + "learning_rate": 4.202817470398487e-05, + "loss": 4.8397, + "step": 43969 + }, + { + "epoch": 0.2615020458654487, + "grad_norm": 1.5615248680114746, + "learning_rate": 4.2027832707672695e-05, + "loss": 4.876, + "step": 43970 + }, + { + "epoch": 0.26150799314872963, + "grad_norm": 1.5006674528121948, + "learning_rate": 4.202749070541627e-05, + "loss": 5.063, + "step": 43971 + }, + { + "epoch": 0.26151394043201065, + "grad_norm": 1.7002580165863037, + "learning_rate": 4.202714869721574e-05, + "loss": 4.7634, + "step": 43972 + }, + { + "epoch": 0.26151988771529167, + "grad_norm": 1.5361725091934204, + "learning_rate": 4.2026806683071186e-05, + "loss": 4.8082, + "step": 43973 + }, + { + "epoch": 0.2615258349985726, + "grad_norm": 1.5829212665557861, + "learning_rate": 4.202646466298276e-05, + "loss": 4.8985, + "step": 43974 + }, + { + "epoch": 0.26153178228185364, + "grad_norm": 2.924140691757202, + "learning_rate": 4.202612263695057e-05, + "loss": 2.0461, + "step": 43975 + }, + { + "epoch": 0.26153772956513466, + "grad_norm": 2.8245744705200195, + "learning_rate": 4.202578060497474e-05, + "loss": 2.2129, + "step": 43976 + }, + { + "epoch": 0.2615436768484156, + "grad_norm": 2.5161335468292236, + "learning_rate": 4.2025438567055385e-05, + "loss": 2.0474, + "step": 43977 + }, + { + "epoch": 0.26154962413169663, + "grad_norm": 2.600510358810425, + "learning_rate": 4.202509652319263e-05, + "loss": 2.183, + "step": 43978 + }, + { + "epoch": 0.26155557141497765, + "grad_norm": 2.7661221027374268, + "learning_rate": 4.202475447338658e-05, + "loss": 1.7626, + "step": 43979 + }, + { + "epoch": 0.2615615186982586, + "grad_norm": 2.585782051086426, + "learning_rate": 4.202441241763737e-05, + "loss": 2.0485, + "step": 43980 + }, + { + "epoch": 0.2615674659815396, + "grad_norm": 2.9208240509033203, + "learning_rate": 4.202407035594511e-05, + "loss": 2.1489, + "step": 43981 + }, + { + "epoch": 0.26157341326482064, + "grad_norm": 2.608488082885742, + "learning_rate": 4.202372828830993e-05, + "loss": 1.8628, + "step": 43982 + }, + { + "epoch": 0.2615793605481016, + "grad_norm": 2.4945156574249268, + "learning_rate": 4.202338621473193e-05, + "loss": 1.7749, + "step": 43983 + }, + { + "epoch": 0.2615853078313826, + "grad_norm": 3.2789676189422607, + "learning_rate": 4.2023044135211254e-05, + "loss": 2.554, + "step": 43984 + }, + { + "epoch": 0.26159125511466363, + "grad_norm": 3.394545793533325, + "learning_rate": 4.2022702049748e-05, + "loss": 2.4436, + "step": 43985 + }, + { + "epoch": 0.2615972023979446, + "grad_norm": 2.576068162918091, + "learning_rate": 4.20223599583423e-05, + "loss": 2.8879, + "step": 43986 + }, + { + "epoch": 0.2616031496812256, + "grad_norm": 1.637431025505066, + "learning_rate": 4.202201786099427e-05, + "loss": 5.2305, + "step": 43987 + }, + { + "epoch": 0.2616090969645066, + "grad_norm": 1.5969815254211426, + "learning_rate": 4.202167575770403e-05, + "loss": 5.0085, + "step": 43988 + }, + { + "epoch": 0.2616150442477876, + "grad_norm": 1.6447721719741821, + "learning_rate": 4.20213336484717e-05, + "loss": 5.2214, + "step": 43989 + }, + { + "epoch": 0.2616209915310686, + "grad_norm": 1.4363104104995728, + "learning_rate": 4.20209915332974e-05, + "loss": 5.189, + "step": 43990 + }, + { + "epoch": 0.2616269388143496, + "grad_norm": 1.7469923496246338, + "learning_rate": 4.2020649412181244e-05, + "loss": 4.5397, + "step": 43991 + }, + { + "epoch": 0.2616328860976306, + "grad_norm": 1.6246528625488281, + "learning_rate": 4.202030728512335e-05, + "loss": 4.3031, + "step": 43992 + }, + { + "epoch": 0.2616388333809116, + "grad_norm": 1.6394681930541992, + "learning_rate": 4.201996515212385e-05, + "loss": 4.5781, + "step": 43993 + }, + { + "epoch": 0.2616447806641926, + "grad_norm": 1.75271737575531, + "learning_rate": 4.201962301318286e-05, + "loss": 4.9676, + "step": 43994 + }, + { + "epoch": 0.26165072794747357, + "grad_norm": 1.4730366468429565, + "learning_rate": 4.2019280868300486e-05, + "loss": 4.5153, + "step": 43995 + }, + { + "epoch": 0.2616566752307546, + "grad_norm": 2.661264181137085, + "learning_rate": 4.201893871747686e-05, + "loss": 4.8221, + "step": 43996 + }, + { + "epoch": 0.2616626225140356, + "grad_norm": 1.8854268789291382, + "learning_rate": 4.20185965607121e-05, + "loss": 4.5045, + "step": 43997 + }, + { + "epoch": 0.26166856979731656, + "grad_norm": 2.658061981201172, + "learning_rate": 4.2018254398006326e-05, + "loss": 4.4132, + "step": 43998 + }, + { + "epoch": 0.2616745170805976, + "grad_norm": 3.223952293395996, + "learning_rate": 4.201791222935965e-05, + "loss": 4.0028, + "step": 43999 + }, + { + "epoch": 0.2616804643638786, + "grad_norm": 3.2344250679016113, + "learning_rate": 4.20175700547722e-05, + "loss": 3.872, + "step": 44000 + }, + { + "epoch": 0.26168641164715956, + "grad_norm": 2.717089891433716, + "learning_rate": 4.201722787424409e-05, + "loss": 3.4169, + "step": 44001 + }, + { + "epoch": 0.26169235893044057, + "grad_norm": 2.401637077331543, + "learning_rate": 4.2016885687775445e-05, + "loss": 3.7274, + "step": 44002 + }, + { + "epoch": 0.2616983062137216, + "grad_norm": 2.4535679817199707, + "learning_rate": 4.201654349536638e-05, + "loss": 3.7893, + "step": 44003 + }, + { + "epoch": 0.26170425349700255, + "grad_norm": 2.252840042114258, + "learning_rate": 4.201620129701701e-05, + "loss": 3.0338, + "step": 44004 + }, + { + "epoch": 0.26171020078028356, + "grad_norm": 2.381234645843506, + "learning_rate": 4.201585909272746e-05, + "loss": 2.8514, + "step": 44005 + }, + { + "epoch": 0.2617161480635646, + "grad_norm": 2.1162943840026855, + "learning_rate": 4.201551688249785e-05, + "loss": 2.9103, + "step": 44006 + }, + { + "epoch": 0.26172209534684554, + "grad_norm": 2.1789536476135254, + "learning_rate": 4.2015174666328306e-05, + "loss": 3.3951, + "step": 44007 + }, + { + "epoch": 0.26172804263012656, + "grad_norm": 2.4236698150634766, + "learning_rate": 4.2014832444218945e-05, + "loss": 3.1719, + "step": 44008 + }, + { + "epoch": 0.26173398991340757, + "grad_norm": 2.2837328910827637, + "learning_rate": 4.2014490216169864e-05, + "loss": 3.5952, + "step": 44009 + }, + { + "epoch": 0.26173993719668853, + "grad_norm": 2.083569049835205, + "learning_rate": 4.201414798218121e-05, + "loss": 3.8288, + "step": 44010 + }, + { + "epoch": 0.26174588447996955, + "grad_norm": 1.7045209407806396, + "learning_rate": 4.2013805742253096e-05, + "loss": 4.6409, + "step": 44011 + }, + { + "epoch": 0.26175183176325056, + "grad_norm": 1.6923311948776245, + "learning_rate": 4.201346349638563e-05, + "loss": 4.5084, + "step": 44012 + }, + { + "epoch": 0.2617577790465315, + "grad_norm": 2.2869343757629395, + "learning_rate": 4.2013121244578944e-05, + "loss": 2.8791, + "step": 44013 + }, + { + "epoch": 0.26176372632981254, + "grad_norm": 1.988997459411621, + "learning_rate": 4.201277898683316e-05, + "loss": 3.0681, + "step": 44014 + }, + { + "epoch": 0.26176967361309356, + "grad_norm": 2.3942532539367676, + "learning_rate": 4.201243672314838e-05, + "loss": 2.9469, + "step": 44015 + }, + { + "epoch": 0.2617756208963745, + "grad_norm": 2.356499433517456, + "learning_rate": 4.201209445352474e-05, + "loss": 2.9252, + "step": 44016 + }, + { + "epoch": 0.26178156817965553, + "grad_norm": 2.2919883728027344, + "learning_rate": 4.2011752177962346e-05, + "loss": 2.9315, + "step": 44017 + }, + { + "epoch": 0.26178751546293655, + "grad_norm": 2.456901788711548, + "learning_rate": 4.201140989646133e-05, + "loss": 3.4781, + "step": 44018 + }, + { + "epoch": 0.2617934627462175, + "grad_norm": 1.7106175422668457, + "learning_rate": 4.201106760902181e-05, + "loss": 4.5234, + "step": 44019 + }, + { + "epoch": 0.2617994100294985, + "grad_norm": 1.9880445003509521, + "learning_rate": 4.20107253156439e-05, + "loss": 4.8552, + "step": 44020 + }, + { + "epoch": 0.26180535731277954, + "grad_norm": 1.497719407081604, + "learning_rate": 4.201038301632772e-05, + "loss": 5.0637, + "step": 44021 + }, + { + "epoch": 0.2618113045960605, + "grad_norm": 1.3691424131393433, + "learning_rate": 4.2010040711073394e-05, + "loss": 5.242, + "step": 44022 + }, + { + "epoch": 0.2618172518793415, + "grad_norm": 1.4491578340530396, + "learning_rate": 4.2009698399881036e-05, + "loss": 4.7637, + "step": 44023 + }, + { + "epoch": 0.26182319916262253, + "grad_norm": 1.5590442419052124, + "learning_rate": 4.2009356082750766e-05, + "loss": 4.6588, + "step": 44024 + }, + { + "epoch": 0.2618291464459035, + "grad_norm": 1.5390005111694336, + "learning_rate": 4.200901375968271e-05, + "loss": 4.6129, + "step": 44025 + }, + { + "epoch": 0.2618350937291845, + "grad_norm": 1.497473120689392, + "learning_rate": 4.200867143067698e-05, + "loss": 4.5559, + "step": 44026 + }, + { + "epoch": 0.2618410410124655, + "grad_norm": 1.5687944889068604, + "learning_rate": 4.20083290957337e-05, + "loss": 4.5712, + "step": 44027 + }, + { + "epoch": 0.2618469882957465, + "grad_norm": 1.6586084365844727, + "learning_rate": 4.2007986754852984e-05, + "loss": 4.736, + "step": 44028 + }, + { + "epoch": 0.2618529355790275, + "grad_norm": 2.502234935760498, + "learning_rate": 4.200764440803496e-05, + "loss": 3.851, + "step": 44029 + }, + { + "epoch": 0.2618588828623085, + "grad_norm": 3.4344186782836914, + "learning_rate": 4.200730205527974e-05, + "loss": 3.9768, + "step": 44030 + }, + { + "epoch": 0.2618648301455895, + "grad_norm": 2.6750826835632324, + "learning_rate": 4.200695969658745e-05, + "loss": 3.1349, + "step": 44031 + }, + { + "epoch": 0.2618707774288705, + "grad_norm": 2.6792654991149902, + "learning_rate": 4.200661733195821e-05, + "loss": 3.0674, + "step": 44032 + }, + { + "epoch": 0.2618767247121515, + "grad_norm": 2.531879186630249, + "learning_rate": 4.200627496139212e-05, + "loss": 3.1366, + "step": 44033 + }, + { + "epoch": 0.26188267199543247, + "grad_norm": 2.5653562545776367, + "learning_rate": 4.2005932584889326e-05, + "loss": 3.7536, + "step": 44034 + }, + { + "epoch": 0.2618886192787135, + "grad_norm": 2.6923725605010986, + "learning_rate": 4.2005590202449944e-05, + "loss": 3.662, + "step": 44035 + }, + { + "epoch": 0.2618945665619945, + "grad_norm": 2.7154061794281006, + "learning_rate": 4.200524781407408e-05, + "loss": 3.6434, + "step": 44036 + }, + { + "epoch": 0.26190051384527546, + "grad_norm": 2.4629621505737305, + "learning_rate": 4.2004905419761856e-05, + "loss": 3.0537, + "step": 44037 + }, + { + "epoch": 0.2619064611285565, + "grad_norm": 2.3902618885040283, + "learning_rate": 4.2004563019513396e-05, + "loss": 3.6104, + "step": 44038 + }, + { + "epoch": 0.2619124084118375, + "grad_norm": 2.2485692501068115, + "learning_rate": 4.200422061332881e-05, + "loss": 3.7009, + "step": 44039 + }, + { + "epoch": 0.26191835569511845, + "grad_norm": 1.7479822635650635, + "learning_rate": 4.2003878201208244e-05, + "loss": 5.0112, + "step": 44040 + }, + { + "epoch": 0.26192430297839947, + "grad_norm": 1.7502336502075195, + "learning_rate": 4.200353578315179e-05, + "loss": 4.9218, + "step": 44041 + }, + { + "epoch": 0.2619302502616805, + "grad_norm": 2.1153388023376465, + "learning_rate": 4.200319335915957e-05, + "loss": 4.4507, + "step": 44042 + }, + { + "epoch": 0.26193619754496145, + "grad_norm": 2.748811721801758, + "learning_rate": 4.200285092923173e-05, + "loss": 4.0147, + "step": 44043 + }, + { + "epoch": 0.26194214482824246, + "grad_norm": 2.679307699203491, + "learning_rate": 4.2002508493368365e-05, + "loss": 3.791, + "step": 44044 + }, + { + "epoch": 0.2619480921115235, + "grad_norm": 2.5827720165252686, + "learning_rate": 4.200216605156959e-05, + "loss": 3.8473, + "step": 44045 + }, + { + "epoch": 0.26195403939480444, + "grad_norm": 1.9016433954238892, + "learning_rate": 4.200182360383554e-05, + "loss": 4.467, + "step": 44046 + }, + { + "epoch": 0.26195998667808545, + "grad_norm": 1.6765040159225464, + "learning_rate": 4.200148115016633e-05, + "loss": 5.0843, + "step": 44047 + }, + { + "epoch": 0.26196593396136647, + "grad_norm": 1.6655350923538208, + "learning_rate": 4.200113869056208e-05, + "loss": 4.9477, + "step": 44048 + }, + { + "epoch": 0.26197188124464743, + "grad_norm": 2.3625662326812744, + "learning_rate": 4.2000796225022897e-05, + "loss": 4.335, + "step": 44049 + }, + { + "epoch": 0.26197782852792845, + "grad_norm": 2.006031036376953, + "learning_rate": 4.2000453753548926e-05, + "loss": 4.1622, + "step": 44050 + }, + { + "epoch": 0.26198377581120946, + "grad_norm": 2.2358195781707764, + "learning_rate": 4.2000111276140264e-05, + "loss": 4.144, + "step": 44051 + }, + { + "epoch": 0.2619897230944904, + "grad_norm": 2.397412061691284, + "learning_rate": 4.1999768792797045e-05, + "loss": 4.0388, + "step": 44052 + }, + { + "epoch": 0.26199567037777144, + "grad_norm": 1.8881558179855347, + "learning_rate": 4.199942630351938e-05, + "loss": 4.4978, + "step": 44053 + }, + { + "epoch": 0.26200161766105245, + "grad_norm": 2.2055280208587646, + "learning_rate": 4.199908380830738e-05, + "loss": 3.9849, + "step": 44054 + }, + { + "epoch": 0.2620075649443334, + "grad_norm": 2.129683494567871, + "learning_rate": 4.199874130716119e-05, + "loss": 3.9663, + "step": 44055 + }, + { + "epoch": 0.26201351222761443, + "grad_norm": 1.9502878189086914, + "learning_rate": 4.199839880008091e-05, + "loss": 3.9157, + "step": 44056 + }, + { + "epoch": 0.26201945951089545, + "grad_norm": 2.3300681114196777, + "learning_rate": 4.199805628706667e-05, + "loss": 3.7662, + "step": 44057 + }, + { + "epoch": 0.2620254067941764, + "grad_norm": 2.1431922912597656, + "learning_rate": 4.1997713768118576e-05, + "loss": 3.956, + "step": 44058 + }, + { + "epoch": 0.2620313540774574, + "grad_norm": 2.3916378021240234, + "learning_rate": 4.199737124323676e-05, + "loss": 4.0347, + "step": 44059 + }, + { + "epoch": 0.26203730136073844, + "grad_norm": 2.191599130630493, + "learning_rate": 4.1997028712421334e-05, + "loss": 4.2019, + "step": 44060 + }, + { + "epoch": 0.2620432486440194, + "grad_norm": 2.353742837905884, + "learning_rate": 4.1996686175672416e-05, + "loss": 3.9519, + "step": 44061 + }, + { + "epoch": 0.2620491959273004, + "grad_norm": 2.335209846496582, + "learning_rate": 4.199634363299014e-05, + "loss": 4.0935, + "step": 44062 + }, + { + "epoch": 0.26205514321058143, + "grad_norm": 2.124500274658203, + "learning_rate": 4.199600108437462e-05, + "loss": 4.0786, + "step": 44063 + }, + { + "epoch": 0.2620610904938624, + "grad_norm": 2.0568792819976807, + "learning_rate": 4.199565852982595e-05, + "loss": 4.0269, + "step": 44064 + }, + { + "epoch": 0.2620670377771434, + "grad_norm": 2.2597358226776123, + "learning_rate": 4.19953159693443e-05, + "loss": 4.0276, + "step": 44065 + }, + { + "epoch": 0.2620729850604244, + "grad_norm": 2.0981128215789795, + "learning_rate": 4.199497340292974e-05, + "loss": 4.0263, + "step": 44066 + }, + { + "epoch": 0.2620789323437054, + "grad_norm": 2.3113160133361816, + "learning_rate": 4.199463083058242e-05, + "loss": 3.9109, + "step": 44067 + }, + { + "epoch": 0.2620848796269864, + "grad_norm": 2.3302650451660156, + "learning_rate": 4.199428825230245e-05, + "loss": 4.0293, + "step": 44068 + }, + { + "epoch": 0.2620908269102674, + "grad_norm": 2.414778709411621, + "learning_rate": 4.199394566808994e-05, + "loss": 3.8504, + "step": 44069 + }, + { + "epoch": 0.2620967741935484, + "grad_norm": 2.3160486221313477, + "learning_rate": 4.199360307794503e-05, + "loss": 3.8867, + "step": 44070 + }, + { + "epoch": 0.2621027214768294, + "grad_norm": 2.1485066413879395, + "learning_rate": 4.199326048186782e-05, + "loss": 4.0048, + "step": 44071 + }, + { + "epoch": 0.2621086687601104, + "grad_norm": 2.2402124404907227, + "learning_rate": 4.199291787985844e-05, + "loss": 4.0307, + "step": 44072 + }, + { + "epoch": 0.26211461604339137, + "grad_norm": 2.1730403900146484, + "learning_rate": 4.199257527191701e-05, + "loss": 3.9843, + "step": 44073 + }, + { + "epoch": 0.2621205633266724, + "grad_norm": 2.7257535457611084, + "learning_rate": 4.199223265804365e-05, + "loss": 4.0468, + "step": 44074 + }, + { + "epoch": 0.2621265106099534, + "grad_norm": 2.5445992946624756, + "learning_rate": 4.199189003823848e-05, + "loss": 3.9254, + "step": 44075 + }, + { + "epoch": 0.26213245789323436, + "grad_norm": 2.295536756515503, + "learning_rate": 4.199154741250161e-05, + "loss": 4.1562, + "step": 44076 + }, + { + "epoch": 0.2621384051765154, + "grad_norm": 2.2047746181488037, + "learning_rate": 4.199120478083317e-05, + "loss": 4.1273, + "step": 44077 + }, + { + "epoch": 0.2621443524597964, + "grad_norm": 2.3367745876312256, + "learning_rate": 4.199086214323327e-05, + "loss": 3.9324, + "step": 44078 + }, + { + "epoch": 0.26215029974307735, + "grad_norm": 2.416872262954712, + "learning_rate": 4.199051949970204e-05, + "loss": 4.0975, + "step": 44079 + }, + { + "epoch": 0.26215624702635837, + "grad_norm": 2.5495285987854004, + "learning_rate": 4.19901768502396e-05, + "loss": 4.0489, + "step": 44080 + }, + { + "epoch": 0.26216219430963933, + "grad_norm": 2.2779881954193115, + "learning_rate": 4.1989834194846054e-05, + "loss": 4.0856, + "step": 44081 + }, + { + "epoch": 0.26216814159292035, + "grad_norm": 2.0107266902923584, + "learning_rate": 4.198949153352155e-05, + "loss": 5.0038, + "step": 44082 + }, + { + "epoch": 0.26217408887620136, + "grad_norm": 2.25978422164917, + "learning_rate": 4.198914886626617e-05, + "loss": 4.153, + "step": 44083 + }, + { + "epoch": 0.2621800361594823, + "grad_norm": 2.453813314437866, + "learning_rate": 4.1988806193080066e-05, + "loss": 3.8837, + "step": 44084 + }, + { + "epoch": 0.26218598344276334, + "grad_norm": 2.52192759513855, + "learning_rate": 4.198846351396334e-05, + "loss": 3.797, + "step": 44085 + }, + { + "epoch": 0.26219193072604435, + "grad_norm": 2.335920810699463, + "learning_rate": 4.198812082891612e-05, + "loss": 3.9714, + "step": 44086 + }, + { + "epoch": 0.2621978780093253, + "grad_norm": 2.330185651779175, + "learning_rate": 4.1987778137938514e-05, + "loss": 3.8577, + "step": 44087 + }, + { + "epoch": 0.26220382529260633, + "grad_norm": 2.365217447280884, + "learning_rate": 4.198743544103066e-05, + "loss": 3.9381, + "step": 44088 + }, + { + "epoch": 0.26220977257588735, + "grad_norm": 2.254457950592041, + "learning_rate": 4.198709273819267e-05, + "loss": 3.9019, + "step": 44089 + }, + { + "epoch": 0.2622157198591683, + "grad_norm": 2.503740072250366, + "learning_rate": 4.198675002942465e-05, + "loss": 3.9369, + "step": 44090 + }, + { + "epoch": 0.2622216671424493, + "grad_norm": 2.0757358074188232, + "learning_rate": 4.198640731472674e-05, + "loss": 4.0693, + "step": 44091 + }, + { + "epoch": 0.26222761442573034, + "grad_norm": 2.1759204864501953, + "learning_rate": 4.198606459409905e-05, + "loss": 4.0927, + "step": 44092 + }, + { + "epoch": 0.2622335617090113, + "grad_norm": 2.0640549659729004, + "learning_rate": 4.198572186754169e-05, + "loss": 3.8604, + "step": 44093 + }, + { + "epoch": 0.2622395089922923, + "grad_norm": 2.6722636222839355, + "learning_rate": 4.1985379135054804e-05, + "loss": 3.9663, + "step": 44094 + }, + { + "epoch": 0.26224545627557333, + "grad_norm": 2.1026225090026855, + "learning_rate": 4.198503639663849e-05, + "loss": 4.1675, + "step": 44095 + }, + { + "epoch": 0.2622514035588543, + "grad_norm": 2.359506368637085, + "learning_rate": 4.1984693652292875e-05, + "loss": 4.1184, + "step": 44096 + }, + { + "epoch": 0.2622573508421353, + "grad_norm": 1.8636878728866577, + "learning_rate": 4.1984350902018085e-05, + "loss": 4.2384, + "step": 44097 + }, + { + "epoch": 0.2622632981254163, + "grad_norm": 1.715948224067688, + "learning_rate": 4.1984008145814227e-05, + "loss": 4.7518, + "step": 44098 + }, + { + "epoch": 0.2622692454086973, + "grad_norm": 1.5115265846252441, + "learning_rate": 4.198366538368143e-05, + "loss": 5.0773, + "step": 44099 + }, + { + "epoch": 0.2622751926919783, + "grad_norm": 1.6105283498764038, + "learning_rate": 4.1983322615619814e-05, + "loss": 5.3176, + "step": 44100 + }, + { + "epoch": 0.2622811399752593, + "grad_norm": 1.5279958248138428, + "learning_rate": 4.198297984162949e-05, + "loss": 5.7206, + "step": 44101 + }, + { + "epoch": 0.2622870872585403, + "grad_norm": 1.599890947341919, + "learning_rate": 4.198263706171059e-05, + "loss": 5.6809, + "step": 44102 + }, + { + "epoch": 0.2622930345418213, + "grad_norm": 1.6186431646347046, + "learning_rate": 4.198229427586322e-05, + "loss": 4.9857, + "step": 44103 + }, + { + "epoch": 0.2622989818251023, + "grad_norm": 1.9991430044174194, + "learning_rate": 4.1981951484087506e-05, + "loss": 3.9664, + "step": 44104 + }, + { + "epoch": 0.26230492910838327, + "grad_norm": 2.7736895084381104, + "learning_rate": 4.198160868638357e-05, + "loss": 3.0758, + "step": 44105 + }, + { + "epoch": 0.2623108763916643, + "grad_norm": 2.8208062648773193, + "learning_rate": 4.198126588275153e-05, + "loss": 2.9229, + "step": 44106 + }, + { + "epoch": 0.2623168236749453, + "grad_norm": 2.680412769317627, + "learning_rate": 4.1980923073191507e-05, + "loss": 2.9657, + "step": 44107 + }, + { + "epoch": 0.26232277095822626, + "grad_norm": 2.495391607284546, + "learning_rate": 4.1980580257703616e-05, + "loss": 2.7797, + "step": 44108 + }, + { + "epoch": 0.2623287182415073, + "grad_norm": 1.8868261575698853, + "learning_rate": 4.1980237436287985e-05, + "loss": 4.1827, + "step": 44109 + }, + { + "epoch": 0.2623346655247883, + "grad_norm": 2.0129642486572266, + "learning_rate": 4.197989460894472e-05, + "loss": 4.6399, + "step": 44110 + }, + { + "epoch": 0.26234061280806925, + "grad_norm": 1.596671462059021, + "learning_rate": 4.1979551775673965e-05, + "loss": 4.9433, + "step": 44111 + }, + { + "epoch": 0.26234656009135027, + "grad_norm": 1.5336277484893799, + "learning_rate": 4.197920893647581e-05, + "loss": 4.9537, + "step": 44112 + }, + { + "epoch": 0.2623525073746313, + "grad_norm": 1.7672969102859497, + "learning_rate": 4.1978866091350395e-05, + "loss": 4.9916, + "step": 44113 + }, + { + "epoch": 0.26235845465791224, + "grad_norm": 1.6198004484176636, + "learning_rate": 4.197852324029783e-05, + "loss": 4.9232, + "step": 44114 + }, + { + "epoch": 0.26236440194119326, + "grad_norm": 1.6348659992218018, + "learning_rate": 4.1978180383318244e-05, + "loss": 4.9908, + "step": 44115 + }, + { + "epoch": 0.2623703492244743, + "grad_norm": 1.4460006952285767, + "learning_rate": 4.197783752041174e-05, + "loss": 5.4412, + "step": 44116 + }, + { + "epoch": 0.26237629650775524, + "grad_norm": 1.486946702003479, + "learning_rate": 4.1977494651578454e-05, + "loss": 4.4583, + "step": 44117 + }, + { + "epoch": 0.26238224379103625, + "grad_norm": 1.6212738752365112, + "learning_rate": 4.1977151776818505e-05, + "loss": 4.5885, + "step": 44118 + }, + { + "epoch": 0.26238819107431727, + "grad_norm": 1.813104510307312, + "learning_rate": 4.1976808896131994e-05, + "loss": 4.5691, + "step": 44119 + }, + { + "epoch": 0.26239413835759823, + "grad_norm": 1.4851202964782715, + "learning_rate": 4.1976466009519065e-05, + "loss": 4.3396, + "step": 44120 + }, + { + "epoch": 0.26240008564087924, + "grad_norm": 1.6301406621932983, + "learning_rate": 4.1976123116979826e-05, + "loss": 4.7036, + "step": 44121 + }, + { + "epoch": 0.26240603292416026, + "grad_norm": 1.6084003448486328, + "learning_rate": 4.197578021851439e-05, + "loss": 5.4579, + "step": 44122 + }, + { + "epoch": 0.2624119802074412, + "grad_norm": 1.5889935493469238, + "learning_rate": 4.19754373141229e-05, + "loss": 4.9881, + "step": 44123 + }, + { + "epoch": 0.26241792749072224, + "grad_norm": 1.7256262302398682, + "learning_rate": 4.197509440380545e-05, + "loss": 4.5436, + "step": 44124 + }, + { + "epoch": 0.26242387477400325, + "grad_norm": 1.58371901512146, + "learning_rate": 4.197475148756217e-05, + "loss": 4.5359, + "step": 44125 + }, + { + "epoch": 0.2624298220572842, + "grad_norm": 1.5584211349487305, + "learning_rate": 4.197440856539319e-05, + "loss": 4.5264, + "step": 44126 + }, + { + "epoch": 0.26243576934056523, + "grad_norm": 1.5799497365951538, + "learning_rate": 4.1974065637298604e-05, + "loss": 4.5841, + "step": 44127 + }, + { + "epoch": 0.26244171662384624, + "grad_norm": 1.6470918655395508, + "learning_rate": 4.197372270327855e-05, + "loss": 4.441, + "step": 44128 + }, + { + "epoch": 0.2624476639071272, + "grad_norm": 1.7103855609893799, + "learning_rate": 4.1973379763333146e-05, + "loss": 4.5933, + "step": 44129 + }, + { + "epoch": 0.2624536111904082, + "grad_norm": 1.6349234580993652, + "learning_rate": 4.197303681746252e-05, + "loss": 4.3912, + "step": 44130 + }, + { + "epoch": 0.26245955847368924, + "grad_norm": 1.5366933345794678, + "learning_rate": 4.197269386566677e-05, + "loss": 4.3587, + "step": 44131 + }, + { + "epoch": 0.2624655057569702, + "grad_norm": 1.6163691282272339, + "learning_rate": 4.1972350907946036e-05, + "loss": 4.4288, + "step": 44132 + }, + { + "epoch": 0.2624714530402512, + "grad_norm": 2.096633195877075, + "learning_rate": 4.197200794430043e-05, + "loss": 3.6771, + "step": 44133 + }, + { + "epoch": 0.26247740032353223, + "grad_norm": 2.648409128189087, + "learning_rate": 4.197166497473006e-05, + "loss": 2.3358, + "step": 44134 + }, + { + "epoch": 0.2624833476068132, + "grad_norm": 2.646113395690918, + "learning_rate": 4.197132199923507e-05, + "loss": 2.2044, + "step": 44135 + }, + { + "epoch": 0.2624892948900942, + "grad_norm": 2.636021852493286, + "learning_rate": 4.197097901781556e-05, + "loss": 2.3754, + "step": 44136 + }, + { + "epoch": 0.2624952421733752, + "grad_norm": 2.378528356552124, + "learning_rate": 4.197063603047166e-05, + "loss": 3.2268, + "step": 44137 + }, + { + "epoch": 0.2625011894566562, + "grad_norm": 2.079437255859375, + "learning_rate": 4.197029303720348e-05, + "loss": 3.7033, + "step": 44138 + }, + { + "epoch": 0.2625071367399372, + "grad_norm": 1.7137008905410767, + "learning_rate": 4.1969950038011155e-05, + "loss": 4.3828, + "step": 44139 + }, + { + "epoch": 0.2625130840232182, + "grad_norm": 1.6996052265167236, + "learning_rate": 4.196960703289479e-05, + "loss": 5.0651, + "step": 44140 + }, + { + "epoch": 0.2625190313064992, + "grad_norm": 1.5860140323638916, + "learning_rate": 4.196926402185451e-05, + "loss": 5.2774, + "step": 44141 + }, + { + "epoch": 0.2625249785897802, + "grad_norm": 1.3971854448318481, + "learning_rate": 4.196892100489044e-05, + "loss": 5.1563, + "step": 44142 + }, + { + "epoch": 0.2625309258730612, + "grad_norm": 2.145833969116211, + "learning_rate": 4.196857798200269e-05, + "loss": 5.0171, + "step": 44143 + }, + { + "epoch": 0.26253687315634217, + "grad_norm": 2.032012939453125, + "learning_rate": 4.1968234953191385e-05, + "loss": 4.9083, + "step": 44144 + }, + { + "epoch": 0.2625428204396232, + "grad_norm": 2.442633867263794, + "learning_rate": 4.1967891918456646e-05, + "loss": 3.763, + "step": 44145 + }, + { + "epoch": 0.2625487677229042, + "grad_norm": 2.8035483360290527, + "learning_rate": 4.19675488777986e-05, + "loss": 3.1453, + "step": 44146 + }, + { + "epoch": 0.26255471500618516, + "grad_norm": 2.7200026512145996, + "learning_rate": 4.1967205831217345e-05, + "loss": 3.2262, + "step": 44147 + }, + { + "epoch": 0.2625606622894662, + "grad_norm": 2.689765453338623, + "learning_rate": 4.196686277871302e-05, + "loss": 3.1688, + "step": 44148 + }, + { + "epoch": 0.2625666095727472, + "grad_norm": 1.7186956405639648, + "learning_rate": 4.196651972028573e-05, + "loss": 3.7177, + "step": 44149 + }, + { + "epoch": 0.26257255685602815, + "grad_norm": 1.6178631782531738, + "learning_rate": 4.196617665593561e-05, + "loss": 5.6355, + "step": 44150 + }, + { + "epoch": 0.26257850413930917, + "grad_norm": 2.2261226177215576, + "learning_rate": 4.1965833585662785e-05, + "loss": 4.0169, + "step": 44151 + }, + { + "epoch": 0.2625844514225902, + "grad_norm": 1.9213829040527344, + "learning_rate": 4.196549050946734e-05, + "loss": 3.0525, + "step": 44152 + }, + { + "epoch": 0.26259039870587114, + "grad_norm": 2.3488712310791016, + "learning_rate": 4.1965147427349424e-05, + "loss": 2.8859, + "step": 44153 + }, + { + "epoch": 0.26259634598915216, + "grad_norm": 2.141206979751587, + "learning_rate": 4.196480433930916e-05, + "loss": 3.0892, + "step": 44154 + }, + { + "epoch": 0.2626022932724332, + "grad_norm": 2.2545976638793945, + "learning_rate": 4.196446124534665e-05, + "loss": 3.2111, + "step": 44155 + }, + { + "epoch": 0.26260824055571413, + "grad_norm": 2.1111128330230713, + "learning_rate": 4.196411814546203e-05, + "loss": 2.9655, + "step": 44156 + }, + { + "epoch": 0.26261418783899515, + "grad_norm": 1.9993940591812134, + "learning_rate": 4.1963775039655404e-05, + "loss": 2.9162, + "step": 44157 + }, + { + "epoch": 0.26262013512227617, + "grad_norm": 1.972664475440979, + "learning_rate": 4.196343192792691e-05, + "loss": 2.9825, + "step": 44158 + }, + { + "epoch": 0.2626260824055571, + "grad_norm": 2.1971402168273926, + "learning_rate": 4.196308881027664e-05, + "loss": 3.0623, + "step": 44159 + }, + { + "epoch": 0.26263202968883814, + "grad_norm": 1.742624282836914, + "learning_rate": 4.196274568670474e-05, + "loss": 3.826, + "step": 44160 + }, + { + "epoch": 0.26263797697211916, + "grad_norm": 1.8869210481643677, + "learning_rate": 4.196240255721132e-05, + "loss": 4.0562, + "step": 44161 + }, + { + "epoch": 0.2626439242554001, + "grad_norm": 2.271061420440674, + "learning_rate": 4.1962059421796506e-05, + "loss": 3.0422, + "step": 44162 + }, + { + "epoch": 0.26264987153868113, + "grad_norm": 2.325456380844116, + "learning_rate": 4.196171628046041e-05, + "loss": 2.8511, + "step": 44163 + }, + { + "epoch": 0.26265581882196215, + "grad_norm": 2.3472139835357666, + "learning_rate": 4.196137313320315e-05, + "loss": 2.933, + "step": 44164 + }, + { + "epoch": 0.2626617661052431, + "grad_norm": 2.0047194957733154, + "learning_rate": 4.1961029980024844e-05, + "loss": 2.8647, + "step": 44165 + }, + { + "epoch": 0.2626677133885241, + "grad_norm": 2.118291139602661, + "learning_rate": 4.196068682092563e-05, + "loss": 2.8347, + "step": 44166 + }, + { + "epoch": 0.26267366067180514, + "grad_norm": 2.4748799800872803, + "learning_rate": 4.196034365590561e-05, + "loss": 3.0931, + "step": 44167 + }, + { + "epoch": 0.2626796079550861, + "grad_norm": 2.1693809032440186, + "learning_rate": 4.196000048496491e-05, + "loss": 3.836, + "step": 44168 + }, + { + "epoch": 0.2626855552383671, + "grad_norm": 2.3726394176483154, + "learning_rate": 4.1959657308103654e-05, + "loss": 2.9737, + "step": 44169 + }, + { + "epoch": 0.26269150252164813, + "grad_norm": 2.508892774581909, + "learning_rate": 4.195931412532195e-05, + "loss": 3.0933, + "step": 44170 + }, + { + "epoch": 0.2626974498049291, + "grad_norm": 2.133579969406128, + "learning_rate": 4.195897093661992e-05, + "loss": 3.0617, + "step": 44171 + }, + { + "epoch": 0.2627033970882101, + "grad_norm": 2.3983969688415527, + "learning_rate": 4.19586277419977e-05, + "loss": 3.066, + "step": 44172 + }, + { + "epoch": 0.2627093443714911, + "grad_norm": 2.5762155055999756, + "learning_rate": 4.195828454145539e-05, + "loss": 3.1986, + "step": 44173 + }, + { + "epoch": 0.2627152916547721, + "grad_norm": 2.5651729106903076, + "learning_rate": 4.195794133499312e-05, + "loss": 3.112, + "step": 44174 + }, + { + "epoch": 0.2627212389380531, + "grad_norm": 2.3620264530181885, + "learning_rate": 4.195759812261101e-05, + "loss": 3.0425, + "step": 44175 + }, + { + "epoch": 0.2627271862213341, + "grad_norm": 2.7804360389709473, + "learning_rate": 4.195725490430917e-05, + "loss": 3.3752, + "step": 44176 + }, + { + "epoch": 0.2627331335046151, + "grad_norm": 2.0118839740753174, + "learning_rate": 4.195691168008774e-05, + "loss": 4.0877, + "step": 44177 + }, + { + "epoch": 0.2627390807878961, + "grad_norm": 2.3460769653320312, + "learning_rate": 4.195656844994682e-05, + "loss": 3.8663, + "step": 44178 + }, + { + "epoch": 0.2627450280711771, + "grad_norm": 2.1471691131591797, + "learning_rate": 4.195622521388654e-05, + "loss": 3.6439, + "step": 44179 + }, + { + "epoch": 0.26275097535445807, + "grad_norm": 1.5556572675704956, + "learning_rate": 4.195588197190702e-05, + "loss": 4.3497, + "step": 44180 + }, + { + "epoch": 0.2627569226377391, + "grad_norm": 1.412746548652649, + "learning_rate": 4.195553872400837e-05, + "loss": 5.2618, + "step": 44181 + }, + { + "epoch": 0.2627628699210201, + "grad_norm": 1.3836572170257568, + "learning_rate": 4.1955195470190714e-05, + "loss": 5.1353, + "step": 44182 + }, + { + "epoch": 0.26276881720430106, + "grad_norm": 1.3846430778503418, + "learning_rate": 4.1954852210454174e-05, + "loss": 4.8671, + "step": 44183 + }, + { + "epoch": 0.2627747644875821, + "grad_norm": 1.9570037126541138, + "learning_rate": 4.195450894479889e-05, + "loss": 3.6837, + "step": 44184 + }, + { + "epoch": 0.2627807117708631, + "grad_norm": 3.1948282718658447, + "learning_rate": 4.195416567322494e-05, + "loss": 1.824, + "step": 44185 + }, + { + "epoch": 0.26278665905414406, + "grad_norm": 2.6910207271575928, + "learning_rate": 4.195382239573247e-05, + "loss": 1.9739, + "step": 44186 + }, + { + "epoch": 0.26279260633742507, + "grad_norm": 2.610029458999634, + "learning_rate": 4.1953479112321604e-05, + "loss": 1.5966, + "step": 44187 + }, + { + "epoch": 0.2627985536207061, + "grad_norm": 2.4338159561157227, + "learning_rate": 4.195313582299245e-05, + "loss": 1.5244, + "step": 44188 + }, + { + "epoch": 0.26280450090398705, + "grad_norm": 2.249502658843994, + "learning_rate": 4.1952792527745125e-05, + "loss": 2.2845, + "step": 44189 + }, + { + "epoch": 0.26281044818726806, + "grad_norm": 2.263521432876587, + "learning_rate": 4.1952449226579764e-05, + "loss": 3.5758, + "step": 44190 + }, + { + "epoch": 0.2628163954705491, + "grad_norm": 1.8042908906936646, + "learning_rate": 4.1952105919496475e-05, + "loss": 4.1778, + "step": 44191 + }, + { + "epoch": 0.26282234275383004, + "grad_norm": 1.88785719871521, + "learning_rate": 4.195176260649538e-05, + "loss": 4.5892, + "step": 44192 + }, + { + "epoch": 0.26282829003711106, + "grad_norm": 1.9621638059616089, + "learning_rate": 4.19514192875766e-05, + "loss": 4.8349, + "step": 44193 + }, + { + "epoch": 0.2628342373203921, + "grad_norm": 2.054367780685425, + "learning_rate": 4.195107596274026e-05, + "loss": 4.9566, + "step": 44194 + }, + { + "epoch": 0.26284018460367303, + "grad_norm": 1.8205450773239136, + "learning_rate": 4.1950732631986466e-05, + "loss": 4.6926, + "step": 44195 + }, + { + "epoch": 0.26284613188695405, + "grad_norm": 1.699273943901062, + "learning_rate": 4.1950389295315354e-05, + "loss": 4.4285, + "step": 44196 + }, + { + "epoch": 0.262852079170235, + "grad_norm": 1.6817026138305664, + "learning_rate": 4.195004595272703e-05, + "loss": 4.5246, + "step": 44197 + }, + { + "epoch": 0.262858026453516, + "grad_norm": 1.6267107725143433, + "learning_rate": 4.194970260422162e-05, + "loss": 4.3785, + "step": 44198 + }, + { + "epoch": 0.26286397373679704, + "grad_norm": 1.81754732131958, + "learning_rate": 4.194935924979925e-05, + "loss": 4.629, + "step": 44199 + }, + { + "epoch": 0.262869921020078, + "grad_norm": 1.728158950805664, + "learning_rate": 4.194901588946003e-05, + "loss": 4.7465, + "step": 44200 + }, + { + "epoch": 0.262875868303359, + "grad_norm": 1.6799763441085815, + "learning_rate": 4.194867252320408e-05, + "loss": 4.6788, + "step": 44201 + }, + { + "epoch": 0.26288181558664003, + "grad_norm": 1.4966890811920166, + "learning_rate": 4.194832915103153e-05, + "loss": 4.6816, + "step": 44202 + }, + { + "epoch": 0.262887762869921, + "grad_norm": 1.698941707611084, + "learning_rate": 4.1947985772942485e-05, + "loss": 4.6484, + "step": 44203 + }, + { + "epoch": 0.262893710153202, + "grad_norm": 1.7820701599121094, + "learning_rate": 4.1947642388937085e-05, + "loss": 4.4778, + "step": 44204 + }, + { + "epoch": 0.262899657436483, + "grad_norm": 1.5127617120742798, + "learning_rate": 4.194729899901543e-05, + "loss": 4.5335, + "step": 44205 + }, + { + "epoch": 0.262905604719764, + "grad_norm": 2.1560637950897217, + "learning_rate": 4.194695560317765e-05, + "loss": 4.9966, + "step": 44206 + }, + { + "epoch": 0.262911552003045, + "grad_norm": 1.4712954759597778, + "learning_rate": 4.194661220142386e-05, + "loss": 4.5765, + "step": 44207 + }, + { + "epoch": 0.262917499286326, + "grad_norm": 1.4645200967788696, + "learning_rate": 4.194626879375419e-05, + "loss": 4.396, + "step": 44208 + }, + { + "epoch": 0.262923446569607, + "grad_norm": 1.6264413595199585, + "learning_rate": 4.194592538016875e-05, + "loss": 4.5356, + "step": 44209 + }, + { + "epoch": 0.262929393852888, + "grad_norm": 1.5492967367172241, + "learning_rate": 4.194558196066766e-05, + "loss": 4.5123, + "step": 44210 + }, + { + "epoch": 0.262935341136169, + "grad_norm": 1.4675558805465698, + "learning_rate": 4.194523853525104e-05, + "loss": 4.727, + "step": 44211 + }, + { + "epoch": 0.26294128841944997, + "grad_norm": 1.5543279647827148, + "learning_rate": 4.1944895103919015e-05, + "loss": 5.2128, + "step": 44212 + }, + { + "epoch": 0.262947235702731, + "grad_norm": 2.1130967140197754, + "learning_rate": 4.1944551666671705e-05, + "loss": 5.2702, + "step": 44213 + }, + { + "epoch": 0.262953182986012, + "grad_norm": 1.6615848541259766, + "learning_rate": 4.194420822350923e-05, + "loss": 5.5149, + "step": 44214 + }, + { + "epoch": 0.26295913026929296, + "grad_norm": 1.6505664587020874, + "learning_rate": 4.19438647744317e-05, + "loss": 4.8498, + "step": 44215 + }, + { + "epoch": 0.262965077552574, + "grad_norm": 1.4742753505706787, + "learning_rate": 4.194352131943924e-05, + "loss": 4.7452, + "step": 44216 + }, + { + "epoch": 0.262971024835855, + "grad_norm": 1.3409984111785889, + "learning_rate": 4.194317785853198e-05, + "loss": 4.6858, + "step": 44217 + }, + { + "epoch": 0.26297697211913595, + "grad_norm": 1.9667925834655762, + "learning_rate": 4.1942834391710014e-05, + "loss": 4.6855, + "step": 44218 + }, + { + "epoch": 0.26298291940241697, + "grad_norm": 1.7673627138137817, + "learning_rate": 4.19424909189735e-05, + "loss": 5.2378, + "step": 44219 + }, + { + "epoch": 0.262988866685698, + "grad_norm": 1.707457423210144, + "learning_rate": 4.194214744032253e-05, + "loss": 5.2572, + "step": 44220 + }, + { + "epoch": 0.26299481396897895, + "grad_norm": 1.5715718269348145, + "learning_rate": 4.194180395575723e-05, + "loss": 5.2223, + "step": 44221 + }, + { + "epoch": 0.26300076125225996, + "grad_norm": 1.3970218896865845, + "learning_rate": 4.194146046527773e-05, + "loss": 5.0812, + "step": 44222 + }, + { + "epoch": 0.263006708535541, + "grad_norm": 1.8977161645889282, + "learning_rate": 4.194111696888413e-05, + "loss": 4.0085, + "step": 44223 + }, + { + "epoch": 0.26301265581882194, + "grad_norm": 1.7168632745742798, + "learning_rate": 4.194077346657657e-05, + "loss": 4.3523, + "step": 44224 + }, + { + "epoch": 0.26301860310210295, + "grad_norm": 3.469172239303589, + "learning_rate": 4.1940429958355155e-05, + "loss": 3.4219, + "step": 44225 + }, + { + "epoch": 0.26302455038538397, + "grad_norm": 2.6600615978240967, + "learning_rate": 4.194008644422001e-05, + "loss": 2.9454, + "step": 44226 + }, + { + "epoch": 0.26303049766866493, + "grad_norm": 5.29782247543335, + "learning_rate": 4.193974292417126e-05, + "loss": 3.1514, + "step": 44227 + }, + { + "epoch": 0.26303644495194595, + "grad_norm": 2.1534676551818848, + "learning_rate": 4.193939939820901e-05, + "loss": 3.1103, + "step": 44228 + }, + { + "epoch": 0.26304239223522696, + "grad_norm": 2.5763165950775146, + "learning_rate": 4.19390558663334e-05, + "loss": 3.4094, + "step": 44229 + }, + { + "epoch": 0.2630483395185079, + "grad_norm": 2.409266233444214, + "learning_rate": 4.1938712328544536e-05, + "loss": 2.9619, + "step": 44230 + }, + { + "epoch": 0.26305428680178894, + "grad_norm": 2.097928285598755, + "learning_rate": 4.193836878484255e-05, + "loss": 3.2015, + "step": 44231 + }, + { + "epoch": 0.26306023408506995, + "grad_norm": 2.565640926361084, + "learning_rate": 4.1938025235227554e-05, + "loss": 3.2778, + "step": 44232 + }, + { + "epoch": 0.2630661813683509, + "grad_norm": 2.3798775672912598, + "learning_rate": 4.193768167969966e-05, + "loss": 3.2333, + "step": 44233 + }, + { + "epoch": 0.26307212865163193, + "grad_norm": 2.242220401763916, + "learning_rate": 4.1937338118259006e-05, + "loss": 2.7949, + "step": 44234 + }, + { + "epoch": 0.26307807593491295, + "grad_norm": 2.1417369842529297, + "learning_rate": 4.1936994550905694e-05, + "loss": 3.0677, + "step": 44235 + }, + { + "epoch": 0.2630840232181939, + "grad_norm": 3.0530290603637695, + "learning_rate": 4.1936650977639855e-05, + "loss": 3.2208, + "step": 44236 + }, + { + "epoch": 0.2630899705014749, + "grad_norm": 2.2744104862213135, + "learning_rate": 4.193630739846162e-05, + "loss": 3.0182, + "step": 44237 + }, + { + "epoch": 0.26309591778475594, + "grad_norm": 2.1448254585266113, + "learning_rate": 4.193596381337107e-05, + "loss": 3.3144, + "step": 44238 + }, + { + "epoch": 0.2631018650680369, + "grad_norm": 2.4136734008789062, + "learning_rate": 4.193562022236837e-05, + "loss": 3.0323, + "step": 44239 + }, + { + "epoch": 0.2631078123513179, + "grad_norm": 1.6952052116394043, + "learning_rate": 4.193527662545361e-05, + "loss": 4.6693, + "step": 44240 + }, + { + "epoch": 0.26311375963459893, + "grad_norm": 1.646305799484253, + "learning_rate": 4.193493302262692e-05, + "loss": 5.0304, + "step": 44241 + }, + { + "epoch": 0.2631197069178799, + "grad_norm": 1.6322429180145264, + "learning_rate": 4.193458941388842e-05, + "loss": 4.7443, + "step": 44242 + }, + { + "epoch": 0.2631256542011609, + "grad_norm": 1.6753485202789307, + "learning_rate": 4.193424579923824e-05, + "loss": 4.6242, + "step": 44243 + }, + { + "epoch": 0.2631316014844419, + "grad_norm": 1.5180407762527466, + "learning_rate": 4.193390217867648e-05, + "loss": 5.1214, + "step": 44244 + }, + { + "epoch": 0.2631375487677229, + "grad_norm": 1.6033923625946045, + "learning_rate": 4.193355855220327e-05, + "loss": 4.6865, + "step": 44245 + }, + { + "epoch": 0.2631434960510039, + "grad_norm": 1.889499545097351, + "learning_rate": 4.193321491981873e-05, + "loss": 4.667, + "step": 44246 + }, + { + "epoch": 0.2631494433342849, + "grad_norm": 1.425058126449585, + "learning_rate": 4.193287128152298e-05, + "loss": 5.233, + "step": 44247 + }, + { + "epoch": 0.2631553906175659, + "grad_norm": 1.6737630367279053, + "learning_rate": 4.1932527637316134e-05, + "loss": 4.9811, + "step": 44248 + }, + { + "epoch": 0.2631613379008469, + "grad_norm": 1.4094657897949219, + "learning_rate": 4.1932183987198325e-05, + "loss": 4.9476, + "step": 44249 + }, + { + "epoch": 0.2631672851841279, + "grad_norm": 1.5965756177902222, + "learning_rate": 4.193184033116967e-05, + "loss": 4.8481, + "step": 44250 + }, + { + "epoch": 0.26317323246740887, + "grad_norm": 1.3912063837051392, + "learning_rate": 4.1931496669230275e-05, + "loss": 4.8307, + "step": 44251 + }, + { + "epoch": 0.2631791797506899, + "grad_norm": 1.5243635177612305, + "learning_rate": 4.193115300138027e-05, + "loss": 4.5538, + "step": 44252 + }, + { + "epoch": 0.2631851270339709, + "grad_norm": 1.4995478391647339, + "learning_rate": 4.193080932761978e-05, + "loss": 4.7359, + "step": 44253 + }, + { + "epoch": 0.26319107431725186, + "grad_norm": 1.5378903150558472, + "learning_rate": 4.193046564794891e-05, + "loss": 5.3985, + "step": 44254 + }, + { + "epoch": 0.2631970216005329, + "grad_norm": 1.4544713497161865, + "learning_rate": 4.1930121962367804e-05, + "loss": 4.9152, + "step": 44255 + }, + { + "epoch": 0.2632029688838139, + "grad_norm": 1.790795087814331, + "learning_rate": 4.1929778270876555e-05, + "loss": 5.1871, + "step": 44256 + }, + { + "epoch": 0.26320891616709485, + "grad_norm": 1.8072378635406494, + "learning_rate": 4.19294345734753e-05, + "loss": 4.3196, + "step": 44257 + }, + { + "epoch": 0.26321486345037587, + "grad_norm": 1.8150403499603271, + "learning_rate": 4.1929090870164154e-05, + "loss": 3.7952, + "step": 44258 + }, + { + "epoch": 0.2632208107336569, + "grad_norm": 1.806275486946106, + "learning_rate": 4.192874716094324e-05, + "loss": 3.8769, + "step": 44259 + }, + { + "epoch": 0.26322675801693785, + "grad_norm": 1.9231536388397217, + "learning_rate": 4.192840344581267e-05, + "loss": 4.1666, + "step": 44260 + }, + { + "epoch": 0.26323270530021886, + "grad_norm": 2.008859157562256, + "learning_rate": 4.1928059724772575e-05, + "loss": 4.3009, + "step": 44261 + }, + { + "epoch": 0.2632386525834999, + "grad_norm": 1.7712688446044922, + "learning_rate": 4.192771599782306e-05, + "loss": 4.1068, + "step": 44262 + }, + { + "epoch": 0.26324459986678084, + "grad_norm": 1.5747573375701904, + "learning_rate": 4.192737226496426e-05, + "loss": 4.0867, + "step": 44263 + }, + { + "epoch": 0.26325054715006185, + "grad_norm": 1.612884521484375, + "learning_rate": 4.192702852619629e-05, + "loss": 3.9795, + "step": 44264 + }, + { + "epoch": 0.26325649443334287, + "grad_norm": 1.7868505716323853, + "learning_rate": 4.192668478151927e-05, + "loss": 4.1012, + "step": 44265 + }, + { + "epoch": 0.26326244171662383, + "grad_norm": 2.0763051509857178, + "learning_rate": 4.1926341030933315e-05, + "loss": 4.1013, + "step": 44266 + }, + { + "epoch": 0.26326838899990485, + "grad_norm": 1.639460802078247, + "learning_rate": 4.1925997274438556e-05, + "loss": 5.149, + "step": 44267 + }, + { + "epoch": 0.26327433628318586, + "grad_norm": 1.9854601621627808, + "learning_rate": 4.19256535120351e-05, + "loss": 4.9406, + "step": 44268 + }, + { + "epoch": 0.2632802835664668, + "grad_norm": 1.797264575958252, + "learning_rate": 4.192530974372307e-05, + "loss": 4.7687, + "step": 44269 + }, + { + "epoch": 0.26328623084974784, + "grad_norm": 1.7914938926696777, + "learning_rate": 4.19249659695026e-05, + "loss": 4.2309, + "step": 44270 + }, + { + "epoch": 0.26329217813302885, + "grad_norm": 1.7729867696762085, + "learning_rate": 4.1924622189373794e-05, + "loss": 4.351, + "step": 44271 + }, + { + "epoch": 0.2632981254163098, + "grad_norm": 1.6646796464920044, + "learning_rate": 4.192427840333678e-05, + "loss": 4.3481, + "step": 44272 + }, + { + "epoch": 0.26330407269959083, + "grad_norm": 1.5477715730667114, + "learning_rate": 4.1923934611391674e-05, + "loss": 4.202, + "step": 44273 + }, + { + "epoch": 0.26331001998287185, + "grad_norm": 1.4746849536895752, + "learning_rate": 4.1923590813538594e-05, + "loss": 4.5872, + "step": 44274 + }, + { + "epoch": 0.2633159672661528, + "grad_norm": 1.4269436597824097, + "learning_rate": 4.192324700977767e-05, + "loss": 5.1767, + "step": 44275 + }, + { + "epoch": 0.2633219145494338, + "grad_norm": 1.4426214694976807, + "learning_rate": 4.192290320010901e-05, + "loss": 5.0241, + "step": 44276 + }, + { + "epoch": 0.26332786183271484, + "grad_norm": 1.4978283643722534, + "learning_rate": 4.1922559384532736e-05, + "loss": 5.3912, + "step": 44277 + }, + { + "epoch": 0.2633338091159958, + "grad_norm": 2.3393499851226807, + "learning_rate": 4.1922215563048974e-05, + "loss": 4.479, + "step": 44278 + }, + { + "epoch": 0.2633397563992768, + "grad_norm": 2.1661744117736816, + "learning_rate": 4.192187173565784e-05, + "loss": 4.2899, + "step": 44279 + }, + { + "epoch": 0.26334570368255783, + "grad_norm": 1.8311398029327393, + "learning_rate": 4.192152790235946e-05, + "loss": 4.2441, + "step": 44280 + }, + { + "epoch": 0.2633516509658388, + "grad_norm": 1.4945337772369385, + "learning_rate": 4.192118406315395e-05, + "loss": 4.7873, + "step": 44281 + }, + { + "epoch": 0.2633575982491198, + "grad_norm": 1.5370876789093018, + "learning_rate": 4.192084021804142e-05, + "loss": 4.7735, + "step": 44282 + }, + { + "epoch": 0.2633635455324008, + "grad_norm": 1.6721419095993042, + "learning_rate": 4.1920496367022014e-05, + "loss": 4.7355, + "step": 44283 + }, + { + "epoch": 0.2633694928156818, + "grad_norm": 1.7982444763183594, + "learning_rate": 4.1920152510095825e-05, + "loss": 4.6264, + "step": 44284 + }, + { + "epoch": 0.2633754400989628, + "grad_norm": 1.8393453359603882, + "learning_rate": 4.191980864726299e-05, + "loss": 5.0275, + "step": 44285 + }, + { + "epoch": 0.2633813873822438, + "grad_norm": 1.9318984746932983, + "learning_rate": 4.191946477852362e-05, + "loss": 4.6839, + "step": 44286 + }, + { + "epoch": 0.2633873346655248, + "grad_norm": 1.8694677352905273, + "learning_rate": 4.191912090387785e-05, + "loss": 4.9918, + "step": 44287 + }, + { + "epoch": 0.2633932819488058, + "grad_norm": 1.524317979812622, + "learning_rate": 4.191877702332578e-05, + "loss": 5.1714, + "step": 44288 + }, + { + "epoch": 0.2633992292320868, + "grad_norm": 1.4260708093643188, + "learning_rate": 4.191843313686755e-05, + "loss": 5.1693, + "step": 44289 + }, + { + "epoch": 0.26340517651536777, + "grad_norm": 1.4938278198242188, + "learning_rate": 4.191808924450326e-05, + "loss": 5.197, + "step": 44290 + }, + { + "epoch": 0.2634111237986488, + "grad_norm": 1.247518539428711, + "learning_rate": 4.191774534623304e-05, + "loss": 4.7594, + "step": 44291 + }, + { + "epoch": 0.2634170710819298, + "grad_norm": 1.501539707183838, + "learning_rate": 4.191740144205702e-05, + "loss": 5.2712, + "step": 44292 + }, + { + "epoch": 0.26342301836521076, + "grad_norm": 1.623030662536621, + "learning_rate": 4.19170575319753e-05, + "loss": 5.498, + "step": 44293 + }, + { + "epoch": 0.2634289656484918, + "grad_norm": 1.489660620689392, + "learning_rate": 4.191671361598802e-05, + "loss": 5.4397, + "step": 44294 + }, + { + "epoch": 0.2634349129317728, + "grad_norm": 1.5173815488815308, + "learning_rate": 4.191636969409528e-05, + "loss": 4.9509, + "step": 44295 + }, + { + "epoch": 0.26344086021505375, + "grad_norm": 1.7969000339508057, + "learning_rate": 4.1916025766297214e-05, + "loss": 4.3802, + "step": 44296 + }, + { + "epoch": 0.26344680749833477, + "grad_norm": 2.5753283500671387, + "learning_rate": 4.191568183259394e-05, + "loss": 4.0231, + "step": 44297 + }, + { + "epoch": 0.2634527547816158, + "grad_norm": 1.7087059020996094, + "learning_rate": 4.191533789298557e-05, + "loss": 5.2594, + "step": 44298 + }, + { + "epoch": 0.26345870206489674, + "grad_norm": 1.4498878717422485, + "learning_rate": 4.191499394747224e-05, + "loss": 4.7304, + "step": 44299 + }, + { + "epoch": 0.26346464934817776, + "grad_norm": 1.8380701541900635, + "learning_rate": 4.191464999605405e-05, + "loss": 4.7035, + "step": 44300 + }, + { + "epoch": 0.2634705966314588, + "grad_norm": 1.6263514757156372, + "learning_rate": 4.191430603873113e-05, + "loss": 4.794, + "step": 44301 + }, + { + "epoch": 0.26347654391473974, + "grad_norm": 1.6794257164001465, + "learning_rate": 4.191396207550361e-05, + "loss": 4.9048, + "step": 44302 + }, + { + "epoch": 0.26348249119802075, + "grad_norm": 1.5709028244018555, + "learning_rate": 4.19136181063716e-05, + "loss": 4.444, + "step": 44303 + }, + { + "epoch": 0.26348843848130177, + "grad_norm": 1.4377458095550537, + "learning_rate": 4.191327413133521e-05, + "loss": 4.1321, + "step": 44304 + }, + { + "epoch": 0.26349438576458273, + "grad_norm": 1.4277467727661133, + "learning_rate": 4.191293015039458e-05, + "loss": 4.0694, + "step": 44305 + }, + { + "epoch": 0.26350033304786374, + "grad_norm": 1.6463450193405151, + "learning_rate": 4.1912586163549815e-05, + "loss": 3.9148, + "step": 44306 + }, + { + "epoch": 0.26350628033114476, + "grad_norm": 1.8896197080612183, + "learning_rate": 4.191224217080105e-05, + "loss": 4.4208, + "step": 44307 + }, + { + "epoch": 0.2635122276144257, + "grad_norm": 1.5293149948120117, + "learning_rate": 4.191189817214839e-05, + "loss": 4.9936, + "step": 44308 + }, + { + "epoch": 0.26351817489770674, + "grad_norm": 1.5973055362701416, + "learning_rate": 4.191155416759196e-05, + "loss": 5.0515, + "step": 44309 + }, + { + "epoch": 0.26352412218098775, + "grad_norm": 1.8326791524887085, + "learning_rate": 4.191121015713188e-05, + "loss": 4.8351, + "step": 44310 + }, + { + "epoch": 0.2635300694642687, + "grad_norm": 1.6618173122406006, + "learning_rate": 4.1910866140768276e-05, + "loss": 5.3126, + "step": 44311 + }, + { + "epoch": 0.26353601674754973, + "grad_norm": 1.6470005512237549, + "learning_rate": 4.191052211850126e-05, + "loss": 5.2093, + "step": 44312 + }, + { + "epoch": 0.2635419640308307, + "grad_norm": 1.5924838781356812, + "learning_rate": 4.191017809033095e-05, + "loss": 5.3005, + "step": 44313 + }, + { + "epoch": 0.2635479113141117, + "grad_norm": 1.4908145666122437, + "learning_rate": 4.1909834056257485e-05, + "loss": 5.1395, + "step": 44314 + }, + { + "epoch": 0.2635538585973927, + "grad_norm": 1.546149730682373, + "learning_rate": 4.1909490016280964e-05, + "loss": 4.6211, + "step": 44315 + }, + { + "epoch": 0.2635598058806737, + "grad_norm": 1.933510184288025, + "learning_rate": 4.190914597040152e-05, + "loss": 4.2523, + "step": 44316 + }, + { + "epoch": 0.2635657531639547, + "grad_norm": 1.753180742263794, + "learning_rate": 4.190880191861926e-05, + "loss": 4.4537, + "step": 44317 + }, + { + "epoch": 0.2635717004472357, + "grad_norm": 1.7619810104370117, + "learning_rate": 4.190845786093431e-05, + "loss": 4.4995, + "step": 44318 + }, + { + "epoch": 0.2635776477305167, + "grad_norm": 1.6341608762741089, + "learning_rate": 4.19081137973468e-05, + "loss": 5.3048, + "step": 44319 + }, + { + "epoch": 0.2635835950137977, + "grad_norm": 1.5508331060409546, + "learning_rate": 4.190776972785684e-05, + "loss": 5.1219, + "step": 44320 + }, + { + "epoch": 0.2635895422970787, + "grad_norm": 1.5298925638198853, + "learning_rate": 4.190742565246455e-05, + "loss": 4.6232, + "step": 44321 + }, + { + "epoch": 0.26359548958035967, + "grad_norm": 1.3888485431671143, + "learning_rate": 4.1907081571170056e-05, + "loss": 4.9889, + "step": 44322 + }, + { + "epoch": 0.2636014368636407, + "grad_norm": 1.2898778915405273, + "learning_rate": 4.1906737483973475e-05, + "loss": 4.5643, + "step": 44323 + }, + { + "epoch": 0.2636073841469217, + "grad_norm": 1.253771185874939, + "learning_rate": 4.1906393390874924e-05, + "loss": 4.7725, + "step": 44324 + }, + { + "epoch": 0.26361333143020266, + "grad_norm": 1.553749442100525, + "learning_rate": 4.1906049291874525e-05, + "loss": 4.4265, + "step": 44325 + }, + { + "epoch": 0.2636192787134837, + "grad_norm": 1.6197221279144287, + "learning_rate": 4.19057051869724e-05, + "loss": 4.3423, + "step": 44326 + }, + { + "epoch": 0.2636252259967647, + "grad_norm": 1.4876919984817505, + "learning_rate": 4.190536107616867e-05, + "loss": 4.2909, + "step": 44327 + }, + { + "epoch": 0.26363117328004565, + "grad_norm": 1.589916467666626, + "learning_rate": 4.190501695946345e-05, + "loss": 4.0825, + "step": 44328 + }, + { + "epoch": 0.26363712056332667, + "grad_norm": 1.8253183364868164, + "learning_rate": 4.1904672836856865e-05, + "loss": 5.0671, + "step": 44329 + }, + { + "epoch": 0.2636430678466077, + "grad_norm": 1.5927884578704834, + "learning_rate": 4.190432870834903e-05, + "loss": 5.007, + "step": 44330 + }, + { + "epoch": 0.26364901512988864, + "grad_norm": 1.5164340734481812, + "learning_rate": 4.190398457394007e-05, + "loss": 5.3541, + "step": 44331 + }, + { + "epoch": 0.26365496241316966, + "grad_norm": 1.3955689668655396, + "learning_rate": 4.190364043363011e-05, + "loss": 5.3008, + "step": 44332 + }, + { + "epoch": 0.2636609096964507, + "grad_norm": 1.792508602142334, + "learning_rate": 4.190329628741925e-05, + "loss": 4.7917, + "step": 44333 + }, + { + "epoch": 0.26366685697973163, + "grad_norm": 1.550174355506897, + "learning_rate": 4.1902952135307635e-05, + "loss": 4.7754, + "step": 44334 + }, + { + "epoch": 0.26367280426301265, + "grad_norm": 1.7096366882324219, + "learning_rate": 4.190260797729537e-05, + "loss": 4.6674, + "step": 44335 + }, + { + "epoch": 0.26367875154629367, + "grad_norm": 1.5525634288787842, + "learning_rate": 4.1902263813382584e-05, + "loss": 4.6349, + "step": 44336 + }, + { + "epoch": 0.2636846988295746, + "grad_norm": 1.425101637840271, + "learning_rate": 4.190191964356939e-05, + "loss": 4.1127, + "step": 44337 + }, + { + "epoch": 0.26369064611285564, + "grad_norm": 2.2883384227752686, + "learning_rate": 4.190157546785591e-05, + "loss": 4.7449, + "step": 44338 + }, + { + "epoch": 0.26369659339613666, + "grad_norm": 3.6731979846954346, + "learning_rate": 4.190123128624226e-05, + "loss": 5.2709, + "step": 44339 + }, + { + "epoch": 0.2637025406794176, + "grad_norm": 2.7111990451812744, + "learning_rate": 4.190088709872857e-05, + "loss": 4.9095, + "step": 44340 + }, + { + "epoch": 0.26370848796269863, + "grad_norm": 1.6416468620300293, + "learning_rate": 4.190054290531495e-05, + "loss": 5.0892, + "step": 44341 + }, + { + "epoch": 0.26371443524597965, + "grad_norm": 1.8738130331039429, + "learning_rate": 4.190019870600153e-05, + "loss": 4.5611, + "step": 44342 + }, + { + "epoch": 0.2637203825292606, + "grad_norm": 1.7024880647659302, + "learning_rate": 4.1899854500788426e-05, + "loss": 4.4556, + "step": 44343 + }, + { + "epoch": 0.2637263298125416, + "grad_norm": 1.6708500385284424, + "learning_rate": 4.1899510289675756e-05, + "loss": 4.5168, + "step": 44344 + }, + { + "epoch": 0.26373227709582264, + "grad_norm": 1.6910686492919922, + "learning_rate": 4.1899166072663644e-05, + "loss": 4.6738, + "step": 44345 + }, + { + "epoch": 0.2637382243791036, + "grad_norm": 1.6861565113067627, + "learning_rate": 4.1898821849752204e-05, + "loss": 4.4949, + "step": 44346 + }, + { + "epoch": 0.2637441716623846, + "grad_norm": 1.595641851425171, + "learning_rate": 4.1898477620941553e-05, + "loss": 4.4812, + "step": 44347 + }, + { + "epoch": 0.26375011894566563, + "grad_norm": 1.5702672004699707, + "learning_rate": 4.189813338623183e-05, + "loss": 4.557, + "step": 44348 + }, + { + "epoch": 0.2637560662289466, + "grad_norm": 1.378595232963562, + "learning_rate": 4.189778914562313e-05, + "loss": 4.536, + "step": 44349 + }, + { + "epoch": 0.2637620135122276, + "grad_norm": 1.5845041275024414, + "learning_rate": 4.1897444899115604e-05, + "loss": 4.7077, + "step": 44350 + }, + { + "epoch": 0.2637679607955086, + "grad_norm": 1.7358351945877075, + "learning_rate": 4.189710064670934e-05, + "loss": 4.9487, + "step": 44351 + }, + { + "epoch": 0.2637739080787896, + "grad_norm": 1.4444407224655151, + "learning_rate": 4.189675638840448e-05, + "loss": 4.986, + "step": 44352 + }, + { + "epoch": 0.2637798553620706, + "grad_norm": 1.6772398948669434, + "learning_rate": 4.189641212420114e-05, + "loss": 4.3122, + "step": 44353 + }, + { + "epoch": 0.2637858026453516, + "grad_norm": 1.6764408349990845, + "learning_rate": 4.189606785409943e-05, + "loss": 4.5868, + "step": 44354 + }, + { + "epoch": 0.2637917499286326, + "grad_norm": 1.6065212488174438, + "learning_rate": 4.189572357809948e-05, + "loss": 5.1155, + "step": 44355 + }, + { + "epoch": 0.2637976972119136, + "grad_norm": 1.573319673538208, + "learning_rate": 4.189537929620141e-05, + "loss": 4.808, + "step": 44356 + }, + { + "epoch": 0.2638036444951946, + "grad_norm": 1.2701600790023804, + "learning_rate": 4.189503500840533e-05, + "loss": 4.7894, + "step": 44357 + }, + { + "epoch": 0.26380959177847557, + "grad_norm": 1.3029990196228027, + "learning_rate": 4.189469071471137e-05, + "loss": 4.8856, + "step": 44358 + }, + { + "epoch": 0.2638155390617566, + "grad_norm": 1.7397860288619995, + "learning_rate": 4.1894346415119655e-05, + "loss": 4.2945, + "step": 44359 + }, + { + "epoch": 0.2638214863450376, + "grad_norm": 1.621174693107605, + "learning_rate": 4.189400210963029e-05, + "loss": 4.2628, + "step": 44360 + }, + { + "epoch": 0.26382743362831856, + "grad_norm": 1.3935346603393555, + "learning_rate": 4.189365779824341e-05, + "loss": 4.3442, + "step": 44361 + }, + { + "epoch": 0.2638333809115996, + "grad_norm": 1.4850194454193115, + "learning_rate": 4.189331348095913e-05, + "loss": 4.8291, + "step": 44362 + }, + { + "epoch": 0.2638393281948806, + "grad_norm": 1.7516988515853882, + "learning_rate": 4.189296915777756e-05, + "loss": 4.7276, + "step": 44363 + }, + { + "epoch": 0.26384527547816156, + "grad_norm": 1.719159722328186, + "learning_rate": 4.1892624828698836e-05, + "loss": 4.39, + "step": 44364 + }, + { + "epoch": 0.26385122276144257, + "grad_norm": 1.599805474281311, + "learning_rate": 4.189228049372307e-05, + "loss": 4.3549, + "step": 44365 + }, + { + "epoch": 0.2638571700447236, + "grad_norm": 1.956787347793579, + "learning_rate": 4.189193615285038e-05, + "loss": 4.5801, + "step": 44366 + }, + { + "epoch": 0.26386311732800455, + "grad_norm": 1.787524938583374, + "learning_rate": 4.189159180608089e-05, + "loss": 4.6073, + "step": 44367 + }, + { + "epoch": 0.26386906461128556, + "grad_norm": 1.5580451488494873, + "learning_rate": 4.189124745341472e-05, + "loss": 4.4314, + "step": 44368 + }, + { + "epoch": 0.2638750118945666, + "grad_norm": 1.5949817895889282, + "learning_rate": 4.1890903094851993e-05, + "loss": 4.4212, + "step": 44369 + }, + { + "epoch": 0.26388095917784754, + "grad_norm": 1.6135327816009521, + "learning_rate": 4.1890558730392824e-05, + "loss": 4.7577, + "step": 44370 + }, + { + "epoch": 0.26388690646112856, + "grad_norm": 1.7452692985534668, + "learning_rate": 4.189021436003734e-05, + "loss": 4.4638, + "step": 44371 + }, + { + "epoch": 0.2638928537444096, + "grad_norm": 1.8316709995269775, + "learning_rate": 4.188986998378565e-05, + "loss": 4.4129, + "step": 44372 + }, + { + "epoch": 0.26389880102769053, + "grad_norm": 1.6061393022537231, + "learning_rate": 4.188952560163788e-05, + "loss": 4.3602, + "step": 44373 + }, + { + "epoch": 0.26390474831097155, + "grad_norm": 1.762807846069336, + "learning_rate": 4.1889181213594164e-05, + "loss": 4.3063, + "step": 44374 + }, + { + "epoch": 0.26391069559425256, + "grad_norm": 1.6644129753112793, + "learning_rate": 4.1888836819654596e-05, + "loss": 4.65, + "step": 44375 + }, + { + "epoch": 0.2639166428775335, + "grad_norm": 1.4112393856048584, + "learning_rate": 4.188849241981931e-05, + "loss": 4.8683, + "step": 44376 + }, + { + "epoch": 0.26392259016081454, + "grad_norm": 1.6447399854660034, + "learning_rate": 4.1888148014088436e-05, + "loss": 5.0925, + "step": 44377 + }, + { + "epoch": 0.26392853744409556, + "grad_norm": 1.3712302446365356, + "learning_rate": 4.1887803602462074e-05, + "loss": 4.8157, + "step": 44378 + }, + { + "epoch": 0.2639344847273765, + "grad_norm": 1.8014813661575317, + "learning_rate": 4.188745918494036e-05, + "loss": 4.3025, + "step": 44379 + }, + { + "epoch": 0.26394043201065753, + "grad_norm": 1.6901695728302002, + "learning_rate": 4.1887114761523405e-05, + "loss": 4.6897, + "step": 44380 + }, + { + "epoch": 0.26394637929393855, + "grad_norm": 1.5734832286834717, + "learning_rate": 4.1886770332211334e-05, + "loss": 4.8288, + "step": 44381 + }, + { + "epoch": 0.2639523265772195, + "grad_norm": 1.718942642211914, + "learning_rate": 4.188642589700427e-05, + "loss": 4.3847, + "step": 44382 + }, + { + "epoch": 0.2639582738605005, + "grad_norm": 2.046342611312866, + "learning_rate": 4.188608145590232e-05, + "loss": 3.8896, + "step": 44383 + }, + { + "epoch": 0.26396422114378154, + "grad_norm": 1.7381677627563477, + "learning_rate": 4.1885737008905615e-05, + "loss": 3.8917, + "step": 44384 + }, + { + "epoch": 0.2639701684270625, + "grad_norm": 1.7127550840377808, + "learning_rate": 4.188539255601428e-05, + "loss": 3.9891, + "step": 44385 + }, + { + "epoch": 0.2639761157103435, + "grad_norm": 1.6272212266921997, + "learning_rate": 4.188504809722843e-05, + "loss": 4.3116, + "step": 44386 + }, + { + "epoch": 0.26398206299362453, + "grad_norm": 1.7754982709884644, + "learning_rate": 4.188470363254817e-05, + "loss": 4.1257, + "step": 44387 + }, + { + "epoch": 0.2639880102769055, + "grad_norm": 1.816174864768982, + "learning_rate": 4.1884359161973655e-05, + "loss": 3.8787, + "step": 44388 + }, + { + "epoch": 0.2639939575601865, + "grad_norm": 1.909925103187561, + "learning_rate": 4.188401468550497e-05, + "loss": 3.85, + "step": 44389 + }, + { + "epoch": 0.2639999048434675, + "grad_norm": 1.6785274744033813, + "learning_rate": 4.1883670203142256e-05, + "loss": 4.0363, + "step": 44390 + }, + { + "epoch": 0.2640058521267485, + "grad_norm": 1.5531467199325562, + "learning_rate": 4.1883325714885624e-05, + "loss": 4.256, + "step": 44391 + }, + { + "epoch": 0.2640117994100295, + "grad_norm": 2.0373566150665283, + "learning_rate": 4.18829812207352e-05, + "loss": 5.0077, + "step": 44392 + }, + { + "epoch": 0.2640177466933105, + "grad_norm": 1.5275565385818481, + "learning_rate": 4.18826367206911e-05, + "loss": 4.2558, + "step": 44393 + }, + { + "epoch": 0.2640236939765915, + "grad_norm": 1.5591754913330078, + "learning_rate": 4.188229221475345e-05, + "loss": 4.6401, + "step": 44394 + }, + { + "epoch": 0.2640296412598725, + "grad_norm": 1.6134898662567139, + "learning_rate": 4.1881947702922365e-05, + "loss": 4.8254, + "step": 44395 + }, + { + "epoch": 0.2640355885431535, + "grad_norm": 1.9456785917282104, + "learning_rate": 4.188160318519796e-05, + "loss": 4.1935, + "step": 44396 + }, + { + "epoch": 0.26404153582643447, + "grad_norm": 2.2705562114715576, + "learning_rate": 4.1881258661580355e-05, + "loss": 4.5885, + "step": 44397 + }, + { + "epoch": 0.2640474831097155, + "grad_norm": 1.972326636314392, + "learning_rate": 4.18809141320697e-05, + "loss": 4.3105, + "step": 44398 + }, + { + "epoch": 0.2640534303929965, + "grad_norm": 1.7552393674850464, + "learning_rate": 4.1880569596666077e-05, + "loss": 4.4614, + "step": 44399 + }, + { + "epoch": 0.26405937767627746, + "grad_norm": 1.8272089958190918, + "learning_rate": 4.188022505536962e-05, + "loss": 4.5382, + "step": 44400 + }, + { + "epoch": 0.2640653249595585, + "grad_norm": 2.1998274326324463, + "learning_rate": 4.1879880508180456e-05, + "loss": 4.4205, + "step": 44401 + }, + { + "epoch": 0.2640712722428395, + "grad_norm": 2.2072410583496094, + "learning_rate": 4.1879535955098705e-05, + "loss": 4.1656, + "step": 44402 + }, + { + "epoch": 0.26407721952612045, + "grad_norm": 1.8476535081863403, + "learning_rate": 4.187919139612447e-05, + "loss": 4.1585, + "step": 44403 + }, + { + "epoch": 0.26408316680940147, + "grad_norm": 1.5991207361221313, + "learning_rate": 4.18788468312579e-05, + "loss": 4.7168, + "step": 44404 + }, + { + "epoch": 0.2640891140926825, + "grad_norm": 1.760554313659668, + "learning_rate": 4.187850226049909e-05, + "loss": 4.6142, + "step": 44405 + }, + { + "epoch": 0.26409506137596345, + "grad_norm": 1.4018532037734985, + "learning_rate": 4.1878157683848164e-05, + "loss": 4.8787, + "step": 44406 + }, + { + "epoch": 0.26410100865924446, + "grad_norm": 1.3053864240646362, + "learning_rate": 4.187781310130525e-05, + "loss": 4.8779, + "step": 44407 + }, + { + "epoch": 0.2641069559425255, + "grad_norm": 1.405696153640747, + "learning_rate": 4.187746851287047e-05, + "loss": 4.9943, + "step": 44408 + }, + { + "epoch": 0.26411290322580644, + "grad_norm": 1.636129379272461, + "learning_rate": 4.1877123918543936e-05, + "loss": 4.7575, + "step": 44409 + }, + { + "epoch": 0.26411885050908745, + "grad_norm": 1.5747100114822388, + "learning_rate": 4.187677931832578e-05, + "loss": 4.7824, + "step": 44410 + }, + { + "epoch": 0.26412479779236847, + "grad_norm": 1.462334394454956, + "learning_rate": 4.1876434712216104e-05, + "loss": 5.0156, + "step": 44411 + }, + { + "epoch": 0.26413074507564943, + "grad_norm": 1.629607915878296, + "learning_rate": 4.1876090100215046e-05, + "loss": 4.7052, + "step": 44412 + }, + { + "epoch": 0.26413669235893045, + "grad_norm": 1.6749529838562012, + "learning_rate": 4.187574548232272e-05, + "loss": 4.9159, + "step": 44413 + }, + { + "epoch": 0.26414263964221146, + "grad_norm": 1.6381701231002808, + "learning_rate": 4.187540085853924e-05, + "loss": 4.8699, + "step": 44414 + }, + { + "epoch": 0.2641485869254924, + "grad_norm": 1.5214076042175293, + "learning_rate": 4.187505622886474e-05, + "loss": 4.6886, + "step": 44415 + }, + { + "epoch": 0.26415453420877344, + "grad_norm": 2.884861469268799, + "learning_rate": 4.1874711593299326e-05, + "loss": 3.6172, + "step": 44416 + }, + { + "epoch": 0.26416048149205446, + "grad_norm": 1.608079433441162, + "learning_rate": 4.1874366951843124e-05, + "loss": 4.0651, + "step": 44417 + }, + { + "epoch": 0.2641664287753354, + "grad_norm": 1.62821626663208, + "learning_rate": 4.1874022304496255e-05, + "loss": 4.1528, + "step": 44418 + }, + { + "epoch": 0.26417237605861643, + "grad_norm": 1.438390851020813, + "learning_rate": 4.187367765125885e-05, + "loss": 4.06, + "step": 44419 + }, + { + "epoch": 0.26417832334189745, + "grad_norm": 1.6364163160324097, + "learning_rate": 4.1873332992131004e-05, + "loss": 4.7438, + "step": 44420 + }, + { + "epoch": 0.2641842706251784, + "grad_norm": 1.453660249710083, + "learning_rate": 4.187298832711285e-05, + "loss": 4.5426, + "step": 44421 + }, + { + "epoch": 0.2641902179084594, + "grad_norm": 1.5949327945709229, + "learning_rate": 4.187264365620452e-05, + "loss": 5.0288, + "step": 44422 + }, + { + "epoch": 0.26419616519174044, + "grad_norm": 1.7219691276550293, + "learning_rate": 4.187229897940612e-05, + "loss": 4.6348, + "step": 44423 + }, + { + "epoch": 0.2642021124750214, + "grad_norm": 2.138735294342041, + "learning_rate": 4.187195429671778e-05, + "loss": 4.566, + "step": 44424 + }, + { + "epoch": 0.2642080597583024, + "grad_norm": 2.2517752647399902, + "learning_rate": 4.1871609608139616e-05, + "loss": 4.6505, + "step": 44425 + }, + { + "epoch": 0.26421400704158343, + "grad_norm": 1.7850415706634521, + "learning_rate": 4.1871264913671735e-05, + "loss": 4.4805, + "step": 44426 + }, + { + "epoch": 0.2642199543248644, + "grad_norm": 1.4031089544296265, + "learning_rate": 4.187092021331428e-05, + "loss": 4.14, + "step": 44427 + }, + { + "epoch": 0.2642259016081454, + "grad_norm": 1.1059420108795166, + "learning_rate": 4.187057550706737e-05, + "loss": 4.1736, + "step": 44428 + }, + { + "epoch": 0.2642318488914264, + "grad_norm": 1.5494786500930786, + "learning_rate": 4.18702307949311e-05, + "loss": 4.8708, + "step": 44429 + }, + { + "epoch": 0.2642377961747074, + "grad_norm": 3.1284420490264893, + "learning_rate": 4.186988607690561e-05, + "loss": 3.4627, + "step": 44430 + }, + { + "epoch": 0.2642437434579884, + "grad_norm": 1.7745755910873413, + "learning_rate": 4.1869541352991015e-05, + "loss": 4.8121, + "step": 44431 + }, + { + "epoch": 0.26424969074126936, + "grad_norm": 1.301859974861145, + "learning_rate": 4.1869196623187447e-05, + "loss": 5.141, + "step": 44432 + }, + { + "epoch": 0.2642556380245504, + "grad_norm": 1.6684423685073853, + "learning_rate": 4.186885188749501e-05, + "loss": 4.897, + "step": 44433 + }, + { + "epoch": 0.2642615853078314, + "grad_norm": 2.1292169094085693, + "learning_rate": 4.1868507145913826e-05, + "loss": 3.579, + "step": 44434 + }, + { + "epoch": 0.26426753259111235, + "grad_norm": 2.0009758472442627, + "learning_rate": 4.186816239844403e-05, + "loss": 3.7077, + "step": 44435 + }, + { + "epoch": 0.26427347987439337, + "grad_norm": 1.4679924249649048, + "learning_rate": 4.186781764508573e-05, + "loss": 4.7541, + "step": 44436 + }, + { + "epoch": 0.2642794271576744, + "grad_norm": 1.4512333869934082, + "learning_rate": 4.1867472885839045e-05, + "loss": 4.698, + "step": 44437 + }, + { + "epoch": 0.26428537444095535, + "grad_norm": 1.5124703645706177, + "learning_rate": 4.18671281207041e-05, + "loss": 4.7864, + "step": 44438 + }, + { + "epoch": 0.26429132172423636, + "grad_norm": 1.3340590000152588, + "learning_rate": 4.186678334968103e-05, + "loss": 4.7641, + "step": 44439 + }, + { + "epoch": 0.2642972690075174, + "grad_norm": 2.103215456008911, + "learning_rate": 4.186643857276992e-05, + "loss": 3.6378, + "step": 44440 + }, + { + "epoch": 0.26430321629079834, + "grad_norm": 2.651094913482666, + "learning_rate": 4.1866093789970916e-05, + "loss": 2.9295, + "step": 44441 + }, + { + "epoch": 0.26430916357407935, + "grad_norm": 1.5863869190216064, + "learning_rate": 4.186574900128414e-05, + "loss": 5.0737, + "step": 44442 + }, + { + "epoch": 0.26431511085736037, + "grad_norm": 1.630387783050537, + "learning_rate": 4.18654042067097e-05, + "loss": 4.5533, + "step": 44443 + }, + { + "epoch": 0.26432105814064133, + "grad_norm": 1.591261863708496, + "learning_rate": 4.186505940624772e-05, + "loss": 4.8375, + "step": 44444 + }, + { + "epoch": 0.26432700542392235, + "grad_norm": 1.881212592124939, + "learning_rate": 4.186471459989833e-05, + "loss": 4.2641, + "step": 44445 + }, + { + "epoch": 0.26433295270720336, + "grad_norm": 1.9205832481384277, + "learning_rate": 4.186436978766164e-05, + "loss": 3.8533, + "step": 44446 + }, + { + "epoch": 0.2643388999904843, + "grad_norm": 1.6119612455368042, + "learning_rate": 4.186402496953776e-05, + "loss": 4.6785, + "step": 44447 + }, + { + "epoch": 0.26434484727376534, + "grad_norm": 1.5099133253097534, + "learning_rate": 4.1863680145526836e-05, + "loss": 4.9522, + "step": 44448 + }, + { + "epoch": 0.26435079455704635, + "grad_norm": 1.7052550315856934, + "learning_rate": 4.186333531562897e-05, + "loss": 5.1073, + "step": 44449 + }, + { + "epoch": 0.2643567418403273, + "grad_norm": 1.7110322713851929, + "learning_rate": 4.186299047984429e-05, + "loss": 5.0528, + "step": 44450 + }, + { + "epoch": 0.26436268912360833, + "grad_norm": 1.8642323017120361, + "learning_rate": 4.1862645638172916e-05, + "loss": 4.1086, + "step": 44451 + }, + { + "epoch": 0.26436863640688935, + "grad_norm": 1.9081451892852783, + "learning_rate": 4.186230079061496e-05, + "loss": 3.8863, + "step": 44452 + }, + { + "epoch": 0.2643745836901703, + "grad_norm": 2.1499578952789307, + "learning_rate": 4.186195593717056e-05, + "loss": 4.202, + "step": 44453 + }, + { + "epoch": 0.2643805309734513, + "grad_norm": 2.057044267654419, + "learning_rate": 4.1861611077839816e-05, + "loss": 4.8614, + "step": 44454 + }, + { + "epoch": 0.26438647825673234, + "grad_norm": 1.6361968517303467, + "learning_rate": 4.186126621262286e-05, + "loss": 4.3973, + "step": 44455 + }, + { + "epoch": 0.2643924255400133, + "grad_norm": 1.5054078102111816, + "learning_rate": 4.186092134151981e-05, + "loss": 4.7559, + "step": 44456 + }, + { + "epoch": 0.2643983728232943, + "grad_norm": 1.5899674892425537, + "learning_rate": 4.186057646453079e-05, + "loss": 4.8936, + "step": 44457 + }, + { + "epoch": 0.26440432010657533, + "grad_norm": 1.5167386531829834, + "learning_rate": 4.186023158165592e-05, + "loss": 4.7519, + "step": 44458 + }, + { + "epoch": 0.2644102673898563, + "grad_norm": 1.6306471824645996, + "learning_rate": 4.18598866928953e-05, + "loss": 4.7858, + "step": 44459 + }, + { + "epoch": 0.2644162146731373, + "grad_norm": 1.7253559827804565, + "learning_rate": 4.185954179824909e-05, + "loss": 4.795, + "step": 44460 + }, + { + "epoch": 0.2644221619564183, + "grad_norm": 1.80239737033844, + "learning_rate": 4.1859196897717376e-05, + "loss": 4.215, + "step": 44461 + }, + { + "epoch": 0.2644281092396993, + "grad_norm": 1.6273021697998047, + "learning_rate": 4.185885199130029e-05, + "loss": 4.6437, + "step": 44462 + }, + { + "epoch": 0.2644340565229803, + "grad_norm": 1.6760021448135376, + "learning_rate": 4.185850707899796e-05, + "loss": 3.8325, + "step": 44463 + }, + { + "epoch": 0.2644400038062613, + "grad_norm": 2.166614294052124, + "learning_rate": 4.185816216081049e-05, + "loss": 3.5507, + "step": 44464 + }, + { + "epoch": 0.2644459510895423, + "grad_norm": 1.9144881963729858, + "learning_rate": 4.185781723673801e-05, + "loss": 4.848, + "step": 44465 + }, + { + "epoch": 0.2644518983728233, + "grad_norm": 1.7374454736709595, + "learning_rate": 4.1857472306780656e-05, + "loss": 4.8642, + "step": 44466 + }, + { + "epoch": 0.2644578456561043, + "grad_norm": 1.769124984741211, + "learning_rate": 4.1857127370938506e-05, + "loss": 4.7724, + "step": 44467 + }, + { + "epoch": 0.26446379293938527, + "grad_norm": 1.6731394529342651, + "learning_rate": 4.1856782429211726e-05, + "loss": 4.6569, + "step": 44468 + }, + { + "epoch": 0.2644697402226663, + "grad_norm": 1.5239163637161255, + "learning_rate": 4.185643748160042e-05, + "loss": 4.5378, + "step": 44469 + }, + { + "epoch": 0.2644756875059473, + "grad_norm": 1.6668143272399902, + "learning_rate": 4.18560925281047e-05, + "loss": 4.9803, + "step": 44470 + }, + { + "epoch": 0.26448163478922826, + "grad_norm": 1.8222264051437378, + "learning_rate": 4.185574756872469e-05, + "loss": 5.0216, + "step": 44471 + }, + { + "epoch": 0.2644875820725093, + "grad_norm": 1.5379369258880615, + "learning_rate": 4.185540260346052e-05, + "loss": 4.4671, + "step": 44472 + }, + { + "epoch": 0.2644935293557903, + "grad_norm": 1.6178473234176636, + "learning_rate": 4.185505763231229e-05, + "loss": 3.9988, + "step": 44473 + }, + { + "epoch": 0.26449947663907125, + "grad_norm": 1.7923990488052368, + "learning_rate": 4.1854712655280135e-05, + "loss": 4.5763, + "step": 44474 + }, + { + "epoch": 0.26450542392235227, + "grad_norm": 1.7223914861679077, + "learning_rate": 4.185436767236419e-05, + "loss": 4.6339, + "step": 44475 + }, + { + "epoch": 0.2645113712056333, + "grad_norm": 1.6696316003799438, + "learning_rate": 4.185402268356454e-05, + "loss": 4.8198, + "step": 44476 + }, + { + "epoch": 0.26451731848891424, + "grad_norm": 1.4479647874832153, + "learning_rate": 4.185367768888134e-05, + "loss": 4.9223, + "step": 44477 + }, + { + "epoch": 0.26452326577219526, + "grad_norm": 1.7012540102005005, + "learning_rate": 4.185333268831469e-05, + "loss": 4.7951, + "step": 44478 + }, + { + "epoch": 0.2645292130554763, + "grad_norm": 1.8000649213790894, + "learning_rate": 4.1852987681864706e-05, + "loss": 4.919, + "step": 44479 + }, + { + "epoch": 0.26453516033875724, + "grad_norm": 1.7648906707763672, + "learning_rate": 4.1852642669531536e-05, + "loss": 4.7654, + "step": 44480 + }, + { + "epoch": 0.26454110762203825, + "grad_norm": 1.7443323135375977, + "learning_rate": 4.185229765131527e-05, + "loss": 4.8655, + "step": 44481 + }, + { + "epoch": 0.26454705490531927, + "grad_norm": 1.8035615682601929, + "learning_rate": 4.1851952627216046e-05, + "loss": 5.0288, + "step": 44482 + }, + { + "epoch": 0.26455300218860023, + "grad_norm": 1.4905991554260254, + "learning_rate": 4.1851607597233976e-05, + "loss": 5.1976, + "step": 44483 + }, + { + "epoch": 0.26455894947188124, + "grad_norm": 1.5323477983474731, + "learning_rate": 4.185126256136919e-05, + "loss": 4.4726, + "step": 44484 + }, + { + "epoch": 0.26456489675516226, + "grad_norm": 1.3365819454193115, + "learning_rate": 4.18509175196218e-05, + "loss": 4.7939, + "step": 44485 + }, + { + "epoch": 0.2645708440384432, + "grad_norm": 1.46924889087677, + "learning_rate": 4.1850572471991924e-05, + "loss": 5.3312, + "step": 44486 + }, + { + "epoch": 0.26457679132172424, + "grad_norm": 1.4705312252044678, + "learning_rate": 4.185022741847969e-05, + "loss": 5.0508, + "step": 44487 + }, + { + "epoch": 0.26458273860500525, + "grad_norm": 1.7894493341445923, + "learning_rate": 4.184988235908521e-05, + "loss": 4.5773, + "step": 44488 + }, + { + "epoch": 0.2645886858882862, + "grad_norm": 1.6382434368133545, + "learning_rate": 4.184953729380862e-05, + "loss": 4.7888, + "step": 44489 + }, + { + "epoch": 0.26459463317156723, + "grad_norm": 1.145738124847412, + "learning_rate": 4.184919222265003e-05, + "loss": 5.0777, + "step": 44490 + }, + { + "epoch": 0.26460058045484824, + "grad_norm": 1.6631017923355103, + "learning_rate": 4.184884714560955e-05, + "loss": 4.924, + "step": 44491 + }, + { + "epoch": 0.2646065277381292, + "grad_norm": 1.6619197130203247, + "learning_rate": 4.1848502062687326e-05, + "loss": 4.8674, + "step": 44492 + }, + { + "epoch": 0.2646124750214102, + "grad_norm": 1.5310927629470825, + "learning_rate": 4.184815697388346e-05, + "loss": 5.0818, + "step": 44493 + }, + { + "epoch": 0.26461842230469124, + "grad_norm": 1.592101812362671, + "learning_rate": 4.1847811879198074e-05, + "loss": 4.8197, + "step": 44494 + }, + { + "epoch": 0.2646243695879722, + "grad_norm": 1.492659330368042, + "learning_rate": 4.1847466778631285e-05, + "loss": 4.9084, + "step": 44495 + }, + { + "epoch": 0.2646303168712532, + "grad_norm": 1.2908282279968262, + "learning_rate": 4.184712167218323e-05, + "loss": 5.1877, + "step": 44496 + }, + { + "epoch": 0.26463626415453423, + "grad_norm": 1.315101981163025, + "learning_rate": 4.184677655985401e-05, + "loss": 4.4279, + "step": 44497 + }, + { + "epoch": 0.2646422114378152, + "grad_norm": 1.3789472579956055, + "learning_rate": 4.1846431441643756e-05, + "loss": 4.474, + "step": 44498 + }, + { + "epoch": 0.2646481587210962, + "grad_norm": 1.3357831239700317, + "learning_rate": 4.184608631755259e-05, + "loss": 4.7651, + "step": 44499 + }, + { + "epoch": 0.2646541060043772, + "grad_norm": 1.9647319316864014, + "learning_rate": 4.184574118758063e-05, + "loss": 4.075, + "step": 44500 + }, + { + "epoch": 0.2646600532876582, + "grad_norm": 1.6944422721862793, + "learning_rate": 4.184539605172799e-05, + "loss": 4.3851, + "step": 44501 + }, + { + "epoch": 0.2646660005709392, + "grad_norm": 1.4939371347427368, + "learning_rate": 4.1845050909994804e-05, + "loss": 5.0137, + "step": 44502 + }, + { + "epoch": 0.2646719478542202, + "grad_norm": 1.4747744798660278, + "learning_rate": 4.184470576238118e-05, + "loss": 5.1392, + "step": 44503 + }, + { + "epoch": 0.2646778951375012, + "grad_norm": 1.6428667306900024, + "learning_rate": 4.184436060888725e-05, + "loss": 5.2413, + "step": 44504 + }, + { + "epoch": 0.2646838424207822, + "grad_norm": 1.8096879720687866, + "learning_rate": 4.1844015449513116e-05, + "loss": 4.608, + "step": 44505 + }, + { + "epoch": 0.2646897897040632, + "grad_norm": 1.776768684387207, + "learning_rate": 4.1843670284258916e-05, + "loss": 4.6171, + "step": 44506 + }, + { + "epoch": 0.26469573698734417, + "grad_norm": 1.7841585874557495, + "learning_rate": 4.184332511312476e-05, + "loss": 4.7407, + "step": 44507 + }, + { + "epoch": 0.2647016842706252, + "grad_norm": 1.585642695426941, + "learning_rate": 4.184297993611079e-05, + "loss": 4.9116, + "step": 44508 + }, + { + "epoch": 0.2647076315539062, + "grad_norm": 1.4495071172714233, + "learning_rate": 4.1842634753217094e-05, + "loss": 4.8139, + "step": 44509 + }, + { + "epoch": 0.26471357883718716, + "grad_norm": 1.4647594690322876, + "learning_rate": 4.184228956444382e-05, + "loss": 4.8523, + "step": 44510 + }, + { + "epoch": 0.2647195261204682, + "grad_norm": 1.4046803712844849, + "learning_rate": 4.184194436979106e-05, + "loss": 4.5684, + "step": 44511 + }, + { + "epoch": 0.2647254734037492, + "grad_norm": 1.3598742485046387, + "learning_rate": 4.184159916925896e-05, + "loss": 4.9112, + "step": 44512 + }, + { + "epoch": 0.26473142068703015, + "grad_norm": 1.4518396854400635, + "learning_rate": 4.184125396284764e-05, + "loss": 4.9966, + "step": 44513 + }, + { + "epoch": 0.26473736797031117, + "grad_norm": 1.2945561408996582, + "learning_rate": 4.184090875055719e-05, + "loss": 5.0988, + "step": 44514 + }, + { + "epoch": 0.2647433152535922, + "grad_norm": 1.4905147552490234, + "learning_rate": 4.184056353238778e-05, + "loss": 5.0976, + "step": 44515 + }, + { + "epoch": 0.26474926253687314, + "grad_norm": 1.884575605392456, + "learning_rate": 4.1840218308339484e-05, + "loss": 4.8548, + "step": 44516 + }, + { + "epoch": 0.26475520982015416, + "grad_norm": 1.3827574253082275, + "learning_rate": 4.183987307841245e-05, + "loss": 5.0264, + "step": 44517 + }, + { + "epoch": 0.2647611571034352, + "grad_norm": 1.2933017015457153, + "learning_rate": 4.1839527842606785e-05, + "loss": 5.0778, + "step": 44518 + }, + { + "epoch": 0.26476710438671613, + "grad_norm": 1.2319328784942627, + "learning_rate": 4.183918260092262e-05, + "loss": 5.1451, + "step": 44519 + }, + { + "epoch": 0.26477305166999715, + "grad_norm": 1.5795148611068726, + "learning_rate": 4.183883735336007e-05, + "loss": 4.6384, + "step": 44520 + }, + { + "epoch": 0.26477899895327817, + "grad_norm": 1.3848471641540527, + "learning_rate": 4.183849209991925e-05, + "loss": 4.9044, + "step": 44521 + }, + { + "epoch": 0.2647849462365591, + "grad_norm": 1.311155915260315, + "learning_rate": 4.183814684060029e-05, + "loss": 5.4899, + "step": 44522 + }, + { + "epoch": 0.26479089351984014, + "grad_norm": 1.3258235454559326, + "learning_rate": 4.1837801575403315e-05, + "loss": 5.2376, + "step": 44523 + }, + { + "epoch": 0.26479684080312116, + "grad_norm": 1.6703416109085083, + "learning_rate": 4.183745630432843e-05, + "loss": 4.9581, + "step": 44524 + }, + { + "epoch": 0.2648027880864021, + "grad_norm": 1.4993889331817627, + "learning_rate": 4.1837111027375756e-05, + "loss": 4.6822, + "step": 44525 + }, + { + "epoch": 0.26480873536968313, + "grad_norm": 1.8015929460525513, + "learning_rate": 4.183676574454543e-05, + "loss": 4.3205, + "step": 44526 + }, + { + "epoch": 0.26481468265296415, + "grad_norm": 1.4305357933044434, + "learning_rate": 4.183642045583756e-05, + "loss": 5.0792, + "step": 44527 + }, + { + "epoch": 0.2648206299362451, + "grad_norm": 1.3541231155395508, + "learning_rate": 4.1836075161252273e-05, + "loss": 5.5398, + "step": 44528 + }, + { + "epoch": 0.2648265772195261, + "grad_norm": 1.4076385498046875, + "learning_rate": 4.1835729860789687e-05, + "loss": 5.5443, + "step": 44529 + }, + { + "epoch": 0.26483252450280714, + "grad_norm": 1.5057895183563232, + "learning_rate": 4.183538455444991e-05, + "loss": 4.9106, + "step": 44530 + }, + { + "epoch": 0.2648384717860881, + "grad_norm": 1.4861382246017456, + "learning_rate": 4.183503924223309e-05, + "loss": 5.04, + "step": 44531 + }, + { + "epoch": 0.2648444190693691, + "grad_norm": 1.7882814407348633, + "learning_rate": 4.183469392413932e-05, + "loss": 5.1171, + "step": 44532 + }, + { + "epoch": 0.26485036635265014, + "grad_norm": 1.5043587684631348, + "learning_rate": 4.183434860016874e-05, + "loss": 5.0188, + "step": 44533 + }, + { + "epoch": 0.2648563136359311, + "grad_norm": 1.2780033349990845, + "learning_rate": 4.183400327032145e-05, + "loss": 5.151, + "step": 44534 + }, + { + "epoch": 0.2648622609192121, + "grad_norm": 1.3413909673690796, + "learning_rate": 4.1833657934597595e-05, + "loss": 4.9041, + "step": 44535 + }, + { + "epoch": 0.2648682082024931, + "grad_norm": 1.7962464094161987, + "learning_rate": 4.183331259299728e-05, + "loss": 4.9223, + "step": 44536 + }, + { + "epoch": 0.2648741554857741, + "grad_norm": 1.782593846321106, + "learning_rate": 4.183296724552063e-05, + "loss": 4.5413, + "step": 44537 + }, + { + "epoch": 0.2648801027690551, + "grad_norm": 1.538835048675537, + "learning_rate": 4.183262189216777e-05, + "loss": 4.686, + "step": 44538 + }, + { + "epoch": 0.2648860500523361, + "grad_norm": 1.5429004430770874, + "learning_rate": 4.183227653293881e-05, + "loss": 4.6442, + "step": 44539 + }, + { + "epoch": 0.2648919973356171, + "grad_norm": 1.6414607763290405, + "learning_rate": 4.183193116783388e-05, + "loss": 4.8771, + "step": 44540 + }, + { + "epoch": 0.2648979446188981, + "grad_norm": 1.563875436782837, + "learning_rate": 4.183158579685309e-05, + "loss": 4.7209, + "step": 44541 + }, + { + "epoch": 0.2649038919021791, + "grad_norm": 1.4140207767486572, + "learning_rate": 4.1831240419996574e-05, + "loss": 4.8934, + "step": 44542 + }, + { + "epoch": 0.26490983918546007, + "grad_norm": 1.2576130628585815, + "learning_rate": 4.183089503726444e-05, + "loss": 4.5912, + "step": 44543 + }, + { + "epoch": 0.2649157864687411, + "grad_norm": 1.458964228630066, + "learning_rate": 4.183054964865683e-05, + "loss": 5.3954, + "step": 44544 + }, + { + "epoch": 0.2649217337520221, + "grad_norm": 1.256784439086914, + "learning_rate": 4.183020425417382e-05, + "loss": 5.4978, + "step": 44545 + }, + { + "epoch": 0.26492768103530306, + "grad_norm": 1.2958076000213623, + "learning_rate": 4.182985885381558e-05, + "loss": 5.4385, + "step": 44546 + }, + { + "epoch": 0.2649336283185841, + "grad_norm": 1.3461213111877441, + "learning_rate": 4.182951344758221e-05, + "loss": 5.0082, + "step": 44547 + }, + { + "epoch": 0.26493957560186504, + "grad_norm": 1.3739278316497803, + "learning_rate": 4.182916803547382e-05, + "loss": 5.0632, + "step": 44548 + }, + { + "epoch": 0.26494552288514606, + "grad_norm": 1.2561041116714478, + "learning_rate": 4.182882261749056e-05, + "loss": 4.9871, + "step": 44549 + }, + { + "epoch": 0.2649514701684271, + "grad_norm": 1.318873643875122, + "learning_rate": 4.1828477193632516e-05, + "loss": 5.2308, + "step": 44550 + }, + { + "epoch": 0.26495741745170803, + "grad_norm": 1.4355168342590332, + "learning_rate": 4.1828131763899826e-05, + "loss": 4.7181, + "step": 44551 + }, + { + "epoch": 0.26496336473498905, + "grad_norm": 1.3671860694885254, + "learning_rate": 4.182778632829262e-05, + "loss": 4.917, + "step": 44552 + }, + { + "epoch": 0.26496931201827006, + "grad_norm": 1.4364533424377441, + "learning_rate": 4.1827440886811e-05, + "loss": 4.7978, + "step": 44553 + }, + { + "epoch": 0.264975259301551, + "grad_norm": 1.3228423595428467, + "learning_rate": 4.182709543945509e-05, + "loss": 4.8869, + "step": 44554 + }, + { + "epoch": 0.26498120658483204, + "grad_norm": 1.244361400604248, + "learning_rate": 4.1826749986225016e-05, + "loss": 4.6875, + "step": 44555 + }, + { + "epoch": 0.26498715386811306, + "grad_norm": 1.5338916778564453, + "learning_rate": 4.1826404527120896e-05, + "loss": 4.7743, + "step": 44556 + }, + { + "epoch": 0.264993101151394, + "grad_norm": 1.5648335218429565, + "learning_rate": 4.182605906214286e-05, + "loss": 4.7188, + "step": 44557 + }, + { + "epoch": 0.26499904843467503, + "grad_norm": 1.456516146659851, + "learning_rate": 4.182571359129102e-05, + "loss": 4.7477, + "step": 44558 + }, + { + "epoch": 0.26500499571795605, + "grad_norm": 1.7858186960220337, + "learning_rate": 4.182536811456549e-05, + "loss": 4.8591, + "step": 44559 + }, + { + "epoch": 0.265010943001237, + "grad_norm": 1.4679070711135864, + "learning_rate": 4.1825022631966403e-05, + "loss": 5.0243, + "step": 44560 + }, + { + "epoch": 0.265016890284518, + "grad_norm": 1.5250959396362305, + "learning_rate": 4.1824677143493876e-05, + "loss": 5.1644, + "step": 44561 + }, + { + "epoch": 0.26502283756779904, + "grad_norm": 1.281774878501892, + "learning_rate": 4.1824331649148016e-05, + "loss": 5.0672, + "step": 44562 + }, + { + "epoch": 0.26502878485108, + "grad_norm": 1.3179807662963867, + "learning_rate": 4.182398614892897e-05, + "loss": 4.6461, + "step": 44563 + }, + { + "epoch": 0.265034732134361, + "grad_norm": 1.716835379600525, + "learning_rate": 4.182364064283684e-05, + "loss": 4.167, + "step": 44564 + }, + { + "epoch": 0.26504067941764203, + "grad_norm": 1.3578506708145142, + "learning_rate": 4.182329513087176e-05, + "loss": 4.404, + "step": 44565 + }, + { + "epoch": 0.265046626700923, + "grad_norm": 1.4736641645431519, + "learning_rate": 4.182294961303382e-05, + "loss": 4.4839, + "step": 44566 + }, + { + "epoch": 0.265052573984204, + "grad_norm": 1.3891223669052124, + "learning_rate": 4.1822604089323174e-05, + "loss": 4.6377, + "step": 44567 + }, + { + "epoch": 0.265058521267485, + "grad_norm": 1.2743990421295166, + "learning_rate": 4.182225855973993e-05, + "loss": 5.2828, + "step": 44568 + }, + { + "epoch": 0.265064468550766, + "grad_norm": 1.5238193273544312, + "learning_rate": 4.182191302428421e-05, + "loss": 4.5825, + "step": 44569 + }, + { + "epoch": 0.265070415834047, + "grad_norm": 1.7793322801589966, + "learning_rate": 4.182156748295614e-05, + "loss": 3.8015, + "step": 44570 + }, + { + "epoch": 0.265076363117328, + "grad_norm": 1.6769336462020874, + "learning_rate": 4.1821221935755816e-05, + "loss": 4.5106, + "step": 44571 + }, + { + "epoch": 0.265082310400609, + "grad_norm": 2.0601863861083984, + "learning_rate": 4.182087638268339e-05, + "loss": 3.4267, + "step": 44572 + }, + { + "epoch": 0.26508825768389, + "grad_norm": 1.60547935962677, + "learning_rate": 4.182053082373897e-05, + "loss": 4.7883, + "step": 44573 + }, + { + "epoch": 0.265094204967171, + "grad_norm": 1.441867709159851, + "learning_rate": 4.182018525892268e-05, + "loss": 4.589, + "step": 44574 + }, + { + "epoch": 0.26510015225045197, + "grad_norm": 1.704620599746704, + "learning_rate": 4.181983968823463e-05, + "loss": 3.9997, + "step": 44575 + }, + { + "epoch": 0.265106099533733, + "grad_norm": 1.8227137327194214, + "learning_rate": 4.181949411167495e-05, + "loss": 4.2816, + "step": 44576 + }, + { + "epoch": 0.265112046817014, + "grad_norm": 1.5894221067428589, + "learning_rate": 4.181914852924376e-05, + "loss": 4.9875, + "step": 44577 + }, + { + "epoch": 0.26511799410029496, + "grad_norm": 2.002525568008423, + "learning_rate": 4.181880294094118e-05, + "loss": 4.8029, + "step": 44578 + }, + { + "epoch": 0.265123941383576, + "grad_norm": 1.9520806074142456, + "learning_rate": 4.181845734676733e-05, + "loss": 4.3401, + "step": 44579 + }, + { + "epoch": 0.265129888666857, + "grad_norm": 1.9992318153381348, + "learning_rate": 4.181811174672232e-05, + "loss": 3.7063, + "step": 44580 + }, + { + "epoch": 0.26513583595013795, + "grad_norm": 1.5485553741455078, + "learning_rate": 4.181776614080629e-05, + "loss": 4.5387, + "step": 44581 + }, + { + "epoch": 0.26514178323341897, + "grad_norm": 1.4552894830703735, + "learning_rate": 4.181742052901935e-05, + "loss": 4.6612, + "step": 44582 + }, + { + "epoch": 0.2651477305167, + "grad_norm": 1.5455929040908813, + "learning_rate": 4.181707491136163e-05, + "loss": 4.3696, + "step": 44583 + }, + { + "epoch": 0.26515367779998095, + "grad_norm": 1.873120665550232, + "learning_rate": 4.1816729287833235e-05, + "loss": 4.2884, + "step": 44584 + }, + { + "epoch": 0.26515962508326196, + "grad_norm": 1.4713197946548462, + "learning_rate": 4.181638365843429e-05, + "loss": 4.808, + "step": 44585 + }, + { + "epoch": 0.265165572366543, + "grad_norm": 1.7079193592071533, + "learning_rate": 4.181603802316492e-05, + "loss": 3.8387, + "step": 44586 + }, + { + "epoch": 0.26517151964982394, + "grad_norm": 1.9637492895126343, + "learning_rate": 4.1815692382025254e-05, + "loss": 3.7513, + "step": 44587 + }, + { + "epoch": 0.26517746693310495, + "grad_norm": 1.7298352718353271, + "learning_rate": 4.181534673501539e-05, + "loss": 4.5503, + "step": 44588 + }, + { + "epoch": 0.26518341421638597, + "grad_norm": 2.033831834793091, + "learning_rate": 4.181500108213547e-05, + "loss": 3.1096, + "step": 44589 + }, + { + "epoch": 0.26518936149966693, + "grad_norm": 1.9345415830612183, + "learning_rate": 4.1814655423385605e-05, + "loss": 3.2854, + "step": 44590 + }, + { + "epoch": 0.26519530878294795, + "grad_norm": 2.169245719909668, + "learning_rate": 4.181430975876592e-05, + "loss": 3.136, + "step": 44591 + }, + { + "epoch": 0.26520125606622896, + "grad_norm": 2.6305088996887207, + "learning_rate": 4.181396408827654e-05, + "loss": 2.9795, + "step": 44592 + }, + { + "epoch": 0.2652072033495099, + "grad_norm": 2.072291851043701, + "learning_rate": 4.181361841191757e-05, + "loss": 3.0023, + "step": 44593 + }, + { + "epoch": 0.26521315063279094, + "grad_norm": 2.3233275413513184, + "learning_rate": 4.181327272968914e-05, + "loss": 2.9775, + "step": 44594 + }, + { + "epoch": 0.26521909791607196, + "grad_norm": 2.4132814407348633, + "learning_rate": 4.1812927041591365e-05, + "loss": 2.9042, + "step": 44595 + }, + { + "epoch": 0.2652250451993529, + "grad_norm": 2.194164991378784, + "learning_rate": 4.181258134762438e-05, + "loss": 2.9668, + "step": 44596 + }, + { + "epoch": 0.26523099248263393, + "grad_norm": 2.4457786083221436, + "learning_rate": 4.1812235647788296e-05, + "loss": 2.9496, + "step": 44597 + }, + { + "epoch": 0.26523693976591495, + "grad_norm": 2.2278661727905273, + "learning_rate": 4.1811889942083226e-05, + "loss": 3.0122, + "step": 44598 + }, + { + "epoch": 0.2652428870491959, + "grad_norm": 2.3289971351623535, + "learning_rate": 4.18115442305093e-05, + "loss": 3.2484, + "step": 44599 + }, + { + "epoch": 0.2652488343324769, + "grad_norm": 2.568901538848877, + "learning_rate": 4.181119851306665e-05, + "loss": 3.0203, + "step": 44600 + }, + { + "epoch": 0.26525478161575794, + "grad_norm": 2.501455545425415, + "learning_rate": 4.181085278975537e-05, + "loss": 2.8977, + "step": 44601 + }, + { + "epoch": 0.2652607288990389, + "grad_norm": 2.6667025089263916, + "learning_rate": 4.181050706057561e-05, + "loss": 2.8247, + "step": 44602 + }, + { + "epoch": 0.2652666761823199, + "grad_norm": 2.3176016807556152, + "learning_rate": 4.181016132552746e-05, + "loss": 2.9341, + "step": 44603 + }, + { + "epoch": 0.26527262346560093, + "grad_norm": 2.3285279273986816, + "learning_rate": 4.180981558461107e-05, + "loss": 3.6854, + "step": 44604 + }, + { + "epoch": 0.2652785707488819, + "grad_norm": 2.32017183303833, + "learning_rate": 4.1809469837826535e-05, + "loss": 3.5191, + "step": 44605 + }, + { + "epoch": 0.2652845180321629, + "grad_norm": 2.3273637294769287, + "learning_rate": 4.180912408517399e-05, + "loss": 3.1213, + "step": 44606 + }, + { + "epoch": 0.2652904653154439, + "grad_norm": 2.5718185901641846, + "learning_rate": 4.1808778326653565e-05, + "loss": 3.183, + "step": 44607 + }, + { + "epoch": 0.2652964125987249, + "grad_norm": 2.5938668251037598, + "learning_rate": 4.1808432562265355e-05, + "loss": 3.348, + "step": 44608 + }, + { + "epoch": 0.2653023598820059, + "grad_norm": 2.1955223083496094, + "learning_rate": 4.18080867920095e-05, + "loss": 3.9517, + "step": 44609 + }, + { + "epoch": 0.2653083071652869, + "grad_norm": 1.7729969024658203, + "learning_rate": 4.180774101588612e-05, + "loss": 4.5031, + "step": 44610 + }, + { + "epoch": 0.2653142544485679, + "grad_norm": 1.9359110593795776, + "learning_rate": 4.180739523389532e-05, + "loss": 4.0511, + "step": 44611 + }, + { + "epoch": 0.2653202017318489, + "grad_norm": 2.061647653579712, + "learning_rate": 4.180704944603725e-05, + "loss": 4.0218, + "step": 44612 + }, + { + "epoch": 0.2653261490151299, + "grad_norm": 2.204232692718506, + "learning_rate": 4.1806703652311994e-05, + "loss": 3.4198, + "step": 44613 + }, + { + "epoch": 0.26533209629841087, + "grad_norm": 1.698838233947754, + "learning_rate": 4.1806357852719705e-05, + "loss": 3.7276, + "step": 44614 + }, + { + "epoch": 0.2653380435816919, + "grad_norm": 2.0758578777313232, + "learning_rate": 4.180601204726048e-05, + "loss": 3.4666, + "step": 44615 + }, + { + "epoch": 0.2653439908649729, + "grad_norm": 1.6586347818374634, + "learning_rate": 4.180566623593446e-05, + "loss": 4.2268, + "step": 44616 + }, + { + "epoch": 0.26534993814825386, + "grad_norm": 1.9874439239501953, + "learning_rate": 4.180532041874175e-05, + "loss": 3.5684, + "step": 44617 + }, + { + "epoch": 0.2653558854315349, + "grad_norm": 2.1620664596557617, + "learning_rate": 4.180497459568247e-05, + "loss": 3.5552, + "step": 44618 + }, + { + "epoch": 0.2653618327148159, + "grad_norm": 2.0487561225891113, + "learning_rate": 4.180462876675676e-05, + "loss": 3.5929, + "step": 44619 + }, + { + "epoch": 0.26536777999809685, + "grad_norm": 2.2639942169189453, + "learning_rate": 4.180428293196471e-05, + "loss": 3.5319, + "step": 44620 + }, + { + "epoch": 0.26537372728137787, + "grad_norm": 2.201364040374756, + "learning_rate": 4.1803937091306475e-05, + "loss": 3.6511, + "step": 44621 + }, + { + "epoch": 0.2653796745646589, + "grad_norm": 2.0189554691314697, + "learning_rate": 4.180359124478215e-05, + "loss": 3.4723, + "step": 44622 + }, + { + "epoch": 0.26538562184793985, + "grad_norm": 2.1025359630584717, + "learning_rate": 4.180324539239187e-05, + "loss": 3.5921, + "step": 44623 + }, + { + "epoch": 0.26539156913122086, + "grad_norm": 2.318145990371704, + "learning_rate": 4.1802899534135754e-05, + "loss": 3.5165, + "step": 44624 + }, + { + "epoch": 0.2653975164145019, + "grad_norm": 2.384580135345459, + "learning_rate": 4.1802553670013913e-05, + "loss": 3.6749, + "step": 44625 + }, + { + "epoch": 0.26540346369778284, + "grad_norm": 2.503988742828369, + "learning_rate": 4.180220780002647e-05, + "loss": 3.6619, + "step": 44626 + }, + { + "epoch": 0.26540941098106385, + "grad_norm": 2.731323003768921, + "learning_rate": 4.1801861924173556e-05, + "loss": 3.7065, + "step": 44627 + }, + { + "epoch": 0.26541535826434487, + "grad_norm": 1.798121452331543, + "learning_rate": 4.180151604245529e-05, + "loss": 4.5768, + "step": 44628 + }, + { + "epoch": 0.26542130554762583, + "grad_norm": 1.7570469379425049, + "learning_rate": 4.180117015487178e-05, + "loss": 4.4065, + "step": 44629 + }, + { + "epoch": 0.26542725283090685, + "grad_norm": 1.5109697580337524, + "learning_rate": 4.180082426142316e-05, + "loss": 4.3961, + "step": 44630 + }, + { + "epoch": 0.26543320011418786, + "grad_norm": 1.548931360244751, + "learning_rate": 4.180047836210954e-05, + "loss": 4.7604, + "step": 44631 + }, + { + "epoch": 0.2654391473974688, + "grad_norm": 1.7589303255081177, + "learning_rate": 4.180013245693105e-05, + "loss": 4.3227, + "step": 44632 + }, + { + "epoch": 0.26544509468074984, + "grad_norm": 1.530359148979187, + "learning_rate": 4.179978654588781e-05, + "loss": 4.6462, + "step": 44633 + }, + { + "epoch": 0.26545104196403085, + "grad_norm": 1.7537821531295776, + "learning_rate": 4.1799440628979935e-05, + "loss": 4.2849, + "step": 44634 + }, + { + "epoch": 0.2654569892473118, + "grad_norm": 1.8115952014923096, + "learning_rate": 4.1799094706207555e-05, + "loss": 4.2838, + "step": 44635 + }, + { + "epoch": 0.26546293653059283, + "grad_norm": 1.7052654027938843, + "learning_rate": 4.179874877757077e-05, + "loss": 4.2787, + "step": 44636 + }, + { + "epoch": 0.26546888381387385, + "grad_norm": 1.6505331993103027, + "learning_rate": 4.179840284306973e-05, + "loss": 4.1908, + "step": 44637 + }, + { + "epoch": 0.2654748310971548, + "grad_norm": 1.8127820491790771, + "learning_rate": 4.1798056902704535e-05, + "loss": 4.4719, + "step": 44638 + }, + { + "epoch": 0.2654807783804358, + "grad_norm": 1.8945435285568237, + "learning_rate": 4.179771095647532e-05, + "loss": 4.2363, + "step": 44639 + }, + { + "epoch": 0.26548672566371684, + "grad_norm": 1.9244765043258667, + "learning_rate": 4.179736500438218e-05, + "loss": 4.7559, + "step": 44640 + }, + { + "epoch": 0.2654926729469978, + "grad_norm": 1.8450827598571777, + "learning_rate": 4.1797019046425264e-05, + "loss": 4.2332, + "step": 44641 + }, + { + "epoch": 0.2654986202302788, + "grad_norm": 1.8524264097213745, + "learning_rate": 4.1796673082604684e-05, + "loss": 4.0433, + "step": 44642 + }, + { + "epoch": 0.26550456751355983, + "grad_norm": 1.895990014076233, + "learning_rate": 4.179632711292056e-05, + "loss": 3.8282, + "step": 44643 + }, + { + "epoch": 0.2655105147968408, + "grad_norm": 1.7820470333099365, + "learning_rate": 4.179598113737301e-05, + "loss": 4.104, + "step": 44644 + }, + { + "epoch": 0.2655164620801218, + "grad_norm": 1.5829105377197266, + "learning_rate": 4.1795635155962156e-05, + "loss": 4.2408, + "step": 44645 + }, + { + "epoch": 0.2655224093634028, + "grad_norm": 1.6039918661117554, + "learning_rate": 4.1795289168688115e-05, + "loss": 4.4578, + "step": 44646 + }, + { + "epoch": 0.2655283566466838, + "grad_norm": 1.7924679517745972, + "learning_rate": 4.1794943175551013e-05, + "loss": 3.8887, + "step": 44647 + }, + { + "epoch": 0.2655343039299648, + "grad_norm": 1.6446036100387573, + "learning_rate": 4.179459717655097e-05, + "loss": 4.0495, + "step": 44648 + }, + { + "epoch": 0.2655402512132458, + "grad_norm": 1.6563348770141602, + "learning_rate": 4.179425117168812e-05, + "loss": 4.2638, + "step": 44649 + }, + { + "epoch": 0.2655461984965268, + "grad_norm": 1.537456750869751, + "learning_rate": 4.179390516096255e-05, + "loss": 4.3276, + "step": 44650 + }, + { + "epoch": 0.2655521457798078, + "grad_norm": 1.5253088474273682, + "learning_rate": 4.1793559144374415e-05, + "loss": 4.4104, + "step": 44651 + }, + { + "epoch": 0.2655580930630888, + "grad_norm": 1.5286083221435547, + "learning_rate": 4.179321312192382e-05, + "loss": 4.5538, + "step": 44652 + }, + { + "epoch": 0.26556404034636977, + "grad_norm": 1.6366928815841675, + "learning_rate": 4.1792867093610884e-05, + "loss": 4.2896, + "step": 44653 + }, + { + "epoch": 0.2655699876296508, + "grad_norm": 1.539981722831726, + "learning_rate": 4.1792521059435734e-05, + "loss": 4.4756, + "step": 44654 + }, + { + "epoch": 0.2655759349129318, + "grad_norm": 1.7110430002212524, + "learning_rate": 4.1792175019398485e-05, + "loss": 4.5273, + "step": 44655 + }, + { + "epoch": 0.26558188219621276, + "grad_norm": 1.7648017406463623, + "learning_rate": 4.179182897349926e-05, + "loss": 4.5176, + "step": 44656 + }, + { + "epoch": 0.2655878294794938, + "grad_norm": 1.6959450244903564, + "learning_rate": 4.179148292173819e-05, + "loss": 4.2689, + "step": 44657 + }, + { + "epoch": 0.2655937767627748, + "grad_norm": 1.6900670528411865, + "learning_rate": 4.1791136864115385e-05, + "loss": 3.8673, + "step": 44658 + }, + { + "epoch": 0.26559972404605575, + "grad_norm": 1.569080114364624, + "learning_rate": 4.1790790800630964e-05, + "loss": 4.3041, + "step": 44659 + }, + { + "epoch": 0.26560567132933677, + "grad_norm": 1.6221123933792114, + "learning_rate": 4.179044473128505e-05, + "loss": 3.9897, + "step": 44660 + }, + { + "epoch": 0.2656116186126178, + "grad_norm": 1.4900437593460083, + "learning_rate": 4.1790098656077765e-05, + "loss": 4.8163, + "step": 44661 + }, + { + "epoch": 0.26561756589589874, + "grad_norm": 1.6000319719314575, + "learning_rate": 4.178975257500923e-05, + "loss": 4.2026, + "step": 44662 + }, + { + "epoch": 0.26562351317917976, + "grad_norm": 1.6269030570983887, + "learning_rate": 4.178940648807957e-05, + "loss": 4.2151, + "step": 44663 + }, + { + "epoch": 0.2656294604624607, + "grad_norm": 2.6022377014160156, + "learning_rate": 4.17890603952889e-05, + "loss": 3.9252, + "step": 44664 + }, + { + "epoch": 0.26563540774574174, + "grad_norm": 1.5095230340957642, + "learning_rate": 4.178871429663734e-05, + "loss": 3.9899, + "step": 44665 + }, + { + "epoch": 0.26564135502902275, + "grad_norm": 1.6044921875, + "learning_rate": 4.1788368192125016e-05, + "loss": 3.784, + "step": 44666 + }, + { + "epoch": 0.2656473023123037, + "grad_norm": 1.555045485496521, + "learning_rate": 4.178802208175204e-05, + "loss": 3.8848, + "step": 44667 + }, + { + "epoch": 0.26565324959558473, + "grad_norm": 1.6293545961380005, + "learning_rate": 4.178767596551855e-05, + "loss": 3.7462, + "step": 44668 + }, + { + "epoch": 0.26565919687886574, + "grad_norm": 1.765380620956421, + "learning_rate": 4.178732984342465e-05, + "loss": 3.8061, + "step": 44669 + }, + { + "epoch": 0.2656651441621467, + "grad_norm": 1.8369882106781006, + "learning_rate": 4.178698371547046e-05, + "loss": 3.8434, + "step": 44670 + }, + { + "epoch": 0.2656710914454277, + "grad_norm": 1.609936237335205, + "learning_rate": 4.178663758165612e-05, + "loss": 3.7511, + "step": 44671 + }, + { + "epoch": 0.26567703872870874, + "grad_norm": 1.8826910257339478, + "learning_rate": 4.178629144198173e-05, + "loss": 3.8678, + "step": 44672 + }, + { + "epoch": 0.2656829860119897, + "grad_norm": 1.3886384963989258, + "learning_rate": 4.178594529644742e-05, + "loss": 4.1986, + "step": 44673 + }, + { + "epoch": 0.2656889332952707, + "grad_norm": 1.6065675020217896, + "learning_rate": 4.1785599145053314e-05, + "loss": 4.2947, + "step": 44674 + }, + { + "epoch": 0.26569488057855173, + "grad_norm": 1.7534046173095703, + "learning_rate": 4.1785252987799526e-05, + "loss": 4.595, + "step": 44675 + }, + { + "epoch": 0.2657008278618327, + "grad_norm": 1.6902917623519897, + "learning_rate": 4.1784906824686175e-05, + "loss": 4.4759, + "step": 44676 + }, + { + "epoch": 0.2657067751451137, + "grad_norm": 1.7619075775146484, + "learning_rate": 4.1784560655713396e-05, + "loss": 4.3369, + "step": 44677 + }, + { + "epoch": 0.2657127224283947, + "grad_norm": 1.5576555728912354, + "learning_rate": 4.1784214480881296e-05, + "loss": 4.4016, + "step": 44678 + }, + { + "epoch": 0.2657186697116757, + "grad_norm": 2.063873291015625, + "learning_rate": 4.178386830018999e-05, + "loss": 3.2773, + "step": 44679 + }, + { + "epoch": 0.2657246169949567, + "grad_norm": 2.3515148162841797, + "learning_rate": 4.1783522113639625e-05, + "loss": 3.5542, + "step": 44680 + }, + { + "epoch": 0.2657305642782377, + "grad_norm": 2.5831058025360107, + "learning_rate": 4.17831759212303e-05, + "loss": 3.5672, + "step": 44681 + }, + { + "epoch": 0.2657365115615187, + "grad_norm": 2.102221965789795, + "learning_rate": 4.178282972296214e-05, + "loss": 3.5764, + "step": 44682 + }, + { + "epoch": 0.2657424588447997, + "grad_norm": 2.393381118774414, + "learning_rate": 4.178248351883527e-05, + "loss": 3.6593, + "step": 44683 + }, + { + "epoch": 0.2657484061280807, + "grad_norm": 1.7163208723068237, + "learning_rate": 4.17821373088498e-05, + "loss": 4.0782, + "step": 44684 + }, + { + "epoch": 0.26575435341136167, + "grad_norm": 2.0232999324798584, + "learning_rate": 4.1781791093005873e-05, + "loss": 4.2632, + "step": 44685 + }, + { + "epoch": 0.2657603006946427, + "grad_norm": 1.8365758657455444, + "learning_rate": 4.1781444871303585e-05, + "loss": 3.8812, + "step": 44686 + }, + { + "epoch": 0.2657662479779237, + "grad_norm": 1.554626226425171, + "learning_rate": 4.1781098643743075e-05, + "loss": 4.1383, + "step": 44687 + }, + { + "epoch": 0.26577219526120466, + "grad_norm": 1.612596869468689, + "learning_rate": 4.178075241032445e-05, + "loss": 4.1092, + "step": 44688 + }, + { + "epoch": 0.2657781425444857, + "grad_norm": 1.6738524436950684, + "learning_rate": 4.178040617104785e-05, + "loss": 3.9696, + "step": 44689 + }, + { + "epoch": 0.2657840898277667, + "grad_norm": 1.5379225015640259, + "learning_rate": 4.1780059925913376e-05, + "loss": 4.14, + "step": 44690 + }, + { + "epoch": 0.26579003711104765, + "grad_norm": 1.3822754621505737, + "learning_rate": 4.177971367492114e-05, + "loss": 3.9013, + "step": 44691 + }, + { + "epoch": 0.26579598439432867, + "grad_norm": 1.4104714393615723, + "learning_rate": 4.1779367418071304e-05, + "loss": 4.1956, + "step": 44692 + }, + { + "epoch": 0.2658019316776097, + "grad_norm": 1.3262616395950317, + "learning_rate": 4.1779021155363954e-05, + "loss": 4.2489, + "step": 44693 + }, + { + "epoch": 0.26580787896089064, + "grad_norm": 1.262908697128296, + "learning_rate": 4.1778674886799215e-05, + "loss": 4.272, + "step": 44694 + }, + { + "epoch": 0.26581382624417166, + "grad_norm": 1.3683199882507324, + "learning_rate": 4.1778328612377224e-05, + "loss": 4.0416, + "step": 44695 + }, + { + "epoch": 0.2658197735274527, + "grad_norm": 1.3908733129501343, + "learning_rate": 4.1777982332098094e-05, + "loss": 3.9849, + "step": 44696 + }, + { + "epoch": 0.26582572081073363, + "grad_norm": 1.4028868675231934, + "learning_rate": 4.1777636045961935e-05, + "loss": 3.8356, + "step": 44697 + }, + { + "epoch": 0.26583166809401465, + "grad_norm": 1.4257240295410156, + "learning_rate": 4.177728975396888e-05, + "loss": 3.8719, + "step": 44698 + }, + { + "epoch": 0.26583761537729567, + "grad_norm": 1.587026596069336, + "learning_rate": 4.177694345611905e-05, + "loss": 3.8405, + "step": 44699 + }, + { + "epoch": 0.2658435626605766, + "grad_norm": 1.6489218473434448, + "learning_rate": 4.1776597152412555e-05, + "loss": 3.9029, + "step": 44700 + }, + { + "epoch": 0.26584950994385764, + "grad_norm": 1.6973336935043335, + "learning_rate": 4.177625084284953e-05, + "loss": 3.6944, + "step": 44701 + }, + { + "epoch": 0.26585545722713866, + "grad_norm": 1.9634872674942017, + "learning_rate": 4.177590452743009e-05, + "loss": 4.8242, + "step": 44702 + }, + { + "epoch": 0.2658614045104196, + "grad_norm": 1.7017827033996582, + "learning_rate": 4.177555820615435e-05, + "loss": 3.494, + "step": 44703 + }, + { + "epoch": 0.26586735179370063, + "grad_norm": 1.6737470626831055, + "learning_rate": 4.177521187902244e-05, + "loss": 3.4545, + "step": 44704 + }, + { + "epoch": 0.26587329907698165, + "grad_norm": 1.789060115814209, + "learning_rate": 4.1774865546034466e-05, + "loss": 3.7419, + "step": 44705 + }, + { + "epoch": 0.2658792463602626, + "grad_norm": 1.7907310724258423, + "learning_rate": 4.177451920719057e-05, + "loss": 3.5279, + "step": 44706 + }, + { + "epoch": 0.2658851936435436, + "grad_norm": 1.7443631887435913, + "learning_rate": 4.1774172862490866e-05, + "loss": 3.7222, + "step": 44707 + }, + { + "epoch": 0.26589114092682464, + "grad_norm": 1.639036774635315, + "learning_rate": 4.1773826511935466e-05, + "loss": 4.5663, + "step": 44708 + }, + { + "epoch": 0.2658970882101056, + "grad_norm": 1.816623568534851, + "learning_rate": 4.177348015552449e-05, + "loss": 3.7925, + "step": 44709 + }, + { + "epoch": 0.2659030354933866, + "grad_norm": 1.762968897819519, + "learning_rate": 4.1773133793258077e-05, + "loss": 3.54, + "step": 44710 + }, + { + "epoch": 0.26590898277666764, + "grad_norm": 1.755726933479309, + "learning_rate": 4.177278742513633e-05, + "loss": 3.6448, + "step": 44711 + }, + { + "epoch": 0.2659149300599486, + "grad_norm": 1.8794560432434082, + "learning_rate": 4.177244105115937e-05, + "loss": 3.6665, + "step": 44712 + }, + { + "epoch": 0.2659208773432296, + "grad_norm": 1.8440061807632446, + "learning_rate": 4.1772094671327336e-05, + "loss": 3.4609, + "step": 44713 + }, + { + "epoch": 0.2659268246265106, + "grad_norm": 1.605864405632019, + "learning_rate": 4.177174828564033e-05, + "loss": 3.4746, + "step": 44714 + }, + { + "epoch": 0.2659327719097916, + "grad_norm": 1.7897979021072388, + "learning_rate": 4.177140189409848e-05, + "loss": 3.5198, + "step": 44715 + }, + { + "epoch": 0.2659387191930726, + "grad_norm": 1.904184341430664, + "learning_rate": 4.177105549670192e-05, + "loss": 3.3788, + "step": 44716 + }, + { + "epoch": 0.2659446664763536, + "grad_norm": 1.8025257587432861, + "learning_rate": 4.177070909345074e-05, + "loss": 3.5587, + "step": 44717 + }, + { + "epoch": 0.2659506137596346, + "grad_norm": 1.5975382328033447, + "learning_rate": 4.177036268434509e-05, + "loss": 3.7007, + "step": 44718 + }, + { + "epoch": 0.2659565610429156, + "grad_norm": 1.5539461374282837, + "learning_rate": 4.177001626938508e-05, + "loss": 3.5582, + "step": 44719 + }, + { + "epoch": 0.2659625083261966, + "grad_norm": 1.744336485862732, + "learning_rate": 4.176966984857082e-05, + "loss": 3.5188, + "step": 44720 + }, + { + "epoch": 0.26596845560947757, + "grad_norm": 1.8452039957046509, + "learning_rate": 4.176932342190245e-05, + "loss": 3.3829, + "step": 44721 + }, + { + "epoch": 0.2659744028927586, + "grad_norm": 1.7184338569641113, + "learning_rate": 4.176897698938008e-05, + "loss": 3.4251, + "step": 44722 + }, + { + "epoch": 0.2659803501760396, + "grad_norm": 1.7290966510772705, + "learning_rate": 4.176863055100384e-05, + "loss": 3.4495, + "step": 44723 + }, + { + "epoch": 0.26598629745932056, + "grad_norm": 1.8628392219543457, + "learning_rate": 4.176828410677383e-05, + "loss": 3.4897, + "step": 44724 + }, + { + "epoch": 0.2659922447426016, + "grad_norm": 1.7976295948028564, + "learning_rate": 4.1767937656690195e-05, + "loss": 3.3975, + "step": 44725 + }, + { + "epoch": 0.2659981920258826, + "grad_norm": 1.8656774759292603, + "learning_rate": 4.176759120075304e-05, + "loss": 3.5692, + "step": 44726 + }, + { + "epoch": 0.26600413930916356, + "grad_norm": 1.7409969568252563, + "learning_rate": 4.1767244738962496e-05, + "loss": 3.5053, + "step": 44727 + }, + { + "epoch": 0.2660100865924446, + "grad_norm": 1.8479092121124268, + "learning_rate": 4.176689827131869e-05, + "loss": 3.5209, + "step": 44728 + }, + { + "epoch": 0.2660160338757256, + "grad_norm": 1.9560660123825073, + "learning_rate": 4.1766551797821716e-05, + "loss": 3.4681, + "step": 44729 + }, + { + "epoch": 0.26602198115900655, + "grad_norm": 1.758405089378357, + "learning_rate": 4.176620531847172e-05, + "loss": 3.3802, + "step": 44730 + }, + { + "epoch": 0.26602792844228756, + "grad_norm": 1.8140959739685059, + "learning_rate": 4.1765858833268815e-05, + "loss": 3.4595, + "step": 44731 + }, + { + "epoch": 0.2660338757255686, + "grad_norm": 1.9919538497924805, + "learning_rate": 4.176551234221312e-05, + "loss": 3.5323, + "step": 44732 + }, + { + "epoch": 0.26603982300884954, + "grad_norm": 1.7757211923599243, + "learning_rate": 4.176516584530476e-05, + "loss": 3.4033, + "step": 44733 + }, + { + "epoch": 0.26604577029213056, + "grad_norm": 1.7876242399215698, + "learning_rate": 4.176481934254386e-05, + "loss": 3.5527, + "step": 44734 + }, + { + "epoch": 0.2660517175754116, + "grad_norm": 1.774839162826538, + "learning_rate": 4.1764472833930526e-05, + "loss": 3.3905, + "step": 44735 + }, + { + "epoch": 0.26605766485869253, + "grad_norm": 1.6881837844848633, + "learning_rate": 4.1764126319464886e-05, + "loss": 3.473, + "step": 44736 + }, + { + "epoch": 0.26606361214197355, + "grad_norm": 1.82596755027771, + "learning_rate": 4.176377979914706e-05, + "loss": 3.3221, + "step": 44737 + }, + { + "epoch": 0.26606955942525456, + "grad_norm": 1.6632107496261597, + "learning_rate": 4.176343327297718e-05, + "loss": 3.4838, + "step": 44738 + }, + { + "epoch": 0.2660755067085355, + "grad_norm": 1.6555064916610718, + "learning_rate": 4.176308674095536e-05, + "loss": 3.4159, + "step": 44739 + }, + { + "epoch": 0.26608145399181654, + "grad_norm": 1.9076722860336304, + "learning_rate": 4.176274020308172e-05, + "loss": 3.3118, + "step": 44740 + }, + { + "epoch": 0.26608740127509756, + "grad_norm": 1.6504920721054077, + "learning_rate": 4.176239365935637e-05, + "loss": 3.5324, + "step": 44741 + }, + { + "epoch": 0.2660933485583785, + "grad_norm": 1.741859793663025, + "learning_rate": 4.1762047109779454e-05, + "loss": 3.3802, + "step": 44742 + }, + { + "epoch": 0.26609929584165953, + "grad_norm": 1.7939344644546509, + "learning_rate": 4.176170055435108e-05, + "loss": 3.2951, + "step": 44743 + }, + { + "epoch": 0.26610524312494055, + "grad_norm": 1.750080943107605, + "learning_rate": 4.176135399307136e-05, + "loss": 3.3427, + "step": 44744 + }, + { + "epoch": 0.2661111904082215, + "grad_norm": 1.7999868392944336, + "learning_rate": 4.1761007425940426e-05, + "loss": 3.1913, + "step": 44745 + }, + { + "epoch": 0.2661171376915025, + "grad_norm": 1.658857822418213, + "learning_rate": 4.17606608529584e-05, + "loss": 3.2835, + "step": 44746 + }, + { + "epoch": 0.26612308497478354, + "grad_norm": 1.7446309328079224, + "learning_rate": 4.17603142741254e-05, + "loss": 3.3934, + "step": 44747 + }, + { + "epoch": 0.2661290322580645, + "grad_norm": 1.8266913890838623, + "learning_rate": 4.175996768944155e-05, + "loss": 4.7316, + "step": 44748 + }, + { + "epoch": 0.2661349795413455, + "grad_norm": 1.5139610767364502, + "learning_rate": 4.175962109890696e-05, + "loss": 5.0379, + "step": 44749 + }, + { + "epoch": 0.26614092682462653, + "grad_norm": 1.3831099271774292, + "learning_rate": 4.175927450252177e-05, + "loss": 5.0421, + "step": 44750 + }, + { + "epoch": 0.2661468741079075, + "grad_norm": 1.136552333831787, + "learning_rate": 4.175892790028609e-05, + "loss": 5.0958, + "step": 44751 + }, + { + "epoch": 0.2661528213911885, + "grad_norm": 1.2888416051864624, + "learning_rate": 4.175858129220004e-05, + "loss": 4.942, + "step": 44752 + }, + { + "epoch": 0.2661587686744695, + "grad_norm": 1.3422982692718506, + "learning_rate": 4.1758234678263736e-05, + "loss": 5.0723, + "step": 44753 + }, + { + "epoch": 0.2661647159577505, + "grad_norm": 1.3976556062698364, + "learning_rate": 4.1757888058477314e-05, + "loss": 4.8802, + "step": 44754 + }, + { + "epoch": 0.2661706632410315, + "grad_norm": 1.6158572435379028, + "learning_rate": 4.175754143284088e-05, + "loss": 4.8769, + "step": 44755 + }, + { + "epoch": 0.2661766105243125, + "grad_norm": 1.32417631149292, + "learning_rate": 4.175719480135456e-05, + "loss": 5.1602, + "step": 44756 + }, + { + "epoch": 0.2661825578075935, + "grad_norm": 2.4854114055633545, + "learning_rate": 4.175684816401848e-05, + "loss": 4.497, + "step": 44757 + }, + { + "epoch": 0.2661885050908745, + "grad_norm": 2.0246262550354004, + "learning_rate": 4.1756501520832755e-05, + "loss": 4.4878, + "step": 44758 + }, + { + "epoch": 0.2661944523741555, + "grad_norm": 1.612572431564331, + "learning_rate": 4.175615487179751e-05, + "loss": 5.3039, + "step": 44759 + }, + { + "epoch": 0.26620039965743647, + "grad_norm": 1.5639597177505493, + "learning_rate": 4.175580821691287e-05, + "loss": 4.9807, + "step": 44760 + }, + { + "epoch": 0.2662063469407175, + "grad_norm": 2.797496795654297, + "learning_rate": 4.175546155617894e-05, + "loss": 4.1947, + "step": 44761 + }, + { + "epoch": 0.2662122942239985, + "grad_norm": 2.5868144035339355, + "learning_rate": 4.175511488959586e-05, + "loss": 4.2248, + "step": 44762 + }, + { + "epoch": 0.26621824150727946, + "grad_norm": 2.407508134841919, + "learning_rate": 4.1754768217163745e-05, + "loss": 3.8637, + "step": 44763 + }, + { + "epoch": 0.2662241887905605, + "grad_norm": 1.7029684782028198, + "learning_rate": 4.1754421538882705e-05, + "loss": 4.5826, + "step": 44764 + }, + { + "epoch": 0.2662301360738415, + "grad_norm": 2.4208004474639893, + "learning_rate": 4.175407485475287e-05, + "loss": 3.7107, + "step": 44765 + }, + { + "epoch": 0.26623608335712245, + "grad_norm": 2.5416510105133057, + "learning_rate": 4.1753728164774366e-05, + "loss": 3.4808, + "step": 44766 + }, + { + "epoch": 0.26624203064040347, + "grad_norm": 2.593438148498535, + "learning_rate": 4.17533814689473e-05, + "loss": 4.412, + "step": 44767 + }, + { + "epoch": 0.2662479779236845, + "grad_norm": 2.520878314971924, + "learning_rate": 4.1753034767271817e-05, + "loss": 3.9044, + "step": 44768 + }, + { + "epoch": 0.26625392520696545, + "grad_norm": 2.661273241043091, + "learning_rate": 4.175268805974801e-05, + "loss": 4.0524, + "step": 44769 + }, + { + "epoch": 0.26625987249024646, + "grad_norm": 1.5261204242706299, + "learning_rate": 4.175234134637602e-05, + "loss": 5.1174, + "step": 44770 + }, + { + "epoch": 0.2662658197735275, + "grad_norm": 1.470487117767334, + "learning_rate": 4.1751994627155953e-05, + "loss": 5.0833, + "step": 44771 + }, + { + "epoch": 0.26627176705680844, + "grad_norm": 2.0932576656341553, + "learning_rate": 4.1751647902087945e-05, + "loss": 3.8601, + "step": 44772 + }, + { + "epoch": 0.26627771434008946, + "grad_norm": 1.719422698020935, + "learning_rate": 4.17513011711721e-05, + "loss": 5.0868, + "step": 44773 + }, + { + "epoch": 0.26628366162337047, + "grad_norm": 1.4879006147384644, + "learning_rate": 4.175095443440856e-05, + "loss": 5.5392, + "step": 44774 + }, + { + "epoch": 0.26628960890665143, + "grad_norm": 1.418983817100525, + "learning_rate": 4.175060769179743e-05, + "loss": 5.3652, + "step": 44775 + }, + { + "epoch": 0.26629555618993245, + "grad_norm": 2.7831554412841797, + "learning_rate": 4.1750260943338835e-05, + "loss": 4.519, + "step": 44776 + }, + { + "epoch": 0.26630150347321346, + "grad_norm": 2.4824514389038086, + "learning_rate": 4.1749914189032895e-05, + "loss": 4.2598, + "step": 44777 + }, + { + "epoch": 0.2663074507564944, + "grad_norm": 1.633848786354065, + "learning_rate": 4.174956742887973e-05, + "loss": 4.6508, + "step": 44778 + }, + { + "epoch": 0.26631339803977544, + "grad_norm": 1.5738093852996826, + "learning_rate": 4.174922066287947e-05, + "loss": 4.6815, + "step": 44779 + }, + { + "epoch": 0.26631934532305646, + "grad_norm": 1.5213748216629028, + "learning_rate": 4.174887389103224e-05, + "loss": 5.0009, + "step": 44780 + }, + { + "epoch": 0.2663252926063374, + "grad_norm": 1.7112046480178833, + "learning_rate": 4.1748527113338134e-05, + "loss": 4.6291, + "step": 44781 + }, + { + "epoch": 0.26633123988961843, + "grad_norm": 1.7962313890457153, + "learning_rate": 4.17481803297973e-05, + "loss": 4.519, + "step": 44782 + }, + { + "epoch": 0.2663371871728994, + "grad_norm": 1.4231399297714233, + "learning_rate": 4.174783354040984e-05, + "loss": 4.5978, + "step": 44783 + }, + { + "epoch": 0.2663431344561804, + "grad_norm": 1.566726565361023, + "learning_rate": 4.17474867451759e-05, + "loss": 4.8854, + "step": 44784 + }, + { + "epoch": 0.2663490817394614, + "grad_norm": 1.5004982948303223, + "learning_rate": 4.174713994409557e-05, + "loss": 5.0593, + "step": 44785 + }, + { + "epoch": 0.2663550290227424, + "grad_norm": 1.5360815525054932, + "learning_rate": 4.1746793137168996e-05, + "loss": 5.4753, + "step": 44786 + }, + { + "epoch": 0.2663609763060234, + "grad_norm": 1.492956280708313, + "learning_rate": 4.174644632439629e-05, + "loss": 5.117, + "step": 44787 + }, + { + "epoch": 0.2663669235893044, + "grad_norm": 1.777766227722168, + "learning_rate": 4.174609950577757e-05, + "loss": 4.98, + "step": 44788 + }, + { + "epoch": 0.2663728708725854, + "grad_norm": 1.4328038692474365, + "learning_rate": 4.1745752681312955e-05, + "loss": 5.0755, + "step": 44789 + }, + { + "epoch": 0.2663788181558664, + "grad_norm": 1.690836787223816, + "learning_rate": 4.174540585100257e-05, + "loss": 5.0668, + "step": 44790 + }, + { + "epoch": 0.2663847654391474, + "grad_norm": 1.302195429801941, + "learning_rate": 4.1745059014846545e-05, + "loss": 5.042, + "step": 44791 + }, + { + "epoch": 0.26639071272242837, + "grad_norm": 1.5513596534729004, + "learning_rate": 4.174471217284499e-05, + "loss": 5.0142, + "step": 44792 + }, + { + "epoch": 0.2663966600057094, + "grad_norm": 1.4038656949996948, + "learning_rate": 4.174436532499803e-05, + "loss": 4.6583, + "step": 44793 + }, + { + "epoch": 0.2664026072889904, + "grad_norm": 1.575278401374817, + "learning_rate": 4.174401847130578e-05, + "loss": 4.9807, + "step": 44794 + }, + { + "epoch": 0.26640855457227136, + "grad_norm": 1.6982972621917725, + "learning_rate": 4.1743671611768366e-05, + "loss": 4.5665, + "step": 44795 + }, + { + "epoch": 0.2664145018555524, + "grad_norm": 1.5767321586608887, + "learning_rate": 4.1743324746385914e-05, + "loss": 4.1863, + "step": 44796 + }, + { + "epoch": 0.2664204491388334, + "grad_norm": 1.4724262952804565, + "learning_rate": 4.174297787515854e-05, + "loss": 4.5283, + "step": 44797 + }, + { + "epoch": 0.26642639642211435, + "grad_norm": 1.7934913635253906, + "learning_rate": 4.1742630998086363e-05, + "loss": 4.9567, + "step": 44798 + }, + { + "epoch": 0.26643234370539537, + "grad_norm": 2.0480926036834717, + "learning_rate": 4.17422841151695e-05, + "loss": 4.9042, + "step": 44799 + }, + { + "epoch": 0.2664382909886764, + "grad_norm": 1.5076093673706055, + "learning_rate": 4.174193722640809e-05, + "loss": 4.2392, + "step": 44800 + }, + { + "epoch": 0.26644423827195735, + "grad_norm": 1.4948232173919678, + "learning_rate": 4.174159033180224e-05, + "loss": 4.3256, + "step": 44801 + }, + { + "epoch": 0.26645018555523836, + "grad_norm": 1.4526655673980713, + "learning_rate": 4.1741243431352074e-05, + "loss": 4.2431, + "step": 44802 + }, + { + "epoch": 0.2664561328385194, + "grad_norm": 1.496447205543518, + "learning_rate": 4.174089652505771e-05, + "loss": 4.0642, + "step": 44803 + }, + { + "epoch": 0.26646208012180034, + "grad_norm": 1.4257051944732666, + "learning_rate": 4.174054961291928e-05, + "loss": 4.0764, + "step": 44804 + }, + { + "epoch": 0.26646802740508135, + "grad_norm": 1.3080171346664429, + "learning_rate": 4.174020269493689e-05, + "loss": 4.8977, + "step": 44805 + }, + { + "epoch": 0.26647397468836237, + "grad_norm": 2.905646800994873, + "learning_rate": 4.173985577111067e-05, + "loss": 3.399, + "step": 44806 + }, + { + "epoch": 0.26647992197164333, + "grad_norm": 2.022432565689087, + "learning_rate": 4.173950884144073e-05, + "loss": 4.43, + "step": 44807 + }, + { + "epoch": 0.26648586925492435, + "grad_norm": 1.7160505056381226, + "learning_rate": 4.173916190592722e-05, + "loss": 4.7435, + "step": 44808 + }, + { + "epoch": 0.26649181653820536, + "grad_norm": 1.631000280380249, + "learning_rate": 4.173881496457022e-05, + "loss": 4.537, + "step": 44809 + }, + { + "epoch": 0.2664977638214863, + "grad_norm": 1.6864910125732422, + "learning_rate": 4.173846801736988e-05, + "loss": 4.5979, + "step": 44810 + }, + { + "epoch": 0.26650371110476734, + "grad_norm": 1.9149236679077148, + "learning_rate": 4.173812106432632e-05, + "loss": 4.4995, + "step": 44811 + }, + { + "epoch": 0.26650965838804835, + "grad_norm": 2.0941431522369385, + "learning_rate": 4.173777410543965e-05, + "loss": 4.5468, + "step": 44812 + }, + { + "epoch": 0.2665156056713293, + "grad_norm": 2.146393299102783, + "learning_rate": 4.173742714070999e-05, + "loss": 4.4057, + "step": 44813 + }, + { + "epoch": 0.26652155295461033, + "grad_norm": 1.908378005027771, + "learning_rate": 4.1737080170137487e-05, + "loss": 4.4444, + "step": 44814 + }, + { + "epoch": 0.26652750023789135, + "grad_norm": 2.0410706996917725, + "learning_rate": 4.173673319372222e-05, + "loss": 4.3312, + "step": 44815 + }, + { + "epoch": 0.2665334475211723, + "grad_norm": 2.034675359725952, + "learning_rate": 4.173638621146435e-05, + "loss": 4.3943, + "step": 44816 + }, + { + "epoch": 0.2665393948044533, + "grad_norm": 1.8281843662261963, + "learning_rate": 4.173603922336397e-05, + "loss": 4.5516, + "step": 44817 + }, + { + "epoch": 0.26654534208773434, + "grad_norm": 1.6864619255065918, + "learning_rate": 4.173569222942121e-05, + "loss": 5.5557, + "step": 44818 + }, + { + "epoch": 0.2665512893710153, + "grad_norm": 1.600462555885315, + "learning_rate": 4.1735345229636205e-05, + "loss": 5.1712, + "step": 44819 + }, + { + "epoch": 0.2665572366542963, + "grad_norm": 1.5538115501403809, + "learning_rate": 4.1734998224009044e-05, + "loss": 5.1026, + "step": 44820 + }, + { + "epoch": 0.26656318393757733, + "grad_norm": 1.3862335681915283, + "learning_rate": 4.173465121253989e-05, + "loss": 4.7142, + "step": 44821 + }, + { + "epoch": 0.2665691312208583, + "grad_norm": 1.5282399654388428, + "learning_rate": 4.1734304195228825e-05, + "loss": 4.7787, + "step": 44822 + }, + { + "epoch": 0.2665750785041393, + "grad_norm": 1.6287338733673096, + "learning_rate": 4.1733957172075996e-05, + "loss": 4.884, + "step": 44823 + }, + { + "epoch": 0.2665810257874203, + "grad_norm": 1.6088956594467163, + "learning_rate": 4.1733610143081514e-05, + "loss": 4.7789, + "step": 44824 + }, + { + "epoch": 0.2665869730707013, + "grad_norm": 1.6539313793182373, + "learning_rate": 4.17332631082455e-05, + "loss": 4.3541, + "step": 44825 + }, + { + "epoch": 0.2665929203539823, + "grad_norm": 1.8307095766067505, + "learning_rate": 4.173291606756808e-05, + "loss": 4.5578, + "step": 44826 + }, + { + "epoch": 0.2665988676372633, + "grad_norm": 1.7657279968261719, + "learning_rate": 4.173256902104937e-05, + "loss": 4.253, + "step": 44827 + }, + { + "epoch": 0.2666048149205443, + "grad_norm": 1.452724814414978, + "learning_rate": 4.173222196868949e-05, + "loss": 5.1436, + "step": 44828 + }, + { + "epoch": 0.2666107622038253, + "grad_norm": 1.3948582410812378, + "learning_rate": 4.1731874910488564e-05, + "loss": 5.0796, + "step": 44829 + }, + { + "epoch": 0.2666167094871063, + "grad_norm": 1.6431764364242554, + "learning_rate": 4.173152784644672e-05, + "loss": 5.1312, + "step": 44830 + }, + { + "epoch": 0.26662265677038727, + "grad_norm": 1.7326582670211792, + "learning_rate": 4.173118077656407e-05, + "loss": 4.7078, + "step": 44831 + }, + { + "epoch": 0.2666286040536683, + "grad_norm": 1.4052305221557617, + "learning_rate": 4.173083370084073e-05, + "loss": 4.6167, + "step": 44832 + }, + { + "epoch": 0.2666345513369493, + "grad_norm": 1.524970531463623, + "learning_rate": 4.1730486619276845e-05, + "loss": 4.4399, + "step": 44833 + }, + { + "epoch": 0.26664049862023026, + "grad_norm": 1.532120704650879, + "learning_rate": 4.17301395318725e-05, + "loss": 5.019, + "step": 44834 + }, + { + "epoch": 0.2666464459035113, + "grad_norm": 1.990350604057312, + "learning_rate": 4.172979243862786e-05, + "loss": 4.3052, + "step": 44835 + }, + { + "epoch": 0.2666523931867923, + "grad_norm": 1.612270712852478, + "learning_rate": 4.1729445339543e-05, + "loss": 4.8996, + "step": 44836 + }, + { + "epoch": 0.26665834047007325, + "grad_norm": 1.4720107316970825, + "learning_rate": 4.172909823461807e-05, + "loss": 4.8294, + "step": 44837 + }, + { + "epoch": 0.26666428775335427, + "grad_norm": 1.6740754842758179, + "learning_rate": 4.172875112385319e-05, + "loss": 4.559, + "step": 44838 + }, + { + "epoch": 0.2666702350366353, + "grad_norm": 1.57502019405365, + "learning_rate": 4.172840400724847e-05, + "loss": 4.3107, + "step": 44839 + }, + { + "epoch": 0.26667618231991624, + "grad_norm": 1.5605181455612183, + "learning_rate": 4.172805688480404e-05, + "loss": 4.6809, + "step": 44840 + }, + { + "epoch": 0.26668212960319726, + "grad_norm": 1.3988057374954224, + "learning_rate": 4.172770975652002e-05, + "loss": 4.6622, + "step": 44841 + }, + { + "epoch": 0.2666880768864783, + "grad_norm": 1.3739773035049438, + "learning_rate": 4.172736262239653e-05, + "loss": 4.6195, + "step": 44842 + }, + { + "epoch": 0.26669402416975924, + "grad_norm": 1.265779972076416, + "learning_rate": 4.172701548243368e-05, + "loss": 4.8072, + "step": 44843 + }, + { + "epoch": 0.26669997145304025, + "grad_norm": 2.226224422454834, + "learning_rate": 4.1726668336631616e-05, + "loss": 4.4928, + "step": 44844 + }, + { + "epoch": 0.26670591873632127, + "grad_norm": 1.7928279638290405, + "learning_rate": 4.1726321184990435e-05, + "loss": 5.1046, + "step": 44845 + }, + { + "epoch": 0.26671186601960223, + "grad_norm": 2.310781717300415, + "learning_rate": 4.172597402751027e-05, + "loss": 4.4983, + "step": 44846 + }, + { + "epoch": 0.26671781330288324, + "grad_norm": 1.9209214448928833, + "learning_rate": 4.172562686419125e-05, + "loss": 4.9626, + "step": 44847 + }, + { + "epoch": 0.26672376058616426, + "grad_norm": 1.6110069751739502, + "learning_rate": 4.172527969503347e-05, + "loss": 5.2246, + "step": 44848 + }, + { + "epoch": 0.2667297078694452, + "grad_norm": 1.2800827026367188, + "learning_rate": 4.1724932520037076e-05, + "loss": 5.1541, + "step": 44849 + }, + { + "epoch": 0.26673565515272624, + "grad_norm": 1.2889529466629028, + "learning_rate": 4.172458533920218e-05, + "loss": 5.2821, + "step": 44850 + }, + { + "epoch": 0.26674160243600725, + "grad_norm": 1.6496832370758057, + "learning_rate": 4.17242381525289e-05, + "loss": 4.909, + "step": 44851 + }, + { + "epoch": 0.2667475497192882, + "grad_norm": 1.7576298713684082, + "learning_rate": 4.172389096001737e-05, + "loss": 4.5706, + "step": 44852 + }, + { + "epoch": 0.26675349700256923, + "grad_norm": 1.6735689640045166, + "learning_rate": 4.17235437616677e-05, + "loss": 4.8364, + "step": 44853 + }, + { + "epoch": 0.26675944428585024, + "grad_norm": 2.0282371044158936, + "learning_rate": 4.172319655748001e-05, + "loss": 3.3696, + "step": 44854 + }, + { + "epoch": 0.2667653915691312, + "grad_norm": 2.1266045570373535, + "learning_rate": 4.1722849347454427e-05, + "loss": 3.1915, + "step": 44855 + }, + { + "epoch": 0.2667713388524122, + "grad_norm": 1.6461578607559204, + "learning_rate": 4.1722502131591066e-05, + "loss": 3.7508, + "step": 44856 + }, + { + "epoch": 0.26677728613569324, + "grad_norm": 1.5543279647827148, + "learning_rate": 4.172215490989006e-05, + "loss": 4.3607, + "step": 44857 + }, + { + "epoch": 0.2667832334189742, + "grad_norm": 1.9258612394332886, + "learning_rate": 4.172180768235152e-05, + "loss": 3.4565, + "step": 44858 + }, + { + "epoch": 0.2667891807022552, + "grad_norm": 2.1396331787109375, + "learning_rate": 4.172146044897557e-05, + "loss": 3.1492, + "step": 44859 + }, + { + "epoch": 0.26679512798553623, + "grad_norm": 2.11887788772583, + "learning_rate": 4.172111320976233e-05, + "loss": 3.0833, + "step": 44860 + }, + { + "epoch": 0.2668010752688172, + "grad_norm": 2.398845672607422, + "learning_rate": 4.172076596471192e-05, + "loss": 3.2274, + "step": 44861 + }, + { + "epoch": 0.2668070225520982, + "grad_norm": 2.193784236907959, + "learning_rate": 4.1720418713824464e-05, + "loss": 3.7018, + "step": 44862 + }, + { + "epoch": 0.2668129698353792, + "grad_norm": 1.626880407333374, + "learning_rate": 4.172007145710008e-05, + "loss": 4.1425, + "step": 44863 + }, + { + "epoch": 0.2668189171186602, + "grad_norm": 1.7322933673858643, + "learning_rate": 4.1719724194538896e-05, + "loss": 4.3085, + "step": 44864 + }, + { + "epoch": 0.2668248644019412, + "grad_norm": 1.611504077911377, + "learning_rate": 4.171937692614103e-05, + "loss": 4.3017, + "step": 44865 + }, + { + "epoch": 0.2668308116852222, + "grad_norm": 1.7528389692306519, + "learning_rate": 4.171902965190659e-05, + "loss": 3.5744, + "step": 44866 + }, + { + "epoch": 0.2668367589685032, + "grad_norm": 1.679460883140564, + "learning_rate": 4.171868237183573e-05, + "loss": 4.344, + "step": 44867 + }, + { + "epoch": 0.2668427062517842, + "grad_norm": 2.230706214904785, + "learning_rate": 4.171833508592854e-05, + "loss": 3.4002, + "step": 44868 + }, + { + "epoch": 0.2668486535350652, + "grad_norm": 2.462716579437256, + "learning_rate": 4.1717987794185146e-05, + "loss": 3.2262, + "step": 44869 + }, + { + "epoch": 0.26685460081834617, + "grad_norm": 1.6178597211837769, + "learning_rate": 4.1717640496605684e-05, + "loss": 4.6636, + "step": 44870 + }, + { + "epoch": 0.2668605481016272, + "grad_norm": 2.010730028152466, + "learning_rate": 4.171729319319027e-05, + "loss": 4.5138, + "step": 44871 + }, + { + "epoch": 0.2668664953849082, + "grad_norm": 2.7950356006622314, + "learning_rate": 4.171694588393901e-05, + "loss": 3.0432, + "step": 44872 + }, + { + "epoch": 0.26687244266818916, + "grad_norm": 3.291163921356201, + "learning_rate": 4.171659856885204e-05, + "loss": 3.12, + "step": 44873 + }, + { + "epoch": 0.2668783899514702, + "grad_norm": 2.233228921890259, + "learning_rate": 4.171625124792948e-05, + "loss": 2.9445, + "step": 44874 + }, + { + "epoch": 0.2668843372347512, + "grad_norm": 2.132716655731201, + "learning_rate": 4.171590392117145e-05, + "loss": 4.0259, + "step": 44875 + }, + { + "epoch": 0.26689028451803215, + "grad_norm": 3.1809020042419434, + "learning_rate": 4.171555658857807e-05, + "loss": 2.6854, + "step": 44876 + }, + { + "epoch": 0.26689623180131317, + "grad_norm": 3.4349706172943115, + "learning_rate": 4.1715209250149464e-05, + "loss": 2.5494, + "step": 44877 + }, + { + "epoch": 0.2669021790845942, + "grad_norm": 2.7041451930999756, + "learning_rate": 4.171486190588575e-05, + "loss": 2.6787, + "step": 44878 + }, + { + "epoch": 0.26690812636787514, + "grad_norm": 2.7119736671447754, + "learning_rate": 4.171451455578705e-05, + "loss": 2.4448, + "step": 44879 + }, + { + "epoch": 0.26691407365115616, + "grad_norm": 2.9431769847869873, + "learning_rate": 4.1714167199853484e-05, + "loss": 2.284, + "step": 44880 + }, + { + "epoch": 0.2669200209344372, + "grad_norm": 2.422168016433716, + "learning_rate": 4.1713819838085175e-05, + "loss": 3.2949, + "step": 44881 + }, + { + "epoch": 0.26692596821771813, + "grad_norm": 2.330723285675049, + "learning_rate": 4.171347247048225e-05, + "loss": 5.0551, + "step": 44882 + }, + { + "epoch": 0.26693191550099915, + "grad_norm": 2.3261985778808594, + "learning_rate": 4.171312509704482e-05, + "loss": 4.6906, + "step": 44883 + }, + { + "epoch": 0.26693786278428017, + "grad_norm": 3.1015167236328125, + "learning_rate": 4.171277771777301e-05, + "loss": 5.5515, + "step": 44884 + }, + { + "epoch": 0.2669438100675611, + "grad_norm": 1.9202542304992676, + "learning_rate": 4.171243033266694e-05, + "loss": 5.3912, + "step": 44885 + }, + { + "epoch": 0.26694975735084214, + "grad_norm": 1.9454153776168823, + "learning_rate": 4.171208294172674e-05, + "loss": 4.9861, + "step": 44886 + }, + { + "epoch": 0.26695570463412316, + "grad_norm": 1.922673225402832, + "learning_rate": 4.171173554495252e-05, + "loss": 5.0023, + "step": 44887 + }, + { + "epoch": 0.2669616519174041, + "grad_norm": 1.9921283721923828, + "learning_rate": 4.171138814234441e-05, + "loss": 4.638, + "step": 44888 + }, + { + "epoch": 0.26696759920068514, + "grad_norm": 1.742746114730835, + "learning_rate": 4.1711040733902526e-05, + "loss": 4.9514, + "step": 44889 + }, + { + "epoch": 0.26697354648396615, + "grad_norm": 1.7445249557495117, + "learning_rate": 4.171069331962698e-05, + "loss": 5.0904, + "step": 44890 + }, + { + "epoch": 0.2669794937672471, + "grad_norm": 1.6388905048370361, + "learning_rate": 4.171034589951791e-05, + "loss": 4.6503, + "step": 44891 + }, + { + "epoch": 0.2669854410505281, + "grad_norm": 1.6425681114196777, + "learning_rate": 4.170999847357544e-05, + "loss": 4.8481, + "step": 44892 + }, + { + "epoch": 0.26699138833380914, + "grad_norm": 1.6846109628677368, + "learning_rate": 4.170965104179967e-05, + "loss": 4.7117, + "step": 44893 + }, + { + "epoch": 0.2669973356170901, + "grad_norm": 1.5886638164520264, + "learning_rate": 4.170930360419074e-05, + "loss": 4.8335, + "step": 44894 + }, + { + "epoch": 0.2670032829003711, + "grad_norm": 1.5863268375396729, + "learning_rate": 4.1708956160748764e-05, + "loss": 4.9467, + "step": 44895 + }, + { + "epoch": 0.26700923018365214, + "grad_norm": 1.67328941822052, + "learning_rate": 4.170860871147386e-05, + "loss": 4.7729, + "step": 44896 + }, + { + "epoch": 0.2670151774669331, + "grad_norm": 1.6387516260147095, + "learning_rate": 4.170826125636615e-05, + "loss": 4.8288, + "step": 44897 + }, + { + "epoch": 0.2670211247502141, + "grad_norm": 1.561490535736084, + "learning_rate": 4.170791379542577e-05, + "loss": 4.9642, + "step": 44898 + }, + { + "epoch": 0.26702707203349507, + "grad_norm": 1.5701967477798462, + "learning_rate": 4.170756632865282e-05, + "loss": 4.7098, + "step": 44899 + }, + { + "epoch": 0.2670330193167761, + "grad_norm": 1.3398517370224, + "learning_rate": 4.1707218856047447e-05, + "loss": 4.8037, + "step": 44900 + }, + { + "epoch": 0.2670389666000571, + "grad_norm": 1.7106722593307495, + "learning_rate": 4.170687137760974e-05, + "loss": 5.4335, + "step": 44901 + }, + { + "epoch": 0.26704491388333806, + "grad_norm": 1.6243544816970825, + "learning_rate": 4.1706523893339846e-05, + "loss": 5.0461, + "step": 44902 + }, + { + "epoch": 0.2670508611666191, + "grad_norm": 1.3763346672058105, + "learning_rate": 4.170617640323787e-05, + "loss": 4.8406, + "step": 44903 + }, + { + "epoch": 0.2670568084499001, + "grad_norm": 1.6516683101654053, + "learning_rate": 4.170582890730394e-05, + "loss": 4.6725, + "step": 44904 + }, + { + "epoch": 0.26706275573318106, + "grad_norm": 1.5014539957046509, + "learning_rate": 4.170548140553818e-05, + "loss": 4.7096, + "step": 44905 + }, + { + "epoch": 0.2670687030164621, + "grad_norm": 1.4920170307159424, + "learning_rate": 4.170513389794072e-05, + "loss": 5.0861, + "step": 44906 + }, + { + "epoch": 0.2670746502997431, + "grad_norm": 1.426103949546814, + "learning_rate": 4.170478638451165e-05, + "loss": 5.1929, + "step": 44907 + }, + { + "epoch": 0.26708059758302405, + "grad_norm": 1.4118753671646118, + "learning_rate": 4.1704438865251126e-05, + "loss": 5.237, + "step": 44908 + }, + { + "epoch": 0.26708654486630506, + "grad_norm": 1.3313462734222412, + "learning_rate": 4.170409134015926e-05, + "loss": 5.086, + "step": 44909 + }, + { + "epoch": 0.2670924921495861, + "grad_norm": 1.4781039953231812, + "learning_rate": 4.170374380923615e-05, + "loss": 5.1318, + "step": 44910 + }, + { + "epoch": 0.26709843943286704, + "grad_norm": 1.5895652770996094, + "learning_rate": 4.170339627248195e-05, + "loss": 4.8332, + "step": 44911 + }, + { + "epoch": 0.26710438671614806, + "grad_norm": 2.138152837753296, + "learning_rate": 4.170304872989676e-05, + "loss": 5.3117, + "step": 44912 + }, + { + "epoch": 0.2671103339994291, + "grad_norm": 1.823987364768982, + "learning_rate": 4.170270118148071e-05, + "loss": 5.1843, + "step": 44913 + }, + { + "epoch": 0.26711628128271003, + "grad_norm": 1.5316396951675415, + "learning_rate": 4.170235362723392e-05, + "loss": 5.1787, + "step": 44914 + }, + { + "epoch": 0.26712222856599105, + "grad_norm": 1.479590654373169, + "learning_rate": 4.170200606715651e-05, + "loss": 5.3082, + "step": 44915 + }, + { + "epoch": 0.26712817584927206, + "grad_norm": 1.5397835969924927, + "learning_rate": 4.1701658501248605e-05, + "loss": 4.8396, + "step": 44916 + }, + { + "epoch": 0.267134123132553, + "grad_norm": 1.3746646642684937, + "learning_rate": 4.1701310929510315e-05, + "loss": 4.8159, + "step": 44917 + }, + { + "epoch": 0.26714007041583404, + "grad_norm": 1.5651111602783203, + "learning_rate": 4.170096335194178e-05, + "loss": 4.7319, + "step": 44918 + }, + { + "epoch": 0.26714601769911506, + "grad_norm": 1.6642135381698608, + "learning_rate": 4.170061576854311e-05, + "loss": 5.0725, + "step": 44919 + }, + { + "epoch": 0.267151964982396, + "grad_norm": 1.549047827720642, + "learning_rate": 4.1700268179314426e-05, + "loss": 5.0725, + "step": 44920 + }, + { + "epoch": 0.26715791226567703, + "grad_norm": 1.4577093124389648, + "learning_rate": 4.169992058425585e-05, + "loss": 4.9049, + "step": 44921 + }, + { + "epoch": 0.26716385954895805, + "grad_norm": 1.5805410146713257, + "learning_rate": 4.169957298336751e-05, + "loss": 4.7839, + "step": 44922 + }, + { + "epoch": 0.267169806832239, + "grad_norm": 1.3022712469100952, + "learning_rate": 4.169922537664951e-05, + "loss": 4.6275, + "step": 44923 + }, + { + "epoch": 0.26717575411552, + "grad_norm": 1.9110902547836304, + "learning_rate": 4.1698877764101996e-05, + "loss": 4.3402, + "step": 44924 + }, + { + "epoch": 0.26718170139880104, + "grad_norm": 2.217172622680664, + "learning_rate": 4.169853014572507e-05, + "loss": 3.8283, + "step": 44925 + }, + { + "epoch": 0.267187648682082, + "grad_norm": 2.4509329795837402, + "learning_rate": 4.169818252151885e-05, + "loss": 3.9137, + "step": 44926 + }, + { + "epoch": 0.267193595965363, + "grad_norm": 2.1873271465301514, + "learning_rate": 4.169783489148348e-05, + "loss": 3.8801, + "step": 44927 + }, + { + "epoch": 0.26719954324864403, + "grad_norm": 2.6063313484191895, + "learning_rate": 4.169748725561906e-05, + "loss": 3.7632, + "step": 44928 + }, + { + "epoch": 0.267205490531925, + "grad_norm": 2.389047622680664, + "learning_rate": 4.169713961392573e-05, + "loss": 3.6809, + "step": 44929 + }, + { + "epoch": 0.267211437815206, + "grad_norm": 2.4377310276031494, + "learning_rate": 4.169679196640359e-05, + "loss": 3.7699, + "step": 44930 + }, + { + "epoch": 0.267217385098487, + "grad_norm": 2.451704740524292, + "learning_rate": 4.169644431305278e-05, + "loss": 3.9257, + "step": 44931 + }, + { + "epoch": 0.267223332381768, + "grad_norm": 2.468982458114624, + "learning_rate": 4.169609665387341e-05, + "loss": 3.8099, + "step": 44932 + }, + { + "epoch": 0.267229279665049, + "grad_norm": 2.394261598587036, + "learning_rate": 4.169574898886561e-05, + "loss": 3.8413, + "step": 44933 + }, + { + "epoch": 0.26723522694833, + "grad_norm": 1.9629268646240234, + "learning_rate": 4.1695401318029494e-05, + "loss": 3.8539, + "step": 44934 + }, + { + "epoch": 0.267241174231611, + "grad_norm": 1.5220867395401, + "learning_rate": 4.169505364136518e-05, + "loss": 4.4678, + "step": 44935 + }, + { + "epoch": 0.267247121514892, + "grad_norm": 1.4336282014846802, + "learning_rate": 4.1694705958872805e-05, + "loss": 4.6436, + "step": 44936 + }, + { + "epoch": 0.267253068798173, + "grad_norm": 3.3870887756347656, + "learning_rate": 4.169435827055247e-05, + "loss": 3.2654, + "step": 44937 + }, + { + "epoch": 0.26725901608145397, + "grad_norm": 2.843445062637329, + "learning_rate": 4.169401057640431e-05, + "loss": 3.4761, + "step": 44938 + }, + { + "epoch": 0.267264963364735, + "grad_norm": 1.9298120737075806, + "learning_rate": 4.169366287642845e-05, + "loss": 4.6962, + "step": 44939 + }, + { + "epoch": 0.267270910648016, + "grad_norm": 1.617946982383728, + "learning_rate": 4.1693315170625e-05, + "loss": 5.0759, + "step": 44940 + }, + { + "epoch": 0.26727685793129696, + "grad_norm": 2.930448532104492, + "learning_rate": 4.1692967458994095e-05, + "loss": 4.2719, + "step": 44941 + }, + { + "epoch": 0.267282805214578, + "grad_norm": 2.3794949054718018, + "learning_rate": 4.1692619741535834e-05, + "loss": 4.6604, + "step": 44942 + }, + { + "epoch": 0.267288752497859, + "grad_norm": 1.6227281093597412, + "learning_rate": 4.169227201825036e-05, + "loss": 4.7597, + "step": 44943 + }, + { + "epoch": 0.26729469978113995, + "grad_norm": 1.5215003490447998, + "learning_rate": 4.1691924289137785e-05, + "loss": 4.773, + "step": 44944 + }, + { + "epoch": 0.26730064706442097, + "grad_norm": 1.516844630241394, + "learning_rate": 4.169157655419823e-05, + "loss": 4.7887, + "step": 44945 + }, + { + "epoch": 0.267306594347702, + "grad_norm": 1.5001994371414185, + "learning_rate": 4.1691228813431816e-05, + "loss": 4.3542, + "step": 44946 + }, + { + "epoch": 0.26731254163098295, + "grad_norm": 1.682621955871582, + "learning_rate": 4.1690881066838664e-05, + "loss": 4.798, + "step": 44947 + }, + { + "epoch": 0.26731848891426396, + "grad_norm": 1.6496332883834839, + "learning_rate": 4.169053331441891e-05, + "loss": 4.8795, + "step": 44948 + }, + { + "epoch": 0.267324436197545, + "grad_norm": 1.5416514873504639, + "learning_rate": 4.1690185556172657e-05, + "loss": 5.1838, + "step": 44949 + }, + { + "epoch": 0.26733038348082594, + "grad_norm": 1.789942979812622, + "learning_rate": 4.168983779210003e-05, + "loss": 4.6714, + "step": 44950 + }, + { + "epoch": 0.26733633076410696, + "grad_norm": 1.8521051406860352, + "learning_rate": 4.1689490022201154e-05, + "loss": 4.5464, + "step": 44951 + }, + { + "epoch": 0.26734227804738797, + "grad_norm": 3.3200771808624268, + "learning_rate": 4.1689142246476154e-05, + "loss": 1.8905, + "step": 44952 + }, + { + "epoch": 0.26734822533066893, + "grad_norm": 1.8375818729400635, + "learning_rate": 4.168879446492514e-05, + "loss": 3.5833, + "step": 44953 + }, + { + "epoch": 0.26735417261394995, + "grad_norm": 1.5513676404953003, + "learning_rate": 4.1688446677548246e-05, + "loss": 4.4305, + "step": 44954 + }, + { + "epoch": 0.26736011989723096, + "grad_norm": 1.5303900241851807, + "learning_rate": 4.168809888434558e-05, + "loss": 4.9038, + "step": 44955 + }, + { + "epoch": 0.2673660671805119, + "grad_norm": 1.695238471031189, + "learning_rate": 4.1687751085317275e-05, + "loss": 4.9308, + "step": 44956 + }, + { + "epoch": 0.26737201446379294, + "grad_norm": 1.7103848457336426, + "learning_rate": 4.1687403280463456e-05, + "loss": 4.8448, + "step": 44957 + }, + { + "epoch": 0.26737796174707396, + "grad_norm": 1.7584383487701416, + "learning_rate": 4.168705546978423e-05, + "loss": 4.7062, + "step": 44958 + }, + { + "epoch": 0.2673839090303549, + "grad_norm": 1.7030738592147827, + "learning_rate": 4.1686707653279725e-05, + "loss": 5.1925, + "step": 44959 + }, + { + "epoch": 0.26738985631363593, + "grad_norm": 1.7474168539047241, + "learning_rate": 4.1686359830950064e-05, + "loss": 5.2619, + "step": 44960 + }, + { + "epoch": 0.26739580359691695, + "grad_norm": 1.493651032447815, + "learning_rate": 4.168601200279536e-05, + "loss": 5.1767, + "step": 44961 + }, + { + "epoch": 0.2674017508801979, + "grad_norm": 1.4242385625839233, + "learning_rate": 4.1685664168815755e-05, + "loss": 5.249, + "step": 44962 + }, + { + "epoch": 0.2674076981634789, + "grad_norm": 1.598497986793518, + "learning_rate": 4.168531632901135e-05, + "loss": 5.0281, + "step": 44963 + }, + { + "epoch": 0.26741364544675994, + "grad_norm": 1.41771399974823, + "learning_rate": 4.168496848338227e-05, + "loss": 5.0774, + "step": 44964 + }, + { + "epoch": 0.2674195927300409, + "grad_norm": 1.4421809911727905, + "learning_rate": 4.168462063192865e-05, + "loss": 4.9884, + "step": 44965 + }, + { + "epoch": 0.2674255400133219, + "grad_norm": 1.573534607887268, + "learning_rate": 4.168427277465059e-05, + "loss": 5.0198, + "step": 44966 + }, + { + "epoch": 0.26743148729660293, + "grad_norm": 1.523478627204895, + "learning_rate": 4.1683924911548236e-05, + "loss": 5.0081, + "step": 44967 + }, + { + "epoch": 0.2674374345798839, + "grad_norm": 1.478969931602478, + "learning_rate": 4.168357704262168e-05, + "loss": 5.0395, + "step": 44968 + }, + { + "epoch": 0.2674433818631649, + "grad_norm": 1.6111054420471191, + "learning_rate": 4.1683229167871075e-05, + "loss": 4.9562, + "step": 44969 + }, + { + "epoch": 0.2674493291464459, + "grad_norm": 1.4306634664535522, + "learning_rate": 4.1682881287296515e-05, + "loss": 4.9678, + "step": 44970 + }, + { + "epoch": 0.2674552764297269, + "grad_norm": 1.417372703552246, + "learning_rate": 4.1682533400898136e-05, + "loss": 5.0219, + "step": 44971 + }, + { + "epoch": 0.2674612237130079, + "grad_norm": 1.5598911046981812, + "learning_rate": 4.1682185508676066e-05, + "loss": 5.0268, + "step": 44972 + }, + { + "epoch": 0.2674671709962889, + "grad_norm": 1.3261314630508423, + "learning_rate": 4.16818376106304e-05, + "loss": 4.9339, + "step": 44973 + }, + { + "epoch": 0.2674731182795699, + "grad_norm": 1.4829868078231812, + "learning_rate": 4.168148970676129e-05, + "loss": 4.9303, + "step": 44974 + }, + { + "epoch": 0.2674790655628509, + "grad_norm": 1.6004172563552856, + "learning_rate": 4.168114179706885e-05, + "loss": 4.8241, + "step": 44975 + }, + { + "epoch": 0.2674850128461319, + "grad_norm": 3.3796932697296143, + "learning_rate": 4.168079388155318e-05, + "loss": 1.727, + "step": 44976 + }, + { + "epoch": 0.26749096012941287, + "grad_norm": 3.4295766353607178, + "learning_rate": 4.168044596021442e-05, + "loss": 1.7883, + "step": 44977 + }, + { + "epoch": 0.2674969074126939, + "grad_norm": 3.2719385623931885, + "learning_rate": 4.16800980330527e-05, + "loss": 2.414, + "step": 44978 + }, + { + "epoch": 0.2675028546959749, + "grad_norm": 3.1819472312927246, + "learning_rate": 4.1679750100068124e-05, + "loss": 1.5773, + "step": 44979 + }, + { + "epoch": 0.26750880197925586, + "grad_norm": 2.26723575592041, + "learning_rate": 4.1679402161260814e-05, + "loss": 3.2616, + "step": 44980 + }, + { + "epoch": 0.2675147492625369, + "grad_norm": 3.1736738681793213, + "learning_rate": 4.1679054216630905e-05, + "loss": 1.7553, + "step": 44981 + }, + { + "epoch": 0.2675206965458179, + "grad_norm": 3.0920023918151855, + "learning_rate": 4.1678706266178505e-05, + "loss": 1.4577, + "step": 44982 + }, + { + "epoch": 0.26752664382909885, + "grad_norm": 3.494394063949585, + "learning_rate": 4.1678358309903744e-05, + "loss": 1.6962, + "step": 44983 + }, + { + "epoch": 0.26753259111237987, + "grad_norm": 3.5199012756347656, + "learning_rate": 4.167801034780675e-05, + "loss": 1.2153, + "step": 44984 + }, + { + "epoch": 0.2675385383956609, + "grad_norm": 3.1854586601257324, + "learning_rate": 4.1677662379887614e-05, + "loss": 1.6202, + "step": 44985 + }, + { + "epoch": 0.26754448567894185, + "grad_norm": 2.9949400424957275, + "learning_rate": 4.16773144061465e-05, + "loss": 1.2172, + "step": 44986 + }, + { + "epoch": 0.26755043296222286, + "grad_norm": 3.5968992710113525, + "learning_rate": 4.167696642658349e-05, + "loss": 1.5175, + "step": 44987 + }, + { + "epoch": 0.2675563802455039, + "grad_norm": 3.391664981842041, + "learning_rate": 4.1676618441198736e-05, + "loss": 1.9557, + "step": 44988 + }, + { + "epoch": 0.26756232752878484, + "grad_norm": 3.4057774543762207, + "learning_rate": 4.1676270449992336e-05, + "loss": 1.2582, + "step": 44989 + }, + { + "epoch": 0.26756827481206585, + "grad_norm": 3.234083414077759, + "learning_rate": 4.1675922452964427e-05, + "loss": 1.2907, + "step": 44990 + }, + { + "epoch": 0.26757422209534687, + "grad_norm": 2.98756742477417, + "learning_rate": 4.167557445011513e-05, + "loss": 1.4685, + "step": 44991 + }, + { + "epoch": 0.26758016937862783, + "grad_norm": 2.138429641723633, + "learning_rate": 4.167522644144455e-05, + "loss": 5.1168, + "step": 44992 + }, + { + "epoch": 0.26758611666190885, + "grad_norm": 1.7690861225128174, + "learning_rate": 4.1674878426952835e-05, + "loss": 4.878, + "step": 44993 + }, + { + "epoch": 0.26759206394518986, + "grad_norm": 1.8630417585372925, + "learning_rate": 4.1674530406640085e-05, + "loss": 4.6626, + "step": 44994 + }, + { + "epoch": 0.2675980112284708, + "grad_norm": 1.8691571950912476, + "learning_rate": 4.167418238050643e-05, + "loss": 4.302, + "step": 44995 + }, + { + "epoch": 0.26760395851175184, + "grad_norm": 1.701425552368164, + "learning_rate": 4.1673834348551985e-05, + "loss": 4.7983, + "step": 44996 + }, + { + "epoch": 0.26760990579503285, + "grad_norm": 1.627034068107605, + "learning_rate": 4.1673486310776885e-05, + "loss": 4.7047, + "step": 44997 + }, + { + "epoch": 0.2676158530783138, + "grad_norm": 2.047670602798462, + "learning_rate": 4.1673138267181246e-05, + "loss": 4.4429, + "step": 44998 + }, + { + "epoch": 0.26762180036159483, + "grad_norm": 2.4033613204956055, + "learning_rate": 4.1672790217765175e-05, + "loss": 4.5724, + "step": 44999 + }, + { + "epoch": 0.26762774764487585, + "grad_norm": 2.3485798835754395, + "learning_rate": 4.167244216252881e-05, + "loss": 4.53, + "step": 45000 + }, + { + "epoch": 0.2676336949281568, + "grad_norm": 2.0790064334869385, + "learning_rate": 4.167209410147227e-05, + "loss": 4.3759, + "step": 45001 + }, + { + "epoch": 0.2676396422114378, + "grad_norm": 2.1958518028259277, + "learning_rate": 4.1671746034595674e-05, + "loss": 4.4732, + "step": 45002 + }, + { + "epoch": 0.26764558949471884, + "grad_norm": 1.9634493589401245, + "learning_rate": 4.167139796189914e-05, + "loss": 4.4637, + "step": 45003 + }, + { + "epoch": 0.2676515367779998, + "grad_norm": 1.8961070775985718, + "learning_rate": 4.167104988338279e-05, + "loss": 4.0385, + "step": 45004 + }, + { + "epoch": 0.2676574840612808, + "grad_norm": 1.987813949584961, + "learning_rate": 4.1670701799046766e-05, + "loss": 4.1339, + "step": 45005 + }, + { + "epoch": 0.26766343134456183, + "grad_norm": 2.8310563564300537, + "learning_rate": 4.1670353708891155e-05, + "loss": 4.4947, + "step": 45006 + }, + { + "epoch": 0.2676693786278428, + "grad_norm": 1.9314924478530884, + "learning_rate": 4.167000561291611e-05, + "loss": 4.4835, + "step": 45007 + }, + { + "epoch": 0.2676753259111238, + "grad_norm": 1.904053807258606, + "learning_rate": 4.1669657511121724e-05, + "loss": 4.4244, + "step": 45008 + }, + { + "epoch": 0.2676812731944048, + "grad_norm": 2.471681833267212, + "learning_rate": 4.166930940350814e-05, + "loss": 3.9742, + "step": 45009 + }, + { + "epoch": 0.2676872204776858, + "grad_norm": 4.1948771476745605, + "learning_rate": 4.166896129007547e-05, + "loss": 3.5188, + "step": 45010 + }, + { + "epoch": 0.2676931677609668, + "grad_norm": 2.47522234916687, + "learning_rate": 4.1668613170823837e-05, + "loss": 3.3331, + "step": 45011 + }, + { + "epoch": 0.2676991150442478, + "grad_norm": 1.805282473564148, + "learning_rate": 4.1668265045753365e-05, + "loss": 4.7405, + "step": 45012 + }, + { + "epoch": 0.2677050623275288, + "grad_norm": 2.1411356925964355, + "learning_rate": 4.166791691486417e-05, + "loss": 4.3898, + "step": 45013 + }, + { + "epoch": 0.2677110096108098, + "grad_norm": 1.7209608554840088, + "learning_rate": 4.166756877815639e-05, + "loss": 4.8457, + "step": 45014 + }, + { + "epoch": 0.26771695689409075, + "grad_norm": 1.6909042596817017, + "learning_rate": 4.166722063563012e-05, + "loss": 4.8121, + "step": 45015 + }, + { + "epoch": 0.26772290417737177, + "grad_norm": 1.6689767837524414, + "learning_rate": 4.16668724872855e-05, + "loss": 4.4581, + "step": 45016 + }, + { + "epoch": 0.2677288514606528, + "grad_norm": 2.768547296524048, + "learning_rate": 4.1666524333122646e-05, + "loss": 3.154, + "step": 45017 + }, + { + "epoch": 0.26773479874393374, + "grad_norm": 2.2010326385498047, + "learning_rate": 4.1666176173141685e-05, + "loss": 3.316, + "step": 45018 + }, + { + "epoch": 0.26774074602721476, + "grad_norm": 1.7558506727218628, + "learning_rate": 4.166582800734273e-05, + "loss": 3.9394, + "step": 45019 + }, + { + "epoch": 0.2677466933104958, + "grad_norm": 1.6086955070495605, + "learning_rate": 4.166547983572591e-05, + "loss": 5.0878, + "step": 45020 + }, + { + "epoch": 0.26775264059377674, + "grad_norm": 1.6220130920410156, + "learning_rate": 4.166513165829135e-05, + "loss": 5.3992, + "step": 45021 + }, + { + "epoch": 0.26775858787705775, + "grad_norm": 1.7909603118896484, + "learning_rate": 4.166478347503915e-05, + "loss": 4.6841, + "step": 45022 + }, + { + "epoch": 0.26776453516033877, + "grad_norm": 1.6866240501403809, + "learning_rate": 4.166443528596945e-05, + "loss": 4.7818, + "step": 45023 + }, + { + "epoch": 0.26777048244361973, + "grad_norm": 1.9061051607131958, + "learning_rate": 4.166408709108237e-05, + "loss": 5.0573, + "step": 45024 + }, + { + "epoch": 0.26777642972690074, + "grad_norm": 3.528120994567871, + "learning_rate": 4.166373889037803e-05, + "loss": 5.5464, + "step": 45025 + }, + { + "epoch": 0.26778237701018176, + "grad_norm": 2.5489025115966797, + "learning_rate": 4.1663390683856554e-05, + "loss": 4.9241, + "step": 45026 + }, + { + "epoch": 0.2677883242934627, + "grad_norm": 1.766144871711731, + "learning_rate": 4.166304247151805e-05, + "loss": 4.1579, + "step": 45027 + }, + { + "epoch": 0.26779427157674374, + "grad_norm": 1.5608155727386475, + "learning_rate": 4.166269425336266e-05, + "loss": 4.3653, + "step": 45028 + }, + { + "epoch": 0.26780021886002475, + "grad_norm": 1.4890693426132202, + "learning_rate": 4.1662346029390486e-05, + "loss": 4.28, + "step": 45029 + }, + { + "epoch": 0.2678061661433057, + "grad_norm": 1.5218217372894287, + "learning_rate": 4.166199779960167e-05, + "loss": 4.5902, + "step": 45030 + }, + { + "epoch": 0.26781211342658673, + "grad_norm": 1.532460331916809, + "learning_rate": 4.166164956399632e-05, + "loss": 4.646, + "step": 45031 + }, + { + "epoch": 0.26781806070986774, + "grad_norm": 1.621333360671997, + "learning_rate": 4.166130132257455e-05, + "loss": 4.4141, + "step": 45032 + }, + { + "epoch": 0.2678240079931487, + "grad_norm": 2.0247929096221924, + "learning_rate": 4.1660953075336504e-05, + "loss": 3.8083, + "step": 45033 + }, + { + "epoch": 0.2678299552764297, + "grad_norm": 2.024416446685791, + "learning_rate": 4.166060482228229e-05, + "loss": 3.9879, + "step": 45034 + }, + { + "epoch": 0.26783590255971074, + "grad_norm": 1.9664689302444458, + "learning_rate": 4.166025656341203e-05, + "loss": 3.8626, + "step": 45035 + }, + { + "epoch": 0.2678418498429917, + "grad_norm": 1.3970807790756226, + "learning_rate": 4.165990829872584e-05, + "loss": 4.9432, + "step": 45036 + }, + { + "epoch": 0.2678477971262727, + "grad_norm": 1.4867807626724243, + "learning_rate": 4.165956002822386e-05, + "loss": 4.4359, + "step": 45037 + }, + { + "epoch": 0.26785374440955373, + "grad_norm": 1.549329400062561, + "learning_rate": 4.165921175190619e-05, + "loss": 4.822, + "step": 45038 + }, + { + "epoch": 0.2678596916928347, + "grad_norm": 1.658059000968933, + "learning_rate": 4.1658863469772964e-05, + "loss": 4.571, + "step": 45039 + }, + { + "epoch": 0.2678656389761157, + "grad_norm": 1.9523513317108154, + "learning_rate": 4.1658515181824305e-05, + "loss": 3.5395, + "step": 45040 + }, + { + "epoch": 0.2678715862593967, + "grad_norm": 1.8113435506820679, + "learning_rate": 4.1658166888060324e-05, + "loss": 4.0721, + "step": 45041 + }, + { + "epoch": 0.2678775335426777, + "grad_norm": 1.6775193214416504, + "learning_rate": 4.165781858848115e-05, + "loss": 4.205, + "step": 45042 + }, + { + "epoch": 0.2678834808259587, + "grad_norm": 1.634573221206665, + "learning_rate": 4.1657470283086905e-05, + "loss": 4.3155, + "step": 45043 + }, + { + "epoch": 0.2678894281092397, + "grad_norm": 2.17527174949646, + "learning_rate": 4.165712197187771e-05, + "loss": 4.9016, + "step": 45044 + }, + { + "epoch": 0.2678953753925207, + "grad_norm": 1.873072624206543, + "learning_rate": 4.1656773654853684e-05, + "loss": 4.7519, + "step": 45045 + }, + { + "epoch": 0.2679013226758017, + "grad_norm": 1.9081487655639648, + "learning_rate": 4.165642533201496e-05, + "loss": 4.7215, + "step": 45046 + }, + { + "epoch": 0.2679072699590827, + "grad_norm": 1.6898850202560425, + "learning_rate": 4.1656077003361636e-05, + "loss": 4.605, + "step": 45047 + }, + { + "epoch": 0.26791321724236367, + "grad_norm": 1.6649774312973022, + "learning_rate": 4.165572866889385e-05, + "loss": 4.7733, + "step": 45048 + }, + { + "epoch": 0.2679191645256447, + "grad_norm": 1.48060142993927, + "learning_rate": 4.1655380328611724e-05, + "loss": 4.8459, + "step": 45049 + }, + { + "epoch": 0.2679251118089257, + "grad_norm": 1.4947261810302734, + "learning_rate": 4.165503198251538e-05, + "loss": 4.7293, + "step": 45050 + }, + { + "epoch": 0.26793105909220666, + "grad_norm": 1.5990488529205322, + "learning_rate": 4.1654683630604936e-05, + "loss": 4.4529, + "step": 45051 + }, + { + "epoch": 0.2679370063754877, + "grad_norm": 1.7723937034606934, + "learning_rate": 4.165433527288051e-05, + "loss": 4.4941, + "step": 45052 + }, + { + "epoch": 0.2679429536587687, + "grad_norm": 1.95448637008667, + "learning_rate": 4.165398690934223e-05, + "loss": 4.0273, + "step": 45053 + }, + { + "epoch": 0.26794890094204965, + "grad_norm": 2.4019408226013184, + "learning_rate": 4.1653638539990213e-05, + "loss": 2.9014, + "step": 45054 + }, + { + "epoch": 0.26795484822533067, + "grad_norm": 2.5110535621643066, + "learning_rate": 4.165329016482459e-05, + "loss": 2.9284, + "step": 45055 + }, + { + "epoch": 0.2679607955086117, + "grad_norm": 2.471153736114502, + "learning_rate": 4.165294178384547e-05, + "loss": 2.8639, + "step": 45056 + }, + { + "epoch": 0.26796674279189264, + "grad_norm": 2.939606189727783, + "learning_rate": 4.165259339705297e-05, + "loss": 2.6203, + "step": 45057 + }, + { + "epoch": 0.26797269007517366, + "grad_norm": 1.5852913856506348, + "learning_rate": 4.165224500444724e-05, + "loss": 5.0977, + "step": 45058 + }, + { + "epoch": 0.2679786373584547, + "grad_norm": 1.5449445247650146, + "learning_rate": 4.165189660602837e-05, + "loss": 5.4967, + "step": 45059 + }, + { + "epoch": 0.26798458464173563, + "grad_norm": 1.9472177028656006, + "learning_rate": 4.1651548201796496e-05, + "loss": 4.4273, + "step": 45060 + }, + { + "epoch": 0.26799053192501665, + "grad_norm": 1.917468547821045, + "learning_rate": 4.165119979175175e-05, + "loss": 4.3074, + "step": 45061 + }, + { + "epoch": 0.26799647920829767, + "grad_norm": 1.9383246898651123, + "learning_rate": 4.1650851375894225e-05, + "loss": 4.3159, + "step": 45062 + }, + { + "epoch": 0.2680024264915786, + "grad_norm": 1.9988701343536377, + "learning_rate": 4.165050295422408e-05, + "loss": 4.2502, + "step": 45063 + }, + { + "epoch": 0.26800837377485964, + "grad_norm": 1.9182997941970825, + "learning_rate": 4.16501545267414e-05, + "loss": 4.2524, + "step": 45064 + }, + { + "epoch": 0.26801432105814066, + "grad_norm": 2.057504415512085, + "learning_rate": 4.1649806093446334e-05, + "loss": 4.3736, + "step": 45065 + }, + { + "epoch": 0.2680202683414216, + "grad_norm": 2.195603609085083, + "learning_rate": 4.164945765433899e-05, + "loss": 4.2957, + "step": 45066 + }, + { + "epoch": 0.26802621562470264, + "grad_norm": 1.928267002105713, + "learning_rate": 4.164910920941949e-05, + "loss": 4.3625, + "step": 45067 + }, + { + "epoch": 0.26803216290798365, + "grad_norm": 1.6979244947433472, + "learning_rate": 4.164876075868796e-05, + "loss": 4.2724, + "step": 45068 + }, + { + "epoch": 0.2680381101912646, + "grad_norm": 2.136141538619995, + "learning_rate": 4.164841230214451e-05, + "loss": 4.2537, + "step": 45069 + }, + { + "epoch": 0.2680440574745456, + "grad_norm": 1.994243860244751, + "learning_rate": 4.164806383978929e-05, + "loss": 4.6613, + "step": 45070 + }, + { + "epoch": 0.26805000475782664, + "grad_norm": 1.8712176084518433, + "learning_rate": 4.164771537162238e-05, + "loss": 4.6407, + "step": 45071 + }, + { + "epoch": 0.2680559520411076, + "grad_norm": 2.7467265129089355, + "learning_rate": 4.1647366897643936e-05, + "loss": 3.9354, + "step": 45072 + }, + { + "epoch": 0.2680618993243886, + "grad_norm": 1.6572506427764893, + "learning_rate": 4.164701841785407e-05, + "loss": 4.7691, + "step": 45073 + }, + { + "epoch": 0.26806784660766964, + "grad_norm": 1.839673638343811, + "learning_rate": 4.16466699322529e-05, + "loss": 4.5501, + "step": 45074 + }, + { + "epoch": 0.2680737938909506, + "grad_norm": 1.6280244588851929, + "learning_rate": 4.164632144084055e-05, + "loss": 4.6585, + "step": 45075 + }, + { + "epoch": 0.2680797411742316, + "grad_norm": 1.6525644063949585, + "learning_rate": 4.164597294361714e-05, + "loss": 4.493, + "step": 45076 + }, + { + "epoch": 0.2680856884575126, + "grad_norm": 1.8435158729553223, + "learning_rate": 4.16456244405828e-05, + "loss": 4.9857, + "step": 45077 + }, + { + "epoch": 0.2680916357407936, + "grad_norm": 1.6794331073760986, + "learning_rate": 4.1645275931737635e-05, + "loss": 4.6822, + "step": 45078 + }, + { + "epoch": 0.2680975830240746, + "grad_norm": 3.114895820617676, + "learning_rate": 4.164492741708178e-05, + "loss": 2.2549, + "step": 45079 + }, + { + "epoch": 0.2681035303073556, + "grad_norm": 3.6029012203216553, + "learning_rate": 4.1644578896615346e-05, + "loss": 1.7492, + "step": 45080 + }, + { + "epoch": 0.2681094775906366, + "grad_norm": 3.749537706375122, + "learning_rate": 4.164423037033847e-05, + "loss": 1.4617, + "step": 45081 + }, + { + "epoch": 0.2681154248739176, + "grad_norm": 3.5417327880859375, + "learning_rate": 4.1643881838251266e-05, + "loss": 1.6569, + "step": 45082 + }, + { + "epoch": 0.2681213721571986, + "grad_norm": 3.3716530799865723, + "learning_rate": 4.164353330035386e-05, + "loss": 1.6263, + "step": 45083 + }, + { + "epoch": 0.2681273194404796, + "grad_norm": 3.5294878482818604, + "learning_rate": 4.164318475664635e-05, + "loss": 1.6589, + "step": 45084 + }, + { + "epoch": 0.2681332667237606, + "grad_norm": 2.948202133178711, + "learning_rate": 4.164283620712889e-05, + "loss": 2.1344, + "step": 45085 + }, + { + "epoch": 0.2681392140070416, + "grad_norm": 1.7405534982681274, + "learning_rate": 4.164248765180158e-05, + "loss": 4.4605, + "step": 45086 + }, + { + "epoch": 0.26814516129032256, + "grad_norm": 2.0014660358428955, + "learning_rate": 4.1642139090664556e-05, + "loss": 4.4436, + "step": 45087 + }, + { + "epoch": 0.2681511085736036, + "grad_norm": 2.276367664337158, + "learning_rate": 4.1641790523717935e-05, + "loss": 4.4976, + "step": 45088 + }, + { + "epoch": 0.2681570558568846, + "grad_norm": 1.7605714797973633, + "learning_rate": 4.1641441950961826e-05, + "loss": 4.977, + "step": 45089 + }, + { + "epoch": 0.26816300314016556, + "grad_norm": 1.7236353158950806, + "learning_rate": 4.164109337239637e-05, + "loss": 5.0177, + "step": 45090 + }, + { + "epoch": 0.2681689504234466, + "grad_norm": 1.62550950050354, + "learning_rate": 4.164074478802168e-05, + "loss": 4.6764, + "step": 45091 + }, + { + "epoch": 0.2681748977067276, + "grad_norm": 1.7095445394515991, + "learning_rate": 4.164039619783788e-05, + "loss": 4.814, + "step": 45092 + }, + { + "epoch": 0.26818084499000855, + "grad_norm": 1.6945042610168457, + "learning_rate": 4.164004760184508e-05, + "loss": 4.5017, + "step": 45093 + }, + { + "epoch": 0.26818679227328956, + "grad_norm": 1.4827356338500977, + "learning_rate": 4.163969900004342e-05, + "loss": 4.9723, + "step": 45094 + }, + { + "epoch": 0.2681927395565706, + "grad_norm": 1.5020477771759033, + "learning_rate": 4.163935039243301e-05, + "loss": 4.8828, + "step": 45095 + }, + { + "epoch": 0.26819868683985154, + "grad_norm": 1.1833282709121704, + "learning_rate": 4.1639001779013974e-05, + "loss": 4.8189, + "step": 45096 + }, + { + "epoch": 0.26820463412313256, + "grad_norm": 1.3105418682098389, + "learning_rate": 4.163865315978643e-05, + "loss": 4.8858, + "step": 45097 + }, + { + "epoch": 0.2682105814064136, + "grad_norm": 1.5570876598358154, + "learning_rate": 4.1638304534750516e-05, + "loss": 4.9179, + "step": 45098 + }, + { + "epoch": 0.26821652868969453, + "grad_norm": 1.7335184812545776, + "learning_rate": 4.163795590390633e-05, + "loss": 4.4151, + "step": 45099 + }, + { + "epoch": 0.26822247597297555, + "grad_norm": 1.7214198112487793, + "learning_rate": 4.163760726725402e-05, + "loss": 4.9482, + "step": 45100 + }, + { + "epoch": 0.26822842325625657, + "grad_norm": 1.4318100214004517, + "learning_rate": 4.163725862479368e-05, + "loss": 4.8601, + "step": 45101 + }, + { + "epoch": 0.2682343705395375, + "grad_norm": 1.4414254426956177, + "learning_rate": 4.1636909976525446e-05, + "loss": 4.9021, + "step": 45102 + }, + { + "epoch": 0.26824031782281854, + "grad_norm": 1.5736068487167358, + "learning_rate": 4.1636561322449445e-05, + "loss": 5.047, + "step": 45103 + }, + { + "epoch": 0.26824626510609956, + "grad_norm": 3.1472246646881104, + "learning_rate": 4.1636212662565786e-05, + "loss": 4.1098, + "step": 45104 + }, + { + "epoch": 0.2682522123893805, + "grad_norm": 4.526726245880127, + "learning_rate": 4.1635863996874605e-05, + "loss": 1.9441, + "step": 45105 + }, + { + "epoch": 0.26825815967266153, + "grad_norm": 3.9022672176361084, + "learning_rate": 4.163551532537601e-05, + "loss": 1.9128, + "step": 45106 + }, + { + "epoch": 0.26826410695594255, + "grad_norm": 3.7294249534606934, + "learning_rate": 4.163516664807012e-05, + "loss": 1.6389, + "step": 45107 + }, + { + "epoch": 0.2682700542392235, + "grad_norm": 3.7149007320404053, + "learning_rate": 4.163481796495708e-05, + "loss": 1.8109, + "step": 45108 + }, + { + "epoch": 0.2682760015225045, + "grad_norm": 3.503451108932495, + "learning_rate": 4.163446927603699e-05, + "loss": 1.8438, + "step": 45109 + }, + { + "epoch": 0.26828194880578554, + "grad_norm": 3.392585515975952, + "learning_rate": 4.163412058130998e-05, + "loss": 1.7471, + "step": 45110 + }, + { + "epoch": 0.2682878960890665, + "grad_norm": 3.269810914993286, + "learning_rate": 4.1633771880776173e-05, + "loss": 1.8286, + "step": 45111 + }, + { + "epoch": 0.2682938433723475, + "grad_norm": 2.728454351425171, + "learning_rate": 4.1633423174435685e-05, + "loss": 3.0632, + "step": 45112 + }, + { + "epoch": 0.26829979065562853, + "grad_norm": 1.7180217504501343, + "learning_rate": 4.163307446228864e-05, + "loss": 4.5924, + "step": 45113 + }, + { + "epoch": 0.2683057379389095, + "grad_norm": 1.7121518850326538, + "learning_rate": 4.1632725744335156e-05, + "loss": 4.8235, + "step": 45114 + }, + { + "epoch": 0.2683116852221905, + "grad_norm": 1.9331588745117188, + "learning_rate": 4.1632377020575374e-05, + "loss": 4.7685, + "step": 45115 + }, + { + "epoch": 0.2683176325054715, + "grad_norm": 1.3814637660980225, + "learning_rate": 4.1632028291009384e-05, + "loss": 4.5754, + "step": 45116 + }, + { + "epoch": 0.2683235797887525, + "grad_norm": 1.5759391784667969, + "learning_rate": 4.1631679555637336e-05, + "loss": 4.6105, + "step": 45117 + }, + { + "epoch": 0.2683295270720335, + "grad_norm": 1.4795459508895874, + "learning_rate": 4.163133081445934e-05, + "loss": 4.7206, + "step": 45118 + }, + { + "epoch": 0.2683354743553145, + "grad_norm": 1.9243206977844238, + "learning_rate": 4.163098206747551e-05, + "loss": 4.836, + "step": 45119 + }, + { + "epoch": 0.2683414216385955, + "grad_norm": 1.5225938558578491, + "learning_rate": 4.1630633314685985e-05, + "loss": 4.8658, + "step": 45120 + }, + { + "epoch": 0.2683473689218765, + "grad_norm": 1.4583731889724731, + "learning_rate": 4.163028455609087e-05, + "loss": 4.4844, + "step": 45121 + }, + { + "epoch": 0.2683533162051575, + "grad_norm": 2.7442190647125244, + "learning_rate": 4.16299357916903e-05, + "loss": 3.4207, + "step": 45122 + }, + { + "epoch": 0.26835926348843847, + "grad_norm": 2.1740164756774902, + "learning_rate": 4.162958702148439e-05, + "loss": 3.4622, + "step": 45123 + }, + { + "epoch": 0.2683652107717195, + "grad_norm": 1.9711397886276245, + "learning_rate": 4.162923824547327e-05, + "loss": 3.9153, + "step": 45124 + }, + { + "epoch": 0.2683711580550005, + "grad_norm": 2.49839186668396, + "learning_rate": 4.162888946365704e-05, + "loss": 4.1174, + "step": 45125 + }, + { + "epoch": 0.26837710533828146, + "grad_norm": 1.902565598487854, + "learning_rate": 4.1628540676035846e-05, + "loss": 4.073, + "step": 45126 + }, + { + "epoch": 0.2683830526215625, + "grad_norm": 1.7201104164123535, + "learning_rate": 4.16281918826098e-05, + "loss": 4.3122, + "step": 45127 + }, + { + "epoch": 0.2683889999048435, + "grad_norm": 1.6342188119888306, + "learning_rate": 4.1627843083379015e-05, + "loss": 4.8937, + "step": 45128 + }, + { + "epoch": 0.26839494718812446, + "grad_norm": 1.9282046556472778, + "learning_rate": 4.162749427834363e-05, + "loss": 5.1048, + "step": 45129 + }, + { + "epoch": 0.26840089447140547, + "grad_norm": 1.4654732942581177, + "learning_rate": 4.162714546750376e-05, + "loss": 4.804, + "step": 45130 + }, + { + "epoch": 0.26840684175468643, + "grad_norm": 1.4040499925613403, + "learning_rate": 4.162679665085952e-05, + "loss": 5.1178, + "step": 45131 + }, + { + "epoch": 0.26841278903796745, + "grad_norm": 1.4663563966751099, + "learning_rate": 4.1626447828411043e-05, + "loss": 4.8456, + "step": 45132 + }, + { + "epoch": 0.26841873632124846, + "grad_norm": 1.4744921922683716, + "learning_rate": 4.1626099000158444e-05, + "loss": 4.8564, + "step": 45133 + }, + { + "epoch": 0.2684246836045294, + "grad_norm": 1.5892945528030396, + "learning_rate": 4.162575016610184e-05, + "loss": 5.0841, + "step": 45134 + }, + { + "epoch": 0.26843063088781044, + "grad_norm": 2.02508807182312, + "learning_rate": 4.1625401326241366e-05, + "loss": 5.4942, + "step": 45135 + }, + { + "epoch": 0.26843657817109146, + "grad_norm": 1.6238635778427124, + "learning_rate": 4.162505248057713e-05, + "loss": 4.5491, + "step": 45136 + }, + { + "epoch": 0.2684425254543724, + "grad_norm": 1.49809730052948, + "learning_rate": 4.162470362910926e-05, + "loss": 4.9328, + "step": 45137 + }, + { + "epoch": 0.26844847273765343, + "grad_norm": 1.522220253944397, + "learning_rate": 4.1624354771837884e-05, + "loss": 4.8736, + "step": 45138 + }, + { + "epoch": 0.26845442002093445, + "grad_norm": 1.4564679861068726, + "learning_rate": 4.162400590876311e-05, + "loss": 5.4119, + "step": 45139 + }, + { + "epoch": 0.2684603673042154, + "grad_norm": 1.5990132093429565, + "learning_rate": 4.162365703988507e-05, + "loss": 4.9567, + "step": 45140 + }, + { + "epoch": 0.2684663145874964, + "grad_norm": 1.5001357793807983, + "learning_rate": 4.162330816520389e-05, + "loss": 5.0234, + "step": 45141 + }, + { + "epoch": 0.26847226187077744, + "grad_norm": 1.5393893718719482, + "learning_rate": 4.162295928471967e-05, + "loss": 5.2641, + "step": 45142 + }, + { + "epoch": 0.2684782091540584, + "grad_norm": 1.3494468927383423, + "learning_rate": 4.162261039843255e-05, + "loss": 5.5239, + "step": 45143 + }, + { + "epoch": 0.2684841564373394, + "grad_norm": 1.4378764629364014, + "learning_rate": 4.1622261506342665e-05, + "loss": 5.0958, + "step": 45144 + }, + { + "epoch": 0.26849010372062043, + "grad_norm": 1.517665982246399, + "learning_rate": 4.16219126084501e-05, + "loss": 4.9106, + "step": 45145 + }, + { + "epoch": 0.2684960510039014, + "grad_norm": 1.709091305732727, + "learning_rate": 4.162156370475501e-05, + "loss": 4.8077, + "step": 45146 + }, + { + "epoch": 0.2685019982871824, + "grad_norm": 1.5005824565887451, + "learning_rate": 4.16212147952575e-05, + "loss": 5.1425, + "step": 45147 + }, + { + "epoch": 0.2685079455704634, + "grad_norm": 1.705865502357483, + "learning_rate": 4.162086587995769e-05, + "loss": 4.8899, + "step": 45148 + }, + { + "epoch": 0.2685138928537444, + "grad_norm": 1.7069107294082642, + "learning_rate": 4.162051695885571e-05, + "loss": 4.5867, + "step": 45149 + }, + { + "epoch": 0.2685198401370254, + "grad_norm": 1.6658052206039429, + "learning_rate": 4.1620168031951686e-05, + "loss": 4.6717, + "step": 45150 + }, + { + "epoch": 0.2685257874203064, + "grad_norm": 1.5296261310577393, + "learning_rate": 4.161981909924573e-05, + "loss": 4.6791, + "step": 45151 + }, + { + "epoch": 0.2685317347035874, + "grad_norm": 1.3490979671478271, + "learning_rate": 4.161947016073796e-05, + "loss": 4.9549, + "step": 45152 + }, + { + "epoch": 0.2685376819868684, + "grad_norm": 1.5525199174880981, + "learning_rate": 4.161912121642851e-05, + "loss": 4.7118, + "step": 45153 + }, + { + "epoch": 0.2685436292701494, + "grad_norm": 1.6430385112762451, + "learning_rate": 4.1618772266317505e-05, + "loss": 4.5586, + "step": 45154 + }, + { + "epoch": 0.26854957655343037, + "grad_norm": 1.5711143016815186, + "learning_rate": 4.1618423310405046e-05, + "loss": 4.6321, + "step": 45155 + }, + { + "epoch": 0.2685555238367114, + "grad_norm": 1.7416385412216187, + "learning_rate": 4.161807434869127e-05, + "loss": 4.8503, + "step": 45156 + }, + { + "epoch": 0.2685614711199924, + "grad_norm": 1.5509675741195679, + "learning_rate": 4.16177253811763e-05, + "loss": 4.7719, + "step": 45157 + }, + { + "epoch": 0.26856741840327336, + "grad_norm": 1.7978019714355469, + "learning_rate": 4.1617376407860255e-05, + "loss": 5.1219, + "step": 45158 + }, + { + "epoch": 0.2685733656865544, + "grad_norm": 1.7109897136688232, + "learning_rate": 4.161702742874325e-05, + "loss": 5.2575, + "step": 45159 + }, + { + "epoch": 0.2685793129698354, + "grad_norm": 1.7822604179382324, + "learning_rate": 4.1616678443825414e-05, + "loss": 4.9172, + "step": 45160 + }, + { + "epoch": 0.26858526025311635, + "grad_norm": 1.688461184501648, + "learning_rate": 4.161632945310686e-05, + "loss": 4.8575, + "step": 45161 + }, + { + "epoch": 0.26859120753639737, + "grad_norm": 1.667033076286316, + "learning_rate": 4.161598045658773e-05, + "loss": 4.8626, + "step": 45162 + }, + { + "epoch": 0.2685971548196784, + "grad_norm": 1.664591908454895, + "learning_rate": 4.161563145426812e-05, + "loss": 4.8664, + "step": 45163 + }, + { + "epoch": 0.26860310210295935, + "grad_norm": 1.6867667436599731, + "learning_rate": 4.1615282446148176e-05, + "loss": 5.204, + "step": 45164 + }, + { + "epoch": 0.26860904938624036, + "grad_norm": 1.9987390041351318, + "learning_rate": 4.161493343222801e-05, + "loss": 4.7482, + "step": 45165 + }, + { + "epoch": 0.2686149966695214, + "grad_norm": 1.720126986503601, + "learning_rate": 4.161458441250773e-05, + "loss": 5.0324, + "step": 45166 + }, + { + "epoch": 0.26862094395280234, + "grad_norm": 2.121156692504883, + "learning_rate": 4.1614235386987474e-05, + "loss": 4.4114, + "step": 45167 + }, + { + "epoch": 0.26862689123608335, + "grad_norm": 3.0883936882019043, + "learning_rate": 4.161388635566737e-05, + "loss": 3.2082, + "step": 45168 + }, + { + "epoch": 0.26863283851936437, + "grad_norm": 3.620790958404541, + "learning_rate": 4.161353731854752e-05, + "loss": 2.9188, + "step": 45169 + }, + { + "epoch": 0.26863878580264533, + "grad_norm": 3.200678586959839, + "learning_rate": 4.1613188275628055e-05, + "loss": 2.558, + "step": 45170 + }, + { + "epoch": 0.26864473308592635, + "grad_norm": 2.9885995388031006, + "learning_rate": 4.1612839226909104e-05, + "loss": 2.4443, + "step": 45171 + }, + { + "epoch": 0.26865068036920736, + "grad_norm": 2.4678642749786377, + "learning_rate": 4.161249017239078e-05, + "loss": 3.5268, + "step": 45172 + }, + { + "epoch": 0.2686566276524883, + "grad_norm": 1.510361671447754, + "learning_rate": 4.161214111207321e-05, + "loss": 4.8531, + "step": 45173 + }, + { + "epoch": 0.26866257493576934, + "grad_norm": 1.884350061416626, + "learning_rate": 4.161179204595651e-05, + "loss": 4.1152, + "step": 45174 + }, + { + "epoch": 0.26866852221905035, + "grad_norm": 2.2617175579071045, + "learning_rate": 4.16114429740408e-05, + "loss": 3.618, + "step": 45175 + }, + { + "epoch": 0.2686744695023313, + "grad_norm": 2.0552704334259033, + "learning_rate": 4.161109389632621e-05, + "loss": 3.3861, + "step": 45176 + }, + { + "epoch": 0.26868041678561233, + "grad_norm": 2.2621490955352783, + "learning_rate": 4.161074481281287e-05, + "loss": 3.421, + "step": 45177 + }, + { + "epoch": 0.26868636406889335, + "grad_norm": 1.7225425243377686, + "learning_rate": 4.161039572350088e-05, + "loss": 4.5136, + "step": 45178 + }, + { + "epoch": 0.2686923113521743, + "grad_norm": 1.7635231018066406, + "learning_rate": 4.1610046628390374e-05, + "loss": 4.7967, + "step": 45179 + }, + { + "epoch": 0.2686982586354553, + "grad_norm": 2.1247239112854004, + "learning_rate": 4.1609697527481475e-05, + "loss": 4.6168, + "step": 45180 + }, + { + "epoch": 0.26870420591873634, + "grad_norm": 2.2815146446228027, + "learning_rate": 4.1609348420774295e-05, + "loss": 3.2609, + "step": 45181 + }, + { + "epoch": 0.2687101532020173, + "grad_norm": 2.157419204711914, + "learning_rate": 4.1608999308268966e-05, + "loss": 3.4968, + "step": 45182 + }, + { + "epoch": 0.2687161004852983, + "grad_norm": 1.995869517326355, + "learning_rate": 4.1608650189965614e-05, + "loss": 3.2542, + "step": 45183 + }, + { + "epoch": 0.26872204776857933, + "grad_norm": 2.107966184616089, + "learning_rate": 4.160830106586434e-05, + "loss": 3.4266, + "step": 45184 + }, + { + "epoch": 0.2687279950518603, + "grad_norm": 2.2513065338134766, + "learning_rate": 4.160795193596529e-05, + "loss": 3.1297, + "step": 45185 + }, + { + "epoch": 0.2687339423351413, + "grad_norm": 2.3252804279327393, + "learning_rate": 4.160760280026857e-05, + "loss": 3.0666, + "step": 45186 + }, + { + "epoch": 0.2687398896184223, + "grad_norm": 2.3385813236236572, + "learning_rate": 4.1607253658774313e-05, + "loss": 3.0852, + "step": 45187 + }, + { + "epoch": 0.2687458369017033, + "grad_norm": 2.310908555984497, + "learning_rate": 4.1606904511482635e-05, + "loss": 3.0599, + "step": 45188 + }, + { + "epoch": 0.2687517841849843, + "grad_norm": 2.239288330078125, + "learning_rate": 4.160655535839366e-05, + "loss": 3.0082, + "step": 45189 + }, + { + "epoch": 0.2687577314682653, + "grad_norm": 2.7274320125579834, + "learning_rate": 4.16062061995075e-05, + "loss": 3.0338, + "step": 45190 + }, + { + "epoch": 0.2687636787515463, + "grad_norm": 2.4087820053100586, + "learning_rate": 4.160585703482429e-05, + "loss": 2.8448, + "step": 45191 + }, + { + "epoch": 0.2687696260348273, + "grad_norm": 2.6315417289733887, + "learning_rate": 4.160550786434415e-05, + "loss": 2.8813, + "step": 45192 + }, + { + "epoch": 0.2687755733181083, + "grad_norm": 1.7151169776916504, + "learning_rate": 4.160515868806719e-05, + "loss": 4.1168, + "step": 45193 + }, + { + "epoch": 0.26878152060138927, + "grad_norm": 1.980391025543213, + "learning_rate": 4.1604809505993545e-05, + "loss": 4.521, + "step": 45194 + }, + { + "epoch": 0.2687874678846703, + "grad_norm": 1.8919858932495117, + "learning_rate": 4.160446031812334e-05, + "loss": 4.0828, + "step": 45195 + }, + { + "epoch": 0.2687934151679513, + "grad_norm": 2.2979109287261963, + "learning_rate": 4.160411112445668e-05, + "loss": 4.3651, + "step": 45196 + }, + { + "epoch": 0.26879936245123226, + "grad_norm": 1.6645711660385132, + "learning_rate": 4.16037619249937e-05, + "loss": 5.0539, + "step": 45197 + }, + { + "epoch": 0.2688053097345133, + "grad_norm": 1.8215328454971313, + "learning_rate": 4.1603412719734515e-05, + "loss": 4.7348, + "step": 45198 + }, + { + "epoch": 0.2688112570177943, + "grad_norm": 1.6146751642227173, + "learning_rate": 4.1603063508679254e-05, + "loss": 5.0237, + "step": 45199 + }, + { + "epoch": 0.26881720430107525, + "grad_norm": 2.294412612915039, + "learning_rate": 4.160271429182804e-05, + "loss": 4.4175, + "step": 45200 + }, + { + "epoch": 0.26882315158435627, + "grad_norm": 1.982631802558899, + "learning_rate": 4.160236506918098e-05, + "loss": 4.7002, + "step": 45201 + }, + { + "epoch": 0.2688290988676373, + "grad_norm": 2.190749168395996, + "learning_rate": 4.1602015840738205e-05, + "loss": 4.5036, + "step": 45202 + }, + { + "epoch": 0.26883504615091824, + "grad_norm": 2.505601406097412, + "learning_rate": 4.160166660649985e-05, + "loss": 4.3735, + "step": 45203 + }, + { + "epoch": 0.26884099343419926, + "grad_norm": 1.8448855876922607, + "learning_rate": 4.160131736646602e-05, + "loss": 4.499, + "step": 45204 + }, + { + "epoch": 0.2688469407174803, + "grad_norm": 1.661379098892212, + "learning_rate": 4.160096812063683e-05, + "loss": 5.1729, + "step": 45205 + }, + { + "epoch": 0.26885288800076124, + "grad_norm": 1.6127456426620483, + "learning_rate": 4.160061886901242e-05, + "loss": 5.3388, + "step": 45206 + }, + { + "epoch": 0.26885883528404225, + "grad_norm": 1.354529619216919, + "learning_rate": 4.160026961159291e-05, + "loss": 5.2378, + "step": 45207 + }, + { + "epoch": 0.26886478256732327, + "grad_norm": 1.2830134630203247, + "learning_rate": 4.159992034837842e-05, + "loss": 5.3219, + "step": 45208 + }, + { + "epoch": 0.26887072985060423, + "grad_norm": 1.5937238931655884, + "learning_rate": 4.159957107936906e-05, + "loss": 4.9178, + "step": 45209 + }, + { + "epoch": 0.26887667713388524, + "grad_norm": 1.596405267715454, + "learning_rate": 4.159922180456497e-05, + "loss": 5.3576, + "step": 45210 + }, + { + "epoch": 0.26888262441716626, + "grad_norm": 1.4937127828598022, + "learning_rate": 4.1598872523966256e-05, + "loss": 5.0954, + "step": 45211 + }, + { + "epoch": 0.2688885717004472, + "grad_norm": 2.0205538272857666, + "learning_rate": 4.159852323757305e-05, + "loss": 4.1442, + "step": 45212 + }, + { + "epoch": 0.26889451898372824, + "grad_norm": 1.5805648565292358, + "learning_rate": 4.1598173945385463e-05, + "loss": 5.3696, + "step": 45213 + }, + { + "epoch": 0.26890046626700925, + "grad_norm": 1.3900147676467896, + "learning_rate": 4.1597824647403635e-05, + "loss": 5.3203, + "step": 45214 + }, + { + "epoch": 0.2689064135502902, + "grad_norm": 1.4014190435409546, + "learning_rate": 4.159747534362768e-05, + "loss": 4.5714, + "step": 45215 + }, + { + "epoch": 0.26891236083357123, + "grad_norm": 1.6457473039627075, + "learning_rate": 4.159712603405772e-05, + "loss": 4.5655, + "step": 45216 + }, + { + "epoch": 0.26891830811685224, + "grad_norm": 1.2965730428695679, + "learning_rate": 4.159677671869386e-05, + "loss": 4.5826, + "step": 45217 + }, + { + "epoch": 0.2689242554001332, + "grad_norm": 1.419721007347107, + "learning_rate": 4.159642739753624e-05, + "loss": 5.5408, + "step": 45218 + }, + { + "epoch": 0.2689302026834142, + "grad_norm": 1.4104684591293335, + "learning_rate": 4.159607807058499e-05, + "loss": 4.9862, + "step": 45219 + }, + { + "epoch": 0.26893614996669524, + "grad_norm": 1.5869590044021606, + "learning_rate": 4.159572873784021e-05, + "loss": 4.7457, + "step": 45220 + }, + { + "epoch": 0.2689420972499762, + "grad_norm": 1.8555985689163208, + "learning_rate": 4.159537939930204e-05, + "loss": 4.6241, + "step": 45221 + }, + { + "epoch": 0.2689480445332572, + "grad_norm": 2.49819278717041, + "learning_rate": 4.1595030054970595e-05, + "loss": 3.9443, + "step": 45222 + }, + { + "epoch": 0.26895399181653823, + "grad_norm": 2.2385292053222656, + "learning_rate": 4.159468070484599e-05, + "loss": 3.5958, + "step": 45223 + }, + { + "epoch": 0.2689599390998192, + "grad_norm": 1.8468725681304932, + "learning_rate": 4.159433134892836e-05, + "loss": 4.1199, + "step": 45224 + }, + { + "epoch": 0.2689658863831002, + "grad_norm": 1.487992525100708, + "learning_rate": 4.159398198721781e-05, + "loss": 4.8563, + "step": 45225 + }, + { + "epoch": 0.2689718336663812, + "grad_norm": 1.3250800371170044, + "learning_rate": 4.159363261971449e-05, + "loss": 4.7746, + "step": 45226 + }, + { + "epoch": 0.2689777809496622, + "grad_norm": 1.2223737239837646, + "learning_rate": 4.159328324641848e-05, + "loss": 4.9704, + "step": 45227 + }, + { + "epoch": 0.2689837282329432, + "grad_norm": 1.315438151359558, + "learning_rate": 4.159293386732994e-05, + "loss": 5.0291, + "step": 45228 + }, + { + "epoch": 0.2689896755162242, + "grad_norm": 1.3781858682632446, + "learning_rate": 4.159258448244898e-05, + "loss": 4.8171, + "step": 45229 + }, + { + "epoch": 0.2689956227995052, + "grad_norm": 1.442996859550476, + "learning_rate": 4.1592235091775715e-05, + "loss": 4.6739, + "step": 45230 + }, + { + "epoch": 0.2690015700827862, + "grad_norm": 1.6625727415084839, + "learning_rate": 4.159188569531028e-05, + "loss": 4.7094, + "step": 45231 + }, + { + "epoch": 0.2690075173660672, + "grad_norm": 1.454010248184204, + "learning_rate": 4.159153629305278e-05, + "loss": 4.68, + "step": 45232 + }, + { + "epoch": 0.26901346464934817, + "grad_norm": 1.2799397706985474, + "learning_rate": 4.159118688500335e-05, + "loss": 4.6726, + "step": 45233 + }, + { + "epoch": 0.2690194119326292, + "grad_norm": 1.5622241497039795, + "learning_rate": 4.1590837471162114e-05, + "loss": 4.5788, + "step": 45234 + }, + { + "epoch": 0.2690253592159102, + "grad_norm": 1.4982719421386719, + "learning_rate": 4.159048805152918e-05, + "loss": 4.7062, + "step": 45235 + }, + { + "epoch": 0.26903130649919116, + "grad_norm": 1.5213323831558228, + "learning_rate": 4.159013862610468e-05, + "loss": 4.3746, + "step": 45236 + }, + { + "epoch": 0.2690372537824722, + "grad_norm": 1.6101295948028564, + "learning_rate": 4.158978919488873e-05, + "loss": 4.9217, + "step": 45237 + }, + { + "epoch": 0.2690432010657532, + "grad_norm": 1.6249178647994995, + "learning_rate": 4.158943975788147e-05, + "loss": 5.0315, + "step": 45238 + }, + { + "epoch": 0.26904914834903415, + "grad_norm": 1.840031385421753, + "learning_rate": 4.158909031508299e-05, + "loss": 4.8024, + "step": 45239 + }, + { + "epoch": 0.26905509563231517, + "grad_norm": 1.673559308052063, + "learning_rate": 4.158874086649344e-05, + "loss": 4.8258, + "step": 45240 + }, + { + "epoch": 0.2690610429155962, + "grad_norm": 1.4725693464279175, + "learning_rate": 4.158839141211293e-05, + "loss": 4.9339, + "step": 45241 + }, + { + "epoch": 0.26906699019887714, + "grad_norm": 1.470138430595398, + "learning_rate": 4.1588041951941584e-05, + "loss": 5.3922, + "step": 45242 + }, + { + "epoch": 0.26907293748215816, + "grad_norm": 1.6061598062515259, + "learning_rate": 4.158769248597952e-05, + "loss": 5.5168, + "step": 45243 + }, + { + "epoch": 0.2690788847654392, + "grad_norm": 1.5873727798461914, + "learning_rate": 4.158734301422688e-05, + "loss": 5.4919, + "step": 45244 + }, + { + "epoch": 0.26908483204872014, + "grad_norm": 1.4291692972183228, + "learning_rate": 4.158699353668375e-05, + "loss": 5.3072, + "step": 45245 + }, + { + "epoch": 0.26909077933200115, + "grad_norm": 1.5576152801513672, + "learning_rate": 4.158664405335028e-05, + "loss": 4.9143, + "step": 45246 + }, + { + "epoch": 0.26909672661528217, + "grad_norm": 1.3948214054107666, + "learning_rate": 4.1586294564226585e-05, + "loss": 4.331, + "step": 45247 + }, + { + "epoch": 0.2691026738985631, + "grad_norm": 1.6155285835266113, + "learning_rate": 4.158594506931278e-05, + "loss": 4.3831, + "step": 45248 + }, + { + "epoch": 0.26910862118184414, + "grad_norm": 1.457095742225647, + "learning_rate": 4.1585595568609006e-05, + "loss": 4.6051, + "step": 45249 + }, + { + "epoch": 0.2691145684651251, + "grad_norm": 2.2184219360351562, + "learning_rate": 4.1585246062115354e-05, + "loss": 4.6874, + "step": 45250 + }, + { + "epoch": 0.2691205157484061, + "grad_norm": 2.2661657333374023, + "learning_rate": 4.158489654983198e-05, + "loss": 4.6655, + "step": 45251 + }, + { + "epoch": 0.26912646303168714, + "grad_norm": 2.131653070449829, + "learning_rate": 4.158454703175898e-05, + "loss": 4.4279, + "step": 45252 + }, + { + "epoch": 0.2691324103149681, + "grad_norm": 1.7249187231063843, + "learning_rate": 4.158419750789649e-05, + "loss": 4.809, + "step": 45253 + }, + { + "epoch": 0.2691383575982491, + "grad_norm": 1.4816153049468994, + "learning_rate": 4.158384797824463e-05, + "loss": 4.8176, + "step": 45254 + }, + { + "epoch": 0.2691443048815301, + "grad_norm": 2.034562826156616, + "learning_rate": 4.1583498442803516e-05, + "loss": 4.7426, + "step": 45255 + }, + { + "epoch": 0.2691502521648111, + "grad_norm": 2.6174912452697754, + "learning_rate": 4.158314890157328e-05, + "loss": 3.6678, + "step": 45256 + }, + { + "epoch": 0.2691561994480921, + "grad_norm": 1.6394503116607666, + "learning_rate": 4.158279935455403e-05, + "loss": 4.9539, + "step": 45257 + }, + { + "epoch": 0.2691621467313731, + "grad_norm": 1.4783178567886353, + "learning_rate": 4.15824498017459e-05, + "loss": 4.6294, + "step": 45258 + }, + { + "epoch": 0.2691680940146541, + "grad_norm": 1.7626376152038574, + "learning_rate": 4.158210024314901e-05, + "loss": 4.1115, + "step": 45259 + }, + { + "epoch": 0.2691740412979351, + "grad_norm": 1.9266237020492554, + "learning_rate": 4.1581750678763475e-05, + "loss": 4.7835, + "step": 45260 + }, + { + "epoch": 0.2691799885812161, + "grad_norm": 1.7535372972488403, + "learning_rate": 4.1581401108589425e-05, + "loss": 4.1109, + "step": 45261 + }, + { + "epoch": 0.2691859358644971, + "grad_norm": 1.687429428100586, + "learning_rate": 4.158105153262698e-05, + "loss": 4.1195, + "step": 45262 + }, + { + "epoch": 0.2691918831477781, + "grad_norm": 1.4456332921981812, + "learning_rate": 4.158070195087626e-05, + "loss": 4.2273, + "step": 45263 + }, + { + "epoch": 0.2691978304310591, + "grad_norm": 1.2849599123001099, + "learning_rate": 4.1580352363337386e-05, + "loss": 4.7358, + "step": 45264 + }, + { + "epoch": 0.26920377771434006, + "grad_norm": 2.1525962352752686, + "learning_rate": 4.1580002770010483e-05, + "loss": 4.2447, + "step": 45265 + }, + { + "epoch": 0.2692097249976211, + "grad_norm": 1.5100188255310059, + "learning_rate": 4.1579653170895676e-05, + "loss": 4.5816, + "step": 45266 + }, + { + "epoch": 0.2692156722809021, + "grad_norm": 1.5975242853164673, + "learning_rate": 4.1579303565993085e-05, + "loss": 4.8254, + "step": 45267 + }, + { + "epoch": 0.26922161956418306, + "grad_norm": 1.6642156839370728, + "learning_rate": 4.157895395530283e-05, + "loss": 4.8283, + "step": 45268 + }, + { + "epoch": 0.2692275668474641, + "grad_norm": 1.4097075462341309, + "learning_rate": 4.157860433882503e-05, + "loss": 4.8824, + "step": 45269 + }, + { + "epoch": 0.2692335141307451, + "grad_norm": 1.480600357055664, + "learning_rate": 4.15782547165598e-05, + "loss": 4.7832, + "step": 45270 + }, + { + "epoch": 0.26923946141402605, + "grad_norm": 1.5501184463500977, + "learning_rate": 4.1577905088507293e-05, + "loss": 4.9739, + "step": 45271 + }, + { + "epoch": 0.26924540869730706, + "grad_norm": 1.581547737121582, + "learning_rate": 4.15775554546676e-05, + "loss": 4.8496, + "step": 45272 + }, + { + "epoch": 0.2692513559805881, + "grad_norm": 1.6000069379806519, + "learning_rate": 4.157720581504085e-05, + "loss": 5.0918, + "step": 45273 + }, + { + "epoch": 0.26925730326386904, + "grad_norm": 1.3872038125991821, + "learning_rate": 4.1576856169627175e-05, + "loss": 4.9787, + "step": 45274 + }, + { + "epoch": 0.26926325054715006, + "grad_norm": 1.5059139728546143, + "learning_rate": 4.157650651842669e-05, + "loss": 4.993, + "step": 45275 + }, + { + "epoch": 0.2692691978304311, + "grad_norm": 1.6387094259262085, + "learning_rate": 4.157615686143952e-05, + "loss": 4.5998, + "step": 45276 + }, + { + "epoch": 0.26927514511371203, + "grad_norm": 1.856116771697998, + "learning_rate": 4.157580719866578e-05, + "loss": 4.3386, + "step": 45277 + }, + { + "epoch": 0.26928109239699305, + "grad_norm": 1.6976975202560425, + "learning_rate": 4.15754575301056e-05, + "loss": 4.7816, + "step": 45278 + }, + { + "epoch": 0.26928703968027407, + "grad_norm": 1.7654017210006714, + "learning_rate": 4.15751078557591e-05, + "loss": 4.5316, + "step": 45279 + }, + { + "epoch": 0.269292986963555, + "grad_norm": 2.5005435943603516, + "learning_rate": 4.157475817562639e-05, + "loss": 4.0431, + "step": 45280 + }, + { + "epoch": 0.26929893424683604, + "grad_norm": 2.1804897785186768, + "learning_rate": 4.1574408489707614e-05, + "loss": 4.0131, + "step": 45281 + }, + { + "epoch": 0.26930488153011706, + "grad_norm": 2.16951584815979, + "learning_rate": 4.157405879800288e-05, + "loss": 4.3192, + "step": 45282 + }, + { + "epoch": 0.269310828813398, + "grad_norm": 1.6389447450637817, + "learning_rate": 4.1573709100512314e-05, + "loss": 4.2761, + "step": 45283 + }, + { + "epoch": 0.26931677609667903, + "grad_norm": 1.50558602809906, + "learning_rate": 4.157335939723604e-05, + "loss": 4.9992, + "step": 45284 + }, + { + "epoch": 0.26932272337996005, + "grad_norm": 1.5040929317474365, + "learning_rate": 4.157300968817418e-05, + "loss": 4.8714, + "step": 45285 + }, + { + "epoch": 0.269328670663241, + "grad_norm": 1.606389045715332, + "learning_rate": 4.1572659973326844e-05, + "loss": 5.4729, + "step": 45286 + }, + { + "epoch": 0.269334617946522, + "grad_norm": 1.4508239030838013, + "learning_rate": 4.1572310252694176e-05, + "loss": 5.2953, + "step": 45287 + }, + { + "epoch": 0.26934056522980304, + "grad_norm": 1.5248329639434814, + "learning_rate": 4.157196052627628e-05, + "loss": 4.6599, + "step": 45288 + }, + { + "epoch": 0.269346512513084, + "grad_norm": 1.730546236038208, + "learning_rate": 4.157161079407328e-05, + "loss": 4.011, + "step": 45289 + }, + { + "epoch": 0.269352459796365, + "grad_norm": 2.593759059906006, + "learning_rate": 4.1571261056085304e-05, + "loss": 3.7895, + "step": 45290 + }, + { + "epoch": 0.26935840707964603, + "grad_norm": 2.395089626312256, + "learning_rate": 4.157091131231247e-05, + "loss": 3.743, + "step": 45291 + }, + { + "epoch": 0.269364354362927, + "grad_norm": 2.319007396697998, + "learning_rate": 4.1570561562754905e-05, + "loss": 4.1144, + "step": 45292 + }, + { + "epoch": 0.269370301646208, + "grad_norm": 2.1763808727264404, + "learning_rate": 4.157021180741273e-05, + "loss": 4.0702, + "step": 45293 + }, + { + "epoch": 0.269376248929489, + "grad_norm": 1.8559261560440063, + "learning_rate": 4.1569862046286054e-05, + "loss": 4.6081, + "step": 45294 + }, + { + "epoch": 0.26938219621277, + "grad_norm": 1.6956634521484375, + "learning_rate": 4.156951227937502e-05, + "loss": 5.212, + "step": 45295 + }, + { + "epoch": 0.269388143496051, + "grad_norm": 1.7633353471755981, + "learning_rate": 4.156916250667975e-05, + "loss": 4.9385, + "step": 45296 + }, + { + "epoch": 0.269394090779332, + "grad_norm": 1.8750886917114258, + "learning_rate": 4.156881272820034e-05, + "loss": 4.6691, + "step": 45297 + }, + { + "epoch": 0.269400038062613, + "grad_norm": 1.509953260421753, + "learning_rate": 4.156846294393693e-05, + "loss": 4.3353, + "step": 45298 + }, + { + "epoch": 0.269405985345894, + "grad_norm": 1.7284568548202515, + "learning_rate": 4.1568113153889645e-05, + "loss": 4.9655, + "step": 45299 + }, + { + "epoch": 0.269411932629175, + "grad_norm": 1.7475882768630981, + "learning_rate": 4.156776335805861e-05, + "loss": 4.9491, + "step": 45300 + }, + { + "epoch": 0.26941787991245597, + "grad_norm": 1.7685381174087524, + "learning_rate": 4.156741355644393e-05, + "loss": 4.874, + "step": 45301 + }, + { + "epoch": 0.269423827195737, + "grad_norm": 1.8454065322875977, + "learning_rate": 4.1567063749045746e-05, + "loss": 4.759, + "step": 45302 + }, + { + "epoch": 0.269429774479018, + "grad_norm": 1.4455806016921997, + "learning_rate": 4.156671393586416e-05, + "loss": 4.8639, + "step": 45303 + }, + { + "epoch": 0.26943572176229896, + "grad_norm": 1.7731835842132568, + "learning_rate": 4.156636411689931e-05, + "loss": 5.0179, + "step": 45304 + }, + { + "epoch": 0.26944166904558, + "grad_norm": 1.6547385454177856, + "learning_rate": 4.156601429215131e-05, + "loss": 4.6922, + "step": 45305 + }, + { + "epoch": 0.269447616328861, + "grad_norm": 1.5052129030227661, + "learning_rate": 4.1565664461620294e-05, + "loss": 4.7813, + "step": 45306 + }, + { + "epoch": 0.26945356361214196, + "grad_norm": 1.2690560817718506, + "learning_rate": 4.156531462530637e-05, + "loss": 4.7494, + "step": 45307 + }, + { + "epoch": 0.26945951089542297, + "grad_norm": 1.8947676420211792, + "learning_rate": 4.156496478320967e-05, + "loss": 4.0592, + "step": 45308 + }, + { + "epoch": 0.269465458178704, + "grad_norm": 1.7554521560668945, + "learning_rate": 4.15646149353303e-05, + "loss": 4.8568, + "step": 45309 + }, + { + "epoch": 0.26947140546198495, + "grad_norm": 1.509595513343811, + "learning_rate": 4.1564265081668405e-05, + "loss": 4.6047, + "step": 45310 + }, + { + "epoch": 0.26947735274526596, + "grad_norm": 1.6433113813400269, + "learning_rate": 4.1563915222224106e-05, + "loss": 4.8093, + "step": 45311 + }, + { + "epoch": 0.269483300028547, + "grad_norm": 1.7800555229187012, + "learning_rate": 4.15635653569975e-05, + "loss": 4.606, + "step": 45312 + }, + { + "epoch": 0.26948924731182794, + "grad_norm": 2.2926464080810547, + "learning_rate": 4.156321548598873e-05, + "loss": 3.432, + "step": 45313 + }, + { + "epoch": 0.26949519459510896, + "grad_norm": 1.7395141124725342, + "learning_rate": 4.156286560919791e-05, + "loss": 4.461, + "step": 45314 + }, + { + "epoch": 0.26950114187838997, + "grad_norm": 1.519547939300537, + "learning_rate": 4.156251572662516e-05, + "loss": 4.8243, + "step": 45315 + }, + { + "epoch": 0.26950708916167093, + "grad_norm": 1.39389967918396, + "learning_rate": 4.156216583827062e-05, + "loss": 4.6617, + "step": 45316 + }, + { + "epoch": 0.26951303644495195, + "grad_norm": 1.559107780456543, + "learning_rate": 4.1561815944134385e-05, + "loss": 4.6393, + "step": 45317 + }, + { + "epoch": 0.26951898372823296, + "grad_norm": 1.5894973278045654, + "learning_rate": 4.15614660442166e-05, + "loss": 4.7027, + "step": 45318 + }, + { + "epoch": 0.2695249310115139, + "grad_norm": 1.6897424459457397, + "learning_rate": 4.156111613851737e-05, + "loss": 4.4399, + "step": 45319 + }, + { + "epoch": 0.26953087829479494, + "grad_norm": 1.738806962966919, + "learning_rate": 4.156076622703684e-05, + "loss": 4.586, + "step": 45320 + }, + { + "epoch": 0.26953682557807596, + "grad_norm": 1.5997931957244873, + "learning_rate": 4.156041630977511e-05, + "loss": 4.7356, + "step": 45321 + }, + { + "epoch": 0.2695427728613569, + "grad_norm": 1.5197926759719849, + "learning_rate": 4.15600663867323e-05, + "loss": 4.7764, + "step": 45322 + }, + { + "epoch": 0.26954872014463793, + "grad_norm": 1.750075340270996, + "learning_rate": 4.155971645790856e-05, + "loss": 4.6418, + "step": 45323 + }, + { + "epoch": 0.26955466742791895, + "grad_norm": 1.7129992246627808, + "learning_rate": 4.1559366523303976e-05, + "loss": 4.6499, + "step": 45324 + }, + { + "epoch": 0.2695606147111999, + "grad_norm": 1.7532551288604736, + "learning_rate": 4.15590165829187e-05, + "loss": 4.0583, + "step": 45325 + }, + { + "epoch": 0.2695665619944809, + "grad_norm": 1.9777498245239258, + "learning_rate": 4.155866663675284e-05, + "loss": 4.2708, + "step": 45326 + }, + { + "epoch": 0.26957250927776194, + "grad_norm": 1.5517151355743408, + "learning_rate": 4.155831668480652e-05, + "loss": 4.7388, + "step": 45327 + }, + { + "epoch": 0.2695784565610429, + "grad_norm": 1.830837368965149, + "learning_rate": 4.155796672707986e-05, + "loss": 4.7233, + "step": 45328 + }, + { + "epoch": 0.2695844038443239, + "grad_norm": 1.5601505041122437, + "learning_rate": 4.1557616763572996e-05, + "loss": 4.663, + "step": 45329 + }, + { + "epoch": 0.26959035112760493, + "grad_norm": 1.4813709259033203, + "learning_rate": 4.155726679428602e-05, + "loss": 4.7905, + "step": 45330 + }, + { + "epoch": 0.2695962984108859, + "grad_norm": 1.4916036128997803, + "learning_rate": 4.155691681921908e-05, + "loss": 4.5976, + "step": 45331 + }, + { + "epoch": 0.2696022456941669, + "grad_norm": 1.4317198991775513, + "learning_rate": 4.15565668383723e-05, + "loss": 4.6274, + "step": 45332 + }, + { + "epoch": 0.2696081929774479, + "grad_norm": 1.6035404205322266, + "learning_rate": 4.155621685174579e-05, + "loss": 4.3346, + "step": 45333 + }, + { + "epoch": 0.2696141402607289, + "grad_norm": 1.5786033868789673, + "learning_rate": 4.155586685933968e-05, + "loss": 4.3077, + "step": 45334 + }, + { + "epoch": 0.2696200875440099, + "grad_norm": 1.4528864622116089, + "learning_rate": 4.155551686115407e-05, + "loss": 4.3631, + "step": 45335 + }, + { + "epoch": 0.2696260348272909, + "grad_norm": 1.4764782190322876, + "learning_rate": 4.155516685718912e-05, + "loss": 4.5608, + "step": 45336 + }, + { + "epoch": 0.2696319821105719, + "grad_norm": 1.5010504722595215, + "learning_rate": 4.1554816847444925e-05, + "loss": 4.5372, + "step": 45337 + }, + { + "epoch": 0.2696379293938529, + "grad_norm": 1.4953066110610962, + "learning_rate": 4.155446683192161e-05, + "loss": 4.3928, + "step": 45338 + }, + { + "epoch": 0.2696438766771339, + "grad_norm": 1.3396492004394531, + "learning_rate": 4.155411681061931e-05, + "loss": 4.4482, + "step": 45339 + }, + { + "epoch": 0.26964982396041487, + "grad_norm": 1.4031981229782104, + "learning_rate": 4.155376678353813e-05, + "loss": 4.1887, + "step": 45340 + }, + { + "epoch": 0.2696557712436959, + "grad_norm": 1.8941442966461182, + "learning_rate": 4.155341675067822e-05, + "loss": 4.4202, + "step": 45341 + }, + { + "epoch": 0.2696617185269769, + "grad_norm": 1.7295804023742676, + "learning_rate": 4.1553066712039654e-05, + "loss": 4.5712, + "step": 45342 + }, + { + "epoch": 0.26966766581025786, + "grad_norm": 2.5602784156799316, + "learning_rate": 4.1552716667622604e-05, + "loss": 4.2175, + "step": 45343 + }, + { + "epoch": 0.2696736130935389, + "grad_norm": 1.8582526445388794, + "learning_rate": 4.155236661742717e-05, + "loss": 4.1436, + "step": 45344 + }, + { + "epoch": 0.2696795603768199, + "grad_norm": 2.190192699432373, + "learning_rate": 4.1552016561453467e-05, + "loss": 4.0795, + "step": 45345 + }, + { + "epoch": 0.26968550766010085, + "grad_norm": 1.8088953495025635, + "learning_rate": 4.155166649970164e-05, + "loss": 4.6311, + "step": 45346 + }, + { + "epoch": 0.26969145494338187, + "grad_norm": 1.751497745513916, + "learning_rate": 4.155131643217178e-05, + "loss": 4.0817, + "step": 45347 + }, + { + "epoch": 0.2696974022266629, + "grad_norm": 1.759722352027893, + "learning_rate": 4.155096635886404e-05, + "loss": 4.0556, + "step": 45348 + }, + { + "epoch": 0.26970334950994385, + "grad_norm": 1.7222769260406494, + "learning_rate": 4.155061627977852e-05, + "loss": 4.3549, + "step": 45349 + }, + { + "epoch": 0.26970929679322486, + "grad_norm": 1.8313007354736328, + "learning_rate": 4.1550266194915354e-05, + "loss": 3.8818, + "step": 45350 + }, + { + "epoch": 0.2697152440765059, + "grad_norm": 1.554319143295288, + "learning_rate": 4.154991610427465e-05, + "loss": 4.0182, + "step": 45351 + }, + { + "epoch": 0.26972119135978684, + "grad_norm": 1.6546438932418823, + "learning_rate": 4.154956600785656e-05, + "loss": 4.2347, + "step": 45352 + }, + { + "epoch": 0.26972713864306785, + "grad_norm": 1.6885563135147095, + "learning_rate": 4.154921590566118e-05, + "loss": 4.0306, + "step": 45353 + }, + { + "epoch": 0.26973308592634887, + "grad_norm": 1.9724035263061523, + "learning_rate": 4.154886579768864e-05, + "loss": 4.2288, + "step": 45354 + }, + { + "epoch": 0.26973903320962983, + "grad_norm": 1.738640546798706, + "learning_rate": 4.154851568393906e-05, + "loss": 4.7614, + "step": 45355 + }, + { + "epoch": 0.26974498049291085, + "grad_norm": 1.6155599355697632, + "learning_rate": 4.1548165564412565e-05, + "loss": 4.7512, + "step": 45356 + }, + { + "epoch": 0.26975092777619186, + "grad_norm": 1.732848048210144, + "learning_rate": 4.154781543910928e-05, + "loss": 5.0405, + "step": 45357 + }, + { + "epoch": 0.2697568750594728, + "grad_norm": 1.7560569047927856, + "learning_rate": 4.1547465308029324e-05, + "loss": 4.5675, + "step": 45358 + }, + { + "epoch": 0.26976282234275384, + "grad_norm": 2.27547287940979, + "learning_rate": 4.154711517117281e-05, + "loss": 4.1865, + "step": 45359 + }, + { + "epoch": 0.26976876962603485, + "grad_norm": 1.7281955480575562, + "learning_rate": 4.154676502853988e-05, + "loss": 4.4252, + "step": 45360 + }, + { + "epoch": 0.2697747169093158, + "grad_norm": 1.7061762809753418, + "learning_rate": 4.1546414880130635e-05, + "loss": 4.6966, + "step": 45361 + }, + { + "epoch": 0.26978066419259683, + "grad_norm": 1.7866569757461548, + "learning_rate": 4.154606472594522e-05, + "loss": 4.675, + "step": 45362 + }, + { + "epoch": 0.26978661147587785, + "grad_norm": 1.6847171783447266, + "learning_rate": 4.154571456598373e-05, + "loss": 4.6749, + "step": 45363 + }, + { + "epoch": 0.2697925587591588, + "grad_norm": 1.9265351295471191, + "learning_rate": 4.154536440024631e-05, + "loss": 4.9363, + "step": 45364 + }, + { + "epoch": 0.2697985060424398, + "grad_norm": 1.6847586631774902, + "learning_rate": 4.154501422873307e-05, + "loss": 4.4616, + "step": 45365 + }, + { + "epoch": 0.2698044533257208, + "grad_norm": 1.8689684867858887, + "learning_rate": 4.154466405144414e-05, + "loss": 4.5675, + "step": 45366 + }, + { + "epoch": 0.2698104006090018, + "grad_norm": 1.9145435094833374, + "learning_rate": 4.1544313868379645e-05, + "loss": 4.6704, + "step": 45367 + }, + { + "epoch": 0.2698163478922828, + "grad_norm": 1.6350491046905518, + "learning_rate": 4.154396367953969e-05, + "loss": 4.774, + "step": 45368 + }, + { + "epoch": 0.2698222951755638, + "grad_norm": 1.5319669246673584, + "learning_rate": 4.154361348492441e-05, + "loss": 4.3755, + "step": 45369 + }, + { + "epoch": 0.2698282424588448, + "grad_norm": 1.9081478118896484, + "learning_rate": 4.154326328453394e-05, + "loss": 4.5894, + "step": 45370 + }, + { + "epoch": 0.2698341897421258, + "grad_norm": 2.0075998306274414, + "learning_rate": 4.154291307836837e-05, + "loss": 4.2506, + "step": 45371 + }, + { + "epoch": 0.26984013702540677, + "grad_norm": 1.8922226428985596, + "learning_rate": 4.154256286642784e-05, + "loss": 4.2935, + "step": 45372 + }, + { + "epoch": 0.2698460843086878, + "grad_norm": 1.7847208976745605, + "learning_rate": 4.1542212648712485e-05, + "loss": 4.6454, + "step": 45373 + }, + { + "epoch": 0.2698520315919688, + "grad_norm": 1.4745028018951416, + "learning_rate": 4.1541862425222404e-05, + "loss": 4.7223, + "step": 45374 + }, + { + "epoch": 0.26985797887524976, + "grad_norm": 1.465288519859314, + "learning_rate": 4.154151219595773e-05, + "loss": 4.7509, + "step": 45375 + }, + { + "epoch": 0.2698639261585308, + "grad_norm": 1.6630356311798096, + "learning_rate": 4.1541161960918595e-05, + "loss": 4.7253, + "step": 45376 + }, + { + "epoch": 0.2698698734418118, + "grad_norm": 1.4838476181030273, + "learning_rate": 4.15408117201051e-05, + "loss": 4.7869, + "step": 45377 + }, + { + "epoch": 0.26987582072509275, + "grad_norm": 1.3794244527816772, + "learning_rate": 4.154046147351738e-05, + "loss": 4.305, + "step": 45378 + }, + { + "epoch": 0.26988176800837377, + "grad_norm": 1.7701784372329712, + "learning_rate": 4.154011122115556e-05, + "loss": 5.114, + "step": 45379 + }, + { + "epoch": 0.2698877152916548, + "grad_norm": 1.2363604307174683, + "learning_rate": 4.153976096301976e-05, + "loss": 4.6272, + "step": 45380 + }, + { + "epoch": 0.26989366257493574, + "grad_norm": 1.5319936275482178, + "learning_rate": 4.153941069911009e-05, + "loss": 4.5383, + "step": 45381 + }, + { + "epoch": 0.26989960985821676, + "grad_norm": 1.5952107906341553, + "learning_rate": 4.15390604294267e-05, + "loss": 4.2603, + "step": 45382 + }, + { + "epoch": 0.2699055571414978, + "grad_norm": 1.7796322107315063, + "learning_rate": 4.153871015396968e-05, + "loss": 4.2951, + "step": 45383 + }, + { + "epoch": 0.26991150442477874, + "grad_norm": 1.7291762828826904, + "learning_rate": 4.153835987273917e-05, + "loss": 4.7991, + "step": 45384 + }, + { + "epoch": 0.26991745170805975, + "grad_norm": 1.7211867570877075, + "learning_rate": 4.1538009585735296e-05, + "loss": 4.4295, + "step": 45385 + }, + { + "epoch": 0.26992339899134077, + "grad_norm": 1.651484727859497, + "learning_rate": 4.1537659292958164e-05, + "loss": 4.4364, + "step": 45386 + }, + { + "epoch": 0.26992934627462173, + "grad_norm": 1.5708593130111694, + "learning_rate": 4.1537308994407906e-05, + "loss": 4.3647, + "step": 45387 + }, + { + "epoch": 0.26993529355790274, + "grad_norm": 1.7063499689102173, + "learning_rate": 4.153695869008465e-05, + "loss": 4.3185, + "step": 45388 + }, + { + "epoch": 0.26994124084118376, + "grad_norm": 1.6411081552505493, + "learning_rate": 4.1536608379988514e-05, + "loss": 4.1311, + "step": 45389 + }, + { + "epoch": 0.2699471881244647, + "grad_norm": 1.5756430625915527, + "learning_rate": 4.153625806411962e-05, + "loss": 4.3648, + "step": 45390 + }, + { + "epoch": 0.26995313540774574, + "grad_norm": 1.676545262336731, + "learning_rate": 4.153590774247809e-05, + "loss": 4.3153, + "step": 45391 + }, + { + "epoch": 0.26995908269102675, + "grad_norm": 1.5113835334777832, + "learning_rate": 4.153555741506404e-05, + "loss": 4.6034, + "step": 45392 + }, + { + "epoch": 0.2699650299743077, + "grad_norm": 1.5585875511169434, + "learning_rate": 4.1535207081877594e-05, + "loss": 4.2533, + "step": 45393 + }, + { + "epoch": 0.26997097725758873, + "grad_norm": 1.4961894750595093, + "learning_rate": 4.1534856742918884e-05, + "loss": 4.3773, + "step": 45394 + }, + { + "epoch": 0.26997692454086974, + "grad_norm": 1.607676386833191, + "learning_rate": 4.1534506398188023e-05, + "loss": 4.4531, + "step": 45395 + }, + { + "epoch": 0.2699828718241507, + "grad_norm": 1.5358102321624756, + "learning_rate": 4.1534156047685135e-05, + "loss": 4.4957, + "step": 45396 + }, + { + "epoch": 0.2699888191074317, + "grad_norm": 1.6208070516586304, + "learning_rate": 4.1533805691410354e-05, + "loss": 4.5914, + "step": 45397 + }, + { + "epoch": 0.26999476639071274, + "grad_norm": 1.6858468055725098, + "learning_rate": 4.153345532936378e-05, + "loss": 4.3687, + "step": 45398 + }, + { + "epoch": 0.2700007136739937, + "grad_norm": 1.6302781105041504, + "learning_rate": 4.1533104961545555e-05, + "loss": 4.5068, + "step": 45399 + }, + { + "epoch": 0.2700066609572747, + "grad_norm": 1.5371813774108887, + "learning_rate": 4.1532754587955795e-05, + "loss": 4.2886, + "step": 45400 + }, + { + "epoch": 0.27001260824055573, + "grad_norm": 1.5555007457733154, + "learning_rate": 4.153240420859462e-05, + "loss": 4.4532, + "step": 45401 + }, + { + "epoch": 0.2700185555238367, + "grad_norm": 1.7897270917892456, + "learning_rate": 4.153205382346215e-05, + "loss": 4.3212, + "step": 45402 + }, + { + "epoch": 0.2700245028071177, + "grad_norm": 1.4424046277999878, + "learning_rate": 4.153170343255851e-05, + "loss": 4.548, + "step": 45403 + }, + { + "epoch": 0.2700304500903987, + "grad_norm": 1.4192469120025635, + "learning_rate": 4.153135303588383e-05, + "loss": 4.4693, + "step": 45404 + }, + { + "epoch": 0.2700363973736797, + "grad_norm": 1.4552066326141357, + "learning_rate": 4.153100263343822e-05, + "loss": 4.4964, + "step": 45405 + }, + { + "epoch": 0.2700423446569607, + "grad_norm": 1.35254967212677, + "learning_rate": 4.153065222522181e-05, + "loss": 4.4062, + "step": 45406 + }, + { + "epoch": 0.2700482919402417, + "grad_norm": 1.6641688346862793, + "learning_rate": 4.153030181123472e-05, + "loss": 4.6898, + "step": 45407 + }, + { + "epoch": 0.2700542392235227, + "grad_norm": 1.5209919214248657, + "learning_rate": 4.1529951391477076e-05, + "loss": 5.1041, + "step": 45408 + }, + { + "epoch": 0.2700601865068037, + "grad_norm": 1.5515828132629395, + "learning_rate": 4.152960096594899e-05, + "loss": 4.957, + "step": 45409 + }, + { + "epoch": 0.2700661337900847, + "grad_norm": 1.6254849433898926, + "learning_rate": 4.152925053465059e-05, + "loss": 4.7769, + "step": 45410 + }, + { + "epoch": 0.27007208107336567, + "grad_norm": 1.6211360692977905, + "learning_rate": 4.1528900097582005e-05, + "loss": 4.8779, + "step": 45411 + }, + { + "epoch": 0.2700780283566467, + "grad_norm": 1.4891462326049805, + "learning_rate": 4.152854965474335e-05, + "loss": 4.8319, + "step": 45412 + }, + { + "epoch": 0.2700839756399277, + "grad_norm": 1.5016798973083496, + "learning_rate": 4.152819920613475e-05, + "loss": 4.9787, + "step": 45413 + }, + { + "epoch": 0.27008992292320866, + "grad_norm": 1.7365268468856812, + "learning_rate": 4.152784875175633e-05, + "loss": 4.9441, + "step": 45414 + }, + { + "epoch": 0.2700958702064897, + "grad_norm": 1.70844304561615, + "learning_rate": 4.15274982916082e-05, + "loss": 4.8555, + "step": 45415 + }, + { + "epoch": 0.2701018174897707, + "grad_norm": 1.7165144681930542, + "learning_rate": 4.1527147825690495e-05, + "loss": 4.9688, + "step": 45416 + }, + { + "epoch": 0.27010776477305165, + "grad_norm": 1.604483962059021, + "learning_rate": 4.1526797354003335e-05, + "loss": 4.9636, + "step": 45417 + }, + { + "epoch": 0.27011371205633267, + "grad_norm": 1.8009312152862549, + "learning_rate": 4.1526446876546836e-05, + "loss": 4.7773, + "step": 45418 + }, + { + "epoch": 0.2701196593396137, + "grad_norm": 1.516053318977356, + "learning_rate": 4.152609639332112e-05, + "loss": 4.9257, + "step": 45419 + }, + { + "epoch": 0.27012560662289464, + "grad_norm": 1.8544390201568604, + "learning_rate": 4.152574590432633e-05, + "loss": 4.0953, + "step": 45420 + }, + { + "epoch": 0.27013155390617566, + "grad_norm": 1.8744558095932007, + "learning_rate": 4.152539540956257e-05, + "loss": 4.4399, + "step": 45421 + }, + { + "epoch": 0.2701375011894567, + "grad_norm": 1.79426908493042, + "learning_rate": 4.152504490902995e-05, + "loss": 4.1051, + "step": 45422 + }, + { + "epoch": 0.27014344847273764, + "grad_norm": 1.8286224603652954, + "learning_rate": 4.152469440272863e-05, + "loss": 4.1342, + "step": 45423 + }, + { + "epoch": 0.27014939575601865, + "grad_norm": 1.7815488576889038, + "learning_rate": 4.15243438906587e-05, + "loss": 4.0833, + "step": 45424 + }, + { + "epoch": 0.27015534303929967, + "grad_norm": 1.794610857963562, + "learning_rate": 4.152399337282028e-05, + "loss": 4.0715, + "step": 45425 + }, + { + "epoch": 0.2701612903225806, + "grad_norm": 1.7218855619430542, + "learning_rate": 4.152364284921352e-05, + "loss": 4.1626, + "step": 45426 + }, + { + "epoch": 0.27016723760586164, + "grad_norm": 1.6630866527557373, + "learning_rate": 4.1523292319838524e-05, + "loss": 4.0948, + "step": 45427 + }, + { + "epoch": 0.27017318488914266, + "grad_norm": 1.7478611469268799, + "learning_rate": 4.1522941784695416e-05, + "loss": 4.6037, + "step": 45428 + }, + { + "epoch": 0.2701791321724236, + "grad_norm": 1.725718379020691, + "learning_rate": 4.152259124378431e-05, + "loss": 5.1731, + "step": 45429 + }, + { + "epoch": 0.27018507945570464, + "grad_norm": 1.4789243936538696, + "learning_rate": 4.152224069710536e-05, + "loss": 4.6803, + "step": 45430 + }, + { + "epoch": 0.27019102673898565, + "grad_norm": 2.782249927520752, + "learning_rate": 4.152189014465865e-05, + "loss": 2.664, + "step": 45431 + }, + { + "epoch": 0.2701969740222666, + "grad_norm": 2.842268705368042, + "learning_rate": 4.1521539586444324e-05, + "loss": 2.4398, + "step": 45432 + }, + { + "epoch": 0.2702029213055476, + "grad_norm": 1.5652813911437988, + "learning_rate": 4.1521189022462495e-05, + "loss": 4.4908, + "step": 45433 + }, + { + "epoch": 0.27020886858882864, + "grad_norm": 1.904374599456787, + "learning_rate": 4.1520838452713294e-05, + "loss": 4.3833, + "step": 45434 + }, + { + "epoch": 0.2702148158721096, + "grad_norm": 1.6057889461517334, + "learning_rate": 4.1520487877196835e-05, + "loss": 5.054, + "step": 45435 + }, + { + "epoch": 0.2702207631553906, + "grad_norm": 1.7325540781021118, + "learning_rate": 4.1520137295913254e-05, + "loss": 4.2102, + "step": 45436 + }, + { + "epoch": 0.27022671043867164, + "grad_norm": 2.262547492980957, + "learning_rate": 4.151978670886265e-05, + "loss": 3.9127, + "step": 45437 + }, + { + "epoch": 0.2702326577219526, + "grad_norm": 1.8462448120117188, + "learning_rate": 4.1519436116045167e-05, + "loss": 3.7183, + "step": 45438 + }, + { + "epoch": 0.2702386050052336, + "grad_norm": 1.5460753440856934, + "learning_rate": 4.151908551746092e-05, + "loss": 4.7195, + "step": 45439 + }, + { + "epoch": 0.27024455228851463, + "grad_norm": 1.747831106185913, + "learning_rate": 4.151873491311003e-05, + "loss": 4.4097, + "step": 45440 + }, + { + "epoch": 0.2702504995717956, + "grad_norm": 1.8991520404815674, + "learning_rate": 4.151838430299262e-05, + "loss": 4.1884, + "step": 45441 + }, + { + "epoch": 0.2702564468550766, + "grad_norm": 1.913098931312561, + "learning_rate": 4.151803368710881e-05, + "loss": 3.9492, + "step": 45442 + }, + { + "epoch": 0.2702623941383576, + "grad_norm": 1.9971439838409424, + "learning_rate": 4.1517683065458724e-05, + "loss": 3.5851, + "step": 45443 + }, + { + "epoch": 0.2702683414216386, + "grad_norm": 2.0197196006774902, + "learning_rate": 4.151733243804249e-05, + "loss": 3.7401, + "step": 45444 + }, + { + "epoch": 0.2702742887049196, + "grad_norm": 1.7689564228057861, + "learning_rate": 4.151698180486023e-05, + "loss": 4.1056, + "step": 45445 + }, + { + "epoch": 0.2702802359882006, + "grad_norm": 1.5069823265075684, + "learning_rate": 4.1516631165912056e-05, + "loss": 4.698, + "step": 45446 + }, + { + "epoch": 0.2702861832714816, + "grad_norm": 1.447420597076416, + "learning_rate": 4.1516280521198095e-05, + "loss": 4.8548, + "step": 45447 + }, + { + "epoch": 0.2702921305547626, + "grad_norm": 1.4722094535827637, + "learning_rate": 4.1515929870718475e-05, + "loss": 4.8907, + "step": 45448 + }, + { + "epoch": 0.2702980778380436, + "grad_norm": 2.9522159099578857, + "learning_rate": 4.151557921447331e-05, + "loss": 3.0102, + "step": 45449 + }, + { + "epoch": 0.27030402512132456, + "grad_norm": 3.679572105407715, + "learning_rate": 4.1515228552462734e-05, + "loss": 1.8492, + "step": 45450 + }, + { + "epoch": 0.2703099724046056, + "grad_norm": 3.4166676998138428, + "learning_rate": 4.151487788468686e-05, + "loss": 1.9642, + "step": 45451 + }, + { + "epoch": 0.2703159196878866, + "grad_norm": 1.8156254291534424, + "learning_rate": 4.151452721114581e-05, + "loss": 4.1597, + "step": 45452 + }, + { + "epoch": 0.27032186697116756, + "grad_norm": 1.704831600189209, + "learning_rate": 4.151417653183971e-05, + "loss": 4.3796, + "step": 45453 + }, + { + "epoch": 0.2703278142544486, + "grad_norm": 1.8766907453536987, + "learning_rate": 4.1513825846768684e-05, + "loss": 4.0001, + "step": 45454 + }, + { + "epoch": 0.2703337615377296, + "grad_norm": 1.7565842866897583, + "learning_rate": 4.151347515593284e-05, + "loss": 4.2984, + "step": 45455 + }, + { + "epoch": 0.27033970882101055, + "grad_norm": 1.5994961261749268, + "learning_rate": 4.151312445933233e-05, + "loss": 4.0882, + "step": 45456 + }, + { + "epoch": 0.27034565610429157, + "grad_norm": 1.4769684076309204, + "learning_rate": 4.151277375696725e-05, + "loss": 4.5471, + "step": 45457 + }, + { + "epoch": 0.2703516033875726, + "grad_norm": 1.6228564977645874, + "learning_rate": 4.151242304883773e-05, + "loss": 4.1457, + "step": 45458 + }, + { + "epoch": 0.27035755067085354, + "grad_norm": 1.8437076807022095, + "learning_rate": 4.15120723349439e-05, + "loss": 4.3527, + "step": 45459 + }, + { + "epoch": 0.27036349795413456, + "grad_norm": 1.582453966140747, + "learning_rate": 4.151172161528587e-05, + "loss": 4.5284, + "step": 45460 + }, + { + "epoch": 0.2703694452374156, + "grad_norm": 1.7032443284988403, + "learning_rate": 4.151137088986378e-05, + "loss": 3.8714, + "step": 45461 + }, + { + "epoch": 0.27037539252069653, + "grad_norm": 2.980224370956421, + "learning_rate": 4.151102015867773e-05, + "loss": 2.8804, + "step": 45462 + }, + { + "epoch": 0.27038133980397755, + "grad_norm": 3.3576743602752686, + "learning_rate": 4.1510669421727856e-05, + "loss": 1.9865, + "step": 45463 + }, + { + "epoch": 0.27038728708725857, + "grad_norm": 3.4433701038360596, + "learning_rate": 4.151031867901427e-05, + "loss": 1.7529, + "step": 45464 + }, + { + "epoch": 0.2703932343705395, + "grad_norm": 3.3765008449554443, + "learning_rate": 4.150996793053711e-05, + "loss": 1.8268, + "step": 45465 + }, + { + "epoch": 0.27039918165382054, + "grad_norm": 2.7790439128875732, + "learning_rate": 4.15096171762965e-05, + "loss": 2.201, + "step": 45466 + }, + { + "epoch": 0.27040512893710156, + "grad_norm": 3.181091547012329, + "learning_rate": 4.1509266416292536e-05, + "loss": 2.072, + "step": 45467 + }, + { + "epoch": 0.2704110762203825, + "grad_norm": 2.8253087997436523, + "learning_rate": 4.150891565052537e-05, + "loss": 2.2114, + "step": 45468 + }, + { + "epoch": 0.27041702350366353, + "grad_norm": 2.1138641834259033, + "learning_rate": 4.1508564878995104e-05, + "loss": 3.7134, + "step": 45469 + }, + { + "epoch": 0.27042297078694455, + "grad_norm": 1.9405546188354492, + "learning_rate": 4.150821410170187e-05, + "loss": 4.2933, + "step": 45470 + }, + { + "epoch": 0.2704289180702255, + "grad_norm": 1.9795273542404175, + "learning_rate": 4.1507863318645794e-05, + "loss": 3.8854, + "step": 45471 + }, + { + "epoch": 0.2704348653535065, + "grad_norm": 1.892856478691101, + "learning_rate": 4.150751252982699e-05, + "loss": 4.1269, + "step": 45472 + }, + { + "epoch": 0.27044081263678754, + "grad_norm": 1.7281402349472046, + "learning_rate": 4.1507161735245586e-05, + "loss": 4.226, + "step": 45473 + }, + { + "epoch": 0.2704467599200685, + "grad_norm": 2.1520118713378906, + "learning_rate": 4.150681093490169e-05, + "loss": 3.6829, + "step": 45474 + }, + { + "epoch": 0.2704527072033495, + "grad_norm": 1.615175724029541, + "learning_rate": 4.150646012879546e-05, + "loss": 4.915, + "step": 45475 + }, + { + "epoch": 0.27045865448663053, + "grad_norm": 1.682582139968872, + "learning_rate": 4.1506109316926976e-05, + "loss": 4.3962, + "step": 45476 + }, + { + "epoch": 0.2704646017699115, + "grad_norm": 1.4924980401992798, + "learning_rate": 4.150575849929639e-05, + "loss": 4.668, + "step": 45477 + }, + { + "epoch": 0.2704705490531925, + "grad_norm": 2.016237258911133, + "learning_rate": 4.1505407675903815e-05, + "loss": 3.9356, + "step": 45478 + }, + { + "epoch": 0.2704764963364735, + "grad_norm": 1.5473662614822388, + "learning_rate": 4.150505684674937e-05, + "loss": 4.3328, + "step": 45479 + }, + { + "epoch": 0.2704824436197545, + "grad_norm": 1.6433203220367432, + "learning_rate": 4.150470601183317e-05, + "loss": 4.1874, + "step": 45480 + }, + { + "epoch": 0.2704883909030355, + "grad_norm": 1.8605577945709229, + "learning_rate": 4.1504355171155366e-05, + "loss": 4.1437, + "step": 45481 + }, + { + "epoch": 0.27049433818631646, + "grad_norm": 1.552626609802246, + "learning_rate": 4.150400432471605e-05, + "loss": 4.4354, + "step": 45482 + }, + { + "epoch": 0.2705002854695975, + "grad_norm": 1.7157816886901855, + "learning_rate": 4.1503653472515356e-05, + "loss": 4.199, + "step": 45483 + }, + { + "epoch": 0.2705062327528785, + "grad_norm": 1.5386461019515991, + "learning_rate": 4.1503302614553417e-05, + "loss": 4.8999, + "step": 45484 + }, + { + "epoch": 0.27051218003615946, + "grad_norm": 1.522830843925476, + "learning_rate": 4.150295175083033e-05, + "loss": 4.9016, + "step": 45485 + }, + { + "epoch": 0.27051812731944047, + "grad_norm": 1.538665533065796, + "learning_rate": 4.150260088134625e-05, + "loss": 4.6921, + "step": 45486 + }, + { + "epoch": 0.2705240746027215, + "grad_norm": 1.7002607583999634, + "learning_rate": 4.1502250006101274e-05, + "loss": 4.3499, + "step": 45487 + }, + { + "epoch": 0.27053002188600245, + "grad_norm": 1.498531699180603, + "learning_rate": 4.150189912509553e-05, + "loss": 4.4562, + "step": 45488 + }, + { + "epoch": 0.27053596916928346, + "grad_norm": 1.7062883377075195, + "learning_rate": 4.1501548238329146e-05, + "loss": 4.6771, + "step": 45489 + }, + { + "epoch": 0.2705419164525645, + "grad_norm": 1.7236918210983276, + "learning_rate": 4.1501197345802244e-05, + "loss": 4.1608, + "step": 45490 + }, + { + "epoch": 0.27054786373584544, + "grad_norm": 2.0408663749694824, + "learning_rate": 4.150084644751494e-05, + "loss": 4.3713, + "step": 45491 + }, + { + "epoch": 0.27055381101912646, + "grad_norm": 1.6065289974212646, + "learning_rate": 4.1500495543467364e-05, + "loss": 4.7314, + "step": 45492 + }, + { + "epoch": 0.27055975830240747, + "grad_norm": 1.6159225702285767, + "learning_rate": 4.150014463365964e-05, + "loss": 4.782, + "step": 45493 + }, + { + "epoch": 0.27056570558568843, + "grad_norm": 1.4237419366836548, + "learning_rate": 4.1499793718091886e-05, + "loss": 4.8035, + "step": 45494 + }, + { + "epoch": 0.27057165286896945, + "grad_norm": 1.5189039707183838, + "learning_rate": 4.149944279676422e-05, + "loss": 4.5629, + "step": 45495 + }, + { + "epoch": 0.27057760015225046, + "grad_norm": 1.7843505144119263, + "learning_rate": 4.1499091869676766e-05, + "loss": 4.2376, + "step": 45496 + }, + { + "epoch": 0.2705835474355314, + "grad_norm": 1.8719360828399658, + "learning_rate": 4.149874093682965e-05, + "loss": 3.6525, + "step": 45497 + }, + { + "epoch": 0.27058949471881244, + "grad_norm": 2.00188946723938, + "learning_rate": 4.1498389998223e-05, + "loss": 4.0455, + "step": 45498 + }, + { + "epoch": 0.27059544200209346, + "grad_norm": 2.0914130210876465, + "learning_rate": 4.149803905385693e-05, + "loss": 3.7508, + "step": 45499 + }, + { + "epoch": 0.2706013892853744, + "grad_norm": 2.124814748764038, + "learning_rate": 4.1497688103731566e-05, + "loss": 3.1916, + "step": 45500 + }, + { + "epoch": 0.27060733656865543, + "grad_norm": 1.9400310516357422, + "learning_rate": 4.149733714784703e-05, + "loss": 3.66, + "step": 45501 + }, + { + "epoch": 0.27061328385193645, + "grad_norm": 1.641392469406128, + "learning_rate": 4.149698618620344e-05, + "loss": 4.4779, + "step": 45502 + }, + { + "epoch": 0.2706192311352174, + "grad_norm": 1.3333534002304077, + "learning_rate": 4.1496635218800924e-05, + "loss": 4.753, + "step": 45503 + }, + { + "epoch": 0.2706251784184984, + "grad_norm": 1.5643290281295776, + "learning_rate": 4.14962842456396e-05, + "loss": 5.0351, + "step": 45504 + }, + { + "epoch": 0.27063112570177944, + "grad_norm": 1.6055015325546265, + "learning_rate": 4.14959332667196e-05, + "loss": 4.8754, + "step": 45505 + }, + { + "epoch": 0.2706370729850604, + "grad_norm": 1.5835665464401245, + "learning_rate": 4.149558228204103e-05, + "loss": 4.4593, + "step": 45506 + }, + { + "epoch": 0.2706430202683414, + "grad_norm": 1.3916015625, + "learning_rate": 4.1495231291604035e-05, + "loss": 4.4369, + "step": 45507 + }, + { + "epoch": 0.27064896755162243, + "grad_norm": 1.469442367553711, + "learning_rate": 4.1494880295408724e-05, + "loss": 4.5998, + "step": 45508 + }, + { + "epoch": 0.2706549148349034, + "grad_norm": 1.603328824043274, + "learning_rate": 4.149452929345521e-05, + "loss": 4.4328, + "step": 45509 + }, + { + "epoch": 0.2706608621181844, + "grad_norm": 1.4020243883132935, + "learning_rate": 4.149417828574364e-05, + "loss": 4.7581, + "step": 45510 + }, + { + "epoch": 0.2706668094014654, + "grad_norm": 1.6328089237213135, + "learning_rate": 4.149382727227411e-05, + "loss": 4.5075, + "step": 45511 + }, + { + "epoch": 0.2706727566847464, + "grad_norm": 2.3434324264526367, + "learning_rate": 4.149347625304676e-05, + "loss": 4.2546, + "step": 45512 + }, + { + "epoch": 0.2706787039680274, + "grad_norm": 1.8417221307754517, + "learning_rate": 4.1493125228061716e-05, + "loss": 4.8782, + "step": 45513 + }, + { + "epoch": 0.2706846512513084, + "grad_norm": 1.4760382175445557, + "learning_rate": 4.149277419731908e-05, + "loss": 4.8885, + "step": 45514 + }, + { + "epoch": 0.2706905985345894, + "grad_norm": 1.4877973794937134, + "learning_rate": 4.1492423160818996e-05, + "loss": 4.96, + "step": 45515 + }, + { + "epoch": 0.2706965458178704, + "grad_norm": 1.3876078128814697, + "learning_rate": 4.149207211856158e-05, + "loss": 4.9459, + "step": 45516 + }, + { + "epoch": 0.2707024931011514, + "grad_norm": 1.534789800643921, + "learning_rate": 4.149172107054694e-05, + "loss": 4.4798, + "step": 45517 + }, + { + "epoch": 0.27070844038443237, + "grad_norm": 1.592059850692749, + "learning_rate": 4.149137001677521e-05, + "loss": 4.8978, + "step": 45518 + }, + { + "epoch": 0.2707143876677134, + "grad_norm": 1.4876341819763184, + "learning_rate": 4.1491018957246525e-05, + "loss": 5.0495, + "step": 45519 + }, + { + "epoch": 0.2707203349509944, + "grad_norm": 2.91133975982666, + "learning_rate": 4.1490667891960986e-05, + "loss": 2.5169, + "step": 45520 + }, + { + "epoch": 0.27072628223427536, + "grad_norm": 1.7767587900161743, + "learning_rate": 4.149031682091873e-05, + "loss": 4.2423, + "step": 45521 + }, + { + "epoch": 0.2707322295175564, + "grad_norm": 1.4157235622406006, + "learning_rate": 4.1489965744119876e-05, + "loss": 4.884, + "step": 45522 + }, + { + "epoch": 0.2707381768008374, + "grad_norm": 1.4526671171188354, + "learning_rate": 4.148961466156455e-05, + "loss": 4.9056, + "step": 45523 + }, + { + "epoch": 0.27074412408411835, + "grad_norm": 1.612915277481079, + "learning_rate": 4.148926357325286e-05, + "loss": 4.8927, + "step": 45524 + }, + { + "epoch": 0.27075007136739937, + "grad_norm": 1.4129230976104736, + "learning_rate": 4.148891247918494e-05, + "loss": 4.7532, + "step": 45525 + }, + { + "epoch": 0.2707560186506804, + "grad_norm": 1.5496562719345093, + "learning_rate": 4.148856137936091e-05, + "loss": 4.8072, + "step": 45526 + }, + { + "epoch": 0.27076196593396135, + "grad_norm": 1.5368938446044922, + "learning_rate": 4.14882102737809e-05, + "loss": 4.7277, + "step": 45527 + }, + { + "epoch": 0.27076791321724236, + "grad_norm": 1.5925108194351196, + "learning_rate": 4.148785916244502e-05, + "loss": 4.504, + "step": 45528 + }, + { + "epoch": 0.2707738605005234, + "grad_norm": 1.808827519416809, + "learning_rate": 4.1487508045353404e-05, + "loss": 4.554, + "step": 45529 + }, + { + "epoch": 0.27077980778380434, + "grad_norm": 1.578127145767212, + "learning_rate": 4.148715692250616e-05, + "loss": 4.4809, + "step": 45530 + }, + { + "epoch": 0.27078575506708535, + "grad_norm": 1.700423240661621, + "learning_rate": 4.148680579390343e-05, + "loss": 4.5196, + "step": 45531 + }, + { + "epoch": 0.27079170235036637, + "grad_norm": 1.7421356439590454, + "learning_rate": 4.148645465954532e-05, + "loss": 4.3156, + "step": 45532 + }, + { + "epoch": 0.27079764963364733, + "grad_norm": 1.99474036693573, + "learning_rate": 4.148610351943196e-05, + "loss": 4.2203, + "step": 45533 + }, + { + "epoch": 0.27080359691692835, + "grad_norm": 1.9260919094085693, + "learning_rate": 4.148575237356348e-05, + "loss": 4.4471, + "step": 45534 + }, + { + "epoch": 0.27080954420020936, + "grad_norm": 1.7691805362701416, + "learning_rate": 4.148540122193998e-05, + "loss": 4.2963, + "step": 45535 + }, + { + "epoch": 0.2708154914834903, + "grad_norm": 1.6244817972183228, + "learning_rate": 4.148505006456161e-05, + "loss": 4.0752, + "step": 45536 + }, + { + "epoch": 0.27082143876677134, + "grad_norm": 1.9723597764968872, + "learning_rate": 4.1484698901428475e-05, + "loss": 4.1421, + "step": 45537 + }, + { + "epoch": 0.27082738605005235, + "grad_norm": 1.8787959814071655, + "learning_rate": 4.14843477325407e-05, + "loss": 4.2781, + "step": 45538 + }, + { + "epoch": 0.2708333333333333, + "grad_norm": 2.0435667037963867, + "learning_rate": 4.148399655789841e-05, + "loss": 3.2886, + "step": 45539 + }, + { + "epoch": 0.27083928061661433, + "grad_norm": 2.5162880420684814, + "learning_rate": 4.148364537750172e-05, + "loss": 2.0259, + "step": 45540 + }, + { + "epoch": 0.27084522789989535, + "grad_norm": 2.481290817260742, + "learning_rate": 4.1483294191350774e-05, + "loss": 1.8204, + "step": 45541 + }, + { + "epoch": 0.2708511751831763, + "grad_norm": 2.8724734783172607, + "learning_rate": 4.148294299944567e-05, + "loss": 1.6548, + "step": 45542 + }, + { + "epoch": 0.2708571224664573, + "grad_norm": 1.6882710456848145, + "learning_rate": 4.1482591801786545e-05, + "loss": 4.0198, + "step": 45543 + }, + { + "epoch": 0.27086306974973834, + "grad_norm": 1.705988883972168, + "learning_rate": 4.1482240598373514e-05, + "loss": 3.9562, + "step": 45544 + }, + { + "epoch": 0.2708690170330193, + "grad_norm": 1.6828304529190063, + "learning_rate": 4.148188938920671e-05, + "loss": 3.98, + "step": 45545 + }, + { + "epoch": 0.2708749643163003, + "grad_norm": 1.5732545852661133, + "learning_rate": 4.148153817428624e-05, + "loss": 4.0711, + "step": 45546 + }, + { + "epoch": 0.27088091159958133, + "grad_norm": 1.6509430408477783, + "learning_rate": 4.148118695361224e-05, + "loss": 4.1743, + "step": 45547 + }, + { + "epoch": 0.2708868588828623, + "grad_norm": 1.7840385437011719, + "learning_rate": 4.148083572718483e-05, + "loss": 4.1592, + "step": 45548 + }, + { + "epoch": 0.2708928061661433, + "grad_norm": 1.8041337728500366, + "learning_rate": 4.148048449500412e-05, + "loss": 4.2173, + "step": 45549 + }, + { + "epoch": 0.2708987534494243, + "grad_norm": 1.6608580350875854, + "learning_rate": 4.148013325707026e-05, + "loss": 4.299, + "step": 45550 + }, + { + "epoch": 0.2709047007327053, + "grad_norm": 1.52150297164917, + "learning_rate": 4.147978201338334e-05, + "loss": 4.1063, + "step": 45551 + }, + { + "epoch": 0.2709106480159863, + "grad_norm": 1.6953823566436768, + "learning_rate": 4.147943076394351e-05, + "loss": 4.118, + "step": 45552 + }, + { + "epoch": 0.2709165952992673, + "grad_norm": 1.6396862268447876, + "learning_rate": 4.1479079508750875e-05, + "loss": 4.1257, + "step": 45553 + }, + { + "epoch": 0.2709225425825483, + "grad_norm": 1.6998536586761475, + "learning_rate": 4.147872824780556e-05, + "loss": 4.0613, + "step": 45554 + }, + { + "epoch": 0.2709284898658293, + "grad_norm": 1.820737361907959, + "learning_rate": 4.1478376981107706e-05, + "loss": 3.9828, + "step": 45555 + }, + { + "epoch": 0.2709344371491103, + "grad_norm": 1.5857799053192139, + "learning_rate": 4.147802570865741e-05, + "loss": 3.9252, + "step": 45556 + }, + { + "epoch": 0.27094038443239127, + "grad_norm": 1.8434388637542725, + "learning_rate": 4.14776744304548e-05, + "loss": 4.2718, + "step": 45557 + }, + { + "epoch": 0.2709463317156723, + "grad_norm": 1.6643686294555664, + "learning_rate": 4.147732314650001e-05, + "loss": 3.9284, + "step": 45558 + }, + { + "epoch": 0.2709522789989533, + "grad_norm": 1.794216275215149, + "learning_rate": 4.1476971856793156e-05, + "loss": 4.069, + "step": 45559 + }, + { + "epoch": 0.27095822628223426, + "grad_norm": 1.8100641965866089, + "learning_rate": 4.147662056133437e-05, + "loss": 4.2536, + "step": 45560 + }, + { + "epoch": 0.2709641735655153, + "grad_norm": 1.664797306060791, + "learning_rate": 4.1476269260123754e-05, + "loss": 4.5304, + "step": 45561 + }, + { + "epoch": 0.2709701208487963, + "grad_norm": 1.6153063774108887, + "learning_rate": 4.1475917953161446e-05, + "loss": 4.659, + "step": 45562 + }, + { + "epoch": 0.27097606813207725, + "grad_norm": 1.5055850744247437, + "learning_rate": 4.147556664044757e-05, + "loss": 4.6929, + "step": 45563 + }, + { + "epoch": 0.27098201541535827, + "grad_norm": 1.721571922302246, + "learning_rate": 4.147521532198224e-05, + "loss": 4.3167, + "step": 45564 + }, + { + "epoch": 0.2709879626986393, + "grad_norm": 1.5521399974822998, + "learning_rate": 4.147486399776558e-05, + "loss": 4.4502, + "step": 45565 + }, + { + "epoch": 0.27099390998192024, + "grad_norm": 1.4705181121826172, + "learning_rate": 4.1474512667797716e-05, + "loss": 4.2459, + "step": 45566 + }, + { + "epoch": 0.27099985726520126, + "grad_norm": 1.6402742862701416, + "learning_rate": 4.1474161332078776e-05, + "loss": 4.3909, + "step": 45567 + }, + { + "epoch": 0.2710058045484823, + "grad_norm": 1.6460199356079102, + "learning_rate": 4.147380999060887e-05, + "loss": 4.7949, + "step": 45568 + }, + { + "epoch": 0.27101175183176324, + "grad_norm": 1.4793797731399536, + "learning_rate": 4.147345864338813e-05, + "loss": 4.7606, + "step": 45569 + }, + { + "epoch": 0.27101769911504425, + "grad_norm": 1.5499674081802368, + "learning_rate": 4.147310729041667e-05, + "loss": 4.7565, + "step": 45570 + }, + { + "epoch": 0.27102364639832527, + "grad_norm": 1.6083790063858032, + "learning_rate": 4.1472755931694626e-05, + "loss": 4.1103, + "step": 45571 + }, + { + "epoch": 0.27102959368160623, + "grad_norm": 1.390085220336914, + "learning_rate": 4.147240456722211e-05, + "loss": 4.851, + "step": 45572 + }, + { + "epoch": 0.27103554096488724, + "grad_norm": 1.6026060581207275, + "learning_rate": 4.147205319699925e-05, + "loss": 4.7314, + "step": 45573 + }, + { + "epoch": 0.27104148824816826, + "grad_norm": 1.477052927017212, + "learning_rate": 4.1471701821026163e-05, + "loss": 4.5826, + "step": 45574 + }, + { + "epoch": 0.2710474355314492, + "grad_norm": 1.4805597066879272, + "learning_rate": 4.147135043930298e-05, + "loss": 4.7502, + "step": 45575 + }, + { + "epoch": 0.27105338281473024, + "grad_norm": 1.3246674537658691, + "learning_rate": 4.147099905182982e-05, + "loss": 4.704, + "step": 45576 + }, + { + "epoch": 0.27105933009801125, + "grad_norm": 1.5396431684494019, + "learning_rate": 4.14706476586068e-05, + "loss": 4.6537, + "step": 45577 + }, + { + "epoch": 0.2710652773812922, + "grad_norm": 1.3756814002990723, + "learning_rate": 4.147029625963405e-05, + "loss": 4.59, + "step": 45578 + }, + { + "epoch": 0.27107122466457323, + "grad_norm": 1.3621467351913452, + "learning_rate": 4.146994485491168e-05, + "loss": 4.5353, + "step": 45579 + }, + { + "epoch": 0.27107717194785425, + "grad_norm": 1.4778308868408203, + "learning_rate": 4.146959344443982e-05, + "loss": 4.8554, + "step": 45580 + }, + { + "epoch": 0.2710831192311352, + "grad_norm": 1.5913684368133545, + "learning_rate": 4.146924202821861e-05, + "loss": 4.6117, + "step": 45581 + }, + { + "epoch": 0.2710890665144162, + "grad_norm": 1.641024112701416, + "learning_rate": 4.1468890606248156e-05, + "loss": 4.3609, + "step": 45582 + }, + { + "epoch": 0.27109501379769724, + "grad_norm": 1.7049418687820435, + "learning_rate": 4.1468539178528576e-05, + "loss": 4.4909, + "step": 45583 + }, + { + "epoch": 0.2711009610809782, + "grad_norm": 1.8046053647994995, + "learning_rate": 4.146818774506e-05, + "loss": 4.3939, + "step": 45584 + }, + { + "epoch": 0.2711069083642592, + "grad_norm": 1.5276776552200317, + "learning_rate": 4.146783630584256e-05, + "loss": 4.4418, + "step": 45585 + }, + { + "epoch": 0.27111285564754023, + "grad_norm": 1.328989863395691, + "learning_rate": 4.146748486087636e-05, + "loss": 4.4195, + "step": 45586 + }, + { + "epoch": 0.2711188029308212, + "grad_norm": 1.5300047397613525, + "learning_rate": 4.146713341016153e-05, + "loss": 4.3471, + "step": 45587 + }, + { + "epoch": 0.2711247502141022, + "grad_norm": 1.9326783418655396, + "learning_rate": 4.146678195369819e-05, + "loss": 4.0835, + "step": 45588 + }, + { + "epoch": 0.2711306974973832, + "grad_norm": 1.9693584442138672, + "learning_rate": 4.146643049148647e-05, + "loss": 4.2193, + "step": 45589 + }, + { + "epoch": 0.2711366447806642, + "grad_norm": 1.9791293144226074, + "learning_rate": 4.146607902352649e-05, + "loss": 4.0518, + "step": 45590 + }, + { + "epoch": 0.2711425920639452, + "grad_norm": 1.4835667610168457, + "learning_rate": 4.1465727549818375e-05, + "loss": 4.4398, + "step": 45591 + }, + { + "epoch": 0.2711485393472262, + "grad_norm": 1.6459711790084839, + "learning_rate": 4.146537607036224e-05, + "loss": 4.6577, + "step": 45592 + }, + { + "epoch": 0.2711544866305072, + "grad_norm": 1.6922674179077148, + "learning_rate": 4.146502458515822e-05, + "loss": 4.4848, + "step": 45593 + }, + { + "epoch": 0.2711604339137882, + "grad_norm": 1.6461238861083984, + "learning_rate": 4.146467309420642e-05, + "loss": 4.3382, + "step": 45594 + }, + { + "epoch": 0.2711663811970692, + "grad_norm": 1.4770185947418213, + "learning_rate": 4.146432159750698e-05, + "loss": 4.4614, + "step": 45595 + }, + { + "epoch": 0.27117232848035017, + "grad_norm": 1.7166494131088257, + "learning_rate": 4.146397009506001e-05, + "loss": 4.2216, + "step": 45596 + }, + { + "epoch": 0.2711782757636312, + "grad_norm": 1.4403802156448364, + "learning_rate": 4.146361858686564e-05, + "loss": 4.7561, + "step": 45597 + }, + { + "epoch": 0.27118422304691214, + "grad_norm": 1.2737112045288086, + "learning_rate": 4.1463267072924e-05, + "loss": 4.9127, + "step": 45598 + }, + { + "epoch": 0.27119017033019316, + "grad_norm": 1.6008663177490234, + "learning_rate": 4.146291555323519e-05, + "loss": 4.5096, + "step": 45599 + }, + { + "epoch": 0.2711961176134742, + "grad_norm": 1.8164516687393188, + "learning_rate": 4.146256402779935e-05, + "loss": 3.6753, + "step": 45600 + }, + { + "epoch": 0.27120206489675514, + "grad_norm": 1.734933614730835, + "learning_rate": 4.1462212496616607e-05, + "loss": 3.7546, + "step": 45601 + }, + { + "epoch": 0.27120801218003615, + "grad_norm": 1.8741623163223267, + "learning_rate": 4.146186095968706e-05, + "loss": 4.0909, + "step": 45602 + }, + { + "epoch": 0.27121395946331717, + "grad_norm": 1.89378023147583, + "learning_rate": 4.1461509417010855e-05, + "loss": 4.243, + "step": 45603 + }, + { + "epoch": 0.2712199067465981, + "grad_norm": 1.8543760776519775, + "learning_rate": 4.146115786858811e-05, + "loss": 4.1874, + "step": 45604 + }, + { + "epoch": 0.27122585402987914, + "grad_norm": 1.5912622213363647, + "learning_rate": 4.1460806314418946e-05, + "loss": 4.7074, + "step": 45605 + }, + { + "epoch": 0.27123180131316016, + "grad_norm": 1.491672396659851, + "learning_rate": 4.146045475450348e-05, + "loss": 4.5535, + "step": 45606 + }, + { + "epoch": 0.2712377485964411, + "grad_norm": 1.380670428276062, + "learning_rate": 4.146010318884185e-05, + "loss": 4.7431, + "step": 45607 + }, + { + "epoch": 0.27124369587972214, + "grad_norm": 1.4712095260620117, + "learning_rate": 4.1459751617434155e-05, + "loss": 4.1904, + "step": 45608 + }, + { + "epoch": 0.27124964316300315, + "grad_norm": 1.5953558683395386, + "learning_rate": 4.145940004028054e-05, + "loss": 3.4822, + "step": 45609 + }, + { + "epoch": 0.2712555904462841, + "grad_norm": 1.5644789934158325, + "learning_rate": 4.145904845738111e-05, + "loss": 3.6604, + "step": 45610 + }, + { + "epoch": 0.2712615377295651, + "grad_norm": 1.499889850616455, + "learning_rate": 4.1458696868736004e-05, + "loss": 3.4989, + "step": 45611 + }, + { + "epoch": 0.27126748501284614, + "grad_norm": 1.4083800315856934, + "learning_rate": 4.145834527434533e-05, + "loss": 3.5593, + "step": 45612 + }, + { + "epoch": 0.2712734322961271, + "grad_norm": 1.6068228483200073, + "learning_rate": 4.145799367420922e-05, + "loss": 3.3804, + "step": 45613 + }, + { + "epoch": 0.2712793795794081, + "grad_norm": 1.4701377153396606, + "learning_rate": 4.145764206832779e-05, + "loss": 3.597, + "step": 45614 + }, + { + "epoch": 0.27128532686268914, + "grad_norm": 1.6232653856277466, + "learning_rate": 4.145729045670118e-05, + "loss": 3.4047, + "step": 45615 + }, + { + "epoch": 0.2712912741459701, + "grad_norm": 1.4535387754440308, + "learning_rate": 4.145693883932949e-05, + "loss": 4.5506, + "step": 45616 + }, + { + "epoch": 0.2712972214292511, + "grad_norm": 1.2943978309631348, + "learning_rate": 4.1456587216212854e-05, + "loss": 4.479, + "step": 45617 + }, + { + "epoch": 0.27130316871253213, + "grad_norm": 1.2956044673919678, + "learning_rate": 4.1456235587351396e-05, + "loss": 4.9315, + "step": 45618 + }, + { + "epoch": 0.2713091159958131, + "grad_norm": 1.3774962425231934, + "learning_rate": 4.145588395274523e-05, + "loss": 4.3054, + "step": 45619 + }, + { + "epoch": 0.2713150632790941, + "grad_norm": 1.5703076124191284, + "learning_rate": 4.1455532312394496e-05, + "loss": 3.5382, + "step": 45620 + }, + { + "epoch": 0.2713210105623751, + "grad_norm": 1.6718165874481201, + "learning_rate": 4.14551806662993e-05, + "loss": 3.454, + "step": 45621 + }, + { + "epoch": 0.2713269578456561, + "grad_norm": 1.5548449754714966, + "learning_rate": 4.145482901445976e-05, + "loss": 3.5277, + "step": 45622 + }, + { + "epoch": 0.2713329051289371, + "grad_norm": 1.4837538003921509, + "learning_rate": 4.1454477356876026e-05, + "loss": 3.4608, + "step": 45623 + }, + { + "epoch": 0.2713388524122181, + "grad_norm": 1.5471229553222656, + "learning_rate": 4.1454125693548194e-05, + "loss": 3.4272, + "step": 45624 + }, + { + "epoch": 0.2713447996954991, + "grad_norm": 1.6447452306747437, + "learning_rate": 4.14537740244764e-05, + "loss": 3.6768, + "step": 45625 + }, + { + "epoch": 0.2713507469787801, + "grad_norm": 1.3987075090408325, + "learning_rate": 4.145342234966076e-05, + "loss": 4.7875, + "step": 45626 + }, + { + "epoch": 0.2713566942620611, + "grad_norm": 1.485988974571228, + "learning_rate": 4.145307066910141e-05, + "loss": 3.7172, + "step": 45627 + }, + { + "epoch": 0.27136264154534206, + "grad_norm": 1.40952730178833, + "learning_rate": 4.145271898279846e-05, + "loss": 3.7236, + "step": 45628 + }, + { + "epoch": 0.2713685888286231, + "grad_norm": 1.5430121421813965, + "learning_rate": 4.145236729075203e-05, + "loss": 3.4975, + "step": 45629 + }, + { + "epoch": 0.2713745361119041, + "grad_norm": 1.3525866270065308, + "learning_rate": 4.145201559296225e-05, + "loss": 4.1432, + "step": 45630 + }, + { + "epoch": 0.27138048339518506, + "grad_norm": 1.4204624891281128, + "learning_rate": 4.145166388942924e-05, + "loss": 4.9692, + "step": 45631 + }, + { + "epoch": 0.2713864306784661, + "grad_norm": 1.456061840057373, + "learning_rate": 4.145131218015312e-05, + "loss": 4.781, + "step": 45632 + }, + { + "epoch": 0.2713923779617471, + "grad_norm": 1.534145474433899, + "learning_rate": 4.1450960465134025e-05, + "loss": 4.3726, + "step": 45633 + }, + { + "epoch": 0.27139832524502805, + "grad_norm": 1.3881795406341553, + "learning_rate": 4.145060874437207e-05, + "loss": 3.779, + "step": 45634 + }, + { + "epoch": 0.27140427252830907, + "grad_norm": 1.520431399345398, + "learning_rate": 4.1450257017867375e-05, + "loss": 3.6128, + "step": 45635 + }, + { + "epoch": 0.2714102198115901, + "grad_norm": 1.4434179067611694, + "learning_rate": 4.1449905285620063e-05, + "loss": 3.6607, + "step": 45636 + }, + { + "epoch": 0.27141616709487104, + "grad_norm": 1.4835580587387085, + "learning_rate": 4.144955354763026e-05, + "loss": 3.6701, + "step": 45637 + }, + { + "epoch": 0.27142211437815206, + "grad_norm": 1.4195868968963623, + "learning_rate": 4.144920180389809e-05, + "loss": 3.6281, + "step": 45638 + }, + { + "epoch": 0.2714280616614331, + "grad_norm": 1.4771740436553955, + "learning_rate": 4.144885005442367e-05, + "loss": 3.5301, + "step": 45639 + }, + { + "epoch": 0.27143400894471403, + "grad_norm": 1.38057279586792, + "learning_rate": 4.144849829920713e-05, + "loss": 3.7239, + "step": 45640 + }, + { + "epoch": 0.27143995622799505, + "grad_norm": 1.4808566570281982, + "learning_rate": 4.144814653824859e-05, + "loss": 3.6313, + "step": 45641 + }, + { + "epoch": 0.27144590351127607, + "grad_norm": 1.5475244522094727, + "learning_rate": 4.144779477154817e-05, + "loss": 3.4547, + "step": 45642 + }, + { + "epoch": 0.271451850794557, + "grad_norm": 1.3620257377624512, + "learning_rate": 4.144744299910599e-05, + "loss": 3.7241, + "step": 45643 + }, + { + "epoch": 0.27145779807783804, + "grad_norm": 1.5721651315689087, + "learning_rate": 4.1447091220922185e-05, + "loss": 3.722, + "step": 45644 + }, + { + "epoch": 0.27146374536111906, + "grad_norm": 1.4613244533538818, + "learning_rate": 4.1446739436996865e-05, + "loss": 3.4952, + "step": 45645 + }, + { + "epoch": 0.2714696926444, + "grad_norm": 1.512589931488037, + "learning_rate": 4.1446387647330166e-05, + "loss": 3.5335, + "step": 45646 + }, + { + "epoch": 0.27147563992768103, + "grad_norm": 1.4605076313018799, + "learning_rate": 4.1446035851922196e-05, + "loss": 3.7552, + "step": 45647 + }, + { + "epoch": 0.27148158721096205, + "grad_norm": 1.5016175508499146, + "learning_rate": 4.1445684050773084e-05, + "loss": 4.1359, + "step": 45648 + }, + { + "epoch": 0.271487534494243, + "grad_norm": 1.6717475652694702, + "learning_rate": 4.144533224388296e-05, + "loss": 4.6536, + "step": 45649 + }, + { + "epoch": 0.271493481777524, + "grad_norm": 2.1210367679595947, + "learning_rate": 4.144498043125194e-05, + "loss": 4.2318, + "step": 45650 + }, + { + "epoch": 0.27149942906080504, + "grad_norm": 1.782103180885315, + "learning_rate": 4.1444628612880145e-05, + "loss": 4.6622, + "step": 45651 + }, + { + "epoch": 0.271505376344086, + "grad_norm": 1.6243693828582764, + "learning_rate": 4.1444276788767685e-05, + "loss": 4.7034, + "step": 45652 + }, + { + "epoch": 0.271511323627367, + "grad_norm": 1.4133120775222778, + "learning_rate": 4.144392495891473e-05, + "loss": 4.91, + "step": 45653 + }, + { + "epoch": 0.27151727091064803, + "grad_norm": 1.8201396465301514, + "learning_rate": 4.1443573123321346e-05, + "loss": 4.1111, + "step": 45654 + }, + { + "epoch": 0.271523218193929, + "grad_norm": 1.4930140972137451, + "learning_rate": 4.144322128198769e-05, + "loss": 4.8025, + "step": 45655 + }, + { + "epoch": 0.27152916547721, + "grad_norm": 1.5047277212142944, + "learning_rate": 4.144286943491387e-05, + "loss": 4.8015, + "step": 45656 + }, + { + "epoch": 0.271535112760491, + "grad_norm": 1.4906986951828003, + "learning_rate": 4.1442517582100016e-05, + "loss": 4.6641, + "step": 45657 + }, + { + "epoch": 0.271541060043772, + "grad_norm": 1.7398515939712524, + "learning_rate": 4.144216572354624e-05, + "loss": 4.7474, + "step": 45658 + }, + { + "epoch": 0.271547007327053, + "grad_norm": 1.641348958015442, + "learning_rate": 4.1441813859252694e-05, + "loss": 4.8535, + "step": 45659 + }, + { + "epoch": 0.271552954610334, + "grad_norm": 1.4287338256835938, + "learning_rate": 4.144146198921947e-05, + "loss": 4.8272, + "step": 45660 + }, + { + "epoch": 0.271558901893615, + "grad_norm": 1.5773226022720337, + "learning_rate": 4.14411101134467e-05, + "loss": 4.5501, + "step": 45661 + }, + { + "epoch": 0.271564849176896, + "grad_norm": 1.7274303436279297, + "learning_rate": 4.1440758231934504e-05, + "loss": 4.0273, + "step": 45662 + }, + { + "epoch": 0.271570796460177, + "grad_norm": 1.6156284809112549, + "learning_rate": 4.144040634468302e-05, + "loss": 4.0813, + "step": 45663 + }, + { + "epoch": 0.27157674374345797, + "grad_norm": 1.644073247909546, + "learning_rate": 4.144005445169236e-05, + "loss": 4.4637, + "step": 45664 + }, + { + "epoch": 0.271582691026739, + "grad_norm": 1.7066395282745361, + "learning_rate": 4.143970255296264e-05, + "loss": 4.3481, + "step": 45665 + }, + { + "epoch": 0.27158863831002, + "grad_norm": 1.479235053062439, + "learning_rate": 4.143935064849399e-05, + "loss": 4.71, + "step": 45666 + }, + { + "epoch": 0.27159458559330096, + "grad_norm": 1.618563175201416, + "learning_rate": 4.143899873828653e-05, + "loss": 4.565, + "step": 45667 + }, + { + "epoch": 0.271600532876582, + "grad_norm": 1.6066486835479736, + "learning_rate": 4.143864682234039e-05, + "loss": 4.074, + "step": 45668 + }, + { + "epoch": 0.271606480159863, + "grad_norm": 1.7792696952819824, + "learning_rate": 4.1438294900655686e-05, + "loss": 4.3305, + "step": 45669 + }, + { + "epoch": 0.27161242744314396, + "grad_norm": 1.6858419179916382, + "learning_rate": 4.143794297323255e-05, + "loss": 4.2085, + "step": 45670 + }, + { + "epoch": 0.27161837472642497, + "grad_norm": 1.542989730834961, + "learning_rate": 4.143759104007109e-05, + "loss": 3.9541, + "step": 45671 + }, + { + "epoch": 0.271624322009706, + "grad_norm": 1.7063134908676147, + "learning_rate": 4.143723910117144e-05, + "loss": 3.9953, + "step": 45672 + }, + { + "epoch": 0.27163026929298695, + "grad_norm": 1.8261122703552246, + "learning_rate": 4.143688715653372e-05, + "loss": 4.3073, + "step": 45673 + }, + { + "epoch": 0.27163621657626796, + "grad_norm": 1.8582186698913574, + "learning_rate": 4.143653520615805e-05, + "loss": 4.3783, + "step": 45674 + }, + { + "epoch": 0.271642163859549, + "grad_norm": 1.8675076961517334, + "learning_rate": 4.1436183250044555e-05, + "loss": 4.4737, + "step": 45675 + }, + { + "epoch": 0.27164811114282994, + "grad_norm": 2.0176503658294678, + "learning_rate": 4.1435831288193364e-05, + "loss": 4.3522, + "step": 45676 + }, + { + "epoch": 0.27165405842611096, + "grad_norm": 1.7546420097351074, + "learning_rate": 4.1435479320604595e-05, + "loss": 4.3644, + "step": 45677 + }, + { + "epoch": 0.27166000570939197, + "grad_norm": 2.30362606048584, + "learning_rate": 4.143512734727836e-05, + "loss": 4.2348, + "step": 45678 + }, + { + "epoch": 0.27166595299267293, + "grad_norm": 2.3323347568511963, + "learning_rate": 4.1434775368214794e-05, + "loss": 4.3034, + "step": 45679 + }, + { + "epoch": 0.27167190027595395, + "grad_norm": 1.9769312143325806, + "learning_rate": 4.1434423383414024e-05, + "loss": 4.2748, + "step": 45680 + }, + { + "epoch": 0.27167784755923496, + "grad_norm": 2.441744327545166, + "learning_rate": 4.143407139287616e-05, + "loss": 4.3436, + "step": 45681 + }, + { + "epoch": 0.2716837948425159, + "grad_norm": 1.8756784200668335, + "learning_rate": 4.143371939660133e-05, + "loss": 4.1687, + "step": 45682 + }, + { + "epoch": 0.27168974212579694, + "grad_norm": 1.7808586359024048, + "learning_rate": 4.1433367394589674e-05, + "loss": 4.3087, + "step": 45683 + }, + { + "epoch": 0.27169568940907796, + "grad_norm": 2.00399112701416, + "learning_rate": 4.143301538684129e-05, + "loss": 4.2019, + "step": 45684 + }, + { + "epoch": 0.2717016366923589, + "grad_norm": 2.1260390281677246, + "learning_rate": 4.143266337335631e-05, + "loss": 4.1273, + "step": 45685 + }, + { + "epoch": 0.27170758397563993, + "grad_norm": 1.7633872032165527, + "learning_rate": 4.143231135413485e-05, + "loss": 4.1965, + "step": 45686 + }, + { + "epoch": 0.27171353125892095, + "grad_norm": 1.8990553617477417, + "learning_rate": 4.143195932917705e-05, + "loss": 4.3397, + "step": 45687 + }, + { + "epoch": 0.2717194785422019, + "grad_norm": 1.6923203468322754, + "learning_rate": 4.143160729848301e-05, + "loss": 4.2032, + "step": 45688 + }, + { + "epoch": 0.2717254258254829, + "grad_norm": 2.117751121520996, + "learning_rate": 4.143125526205288e-05, + "loss": 4.2573, + "step": 45689 + }, + { + "epoch": 0.27173137310876394, + "grad_norm": 2.3104493618011475, + "learning_rate": 4.143090321988676e-05, + "loss": 4.094, + "step": 45690 + }, + { + "epoch": 0.2717373203920449, + "grad_norm": 2.0259883403778076, + "learning_rate": 4.1430551171984784e-05, + "loss": 4.0456, + "step": 45691 + }, + { + "epoch": 0.2717432676753259, + "grad_norm": 1.905178427696228, + "learning_rate": 4.143019911834707e-05, + "loss": 4.2973, + "step": 45692 + }, + { + "epoch": 0.27174921495860693, + "grad_norm": 1.9607056379318237, + "learning_rate": 4.142984705897375e-05, + "loss": 4.3301, + "step": 45693 + }, + { + "epoch": 0.2717551622418879, + "grad_norm": 2.0590391159057617, + "learning_rate": 4.1429494993864934e-05, + "loss": 4.438, + "step": 45694 + }, + { + "epoch": 0.2717611095251689, + "grad_norm": 1.8446234464645386, + "learning_rate": 4.1429142923020755e-05, + "loss": 4.1278, + "step": 45695 + }, + { + "epoch": 0.2717670568084499, + "grad_norm": 2.087512493133545, + "learning_rate": 4.142879084644132e-05, + "loss": 4.1619, + "step": 45696 + }, + { + "epoch": 0.2717730040917309, + "grad_norm": 1.7806531190872192, + "learning_rate": 4.1428438764126774e-05, + "loss": 4.0997, + "step": 45697 + }, + { + "epoch": 0.2717789513750119, + "grad_norm": 1.9245685338974, + "learning_rate": 4.1428086676077235e-05, + "loss": 4.199, + "step": 45698 + }, + { + "epoch": 0.2717848986582929, + "grad_norm": 2.039696216583252, + "learning_rate": 4.142773458229281e-05, + "loss": 4.1541, + "step": 45699 + }, + { + "epoch": 0.2717908459415739, + "grad_norm": 1.9681401252746582, + "learning_rate": 4.142738248277364e-05, + "loss": 4.053, + "step": 45700 + }, + { + "epoch": 0.2717967932248549, + "grad_norm": 1.6302409172058105, + "learning_rate": 4.142703037751984e-05, + "loss": 4.2163, + "step": 45701 + }, + { + "epoch": 0.2718027405081359, + "grad_norm": 1.6277996301651, + "learning_rate": 4.142667826653153e-05, + "loss": 4.1636, + "step": 45702 + }, + { + "epoch": 0.27180868779141687, + "grad_norm": 1.2965755462646484, + "learning_rate": 4.1426326149808833e-05, + "loss": 5.2555, + "step": 45703 + }, + { + "epoch": 0.2718146350746979, + "grad_norm": 1.4621070623397827, + "learning_rate": 4.142597402735189e-05, + "loss": 4.9044, + "step": 45704 + }, + { + "epoch": 0.2718205823579789, + "grad_norm": 1.4032635688781738, + "learning_rate": 4.1425621899160794e-05, + "loss": 4.9623, + "step": 45705 + }, + { + "epoch": 0.27182652964125986, + "grad_norm": 1.6306707859039307, + "learning_rate": 4.142526976523569e-05, + "loss": 4.742, + "step": 45706 + }, + { + "epoch": 0.2718324769245409, + "grad_norm": 1.6080851554870605, + "learning_rate": 4.1424917625576695e-05, + "loss": 4.7845, + "step": 45707 + }, + { + "epoch": 0.2718384242078219, + "grad_norm": 1.6504677534103394, + "learning_rate": 4.142456548018393e-05, + "loss": 4.6292, + "step": 45708 + }, + { + "epoch": 0.27184437149110285, + "grad_norm": 1.3719233274459839, + "learning_rate": 4.142421332905751e-05, + "loss": 4.6439, + "step": 45709 + }, + { + "epoch": 0.27185031877438387, + "grad_norm": 1.7337923049926758, + "learning_rate": 4.142386117219758e-05, + "loss": 4.8867, + "step": 45710 + }, + { + "epoch": 0.2718562660576649, + "grad_norm": 1.5738022327423096, + "learning_rate": 4.1423509009604234e-05, + "loss": 4.8499, + "step": 45711 + }, + { + "epoch": 0.27186221334094585, + "grad_norm": 1.619258165359497, + "learning_rate": 4.142315684127762e-05, + "loss": 4.8854, + "step": 45712 + }, + { + "epoch": 0.27186816062422686, + "grad_norm": 1.6378552913665771, + "learning_rate": 4.142280466721786e-05, + "loss": 4.6578, + "step": 45713 + }, + { + "epoch": 0.2718741079075079, + "grad_norm": 1.6418770551681519, + "learning_rate": 4.1422452487425056e-05, + "loss": 4.9919, + "step": 45714 + }, + { + "epoch": 0.27188005519078884, + "grad_norm": 1.6741249561309814, + "learning_rate": 4.1422100301899345e-05, + "loss": 4.862, + "step": 45715 + }, + { + "epoch": 0.27188600247406985, + "grad_norm": 2.161534070968628, + "learning_rate": 4.1421748110640855e-05, + "loss": 3.9693, + "step": 45716 + }, + { + "epoch": 0.2718919497573508, + "grad_norm": 2.143327236175537, + "learning_rate": 4.14213959136497e-05, + "loss": 3.9582, + "step": 45717 + }, + { + "epoch": 0.27189789704063183, + "grad_norm": 1.7915785312652588, + "learning_rate": 4.1421043710926e-05, + "loss": 4.1991, + "step": 45718 + }, + { + "epoch": 0.27190384432391285, + "grad_norm": 2.1864686012268066, + "learning_rate": 4.142069150246989e-05, + "loss": 3.6884, + "step": 45719 + }, + { + "epoch": 0.2719097916071938, + "grad_norm": 1.488088846206665, + "learning_rate": 4.1420339288281484e-05, + "loss": 4.6147, + "step": 45720 + }, + { + "epoch": 0.2719157388904748, + "grad_norm": 1.859358549118042, + "learning_rate": 4.141998706836091e-05, + "loss": 4.7451, + "step": 45721 + }, + { + "epoch": 0.27192168617375584, + "grad_norm": 1.9069536924362183, + "learning_rate": 4.141963484270828e-05, + "loss": 4.7734, + "step": 45722 + }, + { + "epoch": 0.2719276334570368, + "grad_norm": 1.694136142730713, + "learning_rate": 4.141928261132373e-05, + "loss": 3.9798, + "step": 45723 + }, + { + "epoch": 0.2719335807403178, + "grad_norm": 1.7452820539474487, + "learning_rate": 4.1418930374207385e-05, + "loss": 3.9592, + "step": 45724 + }, + { + "epoch": 0.27193952802359883, + "grad_norm": 1.7202202081680298, + "learning_rate": 4.141857813135935e-05, + "loss": 4.3746, + "step": 45725 + }, + { + "epoch": 0.2719454753068798, + "grad_norm": 1.6104356050491333, + "learning_rate": 4.141822588277976e-05, + "loss": 4.3259, + "step": 45726 + }, + { + "epoch": 0.2719514225901608, + "grad_norm": 1.7273341417312622, + "learning_rate": 4.141787362846874e-05, + "loss": 4.8346, + "step": 45727 + }, + { + "epoch": 0.2719573698734418, + "grad_norm": 1.6156651973724365, + "learning_rate": 4.141752136842642e-05, + "loss": 4.6286, + "step": 45728 + }, + { + "epoch": 0.2719633171567228, + "grad_norm": 1.9321742057800293, + "learning_rate": 4.1417169102652894e-05, + "loss": 4.1916, + "step": 45729 + }, + { + "epoch": 0.2719692644400038, + "grad_norm": 2.2666730880737305, + "learning_rate": 4.141681683114831e-05, + "loss": 3.6072, + "step": 45730 + }, + { + "epoch": 0.2719752117232848, + "grad_norm": 1.6268110275268555, + "learning_rate": 4.141646455391279e-05, + "loss": 4.86, + "step": 45731 + }, + { + "epoch": 0.2719811590065658, + "grad_norm": 1.729621171951294, + "learning_rate": 4.141611227094645e-05, + "loss": 4.4223, + "step": 45732 + }, + { + "epoch": 0.2719871062898468, + "grad_norm": 1.665299415588379, + "learning_rate": 4.1415759982249416e-05, + "loss": 4.3089, + "step": 45733 + }, + { + "epoch": 0.2719930535731278, + "grad_norm": 1.7539700269699097, + "learning_rate": 4.1415407687821804e-05, + "loss": 4.1102, + "step": 45734 + }, + { + "epoch": 0.27199900085640877, + "grad_norm": 1.92647385597229, + "learning_rate": 4.141505538766375e-05, + "loss": 4.3514, + "step": 45735 + }, + { + "epoch": 0.2720049481396898, + "grad_norm": 1.6388932466506958, + "learning_rate": 4.141470308177536e-05, + "loss": 4.2779, + "step": 45736 + }, + { + "epoch": 0.2720108954229708, + "grad_norm": 1.5056126117706299, + "learning_rate": 4.1414350770156783e-05, + "loss": 4.5824, + "step": 45737 + }, + { + "epoch": 0.27201684270625176, + "grad_norm": 1.8660333156585693, + "learning_rate": 4.1413998452808114e-05, + "loss": 4.3252, + "step": 45738 + }, + { + "epoch": 0.2720227899895328, + "grad_norm": 1.433262825012207, + "learning_rate": 4.141364612972949e-05, + "loss": 4.7779, + "step": 45739 + }, + { + "epoch": 0.2720287372728138, + "grad_norm": 1.434385895729065, + "learning_rate": 4.1413293800921036e-05, + "loss": 4.9565, + "step": 45740 + }, + { + "epoch": 0.27203468455609475, + "grad_norm": 1.4323807954788208, + "learning_rate": 4.141294146638286e-05, + "loss": 4.8929, + "step": 45741 + }, + { + "epoch": 0.27204063183937577, + "grad_norm": 1.5555981397628784, + "learning_rate": 4.141258912611511e-05, + "loss": 4.853, + "step": 45742 + }, + { + "epoch": 0.2720465791226568, + "grad_norm": 1.6074632406234741, + "learning_rate": 4.141223678011788e-05, + "loss": 4.8701, + "step": 45743 + }, + { + "epoch": 0.27205252640593774, + "grad_norm": 1.9886776208877563, + "learning_rate": 4.141188442839131e-05, + "loss": 3.3043, + "step": 45744 + }, + { + "epoch": 0.27205847368921876, + "grad_norm": 1.770959734916687, + "learning_rate": 4.1411532070935536e-05, + "loss": 4.2259, + "step": 45745 + }, + { + "epoch": 0.2720644209724998, + "grad_norm": 1.4865915775299072, + "learning_rate": 4.141117970775065e-05, + "loss": 4.2256, + "step": 45746 + }, + { + "epoch": 0.27207036825578074, + "grad_norm": 1.531137228012085, + "learning_rate": 4.1410827338836795e-05, + "loss": 4.4481, + "step": 45747 + }, + { + "epoch": 0.27207631553906175, + "grad_norm": 1.4891735315322876, + "learning_rate": 4.141047496419409e-05, + "loss": 5.1401, + "step": 45748 + }, + { + "epoch": 0.27208226282234277, + "grad_norm": 1.7053202390670776, + "learning_rate": 4.1410122583822655e-05, + "loss": 4.4018, + "step": 45749 + }, + { + "epoch": 0.27208821010562373, + "grad_norm": 1.7726733684539795, + "learning_rate": 4.140977019772262e-05, + "loss": 4.0819, + "step": 45750 + }, + { + "epoch": 0.27209415738890474, + "grad_norm": 1.488122820854187, + "learning_rate": 4.14094178058941e-05, + "loss": 4.1679, + "step": 45751 + }, + { + "epoch": 0.27210010467218576, + "grad_norm": 1.7534294128417969, + "learning_rate": 4.140906540833723e-05, + "loss": 4.1901, + "step": 45752 + }, + { + "epoch": 0.2721060519554667, + "grad_norm": 1.8314956426620483, + "learning_rate": 4.1408713005052116e-05, + "loss": 4.2567, + "step": 45753 + }, + { + "epoch": 0.27211199923874774, + "grad_norm": 1.9421186447143555, + "learning_rate": 4.1408360596038896e-05, + "loss": 4.1898, + "step": 45754 + }, + { + "epoch": 0.27211794652202875, + "grad_norm": 2.3662304878234863, + "learning_rate": 4.140800818129768e-05, + "loss": 3.431, + "step": 45755 + }, + { + "epoch": 0.2721238938053097, + "grad_norm": 1.7412488460540771, + "learning_rate": 4.14076557608286e-05, + "loss": 4.8128, + "step": 45756 + }, + { + "epoch": 0.27212984108859073, + "grad_norm": 2.3575375080108643, + "learning_rate": 4.1407303334631784e-05, + "loss": 3.1746, + "step": 45757 + }, + { + "epoch": 0.27213578837187175, + "grad_norm": 2.503161907196045, + "learning_rate": 4.140695090270734e-05, + "loss": 3.6558, + "step": 45758 + }, + { + "epoch": 0.2721417356551527, + "grad_norm": 1.5914537906646729, + "learning_rate": 4.14065984650554e-05, + "loss": 4.6278, + "step": 45759 + }, + { + "epoch": 0.2721476829384337, + "grad_norm": 1.4927513599395752, + "learning_rate": 4.1406246021676084e-05, + "loss": 4.7342, + "step": 45760 + }, + { + "epoch": 0.27215363022171474, + "grad_norm": 1.4221673011779785, + "learning_rate": 4.140589357256952e-05, + "loss": 4.6447, + "step": 45761 + }, + { + "epoch": 0.2721595775049957, + "grad_norm": 1.5402408838272095, + "learning_rate": 4.140554111773583e-05, + "loss": 4.7994, + "step": 45762 + }, + { + "epoch": 0.2721655247882767, + "grad_norm": 1.8942402601242065, + "learning_rate": 4.1405188657175135e-05, + "loss": 4.0065, + "step": 45763 + }, + { + "epoch": 0.27217147207155773, + "grad_norm": 1.5937339067459106, + "learning_rate": 4.140483619088756e-05, + "loss": 5.2103, + "step": 45764 + }, + { + "epoch": 0.2721774193548387, + "grad_norm": 1.6887754201889038, + "learning_rate": 4.140448371887322e-05, + "loss": 5.0002, + "step": 45765 + }, + { + "epoch": 0.2721833666381197, + "grad_norm": 1.4366587400436401, + "learning_rate": 4.140413124113225e-05, + "loss": 4.7529, + "step": 45766 + }, + { + "epoch": 0.2721893139214007, + "grad_norm": 1.9130007028579712, + "learning_rate": 4.1403778757664756e-05, + "loss": 4.1926, + "step": 45767 + }, + { + "epoch": 0.2721952612046817, + "grad_norm": 1.926134705543518, + "learning_rate": 4.140342626847088e-05, + "loss": 4.313, + "step": 45768 + }, + { + "epoch": 0.2722012084879627, + "grad_norm": 2.0191617012023926, + "learning_rate": 4.140307377355074e-05, + "loss": 4.2371, + "step": 45769 + }, + { + "epoch": 0.2722071557712437, + "grad_norm": 1.772196888923645, + "learning_rate": 4.1402721272904454e-05, + "loss": 4.5711, + "step": 45770 + }, + { + "epoch": 0.2722131030545247, + "grad_norm": 1.5213438272476196, + "learning_rate": 4.140236876653215e-05, + "loss": 5.1339, + "step": 45771 + }, + { + "epoch": 0.2722190503378057, + "grad_norm": 1.4946285486221313, + "learning_rate": 4.140201625443395e-05, + "loss": 5.0859, + "step": 45772 + }, + { + "epoch": 0.2722249976210867, + "grad_norm": 1.931522250175476, + "learning_rate": 4.140166373660997e-05, + "loss": 4.2241, + "step": 45773 + }, + { + "epoch": 0.27223094490436767, + "grad_norm": 1.7954785823822021, + "learning_rate": 4.140131121306034e-05, + "loss": 4.2163, + "step": 45774 + }, + { + "epoch": 0.2722368921876487, + "grad_norm": 2.1641974449157715, + "learning_rate": 4.1400958683785186e-05, + "loss": 3.2977, + "step": 45775 + }, + { + "epoch": 0.2722428394709297, + "grad_norm": 2.3063745498657227, + "learning_rate": 4.140060614878462e-05, + "loss": 2.9921, + "step": 45776 + }, + { + "epoch": 0.27224878675421066, + "grad_norm": 2.275221586227417, + "learning_rate": 4.140025360805877e-05, + "loss": 2.862, + "step": 45777 + }, + { + "epoch": 0.2722547340374917, + "grad_norm": 2.270885467529297, + "learning_rate": 4.139990106160777e-05, + "loss": 2.8799, + "step": 45778 + }, + { + "epoch": 0.2722606813207727, + "grad_norm": 2.141406297683716, + "learning_rate": 4.139954850943173e-05, + "loss": 3.6443, + "step": 45779 + }, + { + "epoch": 0.27226662860405365, + "grad_norm": 1.7428767681121826, + "learning_rate": 4.139919595153078e-05, + "loss": 4.2266, + "step": 45780 + }, + { + "epoch": 0.27227257588733467, + "grad_norm": 2.6759707927703857, + "learning_rate": 4.139884338790503e-05, + "loss": 2.735, + "step": 45781 + }, + { + "epoch": 0.2722785231706157, + "grad_norm": 2.3398332595825195, + "learning_rate": 4.139849081855463e-05, + "loss": 2.818, + "step": 45782 + }, + { + "epoch": 0.27228447045389664, + "grad_norm": 2.6236865520477295, + "learning_rate": 4.1398138243479665e-05, + "loss": 2.7798, + "step": 45783 + }, + { + "epoch": 0.27229041773717766, + "grad_norm": 2.637564182281494, + "learning_rate": 4.1397785662680296e-05, + "loss": 2.7549, + "step": 45784 + }, + { + "epoch": 0.2722963650204587, + "grad_norm": 2.7038493156433105, + "learning_rate": 4.1397433076156625e-05, + "loss": 2.8953, + "step": 45785 + }, + { + "epoch": 0.27230231230373964, + "grad_norm": 2.71801495552063, + "learning_rate": 4.139708048390878e-05, + "loss": 3.1648, + "step": 45786 + }, + { + "epoch": 0.27230825958702065, + "grad_norm": 2.741086483001709, + "learning_rate": 4.139672788593688e-05, + "loss": 2.5846, + "step": 45787 + }, + { + "epoch": 0.27231420687030167, + "grad_norm": 2.7149674892425537, + "learning_rate": 4.139637528224105e-05, + "loss": 2.8117, + "step": 45788 + }, + { + "epoch": 0.2723201541535826, + "grad_norm": 3.3475289344787598, + "learning_rate": 4.139602267282142e-05, + "loss": 3.0361, + "step": 45789 + }, + { + "epoch": 0.27232610143686364, + "grad_norm": 2.8297181129455566, + "learning_rate": 4.139567005767811e-05, + "loss": 2.9801, + "step": 45790 + }, + { + "epoch": 0.27233204872014466, + "grad_norm": 2.694627046585083, + "learning_rate": 4.139531743681123e-05, + "loss": 3.0691, + "step": 45791 + }, + { + "epoch": 0.2723379960034256, + "grad_norm": 2.3945062160491943, + "learning_rate": 4.139496481022092e-05, + "loss": 3.065, + "step": 45792 + }, + { + "epoch": 0.27234394328670664, + "grad_norm": 2.3583245277404785, + "learning_rate": 4.13946121779073e-05, + "loss": 2.7556, + "step": 45793 + }, + { + "epoch": 0.27234989056998765, + "grad_norm": 2.3534677028656006, + "learning_rate": 4.1394259539870495e-05, + "loss": 2.5879, + "step": 45794 + }, + { + "epoch": 0.2723558378532686, + "grad_norm": 2.544018268585205, + "learning_rate": 4.139390689611061e-05, + "loss": 2.6023, + "step": 45795 + }, + { + "epoch": 0.27236178513654963, + "grad_norm": 2.387767791748047, + "learning_rate": 4.1393554246627795e-05, + "loss": 2.5724, + "step": 45796 + }, + { + "epoch": 0.27236773241983064, + "grad_norm": 2.741077423095703, + "learning_rate": 4.139320159142215e-05, + "loss": 3.073, + "step": 45797 + }, + { + "epoch": 0.2723736797031116, + "grad_norm": 2.0167064666748047, + "learning_rate": 4.139284893049381e-05, + "loss": 3.5779, + "step": 45798 + }, + { + "epoch": 0.2723796269863926, + "grad_norm": 2.1938252449035645, + "learning_rate": 4.1392496263842906e-05, + "loss": 2.7166, + "step": 45799 + }, + { + "epoch": 0.27238557426967364, + "grad_norm": 2.269270420074463, + "learning_rate": 4.139214359146953e-05, + "loss": 3.0474, + "step": 45800 + }, + { + "epoch": 0.2723915215529546, + "grad_norm": 2.40761399269104, + "learning_rate": 4.139179091337385e-05, + "loss": 3.042, + "step": 45801 + }, + { + "epoch": 0.2723974688362356, + "grad_norm": 2.2754950523376465, + "learning_rate": 4.1391438229555946e-05, + "loss": 2.9627, + "step": 45802 + }, + { + "epoch": 0.27240341611951663, + "grad_norm": 2.121917963027954, + "learning_rate": 4.139108554001597e-05, + "loss": 3.5859, + "step": 45803 + }, + { + "epoch": 0.2724093634027976, + "grad_norm": 2.1618494987487793, + "learning_rate": 4.139073284475403e-05, + "loss": 2.8559, + "step": 45804 + }, + { + "epoch": 0.2724153106860786, + "grad_norm": 2.197054624557495, + "learning_rate": 4.139038014377026e-05, + "loss": 3.0019, + "step": 45805 + }, + { + "epoch": 0.2724212579693596, + "grad_norm": 2.14561128616333, + "learning_rate": 4.1390027437064775e-05, + "loss": 3.0007, + "step": 45806 + }, + { + "epoch": 0.2724272052526406, + "grad_norm": 2.1867761611938477, + "learning_rate": 4.13896747246377e-05, + "loss": 3.1488, + "step": 45807 + }, + { + "epoch": 0.2724331525359216, + "grad_norm": 2.184882640838623, + "learning_rate": 4.138932200648916e-05, + "loss": 4.316, + "step": 45808 + }, + { + "epoch": 0.2724390998192026, + "grad_norm": 1.512900710105896, + "learning_rate": 4.138896928261928e-05, + "loss": 4.8643, + "step": 45809 + }, + { + "epoch": 0.2724450471024836, + "grad_norm": 1.4973748922348022, + "learning_rate": 4.138861655302817e-05, + "loss": 4.8553, + "step": 45810 + }, + { + "epoch": 0.2724509943857646, + "grad_norm": 1.682739496231079, + "learning_rate": 4.1388263817715975e-05, + "loss": 4.406, + "step": 45811 + }, + { + "epoch": 0.2724569416690456, + "grad_norm": 2.2688896656036377, + "learning_rate": 4.1387911076682795e-05, + "loss": 4.2252, + "step": 45812 + }, + { + "epoch": 0.27246288895232657, + "grad_norm": 1.8768399953842163, + "learning_rate": 4.1387558329928774e-05, + "loss": 4.0202, + "step": 45813 + }, + { + "epoch": 0.2724688362356076, + "grad_norm": 1.5891594886779785, + "learning_rate": 4.138720557745402e-05, + "loss": 4.1243, + "step": 45814 + }, + { + "epoch": 0.2724747835188886, + "grad_norm": 2.2248220443725586, + "learning_rate": 4.138685281925867e-05, + "loss": 3.8279, + "step": 45815 + }, + { + "epoch": 0.27248073080216956, + "grad_norm": 2.0752015113830566, + "learning_rate": 4.138650005534283e-05, + "loss": 3.5229, + "step": 45816 + }, + { + "epoch": 0.2724866780854506, + "grad_norm": 2.36653470993042, + "learning_rate": 4.138614728570664e-05, + "loss": 2.9254, + "step": 45817 + }, + { + "epoch": 0.2724926253687316, + "grad_norm": 1.702856183052063, + "learning_rate": 4.138579451035021e-05, + "loss": 5.0164, + "step": 45818 + }, + { + "epoch": 0.27249857265201255, + "grad_norm": 2.3220980167388916, + "learning_rate": 4.138544172927367e-05, + "loss": 2.7902, + "step": 45819 + }, + { + "epoch": 0.27250451993529357, + "grad_norm": 1.5887811183929443, + "learning_rate": 4.138508894247714e-05, + "loss": 4.9648, + "step": 45820 + }, + { + "epoch": 0.2725104672185746, + "grad_norm": 1.6520518064498901, + "learning_rate": 4.138473614996075e-05, + "loss": 4.2482, + "step": 45821 + }, + { + "epoch": 0.27251641450185554, + "grad_norm": 1.4699745178222656, + "learning_rate": 4.138438335172461e-05, + "loss": 5.1075, + "step": 45822 + }, + { + "epoch": 0.27252236178513656, + "grad_norm": 1.9718328714370728, + "learning_rate": 4.138403054776885e-05, + "loss": 2.8332, + "step": 45823 + }, + { + "epoch": 0.2725283090684176, + "grad_norm": 2.1508724689483643, + "learning_rate": 4.13836777380936e-05, + "loss": 2.8701, + "step": 45824 + }, + { + "epoch": 0.27253425635169853, + "grad_norm": 2.320861577987671, + "learning_rate": 4.1383324922698976e-05, + "loss": 2.785, + "step": 45825 + }, + { + "epoch": 0.27254020363497955, + "grad_norm": 2.077779769897461, + "learning_rate": 4.1382972101585105e-05, + "loss": 2.8917, + "step": 45826 + }, + { + "epoch": 0.27254615091826057, + "grad_norm": 2.1475489139556885, + "learning_rate": 4.13826192747521e-05, + "loss": 2.7707, + "step": 45827 + }, + { + "epoch": 0.2725520982015415, + "grad_norm": 2.122730016708374, + "learning_rate": 4.13822664422001e-05, + "loss": 2.7452, + "step": 45828 + }, + { + "epoch": 0.27255804548482254, + "grad_norm": 2.1770613193511963, + "learning_rate": 4.138191360392922e-05, + "loss": 2.8046, + "step": 45829 + }, + { + "epoch": 0.27256399276810356, + "grad_norm": 2.2645339965820312, + "learning_rate": 4.138156075993958e-05, + "loss": 2.7233, + "step": 45830 + }, + { + "epoch": 0.2725699400513845, + "grad_norm": 2.0627102851867676, + "learning_rate": 4.1381207910231304e-05, + "loss": 3.8023, + "step": 45831 + }, + { + "epoch": 0.27257588733466553, + "grad_norm": 2.2931413650512695, + "learning_rate": 4.138085505480452e-05, + "loss": 2.6298, + "step": 45832 + }, + { + "epoch": 0.2725818346179465, + "grad_norm": 1.7870010137557983, + "learning_rate": 4.1380502193659345e-05, + "loss": 4.6438, + "step": 45833 + }, + { + "epoch": 0.2725877819012275, + "grad_norm": 1.533590316772461, + "learning_rate": 4.138014932679592e-05, + "loss": 4.2038, + "step": 45834 + }, + { + "epoch": 0.2725937291845085, + "grad_norm": 1.5541303157806396, + "learning_rate": 4.137979645421434e-05, + "loss": 4.372, + "step": 45835 + }, + { + "epoch": 0.2725996764677895, + "grad_norm": 2.0355629920959473, + "learning_rate": 4.1379443575914746e-05, + "loss": 3.5096, + "step": 45836 + }, + { + "epoch": 0.2726056237510705, + "grad_norm": 2.9099228382110596, + "learning_rate": 4.137909069189726e-05, + "loss": 3.1599, + "step": 45837 + }, + { + "epoch": 0.2726115710343515, + "grad_norm": 2.018514633178711, + "learning_rate": 4.137873780216199e-05, + "loss": 3.9061, + "step": 45838 + }, + { + "epoch": 0.2726175183176325, + "grad_norm": 1.3519535064697266, + "learning_rate": 4.137838490670909e-05, + "loss": 3.9924, + "step": 45839 + }, + { + "epoch": 0.2726234656009135, + "grad_norm": 1.6445012092590332, + "learning_rate": 4.137803200553866e-05, + "loss": 4.2776, + "step": 45840 + }, + { + "epoch": 0.2726294128841945, + "grad_norm": 1.8708311319351196, + "learning_rate": 4.137767909865082e-05, + "loss": 4.695, + "step": 45841 + }, + { + "epoch": 0.27263536016747547, + "grad_norm": 2.1926140785217285, + "learning_rate": 4.1377326186045703e-05, + "loss": 3.8077, + "step": 45842 + }, + { + "epoch": 0.2726413074507565, + "grad_norm": 2.731046199798584, + "learning_rate": 4.137697326772344e-05, + "loss": 3.1441, + "step": 45843 + }, + { + "epoch": 0.2726472547340375, + "grad_norm": 2.940537452697754, + "learning_rate": 4.137662034368414e-05, + "loss": 3.0425, + "step": 45844 + }, + { + "epoch": 0.27265320201731846, + "grad_norm": 2.5874125957489014, + "learning_rate": 4.137626741392793e-05, + "loss": 3.2962, + "step": 45845 + }, + { + "epoch": 0.2726591493005995, + "grad_norm": 3.064425230026245, + "learning_rate": 4.137591447845494e-05, + "loss": 3.1336, + "step": 45846 + }, + { + "epoch": 0.2726650965838805, + "grad_norm": 2.115105628967285, + "learning_rate": 4.137556153726527e-05, + "loss": 3.3425, + "step": 45847 + }, + { + "epoch": 0.27267104386716146, + "grad_norm": 1.841457486152649, + "learning_rate": 4.137520859035908e-05, + "loss": 4.4737, + "step": 45848 + }, + { + "epoch": 0.27267699115044247, + "grad_norm": 1.6755503416061401, + "learning_rate": 4.1374855637736465e-05, + "loss": 4.9373, + "step": 45849 + }, + { + "epoch": 0.2726829384337235, + "grad_norm": 1.6270753145217896, + "learning_rate": 4.1374502679397565e-05, + "loss": 4.7678, + "step": 45850 + }, + { + "epoch": 0.27268888571700445, + "grad_norm": 1.6697624921798706, + "learning_rate": 4.137414971534248e-05, + "loss": 4.7836, + "step": 45851 + }, + { + "epoch": 0.27269483300028546, + "grad_norm": 1.4891542196273804, + "learning_rate": 4.1373796745571366e-05, + "loss": 4.6831, + "step": 45852 + }, + { + "epoch": 0.2727007802835665, + "grad_norm": 1.4874987602233887, + "learning_rate": 4.137344377008432e-05, + "loss": 4.6359, + "step": 45853 + }, + { + "epoch": 0.27270672756684744, + "grad_norm": 1.316391944885254, + "learning_rate": 4.137309078888148e-05, + "loss": 4.459, + "step": 45854 + }, + { + "epoch": 0.27271267485012846, + "grad_norm": 1.4526925086975098, + "learning_rate": 4.1372737801962947e-05, + "loss": 4.6636, + "step": 45855 + }, + { + "epoch": 0.27271862213340947, + "grad_norm": 1.5415966510772705, + "learning_rate": 4.1372384809328875e-05, + "loss": 4.5748, + "step": 45856 + }, + { + "epoch": 0.27272456941669043, + "grad_norm": 1.6211330890655518, + "learning_rate": 4.1372031810979364e-05, + "loss": 5.1219, + "step": 45857 + }, + { + "epoch": 0.27273051669997145, + "grad_norm": 1.4959384202957153, + "learning_rate": 4.1371678806914555e-05, + "loss": 4.9811, + "step": 45858 + }, + { + "epoch": 0.27273646398325246, + "grad_norm": 1.5326218605041504, + "learning_rate": 4.137132579713455e-05, + "loss": 5.1055, + "step": 45859 + }, + { + "epoch": 0.2727424112665334, + "grad_norm": 1.1683393716812134, + "learning_rate": 4.1370972781639494e-05, + "loss": 5.0161, + "step": 45860 + }, + { + "epoch": 0.27274835854981444, + "grad_norm": 1.4726523160934448, + "learning_rate": 4.13706197604295e-05, + "loss": 5.1143, + "step": 45861 + }, + { + "epoch": 0.27275430583309546, + "grad_norm": 1.3792412281036377, + "learning_rate": 4.1370266733504694e-05, + "loss": 4.9406, + "step": 45862 + }, + { + "epoch": 0.2727602531163764, + "grad_norm": 1.2844411134719849, + "learning_rate": 4.136991370086519e-05, + "loss": 4.9289, + "step": 45863 + }, + { + "epoch": 0.27276620039965743, + "grad_norm": 1.277153730392456, + "learning_rate": 4.136956066251112e-05, + "loss": 4.9407, + "step": 45864 + }, + { + "epoch": 0.27277214768293845, + "grad_norm": 3.8550291061401367, + "learning_rate": 4.136920761844261e-05, + "loss": 2.7953, + "step": 45865 + }, + { + "epoch": 0.2727780949662194, + "grad_norm": 3.2808563709259033, + "learning_rate": 4.136885456865977e-05, + "loss": 1.6185, + "step": 45866 + }, + { + "epoch": 0.2727840422495004, + "grad_norm": 2.703369379043579, + "learning_rate": 4.136850151316274e-05, + "loss": 2.2097, + "step": 45867 + }, + { + "epoch": 0.27278998953278144, + "grad_norm": 2.6420631408691406, + "learning_rate": 4.1368148451951626e-05, + "loss": 0.9513, + "step": 45868 + }, + { + "epoch": 0.2727959368160624, + "grad_norm": 2.8043675422668457, + "learning_rate": 4.136779538502657e-05, + "loss": 1.3474, + "step": 45869 + }, + { + "epoch": 0.2728018840993434, + "grad_norm": 2.646263837814331, + "learning_rate": 4.1367442312387683e-05, + "loss": 1.3508, + "step": 45870 + }, + { + "epoch": 0.27280783138262443, + "grad_norm": 2.6238725185394287, + "learning_rate": 4.13670892340351e-05, + "loss": 1.0979, + "step": 45871 + }, + { + "epoch": 0.2728137786659054, + "grad_norm": 2.4581644535064697, + "learning_rate": 4.136673614996892e-05, + "loss": 0.8381, + "step": 45872 + }, + { + "epoch": 0.2728197259491864, + "grad_norm": 2.819242477416992, + "learning_rate": 4.1366383060189285e-05, + "loss": 0.9155, + "step": 45873 + }, + { + "epoch": 0.2728256732324674, + "grad_norm": 2.758807420730591, + "learning_rate": 4.1366029964696316e-05, + "loss": 0.8899, + "step": 45874 + }, + { + "epoch": 0.2728316205157484, + "grad_norm": 2.65903902053833, + "learning_rate": 4.136567686349014e-05, + "loss": 0.7523, + "step": 45875 + }, + { + "epoch": 0.2728375677990294, + "grad_norm": 2.938854932785034, + "learning_rate": 4.136532375657087e-05, + "loss": 0.9999, + "step": 45876 + }, + { + "epoch": 0.2728435150823104, + "grad_norm": 2.355612277984619, + "learning_rate": 4.136497064393864e-05, + "loss": 0.6597, + "step": 45877 + }, + { + "epoch": 0.2728494623655914, + "grad_norm": 2.7789440155029297, + "learning_rate": 4.136461752559356e-05, + "loss": 0.8495, + "step": 45878 + }, + { + "epoch": 0.2728554096488724, + "grad_norm": 2.9362080097198486, + "learning_rate": 4.1364264401535767e-05, + "loss": 0.8824, + "step": 45879 + }, + { + "epoch": 0.2728613569321534, + "grad_norm": 2.012599229812622, + "learning_rate": 4.1363911271765373e-05, + "loss": 5.1281, + "step": 45880 + }, + { + "epoch": 0.27286730421543437, + "grad_norm": 2.6936557292938232, + "learning_rate": 4.136355813628251e-05, + "loss": 1.1141, + "step": 45881 + }, + { + "epoch": 0.2728732514987154, + "grad_norm": 1.6431304216384888, + "learning_rate": 4.13632049950873e-05, + "loss": 5.1049, + "step": 45882 + }, + { + "epoch": 0.2728791987819964, + "grad_norm": 1.760867714881897, + "learning_rate": 4.136285184817986e-05, + "loss": 5.2685, + "step": 45883 + }, + { + "epoch": 0.27288514606527736, + "grad_norm": 1.8635131120681763, + "learning_rate": 4.136249869556032e-05, + "loss": 3.3709, + "step": 45884 + }, + { + "epoch": 0.2728910933485584, + "grad_norm": 1.7839796543121338, + "learning_rate": 4.13621455372288e-05, + "loss": 5.0233, + "step": 45885 + }, + { + "epoch": 0.2728970406318394, + "grad_norm": 1.5997925996780396, + "learning_rate": 4.136179237318543e-05, + "loss": 4.7756, + "step": 45886 + }, + { + "epoch": 0.27290298791512035, + "grad_norm": 1.713348627090454, + "learning_rate": 4.136143920343032e-05, + "loss": 4.5267, + "step": 45887 + }, + { + "epoch": 0.27290893519840137, + "grad_norm": 1.5991480350494385, + "learning_rate": 4.13610860279636e-05, + "loss": 5.0532, + "step": 45888 + }, + { + "epoch": 0.2729148824816824, + "grad_norm": 1.935451626777649, + "learning_rate": 4.1360732846785396e-05, + "loss": 4.0349, + "step": 45889 + }, + { + "epoch": 0.27292082976496335, + "grad_norm": 1.5842171907424927, + "learning_rate": 4.136037965989583e-05, + "loss": 4.5102, + "step": 45890 + }, + { + "epoch": 0.27292677704824436, + "grad_norm": 1.7205383777618408, + "learning_rate": 4.136002646729503e-05, + "loss": 4.4857, + "step": 45891 + }, + { + "epoch": 0.2729327243315254, + "grad_norm": 1.8236289024353027, + "learning_rate": 4.13596732689831e-05, + "loss": 4.4506, + "step": 45892 + }, + { + "epoch": 0.27293867161480634, + "grad_norm": 1.785609483718872, + "learning_rate": 4.135932006496018e-05, + "loss": 5.0731, + "step": 45893 + }, + { + "epoch": 0.27294461889808735, + "grad_norm": 1.5576841831207275, + "learning_rate": 4.13589668552264e-05, + "loss": 4.7852, + "step": 45894 + }, + { + "epoch": 0.27295056618136837, + "grad_norm": 1.660879135131836, + "learning_rate": 4.135861363978187e-05, + "loss": 4.9263, + "step": 45895 + }, + { + "epoch": 0.27295651346464933, + "grad_norm": 2.1092357635498047, + "learning_rate": 4.135826041862671e-05, + "loss": 4.3078, + "step": 45896 + }, + { + "epoch": 0.27296246074793035, + "grad_norm": 1.6840873956680298, + "learning_rate": 4.135790719176106e-05, + "loss": 4.9837, + "step": 45897 + }, + { + "epoch": 0.27296840803121136, + "grad_norm": 1.8102649450302124, + "learning_rate": 4.135755395918504e-05, + "loss": 4.9484, + "step": 45898 + }, + { + "epoch": 0.2729743553144923, + "grad_norm": 1.768088459968567, + "learning_rate": 4.135720072089875e-05, + "loss": 4.7116, + "step": 45899 + }, + { + "epoch": 0.27298030259777334, + "grad_norm": 2.1797573566436768, + "learning_rate": 4.1356847476902337e-05, + "loss": 4.5798, + "step": 45900 + }, + { + "epoch": 0.27298624988105435, + "grad_norm": 1.5335637331008911, + "learning_rate": 4.1356494227195913e-05, + "loss": 4.9659, + "step": 45901 + }, + { + "epoch": 0.2729921971643353, + "grad_norm": 1.3662492036819458, + "learning_rate": 4.1356140971779614e-05, + "loss": 4.6622, + "step": 45902 + }, + { + "epoch": 0.27299814444761633, + "grad_norm": 1.535912275314331, + "learning_rate": 4.135578771065355e-05, + "loss": 5.1542, + "step": 45903 + }, + { + "epoch": 0.27300409173089735, + "grad_norm": 1.6926705837249756, + "learning_rate": 4.1355434443817855e-05, + "loss": 4.6342, + "step": 45904 + }, + { + "epoch": 0.2730100390141783, + "grad_norm": 1.5903944969177246, + "learning_rate": 4.135508117127264e-05, + "loss": 4.7846, + "step": 45905 + }, + { + "epoch": 0.2730159862974593, + "grad_norm": 1.6216188669204712, + "learning_rate": 4.1354727893018033e-05, + "loss": 4.8621, + "step": 45906 + }, + { + "epoch": 0.27302193358074034, + "grad_norm": 1.539426326751709, + "learning_rate": 4.135437460905417e-05, + "loss": 4.8781, + "step": 45907 + }, + { + "epoch": 0.2730278808640213, + "grad_norm": 2.2017428874969482, + "learning_rate": 4.1354021319381154e-05, + "loss": 4.7447, + "step": 45908 + }, + { + "epoch": 0.2730338281473023, + "grad_norm": 2.0850274562835693, + "learning_rate": 4.135366802399912e-05, + "loss": 4.9759, + "step": 45909 + }, + { + "epoch": 0.27303977543058333, + "grad_norm": 1.9090203046798706, + "learning_rate": 4.13533147229082e-05, + "loss": 4.6192, + "step": 45910 + }, + { + "epoch": 0.2730457227138643, + "grad_norm": 1.4683018922805786, + "learning_rate": 4.1352961416108495e-05, + "loss": 4.8374, + "step": 45911 + }, + { + "epoch": 0.2730516699971453, + "grad_norm": 1.5177065134048462, + "learning_rate": 4.135260810360014e-05, + "loss": 5.1412, + "step": 45912 + }, + { + "epoch": 0.2730576172804263, + "grad_norm": 1.3990085124969482, + "learning_rate": 4.135225478538326e-05, + "loss": 5.127, + "step": 45913 + }, + { + "epoch": 0.2730635645637073, + "grad_norm": 2.9210364818573, + "learning_rate": 4.135190146145798e-05, + "loss": 4.2369, + "step": 45914 + }, + { + "epoch": 0.2730695118469883, + "grad_norm": 2.4924728870391846, + "learning_rate": 4.1351548131824425e-05, + "loss": 3.8646, + "step": 45915 + }, + { + "epoch": 0.2730754591302693, + "grad_norm": 2.2290756702423096, + "learning_rate": 4.135119479648271e-05, + "loss": 4.0241, + "step": 45916 + }, + { + "epoch": 0.2730814064135503, + "grad_norm": 1.4042515754699707, + "learning_rate": 4.1350841455432956e-05, + "loss": 4.7974, + "step": 45917 + }, + { + "epoch": 0.2730873536968313, + "grad_norm": 1.5158053636550903, + "learning_rate": 4.135048810867529e-05, + "loss": 4.9242, + "step": 45918 + }, + { + "epoch": 0.2730933009801123, + "grad_norm": 1.4903844594955444, + "learning_rate": 4.135013475620985e-05, + "loss": 4.7247, + "step": 45919 + }, + { + "epoch": 0.27309924826339327, + "grad_norm": 3.114199638366699, + "learning_rate": 4.1349781398036736e-05, + "loss": 3.8438, + "step": 45920 + }, + { + "epoch": 0.2731051955466743, + "grad_norm": 2.6636056900024414, + "learning_rate": 4.134942803415609e-05, + "loss": 4.2169, + "step": 45921 + }, + { + "epoch": 0.2731111428299553, + "grad_norm": 1.6928660869598389, + "learning_rate": 4.134907466456802e-05, + "loss": 4.8794, + "step": 45922 + }, + { + "epoch": 0.27311709011323626, + "grad_norm": 1.3839513063430786, + "learning_rate": 4.134872128927266e-05, + "loss": 4.7131, + "step": 45923 + }, + { + "epoch": 0.2731230373965173, + "grad_norm": 1.8975213766098022, + "learning_rate": 4.134836790827014e-05, + "loss": 4.4266, + "step": 45924 + }, + { + "epoch": 0.2731289846797983, + "grad_norm": 2.5544333457946777, + "learning_rate": 4.1348014521560554e-05, + "loss": 3.8379, + "step": 45925 + }, + { + "epoch": 0.27313493196307925, + "grad_norm": 2.631831407546997, + "learning_rate": 4.1347661129144055e-05, + "loss": 3.6614, + "step": 45926 + }, + { + "epoch": 0.27314087924636027, + "grad_norm": 2.657472610473633, + "learning_rate": 4.1347307731020756e-05, + "loss": 3.8393, + "step": 45927 + }, + { + "epoch": 0.2731468265296413, + "grad_norm": 2.7966439723968506, + "learning_rate": 4.1346954327190786e-05, + "loss": 3.717, + "step": 45928 + }, + { + "epoch": 0.27315277381292224, + "grad_norm": 2.1598854064941406, + "learning_rate": 4.134660091765425e-05, + "loss": 3.6558, + "step": 45929 + }, + { + "epoch": 0.27315872109620326, + "grad_norm": 1.6271541118621826, + "learning_rate": 4.13462475024113e-05, + "loss": 4.4283, + "step": 45930 + }, + { + "epoch": 0.2731646683794843, + "grad_norm": 2.0812439918518066, + "learning_rate": 4.134589408146204e-05, + "loss": 3.6354, + "step": 45931 + }, + { + "epoch": 0.27317061566276524, + "grad_norm": 2.363731861114502, + "learning_rate": 4.1345540654806594e-05, + "loss": 3.4903, + "step": 45932 + }, + { + "epoch": 0.27317656294604625, + "grad_norm": 1.9803516864776611, + "learning_rate": 4.134518722244509e-05, + "loss": 3.6608, + "step": 45933 + }, + { + "epoch": 0.27318251022932727, + "grad_norm": 2.0540778636932373, + "learning_rate": 4.1344833784377644e-05, + "loss": 3.3859, + "step": 45934 + }, + { + "epoch": 0.27318845751260823, + "grad_norm": 2.1983706951141357, + "learning_rate": 4.134448034060439e-05, + "loss": 3.4206, + "step": 45935 + }, + { + "epoch": 0.27319440479588925, + "grad_norm": 2.01708984375, + "learning_rate": 4.134412689112545e-05, + "loss": 3.2826, + "step": 45936 + }, + { + "epoch": 0.27320035207917026, + "grad_norm": 2.0594847202301025, + "learning_rate": 4.1343773435940944e-05, + "loss": 3.6769, + "step": 45937 + }, + { + "epoch": 0.2732062993624512, + "grad_norm": 2.0174307823181152, + "learning_rate": 4.134341997505099e-05, + "loss": 3.3296, + "step": 45938 + }, + { + "epoch": 0.27321224664573224, + "grad_norm": 1.9671130180358887, + "learning_rate": 4.134306650845572e-05, + "loss": 3.5501, + "step": 45939 + }, + { + "epoch": 0.27321819392901325, + "grad_norm": 1.8939313888549805, + "learning_rate": 4.134271303615526e-05, + "loss": 3.6715, + "step": 45940 + }, + { + "epoch": 0.2732241412122942, + "grad_norm": 1.4830917119979858, + "learning_rate": 4.134235955814971e-05, + "loss": 4.4942, + "step": 45941 + }, + { + "epoch": 0.27323008849557523, + "grad_norm": 1.8571546077728271, + "learning_rate": 4.1342006074439235e-05, + "loss": 4.7893, + "step": 45942 + }, + { + "epoch": 0.27323603577885625, + "grad_norm": 1.881441593170166, + "learning_rate": 4.134165258502392e-05, + "loss": 4.1751, + "step": 45943 + }, + { + "epoch": 0.2732419830621372, + "grad_norm": 1.9504128694534302, + "learning_rate": 4.13412990899039e-05, + "loss": 4.1929, + "step": 45944 + }, + { + "epoch": 0.2732479303454182, + "grad_norm": 1.7446962594985962, + "learning_rate": 4.134094558907931e-05, + "loss": 4.4648, + "step": 45945 + }, + { + "epoch": 0.27325387762869924, + "grad_norm": 1.4023666381835938, + "learning_rate": 4.134059208255026e-05, + "loss": 5.2005, + "step": 45946 + }, + { + "epoch": 0.2732598249119802, + "grad_norm": 1.4828554391860962, + "learning_rate": 4.1340238570316884e-05, + "loss": 5.2506, + "step": 45947 + }, + { + "epoch": 0.2732657721952612, + "grad_norm": 1.5881046056747437, + "learning_rate": 4.133988505237929e-05, + "loss": 4.9077, + "step": 45948 + }, + { + "epoch": 0.2732717194785422, + "grad_norm": 2.08815336227417, + "learning_rate": 4.133953152873762e-05, + "loss": 4.7851, + "step": 45949 + }, + { + "epoch": 0.2732776667618232, + "grad_norm": 1.6958694458007812, + "learning_rate": 4.1339177999391985e-05, + "loss": 4.3513, + "step": 45950 + }, + { + "epoch": 0.2732836140451042, + "grad_norm": 1.5674899816513062, + "learning_rate": 4.133882446434251e-05, + "loss": 4.5285, + "step": 45951 + }, + { + "epoch": 0.27328956132838517, + "grad_norm": 1.5510520935058594, + "learning_rate": 4.1338470923589325e-05, + "loss": 4.0905, + "step": 45952 + }, + { + "epoch": 0.2732955086116662, + "grad_norm": 1.4501006603240967, + "learning_rate": 4.1338117377132546e-05, + "loss": 4.2072, + "step": 45953 + }, + { + "epoch": 0.2733014558949472, + "grad_norm": 1.5558110475540161, + "learning_rate": 4.13377638249723e-05, + "loss": 4.328, + "step": 45954 + }, + { + "epoch": 0.27330740317822816, + "grad_norm": 1.5014795064926147, + "learning_rate": 4.13374102671087e-05, + "loss": 5.3153, + "step": 45955 + }, + { + "epoch": 0.2733133504615092, + "grad_norm": 1.4516448974609375, + "learning_rate": 4.133705670354189e-05, + "loss": 5.2418, + "step": 45956 + }, + { + "epoch": 0.2733192977447902, + "grad_norm": 2.3129799365997314, + "learning_rate": 4.1336703134271976e-05, + "loss": 4.0243, + "step": 45957 + }, + { + "epoch": 0.27332524502807115, + "grad_norm": 1.598675012588501, + "learning_rate": 4.1336349559299094e-05, + "loss": 4.3298, + "step": 45958 + }, + { + "epoch": 0.27333119231135217, + "grad_norm": 1.4738377332687378, + "learning_rate": 4.1335995978623356e-05, + "loss": 4.1964, + "step": 45959 + }, + { + "epoch": 0.2733371395946332, + "grad_norm": 1.8057414293289185, + "learning_rate": 4.133564239224489e-05, + "loss": 4.4141, + "step": 45960 + }, + { + "epoch": 0.27334308687791414, + "grad_norm": 2.3154780864715576, + "learning_rate": 4.133528880016382e-05, + "loss": 4.0038, + "step": 45961 + }, + { + "epoch": 0.27334903416119516, + "grad_norm": 2.2677793502807617, + "learning_rate": 4.133493520238027e-05, + "loss": 4.3152, + "step": 45962 + }, + { + "epoch": 0.2733549814444762, + "grad_norm": 1.5904322862625122, + "learning_rate": 4.1334581598894364e-05, + "loss": 5.0116, + "step": 45963 + }, + { + "epoch": 0.27336092872775714, + "grad_norm": 1.3789727687835693, + "learning_rate": 4.133422798970623e-05, + "loss": 4.9569, + "step": 45964 + }, + { + "epoch": 0.27336687601103815, + "grad_norm": 1.7124282121658325, + "learning_rate": 4.1333874374815985e-05, + "loss": 4.7244, + "step": 45965 + }, + { + "epoch": 0.27337282329431917, + "grad_norm": 1.8244225978851318, + "learning_rate": 4.1333520754223745e-05, + "loss": 3.3812, + "step": 45966 + }, + { + "epoch": 0.2733787705776001, + "grad_norm": 2.5032339096069336, + "learning_rate": 4.133316712792964e-05, + "loss": 3.9093, + "step": 45967 + }, + { + "epoch": 0.27338471786088114, + "grad_norm": 2.6138384342193604, + "learning_rate": 4.133281349593381e-05, + "loss": 3.9669, + "step": 45968 + }, + { + "epoch": 0.27339066514416216, + "grad_norm": 2.2653026580810547, + "learning_rate": 4.133245985823635e-05, + "loss": 3.8471, + "step": 45969 + }, + { + "epoch": 0.2733966124274431, + "grad_norm": 2.6168301105499268, + "learning_rate": 4.13321062148374e-05, + "loss": 3.8262, + "step": 45970 + }, + { + "epoch": 0.27340255971072414, + "grad_norm": 2.4054365158081055, + "learning_rate": 4.133175256573708e-05, + "loss": 3.8225, + "step": 45971 + }, + { + "epoch": 0.27340850699400515, + "grad_norm": 1.8998020887374878, + "learning_rate": 4.1331398910935515e-05, + "loss": 4.6269, + "step": 45972 + }, + { + "epoch": 0.2734144542772861, + "grad_norm": 1.564599871635437, + "learning_rate": 4.133104525043283e-05, + "loss": 4.653, + "step": 45973 + }, + { + "epoch": 0.27342040156056713, + "grad_norm": 1.350683331489563, + "learning_rate": 4.133069158422914e-05, + "loss": 5.2957, + "step": 45974 + }, + { + "epoch": 0.27342634884384814, + "grad_norm": 1.6742913722991943, + "learning_rate": 4.133033791232458e-05, + "loss": 4.7808, + "step": 45975 + }, + { + "epoch": 0.2734322961271291, + "grad_norm": 1.4501500129699707, + "learning_rate": 4.132998423471926e-05, + "loss": 5.2365, + "step": 45976 + }, + { + "epoch": 0.2734382434104101, + "grad_norm": 1.4782482385635376, + "learning_rate": 4.1329630551413326e-05, + "loss": 5.2283, + "step": 45977 + }, + { + "epoch": 0.27344419069369114, + "grad_norm": 1.8492647409439087, + "learning_rate": 4.132927686240687e-05, + "loss": 4.5303, + "step": 45978 + }, + { + "epoch": 0.2734501379769721, + "grad_norm": 2.290571689605713, + "learning_rate": 4.132892316770004e-05, + "loss": 3.6953, + "step": 45979 + }, + { + "epoch": 0.2734560852602531, + "grad_norm": 2.2079381942749023, + "learning_rate": 4.132856946729296e-05, + "loss": 3.7561, + "step": 45980 + }, + { + "epoch": 0.27346203254353413, + "grad_norm": 2.2443594932556152, + "learning_rate": 4.132821576118573e-05, + "loss": 3.7334, + "step": 45981 + }, + { + "epoch": 0.2734679798268151, + "grad_norm": 1.9817705154418945, + "learning_rate": 4.1327862049378494e-05, + "loss": 4.4162, + "step": 45982 + }, + { + "epoch": 0.2734739271100961, + "grad_norm": 1.815726399421692, + "learning_rate": 4.1327508331871373e-05, + "loss": 4.179, + "step": 45983 + }, + { + "epoch": 0.2734798743933771, + "grad_norm": 1.494133472442627, + "learning_rate": 4.1327154608664486e-05, + "loss": 4.3671, + "step": 45984 + }, + { + "epoch": 0.2734858216766581, + "grad_norm": 1.7324997186660767, + "learning_rate": 4.1326800879757956e-05, + "loss": 4.5004, + "step": 45985 + }, + { + "epoch": 0.2734917689599391, + "grad_norm": 2.179760456085205, + "learning_rate": 4.1326447145151916e-05, + "loss": 3.0176, + "step": 45986 + }, + { + "epoch": 0.2734977162432201, + "grad_norm": 2.261204719543457, + "learning_rate": 4.132609340484647e-05, + "loss": 2.9607, + "step": 45987 + }, + { + "epoch": 0.2735036635265011, + "grad_norm": 2.4133429527282715, + "learning_rate": 4.132573965884176e-05, + "loss": 3.1349, + "step": 45988 + }, + { + "epoch": 0.2735096108097821, + "grad_norm": 1.899513840675354, + "learning_rate": 4.1325385907137905e-05, + "loss": 3.8781, + "step": 45989 + }, + { + "epoch": 0.2735155580930631, + "grad_norm": 2.058727502822876, + "learning_rate": 4.1325032149735024e-05, + "loss": 2.9425, + "step": 45990 + }, + { + "epoch": 0.27352150537634407, + "grad_norm": 1.7935078144073486, + "learning_rate": 4.1324678386633245e-05, + "loss": 3.2557, + "step": 45991 + }, + { + "epoch": 0.2735274526596251, + "grad_norm": 1.8680756092071533, + "learning_rate": 4.132432461783269e-05, + "loss": 3.3131, + "step": 45992 + }, + { + "epoch": 0.2735333999429061, + "grad_norm": 1.9284679889678955, + "learning_rate": 4.132397084333347e-05, + "loss": 3.2385, + "step": 45993 + }, + { + "epoch": 0.27353934722618706, + "grad_norm": 1.8108971118927002, + "learning_rate": 4.1323617063135734e-05, + "loss": 3.2843, + "step": 45994 + }, + { + "epoch": 0.2735452945094681, + "grad_norm": 1.829428791999817, + "learning_rate": 4.1323263277239586e-05, + "loss": 3.1038, + "step": 45995 + }, + { + "epoch": 0.2735512417927491, + "grad_norm": 1.96796452999115, + "learning_rate": 4.1322909485645156e-05, + "loss": 3.172, + "step": 45996 + }, + { + "epoch": 0.27355718907603005, + "grad_norm": 1.8986676931381226, + "learning_rate": 4.132255568835257e-05, + "loss": 3.1249, + "step": 45997 + }, + { + "epoch": 0.27356313635931107, + "grad_norm": 1.9387098550796509, + "learning_rate": 4.1322201885361945e-05, + "loss": 3.1325, + "step": 45998 + }, + { + "epoch": 0.2735690836425921, + "grad_norm": 1.9495916366577148, + "learning_rate": 4.132184807667341e-05, + "loss": 3.2136, + "step": 45999 + }, + { + "epoch": 0.27357503092587304, + "grad_norm": 1.8179353475570679, + "learning_rate": 4.1321494262287084e-05, + "loss": 3.1767, + "step": 46000 + }, + { + "epoch": 0.27358097820915406, + "grad_norm": 2.035282850265503, + "learning_rate": 4.13211404422031e-05, + "loss": 3.2224, + "step": 46001 + }, + { + "epoch": 0.2735869254924351, + "grad_norm": 1.9789769649505615, + "learning_rate": 4.132078661642157e-05, + "loss": 3.1956, + "step": 46002 + }, + { + "epoch": 0.27359287277571603, + "grad_norm": 1.8381719589233398, + "learning_rate": 4.132043278494261e-05, + "loss": 3.1753, + "step": 46003 + }, + { + "epoch": 0.27359882005899705, + "grad_norm": 1.6455564498901367, + "learning_rate": 4.132007894776637e-05, + "loss": 4.3503, + "step": 46004 + }, + { + "epoch": 0.27360476734227807, + "grad_norm": 1.8518131971359253, + "learning_rate": 4.131972510489296e-05, + "loss": 3.6206, + "step": 46005 + }, + { + "epoch": 0.273610714625559, + "grad_norm": 1.99249267578125, + "learning_rate": 4.1319371256322494e-05, + "loss": 3.8363, + "step": 46006 + }, + { + "epoch": 0.27361666190884004, + "grad_norm": 1.8295862674713135, + "learning_rate": 4.131901740205511e-05, + "loss": 3.7536, + "step": 46007 + }, + { + "epoch": 0.27362260919212106, + "grad_norm": 1.437567114830017, + "learning_rate": 4.131866354209092e-05, + "loss": 5.1468, + "step": 46008 + }, + { + "epoch": 0.273628556475402, + "grad_norm": 1.6735337972640991, + "learning_rate": 4.131830967643006e-05, + "loss": 4.7048, + "step": 46009 + }, + { + "epoch": 0.27363450375868303, + "grad_norm": 1.536350965499878, + "learning_rate": 4.131795580507265e-05, + "loss": 4.9671, + "step": 46010 + }, + { + "epoch": 0.27364045104196405, + "grad_norm": 1.702219843864441, + "learning_rate": 4.13176019280188e-05, + "loss": 4.7451, + "step": 46011 + }, + { + "epoch": 0.273646398325245, + "grad_norm": 1.3554284572601318, + "learning_rate": 4.131724804526865e-05, + "loss": 4.9505, + "step": 46012 + }, + { + "epoch": 0.273652345608526, + "grad_norm": 1.736100673675537, + "learning_rate": 4.131689415682232e-05, + "loss": 4.6512, + "step": 46013 + }, + { + "epoch": 0.27365829289180704, + "grad_norm": 1.4740862846374512, + "learning_rate": 4.131654026267991e-05, + "loss": 5.3985, + "step": 46014 + }, + { + "epoch": 0.273664240175088, + "grad_norm": 1.4957942962646484, + "learning_rate": 4.131618636284159e-05, + "loss": 5.0734, + "step": 46015 + }, + { + "epoch": 0.273670187458369, + "grad_norm": 1.3050843477249146, + "learning_rate": 4.1315832457307446e-05, + "loss": 5.1645, + "step": 46016 + }, + { + "epoch": 0.27367613474165003, + "grad_norm": 1.6906099319458008, + "learning_rate": 4.1315478546077615e-05, + "loss": 5.0143, + "step": 46017 + }, + { + "epoch": 0.273682082024931, + "grad_norm": 1.3086351156234741, + "learning_rate": 4.131512462915223e-05, + "loss": 5.1698, + "step": 46018 + }, + { + "epoch": 0.273688029308212, + "grad_norm": 1.45748770236969, + "learning_rate": 4.131477070653139e-05, + "loss": 5.0925, + "step": 46019 + }, + { + "epoch": 0.273693976591493, + "grad_norm": 1.2530378103256226, + "learning_rate": 4.131441677821523e-05, + "loss": 5.0094, + "step": 46020 + }, + { + "epoch": 0.273699923874774, + "grad_norm": 1.8628050088882446, + "learning_rate": 4.1314062844203885e-05, + "loss": 4.7651, + "step": 46021 + }, + { + "epoch": 0.273705871158055, + "grad_norm": 1.7665044069290161, + "learning_rate": 4.1313708904497464e-05, + "loss": 4.7185, + "step": 46022 + }, + { + "epoch": 0.273711818441336, + "grad_norm": 1.8174282312393188, + "learning_rate": 4.1313354959096095e-05, + "loss": 4.7243, + "step": 46023 + }, + { + "epoch": 0.273717765724617, + "grad_norm": 2.4195492267608643, + "learning_rate": 4.13130010079999e-05, + "loss": 3.7404, + "step": 46024 + }, + { + "epoch": 0.273723713007898, + "grad_norm": 2.8579397201538086, + "learning_rate": 4.131264705120902e-05, + "loss": 3.28, + "step": 46025 + }, + { + "epoch": 0.273729660291179, + "grad_norm": 2.7402520179748535, + "learning_rate": 4.131229308872354e-05, + "loss": 3.2536, + "step": 46026 + }, + { + "epoch": 0.27373560757445997, + "grad_norm": 2.614023208618164, + "learning_rate": 4.1311939120543624e-05, + "loss": 3.2952, + "step": 46027 + }, + { + "epoch": 0.273741554857741, + "grad_norm": 1.332194209098816, + "learning_rate": 4.131158514666937e-05, + "loss": 4.8776, + "step": 46028 + }, + { + "epoch": 0.273747502141022, + "grad_norm": 1.8293730020523071, + "learning_rate": 4.1311231167100915e-05, + "loss": 4.1559, + "step": 46029 + }, + { + "epoch": 0.27375344942430296, + "grad_norm": 2.043550729751587, + "learning_rate": 4.131087718183838e-05, + "loss": 3.5028, + "step": 46030 + }, + { + "epoch": 0.273759396707584, + "grad_norm": 1.3267338275909424, + "learning_rate": 4.131052319088189e-05, + "loss": 4.789, + "step": 46031 + }, + { + "epoch": 0.273765343990865, + "grad_norm": 1.5835071802139282, + "learning_rate": 4.131016919423155e-05, + "loss": 5.103, + "step": 46032 + }, + { + "epoch": 0.27377129127414596, + "grad_norm": 1.4171127080917358, + "learning_rate": 4.1309815191887504e-05, + "loss": 5.0974, + "step": 46033 + }, + { + "epoch": 0.27377723855742697, + "grad_norm": 1.7296243906021118, + "learning_rate": 4.1309461183849885e-05, + "loss": 5.029, + "step": 46034 + }, + { + "epoch": 0.273783185840708, + "grad_norm": 1.7149790525436401, + "learning_rate": 4.130910717011878e-05, + "loss": 4.6327, + "step": 46035 + }, + { + "epoch": 0.27378913312398895, + "grad_norm": 1.9778841733932495, + "learning_rate": 4.130875315069435e-05, + "loss": 4.6353, + "step": 46036 + }, + { + "epoch": 0.27379508040726996, + "grad_norm": 1.5404632091522217, + "learning_rate": 4.130839912557669e-05, + "loss": 5.0213, + "step": 46037 + }, + { + "epoch": 0.273801027690551, + "grad_norm": 1.8477352857589722, + "learning_rate": 4.1308045094765946e-05, + "loss": 4.8905, + "step": 46038 + }, + { + "epoch": 0.27380697497383194, + "grad_norm": 1.87166166305542, + "learning_rate": 4.1307691058262226e-05, + "loss": 4.8188, + "step": 46039 + }, + { + "epoch": 0.27381292225711296, + "grad_norm": 1.6224892139434814, + "learning_rate": 4.130733701606566e-05, + "loss": 5.2502, + "step": 46040 + }, + { + "epoch": 0.27381886954039397, + "grad_norm": 1.8564457893371582, + "learning_rate": 4.130698296817638e-05, + "loss": 4.3182, + "step": 46041 + }, + { + "epoch": 0.27382481682367493, + "grad_norm": 1.6321443319320679, + "learning_rate": 4.130662891459449e-05, + "loss": 4.4192, + "step": 46042 + }, + { + "epoch": 0.27383076410695595, + "grad_norm": 1.409175992012024, + "learning_rate": 4.130627485532013e-05, + "loss": 4.9612, + "step": 46043 + }, + { + "epoch": 0.27383671139023696, + "grad_norm": 1.64139723777771, + "learning_rate": 4.1305920790353414e-05, + "loss": 4.9591, + "step": 46044 + }, + { + "epoch": 0.2738426586735179, + "grad_norm": 1.4769237041473389, + "learning_rate": 4.130556671969447e-05, + "loss": 4.9653, + "step": 46045 + }, + { + "epoch": 0.27384860595679894, + "grad_norm": 1.4463738203048706, + "learning_rate": 4.130521264334343e-05, + "loss": 4.9873, + "step": 46046 + }, + { + "epoch": 0.27385455324007996, + "grad_norm": 1.468766689300537, + "learning_rate": 4.13048585613004e-05, + "loss": 5.2235, + "step": 46047 + }, + { + "epoch": 0.2738605005233609, + "grad_norm": 1.3875845670700073, + "learning_rate": 4.130450447356551e-05, + "loss": 4.9985, + "step": 46048 + }, + { + "epoch": 0.27386644780664193, + "grad_norm": 1.4667325019836426, + "learning_rate": 4.130415038013889e-05, + "loss": 4.6518, + "step": 46049 + }, + { + "epoch": 0.27387239508992295, + "grad_norm": 1.545955777168274, + "learning_rate": 4.1303796281020654e-05, + "loss": 4.7566, + "step": 46050 + }, + { + "epoch": 0.2738783423732039, + "grad_norm": 1.7413856983184814, + "learning_rate": 4.1303442176210944e-05, + "loss": 4.3948, + "step": 46051 + }, + { + "epoch": 0.2738842896564849, + "grad_norm": 1.7199031114578247, + "learning_rate": 4.130308806570986e-05, + "loss": 4.4907, + "step": 46052 + }, + { + "epoch": 0.27389023693976594, + "grad_norm": 1.7889595031738281, + "learning_rate": 4.130273394951754e-05, + "loss": 4.6851, + "step": 46053 + }, + { + "epoch": 0.2738961842230469, + "grad_norm": 1.688315987586975, + "learning_rate": 4.1302379827634106e-05, + "loss": 4.612, + "step": 46054 + }, + { + "epoch": 0.2739021315063279, + "grad_norm": 1.405844807624817, + "learning_rate": 4.1302025700059673e-05, + "loss": 4.4579, + "step": 46055 + }, + { + "epoch": 0.27390807878960893, + "grad_norm": 1.508713722229004, + "learning_rate": 4.130167156679438e-05, + "loss": 4.2925, + "step": 46056 + }, + { + "epoch": 0.2739140260728899, + "grad_norm": 1.5473136901855469, + "learning_rate": 4.130131742783834e-05, + "loss": 4.7401, + "step": 46057 + }, + { + "epoch": 0.2739199733561709, + "grad_norm": 1.6587575674057007, + "learning_rate": 4.130096328319167e-05, + "loss": 4.1987, + "step": 46058 + }, + { + "epoch": 0.2739259206394519, + "grad_norm": 1.642226219177246, + "learning_rate": 4.130060913285451e-05, + "loss": 4.3238, + "step": 46059 + }, + { + "epoch": 0.2739318679227329, + "grad_norm": 1.808990240097046, + "learning_rate": 4.130025497682697e-05, + "loss": 4.307, + "step": 46060 + }, + { + "epoch": 0.2739378152060139, + "grad_norm": 1.4312530755996704, + "learning_rate": 4.129990081510919e-05, + "loss": 4.8036, + "step": 46061 + }, + { + "epoch": 0.2739437624892949, + "grad_norm": 1.5979284048080444, + "learning_rate": 4.129954664770127e-05, + "loss": 4.7046, + "step": 46062 + }, + { + "epoch": 0.2739497097725759, + "grad_norm": 1.874185562133789, + "learning_rate": 4.129919247460335e-05, + "loss": 4.6121, + "step": 46063 + }, + { + "epoch": 0.2739556570558569, + "grad_norm": 1.829045295715332, + "learning_rate": 4.129883829581556e-05, + "loss": 3.7505, + "step": 46064 + }, + { + "epoch": 0.27396160433913785, + "grad_norm": 1.425889253616333, + "learning_rate": 4.129848411133801e-05, + "loss": 4.4536, + "step": 46065 + }, + { + "epoch": 0.27396755162241887, + "grad_norm": 1.4821641445159912, + "learning_rate": 4.1298129921170823e-05, + "loss": 4.5866, + "step": 46066 + }, + { + "epoch": 0.2739734989056999, + "grad_norm": 1.41972017288208, + "learning_rate": 4.129777572531413e-05, + "loss": 4.498, + "step": 46067 + }, + { + "epoch": 0.27397944618898085, + "grad_norm": 1.4291393756866455, + "learning_rate": 4.129742152376805e-05, + "loss": 4.8755, + "step": 46068 + }, + { + "epoch": 0.27398539347226186, + "grad_norm": 1.6124067306518555, + "learning_rate": 4.1297067316532714e-05, + "loss": 4.865, + "step": 46069 + }, + { + "epoch": 0.2739913407555429, + "grad_norm": 1.7016314268112183, + "learning_rate": 4.129671310360823e-05, + "loss": 4.1753, + "step": 46070 + }, + { + "epoch": 0.27399728803882384, + "grad_norm": 1.5110547542572021, + "learning_rate": 4.129635888499473e-05, + "loss": 4.2611, + "step": 46071 + }, + { + "epoch": 0.27400323532210485, + "grad_norm": 1.515769124031067, + "learning_rate": 4.1296004660692356e-05, + "loss": 4.703, + "step": 46072 + }, + { + "epoch": 0.27400918260538587, + "grad_norm": 1.4375163316726685, + "learning_rate": 4.1295650430701205e-05, + "loss": 4.7766, + "step": 46073 + }, + { + "epoch": 0.27401512988866683, + "grad_norm": 1.5133380889892578, + "learning_rate": 4.129529619502141e-05, + "loss": 5.2005, + "step": 46074 + }, + { + "epoch": 0.27402107717194785, + "grad_norm": 1.257857322692871, + "learning_rate": 4.12949419536531e-05, + "loss": 5.2605, + "step": 46075 + }, + { + "epoch": 0.27402702445522886, + "grad_norm": 1.5891172885894775, + "learning_rate": 4.129458770659639e-05, + "loss": 4.6419, + "step": 46076 + }, + { + "epoch": 0.2740329717385098, + "grad_norm": 1.7749360799789429, + "learning_rate": 4.1294233453851406e-05, + "loss": 4.3876, + "step": 46077 + }, + { + "epoch": 0.27403891902179084, + "grad_norm": 1.7822747230529785, + "learning_rate": 4.1293879195418284e-05, + "loss": 4.441, + "step": 46078 + }, + { + "epoch": 0.27404486630507185, + "grad_norm": 1.3561606407165527, + "learning_rate": 4.1293524931297126e-05, + "loss": 5.1156, + "step": 46079 + }, + { + "epoch": 0.2740508135883528, + "grad_norm": 1.5693987607955933, + "learning_rate": 4.129317066148807e-05, + "loss": 5.1505, + "step": 46080 + }, + { + "epoch": 0.27405676087163383, + "grad_norm": 1.3826020956039429, + "learning_rate": 4.129281638599124e-05, + "loss": 4.8983, + "step": 46081 + }, + { + "epoch": 0.27406270815491485, + "grad_norm": 1.5621029138565063, + "learning_rate": 4.129246210480675e-05, + "loss": 4.9512, + "step": 46082 + }, + { + "epoch": 0.2740686554381958, + "grad_norm": 1.844801902770996, + "learning_rate": 4.129210781793473e-05, + "loss": 4.8548, + "step": 46083 + }, + { + "epoch": 0.2740746027214768, + "grad_norm": 1.5608266592025757, + "learning_rate": 4.129175352537531e-05, + "loss": 5.5325, + "step": 46084 + }, + { + "epoch": 0.27408055000475784, + "grad_norm": 1.5083396434783936, + "learning_rate": 4.1291399227128605e-05, + "loss": 5.1546, + "step": 46085 + }, + { + "epoch": 0.2740864972880388, + "grad_norm": 1.2983343601226807, + "learning_rate": 4.129104492319473e-05, + "loss": 4.8237, + "step": 46086 + }, + { + "epoch": 0.2740924445713198, + "grad_norm": 1.5975979566574097, + "learning_rate": 4.1290690613573835e-05, + "loss": 4.6323, + "step": 46087 + }, + { + "epoch": 0.27409839185460083, + "grad_norm": 1.4610346555709839, + "learning_rate": 4.129033629826602e-05, + "loss": 5.1776, + "step": 46088 + }, + { + "epoch": 0.2741043391378818, + "grad_norm": 1.516589641571045, + "learning_rate": 4.128998197727142e-05, + "loss": 4.8577, + "step": 46089 + }, + { + "epoch": 0.2741102864211628, + "grad_norm": 1.7498186826705933, + "learning_rate": 4.1289627650590156e-05, + "loss": 4.6629, + "step": 46090 + }, + { + "epoch": 0.2741162337044438, + "grad_norm": 1.8648617267608643, + "learning_rate": 4.128927331822235e-05, + "loss": 4.6112, + "step": 46091 + }, + { + "epoch": 0.2741221809877248, + "grad_norm": 1.8441025018692017, + "learning_rate": 4.128891898016812e-05, + "loss": 4.8756, + "step": 46092 + }, + { + "epoch": 0.2741281282710058, + "grad_norm": 1.5838325023651123, + "learning_rate": 4.128856463642761e-05, + "loss": 5.1558, + "step": 46093 + }, + { + "epoch": 0.2741340755542868, + "grad_norm": 1.5196138620376587, + "learning_rate": 4.128821028700092e-05, + "loss": 5.0189, + "step": 46094 + }, + { + "epoch": 0.2741400228375678, + "grad_norm": 1.74821937084198, + "learning_rate": 4.1287855931888185e-05, + "loss": 4.6813, + "step": 46095 + }, + { + "epoch": 0.2741459701208488, + "grad_norm": 1.5487388372421265, + "learning_rate": 4.128750157108954e-05, + "loss": 4.823, + "step": 46096 + }, + { + "epoch": 0.2741519174041298, + "grad_norm": 1.3546361923217773, + "learning_rate": 4.128714720460508e-05, + "loss": 5.2278, + "step": 46097 + }, + { + "epoch": 0.27415786468741077, + "grad_norm": 1.6515135765075684, + "learning_rate": 4.1286792832434955e-05, + "loss": 4.6681, + "step": 46098 + }, + { + "epoch": 0.2741638119706918, + "grad_norm": 1.19709312915802, + "learning_rate": 4.128643845457928e-05, + "loss": 5.0052, + "step": 46099 + }, + { + "epoch": 0.2741697592539728, + "grad_norm": 1.4359065294265747, + "learning_rate": 4.128608407103817e-05, + "loss": 5.006, + "step": 46100 + }, + { + "epoch": 0.27417570653725376, + "grad_norm": 1.5085959434509277, + "learning_rate": 4.128572968181176e-05, + "loss": 5.1144, + "step": 46101 + }, + { + "epoch": 0.2741816538205348, + "grad_norm": 1.7420308589935303, + "learning_rate": 4.128537528690018e-05, + "loss": 4.6152, + "step": 46102 + }, + { + "epoch": 0.2741876011038158, + "grad_norm": 2.497251272201538, + "learning_rate": 4.1285020886303526e-05, + "loss": 3.2921, + "step": 46103 + }, + { + "epoch": 0.27419354838709675, + "grad_norm": 2.4012861251831055, + "learning_rate": 4.128466648002194e-05, + "loss": 3.279, + "step": 46104 + }, + { + "epoch": 0.27419949567037777, + "grad_norm": 1.3649269342422485, + "learning_rate": 4.128431206805557e-05, + "loss": 5.3161, + "step": 46105 + }, + { + "epoch": 0.2742054429536588, + "grad_norm": 1.450717806816101, + "learning_rate": 4.128395765040448e-05, + "loss": 5.0505, + "step": 46106 + }, + { + "epoch": 0.27421139023693974, + "grad_norm": 1.5006554126739502, + "learning_rate": 4.128360322706885e-05, + "loss": 5.0896, + "step": 46107 + }, + { + "epoch": 0.27421733752022076, + "grad_norm": 1.7914799451828003, + "learning_rate": 4.128324879804878e-05, + "loss": 4.746, + "step": 46108 + }, + { + "epoch": 0.2742232848035018, + "grad_norm": 1.8827074766159058, + "learning_rate": 4.128289436334439e-05, + "loss": 4.6234, + "step": 46109 + }, + { + "epoch": 0.27422923208678274, + "grad_norm": 1.826460361480713, + "learning_rate": 4.128253992295582e-05, + "loss": 4.5912, + "step": 46110 + }, + { + "epoch": 0.27423517937006375, + "grad_norm": 1.5975192785263062, + "learning_rate": 4.1282185476883176e-05, + "loss": 5.1718, + "step": 46111 + }, + { + "epoch": 0.27424112665334477, + "grad_norm": 1.4200212955474854, + "learning_rate": 4.1281831025126595e-05, + "loss": 5.056, + "step": 46112 + }, + { + "epoch": 0.27424707393662573, + "grad_norm": 1.444554090499878, + "learning_rate": 4.1281476567686186e-05, + "loss": 5.1297, + "step": 46113 + }, + { + "epoch": 0.27425302121990675, + "grad_norm": 1.5609126091003418, + "learning_rate": 4.1281122104562084e-05, + "loss": 5.0368, + "step": 46114 + }, + { + "epoch": 0.27425896850318776, + "grad_norm": 1.299084186553955, + "learning_rate": 4.128076763575441e-05, + "loss": 5.1584, + "step": 46115 + }, + { + "epoch": 0.2742649157864687, + "grad_norm": 1.845511555671692, + "learning_rate": 4.12804131612633e-05, + "loss": 4.9824, + "step": 46116 + }, + { + "epoch": 0.27427086306974974, + "grad_norm": 2.482079267501831, + "learning_rate": 4.1280058681088856e-05, + "loss": 4.0647, + "step": 46117 + }, + { + "epoch": 0.27427681035303075, + "grad_norm": 1.8575667142868042, + "learning_rate": 4.127970419523121e-05, + "loss": 4.8056, + "step": 46118 + }, + { + "epoch": 0.2742827576363117, + "grad_norm": 1.702210545539856, + "learning_rate": 4.1279349703690494e-05, + "loss": 4.7824, + "step": 46119 + }, + { + "epoch": 0.27428870491959273, + "grad_norm": 1.661238193511963, + "learning_rate": 4.127899520646682e-05, + "loss": 4.7408, + "step": 46120 + }, + { + "epoch": 0.27429465220287375, + "grad_norm": 1.5828320980072021, + "learning_rate": 4.127864070356032e-05, + "loss": 4.5296, + "step": 46121 + }, + { + "epoch": 0.2743005994861547, + "grad_norm": 1.4086449146270752, + "learning_rate": 4.127828619497112e-05, + "loss": 5.0774, + "step": 46122 + }, + { + "epoch": 0.2743065467694357, + "grad_norm": 1.496623158454895, + "learning_rate": 4.127793168069933e-05, + "loss": 4.8957, + "step": 46123 + }, + { + "epoch": 0.27431249405271674, + "grad_norm": 1.3799477815628052, + "learning_rate": 4.127757716074508e-05, + "loss": 5.6368, + "step": 46124 + }, + { + "epoch": 0.2743184413359977, + "grad_norm": 1.2933743000030518, + "learning_rate": 4.12772226351085e-05, + "loss": 5.6765, + "step": 46125 + }, + { + "epoch": 0.2743243886192787, + "grad_norm": 1.5048894882202148, + "learning_rate": 4.1276868103789715e-05, + "loss": 5.2929, + "step": 46126 + }, + { + "epoch": 0.27433033590255973, + "grad_norm": 1.3347049951553345, + "learning_rate": 4.127651356678883e-05, + "loss": 5.4191, + "step": 46127 + }, + { + "epoch": 0.2743362831858407, + "grad_norm": 1.8256614208221436, + "learning_rate": 4.127615902410599e-05, + "loss": 4.7039, + "step": 46128 + }, + { + "epoch": 0.2743422304691217, + "grad_norm": 1.738516092300415, + "learning_rate": 4.127580447574131e-05, + "loss": 4.4506, + "step": 46129 + }, + { + "epoch": 0.2743481777524027, + "grad_norm": 1.7977484464645386, + "learning_rate": 4.127544992169492e-05, + "loss": 3.4941, + "step": 46130 + }, + { + "epoch": 0.2743541250356837, + "grad_norm": 1.8887935876846313, + "learning_rate": 4.127509536196694e-05, + "loss": 3.2411, + "step": 46131 + }, + { + "epoch": 0.2743600723189647, + "grad_norm": 1.8072736263275146, + "learning_rate": 4.1274740796557484e-05, + "loss": 3.2083, + "step": 46132 + }, + { + "epoch": 0.2743660196022457, + "grad_norm": 1.7755714654922485, + "learning_rate": 4.1274386225466687e-05, + "loss": 3.1123, + "step": 46133 + }, + { + "epoch": 0.2743719668855267, + "grad_norm": 1.9266663789749146, + "learning_rate": 4.127403164869467e-05, + "loss": 3.2307, + "step": 46134 + }, + { + "epoch": 0.2743779141688077, + "grad_norm": 1.9516514539718628, + "learning_rate": 4.1273677066241556e-05, + "loss": 3.2973, + "step": 46135 + }, + { + "epoch": 0.2743838614520887, + "grad_norm": 1.8159171342849731, + "learning_rate": 4.127332247810747e-05, + "loss": 3.2798, + "step": 46136 + }, + { + "epoch": 0.27438980873536967, + "grad_norm": 1.9370509386062622, + "learning_rate": 4.1272967884292537e-05, + "loss": 3.0966, + "step": 46137 + }, + { + "epoch": 0.2743957560186507, + "grad_norm": 1.8243327140808105, + "learning_rate": 4.127261328479688e-05, + "loss": 3.2332, + "step": 46138 + }, + { + "epoch": 0.2744017033019317, + "grad_norm": 1.9843389987945557, + "learning_rate": 4.127225867962062e-05, + "loss": 3.1726, + "step": 46139 + }, + { + "epoch": 0.27440765058521266, + "grad_norm": 1.795864224433899, + "learning_rate": 4.127190406876388e-05, + "loss": 3.8035, + "step": 46140 + }, + { + "epoch": 0.2744135978684937, + "grad_norm": 1.810988426208496, + "learning_rate": 4.1271549452226784e-05, + "loss": 3.6477, + "step": 46141 + }, + { + "epoch": 0.2744195451517747, + "grad_norm": 2.187892436981201, + "learning_rate": 4.127119483000947e-05, + "loss": 3.0375, + "step": 46142 + }, + { + "epoch": 0.27442549243505565, + "grad_norm": 2.6590614318847656, + "learning_rate": 4.127084020211204e-05, + "loss": 2.8673, + "step": 46143 + }, + { + "epoch": 0.27443143971833667, + "grad_norm": 2.1317801475524902, + "learning_rate": 4.1270485568534636e-05, + "loss": 2.7975, + "step": 46144 + }, + { + "epoch": 0.2744373870016177, + "grad_norm": 2.015737533569336, + "learning_rate": 4.1270130929277365e-05, + "loss": 2.8884, + "step": 46145 + }, + { + "epoch": 0.27444333428489864, + "grad_norm": 2.2858848571777344, + "learning_rate": 4.126977628434036e-05, + "loss": 3.0289, + "step": 46146 + }, + { + "epoch": 0.27444928156817966, + "grad_norm": 2.237933874130249, + "learning_rate": 4.126942163372375e-05, + "loss": 2.9913, + "step": 46147 + }, + { + "epoch": 0.2744552288514607, + "grad_norm": 2.1306540966033936, + "learning_rate": 4.1269066977427653e-05, + "loss": 2.9033, + "step": 46148 + }, + { + "epoch": 0.27446117613474164, + "grad_norm": 2.3869965076446533, + "learning_rate": 4.1268712315452186e-05, + "loss": 2.8959, + "step": 46149 + }, + { + "epoch": 0.27446712341802265, + "grad_norm": 2.354491949081421, + "learning_rate": 4.1268357647797485e-05, + "loss": 3.3744, + "step": 46150 + }, + { + "epoch": 0.27447307070130367, + "grad_norm": 1.739743709564209, + "learning_rate": 4.126800297446366e-05, + "loss": 4.9974, + "step": 46151 + }, + { + "epoch": 0.27447901798458463, + "grad_norm": 1.7905662059783936, + "learning_rate": 4.1267648295450854e-05, + "loss": 5.0062, + "step": 46152 + }, + { + "epoch": 0.27448496526786564, + "grad_norm": 2.412614583969116, + "learning_rate": 4.1267293610759176e-05, + "loss": 3.8329, + "step": 46153 + }, + { + "epoch": 0.27449091255114666, + "grad_norm": 2.481851100921631, + "learning_rate": 4.126693892038875e-05, + "loss": 3.8474, + "step": 46154 + }, + { + "epoch": 0.2744968598344276, + "grad_norm": 2.4358737468719482, + "learning_rate": 4.126658422433971e-05, + "loss": 3.7941, + "step": 46155 + }, + { + "epoch": 0.27450280711770864, + "grad_norm": 2.1686971187591553, + "learning_rate": 4.1266229522612173e-05, + "loss": 3.7575, + "step": 46156 + }, + { + "epoch": 0.27450875440098965, + "grad_norm": 2.371582269668579, + "learning_rate": 4.126587481520626e-05, + "loss": 3.6304, + "step": 46157 + }, + { + "epoch": 0.2745147016842706, + "grad_norm": 2.6072447299957275, + "learning_rate": 4.12655201021221e-05, + "loss": 3.6583, + "step": 46158 + }, + { + "epoch": 0.27452064896755163, + "grad_norm": 2.2264256477355957, + "learning_rate": 4.126516538335982e-05, + "loss": 3.6508, + "step": 46159 + }, + { + "epoch": 0.27452659625083264, + "grad_norm": 2.3935751914978027, + "learning_rate": 4.1264810658919526e-05, + "loss": 3.6762, + "step": 46160 + }, + { + "epoch": 0.2745325435341136, + "grad_norm": 2.2404563426971436, + "learning_rate": 4.126445592880136e-05, + "loss": 3.5681, + "step": 46161 + }, + { + "epoch": 0.2745384908173946, + "grad_norm": 1.6988742351531982, + "learning_rate": 4.126410119300544e-05, + "loss": 4.612, + "step": 46162 + }, + { + "epoch": 0.27454443810067564, + "grad_norm": 1.9629842042922974, + "learning_rate": 4.12637464515319e-05, + "loss": 4.3595, + "step": 46163 + }, + { + "epoch": 0.2745503853839566, + "grad_norm": 1.8073927164077759, + "learning_rate": 4.1263391704380846e-05, + "loss": 4.1392, + "step": 46164 + }, + { + "epoch": 0.2745563326672376, + "grad_norm": 1.8148568868637085, + "learning_rate": 4.1263036951552415e-05, + "loss": 4.4874, + "step": 46165 + }, + { + "epoch": 0.27456227995051863, + "grad_norm": 1.6860933303833008, + "learning_rate": 4.126268219304672e-05, + "loss": 4.4578, + "step": 46166 + }, + { + "epoch": 0.2745682272337996, + "grad_norm": 1.530277132987976, + "learning_rate": 4.126232742886389e-05, + "loss": 4.4506, + "step": 46167 + }, + { + "epoch": 0.2745741745170806, + "grad_norm": 1.3951627016067505, + "learning_rate": 4.1261972659004056e-05, + "loss": 4.8383, + "step": 46168 + }, + { + "epoch": 0.2745801218003616, + "grad_norm": 1.8678330183029175, + "learning_rate": 4.126161788346733e-05, + "loss": 4.6718, + "step": 46169 + }, + { + "epoch": 0.2745860690836426, + "grad_norm": 2.836393356323242, + "learning_rate": 4.1261263102253834e-05, + "loss": 3.5316, + "step": 46170 + }, + { + "epoch": 0.2745920163669236, + "grad_norm": 2.210344076156616, + "learning_rate": 4.1260908315363714e-05, + "loss": 3.4998, + "step": 46171 + }, + { + "epoch": 0.2745979636502046, + "grad_norm": 1.8740160465240479, + "learning_rate": 4.126055352279707e-05, + "loss": 4.7826, + "step": 46172 + }, + { + "epoch": 0.2746039109334856, + "grad_norm": 1.6485148668289185, + "learning_rate": 4.126019872455403e-05, + "loss": 4.1021, + "step": 46173 + }, + { + "epoch": 0.2746098582167666, + "grad_norm": 1.5942952632904053, + "learning_rate": 4.1259843920634735e-05, + "loss": 4.2744, + "step": 46174 + }, + { + "epoch": 0.2746158055000476, + "grad_norm": 1.3920525312423706, + "learning_rate": 4.1259489111039294e-05, + "loss": 4.5022, + "step": 46175 + }, + { + "epoch": 0.27462175278332857, + "grad_norm": 1.465614914894104, + "learning_rate": 4.125913429576783e-05, + "loss": 5.1012, + "step": 46176 + }, + { + "epoch": 0.2746277000666096, + "grad_norm": 1.3128267526626587, + "learning_rate": 4.1258779474820475e-05, + "loss": 4.8686, + "step": 46177 + }, + { + "epoch": 0.2746336473498906, + "grad_norm": 1.3610827922821045, + "learning_rate": 4.125842464819733e-05, + "loss": 4.9375, + "step": 46178 + }, + { + "epoch": 0.27463959463317156, + "grad_norm": 1.534157633781433, + "learning_rate": 4.1258069815898556e-05, + "loss": 4.7529, + "step": 46179 + }, + { + "epoch": 0.2746455419164526, + "grad_norm": 2.0995116233825684, + "learning_rate": 4.125771497792426e-05, + "loss": 4.3146, + "step": 46180 + }, + { + "epoch": 0.2746514891997336, + "grad_norm": 1.9438445568084717, + "learning_rate": 4.125736013427455e-05, + "loss": 4.3028, + "step": 46181 + }, + { + "epoch": 0.27465743648301455, + "grad_norm": 2.422401189804077, + "learning_rate": 4.125700528494957e-05, + "loss": 5.1968, + "step": 46182 + }, + { + "epoch": 0.27466338376629557, + "grad_norm": 1.8259403705596924, + "learning_rate": 4.125665042994944e-05, + "loss": 5.2148, + "step": 46183 + }, + { + "epoch": 0.2746693310495765, + "grad_norm": 3.0869333744049072, + "learning_rate": 4.1256295569274276e-05, + "loss": 3.6571, + "step": 46184 + }, + { + "epoch": 0.27467527833285754, + "grad_norm": 2.3924150466918945, + "learning_rate": 4.125594070292421e-05, + "loss": 3.6707, + "step": 46185 + }, + { + "epoch": 0.27468122561613856, + "grad_norm": 2.4759631156921387, + "learning_rate": 4.1255585830899366e-05, + "loss": 3.3317, + "step": 46186 + }, + { + "epoch": 0.2746871728994195, + "grad_norm": 2.2190024852752686, + "learning_rate": 4.125523095319986e-05, + "loss": 3.5257, + "step": 46187 + }, + { + "epoch": 0.27469312018270053, + "grad_norm": 2.2584967613220215, + "learning_rate": 4.125487606982582e-05, + "loss": 3.6052, + "step": 46188 + }, + { + "epoch": 0.27469906746598155, + "grad_norm": 2.4576122760772705, + "learning_rate": 4.125452118077737e-05, + "loss": 3.2953, + "step": 46189 + }, + { + "epoch": 0.2747050147492625, + "grad_norm": 1.809865117073059, + "learning_rate": 4.125416628605464e-05, + "loss": 4.6378, + "step": 46190 + }, + { + "epoch": 0.2747109620325435, + "grad_norm": 2.1988916397094727, + "learning_rate": 4.125381138565775e-05, + "loss": 3.574, + "step": 46191 + }, + { + "epoch": 0.27471690931582454, + "grad_norm": 2.2043166160583496, + "learning_rate": 4.1253456479586816e-05, + "loss": 3.7004, + "step": 46192 + }, + { + "epoch": 0.2747228565991055, + "grad_norm": 2.2249927520751953, + "learning_rate": 4.1253101567841966e-05, + "loss": 3.8639, + "step": 46193 + }, + { + "epoch": 0.2747288038823865, + "grad_norm": 2.189068555831909, + "learning_rate": 4.125274665042334e-05, + "loss": 3.5551, + "step": 46194 + }, + { + "epoch": 0.27473475116566753, + "grad_norm": 1.9947869777679443, + "learning_rate": 4.125239172733103e-05, + "loss": 3.4196, + "step": 46195 + }, + { + "epoch": 0.2747406984489485, + "grad_norm": 1.9912019968032837, + "learning_rate": 4.125203679856519e-05, + "loss": 3.3918, + "step": 46196 + }, + { + "epoch": 0.2747466457322295, + "grad_norm": 1.8800561428070068, + "learning_rate": 4.125168186412593e-05, + "loss": 4.7122, + "step": 46197 + }, + { + "epoch": 0.2747525930155105, + "grad_norm": 1.7100132703781128, + "learning_rate": 4.1251326924013375e-05, + "loss": 5.0089, + "step": 46198 + }, + { + "epoch": 0.2747585402987915, + "grad_norm": 1.2724366188049316, + "learning_rate": 4.125097197822765e-05, + "loss": 4.59, + "step": 46199 + }, + { + "epoch": 0.2747644875820725, + "grad_norm": 1.4347710609436035, + "learning_rate": 4.125061702676888e-05, + "loss": 4.6822, + "step": 46200 + }, + { + "epoch": 0.2747704348653535, + "grad_norm": 1.3129082918167114, + "learning_rate": 4.125026206963718e-05, + "loss": 4.8355, + "step": 46201 + }, + { + "epoch": 0.2747763821486345, + "grad_norm": 1.2473912239074707, + "learning_rate": 4.1249907106832686e-05, + "loss": 4.767, + "step": 46202 + }, + { + "epoch": 0.2747823294319155, + "grad_norm": 1.2876032590866089, + "learning_rate": 4.124955213835552e-05, + "loss": 4.5873, + "step": 46203 + }, + { + "epoch": 0.2747882767151965, + "grad_norm": 1.6242700815200806, + "learning_rate": 4.12491971642058e-05, + "loss": 4.5541, + "step": 46204 + }, + { + "epoch": 0.27479422399847747, + "grad_norm": 1.3276612758636475, + "learning_rate": 4.124884218438365e-05, + "loss": 4.5673, + "step": 46205 + }, + { + "epoch": 0.2748001712817585, + "grad_norm": 1.4928336143493652, + "learning_rate": 4.1248487198889205e-05, + "loss": 4.7504, + "step": 46206 + }, + { + "epoch": 0.2748061185650395, + "grad_norm": 1.4044466018676758, + "learning_rate": 4.1248132207722575e-05, + "loss": 4.8063, + "step": 46207 + }, + { + "epoch": 0.27481206584832046, + "grad_norm": 1.49996817111969, + "learning_rate": 4.12477772108839e-05, + "loss": 4.7879, + "step": 46208 + }, + { + "epoch": 0.2748180131316015, + "grad_norm": 1.4280827045440674, + "learning_rate": 4.124742220837329e-05, + "loss": 4.6763, + "step": 46209 + }, + { + "epoch": 0.2748239604148825, + "grad_norm": 1.456724762916565, + "learning_rate": 4.124706720019086e-05, + "loss": 4.682, + "step": 46210 + }, + { + "epoch": 0.27482990769816346, + "grad_norm": 1.6407270431518555, + "learning_rate": 4.124671218633676e-05, + "loss": 4.7249, + "step": 46211 + }, + { + "epoch": 0.27483585498144447, + "grad_norm": 1.367529034614563, + "learning_rate": 4.124635716681109e-05, + "loss": 4.7027, + "step": 46212 + }, + { + "epoch": 0.2748418022647255, + "grad_norm": 1.7050827741622925, + "learning_rate": 4.124600214161399e-05, + "loss": 5.578, + "step": 46213 + }, + { + "epoch": 0.27484774954800645, + "grad_norm": 1.6925042867660522, + "learning_rate": 4.124564711074558e-05, + "loss": 5.554, + "step": 46214 + }, + { + "epoch": 0.27485369683128746, + "grad_norm": 1.4739962816238403, + "learning_rate": 4.124529207420599e-05, + "loss": 5.2641, + "step": 46215 + }, + { + "epoch": 0.2748596441145685, + "grad_norm": 3.2829408645629883, + "learning_rate": 4.1244937031995324e-05, + "loss": 4.6718, + "step": 46216 + }, + { + "epoch": 0.27486559139784944, + "grad_norm": 1.7848970890045166, + "learning_rate": 4.124458198411372e-05, + "loss": 4.9073, + "step": 46217 + }, + { + "epoch": 0.27487153868113046, + "grad_norm": 1.7990310192108154, + "learning_rate": 4.1244226930561304e-05, + "loss": 5.0811, + "step": 46218 + }, + { + "epoch": 0.27487748596441147, + "grad_norm": 1.558308482170105, + "learning_rate": 4.124387187133819e-05, + "loss": 5.0451, + "step": 46219 + }, + { + "epoch": 0.27488343324769243, + "grad_norm": 1.8958772420883179, + "learning_rate": 4.1243516806444514e-05, + "loss": 5.1993, + "step": 46220 + }, + { + "epoch": 0.27488938053097345, + "grad_norm": 1.9984842538833618, + "learning_rate": 4.12431617358804e-05, + "loss": 4.8248, + "step": 46221 + }, + { + "epoch": 0.27489532781425446, + "grad_norm": 1.5728363990783691, + "learning_rate": 4.124280665964596e-05, + "loss": 4.4414, + "step": 46222 + }, + { + "epoch": 0.2749012750975354, + "grad_norm": 1.7830369472503662, + "learning_rate": 4.124245157774132e-05, + "loss": 4.2311, + "step": 46223 + }, + { + "epoch": 0.27490722238081644, + "grad_norm": 1.601627230644226, + "learning_rate": 4.1242096490166615e-05, + "loss": 4.8935, + "step": 46224 + }, + { + "epoch": 0.27491316966409746, + "grad_norm": 1.466048002243042, + "learning_rate": 4.124174139692195e-05, + "loss": 4.8115, + "step": 46225 + }, + { + "epoch": 0.2749191169473784, + "grad_norm": 1.5829848051071167, + "learning_rate": 4.1241386298007465e-05, + "loss": 4.8696, + "step": 46226 + }, + { + "epoch": 0.27492506423065943, + "grad_norm": 1.4965269565582275, + "learning_rate": 4.12410311934233e-05, + "loss": 4.7518, + "step": 46227 + }, + { + "epoch": 0.27493101151394045, + "grad_norm": 1.4047186374664307, + "learning_rate": 4.1240676083169536e-05, + "loss": 4.531, + "step": 46228 + }, + { + "epoch": 0.2749369587972214, + "grad_norm": 1.2659904956817627, + "learning_rate": 4.124032096724632e-05, + "loss": 4.6599, + "step": 46229 + }, + { + "epoch": 0.2749429060805024, + "grad_norm": 1.6452581882476807, + "learning_rate": 4.1239965845653784e-05, + "loss": 4.9355, + "step": 46230 + }, + { + "epoch": 0.27494885336378344, + "grad_norm": 1.5029197931289673, + "learning_rate": 4.123961071839204e-05, + "loss": 5.1925, + "step": 46231 + }, + { + "epoch": 0.2749548006470644, + "grad_norm": 2.31976056098938, + "learning_rate": 4.1239255585461225e-05, + "loss": 3.6962, + "step": 46232 + }, + { + "epoch": 0.2749607479303454, + "grad_norm": 2.0741798877716064, + "learning_rate": 4.1238900446861436e-05, + "loss": 3.93, + "step": 46233 + }, + { + "epoch": 0.27496669521362643, + "grad_norm": 2.285489559173584, + "learning_rate": 4.1238545302592835e-05, + "loss": 4.048, + "step": 46234 + }, + { + "epoch": 0.2749726424969074, + "grad_norm": 2.0764570236206055, + "learning_rate": 4.123819015265551e-05, + "loss": 3.6747, + "step": 46235 + }, + { + "epoch": 0.2749785897801884, + "grad_norm": 1.9376609325408936, + "learning_rate": 4.1237834997049604e-05, + "loss": 3.785, + "step": 46236 + }, + { + "epoch": 0.2749845370634694, + "grad_norm": 1.4978312253952026, + "learning_rate": 4.1237479835775244e-05, + "loss": 4.6769, + "step": 46237 + }, + { + "epoch": 0.2749904843467504, + "grad_norm": 1.5898009538650513, + "learning_rate": 4.123712466883254e-05, + "loss": 4.7458, + "step": 46238 + }, + { + "epoch": 0.2749964316300314, + "grad_norm": 1.5089272260665894, + "learning_rate": 4.123676949622163e-05, + "loss": 4.8081, + "step": 46239 + }, + { + "epoch": 0.2750023789133124, + "grad_norm": 1.6137049198150635, + "learning_rate": 4.123641431794263e-05, + "loss": 4.6074, + "step": 46240 + }, + { + "epoch": 0.2750083261965934, + "grad_norm": 1.6270198822021484, + "learning_rate": 4.1236059133995656e-05, + "loss": 5.1282, + "step": 46241 + }, + { + "epoch": 0.2750142734798744, + "grad_norm": 1.6744282245635986, + "learning_rate": 4.1235703944380855e-05, + "loss": 4.7591, + "step": 46242 + }, + { + "epoch": 0.2750202207631554, + "grad_norm": 2.5355544090270996, + "learning_rate": 4.1235348749098326e-05, + "loss": 3.6028, + "step": 46243 + }, + { + "epoch": 0.27502616804643637, + "grad_norm": 2.172025203704834, + "learning_rate": 4.123499354814821e-05, + "loss": 4.3216, + "step": 46244 + }, + { + "epoch": 0.2750321153297174, + "grad_norm": 1.534201741218567, + "learning_rate": 4.123463834153063e-05, + "loss": 5.0017, + "step": 46245 + }, + { + "epoch": 0.2750380626129984, + "grad_norm": 1.5022629499435425, + "learning_rate": 4.12342831292457e-05, + "loss": 5.0232, + "step": 46246 + }, + { + "epoch": 0.27504400989627936, + "grad_norm": 1.546716570854187, + "learning_rate": 4.123392791129355e-05, + "loss": 5.0124, + "step": 46247 + }, + { + "epoch": 0.2750499571795604, + "grad_norm": 2.320678234100342, + "learning_rate": 4.1233572687674304e-05, + "loss": 3.1384, + "step": 46248 + }, + { + "epoch": 0.2750559044628414, + "grad_norm": 3.2179276943206787, + "learning_rate": 4.1233217458388085e-05, + "loss": 1.5689, + "step": 46249 + }, + { + "epoch": 0.27506185174612235, + "grad_norm": 3.1816229820251465, + "learning_rate": 4.123286222343502e-05, + "loss": 1.953, + "step": 46250 + }, + { + "epoch": 0.27506779902940337, + "grad_norm": 1.7213493585586548, + "learning_rate": 4.123250698281523e-05, + "loss": 4.7868, + "step": 46251 + }, + { + "epoch": 0.2750737463126844, + "grad_norm": 1.6481515169143677, + "learning_rate": 4.123215173652883e-05, + "loss": 4.4338, + "step": 46252 + }, + { + "epoch": 0.27507969359596535, + "grad_norm": 1.5531747341156006, + "learning_rate": 4.123179648457597e-05, + "loss": 4.9726, + "step": 46253 + }, + { + "epoch": 0.27508564087924636, + "grad_norm": 1.5992461442947388, + "learning_rate": 4.1231441226956744e-05, + "loss": 5.0061, + "step": 46254 + }, + { + "epoch": 0.2750915881625274, + "grad_norm": 1.5691217184066772, + "learning_rate": 4.123108596367129e-05, + "loss": 4.9996, + "step": 46255 + }, + { + "epoch": 0.27509753544580834, + "grad_norm": 1.825803518295288, + "learning_rate": 4.123073069471974e-05, + "loss": 4.304, + "step": 46256 + }, + { + "epoch": 0.27510348272908935, + "grad_norm": 1.7822233438491821, + "learning_rate": 4.123037542010221e-05, + "loss": 4.6646, + "step": 46257 + }, + { + "epoch": 0.27510943001237037, + "grad_norm": 1.6936640739440918, + "learning_rate": 4.123002013981881e-05, + "loss": 4.6191, + "step": 46258 + }, + { + "epoch": 0.27511537729565133, + "grad_norm": 1.7635455131530762, + "learning_rate": 4.122966485386969e-05, + "loss": 4.7878, + "step": 46259 + }, + { + "epoch": 0.27512132457893235, + "grad_norm": 1.8808941841125488, + "learning_rate": 4.122930956225496e-05, + "loss": 4.6753, + "step": 46260 + }, + { + "epoch": 0.27512727186221336, + "grad_norm": 1.6380410194396973, + "learning_rate": 4.1228954264974744e-05, + "loss": 4.5652, + "step": 46261 + }, + { + "epoch": 0.2751332191454943, + "grad_norm": 1.600900411605835, + "learning_rate": 4.122859896202917e-05, + "loss": 4.9486, + "step": 46262 + }, + { + "epoch": 0.27513916642877534, + "grad_norm": 1.7685341835021973, + "learning_rate": 4.122824365341836e-05, + "loss": 4.5296, + "step": 46263 + }, + { + "epoch": 0.27514511371205636, + "grad_norm": 2.6545424461364746, + "learning_rate": 4.122788833914244e-05, + "loss": 3.247, + "step": 46264 + }, + { + "epoch": 0.2751510609953373, + "grad_norm": 2.481581926345825, + "learning_rate": 4.122753301920153e-05, + "loss": 3.2329, + "step": 46265 + }, + { + "epoch": 0.27515700827861833, + "grad_norm": 2.5032413005828857, + "learning_rate": 4.122717769359575e-05, + "loss": 3.2995, + "step": 46266 + }, + { + "epoch": 0.27516295556189935, + "grad_norm": 2.40047025680542, + "learning_rate": 4.122682236232523e-05, + "loss": 3.2761, + "step": 46267 + }, + { + "epoch": 0.2751689028451803, + "grad_norm": 2.1254444122314453, + "learning_rate": 4.1226467025390096e-05, + "loss": 3.1498, + "step": 46268 + }, + { + "epoch": 0.2751748501284613, + "grad_norm": 2.169954776763916, + "learning_rate": 4.122611168279047e-05, + "loss": 3.1385, + "step": 46269 + }, + { + "epoch": 0.27518079741174234, + "grad_norm": 2.2174501419067383, + "learning_rate": 4.1225756334526474e-05, + "loss": 3.2042, + "step": 46270 + }, + { + "epoch": 0.2751867446950233, + "grad_norm": 2.067911386489868, + "learning_rate": 4.122540098059824e-05, + "loss": 3.0722, + "step": 46271 + }, + { + "epoch": 0.2751926919783043, + "grad_norm": 2.389986991882324, + "learning_rate": 4.122504562100589e-05, + "loss": 3.0334, + "step": 46272 + }, + { + "epoch": 0.27519863926158533, + "grad_norm": 2.319524049758911, + "learning_rate": 4.1224690255749534e-05, + "loss": 3.0209, + "step": 46273 + }, + { + "epoch": 0.2752045865448663, + "grad_norm": 1.9961400032043457, + "learning_rate": 4.122433488482931e-05, + "loss": 3.0383, + "step": 46274 + }, + { + "epoch": 0.2752105338281473, + "grad_norm": 2.1424777507781982, + "learning_rate": 4.1223979508245335e-05, + "loss": 3.046, + "step": 46275 + }, + { + "epoch": 0.2752164811114283, + "grad_norm": 2.2459633350372314, + "learning_rate": 4.122362412599774e-05, + "loss": 3.1613, + "step": 46276 + }, + { + "epoch": 0.2752224283947093, + "grad_norm": 1.6937342882156372, + "learning_rate": 4.122326873808665e-05, + "loss": 4.3508, + "step": 46277 + }, + { + "epoch": 0.2752283756779903, + "grad_norm": 1.5506948232650757, + "learning_rate": 4.122291334451218e-05, + "loss": 4.3938, + "step": 46278 + }, + { + "epoch": 0.2752343229612713, + "grad_norm": 1.6345840692520142, + "learning_rate": 4.1222557945274454e-05, + "loss": 4.8578, + "step": 46279 + }, + { + "epoch": 0.2752402702445523, + "grad_norm": 1.733811855316162, + "learning_rate": 4.1222202540373605e-05, + "loss": 4.6352, + "step": 46280 + }, + { + "epoch": 0.2752462175278333, + "grad_norm": 2.25563907623291, + "learning_rate": 4.122184712980975e-05, + "loss": 3.1655, + "step": 46281 + }, + { + "epoch": 0.2752521648111143, + "grad_norm": 2.4891695976257324, + "learning_rate": 4.122149171358302e-05, + "loss": 2.7708, + "step": 46282 + }, + { + "epoch": 0.27525811209439527, + "grad_norm": 1.8358043432235718, + "learning_rate": 4.122113629169354e-05, + "loss": 3.062, + "step": 46283 + }, + { + "epoch": 0.2752640593776763, + "grad_norm": 1.5947325229644775, + "learning_rate": 4.1220780864141416e-05, + "loss": 4.9931, + "step": 46284 + }, + { + "epoch": 0.2752700066609573, + "grad_norm": 1.6800808906555176, + "learning_rate": 4.122042543092679e-05, + "loss": 4.9757, + "step": 46285 + }, + { + "epoch": 0.27527595394423826, + "grad_norm": 1.3695666790008545, + "learning_rate": 4.122006999204978e-05, + "loss": 4.9193, + "step": 46286 + }, + { + "epoch": 0.2752819012275193, + "grad_norm": 1.5466188192367554, + "learning_rate": 4.1219714547510516e-05, + "loss": 4.9138, + "step": 46287 + }, + { + "epoch": 0.2752878485108003, + "grad_norm": 3.5648727416992188, + "learning_rate": 4.121935909730912e-05, + "loss": 3.8563, + "step": 46288 + }, + { + "epoch": 0.27529379579408125, + "grad_norm": 3.945028781890869, + "learning_rate": 4.1219003641445706e-05, + "loss": 3.6281, + "step": 46289 + }, + { + "epoch": 0.27529974307736227, + "grad_norm": 2.4756417274475098, + "learning_rate": 4.1218648179920406e-05, + "loss": 4.0891, + "step": 46290 + }, + { + "epoch": 0.2753056903606433, + "grad_norm": 1.7653741836547852, + "learning_rate": 4.121829271273334e-05, + "loss": 4.2678, + "step": 46291 + }, + { + "epoch": 0.27531163764392425, + "grad_norm": 1.8334170579910278, + "learning_rate": 4.121793723988464e-05, + "loss": 4.7658, + "step": 46292 + }, + { + "epoch": 0.27531758492720526, + "grad_norm": 1.6278693675994873, + "learning_rate": 4.1217581761374434e-05, + "loss": 4.5434, + "step": 46293 + }, + { + "epoch": 0.2753235322104863, + "grad_norm": 1.936920166015625, + "learning_rate": 4.121722627720283e-05, + "loss": 3.5598, + "step": 46294 + }, + { + "epoch": 0.27532947949376724, + "grad_norm": 2.592510223388672, + "learning_rate": 4.121687078736995e-05, + "loss": 2.9075, + "step": 46295 + }, + { + "epoch": 0.27533542677704825, + "grad_norm": 2.2087111473083496, + "learning_rate": 4.121651529187595e-05, + "loss": 2.7632, + "step": 46296 + }, + { + "epoch": 0.27534137406032927, + "grad_norm": 2.1168899536132812, + "learning_rate": 4.121615979072091e-05, + "loss": 2.922, + "step": 46297 + }, + { + "epoch": 0.27534732134361023, + "grad_norm": 2.3921732902526855, + "learning_rate": 4.121580428390499e-05, + "loss": 2.8304, + "step": 46298 + }, + { + "epoch": 0.27535326862689125, + "grad_norm": 2.176006317138672, + "learning_rate": 4.1215448771428294e-05, + "loss": 2.9653, + "step": 46299 + }, + { + "epoch": 0.2753592159101722, + "grad_norm": 2.330838441848755, + "learning_rate": 4.121509325329095e-05, + "loss": 2.829, + "step": 46300 + }, + { + "epoch": 0.2753651631934532, + "grad_norm": 1.9856199026107788, + "learning_rate": 4.121473772949309e-05, + "loss": 2.899, + "step": 46301 + }, + { + "epoch": 0.27537111047673424, + "grad_norm": 2.330117702484131, + "learning_rate": 4.1214382200034834e-05, + "loss": 2.9223, + "step": 46302 + }, + { + "epoch": 0.2753770577600152, + "grad_norm": 2.4450840950012207, + "learning_rate": 4.12140266649163e-05, + "loss": 2.8901, + "step": 46303 + }, + { + "epoch": 0.2753830050432962, + "grad_norm": 2.4854440689086914, + "learning_rate": 4.121367112413762e-05, + "loss": 3.1449, + "step": 46304 + }, + { + "epoch": 0.27538895232657723, + "grad_norm": 2.1713833808898926, + "learning_rate": 4.121331557769892e-05, + "loss": 2.9562, + "step": 46305 + }, + { + "epoch": 0.2753948996098582, + "grad_norm": 2.3882718086242676, + "learning_rate": 4.121296002560031e-05, + "loss": 2.8517, + "step": 46306 + }, + { + "epoch": 0.2754008468931392, + "grad_norm": 2.776643991470337, + "learning_rate": 4.121260446784193e-05, + "loss": 3.0075, + "step": 46307 + }, + { + "epoch": 0.2754067941764202, + "grad_norm": 2.5317656993865967, + "learning_rate": 4.121224890442389e-05, + "loss": 2.8884, + "step": 46308 + }, + { + "epoch": 0.2754127414597012, + "grad_norm": 2.326202630996704, + "learning_rate": 4.121189333534632e-05, + "loss": 3.0132, + "step": 46309 + }, + { + "epoch": 0.2754186887429822, + "grad_norm": 2.312110662460327, + "learning_rate": 4.1211537760609354e-05, + "loss": 2.7694, + "step": 46310 + }, + { + "epoch": 0.2754246360262632, + "grad_norm": 2.1942336559295654, + "learning_rate": 4.1211182180213104e-05, + "loss": 2.7676, + "step": 46311 + }, + { + "epoch": 0.2754305833095442, + "grad_norm": 2.1001670360565186, + "learning_rate": 4.121082659415769e-05, + "loss": 2.8293, + "step": 46312 + }, + { + "epoch": 0.2754365305928252, + "grad_norm": 2.2445085048675537, + "learning_rate": 4.121047100244326e-05, + "loss": 2.8534, + "step": 46313 + }, + { + "epoch": 0.2754424778761062, + "grad_norm": 1.6833618879318237, + "learning_rate": 4.121011540506991e-05, + "loss": 4.2664, + "step": 46314 + }, + { + "epoch": 0.27544842515938717, + "grad_norm": 1.8225903511047363, + "learning_rate": 4.120975980203778e-05, + "loss": 4.2333, + "step": 46315 + }, + { + "epoch": 0.2754543724426682, + "grad_norm": 1.9510418176651, + "learning_rate": 4.120940419334699e-05, + "loss": 3.4616, + "step": 46316 + }, + { + "epoch": 0.2754603197259492, + "grad_norm": 2.547820806503296, + "learning_rate": 4.120904857899767e-05, + "loss": 2.9013, + "step": 46317 + }, + { + "epoch": 0.27546626700923016, + "grad_norm": 2.28924822807312, + "learning_rate": 4.120869295898992e-05, + "loss": 2.7114, + "step": 46318 + }, + { + "epoch": 0.2754722142925112, + "grad_norm": 2.1096386909484863, + "learning_rate": 4.12083373333239e-05, + "loss": 2.6965, + "step": 46319 + }, + { + "epoch": 0.2754781615757922, + "grad_norm": 2.003237724304199, + "learning_rate": 4.1207981701999714e-05, + "loss": 3.0116, + "step": 46320 + }, + { + "epoch": 0.27548410885907315, + "grad_norm": 2.199387311935425, + "learning_rate": 4.120762606501749e-05, + "loss": 2.9351, + "step": 46321 + }, + { + "epoch": 0.27549005614235417, + "grad_norm": 2.1074635982513428, + "learning_rate": 4.120727042237735e-05, + "loss": 2.5115, + "step": 46322 + }, + { + "epoch": 0.2754960034256352, + "grad_norm": 2.570005178451538, + "learning_rate": 4.120691477407942e-05, + "loss": 2.6922, + "step": 46323 + }, + { + "epoch": 0.27550195070891614, + "grad_norm": 2.4060990810394287, + "learning_rate": 4.1206559120123814e-05, + "loss": 2.7573, + "step": 46324 + }, + { + "epoch": 0.27550789799219716, + "grad_norm": 2.348395347595215, + "learning_rate": 4.120620346051068e-05, + "loss": 2.7512, + "step": 46325 + }, + { + "epoch": 0.2755138452754782, + "grad_norm": 2.5664632320404053, + "learning_rate": 4.1205847795240114e-05, + "loss": 2.578, + "step": 46326 + }, + { + "epoch": 0.27551979255875914, + "grad_norm": 2.5395994186401367, + "learning_rate": 4.1205492124312265e-05, + "loss": 2.8958, + "step": 46327 + }, + { + "epoch": 0.27552573984204015, + "grad_norm": 2.6211094856262207, + "learning_rate": 4.1205136447727245e-05, + "loss": 2.7713, + "step": 46328 + }, + { + "epoch": 0.27553168712532117, + "grad_norm": 2.725020170211792, + "learning_rate": 4.1204780765485176e-05, + "loss": 2.8726, + "step": 46329 + }, + { + "epoch": 0.27553763440860213, + "grad_norm": 3.040479898452759, + "learning_rate": 4.120442507758618e-05, + "loss": 2.7506, + "step": 46330 + }, + { + "epoch": 0.27554358169188314, + "grad_norm": 2.511183977127075, + "learning_rate": 4.1204069384030396e-05, + "loss": 2.6129, + "step": 46331 + }, + { + "epoch": 0.27554952897516416, + "grad_norm": 2.4508872032165527, + "learning_rate": 4.1203713684817934e-05, + "loss": 2.5745, + "step": 46332 + }, + { + "epoch": 0.2755554762584451, + "grad_norm": 2.451998233795166, + "learning_rate": 4.1203357979948925e-05, + "loss": 2.5259, + "step": 46333 + }, + { + "epoch": 0.27556142354172614, + "grad_norm": 2.6334755420684814, + "learning_rate": 4.12030022694235e-05, + "loss": 2.7732, + "step": 46334 + }, + { + "epoch": 0.27556737082500715, + "grad_norm": 2.47263503074646, + "learning_rate": 4.120264655324176e-05, + "loss": 2.7606, + "step": 46335 + }, + { + "epoch": 0.2755733181082881, + "grad_norm": 2.516122817993164, + "learning_rate": 4.1202290831403844e-05, + "loss": 2.8457, + "step": 46336 + }, + { + "epoch": 0.27557926539156913, + "grad_norm": 2.42368483543396, + "learning_rate": 4.120193510390988e-05, + "loss": 2.6613, + "step": 46337 + }, + { + "epoch": 0.27558521267485014, + "grad_norm": 2.566934823989868, + "learning_rate": 4.1201579370759993e-05, + "loss": 2.8725, + "step": 46338 + }, + { + "epoch": 0.2755911599581311, + "grad_norm": 2.638122320175171, + "learning_rate": 4.120122363195429e-05, + "loss": 2.8593, + "step": 46339 + }, + { + "epoch": 0.2755971072414121, + "grad_norm": 2.6111538410186768, + "learning_rate": 4.120086788749292e-05, + "loss": 2.9179, + "step": 46340 + }, + { + "epoch": 0.27560305452469314, + "grad_norm": 2.3661746978759766, + "learning_rate": 4.1200512137375983e-05, + "loss": 2.6156, + "step": 46341 + }, + { + "epoch": 0.2756090018079741, + "grad_norm": 2.161673069000244, + "learning_rate": 4.120015638160362e-05, + "loss": 2.7477, + "step": 46342 + }, + { + "epoch": 0.2756149490912551, + "grad_norm": 2.3958606719970703, + "learning_rate": 4.119980062017595e-05, + "loss": 2.8345, + "step": 46343 + }, + { + "epoch": 0.27562089637453613, + "grad_norm": 2.36853289604187, + "learning_rate": 4.11994448530931e-05, + "loss": 2.7415, + "step": 46344 + }, + { + "epoch": 0.2756268436578171, + "grad_norm": 1.8760372400283813, + "learning_rate": 4.119908908035519e-05, + "loss": 4.7091, + "step": 46345 + }, + { + "epoch": 0.2756327909410981, + "grad_norm": 1.6887768507003784, + "learning_rate": 4.1198733301962346e-05, + "loss": 4.9376, + "step": 46346 + }, + { + "epoch": 0.2756387382243791, + "grad_norm": 2.022223711013794, + "learning_rate": 4.119837751791469e-05, + "loss": 4.1152, + "step": 46347 + }, + { + "epoch": 0.2756446855076601, + "grad_norm": 1.9374958276748657, + "learning_rate": 4.1198021728212336e-05, + "loss": 3.8572, + "step": 46348 + }, + { + "epoch": 0.2756506327909411, + "grad_norm": 1.8152077198028564, + "learning_rate": 4.1197665932855435e-05, + "loss": 4.7934, + "step": 46349 + }, + { + "epoch": 0.2756565800742221, + "grad_norm": 1.676936149597168, + "learning_rate": 4.1197310131844094e-05, + "loss": 4.7711, + "step": 46350 + }, + { + "epoch": 0.2756625273575031, + "grad_norm": 1.6014472246170044, + "learning_rate": 4.1196954325178435e-05, + "loss": 5.0566, + "step": 46351 + }, + { + "epoch": 0.2756684746407841, + "grad_norm": 2.5453591346740723, + "learning_rate": 4.119659851285859e-05, + "loss": 4.9387, + "step": 46352 + }, + { + "epoch": 0.2756744219240651, + "grad_norm": 1.6661057472229004, + "learning_rate": 4.119624269488468e-05, + "loss": 5.0797, + "step": 46353 + }, + { + "epoch": 0.27568036920734607, + "grad_norm": 1.6584420204162598, + "learning_rate": 4.119588687125682e-05, + "loss": 4.6116, + "step": 46354 + }, + { + "epoch": 0.2756863164906271, + "grad_norm": 1.7872332334518433, + "learning_rate": 4.119553104197515e-05, + "loss": 4.2994, + "step": 46355 + }, + { + "epoch": 0.2756922637739081, + "grad_norm": 1.956977367401123, + "learning_rate": 4.1195175207039796e-05, + "loss": 4.5507, + "step": 46356 + }, + { + "epoch": 0.27569821105718906, + "grad_norm": 2.0353682041168213, + "learning_rate": 4.119481936645087e-05, + "loss": 4.3621, + "step": 46357 + }, + { + "epoch": 0.2757041583404701, + "grad_norm": 2.236233949661255, + "learning_rate": 4.1194463520208495e-05, + "loss": 3.4907, + "step": 46358 + }, + { + "epoch": 0.2757101056237511, + "grad_norm": 1.667649269104004, + "learning_rate": 4.1194107668312795e-05, + "loss": 4.5093, + "step": 46359 + }, + { + "epoch": 0.27571605290703205, + "grad_norm": 1.646866798400879, + "learning_rate": 4.11937518107639e-05, + "loss": 4.391, + "step": 46360 + }, + { + "epoch": 0.27572200019031307, + "grad_norm": 1.6360965967178345, + "learning_rate": 4.119339594756194e-05, + "loss": 3.7837, + "step": 46361 + }, + { + "epoch": 0.2757279474735941, + "grad_norm": 1.5513288974761963, + "learning_rate": 4.119304007870703e-05, + "loss": 4.2487, + "step": 46362 + }, + { + "epoch": 0.27573389475687504, + "grad_norm": 1.6423419713974, + "learning_rate": 4.119268420419931e-05, + "loss": 4.7782, + "step": 46363 + }, + { + "epoch": 0.27573984204015606, + "grad_norm": 1.805232286453247, + "learning_rate": 4.1192328324038876e-05, + "loss": 4.7219, + "step": 46364 + }, + { + "epoch": 0.2757457893234371, + "grad_norm": 1.6557554006576538, + "learning_rate": 4.119197243822587e-05, + "loss": 4.7449, + "step": 46365 + }, + { + "epoch": 0.27575173660671803, + "grad_norm": 1.821544885635376, + "learning_rate": 4.1191616546760415e-05, + "loss": 4.5332, + "step": 46366 + }, + { + "epoch": 0.27575768388999905, + "grad_norm": 1.6391602754592896, + "learning_rate": 4.119126064964263e-05, + "loss": 4.0218, + "step": 46367 + }, + { + "epoch": 0.27576363117328007, + "grad_norm": 1.6593505144119263, + "learning_rate": 4.1190904746872646e-05, + "loss": 4.2965, + "step": 46368 + }, + { + "epoch": 0.275769578456561, + "grad_norm": 1.7055820226669312, + "learning_rate": 4.119054883845059e-05, + "loss": 4.5568, + "step": 46369 + }, + { + "epoch": 0.27577552573984204, + "grad_norm": 1.8621621131896973, + "learning_rate": 4.119019292437658e-05, + "loss": 4.636, + "step": 46370 + }, + { + "epoch": 0.27578147302312306, + "grad_norm": 1.8205857276916504, + "learning_rate": 4.118983700465073e-05, + "loss": 4.3227, + "step": 46371 + }, + { + "epoch": 0.275787420306404, + "grad_norm": 1.771716833114624, + "learning_rate": 4.118948107927319e-05, + "loss": 4.6952, + "step": 46372 + }, + { + "epoch": 0.27579336758968503, + "grad_norm": 2.027379035949707, + "learning_rate": 4.118912514824406e-05, + "loss": 3.4197, + "step": 46373 + }, + { + "epoch": 0.27579931487296605, + "grad_norm": 1.861850380897522, + "learning_rate": 4.118876921156347e-05, + "loss": 4.0007, + "step": 46374 + }, + { + "epoch": 0.275805262156247, + "grad_norm": 1.353298544883728, + "learning_rate": 4.118841326923155e-05, + "loss": 4.7036, + "step": 46375 + }, + { + "epoch": 0.275811209439528, + "grad_norm": 1.9911999702453613, + "learning_rate": 4.118805732124843e-05, + "loss": 4.9063, + "step": 46376 + }, + { + "epoch": 0.27581715672280904, + "grad_norm": 1.3303937911987305, + "learning_rate": 4.118770136761422e-05, + "loss": 4.7665, + "step": 46377 + }, + { + "epoch": 0.27582310400609, + "grad_norm": 1.63236403465271, + "learning_rate": 4.118734540832905e-05, + "loss": 4.4452, + "step": 46378 + }, + { + "epoch": 0.275829051289371, + "grad_norm": 1.8619455099105835, + "learning_rate": 4.118698944339305e-05, + "loss": 4.0223, + "step": 46379 + }, + { + "epoch": 0.27583499857265203, + "grad_norm": 1.71010422706604, + "learning_rate": 4.118663347280634e-05, + "loss": 4.5706, + "step": 46380 + }, + { + "epoch": 0.275840945855933, + "grad_norm": 1.7219008207321167, + "learning_rate": 4.118627749656904e-05, + "loss": 4.9622, + "step": 46381 + }, + { + "epoch": 0.275846893139214, + "grad_norm": 1.804564356803894, + "learning_rate": 4.118592151468128e-05, + "loss": 4.3485, + "step": 46382 + }, + { + "epoch": 0.275852840422495, + "grad_norm": 1.6228156089782715, + "learning_rate": 4.1185565527143174e-05, + "loss": 4.4276, + "step": 46383 + }, + { + "epoch": 0.275858787705776, + "grad_norm": 1.8870218992233276, + "learning_rate": 4.118520953395486e-05, + "loss": 4.1577, + "step": 46384 + }, + { + "epoch": 0.275864734989057, + "grad_norm": 1.6808301210403442, + "learning_rate": 4.118485353511646e-05, + "loss": 4.1096, + "step": 46385 + }, + { + "epoch": 0.275870682272338, + "grad_norm": 2.0159380435943604, + "learning_rate": 4.118449753062808e-05, + "loss": 4.0438, + "step": 46386 + }, + { + "epoch": 0.275876629555619, + "grad_norm": 1.7474654912948608, + "learning_rate": 4.1184141520489884e-05, + "loss": 4.2386, + "step": 46387 + }, + { + "epoch": 0.2758825768389, + "grad_norm": 1.5364694595336914, + "learning_rate": 4.1183785504701955e-05, + "loss": 4.3685, + "step": 46388 + }, + { + "epoch": 0.275888524122181, + "grad_norm": 1.7362271547317505, + "learning_rate": 4.118342948326444e-05, + "loss": 4.8762, + "step": 46389 + }, + { + "epoch": 0.27589447140546197, + "grad_norm": 1.8091872930526733, + "learning_rate": 4.118307345617745e-05, + "loss": 4.2197, + "step": 46390 + }, + { + "epoch": 0.275900418688743, + "grad_norm": 1.7467803955078125, + "learning_rate": 4.118271742344112e-05, + "loss": 3.996, + "step": 46391 + }, + { + "epoch": 0.275906365972024, + "grad_norm": 1.8009520769119263, + "learning_rate": 4.118236138505557e-05, + "loss": 4.1719, + "step": 46392 + }, + { + "epoch": 0.27591231325530496, + "grad_norm": 1.7004477977752686, + "learning_rate": 4.1182005341020926e-05, + "loss": 4.0192, + "step": 46393 + }, + { + "epoch": 0.275918260538586, + "grad_norm": 1.914019227027893, + "learning_rate": 4.118164929133731e-05, + "loss": 3.8898, + "step": 46394 + }, + { + "epoch": 0.275924207821867, + "grad_norm": 1.8572369813919067, + "learning_rate": 4.1181293236004845e-05, + "loss": 4.0783, + "step": 46395 + }, + { + "epoch": 0.27593015510514796, + "grad_norm": 1.946156620979309, + "learning_rate": 4.1180937175023657e-05, + "loss": 4.7292, + "step": 46396 + }, + { + "epoch": 0.27593610238842897, + "grad_norm": 2.0651285648345947, + "learning_rate": 4.1180581108393876e-05, + "loss": 3.5111, + "step": 46397 + }, + { + "epoch": 0.27594204967171, + "grad_norm": 2.157212734222412, + "learning_rate": 4.1180225036115626e-05, + "loss": 4.1653, + "step": 46398 + }, + { + "epoch": 0.27594799695499095, + "grad_norm": 1.7785751819610596, + "learning_rate": 4.1179868958189014e-05, + "loss": 3.9993, + "step": 46399 + }, + { + "epoch": 0.27595394423827196, + "grad_norm": 2.273373603820801, + "learning_rate": 4.117951287461419e-05, + "loss": 4.0431, + "step": 46400 + }, + { + "epoch": 0.275959891521553, + "grad_norm": 1.6159999370574951, + "learning_rate": 4.117915678539126e-05, + "loss": 4.1846, + "step": 46401 + }, + { + "epoch": 0.27596583880483394, + "grad_norm": 1.7439570426940918, + "learning_rate": 4.117880069052035e-05, + "loss": 4.734, + "step": 46402 + }, + { + "epoch": 0.27597178608811496, + "grad_norm": 1.63904869556427, + "learning_rate": 4.117844459000159e-05, + "loss": 5.0073, + "step": 46403 + }, + { + "epoch": 0.275977733371396, + "grad_norm": 2.9482760429382324, + "learning_rate": 4.1178088483835105e-05, + "loss": 4.0737, + "step": 46404 + }, + { + "epoch": 0.27598368065467693, + "grad_norm": 1.9497867822647095, + "learning_rate": 4.117773237202101e-05, + "loss": 4.2378, + "step": 46405 + }, + { + "epoch": 0.27598962793795795, + "grad_norm": 2.000408411026001, + "learning_rate": 4.117737625455944e-05, + "loss": 3.9546, + "step": 46406 + }, + { + "epoch": 0.27599557522123896, + "grad_norm": 1.7994492053985596, + "learning_rate": 4.117702013145052e-05, + "loss": 4.0921, + "step": 46407 + }, + { + "epoch": 0.2760015225045199, + "grad_norm": 1.7621219158172607, + "learning_rate": 4.1176664002694354e-05, + "loss": 3.8804, + "step": 46408 + }, + { + "epoch": 0.27600746978780094, + "grad_norm": 1.9557427167892456, + "learning_rate": 4.1176307868291096e-05, + "loss": 3.8642, + "step": 46409 + }, + { + "epoch": 0.27601341707108196, + "grad_norm": 1.888837218284607, + "learning_rate": 4.117595172824085e-05, + "loss": 3.7376, + "step": 46410 + }, + { + "epoch": 0.2760193643543629, + "grad_norm": 1.9168221950531006, + "learning_rate": 4.1175595582543744e-05, + "loss": 3.6243, + "step": 46411 + }, + { + "epoch": 0.27602531163764393, + "grad_norm": 1.8254179954528809, + "learning_rate": 4.117523943119992e-05, + "loss": 4.2094, + "step": 46412 + }, + { + "epoch": 0.27603125892092495, + "grad_norm": 1.8136615753173828, + "learning_rate": 4.117488327420947e-05, + "loss": 4.0229, + "step": 46413 + }, + { + "epoch": 0.2760372062042059, + "grad_norm": 1.84183669090271, + "learning_rate": 4.117452711157254e-05, + "loss": 3.8174, + "step": 46414 + }, + { + "epoch": 0.2760431534874869, + "grad_norm": 1.6920356750488281, + "learning_rate": 4.117417094328925e-05, + "loss": 3.8512, + "step": 46415 + }, + { + "epoch": 0.2760491007707679, + "grad_norm": 1.789772391319275, + "learning_rate": 4.117381476935973e-05, + "loss": 3.6766, + "step": 46416 + }, + { + "epoch": 0.2760550480540489, + "grad_norm": 1.7010245323181152, + "learning_rate": 4.1173458589784094e-05, + "loss": 4.0047, + "step": 46417 + }, + { + "epoch": 0.2760609953373299, + "grad_norm": 1.4292240142822266, + "learning_rate": 4.117310240456247e-05, + "loss": 4.8655, + "step": 46418 + }, + { + "epoch": 0.2760669426206109, + "grad_norm": 1.5990276336669922, + "learning_rate": 4.117274621369498e-05, + "loss": 3.9876, + "step": 46419 + }, + { + "epoch": 0.2760728899038919, + "grad_norm": 1.7409169673919678, + "learning_rate": 4.117239001718176e-05, + "loss": 4.0033, + "step": 46420 + }, + { + "epoch": 0.2760788371871729, + "grad_norm": 1.726759672164917, + "learning_rate": 4.117203381502292e-05, + "loss": 3.8523, + "step": 46421 + }, + { + "epoch": 0.27608478447045387, + "grad_norm": 1.825735092163086, + "learning_rate": 4.117167760721858e-05, + "loss": 3.8477, + "step": 46422 + }, + { + "epoch": 0.2760907317537349, + "grad_norm": 1.853615164756775, + "learning_rate": 4.1171321393768894e-05, + "loss": 3.8451, + "step": 46423 + }, + { + "epoch": 0.2760966790370159, + "grad_norm": 1.8566569089889526, + "learning_rate": 4.1170965174673956e-05, + "loss": 3.8457, + "step": 46424 + }, + { + "epoch": 0.27610262632029686, + "grad_norm": 1.7497293949127197, + "learning_rate": 4.1170608949933895e-05, + "loss": 3.9097, + "step": 46425 + }, + { + "epoch": 0.2761085736035779, + "grad_norm": 1.6266469955444336, + "learning_rate": 4.1170252719548854e-05, + "loss": 3.9246, + "step": 46426 + }, + { + "epoch": 0.2761145208868589, + "grad_norm": 1.7242881059646606, + "learning_rate": 4.1169896483518935e-05, + "loss": 3.8475, + "step": 46427 + }, + { + "epoch": 0.27612046817013985, + "grad_norm": 1.7757164239883423, + "learning_rate": 4.116954024184428e-05, + "loss": 3.9295, + "step": 46428 + }, + { + "epoch": 0.27612641545342087, + "grad_norm": 1.6675481796264648, + "learning_rate": 4.1169183994525004e-05, + "loss": 3.9384, + "step": 46429 + }, + { + "epoch": 0.2761323627367019, + "grad_norm": 1.7716281414031982, + "learning_rate": 4.116882774156123e-05, + "loss": 3.9287, + "step": 46430 + }, + { + "epoch": 0.27613831001998285, + "grad_norm": 1.7790355682373047, + "learning_rate": 4.1168471482953085e-05, + "loss": 3.9003, + "step": 46431 + }, + { + "epoch": 0.27614425730326386, + "grad_norm": 1.654343605041504, + "learning_rate": 4.11681152187007e-05, + "loss": 3.8324, + "step": 46432 + }, + { + "epoch": 0.2761502045865449, + "grad_norm": 1.645513892173767, + "learning_rate": 4.116775894880419e-05, + "loss": 3.8656, + "step": 46433 + }, + { + "epoch": 0.27615615186982584, + "grad_norm": 1.6767617464065552, + "learning_rate": 4.116740267326368e-05, + "loss": 4.0902, + "step": 46434 + }, + { + "epoch": 0.27616209915310685, + "grad_norm": 1.7248356342315674, + "learning_rate": 4.116704639207929e-05, + "loss": 3.9715, + "step": 46435 + }, + { + "epoch": 0.27616804643638787, + "grad_norm": 1.8066498041152954, + "learning_rate": 4.116669010525116e-05, + "loss": 3.9562, + "step": 46436 + }, + { + "epoch": 0.27617399371966883, + "grad_norm": 1.7753214836120605, + "learning_rate": 4.1166333812779405e-05, + "loss": 3.8829, + "step": 46437 + }, + { + "epoch": 0.27617994100294985, + "grad_norm": 1.526048183441162, + "learning_rate": 4.116597751466415e-05, + "loss": 4.9131, + "step": 46438 + }, + { + "epoch": 0.27618588828623086, + "grad_norm": 1.6890324354171753, + "learning_rate": 4.116562121090552e-05, + "loss": 3.9636, + "step": 46439 + }, + { + "epoch": 0.2761918355695118, + "grad_norm": 1.6544886827468872, + "learning_rate": 4.116526490150363e-05, + "loss": 3.9754, + "step": 46440 + }, + { + "epoch": 0.27619778285279284, + "grad_norm": 1.5963088274002075, + "learning_rate": 4.116490858645863e-05, + "loss": 3.8811, + "step": 46441 + }, + { + "epoch": 0.27620373013607386, + "grad_norm": 1.594773292541504, + "learning_rate": 4.116455226577061e-05, + "loss": 3.9238, + "step": 46442 + }, + { + "epoch": 0.2762096774193548, + "grad_norm": 1.8818258047103882, + "learning_rate": 4.116419593943972e-05, + "loss": 3.7582, + "step": 46443 + }, + { + "epoch": 0.27621562470263583, + "grad_norm": 1.7719999551773071, + "learning_rate": 4.1163839607466084e-05, + "loss": 3.7898, + "step": 46444 + }, + { + "epoch": 0.27622157198591685, + "grad_norm": 1.760658860206604, + "learning_rate": 4.11634832698498e-05, + "loss": 3.7625, + "step": 46445 + }, + { + "epoch": 0.2762275192691978, + "grad_norm": 1.6701582670211792, + "learning_rate": 4.116312692659102e-05, + "loss": 3.821, + "step": 46446 + }, + { + "epoch": 0.2762334665524788, + "grad_norm": 1.5499728918075562, + "learning_rate": 4.116277057768987e-05, + "loss": 3.8524, + "step": 46447 + }, + { + "epoch": 0.27623941383575984, + "grad_norm": 1.7063299417495728, + "learning_rate": 4.116241422314645e-05, + "loss": 3.7978, + "step": 46448 + }, + { + "epoch": 0.2762453611190408, + "grad_norm": 1.8259849548339844, + "learning_rate": 4.11620578629609e-05, + "loss": 4.2779, + "step": 46449 + }, + { + "epoch": 0.2762513084023218, + "grad_norm": 1.6993992328643799, + "learning_rate": 4.1161701497133346e-05, + "loss": 3.8587, + "step": 46450 + }, + { + "epoch": 0.27625725568560283, + "grad_norm": 1.7869553565979004, + "learning_rate": 4.116134512566391e-05, + "loss": 3.7736, + "step": 46451 + }, + { + "epoch": 0.2762632029688838, + "grad_norm": 1.7702515125274658, + "learning_rate": 4.116098874855271e-05, + "loss": 3.7861, + "step": 46452 + }, + { + "epoch": 0.2762691502521648, + "grad_norm": 1.8474318981170654, + "learning_rate": 4.116063236579988e-05, + "loss": 3.855, + "step": 46453 + }, + { + "epoch": 0.2762750975354458, + "grad_norm": 1.9911600351333618, + "learning_rate": 4.116027597740554e-05, + "loss": 3.548, + "step": 46454 + }, + { + "epoch": 0.2762810448187268, + "grad_norm": 1.651448369026184, + "learning_rate": 4.115991958336981e-05, + "loss": 4.3608, + "step": 46455 + }, + { + "epoch": 0.2762869921020078, + "grad_norm": 1.4188848733901978, + "learning_rate": 4.115956318369283e-05, + "loss": 4.947, + "step": 46456 + }, + { + "epoch": 0.2762929393852888, + "grad_norm": 1.57933509349823, + "learning_rate": 4.1159206778374707e-05, + "loss": 5.0298, + "step": 46457 + }, + { + "epoch": 0.2762988866685698, + "grad_norm": 1.6797744035720825, + "learning_rate": 4.115885036741557e-05, + "loss": 5.1935, + "step": 46458 + }, + { + "epoch": 0.2763048339518508, + "grad_norm": 1.8477085828781128, + "learning_rate": 4.115849395081555e-05, + "loss": 4.388, + "step": 46459 + }, + { + "epoch": 0.2763107812351318, + "grad_norm": 2.0115890502929688, + "learning_rate": 4.115813752857476e-05, + "loss": 3.7329, + "step": 46460 + }, + { + "epoch": 0.27631672851841277, + "grad_norm": 2.071540594100952, + "learning_rate": 4.115778110069333e-05, + "loss": 3.9329, + "step": 46461 + }, + { + "epoch": 0.2763226758016938, + "grad_norm": 2.199815511703491, + "learning_rate": 4.115742466717139e-05, + "loss": 3.6608, + "step": 46462 + }, + { + "epoch": 0.2763286230849748, + "grad_norm": 1.7199076414108276, + "learning_rate": 4.115706822800906e-05, + "loss": 4.8937, + "step": 46463 + }, + { + "epoch": 0.27633457036825576, + "grad_norm": 2.29736065864563, + "learning_rate": 4.1156711783206455e-05, + "loss": 3.8735, + "step": 46464 + }, + { + "epoch": 0.2763405176515368, + "grad_norm": 1.82344388961792, + "learning_rate": 4.115635533276373e-05, + "loss": 4.4667, + "step": 46465 + }, + { + "epoch": 0.2763464649348178, + "grad_norm": 2.007000207901001, + "learning_rate": 4.115599887668097e-05, + "loss": 4.7112, + "step": 46466 + }, + { + "epoch": 0.27635241221809875, + "grad_norm": 2.1531834602355957, + "learning_rate": 4.115564241495833e-05, + "loss": 4.7594, + "step": 46467 + }, + { + "epoch": 0.27635835950137977, + "grad_norm": 1.4306474924087524, + "learning_rate": 4.115528594759591e-05, + "loss": 4.6347, + "step": 46468 + }, + { + "epoch": 0.2763643067846608, + "grad_norm": 1.6452287435531616, + "learning_rate": 4.1154929474593855e-05, + "loss": 4.8658, + "step": 46469 + }, + { + "epoch": 0.27637025406794175, + "grad_norm": 1.8173059225082397, + "learning_rate": 4.115457299595228e-05, + "loss": 4.7261, + "step": 46470 + }, + { + "epoch": 0.27637620135122276, + "grad_norm": 1.9495271444320679, + "learning_rate": 4.115421651167131e-05, + "loss": 4.1717, + "step": 46471 + }, + { + "epoch": 0.2763821486345038, + "grad_norm": 1.9909406900405884, + "learning_rate": 4.115386002175107e-05, + "loss": 3.7473, + "step": 46472 + }, + { + "epoch": 0.27638809591778474, + "grad_norm": 2.117976427078247, + "learning_rate": 4.115350352619168e-05, + "loss": 4.7196, + "step": 46473 + }, + { + "epoch": 0.27639404320106575, + "grad_norm": 1.8679407835006714, + "learning_rate": 4.115314702499328e-05, + "loss": 4.6627, + "step": 46474 + }, + { + "epoch": 0.27639999048434677, + "grad_norm": 1.7012509107589722, + "learning_rate": 4.115279051815597e-05, + "loss": 4.607, + "step": 46475 + }, + { + "epoch": 0.27640593776762773, + "grad_norm": 1.6842422485351562, + "learning_rate": 4.1152434005679894e-05, + "loss": 4.5624, + "step": 46476 + }, + { + "epoch": 0.27641188505090875, + "grad_norm": 2.83176589012146, + "learning_rate": 4.115207748756517e-05, + "loss": 2.9981, + "step": 46477 + }, + { + "epoch": 0.27641783233418976, + "grad_norm": 2.882211446762085, + "learning_rate": 4.115172096381192e-05, + "loss": 2.8138, + "step": 46478 + }, + { + "epoch": 0.2764237796174707, + "grad_norm": 2.5308356285095215, + "learning_rate": 4.115136443442027e-05, + "loss": 2.9673, + "step": 46479 + }, + { + "epoch": 0.27642972690075174, + "grad_norm": 1.9194347858428955, + "learning_rate": 4.115100789939035e-05, + "loss": 4.0069, + "step": 46480 + }, + { + "epoch": 0.27643567418403275, + "grad_norm": 2.5724070072174072, + "learning_rate": 4.115065135872228e-05, + "loss": 3.3893, + "step": 46481 + }, + { + "epoch": 0.2764416214673137, + "grad_norm": 2.3267436027526855, + "learning_rate": 4.1150294812416176e-05, + "loss": 3.4233, + "step": 46482 + }, + { + "epoch": 0.27644756875059473, + "grad_norm": 2.2949254512786865, + "learning_rate": 4.114993826047219e-05, + "loss": 3.273, + "step": 46483 + }, + { + "epoch": 0.27645351603387575, + "grad_norm": 2.2126100063323975, + "learning_rate": 4.114958170289041e-05, + "loss": 3.2145, + "step": 46484 + }, + { + "epoch": 0.2764594633171567, + "grad_norm": 2.2324166297912598, + "learning_rate": 4.114922513967098e-05, + "loss": 3.0003, + "step": 46485 + }, + { + "epoch": 0.2764654106004377, + "grad_norm": 2.302678346633911, + "learning_rate": 4.114886857081403e-05, + "loss": 2.9861, + "step": 46486 + }, + { + "epoch": 0.27647135788371874, + "grad_norm": 2.6376290321350098, + "learning_rate": 4.114851199631967e-05, + "loss": 3.2721, + "step": 46487 + }, + { + "epoch": 0.2764773051669997, + "grad_norm": 1.6052318811416626, + "learning_rate": 4.1148155416188036e-05, + "loss": 4.6641, + "step": 46488 + }, + { + "epoch": 0.2764832524502807, + "grad_norm": 1.7190817594528198, + "learning_rate": 4.114779883041925e-05, + "loss": 4.7487, + "step": 46489 + }, + { + "epoch": 0.27648919973356173, + "grad_norm": 2.0914087295532227, + "learning_rate": 4.1147442239013426e-05, + "loss": 4.539, + "step": 46490 + }, + { + "epoch": 0.2764951470168427, + "grad_norm": 1.9834105968475342, + "learning_rate": 4.114708564197069e-05, + "loss": 3.839, + "step": 46491 + }, + { + "epoch": 0.2765010943001237, + "grad_norm": 2.7591798305511475, + "learning_rate": 4.11467290392912e-05, + "loss": 2.9418, + "step": 46492 + }, + { + "epoch": 0.2765070415834047, + "grad_norm": 2.6576919555664062, + "learning_rate": 4.114637243097503e-05, + "loss": 3.1906, + "step": 46493 + }, + { + "epoch": 0.2765129888666857, + "grad_norm": 1.8007456064224243, + "learning_rate": 4.114601581702233e-05, + "loss": 4.5075, + "step": 46494 + }, + { + "epoch": 0.2765189361499667, + "grad_norm": 1.925412893295288, + "learning_rate": 4.114565919743324e-05, + "loss": 4.9098, + "step": 46495 + }, + { + "epoch": 0.2765248834332477, + "grad_norm": 1.7347475290298462, + "learning_rate": 4.114530257220785e-05, + "loss": 4.4868, + "step": 46496 + }, + { + "epoch": 0.2765308307165287, + "grad_norm": 2.421402931213379, + "learning_rate": 4.11449459413463e-05, + "loss": 3.5685, + "step": 46497 + }, + { + "epoch": 0.2765367779998097, + "grad_norm": 2.6667304039001465, + "learning_rate": 4.114458930484873e-05, + "loss": 3.2368, + "step": 46498 + }, + { + "epoch": 0.2765427252830907, + "grad_norm": 2.0465261936187744, + "learning_rate": 4.114423266271524e-05, + "loss": 4.4391, + "step": 46499 + }, + { + "epoch": 0.27654867256637167, + "grad_norm": 1.8802244663238525, + "learning_rate": 4.114387601494597e-05, + "loss": 4.5355, + "step": 46500 + }, + { + "epoch": 0.2765546198496527, + "grad_norm": 1.5809935331344604, + "learning_rate": 4.114351936154105e-05, + "loss": 5.2118, + "step": 46501 + }, + { + "epoch": 0.2765605671329337, + "grad_norm": 1.5702213048934937, + "learning_rate": 4.114316270250058e-05, + "loss": 4.5846, + "step": 46502 + }, + { + "epoch": 0.27656651441621466, + "grad_norm": 1.3923139572143555, + "learning_rate": 4.11428060378247e-05, + "loss": 4.7383, + "step": 46503 + }, + { + "epoch": 0.2765724616994957, + "grad_norm": 1.6765409708023071, + "learning_rate": 4.114244936751354e-05, + "loss": 4.6588, + "step": 46504 + }, + { + "epoch": 0.2765784089827767, + "grad_norm": 1.5359033346176147, + "learning_rate": 4.114209269156721e-05, + "loss": 4.6949, + "step": 46505 + }, + { + "epoch": 0.27658435626605765, + "grad_norm": 1.8949376344680786, + "learning_rate": 4.1141736009985846e-05, + "loss": 4.2894, + "step": 46506 + }, + { + "epoch": 0.27659030354933867, + "grad_norm": 2.7315285205841064, + "learning_rate": 4.114137932276956e-05, + "loss": 2.9263, + "step": 46507 + }, + { + "epoch": 0.2765962508326197, + "grad_norm": 1.9027632474899292, + "learning_rate": 4.114102262991849e-05, + "loss": 4.0802, + "step": 46508 + }, + { + "epoch": 0.27660219811590064, + "grad_norm": 2.1476776599884033, + "learning_rate": 4.114066593143276e-05, + "loss": 5.0769, + "step": 46509 + }, + { + "epoch": 0.27660814539918166, + "grad_norm": 1.6374648809432983, + "learning_rate": 4.114030922731249e-05, + "loss": 4.9349, + "step": 46510 + }, + { + "epoch": 0.2766140926824627, + "grad_norm": 1.8896476030349731, + "learning_rate": 4.11399525175578e-05, + "loss": 3.4834, + "step": 46511 + }, + { + "epoch": 0.27662003996574364, + "grad_norm": 1.984359622001648, + "learning_rate": 4.113959580216883e-05, + "loss": 3.9304, + "step": 46512 + }, + { + "epoch": 0.27662598724902465, + "grad_norm": 2.0313098430633545, + "learning_rate": 4.113923908114569e-05, + "loss": 1.9789, + "step": 46513 + }, + { + "epoch": 0.27663193453230567, + "grad_norm": 2.3650319576263428, + "learning_rate": 4.1138882354488504e-05, + "loss": 3.656, + "step": 46514 + }, + { + "epoch": 0.27663788181558663, + "grad_norm": 2.493945360183716, + "learning_rate": 4.11385256221974e-05, + "loss": 3.8533, + "step": 46515 + }, + { + "epoch": 0.27664382909886764, + "grad_norm": 2.7011516094207764, + "learning_rate": 4.113816888427251e-05, + "loss": 3.4951, + "step": 46516 + }, + { + "epoch": 0.27664977638214866, + "grad_norm": 1.90290105342865, + "learning_rate": 4.1137812140713935e-05, + "loss": 4.0458, + "step": 46517 + }, + { + "epoch": 0.2766557236654296, + "grad_norm": 2.355600118637085, + "learning_rate": 4.113745539152184e-05, + "loss": 3.3693, + "step": 46518 + }, + { + "epoch": 0.27666167094871064, + "grad_norm": 2.3680260181427, + "learning_rate": 4.1137098636696314e-05, + "loss": 3.342, + "step": 46519 + }, + { + "epoch": 0.27666761823199165, + "grad_norm": 2.0990779399871826, + "learning_rate": 4.113674187623749e-05, + "loss": 3.7306, + "step": 46520 + }, + { + "epoch": 0.2766735655152726, + "grad_norm": 2.0423362255096436, + "learning_rate": 4.11363851101455e-05, + "loss": 4.1854, + "step": 46521 + }, + { + "epoch": 0.27667951279855363, + "grad_norm": 1.4370611906051636, + "learning_rate": 4.113602833842047e-05, + "loss": 4.8565, + "step": 46522 + }, + { + "epoch": 0.27668546008183464, + "grad_norm": 1.843262791633606, + "learning_rate": 4.1135671561062516e-05, + "loss": 3.7436, + "step": 46523 + }, + { + "epoch": 0.2766914073651156, + "grad_norm": 2.1074652671813965, + "learning_rate": 4.1135314778071764e-05, + "loss": 3.6113, + "step": 46524 + }, + { + "epoch": 0.2766973546483966, + "grad_norm": 2.165070056915283, + "learning_rate": 4.1134957989448345e-05, + "loss": 3.4849, + "step": 46525 + }, + { + "epoch": 0.27670330193167764, + "grad_norm": 2.1475296020507812, + "learning_rate": 4.1134601195192366e-05, + "loss": 3.8376, + "step": 46526 + }, + { + "epoch": 0.2767092492149586, + "grad_norm": 2.60561466217041, + "learning_rate": 4.113424439530398e-05, + "loss": 3.1735, + "step": 46527 + }, + { + "epoch": 0.2767151964982396, + "grad_norm": 2.203435182571411, + "learning_rate": 4.113388758978329e-05, + "loss": 3.965, + "step": 46528 + }, + { + "epoch": 0.27672114378152063, + "grad_norm": 1.9494457244873047, + "learning_rate": 4.113353077863042e-05, + "loss": 3.9724, + "step": 46529 + }, + { + "epoch": 0.2767270910648016, + "grad_norm": 1.5153099298477173, + "learning_rate": 4.1133173961845506e-05, + "loss": 4.5004, + "step": 46530 + }, + { + "epoch": 0.2767330383480826, + "grad_norm": 1.449952483177185, + "learning_rate": 4.113281713942867e-05, + "loss": 4.4672, + "step": 46531 + }, + { + "epoch": 0.27673898563136357, + "grad_norm": 1.4143486022949219, + "learning_rate": 4.113246031138003e-05, + "loss": 4.5514, + "step": 46532 + }, + { + "epoch": 0.2767449329146446, + "grad_norm": 1.708353042602539, + "learning_rate": 4.113210347769972e-05, + "loss": 4.3705, + "step": 46533 + }, + { + "epoch": 0.2767508801979256, + "grad_norm": 1.3894630670547485, + "learning_rate": 4.113174663838786e-05, + "loss": 4.3931, + "step": 46534 + }, + { + "epoch": 0.27675682748120656, + "grad_norm": 1.6113018989562988, + "learning_rate": 4.113138979344456e-05, + "loss": 4.8631, + "step": 46535 + }, + { + "epoch": 0.2767627747644876, + "grad_norm": 1.875211477279663, + "learning_rate": 4.113103294286997e-05, + "loss": 4.1685, + "step": 46536 + }, + { + "epoch": 0.2767687220477686, + "grad_norm": 1.584097146987915, + "learning_rate": 4.113067608666421e-05, + "loss": 4.335, + "step": 46537 + }, + { + "epoch": 0.27677466933104955, + "grad_norm": 1.606046199798584, + "learning_rate": 4.1130319224827386e-05, + "loss": 4.631, + "step": 46538 + }, + { + "epoch": 0.27678061661433057, + "grad_norm": 1.6677935123443604, + "learning_rate": 4.112996235735964e-05, + "loss": 4.3762, + "step": 46539 + }, + { + "epoch": 0.2767865638976116, + "grad_norm": 1.9951443672180176, + "learning_rate": 4.112960548426109e-05, + "loss": 3.9378, + "step": 46540 + }, + { + "epoch": 0.27679251118089254, + "grad_norm": 1.4472769498825073, + "learning_rate": 4.112924860553186e-05, + "loss": 4.3171, + "step": 46541 + }, + { + "epoch": 0.27679845846417356, + "grad_norm": 1.817063808441162, + "learning_rate": 4.112889172117208e-05, + "loss": 4.4548, + "step": 46542 + }, + { + "epoch": 0.2768044057474546, + "grad_norm": 2.205514669418335, + "learning_rate": 4.112853483118186e-05, + "loss": 4.5937, + "step": 46543 + }, + { + "epoch": 0.27681035303073553, + "grad_norm": 1.435754418373108, + "learning_rate": 4.112817793556134e-05, + "loss": 4.4815, + "step": 46544 + }, + { + "epoch": 0.27681630031401655, + "grad_norm": 1.5391920804977417, + "learning_rate": 4.112782103431064e-05, + "loss": 4.5726, + "step": 46545 + }, + { + "epoch": 0.27682224759729757, + "grad_norm": 1.445930004119873, + "learning_rate": 4.112746412742989e-05, + "loss": 4.7463, + "step": 46546 + }, + { + "epoch": 0.2768281948805785, + "grad_norm": 1.5038732290267944, + "learning_rate": 4.11271072149192e-05, + "loss": 4.6216, + "step": 46547 + }, + { + "epoch": 0.27683414216385954, + "grad_norm": 1.2193926572799683, + "learning_rate": 4.112675029677872e-05, + "loss": 4.6108, + "step": 46548 + }, + { + "epoch": 0.27684008944714056, + "grad_norm": 1.3881065845489502, + "learning_rate": 4.1126393373008543e-05, + "loss": 4.86, + "step": 46549 + }, + { + "epoch": 0.2768460367304215, + "grad_norm": 1.5172516107559204, + "learning_rate": 4.1126036443608804e-05, + "loss": 4.9179, + "step": 46550 + }, + { + "epoch": 0.27685198401370253, + "grad_norm": 1.4797431230545044, + "learning_rate": 4.112567950857964e-05, + "loss": 4.7315, + "step": 46551 + }, + { + "epoch": 0.27685793129698355, + "grad_norm": 1.7684022188186646, + "learning_rate": 4.112532256792117e-05, + "loss": 3.9731, + "step": 46552 + }, + { + "epoch": 0.2768638785802645, + "grad_norm": 1.2769299745559692, + "learning_rate": 4.112496562163352e-05, + "loss": 4.4939, + "step": 46553 + }, + { + "epoch": 0.2768698258635455, + "grad_norm": 1.5159220695495605, + "learning_rate": 4.11246086697168e-05, + "loss": 4.6351, + "step": 46554 + }, + { + "epoch": 0.27687577314682654, + "grad_norm": 1.4152077436447144, + "learning_rate": 4.1124251712171154e-05, + "loss": 4.4864, + "step": 46555 + }, + { + "epoch": 0.2768817204301075, + "grad_norm": 1.3684308528900146, + "learning_rate": 4.1123894748996695e-05, + "loss": 4.6781, + "step": 46556 + }, + { + "epoch": 0.2768876677133885, + "grad_norm": 1.591609239578247, + "learning_rate": 4.112353778019355e-05, + "loss": 4.2, + "step": 46557 + }, + { + "epoch": 0.27689361499666953, + "grad_norm": 1.4793378114700317, + "learning_rate": 4.112318080576185e-05, + "loss": 4.4003, + "step": 46558 + }, + { + "epoch": 0.2768995622799505, + "grad_norm": 1.407004952430725, + "learning_rate": 4.1122823825701705e-05, + "loss": 4.5011, + "step": 46559 + }, + { + "epoch": 0.2769055095632315, + "grad_norm": 1.781693935394287, + "learning_rate": 4.112246684001326e-05, + "loss": 4.2605, + "step": 46560 + }, + { + "epoch": 0.2769114568465125, + "grad_norm": 1.580680012702942, + "learning_rate": 4.1122109848696623e-05, + "loss": 4.5542, + "step": 46561 + }, + { + "epoch": 0.2769174041297935, + "grad_norm": 1.6824567317962646, + "learning_rate": 4.1121752851751915e-05, + "loss": 4.3804, + "step": 46562 + }, + { + "epoch": 0.2769233514130745, + "grad_norm": 1.234424352645874, + "learning_rate": 4.112139584917928e-05, + "loss": 4.7019, + "step": 46563 + }, + { + "epoch": 0.2769292986963555, + "grad_norm": 2.0822980403900146, + "learning_rate": 4.112103884097883e-05, + "loss": 4.1923, + "step": 46564 + }, + { + "epoch": 0.2769352459796365, + "grad_norm": 1.9092727899551392, + "learning_rate": 4.1120681827150694e-05, + "loss": 4.5973, + "step": 46565 + }, + { + "epoch": 0.2769411932629175, + "grad_norm": 1.7959601879119873, + "learning_rate": 4.1120324807694995e-05, + "loss": 4.5008, + "step": 46566 + }, + { + "epoch": 0.2769471405461985, + "grad_norm": 1.397923469543457, + "learning_rate": 4.111996778261186e-05, + "loss": 4.9485, + "step": 46567 + }, + { + "epoch": 0.27695308782947947, + "grad_norm": 1.6168447732925415, + "learning_rate": 4.1119610751901395e-05, + "loss": 5.1643, + "step": 46568 + }, + { + "epoch": 0.2769590351127605, + "grad_norm": 1.5668599605560303, + "learning_rate": 4.111925371556375e-05, + "loss": 4.5297, + "step": 46569 + }, + { + "epoch": 0.2769649823960415, + "grad_norm": 1.3261287212371826, + "learning_rate": 4.111889667359905e-05, + "loss": 4.9105, + "step": 46570 + }, + { + "epoch": 0.27697092967932246, + "grad_norm": 1.5422449111938477, + "learning_rate": 4.1118539626007394e-05, + "loss": 4.6793, + "step": 46571 + }, + { + "epoch": 0.2769768769626035, + "grad_norm": 1.609765648841858, + "learning_rate": 4.1118182572788934e-05, + "loss": 4.4891, + "step": 46572 + }, + { + "epoch": 0.2769828242458845, + "grad_norm": 1.5911588668823242, + "learning_rate": 4.1117825513943776e-05, + "loss": 4.4346, + "step": 46573 + }, + { + "epoch": 0.27698877152916546, + "grad_norm": 1.4577205181121826, + "learning_rate": 4.111746844947205e-05, + "loss": 4.6404, + "step": 46574 + }, + { + "epoch": 0.27699471881244647, + "grad_norm": 1.8759377002716064, + "learning_rate": 4.11171113793739e-05, + "loss": 4.249, + "step": 46575 + }, + { + "epoch": 0.2770006660957275, + "grad_norm": 1.33041250705719, + "learning_rate": 4.1116754303649406e-05, + "loss": 4.9197, + "step": 46576 + }, + { + "epoch": 0.27700661337900845, + "grad_norm": 1.4924203157424927, + "learning_rate": 4.111639722229873e-05, + "loss": 4.7323, + "step": 46577 + }, + { + "epoch": 0.27701256066228946, + "grad_norm": 1.6167148351669312, + "learning_rate": 4.1116040135322e-05, + "loss": 4.5661, + "step": 46578 + }, + { + "epoch": 0.2770185079455705, + "grad_norm": 1.8139673471450806, + "learning_rate": 4.111568304271931e-05, + "loss": 4.3257, + "step": 46579 + }, + { + "epoch": 0.27702445522885144, + "grad_norm": 1.5905241966247559, + "learning_rate": 4.111532594449081e-05, + "loss": 4.5966, + "step": 46580 + }, + { + "epoch": 0.27703040251213246, + "grad_norm": 1.693509578704834, + "learning_rate": 4.1114968840636616e-05, + "loss": 4.3991, + "step": 46581 + }, + { + "epoch": 0.2770363497954135, + "grad_norm": 1.6652072668075562, + "learning_rate": 4.111461173115685e-05, + "loss": 4.403, + "step": 46582 + }, + { + "epoch": 0.27704229707869443, + "grad_norm": 1.6170748472213745, + "learning_rate": 4.111425461605163e-05, + "loss": 4.4034, + "step": 46583 + }, + { + "epoch": 0.27704824436197545, + "grad_norm": 1.5030169486999512, + "learning_rate": 4.111389749532111e-05, + "loss": 4.8105, + "step": 46584 + }, + { + "epoch": 0.27705419164525646, + "grad_norm": 1.3292860984802246, + "learning_rate": 4.111354036896538e-05, + "loss": 5.0857, + "step": 46585 + }, + { + "epoch": 0.2770601389285374, + "grad_norm": 1.8751977682113647, + "learning_rate": 4.1113183236984584e-05, + "loss": 4.1901, + "step": 46586 + }, + { + "epoch": 0.27706608621181844, + "grad_norm": 1.738635540008545, + "learning_rate": 4.111282609937884e-05, + "loss": 4.8938, + "step": 46587 + }, + { + "epoch": 0.27707203349509946, + "grad_norm": 1.5313252210617065, + "learning_rate": 4.111246895614828e-05, + "loss": 4.8619, + "step": 46588 + }, + { + "epoch": 0.2770779807783804, + "grad_norm": 1.8797451257705688, + "learning_rate": 4.111211180729302e-05, + "loss": 4.1911, + "step": 46589 + }, + { + "epoch": 0.27708392806166143, + "grad_norm": 1.9033689498901367, + "learning_rate": 4.1111754652813194e-05, + "loss": 4.0362, + "step": 46590 + }, + { + "epoch": 0.27708987534494245, + "grad_norm": 1.812248945236206, + "learning_rate": 4.1111397492708914e-05, + "loss": 4.0856, + "step": 46591 + }, + { + "epoch": 0.2770958226282234, + "grad_norm": 1.4246841669082642, + "learning_rate": 4.111104032698031e-05, + "loss": 4.7389, + "step": 46592 + }, + { + "epoch": 0.2771017699115044, + "grad_norm": 1.6621381044387817, + "learning_rate": 4.111068315562752e-05, + "loss": 4.6305, + "step": 46593 + }, + { + "epoch": 0.27710771719478544, + "grad_norm": 1.5594626665115356, + "learning_rate": 4.1110325978650646e-05, + "loss": 4.8822, + "step": 46594 + }, + { + "epoch": 0.2771136644780664, + "grad_norm": 1.646182894706726, + "learning_rate": 4.110996879604983e-05, + "loss": 4.4138, + "step": 46595 + }, + { + "epoch": 0.2771196117613474, + "grad_norm": 1.5610476732254028, + "learning_rate": 4.1109611607825185e-05, + "loss": 4.857, + "step": 46596 + }, + { + "epoch": 0.27712555904462843, + "grad_norm": 1.8645826578140259, + "learning_rate": 4.110925441397684e-05, + "loss": 4.5517, + "step": 46597 + }, + { + "epoch": 0.2771315063279094, + "grad_norm": 1.7852973937988281, + "learning_rate": 4.110889721450492e-05, + "loss": 4.7924, + "step": 46598 + }, + { + "epoch": 0.2771374536111904, + "grad_norm": 1.2126195430755615, + "learning_rate": 4.110854000940956e-05, + "loss": 4.7179, + "step": 46599 + }, + { + "epoch": 0.2771434008944714, + "grad_norm": 1.5001544952392578, + "learning_rate": 4.1108182798690864e-05, + "loss": 4.7247, + "step": 46600 + }, + { + "epoch": 0.2771493481777524, + "grad_norm": 1.4635547399520874, + "learning_rate": 4.1107825582348976e-05, + "loss": 4.6757, + "step": 46601 + }, + { + "epoch": 0.2771552954610334, + "grad_norm": 1.581316351890564, + "learning_rate": 4.110746836038401e-05, + "loss": 4.5898, + "step": 46602 + }, + { + "epoch": 0.2771612427443144, + "grad_norm": 1.567139744758606, + "learning_rate": 4.110711113279609e-05, + "loss": 4.8667, + "step": 46603 + }, + { + "epoch": 0.2771671900275954, + "grad_norm": 1.4002759456634521, + "learning_rate": 4.110675389958535e-05, + "loss": 4.6755, + "step": 46604 + }, + { + "epoch": 0.2771731373108764, + "grad_norm": 1.5505328178405762, + "learning_rate": 4.110639666075191e-05, + "loss": 4.6042, + "step": 46605 + }, + { + "epoch": 0.2771790845941574, + "grad_norm": 1.7932301759719849, + "learning_rate": 4.1106039416295886e-05, + "loss": 4.472, + "step": 46606 + }, + { + "epoch": 0.27718503187743837, + "grad_norm": 1.4758508205413818, + "learning_rate": 4.110568216621741e-05, + "loss": 4.6877, + "step": 46607 + }, + { + "epoch": 0.2771909791607194, + "grad_norm": 1.7802846431732178, + "learning_rate": 4.110532491051661e-05, + "loss": 4.3435, + "step": 46608 + }, + { + "epoch": 0.2771969264440004, + "grad_norm": 1.6111273765563965, + "learning_rate": 4.1104967649193606e-05, + "loss": 4.7026, + "step": 46609 + }, + { + "epoch": 0.27720287372728136, + "grad_norm": 1.543663501739502, + "learning_rate": 4.110461038224852e-05, + "loss": 4.7537, + "step": 46610 + }, + { + "epoch": 0.2772088210105624, + "grad_norm": 1.590451955795288, + "learning_rate": 4.110425310968149e-05, + "loss": 4.4166, + "step": 46611 + }, + { + "epoch": 0.2772147682938434, + "grad_norm": 1.6533695459365845, + "learning_rate": 4.110389583149263e-05, + "loss": 4.4332, + "step": 46612 + }, + { + "epoch": 0.27722071557712435, + "grad_norm": 1.567581295967102, + "learning_rate": 4.1103538547682065e-05, + "loss": 4.6041, + "step": 46613 + }, + { + "epoch": 0.27722666286040537, + "grad_norm": 2.0828003883361816, + "learning_rate": 4.1103181258249915e-05, + "loss": 4.2512, + "step": 46614 + }, + { + "epoch": 0.2772326101436864, + "grad_norm": 2.496203660964966, + "learning_rate": 4.110282396319632e-05, + "loss": 3.9597, + "step": 46615 + }, + { + "epoch": 0.27723855742696735, + "grad_norm": 1.8653347492218018, + "learning_rate": 4.110246666252139e-05, + "loss": 4.8443, + "step": 46616 + }, + { + "epoch": 0.27724450471024836, + "grad_norm": 2.794694423675537, + "learning_rate": 4.110210935622526e-05, + "loss": 3.4487, + "step": 46617 + }, + { + "epoch": 0.2772504519935294, + "grad_norm": 2.3978664875030518, + "learning_rate": 4.110175204430804e-05, + "loss": 3.3825, + "step": 46618 + }, + { + "epoch": 0.27725639927681034, + "grad_norm": 1.569859504699707, + "learning_rate": 4.110139472676987e-05, + "loss": 4.6945, + "step": 46619 + }, + { + "epoch": 0.27726234656009136, + "grad_norm": 1.7185262441635132, + "learning_rate": 4.1101037403610874e-05, + "loss": 4.0932, + "step": 46620 + }, + { + "epoch": 0.27726829384337237, + "grad_norm": 1.6326043605804443, + "learning_rate": 4.110068007483116e-05, + "loss": 4.6835, + "step": 46621 + }, + { + "epoch": 0.27727424112665333, + "grad_norm": 1.4808011054992676, + "learning_rate": 4.1100322740430887e-05, + "loss": 4.7739, + "step": 46622 + }, + { + "epoch": 0.27728018840993435, + "grad_norm": 1.553889513015747, + "learning_rate": 4.109996540041014e-05, + "loss": 4.5134, + "step": 46623 + }, + { + "epoch": 0.27728613569321536, + "grad_norm": 1.7781357765197754, + "learning_rate": 4.1099608054769066e-05, + "loss": 4.5673, + "step": 46624 + }, + { + "epoch": 0.2772920829764963, + "grad_norm": 1.6408711671829224, + "learning_rate": 4.1099250703507786e-05, + "loss": 4.544, + "step": 46625 + }, + { + "epoch": 0.27729803025977734, + "grad_norm": 1.6988234519958496, + "learning_rate": 4.109889334662642e-05, + "loss": 4.6653, + "step": 46626 + }, + { + "epoch": 0.27730397754305836, + "grad_norm": 1.7587025165557861, + "learning_rate": 4.10985359841251e-05, + "loss": 4.379, + "step": 46627 + }, + { + "epoch": 0.2773099248263393, + "grad_norm": 1.5394182205200195, + "learning_rate": 4.1098178616003944e-05, + "loss": 4.6047, + "step": 46628 + }, + { + "epoch": 0.27731587210962033, + "grad_norm": 1.4645212888717651, + "learning_rate": 4.109782124226309e-05, + "loss": 4.5576, + "step": 46629 + }, + { + "epoch": 0.27732181939290135, + "grad_norm": 1.6716169118881226, + "learning_rate": 4.109746386290264e-05, + "loss": 4.7532, + "step": 46630 + }, + { + "epoch": 0.2773277666761823, + "grad_norm": 1.655103325843811, + "learning_rate": 4.109710647792274e-05, + "loss": 4.3169, + "step": 46631 + }, + { + "epoch": 0.2773337139594633, + "grad_norm": 1.64310622215271, + "learning_rate": 4.10967490873235e-05, + "loss": 4.6022, + "step": 46632 + }, + { + "epoch": 0.27733966124274434, + "grad_norm": 1.787264108657837, + "learning_rate": 4.109639169110506e-05, + "loss": 4.1957, + "step": 46633 + }, + { + "epoch": 0.2773456085260253, + "grad_norm": 1.6696933507919312, + "learning_rate": 4.109603428926753e-05, + "loss": 4.0716, + "step": 46634 + }, + { + "epoch": 0.2773515558093063, + "grad_norm": 1.6592191457748413, + "learning_rate": 4.1095676881811044e-05, + "loss": 4.1968, + "step": 46635 + }, + { + "epoch": 0.27735750309258733, + "grad_norm": 2.119739532470703, + "learning_rate": 4.109531946873572e-05, + "loss": 3.7894, + "step": 46636 + }, + { + "epoch": 0.2773634503758683, + "grad_norm": 1.7451374530792236, + "learning_rate": 4.109496205004169e-05, + "loss": 4.1354, + "step": 46637 + }, + { + "epoch": 0.2773693976591493, + "grad_norm": 1.7495702505111694, + "learning_rate": 4.109460462572907e-05, + "loss": 4.2049, + "step": 46638 + }, + { + "epoch": 0.2773753449424303, + "grad_norm": 2.083242893218994, + "learning_rate": 4.1094247195797994e-05, + "loss": 4.1497, + "step": 46639 + }, + { + "epoch": 0.2773812922257113, + "grad_norm": 1.9318830966949463, + "learning_rate": 4.109388976024858e-05, + "loss": 4.8409, + "step": 46640 + }, + { + "epoch": 0.2773872395089923, + "grad_norm": 1.681640863418579, + "learning_rate": 4.109353231908096e-05, + "loss": 4.8762, + "step": 46641 + }, + { + "epoch": 0.2773931867922733, + "grad_norm": 1.8281761407852173, + "learning_rate": 4.109317487229525e-05, + "loss": 4.9403, + "step": 46642 + }, + { + "epoch": 0.2773991340755543, + "grad_norm": 1.6983622312545776, + "learning_rate": 4.109281741989158e-05, + "loss": 4.5235, + "step": 46643 + }, + { + "epoch": 0.2774050813588353, + "grad_norm": 1.617712140083313, + "learning_rate": 4.109245996187007e-05, + "loss": 4.5634, + "step": 46644 + }, + { + "epoch": 0.2774110286421163, + "grad_norm": 1.518476128578186, + "learning_rate": 4.1092102498230856e-05, + "loss": 4.8359, + "step": 46645 + }, + { + "epoch": 0.27741697592539727, + "grad_norm": 1.6564971208572388, + "learning_rate": 4.109174502897405e-05, + "loss": 4.5071, + "step": 46646 + }, + { + "epoch": 0.2774229232086783, + "grad_norm": 1.5421513319015503, + "learning_rate": 4.109138755409978e-05, + "loss": 4.6472, + "step": 46647 + }, + { + "epoch": 0.2774288704919593, + "grad_norm": 1.5882887840270996, + "learning_rate": 4.1091030073608185e-05, + "loss": 4.491, + "step": 46648 + }, + { + "epoch": 0.27743481777524026, + "grad_norm": 1.7268037796020508, + "learning_rate": 4.109067258749936e-05, + "loss": 4.4266, + "step": 46649 + }, + { + "epoch": 0.2774407650585213, + "grad_norm": 1.4740532636642456, + "learning_rate": 4.1090315095773456e-05, + "loss": 4.7533, + "step": 46650 + }, + { + "epoch": 0.27744671234180224, + "grad_norm": 1.4477468729019165, + "learning_rate": 4.10899575984306e-05, + "loss": 4.4458, + "step": 46651 + }, + { + "epoch": 0.27745265962508325, + "grad_norm": 1.4894468784332275, + "learning_rate": 4.108960009547089e-05, + "loss": 4.5721, + "step": 46652 + }, + { + "epoch": 0.27745860690836427, + "grad_norm": 1.7010372877120972, + "learning_rate": 4.108924258689447e-05, + "loss": 4.3404, + "step": 46653 + }, + { + "epoch": 0.27746455419164523, + "grad_norm": 1.4919700622558594, + "learning_rate": 4.1088885072701466e-05, + "loss": 4.5516, + "step": 46654 + }, + { + "epoch": 0.27747050147492625, + "grad_norm": 1.8478305339813232, + "learning_rate": 4.1088527552892e-05, + "loss": 4.4408, + "step": 46655 + }, + { + "epoch": 0.27747644875820726, + "grad_norm": 1.520166277885437, + "learning_rate": 4.108817002746619e-05, + "loss": 4.8544, + "step": 46656 + }, + { + "epoch": 0.2774823960414882, + "grad_norm": 1.652934193611145, + "learning_rate": 4.108781249642416e-05, + "loss": 4.6079, + "step": 46657 + }, + { + "epoch": 0.27748834332476924, + "grad_norm": 1.4976013898849487, + "learning_rate": 4.108745495976605e-05, + "loss": 4.5565, + "step": 46658 + }, + { + "epoch": 0.27749429060805025, + "grad_norm": 1.7443841695785522, + "learning_rate": 4.1087097417491976e-05, + "loss": 4.6872, + "step": 46659 + }, + { + "epoch": 0.2775002378913312, + "grad_norm": 1.7703777551651, + "learning_rate": 4.1086739869602066e-05, + "loss": 4.6735, + "step": 46660 + }, + { + "epoch": 0.27750618517461223, + "grad_norm": 1.3490688800811768, + "learning_rate": 4.108638231609643e-05, + "loss": 4.6802, + "step": 46661 + }, + { + "epoch": 0.27751213245789325, + "grad_norm": 1.6701551675796509, + "learning_rate": 4.108602475697521e-05, + "loss": 4.4599, + "step": 46662 + }, + { + "epoch": 0.2775180797411742, + "grad_norm": 1.6563005447387695, + "learning_rate": 4.108566719223853e-05, + "loss": 4.5101, + "step": 46663 + }, + { + "epoch": 0.2775240270244552, + "grad_norm": 1.7274457216262817, + "learning_rate": 4.10853096218865e-05, + "loss": 4.1944, + "step": 46664 + }, + { + "epoch": 0.27752997430773624, + "grad_norm": 1.5243332386016846, + "learning_rate": 4.1084952045919254e-05, + "loss": 4.781, + "step": 46665 + }, + { + "epoch": 0.2775359215910172, + "grad_norm": 1.8296623229980469, + "learning_rate": 4.108459446433693e-05, + "loss": 4.3216, + "step": 46666 + }, + { + "epoch": 0.2775418688742982, + "grad_norm": 1.8790827989578247, + "learning_rate": 4.108423687713963e-05, + "loss": 4.2041, + "step": 46667 + }, + { + "epoch": 0.27754781615757923, + "grad_norm": 1.590769648551941, + "learning_rate": 4.10838792843275e-05, + "loss": 4.5927, + "step": 46668 + }, + { + "epoch": 0.2775537634408602, + "grad_norm": 1.7301621437072754, + "learning_rate": 4.108352168590064e-05, + "loss": 4.3792, + "step": 46669 + }, + { + "epoch": 0.2775597107241412, + "grad_norm": 1.4497694969177246, + "learning_rate": 4.10831640818592e-05, + "loss": 4.6335, + "step": 46670 + }, + { + "epoch": 0.2775656580074222, + "grad_norm": 1.4611026048660278, + "learning_rate": 4.1082806472203286e-05, + "loss": 4.5802, + "step": 46671 + }, + { + "epoch": 0.2775716052907032, + "grad_norm": 1.3935585021972656, + "learning_rate": 4.108244885693303e-05, + "loss": 4.7354, + "step": 46672 + }, + { + "epoch": 0.2775775525739842, + "grad_norm": 6.4151291847229, + "learning_rate": 4.108209123604856e-05, + "loss": 4.0627, + "step": 46673 + }, + { + "epoch": 0.2775834998572652, + "grad_norm": 5.552521705627441, + "learning_rate": 4.108173360955e-05, + "loss": 4.2566, + "step": 46674 + }, + { + "epoch": 0.2775894471405462, + "grad_norm": 4.303483486175537, + "learning_rate": 4.1081375977437465e-05, + "loss": 4.1605, + "step": 46675 + }, + { + "epoch": 0.2775953944238272, + "grad_norm": 4.459486961364746, + "learning_rate": 4.10810183397111e-05, + "loss": 3.7275, + "step": 46676 + }, + { + "epoch": 0.2776013417071082, + "grad_norm": 2.863260507583618, + "learning_rate": 4.108066069637101e-05, + "loss": 3.753, + "step": 46677 + }, + { + "epoch": 0.27760728899038917, + "grad_norm": 1.8464617729187012, + "learning_rate": 4.1080303047417326e-05, + "loss": 4.2935, + "step": 46678 + }, + { + "epoch": 0.2776132362736702, + "grad_norm": 3.5195529460906982, + "learning_rate": 4.1079945392850184e-05, + "loss": 3.9518, + "step": 46679 + }, + { + "epoch": 0.2776191835569512, + "grad_norm": 3.344593048095703, + "learning_rate": 4.1079587732669686e-05, + "loss": 3.3773, + "step": 46680 + }, + { + "epoch": 0.27762513084023216, + "grad_norm": 3.3535921573638916, + "learning_rate": 4.107923006687598e-05, + "loss": 3.7141, + "step": 46681 + }, + { + "epoch": 0.2776310781235132, + "grad_norm": 3.267660617828369, + "learning_rate": 4.107887239546918e-05, + "loss": 3.7122, + "step": 46682 + }, + { + "epoch": 0.2776370254067942, + "grad_norm": 2.5228629112243652, + "learning_rate": 4.1078514718449404e-05, + "loss": 3.0766, + "step": 46683 + }, + { + "epoch": 0.27764297269007515, + "grad_norm": 2.195908546447754, + "learning_rate": 4.1078157035816786e-05, + "loss": 3.8678, + "step": 46684 + }, + { + "epoch": 0.27764891997335617, + "grad_norm": 2.6280879974365234, + "learning_rate": 4.107779934757146e-05, + "loss": 3.4534, + "step": 46685 + }, + { + "epoch": 0.2776548672566372, + "grad_norm": 1.7109944820404053, + "learning_rate": 4.107744165371353e-05, + "loss": 4.712, + "step": 46686 + }, + { + "epoch": 0.27766081453991814, + "grad_norm": 2.4528918266296387, + "learning_rate": 4.1077083954243134e-05, + "loss": 3.708, + "step": 46687 + }, + { + "epoch": 0.27766676182319916, + "grad_norm": 3.3653573989868164, + "learning_rate": 4.1076726249160403e-05, + "loss": 3.5637, + "step": 46688 + }, + { + "epoch": 0.2776727091064802, + "grad_norm": 2.6774098873138428, + "learning_rate": 4.1076368538465436e-05, + "loss": 3.498, + "step": 46689 + }, + { + "epoch": 0.27767865638976114, + "grad_norm": 2.769296646118164, + "learning_rate": 4.1076010822158387e-05, + "loss": 3.2957, + "step": 46690 + }, + { + "epoch": 0.27768460367304215, + "grad_norm": 2.5441455841064453, + "learning_rate": 4.1075653100239365e-05, + "loss": 3.2203, + "step": 46691 + }, + { + "epoch": 0.27769055095632317, + "grad_norm": 2.5100290775299072, + "learning_rate": 4.10752953727085e-05, + "loss": 3.4987, + "step": 46692 + }, + { + "epoch": 0.27769649823960413, + "grad_norm": 2.446728467941284, + "learning_rate": 4.107493763956591e-05, + "loss": 3.7065, + "step": 46693 + }, + { + "epoch": 0.27770244552288514, + "grad_norm": 2.339047431945801, + "learning_rate": 4.107457990081174e-05, + "loss": 3.327, + "step": 46694 + }, + { + "epoch": 0.27770839280616616, + "grad_norm": 2.2049381732940674, + "learning_rate": 4.1074222156446085e-05, + "loss": 3.3429, + "step": 46695 + }, + { + "epoch": 0.2777143400894471, + "grad_norm": 2.4554476737976074, + "learning_rate": 4.107386440646909e-05, + "loss": 3.4914, + "step": 46696 + }, + { + "epoch": 0.27772028737272814, + "grad_norm": 1.6699353456497192, + "learning_rate": 4.1073506650880875e-05, + "loss": 4.4989, + "step": 46697 + }, + { + "epoch": 0.27772623465600915, + "grad_norm": 1.533889651298523, + "learning_rate": 4.107314888968157e-05, + "loss": 4.2403, + "step": 46698 + }, + { + "epoch": 0.2777321819392901, + "grad_norm": 1.6007006168365479, + "learning_rate": 4.107279112287128e-05, + "loss": 4.7418, + "step": 46699 + }, + { + "epoch": 0.27773812922257113, + "grad_norm": 1.4934629201889038, + "learning_rate": 4.107243335045016e-05, + "loss": 4.8473, + "step": 46700 + }, + { + "epoch": 0.27774407650585214, + "grad_norm": 1.4539875984191895, + "learning_rate": 4.1072075572418314e-05, + "loss": 4.6629, + "step": 46701 + }, + { + "epoch": 0.2777500237891331, + "grad_norm": 2.1909356117248535, + "learning_rate": 4.107171778877588e-05, + "loss": 3.0438, + "step": 46702 + }, + { + "epoch": 0.2777559710724141, + "grad_norm": 1.4171408414840698, + "learning_rate": 4.107135999952296e-05, + "loss": 4.3432, + "step": 46703 + }, + { + "epoch": 0.27776191835569514, + "grad_norm": 1.4382119178771973, + "learning_rate": 4.10710022046597e-05, + "loss": 4.1575, + "step": 46704 + }, + { + "epoch": 0.2777678656389761, + "grad_norm": 1.4816005229949951, + "learning_rate": 4.1070644404186226e-05, + "loss": 4.2991, + "step": 46705 + }, + { + "epoch": 0.2777738129222571, + "grad_norm": 1.5718729496002197, + "learning_rate": 4.107028659810265e-05, + "loss": 4.4914, + "step": 46706 + }, + { + "epoch": 0.27777976020553813, + "grad_norm": 1.1101219654083252, + "learning_rate": 4.1069928786409106e-05, + "loss": 4.9567, + "step": 46707 + }, + { + "epoch": 0.2777857074888191, + "grad_norm": 1.6189651489257812, + "learning_rate": 4.106957096910571e-05, + "loss": 4.1756, + "step": 46708 + }, + { + "epoch": 0.2777916547721001, + "grad_norm": 1.343074917793274, + "learning_rate": 4.1069213146192607e-05, + "loss": 4.3923, + "step": 46709 + }, + { + "epoch": 0.2777976020553811, + "grad_norm": 1.605444073677063, + "learning_rate": 4.1068855317669894e-05, + "loss": 4.1503, + "step": 46710 + }, + { + "epoch": 0.2778035493386621, + "grad_norm": 1.5400902032852173, + "learning_rate": 4.106849748353771e-05, + "loss": 4.1455, + "step": 46711 + }, + { + "epoch": 0.2778094966219431, + "grad_norm": 1.6649471521377563, + "learning_rate": 4.106813964379618e-05, + "loss": 4.0107, + "step": 46712 + }, + { + "epoch": 0.2778154439052241, + "grad_norm": 1.6272038221359253, + "learning_rate": 4.106778179844544e-05, + "loss": 4.0326, + "step": 46713 + }, + { + "epoch": 0.2778213911885051, + "grad_norm": 1.9331778287887573, + "learning_rate": 4.106742394748559e-05, + "loss": 3.5096, + "step": 46714 + }, + { + "epoch": 0.2778273384717861, + "grad_norm": 1.9401901960372925, + "learning_rate": 4.1067066090916775e-05, + "loss": 2.8117, + "step": 46715 + }, + { + "epoch": 0.2778332857550671, + "grad_norm": 2.487551689147949, + "learning_rate": 4.106670822873911e-05, + "loss": 1.8008, + "step": 46716 + }, + { + "epoch": 0.27783923303834807, + "grad_norm": 2.3082776069641113, + "learning_rate": 4.106635036095272e-05, + "loss": 1.7838, + "step": 46717 + }, + { + "epoch": 0.2778451803216291, + "grad_norm": 1.884759545326233, + "learning_rate": 4.106599248755775e-05, + "loss": 3.4894, + "step": 46718 + }, + { + "epoch": 0.2778511276049101, + "grad_norm": 1.5791832208633423, + "learning_rate": 4.1065634608554285e-05, + "loss": 4.0227, + "step": 46719 + }, + { + "epoch": 0.27785707488819106, + "grad_norm": 1.8383958339691162, + "learning_rate": 4.106527672394248e-05, + "loss": 3.269, + "step": 46720 + }, + { + "epoch": 0.2778630221714721, + "grad_norm": 1.6925534009933472, + "learning_rate": 4.1064918833722465e-05, + "loss": 3.5611, + "step": 46721 + }, + { + "epoch": 0.2778689694547531, + "grad_norm": 2.022456169128418, + "learning_rate": 4.1064560937894345e-05, + "loss": 2.6968, + "step": 46722 + }, + { + "epoch": 0.27787491673803405, + "grad_norm": 1.8386943340301514, + "learning_rate": 4.106420303645825e-05, + "loss": 3.1887, + "step": 46723 + }, + { + "epoch": 0.27788086402131507, + "grad_norm": 1.9167397022247314, + "learning_rate": 4.106384512941431e-05, + "loss": 3.737, + "step": 46724 + }, + { + "epoch": 0.2778868113045961, + "grad_norm": 1.8212589025497437, + "learning_rate": 4.1063487216762645e-05, + "loss": 4.1316, + "step": 46725 + }, + { + "epoch": 0.27789275858787704, + "grad_norm": 2.738624095916748, + "learning_rate": 4.1063129298503385e-05, + "loss": 1.9773, + "step": 46726 + }, + { + "epoch": 0.27789870587115806, + "grad_norm": 1.941107153892517, + "learning_rate": 4.106277137463665e-05, + "loss": 2.8326, + "step": 46727 + }, + { + "epoch": 0.2779046531544391, + "grad_norm": 2.302375555038452, + "learning_rate": 4.1062413445162574e-05, + "loss": 2.3633, + "step": 46728 + }, + { + "epoch": 0.27791060043772003, + "grad_norm": 2.105940341949463, + "learning_rate": 4.106205551008127e-05, + "loss": 3.3325, + "step": 46729 + }, + { + "epoch": 0.27791654772100105, + "grad_norm": 2.175410509109497, + "learning_rate": 4.106169756939287e-05, + "loss": 3.2416, + "step": 46730 + }, + { + "epoch": 0.27792249500428207, + "grad_norm": 2.8404812812805176, + "learning_rate": 4.106133962309749e-05, + "loss": 2.4226, + "step": 46731 + }, + { + "epoch": 0.277928442287563, + "grad_norm": 3.341418981552124, + "learning_rate": 4.106098167119528e-05, + "loss": 1.2972, + "step": 46732 + }, + { + "epoch": 0.27793438957084404, + "grad_norm": 3.1397910118103027, + "learning_rate": 4.106062371368633e-05, + "loss": 1.2315, + "step": 46733 + }, + { + "epoch": 0.27794033685412506, + "grad_norm": 2.082782030105591, + "learning_rate": 4.106026575057079e-05, + "loss": 3.9093, + "step": 46734 + }, + { + "epoch": 0.277946284137406, + "grad_norm": 2.2418317794799805, + "learning_rate": 4.105990778184877e-05, + "loss": 3.6053, + "step": 46735 + }, + { + "epoch": 0.27795223142068703, + "grad_norm": 2.4902756214141846, + "learning_rate": 4.1059549807520415e-05, + "loss": 3.0914, + "step": 46736 + }, + { + "epoch": 0.27795817870396805, + "grad_norm": 1.9674242734909058, + "learning_rate": 4.105919182758583e-05, + "loss": 3.3964, + "step": 46737 + }, + { + "epoch": 0.277964125987249, + "grad_norm": 1.8481380939483643, + "learning_rate": 4.105883384204514e-05, + "loss": 4.3116, + "step": 46738 + }, + { + "epoch": 0.27797007327053, + "grad_norm": 2.047734022140503, + "learning_rate": 4.105847585089849e-05, + "loss": 4.599, + "step": 46739 + }, + { + "epoch": 0.27797602055381104, + "grad_norm": 3.1164069175720215, + "learning_rate": 4.105811785414598e-05, + "loss": 2.8529, + "step": 46740 + }, + { + "epoch": 0.277981967837092, + "grad_norm": 2.673550605773926, + "learning_rate": 4.1057759851787756e-05, + "loss": 2.2711, + "step": 46741 + }, + { + "epoch": 0.277987915120373, + "grad_norm": 2.335216760635376, + "learning_rate": 4.105740184382394e-05, + "loss": 2.7811, + "step": 46742 + }, + { + "epoch": 0.27799386240365404, + "grad_norm": 1.5046519041061401, + "learning_rate": 4.1057043830254635e-05, + "loss": 4.7404, + "step": 46743 + }, + { + "epoch": 0.277999809686935, + "grad_norm": 1.6280672550201416, + "learning_rate": 4.1056685811079985e-05, + "loss": 4.7593, + "step": 46744 + }, + { + "epoch": 0.278005756970216, + "grad_norm": 1.5052810907363892, + "learning_rate": 4.105632778630012e-05, + "loss": 4.8245, + "step": 46745 + }, + { + "epoch": 0.278011704253497, + "grad_norm": 1.6290700435638428, + "learning_rate": 4.105596975591515e-05, + "loss": 4.9406, + "step": 46746 + }, + { + "epoch": 0.278017651536778, + "grad_norm": 1.6377136707305908, + "learning_rate": 4.1055611719925215e-05, + "loss": 5.0752, + "step": 46747 + }, + { + "epoch": 0.278023598820059, + "grad_norm": 1.4774062633514404, + "learning_rate": 4.105525367833042e-05, + "loss": 4.809, + "step": 46748 + }, + { + "epoch": 0.27802954610334, + "grad_norm": 1.9276518821716309, + "learning_rate": 4.105489563113091e-05, + "loss": 4.2515, + "step": 46749 + }, + { + "epoch": 0.278035493386621, + "grad_norm": 1.5889195203781128, + "learning_rate": 4.10545375783268e-05, + "loss": 4.8577, + "step": 46750 + }, + { + "epoch": 0.278041440669902, + "grad_norm": 1.6109524965286255, + "learning_rate": 4.1054179519918215e-05, + "loss": 4.7002, + "step": 46751 + }, + { + "epoch": 0.278047387953183, + "grad_norm": 1.641481637954712, + "learning_rate": 4.1053821455905286e-05, + "loss": 4.2487, + "step": 46752 + }, + { + "epoch": 0.27805333523646397, + "grad_norm": 2.154026985168457, + "learning_rate": 4.105346338628813e-05, + "loss": 3.3245, + "step": 46753 + }, + { + "epoch": 0.278059282519745, + "grad_norm": 2.1679933071136475, + "learning_rate": 4.1053105311066875e-05, + "loss": 3.9969, + "step": 46754 + }, + { + "epoch": 0.278065229803026, + "grad_norm": 1.8858659267425537, + "learning_rate": 4.105274723024165e-05, + "loss": 4.1339, + "step": 46755 + }, + { + "epoch": 0.27807117708630696, + "grad_norm": 1.727256178855896, + "learning_rate": 4.105238914381258e-05, + "loss": 3.6551, + "step": 46756 + }, + { + "epoch": 0.278077124369588, + "grad_norm": 2.025122880935669, + "learning_rate": 4.1052031051779786e-05, + "loss": 4.198, + "step": 46757 + }, + { + "epoch": 0.278083071652869, + "grad_norm": 3.132181167602539, + "learning_rate": 4.105167295414338e-05, + "loss": 2.117, + "step": 46758 + }, + { + "epoch": 0.27808901893614996, + "grad_norm": 3.0021090507507324, + "learning_rate": 4.105131485090351e-05, + "loss": 1.8731, + "step": 46759 + }, + { + "epoch": 0.278094966219431, + "grad_norm": 3.8711633682250977, + "learning_rate": 4.10509567420603e-05, + "loss": 1.7873, + "step": 46760 + }, + { + "epoch": 0.278100913502712, + "grad_norm": 3.0541129112243652, + "learning_rate": 4.105059862761386e-05, + "loss": 1.8924, + "step": 46761 + }, + { + "epoch": 0.27810686078599295, + "grad_norm": 3.220463752746582, + "learning_rate": 4.105024050756431e-05, + "loss": 2.3427, + "step": 46762 + }, + { + "epoch": 0.27811280806927396, + "grad_norm": 3.2135376930236816, + "learning_rate": 4.1049882381911807e-05, + "loss": 2.3005, + "step": 46763 + }, + { + "epoch": 0.278118755352555, + "grad_norm": 2.7550575733184814, + "learning_rate": 4.104952425065645e-05, + "loss": 1.5743, + "step": 46764 + }, + { + "epoch": 0.27812470263583594, + "grad_norm": 2.980232000350952, + "learning_rate": 4.104916611379836e-05, + "loss": 1.9742, + "step": 46765 + }, + { + "epoch": 0.27813064991911696, + "grad_norm": 3.0249102115631104, + "learning_rate": 4.104880797133768e-05, + "loss": 2.2123, + "step": 46766 + }, + { + "epoch": 0.2781365972023979, + "grad_norm": 3.094075918197632, + "learning_rate": 4.1048449823274526e-05, + "loss": 2.157, + "step": 46767 + }, + { + "epoch": 0.27814254448567893, + "grad_norm": 2.8420021533966064, + "learning_rate": 4.104809166960902e-05, + "loss": 2.1125, + "step": 46768 + }, + { + "epoch": 0.27814849176895995, + "grad_norm": 2.738314390182495, + "learning_rate": 4.10477335103413e-05, + "loss": 1.9546, + "step": 46769 + }, + { + "epoch": 0.2781544390522409, + "grad_norm": 2.9046380519866943, + "learning_rate": 4.1047375345471476e-05, + "loss": 2.0959, + "step": 46770 + }, + { + "epoch": 0.2781603863355219, + "grad_norm": 2.7830822467803955, + "learning_rate": 4.104701717499968e-05, + "loss": 2.1278, + "step": 46771 + }, + { + "epoch": 0.27816633361880294, + "grad_norm": 2.544728994369507, + "learning_rate": 4.104665899892603e-05, + "loss": 1.9998, + "step": 46772 + }, + { + "epoch": 0.2781722809020839, + "grad_norm": 2.6995949745178223, + "learning_rate": 4.104630081725067e-05, + "loss": 2.2807, + "step": 46773 + }, + { + "epoch": 0.2781782281853649, + "grad_norm": 2.6362433433532715, + "learning_rate": 4.10459426299737e-05, + "loss": 2.0404, + "step": 46774 + }, + { + "epoch": 0.27818417546864593, + "grad_norm": 2.635755777359009, + "learning_rate": 4.104558443709527e-05, + "loss": 2.1238, + "step": 46775 + }, + { + "epoch": 0.2781901227519269, + "grad_norm": 2.496812343597412, + "learning_rate": 4.104522623861548e-05, + "loss": 1.9943, + "step": 46776 + }, + { + "epoch": 0.2781960700352079, + "grad_norm": 2.727982521057129, + "learning_rate": 4.104486803453448e-05, + "loss": 2.0877, + "step": 46777 + }, + { + "epoch": 0.2782020173184889, + "grad_norm": 1.913494348526001, + "learning_rate": 4.104450982485237e-05, + "loss": 4.1287, + "step": 46778 + }, + { + "epoch": 0.2782079646017699, + "grad_norm": 1.851405382156372, + "learning_rate": 4.1044151609569295e-05, + "loss": 4.8234, + "step": 46779 + }, + { + "epoch": 0.2782139118850509, + "grad_norm": 2.6272244453430176, + "learning_rate": 4.104379338868536e-05, + "loss": 3.3754, + "step": 46780 + }, + { + "epoch": 0.2782198591683319, + "grad_norm": 1.650818109512329, + "learning_rate": 4.1043435162200714e-05, + "loss": 4.5767, + "step": 46781 + }, + { + "epoch": 0.2782258064516129, + "grad_norm": 2.131619930267334, + "learning_rate": 4.104307693011546e-05, + "loss": 3.7643, + "step": 46782 + }, + { + "epoch": 0.2782317537348939, + "grad_norm": 1.6397521495819092, + "learning_rate": 4.104271869242975e-05, + "loss": 4.8097, + "step": 46783 + }, + { + "epoch": 0.2782377010181749, + "grad_norm": 2.1637661457061768, + "learning_rate": 4.104236044914369e-05, + "loss": 3.7239, + "step": 46784 + }, + { + "epoch": 0.27824364830145587, + "grad_norm": 2.164607286453247, + "learning_rate": 4.10420022002574e-05, + "loss": 2.9417, + "step": 46785 + }, + { + "epoch": 0.2782495955847369, + "grad_norm": 1.7863608598709106, + "learning_rate": 4.104164394577102e-05, + "loss": 3.9874, + "step": 46786 + }, + { + "epoch": 0.2782555428680179, + "grad_norm": 1.3696117401123047, + "learning_rate": 4.104128568568466e-05, + "loss": 4.9933, + "step": 46787 + }, + { + "epoch": 0.27826149015129886, + "grad_norm": 1.4619613885879517, + "learning_rate": 4.104092741999845e-05, + "loss": 4.8746, + "step": 46788 + }, + { + "epoch": 0.2782674374345799, + "grad_norm": 1.4306750297546387, + "learning_rate": 4.104056914871253e-05, + "loss": 4.0899, + "step": 46789 + }, + { + "epoch": 0.2782733847178609, + "grad_norm": 1.4780261516571045, + "learning_rate": 4.1040210871827e-05, + "loss": 4.6075, + "step": 46790 + }, + { + "epoch": 0.27827933200114185, + "grad_norm": 1.6758196353912354, + "learning_rate": 4.1039852589342006e-05, + "loss": 4.8894, + "step": 46791 + }, + { + "epoch": 0.27828527928442287, + "grad_norm": 1.6984118223190308, + "learning_rate": 4.1039494301257665e-05, + "loss": 4.9177, + "step": 46792 + }, + { + "epoch": 0.2782912265677039, + "grad_norm": 1.6205167770385742, + "learning_rate": 4.10391360075741e-05, + "loss": 4.8212, + "step": 46793 + }, + { + "epoch": 0.27829717385098485, + "grad_norm": 2.82548189163208, + "learning_rate": 4.1038777708291445e-05, + "loss": 3.7551, + "step": 46794 + }, + { + "epoch": 0.27830312113426586, + "grad_norm": 1.3904260396957397, + "learning_rate": 4.103841940340981e-05, + "loss": 4.147, + "step": 46795 + }, + { + "epoch": 0.2783090684175469, + "grad_norm": 1.4862178564071655, + "learning_rate": 4.103806109292932e-05, + "loss": 3.9494, + "step": 46796 + }, + { + "epoch": 0.27831501570082784, + "grad_norm": 1.6481938362121582, + "learning_rate": 4.103770277685013e-05, + "loss": 4.5754, + "step": 46797 + }, + { + "epoch": 0.27832096298410886, + "grad_norm": 1.5239589214324951, + "learning_rate": 4.103734445517233e-05, + "loss": 4.999, + "step": 46798 + }, + { + "epoch": 0.27832691026738987, + "grad_norm": 1.600292444229126, + "learning_rate": 4.103698612789606e-05, + "loss": 4.6831, + "step": 46799 + }, + { + "epoch": 0.27833285755067083, + "grad_norm": 1.5685478448867798, + "learning_rate": 4.103662779502145e-05, + "loss": 4.9399, + "step": 46800 + }, + { + "epoch": 0.27833880483395185, + "grad_norm": 1.8865615129470825, + "learning_rate": 4.103626945654862e-05, + "loss": 4.1952, + "step": 46801 + }, + { + "epoch": 0.27834475211723286, + "grad_norm": 1.5495024919509888, + "learning_rate": 4.103591111247769e-05, + "loss": 4.7062, + "step": 46802 + }, + { + "epoch": 0.2783506994005138, + "grad_norm": 1.59615957736969, + "learning_rate": 4.103555276280878e-05, + "loss": 4.3299, + "step": 46803 + }, + { + "epoch": 0.27835664668379484, + "grad_norm": 1.5535348653793335, + "learning_rate": 4.1035194407542035e-05, + "loss": 4.5598, + "step": 46804 + }, + { + "epoch": 0.27836259396707586, + "grad_norm": 1.583910346031189, + "learning_rate": 4.103483604667757e-05, + "loss": 4.3943, + "step": 46805 + }, + { + "epoch": 0.2783685412503568, + "grad_norm": 1.53197181224823, + "learning_rate": 4.10344776802155e-05, + "loss": 4.4682, + "step": 46806 + }, + { + "epoch": 0.27837448853363783, + "grad_norm": 1.6811991930007935, + "learning_rate": 4.103411930815597e-05, + "loss": 4.2561, + "step": 46807 + }, + { + "epoch": 0.27838043581691885, + "grad_norm": 1.6066415309906006, + "learning_rate": 4.103376093049909e-05, + "loss": 4.4796, + "step": 46808 + }, + { + "epoch": 0.2783863831001998, + "grad_norm": 1.6465378999710083, + "learning_rate": 4.1033402547244987e-05, + "loss": 4.3328, + "step": 46809 + }, + { + "epoch": 0.2783923303834808, + "grad_norm": 1.6663148403167725, + "learning_rate": 4.10330441583938e-05, + "loss": 4.6754, + "step": 46810 + }, + { + "epoch": 0.27839827766676184, + "grad_norm": 1.634717583656311, + "learning_rate": 4.1032685763945625e-05, + "loss": 4.7626, + "step": 46811 + }, + { + "epoch": 0.2784042249500428, + "grad_norm": 1.6264578104019165, + "learning_rate": 4.103232736390061e-05, + "loss": 4.7038, + "step": 46812 + }, + { + "epoch": 0.2784101722333238, + "grad_norm": 1.4354326725006104, + "learning_rate": 4.1031968958258885e-05, + "loss": 4.555, + "step": 46813 + }, + { + "epoch": 0.27841611951660483, + "grad_norm": 1.4528725147247314, + "learning_rate": 4.103161054702056e-05, + "loss": 4.7271, + "step": 46814 + }, + { + "epoch": 0.2784220667998858, + "grad_norm": 1.6330482959747314, + "learning_rate": 4.103125213018576e-05, + "loss": 4.6022, + "step": 46815 + }, + { + "epoch": 0.2784280140831668, + "grad_norm": 2.0788915157318115, + "learning_rate": 4.103089370775462e-05, + "loss": 3.8156, + "step": 46816 + }, + { + "epoch": 0.2784339613664478, + "grad_norm": 2.5235724449157715, + "learning_rate": 4.1030535279727266e-05, + "loss": 4.0136, + "step": 46817 + }, + { + "epoch": 0.2784399086497288, + "grad_norm": 2.3023970127105713, + "learning_rate": 4.103017684610382e-05, + "loss": 3.4347, + "step": 46818 + }, + { + "epoch": 0.2784458559330098, + "grad_norm": 2.5753304958343506, + "learning_rate": 4.102981840688439e-05, + "loss": 2.9085, + "step": 46819 + }, + { + "epoch": 0.2784518032162908, + "grad_norm": 2.926760196685791, + "learning_rate": 4.102945996206913e-05, + "loss": 3.1996, + "step": 46820 + }, + { + "epoch": 0.2784577504995718, + "grad_norm": 3.155500888824463, + "learning_rate": 4.102910151165814e-05, + "loss": 4.0151, + "step": 46821 + }, + { + "epoch": 0.2784636977828528, + "grad_norm": 2.9261703491210938, + "learning_rate": 4.102874305565157e-05, + "loss": 3.37, + "step": 46822 + }, + { + "epoch": 0.2784696450661338, + "grad_norm": 2.4826197624206543, + "learning_rate": 4.102838459404952e-05, + "loss": 3.0528, + "step": 46823 + }, + { + "epoch": 0.27847559234941477, + "grad_norm": 2.0930888652801514, + "learning_rate": 4.102802612685213e-05, + "loss": 3.7234, + "step": 46824 + }, + { + "epoch": 0.2784815396326958, + "grad_norm": 2.806626081466675, + "learning_rate": 4.102766765405952e-05, + "loss": 3.3851, + "step": 46825 + }, + { + "epoch": 0.2784874869159768, + "grad_norm": 2.573981761932373, + "learning_rate": 4.102730917567182e-05, + "loss": 3.3579, + "step": 46826 + }, + { + "epoch": 0.27849343419925776, + "grad_norm": 2.7713024616241455, + "learning_rate": 4.1026950691689146e-05, + "loss": 3.1161, + "step": 46827 + }, + { + "epoch": 0.2784993814825388, + "grad_norm": 2.6597588062286377, + "learning_rate": 4.102659220211164e-05, + "loss": 2.839, + "step": 46828 + }, + { + "epoch": 0.2785053287658198, + "grad_norm": 2.7039999961853027, + "learning_rate": 4.1026233706939404e-05, + "loss": 3.3876, + "step": 46829 + }, + { + "epoch": 0.27851127604910075, + "grad_norm": 1.5531891584396362, + "learning_rate": 4.1025875206172584e-05, + "loss": 4.4272, + "step": 46830 + }, + { + "epoch": 0.27851722333238177, + "grad_norm": 1.5701428651809692, + "learning_rate": 4.1025516699811295e-05, + "loss": 4.2126, + "step": 46831 + }, + { + "epoch": 0.2785231706156628, + "grad_norm": 1.834628939628601, + "learning_rate": 4.102515818785566e-05, + "loss": 4.1026, + "step": 46832 + }, + { + "epoch": 0.27852911789894375, + "grad_norm": 1.584385871887207, + "learning_rate": 4.10247996703058e-05, + "loss": 4.3411, + "step": 46833 + }, + { + "epoch": 0.27853506518222476, + "grad_norm": 1.7167937755584717, + "learning_rate": 4.102444114716186e-05, + "loss": 4.2669, + "step": 46834 + }, + { + "epoch": 0.2785410124655058, + "grad_norm": 1.4143896102905273, + "learning_rate": 4.1024082618423955e-05, + "loss": 4.5742, + "step": 46835 + }, + { + "epoch": 0.27854695974878674, + "grad_norm": 1.7796612977981567, + "learning_rate": 4.102372408409221e-05, + "loss": 4.3468, + "step": 46836 + }, + { + "epoch": 0.27855290703206775, + "grad_norm": 1.539900302886963, + "learning_rate": 4.102336554416674e-05, + "loss": 4.5944, + "step": 46837 + }, + { + "epoch": 0.27855885431534877, + "grad_norm": 1.3041605949401855, + "learning_rate": 4.102300699864768e-05, + "loss": 4.8733, + "step": 46838 + }, + { + "epoch": 0.27856480159862973, + "grad_norm": 1.5023130178451538, + "learning_rate": 4.102264844753515e-05, + "loss": 4.5062, + "step": 46839 + }, + { + "epoch": 0.27857074888191075, + "grad_norm": 1.4967166185379028, + "learning_rate": 4.102228989082929e-05, + "loss": 4.7341, + "step": 46840 + }, + { + "epoch": 0.27857669616519176, + "grad_norm": 2.1963417530059814, + "learning_rate": 4.1021931328530205e-05, + "loss": 4.5124, + "step": 46841 + }, + { + "epoch": 0.2785826434484727, + "grad_norm": 1.9562112092971802, + "learning_rate": 4.102157276063803e-05, + "loss": 4.6288, + "step": 46842 + }, + { + "epoch": 0.27858859073175374, + "grad_norm": 1.6128381490707397, + "learning_rate": 4.10212141871529e-05, + "loss": 4.5702, + "step": 46843 + }, + { + "epoch": 0.27859453801503475, + "grad_norm": 1.815293550491333, + "learning_rate": 4.102085560807491e-05, + "loss": 4.4514, + "step": 46844 + }, + { + "epoch": 0.2786004852983157, + "grad_norm": 1.7041912078857422, + "learning_rate": 4.102049702340423e-05, + "loss": 4.7105, + "step": 46845 + }, + { + "epoch": 0.27860643258159673, + "grad_norm": 1.7193024158477783, + "learning_rate": 4.1020138433140944e-05, + "loss": 4.6581, + "step": 46846 + }, + { + "epoch": 0.27861237986487775, + "grad_norm": 2.1859796047210693, + "learning_rate": 4.101977983728519e-05, + "loss": 4.5398, + "step": 46847 + }, + { + "epoch": 0.2786183271481587, + "grad_norm": 1.3333501815795898, + "learning_rate": 4.10194212358371e-05, + "loss": 4.4326, + "step": 46848 + }, + { + "epoch": 0.2786242744314397, + "grad_norm": 1.659326195716858, + "learning_rate": 4.1019062628796803e-05, + "loss": 4.6144, + "step": 46849 + }, + { + "epoch": 0.27863022171472074, + "grad_norm": 1.676741600036621, + "learning_rate": 4.101870401616441e-05, + "loss": 4.4832, + "step": 46850 + }, + { + "epoch": 0.2786361689980017, + "grad_norm": 1.6358340978622437, + "learning_rate": 4.101834539794005e-05, + "loss": 4.4099, + "step": 46851 + }, + { + "epoch": 0.2786421162812827, + "grad_norm": 1.9698513746261597, + "learning_rate": 4.101798677412386e-05, + "loss": 4.2939, + "step": 46852 + }, + { + "epoch": 0.27864806356456373, + "grad_norm": 1.6493223905563354, + "learning_rate": 4.1017628144715953e-05, + "loss": 3.9881, + "step": 46853 + }, + { + "epoch": 0.2786540108478447, + "grad_norm": 1.3965966701507568, + "learning_rate": 4.101726950971645e-05, + "loss": 4.568, + "step": 46854 + }, + { + "epoch": 0.2786599581311257, + "grad_norm": 1.0902657508850098, + "learning_rate": 4.1016910869125495e-05, + "loss": 4.5636, + "step": 46855 + }, + { + "epoch": 0.2786659054144067, + "grad_norm": 1.2763757705688477, + "learning_rate": 4.10165522229432e-05, + "loss": 4.4361, + "step": 46856 + }, + { + "epoch": 0.2786718526976877, + "grad_norm": 1.5153323411941528, + "learning_rate": 4.101619357116968e-05, + "loss": 4.5287, + "step": 46857 + }, + { + "epoch": 0.2786777999809687, + "grad_norm": 1.384042739868164, + "learning_rate": 4.1015834913805084e-05, + "loss": 4.568, + "step": 46858 + }, + { + "epoch": 0.2786837472642497, + "grad_norm": 1.539786696434021, + "learning_rate": 4.101547625084953e-05, + "loss": 4.3012, + "step": 46859 + }, + { + "epoch": 0.2786896945475307, + "grad_norm": 1.5321375131607056, + "learning_rate": 4.1015117582303126e-05, + "loss": 4.5862, + "step": 46860 + }, + { + "epoch": 0.2786956418308117, + "grad_norm": 1.599363923072815, + "learning_rate": 4.1014758908166016e-05, + "loss": 4.8755, + "step": 46861 + }, + { + "epoch": 0.2787015891140927, + "grad_norm": 1.5172827243804932, + "learning_rate": 4.101440022843832e-05, + "loss": 4.7691, + "step": 46862 + }, + { + "epoch": 0.27870753639737367, + "grad_norm": 1.700488567352295, + "learning_rate": 4.101404154312015e-05, + "loss": 4.073, + "step": 46863 + }, + { + "epoch": 0.2787134836806547, + "grad_norm": 1.4918625354766846, + "learning_rate": 4.101368285221167e-05, + "loss": 4.5784, + "step": 46864 + }, + { + "epoch": 0.2787194309639357, + "grad_norm": 1.466007947921753, + "learning_rate": 4.101332415571295e-05, + "loss": 4.6633, + "step": 46865 + }, + { + "epoch": 0.27872537824721666, + "grad_norm": 1.4014698266983032, + "learning_rate": 4.1012965453624164e-05, + "loss": 4.7288, + "step": 46866 + }, + { + "epoch": 0.2787313255304977, + "grad_norm": 1.4575530290603638, + "learning_rate": 4.101260674594541e-05, + "loss": 4.4997, + "step": 46867 + }, + { + "epoch": 0.2787372728137787, + "grad_norm": 1.373281717300415, + "learning_rate": 4.1012248032676815e-05, + "loss": 4.669, + "step": 46868 + }, + { + "epoch": 0.27874322009705965, + "grad_norm": 1.651100754737854, + "learning_rate": 4.101188931381852e-05, + "loss": 4.479, + "step": 46869 + }, + { + "epoch": 0.27874916738034067, + "grad_norm": 1.434461236000061, + "learning_rate": 4.1011530589370635e-05, + "loss": 4.3968, + "step": 46870 + }, + { + "epoch": 0.2787551146636217, + "grad_norm": 1.6068624258041382, + "learning_rate": 4.101117185933329e-05, + "loss": 4.585, + "step": 46871 + }, + { + "epoch": 0.27876106194690264, + "grad_norm": 1.7588915824890137, + "learning_rate": 4.1010813123706607e-05, + "loss": 4.5092, + "step": 46872 + }, + { + "epoch": 0.27876700923018366, + "grad_norm": 1.5903866291046143, + "learning_rate": 4.101045438249072e-05, + "loss": 4.4155, + "step": 46873 + }, + { + "epoch": 0.2787729565134647, + "grad_norm": 1.659173607826233, + "learning_rate": 4.101009563568575e-05, + "loss": 4.3935, + "step": 46874 + }, + { + "epoch": 0.27877890379674564, + "grad_norm": 1.558028221130371, + "learning_rate": 4.100973688329182e-05, + "loss": 4.6897, + "step": 46875 + }, + { + "epoch": 0.27878485108002665, + "grad_norm": 1.3691447973251343, + "learning_rate": 4.100937812530905e-05, + "loss": 4.4544, + "step": 46876 + }, + { + "epoch": 0.27879079836330767, + "grad_norm": 1.5753124952316284, + "learning_rate": 4.1009019361737575e-05, + "loss": 4.449, + "step": 46877 + }, + { + "epoch": 0.27879674564658863, + "grad_norm": 1.765770673751831, + "learning_rate": 4.100866059257752e-05, + "loss": 4.3421, + "step": 46878 + }, + { + "epoch": 0.27880269292986964, + "grad_norm": 1.6804298162460327, + "learning_rate": 4.1008301817829e-05, + "loss": 4.5128, + "step": 46879 + }, + { + "epoch": 0.27880864021315066, + "grad_norm": 1.8427013158798218, + "learning_rate": 4.1007943037492153e-05, + "loss": 4.5481, + "step": 46880 + }, + { + "epoch": 0.2788145874964316, + "grad_norm": 1.7125757932662964, + "learning_rate": 4.10075842515671e-05, + "loss": 4.4742, + "step": 46881 + }, + { + "epoch": 0.27882053477971264, + "grad_norm": 1.7859337329864502, + "learning_rate": 4.1007225460053956e-05, + "loss": 4.583, + "step": 46882 + }, + { + "epoch": 0.2788264820629936, + "grad_norm": 1.7780728340148926, + "learning_rate": 4.100686666295286e-05, + "loss": 4.4127, + "step": 46883 + }, + { + "epoch": 0.2788324293462746, + "grad_norm": 1.6163463592529297, + "learning_rate": 4.1006507860263934e-05, + "loss": 4.551, + "step": 46884 + }, + { + "epoch": 0.27883837662955563, + "grad_norm": 1.6414016485214233, + "learning_rate": 4.1006149051987305e-05, + "loss": 4.3457, + "step": 46885 + }, + { + "epoch": 0.2788443239128366, + "grad_norm": 1.6058303117752075, + "learning_rate": 4.1005790238123085e-05, + "loss": 4.5666, + "step": 46886 + }, + { + "epoch": 0.2788502711961176, + "grad_norm": 1.5104053020477295, + "learning_rate": 4.100543141867142e-05, + "loss": 4.5232, + "step": 46887 + }, + { + "epoch": 0.2788562184793986, + "grad_norm": 1.5128785371780396, + "learning_rate": 4.100507259363242e-05, + "loss": 4.7614, + "step": 46888 + }, + { + "epoch": 0.2788621657626796, + "grad_norm": 1.6872987747192383, + "learning_rate": 4.100471376300621e-05, + "loss": 4.5258, + "step": 46889 + }, + { + "epoch": 0.2788681130459606, + "grad_norm": 1.6653237342834473, + "learning_rate": 4.100435492679292e-05, + "loss": 4.1085, + "step": 46890 + }, + { + "epoch": 0.2788740603292416, + "grad_norm": 1.7847630977630615, + "learning_rate": 4.100399608499268e-05, + "loss": 4.1847, + "step": 46891 + }, + { + "epoch": 0.2788800076125226, + "grad_norm": 1.5287562608718872, + "learning_rate": 4.1003637237605606e-05, + "loss": 4.2218, + "step": 46892 + }, + { + "epoch": 0.2788859548958036, + "grad_norm": 1.8888086080551147, + "learning_rate": 4.100327838463183e-05, + "loss": 3.8924, + "step": 46893 + }, + { + "epoch": 0.2788919021790846, + "grad_norm": 1.9423549175262451, + "learning_rate": 4.100291952607147e-05, + "loss": 3.9186, + "step": 46894 + }, + { + "epoch": 0.27889784946236557, + "grad_norm": 1.8974632024765015, + "learning_rate": 4.100256066192466e-05, + "loss": 3.838, + "step": 46895 + }, + { + "epoch": 0.2789037967456466, + "grad_norm": 1.7499369382858276, + "learning_rate": 4.100220179219153e-05, + "loss": 3.8686, + "step": 46896 + }, + { + "epoch": 0.2789097440289276, + "grad_norm": 1.9097923040390015, + "learning_rate": 4.100184291687218e-05, + "loss": 4.3173, + "step": 46897 + }, + { + "epoch": 0.27891569131220856, + "grad_norm": 1.7583283185958862, + "learning_rate": 4.100148403596677e-05, + "loss": 4.2349, + "step": 46898 + }, + { + "epoch": 0.2789216385954896, + "grad_norm": 1.327467679977417, + "learning_rate": 4.1001125149475394e-05, + "loss": 4.9679, + "step": 46899 + }, + { + "epoch": 0.2789275858787706, + "grad_norm": 1.506319522857666, + "learning_rate": 4.100076625739819e-05, + "loss": 4.9083, + "step": 46900 + }, + { + "epoch": 0.27893353316205155, + "grad_norm": 1.5587483644485474, + "learning_rate": 4.100040735973529e-05, + "loss": 4.4787, + "step": 46901 + }, + { + "epoch": 0.27893948044533257, + "grad_norm": 1.3197804689407349, + "learning_rate": 4.1000048456486816e-05, + "loss": 4.7644, + "step": 46902 + }, + { + "epoch": 0.2789454277286136, + "grad_norm": 1.6142008304595947, + "learning_rate": 4.099968954765289e-05, + "loss": 5.2508, + "step": 46903 + }, + { + "epoch": 0.27895137501189454, + "grad_norm": 1.4476282596588135, + "learning_rate": 4.0999330633233626e-05, + "loss": 4.9701, + "step": 46904 + }, + { + "epoch": 0.27895732229517556, + "grad_norm": 1.4201841354370117, + "learning_rate": 4.099897171322917e-05, + "loss": 4.8637, + "step": 46905 + }, + { + "epoch": 0.2789632695784566, + "grad_norm": 1.4085813760757446, + "learning_rate": 4.099861278763963e-05, + "loss": 4.9761, + "step": 46906 + }, + { + "epoch": 0.27896921686173753, + "grad_norm": 1.658010721206665, + "learning_rate": 4.099825385646515e-05, + "loss": 4.6536, + "step": 46907 + }, + { + "epoch": 0.27897516414501855, + "grad_norm": 1.438108205795288, + "learning_rate": 4.099789491970584e-05, + "loss": 4.921, + "step": 46908 + }, + { + "epoch": 0.27898111142829957, + "grad_norm": 1.3506267070770264, + "learning_rate": 4.099753597736183e-05, + "loss": 4.8103, + "step": 46909 + }, + { + "epoch": 0.2789870587115805, + "grad_norm": 1.4993277788162231, + "learning_rate": 4.099717702943325e-05, + "loss": 4.6996, + "step": 46910 + }, + { + "epoch": 0.27899300599486154, + "grad_norm": 1.556593656539917, + "learning_rate": 4.0996818075920216e-05, + "loss": 4.5825, + "step": 46911 + }, + { + "epoch": 0.27899895327814256, + "grad_norm": 1.6432417631149292, + "learning_rate": 4.099645911682286e-05, + "loss": 4.7956, + "step": 46912 + }, + { + "epoch": 0.2790049005614235, + "grad_norm": 1.6310969591140747, + "learning_rate": 4.0996100152141306e-05, + "loss": 4.7127, + "step": 46913 + }, + { + "epoch": 0.27901084784470453, + "grad_norm": 1.6721936464309692, + "learning_rate": 4.099574118187568e-05, + "loss": 3.9662, + "step": 46914 + }, + { + "epoch": 0.27901679512798555, + "grad_norm": 1.5631078481674194, + "learning_rate": 4.09953822060261e-05, + "loss": 4.7079, + "step": 46915 + }, + { + "epoch": 0.2790227424112665, + "grad_norm": 1.7846355438232422, + "learning_rate": 4.0995023224592696e-05, + "loss": 3.9805, + "step": 46916 + }, + { + "epoch": 0.2790286896945475, + "grad_norm": 1.5437078475952148, + "learning_rate": 4.099466423757561e-05, + "loss": 4.3588, + "step": 46917 + }, + { + "epoch": 0.27903463697782854, + "grad_norm": 1.4266031980514526, + "learning_rate": 4.099430524497494e-05, + "loss": 4.905, + "step": 46918 + }, + { + "epoch": 0.2790405842611095, + "grad_norm": 1.652956247329712, + "learning_rate": 4.099394624679082e-05, + "loss": 4.7951, + "step": 46919 + }, + { + "epoch": 0.2790465315443905, + "grad_norm": 1.3079768419265747, + "learning_rate": 4.099358724302339e-05, + "loss": 4.6215, + "step": 46920 + }, + { + "epoch": 0.27905247882767154, + "grad_norm": 1.6245967149734497, + "learning_rate": 4.099322823367275e-05, + "loss": 3.9286, + "step": 46921 + }, + { + "epoch": 0.2790584261109525, + "grad_norm": 2.107887029647827, + "learning_rate": 4.099286921873905e-05, + "loss": 3.8823, + "step": 46922 + }, + { + "epoch": 0.2790643733942335, + "grad_norm": 1.7577862739562988, + "learning_rate": 4.09925101982224e-05, + "loss": 4.1844, + "step": 46923 + }, + { + "epoch": 0.2790703206775145, + "grad_norm": 1.7687357664108276, + "learning_rate": 4.0992151172122925e-05, + "loss": 3.8756, + "step": 46924 + }, + { + "epoch": 0.2790762679607955, + "grad_norm": 2.185346841812134, + "learning_rate": 4.099179214044077e-05, + "loss": 3.2443, + "step": 46925 + }, + { + "epoch": 0.2790822152440765, + "grad_norm": 3.093596935272217, + "learning_rate": 4.099143310317604e-05, + "loss": 3.9481, + "step": 46926 + }, + { + "epoch": 0.2790881625273575, + "grad_norm": 2.629970073699951, + "learning_rate": 4.099107406032886e-05, + "loss": 3.6729, + "step": 46927 + }, + { + "epoch": 0.2790941098106385, + "grad_norm": 2.107973098754883, + "learning_rate": 4.0990715011899364e-05, + "loss": 3.2853, + "step": 46928 + }, + { + "epoch": 0.2791000570939195, + "grad_norm": 2.9708871841430664, + "learning_rate": 4.099035595788767e-05, + "loss": 3.581, + "step": 46929 + }, + { + "epoch": 0.2791060043772005, + "grad_norm": 2.4081954956054688, + "learning_rate": 4.098999689829391e-05, + "loss": 3.3261, + "step": 46930 + }, + { + "epoch": 0.27911195166048147, + "grad_norm": 2.1488754749298096, + "learning_rate": 4.098963783311821e-05, + "loss": 3.3937, + "step": 46931 + }, + { + "epoch": 0.2791178989437625, + "grad_norm": 2.2659666538238525, + "learning_rate": 4.098927876236069e-05, + "loss": 3.2176, + "step": 46932 + }, + { + "epoch": 0.2791238462270435, + "grad_norm": 2.672624349594116, + "learning_rate": 4.0988919686021486e-05, + "loss": 3.5603, + "step": 46933 + }, + { + "epoch": 0.27912979351032446, + "grad_norm": 1.635791540145874, + "learning_rate": 4.098856060410071e-05, + "loss": 4.825, + "step": 46934 + }, + { + "epoch": 0.2791357407936055, + "grad_norm": 2.2416622638702393, + "learning_rate": 4.098820151659849e-05, + "loss": 3.3055, + "step": 46935 + }, + { + "epoch": 0.2791416880768865, + "grad_norm": 2.091495990753174, + "learning_rate": 4.098784242351496e-05, + "loss": 3.2334, + "step": 46936 + }, + { + "epoch": 0.27914763536016746, + "grad_norm": 2.0491209030151367, + "learning_rate": 4.0987483324850234e-05, + "loss": 3.2818, + "step": 46937 + }, + { + "epoch": 0.2791535826434485, + "grad_norm": 2.001509666442871, + "learning_rate": 4.0987124220604444e-05, + "loss": 3.2563, + "step": 46938 + }, + { + "epoch": 0.2791595299267295, + "grad_norm": 2.3121628761291504, + "learning_rate": 4.0986765110777716e-05, + "loss": 3.2678, + "step": 46939 + }, + { + "epoch": 0.27916547721001045, + "grad_norm": 2.6514389514923096, + "learning_rate": 4.0986405995370166e-05, + "loss": 3.5692, + "step": 46940 + }, + { + "epoch": 0.27917142449329146, + "grad_norm": 2.4771690368652344, + "learning_rate": 4.098604687438194e-05, + "loss": 3.4564, + "step": 46941 + }, + { + "epoch": 0.2791773717765725, + "grad_norm": 2.477102518081665, + "learning_rate": 4.098568774781314e-05, + "loss": 3.6737, + "step": 46942 + }, + { + "epoch": 0.27918331905985344, + "grad_norm": 1.543883204460144, + "learning_rate": 4.098532861566391e-05, + "loss": 5.2371, + "step": 46943 + }, + { + "epoch": 0.27918926634313446, + "grad_norm": 1.7491340637207031, + "learning_rate": 4.0984969477934366e-05, + "loss": 4.7324, + "step": 46944 + }, + { + "epoch": 0.2791952136264155, + "grad_norm": 1.7541205883026123, + "learning_rate": 4.098461033462462e-05, + "loss": 4.6862, + "step": 46945 + }, + { + "epoch": 0.27920116090969643, + "grad_norm": 1.7363767623901367, + "learning_rate": 4.098425118573482e-05, + "loss": 4.8391, + "step": 46946 + }, + { + "epoch": 0.27920710819297745, + "grad_norm": 2.029042959213257, + "learning_rate": 4.0983892031265095e-05, + "loss": 4.4704, + "step": 46947 + }, + { + "epoch": 0.27921305547625846, + "grad_norm": 1.8591254949569702, + "learning_rate": 4.098353287121555e-05, + "loss": 4.56, + "step": 46948 + }, + { + "epoch": 0.2792190027595394, + "grad_norm": 2.3744425773620605, + "learning_rate": 4.098317370558631e-05, + "loss": 3.5357, + "step": 46949 + }, + { + "epoch": 0.27922495004282044, + "grad_norm": 2.245680570602417, + "learning_rate": 4.0982814534377516e-05, + "loss": 3.5625, + "step": 46950 + }, + { + "epoch": 0.27923089732610146, + "grad_norm": 2.165235757827759, + "learning_rate": 4.098245535758929e-05, + "loss": 3.4733, + "step": 46951 + }, + { + "epoch": 0.2792368446093824, + "grad_norm": 1.2038118839263916, + "learning_rate": 4.0982096175221746e-05, + "loss": 4.725, + "step": 46952 + }, + { + "epoch": 0.27924279189266343, + "grad_norm": 1.4017865657806396, + "learning_rate": 4.098173698727502e-05, + "loss": 4.7091, + "step": 46953 + }, + { + "epoch": 0.27924873917594445, + "grad_norm": 1.5981968641281128, + "learning_rate": 4.098137779374924e-05, + "loss": 4.8127, + "step": 46954 + }, + { + "epoch": 0.2792546864592254, + "grad_norm": 1.6514302492141724, + "learning_rate": 4.098101859464452e-05, + "loss": 4.3869, + "step": 46955 + }, + { + "epoch": 0.2792606337425064, + "grad_norm": 1.4147202968597412, + "learning_rate": 4.0980659389961e-05, + "loss": 4.6197, + "step": 46956 + }, + { + "epoch": 0.27926658102578744, + "grad_norm": 1.5349702835083008, + "learning_rate": 4.098030017969879e-05, + "loss": 4.7849, + "step": 46957 + }, + { + "epoch": 0.2792725283090684, + "grad_norm": 1.6076208353042603, + "learning_rate": 4.097994096385802e-05, + "loss": 4.6437, + "step": 46958 + }, + { + "epoch": 0.2792784755923494, + "grad_norm": 1.5086344480514526, + "learning_rate": 4.097958174243882e-05, + "loss": 4.8438, + "step": 46959 + }, + { + "epoch": 0.27928442287563043, + "grad_norm": 1.6116726398468018, + "learning_rate": 4.0979222515441316e-05, + "loss": 4.8073, + "step": 46960 + }, + { + "epoch": 0.2792903701589114, + "grad_norm": 1.6911540031433105, + "learning_rate": 4.097886328286563e-05, + "loss": 4.6328, + "step": 46961 + }, + { + "epoch": 0.2792963174421924, + "grad_norm": 1.958189845085144, + "learning_rate": 4.0978504044711886e-05, + "loss": 4.1596, + "step": 46962 + }, + { + "epoch": 0.2793022647254734, + "grad_norm": 1.4493284225463867, + "learning_rate": 4.0978144800980213e-05, + "loss": 4.7706, + "step": 46963 + }, + { + "epoch": 0.2793082120087544, + "grad_norm": 1.4827090501785278, + "learning_rate": 4.0977785551670734e-05, + "loss": 5.0226, + "step": 46964 + }, + { + "epoch": 0.2793141592920354, + "grad_norm": 1.4688717126846313, + "learning_rate": 4.0977426296783576e-05, + "loss": 4.9509, + "step": 46965 + }, + { + "epoch": 0.2793201065753164, + "grad_norm": 1.4776463508605957, + "learning_rate": 4.097706703631886e-05, + "loss": 5.0116, + "step": 46966 + }, + { + "epoch": 0.2793260538585974, + "grad_norm": 1.4927902221679688, + "learning_rate": 4.0976707770276715e-05, + "loss": 4.9932, + "step": 46967 + }, + { + "epoch": 0.2793320011418784, + "grad_norm": 1.4073221683502197, + "learning_rate": 4.0976348498657275e-05, + "loss": 5.0237, + "step": 46968 + }, + { + "epoch": 0.2793379484251594, + "grad_norm": 1.66119384765625, + "learning_rate": 4.0975989221460645e-05, + "loss": 4.5298, + "step": 46969 + }, + { + "epoch": 0.27934389570844037, + "grad_norm": 1.5720603466033936, + "learning_rate": 4.0975629938686975e-05, + "loss": 4.9761, + "step": 46970 + }, + { + "epoch": 0.2793498429917214, + "grad_norm": 1.4667048454284668, + "learning_rate": 4.097527065033637e-05, + "loss": 4.8662, + "step": 46971 + }, + { + "epoch": 0.2793557902750024, + "grad_norm": 1.659524917602539, + "learning_rate": 4.097491135640896e-05, + "loss": 4.6519, + "step": 46972 + }, + { + "epoch": 0.27936173755828336, + "grad_norm": 1.4765175580978394, + "learning_rate": 4.097455205690488e-05, + "loss": 4.8816, + "step": 46973 + }, + { + "epoch": 0.2793676848415644, + "grad_norm": 1.5401256084442139, + "learning_rate": 4.0974192751824245e-05, + "loss": 4.6919, + "step": 46974 + }, + { + "epoch": 0.2793736321248454, + "grad_norm": 1.7305947542190552, + "learning_rate": 4.0973833441167184e-05, + "loss": 4.3053, + "step": 46975 + }, + { + "epoch": 0.27937957940812636, + "grad_norm": 2.3940389156341553, + "learning_rate": 4.097347412493382e-05, + "loss": 3.7019, + "step": 46976 + }, + { + "epoch": 0.27938552669140737, + "grad_norm": 2.3568572998046875, + "learning_rate": 4.097311480312429e-05, + "loss": 3.8051, + "step": 46977 + }, + { + "epoch": 0.2793914739746884, + "grad_norm": 1.9515057802200317, + "learning_rate": 4.09727554757387e-05, + "loss": 4.2226, + "step": 46978 + }, + { + "epoch": 0.27939742125796935, + "grad_norm": 1.7394646406173706, + "learning_rate": 4.09723961427772e-05, + "loss": 4.2558, + "step": 46979 + }, + { + "epoch": 0.27940336854125036, + "grad_norm": 1.7899632453918457, + "learning_rate": 4.0972036804239886e-05, + "loss": 4.249, + "step": 46980 + }, + { + "epoch": 0.2794093158245314, + "grad_norm": 2.7494325637817383, + "learning_rate": 4.0971677460126915e-05, + "loss": 4.3423, + "step": 46981 + }, + { + "epoch": 0.27941526310781234, + "grad_norm": 2.651390790939331, + "learning_rate": 4.0971318110438386e-05, + "loss": 4.377, + "step": 46982 + }, + { + "epoch": 0.27942121039109336, + "grad_norm": 2.1691789627075195, + "learning_rate": 4.0970958755174436e-05, + "loss": 4.0604, + "step": 46983 + }, + { + "epoch": 0.27942715767437437, + "grad_norm": 2.059593439102173, + "learning_rate": 4.0970599394335193e-05, + "loss": 3.8601, + "step": 46984 + }, + { + "epoch": 0.27943310495765533, + "grad_norm": 2.3105475902557373, + "learning_rate": 4.097024002792077e-05, + "loss": 3.542, + "step": 46985 + }, + { + "epoch": 0.27943905224093635, + "grad_norm": 2.380540609359741, + "learning_rate": 4.096988065593132e-05, + "loss": 3.8002, + "step": 46986 + }, + { + "epoch": 0.27944499952421736, + "grad_norm": 2.053758382797241, + "learning_rate": 4.096952127836693e-05, + "loss": 3.8808, + "step": 46987 + }, + { + "epoch": 0.2794509468074983, + "grad_norm": 1.8894034624099731, + "learning_rate": 4.096916189522775e-05, + "loss": 3.9246, + "step": 46988 + }, + { + "epoch": 0.27945689409077934, + "grad_norm": 1.8721152544021606, + "learning_rate": 4.0968802506513904e-05, + "loss": 4.0663, + "step": 46989 + }, + { + "epoch": 0.27946284137406036, + "grad_norm": 2.034374475479126, + "learning_rate": 4.0968443112225506e-05, + "loss": 3.7667, + "step": 46990 + }, + { + "epoch": 0.2794687886573413, + "grad_norm": 1.3391549587249756, + "learning_rate": 4.0968083712362696e-05, + "loss": 4.558, + "step": 46991 + }, + { + "epoch": 0.27947473594062233, + "grad_norm": 1.4439138174057007, + "learning_rate": 4.09677243069256e-05, + "loss": 4.739, + "step": 46992 + }, + { + "epoch": 0.27948068322390335, + "grad_norm": 1.3751527070999146, + "learning_rate": 4.096736489591433e-05, + "loss": 4.4538, + "step": 46993 + }, + { + "epoch": 0.2794866305071843, + "grad_norm": 1.3850855827331543, + "learning_rate": 4.0967005479329016e-05, + "loss": 4.6493, + "step": 46994 + }, + { + "epoch": 0.2794925777904653, + "grad_norm": 1.4974260330200195, + "learning_rate": 4.096664605716979e-05, + "loss": 4.6353, + "step": 46995 + }, + { + "epoch": 0.27949852507374634, + "grad_norm": 1.636195182800293, + "learning_rate": 4.096628662943676e-05, + "loss": 4.8228, + "step": 46996 + }, + { + "epoch": 0.2795044723570273, + "grad_norm": 1.5374054908752441, + "learning_rate": 4.096592719613007e-05, + "loss": 4.4731, + "step": 46997 + }, + { + "epoch": 0.2795104196403083, + "grad_norm": 1.4503624439239502, + "learning_rate": 4.096556775724985e-05, + "loss": 4.724, + "step": 46998 + }, + { + "epoch": 0.2795163669235893, + "grad_norm": 1.4700664281845093, + "learning_rate": 4.096520831279621e-05, + "loss": 4.7594, + "step": 46999 + }, + { + "epoch": 0.2795223142068703, + "grad_norm": 1.5255173444747925, + "learning_rate": 4.096484886276928e-05, + "loss": 4.7129, + "step": 47000 + }, + { + "epoch": 0.2795282614901513, + "grad_norm": 1.5363537073135376, + "learning_rate": 4.096448940716918e-05, + "loss": 4.6096, + "step": 47001 + }, + { + "epoch": 0.27953420877343227, + "grad_norm": 1.6588841676712036, + "learning_rate": 4.096412994599605e-05, + "loss": 4.4701, + "step": 47002 + }, + { + "epoch": 0.2795401560567133, + "grad_norm": 1.4961185455322266, + "learning_rate": 4.096377047925001e-05, + "loss": 4.591, + "step": 47003 + }, + { + "epoch": 0.2795461033399943, + "grad_norm": 1.563030481338501, + "learning_rate": 4.0963411006931174e-05, + "loss": 4.5885, + "step": 47004 + }, + { + "epoch": 0.27955205062327526, + "grad_norm": 1.4748154878616333, + "learning_rate": 4.0963051529039676e-05, + "loss": 4.7287, + "step": 47005 + }, + { + "epoch": 0.2795579979065563, + "grad_norm": 1.5881901979446411, + "learning_rate": 4.096269204557564e-05, + "loss": 4.2973, + "step": 47006 + }, + { + "epoch": 0.2795639451898373, + "grad_norm": 1.6121301651000977, + "learning_rate": 4.096233255653921e-05, + "loss": 4.4623, + "step": 47007 + }, + { + "epoch": 0.27956989247311825, + "grad_norm": 2.1276841163635254, + "learning_rate": 4.096197306193048e-05, + "loss": 3.8873, + "step": 47008 + }, + { + "epoch": 0.27957583975639927, + "grad_norm": 1.8068395853042603, + "learning_rate": 4.096161356174959e-05, + "loss": 4.3699, + "step": 47009 + }, + { + "epoch": 0.2795817870396803, + "grad_norm": 1.6615993976593018, + "learning_rate": 4.096125405599667e-05, + "loss": 4.2425, + "step": 47010 + }, + { + "epoch": 0.27958773432296125, + "grad_norm": 1.5546536445617676, + "learning_rate": 4.096089454467184e-05, + "loss": 4.1474, + "step": 47011 + }, + { + "epoch": 0.27959368160624226, + "grad_norm": 1.5724902153015137, + "learning_rate": 4.0960535027775224e-05, + "loss": 4.268, + "step": 47012 + }, + { + "epoch": 0.2795996288895233, + "grad_norm": 1.7465388774871826, + "learning_rate": 4.096017550530695e-05, + "loss": 3.4931, + "step": 47013 + }, + { + "epoch": 0.27960557617280424, + "grad_norm": 2.3783023357391357, + "learning_rate": 4.0959815977267144e-05, + "loss": 3.5801, + "step": 47014 + }, + { + "epoch": 0.27961152345608525, + "grad_norm": 1.8406641483306885, + "learning_rate": 4.0959456443655935e-05, + "loss": 4.1734, + "step": 47015 + }, + { + "epoch": 0.27961747073936627, + "grad_norm": 1.8811932802200317, + "learning_rate": 4.0959096904473444e-05, + "loss": 4.3659, + "step": 47016 + }, + { + "epoch": 0.27962341802264723, + "grad_norm": 2.026712417602539, + "learning_rate": 4.0958737359719794e-05, + "loss": 3.511, + "step": 47017 + }, + { + "epoch": 0.27962936530592825, + "grad_norm": 2.1268699169158936, + "learning_rate": 4.095837780939512e-05, + "loss": 3.4514, + "step": 47018 + }, + { + "epoch": 0.27963531258920926, + "grad_norm": 2.224076986312866, + "learning_rate": 4.095801825349953e-05, + "loss": 3.3973, + "step": 47019 + }, + { + "epoch": 0.2796412598724902, + "grad_norm": 1.7797030210494995, + "learning_rate": 4.0957658692033176e-05, + "loss": 4.0971, + "step": 47020 + }, + { + "epoch": 0.27964720715577124, + "grad_norm": 2.0381011962890625, + "learning_rate": 4.095729912499615e-05, + "loss": 2.2329, + "step": 47021 + }, + { + "epoch": 0.27965315443905225, + "grad_norm": 1.6463102102279663, + "learning_rate": 4.095693955238861e-05, + "loss": 4.6089, + "step": 47022 + }, + { + "epoch": 0.2796591017223332, + "grad_norm": 1.6098077297210693, + "learning_rate": 4.0956579974210664e-05, + "loss": 5.0333, + "step": 47023 + }, + { + "epoch": 0.27966504900561423, + "grad_norm": 1.5143425464630127, + "learning_rate": 4.095622039046244e-05, + "loss": 5.1585, + "step": 47024 + }, + { + "epoch": 0.27967099628889525, + "grad_norm": 1.4819917678833008, + "learning_rate": 4.0955860801144066e-05, + "loss": 4.7739, + "step": 47025 + }, + { + "epoch": 0.2796769435721762, + "grad_norm": 1.7616368532180786, + "learning_rate": 4.0955501206255654e-05, + "loss": 4.6839, + "step": 47026 + }, + { + "epoch": 0.2796828908554572, + "grad_norm": 2.3574438095092773, + "learning_rate": 4.0955141605797356e-05, + "loss": 4.3207, + "step": 47027 + }, + { + "epoch": 0.27968883813873824, + "grad_norm": 2.450086832046509, + "learning_rate": 4.0954781999769275e-05, + "loss": 2.9202, + "step": 47028 + }, + { + "epoch": 0.2796947854220192, + "grad_norm": 1.4928295612335205, + "learning_rate": 4.0954422388171546e-05, + "loss": 4.6927, + "step": 47029 + }, + { + "epoch": 0.2797007327053002, + "grad_norm": 1.410620927810669, + "learning_rate": 4.09540627710043e-05, + "loss": 5.0977, + "step": 47030 + }, + { + "epoch": 0.27970667998858123, + "grad_norm": 1.7527761459350586, + "learning_rate": 4.0953703148267644e-05, + "loss": 4.3486, + "step": 47031 + }, + { + "epoch": 0.2797126272718622, + "grad_norm": 1.7745978832244873, + "learning_rate": 4.0953343519961716e-05, + "loss": 2.8685, + "step": 47032 + }, + { + "epoch": 0.2797185745551432, + "grad_norm": 1.931699275970459, + "learning_rate": 4.095298388608665e-05, + "loss": 0.5271, + "step": 47033 + }, + { + "epoch": 0.2797245218384242, + "grad_norm": 1.5305982828140259, + "learning_rate": 4.0952624246642554e-05, + "loss": 4.5302, + "step": 47034 + }, + { + "epoch": 0.2797304691217052, + "grad_norm": 1.4441879987716675, + "learning_rate": 4.0952264601629565e-05, + "loss": 5.0693, + "step": 47035 + }, + { + "epoch": 0.2797364164049862, + "grad_norm": 1.4131189584732056, + "learning_rate": 4.0951904951047815e-05, + "loss": 5.0754, + "step": 47036 + }, + { + "epoch": 0.2797423636882672, + "grad_norm": 1.6463286876678467, + "learning_rate": 4.0951545294897406e-05, + "loss": 5.1396, + "step": 47037 + }, + { + "epoch": 0.2797483109715482, + "grad_norm": 2.1084163188934326, + "learning_rate": 4.095118563317848e-05, + "loss": 3.3872, + "step": 47038 + }, + { + "epoch": 0.2797542582548292, + "grad_norm": 2.406245708465576, + "learning_rate": 4.095082596589116e-05, + "loss": 2.8005, + "step": 47039 + }, + { + "epoch": 0.2797602055381102, + "grad_norm": 2.8497884273529053, + "learning_rate": 4.0950466293035575e-05, + "loss": 2.8879, + "step": 47040 + }, + { + "epoch": 0.27976615282139117, + "grad_norm": 2.4831082820892334, + "learning_rate": 4.0950106614611846e-05, + "loss": 2.8488, + "step": 47041 + }, + { + "epoch": 0.2797721001046722, + "grad_norm": 2.505443572998047, + "learning_rate": 4.0949746930620094e-05, + "loss": 3.0446, + "step": 47042 + }, + { + "epoch": 0.2797780473879532, + "grad_norm": 2.558229923248291, + "learning_rate": 4.0949387241060456e-05, + "loss": 3.4369, + "step": 47043 + }, + { + "epoch": 0.27978399467123416, + "grad_norm": 2.6752004623413086, + "learning_rate": 4.094902754593305e-05, + "loss": 3.6969, + "step": 47044 + }, + { + "epoch": 0.2797899419545152, + "grad_norm": 2.400135040283203, + "learning_rate": 4.094866784523801e-05, + "loss": 2.784, + "step": 47045 + }, + { + "epoch": 0.2797958892377962, + "grad_norm": 2.288853406906128, + "learning_rate": 4.0948308138975444e-05, + "loss": 3.0414, + "step": 47046 + }, + { + "epoch": 0.27980183652107715, + "grad_norm": 2.9992876052856445, + "learning_rate": 4.094794842714549e-05, + "loss": 1.8184, + "step": 47047 + }, + { + "epoch": 0.27980778380435817, + "grad_norm": 3.615018367767334, + "learning_rate": 4.0947588709748274e-05, + "loss": 1.9284, + "step": 47048 + }, + { + "epoch": 0.2798137310876392, + "grad_norm": 2.782905340194702, + "learning_rate": 4.094722898678393e-05, + "loss": 2.4755, + "step": 47049 + }, + { + "epoch": 0.27981967837092014, + "grad_norm": 2.590823173522949, + "learning_rate": 4.0946869258252554e-05, + "loss": 2.7456, + "step": 47050 + }, + { + "epoch": 0.27982562565420116, + "grad_norm": 2.5666351318359375, + "learning_rate": 4.0946509524154304e-05, + "loss": 2.7717, + "step": 47051 + }, + { + "epoch": 0.2798315729374822, + "grad_norm": 2.330162763595581, + "learning_rate": 4.0946149784489286e-05, + "loss": 2.8862, + "step": 47052 + }, + { + "epoch": 0.27983752022076314, + "grad_norm": 2.710801601409912, + "learning_rate": 4.0945790039257635e-05, + "loss": 2.9783, + "step": 47053 + }, + { + "epoch": 0.27984346750404415, + "grad_norm": 2.3664517402648926, + "learning_rate": 4.0945430288459474e-05, + "loss": 2.9638, + "step": 47054 + }, + { + "epoch": 0.27984941478732517, + "grad_norm": 2.666433334350586, + "learning_rate": 4.094507053209492e-05, + "loss": 2.9044, + "step": 47055 + }, + { + "epoch": 0.27985536207060613, + "grad_norm": 2.7155919075012207, + "learning_rate": 4.094471077016412e-05, + "loss": 3.0113, + "step": 47056 + }, + { + "epoch": 0.27986130935388714, + "grad_norm": 2.5670907497406006, + "learning_rate": 4.094435100266718e-05, + "loss": 3.2548, + "step": 47057 + }, + { + "epoch": 0.27986725663716816, + "grad_norm": 1.5219577550888062, + "learning_rate": 4.094399122960423e-05, + "loss": 4.8385, + "step": 47058 + }, + { + "epoch": 0.2798732039204491, + "grad_norm": 2.442986011505127, + "learning_rate": 4.0943631450975395e-05, + "loss": 3.1177, + "step": 47059 + }, + { + "epoch": 0.27987915120373014, + "grad_norm": 2.4758689403533936, + "learning_rate": 4.094327166678081e-05, + "loss": 2.7948, + "step": 47060 + }, + { + "epoch": 0.27988509848701115, + "grad_norm": 2.346963405609131, + "learning_rate": 4.0942911877020584e-05, + "loss": 2.7141, + "step": 47061 + }, + { + "epoch": 0.2798910457702921, + "grad_norm": 2.4696412086486816, + "learning_rate": 4.094255208169486e-05, + "loss": 2.6183, + "step": 47062 + }, + { + "epoch": 0.27989699305357313, + "grad_norm": 2.2091565132141113, + "learning_rate": 4.094219228080376e-05, + "loss": 2.688, + "step": 47063 + }, + { + "epoch": 0.27990294033685414, + "grad_norm": 2.2635304927825928, + "learning_rate": 4.09418324743474e-05, + "loss": 2.3725, + "step": 47064 + }, + { + "epoch": 0.2799088876201351, + "grad_norm": 2.400533437728882, + "learning_rate": 4.0941472662325905e-05, + "loss": 2.8192, + "step": 47065 + }, + { + "epoch": 0.2799148349034161, + "grad_norm": 2.819647789001465, + "learning_rate": 4.094111284473942e-05, + "loss": 2.9056, + "step": 47066 + }, + { + "epoch": 0.27992078218669714, + "grad_norm": 2.334005117416382, + "learning_rate": 4.094075302158804e-05, + "loss": 2.8024, + "step": 47067 + }, + { + "epoch": 0.2799267294699781, + "grad_norm": 2.788625478744507, + "learning_rate": 4.0940393192871915e-05, + "loss": 2.9123, + "step": 47068 + }, + { + "epoch": 0.2799326767532591, + "grad_norm": 2.4748618602752686, + "learning_rate": 4.094003335859117e-05, + "loss": 2.7776, + "step": 47069 + }, + { + "epoch": 0.27993862403654013, + "grad_norm": 2.2867391109466553, + "learning_rate": 4.093967351874591e-05, + "loss": 2.4578, + "step": 47070 + }, + { + "epoch": 0.2799445713198211, + "grad_norm": 2.3079612255096436, + "learning_rate": 4.093931367333629e-05, + "loss": 2.5389, + "step": 47071 + }, + { + "epoch": 0.2799505186031021, + "grad_norm": 2.7360622882843018, + "learning_rate": 4.0938953822362415e-05, + "loss": 2.7295, + "step": 47072 + }, + { + "epoch": 0.2799564658863831, + "grad_norm": 3.4087252616882324, + "learning_rate": 4.093859396582442e-05, + "loss": 3.4321, + "step": 47073 + }, + { + "epoch": 0.2799624131696641, + "grad_norm": 2.8043830394744873, + "learning_rate": 4.093823410372242e-05, + "loss": 3.3711, + "step": 47074 + }, + { + "epoch": 0.2799683604529451, + "grad_norm": 2.436272144317627, + "learning_rate": 4.093787423605655e-05, + "loss": 2.6076, + "step": 47075 + }, + { + "epoch": 0.2799743077362261, + "grad_norm": 2.440593957901001, + "learning_rate": 4.0937514362826926e-05, + "loss": 2.7857, + "step": 47076 + }, + { + "epoch": 0.2799802550195071, + "grad_norm": 2.3633320331573486, + "learning_rate": 4.093715448403369e-05, + "loss": 2.8809, + "step": 47077 + }, + { + "epoch": 0.2799862023027881, + "grad_norm": 2.24988055229187, + "learning_rate": 4.093679459967695e-05, + "loss": 3.3788, + "step": 47078 + }, + { + "epoch": 0.2799921495860691, + "grad_norm": 1.8773479461669922, + "learning_rate": 4.093643470975684e-05, + "loss": 4.5561, + "step": 47079 + }, + { + "epoch": 0.27999809686935007, + "grad_norm": 1.8971589803695679, + "learning_rate": 4.093607481427349e-05, + "loss": 4.5665, + "step": 47080 + }, + { + "epoch": 0.2800040441526311, + "grad_norm": 1.7093905210494995, + "learning_rate": 4.093571491322703e-05, + "loss": 4.9552, + "step": 47081 + }, + { + "epoch": 0.2800099914359121, + "grad_norm": 2.1023242473602295, + "learning_rate": 4.093535500661756e-05, + "loss": 3.801, + "step": 47082 + }, + { + "epoch": 0.28001593871919306, + "grad_norm": 2.801088571548462, + "learning_rate": 4.093499509444523e-05, + "loss": 3.2464, + "step": 47083 + }, + { + "epoch": 0.2800218860024741, + "grad_norm": 2.9708735942840576, + "learning_rate": 4.0934635176710157e-05, + "loss": 4.0397, + "step": 47084 + }, + { + "epoch": 0.2800278332857551, + "grad_norm": 3.2219042778015137, + "learning_rate": 4.093427525341247e-05, + "loss": 4.5698, + "step": 47085 + }, + { + "epoch": 0.28003378056903605, + "grad_norm": 1.86604905128479, + "learning_rate": 4.093391532455229e-05, + "loss": 4.5217, + "step": 47086 + }, + { + "epoch": 0.28003972785231707, + "grad_norm": 1.358663558959961, + "learning_rate": 4.093355539012974e-05, + "loss": 4.2475, + "step": 47087 + }, + { + "epoch": 0.2800456751355981, + "grad_norm": 1.582900047302246, + "learning_rate": 4.0933195450144956e-05, + "loss": 4.5109, + "step": 47088 + }, + { + "epoch": 0.28005162241887904, + "grad_norm": 1.6252461671829224, + "learning_rate": 4.093283550459806e-05, + "loss": 5.1559, + "step": 47089 + }, + { + "epoch": 0.28005756970216006, + "grad_norm": 1.6687095165252686, + "learning_rate": 4.0932475553489166e-05, + "loss": 4.7672, + "step": 47090 + }, + { + "epoch": 0.2800635169854411, + "grad_norm": 1.9017409086227417, + "learning_rate": 4.0932115596818413e-05, + "loss": 4.8578, + "step": 47091 + }, + { + "epoch": 0.28006946426872203, + "grad_norm": 1.6521977186203003, + "learning_rate": 4.093175563458593e-05, + "loss": 4.8118, + "step": 47092 + }, + { + "epoch": 0.28007541155200305, + "grad_norm": 1.5884202718734741, + "learning_rate": 4.0931395666791836e-05, + "loss": 4.8467, + "step": 47093 + }, + { + "epoch": 0.28008135883528407, + "grad_norm": 1.6141372919082642, + "learning_rate": 4.093103569343625e-05, + "loss": 4.6768, + "step": 47094 + }, + { + "epoch": 0.280087306118565, + "grad_norm": 1.3681541681289673, + "learning_rate": 4.09306757145193e-05, + "loss": 4.6133, + "step": 47095 + }, + { + "epoch": 0.28009325340184604, + "grad_norm": 1.725956916809082, + "learning_rate": 4.093031573004113e-05, + "loss": 4.8298, + "step": 47096 + }, + { + "epoch": 0.28009920068512706, + "grad_norm": 1.767429232597351, + "learning_rate": 4.092995574000184e-05, + "loss": 4.7436, + "step": 47097 + }, + { + "epoch": 0.280105147968408, + "grad_norm": 1.4766074419021606, + "learning_rate": 4.092959574440156e-05, + "loss": 4.6955, + "step": 47098 + }, + { + "epoch": 0.28011109525168904, + "grad_norm": 1.3639140129089355, + "learning_rate": 4.092923574324044e-05, + "loss": 4.6979, + "step": 47099 + }, + { + "epoch": 0.28011704253497005, + "grad_norm": 1.6338871717453003, + "learning_rate": 4.092887573651858e-05, + "loss": 4.5414, + "step": 47100 + }, + { + "epoch": 0.280122989818251, + "grad_norm": 2.3066577911376953, + "learning_rate": 4.0928515724236115e-05, + "loss": 4.634, + "step": 47101 + }, + { + "epoch": 0.280128937101532, + "grad_norm": 2.780658483505249, + "learning_rate": 4.092815570639317e-05, + "loss": 4.1517, + "step": 47102 + }, + { + "epoch": 0.28013488438481304, + "grad_norm": 1.6032252311706543, + "learning_rate": 4.092779568298987e-05, + "loss": 4.7893, + "step": 47103 + }, + { + "epoch": 0.280140831668094, + "grad_norm": 1.5584734678268433, + "learning_rate": 4.092743565402634e-05, + "loss": 4.8607, + "step": 47104 + }, + { + "epoch": 0.280146778951375, + "grad_norm": 1.5080952644348145, + "learning_rate": 4.092707561950271e-05, + "loss": 4.7183, + "step": 47105 + }, + { + "epoch": 0.28015272623465604, + "grad_norm": 1.3022301197052002, + "learning_rate": 4.09267155794191e-05, + "loss": 4.7612, + "step": 47106 + }, + { + "epoch": 0.280158673517937, + "grad_norm": 1.841722846031189, + "learning_rate": 4.092635553377564e-05, + "loss": 4.3655, + "step": 47107 + }, + { + "epoch": 0.280164620801218, + "grad_norm": 1.3172446489334106, + "learning_rate": 4.092599548257245e-05, + "loss": 4.6127, + "step": 47108 + }, + { + "epoch": 0.280170568084499, + "grad_norm": 1.4613982439041138, + "learning_rate": 4.092563542580966e-05, + "loss": 4.7095, + "step": 47109 + }, + { + "epoch": 0.28017651536778, + "grad_norm": 1.2644788026809692, + "learning_rate": 4.0925275363487395e-05, + "loss": 4.8813, + "step": 47110 + }, + { + "epoch": 0.280182462651061, + "grad_norm": 1.3484947681427002, + "learning_rate": 4.092491529560579e-05, + "loss": 4.6747, + "step": 47111 + }, + { + "epoch": 0.280188409934342, + "grad_norm": 1.421587347984314, + "learning_rate": 4.092455522216495e-05, + "loss": 4.6264, + "step": 47112 + }, + { + "epoch": 0.280194357217623, + "grad_norm": 1.3657293319702148, + "learning_rate": 4.092419514316502e-05, + "loss": 4.8309, + "step": 47113 + }, + { + "epoch": 0.280200304500904, + "grad_norm": 1.1225109100341797, + "learning_rate": 4.092383505860611e-05, + "loss": 4.8122, + "step": 47114 + }, + { + "epoch": 0.280206251784185, + "grad_norm": 1.2942579984664917, + "learning_rate": 4.092347496848836e-05, + "loss": 4.8291, + "step": 47115 + }, + { + "epoch": 0.280212199067466, + "grad_norm": 1.5817344188690186, + "learning_rate": 4.0923114872811886e-05, + "loss": 4.8836, + "step": 47116 + }, + { + "epoch": 0.280218146350747, + "grad_norm": 1.4587318897247314, + "learning_rate": 4.092275477157682e-05, + "loss": 4.7318, + "step": 47117 + }, + { + "epoch": 0.28022409363402795, + "grad_norm": 1.4965972900390625, + "learning_rate": 4.092239466478329e-05, + "loss": 4.6132, + "step": 47118 + }, + { + "epoch": 0.28023004091730896, + "grad_norm": 1.4989173412322998, + "learning_rate": 4.092203455243141e-05, + "loss": 4.4713, + "step": 47119 + }, + { + "epoch": 0.28023598820059, + "grad_norm": 1.6698880195617676, + "learning_rate": 4.092167443452131e-05, + "loss": 4.7726, + "step": 47120 + }, + { + "epoch": 0.28024193548387094, + "grad_norm": 1.521082878112793, + "learning_rate": 4.092131431105312e-05, + "loss": 4.6027, + "step": 47121 + }, + { + "epoch": 0.28024788276715196, + "grad_norm": 1.288023591041565, + "learning_rate": 4.0920954182026965e-05, + "loss": 4.6906, + "step": 47122 + }, + { + "epoch": 0.280253830050433, + "grad_norm": 1.416822075843811, + "learning_rate": 4.092059404744297e-05, + "loss": 4.4814, + "step": 47123 + }, + { + "epoch": 0.28025977733371393, + "grad_norm": 1.4182547330856323, + "learning_rate": 4.092023390730125e-05, + "loss": 4.8374, + "step": 47124 + }, + { + "epoch": 0.28026572461699495, + "grad_norm": 1.6556140184402466, + "learning_rate": 4.091987376160196e-05, + "loss": 4.8312, + "step": 47125 + }, + { + "epoch": 0.28027167190027596, + "grad_norm": 1.717768669128418, + "learning_rate": 4.091951361034519e-05, + "loss": 4.9995, + "step": 47126 + }, + { + "epoch": 0.2802776191835569, + "grad_norm": 1.4626914262771606, + "learning_rate": 4.091915345353109e-05, + "loss": 4.6925, + "step": 47127 + }, + { + "epoch": 0.28028356646683794, + "grad_norm": 1.5747040510177612, + "learning_rate": 4.091879329115978e-05, + "loss": 4.8133, + "step": 47128 + }, + { + "epoch": 0.28028951375011896, + "grad_norm": 1.4351204633712769, + "learning_rate": 4.091843312323138e-05, + "loss": 4.3977, + "step": 47129 + }, + { + "epoch": 0.2802954610333999, + "grad_norm": 1.4422274827957153, + "learning_rate": 4.091807294974602e-05, + "loss": 4.312, + "step": 47130 + }, + { + "epoch": 0.28030140831668093, + "grad_norm": 1.8385319709777832, + "learning_rate": 4.091771277070382e-05, + "loss": 4.5441, + "step": 47131 + }, + { + "epoch": 0.28030735559996195, + "grad_norm": 3.168699264526367, + "learning_rate": 4.091735258610492e-05, + "loss": 3.7055, + "step": 47132 + }, + { + "epoch": 0.2803133028832429, + "grad_norm": 4.521334648132324, + "learning_rate": 4.091699239594943e-05, + "loss": 3.2126, + "step": 47133 + }, + { + "epoch": 0.2803192501665239, + "grad_norm": 2.005889892578125, + "learning_rate": 4.0916632200237486e-05, + "loss": 4.6559, + "step": 47134 + }, + { + "epoch": 0.28032519744980494, + "grad_norm": 1.4916445016860962, + "learning_rate": 4.0916271998969205e-05, + "loss": 4.8925, + "step": 47135 + }, + { + "epoch": 0.2803311447330859, + "grad_norm": 1.6219265460968018, + "learning_rate": 4.091591179214472e-05, + "loss": 4.7638, + "step": 47136 + }, + { + "epoch": 0.2803370920163669, + "grad_norm": 1.6800804138183594, + "learning_rate": 4.091555157976417e-05, + "loss": 4.5196, + "step": 47137 + }, + { + "epoch": 0.28034303929964793, + "grad_norm": 1.3677103519439697, + "learning_rate": 4.091519136182764e-05, + "loss": 5.0572, + "step": 47138 + }, + { + "epoch": 0.2803489865829289, + "grad_norm": 1.6278204917907715, + "learning_rate": 4.0914831138335295e-05, + "loss": 4.5901, + "step": 47139 + }, + { + "epoch": 0.2803549338662099, + "grad_norm": 2.2075212001800537, + "learning_rate": 4.091447090928725e-05, + "loss": 3.6135, + "step": 47140 + }, + { + "epoch": 0.2803608811494909, + "grad_norm": 2.71620774269104, + "learning_rate": 4.091411067468361e-05, + "loss": 3.0272, + "step": 47141 + }, + { + "epoch": 0.2803668284327719, + "grad_norm": 1.504044771194458, + "learning_rate": 4.091375043452453e-05, + "loss": 4.6574, + "step": 47142 + }, + { + "epoch": 0.2803727757160529, + "grad_norm": 1.7621452808380127, + "learning_rate": 4.091339018881013e-05, + "loss": 4.3861, + "step": 47143 + }, + { + "epoch": 0.2803787229993339, + "grad_norm": 1.8085664510726929, + "learning_rate": 4.0913029937540524e-05, + "loss": 4.8827, + "step": 47144 + }, + { + "epoch": 0.2803846702826149, + "grad_norm": 1.5987364053726196, + "learning_rate": 4.091266968071584e-05, + "loss": 4.853, + "step": 47145 + }, + { + "epoch": 0.2803906175658959, + "grad_norm": 1.2195876836776733, + "learning_rate": 4.091230941833622e-05, + "loss": 5.1469, + "step": 47146 + }, + { + "epoch": 0.2803965648491769, + "grad_norm": 1.5924108028411865, + "learning_rate": 4.091194915040177e-05, + "loss": 4.7343, + "step": 47147 + }, + { + "epoch": 0.28040251213245787, + "grad_norm": 1.6297718286514282, + "learning_rate": 4.091158887691262e-05, + "loss": 4.5428, + "step": 47148 + }, + { + "epoch": 0.2804084594157389, + "grad_norm": 1.759567379951477, + "learning_rate": 4.09112285978689e-05, + "loss": 4.3838, + "step": 47149 + }, + { + "epoch": 0.2804144066990199, + "grad_norm": 1.8485347032546997, + "learning_rate": 4.091086831327073e-05, + "loss": 4.266, + "step": 47150 + }, + { + "epoch": 0.28042035398230086, + "grad_norm": 1.895914912223816, + "learning_rate": 4.091050802311825e-05, + "loss": 4.3148, + "step": 47151 + }, + { + "epoch": 0.2804263012655819, + "grad_norm": 1.518804669380188, + "learning_rate": 4.0910147727411575e-05, + "loss": 4.3815, + "step": 47152 + }, + { + "epoch": 0.2804322485488629, + "grad_norm": 1.481026291847229, + "learning_rate": 4.0909787426150824e-05, + "loss": 5.0062, + "step": 47153 + }, + { + "epoch": 0.28043819583214386, + "grad_norm": 1.874463677406311, + "learning_rate": 4.090942711933613e-05, + "loss": 4.0014, + "step": 47154 + }, + { + "epoch": 0.28044414311542487, + "grad_norm": 1.809640645980835, + "learning_rate": 4.0909066806967636e-05, + "loss": 4.0473, + "step": 47155 + }, + { + "epoch": 0.2804500903987059, + "grad_norm": 1.5471316576004028, + "learning_rate": 4.0908706489045436e-05, + "loss": 4.3759, + "step": 47156 + }, + { + "epoch": 0.28045603768198685, + "grad_norm": 1.386448860168457, + "learning_rate": 4.090834616556967e-05, + "loss": 4.8024, + "step": 47157 + }, + { + "epoch": 0.28046198496526786, + "grad_norm": 1.5640754699707031, + "learning_rate": 4.090798583654047e-05, + "loss": 5.2962, + "step": 47158 + }, + { + "epoch": 0.2804679322485489, + "grad_norm": 1.6747843027114868, + "learning_rate": 4.0907625501957955e-05, + "loss": 4.3052, + "step": 47159 + }, + { + "epoch": 0.28047387953182984, + "grad_norm": 2.2154417037963867, + "learning_rate": 4.090726516182225e-05, + "loss": 3.9769, + "step": 47160 + }, + { + "epoch": 0.28047982681511086, + "grad_norm": 1.3183492422103882, + "learning_rate": 4.090690481613349e-05, + "loss": 5.1721, + "step": 47161 + }, + { + "epoch": 0.28048577409839187, + "grad_norm": 1.44633948802948, + "learning_rate": 4.090654446489179e-05, + "loss": 5.4697, + "step": 47162 + }, + { + "epoch": 0.28049172138167283, + "grad_norm": 1.6074961423873901, + "learning_rate": 4.090618410809728e-05, + "loss": 4.8947, + "step": 47163 + }, + { + "epoch": 0.28049766866495385, + "grad_norm": 1.4763460159301758, + "learning_rate": 4.090582374575008e-05, + "loss": 4.7755, + "step": 47164 + }, + { + "epoch": 0.28050361594823486, + "grad_norm": 1.5637197494506836, + "learning_rate": 4.090546337785033e-05, + "loss": 4.6849, + "step": 47165 + }, + { + "epoch": 0.2805095632315158, + "grad_norm": 1.450317621231079, + "learning_rate": 4.0905103004398145e-05, + "loss": 4.7924, + "step": 47166 + }, + { + "epoch": 0.28051551051479684, + "grad_norm": 1.5182768106460571, + "learning_rate": 4.090474262539365e-05, + "loss": 4.8985, + "step": 47167 + }, + { + "epoch": 0.28052145779807786, + "grad_norm": 1.7445292472839355, + "learning_rate": 4.090438224083698e-05, + "loss": 4.8979, + "step": 47168 + }, + { + "epoch": 0.2805274050813588, + "grad_norm": 1.7012017965316772, + "learning_rate": 4.0904021850728245e-05, + "loss": 4.7441, + "step": 47169 + }, + { + "epoch": 0.28053335236463983, + "grad_norm": 1.7025765180587769, + "learning_rate": 4.090366145506758e-05, + "loss": 4.7371, + "step": 47170 + }, + { + "epoch": 0.28053929964792085, + "grad_norm": 1.5595024824142456, + "learning_rate": 4.0903301053855115e-05, + "loss": 4.5501, + "step": 47171 + }, + { + "epoch": 0.2805452469312018, + "grad_norm": 1.6307015419006348, + "learning_rate": 4.090294064709097e-05, + "loss": 4.368, + "step": 47172 + }, + { + "epoch": 0.2805511942144828, + "grad_norm": 1.6086517572402954, + "learning_rate": 4.0902580234775275e-05, + "loss": 4.7455, + "step": 47173 + }, + { + "epoch": 0.28055714149776384, + "grad_norm": 1.6888082027435303, + "learning_rate": 4.090221981690816e-05, + "loss": 4.3844, + "step": 47174 + }, + { + "epoch": 0.2805630887810448, + "grad_norm": 1.6975167989730835, + "learning_rate": 4.090185939348974e-05, + "loss": 4.4723, + "step": 47175 + }, + { + "epoch": 0.2805690360643258, + "grad_norm": 1.6390060186386108, + "learning_rate": 4.090149896452014e-05, + "loss": 4.4825, + "step": 47176 + }, + { + "epoch": 0.28057498334760683, + "grad_norm": 3.0753748416900635, + "learning_rate": 4.09011385299995e-05, + "loss": 3.1531, + "step": 47177 + }, + { + "epoch": 0.2805809306308878, + "grad_norm": 3.0612261295318604, + "learning_rate": 4.090077808992792e-05, + "loss": 2.6555, + "step": 47178 + }, + { + "epoch": 0.2805868779141688, + "grad_norm": 2.9355907440185547, + "learning_rate": 4.090041764430556e-05, + "loss": 2.4773, + "step": 47179 + }, + { + "epoch": 0.2805928251974498, + "grad_norm": 1.3122127056121826, + "learning_rate": 4.090005719313252e-05, + "loss": 4.8975, + "step": 47180 + }, + { + "epoch": 0.2805987724807308, + "grad_norm": 1.4505960941314697, + "learning_rate": 4.089969673640893e-05, + "loss": 4.841, + "step": 47181 + }, + { + "epoch": 0.2806047197640118, + "grad_norm": 2.2947399616241455, + "learning_rate": 4.089933627413494e-05, + "loss": 4.284, + "step": 47182 + }, + { + "epoch": 0.2806106670472928, + "grad_norm": 1.8926546573638916, + "learning_rate": 4.089897580631063e-05, + "loss": 4.4015, + "step": 47183 + }, + { + "epoch": 0.2806166143305738, + "grad_norm": 2.6698930263519287, + "learning_rate": 4.089861533293617e-05, + "loss": 2.1238, + "step": 47184 + }, + { + "epoch": 0.2806225616138548, + "grad_norm": 2.7950007915496826, + "learning_rate": 4.089825485401167e-05, + "loss": 1.8264, + "step": 47185 + }, + { + "epoch": 0.2806285088971358, + "grad_norm": 2.544133186340332, + "learning_rate": 4.0897894369537236e-05, + "loss": 2.6644, + "step": 47186 + }, + { + "epoch": 0.28063445618041677, + "grad_norm": 2.593374729156494, + "learning_rate": 4.089753387951302e-05, + "loss": 2.6336, + "step": 47187 + }, + { + "epoch": 0.2806404034636978, + "grad_norm": 2.7296299934387207, + "learning_rate": 4.089717338393914e-05, + "loss": 2.584, + "step": 47188 + }, + { + "epoch": 0.2806463507469788, + "grad_norm": 2.200904607772827, + "learning_rate": 4.089681288281572e-05, + "loss": 3.5759, + "step": 47189 + }, + { + "epoch": 0.28065229803025976, + "grad_norm": 2.2658698558807373, + "learning_rate": 4.089645237614289e-05, + "loss": 3.8239, + "step": 47190 + }, + { + "epoch": 0.2806582453135408, + "grad_norm": 2.2615089416503906, + "learning_rate": 4.089609186392077e-05, + "loss": 3.5759, + "step": 47191 + }, + { + "epoch": 0.2806641925968218, + "grad_norm": 2.187340259552002, + "learning_rate": 4.089573134614949e-05, + "loss": 3.6576, + "step": 47192 + }, + { + "epoch": 0.28067013988010275, + "grad_norm": 2.025587320327759, + "learning_rate": 4.089537082282917e-05, + "loss": 3.5265, + "step": 47193 + }, + { + "epoch": 0.28067608716338377, + "grad_norm": 1.999898076057434, + "learning_rate": 4.0895010293959946e-05, + "loss": 3.8844, + "step": 47194 + }, + { + "epoch": 0.2806820344466648, + "grad_norm": 1.922324299812317, + "learning_rate": 4.089464975954194e-05, + "loss": 3.7362, + "step": 47195 + }, + { + "epoch": 0.28068798172994575, + "grad_norm": 1.9815521240234375, + "learning_rate": 4.089428921957527e-05, + "loss": 3.7121, + "step": 47196 + }, + { + "epoch": 0.28069392901322676, + "grad_norm": 2.046726703643799, + "learning_rate": 4.0893928674060066e-05, + "loss": 3.6309, + "step": 47197 + }, + { + "epoch": 0.2806998762965078, + "grad_norm": 2.016411304473877, + "learning_rate": 4.0893568122996465e-05, + "loss": 3.6139, + "step": 47198 + }, + { + "epoch": 0.28070582357978874, + "grad_norm": 2.3060572147369385, + "learning_rate": 4.0893207566384575e-05, + "loss": 3.7246, + "step": 47199 + }, + { + "epoch": 0.28071177086306975, + "grad_norm": 1.9322094917297363, + "learning_rate": 4.089284700422453e-05, + "loss": 3.8935, + "step": 47200 + }, + { + "epoch": 0.28071771814635077, + "grad_norm": 1.8403582572937012, + "learning_rate": 4.089248643651646e-05, + "loss": 4.5373, + "step": 47201 + }, + { + "epoch": 0.28072366542963173, + "grad_norm": 1.694286823272705, + "learning_rate": 4.089212586326049e-05, + "loss": 4.057, + "step": 47202 + }, + { + "epoch": 0.28072961271291275, + "grad_norm": 1.7875769138336182, + "learning_rate": 4.089176528445674e-05, + "loss": 4.1434, + "step": 47203 + }, + { + "epoch": 0.28073555999619376, + "grad_norm": 1.6637324094772339, + "learning_rate": 4.089140470010534e-05, + "loss": 3.9714, + "step": 47204 + }, + { + "epoch": 0.2807415072794747, + "grad_norm": 1.8465543985366821, + "learning_rate": 4.089104411020641e-05, + "loss": 4.1855, + "step": 47205 + }, + { + "epoch": 0.28074745456275574, + "grad_norm": 1.7938817739486694, + "learning_rate": 4.089068351476009e-05, + "loss": 4.3715, + "step": 47206 + }, + { + "epoch": 0.28075340184603675, + "grad_norm": 1.973949670791626, + "learning_rate": 4.089032291376649e-05, + "loss": 3.8153, + "step": 47207 + }, + { + "epoch": 0.2807593491293177, + "grad_norm": 2.193922996520996, + "learning_rate": 4.088996230722574e-05, + "loss": 3.7708, + "step": 47208 + }, + { + "epoch": 0.28076529641259873, + "grad_norm": 1.743018388748169, + "learning_rate": 4.088960169513797e-05, + "loss": 4.4168, + "step": 47209 + }, + { + "epoch": 0.28077124369587975, + "grad_norm": 1.9125360250473022, + "learning_rate": 4.0889241077503305e-05, + "loss": 3.9103, + "step": 47210 + }, + { + "epoch": 0.2807771909791607, + "grad_norm": 2.1659035682678223, + "learning_rate": 4.088888045432187e-05, + "loss": 3.6934, + "step": 47211 + }, + { + "epoch": 0.2807831382624417, + "grad_norm": 1.7337592840194702, + "learning_rate": 4.0888519825593797e-05, + "loss": 4.4458, + "step": 47212 + }, + { + "epoch": 0.28078908554572274, + "grad_norm": 2.0856447219848633, + "learning_rate": 4.08881591913192e-05, + "loss": 3.838, + "step": 47213 + }, + { + "epoch": 0.2807950328290037, + "grad_norm": 1.9061206579208374, + "learning_rate": 4.0887798551498205e-05, + "loss": 4.022, + "step": 47214 + }, + { + "epoch": 0.2808009801122847, + "grad_norm": 1.9515572786331177, + "learning_rate": 4.0887437906130957e-05, + "loss": 4.2975, + "step": 47215 + }, + { + "epoch": 0.28080692739556573, + "grad_norm": 2.102130651473999, + "learning_rate": 4.088707725521756e-05, + "loss": 4.2708, + "step": 47216 + }, + { + "epoch": 0.2808128746788467, + "grad_norm": 1.9537633657455444, + "learning_rate": 4.0886716598758144e-05, + "loss": 4.3313, + "step": 47217 + }, + { + "epoch": 0.2808188219621277, + "grad_norm": 1.621932029724121, + "learning_rate": 4.088635593675285e-05, + "loss": 4.2049, + "step": 47218 + }, + { + "epoch": 0.2808247692454087, + "grad_norm": 1.8037450313568115, + "learning_rate": 4.0885995269201796e-05, + "loss": 4.0068, + "step": 47219 + }, + { + "epoch": 0.2808307165286897, + "grad_norm": 1.7357313632965088, + "learning_rate": 4.0885634596105095e-05, + "loss": 4.055, + "step": 47220 + }, + { + "epoch": 0.2808366638119707, + "grad_norm": 1.961786150932312, + "learning_rate": 4.088527391746288e-05, + "loss": 4.394, + "step": 47221 + }, + { + "epoch": 0.2808426110952517, + "grad_norm": 1.7433089017868042, + "learning_rate": 4.088491323327529e-05, + "loss": 4.2795, + "step": 47222 + }, + { + "epoch": 0.2808485583785327, + "grad_norm": 1.871067762374878, + "learning_rate": 4.088455254354243e-05, + "loss": 4.2872, + "step": 47223 + }, + { + "epoch": 0.2808545056618137, + "grad_norm": 1.5156172513961792, + "learning_rate": 4.088419184826445e-05, + "loss": 4.3835, + "step": 47224 + }, + { + "epoch": 0.2808604529450947, + "grad_norm": 1.7633267641067505, + "learning_rate": 4.0883831147441454e-05, + "loss": 4.2819, + "step": 47225 + }, + { + "epoch": 0.28086640022837567, + "grad_norm": 1.5926241874694824, + "learning_rate": 4.0883470441073576e-05, + "loss": 4.0918, + "step": 47226 + }, + { + "epoch": 0.2808723475116567, + "grad_norm": 1.7551199197769165, + "learning_rate": 4.088310972916095e-05, + "loss": 4.0383, + "step": 47227 + }, + { + "epoch": 0.2808782947949377, + "grad_norm": 1.6689196825027466, + "learning_rate": 4.088274901170369e-05, + "loss": 4.2783, + "step": 47228 + }, + { + "epoch": 0.28088424207821866, + "grad_norm": 1.6118255853652954, + "learning_rate": 4.088238828870193e-05, + "loss": 4.2579, + "step": 47229 + }, + { + "epoch": 0.2808901893614997, + "grad_norm": 1.6812528371810913, + "learning_rate": 4.088202756015579e-05, + "loss": 4.153, + "step": 47230 + }, + { + "epoch": 0.2808961366447807, + "grad_norm": 1.6811459064483643, + "learning_rate": 4.08816668260654e-05, + "loss": 4.5954, + "step": 47231 + }, + { + "epoch": 0.28090208392806165, + "grad_norm": 1.8511337041854858, + "learning_rate": 4.088130608643088e-05, + "loss": 4.2865, + "step": 47232 + }, + { + "epoch": 0.28090803121134267, + "grad_norm": 1.9584579467773438, + "learning_rate": 4.088094534125236e-05, + "loss": 4.0539, + "step": 47233 + }, + { + "epoch": 0.28091397849462363, + "grad_norm": 1.6133073568344116, + "learning_rate": 4.088058459052997e-05, + "loss": 4.4968, + "step": 47234 + }, + { + "epoch": 0.28091992577790464, + "grad_norm": 2.474860429763794, + "learning_rate": 4.088022383426383e-05, + "loss": 4.1675, + "step": 47235 + }, + { + "epoch": 0.28092587306118566, + "grad_norm": 2.6176559925079346, + "learning_rate": 4.087986307245406e-05, + "loss": 3.8149, + "step": 47236 + }, + { + "epoch": 0.2809318203444666, + "grad_norm": 2.4516961574554443, + "learning_rate": 4.0879502305100806e-05, + "loss": 3.8753, + "step": 47237 + }, + { + "epoch": 0.28093776762774764, + "grad_norm": 2.264683485031128, + "learning_rate": 4.087914153220418e-05, + "loss": 3.5731, + "step": 47238 + }, + { + "epoch": 0.28094371491102865, + "grad_norm": 1.9328043460845947, + "learning_rate": 4.0878780753764304e-05, + "loss": 4.2979, + "step": 47239 + }, + { + "epoch": 0.2809496621943096, + "grad_norm": 1.8252702951431274, + "learning_rate": 4.087841996978131e-05, + "loss": 4.0294, + "step": 47240 + }, + { + "epoch": 0.28095560947759063, + "grad_norm": 1.4922966957092285, + "learning_rate": 4.087805918025533e-05, + "loss": 4.4301, + "step": 47241 + }, + { + "epoch": 0.28096155676087164, + "grad_norm": 3.058987855911255, + "learning_rate": 4.087769838518648e-05, + "loss": 2.9094, + "step": 47242 + }, + { + "epoch": 0.2809675040441526, + "grad_norm": 2.5754759311676025, + "learning_rate": 4.0877337584574886e-05, + "loss": 2.6967, + "step": 47243 + }, + { + "epoch": 0.2809734513274336, + "grad_norm": 1.965612530708313, + "learning_rate": 4.0876976778420686e-05, + "loss": 3.8349, + "step": 47244 + }, + { + "epoch": 0.28097939861071464, + "grad_norm": 2.33282208442688, + "learning_rate": 4.0876615966723983e-05, + "loss": 2.4812, + "step": 47245 + }, + { + "epoch": 0.2809853458939956, + "grad_norm": 2.096374750137329, + "learning_rate": 4.0876255149484934e-05, + "loss": 3.268, + "step": 47246 + }, + { + "epoch": 0.2809912931772766, + "grad_norm": 2.279294490814209, + "learning_rate": 4.0875894326703647e-05, + "loss": 3.8633, + "step": 47247 + }, + { + "epoch": 0.28099724046055763, + "grad_norm": 4.7746076583862305, + "learning_rate": 4.0875533498380236e-05, + "loss": 2.4183, + "step": 47248 + }, + { + "epoch": 0.2810031877438386, + "grad_norm": 3.8281641006469727, + "learning_rate": 4.087517266451485e-05, + "loss": 1.9141, + "step": 47249 + }, + { + "epoch": 0.2810091350271196, + "grad_norm": 3.275090456008911, + "learning_rate": 4.08748118251076e-05, + "loss": 1.8188, + "step": 47250 + }, + { + "epoch": 0.2810150823104006, + "grad_norm": 2.7925052642822266, + "learning_rate": 4.087445098015862e-05, + "loss": 2.7061, + "step": 47251 + }, + { + "epoch": 0.2810210295936816, + "grad_norm": 1.9693454504013062, + "learning_rate": 4.087409012966803e-05, + "loss": 4.1923, + "step": 47252 + }, + { + "epoch": 0.2810269768769626, + "grad_norm": 1.8888397216796875, + "learning_rate": 4.0873729273635966e-05, + "loss": 4.0925, + "step": 47253 + }, + { + "epoch": 0.2810329241602436, + "grad_norm": 2.0407581329345703, + "learning_rate": 4.087336841206254e-05, + "loss": 4.4591, + "step": 47254 + }, + { + "epoch": 0.2810388714435246, + "grad_norm": 3.530010938644409, + "learning_rate": 4.0873007544947895e-05, + "loss": 2.8308, + "step": 47255 + }, + { + "epoch": 0.2810448187268056, + "grad_norm": 3.0857248306274414, + "learning_rate": 4.0872646672292135e-05, + "loss": 1.9298, + "step": 47256 + }, + { + "epoch": 0.2810507660100866, + "grad_norm": 3.3241636753082275, + "learning_rate": 4.087228579409541e-05, + "loss": 2.417, + "step": 47257 + }, + { + "epoch": 0.28105671329336757, + "grad_norm": 2.8807156085968018, + "learning_rate": 4.0871924910357825e-05, + "loss": 2.3633, + "step": 47258 + }, + { + "epoch": 0.2810626605766486, + "grad_norm": 3.1834874153137207, + "learning_rate": 4.0871564021079525e-05, + "loss": 4.276, + "step": 47259 + }, + { + "epoch": 0.2810686078599296, + "grad_norm": 2.297001361846924, + "learning_rate": 4.087120312626061e-05, + "loss": 4.6067, + "step": 47260 + }, + { + "epoch": 0.28107455514321056, + "grad_norm": 2.2261440753936768, + "learning_rate": 4.087084222590123e-05, + "loss": 4.5215, + "step": 47261 + }, + { + "epoch": 0.2810805024264916, + "grad_norm": 2.072815418243408, + "learning_rate": 4.087048132000151e-05, + "loss": 4.8391, + "step": 47262 + }, + { + "epoch": 0.2810864497097726, + "grad_norm": 2.2139742374420166, + "learning_rate": 4.087012040856156e-05, + "loss": 4.7244, + "step": 47263 + }, + { + "epoch": 0.28109239699305355, + "grad_norm": 2.055962562561035, + "learning_rate": 4.0869759491581515e-05, + "loss": 4.738, + "step": 47264 + }, + { + "epoch": 0.28109834427633457, + "grad_norm": 1.683387041091919, + "learning_rate": 4.08693985690615e-05, + "loss": 4.8671, + "step": 47265 + }, + { + "epoch": 0.2811042915596156, + "grad_norm": 1.78679621219635, + "learning_rate": 4.086903764100165e-05, + "loss": 4.4712, + "step": 47266 + }, + { + "epoch": 0.28111023884289654, + "grad_norm": 2.0929811000823975, + "learning_rate": 4.086867670740208e-05, + "loss": 4.4947, + "step": 47267 + }, + { + "epoch": 0.28111618612617756, + "grad_norm": 1.5685057640075684, + "learning_rate": 4.086831576826292e-05, + "loss": 4.5784, + "step": 47268 + }, + { + "epoch": 0.2811221334094586, + "grad_norm": 1.8982665538787842, + "learning_rate": 4.086795482358429e-05, + "loss": 4.4608, + "step": 47269 + }, + { + "epoch": 0.28112808069273953, + "grad_norm": 1.648539662361145, + "learning_rate": 4.0867593873366316e-05, + "loss": 4.5548, + "step": 47270 + }, + { + "epoch": 0.28113402797602055, + "grad_norm": 2.091188430786133, + "learning_rate": 4.0867232917609143e-05, + "loss": 4.5171, + "step": 47271 + }, + { + "epoch": 0.28113997525930157, + "grad_norm": 1.6676316261291504, + "learning_rate": 4.086687195631287e-05, + "loss": 4.7306, + "step": 47272 + }, + { + "epoch": 0.2811459225425825, + "grad_norm": 1.5900897979736328, + "learning_rate": 4.0866510989477644e-05, + "loss": 4.8096, + "step": 47273 + }, + { + "epoch": 0.28115186982586354, + "grad_norm": 1.5316306352615356, + "learning_rate": 4.0866150017103576e-05, + "loss": 4.6814, + "step": 47274 + }, + { + "epoch": 0.28115781710914456, + "grad_norm": 1.6015536785125732, + "learning_rate": 4.0865789039190806e-05, + "loss": 4.3938, + "step": 47275 + }, + { + "epoch": 0.2811637643924255, + "grad_norm": 1.4728822708129883, + "learning_rate": 4.086542805573945e-05, + "loss": 4.6388, + "step": 47276 + }, + { + "epoch": 0.28116971167570654, + "grad_norm": 1.8356212377548218, + "learning_rate": 4.086506706674964e-05, + "loss": 4.2767, + "step": 47277 + }, + { + "epoch": 0.28117565895898755, + "grad_norm": 1.7676610946655273, + "learning_rate": 4.086470607222149e-05, + "loss": 4.2902, + "step": 47278 + }, + { + "epoch": 0.2811816062422685, + "grad_norm": 1.5766797065734863, + "learning_rate": 4.086434507215514e-05, + "loss": 4.6048, + "step": 47279 + }, + { + "epoch": 0.2811875535255495, + "grad_norm": 1.4793157577514648, + "learning_rate": 4.086398406655072e-05, + "loss": 4.6859, + "step": 47280 + }, + { + "epoch": 0.28119350080883054, + "grad_norm": 1.508762001991272, + "learning_rate": 4.086362305540834e-05, + "loss": 4.7395, + "step": 47281 + }, + { + "epoch": 0.2811994480921115, + "grad_norm": 2.3166887760162354, + "learning_rate": 4.086326203872814e-05, + "loss": 4.5668, + "step": 47282 + }, + { + "epoch": 0.2812053953753925, + "grad_norm": 1.566911220550537, + "learning_rate": 4.0862901016510225e-05, + "loss": 4.0856, + "step": 47283 + }, + { + "epoch": 0.28121134265867354, + "grad_norm": 1.658430576324463, + "learning_rate": 4.0862539988754736e-05, + "loss": 4.6347, + "step": 47284 + }, + { + "epoch": 0.2812172899419545, + "grad_norm": 1.497441291809082, + "learning_rate": 4.086217895546181e-05, + "loss": 4.6745, + "step": 47285 + }, + { + "epoch": 0.2812232372252355, + "grad_norm": 1.5124742984771729, + "learning_rate": 4.086181791663155e-05, + "loss": 4.8805, + "step": 47286 + }, + { + "epoch": 0.2812291845085165, + "grad_norm": 1.8094483613967896, + "learning_rate": 4.08614568722641e-05, + "loss": 4.0882, + "step": 47287 + }, + { + "epoch": 0.2812351317917975, + "grad_norm": 1.578445315361023, + "learning_rate": 4.086109582235958e-05, + "loss": 4.5994, + "step": 47288 + }, + { + "epoch": 0.2812410790750785, + "grad_norm": 1.4294153451919556, + "learning_rate": 4.086073476691812e-05, + "loss": 4.6115, + "step": 47289 + }, + { + "epoch": 0.2812470263583595, + "grad_norm": 1.7775012254714966, + "learning_rate": 4.086037370593982e-05, + "loss": 5.0035, + "step": 47290 + }, + { + "epoch": 0.2812529736416405, + "grad_norm": 1.923750877380371, + "learning_rate": 4.086001263942485e-05, + "loss": 4.8488, + "step": 47291 + }, + { + "epoch": 0.2812589209249215, + "grad_norm": 1.8511744737625122, + "learning_rate": 4.08596515673733e-05, + "loss": 4.8702, + "step": 47292 + }, + { + "epoch": 0.2812648682082025, + "grad_norm": 1.4959499835968018, + "learning_rate": 4.085929048978532e-05, + "loss": 4.9057, + "step": 47293 + }, + { + "epoch": 0.2812708154914835, + "grad_norm": 1.879653811454773, + "learning_rate": 4.0858929406661015e-05, + "loss": 4.2574, + "step": 47294 + }, + { + "epoch": 0.2812767627747645, + "grad_norm": 1.3351924419403076, + "learning_rate": 4.085856831800054e-05, + "loss": 4.8853, + "step": 47295 + }, + { + "epoch": 0.2812827100580455, + "grad_norm": 1.578827977180481, + "learning_rate": 4.085820722380398e-05, + "loss": 4.4689, + "step": 47296 + }, + { + "epoch": 0.28128865734132646, + "grad_norm": 1.5898901224136353, + "learning_rate": 4.0857846124071495e-05, + "loss": 4.9554, + "step": 47297 + }, + { + "epoch": 0.2812946046246075, + "grad_norm": 1.4547984600067139, + "learning_rate": 4.08574850188032e-05, + "loss": 5.0317, + "step": 47298 + }, + { + "epoch": 0.2813005519078885, + "grad_norm": 1.4859349727630615, + "learning_rate": 4.0857123907999216e-05, + "loss": 5.1511, + "step": 47299 + }, + { + "epoch": 0.28130649919116946, + "grad_norm": 1.7473020553588867, + "learning_rate": 4.0856762791659674e-05, + "loss": 4.1816, + "step": 47300 + }, + { + "epoch": 0.2813124464744505, + "grad_norm": 1.4704173803329468, + "learning_rate": 4.08564016697847e-05, + "loss": 4.3923, + "step": 47301 + }, + { + "epoch": 0.2813183937577315, + "grad_norm": 1.649043083190918, + "learning_rate": 4.0856040542374426e-05, + "loss": 4.5374, + "step": 47302 + }, + { + "epoch": 0.28132434104101245, + "grad_norm": 3.3874402046203613, + "learning_rate": 4.085567940942897e-05, + "loss": 2.0908, + "step": 47303 + }, + { + "epoch": 0.28133028832429346, + "grad_norm": 1.617314338684082, + "learning_rate": 4.085531827094846e-05, + "loss": 4.2429, + "step": 47304 + }, + { + "epoch": 0.2813362356075745, + "grad_norm": 1.995607852935791, + "learning_rate": 4.085495712693303e-05, + "loss": 4.7152, + "step": 47305 + }, + { + "epoch": 0.28134218289085544, + "grad_norm": 1.9943618774414062, + "learning_rate": 4.085459597738278e-05, + "loss": 4.247, + "step": 47306 + }, + { + "epoch": 0.28134813017413646, + "grad_norm": 1.746886968612671, + "learning_rate": 4.085423482229786e-05, + "loss": 3.6929, + "step": 47307 + }, + { + "epoch": 0.2813540774574175, + "grad_norm": 1.542462944984436, + "learning_rate": 4.08538736616784e-05, + "loss": 4.5199, + "step": 47308 + }, + { + "epoch": 0.28136002474069843, + "grad_norm": 1.8032737970352173, + "learning_rate": 4.0853512495524504e-05, + "loss": 4.0749, + "step": 47309 + }, + { + "epoch": 0.28136597202397945, + "grad_norm": 1.6474323272705078, + "learning_rate": 4.0853151323836325e-05, + "loss": 4.487, + "step": 47310 + }, + { + "epoch": 0.28137191930726047, + "grad_norm": 1.587638258934021, + "learning_rate": 4.0852790146613965e-05, + "loss": 4.386, + "step": 47311 + }, + { + "epoch": 0.2813778665905414, + "grad_norm": 1.6261966228485107, + "learning_rate": 4.085242896385756e-05, + "loss": 4.8422, + "step": 47312 + }, + { + "epoch": 0.28138381387382244, + "grad_norm": 1.771388053894043, + "learning_rate": 4.085206777556724e-05, + "loss": 4.0728, + "step": 47313 + }, + { + "epoch": 0.28138976115710346, + "grad_norm": 1.497247338294983, + "learning_rate": 4.085170658174312e-05, + "loss": 4.8768, + "step": 47314 + }, + { + "epoch": 0.2813957084403844, + "grad_norm": 1.4101917743682861, + "learning_rate": 4.0851345382385336e-05, + "loss": 5.1174, + "step": 47315 + }, + { + "epoch": 0.28140165572366543, + "grad_norm": 1.2000230550765991, + "learning_rate": 4.0850984177494015e-05, + "loss": 5.2766, + "step": 47316 + }, + { + "epoch": 0.28140760300694645, + "grad_norm": 2.2356553077697754, + "learning_rate": 4.085062296706927e-05, + "loss": 4.182, + "step": 47317 + }, + { + "epoch": 0.2814135502902274, + "grad_norm": 1.9959880113601685, + "learning_rate": 4.0850261751111254e-05, + "loss": 3.768, + "step": 47318 + }, + { + "epoch": 0.2814194975735084, + "grad_norm": 2.797088861465454, + "learning_rate": 4.084990052962006e-05, + "loss": 2.0523, + "step": 47319 + }, + { + "epoch": 0.28142544485678944, + "grad_norm": 2.2386162281036377, + "learning_rate": 4.084953930259583e-05, + "loss": 2.7886, + "step": 47320 + }, + { + "epoch": 0.2814313921400704, + "grad_norm": 2.508023262023926, + "learning_rate": 4.0849178070038696e-05, + "loss": 2.9092, + "step": 47321 + }, + { + "epoch": 0.2814373394233514, + "grad_norm": 1.544407844543457, + "learning_rate": 4.084881683194878e-05, + "loss": 4.4518, + "step": 47322 + }, + { + "epoch": 0.28144328670663243, + "grad_norm": 1.512466311454773, + "learning_rate": 4.084845558832619e-05, + "loss": 4.3317, + "step": 47323 + }, + { + "epoch": 0.2814492339899134, + "grad_norm": 2.2144713401794434, + "learning_rate": 4.0848094339171084e-05, + "loss": 3.8889, + "step": 47324 + }, + { + "epoch": 0.2814551812731944, + "grad_norm": 2.453030824661255, + "learning_rate": 4.084773308448357e-05, + "loss": 3.5672, + "step": 47325 + }, + { + "epoch": 0.2814611285564754, + "grad_norm": 2.457789897918701, + "learning_rate": 4.084737182426377e-05, + "loss": 3.6147, + "step": 47326 + }, + { + "epoch": 0.2814670758397564, + "grad_norm": 2.354918956756592, + "learning_rate": 4.084701055851182e-05, + "loss": 3.2712, + "step": 47327 + }, + { + "epoch": 0.2814730231230374, + "grad_norm": 2.401935338973999, + "learning_rate": 4.084664928722784e-05, + "loss": 2.6574, + "step": 47328 + }, + { + "epoch": 0.2814789704063184, + "grad_norm": 2.061920166015625, + "learning_rate": 4.084628801041196e-05, + "loss": 3.5752, + "step": 47329 + }, + { + "epoch": 0.2814849176895994, + "grad_norm": 2.106905698776245, + "learning_rate": 4.08459267280643e-05, + "loss": 4.038, + "step": 47330 + }, + { + "epoch": 0.2814908649728804, + "grad_norm": 2.3884546756744385, + "learning_rate": 4.0845565440184996e-05, + "loss": 3.7244, + "step": 47331 + }, + { + "epoch": 0.2814968122561614, + "grad_norm": 2.587991237640381, + "learning_rate": 4.084520414677417e-05, + "loss": 3.8196, + "step": 47332 + }, + { + "epoch": 0.28150275953944237, + "grad_norm": 2.2764925956726074, + "learning_rate": 4.0844842847831944e-05, + "loss": 3.3657, + "step": 47333 + }, + { + "epoch": 0.2815087068227234, + "grad_norm": 2.2095186710357666, + "learning_rate": 4.084448154335845e-05, + "loss": 3.1253, + "step": 47334 + }, + { + "epoch": 0.2815146541060044, + "grad_norm": 2.230058431625366, + "learning_rate": 4.084412023335381e-05, + "loss": 3.8999, + "step": 47335 + }, + { + "epoch": 0.28152060138928536, + "grad_norm": 1.8058844804763794, + "learning_rate": 4.084375891781815e-05, + "loss": 4.0255, + "step": 47336 + }, + { + "epoch": 0.2815265486725664, + "grad_norm": 1.9116401672363281, + "learning_rate": 4.08433975967516e-05, + "loss": 3.9503, + "step": 47337 + }, + { + "epoch": 0.2815324959558474, + "grad_norm": 1.9640111923217773, + "learning_rate": 4.084303627015428e-05, + "loss": 3.7228, + "step": 47338 + }, + { + "epoch": 0.28153844323912836, + "grad_norm": 1.937958836555481, + "learning_rate": 4.084267493802631e-05, + "loss": 3.8159, + "step": 47339 + }, + { + "epoch": 0.28154439052240937, + "grad_norm": 2.3433501720428467, + "learning_rate": 4.084231360036784e-05, + "loss": 3.8514, + "step": 47340 + }, + { + "epoch": 0.2815503378056904, + "grad_norm": 1.5947366952896118, + "learning_rate": 4.0841952257178984e-05, + "loss": 4.2627, + "step": 47341 + }, + { + "epoch": 0.28155628508897135, + "grad_norm": 1.6954115629196167, + "learning_rate": 4.0841590908459856e-05, + "loss": 4.382, + "step": 47342 + }, + { + "epoch": 0.28156223237225236, + "grad_norm": 1.5168561935424805, + "learning_rate": 4.0841229554210605e-05, + "loss": 4.3565, + "step": 47343 + }, + { + "epoch": 0.2815681796555334, + "grad_norm": 1.8132891654968262, + "learning_rate": 4.0840868194431334e-05, + "loss": 4.5551, + "step": 47344 + }, + { + "epoch": 0.28157412693881434, + "grad_norm": 1.4330682754516602, + "learning_rate": 4.0840506829122184e-05, + "loss": 4.6737, + "step": 47345 + }, + { + "epoch": 0.28158007422209536, + "grad_norm": 2.04805064201355, + "learning_rate": 4.084014545828327e-05, + "loss": 4.5559, + "step": 47346 + }, + { + "epoch": 0.28158602150537637, + "grad_norm": 1.7776272296905518, + "learning_rate": 4.083978408191473e-05, + "loss": 4.1902, + "step": 47347 + }, + { + "epoch": 0.28159196878865733, + "grad_norm": 1.5903432369232178, + "learning_rate": 4.0839422700016685e-05, + "loss": 4.0466, + "step": 47348 + }, + { + "epoch": 0.28159791607193835, + "grad_norm": 1.5803189277648926, + "learning_rate": 4.083906131258927e-05, + "loss": 4.6029, + "step": 47349 + }, + { + "epoch": 0.2816038633552193, + "grad_norm": 1.4004967212677002, + "learning_rate": 4.0838699919632585e-05, + "loss": 5.4753, + "step": 47350 + }, + { + "epoch": 0.2816098106385003, + "grad_norm": 1.5121500492095947, + "learning_rate": 4.083833852114679e-05, + "loss": 5.3615, + "step": 47351 + }, + { + "epoch": 0.28161575792178134, + "grad_norm": 1.30703866481781, + "learning_rate": 4.083797711713198e-05, + "loss": 5.0388, + "step": 47352 + }, + { + "epoch": 0.2816217052050623, + "grad_norm": 1.3124752044677734, + "learning_rate": 4.08376157075883e-05, + "loss": 4.8961, + "step": 47353 + }, + { + "epoch": 0.2816276524883433, + "grad_norm": 1.9672884941101074, + "learning_rate": 4.083725429251588e-05, + "loss": 4.8757, + "step": 47354 + }, + { + "epoch": 0.28163359977162433, + "grad_norm": 1.6715421676635742, + "learning_rate": 4.083689287191483e-05, + "loss": 4.8345, + "step": 47355 + }, + { + "epoch": 0.2816395470549053, + "grad_norm": 1.605924129486084, + "learning_rate": 4.083653144578529e-05, + "loss": 4.1908, + "step": 47356 + }, + { + "epoch": 0.2816454943381863, + "grad_norm": 1.5605125427246094, + "learning_rate": 4.0836170014127376e-05, + "loss": 4.6701, + "step": 47357 + }, + { + "epoch": 0.2816514416214673, + "grad_norm": 1.2818939685821533, + "learning_rate": 4.083580857694122e-05, + "loss": 4.993, + "step": 47358 + }, + { + "epoch": 0.2816573889047483, + "grad_norm": 1.110399603843689, + "learning_rate": 4.083544713422695e-05, + "loss": 4.8738, + "step": 47359 + }, + { + "epoch": 0.2816633361880293, + "grad_norm": 1.349104404449463, + "learning_rate": 4.0835085685984686e-05, + "loss": 4.7912, + "step": 47360 + }, + { + "epoch": 0.2816692834713103, + "grad_norm": 1.2373125553131104, + "learning_rate": 4.0834724232214546e-05, + "loss": 4.6935, + "step": 47361 + }, + { + "epoch": 0.2816752307545913, + "grad_norm": 0.8535304069519043, + "learning_rate": 4.083436277291668e-05, + "loss": 4.893, + "step": 47362 + }, + { + "epoch": 0.2816811780378723, + "grad_norm": 1.9414591789245605, + "learning_rate": 4.0834001308091204e-05, + "loss": 4.6455, + "step": 47363 + }, + { + "epoch": 0.2816871253211533, + "grad_norm": 1.581627607345581, + "learning_rate": 4.083363983773823e-05, + "loss": 4.3426, + "step": 47364 + }, + { + "epoch": 0.28169307260443427, + "grad_norm": 1.3901536464691162, + "learning_rate": 4.083327836185791e-05, + "loss": 5.0627, + "step": 47365 + }, + { + "epoch": 0.2816990198877153, + "grad_norm": 1.8401137590408325, + "learning_rate": 4.083291688045036e-05, + "loss": 4.488, + "step": 47366 + }, + { + "epoch": 0.2817049671709963, + "grad_norm": 1.5578840970993042, + "learning_rate": 4.083255539351568e-05, + "loss": 4.7287, + "step": 47367 + }, + { + "epoch": 0.28171091445427726, + "grad_norm": 2.0349373817443848, + "learning_rate": 4.083219390105404e-05, + "loss": 4.107, + "step": 47368 + }, + { + "epoch": 0.2817168617375583, + "grad_norm": 1.4486163854599, + "learning_rate": 4.0831832403065526e-05, + "loss": 4.7771, + "step": 47369 + }, + { + "epoch": 0.2817228090208393, + "grad_norm": 1.376185655593872, + "learning_rate": 4.083147089955029e-05, + "loss": 4.9326, + "step": 47370 + }, + { + "epoch": 0.28172875630412025, + "grad_norm": 1.3821784257888794, + "learning_rate": 4.083110939050846e-05, + "loss": 4.7613, + "step": 47371 + }, + { + "epoch": 0.28173470358740127, + "grad_norm": 1.601438283920288, + "learning_rate": 4.0830747875940146e-05, + "loss": 4.3466, + "step": 47372 + }, + { + "epoch": 0.2817406508706823, + "grad_norm": 1.4964946508407593, + "learning_rate": 4.083038635584548e-05, + "loss": 3.9288, + "step": 47373 + }, + { + "epoch": 0.28174659815396325, + "grad_norm": 1.438826322555542, + "learning_rate": 4.083002483022459e-05, + "loss": 3.8949, + "step": 47374 + }, + { + "epoch": 0.28175254543724426, + "grad_norm": 1.3209664821624756, + "learning_rate": 4.08296632990776e-05, + "loss": 4.9456, + "step": 47375 + }, + { + "epoch": 0.2817584927205253, + "grad_norm": 1.45292329788208, + "learning_rate": 4.082930176240465e-05, + "loss": 4.2194, + "step": 47376 + }, + { + "epoch": 0.28176444000380624, + "grad_norm": 2.0271778106689453, + "learning_rate": 4.0828940220205846e-05, + "loss": 4.5282, + "step": 47377 + }, + { + "epoch": 0.28177038728708725, + "grad_norm": 1.6724679470062256, + "learning_rate": 4.082857867248132e-05, + "loss": 5.2486, + "step": 47378 + }, + { + "epoch": 0.28177633457036827, + "grad_norm": 1.4166179895401, + "learning_rate": 4.082821711923121e-05, + "loss": 5.1614, + "step": 47379 + }, + { + "epoch": 0.28178228185364923, + "grad_norm": 1.3614258766174316, + "learning_rate": 4.082785556045562e-05, + "loss": 4.9381, + "step": 47380 + }, + { + "epoch": 0.28178822913693025, + "grad_norm": 1.4493699073791504, + "learning_rate": 4.08274939961547e-05, + "loss": 4.5991, + "step": 47381 + }, + { + "epoch": 0.28179417642021126, + "grad_norm": 1.642398476600647, + "learning_rate": 4.082713242632855e-05, + "loss": 4.418, + "step": 47382 + }, + { + "epoch": 0.2818001237034922, + "grad_norm": 1.754624843597412, + "learning_rate": 4.082677085097733e-05, + "loss": 4.1881, + "step": 47383 + }, + { + "epoch": 0.28180607098677324, + "grad_norm": 1.7301214933395386, + "learning_rate": 4.082640927010113e-05, + "loss": 3.9547, + "step": 47384 + }, + { + "epoch": 0.28181201827005425, + "grad_norm": 1.6082539558410645, + "learning_rate": 4.0826047683700106e-05, + "loss": 4.2762, + "step": 47385 + }, + { + "epoch": 0.2818179655533352, + "grad_norm": 1.485518455505371, + "learning_rate": 4.082568609177437e-05, + "loss": 4.4157, + "step": 47386 + }, + { + "epoch": 0.28182391283661623, + "grad_norm": 1.650743842124939, + "learning_rate": 4.082532449432406e-05, + "loss": 4.6734, + "step": 47387 + }, + { + "epoch": 0.28182986011989725, + "grad_norm": 1.3846533298492432, + "learning_rate": 4.0824962891349276e-05, + "loss": 4.8395, + "step": 47388 + }, + { + "epoch": 0.2818358074031782, + "grad_norm": 1.544817566871643, + "learning_rate": 4.082460128285017e-05, + "loss": 4.7653, + "step": 47389 + }, + { + "epoch": 0.2818417546864592, + "grad_norm": 1.5076045989990234, + "learning_rate": 4.082423966882686e-05, + "loss": 5.3593, + "step": 47390 + }, + { + "epoch": 0.28184770196974024, + "grad_norm": 1.4366170167922974, + "learning_rate": 4.082387804927946e-05, + "loss": 5.018, + "step": 47391 + }, + { + "epoch": 0.2818536492530212, + "grad_norm": 1.4230140447616577, + "learning_rate": 4.082351642420812e-05, + "loss": 4.7313, + "step": 47392 + }, + { + "epoch": 0.2818595965363022, + "grad_norm": 1.475759506225586, + "learning_rate": 4.082315479361296e-05, + "loss": 4.4073, + "step": 47393 + }, + { + "epoch": 0.28186554381958323, + "grad_norm": 1.5621720552444458, + "learning_rate": 4.082279315749408e-05, + "loss": 4.4642, + "step": 47394 + }, + { + "epoch": 0.2818714911028642, + "grad_norm": 1.3348262310028076, + "learning_rate": 4.082243151585164e-05, + "loss": 4.5524, + "step": 47395 + }, + { + "epoch": 0.2818774383861452, + "grad_norm": 1.4885952472686768, + "learning_rate": 4.082206986868575e-05, + "loss": 4.2698, + "step": 47396 + }, + { + "epoch": 0.2818833856694262, + "grad_norm": 1.261887550354004, + "learning_rate": 4.082170821599654e-05, + "loss": 4.2642, + "step": 47397 + }, + { + "epoch": 0.2818893329527072, + "grad_norm": 1.769086241722107, + "learning_rate": 4.082134655778413e-05, + "loss": 3.9851, + "step": 47398 + }, + { + "epoch": 0.2818952802359882, + "grad_norm": 1.5621566772460938, + "learning_rate": 4.082098489404866e-05, + "loss": 4.2613, + "step": 47399 + }, + { + "epoch": 0.2819012275192692, + "grad_norm": 1.7064701318740845, + "learning_rate": 4.082062322479024e-05, + "loss": 4.1101, + "step": 47400 + }, + { + "epoch": 0.2819071748025502, + "grad_norm": 1.6102428436279297, + "learning_rate": 4.0820261550009006e-05, + "loss": 4.3667, + "step": 47401 + }, + { + "epoch": 0.2819131220858312, + "grad_norm": 1.637094497680664, + "learning_rate": 4.081989986970508e-05, + "loss": 3.9559, + "step": 47402 + }, + { + "epoch": 0.2819190693691122, + "grad_norm": 1.574638843536377, + "learning_rate": 4.081953818387859e-05, + "loss": 4.2375, + "step": 47403 + }, + { + "epoch": 0.28192501665239317, + "grad_norm": 1.4759814739227295, + "learning_rate": 4.081917649252967e-05, + "loss": 4.277, + "step": 47404 + }, + { + "epoch": 0.2819309639356742, + "grad_norm": 1.3327956199645996, + "learning_rate": 4.0818814795658435e-05, + "loss": 4.3362, + "step": 47405 + }, + { + "epoch": 0.2819369112189552, + "grad_norm": 1.6142746210098267, + "learning_rate": 4.0818453093265016e-05, + "loss": 3.8015, + "step": 47406 + }, + { + "epoch": 0.28194285850223616, + "grad_norm": 1.659014105796814, + "learning_rate": 4.081809138534953e-05, + "loss": 4.2551, + "step": 47407 + }, + { + "epoch": 0.2819488057855172, + "grad_norm": 1.4951667785644531, + "learning_rate": 4.0817729671912124e-05, + "loss": 4.2635, + "step": 47408 + }, + { + "epoch": 0.2819547530687982, + "grad_norm": 1.4563647508621216, + "learning_rate": 4.081736795295291e-05, + "loss": 4.1968, + "step": 47409 + }, + { + "epoch": 0.28196070035207915, + "grad_norm": 1.5012794733047485, + "learning_rate": 4.081700622847201e-05, + "loss": 4.3888, + "step": 47410 + }, + { + "epoch": 0.28196664763536017, + "grad_norm": 1.61269211769104, + "learning_rate": 4.081664449846956e-05, + "loss": 4.1357, + "step": 47411 + }, + { + "epoch": 0.2819725949186412, + "grad_norm": 1.433693766593933, + "learning_rate": 4.081628276294568e-05, + "loss": 3.8719, + "step": 47412 + }, + { + "epoch": 0.28197854220192214, + "grad_norm": 1.583248496055603, + "learning_rate": 4.0815921021900505e-05, + "loss": 4.0159, + "step": 47413 + }, + { + "epoch": 0.28198448948520316, + "grad_norm": 1.60663640499115, + "learning_rate": 4.0815559275334155e-05, + "loss": 3.9481, + "step": 47414 + }, + { + "epoch": 0.2819904367684842, + "grad_norm": 1.5763381719589233, + "learning_rate": 4.081519752324675e-05, + "loss": 4.2469, + "step": 47415 + }, + { + "epoch": 0.28199638405176514, + "grad_norm": 1.9761042594909668, + "learning_rate": 4.0814835765638424e-05, + "loss": 4.3445, + "step": 47416 + }, + { + "epoch": 0.28200233133504615, + "grad_norm": 1.4668209552764893, + "learning_rate": 4.081447400250931e-05, + "loss": 4.7806, + "step": 47417 + }, + { + "epoch": 0.28200827861832717, + "grad_norm": 1.5351897478103638, + "learning_rate": 4.0814112233859514e-05, + "loss": 4.9078, + "step": 47418 + }, + { + "epoch": 0.28201422590160813, + "grad_norm": 1.6876137256622314, + "learning_rate": 4.081375045968918e-05, + "loss": 4.6465, + "step": 47419 + }, + { + "epoch": 0.28202017318488914, + "grad_norm": 1.620896816253662, + "learning_rate": 4.0813388679998435e-05, + "loss": 4.997, + "step": 47420 + }, + { + "epoch": 0.28202612046817016, + "grad_norm": 1.774833083152771, + "learning_rate": 4.08130268947874e-05, + "loss": 4.3305, + "step": 47421 + }, + { + "epoch": 0.2820320677514511, + "grad_norm": 1.8777393102645874, + "learning_rate": 4.081266510405619e-05, + "loss": 3.6707, + "step": 47422 + }, + { + "epoch": 0.28203801503473214, + "grad_norm": 1.7452644109725952, + "learning_rate": 4.081230330780495e-05, + "loss": 3.6401, + "step": 47423 + }, + { + "epoch": 0.28204396231801315, + "grad_norm": 1.6673654317855835, + "learning_rate": 4.08119415060338e-05, + "loss": 3.5792, + "step": 47424 + }, + { + "epoch": 0.2820499096012941, + "grad_norm": 1.6181286573410034, + "learning_rate": 4.081157969874286e-05, + "loss": 3.5357, + "step": 47425 + }, + { + "epoch": 0.28205585688457513, + "grad_norm": 1.5318615436553955, + "learning_rate": 4.0811217885932264e-05, + "loss": 4.5692, + "step": 47426 + }, + { + "epoch": 0.28206180416785614, + "grad_norm": 1.6641347408294678, + "learning_rate": 4.081085606760213e-05, + "loss": 4.2093, + "step": 47427 + }, + { + "epoch": 0.2820677514511371, + "grad_norm": 1.6612149477005005, + "learning_rate": 4.08104942437526e-05, + "loss": 3.6796, + "step": 47428 + }, + { + "epoch": 0.2820736987344181, + "grad_norm": 1.4429556131362915, + "learning_rate": 4.081013241438377e-05, + "loss": 3.7837, + "step": 47429 + }, + { + "epoch": 0.28207964601769914, + "grad_norm": 2.2900681495666504, + "learning_rate": 4.08097705794958e-05, + "loss": 4.4112, + "step": 47430 + }, + { + "epoch": 0.2820855933009801, + "grad_norm": 1.597537636756897, + "learning_rate": 4.080940873908881e-05, + "loss": 3.5039, + "step": 47431 + }, + { + "epoch": 0.2820915405842611, + "grad_norm": 1.5627999305725098, + "learning_rate": 4.0809046893162905e-05, + "loss": 3.4949, + "step": 47432 + }, + { + "epoch": 0.28209748786754213, + "grad_norm": 1.6882301568984985, + "learning_rate": 4.080868504171823e-05, + "loss": 3.5044, + "step": 47433 + }, + { + "epoch": 0.2821034351508231, + "grad_norm": 1.5515598058700562, + "learning_rate": 4.0808323184754904e-05, + "loss": 3.6642, + "step": 47434 + }, + { + "epoch": 0.2821093824341041, + "grad_norm": 1.7195653915405273, + "learning_rate": 4.080796132227306e-05, + "loss": 3.5561, + "step": 47435 + }, + { + "epoch": 0.2821153297173851, + "grad_norm": 1.8383060693740845, + "learning_rate": 4.080759945427282e-05, + "loss": 3.3585, + "step": 47436 + }, + { + "epoch": 0.2821212770006661, + "grad_norm": 1.6634877920150757, + "learning_rate": 4.08072375807543e-05, + "loss": 3.6546, + "step": 47437 + }, + { + "epoch": 0.2821272242839471, + "grad_norm": 1.614262580871582, + "learning_rate": 4.080687570171765e-05, + "loss": 3.4458, + "step": 47438 + }, + { + "epoch": 0.2821331715672281, + "grad_norm": 1.5854287147521973, + "learning_rate": 4.080651381716298e-05, + "loss": 4.0286, + "step": 47439 + }, + { + "epoch": 0.2821391188505091, + "grad_norm": 1.7223767042160034, + "learning_rate": 4.080615192709041e-05, + "loss": 3.367, + "step": 47440 + }, + { + "epoch": 0.2821450661337901, + "grad_norm": 1.751330018043518, + "learning_rate": 4.080579003150009e-05, + "loss": 3.6558, + "step": 47441 + }, + { + "epoch": 0.2821510134170711, + "grad_norm": 1.4757399559020996, + "learning_rate": 4.080542813039212e-05, + "loss": 3.5235, + "step": 47442 + }, + { + "epoch": 0.28215696070035207, + "grad_norm": 1.5239791870117188, + "learning_rate": 4.0805066223766644e-05, + "loss": 3.3525, + "step": 47443 + }, + { + "epoch": 0.2821629079836331, + "grad_norm": 1.4980828762054443, + "learning_rate": 4.0804704311623784e-05, + "loss": 3.4917, + "step": 47444 + }, + { + "epoch": 0.2821688552669141, + "grad_norm": 1.5913621187210083, + "learning_rate": 4.080434239396366e-05, + "loss": 3.3768, + "step": 47445 + }, + { + "epoch": 0.28217480255019506, + "grad_norm": 1.7235801219940186, + "learning_rate": 4.0803980470786404e-05, + "loss": 3.2569, + "step": 47446 + }, + { + "epoch": 0.2821807498334761, + "grad_norm": 1.6945123672485352, + "learning_rate": 4.080361854209215e-05, + "loss": 3.378, + "step": 47447 + }, + { + "epoch": 0.2821866971167571, + "grad_norm": 1.6431406736373901, + "learning_rate": 4.080325660788101e-05, + "loss": 3.5579, + "step": 47448 + }, + { + "epoch": 0.28219264440003805, + "grad_norm": 1.6484049558639526, + "learning_rate": 4.080289466815311e-05, + "loss": 3.1605, + "step": 47449 + }, + { + "epoch": 0.28219859168331907, + "grad_norm": 1.6851341724395752, + "learning_rate": 4.080253272290859e-05, + "loss": 3.5141, + "step": 47450 + }, + { + "epoch": 0.2822045389666001, + "grad_norm": 1.719098448753357, + "learning_rate": 4.0802170772147576e-05, + "loss": 4.5958, + "step": 47451 + }, + { + "epoch": 0.28221048624988104, + "grad_norm": 1.4291635751724243, + "learning_rate": 4.080180881587017e-05, + "loss": 5.3952, + "step": 47452 + }, + { + "epoch": 0.28221643353316206, + "grad_norm": 1.6204155683517456, + "learning_rate": 4.080144685407653e-05, + "loss": 4.1902, + "step": 47453 + }, + { + "epoch": 0.2822223808164431, + "grad_norm": 1.854830265045166, + "learning_rate": 4.080108488676677e-05, + "loss": 3.4148, + "step": 47454 + }, + { + "epoch": 0.28222832809972404, + "grad_norm": 1.8991382122039795, + "learning_rate": 4.0800722913941005e-05, + "loss": 3.4414, + "step": 47455 + }, + { + "epoch": 0.28223427538300505, + "grad_norm": 1.7827924489974976, + "learning_rate": 4.080036093559937e-05, + "loss": 3.5917, + "step": 47456 + }, + { + "epoch": 0.28224022266628607, + "grad_norm": 2.650742530822754, + "learning_rate": 4.0799998951742005e-05, + "loss": 3.9352, + "step": 47457 + }, + { + "epoch": 0.282246169949567, + "grad_norm": 2.4329490661621094, + "learning_rate": 4.0799636962369006e-05, + "loss": 3.6686, + "step": 47458 + }, + { + "epoch": 0.28225211723284804, + "grad_norm": 2.228701114654541, + "learning_rate": 4.079927496748053e-05, + "loss": 4.1118, + "step": 47459 + }, + { + "epoch": 0.28225806451612906, + "grad_norm": 2.1161670684814453, + "learning_rate": 4.0798912967076685e-05, + "loss": 3.9588, + "step": 47460 + }, + { + "epoch": 0.28226401179941, + "grad_norm": 1.375678539276123, + "learning_rate": 4.07985509611576e-05, + "loss": 5.4283, + "step": 47461 + }, + { + "epoch": 0.28226995908269104, + "grad_norm": 1.719685435295105, + "learning_rate": 4.079818894972341e-05, + "loss": 4.8361, + "step": 47462 + }, + { + "epoch": 0.28227590636597205, + "grad_norm": 1.5346896648406982, + "learning_rate": 4.079782693277423e-05, + "loss": 4.8431, + "step": 47463 + }, + { + "epoch": 0.282281853649253, + "grad_norm": 1.6860010623931885, + "learning_rate": 4.07974649103102e-05, + "loss": 4.8748, + "step": 47464 + }, + { + "epoch": 0.282287800932534, + "grad_norm": 2.5376245975494385, + "learning_rate": 4.0797102882331434e-05, + "loss": 3.5878, + "step": 47465 + }, + { + "epoch": 0.282293748215815, + "grad_norm": 2.483022451400757, + "learning_rate": 4.079674084883806e-05, + "loss": 3.4547, + "step": 47466 + }, + { + "epoch": 0.282299695499096, + "grad_norm": 2.379741668701172, + "learning_rate": 4.079637880983021e-05, + "loss": 3.2707, + "step": 47467 + }, + { + "epoch": 0.282305642782377, + "grad_norm": 1.5518743991851807, + "learning_rate": 4.079601676530801e-05, + "loss": 5.0243, + "step": 47468 + }, + { + "epoch": 0.282311590065658, + "grad_norm": 1.3467580080032349, + "learning_rate": 4.079565471527157e-05, + "loss": 5.2271, + "step": 47469 + }, + { + "epoch": 0.282317537348939, + "grad_norm": 1.4930405616760254, + "learning_rate": 4.0795292659721054e-05, + "loss": 4.9831, + "step": 47470 + }, + { + "epoch": 0.28232348463222, + "grad_norm": 1.5074878931045532, + "learning_rate": 4.079493059865654e-05, + "loss": 4.921, + "step": 47471 + }, + { + "epoch": 0.282329431915501, + "grad_norm": 1.426884412765503, + "learning_rate": 4.079456853207819e-05, + "loss": 4.9208, + "step": 47472 + }, + { + "epoch": 0.282335379198782, + "grad_norm": 1.376688003540039, + "learning_rate": 4.079420645998612e-05, + "loss": 4.9093, + "step": 47473 + }, + { + "epoch": 0.282341326482063, + "grad_norm": 1.497398853302002, + "learning_rate": 4.079384438238045e-05, + "loss": 4.775, + "step": 47474 + }, + { + "epoch": 0.28234727376534396, + "grad_norm": 1.5201846361160278, + "learning_rate": 4.079348229926132e-05, + "loss": 4.8758, + "step": 47475 + }, + { + "epoch": 0.282353221048625, + "grad_norm": 1.2550804615020752, + "learning_rate": 4.0793120210628846e-05, + "loss": 4.7156, + "step": 47476 + }, + { + "epoch": 0.282359168331906, + "grad_norm": 1.3847006559371948, + "learning_rate": 4.0792758116483156e-05, + "loss": 4.5503, + "step": 47477 + }, + { + "epoch": 0.28236511561518696, + "grad_norm": 2.1183295249938965, + "learning_rate": 4.079239601682437e-05, + "loss": 4.4284, + "step": 47478 + }, + { + "epoch": 0.282371062898468, + "grad_norm": 1.462019681930542, + "learning_rate": 4.079203391165264e-05, + "loss": 4.5765, + "step": 47479 + }, + { + "epoch": 0.282377010181749, + "grad_norm": 1.650137186050415, + "learning_rate": 4.079167180096806e-05, + "loss": 4.8076, + "step": 47480 + }, + { + "epoch": 0.28238295746502995, + "grad_norm": 1.658308982849121, + "learning_rate": 4.079130968477077e-05, + "loss": 4.8066, + "step": 47481 + }, + { + "epoch": 0.28238890474831096, + "grad_norm": 1.6171408891677856, + "learning_rate": 4.079094756306091e-05, + "loss": 4.2277, + "step": 47482 + }, + { + "epoch": 0.282394852031592, + "grad_norm": 1.2946069240570068, + "learning_rate": 4.079058543583858e-05, + "loss": 4.3982, + "step": 47483 + }, + { + "epoch": 0.28240079931487294, + "grad_norm": 1.5102818012237549, + "learning_rate": 4.079022330310393e-05, + "loss": 5.2011, + "step": 47484 + }, + { + "epoch": 0.28240674659815396, + "grad_norm": 1.6274359226226807, + "learning_rate": 4.078986116485707e-05, + "loss": 4.5199, + "step": 47485 + }, + { + "epoch": 0.282412693881435, + "grad_norm": 1.406627893447876, + "learning_rate": 4.0789499021098124e-05, + "loss": 4.764, + "step": 47486 + }, + { + "epoch": 0.28241864116471593, + "grad_norm": 1.5968959331512451, + "learning_rate": 4.078913687182724e-05, + "loss": 4.8404, + "step": 47487 + }, + { + "epoch": 0.28242458844799695, + "grad_norm": 1.57871675491333, + "learning_rate": 4.078877471704453e-05, + "loss": 4.7607, + "step": 47488 + }, + { + "epoch": 0.28243053573127797, + "grad_norm": 2.0947883129119873, + "learning_rate": 4.078841255675011e-05, + "loss": 4.3768, + "step": 47489 + }, + { + "epoch": 0.2824364830145589, + "grad_norm": 1.9491031169891357, + "learning_rate": 4.078805039094413e-05, + "loss": 4.2781, + "step": 47490 + }, + { + "epoch": 0.28244243029783994, + "grad_norm": 1.4361661672592163, + "learning_rate": 4.078768821962671e-05, + "loss": 4.7171, + "step": 47491 + }, + { + "epoch": 0.28244837758112096, + "grad_norm": 1.8000178337097168, + "learning_rate": 4.078732604279796e-05, + "loss": 4.5434, + "step": 47492 + }, + { + "epoch": 0.2824543248644019, + "grad_norm": 1.3927206993103027, + "learning_rate": 4.078696386045802e-05, + "loss": 4.6212, + "step": 47493 + }, + { + "epoch": 0.28246027214768293, + "grad_norm": 1.577601671218872, + "learning_rate": 4.078660167260702e-05, + "loss": 4.6999, + "step": 47494 + }, + { + "epoch": 0.28246621943096395, + "grad_norm": 1.3811376094818115, + "learning_rate": 4.078623947924506e-05, + "loss": 4.7157, + "step": 47495 + }, + { + "epoch": 0.2824721667142449, + "grad_norm": 1.3561420440673828, + "learning_rate": 4.0785877280372306e-05, + "loss": 4.7174, + "step": 47496 + }, + { + "epoch": 0.2824781139975259, + "grad_norm": 1.6293087005615234, + "learning_rate": 4.078551507598887e-05, + "loss": 4.7259, + "step": 47497 + }, + { + "epoch": 0.28248406128080694, + "grad_norm": 1.52298104763031, + "learning_rate": 4.0785152866094854e-05, + "loss": 4.8788, + "step": 47498 + }, + { + "epoch": 0.2824900085640879, + "grad_norm": 1.4966613054275513, + "learning_rate": 4.078479065069042e-05, + "loss": 4.8586, + "step": 47499 + }, + { + "epoch": 0.2824959558473689, + "grad_norm": 1.5202277898788452, + "learning_rate": 4.078442842977567e-05, + "loss": 4.7544, + "step": 47500 + }, + { + "epoch": 0.28250190313064993, + "grad_norm": 1.9590730667114258, + "learning_rate": 4.0784066203350745e-05, + "loss": 4.3803, + "step": 47501 + }, + { + "epoch": 0.2825078504139309, + "grad_norm": 2.4173007011413574, + "learning_rate": 4.078370397141577e-05, + "loss": 3.2216, + "step": 47502 + }, + { + "epoch": 0.2825137976972119, + "grad_norm": 1.7320131063461304, + "learning_rate": 4.078334173397086e-05, + "loss": 4.3565, + "step": 47503 + }, + { + "epoch": 0.2825197449804929, + "grad_norm": 1.6844524145126343, + "learning_rate": 4.078297949101615e-05, + "loss": 4.8312, + "step": 47504 + }, + { + "epoch": 0.2825256922637739, + "grad_norm": 1.4573322534561157, + "learning_rate": 4.078261724255176e-05, + "loss": 4.1654, + "step": 47505 + }, + { + "epoch": 0.2825316395470549, + "grad_norm": 1.6994129419326782, + "learning_rate": 4.078225498857783e-05, + "loss": 4.3569, + "step": 47506 + }, + { + "epoch": 0.2825375868303359, + "grad_norm": 1.5810171365737915, + "learning_rate": 4.0781892729094466e-05, + "loss": 4.5662, + "step": 47507 + }, + { + "epoch": 0.2825435341136169, + "grad_norm": 1.941491961479187, + "learning_rate": 4.078153046410181e-05, + "loss": 4.4242, + "step": 47508 + }, + { + "epoch": 0.2825494813968979, + "grad_norm": 1.5889766216278076, + "learning_rate": 4.078116819359999e-05, + "loss": 4.1145, + "step": 47509 + }, + { + "epoch": 0.2825554286801789, + "grad_norm": 1.5780613422393799, + "learning_rate": 4.0780805917589126e-05, + "loss": 4.5202, + "step": 47510 + }, + { + "epoch": 0.28256137596345987, + "grad_norm": 1.6078077554702759, + "learning_rate": 4.078044363606935e-05, + "loss": 4.6244, + "step": 47511 + }, + { + "epoch": 0.2825673232467409, + "grad_norm": 1.563291311264038, + "learning_rate": 4.078008134904078e-05, + "loss": 4.5844, + "step": 47512 + }, + { + "epoch": 0.2825732705300219, + "grad_norm": 1.397734522819519, + "learning_rate": 4.077971905650354e-05, + "loss": 4.5263, + "step": 47513 + }, + { + "epoch": 0.28257921781330286, + "grad_norm": 1.779422402381897, + "learning_rate": 4.077935675845777e-05, + "loss": 4.4644, + "step": 47514 + }, + { + "epoch": 0.2825851650965839, + "grad_norm": 1.6790305376052856, + "learning_rate": 4.07789944549036e-05, + "loss": 4.5988, + "step": 47515 + }, + { + "epoch": 0.2825911123798649, + "grad_norm": 1.6938393115997314, + "learning_rate": 4.077863214584112e-05, + "loss": 5.1462, + "step": 47516 + }, + { + "epoch": 0.28259705966314586, + "grad_norm": 1.4700872898101807, + "learning_rate": 4.07782698312705e-05, + "loss": 5.0467, + "step": 47517 + }, + { + "epoch": 0.28260300694642687, + "grad_norm": 1.8080729246139526, + "learning_rate": 4.077790751119185e-05, + "loss": 4.6871, + "step": 47518 + }, + { + "epoch": 0.2826089542297079, + "grad_norm": 1.3119211196899414, + "learning_rate": 4.077754518560529e-05, + "loss": 4.7198, + "step": 47519 + }, + { + "epoch": 0.28261490151298885, + "grad_norm": 1.7313776016235352, + "learning_rate": 4.077718285451095e-05, + "loss": 3.7802, + "step": 47520 + }, + { + "epoch": 0.28262084879626986, + "grad_norm": 1.8400214910507202, + "learning_rate": 4.0776820517908965e-05, + "loss": 4.1891, + "step": 47521 + }, + { + "epoch": 0.2826267960795509, + "grad_norm": 1.5615036487579346, + "learning_rate": 4.0776458175799456e-05, + "loss": 4.2488, + "step": 47522 + }, + { + "epoch": 0.28263274336283184, + "grad_norm": 1.5766640901565552, + "learning_rate": 4.0776095828182545e-05, + "loss": 4.9276, + "step": 47523 + }, + { + "epoch": 0.28263869064611286, + "grad_norm": 1.7402112483978271, + "learning_rate": 4.077573347505836e-05, + "loss": 4.54, + "step": 47524 + }, + { + "epoch": 0.28264463792939387, + "grad_norm": 1.6231482028961182, + "learning_rate": 4.077537111642703e-05, + "loss": 4.7811, + "step": 47525 + }, + { + "epoch": 0.28265058521267483, + "grad_norm": 1.4401772022247314, + "learning_rate": 4.077500875228868e-05, + "loss": 4.6117, + "step": 47526 + }, + { + "epoch": 0.28265653249595585, + "grad_norm": 1.351122260093689, + "learning_rate": 4.077464638264345e-05, + "loss": 4.7538, + "step": 47527 + }, + { + "epoch": 0.28266247977923686, + "grad_norm": 1.684944987297058, + "learning_rate": 4.0774284007491434e-05, + "loss": 4.6022, + "step": 47528 + }, + { + "epoch": 0.2826684270625178, + "grad_norm": 1.541433334350586, + "learning_rate": 4.077392162683279e-05, + "loss": 4.5178, + "step": 47529 + }, + { + "epoch": 0.28267437434579884, + "grad_norm": 1.9925246238708496, + "learning_rate": 4.077355924066763e-05, + "loss": 4.5217, + "step": 47530 + }, + { + "epoch": 0.28268032162907986, + "grad_norm": 1.5293635129928589, + "learning_rate": 4.0773196848996085e-05, + "loss": 3.8363, + "step": 47531 + }, + { + "epoch": 0.2826862689123608, + "grad_norm": 1.511734962463379, + "learning_rate": 4.0772834451818274e-05, + "loss": 4.4768, + "step": 47532 + }, + { + "epoch": 0.28269221619564183, + "grad_norm": 1.4604301452636719, + "learning_rate": 4.0772472049134335e-05, + "loss": 4.6771, + "step": 47533 + }, + { + "epoch": 0.28269816347892285, + "grad_norm": 1.516412377357483, + "learning_rate": 4.077210964094439e-05, + "loss": 4.5076, + "step": 47534 + }, + { + "epoch": 0.2827041107622038, + "grad_norm": 1.7186903953552246, + "learning_rate": 4.077174722724856e-05, + "loss": 4.6354, + "step": 47535 + }, + { + "epoch": 0.2827100580454848, + "grad_norm": 1.4672865867614746, + "learning_rate": 4.0771384808046986e-05, + "loss": 4.4575, + "step": 47536 + }, + { + "epoch": 0.28271600532876584, + "grad_norm": 1.458319067955017, + "learning_rate": 4.0771022383339776e-05, + "loss": 5.0613, + "step": 47537 + }, + { + "epoch": 0.2827219526120468, + "grad_norm": 1.9925435781478882, + "learning_rate": 4.077065995312706e-05, + "loss": 3.9081, + "step": 47538 + }, + { + "epoch": 0.2827278998953278, + "grad_norm": 1.8763530254364014, + "learning_rate": 4.077029751740898e-05, + "loss": 4.4241, + "step": 47539 + }, + { + "epoch": 0.28273384717860883, + "grad_norm": 1.6116665601730347, + "learning_rate": 4.076993507618565e-05, + "loss": 4.5437, + "step": 47540 + }, + { + "epoch": 0.2827397944618898, + "grad_norm": 1.6972209215164185, + "learning_rate": 4.0769572629457193e-05, + "loss": 4.4403, + "step": 47541 + }, + { + "epoch": 0.2827457417451708, + "grad_norm": 1.596827507019043, + "learning_rate": 4.076921017722375e-05, + "loss": 4.1887, + "step": 47542 + }, + { + "epoch": 0.2827516890284518, + "grad_norm": 1.5342129468917847, + "learning_rate": 4.076884771948543e-05, + "loss": 4.1559, + "step": 47543 + }, + { + "epoch": 0.2827576363117328, + "grad_norm": 1.5532281398773193, + "learning_rate": 4.0768485256242374e-05, + "loss": 4.4592, + "step": 47544 + }, + { + "epoch": 0.2827635835950138, + "grad_norm": 1.5173457860946655, + "learning_rate": 4.0768122787494704e-05, + "loss": 4.5421, + "step": 47545 + }, + { + "epoch": 0.2827695308782948, + "grad_norm": 1.4438527822494507, + "learning_rate": 4.076776031324254e-05, + "loss": 4.5493, + "step": 47546 + }, + { + "epoch": 0.2827754781615758, + "grad_norm": 1.4849708080291748, + "learning_rate": 4.076739783348601e-05, + "loss": 4.7125, + "step": 47547 + }, + { + "epoch": 0.2827814254448568, + "grad_norm": 1.2702475786209106, + "learning_rate": 4.0767035348225253e-05, + "loss": 5.0378, + "step": 47548 + }, + { + "epoch": 0.2827873727281378, + "grad_norm": 1.6593289375305176, + "learning_rate": 4.076667285746038e-05, + "loss": 4.5389, + "step": 47549 + }, + { + "epoch": 0.28279332001141877, + "grad_norm": 1.5687834024429321, + "learning_rate": 4.076631036119153e-05, + "loss": 4.5654, + "step": 47550 + }, + { + "epoch": 0.2827992672946998, + "grad_norm": 1.5691031217575073, + "learning_rate": 4.0765947859418816e-05, + "loss": 4.3328, + "step": 47551 + }, + { + "epoch": 0.2828052145779808, + "grad_norm": 1.6024510860443115, + "learning_rate": 4.076558535214238e-05, + "loss": 4.498, + "step": 47552 + }, + { + "epoch": 0.28281116186126176, + "grad_norm": 1.5641758441925049, + "learning_rate": 4.076522283936234e-05, + "loss": 4.4844, + "step": 47553 + }, + { + "epoch": 0.2828171091445428, + "grad_norm": 1.459583044052124, + "learning_rate": 4.076486032107883e-05, + "loss": 4.5672, + "step": 47554 + }, + { + "epoch": 0.2828230564278238, + "grad_norm": 1.6589759588241577, + "learning_rate": 4.076449779729196e-05, + "loss": 3.9182, + "step": 47555 + }, + { + "epoch": 0.28282900371110475, + "grad_norm": 1.382211446762085, + "learning_rate": 4.076413526800187e-05, + "loss": 4.5111, + "step": 47556 + }, + { + "epoch": 0.28283495099438577, + "grad_norm": 1.533525824546814, + "learning_rate": 4.076377273320868e-05, + "loss": 4.4515, + "step": 47557 + }, + { + "epoch": 0.2828408982776668, + "grad_norm": 1.5081053972244263, + "learning_rate": 4.0763410192912524e-05, + "loss": 4.4007, + "step": 47558 + }, + { + "epoch": 0.28284684556094775, + "grad_norm": 1.4869818687438965, + "learning_rate": 4.076304764711352e-05, + "loss": 4.372, + "step": 47559 + }, + { + "epoch": 0.28285279284422876, + "grad_norm": 1.520935297012329, + "learning_rate": 4.07626850958118e-05, + "loss": 4.447, + "step": 47560 + }, + { + "epoch": 0.2828587401275098, + "grad_norm": 1.7411426305770874, + "learning_rate": 4.076232253900749e-05, + "loss": 4.9008, + "step": 47561 + }, + { + "epoch": 0.28286468741079074, + "grad_norm": 1.3661739826202393, + "learning_rate": 4.0761959976700717e-05, + "loss": 5.0064, + "step": 47562 + }, + { + "epoch": 0.28287063469407175, + "grad_norm": 1.5216995477676392, + "learning_rate": 4.0761597408891614e-05, + "loss": 4.8245, + "step": 47563 + }, + { + "epoch": 0.28287658197735277, + "grad_norm": 1.1977767944335938, + "learning_rate": 4.0761234835580294e-05, + "loss": 4.9813, + "step": 47564 + }, + { + "epoch": 0.28288252926063373, + "grad_norm": 1.3022505044937134, + "learning_rate": 4.0760872256766886e-05, + "loss": 4.9795, + "step": 47565 + }, + { + "epoch": 0.28288847654391475, + "grad_norm": 1.5789545774459839, + "learning_rate": 4.0760509672451524e-05, + "loss": 4.9232, + "step": 47566 + }, + { + "epoch": 0.28289442382719576, + "grad_norm": 1.5313389301300049, + "learning_rate": 4.076014708263433e-05, + "loss": 4.5541, + "step": 47567 + }, + { + "epoch": 0.2829003711104767, + "grad_norm": 1.749443769454956, + "learning_rate": 4.0759784487315434e-05, + "loss": 5.6534, + "step": 47568 + }, + { + "epoch": 0.28290631839375774, + "grad_norm": 1.6116141080856323, + "learning_rate": 4.075942188649496e-05, + "loss": 5.5988, + "step": 47569 + }, + { + "epoch": 0.28291226567703875, + "grad_norm": 1.5812526941299438, + "learning_rate": 4.075905928017303e-05, + "loss": 5.6303, + "step": 47570 + }, + { + "epoch": 0.2829182129603197, + "grad_norm": 1.5668729543685913, + "learning_rate": 4.075869666834977e-05, + "loss": 5.5709, + "step": 47571 + }, + { + "epoch": 0.28292416024360073, + "grad_norm": 1.4153090715408325, + "learning_rate": 4.0758334051025324e-05, + "loss": 5.4066, + "step": 47572 + }, + { + "epoch": 0.28293010752688175, + "grad_norm": 1.4115471839904785, + "learning_rate": 4.0757971428199806e-05, + "loss": 5.4125, + "step": 47573 + }, + { + "epoch": 0.2829360548101627, + "grad_norm": 1.5949673652648926, + "learning_rate": 4.075760879987334e-05, + "loss": 5.6268, + "step": 47574 + }, + { + "epoch": 0.2829420020934437, + "grad_norm": 1.3836122751235962, + "learning_rate": 4.0757246166046046e-05, + "loss": 5.1772, + "step": 47575 + }, + { + "epoch": 0.28294794937672474, + "grad_norm": 1.484968662261963, + "learning_rate": 4.075688352671807e-05, + "loss": 5.4419, + "step": 47576 + }, + { + "epoch": 0.2829538966600057, + "grad_norm": 1.4025777578353882, + "learning_rate": 4.075652088188953e-05, + "loss": 5.6643, + "step": 47577 + }, + { + "epoch": 0.2829598439432867, + "grad_norm": 1.674778938293457, + "learning_rate": 4.0756158231560545e-05, + "loss": 4.31, + "step": 47578 + }, + { + "epoch": 0.28296579122656773, + "grad_norm": 1.8875329494476318, + "learning_rate": 4.075579557573126e-05, + "loss": 4.5234, + "step": 47579 + }, + { + "epoch": 0.2829717385098487, + "grad_norm": 1.6571874618530273, + "learning_rate": 4.0755432914401774e-05, + "loss": 4.7993, + "step": 47580 + }, + { + "epoch": 0.2829776857931297, + "grad_norm": 1.9549039602279663, + "learning_rate": 4.0755070247572243e-05, + "loss": 4.2738, + "step": 47581 + }, + { + "epoch": 0.2829836330764107, + "grad_norm": 1.3335866928100586, + "learning_rate": 4.0754707575242765e-05, + "loss": 4.8386, + "step": 47582 + }, + { + "epoch": 0.2829895803596917, + "grad_norm": 1.3733880519866943, + "learning_rate": 4.075434489741349e-05, + "loss": 5.1271, + "step": 47583 + }, + { + "epoch": 0.2829955276429727, + "grad_norm": 1.9260413646697998, + "learning_rate": 4.075398221408454e-05, + "loss": 3.7919, + "step": 47584 + }, + { + "epoch": 0.28300147492625366, + "grad_norm": 1.8522114753723145, + "learning_rate": 4.0753619525256035e-05, + "loss": 4.2435, + "step": 47585 + }, + { + "epoch": 0.2830074222095347, + "grad_norm": 1.5332872867584229, + "learning_rate": 4.07532568309281e-05, + "loss": 4.7014, + "step": 47586 + }, + { + "epoch": 0.2830133694928157, + "grad_norm": 1.7250827550888062, + "learning_rate": 4.0752894131100864e-05, + "loss": 4.8241, + "step": 47587 + }, + { + "epoch": 0.28301931677609665, + "grad_norm": 1.6424274444580078, + "learning_rate": 4.0752531425774465e-05, + "loss": 4.5835, + "step": 47588 + }, + { + "epoch": 0.28302526405937767, + "grad_norm": 1.7564715147018433, + "learning_rate": 4.075216871494901e-05, + "loss": 4.5787, + "step": 47589 + }, + { + "epoch": 0.2830312113426587, + "grad_norm": 2.3155617713928223, + "learning_rate": 4.0751805998624644e-05, + "loss": 2.3686, + "step": 47590 + }, + { + "epoch": 0.28303715862593964, + "grad_norm": 3.2022042274475098, + "learning_rate": 4.0751443276801485e-05, + "loss": 1.6029, + "step": 47591 + }, + { + "epoch": 0.28304310590922066, + "grad_norm": 3.4521944522857666, + "learning_rate": 4.075108054947965e-05, + "loss": 1.9689, + "step": 47592 + }, + { + "epoch": 0.2830490531925017, + "grad_norm": 3.4900097846984863, + "learning_rate": 4.0750717816659283e-05, + "loss": 1.9822, + "step": 47593 + }, + { + "epoch": 0.28305500047578264, + "grad_norm": 2.6005125045776367, + "learning_rate": 4.07503550783405e-05, + "loss": 1.3933, + "step": 47594 + }, + { + "epoch": 0.28306094775906365, + "grad_norm": 2.5820467472076416, + "learning_rate": 4.074999233452344e-05, + "loss": 1.1087, + "step": 47595 + }, + { + "epoch": 0.28306689504234467, + "grad_norm": 3.333937644958496, + "learning_rate": 4.0749629585208205e-05, + "loss": 2.2522, + "step": 47596 + }, + { + "epoch": 0.28307284232562563, + "grad_norm": 2.8339085578918457, + "learning_rate": 4.074926683039494e-05, + "loss": 1.4308, + "step": 47597 + }, + { + "epoch": 0.28307878960890664, + "grad_norm": 2.696697950363159, + "learning_rate": 4.074890407008378e-05, + "loss": 0.989, + "step": 47598 + }, + { + "epoch": 0.28308473689218766, + "grad_norm": 2.860116958618164, + "learning_rate": 4.074854130427483e-05, + "loss": 1.6987, + "step": 47599 + }, + { + "epoch": 0.2830906841754686, + "grad_norm": 3.1414473056793213, + "learning_rate": 4.074817853296823e-05, + "loss": 1.8325, + "step": 47600 + }, + { + "epoch": 0.28309663145874964, + "grad_norm": 1.7270479202270508, + "learning_rate": 4.074781575616411e-05, + "loss": 4.1933, + "step": 47601 + }, + { + "epoch": 0.28310257874203065, + "grad_norm": 1.6333116292953491, + "learning_rate": 4.074745297386259e-05, + "loss": 3.7757, + "step": 47602 + }, + { + "epoch": 0.2831085260253116, + "grad_norm": 2.9959874153137207, + "learning_rate": 4.0747090186063785e-05, + "loss": 2.7687, + "step": 47603 + }, + { + "epoch": 0.28311447330859263, + "grad_norm": 2.2062761783599854, + "learning_rate": 4.0746727392767844e-05, + "loss": 4.7306, + "step": 47604 + }, + { + "epoch": 0.28312042059187364, + "grad_norm": 3.351557493209839, + "learning_rate": 4.074636459397487e-05, + "loss": 1.5198, + "step": 47605 + }, + { + "epoch": 0.2831263678751546, + "grad_norm": 3.3059229850769043, + "learning_rate": 4.074600178968502e-05, + "loss": 2.6848, + "step": 47606 + }, + { + "epoch": 0.2831323151584356, + "grad_norm": 2.117633581161499, + "learning_rate": 4.0745638979898395e-05, + "loss": 4.2156, + "step": 47607 + }, + { + "epoch": 0.28313826244171664, + "grad_norm": 2.0582082271575928, + "learning_rate": 4.074527616461513e-05, + "loss": 4.2386, + "step": 47608 + }, + { + "epoch": 0.2831442097249976, + "grad_norm": 1.9606236219406128, + "learning_rate": 4.074491334383535e-05, + "loss": 4.1313, + "step": 47609 + }, + { + "epoch": 0.2831501570082786, + "grad_norm": 1.87506902217865, + "learning_rate": 4.074455051755919e-05, + "loss": 4.4117, + "step": 47610 + }, + { + "epoch": 0.28315610429155963, + "grad_norm": 1.6650567054748535, + "learning_rate": 4.074418768578676e-05, + "loss": 4.0955, + "step": 47611 + }, + { + "epoch": 0.2831620515748406, + "grad_norm": 1.6206637620925903, + "learning_rate": 4.07438248485182e-05, + "loss": 4.0705, + "step": 47612 + }, + { + "epoch": 0.2831679988581216, + "grad_norm": 1.682294487953186, + "learning_rate": 4.074346200575364e-05, + "loss": 4.0041, + "step": 47613 + }, + { + "epoch": 0.2831739461414026, + "grad_norm": 1.6040107011795044, + "learning_rate": 4.07430991574932e-05, + "loss": 4.1274, + "step": 47614 + }, + { + "epoch": 0.2831798934246836, + "grad_norm": 1.7143365144729614, + "learning_rate": 4.0742736303737e-05, + "loss": 4.0202, + "step": 47615 + }, + { + "epoch": 0.2831858407079646, + "grad_norm": 1.765280842781067, + "learning_rate": 4.074237344448518e-05, + "loss": 4.6767, + "step": 47616 + }, + { + "epoch": 0.2831917879912456, + "grad_norm": 1.863101840019226, + "learning_rate": 4.0742010579737855e-05, + "loss": 3.9348, + "step": 47617 + }, + { + "epoch": 0.2831977352745266, + "grad_norm": 1.7288613319396973, + "learning_rate": 4.074164770949516e-05, + "loss": 4.0502, + "step": 47618 + }, + { + "epoch": 0.2832036825578076, + "grad_norm": 1.6956647634506226, + "learning_rate": 4.074128483375721e-05, + "loss": 3.8932, + "step": 47619 + }, + { + "epoch": 0.2832096298410886, + "grad_norm": 1.7074220180511475, + "learning_rate": 4.0740921952524144e-05, + "loss": 4.0583, + "step": 47620 + }, + { + "epoch": 0.28321557712436957, + "grad_norm": 1.7099874019622803, + "learning_rate": 4.074055906579609e-05, + "loss": 3.9901, + "step": 47621 + }, + { + "epoch": 0.2832215244076506, + "grad_norm": 1.720198392868042, + "learning_rate": 4.074019617357317e-05, + "loss": 3.971, + "step": 47622 + }, + { + "epoch": 0.2832274716909316, + "grad_norm": 1.7428104877471924, + "learning_rate": 4.0739833275855506e-05, + "loss": 3.9824, + "step": 47623 + }, + { + "epoch": 0.28323341897421256, + "grad_norm": 1.7107813358306885, + "learning_rate": 4.073947037264323e-05, + "loss": 4.3659, + "step": 47624 + }, + { + "epoch": 0.2832393662574936, + "grad_norm": 1.9266271591186523, + "learning_rate": 4.073910746393646e-05, + "loss": 3.8775, + "step": 47625 + }, + { + "epoch": 0.2832453135407746, + "grad_norm": 1.725279450416565, + "learning_rate": 4.0738744549735344e-05, + "loss": 3.8846, + "step": 47626 + }, + { + "epoch": 0.28325126082405555, + "grad_norm": 1.661364197731018, + "learning_rate": 4.073838163003998e-05, + "loss": 3.8912, + "step": 47627 + }, + { + "epoch": 0.28325720810733657, + "grad_norm": 1.6418037414550781, + "learning_rate": 4.073801870485052e-05, + "loss": 3.8795, + "step": 47628 + }, + { + "epoch": 0.2832631553906176, + "grad_norm": 1.5681953430175781, + "learning_rate": 4.0737655774167085e-05, + "loss": 4.1013, + "step": 47629 + }, + { + "epoch": 0.28326910267389854, + "grad_norm": 1.6244785785675049, + "learning_rate": 4.0737292837989785e-05, + "loss": 4.4053, + "step": 47630 + }, + { + "epoch": 0.28327504995717956, + "grad_norm": 1.575813889503479, + "learning_rate": 4.0736929896318764e-05, + "loss": 4.735, + "step": 47631 + }, + { + "epoch": 0.2832809972404606, + "grad_norm": 1.5751910209655762, + "learning_rate": 4.0736566949154146e-05, + "loss": 4.8217, + "step": 47632 + }, + { + "epoch": 0.28328694452374154, + "grad_norm": 1.4995057582855225, + "learning_rate": 4.073620399649605e-05, + "loss": 4.524, + "step": 47633 + }, + { + "epoch": 0.28329289180702255, + "grad_norm": 1.6824071407318115, + "learning_rate": 4.0735841038344615e-05, + "loss": 4.4479, + "step": 47634 + }, + { + "epoch": 0.28329883909030357, + "grad_norm": 2.2006938457489014, + "learning_rate": 4.0735478074699954e-05, + "loss": 4.2696, + "step": 47635 + }, + { + "epoch": 0.2833047863735845, + "grad_norm": 1.554419755935669, + "learning_rate": 4.0735115105562196e-05, + "loss": 4.7794, + "step": 47636 + }, + { + "epoch": 0.28331073365686554, + "grad_norm": 1.5823392868041992, + "learning_rate": 4.0734752130931484e-05, + "loss": 4.4125, + "step": 47637 + }, + { + "epoch": 0.28331668094014656, + "grad_norm": 1.6865885257720947, + "learning_rate": 4.0734389150807925e-05, + "loss": 4.5085, + "step": 47638 + }, + { + "epoch": 0.2833226282234275, + "grad_norm": 1.3601572513580322, + "learning_rate": 4.073402616519165e-05, + "loss": 4.438, + "step": 47639 + }, + { + "epoch": 0.28332857550670854, + "grad_norm": 1.5978810787200928, + "learning_rate": 4.073366317408279e-05, + "loss": 4.5359, + "step": 47640 + }, + { + "epoch": 0.28333452278998955, + "grad_norm": 1.495835781097412, + "learning_rate": 4.0733300177481475e-05, + "loss": 4.5817, + "step": 47641 + }, + { + "epoch": 0.2833404700732705, + "grad_norm": 1.5751873254776, + "learning_rate": 4.0732937175387833e-05, + "loss": 4.6257, + "step": 47642 + }, + { + "epoch": 0.2833464173565515, + "grad_norm": 1.5040920972824097, + "learning_rate": 4.0732574167801976e-05, + "loss": 4.5215, + "step": 47643 + }, + { + "epoch": 0.28335236463983254, + "grad_norm": 1.5814672708511353, + "learning_rate": 4.0732211154724045e-05, + "loss": 4.5179, + "step": 47644 + }, + { + "epoch": 0.2833583119231135, + "grad_norm": 1.5212597846984863, + "learning_rate": 4.0731848136154156e-05, + "loss": 4.6437, + "step": 47645 + }, + { + "epoch": 0.2833642592063945, + "grad_norm": 1.5211423635482788, + "learning_rate": 4.0731485112092444e-05, + "loss": 4.5366, + "step": 47646 + }, + { + "epoch": 0.28337020648967554, + "grad_norm": 1.4408525228500366, + "learning_rate": 4.073112208253903e-05, + "loss": 4.8558, + "step": 47647 + }, + { + "epoch": 0.2833761537729565, + "grad_norm": 1.6730358600616455, + "learning_rate": 4.0730759047494046e-05, + "loss": 4.7246, + "step": 47648 + }, + { + "epoch": 0.2833821010562375, + "grad_norm": 1.39554762840271, + "learning_rate": 4.0730396006957625e-05, + "loss": 4.7103, + "step": 47649 + }, + { + "epoch": 0.28338804833951853, + "grad_norm": 1.5862125158309937, + "learning_rate": 4.0730032960929875e-05, + "loss": 4.6925, + "step": 47650 + }, + { + "epoch": 0.2833939956227995, + "grad_norm": 1.5995444059371948, + "learning_rate": 4.072966990941093e-05, + "loss": 4.3167, + "step": 47651 + }, + { + "epoch": 0.2833999429060805, + "grad_norm": 1.6375890970230103, + "learning_rate": 4.0729306852400933e-05, + "loss": 4.417, + "step": 47652 + }, + { + "epoch": 0.2834058901893615, + "grad_norm": 2.0993168354034424, + "learning_rate": 4.0728943789899985e-05, + "loss": 3.9468, + "step": 47653 + }, + { + "epoch": 0.2834118374726425, + "grad_norm": 1.6754807233810425, + "learning_rate": 4.072858072190824e-05, + "loss": 4.1566, + "step": 47654 + }, + { + "epoch": 0.2834177847559235, + "grad_norm": 1.5585135221481323, + "learning_rate": 4.072821764842579e-05, + "loss": 4.64, + "step": 47655 + }, + { + "epoch": 0.2834237320392045, + "grad_norm": 1.541640043258667, + "learning_rate": 4.07278545694528e-05, + "loss": 4.6505, + "step": 47656 + }, + { + "epoch": 0.2834296793224855, + "grad_norm": 1.5904594659805298, + "learning_rate": 4.072749148498937e-05, + "loss": 4.5444, + "step": 47657 + }, + { + "epoch": 0.2834356266057665, + "grad_norm": 1.6853746175765991, + "learning_rate": 4.0727128395035636e-05, + "loss": 4.0782, + "step": 47658 + }, + { + "epoch": 0.2834415738890475, + "grad_norm": 1.8139864206314087, + "learning_rate": 4.072676529959173e-05, + "loss": 4.1341, + "step": 47659 + }, + { + "epoch": 0.28344752117232846, + "grad_norm": 1.8905798196792603, + "learning_rate": 4.072640219865776e-05, + "loss": 3.2879, + "step": 47660 + }, + { + "epoch": 0.2834534684556095, + "grad_norm": 1.2291264533996582, + "learning_rate": 4.072603909223387e-05, + "loss": 5.2278, + "step": 47661 + }, + { + "epoch": 0.2834594157388905, + "grad_norm": 2.1141836643218994, + "learning_rate": 4.072567598032019e-05, + "loss": 4.2874, + "step": 47662 + }, + { + "epoch": 0.28346536302217146, + "grad_norm": 1.565789818763733, + "learning_rate": 4.072531286291683e-05, + "loss": 4.2573, + "step": 47663 + }, + { + "epoch": 0.2834713103054525, + "grad_norm": 1.8833589553833008, + "learning_rate": 4.0724949740023935e-05, + "loss": 4.2572, + "step": 47664 + }, + { + "epoch": 0.2834772575887335, + "grad_norm": 1.5505675077438354, + "learning_rate": 4.072458661164161e-05, + "loss": 4.2901, + "step": 47665 + }, + { + "epoch": 0.28348320487201445, + "grad_norm": 1.6480659246444702, + "learning_rate": 4.072422347776999e-05, + "loss": 4.0694, + "step": 47666 + }, + { + "epoch": 0.28348915215529547, + "grad_norm": 1.6688870191574097, + "learning_rate": 4.072386033840923e-05, + "loss": 4.4579, + "step": 47667 + }, + { + "epoch": 0.2834950994385765, + "grad_norm": 1.5122462511062622, + "learning_rate": 4.072349719355942e-05, + "loss": 4.2914, + "step": 47668 + }, + { + "epoch": 0.28350104672185744, + "grad_norm": 1.4710053205490112, + "learning_rate": 4.072313404322069e-05, + "loss": 4.173, + "step": 47669 + }, + { + "epoch": 0.28350699400513846, + "grad_norm": 1.8565179109573364, + "learning_rate": 4.0722770887393193e-05, + "loss": 4.7109, + "step": 47670 + }, + { + "epoch": 0.2835129412884195, + "grad_norm": 1.5477312803268433, + "learning_rate": 4.0722407726077036e-05, + "loss": 4.2921, + "step": 47671 + }, + { + "epoch": 0.28351888857170043, + "grad_norm": 1.6451352834701538, + "learning_rate": 4.0722044559272336e-05, + "loss": 4.2688, + "step": 47672 + }, + { + "epoch": 0.28352483585498145, + "grad_norm": 1.5840067863464355, + "learning_rate": 4.072168138697925e-05, + "loss": 4.1154, + "step": 47673 + }, + { + "epoch": 0.28353078313826247, + "grad_norm": 1.6842617988586426, + "learning_rate": 4.0721318209197875e-05, + "loss": 4.2, + "step": 47674 + }, + { + "epoch": 0.2835367304215434, + "grad_norm": 1.4466288089752197, + "learning_rate": 4.072095502592835e-05, + "loss": 4.1515, + "step": 47675 + }, + { + "epoch": 0.28354267770482444, + "grad_norm": 1.628372311592102, + "learning_rate": 4.072059183717081e-05, + "loss": 4.7291, + "step": 47676 + }, + { + "epoch": 0.28354862498810546, + "grad_norm": 2.2922451496124268, + "learning_rate": 4.072022864292537e-05, + "loss": 2.9791, + "step": 47677 + }, + { + "epoch": 0.2835545722713864, + "grad_norm": 1.7802824974060059, + "learning_rate": 4.071986544319216e-05, + "loss": 4.4513, + "step": 47678 + }, + { + "epoch": 0.28356051955466743, + "grad_norm": 2.442058801651001, + "learning_rate": 4.0719502237971305e-05, + "loss": 2.94, + "step": 47679 + }, + { + "epoch": 0.28356646683794845, + "grad_norm": 2.4481863975524902, + "learning_rate": 4.071913902726294e-05, + "loss": 2.8986, + "step": 47680 + }, + { + "epoch": 0.2835724141212294, + "grad_norm": 1.8088563680648804, + "learning_rate": 4.0718775811067175e-05, + "loss": 4.3359, + "step": 47681 + }, + { + "epoch": 0.2835783614045104, + "grad_norm": 1.6016877889633179, + "learning_rate": 4.0718412589384165e-05, + "loss": 4.5372, + "step": 47682 + }, + { + "epoch": 0.28358430868779144, + "grad_norm": 2.2700507640838623, + "learning_rate": 4.071804936221401e-05, + "loss": 3.697, + "step": 47683 + }, + { + "epoch": 0.2835902559710724, + "grad_norm": 1.7683926820755005, + "learning_rate": 4.071768612955684e-05, + "loss": 4.8469, + "step": 47684 + }, + { + "epoch": 0.2835962032543534, + "grad_norm": 1.7496652603149414, + "learning_rate": 4.07173228914128e-05, + "loss": 4.5643, + "step": 47685 + }, + { + "epoch": 0.28360215053763443, + "grad_norm": 2.290438175201416, + "learning_rate": 4.0716959647782e-05, + "loss": 3.1363, + "step": 47686 + }, + { + "epoch": 0.2836080978209154, + "grad_norm": 2.3675036430358887, + "learning_rate": 4.071659639866457e-05, + "loss": 2.8695, + "step": 47687 + }, + { + "epoch": 0.2836140451041964, + "grad_norm": 1.6935306787490845, + "learning_rate": 4.071623314406064e-05, + "loss": 4.0032, + "step": 47688 + }, + { + "epoch": 0.2836199923874774, + "grad_norm": 2.7160236835479736, + "learning_rate": 4.0715869883970336e-05, + "loss": 3.7181, + "step": 47689 + }, + { + "epoch": 0.2836259396707584, + "grad_norm": 2.3751614093780518, + "learning_rate": 4.071550661839378e-05, + "loss": 3.045, + "step": 47690 + }, + { + "epoch": 0.2836318869540394, + "grad_norm": 2.4649250507354736, + "learning_rate": 4.071514334733111e-05, + "loss": 3.7531, + "step": 47691 + }, + { + "epoch": 0.2836378342373204, + "grad_norm": 2.454923152923584, + "learning_rate": 4.071478007078244e-05, + "loss": 3.7167, + "step": 47692 + }, + { + "epoch": 0.2836437815206014, + "grad_norm": 2.3581769466400146, + "learning_rate": 4.071441678874791e-05, + "loss": 3.819, + "step": 47693 + }, + { + "epoch": 0.2836497288038824, + "grad_norm": 2.027658700942993, + "learning_rate": 4.0714053501227635e-05, + "loss": 3.9252, + "step": 47694 + }, + { + "epoch": 0.2836556760871634, + "grad_norm": 1.6303343772888184, + "learning_rate": 4.071369020822174e-05, + "loss": 4.1353, + "step": 47695 + }, + { + "epoch": 0.28366162337044437, + "grad_norm": 1.8927668333053589, + "learning_rate": 4.0713326909730366e-05, + "loss": 4.053, + "step": 47696 + }, + { + "epoch": 0.2836675706537254, + "grad_norm": 1.9401227235794067, + "learning_rate": 4.071296360575363e-05, + "loss": 3.7191, + "step": 47697 + }, + { + "epoch": 0.2836735179370064, + "grad_norm": 1.7327760457992554, + "learning_rate": 4.071260029629166e-05, + "loss": 4.4256, + "step": 47698 + }, + { + "epoch": 0.28367946522028736, + "grad_norm": 2.2773115634918213, + "learning_rate": 4.071223698134459e-05, + "loss": 3.5489, + "step": 47699 + }, + { + "epoch": 0.2836854125035684, + "grad_norm": 1.6289392709732056, + "learning_rate": 4.071187366091253e-05, + "loss": 4.741, + "step": 47700 + }, + { + "epoch": 0.28369135978684934, + "grad_norm": 1.5933581590652466, + "learning_rate": 4.071151033499562e-05, + "loss": 5.6719, + "step": 47701 + }, + { + "epoch": 0.28369730707013036, + "grad_norm": 1.717821717262268, + "learning_rate": 4.071114700359399e-05, + "loss": 5.5212, + "step": 47702 + }, + { + "epoch": 0.28370325435341137, + "grad_norm": 1.3271818161010742, + "learning_rate": 4.0710783666707754e-05, + "loss": 4.7569, + "step": 47703 + }, + { + "epoch": 0.28370920163669233, + "grad_norm": 1.6393016576766968, + "learning_rate": 4.071042032433705e-05, + "loss": 4.1058, + "step": 47704 + }, + { + "epoch": 0.28371514891997335, + "grad_norm": 1.734215497970581, + "learning_rate": 4.0710056976482006e-05, + "loss": 4.2522, + "step": 47705 + }, + { + "epoch": 0.28372109620325436, + "grad_norm": 1.7432475090026855, + "learning_rate": 4.0709693623142734e-05, + "loss": 4.6243, + "step": 47706 + }, + { + "epoch": 0.2837270434865353, + "grad_norm": 1.4218366146087646, + "learning_rate": 4.0709330264319364e-05, + "loss": 5.0773, + "step": 47707 + }, + { + "epoch": 0.28373299076981634, + "grad_norm": 1.6565680503845215, + "learning_rate": 4.0708966900012046e-05, + "loss": 5.0399, + "step": 47708 + }, + { + "epoch": 0.28373893805309736, + "grad_norm": 1.6313856840133667, + "learning_rate": 4.070860353022088e-05, + "loss": 4.9445, + "step": 47709 + }, + { + "epoch": 0.2837448853363783, + "grad_norm": 1.3531447649002075, + "learning_rate": 4.0708240154945996e-05, + "loss": 5.5468, + "step": 47710 + }, + { + "epoch": 0.28375083261965933, + "grad_norm": 1.5564112663269043, + "learning_rate": 4.070787677418754e-05, + "loss": 4.8673, + "step": 47711 + }, + { + "epoch": 0.28375677990294035, + "grad_norm": 2.3593673706054688, + "learning_rate": 4.070751338794562e-05, + "loss": 3.5867, + "step": 47712 + }, + { + "epoch": 0.2837627271862213, + "grad_norm": 2.1041595935821533, + "learning_rate": 4.0707149996220375e-05, + "loss": 3.9424, + "step": 47713 + }, + { + "epoch": 0.2837686744695023, + "grad_norm": 1.5985749959945679, + "learning_rate": 4.0706786599011925e-05, + "loss": 4.708, + "step": 47714 + }, + { + "epoch": 0.28377462175278334, + "grad_norm": 1.4830067157745361, + "learning_rate": 4.07064231963204e-05, + "loss": 4.8374, + "step": 47715 + }, + { + "epoch": 0.2837805690360643, + "grad_norm": 1.324734091758728, + "learning_rate": 4.070605978814592e-05, + "loss": 5.111, + "step": 47716 + }, + { + "epoch": 0.2837865163193453, + "grad_norm": 1.339335560798645, + "learning_rate": 4.0705696374488616e-05, + "loss": 4.9819, + "step": 47717 + }, + { + "epoch": 0.28379246360262633, + "grad_norm": 1.3069384098052979, + "learning_rate": 4.070533295534862e-05, + "loss": 4.7119, + "step": 47718 + }, + { + "epoch": 0.2837984108859073, + "grad_norm": 1.6695239543914795, + "learning_rate": 4.070496953072604e-05, + "loss": 4.6378, + "step": 47719 + }, + { + "epoch": 0.2838043581691883, + "grad_norm": 1.5211060047149658, + "learning_rate": 4.070460610062104e-05, + "loss": 4.7007, + "step": 47720 + }, + { + "epoch": 0.2838103054524693, + "grad_norm": 1.4976946115493774, + "learning_rate": 4.070424266503371e-05, + "loss": 4.7546, + "step": 47721 + }, + { + "epoch": 0.2838162527357503, + "grad_norm": 1.3621399402618408, + "learning_rate": 4.0703879223964196e-05, + "loss": 4.6569, + "step": 47722 + }, + { + "epoch": 0.2838222000190313, + "grad_norm": 1.3793399333953857, + "learning_rate": 4.070351577741262e-05, + "loss": 4.5892, + "step": 47723 + }, + { + "epoch": 0.2838281473023123, + "grad_norm": 1.540433645248413, + "learning_rate": 4.070315232537911e-05, + "loss": 4.4467, + "step": 47724 + }, + { + "epoch": 0.2838340945855933, + "grad_norm": 1.454559326171875, + "learning_rate": 4.0702788867863784e-05, + "loss": 4.7076, + "step": 47725 + }, + { + "epoch": 0.2838400418688743, + "grad_norm": 1.3936578035354614, + "learning_rate": 4.070242540486679e-05, + "loss": 4.9547, + "step": 47726 + }, + { + "epoch": 0.2838459891521553, + "grad_norm": 1.4968972206115723, + "learning_rate": 4.070206193638823e-05, + "loss": 4.9155, + "step": 47727 + }, + { + "epoch": 0.28385193643543627, + "grad_norm": 1.5126311779022217, + "learning_rate": 4.0701698462428244e-05, + "loss": 4.8763, + "step": 47728 + }, + { + "epoch": 0.2838578837187173, + "grad_norm": 1.4562867879867554, + "learning_rate": 4.070133498298696e-05, + "loss": 4.7659, + "step": 47729 + }, + { + "epoch": 0.2838638310019983, + "grad_norm": 1.6195062398910522, + "learning_rate": 4.07009714980645e-05, + "loss": 4.9171, + "step": 47730 + }, + { + "epoch": 0.28386977828527926, + "grad_norm": 1.3965200185775757, + "learning_rate": 4.0700608007661e-05, + "loss": 5.0215, + "step": 47731 + }, + { + "epoch": 0.2838757255685603, + "grad_norm": 1.456604242324829, + "learning_rate": 4.0700244511776576e-05, + "loss": 4.8248, + "step": 47732 + }, + { + "epoch": 0.2838816728518413, + "grad_norm": 1.5574363470077515, + "learning_rate": 4.069988101041136e-05, + "loss": 5.0421, + "step": 47733 + }, + { + "epoch": 0.28388762013512225, + "grad_norm": 1.2828304767608643, + "learning_rate": 4.069951750356546e-05, + "loss": 4.8042, + "step": 47734 + }, + { + "epoch": 0.28389356741840327, + "grad_norm": 1.6649327278137207, + "learning_rate": 4.069915399123905e-05, + "loss": 4.5294, + "step": 47735 + }, + { + "epoch": 0.2838995147016843, + "grad_norm": 1.2040799856185913, + "learning_rate": 4.069879047343222e-05, + "loss": 4.9991, + "step": 47736 + }, + { + "epoch": 0.28390546198496525, + "grad_norm": 1.5222867727279663, + "learning_rate": 4.069842695014509e-05, + "loss": 4.8208, + "step": 47737 + }, + { + "epoch": 0.28391140926824626, + "grad_norm": 1.337493658065796, + "learning_rate": 4.0698063421377805e-05, + "loss": 4.6697, + "step": 47738 + }, + { + "epoch": 0.2839173565515273, + "grad_norm": 1.6246858835220337, + "learning_rate": 4.069769988713049e-05, + "loss": 4.8023, + "step": 47739 + }, + { + "epoch": 0.28392330383480824, + "grad_norm": 1.9442254304885864, + "learning_rate": 4.069733634740328e-05, + "loss": 4.2833, + "step": 47740 + }, + { + "epoch": 0.28392925111808925, + "grad_norm": 1.501524567604065, + "learning_rate": 4.069697280219628e-05, + "loss": 4.8037, + "step": 47741 + }, + { + "epoch": 0.28393519840137027, + "grad_norm": 1.580176591873169, + "learning_rate": 4.069660925150963e-05, + "loss": 4.553, + "step": 47742 + }, + { + "epoch": 0.28394114568465123, + "grad_norm": 1.5192621946334839, + "learning_rate": 4.069624569534346e-05, + "loss": 5.2848, + "step": 47743 + }, + { + "epoch": 0.28394709296793225, + "grad_norm": 1.3308916091918945, + "learning_rate": 4.069588213369789e-05, + "loss": 4.9515, + "step": 47744 + }, + { + "epoch": 0.28395304025121326, + "grad_norm": 1.321602702140808, + "learning_rate": 4.0695518566573053e-05, + "loss": 4.7197, + "step": 47745 + }, + { + "epoch": 0.2839589875344942, + "grad_norm": 1.6509920358657837, + "learning_rate": 4.069515499396908e-05, + "loss": 4.8179, + "step": 47746 + }, + { + "epoch": 0.28396493481777524, + "grad_norm": 1.512269139289856, + "learning_rate": 4.069479141588608e-05, + "loss": 4.4579, + "step": 47747 + }, + { + "epoch": 0.28397088210105625, + "grad_norm": 2.9139606952667236, + "learning_rate": 4.069442783232419e-05, + "loss": 3.9839, + "step": 47748 + }, + { + "epoch": 0.2839768293843372, + "grad_norm": 2.4643383026123047, + "learning_rate": 4.0694064243283545e-05, + "loss": 4.0551, + "step": 47749 + }, + { + "epoch": 0.28398277666761823, + "grad_norm": 2.9284543991088867, + "learning_rate": 4.069370064876425e-05, + "loss": 4.0493, + "step": 47750 + }, + { + "epoch": 0.28398872395089925, + "grad_norm": 2.3073859214782715, + "learning_rate": 4.0693337048766465e-05, + "loss": 3.6191, + "step": 47751 + }, + { + "epoch": 0.2839946712341802, + "grad_norm": 1.597361445426941, + "learning_rate": 4.0692973443290286e-05, + "loss": 5.1025, + "step": 47752 + }, + { + "epoch": 0.2840006185174612, + "grad_norm": 1.4314357042312622, + "learning_rate": 4.069260983233586e-05, + "loss": 4.7922, + "step": 47753 + }, + { + "epoch": 0.28400656580074224, + "grad_norm": 2.168073892593384, + "learning_rate": 4.069224621590329e-05, + "loss": 3.8178, + "step": 47754 + }, + { + "epoch": 0.2840125130840232, + "grad_norm": 1.9179620742797852, + "learning_rate": 4.069188259399274e-05, + "loss": 3.9452, + "step": 47755 + }, + { + "epoch": 0.2840184603673042, + "grad_norm": 2.0645053386688232, + "learning_rate": 4.06915189666043e-05, + "loss": 3.722, + "step": 47756 + }, + { + "epoch": 0.28402440765058523, + "grad_norm": 1.9108998775482178, + "learning_rate": 4.069115533373812e-05, + "loss": 3.8794, + "step": 47757 + }, + { + "epoch": 0.2840303549338662, + "grad_norm": 2.1524856090545654, + "learning_rate": 4.069079169539431e-05, + "loss": 3.7347, + "step": 47758 + }, + { + "epoch": 0.2840363022171472, + "grad_norm": 1.5570611953735352, + "learning_rate": 4.0690428051573027e-05, + "loss": 4.3631, + "step": 47759 + }, + { + "epoch": 0.2840422495004282, + "grad_norm": 2.0423877239227295, + "learning_rate": 4.0690064402274365e-05, + "loss": 4.4021, + "step": 47760 + }, + { + "epoch": 0.2840481967837092, + "grad_norm": 1.509718418121338, + "learning_rate": 4.068970074749846e-05, + "loss": 4.7889, + "step": 47761 + }, + { + "epoch": 0.2840541440669902, + "grad_norm": 1.376019835472107, + "learning_rate": 4.068933708724545e-05, + "loss": 4.9041, + "step": 47762 + }, + { + "epoch": 0.2840600913502712, + "grad_norm": 1.5778697729110718, + "learning_rate": 4.068897342151545e-05, + "loss": 4.5067, + "step": 47763 + }, + { + "epoch": 0.2840660386335522, + "grad_norm": 1.5903059244155884, + "learning_rate": 4.0688609750308595e-05, + "loss": 4.5299, + "step": 47764 + }, + { + "epoch": 0.2840719859168332, + "grad_norm": 1.7636394500732422, + "learning_rate": 4.0688246073625005e-05, + "loss": 5.1074, + "step": 47765 + }, + { + "epoch": 0.2840779332001142, + "grad_norm": 1.601303219795227, + "learning_rate": 4.068788239146481e-05, + "loss": 4.8931, + "step": 47766 + }, + { + "epoch": 0.28408388048339517, + "grad_norm": 1.5800220966339111, + "learning_rate": 4.0687518703828145e-05, + "loss": 4.6363, + "step": 47767 + }, + { + "epoch": 0.2840898277666762, + "grad_norm": 2.176316738128662, + "learning_rate": 4.068715501071512e-05, + "loss": 4.3429, + "step": 47768 + }, + { + "epoch": 0.2840957750499572, + "grad_norm": 1.4932777881622314, + "learning_rate": 4.0686791312125875e-05, + "loss": 4.8659, + "step": 47769 + }, + { + "epoch": 0.28410172233323816, + "grad_norm": 1.6535249948501587, + "learning_rate": 4.068642760806053e-05, + "loss": 4.6821, + "step": 47770 + }, + { + "epoch": 0.2841076696165192, + "grad_norm": 1.562558889389038, + "learning_rate": 4.0686063898519224e-05, + "loss": 4.7737, + "step": 47771 + }, + { + "epoch": 0.2841136168998002, + "grad_norm": 1.5538697242736816, + "learning_rate": 4.068570018350206e-05, + "loss": 4.8306, + "step": 47772 + }, + { + "epoch": 0.28411956418308115, + "grad_norm": 1.5597721338272095, + "learning_rate": 4.0685336463009195e-05, + "loss": 4.584, + "step": 47773 + }, + { + "epoch": 0.28412551146636217, + "grad_norm": 1.4541244506835938, + "learning_rate": 4.0684972737040736e-05, + "loss": 4.9284, + "step": 47774 + }, + { + "epoch": 0.2841314587496432, + "grad_norm": 1.4141789674758911, + "learning_rate": 4.0684609005596805e-05, + "loss": 4.8368, + "step": 47775 + }, + { + "epoch": 0.28413740603292414, + "grad_norm": 1.2516100406646729, + "learning_rate": 4.068424526867756e-05, + "loss": 4.7654, + "step": 47776 + }, + { + "epoch": 0.28414335331620516, + "grad_norm": 1.542492389678955, + "learning_rate": 4.068388152628309e-05, + "loss": 4.9659, + "step": 47777 + }, + { + "epoch": 0.2841493005994862, + "grad_norm": 1.9241633415222168, + "learning_rate": 4.068351777841355e-05, + "loss": 4.6107, + "step": 47778 + }, + { + "epoch": 0.28415524788276714, + "grad_norm": 2.3423962593078613, + "learning_rate": 4.068315402506905e-05, + "loss": 3.9319, + "step": 47779 + }, + { + "epoch": 0.28416119516604815, + "grad_norm": 2.54024076461792, + "learning_rate": 4.068279026624972e-05, + "loss": 3.7219, + "step": 47780 + }, + { + "epoch": 0.28416714244932917, + "grad_norm": 1.6376540660858154, + "learning_rate": 4.0682426501955695e-05, + "loss": 4.0611, + "step": 47781 + }, + { + "epoch": 0.28417308973261013, + "grad_norm": 1.3839681148529053, + "learning_rate": 4.0682062732187095e-05, + "loss": 4.8582, + "step": 47782 + }, + { + "epoch": 0.28417903701589114, + "grad_norm": 1.714393973350525, + "learning_rate": 4.0681698956944046e-05, + "loss": 4.2527, + "step": 47783 + }, + { + "epoch": 0.28418498429917216, + "grad_norm": 1.7640591859817505, + "learning_rate": 4.0681335176226684e-05, + "loss": 4.1465, + "step": 47784 + }, + { + "epoch": 0.2841909315824531, + "grad_norm": 1.7956268787384033, + "learning_rate": 4.0680971390035124e-05, + "loss": 4.3452, + "step": 47785 + }, + { + "epoch": 0.28419687886573414, + "grad_norm": 1.6609461307525635, + "learning_rate": 4.06806075983695e-05, + "loss": 4.2274, + "step": 47786 + }, + { + "epoch": 0.28420282614901515, + "grad_norm": 1.6110033988952637, + "learning_rate": 4.068024380122995e-05, + "loss": 4.1249, + "step": 47787 + }, + { + "epoch": 0.2842087734322961, + "grad_norm": 1.4956796169281006, + "learning_rate": 4.0679879998616573e-05, + "loss": 4.2869, + "step": 47788 + }, + { + "epoch": 0.28421472071557713, + "grad_norm": 1.7638230323791504, + "learning_rate": 4.0679516190529524e-05, + "loss": 4.3113, + "step": 47789 + }, + { + "epoch": 0.28422066799885815, + "grad_norm": 1.6232117414474487, + "learning_rate": 4.067915237696891e-05, + "loss": 4.2567, + "step": 47790 + }, + { + "epoch": 0.2842266152821391, + "grad_norm": 1.5724678039550781, + "learning_rate": 4.067878855793486e-05, + "loss": 4.0472, + "step": 47791 + }, + { + "epoch": 0.2842325625654201, + "grad_norm": 1.5254757404327393, + "learning_rate": 4.067842473342752e-05, + "loss": 4.1481, + "step": 47792 + }, + { + "epoch": 0.28423850984870114, + "grad_norm": 1.5748664140701294, + "learning_rate": 4.0678060903447e-05, + "loss": 4.3797, + "step": 47793 + }, + { + "epoch": 0.2842444571319821, + "grad_norm": 1.6615221500396729, + "learning_rate": 4.067769706799342e-05, + "loss": 4.7261, + "step": 47794 + }, + { + "epoch": 0.2842504044152631, + "grad_norm": 1.6097018718719482, + "learning_rate": 4.0677333227066936e-05, + "loss": 4.9113, + "step": 47795 + }, + { + "epoch": 0.28425635169854413, + "grad_norm": 1.6961654424667358, + "learning_rate": 4.067696938066764e-05, + "loss": 4.9502, + "step": 47796 + }, + { + "epoch": 0.2842622989818251, + "grad_norm": 1.5783705711364746, + "learning_rate": 4.0676605528795686e-05, + "loss": 5.0007, + "step": 47797 + }, + { + "epoch": 0.2842682462651061, + "grad_norm": 1.5393177270889282, + "learning_rate": 4.067624167145119e-05, + "loss": 4.5016, + "step": 47798 + }, + { + "epoch": 0.2842741935483871, + "grad_norm": 1.5636115074157715, + "learning_rate": 4.067587780863428e-05, + "loss": 4.3845, + "step": 47799 + }, + { + "epoch": 0.2842801408316681, + "grad_norm": 1.5470283031463623, + "learning_rate": 4.067551394034508e-05, + "loss": 4.6945, + "step": 47800 + }, + { + "epoch": 0.2842860881149491, + "grad_norm": 1.6324440240859985, + "learning_rate": 4.0675150066583725e-05, + "loss": 4.9575, + "step": 47801 + }, + { + "epoch": 0.2842920353982301, + "grad_norm": 1.7794536352157593, + "learning_rate": 4.067478618735033e-05, + "loss": 3.9251, + "step": 47802 + }, + { + "epoch": 0.2842979826815111, + "grad_norm": 1.5535427331924438, + "learning_rate": 4.067442230264503e-05, + "loss": 4.8217, + "step": 47803 + }, + { + "epoch": 0.2843039299647921, + "grad_norm": 1.7037266492843628, + "learning_rate": 4.067405841246796e-05, + "loss": 4.968, + "step": 47804 + }, + { + "epoch": 0.2843098772480731, + "grad_norm": 1.7339664697647095, + "learning_rate": 4.0673694516819226e-05, + "loss": 3.9234, + "step": 47805 + }, + { + "epoch": 0.28431582453135407, + "grad_norm": 1.5008125305175781, + "learning_rate": 4.067333061569898e-05, + "loss": 4.5055, + "step": 47806 + }, + { + "epoch": 0.2843217718146351, + "grad_norm": 1.5004571676254272, + "learning_rate": 4.067296670910733e-05, + "loss": 4.6595, + "step": 47807 + }, + { + "epoch": 0.2843277190979161, + "grad_norm": 1.3791351318359375, + "learning_rate": 4.067260279704441e-05, + "loss": 4.703, + "step": 47808 + }, + { + "epoch": 0.28433366638119706, + "grad_norm": 1.563214898109436, + "learning_rate": 4.0672238879510336e-05, + "loss": 4.5135, + "step": 47809 + }, + { + "epoch": 0.2843396136644781, + "grad_norm": 1.3903695344924927, + "learning_rate": 4.067187495650526e-05, + "loss": 4.7862, + "step": 47810 + }, + { + "epoch": 0.2843455609477591, + "grad_norm": 1.6354960203170776, + "learning_rate": 4.0671511028029284e-05, + "loss": 4.4731, + "step": 47811 + }, + { + "epoch": 0.28435150823104005, + "grad_norm": 2.8895151615142822, + "learning_rate": 4.067114709408255e-05, + "loss": 3.7465, + "step": 47812 + }, + { + "epoch": 0.28435745551432107, + "grad_norm": 2.6401724815368652, + "learning_rate": 4.0670783154665184e-05, + "loss": 3.9687, + "step": 47813 + }, + { + "epoch": 0.2843634027976021, + "grad_norm": 1.9777662754058838, + "learning_rate": 4.067041920977731e-05, + "loss": 4.2391, + "step": 47814 + }, + { + "epoch": 0.28436935008088304, + "grad_norm": 1.452361822128296, + "learning_rate": 4.067005525941905e-05, + "loss": 4.7414, + "step": 47815 + }, + { + "epoch": 0.28437529736416406, + "grad_norm": 1.5919674634933472, + "learning_rate": 4.066969130359054e-05, + "loss": 4.3663, + "step": 47816 + }, + { + "epoch": 0.284381244647445, + "grad_norm": 2.2284796237945557, + "learning_rate": 4.06693273422919e-05, + "loss": 4.1889, + "step": 47817 + }, + { + "epoch": 0.28438719193072604, + "grad_norm": 2.599515676498413, + "learning_rate": 4.066896337552325e-05, + "loss": 3.8169, + "step": 47818 + }, + { + "epoch": 0.28439313921400705, + "grad_norm": 2.318906307220459, + "learning_rate": 4.066859940328474e-05, + "loss": 3.6905, + "step": 47819 + }, + { + "epoch": 0.284399086497288, + "grad_norm": 2.5084614753723145, + "learning_rate": 4.066823542557648e-05, + "loss": 3.2617, + "step": 47820 + }, + { + "epoch": 0.284405033780569, + "grad_norm": 2.553455114364624, + "learning_rate": 4.0667871442398605e-05, + "loss": 3.1931, + "step": 47821 + }, + { + "epoch": 0.28441098106385004, + "grad_norm": 2.5433766841888428, + "learning_rate": 4.066750745375123e-05, + "loss": 3.6601, + "step": 47822 + }, + { + "epoch": 0.284416928347131, + "grad_norm": 1.82417893409729, + "learning_rate": 4.0667143459634494e-05, + "loss": 4.4546, + "step": 47823 + }, + { + "epoch": 0.284422875630412, + "grad_norm": 1.942894458770752, + "learning_rate": 4.066677946004852e-05, + "loss": 4.7863, + "step": 47824 + }, + { + "epoch": 0.28442882291369304, + "grad_norm": 1.5466086864471436, + "learning_rate": 4.066641545499343e-05, + "loss": 4.3201, + "step": 47825 + }, + { + "epoch": 0.284434770196974, + "grad_norm": 2.0261459350585938, + "learning_rate": 4.066605144446936e-05, + "loss": 4.2785, + "step": 47826 + }, + { + "epoch": 0.284440717480255, + "grad_norm": 2.3932323455810547, + "learning_rate": 4.066568742847644e-05, + "loss": 3.3707, + "step": 47827 + }, + { + "epoch": 0.28444666476353603, + "grad_norm": 1.860241413116455, + "learning_rate": 4.0665323407014776e-05, + "loss": 4.3839, + "step": 47828 + }, + { + "epoch": 0.284452612046817, + "grad_norm": 2.728963851928711, + "learning_rate": 4.066495938008452e-05, + "loss": 3.1472, + "step": 47829 + }, + { + "epoch": 0.284458559330098, + "grad_norm": 2.712418556213379, + "learning_rate": 4.066459534768579e-05, + "loss": 3.6036, + "step": 47830 + }, + { + "epoch": 0.284464506613379, + "grad_norm": 2.845499277114868, + "learning_rate": 4.066423130981871e-05, + "loss": 3.4678, + "step": 47831 + }, + { + "epoch": 0.28447045389666, + "grad_norm": 2.8758037090301514, + "learning_rate": 4.06638672664834e-05, + "loss": 3.6253, + "step": 47832 + }, + { + "epoch": 0.284476401179941, + "grad_norm": 2.805828809738159, + "learning_rate": 4.066350321768e-05, + "loss": 4.0137, + "step": 47833 + }, + { + "epoch": 0.284482348463222, + "grad_norm": 2.5738556385040283, + "learning_rate": 4.0663139163408634e-05, + "loss": 3.6793, + "step": 47834 + }, + { + "epoch": 0.284488295746503, + "grad_norm": 2.4384377002716064, + "learning_rate": 4.066277510366944e-05, + "loss": 3.6679, + "step": 47835 + }, + { + "epoch": 0.284494243029784, + "grad_norm": 2.5400302410125732, + "learning_rate": 4.066241103846252e-05, + "loss": 3.7509, + "step": 47836 + }, + { + "epoch": 0.284500190313065, + "grad_norm": 2.578747034072876, + "learning_rate": 4.0662046967788015e-05, + "loss": 3.2906, + "step": 47837 + }, + { + "epoch": 0.28450613759634596, + "grad_norm": 2.4971630573272705, + "learning_rate": 4.066168289164605e-05, + "loss": 3.3972, + "step": 47838 + }, + { + "epoch": 0.284512084879627, + "grad_norm": 2.4323620796203613, + "learning_rate": 4.0661318810036756e-05, + "loss": 3.5827, + "step": 47839 + }, + { + "epoch": 0.284518032162908, + "grad_norm": 2.0018622875213623, + "learning_rate": 4.066095472296026e-05, + "loss": 4.0336, + "step": 47840 + }, + { + "epoch": 0.28452397944618896, + "grad_norm": 2.44938325881958, + "learning_rate": 4.0660590630416686e-05, + "loss": 3.3362, + "step": 47841 + }, + { + "epoch": 0.28452992672947, + "grad_norm": 2.2430531978607178, + "learning_rate": 4.066022653240616e-05, + "loss": 3.7765, + "step": 47842 + }, + { + "epoch": 0.284535874012751, + "grad_norm": 2.4904708862304688, + "learning_rate": 4.065986242892881e-05, + "loss": 3.5391, + "step": 47843 + }, + { + "epoch": 0.28454182129603195, + "grad_norm": 2.762044906616211, + "learning_rate": 4.0659498319984755e-05, + "loss": 3.6201, + "step": 47844 + }, + { + "epoch": 0.28454776857931297, + "grad_norm": 3.150043249130249, + "learning_rate": 4.065913420557414e-05, + "loss": 3.4401, + "step": 47845 + }, + { + "epoch": 0.284553715862594, + "grad_norm": 3.1169824600219727, + "learning_rate": 4.0658770085697084e-05, + "loss": 3.4982, + "step": 47846 + }, + { + "epoch": 0.28455966314587494, + "grad_norm": 2.760521411895752, + "learning_rate": 4.0658405960353715e-05, + "loss": 3.3408, + "step": 47847 + }, + { + "epoch": 0.28456561042915596, + "grad_norm": 2.6601219177246094, + "learning_rate": 4.065804182954416e-05, + "loss": 3.3421, + "step": 47848 + }, + { + "epoch": 0.284571557712437, + "grad_norm": 2.7372469902038574, + "learning_rate": 4.0657677693268535e-05, + "loss": 3.3785, + "step": 47849 + }, + { + "epoch": 0.28457750499571793, + "grad_norm": 2.6798553466796875, + "learning_rate": 4.065731355152698e-05, + "loss": 3.2766, + "step": 47850 + }, + { + "epoch": 0.28458345227899895, + "grad_norm": 1.9006694555282593, + "learning_rate": 4.0656949404319625e-05, + "loss": 4.1795, + "step": 47851 + }, + { + "epoch": 0.28458939956227997, + "grad_norm": 2.804875135421753, + "learning_rate": 4.065658525164658e-05, + "loss": 3.2024, + "step": 47852 + }, + { + "epoch": 0.2845953468455609, + "grad_norm": 3.0003669261932373, + "learning_rate": 4.065622109350798e-05, + "loss": 3.5492, + "step": 47853 + }, + { + "epoch": 0.28460129412884194, + "grad_norm": 2.802676200866699, + "learning_rate": 4.065585692990397e-05, + "loss": 3.4612, + "step": 47854 + }, + { + "epoch": 0.28460724141212296, + "grad_norm": 2.9565560817718506, + "learning_rate": 4.0655492760834655e-05, + "loss": 3.1435, + "step": 47855 + }, + { + "epoch": 0.2846131886954039, + "grad_norm": 2.8489482402801514, + "learning_rate": 4.065512858630017e-05, + "loss": 3.3042, + "step": 47856 + }, + { + "epoch": 0.28461913597868493, + "grad_norm": 2.413942575454712, + "learning_rate": 4.065476440630064e-05, + "loss": 3.598, + "step": 47857 + }, + { + "epoch": 0.28462508326196595, + "grad_norm": 3.2348580360412598, + "learning_rate": 4.0654400220836195e-05, + "loss": 3.8131, + "step": 47858 + }, + { + "epoch": 0.2846310305452469, + "grad_norm": 3.180793523788452, + "learning_rate": 4.0654036029906964e-05, + "loss": 3.4206, + "step": 47859 + }, + { + "epoch": 0.2846369778285279, + "grad_norm": 2.8119266033172607, + "learning_rate": 4.065367183351306e-05, + "loss": 3.0501, + "step": 47860 + }, + { + "epoch": 0.28464292511180894, + "grad_norm": 2.813636302947998, + "learning_rate": 4.065330763165464e-05, + "loss": 3.0405, + "step": 47861 + }, + { + "epoch": 0.2846488723950899, + "grad_norm": 2.1245789527893066, + "learning_rate": 4.065294342433179e-05, + "loss": 3.7983, + "step": 47862 + }, + { + "epoch": 0.2846548196783709, + "grad_norm": 2.573056936264038, + "learning_rate": 4.065257921154467e-05, + "loss": 3.6523, + "step": 47863 + }, + { + "epoch": 0.28466076696165193, + "grad_norm": 1.7781659364700317, + "learning_rate": 4.0652214993293394e-05, + "loss": 4.2607, + "step": 47864 + }, + { + "epoch": 0.2846667142449329, + "grad_norm": 2.8380625247955322, + "learning_rate": 4.065185076957809e-05, + "loss": 3.019, + "step": 47865 + }, + { + "epoch": 0.2846726615282139, + "grad_norm": 3.496797561645508, + "learning_rate": 4.06514865403989e-05, + "loss": 3.2495, + "step": 47866 + }, + { + "epoch": 0.2846786088114949, + "grad_norm": 3.040003538131714, + "learning_rate": 4.065112230575592e-05, + "loss": 3.4819, + "step": 47867 + }, + { + "epoch": 0.2846845560947759, + "grad_norm": 2.8236396312713623, + "learning_rate": 4.06507580656493e-05, + "loss": 3.4662, + "step": 47868 + }, + { + "epoch": 0.2846905033780569, + "grad_norm": 2.1245195865631104, + "learning_rate": 4.065039382007917e-05, + "loss": 4.1458, + "step": 47869 + }, + { + "epoch": 0.2846964506613379, + "grad_norm": 2.3887927532196045, + "learning_rate": 4.0650029569045636e-05, + "loss": 3.6505, + "step": 47870 + }, + { + "epoch": 0.2847023979446189, + "grad_norm": 2.8746166229248047, + "learning_rate": 4.0649665312548844e-05, + "loss": 4.0939, + "step": 47871 + }, + { + "epoch": 0.2847083452278999, + "grad_norm": 2.351219892501831, + "learning_rate": 4.064930105058892e-05, + "loss": 3.7613, + "step": 47872 + }, + { + "epoch": 0.2847142925111809, + "grad_norm": 2.3109936714172363, + "learning_rate": 4.064893678316599e-05, + "loss": 3.9264, + "step": 47873 + }, + { + "epoch": 0.28472023979446187, + "grad_norm": 1.7708712816238403, + "learning_rate": 4.064857251028017e-05, + "loss": 4.1667, + "step": 47874 + }, + { + "epoch": 0.2847261870777429, + "grad_norm": 2.1446197032928467, + "learning_rate": 4.06482082319316e-05, + "loss": 4.1236, + "step": 47875 + }, + { + "epoch": 0.2847321343610239, + "grad_norm": 2.2072770595550537, + "learning_rate": 4.064784394812039e-05, + "loss": 3.8284, + "step": 47876 + }, + { + "epoch": 0.28473808164430486, + "grad_norm": 1.8538073301315308, + "learning_rate": 4.0647479658846685e-05, + "loss": 4.0035, + "step": 47877 + }, + { + "epoch": 0.2847440289275859, + "grad_norm": 1.7604824304580688, + "learning_rate": 4.064711536411062e-05, + "loss": 4.2313, + "step": 47878 + }, + { + "epoch": 0.2847499762108669, + "grad_norm": 1.887946367263794, + "learning_rate": 4.064675106391229e-05, + "loss": 4.7796, + "step": 47879 + }, + { + "epoch": 0.28475592349414786, + "grad_norm": 1.9790668487548828, + "learning_rate": 4.064638675825185e-05, + "loss": 4.4161, + "step": 47880 + }, + { + "epoch": 0.28476187077742887, + "grad_norm": 1.9319730997085571, + "learning_rate": 4.064602244712942e-05, + "loss": 3.9117, + "step": 47881 + }, + { + "epoch": 0.2847678180607099, + "grad_norm": 1.8790576457977295, + "learning_rate": 4.064565813054512e-05, + "loss": 4.0189, + "step": 47882 + }, + { + "epoch": 0.28477376534399085, + "grad_norm": 2.006030797958374, + "learning_rate": 4.0645293808499084e-05, + "loss": 3.9246, + "step": 47883 + }, + { + "epoch": 0.28477971262727186, + "grad_norm": 2.0662381649017334, + "learning_rate": 4.064492948099145e-05, + "loss": 3.494, + "step": 47884 + }, + { + "epoch": 0.2847856599105529, + "grad_norm": 1.8068300485610962, + "learning_rate": 4.064456514802231e-05, + "loss": 4.1159, + "step": 47885 + }, + { + "epoch": 0.28479160719383384, + "grad_norm": 1.5481529235839844, + "learning_rate": 4.064420080959183e-05, + "loss": 4.8494, + "step": 47886 + }, + { + "epoch": 0.28479755447711486, + "grad_norm": 1.4273723363876343, + "learning_rate": 4.0643836465700114e-05, + "loss": 4.635, + "step": 47887 + }, + { + "epoch": 0.28480350176039587, + "grad_norm": 1.5735753774642944, + "learning_rate": 4.0643472116347295e-05, + "loss": 4.5348, + "step": 47888 + }, + { + "epoch": 0.28480944904367683, + "grad_norm": 1.7091344594955444, + "learning_rate": 4.0643107761533506e-05, + "loss": 4.1814, + "step": 47889 + }, + { + "epoch": 0.28481539632695785, + "grad_norm": 1.8737664222717285, + "learning_rate": 4.064274340125887e-05, + "loss": 4.1463, + "step": 47890 + }, + { + "epoch": 0.28482134361023886, + "grad_norm": 1.5659412145614624, + "learning_rate": 4.0642379035523516e-05, + "loss": 4.4396, + "step": 47891 + }, + { + "epoch": 0.2848272908935198, + "grad_norm": 1.7222011089324951, + "learning_rate": 4.064201466432756e-05, + "loss": 4.0548, + "step": 47892 + }, + { + "epoch": 0.28483323817680084, + "grad_norm": 1.7593505382537842, + "learning_rate": 4.064165028767114e-05, + "loss": 3.9995, + "step": 47893 + }, + { + "epoch": 0.28483918546008186, + "grad_norm": 1.6783292293548584, + "learning_rate": 4.064128590555439e-05, + "loss": 3.841, + "step": 47894 + }, + { + "epoch": 0.2848451327433628, + "grad_norm": 2.069943904876709, + "learning_rate": 4.064092151797743e-05, + "loss": 4.5046, + "step": 47895 + }, + { + "epoch": 0.28485108002664383, + "grad_norm": 1.6485099792480469, + "learning_rate": 4.0640557124940376e-05, + "loss": 4.4672, + "step": 47896 + }, + { + "epoch": 0.28485702730992485, + "grad_norm": 1.6598519086837769, + "learning_rate": 4.064019272644336e-05, + "loss": 4.4204, + "step": 47897 + }, + { + "epoch": 0.2848629745932058, + "grad_norm": 1.6818671226501465, + "learning_rate": 4.063982832248653e-05, + "loss": 4.5855, + "step": 47898 + }, + { + "epoch": 0.2848689218764868, + "grad_norm": 1.6202890872955322, + "learning_rate": 4.063946391306999e-05, + "loss": 4.8584, + "step": 47899 + }, + { + "epoch": 0.28487486915976784, + "grad_norm": 1.7200571298599243, + "learning_rate": 4.063909949819388e-05, + "loss": 4.8423, + "step": 47900 + }, + { + "epoch": 0.2848808164430488, + "grad_norm": 1.609028697013855, + "learning_rate": 4.0638735077858317e-05, + "loss": 4.6045, + "step": 47901 + }, + { + "epoch": 0.2848867637263298, + "grad_norm": 1.5244625806808472, + "learning_rate": 4.063837065206343e-05, + "loss": 4.5557, + "step": 47902 + }, + { + "epoch": 0.28489271100961083, + "grad_norm": 1.6748064756393433, + "learning_rate": 4.0638006220809356e-05, + "loss": 4.4871, + "step": 47903 + }, + { + "epoch": 0.2848986582928918, + "grad_norm": 1.6379817724227905, + "learning_rate": 4.063764178409621e-05, + "loss": 4.2784, + "step": 47904 + }, + { + "epoch": 0.2849046055761728, + "grad_norm": 1.7156316041946411, + "learning_rate": 4.063727734192413e-05, + "loss": 4.3083, + "step": 47905 + }, + { + "epoch": 0.2849105528594538, + "grad_norm": 1.6092203855514526, + "learning_rate": 4.0636912894293235e-05, + "loss": 4.5451, + "step": 47906 + }, + { + "epoch": 0.2849165001427348, + "grad_norm": 1.367505431175232, + "learning_rate": 4.063654844120365e-05, + "loss": 4.4748, + "step": 47907 + }, + { + "epoch": 0.2849224474260158, + "grad_norm": 1.5396873950958252, + "learning_rate": 4.0636183982655515e-05, + "loss": 4.5599, + "step": 47908 + }, + { + "epoch": 0.2849283947092968, + "grad_norm": 1.5088887214660645, + "learning_rate": 4.0635819518648946e-05, + "loss": 5.5547, + "step": 47909 + }, + { + "epoch": 0.2849343419925778, + "grad_norm": 1.421736478805542, + "learning_rate": 4.063545504918408e-05, + "loss": 5.6032, + "step": 47910 + }, + { + "epoch": 0.2849402892758588, + "grad_norm": 1.5053447484970093, + "learning_rate": 4.0635090574261036e-05, + "loss": 5.3252, + "step": 47911 + }, + { + "epoch": 0.2849462365591398, + "grad_norm": 1.5550451278686523, + "learning_rate": 4.063472609387994e-05, + "loss": 4.1781, + "step": 47912 + }, + { + "epoch": 0.28495218384242077, + "grad_norm": 1.5177106857299805, + "learning_rate": 4.063436160804092e-05, + "loss": 4.2166, + "step": 47913 + }, + { + "epoch": 0.2849581311257018, + "grad_norm": 3.5727736949920654, + "learning_rate": 4.0633997116744114e-05, + "loss": 3.2907, + "step": 47914 + }, + { + "epoch": 0.2849640784089828, + "grad_norm": 3.923304796218872, + "learning_rate": 4.063363261998964e-05, + "loss": 3.2458, + "step": 47915 + }, + { + "epoch": 0.28497002569226376, + "grad_norm": 1.7898411750793457, + "learning_rate": 4.063326811777762e-05, + "loss": 4.4375, + "step": 47916 + }, + { + "epoch": 0.2849759729755448, + "grad_norm": 1.346549391746521, + "learning_rate": 4.063290361010819e-05, + "loss": 4.8674, + "step": 47917 + }, + { + "epoch": 0.2849819202588258, + "grad_norm": 1.3667829036712646, + "learning_rate": 4.063253909698148e-05, + "loss": 5.3754, + "step": 47918 + }, + { + "epoch": 0.28498786754210675, + "grad_norm": 1.5947827100753784, + "learning_rate": 4.06321745783976e-05, + "loss": 4.9504, + "step": 47919 + }, + { + "epoch": 0.28499381482538777, + "grad_norm": 1.4826979637145996, + "learning_rate": 4.06318100543567e-05, + "loss": 4.8787, + "step": 47920 + }, + { + "epoch": 0.2849997621086688, + "grad_norm": 3.0287652015686035, + "learning_rate": 4.06314455248589e-05, + "loss": 3.05, + "step": 47921 + }, + { + "epoch": 0.28500570939194975, + "grad_norm": 3.1306962966918945, + "learning_rate": 4.0631080989904315e-05, + "loss": 2.9548, + "step": 47922 + }, + { + "epoch": 0.28501165667523076, + "grad_norm": 3.226750135421753, + "learning_rate": 4.0630716449493084e-05, + "loss": 2.8881, + "step": 47923 + }, + { + "epoch": 0.2850176039585118, + "grad_norm": 3.4286649227142334, + "learning_rate": 4.063035190362533e-05, + "loss": 2.8924, + "step": 47924 + }, + { + "epoch": 0.28502355124179274, + "grad_norm": 2.284959316253662, + "learning_rate": 4.062998735230118e-05, + "loss": 3.8686, + "step": 47925 + }, + { + "epoch": 0.28502949852507375, + "grad_norm": 1.8779680728912354, + "learning_rate": 4.062962279552077e-05, + "loss": 4.0036, + "step": 47926 + }, + { + "epoch": 0.28503544580835477, + "grad_norm": 2.376516819000244, + "learning_rate": 4.0629258233284216e-05, + "loss": 3.0999, + "step": 47927 + }, + { + "epoch": 0.28504139309163573, + "grad_norm": 2.932457447052002, + "learning_rate": 4.062889366559165e-05, + "loss": 2.8739, + "step": 47928 + }, + { + "epoch": 0.28504734037491675, + "grad_norm": 2.8712406158447266, + "learning_rate": 4.06285290924432e-05, + "loss": 3.098, + "step": 47929 + }, + { + "epoch": 0.28505328765819776, + "grad_norm": 3.128164052963257, + "learning_rate": 4.0628164513838985e-05, + "loss": 2.911, + "step": 47930 + }, + { + "epoch": 0.2850592349414787, + "grad_norm": 3.038215398788452, + "learning_rate": 4.062779992977914e-05, + "loss": 2.6092, + "step": 47931 + }, + { + "epoch": 0.28506518222475974, + "grad_norm": 2.107654571533203, + "learning_rate": 4.06274353402638e-05, + "loss": 3.3944, + "step": 47932 + }, + { + "epoch": 0.2850711295080407, + "grad_norm": 2.062223196029663, + "learning_rate": 4.062707074529308e-05, + "loss": 3.6033, + "step": 47933 + }, + { + "epoch": 0.2850770767913217, + "grad_norm": 2.7655563354492188, + "learning_rate": 4.062670614486711e-05, + "loss": 2.794, + "step": 47934 + }, + { + "epoch": 0.28508302407460273, + "grad_norm": 3.086958646774292, + "learning_rate": 4.0626341538986024e-05, + "loss": 2.7822, + "step": 47935 + }, + { + "epoch": 0.2850889713578837, + "grad_norm": 2.7597715854644775, + "learning_rate": 4.062597692764993e-05, + "loss": 2.6865, + "step": 47936 + }, + { + "epoch": 0.2850949186411647, + "grad_norm": 2.5489370822906494, + "learning_rate": 4.062561231085898e-05, + "loss": 2.8314, + "step": 47937 + }, + { + "epoch": 0.2851008659244457, + "grad_norm": 2.4772121906280518, + "learning_rate": 4.062524768861329e-05, + "loss": 2.9056, + "step": 47938 + }, + { + "epoch": 0.2851068132077267, + "grad_norm": 1.6750593185424805, + "learning_rate": 4.062488306091299e-05, + "loss": 4.5013, + "step": 47939 + }, + { + "epoch": 0.2851127604910077, + "grad_norm": 1.6762397289276123, + "learning_rate": 4.06245184277582e-05, + "loss": 4.2632, + "step": 47940 + }, + { + "epoch": 0.2851187077742887, + "grad_norm": 2.5696864128112793, + "learning_rate": 4.062415378914905e-05, + "loss": 2.8112, + "step": 47941 + }, + { + "epoch": 0.2851246550575697, + "grad_norm": 2.8367114067077637, + "learning_rate": 4.0623789145085663e-05, + "loss": 2.5116, + "step": 47942 + }, + { + "epoch": 0.2851306023408507, + "grad_norm": 2.827397108078003, + "learning_rate": 4.062342449556819e-05, + "loss": 2.6491, + "step": 47943 + }, + { + "epoch": 0.2851365496241317, + "grad_norm": 2.7441346645355225, + "learning_rate": 4.0623059840596725e-05, + "loss": 2.8096, + "step": 47944 + }, + { + "epoch": 0.28514249690741267, + "grad_norm": 1.4317700862884521, + "learning_rate": 4.062269518017142e-05, + "loss": 4.8342, + "step": 47945 + }, + { + "epoch": 0.2851484441906937, + "grad_norm": 1.5356826782226562, + "learning_rate": 4.062233051429239e-05, + "loss": 4.9292, + "step": 47946 + }, + { + "epoch": 0.2851543914739747, + "grad_norm": 1.4988296031951904, + "learning_rate": 4.062196584295977e-05, + "loss": 4.7144, + "step": 47947 + }, + { + "epoch": 0.28516033875725566, + "grad_norm": 1.5250886678695679, + "learning_rate": 4.062160116617368e-05, + "loss": 4.7483, + "step": 47948 + }, + { + "epoch": 0.2851662860405367, + "grad_norm": 2.7808406352996826, + "learning_rate": 4.062123648393426e-05, + "loss": 3.2586, + "step": 47949 + }, + { + "epoch": 0.2851722333238177, + "grad_norm": 1.8720093965530396, + "learning_rate": 4.062087179624161e-05, + "loss": 4.5054, + "step": 47950 + }, + { + "epoch": 0.28517818060709865, + "grad_norm": 1.697564959526062, + "learning_rate": 4.0620507103095884e-05, + "loss": 4.4132, + "step": 47951 + }, + { + "epoch": 0.28518412789037967, + "grad_norm": 1.5659594535827637, + "learning_rate": 4.0620142404497205e-05, + "loss": 4.5348, + "step": 47952 + }, + { + "epoch": 0.2851900751736607, + "grad_norm": 1.6579846143722534, + "learning_rate": 4.061977770044569e-05, + "loss": 4.4657, + "step": 47953 + }, + { + "epoch": 0.28519602245694164, + "grad_norm": 2.317568302154541, + "learning_rate": 4.061941299094147e-05, + "loss": 3.2755, + "step": 47954 + }, + { + "epoch": 0.28520196974022266, + "grad_norm": 1.670597791671753, + "learning_rate": 4.0619048275984676e-05, + "loss": 4.0111, + "step": 47955 + }, + { + "epoch": 0.2852079170235037, + "grad_norm": 1.5097215175628662, + "learning_rate": 4.0618683555575434e-05, + "loss": 4.9022, + "step": 47956 + }, + { + "epoch": 0.28521386430678464, + "grad_norm": 1.3319398164749146, + "learning_rate": 4.0618318829713874e-05, + "loss": 4.54, + "step": 47957 + }, + { + "epoch": 0.28521981159006565, + "grad_norm": 1.5602614879608154, + "learning_rate": 4.061795409840011e-05, + "loss": 4.4272, + "step": 47958 + }, + { + "epoch": 0.28522575887334667, + "grad_norm": 1.5902042388916016, + "learning_rate": 4.061758936163429e-05, + "loss": 4.4012, + "step": 47959 + }, + { + "epoch": 0.28523170615662763, + "grad_norm": 1.6312373876571655, + "learning_rate": 4.0617224619416526e-05, + "loss": 4.3425, + "step": 47960 + }, + { + "epoch": 0.28523765343990864, + "grad_norm": 1.6176319122314453, + "learning_rate": 4.061685987174696e-05, + "loss": 4.4257, + "step": 47961 + }, + { + "epoch": 0.28524360072318966, + "grad_norm": 1.4623509645462036, + "learning_rate": 4.0616495118625697e-05, + "loss": 4.4274, + "step": 47962 + }, + { + "epoch": 0.2852495480064706, + "grad_norm": 1.5097548961639404, + "learning_rate": 4.0616130360052883e-05, + "loss": 4.7079, + "step": 47963 + }, + { + "epoch": 0.28525549528975164, + "grad_norm": 1.3193554878234863, + "learning_rate": 4.061576559602864e-05, + "loss": 5.0996, + "step": 47964 + }, + { + "epoch": 0.28526144257303265, + "grad_norm": 1.5294357538223267, + "learning_rate": 4.06154008265531e-05, + "loss": 5.3793, + "step": 47965 + }, + { + "epoch": 0.2852673898563136, + "grad_norm": 1.5566275119781494, + "learning_rate": 4.061503605162636e-05, + "loss": 4.6038, + "step": 47966 + }, + { + "epoch": 0.28527333713959463, + "grad_norm": 1.6348639726638794, + "learning_rate": 4.06146712712486e-05, + "loss": 4.1489, + "step": 47967 + }, + { + "epoch": 0.28527928442287565, + "grad_norm": 1.5167810916900635, + "learning_rate": 4.061430648541991e-05, + "loss": 4.1929, + "step": 47968 + }, + { + "epoch": 0.2852852317061566, + "grad_norm": 1.7577855587005615, + "learning_rate": 4.0613941694140424e-05, + "loss": 4.8337, + "step": 47969 + }, + { + "epoch": 0.2852911789894376, + "grad_norm": 1.7311127185821533, + "learning_rate": 4.0613576897410274e-05, + "loss": 5.0324, + "step": 47970 + }, + { + "epoch": 0.28529712627271864, + "grad_norm": 2.592153310775757, + "learning_rate": 4.061321209522959e-05, + "loss": 4.3922, + "step": 47971 + }, + { + "epoch": 0.2853030735559996, + "grad_norm": 1.713493824005127, + "learning_rate": 4.061284728759849e-05, + "loss": 5.1962, + "step": 47972 + }, + { + "epoch": 0.2853090208392806, + "grad_norm": 1.600043535232544, + "learning_rate": 4.061248247451711e-05, + "loss": 5.2047, + "step": 47973 + }, + { + "epoch": 0.28531496812256163, + "grad_norm": 1.3574808835983276, + "learning_rate": 4.061211765598557e-05, + "loss": 4.8821, + "step": 47974 + }, + { + "epoch": 0.2853209154058426, + "grad_norm": 1.8519340753555298, + "learning_rate": 4.0611752832004e-05, + "loss": 4.504, + "step": 47975 + }, + { + "epoch": 0.2853268626891236, + "grad_norm": 1.8311933279037476, + "learning_rate": 4.061138800257254e-05, + "loss": 4.6821, + "step": 47976 + }, + { + "epoch": 0.2853328099724046, + "grad_norm": 1.665398120880127, + "learning_rate": 4.0611023167691284e-05, + "loss": 4.8242, + "step": 47977 + }, + { + "epoch": 0.2853387572556856, + "grad_norm": 1.2976412773132324, + "learning_rate": 4.06106583273604e-05, + "loss": 4.8496, + "step": 47978 + }, + { + "epoch": 0.2853447045389666, + "grad_norm": 1.715679407119751, + "learning_rate": 4.0610293481579986e-05, + "loss": 4.6694, + "step": 47979 + }, + { + "epoch": 0.2853506518222476, + "grad_norm": 1.4746240377426147, + "learning_rate": 4.0609928630350185e-05, + "loss": 4.7096, + "step": 47980 + }, + { + "epoch": 0.2853565991055286, + "grad_norm": 1.424889087677002, + "learning_rate": 4.0609563773671116e-05, + "loss": 5.0211, + "step": 47981 + }, + { + "epoch": 0.2853625463888096, + "grad_norm": 1.4811475276947021, + "learning_rate": 4.0609198911542914e-05, + "loss": 5.0479, + "step": 47982 + }, + { + "epoch": 0.2853684936720906, + "grad_norm": 1.668017864227295, + "learning_rate": 4.06088340439657e-05, + "loss": 4.6856, + "step": 47983 + }, + { + "epoch": 0.28537444095537157, + "grad_norm": 1.5800610780715942, + "learning_rate": 4.06084691709396e-05, + "loss": 4.6397, + "step": 47984 + }, + { + "epoch": 0.2853803882386526, + "grad_norm": 1.6076956987380981, + "learning_rate": 4.060810429246475e-05, + "loss": 4.4816, + "step": 47985 + }, + { + "epoch": 0.2853863355219336, + "grad_norm": 1.5121824741363525, + "learning_rate": 4.060773940854127e-05, + "loss": 4.664, + "step": 47986 + }, + { + "epoch": 0.28539228280521456, + "grad_norm": 1.4830400943756104, + "learning_rate": 4.0607374519169284e-05, + "loss": 4.8568, + "step": 47987 + }, + { + "epoch": 0.2853982300884956, + "grad_norm": 1.6302392482757568, + "learning_rate": 4.060700962434894e-05, + "loss": 4.4322, + "step": 47988 + }, + { + "epoch": 0.2854041773717766, + "grad_norm": 1.349482536315918, + "learning_rate": 4.0606644724080334e-05, + "loss": 5.1751, + "step": 47989 + }, + { + "epoch": 0.28541012465505755, + "grad_norm": 1.3199208974838257, + "learning_rate": 4.060627981836361e-05, + "loss": 5.1269, + "step": 47990 + }, + { + "epoch": 0.28541607193833857, + "grad_norm": 1.3451273441314697, + "learning_rate": 4.0605914907198906e-05, + "loss": 4.9362, + "step": 47991 + }, + { + "epoch": 0.2854220192216196, + "grad_norm": 1.534620761871338, + "learning_rate": 4.060554999058633e-05, + "loss": 4.697, + "step": 47992 + }, + { + "epoch": 0.28542796650490054, + "grad_norm": 2.391713857650757, + "learning_rate": 4.060518506852602e-05, + "loss": 3.9244, + "step": 47993 + }, + { + "epoch": 0.28543391378818156, + "grad_norm": 2.2553510665893555, + "learning_rate": 4.060482014101811e-05, + "loss": 3.7798, + "step": 47994 + }, + { + "epoch": 0.2854398610714626, + "grad_norm": 1.7567185163497925, + "learning_rate": 4.0604455208062704e-05, + "loss": 4.0396, + "step": 47995 + }, + { + "epoch": 0.28544580835474354, + "grad_norm": 2.167217969894409, + "learning_rate": 4.0604090269659944e-05, + "loss": 3.7591, + "step": 47996 + }, + { + "epoch": 0.28545175563802455, + "grad_norm": 1.6790982484817505, + "learning_rate": 4.060372532580997e-05, + "loss": 3.9903, + "step": 47997 + }, + { + "epoch": 0.28545770292130557, + "grad_norm": 1.6385418176651, + "learning_rate": 4.060336037651289e-05, + "loss": 4.8625, + "step": 47998 + }, + { + "epoch": 0.2854636502045865, + "grad_norm": 1.7836644649505615, + "learning_rate": 4.060299542176883e-05, + "loss": 4.3626, + "step": 47999 + }, + { + "epoch": 0.28546959748786754, + "grad_norm": 1.4158227443695068, + "learning_rate": 4.060263046157794e-05, + "loss": 4.0617, + "step": 48000 + }, + { + "epoch": 0.28547554477114856, + "grad_norm": 2.192005157470703, + "learning_rate": 4.0602265495940325e-05, + "loss": 3.7925, + "step": 48001 + }, + { + "epoch": 0.2854814920544295, + "grad_norm": 2.2592740058898926, + "learning_rate": 4.060190052485612e-05, + "loss": 3.6589, + "step": 48002 + }, + { + "epoch": 0.28548743933771054, + "grad_norm": 2.316134214401245, + "learning_rate": 4.0601535548325454e-05, + "loss": 3.8911, + "step": 48003 + }, + { + "epoch": 0.28549338662099155, + "grad_norm": 1.9475353956222534, + "learning_rate": 4.060117056634846e-05, + "loss": 4.2199, + "step": 48004 + }, + { + "epoch": 0.2854993339042725, + "grad_norm": 1.5084713697433472, + "learning_rate": 4.060080557892525e-05, + "loss": 4.8602, + "step": 48005 + }, + { + "epoch": 0.28550528118755353, + "grad_norm": 1.748268485069275, + "learning_rate": 4.060044058605596e-05, + "loss": 4.5902, + "step": 48006 + }, + { + "epoch": 0.28551122847083454, + "grad_norm": 1.565346598625183, + "learning_rate": 4.0600075587740715e-05, + "loss": 4.5784, + "step": 48007 + }, + { + "epoch": 0.2855171757541155, + "grad_norm": 1.564678430557251, + "learning_rate": 4.059971058397965e-05, + "loss": 4.5993, + "step": 48008 + }, + { + "epoch": 0.2855231230373965, + "grad_norm": 1.5648044347763062, + "learning_rate": 4.059934557477289e-05, + "loss": 4.6643, + "step": 48009 + }, + { + "epoch": 0.28552907032067754, + "grad_norm": 1.527518630027771, + "learning_rate": 4.0598980560120554e-05, + "loss": 4.7479, + "step": 48010 + }, + { + "epoch": 0.2855350176039585, + "grad_norm": 1.4862985610961914, + "learning_rate": 4.0598615540022775e-05, + "loss": 4.4724, + "step": 48011 + }, + { + "epoch": 0.2855409648872395, + "grad_norm": 1.2556414604187012, + "learning_rate": 4.0598250514479685e-05, + "loss": 4.7456, + "step": 48012 + }, + { + "epoch": 0.28554691217052053, + "grad_norm": 1.6448986530303955, + "learning_rate": 4.0597885483491406e-05, + "loss": 4.4264, + "step": 48013 + }, + { + "epoch": 0.2855528594538015, + "grad_norm": 1.3447716236114502, + "learning_rate": 4.059752044705806e-05, + "loss": 4.7505, + "step": 48014 + }, + { + "epoch": 0.2855588067370825, + "grad_norm": 1.7509512901306152, + "learning_rate": 4.0597155405179784e-05, + "loss": 4.4196, + "step": 48015 + }, + { + "epoch": 0.2855647540203635, + "grad_norm": 1.4709949493408203, + "learning_rate": 4.0596790357856706e-05, + "loss": 4.7143, + "step": 48016 + }, + { + "epoch": 0.2855707013036445, + "grad_norm": 1.527347207069397, + "learning_rate": 4.0596425305088947e-05, + "loss": 4.6408, + "step": 48017 + }, + { + "epoch": 0.2855766485869255, + "grad_norm": 1.4303741455078125, + "learning_rate": 4.059606024687664e-05, + "loss": 4.4999, + "step": 48018 + }, + { + "epoch": 0.2855825958702065, + "grad_norm": 1.5345454216003418, + "learning_rate": 4.059569518321991e-05, + "loss": 4.6514, + "step": 48019 + }, + { + "epoch": 0.2855885431534875, + "grad_norm": 1.4583972692489624, + "learning_rate": 4.059533011411888e-05, + "loss": 4.4662, + "step": 48020 + }, + { + "epoch": 0.2855944904367685, + "grad_norm": 1.5172975063323975, + "learning_rate": 4.059496503957368e-05, + "loss": 4.9255, + "step": 48021 + }, + { + "epoch": 0.2856004377200495, + "grad_norm": 1.4326070547103882, + "learning_rate": 4.059459995958444e-05, + "loss": 4.7087, + "step": 48022 + }, + { + "epoch": 0.28560638500333047, + "grad_norm": 1.6605404615402222, + "learning_rate": 4.059423487415129e-05, + "loss": 4.4243, + "step": 48023 + }, + { + "epoch": 0.2856123322866115, + "grad_norm": 1.2536252737045288, + "learning_rate": 4.059386978327435e-05, + "loss": 4.7285, + "step": 48024 + }, + { + "epoch": 0.2856182795698925, + "grad_norm": 1.3363524675369263, + "learning_rate": 4.059350468695376e-05, + "loss": 4.7313, + "step": 48025 + }, + { + "epoch": 0.28562422685317346, + "grad_norm": 1.3292607069015503, + "learning_rate": 4.059313958518962e-05, + "loss": 4.659, + "step": 48026 + }, + { + "epoch": 0.2856301741364545, + "grad_norm": 1.4997082948684692, + "learning_rate": 4.0592774477982086e-05, + "loss": 5.0811, + "step": 48027 + }, + { + "epoch": 0.2856361214197355, + "grad_norm": 1.4802887439727783, + "learning_rate": 4.059240936533128e-05, + "loss": 4.6029, + "step": 48028 + }, + { + "epoch": 0.28564206870301645, + "grad_norm": 1.5034558773040771, + "learning_rate": 4.059204424723732e-05, + "loss": 4.5175, + "step": 48029 + }, + { + "epoch": 0.28564801598629747, + "grad_norm": 1.2874819040298462, + "learning_rate": 4.059167912370034e-05, + "loss": 4.6073, + "step": 48030 + }, + { + "epoch": 0.2856539632695785, + "grad_norm": 1.5509870052337646, + "learning_rate": 4.059131399472047e-05, + "loss": 4.9557, + "step": 48031 + }, + { + "epoch": 0.28565991055285944, + "grad_norm": 1.656214952468872, + "learning_rate": 4.0590948860297825e-05, + "loss": 5.0855, + "step": 48032 + }, + { + "epoch": 0.28566585783614046, + "grad_norm": 1.5845086574554443, + "learning_rate": 4.059058372043255e-05, + "loss": 4.5676, + "step": 48033 + }, + { + "epoch": 0.2856718051194215, + "grad_norm": 1.5399020910263062, + "learning_rate": 4.059021857512475e-05, + "loss": 4.3561, + "step": 48034 + }, + { + "epoch": 0.28567775240270243, + "grad_norm": 1.4237595796585083, + "learning_rate": 4.058985342437457e-05, + "loss": 4.5139, + "step": 48035 + }, + { + "epoch": 0.28568369968598345, + "grad_norm": 1.5519890785217285, + "learning_rate": 4.058948826818214e-05, + "loss": 4.5669, + "step": 48036 + }, + { + "epoch": 0.28568964696926447, + "grad_norm": 1.4318177700042725, + "learning_rate": 4.058912310654758e-05, + "loss": 4.4295, + "step": 48037 + }, + { + "epoch": 0.2856955942525454, + "grad_norm": 1.6663225889205933, + "learning_rate": 4.058875793947102e-05, + "loss": 4.4907, + "step": 48038 + }, + { + "epoch": 0.28570154153582644, + "grad_norm": 1.5897163152694702, + "learning_rate": 4.058839276695258e-05, + "loss": 4.4823, + "step": 48039 + }, + { + "epoch": 0.28570748881910746, + "grad_norm": 1.74172842502594, + "learning_rate": 4.0588027588992395e-05, + "loss": 4.3919, + "step": 48040 + }, + { + "epoch": 0.2857134361023884, + "grad_norm": 1.5161064863204956, + "learning_rate": 4.058766240559059e-05, + "loss": 5.3675, + "step": 48041 + }, + { + "epoch": 0.28571938338566943, + "grad_norm": 1.5821267366409302, + "learning_rate": 4.05872972167473e-05, + "loss": 5.3617, + "step": 48042 + }, + { + "epoch": 0.28572533066895045, + "grad_norm": 1.6923712491989136, + "learning_rate": 4.058693202246263e-05, + "loss": 4.7786, + "step": 48043 + }, + { + "epoch": 0.2857312779522314, + "grad_norm": 1.5084589719772339, + "learning_rate": 4.058656682273673e-05, + "loss": 4.5172, + "step": 48044 + }, + { + "epoch": 0.2857372252355124, + "grad_norm": 1.619836449623108, + "learning_rate": 4.058620161756973e-05, + "loss": 4.4186, + "step": 48045 + }, + { + "epoch": 0.28574317251879344, + "grad_norm": 1.727381944656372, + "learning_rate": 4.058583640696174e-05, + "loss": 4.2667, + "step": 48046 + }, + { + "epoch": 0.2857491198020744, + "grad_norm": 1.7471450567245483, + "learning_rate": 4.058547119091289e-05, + "loss": 4.4192, + "step": 48047 + }, + { + "epoch": 0.2857550670853554, + "grad_norm": 1.6995364427566528, + "learning_rate": 4.058510596942332e-05, + "loss": 4.4416, + "step": 48048 + }, + { + "epoch": 0.28576101436863643, + "grad_norm": 1.7835628986358643, + "learning_rate": 4.058474074249316e-05, + "loss": 4.4084, + "step": 48049 + }, + { + "epoch": 0.2857669616519174, + "grad_norm": 1.7749592065811157, + "learning_rate": 4.0584375510122505e-05, + "loss": 4.5149, + "step": 48050 + }, + { + "epoch": 0.2857729089351984, + "grad_norm": 1.5349910259246826, + "learning_rate": 4.058401027231152e-05, + "loss": 4.2806, + "step": 48051 + }, + { + "epoch": 0.28577885621847937, + "grad_norm": 1.4648661613464355, + "learning_rate": 4.0583645029060323e-05, + "loss": 4.731, + "step": 48052 + }, + { + "epoch": 0.2857848035017604, + "grad_norm": 1.6040616035461426, + "learning_rate": 4.058327978036903e-05, + "loss": 5.1051, + "step": 48053 + }, + { + "epoch": 0.2857907507850414, + "grad_norm": 1.5623559951782227, + "learning_rate": 4.058291452623778e-05, + "loss": 5.0831, + "step": 48054 + }, + { + "epoch": 0.28579669806832236, + "grad_norm": 1.665389895439148, + "learning_rate": 4.058254926666668e-05, + "loss": 5.0194, + "step": 48055 + }, + { + "epoch": 0.2858026453516034, + "grad_norm": 1.6285929679870605, + "learning_rate": 4.058218400165589e-05, + "loss": 4.6249, + "step": 48056 + }, + { + "epoch": 0.2858085926348844, + "grad_norm": 2.042797327041626, + "learning_rate": 4.058181873120551e-05, + "loss": 4.2271, + "step": 48057 + }, + { + "epoch": 0.28581453991816536, + "grad_norm": 1.9991939067840576, + "learning_rate": 4.0581453455315685e-05, + "loss": 4.0648, + "step": 48058 + }, + { + "epoch": 0.28582048720144637, + "grad_norm": 2.0042595863342285, + "learning_rate": 4.0581088173986535e-05, + "loss": 4.2082, + "step": 48059 + }, + { + "epoch": 0.2858264344847274, + "grad_norm": 2.004146099090576, + "learning_rate": 4.058072288721818e-05, + "loss": 4.2263, + "step": 48060 + }, + { + "epoch": 0.28583238176800835, + "grad_norm": 1.6895886659622192, + "learning_rate": 4.0580357595010767e-05, + "loss": 4.5459, + "step": 48061 + }, + { + "epoch": 0.28583832905128936, + "grad_norm": 1.435359239578247, + "learning_rate": 4.057999229736441e-05, + "loss": 4.7282, + "step": 48062 + }, + { + "epoch": 0.2858442763345704, + "grad_norm": 1.536786437034607, + "learning_rate": 4.057962699427923e-05, + "loss": 4.8534, + "step": 48063 + }, + { + "epoch": 0.28585022361785134, + "grad_norm": 1.5850833654403687, + "learning_rate": 4.0579261685755366e-05, + "loss": 4.7092, + "step": 48064 + }, + { + "epoch": 0.28585617090113236, + "grad_norm": 1.6127883195877075, + "learning_rate": 4.057889637179295e-05, + "loss": 4.5208, + "step": 48065 + }, + { + "epoch": 0.28586211818441337, + "grad_norm": 1.673174262046814, + "learning_rate": 4.0578531052392094e-05, + "loss": 4.7882, + "step": 48066 + }, + { + "epoch": 0.28586806546769433, + "grad_norm": 1.5152530670166016, + "learning_rate": 4.0578165727552934e-05, + "loss": 4.6601, + "step": 48067 + }, + { + "epoch": 0.28587401275097535, + "grad_norm": 2.247223138809204, + "learning_rate": 4.0577800397275606e-05, + "loss": 3.9231, + "step": 48068 + }, + { + "epoch": 0.28587996003425636, + "grad_norm": 2.2997560501098633, + "learning_rate": 4.057743506156022e-05, + "loss": 3.6816, + "step": 48069 + }, + { + "epoch": 0.2858859073175373, + "grad_norm": 1.394357681274414, + "learning_rate": 4.057706972040691e-05, + "loss": 4.935, + "step": 48070 + }, + { + "epoch": 0.28589185460081834, + "grad_norm": 1.300133228302002, + "learning_rate": 4.057670437381582e-05, + "loss": 4.8469, + "step": 48071 + }, + { + "epoch": 0.28589780188409936, + "grad_norm": 1.4999524354934692, + "learning_rate": 4.057633902178705e-05, + "loss": 4.7651, + "step": 48072 + }, + { + "epoch": 0.2859037491673803, + "grad_norm": 1.4924778938293457, + "learning_rate": 4.057597366432074e-05, + "loss": 4.6989, + "step": 48073 + }, + { + "epoch": 0.28590969645066133, + "grad_norm": 1.5410842895507812, + "learning_rate": 4.0575608301417034e-05, + "loss": 4.6158, + "step": 48074 + }, + { + "epoch": 0.28591564373394235, + "grad_norm": 1.5142109394073486, + "learning_rate": 4.057524293307603e-05, + "loss": 4.6451, + "step": 48075 + }, + { + "epoch": 0.2859215910172233, + "grad_norm": 1.7736762762069702, + "learning_rate": 4.0574877559297875e-05, + "loss": 4.3503, + "step": 48076 + }, + { + "epoch": 0.2859275383005043, + "grad_norm": 1.7726242542266846, + "learning_rate": 4.057451218008269e-05, + "loss": 3.8144, + "step": 48077 + }, + { + "epoch": 0.28593348558378534, + "grad_norm": 2.3394293785095215, + "learning_rate": 4.05741467954306e-05, + "loss": 3.5593, + "step": 48078 + }, + { + "epoch": 0.2859394328670663, + "grad_norm": 1.8987189531326294, + "learning_rate": 4.0573781405341744e-05, + "loss": 4.0754, + "step": 48079 + }, + { + "epoch": 0.2859453801503473, + "grad_norm": 1.8541213274002075, + "learning_rate": 4.057341600981624e-05, + "loss": 4.6202, + "step": 48080 + }, + { + "epoch": 0.28595132743362833, + "grad_norm": 1.6644541025161743, + "learning_rate": 4.057305060885421e-05, + "loss": 4.7174, + "step": 48081 + }, + { + "epoch": 0.2859572747169093, + "grad_norm": 2.0972177982330322, + "learning_rate": 4.057268520245579e-05, + "loss": 3.8745, + "step": 48082 + }, + { + "epoch": 0.2859632220001903, + "grad_norm": 1.9302637577056885, + "learning_rate": 4.057231979062112e-05, + "loss": 3.7822, + "step": 48083 + }, + { + "epoch": 0.2859691692834713, + "grad_norm": 1.6771146059036255, + "learning_rate": 4.057195437335031e-05, + "loss": 4.3281, + "step": 48084 + }, + { + "epoch": 0.2859751165667523, + "grad_norm": 1.6924768686294556, + "learning_rate": 4.057158895064348e-05, + "loss": 4.5111, + "step": 48085 + }, + { + "epoch": 0.2859810638500333, + "grad_norm": 1.741665005683899, + "learning_rate": 4.0571223522500776e-05, + "loss": 4.2955, + "step": 48086 + }, + { + "epoch": 0.2859870111333143, + "grad_norm": 1.6684457063674927, + "learning_rate": 4.057085808892231e-05, + "loss": 4.4222, + "step": 48087 + }, + { + "epoch": 0.2859929584165953, + "grad_norm": 1.693483591079712, + "learning_rate": 4.057049264990823e-05, + "loss": 4.2315, + "step": 48088 + }, + { + "epoch": 0.2859989056998763, + "grad_norm": 1.6810306310653687, + "learning_rate": 4.057012720545865e-05, + "loss": 4.2423, + "step": 48089 + }, + { + "epoch": 0.2860048529831573, + "grad_norm": 1.955031156539917, + "learning_rate": 4.0569761755573696e-05, + "loss": 4.4471, + "step": 48090 + }, + { + "epoch": 0.28601080026643827, + "grad_norm": 1.8230541944503784, + "learning_rate": 4.05693963002535e-05, + "loss": 4.62, + "step": 48091 + }, + { + "epoch": 0.2860167475497193, + "grad_norm": 1.476322889328003, + "learning_rate": 4.0569030839498196e-05, + "loss": 4.5938, + "step": 48092 + }, + { + "epoch": 0.2860226948330003, + "grad_norm": 1.468917727470398, + "learning_rate": 4.05686653733079e-05, + "loss": 4.6706, + "step": 48093 + }, + { + "epoch": 0.28602864211628126, + "grad_norm": 1.6608482599258423, + "learning_rate": 4.056829990168274e-05, + "loss": 4.5059, + "step": 48094 + }, + { + "epoch": 0.2860345893995623, + "grad_norm": 1.6562528610229492, + "learning_rate": 4.056793442462284e-05, + "loss": 4.5267, + "step": 48095 + }, + { + "epoch": 0.2860405366828433, + "grad_norm": 1.7027219533920288, + "learning_rate": 4.0567568942128354e-05, + "loss": 4.497, + "step": 48096 + }, + { + "epoch": 0.28604648396612425, + "grad_norm": 1.441336989402771, + "learning_rate": 4.0567203454199376e-05, + "loss": 4.4177, + "step": 48097 + }, + { + "epoch": 0.28605243124940527, + "grad_norm": 1.4953875541687012, + "learning_rate": 4.056683796083606e-05, + "loss": 4.4306, + "step": 48098 + }, + { + "epoch": 0.2860583785326863, + "grad_norm": 1.5693111419677734, + "learning_rate": 4.0566472462038504e-05, + "loss": 4.4323, + "step": 48099 + }, + { + "epoch": 0.28606432581596725, + "grad_norm": 1.5012792348861694, + "learning_rate": 4.056610695780687e-05, + "loss": 4.2241, + "step": 48100 + }, + { + "epoch": 0.28607027309924826, + "grad_norm": 1.5177466869354248, + "learning_rate": 4.056574144814126e-05, + "loss": 4.4662, + "step": 48101 + }, + { + "epoch": 0.2860762203825293, + "grad_norm": 1.5597478151321411, + "learning_rate": 4.056537593304181e-05, + "loss": 4.5663, + "step": 48102 + }, + { + "epoch": 0.28608216766581024, + "grad_norm": 1.7838362455368042, + "learning_rate": 4.056501041250865e-05, + "loss": 4.5858, + "step": 48103 + }, + { + "epoch": 0.28608811494909125, + "grad_norm": 1.5498157739639282, + "learning_rate": 4.0564644886541906e-05, + "loss": 4.5222, + "step": 48104 + }, + { + "epoch": 0.28609406223237227, + "grad_norm": 1.3297563791275024, + "learning_rate": 4.056427935514171e-05, + "loss": 4.3406, + "step": 48105 + }, + { + "epoch": 0.28610000951565323, + "grad_norm": 1.5654314756393433, + "learning_rate": 4.056391381830817e-05, + "loss": 4.5817, + "step": 48106 + }, + { + "epoch": 0.28610595679893425, + "grad_norm": 1.6912875175476074, + "learning_rate": 4.0563548276041455e-05, + "loss": 4.3805, + "step": 48107 + }, + { + "epoch": 0.28611190408221526, + "grad_norm": 1.7005550861358643, + "learning_rate": 4.0563182728341645e-05, + "loss": 4.6446, + "step": 48108 + }, + { + "epoch": 0.2861178513654962, + "grad_norm": 1.7637169361114502, + "learning_rate": 4.056281717520889e-05, + "loss": 4.3427, + "step": 48109 + }, + { + "epoch": 0.28612379864877724, + "grad_norm": 1.2513097524642944, + "learning_rate": 4.056245161664333e-05, + "loss": 4.5412, + "step": 48110 + }, + { + "epoch": 0.28612974593205825, + "grad_norm": 1.4240003824234009, + "learning_rate": 4.056208605264506e-05, + "loss": 4.6648, + "step": 48111 + }, + { + "epoch": 0.2861356932153392, + "grad_norm": 1.9560205936431885, + "learning_rate": 4.056172048321424e-05, + "loss": 4.0304, + "step": 48112 + }, + { + "epoch": 0.28614164049862023, + "grad_norm": 1.3672512769699097, + "learning_rate": 4.056135490835098e-05, + "loss": 4.3855, + "step": 48113 + }, + { + "epoch": 0.28614758778190125, + "grad_norm": 1.4462573528289795, + "learning_rate": 4.056098932805541e-05, + "loss": 4.4505, + "step": 48114 + }, + { + "epoch": 0.2861535350651822, + "grad_norm": 1.462638020515442, + "learning_rate": 4.0560623742327656e-05, + "loss": 4.3644, + "step": 48115 + }, + { + "epoch": 0.2861594823484632, + "grad_norm": 1.4741953611373901, + "learning_rate": 4.056025815116786e-05, + "loss": 4.5093, + "step": 48116 + }, + { + "epoch": 0.28616542963174424, + "grad_norm": 1.5345144271850586, + "learning_rate": 4.055989255457613e-05, + "loss": 4.4055, + "step": 48117 + }, + { + "epoch": 0.2861713769150252, + "grad_norm": 1.6451665163040161, + "learning_rate": 4.0559526952552607e-05, + "loss": 4.389, + "step": 48118 + }, + { + "epoch": 0.2861773241983062, + "grad_norm": 1.1861977577209473, + "learning_rate": 4.055916134509741e-05, + "loss": 4.4211, + "step": 48119 + }, + { + "epoch": 0.28618327148158723, + "grad_norm": 1.585995078086853, + "learning_rate": 4.0558795732210674e-05, + "loss": 4.5767, + "step": 48120 + }, + { + "epoch": 0.2861892187648682, + "grad_norm": 1.5503231287002563, + "learning_rate": 4.055843011389252e-05, + "loss": 4.5568, + "step": 48121 + }, + { + "epoch": 0.2861951660481492, + "grad_norm": 1.6259492635726929, + "learning_rate": 4.0558064490143084e-05, + "loss": 4.3765, + "step": 48122 + }, + { + "epoch": 0.2862011133314302, + "grad_norm": 1.4036266803741455, + "learning_rate": 4.055769886096248e-05, + "loss": 4.3381, + "step": 48123 + }, + { + "epoch": 0.2862070606147112, + "grad_norm": 1.4949052333831787, + "learning_rate": 4.055733322635085e-05, + "loss": 4.2464, + "step": 48124 + }, + { + "epoch": 0.2862130078979922, + "grad_norm": 1.5507283210754395, + "learning_rate": 4.055696758630831e-05, + "loss": 4.409, + "step": 48125 + }, + { + "epoch": 0.2862189551812732, + "grad_norm": 1.5315433740615845, + "learning_rate": 4.0556601940835e-05, + "loss": 4.4659, + "step": 48126 + }, + { + "epoch": 0.2862249024645542, + "grad_norm": 1.724506139755249, + "learning_rate": 4.055623628993104e-05, + "loss": 4.6269, + "step": 48127 + }, + { + "epoch": 0.2862308497478352, + "grad_norm": 1.4657305479049683, + "learning_rate": 4.055587063359656e-05, + "loss": 4.3505, + "step": 48128 + }, + { + "epoch": 0.2862367970311162, + "grad_norm": 1.4409478902816772, + "learning_rate": 4.0555504971831684e-05, + "loss": 4.5313, + "step": 48129 + }, + { + "epoch": 0.28624274431439717, + "grad_norm": 1.304361343383789, + "learning_rate": 4.055513930463654e-05, + "loss": 4.3676, + "step": 48130 + }, + { + "epoch": 0.2862486915976782, + "grad_norm": 1.7487343549728394, + "learning_rate": 4.0554773632011255e-05, + "loss": 4.4583, + "step": 48131 + }, + { + "epoch": 0.2862546388809592, + "grad_norm": 1.715856671333313, + "learning_rate": 4.055440795395596e-05, + "loss": 4.3834, + "step": 48132 + }, + { + "epoch": 0.28626058616424016, + "grad_norm": 1.600364327430725, + "learning_rate": 4.055404227047079e-05, + "loss": 4.3376, + "step": 48133 + }, + { + "epoch": 0.2862665334475212, + "grad_norm": 1.5348644256591797, + "learning_rate": 4.0553676581555866e-05, + "loss": 4.4843, + "step": 48134 + }, + { + "epoch": 0.2862724807308022, + "grad_norm": 1.4945783615112305, + "learning_rate": 4.05533108872113e-05, + "loss": 4.6407, + "step": 48135 + }, + { + "epoch": 0.28627842801408315, + "grad_norm": 1.5272855758666992, + "learning_rate": 4.055294518743724e-05, + "loss": 4.628, + "step": 48136 + }, + { + "epoch": 0.28628437529736417, + "grad_norm": 1.731754183769226, + "learning_rate": 4.055257948223382e-05, + "loss": 4.2292, + "step": 48137 + }, + { + "epoch": 0.2862903225806452, + "grad_norm": 1.5535935163497925, + "learning_rate": 4.0552213771601136e-05, + "loss": 4.4261, + "step": 48138 + }, + { + "epoch": 0.28629626986392614, + "grad_norm": 1.4199779033660889, + "learning_rate": 4.0551848055539345e-05, + "loss": 4.6129, + "step": 48139 + }, + { + "epoch": 0.28630221714720716, + "grad_norm": 1.4893712997436523, + "learning_rate": 4.055148233404856e-05, + "loss": 4.3337, + "step": 48140 + }, + { + "epoch": 0.2863081644304882, + "grad_norm": 1.2542802095413208, + "learning_rate": 4.055111660712892e-05, + "loss": 4.426, + "step": 48141 + }, + { + "epoch": 0.28631411171376914, + "grad_norm": 1.3215734958648682, + "learning_rate": 4.055075087478054e-05, + "loss": 4.7908, + "step": 48142 + }, + { + "epoch": 0.28632005899705015, + "grad_norm": 1.6149756908416748, + "learning_rate": 4.055038513700356e-05, + "loss": 4.7603, + "step": 48143 + }, + { + "epoch": 0.28632600628033117, + "grad_norm": 1.6498737335205078, + "learning_rate": 4.0550019393798094e-05, + "loss": 4.752, + "step": 48144 + }, + { + "epoch": 0.28633195356361213, + "grad_norm": 1.3466743230819702, + "learning_rate": 4.0549653645164275e-05, + "loss": 4.4833, + "step": 48145 + }, + { + "epoch": 0.28633790084689315, + "grad_norm": 1.6188888549804688, + "learning_rate": 4.054928789110224e-05, + "loss": 4.2409, + "step": 48146 + }, + { + "epoch": 0.28634384813017416, + "grad_norm": 1.6184176206588745, + "learning_rate": 4.054892213161211e-05, + "loss": 4.3195, + "step": 48147 + }, + { + "epoch": 0.2863497954134551, + "grad_norm": 1.4256833791732788, + "learning_rate": 4.0548556366694e-05, + "loss": 4.5669, + "step": 48148 + }, + { + "epoch": 0.28635574269673614, + "grad_norm": 1.732396125793457, + "learning_rate": 4.054819059634807e-05, + "loss": 4.3201, + "step": 48149 + }, + { + "epoch": 0.28636168998001715, + "grad_norm": 1.7547669410705566, + "learning_rate": 4.054782482057441e-05, + "loss": 4.2851, + "step": 48150 + }, + { + "epoch": 0.2863676372632981, + "grad_norm": 1.716722011566162, + "learning_rate": 4.0547459039373173e-05, + "loss": 4.2299, + "step": 48151 + }, + { + "epoch": 0.28637358454657913, + "grad_norm": 1.7106187343597412, + "learning_rate": 4.054709325274447e-05, + "loss": 4.1161, + "step": 48152 + }, + { + "epoch": 0.28637953182986015, + "grad_norm": 1.5814787149429321, + "learning_rate": 4.054672746068845e-05, + "loss": 4.4645, + "step": 48153 + }, + { + "epoch": 0.2863854791131411, + "grad_norm": 1.5402247905731201, + "learning_rate": 4.054636166320521e-05, + "loss": 5.2303, + "step": 48154 + }, + { + "epoch": 0.2863914263964221, + "grad_norm": 1.4727625846862793, + "learning_rate": 4.0545995860294916e-05, + "loss": 4.8527, + "step": 48155 + }, + { + "epoch": 0.28639737367970314, + "grad_norm": 1.5509167909622192, + "learning_rate": 4.054563005195766e-05, + "loss": 4.2542, + "step": 48156 + }, + { + "epoch": 0.2864033209629841, + "grad_norm": 1.66336190700531, + "learning_rate": 4.0545264238193594e-05, + "loss": 4.0479, + "step": 48157 + }, + { + "epoch": 0.2864092682462651, + "grad_norm": 1.5542004108428955, + "learning_rate": 4.054489841900284e-05, + "loss": 4.1217, + "step": 48158 + }, + { + "epoch": 0.28641521552954613, + "grad_norm": 1.9835578203201294, + "learning_rate": 4.0544532594385515e-05, + "loss": 4.2171, + "step": 48159 + }, + { + "epoch": 0.2864211628128271, + "grad_norm": 2.401210308074951, + "learning_rate": 4.054416676434175e-05, + "loss": 3.3363, + "step": 48160 + }, + { + "epoch": 0.2864271100961081, + "grad_norm": 2.498276710510254, + "learning_rate": 4.054380092887169e-05, + "loss": 3.0873, + "step": 48161 + }, + { + "epoch": 0.2864330573793891, + "grad_norm": 2.4551849365234375, + "learning_rate": 4.054343508797545e-05, + "loss": 3.2907, + "step": 48162 + }, + { + "epoch": 0.2864390046626701, + "grad_norm": 2.355175256729126, + "learning_rate": 4.054306924165314e-05, + "loss": 3.3828, + "step": 48163 + }, + { + "epoch": 0.2864449519459511, + "grad_norm": 1.802620768547058, + "learning_rate": 4.054270338990492e-05, + "loss": 4.6477, + "step": 48164 + }, + { + "epoch": 0.2864508992292321, + "grad_norm": 2.019136905670166, + "learning_rate": 4.05423375327309e-05, + "loss": 3.6255, + "step": 48165 + }, + { + "epoch": 0.2864568465125131, + "grad_norm": 2.5027406215667725, + "learning_rate": 4.05419716701312e-05, + "loss": 3.2583, + "step": 48166 + }, + { + "epoch": 0.2864627937957941, + "grad_norm": 2.7998974323272705, + "learning_rate": 4.054160580210597e-05, + "loss": 3.2982, + "step": 48167 + }, + { + "epoch": 0.28646874107907505, + "grad_norm": 2.777061939239502, + "learning_rate": 4.054123992865533e-05, + "loss": 3.4969, + "step": 48168 + }, + { + "epoch": 0.28647468836235607, + "grad_norm": 2.7497525215148926, + "learning_rate": 4.05408740497794e-05, + "loss": 3.1204, + "step": 48169 + }, + { + "epoch": 0.2864806356456371, + "grad_norm": 2.877765417098999, + "learning_rate": 4.05405081654783e-05, + "loss": 3.6781, + "step": 48170 + }, + { + "epoch": 0.28648658292891804, + "grad_norm": 2.182452917098999, + "learning_rate": 4.054014227575218e-05, + "loss": 4.0911, + "step": 48171 + }, + { + "epoch": 0.28649253021219906, + "grad_norm": 2.4544405937194824, + "learning_rate": 4.053977638060115e-05, + "loss": 3.3076, + "step": 48172 + }, + { + "epoch": 0.2864984774954801, + "grad_norm": 2.732208728790283, + "learning_rate": 4.0539410480025356e-05, + "loss": 2.9113, + "step": 48173 + }, + { + "epoch": 0.28650442477876104, + "grad_norm": 2.012052059173584, + "learning_rate": 4.05390445740249e-05, + "loss": 4.5231, + "step": 48174 + }, + { + "epoch": 0.28651037206204205, + "grad_norm": 3.045241117477417, + "learning_rate": 4.053867866259994e-05, + "loss": 3.1066, + "step": 48175 + }, + { + "epoch": 0.28651631934532307, + "grad_norm": 2.680830717086792, + "learning_rate": 4.053831274575057e-05, + "loss": 3.3648, + "step": 48176 + }, + { + "epoch": 0.286522266628604, + "grad_norm": 2.9118831157684326, + "learning_rate": 4.0537946823476943e-05, + "loss": 2.7693, + "step": 48177 + }, + { + "epoch": 0.28652821391188504, + "grad_norm": 2.96380352973938, + "learning_rate": 4.053758089577919e-05, + "loss": 3.0614, + "step": 48178 + }, + { + "epoch": 0.28653416119516606, + "grad_norm": 2.56490159034729, + "learning_rate": 4.053721496265741e-05, + "loss": 3.2589, + "step": 48179 + }, + { + "epoch": 0.286540108478447, + "grad_norm": 2.580418825149536, + "learning_rate": 4.0536849024111764e-05, + "loss": 3.0617, + "step": 48180 + }, + { + "epoch": 0.28654605576172804, + "grad_norm": 2.92708683013916, + "learning_rate": 4.0536483080142355e-05, + "loss": 2.8504, + "step": 48181 + }, + { + "epoch": 0.28655200304500905, + "grad_norm": 2.813084840774536, + "learning_rate": 4.053611713074933e-05, + "loss": 2.8372, + "step": 48182 + }, + { + "epoch": 0.28655795032829, + "grad_norm": 2.238509178161621, + "learning_rate": 4.0535751175932794e-05, + "loss": 3.8187, + "step": 48183 + }, + { + "epoch": 0.28656389761157103, + "grad_norm": 2.513646364212036, + "learning_rate": 4.05353852156929e-05, + "loss": 3.314, + "step": 48184 + }, + { + "epoch": 0.28656984489485204, + "grad_norm": 2.6954052448272705, + "learning_rate": 4.0535019250029756e-05, + "loss": 3.1925, + "step": 48185 + }, + { + "epoch": 0.286575792178133, + "grad_norm": 2.624965190887451, + "learning_rate": 4.0534653278943496e-05, + "loss": 3.2302, + "step": 48186 + }, + { + "epoch": 0.286581739461414, + "grad_norm": 2.8329126834869385, + "learning_rate": 4.053428730243425e-05, + "loss": 3.4016, + "step": 48187 + }, + { + "epoch": 0.28658768674469504, + "grad_norm": 2.8329548835754395, + "learning_rate": 4.0533921320502155e-05, + "loss": 3.0336, + "step": 48188 + }, + { + "epoch": 0.286593634027976, + "grad_norm": 2.613722562789917, + "learning_rate": 4.053355533314731e-05, + "loss": 3.1271, + "step": 48189 + }, + { + "epoch": 0.286599581311257, + "grad_norm": 2.793921947479248, + "learning_rate": 4.053318934036988e-05, + "loss": 2.8062, + "step": 48190 + }, + { + "epoch": 0.28660552859453803, + "grad_norm": 2.541335105895996, + "learning_rate": 4.053282334216996e-05, + "loss": 3.3663, + "step": 48191 + }, + { + "epoch": 0.286611475877819, + "grad_norm": 2.689073324203491, + "learning_rate": 4.05324573385477e-05, + "loss": 3.2876, + "step": 48192 + }, + { + "epoch": 0.2866174231611, + "grad_norm": 2.8017265796661377, + "learning_rate": 4.0532091329503216e-05, + "loss": 2.8307, + "step": 48193 + }, + { + "epoch": 0.286623370444381, + "grad_norm": 2.8726963996887207, + "learning_rate": 4.0531725315036644e-05, + "loss": 3.6354, + "step": 48194 + }, + { + "epoch": 0.286629317727662, + "grad_norm": 2.4567227363586426, + "learning_rate": 4.05313592951481e-05, + "loss": 3.6065, + "step": 48195 + }, + { + "epoch": 0.286635265010943, + "grad_norm": 2.979849338531494, + "learning_rate": 4.0530993269837724e-05, + "loss": 3.5448, + "step": 48196 + }, + { + "epoch": 0.286641212294224, + "grad_norm": 3.598641872406006, + "learning_rate": 4.053062723910564e-05, + "loss": 3.6159, + "step": 48197 + }, + { + "epoch": 0.286647159577505, + "grad_norm": 2.779681921005249, + "learning_rate": 4.053026120295197e-05, + "loss": 3.0884, + "step": 48198 + }, + { + "epoch": 0.286653106860786, + "grad_norm": 2.753917932510376, + "learning_rate": 4.052989516137685e-05, + "loss": 3.1422, + "step": 48199 + }, + { + "epoch": 0.286659054144067, + "grad_norm": 3.241035223007202, + "learning_rate": 4.052952911438041e-05, + "loss": 3.3401, + "step": 48200 + }, + { + "epoch": 0.28666500142734797, + "grad_norm": 2.8779125213623047, + "learning_rate": 4.0529163061962754e-05, + "loss": 3.579, + "step": 48201 + }, + { + "epoch": 0.286670948710629, + "grad_norm": 2.850275754928589, + "learning_rate": 4.052879700412404e-05, + "loss": 3.3645, + "step": 48202 + }, + { + "epoch": 0.28667689599391, + "grad_norm": 3.0734760761260986, + "learning_rate": 4.052843094086438e-05, + "loss": 3.3356, + "step": 48203 + }, + { + "epoch": 0.28668284327719096, + "grad_norm": 2.6759226322174072, + "learning_rate": 4.052806487218391e-05, + "loss": 3.4273, + "step": 48204 + }, + { + "epoch": 0.286688790560472, + "grad_norm": 2.5141122341156006, + "learning_rate": 4.052769879808275e-05, + "loss": 3.3868, + "step": 48205 + }, + { + "epoch": 0.286694737843753, + "grad_norm": 1.9690476655960083, + "learning_rate": 4.052733271856103e-05, + "loss": 3.6571, + "step": 48206 + }, + { + "epoch": 0.28670068512703395, + "grad_norm": 2.677137851715088, + "learning_rate": 4.052696663361888e-05, + "loss": 3.0408, + "step": 48207 + }, + { + "epoch": 0.28670663241031497, + "grad_norm": 2.6753461360931396, + "learning_rate": 4.052660054325642e-05, + "loss": 2.9935, + "step": 48208 + }, + { + "epoch": 0.286712579693596, + "grad_norm": 2.5648958683013916, + "learning_rate": 4.052623444747379e-05, + "loss": 3.0558, + "step": 48209 + }, + { + "epoch": 0.28671852697687694, + "grad_norm": 2.4786672592163086, + "learning_rate": 4.052586834627111e-05, + "loss": 3.1814, + "step": 48210 + }, + { + "epoch": 0.28672447426015796, + "grad_norm": 2.6792569160461426, + "learning_rate": 4.052550223964851e-05, + "loss": 3.367, + "step": 48211 + }, + { + "epoch": 0.286730421543439, + "grad_norm": 2.226191997528076, + "learning_rate": 4.0525136127606125e-05, + "loss": 4.1104, + "step": 48212 + }, + { + "epoch": 0.28673636882671993, + "grad_norm": 1.7741444110870361, + "learning_rate": 4.0524770010144067e-05, + "loss": 4.4312, + "step": 48213 + }, + { + "epoch": 0.28674231611000095, + "grad_norm": 2.0738019943237305, + "learning_rate": 4.0524403887262475e-05, + "loss": 4.6979, + "step": 48214 + }, + { + "epoch": 0.28674826339328197, + "grad_norm": 2.1479382514953613, + "learning_rate": 4.052403775896147e-05, + "loss": 4.0518, + "step": 48215 + }, + { + "epoch": 0.2867542106765629, + "grad_norm": 2.5590384006500244, + "learning_rate": 4.0523671625241185e-05, + "loss": 4.0535, + "step": 48216 + }, + { + "epoch": 0.28676015795984394, + "grad_norm": 3.232262372970581, + "learning_rate": 4.0523305486101745e-05, + "loss": 3.2663, + "step": 48217 + }, + { + "epoch": 0.28676610524312496, + "grad_norm": 2.0496668815612793, + "learning_rate": 4.0522939341543286e-05, + "loss": 3.6377, + "step": 48218 + }, + { + "epoch": 0.2867720525264059, + "grad_norm": 2.091963291168213, + "learning_rate": 4.0522573191565923e-05, + "loss": 4.9261, + "step": 48219 + }, + { + "epoch": 0.28677799980968693, + "grad_norm": 1.6269742250442505, + "learning_rate": 4.05222070361698e-05, + "loss": 4.829, + "step": 48220 + }, + { + "epoch": 0.28678394709296795, + "grad_norm": 1.5650980472564697, + "learning_rate": 4.052184087535503e-05, + "loss": 4.5206, + "step": 48221 + }, + { + "epoch": 0.2867898943762489, + "grad_norm": 1.647567629814148, + "learning_rate": 4.0521474709121744e-05, + "loss": 4.6652, + "step": 48222 + }, + { + "epoch": 0.2867958416595299, + "grad_norm": 1.6390576362609863, + "learning_rate": 4.052110853747007e-05, + "loss": 4.4378, + "step": 48223 + }, + { + "epoch": 0.28680178894281094, + "grad_norm": 1.7829110622406006, + "learning_rate": 4.052074236040014e-05, + "loss": 4.5694, + "step": 48224 + }, + { + "epoch": 0.2868077362260919, + "grad_norm": 1.507447361946106, + "learning_rate": 4.052037617791208e-05, + "loss": 4.9518, + "step": 48225 + }, + { + "epoch": 0.2868136835093729, + "grad_norm": 1.6718353033065796, + "learning_rate": 4.052000999000601e-05, + "loss": 4.7921, + "step": 48226 + }, + { + "epoch": 0.28681963079265393, + "grad_norm": 1.5926729440689087, + "learning_rate": 4.051964379668207e-05, + "loss": 4.9116, + "step": 48227 + }, + { + "epoch": 0.2868255780759349, + "grad_norm": 1.8152202367782593, + "learning_rate": 4.0519277597940385e-05, + "loss": 4.9635, + "step": 48228 + }, + { + "epoch": 0.2868315253592159, + "grad_norm": 1.615724802017212, + "learning_rate": 4.0518911393781076e-05, + "loss": 4.9233, + "step": 48229 + }, + { + "epoch": 0.2868374726424969, + "grad_norm": 1.9503334760665894, + "learning_rate": 4.051854518420428e-05, + "loss": 4.5958, + "step": 48230 + }, + { + "epoch": 0.2868434199257779, + "grad_norm": 1.6919013261795044, + "learning_rate": 4.051817896921012e-05, + "loss": 4.4643, + "step": 48231 + }, + { + "epoch": 0.2868493672090589, + "grad_norm": 2.013286590576172, + "learning_rate": 4.0517812748798714e-05, + "loss": 4.4565, + "step": 48232 + }, + { + "epoch": 0.2868553144923399, + "grad_norm": 2.330411195755005, + "learning_rate": 4.0517446522970215e-05, + "loss": 3.5204, + "step": 48233 + }, + { + "epoch": 0.2868612617756209, + "grad_norm": 2.828512668609619, + "learning_rate": 4.051708029172472e-05, + "loss": 3.1092, + "step": 48234 + }, + { + "epoch": 0.2868672090589019, + "grad_norm": 1.9096051454544067, + "learning_rate": 4.051671405506239e-05, + "loss": 4.3644, + "step": 48235 + }, + { + "epoch": 0.2868731563421829, + "grad_norm": 1.3458869457244873, + "learning_rate": 4.0516347812983326e-05, + "loss": 4.9824, + "step": 48236 + }, + { + "epoch": 0.28687910362546387, + "grad_norm": 2.3475279808044434, + "learning_rate": 4.051598156548766e-05, + "loss": 3.8134, + "step": 48237 + }, + { + "epoch": 0.2868850509087449, + "grad_norm": 2.536539077758789, + "learning_rate": 4.051561531257553e-05, + "loss": 3.4637, + "step": 48238 + }, + { + "epoch": 0.2868909981920259, + "grad_norm": 2.614776611328125, + "learning_rate": 4.051524905424706e-05, + "loss": 3.4564, + "step": 48239 + }, + { + "epoch": 0.28689694547530686, + "grad_norm": 2.258211135864258, + "learning_rate": 4.051488279050237e-05, + "loss": 3.2307, + "step": 48240 + }, + { + "epoch": 0.2869028927585879, + "grad_norm": 2.290356397628784, + "learning_rate": 4.0514516521341607e-05, + "loss": 3.3556, + "step": 48241 + }, + { + "epoch": 0.2869088400418689, + "grad_norm": 1.997148036956787, + "learning_rate": 4.0514150246764884e-05, + "loss": 4.0022, + "step": 48242 + }, + { + "epoch": 0.28691478732514986, + "grad_norm": 1.8273736238479614, + "learning_rate": 4.051378396677232e-05, + "loss": 4.2595, + "step": 48243 + }, + { + "epoch": 0.28692073460843087, + "grad_norm": 1.6926087141036987, + "learning_rate": 4.0513417681364074e-05, + "loss": 5.5407, + "step": 48244 + }, + { + "epoch": 0.2869266818917119, + "grad_norm": 1.606542706489563, + "learning_rate": 4.051305139054024e-05, + "loss": 5.6044, + "step": 48245 + }, + { + "epoch": 0.28693262917499285, + "grad_norm": 1.3375025987625122, + "learning_rate": 4.051268509430096e-05, + "loss": 5.2901, + "step": 48246 + }, + { + "epoch": 0.28693857645827386, + "grad_norm": 1.4424041509628296, + "learning_rate": 4.051231879264636e-05, + "loss": 4.8623, + "step": 48247 + }, + { + "epoch": 0.2869445237415549, + "grad_norm": 1.4583451747894287, + "learning_rate": 4.0511952485576575e-05, + "loss": 4.5418, + "step": 48248 + }, + { + "epoch": 0.28695047102483584, + "grad_norm": 1.6583433151245117, + "learning_rate": 4.0511586173091734e-05, + "loss": 4.3694, + "step": 48249 + }, + { + "epoch": 0.28695641830811686, + "grad_norm": 1.5407161712646484, + "learning_rate": 4.051121985519195e-05, + "loss": 4.4406, + "step": 48250 + }, + { + "epoch": 0.28696236559139787, + "grad_norm": 1.546233892440796, + "learning_rate": 4.0510853531877355e-05, + "loss": 4.4192, + "step": 48251 + }, + { + "epoch": 0.28696831287467883, + "grad_norm": 1.3666867017745972, + "learning_rate": 4.0510487203148095e-05, + "loss": 4.9766, + "step": 48252 + }, + { + "epoch": 0.28697426015795985, + "grad_norm": 1.3926539421081543, + "learning_rate": 4.0510120869004275e-05, + "loss": 5.1725, + "step": 48253 + }, + { + "epoch": 0.28698020744124086, + "grad_norm": 2.322946548461914, + "learning_rate": 4.050975452944603e-05, + "loss": 3.9867, + "step": 48254 + }, + { + "epoch": 0.2869861547245218, + "grad_norm": 2.3414995670318604, + "learning_rate": 4.05093881844735e-05, + "loss": 3.532, + "step": 48255 + }, + { + "epoch": 0.28699210200780284, + "grad_norm": 2.320796489715576, + "learning_rate": 4.050902183408679e-05, + "loss": 3.7068, + "step": 48256 + }, + { + "epoch": 0.28699804929108386, + "grad_norm": 2.2251224517822266, + "learning_rate": 4.050865547828605e-05, + "loss": 3.6885, + "step": 48257 + }, + { + "epoch": 0.2870039965743648, + "grad_norm": 1.6774431467056274, + "learning_rate": 4.0508289117071395e-05, + "loss": 4.0451, + "step": 48258 + }, + { + "epoch": 0.28700994385764583, + "grad_norm": 1.4545427560806274, + "learning_rate": 4.0507922750442954e-05, + "loss": 4.4545, + "step": 48259 + }, + { + "epoch": 0.28701589114092685, + "grad_norm": 1.577213168144226, + "learning_rate": 4.050755637840086e-05, + "loss": 5.3552, + "step": 48260 + }, + { + "epoch": 0.2870218384242078, + "grad_norm": 1.467389702796936, + "learning_rate": 4.050719000094524e-05, + "loss": 5.1351, + "step": 48261 + }, + { + "epoch": 0.2870277857074888, + "grad_norm": 1.5154500007629395, + "learning_rate": 4.0506823618076214e-05, + "loss": 4.7369, + "step": 48262 + }, + { + "epoch": 0.28703373299076984, + "grad_norm": 1.57632315158844, + "learning_rate": 4.0506457229793925e-05, + "loss": 4.8845, + "step": 48263 + }, + { + "epoch": 0.2870396802740508, + "grad_norm": 2.1295228004455566, + "learning_rate": 4.0506090836098485e-05, + "loss": 3.7982, + "step": 48264 + }, + { + "epoch": 0.2870456275573318, + "grad_norm": 2.591029167175293, + "learning_rate": 4.050572443699004e-05, + "loss": 4.0093, + "step": 48265 + }, + { + "epoch": 0.28705157484061283, + "grad_norm": 2.8312859535217285, + "learning_rate": 4.0505358032468696e-05, + "loss": 3.6605, + "step": 48266 + }, + { + "epoch": 0.2870575221238938, + "grad_norm": 1.9492106437683105, + "learning_rate": 4.050499162253459e-05, + "loss": 4.3939, + "step": 48267 + }, + { + "epoch": 0.2870634694071748, + "grad_norm": 1.4192448854446411, + "learning_rate": 4.0504625207187864e-05, + "loss": 4.8529, + "step": 48268 + }, + { + "epoch": 0.2870694166904558, + "grad_norm": 1.5151803493499756, + "learning_rate": 4.050425878642862e-05, + "loss": 4.6587, + "step": 48269 + }, + { + "epoch": 0.2870753639737368, + "grad_norm": 1.5239298343658447, + "learning_rate": 4.0503892360257006e-05, + "loss": 5.1491, + "step": 48270 + }, + { + "epoch": 0.2870813112570178, + "grad_norm": 2.56900691986084, + "learning_rate": 4.0503525928673145e-05, + "loss": 3.8439, + "step": 48271 + }, + { + "epoch": 0.2870872585402988, + "grad_norm": 2.3408377170562744, + "learning_rate": 4.050315949167716e-05, + "loss": 4.0111, + "step": 48272 + }, + { + "epoch": 0.2870932058235798, + "grad_norm": 2.4338886737823486, + "learning_rate": 4.0502793049269185e-05, + "loss": 3.9367, + "step": 48273 + }, + { + "epoch": 0.2870991531068608, + "grad_norm": 2.3788459300994873, + "learning_rate": 4.050242660144934e-05, + "loss": 3.3412, + "step": 48274 + }, + { + "epoch": 0.2871051003901418, + "grad_norm": 2.457860231399536, + "learning_rate": 4.050206014821776e-05, + "loss": 3.644, + "step": 48275 + }, + { + "epoch": 0.28711104767342277, + "grad_norm": 2.0322635173797607, + "learning_rate": 4.050169368957457e-05, + "loss": 4.0841, + "step": 48276 + }, + { + "epoch": 0.2871169949567038, + "grad_norm": 1.8786789178848267, + "learning_rate": 4.0501327225519905e-05, + "loss": 4.7024, + "step": 48277 + }, + { + "epoch": 0.2871229422399848, + "grad_norm": 1.9084508419036865, + "learning_rate": 4.050096075605388e-05, + "loss": 4.4611, + "step": 48278 + }, + { + "epoch": 0.28712888952326576, + "grad_norm": 2.5023016929626465, + "learning_rate": 4.050059428117663e-05, + "loss": 3.8474, + "step": 48279 + }, + { + "epoch": 0.2871348368065468, + "grad_norm": 1.7272257804870605, + "learning_rate": 4.050022780088829e-05, + "loss": 4.148, + "step": 48280 + }, + { + "epoch": 0.2871407840898278, + "grad_norm": 2.1814019680023193, + "learning_rate": 4.049986131518897e-05, + "loss": 3.7664, + "step": 48281 + }, + { + "epoch": 0.28714673137310875, + "grad_norm": 2.2958340644836426, + "learning_rate": 4.049949482407881e-05, + "loss": 3.4976, + "step": 48282 + }, + { + "epoch": 0.28715267865638977, + "grad_norm": 2.411698579788208, + "learning_rate": 4.0499128327557944e-05, + "loss": 3.8032, + "step": 48283 + }, + { + "epoch": 0.28715862593967073, + "grad_norm": 2.117689371109009, + "learning_rate": 4.049876182562649e-05, + "loss": 3.9773, + "step": 48284 + }, + { + "epoch": 0.28716457322295175, + "grad_norm": 2.170595407485962, + "learning_rate": 4.049839531828458e-05, + "loss": 4.0507, + "step": 48285 + }, + { + "epoch": 0.28717052050623276, + "grad_norm": 1.3638255596160889, + "learning_rate": 4.049802880553234e-05, + "loss": 5.1435, + "step": 48286 + }, + { + "epoch": 0.2871764677895137, + "grad_norm": 1.721694827079773, + "learning_rate": 4.04976622873699e-05, + "loss": 4.202, + "step": 48287 + }, + { + "epoch": 0.28718241507279474, + "grad_norm": 1.7726867198944092, + "learning_rate": 4.049729576379738e-05, + "loss": 4.2631, + "step": 48288 + }, + { + "epoch": 0.28718836235607575, + "grad_norm": 2.5746212005615234, + "learning_rate": 4.0496929234814915e-05, + "loss": 3.668, + "step": 48289 + }, + { + "epoch": 0.2871943096393567, + "grad_norm": 2.086582899093628, + "learning_rate": 4.049656270042263e-05, + "loss": 3.6732, + "step": 48290 + }, + { + "epoch": 0.28720025692263773, + "grad_norm": 2.6165454387664795, + "learning_rate": 4.049619616062066e-05, + "loss": 3.7184, + "step": 48291 + }, + { + "epoch": 0.28720620420591875, + "grad_norm": 2.1196906566619873, + "learning_rate": 4.0495829615409124e-05, + "loss": 3.7176, + "step": 48292 + }, + { + "epoch": 0.2872121514891997, + "grad_norm": 1.8618810176849365, + "learning_rate": 4.049546306478816e-05, + "loss": 4.1115, + "step": 48293 + }, + { + "epoch": 0.2872180987724807, + "grad_norm": 1.6473273038864136, + "learning_rate": 4.0495096508757885e-05, + "loss": 4.5347, + "step": 48294 + }, + { + "epoch": 0.28722404605576174, + "grad_norm": 1.6809290647506714, + "learning_rate": 4.049472994731843e-05, + "loss": 4.2017, + "step": 48295 + }, + { + "epoch": 0.2872299933390427, + "grad_norm": 1.7341912984848022, + "learning_rate": 4.049436338046993e-05, + "loss": 3.8082, + "step": 48296 + }, + { + "epoch": 0.2872359406223237, + "grad_norm": 2.480448007583618, + "learning_rate": 4.049399680821251e-05, + "loss": 3.6222, + "step": 48297 + }, + { + "epoch": 0.28724188790560473, + "grad_norm": 1.3652417659759521, + "learning_rate": 4.0493630230546285e-05, + "loss": 4.7819, + "step": 48298 + }, + { + "epoch": 0.2872478351888857, + "grad_norm": 1.6677241325378418, + "learning_rate": 4.04932636474714e-05, + "loss": 4.2691, + "step": 48299 + }, + { + "epoch": 0.2872537824721667, + "grad_norm": 1.9744430780410767, + "learning_rate": 4.049289705898798e-05, + "loss": 4.6972, + "step": 48300 + }, + { + "epoch": 0.2872597297554477, + "grad_norm": 1.565252423286438, + "learning_rate": 4.049253046509615e-05, + "loss": 4.689, + "step": 48301 + }, + { + "epoch": 0.2872656770387287, + "grad_norm": 1.5808005332946777, + "learning_rate": 4.049216386579603e-05, + "loss": 4.159, + "step": 48302 + }, + { + "epoch": 0.2872716243220097, + "grad_norm": 1.6461420059204102, + "learning_rate": 4.0491797261087765e-05, + "loss": 4.2765, + "step": 48303 + }, + { + "epoch": 0.2872775716052907, + "grad_norm": 1.703823447227478, + "learning_rate": 4.049143065097147e-05, + "loss": 4.2068, + "step": 48304 + }, + { + "epoch": 0.2872835188885717, + "grad_norm": 1.7831089496612549, + "learning_rate": 4.0491064035447276e-05, + "loss": 4.2363, + "step": 48305 + }, + { + "epoch": 0.2872894661718527, + "grad_norm": 1.7858545780181885, + "learning_rate": 4.049069741451531e-05, + "loss": 4.9064, + "step": 48306 + }, + { + "epoch": 0.2872954134551337, + "grad_norm": 2.1502413749694824, + "learning_rate": 4.04903307881757e-05, + "loss": 4.471, + "step": 48307 + }, + { + "epoch": 0.28730136073841467, + "grad_norm": 1.8783429861068726, + "learning_rate": 4.0489964156428576e-05, + "loss": 4.5531, + "step": 48308 + }, + { + "epoch": 0.2873073080216957, + "grad_norm": 1.5892510414123535, + "learning_rate": 4.048959751927407e-05, + "loss": 4.6306, + "step": 48309 + }, + { + "epoch": 0.2873132553049767, + "grad_norm": 1.8774176836013794, + "learning_rate": 4.0489230876712305e-05, + "loss": 4.8363, + "step": 48310 + }, + { + "epoch": 0.28731920258825766, + "grad_norm": 1.698699712753296, + "learning_rate": 4.048886422874341e-05, + "loss": 4.8451, + "step": 48311 + }, + { + "epoch": 0.2873251498715387, + "grad_norm": 1.8031138181686401, + "learning_rate": 4.0488497575367515e-05, + "loss": 3.8237, + "step": 48312 + }, + { + "epoch": 0.2873310971548197, + "grad_norm": 2.524188995361328, + "learning_rate": 4.048813091658473e-05, + "loss": 3.6943, + "step": 48313 + }, + { + "epoch": 0.28733704443810065, + "grad_norm": 1.629144310951233, + "learning_rate": 4.048776425239521e-05, + "loss": 4.5408, + "step": 48314 + }, + { + "epoch": 0.28734299172138167, + "grad_norm": 2.068023204803467, + "learning_rate": 4.0487397582799074e-05, + "loss": 4.263, + "step": 48315 + }, + { + "epoch": 0.2873489390046627, + "grad_norm": 2.07680344581604, + "learning_rate": 4.0487030907796444e-05, + "loss": 4.2398, + "step": 48316 + }, + { + "epoch": 0.28735488628794364, + "grad_norm": 1.8470377922058105, + "learning_rate": 4.048666422738745e-05, + "loss": 4.1494, + "step": 48317 + }, + { + "epoch": 0.28736083357122466, + "grad_norm": 1.5872479677200317, + "learning_rate": 4.048629754157223e-05, + "loss": 4.193, + "step": 48318 + }, + { + "epoch": 0.2873667808545057, + "grad_norm": 1.6266708374023438, + "learning_rate": 4.04859308503509e-05, + "loss": 4.6377, + "step": 48319 + }, + { + "epoch": 0.28737272813778664, + "grad_norm": 1.9413371086120605, + "learning_rate": 4.0485564153723584e-05, + "loss": 3.8857, + "step": 48320 + }, + { + "epoch": 0.28737867542106765, + "grad_norm": 1.6406413316726685, + "learning_rate": 4.0485197451690424e-05, + "loss": 4.5689, + "step": 48321 + }, + { + "epoch": 0.28738462270434867, + "grad_norm": 1.7156422138214111, + "learning_rate": 4.0484830744251535e-05, + "loss": 4.1997, + "step": 48322 + }, + { + "epoch": 0.28739056998762963, + "grad_norm": 1.46681809425354, + "learning_rate": 4.048446403140706e-05, + "loss": 4.8833, + "step": 48323 + }, + { + "epoch": 0.28739651727091065, + "grad_norm": 1.555810570716858, + "learning_rate": 4.048409731315711e-05, + "loss": 4.246, + "step": 48324 + }, + { + "epoch": 0.28740246455419166, + "grad_norm": 1.7147964239120483, + "learning_rate": 4.0483730589501825e-05, + "loss": 3.9927, + "step": 48325 + }, + { + "epoch": 0.2874084118374726, + "grad_norm": 1.619624376296997, + "learning_rate": 4.048336386044133e-05, + "loss": 4.1962, + "step": 48326 + }, + { + "epoch": 0.28741435912075364, + "grad_norm": 1.6225165128707886, + "learning_rate": 4.048299712597576e-05, + "loss": 4.0224, + "step": 48327 + }, + { + "epoch": 0.28742030640403465, + "grad_norm": 1.6558338403701782, + "learning_rate": 4.0482630386105224e-05, + "loss": 3.5279, + "step": 48328 + }, + { + "epoch": 0.2874262536873156, + "grad_norm": 1.530533790588379, + "learning_rate": 4.048226364082987e-05, + "loss": 4.1099, + "step": 48329 + }, + { + "epoch": 0.28743220097059663, + "grad_norm": 1.7101895809173584, + "learning_rate": 4.0481896890149816e-05, + "loss": 4.2747, + "step": 48330 + }, + { + "epoch": 0.28743814825387765, + "grad_norm": 1.912721037864685, + "learning_rate": 4.048153013406519e-05, + "loss": 4.0154, + "step": 48331 + }, + { + "epoch": 0.2874440955371586, + "grad_norm": 1.8920495510101318, + "learning_rate": 4.048116337257612e-05, + "loss": 4.0181, + "step": 48332 + }, + { + "epoch": 0.2874500428204396, + "grad_norm": 2.3642256259918213, + "learning_rate": 4.0480796605682736e-05, + "loss": 3.5648, + "step": 48333 + }, + { + "epoch": 0.28745599010372064, + "grad_norm": 1.9247609376907349, + "learning_rate": 4.048042983338517e-05, + "loss": 4.0489, + "step": 48334 + }, + { + "epoch": 0.2874619373870016, + "grad_norm": 2.294067621231079, + "learning_rate": 4.0480063055683544e-05, + "loss": 4.2441, + "step": 48335 + }, + { + "epoch": 0.2874678846702826, + "grad_norm": 1.3504213094711304, + "learning_rate": 4.0479696272577995e-05, + "loss": 5.2146, + "step": 48336 + }, + { + "epoch": 0.28747383195356363, + "grad_norm": 1.9657520055770874, + "learning_rate": 4.0479329484068626e-05, + "loss": 4.7274, + "step": 48337 + }, + { + "epoch": 0.2874797792368446, + "grad_norm": 1.5924221277236938, + "learning_rate": 4.04789626901556e-05, + "loss": 4.1773, + "step": 48338 + }, + { + "epoch": 0.2874857265201256, + "grad_norm": 1.612351417541504, + "learning_rate": 4.047859589083902e-05, + "loss": 3.7808, + "step": 48339 + }, + { + "epoch": 0.2874916738034066, + "grad_norm": 1.6588338613510132, + "learning_rate": 4.0478229086119026e-05, + "loss": 4.1419, + "step": 48340 + }, + { + "epoch": 0.2874976210866876, + "grad_norm": 1.6228408813476562, + "learning_rate": 4.0477862275995735e-05, + "loss": 5.1101, + "step": 48341 + }, + { + "epoch": 0.2875035683699686, + "grad_norm": 1.5293383598327637, + "learning_rate": 4.047749546046929e-05, + "loss": 4.4566, + "step": 48342 + }, + { + "epoch": 0.2875095156532496, + "grad_norm": 1.663490891456604, + "learning_rate": 4.047712863953981e-05, + "loss": 4.627, + "step": 48343 + }, + { + "epoch": 0.2875154629365306, + "grad_norm": 1.893075704574585, + "learning_rate": 4.047676181320742e-05, + "loss": 4.4328, + "step": 48344 + }, + { + "epoch": 0.2875214102198116, + "grad_norm": 1.5815705060958862, + "learning_rate": 4.0476394981472256e-05, + "loss": 4.559, + "step": 48345 + }, + { + "epoch": 0.2875273575030926, + "grad_norm": 1.524537205696106, + "learning_rate": 4.047602814433444e-05, + "loss": 4.5211, + "step": 48346 + }, + { + "epoch": 0.28753330478637357, + "grad_norm": 1.8773177862167358, + "learning_rate": 4.04756613017941e-05, + "loss": 4.3501, + "step": 48347 + }, + { + "epoch": 0.2875392520696546, + "grad_norm": 1.5378371477127075, + "learning_rate": 4.047529445385138e-05, + "loss": 4.725, + "step": 48348 + }, + { + "epoch": 0.2875451993529356, + "grad_norm": 1.3884105682373047, + "learning_rate": 4.0474927600506374e-05, + "loss": 4.7559, + "step": 48349 + }, + { + "epoch": 0.28755114663621656, + "grad_norm": 2.06453275680542, + "learning_rate": 4.047456074175925e-05, + "loss": 4.3546, + "step": 48350 + }, + { + "epoch": 0.2875570939194976, + "grad_norm": 1.5109692811965942, + "learning_rate": 4.0474193877610105e-05, + "loss": 4.9474, + "step": 48351 + }, + { + "epoch": 0.2875630412027786, + "grad_norm": 1.4487067461013794, + "learning_rate": 4.0473827008059084e-05, + "loss": 4.7332, + "step": 48352 + }, + { + "epoch": 0.28756898848605955, + "grad_norm": 1.4418103694915771, + "learning_rate": 4.04734601331063e-05, + "loss": 4.6621, + "step": 48353 + }, + { + "epoch": 0.28757493576934057, + "grad_norm": 1.3970410823822021, + "learning_rate": 4.0473093252751905e-05, + "loss": 4.8305, + "step": 48354 + }, + { + "epoch": 0.2875808830526216, + "grad_norm": 1.5982037782669067, + "learning_rate": 4.0472726366996005e-05, + "loss": 5.3807, + "step": 48355 + }, + { + "epoch": 0.28758683033590254, + "grad_norm": 1.7210224866867065, + "learning_rate": 4.047235947583874e-05, + "loss": 4.7199, + "step": 48356 + }, + { + "epoch": 0.28759277761918356, + "grad_norm": 1.8215256929397583, + "learning_rate": 4.0471992579280235e-05, + "loss": 4.0364, + "step": 48357 + }, + { + "epoch": 0.2875987249024646, + "grad_norm": 1.528281807899475, + "learning_rate": 4.0471625677320615e-05, + "loss": 4.5672, + "step": 48358 + }, + { + "epoch": 0.28760467218574554, + "grad_norm": 1.4525278806686401, + "learning_rate": 4.047125876996e-05, + "loss": 4.77, + "step": 48359 + }, + { + "epoch": 0.28761061946902655, + "grad_norm": 1.430655837059021, + "learning_rate": 4.047089185719854e-05, + "loss": 4.7167, + "step": 48360 + }, + { + "epoch": 0.28761656675230757, + "grad_norm": 1.6334011554718018, + "learning_rate": 4.0470524939036355e-05, + "loss": 4.7652, + "step": 48361 + }, + { + "epoch": 0.28762251403558853, + "grad_norm": 1.491365909576416, + "learning_rate": 4.047015801547356e-05, + "loss": 4.6595, + "step": 48362 + }, + { + "epoch": 0.28762846131886954, + "grad_norm": 1.6510396003723145, + "learning_rate": 4.04697910865103e-05, + "loss": 4.7828, + "step": 48363 + }, + { + "epoch": 0.28763440860215056, + "grad_norm": 1.5334874391555786, + "learning_rate": 4.046942415214669e-05, + "loss": 4.7319, + "step": 48364 + }, + { + "epoch": 0.2876403558854315, + "grad_norm": 1.574807047843933, + "learning_rate": 4.0469057212382864e-05, + "loss": 4.6019, + "step": 48365 + }, + { + "epoch": 0.28764630316871254, + "grad_norm": 1.4658188819885254, + "learning_rate": 4.046869026721896e-05, + "loss": 4.7255, + "step": 48366 + }, + { + "epoch": 0.28765225045199355, + "grad_norm": 1.4546321630477905, + "learning_rate": 4.0468323316655085e-05, + "loss": 4.926, + "step": 48367 + }, + { + "epoch": 0.2876581977352745, + "grad_norm": 1.4121043682098389, + "learning_rate": 4.0467956360691375e-05, + "loss": 4.6095, + "step": 48368 + }, + { + "epoch": 0.28766414501855553, + "grad_norm": 1.8288159370422363, + "learning_rate": 4.046758939932798e-05, + "loss": 4.1125, + "step": 48369 + }, + { + "epoch": 0.28767009230183654, + "grad_norm": 1.670445203781128, + "learning_rate": 4.0467222432564986e-05, + "loss": 4.4338, + "step": 48370 + }, + { + "epoch": 0.2876760395851175, + "grad_norm": 1.5191582441329956, + "learning_rate": 4.046685546040256e-05, + "loss": 4.9304, + "step": 48371 + }, + { + "epoch": 0.2876819868683985, + "grad_norm": 1.621804118156433, + "learning_rate": 4.0466488482840815e-05, + "loss": 3.9167, + "step": 48372 + }, + { + "epoch": 0.28768793415167954, + "grad_norm": 1.583011269569397, + "learning_rate": 4.0466121499879875e-05, + "loss": 4.653, + "step": 48373 + }, + { + "epoch": 0.2876938814349605, + "grad_norm": 1.737381935119629, + "learning_rate": 4.046575451151986e-05, + "loss": 4.6888, + "step": 48374 + }, + { + "epoch": 0.2876998287182415, + "grad_norm": 1.5464602708816528, + "learning_rate": 4.046538751776092e-05, + "loss": 4.5595, + "step": 48375 + }, + { + "epoch": 0.28770577600152253, + "grad_norm": 1.6191273927688599, + "learning_rate": 4.046502051860318e-05, + "loss": 4.5835, + "step": 48376 + }, + { + "epoch": 0.2877117232848035, + "grad_norm": 1.3728690147399902, + "learning_rate": 4.046465351404675e-05, + "loss": 4.7193, + "step": 48377 + }, + { + "epoch": 0.2877176705680845, + "grad_norm": 1.308577537536621, + "learning_rate": 4.046428650409178e-05, + "loss": 4.7466, + "step": 48378 + }, + { + "epoch": 0.2877236178513655, + "grad_norm": 1.3242303133010864, + "learning_rate": 4.046391948873838e-05, + "loss": 5.2299, + "step": 48379 + }, + { + "epoch": 0.2877295651346465, + "grad_norm": 2.2107748985290527, + "learning_rate": 4.0463552467986685e-05, + "loss": 4.3387, + "step": 48380 + }, + { + "epoch": 0.2877355124179275, + "grad_norm": 2.17682147026062, + "learning_rate": 4.046318544183683e-05, + "loss": 4.2875, + "step": 48381 + }, + { + "epoch": 0.2877414597012085, + "grad_norm": 1.6661877632141113, + "learning_rate": 4.046281841028893e-05, + "loss": 5.0888, + "step": 48382 + }, + { + "epoch": 0.2877474069844895, + "grad_norm": 1.3165565729141235, + "learning_rate": 4.0462451373343114e-05, + "loss": 5.5969, + "step": 48383 + }, + { + "epoch": 0.2877533542677705, + "grad_norm": 1.670307993888855, + "learning_rate": 4.0462084330999527e-05, + "loss": 5.2619, + "step": 48384 + }, + { + "epoch": 0.2877593015510515, + "grad_norm": 1.5856081247329712, + "learning_rate": 4.046171728325828e-05, + "loss": 4.6046, + "step": 48385 + }, + { + "epoch": 0.28776524883433247, + "grad_norm": 1.5700494050979614, + "learning_rate": 4.046135023011951e-05, + "loss": 5.301, + "step": 48386 + }, + { + "epoch": 0.2877711961176135, + "grad_norm": 1.5828990936279297, + "learning_rate": 4.046098317158335e-05, + "loss": 4.9505, + "step": 48387 + }, + { + "epoch": 0.2877771434008945, + "grad_norm": 1.4790046215057373, + "learning_rate": 4.0460616107649906e-05, + "loss": 5.0291, + "step": 48388 + }, + { + "epoch": 0.28778309068417546, + "grad_norm": 1.5915039777755737, + "learning_rate": 4.0460249038319333e-05, + "loss": 5.0452, + "step": 48389 + }, + { + "epoch": 0.2877890379674565, + "grad_norm": 1.5933433771133423, + "learning_rate": 4.045988196359174e-05, + "loss": 5.0452, + "step": 48390 + }, + { + "epoch": 0.2877949852507375, + "grad_norm": 1.3320975303649902, + "learning_rate": 4.045951488346726e-05, + "loss": 4.9794, + "step": 48391 + }, + { + "epoch": 0.28780093253401845, + "grad_norm": 1.7496477365493774, + "learning_rate": 4.045914779794603e-05, + "loss": 4.0091, + "step": 48392 + }, + { + "epoch": 0.28780687981729947, + "grad_norm": 1.485085368156433, + "learning_rate": 4.0458780707028164e-05, + "loss": 5.0085, + "step": 48393 + }, + { + "epoch": 0.2878128271005805, + "grad_norm": 1.4560199975967407, + "learning_rate": 4.0458413610713805e-05, + "loss": 4.9279, + "step": 48394 + }, + { + "epoch": 0.28781877438386144, + "grad_norm": 1.555594801902771, + "learning_rate": 4.045804650900307e-05, + "loss": 4.9671, + "step": 48395 + }, + { + "epoch": 0.28782472166714246, + "grad_norm": 1.5392390489578247, + "learning_rate": 4.0457679401896086e-05, + "loss": 4.952, + "step": 48396 + }, + { + "epoch": 0.2878306689504235, + "grad_norm": 1.5950181484222412, + "learning_rate": 4.045731228939299e-05, + "loss": 4.7421, + "step": 48397 + }, + { + "epoch": 0.28783661623370443, + "grad_norm": 1.451468586921692, + "learning_rate": 4.045694517149391e-05, + "loss": 4.8604, + "step": 48398 + }, + { + "epoch": 0.28784256351698545, + "grad_norm": 1.9220938682556152, + "learning_rate": 4.045657804819897e-05, + "loss": 5.0816, + "step": 48399 + }, + { + "epoch": 0.2878485108002664, + "grad_norm": 1.5969181060791016, + "learning_rate": 4.0456210919508286e-05, + "loss": 5.1214, + "step": 48400 + }, + { + "epoch": 0.2878544580835474, + "grad_norm": 1.6720662117004395, + "learning_rate": 4.045584378542201e-05, + "loss": 4.9788, + "step": 48401 + }, + { + "epoch": 0.28786040536682844, + "grad_norm": 1.3851715326309204, + "learning_rate": 4.045547664594025e-05, + "loss": 4.8751, + "step": 48402 + }, + { + "epoch": 0.2878663526501094, + "grad_norm": 1.3679670095443726, + "learning_rate": 4.045510950106315e-05, + "loss": 4.7455, + "step": 48403 + }, + { + "epoch": 0.2878722999333904, + "grad_norm": 1.7609822750091553, + "learning_rate": 4.0454742350790834e-05, + "loss": 3.9755, + "step": 48404 + }, + { + "epoch": 0.28787824721667143, + "grad_norm": 1.520253300666809, + "learning_rate": 4.045437519512342e-05, + "loss": 4.6881, + "step": 48405 + }, + { + "epoch": 0.2878841944999524, + "grad_norm": 1.445172905921936, + "learning_rate": 4.045400803406104e-05, + "loss": 4.4948, + "step": 48406 + }, + { + "epoch": 0.2878901417832334, + "grad_norm": 1.2735581398010254, + "learning_rate": 4.045364086760384e-05, + "loss": 4.7743, + "step": 48407 + }, + { + "epoch": 0.2878960890665144, + "grad_norm": 1.5666313171386719, + "learning_rate": 4.045327369575192e-05, + "loss": 5.3713, + "step": 48408 + }, + { + "epoch": 0.2879020363497954, + "grad_norm": 1.3400638103485107, + "learning_rate": 4.045290651850543e-05, + "loss": 5.1678, + "step": 48409 + }, + { + "epoch": 0.2879079836330764, + "grad_norm": 1.5064088106155396, + "learning_rate": 4.045253933586448e-05, + "loss": 4.6706, + "step": 48410 + }, + { + "epoch": 0.2879139309163574, + "grad_norm": 1.3179607391357422, + "learning_rate": 4.045217214782921e-05, + "loss": 4.9754, + "step": 48411 + }, + { + "epoch": 0.2879198781996384, + "grad_norm": 1.2016338109970093, + "learning_rate": 4.0451804954399756e-05, + "loss": 4.9795, + "step": 48412 + }, + { + "epoch": 0.2879258254829194, + "grad_norm": 1.3066421747207642, + "learning_rate": 4.045143775557624e-05, + "loss": 4.8163, + "step": 48413 + }, + { + "epoch": 0.2879317727662004, + "grad_norm": 1.6910141706466675, + "learning_rate": 4.045107055135877e-05, + "loss": 5.0501, + "step": 48414 + }, + { + "epoch": 0.28793772004948137, + "grad_norm": 1.816940426826477, + "learning_rate": 4.0450703341747495e-05, + "loss": 4.1972, + "step": 48415 + }, + { + "epoch": 0.2879436673327624, + "grad_norm": 1.4674605131149292, + "learning_rate": 4.045033612674255e-05, + "loss": 4.7159, + "step": 48416 + }, + { + "epoch": 0.2879496146160434, + "grad_norm": 1.4584875106811523, + "learning_rate": 4.044996890634405e-05, + "loss": 4.702, + "step": 48417 + }, + { + "epoch": 0.28795556189932436, + "grad_norm": 1.4943853616714478, + "learning_rate": 4.044960168055212e-05, + "loss": 4.6541, + "step": 48418 + }, + { + "epoch": 0.2879615091826054, + "grad_norm": 1.2919809818267822, + "learning_rate": 4.044923444936689e-05, + "loss": 4.5951, + "step": 48419 + }, + { + "epoch": 0.2879674564658864, + "grad_norm": 1.228176474571228, + "learning_rate": 4.04488672127885e-05, + "loss": 4.688, + "step": 48420 + }, + { + "epoch": 0.28797340374916736, + "grad_norm": 1.4245563745498657, + "learning_rate": 4.044849997081707e-05, + "loss": 5.0124, + "step": 48421 + }, + { + "epoch": 0.28797935103244837, + "grad_norm": 1.3918020725250244, + "learning_rate": 4.044813272345272e-05, + "loss": 4.3683, + "step": 48422 + }, + { + "epoch": 0.2879852983157294, + "grad_norm": 1.5020198822021484, + "learning_rate": 4.0447765470695586e-05, + "loss": 5.0977, + "step": 48423 + }, + { + "epoch": 0.28799124559901035, + "grad_norm": 1.3810712099075317, + "learning_rate": 4.0447398212545804e-05, + "loss": 5.0239, + "step": 48424 + }, + { + "epoch": 0.28799719288229136, + "grad_norm": 1.4143033027648926, + "learning_rate": 4.044703094900349e-05, + "loss": 4.8079, + "step": 48425 + }, + { + "epoch": 0.2880031401655724, + "grad_norm": 1.4393515586853027, + "learning_rate": 4.0446663680068786e-05, + "loss": 4.8917, + "step": 48426 + }, + { + "epoch": 0.28800908744885334, + "grad_norm": 1.8052327632904053, + "learning_rate": 4.04462964057418e-05, + "loss": 4.4808, + "step": 48427 + }, + { + "epoch": 0.28801503473213436, + "grad_norm": 1.639722466468811, + "learning_rate": 4.044592912602268e-05, + "loss": 4.3116, + "step": 48428 + }, + { + "epoch": 0.28802098201541537, + "grad_norm": 1.430612564086914, + "learning_rate": 4.044556184091154e-05, + "loss": 4.8082, + "step": 48429 + }, + { + "epoch": 0.28802692929869633, + "grad_norm": 1.475433111190796, + "learning_rate": 4.0445194550408515e-05, + "loss": 4.6554, + "step": 48430 + }, + { + "epoch": 0.28803287658197735, + "grad_norm": 1.308881402015686, + "learning_rate": 4.044482725451374e-05, + "loss": 4.7294, + "step": 48431 + }, + { + "epoch": 0.28803882386525836, + "grad_norm": 1.2651715278625488, + "learning_rate": 4.044445995322733e-05, + "loss": 4.7782, + "step": 48432 + }, + { + "epoch": 0.2880447711485393, + "grad_norm": 1.4553437232971191, + "learning_rate": 4.0444092646549416e-05, + "loss": 4.6437, + "step": 48433 + }, + { + "epoch": 0.28805071843182034, + "grad_norm": 1.3703798055648804, + "learning_rate": 4.044372533448013e-05, + "loss": 4.8123, + "step": 48434 + }, + { + "epoch": 0.28805666571510136, + "grad_norm": 1.4599709510803223, + "learning_rate": 4.0443358017019594e-05, + "loss": 4.6214, + "step": 48435 + }, + { + "epoch": 0.2880626129983823, + "grad_norm": 1.43120276927948, + "learning_rate": 4.044299069416795e-05, + "loss": 4.8145, + "step": 48436 + }, + { + "epoch": 0.28806856028166333, + "grad_norm": 1.5854673385620117, + "learning_rate": 4.0442623365925315e-05, + "loss": 4.7604, + "step": 48437 + }, + { + "epoch": 0.28807450756494435, + "grad_norm": 1.3502123355865479, + "learning_rate": 4.0442256032291813e-05, + "loss": 4.557, + "step": 48438 + }, + { + "epoch": 0.2880804548482253, + "grad_norm": 1.5399248600006104, + "learning_rate": 4.044188869326759e-05, + "loss": 4.4969, + "step": 48439 + }, + { + "epoch": 0.2880864021315063, + "grad_norm": 1.8190667629241943, + "learning_rate": 4.0441521348852755e-05, + "loss": 4.0771, + "step": 48440 + }, + { + "epoch": 0.28809234941478734, + "grad_norm": 1.4041978120803833, + "learning_rate": 4.044115399904745e-05, + "loss": 4.6215, + "step": 48441 + }, + { + "epoch": 0.2880982966980683, + "grad_norm": 1.483496904373169, + "learning_rate": 4.044078664385179e-05, + "loss": 4.5841, + "step": 48442 + }, + { + "epoch": 0.2881042439813493, + "grad_norm": 1.4129700660705566, + "learning_rate": 4.0440419283265916e-05, + "loss": 4.6289, + "step": 48443 + }, + { + "epoch": 0.28811019126463033, + "grad_norm": 1.5023226737976074, + "learning_rate": 4.044005191728996e-05, + "loss": 4.6585, + "step": 48444 + }, + { + "epoch": 0.2881161385479113, + "grad_norm": 1.399121880531311, + "learning_rate": 4.0439684545924025e-05, + "loss": 4.5373, + "step": 48445 + }, + { + "epoch": 0.2881220858311923, + "grad_norm": 1.4318366050720215, + "learning_rate": 4.0439317169168266e-05, + "loss": 4.6671, + "step": 48446 + }, + { + "epoch": 0.2881280331144733, + "grad_norm": 1.5023772716522217, + "learning_rate": 4.0438949787022795e-05, + "loss": 4.7541, + "step": 48447 + }, + { + "epoch": 0.2881339803977543, + "grad_norm": 1.4889634847640991, + "learning_rate": 4.043858239948775e-05, + "loss": 4.525, + "step": 48448 + }, + { + "epoch": 0.2881399276810353, + "grad_norm": 1.4255242347717285, + "learning_rate": 4.043821500656326e-05, + "loss": 4.5368, + "step": 48449 + }, + { + "epoch": 0.2881458749643163, + "grad_norm": 1.61469304561615, + "learning_rate": 4.043784760824944e-05, + "loss": 4.7103, + "step": 48450 + }, + { + "epoch": 0.2881518222475973, + "grad_norm": 1.8214889764785767, + "learning_rate": 4.043748020454643e-05, + "loss": 3.9519, + "step": 48451 + }, + { + "epoch": 0.2881577695308783, + "grad_norm": 1.497307538986206, + "learning_rate": 4.043711279545436e-05, + "loss": 4.4738, + "step": 48452 + }, + { + "epoch": 0.2881637168141593, + "grad_norm": 1.4986485242843628, + "learning_rate": 4.043674538097334e-05, + "loss": 4.6168, + "step": 48453 + }, + { + "epoch": 0.28816966409744027, + "grad_norm": 1.3574696779251099, + "learning_rate": 4.043637796110352e-05, + "loss": 4.5271, + "step": 48454 + }, + { + "epoch": 0.2881756113807213, + "grad_norm": 1.3591176271438599, + "learning_rate": 4.043601053584503e-05, + "loss": 4.4957, + "step": 48455 + }, + { + "epoch": 0.2881815586640023, + "grad_norm": 1.5331695079803467, + "learning_rate": 4.0435643105197976e-05, + "loss": 4.5409, + "step": 48456 + }, + { + "epoch": 0.28818750594728326, + "grad_norm": 1.9021350145339966, + "learning_rate": 4.04352756691625e-05, + "loss": 4.7375, + "step": 48457 + }, + { + "epoch": 0.2881934532305643, + "grad_norm": 1.611894965171814, + "learning_rate": 4.043490822773873e-05, + "loss": 5.019, + "step": 48458 + }, + { + "epoch": 0.2881994005138453, + "grad_norm": 1.5025843381881714, + "learning_rate": 4.0434540780926786e-05, + "loss": 4.6376, + "step": 48459 + }, + { + "epoch": 0.28820534779712625, + "grad_norm": 1.4834502935409546, + "learning_rate": 4.043417332872681e-05, + "loss": 4.4407, + "step": 48460 + }, + { + "epoch": 0.28821129508040727, + "grad_norm": 1.4962470531463623, + "learning_rate": 4.043380587113893e-05, + "loss": 4.3417, + "step": 48461 + }, + { + "epoch": 0.2882172423636883, + "grad_norm": 2.0504634380340576, + "learning_rate": 4.043343840816326e-05, + "loss": 4.3537, + "step": 48462 + }, + { + "epoch": 0.28822318964696925, + "grad_norm": 1.5904769897460938, + "learning_rate": 4.043307093979993e-05, + "loss": 4.5751, + "step": 48463 + }, + { + "epoch": 0.28822913693025026, + "grad_norm": 1.6363729238510132, + "learning_rate": 4.0432703466049086e-05, + "loss": 4.978, + "step": 48464 + }, + { + "epoch": 0.2882350842135313, + "grad_norm": 1.5435415506362915, + "learning_rate": 4.043233598691084e-05, + "loss": 4.8752, + "step": 48465 + }, + { + "epoch": 0.28824103149681224, + "grad_norm": 1.718302845954895, + "learning_rate": 4.0431968502385325e-05, + "loss": 4.8366, + "step": 48466 + }, + { + "epoch": 0.28824697878009325, + "grad_norm": 1.9553606510162354, + "learning_rate": 4.043160101247267e-05, + "loss": 4.5948, + "step": 48467 + }, + { + "epoch": 0.28825292606337427, + "grad_norm": 1.7707394361495972, + "learning_rate": 4.0431233517173005e-05, + "loss": 4.3244, + "step": 48468 + }, + { + "epoch": 0.28825887334665523, + "grad_norm": 1.9455881118774414, + "learning_rate": 4.0430866016486446e-05, + "loss": 3.3139, + "step": 48469 + }, + { + "epoch": 0.28826482062993625, + "grad_norm": 1.6029179096221924, + "learning_rate": 4.043049851041314e-05, + "loss": 3.5383, + "step": 48470 + }, + { + "epoch": 0.28827076791321726, + "grad_norm": 1.2766669988632202, + "learning_rate": 4.0430130998953205e-05, + "loss": 4.3457, + "step": 48471 + }, + { + "epoch": 0.2882767151964982, + "grad_norm": 1.6281859874725342, + "learning_rate": 4.0429763482106775e-05, + "loss": 4.8239, + "step": 48472 + }, + { + "epoch": 0.28828266247977924, + "grad_norm": 1.6766455173492432, + "learning_rate": 4.042939595987397e-05, + "loss": 4.7894, + "step": 48473 + }, + { + "epoch": 0.28828860976306026, + "grad_norm": 1.910873293876648, + "learning_rate": 4.042902843225492e-05, + "loss": 4.3826, + "step": 48474 + }, + { + "epoch": 0.2882945570463412, + "grad_norm": 1.8023897409439087, + "learning_rate": 4.0428660899249756e-05, + "loss": 3.135, + "step": 48475 + }, + { + "epoch": 0.28830050432962223, + "grad_norm": 1.6924690008163452, + "learning_rate": 4.04282933608586e-05, + "loss": 3.0937, + "step": 48476 + }, + { + "epoch": 0.28830645161290325, + "grad_norm": 1.717442274093628, + "learning_rate": 4.04279258170816e-05, + "loss": 3.1016, + "step": 48477 + }, + { + "epoch": 0.2883123988961842, + "grad_norm": 1.9212316274642944, + "learning_rate": 4.042755826791886e-05, + "loss": 3.3619, + "step": 48478 + }, + { + "epoch": 0.2883183461794652, + "grad_norm": 1.979346513748169, + "learning_rate": 4.042719071337052e-05, + "loss": 3.3914, + "step": 48479 + }, + { + "epoch": 0.28832429346274624, + "grad_norm": 1.722819209098816, + "learning_rate": 4.042682315343671e-05, + "loss": 3.289, + "step": 48480 + }, + { + "epoch": 0.2883302407460272, + "grad_norm": 1.6823031902313232, + "learning_rate": 4.0426455588117554e-05, + "loss": 3.2574, + "step": 48481 + }, + { + "epoch": 0.2883361880293082, + "grad_norm": 1.773632287979126, + "learning_rate": 4.042608801741319e-05, + "loss": 3.1671, + "step": 48482 + }, + { + "epoch": 0.28834213531258923, + "grad_norm": 1.9455041885375977, + "learning_rate": 4.0425720441323725e-05, + "loss": 3.6583, + "step": 48483 + }, + { + "epoch": 0.2883480825958702, + "grad_norm": 2.218496084213257, + "learning_rate": 4.042535285984931e-05, + "loss": 3.9826, + "step": 48484 + }, + { + "epoch": 0.2883540298791512, + "grad_norm": 1.5861988067626953, + "learning_rate": 4.042498527299006e-05, + "loss": 3.4748, + "step": 48485 + }, + { + "epoch": 0.2883599771624322, + "grad_norm": 1.620113492012024, + "learning_rate": 4.04246176807461e-05, + "loss": 3.4654, + "step": 48486 + }, + { + "epoch": 0.2883659244457132, + "grad_norm": 1.920149564743042, + "learning_rate": 4.0424250083117576e-05, + "loss": 3.1782, + "step": 48487 + }, + { + "epoch": 0.2883718717289942, + "grad_norm": 1.6501599550247192, + "learning_rate": 4.0423882480104605e-05, + "loss": 3.2949, + "step": 48488 + }, + { + "epoch": 0.2883778190122752, + "grad_norm": 1.9300236701965332, + "learning_rate": 4.042351487170731e-05, + "loss": 2.7904, + "step": 48489 + }, + { + "epoch": 0.2883837662955562, + "grad_norm": 1.8765393495559692, + "learning_rate": 4.0423147257925826e-05, + "loss": 3.4052, + "step": 48490 + }, + { + "epoch": 0.2883897135788372, + "grad_norm": 1.751577377319336, + "learning_rate": 4.042277963876029e-05, + "loss": 3.2597, + "step": 48491 + }, + { + "epoch": 0.2883956608621182, + "grad_norm": 1.5857847929000854, + "learning_rate": 4.042241201421081e-05, + "loss": 3.347, + "step": 48492 + }, + { + "epoch": 0.28840160814539917, + "grad_norm": 1.5838154554367065, + "learning_rate": 4.0422044384277524e-05, + "loss": 3.2193, + "step": 48493 + }, + { + "epoch": 0.2884075554286802, + "grad_norm": 1.8530023097991943, + "learning_rate": 4.0421676748960566e-05, + "loss": 3.3057, + "step": 48494 + }, + { + "epoch": 0.2884135027119612, + "grad_norm": 1.7914763689041138, + "learning_rate": 4.0421309108260064e-05, + "loss": 2.9352, + "step": 48495 + }, + { + "epoch": 0.28841944999524216, + "grad_norm": 1.6771435737609863, + "learning_rate": 4.042094146217614e-05, + "loss": 3.1357, + "step": 48496 + }, + { + "epoch": 0.2884253972785232, + "grad_norm": 1.846331238746643, + "learning_rate": 4.042057381070892e-05, + "loss": 2.9332, + "step": 48497 + }, + { + "epoch": 0.2884313445618042, + "grad_norm": 2.1371967792510986, + "learning_rate": 4.042020615385854e-05, + "loss": 3.6979, + "step": 48498 + }, + { + "epoch": 0.28843729184508515, + "grad_norm": 1.936006784439087, + "learning_rate": 4.0419838491625125e-05, + "loss": 4.2687, + "step": 48499 + }, + { + "epoch": 0.28844323912836617, + "grad_norm": 1.9521796703338623, + "learning_rate": 4.041947082400881e-05, + "loss": 4.3587, + "step": 48500 + }, + { + "epoch": 0.2884491864116472, + "grad_norm": 1.8276675939559937, + "learning_rate": 4.041910315100971e-05, + "loss": 4.5895, + "step": 48501 + }, + { + "epoch": 0.28845513369492815, + "grad_norm": 1.6901572942733765, + "learning_rate": 4.0418735472627954e-05, + "loss": 4.1528, + "step": 48502 + }, + { + "epoch": 0.28846108097820916, + "grad_norm": 1.8915783166885376, + "learning_rate": 4.0418367788863694e-05, + "loss": 2.9486, + "step": 48503 + }, + { + "epoch": 0.2884670282614902, + "grad_norm": 1.6540932655334473, + "learning_rate": 4.0418000099717027e-05, + "loss": 4.5802, + "step": 48504 + }, + { + "epoch": 0.28847297554477114, + "grad_norm": 1.6581683158874512, + "learning_rate": 4.04176324051881e-05, + "loss": 4.5937, + "step": 48505 + }, + { + "epoch": 0.28847892282805215, + "grad_norm": 1.6079753637313843, + "learning_rate": 4.0417264705277035e-05, + "loss": 4.5164, + "step": 48506 + }, + { + "epoch": 0.28848487011133317, + "grad_norm": 1.5954266786575317, + "learning_rate": 4.041689699998397e-05, + "loss": 4.4486, + "step": 48507 + }, + { + "epoch": 0.28849081739461413, + "grad_norm": 1.3504393100738525, + "learning_rate": 4.041652928930901e-05, + "loss": 4.6465, + "step": 48508 + }, + { + "epoch": 0.28849676467789515, + "grad_norm": 1.4306247234344482, + "learning_rate": 4.0416161573252316e-05, + "loss": 4.4138, + "step": 48509 + }, + { + "epoch": 0.28850271196117616, + "grad_norm": 2.2908880710601807, + "learning_rate": 4.041579385181399e-05, + "loss": 3.7894, + "step": 48510 + }, + { + "epoch": 0.2885086592444571, + "grad_norm": 1.7866730690002441, + "learning_rate": 4.041542612499417e-05, + "loss": 4.4833, + "step": 48511 + }, + { + "epoch": 0.28851460652773814, + "grad_norm": 1.4029203653335571, + "learning_rate": 4.0415058392792987e-05, + "loss": 4.4642, + "step": 48512 + }, + { + "epoch": 0.28852055381101915, + "grad_norm": 1.5664088726043701, + "learning_rate": 4.041469065521056e-05, + "loss": 4.4914, + "step": 48513 + }, + { + "epoch": 0.2885265010943001, + "grad_norm": 1.4743403196334839, + "learning_rate": 4.041432291224703e-05, + "loss": 4.5711, + "step": 48514 + }, + { + "epoch": 0.28853244837758113, + "grad_norm": 1.4379040002822876, + "learning_rate": 4.041395516390252e-05, + "loss": 4.4975, + "step": 48515 + }, + { + "epoch": 0.28853839566086215, + "grad_norm": 1.675445318222046, + "learning_rate": 4.0413587410177155e-05, + "loss": 4.5967, + "step": 48516 + }, + { + "epoch": 0.2885443429441431, + "grad_norm": 1.8142551183700562, + "learning_rate": 4.041321965107107e-05, + "loss": 4.4273, + "step": 48517 + }, + { + "epoch": 0.2885502902274241, + "grad_norm": 1.744807481765747, + "learning_rate": 4.041285188658438e-05, + "loss": 4.5738, + "step": 48518 + }, + { + "epoch": 0.2885562375107051, + "grad_norm": 1.7530431747436523, + "learning_rate": 4.041248411671723e-05, + "loss": 4.6958, + "step": 48519 + }, + { + "epoch": 0.2885621847939861, + "grad_norm": 1.681649923324585, + "learning_rate": 4.041211634146974e-05, + "loss": 4.6286, + "step": 48520 + }, + { + "epoch": 0.2885681320772671, + "grad_norm": 2.0077648162841797, + "learning_rate": 4.041174856084204e-05, + "loss": 4.3197, + "step": 48521 + }, + { + "epoch": 0.2885740793605481, + "grad_norm": 1.6381642818450928, + "learning_rate": 4.0411380774834254e-05, + "loss": 4.2779, + "step": 48522 + }, + { + "epoch": 0.2885800266438291, + "grad_norm": 1.5274027585983276, + "learning_rate": 4.0411012983446515e-05, + "loss": 4.5392, + "step": 48523 + }, + { + "epoch": 0.2885859739271101, + "grad_norm": 1.6546586751937866, + "learning_rate": 4.041064518667895e-05, + "loss": 4.3048, + "step": 48524 + }, + { + "epoch": 0.28859192121039107, + "grad_norm": 1.514075756072998, + "learning_rate": 4.041027738453169e-05, + "loss": 4.6938, + "step": 48525 + }, + { + "epoch": 0.2885978684936721, + "grad_norm": 1.7444247007369995, + "learning_rate": 4.0409909577004867e-05, + "loss": 4.5282, + "step": 48526 + }, + { + "epoch": 0.2886038157769531, + "grad_norm": 1.4996081590652466, + "learning_rate": 4.040954176409859e-05, + "loss": 4.5453, + "step": 48527 + }, + { + "epoch": 0.28860976306023406, + "grad_norm": 1.449715256690979, + "learning_rate": 4.040917394581302e-05, + "loss": 4.8562, + "step": 48528 + }, + { + "epoch": 0.2886157103435151, + "grad_norm": 1.4560139179229736, + "learning_rate": 4.040880612214825e-05, + "loss": 4.5955, + "step": 48529 + }, + { + "epoch": 0.2886216576267961, + "grad_norm": 1.4538536071777344, + "learning_rate": 4.0408438293104437e-05, + "loss": 4.2203, + "step": 48530 + }, + { + "epoch": 0.28862760491007705, + "grad_norm": 1.4881322383880615, + "learning_rate": 4.0408070458681685e-05, + "loss": 4.5426, + "step": 48531 + }, + { + "epoch": 0.28863355219335807, + "grad_norm": 1.375635027885437, + "learning_rate": 4.040770261888014e-05, + "loss": 5.0403, + "step": 48532 + }, + { + "epoch": 0.2886394994766391, + "grad_norm": 1.965229868888855, + "learning_rate": 4.0407334773699926e-05, + "loss": 3.9597, + "step": 48533 + }, + { + "epoch": 0.28864544675992004, + "grad_norm": 1.5878567695617676, + "learning_rate": 4.040696692314117e-05, + "loss": 4.551, + "step": 48534 + }, + { + "epoch": 0.28865139404320106, + "grad_norm": 1.6014858484268188, + "learning_rate": 4.040659906720401e-05, + "loss": 4.3702, + "step": 48535 + }, + { + "epoch": 0.2886573413264821, + "grad_norm": 1.6376333236694336, + "learning_rate": 4.0406231205888554e-05, + "loss": 4.5052, + "step": 48536 + }, + { + "epoch": 0.28866328860976304, + "grad_norm": 1.4894742965698242, + "learning_rate": 4.040586333919494e-05, + "loss": 4.5805, + "step": 48537 + }, + { + "epoch": 0.28866923589304405, + "grad_norm": 1.6042909622192383, + "learning_rate": 4.040549546712331e-05, + "loss": 4.3987, + "step": 48538 + }, + { + "epoch": 0.28867518317632507, + "grad_norm": 1.310497522354126, + "learning_rate": 4.040512758967377e-05, + "loss": 5.0911, + "step": 48539 + }, + { + "epoch": 0.28868113045960603, + "grad_norm": 1.4044091701507568, + "learning_rate": 4.0404759706846465e-05, + "loss": 4.688, + "step": 48540 + }, + { + "epoch": 0.28868707774288704, + "grad_norm": 1.3676711320877075, + "learning_rate": 4.040439181864151e-05, + "loss": 4.8102, + "step": 48541 + }, + { + "epoch": 0.28869302502616806, + "grad_norm": 1.412393569946289, + "learning_rate": 4.040402392505905e-05, + "loss": 4.7051, + "step": 48542 + }, + { + "epoch": 0.288698972309449, + "grad_norm": 1.6966195106506348, + "learning_rate": 4.04036560260992e-05, + "loss": 4.7487, + "step": 48543 + }, + { + "epoch": 0.28870491959273004, + "grad_norm": 2.122330904006958, + "learning_rate": 4.04032881217621e-05, + "loss": 3.3536, + "step": 48544 + }, + { + "epoch": 0.28871086687601105, + "grad_norm": 2.774585008621216, + "learning_rate": 4.0402920212047855e-05, + "loss": 1.5503, + "step": 48545 + }, + { + "epoch": 0.288716814159292, + "grad_norm": 1.7013927698135376, + "learning_rate": 4.0402552296956626e-05, + "loss": 4.4431, + "step": 48546 + }, + { + "epoch": 0.28872276144257303, + "grad_norm": 2.5735442638397217, + "learning_rate": 4.0402184376488514e-05, + "loss": 1.329, + "step": 48547 + }, + { + "epoch": 0.28872870872585404, + "grad_norm": 2.5443317890167236, + "learning_rate": 4.040181645064367e-05, + "loss": 1.3512, + "step": 48548 + }, + { + "epoch": 0.288734656009135, + "grad_norm": 1.9763331413269043, + "learning_rate": 4.04014485194222e-05, + "loss": 3.6037, + "step": 48549 + }, + { + "epoch": 0.288740603292416, + "grad_norm": 1.5573545694351196, + "learning_rate": 4.0401080582824256e-05, + "loss": 4.8586, + "step": 48550 + }, + { + "epoch": 0.28874655057569704, + "grad_norm": 2.3678064346313477, + "learning_rate": 4.0400712640849936e-05, + "loss": 1.6307, + "step": 48551 + }, + { + "epoch": 0.288752497858978, + "grad_norm": 2.6183395385742188, + "learning_rate": 4.0400344693499397e-05, + "loss": 1.0375, + "step": 48552 + }, + { + "epoch": 0.288758445142259, + "grad_norm": 2.6870524883270264, + "learning_rate": 4.039997674077276e-05, + "loss": 1.3079, + "step": 48553 + }, + { + "epoch": 0.28876439242554003, + "grad_norm": 2.8005669116973877, + "learning_rate": 4.0399608782670146e-05, + "loss": 1.283, + "step": 48554 + }, + { + "epoch": 0.288770339708821, + "grad_norm": 2.9511680603027344, + "learning_rate": 4.0399240819191686e-05, + "loss": 1.2284, + "step": 48555 + }, + { + "epoch": 0.288776286992102, + "grad_norm": 2.523969888687134, + "learning_rate": 4.039887285033751e-05, + "loss": 2.6528, + "step": 48556 + }, + { + "epoch": 0.288782234275383, + "grad_norm": 1.9547415971755981, + "learning_rate": 4.0398504876107753e-05, + "loss": 4.5026, + "step": 48557 + }, + { + "epoch": 0.288788181558664, + "grad_norm": 2.2130770683288574, + "learning_rate": 4.0398136896502525e-05, + "loss": 3.0297, + "step": 48558 + }, + { + "epoch": 0.288794128841945, + "grad_norm": 3.074657440185547, + "learning_rate": 4.0397768911521984e-05, + "loss": 1.3502, + "step": 48559 + }, + { + "epoch": 0.288800076125226, + "grad_norm": 2.764124631881714, + "learning_rate": 4.039740092116623e-05, + "loss": 0.9371, + "step": 48560 + }, + { + "epoch": 0.288806023408507, + "grad_norm": 2.9259605407714844, + "learning_rate": 4.0397032925435395e-05, + "loss": 1.1029, + "step": 48561 + }, + { + "epoch": 0.288811970691788, + "grad_norm": 2.988206148147583, + "learning_rate": 4.039666492432963e-05, + "loss": 1.7746, + "step": 48562 + }, + { + "epoch": 0.288817917975069, + "grad_norm": 1.5689125061035156, + "learning_rate": 4.039629691784904e-05, + "loss": 4.9351, + "step": 48563 + }, + { + "epoch": 0.28882386525834997, + "grad_norm": 1.6491665840148926, + "learning_rate": 4.0395928905993766e-05, + "loss": 4.9424, + "step": 48564 + }, + { + "epoch": 0.288829812541631, + "grad_norm": 1.4097989797592163, + "learning_rate": 4.039556088876393e-05, + "loss": 4.8593, + "step": 48565 + }, + { + "epoch": 0.288835759824912, + "grad_norm": 1.2987297773361206, + "learning_rate": 4.0395192866159656e-05, + "loss": 4.7918, + "step": 48566 + }, + { + "epoch": 0.28884170710819296, + "grad_norm": 1.686295986175537, + "learning_rate": 4.039482483818109e-05, + "loss": 4.888, + "step": 48567 + }, + { + "epoch": 0.288847654391474, + "grad_norm": 2.6295835971832275, + "learning_rate": 4.0394456804828345e-05, + "loss": 3.7863, + "step": 48568 + }, + { + "epoch": 0.288853601674755, + "grad_norm": 4.325164794921875, + "learning_rate": 4.039408876610156e-05, + "loss": 3.1336, + "step": 48569 + }, + { + "epoch": 0.28885954895803595, + "grad_norm": 3.6462371349334717, + "learning_rate": 4.039372072200084e-05, + "loss": 2.9676, + "step": 48570 + }, + { + "epoch": 0.28886549624131697, + "grad_norm": 2.7218523025512695, + "learning_rate": 4.039335267252635e-05, + "loss": 3.3606, + "step": 48571 + }, + { + "epoch": 0.288871443524598, + "grad_norm": 1.5817080736160278, + "learning_rate": 4.039298461767819e-05, + "loss": 4.7298, + "step": 48572 + }, + { + "epoch": 0.28887739080787894, + "grad_norm": 1.8308509588241577, + "learning_rate": 4.03926165574565e-05, + "loss": 4.5668, + "step": 48573 + }, + { + "epoch": 0.28888333809115996, + "grad_norm": 2.7297050952911377, + "learning_rate": 4.0392248491861415e-05, + "loss": 2.5273, + "step": 48574 + }, + { + "epoch": 0.288889285374441, + "grad_norm": 3.0482287406921387, + "learning_rate": 4.0391880420893044e-05, + "loss": 2.3041, + "step": 48575 + }, + { + "epoch": 0.28889523265772193, + "grad_norm": 2.748927354812622, + "learning_rate": 4.039151234455153e-05, + "loss": 2.6324, + "step": 48576 + }, + { + "epoch": 0.28890117994100295, + "grad_norm": 2.991065502166748, + "learning_rate": 4.0391144262837e-05, + "loss": 2.3199, + "step": 48577 + }, + { + "epoch": 0.28890712722428397, + "grad_norm": 2.8635358810424805, + "learning_rate": 4.039077617574958e-05, + "loss": 2.709, + "step": 48578 + }, + { + "epoch": 0.2889130745075649, + "grad_norm": 2.462613344192505, + "learning_rate": 4.0390408083289394e-05, + "loss": 3.4806, + "step": 48579 + }, + { + "epoch": 0.28891902179084594, + "grad_norm": 2.741651773452759, + "learning_rate": 4.039003998545659e-05, + "loss": 1.6434, + "step": 48580 + }, + { + "epoch": 0.28892496907412696, + "grad_norm": 2.7177538871765137, + "learning_rate": 4.038967188225126e-05, + "loss": 1.9209, + "step": 48581 + }, + { + "epoch": 0.2889309163574079, + "grad_norm": 2.7705039978027344, + "learning_rate": 4.038930377367357e-05, + "loss": 1.8282, + "step": 48582 + }, + { + "epoch": 0.28893686364068893, + "grad_norm": 2.8833489418029785, + "learning_rate": 4.038893565972362e-05, + "loss": 1.702, + "step": 48583 + }, + { + "epoch": 0.28894281092396995, + "grad_norm": 2.9600822925567627, + "learning_rate": 4.038856754040157e-05, + "loss": 1.1103, + "step": 48584 + }, + { + "epoch": 0.2889487582072509, + "grad_norm": 2.451906442642212, + "learning_rate": 4.038819941570752e-05, + "loss": 2.4058, + "step": 48585 + }, + { + "epoch": 0.2889547054905319, + "grad_norm": 2.798917293548584, + "learning_rate": 4.038783128564161e-05, + "loss": 3.0031, + "step": 48586 + }, + { + "epoch": 0.28896065277381294, + "grad_norm": 2.6230173110961914, + "learning_rate": 4.0387463150203964e-05, + "loss": 2.9583, + "step": 48587 + }, + { + "epoch": 0.2889666000570939, + "grad_norm": 2.211513042449951, + "learning_rate": 4.038709500939472e-05, + "loss": 3.0219, + "step": 48588 + }, + { + "epoch": 0.2889725473403749, + "grad_norm": 2.537851333618164, + "learning_rate": 4.0386726863213996e-05, + "loss": 3.0806, + "step": 48589 + }, + { + "epoch": 0.28897849462365593, + "grad_norm": 2.4352314472198486, + "learning_rate": 4.038635871166192e-05, + "loss": 1.3549, + "step": 48590 + }, + { + "epoch": 0.2889844419069369, + "grad_norm": 2.187575101852417, + "learning_rate": 4.038599055473863e-05, + "loss": 3.3514, + "step": 48591 + }, + { + "epoch": 0.2889903891902179, + "grad_norm": 2.6154446601867676, + "learning_rate": 4.038562239244426e-05, + "loss": 2.7859, + "step": 48592 + }, + { + "epoch": 0.2889963364734989, + "grad_norm": 2.636270523071289, + "learning_rate": 4.0385254224778904e-05, + "loss": 2.745, + "step": 48593 + }, + { + "epoch": 0.2890022837567799, + "grad_norm": 2.5820837020874023, + "learning_rate": 4.038488605174273e-05, + "loss": 2.8681, + "step": 48594 + }, + { + "epoch": 0.2890082310400609, + "grad_norm": 2.3961822986602783, + "learning_rate": 4.0384517873335854e-05, + "loss": 2.7089, + "step": 48595 + }, + { + "epoch": 0.2890141783233419, + "grad_norm": 2.2485404014587402, + "learning_rate": 4.0384149689558395e-05, + "loss": 3.0545, + "step": 48596 + }, + { + "epoch": 0.2890201256066229, + "grad_norm": 1.962079405784607, + "learning_rate": 4.038378150041049e-05, + "loss": 4.7205, + "step": 48597 + }, + { + "epoch": 0.2890260728899039, + "grad_norm": 2.194265842437744, + "learning_rate": 4.0383413305892274e-05, + "loss": 3.2, + "step": 48598 + }, + { + "epoch": 0.2890320201731849, + "grad_norm": 1.929840087890625, + "learning_rate": 4.0383045106003855e-05, + "loss": 3.7757, + "step": 48599 + }, + { + "epoch": 0.28903796745646587, + "grad_norm": 2.4741013050079346, + "learning_rate": 4.0382676900745376e-05, + "loss": 3.0113, + "step": 48600 + }, + { + "epoch": 0.2890439147397469, + "grad_norm": 2.368039131164551, + "learning_rate": 4.0382308690116966e-05, + "loss": 2.9122, + "step": 48601 + }, + { + "epoch": 0.2890498620230279, + "grad_norm": 1.9593697786331177, + "learning_rate": 4.0381940474118754e-05, + "loss": 3.5853, + "step": 48602 + }, + { + "epoch": 0.28905580930630886, + "grad_norm": 1.5807209014892578, + "learning_rate": 4.0381572252750854e-05, + "loss": 5.0002, + "step": 48603 + }, + { + "epoch": 0.2890617565895899, + "grad_norm": 1.4511644840240479, + "learning_rate": 4.038120402601342e-05, + "loss": 4.6785, + "step": 48604 + }, + { + "epoch": 0.2890677038728709, + "grad_norm": 1.382346510887146, + "learning_rate": 4.038083579390656e-05, + "loss": 5.0525, + "step": 48605 + }, + { + "epoch": 0.28907365115615186, + "grad_norm": 1.3840287923812866, + "learning_rate": 4.038046755643041e-05, + "loss": 4.5612, + "step": 48606 + }, + { + "epoch": 0.28907959843943287, + "grad_norm": 1.7086725234985352, + "learning_rate": 4.03800993135851e-05, + "loss": 4.3197, + "step": 48607 + }, + { + "epoch": 0.2890855457227139, + "grad_norm": 1.490468144416809, + "learning_rate": 4.037973106537075e-05, + "loss": 4.7499, + "step": 48608 + }, + { + "epoch": 0.28909149300599485, + "grad_norm": 1.515181303024292, + "learning_rate": 4.0379362811787504e-05, + "loss": 4.5164, + "step": 48609 + }, + { + "epoch": 0.28909744028927586, + "grad_norm": 1.4651373624801636, + "learning_rate": 4.037899455283548e-05, + "loss": 4.7233, + "step": 48610 + }, + { + "epoch": 0.2891033875725569, + "grad_norm": 1.5705190896987915, + "learning_rate": 4.0378626288514795e-05, + "loss": 4.3437, + "step": 48611 + }, + { + "epoch": 0.28910933485583784, + "grad_norm": 1.5533841848373413, + "learning_rate": 4.03782580188256e-05, + "loss": 4.7142, + "step": 48612 + }, + { + "epoch": 0.28911528213911886, + "grad_norm": 1.5699795484542847, + "learning_rate": 4.037788974376802e-05, + "loss": 4.3709, + "step": 48613 + }, + { + "epoch": 0.2891212294223999, + "grad_norm": 1.765863299369812, + "learning_rate": 4.037752146334217e-05, + "loss": 4.012, + "step": 48614 + }, + { + "epoch": 0.28912717670568083, + "grad_norm": 2.053020477294922, + "learning_rate": 4.037715317754819e-05, + "loss": 3.4699, + "step": 48615 + }, + { + "epoch": 0.28913312398896185, + "grad_norm": 1.4990036487579346, + "learning_rate": 4.03767848863862e-05, + "loss": 5.0417, + "step": 48616 + }, + { + "epoch": 0.28913907127224286, + "grad_norm": 1.5021536350250244, + "learning_rate": 4.0376416589856335e-05, + "loss": 4.7859, + "step": 48617 + }, + { + "epoch": 0.2891450185555238, + "grad_norm": 1.8204234838485718, + "learning_rate": 4.037604828795872e-05, + "loss": 4.4247, + "step": 48618 + }, + { + "epoch": 0.28915096583880484, + "grad_norm": 1.6654990911483765, + "learning_rate": 4.0375679980693496e-05, + "loss": 4.3917, + "step": 48619 + }, + { + "epoch": 0.28915691312208586, + "grad_norm": 1.5822808742523193, + "learning_rate": 4.0375311668060763e-05, + "loss": 4.1171, + "step": 48620 + }, + { + "epoch": 0.2891628604053668, + "grad_norm": 2.6426868438720703, + "learning_rate": 4.037494335006068e-05, + "loss": 2.8652, + "step": 48621 + }, + { + "epoch": 0.28916880768864783, + "grad_norm": 2.424532413482666, + "learning_rate": 4.037457502669336e-05, + "loss": 2.8147, + "step": 48622 + }, + { + "epoch": 0.28917475497192885, + "grad_norm": 2.372062921524048, + "learning_rate": 4.037420669795893e-05, + "loss": 2.8713, + "step": 48623 + }, + { + "epoch": 0.2891807022552098, + "grad_norm": 2.3130416870117188, + "learning_rate": 4.037383836385753e-05, + "loss": 2.7492, + "step": 48624 + }, + { + "epoch": 0.2891866495384908, + "grad_norm": 2.2616443634033203, + "learning_rate": 4.0373470024389284e-05, + "loss": 2.9999, + "step": 48625 + }, + { + "epoch": 0.28919259682177184, + "grad_norm": 2.127082586288452, + "learning_rate": 4.037310167955432e-05, + "loss": 3.422, + "step": 48626 + }, + { + "epoch": 0.2891985441050528, + "grad_norm": 2.833974599838257, + "learning_rate": 4.0372733329352753e-05, + "loss": 3.8221, + "step": 48627 + }, + { + "epoch": 0.2892044913883338, + "grad_norm": 2.580775260925293, + "learning_rate": 4.037236497378473e-05, + "loss": 3.5202, + "step": 48628 + }, + { + "epoch": 0.28921043867161483, + "grad_norm": 2.0770137310028076, + "learning_rate": 4.037199661285037e-05, + "loss": 2.8301, + "step": 48629 + }, + { + "epoch": 0.2892163859548958, + "grad_norm": 2.535888433456421, + "learning_rate": 4.037162824654981e-05, + "loss": 2.4132, + "step": 48630 + }, + { + "epoch": 0.2892223332381768, + "grad_norm": 2.270362138748169, + "learning_rate": 4.037125987488317e-05, + "loss": 2.7131, + "step": 48631 + }, + { + "epoch": 0.2892282805214578, + "grad_norm": 2.289530038833618, + "learning_rate": 4.037089149785058e-05, + "loss": 2.7597, + "step": 48632 + }, + { + "epoch": 0.2892342278047388, + "grad_norm": 2.255889415740967, + "learning_rate": 4.0370523115452175e-05, + "loss": 3.7101, + "step": 48633 + }, + { + "epoch": 0.2892401750880198, + "grad_norm": 2.400806188583374, + "learning_rate": 4.037015472768808e-05, + "loss": 3.6684, + "step": 48634 + }, + { + "epoch": 0.28924612237130076, + "grad_norm": 2.4825096130371094, + "learning_rate": 4.036978633455843e-05, + "loss": 3.7117, + "step": 48635 + }, + { + "epoch": 0.2892520696545818, + "grad_norm": 2.4441802501678467, + "learning_rate": 4.0369417936063333e-05, + "loss": 2.8288, + "step": 48636 + }, + { + "epoch": 0.2892580169378628, + "grad_norm": 2.1256818771362305, + "learning_rate": 4.036904953220294e-05, + "loss": 3.9792, + "step": 48637 + }, + { + "epoch": 0.28926396422114375, + "grad_norm": 1.9464805126190186, + "learning_rate": 4.036868112297736e-05, + "loss": 4.1267, + "step": 48638 + }, + { + "epoch": 0.28926991150442477, + "grad_norm": 1.5131932497024536, + "learning_rate": 4.0368312708386736e-05, + "loss": 4.5209, + "step": 48639 + }, + { + "epoch": 0.2892758587877058, + "grad_norm": 1.9469796419143677, + "learning_rate": 4.036794428843119e-05, + "loss": 4.0379, + "step": 48640 + }, + { + "epoch": 0.28928180607098675, + "grad_norm": 2.0641238689422607, + "learning_rate": 4.036757586311086e-05, + "loss": 3.1635, + "step": 48641 + }, + { + "epoch": 0.28928775335426776, + "grad_norm": 1.6236196756362915, + "learning_rate": 4.0367207432425866e-05, + "loss": 3.9066, + "step": 48642 + }, + { + "epoch": 0.2892937006375488, + "grad_norm": 2.238697052001953, + "learning_rate": 4.036683899637635e-05, + "loss": 2.6075, + "step": 48643 + }, + { + "epoch": 0.28929964792082974, + "grad_norm": 2.3151357173919678, + "learning_rate": 4.036647055496241e-05, + "loss": 3.7198, + "step": 48644 + }, + { + "epoch": 0.28930559520411075, + "grad_norm": 2.4355568885803223, + "learning_rate": 4.0366102108184205e-05, + "loss": 3.4637, + "step": 48645 + }, + { + "epoch": 0.28931154248739177, + "grad_norm": 2.272747755050659, + "learning_rate": 4.0365733656041847e-05, + "loss": 4.0194, + "step": 48646 + }, + { + "epoch": 0.28931748977067273, + "grad_norm": 2.175555944442749, + "learning_rate": 4.036536519853548e-05, + "loss": 4.1118, + "step": 48647 + }, + { + "epoch": 0.28932343705395375, + "grad_norm": 1.5691286325454712, + "learning_rate": 4.036499673566521e-05, + "loss": 4.606, + "step": 48648 + }, + { + "epoch": 0.28932938433723476, + "grad_norm": 1.7622272968292236, + "learning_rate": 4.0364628267431185e-05, + "loss": 4.2818, + "step": 48649 + }, + { + "epoch": 0.2893353316205157, + "grad_norm": 1.7049826383590698, + "learning_rate": 4.036425979383352e-05, + "loss": 4.1675, + "step": 48650 + }, + { + "epoch": 0.28934127890379674, + "grad_norm": 1.662204384803772, + "learning_rate": 4.0363891314872366e-05, + "loss": 3.9813, + "step": 48651 + }, + { + "epoch": 0.28934722618707776, + "grad_norm": 1.6835558414459229, + "learning_rate": 4.0363522830547824e-05, + "loss": 3.9942, + "step": 48652 + }, + { + "epoch": 0.2893531734703587, + "grad_norm": 1.6401993036270142, + "learning_rate": 4.036315434086003e-05, + "loss": 4.1264, + "step": 48653 + }, + { + "epoch": 0.28935912075363973, + "grad_norm": 1.743010401725769, + "learning_rate": 4.036278584580913e-05, + "loss": 4.0738, + "step": 48654 + }, + { + "epoch": 0.28936506803692075, + "grad_norm": 1.7357685565948486, + "learning_rate": 4.036241734539524e-05, + "loss": 4.2297, + "step": 48655 + }, + { + "epoch": 0.2893710153202017, + "grad_norm": 1.5808898210525513, + "learning_rate": 4.0362048839618484e-05, + "loss": 4.2919, + "step": 48656 + }, + { + "epoch": 0.2893769626034827, + "grad_norm": 1.4894810914993286, + "learning_rate": 4.036168032847899e-05, + "loss": 4.177, + "step": 48657 + }, + { + "epoch": 0.28938290988676374, + "grad_norm": 1.7470682859420776, + "learning_rate": 4.0361311811976896e-05, + "loss": 3.9774, + "step": 48658 + }, + { + "epoch": 0.2893888571700447, + "grad_norm": 1.4216299057006836, + "learning_rate": 4.036094329011233e-05, + "loss": 4.4087, + "step": 48659 + }, + { + "epoch": 0.2893948044533257, + "grad_norm": 1.5787197351455688, + "learning_rate": 4.0360574762885425e-05, + "loss": 4.3039, + "step": 48660 + }, + { + "epoch": 0.28940075173660673, + "grad_norm": 1.648788332939148, + "learning_rate": 4.036020623029629e-05, + "loss": 4.7811, + "step": 48661 + }, + { + "epoch": 0.2894066990198877, + "grad_norm": 1.742283582687378, + "learning_rate": 4.035983769234507e-05, + "loss": 4.7904, + "step": 48662 + }, + { + "epoch": 0.2894126463031687, + "grad_norm": 1.5426216125488281, + "learning_rate": 4.035946914903188e-05, + "loss": 4.6822, + "step": 48663 + }, + { + "epoch": 0.2894185935864497, + "grad_norm": 1.5267075300216675, + "learning_rate": 4.035910060035687e-05, + "loss": 4.7395, + "step": 48664 + }, + { + "epoch": 0.2894245408697307, + "grad_norm": 1.8483548164367676, + "learning_rate": 4.035873204632015e-05, + "loss": 4.2304, + "step": 48665 + }, + { + "epoch": 0.2894304881530117, + "grad_norm": 1.6125050783157349, + "learning_rate": 4.0358363486921865e-05, + "loss": 5.0535, + "step": 48666 + }, + { + "epoch": 0.2894364354362927, + "grad_norm": 1.8221486806869507, + "learning_rate": 4.0357994922162126e-05, + "loss": 5.3159, + "step": 48667 + }, + { + "epoch": 0.2894423827195737, + "grad_norm": 1.6474167108535767, + "learning_rate": 4.035762635204107e-05, + "loss": 4.8728, + "step": 48668 + }, + { + "epoch": 0.2894483300028547, + "grad_norm": 1.5893381834030151, + "learning_rate": 4.0357257776558827e-05, + "loss": 4.7139, + "step": 48669 + }, + { + "epoch": 0.2894542772861357, + "grad_norm": 1.5395383834838867, + "learning_rate": 4.035688919571552e-05, + "loss": 4.796, + "step": 48670 + }, + { + "epoch": 0.28946022456941667, + "grad_norm": 1.6726943254470825, + "learning_rate": 4.035652060951128e-05, + "loss": 4.2799, + "step": 48671 + }, + { + "epoch": 0.2894661718526977, + "grad_norm": 1.6647456884384155, + "learning_rate": 4.035615201794625e-05, + "loss": 4.5637, + "step": 48672 + }, + { + "epoch": 0.2894721191359787, + "grad_norm": 1.5103570222854614, + "learning_rate": 4.035578342102053e-05, + "loss": 4.9352, + "step": 48673 + }, + { + "epoch": 0.28947806641925966, + "grad_norm": 1.5773378610610962, + "learning_rate": 4.0355414818734276e-05, + "loss": 5.0375, + "step": 48674 + }, + { + "epoch": 0.2894840137025407, + "grad_norm": 1.5347044467926025, + "learning_rate": 4.0355046211087605e-05, + "loss": 4.6534, + "step": 48675 + }, + { + "epoch": 0.2894899609858217, + "grad_norm": 1.5933399200439453, + "learning_rate": 4.0354677598080645e-05, + "loss": 5.0982, + "step": 48676 + }, + { + "epoch": 0.28949590826910265, + "grad_norm": 1.6125696897506714, + "learning_rate": 4.035430897971352e-05, + "loss": 5.118, + "step": 48677 + }, + { + "epoch": 0.28950185555238367, + "grad_norm": 1.783159852027893, + "learning_rate": 4.035394035598637e-05, + "loss": 5.3267, + "step": 48678 + }, + { + "epoch": 0.2895078028356647, + "grad_norm": 1.6109588146209717, + "learning_rate": 4.0353571726899315e-05, + "loss": 5.1348, + "step": 48679 + }, + { + "epoch": 0.28951375011894565, + "grad_norm": 1.596502423286438, + "learning_rate": 4.035320309245249e-05, + "loss": 5.0517, + "step": 48680 + }, + { + "epoch": 0.28951969740222666, + "grad_norm": 1.5393738746643066, + "learning_rate": 4.0352834452646015e-05, + "loss": 5.1637, + "step": 48681 + }, + { + "epoch": 0.2895256446855077, + "grad_norm": 1.5552774667739868, + "learning_rate": 4.035246580748003e-05, + "loss": 5.1276, + "step": 48682 + }, + { + "epoch": 0.28953159196878864, + "grad_norm": 2.513266086578369, + "learning_rate": 4.035209715695465e-05, + "loss": 3.5825, + "step": 48683 + }, + { + "epoch": 0.28953753925206965, + "grad_norm": 1.6648614406585693, + "learning_rate": 4.0351728501070016e-05, + "loss": 4.057, + "step": 48684 + }, + { + "epoch": 0.28954348653535067, + "grad_norm": 1.6635785102844238, + "learning_rate": 4.035135983982625e-05, + "loss": 4.1742, + "step": 48685 + }, + { + "epoch": 0.28954943381863163, + "grad_norm": 1.593962550163269, + "learning_rate": 4.035099117322349e-05, + "loss": 4.9541, + "step": 48686 + }, + { + "epoch": 0.28955538110191265, + "grad_norm": 1.530322551727295, + "learning_rate": 4.035062250126186e-05, + "loss": 5.0942, + "step": 48687 + }, + { + "epoch": 0.28956132838519366, + "grad_norm": 1.8665516376495361, + "learning_rate": 4.035025382394148e-05, + "loss": 4.5536, + "step": 48688 + }, + { + "epoch": 0.2895672756684746, + "grad_norm": 1.6323238611221313, + "learning_rate": 4.034988514126248e-05, + "loss": 4.12, + "step": 48689 + }, + { + "epoch": 0.28957322295175564, + "grad_norm": 1.548807144165039, + "learning_rate": 4.0349516453225e-05, + "loss": 4.3395, + "step": 48690 + }, + { + "epoch": 0.28957917023503665, + "grad_norm": 1.5873095989227295, + "learning_rate": 4.034914775982916e-05, + "loss": 4.4752, + "step": 48691 + }, + { + "epoch": 0.2895851175183176, + "grad_norm": 1.4548470973968506, + "learning_rate": 4.0348779061075085e-05, + "loss": 4.5576, + "step": 48692 + }, + { + "epoch": 0.28959106480159863, + "grad_norm": 1.5172035694122314, + "learning_rate": 4.034841035696292e-05, + "loss": 4.618, + "step": 48693 + }, + { + "epoch": 0.28959701208487965, + "grad_norm": 1.6826099157333374, + "learning_rate": 4.034804164749278e-05, + "loss": 3.9222, + "step": 48694 + }, + { + "epoch": 0.2896029593681606, + "grad_norm": 1.8074713945388794, + "learning_rate": 4.034767293266479e-05, + "loss": 4.0699, + "step": 48695 + }, + { + "epoch": 0.2896089066514416, + "grad_norm": 1.738488793373108, + "learning_rate": 4.03473042124791e-05, + "loss": 4.0777, + "step": 48696 + }, + { + "epoch": 0.28961485393472264, + "grad_norm": 1.906715750694275, + "learning_rate": 4.034693548693582e-05, + "loss": 3.9071, + "step": 48697 + }, + { + "epoch": 0.2896208012180036, + "grad_norm": 1.568897008895874, + "learning_rate": 4.034656675603508e-05, + "loss": 4.1236, + "step": 48698 + }, + { + "epoch": 0.2896267485012846, + "grad_norm": 1.7156492471694946, + "learning_rate": 4.034619801977702e-05, + "loss": 4.3031, + "step": 48699 + }, + { + "epoch": 0.28963269578456563, + "grad_norm": 1.416930913925171, + "learning_rate": 4.034582927816175e-05, + "loss": 5.2475, + "step": 48700 + }, + { + "epoch": 0.2896386430678466, + "grad_norm": 1.399916410446167, + "learning_rate": 4.034546053118941e-05, + "loss": 5.1908, + "step": 48701 + }, + { + "epoch": 0.2896445903511276, + "grad_norm": 1.3431978225708008, + "learning_rate": 4.0345091778860134e-05, + "loss": 4.759, + "step": 48702 + }, + { + "epoch": 0.2896505376344086, + "grad_norm": 1.4652563333511353, + "learning_rate": 4.0344723021174045e-05, + "loss": 4.7625, + "step": 48703 + }, + { + "epoch": 0.2896564849176896, + "grad_norm": 1.7334511280059814, + "learning_rate": 4.0344354258131265e-05, + "loss": 4.3954, + "step": 48704 + }, + { + "epoch": 0.2896624322009706, + "grad_norm": 1.728020191192627, + "learning_rate": 4.034398548973194e-05, + "loss": 4.1766, + "step": 48705 + }, + { + "epoch": 0.2896683794842516, + "grad_norm": 1.688125729560852, + "learning_rate": 4.0343616715976176e-05, + "loss": 4.4577, + "step": 48706 + }, + { + "epoch": 0.2896743267675326, + "grad_norm": 1.7798781394958496, + "learning_rate": 4.034324793686412e-05, + "loss": 3.7503, + "step": 48707 + }, + { + "epoch": 0.2896802740508136, + "grad_norm": 1.4612665176391602, + "learning_rate": 4.03428791523959e-05, + "loss": 4.3379, + "step": 48708 + }, + { + "epoch": 0.2896862213340946, + "grad_norm": 1.7091898918151855, + "learning_rate": 4.034251036257163e-05, + "loss": 4.2336, + "step": 48709 + }, + { + "epoch": 0.28969216861737557, + "grad_norm": 1.295336127281189, + "learning_rate": 4.034214156739146e-05, + "loss": 4.9798, + "step": 48710 + }, + { + "epoch": 0.2896981159006566, + "grad_norm": 1.618178367614746, + "learning_rate": 4.034177276685549e-05, + "loss": 4.4826, + "step": 48711 + }, + { + "epoch": 0.2897040631839376, + "grad_norm": 1.64333176612854, + "learning_rate": 4.0341403960963874e-05, + "loss": 4.4142, + "step": 48712 + }, + { + "epoch": 0.28971001046721856, + "grad_norm": 1.4634541273117065, + "learning_rate": 4.0341035149716734e-05, + "loss": 4.4576, + "step": 48713 + }, + { + "epoch": 0.2897159577504996, + "grad_norm": 1.6719096899032593, + "learning_rate": 4.0340666333114204e-05, + "loss": 4.512, + "step": 48714 + }, + { + "epoch": 0.2897219050337806, + "grad_norm": 1.4717308282852173, + "learning_rate": 4.0340297511156393e-05, + "loss": 4.3024, + "step": 48715 + }, + { + "epoch": 0.28972785231706155, + "grad_norm": 1.5895347595214844, + "learning_rate": 4.033992868384345e-05, + "loss": 4.4701, + "step": 48716 + }, + { + "epoch": 0.28973379960034257, + "grad_norm": 1.6960413455963135, + "learning_rate": 4.03395598511755e-05, + "loss": 4.0704, + "step": 48717 + }, + { + "epoch": 0.2897397468836236, + "grad_norm": 1.4661818742752075, + "learning_rate": 4.033919101315266e-05, + "loss": 4.3, + "step": 48718 + }, + { + "epoch": 0.28974569416690454, + "grad_norm": 1.4765466451644897, + "learning_rate": 4.033882216977506e-05, + "loss": 4.6261, + "step": 48719 + }, + { + "epoch": 0.28975164145018556, + "grad_norm": 1.5224398374557495, + "learning_rate": 4.0338453321042846e-05, + "loss": 5.1541, + "step": 48720 + }, + { + "epoch": 0.2897575887334666, + "grad_norm": 1.6147079467773438, + "learning_rate": 4.033808446695614e-05, + "loss": 4.3877, + "step": 48721 + }, + { + "epoch": 0.28976353601674754, + "grad_norm": 1.5986436605453491, + "learning_rate": 4.033771560751506e-05, + "loss": 4.6607, + "step": 48722 + }, + { + "epoch": 0.28976948330002855, + "grad_norm": 1.5867819786071777, + "learning_rate": 4.033734674271975e-05, + "loss": 4.5589, + "step": 48723 + }, + { + "epoch": 0.28977543058330957, + "grad_norm": 1.4216755628585815, + "learning_rate": 4.033697787257033e-05, + "loss": 4.7178, + "step": 48724 + }, + { + "epoch": 0.28978137786659053, + "grad_norm": 1.730544090270996, + "learning_rate": 4.0336608997066915e-05, + "loss": 4.4976, + "step": 48725 + }, + { + "epoch": 0.28978732514987154, + "grad_norm": 1.4514999389648438, + "learning_rate": 4.033624011620966e-05, + "loss": 4.7646, + "step": 48726 + }, + { + "epoch": 0.28979327243315256, + "grad_norm": 1.393052101135254, + "learning_rate": 4.033587122999868e-05, + "loss": 4.9493, + "step": 48727 + }, + { + "epoch": 0.2897992197164335, + "grad_norm": 1.9847183227539062, + "learning_rate": 4.033550233843411e-05, + "loss": 4.6452, + "step": 48728 + }, + { + "epoch": 0.28980516699971454, + "grad_norm": 1.5068310499191284, + "learning_rate": 4.033513344151606e-05, + "loss": 4.3122, + "step": 48729 + }, + { + "epoch": 0.28981111428299555, + "grad_norm": 1.4881185293197632, + "learning_rate": 4.033476453924469e-05, + "loss": 4.2443, + "step": 48730 + }, + { + "epoch": 0.2898170615662765, + "grad_norm": 1.5815422534942627, + "learning_rate": 4.0334395631620106e-05, + "loss": 4.113, + "step": 48731 + }, + { + "epoch": 0.28982300884955753, + "grad_norm": 1.5892809629440308, + "learning_rate": 4.0334026718642446e-05, + "loss": 4.671, + "step": 48732 + }, + { + "epoch": 0.28982895613283854, + "grad_norm": 1.619676947593689, + "learning_rate": 4.033365780031183e-05, + "loss": 4.9075, + "step": 48733 + }, + { + "epoch": 0.2898349034161195, + "grad_norm": 2.512685537338257, + "learning_rate": 4.0333288876628396e-05, + "loss": 4.1696, + "step": 48734 + }, + { + "epoch": 0.2898408506994005, + "grad_norm": 4.686558246612549, + "learning_rate": 4.0332919947592265e-05, + "loss": 3.664, + "step": 48735 + }, + { + "epoch": 0.28984679798268154, + "grad_norm": 4.153497695922852, + "learning_rate": 4.033255101320358e-05, + "loss": 3.5114, + "step": 48736 + }, + { + "epoch": 0.2898527452659625, + "grad_norm": 1.3913172483444214, + "learning_rate": 4.0332182073462446e-05, + "loss": 4.5497, + "step": 48737 + }, + { + "epoch": 0.2898586925492435, + "grad_norm": 1.4409940242767334, + "learning_rate": 4.033181312836901e-05, + "loss": 4.3337, + "step": 48738 + }, + { + "epoch": 0.28986463983252453, + "grad_norm": 1.9777100086212158, + "learning_rate": 4.03314441779234e-05, + "loss": 4.2553, + "step": 48739 + }, + { + "epoch": 0.2898705871158055, + "grad_norm": 3.958789110183716, + "learning_rate": 4.033107522212573e-05, + "loss": 3.493, + "step": 48740 + }, + { + "epoch": 0.2898765343990865, + "grad_norm": 3.00528621673584, + "learning_rate": 4.0330706260976157e-05, + "loss": 3.5515, + "step": 48741 + }, + { + "epoch": 0.2898824816823675, + "grad_norm": 2.9652535915374756, + "learning_rate": 4.033033729447479e-05, + "loss": 3.3378, + "step": 48742 + }, + { + "epoch": 0.2898884289656485, + "grad_norm": 3.1035873889923096, + "learning_rate": 4.032996832262176e-05, + "loss": 3.1336, + "step": 48743 + }, + { + "epoch": 0.2898943762489295, + "grad_norm": 3.83548641204834, + "learning_rate": 4.032959934541719e-05, + "loss": 3.3763, + "step": 48744 + }, + { + "epoch": 0.2899003235322105, + "grad_norm": 2.168396472930908, + "learning_rate": 4.032923036286122e-05, + "loss": 3.9515, + "step": 48745 + }, + { + "epoch": 0.2899062708154915, + "grad_norm": 3.123269557952881, + "learning_rate": 4.0328861374953973e-05, + "loss": 3.7446, + "step": 48746 + }, + { + "epoch": 0.2899122180987725, + "grad_norm": 2.9076366424560547, + "learning_rate": 4.032849238169558e-05, + "loss": 3.3057, + "step": 48747 + }, + { + "epoch": 0.2899181653820535, + "grad_norm": 2.6528921127319336, + "learning_rate": 4.032812338308617e-05, + "loss": 3.2759, + "step": 48748 + }, + { + "epoch": 0.28992411266533447, + "grad_norm": 2.547285795211792, + "learning_rate": 4.032775437912586e-05, + "loss": 3.4353, + "step": 48749 + }, + { + "epoch": 0.2899300599486155, + "grad_norm": 3.0360116958618164, + "learning_rate": 4.0327385369814803e-05, + "loss": 3.2657, + "step": 48750 + }, + { + "epoch": 0.28993600723189644, + "grad_norm": 1.8615084886550903, + "learning_rate": 4.03270163551531e-05, + "loss": 4.6132, + "step": 48751 + }, + { + "epoch": 0.28994195451517746, + "grad_norm": 1.6503958702087402, + "learning_rate": 4.032664733514091e-05, + "loss": 4.777, + "step": 48752 + }, + { + "epoch": 0.2899479017984585, + "grad_norm": 2.6198294162750244, + "learning_rate": 4.032627830977833e-05, + "loss": 3.2305, + "step": 48753 + }, + { + "epoch": 0.28995384908173943, + "grad_norm": 1.4935234785079956, + "learning_rate": 4.032590927906552e-05, + "loss": 4.7471, + "step": 48754 + }, + { + "epoch": 0.28995979636502045, + "grad_norm": 1.7378379106521606, + "learning_rate": 4.032554024300258e-05, + "loss": 4.3966, + "step": 48755 + }, + { + "epoch": 0.28996574364830147, + "grad_norm": 2.2002787590026855, + "learning_rate": 4.032517120158966e-05, + "loss": 3.4046, + "step": 48756 + }, + { + "epoch": 0.2899716909315824, + "grad_norm": 1.8337101936340332, + "learning_rate": 4.032480215482688e-05, + "loss": 3.9053, + "step": 48757 + }, + { + "epoch": 0.28997763821486344, + "grad_norm": 1.5116788148880005, + "learning_rate": 4.032443310271437e-05, + "loss": 4.941, + "step": 48758 + }, + { + "epoch": 0.28998358549814446, + "grad_norm": 1.3915863037109375, + "learning_rate": 4.032406404525226e-05, + "loss": 4.9749, + "step": 48759 + }, + { + "epoch": 0.2899895327814254, + "grad_norm": 1.5531915426254272, + "learning_rate": 4.032369498244067e-05, + "loss": 4.8492, + "step": 48760 + }, + { + "epoch": 0.28999548006470643, + "grad_norm": 1.7031444311141968, + "learning_rate": 4.032332591427974e-05, + "loss": 4.5701, + "step": 48761 + }, + { + "epoch": 0.29000142734798745, + "grad_norm": 1.4605648517608643, + "learning_rate": 4.03229568407696e-05, + "loss": 4.7838, + "step": 48762 + }, + { + "epoch": 0.2900073746312684, + "grad_norm": 1.7919071912765503, + "learning_rate": 4.032258776191037e-05, + "loss": 5.1495, + "step": 48763 + }, + { + "epoch": 0.2900133219145494, + "grad_norm": 1.3851486444473267, + "learning_rate": 4.0322218677702185e-05, + "loss": 5.2803, + "step": 48764 + }, + { + "epoch": 0.29001926919783044, + "grad_norm": 1.9268335103988647, + "learning_rate": 4.0321849588145165e-05, + "loss": 4.1892, + "step": 48765 + }, + { + "epoch": 0.2900252164811114, + "grad_norm": 1.5137653350830078, + "learning_rate": 4.032148049323946e-05, + "loss": 5.134, + "step": 48766 + }, + { + "epoch": 0.2900311637643924, + "grad_norm": 1.4694112539291382, + "learning_rate": 4.032111139298517e-05, + "loss": 5.5436, + "step": 48767 + }, + { + "epoch": 0.29003711104767343, + "grad_norm": 1.6547881364822388, + "learning_rate": 4.0320742287382446e-05, + "loss": 4.0507, + "step": 48768 + }, + { + "epoch": 0.2900430583309544, + "grad_norm": 1.4319055080413818, + "learning_rate": 4.03203731764314e-05, + "loss": 4.765, + "step": 48769 + }, + { + "epoch": 0.2900490056142354, + "grad_norm": 1.6552740335464478, + "learning_rate": 4.032000406013219e-05, + "loss": 4.756, + "step": 48770 + }, + { + "epoch": 0.2900549528975164, + "grad_norm": 1.4391809701919556, + "learning_rate": 4.0319634938484905e-05, + "loss": 5.1745, + "step": 48771 + }, + { + "epoch": 0.2900609001807974, + "grad_norm": 1.4066566228866577, + "learning_rate": 4.03192658114897e-05, + "loss": 5.5273, + "step": 48772 + }, + { + "epoch": 0.2900668474640784, + "grad_norm": 1.516598105430603, + "learning_rate": 4.03188966791467e-05, + "loss": 5.2001, + "step": 48773 + }, + { + "epoch": 0.2900727947473594, + "grad_norm": 1.5406873226165771, + "learning_rate": 4.031852754145603e-05, + "loss": 4.5917, + "step": 48774 + }, + { + "epoch": 0.2900787420306404, + "grad_norm": 1.5831056833267212, + "learning_rate": 4.031815839841782e-05, + "loss": 4.6201, + "step": 48775 + }, + { + "epoch": 0.2900846893139214, + "grad_norm": 1.4780162572860718, + "learning_rate": 4.03177892500322e-05, + "loss": 4.9291, + "step": 48776 + }, + { + "epoch": 0.2900906365972024, + "grad_norm": 1.3679949045181274, + "learning_rate": 4.0317420096299294e-05, + "loss": 4.9127, + "step": 48777 + }, + { + "epoch": 0.29009658388048337, + "grad_norm": 1.3263890743255615, + "learning_rate": 4.031705093721924e-05, + "loss": 5.0357, + "step": 48778 + }, + { + "epoch": 0.2901025311637644, + "grad_norm": 1.3048044443130493, + "learning_rate": 4.0316681772792164e-05, + "loss": 4.8483, + "step": 48779 + }, + { + "epoch": 0.2901084784470454, + "grad_norm": 1.8248941898345947, + "learning_rate": 4.031631260301818e-05, + "loss": 3.9744, + "step": 48780 + }, + { + "epoch": 0.29011442573032636, + "grad_norm": 1.3976982831954956, + "learning_rate": 4.031594342789744e-05, + "loss": 4.9465, + "step": 48781 + }, + { + "epoch": 0.2901203730136074, + "grad_norm": 1.3176321983337402, + "learning_rate": 4.031557424743007e-05, + "loss": 5.1693, + "step": 48782 + }, + { + "epoch": 0.2901263202968884, + "grad_norm": 1.3665484189987183, + "learning_rate": 4.0315205061616175e-05, + "loss": 5.1644, + "step": 48783 + }, + { + "epoch": 0.29013226758016936, + "grad_norm": 1.3100435733795166, + "learning_rate": 4.03148358704559e-05, + "loss": 5.1682, + "step": 48784 + }, + { + "epoch": 0.29013821486345037, + "grad_norm": 1.2073900699615479, + "learning_rate": 4.031446667394939e-05, + "loss": 5.0288, + "step": 48785 + }, + { + "epoch": 0.2901441621467314, + "grad_norm": 1.3105359077453613, + "learning_rate": 4.0314097472096747e-05, + "loss": 4.8126, + "step": 48786 + }, + { + "epoch": 0.29015010943001235, + "grad_norm": 1.3480547666549683, + "learning_rate": 4.031372826489811e-05, + "loss": 4.9573, + "step": 48787 + }, + { + "epoch": 0.29015605671329336, + "grad_norm": 1.2712851762771606, + "learning_rate": 4.031335905235361e-05, + "loss": 5.0106, + "step": 48788 + }, + { + "epoch": 0.2901620039965744, + "grad_norm": 1.3663522005081177, + "learning_rate": 4.031298983446338e-05, + "loss": 5.2855, + "step": 48789 + }, + { + "epoch": 0.29016795127985534, + "grad_norm": 1.2761088609695435, + "learning_rate": 4.031262061122754e-05, + "loss": 5.1665, + "step": 48790 + }, + { + "epoch": 0.29017389856313636, + "grad_norm": 1.8236379623413086, + "learning_rate": 4.0312251382646215e-05, + "loss": 3.6739, + "step": 48791 + }, + { + "epoch": 0.2901798458464174, + "grad_norm": 1.468214511871338, + "learning_rate": 4.031188214871955e-05, + "loss": 4.7888, + "step": 48792 + }, + { + "epoch": 0.29018579312969833, + "grad_norm": 1.596947193145752, + "learning_rate": 4.0311512909447656e-05, + "loss": 3.7354, + "step": 48793 + }, + { + "epoch": 0.29019174041297935, + "grad_norm": 1.4043610095977783, + "learning_rate": 4.0311143664830684e-05, + "loss": 4.3936, + "step": 48794 + }, + { + "epoch": 0.29019768769626036, + "grad_norm": 1.757031798362732, + "learning_rate": 4.0310774414868744e-05, + "loss": 4.0182, + "step": 48795 + }, + { + "epoch": 0.2902036349795413, + "grad_norm": 1.1736817359924316, + "learning_rate": 4.031040515956197e-05, + "loss": 4.5023, + "step": 48796 + }, + { + "epoch": 0.29020958226282234, + "grad_norm": 1.2446088790893555, + "learning_rate": 4.031003589891049e-05, + "loss": 4.8533, + "step": 48797 + }, + { + "epoch": 0.29021552954610336, + "grad_norm": 1.2565594911575317, + "learning_rate": 4.030966663291444e-05, + "loss": 4.5785, + "step": 48798 + }, + { + "epoch": 0.2902214768293843, + "grad_norm": 1.580053448677063, + "learning_rate": 4.030929736157394e-05, + "loss": 4.6477, + "step": 48799 + }, + { + "epoch": 0.29022742411266533, + "grad_norm": 1.4078195095062256, + "learning_rate": 4.0308928084889126e-05, + "loss": 4.2157, + "step": 48800 + }, + { + "epoch": 0.29023337139594635, + "grad_norm": 1.2704449892044067, + "learning_rate": 4.030855880286012e-05, + "loss": 4.369, + "step": 48801 + }, + { + "epoch": 0.2902393186792273, + "grad_norm": 1.351125717163086, + "learning_rate": 4.030818951548705e-05, + "loss": 4.5776, + "step": 48802 + }, + { + "epoch": 0.2902452659625083, + "grad_norm": 1.4810088872909546, + "learning_rate": 4.030782022277006e-05, + "loss": 5.1183, + "step": 48803 + }, + { + "epoch": 0.29025121324578934, + "grad_norm": 1.1887198686599731, + "learning_rate": 4.030745092470926e-05, + "loss": 4.8454, + "step": 48804 + }, + { + "epoch": 0.2902571605290703, + "grad_norm": 1.4202022552490234, + "learning_rate": 4.0307081621304786e-05, + "loss": 4.6844, + "step": 48805 + }, + { + "epoch": 0.2902631078123513, + "grad_norm": 1.3136558532714844, + "learning_rate": 4.0306712312556775e-05, + "loss": 4.6557, + "step": 48806 + }, + { + "epoch": 0.29026905509563233, + "grad_norm": 1.1731243133544922, + "learning_rate": 4.030634299846534e-05, + "loss": 4.4918, + "step": 48807 + }, + { + "epoch": 0.2902750023789133, + "grad_norm": 1.1994651556015015, + "learning_rate": 4.0305973679030626e-05, + "loss": 4.7117, + "step": 48808 + }, + { + "epoch": 0.2902809496621943, + "grad_norm": 1.5932952165603638, + "learning_rate": 4.030560435425276e-05, + "loss": 5.3561, + "step": 48809 + }, + { + "epoch": 0.2902868969454753, + "grad_norm": 1.5348511934280396, + "learning_rate": 4.030523502413185e-05, + "loss": 5.0726, + "step": 48810 + }, + { + "epoch": 0.2902928442287563, + "grad_norm": 1.2857818603515625, + "learning_rate": 4.030486568866805e-05, + "loss": 5.0983, + "step": 48811 + }, + { + "epoch": 0.2902987915120373, + "grad_norm": 1.3411800861358643, + "learning_rate": 4.030449634786148e-05, + "loss": 5.0314, + "step": 48812 + }, + { + "epoch": 0.2903047387953183, + "grad_norm": 1.3200664520263672, + "learning_rate": 4.030412700171227e-05, + "loss": 5.257, + "step": 48813 + }, + { + "epoch": 0.2903106860785993, + "grad_norm": 1.491490364074707, + "learning_rate": 4.030375765022054e-05, + "loss": 4.6266, + "step": 48814 + }, + { + "epoch": 0.2903166333618803, + "grad_norm": 1.7966352701187134, + "learning_rate": 4.030338829338643e-05, + "loss": 4.1516, + "step": 48815 + }, + { + "epoch": 0.2903225806451613, + "grad_norm": 1.456533670425415, + "learning_rate": 4.030301893121007e-05, + "loss": 4.7651, + "step": 48816 + }, + { + "epoch": 0.29032852792844227, + "grad_norm": 1.3764982223510742, + "learning_rate": 4.030264956369157e-05, + "loss": 4.9792, + "step": 48817 + }, + { + "epoch": 0.2903344752117233, + "grad_norm": 1.5060677528381348, + "learning_rate": 4.030228019083109e-05, + "loss": 4.8496, + "step": 48818 + }, + { + "epoch": 0.2903404224950043, + "grad_norm": 1.421521782875061, + "learning_rate": 4.030191081262873e-05, + "loss": 4.881, + "step": 48819 + }, + { + "epoch": 0.29034636977828526, + "grad_norm": 1.4079557657241821, + "learning_rate": 4.030154142908463e-05, + "loss": 4.9051, + "step": 48820 + }, + { + "epoch": 0.2903523170615663, + "grad_norm": 1.5357475280761719, + "learning_rate": 4.030117204019893e-05, + "loss": 4.6582, + "step": 48821 + }, + { + "epoch": 0.2903582643448473, + "grad_norm": 1.4051580429077148, + "learning_rate": 4.0300802645971745e-05, + "loss": 4.6298, + "step": 48822 + }, + { + "epoch": 0.29036421162812825, + "grad_norm": 1.4588650465011597, + "learning_rate": 4.0300433246403214e-05, + "loss": 4.6314, + "step": 48823 + }, + { + "epoch": 0.29037015891140927, + "grad_norm": 1.4033044576644897, + "learning_rate": 4.030006384149345e-05, + "loss": 4.5966, + "step": 48824 + }, + { + "epoch": 0.2903761061946903, + "grad_norm": 1.3534780740737915, + "learning_rate": 4.029969443124259e-05, + "loss": 4.6343, + "step": 48825 + }, + { + "epoch": 0.29038205347797125, + "grad_norm": 1.9253804683685303, + "learning_rate": 4.0299325015650774e-05, + "loss": 3.7309, + "step": 48826 + }, + { + "epoch": 0.29038800076125226, + "grad_norm": 1.5855273008346558, + "learning_rate": 4.029895559471812e-05, + "loss": 4.6656, + "step": 48827 + }, + { + "epoch": 0.2903939480445333, + "grad_norm": 1.2410694360733032, + "learning_rate": 4.029858616844475e-05, + "loss": 4.8084, + "step": 48828 + }, + { + "epoch": 0.29039989532781424, + "grad_norm": 1.420326590538025, + "learning_rate": 4.0298216736830814e-05, + "loss": 4.8488, + "step": 48829 + }, + { + "epoch": 0.29040584261109526, + "grad_norm": 2.0310580730438232, + "learning_rate": 4.0297847299876424e-05, + "loss": 4.0351, + "step": 48830 + }, + { + "epoch": 0.29041178989437627, + "grad_norm": 1.433782696723938, + "learning_rate": 4.029747785758171e-05, + "loss": 4.5499, + "step": 48831 + }, + { + "epoch": 0.29041773717765723, + "grad_norm": 1.6124972105026245, + "learning_rate": 4.02971084099468e-05, + "loss": 4.8367, + "step": 48832 + }, + { + "epoch": 0.29042368446093825, + "grad_norm": 1.4889618158340454, + "learning_rate": 4.029673895697184e-05, + "loss": 5.0835, + "step": 48833 + }, + { + "epoch": 0.29042963174421926, + "grad_norm": 1.5888493061065674, + "learning_rate": 4.029636949865694e-05, + "loss": 4.9577, + "step": 48834 + }, + { + "epoch": 0.2904355790275002, + "grad_norm": 1.3682975769042969, + "learning_rate": 4.0296000035002236e-05, + "loss": 5.017, + "step": 48835 + }, + { + "epoch": 0.29044152631078124, + "grad_norm": 1.5403677225112915, + "learning_rate": 4.0295630566007856e-05, + "loss": 4.8113, + "step": 48836 + }, + { + "epoch": 0.29044747359406226, + "grad_norm": 1.312519907951355, + "learning_rate": 4.029526109167393e-05, + "loss": 4.548, + "step": 48837 + }, + { + "epoch": 0.2904534208773432, + "grad_norm": 1.7766956090927124, + "learning_rate": 4.0294891612000584e-05, + "loss": 3.9145, + "step": 48838 + }, + { + "epoch": 0.29045936816062423, + "grad_norm": 1.7769094705581665, + "learning_rate": 4.029452212698796e-05, + "loss": 4.8496, + "step": 48839 + }, + { + "epoch": 0.29046531544390525, + "grad_norm": 1.5430243015289307, + "learning_rate": 4.029415263663616e-05, + "loss": 4.6895, + "step": 48840 + }, + { + "epoch": 0.2904712627271862, + "grad_norm": 1.8050990104675293, + "learning_rate": 4.029378314094534e-05, + "loss": 4.6181, + "step": 48841 + }, + { + "epoch": 0.2904772100104672, + "grad_norm": 1.5291662216186523, + "learning_rate": 4.0293413639915616e-05, + "loss": 4.4417, + "step": 48842 + }, + { + "epoch": 0.29048315729374824, + "grad_norm": 1.352257490158081, + "learning_rate": 4.029304413354712e-05, + "loss": 4.4815, + "step": 48843 + }, + { + "epoch": 0.2904891045770292, + "grad_norm": 1.7471299171447754, + "learning_rate": 4.029267462183998e-05, + "loss": 4.2131, + "step": 48844 + }, + { + "epoch": 0.2904950518603102, + "grad_norm": 1.3819122314453125, + "learning_rate": 4.029230510479433e-05, + "loss": 4.7581, + "step": 48845 + }, + { + "epoch": 0.29050099914359123, + "grad_norm": 1.4364279508590698, + "learning_rate": 4.029193558241028e-05, + "loss": 4.6242, + "step": 48846 + }, + { + "epoch": 0.2905069464268722, + "grad_norm": 1.2200459241867065, + "learning_rate": 4.0291566054687984e-05, + "loss": 4.769, + "step": 48847 + }, + { + "epoch": 0.2905128937101532, + "grad_norm": 1.2417066097259521, + "learning_rate": 4.029119652162756e-05, + "loss": 4.7916, + "step": 48848 + }, + { + "epoch": 0.2905188409934342, + "grad_norm": 1.6411775350570679, + "learning_rate": 4.029082698322914e-05, + "loss": 4.8906, + "step": 48849 + }, + { + "epoch": 0.2905247882767152, + "grad_norm": 1.5658551454544067, + "learning_rate": 4.0290457439492835e-05, + "loss": 5.2514, + "step": 48850 + }, + { + "epoch": 0.2905307355599962, + "grad_norm": 1.474279522895813, + "learning_rate": 4.02900878904188e-05, + "loss": 4.9913, + "step": 48851 + }, + { + "epoch": 0.2905366828432772, + "grad_norm": 1.5563794374465942, + "learning_rate": 4.028971833600715e-05, + "loss": 4.9029, + "step": 48852 + }, + { + "epoch": 0.2905426301265582, + "grad_norm": 1.5614770650863647, + "learning_rate": 4.028934877625803e-05, + "loss": 4.7559, + "step": 48853 + }, + { + "epoch": 0.2905485774098392, + "grad_norm": 1.5034037828445435, + "learning_rate": 4.0288979211171544e-05, + "loss": 5.0829, + "step": 48854 + }, + { + "epoch": 0.2905545246931202, + "grad_norm": 1.765204668045044, + "learning_rate": 4.0288609640747835e-05, + "loss": 4.0444, + "step": 48855 + }, + { + "epoch": 0.29056047197640117, + "grad_norm": 1.174601435661316, + "learning_rate": 4.028824006498704e-05, + "loss": 4.5145, + "step": 48856 + }, + { + "epoch": 0.2905664192596822, + "grad_norm": 1.7529675960540771, + "learning_rate": 4.0287870483889264e-05, + "loss": 5.0651, + "step": 48857 + }, + { + "epoch": 0.2905723665429632, + "grad_norm": 1.5085196495056152, + "learning_rate": 4.0287500897454654e-05, + "loss": 4.9384, + "step": 48858 + }, + { + "epoch": 0.29057831382624416, + "grad_norm": 1.4426710605621338, + "learning_rate": 4.0287131305683336e-05, + "loss": 4.8972, + "step": 48859 + }, + { + "epoch": 0.2905842611095252, + "grad_norm": 1.768362283706665, + "learning_rate": 4.0286761708575446e-05, + "loss": 4.85, + "step": 48860 + }, + { + "epoch": 0.2905902083928062, + "grad_norm": 1.6849349737167358, + "learning_rate": 4.02863921061311e-05, + "loss": 3.8507, + "step": 48861 + }, + { + "epoch": 0.29059615567608715, + "grad_norm": 1.784724235534668, + "learning_rate": 4.0286022498350425e-05, + "loss": 3.3468, + "step": 48862 + }, + { + "epoch": 0.29060210295936817, + "grad_norm": 1.9738497734069824, + "learning_rate": 4.028565288523357e-05, + "loss": 4.2217, + "step": 48863 + }, + { + "epoch": 0.2906080502426492, + "grad_norm": 1.6576266288757324, + "learning_rate": 4.028528326678065e-05, + "loss": 4.3566, + "step": 48864 + }, + { + "epoch": 0.29061399752593015, + "grad_norm": 1.2651056051254272, + "learning_rate": 4.0284913642991784e-05, + "loss": 4.8316, + "step": 48865 + }, + { + "epoch": 0.29061994480921116, + "grad_norm": 1.672684907913208, + "learning_rate": 4.028454401386712e-05, + "loss": 4.3857, + "step": 48866 + }, + { + "epoch": 0.2906258920924921, + "grad_norm": 2.02527117729187, + "learning_rate": 4.028417437940678e-05, + "loss": 3.118, + "step": 48867 + }, + { + "epoch": 0.29063183937577314, + "grad_norm": 1.909210205078125, + "learning_rate": 4.02838047396109e-05, + "loss": 3.4199, + "step": 48868 + }, + { + "epoch": 0.29063778665905415, + "grad_norm": 1.9749822616577148, + "learning_rate": 4.02834350944796e-05, + "loss": 3.0761, + "step": 48869 + }, + { + "epoch": 0.2906437339423351, + "grad_norm": 1.9715917110443115, + "learning_rate": 4.028306544401299e-05, + "loss": 2.8237, + "step": 48870 + }, + { + "epoch": 0.29064968122561613, + "grad_norm": 1.7132863998413086, + "learning_rate": 4.0282695788211245e-05, + "loss": 3.3823, + "step": 48871 + }, + { + "epoch": 0.29065562850889715, + "grad_norm": 2.4939446449279785, + "learning_rate": 4.0282326127074454e-05, + "loss": 1.5124, + "step": 48872 + }, + { + "epoch": 0.2906615757921781, + "grad_norm": 1.9347518682479858, + "learning_rate": 4.028195646060276e-05, + "loss": 3.5156, + "step": 48873 + }, + { + "epoch": 0.2906675230754591, + "grad_norm": 1.713352918624878, + "learning_rate": 4.02815867887963e-05, + "loss": 3.487, + "step": 48874 + }, + { + "epoch": 0.29067347035874014, + "grad_norm": 1.638778805732727, + "learning_rate": 4.0281217111655195e-05, + "loss": 3.5358, + "step": 48875 + }, + { + "epoch": 0.2906794176420211, + "grad_norm": 1.5514440536499023, + "learning_rate": 4.028084742917957e-05, + "loss": 3.5468, + "step": 48876 + }, + { + "epoch": 0.2906853649253021, + "grad_norm": 1.5556004047393799, + "learning_rate": 4.028047774136957e-05, + "loss": 3.3674, + "step": 48877 + }, + { + "epoch": 0.29069131220858313, + "grad_norm": 2.1367478370666504, + "learning_rate": 4.02801080482253e-05, + "loss": 2.4942, + "step": 48878 + }, + { + "epoch": 0.2906972594918641, + "grad_norm": 1.6266648769378662, + "learning_rate": 4.027973834974691e-05, + "loss": 4.1541, + "step": 48879 + }, + { + "epoch": 0.2907032067751451, + "grad_norm": 1.7897063493728638, + "learning_rate": 4.0279368645934524e-05, + "loss": 3.7358, + "step": 48880 + }, + { + "epoch": 0.2907091540584261, + "grad_norm": 1.5031864643096924, + "learning_rate": 4.027899893678826e-05, + "loss": 4.0476, + "step": 48881 + }, + { + "epoch": 0.2907151013417071, + "grad_norm": 1.6124920845031738, + "learning_rate": 4.027862922230826e-05, + "loss": 3.8097, + "step": 48882 + }, + { + "epoch": 0.2907210486249881, + "grad_norm": 1.4569734334945679, + "learning_rate": 4.027825950249465e-05, + "loss": 4.1829, + "step": 48883 + }, + { + "epoch": 0.2907269959082691, + "grad_norm": 2.4058725833892822, + "learning_rate": 4.027788977734755e-05, + "loss": 3.9722, + "step": 48884 + }, + { + "epoch": 0.2907329431915501, + "grad_norm": 1.8861230611801147, + "learning_rate": 4.02775200468671e-05, + "loss": 4.4124, + "step": 48885 + }, + { + "epoch": 0.2907388904748311, + "grad_norm": 1.778855562210083, + "learning_rate": 4.027715031105343e-05, + "loss": 4.2808, + "step": 48886 + }, + { + "epoch": 0.2907448377581121, + "grad_norm": 1.7604470252990723, + "learning_rate": 4.0276780569906656e-05, + "loss": 4.2272, + "step": 48887 + }, + { + "epoch": 0.29075078504139307, + "grad_norm": 1.8246943950653076, + "learning_rate": 4.0276410823426925e-05, + "loss": 4.2615, + "step": 48888 + }, + { + "epoch": 0.2907567323246741, + "grad_norm": 1.7257099151611328, + "learning_rate": 4.0276041071614356e-05, + "loss": 4.3549, + "step": 48889 + }, + { + "epoch": 0.2907626796079551, + "grad_norm": 2.161865711212158, + "learning_rate": 4.027567131446908e-05, + "loss": 3.6415, + "step": 48890 + }, + { + "epoch": 0.29076862689123606, + "grad_norm": 1.5089524984359741, + "learning_rate": 4.027530155199121e-05, + "loss": 4.2716, + "step": 48891 + }, + { + "epoch": 0.2907745741745171, + "grad_norm": 1.7150171995162964, + "learning_rate": 4.02749317841809e-05, + "loss": 4.2392, + "step": 48892 + }, + { + "epoch": 0.2907805214577981, + "grad_norm": 1.4542232751846313, + "learning_rate": 4.0274562011038274e-05, + "loss": 4.2113, + "step": 48893 + }, + { + "epoch": 0.29078646874107905, + "grad_norm": 1.5179686546325684, + "learning_rate": 4.027419223256345e-05, + "loss": 4.3233, + "step": 48894 + }, + { + "epoch": 0.29079241602436007, + "grad_norm": 1.8970842361450195, + "learning_rate": 4.027382244875657e-05, + "loss": 3.8185, + "step": 48895 + }, + { + "epoch": 0.2907983633076411, + "grad_norm": 1.828827977180481, + "learning_rate": 4.027345265961775e-05, + "loss": 3.6258, + "step": 48896 + }, + { + "epoch": 0.29080431059092204, + "grad_norm": 1.542285680770874, + "learning_rate": 4.027308286514713e-05, + "loss": 4.479, + "step": 48897 + }, + { + "epoch": 0.29081025787420306, + "grad_norm": 1.550940752029419, + "learning_rate": 4.027271306534483e-05, + "loss": 3.8295, + "step": 48898 + }, + { + "epoch": 0.2908162051574841, + "grad_norm": 1.50169038772583, + "learning_rate": 4.027234326021099e-05, + "loss": 4.4529, + "step": 48899 + }, + { + "epoch": 0.29082215244076504, + "grad_norm": 1.551904559135437, + "learning_rate": 4.0271973449745726e-05, + "loss": 4.0678, + "step": 48900 + }, + { + "epoch": 0.29082809972404605, + "grad_norm": 2.060988187789917, + "learning_rate": 4.0271603633949176e-05, + "loss": 3.71, + "step": 48901 + }, + { + "epoch": 0.29083404700732707, + "grad_norm": 2.261657953262329, + "learning_rate": 4.027123381282146e-05, + "loss": 2.6629, + "step": 48902 + }, + { + "epoch": 0.29083999429060803, + "grad_norm": 2.7257018089294434, + "learning_rate": 4.0270863986362725e-05, + "loss": 2.7273, + "step": 48903 + }, + { + "epoch": 0.29084594157388904, + "grad_norm": 2.3804397583007812, + "learning_rate": 4.027049415457309e-05, + "loss": 2.5695, + "step": 48904 + }, + { + "epoch": 0.29085188885717006, + "grad_norm": 2.363260269165039, + "learning_rate": 4.027012431745267e-05, + "loss": 2.7066, + "step": 48905 + }, + { + "epoch": 0.290857836140451, + "grad_norm": 2.623502731323242, + "learning_rate": 4.026975447500162e-05, + "loss": 2.9057, + "step": 48906 + }, + { + "epoch": 0.29086378342373204, + "grad_norm": 1.6974875926971436, + "learning_rate": 4.026938462722005e-05, + "loss": 3.772, + "step": 48907 + }, + { + "epoch": 0.29086973070701305, + "grad_norm": 1.6273080110549927, + "learning_rate": 4.02690147741081e-05, + "loss": 3.4833, + "step": 48908 + }, + { + "epoch": 0.290875677990294, + "grad_norm": 1.5884971618652344, + "learning_rate": 4.0268644915665896e-05, + "loss": 3.9317, + "step": 48909 + }, + { + "epoch": 0.29088162527357503, + "grad_norm": 1.6877775192260742, + "learning_rate": 4.026827505189356e-05, + "loss": 3.999, + "step": 48910 + }, + { + "epoch": 0.29088757255685604, + "grad_norm": 1.6344088315963745, + "learning_rate": 4.026790518279123e-05, + "loss": 4.0288, + "step": 48911 + }, + { + "epoch": 0.290893519840137, + "grad_norm": 1.5469928979873657, + "learning_rate": 4.026753530835903e-05, + "loss": 4.1162, + "step": 48912 + }, + { + "epoch": 0.290899467123418, + "grad_norm": 1.861738681793213, + "learning_rate": 4.0267165428597096e-05, + "loss": 4.1288, + "step": 48913 + }, + { + "epoch": 0.29090541440669904, + "grad_norm": 2.25207257270813, + "learning_rate": 4.0266795543505547e-05, + "loss": 3.0411, + "step": 48914 + }, + { + "epoch": 0.29091136168998, + "grad_norm": 2.1244592666625977, + "learning_rate": 4.026642565308452e-05, + "loss": 3.1339, + "step": 48915 + }, + { + "epoch": 0.290917308973261, + "grad_norm": 1.5788418054580688, + "learning_rate": 4.0266055757334144e-05, + "loss": 3.8372, + "step": 48916 + }, + { + "epoch": 0.29092325625654203, + "grad_norm": 1.6048648357391357, + "learning_rate": 4.026568585625454e-05, + "loss": 3.965, + "step": 48917 + }, + { + "epoch": 0.290929203539823, + "grad_norm": 1.9737974405288696, + "learning_rate": 4.026531594984584e-05, + "loss": 4.0343, + "step": 48918 + }, + { + "epoch": 0.290935150823104, + "grad_norm": 1.6537405252456665, + "learning_rate": 4.026494603810819e-05, + "loss": 3.9299, + "step": 48919 + }, + { + "epoch": 0.290941098106385, + "grad_norm": 1.9107508659362793, + "learning_rate": 4.0264576121041694e-05, + "loss": 3.499, + "step": 48920 + }, + { + "epoch": 0.290947045389666, + "grad_norm": 1.955490231513977, + "learning_rate": 4.0264206198646495e-05, + "loss": 3.2529, + "step": 48921 + }, + { + "epoch": 0.290952992672947, + "grad_norm": 2.1140494346618652, + "learning_rate": 4.026383627092272e-05, + "loss": 3.0596, + "step": 48922 + }, + { + "epoch": 0.290958939956228, + "grad_norm": 2.05757999420166, + "learning_rate": 4.026346633787049e-05, + "loss": 2.8373, + "step": 48923 + }, + { + "epoch": 0.290964887239509, + "grad_norm": 2.1294806003570557, + "learning_rate": 4.026309639948995e-05, + "loss": 2.9131, + "step": 48924 + }, + { + "epoch": 0.29097083452279, + "grad_norm": 1.9864604473114014, + "learning_rate": 4.026272645578122e-05, + "loss": 3.2776, + "step": 48925 + }, + { + "epoch": 0.290976781806071, + "grad_norm": 1.4157150983810425, + "learning_rate": 4.026235650674442e-05, + "loss": 5.0685, + "step": 48926 + }, + { + "epoch": 0.29098272908935197, + "grad_norm": 2.1668851375579834, + "learning_rate": 4.0261986552379706e-05, + "loss": 3.0759, + "step": 48927 + }, + { + "epoch": 0.290988676372633, + "grad_norm": 1.983353614807129, + "learning_rate": 4.026161659268718e-05, + "loss": 3.6286, + "step": 48928 + }, + { + "epoch": 0.290994623655914, + "grad_norm": 2.079759120941162, + "learning_rate": 4.0261246627666985e-05, + "loss": 3.0663, + "step": 48929 + }, + { + "epoch": 0.29100057093919496, + "grad_norm": 2.2008450031280518, + "learning_rate": 4.026087665731924e-05, + "loss": 2.6483, + "step": 48930 + }, + { + "epoch": 0.291006518222476, + "grad_norm": 2.120918035507202, + "learning_rate": 4.026050668164409e-05, + "loss": 2.8554, + "step": 48931 + }, + { + "epoch": 0.291012465505757, + "grad_norm": 2.096240758895874, + "learning_rate": 4.026013670064165e-05, + "loss": 3.3866, + "step": 48932 + }, + { + "epoch": 0.29101841278903795, + "grad_norm": 2.149122953414917, + "learning_rate": 4.025976671431205e-05, + "loss": 2.9394, + "step": 48933 + }, + { + "epoch": 0.29102436007231897, + "grad_norm": 2.0716795921325684, + "learning_rate": 4.025939672265543e-05, + "loss": 3.229, + "step": 48934 + }, + { + "epoch": 0.2910303073556, + "grad_norm": 2.1312785148620605, + "learning_rate": 4.02590267256719e-05, + "loss": 3.2749, + "step": 48935 + }, + { + "epoch": 0.29103625463888094, + "grad_norm": 2.2195847034454346, + "learning_rate": 4.025865672336162e-05, + "loss": 3.8468, + "step": 48936 + }, + { + "epoch": 0.29104220192216196, + "grad_norm": 1.6672378778457642, + "learning_rate": 4.025828671572469e-05, + "loss": 4.0315, + "step": 48937 + }, + { + "epoch": 0.291048149205443, + "grad_norm": 1.95305597782135, + "learning_rate": 4.025791670276125e-05, + "loss": 3.3711, + "step": 48938 + }, + { + "epoch": 0.29105409648872393, + "grad_norm": 2.1174230575561523, + "learning_rate": 4.0257546684471425e-05, + "loss": 3.5032, + "step": 48939 + }, + { + "epoch": 0.29106004377200495, + "grad_norm": 1.989650011062622, + "learning_rate": 4.0257176660855355e-05, + "loss": 3.2252, + "step": 48940 + }, + { + "epoch": 0.29106599105528597, + "grad_norm": 1.8327046632766724, + "learning_rate": 4.0256806631913155e-05, + "loss": 3.2926, + "step": 48941 + }, + { + "epoch": 0.2910719383385669, + "grad_norm": 1.9986499547958374, + "learning_rate": 4.0256436597644975e-05, + "loss": 3.1623, + "step": 48942 + }, + { + "epoch": 0.29107788562184794, + "grad_norm": 1.723263144493103, + "learning_rate": 4.025606655805092e-05, + "loss": 4.5092, + "step": 48943 + }, + { + "epoch": 0.29108383290512896, + "grad_norm": 1.4608047008514404, + "learning_rate": 4.0255696513131134e-05, + "loss": 4.8424, + "step": 48944 + }, + { + "epoch": 0.2910897801884099, + "grad_norm": 1.5937628746032715, + "learning_rate": 4.025532646288574e-05, + "loss": 4.8025, + "step": 48945 + }, + { + "epoch": 0.29109572747169093, + "grad_norm": 1.722562551498413, + "learning_rate": 4.025495640731487e-05, + "loss": 4.5581, + "step": 48946 + }, + { + "epoch": 0.29110167475497195, + "grad_norm": 1.8554260730743408, + "learning_rate": 4.025458634641865e-05, + "loss": 3.8928, + "step": 48947 + }, + { + "epoch": 0.2911076220382529, + "grad_norm": 1.8971575498580933, + "learning_rate": 4.025421628019721e-05, + "loss": 3.7934, + "step": 48948 + }, + { + "epoch": 0.2911135693215339, + "grad_norm": 1.7589452266693115, + "learning_rate": 4.0253846208650686e-05, + "loss": 4.4327, + "step": 48949 + }, + { + "epoch": 0.29111951660481494, + "grad_norm": 1.8993113040924072, + "learning_rate": 4.02534761317792e-05, + "loss": 4.5888, + "step": 48950 + }, + { + "epoch": 0.2911254638880959, + "grad_norm": 1.3229424953460693, + "learning_rate": 4.0253106049582875e-05, + "loss": 5.0928, + "step": 48951 + }, + { + "epoch": 0.2911314111713769, + "grad_norm": 1.4168795347213745, + "learning_rate": 4.025273596206186e-05, + "loss": 4.9329, + "step": 48952 + }, + { + "epoch": 0.29113735845465794, + "grad_norm": 1.41569983959198, + "learning_rate": 4.025236586921627e-05, + "loss": 4.7559, + "step": 48953 + }, + { + "epoch": 0.2911433057379389, + "grad_norm": 1.634981393814087, + "learning_rate": 4.025199577104624e-05, + "loss": 4.9082, + "step": 48954 + }, + { + "epoch": 0.2911492530212199, + "grad_norm": 1.6606131792068481, + "learning_rate": 4.0251625667551885e-05, + "loss": 4.3854, + "step": 48955 + }, + { + "epoch": 0.2911552003045009, + "grad_norm": 1.7049074172973633, + "learning_rate": 4.0251255558733346e-05, + "loss": 4.1777, + "step": 48956 + }, + { + "epoch": 0.2911611475877819, + "grad_norm": 1.6029120683670044, + "learning_rate": 4.025088544459076e-05, + "loss": 4.4442, + "step": 48957 + }, + { + "epoch": 0.2911670948710629, + "grad_norm": 1.3661177158355713, + "learning_rate": 4.025051532512425e-05, + "loss": 4.5209, + "step": 48958 + }, + { + "epoch": 0.2911730421543439, + "grad_norm": 1.590478777885437, + "learning_rate": 4.025014520033393e-05, + "loss": 4.4723, + "step": 48959 + }, + { + "epoch": 0.2911789894376249, + "grad_norm": 2.3576784133911133, + "learning_rate": 4.0249775070219947e-05, + "loss": 3.7421, + "step": 48960 + }, + { + "epoch": 0.2911849367209059, + "grad_norm": 2.8544883728027344, + "learning_rate": 4.024940493478243e-05, + "loss": 3.1299, + "step": 48961 + }, + { + "epoch": 0.2911908840041869, + "grad_norm": 2.723498821258545, + "learning_rate": 4.024903479402149e-05, + "loss": 2.9374, + "step": 48962 + }, + { + "epoch": 0.29119683128746787, + "grad_norm": 3.994497537612915, + "learning_rate": 4.0248664647937285e-05, + "loss": 2.514, + "step": 48963 + }, + { + "epoch": 0.2912027785707489, + "grad_norm": 2.3900978565216064, + "learning_rate": 4.024829449652992e-05, + "loss": 3.6488, + "step": 48964 + }, + { + "epoch": 0.2912087258540299, + "grad_norm": 2.192394733428955, + "learning_rate": 4.0247924339799536e-05, + "loss": 3.1019, + "step": 48965 + }, + { + "epoch": 0.29121467313731086, + "grad_norm": 2.684246063232422, + "learning_rate": 4.0247554177746264e-05, + "loss": 2.8362, + "step": 48966 + }, + { + "epoch": 0.2912206204205919, + "grad_norm": 3.210010528564453, + "learning_rate": 4.024718401037022e-05, + "loss": 3.2497, + "step": 48967 + }, + { + "epoch": 0.2912265677038729, + "grad_norm": 2.736170530319214, + "learning_rate": 4.024681383767155e-05, + "loss": 2.996, + "step": 48968 + }, + { + "epoch": 0.29123251498715386, + "grad_norm": 2.8896682262420654, + "learning_rate": 4.024644365965036e-05, + "loss": 2.6538, + "step": 48969 + }, + { + "epoch": 0.2912384622704349, + "grad_norm": 3.2804114818573, + "learning_rate": 4.024607347630681e-05, + "loss": 3.132, + "step": 48970 + }, + { + "epoch": 0.2912444095537159, + "grad_norm": 2.4957969188690186, + "learning_rate": 4.0245703287641e-05, + "loss": 3.4103, + "step": 48971 + }, + { + "epoch": 0.29125035683699685, + "grad_norm": 3.0824618339538574, + "learning_rate": 4.024533309365308e-05, + "loss": 2.3744, + "step": 48972 + }, + { + "epoch": 0.29125630412027786, + "grad_norm": 2.9459309577941895, + "learning_rate": 4.0244962894343174e-05, + "loss": 2.2146, + "step": 48973 + }, + { + "epoch": 0.2912622514035589, + "grad_norm": 3.235234260559082, + "learning_rate": 4.02445926897114e-05, + "loss": 1.8873, + "step": 48974 + }, + { + "epoch": 0.29126819868683984, + "grad_norm": 2.7940523624420166, + "learning_rate": 4.0244222479757906e-05, + "loss": 2.043, + "step": 48975 + }, + { + "epoch": 0.29127414597012086, + "grad_norm": 2.8329410552978516, + "learning_rate": 4.0243852264482804e-05, + "loss": 2.3538, + "step": 48976 + }, + { + "epoch": 0.2912800932534019, + "grad_norm": 2.76613450050354, + "learning_rate": 4.024348204388624e-05, + "loss": 1.9649, + "step": 48977 + }, + { + "epoch": 0.29128604053668283, + "grad_norm": 3.2704873085021973, + "learning_rate": 4.0243111817968324e-05, + "loss": 1.4701, + "step": 48978 + }, + { + "epoch": 0.29129198781996385, + "grad_norm": 3.19761061668396, + "learning_rate": 4.0242741586729205e-05, + "loss": 1.7163, + "step": 48979 + }, + { + "epoch": 0.29129793510324486, + "grad_norm": 2.188131809234619, + "learning_rate": 4.0242371350169e-05, + "loss": 4.9598, + "step": 48980 + }, + { + "epoch": 0.2913038823865258, + "grad_norm": 3.183361053466797, + "learning_rate": 4.024200110828783e-05, + "loss": 1.8594, + "step": 48981 + }, + { + "epoch": 0.29130982966980684, + "grad_norm": 3.2943148612976074, + "learning_rate": 4.024163086108585e-05, + "loss": 1.8101, + "step": 48982 + }, + { + "epoch": 0.29131577695308786, + "grad_norm": 3.7092864513397217, + "learning_rate": 4.024126060856316e-05, + "loss": 2.9796, + "step": 48983 + }, + { + "epoch": 0.2913217242363688, + "grad_norm": 3.701791763305664, + "learning_rate": 4.024089035071991e-05, + "loss": 3.7049, + "step": 48984 + }, + { + "epoch": 0.29132767151964983, + "grad_norm": 2.359251022338867, + "learning_rate": 4.024052008755622e-05, + "loss": 4.7868, + "step": 48985 + }, + { + "epoch": 0.2913336188029308, + "grad_norm": 5.375739097595215, + "learning_rate": 4.024014981907223e-05, + "loss": 3.6489, + "step": 48986 + }, + { + "epoch": 0.2913395660862118, + "grad_norm": 4.023863315582275, + "learning_rate": 4.023977954526805e-05, + "loss": 2.8577, + "step": 48987 + }, + { + "epoch": 0.2913455133694928, + "grad_norm": 3.232170581817627, + "learning_rate": 4.023940926614382e-05, + "loss": 1.7378, + "step": 48988 + }, + { + "epoch": 0.2913514606527738, + "grad_norm": 3.292628049850464, + "learning_rate": 4.023903898169968e-05, + "loss": 3.0509, + "step": 48989 + }, + { + "epoch": 0.2913574079360548, + "grad_norm": 2.057915449142456, + "learning_rate": 4.023866869193575e-05, + "loss": 4.538, + "step": 48990 + }, + { + "epoch": 0.2913633552193358, + "grad_norm": 1.9023517370224, + "learning_rate": 4.0238298396852146e-05, + "loss": 4.5666, + "step": 48991 + }, + { + "epoch": 0.2913693025026168, + "grad_norm": 2.0747458934783936, + "learning_rate": 4.023792809644902e-05, + "loss": 4.1791, + "step": 48992 + }, + { + "epoch": 0.2913752497858978, + "grad_norm": 2.5062272548675537, + "learning_rate": 4.0237557790726484e-05, + "loss": 3.6824, + "step": 48993 + }, + { + "epoch": 0.2913811970691788, + "grad_norm": 2.679063081741333, + "learning_rate": 4.023718747968468e-05, + "loss": 3.6907, + "step": 48994 + }, + { + "epoch": 0.29138714435245977, + "grad_norm": 2.3919222354888916, + "learning_rate": 4.023681716332373e-05, + "loss": 3.4674, + "step": 48995 + }, + { + "epoch": 0.2913930916357408, + "grad_norm": 1.6285731792449951, + "learning_rate": 4.023644684164376e-05, + "loss": 4.5006, + "step": 48996 + }, + { + "epoch": 0.2913990389190218, + "grad_norm": 1.711290955543518, + "learning_rate": 4.0236076514644907e-05, + "loss": 4.5186, + "step": 48997 + }, + { + "epoch": 0.29140498620230276, + "grad_norm": 1.7432425022125244, + "learning_rate": 4.023570618232729e-05, + "loss": 4.5889, + "step": 48998 + }, + { + "epoch": 0.2914109334855838, + "grad_norm": 1.5965371131896973, + "learning_rate": 4.023533584469106e-05, + "loss": 4.5926, + "step": 48999 + }, + { + "epoch": 0.2914168807688648, + "grad_norm": 1.5147252082824707, + "learning_rate": 4.023496550173632e-05, + "loss": 4.4117, + "step": 49000 + }, + { + "epoch": 0.29142282805214575, + "grad_norm": 1.6077667474746704, + "learning_rate": 4.0234595153463216e-05, + "loss": 4.7561, + "step": 49001 + }, + { + "epoch": 0.29142877533542677, + "grad_norm": 1.9641988277435303, + "learning_rate": 4.023422479987187e-05, + "loss": 4.5573, + "step": 49002 + }, + { + "epoch": 0.2914347226187078, + "grad_norm": 1.7001874446868896, + "learning_rate": 4.0233854440962413e-05, + "loss": 4.3923, + "step": 49003 + }, + { + "epoch": 0.29144066990198875, + "grad_norm": 1.628224492073059, + "learning_rate": 4.0233484076734974e-05, + "loss": 4.1359, + "step": 49004 + }, + { + "epoch": 0.29144661718526976, + "grad_norm": 1.6306486129760742, + "learning_rate": 4.023311370718969e-05, + "loss": 4.6006, + "step": 49005 + }, + { + "epoch": 0.2914525644685508, + "grad_norm": 1.749487042427063, + "learning_rate": 4.023274333232668e-05, + "loss": 4.5741, + "step": 49006 + }, + { + "epoch": 0.29145851175183174, + "grad_norm": 1.967379093170166, + "learning_rate": 4.023237295214607e-05, + "loss": 3.5958, + "step": 49007 + }, + { + "epoch": 0.29146445903511276, + "grad_norm": 1.907862901687622, + "learning_rate": 4.023200256664801e-05, + "loss": 4.5074, + "step": 49008 + }, + { + "epoch": 0.29147040631839377, + "grad_norm": 2.0075440406799316, + "learning_rate": 4.02316321758326e-05, + "loss": 4.4448, + "step": 49009 + }, + { + "epoch": 0.29147635360167473, + "grad_norm": 1.5046751499176025, + "learning_rate": 4.023126177969999e-05, + "loss": 4.6476, + "step": 49010 + }, + { + "epoch": 0.29148230088495575, + "grad_norm": 1.4729866981506348, + "learning_rate": 4.0230891378250305e-05, + "loss": 4.5847, + "step": 49011 + }, + { + "epoch": 0.29148824816823676, + "grad_norm": 1.424261212348938, + "learning_rate": 4.023052097148367e-05, + "loss": 4.7071, + "step": 49012 + }, + { + "epoch": 0.2914941954515177, + "grad_norm": 1.8476958274841309, + "learning_rate": 4.0230150559400226e-05, + "loss": 4.3992, + "step": 49013 + }, + { + "epoch": 0.29150014273479874, + "grad_norm": 1.6788079738616943, + "learning_rate": 4.0229780142000084e-05, + "loss": 4.3813, + "step": 49014 + }, + { + "epoch": 0.29150609001807976, + "grad_norm": 2.188704490661621, + "learning_rate": 4.022940971928339e-05, + "loss": 3.676, + "step": 49015 + }, + { + "epoch": 0.2915120373013607, + "grad_norm": 1.537778615951538, + "learning_rate": 4.022903929125026e-05, + "loss": 4.7427, + "step": 49016 + }, + { + "epoch": 0.29151798458464173, + "grad_norm": 1.6089632511138916, + "learning_rate": 4.0228668857900836e-05, + "loss": 4.8112, + "step": 49017 + }, + { + "epoch": 0.29152393186792275, + "grad_norm": 1.806222677230835, + "learning_rate": 4.022829841923524e-05, + "loss": 4.2768, + "step": 49018 + }, + { + "epoch": 0.2915298791512037, + "grad_norm": 1.7346224784851074, + "learning_rate": 4.02279279752536e-05, + "loss": 4.0351, + "step": 49019 + }, + { + "epoch": 0.2915358264344847, + "grad_norm": 1.6651338338851929, + "learning_rate": 4.022755752595605e-05, + "loss": 4.8883, + "step": 49020 + }, + { + "epoch": 0.29154177371776574, + "grad_norm": 1.9259669780731201, + "learning_rate": 4.022718707134271e-05, + "loss": 3.9762, + "step": 49021 + }, + { + "epoch": 0.2915477210010467, + "grad_norm": 1.8029941320419312, + "learning_rate": 4.022681661141372e-05, + "loss": 4.1638, + "step": 49022 + }, + { + "epoch": 0.2915536682843277, + "grad_norm": 1.8861573934555054, + "learning_rate": 4.0226446146169214e-05, + "loss": 4.155, + "step": 49023 + }, + { + "epoch": 0.29155961556760873, + "grad_norm": 2.0168399810791016, + "learning_rate": 4.0226075675609304e-05, + "loss": 4.4025, + "step": 49024 + }, + { + "epoch": 0.2915655628508897, + "grad_norm": 2.3168575763702393, + "learning_rate": 4.0225705199734134e-05, + "loss": 3.9563, + "step": 49025 + }, + { + "epoch": 0.2915715101341707, + "grad_norm": 2.8046817779541016, + "learning_rate": 4.0225334718543825e-05, + "loss": 2.8101, + "step": 49026 + }, + { + "epoch": 0.2915774574174517, + "grad_norm": 2.2052721977233887, + "learning_rate": 4.0224964232038506e-05, + "loss": 4.5238, + "step": 49027 + }, + { + "epoch": 0.2915834047007327, + "grad_norm": 1.8135850429534912, + "learning_rate": 4.022459374021831e-05, + "loss": 4.0248, + "step": 49028 + }, + { + "epoch": 0.2915893519840137, + "grad_norm": 2.7616493701934814, + "learning_rate": 4.022422324308337e-05, + "loss": 3.6521, + "step": 49029 + }, + { + "epoch": 0.2915952992672947, + "grad_norm": 1.6787965297698975, + "learning_rate": 4.0223852740633806e-05, + "loss": 4.8203, + "step": 49030 + }, + { + "epoch": 0.2916012465505757, + "grad_norm": 1.5364608764648438, + "learning_rate": 4.022348223286975e-05, + "loss": 4.7936, + "step": 49031 + }, + { + "epoch": 0.2916071938338567, + "grad_norm": 1.47334623336792, + "learning_rate": 4.022311171979134e-05, + "loss": 4.6928, + "step": 49032 + }, + { + "epoch": 0.2916131411171377, + "grad_norm": 1.640153408050537, + "learning_rate": 4.0222741201398694e-05, + "loss": 4.4753, + "step": 49033 + }, + { + "epoch": 0.29161908840041867, + "grad_norm": 1.7532119750976562, + "learning_rate": 4.022237067769195e-05, + "loss": 4.615, + "step": 49034 + }, + { + "epoch": 0.2916250356836997, + "grad_norm": 1.633864164352417, + "learning_rate": 4.022200014867123e-05, + "loss": 4.4376, + "step": 49035 + }, + { + "epoch": 0.2916309829669807, + "grad_norm": 1.7269270420074463, + "learning_rate": 4.022162961433667e-05, + "loss": 4.6406, + "step": 49036 + }, + { + "epoch": 0.29163693025026166, + "grad_norm": 1.952868103981018, + "learning_rate": 4.02212590746884e-05, + "loss": 4.1107, + "step": 49037 + }, + { + "epoch": 0.2916428775335427, + "grad_norm": 1.9080276489257812, + "learning_rate": 4.0220888529726545e-05, + "loss": 4.0768, + "step": 49038 + }, + { + "epoch": 0.2916488248168237, + "grad_norm": 1.8396148681640625, + "learning_rate": 4.022051797945123e-05, + "loss": 4.1752, + "step": 49039 + }, + { + "epoch": 0.29165477210010465, + "grad_norm": 1.657362461090088, + "learning_rate": 4.022014742386259e-05, + "loss": 4.8947, + "step": 49040 + }, + { + "epoch": 0.29166071938338567, + "grad_norm": 1.5216144323349, + "learning_rate": 4.021977686296076e-05, + "loss": 4.6482, + "step": 49041 + }, + { + "epoch": 0.2916666666666667, + "grad_norm": 1.5991315841674805, + "learning_rate": 4.0219406296745856e-05, + "loss": 4.4442, + "step": 49042 + }, + { + "epoch": 0.29167261394994765, + "grad_norm": 1.8341541290283203, + "learning_rate": 4.021903572521802e-05, + "loss": 4.4588, + "step": 49043 + }, + { + "epoch": 0.29167856123322866, + "grad_norm": 1.5571386814117432, + "learning_rate": 4.021866514837737e-05, + "loss": 4.6914, + "step": 49044 + }, + { + "epoch": 0.2916845085165097, + "grad_norm": 1.6046607494354248, + "learning_rate": 4.021829456622405e-05, + "loss": 4.8012, + "step": 49045 + }, + { + "epoch": 0.29169045579979064, + "grad_norm": 1.5444254875183105, + "learning_rate": 4.021792397875817e-05, + "loss": 4.5897, + "step": 49046 + }, + { + "epoch": 0.29169640308307165, + "grad_norm": 1.4090944528579712, + "learning_rate": 4.021755338597988e-05, + "loss": 4.6479, + "step": 49047 + }, + { + "epoch": 0.29170235036635267, + "grad_norm": 1.5755938291549683, + "learning_rate": 4.021718278788929e-05, + "loss": 4.8341, + "step": 49048 + }, + { + "epoch": 0.29170829764963363, + "grad_norm": 1.7611162662506104, + "learning_rate": 4.021681218448654e-05, + "loss": 4.3625, + "step": 49049 + }, + { + "epoch": 0.29171424493291465, + "grad_norm": 1.7568581104278564, + "learning_rate": 4.021644157577176e-05, + "loss": 4.9172, + "step": 49050 + }, + { + "epoch": 0.29172019221619566, + "grad_norm": 1.6059587001800537, + "learning_rate": 4.0216070961745075e-05, + "loss": 5.5185, + "step": 49051 + }, + { + "epoch": 0.2917261394994766, + "grad_norm": 1.5300272703170776, + "learning_rate": 4.0215700342406627e-05, + "loss": 5.1012, + "step": 49052 + }, + { + "epoch": 0.29173208678275764, + "grad_norm": 1.9078943729400635, + "learning_rate": 4.021532971775653e-05, + "loss": 4.2842, + "step": 49053 + }, + { + "epoch": 0.29173803406603865, + "grad_norm": 1.6377966403961182, + "learning_rate": 4.021495908779491e-05, + "loss": 4.5211, + "step": 49054 + }, + { + "epoch": 0.2917439813493196, + "grad_norm": 1.904160737991333, + "learning_rate": 4.021458845252192e-05, + "loss": 4.2394, + "step": 49055 + }, + { + "epoch": 0.29174992863260063, + "grad_norm": 1.6715806722640991, + "learning_rate": 4.021421781193766e-05, + "loss": 4.2967, + "step": 49056 + }, + { + "epoch": 0.29175587591588165, + "grad_norm": 1.6867799758911133, + "learning_rate": 4.021384716604228e-05, + "loss": 4.3171, + "step": 49057 + }, + { + "epoch": 0.2917618231991626, + "grad_norm": 1.739024043083191, + "learning_rate": 4.02134765148359e-05, + "loss": 4.2558, + "step": 49058 + }, + { + "epoch": 0.2917677704824436, + "grad_norm": 1.7100204229354858, + "learning_rate": 4.021310585831866e-05, + "loss": 4.889, + "step": 49059 + }, + { + "epoch": 0.29177371776572464, + "grad_norm": 1.974271297454834, + "learning_rate": 4.021273519649068e-05, + "loss": 4.2635, + "step": 49060 + }, + { + "epoch": 0.2917796650490056, + "grad_norm": 1.8482334613800049, + "learning_rate": 4.0212364529352085e-05, + "loss": 4.673, + "step": 49061 + }, + { + "epoch": 0.2917856123322866, + "grad_norm": 1.675781011581421, + "learning_rate": 4.021199385690302e-05, + "loss": 5.2223, + "step": 49062 + }, + { + "epoch": 0.29179155961556763, + "grad_norm": 1.5875986814498901, + "learning_rate": 4.0211623179143595e-05, + "loss": 5.075, + "step": 49063 + }, + { + "epoch": 0.2917975068988486, + "grad_norm": 1.9315588474273682, + "learning_rate": 4.0211252496073954e-05, + "loss": 4.425, + "step": 49064 + }, + { + "epoch": 0.2918034541821296, + "grad_norm": 1.8294585943222046, + "learning_rate": 4.021088180769422e-05, + "loss": 4.204, + "step": 49065 + }, + { + "epoch": 0.2918094014654106, + "grad_norm": 1.7064039707183838, + "learning_rate": 4.021051111400453e-05, + "loss": 5.2572, + "step": 49066 + }, + { + "epoch": 0.2918153487486916, + "grad_norm": 1.6065455675125122, + "learning_rate": 4.0210140415005005e-05, + "loss": 5.0166, + "step": 49067 + }, + { + "epoch": 0.2918212960319726, + "grad_norm": 1.4948194026947021, + "learning_rate": 4.0209769710695775e-05, + "loss": 5.1553, + "step": 49068 + }, + { + "epoch": 0.2918272433152536, + "grad_norm": 1.546315312385559, + "learning_rate": 4.020939900107698e-05, + "loss": 4.7142, + "step": 49069 + }, + { + "epoch": 0.2918331905985346, + "grad_norm": 1.552028775215149, + "learning_rate": 4.020902828614872e-05, + "loss": 5.6426, + "step": 49070 + }, + { + "epoch": 0.2918391378818156, + "grad_norm": 1.8904566764831543, + "learning_rate": 4.0208657565911165e-05, + "loss": 5.026, + "step": 49071 + }, + { + "epoch": 0.2918450851650966, + "grad_norm": 1.8787254095077515, + "learning_rate": 4.020828684036442e-05, + "loss": 4.5451, + "step": 49072 + }, + { + "epoch": 0.29185103244837757, + "grad_norm": 2.379425525665283, + "learning_rate": 4.0207916109508616e-05, + "loss": 4.7131, + "step": 49073 + }, + { + "epoch": 0.2918569797316586, + "grad_norm": 2.023606538772583, + "learning_rate": 4.0207545373343894e-05, + "loss": 4.9155, + "step": 49074 + }, + { + "epoch": 0.2918629270149396, + "grad_norm": 2.0350520610809326, + "learning_rate": 4.020717463187036e-05, + "loss": 4.6613, + "step": 49075 + }, + { + "epoch": 0.29186887429822056, + "grad_norm": 1.3380069732666016, + "learning_rate": 4.020680388508817e-05, + "loss": 5.1202, + "step": 49076 + }, + { + "epoch": 0.2918748215815016, + "grad_norm": 1.4349443912506104, + "learning_rate": 4.020643313299743e-05, + "loss": 5.2218, + "step": 49077 + }, + { + "epoch": 0.2918807688647826, + "grad_norm": 1.4462963342666626, + "learning_rate": 4.02060623755983e-05, + "loss": 5.3565, + "step": 49078 + }, + { + "epoch": 0.29188671614806355, + "grad_norm": 1.4389230012893677, + "learning_rate": 4.020569161289088e-05, + "loss": 5.0237, + "step": 49079 + }, + { + "epoch": 0.29189266343134457, + "grad_norm": 1.28811514377594, + "learning_rate": 4.020532084487531e-05, + "loss": 4.9839, + "step": 49080 + }, + { + "epoch": 0.2918986107146256, + "grad_norm": 1.4158865213394165, + "learning_rate": 4.0204950071551714e-05, + "loss": 4.9817, + "step": 49081 + }, + { + "epoch": 0.29190455799790654, + "grad_norm": 1.5580554008483887, + "learning_rate": 4.020457929292024e-05, + "loss": 4.9688, + "step": 49082 + }, + { + "epoch": 0.29191050528118756, + "grad_norm": 1.5083537101745605, + "learning_rate": 4.0204208508981e-05, + "loss": 5.086, + "step": 49083 + }, + { + "epoch": 0.2919164525644686, + "grad_norm": 1.4788111448287964, + "learning_rate": 4.0203837719734125e-05, + "loss": 4.983, + "step": 49084 + }, + { + "epoch": 0.29192239984774954, + "grad_norm": 1.481440782546997, + "learning_rate": 4.020346692517974e-05, + "loss": 5.0253, + "step": 49085 + }, + { + "epoch": 0.29192834713103055, + "grad_norm": 1.3477039337158203, + "learning_rate": 4.020309612531799e-05, + "loss": 5.1229, + "step": 49086 + }, + { + "epoch": 0.29193429441431157, + "grad_norm": 1.3533120155334473, + "learning_rate": 4.020272532014901e-05, + "loss": 5.4019, + "step": 49087 + }, + { + "epoch": 0.29194024169759253, + "grad_norm": 1.8901493549346924, + "learning_rate": 4.020235450967289e-05, + "loss": 4.5184, + "step": 49088 + }, + { + "epoch": 0.29194618898087354, + "grad_norm": 1.4900676012039185, + "learning_rate": 4.02019836938898e-05, + "loss": 4.9523, + "step": 49089 + }, + { + "epoch": 0.29195213626415456, + "grad_norm": 1.8112949132919312, + "learning_rate": 4.0201612872799856e-05, + "loss": 5.3704, + "step": 49090 + }, + { + "epoch": 0.2919580835474355, + "grad_norm": 1.6014530658721924, + "learning_rate": 4.0201242046403174e-05, + "loss": 5.0377, + "step": 49091 + }, + { + "epoch": 0.29196403083071654, + "grad_norm": 1.4804733991622925, + "learning_rate": 4.020087121469991e-05, + "loss": 4.9919, + "step": 49092 + }, + { + "epoch": 0.29196997811399755, + "grad_norm": 1.5506525039672852, + "learning_rate": 4.020050037769017e-05, + "loss": 5.4328, + "step": 49093 + }, + { + "epoch": 0.2919759253972785, + "grad_norm": 1.5048047304153442, + "learning_rate": 4.02001295353741e-05, + "loss": 5.1402, + "step": 49094 + }, + { + "epoch": 0.29198187268055953, + "grad_norm": 1.3704020977020264, + "learning_rate": 4.019975868775181e-05, + "loss": 4.8689, + "step": 49095 + }, + { + "epoch": 0.29198781996384054, + "grad_norm": 1.2972177267074585, + "learning_rate": 4.0199387834823446e-05, + "loss": 5.0613, + "step": 49096 + }, + { + "epoch": 0.2919937672471215, + "grad_norm": 1.4629311561584473, + "learning_rate": 4.019901697658913e-05, + "loss": 5.2686, + "step": 49097 + }, + { + "epoch": 0.2919997145304025, + "grad_norm": 1.288181185722351, + "learning_rate": 4.019864611304901e-05, + "loss": 4.987, + "step": 49098 + }, + { + "epoch": 0.29200566181368354, + "grad_norm": 1.379538655281067, + "learning_rate": 4.019827524420318e-05, + "loss": 4.9367, + "step": 49099 + }, + { + "epoch": 0.2920116090969645, + "grad_norm": 1.3563886880874634, + "learning_rate": 4.0197904370051796e-05, + "loss": 4.6801, + "step": 49100 + }, + { + "epoch": 0.2920175563802455, + "grad_norm": 1.2311102151870728, + "learning_rate": 4.0197533490594984e-05, + "loss": 5.0694, + "step": 49101 + }, + { + "epoch": 0.2920235036635265, + "grad_norm": 1.4693949222564697, + "learning_rate": 4.019716260583287e-05, + "loss": 5.0123, + "step": 49102 + }, + { + "epoch": 0.2920294509468075, + "grad_norm": 1.3002338409423828, + "learning_rate": 4.019679171576558e-05, + "loss": 4.9488, + "step": 49103 + }, + { + "epoch": 0.2920353982300885, + "grad_norm": 1.72318696975708, + "learning_rate": 4.019642082039325e-05, + "loss": 4.638, + "step": 49104 + }, + { + "epoch": 0.29204134551336947, + "grad_norm": 1.3760623931884766, + "learning_rate": 4.0196049919716004e-05, + "loss": 5.0441, + "step": 49105 + }, + { + "epoch": 0.2920472927966505, + "grad_norm": 1.389760971069336, + "learning_rate": 4.0195679013733976e-05, + "loss": 4.9034, + "step": 49106 + }, + { + "epoch": 0.2920532400799315, + "grad_norm": 1.2887393236160278, + "learning_rate": 4.019530810244729e-05, + "loss": 5.0247, + "step": 49107 + }, + { + "epoch": 0.29205918736321246, + "grad_norm": 1.2798432111740112, + "learning_rate": 4.019493718585608e-05, + "loss": 5.0062, + "step": 49108 + }, + { + "epoch": 0.2920651346464935, + "grad_norm": 1.40203857421875, + "learning_rate": 4.019456626396048e-05, + "loss": 4.9316, + "step": 49109 + }, + { + "epoch": 0.2920710819297745, + "grad_norm": 1.3244411945343018, + "learning_rate": 4.019419533676061e-05, + "loss": 4.8202, + "step": 49110 + }, + { + "epoch": 0.29207702921305545, + "grad_norm": 1.3732634782791138, + "learning_rate": 4.01938244042566e-05, + "loss": 5.015, + "step": 49111 + }, + { + "epoch": 0.29208297649633647, + "grad_norm": 1.790515661239624, + "learning_rate": 4.019345346644859e-05, + "loss": 4.8685, + "step": 49112 + }, + { + "epoch": 0.2920889237796175, + "grad_norm": 1.8550598621368408, + "learning_rate": 4.0193082523336695e-05, + "loss": 4.9729, + "step": 49113 + }, + { + "epoch": 0.29209487106289844, + "grad_norm": 1.6464852094650269, + "learning_rate": 4.0192711574921054e-05, + "loss": 4.8301, + "step": 49114 + }, + { + "epoch": 0.29210081834617946, + "grad_norm": 1.7711821794509888, + "learning_rate": 4.0192340621201794e-05, + "loss": 4.5644, + "step": 49115 + }, + { + "epoch": 0.2921067656294605, + "grad_norm": 2.008634328842163, + "learning_rate": 4.019196966217905e-05, + "loss": 4.3459, + "step": 49116 + }, + { + "epoch": 0.29211271291274143, + "grad_norm": 2.2943196296691895, + "learning_rate": 4.019159869785294e-05, + "loss": 3.333, + "step": 49117 + }, + { + "epoch": 0.29211866019602245, + "grad_norm": 1.5301496982574463, + "learning_rate": 4.01912277282236e-05, + "loss": 4.8261, + "step": 49118 + }, + { + "epoch": 0.29212460747930347, + "grad_norm": 1.4260891675949097, + "learning_rate": 4.019085675329117e-05, + "loss": 4.777, + "step": 49119 + }, + { + "epoch": 0.2921305547625844, + "grad_norm": 1.3664993047714233, + "learning_rate": 4.019048577305575e-05, + "loss": 4.6613, + "step": 49120 + }, + { + "epoch": 0.29213650204586544, + "grad_norm": 1.6004990339279175, + "learning_rate": 4.019011478751751e-05, + "loss": 4.8603, + "step": 49121 + }, + { + "epoch": 0.29214244932914646, + "grad_norm": 1.453370451927185, + "learning_rate": 4.018974379667654e-05, + "loss": 4.8119, + "step": 49122 + }, + { + "epoch": 0.2921483966124274, + "grad_norm": 1.6023846864700317, + "learning_rate": 4.0189372800532995e-05, + "loss": 4.5399, + "step": 49123 + }, + { + "epoch": 0.29215434389570843, + "grad_norm": 1.5257498025894165, + "learning_rate": 4.0189001799087e-05, + "loss": 4.8834, + "step": 49124 + }, + { + "epoch": 0.29216029117898945, + "grad_norm": 1.6109247207641602, + "learning_rate": 4.018863079233868e-05, + "loss": 5.0107, + "step": 49125 + }, + { + "epoch": 0.2921662384622704, + "grad_norm": 1.3890563249588013, + "learning_rate": 4.018825978028816e-05, + "loss": 4.8849, + "step": 49126 + }, + { + "epoch": 0.2921721857455514, + "grad_norm": 1.4233943223953247, + "learning_rate": 4.0187888762935575e-05, + "loss": 4.824, + "step": 49127 + }, + { + "epoch": 0.29217813302883244, + "grad_norm": 1.3751394748687744, + "learning_rate": 4.018751774028106e-05, + "loss": 4.6153, + "step": 49128 + }, + { + "epoch": 0.2921840803121134, + "grad_norm": 1.2199214696884155, + "learning_rate": 4.018714671232474e-05, + "loss": 4.8291, + "step": 49129 + }, + { + "epoch": 0.2921900275953944, + "grad_norm": 1.2434072494506836, + "learning_rate": 4.0186775679066745e-05, + "loss": 4.9727, + "step": 49130 + }, + { + "epoch": 0.29219597487867544, + "grad_norm": 1.2543809413909912, + "learning_rate": 4.0186404640507195e-05, + "loss": 4.9676, + "step": 49131 + }, + { + "epoch": 0.2922019221619564, + "grad_norm": 1.3311909437179565, + "learning_rate": 4.0186033596646235e-05, + "loss": 5.0732, + "step": 49132 + }, + { + "epoch": 0.2922078694452374, + "grad_norm": 1.374448299407959, + "learning_rate": 4.018566254748399e-05, + "loss": 4.9205, + "step": 49133 + }, + { + "epoch": 0.2922138167285184, + "grad_norm": 1.2923041582107544, + "learning_rate": 4.018529149302058e-05, + "loss": 4.7485, + "step": 49134 + }, + { + "epoch": 0.2922197640117994, + "grad_norm": 1.3668478727340698, + "learning_rate": 4.018492043325614e-05, + "loss": 4.5783, + "step": 49135 + }, + { + "epoch": 0.2922257112950804, + "grad_norm": 1.2057582139968872, + "learning_rate": 4.018454936819082e-05, + "loss": 4.8935, + "step": 49136 + }, + { + "epoch": 0.2922316585783614, + "grad_norm": 1.2213565111160278, + "learning_rate": 4.0184178297824706e-05, + "loss": 4.8052, + "step": 49137 + }, + { + "epoch": 0.2922376058616424, + "grad_norm": 1.1638693809509277, + "learning_rate": 4.0183807222157964e-05, + "loss": 4.857, + "step": 49138 + }, + { + "epoch": 0.2922435531449234, + "grad_norm": 1.268401026725769, + "learning_rate": 4.018343614119071e-05, + "loss": 4.969, + "step": 49139 + }, + { + "epoch": 0.2922495004282044, + "grad_norm": 1.3748703002929688, + "learning_rate": 4.0183065054923076e-05, + "loss": 4.9094, + "step": 49140 + }, + { + "epoch": 0.29225544771148537, + "grad_norm": 1.4235283136367798, + "learning_rate": 4.018269396335519e-05, + "loss": 4.7665, + "step": 49141 + }, + { + "epoch": 0.2922613949947664, + "grad_norm": 1.2364617586135864, + "learning_rate": 4.018232286648719e-05, + "loss": 4.8808, + "step": 49142 + }, + { + "epoch": 0.2922673422780474, + "grad_norm": 1.4014484882354736, + "learning_rate": 4.0181951764319186e-05, + "loss": 4.8861, + "step": 49143 + }, + { + "epoch": 0.29227328956132836, + "grad_norm": 1.3011451959609985, + "learning_rate": 4.018158065685132e-05, + "loss": 5.0496, + "step": 49144 + }, + { + "epoch": 0.2922792368446094, + "grad_norm": 1.411995768547058, + "learning_rate": 4.018120954408373e-05, + "loss": 4.6577, + "step": 49145 + }, + { + "epoch": 0.2922851841278904, + "grad_norm": 1.2160162925720215, + "learning_rate": 4.018083842601653e-05, + "loss": 4.9278, + "step": 49146 + }, + { + "epoch": 0.29229113141117136, + "grad_norm": 1.445405125617981, + "learning_rate": 4.018046730264986e-05, + "loss": 4.8211, + "step": 49147 + }, + { + "epoch": 0.2922970786944524, + "grad_norm": 1.266584873199463, + "learning_rate": 4.018009617398385e-05, + "loss": 4.9068, + "step": 49148 + }, + { + "epoch": 0.2923030259777334, + "grad_norm": 1.4448797702789307, + "learning_rate": 4.017972504001862e-05, + "loss": 4.953, + "step": 49149 + }, + { + "epoch": 0.29230897326101435, + "grad_norm": 1.2853118181228638, + "learning_rate": 4.01793539007543e-05, + "loss": 4.8687, + "step": 49150 + }, + { + "epoch": 0.29231492054429536, + "grad_norm": 1.3338589668273926, + "learning_rate": 4.017898275619103e-05, + "loss": 4.8543, + "step": 49151 + }, + { + "epoch": 0.2923208678275764, + "grad_norm": 1.4180586338043213, + "learning_rate": 4.017861160632893e-05, + "loss": 4.6979, + "step": 49152 + }, + { + "epoch": 0.29232681511085734, + "grad_norm": 1.3098088502883911, + "learning_rate": 4.017824045116814e-05, + "loss": 4.7182, + "step": 49153 + }, + { + "epoch": 0.29233276239413836, + "grad_norm": 1.2089447975158691, + "learning_rate": 4.017786929070878e-05, + "loss": 4.6055, + "step": 49154 + }, + { + "epoch": 0.2923387096774194, + "grad_norm": 1.3824656009674072, + "learning_rate": 4.017749812495098e-05, + "loss": 4.4924, + "step": 49155 + }, + { + "epoch": 0.29234465696070033, + "grad_norm": 1.5357953310012817, + "learning_rate": 4.017712695389487e-05, + "loss": 4.0907, + "step": 49156 + }, + { + "epoch": 0.29235060424398135, + "grad_norm": 1.555621862411499, + "learning_rate": 4.017675577754059e-05, + "loss": 4.2974, + "step": 49157 + }, + { + "epoch": 0.29235655152726236, + "grad_norm": 1.317092776298523, + "learning_rate": 4.017638459588826e-05, + "loss": 4.5639, + "step": 49158 + }, + { + "epoch": 0.2923624988105433, + "grad_norm": 1.3342170715332031, + "learning_rate": 4.0176013408938005e-05, + "loss": 4.5748, + "step": 49159 + }, + { + "epoch": 0.29236844609382434, + "grad_norm": 1.455074667930603, + "learning_rate": 4.017564221668997e-05, + "loss": 4.5908, + "step": 49160 + }, + { + "epoch": 0.29237439337710536, + "grad_norm": 1.1931740045547485, + "learning_rate": 4.017527101914427e-05, + "loss": 4.5332, + "step": 49161 + }, + { + "epoch": 0.2923803406603863, + "grad_norm": 1.25167715549469, + "learning_rate": 4.017489981630103e-05, + "loss": 4.5077, + "step": 49162 + }, + { + "epoch": 0.29238628794366733, + "grad_norm": 1.5132813453674316, + "learning_rate": 4.01745286081604e-05, + "loss": 4.9111, + "step": 49163 + }, + { + "epoch": 0.29239223522694835, + "grad_norm": 1.3877792358398438, + "learning_rate": 4.01741573947225e-05, + "loss": 5.1642, + "step": 49164 + }, + { + "epoch": 0.2923981825102293, + "grad_norm": 1.4559231996536255, + "learning_rate": 4.0173786175987446e-05, + "loss": 5.0545, + "step": 49165 + }, + { + "epoch": 0.2924041297935103, + "grad_norm": 1.4111216068267822, + "learning_rate": 4.0173414951955395e-05, + "loss": 4.6874, + "step": 49166 + }, + { + "epoch": 0.29241007707679134, + "grad_norm": 1.3272346258163452, + "learning_rate": 4.017304372262646e-05, + "loss": 5.0965, + "step": 49167 + }, + { + "epoch": 0.2924160243600723, + "grad_norm": 1.2915570735931396, + "learning_rate": 4.017267248800076e-05, + "loss": 4.7085, + "step": 49168 + }, + { + "epoch": 0.2924219716433533, + "grad_norm": 1.3380718231201172, + "learning_rate": 4.0172301248078445e-05, + "loss": 4.0732, + "step": 49169 + }, + { + "epoch": 0.29242791892663433, + "grad_norm": 1.8665608167648315, + "learning_rate": 4.0171930002859633e-05, + "loss": 3.3707, + "step": 49170 + }, + { + "epoch": 0.2924338662099153, + "grad_norm": 1.38138747215271, + "learning_rate": 4.017155875234446e-05, + "loss": 4.9878, + "step": 49171 + }, + { + "epoch": 0.2924398134931963, + "grad_norm": 1.564352035522461, + "learning_rate": 4.0171187496533055e-05, + "loss": 4.895, + "step": 49172 + }, + { + "epoch": 0.2924457607764773, + "grad_norm": 1.304746150970459, + "learning_rate": 4.0170816235425546e-05, + "loss": 5.1605, + "step": 49173 + }, + { + "epoch": 0.2924517080597583, + "grad_norm": 1.2918643951416016, + "learning_rate": 4.017044496902206e-05, + "loss": 5.122, + "step": 49174 + }, + { + "epoch": 0.2924576553430393, + "grad_norm": 1.591590404510498, + "learning_rate": 4.017007369732273e-05, + "loss": 4.335, + "step": 49175 + }, + { + "epoch": 0.2924636026263203, + "grad_norm": 2.054715394973755, + "learning_rate": 4.0169702420327674e-05, + "loss": 2.9604, + "step": 49176 + }, + { + "epoch": 0.2924695499096013, + "grad_norm": 2.068392276763916, + "learning_rate": 4.0169331138037036e-05, + "loss": 2.967, + "step": 49177 + }, + { + "epoch": 0.2924754971928823, + "grad_norm": 1.9173328876495361, + "learning_rate": 4.016895985045095e-05, + "loss": 2.94, + "step": 49178 + }, + { + "epoch": 0.2924814444761633, + "grad_norm": 2.084082841873169, + "learning_rate": 4.016858855756953e-05, + "loss": 2.916, + "step": 49179 + }, + { + "epoch": 0.29248739175944427, + "grad_norm": 2.043991804122925, + "learning_rate": 4.016821725939291e-05, + "loss": 2.9765, + "step": 49180 + }, + { + "epoch": 0.2924933390427253, + "grad_norm": 1.7212384939193726, + "learning_rate": 4.016784595592122e-05, + "loss": 3.306, + "step": 49181 + }, + { + "epoch": 0.2924992863260063, + "grad_norm": 2.060072660446167, + "learning_rate": 4.01674746471546e-05, + "loss": 2.6376, + "step": 49182 + }, + { + "epoch": 0.29250523360928726, + "grad_norm": 1.9566768407821655, + "learning_rate": 4.016710333309317e-05, + "loss": 2.7047, + "step": 49183 + }, + { + "epoch": 0.2925111808925683, + "grad_norm": 2.0073883533477783, + "learning_rate": 4.016673201373706e-05, + "loss": 2.8602, + "step": 49184 + }, + { + "epoch": 0.2925171281758493, + "grad_norm": 2.1142237186431885, + "learning_rate": 4.01663606890864e-05, + "loss": 2.9765, + "step": 49185 + }, + { + "epoch": 0.29252307545913026, + "grad_norm": 2.1051015853881836, + "learning_rate": 4.0165989359141324e-05, + "loss": 2.6777, + "step": 49186 + }, + { + "epoch": 0.29252902274241127, + "grad_norm": 2.3999698162078857, + "learning_rate": 4.016561802390195e-05, + "loss": 2.5735, + "step": 49187 + }, + { + "epoch": 0.2925349700256923, + "grad_norm": 2.5016870498657227, + "learning_rate": 4.016524668336842e-05, + "loss": 2.779, + "step": 49188 + }, + { + "epoch": 0.29254091730897325, + "grad_norm": 2.387848138809204, + "learning_rate": 4.016487533754085e-05, + "loss": 2.7098, + "step": 49189 + }, + { + "epoch": 0.29254686459225426, + "grad_norm": 2.1959500312805176, + "learning_rate": 4.0164503986419394e-05, + "loss": 2.7801, + "step": 49190 + }, + { + "epoch": 0.2925528118755353, + "grad_norm": 2.293478012084961, + "learning_rate": 4.016413263000416e-05, + "loss": 2.8223, + "step": 49191 + }, + { + "epoch": 0.29255875915881624, + "grad_norm": 2.1132450103759766, + "learning_rate": 4.016376126829528e-05, + "loss": 3.1478, + "step": 49192 + }, + { + "epoch": 0.29256470644209726, + "grad_norm": 2.3130617141723633, + "learning_rate": 4.0163389901292894e-05, + "loss": 2.6302, + "step": 49193 + }, + { + "epoch": 0.29257065372537827, + "grad_norm": 2.2223098278045654, + "learning_rate": 4.016301852899712e-05, + "loss": 2.9168, + "step": 49194 + }, + { + "epoch": 0.29257660100865923, + "grad_norm": 2.1085915565490723, + "learning_rate": 4.01626471514081e-05, + "loss": 2.6954, + "step": 49195 + }, + { + "epoch": 0.29258254829194025, + "grad_norm": 2.337503671646118, + "learning_rate": 4.016227576852595e-05, + "loss": 2.5824, + "step": 49196 + }, + { + "epoch": 0.29258849557522126, + "grad_norm": 2.136174440383911, + "learning_rate": 4.0161904380350804e-05, + "loss": 2.8968, + "step": 49197 + }, + { + "epoch": 0.2925944428585022, + "grad_norm": 1.776828646659851, + "learning_rate": 4.0161532986882795e-05, + "loss": 3.9541, + "step": 49198 + }, + { + "epoch": 0.29260039014178324, + "grad_norm": 1.6567389965057373, + "learning_rate": 4.016116158812205e-05, + "loss": 4.7661, + "step": 49199 + }, + { + "epoch": 0.29260633742506426, + "grad_norm": 1.8943603038787842, + "learning_rate": 4.0160790184068705e-05, + "loss": 4.5612, + "step": 49200 + }, + { + "epoch": 0.2926122847083452, + "grad_norm": 1.8563474416732788, + "learning_rate": 4.016041877472289e-05, + "loss": 4.5584, + "step": 49201 + }, + { + "epoch": 0.29261823199162623, + "grad_norm": 1.6343352794647217, + "learning_rate": 4.016004736008472e-05, + "loss": 4.5892, + "step": 49202 + }, + { + "epoch": 0.29262417927490725, + "grad_norm": 2.1527905464172363, + "learning_rate": 4.015967594015434e-05, + "loss": 3.3653, + "step": 49203 + }, + { + "epoch": 0.2926301265581882, + "grad_norm": 2.0897629261016846, + "learning_rate": 4.0159304514931865e-05, + "loss": 3.3803, + "step": 49204 + }, + { + "epoch": 0.2926360738414692, + "grad_norm": 2.609333038330078, + "learning_rate": 4.015893308441744e-05, + "loss": 2.8098, + "step": 49205 + }, + { + "epoch": 0.29264202112475024, + "grad_norm": 2.2983717918395996, + "learning_rate": 4.0158561648611195e-05, + "loss": 2.9436, + "step": 49206 + }, + { + "epoch": 0.2926479684080312, + "grad_norm": 2.1847212314605713, + "learning_rate": 4.015819020751324e-05, + "loss": 3.1202, + "step": 49207 + }, + { + "epoch": 0.2926539156913122, + "grad_norm": 2.286381244659424, + "learning_rate": 4.015781876112372e-05, + "loss": 2.9506, + "step": 49208 + }, + { + "epoch": 0.29265986297459323, + "grad_norm": 1.9837905168533325, + "learning_rate": 4.0157447309442763e-05, + "loss": 3.1501, + "step": 49209 + }, + { + "epoch": 0.2926658102578742, + "grad_norm": 1.5269136428833008, + "learning_rate": 4.0157075852470506e-05, + "loss": 4.7876, + "step": 49210 + }, + { + "epoch": 0.2926717575411552, + "grad_norm": 1.838835597038269, + "learning_rate": 4.0156704390207064e-05, + "loss": 3.5122, + "step": 49211 + }, + { + "epoch": 0.2926777048244362, + "grad_norm": 2.118927478790283, + "learning_rate": 4.015633292265257e-05, + "loss": 2.9785, + "step": 49212 + }, + { + "epoch": 0.2926836521077172, + "grad_norm": 2.1010994911193848, + "learning_rate": 4.0155961449807156e-05, + "loss": 2.8163, + "step": 49213 + }, + { + "epoch": 0.2926895993909982, + "grad_norm": 2.263800859451294, + "learning_rate": 4.015558997167096e-05, + "loss": 2.9205, + "step": 49214 + }, + { + "epoch": 0.2926955466742792, + "grad_norm": 2.052384376525879, + "learning_rate": 4.0155218488244096e-05, + "loss": 2.9604, + "step": 49215 + }, + { + "epoch": 0.2927014939575602, + "grad_norm": 2.0031611919403076, + "learning_rate": 4.0154846999526704e-05, + "loss": 2.604, + "step": 49216 + }, + { + "epoch": 0.2927074412408412, + "grad_norm": 2.1211140155792236, + "learning_rate": 4.015447550551892e-05, + "loss": 2.9633, + "step": 49217 + }, + { + "epoch": 0.29271338852412215, + "grad_norm": 2.503384590148926, + "learning_rate": 4.015410400622086e-05, + "loss": 2.6711, + "step": 49218 + }, + { + "epoch": 0.29271933580740317, + "grad_norm": 1.774691104888916, + "learning_rate": 4.015373250163266e-05, + "loss": 4.3577, + "step": 49219 + }, + { + "epoch": 0.2927252830906842, + "grad_norm": 1.7078086137771606, + "learning_rate": 4.015336099175444e-05, + "loss": 4.4725, + "step": 49220 + }, + { + "epoch": 0.29273123037396515, + "grad_norm": 2.020296812057495, + "learning_rate": 4.0152989476586344e-05, + "loss": 4.6705, + "step": 49221 + }, + { + "epoch": 0.29273717765724616, + "grad_norm": 1.943189263343811, + "learning_rate": 4.01526179561285e-05, + "loss": 4.5634, + "step": 49222 + }, + { + "epoch": 0.2927431249405272, + "grad_norm": 1.914955973625183, + "learning_rate": 4.015224643038102e-05, + "loss": 4.5239, + "step": 49223 + }, + { + "epoch": 0.29274907222380814, + "grad_norm": 1.9562156200408936, + "learning_rate": 4.015187489934407e-05, + "loss": 4.3737, + "step": 49224 + }, + { + "epoch": 0.29275501950708915, + "grad_norm": 1.9216886758804321, + "learning_rate": 4.015150336301774e-05, + "loss": 4.1816, + "step": 49225 + }, + { + "epoch": 0.29276096679037017, + "grad_norm": 2.047133684158325, + "learning_rate": 4.015113182140218e-05, + "loss": 4.3434, + "step": 49226 + }, + { + "epoch": 0.29276691407365113, + "grad_norm": 2.567417860031128, + "learning_rate": 4.015076027449751e-05, + "loss": 3.2573, + "step": 49227 + }, + { + "epoch": 0.29277286135693215, + "grad_norm": 1.6746103763580322, + "learning_rate": 4.015038872230388e-05, + "loss": 4.5888, + "step": 49228 + }, + { + "epoch": 0.29277880864021316, + "grad_norm": 1.7047680616378784, + "learning_rate": 4.01500171648214e-05, + "loss": 4.2728, + "step": 49229 + }, + { + "epoch": 0.2927847559234941, + "grad_norm": 1.5659304857254028, + "learning_rate": 4.01496456020502e-05, + "loss": 4.4597, + "step": 49230 + }, + { + "epoch": 0.29279070320677514, + "grad_norm": 1.608600378036499, + "learning_rate": 4.014927403399042e-05, + "loss": 4.5786, + "step": 49231 + }, + { + "epoch": 0.29279665049005615, + "grad_norm": 1.2918272018432617, + "learning_rate": 4.014890246064219e-05, + "loss": 4.9019, + "step": 49232 + }, + { + "epoch": 0.2928025977733371, + "grad_norm": 1.56415593624115, + "learning_rate": 4.014853088200562e-05, + "loss": 4.3173, + "step": 49233 + }, + { + "epoch": 0.29280854505661813, + "grad_norm": 2.5124502182006836, + "learning_rate": 4.014815929808087e-05, + "loss": 3.0209, + "step": 49234 + }, + { + "epoch": 0.29281449233989915, + "grad_norm": 2.250667095184326, + "learning_rate": 4.014778770886804e-05, + "loss": 2.999, + "step": 49235 + }, + { + "epoch": 0.2928204396231801, + "grad_norm": 2.125417947769165, + "learning_rate": 4.014741611436729e-05, + "loss": 2.882, + "step": 49236 + }, + { + "epoch": 0.2928263869064611, + "grad_norm": 2.248586416244507, + "learning_rate": 4.014704451457872e-05, + "loss": 3.0276, + "step": 49237 + }, + { + "epoch": 0.29283233418974214, + "grad_norm": 2.1592791080474854, + "learning_rate": 4.0146672909502484e-05, + "loss": 2.9792, + "step": 49238 + }, + { + "epoch": 0.2928382814730231, + "grad_norm": 2.2062795162200928, + "learning_rate": 4.01463012991387e-05, + "loss": 3.0214, + "step": 49239 + }, + { + "epoch": 0.2928442287563041, + "grad_norm": 2.173588514328003, + "learning_rate": 4.014592968348749e-05, + "loss": 2.9809, + "step": 49240 + }, + { + "epoch": 0.29285017603958513, + "grad_norm": 2.0355138778686523, + "learning_rate": 4.0145558062549e-05, + "loss": 3.0342, + "step": 49241 + }, + { + "epoch": 0.2928561233228661, + "grad_norm": 2.06750750541687, + "learning_rate": 4.0145186436323346e-05, + "loss": 3.2033, + "step": 49242 + }, + { + "epoch": 0.2928620706061471, + "grad_norm": 1.9676392078399658, + "learning_rate": 4.014481480481067e-05, + "loss": 2.9099, + "step": 49243 + }, + { + "epoch": 0.2928680178894281, + "grad_norm": 2.030637502670288, + "learning_rate": 4.014444316801109e-05, + "loss": 2.8309, + "step": 49244 + }, + { + "epoch": 0.2928739651727091, + "grad_norm": 1.9066814184188843, + "learning_rate": 4.0144071525924744e-05, + "loss": 3.244, + "step": 49245 + }, + { + "epoch": 0.2928799124559901, + "grad_norm": 1.9869228601455688, + "learning_rate": 4.014369987855175e-05, + "loss": 3.0151, + "step": 49246 + }, + { + "epoch": 0.2928858597392711, + "grad_norm": 2.074063539505005, + "learning_rate": 4.0143328225892265e-05, + "loss": 2.838, + "step": 49247 + }, + { + "epoch": 0.2928918070225521, + "grad_norm": 2.00007963180542, + "learning_rate": 4.014295656794639e-05, + "loss": 2.9394, + "step": 49248 + }, + { + "epoch": 0.2928977543058331, + "grad_norm": 2.083700656890869, + "learning_rate": 4.0142584904714265e-05, + "loss": 3.0571, + "step": 49249 + }, + { + "epoch": 0.2929037015891141, + "grad_norm": 2.108919620513916, + "learning_rate": 4.0142213236196024e-05, + "loss": 3.0027, + "step": 49250 + }, + { + "epoch": 0.29290964887239507, + "grad_norm": 2.1288180351257324, + "learning_rate": 4.014184156239179e-05, + "loss": 3.0036, + "step": 49251 + }, + { + "epoch": 0.2929155961556761, + "grad_norm": 1.9961433410644531, + "learning_rate": 4.0141469883301696e-05, + "loss": 2.9501, + "step": 49252 + }, + { + "epoch": 0.2929215434389571, + "grad_norm": 2.004672050476074, + "learning_rate": 4.014109819892587e-05, + "loss": 2.9512, + "step": 49253 + }, + { + "epoch": 0.29292749072223806, + "grad_norm": 2.047600746154785, + "learning_rate": 4.014072650926444e-05, + "loss": 2.8046, + "step": 49254 + }, + { + "epoch": 0.2929334380055191, + "grad_norm": 2.1862688064575195, + "learning_rate": 4.014035481431755e-05, + "loss": 2.9222, + "step": 49255 + }, + { + "epoch": 0.2929393852888001, + "grad_norm": 2.2067477703094482, + "learning_rate": 4.013998311408531e-05, + "loss": 2.992, + "step": 49256 + }, + { + "epoch": 0.29294533257208105, + "grad_norm": 1.9849894046783447, + "learning_rate": 4.0139611408567854e-05, + "loss": 3.0213, + "step": 49257 + }, + { + "epoch": 0.29295127985536207, + "grad_norm": 2.0480904579162598, + "learning_rate": 4.013923969776533e-05, + "loss": 2.4942, + "step": 49258 + }, + { + "epoch": 0.2929572271386431, + "grad_norm": 2.144587516784668, + "learning_rate": 4.013886798167784e-05, + "loss": 2.4999, + "step": 49259 + }, + { + "epoch": 0.29296317442192404, + "grad_norm": 2.1544368267059326, + "learning_rate": 4.0138496260305535e-05, + "loss": 2.5384, + "step": 49260 + }, + { + "epoch": 0.29296912170520506, + "grad_norm": 2.0462679862976074, + "learning_rate": 4.013812453364854e-05, + "loss": 2.8696, + "step": 49261 + }, + { + "epoch": 0.2929750689884861, + "grad_norm": 1.8164616823196411, + "learning_rate": 4.013775280170698e-05, + "loss": 4.8577, + "step": 49262 + }, + { + "epoch": 0.29298101627176704, + "grad_norm": 1.871985912322998, + "learning_rate": 4.013738106448098e-05, + "loss": 4.1721, + "step": 49263 + }, + { + "epoch": 0.29298696355504805, + "grad_norm": 3.1372358798980713, + "learning_rate": 4.0137009321970684e-05, + "loss": 3.256, + "step": 49264 + }, + { + "epoch": 0.29299291083832907, + "grad_norm": 2.238518714904785, + "learning_rate": 4.0136637574176204e-05, + "loss": 3.6078, + "step": 49265 + }, + { + "epoch": 0.29299885812161003, + "grad_norm": 1.5271133184432983, + "learning_rate": 4.013626582109769e-05, + "loss": 4.3383, + "step": 49266 + }, + { + "epoch": 0.29300480540489104, + "grad_norm": 2.358369827270508, + "learning_rate": 4.0135894062735265e-05, + "loss": 2.9369, + "step": 49267 + }, + { + "epoch": 0.29301075268817206, + "grad_norm": 2.125227212905884, + "learning_rate": 4.0135522299089044e-05, + "loss": 3.1548, + "step": 49268 + }, + { + "epoch": 0.293016699971453, + "grad_norm": 2.3705294132232666, + "learning_rate": 4.013515053015918e-05, + "loss": 2.9562, + "step": 49269 + }, + { + "epoch": 0.29302264725473404, + "grad_norm": 2.1409835815429688, + "learning_rate": 4.013477875594579e-05, + "loss": 2.7551, + "step": 49270 + }, + { + "epoch": 0.29302859453801505, + "grad_norm": 1.884218454360962, + "learning_rate": 4.0134406976449e-05, + "loss": 3.2335, + "step": 49271 + }, + { + "epoch": 0.293034541821296, + "grad_norm": 1.466855764389038, + "learning_rate": 4.013403519166895e-05, + "loss": 4.7425, + "step": 49272 + }, + { + "epoch": 0.29304048910457703, + "grad_norm": 1.3998157978057861, + "learning_rate": 4.013366340160576e-05, + "loss": 4.4706, + "step": 49273 + }, + { + "epoch": 0.29304643638785804, + "grad_norm": 1.3637070655822754, + "learning_rate": 4.013329160625956e-05, + "loss": 4.7855, + "step": 49274 + }, + { + "epoch": 0.293052383671139, + "grad_norm": 2.0565714836120605, + "learning_rate": 4.013291980563049e-05, + "loss": 3.9979, + "step": 49275 + }, + { + "epoch": 0.29305833095442, + "grad_norm": 2.787105083465576, + "learning_rate": 4.013254799971868e-05, + "loss": 3.091, + "step": 49276 + }, + { + "epoch": 0.29306427823770104, + "grad_norm": 2.507822275161743, + "learning_rate": 4.013217618852424e-05, + "loss": 3.1503, + "step": 49277 + }, + { + "epoch": 0.293070225520982, + "grad_norm": 2.3429770469665527, + "learning_rate": 4.013180437204732e-05, + "loss": 3.1053, + "step": 49278 + }, + { + "epoch": 0.293076172804263, + "grad_norm": 1.67976713180542, + "learning_rate": 4.0131432550288054e-05, + "loss": 4.1634, + "step": 49279 + }, + { + "epoch": 0.29308212008754403, + "grad_norm": 1.7044157981872559, + "learning_rate": 4.013106072324655e-05, + "loss": 4.2329, + "step": 49280 + }, + { + "epoch": 0.293088067370825, + "grad_norm": 2.3319027423858643, + "learning_rate": 4.0130688890922955e-05, + "loss": 3.6029, + "step": 49281 + }, + { + "epoch": 0.293094014654106, + "grad_norm": 2.4801554679870605, + "learning_rate": 4.013031705331739e-05, + "loss": 3.747, + "step": 49282 + }, + { + "epoch": 0.293099961937387, + "grad_norm": 1.7234660387039185, + "learning_rate": 4.0129945210429974e-05, + "loss": 4.1915, + "step": 49283 + }, + { + "epoch": 0.293105909220668, + "grad_norm": 1.6249020099639893, + "learning_rate": 4.012957336226087e-05, + "loss": 4.3879, + "step": 49284 + }, + { + "epoch": 0.293111856503949, + "grad_norm": 1.50216805934906, + "learning_rate": 4.0129201508810175e-05, + "loss": 4.5249, + "step": 49285 + }, + { + "epoch": 0.29311780378723, + "grad_norm": 1.6388593912124634, + "learning_rate": 4.012882965007804e-05, + "loss": 4.3703, + "step": 49286 + }, + { + "epoch": 0.293123751070511, + "grad_norm": 1.6741087436676025, + "learning_rate": 4.012845778606459e-05, + "loss": 4.3289, + "step": 49287 + }, + { + "epoch": 0.293129698353792, + "grad_norm": 1.5481507778167725, + "learning_rate": 4.012808591676995e-05, + "loss": 4.4087, + "step": 49288 + }, + { + "epoch": 0.293135645637073, + "grad_norm": 1.7456262111663818, + "learning_rate": 4.0127714042194245e-05, + "loss": 4.1756, + "step": 49289 + }, + { + "epoch": 0.29314159292035397, + "grad_norm": 1.5286809206008911, + "learning_rate": 4.012734216233761e-05, + "loss": 4.6533, + "step": 49290 + }, + { + "epoch": 0.293147540203635, + "grad_norm": 1.9579784870147705, + "learning_rate": 4.012697027720018e-05, + "loss": 3.7228, + "step": 49291 + }, + { + "epoch": 0.293153487486916, + "grad_norm": 1.789642333984375, + "learning_rate": 4.012659838678209e-05, + "loss": 3.0483, + "step": 49292 + }, + { + "epoch": 0.29315943477019696, + "grad_norm": 1.9565316438674927, + "learning_rate": 4.012622649108345e-05, + "loss": 3.7422, + "step": 49293 + }, + { + "epoch": 0.293165382053478, + "grad_norm": 1.7226577997207642, + "learning_rate": 4.0125854590104404e-05, + "loss": 4.2841, + "step": 49294 + }, + { + "epoch": 0.293171329336759, + "grad_norm": 1.6231575012207031, + "learning_rate": 4.0125482683845075e-05, + "loss": 4.2633, + "step": 49295 + }, + { + "epoch": 0.29317727662003995, + "grad_norm": 1.3721699714660645, + "learning_rate": 4.0125110772305594e-05, + "loss": 4.8235, + "step": 49296 + }, + { + "epoch": 0.29318322390332097, + "grad_norm": 1.722458839416504, + "learning_rate": 4.01247388554861e-05, + "loss": 4.2664, + "step": 49297 + }, + { + "epoch": 0.293189171186602, + "grad_norm": 1.7728153467178345, + "learning_rate": 4.0124366933386715e-05, + "loss": 4.9059, + "step": 49298 + }, + { + "epoch": 0.29319511846988294, + "grad_norm": 1.9284926652908325, + "learning_rate": 4.012399500600757e-05, + "loss": 4.828, + "step": 49299 + }, + { + "epoch": 0.29320106575316396, + "grad_norm": 1.508221983909607, + "learning_rate": 4.012362307334879e-05, + "loss": 4.8284, + "step": 49300 + }, + { + "epoch": 0.293207013036445, + "grad_norm": 1.33418869972229, + "learning_rate": 4.012325113541052e-05, + "loss": 4.7746, + "step": 49301 + }, + { + "epoch": 0.29321296031972593, + "grad_norm": 1.4174906015396118, + "learning_rate": 4.012287919219287e-05, + "loss": 4.4996, + "step": 49302 + }, + { + "epoch": 0.29321890760300695, + "grad_norm": 1.6339670419692993, + "learning_rate": 4.012250724369599e-05, + "loss": 4.687, + "step": 49303 + }, + { + "epoch": 0.29322485488628797, + "grad_norm": 1.4915950298309326, + "learning_rate": 4.012213528991998e-05, + "loss": 4.9532, + "step": 49304 + }, + { + "epoch": 0.2932308021695689, + "grad_norm": 1.6776494979858398, + "learning_rate": 4.0121763330865e-05, + "loss": 4.5826, + "step": 49305 + }, + { + "epoch": 0.29323674945284994, + "grad_norm": 1.6790953874588013, + "learning_rate": 4.012139136653117e-05, + "loss": 4.3766, + "step": 49306 + }, + { + "epoch": 0.29324269673613096, + "grad_norm": 1.7504117488861084, + "learning_rate": 4.012101939691861e-05, + "loss": 4.6232, + "step": 49307 + }, + { + "epoch": 0.2932486440194119, + "grad_norm": 1.5456147193908691, + "learning_rate": 4.012064742202747e-05, + "loss": 4.4124, + "step": 49308 + }, + { + "epoch": 0.29325459130269294, + "grad_norm": 1.7577556371688843, + "learning_rate": 4.0120275441857855e-05, + "loss": 4.3379, + "step": 49309 + }, + { + "epoch": 0.29326053858597395, + "grad_norm": 1.8829541206359863, + "learning_rate": 4.011990345640992e-05, + "loss": 4.2815, + "step": 49310 + }, + { + "epoch": 0.2932664858692549, + "grad_norm": 1.7720166444778442, + "learning_rate": 4.011953146568378e-05, + "loss": 4.1442, + "step": 49311 + }, + { + "epoch": 0.2932724331525359, + "grad_norm": 1.5448179244995117, + "learning_rate": 4.011915946967957e-05, + "loss": 4.7356, + "step": 49312 + }, + { + "epoch": 0.29327838043581694, + "grad_norm": 1.4255551099777222, + "learning_rate": 4.011878746839741e-05, + "loss": 4.629, + "step": 49313 + }, + { + "epoch": 0.2932843277190979, + "grad_norm": 3.931576728820801, + "learning_rate": 4.0118415461837436e-05, + "loss": 3.0434, + "step": 49314 + }, + { + "epoch": 0.2932902750023789, + "grad_norm": 3.859776258468628, + "learning_rate": 4.011804344999979e-05, + "loss": 1.4988, + "step": 49315 + }, + { + "epoch": 0.29329622228565994, + "grad_norm": 3.3701717853546143, + "learning_rate": 4.011767143288459e-05, + "loss": 1.5449, + "step": 49316 + }, + { + "epoch": 0.2933021695689409, + "grad_norm": 3.9124388694763184, + "learning_rate": 4.011729941049196e-05, + "loss": 1.3399, + "step": 49317 + }, + { + "epoch": 0.2933081168522219, + "grad_norm": 4.053889274597168, + "learning_rate": 4.011692738282204e-05, + "loss": 1.7824, + "step": 49318 + }, + { + "epoch": 0.2933140641355029, + "grad_norm": 2.512424945831299, + "learning_rate": 4.011655534987496e-05, + "loss": 2.289, + "step": 49319 + }, + { + "epoch": 0.2933200114187839, + "grad_norm": 2.3260364532470703, + "learning_rate": 4.011618331165083e-05, + "loss": 2.8848, + "step": 49320 + }, + { + "epoch": 0.2933259587020649, + "grad_norm": 4.995601654052734, + "learning_rate": 4.0115811268149824e-05, + "loss": 1.3617, + "step": 49321 + }, + { + "epoch": 0.2933319059853459, + "grad_norm": 5.491086959838867, + "learning_rate": 4.0115439219372025e-05, + "loss": 1.427, + "step": 49322 + }, + { + "epoch": 0.2933378532686269, + "grad_norm": 4.341436862945557, + "learning_rate": 4.011506716531759e-05, + "loss": 1.6384, + "step": 49323 + }, + { + "epoch": 0.2933438005519079, + "grad_norm": 3.4932117462158203, + "learning_rate": 4.011469510598664e-05, + "loss": 1.2867, + "step": 49324 + }, + { + "epoch": 0.2933497478351889, + "grad_norm": 2.990868091583252, + "learning_rate": 4.01143230413793e-05, + "loss": 1.7043, + "step": 49325 + }, + { + "epoch": 0.2933556951184699, + "grad_norm": 3.2780425548553467, + "learning_rate": 4.011395097149572e-05, + "loss": 2.9756, + "step": 49326 + }, + { + "epoch": 0.2933616424017509, + "grad_norm": 3.052107095718384, + "learning_rate": 4.0113578896336e-05, + "loss": 1.441, + "step": 49327 + }, + { + "epoch": 0.2933675896850319, + "grad_norm": 3.276918649673462, + "learning_rate": 4.0113206815900296e-05, + "loss": 1.376, + "step": 49328 + }, + { + "epoch": 0.29337353696831286, + "grad_norm": 3.2020423412323, + "learning_rate": 4.011283473018872e-05, + "loss": 2.6815, + "step": 49329 + }, + { + "epoch": 0.2933794842515939, + "grad_norm": 2.910623788833618, + "learning_rate": 4.011246263920142e-05, + "loss": 2.568, + "step": 49330 + }, + { + "epoch": 0.2933854315348749, + "grad_norm": 2.111797571182251, + "learning_rate": 4.011209054293851e-05, + "loss": 4.2438, + "step": 49331 + }, + { + "epoch": 0.29339137881815586, + "grad_norm": 2.812105417251587, + "learning_rate": 4.011171844140012e-05, + "loss": 5.0563, + "step": 49332 + }, + { + "epoch": 0.2933973261014369, + "grad_norm": 2.49458909034729, + "learning_rate": 4.01113463345864e-05, + "loss": 5.0365, + "step": 49333 + }, + { + "epoch": 0.29340327338471783, + "grad_norm": 1.853857398033142, + "learning_rate": 4.011097422249744e-05, + "loss": 4.8974, + "step": 49334 + }, + { + "epoch": 0.29340922066799885, + "grad_norm": 1.6304044723510742, + "learning_rate": 4.011060210513342e-05, + "loss": 4.2853, + "step": 49335 + }, + { + "epoch": 0.29341516795127986, + "grad_norm": 1.8953238725662231, + "learning_rate": 4.011022998249443e-05, + "loss": 3.9648, + "step": 49336 + }, + { + "epoch": 0.2934211152345608, + "grad_norm": 2.351682424545288, + "learning_rate": 4.0109857854580623e-05, + "loss": 3.8257, + "step": 49337 + }, + { + "epoch": 0.29342706251784184, + "grad_norm": 2.4541220664978027, + "learning_rate": 4.0109485721392115e-05, + "loss": 3.5666, + "step": 49338 + }, + { + "epoch": 0.29343300980112286, + "grad_norm": 2.721095323562622, + "learning_rate": 4.010911358292905e-05, + "loss": 3.6346, + "step": 49339 + }, + { + "epoch": 0.2934389570844038, + "grad_norm": 1.8979957103729248, + "learning_rate": 4.010874143919154e-05, + "loss": 4.0227, + "step": 49340 + }, + { + "epoch": 0.29344490436768483, + "grad_norm": 1.7713874578475952, + "learning_rate": 4.0108369290179724e-05, + "loss": 4.8542, + "step": 49341 + }, + { + "epoch": 0.29345085165096585, + "grad_norm": 1.4122660160064697, + "learning_rate": 4.010799713589374e-05, + "loss": 4.5788, + "step": 49342 + }, + { + "epoch": 0.2934567989342468, + "grad_norm": 1.459855079650879, + "learning_rate": 4.01076249763337e-05, + "loss": 4.896, + "step": 49343 + }, + { + "epoch": 0.2934627462175278, + "grad_norm": 1.4390596151351929, + "learning_rate": 4.010725281149976e-05, + "loss": 4.9393, + "step": 49344 + }, + { + "epoch": 0.29346869350080884, + "grad_norm": 1.4203726053237915, + "learning_rate": 4.010688064139202e-05, + "loss": 4.8365, + "step": 49345 + }, + { + "epoch": 0.2934746407840898, + "grad_norm": 2.3326401710510254, + "learning_rate": 4.010650846601063e-05, + "loss": 3.352, + "step": 49346 + }, + { + "epoch": 0.2934805880673708, + "grad_norm": 2.167098045349121, + "learning_rate": 4.0106136285355714e-05, + "loss": 3.2756, + "step": 49347 + }, + { + "epoch": 0.29348653535065183, + "grad_norm": 1.7014461755752563, + "learning_rate": 4.0105764099427404e-05, + "loss": 4.0865, + "step": 49348 + }, + { + "epoch": 0.2934924826339328, + "grad_norm": 1.504925012588501, + "learning_rate": 4.010539190822582e-05, + "loss": 5.1741, + "step": 49349 + }, + { + "epoch": 0.2934984299172138, + "grad_norm": 1.5749180316925049, + "learning_rate": 4.0105019711751115e-05, + "loss": 5.0394, + "step": 49350 + }, + { + "epoch": 0.2935043772004948, + "grad_norm": 1.4985921382904053, + "learning_rate": 4.0104647510003394e-05, + "loss": 4.8565, + "step": 49351 + }, + { + "epoch": 0.2935103244837758, + "grad_norm": 2.3064632415771484, + "learning_rate": 4.0104275302982784e-05, + "loss": 4.1716, + "step": 49352 + }, + { + "epoch": 0.2935162717670568, + "grad_norm": 1.521232008934021, + "learning_rate": 4.010390309068944e-05, + "loss": 5.0234, + "step": 49353 + }, + { + "epoch": 0.2935222190503378, + "grad_norm": 1.3453755378723145, + "learning_rate": 4.010353087312348e-05, + "loss": 4.8423, + "step": 49354 + }, + { + "epoch": 0.2935281663336188, + "grad_norm": 1.3761593103408813, + "learning_rate": 4.010315865028503e-05, + "loss": 4.879, + "step": 49355 + }, + { + "epoch": 0.2935341136168998, + "grad_norm": 1.5858482122421265, + "learning_rate": 4.010278642217423e-05, + "loss": 4.7155, + "step": 49356 + }, + { + "epoch": 0.2935400609001808, + "grad_norm": 1.4788687229156494, + "learning_rate": 4.01024141887912e-05, + "loss": 4.8486, + "step": 49357 + }, + { + "epoch": 0.29354600818346177, + "grad_norm": 1.524683952331543, + "learning_rate": 4.010204195013607e-05, + "loss": 5.0059, + "step": 49358 + }, + { + "epoch": 0.2935519554667428, + "grad_norm": 1.3980497121810913, + "learning_rate": 4.0101669706208974e-05, + "loss": 4.979, + "step": 49359 + }, + { + "epoch": 0.2935579027500238, + "grad_norm": 1.5253469944000244, + "learning_rate": 4.010129745701005e-05, + "loss": 4.6641, + "step": 49360 + }, + { + "epoch": 0.29356385003330476, + "grad_norm": 1.8202985525131226, + "learning_rate": 4.010092520253941e-05, + "loss": 4.9785, + "step": 49361 + }, + { + "epoch": 0.2935697973165858, + "grad_norm": 1.7456949949264526, + "learning_rate": 4.0100552942797194e-05, + "loss": 5.1384, + "step": 49362 + }, + { + "epoch": 0.2935757445998668, + "grad_norm": 1.6954090595245361, + "learning_rate": 4.0100180677783524e-05, + "loss": 4.1121, + "step": 49363 + }, + { + "epoch": 0.29358169188314776, + "grad_norm": 1.5467814207077026, + "learning_rate": 4.009980840749855e-05, + "loss": 4.2, + "step": 49364 + }, + { + "epoch": 0.29358763916642877, + "grad_norm": 1.511171817779541, + "learning_rate": 4.0099436131942386e-05, + "loss": 4.5979, + "step": 49365 + }, + { + "epoch": 0.2935935864497098, + "grad_norm": 2.0819289684295654, + "learning_rate": 4.009906385111516e-05, + "loss": 3.4928, + "step": 49366 + }, + { + "epoch": 0.29359953373299075, + "grad_norm": 1.6854758262634277, + "learning_rate": 4.009869156501701e-05, + "loss": 3.4645, + "step": 49367 + }, + { + "epoch": 0.29360548101627176, + "grad_norm": 1.8262251615524292, + "learning_rate": 4.009831927364807e-05, + "loss": 3.9581, + "step": 49368 + }, + { + "epoch": 0.2936114282995528, + "grad_norm": 3.5364327430725098, + "learning_rate": 4.009794697700845e-05, + "loss": 3.6811, + "step": 49369 + }, + { + "epoch": 0.29361737558283374, + "grad_norm": 1.872059941291809, + "learning_rate": 4.009757467509829e-05, + "loss": 4.4294, + "step": 49370 + }, + { + "epoch": 0.29362332286611476, + "grad_norm": 1.5291401147842407, + "learning_rate": 4.009720236791774e-05, + "loss": 4.7502, + "step": 49371 + }, + { + "epoch": 0.29362927014939577, + "grad_norm": 1.4373022317886353, + "learning_rate": 4.0096830055466896e-05, + "loss": 4.4887, + "step": 49372 + }, + { + "epoch": 0.29363521743267673, + "grad_norm": 1.7501726150512695, + "learning_rate": 4.0096457737745904e-05, + "loss": 4.5895, + "step": 49373 + }, + { + "epoch": 0.29364116471595775, + "grad_norm": 1.4465432167053223, + "learning_rate": 4.009608541475492e-05, + "loss": 4.8763, + "step": 49374 + }, + { + "epoch": 0.29364711199923876, + "grad_norm": 1.9906188249588013, + "learning_rate": 4.0095713086494016e-05, + "loss": 4.4897, + "step": 49375 + }, + { + "epoch": 0.2936530592825197, + "grad_norm": 1.9423469305038452, + "learning_rate": 4.009534075296337e-05, + "loss": 4.761, + "step": 49376 + }, + { + "epoch": 0.29365900656580074, + "grad_norm": 1.958939552307129, + "learning_rate": 4.00949684141631e-05, + "loss": 4.7412, + "step": 49377 + }, + { + "epoch": 0.29366495384908176, + "grad_norm": 1.6118289232254028, + "learning_rate": 4.009459607009333e-05, + "loss": 5.0712, + "step": 49378 + }, + { + "epoch": 0.2936709011323627, + "grad_norm": 2.152541160583496, + "learning_rate": 4.009422372075419e-05, + "loss": 4.0193, + "step": 49379 + }, + { + "epoch": 0.29367684841564373, + "grad_norm": 2.101736307144165, + "learning_rate": 4.0093851366145806e-05, + "loss": 4.2737, + "step": 49380 + }, + { + "epoch": 0.29368279569892475, + "grad_norm": 2.2449464797973633, + "learning_rate": 4.009347900626832e-05, + "loss": 4.2962, + "step": 49381 + }, + { + "epoch": 0.2936887429822057, + "grad_norm": 2.5117874145507812, + "learning_rate": 4.009310664112186e-05, + "loss": 4.1763, + "step": 49382 + }, + { + "epoch": 0.2936946902654867, + "grad_norm": 1.8216824531555176, + "learning_rate": 4.0092734270706556e-05, + "loss": 3.9672, + "step": 49383 + }, + { + "epoch": 0.29370063754876774, + "grad_norm": 1.723238229751587, + "learning_rate": 4.009236189502252e-05, + "loss": 3.9226, + "step": 49384 + }, + { + "epoch": 0.2937065848320487, + "grad_norm": 2.085994005203247, + "learning_rate": 4.0091989514069905e-05, + "loss": 3.4305, + "step": 49385 + }, + { + "epoch": 0.2937125321153297, + "grad_norm": 1.4770087003707886, + "learning_rate": 4.0091617127848836e-05, + "loss": 4.7793, + "step": 49386 + }, + { + "epoch": 0.29371847939861073, + "grad_norm": 1.374234676361084, + "learning_rate": 4.009124473635943e-05, + "loss": 5.1029, + "step": 49387 + }, + { + "epoch": 0.2937244266818917, + "grad_norm": 1.7161891460418701, + "learning_rate": 4.009087233960184e-05, + "loss": 4.8254, + "step": 49388 + }, + { + "epoch": 0.2937303739651727, + "grad_norm": 1.6441552639007568, + "learning_rate": 4.009049993757618e-05, + "loss": 3.8381, + "step": 49389 + }, + { + "epoch": 0.2937363212484537, + "grad_norm": 1.7871599197387695, + "learning_rate": 4.009012753028257e-05, + "loss": 3.9091, + "step": 49390 + }, + { + "epoch": 0.2937422685317347, + "grad_norm": 1.7380318641662598, + "learning_rate": 4.0089755117721155e-05, + "loss": 3.9318, + "step": 49391 + }, + { + "epoch": 0.2937482158150157, + "grad_norm": 2.1632983684539795, + "learning_rate": 4.008938269989206e-05, + "loss": 3.3241, + "step": 49392 + }, + { + "epoch": 0.2937541630982967, + "grad_norm": 1.8484078645706177, + "learning_rate": 4.008901027679543e-05, + "loss": 3.2989, + "step": 49393 + }, + { + "epoch": 0.2937601103815777, + "grad_norm": 1.6658250093460083, + "learning_rate": 4.0088637848431374e-05, + "loss": 4.563, + "step": 49394 + }, + { + "epoch": 0.2937660576648587, + "grad_norm": 1.7671692371368408, + "learning_rate": 4.0088265414800035e-05, + "loss": 4.5613, + "step": 49395 + }, + { + "epoch": 0.2937720049481397, + "grad_norm": 1.838183045387268, + "learning_rate": 4.0087892975901534e-05, + "loss": 4.2083, + "step": 49396 + }, + { + "epoch": 0.29377795223142067, + "grad_norm": 1.7684600353240967, + "learning_rate": 4.008752053173601e-05, + "loss": 4.6833, + "step": 49397 + }, + { + "epoch": 0.2937838995147017, + "grad_norm": 1.6295762062072754, + "learning_rate": 4.0087148082303585e-05, + "loss": 4.2007, + "step": 49398 + }, + { + "epoch": 0.2937898467979827, + "grad_norm": 1.7927545309066772, + "learning_rate": 4.0086775627604396e-05, + "loss": 3.7091, + "step": 49399 + }, + { + "epoch": 0.29379579408126366, + "grad_norm": 1.412373423576355, + "learning_rate": 4.0086403167638565e-05, + "loss": 4.3485, + "step": 49400 + }, + { + "epoch": 0.2938017413645447, + "grad_norm": 1.671017050743103, + "learning_rate": 4.008603070240623e-05, + "loss": 4.6106, + "step": 49401 + }, + { + "epoch": 0.2938076886478257, + "grad_norm": 1.7925090789794922, + "learning_rate": 4.008565823190751e-05, + "loss": 4.548, + "step": 49402 + }, + { + "epoch": 0.29381363593110665, + "grad_norm": 1.4513484239578247, + "learning_rate": 4.008528575614256e-05, + "loss": 4.8012, + "step": 49403 + }, + { + "epoch": 0.29381958321438767, + "grad_norm": 1.4216326475143433, + "learning_rate": 4.008491327511147e-05, + "loss": 4.7813, + "step": 49404 + }, + { + "epoch": 0.2938255304976687, + "grad_norm": 1.610303521156311, + "learning_rate": 4.008454078881441e-05, + "loss": 4.4064, + "step": 49405 + }, + { + "epoch": 0.29383147778094965, + "grad_norm": 1.564060926437378, + "learning_rate": 4.0084168297251485e-05, + "loss": 4.5124, + "step": 49406 + }, + { + "epoch": 0.29383742506423066, + "grad_norm": 1.5551462173461914, + "learning_rate": 4.0083795800422835e-05, + "loss": 4.6284, + "step": 49407 + }, + { + "epoch": 0.2938433723475117, + "grad_norm": 1.5292097330093384, + "learning_rate": 4.008342329832859e-05, + "loss": 4.5528, + "step": 49408 + }, + { + "epoch": 0.29384931963079264, + "grad_norm": 1.5936193466186523, + "learning_rate": 4.0083050790968865e-05, + "loss": 4.5015, + "step": 49409 + }, + { + "epoch": 0.29385526691407365, + "grad_norm": 1.6577214002609253, + "learning_rate": 4.0082678278343824e-05, + "loss": 4.5, + "step": 49410 + }, + { + "epoch": 0.29386121419735467, + "grad_norm": 1.5840816497802734, + "learning_rate": 4.008230576045355e-05, + "loss": 4.4958, + "step": 49411 + }, + { + "epoch": 0.29386716148063563, + "grad_norm": 1.5566798448562622, + "learning_rate": 4.008193323729822e-05, + "loss": 3.7865, + "step": 49412 + }, + { + "epoch": 0.29387310876391665, + "grad_norm": 1.554174542427063, + "learning_rate": 4.008156070887794e-05, + "loss": 3.9855, + "step": 49413 + }, + { + "epoch": 0.29387905604719766, + "grad_norm": 1.8377610445022583, + "learning_rate": 4.008118817519283e-05, + "loss": 4.3256, + "step": 49414 + }, + { + "epoch": 0.2938850033304786, + "grad_norm": 1.3734989166259766, + "learning_rate": 4.0080815636243044e-05, + "loss": 3.7909, + "step": 49415 + }, + { + "epoch": 0.29389095061375964, + "grad_norm": 1.6343480348587036, + "learning_rate": 4.00804430920287e-05, + "loss": 3.9352, + "step": 49416 + }, + { + "epoch": 0.29389689789704065, + "grad_norm": 1.5219416618347168, + "learning_rate": 4.008007054254993e-05, + "loss": 3.7271, + "step": 49417 + }, + { + "epoch": 0.2939028451803216, + "grad_norm": 1.894149899482727, + "learning_rate": 4.007969798780686e-05, + "loss": 3.5894, + "step": 49418 + }, + { + "epoch": 0.29390879246360263, + "grad_norm": 2.0440149307250977, + "learning_rate": 4.007932542779963e-05, + "loss": 2.87, + "step": 49419 + }, + { + "epoch": 0.29391473974688365, + "grad_norm": 2.0040671825408936, + "learning_rate": 4.0078952862528354e-05, + "loss": 3.3013, + "step": 49420 + }, + { + "epoch": 0.2939206870301646, + "grad_norm": 1.8136789798736572, + "learning_rate": 4.007858029199317e-05, + "loss": 4.1438, + "step": 49421 + }, + { + "epoch": 0.2939266343134456, + "grad_norm": 1.5151705741882324, + "learning_rate": 4.007820771619422e-05, + "loss": 4.2833, + "step": 49422 + }, + { + "epoch": 0.29393258159672664, + "grad_norm": 1.5394614934921265, + "learning_rate": 4.007783513513161e-05, + "loss": 3.8134, + "step": 49423 + }, + { + "epoch": 0.2939385288800076, + "grad_norm": 2.108350992202759, + "learning_rate": 4.00774625488055e-05, + "loss": 3.6972, + "step": 49424 + }, + { + "epoch": 0.2939444761632886, + "grad_norm": 1.5557148456573486, + "learning_rate": 4.0077089957215995e-05, + "loss": 4.5164, + "step": 49425 + }, + { + "epoch": 0.29395042344656963, + "grad_norm": 2.1284263134002686, + "learning_rate": 4.007671736036324e-05, + "loss": 3.7212, + "step": 49426 + }, + { + "epoch": 0.2939563707298506, + "grad_norm": 2.100660562515259, + "learning_rate": 4.007634475824734e-05, + "loss": 3.0168, + "step": 49427 + }, + { + "epoch": 0.2939623180131316, + "grad_norm": 2.069596767425537, + "learning_rate": 4.0075972150868464e-05, + "loss": 2.9451, + "step": 49428 + }, + { + "epoch": 0.2939682652964126, + "grad_norm": 2.164553642272949, + "learning_rate": 4.0075599538226714e-05, + "loss": 3.4727, + "step": 49429 + }, + { + "epoch": 0.2939742125796936, + "grad_norm": 1.4731096029281616, + "learning_rate": 4.0075226920322226e-05, + "loss": 4.7716, + "step": 49430 + }, + { + "epoch": 0.2939801598629746, + "grad_norm": 1.7366396188735962, + "learning_rate": 4.007485429715514e-05, + "loss": 4.4632, + "step": 49431 + }, + { + "epoch": 0.2939861071462556, + "grad_norm": 1.5518993139266968, + "learning_rate": 4.007448166872557e-05, + "loss": 5.0244, + "step": 49432 + }, + { + "epoch": 0.2939920544295366, + "grad_norm": 2.154679298400879, + "learning_rate": 4.007410903503365e-05, + "loss": 3.174, + "step": 49433 + }, + { + "epoch": 0.2939980017128176, + "grad_norm": 1.5504206418991089, + "learning_rate": 4.007373639607953e-05, + "loss": 3.8823, + "step": 49434 + }, + { + "epoch": 0.2940039489960986, + "grad_norm": 1.4603132009506226, + "learning_rate": 4.0073363751863314e-05, + "loss": 4.2233, + "step": 49435 + }, + { + "epoch": 0.29400989627937957, + "grad_norm": 1.6073012351989746, + "learning_rate": 4.007299110238514e-05, + "loss": 4.7241, + "step": 49436 + }, + { + "epoch": 0.2940158435626606, + "grad_norm": 2.0462398529052734, + "learning_rate": 4.0072618447645145e-05, + "loss": 4.1314, + "step": 49437 + }, + { + "epoch": 0.2940217908459416, + "grad_norm": 2.026238203048706, + "learning_rate": 4.007224578764345e-05, + "loss": 4.1162, + "step": 49438 + }, + { + "epoch": 0.29402773812922256, + "grad_norm": 1.536693811416626, + "learning_rate": 4.00718731223802e-05, + "loss": 5.1146, + "step": 49439 + }, + { + "epoch": 0.2940336854125036, + "grad_norm": 1.9420758485794067, + "learning_rate": 4.00715004518555e-05, + "loss": 4.2348, + "step": 49440 + }, + { + "epoch": 0.2940396326957846, + "grad_norm": 1.5566378831863403, + "learning_rate": 4.0071127776069507e-05, + "loss": 4.7511, + "step": 49441 + }, + { + "epoch": 0.29404557997906555, + "grad_norm": 1.408695101737976, + "learning_rate": 4.007075509502233e-05, + "loss": 4.44, + "step": 49442 + }, + { + "epoch": 0.29405152726234657, + "grad_norm": 1.5278630256652832, + "learning_rate": 4.0070382408714125e-05, + "loss": 4.3962, + "step": 49443 + }, + { + "epoch": 0.2940574745456276, + "grad_norm": 1.4581726789474487, + "learning_rate": 4.0070009717144984e-05, + "loss": 4.4658, + "step": 49444 + }, + { + "epoch": 0.29406342182890854, + "grad_norm": 1.4200488328933716, + "learning_rate": 4.006963702031507e-05, + "loss": 4.8298, + "step": 49445 + }, + { + "epoch": 0.29406936911218956, + "grad_norm": 1.6573102474212646, + "learning_rate": 4.0069264318224506e-05, + "loss": 4.3435, + "step": 49446 + }, + { + "epoch": 0.2940753163954706, + "grad_norm": 1.8364208936691284, + "learning_rate": 4.0068891610873406e-05, + "loss": 4.1115, + "step": 49447 + }, + { + "epoch": 0.29408126367875154, + "grad_norm": 1.6737971305847168, + "learning_rate": 4.0068518898261913e-05, + "loss": 4.5162, + "step": 49448 + }, + { + "epoch": 0.29408721096203255, + "grad_norm": 1.5980006456375122, + "learning_rate": 4.006814618039016e-05, + "loss": 4.3808, + "step": 49449 + }, + { + "epoch": 0.29409315824531357, + "grad_norm": 1.64032781124115, + "learning_rate": 4.006777345725827e-05, + "loss": 4.6567, + "step": 49450 + }, + { + "epoch": 0.29409910552859453, + "grad_norm": 1.5219091176986694, + "learning_rate": 4.006740072886638e-05, + "loss": 4.8169, + "step": 49451 + }, + { + "epoch": 0.29410505281187554, + "grad_norm": 1.9013652801513672, + "learning_rate": 4.0067027995214614e-05, + "loss": 4.7055, + "step": 49452 + }, + { + "epoch": 0.2941110000951565, + "grad_norm": 1.4495056867599487, + "learning_rate": 4.006665525630311e-05, + "loss": 4.7895, + "step": 49453 + }, + { + "epoch": 0.2941169473784375, + "grad_norm": 1.5288000106811523, + "learning_rate": 4.006628251213198e-05, + "loss": 4.6119, + "step": 49454 + }, + { + "epoch": 0.29412289466171854, + "grad_norm": 1.4522624015808105, + "learning_rate": 4.0065909762701375e-05, + "loss": 4.62, + "step": 49455 + }, + { + "epoch": 0.2941288419449995, + "grad_norm": 1.5066338777542114, + "learning_rate": 4.006553700801141e-05, + "loss": 4.7858, + "step": 49456 + }, + { + "epoch": 0.2941347892282805, + "grad_norm": 1.5668492317199707, + "learning_rate": 4.006516424806223e-05, + "loss": 4.3931, + "step": 49457 + }, + { + "epoch": 0.29414073651156153, + "grad_norm": 1.1262774467468262, + "learning_rate": 4.0064791482853956e-05, + "loss": 4.5079, + "step": 49458 + }, + { + "epoch": 0.2941466837948425, + "grad_norm": 1.7373217344284058, + "learning_rate": 4.006441871238671e-05, + "loss": 4.0142, + "step": 49459 + }, + { + "epoch": 0.2941526310781235, + "grad_norm": 1.3441243171691895, + "learning_rate": 4.0064045936660645e-05, + "loss": 4.7942, + "step": 49460 + }, + { + "epoch": 0.2941585783614045, + "grad_norm": 1.5051817893981934, + "learning_rate": 4.006367315567586e-05, + "loss": 4.9021, + "step": 49461 + }, + { + "epoch": 0.2941645256446855, + "grad_norm": 1.534929871559143, + "learning_rate": 4.006330036943252e-05, + "loss": 4.4706, + "step": 49462 + }, + { + "epoch": 0.2941704729279665, + "grad_norm": 1.4393450021743774, + "learning_rate": 4.006292757793073e-05, + "loss": 4.8562, + "step": 49463 + }, + { + "epoch": 0.2941764202112475, + "grad_norm": 1.6632447242736816, + "learning_rate": 4.0062554781170626e-05, + "loss": 5.0742, + "step": 49464 + }, + { + "epoch": 0.2941823674945285, + "grad_norm": 1.695550560951233, + "learning_rate": 4.006218197915234e-05, + "loss": 4.6261, + "step": 49465 + }, + { + "epoch": 0.2941883147778095, + "grad_norm": 1.588354468345642, + "learning_rate": 4.0061809171875997e-05, + "loss": 4.41, + "step": 49466 + }, + { + "epoch": 0.2941942620610905, + "grad_norm": 1.586192011833191, + "learning_rate": 4.006143635934174e-05, + "loss": 4.619, + "step": 49467 + }, + { + "epoch": 0.29420020934437147, + "grad_norm": 1.3659547567367554, + "learning_rate": 4.006106354154968e-05, + "loss": 4.9828, + "step": 49468 + }, + { + "epoch": 0.2942061566276525, + "grad_norm": 1.4488731622695923, + "learning_rate": 4.0060690718499973e-05, + "loss": 5.0811, + "step": 49469 + }, + { + "epoch": 0.2942121039109335, + "grad_norm": 1.5559865236282349, + "learning_rate": 4.0060317890192725e-05, + "loss": 4.2576, + "step": 49470 + }, + { + "epoch": 0.29421805119421446, + "grad_norm": 1.8377102613449097, + "learning_rate": 4.005994505662808e-05, + "loss": 4.171, + "step": 49471 + }, + { + "epoch": 0.2942239984774955, + "grad_norm": 1.7591997385025024, + "learning_rate": 4.005957221780616e-05, + "loss": 3.9425, + "step": 49472 + }, + { + "epoch": 0.2942299457607765, + "grad_norm": 1.6870343685150146, + "learning_rate": 4.0059199373727106e-05, + "loss": 4.0857, + "step": 49473 + }, + { + "epoch": 0.29423589304405745, + "grad_norm": 1.5010745525360107, + "learning_rate": 4.005882652439104e-05, + "loss": 4.5829, + "step": 49474 + }, + { + "epoch": 0.29424184032733847, + "grad_norm": 1.4338940382003784, + "learning_rate": 4.005845366979809e-05, + "loss": 4.868, + "step": 49475 + }, + { + "epoch": 0.2942477876106195, + "grad_norm": 1.717128038406372, + "learning_rate": 4.005808080994838e-05, + "loss": 4.1339, + "step": 49476 + }, + { + "epoch": 0.29425373489390044, + "grad_norm": 1.553295373916626, + "learning_rate": 4.005770794484206e-05, + "loss": 4.554, + "step": 49477 + }, + { + "epoch": 0.29425968217718146, + "grad_norm": 1.4355870485305786, + "learning_rate": 4.005733507447925e-05, + "loss": 4.8837, + "step": 49478 + }, + { + "epoch": 0.2942656294604625, + "grad_norm": 1.6862362623214722, + "learning_rate": 4.005696219886008e-05, + "loss": 4.4999, + "step": 49479 + }, + { + "epoch": 0.29427157674374343, + "grad_norm": 1.5905489921569824, + "learning_rate": 4.0056589317984675e-05, + "loss": 4.7104, + "step": 49480 + }, + { + "epoch": 0.29427752402702445, + "grad_norm": 1.7178646326065063, + "learning_rate": 4.005621643185318e-05, + "loss": 4.3287, + "step": 49481 + }, + { + "epoch": 0.29428347131030547, + "grad_norm": 1.5274256467819214, + "learning_rate": 4.0055843540465704e-05, + "loss": 4.4944, + "step": 49482 + }, + { + "epoch": 0.2942894185935864, + "grad_norm": 1.7360132932662964, + "learning_rate": 4.005547064382239e-05, + "loss": 3.7114, + "step": 49483 + }, + { + "epoch": 0.29429536587686744, + "grad_norm": 2.3474807739257812, + "learning_rate": 4.0055097741923376e-05, + "loss": 2.9784, + "step": 49484 + }, + { + "epoch": 0.29430131316014846, + "grad_norm": 1.8217111825942993, + "learning_rate": 4.005472483476878e-05, + "loss": 4.2803, + "step": 49485 + }, + { + "epoch": 0.2943072604434294, + "grad_norm": 1.5790483951568604, + "learning_rate": 4.0054351922358727e-05, + "loss": 4.5765, + "step": 49486 + }, + { + "epoch": 0.29431320772671044, + "grad_norm": 1.5367205142974854, + "learning_rate": 4.0053979004693364e-05, + "loss": 4.8433, + "step": 49487 + }, + { + "epoch": 0.29431915500999145, + "grad_norm": 3.0685389041900635, + "learning_rate": 4.005360608177281e-05, + "loss": 3.6697, + "step": 49488 + }, + { + "epoch": 0.2943251022932724, + "grad_norm": 2.312809705734253, + "learning_rate": 4.005323315359719e-05, + "loss": 3.9027, + "step": 49489 + }, + { + "epoch": 0.2943310495765534, + "grad_norm": 1.7252910137176514, + "learning_rate": 4.005286022016665e-05, + "loss": 4.2975, + "step": 49490 + }, + { + "epoch": 0.29433699685983444, + "grad_norm": 1.6337417364120483, + "learning_rate": 4.005248728148131e-05, + "loss": 4.4766, + "step": 49491 + }, + { + "epoch": 0.2943429441431154, + "grad_norm": 2.0553340911865234, + "learning_rate": 4.00521143375413e-05, + "loss": 3.593, + "step": 49492 + }, + { + "epoch": 0.2943488914263964, + "grad_norm": 1.841911792755127, + "learning_rate": 4.0051741388346765e-05, + "loss": 4.0946, + "step": 49493 + }, + { + "epoch": 0.29435483870967744, + "grad_norm": 1.7552002668380737, + "learning_rate": 4.0051368433897804e-05, + "loss": 4.776, + "step": 49494 + }, + { + "epoch": 0.2943607859929584, + "grad_norm": 1.6741359233856201, + "learning_rate": 4.0050995474194576e-05, + "loss": 4.5812, + "step": 49495 + }, + { + "epoch": 0.2943667332762394, + "grad_norm": 1.603284239768982, + "learning_rate": 4.00506225092372e-05, + "loss": 4.3944, + "step": 49496 + }, + { + "epoch": 0.2943726805595204, + "grad_norm": 1.6841472387313843, + "learning_rate": 4.005024953902581e-05, + "loss": 4.244, + "step": 49497 + }, + { + "epoch": 0.2943786278428014, + "grad_norm": 1.6104322671890259, + "learning_rate": 4.004987656356053e-05, + "loss": 5.2796, + "step": 49498 + }, + { + "epoch": 0.2943845751260824, + "grad_norm": 1.8939082622528076, + "learning_rate": 4.00495035828415e-05, + "loss": 3.6665, + "step": 49499 + }, + { + "epoch": 0.2943905224093634, + "grad_norm": 1.8256762027740479, + "learning_rate": 4.004913059686883e-05, + "loss": 4.3734, + "step": 49500 + }, + { + "epoch": 0.2943964696926444, + "grad_norm": 1.8690446615219116, + "learning_rate": 4.004875760564267e-05, + "loss": 4.4812, + "step": 49501 + }, + { + "epoch": 0.2944024169759254, + "grad_norm": 1.8708750009536743, + "learning_rate": 4.004838460916315e-05, + "loss": 4.2545, + "step": 49502 + }, + { + "epoch": 0.2944083642592064, + "grad_norm": 2.0394856929779053, + "learning_rate": 4.004801160743038e-05, + "loss": 4.0692, + "step": 49503 + }, + { + "epoch": 0.2944143115424874, + "grad_norm": 1.5782626867294312, + "learning_rate": 4.004763860044452e-05, + "loss": 4.3395, + "step": 49504 + }, + { + "epoch": 0.2944202588257684, + "grad_norm": 1.763609766960144, + "learning_rate": 4.004726558820568e-05, + "loss": 3.8106, + "step": 49505 + }, + { + "epoch": 0.2944262061090494, + "grad_norm": 2.0640177726745605, + "learning_rate": 4.0046892570714e-05, + "loss": 3.9365, + "step": 49506 + }, + { + "epoch": 0.29443215339233036, + "grad_norm": 1.7292381525039673, + "learning_rate": 4.00465195479696e-05, + "loss": 4.2991, + "step": 49507 + }, + { + "epoch": 0.2944381006756114, + "grad_norm": 1.5970176458358765, + "learning_rate": 4.004614651997261e-05, + "loss": 4.3334, + "step": 49508 + }, + { + "epoch": 0.2944440479588924, + "grad_norm": 1.6984236240386963, + "learning_rate": 4.004577348672317e-05, + "loss": 4.5981, + "step": 49509 + }, + { + "epoch": 0.29444999524217336, + "grad_norm": 1.7372815608978271, + "learning_rate": 4.0045400448221404e-05, + "loss": 4.2252, + "step": 49510 + }, + { + "epoch": 0.2944559425254544, + "grad_norm": 1.6140018701553345, + "learning_rate": 4.004502740446745e-05, + "loss": 4.1993, + "step": 49511 + }, + { + "epoch": 0.2944618898087354, + "grad_norm": 1.8884526491165161, + "learning_rate": 4.004465435546143e-05, + "loss": 3.5821, + "step": 49512 + }, + { + "epoch": 0.29446783709201635, + "grad_norm": 1.919862151145935, + "learning_rate": 4.004428130120347e-05, + "loss": 3.9109, + "step": 49513 + }, + { + "epoch": 0.29447378437529736, + "grad_norm": 1.8930010795593262, + "learning_rate": 4.0043908241693716e-05, + "loss": 3.7321, + "step": 49514 + }, + { + "epoch": 0.2944797316585784, + "grad_norm": 1.9443068504333496, + "learning_rate": 4.004353517693229e-05, + "loss": 3.8104, + "step": 49515 + }, + { + "epoch": 0.29448567894185934, + "grad_norm": 1.8199032545089722, + "learning_rate": 4.0043162106919315e-05, + "loss": 3.8491, + "step": 49516 + }, + { + "epoch": 0.29449162622514036, + "grad_norm": 1.631809949874878, + "learning_rate": 4.004278903165493e-05, + "loss": 4.0398, + "step": 49517 + }, + { + "epoch": 0.2944975735084214, + "grad_norm": 1.70535147190094, + "learning_rate": 4.004241595113926e-05, + "loss": 4.1371, + "step": 49518 + }, + { + "epoch": 0.29450352079170233, + "grad_norm": 1.7419474124908447, + "learning_rate": 4.004204286537244e-05, + "loss": 3.8654, + "step": 49519 + }, + { + "epoch": 0.29450946807498335, + "grad_norm": 1.9221603870391846, + "learning_rate": 4.00416697743546e-05, + "loss": 4.1424, + "step": 49520 + }, + { + "epoch": 0.29451541535826437, + "grad_norm": 1.8201521635055542, + "learning_rate": 4.004129667808587e-05, + "loss": 4.0595, + "step": 49521 + }, + { + "epoch": 0.2945213626415453, + "grad_norm": 1.8054064512252808, + "learning_rate": 4.004092357656637e-05, + "loss": 4.0897, + "step": 49522 + }, + { + "epoch": 0.29452730992482634, + "grad_norm": 1.7593028545379639, + "learning_rate": 4.004055046979625e-05, + "loss": 4.0645, + "step": 49523 + }, + { + "epoch": 0.29453325720810736, + "grad_norm": 1.729135274887085, + "learning_rate": 4.0040177357775625e-05, + "loss": 4.1484, + "step": 49524 + }, + { + "epoch": 0.2945392044913883, + "grad_norm": 1.5444138050079346, + "learning_rate": 4.003980424050462e-05, + "loss": 4.0071, + "step": 49525 + }, + { + "epoch": 0.29454515177466933, + "grad_norm": 1.7230758666992188, + "learning_rate": 4.003943111798339e-05, + "loss": 4.1219, + "step": 49526 + }, + { + "epoch": 0.29455109905795035, + "grad_norm": 1.668558120727539, + "learning_rate": 4.003905799021204e-05, + "loss": 4.0096, + "step": 49527 + }, + { + "epoch": 0.2945570463412313, + "grad_norm": 1.8548455238342285, + "learning_rate": 4.003868485719071e-05, + "loss": 3.8963, + "step": 49528 + }, + { + "epoch": 0.2945629936245123, + "grad_norm": 1.3937740325927734, + "learning_rate": 4.003831171891954e-05, + "loss": 4.0931, + "step": 49529 + }, + { + "epoch": 0.29456894090779334, + "grad_norm": 1.823176622390747, + "learning_rate": 4.0037938575398645e-05, + "loss": 4.0395, + "step": 49530 + }, + { + "epoch": 0.2945748881910743, + "grad_norm": 1.9173475503921509, + "learning_rate": 4.0037565426628156e-05, + "loss": 3.7883, + "step": 49531 + }, + { + "epoch": 0.2945808354743553, + "grad_norm": 1.8854310512542725, + "learning_rate": 4.003719227260822e-05, + "loss": 4.0258, + "step": 49532 + }, + { + "epoch": 0.29458678275763633, + "grad_norm": 1.6384352445602417, + "learning_rate": 4.003681911333894e-05, + "loss": 4.0151, + "step": 49533 + }, + { + "epoch": 0.2945927300409173, + "grad_norm": 1.7315566539764404, + "learning_rate": 4.0036445948820475e-05, + "loss": 4.1048, + "step": 49534 + }, + { + "epoch": 0.2945986773241983, + "grad_norm": 1.8915042877197266, + "learning_rate": 4.0036072779052934e-05, + "loss": 4.1674, + "step": 49535 + }, + { + "epoch": 0.2946046246074793, + "grad_norm": 1.9745839834213257, + "learning_rate": 4.003569960403646e-05, + "loss": 4.3713, + "step": 49536 + }, + { + "epoch": 0.2946105718907603, + "grad_norm": 1.9235193729400635, + "learning_rate": 4.003532642377118e-05, + "loss": 4.1403, + "step": 49537 + }, + { + "epoch": 0.2946165191740413, + "grad_norm": 1.7901313304901123, + "learning_rate": 4.0034953238257226e-05, + "loss": 4.2444, + "step": 49538 + }, + { + "epoch": 0.2946224664573223, + "grad_norm": 1.8983274698257446, + "learning_rate": 4.0034580047494706e-05, + "loss": 4.2871, + "step": 49539 + }, + { + "epoch": 0.2946284137406033, + "grad_norm": 1.3484926223754883, + "learning_rate": 4.0034206851483794e-05, + "loss": 4.1793, + "step": 49540 + }, + { + "epoch": 0.2946343610238843, + "grad_norm": 1.7691391706466675, + "learning_rate": 4.003383365022458e-05, + "loss": 4.1392, + "step": 49541 + }, + { + "epoch": 0.2946403083071653, + "grad_norm": 1.6480101346969604, + "learning_rate": 4.003346044371721e-05, + "loss": 4.1004, + "step": 49542 + }, + { + "epoch": 0.29464625559044627, + "grad_norm": 1.648376226425171, + "learning_rate": 4.003308723196182e-05, + "loss": 4.054, + "step": 49543 + }, + { + "epoch": 0.2946522028737273, + "grad_norm": 1.7476309537887573, + "learning_rate": 4.003271401495854e-05, + "loss": 3.9683, + "step": 49544 + }, + { + "epoch": 0.2946581501570083, + "grad_norm": 1.6771118640899658, + "learning_rate": 4.003234079270748e-05, + "loss": 4.0779, + "step": 49545 + }, + { + "epoch": 0.29466409744028926, + "grad_norm": 1.715004563331604, + "learning_rate": 4.00319675652088e-05, + "loss": 3.9355, + "step": 49546 + }, + { + "epoch": 0.2946700447235703, + "grad_norm": 1.7138326168060303, + "learning_rate": 4.003159433246261e-05, + "loss": 4.2762, + "step": 49547 + }, + { + "epoch": 0.2946759920068513, + "grad_norm": 1.7609963417053223, + "learning_rate": 4.003122109446904e-05, + "loss": 4.2664, + "step": 49548 + }, + { + "epoch": 0.29468193929013226, + "grad_norm": 1.7703452110290527, + "learning_rate": 4.0030847851228225e-05, + "loss": 4.229, + "step": 49549 + }, + { + "epoch": 0.29468788657341327, + "grad_norm": 1.7970507144927979, + "learning_rate": 4.003047460274031e-05, + "loss": 4.0457, + "step": 49550 + }, + { + "epoch": 0.2946938338566943, + "grad_norm": 1.7957407236099243, + "learning_rate": 4.00301013490054e-05, + "loss": 4.2983, + "step": 49551 + }, + { + "epoch": 0.29469978113997525, + "grad_norm": 1.9176546335220337, + "learning_rate": 4.002972809002364e-05, + "loss": 3.8633, + "step": 49552 + }, + { + "epoch": 0.29470572842325626, + "grad_norm": 1.463416576385498, + "learning_rate": 4.0029354825795155e-05, + "loss": 4.2758, + "step": 49553 + }, + { + "epoch": 0.2947116757065373, + "grad_norm": 1.6430444717407227, + "learning_rate": 4.0028981556320087e-05, + "loss": 4.2076, + "step": 49554 + }, + { + "epoch": 0.29471762298981824, + "grad_norm": 1.5744551420211792, + "learning_rate": 4.002860828159854e-05, + "loss": 4.0999, + "step": 49555 + }, + { + "epoch": 0.29472357027309926, + "grad_norm": 1.6330499649047852, + "learning_rate": 4.0028235001630674e-05, + "loss": 4.1038, + "step": 49556 + }, + { + "epoch": 0.29472951755638027, + "grad_norm": 1.8521441221237183, + "learning_rate": 4.00278617164166e-05, + "loss": 4.2874, + "step": 49557 + }, + { + "epoch": 0.29473546483966123, + "grad_norm": 1.5948820114135742, + "learning_rate": 4.002748842595646e-05, + "loss": 4.1418, + "step": 49558 + }, + { + "epoch": 0.29474141212294225, + "grad_norm": 1.6614278554916382, + "learning_rate": 4.002711513025038e-05, + "loss": 4.0778, + "step": 49559 + }, + { + "epoch": 0.29474735940622326, + "grad_norm": 1.5646356344223022, + "learning_rate": 4.002674182929849e-05, + "loss": 4.0904, + "step": 49560 + }, + { + "epoch": 0.2947533066895042, + "grad_norm": 1.3961005210876465, + "learning_rate": 4.002636852310091e-05, + "loss": 4.4322, + "step": 49561 + }, + { + "epoch": 0.29475925397278524, + "grad_norm": 1.7778667211532593, + "learning_rate": 4.0025995211657784e-05, + "loss": 4.4734, + "step": 49562 + }, + { + "epoch": 0.29476520125606626, + "grad_norm": 1.712629795074463, + "learning_rate": 4.002562189496925e-05, + "loss": 4.3834, + "step": 49563 + }, + { + "epoch": 0.2947711485393472, + "grad_norm": 1.728103756904602, + "learning_rate": 4.002524857303541e-05, + "loss": 4.2235, + "step": 49564 + }, + { + "epoch": 0.29477709582262823, + "grad_norm": 1.7359826564788818, + "learning_rate": 4.002487524585642e-05, + "loss": 4.3851, + "step": 49565 + }, + { + "epoch": 0.29478304310590925, + "grad_norm": 1.6893097162246704, + "learning_rate": 4.00245019134324e-05, + "loss": 4.6751, + "step": 49566 + }, + { + "epoch": 0.2947889903891902, + "grad_norm": 1.5411309003829956, + "learning_rate": 4.0024128575763475e-05, + "loss": 4.6629, + "step": 49567 + }, + { + "epoch": 0.2947949376724712, + "grad_norm": 1.4622561931610107, + "learning_rate": 4.002375523284979e-05, + "loss": 4.271, + "step": 49568 + }, + { + "epoch": 0.2948008849557522, + "grad_norm": 1.4749951362609863, + "learning_rate": 4.002338188469147e-05, + "loss": 4.3537, + "step": 49569 + }, + { + "epoch": 0.2948068322390332, + "grad_norm": 1.4696513414382935, + "learning_rate": 4.002300853128864e-05, + "loss": 4.2084, + "step": 49570 + }, + { + "epoch": 0.2948127795223142, + "grad_norm": 1.6024986505508423, + "learning_rate": 4.002263517264142e-05, + "loss": 4.229, + "step": 49571 + }, + { + "epoch": 0.2948187268055952, + "grad_norm": 1.3830721378326416, + "learning_rate": 4.002226180874997e-05, + "loss": 4.0542, + "step": 49572 + }, + { + "epoch": 0.2948246740888762, + "grad_norm": 1.4973113536834717, + "learning_rate": 4.002188843961439e-05, + "loss": 3.9503, + "step": 49573 + }, + { + "epoch": 0.2948306213721572, + "grad_norm": 1.8594828844070435, + "learning_rate": 4.0021515065234836e-05, + "loss": 4.7876, + "step": 49574 + }, + { + "epoch": 0.29483656865543817, + "grad_norm": 1.708579182624817, + "learning_rate": 4.0021141685611426e-05, + "loss": 4.1857, + "step": 49575 + }, + { + "epoch": 0.2948425159387192, + "grad_norm": 2.0169174671173096, + "learning_rate": 4.002076830074428e-05, + "loss": 4.0043, + "step": 49576 + }, + { + "epoch": 0.2948484632220002, + "grad_norm": 2.3416388034820557, + "learning_rate": 4.002039491063355e-05, + "loss": 3.6777, + "step": 49577 + }, + { + "epoch": 0.29485441050528116, + "grad_norm": 1.5536192655563354, + "learning_rate": 4.002002151527935e-05, + "loss": 4.7933, + "step": 49578 + }, + { + "epoch": 0.2948603577885622, + "grad_norm": 1.8498318195343018, + "learning_rate": 4.001964811468182e-05, + "loss": 4.8769, + "step": 49579 + }, + { + "epoch": 0.2948663050718432, + "grad_norm": 1.5901862382888794, + "learning_rate": 4.001927470884108e-05, + "loss": 4.7469, + "step": 49580 + }, + { + "epoch": 0.29487225235512415, + "grad_norm": 1.7927504777908325, + "learning_rate": 4.001890129775726e-05, + "loss": 4.3815, + "step": 49581 + }, + { + "epoch": 0.29487819963840517, + "grad_norm": 1.7521295547485352, + "learning_rate": 4.001852788143051e-05, + "loss": 4.6639, + "step": 49582 + }, + { + "epoch": 0.2948841469216862, + "grad_norm": 1.856636881828308, + "learning_rate": 4.0018154459860946e-05, + "loss": 4.7579, + "step": 49583 + }, + { + "epoch": 0.29489009420496715, + "grad_norm": 1.7989466190338135, + "learning_rate": 4.0017781033048694e-05, + "loss": 4.4527, + "step": 49584 + }, + { + "epoch": 0.29489604148824816, + "grad_norm": 1.820358157157898, + "learning_rate": 4.00174076009939e-05, + "loss": 4.65, + "step": 49585 + }, + { + "epoch": 0.2949019887715292, + "grad_norm": 1.74485445022583, + "learning_rate": 4.0017034163696665e-05, + "loss": 4.265, + "step": 49586 + }, + { + "epoch": 0.29490793605481014, + "grad_norm": 1.4346405267715454, + "learning_rate": 4.001666072115715e-05, + "loss": 4.7346, + "step": 49587 + }, + { + "epoch": 0.29491388333809115, + "grad_norm": 1.4800293445587158, + "learning_rate": 4.0016287273375475e-05, + "loss": 4.7652, + "step": 49588 + }, + { + "epoch": 0.29491983062137217, + "grad_norm": 1.2537246942520142, + "learning_rate": 4.001591382035177e-05, + "loss": 4.4613, + "step": 49589 + }, + { + "epoch": 0.29492577790465313, + "grad_norm": 1.5300284624099731, + "learning_rate": 4.0015540362086154e-05, + "loss": 4.3202, + "step": 49590 + }, + { + "epoch": 0.29493172518793415, + "grad_norm": 1.5037599802017212, + "learning_rate": 4.001516689857878e-05, + "loss": 4.3175, + "step": 49591 + }, + { + "epoch": 0.29493767247121516, + "grad_norm": 1.6633961200714111, + "learning_rate": 4.0014793429829765e-05, + "loss": 4.1952, + "step": 49592 + }, + { + "epoch": 0.2949436197544961, + "grad_norm": 1.5843583345413208, + "learning_rate": 4.0014419955839235e-05, + "loss": 4.3524, + "step": 49593 + }, + { + "epoch": 0.29494956703777714, + "grad_norm": 1.5436551570892334, + "learning_rate": 4.001404647660733e-05, + "loss": 4.2389, + "step": 49594 + }, + { + "epoch": 0.29495551432105815, + "grad_norm": 1.7191097736358643, + "learning_rate": 4.001367299213418e-05, + "loss": 4.2819, + "step": 49595 + }, + { + "epoch": 0.2949614616043391, + "grad_norm": 1.8860387802124023, + "learning_rate": 4.0013299502419906e-05, + "loss": 2.9377, + "step": 49596 + }, + { + "epoch": 0.29496740888762013, + "grad_norm": 2.174821138381958, + "learning_rate": 4.0012926007464646e-05, + "loss": 1.8773, + "step": 49597 + }, + { + "epoch": 0.29497335617090115, + "grad_norm": 1.9264777898788452, + "learning_rate": 4.001255250726853e-05, + "loss": 3.0038, + "step": 49598 + }, + { + "epoch": 0.2949793034541821, + "grad_norm": 2.171025037765503, + "learning_rate": 4.0012179001831684e-05, + "loss": 3.596, + "step": 49599 + }, + { + "epoch": 0.2949852507374631, + "grad_norm": 1.8621580600738525, + "learning_rate": 4.001180549115424e-05, + "loss": 3.7188, + "step": 49600 + }, + { + "epoch": 0.29499119802074414, + "grad_norm": 2.260962963104248, + "learning_rate": 4.0011431975236337e-05, + "loss": 3.8795, + "step": 49601 + }, + { + "epoch": 0.2949971453040251, + "grad_norm": 1.6622896194458008, + "learning_rate": 4.0011058454078085e-05, + "loss": 5.0045, + "step": 49602 + }, + { + "epoch": 0.2950030925873061, + "grad_norm": 1.7766523361206055, + "learning_rate": 4.001068492767964e-05, + "loss": 5.0831, + "step": 49603 + }, + { + "epoch": 0.29500903987058713, + "grad_norm": 1.4932760000228882, + "learning_rate": 4.001031139604112e-05, + "loss": 4.32, + "step": 49604 + }, + { + "epoch": 0.2950149871538681, + "grad_norm": 1.680983543395996, + "learning_rate": 4.000993785916265e-05, + "loss": 4.6026, + "step": 49605 + }, + { + "epoch": 0.2950209344371491, + "grad_norm": 1.703885793685913, + "learning_rate": 4.000956431704437e-05, + "loss": 4.9484, + "step": 49606 + }, + { + "epoch": 0.2950268817204301, + "grad_norm": 1.780962347984314, + "learning_rate": 4.00091907696864e-05, + "loss": 4.4701, + "step": 49607 + }, + { + "epoch": 0.2950328290037111, + "grad_norm": 1.5606530904769897, + "learning_rate": 4.000881721708888e-05, + "loss": 4.4696, + "step": 49608 + }, + { + "epoch": 0.2950387762869921, + "grad_norm": 1.6632500886917114, + "learning_rate": 4.0008443659251935e-05, + "loss": 4.7287, + "step": 49609 + }, + { + "epoch": 0.2950447235702731, + "grad_norm": 1.5903350114822388, + "learning_rate": 4.0008070096175704e-05, + "loss": 4.6384, + "step": 49610 + }, + { + "epoch": 0.2950506708535541, + "grad_norm": 2.1851842403411865, + "learning_rate": 4.00076965278603e-05, + "loss": 4.0043, + "step": 49611 + }, + { + "epoch": 0.2950566181368351, + "grad_norm": 1.6551129817962646, + "learning_rate": 4.000732295430588e-05, + "loss": 4.6133, + "step": 49612 + }, + { + "epoch": 0.2950625654201161, + "grad_norm": 1.4348090887069702, + "learning_rate": 4.000694937551255e-05, + "loss": 4.5358, + "step": 49613 + }, + { + "epoch": 0.29506851270339707, + "grad_norm": 1.8005188703536987, + "learning_rate": 4.000657579148044e-05, + "loss": 4.3213, + "step": 49614 + }, + { + "epoch": 0.2950744599866781, + "grad_norm": 1.7037920951843262, + "learning_rate": 4.000620220220971e-05, + "loss": 4.482, + "step": 49615 + }, + { + "epoch": 0.2950804072699591, + "grad_norm": 1.9451981782913208, + "learning_rate": 4.0005828607700446e-05, + "loss": 4.197, + "step": 49616 + }, + { + "epoch": 0.29508635455324006, + "grad_norm": 2.4335789680480957, + "learning_rate": 4.000545500795282e-05, + "loss": 4.6068, + "step": 49617 + }, + { + "epoch": 0.2950923018365211, + "grad_norm": 2.1564319133758545, + "learning_rate": 4.0005081402966935e-05, + "loss": 4.7385, + "step": 49618 + }, + { + "epoch": 0.2950982491198021, + "grad_norm": 1.355788230895996, + "learning_rate": 4.000470779274293e-05, + "loss": 4.7479, + "step": 49619 + }, + { + "epoch": 0.29510419640308305, + "grad_norm": 2.1692662239074707, + "learning_rate": 4.0004334177280945e-05, + "loss": 4.1508, + "step": 49620 + }, + { + "epoch": 0.29511014368636407, + "grad_norm": 2.8182501792907715, + "learning_rate": 4.00039605565811e-05, + "loss": 3.1746, + "step": 49621 + }, + { + "epoch": 0.2951160909696451, + "grad_norm": 1.9295281171798706, + "learning_rate": 4.000358693064352e-05, + "loss": 4.4448, + "step": 49622 + }, + { + "epoch": 0.29512203825292604, + "grad_norm": 2.178762674331665, + "learning_rate": 4.000321329946835e-05, + "loss": 4.4437, + "step": 49623 + }, + { + "epoch": 0.29512798553620706, + "grad_norm": 2.1404757499694824, + "learning_rate": 4.000283966305571e-05, + "loss": 4.2881, + "step": 49624 + }, + { + "epoch": 0.2951339328194881, + "grad_norm": 1.9991573095321655, + "learning_rate": 4.000246602140574e-05, + "loss": 4.321, + "step": 49625 + }, + { + "epoch": 0.29513988010276904, + "grad_norm": 2.095606803894043, + "learning_rate": 4.000209237451856e-05, + "loss": 4.5507, + "step": 49626 + }, + { + "epoch": 0.29514582738605005, + "grad_norm": 1.7974157333374023, + "learning_rate": 4.00017187223943e-05, + "loss": 4.5999, + "step": 49627 + }, + { + "epoch": 0.29515177466933107, + "grad_norm": 1.9877508878707886, + "learning_rate": 4.0001345065033105e-05, + "loss": 4.3875, + "step": 49628 + }, + { + "epoch": 0.29515772195261203, + "grad_norm": 1.6335086822509766, + "learning_rate": 4.000097140243508e-05, + "loss": 4.8263, + "step": 49629 + }, + { + "epoch": 0.29516366923589304, + "grad_norm": 1.5376596450805664, + "learning_rate": 4.0000597734600385e-05, + "loss": 4.7318, + "step": 49630 + }, + { + "epoch": 0.29516961651917406, + "grad_norm": 1.977810025215149, + "learning_rate": 4.000022406152913e-05, + "loss": 4.4534, + "step": 49631 + }, + { + "epoch": 0.295175563802455, + "grad_norm": 2.3757286071777344, + "learning_rate": 3.9999850383221456e-05, + "loss": 4.3875, + "step": 49632 + }, + { + "epoch": 0.29518151108573604, + "grad_norm": 2.046839475631714, + "learning_rate": 3.999947669967749e-05, + "loss": 4.1331, + "step": 49633 + }, + { + "epoch": 0.29518745836901705, + "grad_norm": 1.9561001062393188, + "learning_rate": 3.999910301089735e-05, + "loss": 3.8252, + "step": 49634 + }, + { + "epoch": 0.295193405652298, + "grad_norm": 1.653836727142334, + "learning_rate": 3.999872931688119e-05, + "loss": 4.8137, + "step": 49635 + }, + { + "epoch": 0.29519935293557903, + "grad_norm": 1.7901431322097778, + "learning_rate": 3.999835561762913e-05, + "loss": 4.7492, + "step": 49636 + }, + { + "epoch": 0.29520530021886005, + "grad_norm": 1.6823443174362183, + "learning_rate": 3.9997981913141294e-05, + "loss": 4.7848, + "step": 49637 + }, + { + "epoch": 0.295211247502141, + "grad_norm": 1.4707965850830078, + "learning_rate": 3.9997608203417815e-05, + "loss": 4.9004, + "step": 49638 + }, + { + "epoch": 0.295217194785422, + "grad_norm": 1.6877021789550781, + "learning_rate": 3.9997234488458825e-05, + "loss": 4.6433, + "step": 49639 + }, + { + "epoch": 0.29522314206870304, + "grad_norm": 1.4300428628921509, + "learning_rate": 3.9996860768264454e-05, + "loss": 5.0854, + "step": 49640 + }, + { + "epoch": 0.295229089351984, + "grad_norm": 1.7333675622940063, + "learning_rate": 3.999648704283484e-05, + "loss": 5.1224, + "step": 49641 + }, + { + "epoch": 0.295235036635265, + "grad_norm": 1.6232799291610718, + "learning_rate": 3.999611331217011e-05, + "loss": 5.1595, + "step": 49642 + }, + { + "epoch": 0.29524098391854603, + "grad_norm": 1.479225754737854, + "learning_rate": 3.999573957627038e-05, + "loss": 5.3184, + "step": 49643 + }, + { + "epoch": 0.295246931201827, + "grad_norm": 1.3999043703079224, + "learning_rate": 3.9995365835135805e-05, + "loss": 5.2123, + "step": 49644 + }, + { + "epoch": 0.295252878485108, + "grad_norm": 1.6477890014648438, + "learning_rate": 3.99949920887665e-05, + "loss": 5.0097, + "step": 49645 + }, + { + "epoch": 0.295258825768389, + "grad_norm": 1.474377155303955, + "learning_rate": 3.999461833716259e-05, + "loss": 5.0678, + "step": 49646 + }, + { + "epoch": 0.29526477305167, + "grad_norm": 1.5901668071746826, + "learning_rate": 3.999424458032422e-05, + "loss": 5.0739, + "step": 49647 + }, + { + "epoch": 0.295270720334951, + "grad_norm": 1.4125560522079468, + "learning_rate": 3.9993870818251514e-05, + "loss": 5.0706, + "step": 49648 + }, + { + "epoch": 0.295276667618232, + "grad_norm": 1.4105820655822754, + "learning_rate": 3.999349705094461e-05, + "loss": 5.092, + "step": 49649 + }, + { + "epoch": 0.295282614901513, + "grad_norm": 2.4986846446990967, + "learning_rate": 3.9993123278403614e-05, + "loss": 3.6766, + "step": 49650 + }, + { + "epoch": 0.295288562184794, + "grad_norm": 3.5093135833740234, + "learning_rate": 3.999274950062869e-05, + "loss": 3.1482, + "step": 49651 + }, + { + "epoch": 0.295294509468075, + "grad_norm": 5.378631114959717, + "learning_rate": 3.999237571761994e-05, + "loss": 2.5429, + "step": 49652 + }, + { + "epoch": 0.29530045675135597, + "grad_norm": 3.084059000015259, + "learning_rate": 3.999200192937751e-05, + "loss": 2.6202, + "step": 49653 + }, + { + "epoch": 0.295306404034637, + "grad_norm": 2.982081413269043, + "learning_rate": 3.9991628135901525e-05, + "loss": 2.2641, + "step": 49654 + }, + { + "epoch": 0.295312351317918, + "grad_norm": 2.5439562797546387, + "learning_rate": 3.9991254337192126e-05, + "loss": 3.7946, + "step": 49655 + }, + { + "epoch": 0.29531829860119896, + "grad_norm": 2.867372512817383, + "learning_rate": 3.9990880533249425e-05, + "loss": 2.7162, + "step": 49656 + }, + { + "epoch": 0.29532424588448, + "grad_norm": 3.7898101806640625, + "learning_rate": 3.999050672407357e-05, + "loss": 3.2117, + "step": 49657 + }, + { + "epoch": 0.295330193167761, + "grad_norm": 3.260589838027954, + "learning_rate": 3.999013290966468e-05, + "loss": 1.9374, + "step": 49658 + }, + { + "epoch": 0.29533614045104195, + "grad_norm": 3.446647882461548, + "learning_rate": 3.9989759090022894e-05, + "loss": 2.5964, + "step": 49659 + }, + { + "epoch": 0.29534208773432297, + "grad_norm": 3.961568593978882, + "learning_rate": 3.9989385265148336e-05, + "loss": 2.9481, + "step": 49660 + }, + { + "epoch": 0.295348035017604, + "grad_norm": 2.9014251232147217, + "learning_rate": 3.998901143504113e-05, + "loss": 2.7904, + "step": 49661 + }, + { + "epoch": 0.29535398230088494, + "grad_norm": 2.5669727325439453, + "learning_rate": 3.998863759970142e-05, + "loss": 4.3882, + "step": 49662 + }, + { + "epoch": 0.29535992958416596, + "grad_norm": 1.296112060546875, + "learning_rate": 3.998826375912934e-05, + "loss": 5.468, + "step": 49663 + }, + { + "epoch": 0.295365876867447, + "grad_norm": 1.6755902767181396, + "learning_rate": 3.9987889913325005e-05, + "loss": 5.6529, + "step": 49664 + }, + { + "epoch": 0.29537182415072794, + "grad_norm": 1.8867144584655762, + "learning_rate": 3.998751606228855e-05, + "loss": 4.7618, + "step": 49665 + }, + { + "epoch": 0.29537777143400895, + "grad_norm": 2.583449602127075, + "learning_rate": 3.9987142206020114e-05, + "loss": 2.6616, + "step": 49666 + }, + { + "epoch": 0.29538371871728997, + "grad_norm": 3.856011390686035, + "learning_rate": 3.998676834451982e-05, + "loss": 3.6383, + "step": 49667 + }, + { + "epoch": 0.2953896660005709, + "grad_norm": 2.526892900466919, + "learning_rate": 3.998639447778779e-05, + "loss": 2.1936, + "step": 49668 + }, + { + "epoch": 0.29539561328385194, + "grad_norm": 2.8222339153289795, + "learning_rate": 3.998602060582418e-05, + "loss": 3.3176, + "step": 49669 + }, + { + "epoch": 0.29540156056713296, + "grad_norm": 3.596616744995117, + "learning_rate": 3.9985646728629096e-05, + "loss": 2.111, + "step": 49670 + }, + { + "epoch": 0.2954075078504139, + "grad_norm": 3.580796241760254, + "learning_rate": 3.998527284620268e-05, + "loss": 2.6595, + "step": 49671 + }, + { + "epoch": 0.29541345513369494, + "grad_norm": 1.9578592777252197, + "learning_rate": 3.998489895854506e-05, + "loss": 4.5493, + "step": 49672 + }, + { + "epoch": 0.29541940241697595, + "grad_norm": 1.9805270433425903, + "learning_rate": 3.998452506565636e-05, + "loss": 4.8805, + "step": 49673 + }, + { + "epoch": 0.2954253497002569, + "grad_norm": 2.051321268081665, + "learning_rate": 3.998415116753673e-05, + "loss": 4.3976, + "step": 49674 + }, + { + "epoch": 0.2954312969835379, + "grad_norm": 2.1939663887023926, + "learning_rate": 3.998377726418628e-05, + "loss": 5.3139, + "step": 49675 + }, + { + "epoch": 0.29543724426681894, + "grad_norm": 1.6567856073379517, + "learning_rate": 3.998340335560515e-05, + "loss": 4.8272, + "step": 49676 + }, + { + "epoch": 0.2954431915500999, + "grad_norm": 1.7940764427185059, + "learning_rate": 3.9983029441793465e-05, + "loss": 4.757, + "step": 49677 + }, + { + "epoch": 0.2954491388333809, + "grad_norm": 1.9081482887268066, + "learning_rate": 3.998265552275137e-05, + "loss": 4.7984, + "step": 49678 + }, + { + "epoch": 0.29545508611666194, + "grad_norm": 2.802637815475464, + "learning_rate": 3.998228159847897e-05, + "loss": 2.9444, + "step": 49679 + }, + { + "epoch": 0.2954610333999429, + "grad_norm": 2.2883880138397217, + "learning_rate": 3.998190766897641e-05, + "loss": 3.7299, + "step": 49680 + }, + { + "epoch": 0.2954669806832239, + "grad_norm": 1.595808506011963, + "learning_rate": 3.9981533734243836e-05, + "loss": 4.7082, + "step": 49681 + }, + { + "epoch": 0.29547292796650493, + "grad_norm": 2.8052775859832764, + "learning_rate": 3.998115979428135e-05, + "loss": 3.8358, + "step": 49682 + }, + { + "epoch": 0.2954788752497859, + "grad_norm": 1.5275253057479858, + "learning_rate": 3.9980785849089104e-05, + "loss": 4.9245, + "step": 49683 + }, + { + "epoch": 0.2954848225330669, + "grad_norm": 1.615110993385315, + "learning_rate": 3.998041189866721e-05, + "loss": 5.1566, + "step": 49684 + }, + { + "epoch": 0.29549076981634786, + "grad_norm": 1.3800278902053833, + "learning_rate": 3.998003794301582e-05, + "loss": 4.9351, + "step": 49685 + }, + { + "epoch": 0.2954967170996289, + "grad_norm": 2.144575834274292, + "learning_rate": 3.997966398213505e-05, + "loss": 4.4089, + "step": 49686 + }, + { + "epoch": 0.2955026643829099, + "grad_norm": 1.6425251960754395, + "learning_rate": 3.997929001602504e-05, + "loss": 4.7137, + "step": 49687 + }, + { + "epoch": 0.29550861166619086, + "grad_norm": 2.0353736877441406, + "learning_rate": 3.997891604468591e-05, + "loss": 4.4495, + "step": 49688 + }, + { + "epoch": 0.2955145589494719, + "grad_norm": 2.5221612453460693, + "learning_rate": 3.997854206811779e-05, + "loss": 4.1942, + "step": 49689 + }, + { + "epoch": 0.2955205062327529, + "grad_norm": 1.7009336948394775, + "learning_rate": 3.9978168086320825e-05, + "loss": 4.3934, + "step": 49690 + }, + { + "epoch": 0.29552645351603385, + "grad_norm": 1.6042786836624146, + "learning_rate": 3.9977794099295123e-05, + "loss": 4.522, + "step": 49691 + }, + { + "epoch": 0.29553240079931486, + "grad_norm": 2.098860740661621, + "learning_rate": 3.9977420107040844e-05, + "loss": 4.9514, + "step": 49692 + }, + { + "epoch": 0.2955383480825959, + "grad_norm": 1.954365611076355, + "learning_rate": 3.997704610955809e-05, + "loss": 4.9297, + "step": 49693 + }, + { + "epoch": 0.29554429536587684, + "grad_norm": 1.5982296466827393, + "learning_rate": 3.9976672106847e-05, + "loss": 4.7232, + "step": 49694 + }, + { + "epoch": 0.29555024264915786, + "grad_norm": 1.8549946546554565, + "learning_rate": 3.997629809890772e-05, + "loss": 4.1578, + "step": 49695 + }, + { + "epoch": 0.2955561899324389, + "grad_norm": 1.4065442085266113, + "learning_rate": 3.997592408574037e-05, + "loss": 4.7685, + "step": 49696 + }, + { + "epoch": 0.29556213721571983, + "grad_norm": 2.232130765914917, + "learning_rate": 3.997555006734507e-05, + "loss": 4.2955, + "step": 49697 + }, + { + "epoch": 0.29556808449900085, + "grad_norm": 1.4366109371185303, + "learning_rate": 3.997517604372197e-05, + "loss": 4.7502, + "step": 49698 + }, + { + "epoch": 0.29557403178228187, + "grad_norm": 2.2148053646087646, + "learning_rate": 3.997480201487118e-05, + "loss": 3.7612, + "step": 49699 + }, + { + "epoch": 0.2955799790655628, + "grad_norm": 1.6477347612380981, + "learning_rate": 3.997442798079285e-05, + "loss": 4.6849, + "step": 49700 + }, + { + "epoch": 0.29558592634884384, + "grad_norm": 1.4450125694274902, + "learning_rate": 3.99740539414871e-05, + "loss": 4.7354, + "step": 49701 + }, + { + "epoch": 0.29559187363212486, + "grad_norm": 1.6120686531066895, + "learning_rate": 3.997367989695406e-05, + "loss": 4.5829, + "step": 49702 + }, + { + "epoch": 0.2955978209154058, + "grad_norm": 1.8677051067352295, + "learning_rate": 3.997330584719386e-05, + "loss": 4.0869, + "step": 49703 + }, + { + "epoch": 0.29560376819868683, + "grad_norm": 2.1834371089935303, + "learning_rate": 3.997293179220665e-05, + "loss": 3.7543, + "step": 49704 + }, + { + "epoch": 0.29560971548196785, + "grad_norm": 1.9770923852920532, + "learning_rate": 3.997255773199253e-05, + "loss": 4.7164, + "step": 49705 + }, + { + "epoch": 0.2956156627652488, + "grad_norm": 1.497696876525879, + "learning_rate": 3.997218366655164e-05, + "loss": 4.8289, + "step": 49706 + }, + { + "epoch": 0.2956216100485298, + "grad_norm": 1.8297520875930786, + "learning_rate": 3.997180959588412e-05, + "loss": 4.249, + "step": 49707 + }, + { + "epoch": 0.29562755733181084, + "grad_norm": 1.6526929140090942, + "learning_rate": 3.997143551999011e-05, + "loss": 5.4964, + "step": 49708 + }, + { + "epoch": 0.2956335046150918, + "grad_norm": 1.511796236038208, + "learning_rate": 3.997106143886971e-05, + "loss": 5.5825, + "step": 49709 + }, + { + "epoch": 0.2956394518983728, + "grad_norm": 1.7848949432373047, + "learning_rate": 3.9970687352523076e-05, + "loss": 5.1086, + "step": 49710 + }, + { + "epoch": 0.29564539918165383, + "grad_norm": 1.7502938508987427, + "learning_rate": 3.997031326095032e-05, + "loss": 4.7495, + "step": 49711 + }, + { + "epoch": 0.2956513464649348, + "grad_norm": 2.867621898651123, + "learning_rate": 3.9969939164151595e-05, + "loss": 4.3492, + "step": 49712 + }, + { + "epoch": 0.2956572937482158, + "grad_norm": 1.8013173341751099, + "learning_rate": 3.996956506212701e-05, + "loss": 4.1992, + "step": 49713 + }, + { + "epoch": 0.2956632410314968, + "grad_norm": 2.3035945892333984, + "learning_rate": 3.9969190954876704e-05, + "loss": 3.7792, + "step": 49714 + }, + { + "epoch": 0.2956691883147778, + "grad_norm": 2.193401575088501, + "learning_rate": 3.9968816842400815e-05, + "loss": 3.6616, + "step": 49715 + }, + { + "epoch": 0.2956751355980588, + "grad_norm": 1.7820420265197754, + "learning_rate": 3.9968442724699465e-05, + "loss": 3.8496, + "step": 49716 + }, + { + "epoch": 0.2956810828813398, + "grad_norm": 1.661763072013855, + "learning_rate": 3.996806860177278e-05, + "loss": 4.1927, + "step": 49717 + }, + { + "epoch": 0.2956870301646208, + "grad_norm": 1.4773069620132446, + "learning_rate": 3.9967694473620895e-05, + "loss": 3.8962, + "step": 49718 + }, + { + "epoch": 0.2956929774479018, + "grad_norm": 1.5075510740280151, + "learning_rate": 3.9967320340243954e-05, + "loss": 5.5557, + "step": 49719 + }, + { + "epoch": 0.2956989247311828, + "grad_norm": 3.0426223278045654, + "learning_rate": 3.996694620164207e-05, + "loss": 2.4617, + "step": 49720 + }, + { + "epoch": 0.29570487201446377, + "grad_norm": 2.8472650051116943, + "learning_rate": 3.9966572057815373e-05, + "loss": 2.7826, + "step": 49721 + }, + { + "epoch": 0.2957108192977448, + "grad_norm": 2.929723024368286, + "learning_rate": 3.996619790876401e-05, + "loss": 2.6018, + "step": 49722 + }, + { + "epoch": 0.2957167665810258, + "grad_norm": 2.8035247325897217, + "learning_rate": 3.9965823754488105e-05, + "loss": 2.9226, + "step": 49723 + }, + { + "epoch": 0.29572271386430676, + "grad_norm": 1.6846131086349487, + "learning_rate": 3.996544959498777e-05, + "loss": 4.5043, + "step": 49724 + }, + { + "epoch": 0.2957286611475878, + "grad_norm": 2.9278085231781006, + "learning_rate": 3.9965075430263165e-05, + "loss": 3.2841, + "step": 49725 + }, + { + "epoch": 0.2957346084308688, + "grad_norm": 2.5642659664154053, + "learning_rate": 3.996470126031441e-05, + "loss": 3.0814, + "step": 49726 + }, + { + "epoch": 0.29574055571414976, + "grad_norm": 1.5601270198822021, + "learning_rate": 3.996432708514162e-05, + "loss": 4.6335, + "step": 49727 + }, + { + "epoch": 0.29574650299743077, + "grad_norm": 1.3759745359420776, + "learning_rate": 3.996395290474494e-05, + "loss": 5.0985, + "step": 49728 + }, + { + "epoch": 0.2957524502807118, + "grad_norm": 1.1369235515594482, + "learning_rate": 3.9963578719124505e-05, + "loss": 5.2466, + "step": 49729 + }, + { + "epoch": 0.29575839756399275, + "grad_norm": 1.4218978881835938, + "learning_rate": 3.996320452828044e-05, + "loss": 4.8224, + "step": 49730 + }, + { + "epoch": 0.29576434484727376, + "grad_norm": 2.0634725093841553, + "learning_rate": 3.9962830332212866e-05, + "loss": 4.0977, + "step": 49731 + }, + { + "epoch": 0.2957702921305548, + "grad_norm": 2.8383750915527344, + "learning_rate": 3.9962456130921934e-05, + "loss": 2.8719, + "step": 49732 + }, + { + "epoch": 0.29577623941383574, + "grad_norm": 4.7170939445495605, + "learning_rate": 3.996208192440775e-05, + "loss": 3.9361, + "step": 49733 + }, + { + "epoch": 0.29578218669711676, + "grad_norm": 3.3451101779937744, + "learning_rate": 3.996170771267047e-05, + "loss": 4.0121, + "step": 49734 + }, + { + "epoch": 0.29578813398039777, + "grad_norm": 1.409624695777893, + "learning_rate": 3.9961333495710206e-05, + "loss": 4.4531, + "step": 49735 + }, + { + "epoch": 0.29579408126367873, + "grad_norm": 1.6994398832321167, + "learning_rate": 3.9960959273527085e-05, + "loss": 4.3239, + "step": 49736 + }, + { + "epoch": 0.29580002854695975, + "grad_norm": 1.780884861946106, + "learning_rate": 3.9960585046121266e-05, + "loss": 4.2165, + "step": 49737 + }, + { + "epoch": 0.29580597583024076, + "grad_norm": 2.106839418411255, + "learning_rate": 3.996021081349285e-05, + "loss": 4.043, + "step": 49738 + }, + { + "epoch": 0.2958119231135217, + "grad_norm": 3.710530996322632, + "learning_rate": 3.995983657564199e-05, + "loss": 3.3579, + "step": 49739 + }, + { + "epoch": 0.29581787039680274, + "grad_norm": 3.593752145767212, + "learning_rate": 3.9959462332568796e-05, + "loss": 3.3476, + "step": 49740 + }, + { + "epoch": 0.29582381768008376, + "grad_norm": 4.125898838043213, + "learning_rate": 3.995908808427341e-05, + "loss": 3.384, + "step": 49741 + }, + { + "epoch": 0.2958297649633647, + "grad_norm": 3.746469497680664, + "learning_rate": 3.995871383075596e-05, + "loss": 3.4972, + "step": 49742 + }, + { + "epoch": 0.29583571224664573, + "grad_norm": 3.221156358718872, + "learning_rate": 3.9958339572016575e-05, + "loss": 3.2342, + "step": 49743 + }, + { + "epoch": 0.29584165952992675, + "grad_norm": 2.2006750106811523, + "learning_rate": 3.9957965308055404e-05, + "loss": 4.0805, + "step": 49744 + }, + { + "epoch": 0.2958476068132077, + "grad_norm": 2.237328052520752, + "learning_rate": 3.9957591038872545e-05, + "loss": 3.8003, + "step": 49745 + }, + { + "epoch": 0.2958535540964887, + "grad_norm": 1.6552081108093262, + "learning_rate": 3.9957216764468155e-05, + "loss": 4.2858, + "step": 49746 + }, + { + "epoch": 0.29585950137976974, + "grad_norm": 1.6699613332748413, + "learning_rate": 3.995684248484235e-05, + "loss": 4.5001, + "step": 49747 + }, + { + "epoch": 0.2958654486630507, + "grad_norm": 1.7908625602722168, + "learning_rate": 3.9956468199995255e-05, + "loss": 3.7537, + "step": 49748 + }, + { + "epoch": 0.2958713959463317, + "grad_norm": 1.6541935205459595, + "learning_rate": 3.995609390992703e-05, + "loss": 4.2343, + "step": 49749 + }, + { + "epoch": 0.29587734322961273, + "grad_norm": 2.096383810043335, + "learning_rate": 3.9955719614637776e-05, + "loss": 3.8708, + "step": 49750 + }, + { + "epoch": 0.2958832905128937, + "grad_norm": 2.238487720489502, + "learning_rate": 3.995534531412765e-05, + "loss": 3.5549, + "step": 49751 + }, + { + "epoch": 0.2958892377961747, + "grad_norm": 2.3754522800445557, + "learning_rate": 3.9954971008396746e-05, + "loss": 3.4731, + "step": 49752 + }, + { + "epoch": 0.2958951850794557, + "grad_norm": 2.1166393756866455, + "learning_rate": 3.995459669744523e-05, + "loss": 3.5132, + "step": 49753 + }, + { + "epoch": 0.2959011323627367, + "grad_norm": 2.1435256004333496, + "learning_rate": 3.995422238127322e-05, + "loss": 3.5957, + "step": 49754 + }, + { + "epoch": 0.2959070796460177, + "grad_norm": 1.7969540357589722, + "learning_rate": 3.995384805988084e-05, + "loss": 4.0865, + "step": 49755 + }, + { + "epoch": 0.2959130269292987, + "grad_norm": 1.4401686191558838, + "learning_rate": 3.995347373326822e-05, + "loss": 4.5583, + "step": 49756 + }, + { + "epoch": 0.2959189742125797, + "grad_norm": 2.614732265472412, + "learning_rate": 3.995309940143551e-05, + "loss": 2.2326, + "step": 49757 + }, + { + "epoch": 0.2959249214958607, + "grad_norm": 1.9897003173828125, + "learning_rate": 3.995272506438282e-05, + "loss": 3.655, + "step": 49758 + }, + { + "epoch": 0.2959308687791417, + "grad_norm": 2.442317485809326, + "learning_rate": 3.995235072211029e-05, + "loss": 3.4436, + "step": 49759 + }, + { + "epoch": 0.29593681606242267, + "grad_norm": 2.4704551696777344, + "learning_rate": 3.995197637461804e-05, + "loss": 3.4952, + "step": 49760 + }, + { + "epoch": 0.2959427633457037, + "grad_norm": 1.726624846458435, + "learning_rate": 3.995160202190622e-05, + "loss": 3.9757, + "step": 49761 + }, + { + "epoch": 0.2959487106289847, + "grad_norm": 4.51773738861084, + "learning_rate": 3.995122766397496e-05, + "loss": 1.8258, + "step": 49762 + }, + { + "epoch": 0.29595465791226566, + "grad_norm": 3.300431728363037, + "learning_rate": 3.9950853300824356e-05, + "loss": 2.095, + "step": 49763 + }, + { + "epoch": 0.2959606051955467, + "grad_norm": 3.456000566482544, + "learning_rate": 3.995047893245458e-05, + "loss": 1.8065, + "step": 49764 + }, + { + "epoch": 0.2959665524788277, + "grad_norm": 3.016997814178467, + "learning_rate": 3.9950104558865745e-05, + "loss": 1.4563, + "step": 49765 + }, + { + "epoch": 0.29597249976210865, + "grad_norm": 2.6589999198913574, + "learning_rate": 3.994973018005798e-05, + "loss": 1.542, + "step": 49766 + }, + { + "epoch": 0.29597844704538967, + "grad_norm": 1.8150291442871094, + "learning_rate": 3.994935579603142e-05, + "loss": 4.2283, + "step": 49767 + }, + { + "epoch": 0.2959843943286707, + "grad_norm": 2.3297739028930664, + "learning_rate": 3.994898140678619e-05, + "loss": 3.0411, + "step": 49768 + }, + { + "epoch": 0.29599034161195165, + "grad_norm": 2.567833185195923, + "learning_rate": 3.994860701232243e-05, + "loss": 3.114, + "step": 49769 + }, + { + "epoch": 0.29599628889523266, + "grad_norm": 1.7797584533691406, + "learning_rate": 3.994823261264027e-05, + "loss": 5.5617, + "step": 49770 + }, + { + "epoch": 0.2960022361785137, + "grad_norm": 1.2979769706726074, + "learning_rate": 3.994785820773983e-05, + "loss": 5.2587, + "step": 49771 + }, + { + "epoch": 0.29600818346179464, + "grad_norm": 1.6278483867645264, + "learning_rate": 3.994748379762124e-05, + "loss": 5.004, + "step": 49772 + }, + { + "epoch": 0.29601413074507565, + "grad_norm": 1.8328921794891357, + "learning_rate": 3.994710938228465e-05, + "loss": 5.0541, + "step": 49773 + }, + { + "epoch": 0.29602007802835667, + "grad_norm": 2.130816698074341, + "learning_rate": 3.9946734961730175e-05, + "loss": 4.9183, + "step": 49774 + }, + { + "epoch": 0.29602602531163763, + "grad_norm": 1.9309639930725098, + "learning_rate": 3.994636053595795e-05, + "loss": 4.9599, + "step": 49775 + }, + { + "epoch": 0.29603197259491865, + "grad_norm": 2.3658323287963867, + "learning_rate": 3.9945986104968106e-05, + "loss": 4.4401, + "step": 49776 + }, + { + "epoch": 0.29603791987819966, + "grad_norm": 2.8836734294891357, + "learning_rate": 3.9945611668760765e-05, + "loss": 3.6775, + "step": 49777 + }, + { + "epoch": 0.2960438671614806, + "grad_norm": 2.5359513759613037, + "learning_rate": 3.9945237227336073e-05, + "loss": 3.7663, + "step": 49778 + }, + { + "epoch": 0.29604981444476164, + "grad_norm": 1.8457893133163452, + "learning_rate": 3.994486278069415e-05, + "loss": 4.7732, + "step": 49779 + }, + { + "epoch": 0.29605576172804265, + "grad_norm": 1.648147463798523, + "learning_rate": 3.994448832883513e-05, + "loss": 4.6239, + "step": 49780 + }, + { + "epoch": 0.2960617090113236, + "grad_norm": 1.587812066078186, + "learning_rate": 3.9944113871759145e-05, + "loss": 4.5104, + "step": 49781 + }, + { + "epoch": 0.29606765629460463, + "grad_norm": 1.7076032161712646, + "learning_rate": 3.9943739409466327e-05, + "loss": 4.8278, + "step": 49782 + }, + { + "epoch": 0.29607360357788565, + "grad_norm": 1.6509233713150024, + "learning_rate": 3.9943364941956795e-05, + "loss": 4.8492, + "step": 49783 + }, + { + "epoch": 0.2960795508611666, + "grad_norm": 1.9540603160858154, + "learning_rate": 3.994299046923069e-05, + "loss": 4.8474, + "step": 49784 + }, + { + "epoch": 0.2960854981444476, + "grad_norm": 2.7318756580352783, + "learning_rate": 3.9942615991288154e-05, + "loss": 3.3743, + "step": 49785 + }, + { + "epoch": 0.29609144542772864, + "grad_norm": 1.77741539478302, + "learning_rate": 3.9942241508129296e-05, + "loss": 4.6556, + "step": 49786 + }, + { + "epoch": 0.2960973927110096, + "grad_norm": 2.518263578414917, + "learning_rate": 3.994186701975425e-05, + "loss": 3.1101, + "step": 49787 + }, + { + "epoch": 0.2961033399942906, + "grad_norm": 2.659133195877075, + "learning_rate": 3.994149252616316e-05, + "loss": 3.1698, + "step": 49788 + }, + { + "epoch": 0.29610928727757163, + "grad_norm": 2.048537015914917, + "learning_rate": 3.994111802735615e-05, + "loss": 4.0607, + "step": 49789 + }, + { + "epoch": 0.2961152345608526, + "grad_norm": 1.601563811302185, + "learning_rate": 3.994074352333334e-05, + "loss": 4.9625, + "step": 49790 + }, + { + "epoch": 0.2961211818441336, + "grad_norm": 2.0308141708374023, + "learning_rate": 3.994036901409489e-05, + "loss": 3.5025, + "step": 49791 + }, + { + "epoch": 0.2961271291274146, + "grad_norm": 2.186243772506714, + "learning_rate": 3.99399944996409e-05, + "loss": 3.1733, + "step": 49792 + }, + { + "epoch": 0.2961330764106956, + "grad_norm": 2.2834272384643555, + "learning_rate": 3.993961997997151e-05, + "loss": 2.9055, + "step": 49793 + }, + { + "epoch": 0.2961390236939766, + "grad_norm": 2.295286178588867, + "learning_rate": 3.993924545508686e-05, + "loss": 3.0028, + "step": 49794 + }, + { + "epoch": 0.2961449709772576, + "grad_norm": 2.1056084632873535, + "learning_rate": 3.9938870924987066e-05, + "loss": 3.1116, + "step": 49795 + }, + { + "epoch": 0.2961509182605386, + "grad_norm": 2.0850186347961426, + "learning_rate": 3.993849638967227e-05, + "loss": 3.9957, + "step": 49796 + }, + { + "epoch": 0.2961568655438196, + "grad_norm": 1.394962191581726, + "learning_rate": 3.99381218491426e-05, + "loss": 5.0844, + "step": 49797 + }, + { + "epoch": 0.2961628128271006, + "grad_norm": 1.3081074953079224, + "learning_rate": 3.993774730339819e-05, + "loss": 5.3012, + "step": 49798 + }, + { + "epoch": 0.29616876011038157, + "grad_norm": 1.5293008089065552, + "learning_rate": 3.993737275243916e-05, + "loss": 4.9727, + "step": 49799 + }, + { + "epoch": 0.2961747073936626, + "grad_norm": 1.5912561416625977, + "learning_rate": 3.9936998196265654e-05, + "loss": 4.6457, + "step": 49800 + }, + { + "epoch": 0.29618065467694354, + "grad_norm": 1.3921664953231812, + "learning_rate": 3.993662363487779e-05, + "loss": 4.5393, + "step": 49801 + }, + { + "epoch": 0.29618660196022456, + "grad_norm": 1.5218373537063599, + "learning_rate": 3.993624906827571e-05, + "loss": 5.026, + "step": 49802 + }, + { + "epoch": 0.2961925492435056, + "grad_norm": 1.453786015510559, + "learning_rate": 3.993587449645953e-05, + "loss": 5.1124, + "step": 49803 + }, + { + "epoch": 0.29619849652678654, + "grad_norm": 1.4766024351119995, + "learning_rate": 3.99354999194294e-05, + "loss": 4.6991, + "step": 49804 + }, + { + "epoch": 0.29620444381006755, + "grad_norm": 1.323459267616272, + "learning_rate": 3.993512533718544e-05, + "loss": 5.0119, + "step": 49805 + }, + { + "epoch": 0.29621039109334857, + "grad_norm": 1.6388576030731201, + "learning_rate": 3.993475074972778e-05, + "loss": 5.5051, + "step": 49806 + }, + { + "epoch": 0.29621633837662953, + "grad_norm": 1.4068721532821655, + "learning_rate": 3.993437615705655e-05, + "loss": 5.6938, + "step": 49807 + }, + { + "epoch": 0.29622228565991054, + "grad_norm": 1.6539525985717773, + "learning_rate": 3.993400155917189e-05, + "loss": 4.8518, + "step": 49808 + }, + { + "epoch": 0.29622823294319156, + "grad_norm": 1.7530128955841064, + "learning_rate": 3.9933626956073915e-05, + "loss": 4.2923, + "step": 49809 + }, + { + "epoch": 0.2962341802264725, + "grad_norm": 1.5065442323684692, + "learning_rate": 3.9933252347762775e-05, + "loss": 5.1154, + "step": 49810 + }, + { + "epoch": 0.29624012750975354, + "grad_norm": 1.7132827043533325, + "learning_rate": 3.993287773423858e-05, + "loss": 4.8618, + "step": 49811 + }, + { + "epoch": 0.29624607479303455, + "grad_norm": 1.5066349506378174, + "learning_rate": 3.9932503115501486e-05, + "loss": 4.7893, + "step": 49812 + }, + { + "epoch": 0.2962520220763155, + "grad_norm": 1.570837378501892, + "learning_rate": 3.99321284915516e-05, + "loss": 5.0258, + "step": 49813 + }, + { + "epoch": 0.29625796935959653, + "grad_norm": 1.7296392917633057, + "learning_rate": 3.9931753862389066e-05, + "loss": 4.5898, + "step": 49814 + }, + { + "epoch": 0.29626391664287755, + "grad_norm": 1.5099238157272339, + "learning_rate": 3.993137922801401e-05, + "loss": 4.7976, + "step": 49815 + }, + { + "epoch": 0.2962698639261585, + "grad_norm": 1.6014658212661743, + "learning_rate": 3.993100458842656e-05, + "loss": 4.9725, + "step": 49816 + }, + { + "epoch": 0.2962758112094395, + "grad_norm": 1.57999849319458, + "learning_rate": 3.9930629943626856e-05, + "loss": 4.6976, + "step": 49817 + }, + { + "epoch": 0.29628175849272054, + "grad_norm": 1.573514699935913, + "learning_rate": 3.993025529361502e-05, + "loss": 4.8048, + "step": 49818 + }, + { + "epoch": 0.2962877057760015, + "grad_norm": 1.7780357599258423, + "learning_rate": 3.9929880638391186e-05, + "loss": 4.8284, + "step": 49819 + }, + { + "epoch": 0.2962936530592825, + "grad_norm": 1.9390937089920044, + "learning_rate": 3.992950597795549e-05, + "loss": 4.5054, + "step": 49820 + }, + { + "epoch": 0.29629960034256353, + "grad_norm": 1.7452247142791748, + "learning_rate": 3.9929131312308056e-05, + "loss": 4.8199, + "step": 49821 + }, + { + "epoch": 0.2963055476258445, + "grad_norm": 1.5998988151550293, + "learning_rate": 3.9928756641449014e-05, + "loss": 4.5025, + "step": 49822 + }, + { + "epoch": 0.2963114949091255, + "grad_norm": 1.5040807723999023, + "learning_rate": 3.99283819653785e-05, + "loss": 5.2524, + "step": 49823 + }, + { + "epoch": 0.2963174421924065, + "grad_norm": 1.5853915214538574, + "learning_rate": 3.992800728409663e-05, + "loss": 4.6749, + "step": 49824 + }, + { + "epoch": 0.2963233894756875, + "grad_norm": 1.548161268234253, + "learning_rate": 3.992763259760356e-05, + "loss": 4.6866, + "step": 49825 + }, + { + "epoch": 0.2963293367589685, + "grad_norm": 1.6494301557540894, + "learning_rate": 3.992725790589941e-05, + "loss": 4.8293, + "step": 49826 + }, + { + "epoch": 0.2963352840422495, + "grad_norm": 1.548136591911316, + "learning_rate": 3.99268832089843e-05, + "loss": 4.5254, + "step": 49827 + }, + { + "epoch": 0.2963412313255305, + "grad_norm": 1.5538948774337769, + "learning_rate": 3.9926508506858376e-05, + "loss": 5.2268, + "step": 49828 + }, + { + "epoch": 0.2963471786088115, + "grad_norm": 1.617442011833191, + "learning_rate": 3.992613379952175e-05, + "loss": 4.7756, + "step": 49829 + }, + { + "epoch": 0.2963531258920925, + "grad_norm": 2.0310211181640625, + "learning_rate": 3.992575908697458e-05, + "loss": 4.9114, + "step": 49830 + }, + { + "epoch": 0.29635907317537347, + "grad_norm": 1.4428722858428955, + "learning_rate": 3.9925384369216964e-05, + "loss": 4.6262, + "step": 49831 + }, + { + "epoch": 0.2963650204586545, + "grad_norm": 1.2674963474273682, + "learning_rate": 3.9925009646249065e-05, + "loss": 5.069, + "step": 49832 + }, + { + "epoch": 0.2963709677419355, + "grad_norm": 1.333789348602295, + "learning_rate": 3.9924634918071005e-05, + "loss": 5.1684, + "step": 49833 + }, + { + "epoch": 0.29637691502521646, + "grad_norm": 1.365368127822876, + "learning_rate": 3.9924260184682894e-05, + "loss": 5.0122, + "step": 49834 + }, + { + "epoch": 0.2963828623084975, + "grad_norm": 1.5121499300003052, + "learning_rate": 3.9923885446084885e-05, + "loss": 5.0457, + "step": 49835 + }, + { + "epoch": 0.2963888095917785, + "grad_norm": 1.4219365119934082, + "learning_rate": 3.99235107022771e-05, + "loss": 5.2184, + "step": 49836 + }, + { + "epoch": 0.29639475687505945, + "grad_norm": 1.6371785402297974, + "learning_rate": 3.9923135953259674e-05, + "loss": 4.8249, + "step": 49837 + }, + { + "epoch": 0.29640070415834047, + "grad_norm": 1.3601634502410889, + "learning_rate": 3.992276119903273e-05, + "loss": 4.8486, + "step": 49838 + }, + { + "epoch": 0.2964066514416215, + "grad_norm": 1.385413408279419, + "learning_rate": 3.992238643959641e-05, + "loss": 4.9409, + "step": 49839 + }, + { + "epoch": 0.29641259872490244, + "grad_norm": 1.6749697923660278, + "learning_rate": 3.992201167495083e-05, + "loss": 4.9864, + "step": 49840 + }, + { + "epoch": 0.29641854600818346, + "grad_norm": 1.183023452758789, + "learning_rate": 3.992163690509615e-05, + "loss": 4.7761, + "step": 49841 + }, + { + "epoch": 0.2964244932914645, + "grad_norm": 1.2770230770111084, + "learning_rate": 3.992126213003246e-05, + "loss": 4.4869, + "step": 49842 + }, + { + "epoch": 0.29643044057474544, + "grad_norm": 2.8127520084381104, + "learning_rate": 3.992088734975992e-05, + "loss": 2.3004, + "step": 49843 + }, + { + "epoch": 0.29643638785802645, + "grad_norm": 2.941417932510376, + "learning_rate": 3.9920512564278656e-05, + "loss": 0.8688, + "step": 49844 + }, + { + "epoch": 0.29644233514130747, + "grad_norm": 3.1857006549835205, + "learning_rate": 3.9920137773588786e-05, + "loss": 0.8504, + "step": 49845 + }, + { + "epoch": 0.2964482824245884, + "grad_norm": 3.2125213146209717, + "learning_rate": 3.9919762977690456e-05, + "loss": 0.9278, + "step": 49846 + }, + { + "epoch": 0.29645422970786944, + "grad_norm": 4.492208003997803, + "learning_rate": 3.991938817658379e-05, + "loss": 2.0244, + "step": 49847 + }, + { + "epoch": 0.29646017699115046, + "grad_norm": 1.8767156600952148, + "learning_rate": 3.9919013370268924e-05, + "loss": 4.134, + "step": 49848 + }, + { + "epoch": 0.2964661242744314, + "grad_norm": 1.624321699142456, + "learning_rate": 3.9918638558745966e-05, + "loss": 4.8019, + "step": 49849 + }, + { + "epoch": 0.29647207155771244, + "grad_norm": 1.730263352394104, + "learning_rate": 3.991826374201509e-05, + "loss": 4.8626, + "step": 49850 + }, + { + "epoch": 0.29647801884099345, + "grad_norm": 1.6932282447814941, + "learning_rate": 3.9917888920076386e-05, + "loss": 5.0517, + "step": 49851 + }, + { + "epoch": 0.2964839661242744, + "grad_norm": 2.243943214416504, + "learning_rate": 3.991751409293001e-05, + "loss": 4.826, + "step": 49852 + }, + { + "epoch": 0.2964899134075554, + "grad_norm": 2.2327442169189453, + "learning_rate": 3.991713926057608e-05, + "loss": 4.8663, + "step": 49853 + }, + { + "epoch": 0.29649586069083644, + "grad_norm": 1.8411988019943237, + "learning_rate": 3.991676442301473e-05, + "loss": 4.6196, + "step": 49854 + }, + { + "epoch": 0.2965018079741174, + "grad_norm": 2.6981637477874756, + "learning_rate": 3.9916389580246096e-05, + "loss": 3.5901, + "step": 49855 + }, + { + "epoch": 0.2965077552573984, + "grad_norm": 5.339611530303955, + "learning_rate": 3.9916014732270303e-05, + "loss": 1.7908, + "step": 49856 + }, + { + "epoch": 0.29651370254067944, + "grad_norm": 1.6844229698181152, + "learning_rate": 3.9915639879087474e-05, + "loss": 4.4419, + "step": 49857 + }, + { + "epoch": 0.2965196498239604, + "grad_norm": 1.8269402980804443, + "learning_rate": 3.9915265020697756e-05, + "loss": 4.8213, + "step": 49858 + }, + { + "epoch": 0.2965255971072414, + "grad_norm": 1.7027592658996582, + "learning_rate": 3.991489015710128e-05, + "loss": 5.2104, + "step": 49859 + }, + { + "epoch": 0.29653154439052243, + "grad_norm": 1.6806622743606567, + "learning_rate": 3.991451528829816e-05, + "loss": 4.8957, + "step": 49860 + }, + { + "epoch": 0.2965374916738034, + "grad_norm": 1.878368616104126, + "learning_rate": 3.9914140414288545e-05, + "loss": 4.5581, + "step": 49861 + }, + { + "epoch": 0.2965434389570844, + "grad_norm": 1.5830408334732056, + "learning_rate": 3.991376553507256e-05, + "loss": 4.6934, + "step": 49862 + }, + { + "epoch": 0.2965493862403654, + "grad_norm": 1.4551148414611816, + "learning_rate": 3.991339065065032e-05, + "loss": 4.7358, + "step": 49863 + }, + { + "epoch": 0.2965553335236464, + "grad_norm": 3.5510079860687256, + "learning_rate": 3.991301576102198e-05, + "loss": 2.8451, + "step": 49864 + }, + { + "epoch": 0.2965612808069274, + "grad_norm": 3.2008535861968994, + "learning_rate": 3.991264086618765e-05, + "loss": 1.5864, + "step": 49865 + }, + { + "epoch": 0.2965672280902084, + "grad_norm": 3.393589735031128, + "learning_rate": 3.9912265966147475e-05, + "loss": 2.0296, + "step": 49866 + }, + { + "epoch": 0.2965731753734894, + "grad_norm": 1.7767319679260254, + "learning_rate": 3.991189106090159e-05, + "loss": 4.8778, + "step": 49867 + }, + { + "epoch": 0.2965791226567704, + "grad_norm": 1.5054337978363037, + "learning_rate": 3.991151615045011e-05, + "loss": 4.9106, + "step": 49868 + }, + { + "epoch": 0.2965850699400514, + "grad_norm": 1.41421377658844, + "learning_rate": 3.991114123479318e-05, + "loss": 4.9011, + "step": 49869 + }, + { + "epoch": 0.29659101722333236, + "grad_norm": 1.4951130151748657, + "learning_rate": 3.991076631393092e-05, + "loss": 5.2375, + "step": 49870 + }, + { + "epoch": 0.2965969645066134, + "grad_norm": 1.5886921882629395, + "learning_rate": 3.9910391387863466e-05, + "loss": 4.7695, + "step": 49871 + }, + { + "epoch": 0.2966029117898944, + "grad_norm": 1.5192679166793823, + "learning_rate": 3.991001645659095e-05, + "loss": 4.863, + "step": 49872 + }, + { + "epoch": 0.29660885907317536, + "grad_norm": 1.5333096981048584, + "learning_rate": 3.99096415201135e-05, + "loss": 4.8501, + "step": 49873 + }, + { + "epoch": 0.2966148063564564, + "grad_norm": 2.772648572921753, + "learning_rate": 3.9909266578431245e-05, + "loss": 3.2048, + "step": 49874 + }, + { + "epoch": 0.2966207536397374, + "grad_norm": 1.525728702545166, + "learning_rate": 3.990889163154433e-05, + "loss": 4.1502, + "step": 49875 + }, + { + "epoch": 0.29662670092301835, + "grad_norm": 1.9298710823059082, + "learning_rate": 3.990851667945287e-05, + "loss": 3.4274, + "step": 49876 + }, + { + "epoch": 0.29663264820629937, + "grad_norm": 1.8860893249511719, + "learning_rate": 3.990814172215699e-05, + "loss": 3.5753, + "step": 49877 + }, + { + "epoch": 0.2966385954895804, + "grad_norm": 2.5655219554901123, + "learning_rate": 3.990776675965685e-05, + "loss": 3.2704, + "step": 49878 + }, + { + "epoch": 0.29664454277286134, + "grad_norm": 1.7881042957305908, + "learning_rate": 3.990739179195255e-05, + "loss": 4.498, + "step": 49879 + }, + { + "epoch": 0.29665049005614236, + "grad_norm": 2.374174118041992, + "learning_rate": 3.9907016819044234e-05, + "loss": 3.8237, + "step": 49880 + }, + { + "epoch": 0.2966564373394234, + "grad_norm": 1.751529335975647, + "learning_rate": 3.9906641840932034e-05, + "loss": 4.3222, + "step": 49881 + }, + { + "epoch": 0.29666238462270433, + "grad_norm": 1.3524526357650757, + "learning_rate": 3.990626685761608e-05, + "loss": 4.7993, + "step": 49882 + }, + { + "epoch": 0.29666833190598535, + "grad_norm": 1.2150917053222656, + "learning_rate": 3.9905891869096504e-05, + "loss": 4.8914, + "step": 49883 + }, + { + "epoch": 0.29667427918926637, + "grad_norm": 1.7444616556167603, + "learning_rate": 3.990551687537343e-05, + "loss": 4.1747, + "step": 49884 + }, + { + "epoch": 0.2966802264725473, + "grad_norm": 2.624868392944336, + "learning_rate": 3.9905141876447e-05, + "loss": 1.2705, + "step": 49885 + }, + { + "epoch": 0.29668617375582834, + "grad_norm": 2.4590952396392822, + "learning_rate": 3.990476687231734e-05, + "loss": 1.5101, + "step": 49886 + }, + { + "epoch": 0.29669212103910936, + "grad_norm": 2.820594549179077, + "learning_rate": 3.9904391862984576e-05, + "loss": 2.1133, + "step": 49887 + }, + { + "epoch": 0.2966980683223903, + "grad_norm": 2.61984920501709, + "learning_rate": 3.990401684844885e-05, + "loss": 1.364, + "step": 49888 + }, + { + "epoch": 0.29670401560567133, + "grad_norm": 3.158041000366211, + "learning_rate": 3.990364182871027e-05, + "loss": 2.4713, + "step": 49889 + }, + { + "epoch": 0.29670996288895235, + "grad_norm": 1.94318425655365, + "learning_rate": 3.990326680376899e-05, + "loss": 4.4431, + "step": 49890 + }, + { + "epoch": 0.2967159101722333, + "grad_norm": 2.469393253326416, + "learning_rate": 3.990289177362513e-05, + "loss": 3.5651, + "step": 49891 + }, + { + "epoch": 0.2967218574555143, + "grad_norm": 3.3347039222717285, + "learning_rate": 3.990251673827884e-05, + "loss": 2.3229, + "step": 49892 + }, + { + "epoch": 0.29672780473879534, + "grad_norm": 3.3408150672912598, + "learning_rate": 3.990214169773022e-05, + "loss": 2.1283, + "step": 49893 + }, + { + "epoch": 0.2967337520220763, + "grad_norm": 3.9837872982025146, + "learning_rate": 3.9901766651979425e-05, + "loss": 2.463, + "step": 49894 + }, + { + "epoch": 0.2967396993053573, + "grad_norm": 1.7600549459457397, + "learning_rate": 3.990139160102657e-05, + "loss": 3.764, + "step": 49895 + }, + { + "epoch": 0.29674564658863833, + "grad_norm": 5.733805179595947, + "learning_rate": 3.990101654487179e-05, + "loss": 1.7646, + "step": 49896 + }, + { + "epoch": 0.2967515938719193, + "grad_norm": 2.1304948329925537, + "learning_rate": 3.9900641483515233e-05, + "loss": 3.7792, + "step": 49897 + }, + { + "epoch": 0.2967575411552003, + "grad_norm": 1.9744631052017212, + "learning_rate": 3.990026641695701e-05, + "loss": 4.258, + "step": 49898 + }, + { + "epoch": 0.2967634884384813, + "grad_norm": 2.0024049282073975, + "learning_rate": 3.989989134519726e-05, + "loss": 4.5154, + "step": 49899 + }, + { + "epoch": 0.2967694357217623, + "grad_norm": 1.649138331413269, + "learning_rate": 3.98995162682361e-05, + "loss": 4.5664, + "step": 49900 + }, + { + "epoch": 0.2967753830050433, + "grad_norm": 1.505368947982788, + "learning_rate": 3.989914118607369e-05, + "loss": 4.1786, + "step": 49901 + }, + { + "epoch": 0.2967813302883243, + "grad_norm": 1.7581552267074585, + "learning_rate": 3.9898766098710135e-05, + "loss": 3.6098, + "step": 49902 + }, + { + "epoch": 0.2967872775716053, + "grad_norm": 1.5638108253479004, + "learning_rate": 3.989839100614557e-05, + "loss": 4.9884, + "step": 49903 + }, + { + "epoch": 0.2967932248548863, + "grad_norm": 1.5890496969223022, + "learning_rate": 3.989801590838014e-05, + "loss": 4.7856, + "step": 49904 + }, + { + "epoch": 0.2967991721381673, + "grad_norm": 2.338252544403076, + "learning_rate": 3.989764080541396e-05, + "loss": 0.8035, + "step": 49905 + }, + { + "epoch": 0.29680511942144827, + "grad_norm": 2.1012187004089355, + "learning_rate": 3.9897265697247164e-05, + "loss": 0.5137, + "step": 49906 + }, + { + "epoch": 0.2968110667047293, + "grad_norm": 1.4714316129684448, + "learning_rate": 3.9896890583879896e-05, + "loss": 4.7646, + "step": 49907 + }, + { + "epoch": 0.2968170139880103, + "grad_norm": 1.397228479385376, + "learning_rate": 3.989651546531227e-05, + "loss": 5.0446, + "step": 49908 + }, + { + "epoch": 0.29682296127129126, + "grad_norm": 1.4740899801254272, + "learning_rate": 3.989614034154443e-05, + "loss": 5.067, + "step": 49909 + }, + { + "epoch": 0.2968289085545723, + "grad_norm": 1.688430905342102, + "learning_rate": 3.9895765212576494e-05, + "loss": 3.9693, + "step": 49910 + }, + { + "epoch": 0.2968348558378533, + "grad_norm": 1.5548009872436523, + "learning_rate": 3.989539007840861e-05, + "loss": 4.7545, + "step": 49911 + }, + { + "epoch": 0.29684080312113426, + "grad_norm": 1.4110270738601685, + "learning_rate": 3.98950149390409e-05, + "loss": 4.9733, + "step": 49912 + }, + { + "epoch": 0.29684675040441527, + "grad_norm": 1.5735834836959839, + "learning_rate": 3.9894639794473485e-05, + "loss": 4.8637, + "step": 49913 + }, + { + "epoch": 0.2968526976876963, + "grad_norm": 1.4462202787399292, + "learning_rate": 3.989426464470651e-05, + "loss": 5.1541, + "step": 49914 + }, + { + "epoch": 0.29685864497097725, + "grad_norm": 1.3521721363067627, + "learning_rate": 3.9893889489740096e-05, + "loss": 5.1258, + "step": 49915 + }, + { + "epoch": 0.29686459225425826, + "grad_norm": 1.6327438354492188, + "learning_rate": 3.9893514329574386e-05, + "loss": 4.5268, + "step": 49916 + }, + { + "epoch": 0.2968705395375393, + "grad_norm": 1.632829189300537, + "learning_rate": 3.98931391642095e-05, + "loss": 4.0097, + "step": 49917 + }, + { + "epoch": 0.29687648682082024, + "grad_norm": 1.9379509687423706, + "learning_rate": 3.989276399364557e-05, + "loss": 5.017, + "step": 49918 + }, + { + "epoch": 0.29688243410410126, + "grad_norm": 2.116899013519287, + "learning_rate": 3.9892388817882734e-05, + "loss": 3.5493, + "step": 49919 + }, + { + "epoch": 0.2968883813873822, + "grad_norm": 2.5413804054260254, + "learning_rate": 3.9892013636921125e-05, + "loss": 2.0193, + "step": 49920 + }, + { + "epoch": 0.29689432867066323, + "grad_norm": 2.5723671913146973, + "learning_rate": 3.989163845076086e-05, + "loss": 2.2086, + "step": 49921 + }, + { + "epoch": 0.29690027595394425, + "grad_norm": 2.572793960571289, + "learning_rate": 3.989126325940208e-05, + "loss": 2.1354, + "step": 49922 + }, + { + "epoch": 0.2969062232372252, + "grad_norm": 1.8901101350784302, + "learning_rate": 3.989088806284491e-05, + "loss": 3.9359, + "step": 49923 + }, + { + "epoch": 0.2969121705205062, + "grad_norm": 2.6483047008514404, + "learning_rate": 3.989051286108949e-05, + "loss": 3.4782, + "step": 49924 + }, + { + "epoch": 0.29691811780378724, + "grad_norm": 2.161928415298462, + "learning_rate": 3.9890137654135946e-05, + "loss": 3.0632, + "step": 49925 + }, + { + "epoch": 0.2969240650870682, + "grad_norm": 2.647360324859619, + "learning_rate": 3.98897624419844e-05, + "loss": 1.8247, + "step": 49926 + }, + { + "epoch": 0.2969300123703492, + "grad_norm": 2.5414388179779053, + "learning_rate": 3.9889387224635004e-05, + "loss": 2.1127, + "step": 49927 + }, + { + "epoch": 0.29693595965363023, + "grad_norm": 2.576676845550537, + "learning_rate": 3.988901200208787e-05, + "loss": 1.9117, + "step": 49928 + }, + { + "epoch": 0.2969419069369112, + "grad_norm": 2.5918867588043213, + "learning_rate": 3.9888636774343136e-05, + "loss": 1.9003, + "step": 49929 + }, + { + "epoch": 0.2969478542201922, + "grad_norm": 2.639163017272949, + "learning_rate": 3.988826154140094e-05, + "loss": 1.9765, + "step": 49930 + }, + { + "epoch": 0.2969538015034732, + "grad_norm": 3.242687463760376, + "learning_rate": 3.988788630326139e-05, + "loss": 3.3009, + "step": 49931 + }, + { + "epoch": 0.2969597487867542, + "grad_norm": 3.405019760131836, + "learning_rate": 3.9887511059924644e-05, + "loss": 3.0978, + "step": 49932 + }, + { + "epoch": 0.2969656960700352, + "grad_norm": 3.4077818393707275, + "learning_rate": 3.988713581139082e-05, + "loss": 3.1175, + "step": 49933 + }, + { + "epoch": 0.2969716433533162, + "grad_norm": 2.877485990524292, + "learning_rate": 3.9886760557660054e-05, + "loss": 3.1991, + "step": 49934 + }, + { + "epoch": 0.2969775906365972, + "grad_norm": 2.5640103816986084, + "learning_rate": 3.9886385298732465e-05, + "loss": 2.2748, + "step": 49935 + }, + { + "epoch": 0.2969835379198782, + "grad_norm": 2.279970407485962, + "learning_rate": 3.98860100346082e-05, + "loss": 2.257, + "step": 49936 + }, + { + "epoch": 0.2969894852031592, + "grad_norm": 2.309983730316162, + "learning_rate": 3.9885634765287386e-05, + "loss": 2.1723, + "step": 49937 + }, + { + "epoch": 0.29699543248644017, + "grad_norm": 2.4421472549438477, + "learning_rate": 3.988525949077014e-05, + "loss": 2.031, + "step": 49938 + }, + { + "epoch": 0.2970013797697212, + "grad_norm": 2.32572865486145, + "learning_rate": 3.9884884211056613e-05, + "loss": 1.7191, + "step": 49939 + }, + { + "epoch": 0.2970073270530022, + "grad_norm": 2.3984267711639404, + "learning_rate": 3.988450892614692e-05, + "loss": 2.0169, + "step": 49940 + }, + { + "epoch": 0.29701327433628316, + "grad_norm": 2.3601222038269043, + "learning_rate": 3.98841336360412e-05, + "loss": 2.0213, + "step": 49941 + }, + { + "epoch": 0.2970192216195642, + "grad_norm": 2.3412487506866455, + "learning_rate": 3.988375834073959e-05, + "loss": 2.1001, + "step": 49942 + }, + { + "epoch": 0.2970251689028452, + "grad_norm": 2.499269962310791, + "learning_rate": 3.988338304024221e-05, + "loss": 2.0023, + "step": 49943 + }, + { + "epoch": 0.29703111618612615, + "grad_norm": 2.577535390853882, + "learning_rate": 3.9883007734549184e-05, + "loss": 1.8211, + "step": 49944 + }, + { + "epoch": 0.29703706346940717, + "grad_norm": 2.8349411487579346, + "learning_rate": 3.988263242366066e-05, + "loss": 2.1849, + "step": 49945 + }, + { + "epoch": 0.2970430107526882, + "grad_norm": 3.718458890914917, + "learning_rate": 3.988225710757677e-05, + "loss": 3.0635, + "step": 49946 + }, + { + "epoch": 0.29704895803596915, + "grad_norm": 3.759523868560791, + "learning_rate": 3.988188178629763e-05, + "loss": 3.1722, + "step": 49947 + }, + { + "epoch": 0.29705490531925016, + "grad_norm": 3.8388211727142334, + "learning_rate": 3.988150645982338e-05, + "loss": 3.2974, + "step": 49948 + }, + { + "epoch": 0.2970608526025312, + "grad_norm": 2.960571050643921, + "learning_rate": 3.988113112815416e-05, + "loss": 2.2598, + "step": 49949 + }, + { + "epoch": 0.29706679988581214, + "grad_norm": 2.801604986190796, + "learning_rate": 3.988075579129008e-05, + "loss": 2.005, + "step": 49950 + }, + { + "epoch": 0.29707274716909315, + "grad_norm": 2.527553081512451, + "learning_rate": 3.988038044923128e-05, + "loss": 1.9263, + "step": 49951 + }, + { + "epoch": 0.29707869445237417, + "grad_norm": 2.5177958011627197, + "learning_rate": 3.98800051019779e-05, + "loss": 2.1521, + "step": 49952 + }, + { + "epoch": 0.29708464173565513, + "grad_norm": 2.3232085704803467, + "learning_rate": 3.987962974953006e-05, + "loss": 2.1148, + "step": 49953 + }, + { + "epoch": 0.29709058901893615, + "grad_norm": 3.1869781017303467, + "learning_rate": 3.98792543918879e-05, + "loss": 2.8977, + "step": 49954 + }, + { + "epoch": 0.29709653630221716, + "grad_norm": 3.0276293754577637, + "learning_rate": 3.987887902905154e-05, + "loss": 3.0472, + "step": 49955 + }, + { + "epoch": 0.2971024835854981, + "grad_norm": 2.480403423309326, + "learning_rate": 3.987850366102111e-05, + "loss": 2.1107, + "step": 49956 + }, + { + "epoch": 0.29710843086877914, + "grad_norm": 2.245222806930542, + "learning_rate": 3.987812828779677e-05, + "loss": 2.3854, + "step": 49957 + }, + { + "epoch": 0.29711437815206015, + "grad_norm": 2.525768280029297, + "learning_rate": 3.987775290937861e-05, + "loss": 2.2183, + "step": 49958 + }, + { + "epoch": 0.2971203254353411, + "grad_norm": 2.5525131225585938, + "learning_rate": 3.987737752576679e-05, + "loss": 2.3799, + "step": 49959 + }, + { + "epoch": 0.29712627271862213, + "grad_norm": 1.9616308212280273, + "learning_rate": 3.987700213696143e-05, + "loss": 3.481, + "step": 49960 + }, + { + "epoch": 0.29713222000190315, + "grad_norm": 1.6877244710922241, + "learning_rate": 3.987662674296266e-05, + "loss": 4.966, + "step": 49961 + }, + { + "epoch": 0.2971381672851841, + "grad_norm": 1.786388635635376, + "learning_rate": 3.987625134377061e-05, + "loss": 5.0309, + "step": 49962 + }, + { + "epoch": 0.2971441145684651, + "grad_norm": 1.7638847827911377, + "learning_rate": 3.987587593938542e-05, + "loss": 4.1921, + "step": 49963 + }, + { + "epoch": 0.29715006185174614, + "grad_norm": 2.05365252494812, + "learning_rate": 3.987550052980721e-05, + "loss": 4.6731, + "step": 49964 + }, + { + "epoch": 0.2971560091350271, + "grad_norm": 1.749969482421875, + "learning_rate": 3.9875125115036116e-05, + "loss": 5.0064, + "step": 49965 + }, + { + "epoch": 0.2971619564183081, + "grad_norm": 1.7283796072006226, + "learning_rate": 3.9874749695072276e-05, + "loss": 4.7506, + "step": 49966 + }, + { + "epoch": 0.29716790370158913, + "grad_norm": 1.654289722442627, + "learning_rate": 3.987437426991582e-05, + "loss": 4.676, + "step": 49967 + }, + { + "epoch": 0.2971738509848701, + "grad_norm": 1.7501864433288574, + "learning_rate": 3.987399883956686e-05, + "loss": 5.4012, + "step": 49968 + }, + { + "epoch": 0.2971797982681511, + "grad_norm": 1.4187536239624023, + "learning_rate": 3.987362340402555e-05, + "loss": 5.2999, + "step": 49969 + }, + { + "epoch": 0.2971857455514321, + "grad_norm": 1.5822296142578125, + "learning_rate": 3.9873247963292005e-05, + "loss": 4.3902, + "step": 49970 + }, + { + "epoch": 0.2971916928347131, + "grad_norm": 1.6415343284606934, + "learning_rate": 3.987287251736636e-05, + "loss": 4.9369, + "step": 49971 + }, + { + "epoch": 0.2971976401179941, + "grad_norm": 1.632893681526184, + "learning_rate": 3.987249706624876e-05, + "loss": 3.7949, + "step": 49972 + }, + { + "epoch": 0.2972035874012751, + "grad_norm": 1.672268271446228, + "learning_rate": 3.9872121609939316e-05, + "loss": 4.5145, + "step": 49973 + }, + { + "epoch": 0.2972095346845561, + "grad_norm": 1.5599249601364136, + "learning_rate": 3.987174614843817e-05, + "loss": 4.2332, + "step": 49974 + }, + { + "epoch": 0.2972154819678371, + "grad_norm": 1.6448252201080322, + "learning_rate": 3.9871370681745456e-05, + "loss": 4.2508, + "step": 49975 + }, + { + "epoch": 0.2972214292511181, + "grad_norm": 1.5785704851150513, + "learning_rate": 3.9870995209861294e-05, + "loss": 3.9872, + "step": 49976 + }, + { + "epoch": 0.29722737653439907, + "grad_norm": 1.4736846685409546, + "learning_rate": 3.987061973278582e-05, + "loss": 4.9521, + "step": 49977 + }, + { + "epoch": 0.2972333238176801, + "grad_norm": 2.325968027114868, + "learning_rate": 3.987024425051917e-05, + "loss": 3.1836, + "step": 49978 + }, + { + "epoch": 0.2972392711009611, + "grad_norm": 2.173560380935669, + "learning_rate": 3.986986876306148e-05, + "loss": 4.0985, + "step": 49979 + }, + { + "epoch": 0.29724521838424206, + "grad_norm": 2.314192295074463, + "learning_rate": 3.986949327041286e-05, + "loss": 3.6884, + "step": 49980 + }, + { + "epoch": 0.2972511656675231, + "grad_norm": 2.354046106338501, + "learning_rate": 3.986911777257345e-05, + "loss": 3.2949, + "step": 49981 + }, + { + "epoch": 0.2972571129508041, + "grad_norm": 1.666724443435669, + "learning_rate": 3.9868742269543395e-05, + "loss": 4.5176, + "step": 49982 + }, + { + "epoch": 0.29726306023408505, + "grad_norm": 2.1634511947631836, + "learning_rate": 3.9868366761322815e-05, + "loss": 4.0271, + "step": 49983 + }, + { + "epoch": 0.29726900751736607, + "grad_norm": 1.7514944076538086, + "learning_rate": 3.986799124791184e-05, + "loss": 4.4904, + "step": 49984 + }, + { + "epoch": 0.2972749548006471, + "grad_norm": 1.253494381904602, + "learning_rate": 3.9867615729310606e-05, + "loss": 5.1403, + "step": 49985 + }, + { + "epoch": 0.29728090208392804, + "grad_norm": 1.5466684103012085, + "learning_rate": 3.986724020551923e-05, + "loss": 4.7228, + "step": 49986 + }, + { + "epoch": 0.29728684936720906, + "grad_norm": 1.6099300384521484, + "learning_rate": 3.9866864676537864e-05, + "loss": 4.572, + "step": 49987 + }, + { + "epoch": 0.2972927966504901, + "grad_norm": 1.756824254989624, + "learning_rate": 3.9866489142366626e-05, + "loss": 3.9699, + "step": 49988 + }, + { + "epoch": 0.29729874393377104, + "grad_norm": 1.5348249673843384, + "learning_rate": 3.986611360300565e-05, + "loss": 4.6443, + "step": 49989 + }, + { + "epoch": 0.29730469121705205, + "grad_norm": 1.677465796470642, + "learning_rate": 3.986573805845507e-05, + "loss": 4.2988, + "step": 49990 + }, + { + "epoch": 0.29731063850033307, + "grad_norm": 2.214557409286499, + "learning_rate": 3.986536250871501e-05, + "loss": 3.4937, + "step": 49991 + }, + { + "epoch": 0.29731658578361403, + "grad_norm": 1.5931626558303833, + "learning_rate": 3.986498695378561e-05, + "loss": 4.6971, + "step": 49992 + }, + { + "epoch": 0.29732253306689505, + "grad_norm": 1.7253235578536987, + "learning_rate": 3.986461139366699e-05, + "loss": 4.9482, + "step": 49993 + }, + { + "epoch": 0.29732848035017606, + "grad_norm": 1.841065764427185, + "learning_rate": 3.98642358283593e-05, + "loss": 3.2955, + "step": 49994 + }, + { + "epoch": 0.297334427633457, + "grad_norm": 2.250295400619507, + "learning_rate": 3.986386025786265e-05, + "loss": 3.1654, + "step": 49995 + }, + { + "epoch": 0.29734037491673804, + "grad_norm": 1.4639066457748413, + "learning_rate": 3.986348468217718e-05, + "loss": 4.0393, + "step": 49996 + }, + { + "epoch": 0.29734632220001905, + "grad_norm": 2.425933361053467, + "learning_rate": 3.986310910130302e-05, + "loss": 3.4937, + "step": 49997 + }, + { + "epoch": 0.2973522694833, + "grad_norm": 2.156404972076416, + "learning_rate": 3.98627335152403e-05, + "loss": 3.4432, + "step": 49998 + }, + { + "epoch": 0.29735821676658103, + "grad_norm": 2.209590196609497, + "learning_rate": 3.986235792398916e-05, + "loss": 3.1032, + "step": 49999 + }, + { + "epoch": 0.29736416404986205, + "grad_norm": 2.117919921875, + "learning_rate": 3.9861982327549724e-05, + "loss": 3.5218, + "step": 50000 + } + ], + "logging_steps": 1, + "max_steps": 168144, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 10000, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": false + }, + "attributes": {} + } + }, + "total_flos": 1.9817126778372096e+17, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +} diff --git a/checkpoint-60000/config.json b/checkpoint-60000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-60000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-60000/generation_config.json b/checkpoint-60000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-60000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-60000/model.safetensors.index.json b/checkpoint-60000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-60000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-60000/rng_state_0.pth b/checkpoint-60000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-60000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-60000/rng_state_1.pth b/checkpoint-60000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-60000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-60000/rng_state_2.pth b/checkpoint-60000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-60000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-60000/rng_state_4.pth b/checkpoint-60000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-60000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-60000/rng_state_5.pth b/checkpoint-60000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-60000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-60000/rng_state_6.pth b/checkpoint-60000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-60000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-60000/rng_state_7.pth b/checkpoint-60000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-60000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-70000/config.json b/checkpoint-70000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-70000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-70000/generation_config.json b/checkpoint-70000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-70000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-70000/model.safetensors.index.json b/checkpoint-70000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-70000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-70000/rng_state_0.pth b/checkpoint-70000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-70000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-70000/rng_state_2.pth b/checkpoint-70000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-70000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-70000/rng_state_3.pth b/checkpoint-70000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-70000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-70000/rng_state_4.pth b/checkpoint-70000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-70000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-70000/rng_state_5.pth b/checkpoint-70000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-70000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-70000/rng_state_6.pth b/checkpoint-70000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-70000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-70000/rng_state_7.pth b/checkpoint-70000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-70000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-70000/scheduler.pt b/checkpoint-70000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..c1311b75cc832b43ff2cbf4dcee4596ea05da09e --- /dev/null +++ b/checkpoint-70000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04b8dee10c9cf51a5cf1e57c4f3bb39c8e2f3ed72faf000d776621559cc21cc0 +size 1064 diff --git a/checkpoint-80000/config.json b/checkpoint-80000/config.json new file mode 100644 index 0000000000000000000000000000000000000000..87dd93f00e47ebb0e5d9883ccc75f5850ac6aedc --- /dev/null +++ b/checkpoint-80000/config.json @@ -0,0 +1,28 @@ +{ + "architectures": [ + "Qwen2ForCausalLM" + ], + "attention_dropout": 0.0, + "bos_token_id": 151643, + "eos_token_id": 151645, + "hidden_act": "silu", + "hidden_size": 1536, + "initializer_range": 0.02, + "intermediate_size": 8960, + "max_position_embeddings": 32768, + "max_window_layers": 21, + "model_type": "qwen2", + "num_attention_heads": 12, + "num_hidden_layers": 28, + "num_key_value_heads": 2, + "rms_norm_eps": 1e-06, + "rope_scaling": null, + "rope_theta": 1000000.0, + "sliding_window": 32768, + "tie_word_embeddings": true, + "torch_dtype": "float32", + "transformers_version": "4.51.3", + "use_cache": true, + "use_sliding_window": false, + "vocab_size": 180348 +} diff --git a/checkpoint-80000/generation_config.json b/checkpoint-80000/generation_config.json new file mode 100644 index 0000000000000000000000000000000000000000..676d263ba4b8835e0d1c53f899a2645043e2e9df --- /dev/null +++ b/checkpoint-80000/generation_config.json @@ -0,0 +1,14 @@ +{ + "bos_token_id": 151643, + "do_sample": true, + "eos_token_id": [ + 151645, + 151643 + ], + "pad_token_id": 151643, + "repetition_penalty": 1.1, + "temperature": 0.7, + "top_k": 20, + "top_p": 0.8, + "transformers_version": "4.51.3" +} diff --git a/checkpoint-80000/model.safetensors.index.json b/checkpoint-80000/model.safetensors.index.json new file mode 100644 index 0000000000000000000000000000000000000000..3237092e4fcd45e652baa15621e9e4b23aa11253 --- /dev/null +++ b/checkpoint-80000/model.safetensors.index.json @@ -0,0 +1,346 @@ +{ + "metadata": { + "total_size": 7457478656 + }, + "weight_map": { + "lm_head.weight": "model-00002-of-00002.safetensors", + "model.embed_tokens.weight": "model-00001-of-00002.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.11.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.21.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.input_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.gate_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.bias": "model-00002-of-00002.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00002-of-00002.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.input_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.gate_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.bias": "model-00001-of-00002.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00001-of-00002.safetensors", + "model.norm.weight": "model-00002-of-00002.safetensors" + } +} diff --git a/checkpoint-80000/rng_state_0.pth b/checkpoint-80000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-80000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-80000/rng_state_1.pth b/checkpoint-80000/rng_state_1.pth new file mode 100644 index 0000000000000000000000000000000000000000..3450e76a148654c7a712cc0f7f4fbfb8af73468e --- /dev/null +++ b/checkpoint-80000/rng_state_1.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:096e351ace65ff91008be171a45173ed6369cc639fce73a288f671041e24b0ec +size 15984 diff --git a/checkpoint-80000/rng_state_2.pth b/checkpoint-80000/rng_state_2.pth new file mode 100644 index 0000000000000000000000000000000000000000..9b3bdae1f6c60868005426daf443380b27066cba --- /dev/null +++ b/checkpoint-80000/rng_state_2.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f21c61b1a7e793bbdec183de3b52da90042305234bc7e5887986655cd3fc2192 +size 15984 diff --git a/checkpoint-80000/rng_state_3.pth b/checkpoint-80000/rng_state_3.pth new file mode 100644 index 0000000000000000000000000000000000000000..b05597dd523f7c9a629beaa525e6ad7122b018f9 --- /dev/null +++ b/checkpoint-80000/rng_state_3.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:348742498d81780506d9760c655a7a7555185b5fbd70a7ae296d88fd9aeecd84 +size 15984 diff --git a/checkpoint-80000/rng_state_4.pth b/checkpoint-80000/rng_state_4.pth new file mode 100644 index 0000000000000000000000000000000000000000..28f7e8868fd4df63a6756b111fe1ea4a8e7eb6e2 --- /dev/null +++ b/checkpoint-80000/rng_state_4.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:399c4700dab9ae7b754110ce307fb7e26e22cb49b5bb233c435f0f12b77c202f +size 15984 diff --git a/checkpoint-80000/rng_state_5.pth b/checkpoint-80000/rng_state_5.pth new file mode 100644 index 0000000000000000000000000000000000000000..b81cf1f9e78f606b05125af5ec552416d94116f4 --- /dev/null +++ b/checkpoint-80000/rng_state_5.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864ea2379cc907eb4189c52706cb978150d9c26e18abf74679590729a8f0c8e8 +size 15984 diff --git a/checkpoint-80000/rng_state_6.pth b/checkpoint-80000/rng_state_6.pth new file mode 100644 index 0000000000000000000000000000000000000000..d9cc9104b41c399577a5b19f280ae6ba448edc4f --- /dev/null +++ b/checkpoint-80000/rng_state_6.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25755ba07299ee6ff45936dd04df329596319c9f8095af71e6f3a219e7543e26 +size 15984 diff --git a/checkpoint-80000/rng_state_7.pth b/checkpoint-80000/rng_state_7.pth new file mode 100644 index 0000000000000000000000000000000000000000..7736ce8fe27978c921d21f9fd6d1cda8c15a03f9 --- /dev/null +++ b/checkpoint-80000/rng_state_7.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:908f018cd701ed629c41299726da4a25f202f20a1d4bc2075a2266ed4013db3a +size 15984 diff --git a/checkpoint-90000/rng_state_0.pth b/checkpoint-90000/rng_state_0.pth new file mode 100644 index 0000000000000000000000000000000000000000..8be4da50e08ef9215e5f46f9cc7ebcb8fd4593eb --- /dev/null +++ b/checkpoint-90000/rng_state_0.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:955b895101e13c81ba510512313a06782795770a0bf998c90f718166d25f1664 +size 15984 diff --git a/checkpoint-90000/scheduler.pt b/checkpoint-90000/scheduler.pt new file mode 100644 index 0000000000000000000000000000000000000000..ace6a13a89006cd931f945f2fbca870787973112 --- /dev/null +++ b/checkpoint-90000/scheduler.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbdf60fc0395d520aa8a161ae52c654aaef9d995720acef234c73b1f5be8c3ea +size 1064